diff --git a/models/audio/speech_recognition/conformer/igie/README.md b/models/audio/speech_recognition/conformer/igie/README.md index 1596f260f849630df77b19e23e8210e345d0ad16..a7770ff53ba86c234735aef8a7855122e9cdadef 100644 --- a/models/audio/speech_recognition/conformer/igie/README.md +++ b/models/audio/speech_recognition/conformer/igie/README.md @@ -18,10 +18,19 @@ Conformer applies convolution to the Encoder layer of Transformer, enhancing the ### Prepare Resources -Pretrained model: - Dataset: to download the Aishell dataset. +```bash +# Download and put model in conformer_checkpoints +wget http://files.deepspark.org.cn:880/deepspark/conformer_checkpoints.tar +tar xf conformer_checkpoints.tar + +# Prepare AISHELL Data +DATA_DIR=/PATH/to/aishell_test_data +TOOL_DIR="$(pwd)/tools" +bash scripts/aishell_data_prepare.sh ${DATA_DIR} ${TOOL_DIR} +``` + ### Install Dependencies ```bash @@ -32,41 +41,10 @@ yum install sox sox-devel -y apt install sox libsox-fmt-all -y pip3 install -r requirements.txt -cd ctc_decoder/swig && bash setup.sh -cd ../../ -``` - -### Model Conversion - -```bash -tar -zxvf 20211025_conformer_exp.tar.gz - -export PYTHONPATH=`pwd`/wenet:$PYTHONPATH - -# Get Onnx Model -cd wenet -python3 wenet/bin/export_onnx_gpu.py \ - --config ../20211025_conformer_exp/train.yaml \ - --checkpoint ../20211025_conformer_exp/final.pt \ - --batch_size 24 \ - --seq_len 384 \ - --beam 4 \ - --cmvn_file ../20211025_conformer_exp/global_cmvn \ - --output_onnx_dir ../ -cd .. - -# Use onnxsim optimize onnx model -onnxsim encoder_bs24_seq384_static.onnx encoder_bs24_seq384_static_opt.onnx -python3 alter_onnx.py --batch_size 24 --path encoder_bs24_seq384_static_opt.onnx ``` ## Model Inference -```bash -# Need to unzip aishell to the current directory. For details, refer to data.list -tar -zxvf aishell.tar.gz -``` - ### FP16 ```bash @@ -78,6 +56,6 @@ bash scripts/infer_conformer_fp16_performance.sh ## Model Results -| Model | BatchSize | Precision | FPS | ACC | -| :----: | :----: | :----: | :----: | :----: | -| Conformer | 32 | FP16 | 1940.759 | 95.29 | +| Model | BatchSize | Precision | QPS | CER | +| --------- | --------- | --------- | ------- | ------ | +| Conformer | 24 | FP16 | 1408.352 | 0.0497 | diff --git a/models/audio/speech_recognition/conformer/igie/alter_onnx.py b/models/audio/speech_recognition/conformer/igie/alter_onnx.py deleted file mode 100644 index ad1b380d7058d947449c44881d4c4b5e0be53568..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/alter_onnx.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import onnx -from onnx import numpy_helper -import numpy as np -import os -import argparse - -def get_args_parser(add_help=True): - parser = argparse.ArgumentParser(description='alter onnx model', add_help=add_help) - - parser.add_argument('--batch_size', type=int, default=24, help='Model batch size.') - parser.add_argument('--path', type=str, required=True, help='ONNX model path.') - return parser - - -args = get_args_parser().parse_args() - -encoder_onnx_path=args.path -batch_size = args.batch_size -onnx_model = onnx.load(encoder_onnx_path) - - -graph = onnx_model.graph -node = graph.node - -matmul_input_node = [] -for i in range(len(node)): - if node[i].op_type == 'MatMul': - for name in node[i].input: - matmul_input_node.append(name) - -## alter node -for initializer in graph.initializer: - if initializer.name in matmul_input_node: - if initializer.dims[0] == 1: - W = numpy_helper.to_array(initializer) - W_new = [] - for i in range(batch_size): - W_new.append(W[0]) - W_new = np.array(W_new) - tensor = numpy_helper.from_array(W_new, initializer.name) - initializer.CopyFrom(tensor) - initializer.dims[0] = batch_size - -## print node -for initializer in graph.initializer: - if initializer.name in matmul_input_node: - if initializer.dims[0] == 24: - W = numpy_helper.to_array(initializer) - weights_map = {} - weights_map[initializer.name] = W - -onnx_model = onnx.shape_inference.infer_shapes(onnx_model) -onnx.checker.check_model(onnx_model) - -file_name, file_ext = os.path.splitext(encoder_onnx_path) -print("Save New Model to ", file_name + "_matmul.onnx") -onnx.save(onnx_model, file_name + "_matmul.onnx") diff --git a/models/audio/speech_recognition/conformer/igie/build_engine.py b/models/audio/speech_recognition/conformer/igie/build_engine.py index aee72f0ce3ac1cb01e4d9b0e5dabbaf59d120bb3..554d94c9db3e435d20838ce1d165b37405c51cb8 100644 --- a/models/audio/speech_recognition/conformer/igie/build_engine.py +++ b/models/audio/speech_recognition/conformer/igie/build_engine.py @@ -1,83 +1,85 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. +import os +import json +import onnx +import logging +import argparse +import tensorrt +from tensorrt import Dims import tvm -import argparse from tvm import relay from tvm.relay.import_model import import_model_to_igie +from load_ixrt_plugin import load_ixrt_plugin +load_ixrt_plugin() + def parse_args(): - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(description="Build tensorrt engine of conformer") + parser.add_argument("--onnx_model", type=str, required=True, help="onnx model path") + parser.add_argument("--bsz", type=int, default=1, help="batch size") + parser.add_argument("--input_size", type=tuple, default=(-1, 80), help="inference size") + parser.add_argument("--engine_path", type=str, required=True, help="path to save the engine") + parser.add_argument( "--device", type=int, default=0, help="cuda device, i.e. 
0 or 0,1,2,3,4") - parser.add_argument("--model_path", - type=str, - required=True, - help="original model path.") - - parser.add_argument("--engine_path", - type=str, - required=True, - help="igie export engine path.") - - parser.add_argument("--input", - type=str, - nargs='+', - required=True, - help=""" - input info of the model, format should be: - input_name:input_shape - eg: --input input:1,3,224,224. - """) - - parser.add_argument("--precision", - type=str, - choices=["fp32", "fp16", "int8"], - required=True, - help="model inference precision.") - args = parser.parse_args() - return args -def main(): - args = parse_args() - # get input valueinfo - input_dict = {} - for input_info in args.input: - input_name, input_shape = input_info.split(":") - shape = tuple([int(s) for s in input_shape.split(",")]) - input_dict[input_name] = shape +def build_engine_trtapi_dynamicshape(args): + onnx_model = args.onnx_model + assert os.path.isfile(onnx_model), f"The onnx model{onnx_model} must be existed!" + IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING) + builder = tensorrt.Builder(IXRT_LOGGER) + EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + network = builder.create_network(EXPLICIT_BATCH) + build_config = builder.create_builder_config() + profile = builder.create_optimization_profile() + + profile.set_shape( + "input", Dims([1,1,80]),Dims([16,800,80]),Dims([128,1500,80]) + ) + profile.set_shape( + "seq_lengths", Dims([1]), Dims([16]), Dims([128]) + ) - target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + build_config.add_optimization_profile(profile) - mod, params = import_model_to_igie(args.model_path, input_dict, backend="igie") + parser = tensorrt.OnnxParser(network, IXRT_LOGGER) - func = mod["main"] - body = func.body - new_body = relay.Tuple([body[0], body[1], body[2]]) - func = relay.Function(relay.analysis.free_vars(new_body), new_body) - encoder_mod = tvm.IRModule.from_expr(func) - encoder_mod = relay.transform.InferType()(encoder_mod) + parser.parse_from_file(onnx_model) + build_config.set_flag(tensorrt.BuilderFlag.FP16) - # build engine - lib = tvm.relay.build(encoder_mod, target=target, params=params, precision=args.precision) + # set dynamic + input_tensor = network.get_input(0) + input_tensor.shape = Dims([-1, -1, 80]) + + seq_lengths_tensor = network.get_input(1) + seq_lengths_tensor.shape = Dims([-1]) - # export engine - lib.export_library(args.engine_path) + plan = builder.build_serialized_network(network, build_config) + with open(args.engine_path, "wb") as f: + f.write(plan) + print("Build dynamic shape engine done!") + + +def build_engine_igieapi_dynamicshape(args): + onnx_model = args.onnx_model + assert os.path.isfile(onnx_model), f"The onnx model{onnx_model} must be existed!" 
+ + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + device = tvm.device(target.kind.name, 0) + inputs_info = {'input': ([128, 1500, 80], 'float16'), 'seq_lengths': ([128], 'int32')} + precision = "fp16" + + mod, params = import_model_to_igie(onnx_model, inputs_info, outputs_info=None, precision=precision, backend="tensorrt") + lib = relay.build(mod, target=target, params=params, precision=precision, device=device) + lib.export_library(args.engine_path) + + print("Build dynamic shape engine done!") + if __name__ == "__main__": - main() \ No newline at end of file + args = parse_args() + build_engine_trtapi_dynamicshape(args) + diff --git a/models/audio/speech_recognition/conformer/igie/ci/prepare.sh b/models/audio/speech_recognition/conformer/igie/ci/prepare.sh index 4ad9e36177af9375c777c8fa104563d1445a4627..49ec77a0ed67c0a5218b117f8f6145aac817797e 100644 --- a/models/audio/speech_recognition/conformer/igie/ci/prepare.sh +++ b/models/audio/speech_recognition/conformer/igie/ci/prepare.sh @@ -13,13 +13,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - set -x ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') if [[ ${ID} == "ubuntu" ]]; then - apt-get update - apt install sox libsox-fmt-all + apt install sox libsox-fmt-all -y elif [[ ${ID} == "centos" ]]; then yum install sox sox-devel -y else @@ -27,26 +25,9 @@ else fi pip3 install -r requirements.txt -cd ctc_decoder/swig && bash setup.sh -cd ../../ - -# tar -zxvf 20211025_conformer_exp.tar.gz - -# Get Onnx Model -cd wenet -python3 wenet/bin/export_onnx_gpu.py \ - --config ../20211025_conformer_exp/train.yaml \ - --checkpoint ../20211025_conformer_exp/final.pt \ - --batch_size 24 \ - --seq_len 384 \ - --beam 4 \ - --cmvn_file ../20211025_conformer_exp/global_cmvn \ - --output_onnx_dir ../ -cd .. - -# Use onnxsim optimize onnx model -onnxsim encoder_bs24_seq384_static.onnx encoder_bs24_seq384_static_opt.onnx -python3 alter_onnx.py --batch_size 24 --path encoder_bs24_seq384_static_opt.onnx -# Need to unzip aishell to the current directory. 
For details, refer to data.list -# tar -zxvf aishell.tar.gz +ln -s /mnt/deepspark/data/checkpoints/conformer_checkpoints.tar ./ +tar xf conformer_checkpoints.tar +cp /mnt/deepspark/data/datasets/aishell_test_data.tar ./ +tar xf aishell_test_data.tar +bash scripts/aishell_data_prepare.sh ./aishell_test_data ./tools \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/common.py b/models/audio/speech_recognition/conformer/igie/common.py new file mode 100644 index 0000000000000000000000000000000000000000..6081f807c3a709e8d73f1c1a6bc62185ddcdfc09 --- /dev/null +++ b/models/audio/speech_recognition/conformer/igie/common.py @@ -0,0 +1,107 @@ +import os +import cv2 +import glob +import torch +import tensorrt +import numpy as np +import cuda.cuda as cuda +import cuda.cudart as cudart + + +def trtapi(engine_file): + datatype = tensorrt.DataType.FLOAT + host_mem = tensorrt.IHostMemory + logger = tensorrt.Logger(tensorrt.Logger.ERROR) + with open(engine_file, "rb") as f, tensorrt.Runtime(logger) as runtime: + runtime = tensorrt.Runtime(logger) + assert runtime + engine = runtime.deserialize_cuda_engine(f.read()) + assert engine + context = engine.create_execution_context() + assert context + + return engine, context + + +def create_engine_context(engine_path, logger): + with open(engine_path, "rb") as f: + runtime = tensorrt.Runtime(logger) + assert runtime + engine = runtime.deserialize_cuda_engine(f.read()) + assert engine + context = engine.create_execution_context() + assert context + + return engine, context + +def get_io_bindings(engine): + # Setup I/O bindings + inputs = [] + outputs = [] + allocations = [] + + for i in range(engine.num_bindings): + is_input = False + if engine.binding_is_input(i): + is_input = True + name = engine.get_binding_name(i) + dtype = engine.get_binding_dtype(i) + shape = engine.get_binding_shape(i) + if is_input: + batch_size = shape[0] + size = np.dtype(tensorrt.nptype(dtype)).itemsize + for s in shape: + size *= s + err, allocation = cudart.cudaMalloc(size) + assert(err == cuda.CUresult.CUDA_SUCCESS) + binding = { + "index": i, + "name": name, + "dtype": np.dtype(tensorrt.nptype(dtype)), + "shape": list(shape), + "allocation": allocation, + "nbytes": size, + } + print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}") + allocations.append(allocation) + if engine.binding_is_input(i): + inputs.append(binding) + else: + outputs.append(binding) + return inputs, outputs, allocations + + +def setup_io_bindings(engine, context): + # Setup I/O bindings + inputs = [] + outputs = [] + allocations = [] + + for i in range(engine.num_bindings): + is_input = False + if engine.binding_is_input(i): + is_input = True + name = engine.get_binding_name(i) + dtype = engine.get_binding_dtype(i) + shape = context.get_binding_shape(i) + if is_input: + batch_size = shape[0] + size = np.dtype(tensorrt.nptype(dtype)).itemsize + for s in shape: + size *= s + err, allocation = cudart.cudaMalloc(size) + assert(err == cuda.CUresult.CUDA_SUCCESS) + binding = { + "index": i, + "name": name, + "dtype": np.dtype(tensorrt.nptype(dtype)), + "shape": list(shape), + "allocation": allocation, + "nbytes": size, + } + allocations.append(allocation) + if engine.binding_is_input(i): + inputs.append(binding) + else: + outputs.append(binding) + return inputs, outputs, allocations \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/config.yaml 
b/models/audio/speech_recognition/conformer/igie/config.yaml deleted file mode 100644 index a5bdda4d400d7d42ac88beb2c95d9c612db5a8dd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/config.yaml +++ /dev/null @@ -1,4 +0,0 @@ -beam_size: 4 -ctc_weight: -1.0 -fp16: false -reverse_weight: -1.0 diff --git a/models/audio/speech_recognition/conformer/igie/ctc_decoder/README.md b/models/audio/speech_recognition/conformer/igie/ctc_decoder/README.md deleted file mode 100644 index d722d30e57770e9b135ae72dbc0b9d90ae62933a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/ctc_decoder/README.md +++ /dev/null @@ -1,57 +0,0 @@ -## Installation - -We adapted this ctc decoder from [here](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/deepspeech/decoders/swig). -This decoder can only run on cpu. - -* continuous decoding for streaming asr -* support kenlm language model -* multiprocessing - -To install the decoder: -```bash -git clone https://github.com/Slyne/ctc_decoder.git -apt-get update -apt-get install swig -apt-get install python3-dev -cd ctc_decoder/swig && bash setup.sh -``` - -## Usage - -Please refer to ```swig/test/test_en.py``` and ```swig/test/test_zh.py``` for how to do streaming decoding and offline decoding w/o language model. - -### Adding language model -How to build the language model ? -You may refer to [kenlm](https://github.com/kpu/kenlm). -For Mandarin, the input text for language model should be like: -``` -好 好 学 习 ,天 天 向 上 ! -再 接 再 厉 -... -``` -There's a space between two characters. - -For English, the input text is just like the normal text. -``` -Share Market Today - Stock Market and Share Market Live Updates -``` - -How to add language model: -``` -alpha = 0.5 -beta = 0.5 -lm_path = '../kenlm/lm/test.arpa' -scorer = decoder.Scorer(alpha, beta, lm_path, vocab_list) -...... -result1 = decoder.ctc_beam_search_decoder_batch(batch_chunk_log_prob_seq, - batch_chunk_log_probs_idx, - batch_root_trie, - batch_start, - beam_size, num_processes, - blank_id, space_id, - cutoff_prob, scorer) -``` -How language model in called in this implementation of ctc prefix beam search ? - -If the language model is char based (like the Mandarin lm), it will call the language model scorer all the times. -If the language model is word based (like the English lm), it will only call the scorer whenever `space_id` is detected. diff --git a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/ctc_beam_search_decoder.cpp b/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/ctc_beam_search_decoder.cpp deleted file mode 100644 index 4e85faebd75f23134aa628aa4a236cc3297d58d4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/ctc_beam_search_decoder.cpp +++ /dev/null @@ -1,232 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "ctc_beam_search_decoder.h" -#include -#include -#include -#include -#include -#include -#include -#include "ThreadPool/ThreadPool.h" -#include "decoder_utils.h" -#include "fst/fstlib.h" -#include "path_trie.h" - -using FSTMATCH = fst::SortedMatcher; -std::vector>> ctc_beam_search_decoder( - const std::vector> &log_probs_seq, - const std::vector> &log_probs_idx, PathTrie &root, - const bool start, size_t beam_size, int blank_id, int space_id, - double cutoff_prob, Scorer *ext_scorer) { - if (start) { - if (ext_scorer != nullptr && !ext_scorer->is_character_based()) { - auto fst_dict = static_cast(ext_scorer->dictionary); - fst::StdVectorFst *dict_ptr = fst_dict->Copy(true); - root.set_dictionary(dict_ptr); - auto matcher = std::make_shared(*dict_ptr, fst::MATCH_INPUT); - root.set_matcher(matcher); - } - } - int timesteps = log_probs_seq.size(); - - std::vector prefixes; - - // update log probs - if (root.log_prob_b_prev == -NUM_FLT_INF && start) { - root.score = root.log_prob_b_prev = 0.0; - } - root.iterate_to_vec_only(prefixes); - int prev_id = -1; - // prefix search over time - for (size_t time_step = 0; time_step < timesteps; ++time_step) { - float min_cutoff = -NUM_FLT_INF; - bool full_beam = false; - - auto &log_prob = log_probs_seq[time_step]; - auto &log_prob_idx = log_probs_idx[time_step]; - - double top_prob = exp(log_prob[0]); - auto top_id = log_prob_idx[0]; - if (top_prob >= cutoff_prob && top_id == blank_id) - if (prev_id == blank_id) { - continue; // skip this round - } else - prev_id = top_id; - else - prev_id = -1; - - // loop over chars - double cur_acc_prob = 0.0; - for (size_t index = 0; index < log_prob.size(); index++) { - auto c = log_prob_idx[index]; - float log_prob_c = log_prob[index]; - cur_acc_prob += exp(log_prob_c); - if (cur_acc_prob > cutoff_prob && index >= 1) break; - for (size_t i = 0; i < prefixes.size() && i < beam_size; ++i) { - auto prefix = prefixes[i]; - if (full_beam && log_prob_c + prefix->score < min_cutoff) { - break; - } - // blank - if (c == blank_id) { - prefix->log_prob_b_cur = - log_sum_exp(prefix->log_prob_b_cur, log_prob_c + prefix->score); - continue; - } - // repeated character - if (c == prefix->character) { - prefix->log_prob_nb_cur = log_sum_exp( - prefix->log_prob_nb_cur, log_prob_c + prefix->log_prob_nb_prev); - } - // get new prefix - auto prefix_new = prefix->get_path_trie(c); - if (prefix_new != nullptr) { - float log_p = -NUM_FLT_INF; - - if (c == prefix->character && - prefix->log_prob_b_prev > -NUM_FLT_INF) { - log_p = log_prob_c + prefix->log_prob_b_prev; - } else if (c != prefix->character) { - log_p = log_prob_c + prefix->score; - } - - // language model scoring - if (ext_scorer != nullptr && - (c == space_id || ext_scorer->is_character_based())) { - PathTrie *prefix_to_score = nullptr; - // skip scoring the space - if (ext_scorer->is_character_based()) { - prefix_to_score = prefix_new; - } 
else { - prefix_to_score = prefix; - } - float score = 0.0; - std::vector ngram; - ngram = ext_scorer->make_ngram(prefix_to_score); - score = ext_scorer->get_log_cond_prob(ngram) * ext_scorer->alpha; - log_p += score; - log_p += ext_scorer->beta; - } - - prefix_new->log_prob_nb_cur = - log_sum_exp(prefix_new->log_prob_nb_cur, log_p); - } - } // end of loop over prefix - } // end of loop over vocabulary - prefixes.clear(); - // update log probs - root.iterate_to_vec(prefixes); - // only preserve top beam_size prefixes - if (prefixes.size() >= beam_size) { - std::nth_element(prefixes.begin(), prefixes.begin() + beam_size, - prefixes.end(), prefix_compare); - for (size_t i = beam_size; i < prefixes.size(); ++i) { - prefixes[i]->remove(); - } - } - } // end of loop over time - size_t num_prefixes = std::min(prefixes.size(), beam_size); - std::sort(prefixes.begin(), prefixes.begin() + num_prefixes, prefix_compare); - return get_beam_search_result(prefixes, beam_size); -} - -std::string map_sent(const std::vector &sent, - const std::vector &vocabulary, bool greedy, - int blank_id) { - std::string output_str; - - if (!greedy) { - for (size_t j = 0; j < sent.size(); j++) { - output_str += vocabulary[sent[j]]; - } - } else { - // greedy search - int prev = -1; - for (size_t i = 0; i < sent.size(); i++) { - int cur = sent[i]; - if (cur != prev && cur != blank_id) output_str += vocabulary[cur]; - prev = cur; - } - } - return output_str; -} - -std::vector map_batch( - const std::vector> &batch_sents, - const std::vector &vocabulary, size_t num_processes, - bool greedy, int blank_id) { - ThreadPool pool(num_processes); - size_t batch_size = batch_sents.size(); - std::vector> res; - for (size_t i = 0; i < batch_size; ++i) { - res.emplace_back(pool.enqueue(map_sent, std::ref(batch_sents[i]), - std::ref(vocabulary), greedy, blank_id)); - } - // get decoding results - std::vector batch_results; - for (size_t i = 0; i < batch_size; ++i) { - batch_results.emplace_back(res[i].get()); - } - return batch_results; -} - -std::vector>>> -ctc_beam_search_decoder_batch( - const std::vector>> &batch_log_probs_seq, - const std::vector>> &batch_log_probs_idx, - std::vector &batch_root_trie, - const std::vector &batch_start, size_t beam_size, - size_t num_processes, int blank_id, int space_id, double cutoff_prob, - Scorer *ext_scorer) { - // thread pool - ThreadPool pool(num_processes); - // number of samples - size_t batch_size = batch_log_probs_seq.size(); - - // enqueue the tasks of decoding - - std::vector>>>> - res; - - for (size_t i = 0; i < batch_size; ++i) { - res.emplace_back( - pool.enqueue(ctc_beam_search_decoder, std::ref(batch_log_probs_seq[i]), - std::ref(batch_log_probs_idx[i]), - std::ref(*batch_root_trie[i]), batch_start[i], beam_size, - blank_id, space_id, cutoff_prob, ext_scorer)); - } - - // get decoding results - std::vector>>> batch_results; - for (size_t i = 0; i < batch_size; ++i) { - batch_results.emplace_back(res[i].get()); - } - return batch_results; -} diff --git a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/ctc_beam_search_decoder.h b/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/ctc_beam_search_decoder.h deleted file mode 100644 index 8b3921456a73843b1ce9a9936e4fb083c9ae12a3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/ctc_beam_search_decoder.h +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef CTC_BEAM_SEARCH_DECODER_H_ -#define CTC_BEAM_SEARCH_DECODER_H_ - -#include -#include -#include -#include "path_trie.h" -#include "scorer.h" - -/* CTC Beam Search Decoder - - * Parameters: - * log_probs_seq: 2-D vector that each element is a vector of log - probabilities - * for one time step, it is sorted (topk) - * log_probs_idx: 2-D vector that the index of every element in - log_probs_seq - * topk index - * root: A PathTrie root - * start: whether this the first chunk of this sequence - * beam_size: The width of beam search. - * blank_id: default is 0 - * space_id: default is -1 - * cutoff_prob: Cutoff probability for pruning. - * ext_scorer: External scorer to evaluate a prefix, which consists of - * n-gram language model scoring and word insertion term. - * Default null, decoding the input sample without scorer. - * Return: - * A vector that each element is a pair of score and decoding result, - * in desending order. -*/ -std::vector>> ctc_beam_search_decoder( - const std::vector> &log_probs_seq, - const std::vector> &log_probs_idx, PathTrie &root, - const bool start, size_t beam_size, int blank_id = 0, int space_id = -1, - double cutoff_prob = 0.999, Scorer *ext_scorer = nullptr); - -/* CTC Beam Search Decoder for batch data - - * Parameters: - * batch_log_probs_seq: 3-D vector that each element is a 2-D vector that - can be used - * by ctc_beam_search_decoder(). - * batch_log_probs_idx: 3-D vector that each element is a 2-D vector that - can be used - * by ctc_beam_search_decoder(). - * batch_root_trie: a batch of Path trie for each sequence - * batch_start: a batch of boolean value to indicate whether this is the - first - * chunk of each sequence - * beam_size: The width of beam search. - * num_processes: Number of threads for beam search. - * blank_id: default blank_id is 0 - * space_id: default space_id is -1, this is for word based scorer - * cutoff_prob: Cutoff probability for pruning. - * ext_scorer: External scorer to evaluate a prefix, which consists of - * n-gram language model scoring and word insertion term. - * Default null, decoding the input sample without scorer. - * Return: - * A 2-D vector that each element is a vector of beam search decoding - * result for one audio sample. 
-*/ -std::vector>>> -ctc_beam_search_decoder_batch( - const std::vector>> &batch_log_probs_seq, - const std::vector>> &batch_log_probs_idx, - std::vector &batch_root_trie, - const std::vector &batch_start, size_t beam_size, - size_t num_processes, int blank_id = 0, int space_id = -1, - double cutoff_prob = 0.999, Scorer *ext_scorer = nullptr); - -/* Map vector of int to string - - * Parameters: - * sent: a vector of int ids - * vocabulary: vocabulary - * Return: - * A decoded string -*/ -std::string map_sent(const std::vector &sent, - const std::vector &vocabulary, - bool greedy = false, int blank_id = 0); - -/* Map batch vector of int to string - - * Parameters: - * batch_sents: a batch of vector of int ids - * vocabulary: vocabulary - * num_processes: number of processes to use - * Return: - * A vector decoded string -*/ -std::vector map_batch( - const std::vector> &batch_sents, - const std::vector &vocabulary, size_t num_processes, - bool greedy = false, int blank_id = 0); - -#endif // CTC_BEAM_SEARCH_DECODER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/decoder_utils.cpp b/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/decoder_utils.cpp deleted file mode 100644 index 404fefda7188cfdd967a6b1d8a4b733e898a3a79..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/decoder_utils.cpp +++ /dev/null @@ -1,180 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "decoder_utils.h" - -#include -#include -#include - -std::vector> get_pruned_log_probs( - const std::vector &prob_step, double cutoff_prob, - size_t cutoff_top_n) { - std::vector> prob_idx; - for (size_t i = 0; i < prob_step.size(); ++i) { - prob_idx.push_back(std::pair(i, prob_step[i])); - } - // pruning of vacobulary - size_t cutoff_len = prob_step.size(); - if (cutoff_prob < 1.0 || cutoff_top_n < cutoff_len) { - std::sort(prob_idx.begin(), prob_idx.end(), - pair_comp_second_rev); - if (cutoff_prob < 1.0) { - double cum_prob = 0.0; - cutoff_len = 0; - for (size_t i = 0; i < prob_idx.size(); ++i) { - cum_prob += prob_idx[i].second; - cutoff_len += 1; - if (cum_prob >= cutoff_prob || cutoff_len >= cutoff_top_n) break; - } - } - prob_idx = std::vector>( - prob_idx.begin(), prob_idx.begin() + cutoff_len); - } - std::vector> log_prob_idx; - for (size_t i = 0; i < cutoff_len; ++i) { - log_prob_idx.push_back(std::pair( - prob_idx[i].first, log(prob_idx[i].second + NUM_FLT_MIN))); - } - return log_prob_idx; -} - -std::vector>> get_beam_search_result( - const std::vector &prefixes, size_t beam_size) { - // allow for the post processing - std::vector space_prefixes; - if (space_prefixes.empty()) { - for (size_t i = 0; i < beam_size && i < prefixes.size(); ++i) { - space_prefixes.push_back(prefixes[i]); - } - } - - std::sort(space_prefixes.begin(), space_prefixes.end(), prefix_compare); - std::vector>> output_vecs; - for (size_t i = 0; i < beam_size && i < space_prefixes.size(); ++i) { - std::vector output; - space_prefixes[i]->get_path_vec(output); - // convert index to string - - std::pair> output_pair(space_prefixes[i]->score, - output); - output_vecs.emplace_back(output_pair); - } - return output_vecs; -} - -size_t get_utf8_str_len(const std::string &str) { - size_t str_len = 0; - for (char c : str) { - str_len += ((c & 0xc0) != 0x80); - } - return str_len; -} - -std::vector split_utf8_str(const std::string &str) { - std::vector result; - std::string out_str; - - for (char c : str) { - if ((c & 0xc0) != 0x80) // new UTF-8 character - { - if (!out_str.empty()) { - result.push_back(out_str); - out_str.clear(); - } - } - - out_str.append(1, c); - } - result.push_back(out_str); - return result; -} - -std::vector split_str(const std::string &s, - const std::string &delim) { - std::vector result; - std::size_t start = 0, delim_len = delim.size(); - while (true) { - std::size_t end = s.find(delim, start); - if (end == std::string::npos) { - if (start < s.size()) { - result.push_back(s.substr(start)); - } - break; - } - if (end > start) { - result.push_back(s.substr(start, end - start)); - } - start = end + delim_len; - } - return result; -} - -bool prefix_compare(const PathTrie *x, const PathTrie *y) { - if (x->score == y->score) { - if (x->character == y->character) { - return false; - } else { - return (x->character < y->character); - } - } else { - return x->score > y->score; - } -} - -void add_word_to_fst(const std::vector &word, - fst::StdVectorFst *dictionary) { - if (dictionary->NumStates() == 0) { - fst::StdVectorFst::StateId start = dictionary->AddState(); - assert(start == 0); - dictionary->SetStart(start); - } - fst::StdVectorFst::StateId src = dictionary->Start(); - fst::StdVectorFst::StateId dst; - for (auto c : word) { - dst = dictionary->AddState(); - dictionary->AddArc(src, fst::StdArc(c, c, 0, dst)); - src = dst; - } - dictionary->SetFinal(dst, fst::StdArc::Weight::One()); -} - -bool add_word_to_dictionary( - const std::string &word, - const std::unordered_map &char_map, 
bool add_space, - int SPACE_ID, fst::StdVectorFst *dictionary) { - auto characters = split_utf8_str(word); - - std::vector int_word; - - for (auto &c : characters) { - if (c == " ") { - int_word.push_back(SPACE_ID); - } else { - auto int_c = char_map.find(c); - if (int_c != char_map.end()) { - int_word.push_back(int_c->second); - } else { - return false; // return without adding - } - } - } - - if (add_space) { - int_word.push_back(SPACE_ID); - } - - add_word_to_fst(int_word, dictionary); - return true; // return with successful adding -} diff --git a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/decoder_utils.h b/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/decoder_utils.h deleted file mode 100644 index 4d100fb3af5835ec3e08e9f53407948180c4cc66..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/decoder_utils.h +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef DECODER_UTILS_H_ -#define DECODER_UTILS_H_ - -#include -#include "fst/log.h" -#include "path_trie.h" - -const float NUM_FLT_INF = std::numeric_limits::max(); -const float NUM_FLT_MIN = std::numeric_limits::min(); - -// inline function for validation check -inline void check(bool x, const char *expr, const char *file, int line, - const char *err) { - if (!x) { - std::cout << "[" << file << ":" << line << "] "; - LOG(FATAL) << "\"" << expr << "\" check failed. 
" << err; - } -} - -#define VALID_CHECK(x, info) \ - check(static_cast(x), #x, __FILE__, __LINE__, info) -#define VALID_CHECK_EQ(x, y, info) VALID_CHECK((x) == (y), info) -#define VALID_CHECK_GT(x, y, info) VALID_CHECK((x) > (y), info) -#define VALID_CHECK_LT(x, y, info) VALID_CHECK((x) < (y), info) - -// Function template for comparing two pairs -template -bool pair_comp_first_rev(const std::pair &a, - const std::pair &b) { - return a.first > b.first; -} - -// Function template for comparing two pairs -template -bool pair_comp_second_rev(const std::pair &a, - const std::pair &b) { - return a.second > b.second; -} - -// Return the sum of two probabilities in log scale -template -T log_sum_exp(const T &x, const T &y) { - static T num_min = -std::numeric_limits::max(); - if (x <= num_min) return y; - if (y <= num_min) return x; - T xmax = std::max(x, y); - return std::log(std::exp(x - xmax) + std::exp(y - xmax)) + xmax; -} - -// Get pruned probability vector for each time step's beam search -std::vector> get_pruned_log_probs( - const std::vector &prob_step, double cutoff_prob, - size_t cutoff_top_n); - -// Get beam search result from prefixes in trie tree -std::vector> get_beam_search_result( - const std::vector &prefixes, - const std::vector &vocabulary, size_t beam_size); - -std::vector>> get_beam_search_result( - const std::vector &prefixes, size_t beam_size); - -// Functor for prefix comparsion -bool prefix_compare(const PathTrie *x, const PathTrie *y); - -/* Get length of utf8 encoding string - * See: http://stackoverflow.com/a/4063229 - */ -size_t get_utf8_str_len(const std::string &str); - -/* Split a string into a list of strings on a given string - * delimiter. NB: delimiters on beginning / end of string are - * trimmed. Eg, "FooBarFoo" split on "Foo" returns ["Bar"]. 
- */ -std::vector split_str(const std::string &s, - const std::string &delim); - -/* Splits string into vector of strings representing - * UTF-8 characters (not same as chars) - */ -std::vector split_utf8_str(const std::string &str); - -// Add a word in index to the dicionary of fst -void add_word_to_fst(const std::vector &word, - fst::StdVectorFst *dictionary); - -// Add a word in string to dictionary -bool add_word_to_dictionary( - const std::string &word, - const std::unordered_map &char_map, bool add_space, - int SPACE_ID, fst::StdVectorFst *dictionary); -#endif // DECODER_UTILS_H diff --git a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/decoders.i b/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/decoders.i deleted file mode 100644 index a53ab46ab1aa32607d7b71dbb252e13c478d9dc6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/decoders.i +++ /dev/null @@ -1,37 +0,0 @@ -%module swig_decoders -%{ -#include "scorer.h" -#include "ctc_beam_search_decoder.h" -#include "decoder_utils.h" -#include "path_trie.h" -%} - -%include "std_vector.i" -%include "std_pair.i" -%include "std_string.i" -%include "path_trie.h" -%import "decoder_utils.h" - -namespace std { - %template(DoubleVector) std::vector; - %template(IntVector) std::vector; - %template(StringVector) std::vector; - %template(VectorOfStructVectorDouble) std::vector >; - %template(VectorOfStructVectorInt) std::vector>; - %template(FloatVector) std::vector; - %template(Pair) std::pair>; - %template(PairFloatVectorVector) std::vector>>; - %template(PairDoubleVectorVector) std::vector>>; - %template(PairDoubleVectorVector2) std::vector>>>; - %template(DoubleVector3) std::vector>>; - %template(IntVector3) std::vector>>; - %template(TrieVector) std::vector; - %template(BoolVector) std::vector; -} -%template(IntDoublePairCompSecondRev) pair_comp_second_rev; -%template(StringDoublePairCompSecondRev) pair_comp_second_rev; -%template(DoubleStringPairCompFirstRev) pair_comp_first_rev; - -%include "scorer.h" -%include "path_trie.h" -%include "ctc_beam_search_decoder.h" diff --git a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/decoders_wrap.cxx b/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/decoders_wrap.cxx deleted file mode 100644 index b9c3c8aea63dd6efc29f5b937adc615b8835e0ae..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/decoders_wrap.cxx +++ /dev/null @@ -1,38204 +0,0 @@ -/* ---------------------------------------------------------------------------- - * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.12 - * - * This file is not intended to be easily readable and contains a number of - * coding conventions designed to improve portability and efficiency. Do not make - * changes to this file unless you know what you are doing--modify the SWIG - * interface file instead. 
- * ----------------------------------------------------------------------------- */ - - -#ifndef SWIGPYTHON -#define SWIGPYTHON -#endif - -#define SWIG_PYTHON_DIRECTOR_NO_VTABLE - - -#ifdef __cplusplus -/* SwigValueWrapper is described in swig.swg */ -template class SwigValueWrapper { - struct SwigMovePointer { - T *ptr; - SwigMovePointer(T *p) : ptr(p) { } - ~SwigMovePointer() { delete ptr; } - SwigMovePointer& operator=(SwigMovePointer& rhs) { T* oldptr = ptr; ptr = 0; delete oldptr; ptr = rhs.ptr; rhs.ptr = 0; return *this; } - } pointer; - SwigValueWrapper& operator=(const SwigValueWrapper& rhs); - SwigValueWrapper(const SwigValueWrapper& rhs); -public: - SwigValueWrapper() : pointer(0) { } - SwigValueWrapper& operator=(const T& t) { SwigMovePointer tmp(new T(t)); pointer = tmp; return *this; } - operator T&() const { return *pointer.ptr; } - T *operator&() { return pointer.ptr; } -}; - -template T SwigValueInit() { - return T(); -} -#endif - -/* ----------------------------------------------------------------------------- - * This section contains generic SWIG labels for method/variable - * declarations/attributes, and other compiler dependent labels. - * ----------------------------------------------------------------------------- */ - -/* template workaround for compilers that cannot correctly implement the C++ standard */ -#ifndef SWIGTEMPLATEDISAMBIGUATOR -# if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x560) -# define SWIGTEMPLATEDISAMBIGUATOR template -# elif defined(__HP_aCC) -/* Needed even with `aCC -AA' when `aCC -V' reports HP ANSI C++ B3910B A.03.55 */ -/* If we find a maximum version that requires this, the test would be __HP_aCC <= 35500 for A.03.55 */ -# define SWIGTEMPLATEDISAMBIGUATOR template -# else -# define SWIGTEMPLATEDISAMBIGUATOR -# endif -#endif - -/* inline attribute */ -#ifndef SWIGINLINE -# if defined(__cplusplus) || (defined(__GNUC__) && !defined(__STRICT_ANSI__)) -# define SWIGINLINE inline -# else -# define SWIGINLINE -# endif -#endif - -/* attribute recognised by some compilers to avoid 'unused' warnings */ -#ifndef SWIGUNUSED -# if defined(__GNUC__) -# if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) -# define SWIGUNUSED __attribute__ ((__unused__)) -# else -# define SWIGUNUSED -# endif -# elif defined(__ICC) -# define SWIGUNUSED __attribute__ ((__unused__)) -# else -# define SWIGUNUSED -# endif -#endif - -#ifndef SWIG_MSC_UNSUPPRESS_4505 -# if defined(_MSC_VER) -# pragma warning(disable : 4505) /* unreferenced local function has been removed */ -# endif -#endif - -#ifndef SWIGUNUSEDPARM -# ifdef __cplusplus -# define SWIGUNUSEDPARM(p) -# else -# define SWIGUNUSEDPARM(p) p SWIGUNUSED -# endif -#endif - -/* internal SWIG method */ -#ifndef SWIGINTERN -# define SWIGINTERN static SWIGUNUSED -#endif - -/* internal inline SWIG method */ -#ifndef SWIGINTERNINLINE -# define SWIGINTERNINLINE SWIGINTERN SWIGINLINE -#endif - -/* exporting methods */ -#if defined(__GNUC__) -# if (__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) -# ifndef GCC_HASCLASSVISIBILITY -# define GCC_HASCLASSVISIBILITY -# endif -# endif -#endif - -#ifndef SWIGEXPORT -# if defined(_WIN32) || defined(__WIN32__) || defined(__CYGWIN__) -# if defined(STATIC_LINKED) -# define SWIGEXPORT -# else -# define SWIGEXPORT __declspec(dllexport) -# endif -# else -# if defined(__GNUC__) && defined(GCC_HASCLASSVISIBILITY) -# define SWIGEXPORT __attribute__ ((visibility("default"))) -# else -# define SWIGEXPORT -# endif -# endif -#endif - -/* calling conventions 
for Windows */ -#ifndef SWIGSTDCALL -# if defined(_WIN32) || defined(__WIN32__) || defined(__CYGWIN__) -# define SWIGSTDCALL __stdcall -# else -# define SWIGSTDCALL -# endif -#endif - -/* Deal with Microsoft's attempt at deprecating C standard runtime functions */ -#if !defined(SWIG_NO_CRT_SECURE_NO_DEPRECATE) && defined(_MSC_VER) && !defined(_CRT_SECURE_NO_DEPRECATE) -# define _CRT_SECURE_NO_DEPRECATE -#endif - -/* Deal with Microsoft's attempt at deprecating methods in the standard C++ library */ -#if !defined(SWIG_NO_SCL_SECURE_NO_DEPRECATE) && defined(_MSC_VER) && !defined(_SCL_SECURE_NO_DEPRECATE) -# define _SCL_SECURE_NO_DEPRECATE -#endif - -/* Deal with Apple's deprecated 'AssertMacros.h' from Carbon-framework */ -#if defined(__APPLE__) && !defined(__ASSERT_MACROS_DEFINE_VERSIONS_WITHOUT_UNDERSCORES) -# define __ASSERT_MACROS_DEFINE_VERSIONS_WITHOUT_UNDERSCORES 0 -#endif - -/* Intel's compiler complains if a variable which was never initialised is - * cast to void, which is a common idiom which we use to indicate that we - * are aware a variable isn't used. So we just silence that warning. - * See: https://github.com/swig/swig/issues/192 for more discussion. - */ -#ifdef __INTEL_COMPILER -# pragma warning disable 592 -#endif - - -#if defined(_DEBUG) && defined(SWIG_PYTHON_INTERPRETER_NO_DEBUG) -/* Use debug wrappers with the Python release dll */ -# undef _DEBUG -# include -# define _DEBUG -#else -# include -#endif - -/* ----------------------------------------------------------------------------- - * swigrun.swg - * - * This file contains generic C API SWIG runtime support for pointer - * type checking. - * ----------------------------------------------------------------------------- */ - -/* This should only be incremented when either the layout of swig_type_info changes, - or for whatever reason, the runtime changes incompatibly */ -#define SWIG_RUNTIME_VERSION "4" - -/* define SWIG_TYPE_TABLE_NAME as "SWIG_TYPE_TABLE" */ -#ifdef SWIG_TYPE_TABLE -# define SWIG_QUOTE_STRING(x) #x -# define SWIG_EXPAND_AND_QUOTE_STRING(x) SWIG_QUOTE_STRING(x) -# define SWIG_TYPE_TABLE_NAME SWIG_EXPAND_AND_QUOTE_STRING(SWIG_TYPE_TABLE) -#else -# define SWIG_TYPE_TABLE_NAME -#endif - -/* - You can use the SWIGRUNTIME and SWIGRUNTIMEINLINE macros for - creating a static or dynamic library from the SWIG runtime code. - In 99.9% of the cases, SWIG just needs to declare them as 'static'. - - But only do this if strictly necessary, ie, if you have problems - with your compiler or suchlike. -*/ - -#ifndef SWIGRUNTIME -# define SWIGRUNTIME SWIGINTERN -#endif - -#ifndef SWIGRUNTIMEINLINE -# define SWIGRUNTIMEINLINE SWIGRUNTIME SWIGINLINE -#endif - -/* Generic buffer size */ -#ifndef SWIG_BUFFER_SIZE -# define SWIG_BUFFER_SIZE 1024 -#endif - -/* Flags for pointer conversions */ -#define SWIG_POINTER_DISOWN 0x1 -#define SWIG_CAST_NEW_MEMORY 0x2 - -/* Flags for new pointer objects */ -#define SWIG_POINTER_OWN 0x1 - - -/* - Flags/methods for returning states. - - The SWIG conversion methods, as ConvertPtr, return an integer - that tells if the conversion was successful or not. And if not, - an error code can be returned (see swigerrors.swg for the codes). - - Use the following macros/flags to set or process the returning - states. 
- - In old versions of SWIG, code such as the following was usually written: - - if (SWIG_ConvertPtr(obj,vptr,ty.flags) != -1) { - // success code - } else { - //fail code - } - - Now you can be more explicit: - - int res = SWIG_ConvertPtr(obj,vptr,ty.flags); - if (SWIG_IsOK(res)) { - // success code - } else { - // fail code - } - - which is the same really, but now you can also do - - Type *ptr; - int res = SWIG_ConvertPtr(obj,(void **)(&ptr),ty.flags); - if (SWIG_IsOK(res)) { - // success code - if (SWIG_IsNewObj(res) { - ... - delete *ptr; - } else { - ... - } - } else { - // fail code - } - - I.e., now SWIG_ConvertPtr can return new objects and you can - identify the case and take care of the deallocation. Of course that - also requires SWIG_ConvertPtr to return new result values, such as - - int SWIG_ConvertPtr(obj, ptr,...) { - if () { - if () { - *ptr = ; - return SWIG_NEWOBJ; - } else { - *ptr = ; - return SWIG_OLDOBJ; - } - } else { - return SWIG_BADOBJ; - } - } - - Of course, returning the plain '0(success)/-1(fail)' still works, but you can be - more explicit by returning SWIG_BADOBJ, SWIG_ERROR or any of the - SWIG errors code. - - Finally, if the SWIG_CASTRANK_MODE is enabled, the result code - allows to return the 'cast rank', for example, if you have this - - int food(double) - int fooi(int); - - and you call - - food(1) // cast rank '1' (1 -> 1.0) - fooi(1) // cast rank '0' - - just use the SWIG_AddCast()/SWIG_CheckState() -*/ - -#define SWIG_OK (0) -#define SWIG_ERROR (-1) -#define SWIG_IsOK(r) (r >= 0) -#define SWIG_ArgError(r) ((r != SWIG_ERROR) ? r : SWIG_TypeError) - -/* The CastRankLimit says how many bits are used for the cast rank */ -#define SWIG_CASTRANKLIMIT (1 << 8) -/* The NewMask denotes the object was created (using new/malloc) */ -#define SWIG_NEWOBJMASK (SWIG_CASTRANKLIMIT << 1) -/* The TmpMask is for in/out typemaps that use temporal objects */ -#define SWIG_TMPOBJMASK (SWIG_NEWOBJMASK << 1) -/* Simple returning values */ -#define SWIG_BADOBJ (SWIG_ERROR) -#define SWIG_OLDOBJ (SWIG_OK) -#define SWIG_NEWOBJ (SWIG_OK | SWIG_NEWOBJMASK) -#define SWIG_TMPOBJ (SWIG_OK | SWIG_TMPOBJMASK) -/* Check, add and del mask methods */ -#define SWIG_AddNewMask(r) (SWIG_IsOK(r) ? (r | SWIG_NEWOBJMASK) : r) -#define SWIG_DelNewMask(r) (SWIG_IsOK(r) ? (r & ~SWIG_NEWOBJMASK) : r) -#define SWIG_IsNewObj(r) (SWIG_IsOK(r) && (r & SWIG_NEWOBJMASK)) -#define SWIG_AddTmpMask(r) (SWIG_IsOK(r) ? (r | SWIG_TMPOBJMASK) : r) -#define SWIG_DelTmpMask(r) (SWIG_IsOK(r) ? (r & ~SWIG_TMPOBJMASK) : r) -#define SWIG_IsTmpObj(r) (SWIG_IsOK(r) && (r & SWIG_TMPOBJMASK)) - -/* Cast-Rank Mode */ -#if defined(SWIG_CASTRANK_MODE) -# ifndef SWIG_TypeRank -# define SWIG_TypeRank unsigned long -# endif -# ifndef SWIG_MAXCASTRANK /* Default cast allowed */ -# define SWIG_MAXCASTRANK (2) -# endif -# define SWIG_CASTRANKMASK ((SWIG_CASTRANKLIMIT) -1) -# define SWIG_CastRank(r) (r & SWIG_CASTRANKMASK) -SWIGINTERNINLINE int SWIG_AddCast(int r) { - return SWIG_IsOK(r) ? ((SWIG_CastRank(r) < SWIG_MAXCASTRANK) ? (r + 1) : SWIG_ERROR) : r; -} -SWIGINTERNINLINE int SWIG_CheckState(int r) { - return SWIG_IsOK(r) ? SWIG_CastRank(r) + 1 : 0; -} -#else /* no cast-rank mode */ -# define SWIG_AddCast(r) (r) -# define SWIG_CheckState(r) (SWIG_IsOK(r) ? 
[Remainder of this hunk: deletion of the SWIG-generated Python wrapper source for the CTC beam-search decoder bindings (PathTrie, Scorer, StringPiece, fst::StdVectorFst, lm::WordIndex, and the associated std::vector/std::pair containers). The deleted file is standard SWIG runtime boilerplate: swig_type_info / swig_cast_info tables with move-to-front type checking and binary search over mangled type names, hex packing/unpacking of raw pointers, Python 2/3 compatibility macros, error-code-to-exception mapping, the SwigPyObject and SwigPyPacked proxy types, pointer conversion and shadow-instance creation, capsule-based module registration, and the generated SWIGTYPE_* type table. The original generated code is not reproduced here.]
swig_types[41] -#define SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t swig_types[42] -#define SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t swig_types[43] -#define SWIGTYPE_p_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t swig_types[44] -#define SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t swig_types[45] -#define SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t swig_types[46] -#define SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t swig_types[47] -#define SWIGTYPE_p_swig__SwigPyIterator swig_types[48] -#define SWIGTYPE_p_value_type swig_types[49] -#define SWIGTYPE_p_void swig_types[50] -static swig_type_info *swig_types[52]; -static swig_module_info swig_module = {swig_types, 51, 0, 0, 0, 0}; -#define SWIG_TypeQuery(name) SWIG_TypeQueryModule(&swig_module, &swig_module, name) -#define SWIG_MangledTypeQuery(name) SWIG_MangledTypeQueryModule(&swig_module, &swig_module, name) - -/* -------- TYPES TABLE (END) -------- */ - -#if (PY_VERSION_HEX <= 0x02000000) -# if !defined(SWIG_PYTHON_CLASSIC) -# error "This python version requires swig to be run with the '-classic' option" -# endif -#endif - -/*----------------------------------------------- - @(target):= _swig_decoders.so - ------------------------------------------------*/ -#if PY_VERSION_HEX >= 0x03000000 -# define SWIG_init PyInit__swig_decoders - -#else -# define SWIG_init init_swig_decoders - -#endif -#define SWIG_name "_swig_decoders" - -#define SWIGVERSION 0x030012 -#define SWIG_VERSION SWIGVERSION - - -#define SWIG_as_voidptr(a) const_cast< void * >(static_cast< const void * >(a)) -#define SWIG_as_voidptrptr(a) ((void)SWIG_as_voidptr(*a),reinterpret_cast< void** >(a)) - - -#include - - -namespace swig { - class SwigPtr_PyObject { - protected: - PyObject *_obj; - - public: - SwigPtr_PyObject() :_obj(0) - { - } - - SwigPtr_PyObject(const SwigPtr_PyObject& item) : _obj(item._obj) - { - SWIG_PYTHON_THREAD_BEGIN_BLOCK; - Py_XINCREF(_obj); - SWIG_PYTHON_THREAD_END_BLOCK; - } - - SwigPtr_PyObject(PyObject *obj, bool initial_ref = true) :_obj(obj) - { - if (initial_ref) { - SWIG_PYTHON_THREAD_BEGIN_BLOCK; - Py_XINCREF(_obj); - SWIG_PYTHON_THREAD_END_BLOCK; - } - } - - SwigPtr_PyObject & operator=(const SwigPtr_PyObject& item) - { - SWIG_PYTHON_THREAD_BEGIN_BLOCK; - Py_XINCREF(item._obj); - Py_XDECREF(_obj); - _obj = item._obj; - SWIG_PYTHON_THREAD_END_BLOCK; - return *this; - } - - ~SwigPtr_PyObject() - { - SWIG_PYTHON_THREAD_BEGIN_BLOCK; - Py_XDECREF(_obj); - SWIG_PYTHON_THREAD_END_BLOCK; - } - - operator PyObject *() const - { - return _obj; - } - - PyObject *operator->() const - { - return 
_obj; - } - }; -} - - -namespace swig { - struct SwigVar_PyObject : SwigPtr_PyObject { - SwigVar_PyObject(PyObject* obj = 0) : SwigPtr_PyObject(obj, false) { } - - SwigVar_PyObject & operator = (PyObject* obj) - { - Py_XDECREF(_obj); - _obj = obj; - return *this; - } - }; -} - - -#include "scorer.h" -#include "ctc_beam_search_decoder.h" -#include "decoder_utils.h" -#include "path_trie.h" - - -#include - -#if PY_VERSION_HEX >= 0x03020000 -# define SWIGPY_SLICE_ARG(obj) ((PyObject*) (obj)) -#else -# define SWIGPY_SLICE_ARG(obj) ((PySliceObject*) (obj)) -#endif - - -#include -#include - - -#if defined(__GNUC__) -# if __GNUC__ == 2 && __GNUC_MINOR <= 96 -# define SWIG_STD_NOMODERN_STL -# endif -#endif - - -#include - - -#include - - -namespace swig { - struct stop_iteration { - }; - - struct SwigPyIterator { - private: - SwigPtr_PyObject _seq; - - protected: - SwigPyIterator(PyObject *seq) : _seq(seq) - { - } - - public: - virtual ~SwigPyIterator() {} - - // Access iterator method, required by Python - virtual PyObject *value() const = 0; - - // Forward iterator method, required by Python - virtual SwigPyIterator *incr(size_t n = 1) = 0; - - // Backward iterator method, very common in C++, but not required in Python - virtual SwigPyIterator *decr(size_t /*n*/ = 1) - { - throw stop_iteration(); - } - - // Random access iterator methods, but not required in Python - virtual ptrdiff_t distance(const SwigPyIterator &/*x*/) const - { - throw std::invalid_argument("operation not supported"); - } - - virtual bool equal (const SwigPyIterator &/*x*/) const - { - throw std::invalid_argument("operation not supported"); - } - - // C++ common/needed methods - virtual SwigPyIterator *copy() const = 0; - - PyObject *next() - { - SWIG_PYTHON_THREAD_BEGIN_BLOCK; // disable threads - PyObject *obj = value(); - incr(); - SWIG_PYTHON_THREAD_END_BLOCK; // re-enable threads - return obj; - } - - /* Make an alias for Python 3.x */ - PyObject *__next__() - { - return next(); - } - - PyObject *previous() - { - SWIG_PYTHON_THREAD_BEGIN_BLOCK; // disable threads - decr(); - PyObject *obj = value(); - SWIG_PYTHON_THREAD_END_BLOCK; // re-enable threads - return obj; - } - - SwigPyIterator *advance(ptrdiff_t n) - { - return (n > 0) ? incr(n) : decr(-n); - } - - bool operator == (const SwigPyIterator& x) const - { - return equal(x); - } - - bool operator != (const SwigPyIterator& x) const - { - return ! 
operator==(x); - } - - SwigPyIterator& operator += (ptrdiff_t n) - { - return *advance(n); - } - - SwigPyIterator& operator -= (ptrdiff_t n) - { - return *advance(-n); - } - - SwigPyIterator* operator + (ptrdiff_t n) const - { - return copy()->advance(n); - } - - SwigPyIterator* operator - (ptrdiff_t n) const - { - return copy()->advance(-n); - } - - ptrdiff_t operator - (const SwigPyIterator& x) const - { - return x.distance(*this); - } - - static swig_type_info* descriptor() { - static int init = 0; - static swig_type_info* desc = 0; - if (!init) { - desc = SWIG_TypeQuery("swig::SwigPyIterator *"); - init = 1; - } - return desc; - } - }; - -#if defined(SWIGPYTHON_BUILTIN) - inline PyObject* make_output_iterator_builtin (PyObject *pyself) - { - Py_INCREF(pyself); - return pyself; - } -#endif -} - - -SWIGINTERN int -SWIG_AsVal_double (PyObject *obj, double *val) -{ - int res = SWIG_TypeError; - if (PyFloat_Check(obj)) { - if (val) *val = PyFloat_AsDouble(obj); - return SWIG_OK; -#if PY_VERSION_HEX < 0x03000000 - } else if (PyInt_Check(obj)) { - if (val) *val = (double) PyInt_AsLong(obj); - return SWIG_OK; -#endif - } else if (PyLong_Check(obj)) { - double v = PyLong_AsDouble(obj); - if (!PyErr_Occurred()) { - if (val) *val = v; - return SWIG_OK; - } else { - PyErr_Clear(); - } - } -#ifdef SWIG_PYTHON_CAST_MODE - { - int dispatch = 0; - double d = PyFloat_AsDouble(obj); - if (!PyErr_Occurred()) { - if (val) *val = d; - return SWIG_AddCast(SWIG_OK); - } else { - PyErr_Clear(); - } - if (!dispatch) { - long v = PyLong_AsLong(obj); - if (!PyErr_Occurred()) { - if (val) *val = v; - return SWIG_AddCast(SWIG_AddCast(SWIG_OK)); - } else { - PyErr_Clear(); - } - } - } -#endif - return res; -} - - -#include - - -#include - - -SWIGINTERNINLINE int -SWIG_CanCastAsInteger(double *d, double min, double max) { - double x = *d; - if ((min <= x && x <= max)) { - double fx = floor(x); - double cx = ceil(x); - double rd = ((x - fx) < 0.5) ? 
fx : cx; /* simple rint */ - if ((errno == EDOM) || (errno == ERANGE)) { - errno = 0; - } else { - double summ, reps, diff; - if (rd < x) { - diff = x - rd; - } else if (rd > x) { - diff = rd - x; - } else { - return 1; - } - summ = rd + x; - reps = diff/summ; - if (reps < 8*DBL_EPSILON) { - *d = rd; - return 1; - } - } - } - return 0; -} - - -SWIGINTERN int -SWIG_AsVal_unsigned_SS_long (PyObject *obj, unsigned long *val) -{ -#if PY_VERSION_HEX < 0x03000000 - if (PyInt_Check(obj)) { - long v = PyInt_AsLong(obj); - if (v >= 0) { - if (val) *val = v; - return SWIG_OK; - } else { - return SWIG_OverflowError; - } - } else -#endif - if (PyLong_Check(obj)) { - unsigned long v = PyLong_AsUnsignedLong(obj); - if (!PyErr_Occurred()) { - if (val) *val = v; - return SWIG_OK; - } else { - PyErr_Clear(); - return SWIG_OverflowError; - } - } -#ifdef SWIG_PYTHON_CAST_MODE - { - int dispatch = 0; - unsigned long v = PyLong_AsUnsignedLong(obj); - if (!PyErr_Occurred()) { - if (val) *val = v; - return SWIG_AddCast(SWIG_OK); - } else { - PyErr_Clear(); - } - if (!dispatch) { - double d; - int res = SWIG_AddCast(SWIG_AsVal_double (obj,&d)); - if (SWIG_IsOK(res) && SWIG_CanCastAsInteger(&d, 0, ULONG_MAX)) { - if (val) *val = (unsigned long)(d); - return res; - } - } - } -#endif - return SWIG_TypeError; -} - - -#include -#if !defined(SWIG_NO_LLONG_MAX) -# if !defined(LLONG_MAX) && defined(__GNUC__) && defined (__LONG_LONG_MAX__) -# define LLONG_MAX __LONG_LONG_MAX__ -# define LLONG_MIN (-LLONG_MAX - 1LL) -# define ULLONG_MAX (LLONG_MAX * 2ULL + 1ULL) -# endif -#endif - - -#if defined(LLONG_MAX) && !defined(SWIG_LONG_LONG_AVAILABLE) -# define SWIG_LONG_LONG_AVAILABLE -#endif - - -#ifdef SWIG_LONG_LONG_AVAILABLE -SWIGINTERN int -SWIG_AsVal_unsigned_SS_long_SS_long (PyObject *obj, unsigned long long *val) -{ - int res = SWIG_TypeError; - if (PyLong_Check(obj)) { - unsigned long long v = PyLong_AsUnsignedLongLong(obj); - if (!PyErr_Occurred()) { - if (val) *val = v; - return SWIG_OK; - } else { - PyErr_Clear(); - res = SWIG_OverflowError; - } - } else { - unsigned long v; - res = SWIG_AsVal_unsigned_SS_long (obj,&v); - if (SWIG_IsOK(res)) { - if (val) *val = v; - return res; - } - } -#ifdef SWIG_PYTHON_CAST_MODE - { - const double mant_max = 1LL << DBL_MANT_DIG; - double d; - res = SWIG_AsVal_double (obj,&d); - if (SWIG_IsOK(res) && !SWIG_CanCastAsInteger(&d, 0, mant_max)) - return SWIG_OverflowError; - if (SWIG_IsOK(res) && SWIG_CanCastAsInteger(&d, 0, mant_max)) { - if (val) *val = (unsigned long long)(d); - return SWIG_AddCast(res); - } - res = SWIG_TypeError; - } -#endif - return res; -} -#endif - - -SWIGINTERNINLINE int -SWIG_AsVal_size_t (PyObject * obj, size_t *val) -{ - int res = SWIG_TypeError; -#ifdef SWIG_LONG_LONG_AVAILABLE - if (sizeof(size_t) <= sizeof(unsigned long)) { -#endif - unsigned long v; - res = SWIG_AsVal_unsigned_SS_long (obj, val ? &v : 0); - if (SWIG_IsOK(res) && val) *val = static_cast< size_t >(v); -#ifdef SWIG_LONG_LONG_AVAILABLE - } else if (sizeof(size_t) <= sizeof(unsigned long long)) { - unsigned long long v; - res = SWIG_AsVal_unsigned_SS_long_SS_long (obj, val ? &v : 0); - if (SWIG_IsOK(res) && val) *val = static_cast< size_t >(v); - } -#endif - return res; -} - - - #define SWIG_From_long PyInt_FromLong - - -#ifdef SWIG_LONG_LONG_AVAILABLE -SWIGINTERNINLINE PyObject* -SWIG_From_long_SS_long (long long value) -{ - return ((value < LONG_MIN) || (value > LONG_MAX)) ? 
- PyLong_FromLongLong(value) : PyInt_FromLong(static_cast< long >(value)); -} -#endif - - -SWIGINTERNINLINE PyObject * -SWIG_From_ptrdiff_t (ptrdiff_t value) -{ -#ifdef SWIG_LONG_LONG_AVAILABLE - if (sizeof(ptrdiff_t) <= sizeof(long)) { -#endif - return SWIG_From_long (static_cast< long >(value)); -#ifdef SWIG_LONG_LONG_AVAILABLE - } else { - /* assume sizeof(ptrdiff_t) <= sizeof(long long) */ - return SWIG_From_long_SS_long (static_cast< long long >(value)); - } -#endif -} - - -SWIGINTERNINLINE PyObject* - SWIG_From_bool (bool value) -{ - return PyBool_FromLong(value ? 1 : 0); -} - - -SWIGINTERN int -SWIG_AsVal_long (PyObject *obj, long* val) -{ -#if PY_VERSION_HEX < 0x03000000 - if (PyInt_Check(obj)) { - if (val) *val = PyInt_AsLong(obj); - return SWIG_OK; - } else -#endif - if (PyLong_Check(obj)) { - long v = PyLong_AsLong(obj); - if (!PyErr_Occurred()) { - if (val) *val = v; - return SWIG_OK; - } else { - PyErr_Clear(); - return SWIG_OverflowError; - } - } -#ifdef SWIG_PYTHON_CAST_MODE - { - int dispatch = 0; - long v = PyInt_AsLong(obj); - if (!PyErr_Occurred()) { - if (val) *val = v; - return SWIG_AddCast(SWIG_OK); - } else { - PyErr_Clear(); - } - if (!dispatch) { - double d; - int res = SWIG_AddCast(SWIG_AsVal_double (obj,&d)); - if (SWIG_IsOK(res) && SWIG_CanCastAsInteger(&d, LONG_MIN, LONG_MAX)) { - if (val) *val = (long)(d); - return res; - } - } - } -#endif - return SWIG_TypeError; -} - - -#ifdef SWIG_LONG_LONG_AVAILABLE -SWIGINTERN int -SWIG_AsVal_long_SS_long (PyObject *obj, long long *val) -{ - int res = SWIG_TypeError; - if (PyLong_Check(obj)) { - long long v = PyLong_AsLongLong(obj); - if (!PyErr_Occurred()) { - if (val) *val = v; - return SWIG_OK; - } else { - PyErr_Clear(); - res = SWIG_OverflowError; - } - } else { - long v; - res = SWIG_AsVal_long (obj,&v); - if (SWIG_IsOK(res)) { - if (val) *val = v; - return res; - } - } -#ifdef SWIG_PYTHON_CAST_MODE - { - const double mant_max = 1LL << DBL_MANT_DIG; - const double mant_min = -mant_max; - double d; - res = SWIG_AsVal_double (obj,&d); - if (SWIG_IsOK(res) && !SWIG_CanCastAsInteger(&d, mant_min, mant_max)) - return SWIG_OverflowError; - if (SWIG_IsOK(res) && SWIG_CanCastAsInteger(&d, mant_min, mant_max)) { - if (val) *val = (long long)(d); - return SWIG_AddCast(res); - } - res = SWIG_TypeError; - } -#endif - return res; -} -#endif - - -SWIGINTERNINLINE int -SWIG_AsVal_ptrdiff_t (PyObject * obj, ptrdiff_t *val) -{ - int res = SWIG_TypeError; -#ifdef SWIG_LONG_LONG_AVAILABLE - if (sizeof(ptrdiff_t) <= sizeof(long)) { -#endif - long v; - res = SWIG_AsVal_long (obj, val ? &v : 0); - if (SWIG_IsOK(res) && val) *val = static_cast< ptrdiff_t >(v); -#ifdef SWIG_LONG_LONG_AVAILABLE - } else if (sizeof(ptrdiff_t) <= sizeof(long long)) { - long long v; - res = SWIG_AsVal_long_SS_long (obj, val ? &v : 0); - if (SWIG_IsOK(res) && val) *val = static_cast< ptrdiff_t >(v); - } -#endif - return res; -} - - -#include - - -#include - - -#include - - -SWIGINTERN int -SWIG_AsVal_int (PyObject * obj, int *val) -{ - long v; - int res = SWIG_AsVal_long (obj, &v); - if (SWIG_IsOK(res)) { - if ((v < INT_MIN || v > INT_MAX)) { - return SWIG_OverflowError; - } else { - if (val) *val = static_cast< int >(v); - } - } - return res; -} - - -SWIGINTERN int -SWIG_AsVal_bool (PyObject *obj, bool *val) -{ - int r; - if (!PyBool_Check(obj)) - return SWIG_ERROR; - r = PyObject_IsTrue(obj); - if (r == -1) - return SWIG_ERROR; - if (val) *val = r ? 
true : false; - return SWIG_OK; -} - - -/* Getting isfinite working pre C99 across multiple platforms is non-trivial. Users can provide SWIG_isfinite on older platforms. */ -#ifndef SWIG_isfinite -/* isfinite() is a macro for C99 */ -# if defined(isfinite) -# define SWIG_isfinite(X) (isfinite(X)) -# elif defined __cplusplus && __cplusplus >= 201103L -/* Use a template so that this works whether isfinite() is std::isfinite() or - * in the global namespace. The reality seems to vary between compiler - * versions. - * - * Make sure namespace std exists to avoid compiler warnings. - * - * extern "C++" is required as this fragment can end up inside an extern "C" { } block - */ -namespace std { } -extern "C++" template -inline int SWIG_isfinite_func(T x) { - using namespace std; - return isfinite(x); -} -# define SWIG_isfinite(X) (SWIG_isfinite_func(X)) -# elif defined(_MSC_VER) -# define SWIG_isfinite(X) (_finite(X)) -# elif defined(__sun) && defined(__SVR4) -# include -# define SWIG_isfinite(X) (finite(X)) -# endif -#endif - - -/* Accept infinite as a valid float value unless we are unable to check if a value is finite */ -#ifdef SWIG_isfinite -# define SWIG_Float_Overflow_Check(X) ((X < -FLT_MAX || X > FLT_MAX) && SWIG_isfinite(X)) -#else -# define SWIG_Float_Overflow_Check(X) ((X < -FLT_MAX || X > FLT_MAX)) -#endif - - -SWIGINTERN int -SWIG_AsVal_float (PyObject * obj, float *val) -{ - double v; - int res = SWIG_AsVal_double (obj, &v); - if (SWIG_IsOK(res)) { - if (SWIG_Float_Overflow_Check(v)) { - return SWIG_OverflowError; - } else { - if (val) *val = static_cast< float >(v); - } - } - return res; -} - - - #define SWIG_From_double PyFloat_FromDouble - - -SWIGINTERNINLINE PyObject * -SWIG_From_float (float value) -{ - return SWIG_From_double (value); -} - - -SWIGINTERNINLINE PyObject* - SWIG_From_int (int value) -{ - return PyInt_FromLong((long) value); -} - - -namespace swig { - template - struct noconst_traits { - typedef Type noconst_type; - }; - - template - struct noconst_traits { - typedef Type noconst_type; - }; - - /* - type categories - */ - struct pointer_category { }; - struct value_category { }; - - /* - General traits that provides type_name and type_info - */ - template struct traits { }; - - template - inline const char* type_name() { - return traits::noconst_type >::type_name(); - } - - template struct traits_info { - static swig_type_info *type_query(std::string name) { - name += " *"; - return SWIG_TypeQuery(name.c_str()); - } - static swig_type_info *type_info() { - static swig_type_info *info = type_query(type_name()); - return info; - } - }; - - /* - Partial specialization for pointers (traits_info) - */ - template struct traits_info { - static swig_type_info *type_query(std::string name) { - name += " *"; - return SWIG_TypeQuery(name.c_str()); - } - static swig_type_info *type_info() { - static swig_type_info *info = type_query(type_name()); - return info; - } - }; - - template - inline swig_type_info *type_info() { - return traits_info::type_info(); - } - - /* - Partial specialization for pointers (traits) - */ - template struct traits { - typedef pointer_category category; - static std::string make_ptr_name(const char* name) { - std::string ptrname = name; - ptrname += " *"; - return ptrname; - } - static const char* type_name() { - static std::string name = make_ptr_name(swig::type_name()); - return name.c_str(); - } - }; - - template - struct traits_as { }; - - template - struct traits_check { }; - -} - - -namespace swig { - /* - Traits that provides the from 
method - */ - template struct traits_from_ptr { - static PyObject *from(Type *val, int owner = 0) { - return SWIG_InternalNewPointerObj(val, type_info(), owner); - } - }; - - template struct traits_from { - static PyObject *from(const Type& val) { - return traits_from_ptr::from(new Type(val), 1); - } - }; - - template struct traits_from { - static PyObject *from(Type* val) { - return traits_from_ptr::from(val, 0); - } - }; - - template struct traits_from { - static PyObject *from(const Type* val) { - return traits_from_ptr::from(const_cast(val), 0); - } - }; - - - template - inline PyObject *from(const Type& val) { - return traits_from::from(val); - } - - template - inline PyObject *from_ptr(Type* val, int owner) { - return traits_from_ptr::from(val, owner); - } - - /* - Traits that provides the asval/as/check method - */ - template - struct traits_asptr { - static int asptr(PyObject *obj, Type **val) { - Type *p; - swig_type_info *descriptor = type_info(); - int res = descriptor ? SWIG_ConvertPtr(obj, (void **)&p, descriptor, 0) : SWIG_ERROR; - if (SWIG_IsOK(res)) { - if (val) *val = p; - } - return res; - } - }; - - template - inline int asptr(PyObject *obj, Type **vptr) { - return traits_asptr::asptr(obj, vptr); - } - - template - struct traits_asval { - static int asval(PyObject *obj, Type *val) { - if (val) { - Type *p = 0; - int res = traits_asptr::asptr(obj, &p); - if (!SWIG_IsOK(res)) return res; - if (p) { - typedef typename noconst_traits::noconst_type noconst_type; - *(const_cast(val)) = *p; - if (SWIG_IsNewObj(res)){ - delete p; - res = SWIG_DelNewMask(res); - } - return res; - } else { - return SWIG_ERROR; - } - } else { - return traits_asptr::asptr(obj, (Type **)(0)); - } - } - }; - - template struct traits_asval { - static int asval(PyObject *obj, Type **val) { - if (val) { - typedef typename noconst_traits::noconst_type noconst_type; - noconst_type *p = 0; - int res = traits_asptr::asptr(obj, &p); - if (SWIG_IsOK(res)) { - *(const_cast(val)) = p; - } - return res; - } else { - return traits_asptr::asptr(obj, (Type **)(0)); - } - } - }; - - template - inline int asval(PyObject *obj, Type *val) { - return traits_asval::asval(obj, val); - } - - template - struct traits_as { - static Type as(PyObject *obj, bool throw_error) { - Type v; - int res = asval(obj, &v); - if (!obj || !SWIG_IsOK(res)) { - if (!PyErr_Occurred()) { - ::SWIG_Error(SWIG_TypeError, swig::type_name()); - } - if (throw_error) throw std::invalid_argument("bad type"); - } - return v; - } - }; - - template - struct traits_as { - static Type as(PyObject *obj, bool throw_error) { - Type *v = 0; - int res = (obj ? traits_asptr::asptr(obj, &v) : SWIG_ERROR); - if (SWIG_IsOK(res) && v) { - if (SWIG_IsNewObj(res)) { - Type r(*v); - delete v; - return r; - } else { - return *v; - } - } else { - // Uninitialized return value, no Type() constructor required. - static Type *v_def = (Type*) malloc(sizeof(Type)); - if (!PyErr_Occurred()) { - SWIG_Error(SWIG_TypeError, swig::type_name()); - } - if (throw_error) throw std::invalid_argument("bad type"); - memset(v_def,0,sizeof(Type)); - return *v_def; - } - } - }; - - template - struct traits_as { - static Type* as(PyObject *obj, bool throw_error) { - Type *v = 0; - int res = (obj ? 
traits_asptr::asptr(obj, &v) : SWIG_ERROR); - if (SWIG_IsOK(res)) { - return v; - } else { - if (!PyErr_Occurred()) { - SWIG_Error(SWIG_TypeError, swig::type_name()); - } - if (throw_error) throw std::invalid_argument("bad type"); - return 0; - } - } - }; - - template - inline Type as(PyObject *obj, bool te = false) { - return traits_as::category>::as(obj, te); - } - - template - struct traits_check { - static bool check(PyObject *obj) { - int res = obj ? asval(obj, (Type *)(0)) : SWIG_ERROR; - return SWIG_IsOK(res) ? true : false; - } - }; - - template - struct traits_check { - static bool check(PyObject *obj) { - int res = obj ? asptr(obj, (Type **)(0)) : SWIG_ERROR; - return SWIG_IsOK(res) ? true : false; - } - }; - - template - inline bool check(PyObject *obj) { - return traits_check::category>::check(obj); - } -} - - -#include - -namespace std { - template <> - struct less - { - bool - operator()(PyObject * v, PyObject *w) const - { - bool res; - SWIG_PYTHON_THREAD_BEGIN_BLOCK; - res = PyObject_RichCompareBool(v, w, Py_LT) ? true : false; - /* This may fall into a case of inconsistent - eg. ObjA > ObjX > ObjB - but ObjA < ObjB - */ - if( PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_TypeError) ) - { - /* Objects can't be compared, this mostly occurred in Python 3.0 */ - /* Compare their ptr directly for a workaround */ - res = (v < w); - PyErr_Clear(); - } - SWIG_PYTHON_THREAD_END_BLOCK; - return res; - } - }; - - template <> - struct less - { - bool - operator()(const swig::SwigPtr_PyObject& v, const swig::SwigPtr_PyObject& w) const - { - return std::less()(v, w); - } - }; - - template <> - struct less - { - bool - operator()(const swig::SwigVar_PyObject& v, const swig::SwigVar_PyObject& w) const - { - return std::less()(v, w); - } - }; - -} - -namespace swig { - template <> struct traits { - typedef value_category category; - static const char* type_name() { return "PyObject *"; } - }; - - template <> struct traits_asval { - typedef PyObject * value_type; - static int asval(PyObject *obj, value_type *val) { - if (val) *val = obj; - return SWIG_OK; - } - }; - - template <> - struct traits_check { - static bool check(PyObject *) { - return true; - } - }; - - template <> struct traits_from { - typedef PyObject * value_type; - static PyObject *from(const value_type& val) { - Py_XINCREF(val); - return val; - } - }; - -} - -namespace swig { - template - inline size_t - check_index(Difference i, size_t size, bool insert = false) { - if ( i < 0 ) { - if ((size_t) (-i) <= size) - return (size_t) (i + size); - } else if ( (size_t) i < size ) { - return (size_t) i; - } else if (insert && ((size_t) i == size)) { - return size; - } - throw std::out_of_range("index out of range"); - } - - template - void - slice_adjust(Difference i, Difference j, Py_ssize_t step, size_t size, Difference &ii, Difference &jj, bool insert = false) { - if (step == 0) { - throw std::invalid_argument("slice step cannot be zero"); - } else if (step > 0) { - // Required range: 0 <= i < size, 0 <= j < size, i <= j - if (i < 0) { - ii = 0; - } else if (i < (Difference)size) { - ii = i; - } else if (insert && (i >= (Difference)size)) { - ii = (Difference)size; - } - if (j < 0) { - jj = 0; - } else { - jj = (j < (Difference)size) ? 
j : (Difference)size; - } - if (jj < ii) - jj = ii; - } else { - // Required range: -1 <= i < size-1, -1 <= j < size-1, i >= j - if (i < -1) { - ii = -1; - } else if (i < (Difference) size) { - ii = i; - } else if (i >= (Difference)(size-1)) { - ii = (Difference)(size-1); - } - if (j < -1) { - jj = -1; - } else { - jj = (j < (Difference)size ) ? j : (Difference)(size-1); - } - if (ii < jj) - ii = jj; - } - } - - template - inline typename Sequence::iterator - getpos(Sequence* self, Difference i) { - typename Sequence::iterator pos = self->begin(); - std::advance(pos, check_index(i,self->size())); - return pos; - } - - template - inline typename Sequence::const_iterator - cgetpos(const Sequence* self, Difference i) { - typename Sequence::const_iterator pos = self->begin(); - std::advance(pos, check_index(i,self->size())); - return pos; - } - - template - inline void - erase(Sequence* seq, const typename Sequence::iterator& position) { - seq->erase(position); - } - - template - struct traits_reserve { - static void reserve(Sequence & /*seq*/, typename Sequence::size_type /*n*/) { - // This should be specialized for types that support reserve - } - }; - - template - inline Sequence* - getslice(const Sequence* self, Difference i, Difference j, Py_ssize_t step) { - typename Sequence::size_type size = self->size(); - Difference ii = 0; - Difference jj = 0; - swig::slice_adjust(i, j, step, size, ii, jj); - - if (step > 0) { - typename Sequence::const_iterator sb = self->begin(); - typename Sequence::const_iterator se = self->begin(); - std::advance(sb,ii); - std::advance(se,jj); - if (step == 1) { - return new Sequence(sb, se); - } else { - Sequence *sequence = new Sequence(); - swig::traits_reserve::reserve(*sequence, (jj - ii + step - 1) / step); - typename Sequence::const_iterator it = sb; - while (it!=se) { - sequence->push_back(*it); - for (Py_ssize_t c=0; c::reserve(*sequence, (ii - jj - step - 1) / -step); - typename Sequence::const_reverse_iterator sb = self->rbegin(); - typename Sequence::const_reverse_iterator se = self->rbegin(); - std::advance(sb,size-ii-1); - std::advance(se,size-jj-1); - typename Sequence::const_reverse_iterator it = sb; - while (it!=se) { - sequence->push_back(*it); - for (Py_ssize_t c=0; c<-step && it!=se; ++c) - it++; - } - return sequence; - } - } - - template - inline void - setslice(Sequence* self, Difference i, Difference j, Py_ssize_t step, const InputSeq& is = InputSeq()) { - typename Sequence::size_type size = self->size(); - Difference ii = 0; - Difference jj = 0; - swig::slice_adjust(i, j, step, size, ii, jj, true); - if (step > 0) { - if (step == 1) { - size_t ssize = jj - ii; - if (ssize <= is.size()) { - // expanding/staying the same size - swig::traits_reserve::reserve(*self, self->size() - ssize + is.size()); - typename Sequence::iterator sb = self->begin(); - typename InputSeq::const_iterator isit = is.begin(); - std::advance(sb,ii); - std::advance(isit, jj - ii); - self->insert(std::copy(is.begin(), isit, sb), isit, is.end()); - } else { - // shrinking - typename Sequence::iterator sb = self->begin(); - typename Sequence::iterator se = self->begin(); - std::advance(sb,ii); - std::advance(se,jj); - self->erase(sb,se); - sb = self->begin(); - std::advance(sb,ii); - self->insert(sb, is.begin(), is.end()); - } - } else { - size_t replacecount = (jj - ii + step - 1) / step; - if (is.size() != replacecount) { - char msg[1024]; - sprintf(msg, "attempt to assign sequence of size %lu to extended slice of size %lu", (unsigned long)is.size(), (unsigned 
long)replacecount); - throw std::invalid_argument(msg); - } - typename Sequence::const_iterator isit = is.begin(); - typename Sequence::iterator it = self->begin(); - std::advance(it,ii); - for (size_t rc=0; rcend(); ++rc) { - *it++ = *isit++; - for (Py_ssize_t c=0; c<(step-1) && it != self->end(); ++c) - it++; - } - } - } else { - size_t replacecount = (ii - jj - step - 1) / -step; - if (is.size() != replacecount) { - char msg[1024]; - sprintf(msg, "attempt to assign sequence of size %lu to extended slice of size %lu", (unsigned long)is.size(), (unsigned long)replacecount); - throw std::invalid_argument(msg); - } - typename Sequence::const_iterator isit = is.begin(); - typename Sequence::reverse_iterator it = self->rbegin(); - std::advance(it,size-ii-1); - for (size_t rc=0; rcrend(); ++rc) { - *it++ = *isit++; - for (Py_ssize_t c=0; c<(-step-1) && it != self->rend(); ++c) - it++; - } - } - } - - template - inline void - delslice(Sequence* self, Difference i, Difference j, Py_ssize_t step) { - typename Sequence::size_type size = self->size(); - Difference ii = 0; - Difference jj = 0; - swig::slice_adjust(i, j, step, size, ii, jj, true); - if (step > 0) { - typename Sequence::iterator sb = self->begin(); - std::advance(sb,ii); - if (step == 1) { - typename Sequence::iterator se = self->begin(); - std::advance(se,jj); - self->erase(sb,se); - } else { - typename Sequence::iterator it = sb; - size_t delcount = (jj - ii + step - 1) / step; - while (delcount) { - it = self->erase(it); - for (Py_ssize_t c=0; c<(step-1) && it != self->end(); ++c) - it++; - delcount--; - } - } - } else { - typename Sequence::reverse_iterator sb = self->rbegin(); - std::advance(sb,size-ii-1); - typename Sequence::reverse_iterator it = sb; - size_t delcount = (ii - jj - step - 1) / -step; - while (delcount) { - it = typename Sequence::reverse_iterator(self->erase((++it).base())); - for (Py_ssize_t c=0; c<(-step-1) && it != self->rend(); ++c) - it++; - delcount--; - } - } - } -} - - -#if defined(__SUNPRO_CC) && defined(_RWSTD_VER) -# if !defined(SWIG_NO_STD_NOITERATOR_TRAITS_STL) -# define SWIG_STD_NOITERATOR_TRAITS_STL -# endif -#endif - -#if !defined(SWIG_STD_NOITERATOR_TRAITS_STL) -#include -#else -namespace std { - template - struct iterator_traits { - typedef ptrdiff_t difference_type; - typedef typename Iterator::value_type value_type; - }; - - template - struct iterator_traits<__reverse_bi_iterator > { - typedef Distance difference_type; - typedef T value_type; - }; - - template - struct iterator_traits { - typedef T value_type; - typedef ptrdiff_t difference_type; - }; - - template - inline typename iterator_traits<_InputIterator>::difference_type - distance(_InputIterator __first, _InputIterator __last) - { - typename iterator_traits<_InputIterator>::difference_type __n = 0; - while (__first != __last) { - ++__first; ++__n; - } - return __n; - } -} -#endif - - -namespace swig { - template - class SwigPyIterator_T : public SwigPyIterator - { - public: - typedef OutIterator out_iterator; - typedef typename std::iterator_traits::value_type value_type; - typedef SwigPyIterator_T self_type; - - SwigPyIterator_T(out_iterator curr, PyObject *seq) - : SwigPyIterator(seq), current(curr) - { - } - - const out_iterator& get_current() const - { - return current; - } - - - bool equal (const SwigPyIterator &iter) const - { - const self_type *iters = dynamic_cast(&iter); - if (iters) { - return (current == iters->get_current()); - } else { - throw std::invalid_argument("bad iterator type"); - } - } - - ptrdiff_t 
distance(const SwigPyIterator &iter) const - { - const self_type *iters = dynamic_cast(&iter); - if (iters) { - return std::distance(current, iters->get_current()); - } else { - throw std::invalid_argument("bad iterator type"); - } - } - - protected: - out_iterator current; - }; - - template - struct from_oper - { - typedef const ValueType& argument_type; - typedef PyObject *result_type; - result_type operator()(argument_type v) const - { - return swig::from(v); - } - }; - - template::value_type, - typename FromOper = from_oper > - class SwigPyIteratorOpen_T : public SwigPyIterator_T - { - public: - FromOper from; - typedef OutIterator out_iterator; - typedef ValueType value_type; - typedef SwigPyIterator_T base; - typedef SwigPyIteratorOpen_T self_type; - - SwigPyIteratorOpen_T(out_iterator curr, PyObject *seq) - : SwigPyIterator_T(curr, seq) - { - } - - PyObject *value() const { - return from(static_cast(*(base::current))); - } - - SwigPyIterator *copy() const - { - return new self_type(*this); - } - - SwigPyIterator *incr(size_t n = 1) - { - while (n--) { - ++base::current; - } - return this; - } - - SwigPyIterator *decr(size_t n = 1) - { - while (n--) { - --base::current; - } - return this; - } - }; - - template::value_type, - typename FromOper = from_oper > - class SwigPyIteratorClosed_T : public SwigPyIterator_T - { - public: - FromOper from; - typedef OutIterator out_iterator; - typedef ValueType value_type; - typedef SwigPyIterator_T base; - typedef SwigPyIteratorClosed_T self_type; - - SwigPyIteratorClosed_T(out_iterator curr, out_iterator first, out_iterator last, PyObject *seq) - : SwigPyIterator_T(curr, seq), begin(first), end(last) - { - } - - PyObject *value() const { - if (base::current == end) { - throw stop_iteration(); - } else { - return from(static_cast(*(base::current))); - } - } - - SwigPyIterator *copy() const - { - return new self_type(*this); - } - - SwigPyIterator *incr(size_t n = 1) - { - while (n--) { - if (base::current == end) { - throw stop_iteration(); - } else { - ++base::current; - } - } - return this; - } - - SwigPyIterator *decr(size_t n = 1) - { - while (n--) { - if (base::current == begin) { - throw stop_iteration(); - } else { - --base::current; - } - } - return this; - } - - private: - out_iterator begin; - out_iterator end; - }; - - template - inline SwigPyIterator* - make_output_iterator(const OutIter& current, const OutIter& begin,const OutIter& end, PyObject *seq = 0) - { - return new SwigPyIteratorClosed_T(current, begin, end, seq); - } - - template - inline SwigPyIterator* - make_output_iterator(const OutIter& current, PyObject *seq = 0) - { - return new SwigPyIteratorOpen_T(current, seq); - } - -} - - -namespace swig -{ - template - struct SwigPySequence_Ref - { - SwigPySequence_Ref(PyObject* seq, Py_ssize_t index) - : _seq(seq), _index(index) - { - } - - operator T () const - { - swig::SwigVar_PyObject item = PySequence_GetItem(_seq, _index); - try { - return swig::as(item, true); - } catch (std::exception& e) { - char msg[1024]; - sprintf(msg, "in sequence element %d ", (int)_index); - if (!PyErr_Occurred()) { - ::SWIG_Error(SWIG_TypeError, swig::type_name()); - } - SWIG_Python_AddErrorMsg(msg); - SWIG_Python_AddErrorMsg(e.what()); - throw; - } - } - - SwigPySequence_Ref& operator=(const T& v) - { - PySequence_SetItem(_seq, _index, swig::from(v)); - return *this; - } - - private: - PyObject* _seq; - Py_ssize_t _index; - }; - - template - struct SwigPySequence_ArrowProxy - { - SwigPySequence_ArrowProxy(const T& x): m_value(x) {} - const T* 
operator->() const { return &m_value; } - operator const T*() const { return &m_value; } - T m_value; - }; - - template - struct SwigPySequence_InputIterator - { - typedef SwigPySequence_InputIterator self; - - typedef std::random_access_iterator_tag iterator_category; - typedef Reference reference; - typedef T value_type; - typedef T* pointer; - typedef Py_ssize_t difference_type; - - SwigPySequence_InputIterator() - { - } - - SwigPySequence_InputIterator(PyObject* seq, Py_ssize_t index) - : _seq(seq), _index(index) - { - } - - reference operator*() const - { - return reference(_seq, _index); - } - - SwigPySequence_ArrowProxy - operator->() const { - return SwigPySequence_ArrowProxy(operator*()); - } - - bool operator==(const self& ri) const - { - return (_index == ri._index) && (_seq == ri._seq); - } - - bool operator!=(const self& ri) const - { - return !(operator==(ri)); - } - - self& operator ++ () - { - ++_index; - return *this; - } - - self& operator -- () - { - --_index; - return *this; - } - - self& operator += (difference_type n) - { - _index += n; - return *this; - } - - self operator +(difference_type n) const - { - return self(_seq, _index + n); - } - - self& operator -= (difference_type n) - { - _index -= n; - return *this; - } - - self operator -(difference_type n) const - { - return self(_seq, _index - n); - } - - difference_type operator - (const self& ri) const - { - return _index - ri._index; - } - - bool operator < (const self& ri) const - { - return _index < ri._index; - } - - reference - operator[](difference_type n) const - { - return reference(_seq, _index + n); - } - - private: - PyObject* _seq; - difference_type _index; - }; - - // STL container wrapper around a Python sequence - template - struct SwigPySequence_Cont - { - typedef SwigPySequence_Ref reference; - typedef const SwigPySequence_Ref const_reference; - typedef T value_type; - typedef T* pointer; - typedef Py_ssize_t difference_type; - typedef size_t size_type; - typedef const pointer const_pointer; - typedef SwigPySequence_InputIterator iterator; - typedef SwigPySequence_InputIterator const_iterator; - - SwigPySequence_Cont(PyObject* seq) : _seq(0) - { - if (!PySequence_Check(seq)) { - throw std::invalid_argument("a sequence is expected"); - } - _seq = seq; - Py_INCREF(_seq); - } - - ~SwigPySequence_Cont() - { - Py_XDECREF(_seq); - } - - size_type size() const - { - return static_cast(PySequence_Size(_seq)); - } - - bool empty() const - { - return size() == 0; - } - - iterator begin() - { - return iterator(_seq, 0); - } - - const_iterator begin() const - { - return const_iterator(_seq, 0); - } - - iterator end() - { - return iterator(_seq, size()); - } - - const_iterator end() const - { - return const_iterator(_seq, size()); - } - - reference operator[](difference_type n) - { - return reference(_seq, n); - } - - const_reference operator[](difference_type n) const - { - return const_reference(_seq, n); - } - - bool check(bool set_err = true) const - { - Py_ssize_t s = size(); - for (Py_ssize_t i = 0; i < s; ++i) { - swig::SwigVar_PyObject item = PySequence_GetItem(_seq, i); - if (!swig::check(item)) { - if (set_err) { - char msg[1024]; - sprintf(msg, "in sequence element %d", (int)i); - SWIG_Error(SWIG_RuntimeError, msg); - } - return false; - } - } - return true; - } - - private: - PyObject* _seq; - }; - -} - - -namespace swig { - template <> struct traits< double > { - typedef value_category category; - static const char* type_name() { return"double"; } - }; - template <> struct traits_asval< double > 
{ - typedef double value_type; - static int asval(PyObject *obj, value_type *val) { - return SWIG_AsVal_double (obj, val); - } - }; - template <> struct traits_from< double > { - typedef double value_type; - static PyObject *from(const value_type& val) { - return SWIG_From_double (val); - } - }; -} - - -namespace swig { - template - inline void - assign(const SwigPySeq& swigpyseq, Seq* seq) { - // seq->assign(swigpyseq.begin(), swigpyseq.end()); // not used as not always implemented - typedef typename SwigPySeq::value_type value_type; - typename SwigPySeq::const_iterator it = swigpyseq.begin(); - for (;it != swigpyseq.end(); ++it) { - seq->insert(seq->end(),(value_type)(*it)); - } - } - - template - struct traits_asptr_stdseq { - typedef Seq sequence; - typedef T value_type; - - static int asptr(PyObject *obj, sequence **seq) { - if (obj == Py_None || SWIG_Python_GetSwigThis(obj)) { - sequence *p; - swig_type_info *descriptor = swig::type_info(); - if (descriptor && SWIG_IsOK(::SWIG_ConvertPtr(obj, (void **)&p, descriptor, 0))) { - if (seq) *seq = p; - return SWIG_OLDOBJ; - } - } else if (PySequence_Check(obj)) { - try { - SwigPySequence_Cont swigpyseq(obj); - if (seq) { - sequence *pseq = new sequence(); - assign(swigpyseq, pseq); - *seq = pseq; - return SWIG_NEWOBJ; - } else { - return swigpyseq.check() ? SWIG_OK : SWIG_ERROR; - } - } catch (std::exception& e) { - if (seq) { - if (!PyErr_Occurred()) { - PyErr_SetString(PyExc_TypeError, e.what()); - } - } - return SWIG_ERROR; - } - } - return SWIG_ERROR; - } - }; - - template - struct traits_from_stdseq { - typedef Seq sequence; - typedef T value_type; - typedef typename Seq::size_type size_type; - typedef typename sequence::const_iterator const_iterator; - - static PyObject *from(const sequence& seq) { -#ifdef SWIG_PYTHON_EXTRA_NATIVE_CONTAINERS - swig_type_info *desc = swig::type_info(); - if (desc && desc->clientdata) { - return SWIG_InternalNewPointerObj(new sequence(seq), desc, SWIG_POINTER_OWN); - } -#endif - size_type size = seq.size(); - if (size <= (size_type)INT_MAX) { - PyObject *obj = PyTuple_New((Py_ssize_t)size); - Py_ssize_t i = 0; - for (const_iterator it = seq.begin(); it != seq.end(); ++it, ++i) { - PyTuple_SetItem(obj,i,swig::from(*it)); - } - return obj; - } else { - PyErr_SetString(PyExc_OverflowError,"sequence size not valid in python"); - return NULL; - } - } - }; -} - - - namespace swig { - template - struct traits_reserve > { - static void reserve(std::vector &seq, typename std::vector::size_type n) { - seq.reserve(n); - } - }; - - template - struct traits_asptr > { - static int asptr(PyObject *obj, std::vector **vec) { - return traits_asptr_stdseq >::asptr(obj, vec); - } - }; - - template - struct traits_from > { - static PyObject *from(const std::vector& vec) { - return traits_from_stdseq >::from(vec); - } - }; - } - - - namespace swig { - template <> struct traits > > { - typedef pointer_category category; - static const char* type_name() { - return "std::vector<" "double" "," "std::allocator< double >" " >"; - } - }; - } - -SWIGINTERN swig::SwigPyIterator *std_vector_Sl_double_Sg__iterator(std::vector< double > *self,PyObject **PYTHON_SELF){ - return swig::make_output_iterator(self->begin(), self->begin(), self->end(), *PYTHON_SELF); - } -SWIGINTERN bool std_vector_Sl_double_Sg____nonzero__(std::vector< double > const *self){ - return !(self->empty()); - } -SWIGINTERN bool std_vector_Sl_double_Sg____bool__(std::vector< double > const *self){ - return !(self->empty()); - } -SWIGINTERN std::vector< double 
>::size_type std_vector_Sl_double_Sg____len__(std::vector< double > const *self){ - return self->size(); - } - -SWIGINTERNINLINE PyObject* -SWIG_From_unsigned_SS_long (unsigned long value) -{ - return (value > LONG_MAX) ? - PyLong_FromUnsignedLong(value) : PyInt_FromLong(static_cast< long >(value)); -} - - -#ifdef SWIG_LONG_LONG_AVAILABLE -SWIGINTERNINLINE PyObject* -SWIG_From_unsigned_SS_long_SS_long (unsigned long long value) -{ - return (value > LONG_MAX) ? - PyLong_FromUnsignedLongLong(value) : PyInt_FromLong(static_cast< long >(value)); -} -#endif - - -SWIGINTERNINLINE PyObject * -SWIG_From_size_t (size_t value) -{ -#ifdef SWIG_LONG_LONG_AVAILABLE - if (sizeof(size_t) <= sizeof(unsigned long)) { -#endif - return SWIG_From_unsigned_SS_long (static_cast< unsigned long >(value)); -#ifdef SWIG_LONG_LONG_AVAILABLE - } else { - /* assume sizeof(size_t) <= sizeof(unsigned long long) */ - return SWIG_From_unsigned_SS_long_SS_long (static_cast< unsigned long long >(value)); - } -#endif -} - -SWIGINTERN std::vector< double,std::allocator< double > > *std_vector_Sl_double_Sg____getslice__(std::vector< double > *self,std::vector< double >::difference_type i,std::vector< double >::difference_type j){ - return swig::getslice(self, i, j, 1); - } -SWIGINTERN void std_vector_Sl_double_Sg____setslice____SWIG_0(std::vector< double > *self,std::vector< double >::difference_type i,std::vector< double >::difference_type j){ - swig::setslice(self, i, j, 1, std::vector< double,std::allocator< double > >()); - } -SWIGINTERN void std_vector_Sl_double_Sg____setslice____SWIG_1(std::vector< double > *self,std::vector< double >::difference_type i,std::vector< double >::difference_type j,std::vector< double,std::allocator< double > > const &v){ - swig::setslice(self, i, j, 1, v); - } -SWIGINTERN void std_vector_Sl_double_Sg____delslice__(std::vector< double > *self,std::vector< double >::difference_type i,std::vector< double >::difference_type j){ - swig::delslice(self, i, j, 1); - } -SWIGINTERN void std_vector_Sl_double_Sg____delitem____SWIG_0(std::vector< double > *self,std::vector< double >::difference_type i){ - swig::erase(self, swig::getpos(self, i)); - } -SWIGINTERN std::vector< double,std::allocator< double > > *std_vector_Sl_double_Sg____getitem____SWIG_0(std::vector< double > *self,PySliceObject *slice){ - Py_ssize_t i, j, step; - if( !PySlice_Check(slice) ) { - SWIG_Error(SWIG_TypeError, "Slice object expected."); - return NULL; - } - PySlice_GetIndices(SWIGPY_SLICE_ARG(slice), (Py_ssize_t)self->size(), &i, &j, &step); - std::vector< double,std::allocator< double > >::difference_type id = i; - std::vector< double,std::allocator< double > >::difference_type jd = j; - return swig::getslice(self, id, jd, step); - } -SWIGINTERN void std_vector_Sl_double_Sg____setitem____SWIG_0(std::vector< double > *self,PySliceObject *slice,std::vector< double,std::allocator< double > > const &v){ - Py_ssize_t i, j, step; - if( !PySlice_Check(slice) ) { - SWIG_Error(SWIG_TypeError, "Slice object expected."); - return; - } - PySlice_GetIndices(SWIGPY_SLICE_ARG(slice), (Py_ssize_t)self->size(), &i, &j, &step); - std::vector< double,std::allocator< double > >::difference_type id = i; - std::vector< double,std::allocator< double > >::difference_type jd = j; - swig::setslice(self, id, jd, step, v); - } -SWIGINTERN void std_vector_Sl_double_Sg____setitem____SWIG_1(std::vector< double > *self,PySliceObject *slice){ - Py_ssize_t i, j, step; - if( !PySlice_Check(slice) ) { - SWIG_Error(SWIG_TypeError, "Slice object expected."); - 
double,std::vector< int,std::allocator< int > > > > > > >::difference_type id = i; - std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > >::difference_type jd = j; - swig::setslice(self, id, jd, step, v); - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_pair_Sl_double_Sc_std_vector_Sl_int_Sg__Sg__Sg__Sg____setitem____SWIG_1(std::vector< std::vector< std::pair< double,std::vector< int > > > > *self,PySliceObject *slice){ - Py_ssize_t i, j, step; - if( !PySlice_Check(slice) ) { - SWIG_Error(SWIG_TypeError, "Slice object expected."); - return; - } - PySlice_GetIndices(SWIGPY_SLICE_ARG(slice), (Py_ssize_t)self->size(), &i, &j, &step); - std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > >::difference_type id = i; - std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > >::difference_type jd = j; - swig::delslice(self, id, jd, step); - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_pair_Sl_double_Sc_std_vector_Sl_int_Sg__Sg__Sg__Sg____delitem____SWIG_1(std::vector< std::vector< std::pair< double,std::vector< int > > > > *self,PySliceObject *slice){ - Py_ssize_t i, j, step; - if( !PySlice_Check(slice) ) { - SWIG_Error(SWIG_TypeError, "Slice object expected."); - return; - } - PySlice_GetIndices(SWIGPY_SLICE_ARG(slice), (Py_ssize_t)self->size(), &i, &j, &step); - std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > >::difference_type id = i; - std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > >::difference_type jd = j; - swig::delslice(self, id, jd, step); - } -SWIGINTERN std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type const &std_vector_Sl_std_vector_Sl_std_pair_Sl_double_Sc_std_vector_Sl_int_Sg__Sg__Sg__Sg____getitem____SWIG_1(std::vector< std::vector< std::pair< double,std::vector< int > > > > const *self,std::vector< std::vector< std::pair< double,std::vector< int > > > >::difference_type i){ - return *(swig::cgetpos(self, i)); - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_pair_Sl_double_Sc_std_vector_Sl_int_Sg__Sg__Sg__Sg____setitem____SWIG_2(std::vector< std::vector< std::pair< double,std::vector< int > > > > *self,std::vector< 
std::vector< std::pair< double,std::vector< int > > > >::difference_type i,std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type const &x){ - *(swig::getpos(self,i)) = x; - } -SWIGINTERN std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type std_vector_Sl_std_vector_Sl_std_pair_Sl_double_Sc_std_vector_Sl_int_Sg__Sg__Sg__Sg__pop(std::vector< std::vector< std::pair< double,std::vector< int > > > > *self){ - if (self->size() == 0) - throw std::out_of_range("pop from empty container"); - std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > >::value_type x = self->back(); - self->pop_back(); - return x; - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_pair_Sl_double_Sc_std_vector_Sl_int_Sg__Sg__Sg__Sg__append(std::vector< std::vector< std::pair< double,std::vector< int > > > > *self,std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type const &x){ - self->push_back(x); - } -SWIGINTERN std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator std_vector_Sl_std_vector_Sl_std_pair_Sl_double_Sc_std_vector_Sl_int_Sg__Sg__Sg__Sg__erase__SWIG_0(std::vector< std::vector< std::pair< double,std::vector< int > > > > *self,std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator pos){ return self->erase(pos); } -SWIGINTERN std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator std_vector_Sl_std_vector_Sl_std_pair_Sl_double_Sc_std_vector_Sl_int_Sg__Sg__Sg__Sg__erase__SWIG_1(std::vector< std::vector< std::pair< double,std::vector< int > > > > *self,std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator first,std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator last){ return self->erase(first, last); } -SWIGINTERN std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator std_vector_Sl_std_vector_Sl_std_pair_Sl_double_Sc_std_vector_Sl_int_Sg__Sg__Sg__Sg__insert__SWIG_0(std::vector< std::vector< std::pair< double,std::vector< int > > > > *self,std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator pos,std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type const &x){ return self->insert(pos, x); } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_pair_Sl_double_Sc_std_vector_Sl_int_Sg__Sg__Sg__Sg__insert__SWIG_1(std::vector< std::vector< std::pair< double,std::vector< int > > > > *self,std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator pos,std::vector< std::vector< std::pair< double,std::vector< int > > > >::size_type n,std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type const &x){ self->insert(pos, n, x); } - - namespace swig { - template <> struct traits >,std::allocator< std::vector< double,std::allocator< double > > > >, std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > > { - typedef pointer_category category; - static const char* type_name() { - return "std::vector<" "std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >" "," 
"std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > >" " >"; - } - }; - } - -SWIGINTERN swig::SwigPyIterator *std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg__iterator(std::vector< std::vector< std::vector< double > > > *self,PyObject **PYTHON_SELF){ - return swig::make_output_iterator(self->begin(), self->begin(), self->end(), *PYTHON_SELF); - } -SWIGINTERN bool std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg____nonzero__(std::vector< std::vector< std::vector< double > > > const *self){ - return !(self->empty()); - } -SWIGINTERN bool std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg____bool__(std::vector< std::vector< std::vector< double > > > const *self){ - return !(self->empty()); - } -SWIGINTERN std::vector< std::vector< std::vector< double > > >::size_type std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg____len__(std::vector< std::vector< std::vector< double > > > const *self){ - return self->size(); - } -SWIGINTERN std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > *std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg____getslice__(std::vector< std::vector< std::vector< double > > > *self,std::vector< std::vector< std::vector< double > > >::difference_type i,std::vector< std::vector< std::vector< double > > >::difference_type j){ - return swig::getslice(self, i, j, 1); - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg____setslice____SWIG_0(std::vector< std::vector< std::vector< double > > > *self,std::vector< std::vector< std::vector< double > > >::difference_type i,std::vector< std::vector< std::vector< double > > >::difference_type j){ - swig::setslice(self, i, j, 1, std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >()); - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg____setslice____SWIG_1(std::vector< std::vector< std::vector< double > > > *self,std::vector< std::vector< std::vector< double > > >::difference_type i,std::vector< std::vector< std::vector< double > > >::difference_type j,std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > const &v){ - swig::setslice(self, i, j, 1, v); - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg____delslice__(std::vector< std::vector< std::vector< double > > > *self,std::vector< std::vector< std::vector< double > > >::difference_type i,std::vector< std::vector< std::vector< double > > >::difference_type j){ - swig::delslice(self, i, j, 1); - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg____delitem____SWIG_0(std::vector< std::vector< std::vector< double > > > *self,std::vector< std::vector< std::vector< double > > >::difference_type i){ - swig::erase(self, swig::getpos(self, i)); - } 
-SWIGINTERN std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > *std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg____getitem____SWIG_0(std::vector< std::vector< std::vector< double > > > *self,PySliceObject *slice){ - Py_ssize_t i, j, step; - if( !PySlice_Check(slice) ) { - SWIG_Error(SWIG_TypeError, "Slice object expected."); - return NULL; - } - PySlice_GetIndices(SWIGPY_SLICE_ARG(slice), (Py_ssize_t)self->size(), &i, &j, &step); - std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >::difference_type id = i; - std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >::difference_type jd = j; - return swig::getslice(self, id, jd, step); - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg____setitem____SWIG_0(std::vector< std::vector< std::vector< double > > > *self,PySliceObject *slice,std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > const &v){ - Py_ssize_t i, j, step; - if( !PySlice_Check(slice) ) { - SWIG_Error(SWIG_TypeError, "Slice object expected."); - return; - } - PySlice_GetIndices(SWIGPY_SLICE_ARG(slice), (Py_ssize_t)self->size(), &i, &j, &step); - std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >::difference_type id = i; - std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >::difference_type jd = j; - swig::setslice(self, id, jd, step, v); - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg____setitem____SWIG_1(std::vector< std::vector< std::vector< double > > > *self,PySliceObject *slice){ - Py_ssize_t i, j, step; - if( !PySlice_Check(slice) ) { - SWIG_Error(SWIG_TypeError, "Slice object expected."); - return; - } - PySlice_GetIndices(SWIGPY_SLICE_ARG(slice), (Py_ssize_t)self->size(), &i, &j, &step); - std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >::difference_type id = i; - std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< 
double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >::difference_type jd = j; - swig::delslice(self, id, jd, step); - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg____delitem____SWIG_1(std::vector< std::vector< std::vector< double > > > *self,PySliceObject *slice){ - Py_ssize_t i, j, step; - if( !PySlice_Check(slice) ) { - SWIG_Error(SWIG_TypeError, "Slice object expected."); - return; - } - PySlice_GetIndices(SWIGPY_SLICE_ARG(slice), (Py_ssize_t)self->size(), &i, &j, &step); - std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >::difference_type id = i; - std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >::difference_type jd = j; - swig::delslice(self, id, jd, step); - } -SWIGINTERN std::vector< std::vector< std::vector< double > > >::value_type const &std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg____getitem____SWIG_1(std::vector< std::vector< std::vector< double > > > const *self,std::vector< std::vector< std::vector< double > > >::difference_type i){ - return *(swig::cgetpos(self, i)); - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg____setitem____SWIG_2(std::vector< std::vector< std::vector< double > > > *self,std::vector< std::vector< std::vector< double > > >::difference_type i,std::vector< std::vector< std::vector< double > > >::value_type const &x){ - *(swig::getpos(self,i)) = x; - } -SWIGINTERN std::vector< std::vector< std::vector< double > > >::value_type std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg__pop(std::vector< std::vector< std::vector< double > > > *self){ - if (self->size() == 0) - throw std::out_of_range("pop from empty container"); - std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >::value_type x = self->back(); - self->pop_back(); - return x; - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg__append(std::vector< std::vector< std::vector< double > > > *self,std::vector< std::vector< std::vector< double > > >::value_type const &x){ - self->push_back(x); - } -SWIGINTERN std::vector< std::vector< std::vector< double > > >::iterator std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg__erase__SWIG_0(std::vector< std::vector< std::vector< double > > > *self,std::vector< std::vector< std::vector< double > > >::iterator pos){ return self->erase(pos); } -SWIGINTERN std::vector< std::vector< std::vector< double > > >::iterator std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg__erase__SWIG_1(std::vector< std::vector< std::vector< double > > > *self,std::vector< std::vector< std::vector< double > > >::iterator first,std::vector< std::vector< std::vector< double > > >::iterator last){ return self->erase(first, last); } 
-SWIGINTERN std::vector< std::vector< std::vector< double > > >::iterator std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg__insert__SWIG_0(std::vector< std::vector< std::vector< double > > > *self,std::vector< std::vector< std::vector< double > > >::iterator pos,std::vector< std::vector< std::vector< double > > >::value_type const &x){ return self->insert(pos, x); } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg__insert__SWIG_1(std::vector< std::vector< std::vector< double > > > *self,std::vector< std::vector< std::vector< double > > >::iterator pos,std::vector< std::vector< std::vector< double > > >::size_type n,std::vector< std::vector< std::vector< double > > >::value_type const &x){ self->insert(pos, n, x); } - - namespace swig { - template <> struct traits >,std::allocator< std::vector< int,std::allocator< int > > > >, std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > > { - typedef pointer_category category; - static const char* type_name() { - return "std::vector<" "std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >" "," "std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > >" " >"; - } - }; - } - -SWIGINTERN swig::SwigPyIterator *std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg__iterator(std::vector< std::vector< std::vector< int > > > *self,PyObject **PYTHON_SELF){ - return swig::make_output_iterator(self->begin(), self->begin(), self->end(), *PYTHON_SELF); - } -SWIGINTERN bool std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____nonzero__(std::vector< std::vector< std::vector< int > > > const *self){ - return !(self->empty()); - } -SWIGINTERN bool std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____bool__(std::vector< std::vector< std::vector< int > > > const *self){ - return !(self->empty()); - } -SWIGINTERN std::vector< std::vector< std::vector< int > > >::size_type std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____len__(std::vector< std::vector< std::vector< int > > > const *self){ - return self->size(); - } -SWIGINTERN std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____getslice__(std::vector< std::vector< std::vector< int > > > *self,std::vector< std::vector< std::vector< int > > >::difference_type i,std::vector< std::vector< std::vector< int > > >::difference_type j){ - return swig::getslice(self, i, j, 1); - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____setslice____SWIG_0(std::vector< std::vector< std::vector< int > > > *self,std::vector< std::vector< std::vector< int > > >::difference_type i,std::vector< std::vector< std::vector< int > > >::difference_type j){ - swig::setslice(self, i, j, 1, std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >()); - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____setslice____SWIG_1(std::vector< std::vector< 
std::vector< int > > > *self,std::vector< std::vector< std::vector< int > > >::difference_type i,std::vector< std::vector< std::vector< int > > >::difference_type j,std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &v){ - swig::setslice(self, i, j, 1, v); - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____delslice__(std::vector< std::vector< std::vector< int > > > *self,std::vector< std::vector< std::vector< int > > >::difference_type i,std::vector< std::vector< std::vector< int > > >::difference_type j){ - swig::delslice(self, i, j, 1); - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____delitem____SWIG_0(std::vector< std::vector< std::vector< int > > > *self,std::vector< std::vector< std::vector< int > > >::difference_type i){ - swig::erase(self, swig::getpos(self, i)); - } -SWIGINTERN std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____getitem____SWIG_0(std::vector< std::vector< std::vector< int > > > *self,PySliceObject *slice){ - Py_ssize_t i, j, step; - if( !PySlice_Check(slice) ) { - SWIG_Error(SWIG_TypeError, "Slice object expected."); - return NULL; - } - PySlice_GetIndices(SWIGPY_SLICE_ARG(slice), (Py_ssize_t)self->size(), &i, &j, &step); - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >::difference_type id = i; - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >::difference_type jd = j; - return swig::getslice(self, id, jd, step); - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____setitem____SWIG_0(std::vector< std::vector< std::vector< int > > > *self,PySliceObject *slice,std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &v){ - Py_ssize_t i, j, step; - if( !PySlice_Check(slice) ) { - SWIG_Error(SWIG_TypeError, "Slice object expected."); - return; - } - PySlice_GetIndices(SWIGPY_SLICE_ARG(slice), (Py_ssize_t)self->size(), &i, &j, &step); - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >::difference_type id = i; - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > 
> > >::difference_type jd = j; - swig::setslice(self, id, jd, step, v); - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____setitem____SWIG_1(std::vector< std::vector< std::vector< int > > > *self,PySliceObject *slice){ - Py_ssize_t i, j, step; - if( !PySlice_Check(slice) ) { - SWIG_Error(SWIG_TypeError, "Slice object expected."); - return; - } - PySlice_GetIndices(SWIGPY_SLICE_ARG(slice), (Py_ssize_t)self->size(), &i, &j, &step); - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >::difference_type id = i; - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >::difference_type jd = j; - swig::delslice(self, id, jd, step); - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____delitem____SWIG_1(std::vector< std::vector< std::vector< int > > > *self,PySliceObject *slice){ - Py_ssize_t i, j, step; - if( !PySlice_Check(slice) ) { - SWIG_Error(SWIG_TypeError, "Slice object expected."); - return; - } - PySlice_GetIndices(SWIGPY_SLICE_ARG(slice), (Py_ssize_t)self->size(), &i, &j, &step); - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >::difference_type id = i; - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >::difference_type jd = j; - swig::delslice(self, id, jd, step); - } -SWIGINTERN std::vector< std::vector< std::vector< int > > >::value_type const &std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____getitem____SWIG_1(std::vector< std::vector< std::vector< int > > > const *self,std::vector< std::vector< std::vector< int > > >::difference_type i){ - return *(swig::cgetpos(self, i)); - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____setitem____SWIG_2(std::vector< std::vector< std::vector< int > > > *self,std::vector< std::vector< std::vector< int > > >::difference_type i,std::vector< std::vector< std::vector< int > > >::value_type const &x){ - *(swig::getpos(self,i)) = x; - } -SWIGINTERN std::vector< std::vector< std::vector< int > > >::value_type std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg__pop(std::vector< std::vector< std::vector< int > > > *self){ - if (self->size() == 0) - throw std::out_of_range("pop from empty container"); - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >::value_type x = self->back(); - self->pop_back(); - return x; - } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg__append(std::vector< std::vector< std::vector< int > > > *self,std::vector< std::vector< std::vector< int > > >::value_type 
const &x){ - self->push_back(x); - } -SWIGINTERN std::vector< std::vector< std::vector< int > > >::iterator std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg__erase__SWIG_0(std::vector< std::vector< std::vector< int > > > *self,std::vector< std::vector< std::vector< int > > >::iterator pos){ return self->erase(pos); } -SWIGINTERN std::vector< std::vector< std::vector< int > > >::iterator std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg__erase__SWIG_1(std::vector< std::vector< std::vector< int > > > *self,std::vector< std::vector< std::vector< int > > >::iterator first,std::vector< std::vector< std::vector< int > > >::iterator last){ return self->erase(first, last); } -SWIGINTERN std::vector< std::vector< std::vector< int > > >::iterator std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg__insert__SWIG_0(std::vector< std::vector< std::vector< int > > > *self,std::vector< std::vector< std::vector< int > > >::iterator pos,std::vector< std::vector< std::vector< int > > >::value_type const &x){ return self->insert(pos, x); } -SWIGINTERN void std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg__insert__SWIG_1(std::vector< std::vector< std::vector< int > > > *self,std::vector< std::vector< std::vector< int > > >::iterator pos,std::vector< std::vector< std::vector< int > > >::size_type n,std::vector< std::vector< std::vector< int > > >::value_type const &x){ self->insert(pos, n, x); } - - namespace swig { - template <> struct traits< PathTrie > { - typedef pointer_category category; - static const char* type_name() { return"PathTrie"; } - }; - } - - - namespace swig { - template <> struct traits > > { - typedef value_category category; - static const char* type_name() { - return "std::vector<" "PathTrie" " *," "std::allocator< PathTrie * >" " >"; - } - }; - } - -SWIGINTERN swig::SwigPyIterator *std_vector_Sl_PathTrie_Sm__Sg__iterator(std::vector< PathTrie * > *self,PyObject **PYTHON_SELF){ - return swig::make_output_iterator(self->begin(), self->begin(), self->end(), *PYTHON_SELF); - } -SWIGINTERN bool std_vector_Sl_PathTrie_Sm__Sg____nonzero__(std::vector< PathTrie * > const *self){ - return !(self->empty()); - } -SWIGINTERN bool std_vector_Sl_PathTrie_Sm__Sg____bool__(std::vector< PathTrie * > const *self){ - return !(self->empty()); - } -SWIGINTERN std::vector< PathTrie * >::size_type std_vector_Sl_PathTrie_Sm__Sg____len__(std::vector< PathTrie * > const *self){ - return self->size(); - } -SWIGINTERN std::vector< PathTrie *,std::allocator< PathTrie * > > *std_vector_Sl_PathTrie_Sm__Sg____getslice__(std::vector< PathTrie * > *self,std::vector< PathTrie * >::difference_type i,std::vector< PathTrie * >::difference_type j){ - return swig::getslice(self, i, j, 1); - } -SWIGINTERN void std_vector_Sl_PathTrie_Sm__Sg____setslice____SWIG_0(std::vector< PathTrie * > *self,std::vector< PathTrie * >::difference_type i,std::vector< PathTrie * >::difference_type j){ - swig::setslice(self, i, j, 1, std::vector< PathTrie*,std::allocator< PathTrie * > >()); - } -SWIGINTERN void std_vector_Sl_PathTrie_Sm__Sg____setslice____SWIG_1(std::vector< PathTrie * > *self,std::vector< PathTrie * >::difference_type i,std::vector< PathTrie * >::difference_type j,std::vector< PathTrie *,std::allocator< PathTrie * > > const &v){ - swig::setslice(self, i, j, 1, v); - } -SWIGINTERN void std_vector_Sl_PathTrie_Sm__Sg____delslice__(std::vector< PathTrie * > *self,std::vector< PathTrie * >::difference_type i,std::vector< PathTrie * >::difference_type j){ - swig::delslice(self, i, j, 1); - } -SWIGINTERN void 
std_vector_Sl_PathTrie_Sm__Sg____delitem____SWIG_0(std::vector< PathTrie * > *self,std::vector< PathTrie * >::difference_type i){ - swig::erase(self, swig::getpos(self, i)); - } -SWIGINTERN std::vector< PathTrie *,std::allocator< PathTrie * > > *std_vector_Sl_PathTrie_Sm__Sg____getitem____SWIG_0(std::vector< PathTrie * > *self,PySliceObject *slice){ - Py_ssize_t i, j, step; - if( !PySlice_Check(slice) ) { - SWIG_Error(SWIG_TypeError, "Slice object expected."); - return NULL; - } - PySlice_GetIndices(SWIGPY_SLICE_ARG(slice), (Py_ssize_t)self->size(), &i, &j, &step); - std::vector< PathTrie*,std::allocator< PathTrie * > >::difference_type id = i; - std::vector< PathTrie*,std::allocator< PathTrie * > >::difference_type jd = j; - return swig::getslice(self, id, jd, step); - } -SWIGINTERN void std_vector_Sl_PathTrie_Sm__Sg____setitem____SWIG_0(std::vector< PathTrie * > *self,PySliceObject *slice,std::vector< PathTrie *,std::allocator< PathTrie * > > const &v){ - Py_ssize_t i, j, step; - if( !PySlice_Check(slice) ) { - SWIG_Error(SWIG_TypeError, "Slice object expected."); - return; - } - PySlice_GetIndices(SWIGPY_SLICE_ARG(slice), (Py_ssize_t)self->size(), &i, &j, &step); - std::vector< PathTrie*,std::allocator< PathTrie * > >::difference_type id = i; - std::vector< PathTrie*,std::allocator< PathTrie * > >::difference_type jd = j; - swig::setslice(self, id, jd, step, v); - } -SWIGINTERN void std_vector_Sl_PathTrie_Sm__Sg____setitem____SWIG_1(std::vector< PathTrie * > *self,PySliceObject *slice){ - Py_ssize_t i, j, step; - if( !PySlice_Check(slice) ) { - SWIG_Error(SWIG_TypeError, "Slice object expected."); - return; - } - PySlice_GetIndices(SWIGPY_SLICE_ARG(slice), (Py_ssize_t)self->size(), &i, &j, &step); - std::vector< PathTrie*,std::allocator< PathTrie * > >::difference_type id = i; - std::vector< PathTrie*,std::allocator< PathTrie * > >::difference_type jd = j; - swig::delslice(self, id, jd, step); - } -SWIGINTERN void std_vector_Sl_PathTrie_Sm__Sg____delitem____SWIG_1(std::vector< PathTrie * > *self,PySliceObject *slice){ - Py_ssize_t i, j, step; - if( !PySlice_Check(slice) ) { - SWIG_Error(SWIG_TypeError, "Slice object expected."); - return; - } - PySlice_GetIndices(SWIGPY_SLICE_ARG(slice), (Py_ssize_t)self->size(), &i, &j, &step); - std::vector< PathTrie*,std::allocator< PathTrie * > >::difference_type id = i; - std::vector< PathTrie*,std::allocator< PathTrie * > >::difference_type jd = j; - swig::delslice(self, id, jd, step); - } -SWIGINTERN std::vector< PathTrie * >::value_type std_vector_Sl_PathTrie_Sm__Sg____getitem____SWIG_1(std::vector< PathTrie * > *self,std::vector< PathTrie * >::difference_type i){ - return *(swig::cgetpos(self, i)); - } -SWIGINTERN void std_vector_Sl_PathTrie_Sm__Sg____setitem____SWIG_2(std::vector< PathTrie * > *self,std::vector< PathTrie * >::difference_type i,std::vector< PathTrie * >::value_type x){ - *(swig::getpos(self,i)) = x; - } -SWIGINTERN std::vector< PathTrie * >::value_type std_vector_Sl_PathTrie_Sm__Sg__pop(std::vector< PathTrie * > *self){ - if (self->size() == 0) - throw std::out_of_range("pop from empty container"); - std::vector< PathTrie*,std::allocator< PathTrie * > >::value_type x = self->back(); - self->pop_back(); - return x; - } -SWIGINTERN void std_vector_Sl_PathTrie_Sm__Sg__append(std::vector< PathTrie * > *self,std::vector< PathTrie * >::value_type x){ - self->push_back(x); - } -SWIGINTERN std::vector< PathTrie * >::iterator std_vector_Sl_PathTrie_Sm__Sg__erase__SWIG_0(std::vector< PathTrie * > *self,std::vector< PathTrie * >::iterator 
pos){ return self->erase(pos); } -SWIGINTERN std::vector< PathTrie * >::iterator std_vector_Sl_PathTrie_Sm__Sg__erase__SWIG_1(std::vector< PathTrie * > *self,std::vector< PathTrie * >::iterator first,std::vector< PathTrie * >::iterator last){ return self->erase(first, last); } -SWIGINTERN std::vector< PathTrie * >::iterator std_vector_Sl_PathTrie_Sm__Sg__insert__SWIG_0(std::vector< PathTrie * > *self,std::vector< PathTrie * >::iterator pos,std::vector< PathTrie * >::value_type x){ return self->insert(pos, x); } -SWIGINTERN void std_vector_Sl_PathTrie_Sm__Sg__insert__SWIG_1(std::vector< PathTrie * > *self,std::vector< PathTrie * >::iterator pos,std::vector< PathTrie * >::size_type n,std::vector< PathTrie * >::value_type x){ self->insert(pos, n, x); } - -namespace swig { - template <> struct traits< bool > { - typedef value_category category; - static const char* type_name() { return"bool"; } - }; - template <> struct traits_asval< bool > { - typedef bool value_type; - static int asval(PyObject *obj, value_type *val) { - return SWIG_AsVal_bool (obj, val); - } - }; - template <> struct traits_from< bool > { - typedef bool value_type; - static PyObject *from(const value_type& val) { - return SWIG_From_bool (val); - } - }; -} - - - namespace swig { - template <> struct traits > > { - typedef value_category category; - static const char* type_name() { - return "std::vector >"; - } - }; - } - -SWIGINTERN swig::SwigPyIterator *std_vector_Sl_bool_Sg__iterator(std::vector< bool > *self,PyObject **PYTHON_SELF){ - return swig::make_output_iterator(self->begin(), self->begin(), self->end(), *PYTHON_SELF); - } -SWIGINTERN bool std_vector_Sl_bool_Sg____nonzero__(std::vector< bool > const *self){ - return !(self->empty()); - } -SWIGINTERN bool std_vector_Sl_bool_Sg____bool__(std::vector< bool > const *self){ - return !(self->empty()); - } -SWIGINTERN std::vector< bool >::size_type std_vector_Sl_bool_Sg____len__(std::vector< bool > const *self){ - return self->size(); - } -SWIGINTERN std::vector< bool,std::allocator< bool > > *std_vector_Sl_bool_Sg____getslice__(std::vector< bool > *self,std::vector< bool >::difference_type i,std::vector< bool >::difference_type j){ - return swig::getslice(self, i, j, 1); - } -SWIGINTERN void std_vector_Sl_bool_Sg____setslice____SWIG_0(std::vector< bool > *self,std::vector< bool >::difference_type i,std::vector< bool >::difference_type j){ - swig::setslice(self, i, j, 1, std::vector >()); - } -SWIGINTERN void std_vector_Sl_bool_Sg____setslice____SWIG_1(std::vector< bool > *self,std::vector< bool >::difference_type i,std::vector< bool >::difference_type j,std::vector< bool,std::allocator< bool > > const &v){ - swig::setslice(self, i, j, 1, v); - } -SWIGINTERN void std_vector_Sl_bool_Sg____delslice__(std::vector< bool > *self,std::vector< bool >::difference_type i,std::vector< bool >::difference_type j){ - swig::delslice(self, i, j, 1); - } -SWIGINTERN void std_vector_Sl_bool_Sg____delitem____SWIG_0(std::vector< bool > *self,std::vector< bool >::difference_type i){ - swig::erase(self, swig::getpos(self, i)); - } -SWIGINTERN std::vector< bool,std::allocator< bool > > *std_vector_Sl_bool_Sg____getitem____SWIG_0(std::vector< bool > *self,PySliceObject *slice){ - Py_ssize_t i, j, step; - if( !PySlice_Check(slice) ) { - SWIG_Error(SWIG_TypeError, "Slice object expected."); - return NULL; - } - PySlice_GetIndices(SWIGPY_SLICE_ARG(slice), (Py_ssize_t)self->size(), &i, &j, &step); - std::vector >::difference_type id = i; - std::vector >::difference_type jd = j; - return 
swig::getslice(self, id, jd, step); - } -SWIGINTERN void std_vector_Sl_bool_Sg____setitem____SWIG_0(std::vector< bool > *self,PySliceObject *slice,std::vector< bool,std::allocator< bool > > const &v){ - Py_ssize_t i, j, step; - if( !PySlice_Check(slice) ) { - SWIG_Error(SWIG_TypeError, "Slice object expected."); - return; - } - PySlice_GetIndices(SWIGPY_SLICE_ARG(slice), (Py_ssize_t)self->size(), &i, &j, &step); - std::vector >::difference_type id = i; - std::vector >::difference_type jd = j; - swig::setslice(self, id, jd, step, v); - } -SWIGINTERN void std_vector_Sl_bool_Sg____setitem____SWIG_1(std::vector< bool > *self,PySliceObject *slice){ - Py_ssize_t i, j, step; - if( !PySlice_Check(slice) ) { - SWIG_Error(SWIG_TypeError, "Slice object expected."); - return; - } - PySlice_GetIndices(SWIGPY_SLICE_ARG(slice), (Py_ssize_t)self->size(), &i, &j, &step); - std::vector >::difference_type id = i; - std::vector >::difference_type jd = j; - swig::delslice(self, id, jd, step); - } -SWIGINTERN void std_vector_Sl_bool_Sg____delitem____SWIG_1(std::vector< bool > *self,PySliceObject *slice){ - Py_ssize_t i, j, step; - if( !PySlice_Check(slice) ) { - SWIG_Error(SWIG_TypeError, "Slice object expected."); - return; - } - PySlice_GetIndices(SWIGPY_SLICE_ARG(slice), (Py_ssize_t)self->size(), &i, &j, &step); - std::vector >::difference_type id = i; - std::vector >::difference_type jd = j; - swig::delslice(self, id, jd, step); - } -SWIGINTERN std::vector< bool >::value_type std_vector_Sl_bool_Sg____getitem____SWIG_1(std::vector< bool > *self,std::vector< bool >::difference_type i){ - return *(swig::cgetpos(self, i)); - } -SWIGINTERN void std_vector_Sl_bool_Sg____setitem____SWIG_2(std::vector< bool > *self,std::vector< bool >::difference_type i,std::vector< bool >::value_type x){ - *(swig::getpos(self,i)) = x; - } -SWIGINTERN std::vector< bool >::value_type std_vector_Sl_bool_Sg__pop(std::vector< bool > *self){ - if (self->size() == 0) - throw std::out_of_range("pop from empty container"); - std::vector >::value_type x = self->back(); - self->pop_back(); - return x; - } -SWIGINTERN void std_vector_Sl_bool_Sg__append(std::vector< bool > *self,std::vector< bool >::value_type x){ - self->push_back(x); - } -SWIGINTERN std::vector< bool >::iterator std_vector_Sl_bool_Sg__erase__SWIG_0(std::vector< bool > *self,std::vector< bool >::iterator pos){ return self->erase(pos); } -SWIGINTERN std::vector< bool >::iterator std_vector_Sl_bool_Sg__erase__SWIG_1(std::vector< bool > *self,std::vector< bool >::iterator first,std::vector< bool >::iterator last){ return self->erase(first, last); } -SWIGINTERN std::vector< bool >::iterator std_vector_Sl_bool_Sg__insert__SWIG_0(std::vector< bool > *self,std::vector< bool >::iterator pos,std::vector< bool >::value_type x){ return self->insert(pos, x); } -SWIGINTERN void std_vector_Sl_bool_Sg__insert__SWIG_1(std::vector< bool > *self,std::vector< bool >::iterator pos,std::vector< bool >::size_type n,std::vector< bool >::value_type x){ self->insert(pos, n, x); } -#ifdef __cplusplus -extern "C" { -#endif -SWIGINTERN PyObject *_wrap_delete_SwigPyIterator(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - swig::SwigPyIterator *arg1 = (swig::SwigPyIterator *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:delete_SwigPyIterator",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_swig__SwigPyIterator, SWIG_POINTER_DISOWN | 0 ); - if (!SWIG_IsOK(res1)) { - 
SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "delete_SwigPyIterator" "', argument " "1"" of type '" "swig::SwigPyIterator *""'"); - } - arg1 = reinterpret_cast< swig::SwigPyIterator * >(argp1); - delete arg1; - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_SwigPyIterator_value(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - swig::SwigPyIterator *arg1 = (swig::SwigPyIterator *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:SwigPyIterator_value",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_swig__SwigPyIterator, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SwigPyIterator_value" "', argument " "1"" of type '" "swig::SwigPyIterator const *""'"); - } - arg1 = reinterpret_cast< swig::SwigPyIterator * >(argp1); - try { - result = (PyObject *)((swig::SwigPyIterator const *)arg1)->value(); - } - catch(swig::stop_iteration &_e) { - { - (void)_e; - SWIG_SetErrorObj(PyExc_StopIteration, SWIG_Py_Void()); - SWIG_fail; - } - } - - resultobj = result; - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_SwigPyIterator_incr__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - swig::SwigPyIterator *arg1 = (swig::SwigPyIterator *) 0 ; - size_t arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - swig::SwigPyIterator *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:SwigPyIterator_incr",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_swig__SwigPyIterator, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SwigPyIterator_incr" "', argument " "1"" of type '" "swig::SwigPyIterator *""'"); - } - arg1 = reinterpret_cast< swig::SwigPyIterator * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SwigPyIterator_incr" "', argument " "2"" of type '" "size_t""'"); - } - arg2 = static_cast< size_t >(val2); - try { - result = (swig::SwigPyIterator *)(arg1)->incr(arg2); - } - catch(swig::stop_iteration &_e) { - { - (void)_e; - SWIG_SetErrorObj(PyExc_StopIteration, SWIG_Py_Void()); - SWIG_fail; - } - } - - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_swig__SwigPyIterator, 0 | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_SwigPyIterator_incr__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - swig::SwigPyIterator *arg1 = (swig::SwigPyIterator *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - swig::SwigPyIterator *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:SwigPyIterator_incr",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_swig__SwigPyIterator, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SwigPyIterator_incr" "', argument " "1"" of type '" "swig::SwigPyIterator *""'"); - } - arg1 = reinterpret_cast< swig::SwigPyIterator * >(argp1); - try { - result = (swig::SwigPyIterator *)(arg1)->incr(); - } - catch(swig::stop_iteration &_e) { - { - (void)_e; - SWIG_SetErrorObj(PyExc_StopIteration, SWIG_Py_Void()); - SWIG_fail; - } - } - - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), 
[Deleted SWIG-generated wrapper code omitted (ctc_decoder swig Python bindings). The removed hunks cover the auto-generated wrappers for swig::SwigPyIterator (incr, decr, distance, equal, copy, next, __next__, previous, advance, and the ==, !=, +=, -=, +, - operator overloads plus their overload-dispatch helpers), for PathTrie (constructor/destructor, get_path_trie, get_path_vec, iterate_to_vec, iterate_to_vec_only, set_dictionary, set_matcher, is_empty, remove, and the log_prob_b_prev / log_prob_nb_prev / log_prob_b_cur / log_prob_nb_cur / score / approx_ctc / character / parent accessors), and for DoubleVector, i.e. std::vector<double> (iterator, __nonzero__, __bool__, __len__, __getslice__, __setslice__), together with the corresponding swigregister entries.]
- int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[3], (std::vector< double,std::allocator< double > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_DoubleVector___setslice____SWIG_1(self, args); - } - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'DoubleVector___setslice__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< double >::__setslice__(std::vector< double >::difference_type,std::vector< double >::difference_type)\n" - " std::vector< double >::__setslice__(std::vector< double >::difference_type,std::vector< double >::difference_type,std::vector< double,std::allocator< double > > const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector___delslice__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - std::vector< double >::difference_type arg2 ; - std::vector< double >::difference_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:DoubleVector___delslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector___delslice__" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "DoubleVector___delslice__" "', argument " "2"" of type '" "std::vector< double >::difference_type""'"); - } - arg2 = static_cast< std::vector< double >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "DoubleVector___delslice__" "', argument " "3"" of type '" "std::vector< double >::difference_type""'"); - } - arg3 = static_cast< std::vector< double >::difference_type >(val3); - try { - std_vector_Sl_double_Sg____delslice__(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector___delitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - std::vector< double >::difference_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:DoubleVector___delitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector___delitem__" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = 
reinterpret_cast< std::vector< double > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "DoubleVector___delitem__" "', argument " "2"" of type '" "std::vector< double >::difference_type""'"); - } - arg2 = static_cast< std::vector< double >::difference_type >(val2); - try { - std_vector_Sl_double_Sg____delitem____SWIG_0(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector___getitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< double,std::allocator< double > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:DoubleVector___getitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector___getitem__" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector___getitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - result = (std::vector< double,std::allocator< double > > *)std_vector_Sl_double_Sg____getitem____SWIG_0(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector___setitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - std::vector< double,std::allocator< double > > *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:DoubleVector___setitem__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector___setitem__" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector___setitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - { - std::vector< double,std::allocator< double > > *ptr = (std::vector< double,std::allocator< double > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - 
SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "DoubleVector___setitem__" "', argument " "3"" of type '" "std::vector< double,std::allocator< double > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "DoubleVector___setitem__" "', argument " "3"" of type '" "std::vector< double,std::allocator< double > > const &""'"); - } - arg3 = ptr; - } - try { - std_vector_Sl_double_Sg____setitem____SWIG_0(arg1,arg2,(std::vector< double,std::allocator< double > > const &)*arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector___setitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:DoubleVector___setitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector___setitem__" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector___setitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - std_vector_Sl_double_Sg____setitem____SWIG_1(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector___delitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:DoubleVector___delitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector___delitem__" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector___delitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - std_vector_Sl_double_Sg____delitem____SWIG_1(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} 
- - -SWIGINTERN PyObject *_wrap_DoubleVector___delitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< double,std::allocator< double > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_DoubleVector___delitem____SWIG_1(self, args); - } - } - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< double,std::allocator< double > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_DoubleVector___delitem____SWIG_0(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'DoubleVector___delitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< double >::__delitem__(std::vector< double >::difference_type)\n" - " std::vector< double >::__delitem__(PySliceObject *)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector___getitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - std::vector< double >::difference_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< double >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:DoubleVector___getitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector___getitem__" "', argument " "1"" of type '" "std::vector< double > const *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "DoubleVector___getitem__" "', argument " "2"" of type '" "std::vector< double >::difference_type""'"); - } - arg2 = static_cast< std::vector< double >::difference_type >(val2); - try { - result = (std::vector< double >::value_type *) &std_vector_Sl_double_Sg____getitem____SWIG_1((std::vector< double > const *)arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = SWIG_From_double(static_cast< double >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector___getitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< double,std::allocator< double > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_DoubleVector___getitem____SWIG_0(self, args); - } - } - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< double,std::allocator< double > >**)(0)); - _v = 
SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_DoubleVector___getitem____SWIG_1(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'DoubleVector___getitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< double >::__getitem__(PySliceObject *)\n" - " std::vector< double >::__getitem__(std::vector< double >::difference_type) const\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector___setitem____SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - std::vector< double >::difference_type arg2 ; - std::vector< double >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - std::vector< double >::value_type temp3 ; - double val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:DoubleVector___setitem__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector___setitem__" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "DoubleVector___setitem__" "', argument " "2"" of type '" "std::vector< double >::difference_type""'"); - } - arg2 = static_cast< std::vector< double >::difference_type >(val2); - ecode3 = SWIG_AsVal_double(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "DoubleVector___setitem__" "', argument " "3"" of type '" "std::vector< double >::value_type""'"); - } - temp3 = static_cast< std::vector< double >::value_type >(val3); - arg3 = &temp3; - try { - std_vector_Sl_double_Sg____setitem____SWIG_2(arg1,arg2,(double const &)*arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector___setitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< double,std::allocator< double > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_DoubleVector___setitem____SWIG_1(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< double,std::allocator< double > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - int res = swig::asptr(argv[2], (std::vector< double,std::allocator< double > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_DoubleVector___setitem____SWIG_0(self, args); - } - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< double,std::allocator< double > >**)(0)); - 
_v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_double(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_DoubleVector___setitem____SWIG_2(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'DoubleVector___setitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< double >::__setitem__(PySliceObject *,std::vector< double,std::allocator< double > > const &)\n" - " std::vector< double >::__setitem__(PySliceObject *)\n" - " std::vector< double >::__setitem__(std::vector< double >::difference_type,std::vector< double >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_pop(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< double >::value_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector_pop",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_pop" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - try { - result = (std::vector< double >::value_type)std_vector_Sl_double_Sg__pop(arg1); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = SWIG_From_double(static_cast< double >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_append(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - std::vector< double >::value_type *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - std::vector< double >::value_type temp2 ; - double val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:DoubleVector_append",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_append" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - ecode2 = SWIG_AsVal_double(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "DoubleVector_append" "', argument " "2"" of type '" "std::vector< double >::value_type""'"); - } - temp2 = static_cast< std::vector< double >::value_type >(val2); - arg2 = &temp2; - std_vector_Sl_double_Sg__append(arg1,(double const &)*arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_DoubleVector__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)":new_DoubleVector")) SWIG_fail; - result = (std::vector< double > *)new std::vector< double >(); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, SWIG_POINTER_NEW | 0 ); - return 
resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_DoubleVector__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = 0 ; - int res1 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - std::vector< double > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_DoubleVector",&obj0)) SWIG_fail; - { - std::vector< double,std::allocator< double > > *ptr = (std::vector< double,std::allocator< double > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "new_DoubleVector" "', argument " "1"" of type '" "std::vector< double > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "new_DoubleVector" "', argument " "1"" of type '" "std::vector< double > const &""'"); - } - arg1 = ptr; - } - result = (std::vector< double > *)new std::vector< double >((std::vector< double > const &)*arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, SWIG_POINTER_NEW | 0 ); - if (SWIG_IsNewObj(res1)) delete arg1; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_empty(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector_empty",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_empty" "', argument " "1"" of type '" "std::vector< double > const *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - result = (bool)((std::vector< double > const *)arg1)->empty(); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_size(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< double >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector_size",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_size" "', argument " "1"" of type '" "std::vector< double > const *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - result = ((std::vector< double > const *)arg1)->size(); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_swap(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - std::vector< double > *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:DoubleVector_swap",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if 
(!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_swap" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, &argp2, SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 ); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "DoubleVector_swap" "', argument " "2"" of type '" "std::vector< double > &""'"); - } - if (!argp2) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "DoubleVector_swap" "', argument " "2"" of type '" "std::vector< double > &""'"); - } - arg2 = reinterpret_cast< std::vector< double > * >(argp2); - (arg1)->swap(*arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_begin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< double >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector_begin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_begin" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - result = (arg1)->begin(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< double >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_end(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< double >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector_end",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_end" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - result = (arg1)->end(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< double >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_rbegin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< double >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector_rbegin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_rbegin" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - result = (arg1)->rbegin(); - resultobj = 
SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< double >::reverse_iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_rend(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< double >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector_rend",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_rend" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - result = (arg1)->rend(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< double >::reverse_iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_clear(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector_clear",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_clear" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - (arg1)->clear(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_get_allocator(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - SwigValueWrapper< std::allocator< double > > result; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector_get_allocator",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_get_allocator" "', argument " "1"" of type '" "std::vector< double > const *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - result = ((std::vector< double > const *)arg1)->get_allocator(); - resultobj = SWIG_NewPointerObj((new std::vector< double >::allocator_type(static_cast< const std::vector< double >::allocator_type& >(result))), SWIGTYPE_p_std__allocatorT_double_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_DoubleVector__SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double >::size_type arg1 ; - size_t val1 ; - int ecode1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< double > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_DoubleVector",&obj0)) SWIG_fail; - ecode1 = SWIG_AsVal_size_t(obj0, &val1); - if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_DoubleVector" "', argument " "1"" 
of type '" "std::vector< double >::size_type""'"); - } - arg1 = static_cast< std::vector< double >::size_type >(val1); - result = (std::vector< double > *)new std::vector< double >(arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_pop_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector_pop_back",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_pop_back" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - (arg1)->pop_back(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_resize__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - std::vector< double >::size_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:DoubleVector_resize",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_resize" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "DoubleVector_resize" "', argument " "2"" of type '" "std::vector< double >::size_type""'"); - } - arg2 = static_cast< std::vector< double >::size_type >(val2); - (arg1)->resize(arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_erase__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - std::vector< double >::iterator arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< double >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OO:DoubleVector_erase",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_erase" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector_erase" "', argument " "2"" of type '" "std::vector< double >::iterator""'"); - } else { - swig::SwigPyIterator_T::iterator > *iter_t = dynamic_cast::iterator > *>(iter2); 
- if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector_erase" "', argument " "2"" of type '" "std::vector< double >::iterator""'"); - } - } - result = std_vector_Sl_double_Sg__erase__SWIG_0(arg1,arg2); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< double >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_erase__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - std::vector< double >::iterator arg2 ; - std::vector< double >::iterator arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - swig::SwigPyIterator *iter3 = 0 ; - int res3 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< double >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OOO:DoubleVector_erase",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_erase" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector_erase" "', argument " "2"" of type '" "std::vector< double >::iterator""'"); - } else { - swig::SwigPyIterator_T::iterator > *iter_t = dynamic_cast::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector_erase" "', argument " "2"" of type '" "std::vector< double >::iterator""'"); - } - } - res3 = SWIG_ConvertPtr(obj2, SWIG_as_voidptrptr(&iter3), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res3) || !iter3) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector_erase" "', argument " "3"" of type '" "std::vector< double >::iterator""'"); - } else { - swig::SwigPyIterator_T::iterator > *iter_t = dynamic_cast::iterator > *>(iter3); - if (iter_t) { - arg3 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector_erase" "', argument " "3"" of type '" "std::vector< double >::iterator""'"); - } - } - result = std_vector_Sl_double_Sg__erase__SWIG_1(arg1,arg2,arg3); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< double >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_erase(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< double,std::allocator< double > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], 
SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast::iterator > *>(iter) != 0)); - if (_v) { - return _wrap_DoubleVector_erase__SWIG_0(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< double,std::allocator< double > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast::iterator > *>(iter) != 0)); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[2], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast::iterator > *>(iter) != 0)); - if (_v) { - return _wrap_DoubleVector_erase__SWIG_1(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'DoubleVector_erase'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< double >::erase(std::vector< double >::iterator)\n" - " std::vector< double >::erase(std::vector< double >::iterator,std::vector< double >::iterator)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_new_DoubleVector__SWIG_3(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double >::size_type arg1 ; - std::vector< double >::value_type *arg2 = 0 ; - size_t val1 ; - int ecode1 = 0 ; - std::vector< double >::value_type temp2 ; - double val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< double > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:new_DoubleVector",&obj0,&obj1)) SWIG_fail; - ecode1 = SWIG_AsVal_size_t(obj0, &val1); - if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_DoubleVector" "', argument " "1"" of type '" "std::vector< double >::size_type""'"); - } - arg1 = static_cast< std::vector< double >::size_type >(val1); - ecode2 = SWIG_AsVal_double(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "new_DoubleVector" "', argument " "2"" of type '" "std::vector< double >::value_type""'"); - } - temp2 = static_cast< std::vector< double >::value_type >(val2); - arg2 = &temp2; - result = (std::vector< double > *)new std::vector< double >(arg1,(std::vector< double >::value_type const &)*arg2); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_DoubleVector(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 0) { - return _wrap_new_DoubleVector__SWIG_0(self, args); - } - if (argc == 1) { - int _v; - { - int res = SWIG_AsVal_size_t(argv[0], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_new_DoubleVector__SWIG_2(self, args); - } - } - if (argc == 1) { - int _v; - int res = swig::asptr(argv[0], (std::vector< double,std::allocator< double > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_new_DoubleVector__SWIG_1(self, args); - } - } - if (argc == 2) { - int _v; - { - int res = SWIG_AsVal_size_t(argv[0], 
NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_double(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_new_DoubleVector__SWIG_3(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'new_DoubleVector'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< double >::vector()\n" - " std::vector< double >::vector(std::vector< double > const &)\n" - " std::vector< double >::vector(std::vector< double >::size_type)\n" - " std::vector< double >::vector(std::vector< double >::size_type,std::vector< double >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_push_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - std::vector< double >::value_type *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - std::vector< double >::value_type temp2 ; - double val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:DoubleVector_push_back",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_push_back" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - ecode2 = SWIG_AsVal_double(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "DoubleVector_push_back" "', argument " "2"" of type '" "std::vector< double >::value_type""'"); - } - temp2 = static_cast< std::vector< double >::value_type >(val2); - arg2 = &temp2; - (arg1)->push_back((std::vector< double >::value_type const &)*arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_front(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< double >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector_front",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_front" "', argument " "1"" of type '" "std::vector< double > const *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - result = (std::vector< double >::value_type *) &((std::vector< double > const *)arg1)->front(); - resultobj = SWIG_From_double(static_cast< double >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< double >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector_back",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_back" "', argument " "1"" of type '" 
"std::vector< double > const *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - result = (std::vector< double >::value_type *) &((std::vector< double > const *)arg1)->back(); - resultobj = SWIG_From_double(static_cast< double >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_assign(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - std::vector< double >::size_type arg2 ; - std::vector< double >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - std::vector< double >::value_type temp3 ; - double val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:DoubleVector_assign",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_assign" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "DoubleVector_assign" "', argument " "2"" of type '" "std::vector< double >::size_type""'"); - } - arg2 = static_cast< std::vector< double >::size_type >(val2); - ecode3 = SWIG_AsVal_double(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "DoubleVector_assign" "', argument " "3"" of type '" "std::vector< double >::value_type""'"); - } - temp3 = static_cast< std::vector< double >::value_type >(val3); - arg3 = &temp3; - (arg1)->assign(arg2,(std::vector< double >::value_type const &)*arg3); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_resize__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - std::vector< double >::size_type arg2 ; - std::vector< double >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - std::vector< double >::value_type temp3 ; - double val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:DoubleVector_resize",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_resize" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "DoubleVector_resize" "', argument " "2"" of type '" "std::vector< double >::size_type""'"); - } - arg2 = static_cast< std::vector< double >::size_type >(val2); - ecode3 = SWIG_AsVal_double(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "DoubleVector_resize" "', argument " "3"" of type '" "std::vector< double >::value_type""'"); - } - temp3 = static_cast< std::vector< double >::value_type >(val3); - arg3 = &temp3; - 
(arg1)->resize(arg2,(std::vector< double >::value_type const &)*arg3); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_resize(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< double,std::allocator< double > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_DoubleVector_resize__SWIG_0(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< double,std::allocator< double > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_double(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_DoubleVector_resize__SWIG_1(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'DoubleVector_resize'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< double >::resize(std::vector< double >::size_type)\n" - " std::vector< double >::resize(std::vector< double >::size_type,std::vector< double >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_insert__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - std::vector< double >::iterator arg2 ; - std::vector< double >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - std::vector< double >::value_type temp3 ; - double val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< double >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OOO:DoubleVector_insert",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_insert" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector_insert" "', argument " "2"" of type '" "std::vector< double >::iterator""'"); - } else { - swig::SwigPyIterator_T<std::vector< double >::iterator > *iter_t = dynamic_cast<swig::SwigPyIterator_T<std::vector< double >::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector_insert" "', argument " "2"" of type '" "std::vector< double >::iterator""'"); - } - } - ecode3 = SWIG_AsVal_double(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "DoubleVector_insert" "', argument " "3"" of type '" "std::vector< double >::value_type""'"); - } - temp3 = static_cast< std::vector< double >::value_type >(val3); - arg3 = &temp3; - result = 
std_vector_Sl_double_Sg__insert__SWIG_0(arg1,arg2,(double const &)*arg3); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< double >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_insert__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - std::vector< double >::iterator arg2 ; - std::vector< double >::size_type arg3 ; - std::vector< double >::value_type *arg4 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - size_t val3 ; - int ecode3 = 0 ; - std::vector< double >::value_type temp4 ; - double val4 ; - int ecode4 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOOO:DoubleVector_insert",&obj0,&obj1,&obj2,&obj3)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_insert" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector_insert" "', argument " "2"" of type '" "std::vector< double >::iterator""'"); - } else { - swig::SwigPyIterator_T<std::vector< double >::iterator > *iter_t = dynamic_cast<swig::SwigPyIterator_T<std::vector< double >::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector_insert" "', argument " "2"" of type '" "std::vector< double >::iterator""'"); - } - } - ecode3 = SWIG_AsVal_size_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "DoubleVector_insert" "', argument " "3"" of type '" "std::vector< double >::size_type""'"); - } - arg3 = static_cast< std::vector< double >::size_type >(val3); - ecode4 = SWIG_AsVal_double(obj3, &val4); - if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "DoubleVector_insert" "', argument " "4"" of type '" "std::vector< double >::value_type""'"); - } - temp4 = static_cast< std::vector< double >::value_type >(val4); - arg4 = &temp4; - std_vector_Sl_double_Sg__insert__SWIG_1(arg1,arg2,arg3,(double const &)*arg4); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_insert(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[5] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 4) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< double,std::allocator< double > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast<swig::SwigPyIterator_T<std::vector< double >::iterator > *>(iter) != 0)); - if (_v) { - { - int res = SWIG_AsVal_double(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return 
_wrap_DoubleVector_insert__SWIG_0(self, args); - } - } - } - } - if (argc == 4) { - int _v; - int res = swig::asptr(argv[0], (std::vector< double,std::allocator< double > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast<swig::SwigPyIterator_T<std::vector< double >::iterator > *>(iter) != 0)); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_double(argv[3], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_DoubleVector_insert__SWIG_1(self, args); - } - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'DoubleVector_insert'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< double >::insert(std::vector< double >::iterator,std::vector< double >::value_type const &)\n" - " std::vector< double >::insert(std::vector< double >::iterator,std::vector< double >::size_type,std::vector< double >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_reserve(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - std::vector< double >::size_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:DoubleVector_reserve",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_reserve" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "DoubleVector_reserve" "', argument " "2"" of type '" "std::vector< double >::size_type""'"); - } - arg2 = static_cast< std::vector< double >::size_type >(val2); - (arg1)->reserve(arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector_capacity(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< double >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector_capacity",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector_capacity" "', argument " "1"" of type '" "std::vector< double > const *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - result = ((std::vector< double > const *)arg1)->capacity(); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_delete_DoubleVector(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< double > *arg1 = (std::vector< double > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char 
*)"O:delete_DoubleVector",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, SWIG_POINTER_DISOWN | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "delete_DoubleVector" "', argument " "1"" of type '" "std::vector< double > *""'"); - } - arg1 = reinterpret_cast< std::vector< double > * >(argp1); - delete arg1; - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *DoubleVector_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *obj; - if (!PyArg_ParseTuple(args,(char *)"O:swigregister", &obj)) return NULL; - SWIG_TypeNewClientData(SWIGTYPE_p_std__vectorT_double_std__allocatorT_double_t_t, SWIG_NewClientData(obj)); - return SWIG_Py_Void(); -} - -SWIGINTERN PyObject *_wrap_IntVector_iterator(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - PyObject **arg2 = (PyObject **) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - swig::SwigPyIterator *result = 0 ; - - arg2 = &obj0; - if (!PyArg_ParseTuple(args,(char *)"O:IntVector_iterator",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_iterator" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - result = (swig::SwigPyIterator *)std_vector_Sl_int_Sg__iterator(arg1,arg2); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_swig__SwigPyIterator, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector___nonzero__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector___nonzero__",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector___nonzero__" "', argument " "1"" of type '" "std::vector< int > const *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - result = (bool)std_vector_Sl_int_Sg____nonzero__((std::vector< int > const *)arg1); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector___bool__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector___bool__",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector___bool__" "', argument " "1"" of type '" "std::vector< int > const *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - result = (bool)std_vector_Sl_int_Sg____bool__((std::vector< int > const *)arg1); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN 
PyObject *_wrap_IntVector___len__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< int >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector___len__",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector___len__" "', argument " "1"" of type '" "std::vector< int > const *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - result = std_vector_Sl_int_Sg____len__((std::vector< int > const *)arg1); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector___getslice__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - std::vector< int >::difference_type arg2 ; - std::vector< int >::difference_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< int,std::allocator< int > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:IntVector___getslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector___getslice__" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector___getslice__" "', argument " "2"" of type '" "std::vector< int >::difference_type""'"); - } - arg2 = static_cast< std::vector< int >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "IntVector___getslice__" "', argument " "3"" of type '" "std::vector< int >::difference_type""'"); - } - arg3 = static_cast< std::vector< int >::difference_type >(val3); - try { - result = (std::vector< int,std::allocator< int > > *)std_vector_Sl_int_Sg____getslice__(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector___setslice____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - std::vector< int >::difference_type arg2 ; - std::vector< int >::difference_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:IntVector___setslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector___setslice__" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector___setslice__" "', argument " "2"" of type '" "std::vector< int >::difference_type""'"); - } - arg2 = static_cast< std::vector< int >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "IntVector___setslice__" "', argument " "3"" of type '" "std::vector< int >::difference_type""'"); - } - arg3 = static_cast< std::vector< int >::difference_type >(val3); - try { - std_vector_Sl_int_Sg____setslice____SWIG_0(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector___setslice____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - std::vector< int >::difference_type arg2 ; - std::vector< int >::difference_type arg3 ; - std::vector< int,std::allocator< int > > *arg4 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - int res4 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOOO:IntVector___setslice__",&obj0,&obj1,&obj2,&obj3)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector___setslice__" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector___setslice__" "', argument " "2"" of type '" "std::vector< int >::difference_type""'"); - } - arg2 = static_cast< std::vector< int >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "IntVector___setslice__" "', argument " "3"" of type '" "std::vector< int >::difference_type""'"); - } - arg3 = static_cast< std::vector< int >::difference_type >(val3); - { - std::vector< int,std::allocator< int > > *ptr = (std::vector< int,std::allocator< int > > *)0; - res4 = swig::asptr(obj3, &ptr); - if (!SWIG_IsOK(res4)) { - SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "IntVector___setslice__" "', argument " "4"" of type '" "std::vector< int,std::allocator< int > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "IntVector___setslice__" "', argument " "4"" of type '" "std::vector< int,std::allocator< int > > const &""'"); - } - arg4 = ptr; - } - try { - std_vector_Sl_int_Sg____setslice____SWIG_1(arg1,arg2,arg3,(std::vector< int,std::allocator< int > > const &)*arg4); - } - 
catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res4)) delete arg4; - return resultobj; -fail: - if (SWIG_IsNewObj(res4)) delete arg4; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector___setslice__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[5] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 4) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_IntVector___setslice____SWIG_0(self, args); - } - } - } - } - if (argc == 4) { - int _v; - int res = swig::asptr(argv[0], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[3], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_IntVector___setslice____SWIG_1(self, args); - } - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'IntVector___setslice__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< int >::__setslice__(std::vector< int >::difference_type,std::vector< int >::difference_type)\n" - " std::vector< int >::__setslice__(std::vector< int >::difference_type,std::vector< int >::difference_type,std::vector< int,std::allocator< int > > const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_IntVector___delslice__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - std::vector< int >::difference_type arg2 ; - std::vector< int >::difference_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:IntVector___delslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector___delslice__" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector___delslice__" "', argument " "2"" of type '" "std::vector< int >::difference_type""'"); - } - arg2 = static_cast< std::vector< int >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "IntVector___delslice__" "', argument " "3"" of type '" "std::vector< int >::difference_type""'"); - } - arg3 = 
static_cast< std::vector< int >::difference_type >(val3); - try { - std_vector_Sl_int_Sg____delslice__(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector___delitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - std::vector< int >::difference_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntVector___delitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector___delitem__" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector___delitem__" "', argument " "2"" of type '" "std::vector< int >::difference_type""'"); - } - arg2 = static_cast< std::vector< int >::difference_type >(val2); - try { - std_vector_Sl_int_Sg____delitem____SWIG_0(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector___getitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< int,std::allocator< int > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntVector___getitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector___getitem__" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector___getitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - result = (std::vector< int,std::allocator< int > > *)std_vector_Sl_int_Sg____getitem____SWIG_0(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector___setitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - PySliceObject *arg2 = (PySliceObject 
*) 0 ; - std::vector< int,std::allocator< int > > *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:IntVector___setitem__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector___setitem__" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector___setitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - { - std::vector< int,std::allocator< int > > *ptr = (std::vector< int,std::allocator< int > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "IntVector___setitem__" "', argument " "3"" of type '" "std::vector< int,std::allocator< int > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "IntVector___setitem__" "', argument " "3"" of type '" "std::vector< int,std::allocator< int > > const &""'"); - } - arg3 = ptr; - } - try { - std_vector_Sl_int_Sg____setitem____SWIG_0(arg1,arg2,(std::vector< int,std::allocator< int > > const &)*arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector___setitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntVector___setitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector___setitem__" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector___setitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - std_vector_Sl_int_Sg____setitem____SWIG_1(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector___delitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char 
*)"OO:IntVector___delitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector___delitem__" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector___delitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - std_vector_Sl_int_Sg____delitem____SWIG_1(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector___delitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_IntVector___delitem____SWIG_1(self, args); - } - } - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_IntVector___delitem____SWIG_0(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'IntVector___delitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< int >::__delitem__(std::vector< int >::difference_type)\n" - " std::vector< int >::__delitem__(PySliceObject *)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_IntVector___getitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - std::vector< int >::difference_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< int >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntVector___getitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector___getitem__" "', argument " "1"" of type '" "std::vector< int > const *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector___getitem__" "', argument " "2"" of type '" "std::vector< int >::difference_type""'"); - } - arg2 = static_cast< std::vector< int >::difference_type >(val2); - try { - result = (std::vector< int >::value_type *) &std_vector_Sl_int_Sg____getitem____SWIG_1((std::vector< int > const *)arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - 
resultobj = SWIG_From_int(static_cast< int >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector___getitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_IntVector___getitem____SWIG_0(self, args); - } - } - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_IntVector___getitem____SWIG_1(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'IntVector___getitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< int >::__getitem__(PySliceObject *)\n" - " std::vector< int >::__getitem__(std::vector< int >::difference_type) const\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_IntVector___setitem____SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - std::vector< int >::difference_type arg2 ; - std::vector< int >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - std::vector< int >::value_type temp3 ; - int val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:IntVector___setitem__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector___setitem__" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector___setitem__" "', argument " "2"" of type '" "std::vector< int >::difference_type""'"); - } - arg2 = static_cast< std::vector< int >::difference_type >(val2); - ecode3 = SWIG_AsVal_int(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "IntVector___setitem__" "', argument " "3"" of type '" "std::vector< int >::value_type""'"); - } - temp3 = static_cast< std::vector< int >::value_type >(val3); - arg3 = &temp3; - try { - std_vector_Sl_int_Sg____setitem____SWIG_2(arg1,arg2,(int const &)*arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector___setitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< 
int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_IntVector___setitem____SWIG_1(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - int res = swig::asptr(argv[2], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_IntVector___setitem____SWIG_0(self, args); - } - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_int(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_IntVector___setitem____SWIG_2(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'IntVector___setitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< int >::__setitem__(PySliceObject *,std::vector< int,std::allocator< int > > const &)\n" - " std::vector< int >::__setitem__(PySliceObject *)\n" - " std::vector< int >::__setitem__(std::vector< int >::difference_type,std::vector< int >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_IntVector_pop(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< int >::value_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector_pop",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_pop" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - try { - result = (std::vector< int >::value_type)std_vector_Sl_int_Sg__pop(arg1); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = SWIG_From_int(static_cast< int >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_append(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - std::vector< int >::value_type *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - std::vector< int >::value_type temp2 ; - int val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntVector_append",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_append" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - ecode2 = SWIG_AsVal_int(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector_append" "', argument " "2"" of type '" "std::vector< int >::value_type""'"); - } - temp2 = static_cast< std::vector< int >::value_type >(val2); - arg2 = &temp2; - 
std_vector_Sl_int_Sg__append(arg1,(int const &)*arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_IntVector__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)":new_IntVector")) SWIG_fail; - result = (std::vector< int > *)new std::vector< int >(); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_IntVector__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = 0 ; - int res1 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - std::vector< int > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_IntVector",&obj0)) SWIG_fail; - { - std::vector< int,std::allocator< int > > *ptr = (std::vector< int,std::allocator< int > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "new_IntVector" "', argument " "1"" of type '" "std::vector< int > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "new_IntVector" "', argument " "1"" of type '" "std::vector< int > const &""'"); - } - arg1 = ptr; - } - result = (std::vector< int > *)new std::vector< int >((std::vector< int > const &)*arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, SWIG_POINTER_NEW | 0 ); - if (SWIG_IsNewObj(res1)) delete arg1; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_empty(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector_empty",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_empty" "', argument " "1"" of type '" "std::vector< int > const *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - result = (bool)((std::vector< int > const *)arg1)->empty(); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_size(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< int >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector_size",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_size" "', argument " "1"" of type '" "std::vector< int > const *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - result = ((std::vector< int > const *)arg1)->size(); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_swap(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { 
- PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - std::vector< int > *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntVector_swap",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_swap" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, &argp2, SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 ); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "IntVector_swap" "', argument " "2"" of type '" "std::vector< int > &""'"); - } - if (!argp2) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "IntVector_swap" "', argument " "2"" of type '" "std::vector< int > &""'"); - } - arg2 = reinterpret_cast< std::vector< int > * >(argp2); - (arg1)->swap(*arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_begin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< int >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector_begin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_begin" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - result = (arg1)->begin(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< int >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_end(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< int >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector_end",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_end" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - result = (arg1)->end(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< int >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_rbegin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< int >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector_rbegin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 
0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_rbegin" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - result = (arg1)->rbegin(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< int >::reverse_iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_rend(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< int >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector_rend",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_rend" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - result = (arg1)->rend(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< int >::reverse_iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_clear(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector_clear",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_clear" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - (arg1)->clear(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_get_allocator(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - SwigValueWrapper< std::allocator< int > > result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector_get_allocator",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_get_allocator" "', argument " "1"" of type '" "std::vector< int > const *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - result = ((std::vector< int > const *)arg1)->get_allocator(); - resultobj = SWIG_NewPointerObj((new std::vector< int >::allocator_type(static_cast< const std::vector< int >::allocator_type& >(result))), SWIGTYPE_p_std__allocatorT_int_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_IntVector__SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int >::size_type arg1 ; - size_t val1 ; - int ecode1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< int > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_IntVector",&obj0)) SWIG_fail; - ecode1 = 
SWIG_AsVal_size_t(obj0, &val1); - if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_IntVector" "', argument " "1"" of type '" "std::vector< int >::size_type""'"); - } - arg1 = static_cast< std::vector< int >::size_type >(val1); - result = (std::vector< int > *)new std::vector< int >(arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_pop_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector_pop_back",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_pop_back" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - (arg1)->pop_back(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_resize__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - std::vector< int >::size_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntVector_resize",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_resize" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector_resize" "', argument " "2"" of type '" "std::vector< int >::size_type""'"); - } - arg2 = static_cast< std::vector< int >::size_type >(val2); - (arg1)->resize(arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_erase__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - std::vector< int >::iterator arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< int >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntVector_erase",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_erase" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector_erase" "', argument " "2"" of type '" "std::vector< int >::iterator""'"); - } else { - swig::SwigPyIterator_T::iterator > *iter_t = 
dynamic_cast::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector_erase" "', argument " "2"" of type '" "std::vector< int >::iterator""'"); - } - } - result = std_vector_Sl_int_Sg__erase__SWIG_0(arg1,arg2); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< int >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_erase__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - std::vector< int >::iterator arg2 ; - std::vector< int >::iterator arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - swig::SwigPyIterator *iter3 = 0 ; - int res3 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< int >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OOO:IntVector_erase",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_erase" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector_erase" "', argument " "2"" of type '" "std::vector< int >::iterator""'"); - } else { - swig::SwigPyIterator_T::iterator > *iter_t = dynamic_cast::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector_erase" "', argument " "2"" of type '" "std::vector< int >::iterator""'"); - } - } - res3 = SWIG_ConvertPtr(obj2, SWIG_as_voidptrptr(&iter3), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res3) || !iter3) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector_erase" "', argument " "3"" of type '" "std::vector< int >::iterator""'"); - } else { - swig::SwigPyIterator_T::iterator > *iter_t = dynamic_cast::iterator > *>(iter3); - if (iter_t) { - arg3 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector_erase" "', argument " "3"" of type '" "std::vector< int >::iterator""'"); - } - } - result = std_vector_Sl_int_Sg__erase__SWIG_1(arg1,arg2,arg3); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< int >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_erase(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 
0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast::iterator > *>(iter) != 0)); - if (_v) { - return _wrap_IntVector_erase__SWIG_0(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast::iterator > *>(iter) != 0)); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[2], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast::iterator > *>(iter) != 0)); - if (_v) { - return _wrap_IntVector_erase__SWIG_1(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'IntVector_erase'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< int >::erase(std::vector< int >::iterator)\n" - " std::vector< int >::erase(std::vector< int >::iterator,std::vector< int >::iterator)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_new_IntVector__SWIG_3(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int >::size_type arg1 ; - std::vector< int >::value_type *arg2 = 0 ; - size_t val1 ; - int ecode1 = 0 ; - std::vector< int >::value_type temp2 ; - int val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< int > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:new_IntVector",&obj0,&obj1)) SWIG_fail; - ecode1 = SWIG_AsVal_size_t(obj0, &val1); - if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_IntVector" "', argument " "1"" of type '" "std::vector< int >::size_type""'"); - } - arg1 = static_cast< std::vector< int >::size_type >(val1); - ecode2 = SWIG_AsVal_int(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "new_IntVector" "', argument " "2"" of type '" "std::vector< int >::value_type""'"); - } - temp2 = static_cast< std::vector< int >::value_type >(val2); - arg2 = &temp2; - result = (std::vector< int > *)new std::vector< int >(arg1,(std::vector< int >::value_type const &)*arg2); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_IntVector(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 0) { - return _wrap_new_IntVector__SWIG_0(self, args); - } - if (argc == 1) { - int _v; - { - int res = SWIG_AsVal_size_t(argv[0], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_new_IntVector__SWIG_2(self, args); - } - } - if (argc == 1) { - int _v; - int res = swig::asptr(argv[0], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_new_IntVector__SWIG_1(self, args); - } - } - if (argc == 2) { - int _v; - { - int res = SWIG_AsVal_size_t(argv[0], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_int(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return 
_wrap_new_IntVector__SWIG_3(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'new_IntVector'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< int >::vector()\n" - " std::vector< int >::vector(std::vector< int > const &)\n" - " std::vector< int >::vector(std::vector< int >::size_type)\n" - " std::vector< int >::vector(std::vector< int >::size_type,std::vector< int >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_IntVector_push_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - std::vector< int >::value_type *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - std::vector< int >::value_type temp2 ; - int val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntVector_push_back",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_push_back" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - ecode2 = SWIG_AsVal_int(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector_push_back" "', argument " "2"" of type '" "std::vector< int >::value_type""'"); - } - temp2 = static_cast< std::vector< int >::value_type >(val2); - arg2 = &temp2; - (arg1)->push_back((std::vector< int >::value_type const &)*arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_front(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< int >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector_front",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_front" "', argument " "1"" of type '" "std::vector< int > const *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - result = (std::vector< int >::value_type *) &((std::vector< int > const *)arg1)->front(); - resultobj = SWIG_From_int(static_cast< int >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< int >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector_back",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_back" "', argument " "1"" of type '" "std::vector< int > const *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - result = (std::vector< int >::value_type *) &((std::vector< int > const *)arg1)->back(); - resultobj = SWIG_From_int(static_cast< int >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN 
PyObject *_wrap_IntVector_assign(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - std::vector< int >::size_type arg2 ; - std::vector< int >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - std::vector< int >::value_type temp3 ; - int val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:IntVector_assign",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_assign" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector_assign" "', argument " "2"" of type '" "std::vector< int >::size_type""'"); - } - arg2 = static_cast< std::vector< int >::size_type >(val2); - ecode3 = SWIG_AsVal_int(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "IntVector_assign" "', argument " "3"" of type '" "std::vector< int >::value_type""'"); - } - temp3 = static_cast< std::vector< int >::value_type >(val3); - arg3 = &temp3; - (arg1)->assign(arg2,(std::vector< int >::value_type const &)*arg3); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_resize__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - std::vector< int >::size_type arg2 ; - std::vector< int >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - std::vector< int >::value_type temp3 ; - int val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:IntVector_resize",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_resize" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector_resize" "', argument " "2"" of type '" "std::vector< int >::size_type""'"); - } - arg2 = static_cast< std::vector< int >::size_type >(val2); - ecode3 = SWIG_AsVal_int(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "IntVector_resize" "', argument " "3"" of type '" "std::vector< int >::value_type""'"); - } - temp3 = static_cast< std::vector< int >::value_type >(val3); - arg3 = &temp3; - (arg1)->resize(arg2,(std::vector< int >::value_type const &)*arg3); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_resize(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); 
- } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_IntVector_resize__SWIG_0(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_int(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_IntVector_resize__SWIG_1(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'IntVector_resize'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< int >::resize(std::vector< int >::size_type)\n" - " std::vector< int >::resize(std::vector< int >::size_type,std::vector< int >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_IntVector_insert__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - std::vector< int >::iterator arg2 ; - std::vector< int >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - std::vector< int >::value_type temp3 ; - int val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< int >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OOO:IntVector_insert",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_insert" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector_insert" "', argument " "2"" of type '" "std::vector< int >::iterator""'"); - } else { - swig::SwigPyIterator_T<std::vector< int >::iterator > *iter_t = dynamic_cast<swig::SwigPyIterator_T<std::vector< int >::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector_insert" "', argument " "2"" of type '" "std::vector< int >::iterator""'"); - } - } - ecode3 = SWIG_AsVal_int(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "IntVector_insert" "', argument " "3"" of type '" "std::vector< int >::value_type""'"); - } - temp3 = static_cast< std::vector< int >::value_type >(val3); - arg3 = &temp3; - result = std_vector_Sl_int_Sg__insert__SWIG_0(arg1,arg2,(int const &)*arg3); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< int >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_insert__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - std::vector< int >::iterator arg2 ; - std::vector< int >::size_type arg3 ; - std::vector< int >::value_type *arg4 
= 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - size_t val3 ; - int ecode3 = 0 ; - std::vector< int >::value_type temp4 ; - int val4 ; - int ecode4 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOOO:IntVector_insert",&obj0,&obj1,&obj2,&obj3)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_insert" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector_insert" "', argument " "2"" of type '" "std::vector< int >::iterator""'"); - } else { - swig::SwigPyIterator_T<std::vector< int >::iterator > *iter_t = dynamic_cast<swig::SwigPyIterator_T<std::vector< int >::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector_insert" "', argument " "2"" of type '" "std::vector< int >::iterator""'"); - } - } - ecode3 = SWIG_AsVal_size_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "IntVector_insert" "', argument " "3"" of type '" "std::vector< int >::size_type""'"); - } - arg3 = static_cast< std::vector< int >::size_type >(val3); - ecode4 = SWIG_AsVal_int(obj3, &val4); - if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "IntVector_insert" "', argument " "4"" of type '" "std::vector< int >::value_type""'"); - } - temp4 = static_cast< std::vector< int >::value_type >(val4); - arg4 = &temp4; - std_vector_Sl_int_Sg__insert__SWIG_1(arg1,arg2,arg3,(int const &)*arg4); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_insert(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[5] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 4) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast<swig::SwigPyIterator_T<std::vector< int >::iterator > *>(iter) != 0)); - if (_v) { - { - int res = SWIG_AsVal_int(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_IntVector_insert__SWIG_0(self, args); - } - } - } - } - if (argc == 4) { - int _v; - int res = swig::asptr(argv[0], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast<swig::SwigPyIterator_T<std::vector< int >::iterator > *>(iter) != 0)); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_int(argv[3], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_IntVector_insert__SWIG_1(self, args); - } - } - } - } - } - 
-fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'IntVector_insert'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< int >::insert(std::vector< int >::iterator,std::vector< int >::value_type const &)\n" - " std::vector< int >::insert(std::vector< int >::iterator,std::vector< int >::size_type,std::vector< int >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_IntVector_reserve(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - std::vector< int >::size_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntVector_reserve",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_reserve" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector_reserve" "', argument " "2"" of type '" "std::vector< int >::size_type""'"); - } - arg2 = static_cast< std::vector< int >::size_type >(val2); - (arg1)->reserve(arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector_capacity(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< int >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector_capacity",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector_capacity" "', argument " "1"" of type '" "std::vector< int > const *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - result = ((std::vector< int > const *)arg1)->capacity(); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_delete_IntVector(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int > *arg1 = (std::vector< int > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:delete_IntVector",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, SWIG_POINTER_DISOWN | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "delete_IntVector" "', argument " "1"" of type '" "std::vector< int > *""'"); - } - arg1 = reinterpret_cast< std::vector< int > * >(argp1); - delete arg1; - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *IntVector_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *obj; - if (!PyArg_ParseTuple(args,(char *)"O:swigregister", &obj)) return NULL; - SWIG_TypeNewClientData(SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, SWIG_NewClientData(obj)); - return SWIG_Py_Void(); -} - -SWIGINTERN PyObject 
*_wrap_StringVector_iterator(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - PyObject **arg2 = (PyObject **) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - swig::SwigPyIterator *result = 0 ; - - arg2 = &obj0; - if (!PyArg_ParseTuple(args,(char *)"O:StringVector_iterator",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_iterator" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - result = (swig::SwigPyIterator *)std_vector_Sl_std_string_Sg__iterator(arg1,arg2); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_swig__SwigPyIterator, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector___nonzero__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:StringVector___nonzero__",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector___nonzero__" "', argument " "1"" of type '" "std::vector< std::string > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - result = (bool)std_vector_Sl_std_string_Sg____nonzero__((std::vector< std::string > const *)arg1); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector___bool__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:StringVector___bool__",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector___bool__" "', argument " "1"" of type '" "std::vector< std::string > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - result = (bool)std_vector_Sl_std_string_Sg____bool__((std::vector< std::string > const *)arg1); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector___len__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::string >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:StringVector___len__",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector___len__" "', argument " "1"" of type '" "std::vector< 
std::string > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - result = std_vector_Sl_std_string_Sg____len__((std::vector< std::string > const *)arg1); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector___getslice__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - std::vector< std::string >::difference_type arg2 ; - std::vector< std::string >::difference_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< std::string,std::allocator< std::string > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:StringVector___getslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector___getslice__" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "StringVector___getslice__" "', argument " "2"" of type '" "std::vector< std::string >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::string >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "StringVector___getslice__" "', argument " "3"" of type '" "std::vector< std::string >::difference_type""'"); - } - arg3 = static_cast< std::vector< std::string >::difference_type >(val3); - try { - result = (std::vector< std::string,std::allocator< std::string > > *)std_vector_Sl_std_string_Sg____getslice__(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector___setslice____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - std::vector< std::string >::difference_type arg2 ; - std::vector< std::string >::difference_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:StringVector___setslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector___setslice__" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - 
SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "StringVector___setslice__" "', argument " "2"" of type '" "std::vector< std::string >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::string >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "StringVector___setslice__" "', argument " "3"" of type '" "std::vector< std::string >::difference_type""'"); - } - arg3 = static_cast< std::vector< std::string >::difference_type >(val3); - try { - std_vector_Sl_std_string_Sg____setslice____SWIG_0(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector___setslice____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - std::vector< std::string >::difference_type arg2 ; - std::vector< std::string >::difference_type arg3 ; - std::vector< std::string,std::allocator< std::string > > *arg4 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - int res4 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOOO:StringVector___setslice__",&obj0,&obj1,&obj2,&obj3)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector___setslice__" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "StringVector___setslice__" "', argument " "2"" of type '" "std::vector< std::string >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::string >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "StringVector___setslice__" "', argument " "3"" of type '" "std::vector< std::string >::difference_type""'"); - } - arg3 = static_cast< std::vector< std::string >::difference_type >(val3); - { - std::vector< std::string,std::allocator< std::string > > *ptr = (std::vector< std::string,std::allocator< std::string > > *)0; - res4 = swig::asptr(obj3, &ptr); - if (!SWIG_IsOK(res4)) { - SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "StringVector___setslice__" "', argument " "4"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "StringVector___setslice__" "', argument " "4"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - arg4 = ptr; - } - try { - std_vector_Sl_std_string_Sg____setslice____SWIG_1(arg1,arg2,arg3,(std::vector< std::string,std::allocator< std::string > > const &)*arg4); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - 
catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res4)) delete arg4; - return resultobj; -fail: - if (SWIG_IsNewObj(res4)) delete arg4; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector___setslice__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[5] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 4) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_StringVector___setslice____SWIG_0(self, args); - } - } - } - } - if (argc == 4) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[3], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_StringVector___setslice____SWIG_1(self, args); - } - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'StringVector___setslice__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::string >::__setslice__(std::vector< std::string >::difference_type,std::vector< std::string >::difference_type)\n" - " std::vector< std::string >::__setslice__(std::vector< std::string >::difference_type,std::vector< std::string >::difference_type,std::vector< std::string,std::allocator< std::string > > const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_StringVector___delslice__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - std::vector< std::string >::difference_type arg2 ; - std::vector< std::string >::difference_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:StringVector___delslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector___delslice__" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "StringVector___delslice__" "', argument " "2"" of type '" "std::vector< std::string >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::string >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in 
method '" "StringVector___delslice__" "', argument " "3"" of type '" "std::vector< std::string >::difference_type""'"); - } - arg3 = static_cast< std::vector< std::string >::difference_type >(val3); - try { - std_vector_Sl_std_string_Sg____delslice__(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector___delitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - std::vector< std::string >::difference_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:StringVector___delitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector___delitem__" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "StringVector___delitem__" "', argument " "2"" of type '" "std::vector< std::string >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::string >::difference_type >(val2); - try { - std_vector_Sl_std_string_Sg____delitem____SWIG_0(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector___getitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< std::string,std::allocator< std::string > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:StringVector___getitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector___getitem__" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "StringVector___getitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - result = (std::vector< std::string,std::allocator< std::string > > *)std_vector_Sl_std_string_Sg____getitem____SWIG_0(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), 
SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector___setitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - std::vector< std::string,std::allocator< std::string > > *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:StringVector___setitem__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector___setitem__" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "StringVector___setitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - { - std::vector< std::string,std::allocator< std::string > > *ptr = (std::vector< std::string,std::allocator< std::string > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "StringVector___setitem__" "', argument " "3"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "StringVector___setitem__" "', argument " "3"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - arg3 = ptr; - } - try { - std_vector_Sl_std_string_Sg____setitem____SWIG_0(arg1,arg2,(std::vector< std::string,std::allocator< std::string > > const &)*arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector___setitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:StringVector___setitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector___setitem__" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "StringVector___setitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - std_vector_Sl_std_string_Sg____setitem____SWIG_1(arg1,arg2); - } - catch(std::out_of_range &_e) { - 
SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector___delitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:StringVector___delitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector___delitem__" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "StringVector___delitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - std_vector_Sl_std_string_Sg____delitem____SWIG_1(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector___delitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_StringVector___delitem____SWIG_1(self, args); - } - } - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_StringVector___delitem____SWIG_0(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'StringVector___delitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::string >::__delitem__(std::vector< std::string >::difference_type)\n" - " std::vector< std::string >::__delitem__(PySliceObject *)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_StringVector___getitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - std::vector< std::string >::difference_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< std::string >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:StringVector___getitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 
0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector___getitem__" "', argument " "1"" of type '" "std::vector< std::string > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "StringVector___getitem__" "', argument " "2"" of type '" "std::vector< std::string >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::string >::difference_type >(val2); - try { - result = (std::vector< std::string >::value_type *) &std_vector_Sl_std_string_Sg____getitem____SWIG_1((std::vector< std::string > const *)arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = SWIG_From_std_string(static_cast< std::string >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector___getitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_StringVector___getitem____SWIG_0(self, args); - } - } - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_StringVector___getitem____SWIG_1(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'StringVector___getitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::string >::__getitem__(PySliceObject *)\n" - " std::vector< std::string >::__getitem__(std::vector< std::string >::difference_type) const\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_StringVector___setitem____SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - std::vector< std::string >::difference_type arg2 ; - std::vector< std::string >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:StringVector___setitem__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector___setitem__" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "StringVector___setitem__" "', argument " "2"" of type '" "std::vector< std::string >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::string >::difference_type 
>(val2); - { - std::string *ptr = (std::string *)0; - res3 = SWIG_AsPtr_std_string(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "StringVector___setitem__" "', argument " "3"" of type '" "std::vector< std::string >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "StringVector___setitem__" "', argument " "3"" of type '" "std::vector< std::string >::value_type const &""'"); - } - arg3 = ptr; - } - try { - std_vector_Sl_std_string_Sg____setitem____SWIG_2(arg1,arg2,(std::string const &)*arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector___setitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_StringVector___setitem____SWIG_1(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - int res = swig::asptr(argv[2], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_StringVector___setitem____SWIG_0(self, args); - } - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = SWIG_AsPtr_std_string(argv[2], (std::string**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_StringVector___setitem____SWIG_2(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'StringVector___setitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::string >::__setitem__(PySliceObject *,std::vector< std::string,std::allocator< std::string > > const &)\n" - " std::vector< std::string >::__setitem__(PySliceObject *)\n" - " std::vector< std::string >::__setitem__(std::vector< std::string >::difference_type,std::vector< std::string >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_StringVector_pop(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::string >::value_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:StringVector_pop",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_pop" "', argument " "1"" of type '" 
"std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - try { - result = std_vector_Sl_std_string_Sg__pop(arg1); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = SWIG_From_std_string(static_cast< std::string >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_append(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - std::vector< std::string >::value_type *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res2 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:StringVector_append",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_append" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - { - std::string *ptr = (std::string *)0; - res2 = SWIG_AsPtr_std_string(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "StringVector_append" "', argument " "2"" of type '" "std::vector< std::string >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "StringVector_append" "', argument " "2"" of type '" "std::vector< std::string >::value_type const &""'"); - } - arg2 = ptr; - } - std_vector_Sl_std_string_Sg__append(arg1,(std::string const &)*arg2); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_StringVector__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)":new_StringVector")) SWIG_fail; - result = (std::vector< std::string > *)new std::vector< std::string >(); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_StringVector__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = 0 ; - int res1 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - std::vector< std::string > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_StringVector",&obj0)) SWIG_fail; - { - std::vector< std::string,std::allocator< std::string > > *ptr = (std::vector< std::string,std::allocator< std::string > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "new_StringVector" "', argument " "1"" of type '" "std::vector< std::string > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "new_StringVector" "', argument " "1"" of type '" "std::vector< std::string > const &""'"); - } - arg1 = ptr; - } - result = (std::vector< std::string > *)new std::vector< std::string >((std::vector< std::string > const &)*arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), 
SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, SWIG_POINTER_NEW | 0 ); - if (SWIG_IsNewObj(res1)) delete arg1; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_empty(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:StringVector_empty",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_empty" "', argument " "1"" of type '" "std::vector< std::string > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - result = (bool)((std::vector< std::string > const *)arg1)->empty(); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_size(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::string >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:StringVector_size",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_size" "', argument " "1"" of type '" "std::vector< std::string > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - result = ((std::vector< std::string > const *)arg1)->size(); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_swap(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - std::vector< std::string > *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:StringVector_swap",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_swap" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, &argp2, SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 ); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "StringVector_swap" "', argument " "2"" of type '" "std::vector< std::string > &""'"); - } - if (!argp2) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "StringVector_swap" "', argument " "2"" of type '" "std::vector< std::string > &""'"); - } - arg2 = reinterpret_cast< std::vector< std::string > * >(argp2); - (arg1)->swap(*arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_begin(PyObject 
*SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::string >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:StringVector_begin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_begin" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - result = (arg1)->begin(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::string >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_end(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::string >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:StringVector_end",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_end" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - result = (arg1)->end(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::string >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_rbegin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::string >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:StringVector_rbegin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_rbegin" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - result = (arg1)->rbegin(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::string >::reverse_iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_rend(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::string >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:StringVector_rend",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in 
method '" "StringVector_rend" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - result = (arg1)->rend(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::string >::reverse_iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_clear(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:StringVector_clear",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_clear" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - (arg1)->clear(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_get_allocator(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - SwigValueWrapper< std::allocator< std::string > > result; - - if (!PyArg_ParseTuple(args,(char *)"O:StringVector_get_allocator",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_get_allocator" "', argument " "1"" of type '" "std::vector< std::string > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - result = ((std::vector< std::string > const *)arg1)->get_allocator(); - resultobj = SWIG_NewPointerObj((new std::vector< std::string >::allocator_type(static_cast< const std::vector< std::string >::allocator_type& >(result))), SWIGTYPE_p_std__allocatorT_std__string_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_StringVector__SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string >::size_type arg1 ; - size_t val1 ; - int ecode1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::string > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_StringVector",&obj0)) SWIG_fail; - ecode1 = SWIG_AsVal_size_t(obj0, &val1); - if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_StringVector" "', argument " "1"" of type '" "std::vector< std::string >::size_type""'"); - } - arg1 = static_cast< std::vector< std::string >::size_type >(val1); - result = (std::vector< std::string > *)new std::vector< std::string >(arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_pop_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; 
- - if (!PyArg_ParseTuple(args,(char *)"O:StringVector_pop_back",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_pop_back" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - (arg1)->pop_back(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_resize__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - std::vector< std::string >::size_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:StringVector_resize",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_resize" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "StringVector_resize" "', argument " "2"" of type '" "std::vector< std::string >::size_type""'"); - } - arg2 = static_cast< std::vector< std::string >::size_type >(val2); - (arg1)->resize(arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_erase__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - std::vector< std::string >::iterator arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< std::string >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OO:StringVector_erase",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_erase" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "StringVector_erase" "', argument " "2"" of type '" "std::vector< std::string >::iterator""'"); - } else { - swig::SwigPyIterator_T::iterator > *iter_t = dynamic_cast::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "StringVector_erase" "', argument " "2"" of type '" "std::vector< std::string >::iterator""'"); - } - } - result = std_vector_Sl_std_string_Sg__erase__SWIG_0(arg1,arg2); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::string >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return 
NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_erase__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - std::vector< std::string >::iterator arg2 ; - std::vector< std::string >::iterator arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - swig::SwigPyIterator *iter3 = 0 ; - int res3 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< std::string >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OOO:StringVector_erase",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_erase" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "StringVector_erase" "', argument " "2"" of type '" "std::vector< std::string >::iterator""'"); - } else { - swig::SwigPyIterator_T::iterator > *iter_t = dynamic_cast::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "StringVector_erase" "', argument " "2"" of type '" "std::vector< std::string >::iterator""'"); - } - } - res3 = SWIG_ConvertPtr(obj2, SWIG_as_voidptrptr(&iter3), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res3) || !iter3) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "StringVector_erase" "', argument " "3"" of type '" "std::vector< std::string >::iterator""'"); - } else { - swig::SwigPyIterator_T::iterator > *iter_t = dynamic_cast::iterator > *>(iter3); - if (iter_t) { - arg3 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "StringVector_erase" "', argument " "3"" of type '" "std::vector< std::string >::iterator""'"); - } - } - result = std_vector_Sl_std_string_Sg__erase__SWIG_1(arg1,arg2,arg3); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::string >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_erase(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast::iterator > *>(iter) != 0)); - if (_v) { - return _wrap_StringVector_erase__SWIG_0(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int 
res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast::iterator > *>(iter) != 0)); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[2], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast::iterator > *>(iter) != 0)); - if (_v) { - return _wrap_StringVector_erase__SWIG_1(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'StringVector_erase'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::string >::erase(std::vector< std::string >::iterator)\n" - " std::vector< std::string >::erase(std::vector< std::string >::iterator,std::vector< std::string >::iterator)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_new_StringVector__SWIG_3(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string >::size_type arg1 ; - std::vector< std::string >::value_type *arg2 = 0 ; - size_t val1 ; - int ecode1 = 0 ; - int res2 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< std::string > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:new_StringVector",&obj0,&obj1)) SWIG_fail; - ecode1 = SWIG_AsVal_size_t(obj0, &val1); - if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_StringVector" "', argument " "1"" of type '" "std::vector< std::string >::size_type""'"); - } - arg1 = static_cast< std::vector< std::string >::size_type >(val1); - { - std::string *ptr = (std::string *)0; - res2 = SWIG_AsPtr_std_string(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "new_StringVector" "', argument " "2"" of type '" "std::vector< std::string >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "new_StringVector" "', argument " "2"" of type '" "std::vector< std::string >::value_type const &""'"); - } - arg2 = ptr; - } - result = (std::vector< std::string > *)new std::vector< std::string >(arg1,(std::vector< std::string >::value_type const &)*arg2); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, SWIG_POINTER_NEW | 0 ); - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_StringVector(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 0) { - return _wrap_new_StringVector__SWIG_0(self, args); - } - if (argc == 1) { - int _v; - { - int res = SWIG_AsVal_size_t(argv[0], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_new_StringVector__SWIG_2(self, args); - } - } - if (argc == 1) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_new_StringVector__SWIG_1(self, args); - } - } - if (argc == 2) { - int _v; - { - int res = SWIG_AsVal_size_t(argv[0], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = SWIG_AsPtr_std_string(argv[1], (std::string**)(0)); - _v = 
SWIG_CheckState(res); - if (_v) { - return _wrap_new_StringVector__SWIG_3(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'new_StringVector'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::string >::vector()\n" - " std::vector< std::string >::vector(std::vector< std::string > const &)\n" - " std::vector< std::string >::vector(std::vector< std::string >::size_type)\n" - " std::vector< std::string >::vector(std::vector< std::string >::size_type,std::vector< std::string >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_StringVector_push_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - std::vector< std::string >::value_type *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res2 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:StringVector_push_back",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_push_back" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - { - std::string *ptr = (std::string *)0; - res2 = SWIG_AsPtr_std_string(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "StringVector_push_back" "', argument " "2"" of type '" "std::vector< std::string >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "StringVector_push_back" "', argument " "2"" of type '" "std::vector< std::string >::value_type const &""'"); - } - arg2 = ptr; - } - (arg1)->push_back((std::vector< std::string >::value_type const &)*arg2); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_front(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::string >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:StringVector_front",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_front" "', argument " "1"" of type '" "std::vector< std::string > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - result = (std::vector< std::string >::value_type *) &((std::vector< std::string > const *)arg1)->front(); - resultobj = SWIG_From_std_string(static_cast< std::string >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::string >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:StringVector_back",&obj0)) 
SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_back" "', argument " "1"" of type '" "std::vector< std::string > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - result = (std::vector< std::string >::value_type *) &((std::vector< std::string > const *)arg1)->back(); - resultobj = SWIG_From_std_string(static_cast< std::string >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_assign(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - std::vector< std::string >::size_type arg2 ; - std::vector< std::string >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:StringVector_assign",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_assign" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "StringVector_assign" "', argument " "2"" of type '" "std::vector< std::string >::size_type""'"); - } - arg2 = static_cast< std::vector< std::string >::size_type >(val2); - { - std::string *ptr = (std::string *)0; - res3 = SWIG_AsPtr_std_string(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "StringVector_assign" "', argument " "3"" of type '" "std::vector< std::string >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "StringVector_assign" "', argument " "3"" of type '" "std::vector< std::string >::value_type const &""'"); - } - arg3 = ptr; - } - (arg1)->assign(arg2,(std::vector< std::string >::value_type const &)*arg3); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_resize__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - std::vector< std::string >::size_type arg2 ; - std::vector< std::string >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:StringVector_resize",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_resize" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if 
(!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "StringVector_resize" "', argument " "2"" of type '" "std::vector< std::string >::size_type""'"); - } - arg2 = static_cast< std::vector< std::string >::size_type >(val2); - { - std::string *ptr = (std::string *)0; - res3 = SWIG_AsPtr_std_string(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "StringVector_resize" "', argument " "3"" of type '" "std::vector< std::string >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "StringVector_resize" "', argument " "3"" of type '" "std::vector< std::string >::value_type const &""'"); - } - arg3 = ptr; - } - (arg1)->resize(arg2,(std::vector< std::string >::value_type const &)*arg3); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_resize(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_StringVector_resize__SWIG_0(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = SWIG_AsPtr_std_string(argv[2], (std::string**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_StringVector_resize__SWIG_1(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'StringVector_resize'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::string >::resize(std::vector< std::string >::size_type)\n" - " std::vector< std::string >::resize(std::vector< std::string >::size_type,std::vector< std::string >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_StringVector_insert__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - std::vector< std::string >::iterator arg2 ; - std::vector< std::string >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< std::string >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OOO:StringVector_insert",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_insert" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), 
swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "StringVector_insert" "', argument " "2"" of type '" "std::vector< std::string >::iterator""'"); - } else { - swig::SwigPyIterator_T::iterator > *iter_t = dynamic_cast::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "StringVector_insert" "', argument " "2"" of type '" "std::vector< std::string >::iterator""'"); - } - } - { - std::string *ptr = (std::string *)0; - res3 = SWIG_AsPtr_std_string(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "StringVector_insert" "', argument " "3"" of type '" "std::vector< std::string >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "StringVector_insert" "', argument " "3"" of type '" "std::vector< std::string >::value_type const &""'"); - } - arg3 = ptr; - } - result = std_vector_Sl_std_string_Sg__insert__SWIG_0(arg1,arg2,(std::string const &)*arg3); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::string >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_insert__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - std::vector< std::string >::iterator arg2 ; - std::vector< std::string >::size_type arg3 ; - std::vector< std::string >::value_type *arg4 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - size_t val3 ; - int ecode3 = 0 ; - int res4 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOOO:StringVector_insert",&obj0,&obj1,&obj2,&obj3)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_insert" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "StringVector_insert" "', argument " "2"" of type '" "std::vector< std::string >::iterator""'"); - } else { - swig::SwigPyIterator_T::iterator > *iter_t = dynamic_cast::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "StringVector_insert" "', argument " "2"" of type '" "std::vector< std::string >::iterator""'"); - } - } - ecode3 = SWIG_AsVal_size_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "StringVector_insert" "', argument " "3"" of type '" "std::vector< std::string >::size_type""'"); - } - arg3 = static_cast< std::vector< std::string >::size_type >(val3); - { - std::string *ptr = (std::string *)0; - res4 = SWIG_AsPtr_std_string(obj3, &ptr); - if 
(!SWIG_IsOK(res4)) { - SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "StringVector_insert" "', argument " "4"" of type '" "std::vector< std::string >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "StringVector_insert" "', argument " "4"" of type '" "std::vector< std::string >::value_type const &""'"); - } - arg4 = ptr; - } - std_vector_Sl_std_string_Sg__insert__SWIG_1(arg1,arg2,arg3,(std::string const &)*arg4); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res4)) delete arg4; - return resultobj; -fail: - if (SWIG_IsNewObj(res4)) delete arg4; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_insert(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[5] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 4) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast::iterator > *>(iter) != 0)); - if (_v) { - int res = SWIG_AsPtr_std_string(argv[2], (std::string**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_StringVector_insert__SWIG_0(self, args); - } - } - } - } - if (argc == 4) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast::iterator > *>(iter) != 0)); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = SWIG_AsPtr_std_string(argv[3], (std::string**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_StringVector_insert__SWIG_1(self, args); - } - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'StringVector_insert'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::string >::insert(std::vector< std::string >::iterator,std::vector< std::string >::value_type const &)\n" - " std::vector< std::string >::insert(std::vector< std::string >::iterator,std::vector< std::string >::size_type,std::vector< std::string >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_StringVector_reserve(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - std::vector< std::string >::size_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:StringVector_reserve",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_reserve" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - ecode2 = 
SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "StringVector_reserve" "', argument " "2"" of type '" "std::vector< std::string >::size_type""'"); - } - arg2 = static_cast< std::vector< std::string >::size_type >(val2); - (arg1)->reserve(arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringVector_capacity(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::string >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:StringVector_capacity",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringVector_capacity" "', argument " "1"" of type '" "std::vector< std::string > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - result = ((std::vector< std::string > const *)arg1)->capacity(); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_delete_StringVector(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::string > *arg1 = (std::vector< std::string > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:delete_StringVector",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, SWIG_POINTER_DISOWN | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "delete_StringVector" "', argument " "1"" of type '" "std::vector< std::string > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::string > * >(argp1); - delete arg1; - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *StringVector_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *obj; - if (!PyArg_ParseTuple(args,(char *)"O:swigregister", &obj)) return NULL; - SWIG_TypeNewClientData(SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, SWIG_NewClientData(obj)); - return SWIG_Py_Void(); -} - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_iterator(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - PyObject **arg2 = (PyObject **) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - swig::SwigPyIterator *result = 0 ; - - arg2 = &obj0; - if (!PyArg_ParseTuple(args,(char *)"O:VectorOfStructVectorDouble_iterator",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_iterator" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - result = (swig::SwigPyIterator *)std_vector_Sl_std_vector_Sl_double_Sg__Sg__iterator(arg1,arg2); - resultobj = 
SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_swig__SwigPyIterator, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble___nonzero__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:VectorOfStructVectorDouble___nonzero__",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble___nonzero__" "', argument " "1"" of type '" "std::vector< std::vector< double > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - result = (bool)std_vector_Sl_std_vector_Sl_double_Sg__Sg____nonzero__((std::vector< std::vector< double > > const *)arg1); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble___bool__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:VectorOfStructVectorDouble___bool__",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble___bool__" "', argument " "1"" of type '" "std::vector< std::vector< double > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - result = (bool)std_vector_Sl_std_vector_Sl_double_Sg__Sg____bool__((std::vector< std::vector< double > > const *)arg1); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble___len__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< double > >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:VectorOfStructVectorDouble___len__",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble___len__" "', argument " "1"" of type '" "std::vector< std::vector< double > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - result = std_vector_Sl_std_vector_Sl_double_Sg__Sg____len__((std::vector< std::vector< double > > const *)arg1); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject 
*_wrap_VectorOfStructVectorDouble___getslice__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - std::vector< std::vector< double > >::difference_type arg2 ; - std::vector< std::vector< double > >::difference_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:VectorOfStructVectorDouble___getslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble___getslice__" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "VectorOfStructVectorDouble___getslice__" "', argument " "2"" of type '" "std::vector< std::vector< double > >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::vector< double > >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "VectorOfStructVectorDouble___getslice__" "', argument " "3"" of type '" "std::vector< std::vector< double > >::difference_type""'"); - } - arg3 = static_cast< std::vector< std::vector< double > >::difference_type >(val3); - try { - result = (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *)std_vector_Sl_std_vector_Sl_double_Sg__Sg____getslice__(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble___setslice____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - std::vector< std::vector< double > >::difference_type arg2 ; - std::vector< std::vector< double > >::difference_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:VectorOfStructVectorDouble___setslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - 
SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble___setslice__" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "VectorOfStructVectorDouble___setslice__" "', argument " "2"" of type '" "std::vector< std::vector< double > >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::vector< double > >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "VectorOfStructVectorDouble___setslice__" "', argument " "3"" of type '" "std::vector< std::vector< double > >::difference_type""'"); - } - arg3 = static_cast< std::vector< std::vector< double > >::difference_type >(val3); - try { - std_vector_Sl_std_vector_Sl_double_Sg__Sg____setslice____SWIG_0(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble___setslice____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - std::vector< std::vector< double > >::difference_type arg2 ; - std::vector< std::vector< double > >::difference_type arg3 ; - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *arg4 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - int res4 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOOO:VectorOfStructVectorDouble___setslice__",&obj0,&obj1,&obj2,&obj3)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble___setslice__" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "VectorOfStructVectorDouble___setslice__" "', argument " "2"" of type '" "std::vector< std::vector< double > >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::vector< double > >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "VectorOfStructVectorDouble___setslice__" "', argument " "3"" of type '" "std::vector< std::vector< double > >::difference_type""'"); - } - arg3 = static_cast< std::vector< std::vector< double > >::difference_type >(val3); - { - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *ptr = (std::vector< std::vector< 
double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *)0; - res4 = swig::asptr(obj3, &ptr); - if (!SWIG_IsOK(res4)) { - SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "VectorOfStructVectorDouble___setslice__" "', argument " "4"" of type '" "std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "VectorOfStructVectorDouble___setslice__" "', argument " "4"" of type '" "std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &""'"); - } - arg4 = ptr; - } - try { - std_vector_Sl_std_vector_Sl_double_Sg__Sg____setslice____SWIG_1(arg1,arg2,arg3,(std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &)*arg4); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res4)) delete arg4; - return resultobj; -fail: - if (SWIG_IsNewObj(res4)) delete arg4; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble___setslice__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[5] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 4) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_VectorOfStructVectorDouble___setslice____SWIG_0(self, args); - } - } - } - } - if (argc == 4) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[3], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_VectorOfStructVectorDouble___setslice____SWIG_1(self, args); - } - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'VectorOfStructVectorDouble___setslice__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::vector< double > >::__setslice__(std::vector< std::vector< double > >::difference_type,std::vector< std::vector< double > >::difference_type)\n" - " std::vector< std::vector< double > >::__setslice__(std::vector< std::vector< double > >::difference_type,std::vector< std::vector< double > >::difference_type,std::vector< std::vector< double,std::allocator< double > 
>,std::allocator< std::vector< double,std::allocator< double > > > > const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble___delslice__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - std::vector< std::vector< double > >::difference_type arg2 ; - std::vector< std::vector< double > >::difference_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:VectorOfStructVectorDouble___delslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble___delslice__" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "VectorOfStructVectorDouble___delslice__" "', argument " "2"" of type '" "std::vector< std::vector< double > >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::vector< double > >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "VectorOfStructVectorDouble___delslice__" "', argument " "3"" of type '" "std::vector< std::vector< double > >::difference_type""'"); - } - arg3 = static_cast< std::vector< std::vector< double > >::difference_type >(val3); - try { - std_vector_Sl_std_vector_Sl_double_Sg__Sg____delslice__(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble___delitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - std::vector< std::vector< double > >::difference_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:VectorOfStructVectorDouble___delitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble___delitem__" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "VectorOfStructVectorDouble___delitem__" "', argument " "2"" of type '" "std::vector< std::vector< double > 
>::difference_type""'"); - } - arg2 = static_cast< std::vector< std::vector< double > >::difference_type >(val2); - try { - std_vector_Sl_std_vector_Sl_double_Sg__Sg____delitem____SWIG_0(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble___getitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:VectorOfStructVectorDouble___getitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble___getitem__" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "VectorOfStructVectorDouble___getitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - result = (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *)std_vector_Sl_std_vector_Sl_double_Sg__Sg____getitem____SWIG_0(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble___setitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:VectorOfStructVectorDouble___setitem__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble___setitem__" "', argument " "1"" of type '" "std::vector< std::vector< double > > 
*""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "VectorOfStructVectorDouble___setitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - { - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *ptr = (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "VectorOfStructVectorDouble___setitem__" "', argument " "3"" of type '" "std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "VectorOfStructVectorDouble___setitem__" "', argument " "3"" of type '" "std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &""'"); - } - arg3 = ptr; - } - try { - std_vector_Sl_std_vector_Sl_double_Sg__Sg____setitem____SWIG_0(arg1,arg2,(std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &)*arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble___setitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:VectorOfStructVectorDouble___setitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble___setitem__" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "VectorOfStructVectorDouble___setitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - std_vector_Sl_std_vector_Sl_double_Sg__Sg____setitem____SWIG_1(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble___delitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< 
std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:VectorOfStructVectorDouble___delitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble___delitem__" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "VectorOfStructVectorDouble___delitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - std_vector_Sl_std_vector_Sl_double_Sg__Sg____delitem____SWIG_1(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble___delitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_VectorOfStructVectorDouble___delitem____SWIG_1(self, args); - } - } - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_VectorOfStructVectorDouble___delitem____SWIG_0(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'VectorOfStructVectorDouble___delitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::vector< double > >::__delitem__(std::vector< std::vector< double > >::difference_type)\n" - " std::vector< std::vector< double > >::__delitem__(PySliceObject *)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble___getitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - std::vector< std::vector< double > >::difference_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< std::vector< double > >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:VectorOfStructVectorDouble___getitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble___getitem__" "', argument " "1"" of type '" "std::vector< std::vector< double > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "VectorOfStructVectorDouble___getitem__" "', argument " "2"" of type '" "std::vector< std::vector< double > >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::vector< double > >::difference_type >(val2); - try { - result = (std::vector< std::vector< double > >::value_type *) &std_vector_Sl_std_vector_Sl_double_Sg__Sg____getitem____SWIG_1((std::vector< std::vector< double > > const *)arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = swig::from(static_cast< std::vector< double,std::allocator< double > > >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble___getitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_VectorOfStructVectorDouble___getitem____SWIG_0(self, args); - } - } - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_VectorOfStructVectorDouble___getitem____SWIG_1(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'VectorOfStructVectorDouble___getitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::vector< double > >::__getitem__(PySliceObject *)\n" - " std::vector< std::vector< double > >::__getitem__(std::vector< std::vector< double > >::difference_type) const\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble___setitem____SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - std::vector< std::vector< double > >::difference_type arg2 ; - std::vector< std::vector< double > >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:VectorOfStructVectorDouble___setitem__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble___setitem__" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "VectorOfStructVectorDouble___setitem__" "', argument " "2"" of type '" "std::vector< std::vector< double > >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::vector< double > >::difference_type >(val2); - { - std::vector< double,std::allocator< double > > *ptr = (std::vector< double,std::allocator< double > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "VectorOfStructVectorDouble___setitem__" "', argument " "3"" of type '" "std::vector< std::vector< double > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "VectorOfStructVectorDouble___setitem__" "', argument " "3"" of type '" "std::vector< std::vector< double > >::value_type const &""'"); - } - arg3 = ptr; - } - try { - std_vector_Sl_std_vector_Sl_double_Sg__Sg____setitem____SWIG_2(arg1,arg2,(std::vector< double,std::allocator< double > > const &)*arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble___setitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_VectorOfStructVectorDouble___setitem____SWIG_1(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - int res = swig::asptr(argv[2], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_VectorOfStructVectorDouble___setitem____SWIG_0(self, args); - } - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[2], (std::vector< double,std::allocator< double > >**)(0)); - _v = 
SWIG_CheckState(res); - if (_v) { - return _wrap_VectorOfStructVectorDouble___setitem____SWIG_2(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'VectorOfStructVectorDouble___setitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::vector< double > >::__setitem__(PySliceObject *,std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &)\n" - " std::vector< std::vector< double > >::__setitem__(PySliceObject *)\n" - " std::vector< std::vector< double > >::__setitem__(std::vector< std::vector< double > >::difference_type,std::vector< std::vector< double > >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_pop(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< double > >::value_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:VectorOfStructVectorDouble_pop",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_pop" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - try { - result = std_vector_Sl_std_vector_Sl_double_Sg__Sg__pop(arg1); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = swig::from(static_cast< std::vector< double,std::allocator< double > > >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_append(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - std::vector< std::vector< double > >::value_type *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res2 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:VectorOfStructVectorDouble_append",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_append" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - { - std::vector< double,std::allocator< double > > *ptr = (std::vector< double,std::allocator< double > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "VectorOfStructVectorDouble_append" "', argument " "2"" of type '" "std::vector< std::vector< double > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "VectorOfStructVectorDouble_append" "', argument " "2"" of type '" "std::vector< std::vector< double > 
>::value_type const &""'"); - } - arg2 = ptr; - } - std_vector_Sl_std_vector_Sl_double_Sg__Sg__append(arg1,(std::vector< double,std::allocator< double > > const &)*arg2); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_VectorOfStructVectorDouble__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)":new_VectorOfStructVectorDouble")) SWIG_fail; - result = (std::vector< std::vector< double > > *)new std::vector< std::vector< double > >(); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_VectorOfStructVectorDouble__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double,std::allocator< double > > > *arg1 = 0 ; - int res1 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - std::vector< std::vector< double > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_VectorOfStructVectorDouble",&obj0)) SWIG_fail; - { - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *ptr = (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "new_VectorOfStructVectorDouble" "', argument " "1"" of type '" "std::vector< std::vector< double,std::allocator< double > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "new_VectorOfStructVectorDouble" "', argument " "1"" of type '" "std::vector< std::vector< double,std::allocator< double > > > const &""'"); - } - arg1 = ptr; - } - result = (std::vector< std::vector< double > > *)new std::vector< std::vector< double > >((std::vector< std::vector< double,std::allocator< double > > > const &)*arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, SWIG_POINTER_NEW | 0 ); - if (SWIG_IsNewObj(res1)) delete arg1; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_empty(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:VectorOfStructVectorDouble_empty",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_empty" "', argument " "1"" of type '" "std::vector< std::vector< double > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< 
double > > * >(argp1); - result = (bool)((std::vector< std::vector< double > > const *)arg1)->empty(); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_size(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< double > >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:VectorOfStructVectorDouble_size",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_size" "', argument " "1"" of type '" "std::vector< std::vector< double > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - result = ((std::vector< std::vector< double > > const *)arg1)->size(); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_swap(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - std::vector< std::vector< double,std::allocator< double > > > *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:VectorOfStructVectorDouble_swap",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_swap" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, &argp2, SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 ); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "VectorOfStructVectorDouble_swap" "', argument " "2"" of type '" "std::vector< std::vector< double,std::allocator< double > > > &""'"); - } - if (!argp2) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "VectorOfStructVectorDouble_swap" "', argument " "2"" of type '" "std::vector< std::vector< double,std::allocator< double > > > &""'"); - } - arg2 = reinterpret_cast< std::vector< std::vector< double,std::allocator< double > > > * >(argp2); - (arg1)->swap(*arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_begin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< double > >::iterator result; - - if (!PyArg_ParseTuple(args,(char 
*)"O:VectorOfStructVectorDouble_begin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_begin" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - result = (arg1)->begin(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< double > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_end(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< double > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:VectorOfStructVectorDouble_end",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_end" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - result = (arg1)->end(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< double > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_rbegin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< double > >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:VectorOfStructVectorDouble_rbegin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_rbegin" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - result = (arg1)->rbegin(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< double > >::reverse_iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_rend(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< double > >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char 
*)"O:VectorOfStructVectorDouble_rend",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_rend" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - result = (arg1)->rend(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< double > >::reverse_iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_clear(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:VectorOfStructVectorDouble_clear",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_clear" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - (arg1)->clear(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_get_allocator(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - SwigValueWrapper< std::allocator< std::vector< double,std::allocator< double > > > > result; - - if (!PyArg_ParseTuple(args,(char *)"O:VectorOfStructVectorDouble_get_allocator",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_get_allocator" "', argument " "1"" of type '" "std::vector< std::vector< double > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - result = ((std::vector< std::vector< double > > const *)arg1)->get_allocator(); - resultobj = SWIG_NewPointerObj((new std::vector< std::vector< double > >::allocator_type(static_cast< const std::vector< std::vector< double > >::allocator_type& >(result))), SWIGTYPE_p_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_VectorOfStructVectorDouble__SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > >::size_type arg1 ; - size_t val1 ; - int ecode1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< double > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_VectorOfStructVectorDouble",&obj0)) SWIG_fail; - ecode1 = 
SWIG_AsVal_size_t(obj0, &val1); - if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_VectorOfStructVectorDouble" "', argument " "1"" of type '" "std::vector< std::vector< double > >::size_type""'"); - } - arg1 = static_cast< std::vector< std::vector< double > >::size_type >(val1); - result = (std::vector< std::vector< double > > *)new std::vector< std::vector< double > >(arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_pop_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:VectorOfStructVectorDouble_pop_back",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_pop_back" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - (arg1)->pop_back(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_resize__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - std::vector< std::vector< double > >::size_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:VectorOfStructVectorDouble_resize",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_resize" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "VectorOfStructVectorDouble_resize" "', argument " "2"" of type '" "std::vector< std::vector< double > >::size_type""'"); - } - arg2 = static_cast< std::vector< std::vector< double > >::size_type >(val2); - (arg1)->resize(arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_erase__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - std::vector< std::vector< double > >::iterator arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< std::vector< double > >::iterator result; - - 
if (!PyArg_ParseTuple(args,(char *)"OO:VectorOfStructVectorDouble_erase",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_erase" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "VectorOfStructVectorDouble_erase" "', argument " "2"" of type '" "std::vector< std::vector< double > >::iterator""'"); - } else { - swig::SwigPyIterator_T >::iterator > *iter_t = dynamic_cast >::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "VectorOfStructVectorDouble_erase" "', argument " "2"" of type '" "std::vector< std::vector< double > >::iterator""'"); - } - } - result = std_vector_Sl_std_vector_Sl_double_Sg__Sg__erase__SWIG_0(arg1,arg2); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< double > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_erase__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - std::vector< std::vector< double > >::iterator arg2 ; - std::vector< std::vector< double > >::iterator arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - swig::SwigPyIterator *iter3 = 0 ; - int res3 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< std::vector< double > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OOO:VectorOfStructVectorDouble_erase",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_erase" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "VectorOfStructVectorDouble_erase" "', argument " "2"" of type '" "std::vector< std::vector< double > >::iterator""'"); - } else { - swig::SwigPyIterator_T >::iterator > *iter_t = dynamic_cast >::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "VectorOfStructVectorDouble_erase" "', argument " "2"" of type '" "std::vector< std::vector< double > >::iterator""'"); - } - } - res3 = SWIG_ConvertPtr(obj2, SWIG_as_voidptrptr(&iter3), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res3) || !iter3) { - 
SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "VectorOfStructVectorDouble_erase" "', argument " "3"" of type '" "std::vector< std::vector< double > >::iterator""'"); - } else { - swig::SwigPyIterator_T >::iterator > *iter_t = dynamic_cast >::iterator > *>(iter3); - if (iter_t) { - arg3 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "VectorOfStructVectorDouble_erase" "', argument " "3"" of type '" "std::vector< std::vector< double > >::iterator""'"); - } - } - result = std_vector_Sl_std_vector_Sl_double_Sg__Sg__erase__SWIG_1(arg1,arg2,arg3); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< double > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_erase(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast >::iterator > *>(iter) != 0)); - if (_v) { - return _wrap_VectorOfStructVectorDouble_erase__SWIG_0(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast >::iterator > *>(iter) != 0)); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[2], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast >::iterator > *>(iter) != 0)); - if (_v) { - return _wrap_VectorOfStructVectorDouble_erase__SWIG_1(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'VectorOfStructVectorDouble_erase'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::vector< double > >::erase(std::vector< std::vector< double > >::iterator)\n" - " std::vector< std::vector< double > >::erase(std::vector< std::vector< double > >::iterator,std::vector< std::vector< double > >::iterator)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_new_VectorOfStructVectorDouble__SWIG_3(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > >::size_type arg1 ; - std::vector< std::vector< double > >::value_type *arg2 = 0 ; - size_t val1 ; - int ecode1 = 0 ; - int res2 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< std::vector< double > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:new_VectorOfStructVectorDouble",&obj0,&obj1)) SWIG_fail; - ecode1 = SWIG_AsVal_size_t(obj0, &val1); - 
if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_VectorOfStructVectorDouble" "', argument " "1"" of type '" "std::vector< std::vector< double > >::size_type""'"); - } - arg1 = static_cast< std::vector< std::vector< double > >::size_type >(val1); - { - std::vector< double,std::allocator< double > > *ptr = (std::vector< double,std::allocator< double > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "new_VectorOfStructVectorDouble" "', argument " "2"" of type '" "std::vector< std::vector< double > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "new_VectorOfStructVectorDouble" "', argument " "2"" of type '" "std::vector< std::vector< double > >::value_type const &""'"); - } - arg2 = ptr; - } - result = (std::vector< std::vector< double > > *)new std::vector< std::vector< double > >(arg1,(std::vector< std::vector< double > >::value_type const &)*arg2); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, SWIG_POINTER_NEW | 0 ); - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_VectorOfStructVectorDouble(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 0) { - return _wrap_new_VectorOfStructVectorDouble__SWIG_0(self, args); - } - if (argc == 1) { - int _v; - { - int res = SWIG_AsVal_size_t(argv[0], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_new_VectorOfStructVectorDouble__SWIG_2(self, args); - } - } - if (argc == 1) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_new_VectorOfStructVectorDouble__SWIG_1(self, args); - } - } - if (argc == 2) { - int _v; - { - int res = SWIG_AsVal_size_t(argv[0], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[1], (std::vector< double,std::allocator< double > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_new_VectorOfStructVectorDouble__SWIG_3(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'new_VectorOfStructVectorDouble'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::vector< double > >::vector()\n" - " std::vector< std::vector< double > >::vector(std::vector< std::vector< double,std::allocator< double > > > const &)\n" - " std::vector< std::vector< double > >::vector(std::vector< std::vector< double > >::size_type)\n" - " std::vector< std::vector< double > >::vector(std::vector< std::vector< double > >::size_type,std::vector< std::vector< double > >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_push_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - 
std::vector< std::vector< double > >::value_type *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res2 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:VectorOfStructVectorDouble_push_back",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_push_back" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - { - std::vector< double,std::allocator< double > > *ptr = (std::vector< double,std::allocator< double > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "VectorOfStructVectorDouble_push_back" "', argument " "2"" of type '" "std::vector< std::vector< double > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "VectorOfStructVectorDouble_push_back" "', argument " "2"" of type '" "std::vector< std::vector< double > >::value_type const &""'"); - } - arg2 = ptr; - } - (arg1)->push_back((std::vector< std::vector< double > >::value_type const &)*arg2); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_front(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< double > >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:VectorOfStructVectorDouble_front",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_front" "', argument " "1"" of type '" "std::vector< std::vector< double > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - result = (std::vector< std::vector< double > >::value_type *) &((std::vector< std::vector< double > > const *)arg1)->front(); - resultobj = swig::from(static_cast< std::vector< double,std::allocator< double > > >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< double > >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:VectorOfStructVectorDouble_back",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_back" "', 
argument " "1"" of type '" "std::vector< std::vector< double > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - result = (std::vector< std::vector< double > >::value_type *) &((std::vector< std::vector< double > > const *)arg1)->back(); - resultobj = swig::from(static_cast< std::vector< double,std::allocator< double > > >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_assign(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - std::vector< std::vector< double > >::size_type arg2 ; - std::vector< std::vector< double > >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:VectorOfStructVectorDouble_assign",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_assign" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "VectorOfStructVectorDouble_assign" "', argument " "2"" of type '" "std::vector< std::vector< double > >::size_type""'"); - } - arg2 = static_cast< std::vector< std::vector< double > >::size_type >(val2); - { - std::vector< double,std::allocator< double > > *ptr = (std::vector< double,std::allocator< double > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "VectorOfStructVectorDouble_assign" "', argument " "3"" of type '" "std::vector< std::vector< double > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "VectorOfStructVectorDouble_assign" "', argument " "3"" of type '" "std::vector< std::vector< double > >::value_type const &""'"); - } - arg3 = ptr; - } - (arg1)->assign(arg2,(std::vector< std::vector< double > >::value_type const &)*arg3); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_resize__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - std::vector< std::vector< double > >::size_type arg2 ; - std::vector< std::vector< double > >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:VectorOfStructVectorDouble_resize",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if 
(!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_resize" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "VectorOfStructVectorDouble_resize" "', argument " "2"" of type '" "std::vector< std::vector< double > >::size_type""'"); - } - arg2 = static_cast< std::vector< std::vector< double > >::size_type >(val2); - { - std::vector< double,std::allocator< double > > *ptr = (std::vector< double,std::allocator< double > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "VectorOfStructVectorDouble_resize" "', argument " "3"" of type '" "std::vector< std::vector< double > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "VectorOfStructVectorDouble_resize" "', argument " "3"" of type '" "std::vector< std::vector< double > >::value_type const &""'"); - } - arg3 = ptr; - } - (arg1)->resize(arg2,(std::vector< std::vector< double > >::value_type const &)*arg3); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_resize(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_VectorOfStructVectorDouble_resize__SWIG_0(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[2], (std::vector< double,std::allocator< double > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_VectorOfStructVectorDouble_resize__SWIG_1(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'VectorOfStructVectorDouble_resize'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::vector< double > >::resize(std::vector< std::vector< double > >::size_type)\n" - " std::vector< std::vector< double > >::resize(std::vector< std::vector< double > >::size_type,std::vector< std::vector< double > >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_insert__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - std::vector< std::vector< double > >::iterator arg2 ; - std::vector< std::vector< 
double > >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< std::vector< double > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OOO:VectorOfStructVectorDouble_insert",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_insert" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "VectorOfStructVectorDouble_insert" "', argument " "2"" of type '" "std::vector< std::vector< double > >::iterator""'"); - } else { - swig::SwigPyIterator_T >::iterator > *iter_t = dynamic_cast >::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "VectorOfStructVectorDouble_insert" "', argument " "2"" of type '" "std::vector< std::vector< double > >::iterator""'"); - } - } - { - std::vector< double,std::allocator< double > > *ptr = (std::vector< double,std::allocator< double > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "VectorOfStructVectorDouble_insert" "', argument " "3"" of type '" "std::vector< std::vector< double > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "VectorOfStructVectorDouble_insert" "', argument " "3"" of type '" "std::vector< std::vector< double > >::value_type const &""'"); - } - arg3 = ptr; - } - result = std_vector_Sl_std_vector_Sl_double_Sg__Sg__insert__SWIG_0(arg1,arg2,(std::vector< double,std::allocator< double > > const &)*arg3); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< double > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_insert__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double > > *arg1 = (std::vector< std::vector< double > > *) 0 ; - std::vector< std::vector< double > >::iterator arg2 ; - std::vector< std::vector< double > >::size_type arg3 ; - std::vector< std::vector< double > >::value_type *arg4 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - size_t val3 ; - int ecode3 = 0 ; - int res4 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOOO:VectorOfStructVectorDouble_insert",&obj0,&obj1,&obj2,&obj3)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorDouble_insert" "', argument " "1"" of type '" "std::vector< std::vector< double > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< double > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "VectorOfStructVectorDouble_insert" "', argument " "2"" of type '" "std::vector< std::vector< double > >::iterator""'"); - } else { - swig::SwigPyIterator_T >::iterator > *iter_t = dynamic_cast >::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "VectorOfStructVectorDouble_insert" "', argument " "2"" of type '" "std::vector< std::vector< double > >::iterator""'"); - } - } - ecode3 = SWIG_AsVal_size_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "VectorOfStructVectorDouble_insert" "', argument " "3"" of type '" "std::vector< std::vector< double > >::size_type""'"); - } - arg3 = static_cast< std::vector< std::vector< double > >::size_type >(val3); - { - std::vector< double,std::allocator< double > > *ptr = (std::vector< double,std::allocator< double > > *)0; - res4 = swig::asptr(obj3, &ptr); - if (!SWIG_IsOK(res4)) { - SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "VectorOfStructVectorDouble_insert" "', argument " "4"" of type '" "std::vector< std::vector< double > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "VectorOfStructVectorDouble_insert" "', argument " "4"" of type '" "std::vector< std::vector< double > >::value_type const &""'"); - } - arg4 = ptr; - } - std_vector_Sl_std_vector_Sl_double_Sg__Sg__insert__SWIG_1(arg1,arg2,arg3,(std::vector< double,std::allocator< double > > const &)*arg4); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res4)) delete arg4; - return resultobj; -fail: - if (SWIG_IsNewObj(res4)) delete arg4; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorDouble_insert(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[5] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 4) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast >::iterator > *>(iter) != 0)); - if (_v) { - int res = swig::asptr(argv[2], (std::vector< double,std::allocator< double > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_VectorOfStructVectorDouble_insert__SWIG_0(self, args); - } - } - } - } - if (argc == 4) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > 
[Deleted hunk continues: auto-generated SWIG Python wrapper code, heavily mangled during extraction (diff lines run together, template arguments stripped from angle brackets). The recoverable content is the binding layer for two nested-vector classes, VectorOfStructVectorDouble (std::vector<std::vector<double>>) and VectorOfStructVectorInt (std::vector<std::vector<int>>): the tail of the VectorOfStructVectorDouble wrappers (insert overload dispatch, reserve, capacity, destructor, swigregister) followed by the full VectorOfStructVectorInt surface — iterator/__nonzero__/__bool__/__len__, __getslice__/__setslice__/__delslice__, __getitem__/__setitem__/__delitem__ with their overload dispatchers, pop, append, empty, size, swap, begin/end/rbegin/rend, clear, get_allocator, pop_back, resize, erase (single-iterator and range overloads plus dispatcher), and the constructor overloads with their dispatcher.]
const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorInt_push_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< int > > *arg1 = (std::vector< std::vector< int > > *) 0 ; - std::vector< std::vector< int > >::value_type *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res2 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:VectorOfStructVectorInt_push_back",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorInt_push_back" "', argument " "1"" of type '" "std::vector< std::vector< int > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< int > > * >(argp1); - { - std::vector< int,std::allocator< int > > *ptr = (std::vector< int,std::allocator< int > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "VectorOfStructVectorInt_push_back" "', argument " "2"" of type '" "std::vector< std::vector< int > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "VectorOfStructVectorInt_push_back" "', argument " "2"" of type '" "std::vector< std::vector< int > >::value_type const &""'"); - } - arg2 = ptr; - } - (arg1)->push_back((std::vector< std::vector< int > >::value_type const &)*arg2); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorInt_front(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< int > > *arg1 = (std::vector< std::vector< int > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< int > >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:VectorOfStructVectorInt_front",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorInt_front" "', argument " "1"" of type '" "std::vector< std::vector< int > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< int > > * >(argp1); - result = (std::vector< std::vector< int > >::value_type *) &((std::vector< std::vector< int > > const *)arg1)->front(); - resultobj = swig::from(static_cast< std::vector< int,std::allocator< int > > >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorInt_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< int > > *arg1 = (std::vector< std::vector< int > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< int > >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:VectorOfStructVectorInt_back",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorInt_back" "', argument " "1"" of type '" "std::vector< std::vector< int > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< int > > * >(argp1); - result = (std::vector< std::vector< int > >::value_type *) &((std::vector< std::vector< int > > const *)arg1)->back(); - resultobj = swig::from(static_cast< std::vector< int,std::allocator< int > > >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorInt_assign(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< int > > *arg1 = (std::vector< std::vector< int > > *) 0 ; - std::vector< std::vector< int > >::size_type arg2 ; - std::vector< std::vector< int > >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:VectorOfStructVectorInt_assign",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorInt_assign" "', argument " "1"" of type '" "std::vector< std::vector< int > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< int > > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "VectorOfStructVectorInt_assign" "', argument " "2"" of type '" "std::vector< std::vector< int > >::size_type""'"); - } - arg2 = static_cast< std::vector< std::vector< int > >::size_type >(val2); - { - std::vector< int,std::allocator< int > > *ptr = (std::vector< int,std::allocator< int > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "VectorOfStructVectorInt_assign" "', argument " "3"" of type '" "std::vector< std::vector< int > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "VectorOfStructVectorInt_assign" "', argument " "3"" of type '" "std::vector< std::vector< int > >::value_type const &""'"); - } - arg3 = ptr; - } - (arg1)->assign(arg2,(std::vector< std::vector< int > >::value_type const &)*arg3); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorInt_resize__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< int > > *arg1 = (std::vector< std::vector< int > > *) 0 ; - std::vector< std::vector< int > >::size_type arg2 ; - std::vector< std::vector< int > >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:VectorOfStructVectorInt_resize",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorInt_resize" "', argument " "1"" of type '" "std::vector< std::vector< int > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< int > > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "VectorOfStructVectorInt_resize" "', argument " "2"" of type '" "std::vector< std::vector< int > >::size_type""'"); - } - arg2 = static_cast< std::vector< std::vector< int > >::size_type >(val2); - { - std::vector< int,std::allocator< int > > *ptr = (std::vector< int,std::allocator< int > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "VectorOfStructVectorInt_resize" "', argument " "3"" of type '" "std::vector< std::vector< int > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "VectorOfStructVectorInt_resize" "', argument " "3"" of type '" "std::vector< std::vector< int > >::value_type const &""'"); - } - arg3 = ptr; - } - (arg1)->resize(arg2,(std::vector< std::vector< int > >::value_type const &)*arg3); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorInt_resize(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_VectorOfStructVectorInt_resize__SWIG_0(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[2], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_VectorOfStructVectorInt_resize__SWIG_1(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'VectorOfStructVectorInt_resize'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::vector< int > >::resize(std::vector< std::vector< int > >::size_type)\n" - " std::vector< std::vector< int > >::resize(std::vector< std::vector< int > >::size_type,std::vector< std::vector< int > >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorInt_insert__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< int > > *arg1 = (std::vector< std::vector< int > > *) 0 ; - std::vector< std::vector< int > >::iterator 
arg2 ; - std::vector< std::vector< int > >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< std::vector< int > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OOO:VectorOfStructVectorInt_insert",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorInt_insert" "', argument " "1"" of type '" "std::vector< std::vector< int > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< int > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "VectorOfStructVectorInt_insert" "', argument " "2"" of type '" "std::vector< std::vector< int > >::iterator""'"); - } else { - swig::SwigPyIterator_T >::iterator > *iter_t = dynamic_cast >::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "VectorOfStructVectorInt_insert" "', argument " "2"" of type '" "std::vector< std::vector< int > >::iterator""'"); - } - } - { - std::vector< int,std::allocator< int > > *ptr = (std::vector< int,std::allocator< int > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "VectorOfStructVectorInt_insert" "', argument " "3"" of type '" "std::vector< std::vector< int > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "VectorOfStructVectorInt_insert" "', argument " "3"" of type '" "std::vector< std::vector< int > >::value_type const &""'"); - } - arg3 = ptr; - } - result = std_vector_Sl_std_vector_Sl_int_Sg__Sg__insert__SWIG_0(arg1,arg2,(std::vector< int,std::allocator< int > > const &)*arg3); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< int > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorInt_insert__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< int > > *arg1 = (std::vector< std::vector< int > > *) 0 ; - std::vector< std::vector< int > >::iterator arg2 ; - std::vector< std::vector< int > >::size_type arg3 ; - std::vector< std::vector< int > >::value_type *arg4 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - size_t val3 ; - int ecode3 = 0 ; - int res4 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOOO:VectorOfStructVectorInt_insert",&obj0,&obj1,&obj2,&obj3)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - 
SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorInt_insert" "', argument " "1"" of type '" "std::vector< std::vector< int > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< int > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "VectorOfStructVectorInt_insert" "', argument " "2"" of type '" "std::vector< std::vector< int > >::iterator""'"); - } else { - swig::SwigPyIterator_T >::iterator > *iter_t = dynamic_cast >::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "VectorOfStructVectorInt_insert" "', argument " "2"" of type '" "std::vector< std::vector< int > >::iterator""'"); - } - } - ecode3 = SWIG_AsVal_size_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "VectorOfStructVectorInt_insert" "', argument " "3"" of type '" "std::vector< std::vector< int > >::size_type""'"); - } - arg3 = static_cast< std::vector< std::vector< int > >::size_type >(val3); - { - std::vector< int,std::allocator< int > > *ptr = (std::vector< int,std::allocator< int > > *)0; - res4 = swig::asptr(obj3, &ptr); - if (!SWIG_IsOK(res4)) { - SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "VectorOfStructVectorInt_insert" "', argument " "4"" of type '" "std::vector< std::vector< int > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "VectorOfStructVectorInt_insert" "', argument " "4"" of type '" "std::vector< std::vector< int > >::value_type const &""'"); - } - arg4 = ptr; - } - std_vector_Sl_std_vector_Sl_int_Sg__Sg__insert__SWIG_1(arg1,arg2,arg3,(std::vector< int,std::allocator< int > > const &)*arg4); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res4)) delete arg4; - return resultobj; -fail: - if (SWIG_IsNewObj(res4)) delete arg4; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorInt_insert(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[5] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 4) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast >::iterator > *>(iter) != 0)); - if (_v) { - int res = swig::asptr(argv[2], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_VectorOfStructVectorInt_insert__SWIG_0(self, args); - } - } - } - } - if (argc == 4) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast >::iterator > *>(iter) != 0)); - if (_v) { - { 
- int res = SWIG_AsVal_size_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[3], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_VectorOfStructVectorInt_insert__SWIG_1(self, args); - } - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'VectorOfStructVectorInt_insert'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::vector< int > >::insert(std::vector< std::vector< int > >::iterator,std::vector< std::vector< int > >::value_type const &)\n" - " std::vector< std::vector< int > >::insert(std::vector< std::vector< int > >::iterator,std::vector< std::vector< int > >::size_type,std::vector< std::vector< int > >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorInt_reserve(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< int > > *arg1 = (std::vector< std::vector< int > > *) 0 ; - std::vector< std::vector< int > >::size_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:VectorOfStructVectorInt_reserve",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorInt_reserve" "', argument " "1"" of type '" "std::vector< std::vector< int > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< int > > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "VectorOfStructVectorInt_reserve" "', argument " "2"" of type '" "std::vector< std::vector< int > >::size_type""'"); - } - arg2 = static_cast< std::vector< std::vector< int > >::size_type >(val2); - (arg1)->reserve(arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_VectorOfStructVectorInt_capacity(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< int > > *arg1 = (std::vector< std::vector< int > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< int > >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:VectorOfStructVectorInt_capacity",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "VectorOfStructVectorInt_capacity" "', argument " "1"" of type '" "std::vector< std::vector< int > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< int > > * >(argp1); - result = ((std::vector< std::vector< int > > const *)arg1)->capacity(); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_delete_VectorOfStructVectorInt(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< int > > *arg1 = (std::vector< std::vector< int > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - 
PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:delete_VectorOfStructVectorInt",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t, SWIG_POINTER_DISOWN | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "delete_VectorOfStructVectorInt" "', argument " "1"" of type '" "std::vector< std::vector< int > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< int > > * >(argp1); - delete arg1; - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *VectorOfStructVectorInt_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *obj; - if (!PyArg_ParseTuple(args,(char *)"O:swigregister", &obj)) return NULL; - SWIG_TypeNewClientData(SWIGTYPE_p_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t, SWIG_NewClientData(obj)); - return SWIG_Py_Void(); -} - -SWIGINTERN PyObject *_wrap_FloatVector_iterator(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - PyObject **arg2 = (PyObject **) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - swig::SwigPyIterator *result = 0 ; - - arg2 = &obj0; - if (!PyArg_ParseTuple(args,(char *)"O:FloatVector_iterator",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_iterator" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - result = (swig::SwigPyIterator *)std_vector_Sl_float_Sg__iterator(arg1,arg2); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_swig__SwigPyIterator, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector___nonzero__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:FloatVector___nonzero__",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector___nonzero__" "', argument " "1"" of type '" "std::vector< float > const *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - result = (bool)std_vector_Sl_float_Sg____nonzero__((std::vector< float > const *)arg1); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector___bool__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:FloatVector___bool__",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector___bool__" "', argument " "1"" of type 
'" "std::vector< float > const *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - result = (bool)std_vector_Sl_float_Sg____bool__((std::vector< float > const *)arg1); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector___len__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< float >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:FloatVector___len__",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector___len__" "', argument " "1"" of type '" "std::vector< float > const *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - result = std_vector_Sl_float_Sg____len__((std::vector< float > const *)arg1); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector___getslice__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - std::vector< float >::difference_type arg2 ; - std::vector< float >::difference_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< float,std::allocator< float > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:FloatVector___getslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector___getslice__" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "FloatVector___getslice__" "', argument " "2"" of type '" "std::vector< float >::difference_type""'"); - } - arg2 = static_cast< std::vector< float >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "FloatVector___getslice__" "', argument " "3"" of type '" "std::vector< float >::difference_type""'"); - } - arg3 = static_cast< std::vector< float >::difference_type >(val3); - try { - result = (std::vector< float,std::allocator< float > > *)std_vector_Sl_float_Sg____getslice__(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector___setslice____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - std::vector< float >::difference_type arg2 ; - std::vector< 
float >::difference_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:FloatVector___setslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector___setslice__" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "FloatVector___setslice__" "', argument " "2"" of type '" "std::vector< float >::difference_type""'"); - } - arg2 = static_cast< std::vector< float >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "FloatVector___setslice__" "', argument " "3"" of type '" "std::vector< float >::difference_type""'"); - } - arg3 = static_cast< std::vector< float >::difference_type >(val3); - try { - std_vector_Sl_float_Sg____setslice____SWIG_0(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector___setslice____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - std::vector< float >::difference_type arg2 ; - std::vector< float >::difference_type arg3 ; - std::vector< float,std::allocator< float > > *arg4 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - int res4 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOOO:FloatVector___setslice__",&obj0,&obj1,&obj2,&obj3)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector___setslice__" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "FloatVector___setslice__" "', argument " "2"" of type '" "std::vector< float >::difference_type""'"); - } - arg2 = static_cast< std::vector< float >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "FloatVector___setslice__" "', argument " "3"" of type '" "std::vector< float >::difference_type""'"); - } - arg3 = static_cast< std::vector< float >::difference_type >(val3); - { - std::vector< float,std::allocator< float > > *ptr = (std::vector< float,std::allocator< float > > *)0; - res4 = swig::asptr(obj3, &ptr); - if (!SWIG_IsOK(res4)) { - SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "FloatVector___setslice__" "', argument " "4"" of type 
'" "std::vector< float,std::allocator< float > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "FloatVector___setslice__" "', argument " "4"" of type '" "std::vector< float,std::allocator< float > > const &""'"); - } - arg4 = ptr; - } - try { - std_vector_Sl_float_Sg____setslice____SWIG_1(arg1,arg2,arg3,(std::vector< float,std::allocator< float > > const &)*arg4); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res4)) delete arg4; - return resultobj; -fail: - if (SWIG_IsNewObj(res4)) delete arg4; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector___setslice__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[5] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 4) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< float,std::allocator< float > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_FloatVector___setslice____SWIG_0(self, args); - } - } - } - } - if (argc == 4) { - int _v; - int res = swig::asptr(argv[0], (std::vector< float,std::allocator< float > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[3], (std::vector< float,std::allocator< float > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_FloatVector___setslice____SWIG_1(self, args); - } - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'FloatVector___setslice__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< float >::__setslice__(std::vector< float >::difference_type,std::vector< float >::difference_type)\n" - " std::vector< float >::__setslice__(std::vector< float >::difference_type,std::vector< float >::difference_type,std::vector< float,std::allocator< float > > const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_FloatVector___delslice__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - std::vector< float >::difference_type arg2 ; - std::vector< float >::difference_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:FloatVector___delslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector___delslice__" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if 
(!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "FloatVector___delslice__" "', argument " "2"" of type '" "std::vector< float >::difference_type""'"); - } - arg2 = static_cast< std::vector< float >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "FloatVector___delslice__" "', argument " "3"" of type '" "std::vector< float >::difference_type""'"); - } - arg3 = static_cast< std::vector< float >::difference_type >(val3); - try { - std_vector_Sl_float_Sg____delslice__(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector___delitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - std::vector< float >::difference_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:FloatVector___delitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector___delitem__" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "FloatVector___delitem__" "', argument " "2"" of type '" "std::vector< float >::difference_type""'"); - } - arg2 = static_cast< std::vector< float >::difference_type >(val2); - try { - std_vector_Sl_float_Sg____delitem____SWIG_0(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector___getitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< float,std::allocator< float > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:FloatVector___getitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector___getitem__" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "FloatVector___getitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - result = (std::vector< float,std::allocator< float > > *)std_vector_Sl_float_Sg____getitem____SWIG_0(arg1,arg2); - } - catch(std::out_of_range &_e) { - 
SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector___setitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - std::vector< float,std::allocator< float > > *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:FloatVector___setitem__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector___setitem__" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "FloatVector___setitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - { - std::vector< float,std::allocator< float > > *ptr = (std::vector< float,std::allocator< float > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "FloatVector___setitem__" "', argument " "3"" of type '" "std::vector< float,std::allocator< float > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "FloatVector___setitem__" "', argument " "3"" of type '" "std::vector< float,std::allocator< float > > const &""'"); - } - arg3 = ptr; - } - try { - std_vector_Sl_float_Sg____setitem____SWIG_0(arg1,arg2,(std::vector< float,std::allocator< float > > const &)*arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector___setitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:FloatVector___setitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector___setitem__" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "FloatVector___setitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - std_vector_Sl_float_Sg____setitem____SWIG_1(arg1,arg2); - } - 
catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector___delitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:FloatVector___delitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector___delitem__" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "FloatVector___delitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - std_vector_Sl_float_Sg____delitem____SWIG_1(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector___delitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< float,std::allocator< float > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_FloatVector___delitem____SWIG_1(self, args); - } - } - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< float,std::allocator< float > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_FloatVector___delitem____SWIG_0(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'FloatVector___delitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< float >::__delitem__(std::vector< float >::difference_type)\n" - " std::vector< float >::__delitem__(PySliceObject *)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_FloatVector___getitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - std::vector< float >::difference_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< float >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:FloatVector___getitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" 
"FloatVector___getitem__" "', argument " "1"" of type '" "std::vector< float > const *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "FloatVector___getitem__" "', argument " "2"" of type '" "std::vector< float >::difference_type""'"); - } - arg2 = static_cast< std::vector< float >::difference_type >(val2); - try { - result = (std::vector< float >::value_type *) &std_vector_Sl_float_Sg____getitem____SWIG_1((std::vector< float > const *)arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = SWIG_From_float(static_cast< float >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector___getitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< float,std::allocator< float > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_FloatVector___getitem____SWIG_0(self, args); - } - } - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< float,std::allocator< float > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_FloatVector___getitem____SWIG_1(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'FloatVector___getitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< float >::__getitem__(PySliceObject *)\n" - " std::vector< float >::__getitem__(std::vector< float >::difference_type) const\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_FloatVector___setitem____SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - std::vector< float >::difference_type arg2 ; - std::vector< float >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - std::vector< float >::value_type temp3 ; - float val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:FloatVector___setitem__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector___setitem__" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "FloatVector___setitem__" "', argument " "2"" of type '" "std::vector< float >::difference_type""'"); - } - arg2 = static_cast< std::vector< float >::difference_type >(val2); - ecode3 = SWIG_AsVal_float(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "FloatVector___setitem__" "', argument " "3"" of type '" "std::vector< 
float >::value_type""'"); - } - temp3 = static_cast< std::vector< float >::value_type >(val3); - arg3 = &temp3; - try { - std_vector_Sl_float_Sg____setitem____SWIG_2(arg1,arg2,(float const &)*arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector___setitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< float,std::allocator< float > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_FloatVector___setitem____SWIG_1(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< float,std::allocator< float > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - int res = swig::asptr(argv[2], (std::vector< float,std::allocator< float > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_FloatVector___setitem____SWIG_0(self, args); - } - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< float,std::allocator< float > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_float(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_FloatVector___setitem____SWIG_2(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'FloatVector___setitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< float >::__setitem__(PySliceObject *,std::vector< float,std::allocator< float > > const &)\n" - " std::vector< float >::__setitem__(PySliceObject *)\n" - " std::vector< float >::__setitem__(std::vector< float >::difference_type,std::vector< float >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_pop(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< float >::value_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:FloatVector_pop",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_pop" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - try { - result = (std::vector< float >::value_type)std_vector_Sl_float_Sg__pop(arg1); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = SWIG_From_float(static_cast< float >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_append(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - std::vector< float >::value_type *arg2 = 0 ; - void *argp1 = 0 ; - int 
res1 = 0 ; - std::vector< float >::value_type temp2 ; - float val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:FloatVector_append",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_append" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - ecode2 = SWIG_AsVal_float(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "FloatVector_append" "', argument " "2"" of type '" "std::vector< float >::value_type""'"); - } - temp2 = static_cast< std::vector< float >::value_type >(val2); - arg2 = &temp2; - std_vector_Sl_float_Sg__append(arg1,(float const &)*arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_FloatVector__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)":new_FloatVector")) SWIG_fail; - result = (std::vector< float > *)new std::vector< float >(); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_FloatVector__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = 0 ; - int res1 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - std::vector< float > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_FloatVector",&obj0)) SWIG_fail; - { - std::vector< float,std::allocator< float > > *ptr = (std::vector< float,std::allocator< float > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "new_FloatVector" "', argument " "1"" of type '" "std::vector< float > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "new_FloatVector" "', argument " "1"" of type '" "std::vector< float > const &""'"); - } - arg1 = ptr; - } - result = (std::vector< float > *)new std::vector< float >((std::vector< float > const &)*arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, SWIG_POINTER_NEW | 0 ); - if (SWIG_IsNewObj(res1)) delete arg1; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_empty(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:FloatVector_empty",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_empty" "', argument " "1"" of type '" "std::vector< float > const *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - result = (bool)((std::vector< float > const *)arg1)->empty(); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - 
-SWIGINTERN PyObject *_wrap_FloatVector_size(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< float >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:FloatVector_size",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_size" "', argument " "1"" of type '" "std::vector< float > const *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - result = ((std::vector< float > const *)arg1)->size(); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_swap(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - std::vector< float > *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:FloatVector_swap",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_swap" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, &argp2, SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 ); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "FloatVector_swap" "', argument " "2"" of type '" "std::vector< float > &""'"); - } - if (!argp2) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "FloatVector_swap" "', argument " "2"" of type '" "std::vector< float > &""'"); - } - arg2 = reinterpret_cast< std::vector< float > * >(argp2); - (arg1)->swap(*arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_begin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< float >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:FloatVector_begin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_begin" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - result = (arg1)->begin(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< float >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_end(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< float >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:FloatVector_end",&obj0)) SWIG_fail; - 
res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_end" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - result = (arg1)->end(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< float >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_rbegin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< float >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:FloatVector_rbegin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_rbegin" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - result = (arg1)->rbegin(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< float >::reverse_iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_rend(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< float >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:FloatVector_rend",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_rend" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - result = (arg1)->rend(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< float >::reverse_iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_clear(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:FloatVector_clear",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_clear" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - (arg1)->clear(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_get_allocator(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - SwigValueWrapper< 
std::allocator< float > > result; - - if (!PyArg_ParseTuple(args,(char *)"O:FloatVector_get_allocator",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_get_allocator" "', argument " "1"" of type '" "std::vector< float > const *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - result = ((std::vector< float > const *)arg1)->get_allocator(); - resultobj = SWIG_NewPointerObj((new std::vector< float >::allocator_type(static_cast< const std::vector< float >::allocator_type& >(result))), SWIGTYPE_p_std__allocatorT_float_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_FloatVector__SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float >::size_type arg1 ; - size_t val1 ; - int ecode1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< float > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_FloatVector",&obj0)) SWIG_fail; - ecode1 = SWIG_AsVal_size_t(obj0, &val1); - if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_FloatVector" "', argument " "1"" of type '" "std::vector< float >::size_type""'"); - } - arg1 = static_cast< std::vector< float >::size_type >(val1); - result = (std::vector< float > *)new std::vector< float >(arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_pop_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:FloatVector_pop_back",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_pop_back" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - (arg1)->pop_back(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_resize__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - std::vector< float >::size_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:FloatVector_resize",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_resize" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "FloatVector_resize" "', argument " "2"" of type '" "std::vector< float >::size_type""'"); - } - arg2 = static_cast< std::vector< float >::size_type >(val2); - (arg1)->resize(arg2); - resultobj = SWIG_Py_Void(); 
- return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_erase__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - std::vector< float >::iterator arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< float >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OO:FloatVector_erase",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_erase" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "FloatVector_erase" "', argument " "2"" of type '" "std::vector< float >::iterator""'"); - } else { - swig::SwigPyIterator_T::iterator > *iter_t = dynamic_cast::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "FloatVector_erase" "', argument " "2"" of type '" "std::vector< float >::iterator""'"); - } - } - result = std_vector_Sl_float_Sg__erase__SWIG_0(arg1,arg2); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< float >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_erase__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - std::vector< float >::iterator arg2 ; - std::vector< float >::iterator arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - swig::SwigPyIterator *iter3 = 0 ; - int res3 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< float >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OOO:FloatVector_erase",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_erase" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "FloatVector_erase" "', argument " "2"" of type '" "std::vector< float >::iterator""'"); - } else { - swig::SwigPyIterator_T::iterator > *iter_t = dynamic_cast::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "FloatVector_erase" "', argument " "2"" of type '" "std::vector< float >::iterator""'"); - } - } - res3 = SWIG_ConvertPtr(obj2, SWIG_as_voidptrptr(&iter3), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res3) || !iter3) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "FloatVector_erase" "', 
argument " "3"" of type '" "std::vector< float >::iterator""'"); - } else { - swig::SwigPyIterator_T::iterator > *iter_t = dynamic_cast::iterator > *>(iter3); - if (iter_t) { - arg3 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "FloatVector_erase" "', argument " "3"" of type '" "std::vector< float >::iterator""'"); - } - } - result = std_vector_Sl_float_Sg__erase__SWIG_1(arg1,arg2,arg3); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< float >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_erase(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< float,std::allocator< float > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast::iterator > *>(iter) != 0)); - if (_v) { - return _wrap_FloatVector_erase__SWIG_0(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< float,std::allocator< float > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast::iterator > *>(iter) != 0)); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[2], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast::iterator > *>(iter) != 0)); - if (_v) { - return _wrap_FloatVector_erase__SWIG_1(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'FloatVector_erase'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< float >::erase(std::vector< float >::iterator)\n" - " std::vector< float >::erase(std::vector< float >::iterator,std::vector< float >::iterator)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_new_FloatVector__SWIG_3(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float >::size_type arg1 ; - std::vector< float >::value_type *arg2 = 0 ; - size_t val1 ; - int ecode1 = 0 ; - std::vector< float >::value_type temp2 ; - float val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< float > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:new_FloatVector",&obj0,&obj1)) SWIG_fail; - ecode1 = SWIG_AsVal_size_t(obj0, &val1); - if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_FloatVector" "', argument " "1"" of type '" "std::vector< float >::size_type""'"); - } - arg1 = static_cast< std::vector< float >::size_type >(val1); - ecode2 = SWIG_AsVal_float(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "new_FloatVector" "', argument " "2"" of type '" "std::vector< float >::value_type""'"); - } - temp2 = static_cast< std::vector< float >::value_type >(val2); 
- arg2 = &temp2; - result = (std::vector< float > *)new std::vector< float >(arg1,(std::vector< float >::value_type const &)*arg2); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_FloatVector(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 0) { - return _wrap_new_FloatVector__SWIG_0(self, args); - } - if (argc == 1) { - int _v; - { - int res = SWIG_AsVal_size_t(argv[0], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_new_FloatVector__SWIG_2(self, args); - } - } - if (argc == 1) { - int _v; - int res = swig::asptr(argv[0], (std::vector< float,std::allocator< float > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_new_FloatVector__SWIG_1(self, args); - } - } - if (argc == 2) { - int _v; - { - int res = SWIG_AsVal_size_t(argv[0], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_float(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_new_FloatVector__SWIG_3(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'new_FloatVector'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< float >::vector()\n" - " std::vector< float >::vector(std::vector< float > const &)\n" - " std::vector< float >::vector(std::vector< float >::size_type)\n" - " std::vector< float >::vector(std::vector< float >::size_type,std::vector< float >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_push_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - std::vector< float >::value_type *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - std::vector< float >::value_type temp2 ; - float val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:FloatVector_push_back",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_push_back" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - ecode2 = SWIG_AsVal_float(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "FloatVector_push_back" "', argument " "2"" of type '" "std::vector< float >::value_type""'"); - } - temp2 = static_cast< std::vector< float >::value_type >(val2); - arg2 = &temp2; - (arg1)->push_back((std::vector< float >::value_type const &)*arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_front(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< float >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:FloatVector_front",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_front" "', argument " "1"" of type '" "std::vector< float > const *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - result = (std::vector< float >::value_type *) &((std::vector< float > const *)arg1)->front(); - resultobj = SWIG_From_float(static_cast< float >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< float >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:FloatVector_back",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_back" "', argument " "1"" of type '" "std::vector< float > const *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - result = (std::vector< float >::value_type *) &((std::vector< float > const *)arg1)->back(); - resultobj = SWIG_From_float(static_cast< float >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_assign(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - std::vector< float >::size_type arg2 ; - std::vector< float >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - std::vector< float >::value_type temp3 ; - float val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:FloatVector_assign",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_assign" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "FloatVector_assign" "', argument " "2"" of type '" "std::vector< float >::size_type""'"); - } - arg2 = static_cast< std::vector< float >::size_type >(val2); - ecode3 = SWIG_AsVal_float(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "FloatVector_assign" "', argument " "3"" of type '" "std::vector< float >::value_type""'"); - } - temp3 = static_cast< std::vector< float >::value_type >(val3); - arg3 = &temp3; - (arg1)->assign(arg2,(std::vector< float >::value_type const &)*arg3); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_resize__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - std::vector< float >::size_type arg2 ; - std::vector< float >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - std::vector< float >::value_type temp3 ; - float val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - 
PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:FloatVector_resize",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_resize" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "FloatVector_resize" "', argument " "2"" of type '" "std::vector< float >::size_type""'"); - } - arg2 = static_cast< std::vector< float >::size_type >(val2); - ecode3 = SWIG_AsVal_float(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "FloatVector_resize" "', argument " "3"" of type '" "std::vector< float >::value_type""'"); - } - temp3 = static_cast< std::vector< float >::value_type >(val3); - arg3 = &temp3; - (arg1)->resize(arg2,(std::vector< float >::value_type const &)*arg3); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_resize(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< float,std::allocator< float > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_FloatVector_resize__SWIG_0(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< float,std::allocator< float > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_float(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_FloatVector_resize__SWIG_1(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'FloatVector_resize'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< float >::resize(std::vector< float >::size_type)\n" - " std::vector< float >::resize(std::vector< float >::size_type,std::vector< float >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_insert__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - std::vector< float >::iterator arg2 ; - std::vector< float >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - std::vector< float >::value_type temp3 ; - float val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< float >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OOO:FloatVector_insert",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_insert" "', argument " "1"" of type '" "std::vector< 
float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "FloatVector_insert" "', argument " "2"" of type '" "std::vector< float >::iterator""'"); - } else { - swig::SwigPyIterator_T::iterator > *iter_t = dynamic_cast::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "FloatVector_insert" "', argument " "2"" of type '" "std::vector< float >::iterator""'"); - } - } - ecode3 = SWIG_AsVal_float(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "FloatVector_insert" "', argument " "3"" of type '" "std::vector< float >::value_type""'"); - } - temp3 = static_cast< std::vector< float >::value_type >(val3); - arg3 = &temp3; - result = std_vector_Sl_float_Sg__insert__SWIG_0(arg1,arg2,(float const &)*arg3); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< float >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_insert__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - std::vector< float >::iterator arg2 ; - std::vector< float >::size_type arg3 ; - std::vector< float >::value_type *arg4 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - size_t val3 ; - int ecode3 = 0 ; - std::vector< float >::value_type temp4 ; - float val4 ; - int ecode4 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOOO:FloatVector_insert",&obj0,&obj1,&obj2,&obj3)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_insert" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "FloatVector_insert" "', argument " "2"" of type '" "std::vector< float >::iterator""'"); - } else { - swig::SwigPyIterator_T::iterator > *iter_t = dynamic_cast::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "FloatVector_insert" "', argument " "2"" of type '" "std::vector< float >::iterator""'"); - } - } - ecode3 = SWIG_AsVal_size_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "FloatVector_insert" "', argument " "3"" of type '" "std::vector< float >::size_type""'"); - } - arg3 = static_cast< std::vector< float >::size_type >(val3); - ecode4 = SWIG_AsVal_float(obj3, &val4); - if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "FloatVector_insert" "', argument " "4"" of type '" "std::vector< float >::value_type""'"); - } - temp4 = static_cast< std::vector< float >::value_type >(val4); - arg4 = 
&temp4; - std_vector_Sl_float_Sg__insert__SWIG_1(arg1,arg2,arg3,(float const &)*arg4); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_insert(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[5] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 4) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< float,std::allocator< float > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast::iterator > *>(iter) != 0)); - if (_v) { - { - int res = SWIG_AsVal_float(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_FloatVector_insert__SWIG_0(self, args); - } - } - } - } - if (argc == 4) { - int _v; - int res = swig::asptr(argv[0], (std::vector< float,std::allocator< float > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast::iterator > *>(iter) != 0)); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_float(argv[3], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_FloatVector_insert__SWIG_1(self, args); - } - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'FloatVector_insert'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< float >::insert(std::vector< float >::iterator,std::vector< float >::value_type const &)\n" - " std::vector< float >::insert(std::vector< float >::iterator,std::vector< float >::size_type,std::vector< float >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_reserve(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - std::vector< float >::size_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:FloatVector_reserve",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_reserve" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "FloatVector_reserve" "', argument " "2"" of type '" "std::vector< float >::size_type""'"); - } - arg2 = static_cast< std::vector< float >::size_type >(val2); - (arg1)->reserve(arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_FloatVector_capacity(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< float 
>::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:FloatVector_capacity",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "FloatVector_capacity" "', argument " "1"" of type '" "std::vector< float > const *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - result = ((std::vector< float > const *)arg1)->capacity(); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_delete_FloatVector(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< float > *arg1 = (std::vector< float > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:delete_FloatVector",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, SWIG_POINTER_DISOWN | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "delete_FloatVector" "', argument " "1"" of type '" "std::vector< float > *""'"); - } - arg1 = reinterpret_cast< std::vector< float > * >(argp1); - delete arg1; - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *FloatVector_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *obj; - if (!PyArg_ParseTuple(args,(char *)"O:swigregister", &obj)) return NULL; - SWIG_TypeNewClientData(SWIGTYPE_p_std__vectorT_float_std__allocatorT_float_t_t, SWIG_NewClientData(obj)); - return SWIG_Py_Void(); -} - -SWIGINTERN PyObject *_wrap_new_Pair__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::pair< float,std::vector< int > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)":new_Pair")) SWIG_fail; - result = (std::pair< float,std::vector< int > > *)new std::pair< float,std::vector< int > >(); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_Pair__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - float arg1 ; - std::vector< int,std::allocator< int > > arg2 ; - float val1 ; - int ecode1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::pair< float,std::vector< int > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:new_Pair",&obj0,&obj1)) SWIG_fail; - ecode1 = SWIG_AsVal_float(obj0, &val1); - if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_Pair" "', argument " "1"" of type '" "float""'"); - } - arg1 = static_cast< float >(val1); - { - std::vector< int,std::allocator< int > > *ptr = (std::vector< int,std::allocator< int > > *)0; - int res = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res) || !ptr) { - SWIG_exception_fail(SWIG_ArgError((ptr ? 
res : SWIG_TypeError)), "in method '" "new_Pair" "', argument " "2"" of type '" "std::vector< int,std::allocator< int > >""'"); - } - arg2 = *ptr; - if (SWIG_IsNewObj(res)) delete ptr; - } - result = (std::pair< float,std::vector< int > > *)new std::pair< float,std::vector< int > >(arg1,arg2); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_Pair__SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::pair< float,std::vector< int,std::allocator< int > > > *arg1 = 0 ; - int res1 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - std::pair< float,std::vector< int > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_Pair",&obj0)) SWIG_fail; - { - std::pair< float,std::vector< int,std::allocator< int > > > *ptr = (std::pair< float,std::vector< int,std::allocator< int > > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "new_Pair" "', argument " "1"" of type '" "std::pair< float,std::vector< int,std::allocator< int > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "new_Pair" "', argument " "1"" of type '" "std::pair< float,std::vector< int,std::allocator< int > > > const &""'"); - } - arg1 = ptr; - } - result = (std::pair< float,std::vector< int > > *)new std::pair< float,std::vector< int > >((std::pair< float,std::vector< int,std::allocator< int > > > const &)*arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t, SWIG_POINTER_NEW | 0 ); - if (SWIG_IsNewObj(res1)) delete arg1; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_Pair(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 0) { - return _wrap_new_Pair__SWIG_0(self, args); - } - if (argc == 1) { - int _v; - int res = swig::asptr(argv[0], (std::pair< float,std::vector< int,std::allocator< int > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_new_Pair__SWIG_2(self, args); - } - } - if (argc == 2) { - int _v; - { - int res = SWIG_AsVal_float(argv[0], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[1], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_new_Pair__SWIG_1(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'new_Pair'.\n" - " Possible C/C++ prototypes are:\n" - " std::pair< float,std::vector< int > >::pair()\n" - " std::pair< float,std::vector< int > >::pair(float,std::vector< int,std::allocator< int > >)\n" - " std::pair< float,std::vector< int > >::pair(std::pair< float,std::vector< int,std::allocator< int > > > const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_Pair_first_set(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::pair< float,std::vector< int > > *arg1 = (std::pair< float,std::vector< int > > *) 0 ; - float arg2 ; - void *argp1 = 0 ; - int res1 
= 0 ; - float val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:Pair_first_set",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "Pair_first_set" "', argument " "1"" of type '" "std::pair< float,std::vector< int > > *""'"); - } - arg1 = reinterpret_cast< std::pair< float,std::vector< int > > * >(argp1); - ecode2 = SWIG_AsVal_float(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "Pair_first_set" "', argument " "2"" of type '" "float""'"); - } - arg2 = static_cast< float >(val2); - if (arg1) (arg1)->first = arg2; - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_Pair_first_get(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::pair< float,std::vector< int > > *arg1 = (std::pair< float,std::vector< int > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - float result; - - if (!PyArg_ParseTuple(args,(char *)"O:Pair_first_get",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "Pair_first_get" "', argument " "1"" of type '" "std::pair< float,std::vector< int > > *""'"); - } - arg1 = reinterpret_cast< std::pair< float,std::vector< int > > * >(argp1); - result = (float) ((arg1)->first); - resultobj = SWIG_From_float(static_cast< float >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_Pair_second_set(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::pair< float,std::vector< int > > *arg1 = (std::pair< float,std::vector< int > > *) 0 ; - std::vector< int,std::allocator< int > > *arg2 = (std::vector< int,std::allocator< int > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:Pair_second_set",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "Pair_second_set" "', argument " "1"" of type '" "std::pair< float,std::vector< int > > *""'"); - } - arg1 = reinterpret_cast< std::pair< float,std::vector< int > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, &argp2,SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "Pair_second_set" "', argument " "2"" of type '" "std::vector< int,std::allocator< int > > *""'"); - } - arg2 = reinterpret_cast< std::vector< int,std::allocator< int > > * >(argp2); - if (arg1) (arg1)->second = *arg2; - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_Pair_second_get(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::pair< float,std::vector< int > > *arg1 = (std::pair< float,std::vector< int > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< int,std::allocator< int > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char 
*)"O:Pair_second_get",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "Pair_second_get" "', argument " "1"" of type '" "std::pair< float,std::vector< int > > *""'"); - } - arg1 = reinterpret_cast< std::pair< float,std::vector< int > > * >(argp1); - result = (std::vector< int,std::allocator< int > > *)& ((arg1)->second); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_int_std__allocatorT_int_t_t, 0 | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_delete_Pair(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::pair< float,std::vector< int > > *arg1 = (std::pair< float,std::vector< int > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:delete_Pair",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t, SWIG_POINTER_DISOWN | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "delete_Pair" "', argument " "1"" of type '" "std::pair< float,std::vector< int > > *""'"); - } - arg1 = reinterpret_cast< std::pair< float,std::vector< int > > * >(argp1); - delete arg1; - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *Pair_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *obj; - if (!PyArg_ParseTuple(args,(char *)"O:swigregister", &obj)) return NULL; - SWIG_TypeNewClientData(SWIGTYPE_p_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t, SWIG_NewClientData(obj)); - return SWIG_Py_Void(); -} - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector_iterator(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - PyObject **arg2 = (PyObject **) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - swig::SwigPyIterator *result = 0 ; - - arg2 = &obj0; - if (!PyArg_ParseTuple(args,(char *)"O:PairFloatVectorVector_iterator",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector_iterator" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - result = (swig::SwigPyIterator *)std_vector_Sl_std_pair_Sl_float_Sc_std_vector_Sl_int_Sg__Sg__Sg__iterator(arg1,arg2); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_swig__SwigPyIterator, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector___nonzero__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char 
*)"O:PairFloatVectorVector___nonzero__",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector___nonzero__" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - result = (bool)std_vector_Sl_std_pair_Sl_float_Sc_std_vector_Sl_int_Sg__Sg__Sg____nonzero__((std::vector< std::pair< float,std::vector< int > > > const *)arg1); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector___bool__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairFloatVectorVector___bool__",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector___bool__" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - result = (bool)std_vector_Sl_std_pair_Sl_float_Sc_std_vector_Sl_int_Sg__Sg__Sg____bool__((std::vector< std::pair< float,std::vector< int > > > const *)arg1); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector___len__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::pair< float,std::vector< int > > >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairFloatVectorVector___len__",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector___len__" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - result = std_vector_Sl_std_pair_Sl_float_Sc_std_vector_Sl_int_Sg__Sg__Sg____len__((std::vector< std::pair< float,std::vector< int > > > const *)arg1); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector___getslice__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = 
(std::vector< std::pair< float,std::vector< int > > > *) 0 ; - std::vector< std::pair< float,std::vector< int > > >::difference_type arg2 ; - std::vector< std::pair< float,std::vector< int > > >::difference_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:PairFloatVectorVector___getslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector___getslice__" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "PairFloatVectorVector___getslice__" "', argument " "2"" of type '" "std::vector< std::pair< float,std::vector< int > > >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::pair< float,std::vector< int > > >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "PairFloatVectorVector___getslice__" "', argument " "3"" of type '" "std::vector< std::pair< float,std::vector< int > > >::difference_type""'"); - } - arg3 = static_cast< std::vector< std::pair< float,std::vector< int > > >::difference_type >(val3); - try { - result = (std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > *)std_vector_Sl_std_pair_Sl_float_Sc_std_vector_Sl_int_Sg__Sg__Sg____getslice__(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector___setslice____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - std::vector< std::pair< float,std::vector< int > > >::difference_type arg2 ; - std::vector< std::pair< float,std::vector< int > > >::difference_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:PairFloatVectorVector___setslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector___setslice__" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "PairFloatVectorVector___setslice__" "', argument " "2"" of type '" "std::vector< std::pair< float,std::vector< int > > >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::pair< float,std::vector< int > > >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "PairFloatVectorVector___setslice__" "', argument " "3"" of type '" "std::vector< std::pair< float,std::vector< int > > >::difference_type""'"); - } - arg3 = static_cast< std::vector< std::pair< float,std::vector< int > > >::difference_type >(val3); - try { - std_vector_Sl_std_pair_Sl_float_Sc_std_vector_Sl_int_Sg__Sg__Sg____setslice____SWIG_0(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector___setslice____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - std::vector< std::pair< float,std::vector< int > > >::difference_type arg2 ; - std::vector< std::pair< float,std::vector< int > > >::difference_type arg3 ; - std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > *arg4 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - int res4 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOOO:PairFloatVectorVector___setslice__",&obj0,&obj1,&obj2,&obj3)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector___setslice__" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "PairFloatVectorVector___setslice__" "', argument " "2"" of type '" "std::vector< std::pair< float,std::vector< int > > >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::pair< float,std::vector< int > > >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if 
(!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "PairFloatVectorVector___setslice__" "', argument " "3"" of type '" "std::vector< std::pair< float,std::vector< int > > >::difference_type""'"); - } - arg3 = static_cast< std::vector< std::pair< float,std::vector< int > > >::difference_type >(val3); - { - std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > *ptr = (std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > *)0; - res4 = swig::asptr(obj3, &ptr); - if (!SWIG_IsOK(res4)) { - SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "PairFloatVectorVector___setslice__" "', argument " "4"" of type '" "std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "PairFloatVectorVector___setslice__" "', argument " "4"" of type '" "std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > const &""'"); - } - arg4 = ptr; - } - try { - std_vector_Sl_std_pair_Sl_float_Sc_std_vector_Sl_int_Sg__Sg__Sg____setslice____SWIG_1(arg1,arg2,arg3,(std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > const &)*arg4); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res4)) delete arg4; - return resultobj; -fail: - if (SWIG_IsNewObj(res4)) delete arg4; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector___setslice__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[5] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 4) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_PairFloatVectorVector___setslice____SWIG_0(self, args); - } - } - } - } - if (argc == 4) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[3], (std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > >**)(0)); - _v = 
SWIG_CheckState(res); - if (_v) { - return _wrap_PairFloatVectorVector___setslice____SWIG_1(self, args); - } - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'PairFloatVectorVector___setslice__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::pair< float,std::vector< int > > >::__setslice__(std::vector< std::pair< float,std::vector< int > > >::difference_type,std::vector< std::pair< float,std::vector< int > > >::difference_type)\n" - " std::vector< std::pair< float,std::vector< int > > >::__setslice__(std::vector< std::pair< float,std::vector< int > > >::difference_type,std::vector< std::pair< float,std::vector< int > > >::difference_type,std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector___delslice__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - std::vector< std::pair< float,std::vector< int > > >::difference_type arg2 ; - std::vector< std::pair< float,std::vector< int > > >::difference_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:PairFloatVectorVector___delslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector___delslice__" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "PairFloatVectorVector___delslice__" "', argument " "2"" of type '" "std::vector< std::pair< float,std::vector< int > > >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::pair< float,std::vector< int > > >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "PairFloatVectorVector___delslice__" "', argument " "3"" of type '" "std::vector< std::pair< float,std::vector< int > > >::difference_type""'"); - } - arg3 = static_cast< std::vector< std::pair< float,std::vector< int > > >::difference_type >(val3); - try { - std_vector_Sl_std_pair_Sl_float_Sc_std_vector_Sl_int_Sg__Sg__Sg____delslice__(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector___delitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< 
float,std::vector< int > > > *) 0 ; - std::vector< std::pair< float,std::vector< int > > >::difference_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:PairFloatVectorVector___delitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector___delitem__" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "PairFloatVectorVector___delitem__" "', argument " "2"" of type '" "std::vector< std::pair< float,std::vector< int > > >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::pair< float,std::vector< int > > >::difference_type >(val2); - try { - std_vector_Sl_std_pair_Sl_float_Sc_std_vector_Sl_int_Sg__Sg__Sg____delitem____SWIG_0(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector___getitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:PairFloatVectorVector___getitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector___getitem__" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "PairFloatVectorVector___getitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - result = (std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > *)std_vector_Sl_std_pair_Sl_float_Sc_std_vector_Sl_int_Sg__Sg__Sg____getitem____SWIG_0(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = 
SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector___setitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:PairFloatVectorVector___setitem__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector___setitem__" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "PairFloatVectorVector___setitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - { - std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > *ptr = (std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "PairFloatVectorVector___setitem__" "', argument " "3"" of type '" "std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "PairFloatVectorVector___setitem__" "', argument " "3"" of type '" "std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > const &""'"); - } - arg3 = ptr; - } - try { - std_vector_Sl_std_pair_Sl_float_Sc_std_vector_Sl_int_Sg__Sg__Sg____setitem____SWIG_0(arg1,arg2,(std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > const &)*arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector___setitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject 
*resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:PairFloatVectorVector___setitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector___setitem__" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "PairFloatVectorVector___setitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - std_vector_Sl_std_pair_Sl_float_Sc_std_vector_Sl_int_Sg__Sg__Sg____setitem____SWIG_1(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector___delitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:PairFloatVectorVector___delitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector___delitem__" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "PairFloatVectorVector___delitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - std_vector_Sl_std_pair_Sl_float_Sc_std_vector_Sl_int_Sg__Sg__Sg____delitem____SWIG_1(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector___delitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< 
std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_PairFloatVectorVector___delitem____SWIG_1(self, args); - } - } - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_PairFloatVectorVector___delitem____SWIG_0(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'PairFloatVectorVector___delitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::pair< float,std::vector< int > > >::__delitem__(std::vector< std::pair< float,std::vector< int > > >::difference_type)\n" - " std::vector< std::pair< float,std::vector< int > > >::__delitem__(PySliceObject *)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector___getitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - std::vector< std::pair< float,std::vector< int > > >::difference_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< std::pair< float,std::vector< int > > >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:PairFloatVectorVector___getitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector___getitem__" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "PairFloatVectorVector___getitem__" "', argument " "2"" of type '" "std::vector< std::pair< float,std::vector< int > > >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::pair< float,std::vector< int > > >::difference_type >(val2); - try { - result = (std::vector< std::pair< float,std::vector< int > > >::value_type *) &std_vector_Sl_std_pair_Sl_float_Sc_std_vector_Sl_int_Sg__Sg__Sg____getitem____SWIG_1((std::vector< std::pair< float,std::vector< int > > > const *)arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = swig::from(static_cast< std::pair< float,std::vector< int,std::allocator< int > > > >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector___getitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < 
argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_PairFloatVectorVector___getitem____SWIG_0(self, args); - } - } - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_PairFloatVectorVector___getitem____SWIG_1(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'PairFloatVectorVector___getitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::pair< float,std::vector< int > > >::__getitem__(PySliceObject *)\n" - " std::vector< std::pair< float,std::vector< int > > >::__getitem__(std::vector< std::pair< float,std::vector< int > > >::difference_type) const\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector___setitem____SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - std::vector< std::pair< float,std::vector< int > > >::difference_type arg2 ; - std::vector< std::pair< float,std::vector< int > > >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:PairFloatVectorVector___setitem__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector___setitem__" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "PairFloatVectorVector___setitem__" "', argument " "2"" of type '" "std::vector< std::pair< float,std::vector< int > > >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::pair< float,std::vector< int > > >::difference_type >(val2); - { - std::pair< float,std::vector< int,std::allocator< int > > > *ptr = (std::pair< float,std::vector< int,std::allocator< int > > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "PairFloatVectorVector___setitem__" "', argument " "3"" of type '" "std::vector< std::pair< float,std::vector< int > > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "PairFloatVectorVector___setitem__" "', argument " "3"" of type '" "std::vector< 
std::pair< float,std::vector< int > > >::value_type const &""'"); - } - arg3 = ptr; - } - try { - std_vector_Sl_std_pair_Sl_float_Sc_std_vector_Sl_int_Sg__Sg__Sg____setitem____SWIG_2(arg1,arg2,(std::pair< float,std::vector< int,std::allocator< int > > > const &)*arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector___setitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_PairFloatVectorVector___setitem____SWIG_1(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - int res = swig::asptr(argv[2], (std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_PairFloatVectorVector___setitem____SWIG_0(self, args); - } - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[2], (std::pair< float,std::vector< int,std::allocator< int > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_PairFloatVectorVector___setitem____SWIG_2(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'PairFloatVectorVector___setitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::pair< float,std::vector< int > > >::__setitem__(PySliceObject *,std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > const &)\n" - " std::vector< std::pair< float,std::vector< int > > >::__setitem__(PySliceObject *)\n" - " std::vector< std::pair< float,std::vector< int > > >::__setitem__(std::vector< std::pair< float,std::vector< int > > >::difference_type,std::vector< std::pair< float,std::vector< int > > >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector_pop(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - 
PyObject * obj0 = 0 ; - std::vector< std::pair< float,std::vector< int > > >::value_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairFloatVectorVector_pop",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector_pop" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - try { - result = std_vector_Sl_std_pair_Sl_float_Sc_std_vector_Sl_int_Sg__Sg__Sg__pop(arg1); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = swig::from(static_cast< std::pair< float,std::vector< int,std::allocator< int > > > >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector_append(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - std::vector< std::pair< float,std::vector< int > > >::value_type *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res2 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:PairFloatVectorVector_append",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector_append" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - { - std::pair< float,std::vector< int,std::allocator< int > > > *ptr = (std::pair< float,std::vector< int,std::allocator< int > > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "PairFloatVectorVector_append" "', argument " "2"" of type '" "std::vector< std::pair< float,std::vector< int > > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "PairFloatVectorVector_append" "', argument " "2"" of type '" "std::vector< std::pair< float,std::vector< int > > >::value_type const &""'"); - } - arg2 = ptr; - } - std_vector_Sl_std_pair_Sl_float_Sc_std_vector_Sl_int_Sg__Sg__Sg__append(arg1,(std::pair< float,std::vector< int,std::allocator< int > > > const &)*arg2); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_PairFloatVectorVector__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)":new_PairFloatVectorVector")) SWIG_fail; - result = (std::vector< std::pair< float,std::vector< int > > > *)new std::vector< std::pair< float,std::vector< int > > >(); - resultobj = 
SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_PairFloatVectorVector__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int,std::allocator< int > > > > *arg1 = 0 ; - int res1 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - std::vector< std::pair< float,std::vector< int > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_PairFloatVectorVector",&obj0)) SWIG_fail; - { - std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > *ptr = (std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "new_PairFloatVectorVector" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int,std::allocator< int > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "new_PairFloatVectorVector" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int,std::allocator< int > > > > const &""'"); - } - arg1 = ptr; - } - result = (std::vector< std::pair< float,std::vector< int > > > *)new std::vector< std::pair< float,std::vector< int > > >((std::vector< std::pair< float,std::vector< int,std::allocator< int > > > > const &)*arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, SWIG_POINTER_NEW | 0 ); - if (SWIG_IsNewObj(res1)) delete arg1; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector_empty(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairFloatVectorVector_empty",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector_empty" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - result = (bool)((std::vector< std::pair< float,std::vector< int > > > const *)arg1)->empty(); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector_size(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< 
int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::pair< float,std::vector< int > > >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairFloatVectorVector_size",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector_size" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - result = ((std::vector< std::pair< float,std::vector< int > > > const *)arg1)->size(); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector_swap(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - std::vector< std::pair< float,std::vector< int,std::allocator< int > > > > *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:PairFloatVectorVector_swap",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector_swap" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, &argp2, SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 ); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "PairFloatVectorVector_swap" "', argument " "2"" of type '" "std::vector< std::pair< float,std::vector< int,std::allocator< int > > > > &""'"); - } - if (!argp2) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "PairFloatVectorVector_swap" "', argument " "2"" of type '" "std::vector< std::pair< float,std::vector< int,std::allocator< int > > > > &""'"); - } - arg2 = reinterpret_cast< std::vector< std::pair< float,std::vector< int,std::allocator< int > > > > * >(argp2); - (arg1)->swap(*arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector_begin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::pair< float,std::vector< int > > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairFloatVectorVector_begin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector_begin" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - result = (arg1)->begin(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::pair< float,std::vector< int > > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector_end(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::pair< float,std::vector< int > > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairFloatVectorVector_end",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector_end" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - result = (arg1)->end(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::pair< float,std::vector< int > > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector_rbegin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::pair< float,std::vector< int > > >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairFloatVectorVector_rbegin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector_rbegin" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - result = (arg1)->rbegin(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::pair< float,std::vector< int > > >::reverse_iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector_rend(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = 
(std::vector< std::pair< float,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::pair< float,std::vector< int > > >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairFloatVectorVector_rend",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector_rend" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - result = (arg1)->rend(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::pair< float,std::vector< int > > >::reverse_iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector_clear(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:PairFloatVectorVector_clear",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector_clear" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - (arg1)->clear(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector_get_allocator(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - SwigValueWrapper< std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairFloatVectorVector_get_allocator",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector_get_allocator" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - result = ((std::vector< std::pair< float,std::vector< int > > > const *)arg1)->get_allocator(); - resultobj = SWIG_NewPointerObj((new std::vector< std::pair< float,std::vector< int > > >::allocator_type(static_cast< const std::vector< std::pair< float,std::vector< int > > >::allocator_type& >(result))), 
SWIGTYPE_p_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_PairFloatVectorVector__SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > >::size_type arg1 ; - size_t val1 ; - int ecode1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::pair< float,std::vector< int > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_PairFloatVectorVector",&obj0)) SWIG_fail; - ecode1 = SWIG_AsVal_size_t(obj0, &val1); - if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_PairFloatVectorVector" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > >::size_type""'"); - } - arg1 = static_cast< std::vector< std::pair< float,std::vector< int > > >::size_type >(val1); - result = (std::vector< std::pair< float,std::vector< int > > > *)new std::vector< std::pair< float,std::vector< int > > >(arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector_pop_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:PairFloatVectorVector_pop_back",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector_pop_back" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - (arg1)->pop_back(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector_resize__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - std::vector< std::pair< float,std::vector< int > > >::size_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:PairFloatVectorVector_resize",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector_resize" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - 
SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "PairFloatVectorVector_resize" "', argument " "2"" of type '" "std::vector< std::pair< float,std::vector< int > > >::size_type""'"); - } - arg2 = static_cast< std::vector< std::pair< float,std::vector< int > > >::size_type >(val2); - (arg1)->resize(arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector_erase__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - std::vector< std::pair< float,std::vector< int > > >::iterator arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< std::pair< float,std::vector< int > > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OO:PairFloatVectorVector_erase",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairFloatVectorVector_erase" "', argument " "1"" of type '" "std::vector< std::pair< float,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< float,std::vector< int > > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "PairFloatVectorVector_erase" "', argument " "2"" of type '" "std::vector< std::pair< float,std::vector< int > > >::iterator""'"); - } else { - swig::SwigPyIterator_T > >::iterator > *iter_t = dynamic_cast > >::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "PairFloatVectorVector_erase" "', argument " "2"" of type '" "std::vector< std::pair< float,std::vector< int > > >::iterator""'"); - } - } - result = std_vector_Sl_std_pair_Sl_float_Sc_std_vector_Sl_int_Sg__Sg__Sg__erase__SWIG_0(arg1,arg2); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::pair< float,std::vector< int > > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairFloatVectorVector_erase__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< float,std::vector< int > > > *arg1 = (std::vector< std::pair< float,std::vector< int > > > *) 0 ; - std::vector< std::pair< float,std::vector< int > > >::iterator arg2 ; - std::vector< std::pair< float,std::vector< int > > >::iterator arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - swig::SwigPyIterator *iter3 = 0 ; - int res3 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< std::pair< float,std::vector< int > > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OOO:PairFloatVectorVector_erase",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
[... auto-generated code omitted: this portion of the diff deletes SWIG-generated CPython wrapper functions for the STL container bindings PairFloatVectorVector and PairDoubleVectorVector (std::vector<std::pair<float,std::vector<int>>> and std::vector<std::pair<double,std::vector<int>>>) — erase, insert, push_back, front, back, assign, resize, reserve, capacity, slicing and item accessors, plus their overload dispatchers. The wrapper is machine-generated by SWIG and contains no hand-written logic, so the deleted lines are not reproduced here. ...]
SWIGTYPE_p_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t, 0 | 0); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "PairDoubleVectorVector___setitem__" "', argument " "3"" of type '" "std::vector< std::pair< double,std::vector< int > > >::value_type const &""'"); - } - if (!argp3) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "PairDoubleVectorVector___setitem__" "', argument " "3"" of type '" "std::vector< std::pair< double,std::vector< int > > >::value_type const &""'"); - } - arg3 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > >::value_type * >(argp3); - try { - std_vector_Sl_std_pair_Sl_double_Sc_std_vector_Sl_int_Sg__Sg__Sg____setitem____SWIG_2(arg1,arg2,(std::pair< double,std::vector< int,std::allocator< int > > > const &)*arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector___setitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_PairDoubleVectorVector___setitem____SWIG_1(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - int res = swig::asptr(argv[2], (std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_PairDoubleVectorVector___setitem____SWIG_0(self, args); - } - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = SWIG_ConvertPtr(argv[2], 0, SWIGTYPE_p_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t, 0); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_PairDoubleVectorVector___setitem____SWIG_2(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'PairDoubleVectorVector___setitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::pair< double,std::vector< int > > >::__setitem__(PySliceObject *,std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > const &)\n" - " std::vector< std::pair< double,std::vector< int > > >::__setitem__(PySliceObject *)\n" - 
" std::vector< std::pair< double,std::vector< int > > >::__setitem__(std::vector< std::pair< double,std::vector< int > > >::difference_type,std::vector< std::pair< double,std::vector< int > > >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_pop(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > > *arg1 = (std::vector< std::pair< double,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - SwigValueWrapper< std::pair< double,std::vector< int,std::allocator< int > > > > result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector_pop",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector_pop" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > > * >(argp1); - try { - result = std_vector_Sl_std_pair_Sl_double_Sc_std_vector_Sl_int_Sg__Sg__Sg__pop(arg1); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = SWIG_NewPointerObj((new std::vector< std::pair< double,std::vector< int > > >::value_type(static_cast< const std::vector< std::pair< double,std::vector< int > > >::value_type& >(result))), SWIGTYPE_p_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_append(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > > *arg1 = (std::vector< std::pair< double,std::vector< int > > > *) 0 ; - std::vector< std::pair< double,std::vector< int > > >::value_type *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:PairDoubleVectorVector_append",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector_append" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, &argp2, SWIGTYPE_p_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t, 0 | 0); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "PairDoubleVectorVector_append" "', argument " "2"" of type '" "std::vector< std::pair< double,std::vector< int > > >::value_type const &""'"); - } - if (!argp2) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "PairDoubleVectorVector_append" "', argument " "2"" of type '" "std::vector< std::pair< double,std::vector< int > > >::value_type const &""'"); - } - arg2 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > 
>::value_type * >(argp2); - std_vector_Sl_std_pair_Sl_double_Sc_std_vector_Sl_int_Sg__Sg__Sg__append(arg1,(std::pair< double,std::vector< int,std::allocator< int > > > const &)*arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_PairDoubleVectorVector__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)":new_PairDoubleVectorVector")) SWIG_fail; - result = (std::vector< std::pair< double,std::vector< int > > > *)new std::vector< std::pair< double,std::vector< int > > >(); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_PairDoubleVectorVector__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int,std::allocator< int > > > > *arg1 = 0 ; - int res1 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - std::vector< std::pair< double,std::vector< int > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_PairDoubleVectorVector",&obj0)) SWIG_fail; - { - std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > *ptr = (std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "new_PairDoubleVectorVector" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int,std::allocator< int > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "new_PairDoubleVectorVector" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int,std::allocator< int > > > > const &""'"); - } - arg1 = ptr; - } - result = (std::vector< std::pair< double,std::vector< int > > > *)new std::vector< std::pair< double,std::vector< int > > >((std::vector< std::pair< double,std::vector< int,std::allocator< int > > > > const &)*arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, SWIG_POINTER_NEW | 0 ); - if (SWIG_IsNewObj(res1)) delete arg1; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_empty(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > > *arg1 = (std::vector< std::pair< double,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector_empty",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - 
SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector_empty" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > > * >(argp1); - result = (bool)((std::vector< std::pair< double,std::vector< int > > > const *)arg1)->empty(); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_size(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > > *arg1 = (std::vector< std::pair< double,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::pair< double,std::vector< int > > >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector_size",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector_size" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > > * >(argp1); - result = ((std::vector< std::pair< double,std::vector< int > > > const *)arg1)->size(); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_swap(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > > *arg1 = (std::vector< std::pair< double,std::vector< int > > > *) 0 ; - std::vector< std::pair< double,std::vector< int,std::allocator< int > > > > *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:PairDoubleVectorVector_swap",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector_swap" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, &argp2, SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 ); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "PairDoubleVectorVector_swap" "', argument " "2"" of type '" "std::vector< std::pair< double,std::vector< int,std::allocator< int > > > > &""'"); - } - if (!argp2) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "PairDoubleVectorVector_swap" "', argument " "2"" of type '" "std::vector< std::pair< double,std::vector< int,std::allocator< int > > > > &""'"); - } - arg2 = reinterpret_cast< std::vector< std::pair< double,std::vector< 
int,std::allocator< int > > > > * >(argp2); - (arg1)->swap(*arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_begin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > > *arg1 = (std::vector< std::pair< double,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::pair< double,std::vector< int > > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector_begin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector_begin" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > > * >(argp1); - result = (arg1)->begin(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::pair< double,std::vector< int > > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_end(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > > *arg1 = (std::vector< std::pair< double,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::pair< double,std::vector< int > > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector_end",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector_end" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > > * >(argp1); - result = (arg1)->end(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::pair< double,std::vector< int > > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_rbegin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > > *arg1 = (std::vector< std::pair< double,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::pair< double,std::vector< int > > >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector_rbegin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector_rbegin" 
"', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > > * >(argp1); - result = (arg1)->rbegin(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::pair< double,std::vector< int > > >::reverse_iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_rend(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > > *arg1 = (std::vector< std::pair< double,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::pair< double,std::vector< int > > >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector_rend",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector_rend" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > > * >(argp1); - result = (arg1)->rend(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::pair< double,std::vector< int > > >::reverse_iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_clear(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > > *arg1 = (std::vector< std::pair< double,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector_clear",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector_clear" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > > * >(argp1); - (arg1)->clear(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_get_allocator(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > > *arg1 = (std::vector< std::pair< double,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - SwigValueWrapper< std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector_get_allocator",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector_get_allocator" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > > * >(argp1); - result = ((std::vector< std::pair< double,std::vector< int > > > const *)arg1)->get_allocator(); - resultobj = SWIG_NewPointerObj((new std::vector< std::pair< double,std::vector< int > > >::allocator_type(static_cast< const std::vector< std::pair< double,std::vector< int > > >::allocator_type& >(result))), SWIGTYPE_p_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_PairDoubleVectorVector__SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > >::size_type arg1 ; - size_t val1 ; - int ecode1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::pair< double,std::vector< int > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_PairDoubleVectorVector",&obj0)) SWIG_fail; - ecode1 = SWIG_AsVal_size_t(obj0, &val1); - if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_PairDoubleVectorVector" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > >::size_type""'"); - } - arg1 = static_cast< std::vector< std::pair< double,std::vector< int > > >::size_type >(val1); - result = (std::vector< std::pair< double,std::vector< int > > > *)new std::vector< std::pair< double,std::vector< int > > >(arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_pop_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > > *arg1 = (std::vector< std::pair< double,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector_pop_back",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector_pop_back" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > > * >(argp1); - (arg1)->pop_back(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_resize__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > > *arg1 = (std::vector< std::pair< double,std::vector< int > > > *) 0 ; - std::vector< std::pair< 
double,std::vector< int > > >::size_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:PairDoubleVectorVector_resize",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector_resize" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "PairDoubleVectorVector_resize" "', argument " "2"" of type '" "std::vector< std::pair< double,std::vector< int > > >::size_type""'"); - } - arg2 = static_cast< std::vector< std::pair< double,std::vector< int > > >::size_type >(val2); - (arg1)->resize(arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_erase__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > > *arg1 = (std::vector< std::pair< double,std::vector< int > > > *) 0 ; - std::vector< std::pair< double,std::vector< int > > >::iterator arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< std::pair< double,std::vector< int > > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OO:PairDoubleVectorVector_erase",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector_erase" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "PairDoubleVectorVector_erase" "', argument " "2"" of type '" "std::vector< std::pair< double,std::vector< int > > >::iterator""'"); - } else { - swig::SwigPyIterator_T > >::iterator > *iter_t = dynamic_cast > >::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "PairDoubleVectorVector_erase" "', argument " "2"" of type '" "std::vector< std::pair< double,std::vector< int > > >::iterator""'"); - } - } - result = std_vector_Sl_std_pair_Sl_double_Sc_std_vector_Sl_int_Sg__Sg__Sg__erase__SWIG_0(arg1,arg2); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::pair< double,std::vector< int > > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject 
*_wrap_PairDoubleVectorVector_erase__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > > *arg1 = (std::vector< std::pair< double,std::vector< int > > > *) 0 ; - std::vector< std::pair< double,std::vector< int > > >::iterator arg2 ; - std::vector< std::pair< double,std::vector< int > > >::iterator arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - swig::SwigPyIterator *iter3 = 0 ; - int res3 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< std::pair< double,std::vector< int > > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OOO:PairDoubleVectorVector_erase",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector_erase" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "PairDoubleVectorVector_erase" "', argument " "2"" of type '" "std::vector< std::pair< double,std::vector< int > > >::iterator""'"); - } else { - swig::SwigPyIterator_T > >::iterator > *iter_t = dynamic_cast > >::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "PairDoubleVectorVector_erase" "', argument " "2"" of type '" "std::vector< std::pair< double,std::vector< int > > >::iterator""'"); - } - } - res3 = SWIG_ConvertPtr(obj2, SWIG_as_voidptrptr(&iter3), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res3) || !iter3) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "PairDoubleVectorVector_erase" "', argument " "3"" of type '" "std::vector< std::pair< double,std::vector< int > > >::iterator""'"); - } else { - swig::SwigPyIterator_T > >::iterator > *iter_t = dynamic_cast > >::iterator > *>(iter3); - if (iter_t) { - arg3 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "PairDoubleVectorVector_erase" "', argument " "3"" of type '" "std::vector< std::pair< double,std::vector< int > > >::iterator""'"); - } - } - result = std_vector_Sl_std_pair_Sl_double_Sc_std_vector_Sl_int_Sg__Sg__Sg__erase__SWIG_1(arg1,arg2,arg3); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::pair< double,std::vector< int > > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_erase(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< 
std::pair< double,std::vector< int,std::allocator< int > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast > >::iterator > *>(iter) != 0)); - if (_v) { - return _wrap_PairDoubleVectorVector_erase__SWIG_0(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast > >::iterator > *>(iter) != 0)); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[2], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast > >::iterator > *>(iter) != 0)); - if (_v) { - return _wrap_PairDoubleVectorVector_erase__SWIG_1(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'PairDoubleVectorVector_erase'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::pair< double,std::vector< int > > >::erase(std::vector< std::pair< double,std::vector< int > > >::iterator)\n" - " std::vector< std::pair< double,std::vector< int > > >::erase(std::vector< std::pair< double,std::vector< int > > >::iterator,std::vector< std::pair< double,std::vector< int > > >::iterator)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_new_PairDoubleVectorVector__SWIG_3(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > >::size_type arg1 ; - std::vector< std::pair< double,std::vector< int > > >::value_type *arg2 = 0 ; - size_t val1 ; - int ecode1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< std::pair< double,std::vector< int > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:new_PairDoubleVectorVector",&obj0,&obj1)) SWIG_fail; - ecode1 = SWIG_AsVal_size_t(obj0, &val1); - if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_PairDoubleVectorVector" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > >::size_type""'"); - } - arg1 = static_cast< std::vector< std::pair< double,std::vector< int > > >::size_type >(val1); - res2 = SWIG_ConvertPtr(obj1, &argp2, SWIGTYPE_p_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t, 0 | 0); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "new_PairDoubleVectorVector" "', argument " "2"" of type '" "std::vector< std::pair< double,std::vector< int > > >::value_type const &""'"); - } - if (!argp2) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "new_PairDoubleVectorVector" "', argument " "2"" of type '" "std::vector< std::pair< double,std::vector< int > > >::value_type const &""'"); - } - arg2 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > >::value_type * >(argp2); - result = (std::vector< std::pair< double,std::vector< int > > > *)new std::vector< std::pair< double,std::vector< int > > >(arg1,(std::vector< 
std::pair< double,std::vector< int > > >::value_type const &)*arg2); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_PairDoubleVectorVector(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 0) { - return _wrap_new_PairDoubleVectorVector__SWIG_0(self, args); - } - if (argc == 1) { - int _v; - { - int res = SWIG_AsVal_size_t(argv[0], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_new_PairDoubleVectorVector__SWIG_2(self, args); - } - } - if (argc == 1) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_new_PairDoubleVectorVector__SWIG_1(self, args); - } - } - if (argc == 2) { - int _v; - { - int res = SWIG_AsVal_size_t(argv[0], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = SWIG_ConvertPtr(argv[1], 0, SWIGTYPE_p_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t, 0); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_new_PairDoubleVectorVector__SWIG_3(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'new_PairDoubleVectorVector'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::pair< double,std::vector< int > > >::vector()\n" - " std::vector< std::pair< double,std::vector< int > > >::vector(std::vector< std::pair< double,std::vector< int,std::allocator< int > > > > const &)\n" - " std::vector< std::pair< double,std::vector< int > > >::vector(std::vector< std::pair< double,std::vector< int > > >::size_type)\n" - " std::vector< std::pair< double,std::vector< int > > >::vector(std::vector< std::pair< double,std::vector< int > > >::size_type,std::vector< std::pair< double,std::vector< int > > >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_push_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > > *arg1 = (std::vector< std::pair< double,std::vector< int > > > *) 0 ; - std::vector< std::pair< double,std::vector< int > > >::value_type *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:PairDoubleVectorVector_push_back",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector_push_back" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > > * >(argp1); - res2 = 
SWIG_ConvertPtr(obj1, &argp2, SWIGTYPE_p_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t, 0 | 0); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "PairDoubleVectorVector_push_back" "', argument " "2"" of type '" "std::vector< std::pair< double,std::vector< int > > >::value_type const &""'"); - } - if (!argp2) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "PairDoubleVectorVector_push_back" "', argument " "2"" of type '" "std::vector< std::pair< double,std::vector< int > > >::value_type const &""'"); - } - arg2 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > >::value_type * >(argp2); - (arg1)->push_back((std::vector< std::pair< double,std::vector< int > > >::value_type const &)*arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_front(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > > *arg1 = (std::vector< std::pair< double,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::pair< double,std::vector< int > > >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector_front",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector_front" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > > * >(argp1); - result = (std::vector< std::pair< double,std::vector< int > > >::value_type *) &((std::vector< std::pair< double,std::vector< int > > > const *)arg1)->front(); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t, 0 | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > > *arg1 = (std::vector< std::pair< double,std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::pair< double,std::vector< int > > >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector_back",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector_back" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > > * >(argp1); - result = (std::vector< std::pair< double,std::vector< int > > >::value_type *) &((std::vector< std::pair< double,std::vector< int > > > const *)arg1)->back(); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), 
SWIGTYPE_p_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t, 0 | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_assign(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > > *arg1 = (std::vector< std::pair< double,std::vector< int > > > *) 0 ; - std::vector< std::pair< double,std::vector< int > > >::size_type arg2 ; - std::vector< std::pair< double,std::vector< int > > >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - void *argp3 = 0 ; - int res3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:PairDoubleVectorVector_assign",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector_assign" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "PairDoubleVectorVector_assign" "', argument " "2"" of type '" "std::vector< std::pair< double,std::vector< int > > >::size_type""'"); - } - arg2 = static_cast< std::vector< std::pair< double,std::vector< int > > >::size_type >(val2); - res3 = SWIG_ConvertPtr(obj2, &argp3, SWIGTYPE_p_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t, 0 | 0); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "PairDoubleVectorVector_assign" "', argument " "3"" of type '" "std::vector< std::pair< double,std::vector< int > > >::value_type const &""'"); - } - if (!argp3) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "PairDoubleVectorVector_assign" "', argument " "3"" of type '" "std::vector< std::pair< double,std::vector< int > > >::value_type const &""'"); - } - arg3 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > >::value_type * >(argp3); - (arg1)->assign(arg2,(std::vector< std::pair< double,std::vector< int > > >::value_type const &)*arg3); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_resize__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::pair< double,std::vector< int > > > *arg1 = (std::vector< std::pair< double,std::vector< int > > > *) 0 ; - std::vector< std::pair< double,std::vector< int > > >::size_type arg2 ; - std::vector< std::pair< double,std::vector< int > > >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - void *argp3 = 0 ; - int res3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:PairDoubleVectorVector_resize",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - 
SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector_resize" "', argument " "1"" of type '" "std::vector< std::pair< double,std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "PairDoubleVectorVector_resize" "', argument " "2"" of type '" "std::vector< std::pair< double,std::vector< int > > >::size_type""'"); - } - arg2 = static_cast< std::vector< std::pair< double,std::vector< int > > >::size_type >(val2); - res3 = SWIG_ConvertPtr(obj2, &argp3, SWIGTYPE_p_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t, 0 | 0); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "PairDoubleVectorVector_resize" "', argument " "3"" of type '" "std::vector< std::pair< double,std::vector< int > > >::value_type const &""'"); - } - if (!argp3) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "PairDoubleVectorVector_resize" "', argument " "3"" of type '" "std::vector< std::pair< double,std::vector< int > > >::value_type const &""'"); - } - arg3 = reinterpret_cast< std::vector< std::pair< double,std::vector< int > > >::value_type * >(argp3); - (arg1)->resize(arg2,(std::vector< std::pair< double,std::vector< int > > >::value_type const &)*arg3); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_resize(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_PairDoubleVectorVector_resize__SWIG_0(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = SWIG_ConvertPtr(argv[2], 0, SWIGTYPE_p_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t, 0); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_PairDoubleVectorVector_resize__SWIG_1(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'PairDoubleVectorVector_resize'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::pair< double,std::vector< int > > >::resize(std::vector< std::pair< double,std::vector< int > > >::size_type)\n" - " std::vector< std::pair< double,std::vector< int > > >::resize(std::vector< std::pair< double,std::vector< int > > >::size_type,std::vector< std::pair< double,std::vector< int > > >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector_insert__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject 
[... remainder of the deleted SWIG-generated wrapper source: the rest of the `PairDoubleVectorVector` (std::vector< std::pair< double, std::vector< int > > >) bindings — the insert overloads and their dispatch, reserve, capacity, the destructor, and swigregister — followed by the `PairDoubleVectorVector2` (std::vector of the above) bindings: iterator, __nonzero__, __bool__, __len__, __getslice__, __setslice__, __delslice__, __delitem__, __getitem__, __setitem__, pop, append, and the constructors. All of this is auto-generated container boilerplate removed along with the file. ...]
SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_PairDoubleVectorVector2__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > *arg1 = 0 ; - int res1 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::pair< double,std::vector< int > > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_PairDoubleVectorVector2",&obj0)) SWIG_fail; - { - std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > > *ptr = (std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "new_PairDoubleVectorVector2" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "new_PairDoubleVectorVector2" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > const &""'"); - } - arg1 = ptr; - } - result = (std::vector< std::vector< std::pair< double,std::vector< int > > > > *)new std::vector< std::vector< std::pair< double,std::vector< int > > > >((std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > const &)*arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, SWIG_POINTER_NEW | 0 ); - if (SWIG_IsNewObj(res1)) delete arg1; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector2_empty(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::pair< double,std::vector< int > > > > *arg1 = (std::vector< 
std::vector< std::pair< double,std::vector< int > > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector2_empty",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector2_empty" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > > * >(argp1); - result = (bool)((std::vector< std::vector< std::pair< double,std::vector< int > > > > const *)arg1)->empty(); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector2_size(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::pair< double,std::vector< int > > > > *arg1 = (std::vector< std::vector< std::pair< double,std::vector< int > > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::pair< double,std::vector< int > > > >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector2_size",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector2_size" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > > * >(argp1); - result = ((std::vector< std::vector< std::pair< double,std::vector< int > > > > const *)arg1)->size(); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector2_swap(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::pair< double,std::vector< int > > > > *arg1 = (std::vector< std::vector< std::pair< double,std::vector< int > > > > *) 0 ; - std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:PairDoubleVectorVector2_swap",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector2_swap" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, &argp2, SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, 0 ); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "PairDoubleVectorVector2_swap" "', argument " "2"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > &""'"); - } - if (!argp2) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "PairDoubleVectorVector2_swap" "', argument " "2"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > &""'"); - } - arg2 = reinterpret_cast< std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > * >(argp2); - (arg1)->swap(*arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector2_begin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::pair< double,std::vector< int > > > > *arg1 = (std::vector< std::vector< std::pair< double,std::vector< int > > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector2_begin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector2_begin" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > > * >(argp1); - result = (arg1)->begin(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator & >(result)), 
- swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector2_end(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::pair< double,std::vector< int > > > > *arg1 = (std::vector< std::vector< std::pair< double,std::vector< int > > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector2_end",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector2_end" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > > * >(argp1); - result = (arg1)->end(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector2_rbegin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::pair< double,std::vector< int > > > > *arg1 = (std::vector< std::vector< std::pair< double,std::vector< int > > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::pair< double,std::vector< int > > > >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector2_rbegin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector2_rbegin" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > > * >(argp1); - result = (arg1)->rbegin(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< std::pair< double,std::vector< int > > > >::reverse_iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector2_rend(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::pair< double,std::vector< int > > > > *arg1 = (std::vector< std::vector< std::pair< double,std::vector< int > > > > *) 
0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::pair< double,std::vector< int > > > >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector2_rend",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector2_rend" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > > * >(argp1); - result = (arg1)->rend(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< std::pair< double,std::vector< int > > > >::reverse_iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector2_clear(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::pair< double,std::vector< int > > > > *arg1 = (std::vector< std::vector< std::pair< double,std::vector< int > > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector2_clear",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector2_clear" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > > * >(argp1); - (arg1)->clear(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector2_get_allocator(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::pair< double,std::vector< int > > > > *arg1 = (std::vector< std::vector< std::pair< double,std::vector< int > > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - SwigValueWrapper< std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > > result; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector2_get_allocator",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector2_get_allocator" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > > * >(argp1); - result = ((std::vector< std::vector< std::pair< double,std::vector< int > > > > const *)arg1)->get_allocator(); - resultobj = SWIG_NewPointerObj((new std::vector< std::vector< std::pair< double,std::vector< int > > > >::allocator_type(static_cast< const std::vector< std::vector< std::pair< double,std::vector< int > > > >::allocator_type& >(result))), SWIGTYPE_p_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_PairDoubleVectorVector2__SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::pair< double,std::vector< int > > > >::size_type arg1 ; - size_t val1 ; - int ecode1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::pair< double,std::vector< int > > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_PairDoubleVectorVector2",&obj0)) SWIG_fail; - ecode1 = SWIG_AsVal_size_t(obj0, &val1); - if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_PairDoubleVectorVector2" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > >::size_type""'"); - } - arg1 = static_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > >::size_type >(val1); - result = (std::vector< std::vector< std::pair< double,std::vector< int > > > > *)new std::vector< std::vector< std::pair< double,std::vector< int > > > >(arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector2_pop_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::pair< double,std::vector< int > > > > *arg1 = (std::vector< std::vector< std::pair< double,std::vector< int > > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector2_pop_back",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector2_pop_back" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > > * >(argp1); - (arg1)->pop_back(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector2_resize__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::pair< double,std::vector< int > > > > *arg1 = (std::vector< std::vector< std::pair< double,std::vector< int > > > > *) 0 ; - std::vector< std::vector< std::pair< double,std::vector< int > > > >::size_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:PairDoubleVectorVector2_resize",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector2_resize" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "PairDoubleVectorVector2_resize" "', argument " "2"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > >::size_type""'"); - } - arg2 = static_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > >::size_type >(val2); - (arg1)->resize(arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector2_erase__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::pair< double,std::vector< int > > > > *arg1 = (std::vector< std::vector< std::pair< double,std::vector< int > > > > *) 0 ; - std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OO:PairDoubleVectorVector2_erase",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, 0 | 0 );
- if (!SWIG_IsOK(res1)) {
- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector2_erase" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > > *""'");
- }
- arg1 = reinterpret_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > > * >(argp1);
- res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0);
- if (!SWIG_IsOK(res2) || !iter2) {
- SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "PairDoubleVectorVector2_erase" "', argument " "2"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator""'");
- } else {
- swig::SwigPyIterator_T<std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator > *iter_t = dynamic_cast<swig::SwigPyIterator_T<std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator > *>(iter2);
- if (iter_t) {
- arg2 = iter_t->get_current();
- } else {
- SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "PairDoubleVectorVector2_erase" "', argument " "2"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator""'");
- }
- }
- result = std_vector_Sl_std_vector_Sl_std_pair_Sl_double_Sc_std_vector_Sl_int_Sg__Sg__Sg__Sg__erase__SWIG_0(arg1,arg2);
- resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator & >(result)),
- swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN);
- return resultobj;
-fail:
- return NULL;
-}
-
-
-SWIGINTERN PyObject *_wrap_PairDoubleVectorVector2_erase__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) {
- PyObject *resultobj = 0;
- std::vector< std::vector< std::pair< double,std::vector< int > > > > *arg1 = (std::vector< std::vector< std::pair< double,std::vector< int > > > > *) 0 ;
- std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator arg2 ;
- std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator arg3 ;
- void *argp1 = 0 ;
- int res1 = 0 ;
- swig::SwigPyIterator *iter2 = 0 ;
- int res2 ;
- swig::SwigPyIterator *iter3 = 0 ;
- int res3 ;
- PyObject * obj0 = 0 ;
- PyObject * obj1 = 0 ;
- PyObject * obj2 = 0 ;
- std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator result;
-
- if (!PyArg_ParseTuple(args,(char *)"OOO:PairDoubleVectorVector2_erase",&obj0,&obj1,&obj2)) SWIG_fail;
- res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, 0 | 0 );
- if (!SWIG_IsOK(res1)) {
- SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector2_erase" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > > *""'");
- }
- arg1 = reinterpret_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > > * >(argp1);
- res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 
0);
- if (!SWIG_IsOK(res2) || !iter2) {
- SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "PairDoubleVectorVector2_erase" "', argument " "2"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator""'");
- } else {
- swig::SwigPyIterator_T<std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator > *iter_t = dynamic_cast<swig::SwigPyIterator_T<std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator > *>(iter2);
- if (iter_t) {
- arg2 = iter_t->get_current();
- } else {
- SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "PairDoubleVectorVector2_erase" "', argument " "2"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator""'");
- }
- }
- res3 = SWIG_ConvertPtr(obj2, SWIG_as_voidptrptr(&iter3), swig::SwigPyIterator::descriptor(), 0);
- if (!SWIG_IsOK(res3) || !iter3) {
- SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "PairDoubleVectorVector2_erase" "', argument " "3"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator""'");
- } else {
- swig::SwigPyIterator_T<std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator > *iter_t = dynamic_cast<swig::SwigPyIterator_T<std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator > *>(iter3);
- if (iter_t) {
- arg3 = iter_t->get_current();
- } else {
- SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "PairDoubleVectorVector2_erase" "', argument " "3"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator""'");
- }
- }
- result = std_vector_Sl_std_vector_Sl_std_pair_Sl_double_Sc_std_vector_Sl_int_Sg__Sg__Sg__Sg__erase__SWIG_1(arg1,arg2,arg3);
- resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator & >(result)),
- swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN);
- return resultobj;
-fail:
- return NULL;
-}
-
-
-SWIGINTERN PyObject *_wrap_PairDoubleVectorVector2_erase(PyObject *self, PyObject *args) {
- Py_ssize_t argc;
- PyObject *argv[4] = {
- 0
- };
- Py_ssize_t ii;
-
- if (!PyTuple_Check(args)) SWIG_fail;
- argc = PyObject_Length(args);
- for (ii = 0; (ii < 3) && (ii < argc); ii++) {
- argv[ii] = PyTuple_GET_ITEM(args,ii);
- }
- if (argc == 2) {
- int _v;
- int res = swig::asptr(argv[0], (std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > >**)(0));
- _v = SWIG_CheckState(res);
- if (_v) {
- swig::SwigPyIterator *iter = 0;
- int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0);
- _v = (SWIG_IsOK(res) && iter && (dynamic_cast<swig::SwigPyIterator_T<std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator > *>(iter) != 0));
- if (_v) {
- return _wrap_PairDoubleVectorVector2_erase__SWIG_0(self, args);
- }
- }
- }
- if (argc == 3) {
- int _v;
- int res = swig::asptr(argv[0], (std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > >**)(0));
- _v = SWIG_CheckState(res);
- if (_v) {
- swig::SwigPyIterator *iter = 0;
- int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0);
- _v = (SWIG_IsOK(res) && iter && (dynamic_cast<swig::SwigPyIterator_T<std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator > *>(iter) != 
0));
- if (_v) {
- swig::SwigPyIterator *iter = 0;
- int res = SWIG_ConvertPtr(argv[2], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0);
- _v = (SWIG_IsOK(res) && iter && (dynamic_cast<swig::SwigPyIterator_T<std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator > *>(iter) != 0));
- if (_v) {
- return _wrap_PairDoubleVectorVector2_erase__SWIG_1(self, args);
- }
- }
- }
- }
-
-fail:
- SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'PairDoubleVectorVector2_erase'.\n"
- " Possible C/C++ prototypes are:\n"
- " std::vector< std::vector< std::pair< double,std::vector< int > > > >::erase(std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator)\n"
- " std::vector< std::vector< std::pair< double,std::vector< int > > > >::erase(std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator,std::vector< std::vector< std::pair< double,std::vector< int > > > >::iterator)\n");
- return 0;
-}
-
-
-SWIGINTERN PyObject *_wrap_new_PairDoubleVectorVector2__SWIG_3(PyObject *SWIGUNUSEDPARM(self), PyObject *args) {
- PyObject *resultobj = 0;
- std::vector< std::vector< std::pair< double,std::vector< int > > > >::size_type arg1 ;
- std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type *arg2 = 0 ;
- size_t val1 ;
- int ecode1 = 0 ;
- int res2 = SWIG_OLDOBJ ;
- PyObject * obj0 = 0 ;
- PyObject * obj1 = 0 ;
- std::vector< std::vector< std::pair< double,std::vector< int > > > > *result = 0 ;
-
- if (!PyArg_ParseTuple(args,(char *)"OO:new_PairDoubleVectorVector2",&obj0,&obj1)) SWIG_fail;
- ecode1 = SWIG_AsVal_size_t(obj0, &val1);
- if (!SWIG_IsOK(ecode1)) {
- SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_PairDoubleVectorVector2" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > >::size_type""'");
- }
- arg1 = static_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > >::size_type >(val1);
- {
- std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > *ptr = (std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > *)0;
- res2 = swig::asptr(obj1, &ptr);
- if (!SWIG_IsOK(res2)) {
- SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "new_PairDoubleVectorVector2" "', argument " "2"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type const &""'");
- }
- if (!ptr) {
- SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "new_PairDoubleVectorVector2" "', argument " "2"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type const &""'");
- }
- arg2 = ptr;
- }
- result = (std::vector< std::vector< std::pair< double,std::vector< int > > > > *)new std::vector< std::vector< std::pair< double,std::vector< int > > > >(arg1,(std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type const &)*arg2);
- resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, SWIG_POINTER_NEW | 0 );
- if 
(SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_PairDoubleVectorVector2(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 0) { - return _wrap_new_PairDoubleVectorVector2__SWIG_0(self, args); - } - if (argc == 1) { - int _v; - { - int res = SWIG_AsVal_size_t(argv[0], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_new_PairDoubleVectorVector2__SWIG_2(self, args); - } - } - if (argc == 1) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_new_PairDoubleVectorVector2__SWIG_1(self, args); - } - } - if (argc == 2) { - int _v; - { - int res = SWIG_AsVal_size_t(argv[0], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[1], (std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_new_PairDoubleVectorVector2__SWIG_3(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'new_PairDoubleVectorVector2'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::vector< std::pair< double,std::vector< int > > > >::vector()\n" - " std::vector< std::vector< std::pair< double,std::vector< int > > > >::vector(std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > const &)\n" - " std::vector< std::vector< std::pair< double,std::vector< int > > > >::vector(std::vector< std::vector< std::pair< double,std::vector< int > > > >::size_type)\n" - " std::vector< std::vector< std::pair< double,std::vector< int > > > >::vector(std::vector< std::vector< std::pair< double,std::vector< int > > > >::size_type,std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector2_push_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::pair< double,std::vector< int > > > > *arg1 = (std::vector< std::vector< std::pair< double,std::vector< int > > > > *) 0 ; - std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res2 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:PairDoubleVectorVector2_push_back",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector2_push_back" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > > * >(argp1); - { - std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > *ptr = (std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "PairDoubleVectorVector2_push_back" "', argument " "2"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "PairDoubleVectorVector2_push_back" "', argument " "2"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type const &""'"); - } - arg2 = ptr; - } - (arg1)->push_back((std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type const &)*arg2); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector2_front(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::pair< double,std::vector< int > > > > *arg1 = (std::vector< std::vector< std::pair< double,std::vector< int > > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector2_front",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector2_front" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > > * >(argp1); - result = (std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type *) &((std::vector< std::vector< std::pair< double,std::vector< int > > > > const *)arg1)->front(); - resultobj = swig::from(static_cast< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > 
>(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector2_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::pair< double,std::vector< int > > > > *arg1 = (std::vector< std::vector< std::pair< double,std::vector< int > > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:PairDoubleVectorVector2_back",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector2_back" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > > * >(argp1); - result = (std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type *) &((std::vector< std::vector< std::pair< double,std::vector< int > > > > const *)arg1)->back(); - resultobj = swig::from(static_cast< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector2_assign(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::pair< double,std::vector< int > > > > *arg1 = (std::vector< std::vector< std::pair< double,std::vector< int > > > > *) 0 ; - std::vector< std::vector< std::pair< double,std::vector< int > > > >::size_type arg2 ; - std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:PairDoubleVectorVector2_assign",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector2_assign" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "PairDoubleVectorVector2_assign" "', argument " "2"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > 
>::size_type""'"); - } - arg2 = static_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > >::size_type >(val2); - { - std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > *ptr = (std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "PairDoubleVectorVector2_assign" "', argument " "3"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "PairDoubleVectorVector2_assign" "', argument " "3"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type const &""'"); - } - arg3 = ptr; - } - (arg1)->assign(arg2,(std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type const &)*arg3); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_PairDoubleVectorVector2_resize__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::pair< double,std::vector< int > > > > *arg1 = (std::vector< std::vector< std::pair< double,std::vector< int > > > > *) 0 ; - std::vector< std::vector< std::pair< double,std::vector< int > > > >::size_type arg2 ; - std::vector< std::vector< std::pair< double,std::vector< int > > > >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:PairDoubleVectorVector2_resize",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "PairDoubleVectorVector2_resize" "', argument " "1"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "PairDoubleVectorVector2_resize" "', argument " "2"" of type '" "std::vector< std::vector< std::pair< double,std::vector< int > > > >::size_type""'"); - } - arg2 = static_cast< std::vector< std::vector< std::pair< double,std::vector< int > > > >::size_type >(val2); - { - std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > *ptr = (std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > *)0; - res3 = swig::asptr(obj2, 
[Remaining hunks (flattened in extraction, reproduced here only in summary): deletion of the SWIG auto-generated C++ Python-binding source. The removed wrapper functions cover the generated STL container bindings — the `PairDoubleVectorVector2` type (`std::vector<std::vector<std::pair<double, std::vector<int>>>>`: `resize`, `insert`, `reserve`, `capacity`, destructor, `swigregister`) and the `DoubleVector3` type (`std::vector<std::vector<std::vector<double>>>`: `iterator`, `__nonzero__`, `__bool__`, `__len__`, `__getslice__`/`__setslice__`/`__delslice__`, `__getitem__`/`__setitem__`/`__delitem__`, `pop`, `append`, constructors). All of this is boilerplate emitted by SWIG for the decoder's container typemaps and is removed wholesale in this change.]
SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< double > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_DoubleVector3",&obj0)) SWIG_fail; - { - std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > *ptr = (std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "new_DoubleVector3" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "new_DoubleVector3" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > const &""'"); - } - arg1 = ptr; - } - result = (std::vector< std::vector< std::vector< double > > > *)new std::vector< std::vector< std::vector< double > > >((std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > const &)*arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, SWIG_POINTER_NEW | 0 ); - if (SWIG_IsNewObj(res1)) delete arg1; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_empty(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector3_empty",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_empty" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - result = (bool)((std::vector< std::vector< std::vector< double > > > const *)arg1)->empty(); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_size(PyObject *SWIGUNUSEDPARM(self), 
PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< double > > >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector3_size",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_size" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - result = ((std::vector< std::vector< std::vector< double > > > const *)arg1)->size(); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_swap(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:DoubleVector3_swap",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_swap" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, &argp2, SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 ); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "DoubleVector3_swap" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > &""'"); - } - if (!argp2) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "DoubleVector3_swap" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > &""'"); - } - arg2 = reinterpret_cast< std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< 
double,std::allocator< double > > > > > * >(argp2); - (arg1)->swap(*arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_begin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< double > > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector3_begin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_begin" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - result = (arg1)->begin(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< std::vector< double > > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_end(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< double > > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector3_end",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_end" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - result = (arg1)->end(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< std::vector< double > > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_rbegin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< double > > >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector3_rbegin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_rbegin" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - result = (arg1)->rbegin(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< std::vector< double > > >::reverse_iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_rend(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< double > > >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector3_rend",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_rend" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - result = (arg1)->rend(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< std::vector< double > > >::reverse_iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_clear(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector3_clear",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_clear" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - (arg1)->clear(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_get_allocator(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject 
*resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - SwigValueWrapper< std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > result; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector3_get_allocator",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_get_allocator" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - result = ((std::vector< std::vector< std::vector< double > > > const *)arg1)->get_allocator(); - resultobj = SWIG_NewPointerObj((new std::vector< std::vector< std::vector< double > > >::allocator_type(static_cast< const std::vector< std::vector< std::vector< double > > >::allocator_type& >(result))), SWIGTYPE_p_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_DoubleVector3__SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > >::size_type arg1 ; - size_t val1 ; - int ecode1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< double > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_DoubleVector3",&obj0)) SWIG_fail; - ecode1 = SWIG_AsVal_size_t(obj0, &val1); - if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_DoubleVector3" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > >::size_type""'"); - } - arg1 = static_cast< std::vector< std::vector< std::vector< double > > >::size_type >(val1); - result = (std::vector< std::vector< std::vector< double > > > *)new std::vector< std::vector< std::vector< double > > >(arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_pop_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector3_pop_back",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_pop_back" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - (arg1)->pop_back(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_resize__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - std::vector< std::vector< std::vector< double > > >::size_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:DoubleVector3_resize",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_resize" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "DoubleVector3_resize" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< double > > >::size_type""'"); - } - arg2 = static_cast< std::vector< std::vector< std::vector< double > > >::size_type >(val2); - (arg1)->resize(arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_erase__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - std::vector< std::vector< std::vector< double > > >::iterator arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< std::vector< std::vector< double > > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OO:DoubleVector3_erase",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_erase" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< 
std::vector< std::vector< double > > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector3_erase" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< double > > >::iterator""'"); - } else { - swig::SwigPyIterator_T > >::iterator > *iter_t = dynamic_cast > >::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector3_erase" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< double > > >::iterator""'"); - } - } - result = std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg__erase__SWIG_0(arg1,arg2); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< std::vector< double > > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_erase__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - std::vector< std::vector< std::vector< double > > >::iterator arg2 ; - std::vector< std::vector< std::vector< double > > >::iterator arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - swig::SwigPyIterator *iter3 = 0 ; - int res3 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< std::vector< std::vector< double > > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OOO:DoubleVector3_erase",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_erase" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector3_erase" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< double > > >::iterator""'"); - } else { - swig::SwigPyIterator_T > >::iterator > *iter_t = dynamic_cast > >::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector3_erase" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< double > > >::iterator""'"); - } - } - res3 = SWIG_ConvertPtr(obj2, SWIG_as_voidptrptr(&iter3), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res3) || !iter3) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector3_erase" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< double > > >::iterator""'"); - } else { - swig::SwigPyIterator_T > 
>::iterator > *iter_t = dynamic_cast > >::iterator > *>(iter3); - if (iter_t) { - arg3 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector3_erase" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< double > > >::iterator""'"); - } - } - result = std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg__erase__SWIG_1(arg1,arg2,arg3); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< std::vector< double > > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_erase(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast > >::iterator > *>(iter) != 0)); - if (_v) { - return _wrap_DoubleVector3_erase__SWIG_0(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast > >::iterator > *>(iter) != 0)); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[2], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast > >::iterator > *>(iter) != 0)); - if (_v) { - return _wrap_DoubleVector3_erase__SWIG_1(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'DoubleVector3_erase'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::vector< std::vector< double > > >::erase(std::vector< std::vector< std::vector< double > > >::iterator)\n" - " std::vector< std::vector< std::vector< double > > >::erase(std::vector< std::vector< std::vector< double > > >::iterator,std::vector< std::vector< std::vector< double > > >::iterator)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_new_DoubleVector3__SWIG_3(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > >::size_type arg1 ; - std::vector< std::vector< std::vector< double > > >::value_type *arg2 = 0 ; - size_t val1 ; - int ecode1 = 0 ; - int res2 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< 
std::vector< std::vector< double > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:new_DoubleVector3",&obj0,&obj1)) SWIG_fail; - ecode1 = SWIG_AsVal_size_t(obj0, &val1); - if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_DoubleVector3" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > >::size_type""'"); - } - arg1 = static_cast< std::vector< std::vector< std::vector< double > > >::size_type >(val1); - { - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *ptr = (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "new_DoubleVector3" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< double > > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "new_DoubleVector3" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< double > > >::value_type const &""'"); - } - arg2 = ptr; - } - result = (std::vector< std::vector< std::vector< double > > > *)new std::vector< std::vector< std::vector< double > > >(arg1,(std::vector< std::vector< std::vector< double > > >::value_type const &)*arg2); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, SWIG_POINTER_NEW | 0 ); - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_DoubleVector3(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 0) { - return _wrap_new_DoubleVector3__SWIG_0(self, args); - } - if (argc == 1) { - int _v; - { - int res = SWIG_AsVal_size_t(argv[0], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_new_DoubleVector3__SWIG_2(self, args); - } - } - if (argc == 1) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_new_DoubleVector3__SWIG_1(self, args); - } - } - if (argc == 2) { - int _v; - { - int res = SWIG_AsVal_size_t(argv[0], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[1], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_new_DoubleVector3__SWIG_3(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'new_DoubleVector3'.\n" - " 
Possible C/C++ prototypes are:\n" - " std::vector< std::vector< std::vector< double > > >::vector()\n" - " std::vector< std::vector< std::vector< double > > >::vector(std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > const &)\n" - " std::vector< std::vector< std::vector< double > > >::vector(std::vector< std::vector< std::vector< double > > >::size_type)\n" - " std::vector< std::vector< std::vector< double > > >::vector(std::vector< std::vector< std::vector< double > > >::size_type,std::vector< std::vector< std::vector< double > > >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_push_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - std::vector< std::vector< std::vector< double > > >::value_type *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res2 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:DoubleVector3_push_back",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_push_back" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - { - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *ptr = (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "DoubleVector3_push_back" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< double > > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "DoubleVector3_push_back" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< double > > >::value_type const &""'"); - } - arg2 = ptr; - } - (arg1)->push_back((std::vector< std::vector< std::vector< double > > >::value_type const &)*arg2); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_front(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< double > > >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector3_front",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_front" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - result = (std::vector< std::vector< std::vector< double > > >::value_type *) &((std::vector< std::vector< std::vector< double > > > const *)arg1)->front(); - resultobj = swig::from(static_cast< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< double > > >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector3_back",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_back" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - result = (std::vector< std::vector< std::vector< double > > >::value_type *) &((std::vector< std::vector< std::vector< double > > > const *)arg1)->back(); - resultobj = swig::from(static_cast< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_assign(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - std::vector< std::vector< std::vector< double > > >::size_type arg2 ; - std::vector< std::vector< std::vector< double > > >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:DoubleVector3_assign",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - 
SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_assign" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "DoubleVector3_assign" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< double > > >::size_type""'"); - } - arg2 = static_cast< std::vector< std::vector< std::vector< double > > >::size_type >(val2); - { - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *ptr = (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "DoubleVector3_assign" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< double > > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "DoubleVector3_assign" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< double > > >::value_type const &""'"); - } - arg3 = ptr; - } - (arg1)->assign(arg2,(std::vector< std::vector< std::vector< double > > >::value_type const &)*arg3); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_resize__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - std::vector< std::vector< std::vector< double > > >::size_type arg2 ; - std::vector< std::vector< std::vector< double > > >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:DoubleVector3_resize",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_resize" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "DoubleVector3_resize" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< double > > >::size_type""'"); - } - arg2 = static_cast< std::vector< std::vector< std::vector< double > > >::size_type >(val2); - { - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *ptr = (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< 
double,std::allocator< double > > > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "DoubleVector3_resize" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< double > > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "DoubleVector3_resize" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< double > > >::value_type const &""'"); - } - arg3 = ptr; - } - (arg1)->resize(arg2,(std::vector< std::vector< std::vector< double > > >::value_type const &)*arg3); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_resize(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_DoubleVector3_resize__SWIG_0(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[2], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_DoubleVector3_resize__SWIG_1(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'DoubleVector3_resize'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::vector< std::vector< double > > >::resize(std::vector< std::vector< std::vector< double > > >::size_type)\n" - " std::vector< std::vector< std::vector< double > > >::resize(std::vector< std::vector< std::vector< double > > >::size_type,std::vector< std::vector< std::vector< double > > >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_insert__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - std::vector< std::vector< std::vector< double > > >::iterator arg2 ; - std::vector< std::vector< std::vector< double > > >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * 
obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< std::vector< std::vector< double > > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OOO:DoubleVector3_insert",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_insert" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector3_insert" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< double > > >::iterator""'"); - } else { - swig::SwigPyIterator_T > >::iterator > *iter_t = dynamic_cast > >::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector3_insert" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< double > > >::iterator""'"); - } - } - { - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *ptr = (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "DoubleVector3_insert" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< double > > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "DoubleVector3_insert" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< double > > >::value_type const &""'"); - } - arg3 = ptr; - } - result = std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg__insert__SWIG_0(arg1,arg2,(std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &)*arg3); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< std::vector< double > > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_insert__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - std::vector< std::vector< std::vector< double > > >::iterator arg2 ; - std::vector< std::vector< std::vector< double > > >::size_type arg3 ; - std::vector< std::vector< std::vector< double > > >::value_type *arg4 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - size_t val3 ; - int ecode3 = 0 ; - int res4 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 
; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOOO:DoubleVector3_insert",&obj0,&obj1,&obj2,&obj3)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_insert" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector3_insert" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< double > > >::iterator""'"); - } else { - swig::SwigPyIterator_T > >::iterator > *iter_t = dynamic_cast > >::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "DoubleVector3_insert" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< double > > >::iterator""'"); - } - } - ecode3 = SWIG_AsVal_size_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "DoubleVector3_insert" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< double > > >::size_type""'"); - } - arg3 = static_cast< std::vector< std::vector< std::vector< double > > >::size_type >(val3); - { - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *ptr = (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *)0; - res4 = swig::asptr(obj3, &ptr); - if (!SWIG_IsOK(res4)) { - SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "DoubleVector3_insert" "', argument " "4"" of type '" "std::vector< std::vector< std::vector< double > > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "DoubleVector3_insert" "', argument " "4"" of type '" "std::vector< std::vector< std::vector< double > > >::value_type const &""'"); - } - arg4 = ptr; - } - std_vector_Sl_std_vector_Sl_std_vector_Sl_double_Sg__Sg__Sg__insert__SWIG_1(arg1,arg2,arg3,(std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &)*arg4); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res4)) delete arg4; - return resultobj; -fail: - if (SWIG_IsNewObj(res4)) delete arg4; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_insert(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[5] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 4) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< 
std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast<swig::SwigPyIterator_T<std::vector< std::vector< std::vector< double > > >::iterator > *>(iter) != 0)); - if (_v) { - int res = swig::asptr(argv[2], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_DoubleVector3_insert__SWIG_0(self, args); - } - } - } - } - if (argc == 4) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast<swig::SwigPyIterator_T<std::vector< std::vector< std::vector< double > > >::iterator > *>(iter) != 0)); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[3], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_DoubleVector3_insert__SWIG_1(self, args); - } - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'DoubleVector3_insert'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::vector< std::vector< double > > >::insert(std::vector< std::vector< std::vector< double > > >::iterator,std::vector< std::vector< std::vector< double > > >::value_type const &)\n" - " std::vector< std::vector< std::vector< double > > >::insert(std::vector< std::vector< std::vector< double > > >::iterator,std::vector< std::vector< std::vector< double > > >::size_type,std::vector< std::vector< std::vector< double > > >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_reserve(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - std::vector< std::vector< std::vector< double > > >::size_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:DoubleVector3_reserve",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_reserve" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - ecode2 = 
SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "DoubleVector3_reserve" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< double > > >::size_type""'"); - } - arg2 = static_cast< std::vector< std::vector< std::vector< double > > >::size_type >(val2); - (arg1)->reserve(arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleVector3_capacity(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< double > > >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:DoubleVector3_capacity",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleVector3_capacity" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - result = ((std::vector< std::vector< std::vector< double > > > const *)arg1)->capacity(); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_delete_DoubleVector3(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double > > > *arg1 = (std::vector< std::vector< std::vector< double > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:delete_DoubleVector3",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, SWIG_POINTER_DISOWN | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "delete_DoubleVector3" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< double > > > * >(argp1); - delete arg1; - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *DoubleVector3_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *obj; - if (!PyArg_ParseTuple(args,(char *)"O:swigregister", &obj)) return NULL; - SWIG_TypeNewClientData(SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, SWIG_NewClientData(obj)); - return SWIG_Py_Void(); -} - -SWIGINTERN PyObject *_wrap_IntVector3_iterator(PyObject 
*SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - PyObject **arg2 = (PyObject **) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - swig::SwigPyIterator *result = 0 ; - - arg2 = &obj0; - if (!PyArg_ParseTuple(args,(char *)"O:IntVector3_iterator",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_iterator" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - result = (swig::SwigPyIterator *)std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg__iterator(arg1,arg2); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_swig__SwigPyIterator, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3___nonzero__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector3___nonzero__",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3___nonzero__" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - result = (bool)std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____nonzero__((std::vector< std::vector< std::vector< int > > > const *)arg1); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3___bool__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector3___bool__",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3___bool__" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< 
std::vector< std::vector< int > > > * >(argp1); - result = (bool)std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____bool__((std::vector< std::vector< std::vector< int > > > const *)arg1); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3___len__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< int > > >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector3___len__",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3___len__" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - result = std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____len__((std::vector< std::vector< std::vector< int > > > const *)arg1); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3___getslice__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - std::vector< std::vector< std::vector< int > > >::difference_type arg2 ; - std::vector< std::vector< std::vector< int > > >::difference_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:IntVector3___getslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3___getslice__" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector3___getslice__" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::vector< std::vector< int > > >::difference_type 
>(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "IntVector3___getslice__" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< int > > >::difference_type""'"); - } - arg3 = static_cast< std::vector< std::vector< std::vector< int > > >::difference_type >(val3); - try { - result = (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *)std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____getslice__(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3___setslice____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - std::vector< std::vector< std::vector< int > > >::difference_type arg2 ; - std::vector< std::vector< std::vector< int > > >::difference_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:IntVector3___setslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3___setslice__" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector3___setslice__" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::vector< std::vector< int > > >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "IntVector3___setslice__" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< int > > >::difference_type""'"); - } - arg3 = static_cast< std::vector< std::vector< std::vector< int > > >::difference_type >(val3); - try { - std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____setslice____SWIG_0(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - 
SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3___setslice____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - std::vector< std::vector< std::vector< int > > >::difference_type arg2 ; - std::vector< std::vector< std::vector< int > > >::difference_type arg3 ; - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *arg4 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - int res4 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOOO:IntVector3___setslice__",&obj0,&obj1,&obj2,&obj3)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3___setslice__" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector3___setslice__" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::vector< std::vector< int > > >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "IntVector3___setslice__" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< int > > >::difference_type""'"); - } - arg3 = static_cast< std::vector< std::vector< std::vector< int > > >::difference_type >(val3); - { - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *ptr = (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *)0; - res4 = swig::asptr(obj3, &ptr); - if (!SWIG_IsOK(res4)) { - SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "IntVector3___setslice__" "', argument " "4"" of type '" "std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< 
int,std::allocator< int > > > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "IntVector3___setslice__" "', argument " "4"" of type '" "std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &""'"); - } - arg4 = ptr; - } - try { - std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____setslice____SWIG_1(arg1,arg2,arg3,(std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &)*arg4); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res4)) delete arg4; - return resultobj; -fail: - if (SWIG_IsNewObj(res4)) delete arg4; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3___setslice__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[5] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 4) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_IntVector3___setslice____SWIG_0(self, args); - } - } - } - } - if (argc == 4) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[3], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_IntVector3___setslice____SWIG_1(self, args); - } - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'IntVector3___setslice__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::vector< std::vector< int > > >::__setslice__(std::vector< std::vector< std::vector< int > > >::difference_type,std::vector< std::vector< 
std::vector< int > > >::difference_type)\n" - " std::vector< std::vector< std::vector< int > > >::__setslice__(std::vector< std::vector< std::vector< int > > >::difference_type,std::vector< std::vector< std::vector< int > > >::difference_type,std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_IntVector3___delslice__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - std::vector< std::vector< std::vector< int > > >::difference_type arg2 ; - std::vector< std::vector< std::vector< int > > >::difference_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:IntVector3___delslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3___delslice__" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector3___delslice__" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::vector< std::vector< int > > >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "IntVector3___delslice__" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< int > > >::difference_type""'"); - } - arg3 = static_cast< std::vector< std::vector< std::vector< int > > >::difference_type >(val3); - try { - std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____delslice__(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3___delitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - std::vector< std::vector< std::vector< int > > >::difference_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntVector3___delitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3___delitem__" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector3___delitem__" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::vector< std::vector< int > > >::difference_type >(val2); - try { - std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____delitem____SWIG_0(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3___getitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntVector3___getitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3___getitem__" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector3___getitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - result = (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *)std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____getitem____SWIG_0(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), 
SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3___setitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:IntVector3___setitem__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3___setitem__" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector3___setitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - { - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *ptr = (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "IntVector3___setitem__" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "IntVector3___setitem__" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &""'"); - } - arg3 = ptr; - } - try { - std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____setitem____SWIG_0(arg1,arg2,(std::vector< 
std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &)*arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3___setitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntVector3___setitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3___setitem__" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector3___setitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____setitem____SWIG_1(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3___delitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntVector3___delitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3___delitem__" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector3___delitem__" "', argument " "2"" of type '" 
"PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____delitem____SWIG_1(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3___delitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_IntVector3___delitem____SWIG_1(self, args); - } - } - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_IntVector3___delitem____SWIG_0(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'IntVector3___delitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::vector< std::vector< int > > >::__delitem__(std::vector< std::vector< std::vector< int > > >::difference_type)\n" - " std::vector< std::vector< std::vector< int > > >::__delitem__(PySliceObject *)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_IntVector3___getitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - std::vector< std::vector< std::vector< int > > >::difference_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< std::vector< std::vector< int > > >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntVector3___getitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3___getitem__" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - 
SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector3___getitem__" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::vector< std::vector< int > > >::difference_type >(val2); - try { - result = (std::vector< std::vector< std::vector< int > > >::value_type *) &std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____getitem____SWIG_1((std::vector< std::vector< std::vector< int > > > const *)arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = swig::from(static_cast< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3___getitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_IntVector3___getitem____SWIG_0(self, args); - } - } - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_IntVector3___getitem____SWIG_1(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'IntVector3___getitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::vector< std::vector< int > > >::__getitem__(PySliceObject *)\n" - " std::vector< std::vector< std::vector< int > > >::__getitem__(std::vector< std::vector< std::vector< int > > >::difference_type) const\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_IntVector3___setitem____SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - std::vector< std::vector< std::vector< int > > >::difference_type arg2 ; - std::vector< std::vector< std::vector< int > > >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:IntVector3___setitem__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3___setitem__" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector3___setitem__" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::difference_type""'"); - } - arg2 = static_cast< std::vector< std::vector< std::vector< int > > >::difference_type >(val2); - { - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *ptr = (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "IntVector3___setitem__" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< int > > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "IntVector3___setitem__" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< int > > >::value_type const &""'"); - } - arg3 = ptr; - } - try { - std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg____setitem____SWIG_2(arg1,arg2,(std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &)*arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3___setitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_IntVector3___setitem____SWIG_1(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - int res = swig::asptr(argv[2], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< 
int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_IntVector3___setitem____SWIG_0(self, args); - } - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[2], (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_IntVector3___setitem____SWIG_2(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'IntVector3___setitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::vector< std::vector< int > > >::__setitem__(PySliceObject *,std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &)\n" - " std::vector< std::vector< std::vector< int > > >::__setitem__(PySliceObject *)\n" - " std::vector< std::vector< std::vector< int > > >::__setitem__(std::vector< std::vector< std::vector< int > > >::difference_type,std::vector< std::vector< std::vector< int > > >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_pop(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< int > > >::value_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector3_pop",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_pop" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - try { - result = std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg__pop(arg1); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = swig::from(static_cast< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_append(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< 
std::vector< int > > > *) 0 ; - std::vector< std::vector< std::vector< int > > >::value_type *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res2 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntVector3_append",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_append" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - { - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *ptr = (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "IntVector3_append" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "IntVector3_append" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::value_type const &""'"); - } - arg2 = ptr; - } - std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg__append(arg1,(std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &)*arg2); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_IntVector3__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)":new_IntVector3")) SWIG_fail; - result = (std::vector< std::vector< std::vector< int > > > *)new std::vector< std::vector< std::vector< int > > >(); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_IntVector3__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > *arg1 = 0 ; - int res1 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< int > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_IntVector3",&obj0)) SWIG_fail; - { - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *ptr = (std::vector< 
std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "new_IntVector3" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "new_IntVector3" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > const &""'"); - } - arg1 = ptr; - } - result = (std::vector< std::vector< std::vector< int > > > *)new std::vector< std::vector< std::vector< int > > >((std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > const &)*arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, SWIG_POINTER_NEW | 0 ); - if (SWIG_IsNewObj(res1)) delete arg1; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_empty(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector3_empty",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_empty" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - result = (bool)((std::vector< std::vector< std::vector< int > > > const *)arg1)->empty(); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_size(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< int > > >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector3_size",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_size" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - result = ((std::vector< std::vector< std::vector< int > > > const *)arg1)->size(); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_swap(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntVector3_swap",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_swap" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, &argp2, SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 ); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "IntVector3_swap" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > &""'"); - } - if (!argp2) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "IntVector3_swap" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > &""'"); - } - arg2 = reinterpret_cast< std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > * >(argp2); - (arg1)->swap(*arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_begin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< int > > >::iterator result; - - if (!PyArg_ParseTuple(args,(char 
*)"O:IntVector3_begin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_begin" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - result = (arg1)->begin(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< std::vector< int > > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_end(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< int > > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector3_end",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_end" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - result = (arg1)->end(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< std::vector< int > > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_rbegin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< int > > >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector3_rbegin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_rbegin" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - result = (arg1)->rbegin(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< std::vector< int > > >::reverse_iterator & >(result)), - 
swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_rend(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< int > > >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector3_rend",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_rend" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - result = (arg1)->rend(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< std::vector< int > > >::reverse_iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_clear(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector3_clear",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_clear" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - (arg1)->clear(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_get_allocator(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - SwigValueWrapper< std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector3_get_allocator",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_get_allocator" "', argument " "1"" of type '" 
"std::vector< std::vector< std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - result = ((std::vector< std::vector< std::vector< int > > > const *)arg1)->get_allocator(); - resultobj = SWIG_NewPointerObj((new std::vector< std::vector< std::vector< int > > >::allocator_type(static_cast< const std::vector< std::vector< std::vector< int > > >::allocator_type& >(result))), SWIGTYPE_p_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_IntVector3__SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > >::size_type arg1 ; - size_t val1 ; - int ecode1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< int > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_IntVector3",&obj0)) SWIG_fail; - ecode1 = SWIG_AsVal_size_t(obj0, &val1); - if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_IntVector3" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > >::size_type""'"); - } - arg1 = static_cast< std::vector< std::vector< std::vector< int > > >::size_type >(val1); - result = (std::vector< std::vector< std::vector< int > > > *)new std::vector< std::vector< std::vector< int > > >(arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_pop_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector3_pop_back",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_pop_back" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - (arg1)->pop_back(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_resize__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - std::vector< std::vector< std::vector< int > > >::size_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntVector3_resize",&obj0,&obj1)) SWIG_fail; - 
res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_resize" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector3_resize" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::size_type""'"); - } - arg2 = static_cast< std::vector< std::vector< std::vector< int > > >::size_type >(val2); - (arg1)->resize(arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_erase__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - std::vector< std::vector< std::vector< int > > >::iterator arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< std::vector< std::vector< int > > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntVector3_erase",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_erase" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector3_erase" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::iterator""'"); - } else { - swig::SwigPyIterator_T > >::iterator > *iter_t = dynamic_cast > >::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector3_erase" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::iterator""'"); - } - } - result = std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg__erase__SWIG_0(arg1,arg2); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< std::vector< int > > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_erase__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - std::vector< std::vector< 
std::vector< int > > >::iterator arg2 ; - std::vector< std::vector< std::vector< int > > >::iterator arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - swig::SwigPyIterator *iter3 = 0 ; - int res3 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< std::vector< std::vector< int > > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OOO:IntVector3_erase",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_erase" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector3_erase" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::iterator""'"); - } else { - swig::SwigPyIterator_T > >::iterator > *iter_t = dynamic_cast > >::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector3_erase" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::iterator""'"); - } - } - res3 = SWIG_ConvertPtr(obj2, SWIG_as_voidptrptr(&iter3), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res3) || !iter3) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector3_erase" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< int > > >::iterator""'"); - } else { - swig::SwigPyIterator_T > >::iterator > *iter_t = dynamic_cast > >::iterator > *>(iter3); - if (iter_t) { - arg3 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector3_erase" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< int > > >::iterator""'"); - } - } - result = std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg__erase__SWIG_1(arg1,arg2,arg3); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< std::vector< int > > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_erase(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], 
SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast > >::iterator > *>(iter) != 0)); - if (_v) { - return _wrap_IntVector3_erase__SWIG_0(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast > >::iterator > *>(iter) != 0)); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[2], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast > >::iterator > *>(iter) != 0)); - if (_v) { - return _wrap_IntVector3_erase__SWIG_1(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'IntVector3_erase'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::vector< std::vector< int > > >::erase(std::vector< std::vector< std::vector< int > > >::iterator)\n" - " std::vector< std::vector< std::vector< int > > >::erase(std::vector< std::vector< std::vector< int > > >::iterator,std::vector< std::vector< std::vector< int > > >::iterator)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_new_IntVector3__SWIG_3(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > >::size_type arg1 ; - std::vector< std::vector< std::vector< int > > >::value_type *arg2 = 0 ; - size_t val1 ; - int ecode1 = 0 ; - int res2 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< std::vector< std::vector< int > > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:new_IntVector3",&obj0,&obj1)) SWIG_fail; - ecode1 = SWIG_AsVal_size_t(obj0, &val1); - if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_IntVector3" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > >::size_type""'"); - } - arg1 = static_cast< std::vector< std::vector< std::vector< int > > >::size_type >(val1); - { - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *ptr = (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "new_IntVector3" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "new_IntVector3" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::value_type const &""'"); - } - arg2 = ptr; - } - result = (std::vector< std::vector< std::vector< int > > > *)new std::vector< std::vector< std::vector< int > > >(arg1,(std::vector< std::vector< std::vector< int > > >::value_type const &)*arg2); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), 
SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, SWIG_POINTER_NEW | 0 ); - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_IntVector3(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 0) { - return _wrap_new_IntVector3__SWIG_0(self, args); - } - if (argc == 1) { - int _v; - { - int res = SWIG_AsVal_size_t(argv[0], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_new_IntVector3__SWIG_2(self, args); - } - } - if (argc == 1) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_new_IntVector3__SWIG_1(self, args); - } - } - if (argc == 2) { - int _v; - { - int res = SWIG_AsVal_size_t(argv[0], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[1], (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_new_IntVector3__SWIG_3(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'new_IntVector3'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::vector< std::vector< int > > >::vector()\n" - " std::vector< std::vector< std::vector< int > > >::vector(std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > const &)\n" - " std::vector< std::vector< std::vector< int > > >::vector(std::vector< std::vector< std::vector< int > > >::size_type)\n" - " std::vector< std::vector< std::vector< int > > >::vector(std::vector< std::vector< std::vector< int > > >::size_type,std::vector< std::vector< std::vector< int > > >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_push_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - std::vector< std::vector< std::vector< int > > >::value_type *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res2 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntVector3_push_back",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_push_back" "', 
argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - { - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *ptr = (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "IntVector3_push_back" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "IntVector3_push_back" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::value_type const &""'"); - } - arg2 = ptr; - } - (arg1)->push_back((std::vector< std::vector< std::vector< int > > >::value_type const &)*arg2); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_front(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< int > > >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector3_front",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_front" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - result = (std::vector< std::vector< std::vector< int > > >::value_type *) &((std::vector< std::vector< std::vector< int > > > const *)arg1)->front(); - resultobj = swig::from(static_cast< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< int > > >::value_type *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector3_back",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_back" "', argument " "1"" of type '" "std::vector< std::vector< 
std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - result = (std::vector< std::vector< std::vector< int > > >::value_type *) &((std::vector< std::vector< std::vector< int > > > const *)arg1)->back(); - resultobj = swig::from(static_cast< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > >(*result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_assign(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - std::vector< std::vector< std::vector< int > > >::size_type arg2 ; - std::vector< std::vector< std::vector< int > > >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:IntVector3_assign",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_assign" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector3_assign" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::size_type""'"); - } - arg2 = static_cast< std::vector< std::vector< std::vector< int > > >::size_type >(val2); - { - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *ptr = (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "IntVector3_assign" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< int > > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "IntVector3_assign" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< int > > >::value_type const &""'"); - } - arg3 = ptr; - } - (arg1)->assign(arg2,(std::vector< std::vector< std::vector< int > > >::value_type const &)*arg3); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_resize__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - std::vector< std::vector< std::vector< int > > >::size_type arg2 ; - std::vector< std::vector< std::vector< int > > >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - 
int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:IntVector3_resize",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_resize" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector3_resize" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::size_type""'"); - } - arg2 = static_cast< std::vector< std::vector< std::vector< int > > >::size_type >(val2); - { - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *ptr = (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "IntVector3_resize" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< int > > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "IntVector3_resize" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< int > > >::value_type const &""'"); - } - arg3 = ptr; - } - (arg1)->resize(arg2,(std::vector< std::vector< std::vector< int > > >::value_type const &)*arg3); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_resize(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_IntVector3_resize__SWIG_0(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[2], (std::vector< std::vector< int,std::allocator< int > 
>,std::allocator< std::vector< int,std::allocator< int > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_IntVector3_resize__SWIG_1(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'IntVector3_resize'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::vector< std::vector< int > > >::resize(std::vector< std::vector< std::vector< int > > >::size_type)\n" - " std::vector< std::vector< std::vector< int > > >::resize(std::vector< std::vector< std::vector< int > > >::size_type,std::vector< std::vector< std::vector< int > > >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_insert__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - std::vector< std::vector< std::vector< int > > >::iterator arg2 ; - std::vector< std::vector< std::vector< int > > >::value_type *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< std::vector< std::vector< int > > >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OOO:IntVector3_insert",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_insert" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector3_insert" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::iterator""'"); - } else { - swig::SwigPyIterator_T > >::iterator > *iter_t = dynamic_cast > >::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector3_insert" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::iterator""'"); - } - } - { - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *ptr = (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "IntVector3_insert" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< int > > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "IntVector3_insert" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< int > > >::value_type const &""'"); - } - arg3 = ptr; - } - result = 
std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg__insert__SWIG_0(arg1,arg2,(std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &)*arg3); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< std::vector< std::vector< int > > >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_insert__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - std::vector< std::vector< std::vector< int > > >::iterator arg2 ; - std::vector< std::vector< std::vector< int > > >::size_type arg3 ; - std::vector< std::vector< std::vector< int > > >::value_type *arg4 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - size_t val3 ; - int ecode3 = 0 ; - int res4 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOOO:IntVector3_insert",&obj0,&obj1,&obj2,&obj3)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_insert" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector3_insert" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::iterator""'"); - } else { - swig::SwigPyIterator_T > >::iterator > *iter_t = dynamic_cast > >::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "IntVector3_insert" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::iterator""'"); - } - } - ecode3 = SWIG_AsVal_size_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "IntVector3_insert" "', argument " "3"" of type '" "std::vector< std::vector< std::vector< int > > >::size_type""'"); - } - arg3 = static_cast< std::vector< std::vector< std::vector< int > > >::size_type >(val3); - { - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *ptr = (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *)0; - res4 = swig::asptr(obj3, &ptr); - if (!SWIG_IsOK(res4)) { - SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "IntVector3_insert" "', argument " "4"" of type '" "std::vector< std::vector< std::vector< int > > >::value_type const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " 
"in method '" "IntVector3_insert" "', argument " "4"" of type '" "std::vector< std::vector< std::vector< int > > >::value_type const &""'"); - } - arg4 = ptr; - } - std_vector_Sl_std_vector_Sl_std_vector_Sl_int_Sg__Sg__Sg__insert__SWIG_1(arg1,arg2,arg3,(std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &)*arg4); - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res4)) delete arg4; - return resultobj; -fail: - if (SWIG_IsNewObj(res4)) delete arg4; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_insert(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[5] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 4) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast > >::iterator > *>(iter) != 0)); - if (_v) { - int res = swig::asptr(argv[2], (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_IntVector3_insert__SWIG_0(self, args); - } - } - } - } - if (argc == 4) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast > >::iterator > *>(iter) != 0)); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[3], (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_IntVector3_insert__SWIG_1(self, args); - } - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'IntVector3_insert'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< std::vector< std::vector< int > > >::insert(std::vector< std::vector< std::vector< int > > >::iterator,std::vector< std::vector< std::vector< int > > >::value_type const &)\n" - " std::vector< std::vector< std::vector< int > > >::insert(std::vector< std::vector< std::vector< int > > >::iterator,std::vector< std::vector< std::vector< int > > >::size_type,std::vector< std::vector< std::vector< int > > >::value_type const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_reserve(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< 
std::vector< std::vector< int > > > *) 0 ; - std::vector< std::vector< std::vector< int > > >::size_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntVector3_reserve",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_reserve" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "IntVector3_reserve" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int > > >::size_type""'"); - } - arg2 = static_cast< std::vector< std::vector< std::vector< int > > >::size_type >(val2); - (arg1)->reserve(arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_IntVector3_capacity(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::vector< std::vector< int > > >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:IntVector3_capacity",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntVector3_capacity" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > const *""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - result = ((std::vector< std::vector< std::vector< int > > > const *)arg1)->capacity(); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_delete_IntVector3(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< int > > > *arg1 = (std::vector< std::vector< std::vector< int > > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:delete_IntVector3",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, SWIG_POINTER_DISOWN | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "delete_IntVector3" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< int > > > 
*""'"); - } - arg1 = reinterpret_cast< std::vector< std::vector< std::vector< int > > > * >(argp1); - delete arg1; - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *IntVector3_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *obj; - if (!PyArg_ParseTuple(args,(char *)"O:swigregister", &obj)) return NULL; - SWIG_TypeNewClientData(SWIGTYPE_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, SWIG_NewClientData(obj)); - return SWIG_Py_Void(); -} - -SWIGINTERN PyObject *_wrap_TrieVector_iterator(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - PyObject **arg2 = (PyObject **) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - swig::SwigPyIterator *result = 0 ; - - arg2 = &obj0; - if (!PyArg_ParseTuple(args,(char *)"O:TrieVector_iterator",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector_iterator" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - result = (swig::SwigPyIterator *)std_vector_Sl_PathTrie_Sm__Sg__iterator(arg1,arg2); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_swig__SwigPyIterator, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector___nonzero__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:TrieVector___nonzero__",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector___nonzero__" "', argument " "1"" of type '" "std::vector< PathTrie * > const *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - result = (bool)std_vector_Sl_PathTrie_Sm__Sg____nonzero__((std::vector< PathTrie * > const *)arg1); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector___bool__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:TrieVector___bool__",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector___bool__" "', argument " "1"" of type '" "std::vector< PathTrie * > const *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - result = (bool)std_vector_Sl_PathTrie_Sm__Sg____bool__((std::vector< PathTrie * > const *)arg1); - resultobj = 
SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector___len__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< PathTrie * >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:TrieVector___len__",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector___len__" "', argument " "1"" of type '" "std::vector< PathTrie * > const *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - result = std_vector_Sl_PathTrie_Sm__Sg____len__((std::vector< PathTrie * > const *)arg1); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector___getslice__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - std::vector< PathTrie * >::difference_type arg2 ; - std::vector< PathTrie * >::difference_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< PathTrie *,std::allocator< PathTrie * > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:TrieVector___getslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector___getslice__" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "TrieVector___getslice__" "', argument " "2"" of type '" "std::vector< PathTrie * >::difference_type""'"); - } - arg2 = static_cast< std::vector< PathTrie * >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "TrieVector___getslice__" "', argument " "3"" of type '" "std::vector< PathTrie * >::difference_type""'"); - } - arg3 = static_cast< std::vector< PathTrie * >::difference_type >(val3); - try { - result = (std::vector< PathTrie *,std::allocator< PathTrie * > > *)std_vector_Sl_PathTrie_Sm__Sg____getslice__(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector___setslice____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - std::vector< PathTrie * >::difference_type arg2 ; - std::vector< PathTrie * >::difference_type arg3 ; - void *argp1 = 0 ; 
- int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:TrieVector___setslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector___setslice__" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "TrieVector___setslice__" "', argument " "2"" of type '" "std::vector< PathTrie * >::difference_type""'"); - } - arg2 = static_cast< std::vector< PathTrie * >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "TrieVector___setslice__" "', argument " "3"" of type '" "std::vector< PathTrie * >::difference_type""'"); - } - arg3 = static_cast< std::vector< PathTrie * >::difference_type >(val3); - try { - std_vector_Sl_PathTrie_Sm__Sg____setslice____SWIG_0(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector___setslice____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - std::vector< PathTrie * >::difference_type arg2 ; - std::vector< PathTrie * >::difference_type arg3 ; - std::vector< PathTrie *,std::allocator< PathTrie * > > *arg4 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - int res4 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOOO:TrieVector___setslice__",&obj0,&obj1,&obj2,&obj3)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector___setslice__" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "TrieVector___setslice__" "', argument " "2"" of type '" "std::vector< PathTrie * >::difference_type""'"); - } - arg2 = static_cast< std::vector< PathTrie * >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "TrieVector___setslice__" "', argument " "3"" of type '" "std::vector< PathTrie * >::difference_type""'"); - } - arg3 = static_cast< std::vector< PathTrie * >::difference_type >(val3); - { - std::vector< PathTrie*,std::allocator< PathTrie * > > *ptr = (std::vector< PathTrie*,std::allocator< PathTrie * > > *)0; - res4 = swig::asptr(obj3, &ptr); - if (!SWIG_IsOK(res4)) { - 
SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "TrieVector___setslice__" "', argument " "4"" of type '" "std::vector< PathTrie *,std::allocator< PathTrie * > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "TrieVector___setslice__" "', argument " "4"" of type '" "std::vector< PathTrie *,std::allocator< PathTrie * > > const &""'"); - } - arg4 = ptr; - } - try { - std_vector_Sl_PathTrie_Sm__Sg____setslice____SWIG_1(arg1,arg2,arg3,(std::vector< PathTrie *,std::allocator< PathTrie * > > const &)*arg4); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res4)) delete arg4; - return resultobj; -fail: - if (SWIG_IsNewObj(res4)) delete arg4; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector___setslice__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[5] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 4) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< PathTrie*,std::allocator< PathTrie * > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_TrieVector___setslice____SWIG_0(self, args); - } - } - } - } - if (argc == 4) { - int _v; - int res = swig::asptr(argv[0], (std::vector< PathTrie*,std::allocator< PathTrie * > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - int res = swig::asptr(argv[3], (std::vector< PathTrie*,std::allocator< PathTrie * > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_TrieVector___setslice____SWIG_1(self, args); - } - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'TrieVector___setslice__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< PathTrie * >::__setslice__(std::vector< PathTrie * >::difference_type,std::vector< PathTrie * >::difference_type)\n" - " std::vector< PathTrie * >::__setslice__(std::vector< PathTrie * >::difference_type,std::vector< PathTrie * >::difference_type,std::vector< PathTrie *,std::allocator< PathTrie * > > const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_TrieVector___delslice__(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - std::vector< PathTrie * >::difference_type arg2 ; - std::vector< PathTrie * >::difference_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - ptrdiff_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:TrieVector___delslice__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - 
SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector___delslice__" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "TrieVector___delslice__" "', argument " "2"" of type '" "std::vector< PathTrie * >::difference_type""'"); - } - arg2 = static_cast< std::vector< PathTrie * >::difference_type >(val2); - ecode3 = SWIG_AsVal_ptrdiff_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "TrieVector___delslice__" "', argument " "3"" of type '" "std::vector< PathTrie * >::difference_type""'"); - } - arg3 = static_cast< std::vector< PathTrie * >::difference_type >(val3); - try { - std_vector_Sl_PathTrie_Sm__Sg____delslice__(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector___delitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - std::vector< PathTrie * >::difference_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:TrieVector___delitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector___delitem__" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "TrieVector___delitem__" "', argument " "2"" of type '" "std::vector< PathTrie * >::difference_type""'"); - } - arg2 = static_cast< std::vector< PathTrie * >::difference_type >(val2); - try { - std_vector_Sl_PathTrie_Sm__Sg____delitem____SWIG_0(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector___getitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< PathTrie *,std::allocator< PathTrie * > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:TrieVector___getitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector___getitem__" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - { - if 
(!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "TrieVector___getitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - result = (std::vector< PathTrie *,std::allocator< PathTrie * > > *)std_vector_Sl_PathTrie_Sm__Sg____getitem____SWIG_0(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector___setitem____SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - std::vector< PathTrie *,std::allocator< PathTrie * > > *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res3 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:TrieVector___setitem__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector___setitem__" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "TrieVector___setitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - { - std::vector< PathTrie*,std::allocator< PathTrie * > > *ptr = (std::vector< PathTrie*,std::allocator< PathTrie * > > *)0; - res3 = swig::asptr(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "TrieVector___setitem__" "', argument " "3"" of type '" "std::vector< PathTrie *,std::allocator< PathTrie * > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "TrieVector___setitem__" "', argument " "3"" of type '" "std::vector< PathTrie *,std::allocator< PathTrie * > > const &""'"); - } - arg3 = ptr; - } - try { - std_vector_Sl_PathTrie_Sm__Sg____setitem____SWIG_0(arg1,arg2,(std::vector< PathTrie *,std::allocator< PathTrie * > > const &)*arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - if (SWIG_IsNewObj(res3)) delete arg3; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector___setitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:TrieVector___setitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if 
(!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector___setitem__" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "TrieVector___setitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - std_vector_Sl_PathTrie_Sm__Sg____setitem____SWIG_1(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector___delitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - PySliceObject *arg2 = (PySliceObject *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:TrieVector___delitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector___delitem__" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - { - if (!PySlice_Check(obj1)) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "TrieVector___delitem__" "', argument " "2"" of type '" "PySliceObject *""'"); - } - arg2 = (PySliceObject *) obj1; - } - try { - std_vector_Sl_PathTrie_Sm__Sg____delitem____SWIG_1(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - catch(std::invalid_argument &_e) { - SWIG_exception_fail(SWIG_ValueError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector___delitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< PathTrie*,std::allocator< PathTrie * > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_TrieVector___delitem____SWIG_1(self, args); - } - } - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< PathTrie*,std::allocator< PathTrie * > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_TrieVector___delitem____SWIG_0(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'TrieVector___delitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< PathTrie * >::__delitem__(std::vector< PathTrie * >::difference_type)\n" - " std::vector< PathTrie * >::__delitem__(PySliceObject *)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_TrieVector___getitem____SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject 
*args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - std::vector< PathTrie * >::difference_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< PathTrie * >::value_type result; - - if (!PyArg_ParseTuple(args,(char *)"OO:TrieVector___getitem__",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector___getitem__" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "TrieVector___getitem__" "', argument " "2"" of type '" "std::vector< PathTrie * >::difference_type""'"); - } - arg2 = static_cast< std::vector< PathTrie * >::difference_type >(val2); - try { - result = (std::vector< PathTrie * >::value_type)std_vector_Sl_PathTrie_Sm__Sg____getitem____SWIG_1(arg1,arg2); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_PathTrie, 0 | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector___getitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< PathTrie*,std::allocator< PathTrie * > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_TrieVector___getitem____SWIG_0(self, args); - } - } - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< PathTrie*,std::allocator< PathTrie * > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_TrieVector___getitem____SWIG_1(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'TrieVector___getitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< PathTrie * >::__getitem__(PySliceObject *)\n" - " std::vector< PathTrie * >::__getitem__(std::vector< PathTrie * >::difference_type)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_TrieVector___setitem____SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - std::vector< PathTrie * >::difference_type arg2 ; - std::vector< PathTrie * >::value_type arg3 = (std::vector< PathTrie * >::value_type) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - ptrdiff_t val2 ; - int ecode2 = 0 ; - void *argp3 = 0 ; - int res3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:TrieVector___setitem__",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - 
SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector___setitem__" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - ecode2 = SWIG_AsVal_ptrdiff_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "TrieVector___setitem__" "', argument " "2"" of type '" "std::vector< PathTrie * >::difference_type""'"); - } - arg2 = static_cast< std::vector< PathTrie * >::difference_type >(val2); - res3 = SWIG_ConvertPtr(obj2, &argp3,SWIGTYPE_p_PathTrie, 0 | 0 ); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "TrieVector___setitem__" "', argument " "3"" of type '" "std::vector< PathTrie * >::value_type""'"); - } - arg3 = reinterpret_cast< std::vector< PathTrie * >::value_type >(argp3); - try { - std_vector_Sl_PathTrie_Sm__Sg____setitem____SWIG_2(arg1,arg2,arg3); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector___setitem__(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< PathTrie*,std::allocator< PathTrie * > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - return _wrap_TrieVector___setitem____SWIG_1(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< PathTrie*,std::allocator< PathTrie * > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - _v = PySlice_Check(argv[1]); - } - if (_v) { - int res = swig::asptr(argv[2], (std::vector< PathTrie*,std::allocator< PathTrie * > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_TrieVector___setitem____SWIG_0(self, args); - } - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< PathTrie*,std::allocator< PathTrie * > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_ptrdiff_t(argv[1], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - void *vptr = 0; - int res = SWIG_ConvertPtr(argv[2], &vptr, SWIGTYPE_p_PathTrie, 0); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_TrieVector___setitem____SWIG_2(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'TrieVector___setitem__'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< PathTrie * >::__setitem__(PySliceObject *,std::vector< PathTrie *,std::allocator< PathTrie * > > const &)\n" - " std::vector< PathTrie * >::__setitem__(PySliceObject *)\n" - " std::vector< PathTrie * >::__setitem__(std::vector< PathTrie * >::difference_type,std::vector< PathTrie * >::value_type)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_pop(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< PathTrie * >::value_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:TrieVector_pop",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector_pop" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - try { - result = (std::vector< PathTrie * >::value_type)std_vector_Sl_PathTrie_Sm__Sg__pop(arg1); - } - catch(std::out_of_range &_e) { - SWIG_exception_fail(SWIG_IndexError, (&_e)->what()); - } - - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_PathTrie, 0 | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_append(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - std::vector< PathTrie * >::value_type arg2 = (std::vector< PathTrie * >::value_type) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:TrieVector_append",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector_append" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, &argp2,SWIGTYPE_p_PathTrie, 0 | 0 ); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "TrieVector_append" "', argument " "2"" of type '" "std::vector< PathTrie * >::value_type""'"); - } - arg2 = reinterpret_cast< std::vector< PathTrie * >::value_type >(argp2); - std_vector_Sl_PathTrie_Sm__Sg__append(arg1,arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_TrieVector__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)":new_TrieVector")) SWIG_fail; - result = (std::vector< PathTrie * > *)new std::vector< PathTrie * >(); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_TrieVector__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = 0 ; - int res1 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - std::vector< PathTrie * > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_TrieVector",&obj0)) SWIG_fail; - { - std::vector< PathTrie*,std::allocator< PathTrie * > > *ptr = (std::vector< PathTrie*,std::allocator< PathTrie * > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "new_TrieVector" "', argument " "1"" of type '" "std::vector< PathTrie * > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "new_TrieVector" "', argument " "1"" of type '" "std::vector< PathTrie * > const &""'"); - } - arg1 = ptr; - } - result = (std::vector< PathTrie * > *)new std::vector< PathTrie * >((std::vector< PathTrie * > const &)*arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), 
SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, SWIG_POINTER_NEW | 0 ); - if (SWIG_IsNewObj(res1)) delete arg1; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_empty(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:TrieVector_empty",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector_empty" "', argument " "1"" of type '" "std::vector< PathTrie * > const *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - result = (bool)((std::vector< PathTrie * > const *)arg1)->empty(); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_size(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< PathTrie * >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:TrieVector_size",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector_size" "', argument " "1"" of type '" "std::vector< PathTrie * > const *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - result = ((std::vector< PathTrie * > const *)arg1)->size(); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_swap(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - std::vector< PathTrie * > *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:TrieVector_swap",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector_swap" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, &argp2, SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 ); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "TrieVector_swap" "', argument " "2"" of type '" "std::vector< PathTrie * > &""'"); - } - if (!argp2) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "TrieVector_swap" "', argument " "2"" of type '" "std::vector< PathTrie * > &""'"); - } - arg2 = reinterpret_cast< std::vector< PathTrie * > * >(argp2); - (arg1)->swap(*arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_begin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - 
std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< PathTrie * >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:TrieVector_begin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector_begin" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - result = (arg1)->begin(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< PathTrie * >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_end(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< PathTrie * >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:TrieVector_end",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector_end" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - result = (arg1)->end(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< PathTrie * >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_rbegin(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< PathTrie * >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:TrieVector_rbegin",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector_rbegin" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - result = (arg1)->rbegin(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< PathTrie * >::reverse_iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_rend(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< PathTrie * >::reverse_iterator result; - - if (!PyArg_ParseTuple(args,(char *)"O:TrieVector_rend",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector_rend" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = 
reinterpret_cast< std::vector< PathTrie * > * >(argp1); - result = (arg1)->rend(); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< PathTrie * >::reverse_iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_clear(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:TrieVector_clear",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector_clear" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - (arg1)->clear(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_get_allocator(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - SwigValueWrapper< std::allocator< PathTrie * > > result; - - if (!PyArg_ParseTuple(args,(char *)"O:TrieVector_get_allocator",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector_get_allocator" "', argument " "1"" of type '" "std::vector< PathTrie * > const *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - result = ((std::vector< PathTrie * > const *)arg1)->get_allocator(); - resultobj = SWIG_NewPointerObj((new std::vector< PathTrie * >::allocator_type(static_cast< const std::vector< PathTrie * >::allocator_type& >(result))), SWIGTYPE_p_std__allocatorT_PathTrie_p_t, SWIG_POINTER_OWN | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_TrieVector__SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * >::size_type arg1 ; - size_t val1 ; - int ecode1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< PathTrie * > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:new_TrieVector",&obj0)) SWIG_fail; - ecode1 = SWIG_AsVal_size_t(obj0, &val1); - if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_TrieVector" "', argument " "1"" of type '" "std::vector< PathTrie * >::size_type""'"); - } - arg1 = static_cast< std::vector< PathTrie * >::size_type >(val1); - result = (std::vector< PathTrie * > *)new std::vector< PathTrie * >(arg1); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_pop_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:TrieVector_pop_back",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, 
&argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector_pop_back" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - (arg1)->pop_back(); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_resize__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - std::vector< PathTrie * >::size_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:TrieVector_resize",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector_resize" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "TrieVector_resize" "', argument " "2"" of type '" "std::vector< PathTrie * >::size_type""'"); - } - arg2 = static_cast< std::vector< PathTrie * >::size_type >(val2); - (arg1)->resize(arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_erase__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - std::vector< PathTrie * >::iterator arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< PathTrie * >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OO:TrieVector_erase",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector_erase" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "TrieVector_erase" "', argument " "2"" of type '" "std::vector< PathTrie * >::iterator""'"); - } else { - swig::SwigPyIterator_T::iterator > *iter_t = dynamic_cast::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "TrieVector_erase" "', argument " "2"" of type '" "std::vector< PathTrie * >::iterator""'"); - } - } - result = std_vector_Sl_PathTrie_Sm__Sg__erase__SWIG_0(arg1,arg2); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< PathTrie * >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_erase__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< 
PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - std::vector< PathTrie * >::iterator arg2 ; - std::vector< PathTrie * >::iterator arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - swig::SwigPyIterator *iter3 = 0 ; - int res3 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< PathTrie * >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OOO:TrieVector_erase",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector_erase" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "TrieVector_erase" "', argument " "2"" of type '" "std::vector< PathTrie * >::iterator""'"); - } else { - swig::SwigPyIterator_T::iterator > *iter_t = dynamic_cast::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "TrieVector_erase" "', argument " "2"" of type '" "std::vector< PathTrie * >::iterator""'"); - } - } - res3 = SWIG_ConvertPtr(obj2, SWIG_as_voidptrptr(&iter3), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res3) || !iter3) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "TrieVector_erase" "', argument " "3"" of type '" "std::vector< PathTrie * >::iterator""'"); - } else { - swig::SwigPyIterator_T::iterator > *iter_t = dynamic_cast::iterator > *>(iter3); - if (iter_t) { - arg3 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "TrieVector_erase" "', argument " "3"" of type '" "std::vector< PathTrie * >::iterator""'"); - } - } - result = std_vector_Sl_PathTrie_Sm__Sg__erase__SWIG_1(arg1,arg2,arg3); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< PathTrie * >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_erase(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< PathTrie*,std::allocator< PathTrie * > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast::iterator > *>(iter) != 0)); - if (_v) { - return _wrap_TrieVector_erase__SWIG_0(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< PathTrie*,std::allocator< PathTrie * > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast::iterator > *>(iter) != 0)); - if (_v) { - 
swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[2], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast::iterator > *>(iter) != 0)); - if (_v) { - return _wrap_TrieVector_erase__SWIG_1(self, args); - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'TrieVector_erase'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< PathTrie * >::erase(std::vector< PathTrie * >::iterator)\n" - " std::vector< PathTrie * >::erase(std::vector< PathTrie * >::iterator,std::vector< PathTrie * >::iterator)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_new_TrieVector__SWIG_3(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * >::size_type arg1 ; - std::vector< PathTrie * >::value_type arg2 = (std::vector< PathTrie * >::value_type) 0 ; - size_t val1 ; - int ecode1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< PathTrie * > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:new_TrieVector",&obj0,&obj1)) SWIG_fail; - ecode1 = SWIG_AsVal_size_t(obj0, &val1); - if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_TrieVector" "', argument " "1"" of type '" "std::vector< PathTrie * >::size_type""'"); - } - arg1 = static_cast< std::vector< PathTrie * >::size_type >(val1); - res2 = SWIG_ConvertPtr(obj1, &argp2,SWIGTYPE_p_PathTrie, 0 | 0 ); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "new_TrieVector" "', argument " "2"" of type '" "std::vector< PathTrie * >::value_type""'"); - } - arg2 = reinterpret_cast< std::vector< PathTrie * >::value_type >(argp2); - result = (std::vector< PathTrie * > *)new std::vector< PathTrie * >(arg1,arg2); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_new_TrieVector(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[3] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 2) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 0) { - return _wrap_new_TrieVector__SWIG_0(self, args); - } - if (argc == 1) { - int _v; - { - int res = SWIG_AsVal_size_t(argv[0], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_new_TrieVector__SWIG_2(self, args); - } - } - if (argc == 1) { - int _v; - int res = swig::asptr(argv[0], (std::vector< PathTrie*,std::allocator< PathTrie * > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_new_TrieVector__SWIG_1(self, args); - } - } - if (argc == 2) { - int _v; - { - int res = SWIG_AsVal_size_t(argv[0], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - void *vptr = 0; - int res = SWIG_ConvertPtr(argv[1], &vptr, SWIGTYPE_p_PathTrie, 0); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_new_TrieVector__SWIG_3(self, args); - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'new_TrieVector'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< PathTrie * >::vector()\n" - " std::vector< PathTrie * >::vector(std::vector< PathTrie * > const &)\n" - " std::vector< PathTrie * >::vector(std::vector< PathTrie * 
>::size_type)\n" - " std::vector< PathTrie * >::vector(std::vector< PathTrie * >::size_type,std::vector< PathTrie * >::value_type)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_push_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - std::vector< PathTrie * >::value_type arg2 = (std::vector< PathTrie * >::value_type) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:TrieVector_push_back",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector_push_back" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, &argp2,SWIGTYPE_p_PathTrie, 0 | 0 ); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "TrieVector_push_back" "', argument " "2"" of type '" "std::vector< PathTrie * >::value_type""'"); - } - arg2 = reinterpret_cast< std::vector< PathTrie * >::value_type >(argp2); - (arg1)->push_back(arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_front(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< PathTrie * >::value_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:TrieVector_front",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector_front" "', argument " "1"" of type '" "std::vector< PathTrie * > const *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - result = (std::vector< PathTrie * >::value_type)((std::vector< PathTrie * > const *)arg1)->front(); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_PathTrie, 0 | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_back(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< PathTrie * >::value_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:TrieVector_back",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector_back" "', argument " "1"" of type '" "std::vector< PathTrie * > const *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - result = (std::vector< PathTrie * >::value_type)((std::vector< PathTrie * > const *)arg1)->back(); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_PathTrie, 0 | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_assign(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = 
(std::vector< PathTrie * > *) 0 ; - std::vector< PathTrie * >::size_type arg2 ; - std::vector< PathTrie * >::value_type arg3 = (std::vector< PathTrie * >::value_type) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - void *argp3 = 0 ; - int res3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:TrieVector_assign",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector_assign" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "TrieVector_assign" "', argument " "2"" of type '" "std::vector< PathTrie * >::size_type""'"); - } - arg2 = static_cast< std::vector< PathTrie * >::size_type >(val2); - res3 = SWIG_ConvertPtr(obj2, &argp3,SWIGTYPE_p_PathTrie, 0 | 0 ); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "TrieVector_assign" "', argument " "3"" of type '" "std::vector< PathTrie * >::value_type""'"); - } - arg3 = reinterpret_cast< std::vector< PathTrie * >::value_type >(argp3); - (arg1)->assign(arg2,arg3); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_resize__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< PathTrie * > *arg1 = (std::vector< PathTrie * > *) 0 ; - std::vector< PathTrie * >::size_type arg2 ; - std::vector< PathTrie * >::value_type arg3 = (std::vector< PathTrie * >::value_type) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - void *argp3 = 0 ; - int res3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:TrieVector_resize",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "TrieVector_resize" "', argument " "1"" of type '" "std::vector< PathTrie * > *""'"); - } - arg1 = reinterpret_cast< std::vector< PathTrie * > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "TrieVector_resize" "', argument " "2"" of type '" "std::vector< PathTrie * >::size_type""'"); - } - arg2 = static_cast< std::vector< PathTrie * >::size_type >(val2); - res3 = SWIG_ConvertPtr(obj2, &argp3,SWIGTYPE_p_PathTrie, 0 | 0 ); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "TrieVector_resize" "', argument " "3"" of type '" "std::vector< PathTrie * >::value_type""'"); - } - arg3 = reinterpret_cast< std::vector< PathTrie * >::value_type >(argp3); - (arg1)->resize(arg2,arg3); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_TrieVector_resize(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[4] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 3) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 
[Auto-generated SWIG wrapper code omitted: this part of the diff deletes the generated ctc_decoder/swig Python bindings for the TrieVector (std::vector<PathTrie*>) and BoolVector (std::vector<bool>) containers, including the overload dispatchers and wrappers for resize, insert, reserve, capacity, iterators, slice and item access, pop, append, erase, push_back, front, back, assign, the constructors, and swigregister. The extracted text of this span is collapsed and missing its template arguments, so the full listing is not reproduced here.]
Possible C/C++ prototypes are:\n" - " std::vector< bool >::resize(std::vector< bool >::size_type)\n" - " std::vector< bool >::resize(std::vector< bool >::size_type,std::vector< bool >::value_type)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_BoolVector_insert__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< bool > *arg1 = (std::vector< bool > *) 0 ; - std::vector< bool >::iterator arg2 ; - std::vector< bool >::value_type arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - bool val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< bool >::iterator result; - - if (!PyArg_ParseTuple(args,(char *)"OOO:BoolVector_insert",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_bool_std__allocatorT_bool_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "BoolVector_insert" "', argument " "1"" of type '" "std::vector< bool > *""'"); - } - arg1 = reinterpret_cast< std::vector< bool > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "BoolVector_insert" "', argument " "2"" of type '" "std::vector< bool >::iterator""'"); - } else { - swig::SwigPyIterator_T::iterator > *iter_t = dynamic_cast::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "BoolVector_insert" "', argument " "2"" of type '" "std::vector< bool >::iterator""'"); - } - } - ecode3 = SWIG_AsVal_bool(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "BoolVector_insert" "', argument " "3"" of type '" "std::vector< bool >::value_type""'"); - } - arg3 = static_cast< std::vector< bool >::value_type >(val3); - result = std_vector_Sl_bool_Sg__insert__SWIG_0(arg1,arg2,arg3); - resultobj = SWIG_NewPointerObj(swig::make_output_iterator(static_cast< const std::vector< bool >::iterator & >(result)), - swig::SwigPyIterator::descriptor(),SWIG_POINTER_OWN); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_BoolVector_insert__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< bool > *arg1 = (std::vector< bool > *) 0 ; - std::vector< bool >::iterator arg2 ; - std::vector< bool >::size_type arg3 ; - std::vector< bool >::value_type arg4 ; - void *argp1 = 0 ; - int res1 = 0 ; - swig::SwigPyIterator *iter2 = 0 ; - int res2 ; - size_t val3 ; - int ecode3 = 0 ; - bool val4 ; - int ecode4 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOOO:BoolVector_insert",&obj0,&obj1,&obj2,&obj3)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_bool_std__allocatorT_bool_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "BoolVector_insert" "', argument " "1"" of type '" "std::vector< bool > *""'"); - } - arg1 = reinterpret_cast< std::vector< bool > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, SWIG_as_voidptrptr(&iter2), swig::SwigPyIterator::descriptor(), 0); - if (!SWIG_IsOK(res2) || !iter2) { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "BoolVector_insert" "', argument " 
"2"" of type '" "std::vector< bool >::iterator""'"); - } else { - swig::SwigPyIterator_T::iterator > *iter_t = dynamic_cast::iterator > *>(iter2); - if (iter_t) { - arg2 = iter_t->get_current(); - } else { - SWIG_exception_fail(SWIG_ArgError(SWIG_TypeError), "in method '" "BoolVector_insert" "', argument " "2"" of type '" "std::vector< bool >::iterator""'"); - } - } - ecode3 = SWIG_AsVal_size_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "BoolVector_insert" "', argument " "3"" of type '" "std::vector< bool >::size_type""'"); - } - arg3 = static_cast< std::vector< bool >::size_type >(val3); - ecode4 = SWIG_AsVal_bool(obj3, &val4); - if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "BoolVector_insert" "', argument " "4"" of type '" "std::vector< bool >::value_type""'"); - } - arg4 = static_cast< std::vector< bool >::value_type >(val4); - std_vector_Sl_bool_Sg__insert__SWIG_1(arg1,arg2,arg3,arg4); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_BoolVector_insert(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[5] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 4) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast::iterator > *>(iter) != 0)); - if (_v) { - { - int res = SWIG_AsVal_bool(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_BoolVector_insert__SWIG_0(self, args); - } - } - } - } - if (argc == 4) { - int _v; - int res = swig::asptr(argv[0], (std::vector >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - swig::SwigPyIterator *iter = 0; - int res = SWIG_ConvertPtr(argv[1], SWIG_as_voidptrptr(&iter), swig::SwigPyIterator::descriptor(), 0); - _v = (SWIG_IsOK(res) && iter && (dynamic_cast::iterator > *>(iter) != 0)); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_bool(argv[3], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_BoolVector_insert__SWIG_1(self, args); - } - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'BoolVector_insert'.\n" - " Possible C/C++ prototypes are:\n" - " std::vector< bool >::insert(std::vector< bool >::iterator,std::vector< bool >::value_type)\n" - " std::vector< bool >::insert(std::vector< bool >::iterator,std::vector< bool >::size_type,std::vector< bool >::value_type)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_BoolVector_reserve(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< bool > *arg1 = (std::vector< bool > *) 0 ; - std::vector< bool >::size_type arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - size_t val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:BoolVector_reserve",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_bool_std__allocatorT_bool_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in 
method '" "BoolVector_reserve" "', argument " "1"" of type '" "std::vector< bool > *""'"); - } - arg1 = reinterpret_cast< std::vector< bool > * >(argp1); - ecode2 = SWIG_AsVal_size_t(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "BoolVector_reserve" "', argument " "2"" of type '" "std::vector< bool >::size_type""'"); - } - arg2 = static_cast< std::vector< bool >::size_type >(val2); - (arg1)->reserve(arg2); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_BoolVector_capacity(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< bool > *arg1 = (std::vector< bool > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< bool >::size_type result; - - if (!PyArg_ParseTuple(args,(char *)"O:BoolVector_capacity",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_bool_std__allocatorT_bool_t_t, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "BoolVector_capacity" "', argument " "1"" of type '" "std::vector< bool > const *""'"); - } - arg1 = reinterpret_cast< std::vector< bool > * >(argp1); - result = ((std::vector< bool > const *)arg1)->capacity(); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_delete_BoolVector(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< bool > *arg1 = (std::vector< bool > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:delete_BoolVector",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_std__vectorT_bool_std__allocatorT_bool_t_t, SWIG_POINTER_DISOWN | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "delete_BoolVector" "', argument " "1"" of type '" "std::vector< bool > *""'"); - } - arg1 = reinterpret_cast< std::vector< bool > * >(argp1); - delete arg1; - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *BoolVector_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *obj; - if (!PyArg_ParseTuple(args,(char *)"O:swigregister", &obj)) return NULL; - SWIG_TypeNewClientData(SWIGTYPE_p_std__vectorT_bool_std__allocatorT_bool_t_t, SWIG_NewClientData(obj)); - return SWIG_Py_Void(); -} - -SWIGINTERN PyObject *_wrap_IntDoublePairCompSecondRev(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::pair< int,double > *arg1 = 0 ; - std::pair< int,double > *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"OO:IntDoublePairCompSecondRev",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1, SWIGTYPE_p_std__pairT_int_double_t, 0 | 0); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "IntDoublePairCompSecondRev" "', argument " "1"" of type '" "std::pair< int,double > const &""'"); - } - if (!argp1) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "IntDoublePairCompSecondRev" "', argument " "1"" of type '" "std::pair< int,double > const &""'"); - } - arg1 = reinterpret_cast< std::pair< int,double > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, &argp2, SWIGTYPE_p_std__pairT_int_double_t, 0 | 
0); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "IntDoublePairCompSecondRev" "', argument " "2"" of type '" "std::pair< int,double > const &""'"); - } - if (!argp2) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "IntDoublePairCompSecondRev" "', argument " "2"" of type '" "std::pair< int,double > const &""'"); - } - arg2 = reinterpret_cast< std::pair< int,double > * >(argp2); - result = (bool)pair_comp_second_rev< int,double >((std::pair< int,double > const &)*arg1,(std::pair< int,double > const &)*arg2); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_StringDoublePairCompSecondRev(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::pair< std::string,double > *arg1 = 0 ; - std::pair< std::string,double > *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"OO:StringDoublePairCompSecondRev",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1, SWIGTYPE_p_std__pairT_std__string_double_t, 0 | 0); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "StringDoublePairCompSecondRev" "', argument " "1"" of type '" "std::pair< std::string,double > const &""'"); - } - if (!argp1) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "StringDoublePairCompSecondRev" "', argument " "1"" of type '" "std::pair< std::string,double > const &""'"); - } - arg1 = reinterpret_cast< std::pair< std::string,double > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, &argp2, SWIGTYPE_p_std__pairT_std__string_double_t, 0 | 0); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "StringDoublePairCompSecondRev" "', argument " "2"" of type '" "std::pair< std::string,double > const &""'"); - } - if (!argp2) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "StringDoublePairCompSecondRev" "', argument " "2"" of type '" "std::pair< std::string,double > const &""'"); - } - arg2 = reinterpret_cast< std::pair< std::string,double > * >(argp2); - result = (bool)pair_comp_second_rev< std::string,double >((std::pair< std::string,double > const &)*arg1,(std::pair< std::string,double > const &)*arg2); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_DoubleStringPairCompFirstRev(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::pair< double,std::string > *arg1 = 0 ; - std::pair< double,std::string > *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"OO:DoubleStringPairCompFirstRev",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1, SWIGTYPE_p_std__pairT_double_std__string_t, 0 | 0); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "DoubleStringPairCompFirstRev" "', argument " "1"" of type '" "std::pair< double,std::string > const &""'"); - } - if (!argp1) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "DoubleStringPairCompFirstRev" "', argument " "1"" of type '" "std::pair< double,std::string > const &""'"); - } - arg1 = reinterpret_cast< std::pair< 
double,std::string > * >(argp1); - res2 = SWIG_ConvertPtr(obj1, &argp2, SWIGTYPE_p_std__pairT_double_std__string_t, 0 | 0); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "DoubleStringPairCompFirstRev" "', argument " "2"" of type '" "std::pair< double,std::string > const &""'"); - } - if (!argp2) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "DoubleStringPairCompFirstRev" "', argument " "2"" of type '" "std::pair< double,std::string > const &""'"); - } - arg2 = reinterpret_cast< std::pair< double,std::string > * >(argp2); - result = (bool)pair_comp_first_rev< double,std::string >((std::pair< double,std::string > const &)*arg1,(std::pair< double,std::string > const &)*arg2); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN int Swig_var_OOV_SCORE_set(PyObject *) { - SWIG_Error(SWIG_AttributeError,"Variable OOV_SCORE is read-only."); - return 1; -} - - -SWIGINTERN PyObject *Swig_var_OOV_SCORE_get(void) { - PyObject *pyobj = 0; - - pyobj = SWIG_From_double(static_cast< double >(OOV_SCORE)); - return pyobj; -} - - -SWIGINTERN int Swig_var_START_TOKEN_set(PyObject *) { - SWIG_Error(SWIG_AttributeError,"Variable START_TOKEN is read-only."); - return 1; -} - - -SWIGINTERN PyObject *Swig_var_START_TOKEN_get(void) { - PyObject *pyobj = 0; - - pyobj = SWIG_From_std_string(static_cast< std::string >(START_TOKEN)); - return pyobj; -} - - -SWIGINTERN int Swig_var_UNK_TOKEN_set(PyObject *) { - SWIG_Error(SWIG_AttributeError,"Variable UNK_TOKEN is read-only."); - return 1; -} - - -SWIGINTERN PyObject *Swig_var_UNK_TOKEN_get(void) { - PyObject *pyobj = 0; - - pyobj = SWIG_From_std_string(static_cast< std::string >(UNK_TOKEN)); - return pyobj; -} - - -SWIGINTERN int Swig_var_END_TOKEN_set(PyObject *) { - SWIG_Error(SWIG_AttributeError,"Variable END_TOKEN is read-only."); - return 1; -} - - -SWIGINTERN PyObject *Swig_var_END_TOKEN_get(void) { - PyObject *pyobj = 0; - - pyobj = SWIG_From_std_string(static_cast< std::string >(END_TOKEN)); - return pyobj; -} - - -SWIGINTERN PyObject *_wrap_new_RetriveStrEnumerateVocab(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - RetriveStrEnumerateVocab *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)":new_RetriveStrEnumerateVocab")) SWIG_fail; - result = (RetriveStrEnumerateVocab *)new RetriveStrEnumerateVocab(); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_RetriveStrEnumerateVocab, SWIG_POINTER_NEW | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_RetriveStrEnumerateVocab_Add(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - RetriveStrEnumerateVocab *arg1 = (RetriveStrEnumerateVocab *) 0 ; - lm::WordIndex arg2 ; - StringPiece *arg3 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 ; - int res2 = 0 ; - void *argp3 = 0 ; - int res3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:RetriveStrEnumerateVocab_Add",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_RetriveStrEnumerateVocab, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "RetriveStrEnumerateVocab_Add" "', argument " "1"" of type '" "RetriveStrEnumerateVocab *""'"); - } - arg1 = reinterpret_cast< RetriveStrEnumerateVocab * >(argp1); - { - res2 = SWIG_ConvertPtr(obj1, &argp2, SWIGTYPE_p_lm__WordIndex, 0 | 
0); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "RetriveStrEnumerateVocab_Add" "', argument " "2"" of type '" "lm::WordIndex""'"); - } - if (!argp2) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "RetriveStrEnumerateVocab_Add" "', argument " "2"" of type '" "lm::WordIndex""'"); - } else { - lm::WordIndex * temp = reinterpret_cast< lm::WordIndex * >(argp2); - arg2 = *temp; - if (SWIG_IsNewObj(res2)) delete temp; - } - } - res3 = SWIG_ConvertPtr(obj2, &argp3, SWIGTYPE_p_StringPiece, 0 | 0); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "RetriveStrEnumerateVocab_Add" "', argument " "3"" of type '" "StringPiece const &""'"); - } - if (!argp3) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "RetriveStrEnumerateVocab_Add" "', argument " "3"" of type '" "StringPiece const &""'"); - } - arg3 = reinterpret_cast< StringPiece * >(argp3); - (arg1)->Add(arg2,(StringPiece const &)*arg3); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_RetriveStrEnumerateVocab_vocabulary_set(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - RetriveStrEnumerateVocab *arg1 = (RetriveStrEnumerateVocab *) 0 ; - std::vector< std::string,std::allocator< std::string > > *arg2 = (std::vector< std::string,std::allocator< std::string > > *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:RetriveStrEnumerateVocab_vocabulary_set",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_RetriveStrEnumerateVocab, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "RetriveStrEnumerateVocab_vocabulary_set" "', argument " "1"" of type '" "RetriveStrEnumerateVocab *""'"); - } - arg1 = reinterpret_cast< RetriveStrEnumerateVocab * >(argp1); - res2 = SWIG_ConvertPtr(obj1, &argp2,SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "RetriveStrEnumerateVocab_vocabulary_set" "', argument " "2"" of type '" "std::vector< std::string,std::allocator< std::string > > *""'"); - } - arg2 = reinterpret_cast< std::vector< std::string,std::allocator< std::string > > * >(argp2); - if (arg1) (arg1)->vocabulary = *arg2; - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_RetriveStrEnumerateVocab_vocabulary_get(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - RetriveStrEnumerateVocab *arg1 = (RetriveStrEnumerateVocab *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - std::vector< std::string,std::allocator< std::string > > *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:RetriveStrEnumerateVocab_vocabulary_get",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_RetriveStrEnumerateVocab, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "RetriveStrEnumerateVocab_vocabulary_get" "', argument " "1"" of type '" "RetriveStrEnumerateVocab *""'"); - } - arg1 = reinterpret_cast< RetriveStrEnumerateVocab * >(argp1); - result = (std::vector< std::string,std::allocator< std::string > > *)& ((arg1)->vocabulary); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), 
SWIGTYPE_p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0 | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_delete_RetriveStrEnumerateVocab(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - RetriveStrEnumerateVocab *arg1 = (RetriveStrEnumerateVocab *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:delete_RetriveStrEnumerateVocab",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_RetriveStrEnumerateVocab, SWIG_POINTER_DISOWN | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "delete_RetriveStrEnumerateVocab" "', argument " "1"" of type '" "RetriveStrEnumerateVocab *""'"); - } - arg1 = reinterpret_cast< RetriveStrEnumerateVocab * >(argp1); - delete arg1; - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *RetriveStrEnumerateVocab_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *obj; - if (!PyArg_ParseTuple(args,(char *)"O:swigregister", &obj)) return NULL; - SWIG_TypeNewClientData(SWIGTYPE_p_RetriveStrEnumerateVocab, SWIG_NewClientData(obj)); - return SWIG_Py_Void(); -} - -SWIGINTERN PyObject *_wrap_new_Scorer(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - double arg1 ; - double arg2 ; - std::string *arg3 = 0 ; - std::vector< std::string,std::allocator< std::string > > *arg4 = 0 ; - double val1 ; - int ecode1 = 0 ; - double val2 ; - int ecode2 = 0 ; - int res3 = SWIG_OLDOBJ ; - int res4 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - Scorer *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOOO:new_Scorer",&obj0,&obj1,&obj2,&obj3)) SWIG_fail; - ecode1 = SWIG_AsVal_double(obj0, &val1); - if (!SWIG_IsOK(ecode1)) { - SWIG_exception_fail(SWIG_ArgError(ecode1), "in method '" "new_Scorer" "', argument " "1"" of type '" "double""'"); - } - arg1 = static_cast< double >(val1); - ecode2 = SWIG_AsVal_double(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "new_Scorer" "', argument " "2"" of type '" "double""'"); - } - arg2 = static_cast< double >(val2); - { - std::string *ptr = (std::string *)0; - res3 = SWIG_AsPtr_std_string(obj2, &ptr); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "new_Scorer" "', argument " "3"" of type '" "std::string const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "new_Scorer" "', argument " "3"" of type '" "std::string const &""'"); - } - arg3 = ptr; - } - { - std::vector< std::string,std::allocator< std::string > > *ptr = (std::vector< std::string,std::allocator< std::string > > *)0; - res4 = swig::asptr(obj3, &ptr); - if (!SWIG_IsOK(res4)) { - SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "new_Scorer" "', argument " "4"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "new_Scorer" "', argument " "4"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - arg4 = ptr; - } - result = (Scorer *)new Scorer(arg1,arg2,(std::string const &)*arg3,(std::vector< std::string,std::allocator< std::string > > const &)*arg4); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_Scorer, 
SWIG_POINTER_NEW | 0 ); - if (SWIG_IsNewObj(res3)) delete arg3; - if (SWIG_IsNewObj(res4)) delete arg4; - return resultobj; -fail: - if (SWIG_IsNewObj(res3)) delete arg3; - if (SWIG_IsNewObj(res4)) delete arg4; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_delete_Scorer(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - Scorer *arg1 = (Scorer *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:delete_Scorer",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Scorer, SWIG_POINTER_DISOWN | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "delete_Scorer" "', argument " "1"" of type '" "Scorer *""'"); - } - arg1 = reinterpret_cast< Scorer * >(argp1); - delete arg1; - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_Scorer_get_log_cond_prob(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - Scorer *arg1 = (Scorer *) 0 ; - std::vector< std::string,std::allocator< std::string > > *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res2 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - double result; - - if (!PyArg_ParseTuple(args,(char *)"OO:Scorer_get_log_cond_prob",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Scorer, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "Scorer_get_log_cond_prob" "', argument " "1"" of type '" "Scorer *""'"); - } - arg1 = reinterpret_cast< Scorer * >(argp1); - { - std::vector< std::string,std::allocator< std::string > > *ptr = (std::vector< std::string,std::allocator< std::string > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "Scorer_get_log_cond_prob" "', argument " "2"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "Scorer_get_log_cond_prob" "', argument " "2"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - arg2 = ptr; - } - result = (double)(arg1)->get_log_cond_prob((std::vector< std::string,std::allocator< std::string > > const &)*arg2); - resultobj = SWIG_From_double(static_cast< double >(result)); - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_Scorer_get_sent_log_prob(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - Scorer *arg1 = (Scorer *) 0 ; - std::vector< std::string,std::allocator< std::string > > *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res2 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - double result; - - if (!PyArg_ParseTuple(args,(char *)"OO:Scorer_get_sent_log_prob",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Scorer, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "Scorer_get_sent_log_prob" "', argument " "1"" of type '" "Scorer *""'"); - } - arg1 = reinterpret_cast< Scorer * >(argp1); - { - std::vector< std::string,std::allocator< std::string > > *ptr = (std::vector< std::string,std::allocator< std::string > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" 
"Scorer_get_sent_log_prob" "', argument " "2"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "Scorer_get_sent_log_prob" "', argument " "2"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - arg2 = ptr; - } - result = (double)(arg1)->get_sent_log_prob((std::vector< std::string,std::allocator< std::string > > const &)*arg2); - resultobj = SWIG_From_double(static_cast< double >(result)); - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_Scorer_get_max_order(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - Scorer *arg1 = (Scorer *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - size_t result; - - if (!PyArg_ParseTuple(args,(char *)"O:Scorer_get_max_order",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Scorer, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "Scorer_get_max_order" "', argument " "1"" of type '" "Scorer const *""'"); - } - arg1 = reinterpret_cast< Scorer * >(argp1); - result = ((Scorer const *)arg1)->get_max_order(); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_Scorer_get_dict_size(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - Scorer *arg1 = (Scorer *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - size_t result; - - if (!PyArg_ParseTuple(args,(char *)"O:Scorer_get_dict_size",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Scorer, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "Scorer_get_dict_size" "', argument " "1"" of type '" "Scorer const *""'"); - } - arg1 = reinterpret_cast< Scorer * >(argp1); - result = ((Scorer const *)arg1)->get_dict_size(); - resultobj = SWIG_From_size_t(static_cast< size_t >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_Scorer_is_character_based(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - Scorer *arg1 = (Scorer *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - bool result; - - if (!PyArg_ParseTuple(args,(char *)"O:Scorer_is_character_based",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Scorer, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "Scorer_is_character_based" "', argument " "1"" of type '" "Scorer const *""'"); - } - arg1 = reinterpret_cast< Scorer * >(argp1); - result = (bool)((Scorer const *)arg1)->is_character_based(); - resultobj = SWIG_From_bool(static_cast< bool >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_Scorer_reset_params(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - Scorer *arg1 = (Scorer *) 0 ; - float arg2 ; - float arg3 ; - void *argp1 = 0 ; - int res1 = 0 ; - float val2 ; - int ecode2 = 0 ; - float val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OOO:Scorer_reset_params",&obj0,&obj1,&obj2)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Scorer, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - 
SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "Scorer_reset_params" "', argument " "1"" of type '" "Scorer *""'"); - } - arg1 = reinterpret_cast< Scorer * >(argp1); - ecode2 = SWIG_AsVal_float(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "Scorer_reset_params" "', argument " "2"" of type '" "float""'"); - } - arg2 = static_cast< float >(val2); - ecode3 = SWIG_AsVal_float(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "Scorer_reset_params" "', argument " "3"" of type '" "float""'"); - } - arg3 = static_cast< float >(val3); - (arg1)->reset_params(arg2,arg3); - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_Scorer_make_ngram(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - Scorer *arg1 = (Scorer *) 0 ; - PathTrie *arg2 = (PathTrie *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - void *argp2 = 0 ; - int res2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< std::string,std::allocator< std::string > > result; - - if (!PyArg_ParseTuple(args,(char *)"OO:Scorer_make_ngram",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Scorer, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "Scorer_make_ngram" "', argument " "1"" of type '" "Scorer *""'"); - } - arg1 = reinterpret_cast< Scorer * >(argp1); - res2 = SWIG_ConvertPtr(obj1, &argp2,SWIGTYPE_p_PathTrie, 0 | 0 ); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "Scorer_make_ngram" "', argument " "2"" of type '" "PathTrie *""'"); - } - arg2 = reinterpret_cast< PathTrie * >(argp2); - result = (arg1)->make_ngram(arg2); - resultobj = swig::from(static_cast< std::vector< std::string,std::allocator< std::string > > >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_Scorer_split_labels(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - Scorer *arg1 = (Scorer *) 0 ; - std::vector< int,std::allocator< int > > *arg2 = 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res2 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::vector< std::string,std::allocator< std::string > > result; - - if (!PyArg_ParseTuple(args,(char *)"OO:Scorer_split_labels",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Scorer, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "Scorer_split_labels" "', argument " "1"" of type '" "Scorer *""'"); - } - arg1 = reinterpret_cast< Scorer * >(argp1); - { - std::vector< int,std::allocator< int > > *ptr = (std::vector< int,std::allocator< int > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "Scorer_split_labels" "', argument " "2"" of type '" "std::vector< int,std::allocator< int > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "Scorer_split_labels" "', argument " "2"" of type '" "std::vector< int,std::allocator< int > > const &""'"); - } - arg2 = ptr; - } - result = (arg1)->split_labels((std::vector< int,std::allocator< int > > const &)*arg2); - resultobj = swig::from(static_cast< std::vector< std::string,std::allocator< std::string > > >(result)); - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res2)) 
delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_Scorer_alpha_set(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - Scorer *arg1 = (Scorer *) 0 ; - double arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - double val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:Scorer_alpha_set",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Scorer, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "Scorer_alpha_set" "', argument " "1"" of type '" "Scorer *""'"); - } - arg1 = reinterpret_cast< Scorer * >(argp1); - ecode2 = SWIG_AsVal_double(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "Scorer_alpha_set" "', argument " "2"" of type '" "double""'"); - } - arg2 = static_cast< double >(val2); - if (arg1) (arg1)->alpha = arg2; - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_Scorer_alpha_get(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - Scorer *arg1 = (Scorer *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - double result; - - if (!PyArg_ParseTuple(args,(char *)"O:Scorer_alpha_get",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Scorer, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "Scorer_alpha_get" "', argument " "1"" of type '" "Scorer *""'"); - } - arg1 = reinterpret_cast< Scorer * >(argp1); - result = (double) ((arg1)->alpha); - resultobj = SWIG_From_double(static_cast< double >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_Scorer_beta_set(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - Scorer *arg1 = (Scorer *) 0 ; - double arg2 ; - void *argp1 = 0 ; - int res1 = 0 ; - double val2 ; - int ecode2 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:Scorer_beta_set",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Scorer, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "Scorer_beta_set" "', argument " "1"" of type '" "Scorer *""'"); - } - arg1 = reinterpret_cast< Scorer * >(argp1); - ecode2 = SWIG_AsVal_double(obj1, &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "Scorer_beta_set" "', argument " "2"" of type '" "double""'"); - } - arg2 = static_cast< double >(val2); - if (arg1) (arg1)->beta = arg2; - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_Scorer_beta_get(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - Scorer *arg1 = (Scorer *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - double result; - - if (!PyArg_ParseTuple(args,(char *)"O:Scorer_beta_get",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Scorer, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "Scorer_beta_get" "', argument " "1"" of type '" "Scorer *""'"); - } - arg1 = reinterpret_cast< Scorer * >(argp1); - result = (double) ((arg1)->beta); - resultobj = SWIG_From_double(static_cast< double >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_Scorer_dictionary_set(PyObject *SWIGUNUSEDPARM(self), PyObject *args) 
{ - PyObject *resultobj = 0; - Scorer *arg1 = (Scorer *) 0 ; - void *arg2 = (void *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - int res2 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"OO:Scorer_dictionary_set",&obj0,&obj1)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Scorer, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "Scorer_dictionary_set" "', argument " "1"" of type '" "Scorer *""'"); - } - arg1 = reinterpret_cast< Scorer * >(argp1); - res2 = SWIG_ConvertPtr(obj1,SWIG_as_voidptrptr(&arg2), 0, SWIG_POINTER_DISOWN); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "Scorer_dictionary_set" "', argument " "2"" of type '" "void *""'"); - } - if (arg1) (arg1)->dictionary = arg2; - resultobj = SWIG_Py_Void(); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_Scorer_dictionary_get(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - Scorer *arg1 = (Scorer *) 0 ; - void *argp1 = 0 ; - int res1 = 0 ; - PyObject * obj0 = 0 ; - void *result = 0 ; - - if (!PyArg_ParseTuple(args,(char *)"O:Scorer_dictionary_get",&obj0)) SWIG_fail; - res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_Scorer, 0 | 0 ); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "Scorer_dictionary_get" "', argument " "1"" of type '" "Scorer *""'"); - } - arg1 = reinterpret_cast< Scorer * >(argp1); - result = (void *) ((arg1)->dictionary); - resultobj = SWIG_NewPointerObj(SWIG_as_voidptr(result), SWIGTYPE_p_void, 0 | 0 ); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *Scorer_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *obj; - if (!PyArg_ParseTuple(args,(char *)"O:swigregister", &obj)) return NULL; - SWIG_TypeNewClientData(SWIGTYPE_p_Scorer, SWIG_NewClientData(obj)); - return SWIG_Py_Void(); -} - -SWIGINTERN PyObject *_wrap_ctc_beam_search_decoder__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *arg1 = 0 ; - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *arg2 = 0 ; - PathTrie *arg3 = 0 ; - bool arg4 ; - size_t arg5 ; - int arg6 ; - int arg7 ; - double arg8 ; - Scorer *arg9 = (Scorer *) 0 ; - int res1 = SWIG_OLDOBJ ; - int res2 = SWIG_OLDOBJ ; - void *argp3 = 0 ; - int res3 = 0 ; - bool val4 ; - int ecode4 = 0 ; - size_t val5 ; - int ecode5 = 0 ; - int val6 ; - int ecode6 = 0 ; - int val7 ; - int ecode7 = 0 ; - double val8 ; - int ecode8 = 0 ; - void *argp9 = 0 ; - int res9 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - PyObject * obj4 = 0 ; - PyObject * obj5 = 0 ; - PyObject * obj6 = 0 ; - PyObject * obj7 = 0 ; - PyObject * obj8 = 0 ; - std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > result; - - if (!PyArg_ParseTuple(args,(char *)"OOOOOOOOO:ctc_beam_search_decoder",&obj0,&obj1,&obj2,&obj3,&obj4,&obj5,&obj6,&obj7,&obj8)) SWIG_fail; - { - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *ptr = (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< 
double,std::allocator< double > > > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ctc_beam_search_decoder" "', argument " "1"" of type '" "std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder" "', argument " "1"" of type '" "std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &""'"); - } - arg1 = ptr; - } - { - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *ptr = (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "ctc_beam_search_decoder" "', argument " "2"" of type '" "std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder" "', argument " "2"" of type '" "std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &""'"); - } - arg2 = ptr; - } - res3 = SWIG_ConvertPtr(obj2, &argp3, SWIGTYPE_p_PathTrie, 0 ); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "ctc_beam_search_decoder" "', argument " "3"" of type '" "PathTrie &""'"); - } - if (!argp3) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder" "', argument " "3"" of type '" "PathTrie &""'"); - } - arg3 = reinterpret_cast< PathTrie * >(argp3); - ecode4 = SWIG_AsVal_bool(obj3, &val4); - if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "ctc_beam_search_decoder" "', argument " "4"" of type '" "bool""'"); - } - arg4 = static_cast< bool >(val4); - ecode5 = SWIG_AsVal_size_t(obj4, &val5); - if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "ctc_beam_search_decoder" "', argument " "5"" of type '" "size_t""'"); - } - arg5 = static_cast< size_t >(val5); - ecode6 = SWIG_AsVal_int(obj5, &val6); - if (!SWIG_IsOK(ecode6)) { - SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "ctc_beam_search_decoder" "', argument " "6"" of type '" "int""'"); - } - arg6 = static_cast< int >(val6); - ecode7 = SWIG_AsVal_int(obj6, &val7); - if (!SWIG_IsOK(ecode7)) { - SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "ctc_beam_search_decoder" "', argument " "7"" of type '" "int""'"); - } - arg7 = static_cast< int >(val7); - ecode8 = SWIG_AsVal_double(obj7, &val8); - if (!SWIG_IsOK(ecode8)) { - SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "ctc_beam_search_decoder" "', argument " "8"" of type '" "double""'"); - } - arg8 = static_cast< double >(val8); - res9 = SWIG_ConvertPtr(obj8, &argp9,SWIGTYPE_p_Scorer, 0 | 0 ); - if (!SWIG_IsOK(res9)) { - SWIG_exception_fail(SWIG_ArgError(res9), "in method '" "ctc_beam_search_decoder" "', argument " "9"" of type '" "Scorer *""'"); - } - arg9 = reinterpret_cast< Scorer * >(argp9); - result = ctc_beam_search_decoder((std::vector< std::vector< double,std::allocator< double > 
>,std::allocator< std::vector< double,std::allocator< double > > > > const &)*arg1,(std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &)*arg2,*arg3,arg4,arg5,arg6,arg7,arg8,arg9); - resultobj = swig::from(static_cast< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > >(result)); - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_ctc_beam_search_decoder__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *arg1 = 0 ; - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *arg2 = 0 ; - PathTrie *arg3 = 0 ; - bool arg4 ; - size_t arg5 ; - int arg6 ; - int arg7 ; - double arg8 ; - int res1 = SWIG_OLDOBJ ; - int res2 = SWIG_OLDOBJ ; - void *argp3 = 0 ; - int res3 = 0 ; - bool val4 ; - int ecode4 = 0 ; - size_t val5 ; - int ecode5 = 0 ; - int val6 ; - int ecode6 = 0 ; - int val7 ; - int ecode7 = 0 ; - double val8 ; - int ecode8 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - PyObject * obj4 = 0 ; - PyObject * obj5 = 0 ; - PyObject * obj6 = 0 ; - PyObject * obj7 = 0 ; - std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > result; - - if (!PyArg_ParseTuple(args,(char *)"OOOOOOOO:ctc_beam_search_decoder",&obj0,&obj1,&obj2,&obj3,&obj4,&obj5,&obj6,&obj7)) SWIG_fail; - { - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *ptr = (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ctc_beam_search_decoder" "', argument " "1"" of type '" "std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder" "', argument " "1"" of type '" "std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &""'"); - } - arg1 = ptr; - } - { - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *ptr = (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "ctc_beam_search_decoder" "', argument " "2"" of type '" "std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder" "', argument " "2"" of type '" "std::vector< 
std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &""'"); - } - arg2 = ptr; - } - res3 = SWIG_ConvertPtr(obj2, &argp3, SWIGTYPE_p_PathTrie, 0 ); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "ctc_beam_search_decoder" "', argument " "3"" of type '" "PathTrie &""'"); - } - if (!argp3) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder" "', argument " "3"" of type '" "PathTrie &""'"); - } - arg3 = reinterpret_cast< PathTrie * >(argp3); - ecode4 = SWIG_AsVal_bool(obj3, &val4); - if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "ctc_beam_search_decoder" "', argument " "4"" of type '" "bool""'"); - } - arg4 = static_cast< bool >(val4); - ecode5 = SWIG_AsVal_size_t(obj4, &val5); - if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "ctc_beam_search_decoder" "', argument " "5"" of type '" "size_t""'"); - } - arg5 = static_cast< size_t >(val5); - ecode6 = SWIG_AsVal_int(obj5, &val6); - if (!SWIG_IsOK(ecode6)) { - SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "ctc_beam_search_decoder" "', argument " "6"" of type '" "int""'"); - } - arg6 = static_cast< int >(val6); - ecode7 = SWIG_AsVal_int(obj6, &val7); - if (!SWIG_IsOK(ecode7)) { - SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "ctc_beam_search_decoder" "', argument " "7"" of type '" "int""'"); - } - arg7 = static_cast< int >(val7); - ecode8 = SWIG_AsVal_double(obj7, &val8); - if (!SWIG_IsOK(ecode8)) { - SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "ctc_beam_search_decoder" "', argument " "8"" of type '" "double""'"); - } - arg8 = static_cast< double >(val8); - result = ctc_beam_search_decoder((std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &)*arg1,(std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &)*arg2,*arg3,arg4,arg5,arg6,arg7,arg8); - resultobj = swig::from(static_cast< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > >(result)); - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_ctc_beam_search_decoder__SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *arg1 = 0 ; - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *arg2 = 0 ; - PathTrie *arg3 = 0 ; - bool arg4 ; - size_t arg5 ; - int arg6 ; - int arg7 ; - int res1 = SWIG_OLDOBJ ; - int res2 = SWIG_OLDOBJ ; - void *argp3 = 0 ; - int res3 = 0 ; - bool val4 ; - int ecode4 = 0 ; - size_t val5 ; - int ecode5 = 0 ; - int val6 ; - int ecode6 = 0 ; - int val7 ; - int ecode7 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - PyObject * obj4 = 0 ; - PyObject * obj5 = 0 ; - PyObject * obj6 = 0 ; - std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< 
int > > > > > result; - - if (!PyArg_ParseTuple(args,(char *)"OOOOOOO:ctc_beam_search_decoder",&obj0,&obj1,&obj2,&obj3,&obj4,&obj5,&obj6)) SWIG_fail; - { - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *ptr = (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ctc_beam_search_decoder" "', argument " "1"" of type '" "std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder" "', argument " "1"" of type '" "std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &""'"); - } - arg1 = ptr; - } - { - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *ptr = (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "ctc_beam_search_decoder" "', argument " "2"" of type '" "std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder" "', argument " "2"" of type '" "std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &""'"); - } - arg2 = ptr; - } - res3 = SWIG_ConvertPtr(obj2, &argp3, SWIGTYPE_p_PathTrie, 0 ); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "ctc_beam_search_decoder" "', argument " "3"" of type '" "PathTrie &""'"); - } - if (!argp3) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder" "', argument " "3"" of type '" "PathTrie &""'"); - } - arg3 = reinterpret_cast< PathTrie * >(argp3); - ecode4 = SWIG_AsVal_bool(obj3, &val4); - if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "ctc_beam_search_decoder" "', argument " "4"" of type '" "bool""'"); - } - arg4 = static_cast< bool >(val4); - ecode5 = SWIG_AsVal_size_t(obj4, &val5); - if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "ctc_beam_search_decoder" "', argument " "5"" of type '" "size_t""'"); - } - arg5 = static_cast< size_t >(val5); - ecode6 = SWIG_AsVal_int(obj5, &val6); - if (!SWIG_IsOK(ecode6)) { - SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "ctc_beam_search_decoder" "', argument " "6"" of type '" "int""'"); - } - arg6 = static_cast< int >(val6); - ecode7 = SWIG_AsVal_int(obj6, &val7); - if (!SWIG_IsOK(ecode7)) { - SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "ctc_beam_search_decoder" "', argument " "7"" of type '" "int""'"); - } - arg7 = static_cast< int >(val7); - result = ctc_beam_search_decoder((std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &)*arg1,(std::vector< std::vector< int,std::allocator< int > 
>,std::allocator< std::vector< int,std::allocator< int > > > > const &)*arg2,*arg3,arg4,arg5,arg6,arg7); - resultobj = swig::from(static_cast< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > >(result)); - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_ctc_beam_search_decoder__SWIG_3(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *arg1 = 0 ; - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *arg2 = 0 ; - PathTrie *arg3 = 0 ; - bool arg4 ; - size_t arg5 ; - int arg6 ; - int res1 = SWIG_OLDOBJ ; - int res2 = SWIG_OLDOBJ ; - void *argp3 = 0 ; - int res3 = 0 ; - bool val4 ; - int ecode4 = 0 ; - size_t val5 ; - int ecode5 = 0 ; - int val6 ; - int ecode6 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - PyObject * obj4 = 0 ; - PyObject * obj5 = 0 ; - std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > result; - - if (!PyArg_ParseTuple(args,(char *)"OOOOOO:ctc_beam_search_decoder",&obj0,&obj1,&obj2,&obj3,&obj4,&obj5)) SWIG_fail; - { - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *ptr = (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ctc_beam_search_decoder" "', argument " "1"" of type '" "std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder" "', argument " "1"" of type '" "std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &""'"); - } - arg1 = ptr; - } - { - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *ptr = (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "ctc_beam_search_decoder" "', argument " "2"" of type '" "std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder" "', argument " "2"" of type '" "std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &""'"); - } - arg2 = ptr; - } - res3 = SWIG_ConvertPtr(obj2, &argp3, SWIGTYPE_p_PathTrie, 0 ); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "ctc_beam_search_decoder" 
"', argument " "3"" of type '" "PathTrie &""'"); - } - if (!argp3) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder" "', argument " "3"" of type '" "PathTrie &""'"); - } - arg3 = reinterpret_cast< PathTrie * >(argp3); - ecode4 = SWIG_AsVal_bool(obj3, &val4); - if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "ctc_beam_search_decoder" "', argument " "4"" of type '" "bool""'"); - } - arg4 = static_cast< bool >(val4); - ecode5 = SWIG_AsVal_size_t(obj4, &val5); - if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "ctc_beam_search_decoder" "', argument " "5"" of type '" "size_t""'"); - } - arg5 = static_cast< size_t >(val5); - ecode6 = SWIG_AsVal_int(obj5, &val6); - if (!SWIG_IsOK(ecode6)) { - SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "ctc_beam_search_decoder" "', argument " "6"" of type '" "int""'"); - } - arg6 = static_cast< int >(val6); - result = ctc_beam_search_decoder((std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &)*arg1,(std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &)*arg2,*arg3,arg4,arg5,arg6); - resultobj = swig::from(static_cast< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > >(result)); - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_ctc_beam_search_decoder__SWIG_4(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *arg1 = 0 ; - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *arg2 = 0 ; - PathTrie *arg3 = 0 ; - bool arg4 ; - size_t arg5 ; - int res1 = SWIG_OLDOBJ ; - int res2 = SWIG_OLDOBJ ; - void *argp3 = 0 ; - int res3 = 0 ; - bool val4 ; - int ecode4 = 0 ; - size_t val5 ; - int ecode5 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - PyObject * obj4 = 0 ; - std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > result; - - if (!PyArg_ParseTuple(args,(char *)"OOOOO:ctc_beam_search_decoder",&obj0,&obj1,&obj2,&obj3,&obj4)) SWIG_fail; - { - std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *ptr = (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ctc_beam_search_decoder" "', argument " "1"" of type '" "std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder" "', argument " "1"" of type '" "std::vector< std::vector< double,std::allocator< double > 
>,std::allocator< std::vector< double,std::allocator< double > > > > const &""'"); - } - arg1 = ptr; - } - { - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *ptr = (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "ctc_beam_search_decoder" "', argument " "2"" of type '" "std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder" "', argument " "2"" of type '" "std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &""'"); - } - arg2 = ptr; - } - res3 = SWIG_ConvertPtr(obj2, &argp3, SWIGTYPE_p_PathTrie, 0 ); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "ctc_beam_search_decoder" "', argument " "3"" of type '" "PathTrie &""'"); - } - if (!argp3) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder" "', argument " "3"" of type '" "PathTrie &""'"); - } - arg3 = reinterpret_cast< PathTrie * >(argp3); - ecode4 = SWIG_AsVal_bool(obj3, &val4); - if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "ctc_beam_search_decoder" "', argument " "4"" of type '" "bool""'"); - } - arg4 = static_cast< bool >(val4); - ecode5 = SWIG_AsVal_size_t(obj4, &val5); - if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "ctc_beam_search_decoder" "', argument " "5"" of type '" "size_t""'"); - } - arg5 = static_cast< size_t >(val5); - result = ctc_beam_search_decoder((std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &)*arg1,(std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &)*arg2,*arg3,arg4,arg5); - resultobj = swig::from(static_cast< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > >(result)); - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_ctc_beam_search_decoder(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[10] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 9) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 5) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - int res = swig::asptr(argv[1], (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - void *vptr = 0; - int res = SWIG_ConvertPtr(argv[2], &vptr, SWIGTYPE_p_PathTrie, 0); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = 
SWIG_AsVal_bool(argv[3], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[4], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_ctc_beam_search_decoder__SWIG_4(self, args); - } - } - } - } - } - } - if (argc == 6) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - int res = swig::asptr(argv[1], (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - void *vptr = 0; - int res = SWIG_ConvertPtr(argv[2], &vptr, SWIGTYPE_p_PathTrie, 0); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_bool(argv[3], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[4], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_int(argv[5], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_ctc_beam_search_decoder__SWIG_3(self, args); - } - } - } - } - } - } - } - if (argc == 7) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - int res = swig::asptr(argv[1], (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - void *vptr = 0; - int res = SWIG_ConvertPtr(argv[2], &vptr, SWIGTYPE_p_PathTrie, 0); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_bool(argv[3], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[4], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_int(argv[5], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_int(argv[6], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_ctc_beam_search_decoder__SWIG_2(self, args); - } - } - } - } - } - } - } - } - if (argc == 8) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - int res = swig::asptr(argv[1], (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - void *vptr = 0; - int res = SWIG_ConvertPtr(argv[2], &vptr, SWIGTYPE_p_PathTrie, 0); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_bool(argv[3], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[4], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_int(argv[5], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_int(argv[6], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_double(argv[7], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_ctc_beam_search_decoder__SWIG_1(self, args); - } - } - } - } - } - } - } - } - } - if (argc == 9) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >**)(0)); - _v = 
SWIG_CheckState(res); - if (_v) { - int res = swig::asptr(argv[1], (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - void *vptr = 0; - int res = SWIG_ConvertPtr(argv[2], &vptr, SWIGTYPE_p_PathTrie, 0); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_bool(argv[3], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[4], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_int(argv[5], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_int(argv[6], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_double(argv[7], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - void *vptr = 0; - int res = SWIG_ConvertPtr(argv[8], &vptr, SWIGTYPE_p_Scorer, 0); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_ctc_beam_search_decoder__SWIG_0(self, args); - } - } - } - } - } - } - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'ctc_beam_search_decoder'.\n" - " Possible C/C++ prototypes are:\n" - " ctc_beam_search_decoder(std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &,std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &,PathTrie &,bool const,size_t,int,int,double,Scorer *)\n" - " ctc_beam_search_decoder(std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &,std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &,PathTrie &,bool const,size_t,int,int,double)\n" - " ctc_beam_search_decoder(std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &,std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &,PathTrie &,bool const,size_t,int,int)\n" - " ctc_beam_search_decoder(std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &,std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &,PathTrie &,bool const,size_t,int)\n" - " ctc_beam_search_decoder(std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > const &,std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &,PathTrie &,bool const,size_t)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_ctc_beam_search_decoder_batch__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > *arg1 = 0 ; - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > 
>,std::allocator< std::vector< int,std::allocator< int > > > > > > *arg2 = 0 ; - std::vector< PathTrie *,std::allocator< PathTrie * > > *arg3 = 0 ; - std::vector< bool,std::allocator< bool > > *arg4 = 0 ; - size_t arg5 ; - size_t arg6 ; - int arg7 ; - int arg8 ; - double arg9 ; - Scorer *arg10 = (Scorer *) 0 ; - int res1 = SWIG_OLDOBJ ; - int res2 = SWIG_OLDOBJ ; - void *argp3 = 0 ; - int res3 = 0 ; - int res4 = SWIG_OLDOBJ ; - size_t val5 ; - int ecode5 = 0 ; - size_t val6 ; - int ecode6 = 0 ; - int val7 ; - int ecode7 = 0 ; - int val8 ; - int ecode8 = 0 ; - double val9 ; - int ecode9 = 0 ; - void *argp10 = 0 ; - int res10 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - PyObject * obj4 = 0 ; - PyObject * obj5 = 0 ; - PyObject * obj6 = 0 ; - PyObject * obj7 = 0 ; - PyObject * obj8 = 0 ; - PyObject * obj9 = 0 ; - std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > > result; - - if (!PyArg_ParseTuple(args,(char *)"OOOOOOOOOO:ctc_beam_search_decoder_batch",&obj0,&obj1,&obj2,&obj3,&obj4,&obj5,&obj6,&obj7,&obj8,&obj9)) SWIG_fail; - { - std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > *ptr = (std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ctc_beam_search_decoder_batch" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder_batch" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > const &""'"); - } - arg1 = ptr; - } - { - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *ptr = (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - 
SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "ctc_beam_search_decoder_batch" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder_batch" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &""'"); - } - arg2 = ptr; - } - res3 = SWIG_ConvertPtr(obj2, &argp3, SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 ); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "ctc_beam_search_decoder_batch" "', argument " "3"" of type '" "std::vector< PathTrie *,std::allocator< PathTrie * > > &""'"); - } - if (!argp3) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder_batch" "', argument " "3"" of type '" "std::vector< PathTrie *,std::allocator< PathTrie * > > &""'"); - } - arg3 = reinterpret_cast< std::vector< PathTrie *,std::allocator< PathTrie * > > * >(argp3); - { - std::vector > *ptr = (std::vector > *)0; - res4 = swig::asptr(obj3, &ptr); - if (!SWIG_IsOK(res4)) { - SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "ctc_beam_search_decoder_batch" "', argument " "4"" of type '" "std::vector< bool,std::allocator< bool > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder_batch" "', argument " "4"" of type '" "std::vector< bool,std::allocator< bool > > const &""'"); - } - arg4 = ptr; - } - ecode5 = SWIG_AsVal_size_t(obj4, &val5); - if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "ctc_beam_search_decoder_batch" "', argument " "5"" of type '" "size_t""'"); - } - arg5 = static_cast< size_t >(val5); - ecode6 = SWIG_AsVal_size_t(obj5, &val6); - if (!SWIG_IsOK(ecode6)) { - SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "ctc_beam_search_decoder_batch" "', argument " "6"" of type '" "size_t""'"); - } - arg6 = static_cast< size_t >(val6); - ecode7 = SWIG_AsVal_int(obj6, &val7); - if (!SWIG_IsOK(ecode7)) { - SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "ctc_beam_search_decoder_batch" "', argument " "7"" of type '" "int""'"); - } - arg7 = static_cast< int >(val7); - ecode8 = SWIG_AsVal_int(obj7, &val8); - if (!SWIG_IsOK(ecode8)) { - SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "ctc_beam_search_decoder_batch" "', argument " "8"" of type '" "int""'"); - } - arg8 = static_cast< int >(val8); - ecode9 = SWIG_AsVal_double(obj8, &val9); - if (!SWIG_IsOK(ecode9)) { - SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "ctc_beam_search_decoder_batch" "', argument " "9"" of type '" "double""'"); - } - arg9 = static_cast< double >(val9); - res10 = SWIG_ConvertPtr(obj9, &argp10,SWIGTYPE_p_Scorer, 0 | 0 ); - if (!SWIG_IsOK(res10)) { - SWIG_exception_fail(SWIG_ArgError(res10), "in method '" "ctc_beam_search_decoder_batch" "', argument " "10"" of type '" "Scorer *""'"); - } - arg10 = reinterpret_cast< Scorer * >(argp10); - result = 
ctc_beam_search_decoder_batch((std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > const &)*arg1,(std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &)*arg2,*arg3,(std::vector< bool,std::allocator< bool > > const &)*arg4,arg5,arg6,arg7,arg8,arg9,arg10); - resultobj = swig::from(static_cast< std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > > >(result)); - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - if (SWIG_IsNewObj(res4)) delete arg4; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - if (SWIG_IsNewObj(res4)) delete arg4; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_ctc_beam_search_decoder_batch__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > *arg1 = 0 ; - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *arg2 = 0 ; - std::vector< PathTrie *,std::allocator< PathTrie * > > *arg3 = 0 ; - std::vector< bool,std::allocator< bool > > *arg4 = 0 ; - size_t arg5 ; - size_t arg6 ; - int arg7 ; - int arg8 ; - double arg9 ; - int res1 = SWIG_OLDOBJ ; - int res2 = SWIG_OLDOBJ ; - void *argp3 = 0 ; - int res3 = 0 ; - int res4 = SWIG_OLDOBJ ; - size_t val5 ; - int ecode5 = 0 ; - size_t val6 ; - int ecode6 = 0 ; - int val7 ; - int ecode7 = 0 ; - int val8 ; - int ecode8 = 0 ; - double val9 ; - int ecode9 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - PyObject * obj4 = 0 ; - PyObject * obj5 = 0 ; - PyObject * obj6 = 0 ; - PyObject * obj7 = 0 ; - PyObject * obj8 = 0 ; - std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > > result; - - if (!PyArg_ParseTuple(args,(char *)"OOOOOOOOO:ctc_beam_search_decoder_batch",&obj0,&obj1,&obj2,&obj3,&obj4,&obj5,&obj6,&obj7,&obj8)) SWIG_fail; - { - std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< 
std::vector< double,std::allocator< double > > > > > > *ptr = (std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ctc_beam_search_decoder_batch" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder_batch" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > const &""'"); - } - arg1 = ptr; - } - { - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *ptr = (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "ctc_beam_search_decoder_batch" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder_batch" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &""'"); - } - arg2 = ptr; - } - res3 = SWIG_ConvertPtr(obj2, &argp3, SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 ); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "ctc_beam_search_decoder_batch" "', argument " "3"" of type '" "std::vector< PathTrie *,std::allocator< PathTrie * > > &""'"); - } - if (!argp3) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder_batch" "', argument " "3"" of type '" "std::vector< PathTrie *,std::allocator< PathTrie * > > &""'"); - } - arg3 = reinterpret_cast< std::vector< PathTrie *,std::allocator< PathTrie * > > * >(argp3); - { - std::vector > *ptr = (std::vector > *)0; - res4 = swig::asptr(obj3, &ptr); - if (!SWIG_IsOK(res4)) { - 
SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "ctc_beam_search_decoder_batch" "', argument " "4"" of type '" "std::vector< bool,std::allocator< bool > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder_batch" "', argument " "4"" of type '" "std::vector< bool,std::allocator< bool > > const &""'"); - } - arg4 = ptr; - } - ecode5 = SWIG_AsVal_size_t(obj4, &val5); - if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "ctc_beam_search_decoder_batch" "', argument " "5"" of type '" "size_t""'"); - } - arg5 = static_cast< size_t >(val5); - ecode6 = SWIG_AsVal_size_t(obj5, &val6); - if (!SWIG_IsOK(ecode6)) { - SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "ctc_beam_search_decoder_batch" "', argument " "6"" of type '" "size_t""'"); - } - arg6 = static_cast< size_t >(val6); - ecode7 = SWIG_AsVal_int(obj6, &val7); - if (!SWIG_IsOK(ecode7)) { - SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "ctc_beam_search_decoder_batch" "', argument " "7"" of type '" "int""'"); - } - arg7 = static_cast< int >(val7); - ecode8 = SWIG_AsVal_int(obj7, &val8); - if (!SWIG_IsOK(ecode8)) { - SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "ctc_beam_search_decoder_batch" "', argument " "8"" of type '" "int""'"); - } - arg8 = static_cast< int >(val8); - ecode9 = SWIG_AsVal_double(obj8, &val9); - if (!SWIG_IsOK(ecode9)) { - SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "ctc_beam_search_decoder_batch" "', argument " "9"" of type '" "double""'"); - } - arg9 = static_cast< double >(val9); - result = ctc_beam_search_decoder_batch((std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > const &)*arg1,(std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &)*arg2,*arg3,(std::vector< bool,std::allocator< bool > > const &)*arg4,arg5,arg6,arg7,arg8,arg9); - resultobj = swig::from(static_cast< std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > > >(result)); - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - if (SWIG_IsNewObj(res4)) delete arg4; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - if (SWIG_IsNewObj(res4)) delete arg4; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_ctc_beam_search_decoder_batch__SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > *arg1 = 0 ; - std::vector< std::vector< std::vector< int,std::allocator< int > 
>,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *arg2 = 0 ; - std::vector< PathTrie *,std::allocator< PathTrie * > > *arg3 = 0 ; - std::vector< bool,std::allocator< bool > > *arg4 = 0 ; - size_t arg5 ; - size_t arg6 ; - int arg7 ; - int arg8 ; - int res1 = SWIG_OLDOBJ ; - int res2 = SWIG_OLDOBJ ; - void *argp3 = 0 ; - int res3 = 0 ; - int res4 = SWIG_OLDOBJ ; - size_t val5 ; - int ecode5 = 0 ; - size_t val6 ; - int ecode6 = 0 ; - int val7 ; - int ecode7 = 0 ; - int val8 ; - int ecode8 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - PyObject * obj4 = 0 ; - PyObject * obj5 = 0 ; - PyObject * obj6 = 0 ; - PyObject * obj7 = 0 ; - std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > > result; - - if (!PyArg_ParseTuple(args,(char *)"OOOOOOOO:ctc_beam_search_decoder_batch",&obj0,&obj1,&obj2,&obj3,&obj4,&obj5,&obj6,&obj7)) SWIG_fail; - { - std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > *ptr = (std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ctc_beam_search_decoder_batch" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder_batch" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > const &""'"); - } - arg1 = ptr; - } - { - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *ptr = (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" 
"ctc_beam_search_decoder_batch" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder_batch" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &""'"); - } - arg2 = ptr; - } - res3 = SWIG_ConvertPtr(obj2, &argp3, SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 ); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "ctc_beam_search_decoder_batch" "', argument " "3"" of type '" "std::vector< PathTrie *,std::allocator< PathTrie * > > &""'"); - } - if (!argp3) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder_batch" "', argument " "3"" of type '" "std::vector< PathTrie *,std::allocator< PathTrie * > > &""'"); - } - arg3 = reinterpret_cast< std::vector< PathTrie *,std::allocator< PathTrie * > > * >(argp3); - { - std::vector > *ptr = (std::vector > *)0; - res4 = swig::asptr(obj3, &ptr); - if (!SWIG_IsOK(res4)) { - SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "ctc_beam_search_decoder_batch" "', argument " "4"" of type '" "std::vector< bool,std::allocator< bool > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder_batch" "', argument " "4"" of type '" "std::vector< bool,std::allocator< bool > > const &""'"); - } - arg4 = ptr; - } - ecode5 = SWIG_AsVal_size_t(obj4, &val5); - if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "ctc_beam_search_decoder_batch" "', argument " "5"" of type '" "size_t""'"); - } - arg5 = static_cast< size_t >(val5); - ecode6 = SWIG_AsVal_size_t(obj5, &val6); - if (!SWIG_IsOK(ecode6)) { - SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "ctc_beam_search_decoder_batch" "', argument " "6"" of type '" "size_t""'"); - } - arg6 = static_cast< size_t >(val6); - ecode7 = SWIG_AsVal_int(obj6, &val7); - if (!SWIG_IsOK(ecode7)) { - SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "ctc_beam_search_decoder_batch" "', argument " "7"" of type '" "int""'"); - } - arg7 = static_cast< int >(val7); - ecode8 = SWIG_AsVal_int(obj7, &val8); - if (!SWIG_IsOK(ecode8)) { - SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "ctc_beam_search_decoder_batch" "', argument " "8"" of type '" "int""'"); - } - arg8 = static_cast< int >(val8); - result = ctc_beam_search_decoder_batch((std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > const &)*arg1,(std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > 
> const &)*arg2,*arg3,(std::vector< bool,std::allocator< bool > > const &)*arg4,arg5,arg6,arg7,arg8); - resultobj = swig::from(static_cast< std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > > >(result)); - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - if (SWIG_IsNewObj(res4)) delete arg4; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - if (SWIG_IsNewObj(res4)) delete arg4; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_ctc_beam_search_decoder_batch__SWIG_3(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > *arg1 = 0 ; - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *arg2 = 0 ; - std::vector< PathTrie *,std::allocator< PathTrie * > > *arg3 = 0 ; - std::vector< bool,std::allocator< bool > > *arg4 = 0 ; - size_t arg5 ; - size_t arg6 ; - int arg7 ; - int res1 = SWIG_OLDOBJ ; - int res2 = SWIG_OLDOBJ ; - void *argp3 = 0 ; - int res3 = 0 ; - int res4 = SWIG_OLDOBJ ; - size_t val5 ; - int ecode5 = 0 ; - size_t val6 ; - int ecode6 = 0 ; - int val7 ; - int ecode7 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - PyObject * obj4 = 0 ; - PyObject * obj5 = 0 ; - PyObject * obj6 = 0 ; - std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > > result; - - if (!PyArg_ParseTuple(args,(char *)"OOOOOOO:ctc_beam_search_decoder_batch",&obj0,&obj1,&obj2,&obj3,&obj4,&obj5,&obj6)) SWIG_fail; - { - std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > *ptr = (std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ctc_beam_search_decoder_batch" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > 
>,std::allocator< std::vector< double,std::allocator< double > > > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder_batch" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > const &""'"); - } - arg1 = ptr; - } - { - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *ptr = (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "ctc_beam_search_decoder_batch" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder_batch" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &""'"); - } - arg2 = ptr; - } - res3 = SWIG_ConvertPtr(obj2, &argp3, SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 ); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "ctc_beam_search_decoder_batch" "', argument " "3"" of type '" "std::vector< PathTrie *,std::allocator< PathTrie * > > &""'"); - } - if (!argp3) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder_batch" "', argument " "3"" of type '" "std::vector< PathTrie *,std::allocator< PathTrie * > > &""'"); - } - arg3 = reinterpret_cast< std::vector< PathTrie *,std::allocator< PathTrie * > > * >(argp3); - { - std::vector > *ptr = (std::vector > *)0; - res4 = swig::asptr(obj3, &ptr); - if (!SWIG_IsOK(res4)) { - SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "ctc_beam_search_decoder_batch" "', argument " "4"" of type '" "std::vector< bool,std::allocator< bool > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder_batch" "', argument " "4"" of type '" "std::vector< bool,std::allocator< bool > > const &""'"); - } - arg4 = ptr; - } - ecode5 = SWIG_AsVal_size_t(obj4, &val5); - if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "ctc_beam_search_decoder_batch" "', argument " "5"" of type '" "size_t""'"); - } - arg5 = static_cast< size_t >(val5); - ecode6 = SWIG_AsVal_size_t(obj5, &val6); - if (!SWIG_IsOK(ecode6)) { - 
SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "ctc_beam_search_decoder_batch" "', argument " "6"" of type '" "size_t""'"); - } - arg6 = static_cast< size_t >(val6); - ecode7 = SWIG_AsVal_int(obj6, &val7); - if (!SWIG_IsOK(ecode7)) { - SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "ctc_beam_search_decoder_batch" "', argument " "7"" of type '" "int""'"); - } - arg7 = static_cast< int >(val7); - result = ctc_beam_search_decoder_batch((std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > const &)*arg1,(std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &)*arg2,*arg3,(std::vector< bool,std::allocator< bool > > const &)*arg4,arg5,arg6,arg7); - resultobj = swig::from(static_cast< std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > > >(result)); - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - if (SWIG_IsNewObj(res4)) delete arg4; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - if (SWIG_IsNewObj(res4)) delete arg4; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_ctc_beam_search_decoder_batch__SWIG_4(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > *arg1 = 0 ; - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *arg2 = 0 ; - std::vector< PathTrie *,std::allocator< PathTrie * > > *arg3 = 0 ; - std::vector< bool,std::allocator< bool > > *arg4 = 0 ; - size_t arg5 ; - size_t arg6 ; - int res1 = SWIG_OLDOBJ ; - int res2 = SWIG_OLDOBJ ; - void *argp3 = 0 ; - int res3 = 0 ; - int res4 = SWIG_OLDOBJ ; - size_t val5 ; - int ecode5 = 0 ; - size_t val6 ; - int ecode6 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - PyObject * obj4 = 0 ; - PyObject * obj5 = 0 ; - std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > > result; - - if (!PyArg_ParseTuple(args,(char *)"OOOOOO:ctc_beam_search_decoder_batch",&obj0,&obj1,&obj2,&obj3,&obj4,&obj5)) SWIG_fail; - { - std::vector< std::vector< std::vector< 
double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > *ptr = (std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "ctc_beam_search_decoder_batch" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder_batch" "', argument " "1"" of type '" "std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > const &""'"); - } - arg1 = ptr; - } - { - std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *ptr = (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "ctc_beam_search_decoder_batch" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder_batch" "', argument " "2"" of type '" "std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &""'"); - } - arg2 = ptr; - } - res3 = SWIG_ConvertPtr(obj2, &argp3, SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0 ); - if (!SWIG_IsOK(res3)) { - SWIG_exception_fail(SWIG_ArgError(res3), "in method '" "ctc_beam_search_decoder_batch" "', argument " "3"" of type '" "std::vector< PathTrie *,std::allocator< PathTrie * > > &""'"); - } - if (!argp3) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder_batch" "', argument " "3"" of type '" "std::vector< PathTrie *,std::allocator< PathTrie * > > &""'"); - } - arg3 = reinterpret_cast< 
std::vector< PathTrie *,std::allocator< PathTrie * > > * >(argp3); - { - std::vector<bool,std::allocator< bool > > *ptr = (std::vector<bool,std::allocator< bool > > *)0; - res4 = swig::asptr(obj3, &ptr); - if (!SWIG_IsOK(res4)) { - SWIG_exception_fail(SWIG_ArgError(res4), "in method '" "ctc_beam_search_decoder_batch" "', argument " "4"" of type '" "std::vector< bool,std::allocator< bool > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "ctc_beam_search_decoder_batch" "', argument " "4"" of type '" "std::vector< bool,std::allocator< bool > > const &""'"); - } - arg4 = ptr; - } - ecode5 = SWIG_AsVal_size_t(obj4, &val5); - if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "ctc_beam_search_decoder_batch" "', argument " "5"" of type '" "size_t""'"); - } - arg5 = static_cast< size_t >(val5); - ecode6 = SWIG_AsVal_size_t(obj5, &val6); - if (!SWIG_IsOK(ecode6)) { - SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "ctc_beam_search_decoder_batch" "', argument " "6"" of type '" "size_t""'"); - } - arg6 = static_cast< size_t >(val6); - result = ctc_beam_search_decoder_batch((std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > const &)*arg1,(std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &)*arg2,*arg3,(std::vector< bool,std::allocator< bool > > const &)*arg4,arg5,arg6); - resultobj = swig::from(static_cast< std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > > >(result)); - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - if (SWIG_IsNewObj(res4)) delete arg4; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - if (SWIG_IsNewObj(res4)) delete arg4; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_ctc_beam_search_decoder_batch(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[11] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 10) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 6) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - int res = swig::asptr(argv[1], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - void *vptr
= 0; - int res = SWIG_ConvertPtr(argv[2], &vptr, SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0); - _v = SWIG_CheckState(res); - if (_v) { - int res = swig::asptr(argv[3], (std::vector<bool,std::allocator< bool > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[4], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[5], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_ctc_beam_search_decoder_batch__SWIG_4(self, args); - } - } - } - } - } - } - } - if (argc == 7) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - int res = swig::asptr(argv[1], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - void *vptr = 0; - int res = SWIG_ConvertPtr(argv[2], &vptr, SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0); - _v = SWIG_CheckState(res); - if (_v) { - int res = swig::asptr(argv[3], (std::vector<bool,std::allocator< bool > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[4], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[5], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_int(argv[6], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_ctc_beam_search_decoder_batch__SWIG_3(self, args); - } - } - } - } - } - } - } - } - if (argc == 8) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - int res = swig::asptr(argv[1], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - void *vptr = 0; - int res = SWIG_ConvertPtr(argv[2], &vptr, SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0); - _v = SWIG_CheckState(res); - if (_v) { - int res = swig::asptr(argv[3], (std::vector<bool,std::allocator< bool > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[4], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[5], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_int(argv[6], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_int(argv[7], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_ctc_beam_search_decoder_batch__SWIG_2(self, args); - } - } - } - } - } - } - } - } - } - if (argc == 9) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< double,std::allocator<
double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - int res = swig::asptr(argv[1], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - void *vptr = 0; - int res = SWIG_ConvertPtr(argv[2], &vptr, SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0); - _v = SWIG_CheckState(res); - if (_v) { - int res = swig::asptr(argv[3], (std::vector<bool,std::allocator< bool > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[4], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[5], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_int(argv[6], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_int(argv[7], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_double(argv[8], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_ctc_beam_search_decoder_batch__SWIG_1(self, args); - } - } - } - } - } - } - } - } - } - } - if (argc == 10) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - int res = swig::asptr(argv[1], (std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - void *vptr = 0; - int res = SWIG_ConvertPtr(argv[2], &vptr, SWIGTYPE_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0); - _v = SWIG_CheckState(res); - if (_v) { - int res = swig::asptr(argv[3], (std::vector<bool,std::allocator< bool > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[4], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[5], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_int(argv[6], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_int(argv[7], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_double(argv[8], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - void *vptr = 0; - int res = SWIG_ConvertPtr(argv[9], &vptr, SWIGTYPE_p_Scorer, 0); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_ctc_beam_search_decoder_batch__SWIG_0(self, args); - } - } - } - } - } - } - } - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'ctc_beam_search_decoder_batch'.\n" - " Possible C/C++ prototypes are:\n" - " ctc_beam_search_decoder_batch(std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > >
>,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > const &,std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &,std::vector< PathTrie *,std::allocator< PathTrie * > > &,std::vector< bool,std::allocator< bool > > const &,size_t,size_t,int,int,double,Scorer *)\n" - " ctc_beam_search_decoder_batch(std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > const &,std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &,std::vector< PathTrie *,std::allocator< PathTrie * > > &,std::vector< bool,std::allocator< bool > > const &,size_t,size_t,int,int,double)\n" - " ctc_beam_search_decoder_batch(std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > const &,std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &,std::vector< PathTrie *,std::allocator< PathTrie * > > &,std::vector< bool,std::allocator< bool > > const &,size_t,size_t,int,int)\n" - " ctc_beam_search_decoder_batch(std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > const &,std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &,std::vector< PathTrie *,std::allocator< PathTrie * > > &,std::vector< bool,std::allocator< bool > > const &,size_t,size_t,int)\n" - " ctc_beam_search_decoder_batch(std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > const &,std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > const &,std::vector< PathTrie *,std::allocator< PathTrie * > > &,std::vector< bool,std::allocator< bool > > const &,size_t,size_t)\n"); - return 0; -} - - -SWIGINTERN PyObject 
*_wrap_map_sent__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int,std::allocator< int > > *arg1 = 0 ; - std::vector< std::string,std::allocator< std::string > > *arg2 = 0 ; - bool arg3 ; - int arg4 ; - int res1 = SWIG_OLDOBJ ; - int res2 = SWIG_OLDOBJ ; - bool val3 ; - int ecode3 = 0 ; - int val4 ; - int ecode4 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - std::string result; - - if (!PyArg_ParseTuple(args,(char *)"OOOO:map_sent",&obj0,&obj1,&obj2,&obj3)) SWIG_fail; - { - std::vector< int,std::allocator< int > > *ptr = (std::vector< int,std::allocator< int > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "map_sent" "', argument " "1"" of type '" "std::vector< int,std::allocator< int > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "map_sent" "', argument " "1"" of type '" "std::vector< int,std::allocator< int > > const &""'"); - } - arg1 = ptr; - } - { - std::vector< std::string,std::allocator< std::string > > *ptr = (std::vector< std::string,std::allocator< std::string > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "map_sent" "', argument " "2"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "map_sent" "', argument " "2"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - arg2 = ptr; - } - ecode3 = SWIG_AsVal_bool(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "map_sent" "', argument " "3"" of type '" "bool""'"); - } - arg3 = static_cast< bool >(val3); - ecode4 = SWIG_AsVal_int(obj3, &val4); - if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "map_sent" "', argument " "4"" of type '" "int""'"); - } - arg4 = static_cast< int >(val4); - result = map_sent((std::vector< int,std::allocator< int > > const &)*arg1,(std::vector< std::string,std::allocator< std::string > > const &)*arg2,arg3,arg4); - resultobj = SWIG_From_std_string(static_cast< std::string >(result)); - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_map_sent__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int,std::allocator< int > > *arg1 = 0 ; - std::vector< std::string,std::allocator< std::string > > *arg2 = 0 ; - bool arg3 ; - int res1 = SWIG_OLDOBJ ; - int res2 = SWIG_OLDOBJ ; - bool val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::string result; - - if (!PyArg_ParseTuple(args,(char *)"OOO:map_sent",&obj0,&obj1,&obj2)) SWIG_fail; - { - std::vector< int,std::allocator< int > > *ptr = (std::vector< int,std::allocator< int > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "map_sent" "', argument " "1"" of type '" "std::vector< int,std::allocator< int > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "map_sent" "', 
argument " "1"" of type '" "std::vector< int,std::allocator< int > > const &""'"); - } - arg1 = ptr; - } - { - std::vector< std::string,std::allocator< std::string > > *ptr = (std::vector< std::string,std::allocator< std::string > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "map_sent" "', argument " "2"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "map_sent" "', argument " "2"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - arg2 = ptr; - } - ecode3 = SWIG_AsVal_bool(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "map_sent" "', argument " "3"" of type '" "bool""'"); - } - arg3 = static_cast< bool >(val3); - result = map_sent((std::vector< int,std::allocator< int > > const &)*arg1,(std::vector< std::string,std::allocator< std::string > > const &)*arg2,arg3); - resultobj = SWIG_From_std_string(static_cast< std::string >(result)); - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_map_sent__SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< int,std::allocator< int > > *arg1 = 0 ; - std::vector< std::string,std::allocator< std::string > > *arg2 = 0 ; - int res1 = SWIG_OLDOBJ ; - int res2 = SWIG_OLDOBJ ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - std::string result; - - if (!PyArg_ParseTuple(args,(char *)"OO:map_sent",&obj0,&obj1)) SWIG_fail; - { - std::vector< int,std::allocator< int > > *ptr = (std::vector< int,std::allocator< int > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "map_sent" "', argument " "1"" of type '" "std::vector< int,std::allocator< int > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "map_sent" "', argument " "1"" of type '" "std::vector< int,std::allocator< int > > const &""'"); - } - arg1 = ptr; - } - { - std::vector< std::string,std::allocator< std::string > > *ptr = (std::vector< std::string,std::allocator< std::string > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "map_sent" "', argument " "2"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "map_sent" "', argument " "2"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - arg2 = ptr; - } - result = map_sent((std::vector< int,std::allocator< int > > const &)*arg1,(std::vector< std::string,std::allocator< std::string > > const &)*arg2); - resultobj = SWIG_From_std_string(static_cast< std::string >(result)); - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_map_sent(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[5] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - 
argc = PyObject_Length(args); - for (ii = 0; (ii < 4) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 2) { - int _v; - int res = swig::asptr(argv[0], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - int res = swig::asptr(argv[1], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - return _wrap_map_sent__SWIG_2(self, args); - } - } - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - int res = swig::asptr(argv[1], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_bool(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_map_sent__SWIG_1(self, args); - } - } - } - } - if (argc == 4) { - int _v; - int res = swig::asptr(argv[0], (std::vector< int,std::allocator< int > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - int res = swig::asptr(argv[1], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_bool(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_int(argv[3], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_map_sent__SWIG_0(self, args); - } - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'map_sent'.\n" - " Possible C/C++ prototypes are:\n" - " map_sent(std::vector< int,std::allocator< int > > const &,std::vector< std::string,std::allocator< std::string > > const &,bool,int)\n" - " map_sent(std::vector< int,std::allocator< int > > const &,std::vector< std::string,std::allocator< std::string > > const &,bool)\n" - " map_sent(std::vector< int,std::allocator< int > > const &,std::vector< std::string,std::allocator< std::string > > const &)\n"); - return 0; -} - - -SWIGINTERN PyObject *_wrap_map_batch__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *arg1 = 0 ; - std::vector< std::string,std::allocator< std::string > > *arg2 = 0 ; - size_t arg3 ; - bool arg4 ; - int arg5 ; - int res1 = SWIG_OLDOBJ ; - int res2 = SWIG_OLDOBJ ; - size_t val3 ; - int ecode3 = 0 ; - bool val4 ; - int ecode4 = 0 ; - int val5 ; - int ecode5 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - PyObject * obj4 = 0 ; - std::vector< std::string,std::allocator< std::string > > result; - - if (!PyArg_ParseTuple(args,(char *)"OOOOO:map_batch",&obj0,&obj1,&obj2,&obj3,&obj4)) SWIG_fail; - { - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *ptr = (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "map_batch" "', argument " "1"" of type '" "std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "map_batch" "', argument " "1"" of type '" 
"std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &""'"); - } - arg1 = ptr; - } - { - std::vector< std::string,std::allocator< std::string > > *ptr = (std::vector< std::string,std::allocator< std::string > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "map_batch" "', argument " "2"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "map_batch" "', argument " "2"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - arg2 = ptr; - } - ecode3 = SWIG_AsVal_size_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "map_batch" "', argument " "3"" of type '" "size_t""'"); - } - arg3 = static_cast< size_t >(val3); - ecode4 = SWIG_AsVal_bool(obj3, &val4); - if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "map_batch" "', argument " "4"" of type '" "bool""'"); - } - arg4 = static_cast< bool >(val4); - ecode5 = SWIG_AsVal_int(obj4, &val5); - if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "map_batch" "', argument " "5"" of type '" "int""'"); - } - arg5 = static_cast< int >(val5); - result = map_batch((std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &)*arg1,(std::vector< std::string,std::allocator< std::string > > const &)*arg2,arg3,arg4,arg5); - resultobj = swig::from(static_cast< std::vector< std::string,std::allocator< std::string > > >(result)); - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_map_batch__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *arg1 = 0 ; - std::vector< std::string,std::allocator< std::string > > *arg2 = 0 ; - size_t arg3 ; - bool arg4 ; - int res1 = SWIG_OLDOBJ ; - int res2 = SWIG_OLDOBJ ; - size_t val3 ; - int ecode3 = 0 ; - bool val4 ; - int ecode4 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - PyObject * obj3 = 0 ; - std::vector< std::string,std::allocator< std::string > > result; - - if (!PyArg_ParseTuple(args,(char *)"OOOO:map_batch",&obj0,&obj1,&obj2,&obj3)) SWIG_fail; - { - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *ptr = (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "map_batch" "', argument " "1"" of type '" "std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "map_batch" "', argument " "1"" of type '" "std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &""'"); - } - arg1 = ptr; - } - { - 
std::vector< std::string,std::allocator< std::string > > *ptr = (std::vector< std::string,std::allocator< std::string > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "map_batch" "', argument " "2"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "map_batch" "', argument " "2"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - arg2 = ptr; - } - ecode3 = SWIG_AsVal_size_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "map_batch" "', argument " "3"" of type '" "size_t""'"); - } - arg3 = static_cast< size_t >(val3); - ecode4 = SWIG_AsVal_bool(obj3, &val4); - if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "map_batch" "', argument " "4"" of type '" "bool""'"); - } - arg4 = static_cast< bool >(val4); - result = map_batch((std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &)*arg1,(std::vector< std::string,std::allocator< std::string > > const &)*arg2,arg3,arg4); - resultobj = swig::from(static_cast< std::vector< std::string,std::allocator< std::string > > >(result)); - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_map_batch__SWIG_2(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { - PyObject *resultobj = 0; - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *arg1 = 0 ; - std::vector< std::string,std::allocator< std::string > > *arg2 = 0 ; - size_t arg3 ; - int res1 = SWIG_OLDOBJ ; - int res2 = SWIG_OLDOBJ ; - size_t val3 ; - int ecode3 = 0 ; - PyObject * obj0 = 0 ; - PyObject * obj1 = 0 ; - PyObject * obj2 = 0 ; - std::vector< std::string,std::allocator< std::string > > result; - - if (!PyArg_ParseTuple(args,(char *)"OOO:map_batch",&obj0,&obj1,&obj2)) SWIG_fail; - { - std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *ptr = (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *)0; - res1 = swig::asptr(obj0, &ptr); - if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "map_batch" "', argument " "1"" of type '" "std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "map_batch" "', argument " "1"" of type '" "std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &""'"); - } - arg1 = ptr; - } - { - std::vector< std::string,std::allocator< std::string > > *ptr = (std::vector< std::string,std::allocator< std::string > > *)0; - res2 = swig::asptr(obj1, &ptr); - if (!SWIG_IsOK(res2)) { - SWIG_exception_fail(SWIG_ArgError(res2), "in method '" "map_batch" "', argument " "2"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - if (!ptr) { - SWIG_exception_fail(SWIG_ValueError, "invalid null reference " "in method '" "map_batch" 
"', argument " "2"" of type '" "std::vector< std::string,std::allocator< std::string > > const &""'"); - } - arg2 = ptr; - } - ecode3 = SWIG_AsVal_size_t(obj2, &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "map_batch" "', argument " "3"" of type '" "size_t""'"); - } - arg3 = static_cast< size_t >(val3); - result = map_batch((std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &)*arg1,(std::vector< std::string,std::allocator< std::string > > const &)*arg2,arg3); - resultobj = swig::from(static_cast< std::vector< std::string,std::allocator< std::string > > >(result)); - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return resultobj; -fail: - if (SWIG_IsNewObj(res1)) delete arg1; - if (SWIG_IsNewObj(res2)) delete arg2; - return NULL; -} - - -SWIGINTERN PyObject *_wrap_map_batch(PyObject *self, PyObject *args) { - Py_ssize_t argc; - PyObject *argv[6] = { - 0 - }; - Py_ssize_t ii; - - if (!PyTuple_Check(args)) SWIG_fail; - argc = PyObject_Length(args); - for (ii = 0; (ii < 5) && (ii < argc); ii++) { - argv[ii] = PyTuple_GET_ITEM(args,ii); - } - if (argc == 3) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - int res = swig::asptr(argv[1], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_map_batch__SWIG_2(self, args); - } - } - } - } - if (argc == 4) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - int res = swig::asptr(argv[1], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_bool(argv[3], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_map_batch__SWIG_1(self, args); - } - } - } - } - } - if (argc == 5) { - int _v; - int res = swig::asptr(argv[0], (std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - int res = swig::asptr(argv[1], (std::vector< std::string,std::allocator< std::string > >**)(0)); - _v = SWIG_CheckState(res); - if (_v) { - { - int res = SWIG_AsVal_size_t(argv[2], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_bool(argv[3], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - { - int res = SWIG_AsVal_int(argv[4], NULL); - _v = SWIG_CheckState(res); - } - if (_v) { - return _wrap_map_batch__SWIG_0(self, args); - } - } - } - } - } - } - -fail: - SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'map_batch'.\n" - " Possible C/C++ prototypes are:\n" - " map_batch(std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &,std::vector< std::string,std::allocator< std::string > > const &,size_t,bool,int)\n" - " map_batch(std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< 
int,std::allocator< int > > > > const &,std::vector< std::string,std::allocator< std::string > > const &,size_t,bool)\n" - " map_batch(std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > const &,std::vector< std::string,std::allocator< std::string > > const &,size_t)\n"); - return 0; -} - - -static PyMethodDef SwigMethods[] = { - { (char *)"SWIG_PyInstanceMethod_New", (PyCFunction)SWIG_PyInstanceMethod_New, METH_O, NULL}, - { (char *)"delete_SwigPyIterator", _wrap_delete_SwigPyIterator, METH_VARARGS, NULL}, - { (char *)"SwigPyIterator_value", _wrap_SwigPyIterator_value, METH_VARARGS, NULL}, - { (char *)"SwigPyIterator_incr", _wrap_SwigPyIterator_incr, METH_VARARGS, NULL}, - { (char *)"SwigPyIterator_decr", _wrap_SwigPyIterator_decr, METH_VARARGS, NULL}, - { (char *)"SwigPyIterator_distance", _wrap_SwigPyIterator_distance, METH_VARARGS, NULL}, - { (char *)"SwigPyIterator_equal", _wrap_SwigPyIterator_equal, METH_VARARGS, NULL}, - { (char *)"SwigPyIterator_copy", _wrap_SwigPyIterator_copy, METH_VARARGS, NULL}, - { (char *)"SwigPyIterator_next", _wrap_SwigPyIterator_next, METH_VARARGS, NULL}, - { (char *)"SwigPyIterator___next__", _wrap_SwigPyIterator___next__, METH_VARARGS, NULL}, - { (char *)"SwigPyIterator_previous", _wrap_SwigPyIterator_previous, METH_VARARGS, NULL}, - { (char *)"SwigPyIterator_advance", _wrap_SwigPyIterator_advance, METH_VARARGS, NULL}, - { (char *)"SwigPyIterator___eq__", _wrap_SwigPyIterator___eq__, METH_VARARGS, NULL}, - { (char *)"SwigPyIterator___ne__", _wrap_SwigPyIterator___ne__, METH_VARARGS, NULL}, - { (char *)"SwigPyIterator___iadd__", _wrap_SwigPyIterator___iadd__, METH_VARARGS, NULL}, - { (char *)"SwigPyIterator___isub__", _wrap_SwigPyIterator___isub__, METH_VARARGS, NULL}, - { (char *)"SwigPyIterator___add__", _wrap_SwigPyIterator___add__, METH_VARARGS, NULL}, - { (char *)"SwigPyIterator___sub__", _wrap_SwigPyIterator___sub__, METH_VARARGS, NULL}, - { (char *)"SwigPyIterator_swigregister", SwigPyIterator_swigregister, METH_VARARGS, NULL}, - { (char *)"new_PathTrie", _wrap_new_PathTrie, METH_VARARGS, NULL}, - { (char *)"delete_PathTrie", _wrap_delete_PathTrie, METH_VARARGS, NULL}, - { (char *)"PathTrie_get_path_trie", _wrap_PathTrie_get_path_trie, METH_VARARGS, NULL}, - { (char *)"PathTrie_get_path_vec", _wrap_PathTrie_get_path_vec, METH_VARARGS, NULL}, - { (char *)"PathTrie_iterate_to_vec", _wrap_PathTrie_iterate_to_vec, METH_VARARGS, NULL}, - { (char *)"PathTrie_iterate_to_vec_only", _wrap_PathTrie_iterate_to_vec_only, METH_VARARGS, NULL}, - { (char *)"PathTrie_set_dictionary", _wrap_PathTrie_set_dictionary, METH_VARARGS, NULL}, - { (char *)"PathTrie_set_matcher", _wrap_PathTrie_set_matcher, METH_VARARGS, NULL}, - { (char *)"PathTrie_is_empty", _wrap_PathTrie_is_empty, METH_VARARGS, NULL}, - { (char *)"PathTrie_remove", _wrap_PathTrie_remove, METH_VARARGS, NULL}, - { (char *)"PathTrie_log_prob_b_prev_set", _wrap_PathTrie_log_prob_b_prev_set, METH_VARARGS, NULL}, - { (char *)"PathTrie_log_prob_b_prev_get", _wrap_PathTrie_log_prob_b_prev_get, METH_VARARGS, NULL}, - { (char *)"PathTrie_log_prob_nb_prev_set", _wrap_PathTrie_log_prob_nb_prev_set, METH_VARARGS, NULL}, - { (char *)"PathTrie_log_prob_nb_prev_get", _wrap_PathTrie_log_prob_nb_prev_get, METH_VARARGS, NULL}, - { (char *)"PathTrie_log_prob_b_cur_set", _wrap_PathTrie_log_prob_b_cur_set, METH_VARARGS, NULL}, - { (char *)"PathTrie_log_prob_b_cur_get", _wrap_PathTrie_log_prob_b_cur_get, METH_VARARGS, NULL}, - { (char 
*)"PathTrie_log_prob_nb_cur_set", _wrap_PathTrie_log_prob_nb_cur_set, METH_VARARGS, NULL}, - { (char *)"PathTrie_log_prob_nb_cur_get", _wrap_PathTrie_log_prob_nb_cur_get, METH_VARARGS, NULL}, - { (char *)"PathTrie_score_set", _wrap_PathTrie_score_set, METH_VARARGS, NULL}, - { (char *)"PathTrie_score_get", _wrap_PathTrie_score_get, METH_VARARGS, NULL}, - { (char *)"PathTrie_approx_ctc_set", _wrap_PathTrie_approx_ctc_set, METH_VARARGS, NULL}, - { (char *)"PathTrie_approx_ctc_get", _wrap_PathTrie_approx_ctc_get, METH_VARARGS, NULL}, - { (char *)"PathTrie_character_set", _wrap_PathTrie_character_set, METH_VARARGS, NULL}, - { (char *)"PathTrie_character_get", _wrap_PathTrie_character_get, METH_VARARGS, NULL}, - { (char *)"PathTrie_parent_set", _wrap_PathTrie_parent_set, METH_VARARGS, NULL}, - { (char *)"PathTrie_parent_get", _wrap_PathTrie_parent_get, METH_VARARGS, NULL}, - { (char *)"PathTrie_swigregister", PathTrie_swigregister, METH_VARARGS, NULL}, - { (char *)"DoubleVector_iterator", _wrap_DoubleVector_iterator, METH_VARARGS, NULL}, - { (char *)"DoubleVector___nonzero__", _wrap_DoubleVector___nonzero__, METH_VARARGS, NULL}, - { (char *)"DoubleVector___bool__", _wrap_DoubleVector___bool__, METH_VARARGS, NULL}, - { (char *)"DoubleVector___len__", _wrap_DoubleVector___len__, METH_VARARGS, NULL}, - { (char *)"DoubleVector___getslice__", _wrap_DoubleVector___getslice__, METH_VARARGS, NULL}, - { (char *)"DoubleVector___setslice__", _wrap_DoubleVector___setslice__, METH_VARARGS, NULL}, - { (char *)"DoubleVector___delslice__", _wrap_DoubleVector___delslice__, METH_VARARGS, NULL}, - { (char *)"DoubleVector___delitem__", _wrap_DoubleVector___delitem__, METH_VARARGS, NULL}, - { (char *)"DoubleVector___getitem__", _wrap_DoubleVector___getitem__, METH_VARARGS, NULL}, - { (char *)"DoubleVector___setitem__", _wrap_DoubleVector___setitem__, METH_VARARGS, NULL}, - { (char *)"DoubleVector_pop", _wrap_DoubleVector_pop, METH_VARARGS, NULL}, - { (char *)"DoubleVector_append", _wrap_DoubleVector_append, METH_VARARGS, NULL}, - { (char *)"DoubleVector_empty", _wrap_DoubleVector_empty, METH_VARARGS, NULL}, - { (char *)"DoubleVector_size", _wrap_DoubleVector_size, METH_VARARGS, NULL}, - { (char *)"DoubleVector_swap", _wrap_DoubleVector_swap, METH_VARARGS, NULL}, - { (char *)"DoubleVector_begin", _wrap_DoubleVector_begin, METH_VARARGS, NULL}, - { (char *)"DoubleVector_end", _wrap_DoubleVector_end, METH_VARARGS, NULL}, - { (char *)"DoubleVector_rbegin", _wrap_DoubleVector_rbegin, METH_VARARGS, NULL}, - { (char *)"DoubleVector_rend", _wrap_DoubleVector_rend, METH_VARARGS, NULL}, - { (char *)"DoubleVector_clear", _wrap_DoubleVector_clear, METH_VARARGS, NULL}, - { (char *)"DoubleVector_get_allocator", _wrap_DoubleVector_get_allocator, METH_VARARGS, NULL}, - { (char *)"DoubleVector_pop_back", _wrap_DoubleVector_pop_back, METH_VARARGS, NULL}, - { (char *)"DoubleVector_erase", _wrap_DoubleVector_erase, METH_VARARGS, NULL}, - { (char *)"new_DoubleVector", _wrap_new_DoubleVector, METH_VARARGS, NULL}, - { (char *)"DoubleVector_push_back", _wrap_DoubleVector_push_back, METH_VARARGS, NULL}, - { (char *)"DoubleVector_front", _wrap_DoubleVector_front, METH_VARARGS, NULL}, - { (char *)"DoubleVector_back", _wrap_DoubleVector_back, METH_VARARGS, NULL}, - { (char *)"DoubleVector_assign", _wrap_DoubleVector_assign, METH_VARARGS, NULL}, - { (char *)"DoubleVector_resize", _wrap_DoubleVector_resize, METH_VARARGS, NULL}, - { (char *)"DoubleVector_insert", _wrap_DoubleVector_insert, METH_VARARGS, NULL}, - { (char *)"DoubleVector_reserve", 
_wrap_DoubleVector_reserve, METH_VARARGS, NULL}, - { (char *)"DoubleVector_capacity", _wrap_DoubleVector_capacity, METH_VARARGS, NULL}, - { (char *)"delete_DoubleVector", _wrap_delete_DoubleVector, METH_VARARGS, NULL}, - { (char *)"DoubleVector_swigregister", DoubleVector_swigregister, METH_VARARGS, NULL}, - { (char *)"IntVector_iterator", _wrap_IntVector_iterator, METH_VARARGS, NULL}, - { (char *)"IntVector___nonzero__", _wrap_IntVector___nonzero__, METH_VARARGS, NULL}, - { (char *)"IntVector___bool__", _wrap_IntVector___bool__, METH_VARARGS, NULL}, - { (char *)"IntVector___len__", _wrap_IntVector___len__, METH_VARARGS, NULL}, - { (char *)"IntVector___getslice__", _wrap_IntVector___getslice__, METH_VARARGS, NULL}, - { (char *)"IntVector___setslice__", _wrap_IntVector___setslice__, METH_VARARGS, NULL}, - { (char *)"IntVector___delslice__", _wrap_IntVector___delslice__, METH_VARARGS, NULL}, - { (char *)"IntVector___delitem__", _wrap_IntVector___delitem__, METH_VARARGS, NULL}, - { (char *)"IntVector___getitem__", _wrap_IntVector___getitem__, METH_VARARGS, NULL}, - { (char *)"IntVector___setitem__", _wrap_IntVector___setitem__, METH_VARARGS, NULL}, - { (char *)"IntVector_pop", _wrap_IntVector_pop, METH_VARARGS, NULL}, - { (char *)"IntVector_append", _wrap_IntVector_append, METH_VARARGS, NULL}, - { (char *)"IntVector_empty", _wrap_IntVector_empty, METH_VARARGS, NULL}, - { (char *)"IntVector_size", _wrap_IntVector_size, METH_VARARGS, NULL}, - { (char *)"IntVector_swap", _wrap_IntVector_swap, METH_VARARGS, NULL}, - { (char *)"IntVector_begin", _wrap_IntVector_begin, METH_VARARGS, NULL}, - { (char *)"IntVector_end", _wrap_IntVector_end, METH_VARARGS, NULL}, - { (char *)"IntVector_rbegin", _wrap_IntVector_rbegin, METH_VARARGS, NULL}, - { (char *)"IntVector_rend", _wrap_IntVector_rend, METH_VARARGS, NULL}, - { (char *)"IntVector_clear", _wrap_IntVector_clear, METH_VARARGS, NULL}, - { (char *)"IntVector_get_allocator", _wrap_IntVector_get_allocator, METH_VARARGS, NULL}, - { (char *)"IntVector_pop_back", _wrap_IntVector_pop_back, METH_VARARGS, NULL}, - { (char *)"IntVector_erase", _wrap_IntVector_erase, METH_VARARGS, NULL}, - { (char *)"new_IntVector", _wrap_new_IntVector, METH_VARARGS, NULL}, - { (char *)"IntVector_push_back", _wrap_IntVector_push_back, METH_VARARGS, NULL}, - { (char *)"IntVector_front", _wrap_IntVector_front, METH_VARARGS, NULL}, - { (char *)"IntVector_back", _wrap_IntVector_back, METH_VARARGS, NULL}, - { (char *)"IntVector_assign", _wrap_IntVector_assign, METH_VARARGS, NULL}, - { (char *)"IntVector_resize", _wrap_IntVector_resize, METH_VARARGS, NULL}, - { (char *)"IntVector_insert", _wrap_IntVector_insert, METH_VARARGS, NULL}, - { (char *)"IntVector_reserve", _wrap_IntVector_reserve, METH_VARARGS, NULL}, - { (char *)"IntVector_capacity", _wrap_IntVector_capacity, METH_VARARGS, NULL}, - { (char *)"delete_IntVector", _wrap_delete_IntVector, METH_VARARGS, NULL}, - { (char *)"IntVector_swigregister", IntVector_swigregister, METH_VARARGS, NULL}, - { (char *)"StringVector_iterator", _wrap_StringVector_iterator, METH_VARARGS, NULL}, - { (char *)"StringVector___nonzero__", _wrap_StringVector___nonzero__, METH_VARARGS, NULL}, - { (char *)"StringVector___bool__", _wrap_StringVector___bool__, METH_VARARGS, NULL}, - { (char *)"StringVector___len__", _wrap_StringVector___len__, METH_VARARGS, NULL}, - { (char *)"StringVector___getslice__", _wrap_StringVector___getslice__, METH_VARARGS, NULL}, - { (char *)"StringVector___setslice__", _wrap_StringVector___setslice__, METH_VARARGS, NULL}, - { 
(char *)"StringVector___delslice__", _wrap_StringVector___delslice__, METH_VARARGS, NULL}, - { (char *)"StringVector___delitem__", _wrap_StringVector___delitem__, METH_VARARGS, NULL}, - { (char *)"StringVector___getitem__", _wrap_StringVector___getitem__, METH_VARARGS, NULL}, - { (char *)"StringVector___setitem__", _wrap_StringVector___setitem__, METH_VARARGS, NULL}, - { (char *)"StringVector_pop", _wrap_StringVector_pop, METH_VARARGS, NULL}, - { (char *)"StringVector_append", _wrap_StringVector_append, METH_VARARGS, NULL}, - { (char *)"StringVector_empty", _wrap_StringVector_empty, METH_VARARGS, NULL}, - { (char *)"StringVector_size", _wrap_StringVector_size, METH_VARARGS, NULL}, - { (char *)"StringVector_swap", _wrap_StringVector_swap, METH_VARARGS, NULL}, - { (char *)"StringVector_begin", _wrap_StringVector_begin, METH_VARARGS, NULL}, - { (char *)"StringVector_end", _wrap_StringVector_end, METH_VARARGS, NULL}, - { (char *)"StringVector_rbegin", _wrap_StringVector_rbegin, METH_VARARGS, NULL}, - { (char *)"StringVector_rend", _wrap_StringVector_rend, METH_VARARGS, NULL}, - { (char *)"StringVector_clear", _wrap_StringVector_clear, METH_VARARGS, NULL}, - { (char *)"StringVector_get_allocator", _wrap_StringVector_get_allocator, METH_VARARGS, NULL}, - { (char *)"StringVector_pop_back", _wrap_StringVector_pop_back, METH_VARARGS, NULL}, - { (char *)"StringVector_erase", _wrap_StringVector_erase, METH_VARARGS, NULL}, - { (char *)"new_StringVector", _wrap_new_StringVector, METH_VARARGS, NULL}, - { (char *)"StringVector_push_back", _wrap_StringVector_push_back, METH_VARARGS, NULL}, - { (char *)"StringVector_front", _wrap_StringVector_front, METH_VARARGS, NULL}, - { (char *)"StringVector_back", _wrap_StringVector_back, METH_VARARGS, NULL}, - { (char *)"StringVector_assign", _wrap_StringVector_assign, METH_VARARGS, NULL}, - { (char *)"StringVector_resize", _wrap_StringVector_resize, METH_VARARGS, NULL}, - { (char *)"StringVector_insert", _wrap_StringVector_insert, METH_VARARGS, NULL}, - { (char *)"StringVector_reserve", _wrap_StringVector_reserve, METH_VARARGS, NULL}, - { (char *)"StringVector_capacity", _wrap_StringVector_capacity, METH_VARARGS, NULL}, - { (char *)"delete_StringVector", _wrap_delete_StringVector, METH_VARARGS, NULL}, - { (char *)"StringVector_swigregister", StringVector_swigregister, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_iterator", _wrap_VectorOfStructVectorDouble_iterator, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble___nonzero__", _wrap_VectorOfStructVectorDouble___nonzero__, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble___bool__", _wrap_VectorOfStructVectorDouble___bool__, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble___len__", _wrap_VectorOfStructVectorDouble___len__, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble___getslice__", _wrap_VectorOfStructVectorDouble___getslice__, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble___setslice__", _wrap_VectorOfStructVectorDouble___setslice__, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble___delslice__", _wrap_VectorOfStructVectorDouble___delslice__, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble___delitem__", _wrap_VectorOfStructVectorDouble___delitem__, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble___getitem__", _wrap_VectorOfStructVectorDouble___getitem__, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble___setitem__", _wrap_VectorOfStructVectorDouble___setitem__, METH_VARARGS, NULL}, - { 
(char *)"VectorOfStructVectorDouble_pop", _wrap_VectorOfStructVectorDouble_pop, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_append", _wrap_VectorOfStructVectorDouble_append, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_empty", _wrap_VectorOfStructVectorDouble_empty, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_size", _wrap_VectorOfStructVectorDouble_size, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_swap", _wrap_VectorOfStructVectorDouble_swap, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_begin", _wrap_VectorOfStructVectorDouble_begin, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_end", _wrap_VectorOfStructVectorDouble_end, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_rbegin", _wrap_VectorOfStructVectorDouble_rbegin, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_rend", _wrap_VectorOfStructVectorDouble_rend, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_clear", _wrap_VectorOfStructVectorDouble_clear, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_get_allocator", _wrap_VectorOfStructVectorDouble_get_allocator, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_pop_back", _wrap_VectorOfStructVectorDouble_pop_back, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_erase", _wrap_VectorOfStructVectorDouble_erase, METH_VARARGS, NULL}, - { (char *)"new_VectorOfStructVectorDouble", _wrap_new_VectorOfStructVectorDouble, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_push_back", _wrap_VectorOfStructVectorDouble_push_back, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_front", _wrap_VectorOfStructVectorDouble_front, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_back", _wrap_VectorOfStructVectorDouble_back, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_assign", _wrap_VectorOfStructVectorDouble_assign, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_resize", _wrap_VectorOfStructVectorDouble_resize, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_insert", _wrap_VectorOfStructVectorDouble_insert, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_reserve", _wrap_VectorOfStructVectorDouble_reserve, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_capacity", _wrap_VectorOfStructVectorDouble_capacity, METH_VARARGS, NULL}, - { (char *)"delete_VectorOfStructVectorDouble", _wrap_delete_VectorOfStructVectorDouble, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorDouble_swigregister", VectorOfStructVectorDouble_swigregister, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_iterator", _wrap_VectorOfStructVectorInt_iterator, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt___nonzero__", _wrap_VectorOfStructVectorInt___nonzero__, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt___bool__", _wrap_VectorOfStructVectorInt___bool__, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt___len__", _wrap_VectorOfStructVectorInt___len__, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt___getslice__", _wrap_VectorOfStructVectorInt___getslice__, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt___setslice__", _wrap_VectorOfStructVectorInt___setslice__, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt___delslice__", _wrap_VectorOfStructVectorInt___delslice__, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt___delitem__", _wrap_VectorOfStructVectorInt___delitem__, METH_VARARGS, 
NULL}, - { (char *)"VectorOfStructVectorInt___getitem__", _wrap_VectorOfStructVectorInt___getitem__, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt___setitem__", _wrap_VectorOfStructVectorInt___setitem__, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_pop", _wrap_VectorOfStructVectorInt_pop, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_append", _wrap_VectorOfStructVectorInt_append, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_empty", _wrap_VectorOfStructVectorInt_empty, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_size", _wrap_VectorOfStructVectorInt_size, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_swap", _wrap_VectorOfStructVectorInt_swap, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_begin", _wrap_VectorOfStructVectorInt_begin, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_end", _wrap_VectorOfStructVectorInt_end, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_rbegin", _wrap_VectorOfStructVectorInt_rbegin, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_rend", _wrap_VectorOfStructVectorInt_rend, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_clear", _wrap_VectorOfStructVectorInt_clear, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_get_allocator", _wrap_VectorOfStructVectorInt_get_allocator, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_pop_back", _wrap_VectorOfStructVectorInt_pop_back, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_erase", _wrap_VectorOfStructVectorInt_erase, METH_VARARGS, NULL}, - { (char *)"new_VectorOfStructVectorInt", _wrap_new_VectorOfStructVectorInt, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_push_back", _wrap_VectorOfStructVectorInt_push_back, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_front", _wrap_VectorOfStructVectorInt_front, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_back", _wrap_VectorOfStructVectorInt_back, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_assign", _wrap_VectorOfStructVectorInt_assign, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_resize", _wrap_VectorOfStructVectorInt_resize, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_insert", _wrap_VectorOfStructVectorInt_insert, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_reserve", _wrap_VectorOfStructVectorInt_reserve, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_capacity", _wrap_VectorOfStructVectorInt_capacity, METH_VARARGS, NULL}, - { (char *)"delete_VectorOfStructVectorInt", _wrap_delete_VectorOfStructVectorInt, METH_VARARGS, NULL}, - { (char *)"VectorOfStructVectorInt_swigregister", VectorOfStructVectorInt_swigregister, METH_VARARGS, NULL}, - { (char *)"FloatVector_iterator", _wrap_FloatVector_iterator, METH_VARARGS, NULL}, - { (char *)"FloatVector___nonzero__", _wrap_FloatVector___nonzero__, METH_VARARGS, NULL}, - { (char *)"FloatVector___bool__", _wrap_FloatVector___bool__, METH_VARARGS, NULL}, - { (char *)"FloatVector___len__", _wrap_FloatVector___len__, METH_VARARGS, NULL}, - { (char *)"FloatVector___getslice__", _wrap_FloatVector___getslice__, METH_VARARGS, NULL}, - { (char *)"FloatVector___setslice__", _wrap_FloatVector___setslice__, METH_VARARGS, NULL}, - { (char *)"FloatVector___delslice__", _wrap_FloatVector___delslice__, METH_VARARGS, NULL}, - { (char *)"FloatVector___delitem__", _wrap_FloatVector___delitem__, METH_VARARGS, NULL}, - { (char *)"FloatVector___getitem__", _wrap_FloatVector___getitem__, METH_VARARGS, NULL}, 
- { (char *)"FloatVector___setitem__", _wrap_FloatVector___setitem__, METH_VARARGS, NULL}, - { (char *)"FloatVector_pop", _wrap_FloatVector_pop, METH_VARARGS, NULL}, - { (char *)"FloatVector_append", _wrap_FloatVector_append, METH_VARARGS, NULL}, - { (char *)"FloatVector_empty", _wrap_FloatVector_empty, METH_VARARGS, NULL}, - { (char *)"FloatVector_size", _wrap_FloatVector_size, METH_VARARGS, NULL}, - { (char *)"FloatVector_swap", _wrap_FloatVector_swap, METH_VARARGS, NULL}, - { (char *)"FloatVector_begin", _wrap_FloatVector_begin, METH_VARARGS, NULL}, - { (char *)"FloatVector_end", _wrap_FloatVector_end, METH_VARARGS, NULL}, - { (char *)"FloatVector_rbegin", _wrap_FloatVector_rbegin, METH_VARARGS, NULL}, - { (char *)"FloatVector_rend", _wrap_FloatVector_rend, METH_VARARGS, NULL}, - { (char *)"FloatVector_clear", _wrap_FloatVector_clear, METH_VARARGS, NULL}, - { (char *)"FloatVector_get_allocator", _wrap_FloatVector_get_allocator, METH_VARARGS, NULL}, - { (char *)"FloatVector_pop_back", _wrap_FloatVector_pop_back, METH_VARARGS, NULL}, - { (char *)"FloatVector_erase", _wrap_FloatVector_erase, METH_VARARGS, NULL}, - { (char *)"new_FloatVector", _wrap_new_FloatVector, METH_VARARGS, NULL}, - { (char *)"FloatVector_push_back", _wrap_FloatVector_push_back, METH_VARARGS, NULL}, - { (char *)"FloatVector_front", _wrap_FloatVector_front, METH_VARARGS, NULL}, - { (char *)"FloatVector_back", _wrap_FloatVector_back, METH_VARARGS, NULL}, - { (char *)"FloatVector_assign", _wrap_FloatVector_assign, METH_VARARGS, NULL}, - { (char *)"FloatVector_resize", _wrap_FloatVector_resize, METH_VARARGS, NULL}, - { (char *)"FloatVector_insert", _wrap_FloatVector_insert, METH_VARARGS, NULL}, - { (char *)"FloatVector_reserve", _wrap_FloatVector_reserve, METH_VARARGS, NULL}, - { (char *)"FloatVector_capacity", _wrap_FloatVector_capacity, METH_VARARGS, NULL}, - { (char *)"delete_FloatVector", _wrap_delete_FloatVector, METH_VARARGS, NULL}, - { (char *)"FloatVector_swigregister", FloatVector_swigregister, METH_VARARGS, NULL}, - { (char *)"new_Pair", _wrap_new_Pair, METH_VARARGS, NULL}, - { (char *)"Pair_first_set", _wrap_Pair_first_set, METH_VARARGS, NULL}, - { (char *)"Pair_first_get", _wrap_Pair_first_get, METH_VARARGS, NULL}, - { (char *)"Pair_second_set", _wrap_Pair_second_set, METH_VARARGS, NULL}, - { (char *)"Pair_second_get", _wrap_Pair_second_get, METH_VARARGS, NULL}, - { (char *)"delete_Pair", _wrap_delete_Pair, METH_VARARGS, NULL}, - { (char *)"Pair_swigregister", Pair_swigregister, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_iterator", _wrap_PairFloatVectorVector_iterator, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector___nonzero__", _wrap_PairFloatVectorVector___nonzero__, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector___bool__", _wrap_PairFloatVectorVector___bool__, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector___len__", _wrap_PairFloatVectorVector___len__, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector___getslice__", _wrap_PairFloatVectorVector___getslice__, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector___setslice__", _wrap_PairFloatVectorVector___setslice__, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector___delslice__", _wrap_PairFloatVectorVector___delslice__, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector___delitem__", _wrap_PairFloatVectorVector___delitem__, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector___getitem__", _wrap_PairFloatVectorVector___getitem__, METH_VARARGS, NULL}, - { (char 
*)"PairFloatVectorVector___setitem__", _wrap_PairFloatVectorVector___setitem__, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_pop", _wrap_PairFloatVectorVector_pop, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_append", _wrap_PairFloatVectorVector_append, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_empty", _wrap_PairFloatVectorVector_empty, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_size", _wrap_PairFloatVectorVector_size, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_swap", _wrap_PairFloatVectorVector_swap, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_begin", _wrap_PairFloatVectorVector_begin, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_end", _wrap_PairFloatVectorVector_end, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_rbegin", _wrap_PairFloatVectorVector_rbegin, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_rend", _wrap_PairFloatVectorVector_rend, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_clear", _wrap_PairFloatVectorVector_clear, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_get_allocator", _wrap_PairFloatVectorVector_get_allocator, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_pop_back", _wrap_PairFloatVectorVector_pop_back, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_erase", _wrap_PairFloatVectorVector_erase, METH_VARARGS, NULL}, - { (char *)"new_PairFloatVectorVector", _wrap_new_PairFloatVectorVector, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_push_back", _wrap_PairFloatVectorVector_push_back, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_front", _wrap_PairFloatVectorVector_front, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_back", _wrap_PairFloatVectorVector_back, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_assign", _wrap_PairFloatVectorVector_assign, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_resize", _wrap_PairFloatVectorVector_resize, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_insert", _wrap_PairFloatVectorVector_insert, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_reserve", _wrap_PairFloatVectorVector_reserve, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_capacity", _wrap_PairFloatVectorVector_capacity, METH_VARARGS, NULL}, - { (char *)"delete_PairFloatVectorVector", _wrap_delete_PairFloatVectorVector, METH_VARARGS, NULL}, - { (char *)"PairFloatVectorVector_swigregister", PairFloatVectorVector_swigregister, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_iterator", _wrap_PairDoubleVectorVector_iterator, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector___nonzero__", _wrap_PairDoubleVectorVector___nonzero__, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector___bool__", _wrap_PairDoubleVectorVector___bool__, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector___len__", _wrap_PairDoubleVectorVector___len__, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector___getslice__", _wrap_PairDoubleVectorVector___getslice__, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector___setslice__", _wrap_PairDoubleVectorVector___setslice__, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector___delslice__", _wrap_PairDoubleVectorVector___delslice__, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector___delitem__", _wrap_PairDoubleVectorVector___delitem__, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector___getitem__", _wrap_PairDoubleVectorVector___getitem__, METH_VARARGS, NULL}, - { (char 
*)"PairDoubleVectorVector___setitem__", _wrap_PairDoubleVectorVector___setitem__, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_pop", _wrap_PairDoubleVectorVector_pop, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_append", _wrap_PairDoubleVectorVector_append, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_empty", _wrap_PairDoubleVectorVector_empty, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_size", _wrap_PairDoubleVectorVector_size, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_swap", _wrap_PairDoubleVectorVector_swap, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_begin", _wrap_PairDoubleVectorVector_begin, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_end", _wrap_PairDoubleVectorVector_end, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_rbegin", _wrap_PairDoubleVectorVector_rbegin, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_rend", _wrap_PairDoubleVectorVector_rend, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_clear", _wrap_PairDoubleVectorVector_clear, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_get_allocator", _wrap_PairDoubleVectorVector_get_allocator, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_pop_back", _wrap_PairDoubleVectorVector_pop_back, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_erase", _wrap_PairDoubleVectorVector_erase, METH_VARARGS, NULL}, - { (char *)"new_PairDoubleVectorVector", _wrap_new_PairDoubleVectorVector, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_push_back", _wrap_PairDoubleVectorVector_push_back, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_front", _wrap_PairDoubleVectorVector_front, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_back", _wrap_PairDoubleVectorVector_back, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_assign", _wrap_PairDoubleVectorVector_assign, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_resize", _wrap_PairDoubleVectorVector_resize, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_insert", _wrap_PairDoubleVectorVector_insert, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_reserve", _wrap_PairDoubleVectorVector_reserve, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_capacity", _wrap_PairDoubleVectorVector_capacity, METH_VARARGS, NULL}, - { (char *)"delete_PairDoubleVectorVector", _wrap_delete_PairDoubleVectorVector, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector_swigregister", PairDoubleVectorVector_swigregister, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_iterator", _wrap_PairDoubleVectorVector2_iterator, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2___nonzero__", _wrap_PairDoubleVectorVector2___nonzero__, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2___bool__", _wrap_PairDoubleVectorVector2___bool__, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2___len__", _wrap_PairDoubleVectorVector2___len__, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2___getslice__", _wrap_PairDoubleVectorVector2___getslice__, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2___setslice__", _wrap_PairDoubleVectorVector2___setslice__, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2___delslice__", _wrap_PairDoubleVectorVector2___delslice__, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2___delitem__", _wrap_PairDoubleVectorVector2___delitem__, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2___getitem__", 
_wrap_PairDoubleVectorVector2___getitem__, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2___setitem__", _wrap_PairDoubleVectorVector2___setitem__, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_pop", _wrap_PairDoubleVectorVector2_pop, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_append", _wrap_PairDoubleVectorVector2_append, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_empty", _wrap_PairDoubleVectorVector2_empty, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_size", _wrap_PairDoubleVectorVector2_size, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_swap", _wrap_PairDoubleVectorVector2_swap, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_begin", _wrap_PairDoubleVectorVector2_begin, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_end", _wrap_PairDoubleVectorVector2_end, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_rbegin", _wrap_PairDoubleVectorVector2_rbegin, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_rend", _wrap_PairDoubleVectorVector2_rend, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_clear", _wrap_PairDoubleVectorVector2_clear, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_get_allocator", _wrap_PairDoubleVectorVector2_get_allocator, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_pop_back", _wrap_PairDoubleVectorVector2_pop_back, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_erase", _wrap_PairDoubleVectorVector2_erase, METH_VARARGS, NULL}, - { (char *)"new_PairDoubleVectorVector2", _wrap_new_PairDoubleVectorVector2, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_push_back", _wrap_PairDoubleVectorVector2_push_back, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_front", _wrap_PairDoubleVectorVector2_front, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_back", _wrap_PairDoubleVectorVector2_back, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_assign", _wrap_PairDoubleVectorVector2_assign, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_resize", _wrap_PairDoubleVectorVector2_resize, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_insert", _wrap_PairDoubleVectorVector2_insert, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_reserve", _wrap_PairDoubleVectorVector2_reserve, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_capacity", _wrap_PairDoubleVectorVector2_capacity, METH_VARARGS, NULL}, - { (char *)"delete_PairDoubleVectorVector2", _wrap_delete_PairDoubleVectorVector2, METH_VARARGS, NULL}, - { (char *)"PairDoubleVectorVector2_swigregister", PairDoubleVectorVector2_swigregister, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_iterator", _wrap_DoubleVector3_iterator, METH_VARARGS, NULL}, - { (char *)"DoubleVector3___nonzero__", _wrap_DoubleVector3___nonzero__, METH_VARARGS, NULL}, - { (char *)"DoubleVector3___bool__", _wrap_DoubleVector3___bool__, METH_VARARGS, NULL}, - { (char *)"DoubleVector3___len__", _wrap_DoubleVector3___len__, METH_VARARGS, NULL}, - { (char *)"DoubleVector3___getslice__", _wrap_DoubleVector3___getslice__, METH_VARARGS, NULL}, - { (char *)"DoubleVector3___setslice__", _wrap_DoubleVector3___setslice__, METH_VARARGS, NULL}, - { (char *)"DoubleVector3___delslice__", _wrap_DoubleVector3___delslice__, METH_VARARGS, NULL}, - { (char *)"DoubleVector3___delitem__", _wrap_DoubleVector3___delitem__, METH_VARARGS, NULL}, - { (char *)"DoubleVector3___getitem__", _wrap_DoubleVector3___getitem__, METH_VARARGS, NULL}, - { (char 
*)"DoubleVector3___setitem__", _wrap_DoubleVector3___setitem__, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_pop", _wrap_DoubleVector3_pop, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_append", _wrap_DoubleVector3_append, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_empty", _wrap_DoubleVector3_empty, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_size", _wrap_DoubleVector3_size, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_swap", _wrap_DoubleVector3_swap, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_begin", _wrap_DoubleVector3_begin, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_end", _wrap_DoubleVector3_end, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_rbegin", _wrap_DoubleVector3_rbegin, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_rend", _wrap_DoubleVector3_rend, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_clear", _wrap_DoubleVector3_clear, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_get_allocator", _wrap_DoubleVector3_get_allocator, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_pop_back", _wrap_DoubleVector3_pop_back, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_erase", _wrap_DoubleVector3_erase, METH_VARARGS, NULL}, - { (char *)"new_DoubleVector3", _wrap_new_DoubleVector3, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_push_back", _wrap_DoubleVector3_push_back, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_front", _wrap_DoubleVector3_front, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_back", _wrap_DoubleVector3_back, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_assign", _wrap_DoubleVector3_assign, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_resize", _wrap_DoubleVector3_resize, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_insert", _wrap_DoubleVector3_insert, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_reserve", _wrap_DoubleVector3_reserve, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_capacity", _wrap_DoubleVector3_capacity, METH_VARARGS, NULL}, - { (char *)"delete_DoubleVector3", _wrap_delete_DoubleVector3, METH_VARARGS, NULL}, - { (char *)"DoubleVector3_swigregister", DoubleVector3_swigregister, METH_VARARGS, NULL}, - { (char *)"IntVector3_iterator", _wrap_IntVector3_iterator, METH_VARARGS, NULL}, - { (char *)"IntVector3___nonzero__", _wrap_IntVector3___nonzero__, METH_VARARGS, NULL}, - { (char *)"IntVector3___bool__", _wrap_IntVector3___bool__, METH_VARARGS, NULL}, - { (char *)"IntVector3___len__", _wrap_IntVector3___len__, METH_VARARGS, NULL}, - { (char *)"IntVector3___getslice__", _wrap_IntVector3___getslice__, METH_VARARGS, NULL}, - { (char *)"IntVector3___setslice__", _wrap_IntVector3___setslice__, METH_VARARGS, NULL}, - { (char *)"IntVector3___delslice__", _wrap_IntVector3___delslice__, METH_VARARGS, NULL}, - { (char *)"IntVector3___delitem__", _wrap_IntVector3___delitem__, METH_VARARGS, NULL}, - { (char *)"IntVector3___getitem__", _wrap_IntVector3___getitem__, METH_VARARGS, NULL}, - { (char *)"IntVector3___setitem__", _wrap_IntVector3___setitem__, METH_VARARGS, NULL}, - { (char *)"IntVector3_pop", _wrap_IntVector3_pop, METH_VARARGS, NULL}, - { (char *)"IntVector3_append", _wrap_IntVector3_append, METH_VARARGS, NULL}, - { (char *)"IntVector3_empty", _wrap_IntVector3_empty, METH_VARARGS, NULL}, - { (char *)"IntVector3_size", _wrap_IntVector3_size, METH_VARARGS, NULL}, - { (char *)"IntVector3_swap", _wrap_IntVector3_swap, METH_VARARGS, NULL}, - { (char *)"IntVector3_begin", _wrap_IntVector3_begin, METH_VARARGS, NULL}, - { (char *)"IntVector3_end", _wrap_IntVector3_end, METH_VARARGS, NULL}, - { (char *)"IntVector3_rbegin", 
_wrap_IntVector3_rbegin, METH_VARARGS, NULL}, - { (char *)"IntVector3_rend", _wrap_IntVector3_rend, METH_VARARGS, NULL}, - { (char *)"IntVector3_clear", _wrap_IntVector3_clear, METH_VARARGS, NULL}, - { (char *)"IntVector3_get_allocator", _wrap_IntVector3_get_allocator, METH_VARARGS, NULL}, - { (char *)"IntVector3_pop_back", _wrap_IntVector3_pop_back, METH_VARARGS, NULL}, - { (char *)"IntVector3_erase", _wrap_IntVector3_erase, METH_VARARGS, NULL}, - { (char *)"new_IntVector3", _wrap_new_IntVector3, METH_VARARGS, NULL}, - { (char *)"IntVector3_push_back", _wrap_IntVector3_push_back, METH_VARARGS, NULL}, - { (char *)"IntVector3_front", _wrap_IntVector3_front, METH_VARARGS, NULL}, - { (char *)"IntVector3_back", _wrap_IntVector3_back, METH_VARARGS, NULL}, - { (char *)"IntVector3_assign", _wrap_IntVector3_assign, METH_VARARGS, NULL}, - { (char *)"IntVector3_resize", _wrap_IntVector3_resize, METH_VARARGS, NULL}, - { (char *)"IntVector3_insert", _wrap_IntVector3_insert, METH_VARARGS, NULL}, - { (char *)"IntVector3_reserve", _wrap_IntVector3_reserve, METH_VARARGS, NULL}, - { (char *)"IntVector3_capacity", _wrap_IntVector3_capacity, METH_VARARGS, NULL}, - { (char *)"delete_IntVector3", _wrap_delete_IntVector3, METH_VARARGS, NULL}, - { (char *)"IntVector3_swigregister", IntVector3_swigregister, METH_VARARGS, NULL}, - { (char *)"TrieVector_iterator", _wrap_TrieVector_iterator, METH_VARARGS, NULL}, - { (char *)"TrieVector___nonzero__", _wrap_TrieVector___nonzero__, METH_VARARGS, NULL}, - { (char *)"TrieVector___bool__", _wrap_TrieVector___bool__, METH_VARARGS, NULL}, - { (char *)"TrieVector___len__", _wrap_TrieVector___len__, METH_VARARGS, NULL}, - { (char *)"TrieVector___getslice__", _wrap_TrieVector___getslice__, METH_VARARGS, NULL}, - { (char *)"TrieVector___setslice__", _wrap_TrieVector___setslice__, METH_VARARGS, NULL}, - { (char *)"TrieVector___delslice__", _wrap_TrieVector___delslice__, METH_VARARGS, NULL}, - { (char *)"TrieVector___delitem__", _wrap_TrieVector___delitem__, METH_VARARGS, NULL}, - { (char *)"TrieVector___getitem__", _wrap_TrieVector___getitem__, METH_VARARGS, NULL}, - { (char *)"TrieVector___setitem__", _wrap_TrieVector___setitem__, METH_VARARGS, NULL}, - { (char *)"TrieVector_pop", _wrap_TrieVector_pop, METH_VARARGS, NULL}, - { (char *)"TrieVector_append", _wrap_TrieVector_append, METH_VARARGS, NULL}, - { (char *)"TrieVector_empty", _wrap_TrieVector_empty, METH_VARARGS, NULL}, - { (char *)"TrieVector_size", _wrap_TrieVector_size, METH_VARARGS, NULL}, - { (char *)"TrieVector_swap", _wrap_TrieVector_swap, METH_VARARGS, NULL}, - { (char *)"TrieVector_begin", _wrap_TrieVector_begin, METH_VARARGS, NULL}, - { (char *)"TrieVector_end", _wrap_TrieVector_end, METH_VARARGS, NULL}, - { (char *)"TrieVector_rbegin", _wrap_TrieVector_rbegin, METH_VARARGS, NULL}, - { (char *)"TrieVector_rend", _wrap_TrieVector_rend, METH_VARARGS, NULL}, - { (char *)"TrieVector_clear", _wrap_TrieVector_clear, METH_VARARGS, NULL}, - { (char *)"TrieVector_get_allocator", _wrap_TrieVector_get_allocator, METH_VARARGS, NULL}, - { (char *)"TrieVector_pop_back", _wrap_TrieVector_pop_back, METH_VARARGS, NULL}, - { (char *)"TrieVector_erase", _wrap_TrieVector_erase, METH_VARARGS, NULL}, - { (char *)"new_TrieVector", _wrap_new_TrieVector, METH_VARARGS, NULL}, - { (char *)"TrieVector_push_back", _wrap_TrieVector_push_back, METH_VARARGS, NULL}, - { (char *)"TrieVector_front", _wrap_TrieVector_front, METH_VARARGS, NULL}, - { (char *)"TrieVector_back", _wrap_TrieVector_back, METH_VARARGS, NULL}, - { (char 
*)"TrieVector_assign", _wrap_TrieVector_assign, METH_VARARGS, NULL}, - { (char *)"TrieVector_resize", _wrap_TrieVector_resize, METH_VARARGS, NULL}, - { (char *)"TrieVector_insert", _wrap_TrieVector_insert, METH_VARARGS, NULL}, - { (char *)"TrieVector_reserve", _wrap_TrieVector_reserve, METH_VARARGS, NULL}, - { (char *)"TrieVector_capacity", _wrap_TrieVector_capacity, METH_VARARGS, NULL}, - { (char *)"delete_TrieVector", _wrap_delete_TrieVector, METH_VARARGS, NULL}, - { (char *)"TrieVector_swigregister", TrieVector_swigregister, METH_VARARGS, NULL}, - { (char *)"BoolVector_iterator", _wrap_BoolVector_iterator, METH_VARARGS, NULL}, - { (char *)"BoolVector___nonzero__", _wrap_BoolVector___nonzero__, METH_VARARGS, NULL}, - { (char *)"BoolVector___bool__", _wrap_BoolVector___bool__, METH_VARARGS, NULL}, - { (char *)"BoolVector___len__", _wrap_BoolVector___len__, METH_VARARGS, NULL}, - { (char *)"BoolVector___getslice__", _wrap_BoolVector___getslice__, METH_VARARGS, NULL}, - { (char *)"BoolVector___setslice__", _wrap_BoolVector___setslice__, METH_VARARGS, NULL}, - { (char *)"BoolVector___delslice__", _wrap_BoolVector___delslice__, METH_VARARGS, NULL}, - { (char *)"BoolVector___delitem__", _wrap_BoolVector___delitem__, METH_VARARGS, NULL}, - { (char *)"BoolVector___getitem__", _wrap_BoolVector___getitem__, METH_VARARGS, NULL}, - { (char *)"BoolVector___setitem__", _wrap_BoolVector___setitem__, METH_VARARGS, NULL}, - { (char *)"BoolVector_pop", _wrap_BoolVector_pop, METH_VARARGS, NULL}, - { (char *)"BoolVector_append", _wrap_BoolVector_append, METH_VARARGS, NULL}, - { (char *)"BoolVector_empty", _wrap_BoolVector_empty, METH_VARARGS, NULL}, - { (char *)"BoolVector_size", _wrap_BoolVector_size, METH_VARARGS, NULL}, - { (char *)"BoolVector_swap", _wrap_BoolVector_swap, METH_VARARGS, NULL}, - { (char *)"BoolVector_begin", _wrap_BoolVector_begin, METH_VARARGS, NULL}, - { (char *)"BoolVector_end", _wrap_BoolVector_end, METH_VARARGS, NULL}, - { (char *)"BoolVector_rbegin", _wrap_BoolVector_rbegin, METH_VARARGS, NULL}, - { (char *)"BoolVector_rend", _wrap_BoolVector_rend, METH_VARARGS, NULL}, - { (char *)"BoolVector_clear", _wrap_BoolVector_clear, METH_VARARGS, NULL}, - { (char *)"BoolVector_get_allocator", _wrap_BoolVector_get_allocator, METH_VARARGS, NULL}, - { (char *)"BoolVector_pop_back", _wrap_BoolVector_pop_back, METH_VARARGS, NULL}, - { (char *)"BoolVector_erase", _wrap_BoolVector_erase, METH_VARARGS, NULL}, - { (char *)"new_BoolVector", _wrap_new_BoolVector, METH_VARARGS, NULL}, - { (char *)"BoolVector_push_back", _wrap_BoolVector_push_back, METH_VARARGS, NULL}, - { (char *)"BoolVector_front", _wrap_BoolVector_front, METH_VARARGS, NULL}, - { (char *)"BoolVector_back", _wrap_BoolVector_back, METH_VARARGS, NULL}, - { (char *)"BoolVector_assign", _wrap_BoolVector_assign, METH_VARARGS, NULL}, - { (char *)"BoolVector_resize", _wrap_BoolVector_resize, METH_VARARGS, NULL}, - { (char *)"BoolVector_insert", _wrap_BoolVector_insert, METH_VARARGS, NULL}, - { (char *)"BoolVector_reserve", _wrap_BoolVector_reserve, METH_VARARGS, NULL}, - { (char *)"BoolVector_capacity", _wrap_BoolVector_capacity, METH_VARARGS, NULL}, - { (char *)"delete_BoolVector", _wrap_delete_BoolVector, METH_VARARGS, NULL}, - { (char *)"BoolVector_swigregister", BoolVector_swigregister, METH_VARARGS, NULL}, - { (char *)"IntDoublePairCompSecondRev", _wrap_IntDoublePairCompSecondRev, METH_VARARGS, NULL}, - { (char *)"StringDoublePairCompSecondRev", _wrap_StringDoublePairCompSecondRev, METH_VARARGS, NULL}, - { (char 
*)"DoubleStringPairCompFirstRev", _wrap_DoubleStringPairCompFirstRev, METH_VARARGS, NULL}, - { (char *)"new_RetriveStrEnumerateVocab", _wrap_new_RetriveStrEnumerateVocab, METH_VARARGS, NULL}, - { (char *)"RetriveStrEnumerateVocab_Add", _wrap_RetriveStrEnumerateVocab_Add, METH_VARARGS, NULL}, - { (char *)"RetriveStrEnumerateVocab_vocabulary_set", _wrap_RetriveStrEnumerateVocab_vocabulary_set, METH_VARARGS, NULL}, - { (char *)"RetriveStrEnumerateVocab_vocabulary_get", _wrap_RetriveStrEnumerateVocab_vocabulary_get, METH_VARARGS, NULL}, - { (char *)"delete_RetriveStrEnumerateVocab", _wrap_delete_RetriveStrEnumerateVocab, METH_VARARGS, NULL}, - { (char *)"RetriveStrEnumerateVocab_swigregister", RetriveStrEnumerateVocab_swigregister, METH_VARARGS, NULL}, - { (char *)"new_Scorer", _wrap_new_Scorer, METH_VARARGS, NULL}, - { (char *)"delete_Scorer", _wrap_delete_Scorer, METH_VARARGS, NULL}, - { (char *)"Scorer_get_log_cond_prob", _wrap_Scorer_get_log_cond_prob, METH_VARARGS, NULL}, - { (char *)"Scorer_get_sent_log_prob", _wrap_Scorer_get_sent_log_prob, METH_VARARGS, NULL}, - { (char *)"Scorer_get_max_order", _wrap_Scorer_get_max_order, METH_VARARGS, NULL}, - { (char *)"Scorer_get_dict_size", _wrap_Scorer_get_dict_size, METH_VARARGS, NULL}, - { (char *)"Scorer_is_character_based", _wrap_Scorer_is_character_based, METH_VARARGS, NULL}, - { (char *)"Scorer_reset_params", _wrap_Scorer_reset_params, METH_VARARGS, NULL}, - { (char *)"Scorer_make_ngram", _wrap_Scorer_make_ngram, METH_VARARGS, NULL}, - { (char *)"Scorer_split_labels", _wrap_Scorer_split_labels, METH_VARARGS, NULL}, - { (char *)"Scorer_alpha_set", _wrap_Scorer_alpha_set, METH_VARARGS, NULL}, - { (char *)"Scorer_alpha_get", _wrap_Scorer_alpha_get, METH_VARARGS, NULL}, - { (char *)"Scorer_beta_set", _wrap_Scorer_beta_set, METH_VARARGS, NULL}, - { (char *)"Scorer_beta_get", _wrap_Scorer_beta_get, METH_VARARGS, NULL}, - { (char *)"Scorer_dictionary_set", _wrap_Scorer_dictionary_set, METH_VARARGS, NULL}, - { (char *)"Scorer_dictionary_get", _wrap_Scorer_dictionary_get, METH_VARARGS, NULL}, - { (char *)"Scorer_swigregister", Scorer_swigregister, METH_VARARGS, NULL}, - { (char *)"ctc_beam_search_decoder", _wrap_ctc_beam_search_decoder, METH_VARARGS, NULL}, - { (char *)"ctc_beam_search_decoder_batch", _wrap_ctc_beam_search_decoder_batch, METH_VARARGS, NULL}, - { (char *)"map_sent", _wrap_map_sent, METH_VARARGS, NULL}, - { (char *)"map_batch", _wrap_map_batch, METH_VARARGS, NULL}, - { NULL, NULL, 0, NULL } -}; - - -/* -------- TYPE CONVERSION AND EQUIVALENCE RULES (BEGIN) -------- */ - -static swig_type_info _swigt__p_PathTrie = {"_p_PathTrie", "PathTrie *|std::vector< PathTrie * >::value_type", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_RetriveStrEnumerateVocab = {"_p_RetriveStrEnumerateVocab", "RetriveStrEnumerateVocab *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_Scorer = {"_p_Scorer", "Scorer *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_StringPiece = {"_p_StringPiece", "StringPiece *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_allocator_type = {"_p_allocator_type", "allocator_type *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_char = {"_p_char", "char *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_const_reference = {"_p_const_reference", "const_reference *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_difference_type = {"_p_difference_type", "difference_type *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_first_type = {"_p_first_type", "first_type *", 0, 0, 
(void*)0, 0}; -static swig_type_info _swigt__p_fst__StdVectorFst = {"_p_fst__StdVectorFst", "fst::StdVectorFst *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_lm__WordIndex = {"_p_lm__WordIndex", "lm::WordIndex *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_p_PyObject = {"_p_p_PyObject", "PyObject **", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_reference = {"_p_reference", "reference *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_second_type = {"_p_second_type", "second_type *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_size_type = {"_p_size_type", "size_type *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__allocatorT_PathTrie_p_t = {"_p_std__allocatorT_PathTrie_p_t", "std::vector< PathTrie * >::allocator_type *|std::allocator< PathTrie * > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__allocatorT_bool_t = {"_p_std__allocatorT_bool_t", "std::vector< bool >::allocator_type *|std::allocator< bool > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__allocatorT_double_t = {"_p_std__allocatorT_double_t", "std::vector< double >::allocator_type *|std::allocator< double > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__allocatorT_float_t = {"_p_std__allocatorT_float_t", "std::vector< float >::allocator_type *|std::allocator< float > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__allocatorT_int_t = {"_p_std__allocatorT_int_t", "std::vector< int >::allocator_type *|std::allocator< int > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t = {"_p_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t", "std::vector< std::pair< double,std::vector< int > > >::allocator_type *|std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t = {"_p_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t", "std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > *|std::vector< std::pair< float,std::vector< int > > >::allocator_type *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__allocatorT_std__string_t = {"_p_std__allocatorT_std__string_t", "std::vector< std::string >::allocator_type *|std::allocator< std::string > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t = {"_p_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t", "std::vector< std::vector< double > >::allocator_type *|std::allocator< std::vector< double,std::allocator< double > > > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t = {"_p_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t", "std::vector< std::vector< int > >::allocator_type *|std::allocator< std::vector< int,std::allocator< int > > > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t = {"_p_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t", "std::allocator< std::vector< std::pair< double,std::vector< 
int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > *|std::vector< std::vector< std::pair< double,std::vector< int > > > >::allocator_type *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t = {"_p_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t", "std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > *|std::vector< std::vector< std::vector< double > > >::allocator_type *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t = {"_p_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t", "std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > *|std::vector< std::vector< std::vector< int > > >::allocator_type *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__invalid_argument = {"_p_std__invalid_argument", "std::invalid_argument *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__pairT_double_std__string_t = {"_p_std__pairT_double_std__string_t", "std::pair< double,std::string > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t = {"_p_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t", "std::pair< double,std::vector< int,std::allocator< int > > > *|std::vector< std::pair< double,std::vector< int > > >::value_type *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t = {"_p_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t", "std::pair< float,std::vector< int > > *|std::pair< float,std::vector< int,std::allocator< int > > > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__pairT_int_double_t = {"_p_std__pairT_int_double_t", "std::pair< int,double > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__pairT_std__string_double_t = {"_p_std__pairT_std__string_double_t", "std::pair< std::string,double > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__shared_ptrT_fst__SortedMatcherT_fst__StdVectorFst_t_t = {"_p_std__shared_ptrT_fst__SortedMatcherT_fst__StdVectorFst_t_t", "std::shared_ptr< fst::SortedMatcher< fst::StdVectorFst > > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t = {"_p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t", "std::vector< PathTrie *,std::allocator< PathTrie * > > *|std::vector< PathTrie * > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__vectorT_bool_std__allocatorT_bool_t_t = {"_p_std__vectorT_bool_std__allocatorT_bool_t_t", "std::vector< bool,std::allocator< bool > > *|std::vector< bool > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__vectorT_double_std__allocatorT_double_t_t = {"_p_std__vectorT_double_std__allocatorT_double_t_t", "std::vector< double,std::allocator< double > > *|std::vector< double > *", 0, 0, (void*)0, 0}; -static swig_type_info 
_swigt__p_std__vectorT_float_std__allocatorT_float_t_t = {"_p_std__vectorT_float_std__allocatorT_float_t_t", "std::vector< float > *|std::vector< float,std::allocator< float > > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__vectorT_int_std__allocatorT_int_t_t = {"_p_std__vectorT_int_std__allocatorT_int_t_t", "std::vector< int,std::allocator< int > > *|std::vector< int > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t = {"_p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t", "std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > *|std::vector< std::pair< double,std::vector< int,std::allocator< int > > > > *|std::vector< std::pair< double,std::vector< int > > > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t = {"_p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t", "std::vector< std::pair< float,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< float,std::vector< int,std::allocator< int > > > > > *|std::vector< std::pair< float,std::vector< int,std::allocator< int > > > > *|std::vector< std::pair< float,std::vector< int > > > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__vectorT_std__string_std__allocatorT_std__string_t_t = {"_p_std__vectorT_std__string_std__allocatorT_std__string_t_t", "std::vector< std::string,std::allocator< std::string > > *|std::vector< std::string > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t = {"_p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t", "std::vector< std::vector< double > > *|std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > *|std::vector< std::vector< double,std::allocator< double > > > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t = {"_p_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t", "std::vector< std::vector< int > > *|std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > *|std::vector< std::vector< int,std::allocator< int > > > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t = 
{"_p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t", "std::vector< std::vector< std::pair< double,std::vector< int > > > > *|std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > >,std::allocator< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > > *|std::vector< std::vector< std::pair< double,std::vector< int,std::allocator< int > > >,std::allocator< std::pair< double,std::vector< int,std::allocator< int > > > > > > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t = {"_p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t", "std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > >,std::allocator< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > > *|std::vector< std::vector< std::vector< double,std::allocator< double > >,std::allocator< std::vector< double,std::allocator< double > > > > > *|std::vector< std::vector< std::vector< double > > > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t = {"_p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t", "std::vector< std::vector< std::vector< int > > > *|std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > >,std::allocator< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > > *|std::vector< std::vector< std::vector< int,std::allocator< int > >,std::allocator< std::vector< int,std::allocator< int > > > > > *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_swig__SwigPyIterator = {"_p_swig__SwigPyIterator", "swig::SwigPyIterator *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_value_type = {"_p_value_type", "value_type *", 0, 0, (void*)0, 0}; -static swig_type_info _swigt__p_void = {"_p_void", "void *", 0, 0, (void*)0, 0}; - -static swig_type_info *swig_type_initial[] = { - &_swigt__p_PathTrie, - &_swigt__p_RetriveStrEnumerateVocab, - 
&_swigt__p_Scorer, - &_swigt__p_StringPiece, - &_swigt__p_allocator_type, - &_swigt__p_char, - &_swigt__p_const_reference, - &_swigt__p_difference_type, - &_swigt__p_first_type, - &_swigt__p_fst__StdVectorFst, - &_swigt__p_lm__WordIndex, - &_swigt__p_p_PyObject, - &_swigt__p_reference, - &_swigt__p_second_type, - &_swigt__p_size_type, - &_swigt__p_std__allocatorT_PathTrie_p_t, - &_swigt__p_std__allocatorT_bool_t, - &_swigt__p_std__allocatorT_double_t, - &_swigt__p_std__allocatorT_float_t, - &_swigt__p_std__allocatorT_int_t, - &_swigt__p_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t, - &_swigt__p_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t, - &_swigt__p_std__allocatorT_std__string_t, - &_swigt__p_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t, - &_swigt__p_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t, - &_swigt__p_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, - &_swigt__p_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t, - &_swigt__p_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, - &_swigt__p_std__invalid_argument, - &_swigt__p_std__pairT_double_std__string_t, - &_swigt__p_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t, - &_swigt__p_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t, - &_swigt__p_std__pairT_int_double_t, - &_swigt__p_std__pairT_std__string_double_t, - &_swigt__p_std__shared_ptrT_fst__SortedMatcherT_fst__StdVectorFst_t_t, - &_swigt__p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, - &_swigt__p_std__vectorT_bool_std__allocatorT_bool_t_t, - &_swigt__p_std__vectorT_double_std__allocatorT_double_t_t, - &_swigt__p_std__vectorT_float_std__allocatorT_float_t_t, - &_swigt__p_std__vectorT_int_std__allocatorT_int_t_t, - &_swigt__p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, - &_swigt__p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, - &_swigt__p_std__vectorT_std__string_std__allocatorT_std__string_t_t, - &_swigt__p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, - &_swigt__p_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t, - &_swigt__p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, - &_swigt__p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, - 
&_swigt__p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, - &_swigt__p_swig__SwigPyIterator, - &_swigt__p_value_type, - &_swigt__p_void, -}; - -static swig_cast_info _swigc__p_PathTrie[] = { {&_swigt__p_PathTrie, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_RetriveStrEnumerateVocab[] = { {&_swigt__p_RetriveStrEnumerateVocab, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_Scorer[] = { {&_swigt__p_Scorer, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_StringPiece[] = { {&_swigt__p_StringPiece, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_allocator_type[] = { {&_swigt__p_allocator_type, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_char[] = { {&_swigt__p_char, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_const_reference[] = { {&_swigt__p_const_reference, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_difference_type[] = { {&_swigt__p_difference_type, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_first_type[] = { {&_swigt__p_first_type, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_fst__StdVectorFst[] = { {&_swigt__p_fst__StdVectorFst, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_lm__WordIndex[] = { {&_swigt__p_lm__WordIndex, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_p_PyObject[] = { {&_swigt__p_p_PyObject, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_reference[] = { {&_swigt__p_reference, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_second_type[] = { {&_swigt__p_second_type, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_size_type[] = { {&_swigt__p_size_type, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__allocatorT_PathTrie_p_t[] = { {&_swigt__p_std__allocatorT_PathTrie_p_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__allocatorT_bool_t[] = { {&_swigt__p_std__allocatorT_bool_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__allocatorT_double_t[] = { {&_swigt__p_std__allocatorT_double_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__allocatorT_float_t[] = { {&_swigt__p_std__allocatorT_float_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__allocatorT_int_t[] = { {&_swigt__p_std__allocatorT_int_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t[] = { {&_swigt__p_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t[] = { {&_swigt__p_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__allocatorT_std__string_t[] = { {&_swigt__p_std__allocatorT_std__string_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t[] = { {&_swigt__p_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t[] = { {&_swigt__p_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info 
_swigc__p_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t[] = { {&_swigt__p_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t[] = { {&_swigt__p_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t[] = { {&_swigt__p_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__invalid_argument[] = { {&_swigt__p_std__invalid_argument, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__pairT_double_std__string_t[] = { {&_swigt__p_std__pairT_double_std__string_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t[] = { {&_swigt__p_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t[] = { {&_swigt__p_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__pairT_int_double_t[] = { {&_swigt__p_std__pairT_int_double_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__pairT_std__string_double_t[] = { {&_swigt__p_std__pairT_std__string_double_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__shared_ptrT_fst__SortedMatcherT_fst__StdVectorFst_t_t[] = { {&_swigt__p_std__shared_ptrT_fst__SortedMatcherT_fst__StdVectorFst_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t[] = { {&_swigt__p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__vectorT_bool_std__allocatorT_bool_t_t[] = { {&_swigt__p_std__vectorT_bool_std__allocatorT_bool_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__vectorT_double_std__allocatorT_double_t_t[] = { {&_swigt__p_std__vectorT_double_std__allocatorT_double_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__vectorT_float_std__allocatorT_float_t_t[] = { {&_swigt__p_std__vectorT_float_std__allocatorT_float_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__vectorT_int_std__allocatorT_int_t_t[] = { {&_swigt__p_std__vectorT_int_std__allocatorT_int_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t[] = { {&_swigt__p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info 
_swigc__p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t[] = { {&_swigt__p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__vectorT_std__string_std__allocatorT_std__string_t_t[] = { {&_swigt__p_std__vectorT_std__string_std__allocatorT_std__string_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t[] = { {&_swigt__p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t[] = { {&_swigt__p_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t[] = { {&_swigt__p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t[] = { {&_swigt__p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t[] = { {&_swigt__p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_swig__SwigPyIterator[] = { {&_swigt__p_swig__SwigPyIterator, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_value_type[] = { {&_swigt__p_value_type, 0, 0, 0},{0, 0, 0, 0}}; -static swig_cast_info _swigc__p_void[] = { {&_swigt__p_void, 0, 0, 0},{0, 0, 0, 0}}; - -static swig_cast_info *swig_cast_initial[] = { - 
_swigc__p_PathTrie, - _swigc__p_RetriveStrEnumerateVocab, - _swigc__p_Scorer, - _swigc__p_StringPiece, - _swigc__p_allocator_type, - _swigc__p_char, - _swigc__p_const_reference, - _swigc__p_difference_type, - _swigc__p_first_type, - _swigc__p_fst__StdVectorFst, - _swigc__p_lm__WordIndex, - _swigc__p_p_PyObject, - _swigc__p_reference, - _swigc__p_second_type, - _swigc__p_size_type, - _swigc__p_std__allocatorT_PathTrie_p_t, - _swigc__p_std__allocatorT_bool_t, - _swigc__p_std__allocatorT_double_t, - _swigc__p_std__allocatorT_float_t, - _swigc__p_std__allocatorT_int_t, - _swigc__p_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t, - _swigc__p_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t, - _swigc__p_std__allocatorT_std__string_t, - _swigc__p_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t, - _swigc__p_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t, - _swigc__p_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, - _swigc__p_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t, - _swigc__p_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, - _swigc__p_std__invalid_argument, - _swigc__p_std__pairT_double_std__string_t, - _swigc__p_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t, - _swigc__p_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t, - _swigc__p_std__pairT_int_double_t, - _swigc__p_std__pairT_std__string_double_t, - _swigc__p_std__shared_ptrT_fst__SortedMatcherT_fst__StdVectorFst_t_t, - _swigc__p_std__vectorT_PathTrie_p_std__allocatorT_PathTrie_p_t_t, - _swigc__p_std__vectorT_bool_std__allocatorT_bool_t_t, - _swigc__p_std__vectorT_double_std__allocatorT_double_t_t, - _swigc__p_std__vectorT_float_std__allocatorT_float_t_t, - _swigc__p_std__vectorT_int_std__allocatorT_int_t_t, - _swigc__p_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, - _swigc__p_std__vectorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_float_std__vectorT_int_std__allocatorT_int_t_t_t_t_t, - _swigc__p_std__vectorT_std__string_std__allocatorT_std__string_t_t, - _swigc__p_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t, - _swigc__p_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t, - _swigc__p_std__vectorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_std__allocatorT_std__vectorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_std__allocatorT_std__pairT_double_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t_t, - _swigc__p_std__vectorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_double_std__allocatorT_double_t_t_std__allocatorT_std__vectorT_double_std__allocatorT_double_t_t_t_t_t_t, - 
_swigc__p_std__vectorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_std__allocatorT_std__vectorT_std__vectorT_int_std__allocatorT_int_t_t_std__allocatorT_std__vectorT_int_std__allocatorT_int_t_t_t_t_t_t, - _swigc__p_swig__SwigPyIterator, - _swigc__p_value_type, - _swigc__p_void, -}; - - -/* -------- TYPE CONVERSION AND EQUIVALENCE RULES (END) -------- */ - -static swig_const_info swig_const_table[] = { -{0, 0, 0, 0.0, 0, 0}}; - -#ifdef __cplusplus -} -#endif -/* ----------------------------------------------------------------------------- - * Type initialization: - * This problem is tough by the requirement that no dynamic - * memory is used. Also, since swig_type_info structures store pointers to - * swig_cast_info structures and swig_cast_info structures store pointers back - * to swig_type_info structures, we need some lookup code at initialization. - * The idea is that swig generates all the structures that are needed. - * The runtime then collects these partially filled structures. - * The SWIG_InitializeModule function takes these initial arrays out of - * swig_module, and does all the lookup, filling in the swig_module.types - * array with the correct data and linking the correct swig_cast_info - * structures together. - * - * The generated swig_type_info structures are assigned statically to an initial - * array. We just loop through that array, and handle each type individually. - * First we lookup if this type has been already loaded, and if so, use the - * loaded structure instead of the generated one. Then we have to fill in the - * cast linked list. The cast data is initially stored in something like a - * two-dimensional array. Each row corresponds to a type (there are the same - * number of rows as there are in the swig_type_initial array). Each entry in - * a column is one of the swig_cast_info structures for that type. - * The cast_initial array is actually an array of arrays, because each row has - * a variable number of columns. So to actually build the cast linked list, - * we find the array of casts associated with the type, and loop through it - * adding the casts to the list. The one last trick we need to do is making - * sure the type pointer in the swig_cast_info struct is correct. - * - * First off, we lookup the cast->type name to see if it is already loaded. - * There are three cases to handle: - * 1) If the cast->type has already been loaded AND the type we are adding - * casting info to has not been loaded (it is in this module), THEN we - * replace the cast->type pointer with the type pointer that has already - * been loaded. - * 2) If BOTH types (the one we are adding casting info to, and the - * cast->type) are loaded, THEN the cast info has already been loaded by - * the previous module so we just ignore it. - * 3) Finally, if cast->type has not already been loaded, then we add that - * swig_cast_info to the linked list (because the cast->type) pointer will - * be correct. 
- * ----------------------------------------------------------------------------- */ - -#ifdef __cplusplus -extern "C" { -#if 0 -} /* c-mode */ -#endif -#endif - -#if 0 -#define SWIGRUNTIME_DEBUG -#endif - - -SWIGRUNTIME void -SWIG_InitializeModule(void *clientdata) { - size_t i; - swig_module_info *module_head, *iter; - int init; - - /* check to see if the circular list has been setup, if not, set it up */ - if (swig_module.next==0) { - /* Initialize the swig_module */ - swig_module.type_initial = swig_type_initial; - swig_module.cast_initial = swig_cast_initial; - swig_module.next = &swig_module; - init = 1; - } else { - init = 0; - } - - /* Try and load any already created modules */ - module_head = SWIG_GetModule(clientdata); - if (!module_head) { - /* This is the first module loaded for this interpreter */ - /* so set the swig module into the interpreter */ - SWIG_SetModule(clientdata, &swig_module); - } else { - /* the interpreter has loaded a SWIG module, but has it loaded this one? */ - iter=module_head; - do { - if (iter==&swig_module) { - /* Our module is already in the list, so there's nothing more to do. */ - return; - } - iter=iter->next; - } while (iter!= module_head); - - /* otherwise we must add our module into the list */ - swig_module.next = module_head->next; - module_head->next = &swig_module; - } - - /* When multiple interpreters are used, a module could have already been initialized in - a different interpreter, but not yet have a pointer in this interpreter. - In this case, we do not want to continue adding types... everything should be - set up already */ - if (init == 0) return; - - /* Now work on filling in swig_module.types */ -#ifdef SWIGRUNTIME_DEBUG - printf("SWIG_InitializeModule: size %d\n", swig_module.size); -#endif - for (i = 0; i < swig_module.size; ++i) { - swig_type_info *type = 0; - swig_type_info *ret; - swig_cast_info *cast; - -#ifdef SWIGRUNTIME_DEBUG - printf("SWIG_InitializeModule: type %d %s\n", i, swig_module.type_initial[i]->name); -#endif - - /* if there is another module already loaded */ - if (swig_module.next != &swig_module) { - type = SWIG_MangledTypeQueryModule(swig_module.next, &swig_module, swig_module.type_initial[i]->name); - } - if (type) { - /* Overwrite clientdata field */ -#ifdef SWIGRUNTIME_DEBUG - printf("SWIG_InitializeModule: found type %s\n", type->name); -#endif - if (swig_module.type_initial[i]->clientdata) { - type->clientdata = swig_module.type_initial[i]->clientdata; -#ifdef SWIGRUNTIME_DEBUG - printf("SWIG_InitializeModule: found and overwrite type %s \n", type->name); -#endif - } - } else { - type = swig_module.type_initial[i]; - } - - /* Insert casting types */ - cast = swig_module.cast_initial[i]; - while (cast->type) { - /* Don't need to add information already in the list */ - ret = 0; -#ifdef SWIGRUNTIME_DEBUG - printf("SWIG_InitializeModule: look cast %s\n", cast->type->name); -#endif - if (swig_module.next != &swig_module) { - ret = SWIG_MangledTypeQueryModule(swig_module.next, &swig_module, cast->type->name); -#ifdef SWIGRUNTIME_DEBUG - if (ret) printf("SWIG_InitializeModule: found cast %s\n", ret->name); -#endif - } - if (ret) { - if (type == swig_module.type_initial[i]) { -#ifdef SWIGRUNTIME_DEBUG - printf("SWIG_InitializeModule: skip old type %s\n", ret->name); -#endif - cast->type = ret; - ret = 0; - } else { - /* Check for casting already in the list */ - swig_cast_info *ocast = SWIG_TypeCheck(ret->name, type); -#ifdef SWIGRUNTIME_DEBUG - if (ocast) printf("SWIG_InitializeModule: skip old cast %s\n", 
ret->name); -#endif - if (!ocast) ret = 0; - } - } - - if (!ret) { -#ifdef SWIGRUNTIME_DEBUG - printf("SWIG_InitializeModule: adding cast %s\n", cast->type->name); -#endif - if (type->cast) { - type->cast->prev = cast; - cast->next = type->cast; - } - type->cast = cast; - } - cast++; - } - /* Set entry in modules->types array equal to the type */ - swig_module.types[i] = type; - } - swig_module.types[i] = 0; - -#ifdef SWIGRUNTIME_DEBUG - printf("**** SWIG_InitializeModule: Cast List ******\n"); - for (i = 0; i < swig_module.size; ++i) { - int j = 0; - swig_cast_info *cast = swig_module.cast_initial[i]; - printf("SWIG_InitializeModule: type %d %s\n", i, swig_module.type_initial[i]->name); - while (cast->type) { - printf("SWIG_InitializeModule: cast type %s\n", cast->type->name); - cast++; - ++j; - } - printf("---- Total casts: %d\n",j); - } - printf("**** SWIG_InitializeModule: Cast List ******\n"); -#endif -} - -/* This function will propagate the clientdata field of type to -* any new swig_type_info structures that have been added into the list -* of equivalent types. It is like calling -* SWIG_TypeClientData(type, clientdata) a second time. -*/ -SWIGRUNTIME void -SWIG_PropagateClientData(void) { - size_t i; - swig_cast_info *equiv; - static int init_run = 0; - - if (init_run) return; - init_run = 1; - - for (i = 0; i < swig_module.size; i++) { - if (swig_module.types[i]->clientdata) { - equiv = swig_module.types[i]->cast; - while (equiv) { - if (!equiv->converter) { - if (equiv->type && !equiv->type->clientdata) - SWIG_TypeClientData(equiv->type, swig_module.types[i]->clientdata); - } - equiv = equiv->next; - } - } - } -} - -#ifdef __cplusplus -#if 0 -{ - /* c-mode */ -#endif -} -#endif - - - -#ifdef __cplusplus -extern "C" { -#endif - - /* Python-specific SWIG API */ -#define SWIG_newvarlink() SWIG_Python_newvarlink() -#define SWIG_addvarlink(p, name, get_attr, set_attr) SWIG_Python_addvarlink(p, name, get_attr, set_attr) -#define SWIG_InstallConstants(d, constants) SWIG_Python_InstallConstants(d, constants) - - /* ----------------------------------------------------------------------------- - * global variable support code. 
- * ----------------------------------------------------------------------------- */ - - typedef struct swig_globalvar { - char *name; /* Name of global variable */ - PyObject *(*get_attr)(void); /* Return the current value */ - int (*set_attr)(PyObject *); /* Set the value */ - struct swig_globalvar *next; - } swig_globalvar; - - typedef struct swig_varlinkobject { - PyObject_HEAD - swig_globalvar *vars; - } swig_varlinkobject; - - SWIGINTERN PyObject * - swig_varlink_repr(swig_varlinkobject *SWIGUNUSEDPARM(v)) { -#if PY_VERSION_HEX >= 0x03000000 - return PyUnicode_InternFromString(""); -#else - return PyString_FromString(""); -#endif - } - - SWIGINTERN PyObject * - swig_varlink_str(swig_varlinkobject *v) { -#if PY_VERSION_HEX >= 0x03000000 - PyObject *str = PyUnicode_InternFromString("("); - PyObject *tail; - PyObject *joined; - swig_globalvar *var; - for (var = v->vars; var; var=var->next) { - tail = PyUnicode_FromString(var->name); - joined = PyUnicode_Concat(str, tail); - Py_DecRef(str); - Py_DecRef(tail); - str = joined; - if (var->next) { - tail = PyUnicode_InternFromString(", "); - joined = PyUnicode_Concat(str, tail); - Py_DecRef(str); - Py_DecRef(tail); - str = joined; - } - } - tail = PyUnicode_InternFromString(")"); - joined = PyUnicode_Concat(str, tail); - Py_DecRef(str); - Py_DecRef(tail); - str = joined; -#else - PyObject *str = PyString_FromString("("); - swig_globalvar *var; - for (var = v->vars; var; var=var->next) { - PyString_ConcatAndDel(&str,PyString_FromString(var->name)); - if (var->next) PyString_ConcatAndDel(&str,PyString_FromString(", ")); - } - PyString_ConcatAndDel(&str,PyString_FromString(")")); -#endif - return str; - } - - SWIGINTERN int - swig_varlink_print(swig_varlinkobject *v, FILE *fp, int SWIGUNUSEDPARM(flags)) { - char *tmp; - PyObject *str = swig_varlink_str(v); - fprintf(fp,"Swig global variables "); - fprintf(fp,"%s\n", tmp = SWIG_Python_str_AsChar(str)); - SWIG_Python_str_DelForPy3(tmp); - Py_DECREF(str); - return 0; - } - - SWIGINTERN void - swig_varlink_dealloc(swig_varlinkobject *v) { - swig_globalvar *var = v->vars; - while (var) { - swig_globalvar *n = var->next; - free(var->name); - free(var); - var = n; - } - } - - SWIGINTERN PyObject * - swig_varlink_getattr(swig_varlinkobject *v, char *n) { - PyObject *res = NULL; - swig_globalvar *var = v->vars; - while (var) { - if (strcmp(var->name,n) == 0) { - res = (*var->get_attr)(); - break; - } - var = var->next; - } - if (res == NULL && !PyErr_Occurred()) { - PyErr_Format(PyExc_AttributeError, "Unknown C global variable '%s'", n); - } - return res; - } - - SWIGINTERN int - swig_varlink_setattr(swig_varlinkobject *v, char *n, PyObject *p) { - int res = 1; - swig_globalvar *var = v->vars; - while (var) { - if (strcmp(var->name,n) == 0) { - res = (*var->set_attr)(p); - break; - } - var = var->next; - } - if (res == 1 && !PyErr_Occurred()) { - PyErr_Format(PyExc_AttributeError, "Unknown C global variable '%s'", n); - } - return res; - } - - SWIGINTERN PyTypeObject* - swig_varlink_type(void) { - static char varlink__doc__[] = "Swig var link object"; - static PyTypeObject varlink_type; - static int type_init = 0; - if (!type_init) { - const PyTypeObject tmp = { -#if PY_VERSION_HEX >= 0x03000000 - PyVarObject_HEAD_INIT(NULL, 0) -#else - PyObject_HEAD_INIT(NULL) - 0, /* ob_size */ -#endif - (char *)"swigvarlink", /* tp_name */ - sizeof(swig_varlinkobject), /* tp_basicsize */ - 0, /* tp_itemsize */ - (destructor) swig_varlink_dealloc, /* tp_dealloc */ - (printfunc) swig_varlink_print, /* tp_print */ - 
(getattrfunc) swig_varlink_getattr, /* tp_getattr */ - (setattrfunc) swig_varlink_setattr, /* tp_setattr */ - 0, /* tp_compare */ - (reprfunc) swig_varlink_repr, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - (reprfunc) swig_varlink_str, /* tp_str */ - 0, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ - 0, /* tp_flags */ - varlink__doc__, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ -#if PY_VERSION_HEX >= 0x02020000 - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* tp_iter -> tp_weaklist */ -#endif -#if PY_VERSION_HEX >= 0x02030000 - 0, /* tp_del */ -#endif -#if PY_VERSION_HEX >= 0x02060000 - 0, /* tp_version_tag */ -#endif -#if PY_VERSION_HEX >= 0x03040000 - 0, /* tp_finalize */ -#endif -#ifdef COUNT_ALLOCS - 0, /* tp_allocs */ - 0, /* tp_frees */ - 0, /* tp_maxalloc */ -#if PY_VERSION_HEX >= 0x02050000 - 0, /* tp_prev */ -#endif - 0 /* tp_next */ -#endif - }; - varlink_type = tmp; - type_init = 1; -#if PY_VERSION_HEX < 0x02020000 - varlink_type.ob_type = &PyType_Type; -#else - if (PyType_Ready(&varlink_type) < 0) - return NULL; -#endif - } - return &varlink_type; - } - - /* Create a variable linking object for use later */ - SWIGINTERN PyObject * - SWIG_Python_newvarlink(void) { - swig_varlinkobject *result = PyObject_NEW(swig_varlinkobject, swig_varlink_type()); - if (result) { - result->vars = 0; - } - return ((PyObject*) result); - } - - SWIGINTERN void - SWIG_Python_addvarlink(PyObject *p, char *name, PyObject *(*get_attr)(void), int (*set_attr)(PyObject *p)) { - swig_varlinkobject *v = (swig_varlinkobject *) p; - swig_globalvar *gv = (swig_globalvar *) malloc(sizeof(swig_globalvar)); - if (gv) { - size_t size = strlen(name)+1; - gv->name = (char *)malloc(size); - if (gv->name) { - strncpy(gv->name,name,size); - gv->get_attr = get_attr; - gv->set_attr = set_attr; - gv->next = v->vars; - } - } - v->vars = gv; - } - - SWIGINTERN PyObject * - SWIG_globals(void) { - static PyObject *_SWIG_globals = 0; - if (!_SWIG_globals) _SWIG_globals = SWIG_newvarlink(); - return _SWIG_globals; - } - - /* ----------------------------------------------------------------------------- - * constants/methods manipulation - * ----------------------------------------------------------------------------- */ - - /* Install Constants */ - SWIGINTERN void - SWIG_Python_InstallConstants(PyObject *d, swig_const_info constants[]) { - PyObject *obj = 0; - size_t i; - for (i = 0; constants[i].type; ++i) { - switch(constants[i].type) { - case SWIG_PY_POINTER: - obj = SWIG_InternalNewPointerObj(constants[i].pvalue, *(constants[i]).ptype,0); - break; - case SWIG_PY_BINARY: - obj = SWIG_NewPackedObj(constants[i].pvalue, constants[i].lvalue, *(constants[i].ptype)); - break; - default: - obj = 0; - break; - } - if (obj) { - PyDict_SetItemString(d, constants[i].name, obj); - Py_DECREF(obj); - } - } - } - - /* -----------------------------------------------------------------------------*/ - /* Fix SwigMethods to carry the callback ptrs when needed */ - /* -----------------------------------------------------------------------------*/ - - SWIGINTERN void - SWIG_Python_FixMethods(PyMethodDef *methods, - swig_const_info *const_table, - swig_type_info **types, - swig_type_info **types_initial) { - size_t i; - for (i = 0; methods[i].ml_name; ++i) { - const char *c = methods[i].ml_doc; - if (!c) continue; - c = strstr(c, "swig_ptr: "); - if (c) { - int j; - 
swig_const_info *ci = 0; - const char *name = c + 10; - for (j = 0; const_table[j].type; ++j) { - if (strncmp(const_table[j].name, name, - strlen(const_table[j].name)) == 0) { - ci = &(const_table[j]); - break; - } - } - if (ci) { - void *ptr = (ci->type == SWIG_PY_POINTER) ? ci->pvalue : 0; - if (ptr) { - size_t shift = (ci->ptype) - types; - swig_type_info *ty = types_initial[shift]; - size_t ldoc = (c - methods[i].ml_doc); - size_t lptr = strlen(ty->name)+2*sizeof(void*)+2; - char *ndoc = (char*)malloc(ldoc + lptr + 10); - if (ndoc) { - char *buff = ndoc; - memcpy(buff, methods[i].ml_doc, ldoc); - buff += ldoc; - memcpy(buff, "swig_ptr: ", 10); - buff += 10; - SWIG_PackVoidPtr(buff, ptr, ty->name, lptr); - methods[i].ml_doc = ndoc; - } - } - } - } - } - } - -#ifdef __cplusplus -} -#endif - -/* -----------------------------------------------------------------------------* - * Partial Init method - * -----------------------------------------------------------------------------*/ - -#ifdef __cplusplus -extern "C" -#endif - -SWIGEXPORT -#if PY_VERSION_HEX >= 0x03000000 -PyObject* -#else -void -#endif -SWIG_init(void) { - PyObject *m, *d, *md; -#if PY_VERSION_HEX >= 0x03000000 - static struct PyModuleDef SWIG_module = { -# if PY_VERSION_HEX >= 0x03020000 - PyModuleDef_HEAD_INIT, -# else - { - PyObject_HEAD_INIT(NULL) - NULL, /* m_init */ - 0, /* m_index */ - NULL, /* m_copy */ - }, -# endif - (char *) SWIG_name, - NULL, - -1, - SwigMethods, - NULL, - NULL, - NULL, - NULL - }; -#endif - -#if defined(SWIGPYTHON_BUILTIN) - static SwigPyClientData SwigPyObject_clientdata = { - 0, 0, 0, 0, 0, 0, 0 - }; - static PyGetSetDef this_getset_def = { - (char *)"this", &SwigPyBuiltin_ThisClosure, NULL, NULL, NULL - }; - static SwigPyGetSet thisown_getset_closure = { - (PyCFunction) SwigPyObject_own, - (PyCFunction) SwigPyObject_own - }; - static PyGetSetDef thisown_getset_def = { - (char *)"thisown", SwigPyBuiltin_GetterClosure, SwigPyBuiltin_SetterClosure, NULL, &thisown_getset_closure - }; - PyTypeObject *builtin_pytype; - int builtin_base_count; - swig_type_info *builtin_basetype; - PyObject *tuple; - PyGetSetDescrObject *static_getset; - PyTypeObject *metatype; - PyTypeObject *swigpyobject; - SwigPyClientData *cd; - PyObject *public_interface, *public_symbol; - PyObject *this_descr; - PyObject *thisown_descr; - PyObject *self = 0; - int i; - - (void)builtin_pytype; - (void)builtin_base_count; - (void)builtin_basetype; - (void)tuple; - (void)static_getset; - (void)self; - - /* Metaclass is used to implement static member variables */ - metatype = SwigPyObjectType(); - assert(metatype); -#endif - - /* Fix SwigMethods to carry the callback ptrs when needed */ - SWIG_Python_FixMethods(SwigMethods, swig_const_table, swig_types, swig_type_initial); - -#if PY_VERSION_HEX >= 0x03000000 - m = PyModule_Create(&SWIG_module); -#else - m = Py_InitModule((char *) SWIG_name, SwigMethods); -#endif - - md = d = PyModule_GetDict(m); - (void)md; - - SWIG_InitializeModule(0); - -#ifdef SWIGPYTHON_BUILTIN - swigpyobject = SwigPyObject_TypeOnce(); - - SwigPyObject_stype = SWIG_MangledTypeQuery("_p_SwigPyObject"); - assert(SwigPyObject_stype); - cd = (SwigPyClientData*) SwigPyObject_stype->clientdata; - if (!cd) { - SwigPyObject_stype->clientdata = &SwigPyObject_clientdata; - SwigPyObject_clientdata.pytype = swigpyobject; - } else if (swigpyobject->tp_basicsize != cd->pytype->tp_basicsize) { - PyErr_SetString(PyExc_RuntimeError, "Import error: attempted to load two incompatible swig-generated modules."); -# if 
PY_VERSION_HEX >= 0x03000000 - return NULL; -# else - return; -# endif - } - - /* All objects have a 'this' attribute */ - this_descr = PyDescr_NewGetSet(SwigPyObject_type(), &this_getset_def); - (void)this_descr; - - /* All objects have a 'thisown' attribute */ - thisown_descr = PyDescr_NewGetSet(SwigPyObject_type(), &thisown_getset_def); - (void)thisown_descr; - - public_interface = PyList_New(0); - public_symbol = 0; - (void)public_symbol; - - PyDict_SetItemString(md, "__all__", public_interface); - Py_DECREF(public_interface); - for (i = 0; SwigMethods[i].ml_name != NULL; ++i) - SwigPyBuiltin_AddPublicSymbol(public_interface, SwigMethods[i].ml_name); - for (i = 0; swig_const_table[i].name != 0; ++i) - SwigPyBuiltin_AddPublicSymbol(public_interface, swig_const_table[i].name); -#endif - - SWIG_InstallConstants(d,swig_const_table); - - PyDict_SetItemString(md,(char *)"cvar", SWIG_globals()); - SWIG_addvarlink(SWIG_globals(),(char *)"OOV_SCORE",Swig_var_OOV_SCORE_get, Swig_var_OOV_SCORE_set); - SWIG_addvarlink(SWIG_globals(),(char *)"START_TOKEN",Swig_var_START_TOKEN_get, Swig_var_START_TOKEN_set); - SWIG_addvarlink(SWIG_globals(),(char *)"UNK_TOKEN",Swig_var_UNK_TOKEN_get, Swig_var_UNK_TOKEN_set); - SWIG_addvarlink(SWIG_globals(),(char *)"END_TOKEN",Swig_var_END_TOKEN_get, Swig_var_END_TOKEN_set); -#if PY_VERSION_HEX >= 0x03000000 - return m; -#else - return; -#endif -} - diff --git a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/path_trie.cpp b/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/path_trie.cpp deleted file mode 100644 index e68affacaed5614261ac64b19a6ab3c2b4089319..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/path_trie.cpp +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "path_trie.h" - -#include -#include -#include -#include -#include - -#include "decoder_utils.h" - -PathTrie::PathTrie() { - log_prob_b_prev = -NUM_FLT_INF; - log_prob_nb_prev = -NUM_FLT_INF; - log_prob_b_cur = -NUM_FLT_INF; - log_prob_nb_cur = -NUM_FLT_INF; - score = -NUM_FLT_INF; - - ROOT_ = -1; - character = ROOT_; - exists_ = true; - parent = nullptr; - - dictionary_ = nullptr; - dictionary_state_ = 0; - has_dictionary_ = false; - - matcher_ = nullptr; -} - -PathTrie::~PathTrie() { - for (auto child : children_) { - delete child.second; - } -} - -PathTrie* PathTrie::get_path_trie(int new_char, bool reset) { - auto child = children_.begin(); - for (child = children_.begin(); child != children_.end(); ++child) { - if (child->first == new_char) { - break; - } - } - if (child != children_.end()) { - if (!child->second->exists_) { - child->second->exists_ = true; - child->second->log_prob_b_prev = -NUM_FLT_INF; - child->second->log_prob_nb_prev = -NUM_FLT_INF; - child->second->log_prob_b_cur = -NUM_FLT_INF; - child->second->log_prob_nb_cur = -NUM_FLT_INF; - } - return (child->second); - } else { - if (has_dictionary_) { - matcher_->SetState(dictionary_state_); - bool found = matcher_->Find(new_char + 1); - if (!found) { - // Adding this character causes word outside dictionary - auto FSTZERO = fst::TropicalWeight::Zero(); - auto final_weight = dictionary_->Final(dictionary_state_); - bool is_final = (final_weight != FSTZERO); - if (is_final && reset) { - dictionary_state_ = dictionary_->Start(); - } - return nullptr; - } else { - PathTrie* new_path = new PathTrie; - new_path->character = new_char; - new_path->parent = this; - new_path->dictionary_ = dictionary_; - new_path->dictionary_state_ = matcher_->Value().nextstate; - new_path->has_dictionary_ = true; - new_path->matcher_ = matcher_; - children_.push_back(std::make_pair(new_char, new_path)); - return new_path; - } - } else { - PathTrie* new_path = new PathTrie; - new_path->character = new_char; - new_path->parent = this; - children_.push_back(std::make_pair(new_char, new_path)); - return new_path; - } - } -} - -PathTrie* PathTrie::get_path_vec(std::vector& output) { - return get_path_vec(output, ROOT_); -} - -PathTrie* PathTrie::get_path_vec(std::vector& output, int stop, - size_t max_steps) { - if (character == stop || character == ROOT_ || output.size() == max_steps) { - std::reverse(output.begin(), output.end()); - return this; - } else { - output.push_back(character); - return parent->get_path_vec(output, stop, max_steps); - } -} - -void PathTrie::iterate_to_vec_only(std::vector& output) { - if (exists_) { - output.push_back(this); - } - for (auto child : children_) { - child.second->iterate_to_vec_only(output); - } -} - -void PathTrie::iterate_to_vec(std::vector& output) { - if (exists_) { - log_prob_b_prev = log_prob_b_cur; - log_prob_nb_prev = log_prob_nb_cur; - - log_prob_b_cur = -NUM_FLT_INF; - log_prob_nb_cur = -NUM_FLT_INF; - - score = log_sum_exp(log_prob_b_prev, log_prob_nb_prev); - output.push_back(this); - } - for (auto child : children_) { - child.second->iterate_to_vec(output); - } -} - -void PathTrie::remove() { - exists_ = false; - - if (children_.size() == 0) { - auto child = parent->children_.begin(); - for (child = parent->children_.begin(); child != parent->children_.end(); - ++child) { - if (child->first == character) { - parent->children_.erase(child); - break; - } - } - - if (parent->children_.size() == 0 && !parent->exists_) { - parent->remove(); - } - - delete this; - } -} - -void 
PathTrie::set_dictionary(fst::StdVectorFst* dictionary) { - dictionary_ = dictionary; - dictionary_state_ = dictionary->Start(); - has_dictionary_ = true; -} - -using FSTMATCH = fst::SortedMatcher; -void PathTrie::set_matcher(std::shared_ptr matcher) { - matcher_ = matcher; -} diff --git a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/path_trie.h b/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/path_trie.h deleted file mode 100644 index b551ed8b8af77d3479d9bc85f51c6e8c2749c306..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/path_trie.h +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef PATH_TRIE_H -#define PATH_TRIE_H - -#include -#include -#include -#include -#include - -#include "fst/fstlib.h" - -/* Trie tree for prefix storing and manipulating, with a dictionary in - * finite-state transducer for spelling correction. - */ -class PathTrie { - public: - PathTrie(); - ~PathTrie(); - - // get new prefix after appending new char - PathTrie* get_path_trie(int new_char, bool reset = true); - - // get the prefix in index from root to current node - PathTrie* get_path_vec(std::vector& output); - - // get the prefix in index from some stop node to current nodel - PathTrie* get_path_vec(std::vector& output, int stop, - size_t max_steps = std::numeric_limits::max()); - - // update log probs - void iterate_to_vec(std::vector& output); - - void iterate_to_vec_only(std::vector& output); - - // set dictionary for FST - void set_dictionary(fst::StdVectorFst* dictionary); - - void set_matcher(std::shared_ptr>); - - bool is_empty() { return ROOT_ == character; } - - // remove current path from root - void remove(); - - float log_prob_b_prev; - float log_prob_nb_prev; - float log_prob_b_cur; - float log_prob_nb_cur; - float score; - float approx_ctc; - int character; - PathTrie* parent; - - private: - int ROOT_; - bool exists_; - bool has_dictionary_; - - std::vector> children_; - - // pointer to dictionary of FST - fst::StdVectorFst* dictionary_; - fst::StdVectorFst::StateId dictionary_state_; - // true if finding ars in FST - std::shared_ptr> matcher_; -}; - -#endif // PATH_TRIE_H diff --git a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/scorer.cpp b/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/scorer.cpp deleted file mode 100644 index 953761a9aec5e0a60a85233f024a2dbcafc3bbce..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/scorer.cpp +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "scorer.h" - -#include -#include - -#include "lm/config.hh" -#include "lm/model.hh" -#include "lm/state.hh" -#include "util/string_piece.hh" -#include "util/tokenize_piece.hh" - -#include "decoder_utils.h" - -using namespace lm::ngram; - -Scorer::Scorer(double alpha, double beta, const std::string& lm_path, - const std::vector& vocab_list) { - this->alpha = alpha; - this->beta = beta; - - dictionary = nullptr; - is_character_based_ = true; - language_model_ = nullptr; - - max_order_ = 0; - dict_size_ = 0; - SPACE_ID_ = -1; - - setup(lm_path, vocab_list); -} - -Scorer::~Scorer() { - if (language_model_ != nullptr) { - delete static_cast(language_model_); - } - if (dictionary != nullptr) { - delete static_cast(dictionary); - } -} - -void Scorer::setup(const std::string& lm_path, - const std::vector& vocab_list) { - // load language model - load_lm(lm_path); - // set char map for scorer - set_char_map(vocab_list); - // fill the dictionary for FST - if (!is_character_based()) { - fill_dictionary(true); - } -} - -void Scorer::load_lm(const std::string& lm_path) { - const char* filename = lm_path.c_str(); - VALID_CHECK_EQ(access(filename, F_OK), 0, "Invalid language model path"); - - RetriveStrEnumerateVocab enumerate; - lm::ngram::Config config; - config.enumerate_vocab = &enumerate; - language_model_ = lm::ngram::LoadVirtual(filename, config); - max_order_ = static_cast(language_model_)->Order(); - vocabulary_ = enumerate.vocabulary; - for (size_t i = 0; i < vocabulary_.size(); ++i) { - if (is_character_based_ && vocabulary_[i] != UNK_TOKEN && - vocabulary_[i] != START_TOKEN && vocabulary_[i] != END_TOKEN && - get_utf8_str_len(enumerate.vocabulary[i]) > 1) { - is_character_based_ = false; - } - } -} - -double Scorer::get_log_cond_prob(const std::vector& words) { - lm::base::Model* model = static_cast(language_model_); - double cond_prob; - lm::ngram::State state, tmp_state, out_state; - // avoid to inserting in begin - model->NullContextWrite(&state); - for (size_t i = 0; i < words.size(); ++i) { - lm::WordIndex word_index = model->BaseVocabulary().Index(words[i]); - // encounter OOV - if (word_index == 0) { - return OOV_SCORE; - } - cond_prob = model->BaseScore(&state, word_index, &out_state); - tmp_state = state; - state = out_state; - out_state = tmp_state; - } - // return log10 prob - return cond_prob; -} - -double Scorer::get_sent_log_prob(const std::vector& words) { - std::vector sentence; - if (words.size() == 0) { - for (size_t i = 0; i < max_order_; ++i) { - sentence.push_back(START_TOKEN); - } - } else { - for (size_t i = 0; i < max_order_ - 1; ++i) { - sentence.push_back(START_TOKEN); - } - sentence.insert(sentence.end(), words.begin(), words.end()); - } - sentence.push_back(END_TOKEN); - return get_log_prob(sentence); -} - -double Scorer::get_log_prob(const std::vector& words) { - assert(words.size() > max_order_); - double score = 0.0; - for (size_t i = 0; i < words.size() - max_order_ + 1; ++i) { - std::vector ngram(words.begin() + i, - words.begin() + i + max_order_); - score += get_log_cond_prob(ngram); - } - return score; -} - -void 
Scorer::reset_params(float alpha, float beta) { - this->alpha = alpha; - this->beta = beta; -} - -std::string Scorer::vec2str(const std::vector& input) { - std::string word; - for (auto ind : input) { - word += char_list_[ind]; - } - return word; -} - -std::vector Scorer::split_labels(const std::vector& labels) { - if (labels.empty()) return {}; - - std::string s = vec2str(labels); - std::vector words; - if (is_character_based_) { - words = split_utf8_str(s); - } else { - words = split_str(s, " "); - } - return words; -} - -void Scorer::set_char_map(const std::vector& char_list) { - char_list_ = char_list; - char_map_.clear(); - - // Set the char map for the FST for spelling correction - for (size_t i = 0; i < char_list_.size(); i++) { - if (char_list_[i] == " ") { - SPACE_ID_ = i; - } - // The initial state of FST is state 0, hence the index of chars in - // the FST should start from 1 to avoid the conflict with the initial - // state, otherwise wrong decoding results would be given. - char_map_[char_list_[i]] = i + 1; - } -} - -std::vector Scorer::make_ngram(PathTrie* prefix) { - std::vector ngram; - PathTrie* current_node = prefix; - PathTrie* new_node = nullptr; - - for (int order = 0; order < max_order_; order++) { - std::vector prefix_vec; - - if (is_character_based_) { - new_node = current_node->get_path_vec(prefix_vec, SPACE_ID_, 1); - current_node = new_node; - } else { - new_node = current_node->get_path_vec(prefix_vec, SPACE_ID_); - current_node = new_node->parent; // Skipping spaces - } - - // reconstruct word - std::string word = vec2str(prefix_vec); - ngram.push_back(word); - - if (new_node->character == -1) { - // No more spaces, but still need order - for (int i = 0; i < max_order_ - order - 1; i++) { - ngram.push_back(START_TOKEN); - } - break; - } - } - std::reverse(ngram.begin(), ngram.end()); - return ngram; -} - -void Scorer::fill_dictionary(bool add_space) { - fst::StdVectorFst dictionary; - // For each unigram convert to ints and put in trie - int dict_size = 0; - for (const auto& word : vocabulary_) { - bool added = add_word_to_dictionary(word, char_map_, add_space, - SPACE_ID_ + 1, &dictionary); - dict_size += added ? 1 : 0; - } - - dict_size_ = dict_size; - - /* Simplify FST - - * This gets rid of "epsilon" transitions in the FST. - * These are transitions that don't require a string input to be taken. - * Getting rid of them is necessary to make the FST determinisitc, but - * can greatly increase the size of the FST - */ - fst::RmEpsilon(&dictionary); - fst::StdVectorFst* new_dict = new fst::StdVectorFst; - - /* This makes the FST deterministic, meaning for any string input there's - * only one possible state the FST could be in. It is assumed our - * dictionary is deterministic when using it. - * (lest we'd have to check for multiple transitions at each state) - */ - fst::Determinize(dictionary, new_dict); - - /* Finds the simplest equivalent fst. This is unnecessary but decreases - * memory usage of the dictionary - */ - fst::Minimize(new_dict); - this->dictionary = new_dict; -} diff --git a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/scorer.h b/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/scorer.h deleted file mode 100644 index e31e05b0a1dad4aad1093a7ef607f45d5505f889..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/scorer.h +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SCORER_H_ -#define SCORER_H_ - -#include -#include -#include -#include - -#include "lm/enumerate_vocab.hh" -#include "lm/virtual_interface.hh" -#include "lm/word_index.hh" -#include "util/string_piece.hh" - -#include "path_trie.h" - -const double OOV_SCORE = -1000.0; -const std::string START_TOKEN = ""; -const std::string UNK_TOKEN = ""; -const std::string END_TOKEN = ""; - -// Implement a callback to retrive the dictionary of language model. -class RetriveStrEnumerateVocab : public lm::EnumerateVocab { - public: - RetriveStrEnumerateVocab() {} - - void Add(lm::WordIndex index, const StringPiece &str) { - vocabulary.push_back(std::string(str.data(), str.length())); - } - - std::vector vocabulary; -}; - -/* External scorer to query score for n-gram or sentence, including language - * model scoring and word insertion. - * - * Example: - * Scorer scorer(alpha, beta, "path_of_language_model"); - * scorer.get_log_cond_prob({ "WORD1", "WORD2", "WORD3" }); - * scorer.get_sent_log_prob({ "WORD1", "WORD2", "WORD3" }); - */ -class Scorer { - public: - Scorer(double alpha, double beta, const std::string &lm_path, - const std::vector &vocabulary); - ~Scorer(); - - double get_log_cond_prob(const std::vector &words); - - double get_sent_log_prob(const std::vector &words); - - // return the max order - size_t get_max_order() const { return max_order_; } - - // return the dictionary size of language model - size_t get_dict_size() const { return dict_size_; } - - // retrun true if the language model is character based - bool is_character_based() const { return is_character_based_; } - - // reset params alpha & beta - void reset_params(float alpha, float beta); - - // make ngram for a given prefix - std::vector make_ngram(PathTrie *prefix); - - // trransform the labels in index to the vector of words (word based lm) or - // the vector of characters (character based lm) - std::vector split_labels(const std::vector &labels); - - // language model weight - double alpha; - // word insertion weight - double beta; - - // pointer to the dictionary of FST - void *dictionary; - - protected: - // necessary setup: load language model, set char map, fill FST's dictionary - void setup(const std::string &lm_path, - const std::vector &vocab_list); - - // load language model from given path - void load_lm(const std::string &lm_path); - - // fill dictionary for FST - void fill_dictionary(bool add_space); - - // set char map - void set_char_map(const std::vector &char_list); - - double get_log_prob(const std::vector &words); - - // translate the vector in index to string - std::string vec2str(const std::vector &input); - - private: - void *language_model_; - bool is_character_based_; - size_t max_order_; - size_t dict_size_; - - int SPACE_ID_; - std::vector char_list_; - std::unordered_map char_map_; - - std::vector vocabulary_; -}; - -#endif // SCORER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/setup.py 
b/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/setup.py deleted file mode 100644 index 77d85eed7de728ccdd85b40fe70801aa2e3eb2fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/setup.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Script to build and install decoder package.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from setuptools import setup, Extension, distutils -from distutils import ccompiler -import glob -import platform -import os, sys -import multiprocessing.pool -import argparse - -parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--num_processes", - default=1, - type=int, - help="Number of cpu processes to build package. (default: %(default)d)") -args = parser.parse_known_args() - -# reconstruct sys.argv to pass to setup below -sys.argv = [sys.argv[0]] + args[1] - - -# monkey-patch for parallel compilation -# See: https://stackoverflow.com/a/13176803 -def parallelCCompile(self, - sources, - output_dir=None, - macros=None, - include_dirs=None, - debug=0, - extra_preargs=None, - extra_postargs=None, - depends=None): - # those lines are copied from distutils.ccompiler.CCompiler directly - macros, objects, extra_postargs, pp_opts, build = self._setup_compile( - output_dir, macros, include_dirs, sources, depends, extra_postargs) - cc_args = self._get_cc_args(pp_opts, debug, extra_preargs) - - # parallel code - def _single_compile(obj): - try: - src, ext = build[obj] - except KeyError: - return - self._compile(obj, src, ext, cc_args, extra_postargs, pp_opts) - - # convert to list, imap is evaluated on-demand - thread_pool = multiprocessing.pool.ThreadPool(args[0].num_processes) - list(thread_pool.imap(_single_compile, objects)) - return objects - - -def compile_test(header, library): - dummy_path = os.path.join(os.path.dirname(__file__), "dummy") - command = "bash -c \"g++ -include " + header \ - + " -l" + library + " -x c++ - <<<'int main() {}' -o " \ - + dummy_path + " >/dev/null 2>/dev/null && rm " \ - + dummy_path + " 2>/dev/null\"" - return os.system(command) == 0 - - -# hack compile to support parallel compiling -distutils.ccompiler.CCompiler.compile = parallelCCompile - -FILES = glob.glob('kenlm/util/*.cc') \ - + glob.glob('kenlm/lm/*.cc') \ - + glob.glob('kenlm/util/double-conversion/*.cc') - -FILES += glob.glob('openfst-1.6.3/src/lib/*.cc') - -FILES = [ - fn for fn in FILES - if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith( - 'unittest.cc')) -] - -LIBS = ['stdc++'] -if platform.system() != 'Darwin': - LIBS.append('rt') - -ARGS = ['-O3', '-DNDEBUG', '-DKENLM_MAX_ORDER=6', '-std=c++11'] - -if compile_test('zlib.h', 'z'): - ARGS.append('-DHAVE_ZLIB') - LIBS.append('z') - -if compile_test('bzlib.h', 'bz2'): - ARGS.append('-DHAVE_BZLIB') - LIBS.append('bz2') - -if 
compile_test('lzma.h', 'lzma'): - ARGS.append('-DHAVE_XZLIB') - LIBS.append('lzma') - -os.system('swig -python -c++ ./decoders.i') - -decoders_module = [ - Extension( - name='_swig_decoders', - sources=FILES + glob.glob('*.cxx') + glob.glob('*.cpp'), - language='c++', - include_dirs=[ - '.', - 'kenlm', - 'openfst-1.6.3/src/include', - 'ThreadPool', - ], - libraries=LIBS, - extra_compile_args=ARGS) -] - -setup( - name='swig_decoders', - version='1.1', - description="""CTC decoders""", - ext_modules=decoders_module, - py_modules=['swig_decoders'], ) diff --git a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/setup.sh b/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/setup.sh deleted file mode 100644 index 15774c6cef018c15977dfd4f95cf655d6e2b32ed..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/setup.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env bash - -if [ ! -d kenlm ]; then - git clone https://github.com/kpu/kenlm.git - echo -e "\n" -fi - -if [ ! -d openfst-1.6.3 ]; then - echo "Download and extract openfst ..." - wget http://www.openfst.org/twiki/pub/FST/FstDownload/openfst-1.6.3.tar.gz - tar -xzvf openfst-1.6.3.tar.gz - echo -e "\n" -fi - -if [ ! -d ThreadPool ]; then - git clone https://github.com/progschj/ThreadPool.git - echo -e "\n" -fi - -echo "Install decoders ..." -python3 setup.py install --num_processes 10 diff --git a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/swig_decoders.py b/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/swig_decoders.py deleted file mode 100644 index e6a0eb4fcc9ee4744ed6028a2c9b0461f2782de3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/swig_decoders.py +++ /dev/null @@ -1,1859 +0,0 @@ -# This file was automatically generated by SWIG (http://www.swig.org). -# Version 3.0.12 -# -# Do not make changes to this file unless you know what you are doing--modify -# the SWIG interface file instead. - -from sys import version_info as _swig_python_version_info -if _swig_python_version_info >= (2, 7, 0): - def swig_import_helper(): - import importlib - pkg = __name__.rpartition('.')[0] - mname = '.'.join((pkg, '_swig_decoders')).lstrip('.') - try: - return importlib.import_module(mname) - except ImportError: - return importlib.import_module('_swig_decoders') - _swig_decoders = swig_import_helper() - del swig_import_helper -elif _swig_python_version_info >= (2, 6, 0): - def swig_import_helper(): - from os.path import dirname - import imp - fp = None - try: - fp, pathname, description = imp.find_module('_swig_decoders', [dirname(__file__)]) - except ImportError: - import _swig_decoders - return _swig_decoders - try: - _mod = imp.load_module('_swig_decoders', fp, pathname, description) - finally: - if fp is not None: - fp.close() - return _mod - _swig_decoders = swig_import_helper() - del swig_import_helper -else: - import _swig_decoders -del _swig_python_version_info - -try: - _swig_property = property -except NameError: - pass # Python < 2.2 doesn't have 'property'. 
- -try: - import builtins as __builtin__ -except ImportError: - import __builtin__ - -def _swig_setattr_nondynamic(self, class_type, name, value, static=1): - if (name == "thisown"): - return self.this.own(value) - if (name == "this"): - if type(value).__name__ == 'SwigPyObject': - self.__dict__[name] = value - return - method = class_type.__swig_setmethods__.get(name, None) - if method: - return method(self, value) - if (not static): - if _newclass: - object.__setattr__(self, name, value) - else: - self.__dict__[name] = value - else: - raise AttributeError("You cannot add attributes to %s" % self) - - -def _swig_setattr(self, class_type, name, value): - return _swig_setattr_nondynamic(self, class_type, name, value, 0) - - -def _swig_getattr(self, class_type, name): - if (name == "thisown"): - return self.this.own() - method = class_type.__swig_getmethods__.get(name, None) - if method: - return method(self) - raise AttributeError("'%s' object has no attribute '%s'" % (class_type.__name__, name)) - - -def _swig_repr(self): - try: - strthis = "proxy of " + self.this.__repr__() - except __builtin__.Exception: - strthis = "" - return "<%s.%s; %s >" % (self.__class__.__module__, self.__class__.__name__, strthis,) - -try: - _object = object - _newclass = 1 -except __builtin__.Exception: - class _object: - pass - _newclass = 0 - -class SwigPyIterator(_object): - __swig_setmethods__ = {} - __setattr__ = lambda self, name, value: _swig_setattr(self, SwigPyIterator, name, value) - __swig_getmethods__ = {} - __getattr__ = lambda self, name: _swig_getattr(self, SwigPyIterator, name) - - def __init__(self, *args, **kwargs): - raise AttributeError("No constructor defined - class is abstract") - __repr__ = _swig_repr - __swig_destroy__ = _swig_decoders.delete_SwigPyIterator - __del__ = lambda self: None - - def value(self): - return _swig_decoders.SwigPyIterator_value(self) - - def incr(self, n=1): - return _swig_decoders.SwigPyIterator_incr(self, n) - - def decr(self, n=1): - return _swig_decoders.SwigPyIterator_decr(self, n) - - def distance(self, x): - return _swig_decoders.SwigPyIterator_distance(self, x) - - def equal(self, x): - return _swig_decoders.SwigPyIterator_equal(self, x) - - def copy(self): - return _swig_decoders.SwigPyIterator_copy(self) - - def next(self): - return _swig_decoders.SwigPyIterator_next(self) - - def __next__(self): - return _swig_decoders.SwigPyIterator___next__(self) - - def previous(self): - return _swig_decoders.SwigPyIterator_previous(self) - - def advance(self, n): - return _swig_decoders.SwigPyIterator_advance(self, n) - - def __eq__(self, x): - return _swig_decoders.SwigPyIterator___eq__(self, x) - - def __ne__(self, x): - return _swig_decoders.SwigPyIterator___ne__(self, x) - - def __iadd__(self, n): - return _swig_decoders.SwigPyIterator___iadd__(self, n) - - def __isub__(self, n): - return _swig_decoders.SwigPyIterator___isub__(self, n) - - def __add__(self, n): - return _swig_decoders.SwigPyIterator___add__(self, n) - - def __sub__(self, *args): - return _swig_decoders.SwigPyIterator___sub__(self, *args) - def __iter__(self): - return self -SwigPyIterator_swigregister = _swig_decoders.SwigPyIterator_swigregister -SwigPyIterator_swigregister(SwigPyIterator) - -class PathTrie(_object): - __swig_setmethods__ = {} - __setattr__ = lambda self, name, value: _swig_setattr(self, PathTrie, name, value) - __swig_getmethods__ = {} - __getattr__ = lambda self, name: _swig_getattr(self, PathTrie, name) - __repr__ = _swig_repr - - def __init__(self): - this = 
_swig_decoders.new_PathTrie() - try: - self.this.append(this) - except __builtin__.Exception: - self.this = this - __swig_destroy__ = _swig_decoders.delete_PathTrie - __del__ = lambda self: None - - def get_path_trie(self, new_char, reset=True): - return _swig_decoders.PathTrie_get_path_trie(self, new_char, reset) - - def get_path_vec(self, *args): - return _swig_decoders.PathTrie_get_path_vec(self, *args) - - def iterate_to_vec(self, output): - return _swig_decoders.PathTrie_iterate_to_vec(self, output) - - def iterate_to_vec_only(self, output): - return _swig_decoders.PathTrie_iterate_to_vec_only(self, output) - - def set_dictionary(self, dictionary): - return _swig_decoders.PathTrie_set_dictionary(self, dictionary) - - def set_matcher(self, arg2): - return _swig_decoders.PathTrie_set_matcher(self, arg2) - - def is_empty(self): - return _swig_decoders.PathTrie_is_empty(self) - - def remove(self): - return _swig_decoders.PathTrie_remove(self) - __swig_setmethods__["log_prob_b_prev"] = _swig_decoders.PathTrie_log_prob_b_prev_set - __swig_getmethods__["log_prob_b_prev"] = _swig_decoders.PathTrie_log_prob_b_prev_get - if _newclass: - log_prob_b_prev = _swig_property(_swig_decoders.PathTrie_log_prob_b_prev_get, _swig_decoders.PathTrie_log_prob_b_prev_set) - __swig_setmethods__["log_prob_nb_prev"] = _swig_decoders.PathTrie_log_prob_nb_prev_set - __swig_getmethods__["log_prob_nb_prev"] = _swig_decoders.PathTrie_log_prob_nb_prev_get - if _newclass: - log_prob_nb_prev = _swig_property(_swig_decoders.PathTrie_log_prob_nb_prev_get, _swig_decoders.PathTrie_log_prob_nb_prev_set) - __swig_setmethods__["log_prob_b_cur"] = _swig_decoders.PathTrie_log_prob_b_cur_set - __swig_getmethods__["log_prob_b_cur"] = _swig_decoders.PathTrie_log_prob_b_cur_get - if _newclass: - log_prob_b_cur = _swig_property(_swig_decoders.PathTrie_log_prob_b_cur_get, _swig_decoders.PathTrie_log_prob_b_cur_set) - __swig_setmethods__["log_prob_nb_cur"] = _swig_decoders.PathTrie_log_prob_nb_cur_set - __swig_getmethods__["log_prob_nb_cur"] = _swig_decoders.PathTrie_log_prob_nb_cur_get - if _newclass: - log_prob_nb_cur = _swig_property(_swig_decoders.PathTrie_log_prob_nb_cur_get, _swig_decoders.PathTrie_log_prob_nb_cur_set) - __swig_setmethods__["score"] = _swig_decoders.PathTrie_score_set - __swig_getmethods__["score"] = _swig_decoders.PathTrie_score_get - if _newclass: - score = _swig_property(_swig_decoders.PathTrie_score_get, _swig_decoders.PathTrie_score_set) - __swig_setmethods__["approx_ctc"] = _swig_decoders.PathTrie_approx_ctc_set - __swig_getmethods__["approx_ctc"] = _swig_decoders.PathTrie_approx_ctc_get - if _newclass: - approx_ctc = _swig_property(_swig_decoders.PathTrie_approx_ctc_get, _swig_decoders.PathTrie_approx_ctc_set) - __swig_setmethods__["character"] = _swig_decoders.PathTrie_character_set - __swig_getmethods__["character"] = _swig_decoders.PathTrie_character_get - if _newclass: - character = _swig_property(_swig_decoders.PathTrie_character_get, _swig_decoders.PathTrie_character_set) - __swig_setmethods__["parent"] = _swig_decoders.PathTrie_parent_set - __swig_getmethods__["parent"] = _swig_decoders.PathTrie_parent_get - if _newclass: - parent = _swig_property(_swig_decoders.PathTrie_parent_get, _swig_decoders.PathTrie_parent_set) -PathTrie_swigregister = _swig_decoders.PathTrie_swigregister -PathTrie_swigregister(PathTrie) - -class DoubleVector(_object): - __swig_setmethods__ = {} - __setattr__ = lambda self, name, value: _swig_setattr(self, DoubleVector, name, value) - __swig_getmethods__ = {} - __getattr__ = 
lambda self, name: _swig_getattr(self, DoubleVector, name) - __repr__ = _swig_repr - - def iterator(self): - return _swig_decoders.DoubleVector_iterator(self) - def __iter__(self): - return self.iterator() - - def __nonzero__(self): - return _swig_decoders.DoubleVector___nonzero__(self) - - def __bool__(self): - return _swig_decoders.DoubleVector___bool__(self) - - def __len__(self): - return _swig_decoders.DoubleVector___len__(self) - - def __getslice__(self, i, j): - return _swig_decoders.DoubleVector___getslice__(self, i, j) - - def __setslice__(self, *args): - return _swig_decoders.DoubleVector___setslice__(self, *args) - - def __delslice__(self, i, j): - return _swig_decoders.DoubleVector___delslice__(self, i, j) - - def __delitem__(self, *args): - return _swig_decoders.DoubleVector___delitem__(self, *args) - - def __getitem__(self, *args): - return _swig_decoders.DoubleVector___getitem__(self, *args) - - def __setitem__(self, *args): - return _swig_decoders.DoubleVector___setitem__(self, *args) - - def pop(self): - return _swig_decoders.DoubleVector_pop(self) - - def append(self, x): - return _swig_decoders.DoubleVector_append(self, x) - - def empty(self): - return _swig_decoders.DoubleVector_empty(self) - - def size(self): - return _swig_decoders.DoubleVector_size(self) - - def swap(self, v): - return _swig_decoders.DoubleVector_swap(self, v) - - def begin(self): - return _swig_decoders.DoubleVector_begin(self) - - def end(self): - return _swig_decoders.DoubleVector_end(self) - - def rbegin(self): - return _swig_decoders.DoubleVector_rbegin(self) - - def rend(self): - return _swig_decoders.DoubleVector_rend(self) - - def clear(self): - return _swig_decoders.DoubleVector_clear(self) - - def get_allocator(self): - return _swig_decoders.DoubleVector_get_allocator(self) - - def pop_back(self): - return _swig_decoders.DoubleVector_pop_back(self) - - def erase(self, *args): - return _swig_decoders.DoubleVector_erase(self, *args) - - def __init__(self, *args): - this = _swig_decoders.new_DoubleVector(*args) - try: - self.this.append(this) - except __builtin__.Exception: - self.this = this - - def push_back(self, x): - return _swig_decoders.DoubleVector_push_back(self, x) - - def front(self): - return _swig_decoders.DoubleVector_front(self) - - def back(self): - return _swig_decoders.DoubleVector_back(self) - - def assign(self, n, x): - return _swig_decoders.DoubleVector_assign(self, n, x) - - def resize(self, *args): - return _swig_decoders.DoubleVector_resize(self, *args) - - def insert(self, *args): - return _swig_decoders.DoubleVector_insert(self, *args) - - def reserve(self, n): - return _swig_decoders.DoubleVector_reserve(self, n) - - def capacity(self): - return _swig_decoders.DoubleVector_capacity(self) - __swig_destroy__ = _swig_decoders.delete_DoubleVector - __del__ = lambda self: None -DoubleVector_swigregister = _swig_decoders.DoubleVector_swigregister -DoubleVector_swigregister(DoubleVector) - -class IntVector(_object): - __swig_setmethods__ = {} - __setattr__ = lambda self, name, value: _swig_setattr(self, IntVector, name, value) - __swig_getmethods__ = {} - __getattr__ = lambda self, name: _swig_getattr(self, IntVector, name) - __repr__ = _swig_repr - - def iterator(self): - return _swig_decoders.IntVector_iterator(self) - def __iter__(self): - return self.iterator() - - def __nonzero__(self): - return _swig_decoders.IntVector___nonzero__(self) - - def __bool__(self): - return _swig_decoders.IntVector___bool__(self) - - def __len__(self): - return 
_swig_decoders.IntVector___len__(self) - - def __getslice__(self, i, j): - return _swig_decoders.IntVector___getslice__(self, i, j) - - def __setslice__(self, *args): - return _swig_decoders.IntVector___setslice__(self, *args) - - def __delslice__(self, i, j): - return _swig_decoders.IntVector___delslice__(self, i, j) - - def __delitem__(self, *args): - return _swig_decoders.IntVector___delitem__(self, *args) - - def __getitem__(self, *args): - return _swig_decoders.IntVector___getitem__(self, *args) - - def __setitem__(self, *args): - return _swig_decoders.IntVector___setitem__(self, *args) - - def pop(self): - return _swig_decoders.IntVector_pop(self) - - def append(self, x): - return _swig_decoders.IntVector_append(self, x) - - def empty(self): - return _swig_decoders.IntVector_empty(self) - - def size(self): - return _swig_decoders.IntVector_size(self) - - def swap(self, v): - return _swig_decoders.IntVector_swap(self, v) - - def begin(self): - return _swig_decoders.IntVector_begin(self) - - def end(self): - return _swig_decoders.IntVector_end(self) - - def rbegin(self): - return _swig_decoders.IntVector_rbegin(self) - - def rend(self): - return _swig_decoders.IntVector_rend(self) - - def clear(self): - return _swig_decoders.IntVector_clear(self) - - def get_allocator(self): - return _swig_decoders.IntVector_get_allocator(self) - - def pop_back(self): - return _swig_decoders.IntVector_pop_back(self) - - def erase(self, *args): - return _swig_decoders.IntVector_erase(self, *args) - - def __init__(self, *args): - this = _swig_decoders.new_IntVector(*args) - try: - self.this.append(this) - except __builtin__.Exception: - self.this = this - - def push_back(self, x): - return _swig_decoders.IntVector_push_back(self, x) - - def front(self): - return _swig_decoders.IntVector_front(self) - - def back(self): - return _swig_decoders.IntVector_back(self) - - def assign(self, n, x): - return _swig_decoders.IntVector_assign(self, n, x) - - def resize(self, *args): - return _swig_decoders.IntVector_resize(self, *args) - - def insert(self, *args): - return _swig_decoders.IntVector_insert(self, *args) - - def reserve(self, n): - return _swig_decoders.IntVector_reserve(self, n) - - def capacity(self): - return _swig_decoders.IntVector_capacity(self) - __swig_destroy__ = _swig_decoders.delete_IntVector - __del__ = lambda self: None -IntVector_swigregister = _swig_decoders.IntVector_swigregister -IntVector_swigregister(IntVector) - -class StringVector(_object): - __swig_setmethods__ = {} - __setattr__ = lambda self, name, value: _swig_setattr(self, StringVector, name, value) - __swig_getmethods__ = {} - __getattr__ = lambda self, name: _swig_getattr(self, StringVector, name) - __repr__ = _swig_repr - - def iterator(self): - return _swig_decoders.StringVector_iterator(self) - def __iter__(self): - return self.iterator() - - def __nonzero__(self): - return _swig_decoders.StringVector___nonzero__(self) - - def __bool__(self): - return _swig_decoders.StringVector___bool__(self) - - def __len__(self): - return _swig_decoders.StringVector___len__(self) - - def __getslice__(self, i, j): - return _swig_decoders.StringVector___getslice__(self, i, j) - - def __setslice__(self, *args): - return _swig_decoders.StringVector___setslice__(self, *args) - - def __delslice__(self, i, j): - return _swig_decoders.StringVector___delslice__(self, i, j) - - def __delitem__(self, *args): - return _swig_decoders.StringVector___delitem__(self, *args) - - def __getitem__(self, *args): - return 
_swig_decoders.StringVector___getitem__(self, *args) - - def __setitem__(self, *args): - return _swig_decoders.StringVector___setitem__(self, *args) - - def pop(self): - return _swig_decoders.StringVector_pop(self) - - def append(self, x): - return _swig_decoders.StringVector_append(self, x) - - def empty(self): - return _swig_decoders.StringVector_empty(self) - - def size(self): - return _swig_decoders.StringVector_size(self) - - def swap(self, v): - return _swig_decoders.StringVector_swap(self, v) - - def begin(self): - return _swig_decoders.StringVector_begin(self) - - def end(self): - return _swig_decoders.StringVector_end(self) - - def rbegin(self): - return _swig_decoders.StringVector_rbegin(self) - - def rend(self): - return _swig_decoders.StringVector_rend(self) - - def clear(self): - return _swig_decoders.StringVector_clear(self) - - def get_allocator(self): - return _swig_decoders.StringVector_get_allocator(self) - - def pop_back(self): - return _swig_decoders.StringVector_pop_back(self) - - def erase(self, *args): - return _swig_decoders.StringVector_erase(self, *args) - - def __init__(self, *args): - this = _swig_decoders.new_StringVector(*args) - try: - self.this.append(this) - except __builtin__.Exception: - self.this = this - - def push_back(self, x): - return _swig_decoders.StringVector_push_back(self, x) - - def front(self): - return _swig_decoders.StringVector_front(self) - - def back(self): - return _swig_decoders.StringVector_back(self) - - def assign(self, n, x): - return _swig_decoders.StringVector_assign(self, n, x) - - def resize(self, *args): - return _swig_decoders.StringVector_resize(self, *args) - - def insert(self, *args): - return _swig_decoders.StringVector_insert(self, *args) - - def reserve(self, n): - return _swig_decoders.StringVector_reserve(self, n) - - def capacity(self): - return _swig_decoders.StringVector_capacity(self) - __swig_destroy__ = _swig_decoders.delete_StringVector - __del__ = lambda self: None -StringVector_swigregister = _swig_decoders.StringVector_swigregister -StringVector_swigregister(StringVector) - -class VectorOfStructVectorDouble(_object): - __swig_setmethods__ = {} - __setattr__ = lambda self, name, value: _swig_setattr(self, VectorOfStructVectorDouble, name, value) - __swig_getmethods__ = {} - __getattr__ = lambda self, name: _swig_getattr(self, VectorOfStructVectorDouble, name) - __repr__ = _swig_repr - - def iterator(self): - return _swig_decoders.VectorOfStructVectorDouble_iterator(self) - def __iter__(self): - return self.iterator() - - def __nonzero__(self): - return _swig_decoders.VectorOfStructVectorDouble___nonzero__(self) - - def __bool__(self): - return _swig_decoders.VectorOfStructVectorDouble___bool__(self) - - def __len__(self): - return _swig_decoders.VectorOfStructVectorDouble___len__(self) - - def __getslice__(self, i, j): - return _swig_decoders.VectorOfStructVectorDouble___getslice__(self, i, j) - - def __setslice__(self, *args): - return _swig_decoders.VectorOfStructVectorDouble___setslice__(self, *args) - - def __delslice__(self, i, j): - return _swig_decoders.VectorOfStructVectorDouble___delslice__(self, i, j) - - def __delitem__(self, *args): - return _swig_decoders.VectorOfStructVectorDouble___delitem__(self, *args) - - def __getitem__(self, *args): - return _swig_decoders.VectorOfStructVectorDouble___getitem__(self, *args) - - def __setitem__(self, *args): - return _swig_decoders.VectorOfStructVectorDouble___setitem__(self, *args) - - def pop(self): - return 
_swig_decoders.VectorOfStructVectorDouble_pop(self) - - def append(self, x): - return _swig_decoders.VectorOfStructVectorDouble_append(self, x) - - def empty(self): - return _swig_decoders.VectorOfStructVectorDouble_empty(self) - - def size(self): - return _swig_decoders.VectorOfStructVectorDouble_size(self) - - def swap(self, v): - return _swig_decoders.VectorOfStructVectorDouble_swap(self, v) - - def begin(self): - return _swig_decoders.VectorOfStructVectorDouble_begin(self) - - def end(self): - return _swig_decoders.VectorOfStructVectorDouble_end(self) - - def rbegin(self): - return _swig_decoders.VectorOfStructVectorDouble_rbegin(self) - - def rend(self): - return _swig_decoders.VectorOfStructVectorDouble_rend(self) - - def clear(self): - return _swig_decoders.VectorOfStructVectorDouble_clear(self) - - def get_allocator(self): - return _swig_decoders.VectorOfStructVectorDouble_get_allocator(self) - - def pop_back(self): - return _swig_decoders.VectorOfStructVectorDouble_pop_back(self) - - def erase(self, *args): - return _swig_decoders.VectorOfStructVectorDouble_erase(self, *args) - - def __init__(self, *args): - this = _swig_decoders.new_VectorOfStructVectorDouble(*args) - try: - self.this.append(this) - except __builtin__.Exception: - self.this = this - - def push_back(self, x): - return _swig_decoders.VectorOfStructVectorDouble_push_back(self, x) - - def front(self): - return _swig_decoders.VectorOfStructVectorDouble_front(self) - - def back(self): - return _swig_decoders.VectorOfStructVectorDouble_back(self) - - def assign(self, n, x): - return _swig_decoders.VectorOfStructVectorDouble_assign(self, n, x) - - def resize(self, *args): - return _swig_decoders.VectorOfStructVectorDouble_resize(self, *args) - - def insert(self, *args): - return _swig_decoders.VectorOfStructVectorDouble_insert(self, *args) - - def reserve(self, n): - return _swig_decoders.VectorOfStructVectorDouble_reserve(self, n) - - def capacity(self): - return _swig_decoders.VectorOfStructVectorDouble_capacity(self) - __swig_destroy__ = _swig_decoders.delete_VectorOfStructVectorDouble - __del__ = lambda self: None -VectorOfStructVectorDouble_swigregister = _swig_decoders.VectorOfStructVectorDouble_swigregister -VectorOfStructVectorDouble_swigregister(VectorOfStructVectorDouble) - -class VectorOfStructVectorInt(_object): - __swig_setmethods__ = {} - __setattr__ = lambda self, name, value: _swig_setattr(self, VectorOfStructVectorInt, name, value) - __swig_getmethods__ = {} - __getattr__ = lambda self, name: _swig_getattr(self, VectorOfStructVectorInt, name) - __repr__ = _swig_repr - - def iterator(self): - return _swig_decoders.VectorOfStructVectorInt_iterator(self) - def __iter__(self): - return self.iterator() - - def __nonzero__(self): - return _swig_decoders.VectorOfStructVectorInt___nonzero__(self) - - def __bool__(self): - return _swig_decoders.VectorOfStructVectorInt___bool__(self) - - def __len__(self): - return _swig_decoders.VectorOfStructVectorInt___len__(self) - - def __getslice__(self, i, j): - return _swig_decoders.VectorOfStructVectorInt___getslice__(self, i, j) - - def __setslice__(self, *args): - return _swig_decoders.VectorOfStructVectorInt___setslice__(self, *args) - - def __delslice__(self, i, j): - return _swig_decoders.VectorOfStructVectorInt___delslice__(self, i, j) - - def __delitem__(self, *args): - return _swig_decoders.VectorOfStructVectorInt___delitem__(self, *args) - - def __getitem__(self, *args): - return _swig_decoders.VectorOfStructVectorInt___getitem__(self, *args) - - def 
__setitem__(self, *args): - return _swig_decoders.VectorOfStructVectorInt___setitem__(self, *args) - - def pop(self): - return _swig_decoders.VectorOfStructVectorInt_pop(self) - - def append(self, x): - return _swig_decoders.VectorOfStructVectorInt_append(self, x) - - def empty(self): - return _swig_decoders.VectorOfStructVectorInt_empty(self) - - def size(self): - return _swig_decoders.VectorOfStructVectorInt_size(self) - - def swap(self, v): - return _swig_decoders.VectorOfStructVectorInt_swap(self, v) - - def begin(self): - return _swig_decoders.VectorOfStructVectorInt_begin(self) - - def end(self): - return _swig_decoders.VectorOfStructVectorInt_end(self) - - def rbegin(self): - return _swig_decoders.VectorOfStructVectorInt_rbegin(self) - - def rend(self): - return _swig_decoders.VectorOfStructVectorInt_rend(self) - - def clear(self): - return _swig_decoders.VectorOfStructVectorInt_clear(self) - - def get_allocator(self): - return _swig_decoders.VectorOfStructVectorInt_get_allocator(self) - - def pop_back(self): - return _swig_decoders.VectorOfStructVectorInt_pop_back(self) - - def erase(self, *args): - return _swig_decoders.VectorOfStructVectorInt_erase(self, *args) - - def __init__(self, *args): - this = _swig_decoders.new_VectorOfStructVectorInt(*args) - try: - self.this.append(this) - except __builtin__.Exception: - self.this = this - - def push_back(self, x): - return _swig_decoders.VectorOfStructVectorInt_push_back(self, x) - - def front(self): - return _swig_decoders.VectorOfStructVectorInt_front(self) - - def back(self): - return _swig_decoders.VectorOfStructVectorInt_back(self) - - def assign(self, n, x): - return _swig_decoders.VectorOfStructVectorInt_assign(self, n, x) - - def resize(self, *args): - return _swig_decoders.VectorOfStructVectorInt_resize(self, *args) - - def insert(self, *args): - return _swig_decoders.VectorOfStructVectorInt_insert(self, *args) - - def reserve(self, n): - return _swig_decoders.VectorOfStructVectorInt_reserve(self, n) - - def capacity(self): - return _swig_decoders.VectorOfStructVectorInt_capacity(self) - __swig_destroy__ = _swig_decoders.delete_VectorOfStructVectorInt - __del__ = lambda self: None -VectorOfStructVectorInt_swigregister = _swig_decoders.VectorOfStructVectorInt_swigregister -VectorOfStructVectorInt_swigregister(VectorOfStructVectorInt) - -class FloatVector(_object): - __swig_setmethods__ = {} - __setattr__ = lambda self, name, value: _swig_setattr(self, FloatVector, name, value) - __swig_getmethods__ = {} - __getattr__ = lambda self, name: _swig_getattr(self, FloatVector, name) - __repr__ = _swig_repr - - def iterator(self): - return _swig_decoders.FloatVector_iterator(self) - def __iter__(self): - return self.iterator() - - def __nonzero__(self): - return _swig_decoders.FloatVector___nonzero__(self) - - def __bool__(self): - return _swig_decoders.FloatVector___bool__(self) - - def __len__(self): - return _swig_decoders.FloatVector___len__(self) - - def __getslice__(self, i, j): - return _swig_decoders.FloatVector___getslice__(self, i, j) - - def __setslice__(self, *args): - return _swig_decoders.FloatVector___setslice__(self, *args) - - def __delslice__(self, i, j): - return _swig_decoders.FloatVector___delslice__(self, i, j) - - def __delitem__(self, *args): - return _swig_decoders.FloatVector___delitem__(self, *args) - - def __getitem__(self, *args): - return _swig_decoders.FloatVector___getitem__(self, *args) - - def __setitem__(self, *args): - return _swig_decoders.FloatVector___setitem__(self, *args) - - def pop(self): - 
return _swig_decoders.FloatVector_pop(self) - - def append(self, x): - return _swig_decoders.FloatVector_append(self, x) - - def empty(self): - return _swig_decoders.FloatVector_empty(self) - - def size(self): - return _swig_decoders.FloatVector_size(self) - - def swap(self, v): - return _swig_decoders.FloatVector_swap(self, v) - - def begin(self): - return _swig_decoders.FloatVector_begin(self) - - def end(self): - return _swig_decoders.FloatVector_end(self) - - def rbegin(self): - return _swig_decoders.FloatVector_rbegin(self) - - def rend(self): - return _swig_decoders.FloatVector_rend(self) - - def clear(self): - return _swig_decoders.FloatVector_clear(self) - - def get_allocator(self): - return _swig_decoders.FloatVector_get_allocator(self) - - def pop_back(self): - return _swig_decoders.FloatVector_pop_back(self) - - def erase(self, *args): - return _swig_decoders.FloatVector_erase(self, *args) - - def __init__(self, *args): - this = _swig_decoders.new_FloatVector(*args) - try: - self.this.append(this) - except __builtin__.Exception: - self.this = this - - def push_back(self, x): - return _swig_decoders.FloatVector_push_back(self, x) - - def front(self): - return _swig_decoders.FloatVector_front(self) - - def back(self): - return _swig_decoders.FloatVector_back(self) - - def assign(self, n, x): - return _swig_decoders.FloatVector_assign(self, n, x) - - def resize(self, *args): - return _swig_decoders.FloatVector_resize(self, *args) - - def insert(self, *args): - return _swig_decoders.FloatVector_insert(self, *args) - - def reserve(self, n): - return _swig_decoders.FloatVector_reserve(self, n) - - def capacity(self): - return _swig_decoders.FloatVector_capacity(self) - __swig_destroy__ = _swig_decoders.delete_FloatVector - __del__ = lambda self: None -FloatVector_swigregister = _swig_decoders.FloatVector_swigregister -FloatVector_swigregister(FloatVector) - -class Pair(_object): - __swig_setmethods__ = {} - __setattr__ = lambda self, name, value: _swig_setattr(self, Pair, name, value) - __swig_getmethods__ = {} - __getattr__ = lambda self, name: _swig_getattr(self, Pair, name) - __repr__ = _swig_repr - - def __init__(self, *args): - this = _swig_decoders.new_Pair(*args) - try: - self.this.append(this) - except __builtin__.Exception: - self.this = this - __swig_setmethods__["first"] = _swig_decoders.Pair_first_set - __swig_getmethods__["first"] = _swig_decoders.Pair_first_get - if _newclass: - first = _swig_property(_swig_decoders.Pair_first_get, _swig_decoders.Pair_first_set) - __swig_setmethods__["second"] = _swig_decoders.Pair_second_set - __swig_getmethods__["second"] = _swig_decoders.Pair_second_get - if _newclass: - second = _swig_property(_swig_decoders.Pair_second_get, _swig_decoders.Pair_second_set) - def __len__(self): - return 2 - def __repr__(self): - return str((self.first, self.second)) - def __getitem__(self, index): - if not (index % 2): - return self.first - else: - return self.second - def __setitem__(self, index, val): - if not (index % 2): - self.first = val - else: - self.second = val - __swig_destroy__ = _swig_decoders.delete_Pair - __del__ = lambda self: None -Pair_swigregister = _swig_decoders.Pair_swigregister -Pair_swigregister(Pair) - -class PairFloatVectorVector(_object): - __swig_setmethods__ = {} - __setattr__ = lambda self, name, value: _swig_setattr(self, PairFloatVectorVector, name, value) - __swig_getmethods__ = {} - __getattr__ = lambda self, name: _swig_getattr(self, PairFloatVectorVector, name) - __repr__ = _swig_repr - - def iterator(self): - 
return _swig_decoders.PairFloatVectorVector_iterator(self) - def __iter__(self): - return self.iterator() - - def __nonzero__(self): - return _swig_decoders.PairFloatVectorVector___nonzero__(self) - - def __bool__(self): - return _swig_decoders.PairFloatVectorVector___bool__(self) - - def __len__(self): - return _swig_decoders.PairFloatVectorVector___len__(self) - - def __getslice__(self, i, j): - return _swig_decoders.PairFloatVectorVector___getslice__(self, i, j) - - def __setslice__(self, *args): - return _swig_decoders.PairFloatVectorVector___setslice__(self, *args) - - def __delslice__(self, i, j): - return _swig_decoders.PairFloatVectorVector___delslice__(self, i, j) - - def __delitem__(self, *args): - return _swig_decoders.PairFloatVectorVector___delitem__(self, *args) - - def __getitem__(self, *args): - return _swig_decoders.PairFloatVectorVector___getitem__(self, *args) - - def __setitem__(self, *args): - return _swig_decoders.PairFloatVectorVector___setitem__(self, *args) - - def pop(self): - return _swig_decoders.PairFloatVectorVector_pop(self) - - def append(self, x): - return _swig_decoders.PairFloatVectorVector_append(self, x) - - def empty(self): - return _swig_decoders.PairFloatVectorVector_empty(self) - - def size(self): - return _swig_decoders.PairFloatVectorVector_size(self) - - def swap(self, v): - return _swig_decoders.PairFloatVectorVector_swap(self, v) - - def begin(self): - return _swig_decoders.PairFloatVectorVector_begin(self) - - def end(self): - return _swig_decoders.PairFloatVectorVector_end(self) - - def rbegin(self): - return _swig_decoders.PairFloatVectorVector_rbegin(self) - - def rend(self): - return _swig_decoders.PairFloatVectorVector_rend(self) - - def clear(self): - return _swig_decoders.PairFloatVectorVector_clear(self) - - def get_allocator(self): - return _swig_decoders.PairFloatVectorVector_get_allocator(self) - - def pop_back(self): - return _swig_decoders.PairFloatVectorVector_pop_back(self) - - def erase(self, *args): - return _swig_decoders.PairFloatVectorVector_erase(self, *args) - - def __init__(self, *args): - this = _swig_decoders.new_PairFloatVectorVector(*args) - try: - self.this.append(this) - except __builtin__.Exception: - self.this = this - - def push_back(self, x): - return _swig_decoders.PairFloatVectorVector_push_back(self, x) - - def front(self): - return _swig_decoders.PairFloatVectorVector_front(self) - - def back(self): - return _swig_decoders.PairFloatVectorVector_back(self) - - def assign(self, n, x): - return _swig_decoders.PairFloatVectorVector_assign(self, n, x) - - def resize(self, *args): - return _swig_decoders.PairFloatVectorVector_resize(self, *args) - - def insert(self, *args): - return _swig_decoders.PairFloatVectorVector_insert(self, *args) - - def reserve(self, n): - return _swig_decoders.PairFloatVectorVector_reserve(self, n) - - def capacity(self): - return _swig_decoders.PairFloatVectorVector_capacity(self) - __swig_destroy__ = _swig_decoders.delete_PairFloatVectorVector - __del__ = lambda self: None -PairFloatVectorVector_swigregister = _swig_decoders.PairFloatVectorVector_swigregister -PairFloatVectorVector_swigregister(PairFloatVectorVector) - -class PairDoubleVectorVector(_object): - __swig_setmethods__ = {} - __setattr__ = lambda self, name, value: _swig_setattr(self, PairDoubleVectorVector, name, value) - __swig_getmethods__ = {} - __getattr__ = lambda self, name: _swig_getattr(self, PairDoubleVectorVector, name) - __repr__ = _swig_repr - - def iterator(self): - return 
_swig_decoders.PairDoubleVectorVector_iterator(self) - def __iter__(self): - return self.iterator() - - def __nonzero__(self): - return _swig_decoders.PairDoubleVectorVector___nonzero__(self) - - def __bool__(self): - return _swig_decoders.PairDoubleVectorVector___bool__(self) - - def __len__(self): - return _swig_decoders.PairDoubleVectorVector___len__(self) - - def __getslice__(self, i, j): - return _swig_decoders.PairDoubleVectorVector___getslice__(self, i, j) - - def __setslice__(self, *args): - return _swig_decoders.PairDoubleVectorVector___setslice__(self, *args) - - def __delslice__(self, i, j): - return _swig_decoders.PairDoubleVectorVector___delslice__(self, i, j) - - def __delitem__(self, *args): - return _swig_decoders.PairDoubleVectorVector___delitem__(self, *args) - - def __getitem__(self, *args): - return _swig_decoders.PairDoubleVectorVector___getitem__(self, *args) - - def __setitem__(self, *args): - return _swig_decoders.PairDoubleVectorVector___setitem__(self, *args) - - def pop(self): - return _swig_decoders.PairDoubleVectorVector_pop(self) - - def append(self, x): - return _swig_decoders.PairDoubleVectorVector_append(self, x) - - def empty(self): - return _swig_decoders.PairDoubleVectorVector_empty(self) - - def size(self): - return _swig_decoders.PairDoubleVectorVector_size(self) - - def swap(self, v): - return _swig_decoders.PairDoubleVectorVector_swap(self, v) - - def begin(self): - return _swig_decoders.PairDoubleVectorVector_begin(self) - - def end(self): - return _swig_decoders.PairDoubleVectorVector_end(self) - - def rbegin(self): - return _swig_decoders.PairDoubleVectorVector_rbegin(self) - - def rend(self): - return _swig_decoders.PairDoubleVectorVector_rend(self) - - def clear(self): - return _swig_decoders.PairDoubleVectorVector_clear(self) - - def get_allocator(self): - return _swig_decoders.PairDoubleVectorVector_get_allocator(self) - - def pop_back(self): - return _swig_decoders.PairDoubleVectorVector_pop_back(self) - - def erase(self, *args): - return _swig_decoders.PairDoubleVectorVector_erase(self, *args) - - def __init__(self, *args): - this = _swig_decoders.new_PairDoubleVectorVector(*args) - try: - self.this.append(this) - except __builtin__.Exception: - self.this = this - - def push_back(self, x): - return _swig_decoders.PairDoubleVectorVector_push_back(self, x) - - def front(self): - return _swig_decoders.PairDoubleVectorVector_front(self) - - def back(self): - return _swig_decoders.PairDoubleVectorVector_back(self) - - def assign(self, n, x): - return _swig_decoders.PairDoubleVectorVector_assign(self, n, x) - - def resize(self, *args): - return _swig_decoders.PairDoubleVectorVector_resize(self, *args) - - def insert(self, *args): - return _swig_decoders.PairDoubleVectorVector_insert(self, *args) - - def reserve(self, n): - return _swig_decoders.PairDoubleVectorVector_reserve(self, n) - - def capacity(self): - return _swig_decoders.PairDoubleVectorVector_capacity(self) - __swig_destroy__ = _swig_decoders.delete_PairDoubleVectorVector - __del__ = lambda self: None -PairDoubleVectorVector_swigregister = _swig_decoders.PairDoubleVectorVector_swigregister -PairDoubleVectorVector_swigregister(PairDoubleVectorVector) - -class PairDoubleVectorVector2(_object): - __swig_setmethods__ = {} - __setattr__ = lambda self, name, value: _swig_setattr(self, PairDoubleVectorVector2, name, value) - __swig_getmethods__ = {} - __getattr__ = lambda self, name: _swig_getattr(self, PairDoubleVectorVector2, name) - __repr__ = _swig_repr - - def iterator(self): - return 
_swig_decoders.PairDoubleVectorVector2_iterator(self) - def __iter__(self): - return self.iterator() - - def __nonzero__(self): - return _swig_decoders.PairDoubleVectorVector2___nonzero__(self) - - def __bool__(self): - return _swig_decoders.PairDoubleVectorVector2___bool__(self) - - def __len__(self): - return _swig_decoders.PairDoubleVectorVector2___len__(self) - - def __getslice__(self, i, j): - return _swig_decoders.PairDoubleVectorVector2___getslice__(self, i, j) - - def __setslice__(self, *args): - return _swig_decoders.PairDoubleVectorVector2___setslice__(self, *args) - - def __delslice__(self, i, j): - return _swig_decoders.PairDoubleVectorVector2___delslice__(self, i, j) - - def __delitem__(self, *args): - return _swig_decoders.PairDoubleVectorVector2___delitem__(self, *args) - - def __getitem__(self, *args): - return _swig_decoders.PairDoubleVectorVector2___getitem__(self, *args) - - def __setitem__(self, *args): - return _swig_decoders.PairDoubleVectorVector2___setitem__(self, *args) - - def pop(self): - return _swig_decoders.PairDoubleVectorVector2_pop(self) - - def append(self, x): - return _swig_decoders.PairDoubleVectorVector2_append(self, x) - - def empty(self): - return _swig_decoders.PairDoubleVectorVector2_empty(self) - - def size(self): - return _swig_decoders.PairDoubleVectorVector2_size(self) - - def swap(self, v): - return _swig_decoders.PairDoubleVectorVector2_swap(self, v) - - def begin(self): - return _swig_decoders.PairDoubleVectorVector2_begin(self) - - def end(self): - return _swig_decoders.PairDoubleVectorVector2_end(self) - - def rbegin(self): - return _swig_decoders.PairDoubleVectorVector2_rbegin(self) - - def rend(self): - return _swig_decoders.PairDoubleVectorVector2_rend(self) - - def clear(self): - return _swig_decoders.PairDoubleVectorVector2_clear(self) - - def get_allocator(self): - return _swig_decoders.PairDoubleVectorVector2_get_allocator(self) - - def pop_back(self): - return _swig_decoders.PairDoubleVectorVector2_pop_back(self) - - def erase(self, *args): - return _swig_decoders.PairDoubleVectorVector2_erase(self, *args) - - def __init__(self, *args): - this = _swig_decoders.new_PairDoubleVectorVector2(*args) - try: - self.this.append(this) - except __builtin__.Exception: - self.this = this - - def push_back(self, x): - return _swig_decoders.PairDoubleVectorVector2_push_back(self, x) - - def front(self): - return _swig_decoders.PairDoubleVectorVector2_front(self) - - def back(self): - return _swig_decoders.PairDoubleVectorVector2_back(self) - - def assign(self, n, x): - return _swig_decoders.PairDoubleVectorVector2_assign(self, n, x) - - def resize(self, *args): - return _swig_decoders.PairDoubleVectorVector2_resize(self, *args) - - def insert(self, *args): - return _swig_decoders.PairDoubleVectorVector2_insert(self, *args) - - def reserve(self, n): - return _swig_decoders.PairDoubleVectorVector2_reserve(self, n) - - def capacity(self): - return _swig_decoders.PairDoubleVectorVector2_capacity(self) - __swig_destroy__ = _swig_decoders.delete_PairDoubleVectorVector2 - __del__ = lambda self: None -PairDoubleVectorVector2_swigregister = _swig_decoders.PairDoubleVectorVector2_swigregister -PairDoubleVectorVector2_swigregister(PairDoubleVectorVector2) - -class DoubleVector3(_object): - __swig_setmethods__ = {} - __setattr__ = lambda self, name, value: _swig_setattr(self, DoubleVector3, name, value) - __swig_getmethods__ = {} - __getattr__ = lambda self, name: _swig_getattr(self, DoubleVector3, name) - __repr__ = _swig_repr - - def iterator(self): - 
return _swig_decoders.DoubleVector3_iterator(self) - def __iter__(self): - return self.iterator() - - def __nonzero__(self): - return _swig_decoders.DoubleVector3___nonzero__(self) - - def __bool__(self): - return _swig_decoders.DoubleVector3___bool__(self) - - def __len__(self): - return _swig_decoders.DoubleVector3___len__(self) - - def __getslice__(self, i, j): - return _swig_decoders.DoubleVector3___getslice__(self, i, j) - - def __setslice__(self, *args): - return _swig_decoders.DoubleVector3___setslice__(self, *args) - - def __delslice__(self, i, j): - return _swig_decoders.DoubleVector3___delslice__(self, i, j) - - def __delitem__(self, *args): - return _swig_decoders.DoubleVector3___delitem__(self, *args) - - def __getitem__(self, *args): - return _swig_decoders.DoubleVector3___getitem__(self, *args) - - def __setitem__(self, *args): - return _swig_decoders.DoubleVector3___setitem__(self, *args) - - def pop(self): - return _swig_decoders.DoubleVector3_pop(self) - - def append(self, x): - return _swig_decoders.DoubleVector3_append(self, x) - - def empty(self): - return _swig_decoders.DoubleVector3_empty(self) - - def size(self): - return _swig_decoders.DoubleVector3_size(self) - - def swap(self, v): - return _swig_decoders.DoubleVector3_swap(self, v) - - def begin(self): - return _swig_decoders.DoubleVector3_begin(self) - - def end(self): - return _swig_decoders.DoubleVector3_end(self) - - def rbegin(self): - return _swig_decoders.DoubleVector3_rbegin(self) - - def rend(self): - return _swig_decoders.DoubleVector3_rend(self) - - def clear(self): - return _swig_decoders.DoubleVector3_clear(self) - - def get_allocator(self): - return _swig_decoders.DoubleVector3_get_allocator(self) - - def pop_back(self): - return _swig_decoders.DoubleVector3_pop_back(self) - - def erase(self, *args): - return _swig_decoders.DoubleVector3_erase(self, *args) - - def __init__(self, *args): - this = _swig_decoders.new_DoubleVector3(*args) - try: - self.this.append(this) - except __builtin__.Exception: - self.this = this - - def push_back(self, x): - return _swig_decoders.DoubleVector3_push_back(self, x) - - def front(self): - return _swig_decoders.DoubleVector3_front(self) - - def back(self): - return _swig_decoders.DoubleVector3_back(self) - - def assign(self, n, x): - return _swig_decoders.DoubleVector3_assign(self, n, x) - - def resize(self, *args): - return _swig_decoders.DoubleVector3_resize(self, *args) - - def insert(self, *args): - return _swig_decoders.DoubleVector3_insert(self, *args) - - def reserve(self, n): - return _swig_decoders.DoubleVector3_reserve(self, n) - - def capacity(self): - return _swig_decoders.DoubleVector3_capacity(self) - __swig_destroy__ = _swig_decoders.delete_DoubleVector3 - __del__ = lambda self: None -DoubleVector3_swigregister = _swig_decoders.DoubleVector3_swigregister -DoubleVector3_swigregister(DoubleVector3) - -class IntVector3(_object): - __swig_setmethods__ = {} - __setattr__ = lambda self, name, value: _swig_setattr(self, IntVector3, name, value) - __swig_getmethods__ = {} - __getattr__ = lambda self, name: _swig_getattr(self, IntVector3, name) - __repr__ = _swig_repr - - def iterator(self): - return _swig_decoders.IntVector3_iterator(self) - def __iter__(self): - return self.iterator() - - def __nonzero__(self): - return _swig_decoders.IntVector3___nonzero__(self) - - def __bool__(self): - return _swig_decoders.IntVector3___bool__(self) - - def __len__(self): - return _swig_decoders.IntVector3___len__(self) - - def __getslice__(self, i, j): - return 
_swig_decoders.IntVector3___getslice__(self, i, j) - - def __setslice__(self, *args): - return _swig_decoders.IntVector3___setslice__(self, *args) - - def __delslice__(self, i, j): - return _swig_decoders.IntVector3___delslice__(self, i, j) - - def __delitem__(self, *args): - return _swig_decoders.IntVector3___delitem__(self, *args) - - def __getitem__(self, *args): - return _swig_decoders.IntVector3___getitem__(self, *args) - - def __setitem__(self, *args): - return _swig_decoders.IntVector3___setitem__(self, *args) - - def pop(self): - return _swig_decoders.IntVector3_pop(self) - - def append(self, x): - return _swig_decoders.IntVector3_append(self, x) - - def empty(self): - return _swig_decoders.IntVector3_empty(self) - - def size(self): - return _swig_decoders.IntVector3_size(self) - - def swap(self, v): - return _swig_decoders.IntVector3_swap(self, v) - - def begin(self): - return _swig_decoders.IntVector3_begin(self) - - def end(self): - return _swig_decoders.IntVector3_end(self) - - def rbegin(self): - return _swig_decoders.IntVector3_rbegin(self) - - def rend(self): - return _swig_decoders.IntVector3_rend(self) - - def clear(self): - return _swig_decoders.IntVector3_clear(self) - - def get_allocator(self): - return _swig_decoders.IntVector3_get_allocator(self) - - def pop_back(self): - return _swig_decoders.IntVector3_pop_back(self) - - def erase(self, *args): - return _swig_decoders.IntVector3_erase(self, *args) - - def __init__(self, *args): - this = _swig_decoders.new_IntVector3(*args) - try: - self.this.append(this) - except __builtin__.Exception: - self.this = this - - def push_back(self, x): - return _swig_decoders.IntVector3_push_back(self, x) - - def front(self): - return _swig_decoders.IntVector3_front(self) - - def back(self): - return _swig_decoders.IntVector3_back(self) - - def assign(self, n, x): - return _swig_decoders.IntVector3_assign(self, n, x) - - def resize(self, *args): - return _swig_decoders.IntVector3_resize(self, *args) - - def insert(self, *args): - return _swig_decoders.IntVector3_insert(self, *args) - - def reserve(self, n): - return _swig_decoders.IntVector3_reserve(self, n) - - def capacity(self): - return _swig_decoders.IntVector3_capacity(self) - __swig_destroy__ = _swig_decoders.delete_IntVector3 - __del__ = lambda self: None -IntVector3_swigregister = _swig_decoders.IntVector3_swigregister -IntVector3_swigregister(IntVector3) - -class TrieVector(_object): - __swig_setmethods__ = {} - __setattr__ = lambda self, name, value: _swig_setattr(self, TrieVector, name, value) - __swig_getmethods__ = {} - __getattr__ = lambda self, name: _swig_getattr(self, TrieVector, name) - __repr__ = _swig_repr - - def iterator(self): - return _swig_decoders.TrieVector_iterator(self) - def __iter__(self): - return self.iterator() - - def __nonzero__(self): - return _swig_decoders.TrieVector___nonzero__(self) - - def __bool__(self): - return _swig_decoders.TrieVector___bool__(self) - - def __len__(self): - return _swig_decoders.TrieVector___len__(self) - - def __getslice__(self, i, j): - return _swig_decoders.TrieVector___getslice__(self, i, j) - - def __setslice__(self, *args): - return _swig_decoders.TrieVector___setslice__(self, *args) - - def __delslice__(self, i, j): - return _swig_decoders.TrieVector___delslice__(self, i, j) - - def __delitem__(self, *args): - return _swig_decoders.TrieVector___delitem__(self, *args) - - def __getitem__(self, *args): - return _swig_decoders.TrieVector___getitem__(self, *args) - - def __setitem__(self, *args): - return 
_swig_decoders.TrieVector___setitem__(self, *args) - - def pop(self): - return _swig_decoders.TrieVector_pop(self) - - def append(self, x): - return _swig_decoders.TrieVector_append(self, x) - - def empty(self): - return _swig_decoders.TrieVector_empty(self) - - def size(self): - return _swig_decoders.TrieVector_size(self) - - def swap(self, v): - return _swig_decoders.TrieVector_swap(self, v) - - def begin(self): - return _swig_decoders.TrieVector_begin(self) - - def end(self): - return _swig_decoders.TrieVector_end(self) - - def rbegin(self): - return _swig_decoders.TrieVector_rbegin(self) - - def rend(self): - return _swig_decoders.TrieVector_rend(self) - - def clear(self): - return _swig_decoders.TrieVector_clear(self) - - def get_allocator(self): - return _swig_decoders.TrieVector_get_allocator(self) - - def pop_back(self): - return _swig_decoders.TrieVector_pop_back(self) - - def erase(self, *args): - return _swig_decoders.TrieVector_erase(self, *args) - - def __init__(self, *args): - this = _swig_decoders.new_TrieVector(*args) - try: - self.this.append(this) - except __builtin__.Exception: - self.this = this - - def push_back(self, x): - return _swig_decoders.TrieVector_push_back(self, x) - - def front(self): - return _swig_decoders.TrieVector_front(self) - - def back(self): - return _swig_decoders.TrieVector_back(self) - - def assign(self, n, x): - return _swig_decoders.TrieVector_assign(self, n, x) - - def resize(self, *args): - return _swig_decoders.TrieVector_resize(self, *args) - - def insert(self, *args): - return _swig_decoders.TrieVector_insert(self, *args) - - def reserve(self, n): - return _swig_decoders.TrieVector_reserve(self, n) - - def capacity(self): - return _swig_decoders.TrieVector_capacity(self) - __swig_destroy__ = _swig_decoders.delete_TrieVector - __del__ = lambda self: None -TrieVector_swigregister = _swig_decoders.TrieVector_swigregister -TrieVector_swigregister(TrieVector) - -class BoolVector(_object): - __swig_setmethods__ = {} - __setattr__ = lambda self, name, value: _swig_setattr(self, BoolVector, name, value) - __swig_getmethods__ = {} - __getattr__ = lambda self, name: _swig_getattr(self, BoolVector, name) - __repr__ = _swig_repr - - def iterator(self): - return _swig_decoders.BoolVector_iterator(self) - def __iter__(self): - return self.iterator() - - def __nonzero__(self): - return _swig_decoders.BoolVector___nonzero__(self) - - def __bool__(self): - return _swig_decoders.BoolVector___bool__(self) - - def __len__(self): - return _swig_decoders.BoolVector___len__(self) - - def __getslice__(self, i, j): - return _swig_decoders.BoolVector___getslice__(self, i, j) - - def __setslice__(self, *args): - return _swig_decoders.BoolVector___setslice__(self, *args) - - def __delslice__(self, i, j): - return _swig_decoders.BoolVector___delslice__(self, i, j) - - def __delitem__(self, *args): - return _swig_decoders.BoolVector___delitem__(self, *args) - - def __getitem__(self, *args): - return _swig_decoders.BoolVector___getitem__(self, *args) - - def __setitem__(self, *args): - return _swig_decoders.BoolVector___setitem__(self, *args) - - def pop(self): - return _swig_decoders.BoolVector_pop(self) - - def append(self, x): - return _swig_decoders.BoolVector_append(self, x) - - def empty(self): - return _swig_decoders.BoolVector_empty(self) - - def size(self): - return _swig_decoders.BoolVector_size(self) - - def swap(self, v): - return _swig_decoders.BoolVector_swap(self, v) - - def begin(self): - return _swig_decoders.BoolVector_begin(self) - - def end(self): - 
return _swig_decoders.BoolVector_end(self) - - def rbegin(self): - return _swig_decoders.BoolVector_rbegin(self) - - def rend(self): - return _swig_decoders.BoolVector_rend(self) - - def clear(self): - return _swig_decoders.BoolVector_clear(self) - - def get_allocator(self): - return _swig_decoders.BoolVector_get_allocator(self) - - def pop_back(self): - return _swig_decoders.BoolVector_pop_back(self) - - def erase(self, *args): - return _swig_decoders.BoolVector_erase(self, *args) - - def __init__(self, *args): - this = _swig_decoders.new_BoolVector(*args) - try: - self.this.append(this) - except __builtin__.Exception: - self.this = this - - def push_back(self, x): - return _swig_decoders.BoolVector_push_back(self, x) - - def front(self): - return _swig_decoders.BoolVector_front(self) - - def back(self): - return _swig_decoders.BoolVector_back(self) - - def assign(self, n, x): - return _swig_decoders.BoolVector_assign(self, n, x) - - def resize(self, *args): - return _swig_decoders.BoolVector_resize(self, *args) - - def insert(self, *args): - return _swig_decoders.BoolVector_insert(self, *args) - - def reserve(self, n): - return _swig_decoders.BoolVector_reserve(self, n) - - def capacity(self): - return _swig_decoders.BoolVector_capacity(self) - __swig_destroy__ = _swig_decoders.delete_BoolVector - __del__ = lambda self: None -BoolVector_swigregister = _swig_decoders.BoolVector_swigregister -BoolVector_swigregister(BoolVector) - - -def IntDoublePairCompSecondRev(a, b): - return _swig_decoders.IntDoublePairCompSecondRev(a, b) -IntDoublePairCompSecondRev = _swig_decoders.IntDoublePairCompSecondRev - -def StringDoublePairCompSecondRev(a, b): - return _swig_decoders.StringDoublePairCompSecondRev(a, b) -StringDoublePairCompSecondRev = _swig_decoders.StringDoublePairCompSecondRev - -def DoubleStringPairCompFirstRev(a, b): - return _swig_decoders.DoubleStringPairCompFirstRev(a, b) -DoubleStringPairCompFirstRev = _swig_decoders.DoubleStringPairCompFirstRev -class RetriveStrEnumerateVocab(_object): - __swig_setmethods__ = {} - __setattr__ = lambda self, name, value: _swig_setattr(self, RetriveStrEnumerateVocab, name, value) - __swig_getmethods__ = {} - __getattr__ = lambda self, name: _swig_getattr(self, RetriveStrEnumerateVocab, name) - __repr__ = _swig_repr - - def __init__(self): - this = _swig_decoders.new_RetriveStrEnumerateVocab() - try: - self.this.append(this) - except __builtin__.Exception: - self.this = this - - def Add(self, index, str): - return _swig_decoders.RetriveStrEnumerateVocab_Add(self, index, str) - __swig_setmethods__["vocabulary"] = _swig_decoders.RetriveStrEnumerateVocab_vocabulary_set - __swig_getmethods__["vocabulary"] = _swig_decoders.RetriveStrEnumerateVocab_vocabulary_get - if _newclass: - vocabulary = _swig_property(_swig_decoders.RetriveStrEnumerateVocab_vocabulary_get, _swig_decoders.RetriveStrEnumerateVocab_vocabulary_set) - __swig_destroy__ = _swig_decoders.delete_RetriveStrEnumerateVocab - __del__ = lambda self: None -RetriveStrEnumerateVocab_swigregister = _swig_decoders.RetriveStrEnumerateVocab_swigregister -RetriveStrEnumerateVocab_swigregister(RetriveStrEnumerateVocab) -cvar = _swig_decoders.cvar -OOV_SCORE = cvar.OOV_SCORE -START_TOKEN = cvar.START_TOKEN -UNK_TOKEN = cvar.UNK_TOKEN -END_TOKEN = cvar.END_TOKEN - -class Scorer(_object): - __swig_setmethods__ = {} - __setattr__ = lambda self, name, value: _swig_setattr(self, Scorer, name, value) - __swig_getmethods__ = {} - __getattr__ = lambda self, name: _swig_getattr(self, Scorer, name) - __repr__ = 
_swig_repr - - def __init__(self, alpha, beta, lm_path, vocabulary): - this = _swig_decoders.new_Scorer(alpha, beta, lm_path, vocabulary) - try: - self.this.append(this) - except __builtin__.Exception: - self.this = this - __swig_destroy__ = _swig_decoders.delete_Scorer - __del__ = lambda self: None - - def get_log_cond_prob(self, words): - return _swig_decoders.Scorer_get_log_cond_prob(self, words) - - def get_sent_log_prob(self, words): - return _swig_decoders.Scorer_get_sent_log_prob(self, words) - - def get_max_order(self): - return _swig_decoders.Scorer_get_max_order(self) - - def get_dict_size(self): - return _swig_decoders.Scorer_get_dict_size(self) - - def is_character_based(self): - return _swig_decoders.Scorer_is_character_based(self) - - def reset_params(self, alpha, beta): - return _swig_decoders.Scorer_reset_params(self, alpha, beta) - - def make_ngram(self, prefix): - return _swig_decoders.Scorer_make_ngram(self, prefix) - - def split_labels(self, labels): - return _swig_decoders.Scorer_split_labels(self, labels) - __swig_setmethods__["alpha"] = _swig_decoders.Scorer_alpha_set - __swig_getmethods__["alpha"] = _swig_decoders.Scorer_alpha_get - if _newclass: - alpha = _swig_property(_swig_decoders.Scorer_alpha_get, _swig_decoders.Scorer_alpha_set) - __swig_setmethods__["beta"] = _swig_decoders.Scorer_beta_set - __swig_getmethods__["beta"] = _swig_decoders.Scorer_beta_get - if _newclass: - beta = _swig_property(_swig_decoders.Scorer_beta_get, _swig_decoders.Scorer_beta_set) - __swig_setmethods__["dictionary"] = _swig_decoders.Scorer_dictionary_set - __swig_getmethods__["dictionary"] = _swig_decoders.Scorer_dictionary_get - if _newclass: - dictionary = _swig_property(_swig_decoders.Scorer_dictionary_get, _swig_decoders.Scorer_dictionary_set) -Scorer_swigregister = _swig_decoders.Scorer_swigregister -Scorer_swigregister(Scorer) - - -def ctc_beam_search_decoder(log_probs_seq, log_probs_idx, root, start, beam_size, blank_id=0, space_id=-1, cutoff_prob=0.999, ext_scorer=None): - return _swig_decoders.ctc_beam_search_decoder(log_probs_seq, log_probs_idx, root, start, beam_size, blank_id, space_id, cutoff_prob, ext_scorer) -ctc_beam_search_decoder = _swig_decoders.ctc_beam_search_decoder - -def ctc_beam_search_decoder_batch(batch_log_probs_seq, batch_log_probs_idx, batch_root_trie, batch_start, beam_size, num_processes, blank_id=0, space_id=-1, cutoff_prob=0.999, ext_scorer=None): - return _swig_decoders.ctc_beam_search_decoder_batch(batch_log_probs_seq, batch_log_probs_idx, batch_root_trie, batch_start, beam_size, num_processes, blank_id, space_id, cutoff_prob, ext_scorer) -ctc_beam_search_decoder_batch = _swig_decoders.ctc_beam_search_decoder_batch - -def map_sent(sent, vocabulary, greedy=False, blank_id=0): - return _swig_decoders.map_sent(sent, vocabulary, greedy, blank_id) -map_sent = _swig_decoders.map_sent - -def map_batch(batch_sents, vocabulary, num_processes, greedy=False, blank_id=0): - return _swig_decoders.map_batch(batch_sents, vocabulary, num_processes, greedy, blank_id) -map_batch = _swig_decoders.map_batch -# This file is compatible with both classic and new-style classes. 
- - diff --git a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/test/test_en.py b/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/test/test_en.py deleted file mode 100644 index d777360a670753df0c783772de70be12aba16ac4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/test/test_en.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import swig_decoders as decoder -import numpy as np - -probs_seq = [[ - 0.06390443, 0.21124858, 0.27323887, 0.06870235, 0.0361254, - 0.18184413, 0.16493624 - ], [ - 0.03309247, 0.22866108, 0.24390638, 0.09699597, 0.31895462, - 0.0094893, 0.06890021 - ], [ - 0.218104, 0.19992557, 0.18245131, 0.08503348, 0.14903535, - 0.08424043, 0.08120984 - ], [ - 0.12094152, 0.19162472, 0.01473646, 0.28045061, 0.24246305, - 0.05206269, 0.09772094 - ], [ - 0.1333387, 0.00550838, 0.00301669, 0.21745861, 0.20803985, - 0.41317442, 0.01946335 - ], [ - 0.16468227, 0.1980699, 0.1906545, 0.18963251, 0.19860937, - 0.04377724, 0.01457421 - ]] - -vocab_list = ["\'", " ", "a", "b", "c", "d"] - -log_prob_seq = np.log(np.array(probs_seq, dtype=np.float32)) -log_probs_idx = np.argsort(log_prob_seq, axis=-1)[:, ::-1] -log_prob_seq = np.sort(log_prob_seq, axis=-1)[:, ::-1] - -root = decoder.PathTrie() -root.score = root.log_prob_b_prev = 0.0 -beam_size=20 - -chunk_log_prob_seq = [li.tolist() for li in log_prob_seq] -chunk_log_probs_idx = [li.tolist() for li in log_probs_idx] - -alpha = 0.5 -beta = 0.5 -lm_path = '../kenlm/lm/test.arpa' -scorer = decoder.Scorer(alpha, beta, lm_path, vocab_list) - -root2 = decoder.TrieVector() -temp_dict = {} -for i in range(2): - root = decoder.PathTrie() - temp_dict[i] = root - root2.push_back(root) - - -batch_chunk_log_prob_seq = [chunk_log_prob_seq, chunk_log_prob_seq] -batch_chunk_log_probs_idx = [chunk_log_probs_idx, chunk_log_probs_idx] -batch_chunk_length = [6, 6] -batch_start = [True, True] - -result1 = decoder.ctc_beam_search_decoder_batch(batch_chunk_log_prob_seq, - batch_chunk_log_probs_idx, - root2, - batch_start, - beam_size, 1, 6, 1, 0.9999, scorer) -# print single sentence result -print(decoder.map_sent(result1[0][0][1], vocab_list)) -print(result1[0]) - -# Test stateful decoder -# continue decoding -batch_start = [False, False] -result2 = decoder.ctc_beam_search_decoder_batch(batch_chunk_log_prob_seq, - batch_chunk_log_probs_idx, - root2, - batch_start, - beam_size, 1, 6, 1, 0.9999, scorer) - -print(decoder.map_batch([result1[0][0][1], result1[1][0][1]], vocab_list, 1)) diff --git a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/test/test_zh.py b/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/test/test_zh.py deleted file mode 100644 index 34d25de10e8ec36ae8069c187ea6b88863204eca..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/ctc_decoder/swig/test/test_zh.py +++ /dev/null @@ -1,142 
+0,0 @@ - -import numpy as np -import logging -from swig_decoders import TrieVector, ctc_beam_search_decoder_batch, \ - map_sent, map_batch, \ - PathTrie, TrieVector -import multiprocessing - -logging.basicConfig(filename='out.log', level=logging.INFO) - -def test_prefix_beam_search(batch_log_ctc_probs, batch_lens, beam_size, blank_id, space_id, cutoff_prob=0.999): - """ - Prefix beam search - Params: - batch_log_probs: B x T x V, the log probabilities of a sequence - batch_lens: B, the actual length of each sequence - Return: - hyps: a batch of beam candidates for each sequence - [[(score, cand_list1), (score, cand_list2), ....(score, cand_list_beam)], - [(score, cand_list1), (score, candi_list2), ...], - ... - []] - """ - #batch_log_probs_seq, batch_log_probs_idx = torch.topk(batch_log_ctc_probs, beam_size, dim=-1) - batch_log_probs_idx = np.argsort(batch_log_ctc_probs, axis=-1)[:, :, ::-1] - batch_log_probs_seq = np.sort(batch_log_ctc_probs, axis=-1)[:, :, ::-1] - batch_log_probs_seq_list = batch_log_probs_seq.tolist() - batch_log_probs_idx_list = batch_log_probs_idx.tolist() - batch_len_list = batch_lens.tolist() - batch_log_probs_seq = [] - batch_log_probs_ids = [] - batch_start = [] - batch_root = TrieVector() - root_dict = {} - for i in range(len(batch_len_list)): - num_sent = batch_len_list[i] - batch_log_probs_seq.append(batch_log_probs_seq_list[i][0:num_sent]) - batch_log_probs_ids.append(batch_log_probs_idx_list[i][0:num_sent]) - root_dict[i] = PathTrie() - batch_root.append(root_dict[i]) - batch_start.append(True) - num_processes = min(multiprocessing.cpu_count()-1, len(batch_log_probs_seq)) - score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq, - batch_log_probs_ids, - batch_root, - batch_start, - beam_size, - num_processes, - blank_id, - space_id, - cutoff_prob) - return score_hyps - -def test_batch_greedy_search(batch_log_ctc_probs, batch_lens, vocab_list, blank_id): - """ - Greedy search - Params: - batch_log_ctc_probs: B x T x V - batch_lens: B - vocab_list: a list of symbols, of size V - blank_id: id for blank symbol - Return: - batch of decoded string sentences - """ - - sort_ids = np.argsort(batch_log_ctc_probs, axis=-1)[:, :, ::-1] - batch_greedy_ids = sort_ids[:, :, 0].tolist() - batch_len_list = batch_lens.tolist() - batch_ids = [] - for seq_ids, seq_len in zip(batch_greedy_ids, batch_len_list): - batch_ids.append(seq_ids[0: seq_len]) - num_processes = min(multiprocessing.cpu_count()-1, len(batch_ids)) - greedy = True - result = map_batch(batch_ids, vocab_list, num_processes, greedy, blank_id) - return result - - -def test_map_batch(batch_sent_list, vocab_list, blank_id): - """ - Map a batch of sent ids to string - Prams: - batch_sent_list: a list of list of ids - vocab_list: a list of symbols, of size V - blank_id: id for blank symbol - """ - num_processes = min(multiprocessing.cpu_count()-1, len(batch_sent_list)) - greedy = False # this is not used for greedy search so we set it to false - results = map_batch(batch_sent_list, vocab_list, num_processes, greedy, blank_id) - return results - -def test_map_sent(sent_ids, vocab_list, greedy, blank_id): - """ - Map one sentence ids to string - greedy: False, just map. 
True, use ctc greedy search - """ - return map_sent(sent_ids, vocab_list, greedy, blank_id) - - -def load_vocab(vocab_file): - vocab = [] - with open(vocab_file, "r", encoding="utf-8") as f: - for line in f: - line = line.strip().split() - vocab.append(line[0]) - return vocab - -if __name__ == "__main__": - input = "data/test.npz" - word = "data/words.txt" - beam_size = 10 - blank_id = 0 - space_id = 45 - - vocab_list = load_vocab(word) - input = np.load(input) - batch_log_ctc_probs = input['batch_log_ctc_probs'] - batch_len = input["batch_len"] - # ctc prefix beam search - logging.info("Testing ctc prefix beam search") - score_hyps = test_prefix_beam_search(batch_log_ctc_probs, - batch_len, - beam_size, - blank_id, - space_id, - cutoff_prob=0.999) - # map the most probable cand ids to string - batch_ids = [score_hyps[0][0][1], score_hyps[1][0][1]] - map_sents = test_map_batch(batch_ids, vocab_list, blank_id) - logging.info(map_sents) - - logging.info("Testing greedy search") - # greedy search - greedy_sents = test_batch_greedy_search(batch_log_ctc_probs, - batch_len, - vocab_list, - blank_id) - logging.info(greedy_sents) - - logging.info("Test one sentence") - sent_ids = score_hyps[0][0][1] - one_sent = test_map_sent(sent_ids, vocab_list, False, blank_id) - logging.info(one_sent) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/data.list b/models/audio/speech_recognition/conformer/igie/data.list deleted file mode 100644 index d584b0ee8d13cd0a83182da8edd84d5dcc547f56..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/data.list +++ /dev/null @@ -1,7176 +0,0 @@ -{"key": "BAC009S0764W0121", "wav": "./aishell/wav/test/S0764/BAC009S0764W0121.wav", "txt": "甚至出现交易几乎停滞的情况"} -{"key": "BAC009S0764W0122", "wav": "./aishell/wav/test/S0764/BAC009S0764W0122.wav", "txt": "一二线城市虽然也处于调整中"} -{"key": "BAC009S0764W0123", "wav": "./aishell/wav/test/S0764/BAC009S0764W0123.wav", "txt": "但因为聚集了过多公共资源"} -{"key": "BAC009S0764W0124", "wav": "./aishell/wav/test/S0764/BAC009S0764W0124.wav", "txt": "为了规避三四线城市明显过剩的市场风险"} -{"key": "BAC009S0764W0125", "wav": "./aishell/wav/test/S0764/BAC009S0764W0125.wav", "txt": "标杆房企必然调整市场战略"} -{"key": "BAC009S0764W0126", "wav": "./aishell/wav/test/S0764/BAC009S0764W0126.wav", "txt": "因此土地储备至关重要"} -{"key": "BAC009S0764W0127", "wav": "./aishell/wav/test/S0764/BAC009S0764W0127.wav", "txt": "中原地产首席分析师张大伟说"} -{"key": "BAC009S0764W0128", "wav": "./aishell/wav/test/S0764/BAC009S0764W0128.wav", "txt": "一线城市土地供应量减少"} -{"key": "BAC009S0764W0129", "wav": "./aishell/wav/test/S0764/BAC009S0764W0129.wav", "txt": "也助推了土地市场的火爆"} -{"key": "BAC009S0764W0130", "wav": "./aishell/wav/test/S0764/BAC009S0764W0130.wav", "txt": "北京仅新增住宅土地供应十宗"} -{"key": "BAC009S0764W0131", "wav": "./aishell/wav/test/S0764/BAC009S0764W0131.wav", "txt": "开发边界将作为城市发展的刚性约定"} -{"key": "BAC009S0764W0132", "wav": "./aishell/wav/test/S0764/BAC009S0764W0132.wav", "txt": "不得超越界限盲目扩张"} -{"key": "BAC009S0764W0133", "wav": "./aishell/wav/test/S0764/BAC009S0764W0133.wav", "txt": "目前挂牌的只有几宗土地"} -{"key": "BAC009S0764W0134", "wav": "./aishell/wav/test/S0764/BAC009S0764W0134.wav", "txt": "再加上近期一二线楼市升温"} -{"key": "BAC009S0764W0135", "wav": "./aishell/wav/test/S0764/BAC009S0764W0135.wav", "txt": "房企对土地的争抢更加积极"} -{"key": "BAC009S0764W0136", "wav": "./aishell/wav/test/S0764/BAC009S0764W0136.wav", "txt": "土地市场体现了房企对一二线市场的看重"} -{"key": "BAC009S0764W0137", "wav": "./aishell/wav/test/S0764/BAC009S0764W0137.wav", "txt": "面包价格会跟风上涨吗"} -{"key": "BAC009S0764W0138", "wav": 
"./aishell/wav/test/S0764/BAC009S0764W0138.wav", "txt": "成交量环比大幅增加"} -{"key": "BAC009S0764W0139", "wav": "./aishell/wav/test/S0764/BAC009S0764W0139.wav", "txt": "国家统计局的数据显示"} -{"key": "BAC009S0764W0140", "wav": "./aishell/wav/test/S0764/BAC009S0764W0140.wav", "txt": "其中广州深圳甚至出现了多个日光盘"} -{"key": "BAC009S0764W0141", "wav": "./aishell/wav/test/S0764/BAC009S0764W0141.wav", "txt": "零三年到去年"} -{"key": "BAC009S0764W0142", "wav": "./aishell/wav/test/S0764/BAC009S0764W0142.wav", "txt": "市场基数已不可同日而语"} -{"key": "BAC009S0764W0143", "wav": "./aishell/wav/test/S0764/BAC009S0764W0143.wav", "txt": "在市场整体从高速增长进入中高速增长区间的同时"} -{"key": "BAC009S0764W0144", "wav": "./aishell/wav/test/S0764/BAC009S0764W0144.wav", "txt": "一线城市在价格较高的基础上整体回升并领涨全国"} -{"key": "BAC009S0764W0145", "wav": "./aishell/wav/test/S0764/BAC009S0764W0145.wav", "txt": "绝大部分三线城市房价仍然下降"} -{"key": "BAC009S0764W0146", "wav": "./aishell/wav/test/S0764/BAC009S0764W0146.wav", "txt": "一线楼市成交量激增"} -{"key": "BAC009S0764W0147", "wav": "./aishell/wav/test/S0764/BAC009S0764W0147.wav", "txt": "三四线城市依然冷清"} -{"key": "BAC009S0764W0148", "wav": "./aishell/wav/test/S0764/BAC009S0764W0148.wav", "txt": "根据中原地产研究中心最新数据"} -{"key": "BAC009S0764W0149", "wav": "./aishell/wav/test/S0764/BAC009S0764W0149.wav", "txt": "一线城市签约十七万套"} -{"key": "BAC009S0764W0150", "wav": "./aishell/wav/test/S0764/BAC009S0764W0150.wav", "txt": "同比涨幅达到百分之四"} -{"key": "BAC009S0764W0151", "wav": "./aishell/wav/test/S0764/BAC009S0764W0151.wav", "txt": "三线城市签约十六万套"} -{"key": "BAC009S0764W0152", "wav": "./aishell/wav/test/S0764/BAC009S0764W0152.wav", "txt": "四线城市成交量有轻微下调"} -{"key": "BAC009S0764W0153", "wav": "./aishell/wav/test/S0764/BAC009S0764W0153.wav", "txt": "住房城乡建设部政策研究中心主任秦虹表示"} -{"key": "BAC009S0764W0154", "wav": "./aishell/wav/test/S0764/BAC009S0764W0154.wav", "txt": "我国房地产市场过去从体偏紧部分地区过紧"} -{"key": "BAC009S0764W0155", "wav": "./aishell/wav/test/S0764/BAC009S0764W0155.wav", "txt": "总体偏松部分地区过剩"} -{"key": "BAC009S0764W0156", "wav": "./aishell/wav/test/S0764/BAC009S0764W0156.wav", "txt": "当供给远快于需求时"} -{"key": "BAC009S0764W0157", "wav": "./aishell/wav/test/S0764/BAC009S0764W0157.wav", "txt": "很难出现去年那样的楼市暴涨"} -{"key": "BAC009S0764W0158", "wav": "./aishell/wav/test/S0764/BAC009S0764W0158.wav", "txt": "即便是北上广深等供应偏紧的一线城市"} -{"key": "BAC009S0764W0159", "wav": "./aishell/wav/test/S0764/BAC009S0764W0159.wav", "txt": "也有限购政策在控制需求规模"} -{"key": "BAC009S0764W0160", "wav": "./aishell/wav/test/S0764/BAC009S0764W0160.wav", "txt": "从而有利于抑制楼市过快上涨"} -{"key": "BAC009S0764W0161", "wav": "./aishell/wav/test/S0764/BAC009S0764W0161.wav", "txt": "楼市调控供的行政手段宜减不宜加"} -{"key": "BAC009S0764W0162", "wav": "./aishell/wav/test/S0764/BAC009S0764W0162.wav", "txt": "稳增长措施需更全面地考虑化解楼市风险问题"} -{"key": "BAC009S0764W0163", "wav": "./aishell/wav/test/S0764/BAC009S0764W0163.wav", "txt": "楼市调控将去向何方"} -{"key": "BAC009S0764W0164", "wav": "./aishell/wav/test/S0764/BAC009S0764W0164.wav", "txt": "进一步发挥市场在资源配置中的决定性作用"} -{"key": "BAC009S0764W0165", "wav": "./aishell/wav/test/S0764/BAC009S0764W0165.wav", "txt": "楼市调控的行政手段宜减不宜加"} -{"key": "BAC009S0764W0166", "wav": "./aishell/wav/test/S0764/BAC009S0764W0166.wav", "txt": "去行政化"} -{"key": "BAC009S0764W0167", "wav": "./aishell/wav/test/S0764/BAC009S0764W0167.wav", "txt": "随着市场调整的深入"} -{"key": "BAC009S0764W0168", "wav": "./aishell/wav/test/S0764/BAC009S0764W0168.wav", "txt": "一些三线城市取消限购及限贷"} -{"key": "BAC009S0764W0169", "wav": "./aishell/wav/test/S0764/BAC009S0764W0169.wav", "txt": "实施较大幅度的补贴政策"} -{"key": "BAC009S0764W0170", "wav": "./aishell/wav/test/S0764/BAC009S0764W0170.wav", "txt": 
"当地新建商品住宅的房价多在每平方米三四千元"} -{"key": "BAC009S0764W0171", "wav": "./aishell/wav/test/S0764/BAC009S0764W0171.wav", "txt": "政府出台每平方米补贴五百元的托市政策"} -{"key": "BAC009S0764W0172", "wav": "./aishell/wav/test/S0764/BAC009S0764W0172.wav", "txt": "由于不可能从根本上改变供求关系"} -{"key": "BAC009S0764W0173", "wav": "./aishell/wav/test/S0764/BAC009S0764W0173.wav", "txt": "类似的补贴政策常常是短效刺激"} -{"key": "BAC009S0764W0174", "wav": "./aishell/wav/test/S0764/BAC009S0764W0174.wav", "txt": "会对市场造成新一轮的干扰"} -{"key": "BAC009S0764W0175", "wav": "./aishell/wav/test/S0764/BAC009S0764W0175.wav", "txt": "安徽铜陵结束了当地契税补贴政策"} -{"key": "BAC009S0764W0176", "wav": "./aishell/wav/test/S0764/BAC009S0764W0176.wav", "txt": "当月住宅类商品房成交套数骤跌"} -{"key": "BAC009S0764W0177", "wav": "./aishell/wav/test/S0764/BAC009S0764W0177.wav", "txt": "在经济下行压力加大的背景下"} -{"key": "BAC009S0764W0178", "wav": "./aishell/wav/test/S0764/BAC009S0764W0178.wav", "txt": "稳增长措施需更全面地考虑化解楼市风险问题"} -{"key": "BAC009S0764W0179", "wav": "./aishell/wav/test/S0764/BAC009S0764W0179.wav", "txt": "国务院发展研究中心市场经济研究所副所长邓郁松认为"} -{"key": "BAC009S0764W0180", "wav": "./aishell/wav/test/S0764/BAC009S0764W0180.wav", "txt": "可能引发房价泡沫风险"} -{"key": "BAC009S0764W0181", "wav": "./aishell/wav/test/S0764/BAC009S0764W0181.wav", "txt": "在经济增速放缓阶段运用货币政策工具时"} -{"key": "BAC009S0764W0183", "wav": "./aishell/wav/test/S0764/BAC009S0764W0183.wav", "txt": "基本住房需求得到满足后"} -{"key": "BAC009S0764W0184", "wav": "./aishell/wav/test/S0764/BAC009S0764W0184.wav", "txt": "对绿色高效宜居的高品质住房需求快速上升"} -{"key": "BAC009S0764W0185", "wav": "./aishell/wav/test/S0764/BAC009S0764W0185.wav", "txt": "通过改革和政策调整"} -{"key": "BAC009S0764W0186", "wav": "./aishell/wav/test/S0764/BAC009S0764W0186.wav", "txt": "实现我国房地产市场的平稳运行"} -{"key": "BAC009S0764W0187", "wav": "./aishell/wav/test/S0764/BAC009S0764W0187.wav", "txt": "及时发现产业发展中的倾向性苗头性问题"} -{"key": "BAC009S0764W0188", "wav": "./aishell/wav/test/S0764/BAC009S0764W0188.wav", "txt": "促进战略性新兴产业健康发展"} -{"key": "BAC009S0764W0189", "wav": "./aishell/wav/test/S0764/BAC009S0764W0189.wav", "txt": "有关部门和社会各界积极响应"} -{"key": "BAC009S0764W0190", "wav": "./aishell/wav/test/S0764/BAC009S0764W0190.wav", "txt": "采取了一系列的政策措施"} -{"key": "BAC009S0764W0191", "wav": "./aishell/wav/test/S0764/BAC009S0764W0191.wav", "txt": "促使我国战略性新兴产业发展实现了良好开局"} -{"key": "BAC009S0764W0192", "wav": "./aishell/wav/test/S0764/BAC009S0764W0192.wav", "txt": "战略性新兴产业在各地呈现出集聚蓬勃发展的态势"} -{"key": "BAC009S0764W0193", "wav": "./aishell/wav/test/S0764/BAC009S0764W0193.wav", "txt": "先后出台的战略性新兴产业的政策措施主要有六项"} -{"key": "BAC009S0764W0194", "wav": "./aishell/wav/test/S0764/BAC009S0764W0194.wav", "txt": "在加强宏观引导方面"} -{"key": "BAC009S0764W0195", "wav": "./aishell/wav/test/S0764/BAC009S0764W0195.wav", "txt": "形成了系统完整的规划体系"} -{"key": "BAC009S0764W0196", "wav": "./aishell/wav/test/S0764/BAC009S0764W0196.wav", "txt": "明确了发展目标和重点任务"} -{"key": "BAC009S0764W0197", "wav": "./aishell/wav/test/S0764/BAC009S0764W0197.wav", "txt": "在加大要素支持方面"} -{"key": "BAC009S0764W0198", "wav": "./aishell/wav/test/S0764/BAC009S0764W0198.wav", "txt": "新批复了七只创投基金的设立方案"} -{"key": "BAC009S0764W0199", "wav": "./aishell/wav/test/S0764/BAC009S0764W0199.wav", "txt": "吸引社会资本七亿元"} -{"key": "BAC009S0764W0200", "wav": "./aishell/wav/test/S0764/BAC009S0764W0200.wav", "txt": "在加快体制改革方面"} -{"key": "BAC009S0764W0201", "wav": "./aishell/wav/test/S0764/BAC009S0764W0201.wav", "txt": "组织了第一批七个地区城市开展三网融合试点"} -{"key": "BAC009S0764W0202", "wav": "./aishell/wav/test/S0764/BAC009S0764W0202.wav", "txt": "第二批三网融合试点工作业已启动"} -{"key": "BAC009S0764W0203", "wav": "./aishell/wav/test/S0764/BAC009S0764W0203.wav", "txt": 
"制定了可再生能源电价附加补贴和配额交易方案"} -{"key": "BAC009S0764W0204", "wav": "./aishell/wav/test/S0764/BAC009S0764W0204.wav", "txt": "发改委双节期间重点关注电商促销行为"} -{"key": "BAC009S0764W0205", "wav": "./aishell/wav/test/S0764/BAC009S0764W0205.wav", "txt": "本报记者王颖春国家发改委近日发出通知"} -{"key": "BAC009S0764W0206", "wav": "./aishell/wav/test/S0764/BAC009S0764W0206.wav", "txt": "相关公司股票走势农产品"} -{"key": "BAC009S0764W0207", "wav": "./aishell/wav/test/S0764/BAC009S0764W0207.wav", "txt": "积极防范和妥善应对市场价格异常波动"} -{"key": "BAC009S0764W0208", "wav": "./aishell/wav/test/S0764/BAC009S0764W0208.wav", "txt": "维护正常的市场价格秩序"} -{"key": "BAC009S0764W0209", "wav": "./aishell/wav/test/S0764/BAC009S0764W0209.wav", "txt": "严厉打击春运期间违规上调票价价外收费等违法行为"} -{"key": "BAC009S0764W0210", "wav": "./aishell/wav/test/S0764/BAC009S0764W0210.wav", "txt": "切实降低农产品流通成本"} -{"key": "BAC009S0764W0211", "wav": "./aishell/wav/test/S0764/BAC009S0764W0211.wav", "txt": "要加强节日期间旅游市场价格监管"} -{"key": "BAC009S0764W0212", "wav": "./aishell/wav/test/S0764/BAC009S0764W0212.wav", "txt": "以及提供服务中的变相涨价或价格欺诈行为"} -{"key": "BAC009S0764W0213", "wav": "./aishell/wav/test/S0764/BAC009S0764W0213.wav", "txt": "构建良好的旅游市场环境"} -{"key": "BAC009S0764W0214", "wav": "./aishell/wav/test/S0764/BAC009S0764W0214.wav", "txt": "要继续开展商贸零售领域价格秩序整治"} -{"key": "BAC009S0764W0215", "wav": "./aishell/wav/test/S0764/BAC009S0764W0215.wav", "txt": "重点关注大型电子商务经营者的促销行为"} -{"key": "BAC009S0764W0216", "wav": "./aishell/wav/test/S0764/BAC009S0764W0216.wav", "txt": "规范降价打折返券赠送等促销行为"} -{"key": "BAC009S0764W0217", "wav": "./aishell/wav/test/S0764/BAC009S0764W0217.wav", "txt": "营造良好的消费环境"} -{"key": "BAC009S0764W0218", "wav": "./aishell/wav/test/S0764/BAC009S0764W0218.wav", "txt": "发改委多渠道筹集保障房建设资金到"} -{"key": "BAC009S0764W0219", "wav": "./aishell/wav/test/S0764/BAC009S0764W0219.wav", "txt": "要加大保障性安居工程建设资计划落实力度"} -{"key": "BAC009S0764W0220", "wav": "./aishell/wav/test/S0764/BAC009S0764W0220.wav", "txt": "二零一二年中央进一步加大了资金支持力度"} -{"key": "BAC009S0764W0221", "wav": "./aishell/wav/test/S0764/BAC009S0764W0221.wav", "txt": "地方政府也要加大资金筹措力度"} -{"key": "BAC009S0764W0222", "wav": "./aishell/wav/test/S0764/BAC009S0764W0222.wav", "txt": "加强建设资金统筹和组织实施工作"} -{"key": "BAC009S0764W0223", "wav": "./aishell/wav/test/S0764/BAC009S0764W0223.wav", "txt": "确保保障性安居工程年度建设任务的完成"} -{"key": "BAC009S0764W0224", "wav": "./aishell/wav/test/S0764/BAC009S0764W0224.wav", "txt": "充分发挥地方政府融资平台作用"} -{"key": "BAC009S0764W0225", "wav": "./aishell/wav/test/S0764/BAC009S0764W0225.wav", "txt": "鼓励引导社会力量参与建设保障性住房及配套设施"} -{"key": "BAC009S0764W0226", "wav": "./aishell/wav/test/S0764/BAC009S0764W0226.wav", "txt": "尽快将中央补助投资和省级配套资金分解下达到市县"} -{"key": "BAC009S0764W0227", "wav": "./aishell/wav/test/S0764/BAC009S0764W0227.wav", "txt": "二零一二保障房建设"} -{"key": "BAC009S0764W0228", "wav": "./aishell/wav/test/S0764/BAC009S0764W0228.wav", "txt": "七千万套保障房多少钢材"} -{"key": "BAC009S0764W0229", "wav": "./aishell/wav/test/S0764/BAC009S0764W0229.wav", "txt": "如何在五天内筹集到七万元"} -{"key": "BAC009S0764W0230", "wav": "./aishell/wav/test/S0764/BAC009S0764W0230.wav", "txt": "各地保障房建设的套数"} -{"key": "BAC009S0764W0231", "wav": "./aishell/wav/test/S0764/BAC009S0764W0231.wav", "txt": "保障房和水利建设概念股"} -{"key": "BAC009S0764W0232", "wav": "./aishell/wav/test/S0764/BAC009S0764W0232.wav", "txt": "发改委将订制战略避免境外投资恶性竞争到"} -{"key": "BAC009S0764W0233", "wav": "./aishell/wav/test/S0764/BAC009S0764W0233.wav", "txt": "新京报讯记者钟晶晶发改委昨日表示"} -{"key": "BAC009S0764W0234", "wav": "./aishell/wav/test/S0764/BAC009S0764W0234.wav", "txt": "政府将制订境外投资总体战略"} -{"key": "BAC009S0764W0235", "wav": 
"./aishell/wav/test/S0764/BAC009S0764W0235.wav", "txt": "避免中国企业境外恶性竞争"} -{"key": "BAC009S0764W0236", "wav": "./aishell/wav/test/S0764/BAC009S0764W0236.wav", "txt": "并鼓励企业在境外上市"} -{"key": "BAC009S0764W0237", "wav": "./aishell/wav/test/S0764/BAC009S0764W0237.wav", "txt": "加强海外信息监测为企业提供对外投资指导"} -{"key": "BAC009S0764W0238", "wav": "./aishell/wav/test/S0764/BAC009S0764W0238.wav", "txt": "形成一批具有国际竞争力的中国企业"} -{"key": "BAC009S0764W0239", "wav": "./aishell/wav/test/S0764/BAC009S0764W0239.wav", "txt": "十一五期间我国累计境外投资七千亿美元"} -{"key": "BAC009S0764W0240", "wav": "./aishell/wav/test/S0764/BAC009S0764W0240.wav", "txt": "年均增速百分之七"} -{"key": "BAC009S0764W0242", "wav": "./aishell/wav/test/S0764/BAC009S0764W0242.wav", "txt": "单项投资规模日益增大"} -{"key": "BAC009S0764W0243", "wav": "./aishell/wav/test/S0764/BAC009S0764W0243.wav", "txt": "几个亿美元的项目不断出现"} -{"key": "BAC009S0764W0244", "wav": "./aishell/wav/test/S0764/BAC009S0764W0244.wav", "txt": "规划对十二五的投资规模未做预测"} -{"key": "BAC009S0764W0245", "wav": "./aishell/wav/test/S0764/BAC009S0764W0245.wav", "txt": "但在鼓励企业走出去方面释出多个信号"} -{"key": "BAC009S0764W0246", "wav": "./aishell/wav/test/S0764/BAC009S0764W0246.wav", "txt": "鼓励传统纺织家电汽车等一般制造业外移"} -{"key": "BAC009S0764W0247", "wav": "./aishell/wav/test/S0764/BAC009S0764W0247.wav", "txt": "鼓励商业银行去境外开设分支机构"} -{"key": "BAC009S0764W0248", "wav": "./aishell/wav/test/S0764/BAC009S0764W0248.wav", "txt": "政府将完善境外投资统计制度"} -{"key": "BAC009S0764W0249", "wav": "./aishell/wav/test/S0764/BAC009S0764W0249.wav", "txt": "实行全口径统计和动态监测"} -{"key": "BAC009S0764W0250", "wav": "./aishell/wav/test/S0764/BAC009S0764W0250.wav", "txt": "确保境外企业和人员安全"} -{"key": "BAC009S0764W0251", "wav": "./aishell/wav/test/S0764/BAC009S0764W0251.wav", "txt": "但目前还存在服务架构不完善"} -{"key": "BAC009S0764W0252", "wav": "./aishell/wav/test/S0764/BAC009S0764W0252.wav", "txt": "缺乏对外投资长远规划等问题"} -{"key": "BAC009S0764W0253", "wav": "./aishell/wav/test/S0764/BAC009S0764W0253.wav", "txt": "可控是病毒武器最基本的要求"} -{"key": "BAC009S0764W0254", "wav": "./aishell/wav/test/S0764/BAC009S0764W0254.wav", "txt": "它必须尽量做到只针对敌对国家的计算机和网络"} -{"key": "BAC009S0764W0255", "wav": "./aishell/wav/test/S0764/BAC009S0764W0255.wav", "txt": "不能波及和影响其他无关国家甚至本国"} -{"key": "BAC009S0764W0256", "wav": "./aishell/wav/test/S0764/BAC009S0764W0256.wav", "txt": "具有精确的目标定位和识别能力"} -{"key": "BAC009S0764W0257", "wav": "./aishell/wav/test/S0764/BAC009S0764W0257.wav", "txt": "一旦战事结束或出于特殊需要可以实现自毁"} -{"key": "BAC009S0764W0258", "wav": "./aishell/wav/test/S0764/BAC009S0764W0258.wav", "txt": "病毒武器的传染性超强"} -{"key": "BAC009S0764W0259", "wav": "./aishell/wav/test/S0764/BAC009S0764W0259.wav", "txt": "它可以跨硬件平台传染"} -{"key": "BAC009S0764W0260", "wav": "./aishell/wav/test/S0764/BAC009S0764W0260.wav", "txt": "除了普通计算机以外"} -{"key": "BAC009S0764W0261", "wav": "./aishell/wav/test/S0764/BAC009S0764W0261.wav", "txt": "病毒武器的隐蔽性极佳"} -{"key": "BAC009S0764W0262", "wav": "./aishell/wav/test/S0764/BAC009S0764W0262.wav", "txt": "可以实现在敌国网络中的长期潜伏"} -{"key": "BAC009S0764W0263", "wav": "./aishell/wav/test/S0764/BAC009S0764W0263.wav", "txt": "是威力巨大的定时炸弹"} -{"key": "BAC009S0764W0264", "wav": "./aishell/wav/test/S0764/BAC009S0764W0264.wav", "txt": "用电脑进行战争比用核武器还有效"} -{"key": "BAC009S0764W0265", "wav": "./aishell/wav/test/S0764/BAC009S0764W0265.wav", "txt": "核武器并不能征服类似美国这样的国家"} -{"key": "BAC009S0764W0266", "wav": "./aishell/wav/test/S0764/BAC009S0764W0266.wav", "txt": "利用电脑病毒却可以在一秒钟内从银行盗走过亿美元"} -{"key": "BAC009S0764W0267", "wav": "./aishell/wav/test/S0764/BAC009S0764W0267.wav", "txt": "足够使美国失去战争基础因此彻底失败"} -{"key": "BAC009S0764W0268", "wav": 
"./aishell/wav/test/S0764/BAC009S0764W0268.wav", "txt": "但是病毒武器的出现"} -{"key": "BAC009S0764W0269", "wav": "./aishell/wav/test/S0764/BAC009S0764W0269.wav", "txt": "预示着未来战争模样将完全改变"} -{"key": "BAC009S0764W0270", "wav": "./aishell/wav/test/S0764/BAC009S0764W0270.wav", "txt": "病毒武器被认为是目前最具有代表性的网络武器"} -{"key": "BAC009S0764W0271", "wav": "./aishell/wav/test/S0764/BAC009S0764W0271.wav", "txt": "美国芯片行业兴起并购热潮搜狐科技"} -{"key": "BAC009S0764W0272", "wav": "./aishell/wav/test/S0764/BAC009S0764W0272.wav", "txt": "反映了芯片行业出现整合热潮"} -{"key": "BAC009S0764W0273", "wav": "./aishell/wav/test/S0764/BAC009S0764W0273.wav", "txt": "英特尔是世界头号芯片制造商"} -{"key": "BAC009S0764W0274", "wav": "./aishell/wav/test/S0764/BAC009S0764W0274.wav", "txt": "此次以一百六十七亿美元收购拓朗"} -{"key": "BAC009S0764W0275", "wav": "./aishell/wav/test/S0764/BAC009S0764W0275.wav", "txt": "将创下该公司成立四七年来最大收购交易的记录"} -{"key": "BAC009S0764W0276", "wav": "./aishell/wav/test/S0764/BAC009S0764W0276.wav", "txt": "正在寻求扩大移动市场份额"} -{"key": "BAC009S0764W0277", "wav": "./aishell/wav/test/S0764/BAC009S0764W0277.wav", "txt": "拓朗的主打产品是现场可编程门阵列芯片"} -{"key": "BAC009S0764W0278", "wav": "./aishell/wav/test/S0764/BAC009S0764W0278.wav", "txt": "可供客户为特定任务重新编程"} -{"key": "BAC009S0764W0279", "wav": "./aishell/wav/test/S0764/BAC009S0764W0279.wav", "txt": "应用于汽车医疗等行业"} -{"key": "BAC009S0764W0280", "wav": "./aishell/wav/test/S0764/BAC009S0764W0280.wav", "txt": "英特尔首席执行官布赖恩克尔扎尼奇在一份声明中说"} -{"key": "BAC009S0764W0281", "wav": "./aishell/wav/test/S0764/BAC009S0764W0281.wav", "txt": "合并拓朗之后将推出新的产品"} -{"key": "BAC009S0764W0282", "wav": "./aishell/wav/test/S0764/BAC009S0764W0282.wav", "txt": "满足数据中心和物联网细分市场的用户需求"} -{"key": "BAC009S0764W0283", "wav": "./aishell/wav/test/S0764/BAC009S0764W0283.wav", "txt": "形成高度定制化的集成产品"} -{"key": "BAC009S0764W0284", "wav": "./aishell/wav/test/S0764/BAC009S0764W0284.wav", "txt": "微芯片科技公司表示"} -{"key": "BAC009S0764W0285", "wav": "./aishell/wav/test/S0764/BAC009S0764W0285.wav", "txt": "两家公司是联网汽车的主要芯片供应商"} -{"key": "BAC009S0764W0287", "wav": "./aishell/wav/test/S0764/BAC009S0764W0287.wav", "txt": "今年芯片行业并购交易额在八百亿美元以上"} -{"key": "BAC009S0764W0288", "wav": "./aishell/wav/test/S0764/BAC009S0764W0288.wav", "txt": "半导体行业的大公司正在寻求通过并购"} -{"key": "BAC009S0764W0289", "wav": "./aishell/wav/test/S0764/BAC009S0764W0289.wav", "txt": "扩大它们在新的芯片市场的份额"} -{"key": "BAC009S0764W0290", "wav": "./aishell/wav/test/S0764/BAC009S0764W0290.wav", "txt": "随着个人计算机芯片的需求放慢"} -{"key": "BAC009S0764W0291", "wav": "./aishell/wav/test/S0764/BAC009S0764W0291.wav", "txt": "英特尔需要找到新的增长点"} -{"key": "BAC009S0764W0292", "wav": "./aishell/wav/test/S0764/BAC009S0764W0292.wav", "txt": "高德纳咨询公司分析师马克黄说"} -{"key": "BAC009S0764W0293", "wav": "./aishell/wav/test/S0764/BAC009S0764W0293.wav", "txt": "如今则猛增到一两亿美元"} -{"key": "BAC009S0764W0294", "wav": "./aishell/wav/test/S0764/BAC009S0764W0294.wav", "txt": "解决小小芯片上的连线和物理问题需要大量昂贵设备"} -{"key": "BAC009S0764W0295", "wav": "./aishell/wav/test/S0764/BAC009S0764W0295.wav", "txt": "芯片行业的并购风体现了整个科技行业的一种趋势"} -{"key": "BAC009S0764W0296", "wav": "./aishell/wav/test/S0764/BAC009S0764W0296.wav", "txt": "即一些财大气粗的科技公司自己不创新"} -{"key": "BAC009S0764W0297", "wav": "./aishell/wav/test/S0764/BAC009S0764W0297.wav", "txt": "而是寻求收购规模较小更为灵活的公司"} -{"key": "BAC009S0764W0298", "wav": "./aishell/wav/test/S0764/BAC009S0764W0298.wav", "txt": "反映了芯片行业出现整合热"} -{"key": "BAC009S0764W0299", "wav": "./aishell/wav/test/S0764/BAC009S0764W0299.wav", "txt": "因为难以忍受股价长期被低估"} -{"key": "BAC009S0764W0300", "wav": "./aishell/wav/test/S0764/BAC009S0764W0300.wav", "txt": "中国游戏公司纷纷忙着退市"} -{"key": "BAC009S0764W0301", 
"wav": "./aishell/wav/test/S0764/BAC009S0764W0301.wav", "txt": "巨人网络盛大游戏以及完美世界均已选择了私有化"} -{"key": "BAC009S0764W0302", "wav": "./aishell/wav/test/S0764/BAC009S0764W0302.wav", "txt": "是这类公司在美国市场估值长期受低估"} -{"key": "BAC009S0764W0303", "wav": "./aishell/wav/test/S0764/BAC009S0764W0303.wav", "txt": "北京商报讯记者王晔君日前"} -{"key": "BAC009S0764W0304", "wav": "./aishell/wav/test/S0764/BAC009S0764W0304.wav", "txt": "裁员二千人是由于销售模式发生改变"} -{"key": "BAC009S0764W0305", "wav": "./aishell/wav/test/S0764/BAC009S0764W0305.wav", "txt": "公司已将原有的直销模式改为经销模式"} -{"key": "BAC009S0764W0306", "wav": "./aishell/wav/test/S0764/BAC009S0764W0306.wav", "txt": "因此需要的人员大幅下降"} -{"key": "BAC009S0764W0307", "wav": "./aishell/wav/test/S0764/BAC009S0764W0307.wav", "txt": "由于去年底制定的销售战略是直销模式"} -{"key": "BAC009S0764W0308", "wav": "./aishell/wav/test/S0764/BAC009S0764W0308.wav", "txt": "所以今年上半年公司在全国各地的员工人数大幅增加"} -{"key": "BAC009S0764W0309", "wav": "./aishell/wav/test/S0764/BAC009S0764W0309.wav", "txt": "由于近期销售模式的调整"} -{"key": "BAC009S0764W0310", "wav": "./aishell/wav/test/S0764/BAC009S0764W0310.wav", "txt": "即由直销模式转变为经销模式"} -{"key": "BAC009S0764W0311", "wav": "./aishell/wav/test/S0764/BAC009S0764W0311.wav", "txt": "公司将更多地依靠经销商进行销售"} -{"key": "BAC009S0764W0312", "wav": "./aishell/wav/test/S0764/BAC009S0764W0312.wav", "txt": "正是由于销售模式的改变"} -{"key": "BAC009S0764W0313", "wav": "./aishell/wav/test/S0764/BAC009S0764W0313.wav", "txt": "汉能直接销售人员大幅度减少"} -{"key": "BAC009S0764W0314", "wav": "./aishell/wav/test/S0764/BAC009S0764W0314.wav", "txt": "汉能发布中期财报披露"} -{"key": "BAC009S0764W0315", "wav": "./aishell/wav/test/S0764/BAC009S0764W0315.wav", "txt": "上半年营业收入二十一点零八亿港元"} -{"key": "BAC009S0764W0316", "wav": "./aishell/wav/test/S0764/BAC009S0764W0316.wav", "txt": "同比减少百分之三十四毛利十四点六一亿港元"} -{"key": "BAC009S0764W0317", "wav": "./aishell/wav/test/S0764/BAC009S0764W0317.wav", "txt": "同比减少约百分之四十六亏损额为五百九十三二万港元"} -{"key": "BAC009S0764W0318", "wav": "./aishell/wav/test/S0764/BAC009S0764W0318.wav", "txt": "而去年同期盈利十六点七六亿港元"} -{"key": "BAC009S0764W0319", "wav": "./aishell/wav/test/S0764/BAC009S0764W0319.wav", "txt": "是自二零一一年借壳上市以来首次出现亏损"} -{"key": "BAC009S0764W0320", "wav": "./aishell/wav/test/S0764/BAC009S0764W0320.wav", "txt": "同时公布了重组计划"} -{"key": "BAC009S0764W0321", "wav": "./aishell/wav/test/S0764/BAC009S0764W0321.wav", "txt": "撤销旗下高端产业集团和产品开发集团"} -{"key": "BAC009S0764W0322", "wav": "./aishell/wav/test/S0764/BAC009S0764W0322.wav", "txt": "并将从总部事业部及各区域公司共裁员二千人"} -{"key": "BAC009S0764W0323", "wav": "./aishell/wav/test/S0764/BAC009S0764W0323.wav", "txt": "汉能曾计划今年底前将这一数字提高到三百家"} -{"key": "BAC009S0764W0324", "wav": "./aishell/wav/test/S0764/BAC009S0764W0324.wav", "txt": "汉能上半年业绩出现大幅下滑"} -{"key": "BAC009S0764W0325", "wav": "./aishell/wav/test/S0764/BAC009S0764W0325.wav", "txt": "当务之急是扭转业绩"} -{"key": "BAC009S0764W0326", "wav": "./aishell/wav/test/S0764/BAC009S0764W0326.wav", "txt": "而由直销模式改为经销模式"} -{"key": "BAC009S0764W0327", "wav": "./aishell/wav/test/S0764/BAC009S0764W0327.wav", "txt": "可以缩减很多人力成本"} -{"key": "BAC009S0764W0328", "wav": "./aishell/wav/test/S0764/BAC009S0764W0328.wav", "txt": "有利于降低公司运营成本"} -{"key": "BAC009S0764W0329", "wav": "./aishell/wav/test/S0764/BAC009S0764W0329.wav", "txt": "但是由于直销改为经销"} -{"key": "BAC009S0764W0330", "wav": "./aishell/wav/test/S0764/BAC009S0764W0330.wav", "txt": "汉能对自身产品的议价能力推广力都将减弱"} -{"key": "BAC009S0764W0331", "wav": "./aishell/wav/test/S0764/BAC009S0764W0331.wav", "txt": "公司已经暂停或终止部分关联交易项目"} -{"key": "BAC009S0764W0332", "wav": "./aishell/wav/test/S0764/BAC009S0764W0332.wav", "txt": "已经花费了一定的资源和成本"} -{"key": "BAC009S0764W0333", 
"wav": "./aishell/wav/test/S0764/BAC009S0764W0333.wav", "txt": "因此暂停或终止这些项目"} -{"key": "BAC009S0764W0334", "wav": "./aishell/wav/test/S0764/BAC009S0764W0334.wav", "txt": "对本公司的上半年业绩带来了负面影响"} -{"key": "BAC009S0764W0335", "wav": "./aishell/wav/test/S0764/BAC009S0764W0335.wav", "txt": "北京商报讯记者王晔君日前"} -{"key": "BAC009S0764W0336", "wav": "./aishell/wav/test/S0764/BAC009S0764W0336.wav", "txt": "裁员两千人是由"} -{"key": "BAC009S0764W0338", "wav": "./aishell/wav/test/S0764/BAC009S0764W0338.wav", "txt": "他们在训练和比赛过程之中的速度也会逐渐慢下来"} -{"key": "BAC009S0764W0339", "wav": "./aishell/wav/test/S0764/BAC009S0764W0339.wav", "txt": "但是根据国外科学家最新的研究结果"} -{"key": "BAC009S0764W0340", "wav": "./aishell/wav/test/S0764/BAC009S0764W0340.wav", "txt": "通过对脚踝和小腿等部位的强化"} -{"key": "BAC009S0764W0341", "wav": "./aishell/wav/test/S0764/BAC009S0764W0341.wav", "txt": "可以有效的抵消年龄所带来的速度劣势"} -{"key": "BAC009S0764W0342", "wav": "./aishell/wav/test/S0764/BAC009S0764W0342.wav", "txt": "使上年纪的跑者也能保持较快的速度"} -{"key": "BAC009S0764W0343", "wav": "./aishell/wav/test/S0764/BAC009S0764W0343.wav", "txt": "美国东卡罗莱纳大学和维克森林大学的研究者认为"} -{"key": "BAC009S0764W0344", "wav": "./aishell/wav/test/S0764/BAC009S0764W0344.wav", "txt": "脚踝和小腿的能力变弱"} -{"key": "BAC009S0764W0345", "wav": "./aishell/wav/test/S0764/BAC009S0764W0345.wav", "txt": "如果能够加强这方面的锻炼"} -{"key": "BAC009S0764W0346", "wav": "./aishell/wav/test/S0764/BAC009S0764W0346.wav", "txt": "他们会拥有较快的速度"} -{"key": "BAC009S0764W0347", "wav": "./aishell/wav/test/S0764/BAC009S0764W0347.wav", "txt": "研究者们选取了一些年龄大的跑者作为研究对象"} -{"key": "BAC009S0764W0348", "wav": "./aishell/wav/test/S0764/BAC009S0764W0348.wav", "txt": "并让年轻跑者作为参照"} -{"key": "BAC009S0764W0349", "wav": "./aishell/wav/test/S0764/BAC009S0764W0349.wav", "txt": "他们的步频大致相同"} -{"key": "BAC009S0764W0350", "wav": "./aishell/wav/test/S0764/BAC009S0764W0350.wav", "txt": "年龄大跑者的步幅明显短于年轻人"} -{"key": "BAC009S0764W0351", "wav": "./aishell/wav/test/S0764/BAC009S0764W0351.wav", "txt": "使得他们的速度变慢了"} -{"key": "BAC009S0764W0352", "wav": "./aishell/wav/test/S0764/BAC009S0764W0352.wav", "txt": "研究者们选取了十九位跑者"} -{"key": "BAC009S0764W0353", "wav": "./aishell/wav/test/S0764/BAC009S0764W0353.wav", "txt": "年龄从二十三岁到五十九岁"} -{"key": "BAC009S0764W0354", "wav": "./aishell/wav/test/S0764/BAC009S0764W0354.wav", "txt": "身体质量指数平均为二十三点四"} -{"key": "BAC009S0764W0355", "wav": "./aishell/wav/test/S0764/BAC009S0764W0355.wav", "txt": "身材偏瘦而且比较健康"} -{"key": "BAC009S0764W0356", "wav": "./aishell/wav/test/S0764/BAC009S0764W0356.wav", "txt": "跑者从二十多岁到五十九岁"} -{"key": "BAC009S0764W0357", "wav": "./aishell/wav/test/S0764/BAC009S0764W0357.wav", "txt": "步幅长度和跑步速度大约下降了百分之二十"} -{"key": "BAC009S0764W0358", "wav": "./aishell/wav/test/S0764/BAC009S0764W0358.wav", "txt": "脚踝的能力损失了大约百分之四十八"} -{"key": "BAC009S0764W0359", "wav": "./aishell/wav/test/S0764/BAC009S0764W0359.wav", "txt": "按照平时训练的速度进行跑步"} -{"key": "BAC009S0764W0360", "wav": "./aishell/wav/test/S0764/BAC009S0764W0360.wav", "txt": "二十岁的跑者平均每英里耗时八分十八秒"} -{"key": "BAC009S0764W0361", "wav": "./aishell/wav/test/S0764/BAC009S0764W0361.wav", "txt": "而六十岁的跑者每英里耗时十分十八秒"} -{"key": "BAC009S0764W0362", "wav": "./aishell/wav/test/S0764/BAC009S0764W0362.wav", "txt": "已经有过不少关于这方面的研究"} -{"key": "BAC009S0764W0363", "wav": "./aishell/wav/test/S0764/BAC009S0764W0363.wav", "txt": "但是研究对象都是年轻跑者和年老跑者"} -{"key": "BAC009S0764W0364", "wav": "./aishell/wav/test/S0764/BAC009S0764W0364.wav", "txt": "年龄段的复盖范围比较窄"} -{"key": "BAC009S0764W0365", "wav": "./aishell/wav/test/S0764/BAC009S0764W0365.wav", "txt": "最令德维塔感到不可思议的是"} -{"key": "BAC009S0764W0366", "wav": 
"./aishell/wav/test/S0764/BAC009S0764W0366.wav", "txt": "跑者们随着年龄的增长"} -{"key": "BAC009S0764W0367", "wav": "./aishell/wav/test/S0764/BAC009S0764W0367.wav", "txt": "速度呈现出直线下降"} -{"key": "BAC009S0764W0368", "wav": "./aishell/wav/test/S0764/BAC009S0764W0368.wav", "txt": "速度下降的更加明显"} -{"key": "BAC009S0764W0369", "wav": "./aishell/wav/test/S0764/BAC009S0764W0369.wav", "txt": "很多六七十岁的跑者看到这个研究结果时"} -{"key": "BAC009S0764W0370", "wav": "./aishell/wav/test/S0764/BAC009S0764W0370.wav", "txt": "意思是他们比较认同这个结果"} -{"key": "BAC009S0764W0371", "wav": "./aishell/wav/test/S0764/BAC009S0764W0371.wav", "txt": "研究者们希望年龄大的跑者能够注意脚踝的锻炼"} -{"key": "BAC009S0764W0372", "wav": "./aishell/wav/test/S0764/BAC009S0764W0372.wav", "txt": "但德维塔觉得归根到底还是小腿肌肉的问题"} -{"key": "BAC009S0764W0373", "wav": "./aishell/wav/test/S0764/BAC009S0764W0373.wav", "txt": "尤其是比目鱼肌和腓肠肌"} -{"key": "BAC009S0764W0374", "wav": "./aishell/wav/test/S0764/BAC009S0764W0374.wav", "txt": "这才是产生跑步力量的根源"} -{"key": "BAC009S0764W0375", "wav": "./aishell/wav/test/S0764/BAC009S0764W0375.wav", "txt": "这两种方式的结合能够有效锻炼小腿肌肉"} -{"key": "BAC009S0764W0376", "wav": "./aishell/wav/test/S0764/BAC009S0764W0376.wav", "txt": "对于高年龄跑者来说"} -{"key": "BAC009S0764W0377", "wav": "./aishell/wav/test/S0764/BAC009S0764W0377.wav", "txt": "开始一项新的锻炼方式具有一定的风险性"} -{"key": "BAC009S0764W0378", "wav": "./aishell/wav/test/S0764/BAC009S0764W0378.wav", "txt": "想通过训练提升脚踝和小腿的能力"} -{"key": "BAC009S0764W0379", "wav": "./aishell/wav/test/S0764/BAC009S0764W0379.wav", "txt": "这些常年坚持跑步的人身体质量指数偏低"} -{"key": "BAC009S0764W0380", "wav": "./aishell/wav/test/S0764/BAC009S0764W0380.wav", "txt": "长期跑步可能是一种不需要药物来保持身材的有效方式"} -{"key": "BAC009S0764W0381", "wav": "./aishell/wav/test/S0764/BAC009S0764W0381.wav", "txt": "在二零二二年冬季奥运会的竞选当中"} -{"key": "BAC009S0764W0382", "wav": "./aishell/wav/test/S0764/BAC009S0764W0382.wav", "txt": "北京和张家口最终击败了强大的对手阿拉木图"} -{"key": "BAC009S0764W0383", "wav": "./aishell/wav/test/S0764/BAC009S0764W0383.wav", "txt": "顺利获得了冬奥会的主办权"} -{"key": "BAC009S0764W0384", "wav": "./aishell/wav/test/S0764/BAC009S0764W0384.wav", "txt": "这也是这项冰雪顶级盛事首次来到中国"} -{"key": "BAC009S0764W0385", "wav": "./aishell/wav/test/S0764/BAC009S0764W0385.wav", "txt": "在此次申办冬奥会的过程中"} -{"key": "BAC009S0764W0386", "wav": "./aishell/wav/test/S0764/BAC009S0764W0386.wav", "txt": "我们看到了自身强大的综合实力"} -{"key": "BAC009S0764W0387", "wav": "./aishell/wav/test/S0764/BAC009S0764W0387.wav", "txt": "也看到了在冰雪运动综合实力上的欠缺和不足"} -{"key": "BAC009S0764W0388", "wav": "./aishell/wav/test/S0764/BAC009S0764W0388.wav", "txt": "经历过夏奥会的沉淀"} -{"key": "BAC009S0764W0389", "wav": "./aishell/wav/test/S0764/BAC009S0764W0389.wav", "txt": "加上近几年承办诸多国际性赛事的经验积累"} -{"key": "BAC009S0764W0390", "wav": "./aishell/wav/test/S0764/BAC009S0764W0390.wav", "txt": "在这场亚洲国家锁定胜局的申办博弈中"} -{"key": "BAC009S0764W0391", "wav": "./aishell/wav/test/S0764/BAC009S0764W0391.wav", "txt": "北申办此次冬奥会的价值要远远超过承办本身"} -{"key": "BAC009S0764W0392", "wav": "./aishell/wav/test/S0764/BAC009S0764W0392.wav", "txt": "对于北京申办冬奥会的最终结果"} -{"key": "BAC009S0764W0393", "wav": "./aishell/wav/test/S0764/BAC009S0764W0393.wav", "txt": "我们也应该抱着更加长远和开阔的视角来看待"} -{"key": "BAC009S0764W0394", "wav": "./aishell/wav/test/S0764/BAC009S0764W0394.wav", "txt": "北京申办冬奥强大实力成获胜武器"} -{"key": "BAC009S0764W0395", "wav": "./aishell/wav/test/S0764/BAC009S0764W0395.wav", "txt": "此次北京联手张家口申办冬奥会"} -{"key": "BAC009S0764W0396", "wav": "./aishell/wav/test/S0764/BAC009S0764W0396.wav", "txt": "在与阿拉木图的直接博弈中"} -{"key": "BAC009S0764W0397", "wav": "./aishell/wav/test/S0764/BAC009S0764W0397.wav", "txt": "财政能力和硬件设施的优势是我们最终取胜的关键原因"} -{"key": 
"BAC009S0764W0398", "wav": "./aishell/wav/test/S0764/BAC009S0764W0398.wav", "txt": "而二零零八年举办夏季奥运会所留下的宝贵遗产"} -{"key": "BAC009S0764W0399", "wav": "./aishell/wav/test/S0764/BAC009S0764W0399.wav", "txt": "也是最终打动国家奥运委会评审团的法宝"} -{"key": "BAC009S0764W0400", "wav": "./aishell/wav/test/S0764/BAC009S0764W0400.wav", "txt": "从经济实力和基础设施建设上看"} -{"key": "BAC009S0764W0401", "wav": "./aishell/wav/test/S0764/BAC009S0764W0401.wav", "txt": "北京和张家口要占据着相当明显的优势"} -{"key": "BAC009S0764W0402", "wav": "./aishell/wav/test/S0764/BAC009S0764W0402.wav", "txt": "北京和张家口两地的生产总值是二万二千七百三十点八亿元"} -{"key": "BAC009S0764W0403", "wav": "./aishell/wav/test/S0764/BAC009S0764W0403.wav", "txt": "而阿拉木图仅为四百亿美元"} -{"key": "BAC009S0764W0404", "wav": "./aishell/wav/test/S0764/BAC009S0764W0404.wav", "txt": "影片将在二零一五年一月在慕尼黑正式开机"} -{"key": "BAC009S0764W0405", "wav": "./aishell/wav/test/S0764/BAC009S0764W0405.wav", "txt": "好莱坞当红明星之前曾被盛传将扮演斯诺登"} -{"key": "BAC009S0764W0406", "wav": "./aishell/wav/test/S0764/BAC009S0764W0406.wav", "txt": "好莱坞当红明星之前曾被盛传将扮演斯诺登"} -{"key": "BAC009S0764W0407", "wav": "./aishell/wav/test/S0764/BAC009S0764W0407.wav", "txt": "他确实拿下了这个角色"} -{"key": "BAC009S0764W0408", "wav": "./aishell/wav/test/S0764/BAC009S0764W0408.wav", "txt": "对男友有什么条件"} -{"key": "BAC009S0764W0409", "wav": "./aishell/wav/test/S0764/BAC009S0764W0409.wav", "txt": "她表示最重要的就是诚恳"} -{"key": "BAC009S0764W0410", "wav": "./aishell/wav/test/S0764/BAC009S0764W0410.wav", "txt": "对于姊弟恋也不排斥"} -{"key": "BAC009S0764W0411", "wav": "./aishell/wav/test/S0764/BAC009S0764W0411.wav", "txt": "搜狐娱乐讯七月十日消息"} -{"key": "BAC009S0764W0412", "wav": "./aishell/wav/test/S0764/BAC009S0764W0412.wav", "txt": "据台湾媒体报道"} -{"key": "BAC009S0764W0413", "wav": "./aishell/wav/test/S0764/BAC009S0764W0413.wav", "txt": "许玮甯最近到法国工作"} -{"key": "BAC009S0764W0414", "wav": "./aishell/wav/test/S0764/BAC009S0764W0414.wav", "txt": "仍在个人社群网站频繁更新动态"} -{"key": "BAC009S0764W0415", "wav": "./aishell/wav/test/S0764/BAC009S0764W0415.wav", "txt": "甚至被外界揣测是因为和阮经天分手后所刺"} -{"key": "BAC009S0764W0416", "wav": "./aishell/wav/test/S0764/BAC009S0764W0416.wav", "txt": "她近日终于在受访时松口公开正解"} -{"key": "BAC009S0764W0417", "wav": "./aishell/wav/test/S0764/BAC009S0764W0417.wav", "txt": "背后意义竟只是不要忘记自己从哪里来"} -{"key": "BAC009S0764W0418", "wav": "./aishell/wav/test/S0764/BAC009S0764W0418.wav", "txt": "搜狐娱乐讯据台湾媒体报道"} -{"key": "BAC009S0764W0419", "wav": "./aishell/wav/test/S0764/BAC009S0764W0419.wav", "txt": "阮经天和许玮甯交往八年屡传婚讯"} -{"key": "BAC009S0764W0420", "wav": "./aishell/wav/test/S0764/BAC009S0764W0420.wav", "txt": "今年三月底惊爆分手"} -{"key": "BAC009S0764W0421", "wav": "./aishell/wav/test/S0764/BAC009S0764W0421.wav", "txt": "当时女方坦承已分居"} -{"key": "BAC009S0764W0422", "wav": "./aishell/wav/test/S0764/BAC009S0764W0422.wav", "txt": "但小天坚持玮甯依然是我的女人"} -{"key": "BAC009S0764W0423", "wav": "./aishell/wav/test/S0764/BAC009S0764W0423.wav", "txt": "有网友日前目击他俩在大稻埕分食炒饭"} -{"key": "BAC009S0764W0424", "wav": "./aishell/wav/test/S0764/BAC009S0764W0424.wav", "txt": "昨天她出席保养品活动"} -{"key": "BAC009S0764W0425", "wav": "./aishell/wav/test/S0764/BAC009S0764W0425.wav", "txt": "松口仍有联络"} -{"key": "BAC009S0764W0426", "wav": "./aishell/wav/test/S0764/BAC009S0764W0426.wav", "txt": "但称自己单身"} -{"key": "BAC009S0764W0427", "wav": "./aishell/wav/test/S0764/BAC009S0764W0427.wav", "txt": "恰巧昨日记者碰见阮经天出门倒垃圾"} -{"key": "BAC009S0764W0428", "wav": "./aishell/wav/test/S0764/BAC009S0764W0428.wav", "txt": "对许玮甯单身说语气落寞表示我没有什么看法"} -{"key": "BAC009S0764W0429", "wav": "./aishell/wav/test/S0764/BAC009S0764W0429.wav", "txt": "搜狐娱乐讯男方和小三还藕断丝连"} -{"key": "BAC009S0764W0430", "wav": 
"./aishell/wav/test/S0764/BAC009S0764W0430.wav", "txt": "因而痛斩情丝她除了拥有模特儿火辣身材"} -{"key": "BAC009S0764W0431", "wav": "./aishell/wav/test/S0764/BAC009S0764W0431.wav", "txt": "快报讯记者赵丹丹快递实名制时代终于到来了"} -{"key": "BAC009S0764W0432", "wav": "./aishell/wav/test/S0764/BAC009S0764W0432.wav", "txt": "按照国家邮政总局统一部署"} -{"key": "BAC009S0764W0433", "wav": "./aishell/wav/test/S0764/BAC009S0764W0433.wav", "txt": "从下月起全面实施快递实名制登记"} -{"key": "BAC009S0764W0434", "wav": "./aishell/wav/test/S0764/BAC009S0764W0434.wav", "txt": "现代快报记者从省邮政管理局了解到"} -{"key": "BAC009S0764W0435", "wav": "./aishell/wav/test/S0764/BAC009S0764W0435.wav", "txt": "江苏快递实名制登记动真格"} -{"key": "BAC009S0764W0436", "wav": "./aishell/wav/test/S0764/BAC009S0764W0436.wav", "txt": "本周内动员部署全省九零零多家快递企业按要求执行"} -{"key": "BAC009S0764W0437", "wav": "./aishell/wav/test/S0764/BAC009S0764W0437.wav", "txt": "个人寄快递必须登记有效的身份证件"} -{"key": "BAC009S0764W0438", "wav": "./aishell/wav/test/S0764/BAC009S0764W0438.wav", "txt": "本山传媒回应赵本山将有新作品没听说"} -{"key": "BAC009S0764W0439", "wav": "./aishell/wav/test/S0764/BAC009S0764W0439.wav", "txt": "不仅赢得观众好口碑"} -{"key": "BAC009S0764W0440", "wav": "./aishell/wav/test/S0764/BAC009S0764W0440.wav", "txt": "特别是师父赵本山也公开出面为大鹏点赞"} -{"key": "BAC009S0764W0441", "wav": "./aishell/wav/test/S0764/BAC009S0764W0441.wav", "txt": "本月二八日超级月亮和最圆中秋月喜相逢"} -{"key": "BAC009S0764W0442", "wav": "./aishell/wav/test/S0764/BAC009S0764W0442.wav", "txt": "月亮和地球之间的平均距离仅为三五六八九六万公里"} -{"key": "BAC009S0764W0443", "wav": "./aishell/wav/test/S0764/BAC009S0764W0443.wav", "txt": "月亮看起来会比往常大"} -{"key": "BAC009S0764W0444", "wav": "./aishell/wav/test/S0764/BAC009S0764W0444.wav", "txt": "也就是我们常说的超级月亮"} -{"key": "BAC009S0764W0445", "wav": "./aishell/wav/test/S0764/BAC009S0764W0445.wav", "txt": "这一天还将上演月全食"} -{"key": "BAC009S0764W0446", "wav": "./aishell/wav/test/S0764/BAC009S0764W0446.wav", "txt": "超级月亮碰上月全食"} -{"key": "BAC009S0764W0447", "wav": "./aishell/wav/test/S0764/BAC009S0764W0447.wav", "txt": "错过了这次就要到二零三三年了"} -{"key": "BAC009S0764W0448", "wav": "./aishell/wav/test/S0764/BAC009S0764W0448.wav", "txt": "本月下旬天宇将现五星连线奇观"} -{"key": "BAC009S0764W0449", "wav": "./aishell/wav/test/S0764/BAC009S0764W0449.wav", "txt": "中科院紫金山天文台公布了一零月天象"} -{"key": "BAC009S0764W0450", "wav": "./aishell/wav/test/S0764/BAC009S0764W0450.wav", "txt": "现代快报记者注意到"} -{"key": "BAC009S0764W0451", "wav": "./aishell/wav/test/S0764/BAC009S0764W0451.wav", "txt": "天龙座流星雨猎户座流星雨"} -{"key": "BAC009S0764W0452", "wav": "./aishell/wav/test/S0764/BAC009S0764W0452.wav", "txt": "让一零月的天空有点甜蜜蜜的味道"} -{"key": "BAC009S0764W0453", "wav": "./aishell/wav/test/S0764/BAC009S0764W0453.wav", "txt": "水星金星也将迎来观测良机"} -{"key": "BAC009S0764W0454", "wav": "./aishell/wav/test/S0764/BAC009S0764W0454.wav", "txt": "现代快报记者胡玉梅"} -{"key": "BAC009S0764W0455", "wav": "./aishell/wav/test/S0764/BAC009S0764W0455.wav", "txt": "本月中下旬小行星撞地球"} -{"key": "BAC009S0764W0456", "wav": "./aishell/wav/test/S0764/BAC009S0764W0456.wav", "txt": "专家没有科学依据"} -{"key": "BAC009S0764W0457", "wav": "./aishell/wav/test/S0764/BAC009S0764W0457.wav", "txt": "京华时报讯记者任珊记者从北京市教育考试院获悉"} -{"key": "BAC009S0764W0458", "wav": "./aishell/wav/test/S0764/BAC009S0764W0458.wav", "txt": "高招本科二批今天开始进行征集志愿录取"} -{"key": "BAC009S0764W0459", "wav": "./aishell/wav/test/S0764/BAC009S0764W0459.wav", "txt": "一八一所院校将补录一九四九人"} -{"key": "BAC009S0764W0460", "wav": "./aishell/wav/test/S0764/BAC009S0764W0460.wav", "txt": "朱军系阅兵世家曾参与一九八四年阅兵军乐演奏"} -{"key": "BAC009S0764W0461", "wav": "./aishell/wav/test/S0764/BAC009S0764W0461.wav", "txt": "朱圣祎爆王思聪女朋友被诉法官送达起诉书遇阻"} -{"key": "BAC009S0764W0462", "wav": 
"./aishell/wav/test/S0764/BAC009S0764W0462.wav", "txt": "王思聪将朱圣祎诉至北京朝阳法院"} -{"key": "BAC009S0764W0463", "wav": "./aishell/wav/test/S0764/BAC009S0764W0463.wav", "txt": "要求停止侵权公开道歉赔偿精神损失一元"} -{"key": "BAC009S0764W0464", "wav": "./aishell/wav/test/S0764/BAC009S0764W0464.wav", "txt": "法官送达起诉书副本等应诉材料遇阻"} -{"key": "BAC009S0764W0465", "wav": "./aishell/wav/test/S0764/BAC009S0764W0465.wav", "txt": "朱茵说紫霞仙子谁来演不是我可以决定的"} -{"key": "BAC009S0764W0466", "wav": "./aishell/wav/test/S0764/BAC009S0764W0466.wav", "txt": "资料图片在湖南卫视上周开播的偶像来了中"} -{"key": "BAC009S0764W0467", "wav": "./aishell/wav/test/S0764/BAC009S0764W0467.wav", "txt": "永远的紫霞仙子朱茵的亮相引起粉丝的热捧"} -{"key": "BAC009S0764W0468", "wav": "./aishell/wav/test/S0764/BAC009S0764W0468.wav", "txt": "永远的紫霞仙子朱茵的亮相引起粉丝的热捧"} -{"key": "BAC009S0764W0469", "wav": "./aishell/wav/test/S0764/BAC009S0764W0469.wav", "txt": "来自全球四七个国家和地区的二零零零多名选手参赛"} -{"key": "BAC009S0764W0470", "wav": "./aishell/wav/test/S0764/BAC009S0764W0470.wav", "txt": "机器人服务员现身火锅店顾客直呼女神"} -{"key": "BAC009S0764W0471", "wav": "./aishell/wav/test/S0764/BAC009S0764W0471.wav", "txt": "女神机器人在火锅店内工作"} -{"key": "BAC009S0764W0472", "wav": "./aishell/wav/test/S0764/BAC009S0764W0472.wav", "txt": "机场严查匿打火机过安检放在鞋子里算藏匿"} -{"key": "BAC009S0764W0473", "wav": "./aishell/wav/test/S0764/BAC009S0764W0473.wav", "txt": "本报讯记者杨柳昨天"} -{"key": "BAC009S0764W0474", "wav": "./aishell/wav/test/S0764/BAC009S0764W0474.wav", "txt": "记者从首都机场公安分局航站区派出所获悉"} -{"key": "BAC009S0764W0475", "wav": "./aishell/wav/test/S0764/BAC009S0764W0475.wav", "txt": "首都机场公安分局航站区派出所联合驻场安检人员"} -{"key": "BAC009S0764W0476", "wav": "./aishell/wav/test/S0764/BAC009S0764W0476.wav", "txt": "坚持违法零容忍和高限处理的执法态度"} -{"key": "BAC009S0764W0477", "wav": "./aishell/wav/test/S0764/BAC009S0764W0477.wav", "txt": "严格搜集和固定相关证据"} -{"key": "BAC009S0764W0478", "wav": "./aishell/wav/test/S0764/BAC009S0764W0478.wav", "txt": "近日在违法事实认识清楚法律法规适用明确的基础上"} -{"key": "BAC009S0764W0479", "wav": "./aishell/wav/test/S0764/BAC009S0764W0479.wav", "txt": "依法对一名藏匿打火机过检的旅客进行了行政处罚"} -{"key": "BAC009S0764W0480", "wav": "./aishell/wav/test/S0764/BAC009S0764W0480.wav", "txt": "机场公安加航航班未发生性侵事件"} -{"key": "BAC009S0764W0482", "wav": "./aishell/wav/test/S0764/BAC009S0764W0482.wav", "txt": "网传该航班一名男性旅客对空姐试图性侵导致飞机返航"} -{"key": "BAC009S0764W0483", "wav": "./aishell/wav/test/S0764/BAC009S0764W0483.wav", "txt": "新京报记者从首都国际机场公安分局相关人员处获悉"} -{"key": "BAC009S0764W0484", "wav": "./aishell/wav/test/S0764/BAC009S0764W0484.wav", "txt": "冲突因空姐发餐时餐车碰到了一名旅客"} -{"key": "BAC009S0764W0485", "wav": "./aishell/wav/test/S0764/BAC009S0764W0485.wav", "txt": "双方因语言交流不畅导致纠纷"} -{"key": "BAC009S0764W0486", "wav": "./aishell/wav/test/S0764/BAC009S0764W0486.wav", "txt": "该男子因影响航班正常秩序"} -{"key": "BAC009S0764W0487", "wav": "./aishell/wav/test/S0764/BAC009S0764W0487.wav", "txt": "明星刘晓庆又火了一把"} -{"key": "BAC009S0764W0488", "wav": "./aishell/wav/test/S0764/BAC009S0764W0488.wav", "txt": "她几乎刷遍了各大媒体"} -{"key": "BAC009S0764W0489", "wav": "./aishell/wav/test/S0764/BAC009S0764W0489.wav", "txt": "不是她的戏或是她的八卦"} -{"key": "BAC009S0764W0490", "wav": "./aishell/wav/test/S0764/BAC009S0764W0490.wav", "txt": "而是因为她也中了天价的招"} -{"key": "BAC009S0764W0491", "wav": "./aishell/wav/test/S0764/BAC009S0764W0491.wav", "txt": "机组成功处置深航机上纵火事件获奖二五零万"} -{"key": "BAC009S0764W0492", "wav": "./aishell/wav/test/S0764/BAC009S0764W0492.wav", "txt": "成功处置深航机上纵火事件"} -{"key": "BAC009S0764W0493", "wav": "./aishell/wav/test/S0764/BAC009S0764W0493.wav", "txt": "杀中传女生嫌犯就想找个人发泄"} -{"key": "BAC009S0764W0494", "wav": "./aishell/wav/test/S0764/BAC009S0764W0494.wav", "txt": 
"其室友在微博上所发的寻人启事"} -{"key": "BAC009S0764W0495", "wav": "./aishell/wav/test/S0764/BAC009S0764W0495.wav", "txt": "警方证实周云露遇害"} -{"key": "BAC009S0765W0121", "wav": "./aishell/wav/test/S0765/BAC009S0765W0121.wav", "txt": "一线城市出现日光盘"} -{"key": "BAC009S0765W0122", "wav": "./aishell/wav/test/S0765/BAC009S0765W0122.wav", "txt": "楼市地市交相升温房价会不会再度暴涨"} -{"key": "BAC009S0765W0123", "wav": "./aishell/wav/test/S0765/BAC009S0765W0123.wav", "txt": "经济热点导读专家认为"} -{"key": "BAC009S0765W0124", "wav": "./aishell/wav/test/S0765/BAC009S0765W0124.wav", "txt": "我国房地产市场过去总体偏紧部分地区过紧"} -{"key": "BAC009S0765W0125", "wav": "./aishell/wav/test/S0765/BAC009S0765W0125.wav", "txt": "为了将后辈的婚姻分险隔断"} -{"key": "BAC009S0765W0126", "wav": "./aishell/wav/test/S0765/BAC009S0765W0126.wav", "txt": "将受益人定为直系血亲后代非配偶继承人"} -{"key": "BAC009S0765W0127", "wav": "./aishell/wav/test/S0765/BAC009S0765W0127.wav", "txt": "按公司持有房产计征"} -{"key": "BAC009S0765W0128", "wav": "./aishell/wav/test/S0765/BAC009S0765W0128.wav", "txt": "相关公司股票走势"} -{"key": "BAC009S0765W0129", "wav": "./aishell/wav/test/S0765/BAC009S0765W0129.wav", "txt": "房价起飞前购置了十几套房产"} -{"key": "BAC009S0765W0130", "wav": "./aishell/wav/test/S0765/BAC009S0765W0130.wav", "txt": "目前总估值已过亿元"} -{"key": "BAC009S0765W0131", "wav": "./aishell/wav/test/S0765/BAC009S0765W0131.wav", "txt": "这些房产全由宋芳自己打理"} -{"key": "BAC009S0765W0132", "wav": "./aishell/wav/test/S0765/BAC009S0765W0132.wav", "txt": "每月光租金收入便已远大于自己和子女的总开销"} -{"key": "BAC009S0765W0133", "wav": "./aishell/wav/test/S0765/BAC009S0765W0133.wav", "txt": "宋芳最近却有点烦恼"} -{"key": "BAC009S0765W0134", "wav": "./aishell/wav/test/S0765/BAC009S0765W0134.wav", "txt": "我想把房产留给儿女"} -{"key": "BAC009S0765W0135", "wav": "./aishell/wav/test/S0765/BAC009S0765W0135.wav", "txt": "万一以后儿女的婚姻出了问题"} -{"key": "BAC009S0765W0136", "wav": "./aishell/wav/test/S0765/BAC009S0765W0136.wav", "txt": "他们的财产和生活不会受到太大影响"} -{"key": "BAC009S0765W0137", "wav": "./aishell/wav/test/S0765/BAC009S0765W0137.wav", "txt": "宋芳对北京银行私人银行的财富顾问说"} -{"key": "BAC009S0765W0138", "wav": "./aishell/wav/test/S0765/BAC009S0765W0138.wav", "txt": "该信托出资购入宋芳的房产"} -{"key": "BAC009S0765W0139", "wav": "./aishell/wav/test/S0765/BAC009S0765W0139.wav", "txt": "成立资金信托购买自家房产"} -{"key": "BAC009S0765W0140", "wav": "./aishell/wav/test/S0765/BAC009S0765W0140.wav", "txt": "在了解宋芳的资产情况与需求之后"} -{"key": "BAC009S0765W0141", "wav": "./aishell/wav/test/S0765/BAC009S0765W0141.wav", "txt": "设立一个单一资金信托"} -{"key": "BAC009S0765W0142", "wav": "./aishell/wav/test/S0765/BAC009S0765W0142.wav", "txt": "宋芳本人为信托的发起人和委托人"} -{"key": "BAC009S0765W0143", "wav": "./aishell/wav/test/S0765/BAC009S0765W0143.wav", "txt": "北京信托作为受托人"} -{"key": "BAC009S0765W0144", "wav": "./aishell/wav/test/S0765/BAC009S0765W0144.wav", "txt": "之后由该信托对宋芳指定的房产发出购买要约"} -{"key": "BAC009S0765W0145", "wav": "./aishell/wav/test/S0765/BAC009S0765W0145.wav", "txt": "实现该信托对房产的控制"} -{"key": "BAC009S0765W0146", "wav": "./aishell/wav/test/S0765/BAC009S0765W0146.wav", "txt": "虽然房产是在信托的名下"} -{"key": "BAC009S0765W0147", "wav": "./aishell/wav/test/S0765/BAC009S0765W0147.wav", "txt": "但您和您的儿女能自由支配"} -{"key": "BAC009S0765W0148", "wav": "./aishell/wav/test/S0765/BAC009S0765W0148.wav", "txt": "这相当于左兜掏右兜"} -{"key": "BAC009S0765W0149", "wav": "./aishell/wav/test/S0765/BAC009S0765W0149.wav", "txt": "通过信托实现了财产的隔离保护"} -{"key": "BAC009S0765W0150", "wav": "./aishell/wav/test/S0765/BAC009S0765W0150.wav", "txt": "未来子女出现姻缘风险"} -{"key": "BAC009S0765W0151", "wav": "./aishell/wav/test/S0765/BAC009S0765W0151.wav", "txt": "其中资金这一要素指基于信托登记的相关法规局限"} -{"key": "BAC009S0765W0152", "wav": 
"./aishell/wav/test/S0765/BAC009S0765W0152.wav", "txt": "为了购买自己想要传承给子女的房产"} -{"key": "BAC009S0765W0153", "wav": "./aishell/wav/test/S0765/BAC009S0765W0153.wav", "txt": "宋芳必须再掏出完全属于自己的资金"} -{"key": "BAC009S0765W0154", "wav": "./aishell/wav/test/S0765/BAC009S0765W0154.wav", "txt": "委托人以其持有的资金设立一个单一资金信托"} -{"key": "BAC009S0765W0155", "wav": "./aishell/wav/test/S0765/BAC009S0765W0155.wav", "txt": "该资金可以是委托人的自有资金"} -{"key": "BAC009S0765W0156", "wav": "./aishell/wav/test/S0765/BAC009S0765W0156.wav", "txt": "也可以是委托人合法获得的过桥资金"} -{"key": "BAC009S0765W0157", "wav": "./aishell/wav/test/S0765/BAC009S0765W0157.wav", "txt": "确保所设信托的合法性"} -{"key": "BAC009S0765W0158", "wav": "./aishell/wav/test/S0765/BAC009S0765W0158.wav", "txt": "按公司持有房产计税"} -{"key": "BAC009S0765W0159", "wav": "./aishell/wav/test/S0765/BAC009S0765W0159.wav", "txt": "由于家族信托的存续期通常较长"} -{"key": "BAC009S0765W0160", "wav": "./aishell/wav/test/S0765/BAC009S0765W0160.wav", "txt": "在信托收益的处置上"} -{"key": "BAC009S0765W0161", "wav": "./aishell/wav/test/S0765/BAC009S0765W0161.wav", "txt": "不同的客户对收益再投资的需求差别较大"} -{"key": "BAC009S0765W0162", "wav": "./aishell/wav/test/S0765/BAC009S0765W0162.wav", "txt": "对收益率的要求差别却不至于相去甚远"} -{"key": "BAC009S0765W0163", "wav": "./aishell/wav/test/S0765/BAC009S0765W0163.wav", "txt": "从我们遇到的客户来看"} -{"key": "BAC009S0765W0164", "wav": "./aishell/wav/test/S0765/BAC009S0765W0164.wav", "txt": "回报率普遍要求并不高"} -{"key": "BAC009S0765W0165", "wav": "./aishell/wav/test/S0765/BAC009S0765W0165.wav", "txt": "有些客户只要求收益率超过利率即可"} -{"key": "BAC009S0765W0166", "wav": "./aishell/wav/test/S0765/BAC009S0765W0166.wav", "txt": "看中的是其财产保护与传承的功能"} -{"key": "BAC009S0765W0167", "wav": "./aishell/wav/test/S0765/BAC009S0765W0167.wav", "txt": "我现在就是担心自己哪天突然出现个什么情况"} -{"key": "BAC009S0765W0168", "wav": "./aishell/wav/test/S0765/BAC009S0765W0168.wav", "txt": "他们说不定又有离婚风险"} -{"key": "BAC009S0765W0169", "wav": "./aishell/wav/test/S0765/BAC009S0765W0169.wav", "txt": "我又不指望设立信托来赚钱"} -{"key": "BAC009S0765W0170", "wav": "./aishell/wav/test/S0765/BAC009S0765W0170.wav", "txt": "主要目的是把后辈的婚姻风险隔断"} -{"key": "BAC009S0765W0171", "wav": "./aishell/wav/test/S0765/BAC009S0765W0171.wav", "txt": "在宋芳的资金信托购买其房产时"} -{"key": "BAC009S0765W0172", "wav": "./aishell/wav/test/S0765/BAC009S0765W0172.wav", "txt": "需要按北京当地的要求缴纳二手房交易费用"} -{"key": "BAC009S0765W0173", "wav": "./aishell/wav/test/S0765/BAC009S0765W0173.wav", "txt": "而在信托持有这些房产后"} -{"key": "BAC009S0765W0174", "wav": "./aishell/wav/test/S0765/BAC009S0765W0174.wav", "txt": "这是因为宋芳购买其房产"} -{"key": "BAC009S0765W0175", "wav": "./aishell/wav/test/S0765/BAC009S0765W0175.wav", "txt": "按照公司持有房产计征"} -{"key": "BAC009S0765W0176", "wav": "./aishell/wav/test/S0765/BAC009S0765W0176.wav", "txt": "各项费用的加总并不低"} -{"key": "BAC009S0765W0177", "wav": "./aishell/wav/test/S0765/BAC009S0765W0177.wav", "txt": "在目前的法律框架下"} -{"key": "BAC009S0765W0178", "wav": "./aishell/wav/test/S0765/BAC009S0765W0178.wav", "txt": "这些税费均无法避免"} -{"key": "BAC009S0765W0179", "wav": "./aishell/wav/test/S0765/BAC009S0765W0179.wav", "txt": "他认为跟后辈姻缘风险相比"} -{"key": "BAC009S0765W0180", "wav": "./aishell/wav/test/S0765/BAC009S0765W0180.wav", "txt": "点击进入股友会参与讨论"} -{"key": "BAC009S0765W0181", "wav": "./aishell/wav/test/S0765/BAC009S0765W0181.wav", "txt": "本世纪经济报道"} -{"key": "BAC009S0765W0182", "wav": "./aishell/wav/test/S0765/BAC009S0765W0182.wav", "txt": "为了将后辈的婚姻风险隔断"} -{"key": "BAC009S0765W0183", "wav": "./aishell/wav/test/S0765/BAC009S0765W0183.wav", "txt": "并将受益人定为直系血亲后代非配偶继承人"} -{"key": "BAC009S0765W0184", "wav": "./aishell/wav/test/S0765/BAC009S0765W0184.wav", "txt": 
"今久整合营销集团迎来了它的生日"} -{"key": "BAC009S0765W0185", "wav": "./aishell/wav/test/S0765/BAC009S0765W0185.wav", "txt": "从最初的几十人"} -{"key": "BAC009S0765W0186", "wav": "./aishell/wav/test/S0765/BAC009S0765W0186.wav", "txt": "事业版图遍布全中国的集团化整合营销公司"} -{"key": "BAC009S0765W0187", "wav": "./aishell/wav/test/S0765/BAC009S0765W0187.wav", "txt": "无序竞争甚至恶意竞争时常发生"} -{"key": "BAC009S0765W0188", "wav": "./aishell/wav/test/S0765/BAC009S0765W0188.wav", "txt": "将发挥部际会议联席制度作用"} -{"key": "BAC009S0765W0189", "wav": "./aishell/wav/test/S0765/BAC009S0765W0189.wav", "txt": "制订境外投资总体战略"} -{"key": "BAC009S0765W0190", "wav": "./aishell/wav/test/S0765/BAC009S0765W0190.wav", "txt": "对重大项目和重大问题进行协调"} -{"key": "BAC009S0765W0191", "wav": "./aishell/wav/test/S0765/BAC009S0765W0191.wav", "txt": "引导企业围绕重点国家和地区在重点领域展开投资"} -{"key": "BAC009S0765W0192", "wav": "./aishell/wav/test/S0765/BAC009S0765W0192.wav", "txt": "鼓励本土中介机构提供服务"} -{"key": "BAC009S0765W0193", "wav": "./aishell/wav/test/S0765/BAC009S0765W0193.wav", "txt": "建立起政府部门企业和中介机构各司其职的组织架构"} -{"key": "BAC009S0765W0194", "wav": "./aishell/wav/test/S0765/BAC009S0765W0194.wav", "txt": "北京科技大学教授刘澄表示"} -{"key": "BAC009S0765W0195", "wav": "./aishell/wav/test/S0765/BAC009S0765W0195.wav", "txt": "主要是如何提供服务及做好监管"} -{"key": "BAC009S0765W0196", "wav": "./aishell/wav/test/S0765/BAC009S0765W0196.wav", "txt": "规划提出诸多想法"} -{"key": "BAC009S0765W0197", "wav": "./aishell/wav/test/S0765/BAC009S0765W0197.wav", "txt": "政府提供的服务企业是否需要"} -{"key": "BAC009S0765W0198", "wav": "./aishell/wav/test/S0765/BAC009S0765W0198.wav", "txt": "如何为企业提供信息避免海外投资风险"} -{"key": "BAC009S0765W0199", "wav": "./aishell/wav/test/S0765/BAC009S0765W0199.wav", "txt": "避免海外一窝蜂上项目等"} -{"key": "BAC009S0765W0200", "wav": "./aishell/wav/test/S0765/BAC009S0765W0200.wav", "txt": "发改委将尽快建立地方政府债务管理体系到"} -{"key": "BAC009S0765W0201", "wav": "./aishell/wav/test/S0765/BAC009S0765W0201.wav", "txt": "他就上述关注问题指出"} -{"key": "BAC009S0765W0202", "wav": "./aishell/wav/test/S0765/BAC009S0765W0202.wav", "txt": "目前我国出现政府性债务违约可能性并不大"} -{"key": "BAC009S0765W0203", "wav": "./aishell/wav/test/S0765/BAC009S0765W0203.wav", "txt": "下一步将进一步完善城投债券发行制度和防范风险机制"} -{"key": "BAC009S0765W0204", "wav": "./aishell/wav/test/S0765/BAC009S0765W0204.wav", "txt": "并尽快建立我国地方政府债务管理体系等"} -{"key": "BAC009S0765W0205", "wav": "./aishell/wav/test/S0765/BAC009S0765W0205.wav", "txt": "政府性违约可能性不大"} -{"key": "BAC009S0765W0206", "wav": "./aishell/wav/test/S0765/BAC009S0765W0206.wav", "txt": "中国证券报随着欧美等国主权债务危机陆续爆发"} -{"key": "BAC009S0765W0207", "wav": "./aishell/wav/test/S0765/BAC009S0765W0207.wav", "txt": "您如何看待政府的举债行为和债务风险"} -{"key": "BAC009S0765W0208", "wav": "./aishell/wav/test/S0765/BAC009S0765W0208.wav", "txt": "徐林吸取欧美等国主权债务危机的教训"} -{"key": "BAC009S0765W0209", "wav": "./aishell/wav/test/S0765/BAC009S0765W0209.wav", "txt": "采取必要措施加强政府债务管理"} -{"key": "BAC009S0765W0210", "wav": "./aishell/wav/test/S0765/BAC009S0765W0210.wav", "txt": "防范我国政府债务风险"} -{"key": "BAC009S0765W0211", "wav": "./aishell/wav/test/S0765/BAC009S0765W0211.wav", "txt": "但在具体评估我国地方政府债务风险程度时"} -{"key": "BAC009S0765W0212", "wav": "./aishell/wav/test/S0765/BAC009S0765W0212.wav", "txt": "也要看到我国与欧美国家的不同之处"} -{"key": "BAC009S0765W0213", "wav": "./aishell/wav/test/S0765/BAC009S0765W0213.wav", "txt": "我国地方政府性债务"} -{"key": "BAC009S0765W0214", "wav": "./aishell/wav/test/S0765/BAC009S0765W0214.wav", "txt": "特别是地方投融资平台公司形成的债务"} -{"key": "BAC009S0765W0215", "wav": "./aishell/wav/test/S0765/BAC009S0765W0215.wav", "txt": "主要用于各地基础设施的投资建设"} -{"key": "BAC009S0765W0216", "wav": "./aishell/wav/test/S0765/BAC009S0765W0216.wav", "txt": 
"当代人和后代人共同承担债务还本付息责任"} -{"key": "BAC009S0765W0217", "wav": "./aishell/wav/test/S0765/BAC009S0765W0217.wav", "txt": "可以更好地体现代际公平"} -{"key": "BAC009S0765W0218", "wav": "./aishell/wav/test/S0765/BAC009S0765W0218.wav", "txt": "克服当期建设资金不足的瓶颈制约"} -{"key": "BAC009S0765W0219", "wav": "./aishell/wav/test/S0765/BAC009S0765W0219.wav", "txt": "有利于加快完善基础设施和投资环境"} -{"key": "BAC009S0765W0220", "wav": "./aishell/wav/test/S0765/BAC009S0765W0220.wav", "txt": "是一种合理的基础设施投融资建设行为"} -{"key": "BAC009S0765W0221", "wav": "./aishell/wav/test/S0765/BAC009S0765W0221.wav", "txt": "政府举债建设形成了大量资产"} -{"key": "BAC009S0765W0222", "wav": "./aishell/wav/test/S0765/BAC009S0765W0222.wav", "txt": "相当部分资产具有长期的直接收益"} -{"key": "BAC009S0765W0223", "wav": "./aishell/wav/test/S0765/BAC009S0765W0223.wav", "txt": "一些没有直接收益的项目"} -{"key": "BAC009S0765W0224", "wav": "./aishell/wav/test/S0765/BAC009S0765W0224.wav", "txt": "也具有间接的经济效益或社会效益"} -{"key": "BAC009S0765W0225", "wav": "./aishell/wav/test/S0765/BAC009S0765W0225.wav", "txt": "对促进当地经济增长和政府财力的增长"} -{"key": "BAC009S0765W0226", "wav": "./aishell/wav/test/S0765/BAC009S0765W0226.wav", "txt": "不能简单地用寅吃卯粮来作价值判断"} -{"key": "BAC009S0765W0227", "wav": "./aishell/wav/test/S0765/BAC009S0765W0227.wav", "txt": "这并不意味着政府可以无节制地借债"} -{"key": "BAC009S0765W0228", "wav": "./aishell/wav/test/S0765/BAC009S0765W0228.wav", "txt": "关键是要把投资规模和债务规模"} -{"key": "BAC009S0765W0229", "wav": "./aishell/wav/test/S0765/BAC009S0765W0229.wav", "txt": "控制在合理的范围内"} -{"key": "BAC009S0765W0230", "wav": "./aishell/wav/test/S0765/BAC009S0765W0230.wav", "txt": "防止出现系统性的偿债风险"} -{"key": "BAC009S0765W0231", "wav": "./aishell/wav/test/S0765/BAC009S0765W0231.wav", "txt": "国务院高度重视防范地方政府债务风险"} -{"key": "BAC009S0765W0232", "wav": "./aishell/wav/test/S0765/BAC009S0765W0232.wav", "txt": "从二零零九年下半年就开始要求有关部门调研这一问题"} -{"key": "BAC009S0765W0233", "wav": "./aishell/wav/test/S0765/BAC009S0765W0233.wav", "txt": "国家审计署还专门组织力量"} -{"key": "BAC009S0765W0234", "wav": "./aishell/wav/test/S0765/BAC009S0765W0234.wav", "txt": "对全国各地的政府债务进行了严格审计"} -{"key": "BAC009S0765W0235", "wav": "./aishell/wav/test/S0765/BAC009S0765W0235.wav", "txt": "审计署的审计结论表明"} -{"key": "BAC009S0765W0236", "wav": "./aishell/wav/test/S0765/BAC009S0765W0236.wav", "txt": "我国地方政府的累积债务相对于偿付能力来看"} -{"key": "BAC009S0765W0237", "wav": "./aishell/wav/test/S0765/BAC009S0765W0237.wav", "txt": "远低于发生债务危机的欧美国家"} -{"key": "BAC009S0765W0238", "wav": "./aishell/wav/test/S0765/BAC009S0765W0238.wav", "txt": "考虑到我国正处在经济快速增长期"} -{"key": "BAC009S0765W0239", "wav": "./aishell/wav/test/S0765/BAC009S0765W0239.wav", "txt": "政府财力增长也相应较快"} -{"key": "BAC009S0765W0240", "wav": "./aishell/wav/test/S0765/BAC009S0765W0240.wav", "txt": "政府还拥有较多的可变现资产"} -{"key": "BAC009S0765W0241", "wav": "./aishell/wav/test/S0765/BAC009S0765W0241.wav", "txt": "相对于目前的负债规模"} -{"key": "BAC009S0765W0242", "wav": "./aishell/wav/test/S0765/BAC009S0765W0242.wav", "txt": "政府总体上具有较强的偿债能力"} -{"key": "BAC009S0765W0243", "wav": "./aishell/wav/test/S0765/BAC009S0765W0243.wav", "txt": "采取积极有效的措施化解部分地区和领域的债务风险"} -{"key": "BAC009S0765W0244", "wav": "./aishell/wav/test/S0765/BAC009S0765W0244.wav", "txt": "在我国出现政府性债务违约的可能性是不大的"} -{"key": "BAC009S0765W0245", "wav": "./aishell/wav/test/S0765/BAC009S0765W0245.wav", "txt": "债券市场城投债券发行不畅"} -{"key": "BAC009S0765W0246", "wav": "./aishell/wav/test/S0765/BAC009S0765W0246.wav", "txt": "从城投债券发行监管部门的角度"} -{"key": "BAC009S0765W0247", "wav": "./aishell/wav/test/S0765/BAC009S0765W0247.wav", "txt": "您如何看待这一现象"} -{"key": "BAC009S0765W0248", "wav": "./aishell/wav/test/S0765/BAC009S0765W0248.wav", "txt": 
"徐林出于对地方政府债务风险的担忧"} -{"key": "BAC009S0765W0249", "wav": "./aishell/wav/test/S0765/BAC009S0765W0249.wav", "txt": "投资者采取措施防范风险是成熟的表现"} -{"key": "BAC009S0765W0250", "wav": "./aishell/wav/test/S0765/BAC009S0765W0250.wav", "txt": "但出于对我国地方政府债务风险的不合理判断"} -{"key": "BAC009S0765W0251", "wav": "./aishell/wav/test/S0765/BAC009S0765W0251.wav", "txt": "并进而对城投债券进行唱空或做空"} -{"key": "BAC009S0765W0252", "wav": "./aishell/wav/test/S0765/BAC009S0765W0252.wav", "txt": "最近企业债券特别是城投债券的发行难度加大"} -{"key": "BAC009S0765W0253", "wav": "./aishell/wav/test/S0765/BAC009S0765W0253.wav", "txt": "其在纳斯达克上市时的发行价为一六美元"} -{"key": "BAC009S0765W0254", "wav": "./aishell/wav/test/S0765/BAC009S0765W0254.wav", "txt": "其股票价格在十五点七六美元上下徘徊"} -{"key": "BAC009S0765W0255", "wav": "./aishell/wav/test/S0765/BAC009S0765W0255.wav", "txt": "中国手游在退市之前的市盈率在十六十七倍左右"} -{"key": "BAC009S0765W0256", "wav": "./aishell/wav/test/S0765/BAC009S0765W0256.wav", "txt": "掌趣科技三零零三一五一度超过二百倍"} -{"key": "BAC009S0765W0257", "wav": "./aishell/wav/test/S0765/BAC009S0765W0257.wav", "txt": "华尔街并不认可游戏这种商业模式"} -{"key": "BAC009S0765W0258", "wav": "./aishell/wav/test/S0765/BAC009S0765W0258.wav", "txt": "并非仅仅针对中国公司"} -{"key": "BAC009S0765W0260", "wav": "./aishell/wav/test/S0765/BAC009S0765W0260.wav", "txt": "作为美国本土著名社交游戏开发商"} -{"key": "BAC009S0765W0261", "wav": "./aishell/wav/test/S0765/BAC009S0765W0261.wav", "txt": "在当年社交游戏风靡的时候"} -{"key": "BAC009S0765W0262", "wav": "./aishell/wav/test/S0765/BAC009S0765W0262.wav", "txt": "因为快速发展的业务和不断膨胀的营收受资本市场亲睐"} -{"key": "BAC009S0765W0265", "wav": "./aishell/wav/test/S0765/BAC009S0765W0265.wav", "txt": "亏损二千六百九十万美元相比上一季度"} -{"key": "BAC009S0765W0266", "wav": "./aishell/wav/test/S0765/BAC009S0765W0266.wav", "txt": "这一亏损已经收窄了百分之五十七"} -{"key": "BAC009S0765W0267", "wav": "./aishell/wav/test/S0765/BAC009S0765W0267.wav", "txt": "成熟的资本市场相对公平"} -{"key": "BAC009S0765W0268", "wav": "./aishell/wav/test/S0765/BAC009S0765W0268.wav", "txt": "这些被市场唱空的游戏公司本身业务模式遇到了困境"} -{"key": "BAC009S0765W0269", "wav": "./aishell/wav/test/S0765/BAC009S0765W0269.wav", "txt": "游戏公司往往靠一款游戏在市场上火爆"} -{"key": "BAC009S0765W0270", "wav": "./aishell/wav/test/S0765/BAC009S0765W0270.wav", "txt": "大多数游戏产品往往病毒式地成长"} -{"key": "BAC009S0765W0272", "wav": "./aishell/wav/test/S0765/BAC009S0765W0272.wav", "txt": "这些中国游戏公司大多成长于中国市场"} -{"key": "BAC009S0765W0273", "wav": "./aishell/wav/test/S0765/BAC009S0765W0273.wav", "txt": "他们的产品在海外市场也极少被认可"} -{"key": "BAC009S0765W0274", "wav": "./aishell/wav/test/S0765/BAC009S0765W0274.wav", "txt": "这些公司在海外市场上市往往除了获得融资机会"} -{"key": "BAC009S0765W0275", "wav": "./aishell/wav/test/S0765/BAC009S0765W0275.wav", "txt": "并未能给这些公司带来其他的效应"} -{"key": "BAC009S0765W0277", "wav": "./aishell/wav/test/S0765/BAC009S0765W0277.wav", "txt": "中国游戏产品和美国产品极为不同"} -{"key": "BAC009S0765W0278", "wav": "./aishell/wav/test/S0765/BAC009S0765W0278.wav", "txt": "美国玩家对游戏难度创造性要求较高"} -{"key": "BAC009S0765W0279", "wav": "./aishell/wav/test/S0765/BAC009S0765W0279.wav", "txt": "中国产品不可能照搬到美国市场"} -{"key": "BAC009S0765W0280", "wav": "./aishell/wav/test/S0765/BAC009S0765W0280.wav", "txt": "在融资和发展海外市场上"} -{"key": "BAC009S0765W0281", "wav": "./aishell/wav/test/S0765/BAC009S0765W0281.wav", "txt": "还希望拓展市场的话"} -{"key": "BAC009S0765W0282", "wav": "./aishell/wav/test/S0765/BAC009S0765W0282.wav", "txt": "触控科技全资韩国子公司在韩国上市"} -{"key": "BAC009S0765W0283", "wav": "./aishell/wav/test/S0765/BAC009S0765W0283.wav", "txt": "而从二零一三年开始"} -{"key": "BAC009S0765W0284", "wav": "./aishell/wav/test/S0765/BAC009S0765W0284.wav", "txt": "这家公司就在挖角当地游戏公司高管建立分公司"} -{"key": "BAC009S0765W0285", "wav": 
"./aishell/wav/test/S0765/BAC009S0765W0285.wav", "txt": "打造适合当地市场的产品"} -{"key": "BAC009S0765W0286", "wav": "./aishell/wav/test/S0765/BAC009S0765W0286.wav", "txt": "根据陈昊芝在二零一四年八月提供的数据"} -{"key": "BAC009S0765W0287", "wav": "./aishell/wav/test/S0765/BAC009S0765W0287.wav", "txt": "市场份额做到了前十位"} -{"key": "BAC009S0765W0288", "wav": "./aishell/wav/test/S0765/BAC009S0765W0288.wav", "txt": "未来上市能够放大公司的品牌效益"} -{"key": "BAC009S0765W0289", "wav": "./aishell/wav/test/S0765/BAC009S0765W0289.wav", "txt": "让当地更多的人知道这家公司"} -{"key": "BAC009S0765W0290", "wav": "./aishell/wav/test/S0765/BAC009S0765W0290.wav", "txt": "在韩国股市低迷情况下"} -{"key": "BAC009S0765W0291", "wav": "./aishell/wav/test/S0765/BAC009S0765W0291.wav", "txt": "触控科技子公司涨幅居前"} -{"key": "BAC009S0765W0292", "wav": "./aishell/wav/test/S0765/BAC009S0765W0292.wav", "txt": "对于游戏这种地域属性较重的产品"} -{"key": "BAC009S0765W0293", "wav": "./aishell/wav/test/S0765/BAC009S0765W0293.wav", "txt": "应慎重考虑上市时机和地点"} -{"key": "BAC009S0765W0294", "wav": "./aishell/wav/test/S0765/BAC009S0765W0294.wav", "txt": "反复检视自身商业模式"} -{"key": "BAC009S0765W0295", "wav": "./aishell/wav/test/S0765/BAC009S0765W0295.wav", "txt": "而不是迫不及待抓住一切可以上市圈钱的机会"} -{"key": "BAC009S0765W0296", "wav": "./aishell/wav/test/S0765/BAC009S0765W0296.wav", "txt": "往往连最直接的目标都无法达成"} -{"key": "BAC009S0765W0297", "wav": "./aishell/wav/test/S0765/BAC009S0765W0297.wav", "txt": "因为难以忍受股价长期被低估"} -{"key": "BAC009S0765W0298", "wav": "./aishell/wav/test/S0765/BAC009S0765W0298.wav", "txt": "中国游戏公司纷纷忙着退市"} -{"key": "BAC009S0765W0299", "wav": "./aishell/wav/test/S0765/BAC009S0765W0299.wav", "txt": "最近都在流行做预测"} -{"key": "BAC009S0765W0300", "wav": "./aishell/wav/test/S0765/BAC009S0765W0300.wav", "txt": "于是他也来凑凑热闹"} -{"key": "BAC009S0765W0301", "wav": "./aishell/wav/test/S0765/BAC009S0765W0301.wav", "txt": "他的预测有点毒基本上是在讨论谁会下台"} -{"key": "BAC009S0765W0303", "wav": "./aishell/wav/test/S0765/BAC009S0765W0303.wav", "txt": "每日经济新闻记者杨建江南嘉捷六万"} -{"key": "BAC009S0765W0304", "wav": "./aishell/wav/test/S0765/BAC009S0765W0304.wav", "txt": "收盘价十三点六五元于七月八日发布公告"} -{"key": "BAC009S0765W0305", "wav": "./aishell/wav/test/S0765/BAC009S0765W0305.wav", "txt": "为使股价与公司价值匹配"} -{"key": "BAC009S0765W0306", "wav": "./aishell/wav/test/S0765/BAC009S0765W0306.wav", "txt": "公司拟计划通过集中竞价交易方式回购公司股份"} -{"key": "BAC009S0765W0307", "wav": "./aishell/wav/test/S0765/BAC009S0765W0307.wav", "txt": "公司此次回购股份的价格不超过十五点一零七元股"} -{"key": "BAC009S0765W0308", "wav": "./aishell/wav/test/S0765/BAC009S0765W0308.wav", "txt": "用于回购的资金总额不超过一点五一七亿元"} -{"key": "BAC009S0765W0309", "wav": "./aishell/wav/test/S0765/BAC009S0765W0309.wav", "txt": "预计回购股份约一千万股"} -{"key": "BAC009S0765W0310", "wav": "./aishell/wav/test/S0765/BAC009S0765W0310.wav", "txt": "占公司总股本约二点百分之五十"} -{"key": "BAC009S0765W0311", "wav": "./aishell/wav/test/S0765/BAC009S0765W0311.wav", "txt": "公司股票于二零一五年七月八日复牌"} -{"key": "BAC009S0765W0312", "wav": "./aishell/wav/test/S0765/BAC009S0765W0312.wav", "txt": "每日经济新闻记者注意到"} -{"key": "BAC009S0765W0313", "wav": "./aishell/wav/test/S0765/BAC009S0765W0313.wav", "txt": "截至二零一四年十二月三十一日"} -{"key": "BAC009S0765W0314", "wav": "./aishell/wav/test/S0765/BAC009S0765W0314.wav", "txt": "资金来源为自有资金"} -{"key": "BAC009S0765W0315", "wav": "./aishell/wav/test/S0765/BAC009S0765W0315.wav", "txt": "回购期限为自回购股份方案之日起至今年底"} -{"key": "BAC009S0765W0317", "wav": "./aishell/wav/test/S0765/BAC009S0765W0317.wav", "txt": "收盘价四点九九元也于七月八日公告"} -{"key": "BAC009S0765W0321", "wav": "./aishell/wav/test/S0765/BAC009S0765W0321.wav", "txt": "其目前的股票市值已经不能完全反映公司价值"} -{"key": "BAC009S0765W0323", "wav": 
"./aishell/wav/test/S0765/BAC009S0765W0323.wav", "txt": "增持后持股比例为六十二点百分之二十三"} -{"key": "BAC009S0765W0325", "wav": "./aishell/wav/test/S0765/BAC009S0765W0325.wav", "txt": "拟在二零一五年二零一七年先行推出两期回购方案"} -{"key": "BAC009S0765W0326", "wav": "./aishell/wav/test/S0765/BAC009S0765W0326.wav", "txt": "其中第一期回购资金上限为二零一四年净利润的百分之二十五"} -{"key": "BAC009S0765W0327", "wav": "./aishell/wav/test/S0765/BAC009S0765W0327.wav", "txt": "第二期股票回购方案不晚于二零一七年六月三十日推出"} -{"key": "BAC009S0765W0328", "wav": "./aishell/wav/test/S0765/BAC009S0765W0328.wav", "txt": "回购期限为股东大会通过后不超过十二个月"} -{"key": "BAC009S0765W0329", "wav": "./aishell/wav/test/S0765/BAC009S0765W0329.wav", "txt": "预计可回购不少于七百九十一万股"} -{"key": "BAC009S0765W0330", "wav": "./aishell/wav/test/S0765/BAC009S0765W0330.wav", "txt": "每日经济新闻记者杨建江南嘉捷六万一千一百三十一三"} -{"key": "BAC009S0765W0331", "wav": "./aishell/wav/test/S0765/BAC009S0765W0331.wav", "txt": "收盘价十三点六五元于七月八日发布公告"} -{"key": "BAC009S0765W0332", "wav": "./aishell/wav/test/S0765/BAC009S0765W0332.wav", "txt": "为使股价与公司价值匹配"} -{"key": "BAC009S0765W0333", "wav": "./aishell/wav/test/S0765/BAC009S0765W0333.wav", "txt": "公司拟计划通过集中竞价交易方式回购公司股份"} -{"key": "BAC009S0765W0334", "wav": "./aishell/wav/test/S0765/BAC009S0765W0334.wav", "txt": "锂电池在今年上半年成为诸多上市公司的业绩功臣"} -{"key": "BAC009S0765W0335", "wav": "./aishell/wav/test/S0765/BAC009S0765W0335.wav", "txt": "成飞集成百二十一九十"} -{"key": "BAC009S0765W0337", "wav": "./aishell/wav/test/S0765/BAC009S0765W0337.wav", "txt": "公司上半年营业收入六点三四亿元"} -{"key": "BAC009S0765W0338", "wav": "./aishell/wav/test/S0765/BAC009S0765W0338.wav", "txt": "折合人民币大概二千四八零亿"} -{"key": "BAC009S0765W0339", "wav": "./aishell/wav/test/S0765/BAC009S0765W0339.wav", "txt": "而与经济实力相关的一些基础设施建设方面"} -{"key": "BAC009S0765W0340", "wav": "./aishell/wav/test/S0765/BAC009S0765W0340.wav", "txt": "无论是城市交通建设还是机场运力"} -{"key": "BAC009S0765W0341", "wav": "./aishell/wav/test/S0765/BAC009S0765W0341.wav", "txt": "阿拉木图都无法和北京相比"} -{"key": "BAC009S0765W0342", "wav": "./aishell/wav/test/S0765/BAC009S0765W0342.wav", "txt": "远远无法和北京相提并论"} -{"key": "BAC009S0765W0343", "wav": "./aishell/wav/test/S0765/BAC009S0765W0343.wav", "txt": "从申办冬奥会的硬件基础上看"} -{"key": "BAC009S0765W0344", "wav": "./aishell/wav/test/S0765/BAC009S0765W0344.wav", "txt": "北京冬奥会的硬件基础要强于阿拉木图"} -{"key": "BAC009S0765W0345", "wav": "./aishell/wav/test/S0765/BAC009S0765W0345.wav", "txt": "北京张家口计划启用一二个竞赛场馆"} -{"key": "BAC009S0765W0346", "wav": "./aishell/wav/test/S0765/BAC009S0765W0346.wav", "txt": "其中五个场馆需要新建"} -{"key": "BAC009S0765W0347", "wav": "./aishell/wav/test/S0765/BAC009S0765W0347.wav", "txt": "其馀场馆改扩建后可以满足赛事需要"} -{"key": "BAC009S0765W0348", "wav": "./aishell/wav/test/S0765/BAC009S0765W0348.wav", "txt": "其中北京市区仅需要新建一座速滑场馆"} -{"key": "BAC009S0765W0349", "wav": "./aishell/wav/test/S0765/BAC009S0765W0349.wav", "txt": "阿拉木图方面将会使用十四个场馆作为比赛之用"} -{"key": "BAC009S0765W0350", "wav": "./aishell/wav/test/S0765/BAC009S0765W0350.wav", "txt": "目前八座为已有场馆并在使用中"} -{"key": "BAC009S0765W0351", "wav": "./aishell/wav/test/S0765/BAC009S0765W0351.wav", "txt": "其于六个场馆都需要新建"} -{"key": "BAC009S0765W0352", "wav": "./aishell/wav/test/S0765/BAC009S0765W0352.wav", "txt": "从举办大型体育赛事的经验来看"} -{"key": "BAC009S0765W0353", "wav": "./aishell/wav/test/S0765/BAC009S0765W0353.wav", "txt": "北京的经验比阿拉木图丰富"} -{"key": "BAC009S0765W0354", "wav": "./aishell/wav/test/S0765/BAC009S0765W0354.wav", "txt": "还有二零一五年的田径世锦赛"} -{"key": "BAC009S0765W0355", "wav": "./aishell/wav/test/S0765/BAC009S0765W0355.wav", "txt": "这些使得北京积累了大量的举办与运营经验"} -{"key": "BAC009S0765W0356", "wav": "./aishell/wav/test/S0765/BAC009S0765W0356.wav", "txt": 
"也证明了北京举办大型体育赛事的能力"} -{"key": "BAC009S0765W0357", "wav": "./aishell/wav/test/S0765/BAC009S0765W0357.wav", "txt": "自从哈萨克斯坦独立"} -{"key": "BAC009S0765W0358", "wav": "./aishell/wav/test/S0765/BAC009S0765W0358.wav", "txt": "二零一一年的亚冬会是其举办的第一个国际性综合赛事"} -{"key": "BAC009S0765W0359", "wav": "./aishell/wav/test/S0765/BAC009S0765W0359.wav", "txt": "之后就没有举办过的大型体育赛事"} -{"key": "BAC009S0765W0360", "wav": "./aishell/wav/test/S0765/BAC009S0765W0360.wav", "txt": "花样滑冰大奖赛中国杯常年在北京和上海之间轮换"} -{"key": "BAC009S0765W0361", "wav": "./aishell/wav/test/S0765/BAC009S0765W0361.wav", "txt": "二零一四年的冰壶世锦赛也在北京举行"} -{"key": "BAC009S0765W0362", "wav": "./aishell/wav/test/S0765/BAC009S0765W0362.wav", "txt": "一系列大型赛事的承办"} -{"key": "BAC009S0765W0363", "wav": "./aishell/wav/test/S0765/BAC009S0765W0363.wav", "txt": "让北京具备了承办冬奥会这种顶级赛事的经验和能力"} -{"key": "BAC009S0765W0364", "wav": "./aishell/wav/test/S0765/BAC009S0765W0364.wav", "txt": "北京申办冬奥影响远超承办本身"} -{"key": "BAC009S0765W0365", "wav": "./aishell/wav/test/S0765/BAC009S0765W0365.wav", "txt": "此次北京申办冬奥会"} -{"key": "BAC009S0765W0366", "wav": "./aishell/wav/test/S0765/BAC009S0765W0366.wav", "txt": "也让我们看到了自身存在着的不足"} -{"key": "BAC009S0765W0367", "wav": "./aishell/wav/test/S0765/BAC009S0765W0367.wav", "txt": "其中主要集中于冰雪运动本身实力上的有所欠缺"} -{"key": "BAC009S0765W0368", "wav": "./aishell/wav/test/S0765/BAC009S0765W0368.wav", "txt": "二零二二年冬奥会的举行"} -{"key": "BAC009S0765W0369", "wav": "./aishell/wav/test/S0765/BAC009S0765W0369.wav", "txt": "对于我国冰雪运动实力的提升会有巨大的推动作用"} -{"key": "BAC009S0765W0370", "wav": "./aishell/wav/test/S0765/BAC009S0765W0370.wav", "txt": "和夏季奥运会上的斩金夺银不同"} -{"key": "BAC009S0765W0371", "wav": "./aishell/wav/test/S0765/BAC009S0765W0371.wav", "txt": "中国的冬季运动还处于半起步阶段"} -{"key": "BAC009S0765W0372", "wav": "./aishell/wav/test/S0765/BAC009S0765W0372.wav", "txt": "这在我国体育发展史上具有划时代的意义"} -{"key": "BAC009S0765W0373", "wav": "./aishell/wav/test/S0765/BAC009S0765W0373.wav", "txt": "标志着我国体育开始走向国际化"} -{"key": "BAC009S0765W0374", "wav": "./aishell/wav/test/S0765/BAC009S0765W0374.wav", "txt": "成为了国际体育运动大家庭中的一员"} -{"key": "BAC009S0765W0375", "wav": "./aishell/wav/test/S0765/BAC009S0765W0375.wav", "txt": "但直到一二年后的法国阿尔贝维尔冬奥会上"} -{"key": "BAC009S0765W0376", "wav": "./aishell/wav/test/S0765/BAC009S0765W0376.wav", "txt": "我国选手才实现了冬奥奖牌零的突破"} -{"key": "BAC009S0765W0377", "wav": "./aishell/wav/test/S0765/BAC009S0765W0377.wav", "txt": "取得这一突破的领军人就包括轮椅英雄叶乔波"} -{"key": "BAC009S0765W0378", "wav": "./aishell/wav/test/S0765/BAC009S0765W0378.wav", "txt": "又是十年的空白期"} -{"key": "BAC009S0765W0379", "wav": "./aishell/wav/test/S0765/BAC009S0765W0379.wav", "txt": "二零零二年的美国盐湖城冬奥会上"} -{"key": "BAC009S0765W0380", "wav": "./aishell/wav/test/S0765/BAC009S0765W0380.wav", "txt": "杨扬拿到了五百米和一千米两项短道速的金牌"} -{"key": "BAC009S0765W0381", "wav": "./aishell/wav/test/S0765/BAC009S0765W0381.wav", "txt": "更具历史性意义的是"} -{"key": "BAC009S0765W0382", "wav": "./aishell/wav/test/S0765/BAC009S0765W0382.wav", "txt": "这是中国奥运代表团在冬季奥运会上取得的首枚金牌"} -{"key": "BAC009S0765W0383", "wav": "./aishell/wav/test/S0765/BAC009S0765W0383.wav", "txt": "经过二十多年的努力"} -{"key": "BAC009S0765W0384", "wav": "./aishell/wav/test/S0765/BAC009S0765W0384.wav", "txt": "中国冰雪健儿终于站到了冬奥会的最高领奖台"} -{"key": "BAC009S0765W0385", "wav": "./aishell/wav/test/S0765/BAC009S0765W0385.wav", "txt": "以及拥有陈露的女单项目"} -{"key": "BAC009S0765W0386", "wav": "./aishell/wav/test/S0765/BAC009S0765W0386.wav", "txt": "可以在世界范围内立足"} -{"key": "BAC009S0765W0387", "wav": "./aishell/wav/test/S0765/BAC009S0765W0387.wav", "txt": "但随着这一系列名将的退役"} -{"key": "BAC009S0765W0388", "wav": 
"./aishell/wav/test/S0765/BAC009S0765W0388.wav", "txt": "在中国的这一传统优势项目上"} -{"key": "BAC009S0765W0389", "wav": "./aishell/wav/test/S0765/BAC009S0765W0389.wav", "txt": "我们可以说已经输给了其他强敌"} -{"key": "BAC009S0765W0390", "wav": "./aishell/wav/test/S0765/BAC009S0765W0390.wav", "txt": "更直观的体现是在冰雪运动的核心项目冰球上"} -{"key": "BAC009S0765W0391", "wav": "./aishell/wav/test/S0765/BAC009S0765W0391.wav", "txt": "竞争对手哈萨克斯坦在这一点上要强过我们"} -{"key": "BAC009S0765W0392", "wav": "./aishell/wav/test/S0765/BAC009S0765W0392.wav", "txt": "中国国家男子冰球队目前排名第三十二位"} -{"key": "BAC009S0765W0393", "wav": "./aishell/wav/test/S0765/BAC009S0765W0393.wav", "txt": "而哈萨克斯坦则是第十六位"} -{"key": "BAC009S0765W0394", "wav": "./aishell/wav/test/S0765/BAC009S0765W0394.wav", "txt": "所有主办国的男子冰球成绩排位均在二十位之内"} -{"key": "BAC009S0765W0395", "wav": "./aishell/wav/test/S0765/BAC009S0765W0395.wav", "txt": "二零一八年冬奥会的主办地韩国平昌是一个绝好的例子"} -{"key": "BAC009S0765W0396", "wav": "./aishell/wav/test/S0765/BAC009S0765W0396.wav", "txt": "平昌曾经三次申办冬奥会"} -{"key": "BAC009S0765W0397", "wav": "./aishell/wav/test/S0765/BAC009S0765W0397.wav", "txt": "前两次申办的过程中"} -{"key": "BAC009S0765W0398", "wav": "./aishell/wav/test/S0765/BAC009S0765W0398.wav", "txt": "男子冰球的战绩均在二十五名左右"} -{"key": "BAC009S0765W0399", "wav": "./aishell/wav/test/S0765/BAC009S0765W0399.wav", "txt": "而第三次申办周期内"} -{"key": "BAC009S0765W0401", "wav": "./aishell/wav/test/S0765/BAC009S0765W0401.wav", "txt": "几乎帮助了平昌拿下二零一八年冬奥会的主办权"} -{"key": "BAC009S0765W0402", "wav": "./aishell/wav/test/S0765/BAC009S0765W0402.wav", "txt": "由于韩国冰球协会的四年规划"} -{"key": "BAC009S0765W0403", "wav": "./aishell/wav/test/S0765/BAC009S0765W0403.wav", "txt": "保证国家队水平不会被其他球队相差太远的承诺下"} -{"key": "BAC009S0765W0404", "wav": "./aishell/wav/test/S0765/BAC009S0765W0404.wav", "txt": "业已正式启动斯诺登事件电影的拍摄"} -{"key": "BAC009S0765W0405", "wav": "./aishell/wav/test/S0765/BAC009S0765W0405.wav", "txt": "影片发布了第一批定装照"} -{"key": "BAC009S0765W0406", "wav": "./aishell/wav/test/S0765/BAC009S0765W0406.wav", "txt": "以一身越野军装黑框眼镜的造型出现"} -{"key": "BAC009S0765W0407", "wav": "./aishell/wav/test/S0765/BAC009S0765W0407.wav", "txt": "看上去和人物原型相当贴合"} -{"key": "BAC009S0765W0408", "wav": "./aishell/wav/test/S0765/BAC009S0765W0408.wav", "txt": "演技也日渐精湛"} -{"key": "BAC009S0765W0409", "wav": "./aishell/wav/test/S0765/BAC009S0765W0409.wav", "txt": "更有一手好厨艺"} -{"key": "BAC009S0765W0410", "wav": "./aishell/wav/test/S0765/BAC009S0765W0410.wav", "txt": "可说是超完美女神"} -{"key": "BAC009S0765W0411", "wav": "./aishell/wav/test/S0765/BAC009S0765W0411.wav", "txt": "男友却仍然劈腿偷吃"} -{"key": "BAC009S0765W0412", "wav": "./aishell/wav/test/S0765/BAC009S0765W0412.wav", "txt": "好友林心如也心疼喊话我会陪她"} -{"key": "BAC009S0765W0413", "wav": "./aishell/wav/test/S0765/BAC009S0765W0413.wav", "txt": "中新网五月六日电据台湾媒体报道"} -{"key": "BAC009S0765W0414", "wav": "./aishell/wav/test/S0765/BAC009S0765W0414.wav", "txt": "刚与阮经天传出情变不久的许玮甯近日接拍恐怖片"} -{"key": "BAC009S0765W0415", "wav": "./aishell/wav/test/S0765/BAC009S0765W0415.wav", "txt": "称为了演好戏"} -{"key": "BAC009S0765W0416", "wav": "./aishell/wav/test/S0765/BAC009S0765W0416.wav", "txt": "她看了不少恐怖片"} -{"key": "BAC009S0765W0417", "wav": "./aishell/wav/test/S0765/BAC009S0765W0417.wav", "txt": "看完片后会睡不好做恶梦"} -{"key": "BAC009S0765W0418", "wav": "./aishell/wav/test/S0765/BAC009S0765W0418.wav", "txt": "上厕所都要把灯全部打开"} -{"key": "BAC009S0765W0419", "wav": "./aishell/wav/test/S0765/BAC009S0765W0419.wav", "txt": "搜狐娱乐讯日前"} -{"key": "BAC009S0765W0420", "wav": "./aishell/wav/test/S0765/BAC009S0765W0420.wav", "txt": "引发众多粉丝围堵"} -{"key": "BAC009S0765W0421", "wav": "./aishell/wav/test/S0765/BAC009S0765W0421.wav", 
"txt": "玩心大起的许绍洋与玩家一起比拼游戏"} -{"key": "BAC009S0765W0422", "wav": "./aishell/wav/test/S0765/BAC009S0765W0422.wav", "txt": "没想竟然惨败"} -{"key": "BAC009S0765W0423", "wav": "./aishell/wav/test/S0765/BAC009S0765W0423.wav", "txt": "这让自称游戏达人的他颇有些不好意思"} -{"key": "BAC009S0765W0424", "wav": "./aishell/wav/test/S0765/BAC009S0765W0424.wav", "txt": "金陵晚报八月十二日报道二零一四年"} -{"key": "BAC009S0765W0425", "wav": "./aishell/wav/test/S0765/BAC009S0765W0425.wav", "txt": "许茹芸与韩籍男友举行了婚礼"} -{"key": "BAC009S0765W0426", "wav": "./aishell/wav/test/S0765/BAC009S0765W0426.wav", "txt": "迎来了人生崭新阶段"} -{"key": "BAC009S0765W0427", "wav": "./aishell/wav/test/S0765/BAC009S0765W0427.wav", "txt": "不同于大家心中按部就班的乖乖女形象"} -{"key": "BAC009S0765W0428", "wav": "./aishell/wav/test/S0765/BAC009S0765W0428.wav", "txt": "许茹芸突然闪婚让当时的娱乐圈也惊起了一阵小波澜"} -{"key": "BAC009S0765W0429", "wav": "./aishell/wav/test/S0765/BAC009S0765W0429.wav", "txt": "在许茹芸看来"} -{"key": "BAC009S0765W0430", "wav": "./aishell/wav/test/S0765/BAC009S0765W0430.wav", "txt": "但几乎一个都没有实现"} -{"key": "BAC009S0765W0431", "wav": "./aishell/wav/test/S0765/BAC009S0765W0431.wav", "txt": "一四年前轰动东莞沙田的一起命案"} -{"key": "BAC009S0765W0432", "wav": "./aishell/wav/test/S0765/BAC009S0765W0432.wav", "txt": "日前因为广东省高院作出的无罪判决"} -{"key": "BAC009S0765W0433", "wav": "./aishell/wav/test/S0765/BAC009S0765W0433.wav", "txt": "再次吸引了众人的目光"} -{"key": "BAC009S0765W0434", "wav": "./aishell/wav/test/S0765/BAC009S0765W0434.wav", "txt": "八月一七日上午一一时"} -{"key": "BAC009S0765W0435", "wav": "./aishell/wav/test/S0765/BAC009S0765W0435.wav", "txt": "陈传钧从东莞市第二看守所出来"} -{"key": "BAC009S0765W0436", "wav": "./aishell/wav/test/S0765/BAC009S0765W0436.wav", "txt": "这是二零一零年四月二三日以来"} -{"key": "BAC009S0765W0437", "wav": "./aishell/wav/test/S0765/BAC009S0765W0437.wav", "txt": "杀人犯出狱后喊冤被驳回供述与鉴定相印证"} -{"key": "BAC009S0765W0438", "wav": "./aishell/wav/test/S0765/BAC009S0765W0438.wav", "txt": "丈夫关某身负多处刀伤"} -{"key": "BAC009S0765W0439", "wav": "./aishell/wav/test/S0765/BAC009S0765W0439.wav", "txt": "呼救报警时称有人入屋行凶"} -{"key": "BAC009S0765W0440", "wav": "./aishell/wav/test/S0765/BAC009S0765W0440.wav", "txt": "又供称是自己失手杀妻"} -{"key": "BAC009S0765W0441", "wav": "./aishell/wav/test/S0765/BAC009S0765W0441.wav", "txt": "关某先后被判死刑死缓"} -{"key": "BAC009S0765W0442", "wav": "./aishell/wav/test/S0765/BAC009S0765W0442.wav", "txt": "他向广东省高院申诉"} -{"key": "BAC009S0765W0443", "wav": "./aishell/wav/test/S0765/BAC009S0765W0443.wav", "txt": "广东高院审理后驳回了关某的申诉"} -{"key": "BAC009S0765W0444", "wav": "./aishell/wav/test/S0765/BAC009S0765W0444.wav", "txt": "杀人犯受民警感召行刑前捐器官谢罪"} -{"key": "BAC009S0765W0445", "wav": "./aishell/wav/test/S0765/BAC009S0765W0445.wav", "txt": "杀人犯抢劫获刑未查出旧案警方指纹识别有遗漏"} -{"key": "BAC009S0765W0446", "wav": "./aishell/wav/test/S0765/BAC009S0765W0446.wav", "txt": "京华时报记者蒲东峰摄二零零七年"} -{"key": "BAC009S0765W0447", "wav": "./aishell/wav/test/S0765/BAC009S0765W0447.wav", "txt": "时年二三岁的杨柱军在北京抢劫杀害了一名出租车司机"} -{"key": "BAC009S0765W0448", "wav": "./aishell/wav/test/S0765/BAC009S0765W0448.wav", "txt": "此后他没有隐姓埋名逃往外地"} -{"key": "BAC009S0765W0449", "wav": "./aishell/wav/test/S0765/BAC009S0765W0449.wav", "txt": "公安机关并未查出其身上还背着命案"} -{"key": "BAC009S0765W0450", "wav": "./aishell/wav/test/S0765/BAC009S0765W0450.wav", "txt": "并于二零一五年一月将其抓获"} -{"key": "BAC009S0765W0451", "wav": "./aishell/wav/test/S0765/BAC009S0765W0451.wav", "txt": "曾多次比对二零零七年命案现场匕首上的指纹"} -{"key": "BAC009S0765W0452", "wav": "./aishell/wav/test/S0765/BAC009S0765W0452.wav", "txt": "但指纹比对识别系统会出现一定概率的遗漏"} -{"key": "BAC009S0765W0453", "wav": "./aishell/wav/test/S0765/BAC009S0765W0453.wav", "txt": "杨柱军因涉嫌抢劫罪在市二中院受审"} 
-{"key": "BAC009S0765W0454", "wav": "./aishell/wav/test/S0765/BAC009S0765W0454.wav", "txt": "杀害中传失联女主嫌犯想找个无辜的人发泄"} -{"key": "BAC009S0765W0455", "wav": "./aishell/wav/test/S0765/BAC009S0765W0455.wav", "txt": "视频截图新京报快讯记者杨锋昨日"} -{"key": "BAC009S0765W0456", "wav": "./aishell/wav/test/S0765/BAC009S0765W0456.wav", "txt": "杀害中传女学生犯罪嫌疑人从小家庭教育严格"} -{"key": "BAC009S0765W0457", "wav": "./aishell/wav/test/S0765/BAC009S0765W0457.wav", "txt": "失联近两天的中传研究生周云露"} -{"key": "BAC009S0765W0458", "wav": "./aishell/wav/test/S0765/BAC009S0765W0458.wav", "txt": "李斯达表示自己跟周云露并没有深仇大恨"} -{"key": "BAC009S0765W0459", "wav": "./aishell/wav/test/S0765/BAC009S0765W0459.wav", "txt": "称就是想找个无辜的人"} -{"key": "BAC009S0765W0460", "wav": "./aishell/wav/test/S0765/BAC009S0765W0460.wav", "txt": "目前李斯达被关押在朝阳区看守所"} -{"key": "BAC009S0765W0461", "wav": "./aishell/wav/test/S0765/BAC009S0765W0461.wav", "txt": "周云露的父母在昨天上午去过朝阳刑警队"} -{"key": "BAC009S0765W0462", "wav": "./aishell/wav/test/S0765/BAC009S0765W0462.wav", "txt": "杀害中传女生嫌犯曾私藏刺刀同学称其特立独行"} -{"key": "BAC009S0765W0463", "wav": "./aishell/wav/test/S0765/BAC009S0765W0463.wav", "txt": "李斯达手持尖刀的自拍照"} -{"key": "BAC009S0765W0464", "wav": "./aishell/wav/test/S0765/BAC009S0765W0464.wav", "txt": "新京报快讯记者杨锋凌晨今日下午"} -{"key": "BAC009S0765W0465", "wav": "./aishell/wav/test/S0765/BAC009S0765W0465.wav", "txt": "中国传媒大学官网发布消息称"} -{"key": "BAC009S0765W0466", "wav": "./aishell/wav/test/S0765/BAC009S0765W0466.wav", "txt": "在朝阳区百子湾阳光嘉园小区遇害"} -{"key": "BAC009S0765W0467", "wav": "./aishell/wav/test/S0765/BAC009S0765W0467.wav", "txt": "犯罪嫌疑人已被抓获"} -{"key": "BAC009S0765W0468", "wav": "./aishell/wav/test/S0765/BAC009S0765W0468.wav", "txt": "学校正在全力配合公安机关和家属进行善后处理"} -{"key": "BAC009S0765W0469", "wav": "./aishell/wav/test/S0765/BAC009S0765W0469.wav", "txt": "杀害夜跑女子嫌犯不言不语拾荒者身份尚未确认"} -{"key": "BAC009S0765W0470", "wav": "./aishell/wav/test/S0765/BAC009S0765W0470.wav", "txt": "杀害女教师疑犯行凶后脸有伤警方悬赏五万缉拿"} -{"key": "BAC009S0765W0471", "wav": "./aishell/wav/test/S0765/BAC009S0765W0471.wav", "txt": "遇害女教师昨晚七时五七分"} -{"key": "BAC009S0765W0472", "wav": "./aishell/wav/test/S0765/BAC009S0765W0472.wav", "txt": "其作案后身上有大量血迹"} -{"key": "BAC009S0765W0473", "wav": "./aishell/wav/test/S0765/BAC009S0765W0473.wav", "txt": "双手背脸部等裸露部位有刺伤划伤"} -{"key": "BAC009S0765W0474", "wav": "./aishell/wav/test/S0765/BAC009S0765W0474.wav", "txt": "通告呼吁广大群众积极检举揭发提供线索"} -{"key": "BAC009S0765W0475", "wav": "./aishell/wav/test/S0765/BAC009S0765W0475.wav", "txt": "对提供重大线索协助破案者"} -{"key": "BAC009S0765W0476", "wav": "./aishell/wav/test/S0765/BAC009S0765W0476.wav", "txt": "我局将给予五万元奖励"} -{"key": "BAC009S0765W0477", "wav": "./aishell/wav/test/S0765/BAC009S0765W0477.wav", "txt": "杀害女童凶手被抓指认现场上千民众喊打"} -{"key": "BAC009S0765W0478", "wav": "./aishell/wav/test/S0765/BAC009S0765W0478.wav", "txt": "四川广安一一岁女孩的失踪"} -{"key": "BAC009S0765W0479", "wav": "./aishell/wav/test/S0765/BAC009S0765W0479.wav", "txt": "九日晚女孩尸体被找到"} -{"key": "BAC009S0765W0480", "wav": "./aishell/wav/test/S0765/BAC009S0765W0480.wav", "txt": "凶手在郫县安靖镇被抓"} -{"key": "BAC009S0765W0481", "wav": "./aishell/wav/test/S0765/BAC009S0765W0481.wav", "txt": "凶手到岳池县石垭镇指认骗走孩子的现场"} -{"key": "BAC009S0765W0482", "wav": "./aishell/wav/test/S0765/BAC009S0765W0482.wav", "txt": "数千围观人群高呼打死这个杂碎"} -{"key": "BAC009S0765W0483", "wav": "./aishell/wav/test/S0765/BAC009S0765W0483.wav", "txt": "现场喊打声持续不断"} -{"key": "BAC009S0765W0484", "wav": "./aishell/wav/test/S0765/BAC009S0765W0484.wav", "txt": "杀害宝鸡夜跑教师嫌犯落网是否为拾荒者尚无定论"} -{"key": "BAC009S0765W0485", "wav": "./aishell/wav/test/S0765/BAC009S0765W0485.wav", "txt": 
"吕某于一零月一四日晚从家中外出锻炼失踪"} -{"key": "BAC009S0765W0486", "wav": "./aishell/wav/test/S0765/BAC009S0765W0486.wav", "txt": "尸体于一零月二零日在宝鸡渭河公园被发现"} -{"key": "BAC009S0765W0487", "wav": "./aishell/wav/test/S0765/BAC009S0765W0487.wav", "txt": "李克强集众智汇众力攻坚克难激发活力"} -{"key": "BAC009S0765W0488", "wav": "./aishell/wav/test/S0765/BAC009S0765W0488.wav", "txt": "李彬彬喂大象喝水略显老态提醒网友夏天要补水"} -{"key": "BAC009S0765W0489", "wav": "./aishell/wav/test/S0765/BAC009S0765W0489.wav", "txt": "联合国官方微博晒出一张李彬彬喂大象喝水的照片"} -{"key": "BAC009S0765W0490", "wav": "./aishell/wav/test/S0765/BAC009S0765W0490.wav", "txt": "华西都市报讯记者杜恩湖一零月二四日中午一二时"} -{"key": "BAC009S0765W0491", "wav": "./aishell/wav/test/S0765/BAC009S0765W0491.wav", "txt": "一零月二三曰现身成都平乐古城"} -{"key": "BAC009S0765W0492", "wav": "./aishell/wav/test/S0765/BAC009S0765W0492.wav", "txt": "应邀参加第二届天府古镇艺术节"} -{"key": "BAC009S0765W0493", "wav": "./aishell/wav/test/S0765/BAC009S0765W0493.wav", "txt": "现场李双江受到了观众的热烈欢迎"} -{"key": "BAC009S0765W0494", "wav": "./aishell/wav/test/S0765/BAC009S0765W0494.wav", "txt": "二零零多幅珍贵油画抵达南京"} -{"key": "BAC009S0765W0495", "wav": "./aishell/wav/test/S0765/BAC009S0765W0495.wav", "txt": "李嘉诚军师抛售马云一五亿买香港最贵单价豪宅"} -{"key": "BAC009S0766W0121", "wav": "./aishell/wav/test/S0766/BAC009S0766W0121.wav", "txt": "实现数字化整合营销"} -{"key": "BAC009S0766W0122", "wav": "./aishell/wav/test/S0766/BAC009S0766W0122.wav", "txt": "是当今广告行业的需要"} -{"key": "BAC009S0766W0123", "wav": "./aishell/wav/test/S0766/BAC009S0766W0123.wav", "txt": "消费者行为的变化及技术的进步"} -{"key": "BAC009S0766W0124", "wav": "./aishell/wav/test/S0766/BAC009S0766W0124.wav", "txt": "催生了广告领域新的变革和创新"} -{"key": "BAC009S0766W0125", "wav": "./aishell/wav/test/S0766/BAC009S0766W0125.wav", "txt": "唯有实力雄厚又颇具现代创新意识的广告企业"} -{"key": "BAC009S0766W0126", "wav": "./aishell/wav/test/S0766/BAC009S0766W0126.wav", "txt": "今久整合营销集团就是如此"} -{"key": "BAC009S0766W0127", "wav": "./aishell/wav/test/S0766/BAC009S0766W0127.wav", "txt": "成为圈子里首屈一指的超大企业"} -{"key": "BAC009S0766W0128", "wav": "./aishell/wav/test/S0766/BAC009S0766W0128.wav", "txt": "自成立以来"} -{"key": "BAC009S0766W0129", "wav": "./aishell/wav/test/S0766/BAC009S0766W0129.wav", "txt": "服务项目几千个"} -{"key": "BAC009S0766W0130", "wav": "./aishell/wav/test/S0766/BAC009S0766W0130.wav", "txt": "开创了蔓延全国的青年社区概念"} -{"key": "BAC009S0766W0131", "wav": "./aishell/wav/test/S0766/BAC009S0766W0131.wav", "txt": "确立了无人撼动的行业老大地位"} -{"key": "BAC009S0766W0132", "wav": "./aishell/wav/test/S0766/BAC009S0766W0132.wav", "txt": "成为房地产最信任的营销公司"} -{"key": "BAC009S0766W0133", "wav": "./aishell/wav/test/S0766/BAC009S0766W0133.wav", "txt": "然而这家雄心勃勃的公司并未止步于此"} -{"key": "BAC009S0766W0134", "wav": "./aishell/wav/test/S0766/BAC009S0766W0134.wav", "txt": "一个以互联网和大数据为核心的时代已经到来"} -{"key": "BAC009S0766W0135", "wav": "./aishell/wav/test/S0766/BAC009S0766W0135.wav", "txt": "今久必须担当起引领时代潮流的重任"} -{"key": "BAC009S0766W0136", "wav": "./aishell/wav/test/S0766/BAC009S0766W0136.wav", "txt": "蓝色光标以几亿人民币收购今久"} -{"key": "BAC009S0766W0137", "wav": "./aishell/wav/test/S0766/BAC009S0766W0137.wav", "txt": "这成为今久转型的起点"} -{"key": "BAC009S0766W0138", "wav": "./aishell/wav/test/S0766/BAC009S0766W0138.wav", "txt": "依托蓝色光标强大的技术和资源优势"} -{"key": "BAC009S0766W0139", "wav": "./aishell/wav/test/S0766/BAC009S0766W0139.wav", "txt": "今久率先提出整合营销的概念"} -{"key": "BAC009S0766W0140", "wav": "./aishell/wav/test/S0766/BAC009S0766W0140.wav", "txt": "其核心在于利用数字化工具"} -{"key": "BAC009S0766W0141", "wav": "./aishell/wav/test/S0766/BAC009S0766W0141.wav", "txt": "为房地产商提供系统化的服务"} -{"key": "BAC009S0766W0142", "wav": "./aishell/wav/test/S0766/BAC009S0766W0142.wav", "txt": 
"整合营销实现了从策略到执行的系统化服务"} -{"key": "BAC009S0766W0143", "wav": "./aishell/wav/test/S0766/BAC009S0766W0143.wav", "txt": "当地产商的效果预期不断提高"} -{"key": "BAC009S0766W0144", "wav": "./aishell/wav/test/S0766/BAC009S0766W0144.wav", "txt": "这时候更要求服务商具备思考和行动的一致性"} -{"key": "BAC009S0766W0145", "wav": "./aishell/wav/test/S0766/BAC009S0766W0145.wav", "txt": "这样也为开发商节省了运营成本"} -{"key": "BAC009S0766W0146", "wav": "./aishell/wav/test/S0766/BAC009S0766W0146.wav", "txt": "整合营销是利用全案思维和大数据技术"} -{"key": "BAC009S0766W0147", "wav": "./aishell/wav/test/S0766/BAC009S0766W0147.wav", "txt": "市场上就出现了各类新型技术软件"} -{"key": "BAC009S0766W0148", "wav": "./aishell/wav/test/S0766/BAC009S0766W0148.wav", "txt": "但大多是雷声大雨点小"} -{"key": "BAC009S0766W0149", "wav": "./aishell/wav/test/S0766/BAC009S0766W0149.wav", "txt": "与房地产商的需求相去甚远"} -{"key": "BAC009S0766W0150", "wav": "./aishell/wav/test/S0766/BAC009S0766W0150.wav", "txt": "大数据营销需要的是强大的技术实力"} -{"key": "BAC009S0766W0151", "wav": "./aishell/wav/test/S0766/BAC009S0766W0151.wav", "txt": "而非某些功能的简单嫁接"} -{"key": "BAC009S0766W0152", "wav": "./aishell/wav/test/S0766/BAC009S0766W0152.wav", "txt": "蓝色光标作为全球首屈一指的广告服务商"} -{"key": "BAC009S0766W0153", "wav": "./aishell/wav/test/S0766/BAC009S0766W0153.wav", "txt": "在大数据上的技术优势无可匹敌"} -{"key": "BAC009S0766W0154", "wav": "./aishell/wav/test/S0766/BAC009S0766W0154.wav", "txt": "今久正是在蓝色光标的技术支持下"} -{"key": "BAC009S0766W0155", "wav": "./aishell/wav/test/S0766/BAC009S0766W0155.wav", "txt": "实现了大数据营销的创新"} -{"key": "BAC009S0766W0156", "wav": "./aishell/wav/test/S0766/BAC009S0766W0156.wav", "txt": "许多数字新产品"} -{"key": "BAC009S0766W0157", "wav": "./aishell/wav/test/S0766/BAC009S0766W0157.wav", "txt": "广泛应用于移动端"} -{"key": "BAC009S0766W0158", "wav": "./aishell/wav/test/S0766/BAC009S0766W0158.wav", "txt": "分析用户的消费行为和生活方式"} -{"key": "BAC009S0766W0159", "wav": "./aishell/wav/test/S0766/BAC009S0766W0159.wav", "txt": "帮助广告主找出目标用户"} -{"key": "BAC009S0766W0160", "wav": "./aishell/wav/test/S0766/BAC009S0766W0160.wav", "txt": "然后对广告信息进行精确匹配"} -{"key": "BAC009S0766W0161", "wav": "./aishell/wav/test/S0766/BAC009S0766W0161.wav", "txt": "达到降低成本提升营销效果的目的"} -{"key": "BAC009S0766W0162", "wav": "./aishell/wav/test/S0766/BAC009S0766W0162.wav", "txt": "今久在大举创新的同时"} -{"key": "BAC009S0766W0163", "wav": "./aishell/wav/test/S0766/BAC009S0766W0163.wav", "txt": "保持原有业务的正常运作"} -{"key": "BAC009S0766W0164", "wav": "./aishell/wav/test/S0766/BAC009S0766W0164.wav", "txt": "这才是一个大企业应该有的战略方向"} -{"key": "BAC009S0766W0165", "wav": "./aishell/wav/test/S0766/BAC009S0766W0165.wav", "txt": "带动了区域板块的扩张"} -{"key": "BAC009S0766W0166", "wav": "./aishell/wav/test/S0766/BAC009S0766W0166.wav", "txt": "在海南成立了分公司"} -{"key": "BAC009S0766W0167", "wav": "./aishell/wav/test/S0766/BAC009S0766W0167.wav", "txt": "现在已经是海南本土最大的房地产推广公司"} -{"key": "BAC009S0766W0168", "wav": "./aishell/wav/test/S0766/BAC009S0766W0168.wav", "txt": "拥有许多优质客户"} -{"key": "BAC009S0766W0169", "wav": "./aishell/wav/test/S0766/BAC009S0766W0169.wav", "txt": "今久上海分公司又悄无声息地开张了"} -{"key": "BAC009S0766W0170", "wav": "./aishell/wav/test/S0766/BAC009S0766W0170.wav", "txt": "新媒体推广的业务扩张"} -{"key": "BAC009S0766W0171", "wav": "./aishell/wav/test/S0766/BAC009S0766W0171.wav", "txt": "逐渐地撬开了上海这个外来公司很难生根的大都市"} -{"key": "BAC009S0766W0172", "wav": "./aishell/wav/test/S0766/BAC009S0766W0172.wav", "txt": "郑州长春和哈尔滨三地办事处"} -{"key": "BAC009S0766W0173", "wav": "./aishell/wav/test/S0766/BAC009S0766W0173.wav", "txt": "用蓝色光标强大的新媒体技术和资源"} -{"key": "BAC009S0766W0174", "wav": "./aishell/wav/test/S0766/BAC009S0766W0174.wav", "txt": "搭起了全国地产推广新媒体的版图"} -{"key": "BAC009S0766W0175", 
"wav": "./aishell/wav/test/S0766/BAC009S0766W0175.wav", "txt": "今久又出高价"} -{"key": "BAC009S0766W0176", "wav": "./aishell/wav/test/S0766/BAC009S0766W0176.wav", "txt": "收购了房地产互联网营销公司沈阳新维一半股份"} -{"key": "BAC009S0766W0177", "wav": "./aishell/wav/test/S0766/BAC009S0766W0177.wav", "txt": "今久又一次利用资本市场"} -{"key": "BAC009S0766W0178", "wav": "./aishell/wav/test/S0766/BAC009S0766W0178.wav", "txt": "实现区域扩张"} -{"key": "BAC009S0766W0179", "wav": "./aishell/wav/test/S0766/BAC009S0766W0179.wav", "txt": "区域产品和业务三大层面"} -{"key": "BAC009S0766W0180", "wav": "./aishell/wav/test/S0766/BAC009S0766W0180.wav", "txt": "今久成功实现了转型"} -{"key": "BAC009S0766W0181", "wav": "./aishell/wav/test/S0766/BAC009S0766W0181.wav", "txt": "后今久时代正式到来"} -{"key": "BAC009S0766W0182", "wav": "./aishell/wav/test/S0766/BAC009S0766W0182.wav", "txt": "转型后的今久整合营销集团"} -{"key": "BAC009S0766W0183", "wav": "./aishell/wav/test/S0766/BAC009S0766W0183.wav", "txt": "在全球大数据浪潮中"} -{"key": "BAC009S0766W0184", "wav": "./aishell/wav/test/S0766/BAC009S0766W0184.wav", "txt": "依托蓝色光标的强大平台"} -{"key": "BAC009S0766W0185", "wav": "./aishell/wav/test/S0766/BAC009S0766W0185.wav", "txt": "助力中国房地产开发企业发掘并实现更大的价值需求"} -{"key": "BAC009S0766W0186", "wav": "./aishell/wav/test/S0766/BAC009S0766W0186.wav", "txt": "在机遇与挑战共存的互联网时代"} -{"key": "BAC009S0766W0187", "wav": "./aishell/wav/test/S0766/BAC009S0766W0187.wav", "txt": "发行利率也有较大幅度上升"} -{"key": "BAC009S0766W0188", "wav": "./aishell/wav/test/S0766/BAC009S0766W0188.wav", "txt": "人民银行多次提高存款准备金率和存贷款基准利率"} -{"key": "BAC009S0766W0189", "wav": "./aishell/wav/test/S0766/BAC009S0766W0189.wav", "txt": "不仅是城投债券发行利率"} -{"key": "BAC009S0766W0190", "wav": "./aishell/wav/test/S0766/BAC009S0766W0190.wav", "txt": "债券市场所有品种发行利率整体上都表现出向上的走向"} -{"key": "BAC009S0766W0191", "wav": "./aishell/wav/test/S0766/BAC009S0766W0191.wav", "txt": "导致城投债券发行产生较高的风险溢价"} -{"key": "BAC009S0766W0192", "wav": "./aishell/wav/test/S0766/BAC009S0766W0192.wav", "txt": "城投债券收益率上升"} -{"key": "BAC009S0766W0193", "wav": "./aishell/wav/test/S0766/BAC009S0766W0193.wav", "txt": "对债券投资人来说不是坏事"} -{"key": "BAC009S0766W0194", "wav": "./aishell/wav/test/S0766/BAC009S0766W0194.wav", "txt": "有利于提升城投债券的资产配置价值"} -{"key": "BAC009S0766W0195", "wav": "./aishell/wav/test/S0766/BAC009S0766W0195.wav", "txt": "则需要在发债时机和发债规模上进行合理的把握"} -{"key": "BAC009S0766W0196", "wav": "./aishell/wav/test/S0766/BAC009S0766W0196.wav", "txt": "我个人不赞成这一判断"} -{"key": "BAC009S0766W0197", "wav": "./aishell/wav/test/S0766/BAC009S0766W0197.wav", "txt": "债券发行人是优质的"} -{"key": "BAC009S0766W0198", "wav": "./aishell/wav/test/S0766/BAC009S0766W0198.wav", "txt": "还本付息也是正常的"} -{"key": "BAC009S0766W0199", "wav": "./aishell/wav/test/S0766/BAC009S0766W0199.wav", "txt": "投资者对城投债券风险表现出的恐慌"} -{"key": "BAC009S0766W0200", "wav": "./aishell/wav/test/S0766/BAC009S0766W0200.wav", "txt": "加强城投债监管完善制度建设"} -{"key": "BAC009S0766W0201", "wav": "./aishell/wav/test/S0766/BAC009S0766W0201.wav", "txt": "有的媒体甚至用井喷来描述"} -{"key": "BAC009S0766W0202", "wav": "./aishell/wav/test/S0766/BAC009S0766W0202.wav", "txt": "您如何看待城投债券这几年的发展和作用"} -{"key": "BAC009S0766W0203", "wav": "./aishell/wav/test/S0766/BAC009S0766W0203.wav", "txt": "徐林这几年城投债券发行数量的确有所增加"} -{"key": "BAC009S0766W0204", "wav": "./aishell/wav/test/S0766/BAC009S0766W0204.wav", "txt": "地方投融资平台公司通过发行债券进行融资"} -{"key": "BAC009S0766W0205", "wav": "./aishell/wav/test/S0766/BAC009S0766W0205.wav", "txt": "符合提高直接融资比重的要求"} -{"key": "BAC009S0766W0206", "wav": "./aishell/wav/test/S0766/BAC009S0766W0206.wav", "txt": "城投债券也适应了发行人和投资人的需要"} -{"key": "BAC009S0766W0207", "wav": 
"./aishell/wav/test/S0766/BAC009S0766W0207.wav", "txt": "这是这几年城投债券发行规模不断扩大的主要原因"} -{"key": "BAC009S0766W0208", "wav": "./aishell/wav/test/S0766/BAC009S0766W0208.wav", "txt": "我委核准发行的企业债券累计为七千亿元"} -{"key": "BAC009S0766W0209", "wav": "./aishell/wav/test/S0766/BAC009S0766W0209.wav", "txt": "其中城投债券共发行七千亿元"} -{"key": "BAC009S0766W0210", "wav": "./aishell/wav/test/S0766/BAC009S0766W0210.wav", "txt": "占比只有百分之七"} -{"key": "BAC009S0766W0211", "wav": "./aishell/wav/test/S0766/BAC009S0766W0211.wav", "txt": "城投债券的发行有比较严格的条件"} -{"key": "BAC009S0766W0212", "wav": "./aishell/wav/test/S0766/BAC009S0766W0212.wav", "txt": "从已发行的城投债券用途看"} -{"key": "BAC009S0766W0213", "wav": "./aishell/wav/test/S0766/BAC009S0766W0213.wav", "txt": "保障房建设和棚户区改造"} -{"key": "BAC009S0766W0214", "wav": "./aishell/wav/test/S0766/BAC009S0766W0214.wav", "txt": "城市文化和体育设施"} -{"key": "BAC009S0766W0215", "wav": "./aishell/wav/test/S0766/BAC009S0766W0215.wav", "txt": "地震灾后重建等领域"} -{"key": "BAC009S0766W0216", "wav": "./aishell/wav/test/S0766/BAC009S0766W0216.wav", "txt": "都起到了积极的作用"} -{"key": "BAC009S0766W0217", "wav": "./aishell/wav/test/S0766/BAC009S0766W0217.wav", "txt": "随着我国资本市场的进一步发展"} -{"key": "BAC009S0766W0218", "wav": "./aishell/wav/test/S0766/BAC009S0766W0218.wav", "txt": "城投债券作为中国债券市场的准市政债"} -{"key": "BAC009S0766W0219", "wav": "./aishell/wav/test/S0766/BAC009S0766W0219.wav", "txt": "发行规模还会稳步扩大"} -{"key": "BAC009S0766W0220", "wav": "./aishell/wav/test/S0766/BAC009S0766W0220.wav", "txt": "中国证券报面对市场对城投债券风险的担忧"} -{"key": "BAC009S0766W0221", "wav": "./aishell/wav/test/S0766/BAC009S0766W0221.wav", "txt": "是如何更好地防范城投债券可能出现的风险的"} -{"key": "BAC009S0766W0222", "wav": "./aishell/wav/test/S0766/BAC009S0766W0222.wav", "txt": "虽然已发行的城投债券的还本付息都是正常的"} -{"key": "BAC009S0766W0223", "wav": "./aishell/wav/test/S0766/BAC009S0766W0223.wav", "txt": "城投债作为一个信用产品"} -{"key": "BAC009S0766W0224", "wav": "./aishell/wav/test/S0766/BAC009S0766W0224.wav", "txt": "不可能是完全无风险的"} -{"key": "BAC009S0766W0225", "wav": "./aishell/wav/test/S0766/BAC009S0766W0225.wav", "txt": "我看了以后很受震动"} -{"key": "BAC009S0766W0226", "wav": "./aishell/wav/test/S0766/BAC009S0766W0226.wav", "txt": "虽然报道内容并没有具体的城投债券还本付息违约案"} -{"key": "BAC009S0766W0227", "wav": "./aishell/wav/test/S0766/BAC009S0766W0227.wav", "txt": "但却提醒了我们要更加关注城投债券可能出现的风险"} -{"key": "BAC009S0766W0228", "wav": "./aishell/wav/test/S0766/BAC009S0766W0228.wav", "txt": "并采取措施切实保护债券投资人的合法权益"} -{"key": "BAC009S0766W0229", "wav": "./aishell/wav/test/S0766/BAC009S0766W0229.wav", "txt": "作为城投债券发行监管部门"} -{"key": "BAC009S0766W0230", "wav": "./aishell/wav/test/S0766/BAC009S0766W0230.wav", "txt": "我们对城投债券发行人的审核一直是比较严格的"} -{"key": "BAC009S0766W0231", "wav": "./aishell/wav/test/S0766/BAC009S0766W0231.wav", "txt": "地方投融资平台公司申请发行债券"} -{"key": "BAC009S0766W0232", "wav": "./aishell/wav/test/S0766/BAC009S0766W0232.wav", "txt": "必须符合一些基本的条件企业必须连续三年盈利"} -{"key": "BAC009S0766W0233", "wav": "./aishell/wav/test/S0766/BAC009S0766W0233.wav", "txt": "所投项目必须经过合规性审查"} -{"key": "BAC009S0766W0234", "wav": "./aishell/wav/test/S0766/BAC009S0766W0234.wav", "txt": "我们还控制了投融资平台公司发债的范围"} -{"key": "BAC009S0766W0235", "wav": "./aishell/wav/test/S0766/BAC009S0766W0235.wav", "txt": "才能申请发行城投债券"} -{"key": "BAC009S0766W0236", "wav": "./aishell/wav/test/S0766/BAC009S0766W0236.wav", "txt": "就不得再通过发行城投债券新增政府性债务"} -{"key": "BAC009S0766W0237", "wav": "./aishell/wav/test/S0766/BAC009S0766W0237.wav", "txt": "正是有了这样一些严格的规定"} -{"key": "BAC009S0766W0238", "wav": "./aishell/wav/test/S0766/BAC009S0766W0238.wav", "txt": "使得很多投融资平台公司"} -{"key": "BAC009S0766W0239", "wav": 
"./aishell/wav/test/S0766/BAC009S0766W0239.wav", "txt": "难以满足发行城投债券的资格和条件"} -{"key": "BAC009S0766W0240", "wav": "./aishell/wav/test/S0766/BAC009S0766W0240.wav", "txt": "这在相当程度上控制了城投债券的发行规模"} -{"key": "BAC009S0766W0241", "wav": "./aishell/wav/test/S0766/BAC009S0766W0241.wav", "txt": "也降低了城投债券的风险"} -{"key": "BAC009S0766W0242", "wav": "./aishell/wav/test/S0766/BAC009S0766W0242.wav", "txt": "为了控制地方政府本届发债下届还钱的道德风险"} -{"key": "BAC009S0766W0243", "wav": "./aishell/wav/test/S0766/BAC009S0766W0243.wav", "txt": "我们还安排了专门的偿债均摊机制"} -{"key": "BAC009S0766W0244", "wav": "./aishell/wav/test/S0766/BAC009S0766W0244.wav", "txt": "也就是将债券还本压力在债券存续期内进行合理分摊"} -{"key": "BAC009S0766W0245", "wav": "./aishell/wav/test/S0766/BAC009S0766W0245.wav", "txt": "避免在最后一年累积过大的还本压力和风险"} -{"key": "BAC009S0766W0246", "wav": "./aishell/wav/test/S0766/BAC009S0766W0246.wav", "txt": "有媒体报道了云投集团等发债企业转移核心资产"} -{"key": "BAC009S0766W0247", "wav": "./aishell/wav/test/S0766/BAC009S0766W0247.wav", "txt": "损害债券持有人利益的事件"} -{"key": "BAC009S0766W0248", "wav": "./aishell/wav/test/S0766/BAC009S0766W0248.wav", "txt": "并对债券市场形成了不小的冲击"} -{"key": "BAC009S0766W0249", "wav": "./aishell/wav/test/S0766/BAC009S0766W0249.wav", "txt": "我们如何考虑防止这类事件再次发生"} -{"key": "BAC009S0766W0250", "wav": "./aishell/wav/test/S0766/BAC009S0766W0250.wav", "txt": "更好地保护债券投资人的利益"} -{"key": "BAC009S0766W0251", "wav": "./aishell/wav/test/S0766/BAC009S0766W0251.wav", "txt": "徐林发债企业在债券存续期内进行资产转移"} -{"key": "BAC009S0766W0252", "wav": "./aishell/wav/test/S0766/BAC009S0766W0252.wav", "txt": "极有可能对债券持有人利益构成不利影响"} -{"key": "BAC009S0766W0253", "wav": "./aishell/wav/test/S0766/BAC009S0766W0253.wav", "txt": "华尔街的半兽人已经为他的离开紧锣密鼓地敲起退堂鼓"} -{"key": "BAC009S0766W0257", "wav": "./aishell/wav/test/S0766/BAC009S0766W0257.wav", "txt": "问题是他也想不出谁能干得更好"} -{"key": "BAC009S0766W0260", "wav": "./aishell/wav/test/S0766/BAC009S0766W0260.wav", "txt": "但亏损却达到了一点八亿美元"} -{"key": "BAC009S0766W0262", "wav": "./aishell/wav/test/S0766/BAC009S0766W0262.wav", "txt": "不应从用户身上榨取广告收入"} -{"key": "BAC009S0766W0263", "wav": "./aishell/wav/test/S0766/BAC009S0766W0263.wav", "txt": "试问又有哪位有魔法能挽回巨额亏损呢"} -{"key": "BAC009S0766W0268", "wav": "./aishell/wav/test/S0766/BAC009S0766W0268.wav", "txt": "对于高中生来说这会有点令人尴尬罢了"} -{"key": "BAC009S0766W0269", "wav": "./aishell/wav/test/S0766/BAC009S0766W0269.wav", "txt": "可对于一个成年人来说算什么"} -{"key": "BAC009S0766W0270", "wav": "./aishell/wav/test/S0766/BAC009S0766W0270.wav", "txt": "还有他对日本文化的迷恋"} -{"key": "BAC009S0766W0271", "wav": "./aishell/wav/test/S0766/BAC009S0766W0271.wav", "txt": "然后又要去竞选纽约州长"} -{"key": "BAC009S0766W0273", "wav": "./aishell/wav/test/S0766/BAC009S0766W0273.wav", "txt": "跟星巴克的合作就是灾难"} -{"key": "BAC009S0766W0285", "wav": "./aishell/wav/test/S0766/BAC009S0766W0285.wav", "txt": "梅姐待的已算够长了"} -{"key": "BAC009S0766W0287", "wav": "./aishell/wav/test/S0766/BAC009S0766W0287.wav", "txt": "但是至少会给股价刺激一下"} -{"key": "BAC009S0766W0288", "wav": "./aishell/wav/test/S0766/BAC009S0766W0288.wav", "txt": "而梅姐则可以陪陪小孩或者去搞搞政治"} -{"key": "BAC009S0766W0293", "wav": "./aishell/wav/test/S0766/BAC009S0766W0293.wav", "txt": "但亏损达到了一点七亿美元"} -{"key": "BAC009S0766W0294", "wav": "./aishell/wav/test/S0766/BAC009S0766W0294.wav", "txt": "这样的成绩已经比二零一三年要好"} -{"key": "BAC009S0766W0296", "wav": "./aishell/wav/test/S0766/BAC009S0766W0296.wav", "txt": "十年都还没赚钱的话"} -{"key": "BAC009S0766W0299", "wav": "./aishell/wav/test/S0766/BAC009S0766W0299.wav", "txt": "它已经失去了作为独立公司的存在意义"} -{"key": "BAC009S0766W0303", "wav": "./aishell/wav/test/S0766/BAC009S0766W0303.wav", "txt": "同比增长一百三十四点七百分之三"} -{"key": "BAC009S0766W0304", 
"wav": "./aishell/wav/test/S0766/BAC009S0766W0304.wav", "txt": "归属于上市公司股东的净利润二十五十二万元"} -{"key": "BAC009S0766W0305", "wav": "./aishell/wav/test/S0766/BAC009S0766W0305.wav", "txt": "去年同期则是亏损二百四十二万元"} -{"key": "BAC009S0766W0306", "wav": "./aishell/wav/test/S0766/BAC009S0766W0306.wav", "txt": "同比增长十一五十六点四百分之二"} -{"key": "BAC009S0766W0307", "wav": "./aishell/wav/test/S0766/BAC009S0766W0307.wav", "txt": "公司锂电池业务实现营业收入四点零一亿元"} -{"key": "BAC009S0766W0308", "wav": "./aishell/wav/test/S0766/BAC009S0766W0308.wav", "txt": "同比增长二百六十八点五百分之一"} -{"key": "BAC009S0766W0309", "wav": "./aishell/wav/test/S0766/BAC009S0766W0309.wav", "txt": "成飞集成相关人士告诉每日经济新闻记者"} -{"key": "BAC009S0766W0310", "wav": "./aishell/wav/test/S0766/BAC009S0766W0310.wav", "txt": "前两年锂电池行业整体处于市场培育期"} -{"key": "BAC009S0766W0311", "wav": "./aishell/wav/test/S0766/BAC009S0766W0311.wav", "txt": "虽然国家在二零一零年就颁布了新能源补贴政策"} -{"key": "BAC009S0766W0312", "wav": "./aishell/wav/test/S0766/BAC009S0766W0312.wav", "txt": "但是市场启动不像预期那么快"} -{"key": "BAC009S0766W0313", "wav": "./aishell/wav/test/S0766/BAC009S0766W0313.wav", "txt": "基本上是从二零一四年下半年才有明显的感觉"} -{"key": "BAC009S0766W0314", "wav": "./aishell/wav/test/S0766/BAC009S0766W0314.wav", "txt": "目前公司锂电池订单比较充足"} -{"key": "BAC009S0766W0315", "wav": "./aishell/wav/test/S0766/BAC009S0766W0315.wav", "txt": "由于传统汽车产业步入寒冬"} -{"key": "BAC009S0766W0316", "wav": "./aishell/wav/test/S0766/BAC009S0766W0316.wav", "txt": "汽车厂商纷纷转型新能源汽车"} -{"key": "BAC009S0766W0317", "wav": "./aishell/wav/test/S0766/BAC009S0766W0317.wav", "txt": "新能源汽车的爆发使得锂电池供不应求"} -{"key": "BAC009S0766W0318", "wav": "./aishell/wav/test/S0766/BAC009S0766W0318.wav", "txt": "随着锂电池产业链迎来井喷"} -{"key": "BAC009S0766W0319", "wav": "./aishell/wav/test/S0766/BAC009S0766W0319.wav", "txt": "锂电需求带动业绩增长"} -{"key": "BAC009S0766W0320", "wav": "./aishell/wav/test/S0766/BAC009S0766W0320.wav", "txt": "成飞集成的锂电池业务在前两年情况并不好"} -{"key": "BAC009S0766W0321", "wav": "./aishell/wav/test/S0766/BAC009S0766W0321.wav", "txt": "新能源汽车市场在逐步启动"} -{"key": "BAC009S0766W0322", "wav": "./aishell/wav/test/S0766/BAC009S0766W0322.wav", "txt": "锂电池市场也在向好"} -{"key": "BAC009S0766W0323", "wav": "./aishell/wav/test/S0766/BAC009S0766W0323.wav", "txt": "成飞集成相关人士告诉记者"} -{"key": "BAC009S0766W0324", "wav": "./aishell/wav/test/S0766/BAC009S0766W0324.wav", "txt": "这是今年上半年锂电池业务爆发的原因"} -{"key": "BAC009S0766W0325", "wav": "./aishell/wav/test/S0766/BAC009S0766W0325.wav", "txt": "成飞集成的其他主营业务中"} -{"key": "BAC009S0766W0326", "wav": "./aishell/wav/test/S0766/BAC009S0766W0326.wav", "txt": "汽车模具以及汽车零部件表现一般"} -{"key": "BAC009S0766W0327", "wav": "./aishell/wav/test/S0766/BAC009S0766W0327.wav", "txt": "汽车模具实现营收一点一零一亿元"} -{"key": "BAC009S0766W0328", "wav": "./aishell/wav/test/S0766/BAC009S0766W0328.wav", "txt": "毛利率为十八点百分之三"} -{"key": "BAC009S0766W0329", "wav": "./aishell/wav/test/S0766/BAC009S0766W0329.wav", "txt": "毛利率为十九点六百分之六"} -{"key": "BAC009S0766W0330", "wav": "./aishell/wav/test/S0766/BAC009S0766W0330.wav", "txt": "同比增长四点百分之三十五"} -{"key": "BAC009S0766W0331", "wav": "./aishell/wav/test/S0766/BAC009S0766W0331.wav", "txt": "但是由于该项业务占比较小"} -{"key": "BAC009S0766W0332", "wav": "./aishell/wav/test/S0766/BAC009S0766W0332.wav", "txt": "所以对业绩的影响有限"} -{"key": "BAC009S0766W0333", "wav": "./aishell/wav/test/S0766/BAC009S0766W0333.wav", "txt": "汽车零部件总体规模不大"} -{"key": "BAC009S0766W0334", "wav": "./aishell/wav/test/S0766/BAC009S0766W0334.wav", "txt": "汽车模具毛利率下滑"} -{"key": "BAC009S0766W0335", "wav": "./aishell/wav/test/S0766/BAC009S0766W0335.wav", "txt": "一方面是由于上半年模具的比较基数较低"} -{"key": "BAC009S0766W0336", "wav": 
"./aishell/wav/test/S0766/BAC009S0766W0336.wav", "txt": "也就是去年和今年上半年营收总额都不高"} -{"key": "BAC009S0766W0337", "wav": "./aishell/wav/test/S0766/BAC009S0766W0337.wav", "txt": "另外今年上半年个别订单的价格也比较低"} -{"key": "BAC009S0766W0339", "wav": "./aishell/wav/test/S0766/BAC009S0766W0339.wav", "txt": "这一状况有望发生改变"} -{"key": "BAC009S0766W0340", "wav": "./aishell/wav/test/S0766/BAC009S0766W0340.wav", "txt": "七年之后宋安东二十五岁"} -{"key": "BAC009S0766W0341", "wav": "./aishell/wav/test/S0766/BAC009S0766W0341.wav", "txt": "正是冰球运动员的黄金年龄"} -{"key": "BAC009S0766W0342", "wav": "./aishell/wav/test/S0766/BAC009S0766W0342.wav", "txt": "年少成名的他带领国家队出征冬奥会"} -{"key": "BAC009S0766W0343", "wav": "./aishell/wav/test/S0766/BAC009S0766W0343.wav", "txt": "铁定会有助于提升我国的冰球水平"} -{"key": "BAC009S0766W0344", "wav": "./aishell/wav/test/S0766/BAC009S0766W0344.wav", "txt": "进而提升我国在冬奥会申办过程中的竞争力"} -{"key": "BAC009S0766W0345", "wav": "./aishell/wav/test/S0766/BAC009S0766W0345.wav", "txt": "二零二二年冬奥会在北京举行"} -{"key": "BAC009S0766W0346", "wav": "./aishell/wav/test/S0766/BAC009S0766W0346.wav", "txt": "以宋安东为首的运动员们可以说是鲜活的冬奥名片"} -{"key": "BAC009S0766W0347", "wav": "./aishell/wav/test/S0766/BAC009S0766W0347.wav", "txt": "让越来越多人关注并参与到其中来"} -{"key": "BAC009S0766W0348", "wav": "./aishell/wav/test/S0766/BAC009S0766W0348.wav", "txt": "建设三个相对集聚的场馆群"} -{"key": "BAC009S0766W0349", "wav": "./aishell/wav/test/S0766/BAC009S0766W0349.wav", "txt": "申奥过程本身已经推动了城际交通建设"} -{"key": "BAC009S0766W0350", "wav": "./aishell/wav/test/S0766/BAC009S0766W0350.wav", "txt": "因此对于北京申办冬奥会的最终结果"} -{"key": "BAC009S0766W0351", "wav": "./aishell/wav/test/S0766/BAC009S0766W0351.wav", "txt": "我们应该抱着更加长远和开阔的视角来看待"} -{"key": "BAC009S0766W0352", "wav": "./aishell/wav/test/S0766/BAC009S0766W0352.wav", "txt": "更要期待着中国冰雪运动真正强大起来的那一天"} -{"key": "BAC009S0766W0353", "wav": "./aishell/wav/test/S0766/BAC009S0766W0353.wav", "txt": "法国冰协同于放人五度世界冠军即将复出搜狐体育"} -{"key": "BAC009S0766W0354", "wav": "./aishell/wav/test/S0766/BAC009S0766W0354.wav", "txt": "北京时间十月二十七日"} -{"key": "BAC009S0766W0355", "wav": "./aishell/wav/test/S0766/BAC009S0766W0355.wav", "txt": "经过将近一年时间的漫长谈判"} -{"key": "BAC009S0766W0356", "wav": "./aishell/wav/test/S0766/BAC009S0766W0356.wav", "txt": "法国花样滑冰联合会终于同意"} -{"key": "BAC009S0766W0357", "wav": "./aishell/wav/test/S0766/BAC009S0766W0357.wav", "txt": "允许布鲁诺马塞洛特代表德国"} -{"key": "BAC009S0766W0358", "wav": "./aishell/wav/test/S0766/BAC009S0766W0358.wav", "txt": "两人的更改国籍禁赛期即将开始"} -{"key": "BAC009S0766W0359", "wav": "./aishell/wav/test/S0766/BAC009S0766W0359.wav", "txt": "这也意味着最晚在明年的各项赛事中"} -{"key": "BAC009S0766W0360", "wav": "./aishell/wav/test/S0766/BAC009S0766W0360.wav", "txt": "我们就能看到这对强大组合的身影"} -{"key": "BAC009S0766W0361", "wav": "./aishell/wav/test/S0766/BAC009S0766W0361.wav", "txt": "在金牌搭档罗宾索尔科维退役之后"} -{"key": "BAC009S0766W0362", "wav": "./aishell/wav/test/S0766/BAC009S0766W0362.wav", "txt": "萨维申科宣布会再坚持一个冬奥会周期"} -{"key": "BAC009S0766W0363", "wav": "./aishell/wav/test/S0766/BAC009S0766W0363.wav", "txt": "她所选择的新搭档就是法国猛男马塞洛特"} -{"key": "BAC009S0766W0364", "wav": "./aishell/wav/test/S0766/BAC009S0766W0364.wav", "txt": "但是因为涉及到男伴更改国籍问题"} -{"key": "BAC009S0766W0365", "wav": "./aishell/wav/test/S0766/BAC009S0766W0365.wav", "txt": "两人的联手十分不顺利"} -{"key": "BAC009S0766W0366", "wav": "./aishell/wav/test/S0766/BAC009S0766W0366.wav", "txt": "这也让他们虽然可以参加小型赛事"} -{"key": "BAC009S0766W0367", "wav": "./aishell/wav/test/S0766/BAC009S0766W0367.wav", "txt": "但是由于国籍不统一"} -{"key": "BAC009S0766W0368", "wav": "./aishell/wav/test/S0766/BAC009S0766W0368.wav", "txt": "无法参加奥运会的比赛"} -{"key": "BAC009S0766W0369", "wav": 
"./aishell/wav/test/S0766/BAC009S0766W0369.wav", "txt": "对于法国冰协的行为"} -{"key": "BAC009S0766W0370", "wav": "./aishell/wav/test/S0766/BAC009S0766W0370.wav", "txt": "不少粉丝都表达了谴责"} -{"key": "BAC009S0766W0371", "wav": "./aishell/wav/test/S0766/BAC009S0766W0371.wav", "txt": "支持他们继续训练参加比赛"} -{"key": "BAC009S0766W0372", "wav": "./aishell/wav/test/S0766/BAC009S0766W0372.wav", "txt": "显然处于最艰难时期的两人丝毫没有放弃"} -{"key": "BAC009S0766W0373", "wav": "./aishell/wav/test/S0766/BAC009S0766W0373.wav", "txt": "休赛期内他们参加了小型赛事"} -{"key": "BAC009S0766W0374", "wav": "./aishell/wav/test/S0766/BAC009S0766W0374.wav", "txt": "从目前已经传出的视频来看"} -{"key": "BAC009S0766W0375", "wav": "./aishell/wav/test/S0766/BAC009S0766W0375.wav", "txt": "男伴更是在最新公布的视频中"} -{"key": "BAC009S0766W0376", "wav": "./aishell/wav/test/S0766/BAC009S0766W0376.wav", "txt": "他们的不放弃换来了成功"} -{"key": "BAC009S0766W0377", "wav": "./aishell/wav/test/S0766/BAC009S0766W0377.wav", "txt": "马塞洛特的母亲表示"} -{"key": "BAC009S0766W0378", "wav": "./aishell/wav/test/S0766/BAC009S0766W0378.wav", "txt": "法国冰协方面的态度有了缓和"} -{"key": "BAC009S0766W0379", "wav": "./aishell/wav/test/S0766/BAC009S0766W0379.wav", "txt": "法国冰协提出最后要求"} -{"key": "BAC009S0766W0380", "wav": "./aishell/wav/test/S0766/BAC009S0766W0380.wav", "txt": "要求马塞洛特缴纳七万欧元的转国籍费用"} -{"key": "BAC009S0766W0381", "wav": "./aishell/wav/test/S0766/BAC009S0766W0381.wav", "txt": "随后冰迷们自发为其网上募集资金"} -{"key": "BAC009S0766W0382", "wav": "./aishell/wav/test/S0766/BAC009S0766W0382.wav", "txt": "马塞洛特来到法国花样滑冰联合会"} -{"key": "BAC009S0766W0383", "wav": "./aishell/wav/test/S0766/BAC009S0766W0383.wav", "txt": "双方进行了最后一次也是最成功的一次洽谈"} -{"key": "BAC009S0766W0384", "wav": "./aishell/wav/test/S0766/BAC009S0766W0384.wav", "txt": "能够保障他的职业生涯发展是我的荣幸"} -{"key": "BAC009S0766W0385", "wav": "./aishell/wav/test/S0766/BAC009S0766W0385.wav", "txt": "恭喜他与萨维申科走上正确的道路"} -{"key": "BAC009S0766W0386", "wav": "./aishell/wav/test/S0766/BAC009S0766W0386.wav", "txt": "我们的朋友将代表德国"} -{"key": "BAC009S0766W0387", "wav": "./aishell/wav/test/S0766/BAC009S0766W0387.wav", "txt": "继续征战花样滑冰的比赛"} -{"key": "BAC009S0766W0388", "wav": "./aishell/wav/test/S0766/BAC009S0766W0388.wav", "txt": "今年第二位离开法国冰协更换国籍的选手"} -{"key": "BAC009S0766W0389", "wav": "./aishell/wav/test/S0766/BAC009S0766W0389.wav", "txt": "对于这个万众期待的消息"} -{"key": "BAC009S0766W0390", "wav": "./aishell/wav/test/S0766/BAC009S0766W0390.wav", "txt": "我可以带着它回家了"} -{"key": "BAC009S0766W0391", "wav": "./aishell/wav/test/S0766/BAC009S0766W0391.wav", "txt": "谢谢每一个支持我们的人"} -{"key": "BAC009S0766W0392", "wav": "./aishell/wav/test/S0766/BAC009S0766W0392.wav", "txt": "没有你们的支持我们该如何度过最挣扎的时期呢"} -{"key": "BAC009S0766W0393", "wav": "./aishell/wav/test/S0766/BAC009S0766W0393.wav", "txt": "是时候去努力工作了"} -{"key": "BAC009S0766W0394", "wav": "./aishell/wav/test/S0766/BAC009S0766W0394.wav", "txt": "他们的禁赛期即将开始"} -{"key": "BAC009S0766W0395", "wav": "./aishell/wav/test/S0766/BAC009S0766W0395.wav", "txt": "我们或许就将看到他们征战各类大型赛事的身影"} -{"key": "BAC009S0766W0396", "wav": "./aishell/wav/test/S0766/BAC009S0766W0396.wav", "txt": "对于隋文静韩聪彭程张昊领衔的中国双人滑军团"} -{"key": "BAC009S0766W0397", "wav": "./aishell/wav/test/S0766/BAC009S0766W0397.wav", "txt": "五度世锦赛冠军萨维申科联手年轻新搭档马塞洛特"} -{"key": "BAC009S0766W0398", "wav": "./aishell/wav/test/S0766/BAC009S0766W0398.wav", "txt": "这会是一对绝对强大的对手"} -{"key": "BAC009S0766W0399", "wav": "./aishell/wav/test/S0766/BAC009S0766W0399.wav", "txt": "经过近两个星期的漫长等待"} -{"key": "BAC009S0766W0400", "wav": "./aishell/wav/test/S0766/BAC009S0766W0400.wav", "txt": "北京时间八月九日一五三零"} -{"key": "BAC009S0766W0401", "wav": 
"./aishell/wav/test/S0766/BAC009S0766W0401.wav", "txt": "为观众们奉上昆仑决鏖战香江的精彩赛事"} -{"key": "BAC009S0766W0402", "wav": "./aishell/wav/test/S0766/BAC009S0766W0402.wav", "txt": "泰拳黑王子播求无疑同小皇帝詹姆斯最为相似"} -{"key": "BAC009S0766W0403", "wav": "./aishell/wav/test/S0766/BAC009S0766W0403.wav", "txt": "并在各自领域中享受着各自粉丝们帝王般的顶礼膜拜"} -{"key": "BAC009S0766W0404", "wav": "./aishell/wav/test/S0766/BAC009S0766W0404.wav", "txt": "曾以分歧者星运里的错窜红的谢琳伍德蕾"} -{"key": "BAC009S0766W0405", "wav": "./aishell/wav/test/S0766/BAC009S0766W0405.wav", "txt": "将出演影片的女主角"} -{"key": "BAC009S0766W0406", "wav": "./aishell/wav/test/S0766/BAC009S0766W0406.wav", "txt": "搜狐娱乐讯文耷子备受关注的重拍版乌鸦"} -{"key": "BAC009S0766W0407", "wav": "./aishell/wav/test/S0766/BAC009S0766W0407.wav", "txt": "在经历了无数次的导演和演员更换之后"} -{"key": "BAC009S0766W0408", "wav": "./aishell/wav/test/S0766/BAC009S0766W0408.wav", "txt": "除了去年结婚"} -{"key": "BAC009S0766W0409", "wav": "./aishell/wav/test/S0766/BAC009S0766W0409.wav", "txt": "有时候人生是计划赶不上变化的"} -{"key": "BAC009S0766W0410", "wav": "./aishell/wav/test/S0766/BAC009S0766W0410.wav", "txt": "就顺着你的感觉走就好了"} -{"key": "BAC009S0766W0411", "wav": "./aishell/wav/test/S0766/BAC009S0766W0411.wav", "txt": "日前在初赛收官战中返场的她加盟猜评团"} -{"key": "BAC009S0766W0412", "wav": "./aishell/wav/test/S0766/BAC009S0766W0412.wav", "txt": "一袭土豪金西装简直潮爆"} -{"key": "BAC009S0766W0413", "wav": "./aishell/wav/test/S0766/BAC009S0766W0413.wav", "txt": "有眼尖的网友发现"} -{"key": "BAC009S0766W0414", "wav": "./aishell/wav/test/S0766/BAC009S0766W0414.wav", "txt": "与孙楠巫启贤共同起立鼓掌的许茹芸小腹凸起"} -{"key": "BAC009S0766W0415", "wav": "./aishell/wav/test/S0766/BAC009S0766W0415.wav", "txt": "搜狐娱乐讯九月十二日"} -{"key": "BAC009S0766W0416", "wav": "./aishell/wav/test/S0766/BAC009S0766W0416.wav", "txt": "许茹芸与韩国丈夫崔栽诚迎来结婚一周年纪念日"} -{"key": "BAC009S0766W0417", "wav": "./aishell/wav/test/S0766/BAC009S0766W0417.wav", "txt": "许茹芸特地发微博感谢婚姻带来的幸福"} -{"key": "BAC009S0766W0418", "wav": "./aishell/wav/test/S0766/BAC009S0766W0418.wav", "txt": "许茹芸重回舞台不做苦情女娱乐频道"} -{"key": "BAC009S0766W0419", "wav": "./aishell/wav/test/S0766/BAC009S0766W0419.wav", "txt": "华西都市报讯闪婚欧巴一年后二零一四年"} -{"key": "BAC009S0766W0420", "wav": "./aishell/wav/test/S0766/BAC009S0766W0420.wav", "txt": "许茹芸与韩籍男朋友举行了婚礼"} -{"key": "BAC009S0766W0421", "wav": "./aishell/wav/test/S0766/BAC009S0766W0421.wav", "txt": "迎来了人生崭新阶段"} -{"key": "BAC009S0766W0422", "wav": "./aishell/wav/test/S0766/BAC009S0766W0422.wav", "txt": "不同于大家心中按部就班的乖乖女形象"} -{"key": "BAC009S0766W0423", "wav": "./aishell/wav/test/S0766/BAC009S0766W0423.wav", "txt": "许茹芸突然闪婚让当时的娱乐圈也惊起了一阵小波澜"} -{"key": "BAC009S0766W0424", "wav": "./aishell/wav/test/S0766/BAC009S0766W0424.wav", "txt": "此后便鲜有消息"} -{"key": "BAC009S0766W0425", "wav": "./aishell/wav/test/S0766/BAC009S0766W0425.wav", "txt": "和往日的端庄淑女形象大有不同"} -{"key": "BAC009S0766W0426", "wav": "./aishell/wav/test/S0766/BAC009S0766W0426.wav", "txt": "对于重回舞台夺下当日歌王"} -{"key": "BAC009S0766W0427", "wav": "./aishell/wav/test/S0766/BAC009S0766W0427.wav", "txt": "她也坦言内心感触很多"} -{"key": "BAC009S0766W0428", "wav": "./aishell/wav/test/S0766/BAC009S0766W0428.wav", "txt": "论眉毛重要性"} -{"key": "BAC009S0766W0430", "wav": "./aishell/wav/test/S0766/BAC009S0766W0430.wav", "txt": "中新网五月七日电据台湾中国时报消息"} -{"key": "BAC009S0766W0431", "wav": "./aishell/wav/test/S0766/BAC009S0766W0431.wav", "txt": "李嘉诚回应撤资不爱国指控完全不成立"} -{"key": "BAC009S0766W0432", "wav": "./aishell/wav/test/S0766/BAC009S0766W0432.wav", "txt": "李嘉诚首次公开回应撤资不爱国等质疑"} -{"key": "BAC009S0766W0433", "wav": "./aishell/wav/test/S0766/BAC009S0766W0433.wav", "txt": "称一篇似是而非的文章"} -{"key": "BAC009S0766W0434", "wav": 
"./aishell/wav/test/S0766/BAC009S0766W0434.wav", "txt": "在其发给记者的新闻稿中说"} -{"key": "BAC009S0766W0435", "wav": "./aishell/wav/test/S0766/BAC009S0766W0435.wav", "txt": "所谓撤资指控完全不成立"} -{"key": "BAC009S0766W0436", "wav": "./aishell/wav/test/S0766/BAC009S0766W0436.wav", "txt": "其集团在全球拥有一三零零零间店铺"} -{"key": "BAC009S0766W0437", "wav": "./aishell/wav/test/S0766/BAC009S0766W0437.wav", "txt": "其中内地由两年前的一三零零间增至今天的二三零零间"} -{"key": "BAC009S0766W0438", "wav": "./aishell/wav/test/S0766/BAC009S0766W0438.wav", "txt": "李嘉诚怎么回答与中央关系有变"} -{"key": "BAC009S0766W0439", "wav": "./aishell/wav/test/S0766/BAC009S0766W0439.wav", "txt": "李嘉诚或再抛售内地地产项目拟出售上海办公楼"} -{"key": "BAC009S0766W0440", "wav": "./aishell/wav/test/S0766/BAC009S0766W0440.wav", "txt": "中国日报网八月三日电据华尔街日报三日报道"} -{"key": "BAC009S0766W0441", "wav": "./aishell/wav/test/S0766/BAC009S0766W0441.wav", "txt": "据两名知情人透露"} -{"key": "BAC009S0766W0442", "wav": "./aishell/wav/test/S0766/BAC009S0766W0442.wav", "txt": "李嘉诚正式回应撤资指控不相信文革式思维复苏"} -{"key": "BAC009S0766W0443", "wav": "./aishell/wav/test/S0766/BAC009S0766W0443.wav", "txt": "李嘉诚首次对撤资做出回应"} -{"key": "BAC009S0766W0444", "wav": "./aishell/wav/test/S0766/BAC009S0766W0444.wav", "txt": "我明白言论自由是一把两刃刀"} -{"key": "BAC009S0766W0445", "wav": "./aishell/wav/test/S0766/BAC009S0766W0445.wav", "txt": "因此一篇似是而非的文章"} -{"key": "BAC009S0766W0446", "wav": "./aishell/wav/test/S0766/BAC009S0766W0446.wav", "txt": "李嘉诚首次回应撤资传闻对中国发展充满信心"} -{"key": "BAC009S0766W0447", "wav": "./aishell/wav/test/S0766/BAC009S0766W0447.wav", "txt": "中新网九月三零日电据香港文汇报报道"} -{"key": "BAC009S0766W0448", "wav": "./aishell/wav/test/S0766/BAC009S0766W0448.wav", "txt": "对中央坚定不移继续改革开放"} -{"key": "BAC009S0766W0449", "wav": "./aishell/wav/test/S0766/BAC009S0766W0449.wav", "txt": "致力优化营商环境有信心"} -{"key": "BAC009S0766W0450", "wav": "./aishell/wav/test/S0766/BAC009S0766W0450.wav", "txt": "对中国发展充满信心"} -{"key": "BAC009S0766W0451", "wav": "./aishell/wav/test/S0766/BAC009S0766W0451.wav", "txt": "李娜产女后首次亮相运动员掌握英语很重要"} -{"key": "BAC009S0766W0452", "wav": "./aishell/wav/test/S0766/BAC009S0766W0452.wav", "txt": "李娜在一个商业活动中谈退役后的生活"} -{"key": "BAC009S0766W0453", "wav": "./aishell/wav/test/S0766/BAC009S0766W0453.wav", "txt": "李娜不想大家一直记得我那说明中国网球没突破"} -{"key": "BAC009S0766W0454", "wav": "./aishell/wav/test/S0766/BAC009S0766W0454.wav", "txt": "虽然已经退役近一年"} -{"key": "BAC009S0766W0455", "wav": "./aishell/wav/test/S0766/BAC009S0766W0455.wav", "txt": "但曾经的中国网球一姐李娜仍然没有淡出媒体的关注"} -{"key": "BAC009S0766W0456", "wav": "./aishell/wav/test/S0766/BAC009S0766W0456.wav", "txt": "李娜媒体用一次性参赛是对运动员的侮辱"} -{"key": "BAC009S0766W0457", "wav": "./aishell/wav/test/S0766/BAC009S0766W0457.wav", "txt": "长江商报消息本报记者张萌昨日"} -{"key": "BAC009S0766W0458", "wav": "./aishell/wav/test/S0766/BAC009S0766W0458.wav", "txt": "家居养娃的李娜又重新出现在媒体大众的面前"} -{"key": "BAC009S0766W0459", "wav": "./aishell/wav/test/S0766/BAC009S0766W0459.wav", "txt": "带着辛吉斯逛完了黄鹤楼"} -{"key": "BAC009S0766W0460", "wav": "./aishell/wav/test/S0766/BAC009S0766W0460.wav", "txt": "当日的新闻发布会上"} -{"key": "BAC009S0766W0461", "wav": "./aishell/wav/test/S0766/BAC009S0766W0461.wav", "txt": "李娜一身素色休闲装"} -{"key": "BAC009S0766W0462", "wav": "./aishell/wav/test/S0766/BAC009S0766W0462.wav", "txt": "走进了武网的新闻大厅"} -{"key": "BAC009S0766W0463", "wav": "./aishell/wav/test/S0766/BAC009S0766W0463.wav", "txt": "她身上少了些以往的悍将拼劲"} -{"key": "BAC009S0766W0464", "wav": "./aishell/wav/test/S0766/BAC009S0766W0464.wav", "txt": "多了初为人母的幸福光彩"} -{"key": "BAC009S0766W0465", "wav": "./aishell/wav/test/S0766/BAC009S0766W0465.wav", "txt": "看似犀利不再的娜姐老将气场立刻显出"} -{"key": "BAC009S0766W0466", "wav": 
"./aishell/wav/test/S0766/BAC009S0766W0466.wav", "txt": "一语回击一次性参赛这种说法是一种侮辱"} -{"key": "BAC009S0766W0467", "wav": "./aishell/wav/test/S0766/BAC009S0766W0467.wav", "txt": "希望媒体不要用这样的词来形容所有网球运动员"} -{"key": "BAC009S0766W0468", "wav": "./aishell/wav/test/S0766/BAC009S0766W0468.wav", "txt": "因为没有哪个运动员不想表现出最好的自己"} -{"key": "BAC009S0766W0469", "wav": "./aishell/wav/test/S0766/BAC009S0766W0469.wav", "txt": "李岚清座谈戏称自己八零后退休不等于生命终结"} -{"key": "BAC009S0766W0470", "wav": "./aishell/wav/test/S0766/BAC009S0766W0470.wav", "txt": "不知不觉我成为一个八零后的老头"} -{"key": "BAC009S0766W0471", "wav": "./aishell/wav/test/S0766/BAC009S0766W0471.wav", "txt": "退休后不在其位不谋其政"} -{"key": "BAC009S0766W0472", "wav": "./aishell/wav/test/S0766/BAC009S0766W0472.wav", "txt": "退休并不等于生命的终结"} -{"key": "BAC009S0766W0473", "wav": "./aishell/wav/test/S0766/BAC009S0766W0473.wav", "txt": "如果放弃学习没有追求"} -{"key": "BAC009S0766W0474", "wav": "./aishell/wav/test/S0766/BAC009S0766W0474.wav", "txt": "一个人的精神生命就将走向衰老"} -{"key": "BAC009S0766W0475", "wav": "./aishell/wav/test/S0766/BAC009S0766W0475.wav", "txt": "因此我给自己规划了八个字的退休生活"} -{"key": "BAC009S0766W0476", "wav": "./aishell/wav/test/S0766/BAC009S0766W0476.wav", "txt": "戏称自己年过八零当为八零后"} -{"key": "BAC009S0766W0477", "wav": "./aishell/wav/test/S0766/BAC009S0766W0477.wav", "txt": "李开复经历死亡这一课学会看透和放下"} -{"key": "BAC009S0766W0478", "wav": "./aishell/wav/test/S0766/BAC009S0766W0478.wav", "txt": "李开复被医生宣判为第四期淋巴癌"} -{"key": "BAC009S0766W0479", "wav": "./aishell/wav/test/S0766/BAC009S0766W0479.wav", "txt": "不期而至的阴霾让他被迫抛下工作"} -{"key": "BAC009S0766W0480", "wav": "./aishell/wav/test/S0766/BAC009S0766W0480.wav", "txt": "在新书向死而生我修的死亡学分中"} -{"key": "BAC009S0766W0481", "wav": "./aishell/wav/test/S0766/BAC009S0766W0481.wav", "txt": "我从没想过自己竟会出版一本这样的书"} -{"key": "BAC009S0766W0482", "wav": "./aishell/wav/test/S0766/BAC009S0766W0482.wav", "txt": "李晨马震就是玩笑任何情况都力挺范冰冰"} -{"key": "BAC009S0766W0483", "wav": "./aishell/wav/test/S0766/BAC009S0766W0483.wav", "txt": "新京报快讯记者刘玮近日"} -{"key": "BAC009S0766W0484", "wav": "./aishell/wav/test/S0766/BAC009S0766W0484.wav", "txt": "由于电影王朝的女人杨贵妃中的一场激情戏"} -{"key": "BAC009S0766W0485", "wav": "./aishell/wav/test/S0766/BAC009S0766W0485.wav", "txt": "范冰冰承包了娱乐头条"} -{"key": "BAC009S0766W0486", "wav": "./aishell/wav/test/S0766/BAC009S0766W0486.wav", "txt": "出席活动时笑称今后拍激情戏会征求男友李晨的意见"} -{"key": "BAC009S0766W0487", "wav": "./aishell/wav/test/S0766/BAC009S0766W0487.wav", "txt": "李晨秀才遇到兵发布会后回应称"} -{"key": "BAC009S0766W0488", "wav": "./aishell/wav/test/S0766/BAC009S0766W0488.wav", "txt": "如果这个事情反过来"} -{"key": "BAC009S0766W0489", "wav": "./aishell/wav/test/S0766/BAC009S0766W0489.wav", "txt": "演员这个职业就是这样"} -{"key": "BAC009S0766W0490", "wav": "./aishell/wav/test/S0766/BAC009S0766W0490.wav", "txt": "李玉刚张学友黄琦雯入选一零大最涨姿势歌曲"} -{"key": "BAC009S0766W0491", "wav": "./aishell/wav/test/S0766/BAC009S0766W0491.wav", "txt": "李玉刚新歌点击逾一亿网友李家每人只需半次"} -{"key": "BAC009S0766W0492", "wav": "./aishell/wav/test/S0766/BAC009S0766W0492.wav", "txt": "李玉刚饰演的杨贵妃被指芳华绝代说到神曲"} -{"key": "BAC009S0766W0493", "wav": "./aishell/wav/test/S0766/BAC009S0766W0493.wav", "txt": "该歌曲二零零字的歌词用典竟达三六处之多"} -{"key": "BAC009S0766W0494", "wav": "./aishell/wav/test/S0766/BAC009S0766W0494.wav", "txt": "让一些网友有如猜谜"} -{"key": "BAC009S0766W0495", "wav": "./aishell/wav/test/S0766/BAC009S0766W0495.wav", "txt": "被称为二零一五年第一神曲"} -{"key": "BAC009S0767W0121", "wav": "./aishell/wav/test/S0767/BAC009S0767W0121.wav", "txt": "时刻保持创新和变革意识"} -{"key": "BAC009S0767W0122", "wav": "./aishell/wav/test/S0767/BAC009S0767W0122.wav", "txt": "引领中国房地产广告行业走向新的黄金时代"} -{"key": 
"BAC009S0767W0123", "wav": "./aishell/wav/test/S0767/BAC009S0767W0123.wav", "txt": "今久整合营销集团迎来了它的十岁生日"} -{"key": "BAC009S0767W0124", "wav": "./aishell/wav/test/S0767/BAC009S0767W0124.wav", "txt": "今久从最初的几十个人"} -{"key": "BAC009S0767W0125", "wav": "./aishell/wav/test/S0767/BAC009S0767W0125.wav", "txt": "今久商品房销售额首次上涨"} -{"key": "BAC009S0767W0126", "wav": "./aishell/wav/test/S0767/BAC009S0767W0126.wav", "txt": "房地产投资增速仍下降"} -{"key": "BAC009S0767W0127", "wav": "./aishell/wav/test/S0767/BAC009S0767W0127.wav", "txt": "大智慧阿思达克通讯社"} -{"key": "BAC009S0767W0128", "wav": "./aishell/wav/test/S0767/BAC009S0767W0128.wav", "txt": "一五年一月份"} -{"key": "BAC009S0767W0129", "wav": "./aishell/wav/test/S0767/BAC009S0767W0129.wav", "txt": "全国房地产开发投资三万亿元"} -{"key": "BAC009S0767W0130", "wav": "./aishell/wav/test/S0767/BAC009S0767W0130.wav", "txt": "同比名义增长许多"} -{"key": "BAC009S0767W0131", "wav": "./aishell/wav/test/S0767/BAC009S0767W0131.wav", "txt": "增速比一月份回落零点九个百分点"} -{"key": "BAC009S0767W0132", "wav": "./aishell/wav/test/S0767/BAC009S0767W0132.wav", "txt": "全国商品房销售额两万亿元"} -{"key": "BAC009S0767W0133", "wav": "./aishell/wav/test/S0767/BAC009S0767W0133.wav", "txt": "年内首次出现同比增长"} -{"key": "BAC009S0767W0134", "wav": "./aishell/wav/test/S0767/BAC009S0767W0134.wav", "txt": "住宅销售额也增长了"} -{"key": "BAC009S0767W0135", "wav": "./aishell/wav/test/S0767/BAC009S0767W0135.wav", "txt": "办公楼销售额下降了"} -{"key": "BAC009S0767W0136", "wav": "./aishell/wav/test/S0767/BAC009S0767W0136.wav", "txt": "商业营业用房销售额下降了"} -{"key": "BAC009S0767W0137", "wav": "./aishell/wav/test/S0767/BAC009S0767W0137.wav", "txt": "住宅成为全国房地产销售金额唯一增长的板块"} -{"key": "BAC009S0767W0138", "wav": "./aishell/wav/test/S0767/BAC009S0767W0138.wav", "txt": "一系列楼市新政效果逐步显现"} -{"key": "BAC009S0767W0139", "wav": "./aishell/wav/test/S0767/BAC009S0767W0139.wav", "txt": "德佑链家市场研究部总监陆骑麟表示"} -{"key": "BAC009S0767W0140", "wav": "./aishell/wav/test/S0767/BAC009S0767W0140.wav", "txt": "全国房地产开发投资增速仍然延续了增速放缓的渠势"} -{"key": "BAC009S0767W0141", "wav": "./aishell/wav/test/S0767/BAC009S0767W0141.wav", "txt": "尽管有央行降息等各方利好刺激"} -{"key": "BAC009S0767W0142", "wav": "./aishell/wav/test/S0767/BAC009S0767W0142.wav", "txt": "尤其是库存高企的三四线城市"} -{"key": "BAC009S0767W0143", "wav": "./aishell/wav/test/S0767/BAC009S0767W0143.wav", "txt": "开发商仍然面临着较大的销售压力"} -{"key": "BAC009S0767W0144", "wav": "./aishell/wav/test/S0767/BAC009S0767W0144.wav", "txt": "国家统计局公布的数据显示"} -{"key": "BAC009S0767W0145", "wav": "./aishell/wav/test/S0767/BAC009S0767W0145.wav", "txt": "无论是东部中部还是西部地区"} -{"key": "BAC009S0767W0146", "wav": "./aishell/wav/test/S0767/BAC009S0767W0146.wav", "txt": "商品房房的销售面积同比数据出现好转"} -{"key": "BAC009S0767W0147", "wav": "./aishell/wav/test/S0767/BAC009S0767W0147.wav", "txt": "商品房销售面积三亿平方米"} -{"key": "BAC009S0767W0148", "wav": "./aishell/wav/test/S0767/BAC009S0767W0148.wav", "txt": "降幅比四月份收窄六个百分点"} -{"key": "BAC009S0767W0149", "wav": "./aishell/wav/test/S0767/BAC009S0767W0149.wav", "txt": "在公积金松绑等作用的刺激下"} -{"key": "BAC009S0767W0150", "wav": "./aishell/wav/test/S0767/BAC009S0767W0150.wav", "txt": "五月份商品房销售的面积同比数据由负转正"} -{"key": "BAC009S0767W0151", "wav": "./aishell/wav/test/S0767/BAC009S0767W0151.wav", "txt": "作为三四线城市最为集中的中部地区来说"} -{"key": "BAC009S0767W0152", "wav": "./aishell/wav/test/S0767/BAC009S0767W0152.wav", "txt": "房地产开发企业土地购置面积很大"} -{"key": "BAC009S0767W0153", "wav": "./aishell/wav/test/S0767/BAC009S0767W0153.wav", "txt": "同比下降不少"} -{"key": "BAC009S0767W0154", "wav": "./aishell/wav/test/S0767/BAC009S0767W0154.wav", "txt": "降幅收窄三个版百分点"} -{"key": "BAC009S0767W0155", "wav": 
"./aishell/wav/test/S0767/BAC009S0767W0155.wav", "txt": "各方原因的叠加导致了房企拿地量的明显减少"} -{"key": "BAC009S0767W0156", "wav": "./aishell/wav/test/S0767/BAC009S0767W0156.wav", "txt": "今年一线城市住宅用地价格涨五成"} -{"key": "BAC009S0767W0157", "wav": "./aishell/wav/test/S0767/BAC009S0767W0157.wav", "txt": "今年商品房销售一度低迷"} -{"key": "BAC009S0767W0158", "wav": "./aishell/wav/test/S0767/BAC009S0767W0158.wav", "txt": "一线城市土地市场成交火热"} -{"key": "BAC009S0767W0159", "wav": "./aishell/wav/test/S0767/BAC009S0767W0159.wav", "txt": "中介机构统计数据显示"} -{"key": "BAC009S0767W0160", "wav": "./aishell/wav/test/S0767/BAC009S0767W0160.wav", "txt": "平均价格为十万元每平方米"} -{"key": "BAC009S0767W0161", "wav": "./aishell/wav/test/S0767/BAC009S0767W0161.wav", "txt": "同比上涨五成"} -{"key": "BAC009S0767W0162", "wav": "./aishell/wav/test/S0767/BAC009S0767W0162.wav", "txt": "随着一线城市楼市企温回升"} -{"key": "BAC009S0767W0163", "wav": "./aishell/wav/test/S0767/BAC009S0767W0163.wav", "txt": "房企在一线城市拿地的热情还将提高"} -{"key": "BAC009S0767W0164", "wav": "./aishell/wav/test/S0767/BAC009S0767W0164.wav", "txt": "土地市场热度可能有所下降"} -{"key": "BAC009S0767W0165", "wav": "./aishell/wav/test/S0767/BAC009S0767W0165.wav", "txt": "大型房企低迷期拿地"} -{"key": "BAC009S0767W0166", "wav": "./aishell/wav/test/S0767/BAC009S0767W0166.wav", "txt": "中原地产市场研究部统计数据显示"} -{"key": "BAC009S0767W0167", "wav": "./aishell/wav/test/S0767/BAC009S0767W0167.wav", "txt": "土地成交价款三千亿元"} -{"key": "BAC009S0767W0168", "wav": "./aishell/wav/test/S0767/BAC009S0767W0168.wav", "txt": "预计全年有望突破四千亿元"} -{"key": "BAC009S0767W0169", "wav": "./aishell/wav/test/S0767/BAC009S0767W0169.wav", "txt": "一线城市住宅用地平均价格为十一万元每平方米"} -{"key": "BAC009S0767W0170", "wav": "./aishell/wav/test/S0767/BAC009S0767W0170.wav", "txt": "同比上涨约六成"} -{"key": "BAC009S0767W0171", "wav": "./aishell/wav/test/S0767/BAC009S0767W0171.wav", "txt": "中原地产首席分析师张大伟认为"} -{"key": "BAC009S0767W0172", "wav": "./aishell/wav/test/S0767/BAC009S0767W0172.wav", "txt": "住宅市场交易明显升温"} -{"key": "BAC009S0767W0173", "wav": "./aishell/wav/test/S0767/BAC009S0767W0173.wav", "txt": "迅速带动一线城市土地市场的整体成交"} -{"key": "BAC009S0767W0174", "wav": "./aishell/wav/test/S0767/BAC009S0767W0174.wav", "txt": "房地产业正经历一个调整阶段"} -{"key": "BAC009S0767W0175", "wav": "./aishell/wav/test/S0767/BAC009S0767W0175.wav", "txt": "大型房企实施低迷期拿地的策略"} -{"key": "BAC009S0767W0176", "wav": "./aishell/wav/test/S0767/BAC009S0767W0176.wav", "txt": "在整体市场供大于求区域分化严重的情况下"} -{"key": "BAC009S0767W0177", "wav": "./aishell/wav/test/S0767/BAC009S0767W0177.wav", "txt": "房企更加愿意扎堆一线城市"} -{"key": "BAC009S0767W0178", "wav": "./aishell/wav/test/S0767/BAC009S0767W0178.wav", "txt": "而非在三四线城市深耕"} -{"key": "BAC009S0767W0179", "wav": "./aishell/wav/test/S0767/BAC009S0767W0179.wav", "txt": "这使得一线城市的土地竞争激烈"} -{"key": "BAC009S0767W0180", "wav": "./aishell/wav/test/S0767/BAC009S0767W0180.wav", "txt": "今年一线城市宅地成交的溢价率不高"} -{"key": "BAC009S0767W0181", "wav": "./aishell/wav/test/S0767/BAC009S0767W0181.wav", "txt": "平均溢价率较低"} -{"key": "BAC009S0767W0182", "wav": "./aishell/wav/test/S0767/BAC009S0767W0182.wav", "txt": "较去年明显下降"} -{"key": "BAC009S0767W0183", "wav": "./aishell/wav/test/S0767/BAC009S0767W0183.wav", "txt": "这是因为土地一级开发成本提高"} -{"key": "BAC009S0767W0184", "wav": "./aishell/wav/test/S0767/BAC009S0767W0184.wav", "txt": "一线城市住宅用地的低价不断抬升"} -{"key": "BAC009S0767W0185", "wav": "./aishell/wav/test/S0767/BAC009S0767W0185.wav", "txt": "北京等城市在土地出让中"} -{"key": "BAC009S0767W0186", "wav": "./aishell/wav/test/S0767/BAC009S0767W0186.wav", "txt": "将保障房地块和商品房地块捆绑出让"} -{"key": "BAC009S0767W0187", "wav": "./aishell/wav/test/S0767/BAC009S0767W0187.wav", "txt": 
"直接涉及到债券持有人利益的保护问题"} -{"key": "BAC009S0767W0188", "wav": "./aishell/wav/test/S0767/BAC009S0767W0188.wav", "txt": "我们立即与云投集团进行了沟通"} -{"key": "BAC009S0767W0189", "wav": "./aishell/wav/test/S0767/BAC009S0767W0189.wav", "txt": "并严格按照合规程序进行"} -{"key": "BAC009S0767W0190", "wav": "./aishell/wav/test/S0767/BAC009S0767W0190.wav", "txt": "我委也注意到在企业债券存续期间"} -{"key": "BAC009S0767W0191", "wav": "./aishell/wav/test/S0767/BAC009S0767W0191.wav", "txt": "需要对发行人资产重组等重大事宜加强监管"} -{"key": "BAC009S0767W0192", "wav": "./aishell/wav/test/S0767/BAC009S0767W0192.wav", "txt": "在制度上对债券持有人的合法权益进行保护"} -{"key": "BAC009S0767W0193", "wav": "./aishell/wav/test/S0767/BAC009S0767W0193.wav", "txt": "建立地方政府债务管理体系"} -{"key": "BAC009S0767W0194", "wav": "./aishell/wav/test/S0767/BAC009S0767W0194.wav", "txt": "中国证券报从您刚才的介绍中我们了解到"} -{"key": "BAC009S0767W0195", "wav": "./aishell/wav/test/S0767/BAC009S0767W0195.wav", "txt": "城投债券对推动城市基础设施和市政设施的建设"} -{"key": "BAC009S0767W0196", "wav": "./aishell/wav/test/S0767/BAC009S0767W0196.wav", "txt": "起到了非常积极的作用"} -{"key": "BAC009S0767W0197", "wav": "./aishell/wav/test/S0767/BAC009S0767W0197.wav", "txt": "对丰富债券市场品种也具有积极意义"} -{"key": "BAC009S0767W0198", "wav": "./aishell/wav/test/S0767/BAC009S0767W0198.wav", "txt": "结合地方政府债务管理制度的完善"} -{"key": "BAC009S0767W0199", "wav": "./aishell/wav/test/S0767/BAC009S0767W0199.wav", "txt": "下一步我国的城投债券还需要做哪些完善"} -{"key": "BAC009S0767W0200", "wav": "./aishell/wav/test/S0767/BAC009S0767W0200.wav", "txt": "徐林这个问题涉及到一系列的制度完善"} -{"key": "BAC009S0767W0201", "wav": "./aishell/wav/test/S0767/BAC009S0767W0201.wav", "txt": "是一个比较复杂的问题"} -{"key": "BAC009S0767W0202", "wav": "./aishell/wav/test/S0767/BAC009S0767W0202.wav", "txt": "我个人是这么认认识的"} -{"key": "BAC009S0767W0203", "wav": "./aishell/wav/test/S0767/BAC009S0767W0203.wav", "txt": "我国还处于城市化快速发展期"} -{"key": "BAC009S0767W0204", "wav": "./aishell/wav/test/S0767/BAC009S0767W0204.wav", "txt": "需要为各地的城市建设提供规范的融资渠道"} -{"key": "BAC009S0767W0205", "wav": "./aishell/wav/test/S0767/BAC009S0767W0205.wav", "txt": "农业与非农产业之间劳动生产率的差距也很大"} -{"key": "BAC009S0767W0206", "wav": "./aishell/wav/test/S0767/BAC009S0767W0206.wav", "txt": "这决定了我国城市化动力十分强劲"} -{"key": "BAC009S0767W0207", "wav": "./aishell/wav/test/S0767/BAC009S0767W0207.wav", "txt": "城市化进程远未结束"} -{"key": "BAC009S0767W0208", "wav": "./aishell/wav/test/S0767/BAC009S0767W0208.wav", "txt": "城市化快速发展期的重要特征就是基础设施投资需求大"} -{"key": "BAC009S0767W0209", "wav": "./aishell/wav/test/S0767/BAC009S0767W0209.wav", "txt": "这是我国所处的发展阶段决定的"} -{"key": "BAC009S0767W0210", "wav": "./aishell/wav/test/S0767/BAC009S0767W0210.wav", "txt": "政府通过债务融资从事基础设施建设"} -{"key": "BAC009S0767W0211", "wav": "./aishell/wav/test/S0767/BAC009S0767W0211.wav", "txt": "我们应该建立风险可控的规范化的地方政府融资机制"} -{"key": "BAC009S0767W0212", "wav": "./aishell/wav/test/S0767/BAC009S0767W0212.wav", "txt": "为各地的基础设施建设提供有制度保障的融资渠道"} -{"key": "BAC009S0767W0213", "wav": "./aishell/wav/test/S0767/BAC009S0767W0213.wav", "txt": "城投债劵作为准市政债劵仍将是有效的融资工具"} -{"key": "BAC009S0767W0214", "wav": "./aishell/wav/test/S0767/BAC009S0767W0214.wav", "txt": "但还需要进一步改进"} -{"key": "BAC009S0767W0215", "wav": "./aishell/wav/test/S0767/BAC009S0767W0215.wav", "txt": "在政府投融资体制改革过程中"} -{"key": "BAC009S0767W0216", "wav": "./aishell/wav/test/S0767/BAC009S0767W0216.wav", "txt": "从事当地的基础设施建设"} -{"key": "BAC009S0767W0217", "wav": "./aishell/wav/test/S0767/BAC009S0767W0217.wav", "txt": "相对于过去的体制而言是更加市场化的"} -{"key": "BAC009S0767W0218", "wav": "./aishell/wav/test/S0767/BAC009S0767W0218.wav", "txt": "城投债劵作为投融资平台公司最透明的直接融资工具"} -{"key": "BAC009S0767W0219", "wav": 
"./aishell/wav/test/S0767/BAC009S0767W0219.wav", "txt": "仍然会存在并具有发展空间"} -{"key": "BAC009S0767W0220", "wav": "./aishell/wav/test/S0767/BAC009S0767W0220.wav", "txt": "由于目前城投债劵的发行需要符合企业债劵发行的条件"} -{"key": "BAC009S0767W0221", "wav": "./aishell/wav/test/S0767/BAC009S0767W0221.wav", "txt": "这使得我国城投债劵的发行利率相对偏高"} -{"key": "BAC009S0767W0222", "wav": "./aishell/wav/test/S0767/BAC009S0767W0222.wav", "txt": "城投债劵的发行期限和利率"} -{"key": "BAC009S0767W0223", "wav": "./aishell/wav/test/S0767/BAC009S0767W0223.wav", "txt": "未来应该在制度上作进一步完善"} -{"key": "BAC009S0767W0224", "wav": "./aishell/wav/test/S0767/BAC009S0767W0224.wav", "txt": "使得城投公司能够发行真正意义上的长期市政债劵"} -{"key": "BAC009S0767W0225", "wav": "./aishell/wav/test/S0767/BAC009S0767W0225.wav", "txt": "要尽快建立我国的地方政府债务管理体系"} -{"key": "BAC009S0767W0226", "wav": "./aishell/wav/test/S0767/BAC009S0767W0226.wav", "txt": "对于如何建立规范的地方政府融资渠道"} -{"key": "BAC009S0767W0227", "wav": "./aishell/wav/test/S0767/BAC009S0767W0227.wav", "txt": "加强地方政府债务管理和风险防控"} -{"key": "BAC009S0767W0228", "wav": "./aishell/wav/test/S0767/BAC009S0767W0228.wav", "txt": "一些专家学者提出了许多好的建议"} -{"key": "BAC009S0767W0229", "wav": "./aishell/wav/test/S0767/BAC009S0767W0229.wav", "txt": "如建立规范透明的地方政府融资渠道"} -{"key": "BAC009S0767W0230", "wav": "./aishell/wav/test/S0767/BAC009S0767W0230.wav", "txt": "并对地方政府债务进行监控和风险防范等"} -{"key": "BAC009S0767W0231", "wav": "./aishell/wav/test/S0767/BAC009S0767W0231.wav", "txt": "由于我国还没有建立统一的地方政府债务风险管理制度"} -{"key": "BAC009S0767W0232", "wav": "./aishell/wav/test/S0767/BAC009S0767W0232.wav", "txt": "设定政府性债务风险控制指标和标准"} -{"key": "BAC009S0767W0233", "wav": "./aishell/wav/test/S0767/BAC009S0767W0233.wav", "txt": "并对政府性债务实行馀额管理"} -{"key": "BAC009S0767W0234", "wav": "./aishell/wav/test/S0767/BAC009S0767W0234.wav", "txt": "使地方政府的债务融资规模控制在安全范围内"} -{"key": "BAC009S0767W0235", "wav": "./aishell/wav/test/S0767/BAC009S0767W0235.wav", "txt": "远低于发生债务危机的欧美国家"} -{"key": "BAC009S0767W0236", "wav": "./aishell/wav/test/S0767/BAC009S0767W0236.wav", "txt": "债券发行人是优质的"} -{"key": "BAC009S0767W0237", "wav": "./aishell/wav/test/S0767/BAC009S0767W0237.wav", "txt": "还本付息也是正常的"} -{"key": "BAC009S0767W0238", "wav": "./aishell/wav/test/S0767/BAC009S0767W0238.wav", "txt": "应该建立风险可控的规范化地方政府融资机制"} -{"key": "BAC009S0767W0239", "wav": "./aishell/wav/test/S0767/BAC009S0767W0239.wav", "txt": "为各地的基础设施建设提供有制度保障的融资渠道"} -{"key": "BAC009S0767W0240", "wav": "./aishell/wav/test/S0767/BAC009S0767W0240.wav", "txt": "本报记者曹志为王婷王颖春来源中国证券报"} -{"key": "BAC009S0767W0241", "wav": "./aishell/wav/test/S0767/BAC009S0767W0241.wav", "txt": "责任编辑廖一宁"} -{"key": "BAC009S0767W0242", "wav": "./aishell/wav/test/S0767/BAC009S0767W0242.wav", "txt": "据国家发改委网站消息"} -{"key": "BAC009S0767W0243", "wav": "./aishell/wav/test/S0767/BAC009S0767W0243.wav", "txt": "将考试费标准由各地自行制定改为实行上限管理"} -{"key": "BAC009S0767W0244", "wav": "./aishell/wav/test/S0767/BAC009S0767W0244.wav", "txt": "价格主管部门将按统一合理的平均成本确定考试费用"} -{"key": "BAC009S0767W0245", "wav": "./aishell/wav/test/S0767/BAC009S0767W0245.wav", "txt": "将切实减轻考生经济负担"} -{"key": "BAC009S0767W0246", "wav": "./aishell/wav/test/S0767/BAC009S0767W0246.wav", "txt": "针对目前职业资格考试收费项目增加"} -{"key": "BAC009S0767W0247", "wav": "./aishell/wav/test/S0767/BAC009S0767W0247.wav", "txt": "一些考试单位考务成本偏高"} -{"key": "BAC009S0767W0248", "wav": "./aishell/wav/test/S0767/BAC009S0767W0248.wav", "txt": "有的考试在不同地区收费标准差异较大"} -{"key": "BAC009S0767W0249", "wav": "./aishell/wav/test/S0767/BAC009S0767W0249.wav", "txt": "考生对考试收费问题反映较多等问题"} -{"key": "BAC009S0767W0250", "wav": "./aishell/wav/test/S0767/BAC009S0767W0250.wav", "txt": "改革了职业资格考试收费管理方式"} 
-{"key": "BAC009S0767W0251", "wav": "./aishell/wav/test/S0767/BAC009S0767W0251.wav", "txt": "对考务费标准实行统一标准化管理"} -{"key": "BAC009S0767W0252", "wav": "./aishell/wav/test/S0767/BAC009S0767W0252.wav", "txt": "通知按照不同考生规模考试类类别的合理平均成本"} -{"key": "BAC009S0767W0254", "wav": "./aishell/wav/test/S0767/BAC009S0767W0254.wav", "txt": "无疑也会成为投资者的宠儿"} -{"key": "BAC009S0767W0260", "wav": "./aishell/wav/test/S0767/BAC009S0767W0260.wav", "txt": "而苹果虽有可能卖出不少手表给忠实的粉丝"} -{"key": "BAC009S0767W0264", "wav": "./aishell/wav/test/S0767/BAC009S0767W0264.wav", "txt": "就开始追寻打造真正的机器人的梦想"} -{"key": "BAC009S0767W0265", "wav": "./aishell/wav/test/S0767/BAC009S0767W0265.wav", "txt": "但是过去整整一年他都在秘密工作"} -{"key": "BAC009S0767W0266", "wav": "./aishell/wav/test/S0767/BAC009S0767W0266.wav", "txt": "没人知道他在干什么"} -{"key": "BAC009S0767W0269", "wav": "./aishell/wav/test/S0767/BAC009S0767W0269.wav", "txt": "无论他做的什么都是什么都会引人注目的"} -{"key": "BAC009S0767W0270", "wav": "./aishell/wav/test/S0767/BAC009S0767W0270.wav", "txt": "像索尼被黑那样的事"} -{"key": "BAC009S0767W0271", "wav": "./aishell/wav/test/S0767/BAC009S0767W0271.wav", "txt": "也可能会是受到国家支持的攻击"} -{"key": "BAC009S0767W0272", "wav": "./aishell/wav/test/S0767/BAC009S0767W0272.wav", "txt": "未来的战争形态有可能就是计算机对抗计算机"} -{"key": "BAC009S0767W0273", "wav": "./aishell/wav/test/S0767/BAC009S0767W0273.wav", "txt": "当年泡沫破裂前也是这样的情景"} -{"key": "BAC009S0767W0274", "wav": "./aishell/wav/test/S0767/BAC009S0767W0274.wav", "txt": "一堆不赚钱的公司赶着上市当然不是什么好事"} -{"key": "BAC009S0767W0276", "wav": "./aishell/wav/test/S0767/BAC009S0767W0276.wav", "txt": "疯狂估值局限于私有公司内"} -{"key": "BAC009S0767W0277", "wav": "./aishell/wav/test/S0767/BAC009S0767W0277.wav", "txt": "可现在那些公司纷纷上市后疯狂是不是就暴露了呢"} -{"key": "BAC009S0767W0278", "wav": "./aishell/wav/test/S0767/BAC009S0767W0278.wav", "txt": "而现在的股票市场也已经达到创纪录的新高"} -{"key": "BAC009S0767W0282", "wav": "./aishell/wav/test/S0767/BAC009S0767W0282.wav", "txt": "称有些技术公司烧钱太快可能会人间蒸发"} -{"key": "BAC009S0767W0284", "wav": "./aishell/wav/test/S0767/BAC009S0767W0284.wav", "txt": "连这些人都预测不准的话"} -{"key": "BAC009S0767W0285", "wav": "./aishell/wav/test/S0767/BAC009S0767W0285.wav", "txt": "还有谁能预测得准呢"} -{"key": "BAC009S0767W0287", "wav": "./aishell/wav/test/S0767/BAC009S0767W0287.wav", "txt": "但是需记住对风向保持敏感"} -{"key": "BAC009S0767W0288", "wav": "./aishell/wav/test/S0767/BAC009S0767W0288.wav", "txt": "高空飞航时战略无人机"} -{"key": "BAC009S0767W0289", "wav": "./aishell/wav/test/S0767/BAC009S0767W0289.wav", "txt": "全球鹰并不能独占鳌头"} -{"key": "BAC009S0767W0290", "wav": "./aishell/wav/test/S0767/BAC009S0767W0290.wav", "txt": "继二零一一年出现独特的连翼造型的翔龙无人机以后"} -{"key": "BAC009S0767W0291", "wav": "./aishell/wav/test/S0767/BAC009S0767W0291.wav", "txt": "又一款个性十足的双机身气动外形的大型无人机神雕"} -{"key": "BAC009S0767W0292", "wav": "./aishell/wav/test/S0767/BAC009S0767W0292.wav", "txt": "又一次引爆坊间议论"} -{"key": "BAC009S0767W0293", "wav": "./aishell/wav/test/S0767/BAC009S0767W0293.wav", "txt": "今年三月美国大众科学杂志刊文称"} -{"key": "BAC009S0767W0294", "wav": "./aishell/wav/test/S0767/BAC009S0767W0294.wav", "txt": "中国正在研制一种世界上尺寸最大的无人机"} -{"key": "BAC009S0767W0295", "wav": "./aishell/wav/test/S0767/BAC009S0767W0295.wav", "txt": "发表的想象图与最近曝光的飞机布局很像"} -{"key": "BAC009S0767W0296", "wav": "./aishell/wav/test/S0767/BAC009S0767W0296.wav", "txt": "这使得神雕在全球也成为最大的无人机之一"} -{"key": "BAC009S0767W0297", "wav": "./aishell/wav/test/S0767/BAC009S0767W0297.wav", "txt": "据可靠的网络消息源称"} -{"key": "BAC009S0767W0298", "wav": "./aishell/wav/test/S0767/BAC009S0767W0298.wav", "txt": "其相应的对手不是全球鹰"} -{"key": "BAC009S0767W0300", "wav": "./aishell/wav/test/S0767/BAC009S0767W0300.wav", "txt": 
"神雕的两个机身前后装有两对机翼"} -{"key": "BAC009S0767W0301", "wav": "./aishell/wav/test/S0767/BAC009S0767W0301.wav", "txt": "位于后方的主翼中央挂着两具涡轮风扇发动机"} -{"key": "BAC009S0767W0303", "wav": "./aishell/wav/test/S0767/BAC009S0767W0303.wav", "txt": "上述公司人士对每日经济新闻记者表示"} -{"key": "BAC009S0767W0304", "wav": "./aishell/wav/test/S0767/BAC009S0767W0304.wav", "txt": "成飞机成业绩增长主要是由于锂电需求增长"} -{"key": "BAC009S0767W0305", "wav": "./aishell/wav/test/S0767/BAC009S0767W0305.wav", "txt": "传统汽车业务并没有太大起色"} -{"key": "BAC009S0767W0306", "wav": "./aishell/wav/test/S0767/BAC009S0767W0306.wav", "txt": "现在汽车市场也在下滑"} -{"key": "BAC009S0767W0307", "wav": "./aishell/wav/test/S0767/BAC009S0767W0307.wav", "txt": "整个汽车体系都是随着汽车销量在变动"} -{"key": "BAC009S0767W0308", "wav": "./aishell/wav/test/S0767/BAC009S0767W0308.wav", "txt": "在锂电业务爆发的情况下"} -{"key": "BAC009S0767W0309", "wav": "./aishell/wav/test/S0767/BAC009S0767W0309.wav", "txt": "公司的汽车业务应该会有一些弱化"} -{"key": "BAC009S0767W0310", "wav": "./aishell/wav/test/S0767/BAC009S0767W0310.wav", "txt": "新能源汽车的爆发带动了锂电池供不应求"} -{"key": "BAC009S0767W0311", "wav": "./aishell/wav/test/S0767/BAC009S0767W0311.wav", "txt": "几乎所有锂电厂商都在满负荷生产"} -{"key": "BAC009S0767W0312", "wav": "./aishell/wav/test/S0767/BAC009S0767W0312.wav", "txt": "上述成飞集成人士表示"} -{"key": "BAC009S0767W0313", "wav": "./aishell/wav/test/S0767/BAC009S0767W0313.wav", "txt": "产能现在已经满足不了订单需求"} -{"key": "BAC009S0767W0314", "wav": "./aishell/wav/test/S0767/BAC009S0767W0314.wav", "txt": "八月初公司通过了增加产能建设的决议"} -{"key": "BAC009S0767W0315", "wav": "./aishell/wav/test/S0767/BAC009S0767W0315.wav", "txt": "今年初也在原来厂里新增了生产线"} -{"key": "BAC009S0767W0316", "wav": "./aishell/wav/test/S0767/BAC009S0767W0316.wav", "txt": "预计在三四季度会有陆续新增产能"} -{"key": "BAC009S0767W0317", "wav": "./aishell/wav/test/S0767/BAC009S0767W0317.wav", "txt": "每日经济新闻记者注意到"} -{"key": "BAC009S0767W0318", "wav": "./aishell/wav/test/S0767/BAC009S0767W0318.wav", "txt": "项目总投资十四点五亿元"} -{"key": "BAC009S0767W0319", "wav": "./aishell/wav/test/S0767/BAC009S0767W0319.wav", "txt": "总投资预计一百二十五亿元"} -{"key": "BAC009S0767W0320", "wav": "./aishell/wav/test/S0767/BAC009S0767W0320.wav", "txt": "上述成飞集成人士告诉记者"} -{"key": "BAC009S0767W0321", "wav": "./aishell/wav/test/S0767/BAC009S0767W0321.wav", "txt": "公司目前看好锂电池行业的发展渠势"} -{"key": "BAC009S0767W0322", "wav": "./aishell/wav/test/S0767/BAC009S0767W0322.wav", "txt": "但其并未向记者透露项目盈利水平预测"} -{"key": "BAC009S0767W0323", "wav": "./aishell/wav/test/S0767/BAC009S0767W0323.wav", "txt": "洛阳本部项目是一个增量投资"} -{"key": "BAC009S0767W0324", "wav": "./aishell/wav/test/S0767/BAC009S0767W0324.wav", "txt": "有一部分研发楼办公楼是利用现成的"} -{"key": "BAC009S0767W0325", "wav": "./aishell/wav/test/S0767/BAC009S0767W0325.wav", "txt": "包括管理人员等并不会因为新增生产线而增加"} -{"key": "BAC009S0767W0326", "wav": "./aishell/wav/test/S0767/BAC009S0767W0326.wav", "txt": "这个项目是自有资金投入"} -{"key": "BAC009S0767W0327", "wav": "./aishell/wav/test/S0767/BAC009S0767W0327.wav", "txt": "就没有要求专业机构做可研报告"} -{"key": "BAC009S0767W0328", "wav": "./aishell/wav/test/S0767/BAC009S0767W0328.wav", "txt": "我们内部做的盈利测算数据暂时无法公告"} -{"key": "BAC009S0767W0329", "wav": "./aishell/wav/test/S0767/BAC009S0767W0329.wav", "txt": "成飞集成与常州市金坛区政府合作的项目将分三期完成"} -{"key": "BAC009S0767W0330", "wav": "./aishell/wav/test/S0767/BAC009S0767W0330.wav", "txt": "一期投资额为二十五亿元"} -{"key": "BAC009S0767W0331", "wav": "./aishell/wav/test/S0767/BAC009S0767W0331.wav", "txt": "上述成飞集成人士告诉记者"} -{"key": "BAC009S0767W0332", "wav": "./aishell/wav/test/S0767/BAC009S0767W0332.wav", "txt": "随着国内新能源车产业的迅猛发展"} -{"key": "BAC009S0767W0333", "wav": "./aishell/wav/test/S0767/BAC009S0767W0333.wav", "txt": 
"锂电池作为新能源车的重要部件"} -{"key": "BAC009S0767W0334", "wav": "./aishell/wav/test/S0767/BAC009S0767W0334.wav", "txt": "锂电池生产企业将迎来业绩持续高增长阶段"} -{"key": "BAC009S0767W0335", "wav": "./aishell/wav/test/S0767/BAC009S0767W0335.wav", "txt": "二零一四年其市场规模已达七十一五亿元"} -{"key": "BAC009S0767W0336", "wav": "./aishell/wav/test/S0767/BAC009S0767W0336.wav", "txt": "随着锂电池产业链迎来井喷"} -{"key": "BAC009S0767W0337", "wav": "./aishell/wav/test/S0767/BAC009S0767W0337.wav", "txt": "锂电池在今年上半年成为诸多上市公司的业绩功臣"} -{"key": "BAC009S0767W0338", "wav": "./aishell/wav/test/S0767/BAC009S0767W0338.wav", "txt": "二者虽从事项目不同"} -{"key": "BAC009S0767W0339", "wav": "./aishell/wav/test/S0767/BAC009S0767W0339.wav", "txt": "也恰恰符合播求的个人风格"} -{"key": "BAC009S0767W0340", "wav": "./aishell/wav/test/S0767/BAC009S0767W0340.wav", "txt": "擂台上的黑王子肌肉强健"} -{"key": "BAC009S0767W0341", "wav": "./aishell/wav/test/S0767/BAC009S0767W0341.wav", "txt": "这也是他一次次在擂台上打出恐怖重击的最大资本"} -{"key": "BAC009S0767W0342", "wav": "./aishell/wav/test/S0767/BAC009S0767W0342.wav", "txt": "却可以演绎出撼人心魄的体育大美"} -{"key": "BAC009S0767W0343", "wav": "./aishell/wav/test/S0767/BAC009S0767W0343.wav", "txt": "此次播求面对的强敌"} -{"key": "BAC009S0767W0344", "wav": "./aishell/wav/test/S0767/BAC009S0767W0344.wav", "txt": "恰恰在风格打法和比赛理念上"} -{"key": "BAC009S0767W0345", "wav": "./aishell/wav/test/S0767/BAC009S0767W0345.wav", "txt": "同詹姆斯昔年头号强敌卡梅隆安东尼如出一辙"} -{"key": "BAC009S0767W0346", "wav": "./aishell/wav/test/S0767/BAC009S0767W0346.wav", "txt": "丰富的战斗技巧是我的特色"} -{"key": "BAC009S0767W0347", "wav": "./aishell/wav/test/S0767/BAC009S0767W0347.wav", "txt": "我希望自己可以像卡梅隆一样"} -{"key": "BAC009S0767W0348", "wav": "./aishell/wav/test/S0767/BAC009S0767W0348.wav", "txt": "在比赛中展示出更多击败对手的手段"} -{"key": "BAC009S0767W0349", "wav": "./aishell/wav/test/S0767/BAC009S0767W0349.wav", "txt": "对于我的对手制造更多的麻烦"} -{"key": "BAC009S0767W0350", "wav": "./aishell/wav/test/S0767/BAC009S0767W0350.wav", "txt": "俄罗斯搏击新生代旗帜性天才高手哈亚在接受采访时"} -{"key": "BAC009S0767W0351", "wav": "./aishell/wav/test/S0767/BAC009S0767W0351.wav", "txt": "而哈亚的表现也正如其自己所言"} -{"key": "BAC009S0767W0352", "wav": "./aishell/wav/test/S0767/BAC009S0767W0352.wav", "txt": "展示出了如同其偶像安东尼一样的全面犀利"} -{"key": "BAC009S0767W0353", "wav": "./aishell/wav/test/S0767/BAC009S0767W0353.wav", "txt": "直接将威瑟里诺夫击倒"} -{"key": "BAC009S0767W0354", "wav": "./aishell/wav/test/S0767/BAC009S0767W0354.wav", "txt": "其搏击技能之全面精湛格斗天赋之卓越令人惊叹"} -{"key": "BAC009S0767W0355", "wav": "./aishell/wav/test/S0767/BAC009S0767W0355.wav", "txt": "当搏击界的勒布朗与卡梅隆狭路相逢"} -{"key": "BAC009S0767W0356", "wav": "./aishell/wav/test/S0767/BAC009S0767W0356.wav", "txt": "激情战火必将以燎原之势"} -{"key": "BAC009S0767W0357", "wav": "./aishell/wav/test/S0767/BAC009S0767W0357.wav", "txt": "彭博一英里接力赛将在十月十五日首次登陆上海"} -{"key": "BAC009S0767W0358", "wav": "./aishell/wav/test/S0767/BAC009S0767W0358.wav", "txt": "今天赛事举行了赛前新闻发布会"} -{"key": "BAC009S0767W0359", "wav": "./aishell/wav/test/S0767/BAC009S0767W0359.wav", "txt": "宣告彭博一英里接力赛上海站全面启动"} -{"key": "BAC009S0767W0360", "wav": "./aishell/wav/test/S0767/BAC009S0767W0360.wav", "txt": "让他们能在工作之馀释放对于体育的热情"} -{"key": "BAC009S0767W0361", "wav": "./aishell/wav/test/S0767/BAC009S0767W0361.wav", "txt": "从二零零七年在伦敦创办至今"} -{"key": "BAC009S0767W0362", "wav": "./aishell/wav/test/S0767/BAC009S0767W0362.wav", "txt": "已在新加坡香港等城市成功落地"} -{"key": "BAC009S0767W0363", "wav": "./aishell/wav/test/S0767/BAC009S0767W0363.wav", "txt": "得到当地企业的强烈积极响应"} -{"key": "BAC009S0767W0364", "wav": "./aishell/wav/test/S0767/BAC009S0767W0364.wav", "txt": "在各城市都有至少百支队伍报名参加"} -{"key": "BAC009S0767W0365", "wav": "./aishell/wav/test/S0767/BAC009S0767W0365.wav", 
"txt": "彭博一英里接力赛区别于其他跑步活动的是"} -{"key": "BAC009S0767W0366", "wav": "./aishell/wav/test/S0767/BAC009S0767W0366.wav", "txt": "每支报名队伍以企业为单位每队十名成员"} -{"key": "BAC009S0767W0367", "wav": "./aishell/wav/test/S0767/BAC009S0767W0367.wav", "txt": "每人分别完成一英里即约一点六公里的路程"} -{"key": "BAC009S0767W0368", "wav": "./aishell/wav/test/S0767/BAC009S0767W0368.wav", "txt": "最后取全队用时最少者为胜"} -{"key": "BAC009S0767W0369", "wav": "./aishell/wav/test/S0767/BAC009S0767W0369.wav", "txt": "冠军皆由麦格理集团获得"} -{"key": "BAC009S0767W0370", "wav": "./aishell/wav/test/S0767/BAC009S0767W0370.wav", "txt": "现场参赛企业誓言要打破这项记录"} -{"key": "BAC009S0767W0371", "wav": "./aishell/wav/test/S0767/BAC009S0767W0371.wav", "txt": "本次赛事已经开始接受团队报名"} -{"key": "BAC009S0767W0372", "wav": "./aishell/wav/test/S0767/BAC009S0767W0372.wav", "txt": "目前报名仍在进行中"} -{"key": "BAC009S0767W0373", "wav": "./aishell/wav/test/S0767/BAC009S0767W0373.wav", "txt": "这一项目将为神农架林区的孩子筹建开放式体育空间"} -{"key": "BAC009S0767W0374", "wav": "./aishell/wav/test/S0767/BAC009S0767W0374.wav", "txt": "为他们搭建一个特色自由的体育室加户外体育课堂"} -{"key": "BAC009S0767W0375", "wav": "./aishell/wav/test/S0767/BAC009S0767W0375.wav", "txt": "让他们也可以有机会参与体育运动"} -{"key": "BAC009S0767W0376", "wav": "./aishell/wav/test/S0767/BAC009S0767W0376.wav", "txt": "高清女排金花手捧奖杯庆夺冠"} -{"key": "BAC009S0767W0377", "wav": "./aishell/wav/test/S0767/BAC009S0767W0377.wav", "txt": "刚刚在日本女排世界杯上夺冠的中国女排载誉回京"} -{"key": "BAC009S0767W0378", "wav": "./aishell/wav/test/S0767/BAC009S0767W0378.wav", "txt": "在首都国际机场受到了各界人士的欢迎"} -{"key": "BAC009S0767W0379", "wav": "./aishell/wav/test/S0767/BAC009S0767W0379.wav", "txt": "这其中一位身材高挑的女孩子颇为引人关注"} -{"key": "BAC009S0767W0380", "wav": "./aishell/wav/test/S0767/BAC009S0767W0380.wav", "txt": "她就是因伤未能随队参加本次赛事的徐云丽"} -{"key": "BAC009S0767W0381", "wav": "./aishell/wav/test/S0767/BAC009S0767W0381.wav", "txt": "我觉得这冠军来之不易"} -{"key": "BAC009S0767W0382", "wav": "./aishell/wav/test/S0767/BAC009S0767W0382.wav", "txt": "特别是我们今年刚开始的时候特别艰难"} -{"key": "BAC009S0767W0383", "wav": "./aishell/wav/test/S0767/BAC009S0767W0383.wav", "txt": "整个队伍承受了很大的困难和考验"} -{"key": "BAC009S0767W0384", "wav": "./aishell/wav/test/S0767/BAC009S0767W0384.wav", "txt": "最后顶住困难和压力拿到冠军"} -{"key": "BAC009S0767W0385", "wav": "./aishell/wav/test/S0767/BAC009S0767W0385.wav", "txt": "我为她们感到骄傲和自豪"} -{"key": "BAC009S0767W0386", "wav": "./aishell/wav/test/S0767/BAC009S0767W0386.wav", "txt": "那就是姑娘们高举起惠若琪徐云丽和杨方旭的球衣"} -{"key": "BAC009S0767W0387", "wav": "./aishell/wav/test/S0767/BAC009S0767W0387.wav", "txt": "感谢这三位因伤未能参赛的姐妹对球队做出的巨大贡献"} -{"key": "BAC009S0767W0388", "wav": "./aishell/wav/test/S0767/BAC009S0767W0388.wav", "txt": "徐云丽透露在赛前队友曾经给自己发了一条短信"} -{"key": "BAC009S0767W0389", "wav": "./aishell/wav/test/S0767/BAC009S0767W0389.wav", "txt": "就此事征求她的意见"} -{"key": "BAC009S0767W0390", "wav": "./aishell/wav/test/S0767/BAC009S0767W0390.wav", "txt": "看到这一幕我控制不住自己了"} -{"key": "BAC009S0767W0391", "wav": "./aishell/wav/test/S0767/BAC009S0767W0391.wav", "txt": "一切都难以用言语来表达"} -{"key": "BAC009S0767W0392", "wav": "./aishell/wav/test/S0767/BAC009S0767W0392.wav", "txt": "自己此时此刻特别迫切地想要尽快恢复"} -{"key": "BAC009S0767W0393", "wav": "./aishell/wav/test/S0767/BAC009S0767W0393.wav", "txt": "希望能够跟大家一起再次站在领奖台上"} -{"key": "BAC009S0767W0394", "wav": "./aishell/wav/test/S0767/BAC009S0767W0394.wav", "txt": "徐云丽最后动情地说"} -{"key": "BAC009S0767W0395", "wav": "./aishell/wav/test/S0767/BAC009S0767W0395.wav", "txt": "搜狐体育郭健文"} -{"key": "BAC009S0767W0396", "wav": "./aishell/wav/test/S0767/BAC009S0767W0396.wav", "txt": "二零一五年八月十五"} -{"key": "BAC009S0767W0397", "wav": 
"./aishell/wav/test/S0767/BAC009S0767W0397.wav", "txt": "这次赛事是昆仑决二零一五欧洲之旅的第三站"} -{"key": "BAC009S0767W0398", "wav": "./aishell/wav/test/S0767/BAC009S0767W0398.wav", "txt": "四季如春的俄罗斯黑海东部沿岸"} -{"key": "BAC009S0767W0399", "wav": "./aishell/wav/test/S0767/BAC009S0767W0399.wav", "txt": "新一轮激战烽火即将炽烈点燃"} -{"key": "BAC009S0767W0400", "wav": "./aishell/wav/test/S0767/BAC009S0767W0400.wav", "txt": "我是这次中俄对抗赛第一个出场的中方选手"} -{"key": "BAC009S0767W0401", "wav": "./aishell/wav/test/S0767/BAC009S0767W0401.wav", "txt": "一定要尽全力打一场漂亮的比赛"} -{"key": "BAC009S0767W0402", "wav": "./aishell/wav/test/S0767/BAC009S0767W0402.wav", "txt": "为中国战队打响第一枪"} -{"key": "BAC009S0767W0403", "wav": "./aishell/wav/test/S0767/BAC009S0767W0403.wav", "txt": "徐永昊的站立打击技术在该级别中堪称翘楚"} -{"key": "BAC009S0767W0404", "wav": "./aishell/wav/test/S0767/BAC009S0767W0404.wav", "txt": "如今再遭厄运"} -{"key": "BAC009S0767W0406", "wav": "./aishell/wav/test/S0767/BAC009S0767W0406.wav", "txt": "最终又确定为科林哈迪"} -{"key": "BAC009S0767W0407", "wav": "./aishell/wav/test/S0767/BAC009S0767W0407.wav", "txt": "科林也退出了剧组"} -{"key": "BAC009S0767W0408", "wav": "./aishell/wav/test/S0767/BAC009S0767W0408.wav", "txt": "柯震东去年经历吸毒风波"} -{"key": "BAC009S0767W0409", "wav": "./aishell/wav/test/S0767/BAC009S0767W0409.wav", "txt": "演艺事业受挫"} -{"key": "BAC009S0767W0410", "wav": "./aishell/wav/test/S0767/BAC009S0767W0410.wav", "txt": "沉寂一段时间后"} -{"key": "BAC009S0767W0411", "wav": "./aishell/wav/test/S0767/BAC009S0767W0411.wav", "txt": "近期他积极复出"} -{"key": "BAC009S0767W0412", "wav": "./aishell/wav/test/S0767/BAC009S0767W0412.wav", "txt": "再度经营他的粉丝团与粉丝互动六日昨晚"} -{"key": "BAC009S0767W0414", "wav": "./aishell/wav/test/S0767/BAC009S0767W0414.wav", "txt": "他调皮地将自己的眉毛抹掉"} -{"key": "BAC009S0767W0415", "wav": "./aishell/wav/test/S0767/BAC009S0767W0415.wav", "txt": "搜狐娱乐讯据香港明晚九月十日报道"} -{"key": "BAC009S0767W0416", "wav": "./aishell/wav/test/S0767/BAC009S0767W0416.wav", "txt": "诸葛紫岐十日晚出席活动时表示"} -{"key": "BAC009S0767W0417", "wav": "./aishell/wav/test/S0767/BAC009S0767W0417.wav", "txt": "一个月内暴瘦了九至十三斤"} -{"key": "BAC009S0767W0418", "wav": "./aishell/wav/test/S0767/BAC009S0767W0418.wav", "txt": "有时甚至忘记吃饭"} -{"key": "BAC009S0767W0419", "wav": "./aishell/wav/test/S0767/BAC009S0767W0419.wav", "txt": "也有情绪问题"} -{"key": "BAC009S0767W0420", "wav": "./aishell/wav/test/S0767/BAC009S0767W0420.wav", "txt": "打算看医生寻求纾缓方式"} -{"key": "BAC009S0767W0421", "wav": "./aishell/wav/test/S0767/BAC009S0767W0421.wav", "txt": "她说之前打电话给医生"} -{"key": "BAC009S0767W0422", "wav": "./aishell/wav/test/S0767/BAC009S0767W0422.wav", "txt": "医生说得好恐怖"} -{"key": "BAC009S0767W0423", "wav": "./aishell/wav/test/S0767/BAC009S0767W0423.wav", "txt": "但不至于要吃药"} -{"key": "BAC009S0767W0424", "wav": "./aishell/wav/test/S0767/BAC009S0767W0424.wav", "txt": "我叫他不要吓我"} -{"key": "BAC009S0767W0425", "wav": "./aishell/wav/test/S0767/BAC009S0767W0425.wav", "txt": "现在有点怕要见他"} -{"key": "BAC009S0767W0426", "wav": "./aishell/wav/test/S0767/BAC009S0767W0426.wav", "txt": "搜狐娱乐讯九月十六日二十点二十七分"} -{"key": "BAC009S0767W0428", "wav": "./aishell/wav/test/S0767/BAC009S0767W0428.wav", "txt": "并自嘲的写道自幼就走性感风格"} -{"key": "BAC009S0767W0429", "wav": "./aishell/wav/test/S0767/BAC009S0767W0429.wav", "txt": "谢依霖穿着白色吊带裙"} -{"key": "BAC009S0767W0430", "wav": "./aishell/wav/test/S0767/BAC009S0767W0430.wav", "txt": "嘟嘴作亲吻状"} -{"key": "BAC009S0767W0431", "wav": "./aishell/wav/test/S0767/BAC009S0767W0431.wav", "txt": "李玉刚离开蒙面歌王网友遗憾没听见神曲"} -{"key": "BAC009S0767W0432", "wav": "./aishell/wav/test/S0767/BAC009S0767W0432.wav", "txt": "蒙面歌王迎来初赛的收官之战"} -{"key": "BAC009S0767W0433", "wav": 
"./aishell/wav/test/S0767/BAC009S0767W0433.wav", "txt": "千面娇娃绝地反击拿下最后一席歌王头衔"} -{"key": "BAC009S0767W0434", "wav": "./aishell/wav/test/S0767/BAC009S0767W0434.wav", "txt": "而绝代歌姬李玉刚揭面引起了广泛热议"} -{"key": "BAC009S0767W0435", "wav": "./aishell/wav/test/S0767/BAC009S0767W0435.wav", "txt": "也有网友发出疑问若是李玉刚演唱李的话"} -{"key": "BAC009S0767W0436", "wav": "./aishell/wav/test/S0767/BAC009S0767W0436.wav", "txt": "登上歌王宝座的概率是不是会要大很多倍呢"} -{"key": "BAC009S0767W0437", "wav": "./aishell/wav/test/S0767/BAC009S0767W0437.wav", "txt": "短短二零零字歌词运用诗词典故三六处"} -{"key": "BAC009S0767W0438", "wav": "./aishell/wav/test/S0767/BAC009S0767W0438.wav", "txt": "每句歌词都蕴含一段李姓历史文化在里头"} -{"key": "BAC009S0767W0439", "wav": "./aishell/wav/test/S0767/BAC009S0767W0439.wav", "txt": "该歌曲今年一经在各大音乐网站上线便收获无数好评"} -{"key": "BAC009S0767W0440", "wav": "./aishell/wav/test/S0767/BAC009S0767W0440.wav", "txt": "更有全球李氏宗亲大会将其列为祭祖主题曲"} -{"key": "BAC009S0767W0441", "wav": "./aishell/wav/test/S0767/BAC009S0767W0441.wav", "txt": "同时李也俘获了无数中国大妈的芳心"} -{"key": "BAC009S0767W0442", "wav": "./aishell/wav/test/S0767/BAC009S0767W0442.wav", "txt": "成为各国各地广场舞今年最流行的背景音乐之一"} -{"key": "BAC009S0767W0443", "wav": "./aishell/wav/test/S0767/BAC009S0767W0443.wav", "txt": "李磊灭门案遗产纠纷终结八零零馀万三人有份"} -{"key": "BAC009S0767W0444", "wav": "./aishell/wav/test/S0767/BAC009S0767W0444.wav", "txt": "大兴灭门案的凶犯李磊被执行死刑后"} -{"key": "BAC009S0767W0445", "wav": "./aishell/wav/test/S0767/BAC009S0767W0445.wav", "txt": "其身后的财产分割问题尘埃落定"} -{"key": "BAC009S0767W0446", "wav": "./aishell/wav/test/S0767/BAC009S0767W0446.wav", "txt": "市二中院终审认定李家遗产共计八零零多万元"} -{"key": "BAC009S0767W0447", "wav": "./aishell/wav/test/S0767/BAC009S0767W0447.wav", "txt": "李磊的奶奶继承四三七万馀元"} -{"key": "BAC009S0767W0448", "wav": "./aishell/wav/test/S0767/BAC009S0767W0448.wav", "txt": "姥姥继承二六六万馀元"} -{"key": "BAC009S0767W0449", "wav": "./aishell/wav/test/S0767/BAC009S0767W0449.wav", "txt": "岳父母继承一零九万馀元"} -{"key": "BAC009S0767W0450", "wav": "./aishell/wav/test/S0767/BAC009S0767W0450.wav", "txt": "李谷一曾怒揭东方歌舞团腐败事后被调离岗位"} -{"key": "BAC009S0767W0451", "wav": "./aishell/wav/test/S0767/BAC009S0767W0451.wav", "txt": "顾欣资料图片昨早九号一零时"} -{"key": "BAC009S0767W0452", "wav": "./aishell/wav/test/S0767/BAC009S0767W0452.wav", "txt": "东方演艺集团大门口戒备森严"} -{"key": "BAC009S0767W0453", "wav": "./aishell/wav/test/S0767/BAC009S0767W0453.wav", "txt": "中纪委监察部网站发布消息"} -{"key": "BAC009S0767W0454", "wav": "./aishell/wav/test/S0767/BAC009S0767W0454.wav", "txt": "顾欣因涉嫌严重违纪违法"} -{"key": "BAC009S0767W0455", "wav": "./aishell/wav/test/S0767/BAC009S0767W0455.wav", "txt": "集团新领导已经上任"} -{"key": "BAC009S0767W0456", "wav": "./aishell/wav/test/S0767/BAC009S0767W0456.wav", "txt": "是原中国文化集团党委书记宋官林"} -{"key": "BAC009S0767W0457", "wav": "./aishell/wav/test/S0767/BAC009S0767W0457.wav", "txt": "李连杰名誉维权案一审胜诉网站被判至致歉赔偿一零万"} -{"key": "BAC009S0767W0458", "wav": "./aishell/wav/test/S0767/BAC009S0767W0458.wav", "txt": "新京报快讯记者林野记者今天傍晚获悉"} -{"key": "BAC009S0767W0459", "wav": "./aishell/wav/test/S0767/BAC009S0767W0459.wav", "txt": "李银河的文学梦将出版虐恋小说三卷本"} -{"key": "BAC009S0767W0460", "wav": "./aishell/wav/test/S0767/BAC009S0767W0460.wav", "txt": "权义澎湃资料李银河在现实中是柔软的"} -{"key": "BAC009S0767W0461", "wav": "./aishell/wav/test/S0767/BAC009S0767W0461.wav", "txt": "不像她发表的那些先锋的观点一样冲击人眼球"} -{"key": "BAC009S0767W0462", "wav": "./aishell/wav/test/S0767/BAC009S0767W0462.wav", "txt": "李银河写虐恋不会伤害小波"} -{"key": "BAC009S0767W0463", "wav": "./aishell/wav/test/S0767/BAC009S0767W0463.wav", "txt": "北京南三环附近一家茶楼里"} -{"key": "BAC009S0767W0464", "wav": "./aishell/wav/test/S0767/BAC009S0767W0464.wav", "txt": "六三岁的李银河拿着钥匙袋走了进来"} 
-{"key": "BAC009S0767W0465", "wav": "./aishell/wav/test/S0767/BAC009S0767W0465.wav", "txt": "她身着湖蓝色细纱短袖黑长裤白运动鞋"} -{"key": "BAC009S0767W0466", "wav": "./aishell/wav/test/S0767/BAC009S0767W0466.wav", "txt": "手腕上还戴着一块与之呼应的白色塑料腕表"} -{"key": "BAC009S0767W0467", "wav": "./aishell/wav/test/S0767/BAC009S0767W0467.wav", "txt": "出门前我拿了两套衣服"} -{"key": "BAC009S0767W0468", "wav": "./aishell/wav/test/S0767/BAC009S0767W0468.wav", "txt": "面对第一财经日报记者"} -{"key": "BAC009S0767W0469", "wav": "./aishell/wav/test/S0767/BAC009S0767W0469.wav", "txt": "说起自己的伴侣大侠"} -{"key": "BAC009S0767W0470", "wav": "./aishell/wav/test/S0767/BAC009S0767W0470.wav", "txt": "村中数百亩农田干旱村民质疑水库断了灌溉水"} -{"key": "BAC009S0767W0471", "wav": "./aishell/wav/test/S0767/BAC009S0767W0471.wav", "txt": "高新区钓渭镇疙瘩沟村村民称"} -{"key": "BAC009S0767W0472", "wav": "./aishell/wav/test/S0767/BAC009S0767W0472.wav", "txt": "却为了发电断了灌溉农田的水"} -{"key": "BAC009S0767W0473", "wav": "./aishell/wav/test/S0767/BAC009S0767W0473.wav", "txt": "导致数百亩农田干旱"} -{"key": "BAC009S0767W0474", "wav": "./aishell/wav/test/S0767/BAC009S0767W0474.wav", "txt": "该镇农办一名主管水利负责人介绍"} -{"key": "BAC009S0767W0475", "wav": "./aishell/wav/test/S0767/BAC009S0767W0475.wav", "txt": "政府曾多次叫停电站发电"} -{"key": "BAC009S0767W0476", "wav": "./aishell/wav/test/S0767/BAC009S0767W0476.wav", "txt": "但是干旱原因主要系降水减少"} -{"key": "BAC009S0767W0477", "wav": "./aishell/wav/test/S0767/BAC009S0767W0477.wav", "txt": "今后将加强水库管理"} -{"key": "BAC009S0767W0478", "wav": "./aishell/wav/test/S0767/BAC009S0767W0478.wav", "txt": "努力处理好灌溉与发电之间的关系"} -{"key": "BAC009S0767W0479", "wav": "./aishell/wav/test/S0767/BAC009S0767W0479.wav", "txt": "村主任发环卫工一六零零元工资含一四张假钞"} -{"key": "BAC009S0767W0480", "wav": "./aishell/wav/test/S0767/BAC009S0767W0480.wav", "txt": "村主任发账号给村文书想要继续当拿四八万元"} -{"key": "BAC009S0767W0481", "wav": "./aishell/wav/test/S0767/BAC009S0767W0481.wav", "txt": "华商报商洛讯记者白鹏飞近日"} -{"key": "BAC009S0767W0482", "wav": "./aishell/wav/test/S0767/BAC009S0767W0482.wav", "txt": "并向其发送银行账号"} -{"key": "BAC009S0767W0483", "wav": "./aishell/wav/test/S0767/BAC009S0767W0483.wav", "txt": "原因是有人愿为该村垫资四八万元费用修桥"} -{"key": "BAC009S0767W0484", "wav": "./aishell/wav/test/S0767/BAC009S0767W0484.wav", "txt": "村委会主任建议由垫资人担任村文书"} -{"key": "BAC009S0767W0485", "wav": "./aishell/wav/test/S0767/BAC009S0767W0485.wav", "txt": "唐寨子村党支部书记村委会主任已被全镇通报批评"} -{"key": "BAC009S0767W0486", "wav": "./aishell/wav/test/S0767/BAC009S0767W0486.wav", "txt": "村主任向开发商索贿五二零万村组干部几乎全参与分赃"} -{"key": "BAC009S0767W0487", "wav": "./aishell/wav/test/S0767/BAC009S0767W0487.wav", "txt": "城改拆迁对很多村民来说"} -{"key": "BAC009S0767W0488", "wav": "./aishell/wav/test/S0767/BAC009S0767W0488.wav", "txt": "意味着生活条件的改善"} -{"key": "BAC009S0767W0489", "wav": "./aishell/wav/test/S0767/BAC009S0767W0489.wav", "txt": "但对于部分村官及个别政府工作人员来说"} -{"key": "BAC009S0767W0490", "wav": "./aishell/wav/test/S0767/BAC009S0767W0490.wav", "txt": "却是一块大大的唐僧肉"} -{"key": "BAC009S0767W0491", "wav": "./aishell/wav/test/S0767/BAC009S0767W0491.wav", "txt": "想办法扑上去咬一口"} -{"key": "BAC009S0767W0492", "wav": "./aishell/wav/test/S0767/BAC009S0767W0492.wav", "txt": "村主任给狗盖房吞六万公款被判刑二年八个月"} -{"key": "BAC009S0767W0493", "wav": "./aishell/wav/test/S0767/BAC009S0767W0493.wav", "txt": "京华时报讯记者王晓飞在农村"} -{"key": "BAC009S0767W0494", "wav": "./aishell/wav/test/S0767/BAC009S0767W0494.wav", "txt": "几乎家家户户都会在院子里养狗"} -{"key": "BAC009S0767W0495", "wav": "./aishell/wav/test/S0767/BAC009S0767W0495.wav", "txt": "平时作为看家护院之用"} -{"key": "BAC009S0768W0121", "wav": "./aishell/wav/test/S0768/BAC009S0768W0121.wav", "txt": "成本的转嫁使得商品房用地成本更高"} -{"key": 
"BAC009S0768W0122", "wav": "./aishell/wav/test/S0768/BAC009S0768W0122.wav", "txt": "明年初料迎供应淡季"} -{"key": "BAC009S0768W0123", "wav": "./aishell/wav/test/S0768/BAC009S0768W0123.wav", "txt": "土地市场交易火热的局面可能降温"} -{"key": "BAC009S0768W0124", "wav": "./aishell/wav/test/S0768/BAC009S0768W0124.wav", "txt": "土地供应往往呈现前松后紧的态势"} -{"key": "BAC009S0768W0125", "wav": "./aishell/wav/test/S0768/BAC009S0768W0125.wav", "txt": "年初往往是土地供应的淡季"} -{"key": "BAC009S0768W0126", "wav": "./aishell/wav/test/S0768/BAC009S0768W0126.wav", "txt": "为完成全念土地供应计划"} -{"key": "BAC009S0768W0127", "wav": "./aishell/wav/test/S0768/BAC009S0768W0127.wav", "txt": "地方政府倾向于频繁推出优质地块"} -{"key": "BAC009S0768W0128", "wav": "./aishell/wav/test/S0768/BAC009S0768W0128.wav", "txt": "土地交易可能随着供应淡季的到来而降温"} -{"key": "BAC009S0768W0129", "wav": "./aishell/wav/test/S0768/BAC009S0768W0129.wav", "txt": "叠加春节因素的影响"} -{"key": "BAC009S0768W0130", "wav": "./aishell/wav/test/S0768/BAC009S0768W0130.wav", "txt": "这种情况在二月可能较明显"} -{"key": "BAC009S0768W0131", "wav": "./aishell/wav/test/S0768/BAC009S0768W0131.wav", "txt": "房地产企业的整体资金状况超紧"} -{"key": "BAC009S0768W0132", "wav": "./aishell/wav/test/S0768/BAC009S0768W0132.wav", "txt": "不利于继续大规模拿地"} -{"key": "BAC009S0768W0133", "wav": "./aishell/wav/test/S0768/BAC009S0768W0133.wav", "txt": "国家统计局数据显示"} -{"key": "BAC009S0768W0134", "wav": "./aishell/wav/test/S0768/BAC009S0768W0134.wav", "txt": "房地产开发企业到位资金十万亿元"} -{"key": "BAC009S0768W0135", "wav": "./aishell/wav/test/S0768/BAC009S0768W0135.wav", "txt": "增速比三月回落六个百分点"} -{"key": "BAC009S0768W0136", "wav": "./aishell/wav/test/S0768/BAC009S0768W0136.wav", "txt": "未来房企拿地投资新开工等指标可能受到影响"} -{"key": "BAC009S0768W0137", "wav": "./aishell/wav/test/S0768/BAC009S0768W0137.wav", "txt": "尽管降息等利好政策出台"} -{"key": "BAC009S0768W0138", "wav": "./aishell/wav/test/S0768/BAC009S0768W0138.wav", "txt": "但房地产市场仍处于调整期"} -{"key": "BAC009S0768W0139", "wav": "./aishell/wav/test/S0768/BAC009S0768W0139.wav", "txt": "预计不会在短期内迅速回暖"} -{"key": "BAC009S0768W0140", "wav": "./aishell/wav/test/S0768/BAC009S0768W0140.wav", "txt": "与之相联系的土地市场也会受到影响"} -{"key": "BAC009S0768W0141", "wav": "./aishell/wav/test/S0768/BAC009S0768W0141.wav", "txt": "中国证券报报道"} -{"key": "BAC009S0768W0142", "wav": "./aishell/wav/test/S0768/BAC009S0768W0142.wav", "txt": "今年商品房销售一度低日迷"} -{"key": "BAC009S0768W0143", "wav": "./aishell/wav/test/S0768/BAC009S0768W0143.wav", "txt": "一线城市土地市场成交火热"} -{"key": "BAC009S0768W0144", "wav": "./aishell/wav/test/S0768/BAC009S0768W0144.wav", "txt": "中介机构统计数据显示"} -{"key": "BAC009S0768W0145", "wav": "./aishell/wav/test/S0768/BAC009S0768W0145.wav", "txt": "五环内商品住宅的成交在市场中并非主流"} -{"key": "BAC009S0768W0146", "wav": "./aishell/wav/test/S0768/BAC009S0768W0146.wav", "txt": "一位房企人士认为五环内项目的稀缺性难以改变"} -{"key": "BAC009S0768W0147", "wav": "./aishell/wav/test/S0768/BAC009S0768W0147.wav", "txt": "新京报讯记者张徐报道"} -{"key": "BAC009S0768W0148", "wav": "./aishell/wav/test/S0768/BAC009S0768W0148.wav", "txt": "北京去年土地出让落下大幕"} -{"key": "BAC009S0768W0149", "wav": "./aishell/wav/test/S0768/BAC009S0768W0149.wav", "txt": "在丰台潘家村一宗商业用地低价成交后"} -{"key": "BAC009S0768W0150", "wav": "./aishell/wav/test/S0768/BAC009S0768W0150.wav", "txt": "北京今年的土地出让金锁定在两千亿元"} -{"key": "BAC009S0768W0151", "wav": "./aishell/wav/test/S0768/BAC009S0768W0151.wav", "txt": "同比前年增长五成"} -{"key": "BAC009S0768W0152", "wav": "./aishell/wav/test/S0768/BAC009S0768W0152.wav", "txt": "丰台区域潘家村危改三号地成为今年的收官地质块"} -{"key": "BAC009S0768W0153", "wav": "./aishell/wav/test/S0768/BAC009S0768W0153.wav", "txt": "这宗零售商业用地位于南三环外"} -{"key": "BAC009S0768W0154", "wav": 
"./aishell/wav/test/S0768/BAC009S0768W0154.wav", "txt": "邻近地铁十号线首竟贸站"} -{"key": "BAC009S0768W0155", "wav": "./aishell/wav/test/S0768/BAC009S0768W0155.wav", "txt": "规划建筑面积约五万平方米"} -{"key": "BAC009S0768W0156", "wav": "./aishell/wav/test/S0768/BAC009S0768W0156.wav", "txt": "潘家村地块体量较小"} -{"key": "BAC009S0768W0157", "wav": "./aishell/wav/test/S0768/BAC009S0768W0157.wav", "txt": "未必吸引太多擅长上规模开发的企业"} -{"key": "BAC009S0768W0158", "wav": "./aishell/wav/test/S0768/BAC009S0768W0158.wav", "txt": "因此最终仅有龙湖地产一家报价"} -{"key": "BAC009S0768W0159", "wav": "./aishell/wav/test/S0768/BAC009S0768W0159.wav", "txt": "龙湖即以低价五点五亿元拿地"} -{"key": "BAC009S0768W0160", "wav": "./aishell/wav/test/S0768/BAC009S0768W0160.wav", "txt": "楼面价折合约一万元每平方米"} -{"key": "BAC009S0768W0161", "wav": "./aishell/wav/test/S0768/BAC009S0768W0161.wav", "txt": "龙湖地产有关人士对记者表示"} -{"key": "BAC009S0768W0162", "wav": "./aishell/wav/test/S0768/BAC009S0768W0162.wav", "txt": "龙湖已经在丰台有土地储备"} -{"key": "BAC009S0768W0163", "wav": "./aishell/wav/test/S0768/BAC009S0768W0163.wav", "txt": "未来还将继续深耕丰台区域"} -{"key": "BAC009S0768W0164", "wav": "./aishell/wav/test/S0768/BAC009S0768W0164.wav", "txt": "龙湖在丰台西局撤资三十亿元拿地"} -{"key": "BAC009S0768W0165", "wav": "./aishell/wav/test/S0768/BAC009S0768W0165.wav", "txt": "纯商品房楼面价接近六万元每平方米"} -{"key": "BAC009S0768W0166", "wav": "./aishell/wav/test/S0768/BAC009S0768W0166.wav", "txt": "并不代表全年土地市场行情走低"} -{"key": "BAC009S0768W0167", "wav": "./aishell/wav/test/S0768/BAC009S0768W0167.wav", "txt": "今年北京土地市场仍然是高温状态"} -{"key": "BAC009S0768W0168", "wav": "./aishell/wav/test/S0768/BAC009S0768W0168.wav", "txt": "特别是一至四月土地出让金即破千亿元"} -{"key": "BAC009S0768W0169", "wav": "./aishell/wav/test/S0768/BAC009S0768W0169.wav", "txt": "根据北京中原地产统计"} -{"key": "BAC009S0768W0170", "wav": "./aishell/wav/test/S0768/BAC009S0768W0170.wav", "txt": "去年北京共出让五十宗居住楼用地"} -{"key": "BAC009S0768W0171", "wav": "./aishell/wav/test/S0768/BAC009S0768W0171.wav", "txt": "规划建筑面积合计九百万平方米"} -{"key": "BAC009S0768W0172", "wav": "./aishell/wav/test/S0768/BAC009S0768W0172.wav", "txt": "出让金合计一千亿元"} -{"key": "BAC009S0768W0173", "wav": "./aishell/wav/test/S0768/BAC009S0768W0173.wav", "txt": "整体平均楼面价折合一万元每平方米"} -{"key": "BAC009S0768W0174", "wav": "./aishell/wav/test/S0768/BAC009S0768W0174.wav", "txt": "这一平均楼面价较年前的九千元每平方米"} -{"key": "BAC009S0768W0175", "wav": "./aishell/wav/test/S0768/BAC009S0768W0175.wav", "txt": "北京中原地产首席分析师张大伟认为"} -{"key": "BAC009S0768W0176", "wav": "./aishell/wav/test/S0768/BAC009S0768W0176.wav", "txt": "一二线城市特别是京沪这样的核心城市"} -{"key": "BAC009S0768W0177", "wav": "./aishell/wav/test/S0768/BAC009S0768W0177.wav", "txt": "投资价值更好房企看好"} -{"key": "BAC009S0768W0178", "wav": "./aishell/wav/test/S0768/BAC009S0768W0178.wav", "txt": "加上优质地块的连续供应"} -{"key": "BAC009S0768W0179", "wav": "./aishell/wav/test/S0768/BAC009S0768W0179.wav", "txt": "促成了今年北京土地市场的走高"} -{"key": "BAC009S0768W0180", "wav": "./aishell/wav/test/S0768/BAC009S0768W0180.wav", "txt": "通州新城彩虹之门用地挂出三十日"} -{"key": "BAC009S0768W0181", "wav": "./aishell/wav/test/S0768/BAC009S0768W0181.wav", "txt": "记者从北京市国土局网站看到"} -{"key": "BAC009S0768W0182", "wav": "./aishell/wav/test/S0768/BAC009S0768W0182.wav", "txt": "通州运河核心区一宗多功能用地挂出"} -{"key": "BAC009S0768W0183", "wav": "./aishell/wav/test/S0768/BAC009S0768W0183.wav", "txt": "将于明年投标"} -{"key": "BAC009S0768W0184", "wav": "./aishell/wav/test/S0768/BAC009S0768W0184.wav", "txt": "该地块位于通州新城五河交汇处东南角"} -{"key": "BAC009S0768W0185", "wav": "./aishell/wav/test/S0768/BAC009S0768W0185.wav", "txt": "规划建筑面积为四十万平方米"} -{"key": "BAC009S0768W0186", "wav": "./aishell/wav/test/S0768/BAC009S0768W0186.wav", 
"txt": "据记者从多个渠道了解"} -{"key": "BAC009S0768W0187", "wav": "./aishell/wav/test/S0768/BAC009S0768W0187.wav", "txt": "分档制定了中央部门收取的考务费统一上限标准"} -{"key": "BAC009S0768W0188", "wav": "./aishell/wav/test/S0768/BAC009S0768W0188.wav", "txt": "考虑到地方组织考试的成本相对比较固定"} -{"key": "BAC009S0768W0189", "wav": "./aishell/wav/test/S0768/BAC009S0768W0189.wav", "txt": "即各省在考务费标准基础上"} -{"key": "BAC009S0768W0190", "wav": "./aishell/wav/test/S0768/BAC009S0768W0190.wav", "txt": "实践技能操作和面试类考试科目"} -{"key": "BAC009S0768W0191", "wav": "./aishell/wav/test/S0768/BAC009S0768W0191.wav", "txt": "需配备租赁精密仪器专业设备大型场地"} -{"key": "BAC009S0768W0192", "wav": "./aishell/wav/test/S0768/BAC009S0768W0192.wav", "txt": "考试过程需要消耗相关材料或需聘请专业面试考官的"} -{"key": "BAC009S0768W0193", "wav": "./aishell/wav/test/S0768/BAC009S0768W0193.wav", "txt": "由于影响成本的因素过多"} -{"key": "BAC009S0768W0194", "wav": "./aishell/wav/test/S0768/BAC009S0768W0194.wav", "txt": "由各省根据实际成本制定"} -{"key": "BAC009S0768W0195", "wav": "./aishell/wav/test/S0768/BAC009S0768W0195.wav", "txt": "三是促进考务成本降低和考试单位合并"} -{"key": "BAC009S0768W0196", "wav": "./aishell/wav/test/S0768/BAC009S0768W0196.wav", "txt": "对考务费实行统一标准化管理后"} -{"key": "BAC009S0768W0197", "wav": "./aishell/wav/test/S0768/BAC009S0768W0197.wav", "txt": "而是改按统一合理的平均成本确定"} -{"key": "BAC009S0768W0198", "wav": "./aishell/wav/test/S0768/BAC009S0768W0198.wav", "txt": "将切实减轻考生经济负担"} -{"key": "BAC009S0768W0199", "wav": "./aishell/wav/test/S0768/BAC009S0768W0199.wav", "txt": "改革将对考试单位的费用支出形成倒逼机制"} -{"key": "BAC009S0768W0200", "wav": "./aishell/wav/test/S0768/BAC009S0768W0200.wav", "txt": "促使考试单位自觉降低成本由于形不成规模效益"} -{"key": "BAC009S0768W0201", "wav": "./aishell/wav/test/S0768/BAC009S0768W0201.wav", "txt": "一些规模较小的考试机构也将自动寻求合并"} -{"key": "BAC009S0768W0202", "wav": "./aishell/wav/test/S0768/BAC009S0768W0202.wav", "txt": "利用价格杠杆促进考试单位向集约化发展"} -{"key": "BAC009S0768W0203", "wav": "./aishell/wav/test/S0768/BAC009S0768W0203.wav", "txt": "他就上述关注问题指出"} -{"key": "BAC009S0768W0204", "wav": "./aishell/wav/test/S0768/BAC009S0768W0204.wav", "txt": "目前我国出现政府性债务违约可能性并不大"} -{"key": "BAC009S0768W0205", "wav": "./aishell/wav/test/S0768/BAC009S0768W0205.wav", "txt": "下一步将进一步完善城投债卷发行制度和防范风险机制"} -{"key": "BAC009S0768W0206", "wav": "./aishell/wav/test/S0768/BAC009S0768W0206.wav", "txt": "并尽快建立我国地方政府债务管理体系等"} -{"key": "BAC009S0768W0207", "wav": "./aishell/wav/test/S0768/BAC009S0768W0207.wav", "txt": "政府性违约可能性不大"} -{"key": "BAC009S0768W0208", "wav": "./aishell/wav/test/S0768/BAC009S0768W0208.wav", "txt": "中国证卷报随着欧美等国主权债务危机陆续爆发"} -{"key": "BAC009S0768W0209", "wav": "./aishell/wav/test/S0768/BAC009S0768W0209.wav", "txt": "您如何看待政府的举债行为和债务风险"} -{"key": "BAC009S0768W0210", "wav": "./aishell/wav/test/S0768/BAC009S0768W0210.wav", "txt": "徐林吸取欧美等国主权债务危机的教训"} -{"key": "BAC009S0768W0211", "wav": "./aishell/wav/test/S0768/BAC009S0768W0211.wav", "txt": "采取必要措施加强政府债务管理"} -{"key": "BAC009S0768W0212", "wav": "./aishell/wav/test/S0768/BAC009S0768W0212.wav", "txt": "防范我国政府债务风险"} -{"key": "BAC009S0768W0213", "wav": "./aishell/wav/test/S0768/BAC009S0768W0213.wav", "txt": "但在具体评估我国地方政府债务风险程度时"} -{"key": "BAC009S0768W0214", "wav": "./aishell/wav/test/S0768/BAC009S0768W0214.wav", "txt": "也要看到我国与欧美国家的不同之处"} -{"key": "BAC009S0768W0215", "wav": "./aishell/wav/test/S0768/BAC009S0768W0215.wav", "txt": "我国地方政府性债务"} -{"key": "BAC009S0768W0216", "wav": "./aishell/wav/test/S0768/BAC009S0768W0216.wav", "txt": "特别是地方投投融资平台公司形成的债务"} -{"key": "BAC009S0768W0217", "wav": "./aishell/wav/test/S0768/BAC009S0768W0217.wav", "txt": "主要由于各各种基础设施的投资建设"} -{"key": "BAC009S0768W0218", "wav": 
"./aishell/wav/test/S0768/BAC009S0768W0218.wav", "txt": "当代人和后代人共同承担债务还本付息责任"} -{"key": "BAC009S0768W0219", "wav": "./aishell/wav/test/S0768/BAC009S0768W0219.wav", "txt": "可以更好地体现代际公平"} -{"key": "BAC009S0768W0220", "wav": "./aishell/wav/test/S0768/BAC009S0768W0220.wav", "txt": "克服当期建设资金不足的瓶颈制约"} -{"key": "BAC009S0768W0221", "wav": "./aishell/wav/test/S0768/BAC009S0768W0221.wav", "txt": "有利于加快完善基础设施和投资环境"} -{"key": "BAC009S0768W0222", "wav": "./aishell/wav/test/S0768/BAC009S0768W0222.wav", "txt": "是一种合理的基础设施投融资金建设行为"} -{"key": "BAC009S0768W0223", "wav": "./aishell/wav/test/S0768/BAC009S0768W0223.wav", "txt": "政府举债建设形成大量资金"} -{"key": "BAC009S0768W0224", "wav": "./aishell/wav/test/S0768/BAC009S0768W0224.wav", "txt": "相当部分资产具有长期的直接收益"} -{"key": "BAC009S0768W0225", "wav": "./aishell/wav/test/S0768/BAC009S0768W0225.wav", "txt": "一些没有直接收益的项目"} -{"key": "BAC009S0768W0226", "wav": "./aishell/wav/test/S0768/BAC009S0768W0226.wav", "txt": "也具有间接的经济效益或社会效益"} -{"key": "BAC009S0768W0227", "wav": "./aishell/wav/test/S0768/BAC009S0768W0227.wav", "txt": "对促进当地经济增长和政府财力的增长"} -{"key": "BAC009S0768W0228", "wav": "./aishell/wav/test/S0768/BAC009S0768W0228.wav", "txt": "不能简单地用寅吃卯粮来作价值判断"} -{"key": "BAC009S0768W0229", "wav": "./aishell/wav/test/S0768/BAC009S0768W0229.wav", "txt": "这并不意味着政府可以无节制地借债"} -{"key": "BAC009S0768W0230", "wav": "./aishell/wav/test/S0768/BAC009S0768W0230.wav", "txt": "关键是要把投资规模和债务规模"} -{"key": "BAC009S0768W0231", "wav": "./aishell/wav/test/S0768/BAC009S0768W0231.wav", "txt": "控制在合理的范围内"} -{"key": "BAC009S0768W0232", "wav": "./aishell/wav/test/S0768/BAC009S0768W0232.wav", "txt": "防止出现系统性的偿债风险"} -{"key": "BAC009S0768W0233", "wav": "./aishell/wav/test/S0768/BAC009S0768W0233.wav", "txt": "国务院高度重视防范地方政府债务风险"} -{"key": "BAC009S0768W0234", "wav": "./aishell/wav/test/S0768/BAC009S0768W0234.wav", "txt": "从二零零九年下半年就开始要求有关部门调研这一问题"} -{"key": "BAC009S0768W0235", "wav": "./aishell/wav/test/S0768/BAC009S0768W0235.wav", "txt": "国家审计署还专门组织力量"} -{"key": "BAC009S0768W0236", "wav": "./aishell/wav/test/S0768/BAC009S0768W0236.wav", "txt": "对全国各地的政府债务进行啦严格审计"} -{"key": "BAC009S0768W0237", "wav": "./aishell/wav/test/S0768/BAC009S0768W0237.wav", "txt": "审计署的审计结论表明"} -{"key": "BAC009S0768W0238", "wav": "./aishell/wav/test/S0768/BAC009S0768W0238.wav", "txt": "我国地方政府的累积债务相对于偿付能力来看"} -{"key": "BAC009S0768W0239", "wav": "./aishell/wav/test/S0768/BAC009S0768W0239.wav", "txt": "远低于发生债务危机的欧美国家"} -{"key": "BAC009S0768W0240", "wav": "./aishell/wav/test/S0768/BAC009S0768W0240.wav", "txt": "考虑到我国正处在经济快速增长期"} -{"key": "BAC009S0768W0241", "wav": "./aishell/wav/test/S0768/BAC009S0768W0241.wav", "txt": "政府财力增长也相当较快"} -{"key": "BAC009S0768W0242", "wav": "./aishell/wav/test/S0768/BAC009S0768W0242.wav", "txt": "政府还拥有较多的可变现资产"} -{"key": "BAC009S0768W0243", "wav": "./aishell/wav/test/S0768/BAC009S0768W0243.wav", "txt": "相对于目前的负债规模"} -{"key": "BAC009S0768W0244", "wav": "./aishell/wav/test/S0768/BAC009S0768W0244.wav", "txt": "政府总体上具有较强的偿债能力"} -{"key": "BAC009S0768W0245", "wav": "./aishell/wav/test/S0768/BAC009S0768W0245.wav", "txt": "采取积极有效的措施化解部分地区和领域的债务风险"} -{"key": "BAC009S0768W0246", "wav": "./aishell/wav/test/S0768/BAC009S0768W0246.wav", "txt": "在我国出现政府性债务违约的可能性是不大的"} -{"key": "BAC009S0768W0247", "wav": "./aishell/wav/test/S0768/BAC009S0768W0247.wav", "txt": "债卷市场城投债卷发行不畅"} -{"key": "BAC009S0768W0248", "wav": "./aishell/wav/test/S0768/BAC009S0768W0248.wav", "txt": "从城投债券发行监管部门的角度"} -{"key": "BAC009S0768W0249", "wav": "./aishell/wav/test/S0768/BAC009S0768W0249.wav", "txt": "您如何看待这一现象"} -{"key": "BAC009S0768W0250", "wav": 
"./aishell/wav/test/S0768/BAC009S0768W0250.wav", "txt": "徐林出于对地方政府债务风险的担忧"} -{"key": "BAC009S0768W0251", "wav": "./aishell/wav/test/S0768/BAC009S0768W0251.wav", "txt": "投资者采取措施防范风险是成熟的表现"} -{"key": "BAC009S0768W0252", "wav": "./aishell/wav/test/S0768/BAC009S0768W0252.wav", "txt": "但出于对我国地方政府债务风险的不合理判断"} -{"key": "BAC009S0768W0253", "wav": "./aishell/wav/test/S0768/BAC009S0768W0253.wav", "txt": "神雕的机身四周装有分布式有源相控阵雷达天线"} -{"key": "BAC009S0768W0254", "wav": "./aishell/wav/test/S0768/BAC009S0768W0254.wav", "txt": "可以提供三六零度无死角的早期预警"} -{"key": "BAC009S0768W0255", "wav": "./aishell/wav/test/S0768/BAC009S0768W0255.wav", "txt": "它的雷达可能采用了双波段设计"} -{"key": "BAC009S0768W0259", "wav": "./aishell/wav/test/S0768/BAC009S0768W0259.wav", "txt": "该机的雷达还具备合成孔径工作能力"} -{"key": "BAC009S0768W0260", "wav": "./aishell/wav/test/S0768/BAC009S0768W0260.wav", "txt": "可用于侦察缓慢移动的地面和海面目标"} -{"key": "BAC009S0768W0261", "wav": "./aishell/wav/test/S0768/BAC009S0768W0261.wav", "txt": "神雕还有一定的隐身特性"} -{"key": "BAC009S0768W0262", "wav": "./aishell/wav/test/S0768/BAC009S0768W0262.wav", "txt": "加上它凭借远程雷达与对方舰队保持远距离"} -{"key": "BAC009S0768W0263", "wav": "./aishell/wav/test/S0768/BAC009S0768W0263.wav", "txt": "如果神雕大量服役和部署"} -{"key": "BAC009S0768W0264", "wav": "./aishell/wav/test/S0768/BAC009S0768W0264.wav", "txt": "在战区上空形成有效韧的信息网络"} -{"key": "BAC009S0768W0265", "wav": "./aishell/wav/test/S0768/BAC009S0768W0265.wav", "txt": "那将会是中国海空军的战力倍增器之一"} -{"key": "BAC009S0768W0266", "wav": "./aishell/wav/test/S0768/BAC009S0768W0266.wav", "txt": "高空长航时战略无人机"} -{"key": "BAC009S0768W0267", "wav": "./aishell/wav/test/S0768/BAC009S0768W0267.wav", "txt": "全球鹰并不能独占鳌头"} -{"key": "BAC009S0768W0268", "wav": "./aishell/wav/test/S0768/BAC009S0768W0268.wav", "txt": "继二零一一年出现独特的连翼造型的翔龙无人机以后"} -{"key": "BAC009S0768W0269", "wav": "./aishell/wav/test/S0768/BAC009S0768W0269.wav", "txt": "又一款个性十足的双机身气动外形的大型无人机神雕"} -{"key": "BAC009S0768W0271", "wav": "./aishell/wav/test/S0768/BAC009S0768W0271.wav", "txt": "据新华社电美国国际贸易委员会二十一日作出终裁"} -{"key": "BAC009S0768W0272", "wav": "./aishell/wav/test/S0768/BAC009S0768W0272.wav", "txt": "从台湾地区进口的此类产品存在切销行为"} -{"key": "BAC009S0768W0273", "wav": "./aishell/wav/test/S0768/BAC009S0768W0273.wav", "txt": "美国国际贸易委员会称"} -{"key": "BAC009S0768W0274", "wav": "./aishell/wav/test/S0768/BAC009S0768W0274.wav", "txt": "在征收反倾销或反补贴税之前"} -{"key": "BAC009S0768W0275", "wav": "./aishell/wav/test/S0768/BAC009S0768W0275.wav", "txt": "美商务部与国际贸易委员会都需作出肯定性终裁"} -{"key": "BAC009S0768W0276", "wav": "./aishell/wav/test/S0768/BAC009S0768W0276.wav", "txt": "商务部裁定切销或补贴幅度"} -{"key": "BAC009S0768W0277", "wav": "./aishell/wav/test/S0768/BAC009S0768W0277.wav", "txt": "根据美国商务部去年十二月份终裁确定的幅度"} -{"key": "BAC009S0768W0278", "wav": "./aishell/wav/test/S0768/BAC009S0768W0278.wav", "txt": "针对中美光伏贸易纠纷"} -{"key": "BAC009S0768W0279", "wav": "./aishell/wav/test/S0768/BAC009S0768W0279.wav", "txt": "中国商务部已明确表示"} -{"key": "BAC009S0768W0280", "wav": "./aishell/wav/test/S0768/BAC009S0768W0280.wav", "txt": "再次对中国光伏产品发起双反调查并试图征收高额关税"} -{"key": "BAC009S0768W0281", "wav": "./aishell/wav/test/S0768/BAC009S0768W0281.wav", "txt": "中方对此表示强烈不满"} -{"key": "BAC009S0768W0282", "wav": "./aishell/wav/test/S0768/BAC009S0768W0282.wav", "txt": "美方对中国产品进行限制的做法"} -{"key": "BAC009S0768W0283", "wav": "./aishell/wav/test/S0768/BAC009S0768W0283.wav", "txt": "是对贸易救济措施的滥用"} -{"key": "BAC009S0768W0284", "wav": "./aishell/wav/test/S0768/BAC009S0768W0284.wav", "txt": "势必使用中美光伏贸易纠纷再度升级"} -{"key": "BAC009S0768W0285", "wav": "./aishell/wav/test/S0768/BAC009S0768W0285.wav", "txt": "美国智库学学者和太阳能行业协会也多次警告"} -{"key": 
"BAC009S0768W0286", "wav": "./aishell/wav/test/S0768/BAC009S0768W0286.wav", "txt": "许多美国太阳太阳能制造商依赖于全球光伏供应链"} -{"key": "BAC009S0768W0287", "wav": "./aishell/wav/test/S0768/BAC009S0768W0287.wav", "txt": "并减少太阳能产业相关就业岗位"} -{"key": "BAC009S0768W0288", "wav": "./aishell/wav/test/S0768/BAC009S0768W0288.wav", "txt": "美初裁中国产轮胎倾销"} -{"key": "BAC009S0768W0289", "wav": "./aishell/wav/test/S0768/BAC009S0768W0289.wav", "txt": "据新华社电美国商务部二十一日宣布初裁结果"} -{"key": "BAC009S0768W0290", "wav": "./aishell/wav/test/S0768/BAC009S0768W0290.wav", "txt": "认定从中国进口的乘用车和轻型卡车轮胎存在倾销行为"} -{"key": "BAC009S0768W0291", "wav": "./aishell/wav/test/S0768/BAC009S0768W0291.wav", "txt": "美商务部当天发表声明说"} -{"key": "BAC009S0768W0292", "wav": "./aishell/wav/test/S0768/BAC009S0768W0292.wav", "txt": "倾销幅度从百分之十七至百分之九十九"} -{"key": "BAC009S0768W0293", "wav": "./aishell/wav/test/S0768/BAC009S0768W0293.wav", "txt": "基于倾销幅度的初裁结果"} -{"key": "BAC009S0768W0294", "wav": "./aishell/wav/test/S0768/BAC009S0768W0294.wav", "txt": "就美国对中国产轮胎发起双坊调查"} -{"key": "BAC009S0768W0295", "wav": "./aishell/wav/test/S0768/BAC009S0768W0295.wav", "txt": "中国商务部曾表示强烈反对"} -{"key": "BAC009S0768W0296", "wav": "./aishell/wav/test/S0768/BAC009S0768W0296.wav", "txt": "认为美国此举违反世界贸易组织规则和美国国内法"} -{"key": "BAC009S0768W0297", "wav": "./aishell/wav/test/S0768/BAC009S0768W0297.wav", "txt": "希望美方吸取前车之鉴"} -{"key": "BAC009S0768W0298", "wav": "./aishell/wav/test/S0768/BAC009S0768W0298.wav", "txt": "避免破坏两国相关产业的贸易与合作"} -{"key": "BAC009S0768W0299", "wav": "./aishell/wav/test/S0768/BAC009S0768W0299.wav", "txt": "据新华社电美国国际贸易委员会二十一日作出终裁"} -{"key": "BAC009S0768W0300", "wav": "./aishell/wav/test/S0768/BAC009S0768W0300.wav", "txt": "从台湾地区进口的此类产品存在倾销行为"} -{"key": "BAC009S0768W0301", "wav": "./aishell/wav/test/S0768/BAC009S0768W0301.wav", "txt": "这意味着美国将对相关产品"} -{"key": "BAC009S0768W0303", "wav": "./aishell/wav/test/S0768/BAC009S0768W0303.wav", "txt": "成飞集成百二十一九十"} -{"key": "BAC009S0768W0305", "wav": "./aishell/wav/test/S0768/BAC009S0768W0305.wav", "txt": "公司上半年营业收入六点三四亿元"} -{"key": "BAC009S0768W0307", "wav": "./aishell/wav/test/S0768/BAC009S0768W0307.wav", "txt": "从而获取用户信息的案件"} -{"key": "BAC009S0768W0308", "wav": "./aishell/wav/test/S0768/BAC009S0768W0308.wav", "txt": "杨某等四人一同在深圳成立了安丰公司"} -{"key": "BAC009S0768W0309", "wav": "./aishell/wav/test/S0768/BAC009S0768W0309.wav", "txt": "公司主要从事计算机手机的软件开发业务"} -{"key": "BAC009S0768W0310", "wav": "./aishell/wav/test/S0768/BAC009S0768W0310.wav", "txt": "由于安丰公司的业务不景气"} -{"key": "BAC009S0768W0311", "wav": "./aishell/wav/test/S0768/BAC009S0768W0311.wav", "txt": "杨某等四人经过商议"} -{"key": "BAC009S0768W0312", "wav": "./aishell/wav/test/S0768/BAC009S0768W0312.wav", "txt": "决定由麦德公司的技术部门研发静默插件"} -{"key": "BAC009S0768W0313", "wav": "./aishell/wav/test/S0768/BAC009S0768W0313.wav", "txt": "使用户在刷机过程中"} -{"key": "BAC009S0768W0314", "wav": "./aishell/wav/test/S0768/BAC009S0768W0314.wav", "txt": "不知不觉地安装上公司开发的插件"} -{"key": "BAC009S0768W0315", "wav": "./aishell/wav/test/S0768/BAC009S0768W0315.wav", "txt": "而手机被安装上这一插件后"} -{"key": "BAC009S0768W0316", "wav": "./aishell/wav/test/S0768/BAC009S0768W0316.wav", "txt": "公司不仅可以向手机推送软件广告等商业性电子信息"} -{"key": "BAC009S0768W0317", "wav": "./aishell/wav/test/S0768/BAC009S0768W0317.wav", "txt": "安丰公司的广告网页是他们推送的重要内容"} -{"key": "BAC009S0768W0318", "wav": "./aishell/wav/test/S0768/BAC009S0768W0318.wav", "txt": "他们通过这个插件已获利广告收入二十馀万元"} -{"key": "BAC009S0768W0319", "wav": "./aishell/wav/test/S0768/BAC009S0768W0319.wav", "txt": "同案被捕的马某等四人是公司技术部门的员工"} -{"key": "BAC009S0768W0320", "wav": "./aishell/wav/test/S0768/BAC009S0768W0320.wav", "txt": 
"软件开发是领导的授意"} -{"key": "BAC009S0768W0321", "wav": "./aishell/wav/test/S0768/BAC009S0768W0321.wav", "txt": "自己只是执行公司的工作要求"} -{"key": "BAC009S0768W0322", "wav": "./aishell/wav/test/S0768/BAC009S0768W0322.wav", "txt": "三百六十软件识别出了麦德公司的插件"} -{"key": "BAC009S0768W0323", "wav": "./aishell/wav/test/S0768/BAC009S0768W0323.wav", "txt": "将其列为恶意软件用户称其为流氓软件"} -{"key": "BAC009S0768W0324", "wav": "./aishell/wav/test/S0768/BAC009S0768W0324.wav", "txt": "马某等人进一步完善了插件"} -{"key": "BAC009S0768W0325", "wav": "./aishell/wav/test/S0768/BAC009S0768W0325.wav", "txt": "再次利用同样的静默安装方式继续推广软件"} -{"key": "BAC009S0768W0326", "wav": "./aishell/wav/test/S0768/BAC009S0768W0326.wav", "txt": "二十馀万部手机遭殃"} -{"key": "BAC009S0768W0327", "wav": "./aishell/wav/test/S0768/BAC009S0768W0327.wav", "txt": "在被公安机关查获后"} -{"key": "BAC009S0768W0328", "wav": "./aishell/wav/test/S0768/BAC009S0768W0328.wav", "txt": "警方在麦德公司数据库中发现"} -{"key": "BAC009S0768W0329", "wav": "./aishell/wav/test/S0768/BAC009S0768W0329.wav", "txt": "获取到的通讯录近两千万条"} -{"key": "BAC009S0768W0330", "wav": "./aishell/wav/test/S0768/BAC009S0768W0330.wav", "txt": "判处有期徒刑三年六个月"} -{"key": "BAC009S0768W0331", "wav": "./aishell/wav/test/S0768/BAC009S0768W0331.wav", "txt": "其馀九人获刑一年五个月至三年不等"} -{"key": "BAC009S0768W0332", "wav": "./aishell/wav/test/S0768/BAC009S0768W0332.wav", "txt": "依据国家相关法律法规"} -{"key": "BAC009S0768W0333", "wav": "./aishell/wav/test/S0768/BAC009S0768W0333.wav", "txt": "杨某等人在明知插件功能的情况下"} -{"key": "BAC009S0768W0334", "wav": "./aishell/wav/test/S0768/BAC009S0768W0334.wav", "txt": "未经用户同意将该插件预置到呃用户手机中"} -{"key": "BAC009S0768W0335", "wav": "./aishell/wav/test/S0768/BAC009S0768W0335.wav", "txt": "非法获取用户身份认证信息"} -{"key": "BAC009S0768W0336", "wav": "./aishell/wav/test/S0768/BAC009S0768W0336.wav", "txt": "已经构成了对他人计算机信息系统的侵入控制"} -{"key": "BAC009S0768W0337", "wav": "./aishell/wav/test/S0768/BAC009S0768W0337.wav", "txt": "侵犯了公民的合法权益"} -{"key": "BAC009S0768W0338", "wav": "./aishell/wav/test/S0768/BAC009S0768W0338.wav", "txt": "强劲犀利的拳法与膝法破坏力惊人"} -{"key": "BAC009S0768W0339", "wav": "./aishell/wav/test/S0768/BAC009S0768W0339.wav", "txt": "二零一五年初在南京的笼斗中"} -{"key": "BAC009S0768W0340", "wav": "./aishell/wav/test/S0768/BAC009S0768W0340.wav", "txt": "徐永昊以雷霆万钧之势缔造了一场震撼的秒杀之作"} -{"key": "BAC009S0768W0341", "wav": "./aishell/wav/test/S0768/BAC009S0768W0341.wav", "txt": "迅即杀狠的站立技术令人惊叹不已"} -{"key": "BAC009S0768W0342", "wav": "./aishell/wav/test/S0768/BAC009S0768W0342.wav", "txt": "也是我喜欢的格斗方式"} -{"key": "BAC009S0768W0343", "wav": "./aishell/wav/test/S0768/BAC009S0768W0343.wav", "txt": "我都会对站立技术进行重点强化"} -{"key": "BAC009S0768W0344", "wav": "./aishell/wav/test/S0768/BAC009S0768W0344.wav", "txt": "让自己的攻击变得更快更狠"} -{"key": "BAC009S0768W0345", "wav": "./aishell/wav/test/S0768/BAC009S0768W0345.wav", "txt": "对于这场比赛的备战"} -{"key": "BAC009S0768W0346", "wav": "./aishell/wav/test/S0768/BAC009S0768W0346.wav", "txt": "我在重点强化站立技术的同时"} -{"key": "BAC009S0768W0347", "wav": "./aishell/wav/test/S0768/BAC009S0768W0347.wav", "txt": "也对地面技术和防摔技术上做了很多针对性的训练"} -{"key": "BAC009S0768W0348", "wav": "./aishell/wav/test/S0768/BAC009S0768W0348.wav", "txt": "对于综合能力的严苛要求"} -{"key": "BAC009S0768W0349", "wav": "./aishell/wav/test/S0768/BAC009S0768W0349.wav", "txt": "是综合格斗运动的一大特色"} -{"key": "BAC009S0768W0350", "wav": "./aishell/wav/test/S0768/BAC009S0768W0350.wav", "txt": "相较于其精湛凶猛的站立技术"} -{"key": "BAC009S0768W0351", "wav": "./aishell/wav/test/S0768/BAC009S0768W0351.wav", "txt": "徐永昊的地面技术无疑是其格斗体系中的一块短板"} -{"key": "BAC009S0768W0352", "wav": "./aishell/wav/test/S0768/BAC009S0768W0352.wav", "txt": "上一场同包尔江的比赛之后"} -{"key": 
"BAC009S0768W0353", "wav": "./aishell/wav/test/S0768/BAC009S0768W0353.wav", "txt": "一个强项跟弱项同样突出的拳手"} -{"key": "BAC009S0768W0354", "wav": "./aishell/wav/test/S0768/BAC009S0768W0354.wav", "txt": "是很难成为真正的王者"} -{"key": "BAC009S0768W0355", "wav": "./aishell/wav/test/S0768/BAC009S0768W0355.wav", "txt": "我必须要变的更加全面"} -{"key": "BAC009S0768W0356", "wav": "./aishell/wav/test/S0768/BAC009S0768W0356.wav", "txt": "此次昆罗决中俄对抗赛上"} -{"key": "BAC009S0768W0357", "wav": "./aishell/wav/test/S0768/BAC009S0768W0357.wav", "txt": "对于代表中国战队略先出阵的徐永昊来讲"} -{"key": "BAC009S0768W0358", "wav": "./aishell/wav/test/S0768/BAC009S0768W0358.wav", "txt": "这无疑又是一次严峻的考验"} -{"key": "BAC009S0768W0359", "wav": "./aishell/wav/test/S0768/BAC009S0768W0359.wav", "txt": "也是其对于自身技术全面性提高程度的一次检验"} -{"key": "BAC009S0768W0360", "wav": "./aishell/wav/test/S0768/BAC009S0768W0360.wav", "txt": "我这次的对手水平很高"} -{"key": "BAC009S0768W0361", "wav": "./aishell/wav/test/S0768/BAC009S0768W0361.wav", "txt": "拳法和摔跤能力很出色"} -{"key": "BAC009S0768W0362", "wav": "./aishell/wav/test/S0768/BAC009S0768W0362.wav", "txt": "而在谈及此次应敌的策略时"} -{"key": "BAC009S0768W0363", "wav": "./aishell/wav/test/S0768/BAC009S0768W0363.wav", "txt": "我不会改变自己擅长的风格"} -{"key": "BAC009S0768W0364", "wav": "./aishell/wav/test/S0768/BAC009S0768W0364.wav", "txt": "这次比赛我会用胜利证明自己的实力"} -{"key": "BAC009S0768W0365", "wav": "./aishell/wav/test/S0768/BAC009S0768W0365.wav", "txt": "二零一五年世界田径锦标赛即将在北京拉开序幕"} -{"key": "BAC009S0768W0366", "wav": "./aishell/wav/test/S0768/BAC009S0768W0366.wav", "txt": "近日德郭队公布了参加此次世锦赛的六六人大名单"} -{"key": "BAC009S0768W0367", "wav": "./aishell/wav/test/S0768/BAC009S0768W0367.wav", "txt": "上届莫斯科世锦赛上拿到了金牌的四位选手悉数出战"} -{"key": "BAC009S0768W0368", "wav": "./aishell/wav/test/S0768/BAC009S0768W0368.wav", "txt": "主教练对于这支以老带新的队伍也充满了自信"} -{"key": "BAC009S0768W0369", "wav": "./aishell/wav/test/S0768/BAC009S0768W0369.wav", "txt": "上届莫斯科世锦赛上拿到的金牌的四位选手悉数出战"} -{"key": "BAC009S0768W0370", "wav": "./aishell/wav/test/S0768/BAC009S0768W0370.wav", "txt": "包括前秋运动员维斯多尔"} -{"key": "BAC009S0768W0371", "wav": "./aishell/wav/test/S0768/BAC009S0768W0371.wav", "txt": "撑杆跳选手拉斐尔霍尔泽德佩"} -{"key": "BAC009S0768W0372", "wav": "./aishell/wav/test/S0768/BAC009S0768W0372.wav", "txt": "哈特灵今年饱受十字韧带伤势困扰"} -{"key": "BAC009S0768W0373", "wav": "./aishell/wav/test/S0768/BAC009S0768W0373.wav", "txt": "他是否接受外卡参赛要视情况而定"} -{"key": "BAC009S0768W0374", "wav": "./aishell/wav/test/S0768/BAC009S0768W0374.wav", "txt": "德国队此次以老带新"} -{"key": "BAC009S0768W0375", "wav": "./aishell/wav/test/S0768/BAC009S0768W0375.wav", "txt": "这也是他一年四记来第一次参加世锦赛"} -{"key": "BAC009S0768W0376", "wav": "./aishell/wav/test/S0768/BAC009S0768W0376.wav", "txt": "也有经验丰富的老队员"} -{"key": "BAC009S0768W0377", "wav": "./aishell/wav/test/S0768/BAC009S0768W0377.wav", "txt": "我相信每个人都会付出一切来为团队力争最好的成绩"} -{"key": "BAC009S0768W0378", "wav": "./aishell/wav/test/S0768/BAC009S0768W0378.wav", "txt": "附二零一五田径世锦赛德国队名单"} -{"key": "BAC009S0768W0379", "wav": "./aishell/wav/test/S0768/BAC009S0768W0379.wav", "txt": "一百米塞文基尼菲尔斯"} -{"key": "BAC009S0768W0380", "wav": "./aishell/wav/test/S0768/BAC009S0768W0380.wav", "txt": "二百米罗宾埃尔瓦"} -{"key": "BAC009S0768W0381", "wav": "./aishell/wav/test/S0768/BAC009S0768W0381.wav", "txt": "八百米罗宾斯切姆贝拉"} -{"key": "BAC009S0768W0382", "wav": "./aishell/wav/test/S0768/BAC009S0768W0382.wav", "txt": "五千米理查德灵格"} -{"key": "BAC009S0768W0383", "wav": "./aishell/wav/test/S0768/BAC009S0768W0383.wav", "txt": "一万米阿尔恩加比乌斯"} -{"key": "BAC009S0768W0384", "wav": "./aishell/wav/test/S0768/BAC009S0768W0384.wav", "txt": "一百一十米栏马特里亚斯布赫雷尔"} -{"key": "BAC009S0768W0385", "wav": 
"./aishell/wav/test/S0768/BAC009S0768W0385.wav", "txt": "格里格尔特拉贝尔"} -{"key": "BAC009S0768W0386", "wav": "./aishell/wav/test/S0768/BAC009S0768W0386.wav", "txt": "马特伍兹菲兹比亚尔科"} -{"key": "BAC009S0768W0387", "wav": "./aishell/wav/test/S0768/BAC009S0768W0387.wav", "txt": "撑杆跳拉斐尔霍尔泽德斯佩"} -{"key": "BAC009S0768W0388", "wav": "./aishell/wav/test/S0768/BAC009S0768W0388.wav", "txt": "托比亚斯斯切尔巴尔斯"} -{"key": "BAC009S0768W0389", "wav": "./aishell/wav/test/S0768/BAC009S0768W0389.wav", "txt": "跳远阿莱恩卡马拉"} -{"key": "BAC009S0768W0390", "wav": "./aishell/wav/test/S0768/BAC009S0768W0390.wav", "txt": "铅球达维斯多尔"} -{"key": "BAC009S0768W0391", "wav": "./aishell/wav/test/S0768/BAC009S0768W0391.wav", "txt": "铁饼克里斯托弗哈特灵"} -{"key": "BAC009S0768W0392", "wav": "./aishell/wav/test/S0768/BAC009S0768W0392.wav", "txt": "标枪拉尔斯哈曼恩"} -{"key": "BAC009S0768W0393", "wav": "./aishell/wav/test/S0768/BAC009S0768W0393.wav", "txt": "全能里科费雷姆斯"} -{"key": "BAC009S0768W0394", "wav": "./aishell/wav/test/S0768/BAC009S0768W0394.wav", "txt": "迈克尔斯齐莱德尔"} -{"key": "BAC009S0768W0395", "wav": "./aishell/wav/test/S0768/BAC009S0768W0395.wav", "txt": "二零千米竞走尼尔斯布莱姆巴号"} -{"key": "BAC009S0768W0396", "wav": "./aishell/wav/test/S0768/BAC009S0768W0396.wav", "txt": "五零千米竞走卡尔多赫曼"} -{"key": "BAC009S0768W0397", "wav": "./aishell/wav/test/S0768/BAC009S0768W0397.wav", "txt": "四乘一百米接力罗伯特哈特灵"} -{"key": "BAC009S0768W0398", "wav": "./aishell/wav/test/S0768/BAC009S0768W0398.wav", "txt": "卢卡斯亚库比泽克"} -{"key": "BAC009S0768W0399", "wav": "./aishell/wav/test/S0768/BAC009S0768W0399.wav", "txt": "亚历山大克塞诺科夫"} -{"key": "BAC009S0768W0400", "wav": "./aishell/wav/test/S0768/BAC009S0768W0400.wav", "txt": "雅莱克斯欧帕拉迪尼门格"} -{"key": "BAC009S0768W0401", "wav": "./aishell/wav/test/S0768/BAC009S0768W0401.wav", "txt": "一百米莱贝卡哈塞"} -{"key": "BAC009S0768W0402", "wav": "./aishell/wav/test/S0768/BAC009S0768W0402.wav", "txt": "吉娜卢克肯科姆普尔"} -{"key": "BAC009S0768W0403", "wav": "./aishell/wav/test/S0768/BAC009S0768W0403.wav", "txt": "八百米克里斯蒂娜哈灵"} -{"key": "BAC009S0768W0404", "wav": "./aishell/wav/test/S0768/BAC009S0768W0404.wav", "txt": "这部命运多旭的电影"} -{"key": "BAC009S0768W0405", "wav": "./aishell/wav/test/S0768/BAC009S0768W0405.wav", "txt": "原本计划在今年六月正式开机"} -{"key": "BAC009S0768W0406", "wav": "./aishell/wav/test/S0768/BAC009S0768W0406.wav", "txt": "可现在已经全部泡汤"} -{"key": "BAC009S0768W0407", "wav": "./aishell/wav/test/S0768/BAC009S0768W0407.wav", "txt": "科林之前已经积极的支持影片拍摄"} -{"key": "BAC009S0768W0408", "wav": "./aishell/wav/test/S0768/BAC009S0768W0408.wav", "txt": "圆圆的脸蛋非常的可爱"} -{"key": "BAC009S0768W0409", "wav": "./aishell/wav/test/S0768/BAC009S0768W0409.wav", "txt": "此照片萌翻众网友"} -{"key": "BAC009S0768W0410", "wav": "./aishell/wav/test/S0768/BAC009S0768W0410.wav", "txt": "纷纷留言点赞"} -{"key": "BAC009S0768W0411", "wav": "./aishell/wav/test/S0768/BAC009S0768W0411.wav", "txt": "称哈哈哈性感的不要不要的"} -{"key": "BAC009S0768W0412", "wav": "./aishell/wav/test/S0768/BAC009S0768W0412.wav", "txt": "自小卖得一脸好萌"} -{"key": "BAC009S0768W0413", "wav": "./aishell/wav/test/S0768/BAC009S0768W0413.wav", "txt": "搜狐娱乐讯据香港媒体报道"} -{"key": "BAC009S0768W0414", "wav": "./aishell/wav/test/S0768/BAC009S0768W0414.wav", "txt": "诞下很像天华的小宝贝"} -{"key": "BAC009S0768W0415", "wav": "./aishell/wav/test/S0768/BAC009S0768W0415.wav", "txt": "一向都是在圈子中人缘甚佳的谢天华"} -{"key": "BAC009S0768W0416", "wav": "./aishell/wav/test/S0768/BAC009S0768W0416.wav", "txt": "使得宝宝刚出生就有了一大班星星级干爹干娘"} -{"key": "BAC009S0768W0417", "wav": "./aishell/wav/test/S0768/BAC009S0768W0417.wav", "txt": "搜狐娱乐讯据香港媒体报道"} -{"key": "BAC009S0768W0418", "wav": 
"./aishell/wav/test/S0768/BAC009S0768W0418.wav", "txt": "艺人谢婷婷出席活动时"} -{"key": "BAC009S0768W0419", "wav": "./aishell/wav/test/S0768/BAC009S0768W0419.wav", "txt": "被问到有传其胞兄谢霆锋将与王菲结婚"} -{"key": "BAC009S0768W0420", "wav": "./aishell/wav/test/S0768/BAC009S0768W0420.wav", "txt": "她回应是么"} -{"key": "BAC009S0768W0421", "wav": "./aishell/wav/test/S0768/BAC009S0768W0421.wav", "txt": "没有人同我讲"} -{"key": "BAC009S0768W0422", "wav": "./aishell/wav/test/S0768/BAC009S0768W0422.wav", "txt": "好多传闻我都不会特别问他"} -{"key": "BAC009S0768W0423", "wav": "./aishell/wav/test/S0768/BAC009S0768W0423.wav", "txt": "是真的话他自己会同我讲"} -{"key": "BAC009S0768W0424", "wav": "./aishell/wav/test/S0768/BAC009S0768W0424.wav", "txt": "想不想他再次成家立室"} -{"key": "BAC009S0768W0425", "wav": "./aishell/wav/test/S0768/BAC009S0768W0425.wav", "txt": "他开心就好"} -{"key": "BAC009S0768W0426", "wav": "./aishell/wav/test/S0768/BAC009S0768W0426.wav", "txt": "不过要看他心情工作同家人相处同小朋友"} -{"key": "BAC009S0768W0427", "wav": "./aishell/wav/test/S0768/BAC009S0768W0427.wav", "txt": "各样都平衡得好处理得好"} -{"key": "BAC009S0768W0428", "wav": "./aishell/wav/test/S0768/BAC009S0768W0428.wav", "txt": "结婚都只是一张纸同戒指"} -{"key": "BAC009S0768W0430", "wav": "./aishell/wav/test/S0768/BAC009S0768W0430.wav", "txt": "搜狐娱乐讯据香港媒体报道"} -{"key": "BAC009S0768W0431", "wav": "./aishell/wav/test/S0768/BAC009S0768W0431.wav", "txt": "为了给自己的爱犬盖狗舍及休息场所"} -{"key": "BAC009S0768W0432", "wav": "./aishell/wav/test/S0768/BAC009S0768W0432.wav", "txt": "他指使他人虚开发票六万馀元用公款报销"} -{"key": "BAC009S0768W0433", "wav": "./aishell/wav/test/S0768/BAC009S0768W0433.wav", "txt": "贾某被市三中院终审判处有期徒刑两年八个月"} -{"key": "BAC009S0768W0434", "wav": "./aishell/wav/test/S0768/BAC009S0768W0434.wav", "txt": "村书记被村民驾车撞倒身亡肇事者已被刑拘"} -{"key": "BAC009S0768W0435", "wav": "./aishell/wav/test/S0768/BAC009S0768W0435.wav", "txt": "京华时报讯记者迟名常鑫前天中午近一一点半"} -{"key": "BAC009S0768W0436", "wav": "./aishell/wav/test/S0768/BAC009S0768W0436.wav", "txt": "大兴区礼贤镇紫各庄村"} -{"key": "BAC009S0768W0437", "wav": "./aishell/wav/test/S0768/BAC009S0768W0437.wav", "txt": "村书记乔俊然在家门前被一辆轿车撞倒后"} -{"key": "BAC009S0768W0438", "wav": "./aishell/wav/test/S0768/BAC009S0768W0438.wav", "txt": "肇事者为紫各庄村民乔某"} -{"key": "BAC009S0768W0439", "wav": "./aishell/wav/test/S0768/BAC009S0768W0439.wav", "txt": "大兴警方以涉嫌交通肇事罪将肇事者刑事拘留"} -{"key": "BAC009S0768W0440", "wav": "./aishell/wav/test/S0768/BAC009S0768W0440.wav", "txt": "案件正在进一步调查中"} -{"key": "BAC009S0768W0441", "wav": "./aishell/wav/test/S0768/BAC009S0768W0441.wav", "txt": "村书记骗补助被判一一年受审辩称不了解政策"} -{"key": "BAC009S0768W0442", "wav": "./aishell/wav/test/S0768/BAC009S0768W0442.wav", "txt": "新京报讯记者王巍利用村里遭受泥石流灾害后"} -{"key": "BAC009S0768W0443", "wav": "./aishell/wav/test/S0768/BAC009S0768W0443.wav", "txt": "政府出钱搬迁盖房的机会"} -{"key": "BAC009S0768W0444", "wav": "./aishell/wav/test/S0768/BAC009S0768W0444.wav", "txt": "延庆县永宁镇偏坡峪村原党支部书记钱某"} -{"key": "BAC009S0768W0445", "wav": "./aishell/wav/test/S0768/BAC009S0768W0445.wav", "txt": "将不应享受政府的两个女儿作为搬迁户上报"} -{"key": "BAC009S0768W0446", "wav": "./aishell/wav/test/S0768/BAC009S0768W0446.wav", "txt": "骗取搬迁补助资金用于支付搬迁安置房费用"} -{"key": "BAC009S0768W0447", "wav": "./aishell/wav/test/S0768/BAC009S0768W0447.wav", "txt": "延庆法院一审判决认为"} -{"key": "BAC009S0768W0448", "wav": "./aishell/wav/test/S0768/BAC009S0768W0448.wav", "txt": "钱某贪污一二馀万元拆迁款"} -{"key": "BAC009S0768W0449", "wav": "./aishell/wav/test/S0768/BAC009S0768W0449.wav", "txt": "判处有期徒刑一一年"} -{"key": "BAC009S0768W0450", "wav": "./aishell/wav/test/S0768/BAC009S0768W0450.wav", "txt": "村内常有蛇出没疑从养蛇村民中爬出"} -{"key": "BAC009S0768W0451", "wav": 
"./aishell/wav/test/S0768/BAC009S0768W0451.wav", "txt": "信息时报讯记者陈子玉近日"} -{"key": "BAC009S0768W0452", "wav": "./aishell/wav/test/S0768/BAC009S0768W0452.wav", "txt": "白云区钟落潭竹一村民白云区钟落潭竹一村的村民跟记者报料"} -{"key": "BAC009S0768W0453", "wav": "./aishell/wav/test/S0768/BAC009S0768W0453.wav", "txt": "说最近他们村里经常有蛇出没"} -{"key": "BAC009S0768W0454", "wav": "./aishell/wav/test/S0768/BAC009S0768W0454.wav", "txt": "甚至还会爬到村民家中"} -{"key": "BAC009S0768W0455", "wav": "./aishell/wav/test/S0768/BAC009S0768W0455.wav", "txt": "他们怀疑是有人在村里养蛇所致"} -{"key": "BAC009S0768W0456", "wav": "./aishell/wav/test/S0768/BAC009S0768W0456.wav", "txt": "蛇主刘先生表示以后将不在家里养蛇"} -{"key": "BAC009S0768W0457", "wav": "./aishell/wav/test/S0768/BAC009S0768W0457.wav", "txt": "村医研发神奇止痛药网销全全国获刑三年"} -{"key": "BAC009S0768W0458", "wav": "./aishell/wav/test/S0768/BAC009S0768W0458.wav", "txt": "村卫生室医师兼职黑b超记者暗访结束被跟踪"} -{"key": "BAC009S0768W0459", "wav": "./aishell/wav/test/S0768/BAC009S0768W0459.wav", "txt": "明着是大兴区黄村镇狼垡三村的医师"} -{"key": "BAC009S0768W0460", "wav": "./aishell/wav/test/S0768/BAC009S0768W0460.wav", "txt": "暗地里却发布小广告揽客"} -{"key": "BAC009S0768W0461", "wav": "./aishell/wav/test/S0768/BAC009S0768W0461.wav", "txt": "村妇为缓解丈夫病痛种罂丽当药用被判刑六个月"} -{"key": "BAC009S0768W0462", "wav": "./aishell/wav/test/S0768/BAC009S0768W0462.wav", "txt": "曲靖一村妇竟在自家菜地内非法种植罂丽一零四二株"} -{"key": "BAC009S0768W0463", "wav": "./aishell/wav/test/S0768/BAC009S0768W0463.wav", "txt": "用罂丽熬汤为丈夫止痛"} -{"key": "BAC009S0768W0464", "wav": "./aishell/wav/test/S0768/BAC009S0768W0464.wav", "txt": "该村妇因犯非法种植毒品原植物罪"} -{"key": "BAC009S0768W0465", "wav": "./aishell/wav/test/S0768/BAC009S0768W0465.wav", "txt": "被麒麟区法院判处有期徒刑六个月"} -{"key": "BAC009S0768W0466", "wav": "./aishell/wav/test/S0768/BAC009S0768W0466.wav", "txt": "并处罚金人民币一千元"} -{"key": "BAC009S0768W0467", "wav": "./aishell/wav/test/S0768/BAC009S0768W0467.wav", "txt": "村妇将一零万元现金埋地底四年多已腐烂成碎块"} -{"key": "BAC009S0768W0468", "wav": "./aishell/wav/test/S0768/BAC009S0768W0468.wav", "txt": "村委会主任因经济问题两次被免第三次当选惹争议"} -{"key": "BAC009S0768W0469", "wav": "./aishell/wav/test/S0768/BAC009S0768W0469.wav", "txt": "张绵跃当选村委会主任"} -{"key": "BAC009S0768W0470", "wav": "./aishell/wav/test/S0768/BAC009S0768W0470.wav", "txt": "村委会在农田搭起违法建筑每年收租金一四万元"} -{"key": "BAC009S0768W0471", "wav": "./aishell/wav/test/S0768/BAC009S0768W0471.wav", "txt": "奉化江口儒江村村委会却带头盖起了违法建筑"} -{"key": "BAC009S0768W0472", "wav": "./aishell/wav/test/S0768/BAC009S0768W0472.wav", "txt": "记者接到这样的报料"} -{"key": "BAC009S0768W0473", "wav": "./aishell/wav/test/S0768/BAC009S0768W0473.wav", "txt": "三改一拆可以说是一条红线"} -{"key": "BAC009S0768W0474", "wav": "./aishell/wav/test/S0768/BAC009S0768W0474.wav", "txt": "村委会竟然会顶风作案"} -{"key": "BAC009S0768W0475", "wav": "./aishell/wav/test/S0768/BAC009S0768W0475.wav", "txt": "记者和宁波市三三改一拆办工作人员前往现场"} -{"key": "BAC009S0768W0476", "wav": "./aishell/wav/test/S0768/BAC009S0768W0476.wav", "txt": "这听起来多少有些匪夷所思的违建竟然是真的"} -{"key": "BAC009S0768W0477", "wav": "./aishell/wav/test/S0768/BAC009S0768W0477.wav", "txt": "村官一顿工作餐吃二六个菜挂钩蹲点领导被诫勉谈话"} -{"key": "BAC009S0768W0478", "wav": "./aishell/wav/test/S0768/BAC009S0768W0478.wav", "txt": "一顿工作餐竟上二六个菜"} -{"key": "BAC009S0768W0479", "wav": "./aishell/wav/test/S0768/BAC009S0768W0479.wav", "txt": "且逢餐必有烟酒从园区领导到村组干部"} -{"key": "BAC009S0768W0480", "wav": "./aishell/wav/test/S0768/BAC009S0768W0480.wav", "txt": "在严查四风的高压态势下"} -{"key": "BAC009S0768W0481", "wav": "./aishell/wav/test/S0768/BAC009S0768W0481.wav", "txt": "以公务招待为名大肆公款吃喝"} -{"key": "BAC009S0768W0482", "wav": "./aishell/wav/test/S0768/BAC009S0768W0482.wav", "txt": "村官借四零零多户居民三亿一携款失联"} -{"key": 
"BAC009S0768W0483", "wav": "./aishell/wav/test/S0768/BAC009S0768W0483.wav", "txt": "村官接连顶风违纪其子领证为热闹摆酒六七桌"} -{"key": "BAC009S0768W0484", "wav": "./aishell/wav/test/S0768/BAC009S0768W0484.wav", "txt": "编者按为深入贯彻落实中央八项规定精神"} -{"key": "BAC009S0768W0485", "wav": "./aishell/wav/test/S0768/BAC009S0768W0485.wav", "txt": "按照中央纪委宣传部的统一部署"} -{"key": "BAC009S0768W0486", "wav": "./aishell/wav/test/S0768/BAC009S0768W0486.wav", "txt": "陆续派记者深入采访"} -{"key": "BAC009S0768W0487", "wav": "./aishell/wav/test/S0768/BAC009S0768W0487.wav", "txt": "进一步加大舆论监督力度"} -{"key": "BAC009S0768W0488", "wav": "./aishell/wav/test/S0768/BAC009S0768W0488.wav", "txt": "通报一个教育一批震灭一片"} -{"key": "BAC009S0768W0489", "wav": "./aishell/wav/test/S0768/BAC009S0768W0489.wav", "txt": "释放出中央执纪必严紧抓不放的强烈信号"} -{"key": "BAC009S0768W0490", "wav": "./aishell/wav/test/S0768/BAC009S0768W0490.wav", "txt": "广大领导干部要以引以为戒守住底线"} -{"key": "BAC009S0768W0491", "wav": "./aishell/wav/test/S0768/BAC009S0768W0491.wav", "txt": "坚决不在四风问题上犯错犯错误跌跟头"} -{"key": "BAC009S0768W0492", "wav": "./aishell/wav/test/S0768/BAC009S0768W0492.wav", "txt": "村官涉不雅视频被免职饭桌上摸女子胸部臀部等"} -{"key": "BAC009S0768W0493", "wav": "./aishell/wav/test/S0768/BAC009S0768W0493.wav", "txt": "村官私刻公章侵占二八万粮补派人殴打上访村民"} -{"key": "BAC009S0768W0494", "wav": "./aishell/wav/test/S0768/BAC009S0768W0494.wav", "txt": "党和人民不会管到我身上来"} -{"key": "BAC009S0768W0495", "wav": "./aishell/wav/test/S0768/BAC009S0768W0495.wav", "txt": "侵吞征地种粮补偿款"} -{"key": "BAC009S0769W0121", "wav": "./aishell/wav/test/S0769/BAC009S0769W0121.wav", "txt": "该地块即为通州新城核心地标彩虹之门用地"} -{"key": "BAC009S0769W0122", "wav": "./aishell/wav/test/S0769/BAC009S0769W0122.wav", "txt": "北京通州新城投资公司网站显示"} -{"key": "BAC009S0769W0123", "wav": "./aishell/wav/test/S0769/BAC009S0769W0123.wav", "txt": "彩虹之门建筑净高三十米"} -{"key": "BAC009S0769W0124", "wav": "./aishell/wav/test/S0769/BAC009S0769W0124.wav", "txt": "为双拱形非中心对称建筑"} -{"key": "BAC009S0769W0125", "wav": "./aishell/wav/test/S0769/BAC009S0769W0125.wav", "txt": "新京报讯记者张旭报道"} -{"key": "BAC009S0769W0126", "wav": "./aishell/wav/test/S0769/BAC009S0769W0126.wav", "txt": "北京去年土地出让落下大幕"} -{"key": "BAC009S0769W0127", "wav": "./aishell/wav/test/S0769/BAC009S0769W0127.wav", "txt": "在丰台樊家村一宗商业用地底价成交后"} -{"key": "BAC009S0769W0128", "wav": "./aishell/wav/test/S0769/BAC009S0769W0128.wav", "txt": "北京今年的土地出让金锁定在两千亿元"} -{"key": "BAC009S0769W0129", "wav": "./aishell/wav/test/S0769/BAC009S0769W0129.wav", "txt": "同比去年增长五成"} -{"key": "BAC009S0769W0130", "wav": "./aishell/wav/test/S0769/BAC009S0769W0130.wav", "txt": "市政府决定今年将全面加快棚户区改造步伐"} -{"key": "BAC009S0769W0131", "wav": "./aishell/wav/test/S0769/BAC009S0769W0131.wav", "txt": "确保完成六万户搬迁改造任务"} -{"key": "BAC009S0769W0132", "wav": "./aishell/wav/test/S0769/BAC009S0769W0132.wav", "txt": "推进上百个棚改项目全面启动实施"} -{"key": "BAC009S0769W0133", "wav": "./aishell/wav/test/S0769/BAC009S0769W0133.wav", "txt": "今年北京要建设筹集各类保障房十万套"} -{"key": "BAC009S0769W0134", "wav": "./aishell/wav/test/S0769/BAC009S0769W0134.wav", "txt": "各区县力争完成十五万套开工任务竣工八万套"} -{"key": "BAC009S0769W0135", "wav": "./aishell/wav/test/S0769/BAC009S0769W0135.wav", "txt": "开工建设公租房不低于三万套"} -{"key": "BAC009S0769W0136", "wav": "./aishell/wav/test/S0769/BAC009S0769W0136.wav", "txt": "为了确保保障房住宅的优良品质"} -{"key": "BAC009S0769W0137", "wav": "./aishell/wav/test/S0769/BAC009S0769W0137.wav", "txt": "北京将继续改进住宅产业化推进方式"} -{"key": "BAC009S0769W0138", "wav": "./aishell/wav/test/S0769/BAC009S0769W0138.wav", "txt": "推行标准化装配式装修"} -{"key": "BAC009S0769W0139", "wav": "./aishell/wav/test/S0769/BAC009S0769W0139.wav", "txt": "前年至今年期间"} -{"key": "BAC009S0769W0140", 
"wav": "./aishell/wav/test/S0769/BAC009S0769W0140.wav", "txt": "北京要筹集建设各类保障性住房一百万套"} -{"key": "BAC009S0769W0141", "wav": "./aishell/wav/test/S0769/BAC009S0769W0141.wav", "txt": "为改善中低收入家庭住房条件"} -{"key": "BAC009S0769W0142", "wav": "./aishell/wav/test/S0769/BAC009S0769W0142.wav", "txt": "今年北京除了建设保障性住房外"} -{"key": "BAC009S0769W0143", "wav": "./aishell/wav/test/S0769/BAC009S0769W0143.wav", "txt": "还加大棚户区的改造任务"} -{"key": "BAC009S0769W0144", "wav": "./aishell/wav/test/S0769/BAC009S0769W0144.wav", "txt": "各区县各单位要按照下达的任务指标"} -{"key": "BAC009S0769W0145", "wav": "./aishell/wav/test/S0769/BAC009S0769W0145.wav", "txt": "确保完成今年六万户棚户区改造任务"} -{"key": "BAC009S0769W0146", "wav": "./aishell/wav/test/S0769/BAC009S0769W0146.wav", "txt": "今年是十二五规划的收官之年"} -{"key": "BAC009S0769W0147", "wav": "./aishell/wav/test/S0769/BAC009S0769W0147.wav", "txt": "各区县各单位要加强协作配合"} -{"key": "BAC009S0769W0148", "wav": "./aishell/wav/test/S0769/BAC009S0769W0148.wav", "txt": "要重点加大政策支持"} -{"key": "BAC009S0769W0149", "wav": "./aishell/wav/test/S0769/BAC009S0769W0149.wav", "txt": "破解棚户区改造征收瓶颈问题"} -{"key": "BAC009S0769W0150", "wav": "./aishell/wav/test/S0769/BAC009S0769W0150.wav", "txt": "各相关部门要主动服务区县服务各参建单位"} -{"key": "BAC009S0769W0151", "wav": "./aishell/wav/test/S0769/BAC009S0769W0151.wav", "txt": "对于今后棚户区改造中遇到的问题"} -{"key": "BAC009S0769W0152", "wav": "./aishell/wav/test/S0769/BAC009S0769W0152.wav", "txt": "各项目标任务已分解至各区县"} -{"key": "BAC009S0769W0153", "wav": "./aishell/wav/test/S0769/BAC009S0769W0153.wav", "txt": "今年北京将继续加大集体土地建设公租房试点力度"} -{"key": "BAC009S0769W0154", "wav": "./aishell/wav/test/S0769/BAC009S0769W0154.wav", "txt": "加快公租房的配租进度"} -{"key": "BAC009S0769W0155", "wav": "./aishell/wav/test/S0769/BAC009S0769W0155.wav", "txt": "力争配租三万户以上"} -{"key": "BAC009S0769W0156", "wav": "./aishell/wav/test/S0769/BAC009S0769W0156.wav", "txt": "今年北京还将加大社会单位泵租力度"} -{"key": "BAC009S0769W0157", "wav": "./aishell/wav/test/S0769/BAC009S0769W0157.wav", "txt": "市政府决定今年将全面加快棚户区改造步伐"} -{"key": "BAC009S0769W0158", "wav": "./aishell/wav/test/S0769/BAC009S0769W0158.wav", "txt": "确保完成六万户搬迁改造任务"} -{"key": "BAC009S0769W0159", "wav": "./aishell/wav/test/S0769/BAC009S0769W0159.wav", "txt": "今年土地收入预计近四万亿元"} -{"key": "BAC009S0769W0160", "wav": "./aishell/wav/test/S0769/BAC009S0769W0160.wav", "txt": "今年国有土地使用权出让收入四千亿元"} -{"key": "BAC009S0769W0161", "wav": "./aishell/wav/test/S0769/BAC009S0769W0161.wav", "txt": "继前年和去年连续两年突破四万亿元后"} -{"key": "BAC009S0769W0162", "wav": "./aishell/wav/test/S0769/BAC009S0769W0162.wav", "txt": "今年土地收入再维持稳定"} -{"key": "BAC009S0769W0163", "wav": "./aishell/wav/test/S0769/BAC009S0769W0163.wav", "txt": "相关公司股票走势"} -{"key": "BAC009S0769W0164", "wav": "./aishell/wav/test/S0769/BAC009S0769W0164.wav", "txt": "房地产市场竞争加大"} -{"key": "BAC009S0769W0165", "wav": "./aishell/wav/test/S0769/BAC009S0769W0165.wav", "txt": "房企应走差异化路线"} -{"key": "BAC009S0769W0166", "wav": "./aishell/wav/test/S0769/BAC009S0769W0166.wav", "txt": "还有多家机构分析认为"} -{"key": "BAC009S0769W0167", "wav": "./aishell/wav/test/S0769/BAC009S0769W0167.wav", "txt": "政府对今年的土地出让金收入预期下降"} -{"key": "BAC009S0769W0168", "wav": "./aishell/wav/test/S0769/BAC009S0769W0168.wav", "txt": "这暗示房地产的库存大"} -{"key": "BAC009S0769W0169", "wav": "./aishell/wav/test/S0769/BAC009S0769W0169.wav", "txt": "这直接影响到房地产的买地情况"} -{"key": "BAC009S0769W0170", "wav": "./aishell/wav/test/S0769/BAC009S0769W0170.wav", "txt": "相应的房价涨跌"} -{"key": "BAC009S0769W0171", "wav": "./aishell/wav/test/S0769/BAC009S0769W0171.wav", "txt": "如今房地产市场已经供需相对平衡"} -{"key": "BAC009S0769W0172", "wav": 
"./aishell/wav/test/S0769/BAC009S0769W0172.wav", "txt": "甚至开始进入了供过于求的局面"} -{"key": "BAC009S0769W0173", "wav": "./aishell/wav/test/S0769/BAC009S0769W0173.wav", "txt": "但去年住宅土地成交建筑面积仅十二亿平米"} -{"key": "BAC009S0769W0174", "wav": "./aishell/wav/test/S0769/BAC009S0769W0174.wav", "txt": "远低于去年和前年平均的二十亿平米水平"} -{"key": "BAC009S0769W0175", "wav": "./aishell/wav/test/S0769/BAC009S0769W0175.wav", "txt": "除了开发商的买地行为减少"} -{"key": "BAC009S0769W0176", "wav": "./aishell/wav/test/S0769/BAC009S0769W0176.wav", "txt": "全国房地产库存正在堆积"} -{"key": "BAC009S0769W0177", "wav": "./aishell/wav/test/S0769/BAC009S0769W0177.wav", "txt": "而出让的住宅建筑面积总和至少为一百亿平米"} -{"key": "BAC009S0769W0178", "wav": "./aishell/wav/test/S0769/BAC009S0769W0178.wav", "txt": "约可供销售四年"} -{"key": "BAC009S0769W0179", "wav": "./aishell/wav/test/S0769/BAC009S0769W0179.wav", "txt": "开发商整体在手土地充足"} -{"key": "BAC009S0769W0180", "wav": "./aishell/wav/test/S0769/BAC009S0769W0180.wav", "txt": "瑞银分析师丁晓预测"} -{"key": "BAC009S0769W0181", "wav": "./aishell/wav/test/S0769/BAC009S0769W0181.wav", "txt": "预计明年全国土地市场仍难复苏"} -{"key": "BAC009S0769W0182", "wav": "./aishell/wav/test/S0769/BAC009S0769W0182.wav", "txt": "各路开发商一致看好一线城市房地产市场"} -{"key": "BAC009S0769W0183", "wav": "./aishell/wav/test/S0769/BAC009S0769W0183.wav", "txt": "从一月的一线城市的土地成交看"} -{"key": "BAC009S0769W0184", "wav": "./aishell/wav/test/S0769/BAC009S0769W0184.wav", "txt": "溢价率楼面价均处于高位"} -{"key": "BAC009S0769W0185", "wav": "./aishell/wav/test/S0769/BAC009S0769W0185.wav", "txt": "预计后市一二线城市拿地竞争将更加剧烈"} -{"key": "BAC009S0769W0186", "wav": "./aishell/wav/test/S0769/BAC009S0769W0186.wav", "txt": "中原地产首席市场分析师张大伟告诉南都记者"} -{"key": "BAC009S0769W0187", "wav": "./aishell/wav/test/S0769/BAC009S0769W0187.wav", "txt": "并进而对城投债券进行唱空或做空"} -{"key": "BAC009S0769W0188", "wav": "./aishell/wav/test/S0769/BAC009S0769W0188.wav", "txt": "最近企业债券特别是城投债券的发行难度加大"} -{"key": "BAC009S0769W0189", "wav": "./aishell/wav/test/S0769/BAC009S0769W0189.wav", "txt": "发行利率也有较大幅度上升"} -{"key": "BAC009S0769W0190", "wav": "./aishell/wav/test/S0769/BAC009S0769W0190.wav", "txt": "人民银行多次提高存款准备金率和存贷款基准利率"} -{"key": "BAC009S0769W0191", "wav": "./aishell/wav/test/S0769/BAC009S0769W0191.wav", "txt": "不仅是城投债券发行利率"} -{"key": "BAC009S0769W0192", "wav": "./aishell/wav/test/S0769/BAC009S0769W0192.wav", "txt": "债券市场所有品种发行利率整体上都表现出向上的走向"} -{"key": "BAC009S0769W0193", "wav": "./aishell/wav/test/S0769/BAC009S0769W0193.wav", "txt": "导致城投债券发行产生较高的风险溢价"} -{"key": "BAC009S0769W0194", "wav": "./aishell/wav/test/S0769/BAC009S0769W0194.wav", "txt": "城投债券收益率上升"} -{"key": "BAC009S0769W0195", "wav": "./aishell/wav/test/S0769/BAC009S0769W0195.wav", "txt": "对债券投资人来说不是坏事"} -{"key": "BAC009S0769W0196", "wav": "./aishell/wav/test/S0769/BAC009S0769W0196.wav", "txt": "有利于提升城投债券的资产配置价值"} -{"key": "BAC009S0769W0197", "wav": "./aishell/wav/test/S0769/BAC009S0769W0197.wav", "txt": "则需要在发债时机和发债规模上进行合理的把握"} -{"key": "BAC009S0769W0198", "wav": "./aishell/wav/test/S0769/BAC009S0769W0198.wav", "txt": "我个人不赞成这一判断"} -{"key": "BAC009S0769W0199", "wav": "./aishell/wav/test/S0769/BAC009S0769W0199.wav", "txt": "债券发行人是优质的"} -{"key": "BAC009S0769W0200", "wav": "./aishell/wav/test/S0769/BAC009S0769W0200.wav", "txt": "还本付息也是正常的"} -{"key": "BAC009S0769W0201", "wav": "./aishell/wav/test/S0769/BAC009S0769W0201.wav", "txt": "投资者对城投债券风险表现出的恐慌"} -{"key": "BAC009S0769W0202", "wav": "./aishell/wav/test/S0769/BAC009S0769W0202.wav", "txt": "加强城投债监管完善制度建设"} -{"key": "BAC009S0769W0203", "wav": "./aishell/wav/test/S0769/BAC009S0769W0203.wav", "txt": "有的媒体甚至用井喷来描述"} -{"key": "BAC009S0769W0204", "wav": 
"./aishell/wav/test/S0769/BAC009S0769W0204.wav", "txt": "您如何看待城投债券这几年的发展和作用"} -{"key": "BAC009S0769W0205", "wav": "./aishell/wav/test/S0769/BAC009S0769W0205.wav", "txt": "这几年城投债券发行数量的确有所增加"} -{"key": "BAC009S0769W0206", "wav": "./aishell/wav/test/S0769/BAC009S0769W0206.wav", "txt": "地方投融资平台公司通过发行债券进行融资"} -{"key": "BAC009S0769W0207", "wav": "./aishell/wav/test/S0769/BAC009S0769W0207.wav", "txt": "符合提高直接融资比重的要求"} -{"key": "BAC009S0769W0208", "wav": "./aishell/wav/test/S0769/BAC009S0769W0208.wav", "txt": "城投债券也适应了发行人和投资人的需要"} -{"key": "BAC009S0769W0209", "wav": "./aishell/wav/test/S0769/BAC009S0769W0209.wav", "txt": "这是这几年城投债券发行规模不断扩大的主要原因"} -{"key": "BAC009S0769W0210", "wav": "./aishell/wav/test/S0769/BAC009S0769W0210.wav", "txt": "我委核准发行的企业债券累计为七千亿元"} -{"key": "BAC009S0769W0211", "wav": "./aishell/wav/test/S0769/BAC009S0769W0211.wav", "txt": "其中城投债券共发行七千亿元"} -{"key": "BAC009S0769W0212", "wav": "./aishell/wav/test/S0769/BAC009S0769W0212.wav", "txt": "占比只有百分之七"} -{"key": "BAC009S0769W0213", "wav": "./aishell/wav/test/S0769/BAC009S0769W0213.wav", "txt": "城投债券的发行有比较严格的条件"} -{"key": "BAC009S0769W0214", "wav": "./aishell/wav/test/S0769/BAC009S0769W0214.wav", "txt": "从已发行的城投债券用途看"} -{"key": "BAC009S0769W0215", "wav": "./aishell/wav/test/S0769/BAC009S0769W0215.wav", "txt": "保障房建设和棚户区改造"} -{"key": "BAC009S0769W0216", "wav": "./aishell/wav/test/S0769/BAC009S0769W0216.wav", "txt": "城市文化和体育设施"} -{"key": "BAC009S0769W0217", "wav": "./aishell/wav/test/S0769/BAC009S0769W0217.wav", "txt": "地震灾后重建等领域"} -{"key": "BAC009S0769W0218", "wav": "./aishell/wav/test/S0769/BAC009S0769W0218.wav", "txt": "都起到了积极的作用"} -{"key": "BAC009S0769W0219", "wav": "./aishell/wav/test/S0769/BAC009S0769W0219.wav", "txt": "随着我国资本市场的进一步发展"} -{"key": "BAC009S0769W0220", "wav": "./aishell/wav/test/S0769/BAC009S0769W0220.wav", "txt": "城投债券作为中国债券市场的准市政债"} -{"key": "BAC009S0769W0221", "wav": "./aishell/wav/test/S0769/BAC009S0769W0221.wav", "txt": "发行规模还会稳步扩大"} -{"key": "BAC009S0769W0222", "wav": "./aishell/wav/test/S0769/BAC009S0769W0222.wav", "txt": "中国证券报面对市场对城投债券风险的担忧"} -{"key": "BAC009S0769W0223", "wav": "./aishell/wav/test/S0769/BAC009S0769W0223.wav", "txt": "是如何更好地防范城投债券可能出现的风险的"} -{"key": "BAC009S0769W0224", "wav": "./aishell/wav/test/S0769/BAC009S0769W0224.wav", "txt": "虽然已发行的城投债券的还本付息都是正常的"} -{"key": "BAC009S0769W0225", "wav": "./aishell/wav/test/S0769/BAC009S0769W0225.wav", "txt": "城投债作为一个信用产品"} -{"key": "BAC009S0769W0226", "wav": "./aishell/wav/test/S0769/BAC009S0769W0226.wav", "txt": "不可能是完全无风险的"} -{"key": "BAC009S0769W0227", "wav": "./aishell/wav/test/S0769/BAC009S0769W0227.wav", "txt": "我看了以后很受震动"} -{"key": "BAC009S0769W0228", "wav": "./aishell/wav/test/S0769/BAC009S0769W0228.wav", "txt": "虽然报道内容并没有具体的城投债券还本付息违约案"} -{"key": "BAC009S0769W0229", "wav": "./aishell/wav/test/S0769/BAC009S0769W0229.wav", "txt": "但却提醒了我们要更加关注城投债券可能出现的风险"} -{"key": "BAC009S0769W0230", "wav": "./aishell/wav/test/S0769/BAC009S0769W0230.wav", "txt": "并采取措施切实保护债券投资人的合法权益"} -{"key": "BAC009S0769W0231", "wav": "./aishell/wav/test/S0769/BAC009S0769W0231.wav", "txt": "作为城投债券发行监管部门"} -{"key": "BAC009S0769W0232", "wav": "./aishell/wav/test/S0769/BAC009S0769W0232.wav", "txt": "我们对城投债券发行人的审核一直是比较严格的"} -{"key": "BAC009S0769W0233", "wav": "./aishell/wav/test/S0769/BAC009S0769W0233.wav", "txt": "地方投融资平台公司申请发行债券"} -{"key": "BAC009S0769W0234", "wav": "./aishell/wav/test/S0769/BAC009S0769W0234.wav", "txt": "必须符合一些基本的条件企业必须连续三年盈利"} -{"key": "BAC009S0769W0235", "wav": "./aishell/wav/test/S0769/BAC009S0769W0235.wav", "txt": "所投项目必须经过合规性审查"} -{"key": "BAC009S0769W0236", 
"wav": "./aishell/wav/test/S0769/BAC009S0769W0236.wav", "txt": "我们还控制了投融资平台公司发债的范围"} -{"key": "BAC009S0769W0237", "wav": "./aishell/wav/test/S0769/BAC009S0769W0237.wav", "txt": "才能申请发行城投债券"} -{"key": "BAC009S0769W0238", "wav": "./aishell/wav/test/S0769/BAC009S0769W0238.wav", "txt": "就不得再通过发行城投债券新增政府性债务"} -{"key": "BAC009S0769W0239", "wav": "./aishell/wav/test/S0769/BAC009S0769W0239.wav", "txt": "正是有了这样一些严格的规定"} -{"key": "BAC009S0769W0240", "wav": "./aishell/wav/test/S0769/BAC009S0769W0240.wav", "txt": "使得很多投融资平台公司"} -{"key": "BAC009S0769W0241", "wav": "./aishell/wav/test/S0769/BAC009S0769W0241.wav", "txt": "难以满足发行城投债券的资格和条件"} -{"key": "BAC009S0769W0242", "wav": "./aishell/wav/test/S0769/BAC009S0769W0242.wav", "txt": "这在相当程度上控制了城投债劵的发行规模"} -{"key": "BAC009S0769W0243", "wav": "./aishell/wav/test/S0769/BAC009S0769W0243.wav", "txt": "也降低了城投债劵的风险"} -{"key": "BAC009S0769W0244", "wav": "./aishell/wav/test/S0769/BAC009S0769W0244.wav", "txt": "为了控制地方政府本届发债下届还钱的道德风险"} -{"key": "BAC009S0769W0245", "wav": "./aishell/wav/test/S0769/BAC009S0769W0245.wav", "txt": "我们还安排了专门的偿债均摊机制"} -{"key": "BAC009S0769W0246", "wav": "./aishell/wav/test/S0769/BAC009S0769W0246.wav", "txt": "也就是将债劵还本压力在债劵存续期内进行合理分摊"} -{"key": "BAC009S0769W0247", "wav": "./aishell/wav/test/S0769/BAC009S0769W0247.wav", "txt": "避免在最后一年累积过大的还本压力和风险"} -{"key": "BAC009S0769W0248", "wav": "./aishell/wav/test/S0769/BAC009S0769W0248.wav", "txt": "有媒体报道了云投集团等发债企业转移核心资产"} -{"key": "BAC009S0769W0249", "wav": "./aishell/wav/test/S0769/BAC009S0769W0249.wav", "txt": "损害债劵持有人利益的事件"} -{"key": "BAC009S0769W0250", "wav": "./aishell/wav/test/S0769/BAC009S0769W0250.wav", "txt": "并对债券市场形成了不小的冲击"} -{"key": "BAC009S0769W0251", "wav": "./aishell/wav/test/S0769/BAC009S0769W0251.wav", "txt": "你们如何考虑防止这类事件再次发生"} -{"key": "BAC009S0769W0252", "wav": "./aishell/wav/test/S0769/BAC009S0769W0252.wav", "txt": "更好地保护债券投资人的利益"} -{"key": "BAC009S0769W0253", "wav": "./aishell/wav/test/S0769/BAC009S0769W0253.wav", "txt": "据新华社电有病当然要吃药"} -{"key": "BAC009S0769W0254", "wav": "./aishell/wav/test/S0769/BAC009S0769W0254.wav", "txt": "但吃下去的药能否真正作用到病灶就很难说了"} -{"key": "BAC009S0769W0255", "wav": "./aishell/wav/test/S0769/BAC009S0769W0255.wav", "txt": "通过它能够实现药物的精准投送"} -{"key": "BAC009S0769W0256", "wav": "./aishell/wav/test/S0769/BAC009S0769W0256.wav", "txt": "他们开发出一种只有二十微米长的机器人"} -{"key": "BAC009S0769W0257", "wav": "./aishell/wav/test/S0769/BAC009S0769W0257.wav", "txt": "这个机器人由高分子材料制成"} -{"key": "BAC009S0769W0258", "wav": "./aishell/wav/test/S0769/BAC009S0769W0258.wav", "txt": "当它进入动物胃部时"} -{"key": "BAC009S0769W0259", "wav": "./aishell/wav/test/S0769/BAC009S0769W0259.wav", "txt": "锌就会与胃酸发生反应"} -{"key": "BAC009S0769W0260", "wav": "./aishell/wav/test/S0769/BAC009S0769W0260.wav", "txt": "从而推动机器人在胃部前行"} -{"key": "BAC009S0769W0261", "wav": "./aishell/wav/test/S0769/BAC009S0769W0261.wav", "txt": "这种技术很适合用来治疗胃溃疡等胃部疾病"} -{"key": "BAC009S0769W0262", "wav": "./aishell/wav/test/S0769/BAC009S0769W0262.wav", "txt": "高效精准投送药物不仅可降低用药量"} -{"key": "BAC009S0769W0263", "wav": "./aishell/wav/test/S0769/BAC009S0769W0263.wav", "txt": "这项技术离临床应用还有一段距离"} -{"key": "BAC009S0769W0264", "wav": "./aishell/wav/test/S0769/BAC009S0769W0264.wav", "txt": "据新华社电有病当然要吃药"} -{"key": "BAC009S0769W0265", "wav": "./aishell/wav/test/S0769/BAC009S0769W0265.wav", "txt": "但吃下去的药能否真正作用到病灶就很难说了"} -{"key": "BAC009S0769W0266", "wav": "./aishell/wav/test/S0769/BAC009S0769W0266.wav", "txt": "美国政府部门当地时间周四警示称"} -{"key": "BAC009S0769W0267", "wav": "./aishell/wav/test/S0769/BAC009S0769W0267.wav", "txt": "苹果设备的用户应当注意"} -{"key": 
"BAC009S0769W0269", "wav": "./aishell/wav/test/S0769/BAC009S0769W0269.wav", "txt": "不要在弹出窗口点击安装打开应用时"} -{"key": "BAC009S0769W0271", "wav": "./aishell/wav/test/S0769/BAC009S0769W0271.wav", "txt": "苹果公司也在第一时间发布官方声明"} -{"key": "BAC009S0769W0273", "wav": "./aishell/wav/test/S0769/BAC009S0769W0273.wav", "txt": "还没有任何一个用户真正遭受过此攻击"} -{"key": "BAC009S0769W0274", "wav": "./aishell/wav/test/S0769/BAC009S0769W0274.wav", "txt": "我们鼓励用户只从可信任的渠道"} -{"key": "BAC009S0769W0276", "wav": "./aishell/wav/test/S0769/BAC009S0769W0276.wav", "txt": "并注意下载过程中的任何警告"} -{"key": "BAC009S0769W0277", "wav": "./aishell/wav/test/S0769/BAC009S0769W0277.wav", "txt": "企业用户在安装定制应用程序时"} -{"key": "BAC009S0769W0278", "wav": "./aishell/wav/test/S0769/BAC009S0769W0278.wav", "txt": "须从他们公司的安全网站上进行下载并安装"} -{"key": "BAC009S0769W0279", "wav": "./aishell/wav/test/S0769/BAC009S0769W0279.wav", "txt": "美国政府部门当地时间周四警示称"} -{"key": "BAC009S0769W0280", "wav": "./aishell/wav/test/S0769/BAC009S0769W0280.wav", "txt": "苹果设备的用户应当注意"} -{"key": "BAC009S0769W0283", "wav": "./aishell/wav/test/S0769/BAC009S0769W0283.wav", "txt": "据新华社电印度官员透露"} -{"key": "BAC009S0769W0284", "wav": "./aishell/wav/test/S0769/BAC009S0769W0284.wav", "txt": "美国将向印度转让两项军事技术"} -{"key": "BAC009S0769W0285", "wav": "./aishell/wav/test/S0769/BAC009S0769W0285.wav", "txt": "其中包括美国大鸦无人机今后将由印度工厂制造"} -{"key": "BAC009S0769W0286", "wav": "./aishell/wav/test/S0769/BAC009S0769W0286.wav", "txt": "印度斯坦时报二十四日援引消息人士的话报道"} -{"key": "BAC009S0769W0287", "wav": "./aishell/wav/test/S0769/BAC009S0769W0287.wav", "txt": "大鸦无人机由美国航空环境公司研制"} -{"key": "BAC009S0769W0288", "wav": "./aishell/wav/test/S0769/BAC009S0769W0288.wav", "txt": "由士兵直接用手投掷起飞"} -{"key": "BAC009S0769W0289", "wav": "./aishell/wav/test/S0769/BAC009S0769W0289.wav", "txt": "二零零三年以来在阿富汗得到了广泛应用"} -{"key": "BAC009S0769W0290", "wav": "./aishell/wav/test/S0769/BAC009S0769W0290.wav", "txt": "预计从二零一五年下半年开始"} -{"key": "BAC009S0769W0291", "wav": "./aishell/wav/test/S0769/BAC009S0769W0291.wav", "txt": "美国将不再生产大鸦无人机"} -{"key": "BAC009S0769W0292", "wav": "./aishell/wav/test/S0769/BAC009S0769W0292.wav", "txt": "改由设在印度本加卢鲁的一家美印合资公司生产"} -{"key": "BAC009S0769W0293", "wav": "./aishell/wav/test/S0769/BAC009S0769W0293.wav", "txt": "一名印度高级官员透露"} -{"key": "BAC009S0769W0294", "wav": "./aishell/wav/test/S0769/BAC009S0769W0294.wav", "txt": "眼下已有七个国家打算购买大鸦无人机"} -{"key": "BAC009S0769W0295", "wav": "./aishell/wav/test/S0769/BAC009S0769W0295.wav", "txt": "预计订单总额为三十亿美元"} -{"key": "BAC009S0769W0296", "wav": "./aishell/wav/test/S0769/BAC009S0769W0296.wav", "txt": "美国航空环境公司停止生产大鸦无人机后"} -{"key": "BAC009S0769W0297", "wav": "./aishell/wav/test/S0769/BAC009S0769W0297.wav", "txt": "印方工厂将继续完成剩馀订单"} -{"key": "BAC009S0769W0298", "wav": "./aishell/wav/test/S0769/BAC009S0769W0298.wav", "txt": "此外还将与美方联手研制一款升级版大鸦无人机"} -{"key": "BAC009S0769W0299", "wav": "./aishell/wav/test/S0769/BAC009S0769W0299.wav", "txt": "该技术可用于识别隐藏于伪装下的目标"} -{"key": "BAC009S0769W0300", "wav": "./aishell/wav/test/S0769/BAC009S0769W0300.wav", "txt": "从而把运输机转化为更为复杂的远程侦察机"} -{"key": "BAC009S0769W0301", "wav": "./aishell/wav/test/S0769/BAC009S0769W0301.wav", "txt": "美国外交消息人士透露"} -{"key": "BAC009S0769W0302", "wav": "./aishell/wav/test/S0769/BAC009S0769W0302.wav", "txt": "肯德尔定于二月二十三日访问印度"} -{"key": "BAC009S0769W0303", "wav": "./aishell/wav/test/S0769/BAC009S0769W0303.wav", "txt": "且达到情节特别严重程度"} -{"key": "BAC009S0769W0304", "wav": "./aishell/wav/test/S0769/BAC009S0769W0304.wav", "txt": "故依法裁定驳回上诉"} -{"key": "BAC009S0769W0306", "wav": "./aishell/wav/test/S0769/BAC009S0769W0306.wav", "txt": "从而获取用户信息的案件"} -{"key": 
"BAC009S0769W0308", "wav": "./aishell/wav/test/S0769/BAC009S0769W0308.wav", "txt": "虽然工信部很快就删除了后半句话"} -{"key": "BAC009S0769W0309", "wav": "./aishell/wav/test/S0769/BAC009S0769W0309.wav", "txt": "但还是引发业内广泛关注"} -{"key": "BAC009S0769W0310", "wav": "./aishell/wav/test/S0769/BAC009S0769W0310.wav", "txt": "这种宣传方式的目的是什么"} -{"key": "BAC009S0769W0311", "wav": "./aishell/wav/test/S0769/BAC009S0769W0311.wav", "txt": "截至中国经营报记者发稿前"} -{"key": "BAC009S0769W0312", "wav": "./aishell/wav/test/S0769/BAC009S0769W0312.wav", "txt": "浪潮官方尚未给出回应"} -{"key": "BAC009S0769W0313", "wav": "./aishell/wav/test/S0769/BAC009S0769W0313.wav", "txt": "旗下拥有浪潮信息浪潮软件浪潮国际三家上市公司"} -{"key": "BAC009S0769W0314", "wav": "./aishell/wav/test/S0769/BAC009S0769W0314.wav", "txt": "尽管政府对国产品牌有一定扶持"} -{"key": "BAC009S0769W0315", "wav": "./aishell/wav/test/S0769/BAC009S0769W0315.wav", "txt": "浪潮的发展也有可圈可可点之处"} -{"key": "BAC009S0769W0318", "wav": "./aishell/wav/test/S0769/BAC009S0769W0318.wav", "txt": "浪潮信息的研发支出约四亿元"} -{"key": "BAC009S0769W0319", "wav": "./aishell/wav/test/S0769/BAC009S0769W0319.wav", "txt": "占营业收入的比例是五点百分之四十七"} -{"key": "BAC009S0769W0320", "wav": "./aishell/wav/test/S0769/BAC009S0769W0320.wav", "txt": "较上年同期增长八十四点百分之三十九"} -{"key": "BAC009S0769W0321", "wav": "./aishell/wav/test/S0769/BAC009S0769W0321.wav", "txt": "研发支出主要用于服务器产品的研究开发和升级换代"} -{"key": "BAC009S0769W0322", "wav": "./aishell/wav/test/S0769/BAC009S0769W0322.wav", "txt": "研发投入是一个刚性指标"} -{"key": "BAC009S0769W0323", "wav": "./aishell/wav/test/S0769/BAC009S0769W0323.wav", "txt": "与技术的更新换代速度还是有相关性"} -{"key": "BAC009S0769W0324", "wav": "./aishell/wav/test/S0769/BAC009S0769W0324.wav", "txt": "国内几个服务器品牌的盘子还比较小"} -{"key": "BAC009S0769W0325", "wav": "./aishell/wav/test/S0769/BAC009S0769W0325.wav", "txt": "他们的硬件技术研发等力量"} -{"key": "BAC009S0769W0326", "wav": "./aishell/wav/test/S0769/BAC009S0769W0326.wav", "txt": "经验积累不足也是一个大问题"} -{"key": "BAC009S0769W0327", "wav": "./aishell/wav/test/S0769/BAC009S0769W0327.wav", "txt": "国产服务器即使是自主设计"} -{"key": "BAC009S0769W0329", "wav": "./aishell/wav/test/S0769/BAC009S0769W0329.wav", "txt": "核心架构也基本照抄国外厂商"} -{"key": "BAC009S0769W0330", "wav": "./aishell/wav/test/S0769/BAC009S0769W0330.wav", "txt": "在中低端市场或占有相应份额"} -{"key": "BAC009S0769W0331", "wav": "./aishell/wav/test/S0769/BAC009S0769W0331.wav", "txt": "但高端市场仍然难以企及"} -{"key": "BAC009S0769W0332", "wav": "./aishell/wav/test/S0769/BAC009S0769W0332.wav", "txt": "一位股份制银行科技部负责人如此讲述"} -{"key": "BAC009S0769W0334", "wav": "./aishell/wav/test/S0769/BAC009S0769W0334.wav", "txt": "国内厂商在高端核心技术上普遍存有差距"} -{"key": "BAC009S0769W0336", "wav": "./aishell/wav/test/S0769/BAC009S0769W0336.wav", "txt": "浪潮高管在接受媒体采访时表示"} -{"key": "BAC009S0769W0337", "wav": "./aishell/wav/test/S0769/BAC009S0769W0337.wav", "txt": "浪潮将通过产品渠道服务价格的全方位发力"} -{"key": "BAC009S0769W0338", "wav": "./aishell/wav/test/S0769/BAC009S0769W0338.wav", "txt": "一百米栏辛迪罗勒德尔"} -{"key": "BAC009S0769W0339", "wav": "./aishell/wav/test/S0769/BAC009S0769W0339.wav", "txt": "三千米障碍吉萨费里欣塔斯卡鲁塞"} -{"key": "BAC009S0769W0340", "wav": "./aishell/wav/test/S0769/BAC009S0769W0340.wav", "txt": "跳高玛丽劳伦斯荣格菲利斯"} -{"key": "BAC009S0769W0341", "wav": "./aishell/wav/test/S0769/BAC009S0769W0341.wav", "txt": "撑杆跳丽萨莱兹奇"} -{"key": "BAC009S0769W0342", "wav": "./aishell/wav/test/S0769/BAC009S0769W0342.wav", "txt": "跳远莱纳马尔库斯"} -{"key": "BAC009S0769W0343", "wav": "./aishell/wav/test/S0769/BAC009S0769W0343.wav", "txt": "三级跳克里斯丁吉尔奇"} -{"key": "BAC009S0769W0344", "wav": "./aishell/wav/test/S0769/BAC009S0769W0344.wav", "txt": "铅球克里斯蒂娜斯齐万兹"} -{"key": "BAC009S0769W0345", "wav": 
"./aishell/wav/test/S0769/BAC009S0769W0345.wav", "txt": "铁饼沙尼斯克拉夫特"} -{"key": "BAC009S0769W0346", "wav": "./aishell/wav/test/S0769/BAC009S0769W0346.wav", "txt": "链球贝蒂海德尔"} -{"key": "BAC009S0769W0347", "wav": "./aishell/wav/test/S0769/BAC009S0769W0347.wav", "txt": "标枪克里斯丁胡宋"} -{"key": "BAC009S0769W0348", "wav": "./aishell/wav/test/S0769/BAC009S0769W0348.wav", "txt": "克里斯蒂娜奥伯福尔"} -{"key": "BAC009S0769W0349", "wav": "./aishell/wav/test/S0769/BAC009S0769W0349.wav", "txt": "全能詹妮弗奥赛尔"} -{"key": "BAC009S0769W0350", "wav": "./aishell/wav/test/S0769/BAC009S0769W0350.wav", "txt": "四乘一百米接力亚历山大布尔格哈德特"} -{"key": "BAC009S0769W0351", "wav": "./aishell/wav/test/S0769/BAC009S0769W0351.wav", "txt": "安娜莱纳法拉塞"} -{"key": "BAC009S0769W0352", "wav": "./aishell/wav/test/S0769/BAC009S0769W0352.wav", "txt": "吉娜卢克肯科姆普尔"} -{"key": "BAC009S0769W0353", "wav": "./aishell/wav/test/S0769/BAC009S0769W0353.wav", "txt": "孙杨因心脏不适退出一千五百米自由泳决赛"} -{"key": "BAC009S0769W0354", "wav": "./aishell/wav/test/S0769/BAC009S0769W0354.wav", "txt": "无疑是刚刚结束的喀山世锦赛最大的遗憾"} -{"key": "BAC009S0769W0355", "wav": "./aishell/wav/test/S0769/BAC009S0769W0355.wav", "txt": "孙杨在一千五百米自由泳上的实力不容置疑"} -{"key": "BAC009S0769W0356", "wav": "./aishell/wav/test/S0769/BAC009S0769W0356.wav", "txt": "而这一次击败他的不是对手"} -{"key": "BAC009S0769W0357", "wav": "./aishell/wav/test/S0769/BAC009S0769W0357.wav", "txt": "孙杨的心脏不适早就不是秘密"} -{"key": "BAC009S0769W0358", "wav": "./aishell/wav/test/S0769/BAC009S0769W0358.wav", "txt": "是孙杨在二零一四年因治疗心脏不适"} -{"key": "BAC009S0769W0359", "wav": "./aishell/wav/test/S0769/BAC009S0769W0359.wav", "txt": "误服曲美他嗪导致兴奋剂检测呈阳性遭禁赛"} -{"key": "BAC009S0769W0360", "wav": "./aishell/wav/test/S0769/BAC009S0769W0360.wav", "txt": "正是治疗他心悸不适症状的"} -{"key": "BAC009S0769W0361", "wav": "./aishell/wav/test/S0769/BAC009S0769W0361.wav", "txt": "也第一次被媒体关注"} -{"key": "BAC009S0769W0362", "wav": "./aishell/wav/test/S0769/BAC009S0769W0362.wav", "txt": "记者从浙江省游泳协会了解到"} -{"key": "BAC009S0769W0363", "wav": "./aishell/wav/test/S0769/BAC009S0769W0363.wav", "txt": "孙杨就出现过心脏问题"} -{"key": "BAC009S0769W0364", "wav": "./aishell/wav/test/S0769/BAC009S0769W0364.wav", "txt": "孙杨因感冒后出现了胸闷心悸不适等症状"} -{"key": "BAC009S0769W0365", "wav": "./aishell/wav/test/S0769/BAC009S0769W0365.wav", "txt": "专家会诊之后认为孙杨存在心肌缺血情况"} -{"key": "BAC009S0769W0366", "wav": "./aishell/wav/test/S0769/BAC009S0769W0366.wav", "txt": "与感冒病毒感染损伤心肌有关"} -{"key": "BAC009S0769W0367", "wav": "./aishell/wav/test/S0769/BAC009S0769W0367.wav", "txt": "予服用处方药以治疗心肌缺血保护心肌"} -{"key": "BAC009S0769W0368", "wav": "./aishell/wav/test/S0769/BAC009S0769W0368.wav", "txt": "孙杨的心肌损伤是在感冒后引发的"} -{"key": "BAC009S0769W0369", "wav": "./aishell/wav/test/S0769/BAC009S0769W0369.wav", "txt": "心肌同位素扫描显示局部灌注差"} -{"key": "BAC009S0769W0370", "wav": "./aishell/wav/test/S0769/BAC009S0769W0370.wav", "txt": "达到保护心脏的作用"} -{"key": "BAC009S0769W0371", "wav": "./aishell/wav/test/S0769/BAC009S0769W0371.wav", "txt": "是去年备战亚运会选拔赛期间"} -{"key": "BAC009S0769W0372", "wav": "./aishell/wav/test/S0769/BAC009S0769W0372.wav", "txt": "直到二零一四年四月才解禁复出"} -{"key": "BAC009S0769W0373", "wav": "./aishell/wav/test/S0769/BAC009S0769W0373.wav", "txt": "尽管期间孙杨的训练并没有中断"} -{"key": "BAC009S0769W0374", "wav": "./aishell/wav/test/S0769/BAC009S0769W0374.wav", "txt": "但训练量几乎和正常时不可同日而语"} -{"key": "BAC009S0769W0375", "wav": "./aishell/wav/test/S0769/BAC009S0769W0375.wav", "txt": "为了备战亚运会选拔赛"} -{"key": "BAC009S0769W0376", "wav": "./aishell/wav/test/S0769/BAC009S0769W0376.wav", "txt": "在世锦赛决赛检录前突感不适"} -{"key": "BAC009S0769W0377", "wav": "./aishell/wav/test/S0769/BAC009S0769W0377.wav", "txt": 
"也是孙杨整个比赛期间疲劳所致"} -{"key": "BAC009S0769W0378", "wav": "./aishell/wav/test/S0769/BAC009S0769W0378.wav", "txt": "从四百米预赛到最后的一千五百米预赛"} -{"key": "BAC009S0769W0379", "wav": "./aishell/wav/test/S0769/BAC009S0769W0379.wav", "txt": "二百米的高强度无氧到一千五百米的有氧"} -{"key": "BAC009S0769W0380", "wav": "./aishell/wav/test/S0769/BAC009S0769W0380.wav", "txt": "师姐罗雪娟也忍不住落泪"} -{"key": "BAC009S0769W0381", "wav": "./aishell/wav/test/S0769/BAC009S0769W0381.wav", "txt": "回忆起自己从前训练时因心脏不适被抢救的事"} -{"key": "BAC009S0769W0382", "wav": "./aishell/wav/test/S0769/BAC009S0769W0382.wav", "txt": "更大的战场还在里约"} -{"key": "BAC009S0769W0383", "wav": "./aishell/wav/test/S0769/BAC009S0769W0383.wav", "txt": "华西都市报记者陈甘露"} -{"key": "BAC009S0769W0384", "wav": "./aishell/wav/test/S0769/BAC009S0769W0384.wav", "txt": "二零零八年北京奥运会时"} -{"key": "BAC009S0769W0385", "wav": "./aishell/wav/test/S0769/BAC009S0769W0385.wav", "txt": "曾经在鸟巢服务的志愿者们"} -{"key": "BAC009S0769W0386", "wav": "./aishell/wav/test/S0769/BAC009S0769W0386.wav", "txt": "顶级田径赛事再次落户鸟巢"} -{"key": "BAC009S0769W0387", "wav": "./aishell/wav/test/S0769/BAC009S0769W0387.wav", "txt": "如今为这次赛事服务的志愿者们更为年轻"} -{"key": "BAC009S0769W0388", "wav": "./aishell/wav/test/S0769/BAC009S0769W0388.wav", "txt": "他们几乎都是九零后"} -{"key": "BAC009S0769W0389", "wav": "./aishell/wav/test/S0769/BAC009S0769W0389.wav", "txt": "这批志愿者也被称为新鸟巢一代"} -{"key": "BAC009S0769W0390", "wav": "./aishell/wav/test/S0769/BAC009S0769W0390.wav", "txt": "而他们已经为这次田径世锦赛做好了准备"} -{"key": "BAC009S0769W0391", "wav": "./aishell/wav/test/S0769/BAC009S0769W0391.wav", "txt": "并要为国内外运动员献上一张张北京最美的名片"} -{"key": "BAC009S0769W0392", "wav": "./aishell/wav/test/S0769/BAC009S0769W0392.wav", "txt": "在每次大型赛事中志愿者都是必不可少的一部分"} -{"key": "BAC009S0769W0393", "wav": "./aishell/wav/test/S0769/BAC009S0769W0393.wav", "txt": "他们也是历届大赛的一个亮点"} -{"key": "BAC009S0769W0394", "wav": "./aishell/wav/test/S0769/BAC009S0769W0394.wav", "txt": "总共有二千七百六十人来为这项大赛志愿服务"} -{"key": "BAC009S0769W0395", "wav": "./aishell/wav/test/S0769/BAC009S0769W0395.wav", "txt": "他们最大的特点就是九零后占主角"} -{"key": "BAC009S0769W0396", "wav": "./aishell/wav/test/S0769/BAC009S0769W0396.wav", "txt": "比例超过百分之九十四的志愿者是九零后"} -{"key": "BAC009S0769W0397", "wav": "./aishell/wav/test/S0769/BAC009S0769W0397.wav", "txt": "在今年世锦赛的志愿者中"} -{"key": "BAC009S0769W0398", "wav": "./aishell/wav/test/S0769/BAC009S0769W0398.wav", "txt": "有的人还会八国语言"} -{"key": "BAC009S0769W0399", "wav": "./aishell/wav/test/S0769/BAC009S0769W0399.wav", "txt": "志愿者除了要具备流利的英语交流能力外"} -{"key": "BAC009S0769W0400", "wav": "./aishell/wav/test/S0769/BAC009S0769W0400.wav", "txt": "还要求具备大型赛会或日常从事社会志愿服务的经验"} -{"key": "BAC009S0769W0401", "wav": "./aishell/wav/test/S0769/BAC009S0769W0401.wav", "txt": "北京青年报记者昨日在鸟巢采访了一些志愿者"} -{"key": "BAC009S0769W0402", "wav": "./aishell/wav/test/S0769/BAC009S0769W0402.wav", "txt": "发现他们中间真有不少是志愿达人"} -{"key": "BAC009S0769W0403", "wav": "./aishell/wav/test/S0769/BAC009S0769W0403.wav", "txt": "例如在竞赛部赛后控制中心的陈田希"} -{"key": "BAC009S0769W0404", "wav": "./aishell/wav/test/S0769/BAC009S0769W0404.wav", "txt": "也等待了很长时间"} -{"key": "BAC009S0769W0405", "wav": "./aishell/wav/test/S0769/BAC009S0769W0405.wav", "txt": "但最终却因为出品公司相对论影业申请破产"} -{"key": "BAC009S0769W0406", "wav": "./aishell/wav/test/S0769/BAC009S0769W0406.wav", "txt": "而不得不离开这个项目"} -{"key": "BAC009S0769W0407", "wav": "./aishell/wav/test/S0769/BAC009S0769W0407.wav", "txt": "乌鸦在没有其他公司愿意接手的情况下"} -{"key": "BAC009S0769W0408", "wav": "./aishell/wav/test/S0769/BAC009S0769W0408.wav", "txt": "谈到前日爸爸谢贤在宣传活动上出手打曾江"} -{"key": "BAC009S0769W0409", "wav": "./aishell/wav/test/S0769/BAC009S0769W0409.wav", 
"txt": "婷婷指收到消息时正在拍摄广告"} -{"key": "BAC009S0769W0410", "wav": "./aishell/wav/test/S0769/BAC009S0769W0410.wav", "txt": "亦未联络到爸爸了解他不是一个常打架的人"} -{"key": "BAC009S0769W0411", "wav": "./aishell/wav/test/S0769/BAC009S0769W0411.wav", "txt": "他是一个大人"} -{"key": "BAC009S0769W0412", "wav": "./aishell/wav/test/S0769/BAC009S0769W0412.wav", "txt": "他一定有他的原因"} -{"key": "BAC009S0769W0413", "wav": "./aishell/wav/test/S0769/BAC009S0769W0413.wav", "txt": "又指自己未试过受爸爸体罚"} -{"key": "BAC009S0769W0415", "wav": "./aishell/wav/test/S0769/BAC009S0769W0415.wav", "txt": "婷婷就坦言靠传媒得知"} -{"key": "BAC009S0769W0416", "wav": "./aishell/wav/test/S0769/BAC009S0769W0416.wav", "txt": "但会给哥哥谢霆锋传短信了解情况"} -{"key": "BAC009S0769W0417", "wav": "./aishell/wav/test/S0769/BAC009S0769W0417.wav", "txt": "中新网六月二十四日电六月二十三日"} -{"key": "BAC009S0769W0418", "wav": "./aishell/wav/test/S0769/BAC009S0769W0418.wav", "txt": "谢霆锋妹妹谢婷婷在微博晒出与父亲合影"} -{"key": "BAC009S0769W0419", "wav": "./aishell/wav/test/S0769/BAC009S0769W0419.wav", "txt": "谢婷婷将头挨着父亲的头"} -{"key": "BAC009S0769W0420", "wav": "./aishell/wav/test/S0769/BAC009S0769W0420.wav", "txt": "二人一脸笑容"} -{"key": "BAC009S0769W0421", "wav": "./aishell/wav/test/S0769/BAC009S0769W0421.wav", "txt": "搜狐娱乐讯据香港媒体报道"} -{"key": "BAC009S0769W0422", "wav": "./aishell/wav/test/S0769/BAC009S0769W0422.wav", "txt": "艺人谢婷婷从小就成为媒体焦点"} -{"key": "BAC009S0769W0423", "wav": "./aishell/wav/test/S0769/BAC009S0769W0423.wav", "txt": "而有鬼妹仔性格的婷婷不时以性感打扮亮相"} -{"key": "BAC009S0769W0424", "wav": "./aishell/wav/test/S0769/BAC009S0769W0424.wav", "txt": "她去游泳解暑"} -{"key": "BAC009S0769W0425", "wav": "./aishell/wav/test/S0769/BAC009S0769W0425.wav", "txt": "还在网上分享身穿比基尼泳装照"} -{"key": "BAC009S0769W0426", "wav": "./aishell/wav/test/S0769/BAC009S0769W0426.wav", "txt": "这种天气很适合搞池边派对"} -{"key": "BAC009S0769W0427", "wav": "./aishell/wav/test/S0769/BAC009S0769W0427.wav", "txt": "中新网五月二十一日报道据香港明报消息"} -{"key": "BAC009S0769W0428", "wav": "./aishell/wav/test/S0769/BAC009S0769W0428.wav", "txt": "谢婷婷为服装拍摄时装宣传照"} -{"key": "BAC009S0769W0429", "wav": "./aishell/wav/test/S0769/BAC009S0769W0429.wav", "txt": "她透露现在父母哥哥谢霆锋都各忙各的"} -{"key": "BAC009S0769W0430", "wav": "./aishell/wav/test/S0769/BAC009S0769W0430.wav", "txt": "一家人很难有机会团聚"} -{"key": "BAC009S0769W0431", "wav": "./aishell/wav/test/S0769/BAC009S0769W0431.wav", "txt": "施王祥被陆丰市纪委立案调查"} -{"key": "BAC009S0769W0432", "wav": "./aishell/wav/test/S0769/BAC009S0769W0432.wav", "txt": "二零一三一二二六"} -{"key": "BAC009S0769W0433", "wav": "./aishell/wav/test/S0769/BAC009S0769W0433.wav", "txt": "二零一四三七"} -{"key": "BAC009S0769W0434", "wav": "./aishell/wav/test/S0769/BAC009S0769W0434.wav", "txt": "陆丰市纪委决定给予施王祥开除党籍处分"} -{"key": "BAC009S0769W0435", "wav": "./aishell/wav/test/S0769/BAC009S0769W0435.wav", "txt": "南粤清风网通报该案详情"} -{"key": "BAC009S0769W0436", "wav": "./aishell/wav/test/S0769/BAC009S0769W0436.wav", "txt": "村官遭判刑处罚证据涉嫌造假公检法自查迟迟无果"} -{"key": "BAC009S0769W0437", "wav": "./aishell/wav/test/S0769/BAC009S0769W0437.wav", "txt": "山西省临汾市尧都区刘村镇刘南村一零名村干部"} -{"key": "BAC009S0769W0438", "wav": "./aishell/wav/test/S0769/BAC009S0769W0438.wav", "txt": "因决定取消刁天恩的土地承包合同移栽地上树苗"} -{"key": "BAC009S0769W0439", "wav": "./aishell/wav/test/S0769/BAC009S0769W0439.wav", "txt": "被法院以故意毁坏财物罪判刑或处罚"} -{"key": "BAC009S0769W0440", "wav": "./aishell/wav/test/S0769/BAC009S0769W0440.wav", "txt": "村小老师自掏腰包八零零零元为贫困生设奖学金"} -{"key": "BAC009S0769W0441", "wav": "./aishell/wav/test/S0769/BAC009S0769W0441.wav", "txt": "薛孝文在学生家中家访"} -{"key": "BAC009S0769W0442", "wav": "./aishell/wav/test/S0769/BAC009S0769W0442.wav", "txt": "从金堂县城驱车一个半小时至土桥镇的大禹村"} -{"key": 
"BAC009S0769W0443", "wav": "./aishell/wav/test/S0769/BAC009S0769W0443.wav", "txt": "就到了薛孝文任教的学校金堂县平桥学校"} -{"key": "BAC009S0769W0444", "wav": "./aishell/wav/test/S0769/BAC009S0769W0444.wav", "txt": "乡间公路也就四米宽"} -{"key": "BAC009S0769W0445", "wav": "./aishell/wav/test/S0769/BAC009S0769W0445.wav", "txt": "薛孝文还在给学生上课"} -{"key": "BAC009S0769W0446", "wav": "./aishell/wav/test/S0769/BAC009S0769W0446.wav", "txt": "在年轻时也有着跳龙门的梦"} -{"key": "BAC009S0769W0447", "wav": "./aishell/wav/test/S0769/BAC009S0769W0447.wav", "txt": "他辗转三所乡村学校"} -{"key": "BAC009S0769W0448", "wav": "./aishell/wav/test/S0769/BAC009S0769W0448.wav", "txt": "村干部大闹天宫孙大圣口碑爆棚"} -{"key": "BAC009S0769W0449", "wav": "./aishell/wav/test/S0769/BAC009S0769W0449.wav", "txt": "村干部强揽工程遭拒绝雇百名老人阻挠施工"} -{"key": "BAC009S0769W0450", "wav": "./aishell/wav/test/S0769/BAC009S0769W0450.wav", "txt": "犯罪嫌疑人刘德怀等六人被刑拘"} -{"key": "BAC009S0769W0451", "wav": "./aishell/wav/test/S0769/BAC009S0769W0451.wav", "txt": "村干部靠打架成名被抓后喊我是市人大代表"} -{"key": "BAC009S0769W0452", "wav": "./aishell/wav/test/S0769/BAC009S0769W0452.wav", "txt": "和平花苑现已更名为龙和华府"} -{"key": "BAC009S0769W0453", "wav": "./aishell/wav/test/S0769/BAC009S0769W0453.wav", "txt": "村庄晴天降奇冰十几斤重来历不明"} -{"key": "BAC009S0769W0454", "wav": "./aishell/wav/test/S0769/BAC009S0769W0454.wav", "txt": "天上掉下一块重约十几斤的冰块"} -{"key": "BAC009S0769W0455", "wav": "./aishell/wav/test/S0769/BAC009S0769W0455.wav", "txt": "虽然事情过去三天了"} -{"key": "BAC009S0769W0456", "wav": "./aishell/wav/test/S0769/BAC009S0769W0456.wav", "txt": "但嵩县德亭镇大王沟村村民们仍感到好奇"} -{"key": "BAC009S0769W0457", "wav": "./aishell/wav/test/S0769/BAC009S0769W0457.wav", "txt": "一零月一零日临近中午"} -{"key": "BAC009S0769W0458", "wav": "./aishell/wav/test/S0769/BAC009S0769W0458.wav", "txt": "砸到了村民的菜地里"} -{"key": "BAC009S0769W0459", "wav": "./aishell/wav/test/S0769/BAC009S0769W0459.wav", "txt": "还把地面砸了个大坑"} -{"key": "BAC009S0769W0460", "wav": "./aishell/wav/test/S0769/BAC009S0769W0460.wav", "txt": "附近村民闻讯纷纷赶来瞧个新鲜"} -{"key": "BAC009S0769W0461", "wav": "./aishell/wav/test/S0769/BAC009S0769W0461.wav", "txt": "捡拾一些冰块回家冰冻保存"} -{"key": "BAC009S0769W0462", "wav": "./aishell/wav/test/S0769/BAC009S0769W0462.wav", "txt": "专家排除了冰雹和飞机上落冰的两种可能"} -{"key": "BAC009S0769W0463", "wav": "./aishell/wav/test/S0769/BAC009S0769W0463.wav", "txt": "这块天降奇冰究竟是何物"} -{"key": "BAC009S0769W0464", "wav": "./aishell/wav/test/S0769/BAC009S0769W0464.wav", "txt": "村庄现两名村支书假支书无名有实村内掌权"} -{"key": "BAC009S0769W0465", "wav": "./aishell/wav/test/S0769/BAC009S0769W0465.wav", "txt": "村庄遭人倾倒数百吨化工废料附近植物全空死"} -{"key": "BAC009S0769W0466", "wav": "./aishell/wav/test/S0769/BAC009S0769W0466.wav", "txt": "非法倾倒数百吨化工废料"} -{"key": "BAC009S0769W0467", "wav": "./aishell/wav/test/S0769/BAC009S0769W0467.wav", "txt": "村民生活因此发生巨变井水变味田地减产前日"} -{"key": "BAC009S0769W0468", "wav": "./aishell/wav/test/S0769/BAC009S0769W0468.wav", "txt": "该村村民黎胜明向楚天快报求助"} -{"key": "BAC009S0769W0469", "wav": "./aishell/wav/test/S0769/BAC009S0769W0469.wav", "txt": "希望相关部门能处理此事"} -{"key": "BAC009S0769W0470", "wav": "./aishell/wav/test/S0769/BAC009S0769W0470.wav", "txt": "村支书一周只上二小时班村民称反映会遭报复"} -{"key": "BAC009S0769W0471", "wav": "./aishell/wav/test/S0769/BAC009S0769W0471.wav", "txt": "村支书上班时间带彩娱乐神秘人曝光视频证据"} -{"key": "BAC009S0769W0472", "wav": "./aishell/wav/test/S0769/BAC009S0769W0472.wav", "txt": "视频中正在带彩娱乐的灰衣男"} -{"key": "BAC009S0769W0473", "wav": "./aishell/wav/test/S0769/BAC009S0769W0473.wav", "txt": "被警方确认为新农村党支部书记毛家文"} -{"key": "BAC009S0769W0474", "wav": "./aishell/wav/test/S0769/BAC009S0769W0474.wav", "txt": "村支书为考公务员改小一零岁一四岁时三月内生两子"} -{"key": 
"BAC009S0769W0475", "wav": "./aishell/wav/test/S0769/BAC009S0769W0475.wav", "txt": "淅川县上集镇一名村支书被指将年龄改小一零岁"} -{"key": "BAC009S0769W0476", "wav": "./aishell/wav/test/S0769/BAC009S0769W0476.wav", "txt": "图为时上集镇派出所"} -{"key": "BAC009S0769W0477", "wav": "./aishell/wav/test/S0769/BAC009S0769W0477.wav", "txt": "三个月内连生两个儿子"} -{"key": "BAC009S0769W0478", "wav": "./aishell/wav/test/S0769/BAC009S0769W0478.wav", "txt": "村支书将两女儿家七口人列为搬迁户骗领搬迁款"} -{"key": "BAC009S0769W0479", "wav": "./aishell/wav/test/S0769/BAC009S0769W0479.wav", "txt": "村支书违法占地建加油站多部门介入处罚仍未拆"} -{"key": "BAC009S0769W0480", "wav": "./aishell/wav/test/S0769/BAC009S0769W0480.wav", "txt": "浙江在线零九月二一日讯浙江日报记者季建荣近日"} -{"key": "BAC009S0769W0481", "wav": "./aishell/wav/test/S0769/BAC009S0769W0481.wav", "txt": "村民多次向温岭市有关部门投诉反映"} -{"key": "BAC009S0769W0482", "wav": "./aishell/wav/test/S0769/BAC009S0769W0482.wav", "txt": "但问题至今没有解决"} -{"key": "BAC009S0769W0483", "wav": "./aishell/wav/test/S0769/BAC009S0769W0483.wav", "txt": "村支书违规建小产权房花钱买通所有关系"} -{"key": "BAC009S0769W0484", "wav": "./aishell/wav/test/S0769/BAC009S0769W0484.wav", "txt": "都说下属有困难找领导"} -{"key": "BAC009S0769W0485", "wav": "./aishell/wav/test/S0769/BAC009S0769W0485.wav", "txt": "灵璧县韦集镇韦集村原村支书石某"} -{"key": "BAC009S0769W0486", "wav": "./aishell/wav/test/S0769/BAC009S0769W0486.wav", "txt": "就花钱请领导为他撑腰"} -{"key": "BAC009S0769W0487", "wav": "./aishell/wav/test/S0769/BAC009S0769W0487.wav", "txt": "村支书遭集体举报买鼠药欲投毒报复村民"} -{"key": "BAC009S0769W0488", "wav": "./aishell/wav/test/S0769/BAC009S0769W0488.wav", "txt": "本报一零月五日讯国庆长假"} -{"key": "BAC009S0769W0489", "wav": "./aishell/wav/test/S0769/BAC009S0769W0489.wav", "txt": "省纪委要求新闻媒体主动参与到纠四风监督工作中"} -{"key": "BAC009S0769W0490", "wav": "./aishell/wav/test/S0769/BAC009S0769W0490.wav", "txt": "强化媒体根据群众举报开展调查采访和舆论监督"} -{"key": "BAC009S0769W0491", "wav": "./aishell/wav/test/S0769/BAC009S0769W0491.wav", "txt": "发生了一起村民举报村支书贪腐"} -{"key": "BAC009S0769W0492", "wav": "./aishell/wav/test/S0769/BAC009S0769W0492.wav", "txt": "村支书以在全村井水中投毒以报复村民的离奇事件"} -{"key": "BAC009S0769W0493", "wav": "./aishell/wav/test/S0769/BAC009S0769W0493.wav", "txt": "村支书醉驾撞伤孕妇刑满释放后仍当人大代表"} -{"key": "BAC009S0769W0494", "wav": "./aishell/wav/test/S0769/BAC009S0769W0494.wav", "txt": "华江瑶族乡十四届人大代表会第五次会议会务材料上"} -{"key": "BAC009S0769W0495", "wav": "./aishell/wav/test/S0769/BAC009S0769W0495.wav", "txt": "于二零一四年四月一九日晚"} -{"key": "BAC009S0770W0121", "wav": "./aishell/wav/test/S0770/BAC009S0770W0121.wav", "txt": "住宅土地出让金及成交面积均大幅下降"} -{"key": "BAC009S0770W0122", "wav": "./aishell/wav/test/S0770/BAC009S0770W0122.wav", "txt": "开发商进驻一二线城市"} -{"key": "BAC009S0770W0123", "wav": "./aishell/wav/test/S0770/BAC009S0770W0123.wav", "txt": "抛售三线城市"} -{"key": "BAC009S0770W0124", "wav": "./aishell/wav/test/S0770/BAC009S0770W0124.wav", "txt": "遇到毛利率低的问题"} -{"key": "BAC009S0770W0125", "wav": "./aishell/wav/test/S0770/BAC009S0770W0125.wav", "txt": "发现土地成本占比持续提升"} -{"key": "BAC009S0770W0126", "wav": "./aishell/wav/test/S0770/BAC009S0770W0126.wav", "txt": "目前全国该指标"} -{"key": "BAC009S0770W0127", "wav": "./aishell/wav/test/S0770/BAC009S0770W0127.wav", "txt": "一线城市超过三成"} -{"key": "BAC009S0770W0128", "wav": "./aishell/wav/test/S0770/BAC009S0770W0128.wav", "txt": "三线及以下为一成"} -{"key": "BAC009S0770W0129", "wav": "./aishell/wav/test/S0770/BAC009S0770W0129.wav", "txt": "一二线城市用地紧张"} -{"key": "BAC009S0770W0130", "wav": "./aishell/wav/test/S0770/BAC009S0770W0130.wav", "txt": "房地产商需要解决毛利率低的问题"} -{"key": "BAC009S0770W0131", "wav": "./aishell/wav/test/S0770/BAC009S0770W0131.wav", "txt": "中指院广州公司总经理张化学向南都记者表示"} -{"key": 
"BAC009S0770W0132", "wav": "./aishell/wav/test/S0770/BAC009S0770W0132.wav", "txt": "三线城市库存积压又逼倒房地产商在一线城市抢地"} -{"key": "BAC009S0770W0133", "wav": "./aishell/wav/test/S0770/BAC009S0770W0133.wav", "txt": "建议房企不要一味强调做大"} -{"key": "BAC009S0770W0134", "wav": "./aishell/wav/test/S0770/BAC009S0770W0134.wav", "txt": "可以重点关注如何做强"} -{"key": "BAC009S0770W0135", "wav": "./aishell/wav/test/S0770/BAC009S0770W0135.wav", "txt": "在自身优势领域发力"} -{"key": "BAC009S0770W0136", "wav": "./aishell/wav/test/S0770/BAC009S0770W0136.wav", "txt": "发现无论是千亿巨头地产商"} -{"key": "BAC009S0770W0137", "wav": "./aishell/wav/test/S0770/BAC009S0770W0137.wav", "txt": "多数在积极剑指一线城市"} -{"key": "BAC009S0770W0138", "wav": "./aishell/wav/test/S0770/BAC009S0770W0138.wav", "txt": "从今年房企的买地情况来看"} -{"key": "BAC009S0770W0139", "wav": "./aishell/wav/test/S0770/BAC009S0770W0139.wav", "txt": "今年万科拿下九宗地块"} -{"key": "BAC009S0770W0140", "wav": "./aishell/wav/test/S0770/BAC009S0770W0140.wav", "txt": "包括五个一二线城市"} -{"key": "BAC009S0770W0141", "wav": "./aishell/wav/test/S0770/BAC009S0770W0141.wav", "txt": "保利地产开始进军成都珠海"} -{"key": "BAC009S0770W0142", "wav": "./aishell/wav/test/S0770/BAC009S0770W0142.wav", "txt": "中海地产作为国企龙头"} -{"key": "BAC009S0770W0143", "wav": "./aishell/wav/test/S0770/BAC009S0770W0143.wav", "txt": "也在厦门拿下几宗商住用地和济南几宗居住用地"} -{"key": "BAC009S0770W0144", "wav": "./aishell/wav/test/S0770/BAC009S0770W0144.wav", "txt": "在房企扎堆一二线城市时"} -{"key": "BAC009S0770W0145", "wav": "./aishell/wav/test/S0770/BAC009S0770W0145.wav", "txt": "更致命的是中小房企在融资方面的短板"} -{"key": "BAC009S0770W0146", "wav": "./aishell/wav/test/S0770/BAC009S0770W0146.wav", "txt": "相比千亿房企的借贷利率"} -{"key": "BAC009S0770W0147", "wav": "./aishell/wav/test/S0770/BAC009S0770W0147.wav", "txt": "中小房企要面临高达两位数利率"} -{"key": "BAC009S0770W0148", "wav": "./aishell/wav/test/S0770/BAC009S0770W0148.wav", "txt": "中国市场空间多样化"} -{"key": "BAC009S0770W0149", "wav": "./aishell/wav/test/S0770/BAC009S0770W0149.wav", "txt": "房企除了像千亿地产一样做大"} -{"key": "BAC009S0770W0150", "wav": "./aishell/wav/test/S0770/BAC009S0770W0150.wav", "txt": "在某一方面找到自己的企业竞争力"} -{"key": "BAC009S0770W0151", "wav": "./aishell/wav/test/S0770/BAC009S0770W0151.wav", "txt": "行业的玩家门槛越来越高"} -{"key": "BAC009S0770W0152", "wav": "./aishell/wav/test/S0770/BAC009S0770W0152.wav", "txt": "主动退出和寻求并购的中小开发商增多"} -{"key": "BAC009S0770W0153", "wav": "./aishell/wav/test/S0770/BAC009S0770W0153.wav", "txt": "大开发商有机会借此提高行业集中程度"} -{"key": "BAC009S0770W0154", "wav": "./aishell/wav/test/S0770/BAC009S0770W0154.wav", "txt": "张大伟向南都记者分析"} -{"key": "BAC009S0770W0155", "wav": "./aishell/wav/test/S0770/BAC009S0770W0155.wav", "txt": "在三线城市库存高攀销售停滞的情况下"} -{"key": "BAC009S0770W0156", "wav": "./aishell/wav/test/S0770/BAC009S0770W0156.wav", "txt": "没有雄厚的资金良好的业绩以及成熟的融资平台"} -{"key": "BAC009S0770W0157", "wav": "./aishell/wav/test/S0770/BAC009S0770W0157.wav", "txt": "似乎难以在一二线城市站稳"} -{"key": "BAC009S0770W0158", "wav": "./aishell/wav/test/S0770/BAC009S0770W0158.wav", "txt": "房地产业将在明年有所洗牌"} -{"key": "BAC009S0770W0159", "wav": "./aishell/wav/test/S0770/BAC009S0770W0159.wav", "txt": "点击进入股友会参与讨论"} -{"key": "BAC009S0770W0160", "wav": "./aishell/wav/test/S0770/BAC009S0770W0160.wav", "txt": "今年国有土地出让权收入四千亿元"} -{"key": "BAC009S0770W0161", "wav": "./aishell/wav/test/S0770/BAC009S0770W0161.wav", "txt": "今年房地产市场地域分化将加剧"} -{"key": "BAC009S0770W0162", "wav": "./aishell/wav/test/S0770/BAC009S0770W0162.wav", "txt": "政策放松和高库存背景下"} -{"key": "BAC009S0770W0163", "wav": "./aishell/wav/test/S0770/BAC009S0770W0163.wav", "txt": "开发商均面临不均衡的复苏前景"} -{"key": "BAC009S0770W0164", "wav": 
"./aishell/wav/test/S0770/BAC009S0770W0164.wav", "txt": "今年中国房地产开发商仍将面临供应过剩"} -{"key": "BAC009S0770W0165", "wav": "./aishell/wav/test/S0770/BAC009S0770W0165.wav", "txt": "房价不太可能强劲反弹"} -{"key": "BAC009S0770W0166", "wav": "./aishell/wav/test/S0770/BAC009S0770W0166.wav", "txt": "房地产在不同城市之间的复苏也将存在分化"} -{"key": "BAC009S0770W0167", "wav": "./aishell/wav/test/S0770/BAC009S0770W0167.wav", "txt": "一线城市或将复苏率先复苏"} -{"key": "BAC009S0770W0168", "wav": "./aishell/wav/test/S0770/BAC009S0770W0168.wav", "txt": "三四线城市可能在继续因高库存而承压"} -{"key": "BAC009S0770W0169", "wav": "./aishell/wav/test/S0770/BAC009S0770W0169.wav", "txt": "中国房地产的市场价格和销量将继续调整"} -{"key": "BAC009S0770W0170", "wav": "./aishell/wav/test/S0770/BAC009S0770W0170.wav", "txt": "但下半年的销售可能会回升"} -{"key": "BAC009S0770W0171", "wav": "./aishell/wav/test/S0770/BAC009S0770W0171.wav", "txt": "开发商只需选择继续降价"} -{"key": "BAC009S0770W0172", "wav": "./aishell/wav/test/S0770/BAC009S0770W0172.wav", "txt": "尤其是在三四线城市"} -{"key": "BAC009S0770W0173", "wav": "./aishell/wav/test/S0770/BAC009S0770W0173.wav", "txt": "中国经济增速放缓的背景下"} -{"key": "BAC009S0770W0174", "wav": "./aishell/wav/test/S0770/BAC009S0770W0174.wav", "txt": "预期政府将继续放松政策"} -{"key": "BAC009S0770W0175", "wav": "./aishell/wav/test/S0770/BAC009S0770W0175.wav", "txt": "而政府放松限购按揭和内地融资政策"} -{"key": "BAC009S0770W0176", "wav": "./aishell/wav/test/S0770/BAC009S0770W0176.wav", "txt": "房地产需求可能会上升"} -{"key": "BAC009S0770W0177", "wav": "./aishell/wav/test/S0770/BAC009S0770W0177.wav", "txt": "这将有助于开发商明年维持销量"} -{"key": "BAC009S0770W0178", "wav": "./aishell/wav/test/S0770/BAC009S0770W0178.wav", "txt": "政府放松政策对房地产销售的正面影响可能会提升"} -{"key": "BAC009S0770W0179", "wav": "./aishell/wav/test/S0770/BAC009S0770W0179.wav", "txt": "标普信用分析师孔磊说道"} -{"key": "BAC009S0770W0180", "wav": "./aishell/wav/test/S0770/BAC009S0770W0180.wav", "txt": "关于明年的房价走势"} -{"key": "BAC009S0770W0181", "wav": "./aishell/wav/test/S0770/BAC009S0770W0181.wav", "txt": "标普在基准情景假设下的预期是"} -{"key": "BAC009S0770W0182", "wav": "./aishell/wav/test/S0770/BAC009S0770W0182.wav", "txt": "明年平均售价将维持不变"} -{"key": "BAC009S0770W0183", "wav": "./aishell/wav/test/S0770/BAC009S0770W0183.wav", "txt": "销售额则将维持不变"} -{"key": "BAC009S0770W0184", "wav": "./aishell/wav/test/S0770/BAC009S0770W0184.wav", "txt": "房地产价格调整还未完全结束"} -{"key": "BAC009S0770W0185", "wav": "./aishell/wav/test/S0770/BAC009S0770W0185.wav", "txt": "未来一年内中国房地产价格不太可能强劲反弹"} -{"key": "BAC009S0770W0186", "wav": "./aishell/wav/test/S0770/BAC009S0770W0186.wav", "txt": "虽然过去一年一些获评级开发商的信用状况变差"} -{"key": "BAC009S0770W0187", "wav": "./aishell/wav/test/S0770/BAC009S0770W0187.wav", "txt": "徐林发债企业在债劵存续期内进行资产转移"} -{"key": "BAC009S0770W0188", "wav": "./aishell/wav/test/S0770/BAC009S0770W0188.wav", "txt": "极可能对债劵持有人利益构成不利影响"} -{"key": "BAC009S0770W0189", "wav": "./aishell/wav/test/S0770/BAC009S0770W0189.wav", "txt": "直接涉及到债劵持有人的利益保护问题"} -{"key": "BAC009S0770W0190", "wav": "./aishell/wav/test/S0770/BAC009S0770W0190.wav", "txt": "我们立即与云投集团进行了沟通"} -{"key": "BAC009S0770W0191", "wav": "./aishell/wav/test/S0770/BAC009S0770W0191.wav", "txt": "并严格按照合规程序进行"} -{"key": "BAC009S0770W0192", "wav": "./aishell/wav/test/S0770/BAC009S0770W0192.wav", "txt": "我委也注意到在企业债劵存续期内"} -{"key": "BAC009S0770W0193", "wav": "./aishell/wav/test/S0770/BAC009S0770W0193.wav", "txt": "需要对发行人资产重组等重大事宜加强监管"} -{"key": "BAC009S0770W0194", "wav": "./aishell/wav/test/S0770/BAC009S0770W0194.wav", "txt": "在制度上对债券人的合法权益进行保护"} -{"key": "BAC009S0770W0195", "wav": "./aishell/wav/test/S0770/BAC009S0770W0195.wav", "txt": "建立地方政府债务管理体系"} -{"key": "BAC009S0770W0196", "wav": 
"./aishell/wav/test/S0770/BAC009S0770W0196.wav", "txt": "从您刚才的介绍中我们了解到"} -{"key": "BAC009S0770W0197", "wav": "./aishell/wav/test/S0770/BAC009S0770W0197.wav", "txt": "城投债劵对公司城市基础设施和市政的建设"} -{"key": "BAC009S0770W0198", "wav": "./aishell/wav/test/S0770/BAC009S0770W0198.wav", "txt": "起到了非常积极的作用"} -{"key": "BAC009S0770W0199", "wav": "./aishell/wav/test/S0770/BAC009S0770W0199.wav", "txt": "对丰富债劵市场品种也具有积极意义"} -{"key": "BAC009S0770W0200", "wav": "./aishell/wav/test/S0770/BAC009S0770W0200.wav", "txt": "结合地方政府债务管理制度的完善"} -{"key": "BAC009S0770W0201", "wav": "./aishell/wav/test/S0770/BAC009S0770W0201.wav", "txt": "下一步我国的城投债劵还需要做哪些完善"} -{"key": "BAC009S0770W0202", "wav": "./aishell/wav/test/S0770/BAC009S0770W0202.wav", "txt": "这个问题涉及到一系列的制度完善"} -{"key": "BAC009S0770W0203", "wav": "./aishell/wav/test/S0770/BAC009S0770W0203.wav", "txt": "是一个比较复杂的问题"} -{"key": "BAC009S0770W0204", "wav": "./aishell/wav/test/S0770/BAC009S0770W0204.wav", "txt": "我个人是这么认识的"} -{"key": "BAC009S0770W0205", "wav": "./aishell/wav/test/S0770/BAC009S0770W0205.wav", "txt": "我国还处于城市化快速发展期"} -{"key": "BAC009S0770W0206", "wav": "./aishell/wav/test/S0770/BAC009S0770W0206.wav", "txt": "需要为各地的城市建设提供规范的融资渠道"} -{"key": "BAC009S0770W0207", "wav": "./aishell/wav/test/S0770/BAC009S0770W0207.wav", "txt": "农业与非农产业之间劳动生产率的差距也很大"} -{"key": "BAC009S0770W0208", "wav": "./aishell/wav/test/S0770/BAC009S0770W0208.wav", "txt": "这决定了我国城市化动力十分强劲"} -{"key": "BAC009S0770W0209", "wav": "./aishell/wav/test/S0770/BAC009S0770W0209.wav", "txt": "城市化进程远未结束"} -{"key": "BAC009S0770W0210", "wav": "./aishell/wav/test/S0770/BAC009S0770W0210.wav", "txt": "城市化快速发展期的重要特征就是基础设施投资需求量大"} -{"key": "BAC009S0770W0211", "wav": "./aishell/wav/test/S0770/BAC009S0770W0211.wav", "txt": "这是我国所处的发展阶段决定的"} -{"key": "BAC009S0770W0212", "wav": "./aishell/wav/test/S0770/BAC009S0770W0212.wav", "txt": "政府通过债务融资从事基础建设"} -{"key": "BAC009S0770W0213", "wav": "./aishell/wav/test/S0770/BAC009S0770W0213.wav", "txt": "我们应该建设可控的规范化的地方政府融资机制"} -{"key": "BAC009S0770W0214", "wav": "./aishell/wav/test/S0770/BAC009S0770W0214.wav", "txt": "为各地的基础建设设提供有制度保障的融资渠道"} -{"key": "BAC009S0770W0215", "wav": "./aishell/wav/test/S0770/BAC009S0770W0215.wav", "txt": "城投债劵作为准市政债劵仍将是有效的融资工具"} -{"key": "BAC009S0770W0216", "wav": "./aishell/wav/test/S0770/BAC009S0770W0216.wav", "txt": "但是还需要进一步改进"} -{"key": "BAC009S0770W0217", "wav": "./aishell/wav/test/S0770/BAC009S0770W0217.wav", "txt": "在政府投融资体制改革过程中"} -{"key": "BAC009S0770W0218", "wav": "./aishell/wav/test/S0770/BAC009S0770W0218.wav", "txt": "从事当地基础建设"} -{"key": "BAC009S0770W0219", "wav": "./aishell/wav/test/S0770/BAC009S0770W0219.wav", "txt": "相当于过去体制而言是更加市场化的"} -{"key": "BAC009S0770W0220", "wav": "./aishell/wav/test/S0770/BAC009S0770W0220.wav", "txt": "城投债劵作为融资平台公司最透明的直接融资工具"} -{"key": "BAC009S0770W0221", "wav": "./aishell/wav/test/S0770/BAC009S0770W0221.wav", "txt": "仍然存在并具有发展空间"} -{"key": "BAC009S0770W0222", "wav": "./aishell/wav/test/S0770/BAC009S0770W0222.wav", "txt": "由于目前城投债劵的发行需要符合企业的债劵发行的条件"} -{"key": "BAC009S0770W0223", "wav": "./aishell/wav/test/S0770/BAC009S0770W0223.wav", "txt": "这使得我国城投债劵的发行利率相对偏高"} -{"key": "BAC009S0770W0224", "wav": "./aishell/wav/test/S0770/BAC009S0770W0224.wav", "txt": "城投债劵的发行期限和利率"} -{"key": "BAC009S0770W0225", "wav": "./aishell/wav/test/S0770/BAC009S0770W0225.wav", "txt": "未来应该在制度上进一步完善"} -{"key": "BAC009S0770W0226", "wav": "./aishell/wav/test/S0770/BAC009S0770W0226.wav", "txt": "使得城投公司能够发行真正意义上的长期市政债劵"} -{"key": "BAC009S0770W0227", "wav": "./aishell/wav/test/S0770/BAC009S0770W0227.wav", "txt": "要尽快建立我国的地方政府债务管理体系"} -{"key": 
"BAC009S0770W0228", "wav": "./aishell/wav/test/S0770/BAC009S0770W0228.wav", "txt": "对于如何建立规范的地方政府融资渠道"} -{"key": "BAC009S0770W0229", "wav": "./aishell/wav/test/S0770/BAC009S0770W0229.wav", "txt": "加强地方政府债务管理和风险防控"} -{"key": "BAC009S0770W0230", "wav": "./aishell/wav/test/S0770/BAC009S0770W0230.wav", "txt": "一些专家学者提出了许多好的建议"} -{"key": "BAC009S0770W0231", "wav": "./aishell/wav/test/S0770/BAC009S0770W0231.wav", "txt": "如建立规范透明的地方政府融资渠道"} -{"key": "BAC009S0770W0232", "wav": "./aishell/wav/test/S0770/BAC009S0770W0232.wav", "txt": "并对地方政府债务进行监控和风险防范等"} -{"key": "BAC009S0770W0233", "wav": "./aishell/wav/test/S0770/BAC009S0770W0233.wav", "txt": "由于我国还没有建立统一的地方政府债务风险管理制度"} -{"key": "BAC009S0770W0234", "wav": "./aishell/wav/test/S0770/BAC009S0770W0234.wav", "txt": "设定政府性债务风险控制指标和标准"} -{"key": "BAC009S0770W0235", "wav": "./aishell/wav/test/S0770/BAC009S0770W0235.wav", "txt": "并对政府性债务进行馀额管理"} -{"key": "BAC009S0770W0236", "wav": "./aishell/wav/test/S0770/BAC009S0770W0236.wav", "txt": "使用地方政府的债务融资规模控制在安全范围内"} -{"key": "BAC009S0770W0237", "wav": "./aishell/wav/test/S0770/BAC009S0770W0237.wav", "txt": "远低于发生债务危机的欧美国家"} -{"key": "BAC009S0770W0238", "wav": "./aishell/wav/test/S0770/BAC009S0770W0238.wav", "txt": "债劵发行人是优质的"} -{"key": "BAC009S0770W0239", "wav": "./aishell/wav/test/S0770/BAC009S0770W0239.wav", "txt": "还本付息也是正常的"} -{"key": "BAC009S0770W0240", "wav": "./aishell/wav/test/S0770/BAC009S0770W0240.wav", "txt": "应该建立风险可控的规范化地方政府融资机制"} -{"key": "BAC009S0770W0241", "wav": "./aishell/wav/test/S0770/BAC009S0770W0241.wav", "txt": "为各地的基础建设提供有力的保障的融资渠道"} -{"key": "BAC009S0770W0242", "wav": "./aishell/wav/test/S0770/BAC009S0770W0242.wav", "txt": "责任编辑廖一宁"} -{"key": "BAC009S0770W0243", "wav": "./aishell/wav/test/S0770/BAC009S0770W0243.wav", "txt": "该政策将于二零一二年施行"} -{"key": "BAC009S0770W0244", "wav": "./aishell/wav/test/S0770/BAC009S0770W0244.wav", "txt": "要继续深化天然气价格改革"} -{"key": "BAC009S0770W0245", "wav": "./aishell/wav/test/S0770/BAC009S0770W0245.wav", "txt": "加快理顺天然气价格与可代替能源的比价关系"} -{"key": "BAC009S0770W0246", "wav": "./aishell/wav/test/S0770/BAC009S0770W0246.wav", "txt": "引导天然气合理消费"} -{"key": "BAC009S0770W0247", "wav": "./aishell/wav/test/S0770/BAC009S0770W0247.wav", "txt": "提高天然气利用率支持天然气贸易机制创新"} -{"key": "BAC009S0770W0248", "wav": "./aishell/wav/test/S0770/BAC009S0770W0248.wav", "txt": "天然气用户为优先允许限制类和禁止类"} -{"key": "BAC009S0770W0249", "wav": "./aishell/wav/test/S0770/BAC009S0770W0249.wav", "txt": "限制类主要是指天然化工"} -{"key": "BAC009S0770W0250", "wav": "./aishell/wav/test/S0770/BAC009S0770W0250.wav", "txt": "各地要按照天然气利用优先顺序加强需求侧管理"} -{"key": "BAC009S0770W0251", "wav": "./aishell/wav/test/S0770/BAC009S0770W0251.wav", "txt": "鼓励优先类支持允许类天然气利用项目发展"} -{"key": "BAC009S0770W0252", "wav": "./aishell/wav/test/S0770/BAC009S0770W0252.wav", "txt": "对限制类项目的核准和审核要从严把握"} -{"key": "BAC009S0770W0253", "wav": "./aishell/wav/test/S0770/BAC009S0770W0253.wav", "txt": "商议向印度转移更多军事技术的事宜"} -{"key": "BAC009S0770W0254", "wav": "./aishell/wav/test/S0770/BAC009S0770W0254.wav", "txt": "据新华社电印度官员透露"} -{"key": "BAC009S0770W0255", "wav": "./aishell/wav/test/S0770/BAC009S0770W0255.wav", "txt": "美国将向印度转让两项军事技术"} -{"key": "BAC009S0770W0256", "wav": "./aishell/wav/test/S0770/BAC009S0770W0256.wav", "txt": "其中包括美国大鸦无人机今后将由印度工厂制造"} -{"key": "BAC009S0770W0257", "wav": "./aishell/wav/test/S0770/BAC009S0770W0257.wav", "txt": "印度斯坦时报对二十四日援引消息人士的话报道"} -{"key": "BAC009S0770W0258", "wav": "./aishell/wav/test/S0770/BAC009S0770W0258.wav", "txt": "二零一五年最适宜供职的公司仍在科技领域"} -{"key": "BAC009S0770W0259", "wav": "./aishell/wav/test/S0770/BAC009S0770W0259.wav", "txt": 
"该网站根据雇员的反馈"} -{"key": "BAC009S0770W0260", "wav": "./aishell/wav/test/S0770/BAC009S0770W0260.wav", "txt": "给出了前五十名的公司排名"} -{"key": "BAC009S0770W0261", "wav": "./aishell/wav/test/S0770/BAC009S0770W0261.wav", "txt": "排名前十的科技公司"} -{"key": "BAC009S0770W0263", "wav": "./aishell/wav/test/S0770/BAC009S0770W0263.wav", "txt": "不仅在科技公司领域排名第一"} -{"key": "BAC009S0770W0264", "wav": "./aishell/wav/test/S0770/BAC009S0770W0264.wav", "txt": "而且在整个榜单也位居首位"} -{"key": "BAC009S0770W0265", "wav": "./aishell/wav/test/S0770/BAC009S0770W0265.wav", "txt": "谷歌不仅会以优厚薪酬招募顶尖人才"} -{"key": "BAC009S0770W0267", "wav": "./aishell/wav/test/S0770/BAC009S0770W0267.wav", "txt": "该应用交付网络在整个榜单中位居第四"} -{"key": "BAC009S0770W0268", "wav": "./aishell/wav/test/S0770/BAC009S0770W0268.wav", "txt": "在科技领域排名第二"} -{"key": "BAC009S0770W0270", "wav": "./aishell/wav/test/S0770/BAC009S0770W0270.wav", "txt": "这家社交网络巨头对待员工也是相当慷慨"} -{"key": "BAC009S0770W0271", "wav": "./aishell/wav/test/S0770/BAC009S0770W0271.wav", "txt": "谷歌的福利待遇他家基本都有"} -{"key": "BAC009S0770W0272", "wav": "./aishell/wav/test/S0770/BAC009S0770W0272.wav", "txt": "之前刚刚提出为女性员工提供冷冻卵子费用"} -{"key": "BAC009S0770W0274", "wav": "./aishell/wav/test/S0770/BAC009S0770W0274.wav", "txt": "去年高通被评为最佳实习科技公司"} -{"key": "BAC009S0770W0276", "wav": "./aishell/wav/test/S0770/BAC009S0770W0276.wav", "txt": "对于苹果公司来说这是很关键的一年"} -{"key": "BAC009S0770W0279", "wav": "./aishell/wav/test/S0770/BAC009S0770W0279.wav", "txt": "都是该公司的强心剂"} -{"key": "BAC009S0770W0280", "wav": "./aishell/wav/test/S0770/BAC009S0770W0280.wav", "txt": "雇员们也在很大程度上受到了鼓舞"} -{"key": "BAC009S0770W0282", "wav": "./aishell/wav/test/S0770/BAC009S0770W0282.wav", "txt": "作为全球最大的职业社交网站"} -{"key": "BAC009S0770W0283", "wav": "./aishell/wav/test/S0770/BAC009S0770W0283.wav", "txt": "领英在榜单上的成绩也是相当不错的"} -{"key": "BAC009S0770W0284", "wav": "./aishell/wav/test/S0770/BAC009S0770W0284.wav", "txt": "提供免费房地产估价服务的网站"} -{"key": "BAC009S0770W0285", "wav": "./aishell/wav/test/S0770/BAC009S0770W0285.wav", "txt": "在美国一上线就造成大轰动"} -{"key": "BAC009S0770W0290", "wav": "./aishell/wav/test/S0770/BAC009S0770W0290.wav", "txt": "且把服务范围特别局限在医疗健康领域"} -{"key": "BAC009S0770W0291", "wav": "./aishell/wav/test/S0770/BAC009S0770W0291.wav", "txt": "搜狐消息外媒消息"} -{"key": "BAC009S0770W0292", "wav": "./aishell/wav/test/S0770/BAC009S0770W0292.wav", "txt": "二零一五年最适宜供职的公司仍在科技领域"} -{"key": "BAC009S0770W0293", "wav": "./aishell/wav/test/S0770/BAC009S0770W0293.wav", "txt": "该网站根据雇员的反馈"} -{"key": "BAC009S0770W0297", "wav": "./aishell/wav/test/S0770/BAC009S0770W0297.wav", "txt": "排名从第十三位上升至第十一位"} -{"key": "BAC009S0770W0298", "wav": "./aishell/wav/test/S0770/BAC009S0770W0298.wav", "txt": "高通二零一四年所获专利也增长了百分之二十三"} -{"key": "BAC009S0770W0299", "wav": "./aishell/wav/test/S0770/BAC009S0770W0299.wav", "txt": "排名从第九升至第七"} -{"key": "BAC009S0770W0300", "wav": "./aishell/wav/test/S0770/BAC009S0770W0300.wav", "txt": "以上大多数专利都与计算软件及相关技术有关"} -{"key": "BAC009S0770W0304", "wav": "./aishell/wav/test/S0770/BAC009S0770W0304.wav", "txt": "加速推进中国服务器市场份额的第一目标"} -{"key": "BAC009S0770W0305", "wav": "./aishell/wav/test/S0770/BAC009S0770W0305.wav", "txt": "这是浪潮借助政策东风来做的营销手段"} -{"key": "BAC009S0770W0306", "wav": "./aishell/wav/test/S0770/BAC009S0770W0306.wav", "txt": "对于企业提高股价促成业务"} -{"key": "BAC009S0770W0307", "wav": "./aishell/wav/test/S0770/BAC009S0770W0307.wav", "txt": "某个银行的系统采购"} -{"key": "BAC009S0770W0308", "wav": "./aishell/wav/test/S0770/BAC009S0770W0308.wav", "txt": "在确保系统顺利运行的情况下"} -{"key": "BAC009S0770W0309", "wav": "./aishell/wav/test/S0770/BAC009S0770W0309.wav", "txt": "大家可能因为国家政策扶持国产品牌的大势"} -{"key": 
"BAC009S0770W0310", "wav": "./aishell/wav/test/S0770/BAC009S0770W0310.wav", "txt": "而选择国产的服务器"} -{"key": "BAC009S0770W0311", "wav": "./aishell/wav/test/S0770/BAC009S0770W0311.wav", "txt": "就更加愿意长期持有他们的股票"} -{"key": "BAC009S0770W0312", "wav": "./aishell/wav/test/S0770/BAC009S0770W0312.wav", "txt": "核心技术待突破自棱镜门事件之后"} -{"key": "BAC009S0770W0313", "wav": "./aishell/wav/test/S0770/BAC009S0770W0313.wav", "txt": "国家信息安全的问题被推到了风口浪尖"} -{"key": "BAC009S0770W0314", "wav": "./aishell/wav/test/S0770/BAC009S0770W0314.wav", "txt": "而体现在服务器产业上"} -{"key": "BAC009S0770W0315", "wav": "./aishell/wav/test/S0770/BAC009S0770W0315.wav", "txt": "由于中国政府的大力扶持"} -{"key": "BAC009S0770W0316", "wav": "./aishell/wav/test/S0770/BAC009S0770W0316.wav", "txt": "国产服务器厂商迎来利好"} -{"key": "BAC009S0770W0317", "wav": "./aishell/wav/test/S0770/BAC009S0770W0317.wav", "txt": "在国内四大厂商浪潮华为联想曙光中"} -{"key": "BAC009S0770W0318", "wav": "./aishell/wav/test/S0770/BAC009S0770W0318.wav", "txt": "浪潮的特点在于定制化策略"} -{"key": "BAC009S0770W0319", "wav": "./aishell/wav/test/S0770/BAC009S0770W0319.wav", "txt": "与互联网企业深度合作"} -{"key": "BAC009S0770W0320", "wav": "./aishell/wav/test/S0770/BAC009S0770W0320.wav", "txt": "而这种策略带来的结果是市场份额的快速提升"} -{"key": "BAC009S0770W0322", "wav": "./aishell/wav/test/S0770/BAC009S0770W0322.wav", "txt": "至于像整机柜这类深度定制化的细分市场"} -{"key": "BAC009S0770W0323", "wav": "./aishell/wav/test/S0770/BAC009S0770W0323.wav", "txt": "百分之百为国产品牌"} -{"key": "BAC009S0770W0324", "wav": "./aishell/wav/test/S0770/BAC009S0770W0324.wav", "txt": "其中浪潮达到了百分之六十的市场占有率"} -{"key": "BAC009S0770W0325", "wav": "./aishell/wav/test/S0770/BAC009S0770W0325.wav", "txt": "近年来随着国内互联网企业的快速发展"} -{"key": "BAC009S0770W0326", "wav": "./aishell/wav/test/S0770/BAC009S0770W0326.wav", "txt": "宽带和服务器的采购量也水涨船高"} -{"key": "BAC009S0770W0327", "wav": "./aishell/wav/test/S0770/BAC009S0770W0327.wav", "txt": "由于各家之间竞争激烈"} -{"key": "BAC009S0770W0328", "wav": "./aishell/wav/test/S0770/BAC009S0770W0328.wav", "txt": "往往在采购过程中尽量压低报价"} -{"key": "BAC009S0770W0329", "wav": "./aishell/wav/test/S0770/BAC009S0770W0329.wav", "txt": "再加上互联网企业对服务器技术性可能等要求很高"} -{"key": "BAC009S0770W0330", "wav": "./aishell/wav/test/S0770/BAC009S0770W0330.wav", "txt": "很多服务器厂商进入做一两年"} -{"key": "BAC009S0770W0331", "wav": "./aishell/wav/test/S0770/BAC009S0770W0331.wav", "txt": "而浪潮从二零一零坚持做到现在"} -{"key": "BAC009S0770W0332", "wav": "./aishell/wav/test/S0770/BAC009S0770W0332.wav", "txt": "业内对其做法的解读是先凭着低价杀入市场"} -{"key": "BAC009S0770W0333", "wav": "./aishell/wav/test/S0770/BAC009S0770W0333.wav", "txt": "以品质和服务黏住用户"} -{"key": "BAC009S0770W0334", "wav": "./aishell/wav/test/S0770/BAC009S0770W0334.wav", "txt": "虽然面对赔钱赚吆喝的质疑"} -{"key": "BAC009S0770W0335", "wav": "./aishell/wav/test/S0770/BAC009S0770W0335.wav", "txt": "浪潮与海关总署启动战略合作助推智慧海关搜狐科技"} -{"key": "BAC009S0770W0336", "wav": "./aishell/wav/test/S0770/BAC009S0770W0336.wav", "txt": "浪潮集团与海关总署启动战略合作"} -{"key": "BAC009S0770W0337", "wav": "./aishell/wav/test/S0770/BAC009S0770W0337.wav", "txt": "合作范围遍及全国各直属海关及隶属海关"} -{"key": "BAC009S0770W0338", "wav": "./aishell/wav/test/S0770/BAC009S0770W0338.wav", "txt": "对于我而言现在已经成为了一种习惯与本能"} -{"key": "BAC009S0770W0339", "wav": "./aishell/wav/test/S0770/BAC009S0770W0339.wav", "txt": "有着较为丰富志愿服务经历的九零后吴雯的话"} -{"key": "BAC009S0770W0340", "wav": "./aishell/wav/test/S0770/BAC009S0770W0340.wav", "txt": "只是本次田径世锦志愿者这个大群体的一个缩影"} -{"key": "BAC009S0770W0341", "wav": "./aishell/wav/test/S0770/BAC009S0770W0341.wav", "txt": "他们有理由相信九零后同样可以做好"} -{"key": "BAC009S0770W0342", "wav": "./aishell/wav/test/S0770/BAC009S0770W0342.wav", "txt": "我希望能通过这次田径世锦赛"} -{"key": 
"BAC009S0770W0343", "wav": "./aishell/wav/test/S0770/BAC009S0770W0343.wav", "txt": "以及未来几年更多志愿经历"} -{"key": "BAC009S0770W0344", "wav": "./aishell/wav/test/S0770/BAC009S0770W0344.wav", "txt": "来为二零二二年的冬奥会积累经验"} -{"key": "BAC009S0770W0345", "wav": "./aishell/wav/test/S0770/BAC009S0770W0345.wav", "txt": "到时将会成为冬奥会志愿者的主力"} -{"key": "BAC009S0770W0346", "wav": "./aishell/wav/test/S0770/BAC009S0770W0346.wav", "txt": "张锦麟将为自己称为鸟巢新一代志愿者"} -{"key": "BAC009S0770W0347", "wav": "./aishell/wav/test/S0770/BAC009S0770W0347.wav", "txt": "他在为此时刻准备着"} -{"key": "BAC009S0770W0348", "wav": "./aishell/wav/test/S0770/BAC009S0770W0348.wav", "txt": "本报记者宋翔王薇"} -{"key": "BAC009S0770W0349", "wav": "./aishell/wav/test/S0770/BAC009S0770W0349.wav", "txt": "著名双人滑运动员庞清和董健虽未正式宣布退役"} -{"key": "BAC009S0770W0350", "wav": "./aishell/wav/test/S0770/BAC009S0770W0350.wav", "txt": "但现在的生活已经进入了准退役状态"} -{"key": "BAC009S0770W0351", "wav": "./aishell/wav/test/S0770/BAC009S0770W0351.wav", "txt": "两人把更多的精力放到了花滑运动的推广上"} -{"key": "BAC009S0770W0352", "wav": "./aishell/wav/test/S0770/BAC009S0770W0352.wav", "txt": "他俩组建了工作团队"} -{"key": "BAC009S0770W0353", "wav": "./aishell/wav/test/S0770/BAC009S0770W0353.wav", "txt": "过上了比运动员复杂得多的生活"} -{"key": "BAC009S0770W0355", "wav": "./aishell/wav/test/S0770/BAC009S0770W0355.wav", "txt": "九月初顺利通过了考试"} -{"key": "BAC009S0770W0356", "wav": "./aishell/wav/test/S0770/BAC009S0770W0356.wav", "txt": "佟健已经完成了第一个学模块的学习"} -{"key": "BAC009S0770W0357", "wav": "./aishell/wav/test/S0770/BAC009S0770W0357.wav", "txt": "常年的专业训练给身体带来了各种伤病"} -{"key": "BAC009S0770W0358", "wav": "./aishell/wav/test/S0770/BAC009S0770W0358.wav", "txt": "二零一四年索契冬奥会上"} -{"key": "BAC009S0770W0359", "wav": "./aishell/wav/test/S0770/BAC009S0770W0359.wav", "txt": "早到了退役年龄的庞清和佟健克服了伤病困难"} -{"key": "BAC009S0770W0360", "wav": "./aishell/wav/test/S0770/BAC009S0770W0360.wav", "txt": "但这对老将却以追梦无悔的精神"} -{"key": "BAC009S0770W0361", "wav": "./aishell/wav/test/S0770/BAC009S0770W0361.wav", "txt": "赢得了同行媒体和观众的敬意"} -{"key": "BAC009S0770W0362", "wav": "./aishell/wav/test/S0770/BAC009S0770W0362.wav", "txt": "庞清和佟健没有马上退役"} -{"key": "BAC009S0770W0363", "wav": "./aishell/wav/test/S0770/BAC009S0770W0363.wav", "txt": "而是坚持参加了今年三月的世界花滑锦标赛"} -{"key": "BAC009S0770W0364", "wav": "./aishell/wav/test/S0770/BAC009S0770W0364.wav", "txt": "一方面是他们从事花样滑冰二零多年"} -{"key": "BAC009S0770W0365", "wav": "./aishell/wav/test/S0770/BAC009S0770W0365.wav", "txt": "与这项运动结下深厚感情"} -{"key": "BAC009S0770W0366", "wav": "./aishell/wav/test/S0770/BAC009S0770W0366.wav", "txt": "始终对那块冰面恋恋不舍"} -{"key": "BAC009S0770W0367", "wav": "./aishell/wav/test/S0770/BAC009S0770W0367.wav", "txt": "也是中国双人滑在申雪赵宏退役后"} -{"key": "BAC009S0770W0368", "wav": "./aishell/wav/test/S0770/BAC009S0770W0368.wav", "txt": "庞清和佟健仍肩负着扛起中国双人滑大旗的重任"} -{"key": "BAC009S0770W0369", "wav": "./aishell/wav/test/S0770/BAC009S0770W0369.wav", "txt": "这让他们的退役迟迟没有提上日程"} -{"key": "BAC009S0770W0370", "wav": "./aishell/wav/test/S0770/BAC009S0770W0370.wav", "txt": "中国双人滑项目的后续发展应当有了较为清晰的前景"} -{"key": "BAC009S0770W0371", "wav": "./aishell/wav/test/S0770/BAC009S0770W0371.wav", "txt": "庞清和佟健终于可以放心地考虑退役的事情了"} -{"key": "BAC009S0770W0372", "wav": "./aishell/wav/test/S0770/BAC009S0770W0372.wav", "txt": "受大学生的提议启发"} -{"key": "BAC009S0770W0373", "wav": "./aishell/wav/test/S0770/BAC009S0770W0373.wav", "txt": "该公众号已经举办了两期公益活动"} -{"key": "BAC009S0770W0374", "wav": "./aishell/wav/test/S0770/BAC009S0770W0374.wav", "txt": "佟健又将国内部分优秀的单人滑和冰舞运动员集合起来"} -{"key": "BAC009S0770W0375", "wav": "./aishell/wav/test/S0770/BAC009S0770W0375.wav", "txt": "组建了花滑表演团队"} -{"key": 
"BAC009S0770W0376", "wav": "./aishell/wav/test/S0770/BAC009S0770W0376.wav", "txt": "与商业性冰场达成合作协议"} -{"key": "BAC009S0770W0377", "wav": "./aishell/wav/test/S0770/BAC009S0770W0377.wav", "txt": "以表演的方式推广花样滑冰"} -{"key": "BAC009S0770W0378", "wav": "./aishell/wav/test/S0770/BAC009S0770W0378.wav", "txt": "现成的选择就在面前"} -{"key": "BAC009S0770W0379", "wav": "./aishell/wav/test/S0770/BAC009S0770W0379.wav", "txt": "或进入体育行政机关"} -{"key": "BAC009S0770W0380", "wav": "./aishell/wav/test/S0770/BAC009S0770W0380.wav", "txt": "这些出路也是中国运动员比较常见的退役选择"} -{"key": "BAC009S0770W0381", "wav": "./aishell/wav/test/S0770/BAC009S0770W0381.wav", "txt": "但庞清和佟健并不愿意随遇而安地安排自己的后半生"} -{"key": "BAC009S0770W0382", "wav": "./aishell/wav/test/S0770/BAC009S0770W0382.wav", "txt": "自己和庞清曾在赛场上努力地追求优秀更优秀"} -{"key": "BAC009S0770W0383", "wav": "./aishell/wav/test/S0770/BAC009S0770W0383.wav", "txt": "他们对退役后的人生同样也有追求"} -{"key": "BAC009S0770W0384", "wav": "./aishell/wav/test/S0770/BAC009S0770W0384.wav", "txt": "佟健给自己定下了要做就做到最好"} -{"key": "BAC009S0770W0385", "wav": "./aishell/wav/test/S0770/BAC009S0770W0385.wav", "txt": "和绝不会是短期行为的基调"} -{"key": "BAC009S0770W0386", "wav": "./aishell/wav/test/S0770/BAC009S0770W0386.wav", "txt": "对于工作中遇到的管理经验和能力欠缺问题"} -{"key": "BAC009S0770W0387", "wav": "./aishell/wav/test/S0770/BAC009S0770W0387.wav", "txt": "佟健的解决办法就只能努力提高自己"} -{"key": "BAC009S0770W0388", "wav": "./aishell/wav/test/S0770/BAC009S0770W0388.wav", "txt": "佟健报考了北大光华管理学院"} -{"key": "BAC009S0770W0389", "wav": "./aishell/wav/test/S0770/BAC009S0770W0389.wav", "txt": "在九月初参加考试时"} -{"key": "BAC009S0770W0390", "wav": "./aishell/wav/test/S0770/BAC009S0770W0390.wav", "txt": "佟健做好了考不上的思想准备"} -{"key": "BAC009S0770W0391", "wav": "./aishell/wav/test/S0770/BAC009S0770W0391.wav", "txt": "佟健因此顺利通过了入学考试"} -{"key": "BAC009S0770W0392", "wav": "./aishell/wav/test/S0770/BAC009S0770W0392.wav", "txt": "佟健是同班同学里唯一运动员出身的"} -{"key": "BAC009S0770W0393", "wav": "./aishell/wav/test/S0770/BAC009S0770W0393.wav", "txt": "记者查阅相关资料发现"} -{"key": "BAC009S0770W0395", "wav": "./aishell/wav/test/S0770/BAC009S0770W0395.wav", "txt": "来自体育圈的并不多见"} -{"key": "BAC009S0770W0396", "wav": "./aishell/wav/test/S0770/BAC009S0770W0396.wav", "txt": "只有姚明和刘国梁等少数几个人"} -{"key": "BAC009S0770W0397", "wav": "./aishell/wav/test/S0770/BAC009S0770W0397.wav", "txt": "佟健希望自己能真的学到管理知识"} -{"key": "BAC009S0770W0398", "wav": "./aishell/wav/test/S0770/BAC009S0770W0398.wav", "txt": "管理知识肯定都是用的上的"} -{"key": "BAC009S0770W0399", "wav": "./aishell/wav/test/S0770/BAC009S0770W0399.wav", "txt": "至于中国花滑运动的推广"} -{"key": "BAC009S0770W0400", "wav": "./aishell/wav/test/S0770/BAC009S0770W0400.wav", "txt": "佟健更希望能有实实在在的发展"} -{"key": "BAC009S0770W0401", "wav": "./aishell/wav/test/S0770/BAC009S0770W0401.wav", "txt": "这同样需要有效的办法和手段"} -{"key": "BAC009S0770W0402", "wav": "./aishell/wav/test/S0770/BAC009S0770W0402.wav", "txt": "借着北京将要举办二零二二年冬奥会的东风"} -{"key": "BAC009S0770W0403", "wav": "./aishell/wav/test/S0770/BAC009S0770W0403.wav", "txt": "冰雪运动在中国势必会有一次发展高潮"} -{"key": "BAC009S0770W0404", "wav": "./aishell/wav/test/S0770/BAC009S0770W0404.wav", "txt": "很可能被雪藏下去"} -{"key": "BAC009S0770W0405", "wav": "./aishell/wav/test/S0770/BAC009S0770W0405.wav", "txt": "搜狐娱乐赛文耷子备受关注重拍版乌鸦"} -{"key": "BAC009S0770W0406", "wav": "./aishell/wav/test/S0770/BAC009S0770W0406.wav", "txt": "在原定男主角卢克伊万斯退出剧组之后"} -{"key": "BAC009S0770W0407", "wav": "./aishell/wav/test/S0770/BAC009S0770W0407.wav", "txt": "将双双加盟该片"} -{"key": "BAC009S0770W0408", "wav": "./aishell/wav/test/S0770/BAC009S0770W0408.wav", "txt": "搜狐娱乐据香港媒体报道"} -{"key": "BAC009S0770W0409", "wav": 
"./aishell/wav/test/S0770/BAC009S0770W0409.wav", "txt": "谢婷婷九月七日三十三岁生日"} -{"key": "BAC009S0770W0410", "wav": "./aishell/wav/test/S0770/BAC009S0770W0410.wav", "txt": "网友纷纷留言祝谢婷婷生日快乐"} -{"key": "BAC009S0770W0411", "wav": "./aishell/wav/test/S0770/BAC009S0770W0411.wav", "txt": "还拉赞姑还是那么漂亮"} -{"key": "BAC009S0770W0412", "wav": "./aishell/wav/test/S0770/BAC009S0770W0412.wav", "txt": "搜狐娱乐讯北京时间八月十二日消息"} -{"key": "BAC009S0770W0413", "wav": "./aishell/wav/test/S0770/BAC009S0770W0413.wav", "txt": "据香港媒体报道"} -{"key": "BAC009S0770W0414", "wav": "./aishell/wav/test/S0770/BAC009S0770W0414.wav", "txt": "谢贤昨天庆祝七十九岁生日"} -{"key": "BAC009S0770W0415", "wav": "./aishell/wav/test/S0770/BAC009S0770W0415.wav", "txt": "相约家人到谢霆锋家中上演十二道锋味私房菜"} -{"key": "BAC009S0770W0416", "wav": "./aishell/wav/test/S0770/BAC009S0770W0416.wav", "txt": "由于谢霆锋亲为家人做大厨"} -{"key": "BAC009S0770W0417", "wav": "./aishell/wav/test/S0770/BAC009S0770W0417.wav", "txt": "同场更有两个神秘嘉宾"} -{"key": "BAC009S0770W0419", "wav": "./aishell/wav/test/S0770/BAC009S0770W0419.wav", "txt": "搜狐娱乐讯据香港媒体报道"} -{"key": "BAC009S0770W0420", "wav": "./aishell/wav/test/S0770/BAC009S0770W0420.wav", "txt": "谢贤怒打曾江"} -{"key": "BAC009S0770W0421", "wav": "./aishell/wav/test/S0770/BAC009S0770W0421.wav", "txt": "究竟是演戏还是积怨已深"} -{"key": "BAC009S0770W0422", "wav": "./aishell/wav/test/S0770/BAC009S0770W0422.wav", "txt": "只有他们才知道"} -{"key": "BAC009S0770W0423", "wav": "./aishell/wav/test/S0770/BAC009S0770W0423.wav", "txt": "有不少幕后花絮片花"} -{"key": "BAC009S0770W0424", "wav": "./aishell/wav/test/S0770/BAC009S0770W0424.wav", "txt": "节目推出至今收视很高"} -{"key": "BAC009S0770W0425", "wav": "./aishell/wav/test/S0770/BAC009S0770W0425.wav", "txt": "下星期更进入结局周"} -{"key": "BAC009S0770W0426", "wav": "./aishell/wav/test/S0770/BAC009S0770W0426.wav", "txt": "曾江谢贤四哥及胡枫修哥大谈往日情时"} -{"key": "BAC009S0770W0427", "wav": "./aishell/wav/test/S0770/BAC009S0770W0427.wav", "txt": "曾江当时说我和谢贤相识多年"} -{"key": "BAC009S0770W0428", "wav": "./aishell/wav/test/S0770/BAC009S0770W0428.wav", "txt": "也没有发生什么冲突"} -{"key": "BAC009S0770W0429", "wav": "./aishell/wav/test/S0770/BAC009S0770W0429.wav", "txt": "不好的事情发生"} -{"key": "BAC009S0770W0430", "wav": "./aishell/wav/test/S0770/BAC009S0770W0430.wav", "txt": "怎料无心说话却一语成谶"} -{"key": "BAC009S0770W0431", "wav": "./aishell/wav/test/S0770/BAC009S0770W0431.wav", "txt": "经兴安县人大常委会许可"} -{"key": "BAC009S0770W0432", "wav": "./aishell/wav/test/S0770/BAC009S0770W0432.wav", "txt": "杨爱明被兴安警方刑事拘留"} -{"key": "BAC009S0770W0433", "wav": "./aishell/wav/test/S0770/BAC009S0770W0433.wav", "txt": "二零一四九二"} -{"key": "BAC009S0770W0434", "wav": "./aishell/wav/test/S0770/BAC009S0770W0434.wav", "txt": "兴安县法院判杨爱明拘役四个月"} -{"key": "BAC009S0770W0435", "wav": "./aishell/wav/test/S0770/BAC009S0770W0435.wav", "txt": "杨爱明却参加了兴安县第十五届人大五次会议"} -{"key": "BAC009S0770W0436", "wav": "./aishell/wav/test/S0770/BAC009S0770W0436.wav", "txt": "村支书骗拆迁款一二万获刑一一年"} -{"key": "BAC009S0770W0437", "wav": "./aishell/wav/test/S0770/BAC009S0770W0437.wav", "txt": "骗取搬迁补偿金一二二万元"} -{"key": "BAC009S0770W0438", "wav": "./aishell/wav/test/S0770/BAC009S0770W0438.wav", "txt": "北京晨报记者昨天获悉"} -{"key": "BAC009S0770W0439", "wav": "./aishell/wav/test/S0770/BAC009S0770W0439.wav", "txt": "延庆法院一审以贪污罪判处钱某有期徒刑一一年"} -{"key": "BAC009S0770W0440", "wav": "./aishell/wav/test/S0770/BAC009S0770W0440.wav", "txt": "村支书村民被政府工作人员土埋系邻里纠纷"} -{"key": "BAC009S0770W0441", "wav": "./aishell/wav/test/S0770/BAC009S0770W0441.wav", "txt": "河南省新乡市封丘县留光镇政府东五零零米左右"} -{"key": "BAC009S0770W0442", "wav": "./aishell/wav/test/S0770/BAC009S0770W0442.wav", "txt": "当地村民孙秋英在自家门口因是否垫路与邻居产生争执"} 
-{"key": "BAC009S0770W0443", "wav": "./aishell/wav/test/S0770/BAC009S0770W0443.wav", "txt": "遭到镇政府工作人员用土掩埋"} -{"key": "BAC009S0770W0444", "wav": "./aishell/wav/test/S0770/BAC009S0770W0444.wav", "txt": "肇事方为镇政府安全保卫人员"} -{"key": "BAC009S0770W0445", "wav": "./aishell/wav/test/S0770/BAC009S0770W0445.wav", "txt": "所开拉土车辆是镇政府扣押车辆"} -{"key": "BAC009S0770W0446", "wav": "./aishell/wav/test/S0770/BAC009S0770W0446.wav", "txt": "村支委办公室猥亵女童被刑拘的孩子奶奶在隔壁开会"} -{"key": "BAC009S0770W0447", "wav": "./aishell/wav/test/S0770/BAC009S0770W0447.wav", "txt": "海峡都市报大白天"} -{"key": "BAC009S0770W0448", "wav": "./aishell/wav/test/S0770/BAC009S0770W0448.wav", "txt": "在村委会办公楼书记办公室"} -{"key": "BAC009S0770W0449", "wav": "./aishell/wav/test/S0770/BAC009S0770W0449.wav", "txt": "五一岁的村支委猥亵一名一零岁的留守儿童隔壁"} -{"key": "BAC009S0770W0450", "wav": "./aishell/wav/test/S0770/BAC009S0770W0450.wav", "txt": "孩子的奶奶正在参加村里的道路环境综合治理工作会议"} -{"key": "BAC009S0770W0451", "wav": "./aishell/wav/test/S0770/BAC009S0770W0451.wav", "txt": "这事发生在福建省漳州市诏安县林头村"} -{"key": "BAC009S0770W0452", "wav": "./aishell/wav/test/S0770/BAC009S0770W0452.wav", "txt": "该村支委李某因涉嫌猥亵儿童被警方传唤"} -{"key": "BAC009S0770W0453", "wav": "./aishell/wav/test/S0770/BAC009S0770W0453.wav", "txt": "村民一零年在沙洲植树造林已成林却被指种错地方"} -{"key": "BAC009S0770W0454", "wav": "./aishell/wav/test/S0770/BAC009S0770W0454.wav", "txt": "两个村子之间的长江江面上"} -{"key": "BAC009S0770W0455", "wav": "./aishell/wav/test/S0770/BAC009S0770W0455.wav", "txt": "有一块面积近五零零零面积的沙洲"} -{"key": "BAC009S0770W0456", "wav": "./aishell/wav/test/S0770/BAC009S0770W0456.wav", "txt": "沙洲几乎年年被淹"} -{"key": "BAC009S0770W0457", "wav": "./aishell/wav/test/S0770/BAC009S0770W0457.wav", "txt": "村民二六零棵梨树被连根推倒在地里住房被夷为平地"} -{"key": "BAC009S0770W0458", "wav": "./aishell/wav/test/S0770/BAC009S0770W0458.wav", "txt": "华商报讯记者张林实习生邓泽惠一夜之间"} -{"key": "BAC009S0770W0459", "wav": "./aishell/wav/test/S0770/BAC009S0770W0459.wav", "txt": "村民地里二六零馀棵正在挂果的梨树被连根推倒"} -{"key": "BAC009S0770W0460", "wav": "./aishell/wav/test/S0770/BAC009S0770W0460.wav", "txt": "地头边的一间平房也被夷为平地"} -{"key": "BAC009S0770W0461", "wav": "./aishell/wav/test/S0770/BAC009S0770W0461.wav", "txt": "至今未找到肇事者"} -{"key": "BAC009S0770W0462", "wav": "./aishell/wav/test/S0770/BAC009S0770W0462.wav", "txt": "村民不满地讨说法要求楼盘开发商停工被拘留"} -{"key": "BAC009S0770W0463", "wav": "./aishell/wav/test/S0770/BAC009S0770W0463.wav", "txt": "去年一二月四日在村民多次上访无果的情况下"} -{"key": "BAC009S0770W0464", "wav": "./aishell/wav/test/S0770/BAC009S0770W0464.wav", "txt": "大家到施工现场的临时大门外"} -{"key": "BAC009S0770W0465", "wav": "./aishell/wav/test/S0770/BAC009S0770W0465.wav", "txt": "尽管检察管最后以事事实不清"} -{"key": "BAC009S0770W0466", "wav": "./aishell/wav/test/S0770/BAC009S0770W0466.wav", "txt": "但在张关押了三七天后"} -{"key": "BAC009S0770W0467", "wav": "./aishell/wav/test/S0770/BAC009S0770W0467.wav", "txt": "公安局仍采取了取保候审的手段"} -{"key": "BAC009S0770W0468", "wav": "./aishell/wav/test/S0770/BAC009S0770W0468.wav", "txt": "没有发生任何肢体冲突"} -{"key": "BAC009S0770W0469", "wav": "./aishell/wav/test/S0770/BAC009S0770W0469.wav", "txt": "更没有扰乱社会秩序"} -{"key": "BAC009S0770W0470", "wav": "./aishell/wav/test/S0770/BAC009S0770W0470.wav", "txt": "村民不满行政批复诉市区政府区长庭应诉"} -{"key": "BAC009S0770W0471", "wav": "./aishell/wav/test/S0770/BAC009S0770W0471.wav", "txt": "门头沟雁翅镇村民李冬梅因不服行政批复"} -{"key": "BAC009S0770W0472", "wav": "./aishell/wav/test/S0770/BAC009S0770W0472.wav", "txt": "将市区两级政府告上法庭"} -{"key": "BAC009S0770W0473", "wav": "./aishell/wav/test/S0770/BAC009S0770W0473.wav", "txt": "门头沟区长张贵林出庭应诉"} -{"key": "BAC009S0770W0474", "wav": "./aishell/wav/test/S0770/BAC009S0770W0474.wav", "txt": 
"门头沟雁翅镇村民李冬梅向市政府提起了行政复议"} -{"key": "BAC009S0770W0475", "wav": "./aishell/wav/test/S0770/BAC009S0770W0475.wav", "txt": "复议维持了区政府的认定结论"} -{"key": "BAC009S0770W0476", "wav": "./aishell/wav/test/S0770/BAC009S0770W0476.wav", "txt": "村民为多拿补偿在拆迁前突击装修全用劣质建材"} -{"key": "BAC009S0770W0477", "wav": "./aishell/wav/test/S0770/BAC009S0770W0477.wav", "txt": "村里随处可见装潢小广告"} -{"key": "BAC009S0770W0478", "wav": "./aishell/wav/test/S0770/BAC009S0770W0478.wav", "txt": "村民为救坠井男童身亡被拉出时呈托举姿势"} -{"key": "BAC009S0770W0479", "wav": "./aishell/wav/test/S0770/BAC009S0770W0479.wav", "txt": "为了一名坠入废井的男童"} -{"key": "BAC009S0770W0480", "wav": "./aishell/wav/test/S0770/BAC009S0770W0480.wav", "txt": "邳州几名村民先后下井救人"} -{"key": "BAC009S0770W0481", "wav": "./aishell/wav/test/S0770/BAC009S0770W0481.wav", "txt": "第一个下井救人的大叔却再也没能爬上来"} -{"key": "BAC009S0770W0482", "wav": "./aishell/wav/test/S0770/BAC009S0770W0482.wav", "txt": "他的双手还保持着托举的姿势"} -{"key": "BAC009S0770W0483", "wav": "./aishell/wav/test/S0770/BAC009S0770W0483.wav", "txt": "他的义举感动了四里八乡"} -{"key": "BAC009S0770W0484", "wav": "./aishell/wav/test/S0770/BAC009S0770W0484.wav", "txt": "七月一三日的葬礼上"} -{"key": "BAC009S0770W0485", "wav": "./aishell/wav/test/S0770/BAC009S0770W0485.wav", "txt": "数百名乡邻自发赶来送他一程"} -{"key": "BAC009S0770W0486", "wav": "./aishell/wav/test/S0770/BAC009S0770W0486.wav", "txt": "实习生郭杨雪通讯员耿万志现代快报记者李伟豪"} -{"key": "BAC009S0770W0487", "wav": "./aishell/wav/test/S0770/BAC009S0770W0487.wav", "txt": "村民为解决问题给领导建庙官员其诉求不合规"} -{"key": "BAC009S0770W0488", "wav": "./aishell/wav/test/S0770/BAC009S0770W0488.wav", "txt": "其在村西旁花费万元建起一名叫清明堂的家庙"} -{"key": "BAC009S0770W0489", "wav": "./aishell/wav/test/S0770/BAC009S0770W0489.wav", "txt": "每天烧香敬拜办事处主任"} -{"key": "BAC009S0770W0490", "wav": "./aishell/wav/test/S0770/BAC009S0770W0490.wav", "txt": "该事件引发社会关注"} -{"key": "BAC009S0770W0491", "wav": "./aishell/wav/test/S0770/BAC009S0770W0491.wav", "txt": "以上两村民所要求的内容不符合相关规定"} -{"key": "BAC009S0770W0492", "wav": "./aishell/wav/test/S0770/BAC009S0770W0492.wav", "txt": "村民为阻止儿子与女友相见编造偷小孩谎言"} -{"key": "BAC009S0770W0493", "wav": "./aishell/wav/test/S0770/BAC009S0770W0493.wav", "txt": "涉嫌编造谣言非法拘禁被刑拘"} -{"key": "BAC009S0770W0494", "wav": "./aishell/wav/test/S0770/BAC009S0770W0494.wav", "txt": "村民举报县城干部建十馀栋别墅纪检委部门介入调查"} -{"key": "BAC009S0770W0495", "wav": "./aishell/wav/test/S0770/BAC009S0770W0495.wav", "txt": "村小组干部未经过小组集集体讨论"} -{"key": "BAC009S0901W0121", "wav": "./aishell/wav/test/S0901/BAC009S0901W0121.wav", "txt": "作为一线城市的北京"} -{"key": "BAC009S0901W0122", "wav": "./aishell/wav/test/S0901/BAC009S0901W0122.wav", "txt": "其市管国管住房公积金政策也均进行调整"} -{"key": "BAC009S0901W0123", "wav": "./aishell/wav/test/S0901/BAC009S0901W0123.wav", "txt": "公积金贷款最高额度由七万元提升至十万元"} -{"key": "BAC009S0901W0124", "wav": "./aishell/wav/test/S0901/BAC009S0901W0124.wav", "txt": "公积金政策调整方式各异对楼市影响几何"} -{"key": "BAC009S0901W0125", "wav": "./aishell/wav/test/S0901/BAC009S0901W0125.wav", "txt": "盘活各地公积金资源"} -{"key": "BAC009S0901W0126", "wav": "./aishell/wav/test/S0901/BAC009S0901W0126.wav", "txt": "以北京提高公积金贷款最高额度为例"} -{"key": "BAC009S0901W0127", "wav": "./aishell/wav/test/S0901/BAC009S0901W0127.wav", "txt": "据伟嘉安捷数据统计显示"} -{"key": "BAC009S0901W0128", "wav": "./aishell/wav/test/S0901/BAC009S0901W0128.wav", "txt": "该政策在七月份实施一周后"} -{"key": "BAC009S0901W0129", "wav": "./aishell/wav/test/S0901/BAC009S0901W0129.wav", "txt": "公积金贷款额度的提高"} -{"key": "BAC009S0901W0130", "wav": "./aishell/wav/test/S0901/BAC009S0901W0130.wav", "txt": "将使更多购房者具备买房支付能力"} -{"key": "BAC009S0901W0131", "wav": "./aishell/wav/test/S0901/BAC009S0901W0131.wav", "txt": 
"中原地产首席分析师张大伟认为"} -{"key": "BAC009S0901W0132", "wav": "./aishell/wav/test/S0901/BAC009S0901W0132.wav", "txt": "放宽提取住房公积金支付房租条件则对楼市影响甚微"} -{"key": "BAC009S0901W0133", "wav": "./aishell/wav/test/S0901/BAC009S0901W0133.wav", "txt": "对楼市也有较大影响"} -{"key": "BAC009S0901W0134", "wav": "./aishell/wav/test/S0901/BAC009S0901W0134.wav", "txt": "利用公积金可以减少租赁者负担"} -{"key": "BAC009S0901W0135", "wav": "./aishell/wav/test/S0901/BAC009S0901W0135.wav", "txt": "使其缓冲过度到买房阶段"} -{"key": "BAC009S0901W0136", "wav": "./aishell/wav/test/S0901/BAC009S0901W0136.wav", "txt": "对楼市消化库存起到正面作用"} -{"key": "BAC009S0901W0137", "wav": "./aishell/wav/test/S0901/BAC009S0901W0137.wav", "txt": "中新网房产频道每每"} -{"key": "BAC009S0901W0138", "wav": "./aishell/wav/test/S0901/BAC009S0901W0138.wav", "txt": "要求各地放宽公积金贷款条件后"} -{"key": "BAC009S0901W0139", "wav": "./aishell/wav/test/S0901/BAC009S0901W0139.wav", "txt": "美丽北京大型绿色公益品牌项目"} -{"key": "BAC009S0901W0140", "wav": "./aishell/wav/test/S0901/BAC009S0901W0140.wav", "txt": "住建部等三部委再次联合发"} -{"key": "BAC009S0901W0141", "wav": "./aishell/wav/test/S0901/BAC009S0901W0141.wav", "txt": "美丽北京大型绿色公益品牌项目"} -{"key": "BAC009S0901W0142", "wav": "./aishell/wav/test/S0901/BAC009S0901W0142.wav", "txt": "随着广州住房公积金贷款政策的调整实施"} -{"key": "BAC009S0901W0143", "wav": "./aishell/wav/test/S0901/BAC009S0901W0143.wav", "txt": "公积金贷款最高额度也不同程度上调"} -{"key": "BAC009S0901W0144", "wav": "./aishell/wav/test/S0901/BAC009S0901W0144.wav", "txt": "住房公积金贷款因其利率较低的优势"} -{"key": "BAC009S0901W0145", "wav": "./aishell/wav/test/S0901/BAC009S0901W0145.wav", "txt": "一直以来广受购房者青睐"} -{"key": "BAC009S0901W0146", "wav": "./aishell/wav/test/S0901/BAC009S0901W0146.wav", "txt": "本轮住房公积金房贷政策调整"} -{"key": "BAC009S0901W0147", "wav": "./aishell/wav/test/S0901/BAC009S0901W0147.wav", "txt": "进一步加速了消费者的入市节奏"} -{"key": "BAC009S0901W0148", "wav": "./aishell/wav/test/S0901/BAC009S0901W0148.wav", "txt": "广州调整住房公积金个人住房贷款政策"} -{"key": "BAC009S0901W0149", "wav": "./aishell/wav/test/S0901/BAC009S0901W0149.wav", "txt": "同时对申请公积金贷款的缴纳时限调整为五个月"} -{"key": "BAC009S0901W0150", "wav": "./aishell/wav/test/S0901/BAC009S0901W0150.wav", "txt": "据广州日报昨天报道"} -{"key": "BAC009S0901W0151", "wav": "./aishell/wav/test/S0901/BAC009S0901W0151.wav", "txt": "公积金贷款首付比例降低的消息令购房者喜出望外"} -{"key": "BAC009S0901W0152", "wav": "./aishell/wav/test/S0901/BAC009S0901W0152.wav", "txt": "其中刚需买家入市积极性明显提高"} -{"key": "BAC009S0901W0153", "wav": "./aishell/wav/test/S0901/BAC009S0901W0153.wav", "txt": "据伟嘉安捷提供的数据显示"} -{"key": "BAC009S0901W0154", "wav": "./aishell/wav/test/S0901/BAC009S0901W0154.wav", "txt": "北京公积金贷款首付比例松绑一周后"} -{"key": "BAC009S0901W0155", "wav": "./aishell/wav/test/S0901/BAC009S0901W0155.wav", "txt": "公积金贷款及组合贷咨询量明显上涨"} -{"key": "BAC009S0901W0156", "wav": "./aishell/wav/test/S0901/BAC009S0901W0156.wav", "txt": "尤其组合贷的咨询量较上月月初一周上涨百分之五左右"} -{"key": "BAC009S0901W0157", "wav": "./aishell/wav/test/S0901/BAC009S0901W0157.wav", "txt": "上海深圳等主要城市也在公积金新政推动下"} -{"key": "BAC009S0901W0158", "wav": "./aishell/wav/test/S0901/BAC009S0901W0158.wav", "txt": "呈现购房者积极入市的行情"} -{"key": "BAC009S0901W0159", "wav": "./aishell/wav/test/S0901/BAC009S0901W0159.wav", "txt": "全国已有超百个城市发布了不同力度的公积金松绑政策"} -{"key": "BAC009S0901W0160", "wav": "./aishell/wav/test/S0901/BAC009S0901W0160.wav", "txt": "加之降息降准等政策组合拳"} -{"key": "BAC009S0901W0161", "wav": "./aishell/wav/test/S0901/BAC009S0901W0161.wav", "txt": "呈现出量价齐涨的局面"} -{"key": "BAC009S0901W0162", "wav": "./aishell/wav/test/S0901/BAC009S0901W0162.wav", "txt": "据中国指数研究院最新数据显示"} -{"key": "BAC009S0901W0163", "wav": "./aishell/wav/test/S0901/BAC009S0901W0163.wav", "txt": 
"深圳环比上涨百分之七"} -{"key": "BAC009S0901W0164", "wav": "./aishell/wav/test/S0901/BAC009S0901W0164.wav", "txt": "涨幅据十大城市之首"} -{"key": "BAC009S0901W0165", "wav": "./aishell/wav/test/S0901/BAC009S0901W0165.wav", "txt": "五月份多地楼市的成交量明显上涨"} -{"key": "BAC009S0901W0166", "wav": "./aishell/wav/test/S0901/BAC009S0901W0166.wav", "txt": "是房地产当前发展格局下的一个必然"} -{"key": "BAC009S0901W0167", "wav": "./aishell/wav/test/S0901/BAC009S0901W0167.wav", "txt": "唯独这样才能盘活公积金资源"} -{"key": "BAC009S0901W0168", "wav": "./aishell/wav/test/S0901/BAC009S0901W0168.wav", "txt": "促使更多购房者积极入市"} -{"key": "BAC009S0901W0169", "wav": "./aishell/wav/test/S0901/BAC009S0901W0169.wav", "txt": "伴随着各地中住房公积金新政的落地实施"} -{"key": "BAC009S0901W0170", "wav": "./aishell/wav/test/S0901/BAC009S0901W0170.wav", "txt": "楼市进展仍需进一步观望"} -{"key": "BAC009S0901W0171", "wav": "./aishell/wav/test/S0901/BAC009S0901W0171.wav", "txt": "购房者受惠于政策利好的同时"} -{"key": "BAC009S0901W0172", "wav": "./aishell/wav/test/S0901/BAC009S0901W0172.wav", "txt": "公积金在申请放贷流程上并未提速"} -{"key": "BAC009S0901W0173", "wav": "./aishell/wav/test/S0901/BAC009S0901W0173.wav", "txt": "相反相关环节上审批更加严格"} -{"key": "BAC009S0901W0174", "wav": "./aishell/wav/test/S0901/BAC009S0901W0174.wav", "txt": "从目前上海住房公积金的具体政策看"} -{"key": "BAC009S0901W0175", "wav": "./aishell/wav/test/S0901/BAC009S0901W0175.wav", "txt": "购房的扶持力度在加大"} -{"key": "BAC009S0901W0176", "wav": "./aishell/wav/test/S0901/BAC009S0901W0176.wav", "txt": "但主要还是体现在贷款成本的降低"} -{"key": "BAC009S0901W0177", "wav": "./aishell/wav/test/S0901/BAC009S0901W0177.wav", "txt": "而申请公积金贷款方面还是需要走严格的流程"} -{"key": "BAC009S0901W0178", "wav": "./aishell/wav/test/S0901/BAC009S0901W0178.wav", "txt": "公积金提取一直是目前试图突破的内容"} -{"key": "BAC009S0901W0179", "wav": "./aishell/wav/test/S0901/BAC009S0901W0179.wav", "txt": "但目前还未出现大面积提取行为"} -{"key": "BAC009S0901W0180", "wav": "./aishell/wav/test/S0901/BAC009S0901W0180.wav", "txt": "来自广州日报的报道称"} -{"key": "BAC009S0901W0181", "wav": "./aishell/wav/test/S0901/BAC009S0901W0181.wav", "txt": "从申请到最后的拨放款"} -{"key": "BAC009S0901W0182", "wav": "./aishell/wav/test/S0901/BAC009S0901W0182.wav", "txt": "部分客户甚至等两个多月"} -{"key": "BAC009S0901W0183", "wav": "./aishell/wav/test/S0901/BAC009S0901W0183.wav", "txt": "如果申请公积金贷款或公积金贷款与商业贷款的组合贷"} -{"key": "BAC009S0901W0184", "wav": "./aishell/wav/test/S0901/BAC009S0901W0184.wav", "txt": "伟嘉安捷对中新网房产频道表示"} -{"key": "BAC009S0901W0185", "wav": "./aishell/wav/test/S0901/BAC009S0901W0185.wav", "txt": "现在公积金贷款办理需要一个月左右的时间"} -{"key": "BAC009S0901W0186", "wav": "./aishell/wav/test/S0901/BAC009S0901W0186.wav", "txt": "而申请办理组合贷款的手续则更为复杂"} -{"key": "BAC009S0901W0187", "wav": "./aishell/wav/test/S0901/BAC009S0901W0187.wav", "txt": "农业现代化水平显着提升"} -{"key": "BAC009S0901W0188", "wav": "./aishell/wav/test/S0901/BAC009S0901W0188.wav", "txt": "发展现代农业的条件更加有利"} -{"key": "BAC009S0901W0189", "wav": "./aishell/wav/test/S0901/BAC009S0901W0189.wav", "txt": "加快发展现代农业机遇遇得"} -{"key": "BAC009S0901W0190", "wav": "./aishell/wav/test/S0901/BAC009S0901W0190.wav", "txt": "一是工业化城镇化的引领推动作用将更加明显"} -{"key": "BAC009S0901W0191", "wav": "./aishell/wav/test/S0901/BAC009S0901W0191.wav", "txt": "信息化水平不断提高"} -{"key": "BAC009S0901W0192", "wav": "./aishell/wav/test/S0901/BAC009S0901W0192.wav", "txt": "农村劳动力大量转移"} -{"key": "BAC009S0901W0193", "wav": "./aishell/wav/test/S0901/BAC009S0901W0193.wav", "txt": "以及扩大内需战略的实施"} -{"key": "BAC009S0901W0194", "wav": "./aishell/wav/test/S0901/BAC009S0901W0194.wav", "txt": "二是政策支持将更加强化"} -{"key": "BAC009S0901W0195", "wav": "./aishell/wav/test/S0901/BAC009S0901W0195.wav", "txt": "随着我国综合国力和财政实力不断增强"} -{"key": 
"BAC009S0901W0196", "wav": "./aishell/wav/test/S0901/BAC009S0901W0196.wav", "txt": "强农惠农富农政策力度将进一步加大"} -{"key": "BAC009S0901W0197", "wav": "./aishell/wav/test/S0901/BAC009S0901W0197.wav", "txt": "支持现代农业发展的物质基础更加牢固"} -{"key": "BAC009S0901W0198", "wav": "./aishell/wav/test/S0901/BAC009S0901W0198.wav", "txt": "三是科技支撑将更加有力"} -{"key": "BAC009S0901W0199", "wav": "./aishell/wav/test/S0901/BAC009S0901W0199.wav", "txt": "科技创新孕育新突破"} -{"key": "BAC009S0901W0200", "wav": "./aishell/wav/test/S0901/BAC009S0901W0200.wav", "txt": "全球绿色经济低碳技术正在兴起"} -{"key": "BAC009S0901W0201", "wav": "./aishell/wav/test/S0901/BAC009S0901W0201.wav", "txt": "现代农业发展的动力更加强劲"} -{"key": "BAC009S0901W0202", "wav": "./aishell/wav/test/S0901/BAC009S0901W0202.wav", "txt": "四是外部环境将更加优化"} -{"key": "BAC009S0901W0203", "wav": "./aishell/wav/test/S0901/BAC009S0901W0203.wav", "txt": "形成合力推进现代农业发展的新局面"} -{"key": "BAC009S0901W0204", "wav": "./aishell/wav/test/S0901/BAC009S0901W0204.wav", "txt": "广大农民的积极性创造性将得到进一步激发和释放"} -{"key": "BAC009S0901W0205", "wav": "./aishell/wav/test/S0901/BAC009S0901W0205.wav", "txt": "发展现代农业的要求更加迫切"} -{"key": "BAC009S0901W0206", "wav": "./aishell/wav/test/S0901/BAC009S0901W0206.wav", "txt": "在工业化城镇化快速推进时期"} -{"key": "BAC009S0901W0207", "wav": "./aishell/wav/test/S0901/BAC009S0901W0207.wav", "txt": "农业面临着容易被忽视或削弱的风险"} -{"key": "BAC009S0901W0208", "wav": "./aishell/wav/test/S0901/BAC009S0901W0208.wav", "txt": "我国工业化城镇化快速发展"} -{"key": "BAC009S0901W0209", "wav": "./aishell/wav/test/S0901/BAC009S0901W0209.wav", "txt": "但农业现代化明显滞后"} -{"key": "BAC009S0901W0210", "wav": "./aishell/wav/test/S0901/BAC009S0901W0210.wav", "txt": "面临着一系列严峻挑战"} -{"key": "BAC009S0901W0211", "wav": "./aishell/wav/test/S0901/BAC009S0901W0211.wav", "txt": "科技创新和推广新应用能力不强"} -{"key": "BAC009S0901W0212", "wav": "./aishell/wav/test/S0901/BAC009S0901W0212.wav", "txt": "农业社会化服务体系不健全"} -{"key": "BAC009S0901W0213", "wav": "./aishell/wav/test/S0901/BAC009S0901W0213.wav", "txt": "国际农产品市场投机炒作及传导影响加深"} -{"key": "BAC009S0901W0214", "wav": "./aishell/wav/test/S0901/BAC009S0901W0214.wav", "txt": "我国现代农业发展面临更多的外部不确定性"} -{"key": "BAC009S0901W0215", "wav": "./aishell/wav/test/S0901/BAC009S0901W0215.wav", "txt": "必须珍惜抓住用好难得的历史机遇"} -{"key": "BAC009S0901W0216", "wav": "./aishell/wav/test/S0901/BAC009S0901W0216.wav", "txt": "坚持用现代物质条件装备农业"} -{"key": "BAC009S0901W0217", "wav": "./aishell/wav/test/S0901/BAC009S0901W0217.wav", "txt": "努力探索出一条具有中国特色的农业现代化道路"} -{"key": "BAC009S0901W0218", "wav": "./aishell/wav/test/S0901/BAC009S0901W0218.wav", "txt": "指导思想基本原则与发展目标"} -{"key": "BAC009S0901W0219", "wav": "./aishell/wav/test/S0901/BAC009S0901W0219.wav", "txt": "以邓小平理论和三个代表重要思想为指导"} -{"key": "BAC009S0901W0220", "wav": "./aishell/wav/test/S0901/BAC009S0901W0220.wav", "txt": "深入贯彻落实科学发展观"} -{"key": "BAC009S0901W0221", "wav": "./aishell/wav/test/S0901/BAC009S0901W0221.wav", "txt": "坚持走中国特色农业现代化道路"} -{"key": "BAC009S0901W0222", "wav": "./aishell/wav/test/S0901/BAC009S0901W0222.wav", "txt": "以转变农业发展方式为主线"} -{"key": "BAC009S0901W0223", "wav": "./aishell/wav/test/S0901/BAC009S0901W0223.wav", "txt": "着力强化政策科技设施装备人才和体制支撑"} -{"key": "BAC009S0901W0224", "wav": "./aishell/wav/test/S0901/BAC009S0901W0224.wav", "txt": "着力完善现代农业产业体系"} -{"key": "BAC009S0901W0225", "wav": "./aishell/wav/test/S0901/BAC009S0901W0225.wav", "txt": "提高农业现代化水平农民生活水平和新农村建设水平"} -{"key": "BAC009S0901W0226", "wav": "./aishell/wav/test/S0901/BAC009S0901W0226.wav", "txt": "坚持确保国家粮食安全"} -{"key": "BAC009S0901W0227", "wav": "./aishell/wav/test/S0901/BAC009S0901W0227.wav", "txt": "坚持立足国内实现粮食基本自给的方针"} -{"key": 
"BAC009S0901W0228", "wav": "./aishell/wav/test/S0901/BAC009S0901W0228.wav", "txt": "实行最严格的耕地保护和节约用地制度"} -{"key": "BAC009S0901W0229", "wav": "./aishell/wav/test/S0901/BAC009S0901W0229.wav", "txt": "加强农业基础设施建设"} -{"key": "BAC009S0901W0230", "wav": "./aishell/wav/test/S0901/BAC009S0901W0230.wav", "txt": "着力提高粮食综合生产能力"} -{"key": "BAC009S0901W0231", "wav": "./aishell/wav/test/S0901/BAC009S0901W0231.wav", "txt": "坚持和完善农村基本经营制度"} -{"key": "BAC009S0901W0232", "wav": "./aishell/wav/test/S0901/BAC009S0901W0232.wav", "txt": "在保持农村土地承包关系稳定并长久不变的前提下"} -{"key": "BAC009S0901W0233", "wav": "./aishell/wav/test/S0901/BAC009S0901W0233.wav", "txt": "推进农业经营体系体制创新"} -{"key": "BAC009S0901W0234", "wav": "./aishell/wav/test/S0901/BAC009S0901W0234.wav", "txt": "坚持科教兴农和人才强农"} -{"key": "BAC009S0901W0235", "wav": "./aishell/wav/test/S0901/BAC009S0901W0235.wav", "txt": "加快农业科技自主创新和农业农村人才培养"} -{"key": "BAC009S0901W0236", "wav": "./aishell/wav/test/S0901/BAC009S0901W0236.wav", "txt": "加快农业科技成果转化与推广应用"} -{"key": "BAC009S0901W0237", "wav": "./aishell/wav/test/S0901/BAC009S0901W0237.wav", "txt": "提高农业物质技术水装备水平"} -{"key": "BAC009S0901W0238", "wav": "./aishell/wav/test/S0901/BAC009S0901W0238.wav", "txt": "坚持政府支持农民主体社会参与"} -{"key": "BAC009S0901W0239", "wav": "./aishell/wav/test/S0901/BAC009S0901W0239.wav", "txt": "加大强农惠农富农力度"} -{"key": "BAC009S0901W0240", "wav": "./aishell/wav/test/S0901/BAC009S0901W0240.wav", "txt": "充分发挥农民的主体作用和首创精神"} -{"key": "BAC009S0901W0241", "wav": "./aishell/wav/test/S0901/BAC009S0901W0241.wav", "txt": "引导和鼓励社会资本投入农业"} -{"key": "BAC009S0901W0242", "wav": "./aishell/wav/test/S0901/BAC009S0901W0242.wav", "txt": "合力推进现代农业发展"} -{"key": "BAC009S0901W0243", "wav": "./aishell/wav/test/S0901/BAC009S0901W0243.wav", "txt": "坚持分类指导重点突破梯次推进"} -{"key": "BAC009S0901W0244", "wav": "./aishell/wav/test/S0901/BAC009S0901W0244.wav", "txt": "进一步优化农业生产力布局"} -{"key": "BAC009S0901W0245", "wav": "./aishell/wav/test/S0901/BAC009S0901W0245.wav", "txt": "因地制宜地采取有选择差别化扶持政策"} -{"key": "BAC009S0901W0246", "wav": "./aishell/wav/test/S0901/BAC009S0901W0246.wav", "txt": "支持主要农产品优势产区建设"} -{"key": "BAC009S0901W0247", "wav": "./aishell/wav/test/S0901/BAC009S0901W0247.wav", "txt": "鼓励有条件地区率先实现农业现代化"} -{"key": "BAC009S0901W0248", "wav": "./aishell/wav/test/S0901/BAC009S0901W0248.wav", "txt": "推动其他地区加快发展"} -{"key": "BAC009S0901W0249", "wav": "./aishell/wav/test/S0901/BAC009S0901W0249.wav", "txt": "全面提高农业现代化水平"} -{"key": "BAC009S0901W0250", "wav": "./aishell/wav/test/S0901/BAC009S0901W0250.wav", "txt": "现代农业建设取得明显进展"} -{"key": "BAC009S0901W0251", "wav": "./aishell/wav/test/S0901/BAC009S0901W0251.wav", "txt": "粮食等主要农产品供给得到有效保障"} -{"key": "BAC009S0901W0252", "wav": "./aishell/wav/test/S0901/BAC009S0901W0252.wav", "txt": "物质装备水平明显提高"} -{"key": "BAC009S0901W0253", "wav": "./aishell/wav/test/S0901/BAC009S0901W0253.wav", "txt": "并没有提供什么帮助"} -{"key": "BAC009S0901W0254", "wav": "./aishell/wav/test/S0901/BAC009S0901W0254.wav", "txt": "由于关于乔布斯的电话即将上演了"} -{"key": "BAC009S0901W0255", "wav": "./aishell/wav/test/S0901/BAC009S0901W0255.wav", "txt": "想了解苹果最初的事"} -{"key": "BAC009S0901W0256", "wav": "./aishell/wav/test/S0901/BAC009S0901W0256.wav", "txt": "乔布斯在最初产品开发过程中"} -{"key": "BAC009S0901W0257", "wav": "./aishell/wav/test/S0901/BAC009S0901W0257.wav", "txt": "到底发挥了什么作用"} -{"key": "BAC009S0901W0259", "wav": "./aishell/wav/test/S0901/BAC009S0901W0259.wav", "txt": "乔布斯几乎没发挥什么作用"} -{"key": "BAC009S0901W0261", "wav": "./aishell/wav/test/S0901/BAC009S0901W0261.wav", "txt": "而这都是我自己的努力"} -{"key": "BAC009S0901W0262", "wav": 
"./aishell/wav/test/S0901/BAC009S0901W0262.wav", "txt": "乔布斯在它出现之前都不知道它的存在"} -{"key": "BAC009S0901W0263", "wav": "./aishell/wav/test/S0901/BAC009S0901W0263.wav", "txt": "不过这话他在去年就说过"} -{"key": "BAC009S0901W0264", "wav": "./aishell/wav/test/S0901/BAC009S0901W0264.wav", "txt": "其中一个回答就说过"} -{"key": "BAC009S0901W0265", "wav": "./aishell/wav/test/S0901/BAC009S0901W0265.wav", "txt": "乔布斯不是一名工程师"} -{"key": "BAC009S0901W0266", "wav": "./aishell/wav/test/S0901/BAC009S0901W0266.wav", "txt": "他从来没有写过代码"} -{"key": "BAC009S0901W0267", "wav": "./aishell/wav/test/S0901/BAC009S0901W0267.wav", "txt": "也没有参与过任何产品的原始设计"} -{"key": "BAC009S0901W0268", "wav": "./aishell/wav/test/S0901/BAC009S0901W0268.wav", "txt": "乔帮主并没有他说的那么不堪"} -{"key": "BAC009S0901W0269", "wav": "./aishell/wav/test/S0901/BAC009S0901W0269.wav", "txt": "沃兹尼亚克自己也说"} -{"key": "BAC009S0901W0270", "wav": "./aishell/wav/test/S0901/BAC009S0901W0270.wav", "txt": "乔布斯想成为重要人物"} -{"key": "BAC009S0901W0271", "wav": "./aishell/wav/test/S0901/BAC009S0901W0271.wav", "txt": "而这种人通常是商业人士"} -{"key": "BAC009S0901W0272", "wav": "./aishell/wav/test/S0901/BAC009S0901W0272.wav", "txt": "他是一个杰出的商人"} -{"key": "BAC009S0901W0273", "wav": "./aishell/wav/test/S0901/BAC009S0901W0273.wav", "txt": "一个公司不能缺少两种人"} -{"key": "BAC009S0901W0274", "wav": "./aishell/wav/test/S0901/BAC009S0901W0274.wav", "txt": "公司的成功缺一不可"} -{"key": "BAC009S0901W0275", "wav": "./aishell/wav/test/S0901/BAC009S0901W0275.wav", "txt": "而沃兹尼亚克似乎乐于承担驱魅的角色"} -{"key": "BAC009S0901W0276", "wav": "./aishell/wav/test/S0901/BAC009S0901W0276.wav", "txt": "车库没有发挥过太大作用"} -{"key": "BAC009S0901W0277", "wav": "./aishell/wav/test/S0901/BAC009S0901W0277.wav", "txt": "除了有时候让他们觉得那里像家"} -{"key": "BAC009S0901W0278", "wav": "./aishell/wav/test/S0901/BAC009S0901W0278.wav", "txt": "车库虽然最能够代表初期创业"} -{"key": "BAC009S0901W0279", "wav": "./aishell/wav/test/S0901/BAC009S0901W0279.wav", "txt": "但是在那没做任何设计工作"} -{"key": "BAC009S0901W0280", "wav": "./aishell/wav/test/S0901/BAC009S0901W0280.wav", "txt": "他还吐槽过乔布斯电影中的桥段"} -{"key": "BAC009S0901W0281", "wav": "./aishell/wav/test/S0901/BAC009S0901W0281.wav", "txt": "他从未对产品被偷发表过任何评论"} -{"key": "BAC009S0901W0282", "wav": "./aishell/wav/test/S0901/BAC009S0901W0282.wav", "txt": "并不像乔布斯那样激动"} -{"key": "BAC009S0901W0283", "wav": "./aishell/wav/test/S0901/BAC009S0901W0283.wav", "txt": "我们外人是无法知道真相的"} -{"key": "BAC009S0901W0284", "wav": "./aishell/wav/test/S0901/BAC009S0901W0284.wav", "txt": "原创张驰乔布斯逝世已久"} -{"key": "BAC009S0901W0285", "wav": "./aishell/wav/test/S0901/BAC009S0901W0285.wav", "txt": "而苹果的另一位联合创始人沃兹尼亚克还活跃在科技圈"} -{"key": "BAC009S0901W0286", "wav": "./aishell/wav/test/S0901/BAC009S0901W0286.wav", "txt": "而且以喜欢点评各家公司着称"} -{"key": "BAC009S0901W0287", "wav": "./aishell/wav/test/S0901/BAC009S0901W0287.wav", "txt": "乔帮主在首批苹果产品的开发中"} -{"key": "BAC009S0901W0288", "wav": "./aishell/wav/test/S0901/BAC009S0901W0288.wav", "txt": "苹果股价下跌百分之五分析师出现重大分歧搜狐科技"} -{"key": "BAC009S0901W0289", "wav": "./aishell/wav/test/S0901/BAC009S0901W0289.wav", "txt": "本报记者纪佳鹏北京报道北京时间八月十二日"} -{"key": "BAC009S0901W0290", "wav": "./aishell/wav/test/S0901/BAC009S0901W0290.wav", "txt": "作为科技股领头羊的苹果股份当天下挫百分之二"} -{"key": "BAC009S0901W0291", "wav": "./aishell/wav/test/S0901/BAC009S0901W0291.wav", "txt": "人民币的贬值很可能会增加苹果设备进口的费用"} -{"key": "BAC009S0901W0292", "wav": "./aishell/wav/test/S0901/BAC009S0901W0292.wav", "txt": "这也是影响股价的一大因素"} -{"key": "BAC009S0901W0293", "wav": "./aishell/wav/test/S0901/BAC009S0901W0293.wav", "txt": "苹果股价的这轮连续下跌"} -{"key": "BAC009S0901W0294", "wav": "./aishell/wav/test/S0901/BAC009S0901W0294.wav", 
"txt": "从今年的七月二十一日便开始了"} -{"key": "BAC009S0901W0295", "wav": "./aishell/wav/test/S0901/BAC009S0901W0295.wav", "txt": "苹果股价已下挫了百分之七十九"} -{"key": "BAC009S0901W0296", "wav": "./aishell/wav/test/S0901/BAC009S0901W0296.wav", "txt": "不少报道与评论表示"} -{"key": "BAC009S0901W0299", "wav": "./aishell/wav/test/S0901/BAC009S0901W0299.wav", "txt": "也过分依赖于大中华地区"} -{"key": "BAC009S0901W0300", "wav": "./aishell/wav/test/S0901/BAC009S0901W0300.wav", "txt": "甚至是负增长而其中"} -{"key": "BAC009S0901W0301", "wav": "./aishell/wav/test/S0901/BAC009S0901W0301.wav", "txt": "根据近期公布的苹果第三财季业业绩"} -{"key": "BAC009S0901W0302", "wav": "./aishell/wav/test/S0901/BAC009S0901W0302.wav", "txt": "该季度苹果大中华区营收为一百三十二点三亿美元"} -{"key": "BAC009S0901W0303", "wav": "./aishell/wav/test/S0901/BAC009S0901W0303.wav", "txt": "为中国的智能制造产业做出贡献"} -{"key": "BAC009S0901W0304", "wav": "./aishell/wav/test/S0901/BAC009S0901W0304.wav", "txt": "由于该项目尚处于保密期"} -{"key": "BAC009S0901W0305", "wav": "./aishell/wav/test/S0901/BAC009S0901W0305.wav", "txt": "赵伟国并未透露更多内容"} -{"key": "BAC009S0901W0306", "wav": "./aishell/wav/test/S0901/BAC009S0901W0306.wav", "txt": "沈阳机床董事长关锡友认为"} -{"key": "BAC009S0901W0307", "wav": "./aishell/wav/test/S0901/BAC009S0901W0307.wav", "txt": "中国企业与世界企业同在同一起跑线上"} -{"key": "BAC009S0901W0308", "wav": "./aishell/wav/test/S0901/BAC009S0901W0308.wav", "txt": "中国的中高端嵌入式芯片全部从德国日本进口"} -{"key": "BAC009S0901W0309", "wav": "./aishell/wav/test/S0901/BAC009S0901W0309.wav", "txt": "德国制造业最核心的技术就是嵌入式系统"} -{"key": "BAC009S0901W0310", "wav": "./aishell/wav/test/S0901/BAC009S0901W0310.wav", "txt": "在体积能耗上存在一定的不足"} -{"key": "BAC009S0901W0311", "wav": "./aishell/wav/test/S0901/BAC009S0901W0311.wav", "txt": "紫光与沈阳机床可以在此布局"} -{"key": "BAC009S0901W0312", "wav": "./aishell/wav/test/S0901/BAC009S0901W0312.wav", "txt": "三十一九二零一五"} -{"key": "BAC009S0901W0313", "wav": "./aishell/wav/test/S0901/BAC009S0901W0313.wav", "txt": "紫光集团系清华控股旗下最主要的资产"} -{"key": "BAC009S0901W0314", "wav": "./aishell/wav/test/S0901/BAC009S0901W0314.wav", "txt": "二零一三年二零一四年"} -{"key": "BAC009S0901W0315", "wav": "./aishell/wav/test/S0901/BAC009S0901W0315.wav", "txt": "并一举成为中国最大全球第三大通讯芯片设计公司"} -{"key": "BAC009S0901W0316", "wav": "./aishell/wav/test/S0901/BAC009S0901W0316.wav", "txt": "紫光集团还计划布局物联网网络设备芯片"} -{"key": "BAC009S0901W0317", "wav": "./aishell/wav/test/S0901/BAC009S0901W0317.wav", "txt": "二零一五年紫光集团预计收入约四百亿元"} -{"key": "BAC009S0901W0318", "wav": "./aishell/wav/test/S0901/BAC009S0901W0318.wav", "txt": "资产规模将达到六十五亿八百亿元"} -{"key": "BAC009S0901W0319", "wav": "./aishell/wav/test/S0901/BAC009S0901W0319.wav", "txt": "中国机床龙头企业沈阳机床在北京举行战略发布会"} -{"key": "BAC009S0901W0320", "wav": "./aishell/wav/test/S0901/BAC009S0901W0320.wav", "txt": "紫光股份云计算股收涨停搜狐科技"} -{"key": "BAC009S0901W0321", "wav": "./aishell/wav/test/S0901/BAC009S0901W0321.wav", "txt": "大盘股仍是毫无作为"} -{"key": "BAC009S0901W0322", "wav": "./aishell/wav/test/S0901/BAC009S0901W0322.wav", "txt": "题材股继续扮演黑马角色"} -{"key": "BAC009S0901W0323", "wav": "./aishell/wav/test/S0901/BAC009S0901W0323.wav", "txt": "紫光股份在公告扩展云计算市场后"} -{"key": "BAC009S0901W0324", "wav": "./aishell/wav/test/S0901/BAC009S0901W0324.wav", "txt": "盘中有二千六百八十六万元资金净流入"} -{"key": "BAC009S0901W0325", "wav": "./aishell/wav/test/S0901/BAC009S0901W0325.wav", "txt": "主营信息电子和环保"} -{"key": "BAC009S0901W0326", "wav": "./aishell/wav/test/S0901/BAC009S0901W0326.wav", "txt": "公司昨日发布公告称"} -{"key": "BAC009S0901W0327", "wav": "./aishell/wav/test/S0901/BAC009S0901W0327.wav", "txt": "各方本着互惠互利优势互补合作共赢的原则"} -{"key": "BAC009S0901W0328", "wav": "./aishell/wav/test/S0901/BAC009S0901W0328.wav", "txt": 
"通过搭建具有领先技术水平的混合云解决方案平台"} -{"key": "BAC009S0901W0329", "wav": "./aishell/wav/test/S0901/BAC009S0901W0329.wav", "txt": "共同拓展国内云计算市场"} -{"key": "BAC009S0901W0330", "wav": "./aishell/wav/test/S0901/BAC009S0901W0330.wav", "txt": "紫光股份将与世纪互联共同出资组建合资公司"} -{"key": "BAC009S0901W0332", "wav": "./aishell/wav/test/S0901/BAC009S0901W0332.wav", "txt": "搭建混合云解决方案平台"} -{"key": "BAC009S0901W0333", "wav": "./aishell/wav/test/S0901/BAC009S0901W0333.wav", "txt": "满足政府和企业级客户云计算下的定制化需求"} -{"key": "BAC009S0901W0334", "wav": "./aishell/wav/test/S0901/BAC009S0901W0334.wav", "txt": "推动公司云服务战略的实施"} -{"key": "BAC009S0901W0335", "wav": "./aishell/wav/test/S0901/BAC009S0901W0335.wav", "txt": "紫光股份拟定增募资二百二十五亿元"} -{"key": "BAC009S0901W0336", "wav": "./aishell/wav/test/S0901/BAC009S0901W0336.wav", "txt": "公司继续推进云服务战略"} -{"key": "BAC009S0901W0337", "wav": "./aishell/wav/test/S0901/BAC009S0901W0337.wav", "txt": "紫光集团和员工持股计划参与非公开增发"} -{"key": "BAC009S0901W0338", "wav": "./aishell/wav/test/S0901/BAC009S0901W0338.wav", "txt": "医生此次将对惠若琪的心脏进行微创手术"} -{"key": "BAC009S0901W0339", "wav": "./aishell/wav/test/S0901/BAC009S0901W0339.wav", "txt": "彻底解决目前存在的隐患"} -{"key": "BAC009S0901W0340", "wav": "./aishell/wav/test/S0901/BAC009S0901W0340.wav", "txt": "惠若琪将在微创手术后回到南京调养"} -{"key": "BAC009S0901W0341", "wav": "./aishell/wav/test/S0901/BAC009S0901W0341.wav", "txt": "张蓉芳主持排管中心"} -{"key": "BAC009S0901W0342", "wav": "./aishell/wav/test/S0901/BAC009S0901W0342.wav", "txt": "成就了中国女排五连冠伟业"} -{"key": "BAC009S0901W0343", "wav": "./aishell/wav/test/S0901/BAC009S0901W0343.wav", "txt": "北京时间九月十七日"} -{"key": "BAC009S0901W0344", "wav": "./aishell/wav/test/S0901/BAC009S0901W0344.wav", "txt": "已经确定本赛季不会参加任何的比赛"} -{"key": "BAC009S0901W0345", "wav": "./aishell/wav/test/S0901/BAC009S0901W0345.wav", "txt": "明年春天普鲁申科将再次进行手术"} -{"key": "BAC009S0901W0346", "wav": "./aishell/wav/test/S0901/BAC009S0901W0346.wav", "txt": "作为有史以来天赋最高的花样滑冰运动员之一"} -{"key": "BAC009S0901W0347", "wav": "./aishell/wav/test/S0901/BAC009S0901W0347.wav", "txt": "普鲁申科的职业生涯却堪称多灾多难"} -{"key": "BAC009S0901W0348", "wav": "./aishell/wav/test/S0901/BAC009S0901W0348.wav", "txt": "他屡次受到伤病的困扰"} -{"key": "BAC009S0901W0349", "wav": "./aishell/wav/test/S0901/BAC009S0901W0349.wav", "txt": "背伤更是常年阻碍着他的发挥"} -{"key": "BAC009S0901W0350", "wav": "./aishell/wav/test/S0901/BAC009S0901W0350.wav", "txt": "去年的索契冬奥会上"} -{"key": "BAC009S0901W0351", "wav": "./aishell/wav/test/S0901/BAC009S0901W0351.wav", "txt": "赛后有媒体发布了他几乎扭曲的背部肌肉的照片"} -{"key": "BAC009S0901W0352", "wav": "./aishell/wav/test/S0901/BAC009S0901W0352.wav", "txt": "照片中看到普鲁申科的背部肌肉伤痕累累"} -{"key": "BAC009S0901W0353", "wav": "./aishell/wav/test/S0901/BAC009S0901W0353.wav", "txt": "全都是手术缝合的痕迹"} -{"key": "BAC009S0901W0354", "wav": "./aishell/wav/test/S0901/BAC009S0901W0354.wav", "txt": "他不仅动过多次肌肉手术"} -{"key": "BAC009S0901W0355", "wav": "./aishell/wav/test/S0901/BAC009S0901W0355.wav", "txt": "连身上的痛觉神经都进行了更换"} -{"key": "BAC009S0901W0356", "wav": "./aishell/wav/test/S0901/BAC009S0901W0356.wav", "txt": "普鲁申科出人意料地宣布复出"} -{"key": "BAC009S0901W0357", "wav": "./aishell/wav/test/S0901/BAC009S0901W0357.wav", "txt": "表示愿意再征战一个冬奥会周期"} -{"key": "BAC009S0901W0358", "wav": "./aishell/wav/test/S0901/BAC009S0901W0358.wav", "txt": "但就在外界期待着冰王子的卷土重来时"} -{"key": "BAC009S0901W0359", "wav": "./aishell/wav/test/S0901/BAC009S0901W0359.wav", "txt": "本赛季的各项赛事参赛名单上却都没有见到他的身影"} -{"key": "BAC009S0901W0360", "wav": "./aishell/wav/test/S0901/BAC009S0901W0360.wav", "txt": "据外媒最新的爆料显示"} -{"key": "BAC009S0901W0361", "wav": "./aishell/wav/test/S0901/BAC009S0901W0361.wav", "txt": 
"普鲁申科被诊断患上了一种新的脊椎疾病"} -{"key": "BAC009S0901W0362", "wav": "./aishell/wav/test/S0901/BAC009S0901W0362.wav", "txt": "这也让他必须在明年春天进行一次小手术来加以治疗"} -{"key": "BAC009S0901W0363", "wav": "./aishell/wav/test/S0901/BAC009S0901W0363.wav", "txt": "普鲁申科将错过整个二零一五二零一六季一六赛季的比赛"} -{"key": "BAC009S0901W0364", "wav": "./aishell/wav/test/S0901/BAC009S0901W0364.wav", "txt": "普鲁申科丝毫没有隐退的打算"} -{"key": "BAC009S0901W0365", "wav": "./aishell/wav/test/S0901/BAC009S0901W0365.wav", "txt": "他还在积极地为二零一八年韩国平昌冬奥会进行着准备"} -{"key": "BAC009S0901W0367", "wav": "./aishell/wav/test/S0901/BAC009S0901W0367.wav", "txt": "因为卷入兴奋剂丑闻"} -{"key": "BAC009S0901W0368", "wav": "./aishell/wav/test/S0901/BAC009S0901W0368.wav", "txt": "朴泰桓无法加入海外先进的训练团队进行训练"} -{"key": "BAC009S0901W0369", "wav": "./aishell/wav/test/S0901/BAC009S0901W0369.wav", "txt": "转投到昔日恩师卢民相任教练的游泳俱乐部训练"} -{"key": "BAC009S0901W0370", "wav": "./aishell/wav/test/S0901/BAC009S0901W0370.wav", "txt": "但遭到了韩国国内舆论的非议"} -{"key": "BAC009S0901W0372", "wav": "./aishell/wav/test/S0901/BAC009S0901W0372.wav", "txt": "到今年十二月为止将在东京的法政大学进行训练"} -{"key": "BAC009S0901W0373", "wav": "./aishell/wav/test/S0901/BAC009S0901W0373.wav", "txt": "备战明年的里约奥运会"} -{"key": "BAC009S0901W0374", "wav": "./aishell/wav/test/S0901/BAC009S0901W0374.wav", "txt": "但法政大学很快公开辟谣"} -{"key": "BAC009S0901W0375", "wav": "./aishell/wav/test/S0901/BAC009S0901W0375.wav", "txt": "韩国媒体报道称朴泰桓确实人在日本"} -{"key": "BAC009S0901W0376", "wav": "./aishell/wav/test/S0901/BAC009S0901W0376.wav", "txt": "他状告首尔某美容医院的官司将在十一月迎来终审"} -{"key": "BAC009S0901W0377", "wav": "./aishell/wav/test/S0901/BAC009S0901W0377.wav", "txt": "判决结果成为他能否参加里约奥运的变数"} -{"key": "BAC009S0901W0378", "wav": "./aishell/wav/test/S0901/BAC009S0901W0378.wav", "txt": "据韩国体育首尔的最新消息"} -{"key": "BAC009S0901W0379", "wav": "./aishell/wav/test/S0901/BAC009S0901W0379.wav", "txt": "二十一日抵达日本的朴泰桓目前确实在东京"} -{"key": "BAC009S0901W0380", "wav": "./aishell/wav/test/S0901/BAC009S0901W0380.wav", "txt": "计划在那里进行三个月的封闭训练"} -{"key": "BAC009S0901W0381", "wav": "./aishell/wav/test/S0901/BAC009S0901W0381.wav", "txt": "备战明年的里约奥运"} -{"key": "BAC009S0901W0382", "wav": "./aishell/wav/test/S0901/BAC009S0901W0382.wav", "txt": "但他的具体行踪成为谜团"} -{"key": "BAC009S0901W0383", "wav": "./aishell/wav/test/S0901/BAC009S0901W0383.wav", "txt": "能否在里约奥运东山再起"} -{"key": "BAC009S0901W0384", "wav": "./aishell/wav/test/S0901/BAC009S0901W0384.wav", "txt": "不仅要看他的竞技状态恢复程度"} -{"key": "BAC009S0901W0385", "wav": "./aishell/wav/test/S0901/BAC009S0901W0385.wav", "txt": "首先要跨过大韩体育会这一关"} -{"key": "BAC009S0901W0386", "wav": "./aishell/wav/test/S0901/BAC009S0901W0386.wav", "txt": "朴泰桓的禁期禁赛期将在明年三月期满"} -{"key": "BAC009S0901W0387", "wav": "./aishell/wav/test/S0901/BAC009S0901W0387.wav", "txt": "因为服用禁药被停赛的选手在禁赛期满起的三年内"} -{"key": "BAC009S0901W0388", "wav": "./aishell/wav/test/S0901/BAC009S0901W0388.wav", "txt": "都无法代表韩国参加国际比赛"} -{"key": "BAC009S0901W0389", "wav": "./aishell/wav/test/S0901/BAC009S0901W0389.wav", "txt": "义不容辞地想拯救运动生涯在绝境中的朴泰桓"} -{"key": "BAC009S0901W0390", "wav": "./aishell/wav/test/S0901/BAC009S0901W0390.wav", "txt": "废除这个第五条第六项"} -{"key": "BAC009S0901W0391", "wav": "./aishell/wav/test/S0901/BAC009S0901W0391.wav", "txt": "为他参加里约奥运扫清最后的障碍"} -{"key": "BAC009S0901W0392", "wav": "./aishell/wav/test/S0901/BAC009S0901W0392.wav", "txt": "体育首尔的报道分析"} -{"key": "BAC009S0901W0393", "wav": "./aishell/wav/test/S0901/BAC009S0901W0393.wav", "txt": "大韩体育会这一计划的顺利实施"} -{"key": "BAC009S0901W0394", "wav": "./aishell/wav/test/S0901/BAC009S0901W0394.wav", "txt": "最终判决结果将在十一月出炉"} -{"key": "BAC009S0901W0395", "wav": 
"./aishell/wav/test/S0901/BAC009S0901W0395.wav", "txt": "如果该医院罪名被判成立的话"} -{"key": "BAC009S0901W0396", "wav": "./aishell/wav/test/S0901/BAC009S0901W0396.wav", "txt": "这样一来可以获得韩国舆论的同情和理解"} -{"key": "BAC009S0901W0397", "wav": "./aishell/wav/test/S0901/BAC009S0901W0397.wav", "txt": "民众自然会支持大韩体育会给他一个人修改规则"} -{"key": "BAC009S0901W0398", "wav": "./aishell/wav/test/S0901/BAC009S0901W0398.wav", "txt": "如果美容院的医疗过失罪名不成立"} -{"key": "BAC009S0901W0399", "wav": "./aishell/wav/test/S0901/BAC009S0901W0399.wav", "txt": "朴泰桓会面临更加严峻的舆论环境"} -{"key": "BAC009S0901W0400", "wav": "./aishell/wav/test/S0901/BAC009S0901W0400.wav", "txt": "这场官司的前五次公判"} -{"key": "BAC009S0901W0401", "wav": "./aishell/wav/test/S0901/BAC009S0901W0401.wav", "txt": "朴泰桓和美容院都互不相让"} -{"key": "BAC009S0901W0402", "wav": "./aishell/wav/test/S0901/BAC009S0901W0402.wav", "txt": "一度让不少粉丝心碎不已"} -{"key": "BAC009S0901W0403", "wav": "./aishell/wav/test/S0901/BAC009S0901W0403.wav", "txt": "退役之后的高桥大辅并未远离公众视线"} -{"key": "BAC009S0901W0404", "wav": "./aishell/wav/test/S0901/BAC009S0901W0404.wav", "txt": "瓦尔兹将精心演绎这个著名角色"} -{"key": "BAC009S0901W0405", "wav": "./aishell/wav/test/S0901/BAC009S0901W0405.wav", "txt": "其首脑恩斯特布鲁菲尔是邦德的最终敌人"} -{"key": "BAC009S0901W0406", "wav": "./aishell/wav/test/S0901/BAC009S0901W0406.wav", "txt": "这个角色拥有一只白色的波斯猫作为自己的宠物"} -{"key": "BAC009S0901W0407", "wav": "./aishell/wav/test/S0901/BAC009S0901W0407.wav", "txt": "值得一提的是"} -{"key": "BAC009S0901W0408", "wav": "./aishell/wav/test/S0901/BAC009S0901W0408.wav", "txt": "搞怪调皮吐舌卖萌娱乐频道"} -{"key": "BAC009S0901W0409", "wav": "./aishell/wav/test/S0901/BAC009S0901W0409.wav", "txt": "搜狐娱乐讯八月九日晚"} -{"key": "BAC009S0901W0410", "wav": "./aishell/wav/test/S0901/BAC009S0901W0410.wav", "txt": "陈冠希在微博晒出一段小视频"} -{"key": "BAC009S0901W0411", "wav": "./aishell/wav/test/S0901/BAC009S0901W0411.wav", "txt": "陈冠希开始一直把镜头对着帽子上的皮卡丘"} -{"key": "BAC009S0901W0412", "wav": "./aishell/wav/test/S0901/BAC009S0901W0412.wav", "txt": "后来突然冒出头来"} -{"key": "BAC009S0901W0413", "wav": "./aishell/wav/test/S0901/BAC009S0901W0413.wav", "txt": "对着镜头吐舌卖萌"} -{"key": "BAC009S0901W0414", "wav": "./aishell/wav/test/S0901/BAC009S0901W0414.wav", "txt": "搜狐娱乐讯九月二日凌晨"} -{"key": "BAC009S0901W0415", "wav": "./aishell/wav/test/S0901/BAC009S0901W0415.wav", "txt": "陈冠希在微博晒出一张自拍照"} -{"key": "BAC009S0901W0416", "wav": "./aishell/wav/test/S0901/BAC009S0901W0416.wav", "txt": "陈冠希穿休闲短袖配宽松裤子"} -{"key": "BAC009S0901W0417", "wav": "./aishell/wav/test/S0901/BAC009S0901W0417.wav", "txt": "网友纷纷留言越来越像潮流教父了"} -{"key": "BAC009S0901W0418", "wav": "./aishell/wav/test/S0901/BAC009S0901W0418.wav", "txt": "这裙子娇艳"} -{"key": "BAC009S0901W0419", "wav": "./aishell/wav/test/S0901/BAC009S0901W0419.wav", "txt": "帅出新高度"} -{"key": "BAC009S0901W0420", "wav": "./aishell/wav/test/S0901/BAC009S0901W0420.wav", "txt": "搜狐娱乐讯据台湾媒体报道"} -{"key": "BAC009S0901W0421", "wav": "./aishell/wav/test/S0901/BAC009S0901W0421.wav", "txt": "多次想复合却无下文"} -{"key": "BAC009S0901W0422", "wav": "./aishell/wav/test/S0901/BAC009S0901W0422.wav", "txt": "感情事备受关注"} -{"key": "BAC009S0901W0423", "wav": "./aishell/wav/test/S0901/BAC009S0901W0423.wav", "txt": "前天他在脸书晒出自拍照"} -{"key": "BAC009S0901W0424", "wav": "./aishell/wav/test/S0901/BAC009S0901W0424.wav", "txt": "满脸黑斑与大眼袋"} -{"key": "BAC009S0901W0425", "wav": "./aishell/wav/test/S0901/BAC009S0901W0425.wav", "txt": "老残样再度乍现"} -{"key": "BAC009S0901W0426", "wav": "./aishell/wav/test/S0901/BAC009S0901W0426.wav", "txt": "搜狐娱乐讯据台湾媒体报道"} -{"key": "BAC009S0901W0427", "wav": "./aishell/wav/test/S0901/BAC009S0901W0427.wav", "txt": "事后解释是生活观不同才分开"} -{"key": 
"BAC009S0901W0428", "wav": "./aishell/wav/test/S0901/BAC009S0901W0428.wav", "txt": "但隔年三月却又分享一张女方坐他大腿的照片"} -{"key": "BAC009S0901W0429", "wav": "./aishell/wav/test/S0901/BAC009S0901W0429.wav", "txt": "一度让外界以为两人复合"} -{"key": "BAC009S0901W0430", "wav": "./aishell/wav/test/S0901/BAC009S0901W0430.wav", "txt": "但现在又有别的女孩坐上他的大腿"} -{"key": "BAC009S0901W0431", "wav": "./aishell/wav/test/S0901/BAC009S0901W0431.wav", "txt": "校长邱勇上任后首次参加学生毕业典礼并演讲"} -{"key": "BAC009S0901W0432", "wav": "./aishell/wav/test/S0901/BAC009S0901W0432.wav", "txt": "追求使命需要有强大的定力昨日上午"} -{"key": "BAC009S0901W0433", "wav": "./aishell/wav/test/S0901/BAC009S0901W0433.wav", "txt": "他叮嘱五千馀名毕业生"} -{"key": "BAC009S0901W0434", "wav": "./aishell/wav/test/S0901/BAC009S0901W0434.wav", "txt": "要有清晰的目标人文情怀和做到执着坚守"} -{"key": "BAC009S0901W0435", "wav": "./aishell/wav/test/S0901/BAC009S0901W0435.wav", "txt": "清华法学院教授司法改革应限制两长权力"} -{"key": "BAC009S0901W0436", "wav": "./aishell/wav/test/S0901/BAC009S0901W0436.wav", "txt": "本报讯记者汪红日前"} -{"key": "BAC009S0901W0437", "wav": "./aishell/wav/test/S0901/BAC009S0901W0437.wav", "txt": "对允许其亲自过问的案件提出严格限定标准"} -{"key": "BAC009S0901W0438", "wav": "./aishell/wav/test/S0901/BAC009S0901W0438.wav", "txt": "清华辟谣保安迫降无人机为人为诋毁"} -{"key": "BAC009S0901W0439", "wav": "./aishell/wav/test/S0901/BAC009S0901W0439.wav", "txt": "该事件引发广泛关注"} -{"key": "BAC009S0901W0440", "wav": "./aishell/wav/test/S0901/BAC009S0901W0440.wav", "txt": "清华大学通过调取监控录线发现"} -{"key": "BAC009S0901W0441", "wav": "./aishell/wav/test/S0901/BAC009S0901W0441.wav", "txt": "该保安为附近大厦保安"} -{"key": "BAC009S0901W0442", "wav": "./aishell/wav/test/S0901/BAC009S0901W0442.wav", "txt": "目前该保安承认有人花二百元雇他进行拍照"} -{"key": "BAC009S0901W0443", "wav": "./aishell/wav/test/S0901/BAC009S0901W0443.wav", "txt": "称当时几位学生模样的人让他帮忙配合拍照用来宣传"} -{"key": "BAC009S0901W0444", "wav": "./aishell/wav/test/S0901/BAC009S0901W0444.wav", "txt": "抓着男生的动作为摆拍"} -{"key": "BAC009S0901W0445", "wav": "./aishell/wav/test/S0901/BAC009S0901W0445.wav", "txt": "摔毁无人机一事为杜撰"} -{"key": "BAC009S0901W0446", "wav": "./aishell/wav/test/S0901/BAC009S0901W0446.wav", "txt": "保安得知自己被骗后表示我真的很恨他们"} -{"key": "BAC009S0901W0447", "wav": "./aishell/wav/test/S0901/BAC009S0901W0447.wav", "txt": "记者联系发微博男子"} -{"key": "BAC009S0901W0448", "wav": "./aishell/wav/test/S0901/BAC009S0901W0448.wav", "txt": "他表示我没想到弄这么大"} -{"key": "BAC009S0901W0449", "wav": "./aishell/wav/test/S0901/BAC009S0901W0449.wav", "txt": "目前该男子已将微博内容全部删除"} -{"key": "BAC009S0901W0450", "wav": "./aishell/wav/test/S0901/BAC009S0901W0450.wav", "txt": "清华附小昨迎百年校庆校长诠释成志教育理念"} -{"key": "BAC009S0901W0451", "wav": "./aishell/wav/test/S0901/BAC009S0901W0451.wav", "txt": "清华附小校长窦桂海诠释成志教育理念"} -{"key": "BAC009S0901W0452", "wav": "./aishell/wav/test/S0901/BAC009S0901W0452.wav", "txt": "清晨飘来辣眼白雾济南八名村民中毒入院"} -{"key": "BAC009S0901W0453", "wav": "./aishell/wav/test/S0901/BAC009S0901W0453.wav", "txt": "赵女士的公公躺在病床上"} -{"key": "BAC009S0901W0454", "wav": "./aishell/wav/test/S0901/BAC009S0901W0454.wav", "txt": "目前神志已恢复清醒"} -{"key": "BAC009S0901W0455", "wav": "./aishell/wav/test/S0901/BAC009S0901W0455.wav", "txt": "记者李焜染摄十三日早晨"} -{"key": "BAC009S0901W0456", "wav": "./aishell/wav/test/S0901/BAC009S0901W0456.wav", "txt": "历城区港沟镇神武村飘来多股白色不明气体"} -{"key": "BAC009S0901W0457", "wav": "./aishell/wav/test/S0901/BAC009S0901W0457.wav", "txt": "八位村民先后出现中毒症状"} -{"key": "BAC009S0901W0458", "wav": "./aishell/wav/test/S0901/BAC009S0901W0458.wav", "txt": "目前八人均已脱离生命危险"} -{"key": "BAC009S0901W0459", "wav": "./aishell/wav/test/S0901/BAC009S0901W0459.wav", "txt": "神秘气体成分及来源正在进一步核实"} -{"key": 
"BAC009S0901W0460", "wav": "./aishell/wav/test/S0901/BAC009S0901W0460.wav", "txt": "清洁工开宝马上下班真实身份为在逃诈骗犯"} -{"key": "BAC009S0901W0461", "wav": "./aishell/wav/test/S0901/BAC009S0901W0461.wav", "txt": "彭某下班后准备开车离开"} -{"key": "BAC009S0901W0462", "wav": "./aishell/wav/test/S0901/BAC009S0901W0462.wav", "txt": "清洁工开宝马上班被称励志故事经查系逃犯"} -{"key": "BAC009S0901W0463", "wav": "./aishell/wav/test/S0901/BAC009S0901W0463.wav", "txt": "十四日开宝马来上班重庆晨报记者罗伟雷罗伟雷键摄"} -{"key": "BAC009S0901W0464", "wav": "./aishell/wav/test/S0901/BAC009S0901W0464.wav", "txt": "清洁工被电梯咬断腿曾反映这样擦电源危险"} -{"key": "BAC009S0901W0465", "wav": "./aishell/wav/test/S0901/BAC009S0901W0465.wav", "txt": "制图黄欣晨报记者佟继萍王亦菲实习生张诗欢"} -{"key": "BAC009S0901W0466", "wav": "./aishell/wav/test/S0901/BAC009S0901W0466.wav", "txt": "网络时代信息的存在有了新方式"} -{"key": "BAC009S0901W0467", "wav": "./aishell/wav/test/S0901/BAC009S0901W0467.wav", "txt": "云盘就是一种直接把信息存在网络空间里的存储工具"} -{"key": "BAC009S0901W0468", "wav": "./aishell/wav/test/S0901/BAC009S0901W0468.wav", "txt": "和传统硬盘不同的是"} -{"key": "BAC009S0901W0469", "wav": "./aishell/wav/test/S0901/BAC009S0901W0469.wav", "txt": "用户不需要把它带在身上"} -{"key": "BAC009S0901W0470", "wav": "./aishell/wav/test/S0901/BAC009S0901W0470.wav", "txt": "只需要一个账户名和密码"} -{"key": "BAC009S0901W0471", "wav": "./aishell/wav/test/S0901/BAC009S0901W0471.wav", "txt": "就可以在网络环境下"} -{"key": "BAC009S0901W0472", "wav": "./aishell/wav/test/S0901/BAC009S0901W0472.wav", "txt": "上传读取和下载里面的信息"} -{"key": "BAC009S0901W0473", "wav": "./aishell/wav/test/S0901/BAC009S0901W0473.wav", "txt": "本来云盘的出现方便了人们的生活和工作"} -{"key": "BAC009S0901W0474", "wav": "./aishell/wav/test/S0901/BAC009S0901W0474.wav", "txt": "把云盘变成了一个淫秽色情信息的隐蔽聚散地"} -{"key": "BAC009S0901W0475", "wav": "./aishell/wav/test/S0901/BAC009S0901W0475.wav", "txt": "清风正在吹散互联网雾霾"} -{"key": "BAC009S0901W0476", "wav": "./aishell/wav/test/S0901/BAC009S0901W0476.wav", "txt": "四年前的一幕仍没从夏英俊的记忆中抹去"} -{"key": "BAC009S0901W0477", "wav": "./aishell/wav/test/S0901/BAC009S0901W0477.wav", "txt": "渐冻男孩驾驶电动轮椅上班医生曾诊断活不过十八岁"} -{"key": "BAC009S0901W0478", "wav": "./aishell/wav/test/S0901/BAC009S0901W0478.wav", "txt": "蔡兴桥在妈妈的帮助下靠墙练习站立"} -{"key": "BAC009S0901W0479", "wav": "./aishell/wav/test/S0901/BAC009S0901W0479.wav", "txt": "渔民南海捞出外国间谍潜航器搜集情报或已传回"} -{"key": "BAC009S0901W0480", "wav": "./aishell/wav/test/S0901/BAC009S0901W0480.wav", "txt": "在许多人眼里这都是小说和电影里才会出现的情节"} -{"key": "BAC009S0901W0481", "wav": "./aishell/wav/test/S0901/BAC009S0901W0481.wav", "txt": "可实际上维护国家边海防安全保护国家利益不受侵犯"} -{"key": "BAC009S0901W0482", "wav": "./aishell/wav/test/S0901/BAC009S0901W0482.wav", "txt": "这样的斗争和考验有时就发生在我们身边"} -{"key": "BAC009S0901W0483", "wav": "./aishell/wav/test/S0901/BAC009S0901W0483.wav", "txt": "南海的渔民在捕鱼的时候就曾捞出过一个奇怪的东西"} -{"key": "BAC009S0901W0484", "wav": "./aishell/wav/test/S0901/BAC009S0901W0484.wav", "txt": "由此引出一起重大安全案件"} -{"key": "BAC009S0901W0485", "wav": "./aishell/wav/test/S0901/BAC009S0901W0485.wav", "txt": "渔民在南海打捞起可疑电子装置确系无人潜航器"} -{"key": "BAC009S0901W0486", "wav": "./aishell/wav/test/S0901/BAC009S0901W0486.wav", "txt": "经国家安全部门会同有关技术权威部门鉴定"} -{"key": "BAC009S0901W0487", "wav": "./aishell/wav/test/S0901/BAC009S0901W0487.wav", "txt": "它既能搜集我国重要海域内各类环境数据"} -{"key": "BAC009S0901W0488", "wav": "./aishell/wav/test/S0901/BAC009S0901W0488.wav", "txt": "又能探测获取我海军舰队活动动向"} -{"key": "BAC009S0901W0489", "wav": "./aishell/wav/test/S0901/BAC009S0901W0489.wav", "txt": "实现近距离侦查和情报收集任务"} -{"key": "BAC009S0901W0490", "wav": "./aishell/wav/test/S0901/BAC009S0901W0490.wav", "txt": "渔民投诉遭离奇执法被派出所讨价还价式罚款"} -{"key": "BAC009S0901W0491", "wav": 
"./aishell/wav/test/S0901/BAC009S0901W0491.wav", "txt": "海南临高籍多位渔民向中新网记者反应称"} -{"key": "BAC009S0901W0492", "wav": "./aishell/wav/test/S0901/BAC009S0901W0492.wav", "txt": "二十二日在文昌市清澜港边防派出所执法检查时"} -{"key": "BAC009S0901W0493", "wav": "./aishell/wav/test/S0901/BAC009S0901W0493.wav", "txt": "渔民缴纳罚款后在摁手印时"} -{"key": "BAC009S0901W0494", "wav": "./aishell/wav/test/S0901/BAC009S0901W0494.wav", "txt": "被民警用针扎破手指"} -{"key": "BAC009S0901W0495", "wav": "./aishell/wav/test/S0901/BAC009S0901W0495.wav", "txt": "让他们很担心会不会相互传染疾病"} -{"key": "BAC009S0902W0121", "wav": "./aishell/wav/test/S0902/BAC009S0902W0121.wav", "txt": "所以审批加上放款的时间最快也要在七个半月左右"} -{"key": "BAC009S0902W0122", "wav": "./aishell/wav/test/S0902/BAC009S0902W0122.wav", "txt": "作为取之于民用之于民的住房公积金"} -{"key": "BAC009S0902W0123", "wav": "./aishell/wav/test/S0902/BAC009S0902W0123.wav", "txt": "缴存者还可以在租房装修离退休时提取"} -{"key": "BAC009S0902W0124", "wav": "./aishell/wav/test/S0902/BAC009S0902W0124.wav", "txt": "因此操作环节的快捷性与便捷性非常重要"} -{"key": "BAC009S0902W0125", "wav": "./aishell/wav/test/S0902/BAC009S0902W0125.wav", "txt": "后续要加大公积金贷款的便利性"} -{"key": "BAC009S0902W0126", "wav": "./aishell/wav/test/S0902/BAC009S0902W0126.wav", "txt": "鼓励购房者积极缴纳公积金"} -{"key": "BAC009S0902W0127", "wav": "./aishell/wav/test/S0902/BAC009S0902W0127.wav", "txt": "进而选择此类方式购房"} -{"key": "BAC009S0902W0128", "wav": "./aishell/wav/test/S0902/BAC009S0902W0128.wav", "txt": "另外要处理公积金异地使用的问题"} -{"key": "BAC009S0902W0129", "wav": "./aishell/wav/test/S0902/BAC009S0902W0129.wav", "txt": "这对于目前一线城市来说很紧要"} -{"key": "BAC009S0902W0130", "wav": "./aishell/wav/test/S0902/BAC009S0902W0130.wav", "txt": "很多人受限购政策的影响"} -{"key": "BAC009S0902W0131", "wav": "./aishell/wav/test/S0902/BAC009S0902W0131.wav", "txt": "难以在周边城市用公积金购房"} -{"key": "BAC009S0902W0132", "wav": "./aishell/wav/test/S0902/BAC009S0902W0132.wav", "txt": "导致公积金资源闲置的问题出现"} -{"key": "BAC009S0902W0133", "wav": "./aishell/wav/test/S0902/BAC009S0902W0133.wav", "txt": "美丽北京大型绿色公益品牌项目"} -{"key": "BAC009S0902W0134", "wav": "./aishell/wav/test/S0902/BAC009S0902W0134.wav", "txt": "随着广州住房公积金贷款政策的调整实施"} -{"key": "BAC009S0902W0135", "wav": "./aishell/wav/test/S0902/BAC009S0902W0135.wav", "txt": "政策内容主要涉及购房"} -{"key": "BAC009S0902W0136", "wav": "./aishell/wav/test/S0902/BAC009S0902W0136.wav", "txt": "随着广州住房公积金贷款政策的调整实施"} -{"key": "BAC009S0902W0137", "wav": "./aishell/wav/test/S0902/BAC009S0902W0137.wav", "txt": "公积金贷款最高额度亦不同程度上调"} -{"key": "BAC009S0902W0138", "wav": "./aishell/wav/test/S0902/BAC009S0902W0138.wav", "txt": "住房公积金贷款因其利率较低的优势"} -{"key": "BAC009S0902W0139", "wav": "./aishell/wav/test/S0902/BAC009S0902W0139.wav", "txt": "一直以来广受购房者青睐"} -{"key": "BAC009S0902W0140", "wav": "./aishell/wav/test/S0902/BAC009S0902W0140.wav", "txt": "本轮本轮住房公积金房贷政策调整"} -{"key": "BAC009S0902W0141", "wav": "./aishell/wav/test/S0902/BAC009S0902W0141.wav", "txt": "进一步加速了消费者的入市节奏"} -{"key": "BAC009S0902W0142", "wav": "./aishell/wav/test/S0902/BAC009S0902W0142.wav", "txt": "广州调整住房公积金个人住房贷款政策"} -{"key": "BAC009S0902W0143", "wav": "./aishell/wav/test/S0902/BAC009S0902W0143.wav", "txt": "同时对申请公积金贷款的缴纳时限调整为七个月"} -{"key": "BAC009S0902W0144", "wav": "./aishell/wav/test/S0902/BAC009S0902W0144.wav", "txt": "据广州日报昨天报道"} -{"key": "BAC009S0902W0145", "wav": "./aishell/wav/test/S0902/BAC009S0902W0145.wav", "txt": "公积金贷款首付比例降低的消息令购房者喜出望外"} -{"key": "BAC009S0902W0146", "wav": "./aishell/wav/test/S0902/BAC009S0902W0146.wav", "txt": "其中刚需要买入市积极性明显提高"} -{"key": "BAC009S0902W0147", "wav": "./aishell/wav/test/S0902/BAC009S0902W0147.wav", "txt": "据伟嘉安捷提供的数据显示"} -{"key": "BAC009S0902W0148", "wav": 
"./aishell/wav/test/S0902/BAC009S0902W0148.wav", "txt": "北京公积金贷款首付比例松绑一周后"} -{"key": "BAC009S0902W0149", "wav": "./aishell/wav/test/S0902/BAC009S0902W0149.wav", "txt": "公积金贷款及组合贷咨询量明显上涨"} -{"key": "BAC009S0902W0150", "wav": "./aishell/wav/test/S0902/BAC009S0902W0150.wav", "txt": "尤其组合贷的咨询量较上月月初一上涨百分之七左右"} -{"key": "BAC009S0902W0151", "wav": "./aishell/wav/test/S0902/BAC009S0902W0151.wav", "txt": "上海深圳等主要城市也在公积金新政推动下"} -{"key": "BAC009S0902W0152", "wav": "./aishell/wav/test/S0902/BAC009S0902W0152.wav", "txt": "呈现购房者积入市的行情"} -{"key": "BAC009S0902W0153", "wav": "./aishell/wav/test/S0902/BAC009S0902W0153.wav", "txt": "全国已有超百个城市发布了不同力度的公积金松绑政策"} -{"key": "BAC009S0902W0154", "wav": "./aishell/wav/test/S0902/BAC009S0902W0154.wav", "txt": "加之降息降准等政策组合拳"} -{"key": "BAC009S0902W0155", "wav": "./aishell/wav/test/S0902/BAC009S0902W0155.wav", "txt": "呈现出量价齐涨的局面"} -{"key": "BAC009S0902W0156", "wav": "./aishell/wav/test/S0902/BAC009S0902W0156.wav", "txt": "据中国指数研究院最新数据显示"} -{"key": "BAC009S0902W0157", "wav": "./aishell/wav/test/S0902/BAC009S0902W0157.wav", "txt": "深圳环比上上涨百分之七"} -{"key": "BAC009S0902W0158", "wav": "./aishell/wav/test/S0902/BAC009S0902W0158.wav", "txt": "涨幅据十大城市之首"} -{"key": "BAC009S0902W0159", "wav": "./aishell/wav/test/S0902/BAC009S0902W0159.wav", "txt": "五月份多地楼市的成交量明显上涨"} -{"key": "BAC009S0902W0160", "wav": "./aishell/wav/test/S0902/BAC009S0902W0160.wav", "txt": "是房地产当前发局格局下的一个必然"} -{"key": "BAC009S0902W0161", "wav": "./aishell/wav/test/S0902/BAC009S0902W0161.wav", "txt": "唯独这样才能盘活公积金资源"} -{"key": "BAC009S0902W0162", "wav": "./aishell/wav/test/S0902/BAC009S0902W0162.wav", "txt": "促使更多购房者积极入市"} -{"key": "BAC009S0902W0163", "wav": "./aishell/wav/test/S0902/BAC009S0902W0163.wav", "txt": "伴随着各地住房公积金新政的落地实施"} -{"key": "BAC009S0902W0164", "wav": "./aishell/wav/test/S0902/BAC009S0902W0164.wav", "txt": "楼市进展仍需进一步观望"} -{"key": "BAC009S0902W0165", "wav": "./aishell/wav/test/S0902/BAC009S0902W0165.wav", "txt": "购房者受惠于政策利好的同时"} -{"key": "BAC009S0902W0166", "wav": "./aishell/wav/test/S0902/BAC009S0902W0166.wav", "txt": "公积金在申请放贷流程上并未提速"} -{"key": "BAC009S0902W0167", "wav": "./aishell/wav/test/S0902/BAC009S0902W0167.wav", "txt": "相反相关环节上审批更加严格"} -{"key": "BAC009S0902W0168", "wav": "./aishell/wav/test/S0902/BAC009S0902W0168.wav", "txt": "从目前上海住房公积金的具体政策看"} -{"key": "BAC009S0902W0169", "wav": "./aishell/wav/test/S0902/BAC009S0902W0169.wav", "txt": "购房的扶持力度在加大"} -{"key": "BAC009S0902W0170", "wav": "./aishell/wav/test/S0902/BAC009S0902W0170.wav", "txt": "但主要还是体现在贷款成本的降低"} -{"key": "BAC009S0902W0171", "wav": "./aishell/wav/test/S0902/BAC009S0902W0171.wav", "txt": "而申请公积金贷款方面还是需要走严格的流程"} -{"key": "BAC009S0902W0172", "wav": "./aishell/wav/test/S0902/BAC009S0902W0172.wav", "txt": "公积金提取一直是目前试图突破的内容"} -{"key": "BAC009S0902W0173", "wav": "./aishell/wav/test/S0902/BAC009S0902W0173.wav", "txt": "但目前还未出现大面积提取行为"} -{"key": "BAC009S0902W0174", "wav": "./aishell/wav/test/S0902/BAC009S0902W0174.wav", "txt": "来自广州日报的报道称"} -{"key": "BAC009S0902W0175", "wav": "./aishell/wav/test/S0902/BAC009S0902W0175.wav", "txt": "从申请到最后的放款"} -{"key": "BAC009S0902W0176", "wav": "./aishell/wav/test/S0902/BAC009S0902W0176.wav", "txt": "部分客户甚至等两个多月"} -{"key": "BAC009S0902W0177", "wav": "./aishell/wav/test/S0902/BAC009S0902W0177.wav", "txt": "如果申请公积金贷款及公积金贷款与商业贷款的组合贷"} -{"key": "BAC009S0902W0178", "wav": "./aishell/wav/test/S0902/BAC009S0902W0178.wav", "txt": "伟嘉安捷对中新网房产频道表示"} -{"key": "BAC009S0902W0179", "wav": "./aishell/wav/test/S0902/BAC009S0902W0179.wav", "txt": "现在公积金贷款办理需要一个月左右的时间"} -{"key": "BAC009S0902W0180", "wav": 
"./aishell/wav/test/S0902/BAC009S0902W0180.wav", "txt": "而申请办理组合贷款的手续则更为复杂"} -{"key": "BAC009S0902W0181", "wav": "./aishell/wav/test/S0902/BAC009S0902W0181.wav", "txt": "所以审批加上放款的时间最快也要在五个半月左右"} -{"key": "BAC009S0902W0182", "wav": "./aishell/wav/test/S0902/BAC009S0902W0182.wav", "txt": "作为取之于民用之于民的住房公积金"} -{"key": "BAC009S0902W0183", "wav": "./aishell/wav/test/S0902/BAC009S0902W0183.wav", "txt": "缴存者还可以在租房装修离退休时提取"} -{"key": "BAC009S0902W0184", "wav": "./aishell/wav/test/S0902/BAC009S0902W0184.wav", "txt": "因此操作环节的快捷性与便捷性非常重要"} -{"key": "BAC009S0902W0185", "wav": "./aishell/wav/test/S0902/BAC009S0902W0185.wav", "txt": "后续要加大公积金贷款的便利性"} -{"key": "BAC009S0902W0186", "wav": "./aishell/wav/test/S0902/BAC009S0902W0186.wav", "txt": "鼓励购房者积极缴纳公积金"} -{"key": "BAC009S0902W0187", "wav": "./aishell/wav/test/S0902/BAC009S0902W0187.wav", "txt": "科技支撑能力显着增强"} -{"key": "BAC009S0902W0188", "wav": "./aishell/wav/test/S0902/BAC009S0902W0188.wav", "txt": "生产经营方式不断优化"} -{"key": "BAC009S0902W0189", "wav": "./aishell/wav/test/S0902/BAC009S0902W0189.wav", "txt": "农业产业体系更趋完善"} -{"key": "BAC009S0902W0190", "wav": "./aishell/wav/test/S0902/BAC009S0902W0190.wav", "txt": "土地产出率劳动生产率资源利用率显着提高"} -{"key": "BAC009S0902W0191", "wav": "./aishell/wav/test/S0902/BAC009S0902W0191.wav", "txt": "现代农业建设取得突破性进展"} -{"key": "BAC009S0902W0192", "wav": "./aishell/wav/test/S0902/BAC009S0902W0192.wav", "txt": "主要农产品优势区基本实行农业现代化"} -{"key": "BAC009S0902W0193", "wav": "./aishell/wav/test/S0902/BAC009S0902W0193.wav", "txt": "现代农业发展主要指标类别"} -{"key": "BAC009S0902W0194", "wav": "./aishell/wav/test/S0902/BAC009S0902W0194.wav", "txt": "粮食综合生产能力五亿吨"} -{"key": "BAC009S0902W0195", "wav": "./aishell/wav/test/S0902/BAC009S0902W0195.wav", "txt": "粮食播种面积五亿亩棉花总产量七万吨"} -{"key": "BAC009S0902W0196", "wav": "./aishell/wav/test/S0902/BAC009S0902W0196.wav", "txt": "油料总产量七万吨"} -{"key": "BAC009S0902W0197", "wav": "./aishell/wav/test/S0902/BAC009S0902W0197.wav", "txt": "肉类总产量五万吨"} -{"key": "BAC009S0902W0198", "wav": "./aishell/wav/test/S0902/BAC009S0902W0198.wav", "txt": "奶类总产量七万吨水产品总产量七万吨"} -{"key": "BAC009S0902W0199", "wav": "./aishell/wav/test/S0902/BAC009S0902W0199.wav", "txt": "农产品质量安全例行监测总体合格率百分之五十"} -{"key": "BAC009S0902W0200", "wav": "./aishell/wav/test/S0902/BAC009S0902W0200.wav", "txt": "畜牧业产值占农业总产值比重百分之"} -{"key": "BAC009S0902W0201", "wav": "./aishell/wav/test/S0902/BAC009S0902W0201.wav", "txt": "渔业产值占农业总产值比重百分之"} -{"key": "BAC009S0902W0202", "wav": "./aishell/wav/test/S0902/BAC009S0902W0202.wav", "txt": "农产品加工业产值与农业总产值"} -{"key": "BAC009S0902W0203", "wav": "./aishell/wav/test/S0902/BAC009S0902W0203.wav", "txt": "丰富和解调仲裁诉等维权内容和方式"} -{"key": "BAC009S0902W0204", "wav": "./aishell/wav/test/S0902/BAC009S0902W0204.wav", "txt": "新增农田有效灌溉面积万亩"} -{"key": "BAC009S0902W0205", "wav": "./aishell/wav/test/S0902/BAC009S0902W0205.wav", "txt": "耕种收综合机械化水平百分之五"} -{"key": "BAC009S0902W0206", "wav": "./aishell/wav/test/S0902/BAC009S0902W0206.wav", "txt": "丰富和解调解仲裁诉诉讼等"} -{"key": "BAC009S0902W0207", "wav": "./aishell/wav/test/S0902/BAC009S0902W0207.wav", "txt": "科技科技进步贡献率百分之七"} -{"key": "BAC009S0902W0208", "wav": "./aishell/wav/test/S0902/BAC009S0902W0208.wav", "txt": "农村实用人才总量万人"} -{"key": "BAC009S0902W0209", "wav": "./aishell/wav/test/S0902/BAC009S0902W0209.wav", "txt": "农业产业化组织带动农户数量亿户"} -{"key": "BAC009S0902W0210", "wav": "./aishell/wav/test/S0902/BAC009S0902W0210.wav", "txt": "团结就是力量"} -{"key": "BAC009S0902W0211", "wav": "./aishell/wav/test/S0902/BAC009S0902W0211.wav", "txt": "适宜农户沼气普及率百分之五"} -{"key": "BAC009S0902W0212", "wav": 
"./aishell/wav/test/S0902/BAC009S0902W0212.wav", "txt": "农作物秸秆综合利用率百分之五"} -{"key": "BAC009S0902W0213", "wav": "./aishell/wav/test/S0902/BAC009S0902W0213.wav", "txt": "薛之谦的歌儿很棒"} -{"key": "BAC009S0902W0214", "wav": "./aishell/wav/test/S0902/BAC009S0902W0214.wav", "txt": "农林牧渔业增长值年均增长率百分之五"} -{"key": "BAC009S0902W0215", "wav": "./aishell/wav/test/S0902/BAC009S0902W0215.wav", "txt": "增长速度按可比价格计算"} -{"key": "BAC009S0902W0216", "wav": "./aishell/wav/test/S0902/BAC009S0902W0216.wav", "txt": "从加快转变农业发展的方式关键环节入手"} -{"key": "BAC009S0902W0217", "wav": "./aishell/wav/test/S0902/BAC009S0902W0217.wav", "txt": "完善现代农业产业体系"} -{"key": "BAC009S0902W0218", "wav": "./aishell/wav/test/S0902/BAC009S0902W0218.wav", "txt": "稳定发展粮食和棉油糖生产"} -{"key": "BAC009S0902W0219", "wav": "./aishell/wav/test/S0902/BAC009S0902W0219.wav", "txt": "实施全国增长千亿斤粮食生产能力规划"} -{"key": "BAC009S0902W0220", "wav": "./aishell/wav/test/S0902/BAC009S0902W0220.wav", "txt": "积极推进南方稻区单改双"} -{"key": "BAC009S0902W0221", "wav": "./aishell/wav/test/S0902/BAC009S0902W0221.wav", "txt": "扩大东北优势区粳稻种植面积"} -{"key": "BAC009S0902W0222", "wav": "./aishell/wav/test/S0902/BAC009S0902W0222.wav", "txt": "稳步推进江淮等粳高稻生产适宜区糟改粳"} -{"key": "BAC009S0902W0223", "wav": "./aishell/wav/test/S0902/BAC009S0902W0223.wav", "txt": "稳定增加玉米播种面积"} -{"key": "BAC009S0902W0224", "wav": "./aishell/wav/test/S0902/BAC009S0902W0224.wav", "txt": "积极恢复和稳定大豆种植面积"} -{"key": "BAC009S0902W0225", "wav": "./aishell/wav/test/S0902/BAC009S0902W0225.wav", "txt": "积极开发和选育马铃薯优质专用高产品种"} -{"key": "BAC009S0902W0226", "wav": "./aishell/wav/test/S0902/BAC009S0902W0226.wav", "txt": "提高脱毒种薯供给能力"} -{"key": "BAC009S0902W0227", "wav": "./aishell/wav/test/S0902/BAC009S0902W0227.wav", "txt": "继续加强优质棉花生产基地建设"} -{"key": "BAC009S0902W0228", "wav": "./aishell/wav/test/S0902/BAC009S0902W0228.wav", "txt": "多油并举稳定食用植物油自给率"} -{"key": "BAC009S0902W0229", "wav": "./aishell/wav/test/S0902/BAC009S0902W0229.wav", "txt": "基本满足国内棉花消费需求"} -{"key": "BAC009S0902W0230", "wav": "./aishell/wav/test/S0902/BAC009S0902W0230.wav", "txt": "积极发展菜篮子产品生产"} -{"key": "BAC009S0902W0231", "wav": "./aishell/wav/test/S0902/BAC009S0902W0231.wav", "txt": "加强蔬菜水果肉蛋奶水产品等产品优势产区建设"} -{"key": "BAC009S0902W0232", "wav": "./aishell/wav/test/S0902/BAC009S0902W0232.wav", "txt": "扩大大中城市郊区菜篮子产品生产基地规模"} -{"key": "BAC009S0902W0233", "wav": "./aishell/wav/test/S0902/BAC009S0902W0233.wav", "txt": "推动苹果柑橘等优势园艺产品生产"} -{"key": "BAC009S0902W0234", "wav": "./aishell/wav/test/S0902/BAC009S0902W0234.wav", "txt": "稳定发展生猪和蛋禽"} -{"key": "BAC009S0902W0235", "wav": "./aishell/wav/test/S0902/BAC009S0902W0235.wav", "txt": "大力发展农产品加工和流通业"} -{"key": "BAC009S0902W0236", "wav": "./aishell/wav/test/S0902/BAC009S0902W0236.wav", "txt": "加强主要农产品优势产区加工基地建设"} -{"key": "BAC009S0902W0237", "wav": "./aishell/wav/test/S0902/BAC009S0902W0237.wav", "txt": "引导农产品加工业向种养业优势区域和城市郊区集中"} -{"key": "BAC009S0902W0238", "wav": "./aishell/wav/test/S0902/BAC009S0902W0238.wav", "txt": "启动实施农产品加工提升工程"} -{"key": "BAC009S0902W0239", "wav": "./aishell/wav/test/S0902/BAC009S0902W0239.wav", "txt": "提高生产流通组织化程度"} -{"key": "BAC009S0902W0240", "wav": "./aishell/wav/test/S0902/BAC009S0902W0240.wav", "txt": "培育一批产值过百亿元的大型加工和流通企业集团"} -{"key": "BAC009S0902W0241", "wav": "./aishell/wav/test/S0902/BAC009S0902W0241.wav", "txt": "强化流通基础设施建设和产销信息引导"} -{"key": "BAC009S0902W0242", "wav": "./aishell/wav/test/S0902/BAC009S0902W0242.wav", "txt": "升级改造农产品批发市场"} -{"key": "BAC009S0902W0243", "wav": "./aishell/wav/test/S0902/BAC009S0902W0243.wav", "txt": "支持优势产区现代化鲜活农产品批发市场建设"} -{"key": "BAC009S0902W0244", "wav": 
"./aishell/wav/test/S0902/BAC009S0902W0244.wav", "txt": "大力发展冷链体系和生鲜农产品配送"} -{"key": "BAC009S0902W0245", "wav": "./aishell/wav/test/S0902/BAC009S0902W0245.wav", "txt": "推进订单生产和农超对接"} -{"key": "BAC009S0902W0246", "wav": "./aishell/wav/test/S0902/BAC009S0902W0246.wav", "txt": "落实鲜活农产品运输绿化通道政策"} -{"key": "BAC009S0902W0247", "wav": "./aishell/wav/test/S0902/BAC009S0902W0247.wav", "txt": "降低农产品流通成本"} -{"key": "BAC009S0902W0248", "wav": "./aishell/wav/test/S0902/BAC009S0902W0248.wav", "txt": "规范和完善农产品期货市场"} -{"key": "BAC009S0902W0249", "wav": "./aishell/wav/test/S0902/BAC009S0902W0249.wav", "txt": "强化农业科技和人才支撑"} -{"key": "BAC009S0902W0250", "wav": "./aishell/wav/test/S0902/BAC009S0902W0250.wav", "txt": "增强农业科技自主创新能力"} -{"key": "BAC009S0902W0251", "wav": "./aishell/wav/test/S0902/BAC009S0902W0251.wav", "txt": "明确农业科技的公共性基础社会性地位"} -{"key": "BAC009S0902W0252", "wav": "./aishell/wav/test/S0902/BAC009S0902W0252.wav", "txt": "加强基础性前沿性公益性重大农业科学技术研究"} -{"key": "BAC009S0902W0253", "wav": "./aishell/wav/test/S0902/BAC009S0902W0253.wav", "txt": "比去年同期的六十二点三十亿美元大幅增长百分之十二"} -{"key": "BAC009S0902W0254", "wav": "./aishell/wav/test/S0902/BAC009S0902W0254.wav", "txt": "系涨幅最为明显的地区"} -{"key": "BAC009S0902W0255", "wav": "./aishell/wav/test/S0902/BAC009S0902W0255.wav", "txt": "占总营收的二十六点百分之六十点六十七"} -{"key": "BAC009S0902W0257", "wav": "./aishell/wav/test/S0902/BAC009S0902W0257.wav", "txt": "苹果的股价有一定的波动规律"} -{"key": "BAC009S0902W0258", "wav": "./aishell/wav/test/S0902/BAC009S0902W0258.wav", "txt": "即是在新品发布前的一个多季度的时间内"} -{"key": "BAC009S0902W0259", "wav": "./aishell/wav/test/S0902/BAC009S0902W0259.wav", "txt": "因为在新品发布之前"} -{"key": "BAC009S0902W0260", "wav": "./aishell/wav/test/S0902/BAC009S0902W0260.wav", "txt": "由于许多用户都持币待购"} -{"key": "BAC009S0902W0261", "wav": "./aishell/wav/test/S0902/BAC009S0902W0261.wav", "txt": "因此需求会暂时被抑制住"} -{"key": "BAC009S0902W0262", "wav": "./aishell/wav/test/S0902/BAC009S0902W0262.wav", "txt": "销量都会有一定的影响"} -{"key": "BAC009S0902W0263", "wav": "./aishell/wav/test/S0902/BAC009S0902W0263.wav", "txt": "孙永杰对二十一世纪报道记者表示"} -{"key": "BAC009S0902W0264", "wav": "./aishell/wav/test/S0902/BAC009S0902W0264.wav", "txt": "苹果的股价会随着销量相反"} -{"key": "BAC009S0902W0266", "wav": "./aishell/wav/test/S0902/BAC009S0902W0266.wav", "txt": "缺乏新的业务增长点"} -{"key": "BAC009S0902W0267", "wav": "./aishell/wav/test/S0902/BAC009S0902W0267.wav", "txt": "苹果在二零一四年营收为二百二十二亿美元"} -{"key": "BAC009S0902W0269", "wav": "./aishell/wav/test/S0902/BAC009S0902W0269.wav", "txt": "就手机领域的发展趋势"} -{"key": "BAC009S0902W0270", "wav": "./aishell/wav/test/S0902/BAC009S0902W0270.wav", "txt": "苹果高端市场已经确立了一个独一无二的地位"} -{"key": "BAC009S0902W0271", "wav": "./aishell/wav/test/S0902/BAC009S0902W0271.wav", "txt": "以前在高端智能手机市场领域"} -{"key": "BAC009S0902W0272", "wav": "./aishell/wav/test/S0902/BAC009S0902W0272.wav", "txt": "苹果有两个竞争对手"} -{"key": "BAC009S0902W0274", "wav": "./aishell/wav/test/S0902/BAC009S0902W0274.wav", "txt": "今年股价已经累计下跌了百分之六十而三星的情况也不佳"} -{"key": "BAC009S0902W0275", "wav": "./aishell/wav/test/S0902/BAC009S0902W0275.wav", "txt": "在三星第二季度财报中"} -{"key": "BAC009S0902W0277", "wav": "./aishell/wav/test/S0902/BAC009S0902W0277.wav", "txt": "降至二十六点零六万亿韩元"} -{"key": "BAC009S0902W0278", "wav": "./aishell/wav/test/S0902/BAC009S0902W0278.wav", "txt": "其中手机的销售额下降了七点百分之三"} -{"key": "BAC009S0902W0279", "wav": "./aishell/wav/test/S0902/BAC009S0902W0279.wav", "txt": "至二十五点五万亿韩元"} -{"key": "BAC009S0902W0280", "wav": "./aishell/wav/test/S0902/BAC009S0902W0280.wav", "txt": "在安卓手机的总体交付量中"} -{"key": "BAC009S0902W0281", "wav": 
"./aishell/wav/test/S0902/BAC009S0902W0281.wav", "txt": "价格高于六百美元的高端手机占比为百分之一"} -{"key": "BAC009S0902W0282", "wav": "./aishell/wav/test/S0902/BAC009S0902W0282.wav", "txt": "价格高于六百美元的高端手机"} -{"key": "BAC009S0902W0283", "wav": "./aishell/wav/test/S0902/BAC009S0902W0283.wav", "txt": "在安卓出货量中的占比减少到了百分之六"} -{"key": "BAC009S0902W0285", "wav": "./aishell/wav/test/S0902/BAC009S0902W0285.wav", "txt": "价格高于六百美元的占比从百分之七十增加到了百分之八十"} -{"key": "BAC009S0902W0287", "wav": "./aishell/wav/test/S0902/BAC009S0902W0287.wav", "txt": "苹果在高端市场击溃了对手"} -{"key": "BAC009S0902W0288", "wav": "./aishell/wav/test/S0902/BAC009S0902W0288.wav", "txt": "而这对于未来苹果保持高利润和利润率至关重要"} -{"key": "BAC009S0902W0289", "wav": "./aishell/wav/test/S0902/BAC009S0902W0289.wav", "txt": "这对苹果是一个利好"} -{"key": "BAC009S0902W0290", "wav": "./aishell/wav/test/S0902/BAC009S0902W0290.wav", "txt": "意味着只要用户要选择高端手机"} -{"key": "BAC009S0902W0291", "wav": "./aishell/wav/test/S0902/BAC009S0902W0291.wav", "txt": "在类似印度之类的新兴市场"} -{"key": "BAC009S0902W0292", "wav": "./aishell/wav/test/S0902/BAC009S0902W0292.wav", "txt": "因此从全球的角度来看"} -{"key": "BAC009S0902W0293", "wav": "./aishell/wav/test/S0902/BAC009S0902W0293.wav", "txt": "智能手机仍然有增长和爆发的空间"} -{"key": "BAC009S0902W0295", "wav": "./aishell/wav/test/S0902/BAC009S0902W0295.wav", "txt": "但是作为一个仍在成长没有其他对手的市场来讲"} -{"key": "BAC009S0902W0296", "wav": "./aishell/wav/test/S0902/BAC009S0902W0296.wav", "txt": "苹果已然可以单点突破"} -{"key": "BAC009S0902W0297", "wav": "./aishell/wav/test/S0902/BAC009S0902W0297.wav", "txt": "至于新的业务增长点"} -{"key": "BAC009S0902W0298", "wav": "./aishell/wav/test/S0902/BAC009S0902W0298.wav", "txt": "但是瑞士联合银行分析师估计"} -{"key": "BAC009S0902W0299", "wav": "./aishell/wav/test/S0902/BAC009S0902W0299.wav", "txt": "较最初的预期减少了一半左右"} -{"key": "BAC009S0902W0302", "wav": "./aishell/wav/test/S0902/BAC009S0902W0302.wav", "txt": "已经占有了全球智能手表市场的百分之五"} -{"key": "BAC009S0902W0303", "wav": "./aishell/wav/test/S0902/BAC009S0902W0303.wav", "txt": "云计算和大数据时代"} -{"key": "BAC009S0902W0305", "wav": "./aishell/wav/test/S0902/BAC009S0902W0305.wav", "txt": "紫光股份曾经出现一连波连续十六个一字涨停的狂飙行市"} -{"key": "BAC009S0902W0306", "wav": "./aishell/wav/test/S0902/BAC009S0902W0306.wav", "txt": "近日的走势也强于大盘"} -{"key": "BAC009S0902W0307", "wav": "./aishell/wav/test/S0902/BAC009S0902W0307.wav", "txt": "两个机构专用席位列于买一和卖二的位置"} -{"key": "BAC009S0902W0308", "wav": "./aishell/wav/test/S0902/BAC009S0902W0308.wav", "txt": "买卖前五名共计净出于该股六十二点九三万元"} -{"key": "BAC009S0902W0309", "wav": "./aishell/wav/test/S0902/BAC009S0902W0309.wav", "txt": "大盘股仍是毫无作为"} -{"key": "BAC009S0902W0310", "wav": "./aishell/wav/test/S0902/BAC009S0902W0310.wav", "txt": "题材股继续扮演黑马角色"} -{"key": "BAC009S0902W0311", "wav": "./aishell/wav/test/S0902/BAC009S0902W0311.wav", "txt": "紫光股份千九十三八在公布拓展云计算市场后"} -{"key": "BAC009S0902W0312", "wav": "./aishell/wav/test/S0902/BAC009S0902W0312.wav", "txt": "盘中有两千六百八十六万元资金净流入"} -{"key": "BAC009S0902W0313", "wav": "./aishell/wav/test/S0902/BAC009S0902W0313.wav", "txt": "给孩子买儿童电话手表有必要吗"} -{"key": "BAC009S0902W0314", "wav": "./aishell/wav/test/S0902/BAC009S0902W0314.wav", "txt": "消费者在听销售人员介绍小天才手表"} -{"key": "BAC009S0902W0315", "wav": "./aishell/wav/test/S0902/BAC009S0902W0315.wav", "txt": "消费者在听销售人员介绍小天才手表"} -{"key": "BAC009S0902W0316", "wav": "./aishell/wav/test/S0902/BAC009S0902W0316.wav", "txt": "消费者在听销售人员介绍小天才电话手表"} -{"key": "BAC009S0902W0317", "wav": "./aishell/wav/test/S0902/BAC009S0902W0317.wav", "txt": "很多家长都在给孩子购置各种学习用品"} -{"key": "BAC009S0902W0318", "wav": "./aishell/wav/test/S0902/BAC009S0902W0318.wav", "txt": "除了传统的书包文具以及辅导书外"} -{"key": "BAC009S0902W0319", 
"wav": "./aishell/wav/test/S0902/BAC009S0902W0319.wav", "txt": "这个儿童电话手表以其强大的定位通话微聊等功能"} -{"key": "BAC009S0902W0320", "wav": "./aishell/wav/test/S0902/BAC009S0902W0320.wav", "txt": "深受家长和儿童欢迎"} -{"key": "BAC009S0902W0321", "wav": "./aishell/wav/test/S0902/BAC009S0902W0321.wav", "txt": "很多孩子都以拥有一款电话手表为豪"} -{"key": "BAC009S0902W0322", "wav": "./aishell/wav/test/S0902/BAC009S0902W0322.wav", "txt": "而不少品牌的电话手表量销售量更是突破百万大关"} -{"key": "BAC009S0902W0323", "wav": "./aishell/wav/test/S0902/BAC009S0902W0323.wav", "txt": "电话手表对儿童健康安全是否有危险"} -{"key": "BAC009S0902W0324", "wav": "./aishell/wav/test/S0902/BAC009S0902W0324.wav", "txt": "老师是否允许孩子戴手表上学"} -{"key": "BAC009S0902W0325", "wav": "./aishell/wav/test/S0902/BAC009S0902W0325.wav", "txt": "电话手表应该如何选购"} -{"key": "BAC009S0902W0326", "wav": "./aishell/wav/test/S0902/BAC009S0902W0326.wav", "txt": "笔者进行了深度的了解"} -{"key": "BAC009S0902W0327", "wav": "./aishell/wav/test/S0902/BAC009S0902W0327.wav", "txt": "儿童电话手表到底有多火"} -{"key": "BAC009S0902W0328", "wav": "./aishell/wav/test/S0902/BAC009S0902W0328.wav", "txt": "年龄或大或小的孩子"} -{"key": "BAC009S0902W0330", "wav": "./aishell/wav/test/S0902/BAC009S0902W0330.wav", "txt": "都会目不转睛的盯着"} -{"key": "BAC009S0902W0331", "wav": "./aishell/wav/test/S0902/BAC009S0902W0331.wav", "txt": "或者跟着广告哼起歌曲来"} -{"key": "BAC009S0902W0332", "wav": "./aishell/wav/test/S0902/BAC009S0902W0332.wav", "txt": "随着产品快速进入家长和孩童的视野"} -{"key": "BAC009S0902W0333", "wav": "./aishell/wav/test/S0902/BAC009S0902W0333.wav", "txt": "每天的销量让你感受到儿童电话手表的火爆"} -{"key": "BAC009S0902W0334", "wav": "./aishell/wav/test/S0902/BAC009S0902W0334.wav", "txt": "对于如此火爆的市市场需求"} -{"key": "BAC009S0902W0335", "wav": "./aishell/wav/test/S0902/BAC009S0902W0335.wav", "txt": "来自广西的苏女士说家长对孩子安全的关心"} -{"key": "BAC009S0902W0336", "wav": "./aishell/wav/test/S0902/BAC009S0902W0336.wav", "txt": "是电话手表今年大受欢迎的主要原因"} -{"key": "BAC009S0902W0337", "wav": "./aishell/wav/test/S0902/BAC009S0902W0337.wav", "txt": "在电话手表出现之前"} -{"key": "BAC009S0902W0338", "wav": "./aishell/wav/test/S0902/BAC009S0902W0338.wav", "txt": "据悉他正在积极进修表演准备进入演艺圈"} -{"key": "BAC009S0902W0339", "wav": "./aishell/wav/test/S0902/BAC009S0902W0339.wav", "txt": "近日日本媒体曝出惊人消息"} -{"key": "BAC009S0902W0340", "wav": "./aishell/wav/test/S0902/BAC009S0902W0340.wav", "txt": "称高桥大辅可能在一段时间里出柜"} -{"key": "BAC009S0902W0341", "wav": "./aishell/wav/test/S0902/BAC009S0902W0341.wav", "txt": "公开自己的同性恋者身份"} -{"key": "BAC009S0902W0342", "wav": "./aishell/wav/test/S0902/BAC009S0902W0342.wav", "txt": "恐怕又要传来不少女粉丝心碎的声音了"} -{"key": "BAC009S0902W0343", "wav": "./aishell/wav/test/S0902/BAC009S0902W0343.wav", "txt": "高桥大辅堪称日本花样滑冰男单领域的领军人物"} -{"key": "BAC009S0902W0344", "wav": "./aishell/wav/test/S0902/BAC009S0902W0344.wav", "txt": "在他的职业生涯里曾在二零一零年拿到世锦赛金牌"} -{"key": "BAC009S0902W0345", "wav": "./aishell/wav/test/S0902/BAC009S0902W0345.wav", "txt": "温哥华冬奥会拿到铜牌"} -{"key": "BAC009S0902W0346", "wav": "./aishell/wav/test/S0902/BAC009S0902W0346.wav", "txt": "一二年总决赛拿到金牌"} -{"key": "BAC009S0902W0347", "wav": "./aishell/wav/test/S0902/BAC009S0902W0347.wav", "txt": "还曾经两次拿到了四大洲锦标赛的男单冠军"} -{"key": "BAC009S0902W0348", "wav": "./aishell/wav/test/S0902/BAC009S0902W0348.wav", "txt": "表示未来会进入演艺圈发展"} -{"key": "BAC009S0902W0349", "wav": "./aishell/wav/test/S0902/BAC009S0902W0349.wav", "txt": "颜值颇高的他今年四月远赴美国纽约"} -{"key": "BAC009S0902W0350", "wav": "./aishell/wav/test/S0902/BAC009S0902W0350.wav", "txt": "高桥大辅丝毫不加掩饰"} -{"key": "BAC009S0902W0351", "wav": "./aishell/wav/test/S0902/BAC009S0902W0351.wav", "txt": "他经常在社交网站公开美食等照片"} -{"key": "BAC009S0902W0352", "wav": 
"./aishell/wav/test/S0902/BAC009S0902W0352.wav", "txt": "看起来在美国过得很开心的样子"} -{"key": "BAC009S0902W0353", "wav": "./aishell/wav/test/S0902/BAC009S0902W0353.wav", "txt": "过去一直背负着日本花滑界的重压"} -{"key": "BAC009S0902W0354", "wav": "./aishell/wav/test/S0902/BAC009S0902W0354.wav", "txt": "终于得到了释放的样子"} -{"key": "BAC009S0902W0355", "wav": "./aishell/wav/test/S0902/BAC009S0902W0355.wav", "txt": "他每周二三天来学校"} -{"key": "BAC009S0902W0356", "wav": "./aishell/wav/test/S0902/BAC009S0902W0356.wav", "txt": "还有记者爆料说居住在纽约的日本人透露"} -{"key": "BAC009S0902W0357", "wav": "./aishell/wav/test/S0902/BAC009S0902W0357.wav", "txt": "高桥在当地过着奢华享乐的生活"} -{"key": "BAC009S0902W0358", "wav": "./aishell/wav/test/S0902/BAC009S0902W0358.wav", "txt": "如果真的想学习的话"} -{"key": "BAC009S0902W0359", "wav": "./aishell/wav/test/S0902/BAC009S0902W0359.wav", "txt": "就不会刻意选择位于纽约闹市区的这所大学"} -{"key": "BAC009S0902W0360", "wav": "./aishell/wav/test/S0902/BAC009S0902W0360.wav", "txt": "图片中他们一行人面对镜头尽显搞怪天赋"} -{"key": "BAC009S0902W0361", "wav": "./aishell/wav/test/S0902/BAC009S0902W0361.wav", "txt": "高桥大辅则是噘着嘴做出索吻的动作"} -{"key": "BAC009S0902W0362", "wav": "./aishell/wav/test/S0902/BAC009S0902W0362.wav", "txt": "外界认为这是一种另有深意的暗示"} -{"key": "BAC009S0902W0363", "wav": "./aishell/wav/test/S0902/BAC009S0902W0363.wav", "txt": "而对于他的好友小林尊"} -{"key": "BAC009S0902W0364", "wav": "./aishell/wav/test/S0902/BAC009S0902W0364.wav", "txt": "被认为日本体育界的相关人士称"} -{"key": "BAC009S0902W0365", "wav": "./aishell/wav/test/S0902/BAC009S0902W0365.wav", "txt": "但多年来关于他的形婚"} -{"key": "BAC009S0902W0366", "wav": "./aishell/wav/test/S0902/BAC009S0902W0366.wav", "txt": "实际上是同性恋者的传闻一直未停过"} -{"key": "BAC009S0902W0367", "wav": "./aishell/wav/test/S0902/BAC009S0902W0367.wav", "txt": "和澳洲鱼雷索普一样"} -{"key": "BAC009S0902W0368", "wav": "./aishell/wav/test/S0902/BAC009S0902W0368.wav", "txt": "高桥大辅因为其比赛风格的妖娆多变"} -{"key": "BAC009S0902W0369", "wav": "./aishell/wav/test/S0902/BAC009S0902W0369.wav", "txt": "多年来围绕其性取向的争论一直没有停息"} -{"key": "BAC009S0902W0370", "wav": "./aishell/wav/test/S0902/BAC009S0902W0370.wav", "txt": "退役前高桥大辅曾与花滑女神浅田真央传出恋情"} -{"key": "BAC009S0902W0371", "wav": "./aishell/wav/test/S0902/BAC009S0902W0371.wav", "txt": "身为上司而且已婚有儿女的桥本被指责涉嫌性侵"} -{"key": "BAC009S0902W0372", "wav": "./aishell/wav/test/S0902/BAC009S0902W0372.wav", "txt": "不过两位当事人双双否认性侵的说法"} -{"key": "BAC009S0902W0373", "wav": "./aishell/wav/test/S0902/BAC009S0902W0373.wav", "txt": "如今和小林尊出双入对"} -{"key": "BAC009S0902W0374", "wav": "./aishell/wav/test/S0902/BAC009S0902W0374.wav", "txt": "高调参加同性恋者的年度盛事"} -{"key": "BAC009S0902W0375", "wav": "./aishell/wav/test/S0902/BAC009S0902W0375.wav", "txt": "有可靠消息称高桥很可能在近期正式宣布出柜"} -{"key": "BAC009S0902W0376", "wav": "./aishell/wav/test/S0902/BAC009S0902W0376.wav", "txt": "此消息一出迅速引发外界强烈关注"} -{"key": "BAC009S0902W0377", "wav": "./aishell/wav/test/S0902/BAC009S0902W0377.wav", "txt": "日本网友也是众说纷纭一点儿也不吃惊"} -{"key": "BAC009S0902W0378", "wav": "./aishell/wav/test/S0902/BAC009S0902W0378.wav", "txt": "看他在冰场上搔首弄姿地表现"} -{"key": "BAC009S0902W0379", "wav": "./aishell/wav/test/S0902/BAC009S0902W0379.wav", "txt": "高桥大辅应该是他的新欢"} -{"key": "BAC009S0902W0380", "wav": "./aishell/wav/test/S0902/BAC009S0902W0380.wav", "txt": "难怪他能接受年过半百的桥本的索吻"} -{"key": "BAC009S0902W0381", "wav": "./aishell/wav/test/S0902/BAC009S0902W0381.wav", "txt": "许多为高桥痴迷多年的女粉丝肯定深受打击"} -{"key": "BAC009S0902W0382", "wav": "./aishell/wav/test/S0902/BAC009S0902W0382.wav", "txt": "作为日本的花滑王子"} -{"key": "BAC009S0902W0383", "wav": "./aishell/wav/test/S0902/BAC009S0902W0383.wav", "txt": "这么多年一直要压抑自己的性取向"} -{"key": "BAC009S0902W0384", "wav": 
"./aishell/wav/test/S0902/BAC009S0902W0384.wav", "txt": "挺不容易的支持他追属属于自己的真正幸福"} -{"key": "BAC009S0902W0385", "wav": "./aishell/wav/test/S0902/BAC009S0902W0385.wav", "txt": "据美联社十日报道"} -{"key": "BAC009S0902W0386", "wav": "./aishell/wav/test/S0902/BAC009S0902W0386.wav", "txt": "一些参赛选手赛后感到胃部不适"} -{"key": "BAC009S0902W0387", "wav": "./aishell/wav/test/S0902/BAC009S0902W0387.wav", "txt": "而队医怀疑这或许与比赛地水污染有关"} -{"key": "BAC009S0902W0388", "wav": "./aishell/wav/test/S0902/BAC009S0902W0388.wav", "txt": "美国队官员不排除他们的队员因食物或饮水而生病"} -{"key": "BAC009S0902W0389", "wav": "./aishell/wav/test/S0902/BAC009S0902W0389.wav", "txt": "近来有关里约水污染问题备受关注"} -{"key": "BAC009S0902W0390", "wav": "./aishell/wav/test/S0902/BAC009S0902W0390.wav", "txt": "美联社公布的一项独立水质检测显示"} -{"key": "BAC009S0902W0391", "wav": "./aishell/wav/test/S0902/BAC009S0902W0391.wav", "txt": "在奥运会赛艇和铁人三项公开水域等比赛地"} -{"key": "BAC009S0902W0392", "wav": "./aishell/wav/test/S0902/BAC009S0902W0392.wav", "txt": "也存在高危病毒危险"} -{"key": "BAC009S0902W0393", "wav": "./aishell/wav/test/S0902/BAC009S0902W0393.wav", "txt": "该湖区也将是明年奥运会赛艇比赛地"} -{"key": "BAC009S0902W0394", "wav": "./aishell/wav/test/S0902/BAC009S0902W0394.wav", "txt": "比污染严重的瓜内巴拉湾相比"} -{"key": "BAC009S0902W0395", "wav": "./aishell/wav/test/S0902/BAC009S0902W0395.wav", "txt": "赛艇比赛所在湖区的水污染问题近年来得到改善"} -{"key": "BAC009S0902W0396", "wav": "./aishell/wav/test/S0902/BAC009S0902W0396.wav", "txt": "但是上周公布的水质检测显示"} -{"key": "BAC009S0902W0397", "wav": "./aishell/wav/test/S0902/BAC009S0902W0397.wav", "txt": "湖区水污染仍旧十分严重"} -{"key": "BAC009S0902W0398", "wav": "./aishell/wav/test/S0902/BAC009S0902W0398.wav", "txt": "在本次赛艇测试赛期间"} -{"key": "BAC009S0902W0399", "wav": "./aishell/wav/test/S0902/BAC009S0902W0399.wav", "txt": "一些参赛选手也向新华社记者表示"} -{"key": "BAC009S0902W0400", "wav": "./aishell/wav/test/S0902/BAC009S0902W0400.wav", "txt": "比赛地的湖水比较浑浊"} -{"key": "BAC009S0902W0401", "wav": "./aishell/wav/test/S0902/BAC009S0902W0401.wav", "txt": "但还是担心水质问题"} -{"key": "BAC009S0902W0402", "wav": "./aishell/wav/test/S0902/BAC009S0902W0402.wav", "txt": "来自中国的赛艇选手崔帅豪说"} -{"key": "BAC009S0902W0403", "wav": "./aishell/wav/test/S0902/BAC009S0902W0403.wav", "txt": "比赛地水不是太干净"} -{"key": "BAC009S0902W0404", "wav": "./aishell/wav/test/S0902/BAC009S0902W0404.wav", "txt": "他自己还将出任影片的男主角"} -{"key": "BAC009S0902W0405", "wav": "./aishell/wav/test/S0902/BAC009S0902W0405.wav", "txt": "忙碌成本可想而知"} -{"key": "BAC009S0902W0406", "wav": "./aishell/wav/test/S0902/BAC009S0902W0406.wav", "txt": "外媒发布了更令人兴奋的消息"} -{"key": "BAC009S0902W0407", "wav": "./aishell/wav/test/S0902/BAC009S0902W0407.wav", "txt": "将在本届美国电影学会影展中进行秘密放映"} -{"key": "BAC009S0902W0408", "wav": "./aishell/wav/test/S0902/BAC009S0902W0408.wav", "txt": "对方是二十五岁的人妻名模泰舒培"} -{"key": "BAC009S0902W0409", "wav": "./aishell/wav/test/S0902/BAC009S0902W0409.wav", "txt": "搜狐娱乐讯七月十五日"} -{"key": "BAC009S0902W0410", "wav": "./aishell/wav/test/S0902/BAC009S0902W0410.wav", "txt": "陈冠希前女友嫩模黄榕在香港书展出席写真宣传活动"} -{"key": "BAC009S0902W0411", "wav": "./aishell/wav/test/S0902/BAC009S0902W0411.wav", "txt": "身穿白色抹胸的她大秀性感好身材"} -{"key": "BAC009S0902W0412", "wav": "./aishell/wav/test/S0902/BAC009S0902W0412.wav", "txt": "谈及前男友陈冠希近日被指外貌衰老了不少"} -{"key": "BAC009S0902W0413", "wav": "./aishell/wav/test/S0902/BAC009S0902W0413.wav", "txt": "黄榕坦言可能他做了太多运动"} -{"key": "BAC009S0902W0414", "wav": "./aishell/wav/test/S0902/BAC009S0902W0414.wav", "txt": "搜狐娱乐讯日前"} -{"key": "BAC009S0902W0415", "wav": "./aishell/wav/test/S0902/BAC009S0902W0415.wav", "txt": "众星云集上海出席某商家的开业活动"} -{"key": "BAC009S0902W0416", "wav": 
"./aishell/wav/test/S0902/BAC009S0902W0416.wav", "txt": "由潮男陈冠希打头阵"} -{"key": "BAC009S0902W0417", "wav": "./aishell/wav/test/S0902/BAC009S0902W0417.wav", "txt": "更云集了罗中旭前任"} -{"key": "BAC009S0902W0418", "wav": "./aishell/wav/test/S0902/BAC009S0902W0418.wav", "txt": "黄宗泽绯闻女友等女星"} -{"key": "BAC009S0902W0419", "wav": "./aishell/wav/test/S0902/BAC009S0902W0419.wav", "txt": "现场气氛火爆"} -{"key": "BAC009S0902W0420", "wav": "./aishell/wav/test/S0902/BAC009S0902W0420.wav", "txt": "粉丝们一度失控"} -{"key": "BAC009S0902W0421", "wav": "./aishell/wav/test/S0902/BAC009S0902W0421.wav", "txt": "陈冠希坦言认为陈奕迅是k歌之王"} -{"key": "BAC009S0902W0422", "wav": "./aishell/wav/test/S0902/BAC009S0902W0422.wav", "txt": "但由于风格不同"} -{"key": "BAC009S0902W0423", "wav": "./aishell/wav/test/S0902/BAC009S0902W0423.wav", "txt": "新专辑音乐方面还是坚持做自己"} -{"key": "BAC009S0902W0424", "wav": "./aishell/wav/test/S0902/BAC009S0902W0424.wav", "txt": "搜狐娱乐讯九月五日"} -{"key": "BAC009S0902W0425", "wav": "./aishell/wav/test/S0902/BAC009S0902W0425.wav", "txt": "一怒之下把大叔身份证扔在地上"} -{"key": "BAC009S0902W0426", "wav": "./aishell/wav/test/S0902/BAC009S0902W0426.wav", "txt": "二人发生姓肢体冲突"} -{"key": "BAC009S0902W0427", "wav": "./aishell/wav/test/S0902/BAC009S0902W0427.wav", "txt": "此视频曝光后"} -{"key": "BAC009S0902W0428", "wav": "./aishell/wav/test/S0902/BAC009S0902W0428.wav", "txt": "网友纷纷力挺陈冠希"} -{"key": "BAC009S0902W0429", "wav": "./aishell/wav/test/S0902/BAC009S0902W0429.wav", "txt": "温兆伦许飞欧弟等明星也通过微博表示支持力挺"} -{"key": "BAC009S0902W0430", "wav": "./aishell/wav/test/S0902/BAC009S0902W0430.wav", "txt": "搜狐娱乐讯北京时间六月二十四日消息"} -{"key": "BAC009S0902W0431", "wav": "./aishell/wav/test/S0902/BAC009S0902W0431.wav", "txt": "渔船凶案嫌疑借发动机声将同船同事依次杀害"} -{"key": "BAC009S0902W0432", "wav": "./aishell/wav/test/S0902/BAC009S0902W0432.wav", "txt": "渔船海上爆炸沉没四名渔民漂流三天获救"} -{"key": "BAC009S0902W0433", "wav": "./aishell/wav/test/S0902/BAC009S0902W0433.wav", "txt": "昨天上午七点五零分"} -{"key": "BAC009S0902W0434", "wav": "./aishell/wav/test/S0902/BAC009S0902W0434.wav", "txt": "目前正在根据海事部门的要求开往盐城大分港"} -{"key": "BAC009S0902W0435", "wav": "./aishell/wav/test/S0902/BAC009S0902W0435.wav", "txt": "准备将获救的四人送上岸边医院救治"} -{"key": "BAC009S0902W0436", "wav": "./aishell/wav/test/S0902/BAC009S0902W0436.wav", "txt": "渔船海上被撞翻仅一人逃生同伴求救却无能为力"} -{"key": "BAC009S0902W0437", "wav": "./aishell/wav/test/S0902/BAC009S0902W0437.wav", "txt": "出事的渔船被拖到韩榆石桥海边"} -{"key": "BAC009S0902W0438", "wav": "./aishell/wav/test/S0902/BAC009S0902W0438.wav", "txt": "渔船被其他船撞翻六人死海事部门悬赏五万寻肇事者"} -{"key": "BAC009S0902W0439", "wav": "./aishell/wav/test/S0902/BAC009S0902W0439.wav", "txt": "快报讯通讯员李欢乐记者王晓宇八月二六日"} -{"key": "BAC009S0902W0440", "wav": "./aishell/wav/test/S0902/BAC009S0902W0440.wav", "txt": "船上八名船员六人不幸遇难"} -{"key": "BAC009S0902W0441", "wav": "./aishell/wav/test/S0902/BAC009S0902W0441.wav", "txt": "只有一名船员得以逃生"} -{"key": "BAC009S0902W0442", "wav": "./aishell/wav/test/S0902/BAC009S0902W0442.wav", "txt": "渝武高速武胜段发生追尾事故已造成六死九伤"} -{"key": "BAC009S0902W0443", "wav": "./aishell/wav/test/S0902/BAC009S0902W0443.wav", "txt": "记者从广安消防部门获悉"} -{"key": "BAC009S0902W0444", "wav": "./aishell/wav/test/S0902/BAC009S0902W0444.wav", "txt": "大客车的车头和车身损毁严重"} -{"key": "BAC009S0902W0445", "wav": "./aishell/wav/test/S0902/BAC009S0902W0445.wav", "txt": "车辆载有数十名乘客"} -{"key": "BAC009S0902W0446", "wav": "./aishell/wav/test/S0902/BAC009S0902W0446.wav", "txt": "截至九点四零分消防人员撤离时"} -{"key": "BAC009S0902W0447", "wav": "./aishell/wav/test/S0902/BAC009S0902W0447.wav", "txt": "已造成六人死亡九人受伤"} -{"key": "BAC009S0902W0448", "wav": "./aishell/wav/test/S0902/BAC009S0902W0448.wav", "txt": 
"目前记者正赶往武胜县人民医院"} -{"key": "BAC009S0902W0449", "wav": "./aishell/wav/test/S0902/BAC009S0902W0449.wav", "txt": "渝蓉高速四川段计划明年通车被称最拖沓高速"} -{"key": "BAC009S0902W0450", "wav": "./aishell/wav/test/S0902/BAC009S0902W0450.wav", "txt": "渝蓉高速四川段因烂尾被称为最拖沓高速"} -{"key": "BAC009S0902W0451", "wav": "./aishell/wav/test/S0902/BAC009S0902W0451.wav", "txt": "渝蓉高速四川段资金断裂烂尾已修了六年"} -{"key": "BAC009S0902W0452", "wav": "./aishell/wav/test/S0902/BAC009S0902W0452.wav", "txt": "渣土车右转弯骑车男童被卷入车轮下不幸身亡"} -{"key": "BAC009S0902W0453", "wav": "./aishell/wav/test/S0902/BAC009S0902W0453.wav", "txt": "肇事车及损伤严重的自行车报料人供图"} -{"key": "BAC009S0902W0454", "wav": "./aishell/wav/test/S0902/BAC009S0902W0454.wav", "txt": "渣土车挂倒电动车致一死一伤肇事车主逃逸"} -{"key": "BAC009S0902W0455", "wav": "./aishell/wav/test/S0902/BAC009S0902W0455.wav", "txt": "蚌飞市发生一起惨剧"} -{"key": "BAC009S0902W0456", "wav": "./aishell/wav/test/S0902/BAC009S0902W0456.wav", "txt": "一对男女骑电动车在通过一个十字路口时"} -{"key": "BAC009S0902W0457", "wav": "./aishell/wav/test/S0902/BAC009S0902W0457.wav", "txt": "被同方向行驶的一辆渣土车挂倒"} -{"key": "BAC009S0902W0458", "wav": "./aishell/wav/test/S0902/BAC009S0902W0458.wav", "txt": "骑电动车男子当场死亡"} -{"key": "BAC009S0902W0459", "wav": "./aishell/wav/test/S0902/BAC009S0902W0459.wav", "txt": "但渣土车司机肇事后不仅没有下车救援"} -{"key": "BAC009S0902W0460", "wav": "./aishell/wav/test/S0902/BAC009S0902W0460.wav", "txt": "目前当地警方正在追查这名司机"} -{"key": "BAC009S0902W0461", "wav": "./aishell/wav/test/S0902/BAC009S0902W0461.wav", "txt": "渣土车撞进路边民房女子抱小孩幸运逃生"} -{"key": "BAC009S0902W0462", "wav": "./aishell/wav/test/S0902/BAC009S0902W0462.wav", "txt": "山水湾小区斜对面的一处工地旁"} -{"key": "BAC009S0902W0463", "wav": "./aishell/wav/test/S0902/BAC009S0902W0463.wav", "txt": "肇事的大卡车车头仍然卡在工房内"} -{"key": "BAC009S0902W0464", "wav": "./aishell/wav/test/S0902/BAC009S0902W0464.wav", "txt": "图记者陈斌潇湘晨报长沙讯一零月一一日下午"} -{"key": "BAC009S0902W0465", "wav": "./aishell/wav/test/S0902/BAC009S0902W0465.wav", "txt": "长沙县湘龙西路一处十字路口"} -{"key": "BAC009S0902W0466", "wav": "./aishell/wav/test/S0902/BAC009S0902W0466.wav", "txt": "一辆红色的卡车和一辆黄色的渣土车发生碰撞"} -{"key": "BAC009S0902W0467", "wav": "./aishell/wav/test/S0902/BAC009S0902W0467.wav", "txt": "黄色渣土车一头撞进了路边的工房"} -{"key": "BAC009S0902W0468", "wav": "./aishell/wav/test/S0902/BAC009S0902W0468.wav", "txt": "被撞废的奔驰昨日二二时左右"} -{"key": "BAC009S0902W0469", "wav": "./aishell/wav/test/S0902/BAC009S0902W0469.wav", "txt": "省城政务区习友路与怀宁路交叉口"} -{"key": "BAC009S0902W0470", "wav": "./aishell/wav/test/S0902/BAC009S0902W0470.wav", "txt": "一辆渣土车突然冲向逆向车道"} -{"key": "BAC009S0902W0471", "wav": "./aishell/wav/test/S0902/BAC009S0902W0471.wav", "txt": "连续撞了五辆小轿车最终才停了下来"} -{"key": "BAC009S0902W0472", "wav": "./aishell/wav/test/S0902/BAC009S0902W0472.wav", "txt": "其中一辆奔驰轿车被撞出近一零米远"} -{"key": "BAC009S0902W0473", "wav": "./aishell/wav/test/S0902/BAC009S0902W0473.wav", "txt": "渤海一渔船沉没船上一六人落水一二人失踪"} -{"key": "BAC009S0902W0474", "wav": "./aishell/wav/test/S0902/BAC009S0902W0474.wav", "txt": "唐山乐亭一船队在渤海与一山东渔船发生纠纷"} -{"key": "BAC009S0902W0475", "wav": "./aishell/wav/test/S0902/BAC009S0902W0475.wav", "txt": "导致唐山一渔船沉没"} -{"key": "BAC009S0902W0476", "wav": "./aishell/wav/test/S0902/BAC009S0902W0476.wav", "txt": "但因海上风大浪急影响救援"} -{"key": "BAC009S0902W0477", "wav": "./aishell/wav/test/S0902/BAC009S0902W0477.wav", "txt": "目前仍未发现失踪船员"} -{"key": "BAC009S0902W0478", "wav": "./aishell/wav/test/S0902/BAC009S0902W0478.wav", "txt": "渤海垃圾成堆变死海"} -{"key": "BAC009S0902W0479", "wav": "./aishell/wav/test/S0902/BAC009S0902W0479.wav", "txt": "与韩国西海相连的中国渤海湾由于垃圾堆积"} -{"key": "BAC009S0902W0480", "wav": 
"./aishell/wav/test/S0902/BAC009S0902W0480.wav", "txt": "有人忧虑渤海湾的污染会直接影响到韩国西部海域"} -{"key": "BAC009S0902W0481", "wav": "./aishell/wav/test/S0902/BAC009S0902W0481.wav", "txt": "渤海失事河北籍渔船已致四人遇难仍有八人失踪"} -{"key": "BAC009S0902W0482", "wav": "./aishell/wav/test/S0902/BAC009S0902W0482.wav", "txt": "又在船仓内发现四名船员遗体"} -{"key": "BAC009S0902W0483", "wav": "./aishell/wav/test/S0902/BAC009S0902W0483.wav", "txt": "目前仍有八名失踪人员下落不明"} -{"key": "BAC009S0902W0484", "wav": "./aishell/wav/test/S0902/BAC009S0902W0484.wav", "txt": "渤海湾溢油事故赔偿案宣判康菲公司被判赔一六八万"} -{"key": "BAC009S0902W0485", "wav": "./aishell/wav/test/S0902/BAC009S0902W0485.wav", "txt": "温岭倒塌厂房系违章建筑涉事负责人已被控制"} -{"key": "BAC009S0902W0486", "wav": "./aishell/wav/test/S0902/BAC009S0902W0486.wav", "txt": "据新华社电七月四日一六时许"} -{"key": "BAC009S0902W0487", "wav": "./aishell/wav/test/S0902/BAC009S0902W0487.wav", "txt": "浙江温岭市大溪镇发生鞋厂厂房倒塌事故"} -{"key": "BAC009S0902W0488", "wav": "./aishell/wav/test/S0902/BAC009S0902W0488.wav", "txt": "共造成一四人死亡三三人受伤"} -{"key": "BAC009S0902W0489", "wav": "./aishell/wav/test/S0902/BAC009S0902W0489.wav", "txt": "事故厂房系违章建筑"} -{"key": "BAC009S0902W0490", "wav": "./aishell/wav/test/S0902/BAC009S0902W0490.wav", "txt": "此前已被列入拆除范围"} -{"key": "BAC009S0902W0491", "wav": "./aishell/wav/test/S0902/BAC009S0902W0491.wav", "txt": "涉事两企业负责人均已被控制"} -{"key": "BAC009S0902W0492", "wav": "./aishell/wav/test/S0902/BAC009S0902W0492.wav", "txt": "温岭医院助理殴打女病人五年后提拔为副院长"} -{"key": "BAC009S0902W0493", "wav": "./aishell/wav/test/S0902/BAC009S0902W0493.wav", "txt": "法晚深度即时记者杜雯雯实习生张明明近日"} -{"key": "BAC009S0902W0494", "wav": "./aishell/wav/test/S0902/BAC009S0902W0494.wav", "txt": "此关于滕灵方此后晋升为副院长一事"} -{"key": "BAC009S0902W0495", "wav": "./aishell/wav/test/S0902/BAC009S0902W0495.wav", "txt": "该医院党委书记杨幼萍向晚报记者表示"} -{"key": "BAC009S0903W0121", "wav": "./aishell/wav/test/S0903/BAC009S0903W0121.wav", "txt": "进而选择此类方式购房"} -{"key": "BAC009S0903W0122", "wav": "./aishell/wav/test/S0903/BAC009S0903W0122.wav", "txt": "另外要处理公积金异地使用的问题"} -{"key": "BAC009S0903W0123", "wav": "./aishell/wav/test/S0903/BAC009S0903W0123.wav", "txt": "这对于目前一线城市来说很紧要"} -{"key": "BAC009S0903W0124", "wav": "./aishell/wav/test/S0903/BAC009S0903W0124.wav", "txt": "很多人受限购政策的影响"} -{"key": "BAC009S0903W0125", "wav": "./aishell/wav/test/S0903/BAC009S0903W0125.wav", "txt": "难以在周边城市用公积金购房"} -{"key": "BAC009S0903W0126", "wav": "./aishell/wav/test/S0903/BAC009S0903W0126.wav", "txt": "导致公积金资源闲置的问题出现"} -{"key": "BAC009S0903W0127", "wav": "./aishell/wav/test/S0903/BAC009S0903W0127.wav", "txt": "中新网房产频道"} -{"key": "BAC009S0903W0128", "wav": "./aishell/wav/test/S0903/BAC009S0903W0128.wav", "txt": "随着广州住房公积金贷款政策的调整实施"} -{"key": "BAC009S0903W0129", "wav": "./aishell/wav/test/S0903/BAC009S0903W0129.wav", "txt": "北上广深四个一线城市已经全部放开公积金房贷业"} -{"key": "BAC009S0903W0130", "wav": "./aishell/wav/test/S0903/BAC009S0903W0130.wav", "txt": "公积金新政加速楼市库存消化至搜狐财经"} -{"key": "BAC009S0903W0131", "wav": "./aishell/wav/test/S0903/BAC009S0903W0131.wav", "txt": "住建部等三部委联合发文"} -{"key": "BAC009S0903W0132", "wav": "./aishell/wav/test/S0903/BAC009S0903W0132.wav", "txt": "再次降低公积金贷款的门槛"} -{"key": "BAC009S0903W0133", "wav": "./aishell/wav/test/S0903/BAC009S0903W0133.wav", "txt": "还清首套房公积金贷款"} -{"key": "BAC009S0903W0134", "wav": "./aishell/wav/test/S0903/BAC009S0903W0134.wav", "txt": "再次申请公积金贷款购买第二套房的"} -{"key": "BAC009S0903W0135", "wav": "./aishell/wav/test/S0903/BAC009S0903W0135.wav", "txt": "该政策延续了去年新政以来"} -{"key": "BAC009S0903W0136", "wav": "./aishell/wav/test/S0903/BAC009S0903W0136.wav", "txt": "也延续了公积金担当扶持楼市主力军的政策选择"} -{"key": "BAC009S0903W0137", 
"wav": "./aishell/wav/test/S0903/BAC009S0903W0137.wav", "txt": "从去年三部委发文"} -{"key": "BAC009S0903W0138", "wav": "./aishell/wav/test/S0903/BAC009S0903W0138.wav", "txt": "公积金对楼市的扶持力度不断加大"} -{"key": "BAC009S0903W0139", "wav": "./aishell/wav/test/S0903/BAC009S0903W0139.wav", "txt": "相继有一百多个城市出台了公积金新政"} -{"key": "BAC009S0903W0140", "wav": "./aishell/wav/test/S0903/BAC009S0903W0140.wav", "txt": "公积金贷款利率也数次下调"} -{"key": "BAC009S0903W0141", "wav": "./aishell/wav/test/S0903/BAC009S0903W0141.wav", "txt": "二套还清十首套比例降至五成"} -{"key": "BAC009S0903W0142", "wav": "./aishell/wav/test/S0903/BAC009S0903W0142.wav", "txt": "与新政相比"} -{"key": "BAC009S0903W0143", "wav": "./aishell/wav/test/S0903/BAC009S0903W0143.wav", "txt": "目前公积金政策已经与去年等同了"} -{"key": "BAC009S0903W0144", "wav": "./aishell/wav/test/S0903/BAC009S0903W0144.wav", "txt": "此次公积金政策大力度调整"} -{"key": "BAC009S0903W0145", "wav": "./aishell/wav/test/S0903/BAC009S0903W0145.wav", "txt": "主要目的是通过激励改善型住房需求"} -{"key": "BAC009S0903W0146", "wav": "./aishell/wav/test/S0903/BAC009S0903W0146.wav", "txt": "实现三四线城市去库存"} -{"key": "BAC009S0903W0147", "wav": "./aishell/wav/test/S0903/BAC009S0903W0147.wav", "txt": "尽管全国商品房销售面积持续回升"} -{"key": "BAC009S0903W0148", "wav": "./aishell/wav/test/S0903/BAC009S0903W0148.wav", "txt": "但库存压力却难以缓减"} -{"key": "BAC009S0903W0149", "wav": "./aishell/wav/test/S0903/BAC009S0903W0149.wav", "txt": "全国商品房待售面积比七月末增加了五百万平方米"} -{"key": "BAC009S0903W0150", "wav": "./aishell/wav/test/S0903/BAC009S0903W0150.wav", "txt": "比去年底增加了七万平方米"} -{"key": "BAC009S0903W0151", "wav": "./aishell/wav/test/S0903/BAC009S0903W0151.wav", "txt": "库存逆势攀升的根本原因在于供求错配"} -{"key": "BAC009S0903W0152", "wav": "./aishell/wav/test/S0903/BAC009S0903W0152.wav", "txt": "推动全国成交面积止跌反弹"} -{"key": "BAC009S0903W0153", "wav": "./aishell/wav/test/S0903/BAC009S0903W0153.wav", "txt": "但供应和库存却主要集中在七个三四线城市"} -{"key": "BAC009S0903W0154", "wav": "./aishell/wav/test/S0903/BAC009S0903W0154.wav", "txt": "且待售库存单套面积较大"} -{"key": "BAC009S0903W0155", "wav": "./aishell/wav/test/S0903/BAC009S0903W0155.wav", "txt": "无论是降低二套房公积金首付比例"} -{"key": "BAC009S0903W0156", "wav": "./aishell/wav/test/S0903/BAC009S0903W0156.wav", "txt": "还是不再区分普通和非普通住房"} -{"key": "BAC009S0903W0157", "wav": "./aishell/wav/test/S0903/BAC009S0903W0157.wav", "txt": "都意在有针对性地加大三四线城市楼市库存消化力度"} -{"key": "BAC009S0903W0158", "wav": "./aishell/wav/test/S0903/BAC009S0903W0158.wav", "txt": "只有楼市库存真正消化了"} -{"key": "BAC009S0903W0159", "wav": "./aishell/wav/test/S0903/BAC009S0903W0159.wav", "txt": "才能提振开发商拿地和开工的积极性"} -{"key": "BAC009S0903W0160", "wav": "./aishell/wav/test/S0903/BAC009S0903W0160.wav", "txt": "在公积金利率已降至历史低位"} -{"key": "BAC009S0903W0161", "wav": "./aishell/wav/test/S0903/BAC009S0903W0161.wav", "txt": "站在金九银十即将来临的起点上"} -{"key": "BAC009S0903W0162", "wav": "./aishell/wav/test/S0903/BAC009S0903W0162.wav", "txt": "再次降低公积金贷款首付比例"} -{"key": "BAC009S0903W0163", "wav": "./aishell/wav/test/S0903/BAC009S0903W0163.wav", "txt": "目的也是为了夯实楼市回升的基础"} -{"key": "BAC009S0903W0164", "wav": "./aishell/wav/test/S0903/BAC009S0903W0164.wav", "txt": "尽管去年新政以来"} -{"key": "BAC009S0903W0165", "wav": "./aishell/wav/test/S0903/BAC009S0903W0165.wav", "txt": "楼市持续三个季度回升"} -{"key": "BAC009S0903W0166", "wav": "./aishell/wav/test/S0903/BAC009S0903W0166.wav", "txt": "回升势头有转弱的迹象"} -{"key": "BAC009S0903W0167", "wav": "./aishell/wav/test/S0903/BAC009S0903W0167.wav", "txt": "首先是重点城市楼市成交回落趋势明显"} -{"key": "BAC009S0903W0168", "wav": "./aishell/wav/test/S0903/BAC009S0903W0168.wav", "txt": "领头羊一线城市分别下降百分之一和百分之七"} -{"key": "BAC009S0903W0169", "wav": 
"./aishell/wav/test/S0903/BAC009S0903W0169.wav", "txt": "而重点城市的供应也在七月份下滑了百分之七"} -{"key": "BAC009S0903W0170", "wav": "./aishell/wav/test/S0903/BAC009S0903W0170.wav", "txt": "五月份更是增加了七百万平方米"} -{"key": "BAC009S0903W0171", "wav": "./aishell/wav/test/S0903/BAC009S0903W0171.wav", "txt": "银行房贷额度开始紧张"} -{"key": "BAC009S0903W0172", "wav": "./aishell/wav/test/S0903/BAC009S0903W0172.wav", "txt": "首套房贷利润优惠也开始减少"} -{"key": "BAC009S0903W0173", "wav": "./aishell/wav/test/S0903/BAC009S0903W0173.wav", "txt": "近期人民币贬值叠加资本外流预期"} -{"key": "BAC009S0903W0174", "wav": "./aishell/wav/test/S0903/BAC009S0903W0174.wav", "txt": "资金面紧张对楼市的影响开始显现"} -{"key": "BAC009S0903W0175", "wav": "./aishell/wav/test/S0903/BAC009S0903W0175.wav", "txt": "市场对金九银十的预期也开始谨慎起来"} -{"key": "BAC009S0903W0176", "wav": "./aishell/wav/test/S0903/BAC009S0903W0176.wav", "txt": "除了去库存和夯实楼市回升基础外"} -{"key": "BAC009S0903W0177", "wav": "./aishell/wav/test/S0903/BAC009S0903W0177.wav", "txt": "此次公积金政策调整"} -{"key": "BAC009S0903W0178", "wav": "./aishell/wav/test/S0903/BAC009S0903W0178.wav", "txt": "也在于全面落实分类调控因城施策"} -{"key": "BAC009S0903W0179", "wav": "./aishell/wav/test/S0903/BAC009S0903W0179.wav", "txt": "纠偏政策一刀切的负面影响"} -{"key": "BAC009S0903W0180", "wav": "./aishell/wav/test/S0903/BAC009S0903W0180.wav", "txt": "去年新政以来"} -{"key": "BAC009S0903W0181", "wav": "./aishell/wav/test/S0903/BAC009S0903W0181.wav", "txt": "松绑二套房贷认定标准降低二套房贷首付比例"} -{"key": "BAC009S0903W0182", "wav": "./aishell/wav/test/S0903/BAC009S0903W0182.wav", "txt": "以及营业税免征期"} -{"key": "BAC009S0903W0183", "wav": "./aishell/wav/test/S0903/BAC009S0903W0183.wav", "txt": "第一次在公积金上提出差别对待"} -{"key": "BAC009S0903W0184", "wav": "./aishell/wav/test/S0903/BAC009S0903W0184.wav", "txt": "包括上海广州厦门南京在内的重点城市"} -{"key": "BAC009S0903W0185", "wav": "./aishell/wav/test/S0903/BAC009S0903W0185.wav", "txt": "以及前几次公积金新政的实施"} -{"key": "BAC009S0903W0186", "wav": "./aishell/wav/test/S0903/BAC009S0903W0186.wav", "txt": "公积金可贷额度受到严重冲击"} -{"key": "BAC009S0903W0187", "wav": "./aishell/wav/test/S0903/BAC009S0903W0187.wav", "txt": "着力解决一批影响现代农业发展全局的重大科技问题"} -{"key": "BAC009S0903W0188", "wav": "./aishell/wav/test/S0903/BAC009S0903W0188.wav", "txt": "加快农业技术引进消化吸收再创新步伐"} -{"key": "BAC009S0903W0189", "wav": "./aishell/wav/test/S0903/BAC009S0903W0189.wav", "txt": "加强农业科技领域国际合作"} -{"key": "BAC009S0903W0190", "wav": "./aishell/wav/test/S0903/BAC009S0903W0190.wav", "txt": "调整优化农业科研布局"} -{"key": "BAC009S0903W0191", "wav": "./aishell/wav/test/S0903/BAC009S0903W0191.wav", "txt": "加强农业科研基地和重点实验室建设"} -{"key": "BAC009S0903W0192", "wav": "./aishell/wav/test/S0903/BAC009S0903W0192.wav", "txt": "完善农业科技创新体系和现代农业产业技术体系"} -{"key": "BAC009S0903W0193", "wav": "./aishell/wav/test/S0903/BAC009S0903W0193.wav", "txt": "启动实施农业科技创新能力建设工程"} -{"key": "BAC009S0903W0194", "wav": "./aishell/wav/test/S0903/BAC009S0903W0194.wav", "txt": "组建一批产业技术创新战略联盟和国家农业科技园区"} -{"key": "BAC009S0903W0195", "wav": "./aishell/wav/test/S0903/BAC009S0903W0195.wav", "txt": "完善农业科技评价机制"} -{"key": "BAC009S0903W0196", "wav": "./aishell/wav/test/S0903/BAC009S0903W0196.wav", "txt": "激发农业科技创新活力"} -{"key": "BAC009S0903W0197", "wav": "./aishell/wav/test/S0903/BAC009S0903W0197.wav", "txt": "大力发展现代农作物种业"} -{"key": "BAC009S0903W0198", "wav": "./aishell/wav/test/S0903/BAC009S0903W0198.wav", "txt": "实施好转基因生物新品种培育重大专项"} -{"key": "BAC009S0903W0199", "wav": "./aishell/wav/test/S0903/BAC009S0903W0199.wav", "txt": "加快发展生物育种战略性新兴产业"} -{"key": "BAC009S0903W0200", "wav": "./aishell/wav/test/S0903/BAC009S0903W0200.wav", "txt": "加快农业新品种新技术转化应用"} -{"key": "BAC009S0903W0201", "wav": 
"./aishell/wav/test/S0903/BAC009S0903W0201.wav", "txt": "加强小麦一喷三防喷施叶面肥"} -{"key": "BAC009S0903W0202", "wav": "./aishell/wav/test/S0903/BAC009S0903W0202.wav", "txt": "加快牲畜水产遗传改良进程"} -{"key": "BAC009S0903W0203", "wav": "./aishell/wav/test/S0903/BAC009S0903W0203.wav", "txt": "创新农业技术推广机制"} -{"key": "BAC009S0903W0204", "wav": "./aishell/wav/test/S0903/BAC009S0903W0204.wav", "txt": "大规模开展高产创建"} -{"key": "BAC009S0903W0205", "wav": "./aishell/wav/test/S0903/BAC009S0903W0205.wav", "txt": "在有条件地区实行整乡整县场推进"} -{"key": "BAC009S0903W0206", "wav": "./aishell/wav/test/S0903/BAC009S0903W0206.wav", "txt": "力争实现优势产区和主要品种全复盖"} -{"key": "BAC009S0903W0207", "wav": "./aishell/wav/test/S0903/BAC009S0903W0207.wav", "txt": "壮大农业农村人才队伍"} -{"key": "BAC009S0903W0208", "wav": "./aishell/wav/test/S0903/BAC009S0903W0208.wav", "txt": "以实施现代农业人才支撑计划为抓手"} -{"key": "BAC009S0903W0209", "wav": "./aishell/wav/test/S0903/BAC009S0903W0209.wav", "txt": "加大农村劳动力培训阳光工程实施力度"} -{"key": "BAC009S0903W0210", "wav": "./aishell/wav/test/S0903/BAC009S0903W0210.wav", "txt": "大力发展农业职业培养"} -{"key": "BAC009S0903W0211", "wav": "./aishell/wav/test/S0903/BAC009S0903W0211.wav", "txt": "加快技能型人才培养"} -{"key": "BAC009S0903W0212", "wav": "./aishell/wav/test/S0903/BAC009S0903W0212.wav", "txt": "支持高校毕业生和各类优秀人才投身现代农业建设"} -{"key": "BAC009S0903W0213", "wav": "./aishell/wav/test/S0903/BAC009S0903W0213.wav", "txt": "鼓励外出务工农农民带技术带资金回乡创业"} -{"key": "BAC009S0903W0214", "wav": "./aishell/wav/test/S0903/BAC009S0903W0214.wav", "txt": "改善农业基础设备和装备条件"} -{"key": "BAC009S0903W0215", "wav": "./aishell/wav/test/S0903/BAC009S0903W0215.wav", "txt": "大规模开展高标准农田建设"} -{"key": "BAC009S0903W0216", "wav": "./aishell/wav/test/S0903/BAC009S0903W0216.wav", "txt": "按照统筹规划分工协作集中投入连片推进的思想"} -{"key": "BAC009S0903W0217", "wav": "./aishell/wav/test/S0903/BAC009S0903W0217.wav", "txt": "大规模改造中低产田"} -{"key": "BAC009S0903W0218", "wav": "./aishell/wav/test/S0903/BAC009S0903W0218.wav", "txt": "建设旱涝保收高标准农田"} -{"key": "BAC009S0903W0219", "wav": "./aishell/wav/test/S0903/BAC009S0903W0219.wav", "txt": "加快大中型灌区排灌泵站配套改造"} -{"key": "BAC009S0903W0220", "wav": "./aishell/wav/test/S0903/BAC009S0903W0220.wav", "txt": "大力开展小型农田水利建设"} -{"key": "BAC009S0903W0221", "wav": "./aishell/wav/test/S0903/BAC009S0903W0221.wav", "txt": "增加农田有效灌溉面积"} -{"key": "BAC009S0903W0222", "wav": "./aishell/wav/test/S0903/BAC009S0903W0222.wav", "txt": "加强新增千亿斤粮食生产能力规划的田间工程建设"} -{"key": "BAC009S0903W0223", "wav": "./aishell/wav/test/S0903/BAC009S0903W0223.wav", "txt": "完善机耕道农田防护林等设施"} -{"key": "BAC009S0903W0224", "wav": "./aishell/wav/test/S0903/BAC009S0903W0224.wav", "txt": "推广土壤有机质提升测土配方施肥等培肥地力技术"} -{"key": "BAC009S0903W0225", "wav": "./aishell/wav/test/S0903/BAC009S0903W0225.wav", "txt": "完善高标准农田建后管护支持政策和制度"} -{"key": "BAC009S0903W0226", "wav": "./aishell/wav/test/S0903/BAC009S0903W0226.wav", "txt": "延长各类设施使用年限"} -{"key": "BAC009S0903W0227", "wav": "./aishell/wav/test/S0903/BAC009S0903W0227.wav", "txt": "确保农田综合生产能力长期持续稳定提升"} -{"key": "BAC009S0903W0228", "wav": "./aishell/wav/test/S0903/BAC009S0903W0228.wav", "txt": "改善养殖业生产条件"} -{"key": "BAC009S0903W0229", "wav": "./aishell/wav/test/S0903/BAC009S0903W0229.wav", "txt": "加快实施生禽良种工程"} -{"key": "BAC009S0903W0230", "wav": "./aishell/wav/test/S0903/BAC009S0903W0230.wav", "txt": "支持生禽规模化养殖场小区开展标准化改造和建设"} -{"key": "BAC009S0903W0231", "wav": "./aishell/wav/test/S0903/BAC009S0903W0231.wav", "txt": "加快草原围栏棚圈和牧区水利建设"} -{"key": "BAC009S0903W0232", "wav": "./aishell/wav/test/S0903/BAC009S0903W0232.wav", "txt": "配套发展节水高效灌溉词草基地"} -{"key": "BAC009S0903W0233", "wav": 
"./aishell/wav/test/S0903/BAC009S0903W0233.wav", "txt": "健全水产良良种体系"} -{"key": "BAC009S0903W0234", "wav": "./aishell/wav/test/S0903/BAC009S0903W0234.wav", "txt": "开展池塘标准化改造"} -{"key": "BAC009S0903W0235", "wav": "./aishell/wav/test/S0903/BAC009S0903W0235.wav", "txt": "建设水产健康养殖示范场"} -{"key": "BAC009S0903W0236", "wav": "./aishell/wav/test/S0903/BAC009S0903W0236.wav", "txt": "加强渔港和渔政执法能力建设"} -{"key": "BAC009S0903W0237", "wav": "./aishell/wav/test/S0903/BAC009S0903W0237.wav", "txt": "全面落实农机具购置补贴各项管理制度和规定"} -{"key": "BAC009S0903W0238", "wav": "./aishell/wav/test/S0903/BAC009S0903W0238.wav", "txt": "加快推进水稻栽插收获和玉米收获机械化"} -{"key": "BAC009S0903W0239", "wav": "./aishell/wav/test/S0903/BAC009S0903W0239.wav", "txt": "重点突破棉花油菜甘蔗收获机械化瓶颈"} -{"key": "BAC009S0903W0240", "wav": "./aishell/wav/test/S0903/BAC009S0903W0240.wav", "txt": "大力发展高效植保机器"} -{"key": "BAC009S0903W0241", "wav": "./aishell/wav/test/S0903/BAC009S0903W0241.wav", "txt": "积极推进养殖业园艺业农产品初加工机械化"} -{"key": "BAC009S0903W0242", "wav": "./aishell/wav/test/S0903/BAC009S0903W0242.wav", "txt": "加快实施保护性耕作工程"} -{"key": "BAC009S0903W0243", "wav": "./aishell/wav/test/S0903/BAC009S0903W0243.wav", "txt": "提高大型农机具和农药化肥农膜等农资生产水平"} -{"key": "BAC009S0903W0244", "wav": "./aishell/wav/test/S0903/BAC009S0903W0244.wav", "txt": "加强农业防灾减灾能力建设"} -{"key": "BAC009S0903W0245", "wav": "./aishell/wav/test/S0903/BAC009S0903W0245.wav", "txt": "提高防汛抗旱减灾能力"} -{"key": "BAC009S0903W0246", "wav": "./aishell/wav/test/S0903/BAC009S0903W0246.wav", "txt": "加强种子饲草料等急救灾物资储备调运条件建设"} -{"key": "BAC009S0903W0247", "wav": "./aishell/wav/test/S0903/BAC009S0903W0247.wav", "txt": "推广相应的生产技术和防灾减灾措施大力推进农业标准化"} -{"key": "BAC009S0903W0248", "wav": "./aishell/wav/test/S0903/BAC009S0903W0248.wav", "txt": "以农兽药残留标准为重点"} -{"key": "BAC009S0903W0249", "wav": "./aishell/wav/test/S0903/BAC009S0903W0249.wav", "txt": "加快健全农业标准体系"} -{"key": "BAC009S0903W0250", "wav": "./aishell/wav/test/S0903/BAC009S0903W0250.wav", "txt": "以园艺产品生产品水产品等为重点"} -{"key": "BAC009S0903W0251", "wav": "./aishell/wav/test/S0903/BAC009S0903W0251.wav", "txt": "推行统一的标准操作规程和技术规范"} -{"key": "BAC009S0903W0252", "wav": "./aishell/wav/test/S0903/BAC009S0903W0252.wav", "txt": "加强国家级农业标准化整建制推进示范县场建设"} -{"key": "BAC009S0903W0253", "wav": "./aishell/wav/test/S0903/BAC009S0903W0253.wav", "txt": "市场占有率为百分之五"} -{"key": "BAC009S0903W0254", "wav": "./aishell/wav/test/S0903/BAC009S0903W0254.wav", "txt": "二零一四年三星期累计销售超过一百二十万块智能手表"} -{"key": "BAC009S0903W0255", "wav": "./aishell/wav/test/S0903/BAC009S0903W0255.wav", "txt": "这个数据不及苹果的一个季度"} -{"key": "BAC009S0903W0256", "wav": "./aishell/wav/test/S0903/BAC009S0903W0256.wav", "txt": "因此不能表示苹果没有新的业务增长点"} -{"key": "BAC009S0903W0257", "wav": "./aishell/wav/test/S0903/BAC009S0903W0257.wav", "txt": "本报记者纪佳鹏北京报道北京时间八月十二日"} -{"key": "BAC009S0903W0258", "wav": "./aishell/wav/test/S0903/BAC009S0903W0258.wav", "txt": "作为科技股领头羊的苹果股价当天下挫百分之二"} -{"key": "BAC009S0903W0259", "wav": "./aishell/wav/test/S0903/BAC009S0903W0259.wav", "txt": "十二月二日路透社报道"} -{"key": "BAC009S0903W0260", "wav": "./aishell/wav/test/S0903/BAC009S0903W0260.wav", "txt": "苹果股票每分钟交易量已超过六百七十万股"} -{"key": "BAC009S0903W0261", "wav": "./aishell/wav/test/S0903/BAC009S0903W0261.wav", "txt": "这种巨大且异乎寻常的抛售量"} -{"key": "BAC009S0903W0262", "wav": "./aishell/wav/test/S0903/BAC009S0903W0262.wav", "txt": "瞬间将苹果估价拉低了至少百分之六"} -{"key": "BAC009S0903W0263", "wav": "./aishell/wav/test/S0903/BAC009S0903W0263.wav", "txt": "使其市值分秒间蒸发近四百亿美元"} -{"key": "BAC009S0903W0264", "wav": "./aishell/wav/test/S0903/BAC009S0903W0264.wav", "txt": "成为苹果近三个月以来股价下跌最严重的一次"} -{"key": 
"BAC009S0903W0265", "wav": "./aishell/wav/test/S0903/BAC009S0903W0265.wav", "txt": "苹果股价一度每分钟跌幅已破百分之三"} -{"key": "BAC009S0903W0266", "wav": "./aishell/wav/test/S0903/BAC009S0903W0266.wav", "txt": "每股报价报收于一百一十一点二七美元"} -{"key": "BAC009S0903W0267", "wav": "./aishell/wav/test/S0903/BAC009S0903W0267.wav", "txt": "报收于每股一百一十五点四五美元"} -{"key": "BAC009S0903W0268", "wav": "./aishell/wav/test/S0903/BAC009S0903W0268.wav", "txt": "对于造成此次异常闪崩的原因目前尚未公布"} -{"key": "BAC009S0903W0269", "wav": "./aishell/wav/test/S0903/BAC009S0903W0269.wav", "txt": "此举或与摩根士丹利下调苹果股票持股比例有关"} -{"key": "BAC009S0903W0270", "wav": "./aishell/wav/test/S0903/BAC009S0903W0270.wav", "txt": "同时将苹果持股比例由百分之四下调至百分之三"} -{"key": "BAC009S0903W0271", "wav": "./aishell/wav/test/S0903/BAC009S0903W0271.wav", "txt": "并建议客户减少对该股票在投资组合中的占比"} -{"key": "BAC009S0903W0272", "wav": "./aishell/wav/test/S0903/BAC009S0903W0272.wav", "txt": "高频交易也与此次闪崩事件逃脱不了干系"} -{"key": "BAC009S0903W0273", "wav": "./aishell/wav/test/S0903/BAC009S0903W0273.wav", "txt": "高频交易一直饱受诟病"} -{"key": "BAC009S0903W0274", "wav": "./aishell/wav/test/S0903/BAC009S0903W0274.wav", "txt": "美国股市九点五十起"} -{"key": "BAC009S0903W0275", "wav": "./aishell/wav/test/S0903/BAC009S0903W0275.wav", "txt": "超过三百馀种不同类别股票均出现不正常股价波动"} -{"key": "BAC009S0903W0276", "wav": "./aishell/wav/test/S0903/BAC009S0903W0276.wav", "txt": "当出现此类价格变化时"} -{"key": "BAC009S0903W0277", "wav": "./aishell/wav/test/S0903/BAC009S0903W0277.wav", "txt": "通常只是算法交易造成的影响"} -{"key": "BAC009S0903W0278", "wav": "./aishell/wav/test/S0903/BAC009S0903W0278.wav", "txt": "也就是所说的流动性蒸发事实上"} -{"key": "BAC009S0903W0279", "wav": "./aishell/wav/test/S0903/BAC009S0903W0279.wav", "txt": "流动性从未得到足够的重视"} -{"key": "BAC009S0903W0280", "wav": "./aishell/wav/test/S0903/BAC009S0903W0280.wav", "txt": "我们当下的股市在流动性方面也表现得支离破碎"} -{"key": "BAC009S0903W0281", "wav": "./aishell/wav/test/S0903/BAC009S0903W0281.wav", "txt": "苹果领头的股价闪崩原因可能比想象中的更为复杂"} -{"key": "BAC009S0903W0282", "wav": "./aishell/wav/test/S0903/BAC009S0903W0282.wav", "txt": "现在就下结论将原因推给高频交易"} -{"key": "BAC009S0903W0283", "wav": "./aishell/wav/test/S0903/BAC009S0903W0283.wav", "txt": "这种做法很容易误导客服"} -{"key": "BAC009S0903W0284", "wav": "./aishell/wav/test/S0903/BAC009S0903W0284.wav", "txt": "阿里巴巴当日股价下跌一点百分之四"} -{"key": "BAC009S0903W0285", "wav": "./aishell/wav/test/S0903/BAC009S0903W0285.wav", "txt": "谷歌十点五八分股价也出现一点百分之七十九的最大跌幅"} -{"key": "BAC009S0903W0286", "wav": "./aishell/wav/test/S0903/BAC009S0903W0286.wav", "txt": "苹果股价闪崩只是正常股票套利的表现"} -{"key": "BAC009S0903W0287", "wav": "./aishell/wav/test/S0903/BAC009S0903W0287.wav", "txt": "苹果股价相较十月份低点已经上涨约百分之二十五"} -{"key": "BAC009S0903W0288", "wav": "./aishell/wav/test/S0903/BAC009S0903W0288.wav", "txt": "纳斯达克在此期间只涨了百分之十"} -{"key": "BAC009S0903W0289", "wav": "./aishell/wav/test/S0903/BAC009S0903W0289.wav", "txt": "选择套现或也是情理之中"} -{"key": "BAC009S0903W0290", "wav": "./aishell/wav/test/S0903/BAC009S0903W0290.wav", "txt": "每股下滑三点八八美元报收于一点一十五点零五美元"} -{"key": "BAC009S0903W0292", "wav": "./aishell/wav/test/S0903/BAC009S0903W0292.wav", "txt": "苹果股票每分钟交易量已超过六十七万股"} -{"key": "BAC009S0903W0293", "wav": "./aishell/wav/test/S0903/BAC009S0903W0293.wav", "txt": "这不仅创下苹果公司自二零一四年"} -{"key": "BAC009S0903W0294", "wav": "./aishell/wav/test/S0903/BAC009S0903W0294.wav", "txt": "苹果背后那行字应该在每个中国人心里搜狐科技"} -{"key": "BAC009S0903W0295", "wav": "./aishell/wav/test/S0903/BAC009S0903W0295.wav", "txt": "翻译过来就是加利福尼亚苹果公司设计"} -{"key": "BAC009S0903W0296", "wav": "./aishell/wav/test/S0903/BAC009S0903W0296.wav", "txt": "按说这只是一个客观表述"} -{"key": "BAC009S0903W0297", "wav": 
"./aishell/wav/test/S0903/BAC009S0903W0297.wav", "txt": "对于谋求转型发展怀揣创新型国家梦想的中国来说"} -{"key": "BAC009S0903W0298", "wav": "./aishell/wav/test/S0903/BAC009S0903W0298.wav", "txt": "这行字值得我们深思"} -{"key": "BAC009S0903W0299", "wav": "./aishell/wav/test/S0903/BAC009S0903W0299.wav", "txt": "众多跨国品牌在中国都有生产基地"} -{"key": "BAC009S0903W0300", "wav": "./aishell/wav/test/S0903/BAC009S0903W0300.wav", "txt": "像苹果这样在产品上强调在本国设计的很少"} -{"key": "BAC009S0903W0301", "wav": "./aishell/wav/test/S0903/BAC009S0903W0301.wav", "txt": "这样的做法当然是企业行为"} -{"key": "BAC009S0903W0302", "wav": "./aishell/wav/test/S0903/BAC009S0903W0302.wav", "txt": "这行字对于我们来说"} -{"key": "BAC009S0903W0303", "wav": "./aishell/wav/test/S0903/BAC009S0903W0303.wav", "txt": "很多家长都考虑给孩子配置具有定位功能的智能手机"} -{"key": "BAC009S0903W0304", "wav": "./aishell/wav/test/S0903/BAC009S0903W0304.wav", "txt": "智能手机特定的上网和游戏功能"} -{"key": "BAC009S0903W0305", "wav": "./aishell/wav/test/S0903/BAC009S0903W0305.wav", "txt": "注定了它强烈的娱乐性"} -{"key": "BAC009S0903W0306", "wav": "./aishell/wav/test/S0903/BAC009S0903W0306.wav", "txt": "给孩子配置智能手机"} -{"key": "BAC009S0903W0307", "wav": "./aishell/wav/test/S0903/BAC009S0903W0307.wav", "txt": "担心会直接影响孩子正常的学习"} -{"key": "BAC009S0903W0308", "wav": "./aishell/wav/test/S0903/BAC009S0903W0308.wav", "txt": "儿童电话手表除了通话定位等功能外"} -{"key": "BAC009S0903W0309", "wav": "./aishell/wav/test/S0903/BAC009S0903W0309.wav", "txt": "还针对性的设置了上课禁用等功能"} -{"key": "BAC009S0903W0310", "wav": "./aishell/wav/test/S0903/BAC009S0903W0310.wav", "txt": "孩子带到学校既不会让孩子分心"} -{"key": "BAC009S0903W0311", "wav": "./aishell/wav/test/S0903/BAC009S0903W0311.wav", "txt": "又可以让家长了解孩子的位置"} -{"key": "BAC009S0903W0312", "wav": "./aishell/wav/test/S0903/BAC009S0903W0312.wav", "txt": "是很多父母迫切需要的"} -{"key": "BAC009S0903W0313", "wav": "./aishell/wav/test/S0903/BAC009S0903W0313.wav", "txt": "对于小天才电话手表上课禁用功能"} -{"key": "BAC009S0903W0314", "wav": "./aishell/wav/test/S0903/BAC009S0903W0314.wav", "txt": "相关人员表示为了方便和孩子保持联系"} -{"key": "BAC009S0903W0315", "wav": "./aishell/wav/test/S0903/BAC009S0903W0315.wav", "txt": "之前很多家长会给孩子买手机"} -{"key": "BAC009S0903W0316", "wav": "./aishell/wav/test/S0903/BAC009S0903W0316.wav", "txt": "影响学习虽说功能手机可以阻止孩子玩游戏"} -{"key": "BAC009S0903W0317", "wav": "./aishell/wav/test/S0903/BAC009S0903W0317.wav", "txt": "儿童电话手表正好解决了这两个问题"} -{"key": "BAC009S0903W0318", "wav": "./aishell/wav/test/S0903/BAC009S0903W0318.wav", "txt": "家长随时和孩子保持联系"} -{"key": "BAC009S0903W0319", "wav": "./aishell/wav/test/S0903/BAC009S0903W0319.wav", "txt": "我就给自己的孩子也买了一个呢"} -{"key": "BAC009S0903W0320", "wav": "./aishell/wav/test/S0903/BAC009S0903W0320.wav", "txt": "失孤等影片的上映"} -{"key": "BAC009S0903W0321", "wav": "./aishell/wav/test/S0903/BAC009S0903W0321.wav", "txt": "也将儿童人身安全的话题推向了妙论的风口浪尖"} -{"key": "BAC009S0903W0322", "wav": "./aishell/wav/test/S0903/BAC009S0903W0322.wav", "txt": "儿童电话手表的诞生"} -{"key": "BAC009S0903W0323", "wav": "./aishell/wav/test/S0903/BAC009S0903W0323.wav", "txt": "为孩子多了一份强有力的保障"} -{"key": "BAC009S0903W0324", "wav": "./aishell/wav/test/S0903/BAC009S0903W0324.wav", "txt": "电话手表正是瞄准了这一需求"} -{"key": "BAC009S0903W0325", "wav": "./aishell/wav/test/S0903/BAC009S0903W0325.wav", "txt": "加上随身携带的便捷性和流畅的操作体验"} -{"key": "BAC009S0903W0326", "wav": "./aishell/wav/test/S0903/BAC009S0903W0326.wav", "txt": "在手机平板电脑之外"} -{"key": "BAC009S0903W0327", "wav": "./aishell/wav/test/S0903/BAC009S0903W0327.wav", "txt": "开扩了一个新的市场"} -{"key": "BAC009S0903W0328", "wav": "./aishell/wav/test/S0903/BAC009S0903W0328.wav", "txt": "现在三百六十腾讯等大公司都涉足了这一领域"} -{"key": "BAC009S0903W0329", "wav": 
"./aishell/wav/test/S0903/BAC009S0903W0329.wav", "txt": "自今年六月电话手表行业兴起起来"} -{"key": "BAC009S0903W0330", "wav": "./aishell/wav/test/S0903/BAC009S0903W0330.wav", "txt": "整体行业出货量应该不断突破"} -{"key": "BAC009S0903W0331", "wav": "./aishell/wav/test/S0903/BAC009S0903W0331.wav", "txt": "并将成为新兴的销售热点"} -{"key": "BAC009S0903W0332", "wav": "./aishell/wav/test/S0903/BAC009S0903W0332.wav", "txt": "科技创新带动了电话手表行业"} -{"key": "BAC009S0903W0333", "wav": "./aishell/wav/test/S0903/BAC009S0903W0333.wav", "txt": "其实儿童电话手表的火"} -{"key": "BAC009S0903W0334", "wav": "./aishell/wav/test/S0903/BAC009S0903W0334.wav", "txt": "是火在行业的科技创新"} -{"key": "BAC009S0903W0335", "wav": "./aishell/wav/test/S0903/BAC009S0903W0335.wav", "txt": "随着国家在科技创新方面的投入和关注度的增加"} -{"key": "BAC009S0903W0336", "wav": "./aishell/wav/test/S0903/BAC009S0903W0336.wav", "txt": "新兴行业对于创新的热情也不断增加"} -{"key": "BAC009S0903W0337", "wav": "./aishell/wav/test/S0903/BAC009S0903W0337.wav", "txt": "我们小天才电话手表就是不断创新的成果"} -{"key": "BAC009S0903W0338", "wav": "./aishell/wav/test/S0903/BAC009S0903W0338.wav", "txt": "意大利选手弗菜戈也说我们在来里约之前"} -{"key": "BAC009S0903W0339", "wav": "./aishell/wav/test/S0903/BAC009S0903W0339.wav", "txt": "看到了有关这里水污染的报道"} -{"key": "BAC009S0903W0340", "wav": "./aishell/wav/test/S0903/BAC009S0903W0340.wav", "txt": "对这里的水质比较关心"} -{"key": "BAC009S0903W0341", "wav": "./aishell/wav/test/S0903/BAC009S0903W0341.wav", "txt": "这个湖虽然没有漂浮的垃圾"} -{"key": "BAC009S0903W0342", "wav": "./aishell/wav/test/S0903/BAC009S0903W0342.wav", "txt": "但湖水很脏也很浑浊"} -{"key": "BAC009S0903W0343", "wav": "./aishell/wav/test/S0903/BAC009S0903W0343.wav", "txt": "里约奥组委此前表示"} -{"key": "BAC009S0903W0344", "wav": "./aishell/wav/test/S0903/BAC009S0903W0344.wav", "txt": "运动员的健康是他们关注的头等大事"} -{"key": "BAC009S0903W0345", "wav": "./aishell/wav/test/S0903/BAC009S0903W0345.wav", "txt": "无论帆船赛艇还是公开水域"} -{"key": "BAC009S0903W0346", "wav": "./aishell/wav/test/S0903/BAC009S0903W0346.wav", "txt": "在奥运期间水质都可以保证运动员的健康"} -{"key": "BAC009S0903W0347", "wav": "./aishell/wav/test/S0903/BAC009S0903W0347.wav", "txt": "二零一五年九月十二日星期六十一点"} -{"key": "BAC009S0903W0348", "wav": "./aishell/wav/test/S0903/BAC009S0903W0348.wav", "txt": "开幕式举行了庄严的入场仪式"} -{"key": "BAC009S0903W0349", "wav": "./aishell/wav/test/S0903/BAC009S0903W0349.wav", "txt": "裁判员队伍和参赛代表队依次入场亮相"} -{"key": "BAC009S0903W0350", "wav": "./aishell/wav/test/S0903/BAC009S0903W0350.wav", "txt": "裁判员代表和运动员代表进行了宣誓"} -{"key": "BAC009S0903W0351", "wav": "./aishell/wav/test/S0903/BAC009S0903W0351.wav", "txt": "曾春蕾和刘晓彤向各参赛队赠送了签名排球"} -{"key": "BAC009S0903W0352", "wav": "./aishell/wav/test/S0903/BAC009S0903W0352.wav", "txt": "北京市体育局副局长孙学才宣布比赛开幕"} -{"key": "BAC009S0903W0353", "wav": "./aishell/wav/test/S0903/BAC009S0903W0353.wav", "txt": "响应北京市振兴三大球战略的号召"} -{"key": "BAC009S0903W0354", "wav": "./aishell/wav/test/S0903/BAC009S0903W0354.wav", "txt": "促进北京排球事业发展"} -{"key": "BAC009S0903W0355", "wav": "./aishell/wav/test/S0903/BAC009S0903W0355.wav", "txt": "丰富北京市业馀排球群体活动"} -{"key": "BAC009S0903W0356", "wav": "./aishell/wav/test/S0903/BAC009S0903W0356.wav", "txt": "激发广大群众对排球的热情"} -{"key": "BAC009S0903W0357", "wav": "./aishell/wav/test/S0903/BAC009S0903W0357.wav", "txt": "为将其打造成具有影响力的群众性品牌赛事"} -{"key": "BAC009S0903W0358", "wav": "./aishell/wav/test/S0903/BAC009S0903W0358.wav", "txt": "在社会主义核心价值观的指引下"} -{"key": "BAC009S0903W0359", "wav": "./aishell/wav/test/S0903/BAC009S0903W0359.wav", "txt": "突出弘扬北京排球文化"} -{"key": "BAC009S0903W0360", "wav": "./aishell/wav/test/S0903/BAC009S0903W0360.wav", "txt": "组委会在部门设置上调整了人员分工"} -{"key": "BAC009S0903W0361", "wav": 
"./aishell/wav/test/S0903/BAC009S0903W0361.wav", "txt": "组委会工作机构共分为四部一室"} -{"key": "BAC009S0903W0362", "wav": "./aishell/wav/test/S0903/BAC009S0903W0362.wav", "txt": "并且全部采用有经验的工作人员参与竞赛组织工作"} -{"key": "BAC009S0903W0363", "wav": "./aishell/wav/test/S0903/BAC009S0903W0363.wav", "txt": "在制定竞赛规程方面严格遵循规范化专业化原则"} -{"key": "BAC009S0903W0364", "wav": "./aishell/wav/test/S0903/BAC009S0903W0364.wav", "txt": "不仅能够将业馀排球与职业排球严格地区分开"} -{"key": "BAC009S0903W0365", "wav": "./aishell/wav/test/S0903/BAC009S0903W0365.wav", "txt": "而且满足了绝大多数业馀排球爱好者的参赛需求"} -{"key": "BAC009S0903W0366", "wav": "./aishell/wav/test/S0903/BAC009S0903W0366.wav", "txt": "充分做到公平公正公开"} -{"key": "BAC009S0903W0367", "wav": "./aishell/wav/test/S0903/BAC009S0903W0367.wav", "txt": "其中国际级裁判员两名"} -{"key": "BAC009S0903W0368", "wav": "./aishell/wav/test/S0903/BAC009S0903W0368.wav", "txt": "结合业馀排球特点做出细微调整制定而成"} -{"key": "BAC009S0903W0369", "wav": "./aishell/wav/test/S0903/BAC009S0903W0369.wav", "txt": "营造出良好的比赛氛围"} -{"key": "BAC009S0903W0370", "wav": "./aishell/wav/test/S0903/BAC009S0903W0370.wav", "txt": "悬挂于场馆醒目位置"} -{"key": "BAC009S0903W0371", "wav": "./aishell/wav/test/S0903/BAC009S0903W0371.wav", "txt": "增强参赛者的荣誉感与积极性的同时"} -{"key": "BAC009S0903W0372", "wav": "./aishell/wav/test/S0903/BAC009S0903W0372.wav", "txt": "进一步提升了比赛品质"} -{"key": "BAC009S0903W0373", "wav": "./aishell/wav/test/S0903/BAC009S0903W0373.wav", "txt": "要将北京市业馀排球联赛打造成群众性品牌赛事"} -{"key": "BAC009S0903W0374", "wav": "./aishell/wav/test/S0903/BAC009S0903W0374.wav", "txt": "离不开广大媒体的支持"} -{"key": "BAC009S0903W0375", "wav": "./aishell/wav/test/S0903/BAC009S0903W0375.wav", "txt": "组委会特意举办隆重的开幕式"} -{"key": "BAC009S0903W0376", "wav": "./aishell/wav/test/S0903/BAC009S0903W0376.wav", "txt": "并邀请京城排球界全部媒体参加报道"} -{"key": "BAC009S0903W0377", "wav": "./aishell/wav/test/S0903/BAC009S0903W0377.wav", "txt": "并制作了精美的秩序册发给媒体及参赛队"} -{"key": "BAC009S0903W0379", "wav": "./aishell/wav/test/S0903/BAC009S0903W0379.wav", "txt": "以大球套小球为设计理念"} -{"key": "BAC009S0903W0380", "wav": "./aishell/wav/test/S0903/BAC009S0903W0380.wav", "txt": "为振兴三大球贡献自己的一份力量"} -{"key": "BAC009S0903W0381", "wav": "./aishell/wav/test/S0903/BAC009S0903W0381.wav", "txt": "他们的造型由排球的五个经典动作组成"} -{"key": "BAC009S0903W0382", "wav": "./aishell/wav/test/S0903/BAC009S0903W0382.wav", "txt": "分别是发接传垫扣"} -{"key": "BAC009S0903W0383", "wav": "./aishell/wav/test/S0903/BAC009S0903W0383.wav", "txt": "颜色则是由代表着运动精神的奥运五环色组成"} -{"key": "BAC009S0903W0384", "wav": "./aishell/wav/test/S0903/BAC009S0903W0384.wav", "txt": "来自全国各地的业馀排球爱好者纷纷前来踊跃报名"} -{"key": "BAC009S0903W0385", "wav": "./aishell/wav/test/S0903/BAC009S0903W0385.wav", "txt": "半个月的报名期限未到"} -{"key": "BAC009S0903W0386", "wav": "./aishell/wav/test/S0903/BAC009S0903W0386.wav", "txt": "二四个参赛名额就已经全部报满"} -{"key": "BAC009S0903W0387", "wav": "./aishell/wav/test/S0903/BAC009S0903W0387.wav", "txt": "共有三百二十三名业馀排球爱好者报名参加比赛"} -{"key": "BAC009S0903W0388", "wav": "./aishell/wav/test/S0903/BAC009S0903W0388.wav", "txt": "其中年龄最小的年仅十四岁"} -{"key": "BAC009S0903W0389", "wav": "./aishell/wav/test/S0903/BAC009S0903W0389.wav", "txt": "最大的已经年过半百"} -{"key": "BAC009S0903W0390", "wav": "./aishell/wav/test/S0903/BAC009S0903W0390.wav", "txt": "另外还有两名来自加拿大和美国的外籍华侨"} -{"key": "BAC009S0903W0391", "wav": "./aishell/wav/test/S0903/BAC009S0903W0391.wav", "txt": "由此可见北京市业馀排球联赛的影响力与号召力"} -{"key": "BAC009S0903W0392", "wav": "./aishell/wav/test/S0903/BAC009S0903W0392.wav", "txt": "在参赛的二四支队伍中"} -{"key": "BAC009S0903W0393", "wav": "./aishell/wav/test/S0903/BAC009S0903W0393.wav", "txt": "有一些临时组建的球队"} -{"key": "BAC009S0903W0394", "wav": 
"./aishell/wav/test/S0903/BAC009S0903W0394.wav", "txt": "但大部分都是常年活跃在业馀排球圈里成熟队球"} -{"key": "BAC009S0903W0395", "wav": "./aishell/wav/test/S0903/BAC009S0903W0395.wav", "txt": "而且多次参加过业馀排球比赛"} -{"key": "BAC009S0903W0396", "wav": "./aishell/wav/test/S0903/BAC009S0903W0396.wav", "txt": "相信有这些高水平业馀排球队的参与"} -{"key": "BAC009S0903W0397", "wav": "./aishell/wav/test/S0903/BAC009S0903W0397.wav", "txt": "这一届北京市业馀排球联赛一定会精彩纷呈"} -{"key": "BAC009S0903W0398", "wav": "./aishell/wav/test/S0903/BAC009S0903W0398.wav", "txt": "为期五天的比赛全部结束后"} -{"key": "BAC009S0903W0399", "wav": "./aishell/wav/test/S0903/BAC009S0903W0399.wav", "txt": "将举行隆重的颁奖仪式"} -{"key": "BAC009S0903W0400", "wav": "./aishell/wav/test/S0903/BAC009S0903W0400.wav", "txt": "从四分之一决赛开始每场比赛评选出一名优秀运动员"} -{"key": "BAC009S0903W0401", "wav": "./aishell/wav/test/S0903/BAC009S0903W0401.wav", "txt": "为参加联赛的吸引力"} -{"key": "BAC009S0903W0402", "wav": "./aishell/wav/test/S0903/BAC009S0903W0402.wav", "txt": "提升参赛队的积极性"} -{"key": "BAC009S0903W0403", "wav": "./aishell/wav/test/S0903/BAC009S0903W0403.wav", "txt": "组委会提高了前三名的含金量"} -{"key": "BAC009S0903W0404", "wav": "./aishell/wav/test/S0903/BAC009S0903W0404.wav", "txt": "这也是该片首次亮相大荧幕"} -{"key": "BAC009S0903W0405", "wav": "./aishell/wav/test/S0903/BAC009S0903W0405.wav", "txt": "影片的正式公映要到圣诞节当天"} -{"key": "BAC009S0903W0406", "wav": "./aishell/wav/test/S0903/BAC009S0903W0406.wav", "txt": "但本月评论界就可以知道该片的真实成色"} -{"key": "BAC009S0903W0407", "wav": "./aishell/wav/test/S0903/BAC009S0903W0407.wav", "txt": "曾在二零一零年获得空前成功"} -{"key": "BAC009S0903W0408", "wav": "./aishell/wav/test/S0903/BAC009S0903W0408.wav", "txt": "据香港媒体报道"} -{"key": "BAC009S0903W0409", "wav": "./aishell/wav/test/S0903/BAC009S0903W0409.wav", "txt": "因参演剧集殭而与陈嘉宝及赖慰玲成为好姐妹"} -{"key": "BAC009S0903W0410", "wav": "./aishell/wav/test/S0903/BAC009S0903W0410.wav", "txt": "众人一起为寿寿星女庆生"} -{"key": "BAC009S0903W0411", "wav": "./aishell/wav/test/S0903/BAC009S0903W0411.wav", "txt": "陈嘉宝昨天六月二十三日将大合照上传个人主页"} -{"key": "BAC009S0903W0412", "wav": "./aishell/wav/test/S0903/BAC009S0903W0412.wav", "txt": "除了看见陈嘉宝及赖慰玲外"} -{"key": "BAC009S0903W0413", "wav": "./aishell/wav/test/S0903/BAC009S0903W0413.wav", "txt": "亮点正是与陈凯琳互相了解中郑嘉颖也有出席"} -{"key": "BAC009S0903W0414", "wav": "./aishell/wav/test/S0903/BAC009S0903W0414.wav", "txt": "并做陈凯琳背后的男人"} -{"key": "BAC009S0903W0415", "wav": "./aishell/wav/test/S0903/BAC009S0903W0415.wav", "txt": "网友纷纷将焦点转移到这对情侣身上"} -{"key": "BAC009S0903W0416", "wav": "./aishell/wav/test/S0903/BAC009S0903W0416.wav", "txt": "中新网七月二十八日电据香港明报消息"} -{"key": "BAC009S0903W0417", "wav": "./aishell/wav/test/S0903/BAC009S0903W0417.wav", "txt": "陈凯琳田心妮等出席新剧开机机仪式"} -{"key": "BAC009S0903W0418", "wav": "./aishell/wav/test/S0903/BAC009S0903W0418.wav", "txt": "谈及此前她曾到横店探班郑嘉颖"} -{"key": "BAC009S0903W0419", "wav": "./aishell/wav/test/S0903/BAC009S0903W0419.wav", "txt": "因为新剧的厂景和外景推迟了"} -{"key": "BAC009S0903W0420", "wav": "./aishell/wav/test/S0903/BAC009S0903W0420.wav", "txt": "才有时间去探班"} -{"key": "BAC009S0903W0421", "wav": "./aishell/wav/test/S0903/BAC009S0903W0421.wav", "txt": "在当地逗留了三四天"} -{"key": "BAC009S0903W0422", "wav": "./aishell/wav/test/S0903/BAC009S0903W0422.wav", "txt": "自己也有带剧本去看"} -{"key": "BAC009S0903W0423", "wav": "./aishell/wav/test/S0903/BAC009S0903W0423.wav", "txt": "搜狐娱乐讯北京时间十月二十六日消息"} -{"key": "BAC009S0903W0424", "wav": "./aishell/wav/test/S0903/BAC009S0903W0424.wav", "txt": "据香港媒体报导"} -{"key": "BAC009S0903W0425", "wav": "./aishell/wav/test/S0903/BAC009S0903W0425.wav", "txt": "昨晚张保仔播映大结局故演员齐集饭局以及庆祝"} -{"key": "BAC009S0903W0426", "wav": 
"./aishell/wav/test/S0903/BAC009S0903W0426.wav", "txt": "陈展鹏风骚到场"} -{"key": "BAC009S0903W0427", "wav": "./aishell/wav/test/S0903/BAC009S0903W0427.wav", "txt": "他要赶进厂开工"} -{"key": "BAC009S0903W0428", "wav": "./aishell/wav/test/S0903/BAC009S0903W0428.wav", "txt": "因此开香槟后要先离场"} -{"key": "BAC009S0903W0429", "wav": "./aishell/wav/test/S0903/BAC009S0903W0429.wav", "txt": "一直传他跟洪永城不和"} -{"key": "BAC009S0903W0430", "wav": "./aishell/wav/test/S0903/BAC009S0903W0430.wav", "txt": "两人在台下分枱坐欠交流"} -{"key": "BAC009S0903W0431", "wav": "./aishell/wav/test/S0903/BAC009S0903W0431.wav", "txt": "公安局的决定书说不对他做出行政处罚"} -{"key": "BAC009S0903W0432", "wav": "./aishell/wav/test/S0903/BAC009S0903W0432.wav", "txt": "我们才按照正常程序给他转为副院长一职的"} -{"key": "BAC009S0903W0433", "wav": "./aishell/wav/test/S0903/BAC009S0903W0433.wav", "txt": "温岭鞋厂倒塌事故已一四人遇难鞋厂老板被控制"} -{"key": "BAC009S0903W0434", "wav": "./aishell/wav/test/S0903/BAC009S0903W0434.wav", "txt": "新京报快讯记者杨锋七月四日下午四时零八分"} -{"key": "BAC009S0903W0435", "wav": "./aishell/wav/test/S0903/BAC009S0903W0435.wav", "txt": "浙江台州温岭市一一零指挥中心接警称"} -{"key": "BAC009S0903W0436", "wav": "./aishell/wav/test/S0903/BAC009S0903W0436.wav", "txt": "新京报记者从温岭市政府新闻办获悉"} -{"key": "BAC009S0903W0437", "wav": "./aishell/wav/test/S0903/BAC009S0903W0437.wav", "txt": "早前通报的五名失联人员已全部找到"} -{"key": "BAC009S0903W0438", "wav": "./aishell/wav/test/S0903/BAC009S0903W0438.wav", "txt": "死亡人数上升至一四人"} -{"key": "BAC009S0903W0439", "wav": "./aishell/wav/test/S0903/BAC009S0903W0439.wav", "txt": "涉事企业老板已被警方控制"} -{"key": "BAC009S0903W0440", "wav": "./aishell/wav/test/S0903/BAC009S0903W0440.wav", "txt": "温州二零位面包师制出二五米蛋糕或申报吉尼斯纪录"} -{"key": "BAC009S0903W0441", "wav": "./aishell/wav/test/S0903/BAC009S0903W0441.wav", "txt": "前往温州龙湾万达广场游玩的市民"} -{"key": "BAC009S0903W0442", "wav": "./aishell/wav/test/S0903/BAC009S0903W0442.wav", "txt": "无不发出这样的惊叹"} -{"key": "BAC009S0903W0443", "wav": "./aishell/wav/test/S0903/BAC009S0903W0443.wav", "txt": "一糕点店派出二零位面包师傅"} -{"key": "BAC009S0903W0444", "wav": "./aishell/wav/test/S0903/BAC009S0903W0444.wav", "txt": "耗时一四个小时打造的二米五超长蛋糕"} -{"key": "BAC009S0903W0445", "wav": "./aishell/wav/test/S0903/BAC009S0903W0445.wav", "txt": "吸引众多市民驻足观看"} -{"key": "BAC009S0903W0446", "wav": "./aishell/wav/test/S0903/BAC009S0903W0446.wav", "txt": "温州二名已婚男为争美女驾豪车互撞四个回合"} -{"key": "BAC009S0903W0447", "wav": "./aishell/wav/test/S0903/BAC009S0903W0447.wav", "txt": "车子被撞得破烂不堪七月四日凌晨"} -{"key": "BAC009S0903W0448", "wav": "./aishell/wav/test/S0903/BAC009S0903W0448.wav", "txt": "宝马奔驰连续四次相撞"} -{"key": "BAC009S0903W0449", "wav": "./aishell/wav/test/S0903/BAC009S0903W0449.wav", "txt": "两车驾驶员一度下车大打出手"} -{"key": "BAC009S0903W0450", "wav": "./aishell/wav/test/S0903/BAC009S0903W0450.wav", "txt": "起因是为了一名年轻的刘姓美女"} -{"key": "BAC009S0903W0451", "wav": "./aishell/wav/test/S0903/BAC009S0903W0451.wav", "txt": "经保险公司初步估算"} -{"key": "BAC009S0903W0452", "wav": "./aishell/wav/test/S0903/BAC009S0903W0452.wav", "txt": "两车损失高达三四十万元"} -{"key": "BAC009S0903W0453", "wav": "./aishell/wav/test/S0903/BAC009S0903W0453.wav", "txt": "温州七人涉嫌百倍抬杠非法经营期货三二亿被批货"} -{"key": "BAC009S0903W0454", "wav": "./aishell/wav/test/S0903/BAC009S0903W0454.wav", "txt": "浙江温州一公司安装虚拟交易系统"} -{"key": "BAC009S0903W0455", "wav": "./aishell/wav/test/S0903/BAC009S0903W0455.wav", "txt": "以一一零倍的杠杆吸引社会公众投资"} -{"key": "BAC009S0903W0456", "wav": "./aishell/wav/test/S0903/BAC009S0903W0456.wav", "txt": "非法经营期货金额共计人民币三二亿元"} -{"key": "BAC009S0903W0457", "wav": "./aishell/wav/test/S0903/BAC009S0903W0457.wav", "txt": "七名犯罪嫌疑人因涉嫌非法经营罪被批准逮捕"} -{"key": "BAC009S0903W0458", "wav": 
"./aishell/wav/test/S0903/BAC009S0903W0458.wav", "txt": "温州城管掌掴女清洁工已被停职检查"} -{"key": "BAC009S0903W0459", "wav": "./aishell/wav/test/S0903/BAC009S0903W0459.wav", "txt": "温州天价馒头续店方称顾客要狭索赔三条中华烟"} -{"key": "BAC009S0903W0460", "wav": "./aishell/wav/test/S0903/BAC009S0903W0460.wav", "txt": "荞麦窝窝头一零月二零日"} -{"key": "BAC009S0903W0461", "wav": "./aishell/wav/test/S0903/BAC009S0903W0461.wav", "txt": "网络上一张永嘉桥头国际饭店的结帐单十分引人注目"} -{"key": "BAC009S0903W0462", "wav": "./aishell/wav/test/S0903/BAC009S0903W0462.wav", "txt": "菜单显示该饭店的荞麦窝窝头卖三八元一个"} -{"key": "BAC009S0903W0463", "wav": "./aishell/wav/test/S0903/BAC009S0903W0463.wav", "txt": "三零馀位食客吃了四五个窝窝头"} -{"key": "BAC009S0903W0464", "wav": "./aishell/wav/test/S0903/BAC009S0903W0464.wav", "txt": "发现事情并没有这么简单"} -{"key": "BAC009S0903W0465", "wav": "./aishell/wav/test/S0903/BAC009S0903W0465.wav", "txt": "温州火锅先生后续涉案者父亲写公开道歉信"} -{"key": "BAC009S0903W0466", "wav": "./aishell/wav/test/S0903/BAC009S0903W0466.wav", "txt": "温州网八月二十七日讯记者项锐见习记者黄梦思"} -{"key": "BAC009S0903W0467", "wav": "./aishell/wav/test/S0903/BAC009S0903W0467.wav", "txt": "温州一七月大女童接种疫苗抽搐省疾控专家调查"} -{"key": "BAC009S0903W0468", "wav": "./aishell/wav/test/S0903/BAC009S0903W0468.wav", "txt": "金报讯记者蓝莹九月一一日上午"} -{"key": "BAC009S0903W0469", "wav": "./aishell/wav/test/S0903/BAC009S0903W0469.wav", "txt": "随即被送到儿童医院进行救治"} -{"key": "BAC009S0903W0470", "wav": "./aishell/wav/test/S0903/BAC009S0903W0470.wav", "txt": "经过连续三天的抢救"} -{"key": "BAC009S0903W0471", "wav": "./aishell/wav/test/S0903/BAC009S0903W0471.wav", "txt": "孩子仍处于昏迷阶段"} -{"key": "BAC009S0903W0472", "wav": "./aishell/wav/test/S0903/BAC009S0903W0472.wav", "txt": "省市区三三级疾控部门专家已介入调查"} -{"key": "BAC009S0903W0473", "wav": "./aishell/wav/test/S0903/BAC009S0903W0473.wav", "txt": "温州一中学门口氢气罐爆炸卖气球摊贩不治身亡"} -{"key": "BAC009S0903W0474", "wav": "./aishell/wav/test/S0903/BAC009S0903W0474.wav", "txt": "温州一住持被免政协委员遭准儿媳举报娶妻开路虎"} -{"key": "BAC009S0903W0475", "wav": "./aishell/wav/test/S0903/BAC009S0903W0475.wav", "txt": "关于中国嵩山少林寺方丈齐永信的举报风波尚未停歇"} -{"key": "BAC009S0903W0476", "wav": "./aishell/wav/test/S0903/BAC009S0903W0476.wav", "txt": "因准儿媳的举报跌下神坛"} -{"key": "BAC009S0903W0477", "wav": "./aishell/wav/test/S0903/BAC009S0903W0477.wav", "txt": "位于温州苍南龙港镇水门村的一个仓库发生火灾"} -{"key": "BAC009S0903W0478", "wav": "./aishell/wav/test/S0903/BAC009S0903W0478.wav", "txt": "记者从消防部门处了解到"} -{"key": "BAC009S0903W0479", "wav": "./aishell/wav/test/S0903/BAC009S0903W0479.wav", "txt": "这里存放着乙酯和工业酒精等化工品"} -{"key": "BAC009S0903W0480", "wav": "./aishell/wav/test/S0903/BAC009S0903W0480.wav", "txt": "温州一夫妻非法集资五亿丈夫将赃款送给情妇洗钱"} -{"key": "BAC009S0903W0481", "wav": "./aishell/wav/test/S0903/BAC009S0903W0481.wav", "txt": "温州一女士洗澡被已婚男多次偷窥二年后才报警"} -{"key": "BAC009S0903W0482", "wav": "./aishell/wav/test/S0903/BAC009S0903W0482.wav", "txt": "温州网讯洗澡被偷窥却一忍再忍"} -{"key": "BAC009S0903W0483", "wav": "./aishell/wav/test/S0903/BAC009S0903W0483.wav", "txt": "但愿望总是照不进现实"} -{"key": "BAC009S0903W0484", "wav": "./aishell/wav/test/S0903/BAC009S0903W0484.wav", "txt": "称一名男子时常偷窥自己洗澡"} -{"key": "BAC009S0903W0485", "wav": "./aishell/wav/test/S0903/BAC009S0903W0485.wav", "txt": "且时间已长达两年多了"} -{"key": "BAC009S0903W0486", "wav": "./aishell/wav/test/S0903/BAC009S0903W0486.wav", "txt": "温州一家六口吃毒蘑菇身亡女婴拒吃面躲过死神"} -{"key": "BAC009S0903W0487", "wav": "./aishell/wav/test/S0903/BAC009S0903W0487.wav", "txt": "温州一家六口误食毒菌五人已死亡"} -{"key": "BAC009S0903W0488", "wav": "./aishell/wav/test/S0903/BAC009S0903W0488.wav", "txt": "温州永嘉县桥下镇吴山村的潘老伯一家六口"} -{"key": "BAC009S0903W0489", "wav": "./aishell/wav/test/S0903/BAC009S0903W0489.wav", "txt": 
"半个月前因误食有毒野生菌中毒"} -{"key": "BAC009S0903W0490", "wav": "./aishell/wav/test/S0903/BAC009S0903W0490.wav", "txt": "至七月一六日中午已有五人去世"} -{"key": "BAC009S0903W0491", "wav": "./aishell/wav/test/S0903/BAC009S0903W0491.wav", "txt": "潘老伯二六岁的外孙还在医院重症监护室治疗"} -{"key": "BAC009S0903W0492", "wav": "./aishell/wav/test/S0903/BAC009S0903W0492.wav", "txt": "仍处于深度昏迷状态"} -{"key": "BAC009S0903W0493", "wav": "./aishell/wav/test/S0903/BAC009S0903W0493.wav", "txt": "温州一村主任台风夜转移群众其妻子遇难"} -{"key": "BAC009S0903W0494", "wav": "./aishell/wav/test/S0903/BAC009S0903W0494.wav", "txt": "石柱村泥石流塌方现场"} -{"key": "BAC009S0903W0495", "wav": "./aishell/wav/test/S0903/BAC009S0903W0495.wav", "txt": "温州一男子在医院放置假炸弹被判处有期徒刑一年"} -{"key": "BAC009S0904W0121", "wav": "./aishell/wav/test/S0904/BAC009S0904W0121.wav", "txt": "为了解决额度荒的问题"} -{"key": "BAC009S0904W0122", "wav": "./aishell/wav/test/S0904/BAC009S0904W0122.wav", "txt": "近期广州和南京已经开始启动公转商贷款贴息模式"} -{"key": "BAC009S0904W0123", "wav": "./aishell/wav/test/S0904/BAC009S0904W0123.wav", "txt": "即由商业银行向市民发放执行公积金利率的贷款"} -{"key": "BAC009S0904W0124", "wav": "./aishell/wav/test/S0904/BAC009S0904W0124.wav", "txt": "公积金贷款与商业贷款之间的利息差额"} -{"key": "BAC009S0904W0125", "wav": "./aishell/wav/test/S0904/BAC009S0904W0125.wav", "txt": "由公积金中心向银行支付"} -{"key": "BAC009S0904W0126", "wav": "./aishell/wav/test/S0904/BAC009S0904W0126.wav", "txt": "重点城市公积金贷额款度也有限制"} -{"key": "BAC009S0904W0127", "wav": "./aishell/wav/test/S0904/BAC009S0904W0127.wav", "txt": "比如北京和上海家庭最高贷款额度均为一百万元"} -{"key": "BAC009S0904W0128", "wav": "./aishell/wav/test/S0904/BAC009S0904W0128.wav", "txt": "广州和深圳分别为五万元和七万元"} -{"key": "BAC009S0904W0129", "wav": "./aishell/wav/test/S0904/BAC009S0904W0129.wav", "txt": "在近期房价持续回升的背景下"} -{"key": "BAC009S0904W0130", "wav": "./aishell/wav/test/S0904/BAC009S0904W0130.wav", "txt": "多数二线城市和三四线城市"} -{"key": "BAC009S0904W0131", "wav": "./aishell/wav/test/S0904/BAC009S0904W0131.wav", "txt": "公积金贷款上限能够复盖单套房总价"} -{"key": "BAC009S0904W0132", "wav": "./aishell/wav/test/S0904/BAC009S0904W0132.wav", "txt": "这些城市公积金贷款买房的比例也比较高"} -{"key": "BAC009S0904W0133", "wav": "./aishell/wav/test/S0904/BAC009S0904W0133.wav", "txt": "此次政策调整也有较好的针对性"} -{"key": "BAC009S0904W0134", "wav": "./aishell/wav/test/S0904/BAC009S0904W0134.wav", "txt": "包括此次三部委发布公积金新政"} -{"key": "BAC009S0904W0135", "wav": "./aishell/wav/test/S0904/BAC009S0904W0135.wav", "txt": "再结合近期限外政策全面松绑"} -{"key": "BAC009S0904W0136", "wav": "./aishell/wav/test/S0904/BAC009S0904W0136.wav", "txt": "具有实时性合理性和较强的针对性"} -{"key": "BAC009S0904W0137", "wav": "./aishell/wav/test/S0904/BAC009S0904W0137.wav", "txt": "有助于发挥内需在稳增长中的积极作用"} -{"key": "BAC009S0904W0138", "wav": "./aishell/wav/test/S0904/BAC009S0904W0138.wav", "txt": "住建部等三部委联合发文"} -{"key": "BAC009S0904W0139", "wav": "./aishell/wav/test/S0904/BAC009S0904W0139.wav", "txt": "再次降低公积金贷款的门槛"} -{"key": "BAC009S0904W0140", "wav": "./aishell/wav/test/S0904/BAC009S0904W0140.wav", "txt": "还清首套房公积金贷款"} -{"key": "BAC009S0904W0141", "wav": "./aishell/wav/test/S0904/BAC009S0904W0141.wav", "txt": "在公积金贷款额度上调后一个月内"} -{"key": "BAC009S0904W0142", "wav": "./aishell/wav/test/S0904/BAC009S0904W0142.wav", "txt": "北京公积金贷款成交量上涨百分之五"} -{"key": "BAC009S0904W0143", "wav": "./aishell/wav/test/S0904/BAC009S0904W0143.wav", "txt": "中小户型住房去化速度明显加快"} -{"key": "BAC009S0904W0144", "wav": "./aishell/wav/test/S0904/BAC009S0904W0144.wav", "txt": "北京公积金贷款额度提高"} -{"key": "BAC009S0904W0145", "wav": "./aishell/wav/test/S0904/BAC009S0904W0145.wav", "txt": "虽有助于使刚需购房者长期受益"} -{"key": "BAC009S0904W0146", "wav": "./aishell/wav/test/S0904/BAC009S0904W0146.wav", "txt": 
"但仍存在七万最高贷款额申请难等落地问题"} -{"key": "BAC009S0904W0147", "wav": "./aishell/wav/test/S0904/BAC009S0904W0147.wav", "txt": "公积金政策放宽对楼市成交的短期刺激作用将难以持续"} -{"key": "BAC009S0904W0148", "wav": "./aishell/wav/test/S0904/BAC009S0904W0148.wav", "txt": "各地公积金政策步入频繁调整期"} -{"key": "BAC009S0904W0149", "wav": "./aishell/wav/test/S0904/BAC009S0904W0149.wav", "txt": "一向严格收紧购房政策的北京也加入此阵营"} -{"key": "BAC009S0904W0150", "wav": "./aishell/wav/test/S0904/BAC009S0904W0150.wav", "txt": "据中新网房产频道梳理"} -{"key": "BAC009S0904W0151", "wav": "./aishell/wav/test/S0904/BAC009S0904W0151.wav", "txt": "北京针对公积金的调整次数便达到五次"} -{"key": "BAC009S0904W0152", "wav": "./aishell/wav/test/S0904/BAC009S0904W0152.wav", "txt": "美丽北京大型绿色公益品牌项目"} -{"key": "BAC009S0904W0153", "wav": "./aishell/wav/test/S0904/BAC009S0904W0153.wav", "txt": "调整公积金年度缴存上下限和缴存比例"} -{"key": "BAC009S0904W0154", "wav": "./aishell/wav/test/S0904/BAC009S0904W0154.wav", "txt": "放宽公积金贷款二套房认定标准"} -{"key": "BAC009S0904W0155", "wav": "./aishell/wav/test/S0904/BAC009S0904W0155.wav", "txt": "将公积金贷款利率下调"} -{"key": "BAC009S0904W0156", "wav": "./aishell/wav/test/S0904/BAC009S0904W0156.wav", "txt": "公积金政策的调整从未这么频繁"} -{"key": "BAC009S0904W0157", "wav": "./aishell/wav/test/S0904/BAC009S0904W0157.wav", "txt": "从一系列公积金政策看来"} -{"key": "BAC009S0904W0158", "wav": "./aishell/wav/test/S0904/BAC009S0904W0158.wav", "txt": "扶持刚需客群已经成为北京房地产调控的主要方向"} -{"key": "BAC009S0904W0159", "wav": "./aishell/wav/test/S0904/BAC009S0904W0159.wav", "txt": "未来政策层面将继续保持宽松"} -{"key": "BAC009S0904W0160", "wav": "./aishell/wav/test/S0904/BAC009S0904W0160.wav", "txt": "在上海易居房地产研究院研究员严跃进看来"} -{"key": "BAC009S0904W0161", "wav": "./aishell/wav/test/S0904/BAC009S0904W0161.wav", "txt": "这一系列公积金政策的放宽"} -{"key": "BAC009S0904W0162", "wav": "./aishell/wav/test/S0904/BAC009S0904W0162.wav", "txt": "与目前房地产救市的市场导向相吻合"} -{"key": "BAC009S0904W0163", "wav": "./aishell/wav/test/S0904/BAC009S0904W0163.wav", "txt": "盘活各地公积金资源"} -{"key": "BAC009S0904W0164", "wav": "./aishell/wav/test/S0904/BAC009S0904W0164.wav", "txt": "年初选择使用公积金贷款的购房者占比环比增多"} -{"key": "BAC009S0904W0165", "wav": "./aishell/wav/test/S0904/BAC009S0904W0165.wav", "txt": "伟嘉安捷提供数据显示"} -{"key": "BAC009S0904W0166", "wav": "./aishell/wav/test/S0904/BAC009S0904W0166.wav", "txt": "七月北京公积金贷款成交量提升了百分之五"} -{"key": "BAC009S0904W0167", "wav": "./aishell/wav/test/S0904/BAC009S0904W0167.wav", "txt": "贷款需求将在下个月继续释放"} -{"key": "BAC009S0904W0168", "wav": "./aishell/wav/test/S0904/BAC009S0904W0168.wav", "txt": "北京七月楼市的成交情况"} -{"key": "BAC009S0904W0169", "wav": "./aishell/wav/test/S0904/BAC009S0904W0169.wav", "txt": "也佐证了公积金政策放宽刺激作用的显现"} -{"key": "BAC009S0904W0170", "wav": "./aishell/wav/test/S0904/BAC009S0904W0170.wav", "txt": "在总成交中占比环比增加五个百分点"} -{"key": "BAC009S0904W0171", "wav": "./aishell/wav/test/S0904/BAC009S0904W0171.wav", "txt": "且低于七十平米的小户型住房成交明显上升"} -{"key": "BAC009S0904W0172", "wav": "./aishell/wav/test/S0904/BAC009S0904W0172.wav", "txt": "北京调整首套房公积金贷款最高额度"} -{"key": "BAC009S0904W0173", "wav": "./aishell/wav/test/S0904/BAC009S0904W0173.wav", "txt": "伟业我爱我家集团副总裁胡景晖分析"} -{"key": "BAC009S0904W0174", "wav": "./aishell/wav/test/S0904/BAC009S0904W0174.wav", "txt": "刚需人群观望心理正逐步消散"} -{"key": "BAC009S0904W0175", "wav": "./aishell/wav/test/S0904/BAC009S0904W0175.wav", "txt": "开始加速进入新房市场"} -{"key": "BAC009S0904W0176", "wav": "./aishell/wav/test/S0904/BAC009S0904W0176.wav", "txt": "公积金政策的放宽对二手房市场也产生了影响"} -{"key": "BAC009S0904W0177", "wav": "./aishell/wav/test/S0904/BAC009S0904W0177.wav", "txt": "据伟业我爱我家市场研究院测算"} -{"key": "BAC009S0904W0178", "wav": "./aishell/wav/test/S0904/BAC009S0904W0178.wav", 
"txt": "在过去六个月中处于高点"} -{"key": "BAC009S0904W0179", "wav": "./aishell/wav/test/S0904/BAC009S0904W0179.wav", "txt": "虽然刚需购房者入市积极性有所提升"} -{"key": "BAC009S0904W0180", "wav": "./aishell/wav/test/S0904/BAC009S0904W0180.wav", "txt": "但不少业内人士认为"} -{"key": "BAC009S0904W0181", "wav": "./aishell/wav/test/S0904/BAC009S0904W0181.wav", "txt": "这一刺激作用并不会长时间延续"} -{"key": "BAC009S0904W0182", "wav": "./aishell/wav/test/S0904/BAC009S0904W0182.wav", "txt": "上调公积金贷款额度对市场的刺激是短期的"} -{"key": "BAC009S0904W0183", "wav": "./aishell/wav/test/S0904/BAC009S0904W0183.wav", "txt": "公积金短期拉动的购房需求有限"} -{"key": "BAC009S0904W0184", "wav": "./aishell/wav/test/S0904/BAC009S0904W0184.wav", "txt": "更多是原本计划购房的客群享受到了政策利好"} -{"key": "BAC009S0904W0185", "wav": "./aishell/wav/test/S0904/BAC009S0904W0185.wav", "txt": "原本短期内不考虑购房的客群"} -{"key": "BAC009S0904W0186", "wav": "./aishell/wav/test/S0904/BAC009S0904W0186.wav", "txt": "在这一政策出台后匆忙购房"} -{"key": "BAC009S0904W0187", "wav": "./aishell/wav/test/S0904/BAC009S0904W0187.wav", "txt": "加强农产品质量安全监管"} -{"key": "BAC009S0904W0188", "wav": "./aishell/wav/test/S0904/BAC009S0904W0188.wav", "txt": "建立协调配合检打联动联防联控应急处置机制"} -{"key": "BAC009S0904W0189", "wav": "./aishell/wav/test/S0904/BAC009S0904W0189.wav", "txt": "实行农产品产地安全分级管理"} -{"key": "BAC009S0904W0190", "wav": "./aishell/wav/test/S0904/BAC009S0904W0190.wav", "txt": "推动农产品生产加工和流通企业建立诚信制度"} -{"key": "BAC009S0904W0191", "wav": "./aishell/wav/test/S0904/BAC009S0904W0191.wav", "txt": "提高农业产业化和规模化经营水平"} -{"key": "BAC009S0904W0192", "wav": "./aishell/wav/test/S0904/BAC009S0904W0192.wav", "txt": "推进农业产业化经营跨越式发展"} -{"key": "BAC009S0904W0193", "wav": "./aishell/wav/test/S0904/BAC009S0904W0193.wav", "txt": "制定扶持农业产业化龙头企业发展的综合性政策"} -{"key": "BAC009S0904W0194", "wav": "./aishell/wav/test/S0904/BAC009S0904W0194.wav", "txt": "启动实施农业产业化经营跨越发展行动"} -{"key": "BAC009S0904W0195", "wav": "./aishell/wav/test/S0904/BAC009S0904W0195.wav", "txt": "按照扶优扶大扶强的原则"} -{"key": "BAC009S0904W0196", "wav": "./aishell/wav/test/S0904/BAC009S0904W0196.wav", "txt": "依托农产品加工物流等各类农业园区"} -{"key": "BAC009S0904W0197", "wav": "./aishell/wav/test/S0904/BAC009S0904W0197.wav", "txt": "选建一批农业产业化示范基地"} -{"key": "BAC009S0904W0198", "wav": "./aishell/wav/test/S0904/BAC009S0904W0198.wav", "txt": "推进龙头企业集群发展"} -{"key": "BAC009S0904W0199", "wav": "./aishell/wav/test/S0904/BAC009S0904W0199.wav", "txt": "引导龙头企业采取兼并重组参股收购等方式"} -{"key": "BAC009S0904W0200", "wav": "./aishell/wav/test/S0904/BAC009S0904W0200.wav", "txt": "支持龙头企业跨区域经营"} -{"key": "BAC009S0904W0201", "wav": "./aishell/wav/test/S0904/BAC009S0904W0201.wav", "txt": "提升产品研发精深加工技术水平和装备能力"} -{"key": "BAC009S0904W0202", "wav": "./aishell/wav/test/S0904/BAC009S0904W0202.wav", "txt": "鼓励龙头企业采取参股合作等方式"} -{"key": "BAC009S0904W0203", "wav": "./aishell/wav/test/S0904/BAC009S0904W0203.wav", "txt": "与农户建立紧密型利益联联结关系"} -{"key": "BAC009S0904W0204", "wav": "./aishell/wav/test/S0904/BAC009S0904W0204.wav", "txt": "强化农民专业合作社组织带动能力"} -{"key": "BAC009S0904W0205", "wav": "./aishell/wav/test/S0904/BAC009S0904W0205.wav", "txt": "广泛开展示范社建设行动"} -{"key": "BAC009S0904W0206", "wav": "./aishell/wav/test/S0904/BAC009S0904W0206.wav", "txt": "加大合作社经营管理人员培训培养力度"} -{"key": "BAC009S0904W0207", "wav": "./aishell/wav/test/S0904/BAC009S0904W0207.wav", "txt": "加强合作社辅导员队伍建设"} -{"key": "BAC009S0904W0208", "wav": "./aishell/wav/test/S0904/BAC009S0904W0208.wav", "txt": "支持农民专业合作社参加农产品展示展销活动"} -{"key": "BAC009S0904W0209", "wav": "./aishell/wav/test/S0904/BAC009S0904W0209.wav", "txt": "建立稳定的产销关系"} -{"key": "BAC009S0904W0210", "wav": "./aishell/wav/test/S0904/BAC009S0904W0210.wav", "txt": 
"鼓励农民专业合作社开展信用合作"} -{"key": "BAC009S0904W0211", "wav": "./aishell/wav/test/S0904/BAC009S0904W0211.wav", "txt": "在自愿基础上组建联合社"} -{"key": "BAC009S0904W0212", "wav": "./aishell/wav/test/S0904/BAC009S0904W0212.wav", "txt": "提高生产经营和市场开拓能力"} -{"key": "BAC009S0904W0213", "wav": "./aishell/wav/test/S0904/BAC009S0904W0213.wav", "txt": "扶持合作社建设农产品仓储冷藏初加工等设施"} -{"key": "BAC009S0904W0214", "wav": "./aishell/wav/test/S0904/BAC009S0904W0214.wav", "txt": "发展多种形式的适度规模经营"} -{"key": "BAC009S0904W0215", "wav": "./aishell/wav/test/S0904/BAC009S0904W0215.wav", "txt": "在依法自愿有偿和加强服务基础上"} -{"key": "BAC009S0904W0216", "wav": "./aishell/wav/test/S0904/BAC009S0904W0216.wav", "txt": "完善土地承包经营权流转市场"} -{"key": "BAC009S0904W0217", "wav": "./aishell/wav/test/S0904/BAC009S0904W0217.wav", "txt": "发展多种形式的规模化专业化生产经营"} -{"key": "BAC009S0904W0218", "wav": "./aishell/wav/test/S0904/BAC009S0904W0218.wav", "txt": "引导土地承包经营权向生产和经营能手集中"} -{"key": "BAC009S0904W0219", "wav": "./aishell/wav/test/S0904/BAC009S0904W0219.wav", "txt": "大力培育和发展种养大户家庭农牧场"} -{"key": "BAC009S0904W0220", "wav": "./aishell/wav/test/S0904/BAC009S0904W0220.wav", "txt": "实施一村一品强村富民工程"} -{"key": "BAC009S0904W0221", "wav": "./aishell/wav/test/S0904/BAC009S0904W0221.wav", "txt": "大力发展农业社会化服务"} -{"key": "BAC009S0904W0222", "wav": "./aishell/wav/test/S0904/BAC009S0904W0222.wav", "txt": "增强农业公益性服务能力"} -{"key": "BAC009S0904W0223", "wav": "./aishell/wav/test/S0904/BAC009S0904W0223.wav", "txt": "加快基层农技推广体系改革和建施"} -{"key": "BAC009S0904W0224", "wav": "./aishell/wav/test/S0904/BAC009S0904W0224.wav", "txt": "健全公益性农业技术推广服务体系"} -{"key": "BAC009S0904W0225", "wav": "./aishell/wav/test/S0904/BAC009S0904W0225.wav", "txt": "加强农业有害生物监测预警和防控能力建设"} -{"key": "BAC009S0904W0226", "wav": "./aishell/wav/test/S0904/BAC009S0904W0226.wav", "txt": "加强农业资源和生态环境保护"} -{"key": "BAC009S0904W0227", "wav": "./aishell/wav/test/S0904/BAC009S0904W0227.wav", "txt": "继续实行最严格的耕地保护制度"} -{"key": "BAC009S0904W0228", "wav": "./aishell/wav/test/S0904/BAC009S0904W0228.wav", "txt": "确保耕地保有量保持在十亿亩"} -{"key": "BAC009S0904W0229", "wav": "./aishell/wav/test/S0904/BAC009S0904W0229.wav", "txt": "基本农田不低于十亿亩"} -{"key": "BAC009S0904W0230", "wav": "./aishell/wav/test/S0904/BAC009S0904W0230.wav", "txt": "科学保护和合理利用水资源"} -{"key": "BAC009S0904W0231", "wav": "./aishell/wav/test/S0904/BAC009S0904W0231.wav", "txt": "大力发展节水增效农业"} -{"key": "BAC009S0904W0232", "wav": "./aishell/wav/test/S0904/BAC009S0904W0232.wav", "txt": "继续建设国家级旱作农业示范区"} -{"key": "BAC009S0904W0233", "wav": "./aishell/wav/test/S0904/BAC009S0904W0233.wav", "txt": "坚持基本草原保护制度"} -{"key": "BAC009S0904W0234", "wav": "./aishell/wav/test/S0904/BAC009S0904W0234.wav", "txt": "推行禁牧休牧和划区轮牧"} -{"key": "BAC009S0904W0235", "wav": "./aishell/wav/test/S0904/BAC009S0904W0235.wav", "txt": "实施草原保护重大工程"} -{"key": "BAC009S0904W0236", "wav": "./aishell/wav/test/S0904/BAC009S0904W0236.wav", "txt": "加大水生生物资源养护力度"} -{"key": "BAC009S0904W0237", "wav": "./aishell/wav/test/S0904/BAC009S0904W0237.wav", "txt": "强化水生生态修复和建设"} -{"key": "BAC009S0904W0238", "wav": "./aishell/wav/test/S0904/BAC009S0904W0238.wav", "txt": "加强畜禽遗传资源和农业野生植物资源保护"} -{"key": "BAC009S0904W0239", "wav": "./aishell/wav/test/S0904/BAC009S0904W0239.wav", "txt": "加强农业生态环境治理"} -{"key": "BAC009S0904W0240", "wav": "./aishell/wav/test/S0904/BAC009S0904W0240.wav", "txt": "鼓励使用生物农药高效低毒低残留农药和有机肥料"} -{"key": "BAC009S0904W0241", "wav": "./aishell/wav/test/S0904/BAC009S0904W0241.wav", "txt": "回收再利用农膜和农药包装物"} -{"key": "BAC009S0904W0242", "wav": "./aishell/wav/test/S0904/BAC009S0904W0242.wav", "txt": "加快规模养殖场粪污处理利用"} -{"key": 
"BAC009S0904W0243", "wav": "./aishell/wav/test/S0904/BAC009S0904W0243.wav", "txt": "治理和控制农业面源污染"} -{"key": "BAC009S0904W0244", "wav": "./aishell/wav/test/S0904/BAC009S0904W0244.wav", "txt": "培育门类丰富层次齐用的综合利用产业"} -{"key": "BAC009S0904W0245", "wav": "./aishell/wav/test/S0904/BAC009S0904W0245.wav", "txt": "建立秸秆禁烧和综合利用的长效机制"} -{"key": "BAC009S0904W0246", "wav": "./aishell/wav/test/S0904/BAC009S0904W0246.wav", "txt": "继续实施农村沼气工程"} -{"key": "BAC009S0904W0247", "wav": "./aishell/wav/test/S0904/BAC009S0904W0247.wav", "txt": "大力推进农村清洁工程建设"} -{"key": "BAC009S0904W0248", "wav": "./aishell/wav/test/S0904/BAC009S0904W0248.wav", "txt": "清洁水源田园和家园"} -{"key": "BAC009S0904W0249", "wav": "./aishell/wav/test/S0904/BAC009S0904W0249.wav", "txt": "大力推进农业节能减排"} -{"key": "BAC009S0904W0250", "wav": "./aishell/wav/test/S0904/BAC009S0904W0250.wav", "txt": "树立绿色低碳发展理念"} -{"key": "BAC009S0904W0251", "wav": "./aishell/wav/test/S0904/BAC009S0904W0251.wav", "txt": "积极发展资源节约型和环境友好型农业"} -{"key": "BAC009S0904W0252", "wav": "./aishell/wav/test/S0904/BAC009S0904W0252.wav", "txt": "淘汰报废高耗能老旧农业机械"} -{"key": "BAC009S0904W0253", "wav": "./aishell/wav/test/S0904/BAC009S0904W0253.wav", "txt": "应该也是个提醒中国再也不能仅仅满足于组装了"} -{"key": "BAC009S0904W0254", "wav": "./aishell/wav/test/S0904/BAC009S0904W0254.wav", "txt": "我们在科技创新方面的进步非常显着"} -{"key": "BAC009S0904W0255", "wav": "./aishell/wav/test/S0904/BAC009S0904W0255.wav", "txt": "一项项领先世界的科技成果"} -{"key": "BAC009S0904W0256", "wav": "./aishell/wav/test/S0904/BAC009S0904W0256.wav", "txt": "不断刷新中国创造的精度高度深度"} -{"key": "BAC009S0904W0257", "wav": "./aishell/wav/test/S0904/BAC009S0904W0257.wav", "txt": "成为一个个响亮的中国品牌"} -{"key": "BAC009S0904W0258", "wav": "./aishell/wav/test/S0904/BAC009S0904W0258.wav", "txt": "我们的自主创新能力还不够强"} -{"key": "BAC009S0904W0259", "wav": "./aishell/wav/test/S0904/BAC009S0904W0259.wav", "txt": "与世界先进水平相比还有明显差距"} -{"key": "BAC009S0904W0260", "wav": "./aishell/wav/test/S0904/BAC009S0904W0260.wav", "txt": "特别是企业自主创新方面"} -{"key": "BAC009S0904W0261", "wav": "./aishell/wav/test/S0904/BAC009S0904W0261.wav", "txt": "具有重大影响的科技产品还不是很多"} -{"key": "BAC009S0904W0262", "wav": "./aishell/wav/test/S0904/BAC009S0904W0262.wav", "txt": "与世界第二经济大国的地位还不相称"} -{"key": "BAC009S0904W0263", "wav": "./aishell/wav/test/S0904/BAC009S0904W0263.wav", "txt": "希望中国品牌在国际市场的知名度和影响力越来越大"} -{"key": "BAC009S0904W0264", "wav": "./aishell/wav/test/S0904/BAC009S0904W0264.wav", "txt": "中国人从来不缺乏创新创造的基因"} -{"key": "BAC009S0904W0265", "wav": "./aishell/wav/test/S0904/BAC009S0904W0265.wav", "txt": "创新是中华民族最鲜明的禀赋"} -{"key": "BAC009S0904W0266", "wav": "./aishell/wav/test/S0904/BAC009S0904W0266.wav", "txt": "我们完全有理由树立创新自信"} -{"key": "BAC009S0904W0267", "wav": "./aishell/wav/test/S0904/BAC009S0904W0267.wav", "txt": "上一次工业革命我们落在了西方发达国家后面很远"} -{"key": "BAC009S0904W0268", "wav": "./aishell/wav/test/S0904/BAC009S0904W0268.wav", "txt": "面对以网络和数字技术为标志的信息技术发展"} -{"key": "BAC009S0904W0269", "wav": "./aishell/wav/test/S0904/BAC009S0904W0269.wav", "txt": "我们迎来了赶超发达国家的难得机遇"} -{"key": "BAC009S0904W0270", "wav": "./aishell/wav/test/S0904/BAC009S0904W0270.wav", "txt": "我国拥有近一四亿人口"} -{"key": "BAC009S0904W0271", "wav": "./aishell/wav/test/S0904/BAC009S0904W0271.wav", "txt": "手机网民近五点六亿"} -{"key": "BAC009S0904W0272", "wav": "./aishell/wav/test/S0904/BAC009S0904W0272.wav", "txt": "这样的规模没有任何一个国家可以比拟"} -{"key": "BAC009S0904W0273", "wav": "./aishell/wav/test/S0904/BAC009S0904W0273.wav", "txt": "他们的消费需求是拉动创新创业的巨大牵引力"} -{"key": "BAC009S0904W0274", "wav": "./aishell/wav/test/S0904/BAC009S0904W0274.wav", "txt": "规模超大的人才群体更是创新创造无与伦比的重要资源"} -{"key": 
"BAC009S0904W0275", "wav": "./aishell/wav/test/S0904/BAC009S0904W0275.wav", "txt": "我国经济发展进入新常态"} -{"key": "BAC009S0904W0276", "wav": "./aishell/wav/test/S0904/BAC009S0904W0276.wav", "txt": "双目标不仅包括保持中高速增长"} -{"key": "BAC009S0904W0277", "wav": "./aishell/wav/test/S0904/BAC009S0904W0277.wav", "txt": "还包括迈向中高端水平"} -{"key": "BAC009S0904W0278", "wav": "./aishell/wav/test/S0904/BAC009S0904W0278.wav", "txt": "我国的经济处在爬坡过坎的重要关口"} -{"key": "BAC009S0904W0279", "wav": "./aishell/wav/test/S0904/BAC009S0904W0279.wav", "txt": "我们也许不用像以前那样为了追求某个数字赶紧赶慢了"} -{"key": "BAC009S0904W0280", "wav": "./aishell/wav/test/S0904/BAC009S0904W0280.wav", "txt": "但松一口气的想法是没有出路的"} -{"key": "BAC009S0904W0281", "wav": "./aishell/wav/test/S0904/BAC009S0904W0281.wav", "txt": "恰恰更需要我们有所作为"} -{"key": "BAC009S0904W0282", "wav": "./aishell/wav/test/S0904/BAC009S0904W0282.wav", "txt": "就是在创新驱动上下功夫"} -{"key": "BAC009S0904W0283", "wav": "./aishell/wav/test/S0904/BAC009S0904W0283.wav", "txt": "在转型发展上下功夫"} -{"key": "BAC009S0904W0284", "wav": "./aishell/wav/test/S0904/BAC009S0904W0284.wav", "txt": "不断提高技术创新对经济发展的贡献率"} -{"key": "BAC009S0904W0285", "wav": "./aishell/wav/test/S0904/BAC009S0904W0285.wav", "txt": "如果说过去的这些年"} -{"key": "BAC009S0904W0286", "wav": "./aishell/wav/test/S0904/BAC009S0904W0286.wav", "txt": "我们成为世界工厂是不可逾越的发展阶段"} -{"key": "BAC009S0904W0287", "wav": "./aishell/wav/test/S0904/BAC009S0904W0287.wav", "txt": "那么未来的五年十年二十年"} -{"key": "BAC009S0904W0288", "wav": "./aishell/wav/test/S0904/BAC009S0904W0288.wav", "txt": "我们肯定不能再沾沾自喜于世界工厂"} -{"key": "BAC009S0904W0289", "wav": "./aishell/wav/test/S0904/BAC009S0904W0289.wav", "txt": "也不能一直被贴上中国组装的标签"} -{"key": "BAC009S0904W0290", "wav": "./aishell/wav/test/S0904/BAC009S0904W0290.wav", "txt": "长期处在产业链的末端"} -{"key": "BAC009S0904W0291", "wav": "./aishell/wav/test/S0904/BAC009S0904W0291.wav", "txt": "期待着越来越多中国设计的产品不断涌现并享誉国际"} -{"key": "BAC009S0904W0292", "wav": "./aishell/wav/test/S0904/BAC009S0904W0292.wav", "txt": "未必印在每个产品上"} -{"key": "BAC009S0904W0293", "wav": "./aishell/wav/test/S0904/BAC009S0904W0293.wav", "txt": "但应刻在每个中国企业家甚至每个中国人心里"} -{"key": "BAC009S0904W0295", "wav": "./aishell/wav/test/S0904/BAC009S0904W0295.wav", "txt": "中国经营网注有国外媒体报道称"} -{"key": "BAC009S0904W0296", "wav": "./aishell/wav/test/S0904/BAC009S0904W0296.wav", "txt": "苹果市场价值达到七千亿美元刚刚过去几个月"} -{"key": "BAC009S0904W0297", "wav": "./aishell/wav/test/S0904/BAC009S0904W0297.wav", "txt": "已经有股票经纪公司预测"} -{"key": "BAC009S0904W0298", "wav": "./aishell/wav/test/S0904/BAC009S0904W0298.wav", "txt": "那么苹果能突破一万亿大关吗"} -{"key": "BAC009S0904W0299", "wav": "./aishell/wav/test/S0904/BAC009S0904W0299.wav", "txt": "苹果公司上次发布全新产品是在五年以前"} -{"key": "BAC009S0904W0301", "wav": "./aishell/wav/test/S0904/BAC009S0904W0301.wav", "txt": "苹果的目标股价也开始相应地上涨"} -{"key": "BAC009S0904W0302", "wav": "./aishell/wav/test/S0904/BAC009S0904W0302.wav", "txt": "苹果的市值可能将突破万亿美元"} -{"key": "BAC009S0904W0303", "wav": "./aishell/wav/test/S0904/BAC009S0904W0303.wav", "txt": "现在市面上的电话手表功能最主要有两个通话和定位"} -{"key": "BAC009S0904W0304", "wav": "./aishell/wav/test/S0904/BAC009S0904W0304.wav", "txt": "儿童电话手表还推出了其他更多人性化的创新功能"} -{"key": "BAC009S0904W0305", "wav": "./aishell/wav/test/S0904/BAC009S0904W0305.wav", "txt": "对手表的大力普及也起到了至关重要的作用"} -{"key": "BAC009S0904W0306", "wav": "./aishell/wav/test/S0904/BAC009S0904W0306.wav", "txt": "以小天才电话手表为例"} -{"key": "BAC009S0904W0307", "wav": "./aishell/wav/test/S0904/BAC009S0904W0307.wav", "txt": "除了能和手机一样接打电话"} -{"key": "BAC009S0904W0308", "wav": "./aishell/wav/test/S0904/BAC009S0904W0308.wav", "txt": "做到全方位亲子沟通"} -{"key": 
"BAC009S0904W0310", "wav": "./aishell/wav/test/S0904/BAC009S0904W0310.wav", "txt": "击掌成为加好友等功能也一应俱全"} -{"key": "BAC009S0904W0311", "wav": "./aishell/wav/test/S0904/BAC009S0904W0311.wav", "txt": "电话手表就相当于一部简化的智能手机"} -{"key": "BAC009S0904W0312", "wav": "./aishell/wav/test/S0904/BAC009S0904W0312.wav", "txt": "主要在于将通信和定位的模块大大缩小到方寸之间"} -{"key": "BAC009S0904W0313", "wav": "./aishell/wav/test/S0904/BAC009S0904W0313.wav", "txt": "置入只有手机几分之一大小的手表表盘"} -{"key": "BAC009S0904W0314", "wav": "./aishell/wav/test/S0904/BAC009S0904W0314.wav", "txt": "还要保证与手机一样的通话质量呢"} -{"key": "BAC009S0904W0315", "wav": "./aishell/wav/test/S0904/BAC009S0904W0315.wav", "txt": "这是摆在行业面前最大的技术难题"} -{"key": "BAC009S0904W0316", "wav": "./aishell/wav/test/S0904/BAC009S0904W0316.wav", "txt": "小天才产品负责人表示"} -{"key": "BAC009S0904W0317", "wav": "./aishell/wav/test/S0904/BAC009S0904W0317.wav", "txt": "以小天才电话手表为例"} -{"key": "BAC009S0904W0318", "wav": "./aishell/wav/test/S0904/BAC009S0904W0318.wav", "txt": "公司超百位研发人员历经半年多时间"} -{"key": "BAC009S0904W0319", "wav": "./aishell/wav/test/S0904/BAC009S0904W0319.wav", "txt": "投入巨资研究经费攻关"} -{"key": "BAC009S0904W0320", "wav": "./aishell/wav/test/S0904/BAC009S0904W0320.wav", "txt": "最后找到芬兰的高级技术团队"} -{"key": "BAC009S0904W0321", "wav": "./aishell/wav/test/S0904/BAC009S0904W0321.wav", "txt": "才解决电话手表的内线内置问题"} -{"key": "BAC009S0904W0322", "wav": "./aishell/wav/test/S0904/BAC009S0904W0322.wav", "txt": "对于这种突破性的天线内置方案"} -{"key": "BAC009S0904W0323", "wav": "./aishell/wav/test/S0904/BAC009S0904W0323.wav", "txt": "我们进行了极为严谨的测试"} -{"key": "BAC009S0904W0324", "wav": "./aishell/wav/test/S0904/BAC009S0904W0324.wav", "txt": "确保信号与手机相当才真正投放市场"} -{"key": "BAC009S0904W0325", "wav": "./aishell/wav/test/S0904/BAC009S0904W0325.wav", "txt": "对于创新成果的实证和检验"} -{"key": "BAC009S0904W0326", "wav": "./aishell/wav/test/S0904/BAC009S0904W0326.wav", "txt": "电话手表对儿童安全吗"} -{"key": "BAC009S0904W0327", "wav": "./aishell/wav/test/S0904/BAC009S0904W0327.wav", "txt": "儿童电话手表的辐射对儿童的健康安全是否存在隐患呢"} -{"key": "BAC009S0904W0328", "wav": "./aishell/wav/test/S0904/BAC009S0904W0328.wav", "txt": "这种说法到底有无科学依据呢"} -{"key": "BAC009S0904W0329", "wav": "./aishell/wav/test/S0904/BAC009S0904W0329.wav", "txt": "关于手机等产品的辐射问题"} -{"key": "BAC009S0904W0330", "wav": "./aishell/wav/test/S0904/BAC009S0904W0330.wav", "txt": "任何家用电器只要通电就会产生电磁辐射"} -{"key": "BAC009S0904W0331", "wav": "./aishell/wav/test/S0904/BAC009S0904W0331.wav", "txt": "大到空调电视机电脑微波炉加湿器"} -{"key": "BAC009S0904W0332", "wav": "./aishell/wav/test/S0904/BAC009S0904W0332.wav", "txt": "小到吹风机充电器甚至接线板都会产生电磁辐射"} -{"key": "BAC009S0904W0333", "wav": "./aishell/wav/test/S0904/BAC009S0904W0333.wav", "txt": "虽然电磁辐射无处不在"} -{"key": "BAC009S0904W0334", "wav": "./aishell/wav/test/S0904/BAC009S0904W0334.wav", "txt": "并非所有的电磁辐射都会对人体产生危害"} -{"key": "BAC009S0904W0335", "wav": "./aishell/wav/test/S0904/BAC009S0904W0335.wav", "txt": "中国电力科学研究院高级工程师邬雄表示"} -{"key": "BAC009S0904W0336", "wav": "./aishell/wav/test/S0904/BAC009S0904W0336.wav", "txt": "比如阳光也是一种电磁辐射"} -{"key": "BAC009S0904W0337", "wav": "./aishell/wav/test/S0904/BAC009S0904W0337.wav", "txt": "根据国际非电离辐射防护委员会制定的标准"} -{"key": "BAC009S0904W0338", "wav": "./aishell/wav/test/S0904/BAC009S0904W0338.wav", "txt": "北京市业馀排球联赛未来每年都将举办一届"} -{"key": "BAC009S0904W0339", "wav": "./aishell/wav/test/S0904/BAC009S0904W0339.wav", "txt": "并且会逐渐扩大比赛规模"} -{"key": "BAC009S0904W0340", "wav": "./aishell/wav/test/S0904/BAC009S0904W0340.wav", "txt": "筹备时间和比赛周期都将延长"} -{"key": "BAC009S0904W0341", "wav": "./aishell/wav/test/S0904/BAC009S0904W0341.wav", "txt": "参赛队伍数量也会有所提升"} -{"key": 
"BAC009S0904W0342", "wav": "./aishell/wav/test/S0904/BAC009S0904W0342.wav", "txt": "明年北京市业馀排球联赛将在中国排球协会备案"} -{"key": "BAC009S0904W0343", "wav": "./aishell/wav/test/S0904/BAC009S0904W0343.wav", "txt": "北京市排球协会与天津排协已经初步达成合作意向"} -{"key": "BAC009S0904W0344", "wav": "./aishell/wav/test/S0904/BAC009S0904W0344.wav", "txt": "今后北京与天津两地可能会联合办赛"} -{"key": "BAC009S0904W0345", "wav": "./aishell/wav/test/S0904/BAC009S0904W0345.wav", "txt": "通过冠军赛季后赛垫场赛等形式"} -{"key": "BAC009S0904W0346", "wav": "./aishell/wav/test/S0904/BAC009S0904W0346.wav", "txt": "通过未来几年的发展"} -{"key": "BAC009S0904W0347", "wav": "./aishell/wav/test/S0904/BAC009S0904W0347.wav", "txt": "影响力强的全国性比赛"} -{"key": "BAC009S0904W0348", "wav": "./aishell/wav/test/S0904/BAC009S0904W0348.wav", "txt": "高清图女排凯旋郎平受热捧"} -{"key": "BAC009S0904W0349", "wav": "./aishell/wav/test/S0904/BAC009S0904W0349.wav", "txt": "时隔一二年重夺世界杯冠军的中国女排"} -{"key": "BAC009S0904W0350", "wav": "./aishell/wav/test/S0904/BAC009S0904W0350.wav", "txt": "新队长曾春蕾揭秘了角色转变前后的幕后故事"} -{"key": "BAC009S0904W0351", "wav": "./aishell/wav/test/S0904/BAC009S0904W0351.wav", "txt": "并且介绍自己是如何通过实战调整状态而渐入佳境的"} -{"key": "BAC009S0904W0352", "wav": "./aishell/wav/test/S0904/BAC009S0904W0352.wav", "txt": "后两轮死磕俄罗斯和日本更是有红了眼的感觉"} -{"key": "BAC009S0904W0353", "wav": "./aishell/wav/test/S0904/BAC009S0904W0353.wav", "txt": "回忆起当时临危受命接班队长一职的情况"} -{"key": "BAC009S0904W0354", "wav": "./aishell/wav/test/S0904/BAC009S0904W0354.wav", "txt": "曾春蕾介绍是在中国女排出发的前一天"} -{"key": "BAC009S0904W0355", "wav": "./aishell/wav/test/S0904/BAC009S0904W0355.wav", "txt": "主教练郎平训练结束后通知她的"} -{"key": "BAC009S0904W0356", "wav": "./aishell/wav/test/S0904/BAC009S0904W0356.wav", "txt": "当时确实没有什么心理准备"} -{"key": "BAC009S0904W0357", "wav": "./aishell/wav/test/S0904/BAC009S0904W0357.wav", "txt": "虽然知道惠若琪的心脏不太好"} -{"key": "BAC009S0904W0358", "wav": "./aishell/wav/test/S0904/BAC009S0904W0358.wav", "txt": "但是也不好过问太多"} -{"key": "BAC009S0904W0359", "wav": "./aishell/wav/test/S0904/BAC009S0904W0359.wav", "txt": "结果等到的消息是她不能去世界杯"} -{"key": "BAC009S0904W0360", "wav": "./aishell/wav/test/S0904/BAC009S0904W0360.wav", "txt": "其实在二零一四年女排大奖赛的总决赛"} -{"key": "BAC009S0904W0361", "wav": "./aishell/wav/test/S0904/BAC009S0904W0361.wav", "txt": "曾春蕾就曾经临时客串过队长职务"} -{"key": "BAC009S0904W0362", "wav": "./aishell/wav/test/S0904/BAC009S0904W0362.wav", "txt": "不过和这次在世界杯当队长相比压力明显不同"} -{"key": "BAC009S0904W0363", "wav": "./aishell/wav/test/S0904/BAC009S0904W0363.wav", "txt": "这位北京姑娘直言在三大赛当队长的感觉很特殊"} -{"key": "BAC009S0904W0364", "wav": "./aishell/wav/test/S0904/BAC009S0904W0364.wav", "txt": "是心智上的一个考验"} -{"key": "BAC009S0904W0365", "wav": "./aishell/wav/test/S0904/BAC009S0904W0365.wav", "txt": "刚开始无谓的心理压力很大"} -{"key": "BAC009S0904W0366", "wav": "./aishell/wav/test/S0904/BAC009S0904W0366.wav", "txt": "甚至在头一场的比赛还影响到自己的技术发挥"} -{"key": "BAC009S0904W0367", "wav": "./aishell/wav/test/S0904/BAC009S0904W0367.wav", "txt": "好在队友们相互弥补得非常出色"} -{"key": "BAC009S0904W0368", "wav": "./aishell/wav/test/S0904/BAC009S0904W0368.wav", "txt": "曾春蕾通过自我调节而让竞技状态渐入佳境"} -{"key": "BAC009S0904W0369", "wav": "./aishell/wav/test/S0904/BAC009S0904W0369.wav", "txt": "在保障好技术稳定发挥的同时"} -{"key": "BAC009S0904W0370", "wav": "./aishell/wav/test/S0904/BAC009S0904W0370.wav", "txt": "还能够在情绪上带动队友"} -{"key": "BAC009S0904W0371", "wav": "./aishell/wav/test/S0904/BAC009S0904W0371.wav", "txt": "谈及当队长的责任感"} -{"key": "BAC009S0904W0372", "wav": "./aishell/wav/test/S0904/BAC009S0904W0372.wav", "txt": "曾春蕾认为中国女排的困难体现在伤病多"} -{"key": "BAC009S0904W0373", "wav": "./aishell/wav/test/S0904/BAC009S0904W0373.wav", "txt": 
"需要不停地告诫自己要淡定下来"} -{"key": "BAC009S0904W0374", "wav": "./aishell/wav/test/S0904/BAC009S0904W0374.wav", "txt": "毕竟她本人是经历过伦敦奥运会的"} -{"key": "BAC009S0904W0375", "wav": "./aishell/wav/test/S0904/BAC009S0904W0375.wav", "txt": "当队长的一举一动都会带来情绪上影响"} -{"key": "BAC009S0904W0376", "wav": "./aishell/wav/test/S0904/BAC009S0904W0376.wav", "txt": "因此一个眼神一个动作"} -{"key": "BAC009S0904W0377", "wav": "./aishell/wav/test/S0904/BAC009S0904W0377.wav", "txt": "都要给队友们传递乐观和放松的讯号"} -{"key": "BAC009S0904W0378", "wav": "./aishell/wav/test/S0904/BAC009S0904W0378.wav", "txt": "曾春蕾一记五米线的调整攻打得非常漂亮"} -{"key": "BAC009S0904W0379", "wav": "./aishell/wav/test/S0904/BAC009S0904W0379.wav", "txt": "评价自己发挥的最好一场其实就是本场比赛"} -{"key": "BAC009S0904W0380", "wav": "./aishell/wav/test/S0904/BAC009S0904W0380.wav", "txt": "因为和高手过招有种红了眼的感觉"} -{"key": "BAC009S0904W0381", "wav": "./aishell/wav/test/S0904/BAC009S0904W0381.wav", "txt": "个别球更是像释放怒火一般"} -{"key": "BAC009S0904W0382", "wav": "./aishell/wav/test/S0904/BAC009S0904W0382.wav", "txt": "桎梏挣脱开了就敢于发挥"} -{"key": "BAC009S0904W0383", "wav": "./aishell/wav/test/S0904/BAC009S0904W0383.wav", "txt": "由于中国女排的前期准备特别充分"} -{"key": "BAC009S0904W0384", "wav": "./aishell/wav/test/S0904/BAC009S0904W0384.wav", "txt": "这在曾春蕾看来打俄罗斯很有底"} -{"key": "BAC009S0904W0385", "wav": "./aishell/wav/test/S0904/BAC009S0904W0385.wav", "txt": "发挥也很从容和淡定"} -{"key": "BAC009S0904W0386", "wav": "./aishell/wav/test/S0904/BAC009S0904W0386.wav", "txt": "曾春蕾坦言打关键分的状态很忘我"} -{"key": "BAC009S0904W0387", "wav": "./aishell/wav/test/S0904/BAC009S0904W0387.wav", "txt": "打日本从来都不需要动员"} -{"key": "BAC009S0904W0388", "wav": "./aishell/wav/test/S0904/BAC009S0904W0388.wav", "txt": "队友彼此之间需要相互鼓励"} -{"key": "BAC009S0904W0389", "wav": "./aishell/wav/test/S0904/BAC009S0904W0389.wav", "txt": "但更多的是落实在技术环节的细腻方面"} -{"key": "BAC009S0904W0390", "wav": "./aishell/wav/test/S0904/BAC009S0904W0390.wav", "txt": "因为想要捧起来冠军奖杯的欲望太强烈"} -{"key": "BAC009S0904W0391", "wav": "./aishell/wav/test/S0904/BAC009S0904W0391.wav", "txt": "直通里约奥运会的目标也近在咫尺"} -{"key": "BAC009S0904W0392", "wav": "./aishell/wav/test/S0904/BAC009S0904W0392.wav", "txt": "身为大队员就会去提醒大家"} -{"key": "BAC009S0904W0393", "wav": "./aishell/wav/test/S0904/BAC009S0904W0393.wav", "txt": "将去年输球的原因作为教训反思"} -{"key": "BAC009S0904W0394", "wav": "./aishell/wav/test/S0904/BAC009S0904W0394.wav", "txt": "对垒日本女排有这样一个小细节"} -{"key": "BAC009S0904W0395", "wav": "./aishell/wav/test/S0904/BAC009S0904W0395.wav", "txt": "曾春蕾在刘晓彤一传失误后直接说我来"} -{"key": "BAC009S0904W0396", "wav": "./aishell/wav/test/S0904/BAC009S0904W0396.wav", "txt": "表明队长角色转换完成得还不错"} -{"key": "BAC009S0904W0397", "wav": "./aishell/wav/test/S0904/BAC009S0904W0397.wav", "txt": "自言就应该去承担更多的任务"} -{"key": "BAC009S0904W0398", "wav": "./aishell/wav/test/S0904/BAC009S0904W0398.wav", "txt": "曾春蕾保持着清醒的头脑"} -{"key": "BAC009S0904W0399", "wav": "./aishell/wav/test/S0904/BAC009S0904W0399.wav", "txt": "深知世界杯夺冠是对过去努力的肯定"} -{"key": "BAC009S0904W0400", "wav": "./aishell/wav/test/S0904/BAC009S0904W0400.wav", "txt": "但更多的是看到了自己的不足"} -{"key": "BAC009S0904W0401", "wav": "./aishell/wav/test/S0904/BAC009S0904W0401.wav", "txt": "也知道了未来需要努力的方向"} -{"key": "BAC009S0904W0402", "wav": "./aishell/wav/test/S0904/BAC009S0904W0402.wav", "txt": "视频中国三比一大胜俄罗斯独占女排世界杯榜首"} -{"key": "BAC009S0904W0403", "wav": "./aishell/wav/test/S0904/BAC009S0904W0403.wav", "txt": "日本二零一五女排世界杯单循环赛战至第十轮"} -{"key": "BAC009S0904W0404", "wav": "./aishell/wav/test/S0904/BAC009S0904W0404.wav", "txt": "不仅在全球收回十亿美元票房"} -{"key": "BAC009S0904W0405", "wav": "./aishell/wav/test/S0904/BAC009S0904W0405.wav", "txt": "且获得奥斯卡最佳动画片大奖"} 
-{"key": "BAC009S0904W0406", "wav": "./aishell/wav/test/S0904/BAC009S0904W0406.wav", "txt": "皮克斯终于有了拍摄玩具总动员四的计划"} -{"key": "BAC009S0904W0407", "wav": "./aishell/wav/test/S0904/BAC009S0904W0407.wav", "txt": "这部正在酝酿中的续集敲定了导演"} -{"key": "BAC009S0904W0408", "wav": "./aishell/wav/test/S0904/BAC009S0904W0408.wav", "txt": "但上台祝酒时都会交足戏"} -{"key": "BAC009S0904W0409", "wav": "./aishell/wav/test/S0904/BAC009S0904W0409.wav", "txt": "洪永城还主动跟陈展鹏碰杯"} -{"key": "BAC009S0904W0411", "wav": "./aishell/wav/test/S0904/BAC009S0904W0411.wav", "txt": "她自言最近在拍戏"} -{"key": "BAC009S0904W0412", "wav": "./aishell/wav/test/S0904/BAC009S0904W0412.wav", "txt": "戏中的角色常常以性感打扮示人"} -{"key": "BAC009S0904W0413", "wav": "./aishell/wav/test/S0904/BAC009S0904W0413.wav", "txt": "所以自己也很喜欢性感打扮"} -{"key": "BAC009S0904W0414", "wav": "./aishell/wav/test/S0904/BAC009S0904W0414.wav", "txt": "问及男友郑嘉颖会不会介意这么性感"} -{"key": "BAC009S0904W0416", "wav": "./aishell/wav/test/S0904/BAC009S0904W0416.wav", "txt": "这个程度是美的"} -{"key": "BAC009S0904W0417", "wav": "./aishell/wav/test/S0904/BAC009S0904W0417.wav", "txt": "他应该也是喜欢"} -{"key": "BAC009S0904W0418", "wav": "./aishell/wav/test/S0904/BAC009S0904W0418.wav", "txt": "问及最近是否有跟男友见面"} -{"key": "BAC009S0904W0420", "wav": "./aishell/wav/test/S0904/BAC009S0904W0420.wav", "txt": "自己也有一段时间没有跟他见面了"} -{"key": "BAC009S0904W0421", "wav": "./aishell/wav/test/S0904/BAC009S0904W0421.wav", "txt": "两人都是依赖电话沟通"} -{"key": "BAC009S0904W0422", "wav": "./aishell/wav/test/S0904/BAC009S0904W0422.wav", "txt": "自己也很期待九月中旬和男友见面"} -{"key": "BAC009S0904W0423", "wav": "./aishell/wav/test/S0904/BAC009S0904W0423.wav", "txt": "并大呼我自己也非常期待他回来"} -{"key": "BAC009S0904W0424", "wav": "./aishell/wav/test/S0904/BAC009S0904W0424.wav", "txt": "因为很久了很想念他"} -{"key": "BAC009S0904W0425", "wav": "./aishell/wav/test/S0904/BAC009S0904W0425.wav", "txt": "问及见面后两人怎样庆祝"} -{"key": "BAC009S0904W0426", "wav": "./aishell/wav/test/S0904/BAC009S0904W0426.wav", "txt": "她表示应该是吃吃饭看电影之类的"} -{"key": "BAC009S0904W0428", "wav": "./aishell/wav/test/S0904/BAC009S0904W0428.wav", "txt": "是否会请教男友拍戏上的问题"} -{"key": "BAC009S0904W0429", "wav": "./aishell/wav/test/S0904/BAC009S0904W0429.wav", "txt": "她透露有些不懂的会问男友郑嘉颖"} -{"key": "BAC009S0904W0430", "wav": "./aishell/wav/test/S0904/BAC009S0904W0430.wav", "txt": "对方给了她很大的帮助"} -{"key": "BAC009S0904W0431", "wav": "./aishell/wav/test/S0904/BAC009S0904W0431.wav", "txt": "图自网络温州网讯有网友爆料"} -{"key": "BAC009S0904W0432", "wav": "./aishell/wav/test/S0904/BAC009S0904W0432.wav", "txt": "温州瑞安一驾考考生在科目三考试中突然晕了过去"} -{"key": "BAC009S0904W0433", "wav": "./aishell/wav/test/S0904/BAC009S0904W0433.wav", "txt": "送到医院时已没有呼吸"} -{"key": "BAC009S0904W0434", "wav": "./aishell/wav/test/S0904/BAC009S0904W0434.wav", "txt": "现场图温都讯今天下午四时许"} -{"key": "BAC009S0904W0435", "wav": "./aishell/wav/test/S0904/BAC009S0904W0435.wav", "txt": "看来温州市区电梯也该大整修了"} -{"key": "BAC009S0904W0436", "wav": "./aishell/wav/test/S0904/BAC009S0904W0436.wav", "txt": "温州一网友造谣苏迪罗登陆期间水库崩塌被拘"} -{"key": "BAC009S0904W0437", "wav": "./aishell/wav/test/S0904/BAC009S0904W0437.wav", "txt": "澎湃新闻八月一零日从浙江温州平阳警方获悉"} -{"key": "BAC009S0904W0438", "wav": "./aishell/wav/test/S0904/BAC009S0904W0438.wav", "txt": "因在台风苏迪罗登陆期间在网络散布水库崩塌谣言"} -{"key": "BAC009S0904W0439", "wav": "./aishell/wav/test/S0904/BAC009S0904W0439.wav", "txt": "温州一路虎店隐瞒新车维修史被判赔三一四万"} -{"key": "BAC009S0904W0440", "wav": "./aishell/wav/test/S0904/BAC009S0904W0440.wav", "txt": "温州新力虎汽车销售公司展示台"} -{"key": "BAC009S0904W0441", "wav": "./aishell/wav/test/S0904/BAC009S0904W0441.wav", "txt": "温州一酒店窝头三八元一个三盘消费一七一零元"} -{"key": "BAC009S0904W0442", 
"wav": "./aishell/wav/test/S0904/BAC009S0904W0442.wav", "txt": "温州一闲置地块填满垃圾臭味浓烈附近居民不敢开窗"} -{"key": "BAC009S0904W0443", "wav": "./aishell/wav/test/S0904/BAC009S0904W0443.wav", "txt": "小区外的空置地上填满垃圾近日"} -{"key": "BAC009S0904W0444", "wav": "./aishell/wav/test/S0904/BAC009S0904W0444.wav", "txt": "与小区只有一河之隔的东边"} -{"key": "BAC009S0904W0445", "wav": "./aishell/wav/test/S0904/BAC009S0904W0445.wav", "txt": "因惠民路南段从去年开通后"} -{"key": "BAC009S0904W0446", "wav": "./aishell/wav/test/S0904/BAC009S0904W0446.wav", "txt": "对一块闲置地监管没有跟上"} -{"key": "BAC009S0904W0447", "wav": "./aishell/wav/test/S0904/BAC009S0904W0447.wav", "txt": "近一年来每天晚上有垃圾倒在这块闲置地上"} -{"key": "BAC009S0904W0448", "wav": "./aishell/wav/test/S0904/BAC009S0904W0448.wav", "txt": "垃圾刺鼻的臭味害得住户们连窗户都不敢打开"} -{"key": "BAC009S0904W0449", "wav": "./aishell/wav/test/S0904/BAC009S0904W0449.wav", "txt": "此前温州政协委员连续两届提出要求整治垃圾污染问题"} -{"key": "BAC009S0904W0450", "wav": "./aishell/wav/test/S0904/BAC009S0904W0450.wav", "txt": "日前本报记者前往实地调查了解"} -{"key": "BAC009S0904W0451", "wav": "./aishell/wav/test/S0904/BAC009S0904W0451.wav", "txt": "温州三学生为庆生爬上浙江第一高楼玩自拍"} -{"key": "BAC009S0904W0452", "wav": "./aishell/wav/test/S0904/BAC009S0904W0452.wav", "txt": "再上到楼顶一座高约四零米的铁塔上"} -{"key": "BAC009S0904W0453", "wav": "./aishell/wav/test/S0904/BAC009S0904W0453.wav", "txt": "并在铁塔上借助自拍杆合影"} -{"key": "BAC009S0904W0454", "wav": "./aishell/wav/test/S0904/BAC009S0904W0454.wav", "txt": "一则长达一分五七秒的视频在网络上热传"} -{"key": "BAC009S0904W0455", "wav": "./aishell/wav/test/S0904/BAC009S0904W0455.wav", "txt": "有网友称之为青春任性"} -{"key": "BAC009S0904W0456", "wav": "./aishell/wav/test/S0904/BAC009S0904W0456.wav", "txt": "温州三家熟食店摊主被捕为求卖相好添加日落黄"} -{"key": "BAC009S0904W0457", "wav": "./aishell/wav/test/S0904/BAC009S0904W0457.wav", "txt": "本报讯记者范跃红通讯员瓯文为了卖相好"} -{"key": "BAC009S0904W0458", "wav": "./aishell/wav/test/S0904/BAC009S0904W0458.wav", "txt": "温州两女孩溺水救护车因车多路堵错过救援时间"} -{"key": "BAC009S0904W0459", "wav": "./aishell/wav/test/S0904/BAC009S0904W0459.wav", "txt": "温州两男子为争女人驾奔驰宝马街头四次对撞"} -{"key": "BAC009S0904W0460", "wav": "./aishell/wav/test/S0904/BAC009S0904W0460.wav", "txt": "瑞安市商业街和联中路交叉口"} -{"key": "BAC009S0904W0461", "wav": "./aishell/wav/test/S0904/BAC009S0904W0461.wav", "txt": "一辆宝马和一辆奔驰四次相撞"} -{"key": "BAC009S0904W0462", "wav": "./aishell/wav/test/S0904/BAC009S0904W0462.wav", "txt": "两车损失高达数十万元"} -{"key": "BAC009S0904W0463", "wav": "./aishell/wav/test/S0904/BAC009S0904W0463.wav", "txt": "温州企业家卖房建养老院捐给当地却被闲置三年"} -{"key": "BAC009S0904W0464", "wav": "./aishell/wav/test/S0904/BAC009S0904W0464.wav", "txt": "浙江温州乐清七五岁的企业家虞一杰退休之后"} -{"key": "BAC009S0904W0465", "wav": "./aishell/wav/test/S0904/BAC009S0904W0465.wav", "txt": "拿出了自己全部的积蓄"} -{"key": "BAC009S0904W0466", "wav": "./aishell/wav/test/S0904/BAC009S0904W0466.wav", "txt": "还卖了自己在杭州和乐清的房子"} -{"key": "BAC009S0904W0467", "wav": "./aishell/wav/test/S0904/BAC009S0904W0467.wav", "txt": "但是养老院建成至今已经有三年了"} -{"key": "BAC009S0904W0468", "wav": "./aishell/wav/test/S0904/BAC009S0904W0468.wav", "txt": "那原因到底在哪里呢"} -{"key": "BAC009S0904W0469", "wav": "./aishell/wav/test/S0904/BAC009S0904W0469.wav", "txt": "温州体育局官员逼女教练陪酒当地纪委介入调查"} -{"key": "BAC009S0904W0470", "wav": "./aishell/wav/test/S0904/BAC009S0904W0470.wav", "txt": "以给编制五险一金等为借口"} -{"key": "BAC009S0904W0471", "wav": "./aishell/wav/test/S0904/BAC009S0904W0471.wav", "txt": "诱逼女教练陪他喝酒吃饭唱歌"} -{"key": "BAC009S0904W0472", "wav": "./aishell/wav/test/S0904/BAC009S0904W0472.wav", "txt": "并贴出多张聊天记录截图"} -{"key": "BAC009S0904W0473", "wav": "./aishell/wav/test/S0904/BAC009S0904W0473.wav", "txt": "立即引起众多网友关注"} -{"key": 
"BAC009S0904W0474", "wav": "./aishell/wav/test/S0904/BAC009S0904W0474.wav", "txt": "温州六旬老人辗转各地看公厕一二年还债七六万元"} -{"key": "BAC009S0904W0475", "wav": "./aishell/wav/test/S0904/BAC009S0904W0475.wav", "txt": "温州网讯我不想死后给后人说闲话"} -{"key": "BAC009S0904W0476", "wav": "./aishell/wav/test/S0904/BAC009S0904W0476.wav", "txt": "省吃俭用也要把该还的钱尽力还掉"} -{"key": "BAC009S0904W0477", "wav": "./aishell/wav/test/S0904/BAC009S0904W0477.wav", "txt": "让借给我钱的好心人"} -{"key": "BAC009S0904W0478", "wav": "./aishell/wav/test/S0904/BAC009S0904W0478.wav", "txt": "这是富林愚老人发自内心的一句话"} -{"key": "BAC009S0904W0479", "wav": "./aishell/wav/test/S0904/BAC009S0904W0479.wav", "txt": "温州农贸市场现注胶虾业内人称为增加重量"} -{"key": "BAC009S0904W0480", "wav": "./aishell/wav/test/S0904/BAC009S0904W0480.wav", "txt": "虾里有明显的胶状物质图片来源网友微信日前"} -{"key": "BAC009S0904W0481", "wav": "./aishell/wav/test/S0904/BAC009S0904W0481.wav", "txt": "回家后发现大虾体内竟然被注射了不明胶状物"} -{"key": "BAC009S0904W0482", "wav": "./aishell/wav/test/S0904/BAC009S0904W0482.wav", "txt": "瑞安市市场监管局玉海所介入调查"} -{"key": "BAC009S0904W0483", "wav": "./aishell/wav/test/S0904/BAC009S0904W0483.wav", "txt": "当事水产摊贩已退还郑女士一零零元购虾款"} -{"key": "BAC009S0904W0484", "wav": "./aishell/wav/test/S0904/BAC009S0904W0484.wav", "txt": "温州化工仓库起火殃及附近河流大量死鱼漂河面"} -{"key": "BAC009S0904W0485", "wav": "./aishell/wav/test/S0904/BAC009S0904W0485.wav", "txt": "图为几天前村民拍到的河面"} -{"key": "BAC009S0904W0486", "wav": "./aishell/wav/test/S0904/BAC009S0904W0486.wav", "txt": "温州医生夫妇贩婴被批捕女儿欲捐款替父赎罪"} -{"key": "BAC009S0904W0487", "wav": "./aishell/wav/test/S0904/BAC009S0904W0487.wav", "txt": "温州医生涉贩卖儿童谎称活婴是死婴骗父母放弃"} -{"key": "BAC009S0904W0488", "wav": "./aishell/wav/test/S0904/BAC009S0904W0488.wav", "txt": "参考消息网九月二五日报道新报称"} -{"key": "BAC009S0904W0489", "wav": "./aishell/wav/test/S0904/BAC009S0904W0489.wav", "txt": "继陕西富平妇产科医生张淑侠因贩卖婴儿被判刑之后"} -{"key": "BAC009S0904W0490", "wav": "./aishell/wav/test/S0904/BAC009S0904W0490.wav", "txt": "中国再现医生涉嫌拐卖婴儿的案例"} -{"key": "BAC009S0904W0491", "wav": "./aishell/wav/test/S0904/BAC009S0904W0491.wav", "txt": "一对来自浙江温州的医生夫妇涉案被捕"} -{"key": "BAC009S0904W0492", "wav": "./aishell/wav/test/S0904/BAC009S0904W0492.wav", "txt": "温州瑞安市发生一起违停女司机故意伤害交通协警案件"} -{"key": "BAC009S0904W0493", "wav": "./aishell/wav/test/S0904/BAC009S0904W0493.wav", "txt": "温州城管协管员掌掴女清洁工被停职"} -{"key": "BAC009S0904W0494", "wav": "./aishell/wav/test/S0904/BAC009S0904W0494.wav", "txt": "该段视频时长仅有六秒"} -{"key": "BAC009S0904W0495", "wav": "./aishell/wav/test/S0904/BAC009S0904W0495.wav", "txt": "一名路人疾步上前将男子拦开"} -{"key": "BAC009S0905W0121", "wav": "./aishell/wav/test/S0905/BAC009S0905W0121.wav", "txt": "公积金贷款额度的提高"} -{"key": "BAC009S0905W0122", "wav": "./aishell/wav/test/S0905/BAC009S0905W0122.wav", "txt": "确实降低了刚需人群购房成本"} -{"key": "BAC009S0905W0123", "wav": "./aishell/wav/test/S0905/BAC009S0905W0123.wav", "txt": "对房地产市场的利好影响将是长期的"} -{"key": "BAC009S0905W0124", "wav": "./aishell/wav/test/S0905/BAC009S0905W0124.wav", "txt": "而这一落地难题也会影响其对刚需市场的支持效力"} -{"key": "BAC009S0905W0125", "wav": "./aishell/wav/test/S0905/BAC009S0905W0125.wav", "txt": "这一公积金新政实际上仍然存在很多门槛"} -{"key": "BAC009S0905W0126", "wav": "./aishell/wav/test/S0905/BAC009S0905W0126.wav", "txt": "在住房公积金贷款的申请过程中"} -{"key": "BAC009S0905W0127", "wav": "./aishell/wav/test/S0905/BAC009S0905W0127.wav", "txt": "有些要与房企具体项目挂钩"} -{"key": "BAC009S0905W0128", "wav": "./aishell/wav/test/S0905/BAC009S0905W0128.wav", "txt": "在公积金贷款额度上调后一个月内"} -{"key": "BAC009S0905W0129", "wav": "./aishell/wav/test/S0905/BAC009S0905W0129.wav", "txt": "北京公积金贷款成交量上涨百分之五"} -{"key": "BAC009S0905W0130", "wav": 
"./aishell/wav/test/S0905/BAC009S0905W0130.wav", "txt": "金融市场总体平稳鲁指冲高回落"} -{"key": "BAC009S0905W0131", "wav": "./aishell/wav/test/S0905/BAC009S0905W0131.wav", "txt": "但专家预计短期央行仍可能会积极维稳"} -{"key": "BAC009S0905W0132", "wav": "./aishell/wav/test/S0905/BAC009S0905W0132.wav", "txt": "汇率较大概率维持双向"} -{"key": "BAC009S0905W0133", "wav": "./aishell/wav/test/S0905/BAC009S0905W0133.wav", "txt": "相关公司股票走势招商银行"} -{"key": "BAC009S0905W0134", "wav": "./aishell/wav/test/S0905/BAC009S0905W0134.wav", "txt": "降准降息或再掀收益率的下降潮"} -{"key": "BAC009S0905W0135", "wav": "./aishell/wav/test/S0905/BAC009S0905W0135.wav", "txt": "双降加上广州公积金贷款新政落地"} -{"key": "BAC009S0905W0136", "wav": "./aishell/wav/test/S0905/BAC009S0905W0136.wav", "txt": "上周末成为潜在买家们争相咨询看楼的时机"} -{"key": "BAC009S0905W0137", "wav": "./aishell/wav/test/S0905/BAC009S0905W0137.wav", "txt": "期待岁末能有更多利好出现"} -{"key": "BAC009S0905W0138", "wav": "./aishell/wav/test/S0905/BAC009S0905W0138.wav", "txt": "第四季度二手住宅成交量将环比增幅在百分之七以内"} -{"key": "BAC009S0905W0139", "wav": "./aishell/wav/test/S0905/BAC009S0905W0139.wav", "txt": "价格要到明年初才出现上涨"} -{"key": "BAC009S0905W0140", "wav": "./aishell/wav/test/S0905/BAC009S0905W0140.wav", "txt": "广州日报讯记者林琳上周五"} -{"key": "BAC009S0905W0141", "wav": "./aishell/wav/test/S0905/BAC009S0905W0141.wav", "txt": "再加上广州公积金贷款新政终于落地"} -{"key": "BAC009S0905W0142", "wav": "./aishell/wav/test/S0905/BAC009S0905W0142.wav", "txt": "一系列利好消息影响下的首个周末"} -{"key": "BAC009S0905W0143", "wav": "./aishell/wav/test/S0905/BAC009S0905W0143.wav", "txt": "买家积极咨询看楼"} -{"key": "BAC009S0905W0144", "wav": "./aishell/wav/test/S0905/BAC009S0905W0144.wav", "txt": "降息消息传出后首日"} -{"key": "BAC009S0905W0145", "wav": "./aishell/wav/test/S0905/BAC009S0905W0145.wav", "txt": "地铺门店咨询量与七月同期相比约有百分之七左右的增幅"} -{"key": "BAC009S0905W0146", "wav": "./aishell/wav/test/S0905/BAC009S0905W0146.wav", "txt": "满堂红链家市场研究部高级经理周峰透露"} -{"key": "BAC009S0905W0147", "wav": "./aishell/wav/test/S0905/BAC009S0905W0147.wav", "txt": "店均电话咨询量比上一个周末增加十一百分之左右"} -{"key": "BAC009S0905W0148", "wav": "./aishell/wav/test/S0905/BAC009S0905W0148.wav", "txt": "看楼量对比上一周末大概增加百分之七左右"} -{"key": "BAC009S0905W0149", "wav": "./aishell/wav/test/S0905/BAC009S0905W0149.wav", "txt": "不过他认为这种增幅并不算太明显"} -{"key": "BAC009S0905W0150", "wav": "./aishell/wav/test/S0905/BAC009S0905W0150.wav", "txt": "搜房网广州二手房电商集团市场部总监罗来平发现"} -{"key": "BAC009S0905W0151", "wav": "./aishell/wav/test/S0905/BAC009S0905W0151.wav", "txt": "市场上约有两成业主反价"} -{"key": "BAC009S0905W0152", "wav": "./aishell/wav/test/S0905/BAC009S0905W0152.wav", "txt": "一个天河区的中介人士告诉记者"} -{"key": "BAC009S0905W0153", "wav": "./aishell/wav/test/S0905/BAC009S0905W0153.wav", "txt": "市民对连续多次降息已经麻木了"} -{"key": "BAC009S0905W0154", "wav": "./aishell/wav/test/S0905/BAC009S0905W0154.wav", "txt": "公积金贷款新政出台"} -{"key": "BAC009S0905W0155", "wav": "./aishell/wav/test/S0905/BAC009S0905W0155.wav", "txt": "市场不可能那么快有反应"} -{"key": "BAC009S0905W0156", "wav": "./aishell/wav/test/S0905/BAC009S0905W0156.wav", "txt": "七月广州二手楼市交投升温的态势已相当明确"} -{"key": "BAC009S0905W0157", "wav": "./aishell/wav/test/S0905/BAC009S0905W0157.wav", "txt": "按照这一趋势发展下去"} -{"key": "BAC009S0905W0158", "wav": "./aishell/wav/test/S0905/BAC009S0905W0158.wav", "txt": "再加上央行降息以及公积金新政等利好的叠加效应"} -{"key": "BAC009S0905W0159", "wav": "./aishell/wav/test/S0905/BAC009S0905W0159.wav", "txt": "有望进一步激活买家在接近年底这段时间的入市积极性"} -{"key": "BAC009S0905W0160", "wav": "./aishell/wav/test/S0905/BAC009S0905W0160.wav", "txt": "据阳光家缘网站公布数据统计"} -{"key": "BAC009S0905W0161", "wav": "./aishell/wav/test/S0905/BAC009S0905W0161.wav", "txt": "广州二手住宅市场七月的网签量已达一千套"} -{"key": 
"BAC009S0905W0162", "wav": "./aishell/wav/test/S0905/BAC009S0905W0162.wav", "txt": "广州二手住宅市场网签量达一千套"} -{"key": "BAC009S0905W0163", "wav": "./aishell/wav/test/S0905/BAC009S0905W0163.wav", "txt": "超过五月七千套的水平"} -{"key": "BAC009S0905W0164", "wav": "./aishell/wav/test/S0905/BAC009S0905W0164.wav", "txt": "目前市场上的低价房源已基本消耗完毕"} -{"key": "BAC009S0905W0165", "wav": "./aishell/wav/test/S0905/BAC009S0905W0165.wav", "txt": "广州二手房迎来新一轮涨价潮"} -{"key": "BAC009S0905W0166", "wav": "./aishell/wav/test/S0905/BAC009S0905W0166.wav", "txt": "搜房网广州二手房统计中心数据显示"} -{"key": "BAC009S0905W0167", "wav": "./aishell/wav/test/S0905/BAC009S0905W0167.wav", "txt": "广州五月二手房均价为一千元每平方米"} -{"key": "BAC009S0905W0168", "wav": "./aishell/wav/test/S0905/BAC009S0905W0168.wav", "txt": "比月初增长了一百元每平方米"} -{"key": "BAC009S0905W0169", "wav": "./aishell/wav/test/S0905/BAC009S0905W0169.wav", "txt": "因此判断随着利好政策的实施和成交量的增加"} -{"key": "BAC009S0905W0170", "wav": "./aishell/wav/test/S0905/BAC009S0905W0170.wav", "txt": "今年的房价还会有上升空间"} -{"key": "BAC009S0905W0171", "wav": "./aishell/wav/test/S0905/BAC009S0905W0171.wav", "txt": "广州还是在执行严厉的限购政策"} -{"key": "BAC009S0905W0172", "wav": "./aishell/wav/test/S0905/BAC009S0905W0172.wav", "txt": "我预计市场成交量会有所增加"} -{"key": "BAC009S0905W0173", "wav": "./aishell/wav/test/S0905/BAC009S0905W0173.wav", "txt": "但增加的幅度不会太大"} -{"key": "BAC009S0905W0174", "wav": "./aishell/wav/test/S0905/BAC009S0905W0174.wav", "txt": "他预测今年剩馀的两个月中"} -{"key": "BAC009S0905W0175", "wav": "./aishell/wav/test/S0905/BAC009S0905W0175.wav", "txt": "昨日人民币汇率小幅走弱"} -{"key": "BAC009S0905W0176", "wav": "./aishell/wav/test/S0905/BAC009S0905W0176.wav", "txt": "人民币中间价"} -{"key": "BAC009S0905W0177", "wav": "./aishell/wav/test/S0905/BAC009S0905W0177.wav", "txt": "美丽北京大型绿色公益品牌项目"} -{"key": "BAC009S0905W0178", "wav": "./aishell/wav/test/S0905/BAC009S0905W0178.wav", "txt": "在岸人民币兑美元收盘下跌百分之一"} -{"key": "BAC009S0905W0179", "wav": "./aishell/wav/test/S0905/BAC009S0905W0179.wav", "txt": "双降后首日在岸人民币由弱转强"} -{"key": "BAC009S0905W0180", "wav": "./aishell/wav/test/S0905/BAC009S0905W0180.wav", "txt": "人民币成交额减少百分之一"} -{"key": "BAC009S0905W0181", "wav": "./aishell/wav/test/S0905/BAC009S0905W0181.wav", "txt": "报七千亿美元"} -{"key": "BAC009S0905W0182", "wav": "./aishell/wav/test/S0905/BAC009S0905W0182.wav", "txt": "上周五的双降政策让市场担忧"} -{"key": "BAC009S0905W0183", "wav": "./aishell/wav/test/S0905/BAC009S0905W0183.wav", "txt": "投金或在经济增长速度放缓形势下加速外流"} -{"key": "BAC009S0905W0184", "wav": "./aishell/wav/test/S0905/BAC009S0905W0184.wav", "txt": "投资者担心这将加重人民币所面临的压力"} -{"key": "BAC009S0905W0185", "wav": "./aishell/wav/test/S0905/BAC009S0905W0185.wav", "txt": "就在上周五双降公布之后"} -{"key": "BAC009S0905W0186", "wav": "./aishell/wav/test/S0905/BAC009S0905W0186.wav", "txt": "招商银行同业金融部高级分析师刘东亮指出"} -{"key": "BAC009S0905W0187", "wav": "./aishell/wav/test/S0905/BAC009S0905W0187.wav", "txt": "加快老旧渔船更新改造"} -{"key": "BAC009S0905W0188", "wav": "./aishell/wav/test/S0905/BAC009S0905W0188.wav", "txt": "不断增强农业可持续发展能力"} -{"key": "BAC009S0905W0189", "wav": "./aishell/wav/test/S0905/BAC009S0905W0189.wav", "txt": "创建国家现代农业示范区"} -{"key": "BAC009S0905W0190", "wav": "./aishell/wav/test/S0905/BAC009S0905W0190.wav", "txt": "加大示范区建设力度"} -{"key": "BAC009S0905W0191", "wav": "./aishell/wav/test/S0905/BAC009S0905W0191.wav", "txt": "加大示范目建设投入力度"} -{"key": "BAC009S0905W0192", "wav": "./aishell/wav/test/S0905/BAC009S0905W0192.wav", "txt": "努力打造现代农业发展的典型和样板"} -{"key": "BAC009S0905W0193", "wav": "./aishell/wav/test/S0905/BAC009S0905W0193.wav", "txt": "发挥示范区引领作用"} -{"key": "BAC009S0905W0194", "wav": 
"./aishell/wav/test/S0905/BAC009S0905W0194.wav", "txt": "通过产业拉动技术辐射和人员培训等"} -{"key": "BAC009S0905W0195", "wav": "./aishell/wav/test/S0905/BAC009S0905W0195.wav", "txt": "带动周边地区现代农业加快发展"} -{"key": "BAC009S0905W0196", "wav": "./aishell/wav/test/S0905/BAC009S0905W0196.wav", "txt": "引导各地鉴借示范区发展现代农业的好做法和好经验"} -{"key": "BAC009S0905W0197", "wav": "./aishell/wav/test/S0905/BAC009S0905W0197.wav", "txt": "推动创建不同层次特色鲜明的现代农业示范区"} -{"key": "BAC009S0905W0198", "wav": "./aishell/wav/test/S0905/BAC009S0905W0198.wav", "txt": "按照分类指导突出重点梯次推进的思路"} -{"key": "BAC009S0905W0199", "wav": "./aishell/wav/test/S0905/BAC009S0905W0199.wav", "txt": "以七区二十三带农业战略格局为核心"} -{"key": "BAC009S0905W0200", "wav": "./aishell/wav/test/S0905/BAC009S0905W0200.wav", "txt": "着力建设重点推进率先实现和稳步发展三类区域"} -{"key": "BAC009S0905W0201", "wav": "./aishell/wav/test/S0905/BAC009S0905W0201.wav", "txt": "引领全国现代农业加快发展"} -{"key": "BAC009S0905W0202", "wav": "./aishell/wav/test/S0905/BAC009S0905W0202.wav", "txt": "重点推进区域"} -{"key": "BAC009S0905W0203", "wav": "./aishell/wav/test/S0905/BAC009S0905W0203.wav", "txt": "农业生产技术较为成熟"} -{"key": "BAC009S0905W0204", "wav": "./aishell/wav/test/S0905/BAC009S0905W0204.wav", "txt": "农业生产条件具有良好基础"} -{"key": "BAC009S0905W0205", "wav": "./aishell/wav/test/S0905/BAC009S0905W0205.wav", "txt": "承担着主要农产品供给保证的主体功能"} -{"key": "BAC009S0905W0206", "wav": "./aishell/wav/test/S0905/BAC009S0905W0206.wav", "txt": "加快推进该区域现代农业建设"} -{"key": "BAC009S0905W0207", "wav": "./aishell/wav/test/S0905/BAC009S0905W0207.wav", "txt": "事关全国农业现代化进程和国家粮食安全大局"} -{"key": "BAC009S0905W0208", "wav": "./aishell/wav/test/S0905/BAC009S0905W0208.wav", "txt": "继续发挥该区域粮食安全基础保障作用"} -{"key": "BAC009S0905W0209", "wav": "./aishell/wav/test/S0905/BAC009S0905W0209.wav", "txt": "调动各方发展粮食生产积极性"} -{"key": "BAC009S0905W0210", "wav": "./aishell/wav/test/S0905/BAC009S0905W0210.wav", "txt": "以建设小麦玉米水稻大豆优势产业带为重点"} -{"key": "BAC009S0905W0211", "wav": "./aishell/wav/test/S0905/BAC009S0905W0211.wav", "txt": "深入开展粮食稳定增产行动"} -{"key": "BAC009S0905W0212", "wav": "./aishell/wav/test/S0905/BAC009S0905W0212.wav", "txt": "加强农田水利和高标准农田建设"} -{"key": "BAC009S0905W0213", "wav": "./aishell/wav/test/S0905/BAC009S0905W0213.wav", "txt": "提高农机装备和作业水平"} -{"key": "BAC009S0905W0214", "wav": "./aishell/wav/test/S0905/BAC009S0905W0214.wav", "txt": "大力开展高产创建和科技指导服务"} -{"key": "BAC009S0905W0215", "wav": "./aishell/wav/test/S0905/BAC009S0905W0215.wav", "txt": "推广防灾减灾增产关键技术"} -{"key": "BAC009S0905W0216", "wav": "./aishell/wav/test/S0905/BAC009S0905W0216.wav", "txt": "加快选育应用优良品种"} -{"key": "BAC009S0905W0217", "wav": "./aishell/wav/test/S0905/BAC009S0905W0217.wav", "txt": "大幅度提升粮食综合生产能力和现代化生产水平"} -{"key": "BAC009S0905W0218", "wav": "./aishell/wav/test/S0905/BAC009S0905W0218.wav", "txt": "大力发展粮食精深加工及仓储物流业"} -{"key": "BAC009S0905W0219", "wav": "./aishell/wav/test/S0905/BAC009S0905W0219.wav", "txt": "完善粮食仓储运输设备"} -{"key": "BAC009S0905W0220", "wav": "./aishell/wav/test/S0905/BAC009S0905W0220.wav", "txt": "引导龙头企业向优势产区集聚"} -{"key": "BAC009S0905W0221", "wav": "./aishell/wav/test/S0905/BAC009S0905W0221.wav", "txt": "提高粮食生产综合效益"} -{"key": "BAC009S0905W0222", "wav": "./aishell/wav/test/S0905/BAC009S0905W0222.wav", "txt": "其他主要农产品优势区"} -{"key": "BAC009S0905W0223", "wav": "./aishell/wav/test/S0905/BAC009S0905W0223.wav", "txt": "以及蔬菜蚕卓等农产品生产的主体区域"} -{"key": "BAC009S0905W0224", "wav": "./aishell/wav/test/S0905/BAC009S0905W0224.wav", "txt": "以建设区域内各类农产品优势产业带为重点"} -{"key": "BAC009S0905W0225", "wav": "./aishell/wav/test/S0905/BAC009S0905W0225.wav", "txt": "提高资源利用率和加工转化率"} -{"key": "BAC009S0905W0226", "wav": 
"./aishell/wav/test/S0905/BAC009S0905W0226.wav", "txt": "继续巩固棉油糖水果和蔬菜等产品供给保证地位"} -{"key": "BAC009S0905W0227", "wav": "./aishell/wav/test/S0905/BAC009S0905W0227.wav", "txt": "着力强化技术装备支撑"} -{"key": "BAC009S0905W0228", "wav": "./aishell/wav/test/S0905/BAC009S0905W0228.wav", "txt": "提高现代化生产水平"} -{"key": "BAC009S0905W0229", "wav": "./aishell/wav/test/S0905/BAC009S0905W0229.wav", "txt": "强化出口水产品生产基地功能"} -{"key": "BAC009S0905W0230", "wav": "./aishell/wav/test/S0905/BAC009S0905W0230.wav", "txt": "加快现代养殖业发展"} -{"key": "BAC009S0905W0231", "wav": "./aishell/wav/test/S0905/BAC009S0905W0231.wav", "txt": "率先实现区域"} -{"key": "BAC009S0905W0232", "wav": "./aishell/wav/test/S0905/BAC009S0905W0232.wav", "txt": "该区域交通区位市场和人力资源优势明显"} -{"key": "BAC009S0905W0233", "wav": "./aishell/wav/test/S0905/BAC009S0905W0233.wav", "txt": "资本技术等现代化生产要素集约化程度高"} -{"key": "BAC009S0905W0234", "wav": "./aishell/wav/test/S0905/BAC009S0905W0234.wav", "txt": "加快该区域现代农业建设"} -{"key": "BAC009S0905W0235", "wav": "./aishell/wav/test/S0905/BAC009S0905W0235.wav", "txt": "对于引领全国现代农业加快发展具有重要意义"} -{"key": "BAC009S0905W0236", "wav": "./aishell/wav/test/S0905/BAC009S0905W0236.wav", "txt": "东部沿海先导农业区"} -{"key": "BAC009S0905W0237", "wav": "./aishell/wav/test/S0905/BAC009S0905W0237.wav", "txt": "大力发展资本技术密集型农业"} -{"key": "BAC009S0905W0238", "wav": "./aishell/wav/test/S0905/BAC009S0905W0238.wav", "txt": "保持耕地面积不减少"} -{"key": "BAC009S0905W0239", "wav": "./aishell/wav/test/S0905/BAC009S0905W0239.wav", "txt": "探索企业化集团化发展模式"} -{"key": "BAC009S0905W0240", "wav": "./aishell/wav/test/S0905/BAC009S0905W0240.wav", "txt": "大力推进标准化生产和集约化经营"} -{"key": "BAC009S0905W0241", "wav": "./aishell/wav/test/S0905/BAC009S0905W0241.wav", "txt": "提高信息化优质化和品牌化水平"} -{"key": "BAC009S0905W0242", "wav": "./aishell/wav/test/S0905/BAC009S0905W0242.wav", "txt": "提升产品的科技含量和附加值"} -{"key": "BAC009S0905W0243", "wav": "./aishell/wav/test/S0905/BAC009S0905W0243.wav", "txt": "大城市郊区多功能农业区"} -{"key": "BAC009S0905W0244", "wav": "./aishell/wav/test/S0905/BAC009S0905W0244.wav", "txt": "主要指沿海地区以外的直辖市省会城市等大城市郊区"} -{"key": "BAC009S0905W0245", "wav": "./aishell/wav/test/S0905/BAC009S0905W0245.wav", "txt": "统筹推进新一轮菜篮子工程建设"} -{"key": "BAC009S0905W0246", "wav": "./aishell/wav/test/S0905/BAC009S0905W0246.wav", "txt": "合理确定大城市郊区菜篮子产品生产用地保有数量"} -{"key": "BAC009S0905W0247", "wav": "./aishell/wav/test/S0905/BAC009S0905W0247.wav", "txt": "提高大城市菜篮子产品的自给率"} -{"key": "BAC009S0905W0248", "wav": "./aishell/wav/test/S0905/BAC009S0905W0248.wav", "txt": "在稳定城市副食品供应保证能力的基础上"} -{"key": "BAC009S0905W0249", "wav": "./aishell/wav/test/S0905/BAC009S0905W0249.wav", "txt": "全面推进机械化标准化品牌化产业化发展"} -{"key": "BAC009S0905W0250", "wav": "./aishell/wav/test/S0905/BAC009S0905W0250.wav", "txt": "加快农田基础设备和现代农业装备建设"} -{"key": "BAC009S0905W0251", "wav": "./aishell/wav/test/S0905/BAC009S0905W0251.wav", "txt": "着力建设国家商品粮供给重点保证区"} -{"key": "BAC009S0905W0252", "wav": "./aishell/wav/test/S0905/BAC009S0905W0252.wav", "txt": "提升垦区现代农业发展水平"} -{"key": "BAC009S0905W0253", "wav": "./aishell/wav/test/S0905/BAC009S0905W0253.wav", "txt": "业界首次开始认真讨论苹果市值晋升万亿大关的潜力"} -{"key": "BAC009S0905W0254", "wav": "./aishell/wav/test/S0905/BAC009S0905W0254.wav", "txt": "苹果股票价格创下历史新高"} -{"key": "BAC009S0905W0255", "wav": "./aishell/wav/test/S0905/BAC009S0905W0255.wav", "txt": "苹果市值超过七千亿美元"} -{"key": "BAC009S0905W0256", "wav": "./aishell/wav/test/S0905/BAC009S0905W0256.wav", "txt": "如果按照每股一二七美元的股价来算"} -{"key": "BAC009S0905W0257", "wav": "./aishell/wav/test/S0905/BAC009S0905W0257.wav", "txt": "那么苹果市价约为七四四十亿美元"} -{"key": "BAC009S0905W0258", "wav": 
"./aishell/wav/test/S0905/BAC009S0905W0258.wav", "txt": "这一价格也是目前华尔街给出的最高估值"} -{"key": "BAC009S0905W0261", "wav": "./aishell/wav/test/S0905/BAC009S0905W0261.wav", "txt": "随着四克网络的在中国的展开"} -{"key": "BAC009S0905W0262", "wav": "./aishell/wav/test/S0905/BAC009S0905W0262.wav", "txt": "苹果对电动汽车表现出的浓厚兴趣"} -{"key": "BAC009S0905W0263", "wav": "./aishell/wav/test/S0905/BAC009S0905W0263.wav", "txt": "也能够给股票市场来带更多兴奋"} -{"key": "BAC009S0905W0264", "wav": "./aishell/wav/test/S0905/BAC009S0905W0264.wav", "txt": "苹果将继续向股东返还现金"} -{"key": "BAC009S0905W0265", "wav": "./aishell/wav/test/S0905/BAC009S0905W0265.wav", "txt": "四月份或将采取更多的举动"} -{"key": "BAC009S0905W0266", "wav": "./aishell/wav/test/S0905/BAC009S0905W0266.wav", "txt": "这些力量的结合将会推动苹果的市盈率大幅上正"} -{"key": "BAC009S0905W0267", "wav": "./aishell/wav/test/S0905/BAC009S0905W0267.wav", "txt": "苹果公司的市价将突破一万亿美金大关"} -{"key": "BAC009S0905W0268", "wav": "./aishell/wav/test/S0905/BAC009S0905W0268.wav", "txt": "这只是最乐观的估计"} -{"key": "BAC009S0905W0269", "wav": "./aishell/wav/test/S0905/BAC009S0905W0269.wav", "txt": "苹果在成长为万亿美元市场的巨无霸之前"} -{"key": "BAC009S0905W0270", "wav": "./aishell/wav/test/S0905/BAC009S0905W0270.wav", "txt": "还有很多阻碍要解决"} -{"key": "BAC009S0905W0271", "wav": "./aishell/wav/test/S0905/BAC009S0905W0271.wav", "txt": "先是价格昂贵功能鸡肋的特点遭到一众业内人士吐槽"} -{"key": "BAC009S0905W0273", "wav": "./aishell/wav/test/S0905/BAC009S0905W0273.wav", "txt": "屏幕良品率仅在百分之三十至百分之四十之间"} -{"key": "BAC009S0905W0274", "wav": "./aishell/wav/test/S0905/BAC009S0905W0274.wav", "txt": "苹果公司现在已将约三百万的原始订单削减了一半"} -{"key": "BAC009S0905W0275", "wav": "./aishell/wav/test/S0905/BAC009S0905W0275.wav", "txt": "准备和特斯拉一较高下"} -{"key": "BAC009S0905W0276", "wav": "./aishell/wav/test/S0905/BAC009S0905W0276.wav", "txt": "但相对于传统的汽车制造工业"} -{"key": "BAC009S0905W0277", "wav": "./aishell/wav/test/S0905/BAC009S0905W0277.wav", "txt": "苹果作为消费数码产品的公司是否具备造车能力"} -{"key": "BAC009S0905W0279", "wav": "./aishell/wav/test/S0905/BAC009S0905W0279.wav", "txt": "目前大部分华尔街分析师们都对苹果的未来保持乐观"} -{"key": "BAC009S0905W0280", "wav": "./aishell/wav/test/S0905/BAC009S0905W0280.wav", "txt": "仅有三点百分之四的分析师建议卖出"} -{"key": "BAC009S0905W0281", "wav": "./aishell/wav/test/S0905/BAC009S0905W0281.wav", "txt": "中国经营网注有国外媒体报道称"} -{"key": "BAC009S0905W0282", "wav": "./aishell/wav/test/S0905/BAC009S0905W0282.wav", "txt": "苹果市场价值达到七千亿美元刚刚过去几个月"} -{"key": "BAC009S0905W0283", "wav": "./aishell/wav/test/S0905/BAC009S0905W0283.wav", "txt": "已经有股票经纪公司预测"} -{"key": "BAC009S0905W0284", "wav": "./aishell/wav/test/S0905/BAC009S0905W0284.wav", "txt": "苹果能否摆脱王者魔咒"} -{"key": "BAC009S0905W0285", "wav": "./aishell/wav/test/S0905/BAC009S0905W0285.wav", "txt": "苹果晋身道指固属众望所归"} -{"key": "BAC009S0905W0287", "wav": "./aishell/wav/test/S0905/BAC009S0905W0287.wav", "txt": "而苹果得以顺利跻身道指"} -{"key": "BAC009S0905W0288", "wav": "./aishell/wav/test/S0905/BAC009S0905W0288.wav", "txt": "亦拜股份去年六月一拆七所赐"} -{"key": "BAC009S0905W0289", "wav": "./aishell/wav/test/S0905/BAC009S0905W0289.wav", "txt": "却完全不足以彰显编制机构与时并进"} -{"key": "BAC009S0905W0290", "wav": "./aishell/wav/test/S0905/BAC009S0905W0290.wav", "txt": "苹果固然不会因此而升格"} -{"key": "BAC009S0905W0292", "wav": "./aishell/wav/test/S0905/BAC009S0905W0292.wav", "txt": "毕竟还有许多人的心愿"} -{"key": "BAC009S0905W0293", "wav": "./aishell/wav/test/S0905/BAC009S0905W0293.wav", "txt": "老毕于跟苹果押注太阳能一文问过大家"} -{"key": "BAC009S0905W0294", "wav": "./aishell/wav/test/S0905/BAC009S0905W0294.wav", "txt": "苹果股价在说不准的时间内有望上升三成"} -{"key": "BAC009S0905W0295", "wav": "./aishell/wav/test/S0905/BAC009S0905W0295.wav", "txt": "是否能令捧场客心满意足"} -{"key": "BAC009S0905W0296", "wav": 
"./aishell/wav/test/S0905/BAC009S0905W0296.wav", "txt": "问题焦点若是太阳能"} -{"key": "BAC009S0905W0297", "wav": "./aishell/wav/test/S0905/BAC009S0905W0297.wav", "txt": "诸位自然不会满足于前面提及的潜在回报"} -{"key": "BAC009S0905W0298", "wav": "./aishell/wav/test/S0905/BAC009S0905W0298.wav", "txt": "这家市值离万亿美元不远的股王"} -{"key": "BAC009S0905W0299", "wav": "./aishell/wav/test/S0905/BAC009S0905W0299.wav", "txt": "难不成真能第三期发育"} -{"key": "BAC009S0905W0300", "wav": "./aishell/wav/test/S0905/BAC009S0905W0300.wav", "txt": "读者若信经济学人"} -{"key": "BAC009S0905W0302", "wav": "./aishell/wav/test/S0905/BAC009S0905W0302.wav", "txt": "若定苹果第三期发育的立场已呼之欲出"} -{"key": "BAC009S0905W0303", "wav": "./aishell/wav/test/S0905/BAC009S0905W0303.wav", "txt": "手机辐射的比吸收率最高限值为二瓦特每千克"} -{"key": "BAC009S0905W0304", "wav": "./aishell/wav/test/S0905/BAC009S0905W0304.wav", "txt": "我国的标准和国际差不多"} -{"key": "BAC009S0905W0306", "wav": "./aishell/wav/test/S0905/BAC009S0905W0306.wav", "txt": "对生活中的电磁辐射进行了全面健康风险评估"} -{"key": "BAC009S0905W0307", "wav": "./aishell/wav/test/S0905/BAC009S0905W0307.wav", "txt": "不存在实际健康问题"} -{"key": "BAC009S0905W0308", "wav": "./aishell/wav/test/S0905/BAC009S0905W0308.wav", "txt": "辐射吸收率在国家的安全标准范围之内"} -{"key": "BAC009S0905W0309", "wav": "./aishell/wav/test/S0905/BAC009S0905W0309.wav", "txt": "电话手表的辐射主要来自天线"} -{"key": "BAC009S0905W0310", "wav": "./aishell/wav/test/S0905/BAC009S0905W0310.wav", "txt": "包括外置天线和内置天线"} -{"key": "BAC009S0905W0311", "wav": "./aishell/wav/test/S0905/BAC009S0905W0311.wav", "txt": "正规厂家生产的电话手表辐射一般符合国家标准"} -{"key": "BAC009S0905W0312", "wav": "./aishell/wav/test/S0905/BAC009S0905W0312.wav", "txt": "以小天才电话手表为例"} -{"key": "BAC009S0905W0313", "wav": "./aishell/wav/test/S0905/BAC009S0905W0313.wav", "txt": "根据权威机构检测报告显示"} -{"key": "BAC009S0905W0314", "wav": "./aishell/wav/test/S0905/BAC009S0905W0314.wav", "txt": "小天才电话手表辐射远小于国家标准二瓦特每千克"} -{"key": "BAC009S0905W0315", "wav": "./aishell/wav/test/S0905/BAC009S0905W0315.wav", "txt": "只要辐射值小于或等于国家标准值"} -{"key": "BAC009S0905W0316", "wav": "./aishell/wav/test/S0905/BAC009S0905W0316.wav", "txt": "就是符合国家标准的"} -{"key": "BAC009S0905W0317", "wav": "./aishell/wav/test/S0905/BAC009S0905W0317.wav", "txt": "小天才负责人介绍说"} -{"key": "BAC009S0905W0318", "wav": "./aishell/wav/test/S0905/BAC009S0905W0318.wav", "txt": "手机是直接贴着耳朵使用"} -{"key": "BAC009S0905W0319", "wav": "./aishell/wav/test/S0905/BAC009S0905W0319.wav", "txt": "而电话手表通话时离头部还有一百零一百一十五厘米的距离"} -{"key": "BAC009S0905W0320", "wav": "./aishell/wav/test/S0905/BAC009S0905W0320.wav", "txt": "可见电话手表的辐射比手机还小"} -{"key": "BAC009S0905W0321", "wav": "./aishell/wav/test/S0905/BAC009S0905W0321.wav", "txt": "不排除有一些杂牌的电话手表辐射会超标"} -{"key": "BAC009S0905W0322", "wav": "./aishell/wav/test/S0905/BAC009S0905W0322.wav", "txt": "建议家长通过正规渠道购买正规厂家生产的产品"} -{"key": "BAC009S0905W0323", "wav": "./aishell/wav/test/S0905/BAC009S0905W0323.wav", "txt": "电话手表应如何选购"} -{"key": "BAC009S0905W0324", "wav": "./aishell/wav/test/S0905/BAC009S0905W0324.wav", "txt": "关于儿童电话手表应该如何选购"} -{"key": "BAC009S0905W0325", "wav": "./aishell/wav/test/S0905/BAC009S0905W0325.wav", "txt": "也是众多家长特别想了解的"} -{"key": "BAC009S0905W0326", "wav": "./aishell/wav/test/S0905/BAC009S0905W0326.wav", "txt": "除了之前提到的关于辐射的测试报告外"} -{"key": "BAC009S0905W0327", "wav": "./aishell/wav/test/S0905/BAC009S0905W0327.wav", "txt": "专家提醒相关的产品认证也是消费者必须要关注的"} -{"key": "BAC009S0905W0328", "wav": "./aishell/wav/test/S0905/BAC009S0905W0328.wav", "txt": "所有在中国境内销售及使用的无线电组件产品"} -{"key": "BAC009S0905W0329", "wav": "./aishell/wav/test/S0905/BAC009S0905W0329.wav", "txt": "必须取得无线电型号的核准认证"} -{"key": "BAC009S0905W0330", "wav": 
"./aishell/wav/test/S0905/BAC009S0905W0330.wav", "txt": "没有该认证的产品属于违法产品"} -{"key": "BAC009S0905W0331", "wav": "./aishell/wav/test/S0905/BAC009S0905W0331.wav", "txt": "未获得进网许可证的"} -{"key": "BAC009S0905W0332", "wav": "./aishell/wav/test/S0905/BAC009S0905W0332.wav", "txt": "不得接入公用电信网使用和在国内销售"} -{"key": "BAC009S0905W0333", "wav": "./aishell/wav/test/S0905/BAC009S0905W0333.wav", "txt": "小天才电话手表等国内几个大品牌都有"} -{"key": "BAC009S0905W0334", "wav": "./aishell/wav/test/S0905/BAC009S0905W0334.wav", "txt": "这也是选购电话手表要注意关注的"} -{"key": "BAC009S0905W0335", "wav": "./aishell/wav/test/S0905/BAC009S0905W0335.wav", "txt": "很多家长都在给孩子购置各种学习用"} -{"key": "BAC009S0905W0336", "wav": "./aishell/wav/test/S0905/BAC009S0905W0336.wav", "txt": "网络安全漏洞挡道车联网阴霾笼罩搜狐科技"} -{"key": "BAC009S0905W0337", "wav": "./aishell/wav/test/S0905/BAC009S0905W0337.wav", "txt": "对频频的骚扰电话显得无可奈何"} -{"key": "BAC009S0905W0338", "wav": "./aishell/wav/test/S0905/BAC009S0905W0338.wav", "txt": "由郎平挂帅的中国女排在名古屋赛区"} -{"key": "BAC009S0905W0339", "wav": "./aishell/wav/test/S0905/BAC009S0905W0339.wav", "txt": "提升战绩为九胜一负反超至榜首位置"} -{"key": "BAC009S0905W0340", "wav": "./aishell/wav/test/S0905/BAC009S0905W0340.wav", "txt": "只要在明天的最后一战中赢下东道主日本"} -{"key": "BAC009S0905W0341", "wav": "./aishell/wav/test/S0905/BAC009S0905W0341.wav", "txt": "高清女排力擒俄罗斯夺冠占主动众将喜极而泣"} -{"key": "BAC009S0905W0342", "wav": "./aishell/wav/test/S0905/BAC009S0905W0342.wav", "txt": "能够赢得比赛真的很开心"} -{"key": "BAC009S0905W0343", "wav": "./aishell/wav/test/S0905/BAC009S0905W0343.wav", "txt": "对手给我们制造了非常多的困难"} -{"key": "BAC009S0905W0344", "wav": "./aishell/wav/test/S0905/BAC009S0905W0344.wav", "txt": "我和队友们一起团结努力克服了这些困难"} -{"key": "BAC009S0905W0345", "wav": "./aishell/wav/test/S0905/BAC009S0905W0345.wav", "txt": "在今天的比赛中曾春蕾首发出场"} -{"key": "BAC009S0905W0346", "wav": "./aishell/wav/test/S0905/BAC009S0905W0346.wav", "txt": "凭借十三分位列本队和扣球榜第二位"} -{"key": "BAC009S0905W0347", "wav": "./aishell/wav/test/S0905/BAC009S0905W0347.wav", "txt": "而主教练郎平则在全面性方面对大家做了更多要求"} -{"key": "BAC009S0905W0348", "wav": "./aishell/wav/test/S0905/BAC009S0905W0348.wav", "txt": "说到今天获胜的原因"} -{"key": "BAC009S0905W0349", "wav": "./aishell/wav/test/S0905/BAC009S0905W0349.wav", "txt": "作为队长出席新闻发布会的曾春蕾提到了凝聚力三个字"} -{"key": "BAC009S0905W0350", "wav": "./aishell/wav/test/S0905/BAC009S0905W0350.wav", "txt": "凝聚力一直都是中国女排的传统"} -{"key": "BAC009S0905W0351", "wav": "./aishell/wav/test/S0905/BAC009S0905W0351.wav", "txt": "它都是女排精神的一部分"} -{"key": "BAC009S0905W0352", "wav": "./aishell/wav/test/S0905/BAC009S0905W0352.wav", "txt": "当队伍遇到一些困难的时候"} -{"key": "BAC009S0905W0353", "wav": "./aishell/wav/test/S0905/BAC009S0905W0353.wav", "txt": "我们不需要教练要求就会团结在一起"} -{"key": "BAC009S0905W0354", "wav": "./aishell/wav/test/S0905/BAC009S0905W0354.wav", "txt": "像这种无形的向心力是在队伍中一直存在的"} -{"key": "BAC009S0905W0355", "wav": "./aishell/wav/test/S0905/BAC009S0905W0355.wav", "txt": "在今天的比赛中中国女排始终相互鼓励相互扶持"} -{"key": "BAC009S0905W0356", "wav": "./aishell/wav/test/S0905/BAC009S0905W0356.wav", "txt": "在几度遇险的情况下顽强咬住"} -{"key": "BAC009S0905W0357", "wav": "./aishell/wav/test/S0905/BAC009S0905W0357.wav", "txt": "无论年轻队员还是老队员都可能在比赛中出现起伏"} -{"key": "BAC009S0905W0358", "wav": "./aishell/wav/test/S0905/BAC009S0905W0358.wav", "txt": "我们要做的就是相互弥补"} -{"key": "BAC009S0905W0359", "wav": "./aishell/wav/test/S0905/BAC009S0905W0359.wav", "txt": "今天作为队长我更多是在精神层面上提醒大家"} -{"key": "BAC009S0905W0360", "wav": "./aishell/wav/test/S0905/BAC009S0905W0360.wav", "txt": "而在技术上年轻队员也弥补了我的不足"} -{"key": "BAC009S0905W0361", "wav": "./aishell/wav/test/S0905/BAC009S0905W0361.wav", "txt": "这是我们每个人都应该做的"} -{"key": 
"BAC009S0905W0362", "wav": "./aishell/wav/test/S0905/BAC009S0905W0362.wav", "txt": "如果能够战而胜之的话"} -{"key": "BAC009S0905W0363", "wav": "./aishell/wav/test/S0905/BAC009S0905W0363.wav", "txt": "明天还剩最后一场比赛"} -{"key": "BAC009S0905W0364", "wav": "./aishell/wav/test/S0905/BAC009S0905W0364.wav", "txt": "对我们来讲最重要的就是兢兢业业"} -{"key": "BAC009S0905W0365", "wav": "./aishell/wav/test/S0905/BAC009S0905W0365.wav", "txt": "大家回去之后将马上投入到对日本的准备中"} -{"key": "BAC009S0905W0366", "wav": "./aishell/wav/test/S0905/BAC009S0905W0366.wav", "txt": "明天比赛里我们会冷静下来落实到细节"} -{"key": "BAC009S0905W0367", "wav": "./aishell/wav/test/S0905/BAC009S0905W0367.wav", "txt": "一分分和对手拼到最后"} -{"key": "BAC009S0905W0368", "wav": "./aishell/wav/test/S0905/BAC009S0905W0368.wav", "txt": "北京时间明天晚间十八点"} -{"key": "BAC009S0905W0369", "wav": "./aishell/wav/test/S0905/BAC009S0905W0369.wav", "txt": "中国女排将应战日本队"} -{"key": "BAC009S0905W0370", "wav": "./aishell/wav/test/S0905/BAC009S0905W0370.wav", "txt": "搜狐体育郭健文"} -{"key": "BAC009S0905W0371", "wav": "./aishell/wav/test/S0905/BAC009S0905W0371.wav", "txt": "女排三零阿根廷朱婷复出扣杀状态神勇"} -{"key": "BAC009S0905W0372", "wav": "./aishell/wav/test/S0905/BAC009S0905W0372.wav", "txt": "搜狐体育郭健九月一日发自日本冈山今天下午"} -{"key": "BAC009S0905W0373", "wav": "./aishell/wav/test/S0905/BAC009S0905W0373.wav", "txt": "二零一五年第十二届女排世界杯单循环赛战至第八轮"} -{"key": "BAC009S0905W0374", "wav": "./aishell/wav/test/S0905/BAC009S0905W0374.wav", "txt": "从而将战绩提升为七胜一负积二十一分"} -{"key": "BAC009S0905W0375", "wav": "./aishell/wav/test/S0905/BAC009S0905W0375.wav", "txt": "本场比赛朱婷复出担任首发主攻并当选为当场最佳"} -{"key": "BAC009S0905W0376", "wav": "./aishell/wav/test/S0905/BAC009S0905W0376.wav", "txt": "虽然在比赛中没有得到出场机会"} -{"key": "BAC009S0905W0377", "wav": "./aishell/wav/test/S0905/BAC009S0905W0377.wav", "txt": "但曾春蕾赛后还是以队长身份出席了新闻发布会"} -{"key": "BAC009S0905W0378", "wav": "./aishell/wav/test/S0905/BAC009S0905W0378.wav", "txt": "很开心赢得今天的比赛"} -{"key": "BAC009S0905W0379", "wav": "./aishell/wav/test/S0905/BAC009S0905W0379.wav", "txt": "队伍凭借稳定的整体发挥获得了三零的胜利"} -{"key": "BAC009S0905W0380", "wav": "./aishell/wav/test/S0905/BAC009S0905W0380.wav", "txt": "曾春蕾表示阿根廷是一支拥有良好防守能力的球队"} -{"key": "BAC009S0905W0381", "wav": "./aishell/wav/test/S0905/BAC009S0905W0381.wav", "txt": "这一点也值得中国女排学习"} -{"key": "BAC009S0905W0382", "wav": "./aishell/wav/test/S0905/BAC009S0905W0382.wav", "txt": "中国女排队长坦言不仅是后面的几场比赛"} -{"key": "BAC009S0905W0383", "wav": "./aishell/wav/test/S0905/BAC009S0905W0383.wav", "txt": "每场较量对球队都很关键"} -{"key": "BAC009S0905W0384", "wav": "./aishell/wav/test/S0905/BAC009S0905W0384.wav", "txt": "我们球员要做的就是立足于自己"} -{"key": "BAC009S0905W0385", "wav": "./aishell/wav/test/S0905/BAC009S0905W0385.wav", "txt": "争取把自身水平发挥出来"} -{"key": "BAC009S0905W0386", "wav": "./aishell/wav/test/S0905/BAC009S0905W0386.wav", "txt": "至于其他球队的比赛结果"} -{"key": "BAC009S0905W0387", "wav": "./aishell/wav/test/S0905/BAC009S0905W0387.wav", "txt": "阿根廷队队长索萨认为"} -{"key": "BAC009S0905W0388", "wav": "./aishell/wav/test/S0905/BAC009S0905W0388.wav", "txt": "中国队的快速打法给自己的球队制造了很大的麻烦"} -{"key": "BAC009S0905W0389", "wav": "./aishell/wav/test/S0905/BAC009S0905W0389.wav", "txt": "像她们这样的亚洲对手速度很快"} -{"key": "BAC009S0905W0390", "wav": "./aishell/wav/test/S0905/BAC009S0905W0390.wav", "txt": "对我们来说比赛很困难"} -{"key": "BAC009S0905W0391", "wav": "./aishell/wav/test/S0905/BAC009S0905W0391.wav", "txt": "还有三场非常重要的比赛"} -{"key": "BAC009S0905W0392", "wav": "./aishell/wav/test/S0905/BAC009S0905W0392.wav", "txt": "希望得到想要的结果"} -{"key": "BAC009S0905W0393", "wav": "./aishell/wav/test/S0905/BAC009S0905W0393.wav", "txt": "对阵中国这样的球队是非常困难的"} -{"key": "BAC009S0905W0394", 
"wav": "./aishell/wav/test/S0905/BAC009S0905W0394.wav", "txt": "令我满意的是球队能够以一个积极的态度进行比赛"} -{"key": "BAC009S0905W0395", "wav": "./aishell/wav/test/S0905/BAC009S0905W0395.wav", "txt": "以前接触比较多的巴西队速度也很快"} -{"key": "BAC009S0905W0396", "wav": "./aishell/wav/test/S0905/BAC009S0905W0396.wav", "txt": "我们应该多和亚洲球队比赛来适应这样的打法"} -{"key": "BAC009S0905W0397", "wav": "./aishell/wav/test/S0905/BAC009S0905W0397.wav", "txt": "接下来中国女排将转战名古屋"} -{"key": "BAC009S0905W0398", "wav": "./aishell/wav/test/S0905/BAC009S0905W0398.wav", "txt": "从九月四日起迎接多米尼加俄罗斯和日本的挑战"} -{"key": "BAC009S0905W0399", "wav": "./aishell/wav/test/S0905/BAC009S0905W0399.wav", "txt": "搜狐体育郭健文"} -{"key": "BAC009S0905W0400", "wav": "./aishell/wav/test/S0905/BAC009S0905W0400.wav", "txt": "广州日报社记者许胚日前"} -{"key": "BAC009S0905W0401", "wav": "./aishell/wav/test/S0905/BAC009S0905W0401.wav", "txt": "英国人保拉拉德克利夫公开了自己的血液检测结果"} -{"key": "BAC009S0905W0402", "wav": "./aishell/wav/test/S0905/BAC009S0905W0402.wav", "txt": "以此证明自己并没有使用过违禁药物"} -{"key": "BAC009S0905W0403", "wav": "./aishell/wav/test/S0905/BAC009S0905W0403.wav", "txt": "在英国议会关于血液兴奋剂的听证会中"} -{"key": "BAC009S0905W0404", "wav": "./aishell/wav/test/S0905/BAC009S0905W0404.wav", "txt": "将出任玩具总动员四的导演"} -{"key": "BAC009S0905W0405", "wav": "./aishell/wav/test/S0905/BAC009S0905W0405.wav", "txt": "影片将在二零一七年登陆全国"} -{"key": "BAC009S0905W0406", "wav": "./aishell/wav/test/S0905/BAC009S0905W0406.wav", "txt": "来源时光网昨日"} -{"key": "BAC009S0905W0407", "wav": "./aishell/wav/test/S0905/BAC009S0905W0407.wav", "txt": "在英格兰多塞特群的波维顿坦克博物馆"} -{"key": "BAC009S0905W0408", "wav": "./aishell/wav/test/S0905/BAC009S0905W0408.wav", "txt": "至于有传拍台庆剧很容易获奖"} -{"key": "BAC009S0905W0410", "wav": "./aishell/wav/test/S0905/BAC009S0905W0410.wav", "txt": "她笑称我不想说我没有信心"} -{"key": "BAC009S0905W0411", "wav": "./aishell/wav/test/S0905/BAC009S0905W0411.wav", "txt": "很多演员都非常棒"} -{"key": "BAC009S0905W0412", "wav": "./aishell/wav/test/S0905/BAC009S0905W0412.wav", "txt": "搜狐娱乐讯北京时间七月二十日消息"} -{"key": "BAC009S0905W0413", "wav": "./aishell/wav/test/S0905/BAC009S0905W0413.wav", "txt": "据香港媒体报导"} -{"key": "BAC009S0905W0417", "wav": "./aishell/wav/test/S0905/BAC009S0905W0417.wav", "txt": "不到几个月的时间已爱得如此火热了"} -{"key": "BAC009S0905W0418", "wav": "./aishell/wav/test/S0905/BAC009S0905W0418.wav", "txt": "两人不想恋情变得高调"} -{"key": "BAC009S0905W0419", "wav": "./aishell/wav/test/S0905/BAC009S0905W0419.wav", "txt": "却多次被身边的人将他们的行踪暴露出来"} -{"key": "BAC009S0905W0420", "wav": "./aishell/wav/test/S0905/BAC009S0905W0420.wav", "txt": "两人被传媒追问恋情时都要求给予空间"} -{"key": "BAC009S0905W0421", "wav": "./aishell/wav/test/S0905/BAC009S0905W0421.wav", "txt": "看来他们需要身边的朋友保密他们的行踪"} -{"key": "BAC009S0905W0422", "wav": "./aishell/wav/test/S0905/BAC009S0905W0422.wav", "txt": "这样做反而更实际"} -{"key": "BAC009S0905W0423", "wav": "./aishell/wav/test/S0905/BAC009S0905W0423.wav", "txt": "搜狐娱乐讯北京时间六月三十日消息"} -{"key": "BAC009S0905W0424", "wav": "./aishell/wav/test/S0905/BAC009S0905W0424.wav", "txt": "据香港媒体报道"} -{"key": "BAC009S0905W0425", "wav": "./aishell/wav/test/S0905/BAC009S0905W0425.wav", "txt": "陈凯琳的心被郑嘉颖成功俘虏"} -{"key": "BAC009S0905W0426", "wav": "./aishell/wav/test/S0905/BAC009S0905W0426.wav", "txt": "更是郑嘉颖愿意公开承认的女友"} -{"key": "BAC009S0905W0427", "wav": "./aishell/wav/test/S0905/BAC009S0905W0427.wav", "txt": "不过二人因给陈嘉宝把生日合照在网上公开才泄露恋情"} -{"key": "BAC009S0905W0428", "wav": "./aishell/wav/test/S0905/BAC009S0905W0428.wav", "txt": "对此陈凯琳没有怪责陈嘉宝"} -{"key": "BAC009S0905W0429", "wav": "./aishell/wav/test/S0905/BAC009S0905W0429.wav", "txt": "觉得对方只是分享生日上的喜悦"} -{"key": "BAC009S0905W0430", "wav": 
"./aishell/wav/test/S0905/BAC009S0905W0430.wav", "txt": "陈凯琳之前说没交过男友"} -{"key": "BAC009S0905W0431", "wav": "./aishell/wav/test/S0905/BAC009S0905W0431.wav", "txt": "温州鹿城区宣传部官微做出回应"} -{"key": "BAC009S0905W0432", "wav": "./aishell/wav/test/S0905/BAC009S0905W0432.wav", "txt": "称涉事男子为某街道协管员"} -{"key": "BAC009S0905W0433", "wav": "./aishell/wav/test/S0905/BAC009S0905W0433.wav", "txt": "其发现清洁工保洁不到位"} -{"key": "BAC009S0905W0434", "wav": "./aishell/wav/test/S0905/BAC009S0905W0434.wav", "txt": "因此与清洁工引发争执"} -{"key": "BAC009S0905W0435", "wav": "./aishell/wav/test/S0905/BAC009S0905W0435.wav", "txt": "进一步导致肢体冲突"} -{"key": "BAC009S0905W0436", "wav": "./aishell/wav/test/S0905/BAC009S0905W0436.wav", "txt": "目前该协管已经停职"} -{"key": "BAC009S0905W0437", "wav": "./aishell/wav/test/S0905/BAC009S0905W0437.wav", "txt": "温州多地商户拉横幅求降租导购不少店亏本经营"} -{"key": "BAC009S0905W0438", "wav": "./aishell/wav/test/S0905/BAC009S0905W0438.wav", "txt": "东越花苑不少商铺都关门转租记者谢国林摄"} -{"key": "BAC009S0905W0439", "wav": "./aishell/wav/test/S0905/BAC009S0905W0439.wav", "txt": "温州大妈年逾半百冒充女儿成功骗婚多名小鲜肉"} -{"key": "BAC009S0905W0440", "wav": "./aishell/wav/test/S0905/BAC009S0905W0440.wav", "txt": "该女子已经行骗多地"} -{"key": "BAC009S0905W0441", "wav": "./aishell/wav/test/S0905/BAC009S0905W0441.wav", "txt": "她一直假冒的林某竟是她的女儿"} -{"key": "BAC009S0905W0442", "wav": "./aishell/wav/test/S0905/BAC009S0905W0442.wav", "txt": "而且她还是已婚身份"} -{"key": "BAC009S0905W0443", "wav": "./aishell/wav/test/S0905/BAC009S0905W0443.wav", "txt": "凭着远比真实年龄看起来要年经许多的容貌"} -{"key": "BAC009S0905W0444", "wav": "./aishell/wav/test/S0905/BAC009S0905W0444.wav", "txt": "雷某一直在河北邢台衡水等地干着游走骗婚的勾当"} -{"key": "BAC009S0905W0445", "wav": "./aishell/wav/test/S0905/BAC009S0905W0445.wav", "txt": "温州天价窝头事件背后顾客要持持赔三条中华"} -{"key": "BAC009S0905W0446", "wav": "./aishell/wav/test/S0905/BAC009S0905W0446.wav", "txt": "网络上一张永嘉桥头国际饭店的结帐单十分引人注目"} -{"key": "BAC009S0905W0447", "wav": "./aishell/wav/test/S0905/BAC009S0905W0447.wav", "txt": "菜单显示该饭店的荞麦窝窝头卖三八元一个"} -{"key": "BAC009S0905W0448", "wav": "./aishell/wav/test/S0905/BAC009S0905W0448.wav", "txt": "三零馀位食客吃了四五个窝窝头"} -{"key": "BAC009S0905W0449", "wav": "./aishell/wav/test/S0905/BAC009S0905W0449.wav", "txt": "发现事情并没有这么简单"} -{"key": "BAC009S0905W0450", "wav": "./aishell/wav/test/S0905/BAC009S0905W0450.wav", "txt": "温州女协管员侮辱环卫工行尸走肉已辞职"} -{"key": "BAC009S0905W0451", "wav": "./aishell/wav/test/S0905/BAC009S0905W0451.wav", "txt": "温州女协管员发伪辱性文字环卫节一群行尸走肉"} -{"key": "BAC009S0905W0452", "wav": "./aishell/wav/test/S0905/BAC009S0905W0452.wav", "txt": "温州女婴打疫苗后口吐白沫抽搐昏迷"} -{"key": "BAC009S0905W0453", "wav": "./aishell/wav/test/S0905/BAC009S0905W0453.wav", "txt": "温州网讯在温医大附属育英儿童医院的重监护室里"} -{"key": "BAC009S0905W0454", "wav": "./aishell/wav/test/S0905/BAC009S0905W0454.wav", "txt": "才七个月大的女童腾腾化名已昏迷了两天时间"} -{"key": "BAC009S0905W0455", "wav": "./aishell/wav/test/S0905/BAC009S0905W0455.wav", "txt": "随即被送到儿童医院进行抢救"} -{"key": "BAC009S0905W0456", "wav": "./aishell/wav/test/S0905/BAC009S0905W0456.wav", "txt": "区市省三级疾控部门专家已介入调查"} -{"key": "BAC009S0905W0457", "wav": "./aishell/wav/test/S0905/BAC009S0905W0457.wav", "txt": "温州家庭误食毒蘑菇后续小女儿已确诊脑死亡"} -{"key": "BAC009S0905W0458", "wav": "./aishell/wav/test/S0905/BAC009S0905W0458.wav", "txt": "温州少年峡谷失踪续二零万馀元赔偿款执行到位"} -{"key": "BAC009S0905W0459", "wav": "./aishell/wav/test/S0905/BAC009S0905W0459.wav", "txt": "金报讯记者蓝莹还记得小温吗"} -{"key": "BAC009S0905W0460", "wav": "./aishell/wav/test/S0905/BAC009S0905W0460.wav", "txt": "二零一三六二三"} -{"key": "BAC009S0905W0461", "wav": "./aishell/wav/test/S0905/BAC009S0905W0461.wav", "txt": "温州一四岁少年小温迷失莒溪大峡谷"} -{"key": 
"BAC009S0905W0462", "wav": "./aishell/wav/test/S0905/BAC009S0905W0462.wav", "txt": "浙江省史上规模最大的户外救援行动开始了"} -{"key": "BAC009S0905W0463", "wav": "./aishell/wav/test/S0905/BAC009S0905W0463.wav", "txt": "经过长达四个月的搜救"} -{"key": "BAC009S0905W0464", "wav": "./aishell/wav/test/S0905/BAC009S0905W0464.wav", "txt": "最终在峡谷上游的石头夹缝下"} -{"key": "BAC009S0905W0465", "wav": "./aishell/wav/test/S0905/BAC009S0905W0465.wav", "txt": "发现小温残缺的遗骸"} -{"key": "BAC009S0905W0466", "wav": "./aishell/wav/test/S0905/BAC009S0905W0466.wav", "txt": "温州市场现胶注虾业内不仅增重卖相更好"} -{"key": "BAC009S0905W0467", "wav": "./aishell/wav/test/S0905/BAC009S0905W0467.wav", "txt": "温州市民郑女士在农贸市场购买了三只大虾"} -{"key": "BAC009S0905W0468", "wav": "./aishell/wav/test/S0905/BAC009S0905W0468.wav", "txt": "回家后发现大虾体内居然被注射了不明胶状物"} -{"key": "BAC009S0905W0469", "wav": "./aishell/wav/test/S0905/BAC009S0905W0469.wav", "txt": "生活经验让郑女士起了疑心"} -{"key": "BAC009S0905W0470", "wav": "./aishell/wav/test/S0905/BAC009S0905W0470.wav", "txt": "她将几只虾的图片通过微博发布"} -{"key": "BAC009S0905W0471", "wav": "./aishell/wav/test/S0905/BAC009S0905W0471.wav", "txt": "迅速引起了网友以及当地监管部门的关注"} -{"key": "BAC009S0905W0472", "wav": "./aishell/wav/test/S0905/BAC009S0905W0472.wav", "txt": "温州市域铁路将成为全国第一条城市交通铁路"} -{"key": "BAC009S0905W0475", "wav": "./aishell/wav/test/S0905/BAC009S0905W0475.wav", "txt": "温州市治堵办的负责人表示"} -{"key": "BAC009S0905W0477", "wav": "./aishell/wav/test/S0905/BAC009S0905W0477.wav", "txt": "温州开水浇头服务员被批捕涉嫌故意伤害罪"} -{"key": "BAC009S0905W0478", "wav": "./aishell/wav/test/S0905/BAC009S0905W0478.wav", "txt": "京华时报讯昨天下午"} -{"key": "BAC009S0905W0479", "wav": "./aishell/wav/test/S0905/BAC009S0905W0479.wav", "txt": "浙江温州鹿城区检察院通报九月六日"} -{"key": "BAC009S0905W0480", "wav": "./aishell/wav/test/S0905/BAC009S0905W0480.wav", "txt": "开水淋顾客的火锅店服务员朱某被依法批准逮捕"} -{"key": "BAC009S0905W0481", "wav": "./aishell/wav/test/S0905/BAC009S0905W0481.wav", "txt": "温州惊现注胶虾续苍南再查六公斤注胶大虾"} -{"key": "BAC009S0905W0482", "wav": "./aishell/wav/test/S0905/BAC009S0905W0482.wav", "txt": "温州一菜场惊现注胶虾追踪"} -{"key": "BAC009S0905W0483", "wav": "./aishell/wav/test/S0905/BAC009S0905W0483.wav", "txt": "温州昆明出现注胶虾产地均指向广东湛江"} -{"key": "BAC009S0905W0484", "wav": "./aishell/wav/test/S0905/BAC009S0905W0484.wav", "txt": "浙江温州市一位市民一零零元买回三只斑节虾"} -{"key": "BAC009S0905W0485", "wav": "./aishell/wav/test/S0905/BAC009S0905W0485.wav", "txt": "在虾体内发现疑似胶状物质七月二十一日"} -{"key": "BAC009S0905W0486", "wav": "./aishell/wav/test/S0905/BAC009S0905W0486.wav", "txt": "云南昆明市同样发现类似注胶虾"} -{"key": "BAC009S0905W0487", "wav": "./aishell/wav/test/S0905/BAC009S0905W0487.wav", "txt": "国内两地出现注胶虾踪迹"} -{"key": "BAC009S0905W0488", "wav": "./aishell/wav/test/S0905/BAC009S0905W0488.wav", "txt": "且产地均指向广东省湛江市"} -{"key": "BAC009S0905W0489", "wav": "./aishell/wav/test/S0905/BAC009S0905W0489.wav", "txt": "温州景山花木市场发生大火火势已得到基本控制"} -{"key": "BAC009S0905W0490", "wav": "./aishell/wav/test/S0905/BAC009S0905W0490.wav", "txt": "温州服务员向顾客头上泼开水继而已被批捕"} -{"key": "BAC009S0905W0491", "wav": "./aishell/wav/test/S0905/BAC009S0905W0491.wav", "txt": "今天九月八日下午"} -{"key": "BAC009S0905W0492", "wav": "./aishell/wav/test/S0905/BAC009S0905W0492.wav", "txt": "因火锅加水问题与顾客发生争执"} -{"key": "BAC009S0905W0493", "wav": "./aishell/wav/test/S0905/BAC009S0905W0493.wav", "txt": "为泄愤将开水淋到顾客头上"} -{"key": "BAC009S0905W0494", "wav": "./aishell/wav/test/S0905/BAC009S0905W0494.wav", "txt": "并将其摁倒在地殴打"} -{"key": "BAC009S0905W0495", "wav": "./aishell/wav/test/S0905/BAC009S0905W0495.wav", "txt": "火锅店服务员朱某被温州市鹿城区检察院依法批准逮捕"} -{"key": "BAC009S0906W0121", "wav": "./aishell/wav/test/S0906/BAC009S0906W0121.wav", "txt": 
"双降会令市场看贬人民币的情绪持续"} -{"key": "BAC009S0906W0122", "wav": "./aishell/wav/test/S0906/BAC009S0906W0122.wav", "txt": "人民币未来贬值压力依然较大"} -{"key": "BAC009S0906W0123", "wav": "./aishell/wav/test/S0906/BAC009S0906W0123.wav", "txt": "预计短期央行仍可能会积极维稳"} -{"key": "BAC009S0906W0124", "wav": "./aishell/wav/test/S0906/BAC009S0906W0124.wav", "txt": "汇率较大概率维持双向波动"} -{"key": "BAC009S0906W0125", "wav": "./aishell/wav/test/S0906/BAC009S0906W0125.wav", "txt": "公积金松绑接棒释压房价下跌动力趋缓至搜狐财经"} -{"key": "BAC009S0906W0126", "wav": "./aishell/wav/test/S0906/BAC009S0906W0126.wav", "txt": "上海南昌等城市近期继续松绑了公积金贷款政策"} -{"key": "BAC009S0906W0127", "wav": "./aishell/wav/test/S0906/BAC009S0906W0127.wav", "txt": "而南昌除了放松首套房界定标准"} -{"key": "BAC009S0906W0128", "wav": "./aishell/wav/test/S0906/BAC009S0906W0128.wav", "txt": "还降低了首套房公积金首付比例"} -{"key": "BAC009S0906W0129", "wav": "./aishell/wav/test/S0906/BAC009S0906W0129.wav", "txt": "公积金贷款首付款比例不低于百分之七"} -{"key": "BAC009S0906W0130", "wav": "./aishell/wav/test/S0906/BAC009S0906W0130.wav", "txt": "上海易居研究院研究院严跃进认为"} -{"key": "BAC009S0906W0131", "wav": "./aishell/wav/test/S0906/BAC009S0906W0131.wav", "txt": "存销比已经见顶回落"} -{"key": "BAC009S0906W0132", "wav": "./aishell/wav/test/S0906/BAC009S0906W0132.wav", "txt": "房价下跌压力将趋于缓解"} -{"key": "BAC009S0906W0133", "wav": "./aishell/wav/test/S0906/BAC009S0906W0133.wav", "txt": "公积金大力度松绑相关商业银行信贷政策"} -{"key": "BAC009S0906W0134", "wav": "./aishell/wav/test/S0906/BAC009S0906W0134.wav", "txt": "各地对公积金贷款的松绑力度更大"} -{"key": "BAC009S0906W0135", "wav": "./aishell/wav/test/S0906/BAC009S0906W0135.wav", "txt": "江苏省对省级机关住房公积金政策做出了调整"} -{"key": "BAC009S0906W0136", "wav": "./aishell/wav/test/S0906/BAC009S0906W0136.wav", "txt": "昆明市住房公积金管理中心出台三项公积金新政"} -{"key": "BAC009S0906W0137", "wav": "./aishell/wav/test/S0906/BAC009S0906W0137.wav", "txt": "上海市公积金管理中心公布公积金新政"} -{"key": "BAC009S0906W0138", "wav": "./aishell/wav/test/S0906/BAC009S0906W0138.wav", "txt": "有一套住房并已结清公积金贷款"} -{"key": "BAC009S0906W0139", "wav": "./aishell/wav/test/S0906/BAC009S0906W0139.wav", "txt": "再次申请公积金贷款购房的"} -{"key": "BAC009S0906W0140", "wav": "./aishell/wav/test/S0906/BAC009S0906W0140.wav", "txt": "参照首套房贷款政策"} -{"key": "BAC009S0906W0141", "wav": "./aishell/wav/test/S0906/BAC009S0906W0141.wav", "txt": "中原地产市场研究部统计数据显示截至目前"} -{"key": "BAC009S0906W0142", "wav": "./aishell/wav/test/S0906/BAC009S0906W0142.wav", "txt": "二套执行认贷不认房"} -{"key": "BAC009S0906W0143", "wav": "./aishell/wav/test/S0906/BAC009S0906W0143.wav", "txt": "二套首付降比百分之七"} -{"key": "BAC009S0906W0144", "wav": "./aishell/wav/test/S0906/BAC009S0906W0144.wav", "txt": "南京武汉市放宽第二套房公积金贷款门槛"} -{"key": "BAC009S0906W0145", "wav": "./aishell/wav/test/S0906/BAC009S0906W0145.wav", "txt": "扬州杭州成都无锡等地"} -{"key": "BAC009S0906W0146", "wav": "./aishell/wav/test/S0906/BAC009S0906W0146.wav", "txt": "已有一套住房并结清贷款馀额的家庭"} -{"key": "BAC009S0906W0147", "wav": "./aishell/wav/test/S0906/BAC009S0906W0147.wav", "txt": "再购房执行首套房贷款政策"} -{"key": "BAC009S0906W0148", "wav": "./aishell/wav/test/S0906/BAC009S0906W0148.wav", "txt": "中原地产分析师张大伟认为"} -{"key": "BAC009S0906W0149", "wav": "./aishell/wav/test/S0906/BAC009S0906W0149.wav", "txt": "公积金是地方政府可以直接通过政策调整动用的资金"} -{"key": "BAC009S0906W0150", "wav": "./aishell/wav/test/S0906/BAC009S0906W0150.wav", "txt": "用公积金政策刺激市场是地方政府最习惯的举措"} -{"key": "BAC009S0906W0151", "wav": "./aishell/wav/test/S0906/BAC009S0906W0151.wav", "txt": "对购房者心理影响也非常大"} -{"key": "BAC009S0906W0152", "wav": "./aishell/wav/test/S0906/BAC009S0906W0152.wav", "txt": "由于公积金贷款利率相当于市场贷款利率的七折"} -{"key": "BAC009S0906W0153", "wav": "./aishell/wav/test/S0906/BAC009S0906W0153.wav", 
"txt": "对需求拉动作用比较大"} -{"key": "BAC009S0906W0154", "wav": "./aishell/wav/test/S0906/BAC009S0906W0154.wav", "txt": "上海作为一线城市代表"} -{"key": "BAC009S0906W0155", "wav": "./aishell/wav/test/S0906/BAC009S0906W0155.wav", "txt": "对房地产市场的心理影响比较大"} -{"key": "BAC009S0906W0156", "wav": "./aishell/wav/test/S0906/BAC009S0906W0156.wav", "txt": "预计还有其他城市将发布同类型松绑政策"} -{"key": "BAC009S0906W0157", "wav": "./aishell/wav/test/S0906/BAC009S0906W0157.wav", "txt": "房价下跌压力缓解各地救市政策不断"} -{"key": "BAC009S0906W0158", "wav": "./aishell/wav/test/S0906/BAC009S0906W0158.wav", "txt": "房企促销力度也在加大"} -{"key": "BAC009S0906W0159", "wav": "./aishell/wav/test/S0906/BAC009S0906W0159.wav", "txt": "各城市库存压力正在减小"} -{"key": "BAC009S0906W0160", "wav": "./aishell/wav/test/S0906/BAC009S0906W0160.wav", "txt": "房价下跌压力趋于缓解"} -{"key": "BAC009S0906W0161", "wav": "./aishell/wav/test/S0906/BAC009S0906W0161.wav", "txt": "上海易居房地产研究院数据显示"} -{"key": "BAC009S0906W0162", "wav": "./aishell/wav/test/S0906/BAC009S0906W0162.wav", "txt": "同比增长百分之七"} -{"key": "BAC009S0906W0163", "wav": "./aishell/wav/test/S0906/BAC009S0906W0163.wav", "txt": "这是今年五月份以来库存环比增幅最小的一次"} -{"key": "BAC009S0906W0164", "wav": "./aishell/wav/test/S0906/BAC009S0906W0164.wav", "txt": "环比增长百分之七"} -{"key": "BAC009S0906W0165", "wav": "./aishell/wav/test/S0906/BAC009S0906W0165.wav", "txt": "同比减小百分之七"} -{"key": "BAC009S0906W0166", "wav": "./aishell/wav/test/S0906/BAC009S0906W0166.wav", "txt": "五月份的供求关系是今年前五个月最均衡的一次"} -{"key": "BAC009S0906W0167", "wav": "./aishell/wav/test/S0906/BAC009S0906W0167.wav", "txt": "存销比见顶的态势基本确立"} -{"key": "BAC009S0906W0168", "wav": "./aishell/wav/test/S0906/BAC009S0906W0168.wav", "txt": "五个城市新建商品住宅存销比为七个月"} -{"key": "BAC009S0906W0169", "wav": "./aishell/wav/test/S0906/BAC009S0906W0169.wav", "txt": "该存销比数值为七个月"} -{"key": "BAC009S0906W0170", "wav": "./aishell/wav/test/S0906/BAC009S0906W0170.wav", "txt": "这直接利好去库存目标的实现"} -{"key": "BAC009S0906W0171", "wav": "./aishell/wav/test/S0906/BAC009S0906W0171.wav", "txt": "二到五个城市的总体水平看"} -{"key": "BAC009S0906W0172", "wav": "./aishell/wav/test/S0906/BAC009S0906W0172.wav", "txt": "库存去化周期依然偏大"} -{"key": "BAC009S0906W0173", "wav": "./aishell/wav/test/S0906/BAC009S0906W0173.wav", "txt": "说明各城市涨价的时机还不成熟"} -{"key": "BAC009S0906W0174", "wav": "./aishell/wav/test/S0906/BAC009S0906W0174.wav", "txt": "一至七月份大多数城市还是会采取积极降价的策略"} -{"key": "BAC009S0906W0175", "wav": "./aishell/wav/test/S0906/BAC009S0906W0175.wav", "txt": "房价未来可能会略微有下跌"} -{"key": "BAC009S0906W0176", "wav": "./aishell/wav/test/S0906/BAC009S0906W0176.wav", "txt": "一线城市由于需求面大"} -{"key": "BAC009S0906W0177", "wav": "./aishell/wav/test/S0906/BAC009S0906W0177.wav", "txt": "未来住宅价格会企稳回升"} -{"key": "BAC009S0906W0178", "wav": "./aishell/wav/test/S0906/BAC009S0906W0178.wav", "txt": "一些库存量较大的三四线城市"} -{"key": "BAC009S0906W0179", "wav": "./aishell/wav/test/S0906/BAC009S0906W0179.wav", "txt": "房价继续下行的可能性仍然比较大"} -{"key": "BAC009S0906W0180", "wav": "./aishell/wav/test/S0906/BAC009S0906W0180.wav", "txt": "同策咨询研究部总监张宏伟认为"} -{"key": "BAC009S0906W0181", "wav": "./aishell/wav/test/S0906/BAC009S0906W0181.wav", "txt": "月度市场成交量开始出现环比回升"} -{"key": "BAC009S0906W0182", "wav": "./aishell/wav/test/S0906/BAC009S0906W0182.wav", "txt": "市场去库存的速度在适度提高"} -{"key": "BAC009S0906W0183", "wav": "./aishell/wav/test/S0906/BAC009S0906W0183.wav", "txt": "从一线城市及存销比在七个月以下的城市来看"} -{"key": "BAC009S0906W0184", "wav": "./aishell/wav/test/S0906/BAC009S0906W0184.wav", "txt": "市场基本面有可能会率先好转"} -{"key": "BAC009S0906W0185", "wav": "./aishell/wav/test/S0906/BAC009S0906W0185.wav", "txt": "年底将出现翘尾行情"} -{"key": "BAC009S0906W0186", "wav": 
"./aishell/wav/test/S0906/BAC009S0906W0186.wav", "txt": "但年底出现翘尾行情并不代表楼市已经回暖"} -{"key": "BAC009S0906W0187", "wav": "./aishell/wav/test/S0906/BAC009S0906W0187.wav", "txt": "示范带动周边地区发展"} -{"key": "BAC009S0906W0188", "wav": "./aishell/wav/test/S0906/BAC009S0906W0188.wav", "txt": "并在农业走出去方面发挥重要作用"} -{"key": "BAC009S0906W0189", "wav": "./aishell/wav/test/S0906/BAC009S0906W0189.wav", "txt": "稳步发展区域"} -{"key": "BAC009S0906W0190", "wav": "./aishell/wav/test/S0906/BAC009S0906W0190.wav", "txt": "主要指草原生态经济区"} -{"key": "BAC009S0906W0191", "wav": "./aishell/wav/test/S0906/BAC009S0906W0191.wav", "txt": "包括北方干旱半干旱草原地区和青藏高原草原地区"} -{"key": "BAC009S0906W0192", "wav": "./aishell/wav/test/S0906/BAC009S0906W0192.wav", "txt": "加快该地区域现代农业建设"} -{"key": "BAC009S0906W0193", "wav": "./aishell/wav/test/S0906/BAC009S0906W0193.wav", "txt": "对于保障全国生态安全具有不可代替的战略作用"} -{"key": "BAC009S0906W0194", "wav": "./aishell/wav/test/S0906/BAC009S0906W0194.wav", "txt": "牢固树立生产生态有机结合生态优先的基本方针"} -{"key": "BAC009S0906W0195", "wav": "./aishell/wav/test/S0906/BAC009S0906W0195.wav", "txt": "加强草原生态环境保护和建设"} -{"key": "BAC009S0906W0196", "wav": "./aishell/wav/test/S0906/BAC009S0906W0196.wav", "txt": "稳步推进退牧还草和游牧民定居工程"} -{"key": "BAC009S0906W0197", "wav": "./aishell/wav/test/S0906/BAC009S0906W0197.wav", "txt": "加强以节水灌溉饲草地为重点的牧区水利建设"} -{"key": "BAC009S0906W0198", "wav": "./aishell/wav/test/S0906/BAC009S0906W0198.wav", "txt": "建立草原增加碳汇和生态补偿机制"} -{"key": "BAC009S0906W0199", "wav": "./aishell/wav/test/S0906/BAC009S0906W0199.wav", "txt": "转变畜牧业发展方式"} -{"key": "BAC009S0906W0200", "wav": "./aishell/wav/test/S0906/BAC009S0906W0200.wav", "txt": "优化生产布局和畜群结构"} -{"key": "BAC009S0906W0201", "wav": "./aishell/wav/test/S0906/BAC009S0906W0201.wav", "txt": "提高科学饲养和经营水平"} -{"key": "BAC009S0906W0202", "wav": "./aishell/wav/test/S0906/BAC009S0906W0202.wav", "txt": "加强农牧互补牧养结合"} -{"key": "BAC009S0906W0203", "wav": "./aishell/wav/test/S0906/BAC009S0906W0203.wav", "txt": "以最急需最关键最薄弱的环节和领域为重点"} -{"key": "BAC009S0906W0204", "wav": "./aishell/wav/test/S0906/BAC009S0906W0204.wav", "txt": "组织实施一批重大工程"} -{"key": "BAC009S0906W0205", "wav": "./aishell/wav/test/S0906/BAC009S0906W0205.wav", "txt": "全面分实现代农业发展的物质基础"} -{"key": "BAC009S0906W0206", "wav": "./aishell/wav/test/S0906/BAC009S0906W0206.wav", "txt": "一旱涝保收高标准农田建设工程"} -{"key": "BAC009S0906W0207", "wav": "./aishell/wav/test/S0906/BAC009S0906W0207.wav", "txt": "落实土壤改良地力培肥等措施"} -{"key": "BAC009S0906W0208", "wav": "./aishell/wav/test/S0906/BAC009S0906W0208.wav", "txt": "加快先进适用耕作技术推广应用"} -{"key": "BAC009S0906W0209", "wav": "./aishell/wav/test/S0906/BAC009S0906W0209.wav", "txt": "新建旱涝保收高标准农田四亿亩"} -{"key": "BAC009S0906W0210", "wav": "./aishell/wav/test/S0906/BAC009S0906W0210.wav", "txt": "新增千亿斤粮食生产能力建设工程"} -{"key": "BAC009S0906W0211", "wav": "./aishell/wav/test/S0906/BAC009S0906W0211.wav", "txt": "棉油糖生产基地建设工程"} -{"key": "BAC009S0906W0212", "wav": "./aishell/wav/test/S0906/BAC009S0906W0212.wav", "txt": "加强新疆黄淮海地区长江流域棉花生产基地建设"} -{"key": "BAC009S0906W0213", "wav": "./aishell/wav/test/S0906/BAC009S0906W0213.wav", "txt": "支持南方甘蔗和北方甜菜生产基地建设"} -{"key": "BAC009S0906W0214", "wav": "./aishell/wav/test/S0906/BAC009S0906W0214.wav", "txt": "着力改善田间基础设施良种科研繁育设施等生产条件"} -{"key": "BAC009S0906W0215", "wav": "./aishell/wav/test/S0906/BAC009S0906W0215.wav", "txt": "新一轮菜篮子建设工程"} -{"key": "BAC009S0906W0216", "wav": "./aishell/wav/test/S0906/BAC009S0906W0216.wav", "txt": "加强园艺作物标准园建设"} -{"key": "BAC009S0906W0217", "wav": "./aishell/wav/test/S0906/BAC009S0906W0217.wav", "txt": "引导建设优质农产品物流配送中心"} -{"key": "BAC009S0906W0218", "wav": 
"./aishell/wav/test/S0906/BAC009S0906W0218.wav", "txt": "发展农产品电子商务"} -{"key": "BAC009S0906W0219", "wav": "./aishell/wav/test/S0906/BAC009S0906W0219.wav", "txt": "健全农作物种质资源和畜禽遗传资源保存体系"} -{"key": "BAC009S0906W0220", "wav": "./aishell/wav/test/S0906/BAC009S0906W0220.wav", "txt": "建设动植物基因信息库"} -{"key": "BAC009S0906W0221", "wav": "./aishell/wav/test/S0906/BAC009S0906W0221.wav", "txt": "建立转基因生物安全保障体系"} -{"key": "BAC009S0906W0222", "wav": "./aishell/wav/test/S0906/BAC009S0906W0222.wav", "txt": "建设国家级农作物育制种基地"} -{"key": "BAC009S0906W0223", "wav": "./aishell/wav/test/S0906/BAC009S0906W0223.wav", "txt": "完善农作物品种试验和种子检测设施条件"} -{"key": "BAC009S0906W0224", "wav": "./aishell/wav/test/S0906/BAC009S0906W0224.wav", "txt": "建设水产遗传育种中心和原良种场"} -{"key": "BAC009S0906W0225", "wav": "./aishell/wav/test/S0906/BAC009S0906W0225.wav", "txt": "渔政渔港建设工程"} -{"key": "BAC009S0906W0226", "wav": "./aishell/wav/test/S0906/BAC009S0906W0226.wav", "txt": "建设一批大型渔政船"} -{"key": "BAC009S0906W0227", "wav": "./aishell/wav/test/S0906/BAC009S0906W0227.wav", "txt": "加强渔政基地和管理信息系统建设"} -{"key": "BAC009S0906W0228", "wav": "./aishell/wav/test/S0906/BAC009S0906W0228.wav", "txt": "动植物保护工程"} -{"key": "BAC009S0906W0229", "wav": "./aishell/wav/test/S0906/BAC009S0906W0229.wav", "txt": "健全六级动物疫病防控体系"} -{"key": "BAC009S0906W0230", "wav": "./aishell/wav/test/S0906/BAC009S0906W0230.wav", "txt": "健全兽药质量安全监管和动物防疫技术支撑体系"} -{"key": "BAC009S0906W0231", "wav": "./aishell/wav/test/S0906/BAC009S0906W0231.wav", "txt": "建设四级农作物病虫疫情监测防控体系"} -{"key": "BAC009S0906W0232", "wav": "./aishell/wav/test/S0906/BAC009S0906W0232.wav", "txt": "完善监测防控监管等设施设备"} -{"key": "BAC009S0906W0233", "wav": "./aishell/wav/test/S0906/BAC009S0906W0233.wav", "txt": "农产品质量安全检验检测能力建设工程"} -{"key": "BAC009S0906W0234", "wav": "./aishell/wav/test/S0906/BAC009S0906W0234.wav", "txt": "改扩建检验检测实验室"} -{"key": "BAC009S0906W0235", "wav": "./aishell/wav/test/S0906/BAC009S0906W0235.wav", "txt": "建设部级水产品质量安全研究中心"} -{"key": "BAC009S0906W0236", "wav": "./aishell/wav/test/S0906/BAC009S0906W0236.wav", "txt": "补充建设一批部级专业质检中心"} -{"key": "BAC009S0906W0237", "wav": "./aishell/wav/test/S0906/BAC009S0906W0237.wav", "txt": "构建全国农产品质量安全监测信息预警平台"} -{"key": "BAC009S0906W0238", "wav": "./aishell/wav/test/S0906/BAC009S0906W0238.wav", "txt": "乡镇农业公共服务能力建设工程"} -{"key": "BAC009S0906W0239", "wav": "./aishell/wav/test/S0906/BAC009S0906W0239.wav", "txt": "农业机械化推进工程"} -{"key": "BAC009S0906W0240", "wav": "./aishell/wav/test/S0906/BAC009S0906W0240.wav", "txt": "加大对秸秆机械化还田和收集打捆机具配套的支持力度"} -{"key": "BAC009S0906W0241", "wav": "./aishell/wav/test/S0906/BAC009S0906W0241.wav", "txt": "完善农业气象等方面的航空站和作业起降点基础设施"} -{"key": "BAC009S0906W0242", "wav": "./aishell/wav/test/S0906/BAC009S0906W0242.wav", "txt": "扶持农机服务组织发展"} -{"key": "BAC009S0906W0243", "wav": "./aishell/wav/test/S0906/BAC009S0906W0243.wav", "txt": "农业信息化建设工程"} -{"key": "BAC009S0906W0244", "wav": "./aishell/wav/test/S0906/BAC009S0906W0244.wav", "txt": "开展农业物物联网应用示范"} -{"key": "BAC009S0906W0245", "wav": "./aishell/wav/test/S0906/BAC009S0906W0245.wav", "txt": "加大天然草原退牧还草工程实施力度"} -{"key": "BAC009S0906W0246", "wav": "./aishell/wav/test/S0906/BAC009S0906W0246.wav", "txt": "加强京津风沙源区草地治理"} -{"key": "BAC009S0906W0247", "wav": "./aishell/wav/test/S0906/BAC009S0906W0247.wav", "txt": "继续加强三江源等地区草原生态建设"} -{"key": "BAC009S0906W0248", "wav": "./aishell/wav/test/S0906/BAC009S0906W0248.wav", "txt": "开展草原自然保护区建设和南方草地综合治理"} -{"key": "BAC009S0906W0249", "wav": "./aishell/wav/test/S0906/BAC009S0906W0249.wav", "txt": "加快实施游牧民定居工程"} -{"key": "BAC009S0906W0250", "wav": 
"./aishell/wav/test/S0906/BAC009S0906W0250.wav", "txt": "人工种草五亿亩"} -{"key": "BAC009S0906W0251", "wav": "./aishell/wav/test/S0906/BAC009S0906W0251.wav", "txt": "新型农村人才培养工程"} -{"key": "BAC009S0906W0252", "wav": "./aishell/wav/test/S0906/BAC009S0906W0252.wav", "txt": "必须从我国国情和农业发展实际出发"} -{"key": "BAC009S0906W0253", "wav": "./aishell/wav/test/S0906/BAC009S0906W0253.wav", "txt": "亦不可能跟自然规律抗衡"} -{"key": "BAC009S0906W0254", "wav": "./aishell/wav/test/S0906/BAC009S0906W0254.wav", "txt": "无止境地重复过去十年的惊人增长"} -{"key": "BAC009S0906W0256", "wav": "./aishell/wav/test/S0906/BAC009S0906W0256.wav", "txt": "企业于某个领域称王称霸的一刻"} -{"key": "BAC009S0906W0257", "wav": "./aishell/wav/test/S0906/BAC009S0906W0257.wav", "txt": "往往就是公司陷入灾难的开始"} -{"key": "BAC009S0906W0258", "wav": "./aishell/wav/test/S0906/BAC009S0906W0258.wav", "txt": "市场给予它的估值却异常克制"} -{"key": "BAC009S0906W0259", "wav": "./aishell/wav/test/S0906/BAC009S0906W0259.wav", "txt": "以二零一五年度每股八点五美元的盈利预测为准"} -{"key": "BAC009S0906W0260", "wav": "./aishell/wav/test/S0906/BAC009S0906W0260.wav", "txt": "苹果市盈率仅一五倍"} -{"key": "BAC009S0906W0261", "wav": "./aishell/wav/test/S0906/BAC009S0906W0261.wav", "txt": "莫说跟其他创意十足的科技股相提并论"} -{"key": "BAC009S0906W0262", "wav": "./aishell/wav/test/S0906/BAC009S0906W0262.wav", "txt": "比之大市亦有所不如"} -{"key": "BAC009S0906W0263", "wav": "./aishell/wav/test/S0906/BAC009S0906W0263.wav", "txt": "苹果早晚将步之前过气股王的后尘"} -{"key": "BAC009S0906W0265", "wav": "./aishell/wav/test/S0906/BAC009S0906W0265.wav", "txt": "不同意的地方多于同意"} -{"key": "BAC009S0906W0266", "wav": "./aishell/wav/test/S0906/BAC009S0906W0266.wav", "txt": "从随身听到智能电话"} -{"key": "BAC009S0906W0267", "wav": "./aishell/wav/test/S0906/BAC009S0906W0267.wav", "txt": "苹果的拿手好戏是把市场上原霸主拉下马"} -{"key": "BAC009S0906W0268", "wav": "./aishell/wav/test/S0906/BAC009S0906W0268.wav", "txt": "确认消费者喜新厌旧后"} -{"key": "BAC009S0906W0269", "wav": "./aishell/wav/test/S0906/BAC009S0906W0269.wav", "txt": "快速建立以苹果产品服务为核心的生态系统"} -{"key": "BAC009S0906W0270", "wav": "./aishell/wav/test/S0906/BAC009S0906W0270.wav", "txt": "透过不断的更新换代"} -{"key": "BAC009S0906W0271", "wav": "./aishell/wav/test/S0906/BAC009S0906W0271.wav", "txt": "索尼黑莓以至诺基亚"} -{"key": "BAC009S0906W0272", "wav": "./aishell/wav/test/S0906/BAC009S0906W0272.wav", "txt": "在最风光的时候看不见来自颠复者的威胁"} -{"key": "BAC009S0906W0273", "wav": "./aishell/wav/test/S0906/BAC009S0906W0273.wav", "txt": "从不可一世到遭对手边缘化"} -{"key": "BAC009S0906W0274", "wav": "./aishell/wav/test/S0906/BAC009S0906W0274.wav", "txt": "消费者贪新忘旧虽亦可能适用于苹果"} -{"key": "BAC009S0906W0277", "wav": "./aishell/wav/test/S0906/BAC009S0906W0277.wav", "txt": "对投资者大有参考价值"} -{"key": "BAC009S0906W0278", "wav": "./aishell/wav/test/S0906/BAC009S0906W0278.wav", "txt": "一九八三至二零零五年"} -{"key": "BAC009S0906W0279", "wav": "./aishell/wav/test/S0906/BAC009S0906W0279.wav", "txt": "标普五百指数市值冠军宝座"} -{"key": "BAC009S0906W0281", "wav": "./aishell/wav/test/S0906/BAC009S0906W0281.wav", "txt": "四大天王平均累计回报高达一千二百分之八十二"} -{"key": "BAC009S0906W0282", "wav": "./aishell/wav/test/S0906/BAC009S0906W0282.wav", "txt": "四倍于标指同期的三十百分之二"} -{"key": "BAC009S0906W0283", "wav": "./aishell/wav/test/S0906/BAC009S0906W0283.wav", "txt": "四大天王平均回报仅一百分之二十五"} -{"key": "BAC009S0906W0284", "wav": "./aishell/wav/test/S0906/BAC009S0906W0284.wav", "txt": "明显跑输标普五百指数的一百分之九十九"} -{"key": "BAC009S0906W0286", "wav": "./aishell/wav/test/S0906/BAC009S0906W0286.wav", "txt": "销售也总有饱和的一天"} -{"key": "BAC009S0906W0287", "wav": "./aishell/wav/test/S0906/BAC009S0906W0287.wav", "txt": "苹果能否第三期发育"} -{"key": "BAC009S0906W0290", "wav": "./aishell/wav/test/S0906/BAC009S0906W0290.wav", "txt": 
"从市场始终不愿给予苹果较高估值可见"} -{"key": "BAC009S0906W0291", "wav": "./aishell/wav/test/S0906/BAC009S0906W0291.wav", "txt": "管理层眼光得再高一点"} -{"key": "BAC009S0906W0292", "wav": "./aishell/wav/test/S0906/BAC009S0906W0292.wav", "txt": "苹果有意进军汽车产业"} -{"key": "BAC009S0906W0294", "wav": "./aishell/wav/test/S0906/BAC009S0906W0294.wav", "txt": "老毕对此说甚有保留"} -{"key": "BAC009S0906W0295", "wav": "./aishell/wav/test/S0906/BAC009S0906W0295.wav", "txt": "而库克若真有此意"} -{"key": "BAC009S0906W0297", "wav": "./aishell/wav/test/S0906/BAC009S0906W0297.wav", "txt": "汽车是苹果下一个颠复目标"} -{"key": "BAC009S0906W0298", "wav": "./aishell/wav/test/S0906/BAC009S0906W0298.wav", "txt": "马斯克乃商界新一代万人迷"} -{"key": "BAC009S0906W0299", "wav": "./aishell/wav/test/S0906/BAC009S0906W0299.wav", "txt": "人气不逊乔布斯在世之时"} -{"key": "BAC009S0906W0300", "wav": "./aishell/wav/test/S0906/BAC009S0906W0300.wav", "txt": "三藩市纪事报指此君曾与库克碰头"} -{"key": "BAC009S0906W0302", "wav": "./aishell/wav/test/S0906/BAC009S0906W0302.wav", "txt": "越多人讲往往越难成事"} -{"key": "BAC009S0906W0303", "wav": "./aishell/wav/test/S0906/BAC009S0906W0303.wav", "txt": "有黑客在网络上兜售车主信息"} -{"key": "BAC009S0906W0304", "wav": "./aishell/wav/test/S0906/BAC009S0906W0304.wav", "txt": "雪铁龙车主信息泄露规模或超十万条"} -{"key": "BAC009S0906W0305", "wav": "./aishell/wav/test/S0906/BAC009S0906W0305.wav", "txt": "该平台上显示的漏洞状态是"} -{"key": "BAC009S0906W0306", "wav": "./aishell/wav/test/S0906/BAC009S0906W0306.wav", "txt": "漏洞已通知厂商但厂商忽略该漏洞"} -{"key": "BAC009S0906W0307", "wav": "./aishell/wav/test/S0906/BAC009S0906W0307.wav", "txt": "该公司内部相关人士回应称"} -{"key": "BAC009S0906W0308", "wav": "./aishell/wav/test/S0906/BAC009S0906W0308.wav", "txt": "东风雪铁龙的客户数据存放在专业数据库中"} -{"key": "BAC009S0906W0309", "wav": "./aishell/wav/test/S0906/BAC009S0906W0309.wav", "txt": "对数据库设有监控及记录"} -{"key": "BAC009S0906W0310", "wav": "./aishell/wav/test/S0906/BAC009S0906W0310.wav", "txt": "对用户信息做足了保密工作"} -{"key": "BAC009S0906W0311", "wav": "./aishell/wav/test/S0906/BAC009S0906W0311.wav", "txt": "有业内人士分析指出"} -{"key": "BAC009S0906W0312", "wav": "./aishell/wav/test/S0906/BAC009S0906W0312.wav", "txt": "车企在信息安全方面的投入不足已经越来越成为其软肋"} -{"key": "BAC009S0906W0313", "wav": "./aishell/wav/test/S0906/BAC009S0906W0313.wav", "txt": "其中近一半的漏洞都可能造成网站用户的信息泄露"} -{"key": "BAC009S0906W0314", "wav": "./aishell/wav/test/S0906/BAC009S0906W0314.wav", "txt": "背后涉及到百万车主的信息安全"} -{"key": "BAC009S0906W0315", "wav": "./aishell/wav/test/S0906/BAC009S0906W0315.wav", "txt": "而绝大多数漏洞状态都是未联系到厂商或厂商忽略"} -{"key": "BAC009S0906W0316", "wav": "./aishell/wav/test/S0906/BAC009S0906W0316.wav", "txt": "汽车这个行业缺乏成熟的网络安全管理体系"} -{"key": "BAC009S0906W0317", "wav": "./aishell/wav/test/S0906/BAC009S0906W0317.wav", "txt": "网络运营人员的安全素质有待提高"} -{"key": "BAC009S0906W0318", "wav": "./aishell/wav/test/S0906/BAC009S0906W0318.wav", "txt": "很多车企网站是外包给第三方公司开发的"} -{"key": "BAC009S0906W0319", "wav": "./aishell/wav/test/S0906/BAC009S0906W0319.wav", "txt": "没有交付信息安全公司进行评估"} -{"key": "BAC009S0906W0320", "wav": "./aishell/wav/test/S0906/BAC009S0906W0320.wav", "txt": "因此更有可能留下信息安全风险"} -{"key": "BAC009S0906W0321", "wav": "./aishell/wav/test/S0906/BAC009S0906W0321.wav", "txt": "用户隐私遭泄露的问题日益突出"} -{"key": "BAC009S0906W0322", "wav": "./aishell/wav/test/S0906/BAC009S0906W0322.wav", "txt": "如果许多传统制造行业中的企业一样"} -{"key": "BAC009S0906W0323", "wav": "./aishell/wav/test/S0906/BAC009S0906W0323.wav", "txt": "车企诚待转化互联网思维以及加强互联网安全管控等"} -{"key": "BAC009S0906W0324", "wav": "./aishell/wav/test/S0906/BAC009S0906W0324.wav", "txt": "要跟上互联网发展的步伐不太容易"} -{"key": "BAC009S0906W0325", "wav": "./aishell/wav/test/S0906/BAC009S0906W0325.wav", "txt": "随着互联网快速发展"} 
-{"key": "BAC009S0906W0326", "wav": "./aishell/wav/test/S0906/BAC009S0906W0326.wav", "txt": "这类专业人才往往集中在互联网企业"} -{"key": "BAC009S0906W0327", "wav": "./aishell/wav/test/S0906/BAC009S0906W0327.wav", "txt": "而车企相对缺乏这类人才"} -{"key": "BAC009S0906W0328", "wav": "./aishell/wav/test/S0906/BAC009S0906W0328.wav", "txt": "网络安全管理体系方面投资非常大"} -{"key": "BAC009S0906W0329", "wav": "./aishell/wav/test/S0906/BAC009S0906W0329.wav", "txt": "涉及人才软件硬件服务以及管理等方面"} -{"key": "BAC009S0906W0330", "wav": "./aishell/wav/test/S0906/BAC009S0906W0330.wav", "txt": "互联网企业也是一步步投入不断完善"} -{"key": "BAC009S0906W0331", "wav": "./aishell/wav/test/S0906/BAC009S0906W0331.wav", "txt": "不同行业在网络安全方面投入比例不一"} -{"key": "BAC009S0906W0332", "wav": "./aishell/wav/test/S0906/BAC009S0906W0332.wav", "txt": "预计汽车行业在网络安全方面投入往往较少"} -{"key": "BAC009S0906W0333", "wav": "./aishell/wav/test/S0906/BAC009S0906W0333.wav", "txt": "一些车企为了节约成本"} -{"key": "BAC009S0906W0334", "wav": "./aishell/wav/test/S0906/BAC009S0906W0334.wav", "txt": "往往将数据库服务器都放在公网上"} -{"key": "BAC009S0906W0335", "wav": "./aishell/wav/test/S0906/BAC009S0906W0335.wav", "txt": "这样很容易被黑客攻破"} -{"key": "BAC009S0906W0336", "wav": "./aishell/wav/test/S0906/BAC009S0906W0336.wav", "txt": "一旦发现系统有漏洞"} -{"key": "BAC009S0906W0337", "wav": "./aishell/wav/test/S0906/BAC009S0906W0337.wav", "txt": "将及时采取主动或被动措施"} -{"key": "BAC009S0906W0338", "wav": "./aishell/wav/test/S0906/BAC009S0906W0338.wav", "txt": "拉德克利夫认为自己被暗指有问题"} -{"key": "BAC009S0906W0339", "wav": "./aishell/wav/test/S0906/BAC009S0906W0339.wav", "txt": "但检测结果的异常并不能就证实运动员使用违禁药物"} -{"key": "BAC009S0906W0340", "wav": "./aishell/wav/test/S0906/BAC009S0906W0340.wav", "txt": "因为导致这项数值波动的原因有很多"} -{"key": "BAC009S0906W0341", "wav": "./aishell/wav/test/S0906/BAC009S0906W0341.wav", "txt": "包括高原训练或身体过度消耗后立刻接受检测"} -{"key": "BAC009S0906W0342", "wav": "./aishell/wav/test/S0906/BAC009S0906W0342.wav", "txt": "因此我请求世界反兴奋剂机构回顾前后所有的数据"} -{"key": "BAC009S0906W0343", "wav": "./aishell/wav/test/S0906/BAC009S0906W0343.wav", "txt": "盘点昆仑决二零一五五大飙血之战搜狐体育"} -{"key": "BAC009S0906W0344", "wav": "./aishell/wav/test/S0906/BAC009S0906W0344.wav", "txt": "无疑是擂台上最能引爆肾上腺素的震撼一幕"} -{"key": "BAC009S0906W0345", "wav": "./aishell/wav/test/S0906/BAC009S0906W0345.wav", "txt": "令拳迷记忆犹新的飙血之战不计其数"} -{"key": "BAC009S0906W0346", "wav": "./aishell/wav/test/S0906/BAC009S0906W0346.wav", "txt": "而这些战斗也成为了圈内久聊不厌的经典谈资"} -{"key": "BAC009S0906W0347", "wav": "./aishell/wav/test/S0906/BAC009S0906W0347.wav", "txt": "在数百场真枪实弹的巅峰对决中"} -{"key": "BAC009S0906W0348", "wav": "./aishell/wav/test/S0906/BAC009S0906W0348.wav", "txt": "不乏诸多脍炙人口的飙血之战"} -{"key": "BAC009S0906W0349", "wav": "./aishell/wav/test/S0906/BAC009S0906W0349.wav", "txt": "十月二十八日与三十一日"} -{"key": "BAC009S0906W0350", "wav": "./aishell/wav/test/S0906/BAC009S0906W0350.wav", "txt": "下面小编将盘点本年度迄今为止昆仑决五大惨烈血战"} -{"key": "BAC009S0906W0353", "wav": "./aishell/wav/test/S0906/BAC009S0906W0353.wav", "txt": "北京时间七月二十八日晚"} -{"key": "BAC009S0906W0354", "wav": "./aishell/wav/test/S0906/BAC009S0906W0354.wav", "txt": "这场对决的惨烈程度超出了所有人的想象"} -{"key": "BAC009S0906W0355", "wav": "./aishell/wav/test/S0906/BAC009S0906W0355.wav", "txt": "比赛开始后仅仅十馀秒"} -{"key": "BAC009S0906W0356", "wav": "./aishell/wav/test/S0906/BAC009S0906W0356.wav", "txt": "播求的头部便被对方的肘击割破"} -{"key": "BAC009S0906W0357", "wav": "./aishell/wav/test/S0906/BAC009S0906W0357.wav", "txt": "打出一道深深的血口"} -{"key": "BAC009S0906W0358", "wav": "./aishell/wav/test/S0906/BAC009S0906W0358.wav", "txt": "伴随着双方激战的火爆升级"} -{"key": "BAC009S0906W0359", "wav": "./aishell/wav/test/S0906/BAC009S0906W0359.wav", "txt": 
"播求头部的伤口进一步扩大"} -{"key": "BAC009S0906W0360", "wav": "./aishell/wav/test/S0906/BAC009S0906W0360.wav", "txt": "几乎全部被鲜血复盖的半边身体令人触目惊心"} -{"key": "BAC009S0906W0361", "wav": "./aishell/wav/test/S0906/BAC009S0906W0361.wav", "txt": "双方的肘击对轰场面接连上演"} -{"key": "BAC009S0906W0362", "wav": "./aishell/wav/test/S0906/BAC009S0906W0362.wav", "txt": "哈亚的肘击刁钻狠辣"} -{"key": "BAC009S0906W0363", "wav": "./aishell/wav/test/S0906/BAC009S0906W0363.wav", "txt": "直肘反肘交替使用"} -{"key": "BAC009S0906W0364", "wav": "./aishell/wav/test/S0906/BAC009S0906W0364.wav", "txt": "令人防不胜防播求的肘击则更具王者霸气"} -{"key": "BAC009S0906W0365", "wav": "./aishell/wav/test/S0906/BAC009S0906W0365.wav", "txt": "以大刀阔斧的摆肘砸肘为主"} -{"key": "BAC009S0906W0366", "wav": "./aishell/wav/test/S0906/BAC009S0906W0366.wav", "txt": "凶悍直接大开大合"} -{"key": "BAC009S0906W0367", "wav": "./aishell/wav/test/S0906/BAC009S0906W0367.wav", "txt": "加之其半身浴血的黝黑健美体魄"} -{"key": "BAC009S0906W0368", "wav": "./aishell/wav/test/S0906/BAC009S0906W0368.wav", "txt": "颇似从地狱中走出的修罗帝王"} -{"key": "BAC009S0906W0369", "wav": "./aishell/wav/test/S0906/BAC009S0906W0369.wav", "txt": "播求久负盛名的扫腿与冲膝技术开始发威"} -{"key": "BAC009S0906W0370", "wav": "./aishell/wav/test/S0906/BAC009S0906W0370.wav", "txt": "令对手不再敢贸然近身"} -{"key": "BAC009S0906W0371", "wav": "./aishell/wav/test/S0906/BAC009S0906W0371.wav", "txt": "不得不暂停比赛进行处理"} -{"key": "BAC009S0906W0372", "wav": "./aishell/wav/test/S0906/BAC009S0906W0372.wav", "txt": "双方均向对手发起了不遗馀力的猛攻"} -{"key": "BAC009S0906W0373", "wav": "./aishell/wav/test/S0906/BAC009S0906W0373.wav", "txt": "这场史诗级的双王血战在两大强者最后的对决中"} -{"key": "BAC009S0906W0374", "wav": "./aishell/wav/test/S0906/BAC009S0906W0374.wav", "txt": "迎来了结束铃声的敲响"} -{"key": "BAC009S0906W0375", "wav": "./aishell/wav/test/S0906/BAC009S0906W0375.wav", "txt": "哈立以争议性的点数优势宣告获胜"} -{"key": "BAC009S0906W0376", "wav": "./aishell/wav/test/S0906/BAC009S0906W0376.wav", "txt": "浑身是血的播求由于头部三处动脉破裂失血过多"} -{"key": "BAC009S0906W0377", "wav": "./aishell/wav/test/S0906/BAC009S0906W0377.wav", "txt": "被立刻送往医院接受紧急输血治疗"} -{"key": "BAC009S0906W0378", "wav": "./aishell/wav/test/S0906/BAC009S0906W0378.wav", "txt": "也被送往医院进行抢救"} -{"key": "BAC009S0906W0379", "wav": "./aishell/wav/test/S0906/BAC009S0906W0379.wav", "txt": "对于任何一个目睹了整场比赛过程的人来讲"} -{"key": "BAC009S0906W0380", "wav": "./aishell/wav/test/S0906/BAC009S0906W0380.wav", "txt": "这场史诗级惊天血战中没有失败者"} -{"key": "BAC009S0906W0383", "wav": "./aishell/wav/test/S0906/BAC009S0906W0383.wav", "txt": "二零一五年六月七日"} -{"key": "BAC009S0906W0384", "wav": "./aishell/wav/test/S0906/BAC009S0906W0384.wav", "txt": "昆仑决雄霸山城在重庆江南体育馆重装上阵"} -{"key": "BAC009S0906W0385", "wav": "./aishell/wav/test/S0906/BAC009S0906W0385.wav", "txt": "面对身高臂展明显占优的对手"} -{"key": "BAC009S0906W0386", "wav": "./aishell/wav/test/S0906/BAC009S0906W0386.wav", "txt": "雅桑克莱并没有采取矮个子拳手惯用的闪击式打法"} -{"key": "BAC009S0906W0387", "wav": "./aishell/wav/test/S0906/BAC009S0906W0387.wav", "txt": "而是王气十足地向对手进行正面逼近"} -{"key": "BAC009S0906W0388", "wav": "./aishell/wav/test/S0906/BAC009S0906W0388.wav", "txt": "雅桑克莱的优势继续在扩大"} -{"key": "BAC009S0906W0389", "wav": "./aishell/wav/test/S0906/BAC009S0906W0389.wav", "txt": "标志性的扫腿重击力道沉猛的后手重拳纷纷呼啸而出"} -{"key": "BAC009S0906W0390", "wav": "./aishell/wav/test/S0906/BAC009S0906W0390.wav", "txt": "在其左扫腿无情踢击之下"} -{"key": "BAC009S0906W0391", "wav": "./aishell/wav/test/S0906/BAC009S0906W0391.wav", "txt": "祖耶夫的右肋很快便被踢出大片鲜红的淤血斑痕"} -{"key": "BAC009S0906W0392", "wav": "./aishell/wav/test/S0906/BAC009S0906W0392.wav", "txt": "经验丰富的雅桑克莱开始刻意放缓节奏"} -{"key": "BAC009S0906W0393", "wav": "./aishell/wav/test/S0906/BAC009S0906W0393.wav", "txt": 
"对已是强弩之末的对手进行消耗"} -{"key": "BAC009S0906W0394", "wav": "./aishell/wav/test/S0906/BAC009S0906W0394.wav", "txt": "此时的祖耶夫右眼已经肿胀得完全封闭"} -{"key": "BAC009S0906W0395", "wav": "./aishell/wav/test/S0906/BAC009S0906W0395.wav", "txt": "只能依靠顽强的意志进行支撑"} -{"key": "BAC009S0906W0396", "wav": "./aishell/wav/test/S0906/BAC009S0906W0396.wav", "txt": "雅桑克莱的组合拳将祖耶夫重重击倒然而"} -{"key": "BAC009S0906W0397", "wav": "./aishell/wav/test/S0906/BAC009S0906W0397.wav", "txt": "意志力惊人的白俄罗斯特种兵被没有就此放弃"} -{"key": "BAC009S0906W0398", "wav": "./aishell/wav/test/S0906/BAC009S0906W0398.wav", "txt": "顽强的意志力博得了对手以及全场观众致意"} -{"key": "BAC009S0906W0399", "wav": "./aishell/wav/test/S0906/BAC009S0906W0399.wav", "txt": "比赛在两名王者最后的对决中"} -{"key": "BAC009S0906W0400", "wav": "./aishell/wav/test/S0906/BAC009S0906W0400.wav", "txt": "比赛结果已经无需裁判的裁定"} -{"key": "BAC009S0906W0401", "wav": "./aishell/wav/test/S0906/BAC009S0906W0401.wav", "txt": "但看两人比赛后的面部状况"} -{"key": "BAC009S0906W0402", "wav": "./aishell/wav/test/S0906/BAC009S0906W0402.wav", "txt": "夺得了自己在昆仑拳坛上的第二场重要胜利"} -{"key": "BAC009S0906W0404", "wav": "./aishell/wav/test/S0906/BAC009S0906W0404.wav", "txt": "布拉德皮特新片狂怒接受了宣传媒体拍照"} -{"key": "BAC009S0906W0405", "wav": "./aishell/wav/test/S0906/BAC009S0906W0405.wav", "txt": "我们可以清晰看到皮特的结婚戒指"} -{"key": "BAC009S0906W0406", "wav": "./aishell/wav/test/S0906/BAC009S0906W0406.wav", "txt": "今天确定了上映日期二零一七年四月十七日"} -{"key": "BAC009S0906W0407", "wav": "./aishell/wav/test/S0906/BAC009S0906W0407.wav", "txt": "这是后年春季档的一个黄金上映期"} -{"key": "BAC009S0906W0408", "wav": "./aishell/wav/test/S0906/BAC009S0906W0408.wav", "txt": "看来郑嘉颖是她的初恋"} -{"key": "BAC009S0906W0409", "wav": "./aishell/wav/test/S0906/BAC009S0906W0409.wav", "txt": "问到他们在法国拍戏定情的细节"} -{"key": "BAC009S0906W0410", "wav": "./aishell/wav/test/S0906/BAC009S0906W0410.wav", "txt": "陈凯琳也拒绝回答"} -{"key": "BAC009S0906W0411", "wav": "./aishell/wav/test/S0906/BAC009S0906W0411.wav", "txt": "但就希望外界多给予他们发展空间"} -{"key": "BAC009S0906W0412", "wav": "./aishell/wav/test/S0906/BAC009S0906W0412.wav", "txt": "搜狐娱乐讯据香港媒体报道"} -{"key": "BAC009S0906W0413", "wav": "./aishell/wav/test/S0906/BAC009S0906W0413.wav", "txt": "早前有传媒更拍到陈凯琳直上嘉颖住所短聚"} -{"key": "BAC009S0906W0414", "wav": "./aishell/wav/test/S0906/BAC009S0906W0414.wav", "txt": "父女恋纸包不住火"} -{"key": "BAC009S0906W0415", "wav": "./aishell/wav/test/S0906/BAC009S0906W0415.wav", "txt": "两人于异国拍外景晨夕相对"} -{"key": "BAC009S0906W0416", "wav": "./aishell/wav/test/S0906/BAC009S0906W0416.wav", "txt": "感情一日千里"} -{"key": "BAC009S0906W0418", "wav": "./aishell/wav/test/S0906/BAC009S0906W0418.wav", "txt": "陈势安两天一夜没洗澡刷牙"} -{"key": "BAC009S0906W0419", "wav": "./aishell/wav/test/S0906/BAC009S0906W0419.wav", "txt": "猛嗑薄荷喉糖"} -{"key": "BAC009S0906W0420", "wav": "./aishell/wav/test/S0906/BAC009S0906W0420.wav", "txt": "搜狐娱乐讯据台湾媒体报道"} -{"key": "BAC009S0906W0421", "wav": "./aishell/wav/test/S0906/BAC009S0906W0421.wav", "txt": "香港女星吴君如与导演陈可辛爱情长跑十八年"} -{"key": "BAC009S0906W0422", "wav": "./aishell/wav/test/S0906/BAC009S0906W0422.wav", "txt": "虽然没有注册结婚"} -{"key": "BAC009S0906W0423", "wav": "./aishell/wav/test/S0906/BAC009S0906W0423.wav", "txt": "但两人关系比一般夫妻更加紧密"} -{"key": "BAC009S0906W0424", "wav": "./aishell/wav/test/S0906/BAC009S0906W0424.wav", "txt": "她日前被媒体目击与陈可辛在大街上逛街血拼"} -{"key": "BAC009S0906W0425", "wav": "./aishell/wav/test/S0906/BAC009S0906W0425.wav", "txt": "且沿途有说有笑"} -{"key": "BAC009S0906W0426", "wav": "./aishell/wav/test/S0906/BAC009S0906W0426.wav", "txt": "一路上都十指紧扣"} -{"key": "BAC009S0906W0427", "wav": "./aishell/wav/test/S0906/BAC009S0906W0427.wav", "txt": "甜蜜恩爱的模样彷彿热恋中的情侣"} -{"key": 
"BAC009S0906W0428", "wav": "./aishell/wav/test/S0906/BAC009S0906W0428.wav", "txt": "搜狐娱乐讯据香港媒体报导"} -{"key": "BAC009S0906W0429", "wav": "./aishell/wav/test/S0906/BAC009S0906W0429.wav", "txt": "一直邀请陈善之担任经理人"} -{"key": "BAC009S0906W0430", "wav": "./aishell/wav/test/S0906/BAC009S0906W0430.wav", "txt": "并兼任李嘉欣经理人及处理旗下其他艺人的合约事宜"} -{"key": "BAC009S0906W0431", "wav": "./aishell/wav/test/S0906/BAC009S0906W0431.wav", "txt": "执法人员将王靖苏押解回温州"} -{"key": "BAC009S0906W0432", "wav": "./aishell/wav/test/S0906/BAC009S0906W0432.wav", "txt": "温州水库沉车案现男女腐尸女方事发前行为古怪"} -{"key": "BAC009S0906W0433", "wav": "./aishell/wav/test/S0906/BAC009S0906W0433.wav", "txt": "温州沙城街道一民房发生火灾已造成四人死亡"} -{"key": "BAC009S0906W0434", "wav": "./aishell/wav/test/S0906/BAC009S0906W0434.wav", "txt": "温州沙城街道一民房今晨发生火灾已造成四人死亡"} -{"key": "BAC009S0906W0436", "wav": "./aishell/wav/test/S0906/BAC009S0906W0436.wav", "txt": "沙城街道七五村永安路一二五号一民房发生火灾"} -{"key": "BAC009S0906W0437", "wav": "./aishell/wav/test/S0906/BAC009S0906W0437.wav", "txt": "一时一零分火势完全扑灭"} -{"key": "BAC009S0906W0438", "wav": "./aishell/wav/test/S0906/BAC009S0906W0438.wav", "txt": "火灾造成四人死亡一人受伤"} -{"key": "BAC009S0906W0439", "wav": "./aishell/wav/test/S0906/BAC009S0906W0439.wav", "txt": "伤者目前在解放军第一一八医院进行治疗"} -{"key": "BAC009S0906W0441", "wav": "./aishell/wav/test/S0906/BAC009S0906W0441.wav", "txt": "温州惊现最牛菜场温州的状元农贸市场"} -{"key": "BAC009S0906W0443", "wav": "./aishell/wav/test/S0906/BAC009S0906W0443.wav", "txt": "买菜用支付宝扫码付钱"} -{"key": "BAC009S0906W0444", "wav": "./aishell/wav/test/S0906/BAC009S0906W0444.wav", "txt": "听说过段时间还要上场智能秤"} -{"key": "BAC009S0906W0445", "wav": "./aishell/wav/test/S0906/BAC009S0906W0445.wav", "txt": "用智能秤称重将自动生成二维码"} -{"key": "BAC009S0906W0446", "wav": "./aishell/wav/test/S0906/BAC009S0906W0446.wav", "txt": "用支付宝扫一下就能付款"} -{"key": "BAC009S0906W0447", "wav": "./aishell/wav/test/S0906/BAC009S0906W0447.wav", "txt": "温州美女学霸将赴非洲支教教当地小学生汉语"} -{"key": "BAC009S0906W0448", "wav": "./aishell/wav/test/S0906/BAC009S0906W0448.wav", "txt": "麻丽贤等一七位志愿者将远赴非洲支教"} -{"key": "BAC009S0906W0449", "wav": "./aishell/wav/test/S0906/BAC009S0906W0449.wav", "txt": "温州老人卖房筹四八零零万建养老院赠政府遭闲置"} -{"key": "BAC009S0906W0450", "wav": "./aishell/wav/test/S0906/BAC009S0906W0450.wav", "txt": "为了实现退休后能建一座养老机构"} -{"key": "BAC009S0906W0451", "wav": "./aishell/wav/test/S0906/BAC009S0906W0451.wav", "txt": "为更多的老人安度往年的心愿"} -{"key": "BAC009S0906W0452", "wav": "./aishell/wav/test/S0906/BAC009S0906W0452.wav", "txt": "浙江温州一老人拿出全部积蓄并卖掉两套房子"} -{"key": "BAC009S0906W0453", "wav": "./aishell/wav/test/S0906/BAC009S0906W0453.wav", "txt": "筹款四八零零万经六年建成养老院"} -{"key": "BAC009S0906W0454", "wav": "./aishell/wav/test/S0906/BAC009S0906W0454.wav", "txt": "捐给当地慈善部门后却遭闲置三年"} -{"key": "BAC009S0906W0455", "wav": "./aishell/wav/test/S0906/BAC009S0906W0455.wav", "txt": "温州苍南县看守所民警宿舍楼起火无人员伤亡"} -{"key": "BAC009S0906W0456", "wav": "./aishell/wav/test/S0906/BAC009S0906W0456.wav", "txt": "八月三日上午一一时左右"} -{"key": "BAC009S0906W0457", "wav": "./aishell/wav/test/S0906/BAC009S0906W0457.wav", "txt": "温州苍南县看守所一宿舍起火"} -{"key": "BAC009S0906W0458", "wav": "./aishell/wav/test/S0906/BAC009S0906W0458.wav", "txt": "该市苍南县公安局直属县看守所突发火情"} -{"key": "BAC009S0906W0459", "wav": "./aishell/wav/test/S0906/BAC009S0906W0459.wav", "txt": "所内民警宿舍楼突发大火"} -{"key": "BAC009S0906W0460", "wav": "./aishell/wav/test/S0906/BAC009S0906W0460.wav", "txt": "在看守所干警及消防人员的扑救下火势很快被扑灭"} -{"key": "BAC009S0906W0461", "wav": "./aishell/wav/test/S0906/BAC009S0906W0461.wav", "txt": "温州话到底有多难懂"} -{"key": "BAC009S0906W0463", "wav": "./aishell/wav/test/S0906/BAC009S0906W0463.wav", "txt": 
"大家对温州话难懂这事儿略有耳闻"} -{"key": "BAC009S0906W0464", "wav": "./aishell/wav/test/S0906/BAC009S0906W0464.wav", "txt": "一直被认为是全中国最难学习的方言之一"} -{"key": "BAC009S0906W0465", "wav": "./aishell/wav/test/S0906/BAC009S0906W0465.wav", "txt": "温州贩卖婴儿大案女医生假称婴儿已死然后卖掉"} -{"key": "BAC009S0906W0466", "wav": "./aishell/wav/test/S0906/BAC009S0906W0466.wav", "txt": "警方先后解救了一六名婴儿"} -{"key": "BAC009S0906W0467", "wav": "./aishell/wav/test/S0906/BAC009S0906W0467.wav", "txt": "有六个被送往苍南福利院"} -{"key": "BAC009S0906W0468", "wav": "./aishell/wav/test/S0906/BAC009S0906W0468.wav", "txt": "图为其中一名被解救的孩子"} -{"key": "BAC009S0906W0469", "wav": "./aishell/wav/test/S0906/BAC009S0906W0469.wav", "txt": "温州集资诈骗案犯汇给情人四千万小三被诉"} -{"key": "BAC009S0906W0470", "wav": "./aishell/wav/test/S0906/BAC009S0906W0470.wav", "txt": "二九岁的章某被控洗钱一二二万元"} -{"key": "BAC009S0906W0471", "wav": "./aishell/wav/test/S0906/BAC009S0906W0471.wav", "txt": "温州鞋业总经理遭追杀凶手行凶过程中被打死"} -{"key": "BAC009S0906W0472", "wav": "./aishell/wav/test/S0906/BAC009S0906W0472.wav", "txt": "陆续有人从乐清赶往平阳法院"} -{"key": "BAC009S0906W0473", "wav": "./aishell/wav/test/S0906/BAC009S0906W0473.wav", "txt": "平阳法院内外已聚集了三零零多人等待开庭"} -{"key": "BAC009S0906W0474", "wav": "./aishell/wav/test/S0906/BAC009S0906W0474.wav", "txt": "温州首家支付宝菜市场启动一周很多摊主不会用"} -{"key": "BAC009S0906W0475", "wav": "./aishell/wav/test/S0906/BAC009S0906W0475.wav", "txt": "状元农贸市场内挂着支付宝的宣传牌"} -{"key": "BAC009S0906W0476", "wav": "./aishell/wav/test/S0906/BAC009S0906W0476.wav", "txt": "温州高三男生坠楼身亡事发前无异常刚从家返校"} -{"key": "BAC009S0906W0477", "wav": "./aishell/wav/test/S0906/BAC009S0906W0477.wav", "txt": "龙湾永强中学一名高三男生从宿舍楼五楼楼顶坠楼身亡"} -{"key": "BAC009S0906W0478", "wav": "./aishell/wav/test/S0906/BAC009S0906W0478.wav", "txt": "永强中学校长也是坠楼学生的语文老师"} -{"key": "BAC009S0906W0479", "wav": "./aishell/wav/test/S0906/BAC009S0906W0479.wav", "txt": "印象里他性格是比较开朗的"} -{"key": "BAC009S0906W0480", "wav": "./aishell/wav/test/S0906/BAC009S0906W0480.wav", "txt": "没有发现近期有异常变化目前"} -{"key": "BAC009S0906W0481", "wav": "./aishell/wav/test/S0906/BAC009S0906W0481.wav", "txt": "龙湾警方已对此事展开调查"} -{"key": "BAC009S0906W0482", "wav": "./aishell/wav/test/S0906/BAC009S0906W0482.wav", "txt": "温州高速公路大米遭抢续五名涉案人员已落网"} -{"key": "BAC009S0906W0483", "wav": "./aishell/wav/test/S0906/BAC009S0906W0483.wav", "txt": "白花花的大米洒了一地"} -{"key": "BAC009S0906W0484", "wav": "./aishell/wav/test/S0906/BAC009S0906W0484.wav", "txt": "引来周边大批村民哄抢"} -{"key": "BAC009S0906W0485", "wav": "./aishell/wav/test/S0906/BAC009S0906W0485.wav", "txt": "一场考验道德与良知的大米保卫战悄然打响"} -{"key": "BAC009S0906W0486", "wav": "./aishell/wav/test/S0906/BAC009S0906W0486.wav", "txt": "温州鹿城警方发布通报称"} -{"key": "BAC009S0906W0487", "wav": "./aishell/wav/test/S0906/BAC009S0906W0487.wav", "txt": "五名涉嫌参与抢米的犯罪嫌疑人先后被抓获并拘留"} -{"key": "BAC009S0906W0488", "wav": "./aishell/wav/test/S0906/BAC009S0906W0488.wav", "txt": "民警仍在对其馀涉事人员进行调查"} -{"key": "BAC009S0906W0489", "wav": "./aishell/wav/test/S0906/BAC009S0906W0489.wav", "txt": "温州高速车祸九二包大米遭哄抢续带头者被拘"} -{"key": "BAC009S0906W0490", "wav": "./aishell/wav/test/S0906/BAC009S0906W0490.wav", "txt": "一辆货车在金丽温高速温州段发生事故"} -{"key": "BAC009S0906W0491", "wav": "./aishell/wav/test/S0906/BAC009S0906W0491.wav", "txt": "涉案的其中两名嫌疑人陈某女"} -{"key": "BAC009S0906W0492", "wav": "./aishell/wav/test/S0906/BAC009S0906W0492.wav", "txt": "永嘉县人谢某女"} -{"key": "BAC009S0906W0493", "wav": "./aishell/wav/test/S0906/BAC009S0906W0493.wav", "txt": "永嘉县人已被鹿城警方依法行政拘留"} -{"key": "BAC009S0906W0494", "wav": "./aishell/wav/test/S0906/BAC009S0906W0494.wav", "txt": "港京航班六名乘客推撞地勤四人被判九至一一天监禁"} -{"key": "BAC009S0907W0121", "wav": 
"./aishell/wav/test/S0907/BAC009S0907W0121.wav", "txt": "也不代表开发商资金面已经不再紧张"} -{"key": "BAC009S0907W0122", "wav": "./aishell/wav/test/S0907/BAC009S0907W0122.wav", "txt": "背后可能蕴含着开发商更多的窘境"} -{"key": "BAC009S0907W0123", "wav": "./aishell/wav/test/S0907/BAC009S0907W0123.wav", "txt": "本世纪网至本世纪经济报道"} -{"key": "BAC009S0907W0124", "wav": "./aishell/wav/test/S0907/BAC009S0907W0124.wav", "txt": "上海南昌等城市近期继续松绑了公积金贷款政策"} -{"key": "BAC009S0907W0125", "wav": "./aishell/wav/test/S0907/BAC009S0907W0125.wav", "txt": "而南昌除了放松首套房界定标准"} -{"key": "BAC009S0907W0126", "wav": "./aishell/wav/test/S0907/BAC009S0907W0126.wav", "txt": "还降低了首套房公积金首付"} -{"key": "BAC009S0907W0127", "wav": "./aishell/wav/test/S0907/BAC009S0907W0127.wav", "txt": "国家住房银行箭在弦上"} -{"key": "BAC009S0907W0128", "wav": "./aishell/wav/test/S0907/BAC009S0907W0128.wav", "txt": "住建部官员发表文章指出"} -{"key": "BAC009S0907W0129", "wav": "./aishell/wav/test/S0907/BAC009S0907W0129.wav", "txt": "以住房公积金制度为基础"} -{"key": "BAC009S0907W0130", "wav": "./aishell/wav/test/S0907/BAC009S0907W0130.wav", "txt": "设立国家住房银行条件已经基本成熟"} -{"key": "BAC009S0907W0131", "wav": "./aishell/wav/test/S0907/BAC009S0907W0131.wav", "txt": "国家住房银行是否箭在弦上"} -{"key": "BAC009S0907W0132", "wav": "./aishell/wav/test/S0907/BAC009S0907W0132.wav", "txt": "其成立需具备哪些条件"} -{"key": "BAC009S0907W0133", "wav": "./aishell/wav/test/S0907/BAC009S0907W0133.wav", "txt": "以住房公积金制度为基础"} -{"key": "BAC009S0907W0134", "wav": "./aishell/wav/test/S0907/BAC009S0907W0134.wav", "txt": "设立政策性住宅金融机构"} -{"key": "BAC009S0907W0135", "wav": "./aishell/wav/test/S0907/BAC009S0907W0135.wav", "txt": "此机构即是住房银行"} -{"key": "BAC009S0907W0136", "wav": "./aishell/wav/test/S0907/BAC009S0907W0136.wav", "txt": "设立住房银行的条件已基本成熟"} -{"key": "BAC009S0907W0137", "wav": "./aishell/wav/test/S0907/BAC009S0907W0137.wav", "txt": "改进住房公积金提取使用监管机制"} -{"key": "BAC009S0907W0138", "wav": "./aishell/wav/test/S0907/BAC009S0907W0138.wav", "txt": "全国住房公积金七万亿元"} -{"key": "BAC009S0907W0139", "wav": "./aishell/wav/test/S0907/BAC009S0907W0139.wav", "txt": "住房维修资金约七亿元"} -{"key": "BAC009S0907W0140", "wav": "./aishell/wav/test/S0907/BAC009S0907W0140.wav", "txt": "如允许每年发行专项金融债券七万亿元"} -{"key": "BAC009S0907W0141", "wav": "./aishell/wav/test/S0907/BAC009S0907W0141.wav", "txt": "今年资金规模接近七万亿元"} -{"key": "BAC009S0907W0142", "wav": "./aishell/wav/test/S0907/BAC009S0907W0142.wav", "txt": "明年预计达到七万亿元"} -{"key": "BAC009S0907W0143", "wav": "./aishell/wav/test/S0907/BAC009S0907W0143.wav", "txt": "可基本满足首套和改善性自住住房的低息贷款需求"} -{"key": "BAC009S0907W0144", "wav": "./aishell/wav/test/S0907/BAC009S0907W0144.wav", "txt": "三是已有人员和机构"} -{"key": "BAC009S0907W0145", "wav": "./aishell/wav/test/S0907/BAC009S0907W0145.wav", "txt": "全国共有管理中心一百个"} -{"key": "BAC009S0907W0146", "wav": "./aishell/wav/test/S0907/BAC009S0907W0146.wav", "txt": "业务网点一千个"} -{"key": "BAC009S0907W0147", "wav": "./aishell/wav/test/S0907/BAC009S0907W0147.wav", "txt": "从业人员五万人"} -{"key": "BAC009S0907W0148", "wav": "./aishell/wav/test/S0907/BAC009S0907W0148.wav", "txt": "可充分利用这些机构网点和人员"} -{"key": "BAC009S0907W0149", "wav": "./aishell/wav/test/S0907/BAC009S0907W0149.wav", "txt": "组建国家住房银行分行和支行"} -{"key": "BAC009S0907W0150", "wav": "./aishell/wav/test/S0907/BAC009S0907W0150.wav", "txt": "对各地分支机构实行垂直管理"} -{"key": "BAC009S0907W0151", "wav": "./aishell/wav/test/S0907/BAC009S0907W0151.wav", "txt": "全国住房公积金贷款风险准备金已接近一百亿元"} -{"key": "BAC009S0907W0152", "wav": "./aishell/wav/test/S0907/BAC009S0907W0152.wav", "txt": "其中五亿元为超额拨备"} -{"key": "BAC009S0907W0153", "wav": "./aishell/wav/test/S0907/BAC009S0907W0153.wav", "txt": "可转化为住房银行资本金"} 
-{"key": "BAC009S0907W0154", "wav": "./aishell/wav/test/S0907/BAC009S0907W0154.wav", "txt": "设立住房银行好处多多"} -{"key": "BAC009S0907W0155", "wav": "./aishell/wav/test/S0907/BAC009S0907W0155.wav", "txt": "提高家庭购房能力"} -{"key": "BAC009S0907W0156", "wav": "./aishell/wav/test/S0907/BAC009S0907W0156.wav", "txt": "通过国家住房银行提供低息贷款"} -{"key": "BAC009S0907W0157", "wav": "./aishell/wav/test/S0907/BAC009S0907W0157.wav", "txt": "可以解决贷款难和贷款贵问题"} -{"key": "BAC009S0907W0158", "wav": "./aishell/wav/test/S0907/BAC009S0907W0158.wav", "txt": "有效提高家庭购房能力"} -{"key": "BAC009S0907W0159", "wav": "./aishell/wav/test/S0907/BAC009S0907W0159.wav", "txt": "完善宏观调控机制"} -{"key": "BAC009S0907W0160", "wav": "./aishell/wav/test/S0907/BAC009S0907W0160.wav", "txt": "可以有效解决商业银行顺周期操作问题"} -{"key": "BAC009S0907W0161", "wav": "./aishell/wav/test/S0907/BAC009S0907W0161.wav", "txt": "避免房地产市场大起大落"} -{"key": "BAC009S0907W0162", "wav": "./aishell/wav/test/S0907/BAC009S0907W0162.wav", "txt": "拓展货币政策操作空间"} -{"key": "BAC009S0907W0163", "wav": "./aishell/wav/test/S0907/BAC009S0907W0163.wav", "txt": "为利率市场化改革创造条件"} -{"key": "BAC009S0907W0164", "wav": "./aishell/wav/test/S0907/BAC009S0907W0164.wav", "txt": "促进新型城镇化发展"} -{"key": "BAC009S0907W0165", "wav": "./aishell/wav/test/S0907/BAC009S0907W0165.wav", "txt": "将农民工纳入住房公积金制度"} -{"key": "BAC009S0907W0166", "wav": "./aishell/wav/test/S0907/BAC009S0907W0166.wav", "txt": "积累在城镇购房首期付款"} -{"key": "BAC009S0907W0167", "wav": "./aishell/wav/test/S0907/BAC009S0907W0167.wav", "txt": "再由国家住房银行提供低息贷款"} -{"key": "BAC009S0907W0168", "wav": "./aishell/wav/test/S0907/BAC009S0907W0168.wav", "txt": "后续还款用住房公积金支付"} -{"key": "BAC009S0907W0169", "wav": "./aishell/wav/test/S0907/BAC009S0907W0169.wav", "txt": "将有效缓解购房能力不足矛盾"} -{"key": "BAC009S0907W0170", "wav": "./aishell/wav/test/S0907/BAC009S0907W0170.wav", "txt": "提升新型城镇化质量和效益"} -{"key": "BAC009S0907W0171", "wav": "./aishell/wav/test/S0907/BAC009S0907W0171.wav", "txt": "改进住房公积金管理"} -{"key": "BAC009S0907W0172", "wav": "./aishell/wav/test/S0907/BAC009S0907W0172.wav", "txt": "根源是体制机制存在弊端"} -{"key": "BAC009S0907W0173", "wav": "./aishell/wav/test/S0907/BAC009S0907W0173.wav", "txt": "通过设立国家住房银行"} -{"key": "BAC009S0907W0174", "wav": "./aishell/wav/test/S0907/BAC009S0907W0174.wav", "txt": "可以有效提高资金管理集约化专业化和精细化水平"} -{"key": "BAC009S0907W0175", "wav": "./aishell/wav/test/S0907/BAC009S0907W0175.wav", "txt": "充分发挥住房公积金作用"} -{"key": "BAC009S0907W0176", "wav": "./aishell/wav/test/S0907/BAC009S0907W0176.wav", "txt": "住建部官员发表文章指出"} -{"key": "BAC009S0907W0177", "wav": "./aishell/wav/test/S0907/BAC009S0907W0177.wav", "txt": "以住房公积金制度为基础"} -{"key": "BAC009S0907W0178", "wav": "./aishell/wav/test/S0907/BAC009S0907W0178.wav", "txt": "设立国家住房银行条件已经基本成熟"} -{"key": "BAC009S0907W0179", "wav": "./aishell/wav/test/S0907/BAC009S0907W0179.wav", "txt": "国家住房银行是否箭在弦上"} -{"key": "BAC009S0907W0180", "wav": "./aishell/wav/test/S0907/BAC009S0907W0180.wav", "txt": "其成立需具备哪些条件"} -{"key": "BAC009S0907W0181", "wav": "./aishell/wav/test/S0907/BAC009S0907W0181.wav", "txt": "备受刚需购房者关注的公积金政策也频繁迎来调整"} -{"key": "BAC009S0907W0182", "wav": "./aishell/wav/test/S0907/BAC009S0907W0182.wav", "txt": "北京市管国管住房公积金中心先后发布通知"} -{"key": "BAC009S0907W0183", "wav": "./aishell/wav/test/S0907/BAC009S0907W0183.wav", "txt": "贷款最高额度由五万元升至七万元"} -{"key": "BAC009S0907W0184", "wav": "./aishell/wav/test/S0907/BAC009S0907W0184.wav", "txt": "公积金贷款总共可少缴利息三十馀万"} -{"key": "BAC009S0907W0185", "wav": "./aishell/wav/test/S0907/BAC009S0907W0185.wav", "txt": "是对过去住房公积金制度不作为方式的纠正"} -{"key": "BAC009S0907W0186", "wav": 
"./aishell/wav/test/S0907/BAC009S0907W0186.wav", "txt": "而随着各地公积金政策的调整"} -{"key": "BAC009S0907W0187", "wav": "./aishell/wav/test/S0907/BAC009S0907W0187.wav", "txt": "建立健全以工促农以城带乡的长效机制"} -{"key": "BAC009S0907W0188", "wav": "./aishell/wav/test/S0907/BAC009S0907W0188.wav", "txt": "为现代农业建设取得明显进展提供有力保障"} -{"key": "BAC009S0907W0189", "wav": "./aishell/wav/test/S0907/BAC009S0907W0189.wav", "txt": "建立农业投入稳定增长机制"} -{"key": "BAC009S0907W0190", "wav": "./aishell/wav/test/S0907/BAC009S0907W0190.wav", "txt": "按照总量持续增长比例稳步提高的要求"} -{"key": "BAC009S0907W0191", "wav": "./aishell/wav/test/S0907/BAC009S0907W0191.wav", "txt": "预算内固定资产投资要向重大农业农村建设项目倾斜"} -{"key": "BAC009S0907W0192", "wav": "./aishell/wav/test/S0907/BAC009S0907W0192.wav", "txt": "耕地占用税税率提高后"} -{"key": "BAC009S0907W0193", "wav": "./aishell/wav/test/S0907/BAC009S0907W0193.wav", "txt": "新增收入全部用于农业"} -{"key": "BAC009S0907W0194", "wav": "./aishell/wav/test/S0907/BAC009S0907W0194.wav", "txt": "积极推动土地出让收益用于高标准农田建设"} -{"key": "BAC009S0907W0195", "wav": "./aishell/wav/test/S0907/BAC009S0907W0195.wav", "txt": "充分发挥中国农业产业发展基金的引导作用"} -{"key": "BAC009S0907W0196", "wav": "./aishell/wav/test/S0907/BAC009S0907W0196.wav", "txt": "加快农村金融组织产品和服务创新"} -{"key": "BAC009S0907W0197", "wav": "./aishell/wav/test/S0907/BAC009S0907W0197.wav", "txt": "推动发展村镇银行等农村中小金融机构"} -{"key": "BAC009S0907W0198", "wav": "./aishell/wav/test/S0907/BAC009S0907W0198.wav", "txt": "引导金融机构发放农业中长期贷款"} -{"key": "BAC009S0907W0199", "wav": "./aishell/wav/test/S0907/BAC009S0907W0199.wav", "txt": "完善农民专业合作社管理方法"} -{"key": "BAC009S0907W0200", "wav": "./aishell/wav/test/S0907/BAC009S0907W0200.wav", "txt": "支持其开展信用合作"} -{"key": "BAC009S0907W0201", "wav": "./aishell/wav/test/S0907/BAC009S0907W0201.wav", "txt": "落实农民专业合作社和农村金融有关税收优惠政策"} -{"key": "BAC009S0907W0202", "wav": "./aishell/wav/test/S0907/BAC009S0907W0202.wav", "txt": "扶持农业信贷担保组织发展"} -{"key": "BAC009S0907W0203", "wav": "./aishell/wav/test/S0907/BAC009S0907W0203.wav", "txt": "扩大农村担保品范围"} -{"key": "BAC009S0907W0204", "wav": "./aishell/wav/test/S0907/BAC009S0907W0204.wav", "txt": "完善农业保险保费补贴政策"} -{"key": "BAC009S0907W0205", "wav": "./aishell/wav/test/S0907/BAC009S0907W0205.wav", "txt": "健全农业再保险体系"} -{"key": "BAC009S0907W0206", "wav": "./aishell/wav/test/S0907/BAC009S0907W0206.wav", "txt": "探索完善财政支持下的农业大灾风险分散机制"} -{"key": "BAC009S0907W0207", "wav": "./aishell/wav/test/S0907/BAC009S0907W0207.wav", "txt": "引导社会资本投入农业"} -{"key": "BAC009S0907W0208", "wav": "./aishell/wav/test/S0907/BAC009S0907W0208.wav", "txt": "各部门要主动服务三农"} -{"key": "BAC009S0907W0209", "wav": "./aishell/wav/test/S0907/BAC009S0907W0209.wav", "txt": "积极推动建立城乡要素平等交换关系"} -{"key": "BAC009S0907W0210", "wav": "./aishell/wav/test/S0907/BAC009S0907W0210.wav", "txt": "鼓励和促进工业与城市资源要素向农业农村配置"} -{"key": "BAC009S0907W0211", "wav": "./aishell/wav/test/S0907/BAC009S0907W0211.wav", "txt": "调动农民参与农业农村基础设施建设的积极性"} -{"key": "BAC009S0907W0212", "wav": "./aishell/wav/test/S0907/BAC009S0907W0212.wav", "txt": "通过组织动员和政策引导等多种途径"} -{"key": "BAC009S0907W0213", "wav": "./aishell/wav/test/S0907/BAC009S0907W0213.wav", "txt": "鼓励各种社会力量与乡村结对帮扶"} -{"key": "BAC009S0907W0214", "wav": "./aishell/wav/test/S0907/BAC009S0907W0214.wav", "txt": "参与农村产业发展和公共设施建设"} -{"key": "BAC009S0907W0215", "wav": "./aishell/wav/test/S0907/BAC009S0907W0215.wav", "txt": "努力形成多元化投入新格局"} -{"key": "BAC009S0907W0216", "wav": "./aishell/wav/test/S0907/BAC009S0907W0216.wav", "txt": "加大农业支持保护力度"} -{"key": "BAC009S0907W0217", "wav": "./aishell/wav/test/S0907/BAC009S0907W0217.wav", "txt": "坚持和完善农业补贴政策"} -{"key": "BAC009S0907W0218", "wav": 
"./aishell/wav/test/S0907/BAC009S0907W0218.wav", "txt": "建立农业补贴政策后评估机制"} -{"key": "BAC009S0907W0219", "wav": "./aishell/wav/test/S0907/BAC009S0907W0219.wav", "txt": "落实农资综合补贴动态调整机制"} -{"key": "BAC009S0907W0220", "wav": "./aishell/wav/test/S0907/BAC009S0907W0220.wav", "txt": "研究逐步扩大良种补贴品种和范围"} -{"key": "BAC009S0907W0221", "wav": "./aishell/wav/test/S0907/BAC009S0907W0221.wav", "txt": "扩大农机具购置补贴规模"} -{"key": "BAC009S0907W0222", "wav": "./aishell/wav/test/S0907/BAC009S0907W0222.wav", "txt": "加大农机化薄弱环节生产机械补贴力度"} -{"key": "BAC009S0907W0223", "wav": "./aishell/wav/test/S0907/BAC009S0907W0223.wav", "txt": "加大动物强制免疫补贴力度"} -{"key": "BAC009S0907W0224", "wav": "./aishell/wav/test/S0907/BAC009S0907W0224.wav", "txt": "逐步完善农业生产关键技术应用与服务支持政策"} -{"key": "BAC009S0907W0225", "wav": "./aishell/wav/test/S0907/BAC009S0907W0225.wav", "txt": "大幅度增加农业防灾减灾稳产增产关键技术良法补助"} -{"key": "BAC009S0907W0226", "wav": "./aishell/wav/test/S0907/BAC009S0907W0226.wav", "txt": "坚持和完善渔用柴油补贴政策"} -{"key": "BAC009S0907W0227", "wav": "./aishell/wav/test/S0907/BAC009S0907W0227.wav", "txt": "继续实施农业种子种苗种畜种禽免税进口优惠政策"} -{"key": "BAC009S0907W0228", "wav": "./aishell/wav/test/S0907/BAC009S0907W0228.wav", "txt": "建立完善农业生产奖补制度"} -{"key": "BAC009S0907W0229", "wav": "./aishell/wav/test/S0907/BAC009S0907W0229.wav", "txt": "完善主产区利益补偿机制"} -{"key": "BAC009S0907W0230", "wav": "./aishell/wav/test/S0907/BAC009S0907W0230.wav", "txt": "提高中央财政对粮食油料生产大县转移支付水平"} -{"key": "BAC009S0907W0231", "wav": "./aishell/wav/test/S0907/BAC009S0907W0231.wav", "txt": "继续加大对产粮大县生猪调出大县的奖励力度"} -{"key": "BAC009S0907W0232", "wav": "./aishell/wav/test/S0907/BAC009S0907W0232.wav", "txt": "规范粮食主产县涉农投资项目地方资金配套"} -{"key": "BAC009S0907W0233", "wav": "./aishell/wav/test/S0907/BAC009S0907W0233.wav", "txt": "全面取消主产区粮食风险基金地方资金配套"} -{"key": "BAC009S0907W0234", "wav": "./aishell/wav/test/S0907/BAC009S0907W0234.wav", "txt": "稳步提高粮食主产区县级人均财力水平"} -{"key": "BAC009S0907W0235", "wav": "./aishell/wav/test/S0907/BAC009S0907W0235.wav", "txt": "全面实施和完善草原生态保护补助奖励政策"} -{"key": "BAC009S0907W0236", "wav": "./aishell/wav/test/S0907/BAC009S0907W0236.wav", "txt": "扩大草原生态保护面源污染防控生态奖补范围和规模"} -{"key": "BAC009S0907W0237", "wav": "./aishell/wav/test/S0907/BAC009S0907W0237.wav", "txt": "探索实施生物农药低毒农药使用补助政策"} -{"key": "BAC009S0907W0238", "wav": "./aishell/wav/test/S0907/BAC009S0907W0238.wav", "txt": "研究建立高耗能老旧农业机械报废回收制度"} -{"key": "BAC009S0907W0239", "wav": "./aishell/wav/test/S0907/BAC009S0907W0239.wav", "txt": "探索实施报废更新补助"} -{"key": "BAC009S0907W0240", "wav": "./aishell/wav/test/S0907/BAC009S0907W0240.wav", "txt": "加大对农业科研和技术推广的支持力度"} -{"key": "BAC009S0907W0241", "wav": "./aishell/wav/test/S0907/BAC009S0907W0241.wav", "txt": "完善现代农业产业技术体系"} -{"key": "BAC009S0907W0242", "wav": "./aishell/wav/test/S0907/BAC009S0907W0242.wav", "txt": "选择部分农业科研院所予以稳定支持"} -{"key": "BAC009S0907W0243", "wav": "./aishell/wav/test/S0907/BAC009S0907W0243.wav", "txt": "按照种养规模和服务绩效安排工作经费"} -{"key": "BAC009S0907W0244", "wav": "./aishell/wav/test/S0907/BAC009S0907W0244.wav", "txt": "加大动物疫病防控经费投入"} -{"key": "BAC009S0907W0245", "wav": "./aishell/wav/test/S0907/BAC009S0907W0245.wav", "txt": "完善病死动物无害化处理补贴制度"} -{"key": "BAC009S0907W0246", "wav": "./aishell/wav/test/S0907/BAC009S0907W0246.wav", "txt": "建立和完善农作物病虫害专业化统防统治补助政策"} -{"key": "BAC009S0907W0247", "wav": "./aishell/wav/test/S0907/BAC009S0907W0247.wav", "txt": "继续向农民免费提供测土配方施肥服务"} -{"key": "BAC009S0907W0248", "wav": "./aishell/wav/test/S0907/BAC009S0907W0248.wav", "txt": "扩大土壤有机质提升项目实施范围和规模"} -{"key": "BAC009S0907W0249", "wav": "./aishell/wav/test/S0907/BAC009S0907W0249.wav", "txt": 
"继续加大农业农村人才培养力度"} -{"key": "BAC009S0907W0250", "wav": "./aishell/wav/test/S0907/BAC009S0907W0250.wav", "txt": "对大学生涉农创业按规定给予相关政策扶持"} -{"key": "BAC009S0907W0251", "wav": "./aishell/wav/test/S0907/BAC009S0907W0251.wav", "txt": "完善农产品市场调控机制"} -{"key": "BAC009S0907W0252", "wav": "./aishell/wav/test/S0907/BAC009S0907W0252.wav", "txt": "稳步提高稻谷小麦最低收购价"} -{"key": "BAC009S0907W0253", "wav": "./aishell/wav/test/S0907/BAC009S0907W0253.wav", "txt": "没有人提的往往才是真命天子"} -{"key": "BAC009S0907W0254", "wav": "./aishell/wav/test/S0907/BAC009S0907W0254.wav", "txt": "谁是苹果进军汽车市场的合作伙伴收购对象"} -{"key": "BAC009S0907W0258", "wav": "./aishell/wav/test/S0907/BAC009S0907W0258.wav", "txt": "这个问题存在于软件捆绑方式"} -{"key": "BAC009S0907W0259", "wav": "./aishell/wav/test/S0907/BAC009S0907W0259.wav", "txt": "它是软件集成的一种方式"} -{"key": "BAC009S0907W0261", "wav": "./aishell/wav/test/S0907/BAC009S0907W0261.wav", "txt": "他们很快提供了修复软件"} -{"key": "BAC009S0907W0262", "wav": "./aishell/wav/test/S0907/BAC009S0907W0262.wav", "txt": "不管是什么时候推出软件和开发一些超前的东西"} -{"key": "BAC009S0907W0263", "wav": "./aishell/wav/test/S0907/BAC009S0907W0263.wav", "txt": "避免不了出现一些漏洞"} -{"key": "BAC009S0907W0264", "wav": "./aishell/wav/test/S0907/BAC009S0907W0264.wav", "txt": "我们所做的就是发现漏洞后立即修复"} -{"key": "BAC009S0907W0265", "wav": "./aishell/wav/test/S0907/BAC009S0907W0265.wav", "txt": "在苹果发布靓丽的第四财季业绩报告后"} -{"key": "BAC009S0907W0266", "wav": "./aishell/wav/test/S0907/BAC009S0907W0266.wav", "txt": "乔斯维亚克就很少在公众场合露面"} -{"key": "BAC009S0907W0268", "wav": "./aishell/wav/test/S0907/BAC009S0907W0268.wav", "txt": "促使这家公司获得了创记录的第四财季盈利"} -{"key": "BAC009S0907W0269", "wav": "./aishell/wav/test/S0907/BAC009S0907W0269.wav", "txt": "苹果正在全力以赴出售尽可能多的智能手机"} -{"key": "BAC009S0907W0270", "wav": "./aishell/wav/test/S0907/BAC009S0907W0270.wav", "txt": "你必须保证自己了解稳态市场"} -{"key": "BAC009S0907W0271", "wav": "./aishell/wav/test/S0907/BAC009S0907W0271.wav", "txt": "而不仅仅是早期市场"} -{"key": "BAC009S0907W0272", "wav": "./aishell/wav/test/S0907/BAC009S0907W0272.wav", "txt": "大尺寸屏幕设备在亚洲很流行"} -{"key": "BAC009S0907W0273", "wav": "./aishell/wav/test/S0907/BAC009S0907W0273.wav", "txt": "但是在欧洲受欢迎度较低"} -{"key": "BAC009S0907W0274", "wav": "./aishell/wav/test/S0907/BAC009S0907W0274.wav", "txt": "美国市场刚好介于两者之间"} -{"key": "BAC009S0907W0275", "wav": "./aishell/wav/test/S0907/BAC009S0907W0275.wav", "txt": "目前这项服务已经达到了一个里程碑"} -{"key": "BAC009S0907W0277", "wav": "./aishell/wav/test/S0907/BAC009S0907W0277.wav", "txt": "有一百万张信用卡已被激活"} -{"key": "BAC009S0907W0278", "wav": "./aishell/wav/test/S0907/BAC009S0907W0278.wav", "txt": "其中就包括沃尔玛和百思买"} -{"key": "BAC009S0907W0279", "wav": "./aishell/wav/test/S0907/BAC009S0907W0279.wav", "txt": "这两家公司目前正在开发自己的移动支付系统"} -{"key": "BAC009S0907W0280", "wav": "./aishell/wav/test/S0907/BAC009S0907W0280.wav", "txt": "零售商最终都会向消费者妥协"} -{"key": "BAC009S0907W0281", "wav": "./aishell/wav/test/S0907/BAC009S0907W0281.wav", "txt": "想要成功的零售商将考虑消费者的利益"} -{"key": "BAC009S0907W0282", "wav": "./aishell/wav/test/S0907/BAC009S0907W0282.wav", "txt": "并接受消费者想要使用的支付方式"} -{"key": "BAC009S0907W0283", "wav": "./aishell/wav/test/S0907/BAC009S0907W0283.wav", "txt": "乔斯维亚克还谈及了苹果涉足可穿戴设备市场的问题"} -{"key": "BAC009S0907W0286", "wav": "./aishell/wav/test/S0907/BAC009S0907W0286.wav", "txt": "乔斯维亚克还为苹果平板电脑业务做了辩护"} -{"key": "BAC009S0907W0287", "wav": "./aishell/wav/test/S0907/BAC009S0907W0287.wav", "txt": "他拿出了数据作为证据截止目前"} -{"key": "BAC009S0907W0290", "wav": "./aishell/wav/test/S0907/BAC009S0907W0290.wav", "txt": "我们一直都在打造最好的产品"} -{"key": "BAC009S0907W0291", "wav": "./aishell/wav/test/S0907/BAC009S0907W0291.wav", "txt": 
"这次我们同样做到了"} -{"key": "BAC009S0907W0294", "wav": "./aishell/wav/test/S0907/BAC009S0907W0294.wav", "txt": "用户发现系统更新之后"} -{"key": "BAC009S0907W0295", "wav": "./aishell/wav/test/S0907/BAC009S0907W0295.wav", "txt": "心率测量记录没有之前那么频繁了"} -{"key": "BAC009S0907W0297", "wav": "./aishell/wav/test/S0907/BAC009S0907W0297.wav", "txt": "不过苹果官方很快澄清了这个事情"} -{"key": "BAC009S0907W0298", "wav": "./aishell/wav/test/S0907/BAC009S0907W0298.wav", "txt": "根据苹果官方的支持页面显示"} -{"key": "BAC009S0907W0300", "wav": "./aishell/wav/test/S0907/BAC009S0907W0300.wav", "txt": "不过更新后锻炼和运动手臂的时候不会记录心率"} -{"key": "BAC009S0907W0301", "wav": "./aishell/wav/test/S0907/BAC009S0907W0301.wav", "txt": "因此用户看到测量记录的记录要比之前少一些"} -{"key": "BAC009S0907W0302", "wav": "./aishell/wav/test/S0907/BAC009S0907W0302.wav", "txt": "不过这导致了很多新问题"} -{"key": "BAC009S0907W0303", "wav": "./aishell/wav/test/S0907/BAC009S0907W0303.wav", "txt": "在认证授权系统中对服务器设置权限管理"} -{"key": "BAC009S0907W0304", "wav": "./aishell/wav/test/S0907/BAC009S0907W0304.wav", "txt": "以及与经销商汽车垂直网站等签署保密协议等"} -{"key": "BAC009S0907W0305", "wav": "./aishell/wav/test/S0907/BAC009S0907W0305.wav", "txt": "这些措施在一定程度上将可防止用户数据泄露"} -{"key": "BAC009S0907W0306", "wav": "./aishell/wav/test/S0907/BAC009S0907W0306.wav", "txt": "除了投入大这一因素之外"} -{"key": "BAC009S0907W0307", "wav": "./aishell/wav/test/S0907/BAC009S0907W0307.wav", "txt": "往往对网络安全意识也不强"} -{"key": "BAC009S0907W0308", "wav": "./aishell/wav/test/S0907/BAC009S0907W0308.wav", "txt": "毕竟与互联网融合时间不长"} -{"key": "BAC009S0907W0309", "wav": "./aishell/wav/test/S0907/BAC009S0907W0309.wav", "txt": "上述网络安全人士称"} -{"key": "BAC009S0907W0310", "wav": "./aishell/wav/test/S0907/BAC009S0907W0310.wav", "txt": "乌云网合伙人邬迪接受第一财经日报记者采访时称"} -{"key": "BAC009S0907W0311", "wav": "./aishell/wav/test/S0907/BAC009S0907W0311.wav", "txt": "尽管网络安全目前投入成本大"} -{"key": "BAC009S0907W0312", "wav": "./aishell/wav/test/S0907/BAC009S0907W0312.wav", "txt": "又未直接产生经济效益"} -{"key": "BAC009S0907W0313", "wav": "./aishell/wav/test/S0907/BAC009S0907W0313.wav", "txt": "但到将来互联网时代"} -{"key": "BAC009S0907W0314", "wav": "./aishell/wav/test/S0907/BAC009S0907W0314.wav", "txt": "部分传统的车企或许还没有注意到这点"} -{"key": "BAC009S0907W0315", "wav": "./aishell/wav/test/S0907/BAC009S0907W0315.wav", "txt": "乌云上有不少因联网漏洞可导致车辆被控制"} -{"key": "BAC009S0907W0316", "wav": "./aishell/wav/test/S0907/BAC009S0907W0316.wav", "txt": "这将会导致行车安全问题"} -{"key": "BAC009S0907W0317", "wav": "./aishell/wav/test/S0907/BAC009S0907W0317.wav", "txt": "令车企烦恼的不仅是车主信息被泄露这一困扰"} -{"key": "BAC009S0907W0318", "wav": "./aishell/wav/test/S0907/BAC009S0907W0318.wav", "txt": "随着越来越多车企踊跃加入车联网浪潮中"} -{"key": "BAC009S0907W0319", "wav": "./aishell/wav/test/S0907/BAC009S0907W0319.wav", "txt": "信息安全隐患也随之而来"} -{"key": "BAC009S0907W0320", "wav": "./aishell/wav/test/S0907/BAC009S0907W0320.wav", "txt": "负责车辆网络安全问题"} -{"key": "BAC009S0907W0321", "wav": "./aishell/wav/test/S0907/BAC009S0907W0321.wav", "txt": "现在汽车与网络的联系越来越紧密"} -{"key": "BAC009S0907W0322", "wav": "./aishell/wav/test/S0907/BAC009S0907W0322.wav", "txt": "以后将能够与周围环境交流"} -{"key": "BAC009S0907W0323", "wav": "./aishell/wav/test/S0907/BAC009S0907W0323.wav", "txt": "如果车辆被黑客软件侵袭"} -{"key": "BAC009S0907W0324", "wav": "./aishell/wav/test/S0907/BAC009S0907W0324.wav", "txt": "车辆可能会发生严重的交通事故"} -{"key": "BAC009S0907W0325", "wav": "./aishell/wav/test/S0907/BAC009S0907W0325.wav", "txt": "比如现在的汽车一般采用了哪些新技术"} -{"key": "BAC009S0907W0326", "wav": "./aishell/wav/test/S0907/BAC009S0907W0326.wav", "txt": "其中十六家回复发函"} -{"key": "BAC009S0907W0327", "wav": "./aishell/wav/test/S0907/BAC009S0907W0327.wav", "txt": "在接受调查的这些公司中"} -{"key": 
"BAC009S0907W0328", "wav": "./aishell/wav/test/S0907/BAC009S0907W0328.wav", "txt": "有两家表示能够诊断或者反馈黑客入侵后的情况"} -{"key": "BAC009S0907W0329", "wav": "./aishell/wav/test/S0907/BAC009S0907W0329.wav", "txt": "有一家公司表示能够及时检测黑客入侵"} -{"key": "BAC009S0907W0330", "wav": "./aishell/wav/test/S0907/BAC009S0907W0330.wav", "txt": "像车上的信息娱乐系统和导航系统"} -{"key": "BAC009S0907W0331", "wav": "./aishell/wav/test/S0907/BAC009S0907W0331.wav", "txt": "很可能通过联网技术"} -{"key": "BAC009S0907W0332", "wav": "./aishell/wav/test/S0907/BAC009S0907W0332.wav", "txt": "被恶意软件或者黑客攻击"} -{"key": "BAC009S0907W0333", "wav": "./aishell/wav/test/S0907/BAC009S0907W0333.wav", "txt": "二十二二零一五"} -{"key": "BAC009S0907W0335", "wav": "./aishell/wav/test/S0907/BAC009S0907W0335.wav", "txt": "黑客可利用这些漏洞远程打开车门"} -{"key": "BAC009S0907W0336", "wav": "./aishell/wav/test/S0907/BAC009S0907W0336.wav", "txt": "宝马方面表示已经升级该数字系统"} -{"key": "BAC009S0907W0337", "wav": "./aishell/wav/test/S0907/BAC009S0907W0337.wav", "txt": "解决信息安全的问题"} -{"key": "BAC009S0907W0339", "wav": "./aishell/wav/test/S0907/BAC009S0907W0339.wav", "txt": "作为唯一能够入选五大飙血之战的女子比赛"} -{"key": "BAC009S0907W0340", "wav": "./aishell/wav/test/S0907/BAC009S0907W0340.wav", "txt": "正是得益于我国女子散打名将鄂美蝶的惊艳一击"} -{"key": "BAC009S0907W0341", "wav": "./aishell/wav/test/S0907/BAC009S0907W0341.wav", "txt": "在当天女子五二千克级自由搏击超级战中"} -{"key": "BAC009S0907W0342", "wav": "./aishell/wav/test/S0907/BAC009S0907W0342.wav", "txt": "鄂美蝶便毫无保留地将炮火轰向对手"} -{"key": "BAC009S0907W0343", "wav": "./aishell/wav/test/S0907/BAC009S0907W0343.wav", "txt": "三十三岁的大滨芳美在面对强大的火力下"} -{"key": "BAC009S0907W0344", "wav": "./aishell/wav/test/S0907/BAC009S0907W0344.wav", "txt": "比赛很快便呈向一边倒的局面第二回合"} -{"key": "BAC009S0907W0345", "wav": "./aishell/wav/test/S0907/BAC009S0907W0345.wav", "txt": "鄂美蝶继续将自己所学到的新搏击技能尽情展现"} -{"key": "BAC009S0907W0346", "wav": "./aishell/wav/test/S0907/BAC009S0907W0346.wav", "txt": "在一连串的拳腿风暴过后"} -{"key": "BAC009S0907W0347", "wav": "./aishell/wav/test/S0907/BAC009S0907W0347.wav", "txt": "终止时间定格在二分二十一秒"} -{"key": "BAC009S0907W0350", "wav": "./aishell/wav/test/S0907/BAC009S0907W0350.wav", "txt": "二零一五年四月十二"} -{"key": "BAC009S0907W0351", "wav": "./aishell/wav/test/S0907/BAC009S0907W0351.wav", "txt": "一场众星闪耀的群龙赛事震撼打响"} -{"key": "BAC009S0907W0352", "wav": "./aishell/wav/test/S0907/BAC009S0907W0352.wav", "txt": "作为此次赛事上唯一一场纯泰式规则的超级战"} -{"key": "BAC009S0907W0353", "wav": "./aishell/wav/test/S0907/BAC009S0907W0353.wav", "txt": "两位气质迥异的选手展示出了全然不同的擂台风格"} -{"key": "BAC009S0907W0354", "wav": "./aishell/wav/test/S0907/BAC009S0907W0354.wav", "txt": "在前两局僵持不下的情况下"} -{"key": "BAC009S0907W0355", "wav": "./aishell/wav/test/S0907/BAC009S0907W0355.wav", "txt": "面对兵行诡道的波斯弯刀"} -{"key": "BAC009S0907W0356", "wav": "./aishell/wav/test/S0907/BAC009S0907W0356.wav", "txt": "张春雨选择了加强压迫对手的力度"} -{"key": "BAC009S0907W0357", "wav": "./aishell/wav/test/S0907/BAC009S0907W0357.wav", "txt": "此举却导致了一次擂台意外的发生"} -{"key": "BAC009S0907W0358", "wav": "./aishell/wav/test/S0907/BAC009S0907W0358.wav", "txt": "张春雨被对手的一记肘击打破了右侧眉弓"} -{"key": "BAC009S0907W0359", "wav": "./aishell/wav/test/S0907/BAC009S0907W0359.wav", "txt": "经过场上护理人员的医治后"} -{"key": "BAC009S0907W0360", "wav": "./aishell/wav/test/S0907/BAC009S0907W0360.wav", "txt": "张春雨在全场观众的喝彩声中再次投入比赛"} -{"key": "BAC009S0907W0361", "wav": "./aishell/wav/test/S0907/BAC009S0907W0361.wav", "txt": "并向对手发起了凶猛的反扑"} -{"key": "BAC009S0907W0362", "wav": "./aishell/wav/test/S0907/BAC009S0907W0362.wav", "txt": "双方刺刀见红式的对攻中"} -{"key": "BAC009S0907W0363", "wav": "./aishell/wav/test/S0907/BAC009S0907W0363.wav", "txt": "伊萨的眉弓同样被张春雨以牙还牙的肘法击破"} 
-{"key": "BAC009S0907W0364", "wav": "./aishell/wav/test/S0907/BAC009S0907W0364.wav", "txt": "全面引爆现场观众的激情"} -{"key": "BAC009S0907W0365", "wav": "./aishell/wav/test/S0907/BAC009S0907W0365.wav", "txt": "比赛在双方互不相让的对攻中落下了帷幕"} -{"key": "BAC009S0907W0366", "wav": "./aishell/wav/test/S0907/BAC009S0907W0366.wav", "txt": "但对于每一位观赛者来讲"} -{"key": "BAC009S0907W0367", "wav": "./aishell/wav/test/S0907/BAC009S0907W0367.wav", "txt": "能够亲眼见证这场火爆刺激的的精彩大战"} -{"key": "BAC009S0907W0368", "wav": "./aishell/wav/test/S0907/BAC009S0907W0368.wav", "txt": "远比单纯的胜负有意义得多"} -{"key": "BAC009S0907W0369", "wav": "./aishell/wav/test/S0907/BAC009S0907W0369.wav", "txt": "这是一场没有输家的经典比赛"} -{"key": "BAC009S0907W0372", "wav": "./aishell/wav/test/S0907/BAC009S0907W0372.wav", "txt": "二零一五年二月一日"} -{"key": "BAC009S0907W0373", "wav": "./aishell/wav/test/S0907/BAC009S0907W0373.wav", "txt": "昆仑决广州站在广州天河体育中心成功打响"} -{"key": "BAC009S0907W0374", "wav": "./aishell/wav/test/S0907/BAC009S0907W0374.wav", "txt": "多国大神级搏击王者论剑昆仑武道之巅"} -{"key": "BAC009S0907W0375", "wav": "./aishell/wav/test/S0907/BAC009S0907W0375.wav", "txt": "决赛一如期待般精彩绝伦"} -{"key": "BAC009S0907W0376", "wav": "./aishell/wav/test/S0907/BAC009S0907W0376.wav", "txt": "马刀抡击式的中距离组合拳法配合高位膝技"} -{"key": "BAC009S0907W0377", "wav": "./aishell/wav/test/S0907/BAC009S0907W0377.wav", "txt": "打得对手只有招架之功"} -{"key": "BAC009S0907W0378", "wav": "./aishell/wav/test/S0907/BAC009S0907W0378.wav", "txt": "便将对手的眼部击伤"} -{"key": "BAC009S0907W0379", "wav": "./aishell/wav/test/S0907/BAC009S0907W0379.wav", "txt": "严重影响卡尔泽塔的实现"} -{"key": "BAC009S0907W0380", "wav": "./aishell/wav/test/S0907/BAC009S0907W0380.wav", "txt": "令对手无奈放弃比赛"} -{"key": "BAC009S0907W0381", "wav": "./aishell/wav/test/S0907/BAC009S0907W0381.wav", "txt": "他在二零一五年昆仑决诸神之战决赛圈的表现"} -{"key": "BAC009S0907W0382", "wav": "./aishell/wav/test/S0907/BAC009S0907W0382.wav", "txt": "将成为无数武迷接下来最大的期待之一"} -{"key": "BAC009S0907W0383", "wav": "./aishell/wav/test/S0907/BAC009S0907W0383.wav", "txt": "谁也不知道会发生什么"} -{"key": "BAC009S0907W0384", "wav": "./aishell/wav/test/S0907/BAC009S0907W0384.wav", "txt": "这就是竞技体育的魅力"} -{"key": "BAC009S0907W0385", "wav": "./aishell/wav/test/S0907/BAC009S0907W0385.wav", "txt": "在昨晚的女子标枪决赛中"} -{"key": "BAC009S0907W0386", "wav": "./aishell/wav/test/S0907/BAC009S0907W0386.wav", "txt": "然而就是这最后一掷"} -{"key": "BAC009S0907W0387", "wav": "./aishell/wav/test/S0907/BAC009S0907W0387.wav", "txt": "在昨天比赛的第五投"} -{"key": "BAC009S0907W0388", "wav": "./aishell/wav/test/S0907/BAC009S0907W0388.wav", "txt": "吕会会在全场观众的加油助威声中爆发"} -{"key": "BAC009S0907W0389", "wav": "./aishell/wav/test/S0907/BAC009S0907W0389.wav", "txt": "倾尽全力将标枪掷到了六十六米一三"} -{"key": "BAC009S0907W0390", "wav": "./aishell/wav/test/S0907/BAC009S0907W0390.wav", "txt": "然而就在全场仅剩下莫利托一个人的最后一掷时"} -{"key": "BAC009S0907W0391", "wav": "./aishell/wav/test/S0907/BAC009S0907W0391.wav", "txt": "虽然留下了巨大的遗憾"} -{"key": "BAC009S0907W0392", "wav": "./aishell/wav/test/S0907/BAC009S0907W0392.wav", "txt": "不过这依然是吕会会的个人最好成绩"} -{"key": "BAC009S0907W0393", "wav": "./aishell/wav/test/S0907/BAC009S0907W0393.wav", "txt": "吕会会在走到混合区接受记者采访时止住了泪水"} -{"key": "BAC009S0907W0394", "wav": "./aishell/wav/test/S0907/BAC009S0907W0394.wav", "txt": "在大赛中投出这样好的成绩我自己都没有想到"} -{"key": "BAC009S0907W0395", "wav": "./aishell/wav/test/S0907/BAC009S0907W0395.wav", "txt": "其实比赛过程中我也没有多想"} -{"key": "BAC009S0907W0396", "wav": "./aishell/wav/test/S0907/BAC009S0907W0396.wav", "txt": "就是要一枪一枪地投"} -{"key": "BAC009S0907W0397", "wav": "./aishell/wav/test/S0907/BAC009S0907W0397.wav", "txt": "比成这样我其实已经很开心了"} -{"key": "BAC009S0907W0398", "wav": 
"./aishell/wav/test/S0907/BAC009S0907W0398.wav", "txt": "能在北京获得一枚奖牌我很骄傲"} -{"key": "BAC009S0907W0399", "wav": "./aishell/wav/test/S0907/BAC009S0907W0399.wav", "txt": "观众们的鼓励也给了我力量"} -{"key": "BAC009S0907W0400", "wav": "./aishell/wav/test/S0907/BAC009S0907W0400.wav", "txt": "我的泪水主要还是来自于喜悦"} -{"key": "BAC009S0907W0401", "wav": "./aishell/wav/test/S0907/BAC009S0907W0401.wav", "txt": "要说一点儿没有遗憾和失落是假的"} -{"key": "BAC009S0907W0402", "wav": "./aishell/wav/test/S0907/BAC009S0907W0402.wav", "txt": "但总的来说还是高兴多于遗憾"} -{"key": "BAC009S0907W0403", "wav": "./aishell/wav/test/S0907/BAC009S0907W0403.wav", "txt": "文本报记者刘艾林"} -{"key": "BAC009S0907W0404", "wav": "./aishell/wav/test/S0907/BAC009S0907W0404.wav", "txt": "去年美国队长二寒冬战士就曾在四月登陆"} -{"key": "BAC009S0907W0405", "wav": "./aishell/wav/test/S0907/BAC009S0907W0405.wav", "txt": "结果创造了相当可观的票房成绩"} -{"key": "BAC009S0907W0406", "wav": "./aishell/wav/test/S0907/BAC009S0907W0406.wav", "txt": "丛林之书则将在二零一六年四月十五日登场"} -{"key": "BAC009S0907W0408", "wav": "./aishell/wav/test/S0907/BAC009S0907W0408.wav", "txt": "两人合作长达二十年"} -{"key": "BAC009S0907W0409", "wav": "./aishell/wav/test/S0907/BAC009S0907W0409.wav", "txt": "不过天下无不散之筵席"} -{"key": "BAC009S0907W0410", "wav": "./aishell/wav/test/S0907/BAC009S0907W0410.wav", "txt": "原来陈善之最近已离开了百仕活"} -{"key": "BAC009S0907W0411", "wav": "./aishell/wav/test/S0907/BAC009S0907W0411.wav", "txt": "有传他离开是因黎明不满其在挽留艺人方面没有尽力"} -{"key": "BAC009S0907W0412", "wav": "./aishell/wav/test/S0907/BAC009S0907W0412.wav", "txt": "搜狐娱乐讯十月九日"} -{"key": "BAC009S0907W0413", "wav": "./aishell/wav/test/S0907/BAC009S0907W0413.wav", "txt": "表示决定辞职"} -{"key": "BAC009S0907W0414", "wav": "./aishell/wav/test/S0907/BAC009S0907W0414.wav", "txt": "不与无线续约"} -{"key": "BAC009S0907W0415", "wav": "./aishell/wav/test/S0907/BAC009S0907W0415.wav", "txt": "他感叹自己在无线十五年都没有机会"} -{"key": "BAC009S0907W0416", "wav": "./aishell/wav/test/S0907/BAC009S0907W0416.wav", "txt": "眼见后辈爬头"} -{"key": "BAC009S0907W0417", "wav": "./aishell/wav/test/S0907/BAC009S0907W0417.wav", "txt": "希望出去发展"} -{"key": "BAC009S0907W0418", "wav": "./aishell/wav/test/S0907/BAC009S0907W0418.wav", "txt": "他直言不想看见自己变作一潭死水"} -{"key": "BAC009S0907W0419", "wav": "./aishell/wav/test/S0907/BAC009S0907W0419.wav", "txt": "早前演出的舞台剧令他醒觉要出外寻找更多演出机会"} -{"key": "BAC009S0907W0420", "wav": "./aishell/wav/test/S0907/BAC009S0907W0420.wav", "txt": "因此决定出外闯"} -{"key": "BAC009S0907W0421", "wav": "./aishell/wav/test/S0907/BAC009S0907W0421.wav", "txt": "虽然未知去向"} -{"key": "BAC009S0907W0422", "wav": "./aishell/wav/test/S0907/BAC009S0907W0422.wav", "txt": "但坚信有我落脚的地方"} -{"key": "BAC009S0907W0423", "wav": "./aishell/wav/test/S0907/BAC009S0907W0423.wav", "txt": "我便会到那里"} -{"key": "BAC009S0907W0424", "wav": "./aishell/wav/test/S0907/BAC009S0907W0424.wav", "txt": "搜狐娱乐讯北京时间七月二十八日消息"} -{"key": "BAC009S0907W0425", "wav": "./aishell/wav/test/S0907/BAC009S0907W0425.wav", "txt": "据香港媒体报导"} -{"key": "BAC009S0907W0427", "wav": "./aishell/wav/test/S0907/BAC009S0907W0427.wav", "txt": "陈奕迅双手合十认真地向蛋糕许愿"} -{"key": "BAC009S0907W0428", "wav": "./aishell/wav/test/S0907/BAC009S0907W0428.wav", "txt": "搜狐娱乐讯据台湾媒体报道"} -{"key": "BAC009S0907W0429", "wav": "./aishell/wav/test/S0907/BAC009S0907W0429.wav", "txt": "港歌神陈奕迅出道近二十年"} -{"key": "BAC009S0907W0430", "wav": "./aishell/wav/test/S0907/BAC009S0907W0430.wav", "txt": "曾获美国时代杂志形容为影响香港乐坛风格的人物"} -{"key": "BAC009S0907W0431", "wav": "./aishell/wav/test/S0907/BAC009S0907W0431.wav", "txt": "并于当日被香港警方拘捕"} -{"key": "BAC009S0907W0432", "wav": "./aishell/wav/test/S0907/BAC009S0907W0432.wav", "txt": "警方以普通袭击罪对涉事乘客提起诉讼"} -{"key": 
"BAC009S0907W0433", "wav": "./aishell/wav/test/S0907/BAC009S0907W0433.wav", "txt": "其中四名被告分别判即时监禁九至一一天"} -{"key": "BAC009S0907W0434", "wav": "./aishell/wav/test/S0907/BAC009S0907W0434.wav", "txt": "一人被判罚款一五零零元"} -{"key": "BAC009S0907W0435", "wav": "./aishell/wav/test/S0907/BAC009S0907W0435.wav", "txt": "港京航班延误九小时六名内地乘客推撞地勤被捕"} -{"key": "BAC009S0907W0437", "wav": "./aishell/wav/test/S0907/BAC009S0907W0437.wav", "txt": "六名内地乘客与地勤发生肢体冲突"} -{"key": "BAC009S0907W0439", "wav": "./aishell/wav/test/S0907/BAC009S0907W0439.wav", "txt": "将被以普通袭击罪起诉"} -{"key": "BAC009S0907W0440", "wav": "./aishell/wav/test/S0907/BAC009S0907W0440.wav", "txt": "港商在台遭绑三八天获救后痛哭以为必死"} -{"key": "BAC009S0907W0441", "wav": "./aishell/wav/test/S0907/BAC009S0907W0441.wav", "txt": "黄煜坤被警方送到附近医院接受检查"} -{"key": "BAC009S0907W0442", "wav": "./aishell/wav/test/S0907/BAC009S0907W0442.wav", "txt": "惠州公安在金山河捞获一具无头无双手女尸"} -{"key": "BAC009S0907W0443", "wav": "./aishell/wav/test/S0907/BAC009S0907W0443.wav", "txt": "广东惠州惊爆港商杀情妇碎尸凶案"} -{"key": "BAC009S0907W0444", "wav": "./aishell/wav/test/S0907/BAC009S0907W0444.wav", "txt": "五零岁港商疑与其工厂的同龄女主管偷情多年"} -{"key": "BAC009S0907W0445", "wav": "./aishell/wav/test/S0907/BAC009S0907W0445.wav", "txt": "近日再度拒绝女方的逼婚后"} -{"key": "BAC009S0907W0446", "wav": "./aishell/wav/test/S0907/BAC009S0907W0446.wav", "txt": "遭追讨欠款和抚养费共四零万元人民币"} -{"key": "BAC009S0907W0447", "wav": "./aishell/wav/test/S0907/BAC009S0907W0447.wav", "txt": "港商疑恼羞成怒将她杀害"} -{"key": "BAC009S0907W0448", "wav": "./aishell/wav/test/S0907/BAC009S0907W0448.wav", "txt": "并肢解尸体分成多袋抛入河中"} -{"key": "BAC009S0907W0449", "wav": "./aishell/wav/test/S0907/BAC009S0907W0449.wav", "txt": "港商被骗牵出路边地下钱庄涉案资金四三零零亿"} -{"key": "BAC009S0907W0450", "wav": "./aishell/wav/test/S0907/BAC009S0907W0450.wav", "txt": "深圳警方查获的一个地下钱庄窝点"} -{"key": "BAC009S0907W0451", "wav": "./aishell/wav/test/S0907/BAC009S0907W0451.wav", "txt": "由普通商店作为掩护"} -{"key": "BAC009S0907W0452", "wav": "./aishell/wav/test/S0907/BAC009S0907W0452.wav", "txt": "该商店老板郑晓生红衣者涉嫌暗地里兑换外汇"} -{"key": "BAC009S0907W0453", "wav": "./aishell/wav/test/S0907/BAC009S0907W0453.wav", "txt": "替人向境外转移资金"} -{"key": "BAC009S0907W0454", "wav": "./aishell/wav/test/S0907/BAC009S0907W0454.wav", "txt": "港媒关注天价虾店停业破坏青岛形象"} -{"key": "BAC009S0907W0455", "wav": "./aishell/wav/test/S0907/BAC009S0907W0455.wav", "txt": "参考消息网一零月八日报道港媒称"} -{"key": "BAC009S0907W0456", "wav": "./aishell/wav/test/S0907/BAC009S0907W0456.wav", "txt": "备受关注的青岛三八元一只大虾事件有最新发展"} -{"key": "BAC009S0907W0457", "wav": "./aishell/wav/test/S0907/BAC009S0907W0457.wav", "txt": "并责令其立即改正价格违法行为"} -{"key": "BAC009S0907W0458", "wav": "./aishell/wav/test/S0907/BAC009S0907W0458.wav", "txt": "事发后派出所和物价局都互相踢皮球"} -{"key": "BAC009S0907W0459", "wav": "./aishell/wav/test/S0907/BAC009S0907W0459.wav", "txt": "批评职能部门没有将消费者放在第一位"} -{"key": "BAC009S0907W0460", "wav": "./aishell/wav/test/S0907/BAC009S0907W0460.wav", "txt": "港媒关注内地私人美术馆新富人群热衷分享藏品"} -{"key": "BAC009S0907W0461", "wav": "./aishell/wav/test/S0907/BAC009S0907W0461.wav", "txt": "参考消息网七月二九日报道港媒称"} -{"key": "BAC009S0907W0462", "wav": "./aishell/wav/test/S0907/BAC009S0907W0462.wav", "txt": "用来保存他们的藏品"} -{"key": "BAC009S0907W0463", "wav": "./aishell/wav/test/S0907/BAC009S0907W0463.wav", "txt": "其中一些人是近年来国际拍卖会上艺术品的最大买家"} -{"key": "BAC009S0907W0464", "wav": "./aishell/wav/test/S0907/BAC009S0907W0464.wav", "txt": "港媒关注浙江暖男医生手术室播动画片哄小女孩"} -{"key": "BAC009S0907W0465", "wav": "./aishell/wav/test/S0907/BAC009S0907W0465.wav", "txt": "参考消息网九月二二日报道港媒称"} -{"key": "BAC009S0907W0466", "wav": "./aishell/wav/test/S0907/BAC009S0907W0466.wav", "txt": 
"网络上热传一组暖男医生哄小萝莉的温情照片"} -{"key": "BAC009S0907W0467", "wav": "./aishell/wav/test/S0907/BAC009S0907W0467.wav", "txt": "男医生为了安抚即将做手术的小女孩"} -{"key": "BAC009S0907W0468", "wav": "./aishell/wav/test/S0907/BAC009S0907W0468.wav", "txt": "将小女孩抱在腿上并播放手机中的动画片"} -{"key": "BAC009S0907W0469", "wav": "./aishell/wav/test/S0907/BAC009S0907W0469.wav", "txt": "港媒关注重雾霾重回华北罕见蓝天只持续两周"} -{"key": "BAC009S0907W0470", "wav": "./aishell/wav/test/S0907/BAC009S0907W0470.wav", "txt": "参考消息网九月一九日报道港媒称"} -{"key": "BAC009S0907W0471", "wav": "./aishell/wav/test/S0907/BAC009S0907W0471.wav", "txt": "随着严重雾霾卷土重来"} -{"key": "BAC009S0907W0472", "wav": "./aishell/wav/test/S0907/BAC009S0907W0472.wav", "txt": "港媒关注马云回应被逼捐花钱比挣钱难"} -{"key": "BAC009S0907W0474", "wav": "./aishell/wav/test/S0907/BAC009S0907W0474.wav", "txt": "企业应该做好的投资"} -{"key": "BAC009S0907W0475", "wav": "./aishell/wav/test/S0907/BAC009S0907W0475.wav", "txt": "盲目捐款没有益处"} -{"key": "BAC009S0907W0476", "wav": "./aishell/wav/test/S0907/BAC009S0907W0476.wav", "txt": "港媒曝水货客扮残疾人在轮椅中藏钻石月入八万"} -{"key": "BAC009S0907W0477", "wav": "./aishell/wav/test/S0907/BAC009S0907W0477.wav", "txt": "参考消息网七月二九日报道港媒称"} -{"key": "BAC009S0907W0478", "wav": "./aishell/wav/test/S0907/BAC009S0907W0478.wav", "txt": "香港海关严查水货客"} -{"key": "BAC009S0907W0479", "wav": "./aishell/wav/test/S0907/BAC009S0907W0479.wav", "txt": "水货集团看中轮椅人士收入不高"} -{"key": "BAC009S0907W0480", "wav": "./aishell/wav/test/S0907/BAC009S0907W0480.wav", "txt": "以高收入低风险和免缴税等好处利诱对方成为水货客"} -{"key": "BAC009S0907W0481", "wav": "./aishell/wav/test/S0907/BAC009S0907W0481.wav", "txt": "有走私奢侈品的人士月入高达八万港元"} -{"key": "BAC009S0907W0482", "wav": "./aishell/wav/test/S0907/BAC009S0907W0482.wav", "txt": "港媒盘点亚洲千禧一代十大富豪九人是中国人"} -{"key": "BAC009S0907W0483", "wav": "./aishell/wav/test/S0907/BAC009S0907W0483.wav", "txt": "参考消息网七月二二日报道"} -{"key": "BAC009S0907W0484", "wav": "./aishell/wav/test/S0907/BAC009S0907W0484.wav", "txt": "港媒称假沉香充斥内地多以化学香油制成"} -{"key": "BAC009S0907W0485", "wav": "./aishell/wav/test/S0907/BAC009S0907W0485.wav", "txt": "高仿沉香多以化学香精等制成"} -{"key": "BAC009S0907W0486", "wav": "./aishell/wav/test/S0907/BAC009S0907W0486.wav", "txt": "可比黄金的沉香价格每年倍增"} -{"key": "BAC009S0907W0487", "wav": "./aishell/wav/test/S0907/BAC009S0907W0487.wav", "txt": "港媒称内地中产人数猛增有助稳定企望渐进改革"} -{"key": "BAC009S0907W0488", "wav": "./aishell/wav/test/S0907/BAC009S0907W0488.wav", "txt": "一个国家稳定的社会结构呈橄榄形"} -{"key": "BAC009S0907W0489", "wav": "./aishell/wav/test/S0907/BAC009S0907W0489.wav", "txt": "而橄榄形结构是以中产为主的结构"} -{"key": "BAC009S0907W0490", "wav": "./aishell/wav/test/S0907/BAC009S0907W0490.wav", "txt": "中产阶级在一个国家的现代化中起着稳定作用"} -{"key": "BAC009S0907W0491", "wav": "./aishell/wav/test/S0907/BAC009S0907W0491.wav", "txt": "是社会稳定的主要力量"} -{"key": "BAC009S0907W0492", "wav": "./aishell/wav/test/S0907/BAC009S0907W0492.wav", "txt": "港媒称内地为国际市场修改动画片妖怪不能吃唐僧肉"} -{"key": "BAC009S0907W0493", "wav": "./aishell/wav/test/S0907/BAC009S0907W0493.wav", "txt": "参考消息网一一月一日报道港媒称"} -{"key": "BAC009S0907W0494", "wav": "./aishell/wav/test/S0907/BAC009S0907W0494.wav", "txt": "中国的动画工作室越来越看重海外市场"} -{"key": "BAC009S0907W0495", "wav": "./aishell/wav/test/S0907/BAC009S0907W0495.wav", "txt": "港媒称内地人不穷了为何仍爱抢学者抢习惯了"} -{"key": "BAC009S0908W0121", "wav": "./aishell/wav/test/S0908/BAC009S0908W0121.wav", "txt": "将进一步提振刚需购房者入市信心"} -{"key": "BAC009S0908W0122", "wav": "./aishell/wav/test/S0908/BAC009S0908W0122.wav", "txt": "加速今年楼市成交复苏回暖"} -{"key": "BAC009S0908W0123", "wav": "./aishell/wav/test/S0908/BAC009S0908W0123.wav", "txt": "公积金政策利好首套自住住房贷款需求的消息纷至沓来"} -{"key": "BAC009S0908W0124", "wav": 
"./aishell/wav/test/S0908/BAC009S0908W0124.wav", "txt": "并已实施"} -{"key": "BAC009S0908W0125", "wav": "./aishell/wav/test/S0908/BAC009S0908W0125.wav", "txt": "贷款额度上限调整为一百万元"} -{"key": "BAC009S0908W0126", "wav": "./aishell/wav/test/S0908/BAC009S0908W0126.wav", "txt": "购买一百平方米以上非政策性住房或第二套住房"} -{"key": "BAC009S0908W0127", "wav": "./aishell/wav/test/S0908/BAC009S0908W0127.wav", "txt": "贷款最高额度仍为一百万元"} -{"key": "BAC009S0908W0128", "wav": "./aishell/wav/test/S0908/BAC009S0908W0128.wav", "txt": "均规定贷款额度不再依据个人信用等级上浮"} -{"key": "BAC009S0908W0129", "wav": "./aishell/wav/test/S0908/BAC009S0908W0129.wav", "txt": "并对异地缴存住房公积金等政策作出调整"} -{"key": "BAC009S0908W0130", "wav": "./aishell/wav/test/S0908/BAC009S0908W0130.wav", "txt": "北京市公积金管理中心明确取消新建商品房评估"} -{"key": "BAC009S0908W0131", "wav": "./aishell/wav/test/S0908/BAC009S0908W0131.wav", "txt": "国管住房公积金中心则表示取消担保服务费"} -{"key": "BAC009S0908W0132", "wav": "./aishell/wav/test/S0908/BAC009S0908W0132.wav", "txt": "这一系列公积金门槛放低额度提高的调整"} -{"key": "BAC009S0908W0133", "wav": "./aishell/wav/test/S0908/BAC009S0908W0133.wav", "txt": "是对过去住房公积金制度不作为方式的纠正"} -{"key": "BAC009S0908W0134", "wav": "./aishell/wav/test/S0908/BAC009S0908W0134.wav", "txt": "即使去年十一月公积金贷款利率降至百分之七"} -{"key": "BAC009S0908W0135", "wav": "./aishell/wav/test/S0908/BAC009S0908W0135.wav", "txt": "很多人需要支付大额首付"} -{"key": "BAC009S0908W0136", "wav": "./aishell/wav/test/S0908/BAC009S0908W0136.wav", "txt": "使用公积金制度的作用和效果没有得到有效的发挥"} -{"key": "BAC009S0908W0137", "wav": "./aishell/wav/test/S0908/BAC009S0908W0137.wav", "txt": "此番公积金政策调整"} -{"key": "BAC009S0908W0138", "wav": "./aishell/wav/test/S0908/BAC009S0908W0138.wav", "txt": "将在诸多方面惠及刚需购房者"} -{"key": "BAC009S0908W0139", "wav": "./aishell/wav/test/S0908/BAC009S0908W0139.wav", "txt": "之前的公积金贷款额只有一百万"} -{"key": "BAC009S0908W0140", "wav": "./aishell/wav/test/S0908/BAC009S0908W0140.wav", "txt": "而最高额度提升至一百万后"} -{"key": "BAC009S0908W0141", "wav": "./aishell/wav/test/S0908/BAC009S0908W0141.wav", "txt": "大部分刚需购房者都可以选择公积金贷款"} -{"key": "BAC009S0908W0142", "wav": "./aishell/wav/test/S0908/BAC009S0908W0142.wav", "txt": "中原地产首席分析师张大伟分析"} -{"key": "BAC009S0908W0143", "wav": "./aishell/wav/test/S0908/BAC009S0908W0143.wav", "txt": "公积金额贷款额度升至一百万可以节省很多少利息"} -{"key": "BAC009S0908W0144", "wav": "./aishell/wav/test/S0908/BAC009S0908W0144.wav", "txt": "公积金贷款可少缴三十馀万"} -{"key": "BAC009S0908W0145", "wav": "./aishell/wav/test/S0908/BAC009S0908W0145.wav", "txt": "而额度最高一百万时"} -{"key": "BAC009S0908W0146", "wav": "./aishell/wav/test/S0908/BAC009S0908W0146.wav", "txt": "这一数值为二十馀万"} -{"key": "BAC009S0908W0147", "wav": "./aishell/wav/test/S0908/BAC009S0908W0147.wav", "txt": "这将使更多购房者具备买房支付能力"} -{"key": "BAC009S0908W0148", "wav": "./aishell/wav/test/S0908/BAC009S0908W0148.wav", "txt": "可以使用公积金贷款的购房者将起码增加百分之七"} -{"key": "BAC009S0908W0149", "wav": "./aishell/wav/test/S0908/BAC009S0908W0149.wav", "txt": "也将在一定程度上降低刚需购房者支付负担"} -{"key": "BAC009S0908W0150", "wav": "./aishell/wav/test/S0908/BAC009S0908W0150.wav", "txt": "公积金政策调整对于楼市成交刺激作用已初见瑞尔"} -{"key": "BAC009S0908W0151", "wav": "./aishell/wav/test/S0908/BAC009S0908W0151.wav", "txt": "链家地产市场研究部数据显示"} -{"key": "BAC009S0908W0152", "wav": "./aishell/wav/test/S0908/BAC009S0908W0152.wav", "txt": "北京市公积金额度提升后的元旦时期"} -{"key": "BAC009S0908W0153", "wav": "./aishell/wav/test/S0908/BAC009S0908W0153.wav", "txt": "近郊小户型楼盘及城区部分公房社区客户咨询量上升"} -{"key": "BAC009S0908W0154", "wav": "./aishell/wav/test/S0908/BAC009S0908W0154.wav", "txt": "而其房源多在一百平方米以下"} -{"key": "BAC009S0908W0155", "wav": "./aishell/wav/test/S0908/BAC009S0908W0155.wav", "txt": "中原地产市场研究部数据显示"} -{"key": 
"BAC009S0908W0156", "wav": "./aishell/wav/test/S0908/BAC009S0908W0156.wav", "txt": "以北京去年纯商品房成交结构为例"} -{"key": "BAC009S0908W0157", "wav": "./aishell/wav/test/S0908/BAC009S0908W0157.wav", "txt": "一百平均单套总价约一百万元左右"} -{"key": "BAC009S0908W0158", "wav": "./aishell/wav/test/S0908/BAC009S0908W0158.wav", "txt": "公积金贷款上限调整后"} -{"key": "BAC009S0908W0159", "wav": "./aishell/wav/test/S0908/BAC009S0908W0159.wav", "txt": "一百万元的贷款额度能满足大部分首套刚需的贷款需求"} -{"key": "BAC009S0908W0160", "wav": "./aishell/wav/test/S0908/BAC009S0908W0160.wav", "txt": "链家地产市场研究部张旭表示"} -{"key": "BAC009S0908W0161", "wav": "./aishell/wav/test/S0908/BAC009S0908W0161.wav", "txt": "此番公积金贷款政策调整将进一步提升振刚需"} -{"key": "BAC009S0908W0162", "wav": "./aishell/wav/test/S0908/BAC009S0908W0162.wav", "txt": "促进楼市预期向好发展"} -{"key": "BAC009S0908W0163", "wav": "./aishell/wav/test/S0908/BAC009S0908W0163.wav", "txt": "去年已有不少城市对公积金政策进行放松调整"} -{"key": "BAC009S0908W0164", "wav": "./aishell/wav/test/S0908/BAC009S0908W0164.wav", "txt": "公积金政策调整对购房者心理层面影响较大"} -{"key": "BAC009S0908W0165", "wav": "./aishell/wav/test/S0908/BAC009S0908W0165.wav", "txt": "将加速今年楼市成交复苏回暖"} -{"key": "BAC009S0908W0166", "wav": "./aishell/wav/test/S0908/BAC009S0908W0166.wav", "txt": "备受刚需购房者关注的公积金政策也频繁迎来调整"} -{"key": "BAC009S0908W0167", "wav": "./aishell/wav/test/S0908/BAC009S0908W0167.wav", "txt": "北京市管国管住房公积金中心先后发布通知"} -{"key": "BAC009S0908W0168", "wav": "./aishell/wav/test/S0908/BAC009S0908W0168.wav", "txt": "通知指出除北上广深一线城市外"} -{"key": "BAC009S0908W0169", "wav": "./aishell/wav/test/S0908/BAC009S0908W0169.wav", "txt": "对拥有一套住房并已结清相应购房贷款的居民家庭"} -{"key": "BAC009S0908W0170", "wav": "./aishell/wav/test/S0908/BAC009S0908W0170.wav", "txt": "申请公积金购买第二套住房"} -{"key": "BAC009S0908W0171", "wav": "./aishell/wav/test/S0908/BAC009S0908W0171.wav", "txt": "最低首付款比例由百分之七降低至百分之五"} -{"key": "BAC009S0908W0172", "wav": "./aishell/wav/test/S0908/BAC009S0908W0172.wav", "txt": "公积金首付的再次降低实际影响有限"} -{"key": "BAC009S0908W0173", "wav": "./aishell/wav/test/S0908/BAC009S0908W0173.wav", "txt": "但对购房者预期有积极响应"} -{"key": "BAC009S0908W0174", "wav": "./aishell/wav/test/S0908/BAC009S0908W0174.wav", "txt": "这将有利于稳定房地产市场"} -{"key": "BAC009S0908W0175", "wav": "./aishell/wav/test/S0908/BAC009S0908W0175.wav", "txt": "进而对稳定中国经济有正面作用"} -{"key": "BAC009S0908W0176", "wav": "./aishell/wav/test/S0908/BAC009S0908W0176.wav", "txt": "为进一步完善住房公积金个人住房贷款政策"} -{"key": "BAC009S0908W0177", "wav": "./aishell/wav/test/S0908/BAC009S0908W0177.wav", "txt": "对拥有一套住房并已结清相应购房贷款的居民家庭"} -{"key": "BAC009S0908W0178", "wav": "./aishell/wav/test/S0908/BAC009S0908W0178.wav", "txt": "最低首付款比例由百分之七降低至百分之五"} -{"key": "BAC009S0908W0179", "wav": "./aishell/wav/test/S0908/BAC009S0908W0179.wav", "txt": "该政策对于一线城市并不强制执行"} -{"key": "BAC009S0908W0180", "wav": "./aishell/wav/test/S0908/BAC009S0908W0180.wav", "txt": "而是北京上海广州深圳可在国家统一政策基础上"} -{"key": "BAC009S0908W0181", "wav": "./aishell/wav/test/S0908/BAC009S0908W0181.wav", "txt": "易居研究院智库中心研究总监严跃进认为"} -{"key": "BAC009S0908W0182", "wav": "./aishell/wav/test/S0908/BAC009S0908W0182.wav", "txt": "此次住建部财政部和中央联合发文"} -{"key": "BAC009S0908W0183", "wav": "./aishell/wav/test/S0908/BAC009S0908W0183.wav", "txt": "反映出政策层面较大的刺激力度"} -{"key": "BAC009S0908W0184", "wav": "./aishell/wav/test/S0908/BAC009S0908W0184.wav", "txt": "这是自去年以来除降息外"} -{"key": "BAC009S0908W0185", "wav": "./aishell/wav/test/S0908/BAC009S0908W0185.wav", "txt": "相关部门对公积金贷款政策的第三次放松"} -{"key": "BAC009S0908W0186", "wav": "./aishell/wav/test/S0908/BAC009S0908W0186.wav", "txt": "美丽北京大型绿色公益品牌项目"} -{"key": "BAC009S0908W0187", "wav": 
"./aishell/wav/test/S0908/BAC009S0908W0187.wav", "txt": "完善玉米大豆油菜籽棉花等农产品临时收储政策"} -{"key": "BAC009S0908W0188", "wav": "./aishell/wav/test/S0908/BAC009S0908W0188.wav", "txt": "完善主要农产品吞吐和调节机制"} -{"key": "BAC009S0908W0189", "wav": "./aishell/wav/test/S0908/BAC009S0908W0189.wav", "txt": "健全重要农产品储备制度"} -{"key": "BAC009S0908W0190", "wav": "./aishell/wav/test/S0908/BAC009S0908W0190.wav", "txt": "发挥骨干企业稳定市场的作用"} -{"key": "BAC009S0908W0191", "wav": "./aishell/wav/test/S0908/BAC009S0908W0191.wav", "txt": "完善生猪棉花食糖边销茶等调控预案"} -{"key": "BAC009S0908W0192", "wav": "./aishell/wav/test/S0908/BAC009S0908W0192.wav", "txt": "制定鲜活农产品调控办法"} -{"key": "BAC009S0908W0193", "wav": "./aishell/wav/test/S0908/BAC009S0908W0193.wav", "txt": "探索建立目标价格为核心的反周期补贴制度"} -{"key": "BAC009S0908W0194", "wav": "./aishell/wav/test/S0908/BAC009S0908W0194.wav", "txt": "加强农业科技交流合作"} -{"key": "BAC009S0908W0195", "wav": "./aishell/wav/test/S0908/BAC009S0908W0195.wav", "txt": "提高农业利用外资水平"} -{"key": "BAC009S0908W0196", "wav": "./aishell/wav/test/S0908/BAC009S0908W0196.wav", "txt": "继续用好国外优惠贷款和赠款"} -{"key": "BAC009S0908W0197", "wav": "./aishell/wav/test/S0908/BAC009S0908W0197.wav", "txt": "加大先进适用技术装备的引进消化和吸收力度"} -{"key": "BAC009S0908W0198", "wav": "./aishell/wav/test/S0908/BAC009S0908W0198.wav", "txt": "强化多双边和区域农业磋商谈判和贸易促进"} -{"key": "BAC009S0908W0199", "wav": "./aishell/wav/test/S0908/BAC009S0908W0199.wav", "txt": "做好涉农国际贸易规定制动工作"} -{"key": "BAC009S0908W0200", "wav": "./aishell/wav/test/S0908/BAC009S0908W0200.wav", "txt": "进一步强化贸易促进公共服务能力"} -{"key": "BAC009S0908W0201", "wav": "./aishell/wav/test/S0908/BAC009S0908W0201.wav", "txt": "积极推动优势农产品出口"} -{"key": "BAC009S0908W0202", "wav": "./aishell/wav/test/S0908/BAC009S0908W0202.wav", "txt": "积极应对国际贸易摩擦"} -{"key": "BAC009S0908W0203", "wav": "./aishell/wav/test/S0908/BAC009S0908W0203.wav", "txt": "支持行业协会办企业维护合法权益"} -{"key": "BAC009S0908W0204", "wav": "./aishell/wav/test/S0908/BAC009S0908W0204.wav", "txt": "进一步完善农业产业损害监测预警机制"} -{"key": "BAC009S0908W0205", "wav": "./aishell/wav/test/S0908/BAC009S0908W0205.wav", "txt": "运用符合世界贸易组织规定的相关措施"} -{"key": "BAC009S0908W0206", "wav": "./aishell/wav/test/S0908/BAC009S0908W0206.wav", "txt": "灵活有效调控农业产品进出口"} -{"key": "BAC009S0908W0207", "wav": "./aishell/wav/test/S0908/BAC009S0908W0207.wav", "txt": "积极推动种业农垦等方面改革"} -{"key": "BAC009S0908W0208", "wav": "./aishell/wav/test/S0908/BAC009S0908W0208.wav", "txt": "发展农村服务业和乡村企业"} -{"key": "BAC009S0908W0209", "wav": "./aishell/wav/test/S0908/BAC009S0908W0209.wav", "txt": "制定农村二三产业加快发展的鼓励政策"} -{"key": "BAC009S0908W0210", "wav": "./aishell/wav/test/S0908/BAC009S0908W0210.wav", "txt": "落实和完善有关税收政策"} -{"key": "BAC009S0908W0211", "wav": "./aishell/wav/test/S0908/BAC009S0908W0211.wav", "txt": "统筹城乡基础设施建和公共服务"} -{"key": "BAC009S0908W0212", "wav": "./aishell/wav/test/S0908/BAC009S0908W0212.wav", "txt": "逐步建立城乡统一的公共服务制度"} -{"key": "BAC009S0908W0213", "wav": "./aishell/wav/test/S0908/BAC009S0908W0213.wav", "txt": "积极稳妥推进户籍制度改革"} -{"key": "BAC009S0908W0214", "wav": "./aishell/wav/test/S0908/BAC009S0908W0214.wav", "txt": "推进省直接管理县市财政体制改革"} -{"key": "BAC009S0908W0215", "wav": "./aishell/wav/test/S0908/BAC009S0908W0215.wav", "txt": "优先将农业大县纳入改革范围"} -{"key": "BAC009S0908W0216", "wav": "./aishell/wav/test/S0908/BAC009S0908W0216.wav", "txt": "强化农业法制保障"} -{"key": "BAC009S0908W0217", "wav": "./aishell/wav/test/S0908/BAC009S0908W0217.wav", "txt": "坚持米袋子省长负责制和菜篮子市长负责制"} -{"key": "BAC009S0908W0218", "wav": "./aishell/wav/test/S0908/BAC009S0908W0218.wav", "txt": "全面落实耕地和基本农田保护领导干部离任审计制度"} -{"key": "BAC009S0908W0219", "wav": 
"./aishell/wav/test/S0908/BAC009S0908W0219.wav", "txt": "各有关部门和地方各级人民政府要围绕规划目标任务"} -{"key": "BAC009S0908W0220", "wav": "./aishell/wav/test/S0908/BAC009S0908W0220.wav", "txt": "研究落实各项强农惠农富农政策"} -{"key": "BAC009S0908W0221", "wav": "./aishell/wav/test/S0908/BAC009S0908W0221.wav", "txt": "统筹协调推动重大工程的实施"} -{"key": "BAC009S0908W0222", "wav": "./aishell/wav/test/S0908/BAC009S0908W0222.wav", "txt": "努力开创我国农业现代化发展新局面"} -{"key": "BAC009S0908W0223", "wav": "./aishell/wav/test/S0908/BAC009S0908W0223.wav", "txt": "农业农村信息化十二五规划"} -{"key": "BAC009S0908W0224", "wav": "./aishell/wav/test/S0908/BAC009S0908W0224.wav", "txt": "关于印发十二五规划的通知"} -{"key": "BAC009S0908W0225", "wav": "./aishell/wav/test/S0908/BAC009S0908W0225.wav", "txt": "中国老龄十二五规划"} -{"key": "BAC009S0908W0226", "wav": "./aishell/wav/test/S0908/BAC009S0908W0226.wav", "txt": "新农村十二五发展规划"} -{"key": "BAC009S0908W0227", "wav": "./aishell/wav/test/S0908/BAC009S0908W0227.wav", "txt": "国家林业十二五规划"} -{"key": "BAC009S0908W0228", "wav": "./aishell/wav/test/S0908/BAC009S0908W0228.wav", "txt": "十二五医药发展规划"} -{"key": "BAC009S0908W0229", "wav": "./aishell/wav/test/S0908/BAC009S0908W0229.wav", "txt": "老龄事业十二五规划"} -{"key": "BAC009S0908W0230", "wav": "./aishell/wav/test/S0908/BAC009S0908W0230.wav", "txt": "国务院总理温家宝五日主持召开国务院常务会议"} -{"key": "BAC009S0908W0231", "wav": "./aishell/wav/test/S0908/BAC009S0908W0231.wav", "txt": "再次听取全国民用核设施综合安全检查情况汇报"} -{"key": "BAC009S0908W0232", "wav": "./aishell/wav/test/S0908/BAC009S0908W0232.wav", "txt": "核电重启的曙光越来越近"} -{"key": "BAC009S0908W0233", "wav": "./aishell/wav/test/S0908/BAC009S0908W0233.wav", "txt": "国务院二零一一年五月"} -{"key": "BAC009S0908W0234", "wav": "./aishell/wav/test/S0908/BAC009S0908W0234.wav", "txt": "相关公司股票走势国海证券"} -{"key": "BAC009S0908W0235", "wav": "./aishell/wav/test/S0908/BAC009S0908W0235.wav", "txt": "决定对全国核设施进行安全检查"} -{"key": "BAC009S0908W0236", "wav": "./aishell/wav/test/S0908/BAC009S0908W0236.wav", "txt": "有关部门组织核安全地震海洋等方面专家"} -{"key": "BAC009S0908W0237", "wav": "./aishell/wav/test/S0908/BAC009S0908W0237.wav", "txt": "用五个多月时间对全国七十台运行在建核电机组"} -{"key": "BAC009S0908W0238", "wav": "./aishell/wav/test/S0908/BAC009S0908W0238.wav", "txt": "以及所有民用研究堆和核燃燃料循环设施等"} -{"key": "BAC009S0908W0239", "wav": "./aishell/wav/test/S0908/BAC009S0908W0239.wav", "txt": "进行了综合安全检查"} -{"key": "BAC009S0908W0240", "wav": "./aishell/wav/test/S0908/BAC009S0908W0240.wav", "txt": "形成了新形势下我国核电发展的建议阶段研究报告"} -{"key": "BAC009S0908W0241", "wav": "./aishell/wav/test/S0908/BAC009S0908W0241.wav", "txt": "国务院常务会议听取了综合安全检查情况汇报"} -{"key": "BAC009S0908W0242", "wav": "./aishell/wav/test/S0908/BAC009S0908W0242.wav", "txt": "对进一步深入检查及落实整改措施作了部署"} -{"key": "BAC009S0908W0243", "wav": "./aishell/wav/test/S0908/BAC009S0908W0243.wav", "txt": "核安全法规标准体系与国际接轨"} -{"key": "BAC009S0908W0244", "wav": "./aishell/wav/test/S0908/BAC009S0908W0244.wav", "txt": "具备一定的严重事故预防和缓解能力"} -{"key": "BAC009S0908W0245", "wav": "./aishell/wav/test/S0908/BAC009S0908W0245.wav", "txt": "部分核电厂未制定实施严重事故预防和缓解规程"} -{"key": "BAC009S0908W0246", "wav": "./aishell/wav/test/S0908/BAC009S0908W0246.wav", "txt": "海啸问题评估和应付基础比较薄弱等"} -{"key": "BAC009S0908W0247", "wav": "./aishell/wav/test/S0908/BAC009S0908W0247.wav", "txt": "有关部门和企业迅速组织整改"} -{"key": "BAC009S0908W0248", "wav": "./aishell/wav/test/S0908/BAC009S0908W0248.wav", "txt": "目前已取得阶段性成效"} -{"key": "BAC009S0908W0249", "wav": "./aishell/wav/test/S0908/BAC009S0908W0249.wav", "txt": "基本原则是预防为主纵深防御"} -{"key": "BAC009S0908W0250", "wav": "./aishell/wav/test/S0908/BAC009S0908W0250.wav", "txt": "新老并重防结结合"} -{"key": "BAC009S0908W0251", "wav": 
"./aishell/wav/test/S0908/BAC009S0908W0251.wav", "txt": "依靠科技持续改进"} -{"key": "BAC009S0908W0252", "wav": "./aishell/wav/test/S0908/BAC009S0908W0252.wav", "txt": "坚持法治严格监管"} -{"key": "BAC009S0908W0253", "wav": "./aishell/wav/test/S0908/BAC009S0908W0253.wav", "txt": "比如用户抱怨升级之后设施无法像以前那样工作了"} -{"key": "BAC009S0908W0254", "wav": "./aishell/wav/test/S0908/BAC009S0908W0254.wav", "txt": "甚至还不如原来的一点零版本系统好用"} -{"key": "BAC009S0908W0255", "wav": "./aishell/wav/test/S0908/BAC009S0908W0255.wav", "txt": "苹果此举是为了节约用电量"} -{"key": "BAC009S0908W0256", "wav": "./aishell/wav/test/S0908/BAC009S0908W0256.wav", "txt": "有人给出了解决方法"} -{"key": "BAC009S0908W0258", "wav": "./aishell/wav/test/S0908/BAC009S0908W0258.wav", "txt": "强制不断的心率测量"} -{"key": "BAC009S0908W0259", "wav": "./aishell/wav/test/S0908/BAC009S0908W0259.wav", "txt": "只是这种情况下心率传感器会每隔十秒进行一次"} -{"key": "BAC009S0908W0261", "wav": "./aishell/wav/test/S0908/BAC009S0908W0261.wav", "txt": "苹果表这么火爆微软也该出智能手表吗"} -{"key": "BAC009S0908W0262", "wav": "./aishell/wav/test/S0908/BAC009S0908W0262.wav", "txt": "刚开始微软因谨慎起见"} -{"key": "BAC009S0908W0264", "wav": "./aishell/wav/test/S0908/BAC009S0908W0264.wav", "txt": "最近才开始向其他市场推广销售"} -{"key": "BAC009S0908W0265", "wav": "./aishell/wav/test/S0908/BAC009S0908W0265.wav", "txt": "在谷歌与苹果相机推出智能手表后"} -{"key": "BAC009S0908W0266", "wav": "./aishell/wav/test/S0908/BAC009S0908W0266.wav", "txt": "微软目前仍局限于健身手环领域"} -{"key": "BAC009S0908W0267", "wav": "./aishell/wav/test/S0908/BAC009S0908W0267.wav", "txt": "但它的确算不上是智能手表"} -{"key": "BAC009S0908W0268", "wav": "./aishell/wav/test/S0908/BAC009S0908W0268.wav", "txt": "拥有内部存储空间与完整的应用平台"} -{"key": "BAC009S0908W0269", "wav": "./aishell/wav/test/S0908/BAC009S0908W0269.wav", "txt": "支持开发者为其编写应用"} -{"key": "BAC009S0908W0270", "wav": "./aishell/wav/test/S0908/BAC009S0908W0270.wav", "txt": "但它对开发者来说限制太多"} -{"key": "BAC009S0908W0272", "wav": "./aishell/wav/test/S0908/BAC009S0908W0272.wav", "txt": "微软正在向外界推广一次编写"} -{"key": "BAC009S0908W0273", "wav": "./aishell/wav/test/S0908/BAC009S0908W0273.wav", "txt": "跨设备使用的通用应用"} -{"key": "BAC009S0908W0274", "wav": "./aishell/wav/test/S0908/BAC009S0908W0274.wav", "txt": "但至今唯独没有提升智能手表平台"} -{"key": "BAC009S0908W0275", "wav": "./aishell/wav/test/S0908/BAC009S0908W0275.wav", "txt": "具体如下方视频介绍所示"} -{"key": "BAC009S0908W0277", "wav": "./aishell/wav/test/S0908/BAC009S0908W0277.wav", "txt": "刚开始微软因谨慎起见"} -{"key": "BAC009S0908W0280", "wav": "./aishell/wav/test/S0908/BAC009S0908W0280.wav", "txt": "原告当地时间周二在法庭上表示"} -{"key": "BAC009S0908W0281", "wav": "./aishell/wav/test/S0908/BAC009S0908W0281.wav", "txt": "苹果通过发布不必要的软件升级包"} -{"key": "BAC009S0908W0283", "wav": "./aishell/wav/test/S0908/BAC009S0908W0283.wav", "txt": "一起针对苹果的集体反垄断案两名原告的律师称"} -{"key": "BAC009S0908W0284", "wav": "./aishell/wav/test/S0908/BAC009S0908W0284.wav", "txt": "由于苹果要打压竞争对手"} -{"key": "BAC009S0908W0286", "wav": "./aishell/wav/test/S0908/BAC009S0908W0286.wav", "txt": "但却损害了消费者的利益"} -{"key": "BAC009S0908W0287", "wav": "./aishell/wav/test/S0908/BAC009S0908W0287.wav", "txt": "这次庭审将持续九天时间"} -{"key": "BAC009S0908W0288", "wav": "./aishell/wav/test/S0908/BAC009S0908W0288.wav", "txt": "给一桩近十年之久的诉讼一个定论"} -{"key": "BAC009S0908W0292", "wav": "./aishell/wav/test/S0908/BAC009S0908W0292.wav", "txt": "不过这些政策现在已经被废除"} -{"key": "BAC009S0908W0293", "wav": "./aishell/wav/test/S0908/BAC009S0908W0293.wav", "txt": "苹果打压了市场竞争"} -{"key": "BAC009S0908W0297", "wav": "./aishell/wav/test/S0908/BAC009S0908W0297.wav", "txt": "苹果担忧这会蚕食其市场份额"} -{"key": "BAC009S0908W0298", "wav": "./aishell/wav/test/S0908/BAC009S0908W0298.wav", "txt": 
"生态链中插入其他公司产品会造成问题"} -{"key": "BAC009S0908W0299", "wav": "./aishell/wav/test/S0908/BAC009S0908W0299.wav", "txt": "这会危及用户体验和产品质量"} -{"key": "BAC009S0908W0301", "wav": "./aishell/wav/test/S0908/BAC009S0908W0301.wav", "txt": "价格要么下降要么维持不变"} -{"key": "BAC009S0908W0302", "wav": "./aishell/wav/test/S0908/BAC009S0908W0302.wav", "txt": "苹果没有危害消费者利益"} -{"key": "BAC009S0908W0303", "wav": "./aishell/wav/test/S0908/BAC009S0908W0303.wav", "txt": "对频频的骚扰电话显示无可奈何"} -{"key": "BAC009S0908W0304", "wav": "./aishell/wav/test/S0908/BAC009S0908W0304.wav", "txt": "有黑客在网络上兜售车主信"} -{"key": "BAC009S0908W0305", "wav": "./aishell/wav/test/S0908/BAC009S0908W0305.wav", "txt": "美的摆稳棋局过冬搜狐科技"} -{"key": "BAC009S0908W0306", "wav": "./aishell/wav/test/S0908/BAC009S0908W0306.wav", "txt": "白电行业将进入最惨烈的一年"} -{"key": "BAC009S0908W0307", "wav": "./aishell/wav/test/S0908/BAC009S0908W0307.wav", "txt": "昔日巨头格力美的海尔也将沉浮于其中"} -{"key": "BAC009S0908W0308", "wav": "./aishell/wav/test/S0908/BAC009S0908W0308.wav", "txt": "从本年度第一份季报来看"} -{"key": "BAC009S0908W0309", "wav": "./aishell/wav/test/S0908/BAC009S0908W0309.wav", "txt": "三巨头中的格力海尔均出现不同程度下滑"} -{"key": "BAC009S0908W0310", "wav": "./aishell/wav/test/S0908/BAC009S0908W0310.wav", "txt": "实现净利营收双增长"} -{"key": "BAC009S0908W0311", "wav": "./aishell/wav/test/S0908/BAC009S0908W0311.wav", "txt": "美的吸取了当年大跃进的教训"} -{"key": "BAC009S0908W0312", "wav": "./aishell/wav/test/S0908/BAC009S0908W0312.wav", "txt": "一位买家电的朋友晒出一张销量清单"} -{"key": "BAC009S0908W0313", "wav": "./aishell/wav/test/S0908/BAC009S0908W0313.wav", "txt": "他担心自己马上就要被辞退了"} -{"key": "BAC009S0908W0314", "wav": "./aishell/wav/test/S0908/BAC009S0908W0314.wav", "txt": "发改委约谈各大空调企业的高管"} -{"key": "BAC009S0908W0315", "wav": "./aishell/wav/test/S0908/BAC009S0908W0315.wav", "txt": "媒体采访的电话打到各空调企业的市场负责人那里"} -{"key": "BAC009S0908W0316", "wav": "./aishell/wav/test/S0908/BAC009S0908W0316.wav", "txt": "各公司市场部都在卖场忙活"} -{"key": "BAC009S0908W0317", "wav": "./aishell/wav/test/S0908/BAC009S0908W0317.wav", "txt": "今年的促销从三月份就启动了"} -{"key": "BAC009S0908W0318", "wav": "./aishell/wav/test/S0908/BAC009S0908W0318.wav", "txt": "一位商场场内部人士称"} -{"key": "BAC009S0908W0319", "wav": "./aishell/wav/test/S0908/BAC009S0908W0319.wav", "txt": "注定是白色家电行业最惨烈的一年"} -{"key": "BAC009S0908W0320", "wav": "./aishell/wav/test/S0908/BAC009S0908W0320.wav", "txt": "现实的残酷落到报表上"} -{"key": "BAC009S0908W0321", "wav": "./aishell/wav/test/S0908/BAC009S0908W0321.wav", "txt": "是白电上市企业今年的一季报几乎全部沦陷"} -{"key": "BAC009S0908W0322", "wav": "./aishell/wav/test/S0908/BAC009S0908W0322.wav", "txt": "两大龙头企业格力和海尔"} -{"key": "BAC009S0908W0323", "wav": "./aishell/wav/test/S0908/BAC009S0908W0323.wav", "txt": "格力电器一季报营收为二百四十五亿元"} -{"key": "BAC009S0908W0324", "wav": "./aishell/wav/test/S0908/BAC009S0908W0324.wav", "txt": "同比去年降零点六百分之六"} -{"key": "BAC009S0908W0325", "wav": "./aishell/wav/test/S0908/BAC009S0908W0325.wav", "txt": "净利润为二十七点七五亿元"} -{"key": "BAC009S0908W0326", "wav": "./aishell/wav/test/S0908/BAC009S0908W0326.wav", "txt": "同比上升百分之二十三点零六"} -{"key": "BAC009S0908W0327", "wav": "./aishell/wav/test/S0908/BAC009S0908W0327.wav", "txt": "上一次是金融危机期间的二零零九一季度"} -{"key": "BAC009S0908W0328", "wav": "./aishell/wav/test/S0908/BAC009S0908W0328.wav", "txt": "另一白电巨头青岛海尔"} -{"key": "BAC009S0908W0329", "wav": "./aishell/wav/test/S0908/BAC009S0908W0329.wav", "txt": "一季度营收为二十一八点七亿元"} -{"key": "BAC009S0908W0330", "wav": "./aishell/wav/test/S0908/BAC009S0908W0330.wav", "txt": "净利润为九点七亿元"} -{"key": "BAC009S0908W0331", "wav": "./aishell/wav/test/S0908/BAC009S0908W0331.wav", "txt": "同比增百分之十三点一一"} -{"key": "BAC009S0908W0332", "wav": 
"./aishell/wav/test/S0908/BAC009S0908W0332.wav", "txt": "海信科龙和惠而浦则是营收增"} -{"key": "BAC009S0908W0333", "wav": "./aishell/wav/test/S0908/BAC009S0908W0333.wav", "txt": "海信科龙一季报营收为六十四点三亿元"} -{"key": "BAC009S0908W0334", "wav": "./aishell/wav/test/S0908/BAC009S0908W0334.wav", "txt": "净利润出现百分之一的下滑"} -{"key": "BAC009S0908W0335", "wav": "./aishell/wav/test/S0908/BAC009S0908W0335.wav", "txt": "净利出现七点三百分之一的降幅"} -{"key": "BAC009S0908W0337", "wav": "./aishell/wav/test/S0908/BAC009S0908W0337.wav", "txt": "实现营收净利双增长"} -{"key": "BAC009S0908W0338", "wav": "./aishell/wav/test/S0908/BAC009S0908W0338.wav", "txt": "十多天压抑的情感终于爆发"} -{"key": "BAC009S0908W0339", "wav": "./aishell/wav/test/S0908/BAC009S0908W0339.wav", "txt": "女排姑娘们在日本的最后一夜"} -{"key": "BAC009S0908W0340", "wav": "./aishell/wav/test/S0908/BAC009S0908W0340.wav", "txt": "大家才安安稳稳地睡了一觉"} -{"key": "BAC009S0908W0341", "wav": "./aishell/wav/test/S0908/BAC009S0908W0341.wav", "txt": "如果要数一下中国女排谁最红"} -{"key": "BAC009S0908W0342", "wav": "./aishell/wav/test/S0908/BAC009S0908W0342.wav", "txt": "张晓雅的人气肯定在前三名"} -{"key": "BAC009S0908W0343", "wav": "./aishell/wav/test/S0908/BAC009S0908W0343.wav", "txt": "她以最帅国手走红网络"} -{"key": "BAC009S0908W0344", "wav": "./aishell/wav/test/S0908/BAC009S0908W0344.wav", "txt": "网友大呼她帅过林丹"} -{"key": "BAC009S0908W0345", "wav": "./aishell/wav/test/S0908/BAC009S0908W0345.wav", "txt": "这位英气十足的九零后很有人缘"} -{"key": "BAC009S0908W0346", "wav": "./aishell/wav/test/S0908/BAC009S0908W0346.wav", "txt": "张晓雅最大的优点是有想法"} -{"key": "BAC009S0908W0347", "wav": "./aishell/wav/test/S0908/BAC009S0908W0347.wav", "txt": "张晓雅这个娃娃训练很自觉"} -{"key": "BAC009S0908W0348", "wav": "./aishell/wav/test/S0908/BAC009S0908W0348.wav", "txt": "在球场上的思路比较清楚"} -{"key": "BAC009S0908W0349", "wav": "./aishell/wav/test/S0908/BAC009S0908W0349.wav", "txt": "是一个在球场上有想法的球员"} -{"key": "BAC009S0908W0350", "wav": "./aishell/wav/test/S0908/BAC009S0908W0350.wav", "txt": "这个娃娃打球时很有思想"} -{"key": "BAC009S0908W0351", "wav": "./aishell/wav/test/S0908/BAC009S0908W0351.wav", "txt": "中国最帅的竞走冠军陈定将亮相苏州吴中"} -{"key": "BAC009S0908W0352", "wav": "./aishell/wav/test/S0908/BAC009S0908W0352.wav", "txt": "一九九二年八月五日出生于云南省保山市龙陵县"} -{"key": "BAC009S0908W0353", "wav": "./aishell/wav/test/S0908/BAC009S0908W0353.wav", "txt": "这个二十三岁的云南小伙子"} -{"key": "BAC009S0908W0354", "wav": "./aishell/wav/test/S0908/BAC009S0908W0354.wav", "txt": "取得瑞士卢加诺竞走挑战赛男子二十公里竞走银牌"} -{"key": "BAC009S0908W0355", "wav": "./aishell/wav/test/S0908/BAC009S0908W0355.wav", "txt": "夺得国际田联竞走世界杯男子二十公里竞走银牌"} -{"key": "BAC009S0908W0356", "wav": "./aishell/wav/test/S0908/BAC009S0908W0356.wav", "txt": "参加全国竞走大奖赛暨世锦赛选拔赛"} -{"key": "BAC009S0908W0357", "wav": "./aishell/wav/test/S0908/BAC009S0908W0357.wav", "txt": "以一小时二十一分十一秒成绩获铜牌"} -{"key": "BAC009S0908W0358", "wav": "./aishell/wav/test/S0908/BAC009S0908W0358.wav", "txt": "并取得世锦赛参赛资格"} -{"key": "BAC009S0908W0359", "wav": "./aishell/wav/test/S0908/BAC009S0908W0359.wav", "txt": "仰泳选手在比赛中"} -{"key": "BAC009S0908W0360", "wav": "./aishell/wav/test/S0908/BAC009S0908W0360.wav", "txt": "本次比赛使用最新的仰泳出发壁架"} -{"key": "BAC009S0908W0361", "wav": "./aishell/wav/test/S0908/BAC009S0908W0361.wav", "txt": "帮助仰泳运动员改善自己的出发技术"} -{"key": "BAC009S0908W0362", "wav": "./aishell/wav/test/S0908/BAC009S0908W0362.wav", "txt": "欧米茄计时管理委员会成员彼得许尔泽勒介绍说"} -{"key": "BAC009S0908W0363", "wav": "./aishell/wav/test/S0908/BAC009S0908W0363.wav", "txt": "可以帮助他们在出发时增加自己距离水面的高度"} -{"key": "BAC009S0908W0364", "wav": "./aishell/wav/test/S0908/BAC009S0908W0364.wav", "txt": "可以防止运动员出发时手部滑落"} -{"key": "BAC009S0908W0365", "wav": 
"./aishell/wav/test/S0908/BAC009S0908W0365.wav", "txt": "得到了仰泳选手的广泛好评"} -{"key": "BAC009S0908W0366", "wav": "./aishell/wav/test/S0908/BAC009S0908W0366.wav", "txt": "这是它第一次在游泳世界杯上亮相"} -{"key": "BAC009S0908W0367", "wav": "./aishell/wav/test/S0908/BAC009S0908W0367.wav", "txt": "也为背后的关键技术提供开发支持"} -{"key": "BAC009S0908W0368", "wav": "./aishell/wav/test/S0908/BAC009S0908W0368.wav", "txt": "从而确保高度精准地记录竞赛成绩"} -{"key": "BAC009S0908W0369", "wav": "./aishell/wav/test/S0908/BAC009S0908W0369.wav", "txt": "新科世界冠军宁泽涛领衔中国队出战"} -{"key": "BAC009S0908W0370", "wav": "./aishell/wav/test/S0908/BAC009S0908W0370.wav", "txt": "身材傲人颜值爆表的她魅力席卷整个亚洲"} -{"key": "BAC009S0908W0373", "wav": "./aishell/wav/test/S0908/BAC009S0908W0373.wav", "txt": "现年十八岁的莎宾娜身高达一百八十二厘米"} -{"key": "BAC009S0908W0374", "wav": "./aishell/wav/test/S0908/BAC009S0908W0374.wav", "txt": "腿长足足十二厘米"} -{"key": "BAC009S0908W0375", "wav": "./aishell/wav/test/S0908/BAC009S0908W0375.wav", "txt": "去年在亚青赛上亮相后"} -{"key": "BAC009S0908W0376", "wav": "./aishell/wav/test/S0908/BAC009S0908W0376.wav", "txt": "瞬间成为各国媒体的焦点"} -{"key": "BAC009S0908W0377", "wav": "./aishell/wav/test/S0908/BAC009S0908W0377.wav", "txt": "成为宅男心目中的排球女神"} -{"key": "BAC009S0908W0378", "wav": "./aishell/wav/test/S0908/BAC009S0908W0378.wav", "txt": "莎宾娜也凭借兼具清纯和性感气质的漂亮外形走红日本"} -{"key": "BAC009S0908W0379", "wav": "./aishell/wav/test/S0908/BAC009S0908W0379.wav", "txt": "甚至有日本的大牌经纪公司希望与其签约"} -{"key": "BAC009S0908W0380", "wav": "./aishell/wav/test/S0908/BAC009S0908W0380.wav", "txt": "做客日本电视台的新闻节目"} -{"key": "BAC009S0908W0381", "wav": "./aishell/wav/test/S0908/BAC009S0908W0381.wav", "txt": "不少媒体追问她是否有男朋友"} -{"key": "BAC009S0908W0382", "wav": "./aishell/wav/test/S0908/BAC009S0908W0382.wav", "txt": "莎宾娜透露目前单身理想型是喜欢运动"} -{"key": "BAC009S0908W0383", "wav": "./aishell/wav/test/S0908/BAC009S0908W0383.wav", "txt": "身材高挑并且不抽烟喝酒的男生"} -{"key": "BAC009S0908W0384", "wav": "./aishell/wav/test/S0908/BAC009S0908W0384.wav", "txt": "当下想把注意力集中在打球上"} -{"key": "BAC009S0908W0385", "wav": "./aishell/wav/test/S0908/BAC009S0908W0385.wav", "txt": "暂时不考虑恋爱的问题"} -{"key": "BAC009S0908W0386", "wav": "./aishell/wav/test/S0908/BAC009S0908W0386.wav", "txt": "这一单身宣言更加激发了日本粉丝对她的痴迷"} -{"key": "BAC009S0908W0387", "wav": "./aishell/wav/test/S0908/BAC009S0908W0387.wav", "txt": "希望可以见到她本人"} -{"key": "BAC009S0908W0388", "wav": "./aishell/wav/test/S0908/BAC009S0908W0388.wav", "txt": "该球队在官方博客上"} -{"key": "BAC009S0908W0389", "wav": "./aishell/wav/test/S0908/BAC009S0908W0389.wav", "txt": "但喜欢欧美音乐爱吃西红柿意大利面"} -{"key": "BAC009S0908W0391", "wav": "./aishell/wav/test/S0908/BAC009S0908W0391.wav", "txt": "看好她成为日本排球的新女神"} -{"key": "BAC009S0908W0392", "wav": "./aishell/wav/test/S0908/BAC009S0908W0392.wav", "txt": "美貌和实力并存的选手太稀罕了"} -{"key": "BAC009S0908W0393", "wav": "./aishell/wav/test/S0908/BAC009S0908W0393.wav", "txt": "莎宾娜已经在今年八月秘密抵达日本"} -{"key": "BAC009S0908W0395", "wav": "./aishell/wav/test/S0908/BAC009S0908W0395.wav", "txt": "她的母亲在采访中表示莎宾娜为了提升自己的实力"} -{"key": "BAC009S0908W0396", "wav": "./aishell/wav/test/S0908/BAC009S0908W0396.wav", "txt": "以哈萨克斯坦排协特派选手的方式加盟日本的球队"} -{"key": "BAC009S0908W0397", "wav": "./aishell/wav/test/S0908/BAC009S0908W0397.wav", "txt": "日本的排球训练是出了名的严厉"} -{"key": "BAC009S0908W0398", "wav": "./aishell/wav/test/S0908/BAC009S0908W0398.wav", "txt": "对此莎宾娜已经做好了吃苦的心理准备"} -{"key": "BAC009S0908W0399", "wav": "./aishell/wav/test/S0908/BAC009S0908W0399.wav", "txt": "家人和哈排协也表明了全力支持她的态度"} -{"key": "BAC009S0908W0400", "wav": "./aishell/wav/test/S0908/BAC009S0908W0400.wav", "txt": "不仅在各国网络社区和比赛中表现活跃"} -{"key": "BAC009S0908W0401", "wav": 
"./aishell/wav/test/S0908/BAC009S0908W0401.wav", "txt": "也成为哈萨克斯坦的宣传大使"} -{"key": "BAC009S0908W0402", "wav": "./aishell/wav/test/S0908/BAC009S0908W0402.wav", "txt": "日本排球界的人士指出"} -{"key": "BAC009S0908W0404", "wav": "./aishell/wav/test/S0908/BAC009S0908W0404.wav", "txt": "但是由于加朵要为蝙蝠侠大战超人忙碌"} -{"key": "BAC009S0908W0405", "wav": "./aishell/wav/test/S0908/BAC009S0908W0405.wav", "txt": "档期遇到了不可调和的冲突"} -{"key": "BAC009S0908W0406", "wav": "./aishell/wav/test/S0908/BAC009S0908W0406.wav", "txt": "因此不得不放弃宾虚的演出"} -{"key": "BAC009S0908W0407", "wav": "./aishell/wav/test/S0908/BAC009S0908W0407.wav", "txt": "这对她来说也是一个巨大的遗憾"} -{"key": "BAC009S0908W0408", "wav": "./aishell/wav/test/S0908/BAC009S0908W0408.wav", "txt": "私底下对歌迷亲切和善"} -{"key": "BAC009S0908W0409", "wav": "./aishell/wav/test/S0908/BAC009S0908W0409.wav", "txt": "最近人在大陆举行巡回演唱会的他"} -{"key": "BAC009S0908W0410", "wav": "./aishell/wav/test/S0908/BAC009S0908W0410.wav", "txt": "却被曝出在机场大发飙"} -{"key": "BAC009S0908W0411", "wav": "./aishell/wav/test/S0908/BAC009S0908W0411.wav", "txt": "有网友则晒出当天现场情况"} -{"key": "BAC009S0908W0412", "wav": "./aishell/wav/test/S0908/BAC009S0908W0412.wav", "txt": "搜狐娱乐讯据香港媒体报道"} -{"key": "BAC009S0908W0413", "wav": "./aishell/wav/test/S0908/BAC009S0908W0413.wav", "txt": "分享入行二十年的感受"} -{"key": "BAC009S0908W0414", "wav": "./aishell/wav/test/S0908/BAC009S0908W0414.wav", "txt": "陈奕迅坦言自己一直有情绪病"} -{"key": "BAC009S0908W0415", "wav": "./aishell/wav/test/S0908/BAC009S0908W0415.wav", "txt": "而且是一个爱哭鬼"} -{"key": "BAC009S0908W0416", "wav": "./aishell/wav/test/S0908/BAC009S0908W0416.wav", "txt": "常常在看电影和新闻时流泪"} -{"key": "BAC009S0908W0417", "wav": "./aishell/wav/test/S0908/BAC009S0908W0417.wav", "txt": "不开心时会找太太徐濠所倾诉"} -{"key": "BAC009S0908W0418", "wav": "./aishell/wav/test/S0908/BAC009S0908W0418.wav", "txt": "搜狐娱乐讯据香港媒体报道"} -{"key": "BAC009S0908W0419", "wav": "./aishell/wav/test/S0908/BAC009S0908W0419.wav", "txt": "陈奕迅在香港出席品牌活动"} -{"key": "BAC009S0908W0420", "wav": "./aishell/wav/test/S0908/BAC009S0908W0420.wav", "txt": "现场他透露道近日忙于内地巡演"} -{"key": "BAC009S0908W0421", "wav": "./aishell/wav/test/S0908/BAC009S0908W0421.wav", "txt": "对于天津爆炸时间"} -{"key": "BAC009S0908W0422", "wav": "./aishell/wav/test/S0908/BAC009S0908W0422.wav", "txt": "他表示感到伤痛"} -{"key": "BAC009S0908W0423", "wav": "./aishell/wav/test/S0908/BAC009S0908W0423.wav", "txt": "又透露去年曾在天津举办演唱会"} -{"key": "BAC009S0908W0424", "wav": "./aishell/wav/test/S0908/BAC009S0908W0424.wav", "txt": "希望送上歌曲今日为受害者打气"} -{"key": "BAC009S0908W0425", "wav": "./aishell/wav/test/S0908/BAC009S0908W0425.wav", "txt": "也祝福伤者早日康复"} -{"key": "BAC009S0908W0426", "wav": "./aishell/wav/test/S0908/BAC009S0908W0426.wav", "txt": "搜狐娱乐讯四月三十日"} -{"key": "BAC009S0908W0427", "wav": "./aishell/wav/test/S0908/BAC009S0908W0427.wav", "txt": "称这二人总是可以把自己逗笑"} -{"key": "BAC009S0908W0428", "wav": "./aishell/wav/test/S0908/BAC009S0908W0428.wav", "txt": "照片中二人坐在沙发上"} -{"key": "BAC009S0908W0429", "wav": "./aishell/wav/test/S0908/BAC009S0908W0429.wav", "txt": "谢霆锋戴着帽子"} -{"key": "BAC009S0908W0430", "wav": "./aishell/wav/test/S0908/BAC009S0908W0430.wav", "txt": "穿着白背心黑色短裤"} -{"key": "BAC009S0908W0431", "wav": "./aishell/wav/test/S0908/BAC009S0908W0431.wav", "txt": "数万只黄色小鸡散落路上"} -{"key": "BAC009S0908W0432", "wav": "./aishell/wav/test/S0908/BAC009S0908W0432.wav", "txt": "村民蜂拥而至捉小鸡"} -{"key": "BAC009S0908W0433", "wav": "./aishell/wav/test/S0908/BAC009S0908W0433.wav", "txt": "香港明报参考消息网八月二十九日报道港媒称"} -{"key": "BAC009S0908W0434", "wav": "./aishell/wav/test/S0908/BAC009S0908W0434.wav", "txt": "近日又出现疯抢水果捡漏等事"} -{"key": "BAC009S0908W0435", "wav": 
"./aishell/wav/test/S0908/BAC009S0908W0435.wav", "txt": "有内地学者分析背后心态"} -{"key": "BAC009S0908W0436", "wav": "./aishell/wav/test/S0908/BAC009S0908W0436.wav", "txt": "是因为国民抢习惯了"} -{"key": "BAC009S0908W0437", "wav": "./aishell/wav/test/S0908/BAC009S0908W0437.wav", "txt": "港媒称马云向浙江商人发出警告永远不要行贿"} -{"key": "BAC009S0908W0438", "wav": "./aishell/wav/test/S0908/BAC009S0908W0438.wav", "txt": "港媒评助学达人性侵女童案加强监管是关键"} -{"key": "BAC009S0908W0439", "wav": "./aishell/wav/test/S0908/BAC009S0908W0439.wav", "txt": "资料图王杰图片来源于网络"} -{"key": "BAC009S0908W0440", "wav": "./aishell/wav/test/S0908/BAC009S0908W0440.wav", "txt": "港媒评中国游客全球爆买旅游幼稚病"} -{"key": "BAC009S0908W0441", "wav": "./aishell/wav/test/S0908/BAC009S0908W0441.wav", "txt": "参考消息网一零月八日报道国庆长假结束"} -{"key": "BAC009S0908W0442", "wav": "./aishell/wav/test/S0908/BAC009S0908W0442.wav", "txt": "媒体再次盘点长假期间的各种热点新闻"} -{"key": "BAC009S0908W0443", "wav": "./aishell/wav/test/S0908/BAC009S0908W0443.wav", "txt": "其中一组中国旅游购物者全面攻陷日本的图片"} -{"key": "BAC009S0908W0444", "wav": "./aishell/wav/test/S0908/BAC009S0908W0444.wav", "txt": "多家媒体就这组图片中的场景和现象作出评论"} -{"key": "BAC009S0908W0445", "wav": "./aishell/wav/test/S0908/BAC009S0908W0445.wav", "txt": "并提出多种思考和提示"} -{"key": "BAC009S0908W0446", "wav": "./aishell/wav/test/S0908/BAC009S0908W0446.wav", "txt": "比如就中国游客热衷日本药品"} -{"key": "BAC009S0908W0447", "wav": "./aishell/wav/test/S0908/BAC009S0908W0447.wav", "txt": "歧视中国药企改进质量提高信誉改善用户体验"} -{"key": "BAC009S0908W0448", "wav": "./aishell/wav/test/S0908/BAC009S0908W0448.wav", "txt": "以便提高药品竞争力等等"} -{"key": "BAC009S0908W0449", "wav": "./aishell/wav/test/S0908/BAC009S0908W0449.wav", "txt": "港媒道士下山被批引发网友广泛讨论"} -{"key": "BAC009S0908W0450", "wav": "./aishell/wav/test/S0908/BAC009S0908W0450.wav", "txt": "参考消息网七月二零日报道"} -{"key": "BAC009S0908W0451", "wav": "./aishell/wav/test/S0908/BAC009S0908W0451.wav", "txt": "港媒上海成为亚洲奢华生活最昂贵的城市"} -{"key": "BAC009S0908W0452", "wav": "./aishell/wav/test/S0908/BAC009S0908W0452.wav", "txt": "参考消息网一零月二九日报道港媒称"} -{"key": "BAC009S0908W0453", "wav": "./aishell/wav/test/S0908/BAC009S0908W0453.wav", "txt": "上海已成为全亚洲奢华生活最昂贵的城市"} -{"key": "BAC009S0908W0454", "wav": "./aishell/wav/test/S0908/BAC009S0908W0454.wav", "txt": "垫底的是印度城市孟买"} -{"key": "BAC009S0908W0455", "wav": "./aishell/wav/test/S0908/BAC009S0908W0455.wav", "txt": "港媒东莞工地连续两次坍塌路面似被吸入地底"} -{"key": "BAC009S0908W0456", "wav": "./aishell/wav/test/S0908/BAC009S0908W0456.wav", "txt": "东莞常平一地盘两日两度地陷"} -{"key": "BAC009S0908W0457", "wav": "./aishell/wav/test/S0908/BAC009S0908W0457.wav", "txt": "网上流传的视频可见"} -{"key": "BAC009S0908W0458", "wav": "./aishell/wav/test/S0908/BAC009S0908W0458.wav", "txt": "地面在几秒内迅速塌陷成一个大坑"} -{"key": "BAC009S0908W0459", "wav": "./aishell/wav/test/S0908/BAC009S0908W0459.wav", "txt": "恐影响旁边大厦的基地"} -{"key": "BAC009S0908W0460", "wav": "./aishell/wav/test/S0908/BAC009S0908W0460.wav", "txt": "网络图片参考消息网八月一四日报道港媒称"} -{"key": "BAC009S0908W0461", "wav": "./aishell/wav/test/S0908/BAC009S0908W0461.wav", "txt": "一三日上午一零时许"} -{"key": "BAC009S0908W0462", "wav": "./aishell/wav/test/S0908/BAC009S0908W0462.wav", "txt": "东莞常平住宅大厦联邦花园旁边发生大面积地陷"} -{"key": "BAC009S0908W0463", "wav": "./aishell/wav/test/S0908/BAC009S0908W0463.wav", "txt": "面积达逾三零零平方米"} -{"key": "BAC009S0908W0464", "wav": "./aishell/wav/test/S0908/BAC009S0908W0464.wav", "txt": "造成一名井下工人死亡"} -{"key": "BAC009S0908W0465", "wav": "./aishell/wav/test/S0908/BAC009S0908W0465.wav", "txt": "该地盘曾发生地陷事故"} -{"key": "BAC009S0908W0466", "wav": "./aishell/wav/test/S0908/BAC009S0908W0466.wav", "txt": "现场流出的短片显示"} -{"key": "BAC009S0908W0467", "wav": 
"./aishell/wav/test/S0908/BAC009S0908W0467.wav", "txt": "每一次塌陷的区域前已有一个大坑"} -{"key": "BAC009S0908W0468", "wav": "./aishell/wav/test/S0908/BAC009S0908W0468.wav", "txt": "港媒中国人启动营养革命养生书籍热卖"} -{"key": "BAC009S0908W0469", "wav": "./aishell/wav/test/S0908/BAC009S0908W0469.wav", "txt": "参考消息网八月一五日报道"} -{"key": "BAC009S0908W0470", "wav": "./aishell/wav/test/S0908/BAC009S0908W0470.wav", "txt": "港媒中国出现多中心大都市郊区需要更多移民"} -{"key": "BAC009S0908W0471", "wav": "./aishell/wav/test/S0908/BAC009S0908W0471.wav", "txt": "参考消息网八月二五日报道"} -{"key": "BAC009S0908W0472", "wav": "./aishell/wav/test/S0908/BAC009S0908W0472.wav", "txt": "港媒中国发布金牌月嫂标准实用性遭质疑"} -{"key": "BAC009S0908W0473", "wav": "./aishell/wav/test/S0908/BAC009S0908W0473.wav", "txt": "参考消息网七月八日报道"} -{"key": "BAC009S0908W0474", "wav": "./aishell/wav/test/S0908/BAC009S0908W0474.wav", "txt": "港媒中国城市告别血汗工厂经济转型见成效"} -{"key": "BAC009S0908W0475", "wav": "./aishell/wav/test/S0908/BAC009S0908W0475.wav", "txt": "参考消息网八月一二日报道"} -{"key": "BAC009S0908W0476", "wav": "./aishell/wav/test/S0908/BAC009S0908W0476.wav", "txt": "港媒中国患者年底可在线上美国医生咨询病情"} -{"key": "BAC009S0908W0477", "wav": "./aishell/wav/test/S0908/BAC009S0908W0477.wav", "txt": "参考消息网九月二五日报道港媒称"} -{"key": "BAC009S0908W0478", "wav": "./aishell/wav/test/S0908/BAC009S0908W0478.wav", "txt": "在中国某个在线医疗平台增设一项新服务之后"} -{"key": "BAC009S0908W0479", "wav": "./aishell/wav/test/S0908/BAC009S0908W0479.wav", "txt": "中国正在逐步拥抱智能技术和数字至上创业精神"} -{"key": "BAC009S0908W0480", "wav": "./aishell/wav/test/S0908/BAC009S0908W0480.wav", "txt": "港媒中国成访日第一大客源国还会持续增加"} -{"key": "BAC009S0908W0481", "wav": "./aishell/wav/test/S0908/BAC009S0908W0481.wav", "txt": "参考消息网八月二日报道外媒称"} -{"key": "BAC009S0908W0482", "wav": "./aishell/wav/test/S0908/BAC009S0908W0482.wav", "txt": "访日外国游客突破千万"} -{"key": "BAC009S0908W0483", "wav": "./aishell/wav/test/S0908/BAC009S0908W0483.wav", "txt": "其中上半年中国访日游客接近翻倍"} -{"key": "BAC009S0908W0484", "wav": "./aishell/wav/test/S0908/BAC009S0908W0484.wav", "txt": "超过韩国成为访日最大客源国"} -{"key": "BAC009S0908W0485", "wav": "./aishell/wav/test/S0908/BAC009S0908W0485.wav", "txt": "更是扭转日本旅游赤字"} -{"key": "BAC009S0908W0486", "wav": "./aishell/wav/test/S0908/BAC009S0908W0486.wav", "txt": "港媒中国科学家研究蜈蚣毒液发现新止痛药"} -{"key": "BAC009S0908W0487", "wav": "./aishell/wav/test/S0908/BAC009S0908W0487.wav", "txt": "蜈蚣资料图参考消息网一零月二二日报道中国科学家称"} -{"key": "BAC009S0908W0488", "wav": "./aishell/wav/test/S0908/BAC009S0908W0488.wav", "txt": "港媒中式教学不可复制中国学生在哪都能拿高分"} -{"key": "BAC009S0908W0489", "wav": "./aishell/wav/test/S0908/BAC009S0908W0489.wav", "txt": "参考消息网九月二三日报道港媒称"} -{"key": "BAC009S0908W0490", "wav": "./aishell/wav/test/S0908/BAC009S0908W0490.wav", "txt": "宣传的重点是中国教育和英国教育之战"} -{"key": "BAC009S0908W0491", "wav": "./aishell/wav/test/S0908/BAC009S0908W0491.wav", "txt": "港媒中秋赏月航班受热捧部分靠窗座位售罄"} -{"key": "BAC009S0908W0492", "wav": "./aishell/wav/test/S0908/BAC009S0908W0492.wav", "txt": "参考消息网九月一三日报道港媒称"} -{"key": "BAC009S0908W0493", "wav": "./aishell/wav/test/S0908/BAC009S0908W0493.wav", "txt": "很多人都已为赏月做准备"} -{"key": "BAC009S0908W0494", "wav": "./aishell/wav/test/S0908/BAC009S0908W0494.wav", "txt": "如果对一般登高赏月仍未满足"} -{"key": "BAC009S0908W0495", "wav": "./aishell/wav/test/S0908/BAC009S0908W0495.wav", "txt": "可以考虑一下空中赏月"} -{"key": "BAC009S0912W0121", "wav": "./aishell/wav/test/S0912/BAC009S0912W0121.wav", "txt": "房地产相关领域问题频发"} -{"key": "BAC009S0912W0122", "wav": "./aishell/wav/test/S0912/BAC009S0912W0122.wav", "txt": "东地产财经周刊新一年度审计工作报告出炉"} -{"key": "BAC009S0912W0123", "wav": "./aishell/wav/test/S0912/BAC009S0912W0123.wav", "txt": "审计署审计长刘家义受国务院委托"} -{"key": 
"BAC009S0912W0124", "wav": "./aishell/wav/test/S0912/BAC009S0912W0124.wav", "txt": "土地相关的审查成为重点之一"} -{"key": "BAC009S0912W0125", "wav": "./aishell/wav/test/S0912/BAC009S0912W0125.wav", "txt": "刘家义在报告中指出"} -{"key": "BAC009S0912W0126", "wav": "./aishell/wav/test/S0912/BAC009S0912W0126.wav", "txt": "共审计二十个省本级和二百个市"} -{"key": "BAC009S0912W0127", "wav": "./aishell/wav/test/S0912/BAC009S0912W0127.wav", "txt": "二零零八年至二零一五年"} -{"key": "BAC009S0912W0128", "wav": "./aishell/wav/test/S0912/BAC009S0912W0128.wav", "txt": "这些地区批准建设用地二百万公顷"} -{"key": "BAC009S0912W0129", "wav": "./aishell/wav/test/S0912/BAC009S0912W0129.wav", "txt": "取得土地出让收入十三万亿元"} -{"key": "BAC009S0912W0130", "wav": "./aishell/wav/test/S0912/BAC009S0912W0130.wav", "txt": "支出十二万亿元"} -{"key": "BAC009S0912W0131", "wav": "./aishell/wav/test/S0912/BAC009S0912W0131.wav", "txt": "为经济社会发展提供了重要基础和支持"} -{"key": "BAC009S0912W0132", "wav": "./aishell/wav/test/S0912/BAC009S0912W0132.wav", "txt": "土地出入收入累计结馀五千亿元"} -{"key": "BAC009S0912W0133", "wav": "./aishell/wav/test/S0912/BAC009S0912W0133.wav", "txt": "主要是土地出让收入少征三千亿元"} -{"key": "BAC009S0912W0134", "wav": "./aishell/wav/test/S0912/BAC009S0912W0134.wav", "txt": "一些地方和单位少支付补偿一亿元"} -{"key": "BAC009S0912W0135", "wav": "./aishell/wav/test/S0912/BAC009S0912W0135.wav", "txt": "编造虚假资料等套取或骗取补偿一亿元"} -{"key": "BAC009S0912W0136", "wav": "./aishell/wav/test/S0912/BAC009S0912W0136.wav", "txt": "一些地方土地出让收支核算不够规范"} -{"key": "BAC009S0912W0137", "wav": "./aishell/wav/test/S0912/BAC009S0912W0137.wav", "txt": "减免或返还土地出让收入一亿元"} -{"key": "BAC009S0912W0138", "wav": "./aishell/wav/test/S0912/BAC009S0912W0138.wav", "txt": "建设用地方面也暴露了不少问题"} -{"key": "BAC009S0912W0139", "wav": "./aishell/wav/test/S0912/BAC009S0912W0139.wav", "txt": "违规以租代征改变规划条件等用地一万公顷"} -{"key": "BAC009S0912W0140", "wav": "./aishell/wav/test/S0912/BAC009S0912W0140.wav", "txt": "有一个突破土地或城市规划"} -{"key": "BAC009S0912W0141", "wav": "./aishell/wav/test/S0912/BAC009S0912W0141.wav", "txt": "还有一个违规扩区一万公顷"} -{"key": "BAC009S0912W0142", "wav": "./aishell/wav/test/S0912/BAC009S0912W0142.wav", "txt": "虚增耕地质量不达标的分别占百分之十和百分之三十"} -{"key": "BAC009S0912W0143", "wav": "./aishell/wav/test/S0912/BAC009S0912W0143.wav", "txt": "整治资金被挤占挪用等一亿元"} -{"key": "BAC009S0912W0144", "wav": "./aishell/wav/test/S0912/BAC009S0912W0144.wav", "txt": "纠正违法用地一万起"} -{"key": "BAC009S0912W0145", "wav": "./aishell/wav/test/S0912/BAC009S0912W0145.wav", "txt": "制定完善制度一百多项"} -{"key": "BAC009S0912W0146", "wav": "./aishell/wav/test/S0912/BAC009S0912W0146.wav", "txt": "审计已向有关部门移送重大违法违纪问题三百起"} -{"key": "BAC009S0912W0147", "wav": "./aishell/wav/test/S0912/BAC009S0912W0147.wav", "txt": "各级政府安排财政资金一亿元"} -{"key": "BAC009S0912W0148", "wav": "./aishell/wav/test/S0912/BAC009S0912W0148.wav", "txt": "为安居工程建设提供了资金保障"} -{"key": "BAC009S0912W0149", "wav": "./aishell/wav/test/S0912/BAC009S0912W0149.wav", "txt": "还有一亿元被套取或用于弥补经费不足等"} -{"key": "BAC009S0912W0150", "wav": "./aishell/wav/test/S0912/BAC009S0912W0150.wav", "txt": "有关地方追回资金或补贴一亿元"} -{"key": "BAC009S0912W0151", "wav": "./aishell/wav/test/S0912/BAC009S0912W0151.wav", "txt": "清理收回住房二十套"} -{"key": "BAC009S0912W0152", "wav": "./aishell/wav/test/S0912/BAC009S0912W0152.wav", "txt": "取消一万户家庭的保障资格"} -{"key": "BAC009S0912W0153", "wav": "./aishell/wav/test/S0912/BAC009S0912W0153.wav", "txt": "审计已向有关部门移送重大违法违纪问题三十起"} -{"key": "BAC009S0912W0154", "wav": "./aishell/wav/test/S0912/BAC009S0912W0154.wav", "txt": "在对央企的审计也发现了不少问题"} -{"key": "BAC009S0912W0155", "wav": "./aishell/wav/test/S0912/BAC009S0912W0155.wav", "txt": "中粮集团违规投资四亿元对原培训中心进行改扩建"} -{"key": "BAC009S0912W0156", 
"wav": "./aishell/wav/test/S0912/BAC009S0912W0156.wav", "txt": "受土地开发政策和土地规划限制未开发建设"} -{"key": "BAC009S0912W0157", "wav": "./aishell/wav/test/S0912/BAC009S0912W0157.wav", "txt": "六年土地收入十三万度审计报告中"} -{"key": "BAC009S0912W0158", "wav": "./aishell/wav/test/S0912/BAC009S0912W0158.wav", "txt": "房地产相关领域问题频发"} -{"key": "BAC009S0912W0159", "wav": "./aishell/wav/test/S0912/BAC009S0912W0159.wav", "txt": "东地产财经周度审计工作报告出炉"} -{"key": "BAC009S0912W0160", "wav": "./aishell/wav/test/S0912/BAC009S0912W0160.wav", "txt": "审计署审计长刘家义受国务院委托"} -{"key": "BAC009S0912W0161", "wav": "./aishell/wav/test/S0912/BAC009S0912W0161.wav", "txt": "羊年置业小调查的调查结果截图"} -{"key": "BAC009S0912W0162", "wav": "./aishell/wav/test/S0912/BAC009S0912W0162.wav", "txt": "二初楼市迎来多项利好政策"} -{"key": "BAC009S0912W0163", "wav": "./aishell/wav/test/S0912/BAC009S0912W0163.wav", "txt": "在多项政策的支持下"} -{"key": "BAC009S0912W0164", "wav": "./aishell/wav/test/S0912/BAC009S0912W0164.wav", "txt": "今年楼市将走向何方"} -{"key": "BAC009S0912W0165", "wav": "./aishell/wav/test/S0912/BAC009S0912W0165.wav", "txt": "中新网房产频道推置业小调查"} -{"key": "BAC009S0912W0167", "wav": "./aishell/wav/test/S0912/BAC009S0912W0167.wav", "txt": "十位网友参与了本次调查"} -{"key": "BAC009S0912W0168", "wav": "./aishell/wav/test/S0912/BAC009S0912W0168.wav", "txt": "在参与调查的网友中"} -{"key": "BAC009S0912W0169", "wav": "./aishell/wav/test/S0912/BAC009S0912W0169.wav", "txt": "约六成网友看涨全国的商品房价格"} -{"key": "BAC009S0912W0170", "wav": "./aishell/wav/test/S0912/BAC009S0912W0170.wav", "txt": "万科获选性价比最高的房企"} -{"key": "BAC009S0912W0171", "wav": "./aishell/wav/test/S0912/BAC009S0912W0171.wav", "txt": "房价的一涨一跌都牵动着购房者的神经"} -{"key": "BAC009S0912W0172", "wav": "./aishell/wav/test/S0912/BAC009S0912W0172.wav", "txt": "百分之五的网友认为房价将普遍上涨"} -{"key": "BAC009S0912W0173", "wav": "./aishell/wav/test/S0912/BAC009S0912W0173.wav", "txt": "百分之五的网友认为房价将普遍下跌"} -{"key": "BAC009S0912W0174", "wav": "./aishell/wav/test/S0912/BAC009S0912W0174.wav", "txt": "百分之五的网友认为房价走势不好判断"} -{"key": "BAC009S0912W0175", "wav": "./aishell/wav/test/S0912/BAC009S0912W0175.wav", "txt": "作为楼市政策的风向标"} -{"key": "BAC009S0912W0176", "wav": "./aishell/wav/test/S0912/BAC009S0912W0176.wav", "txt": "二全国两会或将楼市基调"} -{"key": "BAC009S0912W0177", "wav": "./aishell/wav/test/S0912/BAC009S0912W0177.wav", "txt": "国务院总理李克强在二政府工作报告中表示"} -{"key": "BAC009S0912W0178", "wav": "./aishell/wav/test/S0912/BAC009S0912W0178.wav", "txt": "支持居民自住和改善住房需求"} -{"key": "BAC009S0912W0179", "wav": "./aishell/wav/test/S0912/BAC009S0912W0179.wav", "txt": "促进房地产市场平稳健康发展"} -{"key": "BAC009S0912W0180", "wav": "./aishell/wav/test/S0912/BAC009S0912W0180.wav", "txt": "这也从宏观层面明确了政府对于房地产市场的态度"} -{"key": "BAC009S0912W0181", "wav": "./aishell/wav/test/S0912/BAC009S0912W0181.wav", "txt": "在今年两会是否会开启新一轮楼市调控这个问题上"} -{"key": "BAC009S0912W0182", "wav": "./aishell/wav/test/S0912/BAC009S0912W0182.wav", "txt": "中新网的调查结果显示"} -{"key": "BAC009S0912W0183", "wav": "./aishell/wav/test/S0912/BAC009S0912W0183.wav", "txt": "百分之五的网友认为不会"} -{"key": "BAC009S0912W0184", "wav": "./aishell/wav/test/S0912/BAC009S0912W0184.wav", "txt": "百分之五的网友认为会"} -{"key": "BAC009S0912W0185", "wav": "./aishell/wav/test/S0912/BAC009S0912W0185.wav", "txt": "百分之五的网友认为不好说"} -{"key": "BAC009S0912W0186", "wav": "./aishell/wav/test/S0912/BAC009S0912W0186.wav", "txt": "楼市政策也深刻影响着房地产行业的走向"} -{"key": "BAC009S0912W0187", "wav": "./aishell/wav/test/S0912/BAC009S0912W0187.wav", "txt": "抓紧做好故调查处理工作"} -{"key": "BAC009S0912W0188", "wav": "./aishell/wav/test/S0912/BAC009S0912W0188.wav", "txt": "督促责任单位彻底排查溢油风险点"} -{"key": "BAC009S0912W0189", "wav": 
"./aishell/wav/test/S0912/BAC009S0912W0189.wav", "txt": "并重新编报海洋环境影响报告书"} -{"key": "BAC009S0912W0190", "wav": "./aishell/wav/test/S0912/BAC009S0912W0190.wav", "txt": "彻底查明事故原因"} -{"key": "BAC009S0912W0191", "wav": "./aishell/wav/test/S0912/BAC009S0912W0191.wav", "txt": "查清事故造成的危害及损失"} -{"key": "BAC009S0912W0192", "wav": "./aishell/wav/test/S0912/BAC009S0912W0192.wav", "txt": "维护受损各方合法权益"} -{"key": "BAC009S0912W0193", "wav": "./aishell/wav/test/S0912/BAC009S0912W0193.wav", "txt": "立即部署开展海洋石油勘探开发安全生产检查"} -{"key": "BAC009S0912W0194", "wav": "./aishell/wav/test/S0912/BAC009S0912W0194.wav", "txt": "全面加强海洋环境监视监测和监督管理"} -{"key": "BAC009S0912W0195", "wav": "./aishell/wav/test/S0912/BAC009S0912W0195.wav", "txt": "全面准确及时发布事故处置相关信息"} -{"key": "BAC009S0912W0196", "wav": "./aishell/wav/test/S0912/BAC009S0912W0196.wav", "txt": "抓紧研究完善海洋环境保护的法律法规"} -{"key": "BAC009S0912W0197", "wav": "./aishell/wav/test/S0912/BAC009S0912W0197.wav", "txt": "入海污染物排放总量下降"} -{"key": "BAC009S0912W0198", "wav": "./aishell/wav/test/S0912/BAC009S0912W0198.wav", "txt": "力争渤海近岸海域水质总体改善"} -{"key": "BAC009S0912W0199", "wav": "./aishell/wav/test/S0912/BAC009S0912W0199.wav", "txt": "优化产业结构与布局"} -{"key": "BAC009S0912W0200", "wav": "./aishell/wav/test/S0912/BAC009S0912W0200.wav", "txt": "切实改变沿海地区重化工比重过大过于集中的状况"} -{"key": "BAC009S0912W0201", "wav": "./aishell/wav/test/S0912/BAC009S0912W0201.wav", "txt": "严格控制新上石化项目"} -{"key": "BAC009S0912W0202", "wav": "./aishell/wav/test/S0912/BAC009S0912W0202.wav", "txt": "禁止在可能造成生态严重失衡的地方进行围填海活动"} -{"key": "BAC009S0912W0203", "wav": "./aishell/wav/test/S0912/BAC009S0912W0203.wav", "txt": "有效控制陆海污染源"} -{"key": "BAC009S0912W0204", "wav": "./aishell/wav/test/S0912/BAC009S0912W0204.wav", "txt": "坚持海陆统筹河海兼顾"} -{"key": "BAC009S0912W0205", "wav": "./aishell/wav/test/S0912/BAC009S0912W0205.wav", "txt": "加强入海河流综合治理"} -{"key": "BAC009S0912W0206", "wav": "./aishell/wav/test/S0912/BAC009S0912W0206.wav", "txt": "合理布局入海排污口"} -{"key": "BAC009S0912W0207", "wav": "./aishell/wav/test/S0912/BAC009S0912W0207.wav", "txt": "制定更加严格的地方水污染排放标准"} -{"key": "BAC009S0912W0208", "wav": "./aishell/wav/test/S0912/BAC009S0912W0208.wav", "txt": "努力保护和修复渤海生态系统"} -{"key": "BAC009S0912W0209", "wav": "./aishell/wav/test/S0912/BAC009S0912W0209.wav", "txt": "加强用水总量控制与调度管理"} -{"key": "BAC009S0912W0210", "wav": "./aishell/wav/test/S0912/BAC009S0912W0210.wav", "txt": "改善河口和近岸海域生态环境"} -{"key": "BAC009S0912W0211", "wav": "./aishell/wav/test/S0912/BAC009S0912W0211.wav", "txt": "加强海陆过渡区生态建设"} -{"key": "BAC009S0912W0212", "wav": "./aishell/wav/test/S0912/BAC009S0912W0212.wav", "txt": "逐步恢复湿地生态功能"} -{"key": "BAC009S0912W0213", "wav": "./aishell/wav/test/S0912/BAC009S0912W0213.wav", "txt": "在海洋环境敏感区关键区等划定生态红线"} -{"key": "BAC009S0912W0214", "wav": "./aishell/wav/test/S0912/BAC009S0912W0214.wav", "txt": "有效防范海洋环境灾害"} -{"key": "BAC009S0912W0215", "wav": "./aishell/wav/test/S0912/BAC009S0912W0215.wav", "txt": "建立渤海海洋环境预警机制和突发事件应对机制"} -{"key": "BAC009S0912W0216", "wav": "./aishell/wav/test/S0912/BAC009S0912W0216.wav", "txt": "修订完善相关应急预案"} -{"key": "BAC009S0912W0217", "wav": "./aishell/wav/test/S0912/BAC009S0912W0217.wav", "txt": "强化地方政府和企业的主体意识法制意识"} -{"key": "BAC009S0912W0218", "wav": "./aishell/wav/test/S0912/BAC009S0912W0218.wav", "txt": "落实海洋环境保护责任"} -{"key": "BAC009S0912W0219", "wav": "./aishell/wav/test/S0912/BAC009S0912W0219.wav", "txt": "提高公众参与渤海环境保护的积极性和主动性"} -{"key": "BAC009S0912W0220", "wav": "./aishell/wav/test/S0912/BAC009S0912W0220.wav", "txt": "建立公开透明的信息发布机制"} -{"key": "BAC009S0912W0221", "wav": 
"./aishell/wav/test/S0912/BAC009S0912W0221.wav", "txt": "会议讨论进一步加强环境保护工作的意见"} -{"key": "BAC009S0912W0222", "wav": "./aishell/wav/test/S0912/BAC009S0912W0222.wav", "txt": "强调必须把污染治理和生态保护摆在更加重要的位置"} -{"key": "BAC009S0912W0223", "wav": "./aishell/wav/test/S0912/BAC009S0912W0223.wav", "txt": "切实解决损害公众健康影响科学发展的突发环境问题"} -{"key": "BAC009S0912W0224", "wav": "./aishell/wav/test/S0912/BAC009S0912W0224.wav", "txt": "落实节能减排各项任务"} -{"key": "BAC009S0912W0225", "wav": "./aishell/wav/test/S0912/BAC009S0912W0225.wav", "txt": "凡依法应当进行环评的建设规划和项目"} -{"key": "BAC009S0912W0226", "wav": "./aishell/wav/test/S0912/BAC009S0912W0226.wav", "txt": "都要严格履行环评程序"} -{"key": "BAC009S0912W0227", "wav": "./aishell/wav/test/S0912/BAC009S0912W0227.wav", "txt": "环评过程要公开透明"} -{"key": "BAC009S0912W0228", "wav": "./aishell/wav/test/S0912/BAC009S0912W0228.wav", "txt": "充分征求专家和社会公众意见"} -{"key": "BAC009S0912W0229", "wav": "./aishell/wav/test/S0912/BAC009S0912W0229.wav", "txt": "要依法追究管理部门责任企业及有关人员的责任"} -{"key": "BAC009S0912W0230", "wav": "./aishell/wav/test/S0912/BAC009S0912W0230.wav", "txt": "切实加强重金属污染防治"} -{"key": "BAC009S0912W0231", "wav": "./aishell/wav/test/S0912/BAC009S0912W0231.wav", "txt": "对重点地区行业和企业"} -{"key": "BAC009S0912W0232", "wav": "./aishell/wav/test/S0912/BAC009S0912W0232.wav", "txt": "妥善处理重金属污染历史遗留问题和突发污染事件"} -{"key": "BAC009S0912W0233", "wav": "./aishell/wav/test/S0912/BAC009S0912W0233.wav", "txt": "保障人民群众生命健康安全"} -{"key": "BAC009S0912W0234", "wav": "./aishell/wav/test/S0912/BAC009S0912W0234.wav", "txt": "严格化学品环境管理"} -{"key": "BAC009S0912W0235", "wav": "./aishell/wav/test/S0912/BAC009S0912W0235.wav", "txt": "对化学品项目布局进行梳理评估"} -{"key": "BAC009S0912W0236", "wav": "./aishell/wav/test/S0912/BAC009S0912W0236.wav", "txt": "对化学品生产经营企业进行环境隐患排查"} -{"key": "BAC009S0912W0237", "wav": "./aishell/wav/test/S0912/BAC009S0912W0237.wav", "txt": "对海洋江河湖泊沿岸化工企业进行集中综合整治"} -{"key": "BAC009S0912W0238", "wav": "./aishell/wav/test/S0912/BAC009S0912W0238.wav", "txt": "落实环境监管责任和安全保障措施"} -{"key": "BAC009S0912W0239", "wav": "./aishell/wav/test/S0912/BAC009S0912W0239.wav", "txt": "提高化学品生产的环境准入门槛"} -{"key": "BAC009S0912W0240", "wav": "./aishell/wav/test/S0912/BAC009S0912W0240.wav", "txt": "加强农村环境保护"} -{"key": "BAC009S0912W0241", "wav": "./aishell/wav/test/S0912/BAC009S0912W0241.wav", "txt": "集中整治存在突出环境问题的村庄和集镇"} -{"key": "BAC009S0912W0242", "wav": "./aishell/wav/test/S0912/BAC009S0912W0242.wav", "txt": "重点治理农村土壤饮用水水源地污染"} -{"key": "BAC009S0912W0243", "wav": "./aishell/wav/test/S0912/BAC009S0912W0243.wav", "txt": "推动环保基础设施和服务向农村延伸"} -{"key": "BAC009S0912W0244", "wav": "./aishell/wav/test/S0912/BAC009S0912W0244.wav", "txt": "引导和帮助农民科学处理垃圾和污水"} -{"key": "BAC009S0912W0245", "wav": "./aishell/wav/test/S0912/BAC009S0912W0245.wav", "txt": "科学使用农药化肥和农膜"} -{"key": "BAC009S0912W0246", "wav": "./aishell/wav/test/S0912/BAC009S0912W0246.wav", "txt": "严格农村工矿企业环境监管"} -{"key": "BAC009S0912W0247", "wav": "./aishell/wav/test/S0912/BAC009S0912W0247.wav", "txt": "坚决防止污染向农村转移"} -{"key": "BAC009S0912W0248", "wav": "./aishell/wav/test/S0912/BAC009S0912W0248.wav", "txt": "加快建设环境监测预警体系"} -{"key": "BAC009S0912W0249", "wav": "./aishell/wav/test/S0912/BAC009S0912W0249.wav", "txt": "完善环境事件应急机制"} -{"key": "BAC009S0912W0250", "wav": "./aishell/wav/test/S0912/BAC009S0912W0250.wav", "txt": "完善环境法律政策体系"} -{"key": "BAC009S0912W0251", "wav": "./aishell/wav/test/S0912/BAC009S0912W0251.wav", "txt": "针对近期各种环境事件暴露出的问题"} -{"key": "BAC009S0912W0252", "wav": "./aishell/wav/test/S0912/BAC009S0912W0252.wav", "txt": "抓紧制定和修订相关法律法规"} -{"key": "BAC009S0912W0253", "wav": 
"./aishell/wav/test/S0912/BAC009S0912W0253.wav", "txt": "毛利率也只有百分之十四"} -{"key": "BAC009S0912W0254", "wav": "./aishell/wav/test/S0912/BAC009S0912W0254.wav", "txt": "由此可见苹果现在的业务确实比汽车行业更加赚钱"} -{"key": "BAC009S0912W0256", "wav": "./aishell/wav/test/S0912/BAC009S0912W0256.wav", "txt": "他表示他肯定会与苹果展开合作"} -{"key": "BAC009S0912W0257", "wav": "./aishell/wav/test/S0912/BAC009S0912W0257.wav", "txt": "苹果公司一直在秘密从事电汽汽车的研发"} -{"key": "BAC009S0912W0258", "wav": "./aishell/wav/test/S0912/BAC009S0912W0258.wav", "txt": "并且计划最早在二零二零年推出生产首款车型"} -{"key": "BAC009S0912W0259", "wav": "./aishell/wav/test/S0912/BAC009S0912W0259.wav", "txt": "苹果已为汽车项目招募了数百名员工"} -{"key": "BAC009S0912W0260", "wav": "./aishell/wav/test/S0912/BAC009S0912W0260.wav", "txt": "包括电池和机器人技术领域的专家"} -{"key": "BAC009S0912W0261", "wav": "./aishell/wav/test/S0912/BAC009S0912W0261.wav", "txt": "苹果涉足汽车行业并不是一个好主意"} -{"key": "BAC009S0912W0264", "wav": "./aishell/wav/test/S0912/BAC009S0912W0264.wav", "txt": "除了苹果上周公布的选定合作伙伴"} -{"key": "BAC009S0912W0269", "wav": "./aishell/wav/test/S0912/BAC009S0912W0269.wav", "txt": "将会在设备发售稳定的推出与更新"} -{"key": "BAC009S0912W0278", "wav": "./aishell/wav/test/S0912/BAC009S0912W0278.wav", "txt": "此款健康设备将延迟到明年推出"} -{"key": "BAC009S0912W0279", "wav": "./aishell/wav/test/S0912/BAC009S0912W0279.wav", "txt": "根据知情人士获得的安吉拉录音手稿"} -{"key": "BAC009S0912W0280", "wav": "./aishell/wav/test/S0912/BAC009S0912W0280.wav", "txt": "安吉拉要求零售店员工养精蓄锐"} -{"key": "BAC009S0912W0281", "wav": "./aishell/wav/test/S0912/BAC009S0912W0281.wav", "txt": "为即将到来的购物季"} -{"key": "BAC009S0912W0282", "wav": "./aishell/wav/test/S0912/BAC009S0912W0282.wav", "txt": "以及中国的春节做准备"} -{"key": "BAC009S0912W0285", "wav": "./aishell/wav/test/S0912/BAC009S0912W0285.wav", "txt": "后有消息称该款产品将于今年的情人节推出"} -{"key": "BAC009S0912W0286", "wav": "./aishell/wav/test/S0912/BAC009S0912W0286.wav", "txt": "不过目前看来不大可能"} -{"key": "BAC009S0912W0287", "wav": "./aishell/wav/test/S0912/BAC009S0912W0287.wav", "txt": "因为春季的计算方式是从三月二十日到六月三十日"} -{"key": "BAC009S0912W0290", "wav": "./aishell/wav/test/S0912/BAC009S0912W0290.wav", "txt": "有报道称苹果计划在二零一四年秋季推出其可穿戴设备"} -{"key": "BAC009S0912W0291", "wav": "./aishell/wav/test/S0912/BAC009S0912W0291.wav", "txt": "该产品将延迟到二零一五年发布"} -{"key": "BAC009S0912W0292", "wav": "./aishell/wav/test/S0912/BAC009S0912W0292.wav", "txt": "纷至沓来的报道显示"} -{"key": "BAC009S0912W0293", "wav": "./aishell/wav/test/S0912/BAC009S0912W0293.wav", "txt": "电池的技术难题最终导致了它的延迟推出"} -{"key": "BAC009S0912W0299", "wav": "./aishell/wav/test/S0912/BAC009S0912W0299.wav", "txt": "包括更换不同尺寸型号和不同的表带"} -{"key": "BAC009S0912W0302", "wav": "./aishell/wav/test/S0912/BAC009S0912W0302.wav", "txt": "这将是苹果零售店采用的全新模式"} -{"key": "BAC009S0912W0303", "wav": "./aishell/wav/test/S0912/BAC009S0912W0303.wav", "txt": "如果融入移动互联的新时代"} -{"key": "BAC009S0912W0304", "wav": "./aishell/wav/test/S0912/BAC009S0912W0304.wav", "txt": "我们凭什么征战全世界"} -{"key": "BAC009S0912W0305", "wav": "./aishell/wav/test/S0912/BAC009S0912W0305.wav", "txt": "在前段时间的上海家电展上"} -{"key": "BAC009S0912W0306", "wav": "./aishell/wav/test/S0912/BAC009S0912W0306.wav", "txt": "美的集团总裁方洪波提出了上述三个问题"} -{"key": "BAC009S0912W0307", "wav": "./aishell/wav/test/S0912/BAC009S0912W0307.wav", "txt": "这是当前所有中国家电企业"} -{"key": "BAC009S0912W0308", "wav": "./aishell/wav/test/S0912/BAC009S0912W0308.wav", "txt": "都必须要回答的问题"} -{"key": "BAC009S0912W0309", "wav": "./aishell/wav/test/S0912/BAC009S0912W0309.wav", "txt": "如果不回答这三个课题"} -{"key": "BAC009S0912W0310", "wav": "./aishell/wav/test/S0912/BAC009S0912W0310.wav", "txt": "企业所有的目标都是空洞的"} -{"key": "BAC009S0912W0311", "wav": 
"./aishell/wav/test/S0912/BAC009S0912W0311.wav", "txt": "得出这个结论来自于方洪波对当前形势的判断"} -{"key": "BAC009S0912W0312", "wav": "./aishell/wav/test/S0912/BAC009S0912W0312.wav", "txt": "中国家电企业现在正面临前所未有的挑战"} -{"key": "BAC009S0912W0313", "wav": "./aishell/wav/test/S0912/BAC009S0912W0313.wav", "txt": "过去三十年高速发展的前提条件没有了"} -{"key": "BAC009S0912W0316", "wav": "./aishell/wav/test/S0912/BAC009S0912W0316.wav", "txt": "在全世界的产业格局看"} -{"key": "BAC009S0912W0317", "wav": "./aishell/wav/test/S0912/BAC009S0912W0317.wav", "txt": "全世界排列的二加三格局"} -{"key": "BAC009S0912W0319", "wav": "./aishell/wav/test/S0912/BAC009S0912W0319.wav", "txt": "这样一个全球白电的格局短期内是难以撬动的"} -{"key": "BAC009S0912W0320", "wav": "./aishell/wav/test/S0912/BAC009S0912W0320.wav", "txt": "时代力量正在颠复着家电行业"} -{"key": "BAC009S0912W0321", "wav": "./aishell/wav/test/S0912/BAC009S0912W0321.wav", "txt": "移动互联以前改变的是软的层面"} -{"key": "BAC009S0912W0322", "wav": "./aishell/wav/test/S0912/BAC009S0912W0322.wav", "txt": "比如流程的缩短平台化的应用"} -{"key": "BAC009S0912W0323", "wav": "./aishell/wav/test/S0912/BAC009S0912W0323.wav", "txt": "转型升级应该在十年前就开始了"} -{"key": "BAC009S0912W0324", "wav": "./aishell/wav/test/S0912/BAC009S0912W0324.wav", "txt": "中国家电企业在世界产业链地位弱小"} -{"key": "BAC009S0912W0325", "wav": "./aishell/wav/test/S0912/BAC009S0912W0325.wav", "txt": "跟世界产业的差距不是在缩小"} -{"key": "BAC009S0912W0326", "wav": "./aishell/wav/test/S0912/BAC009S0912W0326.wav", "txt": "这是目前我们中国家电企业面临的具体挑战"} -{"key": "BAC009S0912W0327", "wav": "./aishell/wav/test/S0912/BAC009S0912W0327.wav", "txt": "这些挑战来自于四面八方"} -{"key": "BAC009S0912W0328", "wav": "./aishell/wav/test/S0912/BAC009S0912W0328.wav", "txt": "未来给我们的机会和空间是有限的"} -{"key": "BAC009S0912W0330", "wav": "./aishell/wav/test/S0912/BAC009S0912W0330.wav", "txt": "白电行业将进入最惨烈的一年"} -{"key": "BAC009S0912W0331", "wav": "./aishell/wav/test/S0912/BAC009S0912W0331.wav", "txt": "昔日巨头格力美的海尔也将沉浮于其中"} -{"key": "BAC009S0912W0332", "wav": "./aishell/wav/test/S0912/BAC009S0912W0332.wav", "txt": "从本年度第一份季报来看"} -{"key": "BAC009S0912W0333", "wav": "./aishell/wav/test/S0912/BAC009S0912W0333.wav", "txt": "三巨头中的格力海尔均出现不同程度"} -{"key": "BAC009S0912W0335", "wav": "./aishell/wav/test/S0912/BAC009S0912W0335.wav", "txt": "下称美的内部的组织架构二点一五年加大了调整力度"} -{"key": "BAC009S0912W0336", "wav": "./aishell/wav/test/S0912/BAC009S0912W0336.wav", "txt": "七月成立了美的部品事业部"} -{"key": "BAC009S0912W0337", "wav": "./aishell/wav/test/S0912/BAC009S0912W0337.wav", "txt": "威灵电机将有可能兼并美芝压缩机"} -{"key": "BAC009S0912W0338", "wav": "./aishell/wav/test/S0912/BAC009S0912W0338.wav", "txt": "这一切并不是说说而已"} -{"key": "BAC009S0912W0339", "wav": "./aishell/wav/test/S0912/BAC009S0912W0339.wav", "txt": "而是要明确落实在数字上"} -{"key": "BAC009S0912W0340", "wav": "./aishell/wav/test/S0912/BAC009S0912W0340.wav", "txt": "控制在六零微克立方米左右"} -{"key": "BAC009S0912W0341", "wav": "./aishell/wav/test/S0912/BAC009S0912W0341.wav", "txt": "这与市民的期望和城市发展的愿景也是一致的"} -{"key": "BAC009S0912W0342", "wav": "./aishell/wav/test/S0912/BAC009S0912W0342.wav", "txt": "二零一七年二零二二年"} -{"key": "BAC009S0912W0343", "wav": "./aishell/wav/test/S0912/BAC009S0912W0343.wav", "txt": "我们还将继续加大污染防治力度"} -{"key": "BAC009S0912W0344", "wav": "./aishell/wav/test/S0912/BAC009S0912W0344.wav", "txt": "这一点对于京津冀一带的居民来说是才最重要的"} -{"key": "BAC009S0912W0345", "wav": "./aishell/wav/test/S0912/BAC009S0912W0345.wav", "txt": "因为每个人都需要呼吸"} -{"key": "BAC009S0912W0346", "wav": "./aishell/wav/test/S0912/BAC009S0912W0346.wav", "txt": "场馆建设一简约而不简单"} -{"key": "BAC009S0912W0347", "wav": "./aishell/wav/test/S0912/BAC009S0912W0347.wav", "txt": "二零二二年北京冬奥会计划使用一二个比赛场馆"} -{"key": "BAC009S0912W0348", "wav": 
"./aishell/wav/test/S0912/BAC009S0912W0348.wav", "txt": "总体上以节俭办赛为原则进行规划建设和改造使用"} -{"key": "BAC009S0912W0349", "wav": "./aishell/wav/test/S0912/BAC009S0912W0349.wav", "txt": "充分利用北京奥运后的丰富遗产"} -{"key": "BAC009S0912W0350", "wav": "./aishell/wav/test/S0912/BAC009S0912W0350.wav", "txt": "仅有三个场馆需要新建"} -{"key": "BAC009S0912W0351", "wav": "./aishell/wav/test/S0912/BAC009S0912W0351.wav", "txt": "分别是位于北京市区的国家速滑馆和延庆的二个雪场"} -{"key": "BAC009S0912W0352", "wav": "./aishell/wav/test/S0912/BAC009S0912W0352.wav", "txt": "其馀场馆改建后均可满足赛事需要"} -{"key": "BAC009S0912W0353", "wav": "./aishell/wav/test/S0912/BAC009S0912W0353.wav", "txt": "既免去了不必要的花费"} -{"key": "BAC009S0912W0354", "wav": "./aishell/wav/test/S0912/BAC009S0912W0354.wav", "txt": "每个场馆又高端大气上档次"} -{"key": "BAC009S0912W0355", "wav": "./aishell/wav/test/S0912/BAC009S0912W0355.wav", "txt": "真可谓是简约而不简单啊"} -{"key": "BAC009S0912W0356", "wav": "./aishell/wav/test/S0912/BAC009S0912W0356.wav", "txt": "花样滑冰短道速滑项目在首都体育馆进行"} -{"key": "BAC009S0912W0357", "wav": "./aishell/wav/test/S0912/BAC009S0912W0357.wav", "txt": "冰壶项目在水立方进行"} -{"key": "BAC009S0912W0358", "wav": "./aishell/wav/test/S0912/BAC009S0912W0358.wav", "txt": "计划明年就将开始动工"} -{"key": "BAC009S0912W0359", "wav": "./aishell/wav/test/S0912/BAC009S0912W0359.wav", "txt": "速滑馆建成后将设置四百米滑道"} -{"key": "BAC009S0912W0360", "wav": "./aishell/wav/test/S0912/BAC009S0912W0360.wav", "txt": "设有座位一万两千个"} -{"key": "BAC009S0912W0361", "wav": "./aishell/wav/test/S0912/BAC009S0912W0361.wav", "txt": "在冬奥会举办之前这里将为专业队伍训练提供场地"} -{"key": "BAC009S0912W0362", "wav": "./aishell/wav/test/S0912/BAC009S0912W0362.wav", "txt": "我们的奥运健儿将在此努力备战"} -{"key": "BAC009S0912W0363", "wav": "./aishell/wav/test/S0912/BAC009S0912W0363.wav", "txt": "成为市民体验冰上运动的乐园"} -{"key": "BAC009S0912W0364", "wav": "./aishell/wav/test/S0912/BAC009S0912W0364.wav", "txt": "张家口市的崇礼县从每年十一月初到第二年四月初"} -{"key": "BAC009S0912W0365", "wav": "./aishell/wav/test/S0912/BAC009S0912W0365.wav", "txt": "崇礼县发展较成熟的万龙滑雪场和云顶滑雪场"} -{"key": "BAC009S0912W0366", "wav": "./aishell/wav/test/S0912/BAC009S0912W0366.wav", "txt": "加上一个仍在建的太舞四季滑雪场"} -{"key": "BAC009S0912W0367", "wav": "./aishell/wav/test/S0912/BAC009S0912W0367.wav", "txt": "均已被纳入二零二二年冬奥会的规划场馆"} -{"key": "BAC009S0912W0368", "wav": "./aishell/wav/test/S0912/BAC009S0912W0368.wav", "txt": "万龙和云顶滑雪场都将根据赛事要求进行改造和扩建"} -{"key": "BAC009S0912W0369", "wav": "./aishell/wav/test/S0912/BAC009S0912W0369.wav", "txt": "小海坨山是位于延庆境内的海坨山主峰"} -{"key": "BAC009S0912W0370", "wav": "./aishell/wav/test/S0912/BAC009S0912W0370.wav", "txt": "此地春秋冬三季有雪"} -{"key": "BAC009S0912W0371", "wav": "./aishell/wav/test/S0912/BAC009S0912W0371.wav", "txt": "滑雪期从十一月下旬到次年三月中旬"} -{"key": "BAC009S0912W0372", "wav": "./aishell/wav/test/S0912/BAC009S0912W0372.wav", "txt": "这里常年吸引着众多登山探险运动爱好者"} -{"key": "BAC009S0912W0373", "wav": "./aishell/wav/test/S0912/BAC009S0912W0373.wav", "txt": "拥有高山滑雪要求的八百米落差"} -{"key": "BAC009S0912W0374", "wav": "./aishell/wav/test/S0912/BAC009S0912W0374.wav", "txt": "非常适合修建高山雪场"} -{"key": "BAC009S0912W0375", "wav": "./aishell/wav/test/S0912/BAC009S0912W0375.wav", "txt": "将依托现有山体地形修建临时场地设施"} -{"key": "BAC009S0912W0376", "wav": "./aishell/wav/test/S0912/BAC009S0912W0376.wav", "txt": "用作雪车雪橇大项和滑雪大项中的高山滑雪比赛场地"} -{"key": "BAC009S0912W0377", "wav": "./aishell/wav/test/S0912/BAC009S0912W0377.wav", "txt": "全民冰雪季奥运健儿助力"} -{"key": "BAC009S0912W0378", "wav": "./aishell/wav/test/S0912/BAC009S0912W0378.wav", "txt": "早在申办北京冬奥会的时候"} -{"key": "BAC009S0912W0379", "wav": "./aishell/wav/test/S0912/BAC009S0912W0379.wav", "txt": "很多人都看到了新的奥运商机"} -{"key": "BAC009S0912W0380", 
"wav": "./aishell/wav/test/S0912/BAC009S0912W0380.wav", "txt": "会投资建设一些冰雪主题乐园和冬季项目体验场所"} -{"key": "BAC009S0912W0381", "wav": "./aishell/wav/test/S0912/BAC009S0912W0381.wav", "txt": "在全民健身成为国家战略的大背景下"} -{"key": "BAC009S0912W0382", "wav": "./aishell/wav/test/S0912/BAC009S0912W0382.wav", "txt": "观赛便利不出国门看奥运"} -{"key": "BAC009S0912W0383", "wav": "./aishell/wav/test/S0912/BAC009S0912W0383.wav", "txt": "以往想要见识奥运级别的比赛"} -{"key": "BAC009S0912W0384", "wav": "./aishell/wav/test/S0912/BAC009S0912W0384.wav", "txt": "冰雪爱好者不得不选择出国"} -{"key": "BAC009S0912W0385", "wav": "./aishell/wav/test/S0912/BAC009S0912W0385.wav", "txt": "高昂的交通和住宿成本让很多人望而却步"} -{"key": "BAC009S0912W0386", "wav": "./aishell/wav/test/S0912/BAC009S0912W0386.wav", "txt": "如今在家门口就可以实现这个愿望了"} -{"key": "BAC009S0912W0387", "wav": "./aishell/wav/test/S0912/BAC009S0912W0387.wav", "txt": "交通住宿花费大大降低"} -{"key": "BAC009S0912W0388", "wav": "./aishell/wav/test/S0912/BAC009S0912W0388.wav", "txt": "让我们能够来一次说走就走的冬奥之行"} -{"key": "BAC009S0912W0389", "wav": "./aishell/wav/test/S0912/BAC009S0912W0389.wav", "txt": "在主场为中国健儿加油"} -{"key": "BAC009S0912W0390", "wav": "./aishell/wav/test/S0912/BAC009S0912W0390.wav", "txt": "该是一件多幸福的事啊"} -{"key": "BAC009S0912W0392", "wav": "./aishell/wav/test/S0912/BAC009S0912W0392.wav", "txt": "责任编辑冯浩"} -{"key": "BAC009S0912W0393", "wav": "./aishell/wav/test/S0912/BAC009S0912W0393.wav", "txt": "十月十八日早上九点"} -{"key": "BAC009S0912W0394", "wav": "./aishell/wav/test/S0912/BAC009S0912W0394.wav", "txt": "各地跑步爱好者齐聚一堂"} -{"key": "BAC009S0912W0395", "wav": "./aishell/wav/test/S0912/BAC009S0912W0395.wav", "txt": "共同享受奔跑带来的乐趣"} -{"key": "BAC009S0912W0396", "wav": "./aishell/wav/test/S0912/BAC009S0912W0396.wav", "txt": "经历过北京站和上海站两次比赛"} -{"key": "BAC009S0912W0397", "wav": "./aishell/wav/test/S0912/BAC009S0912W0397.wav", "txt": "本次沈阳站赛场迎来了许多熟悉的面孔"} -{"key": "BAC009S0912W0398", "wav": "./aishell/wav/test/S0912/BAC009S0912W0398.wav", "txt": "尤为引人瞩目的莫过于李子成"} -{"key": "BAC009S0912W0399", "wav": "./aishell/wav/test/S0912/BAC009S0912W0399.wav", "txt": "他更是以三十分十七秒一举夺得奔跑中国三连冠"} -{"key": "BAC009S0912W0400", "wav": "./aishell/wav/test/S0912/BAC009S0912W0400.wav", "txt": "而十公里女子组由刘庆红以三十四分十秒夺得冠军"} -{"key": "BAC009S0912W0401", "wav": "./aishell/wav/test/S0912/BAC009S0912W0401.wav", "txt": "海信一汽大众等知名企业和品牌也依旧亮相赛场"} -{"key": "BAC009S0912W0402", "wav": "./aishell/wav/test/S0912/BAC009S0912W0402.wav", "txt": "以不同方式助力本次比赛胜利进行"} -{"key": "BAC009S0912W0403", "wav": "./aishell/wav/test/S0912/BAC009S0912W0403.wav", "txt": "近四千名跑步爱好者和其家人朋友齐聚于此"} -{"key": "BAC009S0912W0404", "wav": "./aishell/wav/test/S0912/BAC009S0912W0404.wav", "txt": "全球范围内的创收达到十一点八亿美元"} -{"key": "BAC009S0912W0405", "wav": "./aishell/wav/test/S0912/BAC009S0912W0405.wav", "txt": "亚当桑德勒成功卫冕"} -{"key": "BAC009S0912W0406", "wav": "./aishell/wav/test/S0912/BAC009S0912W0406.wav", "txt": "约翰尼德普紧随其后"} -{"key": "BAC009S0912W0407", "wav": "./aishell/wav/test/S0912/BAC009S0912W0407.wav", "txt": "但是出于预算考虑"} -{"key": "BAC009S0912W0408", "wav": "./aishell/wav/test/S0912/BAC009S0912W0408.wav", "txt": "陈奕迅隔空发表爱的宣言也是啊"} -{"key": "BAC009S0912W0409", "wav": "./aishell/wav/test/S0912/BAC009S0912W0409.wav", "txt": "例如出入帮忙开门拉椅子"} -{"key": "BAC009S0912W0410", "wav": "./aishell/wav/test/S0912/BAC009S0912W0410.wav", "txt": "新京报报道思维发散表情与肢体语言丰富"} -{"key": "BAC009S0912W0411", "wav": "./aishell/wav/test/S0912/BAC009S0912W0411.wav", "txt": "对于疯癫陈奕迅所长的这些设定歌迷早已习惯了"} -{"key": "BAC009S0912W0412", "wav": "./aishell/wav/test/S0912/BAC009S0912W0412.wav", "txt": "在凭借专辑米闪成为新一轮金曲歌王后"} -{"key": "BAC009S0912W0414", "wav": 
"./aishell/wav/test/S0912/BAC009S0912W0414.wav", "txt": "朱祖儿操刀灰色调封面"} -{"key": "BAC009S0912W0415", "wav": "./aishell/wav/test/S0912/BAC009S0912W0415.wav", "txt": "袁两半一人歌词包办"} -{"key": "BAC009S0912W0416", "wav": "./aishell/wav/test/S0912/BAC009S0912W0416.wav", "txt": "处于寻找状态中的挣扎"} -{"key": "BAC009S0912W0417", "wav": "./aishell/wav/test/S0912/BAC009S0912W0417.wav", "txt": "然而准备中三个字卸掉了他的纠结"} -{"key": "BAC009S0912W0418", "wav": "./aishell/wav/test/S0912/BAC009S0912W0418.wav", "txt": "二十九日晚间举办媒体听歌会"} -{"key": "BAC009S0912W0419", "wav": "./aishell/wav/test/S0912/BAC009S0912W0419.wav", "txt": "现场试听无条件人生马拉松等六首歌曲"} -{"key": "BAC009S0912W0420", "wav": "./aishell/wav/test/S0912/BAC009S0912W0420.wav", "txt": "终站是好友谢霆锋的创作"} -{"key": "BAC009S0912W0422", "wav": "./aishell/wav/test/S0912/BAC009S0912W0422.wav", "txt": "花了三年时间才得到这首歌"} -{"key": "BAC009S0912W0423", "wav": "./aishell/wav/test/S0912/BAC009S0912W0423.wav", "txt": "被问是否感觉到谢霆锋与王菲恋爱的甜蜜"} -{"key": "BAC009S0912W0428", "wav": "./aishell/wav/test/S0912/BAC009S0912W0428.wav", "txt": "十九点二十六分"} -{"key": "BAC009S0912W0429", "wav": "./aishell/wav/test/S0912/BAC009S0912W0429.wav", "txt": "好友陈妍希晒与潘玮柏搞怪合影为他庆生"} -{"key": "BAC009S0912W0430", "wav": "./aishell/wav/test/S0912/BAC009S0912W0430.wav", "txt": "称潘玮柏生日快乐"} -{"key": "BAC009S0912W0431", "wav": "./aishell/wav/test/S0912/BAC009S0912W0431.wav", "txt": "港富豪被绑涉及两岸三地绑匪要求赎金用比特币"} -{"key": "BAC009S0912W0432", "wav": "./aishell/wav/test/S0912/BAC009S0912W0432.wav", "txt": "日前遭人绑架并勒索七零零零万港元"} -{"key": "BAC009S0912W0433", "wav": "./aishell/wav/test/S0912/BAC009S0912W0433.wav", "txt": "台港警方追查一个月"} -{"key": "BAC009S0912W0434", "wav": "./aishell/wav/test/S0912/BAC009S0912W0434.wav", "txt": "二十七日深夜终于在云林县一家废弃空屋中救出了黄立坤"} -{"key": "BAC009S0912W0435", "wav": "./aishell/wav/test/S0912/BAC009S0912W0435.wav", "txt": "获救第一句话就是我以为我活不了了"} -{"key": "BAC009S0912W0436", "wav": "./aishell/wav/test/S0912/BAC009S0912W0436.wav", "txt": "港报评上海迪尼士不意味着香港迪尼士的没落"} -{"key": "BAC009S0912W0437", "wav": "./aishell/wav/test/S0912/BAC009S0912W0437.wav", "txt": "参考消息网七月二八日报道"} -{"key": "BAC009S0912W0438", "wav": "./aishell/wav/test/S0912/BAC009S0912W0438.wav", "txt": "港报内地医院仍控制处方药销售电商盼网售解禁"} -{"key": "BAC009S0912W0439", "wav": "./aishell/wav/test/S0912/BAC009S0912W0439.wav", "txt": "参考消息网九月一七日报道港媒称"} -{"key": "BAC009S0912W0440", "wav": "./aishell/wav/test/S0912/BAC009S0912W0440.wav", "txt": "自从中国内地的第一家网上药店一零年前开张以来"} -{"key": "BAC009S0912W0441", "wav": "./aishell/wav/test/S0912/BAC009S0912W0441.wav", "txt": "大量资本已投入医药企业中"} -{"key": "BAC009S0912W0442", "wav": "./aishell/wav/test/S0912/BAC009S0912W0442.wav", "txt": "希望能从中国内地日益老龄化的一三亿人口中受益"} -{"key": "BAC009S0912W0443", "wav": "./aishell/wav/test/S0912/BAC009S0912W0443.wav", "txt": "港校两名内地生酒后街头野战当事人被起底"} -{"key": "BAC009S0912W0444", "wav": "./aishell/wav/test/S0912/BAC009S0912W0444.wav", "txt": "南都讯记者王睦广发自香港今年四月初"} -{"key": "BAC009S0912W0445", "wav": "./aishell/wav/test/S0912/BAC009S0912W0445.wav", "txt": "被拍下短片冠以野战之名在网上疯传"} -{"key": "BAC009S0912W0446", "wav": "./aishell/wav/test/S0912/BAC009S0912W0446.wav", "txt": "二人早前被香港警方以有违公德罪落案起诉"} -{"key": "BAC009S0912W0447", "wav": "./aishell/wav/test/S0912/BAC009S0912W0447.wav", "txt": "事件中的女方昨日被判一二个月感化令"} -{"key": "BAC009S0912W0448", "wav": "./aishell/wav/test/S0912/BAC009S0912W0448.wav", "txt": "男方则将于下月庭审"} -{"key": "BAC009S0912W0449", "wav": "./aishell/wav/test/S0912/BAC009S0912W0449.wav", "txt": "港珠澳大桥又起漂移风波可能进一步影响工期"} -{"key": "BAC009S0912W0450", "wav": "./aishell/wav/test/S0912/BAC009S0912W0450.wav", "txt": "其人工岛被指移动六七米"} -{"key": "BAC009S0912W0451", 
"wav": "./aishell/wav/test/S0912/BAC009S0912W0451.wav", "txt": "这个意外可能进一步影响工期"} -{"key": "BAC009S0912W0452", "wav": "./aishell/wav/test/S0912/BAC009S0912W0452.wav", "txt": "游乐场大章鱼甩飞游客母亲落地时紧抱儿子"} -{"key": "BAC009S0912W0453", "wav": "./aishell/wav/test/S0912/BAC009S0912W0453.wav", "txt": "在空中以高速自转带给游客惊险刺激的体验"} -{"key": "BAC009S0912W0454", "wav": "./aishell/wav/test/S0912/BAC009S0912W0454.wav", "txt": "背部撞断了游乐场场边的三根不锈钢护栏"} -{"key": "BAC009S0912W0455", "wav": "./aishell/wav/test/S0912/BAC009S0912W0455.wav", "txt": "游学夏令营的无奈花豪华团价格吃喝难保"} -{"key": "BAC009S0912W0457", "wav": "./aishell/wav/test/S0912/BAC009S0912W0457.wav", "txt": "游客三亚海滩赏月后留二九吨垃圾三百人连夜清理"} -{"key": "BAC009S0912W0458", "wav": "./aishell/wav/test/S0912/BAC009S0912W0458.wav", "txt": "当海滩上如潮的人群散去"} -{"key": "BAC009S0912W0459", "wav": "./aishell/wav/test/S0912/BAC009S0912W0459.wav", "txt": "留下的却是被随手丢弃的垃圾"} -{"key": "BAC009S0912W0460", "wav": "./aishell/wav/test/S0912/BAC009S0912W0460.wav", "txt": "虽然海滩上设置了众多垃圾桶"} -{"key": "BAC009S0912W0461", "wav": "./aishell/wav/test/S0912/BAC009S0912W0461.wav", "txt": "但赏月人群还是乱扔垃圾"} -{"key": "BAC009S0912W0462", "wav": "./aishell/wav/test/S0912/BAC009S0912W0462.wav", "txt": "从二八日凌晨四点半至六点半这整整二个小时里"} -{"key": "BAC009S0912W0463", "wav": "./aishell/wav/test/S0912/BAC009S0912W0463.wav", "txt": "游客三亚游泳致终身残疾向旅行社索赔一九六万"} -{"key": "BAC009S0912W0464", "wav": "./aishell/wav/test/S0912/BAC009S0912W0464.wav", "txt": "成都男子张呈亮化名旅行时到三亚海滩游泳"} -{"key": "BAC009S0912W0465", "wav": "./aishell/wav/test/S0912/BAC009S0912W0465.wav", "txt": "下海后却突然失去意识"} -{"key": "BAC009S0912W0466", "wav": "./aishell/wav/test/S0912/BAC009S0912W0466.wav", "txt": "送医后被查出颈部脊髓损伤"} -{"key": "BAC009S0912W0467", "wav": "./aishell/wav/test/S0912/BAC009S0912W0467.wav", "txt": "张先生在青羊法院提起诉讼"} -{"key": "BAC009S0912W0468", "wav": "./aishell/wav/test/S0912/BAC009S0912W0468.wav", "txt": "此案正在进一步审理之中"} -{"key": "BAC009S0912W0469", "wav": "./aishell/wav/test/S0912/BAC009S0912W0469.wav", "txt": "游客下桥拍照踩死植物水杉栈道仙境拉铁丝网"} -{"key": "BAC009S0912W0470", "wav": "./aishell/wav/test/S0912/BAC009S0912W0470.wav", "txt": "当植物恢复正常生长后铁丝网将拆除"} -{"key": "BAC009S0912W0471", "wav": "./aishell/wav/test/S0912/BAC009S0912W0471.wav", "txt": "游客不满小孩超高补票与景区工作人员群殴"} -{"key": "BAC009S0912W0472", "wav": "./aishell/wav/test/S0912/BAC009S0912W0472.wav", "txt": "一段游客暴打景区员工的视频开始在网上发酵"} -{"key": "BAC009S0912W0473", "wav": "./aishell/wav/test/S0912/BAC009S0912W0473.wav", "txt": "某景点大门处多名游客与身着穿服的工作人员大打出手"} -{"key": "BAC009S0912W0474", "wav": "./aishell/wav/test/S0912/BAC009S0912W0474.wav", "txt": "游客乌鲁木齐吃自助被罚二四零零元工商部门介入"} -{"key": "BAC009S0912W0475", "wav": "./aishell/wav/test/S0912/BAC009S0912W0475.wav", "txt": "剩下了一二零零克食物"} -{"key": "BAC009S0912W0476", "wav": "./aishell/wav/test/S0912/BAC009S0912W0476.wav", "txt": "被餐厅罚款二四零零元"} -{"key": "BAC009S0912W0477", "wav": "./aishell/wav/test/S0912/BAC009S0912W0477.wav", "txt": "餐厅返还了游客的二四零零元"} -{"key": "BAC009S0912W0478", "wav": "./aishell/wav/test/S0912/BAC009S0912W0478.wav", "txt": "物价部门工商部门已介入调查"} -{"key": "BAC009S0912W0479", "wav": "./aishell/wav/test/S0912/BAC009S0912W0479.wav", "txt": "游客偷走雷峰塔砖块想供奉起来做药给老人喝"} -{"key": "BAC009S0912W0480", "wav": "./aishell/wav/test/S0912/BAC009S0912W0480.wav", "txt": "游客入住药店被收二零元马桶使用费消协可举报"} -{"key": "BAC009S0912W0481", "wav": "./aishell/wav/test/S0912/BAC009S0912W0481.wav", "txt": "住酒店还要交二零元马桶费"} -{"key": "BAC009S0912W0482", "wav": "./aishell/wav/test/S0912/BAC009S0912W0482.wav", "txt": "南京市民张女士化姓去无锡旅游时"} -{"key": "BAC009S0912W0483", "wav": "./aishell/wav/test/S0912/BAC009S0912W0483.wav", "txt": 
"通过网站团购了无锡江南丹青度假酒店一间套房"} -{"key": "BAC009S0912W0484", "wav": "./aishell/wav/test/S0912/BAC009S0912W0484.wav", "txt": "退房结账时却被告知扣了二零元马桶使用费"} -{"key": "BAC009S0912W0485", "wav": "./aishell/wav/test/S0912/BAC009S0912W0485.wav", "txt": "这让张女士哭笑不得"} -{"key": "BAC009S0912W0486", "wav": "./aishell/wav/test/S0912/BAC009S0912W0486.wav", "txt": "酒店方承诺退还二零元马桶使用费"} -{"key": "BAC009S0912W0487", "wav": "./aishell/wav/test/S0912/BAC009S0912W0487.wav", "txt": "酒店行为属于乱收费"} -{"key": "BAC009S0912W0488", "wav": "./aishell/wav/test/S0912/BAC009S0912W0488.wav", "txt": "消费者可以直接向物价部门和旅游部门举报"} -{"key": "BAC009S0912W0489", "wav": "./aishell/wav/test/S0912/BAC009S0912W0489.wav", "txt": "现代快报记者赵书伶"} -{"key": "BAC009S0912W0490", "wav": "./aishell/wav/test/S0912/BAC009S0912W0490.wav", "txt": "游客再曝日照点海鲜太少被围殴当地警方证实"} -{"key": "BAC009S0912W0491", "wav": "./aishell/wav/test/S0912/BAC009S0912W0491.wav", "txt": "网友先在微博中陈述了悲惨遭遇"} -{"key": "BAC009S0912W0492", "wav": "./aishell/wav/test/S0912/BAC009S0912W0492.wav", "txt": "据称是当事人之一在派出所通过一个亲戚的微博发的"} -{"key": "BAC009S0912W0493", "wav": "./aishell/wav/test/S0912/BAC009S0912W0493.wav", "txt": "游客北京游两天遭引导消费近二万元"} -{"key": "BAC009S0912W0494", "wav": "./aishell/wav/test/S0912/BAC009S0912W0494.wav", "txt": "京华时报讯记者武红利与家人来京旅游"} -{"key": "BAC009S0912W0495", "wav": "./aishell/wav/test/S0912/BAC009S0912W0495.wav", "txt": "王女士与旅行社签订四天五晚的旅行合同"} -{"key": "BAC009S0913W0121", "wav": "./aishell/wav/test/S0913/BAC009S0913W0121.wav", "txt": "在最希望国家实施的调控政策这一问题上"} -{"key": "BAC009S0913W0122", "wav": "./aishell/wav/test/S0913/BAC009S0913W0122.wav", "txt": "有百分之五的网友选择了提高公积金贷款额度"} -{"key": "BAC009S0913W0123", "wav": "./aishell/wav/test/S0913/BAC009S0913W0123.wav", "txt": "百分之五的网友选择了房贷利率打折优惠"} -{"key": "BAC009S0913W0124", "wav": "./aishell/wav/test/S0913/BAC009S0913W0124.wav", "txt": "百分之五的网友倾向于房产税的开征"} -{"key": "BAC009S0913W0125", "wav": "./aishell/wav/test/S0913/BAC009S0913W0125.wav", "txt": "百分之五的网友希望放开一线城市的限购政策"} -{"key": "BAC009S0913W0126", "wav": "./aishell/wav/test/S0913/BAC009S0913W0126.wav", "txt": "网友的置业目的为首套房自住的占到了百分之六十"} -{"key": "BAC009S0913W0127", "wav": "./aishell/wav/test/S0913/BAC009S0913W0127.wav", "txt": "改善型二套房比例比约为百分之五"} -{"key": "BAC009S0913W0128", "wav": "./aishell/wav/test/S0913/BAC009S0913W0128.wav", "txt": "三套以上投资性购房占百分之五"} -{"key": "BAC009S0913W0129", "wav": "./aishell/wav/test/S0913/BAC009S0913W0129.wav", "txt": "其他目的的占比为百分之五"} -{"key": "BAC009S0913W0130", "wav": "./aishell/wav/test/S0913/BAC009S0913W0130.wav", "txt": "在商品房性价比的选择上"} -{"key": "BAC009S0913W0131", "wav": "./aishell/wav/test/S0913/BAC009S0913W0131.wav", "txt": "万科以百分之五的票数获选性价比最高的房企"} -{"key": "BAC009S0913W0132", "wav": "./aishell/wav/test/S0913/BAC009S0913W0132.wav", "txt": "绿地保利万达分列性价比最高房企的二三四名"} -{"key": "BAC009S0913W0133", "wav": "./aishell/wav/test/S0913/BAC009S0913W0133.wav", "txt": "选择恒大世茂富力的网友均不足百分之十"} -{"key": "BAC009S0913W0134", "wav": "./aishell/wav/test/S0913/BAC009S0913W0134.wav", "txt": "有百分之五的网友选择了其他房企"} -{"key": "BAC009S0913W0135", "wav": "./aishell/wav/test/S0913/BAC009S0913W0135.wav", "txt": "二初楼市迎来多项利好政策"} -{"key": "BAC009S0913W0136", "wav": "./aishell/wav/test/S0913/BAC009S0913W0136.wav", "txt": "在多项政策的支持下"} -{"key": "BAC009S0913W0137", "wav": "./aishell/wav/test/S0913/BAC009S0913W0137.wav", "txt": "支持新产业新业态"} -{"key": "BAC009S0913W0138", "wav": "./aishell/wav/test/S0913/BAC009S0913W0138.wav", "txt": "集中释放用地政策红利"} -{"key": "BAC009S0913W0139", "wav": "./aishell/wav/test/S0913/BAC009S0913W0139.wav", "txt": "在加大新供用地保障力度方面"} -{"key": "BAC009S0913W0140", "wav": 
"./aishell/wav/test/S0913/BAC009S0913W0140.wav", "txt": "新产业发展快地用地集约求且需求大的地区"} -{"key": "BAC009S0913W0141", "wav": "./aishell/wav/test/S0913/BAC009S0913W0141.wav", "txt": "在鼓励盘活利用现有用地方面"} -{"key": "BAC009S0913W0142", "wav": "./aishell/wav/test/S0913/BAC009S0913W0142.wav", "txt": "意见提出对制造业迈向中高端的企业用地"} -{"key": "BAC009S0913W0143", "wav": "./aishell/wav/test/S0913/BAC009S0913W0143.wav", "txt": "生产性科技及高技术服务业发展用地"} -{"key": "BAC009S0913W0144", "wav": "./aishell/wav/test/S0913/BAC009S0913W0144.wav", "txt": "建设创业创新平台用地"} -{"key": "BAC009S0913W0145", "wav": "./aishell/wav/test/S0913/BAC009S0913W0145.wav", "txt": "互联网行动计划实实施用地实行过渡期政策"} -{"key": "BAC009S0913W0146", "wav": "./aishell/wav/test/S0913/BAC009S0913W0146.wav", "txt": "按新用途新权利类型市场价办理用地手续"} -{"key": "BAC009S0913W0147", "wav": "./aishell/wav/test/S0913/BAC009S0913W0147.wav", "txt": "支持新产业新业态"} -{"key": "BAC009S0913W0148", "wav": "./aishell/wav/test/S0913/BAC009S0913W0148.wav", "txt": "由国土资源部联合国家"} -{"key": "BAC009S0913W0149", "wav": "./aishell/wav/test/S0913/BAC009S0913W0149.wav", "txt": "正式放松外贸外资投资我国房地产相关规定"} -{"key": "BAC009S0913W0150", "wav": "./aishell/wav/test/S0913/BAC009S0913W0150.wav", "txt": "我国对房地产的行政干预政策陆续退出"} -{"key": "BAC009S0913W0151", "wav": "./aishell/wav/test/S0913/BAC009S0913W0151.wav", "txt": "放松限外是必然趋势"} -{"key": "BAC009S0913W0152", "wav": "./aishell/wav/test/S0913/BAC009S0913W0152.wav", "txt": "此举将有利于市场信心的培养"} -{"key": "BAC009S0913W0153", "wav": "./aishell/wav/test/S0913/BAC009S0913W0153.wav", "txt": "并利好一二线城市的中高端住宅"} -{"key": "BAC009S0913W0154", "wav": "./aishell/wav/test/S0913/BAC009S0913W0154.wav", "txt": "对于外商投资房地产企业注册资本与投资总额比例"} -{"key": "BAC009S0913W0155", "wav": "./aishell/wav/test/S0913/BAC009S0913W0155.wav", "txt": "对于实施住房限购政策的城市"} -{"key": "BAC009S0913W0156", "wav": "./aishell/wav/test/S0913/BAC009S0913W0156.wav", "txt": "境外个人购房应当符合当地政策规定"} -{"key": "BAC009S0913W0157", "wav": "./aishell/wav/test/S0913/BAC009S0913W0157.wav", "txt": "上海易居研究所副院长杨红旭表示"} -{"key": "BAC009S0913W0158", "wav": "./aishell/wav/test/S0913/BAC009S0913W0158.wav", "txt": "外资管制放松是大势所趋"} -{"key": "BAC009S0913W0159", "wav": "./aishell/wav/test/S0913/BAC009S0913W0159.wav", "txt": "随着我国行政干预政策的陆续退出"} -{"key": "BAC009S0913W0160", "wav": "./aishell/wav/test/S0913/BAC009S0913W0160.wav", "txt": "此前为限制外资炒房"} -{"key": "BAC009S0913W0161", "wav": "./aishell/wav/test/S0913/BAC009S0913W0161.wav", "txt": "我国出台了一系列限外令"} -{"key": "BAC009S0913W0162", "wav": "./aishell/wav/test/S0913/BAC009S0913W0162.wav", "txt": "二的向境外投资方出售国内资产征税规定"} -{"key": "BAC009S0913W0163", "wav": "./aishell/wav/test/S0913/BAC009S0913W0163.wav", "txt": "国家外汇局出台过规定"} -{"key": "BAC009S0913W0164", "wav": "./aishell/wav/test/S0913/BAC009S0913W0164.wav", "txt": "国家发改委也发出过通知"} -{"key": "BAC009S0913W0165", "wav": "./aishell/wav/test/S0913/BAC009S0913W0165.wav", "txt": "对于提供给外籍人士的个人住房按揭贷款的外债需求"} -{"key": "BAC009S0913W0166", "wav": "./aishell/wav/test/S0913/BAC009S0913W0166.wav", "txt": "不予安排中长期外债额度"} -{"key": "BAC009S0913W0167", "wav": "./aishell/wav/test/S0913/BAC009S0913W0167.wav", "txt": "房地产被视作保值升值的投资标的被炒作"} -{"key": "BAC009S0913W0168", "wav": "./aishell/wav/test/S0913/BAC009S0913W0168.wav", "txt": "但目前的形势早已改变"} -{"key": "BAC009S0913W0169", "wav": "./aishell/wav/test/S0913/BAC009S0913W0169.wav", "txt": "此前外资购房主要集中在一线城市和几个热点二线城市"} -{"key": "BAC009S0913W0170", "wav": "./aishell/wav/test/S0913/BAC009S0913W0170.wav", "txt": "而现在这类城市房价已经很高"} -{"key": "BAC009S0913W0171", "wav": "./aishell/wav/test/S0913/BAC009S0913W0171.wav", "txt": "即使限外令放开也不会出现外资大规模买房"} -{"key": "BAC009S0913W0172", "wav": 
"./aishell/wav/test/S0913/BAC009S0913W0172.wav", "txt": "中原地产市场总监张大伟认为"} -{"key": "BAC009S0913W0173", "wav": "./aishell/wav/test/S0913/BAC009S0913W0173.wav", "txt": "放松对外限制利用利好一二线城市中高端物业"} -{"key": "BAC009S0913W0174", "wav": "./aishell/wav/test/S0913/BAC009S0913W0174.wav", "txt": "对于外商房企的注册资本金降低要求"} -{"key": "BAC009S0913W0175", "wav": "./aishell/wav/test/S0913/BAC009S0913W0175.wav", "txt": "也有利于部分企业的资金周转"} -{"key": "BAC009S0913W0176", "wav": "./aishell/wav/test/S0913/BAC009S0913W0176.wav", "txt": "正式放松外资投资我国房地产相关规定"} -{"key": "BAC009S0913W0177", "wav": "./aishell/wav/test/S0913/BAC009S0913W0177.wav", "txt": "允许境外机构在境内设"} -{"key": "BAC009S0913W0178", "wav": "./aishell/wav/test/S0913/BAC009S0913W0178.wav", "txt": "六部委松绑楼市限外令"} -{"key": "BAC009S0913W0179", "wav": "./aishell/wav/test/S0913/BAC009S0913W0179.wav", "txt": "外资在华房地产投资购房限制被松绑"} -{"key": "BAC009S0913W0180", "wav": "./aishell/wav/test/S0913/BAC009S0913W0180.wav", "txt": "允许机构和个人在中国购房"} -{"key": "BAC009S0913W0181", "wav": "./aishell/wav/test/S0913/BAC009S0913W0181.wav", "txt": "中房指数研究所院长陈晟表示"} -{"key": "BAC009S0913W0182", "wav": "./aishell/wav/test/S0913/BAC009S0913W0182.wav", "txt": "此举对促进外企在华投资房地产有积极作用"} -{"key": "BAC009S0913W0183", "wav": "./aishell/wav/test/S0913/BAC009S0913W0183.wav", "txt": "相关公司股票走势鄂尔多斯"} -{"key": "BAC009S0913W0184", "wav": "./aishell/wav/test/S0913/BAC009S0913W0184.wav", "txt": "内地产投资比例有限"} -{"key": "BAC009S0913W0185", "wav": "./aishell/wav/test/S0913/BAC009S0913W0185.wav", "txt": "此项政策对中国楼市影响有限"} -{"key": "BAC009S0913W0186", "wav": "./aishell/wav/test/S0913/BAC009S0913W0186.wav", "txt": "对于实施住房限购政策的城市"} -{"key": "BAC009S0913W0187", "wav": "./aishell/wav/test/S0913/BAC009S0913W0187.wav", "txt": "为环境保护提供更加完备有效的法制保障"} -{"key": "BAC009S0913W0188", "wav": "./aishell/wav/test/S0913/BAC009S0913W0188.wav", "txt": "进一步完善环境政策"} -{"key": "BAC009S0913W0189", "wav": "./aishell/wav/test/S0913/BAC009S0913W0189.wav", "txt": "健全环境执法调协调机制"} -{"key": "BAC009S0913W0190", "wav": "./aishell/wav/test/S0913/BAC009S0913W0190.wav", "txt": "国务院国资委力挺国企"} -{"key": "BAC009S0913W0191", "wav": "./aishell/wav/test/S0913/BAC009S0913W0191.wav", "txt": "具备条件的要积极引进战略投资者"} -{"key": "BAC009S0913W0192", "wav": "./aishell/wav/test/S0913/BAC009S0913W0192.wav", "txt": "推进主营业务整体上市"} -{"key": "BAC009S0913W0193", "wav": "./aishell/wav/test/S0913/BAC009S0913W0193.wav", "txt": "国资委接二连三对此表态或意味着国企将迎来上市高峰"} -{"key": "BAC009S0913W0194", "wav": "./aishell/wav/test/S0913/BAC009S0913W0194.wav", "txt": "使国有资本更多地向重要行业和关键领域集中"} -{"key": "BAC009S0913W0195", "wav": "./aishell/wav/test/S0913/BAC009S0913W0195.wav", "txt": "向具有优势的行业集中"} -{"key": "BAC009S0913W0196", "wav": "./aishell/wav/test/S0913/BAC009S0913W0196.wav", "txt": "向大企业大集团集中"} -{"key": "BAC009S0913W0197", "wav": "./aishell/wav/test/S0913/BAC009S0913W0197.wav", "txt": "要吸收民间资本参与国有企业改制重组"} -{"key": "BAC009S0913W0198", "wav": "./aishell/wav/test/S0913/BAC009S0913W0198.wav", "txt": "发展混合所有制经济"} -{"key": "BAC009S0913W0199", "wav": "./aishell/wav/test/S0913/BAC009S0913W0199.wav", "txt": "发挥国有大企业引领带动作用"} -{"key": "BAC009S0913W0200", "wav": "./aishell/wav/test/S0913/BAC009S0913W0200.wav", "txt": "促进各种所有制企业共同发展"} -{"key": "BAC009S0913W0201", "wav": "./aishell/wav/test/S0913/BAC009S0913W0201.wav", "txt": "至二零一一年六月底"} -{"key": "BAC009S0913W0202", "wav": "./aishell/wav/test/S0913/BAC009S0913W0202.wav", "txt": "中央企业控股境外上市公司"} -{"key": "BAC009S0913W0203", "wav": "./aishell/wav/test/S0913/BAC009S0913W0203.wav", "txt": "国资委还将支持企业走出去"} -{"key": "BAC009S0913W0204", "wav": "./aishell/wav/test/S0913/BAC009S0913W0204.wav", "txt": 
"逐步实现战略运营管管理全球化"} -{"key": "BAC009S0913W0205", "wav": "./aishell/wav/test/S0913/BAC009S0913W0205.wav", "txt": "应当经国务院国资委核准"} -{"key": "BAC009S0913W0206", "wav": "./aishell/wav/test/S0913/BAC009S0913W0206.wav", "txt": "办法五月一日起实施"} -{"key": "BAC009S0913W0207", "wav": "./aishell/wav/test/S0913/BAC009S0913W0207.wav", "txt": "国务院国资委对央企境外投资的管理法规在逐渐完善"} -{"key": "BAC009S0913W0208", "wav": "./aishell/wav/test/S0913/BAC009S0913W0208.wav", "txt": "中央企业在境外从事非主业投资"} -{"key": "BAC009S0913W0209", "wav": "./aishell/wav/test/S0913/BAC009S0913W0209.wav", "txt": "需要向国务院国资委报送申请核准非主业投资的请示"} -{"key": "BAC009S0913W0210", "wav": "./aishell/wav/test/S0913/BAC009S0913W0210.wav", "txt": "对非主业投资项目的有关决策文件"} -{"key": "BAC009S0913W0211", "wav": "./aishell/wav/test/S0913/BAC009S0913W0211.wav", "txt": "项目可行性研究报告尽职调查等相关文件"} -{"key": "BAC009S0913W0212", "wav": "./aishell/wav/test/S0913/BAC009S0913W0212.wav", "txt": "办法还特别提出一些建议"} -{"key": "BAC009S0913W0213", "wav": "./aishell/wav/test/S0913/BAC009S0913W0213.wav", "txt": "国务院国资委将指导中央企业之间加强境外投资合作"} -{"key": "BAC009S0913W0214", "wav": "./aishell/wav/test/S0913/BAC009S0913W0214.wav", "txt": "中央走出去的步伐正趋加快"} -{"key": "BAC009S0913W0215", "wav": "./aishell/wav/test/S0913/BAC009S0913W0215.wav", "txt": "央企在境外含港澳地区营收"} -{"key": "BAC009S0913W0216", "wav": "./aishell/wav/test/S0913/BAC009S0913W0216.wav", "txt": "利润总额较大"} -{"key": "BAC009S0913W0217", "wav": "./aishell/wav/test/S0913/BAC009S0913W0217.wav", "txt": "同比较上年同期分别增长百分之三十和百分之二十八"} -{"key": "BAC009S0913W0218", "wav": "./aishell/wav/test/S0913/BAC009S0913W0218.wav", "txt": "涨幅远超央企整体水平"} -{"key": "BAC009S0913W0219", "wav": "./aishell/wav/test/S0913/BAC009S0913W0219.wav", "txt": "一方面很多央企已制定了海外战略"} -{"key": "BAC009S0913W0220", "wav": "./aishell/wav/test/S0913/BAC009S0913W0220.wav", "txt": "一些国家经济出现大的波动"} -{"key": "BAC009S0913W0221", "wav": "./aishell/wav/test/S0913/BAC009S0913W0221.wav", "txt": "而社会罢工劳资纠纷也时有发生"} -{"key": "BAC009S0913W0222", "wav": "./aishell/wav/test/S0913/BAC009S0913W0222.wav", "txt": "央企在境外投资面临的问题很多"} -{"key": "BAC009S0913W0223", "wav": "./aishell/wav/test/S0913/BAC009S0913W0223.wav", "txt": "目前央企境外投资仍处在初级阶段"} -{"key": "BAC009S0913W0224", "wav": "./aishell/wav/test/S0913/BAC009S0913W0224.wav", "txt": "制定和发布办法"} -{"key": "BAC009S0913W0225", "wav": "./aishell/wav/test/S0913/BAC009S0913W0225.wav", "txt": "是为了进一步建立健全境外国有资产管理制度"} -{"key": "BAC009S0913W0226", "wav": "./aishell/wav/test/S0913/BAC009S0913W0226.wav", "txt": "切实加强央企境外投资监管"} -{"key": "BAC009S0913W0227", "wav": "./aishell/wav/test/S0913/BAC009S0913W0227.wav", "txt": "确保境外国有资产保值增值"} -{"key": "BAC009S0913W0228", "wav": "./aishell/wav/test/S0913/BAC009S0913W0228.wav", "txt": "更好地适应了新形势的需要"} -{"key": "BAC009S0913W0229", "wav": "./aishell/wav/test/S0913/BAC009S0913W0229.wav", "txt": "国务院多举措力挺农产品流通"} -{"key": "BAC009S0913W0230", "wav": "./aishell/wav/test/S0913/BAC009S0913W0230.wav", "txt": "免征蔬菜流通环节的增值税"} -{"key": "BAC009S0913W0231", "wav": "./aishell/wav/test/S0913/BAC009S0913W0231.wav", "txt": "提出完善农产品流通税收政策"} -{"key": "BAC009S0913W0232", "wav": "./aishell/wav/test/S0913/BAC009S0913W0232.wav", "txt": "免征蔬菜流通环节增值税加强金融支持"} -{"key": "BAC009S0913W0233", "wav": "./aishell/wav/test/S0913/BAC009S0913W0233.wav", "txt": "相关公司股票走势农产品"} -{"key": "BAC009S0913W0234", "wav": "./aishell/wav/test/S0913/BAC009S0913W0234.wav", "txt": "各地要鼓励流通企业跨地区兼并重组和投资合作"} -{"key": "BAC009S0913W0235", "wav": "./aishell/wav/test/S0913/BAC009S0913W0235.wav", "txt": "以加强产销衔接为重点"} -{"key": "BAC009S0913W0236", "wav": "./aishell/wav/test/S0913/BAC009S0913W0236.wav", "txt": "加强鲜活农产品流通基础设施建设"} -{"key": 
"BAC009S0913W0237", "wav": "./aishell/wav/test/S0913/BAC009S0913W0237.wav", "txt": "创新鲜活农产品的流通模式"} -{"key": "BAC009S0913W0238", "wav": "./aishell/wav/test/S0913/BAC009S0913W0238.wav", "txt": "提高流通组织化程度"} -{"key": "BAC009S0913W0239", "wav": "./aishell/wav/test/S0913/BAC009S0913W0239.wav", "txt": "完善流通链条和市场布局"} -{"key": "BAC009S0913W0240", "wav": "./aishell/wav/test/S0913/BAC009S0913W0240.wav", "txt": "进一步减少流通环节"} -{"key": "BAC009S0913W0241", "wav": "./aishell/wav/test/S0913/BAC009S0913W0241.wav", "txt": "保障鲜活农产品市场供应和价格稳定"} -{"key": "BAC009S0913W0242", "wav": "./aishell/wav/test/S0913/BAC009S0913W0242.wav", "txt": "各地要依据城市总体规划和城市网点商业规划"} -{"key": "BAC009S0913W0243", "wav": "./aishell/wav/test/S0913/BAC009S0913W0243.wav", "txt": "鼓励流通企业跨地区兼并重组和投资合作"} -{"key": "BAC009S0913W0244", "wav": "./aishell/wav/test/S0913/BAC009S0913W0244.wav", "txt": "要大力推进产销衔接"} -{"key": "BAC009S0913W0245", "wav": "./aishell/wav/test/S0913/BAC009S0913W0245.wav", "txt": "完善市场监测预警和信息发布机制"} -{"key": "BAC009S0913W0246", "wav": "./aishell/wav/test/S0913/BAC009S0913W0246.wav", "txt": "建立健全重要农产品储备制度"} -{"key": "BAC009S0913W0247", "wav": "./aishell/wav/test/S0913/BAC009S0913W0247.wav", "txt": "完善农产品跨区调运调剂机制"} -{"key": "BAC009S0913W0248", "wav": "./aishell/wav/test/S0913/BAC009S0913W0248.wav", "txt": "各城市要根据消费需求和季节变化"} -{"key": "BAC009S0913W0249", "wav": "./aishell/wav/test/S0913/BAC009S0913W0249.wav", "txt": "合理确定耐贮蔬菜的流通动态库存数量"} -{"key": "BAC009S0913W0250", "wav": "./aishell/wav/test/S0913/BAC009S0913W0250.wav", "txt": "加快鲜活农产品质量安全追溯体系建设"} -{"key": "BAC009S0913W0251", "wav": "./aishell/wav/test/S0913/BAC009S0913W0251.wav", "txt": "通过投资入股产权回购回租建公建配套等方式"} -{"key": "BAC009S0913W0252", "wav": "./aishell/wav/test/S0913/BAC009S0913W0252.wav", "txt": "发挥财政资金引导示范作用"} -{"key": "BAC009S0913W0254", "wav": "./aishell/wav/test/S0913/BAC009S0913W0254.wav", "txt": "这样可以加深苹果和消费者之间的关系"} -{"key": "BAC009S0913W0255", "wav": "./aishell/wav/test/S0913/BAC009S0913W0255.wav", "txt": "对未来的销量至关重要"} -{"key": "BAC009S0913W0258", "wav": "./aishell/wav/test/S0913/BAC009S0913W0258.wav", "txt": "为提高苹果零售商店的服务质量"} -{"key": "BAC009S0913W0259", "wav": "./aishell/wav/test/S0913/BAC009S0913W0259.wav", "txt": "该系统包含一套算法"} -{"key": "BAC009S0913W0260", "wav": "./aishell/wav/test/S0913/BAC009S0913W0260.wav", "txt": "有媒体援引知情人士消息称"} -{"key": "BAC009S0913W0261", "wav": "./aishell/wav/test/S0913/BAC009S0913W0261.wav", "txt": "苹果将引入这样一套顾客接待系统"} -{"key": "BAC009S0913W0262", "wav": "./aishell/wav/test/S0913/BAC009S0913W0262.wav", "txt": "前往苹果零售店的顾客将比餐厅订餐叫号还方便"} -{"key": "BAC009S0913W0263", "wav": "./aishell/wav/test/S0913/BAC009S0913W0263.wav", "txt": "苹果零售店实行先到先服务的原则"} -{"key": "BAC009S0913W0264", "wav": "./aishell/wav/test/S0913/BAC009S0913W0264.wav", "txt": "这样难免会出现某个客户的维修问题特别复杂"} -{"key": "BAC009S0913W0265", "wav": "./aishell/wav/test/S0913/BAC009S0913W0265.wav", "txt": "导致技术支持时间超过了预期分配时间"} -{"key": "BAC009S0913W0266", "wav": "./aishell/wav/test/S0913/BAC009S0913W0266.wav", "txt": "从而影响接下来的客户无法在指定时间点获得服务"} -{"key": "BAC009S0913W0267", "wav": "./aishell/wav/test/S0913/BAC009S0913W0267.wav", "txt": "新系统可根据难易程度进行排序"} -{"key": "BAC009S0913W0268", "wav": "./aishell/wav/test/S0913/BAC009S0913W0268.wav", "txt": "与现在的接待原则不同的是"} -{"key": "BAC009S0913W0269", "wav": "./aishell/wav/test/S0913/BAC009S0913W0269.wav", "txt": "此时客户可以选择离开苹果零售店"} -{"key": "BAC009S0913W0270", "wav": "./aishell/wav/test/S0913/BAC009S0913W0270.wav", "txt": "而当预订时间接近时"} -{"key": "BAC009S0913W0271", "wav": "./aishell/wav/test/S0913/BAC009S0913W0271.wav", "txt": "客户会再次收到短信提醒"} -{"key": "BAC009S0913W0272", 
"wav": "./aishell/wav/test/S0913/BAC009S0913W0272.wav", "txt": "客户回到苹果零售店后"} -{"key": "BAC009S0913W0274", "wav": "./aishell/wav/test/S0913/BAC009S0913W0274.wav", "txt": "以告知客户相关技术人员确切的空闲时间"} -{"key": "BAC009S0913W0275", "wav": "./aishell/wav/test/S0913/BAC009S0913W0275.wav", "txt": "以及在店内的具体位置"} -{"key": "BAC009S0913W0277", "wav": "./aishell/wav/test/S0913/BAC009S0913W0277.wav", "txt": "为提高苹果零售商店的服务质量"} -{"key": "BAC009S0913W0278", "wav": "./aishell/wav/test/S0913/BAC009S0913W0278.wav", "txt": "苹果靠什么颠复移动支付市场"} -{"key": "BAC009S0913W0279", "wav": "./aishell/wav/test/S0913/BAC009S0913W0279.wav", "txt": "苹果一口气召开了两次新品发布会"} -{"key": "BAC009S0913W0280", "wav": "./aishell/wav/test/S0913/BAC009S0913W0280.wav", "txt": "就在会场的凳子和垃圾尚未收拾干净的时候"} -{"key": "BAC009S0913W0281", "wav": "./aishell/wav/test/S0913/BAC009S0913W0281.wav", "txt": "全世界的报道已经蜂拥而至"} -{"key": "BAC009S0913W0282", "wav": "./aishell/wav/test/S0913/BAC009S0913W0282.wav", "txt": "失望中夹杂着嘲讽的情绪霸占了各模块的头条"} -{"key": "BAC009S0913W0283", "wav": "./aishell/wav/test/S0913/BAC009S0913W0283.wav", "txt": "科技经济社会金融全都是苹果的消息"} -{"key": "BAC009S0913W0284", "wav": "./aishell/wav/test/S0913/BAC009S0913W0284.wav", "txt": "害得汪峰也不敢随便表白了"} -{"key": "BAC009S0913W0285", "wav": "./aishell/wav/test/S0913/BAC009S0913W0285.wav", "txt": "而是统一地认为苹果开了有史以来最烂的发布会"} -{"key": "BAC009S0913W0286", "wav": "./aishell/wav/test/S0913/BAC009S0913W0286.wav", "txt": "他们推出的产品不仅非常鸡肋"} -{"key": "BAC009S0913W0288", "wav": "./aishell/wav/test/S0913/BAC009S0913W0288.wav", "txt": "就足以让专家们恶心七七四十九天了"} -{"key": "BAC009S0913W0289", "wav": "./aishell/wav/test/S0913/BAC009S0913W0289.wav", "txt": "但这些口水式的讨伐并没有影响苹果前进的脚步"} -{"key": "BAC009S0913W0291", "wav": "./aishell/wav/test/S0913/BAC009S0913W0291.wav", "txt": "证明了其向主流妥协的姿态"} -{"key": "BAC009S0913W0293", "wav": "./aishell/wav/test/S0913/BAC009S0913W0293.wav", "txt": "自二零零七年乔布斯重新发明手机开始"} -{"key": "BAC009S0913W0294", "wav": "./aishell/wav/test/S0913/BAC009S0913W0294.wav", "txt": "把它升级成为一款综合性智能终端之后"} -{"key": "BAC009S0913W0295", "wav": "./aishell/wav/test/S0913/BAC009S0913W0295.wav", "txt": "就开始潜移默化地渗透人类的生活"} -{"key": "BAC009S0913W0296", "wav": "./aishell/wav/test/S0913/BAC009S0913W0296.wav", "txt": "这种渗透犹如蜘蛛结网细菌繁殖病毒传播"} -{"key": "BAC009S0913W0297", "wav": "./aishell/wav/test/S0913/BAC009S0913W0297.wav", "txt": "悄无声息又经年累月"} -{"key": "BAC009S0913W0298", "wav": "./aishell/wav/test/S0913/BAC009S0913W0298.wav", "txt": "我们甚至都没有来得及反抗就被完全征服了"} -{"key": "BAC009S0913W0299", "wav": "./aishell/wav/test/S0913/BAC009S0913W0299.wav", "txt": "我根本无法想象每天在朋友圈上花两个小时的情景"} -{"key": "BAC009S0913W0300", "wav": "./aishell/wav/test/S0913/BAC009S0913W0300.wav", "txt": "但现在已经成为了习生活习惯"} -{"key": "BAC009S0913W0301", "wav": "./aishell/wav/test/S0913/BAC009S0913W0301.wav", "txt": "但新贵移动支付具绝对能更深层次地改变用户的生活"} -{"key": "BAC009S0913W0302", "wav": "./aishell/wav/test/S0913/BAC009S0913W0302.wav", "txt": "乃至颠复现有的经济形态和支付格局"} -{"key": "BAC009S0913W0303", "wav": "./aishell/wav/test/S0913/BAC009S0913W0303.wav", "txt": "第一财经日报记者七月十三日从美的内部获悉"} -{"key": "BAC009S0913W0304", "wav": "./aishell/wav/test/S0913/BAC009S0913W0304.wav", "txt": "已获任美的部品事业部的总裁"} -{"key": "BAC009S0913W0305", "wav": "./aishell/wav/test/S0913/BAC009S0913W0305.wav", "txt": "而威灵电器七月九日下午也公告透露"} -{"key": "BAC009S0913W0306", "wav": "./aishell/wav/test/S0913/BAC009S0913W0306.wav", "txt": "于一九九一年加盟美的集团"} -{"key": "BAC009S0913W0307", "wav": "./aishell/wav/test/S0913/BAC009S0913W0307.wav", "txt": "美芝压缩机已是全球最大空调压缩机企业"} -{"key": "BAC009S0913W0308", "wav": "./aishell/wav/test/S0913/BAC009S0913W0308.wav", "txt": "占全球空调压缩机市场三分之一的份额"} 
-{"key": "BAC009S0913W0309", "wav": "./aishell/wav/test/S0913/BAC009S0913W0309.wav", "txt": "美的将美芝压缩机威灵电机合并"} -{"key": "BAC009S0913W0310", "wav": "./aishell/wav/test/S0913/BAC009S0913W0310.wav", "txt": "将有助于两大部品业务的研发资源销售渠道共享"} -{"key": "BAC009S0913W0311", "wav": "./aishell/wav/test/S0913/BAC009S0913W0311.wav", "txt": "美的部品事业部建立后"} -{"key": "BAC009S0913W0312", "wav": "./aishell/wav/test/S0913/BAC009S0913W0312.wav", "txt": "将成立压缩机开发研究院和微电机开发研究院"} -{"key": "BAC009S0913W0313", "wav": "./aishell/wav/test/S0913/BAC009S0913W0313.wav", "txt": "以区域为中心建立客户经理负责制制造方面"} -{"key": "BAC009S0913W0314", "wav": "./aishell/wav/test/S0913/BAC009S0913W0314.wav", "txt": "负责统一管理原电机事业部的各工厂制造系统"} -{"key": "BAC009S0913W0315", "wav": "./aishell/wav/test/S0913/BAC009S0913W0315.wav", "txt": "原压缩机事业部各工厂保持不变"} -{"key": "BAC009S0913W0316", "wav": "./aishell/wav/test/S0913/BAC009S0913W0316.wav", "txt": "美的集团公关部的相关人士告诉第一财经日报记者"} -{"key": "BAC009S0913W0317", "wav": "./aishell/wav/test/S0913/BAC009S0913W0317.wav", "txt": "目前美芝与威灵的合并"} -{"key": "BAC009S0913W0318", "wav": "./aishell/wav/test/S0913/BAC009S0913W0318.wav", "txt": "仅处于美的集团内部管理架构调整的阶段"} -{"key": "BAC009S0913W0319", "wav": "./aishell/wav/test/S0913/BAC009S0913W0319.wav", "txt": "还没体现在香港上市公司威灵电器的业务层面"} -{"key": "BAC009S0913W0320", "wav": "./aishell/wav/test/S0913/BAC009S0913W0320.wav", "txt": "由于向为民已获任威灵电机的董事会主席"} -{"key": "BAC009S0913W0321", "wav": "./aishell/wav/test/S0913/BAC009S0913W0321.wav", "txt": "威灵电机今后兼并美芝压缩机"} -{"key": "BAC009S0913W0322", "wav": "./aishell/wav/test/S0913/BAC009S0913W0322.wav", "txt": "美芝压缩机是隶属于美的集团旗下的业务"} -{"key": "BAC009S0913W0323", "wav": "./aishell/wav/test/S0913/BAC009S0913W0323.wav", "txt": "由于美的集团本身就是威灵电机的大股东"} -{"key": "BAC009S0913W0324", "wav": "./aishell/wav/test/S0913/BAC009S0913W0324.wav", "txt": "即使今后美芝压缩机被威灵电器兼并"} -{"key": "BAC009S0913W0325", "wav": "./aishell/wav/test/S0913/BAC009S0913W0325.wav", "txt": "也对美的集团的总体业绩影响不大"} -{"key": "BAC009S0913W0326", "wav": "./aishell/wav/test/S0913/BAC009S0913W0326.wav", "txt": "而威灵电器二零一四年的营业额约九十二点七三亿港元"} -{"key": "BAC009S0913W0327", "wav": "./aishell/wav/test/S0913/BAC009S0913W0327.wav", "txt": "同比增长百分之四净利润六点七八亿港元"} -{"key": "BAC009S0913W0328", "wav": "./aishell/wav/test/S0913/BAC009S0913W0328.wav", "txt": "同比下跌十三六点百分之六"} -{"key": "BAC009S0913W0329", "wav": "./aishell/wav/test/S0913/BAC009S0913W0329.wav", "txt": "如果威灵电器兼并美芝压缩机"} -{"key": "BAC009S0913W0330", "wav": "./aishell/wav/test/S0913/BAC009S0913W0330.wav", "txt": "将有利于增加威灵电器的收入和利润"} -{"key": "BAC009S0913W0331", "wav": "./aishell/wav/test/S0913/BAC009S0913W0331.wav", "txt": "除了威灵电器与美芝压缩机合并成为美的部品事业部之外"} -{"key": "BAC009S0913W0332", "wav": "./aishell/wav/test/S0913/BAC009S0913W0332.wav", "txt": "美的最近还把洗碗机事业部合并到美的的厨房电器事业部"} -{"key": "BAC009S0913W0333", "wav": "./aishell/wav/test/S0913/BAC009S0913W0333.wav", "txt": "美的的洗碗机业务以外销为主"} -{"key": "BAC009S0913W0334", "wav": "./aishell/wav/test/S0913/BAC009S0913W0334.wav", "txt": "业物内士向记者分析说"} -{"key": "BAC009S0913W0335", "wav": "./aishell/wav/test/S0913/BAC009S0913W0335.wav", "txt": "被合并到美的的厨房电器事业部后"} -{"key": "BAC009S0913W0336", "wav": "./aishell/wav/test/S0913/BAC009S0913W0336.wav", "txt": "将有助于美的洗碗机开拓国内市场"} -{"key": "BAC009S0913W0337", "wav": "./aishell/wav/test/S0913/BAC009S0913W0337.wav", "txt": "破坏和颠复是互联网时代的特征"} -{"key": "BAC009S0913W0338", "wav": "./aishell/wav/test/S0913/BAC009S0913W0338.wav", "txt": "美丽的丁香湖公园成为跑步爱好者的狂欢圣地"} -{"key": "BAC009S0913W0339", "wav": "./aishell/wav/test/S0913/BAC009S0913W0339.wav", "txt": "剪纸皮影戏等特色节目更是吸引了一批批观众围观"} -{"key": "BAC009S0913W0340", "wav": 
"./aishell/wav/test/S0913/BAC009S0913W0340.wav", "txt": "跑友们积极的参与剪纸活动"} -{"key": "BAC009S0913W0341", "wav": "./aishell/wav/test/S0913/BAC009S0913W0341.wav", "txt": "亲身感受沈阳当地浓郁的民俗文化内蕴"} -{"key": "BAC009S0913W0342", "wav": "./aishell/wav/test/S0913/BAC009S0913W0342.wav", "txt": "许多跑友争先恐后穿上沈阳花棉袄拍照"} -{"key": "BAC009S0913W0343", "wav": "./aishell/wav/test/S0913/BAC009S0913W0343.wav", "txt": "并与亲朋好友分享这份快乐"} -{"key": "BAC009S0913W0344", "wav": "./aishell/wav/test/S0913/BAC009S0913W0344.wav", "txt": "而涂鸦墙上写满了跑友们的目标和愿望"} -{"key": "BAC009S0913W0345", "wav": "./aishell/wav/test/S0913/BAC009S0913W0345.wav", "txt": "伴随着专业啦啦队的加油声"} -{"key": "BAC009S0913W0346", "wav": "./aishell/wav/test/S0913/BAC009S0913W0346.wav", "txt": "跑友们在奔跑中国沈阳站的赛道上尽情的展示自己"} -{"key": "BAC009S0913W0347", "wav": "./aishell/wav/test/S0913/BAC009S0913W0347.wav", "txt": "赛道两边设置了许多专业摄像头"} -{"key": "BAC009S0913W0348", "wav": "./aishell/wav/test/S0913/BAC009S0913W0348.wav", "txt": "主办方试图记录每一个跑友挥洒激情的每一个瞬间"} -{"key": "BAC009S0913W0349", "wav": "./aishell/wav/test/S0913/BAC009S0913W0349.wav", "txt": "将这份快乐与跑对跑步的执着传递给身边的好友"} -{"key": "BAC009S0913W0351", "wav": "./aishell/wav/test/S0913/BAC009S0913W0351.wav", "txt": "同时带动当地人民的奔跑热情"} -{"key": "BAC009S0913W0354", "wav": "./aishell/wav/test/S0913/BAC009S0913W0354.wav", "txt": "更加多维度的助推跑步事业在中国的发展"} -{"key": "BAC009S0913W0355", "wav": "./aishell/wav/test/S0913/BAC009S0913W0355.wav", "txt": "服务广大跑步爱好者"} -{"key": "BAC009S0913W0356", "wav": "./aishell/wav/test/S0913/BAC009S0913W0356.wav", "txt": "奔跑中国系列竞跑赛事将转战广州"} -{"key": "BAC009S0913W0357", "wav": "./aishell/wav/test/S0913/BAC009S0913W0357.wav", "txt": "中新网成都九月十五日电付敬懿十五日"} -{"key": "BAC009S0913W0358", "wav": "./aishell/wav/test/S0913/BAC009S0913W0358.wav", "txt": "服务时间约为五十三万小时"} -{"key": "BAC009S0913W0359", "wav": "./aishell/wav/test/S0913/BAC009S0913W0359.wav", "txt": "自二零一四年十二月五日正式启动志愿者招募工作以来"} -{"key": "BAC009S0913W0360", "wav": "./aishell/wav/test/S0913/BAC009S0913W0360.wav", "txt": "因为本次赛事比赛周期长赛区跨度大"} -{"key": "BAC009S0913W0361", "wav": "./aishell/wav/test/S0913/BAC009S0913W0361.wav", "txt": "经过网络测试综合面试专业技能体能测试等环节"} -{"key": "BAC009S0913W0362", "wav": "./aishell/wav/test/S0913/BAC009S0913W0362.wav", "txt": "机关企事业单位等社会志愿者三千一百名"} -{"key": "BAC009S0913W0363", "wav": "./aishell/wav/test/S0913/BAC009S0913W0363.wav", "txt": "为做好志愿者服务工作"} -{"key": "BAC009S0913W0364", "wav": "./aishell/wav/test/S0913/BAC009S0913W0364.wav", "txt": "邀请专家学者等三十馀人组成志愿者培训导师库"} -{"key": "BAC009S0913W0365", "wav": "./aishell/wav/test/S0913/BAC009S0913W0365.wav", "txt": "指导各赛区开展志愿服务培训"} -{"key": "BAC009S0913W0366", "wav": "./aishell/wav/test/S0913/BAC009S0913W0366.wav", "txt": "组委会设计了具有四川特色的志愿者服装"} -{"key": "BAC009S0913W0367", "wav": "./aishell/wav/test/S0913/BAC009S0913W0367.wav", "txt": "志愿者的那一抹绿并大家亲切地称呼为小青椒"} -{"key": "BAC009S0913W0368", "wav": "./aishell/wav/test/S0913/BAC009S0913W0368.wav", "txt": "随着赛会推进被越来越多的人所熟知"} -{"key": "BAC009S0913W0369", "wav": "./aishell/wav/test/S0913/BAC009S0913W0369.wav", "txt": "成为本届残运会志愿服务文化的重要部分"} -{"key": "BAC009S0913W0370", "wav": "./aishell/wav/test/S0913/BAC009S0913W0370.wav", "txt": "电子科大的小青椒早上六点起床"} -{"key": "BAC009S0913W0371", "wav": "./aishell/wav/test/S0913/BAC009S0913W0371.wav", "txt": "每天忙碌十三个小时"} -{"key": "BAC009S0913W0372", "wav": "./aishell/wav/test/S0913/BAC009S0913W0372.wav", "txt": "用他们真挚的微笑和运动员建立起心与心的连接"} -{"key": "BAC009S0913W0373", "wav": "./aishell/wav/test/S0913/BAC009S0913W0373.wav", "txt": "四川大学的手语志愿者要学习四千个手语动作"} -{"key": "BAC009S0913W0374", "wav": "./aishell/wav/test/S0913/BAC009S0913W0374.wav", "txt": 
"而他们熟练掌握的秘笈是一次又一次反复的训练和排练"} -{"key": "BAC009S0913W0375", "wav": "./aishell/wav/test/S0913/BAC009S0913W0375.wav", "txt": "小青椒用热情和真诚打动了每位运动员"} -{"key": "BAC009S0913W0376", "wav": "./aishell/wav/test/S0913/BAC009S0913W0376.wav", "txt": "他们每天手牵手肩并肩出入赛场"} -{"key": "BAC009S0913W0377", "wav": "./aishell/wav/test/S0913/BAC009S0913W0377.wav", "txt": "就像认识多年的朋友和兄弟姐妹一样"} -{"key": "BAC009S0913W0378", "wav": "./aishell/wav/test/S0913/BAC009S0913W0378.wav", "txt": "湖北运动员的家长给小青椒写来致谢信"} -{"key": "BAC009S0913W0379", "wav": "./aishell/wav/test/S0913/BAC009S0913W0379.wav", "txt": "也温暖和感动着志愿者"} -{"key": "BAC009S0913W0380", "wav": "./aishell/wav/test/S0913/BAC009S0913W0380.wav", "txt": "北京时间十月十日"} -{"key": "BAC009S0913W0381", "wav": "./aishell/wav/test/S0913/BAC009S0913W0381.wav", "txt": "根据韩国乒乓球协会的相关规定"} -{"key": "BAC009S0913W0382", "wav": "./aishell/wav/test/S0913/BAC009S0913W0382.wav", "txt": "根据国际乒联刚刚公布的最新一期世界排名"} -{"key": "BAC009S0913W0383", "wav": "./aishell/wav/test/S0913/BAC009S0913W0383.wav", "txt": "而排名第三位的李尚洙"} -{"key": "BAC009S0913W0384", "wav": "./aishell/wav/test/S0913/BAC009S0913W0384.wav", "txt": "将只参加奥运会团体赛的比赛"} -{"key": "BAC009S0913W0385", "wav": "./aishell/wav/test/S0913/BAC009S0913W0385.wav", "txt": "此次韩国男团派出了一老带二新的阵容"} -{"key": "BAC009S0913W0386", "wav": "./aishell/wav/test/S0913/BAC009S0913W0386.wav", "txt": "此次里约奥运会也将会是其第三次征战奥运会比赛"} -{"key": "BAC009S0913W0387", "wav": "./aishell/wav/test/S0913/BAC009S0913W0387.wav", "txt": "作为经验最为丰富的老大哥"} -{"key": "BAC009S0913W0388", "wav": "./aishell/wav/test/S0913/BAC009S0913W0388.wav", "txt": "他将尽全力带领队伍取得好成绩"} -{"key": "BAC009S0913W0389", "wav": "./aishell/wav/test/S0913/BAC009S0913W0389.wav", "txt": "在韩国男队中排名第四"} -{"key": "BAC009S0913W0390", "wav": "./aishell/wav/test/S0913/BAC009S0913W0390.wav", "txt": "女排三零阿根廷朱婷复出扣杀状态神勇"} -{"key": "BAC009S0913W0391", "wav": "./aishell/wav/test/S0913/BAC009S0913W0391.wav", "txt": "二零一五年第十二届女排世界杯战至第八轮"} -{"key": "BAC009S0913W0392", "wav": "./aishell/wav/test/S0913/BAC009S0913W0392.wav", "txt": "中国女排直落三周以三零取胜阿根廷拿到第七胜"} -{"key": "BAC009S0913W0393", "wav": "./aishell/wav/test/S0913/BAC009S0913W0393.wav", "txt": "早前意外崴伤脚踝的朱婷强势复出"} -{"key": "BAC009S0913W0394", "wav": "./aishell/wav/test/S0913/BAC009S0913W0394.wav", "txt": "斩获十五分冠全场并且拦网独得四分"} -{"key": "BAC009S0913W0395", "wav": "./aishell/wav/test/S0913/BAC009S0913W0395.wav", "txt": "伤愈复出找手感一传防守遇考验"} -{"key": "BAC009S0913W0396", "wav": "./aishell/wav/test/S0913/BAC009S0913W0396.wav", "txt": "本报讯记者李晖经过两天转场"} -{"key": "BAC009S0913W0397", "wav": "./aishell/wav/test/S0913/BAC009S0913W0397.wav", "txt": "中国女排昨天下午在冈山迎战古巴队"} -{"key": "BAC009S0913W0398", "wav": "./aishell/wav/test/S0913/BAC009S0913W0398.wav", "txt": "三局比分是二五比一九二五比十和二五比一四"} -{"key": "BAC009S0913W0399", "wav": "./aishell/wav/test/S0913/BAC009S0913W0399.wav", "txt": "中国女排从第三轮开始便被挤出了三甲"} -{"key": "BAC009S0913W0400", "wav": "./aishell/wav/test/S0913/BAC009S0913W0400.wav", "txt": "而东道主日本队紧追在中国队之后"} -{"key": "BAC009S0913W0401", "wav": "./aishell/wav/test/S0913/BAC009S0913W0401.wav", "txt": "若想保住进入前两名的资格"} -{"key": "BAC009S0913W0402", "wav": "./aishell/wav/test/S0913/BAC009S0913W0402.wav", "txt": "中国队在第二阶段的第三场比赛不仅要保全取九个积分"} -{"key": "BAC009S0913W0403", "wav": "./aishell/wav/test/S0913/BAC009S0913W0403.wav", "txt": "而且还要尽量在小分上取得优势"} -{"key": "BAC009S0913W0404", "wav": "./aishell/wav/test/S0913/BAC009S0913W0404.wav", "txt": "福斯只允许先拍一部"} -{"key": "BAC009S0913W0405", "wav": "./aishell/wav/test/S0913/BAC009S0913W0405.wav", "txt": "另一部要视独立日二的票房而定"} -{"key": "BAC009S0913W0406", "wav": 
"./aishell/wav/test/S0913/BAC009S0913W0406.wav", "txt": "影片的上映日期"} -{"key": "BAC009S0913W0407", "wav": "./aishell/wav/test/S0913/BAC009S0913W0407.wav", "txt": "也从原计划的二零一六年七月一日"} -{"key": "BAC009S0913W0408", "wav": "./aishell/wav/test/S0913/BAC009S0913W0408.wav", "txt": "潘玮柏以侧颜出镜"} -{"key": "BAC009S0913W0409", "wav": "./aishell/wav/test/S0913/BAC009S0913W0409.wav", "txt": "与陈妍希分别看向对方"} -{"key": "BAC009S0913W0410", "wav": "./aishell/wav/test/S0913/BAC009S0913W0410.wav", "txt": "可见两人友谊非同一般"} -{"key": "BAC009S0913W0411", "wav": "./aishell/wav/test/S0913/BAC009S0913W0411.wav", "txt": "搜狐娱乐据讯据香港媒体报道"} -{"key": "BAC009S0913W0412", "wav": "./aishell/wav/test/S0913/BAC009S0913W0412.wav", "txt": "陈妍希今天五月二日下午出席公益活动"} -{"key": "BAC009S0913W0413", "wav": "./aishell/wav/test/S0913/BAC009S0913W0413.wav", "txt": "小洋装更衬托出她的纤细身材"} -{"key": "BAC009S0913W0414", "wav": "./aishell/wav/test/S0913/BAC009S0913W0414.wav", "txt": "不过她出道以来身材一直是外界关注的焦点"} -{"key": "BAC009S0913W0415", "wav": "./aishell/wav/test/S0913/BAC009S0913W0415.wav", "txt": "陈妍希一直努力让自己的脸圆圆脸变瘦"} -{"key": "BAC009S0913W0416", "wav": "./aishell/wav/test/S0913/BAC009S0913W0416.wav", "txt": "今天她出席活动"} -{"key": "BAC009S0913W0417", "wav": "./aishell/wav/test/S0913/BAC009S0913W0417.wav", "txt": "对着镜头嘟嘴吐舌"} -{"key": "BAC009S0913W0418", "wav": "./aishell/wav/test/S0913/BAC009S0913W0418.wav", "txt": "当被问到对于被选为棉花糖女孩比较肉感的女生"} -{"key": "BAC009S0913W0419", "wav": "./aishell/wav/test/S0913/BAC009S0913W0419.wav", "txt": "她笑说我觉得蛮好的啊"} -{"key": "BAC009S0913W0420", "wav": "./aishell/wav/test/S0913/BAC009S0913W0420.wav", "txt": "搜狐娱乐讯日前"} -{"key": "BAC009S0913W0421", "wav": "./aishell/wav/test/S0913/BAC009S0913W0421.wav", "txt": "名为娱乐圈八卦的自然自媒体"} -{"key": "BAC009S0913W0422", "wav": "./aishell/wav/test/S0913/BAC009S0913W0422.wav", "txt": "曝出陈妍希拍戏时突然干呕"} -{"key": "BAC009S0913W0423", "wav": "./aishell/wav/test/S0913/BAC009S0913W0423.wav", "txt": "并推断其已怀孕"} -{"key": "BAC009S0913W0424", "wav": "./aishell/wav/test/S0913/BAC009S0913W0424.wav", "txt": "陈妍希公司官方账号发表微博辟谣"} -{"key": "BAC009S0913W0425", "wav": "./aishell/wav/test/S0913/BAC009S0913W0425.wav", "txt": "否认了陈妍希疑似怀孕的传闻"} -{"key": "BAC009S0913W0426", "wav": "./aishell/wav/test/S0913/BAC009S0913W0426.wav", "txt": "称陈妍希目前还在剧组拍戏"} -{"key": "BAC009S0913W0427", "wav": "./aishell/wav/test/S0913/BAC009S0913W0427.wav", "txt": "并感谢了各界对于陈妍希公开与陈晓恋情的祝福"} -{"key": "BAC009S0913W0428", "wav": "./aishell/wav/test/S0913/BAC009S0913W0428.wav", "txt": "陈妍希在微博发布跳绳视频"} -{"key": "BAC009S0913W0429", "wav": "./aishell/wav/test/S0913/BAC009S0913W0429.wav", "txt": "并写道每天早上二十零下"} -{"key": "BAC009S0913W0430", "wav": "./aishell/wav/test/S0913/BAC009S0913W0430.wav", "txt": "中午二十零下"} -{"key": "BAC009S0913W0431", "wav": "./aishell/wav/test/S0913/BAC009S0913W0431.wav", "txt": "北京地接旅行社有限公司负责人称"} -{"key": "BAC009S0913W0432", "wav": "./aishell/wav/test/S0913/BAC009S0913W0432.wav", "txt": "向乘客收取的船费属于应收项目"} -{"key": "BAC009S0913W0433", "wav": "./aishell/wav/test/S0913/BAC009S0913W0433.wav", "txt": "旅行社为了盈利设置购物环节"} -{"key": "BAC009S0913W0434", "wav": "./aishell/wav/test/S0913/BAC009S0913W0434.wav", "txt": "北京旅游服务热线反馈称"} -{"key": "BAC009S0913W0435", "wav": "./aishell/wav/test/S0913/BAC009S0913W0435.wav", "txt": "还有待职能部门进一步调查"} -{"key": "BAC009S0913W0436", "wav": "./aishell/wav/test/S0913/BAC009S0913W0436.wav", "txt": "游客参与不合理低价游将被罚专家怎么判断"} -{"key": "BAC009S0913W0437", "wav": "./aishell/wav/test/S0913/BAC009S0913W0437.wav", "txt": "关于低价游旅行团因强制购物产生的纠纷事件频出"} -{"key": "BAC009S0913W0438", "wav": "./aishell/wav/test/S0913/BAC009S0913W0438.wav", "txt": 
"甚至还出现了一些造成游人身伤害的悲剧"} -{"key": "BAC009S0913W0439", "wav": "./aishell/wav/test/S0913/BAC009S0913W0439.wav", "txt": "旅游法早已明令禁止"} -{"key": "BAC009S0913W0440", "wav": "./aishell/wav/test/S0913/BAC009S0913W0440.wav", "txt": "游客参与不合理低价游也将受到受处难执行"} -{"key": "BAC009S0913W0441", "wav": "./aishell/wav/test/S0913/BAC009S0913W0441.wav", "txt": "京汇佳律师事务所律师邱宝昌表示"} -{"key": "BAC009S0913W0442", "wav": "./aishell/wav/test/S0913/BAC009S0913W0442.wav", "txt": "消费者根本很难判断什么叫做不合理低价"} -{"key": "BAC009S0913W0444", "wav": "./aishell/wav/test/S0913/BAC009S0913W0444.wav", "txt": "园中园收费超景区大门票"} -{"key": "BAC009S0913W0445", "wav": "./aishell/wav/test/S0913/BAC009S0913W0445.wav", "txt": "游客在乌鲁木齐市吃自助餐浪费食物被罚二四零零元"} -{"key": "BAC009S0913W0446", "wav": "./aishell/wav/test/S0913/BAC009S0913W0446.wav", "txt": "剩下了一二零零克食物"} -{"key": "BAC009S0913W0447", "wav": "./aishell/wav/test/S0913/BAC009S0913W0447.wav", "txt": "被餐厅罚款二四零零元"} -{"key": "BAC009S0913W0448", "wav": "./aishell/wav/test/S0913/BAC009S0913W0448.wav", "txt": "此事昨日经网络曝光后"} -{"key": "BAC009S0913W0449", "wav": "./aishell/wav/test/S0913/BAC009S0913W0449.wav", "txt": "食客该不该如此浪费"} -{"key": "BAC009S0913W0450", "wav": "./aishell/wav/test/S0913/BAC009S0913W0450.wav", "txt": "餐厅有没有权力罚款"} -{"key": "BAC009S0913W0451", "wav": "./aishell/wav/test/S0913/BAC009S0913W0451.wav", "txt": "成为了网民争相讨论的话题"} -{"key": "BAC009S0913W0452", "wav": "./aishell/wav/test/S0913/BAC009S0913W0452.wav", "txt": "游客在公园躲雨遭雷击已脑死亡至今无人负责"} -{"key": "BAC009S0913W0453", "wav": "./aishell/wav/test/S0913/BAC009S0913W0453.wav", "txt": "信息时报讯记者周伟龙八月一零日下午"} -{"key": "BAC009S0913W0454", "wav": "./aishell/wav/test/S0913/BAC009S0913W0454.wav", "txt": "六名游客在海珠湖公园凉亭内躲雨"} -{"key": "BAC009S0913W0455", "wav": "./aishell/wav/test/S0913/BAC009S0913W0455.wav", "txt": "昨日记者从医院了解到"} -{"key": "BAC009S0913W0456", "wav": "./aishell/wav/test/S0913/BAC009S0913W0456.wav", "txt": "目前黄某已被诊断为脑死亡"} -{"key": "BAC009S0913W0457", "wav": "./aishell/wav/test/S0913/BAC009S0913W0457.wav", "txt": "记者回访海珠湖公园发现"} -{"key": "BAC009S0913W0458", "wav": "./aishell/wav/test/S0913/BAC009S0913W0458.wav", "txt": "出事凉亭依然呈现事发时的状态"} -{"key": "BAC009S0913W0459", "wav": "./aishell/wav/test/S0913/BAC009S0913W0459.wav", "txt": "一旦雷雨天游客在亭内出事"} -{"key": "BAC009S0913W0460", "wav": "./aishell/wav/test/S0913/BAC009S0913W0460.wav", "txt": "该告示不能成为园方免责的理由"} -{"key": "BAC009S0913W0461", "wav": "./aishell/wav/test/S0913/BAC009S0913W0461.wav", "txt": "游客在北京动物园内小树间拉吊床摇荡"} -{"key": "BAC009S0913W0462", "wav": "./aishell/wav/test/S0913/BAC009S0913W0462.wav", "txt": "却要承载一个成年人的体重"} -{"key": "BAC009S0913W0463", "wav": "./aishell/wav/test/S0913/BAC009S0913W0463.wav", "txt": "一家三口在两棵树间拉起了一张吊床"} -{"key": "BAC009S0913W0464", "wav": "./aishell/wav/test/S0913/BAC009S0913W0464.wav", "txt": "父亲和孩子轮流上去躺"} -{"key": "BAC009S0913W0465", "wav": "./aishell/wav/test/S0913/BAC009S0913W0465.wav", "txt": "躺进吊床的父亲还荡起吊床"} -{"key": "BAC009S0913W0466", "wav": "./aishell/wav/test/S0913/BAC009S0913W0466.wav", "txt": "游客在新加坡买祖母绿回国发现非纯天然"} -{"key": "BAC009S0913W0467", "wav": "./aishell/wav/test/S0913/BAC009S0913W0467.wav", "txt": "夏先生带太太跟团去新马泰旅游"} -{"key": "BAC009S0913W0468", "wav": "./aishell/wav/test/S0913/BAC009S0913W0468.wav", "txt": "在新加坡花费三万元购买了纯天然的祖母绿吊坠"} -{"key": "BAC009S0913W0469", "wav": "./aishell/wav/test/S0913/BAC009S0913W0469.wav", "txt": "回国后经鉴定发现不是天纯天然的"} -{"key": "BAC009S0913W0470", "wav": "./aishell/wav/test/S0913/BAC009S0913W0470.wav", "txt": "游客在日照旅游吃海鲜太少被围殴警方都有错"} -{"key": "BAC009S0913W0471", "wav": "./aishell/wav/test/S0913/BAC009S0913W0471.wav", "txt": "大众网河南游客爆料在日照旅游团因吃海鲜太少被围殴"} 
-{"key": "BAC009S0913W0472", "wav": "./aishell/wav/test/S0913/BAC009S0913W0472.wav", "txt": "警方回应都有过错经警方调查"} -{"key": "BAC009S0913W0473", "wav": "./aishell/wav/test/S0913/BAC009S0913W0473.wav", "txt": "双方均有不同程度受伤"} -{"key": "BAC009S0913W0474", "wav": "./aishell/wav/test/S0913/BAC009S0913W0474.wav", "txt": "河南籍游客张某某手部受伤及表皮损伤"} -{"key": "BAC009S0913W0475", "wav": "./aishell/wav/test/S0913/BAC009S0913W0475.wav", "txt": "店主陈某头皮裂创二处"} -{"key": "BAC009S0913W0476", "wav": "./aishell/wav/test/S0913/BAC009S0913W0476.wav", "txt": "游客在济南景区水池许愿观赏莲被砸成马蜂窝"} -{"key": "BAC009S0913W0477", "wav": "./aishell/wav/test/S0913/BAC009S0913W0477.wav", "txt": "游客扔硬币许愿观赏莲被砸成马蜂窝"} -{"key": "BAC009S0913W0478", "wav": "./aishell/wav/test/S0913/BAC009S0913W0478.wav", "txt": "游客在百年老店买到发霉盐水鸭商家主动退款"} -{"key": "BAC009S0913W0479", "wav": "./aishell/wav/test/S0913/BAC009S0913W0479.wav", "txt": "谢女士购买的盐水鸭外包装"} -{"key": "BAC009S0913W0480", "wav": "./aishell/wav/test/S0913/BAC009S0913W0480.wav", "txt": "华商报讯记者杨德合买了两个肉夹馍"} -{"key": "BAC009S0913W0481", "wav": "./aishell/wav/test/S0913/BAC009S0913W0481.wav", "txt": "结果被店员搓走了二零零零元"} -{"key": "BAC009S0913W0482", "wav": "./aishell/wav/test/S0913/BAC009S0913W0482.wav", "txt": "尽管在民警的协助下"} -{"key": "BAC009S0913W0483", "wav": "./aishell/wav/test/S0913/BAC009S0913W0483.wav", "txt": "但这也让首次来到陕西游玩的孙女士感到憋屈"} -{"key": "BAC009S0913W0484", "wav": "./aishell/wav/test/S0913/BAC009S0913W0484.wav", "txt": "游客大铜缸刻字警察喊话故宫刻字者请自首"} -{"key": "BAC009S0913W0485", "wav": "./aishell/wav/test/S0913/BAC009S0913W0485.wav", "txt": "北京警方已介入调查"} -{"key": "BAC009S0913W0486", "wav": "./aishell/wav/test/S0913/BAC009S0913W0486.wav", "txt": "目前正在进行一步工作中"} -{"key": "BAC009S0913W0487", "wav": "./aishell/wav/test/S0913/BAC009S0913W0487.wav", "txt": "游客成都遇连环车祸近千人隧道里死里逃亡"} -{"key": "BAC009S0913W0488", "wav": "./aishell/wav/test/S0913/BAC009S0913W0488.wav", "txt": "图片由胡先生提供本报讯记者喻莉出门旅游"} -{"key": "BAC009S0913W0489", "wav": "./aishell/wav/test/S0913/BAC009S0913W0489.wav", "txt": "近千人在隧道里上演生死时速"} -{"key": "BAC009S0913W0490", "wav": "./aishell/wav/test/S0913/BAC009S0913W0490.wav", "txt": "武汉网友胡琦的一条短信微博引起众人关注"} -{"key": "BAC009S0913W0491", "wav": "./aishell/wav/test/S0913/BAC009S0913W0491.wav", "txt": "记者联系上胡先生才知虚惊一场"} -{"key": "BAC009S0913W0492", "wav": "./aishell/wav/test/S0913/BAC009S0913W0492.wav", "txt": "现场有人喊有车要爆炸"} -{"key": "BAC009S0913W0493", "wav": "./aishell/wav/test/S0913/BAC009S0913W0493.wav", "txt": "事后才了解他们遭遇的只是普通连环车祸"} -{"key": "BAC009S0913W0494", "wav": "./aishell/wav/test/S0913/BAC009S0913W0494.wav", "txt": "游客打车被找四张同号假钞官方疑遇克隆车"} -{"key": "BAC009S0913W0495", "wav": "./aishell/wav/test/S0913/BAC009S0913W0495.wav", "txt": "其在出行成都打车时被司机找了四张同号的二元零元假币"} -{"key": "BAC009S0914W0121", "wav": "./aishell/wav/test/S0914/BAC009S0914W0121.wav", "txt": "境外个人购买应当符合当地政策规定"} -{"key": "BAC009S0914W0122", "wav": "./aishell/wav/test/S0914/BAC009S0914W0122.wav", "txt": "外资在华房地产投资限制松绑已成大势"} -{"key": "BAC009S0914W0123", "wav": "./aishell/wav/test/S0914/BAC009S0914W0123.wav", "txt": "在限限制外商投资产业目录中"} -{"key": "BAC009S0914W0124", "wav": "./aishell/wav/test/S0914/BAC009S0914W0124.wav", "txt": "已经删除了此前针对外商投资房地产的全部限制类条款"} -{"key": "BAC009S0914W0125", "wav": "./aishell/wav/test/S0914/BAC009S0914W0125.wav", "txt": "放开外资购买房产限制"} -{"key": "BAC009S0914W0126", "wav": "./aishell/wav/test/S0914/BAC009S0914W0126.wav", "txt": "外资只可以购买商铺写字楼等物业"} -{"key": "BAC009S0914W0127", "wav": "./aishell/wav/test/S0914/BAC009S0914W0127.wav", "txt": "普通住宅很可能仍将限购"} -{"key": "BAC009S0914W0128", "wav": "./aishell/wav/test/S0914/BAC009S0914W0128.wav", "txt": 
"而就昨日六部委松绑楼市限外令的情况来看"} -{"key": "BAC009S0914W0129", "wav": "./aishell/wav/test/S0914/BAC009S0914W0129.wav", "txt": "对于中国楼市的影响不必过于乐观"} -{"key": "BAC009S0914W0130", "wav": "./aishell/wav/test/S0914/BAC009S0914W0130.wav", "txt": "取消限外令将促进外企在华投资房地产"} -{"key": "BAC009S0914W0131", "wav": "./aishell/wav/test/S0914/BAC009S0914W0131.wav", "txt": "对于中国楼市有一定积极作用"} -{"key": "BAC009S0914W0132", "wav": "./aishell/wav/test/S0914/BAC009S0914W0132.wav", "txt": "特别是在海外热钱有外流预期的情况下"} -{"key": "BAC009S0914W0133", "wav": "./aishell/wav/test/S0914/BAC009S0914W0133.wav", "txt": "继续限制外资投资中国房地产已经不合时宜"} -{"key": "BAC009S0914W0134", "wav": "./aishell/wav/test/S0914/BAC009S0914W0134.wav", "txt": "外资占国内地产投资比例有限"} -{"key": "BAC009S0914W0135", "wav": "./aishell/wav/test/S0914/BAC009S0914W0135.wav", "txt": "此项政策对中国楼市影响有限"} -{"key": "BAC009S0914W0136", "wav": "./aishell/wav/test/S0914/BAC009S0914W0136.wav", "txt": "中国房地产学会副会长陈国强也认为"} -{"key": "BAC009S0914W0137", "wav": "./aishell/wav/test/S0914/BAC009S0914W0137.wav", "txt": "外资购房主要集中在一线城市和几个热点二线城市"} -{"key": "BAC009S0914W0138", "wav": "./aishell/wav/test/S0914/BAC009S0914W0138.wav", "txt": "而现在这类城市的房价已经很高"} -{"key": "BAC009S0914W0139", "wav": "./aishell/wav/test/S0914/BAC009S0914W0139.wav", "txt": "即使限外令放开也不会出现大规模外资买房"} -{"key": "BAC009S0914W0140", "wav": "./aishell/wav/test/S0914/BAC009S0914W0140.wav", "txt": "正处于筑底回暖阶段"} -{"key": "BAC009S0914W0141", "wav": "./aishell/wav/test/S0914/BAC009S0914W0141.wav", "txt": "主要还是依靠中国国内企业投资"} -{"key": "BAC009S0914W0142", "wav": "./aishell/wav/test/S0914/BAC009S0914W0142.wav", "txt": "虽然一线城市房价已出现反弹"} -{"key": "BAC009S0914W0143", "wav": "./aishell/wav/test/S0914/BAC009S0914W0143.wav", "txt": "但包括鄂尔多斯温州等地的去库存还是非常困难"} -{"key": "BAC009S0914W0144", "wav": "./aishell/wav/test/S0914/BAC009S0914W0144.wav", "txt": "七月份各线城市房价分化仍然明显"} -{"key": "BAC009S0914W0145", "wav": "./aishell/wav/test/S0914/BAC009S0914W0145.wav", "txt": "目前整体的宏观经济还是比较困难的"} -{"key": "BAC009S0914W0146", "wav": "./aishell/wav/test/S0914/BAC009S0914W0146.wav", "txt": "房地产的投资增速目前不到五百分之"} -{"key": "BAC009S0914W0148", "wav": "./aishell/wav/test/S0914/BAC009S0914W0148.wav", "txt": "因此开发还要继续坚定的开工和拿地的信心"} -{"key": "BAC009S0914W0149", "wav": "./aishell/wav/test/S0914/BAC009S0914W0149.wav", "txt": "这种分化情况会更剧烈"} -{"key": "BAC009S0914W0150", "wav": "./aishell/wav/test/S0914/BAC009S0914W0150.wav", "txt": "但是整体回暖和好转态势已经确定"} -{"key": "BAC009S0914W0151", "wav": "./aishell/wav/test/S0914/BAC009S0914W0151.wav", "txt": "与前年差不多这种状态"} -{"key": "BAC009S0914W0152", "wav": "./aishell/wav/test/S0914/BAC009S0914W0152.wav", "txt": "外资在华房地产投资购房限制被松绑"} -{"key": "BAC009S0914W0153", "wav": "./aishell/wav/test/S0914/BAC009S0914W0153.wav", "txt": "允许机构和个人在中国购"} -{"key": "BAC009S0914W0154", "wav": "./aishell/wav/test/S0914/BAC009S0914W0154.wav", "txt": "六部委调整房地产市场外资准入和管理政策"} -{"key": "BAC009S0914W0155", "wav": "./aishell/wav/test/S0914/BAC009S0914W0155.wav", "txt": "为促进房地产市场平稳健康发展"} -{"key": "BAC009S0914W0156", "wav": "./aishell/wav/test/S0914/BAC009S0914W0156.wav", "txt": "一外商投资房地产企业注册资本与投资总额比例"} -{"key": "BAC009S0914W0157", "wav": "./aishell/wav/test/S0914/BAC009S0914W0157.wav", "txt": "对于实施住房限购政策的城市"} -{"key": "BAC009S0914W0158", "wav": "./aishell/wav/test/S0914/BAC009S0914W0158.wav", "txt": "境外个人购房应当符合当地政策规定"} -{"key": "BAC009S0914W0159", "wav": "./aishell/wav/test/S0914/BAC009S0914W0159.wav", "txt": "优化和改进外商投资房地产管理"} -{"key": "BAC009S0914W0160", "wav": "./aishell/wav/test/S0914/BAC009S0914W0160.wav", "txt": "除上述政策调整以外"} -{"key": "BAC009S0914W0161", "wav": "./aishell/wav/test/S0914/BAC009S0914W0161.wav", 
"txt": "为促进房地产市场平稳健康发展"} -{"key": "BAC009S0914W0162", "wav": "./aishell/wav/test/S0914/BAC009S0914W0162.wav", "txt": "以及在中国境内工作学习的境外个人"} -{"key": "BAC009S0914W0163", "wav": "./aishell/wav/test/S0914/BAC009S0914W0163.wav", "txt": "可以购买符合实际需要的自用自住商品房"} -{"key": "BAC009S0914W0164", "wav": "./aishell/wav/test/S0914/BAC009S0914W0164.wav", "txt": "外商投资房地产企业注册资本与投资总额比例"} -{"key": "BAC009S0914W0165", "wav": "./aishell/wav/test/S0914/BAC009S0914W0165.wav", "txt": "将依照中外合资经营企业的相关暂行规定"} -{"key": "BAC009S0914W0166", "wav": "./aishell/wav/test/S0914/BAC009S0914W0166.wav", "txt": "中新网八月二十七日电据商务部官网公布的文件显示"} -{"key": "BAC009S0914W0167", "wav": "./aishell/wav/test/S0914/BAC009S0914W0167.wav", "txt": "取消外商投资房地产企业"} -{"key": "BAC009S0914W0168", "wav": "./aishell/wav/test/S0914/BAC009S0914W0168.wav", "txt": "六部门出台新政楼市限外政策放松"} -{"key": "BAC009S0914W0169", "wav": "./aishell/wav/test/S0914/BAC009S0914W0169.wav", "txt": "这来外资进入我国房地产市场最宽松的政策"} -{"key": "BAC009S0914W0170", "wav": "./aishell/wav/test/S0914/BAC009S0914W0170.wav", "txt": "这份只有五百多字的通知印发于八月十九日"} -{"key": "BAC009S0914W0171", "wav": "./aishell/wav/test/S0914/BAC009S0914W0171.wav", "txt": "规定外商投资建立房地产企业"} -{"key": "BAC009S0914W0172", "wav": "./aishell/wav/test/S0914/BAC009S0914W0172.wav", "txt": "投资总额超过一千万美元含一千万美元的"} -{"key": "BAC009S0914W0173", "wav": "./aishell/wav/test/S0914/BAC009S0914W0173.wav", "txt": "注册资本金不得低于投资总额的百分之五十"} -{"key": "BAC009S0914W0174", "wav": "./aishell/wav/test/S0914/BAC009S0914W0174.wav", "txt": "外商投资房地产企业注册资本金未全部缴付的"} -{"key": "BAC009S0914W0175", "wav": "./aishell/wav/test/S0914/BAC009S0914W0175.wav", "txt": "未取得国有土地使用证的"} -{"key": "BAC009S0914W0176", "wav": "./aishell/wav/test/S0914/BAC009S0914W0176.wav", "txt": "或开发项目资本金未达到项目投资总额百分之五"} -{"key": "BAC009S0914W0177", "wav": "./aishell/wav/test/S0914/BAC009S0914W0177.wav", "txt": "不得办理境内境外贷款"} -{"key": "BAC009S0914W0178", "wav": "./aishell/wav/test/S0914/BAC009S0914W0178.wav", "txt": "外汇管理部门不予批准该企业的外汇借款结汇"} -{"key": "BAC009S0914W0179", "wav": "./aishell/wav/test/S0914/BAC009S0914W0179.wav", "txt": "不得购买非自用非自住商品房"} -{"key": "BAC009S0914W0180", "wav": "./aishell/wav/test/S0914/BAC009S0914W0180.wav", "txt": "港澳台地区居民和华侨因生活需要"} -{"key": "BAC009S0914W0181", "wav": "./aishell/wav/test/S0914/BAC009S0914W0181.wav", "txt": "可在境内限购一定面积的自住商品房"} -{"key": "BAC009S0914W0182", "wav": "./aishell/wav/test/S0914/BAC009S0914W0182.wav", "txt": "二到二"} -{"key": "BAC009S0914W0183", "wav": "./aishell/wav/test/S0914/BAC009S0914W0183.wav", "txt": "我国楼市正处在急速上升通道"} -{"key": "BAC009S0914W0184", "wav": "./aishell/wav/test/S0914/BAC009S0914W0184.wav", "txt": "大量外资希望进入我国市场"} -{"key": "BAC009S0914W0185", "wav": "./aishell/wav/test/S0914/BAC009S0914W0185.wav", "txt": "面对楼市中急剧增长的投资热情"} -{"key": "BAC009S0914W0186", "wav": "./aishell/wav/test/S0914/BAC009S0914W0186.wav", "txt": "对购买住房的数量也未做要求"} -{"key": "BAC009S0914W0187", "wav": "./aishell/wav/test/S0914/BAC009S0914W0187.wav", "txt": "带动和规范民间资本进入农产品流通领域"} -{"key": "BAC009S0914W0188", "wav": "./aishell/wav/test/S0914/BAC009S0914W0188.wav", "txt": "完善农产品流通税收政策"} -{"key": "BAC009S0914W0189", "wav": "./aishell/wav/test/S0914/BAC009S0914W0189.wav", "txt": "免征蔬菜流通环节增值税"} -{"key": "BAC009S0914W0190", "wav": "./aishell/wav/test/S0914/BAC009S0914W0190.wav", "txt": "加大涉农贷款投放力度"} -{"key": "BAC009S0914W0191", "wav": "./aishell/wav/test/S0914/BAC009S0914W0191.wav", "txt": "可按作价出资入股方式办理理用地手续"} -{"key": "BAC009S0914W0192", "wav": "./aishell/wav/test/S0914/BAC009S0914W0192.wav", "txt": "但禁止改变用途和性质"} -{"key": "BAC009S0914W0193", "wav": "./aishell/wav/test/S0914/BAC009S0914W0193.wav", 
"txt": "严厉打击农产品投机炒作"} -{"key": "BAC009S0914W0194", "wav": "./aishell/wav/test/S0914/BAC009S0914W0194.wav", "txt": "做好外资并购大型农产品批发市场的安全审查"} -{"key": "BAC009S0914W0195", "wav": "./aishell/wav/test/S0914/BAC009S0914W0195.wav", "txt": "严格执行鲜活农产品运输绿色通道政策"} -{"key": "BAC009S0914W0196", "wav": "./aishell/wav/test/S0914/BAC009S0914W0196.wav", "txt": "加快农产品流通标准体系建设"} -{"key": "BAC009S0914W0197", "wav": "./aishell/wav/test/S0914/BAC009S0914W0197.wav", "txt": "各地各部门加强组织领导"} -{"key": "BAC009S0914W0198", "wav": "./aishell/wav/test/S0914/BAC009S0914W0198.wav", "txt": "农产品产销对接的经验介绍"} -{"key": "BAC009S0914W0199", "wav": "./aishell/wav/test/S0914/BAC009S0914W0199.wav", "txt": "农产品产销合作社简介"} -{"key": "BAC009S0914W0200", "wav": "./aishell/wav/test/S0914/BAC009S0914W0200.wav", "txt": "海南农产品流通现状"} -{"key": "BAC009S0914W0201", "wav": "./aishell/wav/test/S0914/BAC009S0914W0201.wav", "txt": "农产品流通加工标准化"} -{"key": "BAC009S0914W0202", "wav": "./aishell/wav/test/S0914/BAC009S0914W0202.wav", "txt": "中国对农产品流通政策"} -{"key": "BAC009S0914W0203", "wav": "./aishell/wav/test/S0914/BAC009S0914W0203.wav", "txt": "温家宝主持召开国务院常务会议"} -{"key": "BAC009S0914W0204", "wav": "./aishell/wav/test/S0914/BAC009S0914W0204.wav", "txt": "研究部署在城市优先发展公共交通"} -{"key": "BAC009S0914W0205", "wav": "./aishell/wav/test/S0914/BAC009S0914W0205.wav", "txt": "审议通过缺陷汽车产品召回管理条例草案"} -{"key": "BAC009S0914W0206", "wav": "./aishell/wav/test/S0914/BAC009S0914W0206.wav", "txt": "国务院总理温家宝主持召开国务院常务会议"} -{"key": "BAC009S0914W0207", "wav": "./aishell/wav/test/S0914/BAC009S0914W0207.wav", "txt": "研究部署在城市优先发展公共交通"} -{"key": "BAC009S0914W0208", "wav": "./aishell/wav/test/S0914/BAC009S0914W0208.wav", "txt": "审议通过缺陷汽车产品召回管理条例草案"} -{"key": "BAC009S0914W0209", "wav": "./aishell/wav/test/S0914/BAC009S0914W0209.wav", "txt": "为加快发展中等职业教育"} -{"key": "BAC009S0914W0210", "wav": "./aishell/wav/test/S0914/BAC009S0914W0210.wav", "txt": "自秋季学期起"} -{"key": "BAC009S0914W0211", "wav": "./aishell/wav/test/S0914/BAC009S0914W0211.wav", "txt": "多数城市公共交通出行比例偏低"} -{"key": "BAC009S0914W0212", "wav": "./aishell/wav/test/S0914/BAC009S0914W0212.wav", "txt": "为从根本上缓解交通拥堵出行不便环境污染等矛盾"} -{"key": "BAC009S0914W0213", "wav": "./aishell/wav/test/S0914/BAC009S0914W0213.wav", "txt": "必须树立公共交通优先发展理念"} -{"key": "BAC009S0914W0214", "wav": "./aishell/wav/test/S0914/BAC009S0914W0214.wav", "txt": "将公共交通放在城市交通发展的首要位置"} -{"key": "BAC009S0914W0215", "wav": "./aishell/wav/test/S0914/BAC009S0914W0215.wav", "txt": "加快构建以公共交通为主"} -{"key": "BAC009S0914W0216", "wav": "./aishell/wav/test/S0914/BAC009S0914W0216.wav", "txt": "同时改善步行自行车出行条件"} -{"key": "BAC009S0914W0217", "wav": "./aishell/wav/test/S0914/BAC009S0914W0217.wav", "txt": "城市综合交通体系规划应明确公共交通优先发展原则"} -{"key": "BAC009S0914W0218", "wav": "./aishell/wav/test/S0914/BAC009S0914W0218.wav", "txt": "城市公共交通规划要科学布局线线网"} -{"key": "BAC009S0914W0219", "wav": "./aishell/wav/test/S0914/BAC009S0914W0219.wav", "txt": "促进城市内外交通便利衔接和城乡公共交通一体化发展"} -{"key": "BAC009S0914W0220", "wav": "./aishell/wav/test/S0914/BAC009S0914W0220.wav", "txt": "加快基础设施建设"} -{"key": "BAC009S0914W0221", "wav": "./aishell/wav/test/S0914/BAC009S0914W0221.wav", "txt": "提升公共交通设施装备水平"} -{"key": "BAC009S0914W0222", "wav": "./aishell/wav/test/S0914/BAC009S0914W0222.wav", "txt": "提高公共交通舒适性"} -{"key": "BAC009S0914W0223", "wav": "./aishell/wav/test/S0914/BAC009S0914W0223.wav", "txt": "将其纳入旧城改造和新城建设规划"} -{"key": "BAC009S0914W0224", "wav": "./aishell/wav/test/S0914/BAC009S0914W0224.wav", "txt": "加强公共交通用地综合开开发"} -{"key": "BAC009S0914W0225", "wav": "./aishell/wav/test/S0914/BAC009S0914W0225.wav", "txt": "对新建公共交通设施用地的地上地下空间"} 
-{"key": "BAC009S0914W0226", "wav": "./aishell/wav/test/S0914/BAC009S0914W0226.wav", "txt": "按照市场化原则实施土地综合开发"} -{"key": "BAC009S0914W0227", "wav": "./aishell/wav/test/S0914/BAC009S0914W0227.wav", "txt": "收益用于公共交通基础设施建设和弥补运营亏损"} -{"key": "BAC009S0914W0228", "wav": "./aishell/wav/test/S0914/BAC009S0914W0228.wav", "txt": "加大政府投入"} -{"key": "BAC009S0914W0229", "wav": "./aishell/wav/test/S0914/BAC009S0914W0229.wav", "txt": "城市政府要将公共交通发展资金纳入公共财政体系"} -{"key": "BAC009S0914W0230", "wav": "./aishell/wav/test/S0914/BAC009S0914W0230.wav", "txt": "对城市公共交通企业实行税收优惠政策"} -{"key": "BAC009S0914W0231", "wav": "./aishell/wav/test/S0914/BAC009S0914W0231.wav", "txt": "落实对城市公共交通行业的成品油价格补贴政策"} -{"key": "BAC009S0914W0232", "wav": "./aishell/wav/test/S0914/BAC009S0914W0232.wav", "txt": "对城市轨道交通运营企业实行电价优惠"} -{"key": "BAC009S0914W0233", "wav": "./aishell/wav/test/S0914/BAC009S0914W0233.wav", "txt": "拓宽投资渠道"} -{"key": "BAC009S0914W0234", "wav": "./aishell/wav/test/S0914/BAC009S0914W0234.wav", "txt": "吸引和鼓励社会资金参与公共交通基础设施建设和运营"} -{"key": "BAC009S0914W0235", "wav": "./aishell/wav/test/S0914/BAC009S0914W0235.wav", "txt": "保障公交路权优先"} -{"key": "BAC009S0914W0236", "wav": "./aishell/wav/test/S0914/BAC009S0914W0236.wav", "txt": "增加划设城市公共交通优先车道"} -{"key": "BAC009S0914W0237", "wav": "./aishell/wav/test/S0914/BAC009S0914W0237.wav", "txt": "允许机场巴士校车班车使用公共交通优先车道"} -{"key": "BAC009S0914W0238", "wav": "./aishell/wav/test/S0914/BAC009S0914W0238.wav", "txt": "加强公共交通优先车道的监控和管理"} -{"key": "BAC009S0914W0239", "wav": "./aishell/wav/test/S0914/BAC009S0914W0239.wav", "txt": "健全安全管理制度"} -{"key": "BAC009S0914W0240", "wav": "./aishell/wav/test/S0914/BAC009S0914W0240.wav", "txt": "规范技术和产品标准"} -{"key": "BAC009S0914W0241", "wav": "./aishell/wav/test/S0914/BAC009S0914W0241.wav", "txt": "构建服务质量评价指标体系"} -{"key": "BAC009S0914W0242", "wav": "./aishell/wav/test/S0914/BAC009S0914W0242.wav", "txt": "规范公共交通重大决策程序"} -{"key": "BAC009S0914W0243", "wav": "./aishell/wav/test/S0914/BAC009S0914W0243.wav", "txt": "实行线网规划编制公示制度和运营价格听证制度"} -{"key": "BAC009S0914W0244", "wav": "./aishell/wav/test/S0914/BAC009S0914W0244.wav", "txt": "建立城市公共交通运营成本和服务质量信息公开制度"} -{"key": "BAC009S0914W0245", "wav": "./aishell/wav/test/S0914/BAC009S0914W0245.wav", "txt": "应当立即停止生产销售进口"} -{"key": "BAC009S0914W0246", "wav": "./aishell/wav/test/S0914/BAC009S0914W0246.wav", "txt": "由其生产者实施召回"} -{"key": "BAC009S0914W0247", "wav": "./aishell/wav/test/S0914/BAC009S0914W0247.wav", "txt": "并及时发布产品缺陷及信息"} -{"key": "BAC009S0914W0248", "wav": "./aishell/wav/test/S0914/BAC009S0914W0248.wav", "txt": "对实施召回的缺陷汽车产品"} -{"key": "BAC009S0914W0249", "wav": "./aishell/wav/test/S0914/BAC009S0914W0249.wav", "txt": "生产者应当及时采取措施消除缺陷"} -{"key": "BAC009S0914W0250", "wav": "./aishell/wav/test/S0914/BAC009S0914W0250.wav", "txt": "会议还研究了其他事项"} -{"key": "BAC009S0914W0251", "wav": "./aishell/wav/test/S0914/BAC009S0914W0251.wav", "txt": "国务院将对各类交易场所清理整顿"} -{"key": "BAC009S0914W0252", "wav": "./aishell/wav/test/S0914/BAC009S0914W0252.wav", "txt": "国务院近期将开展对各类交易场所的清理整顿工作"} -{"key": "BAC009S0914W0254", "wav": "./aishell/wav/test/S0914/BAC009S0914W0254.wav", "txt": "而且这也可以看作是苹果利用硬件优势"} -{"key": "BAC009S0914W0255", "wav": "./aishell/wav/test/S0914/BAC009S0914W0255.wav", "txt": "衍生出软件服务的又一重要举措"} -{"key": "BAC009S0914W0256", "wav": "./aishell/wav/test/S0914/BAC009S0914W0256.wav", "txt": "又如何和政府银行搞好关系"} -{"key": "BAC009S0914W0257", "wav": "./aishell/wav/test/S0914/BAC009S0914W0257.wav", "txt": "证明他们真得没有手机用户信息"} -{"key": "BAC009S0914W0258", "wav": "./aishell/wav/test/S0914/BAC009S0914W0258.wav", "txt": "苹果靠什么颠复移动支付"} -{"key": 
"BAC009S0914W0259", "wav": "./aishell/wav/test/S0914/BAC009S0914W0259.wav", "txt": "苹果推出的每一款新产品都不免要引发大讨论"} -{"key": "BAC009S0914W0260", "wav": "./aishell/wav/test/S0914/BAC009S0914W0260.wav", "txt": "才能显得像个知识分子"} -{"key": "BAC009S0914W0261", "wav": "./aishell/wav/test/S0914/BAC009S0914W0261.wav", "txt": "不仅树立了良好的品牌形象"} -{"key": "BAC009S0914W0262", "wav": "./aishell/wav/test/S0914/BAC009S0914W0262.wav", "txt": "也向全世界推广了一种趋之若鹜的文化"} -{"key": "BAC009S0914W0263", "wav": "./aishell/wav/test/S0914/BAC009S0914W0263.wav", "txt": "他们真得赚了很多钱"} -{"key": "BAC009S0914W0264", "wav": "./aishell/wav/test/S0914/BAC009S0914W0264.wav", "txt": "这些特质让库克基本上实现了财务自由"} -{"key": "BAC009S0914W0265", "wav": "./aishell/wav/test/S0914/BAC009S0914W0265.wav", "txt": "这对于一家巨型企业是非常难能可贵的"} -{"key": "BAC009S0914W0266", "wav": "./aishell/wav/test/S0914/BAC009S0914W0266.wav", "txt": "而土豪和穷鬼做生意的最大区别就是"} -{"key": "BAC009S0914W0267", "wav": "./aishell/wav/test/S0914/BAC009S0914W0267.wav", "txt": "而是会更加关注产品本身"} -{"key": "BAC009S0914W0268", "wav": "./aishell/wav/test/S0914/BAC009S0914W0268.wav", "txt": "以及是否能提升他们的历史地位"} -{"key": "BAC009S0914W0270", "wav": "./aishell/wav/test/S0914/BAC009S0914W0270.wav", "txt": "他们没有必要着急回本"} -{"key": "BAC009S0914W0271", "wav": "./aishell/wav/test/S0914/BAC009S0914W0271.wav", "txt": "更大的野心在于深刻变革人类的支付习惯"} -{"key": "BAC009S0914W0272", "wav": "./aishell/wav/test/S0914/BAC009S0914W0272.wav", "txt": "这种状态是苹果颠复现有市场格局的根基"} -{"key": "BAC009S0914W0273", "wav": "./aishell/wav/test/S0914/BAC009S0914W0273.wav", "txt": "除却土豪式的生意属性之外"} -{"key": "BAC009S0914W0276", "wav": "./aishell/wav/test/S0914/BAC009S0914W0276.wav", "txt": "系统会使用不同编码来转移用户凭据和支付数据"} -{"key": "BAC009S0914W0277", "wav": "./aishell/wav/test/S0914/BAC009S0914W0277.wav", "txt": "整个过程基于安全元素芯片"} -{"key": "BAC009S0914W0278", "wav": "./aishell/wav/test/S0914/BAC009S0914W0278.wav", "txt": "这种芯片不会直接发送用户敏感信息"} -{"key": "BAC009S0914W0279", "wav": "./aishell/wav/test/S0914/BAC009S0914W0279.wav", "txt": "而是将其转化成唯一的临时编码"} -{"key": "BAC009S0914W0280", "wav": "./aishell/wav/test/S0914/BAC009S0914W0280.wav", "txt": "可有效降低信息泄漏的风险其次"} -{"key": "BAC009S0914W0284", "wav": "./aishell/wav/test/S0914/BAC009S0914W0284.wav", "txt": "苹果积累了海量的绑定信息卡用户"} -{"key": "BAC009S0914W0285", "wav": "./aishell/wav/test/S0914/BAC009S0914W0285.wav", "txt": "这些资源的特点不单单是数目庞大"} -{"key": "BAC009S0914W0286", "wav": "./aishell/wav/test/S0914/BAC009S0914W0286.wav", "txt": "而且苹果最早一批的用户积累"} -{"key": "BAC009S0914W0287", "wav": "./aishell/wav/test/S0914/BAC009S0914W0287.wav", "txt": "囊括了大量的优质资源"} -{"key": "BAC009S0914W0288", "wav": "./aishell/wav/test/S0914/BAC009S0914W0288.wav", "txt": "甚至包括了一些明星意见领袖和政府官员"} -{"key": "BAC009S0914W0290", "wav": "./aishell/wav/test/S0914/BAC009S0914W0290.wav", "txt": "更是一种文化和习惯的推广者"} -{"key": "BAC009S0914W0291", "wav": "./aishell/wav/test/S0914/BAC009S0914W0291.wav", "txt": "柯振东入狱期间的囚服都能在淘宝上热卖"} -{"key": "BAC009S0914W0292", "wav": "./aishell/wav/test/S0914/BAC009S0914W0292.wav", "txt": "要是詹妮弗劳伦斯也能在微博上说这个应用不错"} -{"key": "BAC009S0914W0293", "wav": "./aishell/wav/test/S0914/BAC009S0914W0293.wav", "txt": "一定会有立竿见影的推广效果"} -{"key": "BAC009S0914W0294", "wav": "./aishell/wav/test/S0914/BAC009S0914W0294.wav", "txt": "也在所不惜的最后"} -{"key": "BAC009S0914W0296", "wav": "./aishell/wav/test/S0914/BAC009S0914W0296.wav", "txt": "早在九月九日发布会上"} -{"key": "BAC009S0914W0297", "wav": "./aishell/wav/test/S0914/BAC009S0914W0297.wav", "txt": "苹果就公布了合作伙伴"} -{"key": "BAC009S0914W0298", "wav": "./aishell/wav/test/S0914/BAC009S0914W0298.wav", "txt": "包括迪斯尼耐克麦当劳梅西百货公司等巨头企业"} -{"key": "BAC009S0914W0300", 
"wav": "./aishell/wav/test/S0914/BAC009S0914W0300.wav", "txt": "从这些零售商的等级来看"} -{"key": "BAC009S0914W0301", "wav": "./aishell/wav/test/S0914/BAC009S0914W0301.wav", "txt": "库克团队应当是花费了大量精力"} -{"key": "BAC009S0914W0302", "wav": "./aishell/wav/test/S0914/BAC009S0914W0302.wav", "txt": "移动支付肯定会有井喷式的发展"} -{"key": "BAC009S0914W0303", "wav": "./aishell/wav/test/S0914/BAC009S0914W0303.wav", "txt": "现阶段管理创新和组织再造比任何的创新都重要"} -{"key": "BAC009S0914W0304", "wav": "./aishell/wav/test/S0914/BAC009S0914W0304.wav", "txt": "美的美的在二零一四年三月正式发布智慧家庭战略"} -{"key": "BAC009S0914W0305", "wav": "./aishell/wav/test/S0914/BAC009S0914W0305.wav", "txt": "未来将搭建空气水营养等智能管家平台"} -{"key": "BAC009S0914W0306", "wav": "./aishell/wav/test/S0914/BAC009S0914W0306.wav", "txt": "事业部制一直是美的快速成长的法宝"} -{"key": "BAC009S0914W0307", "wav": "./aishell/wav/test/S0914/BAC009S0914W0307.wav", "txt": "一定程度上影响了资源整合的效率"} -{"key": "BAC009S0914W0308", "wav": "./aishell/wav/test/S0914/BAC009S0914W0308.wav", "txt": "美的已将风扇加湿器等空气类产品"} -{"key": "BAC009S0914W0309", "wav": "./aishell/wav/test/S0914/BAC009S0914W0309.wav", "txt": "归到家用空调事业部旗下"} -{"key": "BAC009S0914W0310", "wav": "./aishell/wav/test/S0914/BAC009S0914W0310.wav", "txt": "围绕几大智能管家平台"} -{"key": "BAC009S0914W0311", "wav": "./aishell/wav/test/S0914/BAC009S0914W0311.wav", "txt": "美的整合事业部精简组织架构"} -{"key": "BAC009S0914W0312", "wav": "./aishell/wav/test/S0914/BAC009S0914W0312.wav", "txt": "也是顺应互联网时代管理扁平化的趋势"} -{"key": "BAC009S0914W0314", "wav": "./aishell/wav/test/S0914/BAC009S0914W0314.wav", "txt": "下称美的内部的组织架构二零一五年加大了调整力度"} -{"key": "BAC009S0914W0316", "wav": "./aishell/wav/test/S0914/BAC009S0914W0316.wav", "txt": "每日经济新闻记者从美的家用空调事业部了解到"} -{"key": "BAC009S0914W0317", "wav": "./aishell/wav/test/S0914/BAC009S0914W0317.wav", "txt": "自二零一一年事业部启动自动化升级至今的四年里"} -{"key": "BAC009S0914W0318", "wav": "./aishell/wav/test/S0914/BAC009S0914W0318.wav", "txt": "工人数量减少近一半"} -{"key": "BAC009S0914W0319", "wav": "./aishell/wav/test/S0914/BAC009S0914W0319.wav", "txt": "美的家用空调事业部制造副总裁乌守保对记者表示"} -{"key": "BAC009S0914W0320", "wav": "./aishell/wav/test/S0914/BAC009S0914W0320.wav", "txt": "到二零一八年美的空调营收到达一千亿元规划时"} -{"key": "BAC009S0914W0321", "wav": "./aishell/wav/test/S0914/BAC009S0914W0321.wav", "txt": "员工数量将减至两万人"} -{"key": "BAC009S0914W0322", "wav": "./aishell/wav/test/S0914/BAC009S0914W0322.wav", "txt": "虽然投入产生问题以及机器人后期运行维护等"} -{"key": "BAC009S0914W0323", "wav": "./aishell/wav/test/S0914/BAC009S0914W0323.wav", "txt": "都是家电企业自动化升级需要面临的挑战"} -{"key": "BAC009S0914W0324", "wav": "./aishell/wav/test/S0914/BAC009S0914W0324.wav", "txt": "自动化是未来唯一出路"} -{"key": "BAC009S0914W0325", "wav": "./aishell/wav/test/S0914/BAC009S0914W0325.wav", "txt": "四年来机器人代替人工近半"} -{"key": "BAC009S0914W0326", "wav": "./aishell/wav/test/S0914/BAC009S0914W0326.wav", "txt": "美的家用空调事业提出精品战略"} -{"key": "BAC009S0914W0327", "wav": "./aishell/wav/test/S0914/BAC009S0914W0327.wav", "txt": "机器人应用也进一步提速"} -{"key": "BAC009S0914W0328", "wav": "./aishell/wav/test/S0914/BAC009S0914W0328.wav", "txt": "二零一一年美的空调达到五百亿元营收规模时"} -{"key": "BAC009S0914W0329", "wav": "./aishell/wav/test/S0914/BAC009S0914W0329.wav", "txt": "工人数量超过五万以上"} -{"key": "BAC009S0914W0330", "wav": "./aishell/wav/test/S0914/BAC009S0914W0330.wav", "txt": "空调业务总营收接近七百亿元"} -{"key": "BAC009S0914W0331", "wav": "./aishell/wav/test/S0914/BAC009S0914W0331.wav", "txt": "工人数量已经缩减至二点六万人"} -{"key": "BAC009S0914W0332", "wav": "./aishell/wav/test/S0914/BAC009S0914W0332.wav", "txt": "除了在顺德工厂建成全自动遥控器生产线外"} -{"key": "BAC009S0914W0333", "wav": "./aishell/wav/test/S0914/BAC009S0914W0333.wav", "txt": "美的空调还在其他地区工厂建有三条全自动生产线"} 
-{"key": "BAC009S0914W0334", "wav": "./aishell/wav/test/S0914/BAC009S0914W0334.wav", "txt": "经过前几年自动化生产线升级改造"} -{"key": "BAC009S0914W0335", "wav": "./aishell/wav/test/S0914/BAC009S0914W0335.wav", "txt": "美的空调工厂的注塑车间"} -{"key": "BAC009S0914W0336", "wav": "./aishell/wav/test/S0914/BAC009S0914W0336.wav", "txt": "在无开灯照明的情况下也能正常稳定运行"} -{"key": "BAC009S0914W0337", "wav": "./aishell/wav/test/S0914/BAC009S0914W0337.wav", "txt": "钣金冲压已实现无人运行"} -{"key": "BAC009S0914W0338", "wav": "./aishell/wav/test/S0914/BAC009S0914W0338.wav", "txt": "而在昨天对阵古巴队的比赛中"} -{"key": "BAC009S0914W0339", "wav": "./aishell/wav/test/S0914/BAC009S0914W0339.wav", "txt": "中国队教练组还是做出了让朱婷继续休战的抉择"} -{"key": "BAC009S0914W0340", "wav": "./aishell/wav/test/S0914/BAC009S0914W0340.wav", "txt": "来自北汽女排的主攻手刘晓彤取代朱婷的位置首发出场"} -{"key": "BAC009S0914W0341", "wav": "./aishell/wav/test/S0914/BAC009S0914W0341.wav", "txt": "除了第一局在开局阶段古巴队一度领先外"} -{"key": "BAC009S0914W0342", "wav": "./aishell/wav/test/S0914/BAC009S0914W0342.wav", "txt": "比赛的节奏始终被中国队控制在手中"} -{"key": "BAC009S0914W0343", "wav": "./aishell/wav/test/S0914/BAC009S0914W0343.wav", "txt": "中国队直落三局零封对手"} -{"key": "BAC009S0914W0344", "wav": "./aishell/wav/test/S0914/BAC009S0914W0344.wav", "txt": "曾春蕾和张常宁均拿到十六分"} -{"key": "BAC009S0914W0345", "wav": "./aishell/wav/test/S0914/BAC009S0914W0345.wav", "txt": "俄罗斯美国和日本三队均零封对手"} -{"key": "BAC009S0914W0346", "wav": "./aishell/wav/test/S0914/BAC009S0914W0346.wav", "txt": "此轮战罢后积分榜前四名排位没有任何变化"} -{"key": "BAC009S0914W0347", "wav": "./aishell/wav/test/S0914/BAC009S0914W0347.wav", "txt": "俄罗斯队十七分居榜首"} -{"key": "BAC009S0914W0348", "wav": "./aishell/wav/test/S0914/BAC009S0914W0348.wav", "txt": "美国队十六分排第二"} -{"key": "BAC009S0914W0349", "wav": "./aishell/wav/test/S0914/BAC009S0914W0349.wav", "txt": "日本和中国同积十五分"} -{"key": "BAC009S0914W0350", "wav": "./aishell/wav/test/S0914/BAC009S0914W0350.wav", "txt": "日本以小分优势暂列第三位"} -{"key": "BAC009S0914W0351", "wav": "./aishell/wav/test/S0914/BAC009S0914W0351.wav", "txt": "中国队将迎战冈山赛区的第二个对手肯尼亚队"} -{"key": "BAC009S0914W0352", "wav": "./aishell/wav/test/S0914/BAC009S0914W0352.wav", "txt": "中国女排昨天下午在松本迎战韩国队"} -{"key": "BAC009S0914W0353", "wav": "./aishell/wav/test/S0914/BAC009S0914W0353.wav", "txt": "主攻手朱婷不慎扭伤脚踝后依然带伤奋战"} -{"key": "BAC009S0914W0354", "wav": "./aishell/wav/test/S0914/BAC009S0914W0354.wav", "txt": "最终中国队以三比一力战韩国队全取三分"} -{"key": "BAC009S0914W0355", "wav": "./aishell/wav/test/S0914/BAC009S0914W0355.wav", "txt": "中韩之战中国队首发再次变阵"} -{"key": "BAC009S0914W0356", "wav": "./aishell/wav/test/S0914/BAC009S0914W0356.wav", "txt": "二传丁霞和主攻刘晏含取代了沈静思和张常宁的位置"} -{"key": "BAC009S0914W0357", "wav": "./aishell/wav/test/S0914/BAC009S0914W0357.wav", "txt": "张常宁则取代曾春蕾站在接应的位置上"} -{"key": "BAC009S0914W0358", "wav": "./aishell/wav/test/S0914/BAC009S0914W0358.wav", "txt": "中国队迅速调整阵容"} -{"key": "BAC009S0914W0359", "wav": "./aishell/wav/test/S0914/BAC009S0914W0359.wav", "txt": "逐渐控制住了局面并连扳两局以二比一优先"} -{"key": "BAC009S0914W0360", "wav": "./aishell/wav/test/S0914/BAC009S0914W0360.wav", "txt": "关键的第四局一开始中国队便发生了意外"} -{"key": "BAC009S0914W0361", "wav": "./aishell/wav/test/S0914/BAC009S0914W0361.wav", "txt": "一脸痛苦的朱婷当即被换下场"} -{"key": "BAC009S0914W0362", "wav": "./aishell/wav/test/S0914/BAC009S0914W0362.wav", "txt": "失去了最稳定的得分手之后"} -{"key": "BAC009S0914W0363", "wav": "./aishell/wav/test/S0914/BAC009S0914W0363.wav", "txt": "中国队进攻火力明显减弱"} -{"key": "BAC009S0914W0364", "wav": "./aishell/wav/test/S0914/BAC009S0914W0364.wav", "txt": "而看到了希望的韩国队也趁机拼命反击"} -{"key": "BAC009S0914W0365", "wav": "./aishell/wav/test/S0914/BAC009S0914W0365.wav", "txt": 
"当打到一三比一四中国队落后一分时"} -{"key": "BAC009S0914W0366", "wav": "./aishell/wav/test/S0914/BAC009S0914W0366.wav", "txt": "在场下接受完队医高压包扎后的朱婷请命上场"} -{"key": "BAC009S0914W0367", "wav": "./aishell/wav/test/S0914/BAC009S0914W0367.wav", "txt": "虽然扣球落地后朱婷依然一瘸一拐"} -{"key": "BAC009S0914W0368", "wav": "./aishell/wav/test/S0914/BAC009S0914W0368.wav", "txt": "见此情景韩国队的信心受到了打击"} -{"key": "BAC009S0914W0369", "wav": "./aishell/wav/test/S0914/BAC009S0914W0369.wav", "txt": "尽管也一度以二一比一七领先四分之多"} -{"key": "BAC009S0914W0370", "wav": "./aishell/wav/test/S0914/BAC009S0914W0370.wav", "txt": "但朱婷与队友们合力打出了一波八比二的高潮"} -{"key": "BAC009S0914W0371", "wav": "./aishell/wav/test/S0914/BAC009S0914W0371.wav", "txt": "最终中国队以二五比二三拿下第四局"} -{"key": "BAC009S0914W0372", "wav": "./aishell/wav/test/S0914/BAC009S0914W0372.wav", "txt": "以三比一胜出拿到了宝贵的三个积分"} -{"key": "BAC009S0914W0373", "wav": "./aishell/wav/test/S0914/BAC009S0914W0373.wav", "txt": "俄俄罗斯队以全胜战绩列积分榜首位"} -{"key": "BAC009S0914W0374", "wav": "./aishell/wav/test/S0914/BAC009S0914W0374.wav", "txt": "日本与美国同积十十分暂列二三两位"} -{"key": "BAC009S0914W0375", "wav": "./aishell/wav/test/S0914/BAC009S0914W0375.wav", "txt": "中国和多米尼加同积九分排在第四和第五位"} -{"key": "BAC009S0914W0376", "wav": "./aishell/wav/test/S0914/BAC009S0914W0376.wav", "txt": "今天中国队将迎战第一阶段的最后一个对手秘鲁队"} -{"key": "BAC009S0914W0377", "wav": "./aishell/wav/test/S0914/BAC009S0914W0377.wav", "txt": "中国女排三十一日本四夺世界杯冠军直通里约奥运"} -{"key": "BAC009S0914W0378", "wav": "./aishell/wav/test/S0914/BAC009S0914W0378.wav", "txt": "女排三十一日本进军里约众将欢度欢庆"} -{"key": "BAC009S0914W0379", "wav": "./aishell/wav/test/S0914/BAC009S0914W0379.wav", "txt": "夺冠的同时摘得本次世界杯的冠军"} -{"key": "BAC009S0914W0380", "wav": "./aishell/wav/test/S0914/BAC009S0914W0380.wav", "txt": "同时拿到了明年里约奥运会的入场券"} -{"key": "BAC009S0914W0381", "wav": "./aishell/wav/test/S0914/BAC009S0914W0381.wav", "txt": "是全场得分最高的运动员"} -{"key": "BAC009S0914W0382", "wav": "./aishell/wav/test/S0914/BAC009S0914W0382.wav", "txt": "也让这位一九九四年出生的河南妹子"} -{"key": "BAC009S0914W0383", "wav": "./aishell/wav/test/S0914/BAC009S0914W0383.wav", "txt": "逐步成长为中国女排的新核心"} -{"key": "BAC009S0914W0385", "wav": "./aishell/wav/test/S0914/BAC009S0914W0385.wav", "txt": "在今年的亚锦赛夺冠后"} -{"key": "BAC009S0914W0386", "wav": "./aishell/wav/test/S0914/BAC009S0914W0386.wav", "txt": "关于中国队过于依赖朱婷的言论不少"} -{"key": "BAC009S0914W0387", "wav": "./aishell/wav/test/S0914/BAC009S0914W0387.wav", "txt": "本赛季调进张常宁就是郎平为朱婷解压的一个表现"} -{"key": "BAC009S0914W0388", "wav": "./aishell/wav/test/S0914/BAC009S0914W0388.wav", "txt": "加上惠若琪因伤缺席本届世界杯"} -{"key": "BAC009S0914W0389", "wav": "./aishell/wav/test/S0914/BAC009S0914W0389.wav", "txt": "张常宁的幼稚嫩显然还不能立即挑起大梁"} -{"key": "BAC009S0914W0390", "wav": "./aishell/wav/test/S0914/BAC009S0914W0390.wav", "txt": "这支女排的暴露性强攻基本上都是靠朱婷打"} -{"key": "BAC009S0914W0391", "wav": "./aishell/wav/test/S0914/BAC009S0914W0391.wav", "txt": "郎平也认为这样去打世界高水平的球队是不够的"} -{"key": "BAC009S0914W0392", "wav": "./aishell/wav/test/S0914/BAC009S0914W0392.wav", "txt": "在目前中国队的阵容中"} -{"key": "BAC009S0914W0393", "wav": "./aishell/wav/test/S0914/BAC009S0914W0393.wav", "txt": "霸气外露的朱婷是不可或缺的绝对核心"} -{"key": "BAC009S0914W0394", "wav": "./aishell/wav/test/S0914/BAC009S0914W0394.wav", "txt": "在队长惠若琪缺阵的情况下"} -{"key": "BAC009S0914W0395", "wav": "./aishell/wav/test/S0914/BAC009S0914W0395.wav", "txt": "她几乎担当起了场上进攻加振奋士气的主力作用"} -{"key": "BAC009S0914W0396", "wav": "./aishell/wav/test/S0914/BAC009S0914W0396.wav", "txt": "半决赛对阵俄罗斯的比赛中"} -{"key": "BAC009S0914W0397", "wav": "./aishell/wav/test/S0914/BAC009S0914W0397.wav", "txt": "朱婷全场夺得二十九分"} -{"key": "BAC009S0914W0398", "wav": 
"./aishell/wav/test/S0914/BAC009S0914W0398.wav", "txt": "在俄罗斯队的严密拦防下"} -{"key": "BAC009S0914W0399", "wav": "./aishell/wav/test/S0914/BAC009S0914W0399.wav", "txt": "进攻成功率达到百分之五十六点七六拦网"} -{"key": "BAC009S0914W0400", "wav": "./aishell/wav/test/S0914/BAC009S0914W0400.wav", "txt": "朱婷得到七分同样全队最高"} -{"key": "BAC009S0914W0401", "wav": "./aishell/wav/test/S0914/BAC009S0914W0401.wav", "txt": "作为一个主攻手非常不易"} -{"key": "BAC009S0914W0402", "wav": "./aishell/wav/test/S0914/BAC009S0914W0402.wav", "txt": "与几乎不接一传的科舍列娃相比"} -{"key": "BAC009S0914W0403", "wav": "./aishell/wav/test/S0914/BAC009S0914W0403.wav", "txt": "朱婷的任务更重效率更高"} -{"key": "BAC009S0914W0404", "wav": "./aishell/wav/test/S0914/BAC009S0914W0404.wav", "txt": "提前一周至二零一六年六月二十四日"} -{"key": "BAC009S0914W0405", "wav": "./aishell/wav/test/S0914/BAC009S0914W0405.wav", "txt": "避免和新木乃伊正面较量"} -{"key": "BAC009S0914W0406", "wav": "./aishell/wav/test/S0914/BAC009S0914W0406.wav", "txt": "来源时光网美国时间本周一"} -{"key": "BAC009S0914W0407", "wav": "./aishell/wav/test/S0914/BAC009S0914W0407.wav", "txt": "二十世纪福斯影业公布一批新片的档期"} -{"key": "BAC009S0914W0408", "wav": "./aishell/wav/test/S0914/BAC009S0914W0408.wav", "txt": "晚上二十零下"} -{"key": "BAC009S0914W0409", "wav": "./aishell/wav/test/S0914/BAC009S0914W0409.wav", "txt": "马甲线啊马甲线"} -{"key": "BAC009S0914W0410", "wav": "./aishell/wav/test/S0914/BAC009S0914W0410.wav", "txt": "力证自己没有怀孕"} -{"key": "BAC009S0914W0411", "wav": "./aishell/wav/test/S0914/BAC009S0914W0411.wav", "txt": "网友纷纷调侃道为了辟谣怀孕也是蛮拼的"} -{"key": "BAC009S0914W0412", "wav": "./aishell/wav/test/S0914/BAC009S0914W0412.wav", "txt": "哈哈哈第一次见人用这种方式证明自己没怀孕"} -{"key": "BAC009S0914W0413", "wav": "./aishell/wav/test/S0914/BAC009S0914W0413.wav", "txt": "搜狐娱乐讯九月六日"} -{"key": "BAC009S0914W0414", "wav": "./aishell/wav/test/S0914/BAC009S0914W0414.wav", "txt": "陈妍希晒出一组攀岩照"} -{"key": "BAC009S0914W0415", "wav": "./aishell/wav/test/S0914/BAC009S0914W0415.wav", "txt": "并称攀岩太难会晃"} -{"key": "BAC009S0914W0416", "wav": "./aishell/wav/test/S0914/BAC009S0914W0416.wav", "txt": "不抓紧会被撞到地上"} -{"key": "BAC009S0914W0417", "wav": "./aishell/wav/test/S0914/BAC009S0914W0417.wav", "txt": "希饭快来接住我"} -{"key": "BAC009S0914W0418", "wav": "./aishell/wav/test/S0914/BAC009S0914W0418.wav", "txt": "陈妍希穿着粉色上衣"} -{"key": "BAC009S0914W0419", "wav": "./aishell/wav/test/S0914/BAC009S0914W0419.wav", "txt": "头发随意披在脑后"} -{"key": "BAC009S0914W0420", "wav": "./aishell/wav/test/S0914/BAC009S0914W0420.wav", "txt": "手脚并用努力向往上爬"} -{"key": "BAC009S0914W0421", "wav": "./aishell/wav/test/S0914/BAC009S0914W0421.wav", "txt": "似乎已过了第三关"} -{"key": "BAC009S0914W0422", "wav": "./aishell/wav/test/S0914/BAC009S0914W0422.wav", "txt": "如此高难度的动作"} -{"key": "BAC009S0914W0423", "wav": "./aishell/wav/test/S0914/BAC009S0914W0423.wav", "txt": "再次身体力行地辟谣怀孕传闻"} -{"key": "BAC009S0914W0424", "wav": "./aishell/wav/test/S0914/BAC009S0914W0424.wav", "txt": "搜狐娱乐讯近日频频传出陈晓向陈妍希求婚成功的消息"} -{"key": "BAC009S0914W0425", "wav": "./aishell/wav/test/S0914/BAC009S0914W0425.wav", "txt": "陈妍希回应现在真的很享受快乐恋爱的喜悦"} -{"key": "BAC009S0914W0426", "wav": "./aishell/wav/test/S0914/BAC009S0914W0426.wav", "txt": "有进一步消息一定会通知大家"} -{"key": "BAC009S0914W0427", "wav": "./aishell/wav/test/S0914/BAC009S0914W0427.wav", "txt": "中新网七月二十二日电据台湾东森新闻消息"} -{"key": "BAC009S0914W0428", "wav": "./aishell/wav/test/S0914/BAC009S0914W0428.wav", "txt": "陈妍希曾在新版神鵰侠侣中演小龙女"} -{"key": "BAC009S0914W0429", "wav": "./aishell/wav/test/S0914/BAC009S0914W0429.wav", "txt": "被网友调侃是小笼包"} -{"key": "BAC009S0914W0430", "wav": "./aishell/wav/test/S0914/BAC009S0914W0430.wav", "txt": "尽管她努力瘦身"} -{"key": 
"BAC009S0914W0431", "wav": "./aishell/wav/test/S0914/BAC009S0914W0431.wav", "txt": "当事网友疑遭遇克隆车"} -{"key": "BAC009S0914W0432", "wav": "./aishell/wav/test/S0914/BAC009S0914W0432.wav", "txt": "经调查核实相关情况"} -{"key": "BAC009S0914W0433", "wav": "./aishell/wav/test/S0914/BAC009S0914W0433.wav", "txt": "游客抢订冬奥运旅游团因遭遇订票难住房等"} -{"key": "BAC009S0914W0434", "wav": "./aishell/wav/test/S0914/BAC009S0914W0434.wav", "txt": "北京冬奥会刚刚申办成功"} -{"key": "BAC009S0914W0435", "wav": "./aishell/wav/test/S0914/BAC009S0914W0435.wav", "txt": "已经有游客迫不及待想去张家口看看了"} -{"key": "BAC009S0914W0436", "wav": "./aishell/wav/test/S0914/BAC009S0914W0436.wav", "txt": "游客摔断腿旅游社赔三成因旅游时未尽提示义务"} -{"key": "BAC009S0914W0437", "wav": "./aishell/wav/test/S0914/BAC009S0914W0437.wav", "txt": "游客日照海鲜店被打受伤警方称言语冲突引发互殴"} -{"key": "BAC009S0914W0438", "wav": "./aishell/wav/test/S0914/BAC009S0914W0438.wav", "txt": "京华时报讯记者卫张宁昨天上午"} -{"key": "BAC009S0914W0439", "wav": "./aishell/wav/test/S0914/BAC009S0914W0439.wav", "txt": "自己和家人因点的海鲜较少"} -{"key": "BAC009S0914W0440", "wav": "./aishell/wav/test/S0914/BAC009S0914W0440.wav", "txt": "并被店主及店员辱骂围殴"} -{"key": "BAC009S0914W0441", "wav": "./aishell/wav/test/S0914/BAC009S0914W0441.wav", "txt": "当时游客出言不逊在先"} -{"key": "BAC009S0914W0442", "wav": "./aishell/wav/test/S0914/BAC009S0914W0442.wav", "txt": "并未将游客脱光衣服殴打"} -{"key": "BAC009S0914W0443", "wav": "./aishell/wav/test/S0914/BAC009S0914W0443.wav", "txt": "日照市公安局官方发布消息"} -{"key": "BAC009S0914W0444", "wav": "./aishell/wav/test/S0914/BAC009S0914W0444.wav", "txt": "称事件系点餐过程中"} -{"key": "BAC009S0914W0445", "wav": "./aishell/wav/test/S0914/BAC009S0914W0445.wav", "txt": "双方发生语言冲突后进行互殴"} -{"key": "BAC009S0914W0446", "wav": "./aishell/wav/test/S0914/BAC009S0914W0446.wav", "txt": "已依法对双方进行处罚"} -{"key": "BAC009S0914W0447", "wav": "./aishell/wav/test/S0914/BAC009S0914W0447.wav", "txt": "游客晋吉岛乘船颠骨折诉旅社索赔二零馀万元"} -{"key": "BAC009S0914W0448", "wav": "./aishell/wav/test/S0914/BAC009S0914W0448.wav", "txt": "本来一家人出国旅游挺高兴的"} -{"key": "BAC009S0914W0449", "wav": "./aishell/wav/test/S0914/BAC009S0914W0449.wav", "txt": "可是我遇见这事还不够添堵的呢"} -{"key": "BAC009S0914W0450", "wav": "./aishell/wav/test/S0914/BAC009S0914W0450.wav", "txt": "崔先生带家人随团前往泰国晋吉岛游玩"} -{"key": "BAC009S0914W0451", "wav": "./aishell/wav/test/S0914/BAC009S0914W0451.wav", "txt": "导致崔先生腰部受伤"} -{"key": "BAC009S0914W0452", "wav": "./aishell/wav/test/S0914/BAC009S0914W0452.wav", "txt": "回国后被确诊为腰部骨折"} -{"key": "BAC009S0914W0453", "wav": "./aishell/wav/test/S0914/BAC009S0914W0453.wav", "txt": "将接团的两家旅行社起诉至法院"} -{"key": "BAC009S0914W0454", "wav": "./aishell/wav/test/S0914/BAC009S0914W0454.wav", "txt": "索赔各项损失共计二零馀万元"} -{"key": "BAC009S0914W0455", "wav": "./aishell/wav/test/S0914/BAC009S0914W0455.wav", "txt": "昌平法院开庭审理了这起案件"} -{"key": "BAC009S0914W0456", "wav": "./aishell/wav/test/S0914/BAC009S0914W0456.wav", "txt": "游客景区被忽悠八零零克石斛收费一二六零零元"} -{"key": "BAC009S0914W0457", "wav": "./aishell/wav/test/S0914/BAC009S0914W0457.wav", "txt": "滕女士在云南购买的石斛"} -{"key": "BAC009S0914W0458", "wav": "./aishell/wav/test/S0914/BAC009S0914W0458.wav", "txt": "游客武夷山就餐麝香肉结账要四八元一两"} -{"key": "BAC009S0914W0459", "wav": "./aishell/wav/test/S0914/BAC009S0914W0459.wav", "txt": "旅游点餐时与海鲜店主起争执互殴二人被行政拘留"} -{"key": "BAC009S0914W0460", "wav": "./aishell/wav/test/S0914/BAC009S0914W0460.wav", "txt": "新京报讯记者林斐然近日"} -{"key": "BAC009S0914W0461", "wav": "./aishell/wav/test/S0914/BAC009S0914W0461.wav", "txt": "有网友反映前往山东日照一海排档点海鲜时"} -{"key": "BAC009S0914W0462", "wav": "./aishell/wav/test/S0914/BAC009S0914W0462.wav", "txt": "该事件系游客点餐时嫌大排档太脏而引起口角纷"} -{"key": "BAC009S0914W0463", 
"wav": "./aishell/wav/test/S0914/BAC009S0914W0463.wav", "txt": "日照市公安局官方微博通报了这一事件的调查情况"} -{"key": "BAC009S0914W0464", "wav": "./aishell/wav/test/S0914/BAC009S0914W0464.wav", "txt": "双方因互殴均被行政拘留并处罚款"} -{"key": "BAC009S0914W0465", "wav": "./aishell/wav/test/S0914/BAC009S0914W0465.wav", "txt": "游客爬到峨眉山悬崖边石头上拍照"} -{"key": "BAC009S0914W0466", "wav": "./aishell/wav/test/S0914/BAC009S0914W0466.wav", "txt": "游客称点海鲜太少被当地媒体老板受伤更重"} -{"key": "BAC009S0914W0467", "wav": "./aishell/wav/test/S0914/BAC009S0914W0467.wav", "txt": "事情的真相完全不是这样的"} -{"key": "BAC009S0914W0468", "wav": "./aishell/wav/test/S0914/BAC009S0914W0468.wav", "txt": "大排档老板受伤更严重"} -{"key": "BAC009S0914W0469", "wav": "./aishell/wav/test/S0914/BAC009S0914W0469.wav", "txt": "起因也完全不是河南游客自己说的那样"} -{"key": "BAC009S0914W0470", "wav": "./aishell/wav/test/S0914/BAC009S0914W0470.wav", "txt": "希望警方尽快给出公平调查结果"} -{"key": "BAC009S0914W0471", "wav": "./aishell/wav/test/S0914/BAC009S0914W0471.wav", "txt": "游客称在山东日照只因点海鲜少全家遭殴打恐吓"} -{"key": "BAC009S0914W0472", "wav": "./aishell/wav/test/S0914/BAC009S0914W0472.wav", "txt": "并最新发微博表示当地警方已介入调查"} -{"key": "BAC009S0914W0473", "wav": "./aishell/wav/test/S0914/BAC009S0914W0473.wav", "txt": "游客突破八万人限流大关故宫首次提前禁止售票"} -{"key": "BAC009S0914W0474", "wav": "./aishell/wav/test/S0914/BAC009S0914W0474.wav", "txt": "新京报讯记者黄颖自七月六日进入暑期以来"} -{"key": "BAC009S0914W0475", "wav": "./aishell/wav/test/S0914/BAC009S0914W0475.wav", "txt": "故宫博物院接待的观众量也日益攀升"} -{"key": "BAC009S0914W0476", "wav": "./aishell/wav/test/S0914/BAC009S0914W0476.wav", "txt": "屡屡逼近八万人次的限流大关"} -{"key": "BAC009S0914W0477", "wav": "./aishell/wav/test/S0914/BAC009S0914W0477.wav", "txt": "故宫首次启动了起流起票限流措施"} -{"key": "BAC009S0914W0478", "wav": "./aishell/wav/test/S0914/BAC009S0914W0478.wav", "txt": "在馀票数量为售后现场关闭售票窗口"} -{"key": "BAC009S0914W0479", "wav": "./aishell/wav/test/S0914/BAC009S0914W0479.wav", "txt": "游客美签被废因访美停留太久称从没到过欧洲"} -{"key": "BAC009S0914W0480", "wav": "./aishell/wav/test/S0914/BAC009S0914W0480.wav", "txt": "而被美国海关移民官遣返"} -{"key": "BAC009S0914W0481", "wav": "./aishell/wav/test/S0914/BAC009S0914W0481.wav", "txt": "游客脚踩烈士铜像拍照四名当事人鞠躬道歉"} -{"key": "BAC009S0914W0482", "wav": "./aishell/wav/test/S0914/BAC009S0914W0482.wav", "txt": "四人鞠躬道歉据瓜沥人网"} -{"key": "BAC009S0914W0483", "wav": "./aishell/wav/test/S0914/BAC009S0914W0483.wav", "txt": "游客被黑导游拉进农家宴消费蘑菇炖鸡卖九零零元"} -{"key": "BAC009S0914W0484", "wav": "./aishell/wav/test/S0914/BAC009S0914W0484.wav", "txt": "其中一道蘑菇炖鸡收费近九零零元"} -{"key": "BAC009S0914W0485", "wav": "./aishell/wav/test/S0914/BAC009S0914W0485.wav", "txt": "看到该网友的曝光帖后"} -{"key": "BAC009S0914W0486", "wav": "./aishell/wav/test/S0914/BAC009S0914W0486.wav", "txt": "崂山景区勒令该农家宴停止停止营业"} -{"key": "BAC009S0914W0487", "wav": "./aishell/wav/test/S0914/BAC009S0914W0487.wav", "txt": "并索偿该游客全部损失"} -{"key": "BAC009S0914W0488", "wav": "./aishell/wav/test/S0914/BAC009S0914W0488.wav", "txt": "游客西安遭天价玛卡商家四零零零元一价合理"} -{"key": "BAC009S0914W0489", "wav": "./aishell/wav/test/S0914/BAC009S0914W0489.wav", "txt": "张先生购买的四零零元玛卡"} -{"key": "BAC009S0914W0490", "wav": "./aishell/wav/test/S0914/BAC009S0914W0490.wav", "txt": "内江人张先生在这次国庆期间"} -{"key": "BAC009S0914W0491", "wav": "./aishell/wav/test/S0914/BAC009S0914W0491.wav", "txt": "被导游介绍到一家购物点后"} -{"key": "BAC009S0914W0492", "wav": "./aishell/wav/test/S0914/BAC009S0914W0492.wav", "txt": "他被迫交了四零零元"} -{"key": "BAC009S0914W0493", "wav": "./aishell/wav/test/S0914/BAC009S0914W0493.wav", "txt": "这一斤玛卡其实价格只有一零零多元"} -{"key": "BAC009S0914W0494", "wav": "./aishell/wav/test/S0914/BAC009S0914W0494.wav", "txt": "一捧玛卡磨成粉景区商家要四零零零元"} -{"key": 
"BAC009S0914W0495", "wav": "./aishell/wav/test/S0914/BAC009S0914W0495.wav", "txt": "游客要退团张家界低价团导游称信不信你走走不了"} -{"key": "BAC009S0915W0121", "wav": "./aishell/wav/test/S0915/BAC009S0915W0121.wav", "txt": "从房地产的角度来看"} -{"key": "BAC009S0915W0122", "wav": "./aishell/wav/test/S0915/BAC009S0915W0122.wav", "txt": "这个政策的出台是希望刺激房地产投资"} -{"key": "BAC009S0915W0123", "wav": "./aishell/wav/test/S0915/BAC009S0915W0123.wav", "txt": "则是希望防止外资流出"} -{"key": "BAC009S0915W0124", "wav": "./aishell/wav/test/S0915/BAC009S0915W0124.wav", "txt": "国家统计局公布的数据显示"} -{"key": "BAC009S0915W0125", "wav": "./aishell/wav/test/S0915/BAC009S0915W0125.wav", "txt": "今年一到七月全国房地产开发投资五万亿元"} -{"key": "BAC009S0915W0126", "wav": "./aishell/wav/test/S0915/BAC009S0915W0126.wav", "txt": "增速比一到六月回落一个百分点"} -{"key": "BAC009S0915W0127", "wav": "./aishell/wav/test/S0915/BAC009S0915W0127.wav", "txt": "开发商投资增速处于不断下降的状态"} -{"key": "BAC009S0915W0128", "wav": "./aishell/wav/test/S0915/BAC009S0915W0128.wav", "txt": "市场开发也呈降温态势"} -{"key": "BAC009S0915W0129", "wav": "./aishell/wav/test/S0915/BAC009S0915W0129.wav", "txt": "此次出台的新政虽然放宽了条件"} -{"key": "BAC009S0915W0130", "wav": "./aishell/wav/test/S0915/BAC009S0915W0130.wav", "txt": "但对于实施住房限购政策的城市"} -{"key": "BAC009S0915W0131", "wav": "./aishell/wav/test/S0915/BAC009S0915W0131.wav", "txt": "境外个人购房依然需要符合当地政策规定"} -{"key": "BAC009S0915W0132", "wav": "./aishell/wav/test/S0915/BAC009S0915W0132.wav", "txt": "境外机构和个人在中国投资购买房地产的限制放松"} -{"key": "BAC009S0915W0133", "wav": "./aishell/wav/test/S0915/BAC009S0915W0133.wav", "txt": "兰州房地产市场回暖销量增加价格微涨"} -{"key": "BAC009S0915W0134", "wav": "./aishell/wav/test/S0915/BAC009S0915W0134.wav", "txt": "自二夏季开始"} -{"key": "BAC009S0915W0135", "wav": "./aishell/wav/test/S0915/BAC009S0915W0135.wav", "txt": "得益于一系列稳定房地产市场的措施"} -{"key": "BAC009S0915W0136", "wav": "./aishell/wav/test/S0915/BAC009S0915W0136.wav", "txt": "兰州房地产市场销量增加明显"} -{"key": "BAC009S0915W0137", "wav": "./aishell/wav/test/S0915/BAC009S0915W0137.wav", "txt": "一些楼盘新房价格出现微涨"} -{"key": "BAC009S0915W0138", "wav": "./aishell/wav/test/S0915/BAC009S0915W0138.wav", "txt": "较上月环比上涨百分之五"} -{"key": "BAC009S0915W0139", "wav": "./aishell/wav/test/S0915/BAC009S0915W0139.wav", "txt": "这也是该指数连续三个月出现上涨"} -{"key": "BAC009S0915W0140", "wav": "./aishell/wav/test/S0915/BAC009S0915W0140.wav", "txt": "而在多时间里"} -{"key": "BAC009S0915W0141", "wav": "./aishell/wav/test/S0915/BAC009S0915W0141.wav", "txt": "兰州新建住宅价格均呈现微降的态势"} -{"key": "BAC009S0915W0142", "wav": "./aishell/wav/test/S0915/BAC009S0915W0142.wav", "txt": "兰州楼市出现明显的区域分化"} -{"key": "BAC009S0915W0143", "wav": "./aishell/wav/test/S0915/BAC009S0915W0143.wav", "txt": "兰州市中心城区的一些楼盘"} -{"key": "BAC009S0915W0144", "wav": "./aishell/wav/test/S0915/BAC009S0915W0144.wav", "txt": "自今年初至今上涨幅度超过了十百分之"} -{"key": "BAC009S0915W0145", "wav": "./aishell/wav/test/S0915/BAC009S0915W0145.wav", "txt": "可由于中心城区楼盘数量稀少"} -{"key": "BAC009S0915W0146", "wav": "./aishell/wav/test/S0915/BAC009S0915W0146.wav", "txt": "在兰州雁滩区域的一家楼盘"} -{"key": "BAC009S0915W0147", "wav": "./aishell/wav/test/S0915/BAC009S0915W0147.wav", "txt": "而在兰州市新开楼盘集中的城郊区域"} -{"key": "BAC009S0915W0148", "wav": "./aishell/wav/test/S0915/BAC009S0915W0148.wav", "txt": "但房企调价幅度有限"} -{"key": "BAC009S0915W0149", "wav": "./aishell/wav/test/S0915/BAC009S0915W0149.wav", "txt": "由于商品房供应量充足"} -{"key": "BAC009S0915W0150", "wav": "./aishell/wav/test/S0915/BAC009S0915W0150.wav", "txt": "多个楼盘仍然采取的是低价走量的策略"} -{"key": "BAC009S0915W0151", "wav": "./aishell/wav/test/S0915/BAC009S0915W0151.wav", "txt": "在兰州市北岸由广东房企开发的一个大型楼盘里"} -{"key": "BAC009S0915W0152", "wav": 
"./aishell/wav/test/S0915/BAC009S0915W0152.wav", "txt": "但房价从七月至今上涨幅度仅为百分之二左右"} -{"key": "BAC009S0915W0153", "wav": "./aishell/wav/test/S0915/BAC009S0915W0153.wav", "txt": "今兰州市商品房销售面积同比上涨超过百分之三十"} -{"key": "BAC009S0915W0154", "wav": "./aishell/wav/test/S0915/BAC009S0915W0154.wav", "txt": "商品房销售额同比上涨超过了百分之四十"} -{"key": "BAC009S0915W0155", "wav": "./aishell/wav/test/S0915/BAC009S0915W0155.wav", "txt": "许多刚性住房和改善型住房需求得到释放"} -{"key": "BAC009S0915W0156", "wav": "./aishell/wav/test/S0915/BAC009S0915W0156.wav", "txt": "兰州房地产市场存在持续上涨可能"} -{"key": "BAC009S0915W0157", "wav": "./aishell/wav/test/S0915/BAC009S0915W0157.wav", "txt": "但由于房地产市场供给仍然不仍然充足"} -{"key": "BAC009S0915W0158", "wav": "./aishell/wav/test/S0915/BAC009S0915W0158.wav", "txt": "自二夏季开始"} -{"key": "BAC009S0915W0159", "wav": "./aishell/wav/test/S0915/BAC009S0915W0159.wav", "txt": "得益于一系列稳定房地产市场的措施"} -{"key": "BAC009S0915W0160", "wav": "./aishell/wav/test/S0915/BAC009S0915W0160.wav", "txt": "兰州房地产市场销量增加明显"} -{"key": "BAC009S0915W0161", "wav": "./aishell/wav/test/S0915/BAC009S0915W0161.wav", "txt": "而且提供各项衍生的福利性服务"} -{"key": "BAC009S0915W0162", "wav": "./aishell/wav/test/S0915/BAC009S0915W0162.wav", "txt": "中新网十月二十一日前"} -{"key": "BAC009S0915W0163", "wav": "./aishell/wav/test/S0915/BAC009S0915W0163.wav", "txt": "北京又一家共享创办公平台落地丰台"} -{"key": "BAC009S0915W0164", "wav": "./aishell/wav/test/S0915/BAC009S0915W0164.wav", "txt": "借全国大众创业万众创新活动周启动之势"} -{"key": "BAC009S0915W0166", "wav": "./aishell/wav/test/S0915/BAC009S0915W0166.wav", "txt": "将生活社区与科技园区两种空间组织融合"} -{"key": "BAC009S0915W0167", "wav": "./aishell/wav/test/S0915/BAC009S0915W0167.wav", "txt": "作为美国新型共享式办公与创新环境的运营品牌"} -{"key": "BAC009S0915W0168", "wav": "./aishell/wav/test/S0915/BAC009S0915W0168.wav", "txt": "是国际上合作性办公品牌的代表"} -{"key": "BAC009S0915W0169", "wav": "./aishell/wav/test/S0915/BAC009S0915W0169.wav", "txt": "由此拉开了跨境共享创新生态平台化发展的新时代"} -{"key": "BAC009S0915W0170", "wav": "./aishell/wav/test/S0915/BAC009S0915W0170.wav", "txt": "而且提供各项行生的福利性服务"} -{"key": "BAC009S0915W0171", "wav": "./aishell/wav/test/S0915/BAC009S0915W0171.wav", "txt": "帮助创新创业者聚合各方面资源"} -{"key": "BAC009S0915W0172", "wav": "./aishell/wav/test/S0915/BAC009S0915W0172.wav", "txt": "旨在帮助小型企业降低运运营成本"} -{"key": "BAC009S0915W0174", "wav": "./aishell/wav/test/S0915/BAC009S0915W0174.wav", "txt": "从创业者真正的需求出发"} -{"key": "BAC009S0915W0175", "wav": "./aishell/wav/test/S0915/BAC009S0915W0175.wav", "txt": "石榴中心位于丰台区宋家庄交通枢纽商圈"} -{"key": "BAC009S0915W0176", "wav": "./aishell/wav/test/S0915/BAC009S0915W0176.wav", "txt": "可以北京四环内唯一的国际化共享办公园区"} -{"key": "BAC009S0915W0177", "wav": "./aishell/wav/test/S0915/BAC009S0915W0177.wav", "txt": "园区总建筑面积一万平方米"} -{"key": "BAC009S0915W0178", "wav": "./aishell/wav/test/S0915/BAC009S0915W0178.wav", "txt": "其中地上一万平方米"} -{"key": "BAC009S0915W0179", "wav": "./aishell/wav/test/S0915/BAC009S0915W0179.wav", "txt": "地下一万平方米"} -{"key": "BAC009S0915W0180", "wav": "./aishell/wav/test/S0915/BAC009S0915W0180.wav", "txt": "由二十二栋企业独栋和二栋二十层的五a级写字楼组成"} -{"key": "BAC009S0915W0181", "wav": "./aishell/wav/test/S0915/BAC009S0915W0181.wav", "txt": "而且提供各项行生的福利性服务"} -{"key": "BAC009S0915W0182", "wav": "./aishell/wav/test/S0915/BAC009S0915W0182.wav", "txt": "中新网十月二十日前"} -{"key": "BAC009S0915W0184", "wav": "./aishell/wav/test/S0915/BAC009S0915W0184.wav", "txt": "关于智能家居你必须懂的五件事"} -{"key": "BAC009S0915W0185", "wav": "./aishell/wav/test/S0915/BAC009S0915W0185.wav", "txt": "智能家居概念的炒作"} -{"key": "BAC009S0915W0186", "wav": "./aishell/wav/test/S0915/BAC009S0915W0186.wav", "txt": "这是自媒体时代的胜利"} -{"key": "BAC009S0915W0187", "wav": 
"./aishell/wav/test/S0915/BAC009S0915W0187.wav", "txt": "将明确政策界限和工作机制以知以及部门分工"} -{"key": "BAC009S0915W0188", "wav": "./aishell/wav/test/S0915/BAC009S0915W0188.wav", "txt": "证监会将协同有关部门落实相关工作"} -{"key": "BAC009S0915W0189", "wav": "./aishell/wav/test/S0915/BAC009S0915W0189.wav", "txt": "公共娱乐场所清理整顿"} -{"key": "BAC009S0915W0190", "wav": "./aishell/wav/test/S0915/BAC009S0915W0190.wav", "txt": "燃气经经营市场清理整顿"} -{"key": "BAC009S0915W0191", "wav": "./aishell/wav/test/S0915/BAC009S0915W0191.wav", "txt": "行业协会清理整顿报告"} -{"key": "BAC009S0915W0192", "wav": "./aishell/wav/test/S0915/BAC009S0915W0192.wav", "txt": "国务院已批准信贷资产证券化继续扩大试点"} -{"key": "BAC009S0915W0193", "wav": "./aishell/wav/test/S0915/BAC009S0915W0193.wav", "txt": "多方面原因造成今年上半年部分中小企业生产经营困难"} -{"key": "BAC009S0915W0194", "wav": "./aishell/wav/test/S0915/BAC009S0915W0194.wav", "txt": "但没有出现大范围趋势性的破产倒闭"} -{"key": "BAC009S0915W0195", "wav": "./aishell/wav/test/S0915/BAC009S0915W0195.wav", "txt": "部分中小企业国内生产成本有所提高"} -{"key": "BAC009S0915W0196", "wav": "./aishell/wav/test/S0915/BAC009S0915W0196.wav", "txt": "这主要有四方面原因"} -{"key": "BAC009S0915W0197", "wav": "./aishell/wav/test/S0915/BAC009S0915W0197.wav", "txt": "中小企业经营困难"} -{"key": "BAC009S0915W0198", "wav": "./aishell/wav/test/S0915/BAC009S0915W0198.wav", "txt": "既是信贷投放回归常态的体现"} -{"key": "BAC009S0915W0199", "wav": "./aishell/wav/test/S0915/BAC009S0915W0199.wav", "txt": "也是国家淘汰落后产能加快产业升级宏观政策的体现"} -{"key": "BAC009S0915W0200", "wav": "./aishell/wav/test/S0915/BAC009S0915W0200.wav", "txt": "对于中小企业的支持政策"} -{"key": "BAC009S0915W0201", "wav": "./aishell/wav/test/S0915/BAC009S0915W0201.wav", "txt": "国务院已经批准信贷资产证券化继续扩大试点"} -{"key": "BAC009S0915W0202", "wav": "./aishell/wav/test/S0915/BAC009S0915W0202.wav", "txt": "转化成由资产产生的现金流作担保可自由流通的证券"} -{"key": "BAC009S0915W0203", "wav": "./aishell/wav/test/S0915/BAC009S0915W0203.wav", "txt": "销售给资本市场投资者的一种融资方式"} -{"key": "BAC009S0915W0204", "wav": "./aishell/wav/test/S0915/BAC009S0915W0204.wav", "txt": "目前我国正在稳步开展中小企业信贷资产证券化试点"} -{"key": "BAC009S0915W0205", "wav": "./aishell/wav/test/S0915/BAC009S0915W0205.wav", "txt": "为加快发展银行间债券市场"} -{"key": "BAC009S0915W0206", "wav": "./aishell/wav/test/S0915/BAC009S0915W0206.wav", "txt": "对中小企业发行债务融资工具提供绿色通道"} -{"key": "BAC009S0915W0207", "wav": "./aishell/wav/test/S0915/BAC009S0915W0207.wav", "txt": "占非金融企业直接债务融资总额之比"} -{"key": "BAC009S0915W0208", "wav": "./aishell/wav/test/S0915/BAC009S0915W0208.wav", "txt": "有力地支持了中小企业的发展"} -{"key": "BAC009S0915W0209", "wav": "./aishell/wav/test/S0915/BAC009S0915W0209.wav", "txt": "积极指导支持和鼓励金融机构根据中小企业的特点"} -{"key": "BAC009S0915W0210", "wav": "./aishell/wav/test/S0915/BAC009S0915W0210.wav", "txt": "研发推出不同的金融创新产品和服务方式"} -{"key": "BAC009S0915W0211", "wav": "./aishell/wav/test/S0915/BAC009S0915W0211.wav", "txt": "吴显亭称将加强和证监会等相关部门的配合和协作"} -{"key": "BAC009S0915W0212", "wav": "./aishell/wav/test/S0915/BAC009S0915W0212.wav", "txt": "而针对浙江广东民间借贷丰沛的特点"} -{"key": "BAC009S0915W0213", "wav": "./aishell/wav/test/S0915/BAC009S0915W0213.wav", "txt": "一定程度上缓解了部分中小企业的融资困难"} -{"key": "BAC009S0915W0214", "wav": "./aishell/wav/test/S0915/BAC009S0915W0214.wav", "txt": "将在有效防范民间借贷的潜在风险的前提下"} -{"key": "BAC009S0915W0215", "wav": "./aishell/wav/test/S0915/BAC009S0915W0215.wav", "txt": "发挥好民间借贷在服务中小企业发展中的积极作用"} -{"key": "BAC009S0915W0216", "wav": "./aishell/wav/test/S0915/BAC009S0915W0216.wav", "txt": "要加强对民间借贷的合理引导"} -{"key": "BAC009S0915W0217", "wav": "./aishell/wav/test/S0915/BAC009S0915W0217.wav", "txt": "解决中小企业生产经营困难需靠多方面共同努力"} -{"key": "BAC009S0915W0218", "wav": "./aishell/wav/test/S0915/BAC009S0915W0218.wav", "txt": 
"听取对中央企业监督检查情况的汇报"} -{"key": "BAC009S0915W0219", "wav": "./aishell/wav/test/S0915/BAC009S0915W0219.wav", "txt": "中央企业要进一步深化改革"} -{"key": "BAC009S0915W0220", "wav": "./aishell/wav/test/S0915/BAC009S0915W0220.wav", "txt": "强化企业管理和风险管控"} -{"key": "BAC009S0915W0221", "wav": "./aishell/wav/test/S0915/BAC009S0915W0221.wav", "txt": "加强依法监管和制度建设"} -{"key": "BAC009S0915W0222", "wav": "./aishell/wav/test/S0915/BAC009S0915W0222.wav", "txt": "部分中央企业的结构调整还存在一些困难"} -{"key": "BAC009S0915W0223", "wav": "./aishell/wav/test/S0915/BAC009S0915W0223.wav", "txt": "资源环境面临较大压力有的企业管理水平不高"} -{"key": "BAC009S0915W0224", "wav": "./aishell/wav/test/S0915/BAC009S0915W0224.wav", "txt": "非主业投资存在不少经营风险"} -{"key": "BAC009S0915W0225", "wav": "./aishell/wav/test/S0915/BAC009S0915W0225.wav", "txt": "境外资产监管有待加强"} -{"key": "BAC009S0915W0226", "wav": "./aishell/wav/test/S0915/BAC009S0915W0226.wav", "txt": "中央企业实现营业总收入十六点八亿元"} -{"key": "BAC009S0915W0227", "wav": "./aishell/wav/test/S0915/BAC009S0915W0227.wav", "txt": "上交税金一万亿元"} -{"key": "BAC009S0915W0228", "wav": "./aishell/wav/test/S0915/BAC009S0915W0228.wav", "txt": "增长百分之三十净利润一千亿元"} -{"key": "BAC009S0915W0229", "wav": "./aishell/wav/test/S0915/BAC009S0915W0229.wav", "txt": "二零一一年一月至七月"} -{"key": "BAC009S0915W0230", "wav": "./aishell/wav/test/S0915/BAC009S0915W0230.wav", "txt": "实现营业总收入十一亿元"} -{"key": "BAC009S0915W0231", "wav": "./aishell/wav/test/S0915/BAC009S0915W0231.wav", "txt": "同比增加迅速上缴税金三亿元"} -{"key": "BAC009S0915W0232", "wav": "./aishell/wav/test/S0915/BAC009S0915W0232.wav", "txt": "增长非常迅速"} -{"key": "BAC009S0915W0233", "wav": "./aishell/wav/test/S0915/BAC009S0915W0233.wav", "txt": "进入世界五百强的企业增加"} -{"key": "BAC009S0915W0234", "wav": "./aishell/wav/test/S0915/BAC009S0915W0234.wav", "txt": "包括七座以下小客车及摩托车都被列入免费范范围"} -{"key": "BAC009S0915W0235", "wav": "./aishell/wav/test/S0915/BAC009S0915W0235.wav", "txt": "江苏省交通厅相关负责人昨日对记者表示"} -{"key": "BAC009S0915W0236", "wav": "./aishell/wav/test/S0915/BAC009S0915W0236.wav", "txt": "今年国庆小长假期间私家车主们就可以免费上路了"} -{"key": "BAC009S0915W0237", "wav": "./aishell/wav/test/S0915/BAC009S0915W0237.wav", "txt": "免费时段从节假日第一天开始"} -{"key": "BAC009S0915W0238", "wav": "./aishell/wav/test/S0915/BAC009S0915W0238.wav", "txt": "节假日最后一天结束"} -{"key": "BAC009S0915W0239", "wav": "./aishell/wav/test/S0915/BAC009S0915W0239.wav", "txt": "普通公路以车辆通过收费站收费车道的时间为准"} -{"key": "BAC009S0915W0240", "wav": "./aishell/wav/test/S0915/BAC009S0915W0240.wav", "txt": "高速公路以车辆驶离出口收费车车道的时间为准"} -{"key": "BAC009S0915W0241", "wav": "./aishell/wav/test/S0915/BAC009S0915W0241.wav", "txt": "允许在普通收费公路行驶的摩托车"} -{"key": "BAC009S0915W0242", "wav": "./aishell/wav/test/S0915/BAC009S0915W0242.wav", "txt": "各地机场高速公路是否实行免费通行"} -{"key": "BAC009S0915W0243", "wav": "./aishell/wav/test/S0915/BAC009S0915W0243.wav", "txt": "由各省区市人民政府决定"} -{"key": "BAC009S0915W0244", "wav": "./aishell/wav/test/S0915/BAC009S0915W0244.wav", "txt": "各地机场高速公路是否实行免费通行"} -{"key": "BAC009S0915W0245", "wav": "./aishell/wav/test/S0915/BAC009S0915W0245.wav", "txt": "由各省区市人民政府决定"} -{"key": "BAC009S0915W0246", "wav": "./aishell/wav/test/S0915/BAC009S0915W0246.wav", "txt": "比如南京机场高速一到节假日"} -{"key": "BAC009S0915W0247", "wav": "./aishell/wav/test/S0915/BAC009S0915W0247.wav", "txt": "是南京往南的重要通道"} -{"key": "BAC009S0915W0248", "wav": "./aishell/wav/test/S0915/BAC009S0915W0248.wav", "txt": "对于江苏的机场高速是否免费"} -{"key": "BAC009S0915W0249", "wav": "./aishell/wav/test/S0915/BAC009S0915W0249.wav", "txt": "省交通部门称目前未定"} -{"key": "BAC009S0915W0250", "wav": "./aishell/wav/test/S0915/BAC009S0915W0250.wav", "txt": "但有关负责人认为我想"} -{"key": 
"BAC009S0915W0251", "wav": "./aishell/wav/test/S0915/BAC009S0915W0251.wav", "txt": "机场高速最大可能还是免费"} -{"key": "BAC009S0915W0252", "wav": "./aishell/wav/test/S0915/BAC009S0915W0252.wav", "txt": "另一个让南京市民特别关心的是"} -{"key": "BAC009S0915W0253", "wav": "./aishell/wav/test/S0915/BAC009S0915W0253.wav", "txt": "或许到二零一六年的时候"} -{"key": "BAC009S0915W0254", "wav": "./aishell/wav/test/S0915/BAC009S0915W0254.wav", "txt": "零售店就再也卖不出去一个实体钱包了"} -{"key": "BAC009S0915W0255", "wav": "./aishell/wav/test/S0915/BAC009S0915W0255.wav", "txt": "催生着移动支付技术的大跃进"} -{"key": "BAC009S0915W0256", "wav": "./aishell/wav/test/S0915/BAC009S0915W0256.wav", "txt": "最关键的两个属性莫过于安全和便捷"} -{"key": "BAC009S0915W0257", "wav": "./aishell/wav/test/S0915/BAC009S0915W0257.wav", "txt": "而且它们两个之间是非常对立的关系"} -{"key": "BAC009S0915W0258", "wav": "./aishell/wav/test/S0915/BAC009S0915W0258.wav", "txt": "安全性的提升需要牺牲一定的便携性"} -{"key": "BAC009S0915W0259", "wav": "./aishell/wav/test/S0915/BAC009S0915W0259.wav", "txt": "究竟哪个特特性更加重要"} -{"key": "BAC009S0915W0260", "wav": "./aishell/wav/test/S0915/BAC009S0915W0260.wav", "txt": "这也影响着移动支付市场的总体进程和发展方向"} -{"key": "BAC009S0915W0261", "wav": "./aishell/wav/test/S0915/BAC009S0915W0261.wav", "txt": "或许是受好莱坞艳照门的影响"} -{"key": "BAC009S0915W0263", "wav": "./aishell/wav/test/S0915/BAC009S0915W0263.wav", "txt": "重点强调了其安全性"} -{"key": "BAC009S0915W0264", "wav": "./aishell/wav/test/S0915/BAC009S0915W0264.wav", "txt": "最基本的逻辑就是我们不读取信息"} -{"key": "BAC009S0915W0265", "wav": "./aishell/wav/test/S0915/BAC009S0915W0265.wav", "txt": "牛师傅总说自己的面没有添加任何防腐剂"} -{"key": "BAC009S0915W0266", "wav": "./aishell/wav/test/S0915/BAC009S0915W0266.wav", "txt": "任何的电子行为都不免会留下痕迹"} -{"key": "BAC009S0915W0267", "wav": "./aishell/wav/test/S0915/BAC009S0915W0267.wav", "txt": "移动支付又会产生非常敏感的操作信息"} -{"key": "BAC009S0915W0268", "wav": "./aishell/wav/test/S0915/BAC009S0915W0268.wav", "txt": "蕴含着巨大商业价值"} -{"key": "BAC009S0915W0269", "wav": "./aishell/wav/test/S0915/BAC009S0915W0269.wav", "txt": "有哪家支付机构愿意心无旁续地放弃这些金子呢"} -{"key": "BAC009S0915W0270", "wav": "./aishell/wav/test/S0915/BAC009S0915W0270.wav", "txt": "安全真的是移动支付的第一属性吗"} -{"key": "BAC009S0915W0271", "wav": "./aishell/wav/test/S0915/BAC009S0915W0271.wav", "txt": "消费者对便捷性的要求可能会更高"} -{"key": "BAC009S0915W0272", "wav": "./aishell/wav/test/S0915/BAC009S0915W0272.wav", "txt": "按照国内消费者的习惯"} -{"key": "BAC009S0915W0273", "wav": "./aishell/wav/test/S0915/BAC009S0915W0273.wav", "txt": "他们通常会单独办一张银行卡来绑定移动支付系统"} -{"key": "BAC009S0915W0274", "wav": "./aishell/wav/test/S0915/BAC009S0915W0274.wav", "txt": "而不是拿着主卡到处刷"} -{"key": "BAC009S0915W0275", "wav": "./aishell/wav/test/S0915/BAC009S0915W0275.wav", "txt": "移动支付可调用的只能是消费者的小额度的钱财"} -{"key": "BAC009S0915W0276", "wav": "./aishell/wav/test/S0915/BAC009S0915W0276.wav", "txt": "一般不会给消费者带来巨大损失"} -{"key": "BAC009S0915W0277", "wav": "./aishell/wav/test/S0915/BAC009S0915W0277.wav", "txt": "消费者会在特定情况下牺牲安全性来提升支付的便捷性"} -{"key": "BAC009S0915W0278", "wav": "./aishell/wav/test/S0915/BAC009S0915W0278.wav", "txt": "她们宁愿可花五个小时讨论是否买一条裙子"} -{"key": "BAC009S0915W0279", "wav": "./aishell/wav/test/S0915/BAC009S0915W0279.wav", "txt": "也不愿意花五分钟重新输一定密码"} -{"key": "BAC009S0915W0281", "wav": "./aishell/wav/test/S0915/BAC009S0915W0281.wav", "txt": "大概十年前就有了这样的说法"} -{"key": "BAC009S0915W0283", "wav": "./aishell/wav/test/S0915/BAC009S0915W0283.wav", "txt": "也正是看中了中国消费者的消费潜力"} -{"key": "BAC009S0915W0284", "wav": "./aishell/wav/test/S0915/BAC009S0915W0284.wav", "txt": "华尔街才对阿里巴巴情有独钟"} -{"key": "BAC009S0915W0286", "wav": "./aishell/wav/test/S0915/BAC009S0915W0286.wav", "txt": 
"就让业界讨论它会带给中国移动支付市场怎样的影响"} -{"key": "BAC009S0915W0287", "wav": "./aishell/wav/test/S0915/BAC009S0915W0287.wav", "txt": "苹果要想在中国本土化"} -{"key": "BAC009S0915W0288", "wav": "./aishell/wav/test/S0915/BAC009S0915W0288.wav", "txt": "最大难点在于如何改变国内的消费习惯"} -{"key": "BAC009S0915W0289", "wav": "./aishell/wav/test/S0915/BAC009S0915W0289.wav", "txt": "如何说服四大银行一起与之愉快合作"} -{"key": "BAC009S0915W0290", "wav": "./aishell/wav/test/S0915/BAC009S0915W0290.wav", "txt": "如何重修与中国政府的良好关系"} -{"key": "BAC009S0915W0291", "wav": "./aishell/wav/test/S0915/BAC009S0915W0291.wav", "txt": "这对于苹果来说不是件容易的事儿"} -{"key": "BAC009S0915W0292", "wav": "./aishell/wav/test/S0915/BAC009S0915W0292.wav", "txt": "现在是不是也该长点心了吧"} -{"key": "BAC009S0915W0293", "wav": "./aishell/wav/test/S0915/BAC009S0915W0293.wav", "txt": "国内移动支付需主要有两股力量"} -{"key": "BAC009S0915W0295", "wav": "./aishell/wav/test/S0915/BAC009S0915W0295.wav", "txt": "前者有长时间的沉淀"} -{"key": "BAC009S0915W0296", "wav": "./aishell/wav/test/S0915/BAC009S0915W0296.wav", "txt": "银行们对此已深耕多年"} -{"key": "BAC009S0915W0297", "wav": "./aishell/wav/test/S0915/BAC009S0915W0297.wav", "txt": "而后者则是刚刚涌现的后起之秀"} -{"key": "BAC009S0915W0298", "wav": "./aishell/wav/test/S0915/BAC009S0915W0298.wav", "txt": "二零一四年春天打车软件补贴大战"} -{"key": "BAC009S0915W0299", "wav": "./aishell/wav/test/S0915/BAC009S0915W0299.wav", "txt": "两股力量基本上都有一统天下的野心"} -{"key": "BAC009S0915W0300", "wav": "./aishell/wav/test/S0915/BAC009S0915W0300.wav", "txt": "这三个优势能在短时间内颠复美国移动支付市场的格局"} -{"key": "BAC009S0915W0301", "wav": "./aishell/wav/test/S0915/BAC009S0915W0301.wav", "txt": "最终促使苹果成为主流标准但中国市场有其特殊性"} -{"key": "BAC009S0915W0302", "wav": "./aishell/wav/test/S0915/BAC009S0915W0302.wav", "txt": "首先银联和苹果的合作谈判不会顺利"} -{"key": "BAC009S0915W0303", "wav": "./aishell/wav/test/S0915/BAC009S0915W0303.wav", "txt": "今后所有空调产品还将实现联机运行"} -{"key": "BAC009S0915W0304", "wav": "./aishell/wav/test/S0915/BAC009S0915W0304.wav", "txt": "这台设备就不会开机运转"} -{"key": "BAC009S0915W0305", "wav": "./aishell/wav/test/S0915/BAC009S0915W0305.wav", "txt": "这个在美的空调的南沙工厂武汉工厂已全面试点"} -{"key": "BAC009S0915W0306", "wav": "./aishell/wav/test/S0915/BAC009S0915W0306.wav", "txt": "自动化制造是未来唯一的出路"} -{"key": "BAC009S0915W0307", "wav": "./aishell/wav/test/S0915/BAC009S0915W0307.wav", "txt": "未来的制造业方向要实现无人化"} -{"key": "BAC009S0915W0308", "wav": "./aishell/wav/test/S0915/BAC009S0915W0308.wav", "txt": "美的计划在二零一八年"} -{"key": "BAC009S0915W0309", "wav": "./aishell/wav/test/S0915/BAC009S0915W0309.wav", "txt": "将家用空调事业部员工工人数缩减至两万人"} -{"key": "BAC009S0915W0310", "wav": "./aishell/wav/test/S0915/BAC009S0915W0310.wav", "txt": "除了四轴或三轴机器人外"} -{"key": "BAC009S0915W0311", "wav": "./aishell/wav/test/S0915/BAC009S0915W0311.wav", "txt": "今年还将新增二百台"} -{"key": "BAC009S0915W0312", "wav": "./aishell/wav/test/S0915/BAC009S0915W0312.wav", "txt": "机器人维护成本是挑战"} -{"key": "BAC009S0915W0313", "wav": "./aishell/wav/test/S0915/BAC009S0915W0313.wav", "txt": "广东东莞顺德等城市已经掀起大量机器换人计划"} -{"key": "BAC009S0915W0314", "wav": "./aishell/wav/test/S0915/BAC009S0915W0314.wav", "txt": "家电企业机器人智造也正在加速进行"} -{"key": "BAC009S0915W0315", "wav": "./aishell/wav/test/S0915/BAC009S0915W0315.wav", "txt": "从美的海尔使用机器人操作来看来"} -{"key": "BAC009S0915W0316", "wav": "./aishell/wav/test/S0915/BAC009S0915W0316.wav", "txt": "机器换人确实能够大大降低企业的用工数量"} -{"key": "BAC009S0915W0317", "wav": "./aishell/wav/test/S0915/BAC009S0915W0317.wav", "txt": "实现自动化升级也没那么简单"} -{"key": "BAC009S0915W0318", "wav": "./aishell/wav/test/S0915/BAC009S0915W0318.wav", "txt": "美的集团对项目在一定年限内有投入产出的规定"} -{"key": "BAC009S0915W0319", "wav": "./aishell/wav/test/S0915/BAC009S0915W0319.wav", 
"txt": "这对我们来说是个很大的挑战"} -{"key": "BAC009S0915W0320", "wav": "./aishell/wav/test/S0915/BAC009S0915W0320.wav", "txt": "同时也卡住了自动化的投入"} -{"key": "BAC009S0915W0321", "wav": "./aishell/wav/test/S0915/BAC009S0915W0321.wav", "txt": "一定年限内的投入产出"} -{"key": "BAC009S0915W0322", "wav": "./aishell/wav/test/S0915/BAC009S0915W0322.wav", "txt": "我们必须要有衡量标准"} -{"key": "BAC009S0915W0323", "wav": "./aishell/wav/test/S0915/BAC009S0915W0323.wav", "txt": "美的不能因自动化生产增加制造成本而让用户买单"} -{"key": "BAC009S0915W0324", "wav": "./aishell/wav/test/S0915/BAC009S0915W0324.wav", "txt": "美的空调进行自动化升级"} -{"key": "BAC009S0915W0325", "wav": "./aishell/wav/test/S0915/BAC009S0915W0325.wav", "txt": "一定是为了降低制造成本"} -{"key": "BAC009S0915W0326", "wav": "./aishell/wav/test/S0915/BAC009S0915W0326.wav", "txt": "比如降低人工费用运作费用等"} -{"key": "BAC009S0915W0327", "wav": "./aishell/wav/test/S0915/BAC009S0915W0327.wav", "txt": "机器人后期维护运行成本及技术也是一个高门槛"} -{"key": "BAC009S0915W0328", "wav": "./aishell/wav/test/S0915/BAC009S0915W0328.wav", "txt": "因为机器人生产商派遣技术人员不可能长期驻起驻点企业"} -{"key": "BAC009S0915W0329", "wav": "./aishell/wav/test/S0915/BAC009S0915W0329.wav", "txt": "高工机器人董事长张小飞表示"} -{"key": "BAC009S0915W0330", "wav": "./aishell/wav/test/S0915/BAC009S0915W0330.wav", "txt": "家电企业自动化升级改造必须进行"} -{"key": "BAC009S0915W0331", "wav": "./aishell/wav/test/S0915/BAC009S0915W0331.wav", "txt": "但伴随一定的投资风险"} -{"key": "BAC009S0915W0332", "wav": "./aishell/wav/test/S0915/BAC009S0915W0332.wav", "txt": "除了后期技术维护能力外"} -{"key": "BAC009S0915W0333", "wav": "./aishell/wav/test/S0915/BAC009S0915W0333.wav", "txt": "对于国内家电企业而言"} -{"key": "BAC009S0915W0334", "wav": "./aishell/wav/test/S0915/BAC009S0915W0334.wav", "txt": "自动化生产线的柔性改造也是其面临的一大难题"} -{"key": "BAC009S0915W0335", "wav": "./aishell/wav/test/S0915/BAC009S0915W0335.wav", "txt": "空调产品越来越追求个性化"} -{"key": "BAC009S0915W0336", "wav": "./aishell/wav/test/S0915/BAC009S0915W0336.wav", "txt": "这需要通过机器人的柔性改变来对此进行处理"} -{"key": "BAC009S0915W0337", "wav": "./aishell/wav/test/S0915/BAC009S0915W0337.wav", "txt": "家电企业要建立数字化工厂才能真正提升生产效率"} -{"key": "BAC009S0915W0338", "wav": "./aishell/wav/test/S0915/BAC009S0915W0338.wav", "txt": "她的表现也更加全面"} -{"key": "BAC009S0915W0339", "wav": "./aishell/wav/test/S0915/BAC009S0915W0339.wav", "txt": "对阵俄罗斯的比赛中"} -{"key": "BAC009S0915W0340", "wav": "./aishell/wav/test/S0915/BAC009S0915W0340.wav", "txt": "在张常宁一度进行进攻受阻"} -{"key": "BAC009S0915W0341", "wav": "./aishell/wav/test/S0915/BAC009S0915W0341.wav", "txt": "刘晓彤替补上场打得缩手缩脚的情况下"} -{"key": "BAC009S0915W0342", "wav": "./aishell/wav/test/S0915/BAC009S0915W0342.wav", "txt": "不断地为中国女排得分"} -{"key": "BAC009S0915W0343", "wav": "./aishell/wav/test/S0915/BAC009S0915W0343.wav", "txt": "只要中国队需要有人挺身而出"} -{"key": "BAC009S0915W0344", "wav": "./aishell/wav/test/S0915/BAC009S0915W0344.wav", "txt": "朱婷在中韩之战中一度受伤"} -{"key": "BAC009S0915W0345", "wav": "./aishell/wav/test/S0915/BAC009S0915W0345.wav", "txt": "但她在中国队遇到困难的时候坚持带伤上阵"} -{"key": "BAC009S0915W0346", "wav": "./aishell/wav/test/S0915/BAC009S0915W0346.wav", "txt": "最终掠队拿下了比赛"} -{"key": "BAC009S0915W0347", "wav": "./aishell/wav/test/S0915/BAC009S0915W0347.wav", "txt": "在队长惠若琪因为身体原因无缘世界杯的情况下"} -{"key": "BAC009S0915W0348", "wav": "./aishell/wav/test/S0915/BAC009S0915W0348.wav", "txt": "朱婷就是中国女排的核心"} -{"key": "BAC009S0915W0349", "wav": "./aishell/wav/test/S0915/BAC009S0915W0349.wav", "txt": "朱婷再度扮演了场上头脑的角色"} -{"key": "BAC009S0915W0350", "wav": "./aishell/wav/test/S0915/BAC009S0915W0350.wav", "txt": "队员们也对于她在技术上和心理上都颇为依赖和信服"} -{"key": "BAC009S0915W0351", "wav": "./aishell/wav/test/S0915/BAC009S0915W0351.wav", "txt": 
"朱婷扣球拿下一百一十三分"} -{"key": "BAC009S0915W0352", "wav": "./aishell/wav/test/S0915/BAC009S0915W0352.wav", "txt": "总共贡献了一百四十一分"} -{"key": "BAC009S0915W0353", "wav": "./aishell/wav/test/S0915/BAC009S0915W0353.wav", "txt": "反超张常宁成为中国队的得分王"} -{"key": "BAC009S0915W0354", "wav": "./aishell/wav/test/S0915/BAC009S0915W0354.wav", "txt": "让朱婷最佳球员的身份和价值再度彰显"} -{"key": "BAC009S0915W0355", "wav": "./aishell/wav/test/S0915/BAC009S0915W0355.wav", "txt": "尚不足十八岁的她身高为一米八六"} -{"key": "BAC009S0915W0356", "wav": "./aishell/wav/test/S0915/BAC009S0915W0356.wav", "txt": "徐建德统领的中国青年队八战全胜夺得冠军"} -{"key": "BAC009S0915W0357", "wav": "./aishell/wav/test/S0915/BAC009S0915W0357.wav", "txt": "作为主力主攻的朱婷"} -{"key": "BAC009S0915W0358", "wav": "./aishell/wav/test/S0915/BAC009S0915W0358.wav", "txt": "从而被授予最有价值球员荣誉"} -{"key": "BAC009S0915W0360", "wav": "./aishell/wav/test/S0915/BAC009S0915W0360.wav", "txt": "当时身披八号战袍的她身高达到了一米九五公分"} -{"key": "BAC009S0915W0361", "wav": "./aishell/wav/test/S0915/BAC009S0915W0361.wav", "txt": "朱婷斩获了一六七分"} -{"key": "BAC009S0915W0362", "wav": "./aishell/wav/test/S0915/BAC009S0915W0362.wav", "txt": "与多米尼加的马丁内斯一起摘得最佳得分奖"} -{"key": "BAC009S0915W0363", "wav": "./aishell/wav/test/S0915/BAC009S0915W0363.wav", "txt": "随后还以百分之五十三点五六的得分率拿到了最佳进攻的大奖"} -{"key": "BAC009S0915W0364", "wav": "./aishell/wav/test/S0915/BAC009S0915W0364.wav", "txt": "朱婷荣膺最有价值球员"} -{"key": "BAC009S0915W0365", "wav": "./aishell/wav/test/S0915/BAC009S0915W0365.wav", "txt": "还与巴西队的加比一起入选最佳主攻"} -{"key": "BAC009S0915W0366", "wav": "./aishell/wav/test/S0915/BAC009S0915W0366.wav", "txt": "当年的整个世青赛上"} -{"key": "BAC009S0915W0367", "wav": "./aishell/wav/test/S0915/BAC009S0915W0367.wav", "txt": "中国队虽然如愿夺冠"} -{"key": "BAC009S0915W0368", "wav": "./aishell/wav/test/S0915/BAC009S0915W0368.wav", "txt": "朱婷却是唯一的硕果"} -{"key": "BAC009S0915W0369", "wav": "./aishell/wav/test/S0915/BAC009S0915W0369.wav", "txt": "去年六月下旬举行的中国国际精英赛北仑站"} -{"key": "BAC009S0915W0371", "wav": "./aishell/wav/test/S0915/BAC009S0915W0371.wav", "txt": "当时郎平率队三战全胜名列第一"} -{"key": "BAC009S0915W0372", "wav": "./aishell/wav/test/S0915/BAC009S0915W0372.wav", "txt": "朱婷两场比赛担任首发"} -{"key": "BAC009S0915W0374", "wav": "./aishell/wav/test/S0915/BAC009S0915W0374.wav", "txt": "而在今年的香港站上"} -{"key": "BAC009S0915W0375", "wav": "./aishell/wav/test/S0915/BAC009S0915W0375.wav", "txt": "中国队三比二力克美国队收获分站赛九连胜"} -{"key": "BAC009S0915W0376", "wav": "./aishell/wav/test/S0915/BAC009S0915W0376.wav", "txt": "赛后主攻朱婷获最有价值球员和最受欢迎球员"} -{"key": "BAC009S0915W0377", "wav": "./aishell/wav/test/S0915/BAC009S0915W0377.wav", "txt": "主教练郎平获得最佳教练"} -{"key": "BAC009S0915W0378", "wav": "./aishell/wav/test/S0915/BAC009S0915W0378.wav", "txt": "三场比赛朱婷均有出色表现"} -{"key": "BAC009S0915W0379", "wav": "./aishell/wav/test/S0915/BAC009S0915W0379.wav", "txt": "朱婷共计拿下二十四分"} -{"key": "BAC009S0915W0380", "wav": "./aishell/wav/test/S0915/BAC009S0915W0380.wav", "txt": "第二场对阵日本也拿下全队第二高的十二分"} -{"key": "BAC009S0915W0381", "wav": "./aishell/wav/test/S0915/BAC009S0915W0381.wav", "txt": "获得二十三分荣誉全场得分王"} -{"key": "BAC009S0915W0382", "wav": "./aishell/wav/test/S0915/BAC009S0915W0382.wav", "txt": "在分站赛总得分榜上"} -{"key": "BAC009S0915W0383", "wav": "./aishell/wav/test/S0915/BAC009S0915W0383.wav", "txt": "朱婷以一百五十七分领先群芳"} -{"key": "BAC009S0915W0384", "wav": "./aishell/wav/test/S0915/BAC009S0915W0384.wav", "txt": "其中扣球拿到一百三十二分"} -{"key": "BAC009S0915W0385", "wav": "./aishell/wav/test/S0915/BAC009S0915W0385.wav", "txt": "扣球成功率五十四点百分之十高居榜首"} -{"key": "BAC009S0915W0386", "wav": "./aishell/wav/test/S0915/BAC009S0915W0386.wav", "txt": "人们首先会想到她的高度"} -{"key": 
"BAC009S0915W0387", "wav": "./aishell/wav/test/S0915/BAC009S0915W0387.wav", "txt": "其一米九五的身高三米二七的扣球高度"} -{"key": "BAC009S0915W0388", "wav": "./aishell/wav/test/S0915/BAC009S0915W0388.wav", "txt": "在比赛中确实非常有利"} -{"key": "BAC009S0915W0389", "wav": "./aishell/wav/test/S0915/BAC009S0915W0389.wav", "txt": "朱婷进攻相对比较简单"} -{"key": "BAC009S0915W0390", "wav": "./aishell/wav/test/S0915/BAC009S0915W0390.wav", "txt": "主要是四号位的高点强攻和六号位的后排进攻"} -{"key": "BAC009S0915W0391", "wav": "./aishell/wav/test/S0915/BAC009S0915W0391.wav", "txt": "四号位进攻以大斜线为主"} -{"key": "BAC009S0915W0392", "wav": "./aishell/wav/test/S0915/BAC009S0915W0392.wav", "txt": "她进攻的变化逐渐多了起来"} -{"key": "BAC009S0915W0393", "wav": "./aishell/wav/test/S0915/BAC009S0915W0393.wav", "txt": "首先是增加了二号位的进攻"} -{"key": "BAC009S0915W0394", "wav": "./aishell/wav/test/S0915/BAC009S0915W0394.wav", "txt": "即当自己轮转到前排二号位时"} -{"key": "BAC009S0915W0395", "wav": "./aishell/wav/test/S0915/BAC009S0915W0395.wav", "txt": "临时客串接应在二号位参与强攻"} -{"key": "BAC009S0915W0396", "wav": "./aishell/wav/test/S0915/BAC009S0915W0396.wav", "txt": "这样既丰富了自己也增加了全队的进攻变化"} -{"key": "BAC009S0915W0397", "wav": "./aishell/wav/test/S0915/BAC009S0915W0397.wav", "txt": "再就是四号位的进攻除了斜线"} -{"key": "BAC009S0915W0398", "wav": "./aishell/wav/test/S0915/BAC009S0915W0398.wav", "txt": "还增加了直线直线和斜线之间的所谓二直线"} -{"key": "BAC009S0915W0399", "wav": "./aishell/wav/test/S0915/BAC009S0915W0399.wav", "txt": "不时还施以非常巧妙的吊球"} -{"key": "BAC009S0915W0400", "wav": "./aishell/wav/test/S0915/BAC009S0915W0400.wav", "txt": "视频中国三一大胜俄罗斯独占女排世界杯榜首"} -{"key": "BAC009S0915W0401", "wav": "./aishell/wav/test/S0915/BAC009S0915W0401.wav", "txt": "日本二零一五女排世界杯单循环赛战至第十轮"} -{"key": "BAC009S0915W0402", "wav": "./aishell/wav/test/S0915/BAC009S0915W0402.wav", "txt": "由郎平挂帅的中国女排在名古屋赛区"} -{"key": "BAC009S0915W0403", "wav": "./aishell/wav/test/S0915/BAC009S0915W0403.wav", "txt": "提升战绩为九胜一负反超至榜首位置"} -{"key": "BAC009S0915W0404", "wav": "./aishell/wav/test/S0915/BAC009S0915W0404.wav", "txt": "上周在北美电影市场上遭遇票房惨剧"} -{"key": "BAC009S0915W0405", "wav": "./aishell/wav/test/S0915/BAC009S0915W0405.wav", "txt": "只以六百四十八万美元的进账排名第八"} -{"key": "BAC009S0915W0406", "wav": "./aishell/wav/test/S0915/BAC009S0915W0406.wav", "txt": "这部电影的失败并没有影响囧瑟夫的心情"} -{"key": "BAC009S0915W0407", "wav": "./aishell/wav/test/S0915/BAC009S0915W0407.wav", "txt": "将自导自演一部名为睡魔的科幻大片"} -{"key": "BAC009S0915W0408", "wav": "./aishell/wav/test/S0915/BAC009S0915W0408.wav", "txt": "让体重维持在四十五公斤左右"} -{"key": "BAC009S0915W0409", "wav": "./aishell/wav/test/S0915/BAC009S0915W0409.wav", "txt": "但网友的吐槽却一直没有停息"} -{"key": "BAC009S0915W0410", "wav": "./aishell/wav/test/S0915/BAC009S0915W0410.wav", "txt": "她在台湾出席活动"} -{"key": "BAC009S0915W0411", "wav": "./aishell/wav/test/S0915/BAC009S0915W0411.wav", "txt": "坦言刚开拍的一个月中"} -{"key": "BAC009S0915W0412", "wav": "./aishell/wav/test/S0915/BAC009S0915W0412.wav", "txt": "心情低落到崩溃大哭"} -{"key": "BAC009S0915W0413", "wav": "./aishell/wav/test/S0915/BAC009S0915W0413.wav", "txt": "甚至出现忧郁症状况"} -{"key": "BAC009S0915W0414", "wav": "./aishell/wav/test/S0915/BAC009S0915W0414.wav", "txt": "搜狐娱乐讯陈妍希传出和陈晓的恋情之后"} -{"key": "BAC009S0915W0415", "wav": "./aishell/wav/test/S0915/BAC009S0915W0415.wav", "txt": "二人一直鲜少回应"} -{"key": "BAC009S0915W0416", "wav": "./aishell/wav/test/S0915/BAC009S0915W0416.wav", "txt": "陈妍希回到台北代言悠游卡"} -{"key": "BAC009S0915W0417", "wav": "./aishell/wav/test/S0915/BAC009S0915W0417.wav", "txt": "外传她可能已经怀孕"} -{"key": "BAC009S0915W0418", "wav": "./aishell/wav/test/S0915/BAC009S0915W0418.wav", "txt": "但陈妍希在出席活动时"} -{"key": "BAC009S0915W0419", "wav": 
"./aishell/wav/test/S0915/BAC009S0915W0419.wav", "txt": "穿高跟鞋快步走"} -{"key": "BAC009S0915W0420", "wav": "./aishell/wav/test/S0915/BAC009S0915W0420.wav", "txt": "似乎也让传言不攻自破"} -{"key": "BAC009S0915W0421", "wav": "./aishell/wav/test/S0915/BAC009S0915W0421.wav", "txt": "贵州都市报十月二十九日报道据台湾媒体报道艺人陈妍希认爱小四岁的大陆小生陈晓"} -{"key": "BAC009S0915W0423", "wav": "./aishell/wav/test/S0915/BAC009S0915W0423.wav", "txt": "两人因合作神雕侠侣擦出爱火"} -{"key": "BAC009S0915W0424", "wav": "./aishell/wav/test/S0915/BAC009S0915W0424.wav", "txt": "恋情发展备受关注"} -{"key": "BAC009S0915W0425", "wav": "./aishell/wav/test/S0915/BAC009S0915W0425.wav", "txt": "更在日前爆出交往七个月准备闪婚"} -{"key": "BAC009S0915W0426", "wav": "./aishell/wav/test/S0915/BAC009S0915W0426.wav", "txt": "连男方在法国包游艇求婚的照片都被网友扒出"} -{"key": "BAC009S0915W0427", "wav": "./aishell/wav/test/S0915/BAC009S0915W0427.wav", "txt": "她坦承当时很惊喜很感动"} -{"key": "BAC009S0915W0428", "wav": "./aishell/wav/test/S0915/BAC009S0915W0428.wav", "txt": "男友受访时也首度大方松口确实已经进入求婚阶段"} -{"key": "BAC009S0915W0429", "wav": "./aishell/wav/test/S0915/BAC009S0915W0429.wav", "txt": "让粉丝听了又惊又喜"} -{"key": "BAC009S0915W0430", "wav": "./aishell/wav/test/S0915/BAC009S0915W0430.wav", "txt": "搜狐娱乐讯据台湾媒体报道"} -{"key": "BAC009S0915W0431", "wav": "./aishell/wav/test/S0915/BAC009S0915W0431.wav", "txt": "记者调查湖南张家界国家森林公园低价团问题"} -{"key": "BAC009S0915W0432", "wav": "./aishell/wav/test/S0915/BAC009S0915W0432.wav", "txt": "四零零元左右的低价两日游在当地非常普遍"} -{"key": "BAC009S0915W0433", "wav": "./aishell/wav/test/S0915/BAC009S0915W0433.wav", "txt": "这种低价游自称费用全包"} -{"key": "BAC009S0915W0434", "wav": "./aishell/wav/test/S0915/BAC009S0915W0434.wav", "txt": "原本自费项目变成必须交费项目"} -{"key": "BAC009S0915W0435", "wav": "./aishell/wav/test/S0915/BAC009S0915W0435.wav", "txt": "导游还诱骗游客加钱走特殊路线"} -{"key": "BAC009S0915W0436", "wav": "./aishell/wav/test/S0915/BAC009S0915W0436.wav", "txt": "面对游客质疑和退团要求"} -{"key": "BAC009S0915W0437", "wav": "./aishell/wav/test/S0915/BAC009S0915W0437.wav", "txt": "导游放言此树是我栽"} -{"key": "BAC009S0915W0438", "wav": "./aishell/wav/test/S0915/BAC009S0915W0438.wav", "txt": "你不可能一分钱不花"} -{"key": "BAC009S0915W0439", "wav": "./aishell/wav/test/S0915/BAC009S0915W0439.wav", "txt": "游客赴港游买瑞士表半个月停摆旅行社久拖不管"} -{"key": "BAC009S0915W0440", "wav": "./aishell/wav/test/S0915/BAC009S0915W0440.wav", "txt": "市民刘先生和江西环球国际旅行社的沟通协商再次失败"} -{"key": "BAC009S0915W0441", "wav": "./aishell/wav/test/S0915/BAC009S0915W0441.wav", "txt": "双方矛盾的焦点是一只瑞士名表"} -{"key": "BAC009S0915W0442", "wav": "./aishell/wav/test/S0915/BAC009S0915W0442.wav", "txt": "游客踩敦煌千年古城遗址拍照反问踩了会掉吗"} -{"key": "BAC009S0915W0443", "wav": "./aishell/wav/test/S0915/BAC009S0915W0443.wav", "txt": "现场图一零月五日下午"} -{"key": "BAC009S0915W0444", "wav": "./aishell/wav/test/S0915/BAC009S0915W0444.wav", "txt": "在甘肃敦煌大方盘城遗址"} -{"key": "BAC009S0915W0445", "wav": "./aishell/wav/test/S0915/BAC009S0915W0445.wav", "txt": "几位游客轮流翻越护栏"} -{"key": "BAC009S0915W0446", "wav": "./aishell/wav/test/S0915/BAC009S0915W0446.wav", "txt": "一位游客在拍照中说人家几千年都没有掉下来"} -{"key": "BAC009S0915W0447", "wav": "./aishell/wav/test/S0915/BAC009S0915W0447.wav", "txt": "踩一下就掉下来了"} -{"key": "BAC009S0915W0448", "wav": "./aishell/wav/test/S0915/BAC009S0915W0448.wav", "txt": "澎湃新闻在现场看到"} -{"key": "BAC009S0915W0449", "wav": "./aishell/wav/test/S0915/BAC009S0915W0449.wav", "txt": "遗址附近有多处警示牌写明严禁跨入保护区"} -{"key": "BAC009S0915W0450", "wav": "./aishell/wav/test/S0915/BAC009S0915W0450.wav", "txt": "游客进店未购物被导游嘲讽官方正在立案处理"} -{"key": "BAC009S0915W0451", "wav": "./aishell/wav/test/S0915/BAC009S0915W0451.wav", "txt": "游客铜缸刻字秀恩爱故宫已报警"} -{"key": "BAC009S0915W0452", "wav": 
"./aishell/wav/test/S0915/BAC009S0915W0452.wav", "txt": "法制晚报讯记者李洁今天傍晚"} -{"key": "BAC009S0915W0453", "wav": "./aishell/wav/test/S0915/BAC009S0915W0453.wav", "txt": "严厉谴责这一不文明应为"} -{"key": "BAC009S0915W0454", "wav": "./aishell/wav/test/S0915/BAC009S0915W0454.wav", "txt": "并称故宫博物院已就此事件向公安机关报案"} -{"key": "BAC009S0915W0455", "wav": "./aishell/wav/test/S0915/BAC009S0915W0455.wav", "txt": "游客青岛遭遇天价虾当地人最多几十元一斤"} -{"key": "BAC009S0915W0456", "wav": "./aishell/wav/test/S0915/BAC009S0915W0456.wav", "txt": "肖先生在上菜后高兴地拍下图片"} -{"key": "BAC009S0915W0457", "wav": "./aishell/wav/test/S0915/BAC009S0915W0457.wav", "txt": "当时他还不知道自己会被暗算"} -{"key": "BAC009S0915W0458", "wav": "./aishell/wav/test/S0915/BAC009S0915W0458.wav", "txt": "游客骑着明孝陵驮碑龟趺拍照市民大煞风景"} -{"key": "BAC009S0915W0459", "wav": "./aishell/wav/test/S0915/BAC009S0915W0459.wav", "txt": "游客骑在龟趺身上报料人供图"} -{"key": "BAC009S0915W0460", "wav": "./aishell/wav/test/S0915/BAC009S0915W0460.wav", "txt": "游戏主播花样作死声称天津是他炸的直播被抓游戏室老板因冲突开枪将人射伤致死"} -{"key": "BAC009S0915W0461", "wav": "./aishell/wav/test/S0915/BAC009S0915W0461.wav", "txt": "一五年后落网"} -{"key": "BAC009S0915W0462", "wav": "./aishell/wav/test/S0915/BAC009S0915W0462.wav", "txt": "贵港民警追凶未言弃嫌犯一五年后落法网"} -{"key": "BAC009S0915W0463", "wav": "./aishell/wav/test/S0915/BAC009S0915W0463.wav", "txt": "游戏平台称投千元可收百万数十民上当"} -{"key": "BAC009S0915W0464", "wav": "./aishell/wav/test/S0915/BAC009S0915W0464.wav", "txt": "信息时报讯记者周伟龙天上不会掉馅饼"} -{"key": "BAC009S0915W0465", "wav": "./aishell/wav/test/S0915/BAC009S0915W0465.wav", "txt": "数十名市民赶到越秀区一酒家维权"} -{"key": "BAC009S0915W0466", "wav": "./aishell/wav/test/S0915/BAC009S0915W0466.wav", "txt": "称他们曾在这里被人游说注册了一游戏平台的账户"} -{"key": "BAC009S0915W0467", "wav": "./aishell/wav/test/S0915/BAC009S0915W0467.wav", "txt": "花费几千元至上万元不等"} -{"key": "BAC009S0915W0468", "wav": "./aishell/wav/test/S0915/BAC009S0915W0468.wav", "txt": "原以为可以按照游戏规则定期分红提现"} -{"key": "BAC009S0915W0469", "wav": "./aishell/wav/test/S0915/BAC009S0915W0469.wav", "txt": "孰料从上月底开始平台关闭"} -{"key": "BAC009S0915W0470", "wav": "./aishell/wav/test/S0915/BAC009S0915W0470.wav", "txt": "随后众人一起到东山派出所报案"} -{"key": "BAC009S0915W0471", "wav": "./aishell/wav/test/S0915/BAC009S0915W0471.wav", "txt": "有待警方进一步调查"} -{"key": "BAC009S0915W0472", "wav": "./aishell/wav/test/S0915/BAC009S0915W0472.wav", "txt": "游戏网站频遭攻击每周交二零零零元保护费息事宁人"} -{"key": "BAC009S0915W0473", "wav": "./aishell/wav/test/S0915/BAC009S0915W0473.wav", "txt": "办案民警检查作案设备金华警方供图昨天"} -{"key": "BAC009S0915W0474", "wav": "./aishell/wav/test/S0915/BAC009S0915W0474.wav", "txt": "记者从金华市公安局获悉"} -{"key": "BAC009S0915W0475", "wav": "./aishell/wav/test/S0915/BAC009S0915W0475.wav", "txt": "仅半年就敲诈勒索了五七二万元"} -{"key": "BAC009S0915W0476", "wav": "./aishell/wav/test/S0915/BAC009S0915W0476.wav", "txt": "该案也被列为公安部督办大案"} -{"key": "BAC009S0915W0477", "wav": "./aishell/wav/test/S0915/BAC009S0915W0477.wav", "txt": "警方已抓获一五名犯罪嫌疑人"} -{"key": "BAC009S0915W0478", "wav": "./aishell/wav/test/S0915/BAC009S0915W0478.wav", "txt": "湖北一七二名教师转岗当保安其中有人曾是校长"} -{"key": "BAC009S0915W0479", "wav": "./aishell/wav/test/S0915/BAC009S0915W0479.wav", "txt": "一身保安制服的他准时站在校门口"} -{"key": "BAC009S0915W0480", "wav": "./aishell/wav/test/S0915/BAC009S0915W0480.wav", "txt": "手握电动栅栏遥控器"} -{"key": "BAC009S0915W0481", "wav": "./aishell/wav/test/S0915/BAC009S0915W0481.wav", "txt": "眼睛警惕地注视着进出校门的车辆和学生"} -{"key": "BAC009S0915W0482", "wav": "./aishell/wav/test/S0915/BAC009S0915W0482.wav", "txt": "湖北一九岁女护士深夜遭抢劫杀害嫌疑嫌犯已落网"} -{"key": "BAC009S0915W0483", "wav": "./aishell/wav/test/S0915/BAC009S0915W0483.wav", "txt": "凶手被抓捕归案钟欣摄"} -{"key": 
"BAC009S0915W0484", "wav": "./aishell/wav/test/S0915/BAC009S0915W0484.wav", "txt": "湖北二五岁女子从未来例假基因检查是男身"} -{"key": "BAC009S0915W0485", "wav": "./aishell/wav/test/S0915/BAC009S0915W0485.wav", "txt": "家住汉阳的莎莎化名"} -{"key": "BAC009S0915W0486", "wav": "./aishell/wav/test/S0915/BAC009S0915W0486.wav", "txt": "近日在医院检查才发现"} -{"key": "BAC009S0915W0487", "wav": "./aishell/wav/test/S0915/BAC009S0915W0487.wav", "txt": "她的基因竟是个纯爷们"} -{"key": "BAC009S0915W0488", "wav": "./aishell/wav/test/S0915/BAC009S0915W0488.wav", "txt": "湖北三亿打造亚洲玫瑰基地多个种植园杂草丛生"} -{"key": "BAC009S0915W0489", "wav": "./aishell/wav/test/S0915/BAC009S0915W0489.wav", "txt": "湖北四名被捅法官脱离危险一女法官尚在哺乳期"} -{"key": "BAC009S0915W0490", "wav": "./aishell/wav/test/S0915/BAC009S0915W0490.wav", "txt": "经十堰市中级人民法院确认"} -{"key": "BAC009S0915W0491", "wav": "./aishell/wav/test/S0915/BAC009S0915W0491.wav", "txt": "四名法官系送达法律文书时被刺伤"} -{"key": "BAC009S0915W0492", "wav": "./aishell/wav/test/S0915/BAC009S0915W0492.wav", "txt": "目前均暂无生命危险"} -{"key": "BAC009S0915W0493", "wav": "./aishell/wav/test/S0915/BAC009S0915W0493.wav", "txt": "其中一女法官尚在哺乳期"} -{"key": "BAC009S0915W0494", "wav": "./aishell/wav/test/S0915/BAC009S0915W0494.wav", "txt": "湖北六零后求婚九零后被指责欠款六千万因诈骗取保候审"} -{"key": "BAC009S0915W0495", "wav": "./aishell/wav/test/S0915/BAC009S0915W0495.wav", "txt": "湖北黄石市一家商场前"} -{"key": "BAC009S0916W0121", "wav": "./aishell/wav/test/S0916/BAC009S0916W0121.wav", "txt": "真正落地的产品却非常地少"} -{"key": "BAC009S0916W0122", "wav": "./aishell/wav/test/S0916/BAC009S0916W0122.wav", "txt": "而落地后的产品与客户的期待甚远"} -{"key": "BAC009S0916W0123", "wav": "./aishell/wav/test/S0916/BAC009S0916W0123.wav", "txt": "这些状况每日均上演发生"} -{"key": "BAC009S0916W0124", "wav": "./aishell/wav/test/S0916/BAC009S0916W0124.wav", "txt": "大部分的创业者举步艰辛"} -{"key": "BAC009S0916W0125", "wav": "./aishell/wav/test/S0916/BAC009S0916W0125.wav", "txt": "钱烧完了东西出不来"} -{"key": "BAC009S0916W0126", "wav": "./aishell/wav/test/S0916/BAC009S0916W0126.wav", "txt": "创业者成了智慧时代的贡品"} -{"key": "BAC009S0916W0127", "wav": "./aishell/wav/test/S0916/BAC009S0916W0127.wav", "txt": "这不是这个时代的不公平"} -{"key": "BAC009S0916W0128", "wav": "./aishell/wav/test/S0916/BAC009S0916W0128.wav", "txt": "而是我们对这个时代了解的太少"} -{"key": "BAC009S0916W0129", "wav": "./aishell/wav/test/S0916/BAC009S0916W0129.wav", "txt": "如果我们懂得多一点智能家居产品市场的法则"} -{"key": "BAC009S0916W0130", "wav": "./aishell/wav/test/S0916/BAC009S0916W0130.wav", "txt": "我们的路也许会好走得多"} -{"key": "BAC009S0916W0131", "wav": "./aishell/wav/test/S0916/BAC009S0916W0131.wav", "txt": "一智能产品的安全"} -{"key": "BAC009S0916W0132", "wav": "./aishell/wav/test/S0916/BAC009S0916W0132.wav", "txt": "连接的最高代价就是安全问题"} -{"key": "BAC009S0916W0133", "wav": "./aishell/wav/test/S0916/BAC009S0916W0133.wav", "txt": "成千上万的产品通过无线连接"} -{"key": "BAC009S0916W0134", "wav": "./aishell/wav/test/S0916/BAC009S0916W0134.wav", "txt": "只要一个单品存在安全漏洞"} -{"key": "BAC009S0916W0135", "wav": "./aishell/wav/test/S0916/BAC009S0916W0135.wav", "txt": "整个系统的安全就会出现问题"} -{"key": "BAC009S0916W0136", "wav": "./aishell/wav/test/S0916/BAC009S0916W0136.wav", "txt": "产生非常可怕的结果"} -{"key": "BAC009S0916W0137", "wav": "./aishell/wav/test/S0916/BAC009S0916W0137.wav", "txt": "现阶段市场上落地的产品大多对安全的认知都存在缺陷"} -{"key": "BAC009S0916W0138", "wav": "./aishell/wav/test/S0916/BAC009S0916W0138.wav", "txt": "普遍认为现在的市场很小且还是单品"} -{"key": "BAC009S0916W0139", "wav": "./aishell/wav/test/S0916/BAC009S0916W0139.wav", "txt": "不用花那么大的成本去解决安全的问题"} -{"key": "BAC009S0916W0140", "wav": "./aishell/wav/test/S0916/BAC009S0916W0140.wav", "txt": "可大家必须明白一个道理"} -{"key": "BAC009S0916W0141", "wav": 
"./aishell/wav/test/S0916/BAC009S0916W0141.wav", "txt": "当大家习惯安全的问题留以后解决的时候"} -{"key": "BAC009S0916W0142", "wav": "./aishell/wav/test/S0916/BAC009S0916W0142.wav", "txt": "安全问题立即会成为你的内伤"} -{"key": "BAC009S0916W0143", "wav": "./aishell/wav/test/S0916/BAC009S0916W0143.wav", "txt": "但综观国内同类企业"} -{"key": "BAC009S0916W0144", "wav": "./aishell/wav/test/S0916/BAC009S0916W0144.wav", "txt": "以深圳智能锁业代表为例"} -{"key": "BAC009S0916W0145", "wav": "./aishell/wav/test/S0916/BAC009S0916W0145.wav", "txt": "在安全加解密认证等方面也做足了功夫"} -{"key": "BAC009S0916W0147", "wav": "./aishell/wav/test/S0916/BAC009S0916W0147.wav", "txt": "软件与硬件都做了深度的对接"} -{"key": "BAC009S0916W0148", "wav": "./aishell/wav/test/S0916/BAC009S0916W0148.wav", "txt": "把顾客个人资料全部归客户自己保管"} -{"key": "BAC009S0916W0149", "wav": "./aishell/wav/test/S0916/BAC009S0916W0149.wav", "txt": "企业不接触客户个人资料"} -{"key": "BAC009S0916W0150", "wav": "./aishell/wav/test/S0916/BAC009S0916W0150.wav", "txt": "许多企业都把取得顾客个人资料当作资本"} -{"key": "BAC009S0916W0151", "wav": "./aishell/wav/test/S0916/BAC009S0916W0151.wav", "txt": "这是智能家居行业的先例"} -{"key": "BAC009S0916W0152", "wav": "./aishell/wav/test/S0916/BAC009S0916W0152.wav", "txt": "必须具有高度习惯融合性和耐用性"} -{"key": "BAC009S0916W0153", "wav": "./aishell/wav/test/S0916/BAC009S0916W0153.wav", "txt": "这决不是八零九零的消费习惯这么单纯的问题"} -{"key": "BAC009S0916W0154", "wav": "./aishell/wav/test/S0916/BAC009S0916W0154.wav", "txt": "是每个家庭成员体验的统一"} -{"key": "BAC009S0916W0155", "wav": "./aishell/wav/test/S0916/BAC009S0916W0155.wav", "txt": "也就是每个成员综合体验的最大公约数"} -{"key": "BAC009S0916W0156", "wav": "./aishell/wav/test/S0916/BAC009S0916W0156.wav", "txt": "以情怀代替体验是非常错误的"} -{"key": "BAC009S0916W0157", "wav": "./aishell/wav/test/S0916/BAC009S0916W0157.wav", "txt": "产品的核心是客户的体验"} -{"key": "BAC009S0916W0158", "wav": "./aishell/wav/test/S0916/BAC009S0916W0158.wav", "txt": "顾客体验的核心是真善美"} -{"key": "BAC009S0916W0159", "wav": "./aishell/wav/test/S0916/BAC009S0916W0159.wav", "txt": "近来看到的许多创新型产品"} -{"key": "BAC009S0916W0160", "wav": "./aishell/wav/test/S0916/BAC009S0916W0160.wav", "txt": "可使用起来让人啼笑皆非"} -{"key": "BAC009S0916W0161", "wav": "./aishell/wav/test/S0916/BAC009S0916W0161.wav", "txt": "加解密的措施如同虚设"} -{"key": "BAC009S0916W0162", "wav": "./aishell/wav/test/S0916/BAC009S0916W0162.wav", "txt": "没有智慧手机的成员无法开门"} -{"key": "BAC009S0916W0163", "wav": "./aishell/wav/test/S0916/BAC009S0916W0163.wav", "txt": "这是一帮精英自恋情怀的产品"} -{"key": "BAC009S0916W0164", "wav": "./aishell/wav/test/S0916/BAC009S0916W0164.wav", "txt": "可美国的月亮总是比中国的亮"} -{"key": "BAC009S0916W0165", "wav": "./aishell/wav/test/S0916/BAC009S0916W0165.wav", "txt": "国内许多媒体或企业都在为其背书"} -{"key": "BAC009S0916W0166", "wav": "./aishell/wav/test/S0916/BAC009S0916W0166.wav", "txt": "而对国内比它更优秀的产品却集体失声"} -{"key": "BAC009S0916W0167", "wav": "./aishell/wav/test/S0916/BAC009S0916W0167.wav", "txt": "只要了解一点核桃锁信息的人都能第一时间感受到"} -{"key": "BAC009S0916W0168", "wav": "./aishell/wav/test/S0916/BAC009S0916W0168.wav", "txt": "一智能家居产品的销售渠道"} -{"key": "BAC009S0916W0169", "wav": "./aishell/wav/test/S0916/BAC009S0916W0169.wav", "txt": "你要懂既然不是电子产品不是易损品不是玩品"} -{"key": "BAC009S0916W0170", "wav": "./aishell/wav/test/S0916/BAC009S0916W0170.wav", "txt": "他是家居产品依托互联网技术升级的家居耐用品"} -{"key": "BAC009S0916W0171", "wav": "./aishell/wav/test/S0916/BAC009S0916W0171.wav", "txt": "这产品的换代周期会较长"} -{"key": "BAC009S0916W0172", "wav": "./aishell/wav/test/S0916/BAC009S0916W0172.wav", "txt": "购买的机会受时间的制约"} -{"key": "BAC009S0916W0173", "wav": "./aishell/wav/test/S0916/BAC009S0916W0173.wav", "txt": "而未来借助更多的互联网技术"} -{"key": "BAC009S0916W0174", "wav": 
"./aishell/wav/test/S0916/BAC009S0916W0174.wav", "txt": "产品的升级速度一定加快"} -{"key": "BAC009S0916W0175", "wav": "./aishell/wav/test/S0916/BAC009S0916W0175.wav", "txt": "而智能家居产品的特殊属性决定了销售渠道的模式"} -{"key": "BAC009S0916W0176", "wav": "./aishell/wav/test/S0916/BAC009S0916W0176.wav", "txt": "他不能按电子产品或传统居家产品的模式去销售"} -{"key": "BAC009S0916W0177", "wav": "./aishell/wav/test/S0916/BAC009S0916W0177.wav", "txt": "除了做好传统门店的体验销售电商平台销售外"} -{"key": "BAC009S0916W0178", "wav": "./aishell/wav/test/S0916/BAC009S0916W0178.wav", "txt": "希望智能家居产品企业在短期的高回报率也是不现实的"} -{"key": "BAC009S0916W0179", "wav": "./aishell/wav/test/S0916/BAC009S0916W0179.wav", "txt": "但可以肯定的他一定是最高成长的企行业"} -{"key": "BAC009S0916W0180", "wav": "./aishell/wav/test/S0916/BAC009S0916W0180.wav", "txt": "一大数据云计算不是你谈的"} -{"key": "BAC009S0916W0181", "wav": "./aishell/wav/test/S0916/BAC009S0916W0181.wav", "txt": "好像不谈你就不属于这个时代的人"} -{"key": "BAC009S0916W0182", "wav": "./aishell/wav/test/S0916/BAC009S0916W0182.wav", "txt": "作用大并不代表每个人"} -{"key": "BAC009S0916W0183", "wav": "./aishell/wav/test/S0916/BAC009S0916W0183.wav", "txt": "大数据云计算是非常烧钱的"} -{"key": "BAC009S0916W0184", "wav": "./aishell/wav/test/S0916/BAC009S0916W0184.wav", "txt": "不是一般的企业个人玩得起的"} -{"key": "BAC009S0916W0185", "wav": "./aishell/wav/test/S0916/BAC009S0916W0185.wav", "txt": "与其厌不其烦的谈论大数据云计算"} -{"key": "BAC009S0916W0186", "wav": "./aishell/wav/test/S0916/BAC009S0916W0186.wav", "txt": "不如做一款实实在在的好产品"} -{"key": "BAC009S0916W0187", "wav": "./aishell/wav/test/S0916/BAC009S0916W0187.wav", "txt": "但是却不在国家文件所指的收费公路范围内"} -{"key": "BAC009S0916W0188", "wav": "./aishell/wav/test/S0916/BAC009S0916W0188.wav", "txt": "而是一条市内快速路"} -{"key": "BAC009S0916W0189", "wav": "./aishell/wav/test/S0916/BAC009S0916W0189.wav", "txt": "对于这条特殊的隧道"} -{"key": "BAC009S0916W0190", "wav": "./aishell/wav/test/S0916/BAC009S0916W0190.wav", "txt": "省交通部门表示应该不会特殊"} -{"key": "BAC009S0916W0191", "wav": "./aishell/wav/test/S0916/BAC009S0916W0191.wav", "txt": "长江隧道估计也顶不住"} -{"key": "BAC009S0916W0192", "wav": "./aishell/wav/test/S0916/BAC009S0916W0192.wav", "txt": "对于提高重大节假日公路通行能力和服务水平"} -{"key": "BAC009S0916W0193", "wav": "./aishell/wav/test/S0916/BAC009S0916W0193.wav", "txt": "降低公众假日出行成本具有重要意义"} -{"key": "BAC009S0916W0194", "wav": "./aishell/wav/test/S0916/BAC009S0916W0194.wav", "txt": "具体工作将由各省区市政府负责统一组织实施"} -{"key": "BAC009S0916W0195", "wav": "./aishell/wav/test/S0916/BAC009S0916W0195.wav", "txt": "国务院及五部门并没有明确实施时间"} -{"key": "BAC009S0916W0196", "wav": "./aishell/wav/test/S0916/BAC009S0916W0196.wav", "txt": "着实让不少网友有些着急"} -{"key": "BAC009S0916W0197", "wav": "./aishell/wav/test/S0916/BAC009S0916W0197.wav", "txt": "免费新规究竟啥时能享受到"} -{"key": "BAC009S0916W0198", "wav": "./aishell/wav/test/S0916/BAC009S0916W0198.wav", "txt": "记者昨日第一时间从江苏省交通运输厅获悉"} -{"key": "BAC009S0916W0199", "wav": "./aishell/wav/test/S0916/BAC009S0916W0199.wav", "txt": "就国家方案我省还会进行再研究"} -{"key": "BAC009S0916W0200", "wav": "./aishell/wav/test/S0916/BAC009S0916W0200.wav", "txt": "具体执行时间由省政府定"} -{"key": "BAC009S0916W0201", "wav": "./aishell/wav/test/S0916/BAC009S0916W0201.wav", "txt": "今年国庆应该可以实施"} -{"key": "BAC009S0916W0202", "wav": "./aishell/wav/test/S0916/BAC009S0916W0202.wav", "txt": "可是通过收费站的车有大客车中型客车还有货车"} -{"key": "BAC009S0916W0203", "wav": "./aishell/wav/test/S0916/BAC009S0916W0203.wav", "txt": "到时候会不会乱成一锅粥"} -{"key": "BAC009S0916W0204", "wav": "./aishell/wav/test/S0916/BAC009S0916W0204.wav", "txt": "在国务院下发的文件中提及"} -{"key": "BAC009S0916W0205", "wav": "./aishell/wav/test/S0916/BAC009S0916W0205.wav", "txt": "为确保免费政策实施后车辆有序通行"} -{"key": "BAC009S0916W0206", "wav": 
"./aishell/wav/test/S0916/BAC009S0916W0206.wav", "txt": "各地区要对公路收费站现有车道进行全面调查"} -{"key": "BAC009S0916W0207", "wav": "./aishell/wav/test/S0916/BAC009S0916W0207.wav", "txt": "合理规划和利用现有收费车道和免费专用通道"} -{"key": "BAC009S0916W0208", "wav": "./aishell/wav/test/S0916/BAC009S0916W0208.wav", "txt": "确保过往车辆分类分车道有序通行"} -{"key": "BAC009S0916W0209", "wav": "./aishell/wav/test/S0916/BAC009S0916W0209.wav", "txt": "记者昨日从省交通部门了解到"} -{"key": "BAC009S0916W0210", "wav": "./aishell/wav/test/S0916/BAC009S0916W0210.wav", "txt": "这是一个比较复杂的问题"} -{"key": "BAC009S0916W0211", "wav": "./aishell/wav/test/S0916/BAC009S0916W0211.wav", "txt": "估计未来系统可能会改造"} -{"key": "BAC009S0916W0212", "wav": "./aishell/wav/test/S0916/BAC009S0916W0212.wav", "txt": "应该不会开免费车道"} -{"key": "BAC009S0916W0213", "wav": "./aishell/wav/test/S0916/BAC009S0916W0213.wav", "txt": "如果开了小车免费车道"} -{"key": "BAC009S0916W0214", "wav": "./aishell/wav/test/S0916/BAC009S0916W0214.wav", "txt": "有大车或是货车误闯或者闯进去了就不好办了"} -{"key": "BAC009S0916W0215", "wav": "./aishell/wav/test/S0916/BAC009S0916W0215.wav", "txt": "有关负责人告诉记者"} -{"key": "BAC009S0916W0216", "wav": "./aishell/wav/test/S0916/BAC009S0916W0216.wav", "txt": "省里会对此进行专门研究讨论"} -{"key": "BAC009S0916W0217", "wav": "./aishell/wav/test/S0916/BAC009S0916W0217.wav", "txt": "看看山东之前是怎么做的记者了解到"} -{"key": "BAC009S0916W0218", "wav": "./aishell/wav/test/S0916/BAC009S0916W0218.wav", "txt": "面对上述这些问题山东是怎么免费放行的呢"} -{"key": "BAC009S0916W0219", "wav": "./aishell/wav/test/S0916/BAC009S0916W0219.wav", "txt": "免费期间收费员还是按照正常放行的"} -{"key": "BAC009S0916W0220", "wav": "./aishell/wav/test/S0916/BAC009S0916W0220.wav", "txt": "山东潍坊的一位李先生告诉记者"} -{"key": "BAC009S0916W0221", "wav": "./aishell/wav/test/S0916/BAC009S0916W0221.wav", "txt": "今年大年初一他开车去海南"} -{"key": "BAC009S0916W0222", "wav": "./aishell/wav/test/S0916/BAC009S0916W0222.wav", "txt": "一路上很多省份的高速公路收费站都是免费放行"} -{"key": "BAC009S0916W0223", "wav": "./aishell/wav/test/S0916/BAC009S0916W0223.wav", "txt": "到了出口车道再把通行卡收回去"} -{"key": "BAC009S0916W0224", "wav": "./aishell/wav/test/S0916/BAC009S0916W0224.wav", "txt": "由于山东免费放行的时间不在春运最高峰"} -{"key": "BAC009S0916W0225", "wav": "./aishell/wav/test/S0916/BAC009S0916W0225.wav", "txt": "大年初一路上都没什么车"} -{"key": "BAC009S0916W0226", "wav": "./aishell/wav/test/S0916/BAC009S0916W0226.wav", "txt": "所以倒也没产生收费站排队的现象"} -{"key": "BAC009S0916W0227", "wav": "./aishell/wav/test/S0916/BAC009S0916W0227.wav", "txt": "扩大到四个小长假之后"} -{"key": "BAC009S0916W0228", "wav": "./aishell/wav/test/S0916/BAC009S0916W0228.wav", "txt": "国务院批准银行系基金公司再扩容"} -{"key": "BAC009S0916W0229", "wav": "./aishell/wav/test/S0916/BAC009S0916W0229.wav", "txt": "本报记者蔡宗琦中国证券报记者获悉"} -{"key": "BAC009S0916W0230", "wav": "./aishell/wav/test/S0916/BAC009S0916W0230.wav", "txt": "公募基金管理业务有关工作"} -{"key": "BAC009S0916W0231", "wav": "./aishell/wav/test/S0916/BAC009S0916W0231.wav", "txt": "积极推动基金产品审核制度改革"} -{"key": "BAC009S0916W0232", "wav": "./aishell/wav/test/S0916/BAC009S0916W0232.wav", "txt": "鼓励更多资金投资资本市场"} -{"key": "BAC009S0916W0233", "wav": "./aishell/wav/test/S0916/BAC009S0916W0233.wav", "txt": "先后两批共八家商业银行设立或参股八家基金管理公司"} -{"key": "BAC009S0916W0234", "wav": "./aishell/wav/test/S0916/BAC009S0916W0234.wav", "txt": "试点基金管理公司发展态势良好"} -{"key": "BAC009S0916W0235", "wav": "./aishell/wav/test/S0916/BAC009S0916W0235.wav", "txt": "工商银行建设银行和交通银行为首批试点银行"} -{"key": "BAC009S0916W0236", "wav": "./aishell/wav/test/S0916/BAC009S0916W0236.wav", "txt": "增加机构投资者数量"} -{"key": "BAC009S0916W0237", "wav": "./aishell/wav/test/S0916/BAC009S0916W0237.wav", "txt": "促进基金行业规范发展"} -{"key": "BAC009S0916W0238", "wav": 
"./aishell/wav/test/S0916/BAC009S0916W0238.wav", "txt": "为商业银行探索跨业经营运作积累经验"} -{"key": "BAC009S0916W0239", "wav": "./aishell/wav/test/S0916/BAC009S0916W0239.wav", "txt": "此举可能将进一步推动金融混业经营"} -{"key": "BAC009S0916W0240", "wav": "./aishell/wav/test/S0916/BAC009S0916W0240.wav", "txt": "随着对商业银行设立基金管理公司门槛放宽"} -{"key": "BAC009S0916W0241", "wav": "./aishell/wav/test/S0916/BAC009S0916W0241.wav", "txt": "我国资本市场将迎来更多机构投资者"} -{"key": "BAC009S0916W0242", "wav": "./aishell/wav/test/S0916/BAC009S0916W0242.wav", "txt": "更加有利于价值投资理念形成"} -{"key": "BAC009S0916W0243", "wav": "./aishell/wav/test/S0916/BAC009S0916W0243.wav", "txt": "保险资产管理公司如符合有关规定"} -{"key": "BAC009S0916W0244", "wav": "./aishell/wav/test/S0916/BAC009S0916W0244.wav", "txt": "可以向有关金融监管部门申请"} -{"key": "BAC009S0916W0245", "wav": "./aishell/wav/test/S0916/BAC009S0916W0245.wav", "txt": "依法开展公募性质的资产管理业务"} -{"key": "BAC009S0916W0246", "wav": "./aishell/wav/test/S0916/BAC009S0916W0246.wav", "txt": "通知扩大保险资管公司业务范围"} -{"key": "BAC009S0916W0247", "wav": "./aishell/wav/test/S0916/BAC009S0916W0247.wav", "txt": "这体现出监管部门开放管理的思路"} -{"key": "BAC009S0916W0248", "wav": "./aishell/wav/test/S0916/BAC009S0916W0248.wav", "txt": "允许各类资产管理公司同台竞技"} -{"key": "BAC009S0916W0249", "wav": "./aishell/wav/test/S0916/BAC009S0916W0249.wav", "txt": "在遴选优质管理人提升保险资金投资收益率的同时"} -{"key": "BAC009S0916W0250", "wav": "./aishell/wav/test/S0916/BAC009S0916W0250.wav", "txt": "也通过机构间的竞争促进保险资管公司的转型发展"} -{"key": "BAC009S0916W0251", "wav": "./aishell/wav/test/S0916/BAC009S0916W0251.wav", "txt": "明确了参股基金管理公司股东"} -{"key": "BAC009S0916W0252", "wav": "./aishell/wav/test/S0916/BAC009S0916W0252.wav", "txt": "证监会新闻发言人邓给解释"} -{"key": "BAC009S0916W0253", "wav": "./aishell/wav/test/S0916/BAC009S0916W0253.wav", "txt": "中国的银行居垄断地位"} -{"key": "BAC009S0916W0254", "wav": "./aishell/wav/test/S0916/BAC009S0916W0254.wav", "txt": "作风向来无耻加强势苹果也因强势出名"} -{"key": "BAC009S0916W0255", "wav": "./aishell/wav/test/S0916/BAC009S0916W0255.wav", "txt": "可参照中国移动和苹果的合作传闻中"} -{"key": "BAC009S0916W0257", "wav": "./aishell/wav/test/S0916/BAC009S0916W0257.wav", "txt": "这估计很难让掉进钱眼儿的四大银行接受"} -{"key": "BAC009S0916W0260", "wav": "./aishell/wav/test/S0916/BAC009S0916W0260.wav", "txt": "但却鲜有人会像苹果的服务付费"} -{"key": "BAC009S0916W0261", "wav": "./aishell/wav/test/S0916/BAC009S0916W0261.wav", "txt": "更现实的的困难在于"} -{"key": "BAC009S0916W0263", "wav": "./aishell/wav/test/S0916/BAC009S0916W0263.wav", "txt": "粗估下来大概要七十亿"} -{"key": "BAC009S0916W0264", "wav": "./aishell/wav/test/S0916/BAC009S0916W0264.wav", "txt": "这还不包括改造过程中的渠道分食"} -{"key": "BAC009S0916W0265", "wav": "./aishell/wav/test/S0916/BAC009S0916W0265.wav", "txt": "以及给领导们的审批费用"} -{"key": "BAC009S0916W0266", "wav": "./aishell/wav/test/S0916/BAC009S0916W0266.wav", "txt": "从支付的大环境上看"} -{"key": "BAC009S0916W0268", "wav": "./aishell/wav/test/S0916/BAC009S0916W0268.wav", "txt": "它依旧要面对政府的刁难"} -{"key": "BAC009S0916W0270", "wav": "./aishell/wav/test/S0916/BAC009S0916W0270.wav", "txt": "政府失控的可不是什么隐私了"} -{"key": "BAC009S0916W0271", "wav": "./aishell/wav/test/S0916/BAC009S0916W0271.wav", "txt": "而是实实在在的金融命脉"} -{"key": "BAC009S0916W0272", "wav": "./aishell/wav/test/S0916/BAC009S0916W0272.wav", "txt": "慈禧太后就因乔致庸创办了票号"} -{"key": "BAC009S0916W0273", "wav": "./aishell/wav/test/S0916/BAC009S0916W0273.wav", "txt": "害怕其掌握国家金融命脉"} -{"key": "BAC009S0916W0274", "wav": "./aishell/wav/test/S0916/BAC009S0916W0274.wav", "txt": "而将他软禁十年之久"} -{"key": "BAC009S0916W0275", "wav": "./aishell/wav/test/S0916/BAC009S0916W0275.wav", "txt": "何况是一个来自美帝的小苹果呢"} -{"key": "BAC009S0916W0276", "wav": 
"./aishell/wav/test/S0916/BAC009S0916W0276.wav", "txt": "科幻星系康斯坦丁文"} -{"key": "BAC009S0916W0277", "wav": "./aishell/wav/test/S0916/BAC009S0916W0277.wav", "txt": "苹果一口气召开了两次新品发布会"} -{"key": "BAC009S0916W0278", "wav": "./aishell/wav/test/S0916/BAC009S0916W0278.wav", "txt": "就在会场的凳子和垃圾尚未收拾干净的时候"} -{"key": "BAC009S0916W0279", "wav": "./aishell/wav/test/S0916/BAC009S0916W0279.wav", "txt": "全世界的报道已经蜂拥而至"} -{"key": "BAC009S0916W0280", "wav": "./aishell/wav/test/S0916/BAC009S0916W0280.wav", "txt": "失望中夹杂着嘲讽的情绪霸占了各模块的头条"} -{"key": "BAC009S0916W0281", "wav": "./aishell/wav/test/S0916/BAC009S0916W0281.wav", "txt": "据华尔街日报网站报道"} -{"key": "BAC009S0916W0282", "wav": "./aishell/wav/test/S0916/BAC009S0916W0282.wav", "txt": "在自己全身心的努力和坚持之下"} -{"key": "BAC009S0916W0284", "wav": "./aishell/wav/test/S0916/BAC009S0916W0284.wav", "txt": "艾维本周四晚在旧金山现代艺术馆向大众表示"} -{"key": "BAC009S0916W0286", "wav": "./aishell/wav/test/S0916/BAC009S0916W0286.wav", "txt": "主要是因为社会对可穿戴智能手表的期望太高"} -{"key": "BAC009S0916W0287", "wav": "./aishell/wav/test/S0916/BAC009S0916W0287.wav", "txt": "手腕是配戴轻便型互动设备与休闲设备的理想之处"} -{"key": "BAC009S0916W0288", "wav": "./aishell/wav/test/S0916/BAC009S0916W0288.wav", "txt": "但不适合那些笨重的解读设备"} -{"key": "BAC009S0916W0289", "wav": "./aishell/wav/test/S0916/BAC009S0916W0289.wav", "txt": "艾维表示尽管苹果智能手表拥有诸多功能"} -{"key": "BAC009S0916W0290", "wav": "./aishell/wav/test/S0916/BAC009S0916W0290.wav", "txt": "这种产品的设计仍需考虑文化历史和未来等因素"} -{"key": "BAC009S0916W0291", "wav": "./aishell/wav/test/S0916/BAC009S0916W0291.wav", "txt": "艾维现为苹果主管设计业务的高级副总裁"} -{"key": "BAC009S0916W0292", "wav": "./aishell/wav/test/S0916/BAC009S0916W0292.wav", "txt": "帮助设计了苹果多项产品的外观和用户体验"} -{"key": "BAC009S0916W0294", "wav": "./aishell/wav/test/S0916/BAC009S0916W0294.wav", "txt": "苹果计划于明年初开始销售其智能手表"} -{"key": "BAC009S0916W0295", "wav": "./aishell/wav/test/S0916/BAC009S0916W0295.wav", "txt": "该公司于上个月简单地宣布了智能手表相关的情况"} -{"key": "BAC009S0916W0296", "wav": "./aishell/wav/test/S0916/BAC009S0916W0296.wav", "txt": "其将提供三种版本的智能手表"} -{"key": "BAC009S0916W0297", "wav": "./aishell/wav/test/S0916/BAC009S0916W0297.wav", "txt": "起步价为三百四九美元十"} -{"key": "BAC009S0916W0298", "wav": "./aishell/wav/test/S0916/BAC009S0916W0298.wav", "txt": "苹果没有透露更昂贵智能手表的具体售价"} -{"key": "BAC009S0916W0299", "wav": "./aishell/wav/test/S0916/BAC009S0916W0299.wav", "txt": "这些手表将配置不同的表带"} -{"key": "BAC009S0916W0300", "wav": "./aishell/wav/test/S0916/BAC009S0916W0300.wav", "txt": "以满足不同用户的需求"} -{"key": "BAC009S0916W0301", "wav": "./aishell/wav/test/S0916/BAC009S0916W0301.wav", "txt": "市场上还有诸多其他制造商也在尝试生产智能手表"} -{"key": "BAC009S0916W0302", "wav": "./aishell/wav/test/S0916/BAC009S0916W0302.wav", "txt": "但这些厂商的产品都难以进入主流"} -{"key": "BAC009S0916W0303", "wav": "./aishell/wav/test/S0916/BAC009S0916W0303.wav", "txt": "这是未来的必经之路"} -{"key": "BAC009S0916W0304", "wav": "./aishell/wav/test/S0916/BAC009S0916W0304.wav", "txt": "美的家用空调事业部总裁吴文新表示"} -{"key": "BAC009S0916W0305", "wav": "./aishell/wav/test/S0916/BAC009S0916W0305.wav", "txt": "每日经济新闻记者从美的家用空调事业部了解到"} -{"key": "BAC009S0916W0306", "wav": "./aishell/wav/test/S0916/BAC009S0916W0306.wav", "txt": "自二零一一年事业部启动自动化升级至今的四年里"} -{"key": "BAC009S0916W0307", "wav": "./aishell/wav/test/S0916/BAC009S0916W0307.wav", "txt": "工人数量减少近一半"} -{"key": "BAC009S0916W0308", "wav": "./aishell/wav/test/S0916/BAC009S0916W0308.wav", "txt": "美的家用空调事业部制造副总裁乌守保对记者"} -{"key": "BAC009S0916W0309", "wav": "./aishell/wav/test/S0916/BAC009S0916W0309.wav", "txt": "老板电器的新增量创新需求追求极致搜狐科技"} -{"key": "BAC009S0916W0310", "wav": "./aishell/wav/test/S0916/BAC009S0916W0310.wav", "txt": 
"质变中的世界工厂中国正在由中国制造向中国智造蜕变"} -{"key": "BAC009S0916W0311", "wav": "./aishell/wav/test/S0916/BAC009S0916W0311.wav", "txt": "如何借力拥抱互联网加这一全新变量"} -{"key": "BAC009S0916W0312", "wav": "./aishell/wav/test/S0916/BAC009S0916W0312.wav", "txt": "如何重新激活内部潜能"} -{"key": "BAC009S0916W0313", "wav": "./aishell/wav/test/S0916/BAC009S0916W0313.wav", "txt": "便是区别行业龙头企业经营智慧高低的关键时刻"} -{"key": "BAC009S0916W0314", "wav": "./aishell/wav/test/S0916/BAC009S0916W0314.wav", "txt": "身处传统白色家电领域中的重要一支到厨房电器"} -{"key": "BAC009S0916W0315", "wav": "./aishell/wav/test/S0916/BAC009S0916W0315.wav", "txt": "多年来保持奇高市占率的老板电器"} -{"key": "BAC009S0916W0316", "wav": "./aishell/wav/test/S0916/BAC009S0916W0316.wav", "txt": "在成名三十馀年后仍在竭力寻求业态的新鲜化和可能性"} -{"key": "BAC009S0916W0317", "wav": "./aishell/wav/test/S0916/BAC009S0916W0317.wav", "txt": "能否找到厨电行业下一个未知的增量"} -{"key": "BAC009S0916W0318", "wav": "./aishell/wav/test/S0916/BAC009S0916W0318.wav", "txt": "也成为老板电器和它的宿敌们能否领跑下半程的关键"} -{"key": "BAC009S0916W0319", "wav": "./aishell/wav/test/S0916/BAC009S0916W0319.wav", "txt": "阐述老板电器和内部创新外部国际化如何进行破题"} -{"key": "BAC009S0916W0320", "wav": "./aishell/wav/test/S0916/BAC009S0916W0320.wav", "txt": "老板电器如何看待公司的创新驱动"} -{"key": "BAC009S0916W0321", "wav": "./aishell/wav/test/S0916/BAC009S0916W0321.wav", "txt": "赵继宏老板电器做厨电已经三十多年了"} -{"key": "BAC009S0916W0322", "wav": "./aishell/wav/test/S0916/BAC009S0916W0322.wav", "txt": "作为企业理念和产品技术必须要走在时代的前面"} -{"key": "BAC009S0916W0323", "wav": "./aishell/wav/test/S0916/BAC009S0916W0323.wav", "txt": "现在中国的八十五后和九十后消费人群已经成为消费主体"} -{"key": "BAC009S0916W0324", "wav": "./aishell/wav/test/S0916/BAC009S0916W0324.wav", "txt": "他们需要的是智能厨房智能家居与家电"} -{"key": "BAC009S0916W0325", "wav": "./aishell/wav/test/S0916/BAC009S0916W0325.wav", "txt": "公司为此研发并推出市场的智能产品非常贴近市场"} -{"key": "BAC009S0916W0326", "wav": "./aishell/wav/test/S0916/BAC009S0916W0326.wav", "txt": "围绕消费者消费者需要什么"} -{"key": "BAC009S0916W0327", "wav": "./aishell/wav/test/S0916/BAC009S0916W0327.wav", "txt": "我们开发什么的产品研发策略"} -{"key": "BAC009S0916W0328", "wav": "./aishell/wav/test/S0916/BAC009S0916W0328.wav", "txt": "除了产品功能必须不错之外"} -{"key": "BAC009S0916W0329", "wav": "./aishell/wav/test/S0916/BAC009S0916W0329.wav", "txt": "以保证持续长久的黏性互动"} -{"key": "BAC009S0916W0330", "wav": "./aishell/wav/test/S0916/BAC009S0916W0330.wav", "txt": "产品创新其实也是一个双向互动的过程"} -{"key": "BAC009S0916W0331", "wav": "./aishell/wav/test/S0916/BAC009S0916W0331.wav", "txt": "现在消费者的需求越来越个性化差异化"} -{"key": "BAC009S0916W0332", "wav": "./aishell/wav/test/S0916/BAC009S0916W0332.wav", "txt": "可以和我们的消费者有很多的互动并提供超值服务"} -{"key": "BAC009S0916W0333", "wav": "./aishell/wav/test/S0916/BAC009S0916W0333.wav", "txt": "这些都是和消费者增添黏性互动的方式"} -{"key": "BAC009S0916W0334", "wav": "./aishell/wav/test/S0916/BAC009S0916W0334.wav", "txt": "这个方向的创新以后还有更多的东西可以发挥作用"} -{"key": "BAC009S0916W0335", "wav": "./aishell/wav/test/S0916/BAC009S0916W0335.wav", "txt": "如今的智能家电更多意义上是智能加上互动"} -{"key": "BAC009S0916W0336", "wav": "./aishell/wav/test/S0916/BAC009S0916W0336.wav", "txt": "也就是老板电器总结的自动加互动"} -{"key": "BAC009S0916W0338", "wav": "./aishell/wav/test/S0916/BAC009S0916W0338.wav", "txt": "只要在明天的最后一战中赢下东道主日本"} -{"key": "BAC009S0916W0339", "wav": "./aishell/wav/test/S0916/BAC009S0916W0339.wav", "txt": "高清女排力擒俄罗斯夺冠占主动众将喜极而泣"} -{"key": "BAC009S0916W0340", "wav": "./aishell/wav/test/S0916/BAC009S0916W0340.wav", "txt": "今天大家打得都挺好的"} -{"key": "BAC009S0916W0341", "wav": "./aishell/wav/test/S0916/BAC009S0916W0341.wav", "txt": "我们是一条心在打团结作战"} -{"key": "BAC009S0916W0342", "wav": "./aishell/wav/test/S0916/BAC009S0916W0342.wav", "txt": "赛后主攻手朱婷对记者说"} -{"key": 
"BAC009S0916W0343", "wav": "./aishell/wav/test/S0916/BAC009S0916W0343.wav", "txt": "本场比赛朱婷三七次扣球得到二十一分"} -{"key": "BAC009S0916W0344", "wav": "./aishell/wav/test/S0916/BAC009S0916W0344.wav", "txt": "此外她还凭借拦网和发球分别拿到七分和一分"} -{"key": "BAC009S0916W0345", "wav": "./aishell/wav/test/S0916/BAC009S0916W0345.wav", "txt": "我觉得自己的脚伤已经完全恢复了"} -{"key": "BAC009S0916W0346", "wav": "./aishell/wav/test/S0916/BAC009S0916W0346.wav", "txt": "对弹跳没有什么影响"} -{"key": "BAC009S0916W0347", "wav": "./aishell/wav/test/S0916/BAC009S0916W0347.wav", "txt": "当在新闻发布会上被问及伤情的时候"} -{"key": "BAC009S0916W0348", "wav": "./aishell/wav/test/S0916/BAC009S0916W0348.wav", "txt": "在第四轮与韩国队的比赛中"} -{"key": "BAC009S0916W0349", "wav": "./aishell/wav/test/S0916/BAC009S0916W0349.wav", "txt": "朱婷在第四局比赛中意外崴脚"} -{"key": "BAC009S0916W0350", "wav": "./aishell/wav/test/S0916/BAC009S0916W0350.wav", "txt": "今天出色的数据也佐证了她身体的康复情况良好"} -{"key": "BAC009S0916W0351", "wav": "./aishell/wav/test/S0916/BAC009S0916W0351.wav", "txt": "作为队里年龄最大的球员"} -{"key": "BAC009S0916W0352", "wav": "./aishell/wav/test/S0916/BAC009S0916W0352.wav", "txt": "最终拦网和扣球均得到六这些分"} -{"key": "BAC009S0916W0353", "wav": "./aishell/wav/test/S0916/BAC009S0916W0353.wav", "txt": "位列球队发球榜首位和拦网榜的第二位"} -{"key": "BAC009S0916W0354", "wav": "./aishell/wav/test/S0916/BAC009S0916W0354.wav", "txt": "大家今天打得非常出色"} -{"key": "BAC009S0916W0355", "wav": "./aishell/wav/test/S0916/BAC009S0916W0355.wav", "txt": "能够在这个集体与可爱的队友一起拼杀"} -{"key": "BAC009S0916W0356", "wav": "./aishell/wav/test/S0916/BAC009S0916W0356.wav", "txt": "我感到非常骄傲和自豪"} -{"key": "BAC009S0916W0357", "wav": "./aishell/wav/test/S0916/BAC009S0916W0357.wav", "txt": "在赛后发布会上颜妮对记者说"} -{"key": "BAC009S0916W0358", "wav": "./aishell/wav/test/S0916/BAC009S0916W0358.wav", "txt": "在复盘与俄罗斯一战时"} -{"key": "BAC009S0916W0359", "wav": "./aishell/wav/test/S0916/BAC009S0916W0359.wav", "txt": "这场比赛前教练给我们布置了很多"} -{"key": "BAC009S0916W0360", "wav": "./aishell/wav/test/S0916/BAC009S0916W0360.wav", "txt": "作为就是我上场多去贯彻教练意图"} -{"key": "BAC009S0916W0361", "wav": "./aishell/wav/test/S0916/BAC009S0916W0361.wav", "txt": "颜妮坦言今天俄罗斯表现很好"} -{"key": "BAC009S0916W0362", "wav": "./aishell/wav/test/S0916/BAC009S0916W0362.wav", "txt": "我们两家有时候比较像"} -{"key": "BAC009S0916W0363", "wav": "./aishell/wav/test/S0916/BAC009S0916W0363.wav", "txt": "当被问及新老队员相互担当弥补的话题时"} -{"key": "BAC009S0916W0364", "wav": "./aishell/wav/test/S0916/BAC009S0916W0364.wav", "txt": "颜妮坦言自己的发挥也不是特别稳定"} -{"key": "BAC009S0916W0365", "wav": "./aishell/wav/test/S0916/BAC009S0916W0365.wav", "txt": "但有起伏应该是正常的"} -{"key": "BAC009S0916W0366", "wav": "./aishell/wav/test/S0916/BAC009S0916W0366.wav", "txt": "作为老队员我要多承担"} -{"key": "BAC009S0916W0367", "wav": "./aishell/wav/test/S0916/BAC009S0916W0367.wav", "txt": "用实际行动来弥补不足"} -{"key": "BAC009S0916W0368", "wav": "./aishell/wav/test/S0916/BAC009S0916W0368.wav", "txt": "搜狐体育郭健文"} -{"key": "BAC009S0916W0369", "wav": "./aishell/wav/test/S0916/BAC009S0916W0369.wav", "txt": "女排众将手举国旗敬夺冠"} -{"key": "BAC009S0916W0370", "wav": "./aishell/wav/test/S0916/BAC009S0916W0370.wav", "txt": "拿到了明年里约奥运会的入场券"} -{"key": "BAC009S0916W0371", "wav": "./aishell/wav/test/S0916/BAC009S0916W0371.wav", "txt": "在接受中央电视台记者采访时朱婷表示"} -{"key": "BAC009S0916W0372", "wav": "./aishell/wav/test/S0916/BAC009S0916W0372.wav", "txt": "全队上下面对了巨大困难"} -{"key": "BAC009S0916W0373", "wav": "./aishell/wav/test/S0916/BAC009S0916W0373.wav", "txt": "其中郎平主教练最为辛苦"} -{"key": "BAC009S0916W0374", "wav": "./aishell/wav/test/S0916/BAC009S0916W0374.wav", "txt": "今晚的比赛中朱婷独得二十七分"} -{"key": "BAC009S0916W0375", "wav": 
"./aishell/wav/test/S0916/BAC009S0916W0375.wav", "txt": "再度成为了比赛的得分王"} -{"key": "BAC009S0916W0377", "wav": "./aishell/wav/test/S0916/BAC009S0916W0377.wav", "txt": "但今天能拿冠军真的是发自肺腑的想哭"} -{"key": "BAC009S0916W0378", "wav": "./aishell/wav/test/S0916/BAC009S0916W0378.wav", "txt": "面对日本队的魔鬼主场"} -{"key": "BAC009S0916W0379", "wav": "./aishell/wav/test/S0916/BAC009S0916W0379.wav", "txt": "中国女排表示承受了巨大的压力"} -{"key": "BAC009S0916W0380", "wav": "./aishell/wav/test/S0916/BAC009S0916W0380.wav", "txt": "朱婷表示我想日本肯定也会拼我们"} -{"key": "BAC009S0916W0381", "wav": "./aishell/wav/test/S0916/BAC009S0916W0381.wav", "txt": "做了很多很多困难准备"} -{"key": "BAC009S0916W0382", "wav": "./aishell/wav/test/S0916/BAC009S0916W0382.wav", "txt": "如果输了就不太好说了"} -{"key": "BAC009S0916W0383", "wav": "./aishell/wav/test/S0916/BAC009S0916W0383.wav", "txt": "但是里面不是淡定的"} -{"key": "BAC009S0916W0384", "wav": "./aishell/wav/test/S0916/BAC009S0916W0384.wav", "txt": "中国队连续三位主力因伤缺战"} -{"key": "BAC009S0916W0385", "wav": "./aishell/wav/test/S0916/BAC009S0916W0385.wav", "txt": "大家可能觉得我们这支队伍很苦"} -{"key": "BAC009S0916W0386", "wav": "./aishell/wav/test/S0916/BAC009S0916W0386.wav", "txt": "但我觉得郎导是最苦的"} -{"key": "BAC009S0916W0387", "wav": "./aishell/wav/test/S0916/BAC009S0916W0387.wav", "txt": "朱婷表示其实我也想"} -{"key": "BAC009S0916W0388", "wav": "./aishell/wav/test/S0916/BAC009S0916W0388.wav", "txt": "女排三零阿根廷朱婷复出扣杀状态神勇"} -{"key": "BAC009S0916W0389", "wav": "./aishell/wav/test/S0916/BAC009S0916W0389.wav", "txt": "全场比赛的焦点是休战三场后重新登场的名将朱婷"} -{"key": "BAC009S0916W0390", "wav": "./aishell/wav/test/S0916/BAC009S0916W0390.wav", "txt": "拿下全场最高分的朱婷赛后表示"} -{"key": "BAC009S0916W0391", "wav": "./aishell/wav/test/S0916/BAC009S0916W0391.wav", "txt": "在八月二十六日中国队和韩国队的比赛中"} -{"key": "BAC009S0916W0392", "wav": "./aishell/wav/test/S0916/BAC009S0916W0392.wav", "txt": "朱婷崴脚之后带伤率队取胜"} -{"key": "BAC009S0916W0393", "wav": "./aishell/wav/test/S0916/BAC009S0916W0393.wav", "txt": "主教练郎平都没有派她出场"} -{"key": "BAC009S0916W0394", "wav": "./aishell/wav/test/S0916/BAC009S0916W0394.wav", "txt": "一日晚的中阿之战"} -{"key": "BAC009S0916W0395", "wav": "./aishell/wav/test/S0916/BAC009S0916W0395.wav", "txt": "重新以首发身份登场的朱婷迅速找回比赛的感觉"} -{"key": "BAC009S0916W0396", "wav": "./aishell/wav/test/S0916/BAC009S0916W0396.wav", "txt": "赛后被评为当场最佳球员"} -{"key": "BAC009S0916W0397", "wav": "./aishell/wav/test/S0916/BAC009S0916W0397.wav", "txt": "这也是她在本届世界杯上第二次获得全场最佳"} -{"key": "BAC009S0916W0398", "wav": "./aishell/wav/test/S0916/BAC009S0916W0398.wav", "txt": "在场上移动很好"} -{"key": "BAC009S0916W0399", "wav": "./aishell/wav/test/S0916/BAC009S0916W0399.wav", "txt": "朱婷在谈到大家关心的脚伤时说"} -{"key": "BAC009S0916W0400", "wav": "./aishell/wav/test/S0916/BAC009S0916W0400.wav", "txt": "在冈山的桃太郎体育馆"} -{"key": "BAC009S0916W0401", "wav": "./aishell/wav/test/S0916/BAC009S0916W0401.wav", "txt": "当地华人团体组织了不少球迷为中国队加油"} -{"key": "BAC009S0916W0402", "wav": "./aishell/wav/test/S0916/BAC009S0916W0402.wav", "txt": "这样的氛围让朱婷感觉像是主场一样"} -{"key": "BAC009S0916W0403", "wav": "./aishell/wav/test/S0916/BAC009S0916W0403.wav", "txt": "大家赢球比自己获得最佳还要高兴"} -{"key": "BAC009S0916W0404", "wav": "./aishell/wav/test/S0916/BAC009S0916W0404.wav", "txt": "这部电影从二零一三年就已经开始筹备了"} -{"key": "BAC009S0916W0405", "wav": "./aishell/wav/test/S0916/BAC009S0916W0405.wav", "txt": "前后打磨了两年时间才得以完成"} -{"key": "BAC009S0916W0406", "wav": "./aishell/wav/test/S0916/BAC009S0916W0406.wav", "txt": "与奥斯卡影帝本金斯利同时出现在海报中央"} -{"key": "BAC009S0916W0407", "wav": "./aishell/wav/test/S0916/BAC009S0916W0407.wav", "txt": "雷诺兹持枪的造型和他在冥界警局里的颇为相似"} -{"key": "BAC009S0916W0408", "wav": 
"./aishell/wav/test/S0916/BAC009S0916W0408.wav", "txt": "那些年女神陈妍希近来瘦身有成"} -{"key": "BAC009S0916W0409", "wav": "./aishell/wav/test/S0916/BAC009S0916W0409.wav", "txt": "不仅摆脱神雕侠侣时期的小笼包名号"} -{"key": "BAC009S0916W0410", "wav": "./aishell/wav/test/S0916/BAC009S0916W0410.wav", "txt": "日前在大陆真人秀节目秀出两条雪白大长腿"} -{"key": "BAC009S0916W0411", "wav": "./aishell/wav/test/S0916/BAC009S0916W0411.wav", "txt": "更让粉丝看了鼻血直流"} -{"key": "BAC009S0916W0412", "wav": "./aishell/wav/test/S0916/BAC009S0916W0412.wav", "txt": "只不过好景不常"} -{"key": "BAC009S0916W0413", "wav": "./aishell/wav/test/S0916/BAC009S0916W0413.wav", "txt": "她最近又被拍到崩坏实录"} -{"key": "BAC009S0916W0414", "wav": "./aishell/wav/test/S0916/BAC009S0916W0414.wav", "txt": "乱糟糟的马尾加上宽松衣服的村姑打扮"} -{"key": "BAC009S0916W0415", "wav": "./aishell/wav/test/S0916/BAC009S0916W0415.wav", "txt": "搜狐娱乐讯名为娱乐圈八卦的自媒体"} -{"key": "BAC009S0916W0416", "wav": "./aishell/wav/test/S0916/BAC009S0916W0416.wav", "txt": "曝出陈妍希拍戏时突然干呕"} -{"key": "BAC009S0916W0417", "wav": "./aishell/wav/test/S0916/BAC009S0916W0417.wav", "txt": "推断其已怀孕"} -{"key": "BAC009S0916W0418", "wav": "./aishell/wav/test/S0916/BAC009S0916W0418.wav", "txt": "应该是月初吧"} -{"key": "BAC009S0916W0419", "wav": "./aishell/wav/test/S0916/BAC009S0916W0419.wav", "txt": "小笼包身体有反应"} -{"key": "BAC009S0916W0420", "wav": "./aishell/wav/test/S0916/BAC009S0916W0420.wav", "txt": "她突然就干呕"} -{"key": "BAC009S0916W0421", "wav": "./aishell/wav/test/S0916/BAC009S0916W0421.wav", "txt": "陈妍希还去医院做了检查"} -{"key": "BAC009S0916W0422", "wav": "./aishell/wav/test/S0916/BAC009S0916W0422.wav", "txt": "她的团队对她更加关心了"} -{"key": "BAC009S0916W0423", "wav": "./aishell/wav/test/S0916/BAC009S0916W0423.wav", "txt": "中新网六月十六日电六月十六日是容祖儿的生日"} -{"key": "BAC009S0916W0424", "wav": "./aishell/wav/test/S0916/BAC009S0916W0424.wav", "txt": "陈妍希晒出与容祖儿合照"} -{"key": "BAC009S0916W0425", "wav": "./aishell/wav/test/S0916/BAC009S0916W0425.wav", "txt": "并送上真挚祝福"} -{"key": "BAC009S0916W0426", "wav": "./aishell/wav/test/S0916/BAC009S0916W0426.wav", "txt": "祝可爱的你"} -{"key": "BAC009S0916W0427", "wav": "./aishell/wav/test/S0916/BAC009S0916W0427.wav", "txt": "每一天都要快乐喔"} -{"key": "BAC009S0916W0428", "wav": "./aishell/wav/test/S0916/BAC009S0916W0428.wav", "txt": "中新网九月二十五日电据台湾东森新闻报道"} -{"key": "BAC009S0916W0429", "wav": "./aishell/wav/test/S0916/BAC009S0916W0429.wav", "txt": "陈晓与陈妍希承认恋情"} -{"key": "BAC009S0916W0430", "wav": "./aishell/wav/test/S0916/BAC009S0916W0430.wav", "txt": "获得粉丝祝福"} -{"key": "BAC009S0916W0431", "wav": "./aishell/wav/test/S0916/BAC009S0916W0431.wav", "txt": "湖北六小伙温州偷硬币称代表诸葛后人战刘伯温后人"} -{"key": "BAC009S0916W0432", "wav": "./aishell/wav/test/S0916/BAC009S0916W0432.wav", "txt": "专偷摇摇车里的硬币"} -{"key": "BAC009S0916W0433", "wav": "./aishell/wav/test/S0916/BAC009S0916W0433.wav", "txt": "运气好时一天能偷几千枚一元硬币"} -{"key": "BAC009S0916W0434", "wav": "./aishell/wav/test/S0916/BAC009S0916W0434.wav", "txt": "湖北六岁女童被继母虐打下阴撕裂警方已介入"} -{"key": "BAC009S0916W0435", "wav": "./aishell/wav/test/S0916/BAC009S0916W0435.wav", "txt": "湖北七人冒充福彩工作人员兜售中奖秘籍骗取三零零万"} -{"key": "BAC009S0916W0436", "wav": "./aishell/wav/test/S0916/BAC009S0916W0436.wav", "txt": "湖北省黄冈市公安局通报称"} -{"key": "BAC009S0916W0437", "wav": "./aishell/wav/test/S0916/BAC009S0916W0437.wav", "txt": "打掉一个以传授彩票中奖秘籍为名的特大电信诈骗团伙"} -{"key": "BAC009S0916W0438", "wav": "./aishell/wav/test/S0916/BAC009S0916W0438.wav", "txt": "破获电信诈骗案二三三起"} -{"key": "BAC009S0916W0439", "wav": "./aishell/wav/test/S0916/BAC009S0916W0439.wav", "txt": "涉案金额三零零多万元"} -{"key": "BAC009S0916W0440", "wav": "./aishell/wav/test/S0916/BAC009S0916W0440.wav", "txt": 
"湖北九岁女童遇害案告破凶手强奸不成推下窗外"} -{"key": "BAC009S0916W0441", "wav": "./aishell/wav/test/S0916/BAC009S0916W0441.wav", "txt": "湖北五道杠少年捐出二万元政府奖学金"} -{"key": "BAC009S0916W0442", "wav": "./aishell/wav/test/S0916/BAC009S0916W0442.wav", "txt": "学校里有些同学家里条件不好"} -{"key": "BAC009S0916W0443", "wav": "./aishell/wav/test/S0916/BAC009S0916W0443.wav", "txt": "但是想让更多需要帮助的水高学子感受到温暖"} -{"key": "BAC009S0916W0444", "wav": "./aishell/wav/test/S0916/BAC009S0916W0444.wav", "txt": "湖北卷人电梯设计不合理同型号已售四六四八部"} -{"key": "BAC009S0916W0445", "wav": "./aishell/wav/test/S0916/BAC009S0916W0445.wav", "txt": "事故电梯仍处于拆解状态"} -{"key": "BAC009S0916W0446", "wav": "./aishell/wav/test/S0916/BAC009S0916W0446.wav", "txt": "新华社记者梁建强摄"} -{"key": "BAC009S0916W0447", "wav": "./aishell/wav/test/S0916/BAC009S0916W0447.wav", "txt": "湖北吃人同型号电梯全国四六四八部分布三一省市"} -{"key": "BAC009S0916W0448", "wav": "./aishell/wav/test/S0916/BAC009S0916W0448.wav", "txt": "安良百货商场正常营业"} -{"key": "BAC009S0916W0449", "wav": "./aishell/wav/test/S0916/BAC009S0916W0449.wav", "txt": "但各楼层的自动扶梯均已关停供图新华"} -{"key": "BAC009S0916W0450", "wav": "./aishell/wav/test/S0916/BAC009S0916W0450.wav", "txt": "湖北吃人电梯品牌四年被曝光五次"} -{"key": "BAC009S0916W0452", "wav": "./aishell/wav/test/S0916/BAC009S0916W0452.wav", "txt": "湖北吞人电梯三月份刚检测合格"} -{"key": "BAC009S0916W0453", "wav": "./aishell/wav/test/S0916/BAC009S0916W0453.wav", "txt": "事故电梯出厂刚满一年"} -{"key": "BAC009S0916W0454", "wav": "./aishell/wav/test/S0916/BAC009S0916W0454.wav", "txt": "今年三月份经检验为合格"} -{"key": "BAC009S0916W0455", "wav": "./aishell/wav/test/S0916/BAC009S0916W0455.wav", "txt": "涉事厂家生产的电梯此前曾发生多起事故"} -{"key": "BAC009S0916W0456", "wav": "./aishell/wav/test/S0916/BAC009S0916W0456.wav", "txt": "目前湖北省质监局已要求全省暂停使用涉事厂家电梯"} -{"key": "BAC009S0916W0457", "wav": "./aishell/wav/test/S0916/BAC009S0916W0457.wav", "txt": "湖北咬人电梯厂家曾为盖板支架申请专利"} -{"key": "BAC009S0916W0458", "wav": "./aishell/wav/test/S0916/BAC009S0916W0458.wav", "txt": "湖北荆州吃人电梯盖板设计不合理供图"} -{"key": "BAC009S0916W0459", "wav": "./aishell/wav/test/S0916/BAC009S0916W0459.wav", "txt": "湖北电梯吃人定性为责任事故"} -{"key": "BAC009S0916W0460", "wav": "./aishell/wav/test/S0916/BAC009S0916W0460.wav", "txt": "看过湖北电梯吃人视频的不少上海年轻人"} -{"key": "BAC009S0916W0461", "wav": "./aishell/wav/test/S0916/BAC009S0916W0461.wav", "txt": "在经过商场自动扶梯时会选择跳过视频中的跳板"} -{"key": "BAC009S0916W0462", "wav": "./aishell/wav/test/S0916/BAC009S0916W0462.wav", "txt": "晨报记者张佳琪晨报讯昨晚九时三零分"} -{"key": "BAC009S0916W0463", "wav": "./aishell/wav/test/S0916/BAC009S0916W0463.wav", "txt": "湖北省荆州市安监局召开安良百货电梯事故情报通报会"} -{"key": "BAC009S0916W0464", "wav": "./aishell/wav/test/S0916/BAC009S0916W0464.wav", "txt": "此次事故调查组组长荆州市安监局局长陈观鑫通报称"} -{"key": "BAC009S0916W0465", "wav": "./aishell/wav/test/S0916/BAC009S0916W0465.wav", "txt": "初步认定这是一起安全生产责任事故"} -{"key": "BAC009S0916W0466", "wav": "./aishell/wav/test/S0916/BAC009S0916W0466.wav", "txt": "湖北电梯吃人调查报告电梯厂商及商场负主责"} -{"key": "BAC009S0916W0467", "wav": "./aishell/wav/test/S0916/BAC009S0916W0467.wav", "txt": "二零一五七二六"} -{"key": "BAC009S0916W0468", "wav": "./aishell/wav/test/S0916/BAC009S0916W0468.wav", "txt": "湖北荆州市安良百货公司事发手扶电梯已被关闭检修"} -{"key": "BAC009S0916W0470", "wav": "./aishell/wav/test/S0916/BAC009S0916W0470.wav", "txt": "申龙电梯和安良百货公司应对事故负主要责任"} -{"key": "BAC009S0916W0471", "wav": "./aishell/wav/test/S0916/BAC009S0916W0471.wav", "txt": "湖北飞踢女居民车道办主任被停职"} -{"key": "BAC009S0916W0472", "wav": "./aishell/wav/test/S0916/BAC009S0916W0472.wav", "txt": "网曝视频截图当街飞踢女群众"} -{"key": "BAC009S0916W0473", "wav": "./aishell/wav/test/S0916/BAC009S0916W0473.wav", "txt": "大喊我一脚方言"} -{"key": "BAC009S0916W0474", "wav": 
"./aishell/wav/test/S0916/BAC009S0916W0474.wav", "txt": "同踹死你的街道办主任"} -{"key": "BAC009S0916W0475", "wav": "./aishell/wav/test/S0916/BAC009S0916W0475.wav", "txt": "湖北一中学教师体罚学生致重伤被判刑三年"} -{"key": "BAC009S0916W0476", "wav": "./aishell/wav/test/S0916/BAC009S0916W0476.wav", "txt": "用右脚踢向董某左腹部"} -{"key": "BAC009S0916W0477", "wav": "./aishell/wav/test/S0916/BAC009S0916W0477.wav", "txt": "董某某所受损伤程度属二重伤二级"} -{"key": "BAC009S0916W0478", "wav": "./aishell/wav/test/S0916/BAC009S0916W0478.wav", "txt": "残疾等级为六级残疾"} -{"key": "BAC009S0916W0479", "wav": "./aishell/wav/test/S0916/BAC009S0916W0479.wav", "txt": "梁某某被一审法院以故意伤害罪判处有期徒刑三年"} -{"key": "BAC009S0916W0480", "wav": "./aishell/wav/test/S0916/BAC009S0916W0480.wav", "txt": "湖北一中学班长失踪坠亡教学楼四小时去向成谜"} -{"key": "BAC009S0916W0481", "wav": "./aishell/wav/test/S0916/BAC009S0916W0481.wav", "txt": "新洲一名高中新生因为没去教室上晚自习"} -{"key": "BAC009S0916W0482", "wav": "./aishell/wav/test/S0916/BAC009S0916W0482.wav", "txt": "老师发现后和学生一起寻找"} -{"key": "BAC009S0916W0483", "wav": "./aishell/wav/test/S0916/BAC009S0916W0483.wav", "txt": "直至晚上一零时左右"} -{"key": "BAC009S0916W0484", "wav": "./aishell/wav/test/S0916/BAC009S0916W0484.wav", "txt": "一名老师才发现学生坠楼摔落在教学楼前"} -{"key": "BAC009S0916W0485", "wav": "./aishell/wav/test/S0916/BAC009S0916W0485.wav", "txt": "今日二二日晨凌晨"} -{"key": "BAC009S0916W0486", "wav": "./aishell/wav/test/S0916/BAC009S0916W0486.wav", "txt": "这名一五岁的花季少年最终送医救治无效死亡"} -{"key": "BAC009S0916W0487", "wav": "./aishell/wav/test/S0916/BAC009S0916W0487.wav", "txt": "湖北一传销头目归国投案骗取群众资金数亿元"} -{"key": "BAC009S0916W0488", "wav": "./aishell/wav/test/S0916/BAC009S0916W0488.wav", "txt": "湖北一骗取群众资金数亿元的传销头目近日归国投案"} -{"key": "BAC009S0916W0489", "wav": "./aishell/wav/test/S0916/BAC009S0916W0489.wav", "txt": "湖北一公司以员工名义贷款数十员工负债千万"} -{"key": "BAC009S0916W0490", "wav": "./aishell/wav/test/S0916/BAC009S0916W0490.wav", "txt": "阳逻一家公司以数十名员工的名义"} -{"key": "BAC009S0916W0491", "wav": "./aishell/wav/test/S0916/BAC009S0916W0491.wav", "txt": "向一家金融公司贷款一千多万元"} -{"key": "BAC009S0916W0492", "wav": "./aishell/wav/test/S0916/BAC009S0916W0492.wav", "txt": "公司承诺贷款本息都由公司负责偿还"} -{"key": "BAC009S0916W0493", "wav": "./aishell/wav/test/S0916/BAC009S0916W0493.wav", "txt": "公司却遇到了资金困难"} -{"key": "BAC009S0916W0494", "wav": "./aishell/wav/test/S0916/BAC009S0916W0494.wav", "txt": "存在无法如期还贷的风险"} -{"key": "BAC009S0916W0495", "wav": "./aishell/wav/test/S0916/BAC009S0916W0495.wav", "txt": "这令被贷款的员工们寝食难安"} diff --git a/models/audio/speech_recognition/conformer/igie/inference.py b/models/audio/speech_recognition/conformer/igie/inference.py deleted file mode 100644 index d0583eeebb82f13966d3a8363f6a2d45f95742dd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/inference.py +++ /dev/null @@ -1,216 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -logging.basicConfig(level=logging.INFO, format = '[%(asctime)s %(filename)s line:%(lineno)d] %(levelname)s: %(message)s') -logging.getLogger('autotvm').setLevel(logging.ERROR) -logging.getLogger('strategy').setLevel(logging.ERROR) -logging.getLogger('te_compiler').setLevel(logging.ERROR) - -import sys - -from pprint import pprint -import numpy as np -import torch -from torch.utils.data import DataLoader -import yaml -import multiprocessing -import tvm -from tvm import relay -from tvm.contrib import graph_executor -import compute_cer - -from wenet.dataset.dataset import Dataset -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.config import override_config -try: - from swig_decoders import map_batch -except ImportError: - print('Please install ctc decoders first by refering to\n' + - 'https://github.com/Slyne/ctc_decoder.git') - sys.exit(1) - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--engine', required=True, help='igie engine path.') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', default='raw', choices=['raw', 'shard'], help='train and cv data type') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--encoder', required=False, help='encoder magicmind model') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--label', required=True, help='label file path') - parser.add_argument('--batch_size', type=int, default=1, help='inference batch size.') - parser.add_argument('--seq_len', type=int, default=384, help='inference seq length.') - parser.add_argument("--input_name", - type=str, - nargs="+", - required=True, - help="input name of the model.") - parser.add_argument('--mode', - choices=[ - 'ctc_greedy_search', - 'ctc_prefix_beam_search', - 'attention_rescoring' - ], - default='attention_rescoring', - help='decoding mode') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument("--num_workers", - type=int, - default=16, - help="number of workers used in pytorch dataloader.") - parser.add_argument("--warmup", - type=int, - default=3, - help="number of warmup before test.") - parser.add_argument('--fps_target', - type=float, - default=0.0) - parser.add_argument('--acc_target', - type=float, - default=0.0) - - parser.add_argument("--perf_only", - type=bool, - default=False, - help="Run performance test only") - - args = parser.parse_args() - return args - -def main(): - args = get_args() - pprint(vars(args), indent=2) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - 
test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['shuffle'] = False - test_conf['sort'] = True - test_conf['fbank_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=args.num_workers) - - # Load dict - vocabulary = [] - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - vocabulary.append(arr[0]) - - - target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") - device = tvm.device(target.kind.name, 0) - - lib = tvm.runtime.load_module(args.engine) - module = tvm.contrib.graph_executor.GraphModule(lib["default"](device)) - - if args.perf_only: - ftimer = module.module.time_evaluator("run", device, number=100, repeat=1) - prof_res = np.array(ftimer().results) * 1000 - fps = args.batch_size * 1000 / np.mean(prof_res) - print(f"\n* Mean inference time: {np.mean(prof_res):.3f} ms, Mean fps: {fps:.3f}") - else: - # warm up - for _ in range(args.warmup): - module.run() - - with open(args.result_file, 'w') as fout: - for _, batch in enumerate(test_data_loader): - keys, feats, label, feats_lengths, label_lengths = batch - feats, feats_lengths = feats.numpy(), feats_lengths.numpy() - seq_len = feats.shape[1] - if seq_len > args.seq_len: - continue - - if feats.shape[0] == args.batch_size: - - speech_data = tvm.nd.array(feats, device) - speech_lengths_data = tvm.nd.array([feats_lengths], device) - module.set_input("speech", speech_data) - module.set_input("speech_lengths", speech_lengths_data) - - module.run() - - encoder_out, encoder_out_lens, ctc_log_probs = module.get_output(0).asnumpy(), module.get_output(1).asnumpy(), module.get_output(2).asnumpy() - - preds = torch.from_numpy(ctc_log_probs) - beam_log_probs, beam_log_probs_idx = torch.topk(preds, k=4, dim=2) - - encoder_out = np.array(encoder_out, dtype="float32") - encoder_out_lens = np.array(encoder_out_lens, dtype="int32") - ctc_log_probs = np.array(ctc_log_probs, dtype="float32") - beam_log_probs = np.array(beam_log_probs, dtype="float32") - beam_log_probs_idx = np.array(beam_log_probs_idx, dtype="int64") - - beam_size = beam_log_probs.shape[-1] - batch_size = beam_log_probs.shape[0] - num_processes = min(multiprocessing.cpu_count(), batch_size) - if args.mode == 'ctc_greedy_search': - if beam_size != 1: - log_probs_idx = beam_log_probs_idx[:, :, 0] - batch_sents = [] - for idx, seq in enumerate(log_probs_idx): - batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) - - hyps = map_batch(batch_sents, vocabulary, num_processes, True, 0) - - for i, key in enumerate(keys): - content = hyps[i] - fout.write('{} {}\n'.format(key, content)) - - Acc = compute_cer.get_acc(args.label, args.result_file) - metricResult = {"metricResult": {"Accuracy": f"{Acc}%"}} - print(metricResult) - print(f"* Accuracy: {Acc} %") - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/ixrt_inference_accuracy.py b/models/audio/speech_recognition/conformer/igie/ixrt_inference_accuracy.py new file mode 100644 index 0000000000000000000000000000000000000000..7e5ec2b116d9368893cf5c9357f630176cf9510e --- /dev/null +++ b/models/audio/speech_recognition/conformer/igie/ixrt_inference_accuracy.py @@ 
-0,0 +1,253 @@ +# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys + +sys.path.append(os.path.dirname(os.path.dirname(__file__))) + +import argparse +import yaml +import copy +import numpy as np + +from tqdm.contrib import tqdm +from torch.utils.data import DataLoader +from wenet.file_utils import read_symbol_table +from wenet.dataset import Dataset +from tools.compute_cer import Calculator, characterize, normalize, default_cluster +import tensorrt +from tensorrt import Dims +from common import create_engine_context, get_io_bindings,trtapi,setup_io_bindings +import pickle + +import cuda.cuda as cuda +import cuda.cudart as cudart + +from load_ixrt_plugin import load_ixrt_plugin +load_ixrt_plugin() + +import tvm +from tvm import relay +from tvm.contrib import graph_executor + +def get_args(): + parser = argparse.ArgumentParser(description="recognize with your model") + parser.add_argument( + "--infer_type", + default="fp16", + choices=["fp16", "int8"], + help="inference type: fp16 or int8", + ) + parser.add_argument("--warm_up", type=int, default=3, help="warm_up count") + parser.add_argument("--batch_size", type=int, default=24) + parser.add_argument("--data_dir", required=True, help="test data directory") + parser.add_argument( + "--model_dir", type=str, required=True, help="model for inference" + ) + args = parser.parse_args() + return args + + + +def ixrt_infer(module, input, seq_lengths): + module.set_input(key="input", value=input) + module.set_input(key="seq_lengths", value=seq_lengths) + module.run() + out = module.get_output() + return out[0] + + +def tensorrt_infer(engine,context, features, lengths): + + input_names=["input","seq_lengths"] + output_names=["output"] + input_idx = engine.get_binding_index(input_names[0]) + input_shape = features.shape + context.set_binding_shape(input_idx, Dims(input_shape)) + + seq_lengths_idx = engine.get_binding_index(input_names[1]) + seq_lengths_shape = lengths.shape + context.set_binding_shape(seq_lengths_idx, Dims(seq_lengths_shape)) + + inputs, outputs, allocations = setup_io_bindings(engine, context) + pred_output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) + err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], features, features.nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuMemcpyHtoD(inputs[1]["allocation"], lengths, lengths.nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) + context.execute_v2(allocations) + err, = cuda.cuMemcpyDtoH(pred_output, outputs[0]["allocation"], outputs[0]["nbytes"]) + assert(err == cuda.CUresult.CUDA_SUCCESS) + return pred_output + + +def engine_init(engine): + host_mem = tensorrt.IHostMemory + logger = tensorrt.Logger(tensorrt.Logger.ERROR) + engine, context = create_engine_context(engine, logger) + + return engine,context + + +def igie_infer(module, features, seq_lengths): + module.set_input("input", features) + module.set_input("seq_lengths", seq_lengths) + module.run() + out = 
module.get_output(0) + return out + +def igie_engine_init(engine_path): + device = tvm.device("iluvatar", 0) + lib = tvm.runtime.load_module(engine_path) + module = graph_executor.GraphModule(lib["default"](device)) + # engine, context = module.engine, module.context + return module + + + +def calculate_cer(data, reference_data): + calculator = Calculator() + tochar = True + split = None + case_sensitive = False + ignore_words = set() + rec_set = {} + for line in data: + if tochar: + array = characterize(line) + else: + array = line.strip().split() + if len(array) == 0: + continue + fid = array[0] + rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) + + default_clusters = {} + default_words = {} + for line in reference_data: + if tochar: + array = characterize(line) + else: + array = line.strip().split() + if len(array) == 0: + continue + fid = array[0] + if fid not in rec_set: + continue + lab = normalize(array[1:], ignore_words, case_sensitive, split) + rec = rec_set[fid] + + for word in rec + lab: + if word not in default_words: + default_cluster_name = default_cluster(word) + if default_cluster_name not in default_clusters: + default_clusters[default_cluster_name] = {} + if word not in default_clusters[default_cluster_name]: + default_clusters[default_cluster_name][word] = 1 + default_words[word] = default_cluster_name + result = calculator.calculate(lab, rec) + + result = calculator.overall() + cer = float(result["ins"] + result["sub"] + result["del"]) / result["all"] + corr = result["cor"] / result["all"] + + return cer, corr + + +def main(): + args = get_args() + + # 读取配置文件 + config_fn = os.path.join(args.model_dir, "config.yaml") + with open(config_fn, "r") as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + + dataset_conf = copy.deepcopy(configs["dataset_conf"]) + dataset_conf["filter_conf"]["max_length"] = 102400 + dataset_conf["filter_conf"]["min_length"] = 0 + dataset_conf["filter_conf"]["token_max_length"] = 102400 + dataset_conf["filter_conf"]["token_min_length"] = 0 + dataset_conf["filter_conf"]["max_output_input_ratio"] = 102400 + dataset_conf["filter_conf"]["min_output_input_ratio"] = 0 + dataset_conf["speed_perturb"] = False + dataset_conf["spec_aug"] = False + dataset_conf["shuffle"] = False + dataset_conf["sort"] = True + dataset_conf["fbank_conf"]["dither"] = 0.0 + dataset_conf["batch_conf"]["batch_type"] = "static" + dataset_conf["batch_conf"]["batch_size"] = args.batch_size + + # Load dict + dict_fn = os.path.join(args.model_dir, "words.txt") + char_dict = {} + with open(dict_fn, "r", encoding="utf8") as fin: + for line in fin: + arr = line.strip().split() + assert len(arr) == 2 + char_dict[int(arr[1])] = arr[0] + eos = len(char_dict) - 1 + + print("*** 1. Prepare data ***") + data_type = "raw" + test_data_fn = os.path.join(args.data_dir, "data.list") + symbol_table = read_symbol_table(dict_fn) + test_dataset = Dataset( + data_type, test_data_fn, symbol_table, dataset_conf, partition=False + ) + test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) + + print("*** 2. Load engine ***") + engine_path = os.path.join(args.model_dir, f"conformer_{args.infer_type}_trt.engine") + module = igie_engine_init(engine_path) + + print("*** 3. 
Warm up ***") + if args.warm_up > 0: + for i in range(args.warm_up): + module.run() + + results = [] + for batch in test_data_loader: + keys, feats, target, feats_lengths, target_lengths = batch + feats = feats.cpu().numpy().astype(np.float16) + feats_lengths = feats_lengths.cpu().numpy().astype(np.int32) + hyps = igie_infer(module, feats, feats_lengths) + for i, key in enumerate(keys): + line = f"{key} " + for w in hyps[i]: + if w == eos: + break + line += char_dict[w] + results.append(line) + + # 3. 计算 CER + reference_file = os.path.join(args.data_dir, "text") + reference_data = [] + for line in open(reference_file, "r", encoding="utf-8"): + reference_data.append(line) + + cer, corr = calculate_cer(results, reference_data) + + target_cer = float(os.environ["Accuracy"]) + print("CER: ", cer, "target CER: ", target_cer) + if cer <= target_cer: + print("pass!") + exit() + else: + print("failed!") + exit(1) + + +if __name__ == "__main__": + main() diff --git a/models/audio/speech_recognition/conformer/igie/ixrt_inference_performance.py b/models/audio/speech_recognition/conformer/igie/ixrt_inference_performance.py new file mode 100644 index 0000000000000000000000000000000000000000..3f871d9488fb21c208ccd53fccbbc5b523e5eb6d --- /dev/null +++ b/models/audio/speech_recognition/conformer/igie/ixrt_inference_performance.py @@ -0,0 +1,190 @@ +# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys +import tvm + +sys.path.append(os.path.dirname(os.path.dirname(__file__))) + +import yaml +import time +import copy +import argparse +import pickle +import numpy as np + +from tqdm.contrib import tqdm +from torch.utils.data import DataLoader + +from wenet.file_utils import read_symbol_table +from wenet.dataset import Dataset + +import tensorrt +from tensorrt import Dims +from common import create_engine_context, get_io_bindings,trtapi,setup_io_bindings +import pickle + +import cuda.cuda as cuda +import cuda.cudart as cudart + +from load_ixrt_plugin import load_ixrt_plugin +load_ixrt_plugin() + +import tvm +from tvm import relay +from tvm.contrib import graph_executor + +def get_args(): + parser = argparse.ArgumentParser(description="recognize with your model") + parser.add_argument( + "--infer_type", + default="fp16", + choices=["fp16", "int8"], + help="inference type: fp16 or int8", + ) + parser.add_argument("--warm_up", type=int, default=3, help="warm_up count") + parser.add_argument("--batch_size", type=int, default=24) + parser.add_argument("--data_dir", required=True, help="test data directory") + parser.add_argument( + "--model_dir", type=str, required=True, help="model for inference" + ) + args = parser.parse_args() + return args + +def engine_init(engine): + host_mem = tensorrt.IHostMemory + logger = tensorrt.Logger(tensorrt.Logger.ERROR) + engine, context = create_engine_context(engine, logger) + + return engine,context + +def tensorrt_infer(engine,context, features, lengths): + + input_names=["input","seq_lengths"] + output_names=["output"] + input_idx = engine.get_binding_index(input_names[0]) + input_shape = features.shape + context.set_binding_shape(input_idx, Dims(input_shape)) + + seq_lengths_idx = engine.get_binding_index(input_names[1]) + seq_lengths_shape = lengths.shape + context.set_binding_shape(seq_lengths_idx, Dims(seq_lengths_shape)) + + inputs, outputs, allocations = setup_io_bindings(engine, context) + pred_output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) + err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], features, features.nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuMemcpyHtoD(inputs[1]["allocation"], lengths, lengths.nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) + context.execute_v2(allocations) + err, = cuda.cuMemcpyDtoH(pred_output, outputs[0]["allocation"], outputs[0]["nbytes"]) + assert(err == cuda.CUresult.CUDA_SUCCESS) + return pred_output + +def igie_infer(module, features, seq_lengths): + start_time = time.time() + module.set_input("input", features) + module.set_input("seq_lengths", seq_lengths) + module.run() + out = module.get_output(0) + eval_time = time.time() - start_time + return out, eval_time + +def igie_engine_init(engine_path): + device = tvm.device("iluvatar", 0) + lib = tvm.runtime.load_module(engine_path) + module = graph_executor.GraphModule(lib["default"](device)) + # engine, context = module.engine, module.context + return module + +def main(): + args = get_args() + + # 读取配置文件 + config_fn = os.path.join(args.model_dir, "config.yaml") + with open(config_fn, "r") as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + + dataset_conf = copy.deepcopy(configs["dataset_conf"]) + dataset_conf["filter_conf"]["max_length"] = 102400 + dataset_conf["filter_conf"]["min_length"] = 0 + dataset_conf["filter_conf"]["token_max_length"] = 102400 + dataset_conf["filter_conf"]["token_min_length"] = 0 + dataset_conf["filter_conf"]["max_output_input_ratio"] = 102400 + 
dataset_conf["filter_conf"]["min_output_input_ratio"] = 0 + dataset_conf["speed_perturb"] = False + dataset_conf["spec_aug"] = False + dataset_conf["shuffle"] = False + dataset_conf["sort"] = True + dataset_conf["fbank_conf"]["dither"] = 0.0 + dataset_conf["batch_conf"]["batch_type"] = "static" + dataset_conf["batch_conf"]["batch_size"] = args.batch_size + + # Load dict + dict_fn = os.path.join(args.model_dir, "words.txt") + char_dict = {} + with open(dict_fn, "r", encoding="utf8") as fin: + for line in fin: + arr = line.strip().split() + assert len(arr) == 2 + char_dict[int(arr[1])] = arr[0] + + print("*** 1. Prepare data ***") + data_type = "raw" + test_data_fn = os.path.join(args.data_dir, "data.list") + symbol_table = read_symbol_table(dict_fn) + test_dataset = Dataset( + data_type, test_data_fn, symbol_table, dataset_conf, partition=False + ) + + test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) + + print("*** 2. Load IxRT engine ***") + engine_path = os.path.join(args.model_dir, f"conformer_{args.infer_type}_trt.engine") + # engine, context = engine_init(engine_path) + module = igie_engine_init(engine_path) + + print("*** 3. Warm up ***") + if args.warm_up > 0: + for i in range(args.warm_up): + module.run() + + print("*** 4. Inference ***") + num_samples = 0 + results = [] + eval_time = 0.0 + for batch in test_data_loader: + keys, feats, target, feats_lengths, target_lengths = batch + feats = feats.cpu().numpy().astype(np.float16) + feats_lengths = feats_lengths.cpu().numpy().astype(np.int32) + num_samples += feats.shape[0] + hyps, batch_eval_time = igie_infer(module, feats, feats_lengths) + results.append([hyps, keys]) + eval_time += batch_eval_time + + QPS = num_samples / eval_time + print(f"Recognize {num_samples} sentences, {QPS} sentences/s") + target_qps = float(os.environ["Accuracy"]) + print("QPS: = ", QPS, "target QPS: ", target_qps) + if QPS >= target_qps: + print("pass!") + exit() + else: + print("failed!") + exit(10) + + +if __name__ == "__main__": + main() diff --git a/models/audio/speech_recognition/conformer/igie/lang_char.txt b/models/audio/speech_recognition/conformer/igie/lang_char.txt deleted file mode 100644 index 9e63f9ec45cc9aa44fcfb5c3e8125beb6ff9b075..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/lang_char.txt +++ /dev/null @@ -1,4233 +0,0 @@ - 0 - 1 -一 2 -丁 3 -七 4 -万 5 -丈 6 -三 7 -上 8 -下 9 -不 10 -与 11 -丐 12 -丑 13 -专 14 -且 15 -世 16 -丘 17 -丙 18 -业 19 -丛 20 -东 21 -丝 22 -丞 23 -丢 24 -两 25 -严 26 -丧 27 -个 28 -丫 29 -中 30 -丰 31 -串 32 -临 33 -丸 34 -丹 35 -为 36 -主 37 -丽 38 -举 39 -乃 40 -久 41 -么 42 -义 43 -之 44 -乌 45 -乍 46 -乎 47 -乏 48 -乐 49 -乒 50 -乓 51 -乔 52 -乖 53 -乘 54 -乙 55 -九 56 -乞 57 -也 58 -习 59 -乡 60 -书 61 -买 62 -乱 63 -乳 64 -乾 65 -了 66 -予 67 -争 68 -事 69 -二 70 -于 71 -亏 72 -云 73 -互 74 -五 75 -井 76 -亚 77 -些 78 -亟 79 -亡 80 -亢 81 -交 82 -亥 83 -亦 84 -产 85 -亨 86 -亩 87 -享 88 -京 89 -亭 90 -亮 91 -亲 92 -亳 93 -亵 94 -人 95 -亿 96 -什 97 -仁 98 -仄 99 -仅 100 -仇 101 -今 102 -介 103 -仍 104 -从 105 -仑 106 -仓 107 -仔 108 -仕 109 -他 110 -仗 111 -付 112 -仙 113 -仡 114 -代 115 -令 116 -以 117 -仨 118 -仪 119 -们 120 -仰 121 -仲 122 -件 123 -价 124 -任 125 -份 126 -仿 127 -企 128 -伉 129 -伊 130 -伍 131 -伎 132 -伏 133 -伐 134 -休 135 -众 136 -优 137 -伙 138 -会 139 -伞 140 -伟 141 -传 142 -伢 143 -伤 144 -伦 145 -伪 146 -伯 147 -估 148 -伴 149 -伶 150 -伸 151 -伺 152 -似 153 -伽 154 -佃 155 -但 156 -位 157 -低 158 -住 159 -佐 160 -佑 161 -体 162 -何 163 -佘 164 -余 165 -佛 166 -作 167 -佟 168 -你 169 -佣 170 -佩 171 -佬 172 -佳 173 -佶 174 -佼 175 -使 176 -侃 177 -侄 178 -侈 179 -例 180 -侍 181 -侑 
182 -侗 183 -供 184 -依 185 -侠 186 -侣 187 -侥 188 -侦 189 -侧 190 -侨 191 -侬 192 -侮 193 -侯 194 -侵 195 -便 196 -促 197 -俄 198 -俊 199 -俏 200 -俐 201 -俗 202 -俘 203 -俚 204 -保 205 -俞 206 -信 207 -俨 208 -俩 209 -俪 210 -俭 211 -修 212 -俯 213 -俱 214 -俸 215 -俺 216 -俾 217 -倍 218 -倒 219 -倘 220 -候 221 -倚 222 -倜 223 -借 224 -倡 225 -倦 226 -倩 227 -倪 228 -债 229 -值 230 -倾 231 -假 232 -偏 233 -做 234 -停 235 -健 236 -偶 237 -偷 238 -偿 239 -傅 240 -傍 241 -傥 242 -储 243 -催 244 -傲 245 -傻 246 -像 247 -僚 248 -僧 249 -僮 250 -僵 251 -僻 252 -儒 253 -儿 254 -兀 255 -允 256 -元 257 -兄 258 -充 259 -兆 260 -先 261 -光 262 -克 263 -免 264 -兑 265 -兔 266 -兖 267 -党 268 -兜 269 -兢 270 -入 271 -全 272 -八 273 -公 274 -六 275 -兰 276 -共 277 -关 278 -兴 279 -兵 280 -其 281 -具 282 -典 283 -兹 284 -养 285 -兼 286 -兽 287 -冀 288 -内 289 -冈 290 -冉 291 -册 292 -再 293 -冒 294 -冕 295 -写 296 -军 297 -农 298 -冠 299 -冤 300 -冥 301 -冬 302 -冯 303 -冰 304 -冲 305 -决 306 -况 307 -冶 308 -冷 309 -冻 310 -净 311 -凄 312 -准 313 -凇 314 -凉 315 -凋 316 -凌 317 -减 318 -凑 319 -凝 320 -几 321 -凡 322 -凤 323 -凭 324 -凯 325 -凰 326 -凳 327 -凶 328 -凸 329 -凹 330 -出 331 -击 332 -函 333 -凿 334 -刀 335 -刁 336 -刃 337 -分 338 -切 339 -刊 340 -刑 341 -划 342 -列 343 -刘 344 -则 345 -刚 346 -创 347 -初 348 -删 349 -判 350 -刨 351 -利 352 -别 353 -刮 354 -到 355 -制 356 -刷 357 -券 358 -刹 359 -刺 360 -刻 361 -剁 362 -剂 363 -剃 364 -削 365 -前 366 -剐 367 -剑 368 -剔 369 -剖 370 -剥 371 -剧 372 -剩 373 -剪 374 -副 375 -割 376 -剽 377 -剿 378 -劈 379 -力 380 -劝 381 -办 382 -功 383 -加 384 -务 385 -劣 386 -动 387 -助 388 -努 389 -劫 390 -励 391 -劲 392 -劳 393 -劵 394 -势 395 -勃 396 -勇 397 -勉 398 -勋 399 -勒 400 -勘 401 -募 402 -勤 403 -勺 404 -勾 405 -勿 406 -匀 407 -包 408 -匆 409 -匈 410 -匕 411 -化 412 -北 413 -匙 414 -匝 415 -匠 416 -匡 417 -匣 418 -匪 419 -匮 420 -匹 421 -区 422 -医 423 -匾 424 -匿 425 -十 426 -千 427 -升 428 -午 429 -卉 430 -半 431 -华 432 -协 433 -卑 434 -卒 435 -卓 436 -单 437 -卖 438 -南 439 -博 440 -卜 441 -卞 442 -占 443 -卡 444 -卢 445 -卤 446 -卦 447 -卧 448 -卫 449 -卯 450 -印 451 -危 452 -卲 453 -即 454 -却 455 -卵 456 -卷 457 -卸 458 -卿 459 -厂 460 -厄 461 -厅 462 -历 463 -厉 464 -压 465 -厌 466 -厕 467 -厘 468 -厚 469 -原 470 -厢 471 -厥 472 -厦 473 -厨 474 -厩 475 -厮 476 -去 477 -县 478 -参 479 -又 480 -叉 481 -及 482 -友 483 -双 484 -反 485 -发 486 -叔 487 -取 488 -受 489 -变 490 -叙 491 -叛 492 -叠 493 -口 494 -古 495 -句 496 -另 497 -叨 498 -叩 499 -只 500 -叫 501 -召 502 -叭 503 -叮 504 -可 505 -台 506 -叱 507 -史 508 -右 509 -叵 510 -叶 511 -号 512 -司 513 -叹 514 -叼 515 -吁 516 -吃 517 -各 518 -吆 519 -合 520 -吉 521 -吊 522 -同 523 -名 524 -后 525 -吏 526 -吐 527 -向 528 -吓 529 -吕 530 -吗 531 -君 532 -吝 533 -吞 534 -吟 535 -否 536 -吧 537 -吨 538 -吩 539 -含 540 -听 541 -吭 542 -启 543 -吴 544 -吵 545 -吸 546 -吹 547 -吻 548 -吼 549 -吾 550 -吿 551 -呀 552 -呃 553 -呆 554 -呈 555 -告 556 -呐 557 -呕 558 -呗 559 -员 560 -呛 561 -呜 562 -呢 563 -呦 564 -周 565 -呲 566 -味 567 -呵 568 -呼 569 -命 570 -咀 571 -咄 572 -咋 573 -和 574 -咎 575 -咏 576 -咐 577 -咒 578 -咔 579 -咕 580 -咖 581 -咚 582 -咣 583 -咤 584 -咧 585 -咨 586 -咪 587 -咫 588 -咬 589 -咯 590 -咱 591 -咳 592 -咸 593 -咽 594 -哀 595 -品 596 -哄 597 -哆 598 -哇 599 -哈 600 -哉 601 -响 602 -哎 603 -哑 604 -哒 605 -哗 606 -哟 607 -哥 608 -哦 609 -哨 610 -哪 611 -哭 612 -哲 613 -哺 614 -哼 615 -哽 616 -唁 617 -唇 618 -唉 619 -唏 620 -唐 621 -唠 622 -唤 623 -唬 624 -售 625 -唯 626 -唱 627 -唾 628 -啃 629 -商 630 -啊 631 -啕 632 -啡 633 -啤 634 -啥 635 -啦 636 -啧 637 -啪 638 -啬 639 -啰 640 -啵 641 -啶 642 -啸 643 -啼 644 -喀 645 -喂 646 -善 647 -喆 648 -喇 649 -喉 650 -喊 651 -喔 652 -喘 653 -喜 654 -喝 655 -喧 656 -喱 657 -喵 658 -喷 659 -喻 660 -喽 661 -嗅 662 -嗑 663 -嗒 664 -嗓 665 -嗡 666 -嗣 667 -嗤 668 -嗦 669 -嗨 670 -嗬 671 -嗯 672 -嗲 673 -嗷 674 -嗽 675 -嘀 676 -嘉 677 -嘎 678 -嘘 679 -嘛 680 -嘟 681 -嘭 682 -嘱 683 -嘲 684 -嘴 685 -嘻 686 -噎 687 -器 688 -噩 689 
-噪 690 -噬 691 -噱 692 -噼 693 -嚎 694 -嚏 695 -嚓 696 -嚣 697 -嚷 698 -嚼 699 -囊 700 -囚 701 -四 702 -回 703 -因 704 -团 705 -囤 706 -囧 707 -园 708 -困 709 -围 710 -固 711 -国 712 -图 713 -圆 714 -圈 715 -土 716 -圣 717 -在 718 -圩 719 -圪 720 -圭 721 -地 722 -圳 723 -场 724 -圾 725 -址 726 -坂 727 -均 728 -坊 729 -坍 730 -坎 731 -坏 732 -坐 733 -坑 734 -块 735 -坚 736 -坛 737 -坝 738 -坞 739 -坟 740 -坠 741 -坡 742 -坤 743 -坦 744 -坪 745 -坯 746 -坷 747 -垂 748 -垃 749 -垄 750 -垅 751 -型 752 -垌 753 -垒 754 -垛 755 -垢 756 -垣 757 -垤 758 -垦 759 -垫 760 -垮 761 -埃 762 -埋 763 -城 764 -埔 765 -埜 766 -域 767 -培 768 -基 769 -堂 770 -堆 771 -堕 772 -堡 773 -堤 774 -堪 775 -堰 776 -堵 777 -塌 778 -塑 779 -塔 780 -塘 781 -塞 782 -填 783 -塬 784 -塾 785 -境 786 -墅 787 -墓 788 -墙 789 -增 790 -墟 791 -墨 792 -墩 793 -壁 794 -壑 795 -壕 796 -壤 797 -士 798 -壮 799 -声 800 -壳 801 -壶 802 -壹 803 -处 804 -备 805 -复 806 -夏 807 -夕 808 -外 809 -夙 810 -多 811 -夜 812 -够 813 -大 814 -天 815 -太 816 -夫 817 -夭 818 -央 819 -夯 820 -失 821 -头 822 -夷 823 -夸 824 -夹 825 -夺 826 -奂 827 -奇 828 -奈 829 -奉 830 -奋 831 -奎 832 -奏 833 -契 834 -奔 835 -奕 836 -奖 837 -套 838 -奘 839 -奚 840 -奠 841 -奢 842 -奥 843 -女 844 -奴 845 -奶 846 -奸 847 -她 848 -好 849 -如 850 -妃 851 -妄 852 -妆 853 -妇 854 -妈 855 -妊 856 -妍 857 -妒 858 -妖 859 -妙 860 -妞 861 -妤 862 -妥 863 -妧 864 -妨 865 -妩 866 -妮 867 -妯 868 -妹 869 -妻 870 -姆 871 -姊 872 -始 873 -姐 874 -姑 875 -姓 876 -委 877 -姗 878 -姚 879 -姜 880 -姝 881 -姣 882 -姥 883 -姨 884 -姬 885 -姻 886 -姿 887 -威 888 -娃 889 -娄 890 -娅 891 -娇 892 -娌 893 -娘 894 -娜 895 -娟 896 -娠 897 -娥 898 -娩 899 -娱 900 -娴 901 -娶 902 -娼 903 -婀 904 -婆 905 -婉 906 -婕 907 -婚 908 -婧 909 -婪 910 -婴 911 -婵 912 -婶 913 -婷 914 -婿 915 -媒 916 -媚 917 -媛 918 -媞 919 -媲 920 -媳 921 -嫁 922 -嫂 923 -嫉 924 -嫌 925 -嫔 926 -嫖 927 -嫚 928 -嫣 929 -嫦 930 -嫩 931 -嬉 932 -嬛 933 -嬷 934 -孀 935 -子 936 -孔 937 -孕 938 -字 939 -存 940 -孙 941 -孚 942 -孜 943 -孝 944 -孟 945 -孢 946 -季 947 -孤 948 -学 949 -孩 950 -孪 951 -孰 952 -孱 953 -孵 954 -孺 955 -宁 956 -它 957 -宅 958 -宇 959 -守 960 -安 961 -宋 962 -完 963 -宏 964 -宓 965 -宕 966 -宗 967 -官 968 -宙 969 -定 970 -宛 971 -宜 972 -宝 973 -实 974 -宠 975 -审 976 -客 977 -宣 978 -室 979 -宦 980 -宪 981 -宫 982 -宰 983 -害 984 -宴 985 -宵 986 -家 987 -宸 988 -容 989 -宽 990 -宾 991 -宿 992 -寂 993 -寄 994 -寅 995 -密 996 -寇 997 -富 998 -寐 999 -寒 1000 -寓 1001 -寝 1002 -寞 1003 -察 1004 -寡 1005 -寥 1006 -寨 1007 -寮 1008 -寰 1009 -寸 1010 -对 1011 -寺 1012 -寻 1013 -导 1014 -寿 1015 -封 1016 -射 1017 -将 1018 -尊 1019 -小 1020 -少 1021 -尔 1022 -尖 1023 -尘 1024 -尚 1025 -尝 1026 -尤 1027 -尧 1028 -尬 1029 -就 1030 -尴 1031 -尸 1032 -尹 1033 -尺 1034 -尼 1035 -尽 1036 -尾 1037 -尿 1038 -局 1039 -屁 1040 -层 1041 -居 1042 -屈 1043 -届 1044 -屋 1045 -屌 1046 -屎 1047 -屏 1048 -屑 1049 -展 1050 -属 1051 -屠 1052 -屡 1053 -履 1054 -屯 1055 -山 1056 -屹 1057 -屿 1058 -岁 1059 -岂 1060 -岌 1061 -岐 1062 -岔 1063 -岖 1064 -岗 1065 -岚 1066 -岛 1067 -岩 1068 -岬 1069 -岭 1070 -岱 1071 -岳 1072 -岷 1073 -岸 1074 -峁 1075 -峙 1076 -峡 1077 -峥 1078 -峨 1079 -峪 1080 -峭 1081 -峰 1082 -峻 1083 -崂 1084 -崃 1085 -崇 1086 -崎 1087 -崔 1088 -崖 1089 -崛 1090 -崧 1091 -崩 1092 -崭 1093 -崴 1094 -嵋 1095 -嵌 1096 -嵘 1097 -嵛 1098 -嵩 1099 -嶝 1100 -巅 1101 -巍 1102 -川 1103 -州 1104 -巡 1105 -巢 1106 -工 1107 -左 1108 -巧 1109 -巨 1110 -巩 1111 -巫 1112 -差 1113 -己 1114 -已 1115 -巴 1116 -巷 1117 -巾 1118 -巿 1119 -币 1120 -市 1121 -布 1122 -帅 1123 -帆 1124 -师 1125 -希 1126 -帐 1127 -帕 1128 -帖 1129 -帘 1130 -帚 1131 -帜 1132 -帝 1133 -带 1134 -席 1135 -帮 1136 -帷 1137 -常 1138 -帼 1139 -帽 1140 -幂 1141 -幄 1142 -幅 1143 -幌 1144 -幕 1145 -幢 1146 -干 1147 -平 1148 -年 1149 -并 1150 -幸 1151 -幺 1152 -幻 1153 -幼 1154 -幽 1155 -广 1156 -庄 1157 -庆 1158 -庇 1159 -床 1160 -序 1161 -庐 1162 -库 1163 -应 1164 -底 1165 -店 1166 -庙 1167 -庚 1168 -府 1169 -庞 1170 -废 1171 -度 1172 
-座 1173 -庭 1174 -庵 1175 -康 1176 -庸 1177 -庾 1178 -廉 1179 -廊 1180 -廓 1181 -廖 1182 -延 1183 -廷 1184 -建 1185 -开 1186 -异 1187 -弃 1188 -弄 1189 -弈 1190 -弊 1191 -式 1192 -弓 1193 -引 1194 -弗 1195 -弘 1196 -弛 1197 -弟 1198 -张 1199 -弥 1200 -弦 1201 -弧 1202 -弩 1203 -弯 1204 -弱 1205 -弹 1206 -强 1207 -归 1208 -当 1209 -录 1210 -彝 1211 -形 1212 -彤 1213 -彦 1214 -彩 1215 -彪 1216 -彬 1217 -彭 1218 -彰 1219 -影 1220 -彷 1221 -役 1222 -彻 1223 -彼 1224 -彿 1225 -往 1226 -征 1227 -径 1228 -待 1229 -徇 1230 -很 1231 -徉 1232 -徊 1233 -律 1234 -徐 1235 -徒 1236 -得 1237 -徘 1238 -徙 1239 -御 1240 -循 1241 -微 1242 -德 1243 -徽 1244 -心 1245 -必 1246 -忆 1247 -忌 1248 -忍 1249 -忐 1250 -忑 1251 -志 1252 -忘 1253 -忙 1254 -忠 1255 -忧 1256 -忪 1257 -快 1258 -忱 1259 -念 1260 -忽 1261 -怀 1262 -态 1263 -怂 1264 -怎 1265 -怒 1266 -怕 1267 -怖 1268 -怜 1269 -思 1270 -怠 1271 -怡 1272 -急 1273 -怦 1274 -性 1275 -怨 1276 -怪 1277 -怯 1278 -怵 1279 -总 1280 -恋 1281 -恍 1282 -恐 1283 -恒 1284 -恙 1285 -恢 1286 -恣 1287 -恤 1288 -恨 1289 -恩 1290 -恪 1291 -恬 1292 -恭 1293 -息 1294 -恰 1295 -恳 1296 -恶 1297 -恸 1298 -恺 1299 -恼 1300 -恿 1301 -悄 1302 -悉 1303 -悍 1304 -悔 1305 -悖 1306 -悚 1307 -悟 1308 -悠 1309 -患 1310 -悦 1311 -您 1312 -悬 1313 -悯 1314 -悲 1315 -悴 1316 -悸 1317 -悼 1318 -情 1319 -惊 1320 -惋 1321 -惑 1322 -惕 1323 -惚 1324 -惜 1325 -惟 1326 -惠 1327 -惦 1328 -惧 1329 -惨 1330 -惩 1331 -惫 1332 -惬 1333 -惮 1334 -惯 1335 -惰 1336 -想 1337 -惶 1338 -惹 1339 -惺 1340 -愁 1341 -愈 1342 -愉 1343 -意 1344 -愕 1345 -愚 1346 -感 1347 -愤 1348 -愧 1349 -愿 1350 -慈 1351 -慌 1352 -慎 1353 -慑 1354 -慕 1355 -慢 1356 -慧 1357 -慨 1358 -慰 1359 -慷 1360 -憋 1361 -憔 1362 -憧 1363 -憨 1364 -憩 1365 -憬 1366 -憷 1367 -憾 1368 -懂 1369 -懈 1370 -懊 1371 -懋 1372 -懒 1373 -懵 1374 -懿 1375 -戈 1376 -戎 1377 -戏 1378 -成 1379 -我 1380 -戒 1381 -或 1382 -战 1383 -戚 1384 -戛 1385 -戟 1386 -截 1387 -戬 1388 -戮 1389 -戳 1390 -戴 1391 -户 1392 -房 1393 -所 1394 -扁 1395 -扇 1396 -扉 1397 -手 1398 -才 1399 -扎 1400 -扑 1401 -扒 1402 -打 1403 -扔 1404 -托 1405 -扛 1406 -扣 1407 -执 1408 -扩 1409 -扫 1410 -扬 1411 -扭 1412 -扮 1413 -扯 1414 -扰 1415 -扳 1416 -扶 1417 -批 1418 -扼 1419 -找 1420 -承 1421 -技 1422 -抄 1423 -抉 1424 -把 1425 -抑 1426 -抒 1427 -抓 1428 -投 1429 -抖 1430 -抗 1431 -折 1432 -抚 1433 -抛 1434 -抠 1435 -抡 1436 -抢 1437 -护 1438 -报 1439 -抨 1440 -披 1441 -抬 1442 -抱 1443 -抵 1444 -抹 1445 -押 1446 -抽 1447 -抿 1448 -拄 1449 -担 1450 -拆 1451 -拇 1452 -拈 1453 -拉 1454 -拌 1455 -拍 1456 -拎 1457 -拐 1458 -拒 1459 -拓 1460 -拔 1461 -拖 1462 -拗 1463 -拘 1464 -拙 1465 -招 1466 -拜 1467 -拟 1468 -拢 1469 -拣 1470 -拥 1471 -拦 1472 -拧 1473 -拨 1474 -择 1475 -括 1476 -拭 1477 -拮 1478 -拯 1479 -拱 1480 -拳 1481 -拴 1482 -拷 1483 -拼 1484 -拽 1485 -拾 1486 -拿 1487 -持 1488 -挂 1489 -指 1490 -按 1491 -挎 1492 -挑 1493 -挖 1494 -挚 1495 -挛 1496 -挝 1497 -挟 1498 -挠 1499 -挡 1500 -挣 1501 -挤 1502 -挥 1503 -挨 1504 -挪 1505 -挫 1506 -振 1507 -挺 1508 -挽 1509 -捂 1510 -捅 1511 -捆 1512 -捉 1513 -捍 1514 -捎 1515 -捏 1516 -捐 1517 -捕 1518 -捞 1519 -损 1520 -捡 1521 -换 1522 -捣 1523 -捧 1524 -据 1525 -捷 1526 -捺 1527 -捻 1528 -掀 1529 -掂 1530 -授 1531 -掉 1532 -掌 1533 -掏 1534 -掐 1535 -排 1536 -掖 1537 -掘 1538 -掠 1539 -探 1540 -掣 1541 -接 1542 -控 1543 -推 1544 -掩 1545 -措 1546 -掬 1547 -掮 1548 -掰 1549 -掴 1550 -掷 1551 -掺 1552 -揉 1553 -揍 1554 -描 1555 -提 1556 -插 1557 -握 1558 -揣 1559 -揩 1560 -揪 1561 -揭 1562 -援 1563 -揽 1564 -搀 1565 -搁 1566 -搂 1567 -搅 1568 -搏 1569 -搜 1570 -搞 1571 -搡 1572 -搪 1573 -搬 1574 -搭 1575 -携 1576 -搽 1577 -摁 1578 -摄 1579 -摆 1580 -摇 1581 -摊 1582 -摒 1583 -摔 1584 -摘 1585 -摧 1586 -摩 1587 -摸 1588 -摹 1589 -撂 1590 -撇 1591 -撑 1592 -撒 1593 -撕 1594 -撞 1595 -撤 1596 -撩 1597 -撬 1598 -播 1599 -撮 1600 -撰 1601 -撵 1602 -撸 1603 -撼 1604 -擂 1605 -擅 1606 -操 1607 -擎 1608 -擒 1609 -擘 1610 -擞 1611 -擦 1612 -攀 1613 -攒 1614 -攥 1615 -支 1616 -收 
1617 -改 1618 -攻 1619 -放 1620 -政 1621 -故 1622 -效 1623 -敌 1624 -敏 1625 -救 1626 -敖 1627 -教 1628 -敛 1629 -敝 1630 -敞 1631 -敢 1632 -散 1633 -敦 1634 -敬 1635 -数 1636 -敲 1637 -整 1638 -敷 1639 -文 1640 -斌 1641 -斐 1642 -斑 1643 -斓 1644 -斗 1645 -料 1646 -斛 1647 -斜 1648 -斟 1649 -斤 1650 -斥 1651 -斧 1652 -斩 1653 -断 1654 -斯 1655 -新 1656 -方 1657 -施 1658 -旁 1659 -旅 1660 -旋 1661 -族 1662 -旗 1663 -无 1664 -既 1665 -日 1666 -旦 1667 -旧 1668 -旨 1669 -早 1670 -旬 1671 -旭 1672 -旱 1673 -时 1674 -旷 1675 -旺 1676 -昀 1677 -昂 1678 -昆 1679 -昊 1680 -昌 1681 -明 1682 -昏 1683 -易 1684 -昔 1685 -昕 1686 -昙 1687 -星 1688 -映 1689 -春 1690 -昧 1691 -昨 1692 -昭 1693 -是 1694 -昱 1695 -昵 1696 -昼 1697 -显 1698 -晃 1699 -晋 1700 -晏 1701 -晒 1702 -晓 1703 -晔 1704 -晕 1705 -晖 1706 -晗 1707 -晚 1708 -晟 1709 -晤 1710 -晦 1711 -晨 1712 -普 1713 -景 1714 -晰 1715 -晴 1716 -晶 1717 -智 1718 -晾 1719 -暂 1720 -暄 1721 -暇 1722 -暑 1723 -暖 1724 -暗 1725 -暧 1726 -暨 1727 -暮 1728 -暴 1729 -曙 1730 -曝 1731 -曦 1732 -曰 1733 -曲 1734 -更 1735 -曹 1736 -曼 1737 -曾 1738 -替 1739 -最 1740 -月 1741 -有 1742 -朋 1743 -服 1744 -朐 1745 -朔 1746 -朗 1747 -望 1748 -朝 1749 -期 1750 -朦 1751 -木 1752 -未 1753 -末 1754 -本 1755 -札 1756 -术 1757 -朱 1758 -朴 1759 -朵 1760 -机 1761 -朽 1762 -杀 1763 -杂 1764 -权 1765 -杆 1766 -杉 1767 -李 1768 -杏 1769 -材 1770 -村 1771 -杖 1772 -杜 1773 -杞 1774 -束 1775 -杠 1776 -条 1777 -来 1778 -杨 1779 -杭 1780 -杯 1781 -杰 1782 -杳 1783 -松 1784 -板 1785 -极 1786 -构 1787 -枉 1788 -析 1789 -枕 1790 -林 1791 -枚 1792 -果 1793 -枝 1794 -枞 1795 -枢 1796 -枣 1797 -枪 1798 -枫 1799 -枭 1800 -枯 1801 -架 1802 -枷 1803 -柄 1804 -柏 1805 -某 1806 -染 1807 -柔 1808 -柜 1809 -柞 1810 -柠 1811 -查 1812 -柬 1813 -柯 1814 -柱 1815 -柳 1816 -柴 1817 -柿 1818 -栅 1819 -标 1820 -栈 1821 -栋 1822 -栏 1823 -树 1824 -栓 1825 -栖 1826 -栗 1827 -校 1828 -株 1829 -样 1830 -核 1831 -根 1832 -格 1833 -栽 1834 -栾 1835 -桂 1836 -桃 1837 -框 1838 -案 1839 -桉 1840 -桌 1841 -桎 1842 -桐 1843 -桑 1844 -桓 1845 -桔 1846 -档 1847 -桥 1848 -桦 1849 -桩 1850 -桶 1851 -梁 1852 -梅 1853 -梓 1854 -梗 1855 -梦 1856 -梧 1857 -梨 1858 -梭 1859 -梯 1860 -械 1861 -梳 1862 -梵 1863 -检 1864 -棉 1865 -棋 1866 -棍 1867 -棒 1868 -棕 1869 -棘 1870 -棚 1871 -棠 1872 -森 1873 -棱 1874 -棵 1875 -棺 1876 -椅 1877 -椋 1878 -植 1879 -椎 1880 -椒 1881 -椰 1882 -椿 1883 -楂 1884 -楔 1885 -楚 1886 -楞 1887 -楠 1888 -楣 1889 -楷 1890 -楼 1891 -概 1892 -榄 1893 -榆 1894 -榈 1895 -榉 1896 -榔 1897 -榕 1898 -榜 1899 -榨 1900 -榭 1901 -榴 1902 -榷 1903 -榻 1904 -槌 1905 -槎 1906 -槐 1907 -槛 1908 -槟 1909 -槽 1910 -槿 1911 -樊 1912 -樟 1913 -模 1914 -横 1915 -樱 1916 -橄 1917 -橘 1918 -橙 1919 -橡 1920 -橱 1921 -檀 1922 -檐 1923 -檬 1924 -欠 1925 -次 1926 -欢 1927 -欣 1928 -欧 1929 -欲 1930 -欺 1931 -款 1932 -歆 1933 -歇 1934 -歉 1935 -歌 1936 -止 1937 -正 1938 -此 1939 -步 1940 -武 1941 -歧 1942 -歪 1943 -歹 1944 -死 1945 -殃 1946 -殆 1947 -殉 1948 -殊 1949 -残 1950 -殒 1951 -殓 1952 -殖 1953 -殚 1954 -殡 1955 -殭 1956 -殴 1957 -段 1958 -殷 1959 -殿 1960 -毁 1961 -毂 1962 -毅 1963 -毋 1964 -母 1965 -每 1966 -毒 1967 -毓 1968 -比 1969 -毕 1970 -毗 1971 -毙 1972 -毛 1973 -毫 1974 -毯 1975 -毽 1976 -氏 1977 -民 1978 -氓 1979 -气 1980 -氛 1981 -氟 1982 -氢 1983 -氦 1984 -氧 1985 -氨 1986 -氪 1987 -氮 1988 -氯 1989 -氰 1990 -水 1991 -永 1992 -汀 1993 -汁 1994 -求 1995 -汇 1996 -汉 1997 -汕 1998 -汗 1999 -汛 2000 -汝 2001 -汞 2002 -江 2003 -池 2004 -污 2005 -汤 2006 -汪 2007 -汰 2008 -汲 2009 -汴 2010 -汶 2011 -汹 2012 -汽 2013 -汾 2014 -沁 2015 -沃 2016 -沅 2017 -沈 2018 -沉 2019 -沏 2020 -沐 2021 -沓 2022 -沙 2023 -沛 2024 -沟 2025 -没 2026 -沣 2027 -沥 2028 -沦 2029 -沧 2030 -沪 2031 -沫 2032 -沮 2033 -沱 2034 -河 2035 -沸 2036 -油 2037 -治 2038 -沼 2039 -沽 2040 -沾 2041 -沿 2042 -泄 2043 -泉 2044 -泊 2045 -泌 2046 -泓 2047 -泔 2048 -法 2049 -泗 2050 -泛 2051 -泞 2052 -泠 2053 -泡 2054 -波 2055 -泣 2056 -泥 2057 -注 2058 -泪 2059 -泯 2060 -泰 
2061 -泱 2062 -泳 2063 -泵 2064 -泷 2065 -泸 2066 -泻 2067 -泼 2068 -泽 2069 -泾 2070 -洁 2071 -洋 2072 -洒 2073 -洗 2074 -洙 2075 -洛 2076 -洞 2077 -津 2078 -洪 2079 -洱 2080 -洲 2081 -洵 2082 -活 2083 -洼 2084 -洽 2085 -派 2086 -流 2087 -浅 2088 -浆 2089 -浇 2090 -浈 2091 -浊 2092 -测 2093 -济 2094 -浏 2095 -浑 2096 -浓 2097 -浙 2098 -浚 2099 -浦 2100 -浩 2101 -浪 2102 -浮 2103 -浴 2104 -海 2105 -浸 2106 -涂 2107 -涅 2108 -消 2109 -涉 2110 -涌 2111 -涎 2112 -涓 2113 -涕 2114 -涛 2115 -涝 2116 -涞 2117 -涠 2118 -涡 2119 -涤 2120 -润 2121 -涧 2122 -涨 2123 -涩 2124 -涮 2125 -涯 2126 -液 2127 -涵 2128 -涿 2129 -淀 2130 -淄 2131 -淆 2132 -淇 2133 -淋 2134 -淌 2135 -淑 2136 -淖 2137 -淘 2138 -淝 2139 -淞 2140 -淡 2141 -淤 2142 -淫 2143 -淮 2144 -深 2145 -淳 2146 -混 2147 -淹 2148 -添 2149 -淼 2150 -渀 2151 -清 2152 -渊 2153 -渍 2154 -渎 2155 -渐 2156 -渔 2157 -渗 2158 -渚 2159 -渝 2160 -渠 2161 -渡 2162 -渣 2163 -渤 2164 -渥 2165 -温 2166 -渭 2167 -港 2168 -渲 2169 -渴 2170 -游 2171 -渺 2172 -湃 2173 -湍 2174 -湖 2175 -湘 2176 -湛 2177 -湾 2178 -湿 2179 -溃 2180 -溅 2181 -溉 2182 -源 2183 -溜 2184 -溢 2185 -溥 2186 -溧 2187 -溪 2188 -溯 2189 -溶 2190 -溺 2191 -滁 2192 -滇 2193 -滋 2194 -滑 2195 -滔 2196 -滕 2197 -滚 2198 -滞 2199 -满 2200 -滢 2201 -滤 2202 -滥 2203 -滨 2204 -滩 2205 -滴 2206 -漂 2207 -漆 2208 -漏 2209 -漓 2210 -演 2211 -漕 2212 -漠 2213 -漩 2214 -漫 2215 -漭 2216 -漯 2217 -漱 2218 -漳 2219 -漾 2220 -潇 2221 -潘 2222 -潜 2223 -潞 2224 -潢 2225 -潭 2226 -潮 2227 -潼 2228 -澄 2229 -澈 2230 -澎 2231 -澜 2232 -澡 2233 -澳 2234 -激 2235 -濑 2236 -濒 2237 -濠 2238 -濡 2239 -濮 2240 -瀑 2241 -瀚 2242 -瀛 2243 -灌 2244 -灞 2245 -火 2246 -灭 2247 -灯 2248 -灰 2249 -灵 2250 -灶 2251 -灼 2252 -灾 2253 -灿 2254 -炅 2255 -炉 2256 -炊 2257 -炎 2258 -炒 2259 -炕 2260 -炖 2261 -炙 2262 -炜 2263 -炫 2264 -炬 2265 -炭 2266 -炮 2267 -炯 2268 -炳 2269 -炷 2270 -炸 2271 -点 2272 -炼 2273 -炽 2274 -烁 2275 -烂 2276 -烃 2277 -烈 2278 -烊 2279 -烘 2280 -烙 2281 -烟 2282 -烤 2283 -烦 2284 -烧 2285 -烨 2286 -烫 2287 -热 2288 -烯 2289 -烷 2290 -烹 2291 -烽 2292 -焉 2293 -焊 2294 -焕 2295 -焖 2296 -焘 2297 -焚 2298 -焦 2299 -焯 2300 -焰 2301 -焱 2302 -然 2303 -煊 2304 -煌 2305 -煎 2306 -煜 2307 -煞 2308 -煤 2309 -煦 2310 -照 2311 -煮 2312 -煲 2313 -熄 2314 -熊 2315 -熏 2316 -熔 2317 -熙 2318 -熟 2319 -熠 2320 -熨 2321 -熬 2322 -熹 2323 -燃 2324 -燊 2325 -燎 2326 -燕 2327 -燥 2328 -爆 2329 -爪 2330 -爬 2331 -爱 2332 -爵 2333 -父 2334 -爷 2335 -爸 2336 -爹 2337 -爽 2338 -片 2339 -版 2340 -牌 2341 -牙 2342 -牛 2343 -牟 2344 -牡 2345 -牢 2346 -牧 2347 -物 2348 -牲 2349 -牵 2350 -特 2351 -牺 2352 -牾 2353 -犀 2354 -犊 2355 -犒 2356 -犬 2357 -犯 2358 -状 2359 -犷 2360 -犹 2361 -狂 2362 -狄 2363 -狈 2364 -狐 2365 -狗 2366 -狙 2367 -狞 2368 -狠 2369 -狡 2370 -狩 2371 -独 2372 -狭 2373 -狮 2374 -狰 2375 -狱 2376 -狸 2377 -狼 2378 -猎 2379 -猖 2380 -猛 2381 -猜 2382 -猝 2383 -猥 2384 -猩 2385 -猪 2386 -猫 2387 -猬 2388 -献 2389 -猴 2390 -猾 2391 -猿 2392 -獒 2393 -獗 2394 -獾 2395 -玄 2396 -率 2397 -玉 2398 -王 2399 -玖 2400 -玛 2401 -玟 2402 -玥 2403 -玩 2404 -玫 2405 -玮 2406 -环 2407 -现 2408 -玲 2409 -玳 2410 -玺 2411 -玻 2412 -珀 2413 -珉 2414 -珊 2415 -珍 2416 -珏 2417 -珑 2418 -珜 2419 -珠 2420 -班 2421 -珮 2422 -珲 2423 -珺 2424 -球 2425 -琅 2426 -理 2427 -琉 2428 -琊 2429 -琏 2430 -琐 2431 -琛 2432 -琢 2433 -琥 2434 -琦 2435 -琪 2436 -琬 2437 -琰 2438 -琳 2439 -琴 2440 -琵 2441 -琶 2442 -琼 2443 -瑁 2444 -瑄 2445 -瑕 2446 -瑙 2447 -瑚 2448 -瑛 2449 -瑜 2450 -瑞 2451 -瑟 2452 -瑰 2453 -瑶 2454 -瑾 2455 -璀 2456 -璃 2457 -璇 2458 -璋 2459 -璐 2460 -璞 2461 -璧 2462 -璨 2463 -瓜 2464 -瓢 2465 -瓣 2466 -瓦 2467 -瓮 2468 -瓯 2469 -瓶 2470 -瓷 2471 -甄 2472 -甘 2473 -甚 2474 -甜 2475 -生 2476 -甥 2477 -用 2478 -甩 2479 -甫 2480 -甬 2481 -田 2482 -由 2483 -甲 2484 -申 2485 -电 2486 -男 2487 -甸 2488 -町 2489 -画 2490 -畅 2491 -畊 2492 -界 2493 -畏 2494 -畔 2495 -留 2496 -畜 2497 -略 2498 -番 2499 -畴 2500 -畸 2501 -畿 2502 -疃 2503 -疆 2504 -疏 
2505 -疑 2506 -疗 2507 -疚 2508 -疝 2509 -疤 2510 -疫 2511 -疯 2512 -疲 2513 -疵 2514 -疹 2515 -疼 2516 -疾 2517 -病 2518 -症 2519 -痉 2520 -痊 2521 -痒 2522 -痕 2523 -痘 2524 -痛 2525 -痣 2526 -痪 2527 -痫 2528 -痰 2529 -痱 2530 -痴 2531 -痹 2532 -痼 2533 -瘀 2534 -瘁 2535 -瘟 2536 -瘠 2537 -瘤 2538 -瘦 2539 -瘩 2540 -瘪 2541 -瘫 2542 -瘸 2543 -瘾 2544 -癌 2545 -癖 2546 -癣 2547 -癫 2548 -登 2549 -白 2550 -百 2551 -皂 2552 -的 2553 -皆 2554 -皇 2555 -皋 2556 -皎 2557 -皓 2558 -皖 2559 -皙 2560 -皮 2561 -皱 2562 -盆 2563 -盈 2564 -益 2565 -盎 2566 -盐 2567 -监 2568 -盒 2569 -盔 2570 -盖 2571 -盗 2572 -盘 2573 -盛 2574 -盟 2575 -目 2576 -盯 2577 -盲 2578 -直 2579 -相 2580 -盹 2581 -盼 2582 -盾 2583 -省 2584 -眈 2585 -眉 2586 -看 2587 -真 2588 -眠 2589 -眨 2590 -眬 2591 -眯 2592 -眶 2593 -眷 2594 -眺 2595 -眼 2596 -着 2597 -睁 2598 -睐 2599 -睛 2600 -睡 2601 -督 2602 -睦 2603 -睫 2604 -睬 2605 -睹 2606 -睿 2607 -瞄 2608 -瞅 2609 -瞌 2610 -瞎 2611 -瞒 2612 -瞟 2613 -瞧 2614 -瞩 2615 -瞪 2616 -瞬 2617 -瞰 2618 -瞳 2619 -瞻 2620 -瞿 2621 -矗 2622 -矛 2623 -矜 2624 -矢 2625 -矣 2626 -知 2627 -矩 2628 -矫 2629 -短 2630 -矮 2631 -石 2632 -矶 2633 -矿 2634 -码 2635 -砂 2636 -砌 2637 -砍 2638 -砒 2639 -研 2640 -砖 2641 -砚 2642 -砝 2643 -砥 2644 -砰 2645 -砲 2646 -破 2647 -砷 2648 -砸 2649 -砺 2650 -砾 2651 -础 2652 -硅 2653 -硕 2654 -硚 2655 -硝 2656 -硫 2657 -硬 2658 -确 2659 -碉 2660 -碌 2661 -碍 2662 -碎 2663 -碑 2664 -碗 2665 -碘 2666 -碚 2667 -碟 2668 -碧 2669 -碰 2670 -碱 2671 -碳 2672 -碴 2673 -碾 2674 -磁 2675 -磅 2676 -磊 2677 -磋 2678 -磐 2679 -磕 2680 -磡 2681 -磨 2682 -磴 2683 -磷 2684 -磺 2685 -礁 2686 -示 2687 -礼 2688 -社 2689 -祁 2690 -祈 2691 -祉 2692 -祖 2693 -祛 2694 -祝 2695 -神 2696 -祠 2697 -祢 2698 -祥 2699 -票 2700 -祭 2701 -祯 2702 -祷 2703 -祸 2704 -祺 2705 -禀 2706 -禁 2707 -禄 2708 -禅 2709 -福 2710 -禧 2711 -禹 2712 -禺 2713 -离 2714 -禽 2715 -禾 2716 -秀 2717 -私 2718 -秃 2719 -秆 2720 -秉 2721 -秋 2722 -种 2723 -科 2724 -秒 2725 -秘 2726 -租 2727 -秣 2728 -秤 2729 -秦 2730 -秧 2731 -秩 2732 -积 2733 -称 2734 -秸 2735 -移 2736 -秽 2737 -稀 2738 -程 2739 -稍 2740 -税 2741 -稚 2742 -稠 2743 -稣 2744 -稳 2745 -稻 2746 -稼 2747 -稽 2748 -稿 2749 -穆 2750 -穗 2751 -穴 2752 -究 2753 -穷 2754 -空 2755 -穿 2756 -突 2757 -窃 2758 -窄 2759 -窈 2760 -窍 2761 -窑 2762 -窒 2763 -窕 2764 -窖 2765 -窗 2766 -窘 2767 -窜 2768 -窝 2769 -窟 2770 -窥 2771 -窦 2772 -窨 2773 -窿 2774 -立 2775 -竖 2776 -站 2777 -竞 2778 -竟 2779 -章 2780 -竣 2781 -童 2782 -竭 2783 -端 2784 -竲 2785 -竹 2786 -竺 2787 -竽 2788 -竿 2789 -笃 2790 -笈 2791 -笋 2792 -笑 2793 -笔 2794 -笙 2795 -笛 2796 -符 2797 -笨 2798 -第 2799 -笼 2800 -等 2801 -筋 2802 -筐 2803 -筑 2804 -筒 2805 -答 2806 -策 2807 -筛 2808 -筱 2809 -筵 2810 -筷 2811 -筹 2812 -签 2813 -简 2814 -箍 2815 -算 2816 -管 2817 -箫 2818 -箭 2819 -箱 2820 -篇 2821 -篡 2822 -篪 2823 -篮 2824 -篷 2825 -簇 2826 -簧 2827 -簸 2828 -簿 2829 -籁 2830 -籍 2831 -米 2832 -类 2833 -籽 2834 -粉 2835 -粒 2836 -粕 2837 -粗 2838 -粘 2839 -粟 2840 -粤 2841 -粥 2842 -粪 2843 -粮 2844 -粱 2845 -粹 2846 -精 2847 -糊 2848 -糕 2849 -糖 2850 -糗 2851 -糙 2852 -糟 2853 -糯 2854 -系 2855 -紊 2856 -素 2857 -索 2858 -紧 2859 -紫 2860 -累 2861 -絮 2862 -綦 2863 -繁 2864 -纠 2865 -红 2866 -纣 2867 -纤 2868 -约 2869 -级 2870 -纪 2871 -纬 2872 -纯 2873 -纰 2874 -纱 2875 -纲 2876 -纳 2877 -纵 2878 -纶 2879 -纷 2880 -纸 2881 -纹 2882 -纺 2883 -纽 2884 -线 2885 -练 2886 -组 2887 -绅 2888 -细 2889 -织 2890 -终 2891 -绊 2892 -绌 2893 -绍 2894 -绎 2895 -经 2896 -绑 2897 -绒 2898 -结 2899 -绕 2900 -绘 2901 -给 2902 -绚 2903 -络 2904 -绝 2905 -绞 2906 -统 2907 -绣 2908 -继 2909 -绩 2910 -绪 2911 -续 2912 -绮 2913 -绯 2914 -绰 2915 -绳 2916 -维 2917 -绵 2918 -绷 2919 -绸 2920 -综 2921 -绽 2922 -绿 2923 -缀 2924 -缄 2925 -缅 2926 -缆 2927 -缇 2928 -缉 2929 -缓 2930 -缔 2931 -缕 2932 -编 2933 -缘 2934 -缙 2935 -缚 2936 -缜 2937 -缝 2938 -缠 2939 -缤 2940 -缨 2941 -缩 2942 -缪 2943 -缭 2944 -缮 2945 -缰 2946 -缴 2947 -缸 2948 -缺 
2949 -罂 2950 -罄 2951 -罐 2952 -网 2953 -罕 2954 -罗 2955 -罚 2956 -罡 2957 -罢 2958 -罩 2959 -罪 2960 -置 2961 -署 2962 -罹 2963 -羁 2964 -羊 2965 -美 2966 -羚 2967 -羞 2968 -羡 2969 -羣 2970 -群 2971 -羲 2972 -羹 2973 -羽 2974 -羿 2975 -翁 2976 -翅 2977 -翌 2978 -翔 2979 -翘 2980 -翟 2981 -翠 2982 -翡 2983 -翩 2984 -翰 2985 -翱 2986 -翻 2987 -翼 2988 -耀 2989 -老 2990 -考 2991 -耄 2992 -者 2993 -耋 2994 -而 2995 -耍 2996 -耐 2997 -耒 2998 -耕 2999 -耗 3000 -耘 3001 -耳 3002 -耶 3003 -耷 3004 -耸 3005 -耻 3006 -耽 3007 -耿 3008 -聂 3009 -聆 3010 -聊 3011 -聋 3012 -职 3013 -联 3014 -聘 3015 -聚 3016 -聪 3017 -肃 3018 -肆 3019 -肇 3020 -肉 3021 -肋 3022 -肌 3023 -肖 3024 -肘 3025 -肚 3026 -肛 3027 -肝 3028 -肠 3029 -股 3030 -肢 3031 -肤 3032 -肥 3033 -肩 3034 -肪 3035 -肮 3036 -肯 3037 -育 3038 -肴 3039 -肺 3040 -肾 3041 -肿 3042 -胀 3043 -胁 3044 -胃 3045 -胆 3046 -背 3047 -胎 3048 -胖 3049 -胚 3050 -胛 3051 -胜 3052 -胞 3053 -胡 3054 -胤 3055 -胧 3056 -胫 3057 -胯 3058 -胰 3059 -胱 3060 -胳 3061 -胶 3062 -胸 3063 -胺 3064 -能 3065 -脂 3066 -脆 3067 -脉 3068 -脊 3069 -脍 3070 -脏 3071 -脐 3072 -脑 3073 -脖 3074 -脚 3075 -脯 3076 -脱 3077 -脸 3078 -脾 3079 -腆 3080 -腊 3081 -腋 3082 -腌 3083 -腐 3084 -腑 3085 -腓 3086 -腔 3087 -腕 3088 -腥 3089 -腩 3090 -腰 3091 -腱 3092 -腹 3093 -腺 3094 -腻 3095 -腼 3096 -腾 3097 -腿 3098 -膀 3099 -膊 3100 -膏 3101 -膑 3102 -膛 3103 -膜 3104 -膝 3105 -膨 3106 -膳 3107 -膺 3108 -臀 3109 -臂 3110 -臃 3111 -臆 3112 -臣 3113 -自 3114 -臭 3115 -至 3116 -致 3117 -臻 3118 -舀 3119 -舅 3120 -舆 3121 -舌 3122 -舍 3123 -舒 3124 -舛 3125 -舜 3126 -舞 3127 -舟 3128 -航 3129 -般 3130 -舰 3131 -舱 3132 -舵 3133 -舶 3134 -舸 3135 -船 3136 -艇 3137 -艋 3138 -艘 3139 -良 3140 -艰 3141 -色 3142 -艳 3143 -艺 3144 -艾 3145 -节 3146 -芊 3147 -芋 3148 -芒 3149 -芙 3150 -芜 3151 -芝 3152 -芦 3153 -芬 3154 -芭 3155 -芮 3156 -芯 3157 -花 3158 -芳 3159 -芷 3160 -芸 3161 -芽 3162 -苇 3163 -苍 3164 -苏 3165 -苑 3166 -苗 3167 -苛 3168 -苟 3169 -苡 3170 -苣 3171 -若 3172 -苦 3173 -苯 3174 -英 3175 -苹 3176 -茁 3177 -茂 3178 -范 3179 -茄 3180 -茅 3181 -茆 3182 -茎 3183 -茗 3184 -茜 3185 -茨 3186 -茫 3187 -茵 3188 -茶 3189 -茸 3190 -茹 3191 -荃 3192 -荆 3193 -草 3194 -荐 3195 -荒 3196 -荔 3197 -荚 3198 -荞 3199 -荟 3200 -荡 3201 -荣 3202 -荤 3203 -荧 3204 -荫 3205 -药 3206 -荷 3207 -荼 3208 -莅 3209 -莆 3210 -莉 3211 -莎 3212 -莓 3213 -莘 3214 -莞 3215 -莠 3216 -莫 3217 -莱 3218 -莲 3219 -莴 3220 -获 3221 -莹 3222 -莺 3223 -莽 3224 -菁 3225 -菇 3226 -菊 3227 -菌 3228 -菜 3229 -菠 3230 -菡 3231 -菩 3232 -菱 3233 -菲 3234 -萃 3235 -萄 3236 -萋 3237 -萌 3238 -萍 3239 -萎 3240 -萝 3241 -萤 3242 -营 3243 -萦 3244 -萧 3245 -萨 3246 -萱 3247 -落 3248 -葆 3249 -著 3250 -葛 3251 -葡 3252 -董 3253 -葩 3254 -葫 3255 -葬 3256 -葱 3257 -葵 3258 -蒂 3259 -蒋 3260 -蒙 3261 -蒜 3262 -蒲 3263 -蒸 3264 -蒿 3265 -蓁 3266 -蓄 3267 -蓉 3268 -蓝 3269 -蓟 3270 -蓬 3271 -蔑 3272 -蔓 3273 -蔗 3274 -蔚 3275 -蔡 3276 -蔫 3277 -蔬 3278 -蔷 3279 -蔺 3280 -蔽 3281 -蕉 3282 -蕊 3283 -蕙 3284 -蕲 3285 -蕴 3286 -蕾 3287 -薄 3288 -薇 3289 -薛 3290 -薪 3291 -薯 3292 -薰 3293 -藏 3294 -藜 3295 -藤 3296 -藩 3297 -藻 3298 -蘑 3299 -虎 3300 -虐 3301 -虑 3302 -虚 3303 -虞 3304 -虫 3305 -虱 3306 -虹 3307 -虽 3308 -虾 3309 -蚀 3310 -蚁 3311 -蚂 3312 -蚊 3313 -蚌 3314 -蚓 3315 -蚕 3316 -蚝 3317 -蚣 3318 -蚯 3319 -蛀 3320 -蛇 3321 -蛋 3322 -蛐 3323 -蛙 3324 -蛛 3325 -蛟 3326 -蛮 3327 -蛰 3328 -蜀 3329 -蜂 3330 -蜇 3331 -蜈 3332 -蜊 3333 -蜒 3334 -蜓 3335 -蜕 3336 -蜘 3337 -蜚 3338 -蜜 3339 -蜡 3340 -蜥 3341 -蜴 3342 -蜷 3343 -蜿 3344 -蝇 3345 -蝉 3346 -蝎 3347 -蝗 3348 -蝙 3349 -蝠 3350 -蝴 3351 -蝶 3352 -螂 3353 -螃 3354 -融 3355 -螳 3356 -螺 3357 -蟑 3358 -蟹 3359 -蠢 3360 -血 3361 -衅 3362 -行 3363 -衍 3364 -衔 3365 -街 3366 -衙 3367 -衡 3368 -衣 3369 -补 3370 -表 3371 -衫 3372 -衬 3373 -衰 3374 -衷 3375 -袁 3376 -袂 3377 -袄 3378 -袆 3379 -袈 3380 -袋 3381 -袍 3382 -袒 3383 -袖 3384 -袜 3385 -被 3386 -袭 3387 -袱 3388 -裁 3389 -裂 3390 -装 3391 -裆 3392 -裔 
3393 -裕 3394 -裙 3395 -裟 3396 -裤 3397 -裳 3398 -裴 3399 -裸 3400 -裹 3401 -褂 3402 -褒 3403 -褓 3404 -褚 3405 -褛 3406 -褪 3407 -褴 3408 -褶 3409 -襁 3410 -襄 3411 -襟 3412 -西 3413 -要 3414 -覃 3415 -覆 3416 -见 3417 -观 3418 -规 3419 -觅 3420 -视 3421 -览 3422 -觉 3423 -觊 3424 -觎 3425 -觐 3426 -觑 3427 -角 3428 -解 3429 -觥 3430 -触 3431 -言 3432 -詹 3433 -誉 3434 -誓 3435 -警 3436 -譬 3437 -计 3438 -订 3439 -认 3440 -讧 3441 -讨 3442 -让 3443 -讪 3444 -训 3445 -议 3446 -讯 3447 -记 3448 -讲 3449 -讳 3450 -讶 3451 -许 3452 -讹 3453 -论 3454 -讼 3455 -讽 3456 -设 3457 -访 3458 -诀 3459 -证 3460 -评 3461 -诅 3462 -识 3463 -诈 3464 -诉 3465 -诊 3466 -词 3467 -译 3468 -诓 3469 -试 3470 -诗 3471 -诙 3472 -诚 3473 -话 3474 -诞 3475 -诟 3476 -诠 3477 -诡 3478 -询 3479 -该 3480 -详 3481 -诧 3482 -诩 3483 -诫 3484 -诬 3485 -语 3486 -误 3487 -诱 3488 -诲 3489 -说 3490 -诵 3491 -诶 3492 -请 3493 -诸 3494 -诺 3495 -读 3496 -诽 3497 -课 3498 -诿 3499 -谀 3500 -谁 3501 -调 3502 -谅 3503 -谈 3504 -谊 3505 -谋 3506 -谌 3507 -谍 3508 -谎 3509 -谐 3510 -谑 3511 -谓 3512 -谕 3513 -谙 3514 -谚 3515 -谜 3516 -谢 3517 -谣 3518 -谤 3519 -谦 3520 -谨 3521 -谩 3522 -谬 3523 -谭 3524 -谱 3525 -谴 3526 -谷 3527 -豁 3528 -豆 3529 -豚 3530 -象 3531 -豪 3532 -豫 3533 -豹 3534 -貅 3535 -貉 3536 -貌 3537 -貔 3538 -贝 3539 -贞 3540 -负 3541 -贡 3542 -财 3543 -责 3544 -贤 3545 -败 3546 -账 3547 -货 3548 -质 3549 -贩 3550 -贪 3551 -贫 3552 -贬 3553 -购 3554 -贮 3555 -贯 3556 -贱 3557 -贴 3558 -贵 3559 -贷 3560 -贸 3561 -费 3562 -贺 3563 -贼 3564 -贾 3565 -贿 3566 -赁 3567 -赂 3568 -赃 3569 -资 3570 -赋 3571 -赌 3572 -赎 3573 -赏 3574 -赐 3575 -赔 3576 -赖 3577 -赘 3578 -赚 3579 -赛 3580 -赝 3581 -赞 3582 -赠 3583 -赡 3584 -赢 3585 -赣 3586 -赤 3587 -赦 3588 -赫 3589 -走 3590 -赴 3591 -赵 3592 -赶 3593 -起 3594 -趁 3595 -超 3596 -越 3597 -趋 3598 -趟 3599 -趣 3600 -足 3601 -趴 3602 -趸 3603 -趾 3604 -跃 3605 -跄 3606 -跆 3607 -跌 3608 -跑 3609 -跛 3610 -距 3611 -跟 3612 -跤 3613 -跨 3614 -跪 3615 -路 3616 -跳 3617 -践 3618 -跷 3619 -跺 3620 -跻 3621 -踉 3622 -踊 3623 -踏 3624 -踝 3625 -踞 3626 -踢 3627 -踩 3628 -踪 3629 -踵 3630 -踹 3631 -蹂 3632 -蹄 3633 -蹈 3634 -蹊 3635 -蹚 3636 -蹦 3637 -蹬 3638 -蹭 3639 -蹲 3640 -蹴 3641 -蹶 3642 -蹼 3643 -蹿 3644 -躁 3645 -躏 3646 -身 3647 -躬 3648 -躯 3649 -躲 3650 -躺 3651 -车 3652 -轧 3653 -轨 3654 -轩 3655 -转 3656 -轮 3657 -软 3658 -轰 3659 -轴 3660 -轶 3661 -轻 3662 -载 3663 -轿 3664 -较 3665 -辄 3666 -辅 3667 -辆 3668 -辈 3669 -辉 3670 -辍 3671 -辐 3672 -辑 3673 -输 3674 -辖 3675 -辗 3676 -辘 3677 -辙 3678 -辛 3679 -辜 3680 -辞 3681 -辟 3682 -辣 3683 -辨 3684 -辩 3685 -辫 3686 -辰 3687 -辱 3688 -边 3689 -辽 3690 -达 3691 -迁 3692 -迂 3693 -迄 3694 -迅 3695 -过 3696 -迈 3697 -迎 3698 -运 3699 -近 3700 -返 3701 -还 3702 -这 3703 -进 3704 -远 3705 -违 3706 -连 3707 -迟 3708 -迢 3709 -迥 3710 -迪 3711 -迫 3712 -迭 3713 -述 3714 -迷 3715 -迸 3716 -迹 3717 -追 3718 -退 3719 -送 3720 -适 3721 -逃 3722 -逅 3723 -逆 3724 -选 3725 -逊 3726 -逍 3727 -透 3728 -逐 3729 -递 3730 -途 3731 -逗 3732 -通 3733 -逛 3734 -逝 3735 -逞 3736 -速 3737 -造 3738 -逡 3739 -逢 3740 -逮 3741 -逵 3742 -逸 3743 -逻 3744 -逼 3745 -逾 3746 -遁 3747 -遂 3748 -遇 3749 -遍 3750 -遏 3751 -遐 3752 -道 3753 -遗 3754 -遛 3755 -遢 3756 -遣 3757 -遥 3758 -遨 3759 -遭 3760 -遮 3761 -遴 3762 -遵 3763 -避 3764 -邀 3765 -邂 3766 -邃 3767 -邋 3768 -邑 3769 -邓 3770 -邛 3771 -邝 3772 -邢 3773 -那 3774 -邦 3775 -邪 3776 -邬 3777 -邮 3778 -邯 3779 -邱 3780 -邵 3781 -邹 3782 -邺 3783 -邻 3784 -郁 3785 -郊 3786 -郎 3787 -郑 3788 -郜 3789 -郝 3790 -郡 3791 -部 3792 -郫 3793 -郭 3794 -郸 3795 -都 3796 -鄂 3797 -鄙 3798 -鄞 3799 -鄢 3800 -酋 3801 -酌 3802 -配 3803 -酒 3804 -酗 3805 -酝 3806 -酣 3807 -酪 3808 -酬 3809 -酯 3810 -酱 3811 -酵 3812 -酶 3813 -酷 3814 -酸 3815 -酿 3816 -醇 3817 -醉 3818 -醋 3819 -醍 3820 -醐 3821 -醒 3822 -醛 3823 -采 3824 -釉 3825 -释 3826 -里 3827 -重 3828 -野 3829 -量 3830 -金 3831 -釜 3832 -鉴 3833 -鏖 3834 -鑫 3835 -针 3836 -钉 
3837 -钊 3838 -钓 3839 -钛 3840 -钝 3841 -钞 3842 -钟 3843 -钠 3844 -钢 3845 -钥 3846 -钦 3847 -钧 3848 -钩 3849 -钮 3850 -钰 3851 -钱 3852 -钵 3853 -钻 3854 -钾 3855 -铀 3856 -铁 3857 -铂 3858 -铃 3859 -铅 3860 -铆 3861 -铉 3862 -铎 3863 -铐 3864 -铜 3865 -铝 3866 -铠 3867 -铣 3868 -铨 3869 -铬 3870 -铭 3871 -铮 3872 -铰 3873 -铲 3874 -银 3875 -铸 3876 -铺 3877 -链 3878 -铿 3879 -销 3880 -锁 3881 -锂 3882 -锄 3883 -锅 3884 -锆 3885 -锈 3886 -锋 3887 -锌 3888 -锏 3889 -锐 3890 -错 3891 -锜 3892 -锟 3893 -锡 3894 -锢 3895 -锣 3896 -锤 3897 -锥 3898 -锦 3899 -锭 3900 -键 3901 -锯 3902 -锰 3903 -锵 3904 -锷 3905 -锹 3906 -锻 3907 -镀 3908 -镁 3909 -镇 3910 -镉 3911 -镊 3912 -镍 3913 -镑 3914 -镖 3915 -镜 3916 -镯 3917 -镳 3918 -镶 3919 -长 3920 -门 3921 -闪 3922 -闫 3923 -闭 3924 -问 3925 -闯 3926 -闰 3927 -闲 3928 -闳 3929 -间 3930 -闵 3931 -闷 3932 -闸 3933 -闹 3934 -闺 3935 -闻 3936 -闽 3937 -阀 3938 -阁 3939 -阂 3940 -阅 3941 -阎 3942 -阐 3943 -阔 3944 -阙 3945 -阚 3946 -阜 3947 -队 3948 -阮 3949 -阱 3950 -防 3951 -阳 3952 -阴 3953 -阵 3954 -阶 3955 -阻 3956 -阿 3957 -陀 3958 -陂 3959 -附 3960 -际 3961 -陆 3962 -陈 3963 -陋 3964 -陌 3965 -降 3966 -限 3967 -陕 3968 -陡 3969 -院 3970 -除 3971 -陨 3972 -险 3973 -陪 3974 -陬 3975 -陵 3976 -陶 3977 -陷 3978 -隅 3979 -隆 3980 -隋 3981 -隍 3982 -随 3983 -隐 3984 -隔 3985 -隘 3986 -隙 3987 -障 3988 -隧 3989 -隶 3990 -隼 3991 -隽 3992 -难 3993 -雀 3994 -雁 3995 -雄 3996 -雅 3997 -集 3998 -雇 3999 -雌 4000 -雍 4001 -雏 4002 -雕 4003 -雨 4004 -雪 4005 -雯 4006 -雳 4007 -零 4008 -雷 4009 -雾 4010 -需 4011 -霁 4012 -霄 4013 -霆 4014 -震 4015 -霈 4016 -霉 4017 -霍 4018 -霎 4019 -霏 4020 -霖 4021 -霜 4022 -霞 4023 -露 4024 -霸 4025 -霹 4026 -霾 4027 -靑 4028 -青 4029 -靓 4030 -靖 4031 -静 4032 -靛 4033 -非 4034 -靠 4035 -靡 4036 -面 4037 -革 4038 -靳 4039 -靴 4040 -靶 4041 -鞋 4042 -鞍 4043 -鞘 4044 -鞠 4045 -鞭 4046 -韦 4047 -韧 4048 -韩 4049 -韬 4050 -音 4051 -韵 4052 -韶 4053 -页 4054 -顶 4055 -顷 4056 -项 4057 -顺 4058 -须 4059 -顽 4060 -顾 4061 -顿 4062 -颁 4063 -颂 4064 -预 4065 -颅 4066 -领 4067 -颇 4068 -颈 4069 -颊 4070 -颍 4071 -颐 4072 -频 4073 -颓 4074 -颖 4075 -颗 4076 -题 4077 -颚 4078 -颜 4079 -额 4080 -颠 4081 -颤 4082 -风 4083 -飒 4084 -飓 4085 -飘 4086 -飙 4087 -飚 4088 -飞 4089 -食 4090 -餐 4091 -餮 4092 -饕 4093 -饥 4094 -饪 4095 -饭 4096 -饮 4097 -饰 4098 -饱 4099 -饲 4100 -饵 4101 -饶 4102 -饺 4103 -饼 4104 -饽 4105 -饿 4106 -馀 4107 -馅 4108 -馆 4109 -馈 4110 -馊 4111 -馋 4112 -馑 4113 -馒 4114 -首 4115 -馗 4116 -香 4117 -馥 4118 -馨 4119 -马 4120 -驭 4121 -驯 4122 -驰 4123 -驱 4124 -驳 4125 -驴 4126 -驶 4127 -驻 4128 -驼 4129 -驾 4130 -驿 4131 -骁 4132 -骂 4133 -骄 4134 -骅 4135 -骆 4136 -骇 4137 -骊 4138 -骋 4139 -验 4140 -骏 4141 -骐 4142 -骑 4143 -骗 4144 -骚 4145 -骜 4146 -骤 4147 -骥 4148 -骨 4149 -骷 4150 -骸 4151 -骼 4152 -髅 4153 -髋 4154 -髓 4155 -高 4156 -髦 4157 -鬼 4158 -魁 4159 -魂 4160 -魄 4161 -魅 4162 -魇 4163 -魏 4164 -魔 4165 -鱼 4166 -鲁 4167 -鲍 4168 -鲜 4169 -鲟 4170 -鲨 4171 -鲶 4172 -鲷 4173 -鲸 4174 -鳄 4175 -鳅 4176 -鳌 4177 -鳖 4178 -鳝 4179 -鳞 4180 -鸟 4181 -鸠 4182 -鸡 4183 -鸣 4184 -鸥 4185 -鸦 4186 -鸭 4187 -鸯 4188 -鸳 4189 -鸵 4190 -鸽 4191 -鸾 4192 -鸿 4193 -鹃 4194 -鹅 4195 -鹊 4196 -鹏 4197 -鹜 4198 -鹞 4199 -鹤 4200 -鹭 4201 -鹰 4202 -鹿 4203 -麋 4204 -麒 4205 -麓 4206 -麟 4207 -麦 4208 -麻 4209 -麾 4210 -黄 4211 -黍 4212 -黎 4213 -黏 4214 -黑 4215 -黔 4216 -默 4217 -黛 4218 -黝 4219 -黯 4220 -鼎 4221 -鼓 4222 -鼠 4223 -鼻 4224 -鼾 4225 -齐 4226 -齿 4227 -龄 4228 -龙 4229 -龚 4230 -龟 4231 - 4232 diff --git a/models/audio/speech_recognition/conformer/igie/load_ixrt_plugin.py b/models/audio/speech_recognition/conformer/igie/load_ixrt_plugin.py new file mode 100644 index 0000000000000000000000000000000000000000..f4452f2edf9877ea84e31f34bdadefcc247e5b52 --- /dev/null +++ b/models/audio/speech_recognition/conformer/igie/load_ixrt_plugin.py @@ -0,0 +1,12 @@ +import ctypes +import 
tensorrt
+from os.path import join, dirname, exists
+def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="", dynamic_path=""):
+    if not dynamic_path:
+        dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so")
+    if not exists(dynamic_path):
+        raise FileNotFoundError(
+            f"The ixrt_plugin lib {dynamic_path} does not exist; please provide a valid plugin path!")
+    ctypes.CDLL(dynamic_path)
+    tensorrt.init_libnvinfer_plugins(logger, namespace)
+    print(f"Loaded plugin from {dynamic_path}")
diff --git a/models/audio/speech_recognition/conformer/igie/requirements.txt b/models/audio/speech_recognition/conformer/igie/requirements.txt
index 8820eb754dec653c319dc0c86d53049346c7f7b6..3dcea1ccc8337478e16d50942acc6175d270b9b5 100644
--- a/models/audio/speech_recognition/conformer/igie/requirements.txt
+++ b/models/audio/speech_recognition/conformer/igie/requirements.txt
@@ -1,4 +1,5 @@
 tqdm
 onnx
-typeguard==2.13.3
-onnxsim
\ No newline at end of file
+onnxsim
+librosa
+soundfile
\ No newline at end of file
diff --git a/models/audio/speech_recognition/conformer/igie/scripts/aishell_data_prepare.sh b/models/audio/speech_recognition/conformer/igie/scripts/aishell_data_prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..985564c2294b2a413531d6ced018029ec911fb23
--- /dev/null
+++ b/models/audio/speech_recognition/conformer/igie/scripts/aishell_data_prepare.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# Copyright 2019 Mobvoi Inc. All Rights Reserved.
+# set -euox pipefail
+
+data_dir=$1
+tool_dir=$2
+
+wav_dir=${data_dir}/wav
+aishell_text=${data_dir}/transcript/aishell_transcript_v0.8.txt
+
+# data directory check
+if [ ! -d $wav_dir ] || [ ! -f $aishell_text ]; then
+  echo "Error: wav directory and aishell text not found!"
+  exit 1;
+fi
+
+# find test wav file
+local_dir=${data_dir}/local
+mkdir -p $local_dir
+find $wav_dir -iname "*.wav" > $local_dir/wav.flist || exit 1;
+
+# Transcriptions preparation
+sed -e 's/\.wav//' $local_dir/wav.flist | awk -F '/' '{print $NF}' > $local_dir/utt.list
+paste -d' ' $local_dir/utt.list $local_dir/wav.flist > $local_dir/wav.scp_all
+${tool_dir}/filter_scp.pl -f 1 $local_dir/utt.list $aishell_text > $local_dir/transcripts.txt
+awk '{print $1}' $local_dir/transcripts.txt > $local_dir/utt.list
+${tool_dir}/filter_scp.pl -f 1 $local_dir/utt.list $local_dir/wav.scp_all | sort -u > $local_dir/wav.scp
+sort -u $local_dir/transcripts.txt > $local_dir/text
+echo "Preparing transcriptions succeeded!"
+
+test_dir=${data_dir}/test
+mkdir -p ${test_dir}
+for f in wav.scp text; do
+  cp $local_dir/$f ${test_dir}/$f || exit 1;
+done
+rm -r ${data_dir}/local
+
+# data_type can be `raw` or `shard`. Typically, `raw` is used for small datasets;
+# `shard` is used for large datasets (over 1k hours), and `shard` is
+# faster for reading data and training.
+data_type=raw
+num_utts_per_shard=1000
+
+# remove the space between the text labels for Mandarin dataset
+cp $test_dir/text $test_dir/text.org
+paste -d " " <(cut -f 1 -d" " ${test_dir}/text.org) \
+    <(cut -f 2- -d" " ${test_dir}/text.org | tr -d " ") \
+    > ${test_dir}/text
+rm ${test_dir}/text.org
+
+# Prepare required format
+if [ $data_type == "shard" ]; then
+  ${tool_dir}/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \
+    --num_threads 16 $test_dir/wav.scp $test_dir/text \
+    $(realpath $test_dir/shards) $test_dir/data.list
+else
+  ${tool_dir}/make_raw_list.py $test_dir/wav.scp $test_dir/text \
+    $test_dir/data.list
+fi
+
+echo "AISHELL data preparation succeeded!"
\ No newline at end of file
diff --git a/models/audio/speech_recognition/conformer/igie/scripts/infer_conformer_fp16_accuracy.sh b/models/audio/speech_recognition/conformer/igie/scripts/infer_conformer_fp16_accuracy.sh
index 7b9d18cc7e8305cb86cc31a000ba44e60afde7e9..1f9b7fb21850f9b793887701bc542bcd30f75cf0 100644
--- a/models/audio/speech_recognition/conformer/igie/scripts/infer_conformer_fp16_accuracy.sh
+++ b/models/audio/speech_recognition/conformer/igie/scripts/infer_conformer_fp16_accuracy.sh
@@ -14,10 +14,8 @@
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 # License for the specific language governing permissions and limitations
 # under the License.
-
+set -euo pipefail
 batchsize=24
-seqlen=384
-model_path="encoder_bs24_seq384_static_opt_matmul.onnx"
 
 # Update arguments
 index=0
@@ -33,22 +31,33 @@ done
 
 echo "batch size is ${batchsize}"
 
-# build engine
-python3 build_engine.py \
-    --model_path ${model_path} \
-    --input speech:${batchsize},${seqlen},80 speech_lengths:${batchsize} \
-    --precision fp16 \
-    --engine_path encoder_bs${batchsize}_seq${seqlen}_fp16.so
-
-# inference
-python3 inference.py \
-    --engine encoder_bs${batchsize}_seq${seqlen}_fp16.so \
-    --input speech speech_lengths \
-    --label text \
-    --config train.yaml \
-    --test_data data.list \
-    --dict lang_char.txt \
-    --mode ctc_greedy_search \
-    --batch_size ${batchsize} \
-    --seq_len ${seqlen} \
-    --result_file conformer_output_log
\ No newline at end of file
+EXIT_STATUS=0
+check_status()
+{
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+        echo "fails"
+        [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
+    fi
+}
+
+current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
+
+PROJECT_DIR=${current_path}/..
+DATA_DIR=${current_path}/../aishell_test_data/test
+MODEL_DIR=${current_path}/../conformer_checkpoints
+
+export Accuracy=${Accuracy:=0.05}
+
+cd ${PROJECT_DIR}
+
+python3 build_engine.py \
+    --onnx_model ${MODEL_DIR}/conformer_fp16_trt.onnx \
+    --engine ${MODEL_DIR}/conformer_fp16_trt.engine "$@" ;check_status
+
+python3 ixrt_inference_accuracy.py \
+    --infer_type fp16 \
+    --batch_size ${batchsize} \
+    --data_dir ${DATA_DIR} \
+    --model_dir ${MODEL_DIR} "$@"; check_status
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/audio/speech_recognition/conformer/igie/scripts/infer_conformer_fp16_performance.sh b/models/audio/speech_recognition/conformer/igie/scripts/infer_conformer_fp16_performance.sh
index 311beac35ee379ac13c11153a347e573eaf09e46..499021223e97726aecc2eff66849278ac6dfc25d 100644
--- a/models/audio/speech_recognition/conformer/igie/scripts/infer_conformer_fp16_performance.sh
+++ b/models/audio/speech_recognition/conformer/igie/scripts/infer_conformer_fp16_performance.sh
@@ -14,10 +14,9 @@
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the # License for the specific language governing permissions and limitations # under the License. +set -euo pipefail batchsize=24 -seqlen=384 -model_path="encoder_bs24_seq384_static_opt_matmul.onnx" # Update arguments index=0 @@ -33,23 +32,33 @@ done echo "batch size is ${batchsize}" -# build engine -python3 build_engine.py \ - --model_path ${model_path} \ - --input speech:${batchsize},${seqlen},80 speech_lengths:${batchsize} \ - --precision fp16 \ - --engine_path encoder_bs${batchsize}_seq${seqlen}_fp16.so - -# inference -python3 inference.py \ - --engine encoder_bs${batchsize}_seq${seqlen}_fp16.so \ - --input speech speech_lengths \ - --label text \ - --config train.yaml \ - --test_data data.list \ - --dict lang_char.txt \ - --mode ctc_greedy_search \ - --batch_size ${batchsize} \ - --seq_len ${seqlen} \ - --result_file conformer_output_log \ - --perf_only True \ No newline at end of file +EXIT_STATUS=0 +check_status() +{ + ret_code=${PIPESTATUS[0]} + if [ ${ret_code} != 0 ]; then + echo "fails" + [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1 + fi +} + +current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd) + +PROJECT_DIR=${current_path}/.. +DATA_DIR=${current_path}/../aishell_test_data/test +MODEL_DIR=${current_path}/../conformer_checkpoints + +export Accuracy=${Accuracy:=529} + +cd ${PROJECT_DIR} + +python3 build_engine.py \ + --onnx_model ${MODEL_DIR}/conformer_fp16_trt.onnx \ + --engine ${MODEL_DIR}/conformer_fp16_trt.engine "$@" ;check_status + +python3 ixrt_inference_performance.py \ + --infer_type fp16 \ + --batch_size ${batchsize} \ + --data_dir ${DATA_DIR} \ + --model_dir ${MODEL_DIR} "$@"; check_status +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/text b/models/audio/speech_recognition/conformer/igie/text deleted file mode 100644 index 93f768e9349b68e0705cfefb694d3c57a397b2c4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/text +++ /dev/null @@ -1,7176 +0,0 @@ -BAC009S0764W0121 甚至出现交易几乎停滞的情况 -BAC009S0764W0122 一二线城市虽然也处于调整中 -BAC009S0764W0123 但因为聚集了过多公共资源 -BAC009S0764W0124 为了规避三四线城市明显过剩的市场风险 -BAC009S0764W0125 标杆房企必然调整市场战略 -BAC009S0764W0126 因此土地储备至关重要 -BAC009S0764W0127 中原地产首席分析师张大伟说 -BAC009S0764W0128 一线城市土地供应量减少 -BAC009S0764W0129 也助推了土地市场的火爆 -BAC009S0764W0130 北京仅新增住宅土地供应十宗 -BAC009S0764W0131 开发边界将作为城市发展的刚性约定 -BAC009S0764W0132 不得超越界限盲目扩张 -BAC009S0764W0133 目前挂牌的只有几宗土地 -BAC009S0764W0134 再加上近期一二线楼市升温 -BAC009S0764W0135 房企对土地的争抢更加积极 -BAC009S0764W0136 土地市场体现了房企对一二线市场的看重 -BAC009S0764W0137 面包价格会跟风上涨吗 -BAC009S0764W0138 成交量环比大幅增加 -BAC009S0764W0139 国家统计局的数据显示 -BAC009S0764W0140 其中广州深圳甚至出现了多个日光盘 -BAC009S0764W0141 零三年到去年 -BAC009S0764W0142 市场基数已不可同日而语 -BAC009S0764W0143 在市场整体从高速增长进入中高速增长区间的同时 -BAC009S0764W0144 一线城市在价格较高的基础上整体回升并领涨全国 -BAC009S0764W0145 绝大部分三线城市房价仍然下降 -BAC009S0764W0146 一线楼市成交量激增 -BAC009S0764W0147 三四线城市依然冷清 -BAC009S0764W0148 根据中原地产研究中心最新数据 -BAC009S0764W0149 一线城市签约十七万套 -BAC009S0764W0150 同比涨幅达到百分之四 -BAC009S0764W0151 三线城市签约十六万套 -BAC009S0764W0152 四线城市成交量有轻微下调 -BAC009S0764W0153 住房城乡建设部政策研究中心主任秦虹表示 -BAC009S0764W0154 我国房地产市场过去从体偏紧部分地区过紧 -BAC009S0764W0155 总体偏松部分地区过剩 -BAC009S0764W0156 当供给远快于需求时 -BAC009S0764W0157 很难出现去年那样的楼市暴涨 -BAC009S0764W0158 即便是北上广深等供应偏紧的一线城市 -BAC009S0764W0159 也有限购政策在控制需求规模 -BAC009S0764W0160 从而有利于抑制楼市过快上涨 -BAC009S0764W0161 楼市调控供的行政手段宜减不宜加 -BAC009S0764W0162 稳增长措施需更全面地考虑化解楼市风险问题 -BAC009S0764W0163 楼市调控将去向何方 -BAC009S0764W0164 进一步发挥市场在资源配置中的决定性作用 -BAC009S0764W0165 楼市调控的行政手段宜减不宜加 -BAC009S0764W0166 去行政化 -BAC009S0764W0167 随着市场调整的深入 
-BAC009S0764W0168 一些三线城市取消限购及限贷 -BAC009S0764W0169 实施较大幅度的补贴政策 -BAC009S0764W0170 当地新建商品住宅的房价多在每平方米三四千元 -BAC009S0764W0171 政府出台每平方米补贴五百元的托市政策 -BAC009S0764W0172 由于不可能从根本上改变供求关系 -BAC009S0764W0173 类似的补贴政策常常是短效刺激 -BAC009S0764W0174 会对市场造成新一轮的干扰 -BAC009S0764W0175 安徽铜陵结束了当地契税补贴政策 -BAC009S0764W0176 当月住宅类商品房成交套数骤跌 -BAC009S0764W0177 在经济下行压力加大的背景下 -BAC009S0764W0178 稳增长措施需更全面地考虑化解楼市风险问题 -BAC009S0764W0179 国务院发展研究中心市场经济研究所副所长邓郁松认为 -BAC009S0764W0180 可能引发房价泡沫风险 -BAC009S0764W0181 在经济增速放缓阶段运用货币政策工具时 -BAC009S0764W0183 基本住房需求得到满足后 -BAC009S0764W0184 对绿色高效宜居的高品质住房需求快速上升 -BAC009S0764W0185 通过改革和政策调整 -BAC009S0764W0186 实现我国房地产市场的平稳运行 -BAC009S0764W0187 及时发现产业发展中的倾向性苗头性问题 -BAC009S0764W0188 促进战略性新兴产业健康发展 -BAC009S0764W0189 有关部门和社会各界积极响应 -BAC009S0764W0190 采取了一系列的政策措施 -BAC009S0764W0191 促使我国战略性新兴产业发展实现了良好开局 -BAC009S0764W0192 战略性新兴产业在各地呈现出集聚蓬勃发展的态势 -BAC009S0764W0193 先后出台的战略性新兴产业的政策措施主要有六项 -BAC009S0764W0194 在加强宏观引导方面 -BAC009S0764W0195 形成了系统完整的规划体系 -BAC009S0764W0196 明确了发展目标和重点任务 -BAC009S0764W0197 在加大要素支持方面 -BAC009S0764W0198 新批复了七只创投基金的设立方案 -BAC009S0764W0199 吸引社会资本七亿元 -BAC009S0764W0200 在加快体制改革方面 -BAC009S0764W0201 组织了第一批七个地区城市开展三网融合试点 -BAC009S0764W0202 第二批三网融合试点工作业已启动 -BAC009S0764W0203 制定了可再生能源电价附加补贴和配额交易方案 -BAC009S0764W0204 发改委双节期间重点关注电商促销行为 -BAC009S0764W0205 本报记者王颖春国家发改委近日发出通知 -BAC009S0764W0206 相关公司股票走势农产品 -BAC009S0764W0207 积极防范和妥善应对市场价格异常波动 -BAC009S0764W0208 维护正常的市场价格秩序 -BAC009S0764W0209 严厉打击春运期间违规上调票价价外收费等违法行为 -BAC009S0764W0210 切实降低农产品流通成本 -BAC009S0764W0211 要加强节日期间旅游市场价格监管 -BAC009S0764W0212 以及提供服务中的变相涨价或价格欺诈行为 -BAC009S0764W0213 构建良好的旅游市场环境 -BAC009S0764W0214 要继续开展商贸零售领域价格秩序整治 -BAC009S0764W0215 重点关注大型电子商务经营者的促销行为 -BAC009S0764W0216 规范降价打折返券赠送等促销行为 -BAC009S0764W0217 营造良好的消费环境 -BAC009S0764W0218 发改委多渠道筹集保障房建设资金到 -BAC009S0764W0219 要加大保障性安居工程建设资计划落实力度 -BAC009S0764W0220 二零一二年中央进一步加大了资金支持力度 -BAC009S0764W0221 地方政府也要加大资金筹措力度 -BAC009S0764W0222 加强建设资金统筹和组织实施工作 -BAC009S0764W0223 确保保障性安居工程年度建设任务的完成 -BAC009S0764W0224 充分发挥地方政府融资平台作用 -BAC009S0764W0225 鼓励引导社会力量参与建设保障性住房及配套设施 -BAC009S0764W0226 尽快将中央补助投资和省级配套资金分解下达到市县 -BAC009S0764W0227 二零一二保障房建设 -BAC009S0764W0228 七千万套保障房多少钢材 -BAC009S0764W0229 如何在五天内筹集到七万元 -BAC009S0764W0230 各地保障房建设的套数 -BAC009S0764W0231 保障房和水利建设概念股 -BAC009S0764W0232 发改委将订制战略避免境外投资恶性竞争到 -BAC009S0764W0233 新京报讯记者钟晶晶发改委昨日表示 -BAC009S0764W0234 政府将制订境外投资总体战略 -BAC009S0764W0235 避免中国企业境外恶性竞争 -BAC009S0764W0236 并鼓励企业在境外上市 -BAC009S0764W0237 加强海外信息监测为企业提供对外投资指导 -BAC009S0764W0238 形成一批具有国际竞争力的中国企业 -BAC009S0764W0239 十一五期间我国累计境外投资七千亿美元 -BAC009S0764W0240 年均增速百分之七 -BAC009S0764W0242 单项投资规模日益增大 -BAC009S0764W0243 几个亿美元的项目不断出现 -BAC009S0764W0244 规划对十二五的投资规模未做预测 -BAC009S0764W0245 但在鼓励企业走出去方面释出多个信号 -BAC009S0764W0246 鼓励传统纺织家电汽车等一般制造业外移 -BAC009S0764W0247 鼓励商业银行去境外开设分支机构 -BAC009S0764W0248 政府将完善境外投资统计制度 -BAC009S0764W0249 实行全口径统计和动态监测 -BAC009S0764W0250 确保境外企业和人员安全 -BAC009S0764W0251 但目前还存在服务架构不完善 -BAC009S0764W0252 缺乏对外投资长远规划等问题 -BAC009S0764W0253 可控是病毒武器最基本的要求 -BAC009S0764W0254 它必须尽量做到只针对敌对国家的计算机和网络 -BAC009S0764W0255 不能波及和影响其他无关国家甚至本国 -BAC009S0764W0256 具有精确的目标定位和识别能力 -BAC009S0764W0257 一旦战事结束或出于特殊需要可以实现自毁 -BAC009S0764W0258 病毒武器的传染性超强 -BAC009S0764W0259 它可以跨硬件平台传染 -BAC009S0764W0260 除了普通计算机以外 -BAC009S0764W0261 病毒武器的隐蔽性极佳 -BAC009S0764W0262 可以实现在敌国网络中的长期潜伏 -BAC009S0764W0263 是威力巨大的定时炸弹 -BAC009S0764W0264 用电脑进行战争比用核武器还有效 -BAC009S0764W0265 核武器并不能征服类似美国这样的国家 -BAC009S0764W0266 利用电脑病毒却可以在一秒钟内从银行盗走过亿美元 -BAC009S0764W0267 足够使美国失去战争基础因此彻底失败 -BAC009S0764W0268 但是病毒武器的出现 -BAC009S0764W0269 预示着未来战争模样将完全改变 -BAC009S0764W0270 病毒武器被认为是目前最具有代表性的网络武器 -BAC009S0764W0271 美国芯片行业兴起并购热潮搜狐科技 -BAC009S0764W0272 反映了芯片行业出现整合热潮 -BAC009S0764W0273 英特尔是世界头号芯片制造商 -BAC009S0764W0274 此次以一百六十七亿美元收购拓朗 
-BAC009S0764W0275 将创下该公司成立四七年来最大收购交易的记录 -BAC009S0764W0276 正在寻求扩大移动市场份额 -BAC009S0764W0277 拓朗的主打产品是现场可编程门阵列芯片 -BAC009S0764W0278 可供客户为特定任务重新编程 -BAC009S0764W0279 应用于汽车医疗等行业 -BAC009S0764W0280 英特尔首席执行官布赖恩克尔扎尼奇在一份声明中说 -BAC009S0764W0281 合并拓朗之后将推出新的产品 -BAC009S0764W0282 满足数据中心和物联网细分市场的用户需求 -BAC009S0764W0283 形成高度定制化的集成产品 -BAC009S0764W0284 微芯片科技公司表示 -BAC009S0764W0285 两家公司是联网汽车的主要芯片供应商 -BAC009S0764W0287 今年芯片行业并购交易额在八百亿美元以上 -BAC009S0764W0288 半导体行业的大公司正在寻求通过并购 -BAC009S0764W0289 扩大它们在新的芯片市场的份额 -BAC009S0764W0290 随着个人计算机芯片的需求放慢 -BAC009S0764W0291 英特尔需要找到新的增长点 -BAC009S0764W0292 高德纳咨询公司分析师马克黄说 -BAC009S0764W0293 如今则猛增到一两亿美元 -BAC009S0764W0294 解决小小芯片上的连线和物理问题需要大量昂贵设备 -BAC009S0764W0295 芯片行业的并购风体现了整个科技行业的一种趋势 -BAC009S0764W0296 即一些财大气粗的科技公司自己不创新 -BAC009S0764W0297 而是寻求收购规模较小更为灵活的公司 -BAC009S0764W0298 反映了芯片行业出现整合热 -BAC009S0764W0299 因为难以忍受股价长期被低估 -BAC009S0764W0300 中国游戏公司纷纷忙着退市 -BAC009S0764W0301 巨人网络盛大游戏以及完美世界均已选择了私有化 -BAC009S0764W0302 是这类公司在美国市场估值长期受低估 -BAC009S0764W0303 北京商报讯记者王晔君日前 -BAC009S0764W0304 裁员二千人是由于销售模式发生改变 -BAC009S0764W0305 公司已将原有的直销模式改为经销模式 -BAC009S0764W0306 因此需要的人员大幅下降 -BAC009S0764W0307 由于去年底制定的销售战略是直销模式 -BAC009S0764W0308 所以今年上半年公司在全国各地的员工人数大幅增加 -BAC009S0764W0309 由于近期销售模式的调整 -BAC009S0764W0310 即由直销模式转变为经销模式 -BAC009S0764W0311 公司将更多地依靠经销商进行销售 -BAC009S0764W0312 正是由于销售模式的改变 -BAC009S0764W0313 汉能直接销售人员大幅度减少 -BAC009S0764W0314 汉能发布中期财报披露 -BAC009S0764W0315 上半年营业收入二十一点零八亿港元 -BAC009S0764W0316 同比减少百分之三十四毛利十四点六一亿港元 -BAC009S0764W0317 同比减少约百分之四十六亏损额为五百九十三二万港元 -BAC009S0764W0318 而去年同期盈利十六点七六亿港元 -BAC009S0764W0319 是自二零一一年借壳上市以来首次出现亏损 -BAC009S0764W0320 同时公布了重组计划 -BAC009S0764W0321 撤销旗下高端产业集团和产品开发集团 -BAC009S0764W0322 并将从总部事业部及各区域公司共裁员二千人 -BAC009S0764W0323 汉能曾计划今年底前将这一数字提高到三百家 -BAC009S0764W0324 汉能上半年业绩出现大幅下滑 -BAC009S0764W0325 当务之急是扭转业绩 -BAC009S0764W0326 而由直销模式改为经销模式 -BAC009S0764W0327 可以缩减很多人力成本 -BAC009S0764W0328 有利于降低公司运营成本 -BAC009S0764W0329 但是由于直销改为经销 -BAC009S0764W0330 汉能对自身产品的议价能力推广力都将减弱 -BAC009S0764W0331 公司已经暂停或终止部分关联交易项目 -BAC009S0764W0332 已经花费了一定的资源和成本 -BAC009S0764W0333 因此暂停或终止这些项目 -BAC009S0764W0334 对本公司的上半年业绩带来了负面影响 -BAC009S0764W0335 北京商报讯记者王晔君日前 -BAC009S0764W0336 裁员两千人是由 -BAC009S0764W0338 他们在训练和比赛过程之中的速度也会逐渐慢下来 -BAC009S0764W0339 但是根据国外科学家最新的研究结果 -BAC009S0764W0340 通过对脚踝和小腿等部位的强化 -BAC009S0764W0341 可以有效的抵消年龄所带来的速度劣势 -BAC009S0764W0342 使上年纪的跑者也能保持较快的速度 -BAC009S0764W0343 美国东卡罗莱纳大学和维克森林大学的研究者认为 -BAC009S0764W0344 脚踝和小腿的能力变弱 -BAC009S0764W0345 如果能够加强这方面的锻炼 -BAC009S0764W0346 他们会拥有较快的速度 -BAC009S0764W0347 研究者们选取了一些年龄大的跑者作为研究对象 -BAC009S0764W0348 并让年轻跑者作为参照 -BAC009S0764W0349 他们的步频大致相同 -BAC009S0764W0350 年龄大跑者的步幅明显短于年轻人 -BAC009S0764W0351 使得他们的速度变慢了 -BAC009S0764W0352 研究者们选取了十九位跑者 -BAC009S0764W0353 年龄从二十三岁到五十九岁 -BAC009S0764W0354 身体质量指数平均为二十三点四 -BAC009S0764W0355 身材偏瘦而且比较健康 -BAC009S0764W0356 跑者从二十多岁到五十九岁 -BAC009S0764W0357 步幅长度和跑步速度大约下降了百分之二十 -BAC009S0764W0358 脚踝的能力损失了大约百分之四十八 -BAC009S0764W0359 按照平时训练的速度进行跑步 -BAC009S0764W0360 二十岁的跑者平均每英里耗时八分十八秒 -BAC009S0764W0361 而六十岁的跑者每英里耗时十分十八秒 -BAC009S0764W0362 已经有过不少关于这方面的研究 -BAC009S0764W0363 但是研究对象都是年轻跑者和年老跑者 -BAC009S0764W0364 年龄段的复盖范围比较窄 -BAC009S0764W0365 最令德维塔感到不可思议的是 -BAC009S0764W0366 跑者们随着年龄的增长 -BAC009S0764W0367 速度呈现出直线下降 -BAC009S0764W0368 速度下降的更加明显 -BAC009S0764W0369 很多六七十岁的跑者看到这个研究结果时 -BAC009S0764W0370 意思是他们比较认同这个结果 -BAC009S0764W0371 研究者们希望年龄大的跑者能够注意脚踝的锻炼 -BAC009S0764W0372 但德维塔觉得归根到底还是小腿肌肉的问题 -BAC009S0764W0373 尤其是比目鱼肌和腓肠肌 -BAC009S0764W0374 这才是产生跑步力量的根源 -BAC009S0764W0375 这两种方式的结合能够有效锻炼小腿肌肉 -BAC009S0764W0376 对于高年龄跑者来说 -BAC009S0764W0377 开始一项新的锻炼方式具有一定的风险性 -BAC009S0764W0378 想通过训练提升脚踝和小腿的能力 -BAC009S0764W0379 这些常年坚持跑步的人身体质量指数偏低 -BAC009S0764W0380 长期跑步可能是一种不需要药物来保持身材的有效方式 -BAC009S0764W0381 在二零二二年冬季奥运会的竞选当中 
-BAC009S0764W0382 北京和张家口最终击败了强大的对手阿拉木图 -BAC009S0764W0383 顺利获得了冬奥会的主办权 -BAC009S0764W0384 这也是这项冰雪顶级盛事首次来到中国 -BAC009S0764W0385 在此次申办冬奥会的过程中 -BAC009S0764W0386 我们看到了自身强大的综合实力 -BAC009S0764W0387 也看到了在冰雪运动综合实力上的欠缺和不足 -BAC009S0764W0388 经历过夏奥会的沉淀 -BAC009S0764W0389 加上近几年承办诸多国际性赛事的经验积累 -BAC009S0764W0390 在这场亚洲国家锁定胜局的申办博弈中 -BAC009S0764W0391 北申办此次冬奥会的价值要远远超过承办本身 -BAC009S0764W0392 对于北京申办冬奥会的最终结果 -BAC009S0764W0393 我们也应该抱着更加长远和开阔的视角来看待 -BAC009S0764W0394 北京申办冬奥强大实力成获胜武器 -BAC009S0764W0395 此次北京联手张家口申办冬奥会 -BAC009S0764W0396 在与阿拉木图的直接博弈中 -BAC009S0764W0397 财政能力和硬件设施的优势是我们最终取胜的关键原因 -BAC009S0764W0398 而二零零八年举办夏季奥运会所留下的宝贵遗产 -BAC009S0764W0399 也是最终打动国家奥运委会评审团的法宝 -BAC009S0764W0400 从经济实力和基础设施建设上看 -BAC009S0764W0401 北京和张家口要占据着相当明显的优势 -BAC009S0764W0402 北京和张家口两地的生产总值是二万二千七百三十点八亿元 -BAC009S0764W0403 而阿拉木图仅为四百亿美元 -BAC009S0764W0404 影片将在二零一五年一月在慕尼黑正式开机 -BAC009S0764W0405 好莱坞当红明星之前曾被盛传将扮演斯诺登 -BAC009S0764W0406 好莱坞当红明星之前曾被盛传将扮演斯诺登 -BAC009S0764W0407 他确实拿下了这个角色 -BAC009S0764W0408 对男友有什么条件 -BAC009S0764W0409 她表示最重要的就是诚恳 -BAC009S0764W0410 对于姊弟恋也不排斥 -BAC009S0764W0411 搜狐娱乐讯七月十日消息 -BAC009S0764W0412 据台湾媒体报道 -BAC009S0764W0413 许玮甯最近到法国工作 -BAC009S0764W0414 仍在个人社群网站频繁更新动态 -BAC009S0764W0415 甚至被外界揣测是因为和阮经天分手后所刺 -BAC009S0764W0416 她近日终于在受访时松口公开正解 -BAC009S0764W0417 背后意义竟只是不要忘记自己从哪里来 -BAC009S0764W0418 搜狐娱乐讯据台湾媒体报道 -BAC009S0764W0419 阮经天和许玮甯交往八年屡传婚讯 -BAC009S0764W0420 今年三月底惊爆分手 -BAC009S0764W0421 当时女方坦承已分居 -BAC009S0764W0422 但小天坚持玮甯依然是我的女人 -BAC009S0764W0423 有网友日前目击他俩在大稻埕分食炒饭 -BAC009S0764W0424 昨天她出席保养品活动 -BAC009S0764W0425 松口仍有联络 -BAC009S0764W0426 但称自己单身 -BAC009S0764W0427 恰巧昨日记者碰见阮经天出门倒垃圾 -BAC009S0764W0428 对许玮甯单身说语气落寞表示我没有什么看法 -BAC009S0764W0429 搜狐娱乐讯男方和小三还藕断丝连 -BAC009S0764W0430 因而痛斩情丝她除了拥有模特儿火辣身材 -BAC009S0764W0431 快报讯记者赵丹丹快递实名制时代终于到来了 -BAC009S0764W0432 按照国家邮政总局统一部署 -BAC009S0764W0433 从下月起全面实施快递实名制登记 -BAC009S0764W0434 现代快报记者从省邮政管理局了解到 -BAC009S0764W0435 江苏快递实名制登记动真格 -BAC009S0764W0436 本周内动员部署全省九零零多家快递企业按要求执行 -BAC009S0764W0437 个人寄快递必须登记有效的身份证件 -BAC009S0764W0438 本山传媒回应赵本山将有新作品没听说 -BAC009S0764W0439 不仅赢得观众好口碑 -BAC009S0764W0440 特别是师父赵本山也公开出面为大鹏点赞 -BAC009S0764W0441 本月二八日超级月亮和最圆中秋月喜相逢 -BAC009S0764W0442 月亮和地球之间的平均距离仅为三五六八九六万公里 -BAC009S0764W0443 月亮看起来会比往常大 -BAC009S0764W0444 也就是我们常说的超级月亮 -BAC009S0764W0445 这一天还将上演月全食 -BAC009S0764W0446 超级月亮碰上月全食 -BAC009S0764W0447 错过了这次就要到二零三三年了 -BAC009S0764W0448 本月下旬天宇将现五星连线奇观 -BAC009S0764W0449 中科院紫金山天文台公布了一零月天象 -BAC009S0764W0450 现代快报记者注意到 -BAC009S0764W0451 天龙座流星雨猎户座流星雨 -BAC009S0764W0452 让一零月的天空有点甜蜜蜜的味道 -BAC009S0764W0453 水星金星也将迎来观测良机 -BAC009S0764W0454 现代快报记者胡玉梅 -BAC009S0764W0455 本月中下旬小行星撞地球 -BAC009S0764W0456 专家没有科学依据 -BAC009S0764W0457 京华时报讯记者任珊记者从北京市教育考试院获悉 -BAC009S0764W0458 高招本科二批今天开始进行征集志愿录取 -BAC009S0764W0459 一八一所院校将补录一九四九人 -BAC009S0764W0460 朱军系阅兵世家曾参与一九八四年阅兵军乐演奏 -BAC009S0764W0461 朱圣祎爆王思聪女朋友被诉法官送达起诉书遇阻 -BAC009S0764W0462 王思聪将朱圣祎诉至北京朝阳法院 -BAC009S0764W0463 要求停止侵权公开道歉赔偿精神损失一元 -BAC009S0764W0464 法官送达起诉书副本等应诉材料遇阻 -BAC009S0764W0465 朱茵说紫霞仙子谁来演不是我可以决定的 -BAC009S0764W0466 资料图片在湖南卫视上周开播的偶像来了中 -BAC009S0764W0467 永远的紫霞仙子朱茵的亮相引起粉丝的热捧 -BAC009S0764W0468 永远的紫霞仙子朱茵的亮相引起粉丝的热捧 -BAC009S0764W0469 来自全球四七个国家和地区的二零零零多名选手参赛 -BAC009S0764W0470 机器人服务员现身火锅店顾客直呼女神 -BAC009S0764W0471 女神机器人在火锅店内工作 -BAC009S0764W0472 机场严查匿打火机过安检放在鞋子里算藏匿 -BAC009S0764W0473 本报讯记者杨柳昨天 -BAC009S0764W0474 记者从首都机场公安分局航站区派出所获悉 -BAC009S0764W0475 首都机场公安分局航站区派出所联合驻场安检人员 -BAC009S0764W0476 坚持违法零容忍和高限处理的执法态度 -BAC009S0764W0477 严格搜集和固定相关证据 -BAC009S0764W0478 近日在违法事实认识清楚法律法规适用明确的基础上 -BAC009S0764W0479 依法对一名藏匿打火机过检的旅客进行了行政处罚 -BAC009S0764W0480 机场公安加航航班未发生性侵事件 -BAC009S0764W0482 网传该航班一名男性旅客对空姐试图性侵导致飞机返航 -BAC009S0764W0483 新京报记者从首都国际机场公安分局相关人员处获悉 -BAC009S0764W0484 冲突因空姐发餐时餐车碰到了一名旅客 -BAC009S0764W0485 双方因语言交流不畅导致纠纷 
-BAC009S0764W0486 该男子因影响航班正常秩序 -BAC009S0764W0487 明星刘晓庆又火了一把 -BAC009S0764W0488 她几乎刷遍了各大媒体 -BAC009S0764W0489 不是她的戏或是她的八卦 -BAC009S0764W0490 而是因为她也中了天价的招 -BAC009S0764W0491 机组成功处置深航机上纵火事件获奖二五零万 -BAC009S0764W0492 成功处置深航机上纵火事件 -BAC009S0764W0493 杀中传女生嫌犯就想找个人发泄 -BAC009S0764W0494 其室友在微博上所发的寻人启事 -BAC009S0764W0495 警方证实周云露遇害 -BAC009S0765W0121 一线城市出现日光盘 -BAC009S0765W0122 楼市地市交相升温房价会不会再度暴涨 -BAC009S0765W0123 经济热点导读专家认为 -BAC009S0765W0124 我国房地产市场过去总体偏紧部分地区过紧 -BAC009S0765W0125 为了将后辈的婚姻分险隔断 -BAC009S0765W0126 将受益人定为直系血亲后代非配偶继承人 -BAC009S0765W0127 按公司持有房产计征 -BAC009S0765W0128 相关公司股票走势 -BAC009S0765W0129 房价起飞前购置了十几套房产 -BAC009S0765W0130 目前总估值已过亿元 -BAC009S0765W0131 这些房产全由宋芳自己打理 -BAC009S0765W0132 每月光租金收入便已远大于自己和子女的总开销 -BAC009S0765W0133 宋芳最近却有点烦恼 -BAC009S0765W0134 我想把房产留给儿女 -BAC009S0765W0135 万一以后儿女的婚姻出了问题 -BAC009S0765W0136 他们的财产和生活不会受到太大影响 -BAC009S0765W0137 宋芳对北京银行私人银行的财富顾问说 -BAC009S0765W0138 该信托出资购入宋芳的房产 -BAC009S0765W0139 成立资金信托购买自家房产 -BAC009S0765W0140 在了解宋芳的资产情况与需求之后 -BAC009S0765W0141 设立一个单一资金信托 -BAC009S0765W0142 宋芳本人为信托的发起人和委托人 -BAC009S0765W0143 北京信托作为受托人 -BAC009S0765W0144 之后由该信托对宋芳指定的房产发出购买要约 -BAC009S0765W0145 实现该信托对房产的控制 -BAC009S0765W0146 虽然房产是在信托的名下 -BAC009S0765W0147 但您和您的儿女能自由支配 -BAC009S0765W0148 这相当于左兜掏右兜 -BAC009S0765W0149 通过信托实现了财产的隔离保护 -BAC009S0765W0150 未来子女出现姻缘风险 -BAC009S0765W0151 其中资金这一要素指基于信托登记的相关法规局限 -BAC009S0765W0152 为了购买自己想要传承给子女的房产 -BAC009S0765W0153 宋芳必须再掏出完全属于自己的资金 -BAC009S0765W0154 委托人以其持有的资金设立一个单一资金信托 -BAC009S0765W0155 该资金可以是委托人的自有资金 -BAC009S0765W0156 也可以是委托人合法获得的过桥资金 -BAC009S0765W0157 确保所设信托的合法性 -BAC009S0765W0158 按公司持有房产计税 -BAC009S0765W0159 由于家族信托的存续期通常较长 -BAC009S0765W0160 在信托收益的处置上 -BAC009S0765W0161 不同的客户对收益再投资的需求差别较大 -BAC009S0765W0162 对收益率的要求差别却不至于相去甚远 -BAC009S0765W0163 从我们遇到的客户来看 -BAC009S0765W0164 回报率普遍要求并不高 -BAC009S0765W0165 有些客户只要求收益率超过利率即可 -BAC009S0765W0166 看中的是其财产保护与传承的功能 -BAC009S0765W0167 我现在就是担心自己哪天突然出现个什么情况 -BAC009S0765W0168 他们说不定又有离婚风险 -BAC009S0765W0169 我又不指望设立信托来赚钱 -BAC009S0765W0170 主要目的是把后辈的婚姻风险隔断 -BAC009S0765W0171 在宋芳的资金信托购买其房产时 -BAC009S0765W0172 需要按北京当地的要求缴纳二手房交易费用 -BAC009S0765W0173 而在信托持有这些房产后 -BAC009S0765W0174 这是因为宋芳购买其房产 -BAC009S0765W0175 按照公司持有房产计征 -BAC009S0765W0176 各项费用的加总并不低 -BAC009S0765W0177 在目前的法律框架下 -BAC009S0765W0178 这些税费均无法避免 -BAC009S0765W0179 他认为跟后辈姻缘风险相比 -BAC009S0765W0180 点击进入股友会参与讨论 -BAC009S0765W0181 本世纪经济报道 -BAC009S0765W0182 为了将后辈的婚姻风险隔断 -BAC009S0765W0183 并将受益人定为直系血亲后代非配偶继承人 -BAC009S0765W0184 今久整合营销集团迎来了它的生日 -BAC009S0765W0185 从最初的几十人 -BAC009S0765W0186 事业版图遍布全中国的集团化整合营销公司 -BAC009S0765W0187 无序竞争甚至恶意竞争时常发生 -BAC009S0765W0188 将发挥部际会议联席制度作用 -BAC009S0765W0189 制订境外投资总体战略 -BAC009S0765W0190 对重大项目和重大问题进行协调 -BAC009S0765W0191 引导企业围绕重点国家和地区在重点领域展开投资 -BAC009S0765W0192 鼓励本土中介机构提供服务 -BAC009S0765W0193 建立起政府部门企业和中介机构各司其职的组织架构 -BAC009S0765W0194 北京科技大学教授刘澄表示 -BAC009S0765W0195 主要是如何提供服务及做好监管 -BAC009S0765W0196 规划提出诸多想法 -BAC009S0765W0197 政府提供的服务企业是否需要 -BAC009S0765W0198 如何为企业提供信息避免海外投资风险 -BAC009S0765W0199 避免海外一窝蜂上项目等 -BAC009S0765W0200 发改委将尽快建立地方政府债务管理体系到 -BAC009S0765W0201 他就上述关注问题指出 -BAC009S0765W0202 目前我国出现政府性债务违约可能性并不大 -BAC009S0765W0203 下一步将进一步完善城投债券发行制度和防范风险机制 -BAC009S0765W0204 并尽快建立我国地方政府债务管理体系等 -BAC009S0765W0205 政府性违约可能性不大 -BAC009S0765W0206 中国证券报随着欧美等国主权债务危机陆续爆发 -BAC009S0765W0207 您如何看待政府的举债行为和债务风险 -BAC009S0765W0208 徐林吸取欧美等国主权债务危机的教训 -BAC009S0765W0209 采取必要措施加强政府债务管理 -BAC009S0765W0210 防范我国政府债务风险 -BAC009S0765W0211 但在具体评估我国地方政府债务风险程度时 -BAC009S0765W0212 也要看到我国与欧美国家的不同之处 -BAC009S0765W0213 我国地方政府性债务 -BAC009S0765W0214 特别是地方投融资平台公司形成的债务 -BAC009S0765W0215 主要用于各地基础设施的投资建设 -BAC009S0765W0216 当代人和后代人共同承担债务还本付息责任 -BAC009S0765W0217 可以更好地体现代际公平 -BAC009S0765W0218 克服当期建设资金不足的瓶颈制约 -BAC009S0765W0219 
有利于加快完善基础设施和投资环境 -BAC009S0765W0220 是一种合理的基础设施投融资建设行为 -BAC009S0765W0221 政府举债建设形成了大量资产 -BAC009S0765W0222 相当部分资产具有长期的直接收益 -BAC009S0765W0223 一些没有直接收益的项目 -BAC009S0765W0224 也具有间接的经济效益或社会效益 -BAC009S0765W0225 对促进当地经济增长和政府财力的增长 -BAC009S0765W0226 不能简单地用寅吃卯粮来作价值判断 -BAC009S0765W0227 这并不意味着政府可以无节制地借债 -BAC009S0765W0228 关键是要把投资规模和债务规模 -BAC009S0765W0229 控制在合理的范围内 -BAC009S0765W0230 防止出现系统性的偿债风险 -BAC009S0765W0231 国务院高度重视防范地方政府债务风险 -BAC009S0765W0232 从二零零九年下半年就开始要求有关部门调研这一问题 -BAC009S0765W0233 国家审计署还专门组织力量 -BAC009S0765W0234 对全国各地的政府债务进行了严格审计 -BAC009S0765W0235 审计署的审计结论表明 -BAC009S0765W0236 我国地方政府的累积债务相对于偿付能力来看 -BAC009S0765W0237 远低于发生债务危机的欧美国家 -BAC009S0765W0238 考虑到我国正处在经济快速增长期 -BAC009S0765W0239 政府财力增长也相应较快 -BAC009S0765W0240 政府还拥有较多的可变现资产 -BAC009S0765W0241 相对于目前的负债规模 -BAC009S0765W0242 政府总体上具有较强的偿债能力 -BAC009S0765W0243 采取积极有效的措施化解部分地区和领域的债务风险 -BAC009S0765W0244 在我国出现政府性债务违约的可能性是不大的 -BAC009S0765W0245 债券市场城投债券发行不畅 -BAC009S0765W0246 从城投债券发行监管部门的角度 -BAC009S0765W0247 您如何看待这一现象 -BAC009S0765W0248 徐林出于对地方政府债务风险的担忧 -BAC009S0765W0249 投资者采取措施防范风险是成熟的表现 -BAC009S0765W0250 但出于对我国地方政府债务风险的不合理判断 -BAC009S0765W0251 并进而对城投债券进行唱空或做空 -BAC009S0765W0252 最近企业债券特别是城投债券的发行难度加大 -BAC009S0765W0253 其在纳斯达克上市时的发行价为一六美元 -BAC009S0765W0254 其股票价格在十五点七六美元上下徘徊 -BAC009S0765W0255 中国手游在退市之前的市盈率在十六十七倍左右 -BAC009S0765W0256 掌趣科技三零零三一五一度超过二百倍 -BAC009S0765W0257 华尔街并不认可游戏这种商业模式 -BAC009S0765W0258 并非仅仅针对中国公司 -BAC009S0765W0260 作为美国本土著名社交游戏开发商 -BAC009S0765W0261 在当年社交游戏风靡的时候 -BAC009S0765W0262 因为快速发展的业务和不断膨胀的营收受资本市场亲睐 -BAC009S0765W0265 亏损二千六百九十万美元相比上一季度 -BAC009S0765W0266 这一亏损已经收窄了百分之五十七 -BAC009S0765W0267 成熟的资本市场相对公平 -BAC009S0765W0268 这些被市场唱空的游戏公司本身业务模式遇到了困境 -BAC009S0765W0269 游戏公司往往靠一款游戏在市场上火爆 -BAC009S0765W0270 大多数游戏产品往往病毒式地成长 -BAC009S0765W0272 这些中国游戏公司大多成长于中国市场 -BAC009S0765W0273 他们的产品在海外市场也极少被认可 -BAC009S0765W0274 这些公司在海外市场上市往往除了获得融资机会 -BAC009S0765W0275 并未能给这些公司带来其他的效应 -BAC009S0765W0277 中国游戏产品和美国产品极为不同 -BAC009S0765W0278 美国玩家对游戏难度创造性要求较高 -BAC009S0765W0279 中国产品不可能照搬到美国市场 -BAC009S0765W0280 在融资和发展海外市场上 -BAC009S0765W0281 还希望拓展市场的话 -BAC009S0765W0282 触控科技全资韩国子公司在韩国上市 -BAC009S0765W0283 而从二零一三年开始 -BAC009S0765W0284 这家公司就在挖角当地游戏公司高管建立分公司 -BAC009S0765W0285 打造适合当地市场的产品 -BAC009S0765W0286 根据陈昊芝在二零一四年八月提供的数据 -BAC009S0765W0287 市场份额做到了前十位 -BAC009S0765W0288 未来上市能够放大公司的品牌效益 -BAC009S0765W0289 让当地更多的人知道这家公司 -BAC009S0765W0290 在韩国股市低迷情况下 -BAC009S0765W0291 触控科技子公司涨幅居前 -BAC009S0765W0292 对于游戏这种地域属性较重的产品 -BAC009S0765W0293 应慎重考虑上市时机和地点 -BAC009S0765W0294 反复检视自身商业模式 -BAC009S0765W0295 而不是迫不及待抓住一切可以上市圈钱的机会 -BAC009S0765W0296 往往连最直接的目标都无法达成 -BAC009S0765W0297 因为难以忍受股价长期被低估 -BAC009S0765W0298 中国游戏公司纷纷忙着退市 -BAC009S0765W0299 最近都在流行做预测 -BAC009S0765W0300 于是他也来凑凑热闹 -BAC009S0765W0301 他的预测有点毒基本上是在讨论谁会下台 -BAC009S0765W0303 每日经济新闻记者杨建江南嘉捷六万 -BAC009S0765W0304 收盘价十三点六五元于七月八日发布公告 -BAC009S0765W0305 为使股价与公司价值匹配 -BAC009S0765W0306 公司拟计划通过集中竞价交易方式回购公司股份 -BAC009S0765W0307 公司此次回购股份的价格不超过十五点一零七元股 -BAC009S0765W0308 用于回购的资金总额不超过一点五一七亿元 -BAC009S0765W0309 预计回购股份约一千万股 -BAC009S0765W0310 占公司总股本约二点百分之五十 -BAC009S0765W0311 公司股票于二零一五年七月八日复牌 -BAC009S0765W0312 每日经济新闻记者注意到 -BAC009S0765W0313 截至二零一四年十二月三十一日 -BAC009S0765W0314 资金来源为自有资金 -BAC009S0765W0315 回购期限为自回购股份方案之日起至今年底 -BAC009S0765W0317 收盘价四点九九元也于七月八日公告 -BAC009S0765W0321 其目前的股票市值已经不能完全反映公司价值 -BAC009S0765W0323 增持后持股比例为六十二点百分之二十三 -BAC009S0765W0325 拟在二零一五年二零一七年先行推出两期回购方案 -BAC009S0765W0326 其中第一期回购资金上限为二零一四年净利润的百分之二十五 -BAC009S0765W0327 第二期股票回购方案不晚于二零一七年六月三十日推出 -BAC009S0765W0328 回购期限为股东大会通过后不超过十二个月 -BAC009S0765W0329 预计可回购不少于七百九十一万股 -BAC009S0765W0330 每日经济新闻记者杨建江南嘉捷六万一千一百三十一三 -BAC009S0765W0331 收盘价十三点六五元于七月八日发布公告 -BAC009S0765W0332 为使股价与公司价值匹配 -BAC009S0765W0333 公司拟计划通过集中竞价交易方式回购公司股份 -BAC009S0765W0334 
锂电池在今年上半年成为诸多上市公司的业绩功臣 -BAC009S0765W0335 成飞集成百二十一九十 -BAC009S0765W0337 公司上半年营业收入六点三四亿元 -BAC009S0765W0338 折合人民币大概二千四八零亿 -BAC009S0765W0339 而与经济实力相关的一些基础设施建设方面 -BAC009S0765W0340 无论是城市交通建设还是机场运力 -BAC009S0765W0341 阿拉木图都无法和北京相比 -BAC009S0765W0342 远远无法和北京相提并论 -BAC009S0765W0343 从申办冬奥会的硬件基础上看 -BAC009S0765W0344 北京冬奥会的硬件基础要强于阿拉木图 -BAC009S0765W0345 北京张家口计划启用一二个竞赛场馆 -BAC009S0765W0346 其中五个场馆需要新建 -BAC009S0765W0347 其馀场馆改扩建后可以满足赛事需要 -BAC009S0765W0348 其中北京市区仅需要新建一座速滑场馆 -BAC009S0765W0349 阿拉木图方面将会使用十四个场馆作为比赛之用 -BAC009S0765W0350 目前八座为已有场馆并在使用中 -BAC009S0765W0351 其于六个场馆都需要新建 -BAC009S0765W0352 从举办大型体育赛事的经验来看 -BAC009S0765W0353 北京的经验比阿拉木图丰富 -BAC009S0765W0354 还有二零一五年的田径世锦赛 -BAC009S0765W0355 这些使得北京积累了大量的举办与运营经验 -BAC009S0765W0356 也证明了北京举办大型体育赛事的能力 -BAC009S0765W0357 自从哈萨克斯坦独立 -BAC009S0765W0358 二零一一年的亚冬会是其举办的第一个国际性综合赛事 -BAC009S0765W0359 之后就没有举办过的大型体育赛事 -BAC009S0765W0360 花样滑冰大奖赛中国杯常年在北京和上海之间轮换 -BAC009S0765W0361 二零一四年的冰壶世锦赛也在北京举行 -BAC009S0765W0362 一系列大型赛事的承办 -BAC009S0765W0363 让北京具备了承办冬奥会这种顶级赛事的经验和能力 -BAC009S0765W0364 北京申办冬奥影响远超承办本身 -BAC009S0765W0365 此次北京申办冬奥会 -BAC009S0765W0366 也让我们看到了自身存在着的不足 -BAC009S0765W0367 其中主要集中于冰雪运动本身实力上的有所欠缺 -BAC009S0765W0368 二零二二年冬奥会的举行 -BAC009S0765W0369 对于我国冰雪运动实力的提升会有巨大的推动作用 -BAC009S0765W0370 和夏季奥运会上的斩金夺银不同 -BAC009S0765W0371 中国的冬季运动还处于半起步阶段 -BAC009S0765W0372 这在我国体育发展史上具有划时代的意义 -BAC009S0765W0373 标志着我国体育开始走向国际化 -BAC009S0765W0374 成为了国际体育运动大家庭中的一员 -BAC009S0765W0375 但直到一二年后的法国阿尔贝维尔冬奥会上 -BAC009S0765W0376 我国选手才实现了冬奥奖牌零的突破 -BAC009S0765W0377 取得这一突破的领军人就包括轮椅英雄叶乔波 -BAC009S0765W0378 又是十年的空白期 -BAC009S0765W0379 二零零二年的美国盐湖城冬奥会上 -BAC009S0765W0380 杨扬拿到了五百米和一千米两项短道速的金牌 -BAC009S0765W0381 更具历史性意义的是 -BAC009S0765W0382 这是中国奥运代表团在冬季奥运会上取得的首枚金牌 -BAC009S0765W0383 经过二十多年的努力 -BAC009S0765W0384 中国冰雪健儿终于站到了冬奥会的最高领奖台 -BAC009S0765W0385 以及拥有陈露的女单项目 -BAC009S0765W0386 可以在世界范围内立足 -BAC009S0765W0387 但随着这一系列名将的退役 -BAC009S0765W0388 在中国的这一传统优势项目上 -BAC009S0765W0389 我们可以说已经输给了其他强敌 -BAC009S0765W0390 更直观的体现是在冰雪运动的核心项目冰球上 -BAC009S0765W0391 竞争对手哈萨克斯坦在这一点上要强过我们 -BAC009S0765W0392 中国国家男子冰球队目前排名第三十二位 -BAC009S0765W0393 而哈萨克斯坦则是第十六位 -BAC009S0765W0394 所有主办国的男子冰球成绩排位均在二十位之内 -BAC009S0765W0395 二零一八年冬奥会的主办地韩国平昌是一个绝好的例子 -BAC009S0765W0396 平昌曾经三次申办冬奥会 -BAC009S0765W0397 前两次申办的过程中 -BAC009S0765W0398 男子冰球的战绩均在二十五名左右 -BAC009S0765W0399 而第三次申办周期内 -BAC009S0765W0401 几乎帮助了平昌拿下二零一八年冬奥会的主办权 -BAC009S0765W0402 由于韩国冰球协会的四年规划 -BAC009S0765W0403 保证国家队水平不会被其他球队相差太远的承诺下 -BAC009S0765W0404 业已正式启动斯诺登事件电影的拍摄 -BAC009S0765W0405 影片发布了第一批定装照 -BAC009S0765W0406 以一身越野军装黑框眼镜的造型出现 -BAC009S0765W0407 看上去和人物原型相当贴合 -BAC009S0765W0408 演技也日渐精湛 -BAC009S0765W0409 更有一手好厨艺 -BAC009S0765W0410 可说是超完美女神 -BAC009S0765W0411 男友却仍然劈腿偷吃 -BAC009S0765W0412 好友林心如也心疼喊话我会陪她 -BAC009S0765W0413 中新网五月六日电据台湾媒体报道 -BAC009S0765W0414 刚与阮经天传出情变不久的许玮甯近日接拍恐怖片 -BAC009S0765W0415 称为了演好戏 -BAC009S0765W0416 她看了不少恐怖片 -BAC009S0765W0417 看完片后会睡不好做恶梦 -BAC009S0765W0418 上厕所都要把灯全部打开 -BAC009S0765W0419 搜狐娱乐讯日前 -BAC009S0765W0420 引发众多粉丝围堵 -BAC009S0765W0421 玩心大起的许绍洋与玩家一起比拼游戏 -BAC009S0765W0422 没想竟然惨败 -BAC009S0765W0423 这让自称游戏达人的他颇有些不好意思 -BAC009S0765W0424 金陵晚报八月十二日报道二零一四年 -BAC009S0765W0425 许茹芸与韩籍男友举行了婚礼 -BAC009S0765W0426 迎来了人生崭新阶段 -BAC009S0765W0427 不同于大家心中按部就班的乖乖女形象 -BAC009S0765W0428 许茹芸突然闪婚让当时的娱乐圈也惊起了一阵小波澜 -BAC009S0765W0429 在许茹芸看来 -BAC009S0765W0430 但几乎一个都没有实现 -BAC009S0765W0431 一四年前轰动东莞沙田的一起命案 -BAC009S0765W0432 日前因为广东省高院作出的无罪判决 -BAC009S0765W0433 再次吸引了众人的目光 -BAC009S0765W0434 八月一七日上午一一时 -BAC009S0765W0435 陈传钧从东莞市第二看守所出来 -BAC009S0765W0436 这是二零一零年四月二三日以来 -BAC009S0765W0437 杀人犯出狱后喊冤被驳回供述与鉴定相印证 -BAC009S0765W0438 丈夫关某身负多处刀伤 -BAC009S0765W0439 呼救报警时称有人入屋行凶 -BAC009S0765W0440 又供称是自己失手杀妻 -BAC009S0765W0441 关某先后被判死刑死缓 -BAC009S0765W0442 他向广东省高院申诉 -BAC009S0765W0443 
广东高院审理后驳回了关某的申诉 -BAC009S0765W0444 杀人犯受民警感召行刑前捐器官谢罪 -BAC009S0765W0445 杀人犯抢劫获刑未查出旧案警方指纹识别有遗漏 -BAC009S0765W0446 京华时报记者蒲东峰摄二零零七年 -BAC009S0765W0447 时年二三岁的杨柱军在北京抢劫杀害了一名出租车司机 -BAC009S0765W0448 此后他没有隐姓埋名逃往外地 -BAC009S0765W0449 公安机关并未查出其身上还背着命案 -BAC009S0765W0450 并于二零一五年一月将其抓获 -BAC009S0765W0451 曾多次比对二零零七年命案现场匕首上的指纹 -BAC009S0765W0452 但指纹比对识别系统会出现一定概率的遗漏 -BAC009S0765W0453 杨柱军因涉嫌抢劫罪在市二中院受审 -BAC009S0765W0454 杀害中传失联女主嫌犯想找个无辜的人发泄 -BAC009S0765W0455 视频截图新京报快讯记者杨锋昨日 -BAC009S0765W0456 杀害中传女学生犯罪嫌疑人从小家庭教育严格 -BAC009S0765W0457 失联近两天的中传研究生周云露 -BAC009S0765W0458 李斯达表示自己跟周云露并没有深仇大恨 -BAC009S0765W0459 称就是想找个无辜的人 -BAC009S0765W0460 目前李斯达被关押在朝阳区看守所 -BAC009S0765W0461 周云露的父母在昨天上午去过朝阳刑警队 -BAC009S0765W0462 杀害中传女生嫌犯曾私藏刺刀同学称其特立独行 -BAC009S0765W0463 李斯达手持尖刀的自拍照 -BAC009S0765W0464 新京报快讯记者杨锋凌晨今日下午 -BAC009S0765W0465 中国传媒大学官网发布消息称 -BAC009S0765W0466 在朝阳区百子湾阳光嘉园小区遇害 -BAC009S0765W0467 犯罪嫌疑人已被抓获 -BAC009S0765W0468 学校正在全力配合公安机关和家属进行善后处理 -BAC009S0765W0469 杀害夜跑女子嫌犯不言不语拾荒者身份尚未确认 -BAC009S0765W0470 杀害女教师疑犯行凶后脸有伤警方悬赏五万缉拿 -BAC009S0765W0471 遇害女教师昨晚七时五七分 -BAC009S0765W0472 其作案后身上有大量血迹 -BAC009S0765W0473 双手背脸部等裸露部位有刺伤划伤 -BAC009S0765W0474 通告呼吁广大群众积极检举揭发提供线索 -BAC009S0765W0475 对提供重大线索协助破案者 -BAC009S0765W0476 我局将给予五万元奖励 -BAC009S0765W0477 杀害女童凶手被抓指认现场上千民众喊打 -BAC009S0765W0478 四川广安一一岁女孩的失踪 -BAC009S0765W0479 九日晚女孩尸体被找到 -BAC009S0765W0480 凶手在郫县安靖镇被抓 -BAC009S0765W0481 凶手到岳池县石垭镇指认骗走孩子的现场 -BAC009S0765W0482 数千围观人群高呼打死这个杂碎 -BAC009S0765W0483 现场喊打声持续不断 -BAC009S0765W0484 杀害宝鸡夜跑教师嫌犯落网是否为拾荒者尚无定论 -BAC009S0765W0485 吕某于一零月一四日晚从家中外出锻炼失踪 -BAC009S0765W0486 尸体于一零月二零日在宝鸡渭河公园被发现 -BAC009S0765W0487 李克强集众智汇众力攻坚克难激发活力 -BAC009S0765W0488 李彬彬喂大象喝水略显老态提醒网友夏天要补水 -BAC009S0765W0489 联合国官方微博晒出一张李彬彬喂大象喝水的照片 -BAC009S0765W0490 华西都市报讯记者杜恩湖一零月二四日中午一二时 -BAC009S0765W0491 一零月二三曰现身成都平乐古城 -BAC009S0765W0492 应邀参加第二届天府古镇艺术节 -BAC009S0765W0493 现场李双江受到了观众的热烈欢迎 -BAC009S0765W0494 二零零多幅珍贵油画抵达南京 -BAC009S0765W0495 李嘉诚军师抛售马云一五亿买香港最贵单价豪宅 -BAC009S0766W0121 实现数字化整合营销 -BAC009S0766W0122 是当今广告行业的需要 -BAC009S0766W0123 消费者行为的变化及技术的进步 -BAC009S0766W0124 催生了广告领域新的变革和创新 -BAC009S0766W0125 唯有实力雄厚又颇具现代创新意识的广告企业 -BAC009S0766W0126 今久整合营销集团就是如此 -BAC009S0766W0127 成为圈子里首屈一指的超大企业 -BAC009S0766W0128 自成立以来 -BAC009S0766W0129 服务项目几千个 -BAC009S0766W0130 开创了蔓延全国的青年社区概念 -BAC009S0766W0131 确立了无人撼动的行业老大地位 -BAC009S0766W0132 成为房地产最信任的营销公司 -BAC009S0766W0133 然而这家雄心勃勃的公司并未止步于此 -BAC009S0766W0134 一个以互联网和大数据为核心的时代已经到来 -BAC009S0766W0135 今久必须担当起引领时代潮流的重任 -BAC009S0766W0136 蓝色光标以几亿人民币收购今久 -BAC009S0766W0137 这成为今久转型的起点 -BAC009S0766W0138 依托蓝色光标强大的技术和资源优势 -BAC009S0766W0139 今久率先提出整合营销的概念 -BAC009S0766W0140 其核心在于利用数字化工具 -BAC009S0766W0141 为房地产商提供系统化的服务 -BAC009S0766W0142 整合营销实现了从策略到执行的系统化服务 -BAC009S0766W0143 当地产商的效果预期不断提高 -BAC009S0766W0144 这时候更要求服务商具备思考和行动的一致性 -BAC009S0766W0145 这样也为开发商节省了运营成本 -BAC009S0766W0146 整合营销是利用全案思维和大数据技术 -BAC009S0766W0147 市场上就出现了各类新型技术软件 -BAC009S0766W0148 但大多是雷声大雨点小 -BAC009S0766W0149 与房地产商的需求相去甚远 -BAC009S0766W0150 大数据营销需要的是强大的技术实力 -BAC009S0766W0151 而非某些功能的简单嫁接 -BAC009S0766W0152 蓝色光标作为全球首屈一指的广告服务商 -BAC009S0766W0153 在大数据上的技术优势无可匹敌 -BAC009S0766W0154 今久正是在蓝色光标的技术支持下 -BAC009S0766W0155 实现了大数据营销的创新 -BAC009S0766W0156 许多数字新产品 -BAC009S0766W0157 广泛应用于移动端 -BAC009S0766W0158 分析用户的消费行为和生活方式 -BAC009S0766W0159 帮助广告主找出目标用户 -BAC009S0766W0160 然后对广告信息进行精确匹配 -BAC009S0766W0161 达到降低成本提升营销效果的目的 -BAC009S0766W0162 今久在大举创新的同时 -BAC009S0766W0163 保持原有业务的正常运作 -BAC009S0766W0164 这才是一个大企业应该有的战略方向 -BAC009S0766W0165 带动了区域板块的扩张 -BAC009S0766W0166 在海南成立了分公司 -BAC009S0766W0167 现在已经是海南本土最大的房地产推广公司 -BAC009S0766W0168 拥有许多优质客户 -BAC009S0766W0169 今久上海分公司又悄无声息地开张了 -BAC009S0766W0170 新媒体推广的业务扩张 -BAC009S0766W0171 逐渐地撬开了上海这个外来公司很难生根的大都市 -BAC009S0766W0172 郑州长春和哈尔滨三地办事处 -BAC009S0766W0173 
用蓝色光标强大的新媒体技术和资源 -BAC009S0766W0174 搭起了全国地产推广新媒体的版图 -BAC009S0766W0175 今久又出高价 -BAC009S0766W0176 收购了房地产互联网营销公司沈阳新维一半股份 -BAC009S0766W0177 今久又一次利用资本市场 -BAC009S0766W0178 实现区域扩张 -BAC009S0766W0179 区域产品和业务三大层面 -BAC009S0766W0180 今久成功实现了转型 -BAC009S0766W0181 后今久时代正式到来 -BAC009S0766W0182 转型后的今久整合营销集团 -BAC009S0766W0183 在全球大数据浪潮中 -BAC009S0766W0184 依托蓝色光标的强大平台 -BAC009S0766W0185 助力中国房地产开发企业发掘并实现更大的价值需求 -BAC009S0766W0186 在机遇与挑战共存的互联网时代 -BAC009S0766W0187 发行利率也有较大幅度上升 -BAC009S0766W0188 人民银行多次提高存款准备金率和存贷款基准利率 -BAC009S0766W0189 不仅是城投债券发行利率 -BAC009S0766W0190 债券市场所有品种发行利率整体上都表现出向上的走向 -BAC009S0766W0191 导致城投债券发行产生较高的风险溢价 -BAC009S0766W0192 城投债券收益率上升 -BAC009S0766W0193 对债券投资人来说不是坏事 -BAC009S0766W0194 有利于提升城投债券的资产配置价值 -BAC009S0766W0195 则需要在发债时机和发债规模上进行合理的把握 -BAC009S0766W0196 我个人不赞成这一判断 -BAC009S0766W0197 债券发行人是优质的 -BAC009S0766W0198 还本付息也是正常的 -BAC009S0766W0199 投资者对城投债券风险表现出的恐慌 -BAC009S0766W0200 加强城投债监管完善制度建设 -BAC009S0766W0201 有的媒体甚至用井喷来描述 -BAC009S0766W0202 您如何看待城投债券这几年的发展和作用 -BAC009S0766W0203 徐林这几年城投债券发行数量的确有所增加 -BAC009S0766W0204 地方投融资平台公司通过发行债券进行融资 -BAC009S0766W0205 符合提高直接融资比重的要求 -BAC009S0766W0206 城投债券也适应了发行人和投资人的需要 -BAC009S0766W0207 这是这几年城投债券发行规模不断扩大的主要原因 -BAC009S0766W0208 我委核准发行的企业债券累计为七千亿元 -BAC009S0766W0209 其中城投债券共发行七千亿元 -BAC009S0766W0210 占比只有百分之七 -BAC009S0766W0211 城投债券的发行有比较严格的条件 -BAC009S0766W0212 从已发行的城投债券用途看 -BAC009S0766W0213 保障房建设和棚户区改造 -BAC009S0766W0214 城市文化和体育设施 -BAC009S0766W0215 地震灾后重建等领域 -BAC009S0766W0216 都起到了积极的作用 -BAC009S0766W0217 随着我国资本市场的进一步发展 -BAC009S0766W0218 城投债券作为中国债券市场的准市政债 -BAC009S0766W0219 发行规模还会稳步扩大 -BAC009S0766W0220 中国证券报面对市场对城投债券风险的担忧 -BAC009S0766W0221 是如何更好地防范城投债券可能出现的风险的 -BAC009S0766W0222 虽然已发行的城投债券的还本付息都是正常的 -BAC009S0766W0223 城投债作为一个信用产品 -BAC009S0766W0224 不可能是完全无风险的 -BAC009S0766W0225 我看了以后很受震动 -BAC009S0766W0226 虽然报道内容并没有具体的城投债券还本付息违约案 -BAC009S0766W0227 但却提醒了我们要更加关注城投债券可能出现的风险 -BAC009S0766W0228 并采取措施切实保护债券投资人的合法权益 -BAC009S0766W0229 作为城投债券发行监管部门 -BAC009S0766W0230 我们对城投债券发行人的审核一直是比较严格的 -BAC009S0766W0231 地方投融资平台公司申请发行债券 -BAC009S0766W0232 必须符合一些基本的条件企业必须连续三年盈利 -BAC009S0766W0233 所投项目必须经过合规性审查 -BAC009S0766W0234 我们还控制了投融资平台公司发债的范围 -BAC009S0766W0235 才能申请发行城投债券 -BAC009S0766W0236 就不得再通过发行城投债券新增政府性债务 -BAC009S0766W0237 正是有了这样一些严格的规定 -BAC009S0766W0238 使得很多投融资平台公司 -BAC009S0766W0239 难以满足发行城投债券的资格和条件 -BAC009S0766W0240 这在相当程度上控制了城投债券的发行规模 -BAC009S0766W0241 也降低了城投债券的风险 -BAC009S0766W0242 为了控制地方政府本届发债下届还钱的道德风险 -BAC009S0766W0243 我们还安排了专门的偿债均摊机制 -BAC009S0766W0244 也就是将债券还本压力在债券存续期内进行合理分摊 -BAC009S0766W0245 避免在最后一年累积过大的还本压力和风险 -BAC009S0766W0246 有媒体报道了云投集团等发债企业转移核心资产 -BAC009S0766W0247 损害债券持有人利益的事件 -BAC009S0766W0248 并对债券市场形成了不小的冲击 -BAC009S0766W0249 我们如何考虑防止这类事件再次发生 -BAC009S0766W0250 更好地保护债券投资人的利益 -BAC009S0766W0251 徐林发债企业在债券存续期内进行资产转移 -BAC009S0766W0252 极有可能对债券持有人利益构成不利影响 -BAC009S0766W0253 华尔街的半兽人已经为他的离开紧锣密鼓地敲起退堂鼓 -BAC009S0766W0257 问题是他也想不出谁能干得更好 -BAC009S0766W0260 但亏损却达到了一点八亿美元 -BAC009S0766W0262 不应从用户身上榨取广告收入 -BAC009S0766W0263 试问又有哪位有魔法能挽回巨额亏损呢 -BAC009S0766W0268 对于高中生来说这会有点令人尴尬罢了 -BAC009S0766W0269 可对于一个成年人来说算什么 -BAC009S0766W0270 还有他对日本文化的迷恋 -BAC009S0766W0271 然后又要去竞选纽约州长 -BAC009S0766W0273 跟星巴克的合作就是灾难 -BAC009S0766W0285 梅姐待的已算够长了 -BAC009S0766W0287 但是至少会给股价刺激一下 -BAC009S0766W0288 而梅姐则可以陪陪小孩或者去搞搞政治 -BAC009S0766W0293 但亏损达到了一点七亿美元 -BAC009S0766W0294 这样的成绩已经比二零一三年要好 -BAC009S0766W0296 十年都还没赚钱的话 -BAC009S0766W0299 它已经失去了作为独立公司的存在意义 -BAC009S0766W0303 同比增长一百三十四点七百分之三 -BAC009S0766W0304 归属于上市公司股东的净利润二十五十二万元 -BAC009S0766W0305 去年同期则是亏损二百四十二万元 -BAC009S0766W0306 同比增长十一五十六点四百分之二 -BAC009S0766W0307 公司锂电池业务实现营业收入四点零一亿元 -BAC009S0766W0308 同比增长二百六十八点五百分之一 -BAC009S0766W0309 成飞集成相关人士告诉每日经济新闻记者 -BAC009S0766W0310 前两年锂电池行业整体处于市场培育期 -BAC009S0766W0311 
虽然国家在二零一零年就颁布了新能源补贴政策 -BAC009S0766W0312 但是市场启动不像预期那么快 -BAC009S0766W0313 基本上是从二零一四年下半年才有明显的感觉 -BAC009S0766W0314 目前公司锂电池订单比较充足 -BAC009S0766W0315 由于传统汽车产业步入寒冬 -BAC009S0766W0316 汽车厂商纷纷转型新能源汽车 -BAC009S0766W0317 新能源汽车的爆发使得锂电池供不应求 -BAC009S0766W0318 随着锂电池产业链迎来井喷 -BAC009S0766W0319 锂电需求带动业绩增长 -BAC009S0766W0320 成飞集成的锂电池业务在前两年情况并不好 -BAC009S0766W0321 新能源汽车市场在逐步启动 -BAC009S0766W0322 锂电池市场也在向好 -BAC009S0766W0323 成飞集成相关人士告诉记者 -BAC009S0766W0324 这是今年上半年锂电池业务爆发的原因 -BAC009S0766W0325 成飞集成的其他主营业务中 -BAC009S0766W0326 汽车模具以及汽车零部件表现一般 -BAC009S0766W0327 汽车模具实现营收一点一零一亿元 -BAC009S0766W0328 毛利率为十八点百分之三 -BAC009S0766W0329 毛利率为十九点六百分之六 -BAC009S0766W0330 同比增长四点百分之三十五 -BAC009S0766W0331 但是由于该项业务占比较小 -BAC009S0766W0332 所以对业绩的影响有限 -BAC009S0766W0333 汽车零部件总体规模不大 -BAC009S0766W0334 汽车模具毛利率下滑 -BAC009S0766W0335 一方面是由于上半年模具的比较基数较低 -BAC009S0766W0336 也就是去年和今年上半年营收总额都不高 -BAC009S0766W0337 另外今年上半年个别订单的价格也比较低 -BAC009S0766W0339 这一状况有望发生改变 -BAC009S0766W0340 七年之后宋安东二十五岁 -BAC009S0766W0341 正是冰球运动员的黄金年龄 -BAC009S0766W0342 年少成名的他带领国家队出征冬奥会 -BAC009S0766W0343 铁定会有助于提升我国的冰球水平 -BAC009S0766W0344 进而提升我国在冬奥会申办过程中的竞争力 -BAC009S0766W0345 二零二二年冬奥会在北京举行 -BAC009S0766W0346 以宋安东为首的运动员们可以说是鲜活的冬奥名片 -BAC009S0766W0347 让越来越多人关注并参与到其中来 -BAC009S0766W0348 建设三个相对集聚的场馆群 -BAC009S0766W0349 申奥过程本身已经推动了城际交通建设 -BAC009S0766W0350 因此对于北京申办冬奥会的最终结果 -BAC009S0766W0351 我们应该抱着更加长远和开阔的视角来看待 -BAC009S0766W0352 更要期待着中国冰雪运动真正强大起来的那一天 -BAC009S0766W0353 法国冰协同于放人五度世界冠军即将复出搜狐体育 -BAC009S0766W0354 北京时间十月二十七日 -BAC009S0766W0355 经过将近一年时间的漫长谈判 -BAC009S0766W0356 法国花样滑冰联合会终于同意 -BAC009S0766W0357 允许布鲁诺马塞洛特代表德国 -BAC009S0766W0358 两人的更改国籍禁赛期即将开始 -BAC009S0766W0359 这也意味着最晚在明年的各项赛事中 -BAC009S0766W0360 我们就能看到这对强大组合的身影 -BAC009S0766W0361 在金牌搭档罗宾索尔科维退役之后 -BAC009S0766W0362 萨维申科宣布会再坚持一个冬奥会周期 -BAC009S0766W0363 她所选择的新搭档就是法国猛男马塞洛特 -BAC009S0766W0364 但是因为涉及到男伴更改国籍问题 -BAC009S0766W0365 两人的联手十分不顺利 -BAC009S0766W0366 这也让他们虽然可以参加小型赛事 -BAC009S0766W0367 但是由于国籍不统一 -BAC009S0766W0368 无法参加奥运会的比赛 -BAC009S0766W0369 对于法国冰协的行为 -BAC009S0766W0370 不少粉丝都表达了谴责 -BAC009S0766W0371 支持他们继续训练参加比赛 -BAC009S0766W0372 显然处于最艰难时期的两人丝毫没有放弃 -BAC009S0766W0373 休赛期内他们参加了小型赛事 -BAC009S0766W0374 从目前已经传出的视频来看 -BAC009S0766W0375 男伴更是在最新公布的视频中 -BAC009S0766W0376 他们的不放弃换来了成功 -BAC009S0766W0377 马塞洛特的母亲表示 -BAC009S0766W0378 法国冰协方面的态度有了缓和 -BAC009S0766W0379 法国冰协提出最后要求 -BAC009S0766W0380 要求马塞洛特缴纳七万欧元的转国籍费用 -BAC009S0766W0381 随后冰迷们自发为其网上募集资金 -BAC009S0766W0382 马塞洛特来到法国花样滑冰联合会 -BAC009S0766W0383 双方进行了最后一次也是最成功的一次洽谈 -BAC009S0766W0384 能够保障他的职业生涯发展是我的荣幸 -BAC009S0766W0385 恭喜他与萨维申科走上正确的道路 -BAC009S0766W0386 我们的朋友将代表德国 -BAC009S0766W0387 继续征战花样滑冰的比赛 -BAC009S0766W0388 今年第二位离开法国冰协更换国籍的选手 -BAC009S0766W0389 对于这个万众期待的消息 -BAC009S0766W0390 我可以带着它回家了 -BAC009S0766W0391 谢谢每一个支持我们的人 -BAC009S0766W0392 没有你们的支持我们该如何度过最挣扎的时期呢 -BAC009S0766W0393 是时候去努力工作了 -BAC009S0766W0394 他们的禁赛期即将开始 -BAC009S0766W0395 我们或许就将看到他们征战各类大型赛事的身影 -BAC009S0766W0396 对于隋文静韩聪彭程张昊领衔的中国双人滑军团 -BAC009S0766W0397 五度世锦赛冠军萨维申科联手年轻新搭档马塞洛特 -BAC009S0766W0398 这会是一对绝对强大的对手 -BAC009S0766W0399 经过近两个星期的漫长等待 -BAC009S0766W0400 北京时间八月九日一五三零 -BAC009S0766W0401 为观众们奉上昆仑决鏖战香江的精彩赛事 -BAC009S0766W0402 泰拳黑王子播求无疑同小皇帝詹姆斯最为相似 -BAC009S0766W0403 并在各自领域中享受着各自粉丝们帝王般的顶礼膜拜 -BAC009S0766W0404 曾以分歧者星运里的错窜红的谢琳伍德蕾 -BAC009S0766W0405 将出演影片的女主角 -BAC009S0766W0406 搜狐娱乐讯文耷子备受关注的重拍版乌鸦 -BAC009S0766W0407 在经历了无数次的导演和演员更换之后 -BAC009S0766W0408 除了去年结婚 -BAC009S0766W0409 有时候人生是计划赶不上变化的 -BAC009S0766W0410 就顺着你的感觉走就好了 -BAC009S0766W0411 日前在初赛收官战中返场的她加盟猜评团 -BAC009S0766W0412 一袭土豪金西装简直潮爆 -BAC009S0766W0413 有眼尖的网友发现 -BAC009S0766W0414 与孙楠巫启贤共同起立鼓掌的许茹芸小腹凸起 -BAC009S0766W0415 搜狐娱乐讯九月十二日 -BAC009S0766W0416 许茹芸与韩国丈夫崔栽诚迎来结婚一周年纪念日 -BAC009S0766W0417 许茹芸特地发微博感谢婚姻带来的幸福 -BAC009S0766W0418 许茹芸重回舞台不做苦情女娱乐频道 
-BAC009S0766W0419 华西都市报讯闪婚欧巴一年后二零一四年 -BAC009S0766W0420 许茹芸与韩籍男朋友举行了婚礼 -BAC009S0766W0421 迎来了人生崭新阶段 -BAC009S0766W0422 不同于大家心中按部就班的乖乖女形象 -BAC009S0766W0423 许茹芸突然闪婚让当时的娱乐圈也惊起了一阵小波澜 -BAC009S0766W0424 此后便鲜有消息 -BAC009S0766W0425 和往日的端庄淑女形象大有不同 -BAC009S0766W0426 对于重回舞台夺下当日歌王 -BAC009S0766W0427 她也坦言内心感触很多 -BAC009S0766W0428 论眉毛重要性 -BAC009S0766W0430 中新网五月七日电据台湾中国时报消息 -BAC009S0766W0431 李嘉诚回应撤资不爱国指控完全不成立 -BAC009S0766W0432 李嘉诚首次公开回应撤资不爱国等质疑 -BAC009S0766W0433 称一篇似是而非的文章 -BAC009S0766W0434 在其发给记者的新闻稿中说 -BAC009S0766W0435 所谓撤资指控完全不成立 -BAC009S0766W0436 其集团在全球拥有一三零零零间店铺 -BAC009S0766W0437 其中内地由两年前的一三零零间增至今天的二三零零间 -BAC009S0766W0438 李嘉诚怎么回答与中央关系有变 -BAC009S0766W0439 李嘉诚或再抛售内地地产项目拟出售上海办公楼 -BAC009S0766W0440 中国日报网八月三日电据华尔街日报三日报道 -BAC009S0766W0441 据两名知情人透露 -BAC009S0766W0442 李嘉诚正式回应撤资指控不相信文革式思维复苏 -BAC009S0766W0443 李嘉诚首次对撤资做出回应 -BAC009S0766W0444 我明白言论自由是一把两刃刀 -BAC009S0766W0445 因此一篇似是而非的文章 -BAC009S0766W0446 李嘉诚首次回应撤资传闻对中国发展充满信心 -BAC009S0766W0447 中新网九月三零日电据香港文汇报报道 -BAC009S0766W0448 对中央坚定不移继续改革开放 -BAC009S0766W0449 致力优化营商环境有信心 -BAC009S0766W0450 对中国发展充满信心 -BAC009S0766W0451 李娜产女后首次亮相运动员掌握英语很重要 -BAC009S0766W0452 李娜在一个商业活动中谈退役后的生活 -BAC009S0766W0453 李娜不想大家一直记得我那说明中国网球没突破 -BAC009S0766W0454 虽然已经退役近一年 -BAC009S0766W0455 但曾经的中国网球一姐李娜仍然没有淡出媒体的关注 -BAC009S0766W0456 李娜媒体用一次性参赛是对运动员的侮辱 -BAC009S0766W0457 长江商报消息本报记者张萌昨日 -BAC009S0766W0458 家居养娃的李娜又重新出现在媒体大众的面前 -BAC009S0766W0459 带着辛吉斯逛完了黄鹤楼 -BAC009S0766W0460 当日的新闻发布会上 -BAC009S0766W0461 李娜一身素色休闲装 -BAC009S0766W0462 走进了武网的新闻大厅 -BAC009S0766W0463 她身上少了些以往的悍将拼劲 -BAC009S0766W0464 多了初为人母的幸福光彩 -BAC009S0766W0465 看似犀利不再的娜姐老将气场立刻显出 -BAC009S0766W0466 一语回击一次性参赛这种说法是一种侮辱 -BAC009S0766W0467 希望媒体不要用这样的词来形容所有网球运动员 -BAC009S0766W0468 因为没有哪个运动员不想表现出最好的自己 -BAC009S0766W0469 李岚清座谈戏称自己八零后退休不等于生命终结 -BAC009S0766W0470 不知不觉我成为一个八零后的老头 -BAC009S0766W0471 退休后不在其位不谋其政 -BAC009S0766W0472 退休并不等于生命的终结 -BAC009S0766W0473 如果放弃学习没有追求 -BAC009S0766W0474 一个人的精神生命就将走向衰老 -BAC009S0766W0475 因此我给自己规划了八个字的退休生活 -BAC009S0766W0476 戏称自己年过八零当为八零后 -BAC009S0766W0477 李开复经历死亡这一课学会看透和放下 -BAC009S0766W0478 李开复被医生宣判为第四期淋巴癌 -BAC009S0766W0479 不期而至的阴霾让他被迫抛下工作 -BAC009S0766W0480 在新书向死而生我修的死亡学分中 -BAC009S0766W0481 我从没想过自己竟会出版一本这样的书 -BAC009S0766W0482 李晨马震就是玩笑任何情况都力挺范冰冰 -BAC009S0766W0483 新京报快讯记者刘玮近日 -BAC009S0766W0484 由于电影王朝的女人杨贵妃中的一场激情戏 -BAC009S0766W0485 范冰冰承包了娱乐头条 -BAC009S0766W0486 出席活动时笑称今后拍激情戏会征求男友李晨的意见 -BAC009S0766W0487 李晨秀才遇到兵发布会后回应称 -BAC009S0766W0488 如果这个事情反过来 -BAC009S0766W0489 演员这个职业就是这样 -BAC009S0766W0490 李玉刚张学友黄琦雯入选一零大最涨姿势歌曲 -BAC009S0766W0491 李玉刚新歌点击逾一亿网友李家每人只需半次 -BAC009S0766W0492 李玉刚饰演的杨贵妃被指芳华绝代说到神曲 -BAC009S0766W0493 该歌曲二零零字的歌词用典竟达三六处之多 -BAC009S0766W0494 让一些网友有如猜谜 -BAC009S0766W0495 被称为二零一五年第一神曲 -BAC009S0767W0121 时刻保持创新和变革意识 -BAC009S0767W0122 引领中国房地产广告行业走向新的黄金时代 -BAC009S0767W0123 今久整合营销集团迎来了它的十岁生日 -BAC009S0767W0124 今久从最初的几十个人 -BAC009S0767W0125 今久商品房销售额首次上涨 -BAC009S0767W0126 房地产投资增速仍下降 -BAC009S0767W0127 大智慧阿思达克通讯社 -BAC009S0767W0128 一五年一月份 -BAC009S0767W0129 全国房地产开发投资三万亿元 -BAC009S0767W0130 同比名义增长许多 -BAC009S0767W0131 增速比一月份回落零点九个百分点 -BAC009S0767W0132 全国商品房销售额两万亿元 -BAC009S0767W0133 年内首次出现同比增长 -BAC009S0767W0134 住宅销售额也增长了 -BAC009S0767W0135 办公楼销售额下降了 -BAC009S0767W0136 商业营业用房销售额下降了 -BAC009S0767W0137 住宅成为全国房地产销售金额唯一增长的板块 -BAC009S0767W0138 一系列楼市新政效果逐步显现 -BAC009S0767W0139 德佑链家市场研究部总监陆骑麟表示 -BAC009S0767W0140 全国房地产开发投资增速仍然延续了增速放缓的渠势 -BAC009S0767W0141 尽管有央行降息等各方利好刺激 -BAC009S0767W0142 尤其是库存高企的三四线城市 -BAC009S0767W0143 开发商仍然面临着较大的销售压力 -BAC009S0767W0144 国家统计局公布的数据显示 -BAC009S0767W0145 无论是东部中部还是西部地区 -BAC009S0767W0146 商品房房的销售面积同比数据出现好转 -BAC009S0767W0147 商品房销售面积三亿平方米 -BAC009S0767W0148 降幅比四月份收窄六个百分点 -BAC009S0767W0149 在公积金松绑等作用的刺激下 -BAC009S0767W0150 五月份商品房销售的面积同比数据由负转正 -BAC009S0767W0151 
作为三四线城市最为集中的中部地区来说 -BAC009S0767W0152 房地产开发企业土地购置面积很大 -BAC009S0767W0153 同比下降不少 -BAC009S0767W0154 降幅收窄三个版百分点 -BAC009S0767W0155 各方原因的叠加导致了房企拿地量的明显减少 -BAC009S0767W0156 今年一线城市住宅用地价格涨五成 -BAC009S0767W0157 今年商品房销售一度低迷 -BAC009S0767W0158 一线城市土地市场成交火热 -BAC009S0767W0159 中介机构统计数据显示 -BAC009S0767W0160 平均价格为十万元每平方米 -BAC009S0767W0161 同比上涨五成 -BAC009S0767W0162 随着一线城市楼市企温回升 -BAC009S0767W0163 房企在一线城市拿地的热情还将提高 -BAC009S0767W0164 土地市场热度可能有所下降 -BAC009S0767W0165 大型房企低迷期拿地 -BAC009S0767W0166 中原地产市场研究部统计数据显示 -BAC009S0767W0167 土地成交价款三千亿元 -BAC009S0767W0168 预计全年有望突破四千亿元 -BAC009S0767W0169 一线城市住宅用地平均价格为十一万元每平方米 -BAC009S0767W0170 同比上涨约六成 -BAC009S0767W0171 中原地产首席分析师张大伟认为 -BAC009S0767W0172 住宅市场交易明显升温 -BAC009S0767W0173 迅速带动一线城市土地市场的整体成交 -BAC009S0767W0174 房地产业正经历一个调整阶段 -BAC009S0767W0175 大型房企实施低迷期拿地的策略 -BAC009S0767W0176 在整体市场供大于求区域分化严重的情况下 -BAC009S0767W0177 房企更加愿意扎堆一线城市 -BAC009S0767W0178 而非在三四线城市深耕 -BAC009S0767W0179 这使得一线城市的土地竞争激烈 -BAC009S0767W0180 今年一线城市宅地成交的溢价率不高 -BAC009S0767W0181 平均溢价率较低 -BAC009S0767W0182 较去年明显下降 -BAC009S0767W0183 这是因为土地一级开发成本提高 -BAC009S0767W0184 一线城市住宅用地的低价不断抬升 -BAC009S0767W0185 北京等城市在土地出让中 -BAC009S0767W0186 将保障房地块和商品房地块捆绑出让 -BAC009S0767W0187 直接涉及到债券持有人利益的保护问题 -BAC009S0767W0188 我们立即与云投集团进行了沟通 -BAC009S0767W0189 并严格按照合规程序进行 -BAC009S0767W0190 我委也注意到在企业债券存续期间 -BAC009S0767W0191 需要对发行人资产重组等重大事宜加强监管 -BAC009S0767W0192 在制度上对债券持有人的合法权益进行保护 -BAC009S0767W0193 建立地方政府债务管理体系 -BAC009S0767W0194 中国证券报从您刚才的介绍中我们了解到 -BAC009S0767W0195 城投债券对推动城市基础设施和市政设施的建设 -BAC009S0767W0196 起到了非常积极的作用 -BAC009S0767W0197 对丰富债券市场品种也具有积极意义 -BAC009S0767W0198 结合地方政府债务管理制度的完善 -BAC009S0767W0199 下一步我国的城投债券还需要做哪些完善 -BAC009S0767W0200 徐林这个问题涉及到一系列的制度完善 -BAC009S0767W0201 是一个比较复杂的问题 -BAC009S0767W0202 我个人是这么认认识的 -BAC009S0767W0203 我国还处于城市化快速发展期 -BAC009S0767W0204 需要为各地的城市建设提供规范的融资渠道 -BAC009S0767W0205 农业与非农产业之间劳动生产率的差距也很大 -BAC009S0767W0206 这决定了我国城市化动力十分强劲 -BAC009S0767W0207 城市化进程远未结束 -BAC009S0767W0208 城市化快速发展期的重要特征就是基础设施投资需求大 -BAC009S0767W0209 这是我国所处的发展阶段决定的 -BAC009S0767W0210 政府通过债务融资从事基础设施建设 -BAC009S0767W0211 我们应该建立风险可控的规范化的地方政府融资机制 -BAC009S0767W0212 为各地的基础设施建设提供有制度保障的融资渠道 -BAC009S0767W0213 城投债劵作为准市政债劵仍将是有效的融资工具 -BAC009S0767W0214 但还需要进一步改进 -BAC009S0767W0215 在政府投融资体制改革过程中 -BAC009S0767W0216 从事当地的基础设施建设 -BAC009S0767W0217 相对于过去的体制而言是更加市场化的 -BAC009S0767W0218 城投债劵作为投融资平台公司最透明的直接融资工具 -BAC009S0767W0219 仍然会存在并具有发展空间 -BAC009S0767W0220 由于目前城投债劵的发行需要符合企业债劵发行的条件 -BAC009S0767W0221 这使得我国城投债劵的发行利率相对偏高 -BAC009S0767W0222 城投债劵的发行期限和利率 -BAC009S0767W0223 未来应该在制度上作进一步完善 -BAC009S0767W0224 使得城投公司能够发行真正意义上的长期市政债劵 -BAC009S0767W0225 要尽快建立我国的地方政府债务管理体系 -BAC009S0767W0226 对于如何建立规范的地方政府融资渠道 -BAC009S0767W0227 加强地方政府债务管理和风险防控 -BAC009S0767W0228 一些专家学者提出了许多好的建议 -BAC009S0767W0229 如建立规范透明的地方政府融资渠道 -BAC009S0767W0230 并对地方政府债务进行监控和风险防范等 -BAC009S0767W0231 由于我国还没有建立统一的地方政府债务风险管理制度 -BAC009S0767W0232 设定政府性债务风险控制指标和标准 -BAC009S0767W0233 并对政府性债务实行馀额管理 -BAC009S0767W0234 使地方政府的债务融资规模控制在安全范围内 -BAC009S0767W0235 远低于发生债务危机的欧美国家 -BAC009S0767W0236 债券发行人是优质的 -BAC009S0767W0237 还本付息也是正常的 -BAC009S0767W0238 应该建立风险可控的规范化地方政府融资机制 -BAC009S0767W0239 为各地的基础设施建设提供有制度保障的融资渠道 -BAC009S0767W0240 本报记者曹志为王婷王颖春来源中国证券报 -BAC009S0767W0241 责任编辑廖一宁 -BAC009S0767W0242 据国家发改委网站消息 -BAC009S0767W0243 将考试费标准由各地自行制定改为实行上限管理 -BAC009S0767W0244 价格主管部门将按统一合理的平均成本确定考试费用 -BAC009S0767W0245 将切实减轻考生经济负担 -BAC009S0767W0246 针对目前职业资格考试收费项目增加 -BAC009S0767W0247 一些考试单位考务成本偏高 -BAC009S0767W0248 有的考试在不同地区收费标准差异较大 -BAC009S0767W0249 考生对考试收费问题反映较多等问题 -BAC009S0767W0250 改革了职业资格考试收费管理方式 -BAC009S0767W0251 对考务费标准实行统一标准化管理 -BAC009S0767W0252 通知按照不同考生规模考试类类别的合理平均成本 -BAC009S0767W0254 无疑也会成为投资者的宠儿 -BAC009S0767W0260 而苹果虽有可能卖出不少手表给忠实的粉丝 -BAC009S0767W0264 就开始追寻打造真正的机器人的梦想 
-BAC009S0767W0265 但是过去整整一年他都在秘密工作 -BAC009S0767W0266 没人知道他在干什么 -BAC009S0767W0269 无论他做的什么都是什么都会引人注目的 -BAC009S0767W0270 像索尼被黑那样的事 -BAC009S0767W0271 也可能会是受到国家支持的攻击 -BAC009S0767W0272 未来的战争形态有可能就是计算机对抗计算机 -BAC009S0767W0273 当年泡沫破裂前也是这样的情景 -BAC009S0767W0274 一堆不赚钱的公司赶着上市当然不是什么好事 -BAC009S0767W0276 疯狂估值局限于私有公司内 -BAC009S0767W0277 可现在那些公司纷纷上市后疯狂是不是就暴露了呢 -BAC009S0767W0278 而现在的股票市场也已经达到创纪录的新高 -BAC009S0767W0282 称有些技术公司烧钱太快可能会人间蒸发 -BAC009S0767W0284 连这些人都预测不准的话 -BAC009S0767W0285 还有谁能预测得准呢 -BAC009S0767W0287 但是需记住对风向保持敏感 -BAC009S0767W0288 高空飞航时战略无人机 -BAC009S0767W0289 全球鹰并不能独占鳌头 -BAC009S0767W0290 继二零一一年出现独特的连翼造型的翔龙无人机以后 -BAC009S0767W0291 又一款个性十足的双机身气动外形的大型无人机神雕 -BAC009S0767W0292 又一次引爆坊间议论 -BAC009S0767W0293 今年三月美国大众科学杂志刊文称 -BAC009S0767W0294 中国正在研制一种世界上尺寸最大的无人机 -BAC009S0767W0295 发表的想象图与最近曝光的飞机布局很像 -BAC009S0767W0296 这使得神雕在全球也成为最大的无人机之一 -BAC009S0767W0297 据可靠的网络消息源称 -BAC009S0767W0298 其相应的对手不是全球鹰 -BAC009S0767W0300 神雕的两个机身前后装有两对机翼 -BAC009S0767W0301 位于后方的主翼中央挂着两具涡轮风扇发动机 -BAC009S0767W0303 上述公司人士对每日经济新闻记者表示 -BAC009S0767W0304 成飞机成业绩增长主要是由于锂电需求增长 -BAC009S0767W0305 传统汽车业务并没有太大起色 -BAC009S0767W0306 现在汽车市场也在下滑 -BAC009S0767W0307 整个汽车体系都是随着汽车销量在变动 -BAC009S0767W0308 在锂电业务爆发的情况下 -BAC009S0767W0309 公司的汽车业务应该会有一些弱化 -BAC009S0767W0310 新能源汽车的爆发带动了锂电池供不应求 -BAC009S0767W0311 几乎所有锂电厂商都在满负荷生产 -BAC009S0767W0312 上述成飞集成人士表示 -BAC009S0767W0313 产能现在已经满足不了订单需求 -BAC009S0767W0314 八月初公司通过了增加产能建设的决议 -BAC009S0767W0315 今年初也在原来厂里新增了生产线 -BAC009S0767W0316 预计在三四季度会有陆续新增产能 -BAC009S0767W0317 每日经济新闻记者注意到 -BAC009S0767W0318 项目总投资十四点五亿元 -BAC009S0767W0319 总投资预计一百二十五亿元 -BAC009S0767W0320 上述成飞集成人士告诉记者 -BAC009S0767W0321 公司目前看好锂电池行业的发展渠势 -BAC009S0767W0322 但其并未向记者透露项目盈利水平预测 -BAC009S0767W0323 洛阳本部项目是一个增量投资 -BAC009S0767W0324 有一部分研发楼办公楼是利用现成的 -BAC009S0767W0325 包括管理人员等并不会因为新增生产线而增加 -BAC009S0767W0326 这个项目是自有资金投入 -BAC009S0767W0327 就没有要求专业机构做可研报告 -BAC009S0767W0328 我们内部做的盈利测算数据暂时无法公告 -BAC009S0767W0329 成飞集成与常州市金坛区政府合作的项目将分三期完成 -BAC009S0767W0330 一期投资额为二十五亿元 -BAC009S0767W0331 上述成飞集成人士告诉记者 -BAC009S0767W0332 随着国内新能源车产业的迅猛发展 -BAC009S0767W0333 锂电池作为新能源车的重要部件 -BAC009S0767W0334 锂电池生产企业将迎来业绩持续高增长阶段 -BAC009S0767W0335 二零一四年其市场规模已达七十一五亿元 -BAC009S0767W0336 随着锂电池产业链迎来井喷 -BAC009S0767W0337 锂电池在今年上半年成为诸多上市公司的业绩功臣 -BAC009S0767W0338 二者虽从事项目不同 -BAC009S0767W0339 也恰恰符合播求的个人风格 -BAC009S0767W0340 擂台上的黑王子肌肉强健 -BAC009S0767W0341 这也是他一次次在擂台上打出恐怖重击的最大资本 -BAC009S0767W0342 却可以演绎出撼人心魄的体育大美 -BAC009S0767W0343 此次播求面对的强敌 -BAC009S0767W0344 恰恰在风格打法和比赛理念上 -BAC009S0767W0345 同詹姆斯昔年头号强敌卡梅隆安东尼如出一辙 -BAC009S0767W0346 丰富的战斗技巧是我的特色 -BAC009S0767W0347 我希望自己可以像卡梅隆一样 -BAC009S0767W0348 在比赛中展示出更多击败对手的手段 -BAC009S0767W0349 对于我的对手制造更多的麻烦 -BAC009S0767W0350 俄罗斯搏击新生代旗帜性天才高手哈亚在接受采访时 -BAC009S0767W0351 而哈亚的表现也正如其自己所言 -BAC009S0767W0352 展示出了如同其偶像安东尼一样的全面犀利 -BAC009S0767W0353 直接将威瑟里诺夫击倒 -BAC009S0767W0354 其搏击技能之全面精湛格斗天赋之卓越令人惊叹 -BAC009S0767W0355 当搏击界的勒布朗与卡梅隆狭路相逢 -BAC009S0767W0356 激情战火必将以燎原之势 -BAC009S0767W0357 彭博一英里接力赛将在十月十五日首次登陆上海 -BAC009S0767W0358 今天赛事举行了赛前新闻发布会 -BAC009S0767W0359 宣告彭博一英里接力赛上海站全面启动 -BAC009S0767W0360 让他们能在工作之馀释放对于体育的热情 -BAC009S0767W0361 从二零零七年在伦敦创办至今 -BAC009S0767W0362 已在新加坡香港等城市成功落地 -BAC009S0767W0363 得到当地企业的强烈积极响应 -BAC009S0767W0364 在各城市都有至少百支队伍报名参加 -BAC009S0767W0365 彭博一英里接力赛区别于其他跑步活动的是 -BAC009S0767W0366 每支报名队伍以企业为单位每队十名成员 -BAC009S0767W0367 每人分别完成一英里即约一点六公里的路程 -BAC009S0767W0368 最后取全队用时最少者为胜 -BAC009S0767W0369 冠军皆由麦格理集团获得 -BAC009S0767W0370 现场参赛企业誓言要打破这项记录 -BAC009S0767W0371 本次赛事已经开始接受团队报名 -BAC009S0767W0372 目前报名仍在进行中 -BAC009S0767W0373 这一项目将为神农架林区的孩子筹建开放式体育空间 -BAC009S0767W0374 为他们搭建一个特色自由的体育室加户外体育课堂 -BAC009S0767W0375 让他们也可以有机会参与体育运动 -BAC009S0767W0376 高清女排金花手捧奖杯庆夺冠 -BAC009S0767W0377 刚刚在日本女排世界杯上夺冠的中国女排载誉回京 -BAC009S0767W0378 
在首都国际机场受到了各界人士的欢迎 -BAC009S0767W0379 这其中一位身材高挑的女孩子颇为引人关注 -BAC009S0767W0380 她就是因伤未能随队参加本次赛事的徐云丽 -BAC009S0767W0381 我觉得这冠军来之不易 -BAC009S0767W0382 特别是我们今年刚开始的时候特别艰难 -BAC009S0767W0383 整个队伍承受了很大的困难和考验 -BAC009S0767W0384 最后顶住困难和压力拿到冠军 -BAC009S0767W0385 我为她们感到骄傲和自豪 -BAC009S0767W0386 那就是姑娘们高举起惠若琪徐云丽和杨方旭的球衣 -BAC009S0767W0387 感谢这三位因伤未能参赛的姐妹对球队做出的巨大贡献 -BAC009S0767W0388 徐云丽透露在赛前队友曾经给自己发了一条短信 -BAC009S0767W0389 就此事征求她的意见 -BAC009S0767W0390 看到这一幕我控制不住自己了 -BAC009S0767W0391 一切都难以用言语来表达 -BAC009S0767W0392 自己此时此刻特别迫切地想要尽快恢复 -BAC009S0767W0393 希望能够跟大家一起再次站在领奖台上 -BAC009S0767W0394 徐云丽最后动情地说 -BAC009S0767W0395 搜狐体育郭健文 -BAC009S0767W0396 二零一五年八月十五 -BAC009S0767W0397 这次赛事是昆仑决二零一五欧洲之旅的第三站 -BAC009S0767W0398 四季如春的俄罗斯黑海东部沿岸 -BAC009S0767W0399 新一轮激战烽火即将炽烈点燃 -BAC009S0767W0400 我是这次中俄对抗赛第一个出场的中方选手 -BAC009S0767W0401 一定要尽全力打一场漂亮的比赛 -BAC009S0767W0402 为中国战队打响第一枪 -BAC009S0767W0403 徐永昊的站立打击技术在该级别中堪称翘楚 -BAC009S0767W0404 如今再遭厄运 -BAC009S0767W0406 最终又确定为科林哈迪 -BAC009S0767W0407 科林也退出了剧组 -BAC009S0767W0408 柯震东去年经历吸毒风波 -BAC009S0767W0409 演艺事业受挫 -BAC009S0767W0410 沉寂一段时间后 -BAC009S0767W0411 近期他积极复出 -BAC009S0767W0412 再度经营他的粉丝团与粉丝互动六日昨晚 -BAC009S0767W0414 他调皮地将自己的眉毛抹掉 -BAC009S0767W0415 搜狐娱乐讯据香港明晚九月十日报道 -BAC009S0767W0416 诸葛紫岐十日晚出席活动时表示 -BAC009S0767W0417 一个月内暴瘦了九至十三斤 -BAC009S0767W0418 有时甚至忘记吃饭 -BAC009S0767W0419 也有情绪问题 -BAC009S0767W0420 打算看医生寻求纾缓方式 -BAC009S0767W0421 她说之前打电话给医生 -BAC009S0767W0422 医生说得好恐怖 -BAC009S0767W0423 但不至于要吃药 -BAC009S0767W0424 我叫他不要吓我 -BAC009S0767W0425 现在有点怕要见他 -BAC009S0767W0426 搜狐娱乐讯九月十六日二十点二十七分 -BAC009S0767W0428 并自嘲的写道自幼就走性感风格 -BAC009S0767W0429 谢依霖穿着白色吊带裙 -BAC009S0767W0430 嘟嘴作亲吻状 -BAC009S0767W0431 李玉刚离开蒙面歌王网友遗憾没听见神曲 -BAC009S0767W0432 蒙面歌王迎来初赛的收官之战 -BAC009S0767W0433 千面娇娃绝地反击拿下最后一席歌王头衔 -BAC009S0767W0434 而绝代歌姬李玉刚揭面引起了广泛热议 -BAC009S0767W0435 也有网友发出疑问若是李玉刚演唱李的话 -BAC009S0767W0436 登上歌王宝座的概率是不是会要大很多倍呢 -BAC009S0767W0437 短短二零零字歌词运用诗词典故三六处 -BAC009S0767W0438 每句歌词都蕴含一段李姓历史文化在里头 -BAC009S0767W0439 该歌曲今年一经在各大音乐网站上线便收获无数好评 -BAC009S0767W0440 更有全球李氏宗亲大会将其列为祭祖主题曲 -BAC009S0767W0441 同时李也俘获了无数中国大妈的芳心 -BAC009S0767W0442 成为各国各地广场舞今年最流行的背景音乐之一 -BAC009S0767W0443 李磊灭门案遗产纠纷终结八零零馀万三人有份 -BAC009S0767W0444 大兴灭门案的凶犯李磊被执行死刑后 -BAC009S0767W0445 其身后的财产分割问题尘埃落定 -BAC009S0767W0446 市二中院终审认定李家遗产共计八零零多万元 -BAC009S0767W0447 李磊的奶奶继承四三七万馀元 -BAC009S0767W0448 姥姥继承二六六万馀元 -BAC009S0767W0449 岳父母继承一零九万馀元 -BAC009S0767W0450 李谷一曾怒揭东方歌舞团腐败事后被调离岗位 -BAC009S0767W0451 顾欣资料图片昨早九号一零时 -BAC009S0767W0452 东方演艺集团大门口戒备森严 -BAC009S0767W0453 中纪委监察部网站发布消息 -BAC009S0767W0454 顾欣因涉嫌严重违纪违法 -BAC009S0767W0455 集团新领导已经上任 -BAC009S0767W0456 是原中国文化集团党委书记宋官林 -BAC009S0767W0457 李连杰名誉维权案一审胜诉网站被判至致歉赔偿一零万 -BAC009S0767W0458 新京报快讯记者林野记者今天傍晚获悉 -BAC009S0767W0459 李银河的文学梦将出版虐恋小说三卷本 -BAC009S0767W0460 权义澎湃资料李银河在现实中是柔软的 -BAC009S0767W0461 不像她发表的那些先锋的观点一样冲击人眼球 -BAC009S0767W0462 李银河写虐恋不会伤害小波 -BAC009S0767W0463 北京南三环附近一家茶楼里 -BAC009S0767W0464 六三岁的李银河拿着钥匙袋走了进来 -BAC009S0767W0465 她身着湖蓝色细纱短袖黑长裤白运动鞋 -BAC009S0767W0466 手腕上还戴着一块与之呼应的白色塑料腕表 -BAC009S0767W0467 出门前我拿了两套衣服 -BAC009S0767W0468 面对第一财经日报记者 -BAC009S0767W0469 说起自己的伴侣大侠 -BAC009S0767W0470 村中数百亩农田干旱村民质疑水库断了灌溉水 -BAC009S0767W0471 高新区钓渭镇疙瘩沟村村民称 -BAC009S0767W0472 却为了发电断了灌溉农田的水 -BAC009S0767W0473 导致数百亩农田干旱 -BAC009S0767W0474 该镇农办一名主管水利负责人介绍 -BAC009S0767W0475 政府曾多次叫停电站发电 -BAC009S0767W0476 但是干旱原因主要系降水减少 -BAC009S0767W0477 今后将加强水库管理 -BAC009S0767W0478 努力处理好灌溉与发电之间的关系 -BAC009S0767W0479 村主任发环卫工一六零零元工资含一四张假钞 -BAC009S0767W0480 村主任发账号给村文书想要继续当拿四八万元 -BAC009S0767W0481 华商报商洛讯记者白鹏飞近日 -BAC009S0767W0482 并向其发送银行账号 -BAC009S0767W0483 原因是有人愿为该村垫资四八万元费用修桥 -BAC009S0767W0484 村委会主任建议由垫资人担任村文书 -BAC009S0767W0485 唐寨子村党支部书记村委会主任已被全镇通报批评 -BAC009S0767W0486 村主任向开发商索贿五二零万村组干部几乎全参与分赃 -BAC009S0767W0487 城改拆迁对很多村民来说 -BAC009S0767W0488 
意味着生活条件的改善 -BAC009S0767W0489 但对于部分村官及个别政府工作人员来说 -BAC009S0767W0490 却是一块大大的唐僧肉 -BAC009S0767W0491 想办法扑上去咬一口 -BAC009S0767W0492 村主任给狗盖房吞六万公款被判刑二年八个月 -BAC009S0767W0493 京华时报讯记者王晓飞在农村 -BAC009S0767W0494 几乎家家户户都会在院子里养狗 -BAC009S0767W0495 平时作为看家护院之用 -BAC009S0768W0121 成本的转嫁使得商品房用地成本更高 -BAC009S0768W0122 明年初料迎供应淡季 -BAC009S0768W0123 土地市场交易火热的局面可能降温 -BAC009S0768W0124 土地供应往往呈现前松后紧的态势 -BAC009S0768W0125 年初往往是土地供应的淡季 -BAC009S0768W0126 为完成全念土地供应计划 -BAC009S0768W0127 地方政府倾向于频繁推出优质地块 -BAC009S0768W0128 土地交易可能随着供应淡季的到来而降温 -BAC009S0768W0129 叠加春节因素的影响 -BAC009S0768W0130 这种情况在二月可能较明显 -BAC009S0768W0131 房地产企业的整体资金状况超紧 -BAC009S0768W0132 不利于继续大规模拿地 -BAC009S0768W0133 国家统计局数据显示 -BAC009S0768W0134 房地产开发企业到位资金十万亿元 -BAC009S0768W0135 增速比三月回落六个百分点 -BAC009S0768W0136 未来房企拿地投资新开工等指标可能受到影响 -BAC009S0768W0137 尽管降息等利好政策出台 -BAC009S0768W0138 但房地产市场仍处于调整期 -BAC009S0768W0139 预计不会在短期内迅速回暖 -BAC009S0768W0140 与之相联系的土地市场也会受到影响 -BAC009S0768W0141 中国证券报报道 -BAC009S0768W0142 今年商品房销售一度低日迷 -BAC009S0768W0143 一线城市土地市场成交火热 -BAC009S0768W0144 中介机构统计数据显示 -BAC009S0768W0145 五环内商品住宅的成交在市场中并非主流 -BAC009S0768W0146 一位房企人士认为五环内项目的稀缺性难以改变 -BAC009S0768W0147 新京报讯记者张徐报道 -BAC009S0768W0148 北京去年土地出让落下大幕 -BAC009S0768W0149 在丰台潘家村一宗商业用地低价成交后 -BAC009S0768W0150 北京今年的土地出让金锁定在两千亿元 -BAC009S0768W0151 同比前年增长五成 -BAC009S0768W0152 丰台区域潘家村危改三号地成为今年的收官地质块 -BAC009S0768W0153 这宗零售商业用地位于南三环外 -BAC009S0768W0154 邻近地铁十号线首竟贸站 -BAC009S0768W0155 规划建筑面积约五万平方米 -BAC009S0768W0156 潘家村地块体量较小 -BAC009S0768W0157 未必吸引太多擅长上规模开发的企业 -BAC009S0768W0158 因此最终仅有龙湖地产一家报价 -BAC009S0768W0159 龙湖即以低价五点五亿元拿地 -BAC009S0768W0160 楼面价折合约一万元每平方米 -BAC009S0768W0161 龙湖地产有关人士对记者表示 -BAC009S0768W0162 龙湖已经在丰台有土地储备 -BAC009S0768W0163 未来还将继续深耕丰台区域 -BAC009S0768W0164 龙湖在丰台西局撤资三十亿元拿地 -BAC009S0768W0165 纯商品房楼面价接近六万元每平方米 -BAC009S0768W0166 并不代表全年土地市场行情走低 -BAC009S0768W0167 今年北京土地市场仍然是高温状态 -BAC009S0768W0168 特别是一至四月土地出让金即破千亿元 -BAC009S0768W0169 根据北京中原地产统计 -BAC009S0768W0170 去年北京共出让五十宗居住楼用地 -BAC009S0768W0171 规划建筑面积合计九百万平方米 -BAC009S0768W0172 出让金合计一千亿元 -BAC009S0768W0173 整体平均楼面价折合一万元每平方米 -BAC009S0768W0174 这一平均楼面价较年前的九千元每平方米 -BAC009S0768W0175 北京中原地产首席分析师张大伟认为 -BAC009S0768W0176 一二线城市特别是京沪这样的核心城市 -BAC009S0768W0177 投资价值更好房企看好 -BAC009S0768W0178 加上优质地块的连续供应 -BAC009S0768W0179 促成了今年北京土地市场的走高 -BAC009S0768W0180 通州新城彩虹之门用地挂出三十日 -BAC009S0768W0181 记者从北京市国土局网站看到 -BAC009S0768W0182 通州运河核心区一宗多功能用地挂出 -BAC009S0768W0183 将于明年投标 -BAC009S0768W0184 该地块位于通州新城五河交汇处东南角 -BAC009S0768W0185 规划建筑面积为四十万平方米 -BAC009S0768W0186 据记者从多个渠道了解 -BAC009S0768W0187 分档制定了中央部门收取的考务费统一上限标准 -BAC009S0768W0188 考虑到地方组织考试的成本相对比较固定 -BAC009S0768W0189 即各省在考务费标准基础上 -BAC009S0768W0190 实践技能操作和面试类考试科目 -BAC009S0768W0191 需配备租赁精密仪器专业设备大型场地 -BAC009S0768W0192 考试过程需要消耗相关材料或需聘请专业面试考官的 -BAC009S0768W0193 由于影响成本的因素过多 -BAC009S0768W0194 由各省根据实际成本制定 -BAC009S0768W0195 三是促进考务成本降低和考试单位合并 -BAC009S0768W0196 对考务费实行统一标准化管理后 -BAC009S0768W0197 而是改按统一合理的平均成本确定 -BAC009S0768W0198 将切实减轻考生经济负担 -BAC009S0768W0199 改革将对考试单位的费用支出形成倒逼机制 -BAC009S0768W0200 促使考试单位自觉降低成本由于形不成规模效益 -BAC009S0768W0201 一些规模较小的考试机构也将自动寻求合并 -BAC009S0768W0202 利用价格杠杆促进考试单位向集约化发展 -BAC009S0768W0203 他就上述关注问题指出 -BAC009S0768W0204 目前我国出现政府性债务违约可能性并不大 -BAC009S0768W0205 下一步将进一步完善城投债卷发行制度和防范风险机制 -BAC009S0768W0206 并尽快建立我国地方政府债务管理体系等 -BAC009S0768W0207 政府性违约可能性不大 -BAC009S0768W0208 中国证卷报随着欧美等国主权债务危机陆续爆发 -BAC009S0768W0209 您如何看待政府的举债行为和债务风险 -BAC009S0768W0210 徐林吸取欧美等国主权债务危机的教训 -BAC009S0768W0211 采取必要措施加强政府债务管理 -BAC009S0768W0212 防范我国政府债务风险 -BAC009S0768W0213 但在具体评估我国地方政府债务风险程度时 -BAC009S0768W0214 也要看到我国与欧美国家的不同之处 -BAC009S0768W0215 我国地方政府性债务 -BAC009S0768W0216 特别是地方投投融资平台公司形成的债务 -BAC009S0768W0217 主要由于各各种基础设施的投资建设 -BAC009S0768W0218 当代人和后代人共同承担债务还本付息责任 -BAC009S0768W0219 可以更好地体现代际公平 -BAC009S0768W0220 
克服当期建设资金不足的瓶颈制约 -BAC009S0768W0221 有利于加快完善基础设施和投资环境 -BAC009S0768W0222 是一种合理的基础设施投融资金建设行为 -BAC009S0768W0223 政府举债建设形成大量资金 -BAC009S0768W0224 相当部分资产具有长期的直接收益 -BAC009S0768W0225 一些没有直接收益的项目 -BAC009S0768W0226 也具有间接的经济效益或社会效益 -BAC009S0768W0227 对促进当地经济增长和政府财力的增长 -BAC009S0768W0228 不能简单地用寅吃卯粮来作价值判断 -BAC009S0768W0229 这并不意味着政府可以无节制地借债 -BAC009S0768W0230 关键是要把投资规模和债务规模 -BAC009S0768W0231 控制在合理的范围内 -BAC009S0768W0232 防止出现系统性的偿债风险 -BAC009S0768W0233 国务院高度重视防范地方政府债务风险 -BAC009S0768W0234 从二零零九年下半年就开始要求有关部门调研这一问题 -BAC009S0768W0235 国家审计署还专门组织力量 -BAC009S0768W0236 对全国各地的政府债务进行啦严格审计 -BAC009S0768W0237 审计署的审计结论表明 -BAC009S0768W0238 我国地方政府的累积债务相对于偿付能力来看 -BAC009S0768W0239 远低于发生债务危机的欧美国家 -BAC009S0768W0240 考虑到我国正处在经济快速增长期 -BAC009S0768W0241 政府财力增长也相当较快 -BAC009S0768W0242 政府还拥有较多的可变现资产 -BAC009S0768W0243 相对于目前的负债规模 -BAC009S0768W0244 政府总体上具有较强的偿债能力 -BAC009S0768W0245 采取积极有效的措施化解部分地区和领域的债务风险 -BAC009S0768W0246 在我国出现政府性债务违约的可能性是不大的 -BAC009S0768W0247 债卷市场城投债卷发行不畅 -BAC009S0768W0248 从城投债券发行监管部门的角度 -BAC009S0768W0249 您如何看待这一现象 -BAC009S0768W0250 徐林出于对地方政府债务风险的担忧 -BAC009S0768W0251 投资者采取措施防范风险是成熟的表现 -BAC009S0768W0252 但出于对我国地方政府债务风险的不合理判断 -BAC009S0768W0253 神雕的机身四周装有分布式有源相控阵雷达天线 -BAC009S0768W0254 可以提供三六零度无死角的早期预警 -BAC009S0768W0255 它的雷达可能采用了双波段设计 -BAC009S0768W0259 该机的雷达还具备合成孔径工作能力 -BAC009S0768W0260 可用于侦察缓慢移动的地面和海面目标 -BAC009S0768W0261 神雕还有一定的隐身特性 -BAC009S0768W0262 加上它凭借远程雷达与对方舰队保持远距离 -BAC009S0768W0263 如果神雕大量服役和部署 -BAC009S0768W0264 在战区上空形成有效韧的信息网络 -BAC009S0768W0265 那将会是中国海空军的战力倍增器之一 -BAC009S0768W0266 高空长航时战略无人机 -BAC009S0768W0267 全球鹰并不能独占鳌头 -BAC009S0768W0268 继二零一一年出现独特的连翼造型的翔龙无人机以后 -BAC009S0768W0269 又一款个性十足的双机身气动外形的大型无人机神雕 -BAC009S0768W0271 据新华社电美国国际贸易委员会二十一日作出终裁 -BAC009S0768W0272 从台湾地区进口的此类产品存在切销行为 -BAC009S0768W0273 美国国际贸易委员会称 -BAC009S0768W0274 在征收反倾销或反补贴税之前 -BAC009S0768W0275 美商务部与国际贸易委员会都需作出肯定性终裁 -BAC009S0768W0276 商务部裁定切销或补贴幅度 -BAC009S0768W0277 根据美国商务部去年十二月份终裁确定的幅度 -BAC009S0768W0278 针对中美光伏贸易纠纷 -BAC009S0768W0279 中国商务部已明确表示 -BAC009S0768W0280 再次对中国光伏产品发起双反调查并试图征收高额关税 -BAC009S0768W0281 中方对此表示强烈不满 -BAC009S0768W0282 美方对中国产品进行限制的做法 -BAC009S0768W0283 是对贸易救济措施的滥用 -BAC009S0768W0284 势必使用中美光伏贸易纠纷再度升级 -BAC009S0768W0285 美国智库学学者和太阳能行业协会也多次警告 -BAC009S0768W0286 许多美国太阳太阳能制造商依赖于全球光伏供应链 -BAC009S0768W0287 并减少太阳能产业相关就业岗位 -BAC009S0768W0288 美初裁中国产轮胎倾销 -BAC009S0768W0289 据新华社电美国商务部二十一日宣布初裁结果 -BAC009S0768W0290 认定从中国进口的乘用车和轻型卡车轮胎存在倾销行为 -BAC009S0768W0291 美商务部当天发表声明说 -BAC009S0768W0292 倾销幅度从百分之十七至百分之九十九 -BAC009S0768W0293 基于倾销幅度的初裁结果 -BAC009S0768W0294 就美国对中国产轮胎发起双坊调查 -BAC009S0768W0295 中国商务部曾表示强烈反对 -BAC009S0768W0296 认为美国此举违反世界贸易组织规则和美国国内法 -BAC009S0768W0297 希望美方吸取前车之鉴 -BAC009S0768W0298 避免破坏两国相关产业的贸易与合作 -BAC009S0768W0299 据新华社电美国国际贸易委员会二十一日作出终裁 -BAC009S0768W0300 从台湾地区进口的此类产品存在倾销行为 -BAC009S0768W0301 这意味着美国将对相关产品 -BAC009S0768W0303 成飞集成百二十一九十 -BAC009S0768W0305 公司上半年营业收入六点三四亿元 -BAC009S0768W0307 从而获取用户信息的案件 -BAC009S0768W0308 杨某等四人一同在深圳成立了安丰公司 -BAC009S0768W0309 公司主要从事计算机手机的软件开发业务 -BAC009S0768W0310 由于安丰公司的业务不景气 -BAC009S0768W0311 杨某等四人经过商议 -BAC009S0768W0312 决定由麦德公司的技术部门研发静默插件 -BAC009S0768W0313 使用户在刷机过程中 -BAC009S0768W0314 不知不觉地安装上公司开发的插件 -BAC009S0768W0315 而手机被安装上这一插件后 -BAC009S0768W0316 公司不仅可以向手机推送软件广告等商业性电子信息 -BAC009S0768W0317 安丰公司的广告网页是他们推送的重要内容 -BAC009S0768W0318 他们通过这个插件已获利广告收入二十馀万元 -BAC009S0768W0319 同案被捕的马某等四人是公司技术部门的员工 -BAC009S0768W0320 软件开发是领导的授意 -BAC009S0768W0321 自己只是执行公司的工作要求 -BAC009S0768W0322 三百六十软件识别出了麦德公司的插件 -BAC009S0768W0323 将其列为恶意软件用户称其为流氓软件 -BAC009S0768W0324 马某等人进一步完善了插件 -BAC009S0768W0325 再次利用同样的静默安装方式继续推广软件 -BAC009S0768W0326 二十馀万部手机遭殃 -BAC009S0768W0327 在被公安机关查获后 -BAC009S0768W0328 警方在麦德公司数据库中发现 -BAC009S0768W0329 获取到的通讯录近两千万条 -BAC009S0768W0330 判处有期徒刑三年六个月 -BAC009S0768W0331 
其馀九人获刑一年五个月至三年不等 -BAC009S0768W0332 依据国家相关法律法规 -BAC009S0768W0333 杨某等人在明知插件功能的情况下 -BAC009S0768W0334 未经用户同意将该插件预置到呃用户手机中 -BAC009S0768W0335 非法获取用户身份认证信息 -BAC009S0768W0336 已经构成了对他人计算机信息系统的侵入控制 -BAC009S0768W0337 侵犯了公民的合法权益 -BAC009S0768W0338 强劲犀利的拳法与膝法破坏力惊人 -BAC009S0768W0339 二零一五年初在南京的笼斗中 -BAC009S0768W0340 徐永昊以雷霆万钧之势缔造了一场震撼的秒杀之作 -BAC009S0768W0341 迅即杀狠的站立技术令人惊叹不已 -BAC009S0768W0342 也是我喜欢的格斗方式 -BAC009S0768W0343 我都会对站立技术进行重点强化 -BAC009S0768W0344 让自己的攻击变得更快更狠 -BAC009S0768W0345 对于这场比赛的备战 -BAC009S0768W0346 我在重点强化站立技术的同时 -BAC009S0768W0347 也对地面技术和防摔技术上做了很多针对性的训练 -BAC009S0768W0348 对于综合能力的严苛要求 -BAC009S0768W0349 是综合格斗运动的一大特色 -BAC009S0768W0350 相较于其精湛凶猛的站立技术 -BAC009S0768W0351 徐永昊的地面技术无疑是其格斗体系中的一块短板 -BAC009S0768W0352 上一场同包尔江的比赛之后 -BAC009S0768W0353 一个强项跟弱项同样突出的拳手 -BAC009S0768W0354 是很难成为真正的王者 -BAC009S0768W0355 我必须要变的更加全面 -BAC009S0768W0356 此次昆罗决中俄对抗赛上 -BAC009S0768W0357 对于代表中国战队略先出阵的徐永昊来讲 -BAC009S0768W0358 这无疑又是一次严峻的考验 -BAC009S0768W0359 也是其对于自身技术全面性提高程度的一次检验 -BAC009S0768W0360 我这次的对手水平很高 -BAC009S0768W0361 拳法和摔跤能力很出色 -BAC009S0768W0362 而在谈及此次应敌的策略时 -BAC009S0768W0363 我不会改变自己擅长的风格 -BAC009S0768W0364 这次比赛我会用胜利证明自己的实力 -BAC009S0768W0365 二零一五年世界田径锦标赛即将在北京拉开序幕 -BAC009S0768W0366 近日德郭队公布了参加此次世锦赛的六六人大名单 -BAC009S0768W0367 上届莫斯科世锦赛上拿到了金牌的四位选手悉数出战 -BAC009S0768W0368 主教练对于这支以老带新的队伍也充满了自信 -BAC009S0768W0369 上届莫斯科世锦赛上拿到的金牌的四位选手悉数出战 -BAC009S0768W0370 包括前秋运动员维斯多尔 -BAC009S0768W0371 撑杆跳选手拉斐尔霍尔泽德佩 -BAC009S0768W0372 哈特灵今年饱受十字韧带伤势困扰 -BAC009S0768W0373 他是否接受外卡参赛要视情况而定 -BAC009S0768W0374 德国队此次以老带新 -BAC009S0768W0375 这也是他一年四记来第一次参加世锦赛 -BAC009S0768W0376 也有经验丰富的老队员 -BAC009S0768W0377 我相信每个人都会付出一切来为团队力争最好的成绩 -BAC009S0768W0378 附二零一五田径世锦赛德国队名单 -BAC009S0768W0379 一百米塞文基尼菲尔斯 -BAC009S0768W0380 二百米罗宾埃尔瓦 -BAC009S0768W0381 八百米罗宾斯切姆贝拉 -BAC009S0768W0382 五千米理查德灵格 -BAC009S0768W0383 一万米阿尔恩加比乌斯 -BAC009S0768W0384 一百一十米栏马特里亚斯布赫雷尔 -BAC009S0768W0385 格里格尔特拉贝尔 -BAC009S0768W0386 马特伍兹菲兹比亚尔科 -BAC009S0768W0387 撑杆跳拉斐尔霍尔泽德斯佩 -BAC009S0768W0388 托比亚斯斯切尔巴尔斯 -BAC009S0768W0389 跳远阿莱恩卡马拉 -BAC009S0768W0390 铅球达维斯多尔 -BAC009S0768W0391 铁饼克里斯托弗哈特灵 -BAC009S0768W0392 标枪拉尔斯哈曼恩 -BAC009S0768W0393 全能里科费雷姆斯 -BAC009S0768W0394 迈克尔斯齐莱德尔 -BAC009S0768W0395 二零千米竞走尼尔斯布莱姆巴号 -BAC009S0768W0396 五零千米竞走卡尔多赫曼 -BAC009S0768W0397 四乘一百米接力罗伯特哈特灵 -BAC009S0768W0398 卢卡斯亚库比泽克 -BAC009S0768W0399 亚历山大克塞诺科夫 -BAC009S0768W0400 雅莱克斯欧帕拉迪尼门格 -BAC009S0768W0401 一百米莱贝卡哈塞 -BAC009S0768W0402 吉娜卢克肯科姆普尔 -BAC009S0768W0403 八百米克里斯蒂娜哈灵 -BAC009S0768W0404 这部命运多旭的电影 -BAC009S0768W0405 原本计划在今年六月正式开机 -BAC009S0768W0406 可现在已经全部泡汤 -BAC009S0768W0407 科林之前已经积极的支持影片拍摄 -BAC009S0768W0408 圆圆的脸蛋非常的可爱 -BAC009S0768W0409 此照片萌翻众网友 -BAC009S0768W0410 纷纷留言点赞 -BAC009S0768W0411 称哈哈哈性感的不要不要的 -BAC009S0768W0412 自小卖得一脸好萌 -BAC009S0768W0413 搜狐娱乐讯据香港媒体报道 -BAC009S0768W0414 诞下很像天华的小宝贝 -BAC009S0768W0415 一向都是在圈子中人缘甚佳的谢天华 -BAC009S0768W0416 使得宝宝刚出生就有了一大班星星级干爹干娘 -BAC009S0768W0417 搜狐娱乐讯据香港媒体报道 -BAC009S0768W0418 艺人谢婷婷出席活动时 -BAC009S0768W0419 被问到有传其胞兄谢霆锋将与王菲结婚 -BAC009S0768W0420 她回应是么 -BAC009S0768W0421 没有人同我讲 -BAC009S0768W0422 好多传闻我都不会特别问他 -BAC009S0768W0423 是真的话他自己会同我讲 -BAC009S0768W0424 想不想他再次成家立室 -BAC009S0768W0425 他开心就好 -BAC009S0768W0426 不过要看他心情工作同家人相处同小朋友 -BAC009S0768W0427 各样都平衡得好处理得好 -BAC009S0768W0428 结婚都只是一张纸同戒指 -BAC009S0768W0430 搜狐娱乐讯据香港媒体报道 -BAC009S0768W0431 为了给自己的爱犬盖狗舍及休息场所 -BAC009S0768W0432 他指使他人虚开发票六万馀元用公款报销 -BAC009S0768W0433 贾某被市三中院终审判处有期徒刑两年八个月 -BAC009S0768W0434 村书记被村民驾车撞倒身亡肇事者已被刑拘 -BAC009S0768W0435 京华时报讯记者迟名常鑫前天中午近一一点半 -BAC009S0768W0436 大兴区礼贤镇紫各庄村 -BAC009S0768W0437 村书记乔俊然在家门前被一辆轿车撞倒后 -BAC009S0768W0438 肇事者为紫各庄村民乔某 -BAC009S0768W0439 大兴警方以涉嫌交通肇事罪将肇事者刑事拘留 -BAC009S0768W0440 案件正在进一步调查中 -BAC009S0768W0441 村书记骗补助被判一一年受审辩称不了解政策 -BAC009S0768W0442 新京报讯记者王巍利用村里遭受泥石流灾害后 -BAC009S0768W0443 
政府出钱搬迁盖房的机会 -BAC009S0768W0444 延庆县永宁镇偏坡峪村原党支部书记钱某 -BAC009S0768W0445 将不应享受政府的两个女儿作为搬迁户上报 -BAC009S0768W0446 骗取搬迁补助资金用于支付搬迁安置房费用 -BAC009S0768W0447 延庆法院一审判决认为 -BAC009S0768W0448 钱某贪污一二馀万元拆迁款 -BAC009S0768W0449 判处有期徒刑一一年 -BAC009S0768W0450 村内常有蛇出没疑从养蛇村民中爬出 -BAC009S0768W0451 信息时报讯记者陈子玉近日 -BAC009S0768W0452 白云区钟落潭竹一村民白云区钟落潭竹一村的村民跟记者报料 -BAC009S0768W0453 说最近他们村里经常有蛇出没 -BAC009S0768W0454 甚至还会爬到村民家中 -BAC009S0768W0455 他们怀疑是有人在村里养蛇所致 -BAC009S0768W0456 蛇主刘先生表示以后将不在家里养蛇 -BAC009S0768W0457 村医研发神奇止痛药网销全全国获刑三年 -BAC009S0768W0458 村卫生室医师兼职黑b超记者暗访结束被跟踪 -BAC009S0768W0459 明着是大兴区黄村镇狼垡三村的医师 -BAC009S0768W0460 暗地里却发布小广告揽客 -BAC009S0768W0461 村妇为缓解丈夫病痛种罂丽当药用被判刑六个月 -BAC009S0768W0462 曲靖一村妇竟在自家菜地内非法种植罂丽一零四二株 -BAC009S0768W0463 用罂丽熬汤为丈夫止痛 -BAC009S0768W0464 该村妇因犯非法种植毒品原植物罪 -BAC009S0768W0465 被麒麟区法院判处有期徒刑六个月 -BAC009S0768W0466 并处罚金人民币一千元 -BAC009S0768W0467 村妇将一零万元现金埋地底四年多已腐烂成碎块 -BAC009S0768W0468 村委会主任因经济问题两次被免第三次当选惹争议 -BAC009S0768W0469 张绵跃当选村委会主任 -BAC009S0768W0470 村委会在农田搭起违法建筑每年收租金一四万元 -BAC009S0768W0471 奉化江口儒江村村委会却带头盖起了违法建筑 -BAC009S0768W0472 记者接到这样的报料 -BAC009S0768W0473 三改一拆可以说是一条红线 -BAC009S0768W0474 村委会竟然会顶风作案 -BAC009S0768W0475 记者和宁波市三三改一拆办工作人员前往现场 -BAC009S0768W0476 这听起来多少有些匪夷所思的违建竟然是真的 -BAC009S0768W0477 村官一顿工作餐吃二六个菜挂钩蹲点领导被诫勉谈话 -BAC009S0768W0478 一顿工作餐竟上二六个菜 -BAC009S0768W0479 且逢餐必有烟酒从园区领导到村组干部 -BAC009S0768W0480 在严查四风的高压态势下 -BAC009S0768W0481 以公务招待为名大肆公款吃喝 -BAC009S0768W0482 村官借四零零多户居民三亿一携款失联 -BAC009S0768W0483 村官接连顶风违纪其子领证为热闹摆酒六七桌 -BAC009S0768W0484 编者按为深入贯彻落实中央八项规定精神 -BAC009S0768W0485 按照中央纪委宣传部的统一部署 -BAC009S0768W0486 陆续派记者深入采访 -BAC009S0768W0487 进一步加大舆论监督力度 -BAC009S0768W0488 通报一个教育一批震灭一片 -BAC009S0768W0489 释放出中央执纪必严紧抓不放的强烈信号 -BAC009S0768W0490 广大领导干部要以引以为戒守住底线 -BAC009S0768W0491 坚决不在四风问题上犯错犯错误跌跟头 -BAC009S0768W0492 村官涉不雅视频被免职饭桌上摸女子胸部臀部等 -BAC009S0768W0493 村官私刻公章侵占二八万粮补派人殴打上访村民 -BAC009S0768W0494 党和人民不会管到我身上来 -BAC009S0768W0495 侵吞征地种粮补偿款 -BAC009S0769W0121 该地块即为通州新城核心地标彩虹之门用地 -BAC009S0769W0122 北京通州新城投资公司网站显示 -BAC009S0769W0123 彩虹之门建筑净高三十米 -BAC009S0769W0124 为双拱形非中心对称建筑 -BAC009S0769W0125 新京报讯记者张旭报道 -BAC009S0769W0126 北京去年土地出让落下大幕 -BAC009S0769W0127 在丰台樊家村一宗商业用地底价成交后 -BAC009S0769W0128 北京今年的土地出让金锁定在两千亿元 -BAC009S0769W0129 同比去年增长五成 -BAC009S0769W0130 市政府决定今年将全面加快棚户区改造步伐 -BAC009S0769W0131 确保完成六万户搬迁改造任务 -BAC009S0769W0132 推进上百个棚改项目全面启动实施 -BAC009S0769W0133 今年北京要建设筹集各类保障房十万套 -BAC009S0769W0134 各区县力争完成十五万套开工任务竣工八万套 -BAC009S0769W0135 开工建设公租房不低于三万套 -BAC009S0769W0136 为了确保保障房住宅的优良品质 -BAC009S0769W0137 北京将继续改进住宅产业化推进方式 -BAC009S0769W0138 推行标准化装配式装修 -BAC009S0769W0139 前年至今年期间 -BAC009S0769W0140 北京要筹集建设各类保障性住房一百万套 -BAC009S0769W0141 为改善中低收入家庭住房条件 -BAC009S0769W0142 今年北京除了建设保障性住房外 -BAC009S0769W0143 还加大棚户区的改造任务 -BAC009S0769W0144 各区县各单位要按照下达的任务指标 -BAC009S0769W0145 确保完成今年六万户棚户区改造任务 -BAC009S0769W0146 今年是十二五规划的收官之年 -BAC009S0769W0147 各区县各单位要加强协作配合 -BAC009S0769W0148 要重点加大政策支持 -BAC009S0769W0149 破解棚户区改造征收瓶颈问题 -BAC009S0769W0150 各相关部门要主动服务区县服务各参建单位 -BAC009S0769W0151 对于今后棚户区改造中遇到的问题 -BAC009S0769W0152 各项目标任务已分解至各区县 -BAC009S0769W0153 今年北京将继续加大集体土地建设公租房试点力度 -BAC009S0769W0154 加快公租房的配租进度 -BAC009S0769W0155 力争配租三万户以上 -BAC009S0769W0156 今年北京还将加大社会单位泵租力度 -BAC009S0769W0157 市政府决定今年将全面加快棚户区改造步伐 -BAC009S0769W0158 确保完成六万户搬迁改造任务 -BAC009S0769W0159 今年土地收入预计近四万亿元 -BAC009S0769W0160 今年国有土地使用权出让收入四千亿元 -BAC009S0769W0161 继前年和去年连续两年突破四万亿元后 -BAC009S0769W0162 今年土地收入再维持稳定 -BAC009S0769W0163 相关公司股票走势 -BAC009S0769W0164 房地产市场竞争加大 -BAC009S0769W0165 房企应走差异化路线 -BAC009S0769W0166 还有多家机构分析认为 -BAC009S0769W0167 政府对今年的土地出让金收入预期下降 -BAC009S0769W0168 这暗示房地产的库存大 -BAC009S0769W0169 这直接影响到房地产的买地情况 -BAC009S0769W0170 相应的房价涨跌 -BAC009S0769W0171 如今房地产市场已经供需相对平衡 -BAC009S0769W0172 甚至开始进入了供过于求的局面 -BAC009S0769W0173 但去年住宅土地成交建筑面积仅十二亿平米 
-BAC009S0769W0174 远低于去年和前年平均的二十亿平米水平 -BAC009S0769W0175 除了开发商的买地行为减少 -BAC009S0769W0176 全国房地产库存正在堆积 -BAC009S0769W0177 而出让的住宅建筑面积总和至少为一百亿平米 -BAC009S0769W0178 约可供销售四年 -BAC009S0769W0179 开发商整体在手土地充足 -BAC009S0769W0180 瑞银分析师丁晓预测 -BAC009S0769W0181 预计明年全国土地市场仍难复苏 -BAC009S0769W0182 各路开发商一致看好一线城市房地产市场 -BAC009S0769W0183 从一月的一线城市的土地成交看 -BAC009S0769W0184 溢价率楼面价均处于高位 -BAC009S0769W0185 预计后市一二线城市拿地竞争将更加剧烈 -BAC009S0769W0186 中原地产首席市场分析师张大伟告诉南都记者 -BAC009S0769W0187 并进而对城投债券进行唱空或做空 -BAC009S0769W0188 最近企业债券特别是城投债券的发行难度加大 -BAC009S0769W0189 发行利率也有较大幅度上升 -BAC009S0769W0190 人民银行多次提高存款准备金率和存贷款基准利率 -BAC009S0769W0191 不仅是城投债券发行利率 -BAC009S0769W0192 债券市场所有品种发行利率整体上都表现出向上的走向 -BAC009S0769W0193 导致城投债券发行产生较高的风险溢价 -BAC009S0769W0194 城投债券收益率上升 -BAC009S0769W0195 对债券投资人来说不是坏事 -BAC009S0769W0196 有利于提升城投债券的资产配置价值 -BAC009S0769W0197 则需要在发债时机和发债规模上进行合理的把握 -BAC009S0769W0198 我个人不赞成这一判断 -BAC009S0769W0199 债券发行人是优质的 -BAC009S0769W0200 还本付息也是正常的 -BAC009S0769W0201 投资者对城投债券风险表现出的恐慌 -BAC009S0769W0202 加强城投债监管完善制度建设 -BAC009S0769W0203 有的媒体甚至用井喷来描述 -BAC009S0769W0204 您如何看待城投债券这几年的发展和作用 -BAC009S0769W0205 这几年城投债券发行数量的确有所增加 -BAC009S0769W0206 地方投融资平台公司通过发行债券进行融资 -BAC009S0769W0207 符合提高直接融资比重的要求 -BAC009S0769W0208 城投债券也适应了发行人和投资人的需要 -BAC009S0769W0209 这是这几年城投债券发行规模不断扩大的主要原因 -BAC009S0769W0210 我委核准发行的企业债券累计为七千亿元 -BAC009S0769W0211 其中城投债券共发行七千亿元 -BAC009S0769W0212 占比只有百分之七 -BAC009S0769W0213 城投债券的发行有比较严格的条件 -BAC009S0769W0214 从已发行的城投债券用途看 -BAC009S0769W0215 保障房建设和棚户区改造 -BAC009S0769W0216 城市文化和体育设施 -BAC009S0769W0217 地震灾后重建等领域 -BAC009S0769W0218 都起到了积极的作用 -BAC009S0769W0219 随着我国资本市场的进一步发展 -BAC009S0769W0220 城投债券作为中国债券市场的准市政债 -BAC009S0769W0221 发行规模还会稳步扩大 -BAC009S0769W0222 中国证券报面对市场对城投债券风险的担忧 -BAC009S0769W0223 是如何更好地防范城投债券可能出现的风险的 -BAC009S0769W0224 虽然已发行的城投债券的还本付息都是正常的 -BAC009S0769W0225 城投债作为一个信用产品 -BAC009S0769W0226 不可能是完全无风险的 -BAC009S0769W0227 我看了以后很受震动 -BAC009S0769W0228 虽然报道内容并没有具体的城投债券还本付息违约案 -BAC009S0769W0229 但却提醒了我们要更加关注城投债券可能出现的风险 -BAC009S0769W0230 并采取措施切实保护债券投资人的合法权益 -BAC009S0769W0231 作为城投债券发行监管部门 -BAC009S0769W0232 我们对城投债券发行人的审核一直是比较严格的 -BAC009S0769W0233 地方投融资平台公司申请发行债券 -BAC009S0769W0234 必须符合一些基本的条件企业必须连续三年盈利 -BAC009S0769W0235 所投项目必须经过合规性审查 -BAC009S0769W0236 我们还控制了投融资平台公司发债的范围 -BAC009S0769W0237 才能申请发行城投债券 -BAC009S0769W0238 就不得再通过发行城投债券新增政府性债务 -BAC009S0769W0239 正是有了这样一些严格的规定 -BAC009S0769W0240 使得很多投融资平台公司 -BAC009S0769W0241 难以满足发行城投债券的资格和条件 -BAC009S0769W0242 这在相当程度上控制了城投债劵的发行规模 -BAC009S0769W0243 也降低了城投债劵的风险 -BAC009S0769W0244 为了控制地方政府本届发债下届还钱的道德风险 -BAC009S0769W0245 我们还安排了专门的偿债均摊机制 -BAC009S0769W0246 也就是将债劵还本压力在债劵存续期内进行合理分摊 -BAC009S0769W0247 避免在最后一年累积过大的还本压力和风险 -BAC009S0769W0248 有媒体报道了云投集团等发债企业转移核心资产 -BAC009S0769W0249 损害债劵持有人利益的事件 -BAC009S0769W0250 并对债券市场形成了不小的冲击 -BAC009S0769W0251 你们如何考虑防止这类事件再次发生 -BAC009S0769W0252 更好地保护债券投资人的利益 -BAC009S0769W0253 据新华社电有病当然要吃药 -BAC009S0769W0254 但吃下去的药能否真正作用到病灶就很难说了 -BAC009S0769W0255 通过它能够实现药物的精准投送 -BAC009S0769W0256 他们开发出一种只有二十微米长的机器人 -BAC009S0769W0257 这个机器人由高分子材料制成 -BAC009S0769W0258 当它进入动物胃部时 -BAC009S0769W0259 锌就会与胃酸发生反应 -BAC009S0769W0260 从而推动机器人在胃部前行 -BAC009S0769W0261 这种技术很适合用来治疗胃溃疡等胃部疾病 -BAC009S0769W0262 高效精准投送药物不仅可降低用药量 -BAC009S0769W0263 这项技术离临床应用还有一段距离 -BAC009S0769W0264 据新华社电有病当然要吃药 -BAC009S0769W0265 但吃下去的药能否真正作用到病灶就很难说了 -BAC009S0769W0266 美国政府部门当地时间周四警示称 -BAC009S0769W0267 苹果设备的用户应当注意 -BAC009S0769W0269 不要在弹出窗口点击安装打开应用时 -BAC009S0769W0271 苹果公司也在第一时间发布官方声明 -BAC009S0769W0273 还没有任何一个用户真正遭受过此攻击 -BAC009S0769W0274 我们鼓励用户只从可信任的渠道 -BAC009S0769W0276 并注意下载过程中的任何警告 -BAC009S0769W0277 企业用户在安装定制应用程序时 -BAC009S0769W0278 须从他们公司的安全网站上进行下载并安装 -BAC009S0769W0279 美国政府部门当地时间周四警示称 -BAC009S0769W0280 苹果设备的用户应当注意 -BAC009S0769W0283 据新华社电印度官员透露 -BAC009S0769W0284 
美国将向印度转让两项军事技术 -BAC009S0769W0285 其中包括美国大鸦无人机今后将由印度工厂制造 -BAC009S0769W0286 印度斯坦时报二十四日援引消息人士的话报道 -BAC009S0769W0287 大鸦无人机由美国航空环境公司研制 -BAC009S0769W0288 由士兵直接用手投掷起飞 -BAC009S0769W0289 二零零三年以来在阿富汗得到了广泛应用 -BAC009S0769W0290 预计从二零一五年下半年开始 -BAC009S0769W0291 美国将不再生产大鸦无人机 -BAC009S0769W0292 改由设在印度本加卢鲁的一家美印合资公司生产 -BAC009S0769W0293 一名印度高级官员透露 -BAC009S0769W0294 眼下已有七个国家打算购买大鸦无人机 -BAC009S0769W0295 预计订单总额为三十亿美元 -BAC009S0769W0296 美国航空环境公司停止生产大鸦无人机后 -BAC009S0769W0297 印方工厂将继续完成剩馀订单 -BAC009S0769W0298 此外还将与美方联手研制一款升级版大鸦无人机 -BAC009S0769W0299 该技术可用于识别隐藏于伪装下的目标 -BAC009S0769W0300 从而把运输机转化为更为复杂的远程侦察机 -BAC009S0769W0301 美国外交消息人士透露 -BAC009S0769W0302 肯德尔定于二月二十三日访问印度 -BAC009S0769W0303 且达到情节特别严重程度 -BAC009S0769W0304 故依法裁定驳回上诉 -BAC009S0769W0306 从而获取用户信息的案件 -BAC009S0769W0308 虽然工信部很快就删除了后半句话 -BAC009S0769W0309 但还是引发业内广泛关注 -BAC009S0769W0310 这种宣传方式的目的是什么 -BAC009S0769W0311 截至中国经营报记者发稿前 -BAC009S0769W0312 浪潮官方尚未给出回应 -BAC009S0769W0313 旗下拥有浪潮信息浪潮软件浪潮国际三家上市公司 -BAC009S0769W0314 尽管政府对国产品牌有一定扶持 -BAC009S0769W0315 浪潮的发展也有可圈可可点之处 -BAC009S0769W0318 浪潮信息的研发支出约四亿元 -BAC009S0769W0319 占营业收入的比例是五点百分之四十七 -BAC009S0769W0320 较上年同期增长八十四点百分之三十九 -BAC009S0769W0321 研发支出主要用于服务器产品的研究开发和升级换代 -BAC009S0769W0322 研发投入是一个刚性指标 -BAC009S0769W0323 与技术的更新换代速度还是有相关性 -BAC009S0769W0324 国内几个服务器品牌的盘子还比较小 -BAC009S0769W0325 他们的硬件技术研发等力量 -BAC009S0769W0326 经验积累不足也是一个大问题 -BAC009S0769W0327 国产服务器即使是自主设计 -BAC009S0769W0329 核心架构也基本照抄国外厂商 -BAC009S0769W0330 在中低端市场或占有相应份额 -BAC009S0769W0331 但高端市场仍然难以企及 -BAC009S0769W0332 一位股份制银行科技部负责人如此讲述 -BAC009S0769W0334 国内厂商在高端核心技术上普遍存有差距 -BAC009S0769W0336 浪潮高管在接受媒体采访时表示 -BAC009S0769W0337 浪潮将通过产品渠道服务价格的全方位发力 -BAC009S0769W0338 一百米栏辛迪罗勒德尔 -BAC009S0769W0339 三千米障碍吉萨费里欣塔斯卡鲁塞 -BAC009S0769W0340 跳高玛丽劳伦斯荣格菲利斯 -BAC009S0769W0341 撑杆跳丽萨莱兹奇 -BAC009S0769W0342 跳远莱纳马尔库斯 -BAC009S0769W0343 三级跳克里斯丁吉尔奇 -BAC009S0769W0344 铅球克里斯蒂娜斯齐万兹 -BAC009S0769W0345 铁饼沙尼斯克拉夫特 -BAC009S0769W0346 链球贝蒂海德尔 -BAC009S0769W0347 标枪克里斯丁胡宋 -BAC009S0769W0348 克里斯蒂娜奥伯福尔 -BAC009S0769W0349 全能詹妮弗奥赛尔 -BAC009S0769W0350 四乘一百米接力亚历山大布尔格哈德特 -BAC009S0769W0351 安娜莱纳法拉塞 -BAC009S0769W0352 吉娜卢克肯科姆普尔 -BAC009S0769W0353 孙杨因心脏不适退出一千五百米自由泳决赛 -BAC009S0769W0354 无疑是刚刚结束的喀山世锦赛最大的遗憾 -BAC009S0769W0355 孙杨在一千五百米自由泳上的实力不容置疑 -BAC009S0769W0356 而这一次击败他的不是对手 -BAC009S0769W0357 孙杨的心脏不适早就不是秘密 -BAC009S0769W0358 是孙杨在二零一四年因治疗心脏不适 -BAC009S0769W0359 误服曲美他嗪导致兴奋剂检测呈阳性遭禁赛 -BAC009S0769W0360 正是治疗他心悸不适症状的 -BAC009S0769W0361 也第一次被媒体关注 -BAC009S0769W0362 记者从浙江省游泳协会了解到 -BAC009S0769W0363 孙杨就出现过心脏问题 -BAC009S0769W0364 孙杨因感冒后出现了胸闷心悸不适等症状 -BAC009S0769W0365 专家会诊之后认为孙杨存在心肌缺血情况 -BAC009S0769W0366 与感冒病毒感染损伤心肌有关 -BAC009S0769W0367 予服用处方药以治疗心肌缺血保护心肌 -BAC009S0769W0368 孙杨的心肌损伤是在感冒后引发的 -BAC009S0769W0369 心肌同位素扫描显示局部灌注差 -BAC009S0769W0370 达到保护心脏的作用 -BAC009S0769W0371 是去年备战亚运会选拔赛期间 -BAC009S0769W0372 直到二零一四年四月才解禁复出 -BAC009S0769W0373 尽管期间孙杨的训练并没有中断 -BAC009S0769W0374 但训练量几乎和正常时不可同日而语 -BAC009S0769W0375 为了备战亚运会选拔赛 -BAC009S0769W0376 在世锦赛决赛检录前突感不适 -BAC009S0769W0377 也是孙杨整个比赛期间疲劳所致 -BAC009S0769W0378 从四百米预赛到最后的一千五百米预赛 -BAC009S0769W0379 二百米的高强度无氧到一千五百米的有氧 -BAC009S0769W0380 师姐罗雪娟也忍不住落泪 -BAC009S0769W0381 回忆起自己从前训练时因心脏不适被抢救的事 -BAC009S0769W0382 更大的战场还在里约 -BAC009S0769W0383 华西都市报记者陈甘露 -BAC009S0769W0384 二零零八年北京奥运会时 -BAC009S0769W0385 曾经在鸟巢服务的志愿者们 -BAC009S0769W0386 顶级田径赛事再次落户鸟巢 -BAC009S0769W0387 如今为这次赛事服务的志愿者们更为年轻 -BAC009S0769W0388 他们几乎都是九零后 -BAC009S0769W0389 这批志愿者也被称为新鸟巢一代 -BAC009S0769W0390 而他们已经为这次田径世锦赛做好了准备 -BAC009S0769W0391 并要为国内外运动员献上一张张北京最美的名片 -BAC009S0769W0392 在每次大型赛事中志愿者都是必不可少的一部分 -BAC009S0769W0393 他们也是历届大赛的一个亮点 -BAC009S0769W0394 总共有二千七百六十人来为这项大赛志愿服务 -BAC009S0769W0395 他们最大的特点就是九零后占主角 -BAC009S0769W0396 比例超过百分之九十四的志愿者是九零后 -BAC009S0769W0397 在今年世锦赛的志愿者中 -BAC009S0769W0398 有的人还会八国语言 
-BAC009S0769W0399 志愿者除了要具备流利的英语交流能力外 -BAC009S0769W0400 还要求具备大型赛会或日常从事社会志愿服务的经验 -BAC009S0769W0401 北京青年报记者昨日在鸟巢采访了一些志愿者 -BAC009S0769W0402 发现他们中间真有不少是志愿达人 -BAC009S0769W0403 例如在竞赛部赛后控制中心的陈田希 -BAC009S0769W0404 也等待了很长时间 -BAC009S0769W0405 但最终却因为出品公司相对论影业申请破产 -BAC009S0769W0406 而不得不离开这个项目 -BAC009S0769W0407 乌鸦在没有其他公司愿意接手的情况下 -BAC009S0769W0408 谈到前日爸爸谢贤在宣传活动上出手打曾江 -BAC009S0769W0409 婷婷指收到消息时正在拍摄广告 -BAC009S0769W0410 亦未联络到爸爸了解他不是一个常打架的人 -BAC009S0769W0411 他是一个大人 -BAC009S0769W0412 他一定有他的原因 -BAC009S0769W0413 又指自己未试过受爸爸体罚 -BAC009S0769W0415 婷婷就坦言靠传媒得知 -BAC009S0769W0416 但会给哥哥谢霆锋传短信了解情况 -BAC009S0769W0417 中新网六月二十四日电六月二十三日 -BAC009S0769W0418 谢霆锋妹妹谢婷婷在微博晒出与父亲合影 -BAC009S0769W0419 谢婷婷将头挨着父亲的头 -BAC009S0769W0420 二人一脸笑容 -BAC009S0769W0421 搜狐娱乐讯据香港媒体报道 -BAC009S0769W0422 艺人谢婷婷从小就成为媒体焦点 -BAC009S0769W0423 而有鬼妹仔性格的婷婷不时以性感打扮亮相 -BAC009S0769W0424 她去游泳解暑 -BAC009S0769W0425 还在网上分享身穿比基尼泳装照 -BAC009S0769W0426 这种天气很适合搞池边派对 -BAC009S0769W0427 中新网五月二十一日报道据香港明报消息 -BAC009S0769W0428 谢婷婷为服装拍摄时装宣传照 -BAC009S0769W0429 她透露现在父母哥哥谢霆锋都各忙各的 -BAC009S0769W0430 一家人很难有机会团聚 -BAC009S0769W0431 施王祥被陆丰市纪委立案调查 -BAC009S0769W0432 二零一三一二二六 -BAC009S0769W0433 二零一四三七 -BAC009S0769W0434 陆丰市纪委决定给予施王祥开除党籍处分 -BAC009S0769W0435 南粤清风网通报该案详情 -BAC009S0769W0436 村官遭判刑处罚证据涉嫌造假公检法自查迟迟无果 -BAC009S0769W0437 山西省临汾市尧都区刘村镇刘南村一零名村干部 -BAC009S0769W0438 因决定取消刁天恩的土地承包合同移栽地上树苗 -BAC009S0769W0439 被法院以故意毁坏财物罪判刑或处罚 -BAC009S0769W0440 村小老师自掏腰包八零零零元为贫困生设奖学金 -BAC009S0769W0441 薛孝文在学生家中家访 -BAC009S0769W0442 从金堂县城驱车一个半小时至土桥镇的大禹村 -BAC009S0769W0443 就到了薛孝文任教的学校金堂县平桥学校 -BAC009S0769W0444 乡间公路也就四米宽 -BAC009S0769W0445 薛孝文还在给学生上课 -BAC009S0769W0446 在年轻时也有着跳龙门的梦 -BAC009S0769W0447 他辗转三所乡村学校 -BAC009S0769W0448 村干部大闹天宫孙大圣口碑爆棚 -BAC009S0769W0449 村干部强揽工程遭拒绝雇百名老人阻挠施工 -BAC009S0769W0450 犯罪嫌疑人刘德怀等六人被刑拘 -BAC009S0769W0451 村干部靠打架成名被抓后喊我是市人大代表 -BAC009S0769W0452 和平花苑现已更名为龙和华府 -BAC009S0769W0453 村庄晴天降奇冰十几斤重来历不明 -BAC009S0769W0454 天上掉下一块重约十几斤的冰块 -BAC009S0769W0455 虽然事情过去三天了 -BAC009S0769W0456 但嵩县德亭镇大王沟村村民们仍感到好奇 -BAC009S0769W0457 一零月一零日临近中午 -BAC009S0769W0458 砸到了村民的菜地里 -BAC009S0769W0459 还把地面砸了个大坑 -BAC009S0769W0460 附近村民闻讯纷纷赶来瞧个新鲜 -BAC009S0769W0461 捡拾一些冰块回家冰冻保存 -BAC009S0769W0462 专家排除了冰雹和飞机上落冰的两种可能 -BAC009S0769W0463 这块天降奇冰究竟是何物 -BAC009S0769W0464 村庄现两名村支书假支书无名有实村内掌权 -BAC009S0769W0465 村庄遭人倾倒数百吨化工废料附近植物全空死 -BAC009S0769W0466 非法倾倒数百吨化工废料 -BAC009S0769W0467 村民生活因此发生巨变井水变味田地减产前日 -BAC009S0769W0468 该村村民黎胜明向楚天快报求助 -BAC009S0769W0469 希望相关部门能处理此事 -BAC009S0769W0470 村支书一周只上二小时班村民称反映会遭报复 -BAC009S0769W0471 村支书上班时间带彩娱乐神秘人曝光视频证据 -BAC009S0769W0472 视频中正在带彩娱乐的灰衣男 -BAC009S0769W0473 被警方确认为新农村党支部书记毛家文 -BAC009S0769W0474 村支书为考公务员改小一零岁一四岁时三月内生两子 -BAC009S0769W0475 淅川县上集镇一名村支书被指将年龄改小一零岁 -BAC009S0769W0476 图为时上集镇派出所 -BAC009S0769W0477 三个月内连生两个儿子 -BAC009S0769W0478 村支书将两女儿家七口人列为搬迁户骗领搬迁款 -BAC009S0769W0479 村支书违法占地建加油站多部门介入处罚仍未拆 -BAC009S0769W0480 浙江在线零九月二一日讯浙江日报记者季建荣近日 -BAC009S0769W0481 村民多次向温岭市有关部门投诉反映 -BAC009S0769W0482 但问题至今没有解决 -BAC009S0769W0483 村支书违规建小产权房花钱买通所有关系 -BAC009S0769W0484 都说下属有困难找领导 -BAC009S0769W0485 灵璧县韦集镇韦集村原村支书石某 -BAC009S0769W0486 就花钱请领导为他撑腰 -BAC009S0769W0487 村支书遭集体举报买鼠药欲投毒报复村民 -BAC009S0769W0488 本报一零月五日讯国庆长假 -BAC009S0769W0489 省纪委要求新闻媒体主动参与到纠四风监督工作中 -BAC009S0769W0490 强化媒体根据群众举报开展调查采访和舆论监督 -BAC009S0769W0491 发生了一起村民举报村支书贪腐 -BAC009S0769W0492 村支书以在全村井水中投毒以报复村民的离奇事件 -BAC009S0769W0493 村支书醉驾撞伤孕妇刑满释放后仍当人大代表 -BAC009S0769W0494 华江瑶族乡十四届人大代表会第五次会议会务材料上 -BAC009S0769W0495 于二零一四年四月一九日晚 -BAC009S0770W0121 住宅土地出让金及成交面积均大幅下降 -BAC009S0770W0122 开发商进驻一二线城市 -BAC009S0770W0123 抛售三线城市 -BAC009S0770W0124 遇到毛利率低的问题 -BAC009S0770W0125 发现土地成本占比持续提升 -BAC009S0770W0126 目前全国该指标 -BAC009S0770W0127 一线城市超过三成 -BAC009S0770W0128 三线及以下为一成 -BAC009S0770W0129 一二线城市用地紧张 -BAC009S0770W0130 房地产商需要解决毛利率低的问题 
-BAC009S0770W0131 中指院广州公司总经理张化学向南都记者表示 -BAC009S0770W0132 三线城市库存积压又逼倒房地产商在一线城市抢地 -BAC009S0770W0133 建议房企不要一味强调做大 -BAC009S0770W0134 可以重点关注如何做强 -BAC009S0770W0135 在自身优势领域发力 -BAC009S0770W0136 发现无论是千亿巨头地产商 -BAC009S0770W0137 多数在积极剑指一线城市 -BAC009S0770W0138 从今年房企的买地情况来看 -BAC009S0770W0139 今年万科拿下九宗地块 -BAC009S0770W0140 包括五个一二线城市 -BAC009S0770W0141 保利地产开始进军成都珠海 -BAC009S0770W0142 中海地产作为国企龙头 -BAC009S0770W0143 也在厦门拿下几宗商住用地和济南几宗居住用地 -BAC009S0770W0144 在房企扎堆一二线城市时 -BAC009S0770W0145 更致命的是中小房企在融资方面的短板 -BAC009S0770W0146 相比千亿房企的借贷利率 -BAC009S0770W0147 中小房企要面临高达两位数利率 -BAC009S0770W0148 中国市场空间多样化 -BAC009S0770W0149 房企除了像千亿地产一样做大 -BAC009S0770W0150 在某一方面找到自己的企业竞争力 -BAC009S0770W0151 行业的玩家门槛越来越高 -BAC009S0770W0152 主动退出和寻求并购的中小开发商增多 -BAC009S0770W0153 大开发商有机会借此提高行业集中程度 -BAC009S0770W0154 张大伟向南都记者分析 -BAC009S0770W0155 在三线城市库存高攀销售停滞的情况下 -BAC009S0770W0156 没有雄厚的资金良好的业绩以及成熟的融资平台 -BAC009S0770W0157 似乎难以在一二线城市站稳 -BAC009S0770W0158 房地产业将在明年有所洗牌 -BAC009S0770W0159 点击进入股友会参与讨论 -BAC009S0770W0160 今年国有土地出让权收入四千亿元 -BAC009S0770W0161 今年房地产市场地域分化将加剧 -BAC009S0770W0162 政策放松和高库存背景下 -BAC009S0770W0163 开发商均面临不均衡的复苏前景 -BAC009S0770W0164 今年中国房地产开发商仍将面临供应过剩 -BAC009S0770W0165 房价不太可能强劲反弹 -BAC009S0770W0166 房地产在不同城市之间的复苏也将存在分化 -BAC009S0770W0167 一线城市或将复苏率先复苏 -BAC009S0770W0168 三四线城市可能在继续因高库存而承压 -BAC009S0770W0169 中国房地产的市场价格和销量将继续调整 -BAC009S0770W0170 但下半年的销售可能会回升 -BAC009S0770W0171 开发商只需选择继续降价 -BAC009S0770W0172 尤其是在三四线城市 -BAC009S0770W0173 中国经济增速放缓的背景下 -BAC009S0770W0174 预期政府将继续放松政策 -BAC009S0770W0175 而政府放松限购按揭和内地融资政策 -BAC009S0770W0176 房地产需求可能会上升 -BAC009S0770W0177 这将有助于开发商明年维持销量 -BAC009S0770W0178 政府放松政策对房地产销售的正面影响可能会提升 -BAC009S0770W0179 标普信用分析师孔磊说道 -BAC009S0770W0180 关于明年的房价走势 -BAC009S0770W0181 标普在基准情景假设下的预期是 -BAC009S0770W0182 明年平均售价将维持不变 -BAC009S0770W0183 销售额则将维持不变 -BAC009S0770W0184 房地产价格调整还未完全结束 -BAC009S0770W0185 未来一年内中国房地产价格不太可能强劲反弹 -BAC009S0770W0186 虽然过去一年一些获评级开发商的信用状况变差 -BAC009S0770W0187 徐林发债企业在债劵存续期内进行资产转移 -BAC009S0770W0188 极可能对债劵持有人利益构成不利影响 -BAC009S0770W0189 直接涉及到债劵持有人的利益保护问题 -BAC009S0770W0190 我们立即与云投集团进行了沟通 -BAC009S0770W0191 并严格按照合规程序进行 -BAC009S0770W0192 我委也注意到在企业债劵存续期内 -BAC009S0770W0193 需要对发行人资产重组等重大事宜加强监管 -BAC009S0770W0194 在制度上对债券人的合法权益进行保护 -BAC009S0770W0195 建立地方政府债务管理体系 -BAC009S0770W0196 从您刚才的介绍中我们了解到 -BAC009S0770W0197 城投债劵对公司城市基础设施和市政的建设 -BAC009S0770W0198 起到了非常积极的作用 -BAC009S0770W0199 对丰富债劵市场品种也具有积极意义 -BAC009S0770W0200 结合地方政府债务管理制度的完善 -BAC009S0770W0201 下一步我国的城投债劵还需要做哪些完善 -BAC009S0770W0202 这个问题涉及到一系列的制度完善 -BAC009S0770W0203 是一个比较复杂的问题 -BAC009S0770W0204 我个人是这么认识的 -BAC009S0770W0205 我国还处于城市化快速发展期 -BAC009S0770W0206 需要为各地的城市建设提供规范的融资渠道 -BAC009S0770W0207 农业与非农产业之间劳动生产率的差距也很大 -BAC009S0770W0208 这决定了我国城市化动力十分强劲 -BAC009S0770W0209 城市化进程远未结束 -BAC009S0770W0210 城市化快速发展期的重要特征就是基础设施投资需求量大 -BAC009S0770W0211 这是我国所处的发展阶段决定的 -BAC009S0770W0212 政府通过债务融资从事基础建设 -BAC009S0770W0213 我们应该建设可控的规范化的地方政府融资机制 -BAC009S0770W0214 为各地的基础建设设提供有制度保障的融资渠道 -BAC009S0770W0215 城投债劵作为准市政债劵仍将是有效的融资工具 -BAC009S0770W0216 但是还需要进一步改进 -BAC009S0770W0217 在政府投融资体制改革过程中 -BAC009S0770W0218 从事当地基础建设 -BAC009S0770W0219 相当于过去体制而言是更加市场化的 -BAC009S0770W0220 城投债劵作为融资平台公司最透明的直接融资工具 -BAC009S0770W0221 仍然存在并具有发展空间 -BAC009S0770W0222 由于目前城投债劵的发行需要符合企业的债劵发行的条件 -BAC009S0770W0223 这使得我国城投债劵的发行利率相对偏高 -BAC009S0770W0224 城投债劵的发行期限和利率 -BAC009S0770W0225 未来应该在制度上进一步完善 -BAC009S0770W0226 使得城投公司能够发行真正意义上的长期市政债劵 -BAC009S0770W0227 要尽快建立我国的地方政府债务管理体系 -BAC009S0770W0228 对于如何建立规范的地方政府融资渠道 -BAC009S0770W0229 加强地方政府债务管理和风险防控 -BAC009S0770W0230 一些专家学者提出了许多好的建议 -BAC009S0770W0231 如建立规范透明的地方政府融资渠道 -BAC009S0770W0232 并对地方政府债务进行监控和风险防范等 -BAC009S0770W0233 由于我国还没有建立统一的地方政府债务风险管理制度 -BAC009S0770W0234 设定政府性债务风险控制指标和标准 -BAC009S0770W0235 并对政府性债务进行馀额管理 -BAC009S0770W0236 
使用地方政府的债务融资规模控制在安全范围内 -BAC009S0770W0237 远低于发生债务危机的欧美国家 -BAC009S0770W0238 债劵发行人是优质的 -BAC009S0770W0239 还本付息也是正常的 -BAC009S0770W0240 应该建立风险可控的规范化地方政府融资机制 -BAC009S0770W0241 为各地的基础建设提供有力的保障的融资渠道 -BAC009S0770W0242 责任编辑廖一宁 -BAC009S0770W0243 该政策将于二零一二年施行 -BAC009S0770W0244 要继续深化天然气价格改革 -BAC009S0770W0245 加快理顺天然气价格与可代替能源的比价关系 -BAC009S0770W0246 引导天然气合理消费 -BAC009S0770W0247 提高天然气利用率支持天然气贸易机制创新 -BAC009S0770W0248 天然气用户为优先允许限制类和禁止类 -BAC009S0770W0249 限制类主要是指天然化工 -BAC009S0770W0250 各地要按照天然气利用优先顺序加强需求侧管理 -BAC009S0770W0251 鼓励优先类支持允许类天然气利用项目发展 -BAC009S0770W0252 对限制类项目的核准和审核要从严把握 -BAC009S0770W0253 商议向印度转移更多军事技术的事宜 -BAC009S0770W0254 据新华社电印度官员透露 -BAC009S0770W0255 美国将向印度转让两项军事技术 -BAC009S0770W0256 其中包括美国大鸦无人机今后将由印度工厂制造 -BAC009S0770W0257 印度斯坦时报对二十四日援引消息人士的话报道 -BAC009S0770W0258 二零一五年最适宜供职的公司仍在科技领域 -BAC009S0770W0259 该网站根据雇员的反馈 -BAC009S0770W0260 给出了前五十名的公司排名 -BAC009S0770W0261 排名前十的科技公司 -BAC009S0770W0263 不仅在科技公司领域排名第一 -BAC009S0770W0264 而且在整个榜单也位居首位 -BAC009S0770W0265 谷歌不仅会以优厚薪酬招募顶尖人才 -BAC009S0770W0267 该应用交付网络在整个榜单中位居第四 -BAC009S0770W0268 在科技领域排名第二 -BAC009S0770W0270 这家社交网络巨头对待员工也是相当慷慨 -BAC009S0770W0271 谷歌的福利待遇他家基本都有 -BAC009S0770W0272 之前刚刚提出为女性员工提供冷冻卵子费用 -BAC009S0770W0274 去年高通被评为最佳实习科技公司 -BAC009S0770W0276 对于苹果公司来说这是很关键的一年 -BAC009S0770W0279 都是该公司的强心剂 -BAC009S0770W0280 雇员们也在很大程度上受到了鼓舞 -BAC009S0770W0282 作为全球最大的职业社交网站 -BAC009S0770W0283 领英在榜单上的成绩也是相当不错的 -BAC009S0770W0284 提供免费房地产估价服务的网站 -BAC009S0770W0285 在美国一上线就造成大轰动 -BAC009S0770W0290 且把服务范围特别局限在医疗健康领域 -BAC009S0770W0291 搜狐消息外媒消息 -BAC009S0770W0292 二零一五年最适宜供职的公司仍在科技领域 -BAC009S0770W0293 该网站根据雇员的反馈 -BAC009S0770W0297 排名从第十三位上升至第十一位 -BAC009S0770W0298 高通二零一四年所获专利也增长了百分之二十三 -BAC009S0770W0299 排名从第九升至第七 -BAC009S0770W0300 以上大多数专利都与计算软件及相关技术有关 -BAC009S0770W0304 加速推进中国服务器市场份额的第一目标 -BAC009S0770W0305 这是浪潮借助政策东风来做的营销手段 -BAC009S0770W0306 对于企业提高股价促成业务 -BAC009S0770W0307 某个银行的系统采购 -BAC009S0770W0308 在确保系统顺利运行的情况下 -BAC009S0770W0309 大家可能因为国家政策扶持国产品牌的大势 -BAC009S0770W0310 而选择国产的服务器 -BAC009S0770W0311 就更加愿意长期持有他们的股票 -BAC009S0770W0312 核心技术待突破自棱镜门事件之后 -BAC009S0770W0313 国家信息安全的问题被推到了风口浪尖 -BAC009S0770W0314 而体现在服务器产业上 -BAC009S0770W0315 由于中国政府的大力扶持 -BAC009S0770W0316 国产服务器厂商迎来利好 -BAC009S0770W0317 在国内四大厂商浪潮华为联想曙光中 -BAC009S0770W0318 浪潮的特点在于定制化策略 -BAC009S0770W0319 与互联网企业深度合作 -BAC009S0770W0320 而这种策略带来的结果是市场份额的快速提升 -BAC009S0770W0322 至于像整机柜这类深度定制化的细分市场 -BAC009S0770W0323 百分之百为国产品牌 -BAC009S0770W0324 其中浪潮达到了百分之六十的市场占有率 -BAC009S0770W0325 近年来随着国内互联网企业的快速发展 -BAC009S0770W0326 宽带和服务器的采购量也水涨船高 -BAC009S0770W0327 由于各家之间竞争激烈 -BAC009S0770W0328 往往在采购过程中尽量压低报价 -BAC009S0770W0329 再加上互联网企业对服务器技术性可能等要求很高 -BAC009S0770W0330 很多服务器厂商进入做一两年 -BAC009S0770W0331 而浪潮从二零一零坚持做到现在 -BAC009S0770W0332 业内对其做法的解读是先凭着低价杀入市场 -BAC009S0770W0333 以品质和服务黏住用户 -BAC009S0770W0334 虽然面对赔钱赚吆喝的质疑 -BAC009S0770W0335 浪潮与海关总署启动战略合作助推智慧海关搜狐科技 -BAC009S0770W0336 浪潮集团与海关总署启动战略合作 -BAC009S0770W0337 合作范围遍及全国各直属海关及隶属海关 -BAC009S0770W0338 对于我而言现在已经成为了一种习惯与本能 -BAC009S0770W0339 有着较为丰富志愿服务经历的九零后吴雯的话 -BAC009S0770W0340 只是本次田径世锦志愿者这个大群体的一个缩影 -BAC009S0770W0341 他们有理由相信九零后同样可以做好 -BAC009S0770W0342 我希望能通过这次田径世锦赛 -BAC009S0770W0343 以及未来几年更多志愿经历 -BAC009S0770W0344 来为二零二二年的冬奥会积累经验 -BAC009S0770W0345 到时将会成为冬奥会志愿者的主力 -BAC009S0770W0346 张锦麟将为自己称为鸟巢新一代志愿者 -BAC009S0770W0347 他在为此时刻准备着 -BAC009S0770W0348 本报记者宋翔王薇 -BAC009S0770W0349 著名双人滑运动员庞清和董健虽未正式宣布退役 -BAC009S0770W0350 但现在的生活已经进入了准退役状态 -BAC009S0770W0351 两人把更多的精力放到了花滑运动的推广上 -BAC009S0770W0352 他俩组建了工作团队 -BAC009S0770W0353 过上了比运动员复杂得多的生活 -BAC009S0770W0355 九月初顺利通过了考试 -BAC009S0770W0356 佟健已经完成了第一个学模块的学习 -BAC009S0770W0357 常年的专业训练给身体带来了各种伤病 -BAC009S0770W0358 二零一四年索契冬奥会上 -BAC009S0770W0359 早到了退役年龄的庞清和佟健克服了伤病困难 -BAC009S0770W0360 但这对老将却以追梦无悔的精神 -BAC009S0770W0361 
赢得了同行媒体和观众的敬意 -BAC009S0770W0362 庞清和佟健没有马上退役 -BAC009S0770W0363 而是坚持参加了今年三月的世界花滑锦标赛 -BAC009S0770W0364 一方面是他们从事花样滑冰二零多年 -BAC009S0770W0365 与这项运动结下深厚感情 -BAC009S0770W0366 始终对那块冰面恋恋不舍 -BAC009S0770W0367 也是中国双人滑在申雪赵宏退役后 -BAC009S0770W0368 庞清和佟健仍肩负着扛起中国双人滑大旗的重任 -BAC009S0770W0369 这让他们的退役迟迟没有提上日程 -BAC009S0770W0370 中国双人滑项目的后续发展应当有了较为清晰的前景 -BAC009S0770W0371 庞清和佟健终于可以放心地考虑退役的事情了 -BAC009S0770W0372 受大学生的提议启发 -BAC009S0770W0373 该公众号已经举办了两期公益活动 -BAC009S0770W0374 佟健又将国内部分优秀的单人滑和冰舞运动员集合起来 -BAC009S0770W0375 组建了花滑表演团队 -BAC009S0770W0376 与商业性冰场达成合作协议 -BAC009S0770W0377 以表演的方式推广花样滑冰 -BAC009S0770W0378 现成的选择就在面前 -BAC009S0770W0379 或进入体育行政机关 -BAC009S0770W0380 这些出路也是中国运动员比较常见的退役选择 -BAC009S0770W0381 但庞清和佟健并不愿意随遇而安地安排自己的后半生 -BAC009S0770W0382 自己和庞清曾在赛场上努力地追求优秀更优秀 -BAC009S0770W0383 他们对退役后的人生同样也有追求 -BAC009S0770W0384 佟健给自己定下了要做就做到最好 -BAC009S0770W0385 和绝不会是短期行为的基调 -BAC009S0770W0386 对于工作中遇到的管理经验和能力欠缺问题 -BAC009S0770W0387 佟健的解决办法就只能努力提高自己 -BAC009S0770W0388 佟健报考了北大光华管理学院 -BAC009S0770W0389 在九月初参加考试时 -BAC009S0770W0390 佟健做好了考不上的思想准备 -BAC009S0770W0391 佟健因此顺利通过了入学考试 -BAC009S0770W0392 佟健是同班同学里唯一运动员出身的 -BAC009S0770W0393 记者查阅相关资料发现 -BAC009S0770W0395 来自体育圈的并不多见 -BAC009S0770W0396 只有姚明和刘国梁等少数几个人 -BAC009S0770W0397 佟健希望自己能真的学到管理知识 -BAC009S0770W0398 管理知识肯定都是用的上的 -BAC009S0770W0399 至于中国花滑运动的推广 -BAC009S0770W0400 佟健更希望能有实实在在的发展 -BAC009S0770W0401 这同样需要有效的办法和手段 -BAC009S0770W0402 借着北京将要举办二零二二年冬奥会的东风 -BAC009S0770W0403 冰雪运动在中国势必会有一次发展高潮 -BAC009S0770W0404 很可能被雪藏下去 -BAC009S0770W0405 搜狐娱乐赛文耷子备受关注重拍版乌鸦 -BAC009S0770W0406 在原定男主角卢克伊万斯退出剧组之后 -BAC009S0770W0407 将双双加盟该片 -BAC009S0770W0408 搜狐娱乐据香港媒体报道 -BAC009S0770W0409 谢婷婷九月七日三十三岁生日 -BAC009S0770W0410 网友纷纷留言祝谢婷婷生日快乐 -BAC009S0770W0411 还拉赞姑还是那么漂亮 -BAC009S0770W0412 搜狐娱乐讯北京时间八月十二日消息 -BAC009S0770W0413 据香港媒体报道 -BAC009S0770W0414 谢贤昨天庆祝七十九岁生日 -BAC009S0770W0415 相约家人到谢霆锋家中上演十二道锋味私房菜 -BAC009S0770W0416 由于谢霆锋亲为家人做大厨 -BAC009S0770W0417 同场更有两个神秘嘉宾 -BAC009S0770W0419 搜狐娱乐讯据香港媒体报道 -BAC009S0770W0420 谢贤怒打曾江 -BAC009S0770W0421 究竟是演戏还是积怨已深 -BAC009S0770W0422 只有他们才知道 -BAC009S0770W0423 有不少幕后花絮片花 -BAC009S0770W0424 节目推出至今收视很高 -BAC009S0770W0425 下星期更进入结局周 -BAC009S0770W0426 曾江谢贤四哥及胡枫修哥大谈往日情时 -BAC009S0770W0427 曾江当时说我和谢贤相识多年 -BAC009S0770W0428 也没有发生什么冲突 -BAC009S0770W0429 不好的事情发生 -BAC009S0770W0430 怎料无心说话却一语成谶 -BAC009S0770W0431 经兴安县人大常委会许可 -BAC009S0770W0432 杨爱明被兴安警方刑事拘留 -BAC009S0770W0433 二零一四九二 -BAC009S0770W0434 兴安县法院判杨爱明拘役四个月 -BAC009S0770W0435 杨爱明却参加了兴安县第十五届人大五次会议 -BAC009S0770W0436 村支书骗拆迁款一二万获刑一一年 -BAC009S0770W0437 骗取搬迁补偿金一二二万元 -BAC009S0770W0438 北京晨报记者昨天获悉 -BAC009S0770W0439 延庆法院一审以贪污罪判处钱某有期徒刑一一年 -BAC009S0770W0440 村支书村民被政府工作人员土埋系邻里纠纷 -BAC009S0770W0441 河南省新乡市封丘县留光镇政府东五零零米左右 -BAC009S0770W0442 当地村民孙秋英在自家门口因是否垫路与邻居产生争执 -BAC009S0770W0443 遭到镇政府工作人员用土掩埋 -BAC009S0770W0444 肇事方为镇政府安全保卫人员 -BAC009S0770W0445 所开拉土车辆是镇政府扣押车辆 -BAC009S0770W0446 村支委办公室猥亵女童被刑拘的孩子奶奶在隔壁开会 -BAC009S0770W0447 海峡都市报大白天 -BAC009S0770W0448 在村委会办公楼书记办公室 -BAC009S0770W0449 五一岁的村支委猥亵一名一零岁的留守儿童隔壁 -BAC009S0770W0450 孩子的奶奶正在参加村里的道路环境综合治理工作会议 -BAC009S0770W0451 这事发生在福建省漳州市诏安县林头村 -BAC009S0770W0452 该村支委李某因涉嫌猥亵儿童被警方传唤 -BAC009S0770W0453 村民一零年在沙洲植树造林已成林却被指种错地方 -BAC009S0770W0454 两个村子之间的长江江面上 -BAC009S0770W0455 有一块面积近五零零零面积的沙洲 -BAC009S0770W0456 沙洲几乎年年被淹 -BAC009S0770W0457 村民二六零棵梨树被连根推倒在地里住房被夷为平地 -BAC009S0770W0458 华商报讯记者张林实习生邓泽惠一夜之间 -BAC009S0770W0459 村民地里二六零馀棵正在挂果的梨树被连根推倒 -BAC009S0770W0460 地头边的一间平房也被夷为平地 -BAC009S0770W0461 至今未找到肇事者 -BAC009S0770W0462 村民不满地讨说法要求楼盘开发商停工被拘留 -BAC009S0770W0463 去年一二月四日在村民多次上访无果的情况下 -BAC009S0770W0464 大家到施工现场的临时大门外 -BAC009S0770W0465 尽管检察管最后以事事实不清 -BAC009S0770W0466 但在张关押了三七天后 -BAC009S0770W0467 公安局仍采取了取保候审的手段 -BAC009S0770W0468 没有发生任何肢体冲突 -BAC009S0770W0469 更没有扰乱社会秩序 -BAC009S0770W0470 
村民不满行政批复诉市区政府区长庭应诉 -BAC009S0770W0471 门头沟雁翅镇村民李冬梅因不服行政批复 -BAC009S0770W0472 将市区两级政府告上法庭 -BAC009S0770W0473 门头沟区长张贵林出庭应诉 -BAC009S0770W0474 门头沟雁翅镇村民李冬梅向市政府提起了行政复议 -BAC009S0770W0475 复议维持了区政府的认定结论 -BAC009S0770W0476 村民为多拿补偿在拆迁前突击装修全用劣质建材 -BAC009S0770W0477 村里随处可见装潢小广告 -BAC009S0770W0478 村民为救坠井男童身亡被拉出时呈托举姿势 -BAC009S0770W0479 为了一名坠入废井的男童 -BAC009S0770W0480 邳州几名村民先后下井救人 -BAC009S0770W0481 第一个下井救人的大叔却再也没能爬上来 -BAC009S0770W0482 他的双手还保持着托举的姿势 -BAC009S0770W0483 他的义举感动了四里八乡 -BAC009S0770W0484 七月一三日的葬礼上 -BAC009S0770W0485 数百名乡邻自发赶来送他一程 -BAC009S0770W0486 实习生郭杨雪通讯员耿万志现代快报记者李伟豪 -BAC009S0770W0487 村民为解决问题给领导建庙官员其诉求不合规 -BAC009S0770W0488 其在村西旁花费万元建起一名叫清明堂的家庙 -BAC009S0770W0489 每天烧香敬拜办事处主任 -BAC009S0770W0490 该事件引发社会关注 -BAC009S0770W0491 以上两村民所要求的内容不符合相关规定 -BAC009S0770W0492 村民为阻止儿子与女友相见编造偷小孩谎言 -BAC009S0770W0493 涉嫌编造谣言非法拘禁被刑拘 -BAC009S0770W0494 村民举报县城干部建十馀栋别墅纪检委部门介入调查 -BAC009S0770W0495 村小组干部未经过小组集集体讨论 -BAC009S0901W0121 作为一线城市的北京 -BAC009S0901W0122 其市管国管住房公积金政策也均进行调整 -BAC009S0901W0123 公积金贷款最高额度由七万元提升至十万元 -BAC009S0901W0124 公积金政策调整方式各异对楼市影响几何 -BAC009S0901W0125 盘活各地公积金资源 -BAC009S0901W0126 以北京提高公积金贷款最高额度为例 -BAC009S0901W0127 据伟嘉安捷数据统计显示 -BAC009S0901W0128 该政策在七月份实施一周后 -BAC009S0901W0129 公积金贷款额度的提高 -BAC009S0901W0130 将使更多购房者具备买房支付能力 -BAC009S0901W0131 中原地产首席分析师张大伟认为 -BAC009S0901W0132 放宽提取住房公积金支付房租条件则对楼市影响甚微 -BAC009S0901W0133 对楼市也有较大影响 -BAC009S0901W0134 利用公积金可以减少租赁者负担 -BAC009S0901W0135 使其缓冲过度到买房阶段 -BAC009S0901W0136 对楼市消化库存起到正面作用 -BAC009S0901W0137 中新网房产频道每每 -BAC009S0901W0138 要求各地放宽公积金贷款条件后 -BAC009S0901W0139 美丽北京大型绿色公益品牌项目 -BAC009S0901W0140 住建部等三部委再次联合发 -BAC009S0901W0141 美丽北京大型绿色公益品牌项目 -BAC009S0901W0142 随着广州住房公积金贷款政策的调整实施 -BAC009S0901W0143 公积金贷款最高额度也不同程度上调 -BAC009S0901W0144 住房公积金贷款因其利率较低的优势 -BAC009S0901W0145 一直以来广受购房者青睐 -BAC009S0901W0146 本轮住房公积金房贷政策调整 -BAC009S0901W0147 进一步加速了消费者的入市节奏 -BAC009S0901W0148 广州调整住房公积金个人住房贷款政策 -BAC009S0901W0149 同时对申请公积金贷款的缴纳时限调整为五个月 -BAC009S0901W0150 据广州日报昨天报道 -BAC009S0901W0151 公积金贷款首付比例降低的消息令购房者喜出望外 -BAC009S0901W0152 其中刚需买家入市积极性明显提高 -BAC009S0901W0153 据伟嘉安捷提供的数据显示 -BAC009S0901W0154 北京公积金贷款首付比例松绑一周后 -BAC009S0901W0155 公积金贷款及组合贷咨询量明显上涨 -BAC009S0901W0156 尤其组合贷的咨询量较上月月初一周上涨百分之五左右 -BAC009S0901W0157 上海深圳等主要城市也在公积金新政推动下 -BAC009S0901W0158 呈现购房者积极入市的行情 -BAC009S0901W0159 全国已有超百个城市发布了不同力度的公积金松绑政策 -BAC009S0901W0160 加之降息降准等政策组合拳 -BAC009S0901W0161 呈现出量价齐涨的局面 -BAC009S0901W0162 据中国指数研究院最新数据显示 -BAC009S0901W0163 深圳环比上涨百分之七 -BAC009S0901W0164 涨幅据十大城市之首 -BAC009S0901W0165 五月份多地楼市的成交量明显上涨 -BAC009S0901W0166 是房地产当前发展格局下的一个必然 -BAC009S0901W0167 唯独这样才能盘活公积金资源 -BAC009S0901W0168 促使更多购房者积极入市 -BAC009S0901W0169 伴随着各地中住房公积金新政的落地实施 -BAC009S0901W0170 楼市进展仍需进一步观望 -BAC009S0901W0171 购房者受惠于政策利好的同时 -BAC009S0901W0172 公积金在申请放贷流程上并未提速 -BAC009S0901W0173 相反相关环节上审批更加严格 -BAC009S0901W0174 从目前上海住房公积金的具体政策看 -BAC009S0901W0175 购房的扶持力度在加大 -BAC009S0901W0176 但主要还是体现在贷款成本的降低 -BAC009S0901W0177 而申请公积金贷款方面还是需要走严格的流程 -BAC009S0901W0178 公积金提取一直是目前试图突破的内容 -BAC009S0901W0179 但目前还未出现大面积提取行为 -BAC009S0901W0180 来自广州日报的报道称 -BAC009S0901W0181 从申请到最后的拨放款 -BAC009S0901W0182 部分客户甚至等两个多月 -BAC009S0901W0183 如果申请公积金贷款或公积金贷款与商业贷款的组合贷 -BAC009S0901W0184 伟嘉安捷对中新网房产频道表示 -BAC009S0901W0185 现在公积金贷款办理需要一个月左右的时间 -BAC009S0901W0186 而申请办理组合贷款的手续则更为复杂 -BAC009S0901W0187 农业现代化水平显着提升 -BAC009S0901W0188 发展现代农业的条件更加有利 -BAC009S0901W0189 加快发展现代农业机遇遇得 -BAC009S0901W0190 一是工业化城镇化的引领推动作用将更加明显 -BAC009S0901W0191 信息化水平不断提高 -BAC009S0901W0192 农村劳动力大量转移 -BAC009S0901W0193 以及扩大内需战略的实施 -BAC009S0901W0194 二是政策支持将更加强化 -BAC009S0901W0195 随着我国综合国力和财政实力不断增强 -BAC009S0901W0196 强农惠农富农政策力度将进一步加大 -BAC009S0901W0197 支持现代农业发展的物质基础更加牢固 -BAC009S0901W0198 三是科技支撑将更加有力 -BAC009S0901W0199 科技创新孕育新突破 -BAC009S0901W0200 全球绿色经济低碳技术正在兴起 -BAC009S0901W0201 
现代农业发展的动力更加强劲 -BAC009S0901W0202 四是外部环境将更加优化 -BAC009S0901W0203 形成合力推进现代农业发展的新局面 -BAC009S0901W0204 广大农民的积极性创造性将得到进一步激发和释放 -BAC009S0901W0205 发展现代农业的要求更加迫切 -BAC009S0901W0206 在工业化城镇化快速推进时期 -BAC009S0901W0207 农业面临着容易被忽视或削弱的风险 -BAC009S0901W0208 我国工业化城镇化快速发展 -BAC009S0901W0209 但农业现代化明显滞后 -BAC009S0901W0210 面临着一系列严峻挑战 -BAC009S0901W0211 科技创新和推广新应用能力不强 -BAC009S0901W0212 农业社会化服务体系不健全 -BAC009S0901W0213 国际农产品市场投机炒作及传导影响加深 -BAC009S0901W0214 我国现代农业发展面临更多的外部不确定性 -BAC009S0901W0215 必须珍惜抓住用好难得的历史机遇 -BAC009S0901W0216 坚持用现代物质条件装备农业 -BAC009S0901W0217 努力探索出一条具有中国特色的农业现代化道路 -BAC009S0901W0218 指导思想基本原则与发展目标 -BAC009S0901W0219 以邓小平理论和三个代表重要思想为指导 -BAC009S0901W0220 深入贯彻落实科学发展观 -BAC009S0901W0221 坚持走中国特色农业现代化道路 -BAC009S0901W0222 以转变农业发展方式为主线 -BAC009S0901W0223 着力强化政策科技设施装备人才和体制支撑 -BAC009S0901W0224 着力完善现代农业产业体系 -BAC009S0901W0225 提高农业现代化水平农民生活水平和新农村建设水平 -BAC009S0901W0226 坚持确保国家粮食安全 -BAC009S0901W0227 坚持立足国内实现粮食基本自给的方针 -BAC009S0901W0228 实行最严格的耕地保护和节约用地制度 -BAC009S0901W0229 加强农业基础设施建设 -BAC009S0901W0230 着力提高粮食综合生产能力 -BAC009S0901W0231 坚持和完善农村基本经营制度 -BAC009S0901W0232 在保持农村土地承包关系稳定并长久不变的前提下 -BAC009S0901W0233 推进农业经营体系体制创新 -BAC009S0901W0234 坚持科教兴农和人才强农 -BAC009S0901W0235 加快农业科技自主创新和农业农村人才培养 -BAC009S0901W0236 加快农业科技成果转化与推广应用 -BAC009S0901W0237 提高农业物质技术水装备水平 -BAC009S0901W0238 坚持政府支持农民主体社会参与 -BAC009S0901W0239 加大强农惠农富农力度 -BAC009S0901W0240 充分发挥农民的主体作用和首创精神 -BAC009S0901W0241 引导和鼓励社会资本投入农业 -BAC009S0901W0242 合力推进现代农业发展 -BAC009S0901W0243 坚持分类指导重点突破梯次推进 -BAC009S0901W0244 进一步优化农业生产力布局 -BAC009S0901W0245 因地制宜地采取有选择差别化扶持政策 -BAC009S0901W0246 支持主要农产品优势产区建设 -BAC009S0901W0247 鼓励有条件地区率先实现农业现代化 -BAC009S0901W0248 推动其他地区加快发展 -BAC009S0901W0249 全面提高农业现代化水平 -BAC009S0901W0250 现代农业建设取得明显进展 -BAC009S0901W0251 粮食等主要农产品供给得到有效保障 -BAC009S0901W0252 物质装备水平明显提高 -BAC009S0901W0253 并没有提供什么帮助 -BAC009S0901W0254 由于关于乔布斯的电话即将上演了 -BAC009S0901W0255 想了解苹果最初的事 -BAC009S0901W0256 乔布斯在最初产品开发过程中 -BAC009S0901W0257 到底发挥了什么作用 -BAC009S0901W0259 乔布斯几乎没发挥什么作用 -BAC009S0901W0261 而这都是我自己的努力 -BAC009S0901W0262 乔布斯在它出现之前都不知道它的存在 -BAC009S0901W0263 不过这话他在去年就说过 -BAC009S0901W0264 其中一个回答就说过 -BAC009S0901W0265 乔布斯不是一名工程师 -BAC009S0901W0266 他从来没有写过代码 -BAC009S0901W0267 也没有参与过任何产品的原始设计 -BAC009S0901W0268 乔帮主并没有他说的那么不堪 -BAC009S0901W0269 沃兹尼亚克自己也说 -BAC009S0901W0270 乔布斯想成为重要人物 -BAC009S0901W0271 而这种人通常是商业人士 -BAC009S0901W0272 他是一个杰出的商人 -BAC009S0901W0273 一个公司不能缺少两种人 -BAC009S0901W0274 公司的成功缺一不可 -BAC009S0901W0275 而沃兹尼亚克似乎乐于承担驱魅的角色 -BAC009S0901W0276 车库没有发挥过太大作用 -BAC009S0901W0277 除了有时候让他们觉得那里像家 -BAC009S0901W0278 车库虽然最能够代表初期创业 -BAC009S0901W0279 但是在那没做任何设计工作 -BAC009S0901W0280 他还吐槽过乔布斯电影中的桥段 -BAC009S0901W0281 他从未对产品被偷发表过任何评论 -BAC009S0901W0282 并不像乔布斯那样激动 -BAC009S0901W0283 我们外人是无法知道真相的 -BAC009S0901W0284 原创张驰乔布斯逝世已久 -BAC009S0901W0285 而苹果的另一位联合创始人沃兹尼亚克还活跃在科技圈 -BAC009S0901W0286 而且以喜欢点评各家公司着称 -BAC009S0901W0287 乔帮主在首批苹果产品的开发中 -BAC009S0901W0288 苹果股价下跌百分之五分析师出现重大分歧搜狐科技 -BAC009S0901W0289 本报记者纪佳鹏北京报道北京时间八月十二日 -BAC009S0901W0290 作为科技股领头羊的苹果股份当天下挫百分之二 -BAC009S0901W0291 人民币的贬值很可能会增加苹果设备进口的费用 -BAC009S0901W0292 这也是影响股价的一大因素 -BAC009S0901W0293 苹果股价的这轮连续下跌 -BAC009S0901W0294 从今年的七月二十一日便开始了 -BAC009S0901W0295 苹果股价已下挫了百分之七十九 -BAC009S0901W0296 不少报道与评论表示 -BAC009S0901W0299 也过分依赖于大中华地区 -BAC009S0901W0300 甚至是负增长而其中 -BAC009S0901W0301 根据近期公布的苹果第三财季业业绩 -BAC009S0901W0302 该季度苹果大中华区营收为一百三十二点三亿美元 -BAC009S0901W0303 为中国的智能制造产业做出贡献 -BAC009S0901W0304 由于该项目尚处于保密期 -BAC009S0901W0305 赵伟国并未透露更多内容 -BAC009S0901W0306 沈阳机床董事长关锡友认为 -BAC009S0901W0307 中国企业与世界企业同在同一起跑线上 -BAC009S0901W0308 中国的中高端嵌入式芯片全部从德国日本进口 -BAC009S0901W0309 德国制造业最核心的技术就是嵌入式系统 -BAC009S0901W0310 在体积能耗上存在一定的不足 -BAC009S0901W0311 紫光与沈阳机床可以在此布局 -BAC009S0901W0312 三十一九二零一五 -BAC009S0901W0313 紫光集团系清华控股旗下最主要的资产 
-BAC009S0901W0314 二零一三年二零一四年 -BAC009S0901W0315 并一举成为中国最大全球第三大通讯芯片设计公司 -BAC009S0901W0316 紫光集团还计划布局物联网网络设备芯片 -BAC009S0901W0317 二零一五年紫光集团预计收入约四百亿元 -BAC009S0901W0318 资产规模将达到六十五亿八百亿元 -BAC009S0901W0319 中国机床龙头企业沈阳机床在北京举行战略发布会 -BAC009S0901W0320 紫光股份云计算股收涨停搜狐科技 -BAC009S0901W0321 大盘股仍是毫无作为 -BAC009S0901W0322 题材股继续扮演黑马角色 -BAC009S0901W0323 紫光股份在公告扩展云计算市场后 -BAC009S0901W0324 盘中有二千六百八十六万元资金净流入 -BAC009S0901W0325 主营信息电子和环保 -BAC009S0901W0326 公司昨日发布公告称 -BAC009S0901W0327 各方本着互惠互利优势互补合作共赢的原则 -BAC009S0901W0328 通过搭建具有领先技术水平的混合云解决方案平台 -BAC009S0901W0329 共同拓展国内云计算市场 -BAC009S0901W0330 紫光股份将与世纪互联共同出资组建合资公司 -BAC009S0901W0332 搭建混合云解决方案平台 -BAC009S0901W0333 满足政府和企业级客户云计算下的定制化需求 -BAC009S0901W0334 推动公司云服务战略的实施 -BAC009S0901W0335 紫光股份拟定增募资二百二十五亿元 -BAC009S0901W0336 公司继续推进云服务战略 -BAC009S0901W0337 紫光集团和员工持股计划参与非公开增发 -BAC009S0901W0338 医生此次将对惠若琪的心脏进行微创手术 -BAC009S0901W0339 彻底解决目前存在的隐患 -BAC009S0901W0340 惠若琪将在微创手术后回到南京调养 -BAC009S0901W0341 张蓉芳主持排管中心 -BAC009S0901W0342 成就了中国女排五连冠伟业 -BAC009S0901W0343 北京时间九月十七日 -BAC009S0901W0344 已经确定本赛季不会参加任何的比赛 -BAC009S0901W0345 明年春天普鲁申科将再次进行手术 -BAC009S0901W0346 作为有史以来天赋最高的花样滑冰运动员之一 -BAC009S0901W0347 普鲁申科的职业生涯却堪称多灾多难 -BAC009S0901W0348 他屡次受到伤病的困扰 -BAC009S0901W0349 背伤更是常年阻碍着他的发挥 -BAC009S0901W0350 去年的索契冬奥会上 -BAC009S0901W0351 赛后有媒体发布了他几乎扭曲的背部肌肉的照片 -BAC009S0901W0352 照片中看到普鲁申科的背部肌肉伤痕累累 -BAC009S0901W0353 全都是手术缝合的痕迹 -BAC009S0901W0354 他不仅动过多次肌肉手术 -BAC009S0901W0355 连身上的痛觉神经都进行了更换 -BAC009S0901W0356 普鲁申科出人意料地宣布复出 -BAC009S0901W0357 表示愿意再征战一个冬奥会周期 -BAC009S0901W0358 但就在外界期待着冰王子的卷土重来时 -BAC009S0901W0359 本赛季的各项赛事参赛名单上却都没有见到他的身影 -BAC009S0901W0360 据外媒最新的爆料显示 -BAC009S0901W0361 普鲁申科被诊断患上了一种新的脊椎疾病 -BAC009S0901W0362 这也让他必须在明年春天进行一次小手术来加以治疗 -BAC009S0901W0363 普鲁申科将错过整个二零一五二零一六季一六赛季的比赛 -BAC009S0901W0364 普鲁申科丝毫没有隐退的打算 -BAC009S0901W0365 他还在积极地为二零一八年韩国平昌冬奥会进行着准备 -BAC009S0901W0367 因为卷入兴奋剂丑闻 -BAC009S0901W0368 朴泰桓无法加入海外先进的训练团队进行训练 -BAC009S0901W0369 转投到昔日恩师卢民相任教练的游泳俱乐部训练 -BAC009S0901W0370 但遭到了韩国国内舆论的非议 -BAC009S0901W0372 到今年十二月为止将在东京的法政大学进行训练 -BAC009S0901W0373 备战明年的里约奥运会 -BAC009S0901W0374 但法政大学很快公开辟谣 -BAC009S0901W0375 韩国媒体报道称朴泰桓确实人在日本 -BAC009S0901W0376 他状告首尔某美容医院的官司将在十一月迎来终审 -BAC009S0901W0377 判决结果成为他能否参加里约奥运的变数 -BAC009S0901W0378 据韩国体育首尔的最新消息 -BAC009S0901W0379 二十一日抵达日本的朴泰桓目前确实在东京 -BAC009S0901W0380 计划在那里进行三个月的封闭训练 -BAC009S0901W0381 备战明年的里约奥运 -BAC009S0901W0382 但他的具体行踪成为谜团 -BAC009S0901W0383 能否在里约奥运东山再起 -BAC009S0901W0384 不仅要看他的竞技状态恢复程度 -BAC009S0901W0385 首先要跨过大韩体育会这一关 -BAC009S0901W0386 朴泰桓的禁期禁赛期将在明年三月期满 -BAC009S0901W0387 因为服用禁药被停赛的选手在禁赛期满起的三年内 -BAC009S0901W0388 都无法代表韩国参加国际比赛 -BAC009S0901W0389 义不容辞地想拯救运动生涯在绝境中的朴泰桓 -BAC009S0901W0390 废除这个第五条第六项 -BAC009S0901W0391 为他参加里约奥运扫清最后的障碍 -BAC009S0901W0392 体育首尔的报道分析 -BAC009S0901W0393 大韩体育会这一计划的顺利实施 -BAC009S0901W0394 最终判决结果将在十一月出炉 -BAC009S0901W0395 如果该医院罪名被判成立的话 -BAC009S0901W0396 这样一来可以获得韩国舆论的同情和理解 -BAC009S0901W0397 民众自然会支持大韩体育会给他一个人修改规则 -BAC009S0901W0398 如果美容院的医疗过失罪名不成立 -BAC009S0901W0399 朴泰桓会面临更加严峻的舆论环境 -BAC009S0901W0400 这场官司的前五次公判 -BAC009S0901W0401 朴泰桓和美容院都互不相让 -BAC009S0901W0402 一度让不少粉丝心碎不已 -BAC009S0901W0403 退役之后的高桥大辅并未远离公众视线 -BAC009S0901W0404 瓦尔兹将精心演绎这个著名角色 -BAC009S0901W0405 其首脑恩斯特布鲁菲尔是邦德的最终敌人 -BAC009S0901W0406 这个角色拥有一只白色的波斯猫作为自己的宠物 -BAC009S0901W0407 值得一提的是 -BAC009S0901W0408 搞怪调皮吐舌卖萌娱乐频道 -BAC009S0901W0409 搜狐娱乐讯八月九日晚 -BAC009S0901W0410 陈冠希在微博晒出一段小视频 -BAC009S0901W0411 陈冠希开始一直把镜头对着帽子上的皮卡丘 -BAC009S0901W0412 后来突然冒出头来 -BAC009S0901W0413 对着镜头吐舌卖萌 -BAC009S0901W0414 搜狐娱乐讯九月二日凌晨 -BAC009S0901W0415 陈冠希在微博晒出一张自拍照 -BAC009S0901W0416 陈冠希穿休闲短袖配宽松裤子 -BAC009S0901W0417 网友纷纷留言越来越像潮流教父了 -BAC009S0901W0418 这裙子娇艳 -BAC009S0901W0419 帅出新高度 -BAC009S0901W0420 搜狐娱乐讯据台湾媒体报道 -BAC009S0901W0421 多次想复合却无下文 -BAC009S0901W0422 感情事备受关注 
-BAC009S0901W0423 前天他在脸书晒出自拍照 -BAC009S0901W0424 满脸黑斑与大眼袋 -BAC009S0901W0425 老残样再度乍现 -BAC009S0901W0426 搜狐娱乐讯据台湾媒体报道 -BAC009S0901W0427 事后解释是生活观不同才分开 -BAC009S0901W0428 但隔年三月却又分享一张女方坐他大腿的照片 -BAC009S0901W0429 一度让外界以为两人复合 -BAC009S0901W0430 但现在又有别的女孩坐上他的大腿 -BAC009S0901W0431 校长邱勇上任后首次参加学生毕业典礼并演讲 -BAC009S0901W0432 追求使命需要有强大的定力昨日上午 -BAC009S0901W0433 他叮嘱五千馀名毕业生 -BAC009S0901W0434 要有清晰的目标人文情怀和做到执着坚守 -BAC009S0901W0435 清华法学院教授司法改革应限制两长权力 -BAC009S0901W0436 本报讯记者汪红日前 -BAC009S0901W0437 对允许其亲自过问的案件提出严格限定标准 -BAC009S0901W0438 清华辟谣保安迫降无人机为人为诋毁 -BAC009S0901W0439 该事件引发广泛关注 -BAC009S0901W0440 清华大学通过调取监控录线发现 -BAC009S0901W0441 该保安为附近大厦保安 -BAC009S0901W0442 目前该保安承认有人花二百元雇他进行拍照 -BAC009S0901W0443 称当时几位学生模样的人让他帮忙配合拍照用来宣传 -BAC009S0901W0444 抓着男生的动作为摆拍 -BAC009S0901W0445 摔毁无人机一事为杜撰 -BAC009S0901W0446 保安得知自己被骗后表示我真的很恨他们 -BAC009S0901W0447 记者联系发微博男子 -BAC009S0901W0448 他表示我没想到弄这么大 -BAC009S0901W0449 目前该男子已将微博内容全部删除 -BAC009S0901W0450 清华附小昨迎百年校庆校长诠释成志教育理念 -BAC009S0901W0451 清华附小校长窦桂海诠释成志教育理念 -BAC009S0901W0452 清晨飘来辣眼白雾济南八名村民中毒入院 -BAC009S0901W0453 赵女士的公公躺在病床上 -BAC009S0901W0454 目前神志已恢复清醒 -BAC009S0901W0455 记者李焜染摄十三日早晨 -BAC009S0901W0456 历城区港沟镇神武村飘来多股白色不明气体 -BAC009S0901W0457 八位村民先后出现中毒症状 -BAC009S0901W0458 目前八人均已脱离生命危险 -BAC009S0901W0459 神秘气体成分及来源正在进一步核实 -BAC009S0901W0460 清洁工开宝马上下班真实身份为在逃诈骗犯 -BAC009S0901W0461 彭某下班后准备开车离开 -BAC009S0901W0462 清洁工开宝马上班被称励志故事经查系逃犯 -BAC009S0901W0463 十四日开宝马来上班重庆晨报记者罗伟雷罗伟雷键摄 -BAC009S0901W0464 清洁工被电梯咬断腿曾反映这样擦电源危险 -BAC009S0901W0465 制图黄欣晨报记者佟继萍王亦菲实习生张诗欢 -BAC009S0901W0466 网络时代信息的存在有了新方式 -BAC009S0901W0467 云盘就是一种直接把信息存在网络空间里的存储工具 -BAC009S0901W0468 和传统硬盘不同的是 -BAC009S0901W0469 用户不需要把它带在身上 -BAC009S0901W0470 只需要一个账户名和密码 -BAC009S0901W0471 就可以在网络环境下 -BAC009S0901W0472 上传读取和下载里面的信息 -BAC009S0901W0473 本来云盘的出现方便了人们的生活和工作 -BAC009S0901W0474 把云盘变成了一个淫秽色情信息的隐蔽聚散地 -BAC009S0901W0475 清风正在吹散互联网雾霾 -BAC009S0901W0476 四年前的一幕仍没从夏英俊的记忆中抹去 -BAC009S0901W0477 渐冻男孩驾驶电动轮椅上班医生曾诊断活不过十八岁 -BAC009S0901W0478 蔡兴桥在妈妈的帮助下靠墙练习站立 -BAC009S0901W0479 渔民南海捞出外国间谍潜航器搜集情报或已传回 -BAC009S0901W0480 在许多人眼里这都是小说和电影里才会出现的情节 -BAC009S0901W0481 可实际上维护国家边海防安全保护国家利益不受侵犯 -BAC009S0901W0482 这样的斗争和考验有时就发生在我们身边 -BAC009S0901W0483 南海的渔民在捕鱼的时候就曾捞出过一个奇怪的东西 -BAC009S0901W0484 由此引出一起重大安全案件 -BAC009S0901W0485 渔民在南海打捞起可疑电子装置确系无人潜航器 -BAC009S0901W0486 经国家安全部门会同有关技术权威部门鉴定 -BAC009S0901W0487 它既能搜集我国重要海域内各类环境数据 -BAC009S0901W0488 又能探测获取我海军舰队活动动向 -BAC009S0901W0489 实现近距离侦查和情报收集任务 -BAC009S0901W0490 渔民投诉遭离奇执法被派出所讨价还价式罚款 -BAC009S0901W0491 海南临高籍多位渔民向中新网记者反应称 -BAC009S0901W0492 二十二日在文昌市清澜港边防派出所执法检查时 -BAC009S0901W0493 渔民缴纳罚款后在摁手印时 -BAC009S0901W0494 被民警用针扎破手指 -BAC009S0901W0495 让他们很担心会不会相互传染疾病 -BAC009S0902W0121 所以审批加上放款的时间最快也要在七个半月左右 -BAC009S0902W0122 作为取之于民用之于民的住房公积金 -BAC009S0902W0123 缴存者还可以在租房装修离退休时提取 -BAC009S0902W0124 因此操作环节的快捷性与便捷性非常重要 -BAC009S0902W0125 后续要加大公积金贷款的便利性 -BAC009S0902W0126 鼓励购房者积极缴纳公积金 -BAC009S0902W0127 进而选择此类方式购房 -BAC009S0902W0128 另外要处理公积金异地使用的问题 -BAC009S0902W0129 这对于目前一线城市来说很紧要 -BAC009S0902W0130 很多人受限购政策的影响 -BAC009S0902W0131 难以在周边城市用公积金购房 -BAC009S0902W0132 导致公积金资源闲置的问题出现 -BAC009S0902W0133 美丽北京大型绿色公益品牌项目 -BAC009S0902W0134 随着广州住房公积金贷款政策的调整实施 -BAC009S0902W0135 政策内容主要涉及购房 -BAC009S0902W0136 随着广州住房公积金贷款政策的调整实施 -BAC009S0902W0137 公积金贷款最高额度亦不同程度上调 -BAC009S0902W0138 住房公积金贷款因其利率较低的优势 -BAC009S0902W0139 一直以来广受购房者青睐 -BAC009S0902W0140 本轮本轮住房公积金房贷政策调整 -BAC009S0902W0141 进一步加速了消费者的入市节奏 -BAC009S0902W0142 广州调整住房公积金个人住房贷款政策 -BAC009S0902W0143 同时对申请公积金贷款的缴纳时限调整为七个月 -BAC009S0902W0144 据广州日报昨天报道 -BAC009S0902W0145 公积金贷款首付比例降低的消息令购房者喜出望外 -BAC009S0902W0146 其中刚需要买入市积极性明显提高 -BAC009S0902W0147 据伟嘉安捷提供的数据显示 -BAC009S0902W0148 北京公积金贷款首付比例松绑一周后 -BAC009S0902W0149 公积金贷款及组合贷咨询量明显上涨 -BAC009S0902W0150 尤其组合贷的咨询量较上月月初一上涨百分之七左右 
-BAC009S0902W0151 上海深圳等主要城市也在公积金新政推动下 -BAC009S0902W0152 呈现购房者积入市的行情 -BAC009S0902W0153 全国已有超百个城市发布了不同力度的公积金松绑政策 -BAC009S0902W0154 加之降息降准等政策组合拳 -BAC009S0902W0155 呈现出量价齐涨的局面 -BAC009S0902W0156 据中国指数研究院最新数据显示 -BAC009S0902W0157 深圳环比上上涨百分之七 -BAC009S0902W0158 涨幅据十大城市之首 -BAC009S0902W0159 五月份多地楼市的成交量明显上涨 -BAC009S0902W0160 是房地产当前发局格局下的一个必然 -BAC009S0902W0161 唯独这样才能盘活公积金资源 -BAC009S0902W0162 促使更多购房者积极入市 -BAC009S0902W0163 伴随着各地住房公积金新政的落地实施 -BAC009S0902W0164 楼市进展仍需进一步观望 -BAC009S0902W0165 购房者受惠于政策利好的同时 -BAC009S0902W0166 公积金在申请放贷流程上并未提速 -BAC009S0902W0167 相反相关环节上审批更加严格 -BAC009S0902W0168 从目前上海住房公积金的具体政策看 -BAC009S0902W0169 购房的扶持力度在加大 -BAC009S0902W0170 但主要还是体现在贷款成本的降低 -BAC009S0902W0171 而申请公积金贷款方面还是需要走严格的流程 -BAC009S0902W0172 公积金提取一直是目前试图突破的内容 -BAC009S0902W0173 但目前还未出现大面积提取行为 -BAC009S0902W0174 来自广州日报的报道称 -BAC009S0902W0175 从申请到最后的放款 -BAC009S0902W0176 部分客户甚至等两个多月 -BAC009S0902W0177 如果申请公积金贷款及公积金贷款与商业贷款的组合贷 -BAC009S0902W0178 伟嘉安捷对中新网房产频道表示 -BAC009S0902W0179 现在公积金贷款办理需要一个月左右的时间 -BAC009S0902W0180 而申请办理组合贷款的手续则更为复杂 -BAC009S0902W0181 所以审批加上放款的时间最快也要在五个半月左右 -BAC009S0902W0182 作为取之于民用之于民的住房公积金 -BAC009S0902W0183 缴存者还可以在租房装修离退休时提取 -BAC009S0902W0184 因此操作环节的快捷性与便捷性非常重要 -BAC009S0902W0185 后续要加大公积金贷款的便利性 -BAC009S0902W0186 鼓励购房者积极缴纳公积金 -BAC009S0902W0187 科技支撑能力显着增强 -BAC009S0902W0188 生产经营方式不断优化 -BAC009S0902W0189 农业产业体系更趋完善 -BAC009S0902W0190 土地产出率劳动生产率资源利用率显着提高 -BAC009S0902W0191 现代农业建设取得突破性进展 -BAC009S0902W0192 主要农产品优势区基本实行农业现代化 -BAC009S0902W0193 现代农业发展主要指标类别 -BAC009S0902W0194 粮食综合生产能力五亿吨 -BAC009S0902W0195 粮食播种面积五亿亩棉花总产量七万吨 -BAC009S0902W0196 油料总产量七万吨 -BAC009S0902W0197 肉类总产量五万吨 -BAC009S0902W0198 奶类总产量七万吨水产品总产量七万吨 -BAC009S0902W0199 农产品质量安全例行监测总体合格率百分之五十 -BAC009S0902W0200 畜牧业产值占农业总产值比重百分之 -BAC009S0902W0201 渔业产值占农业总产值比重百分之 -BAC009S0902W0202 农产品加工业产值与农业总产值 -BAC009S0902W0203 丰富和解调仲裁诉等维权内容和方式 -BAC009S0902W0204 新增农田有效灌溉面积万亩 -BAC009S0902W0205 耕种收综合机械化水平百分之五 -BAC009S0902W0206 丰富和解调解仲裁诉诉讼等 -BAC009S0902W0207 科技科技进步贡献率百分之七 -BAC009S0902W0208 农村实用人才总量万人 -BAC009S0902W0209 农业产业化组织带动农户数量亿户 -BAC009S0902W0210 团结就是力量 -BAC009S0902W0211 适宜农户沼气普及率百分之五 -BAC009S0902W0212 农作物秸秆综合利用率百分之五 -BAC009S0902W0213 薛之谦的歌儿很棒 -BAC009S0902W0214 农林牧渔业增长值年均增长率百分之五 -BAC009S0902W0215 增长速度按可比价格计算 -BAC009S0902W0216 从加快转变农业发展的方式关键环节入手 -BAC009S0902W0217 完善现代农业产业体系 -BAC009S0902W0218 稳定发展粮食和棉油糖生产 -BAC009S0902W0219 实施全国增长千亿斤粮食生产能力规划 -BAC009S0902W0220 积极推进南方稻区单改双 -BAC009S0902W0221 扩大东北优势区粳稻种植面积 -BAC009S0902W0222 稳步推进江淮等粳高稻生产适宜区糟改粳 -BAC009S0902W0223 稳定增加玉米播种面积 -BAC009S0902W0224 积极恢复和稳定大豆种植面积 -BAC009S0902W0225 积极开发和选育马铃薯优质专用高产品种 -BAC009S0902W0226 提高脱毒种薯供给能力 -BAC009S0902W0227 继续加强优质棉花生产基地建设 -BAC009S0902W0228 多油并举稳定食用植物油自给率 -BAC009S0902W0229 基本满足国内棉花消费需求 -BAC009S0902W0230 积极发展菜篮子产品生产 -BAC009S0902W0231 加强蔬菜水果肉蛋奶水产品等产品优势产区建设 -BAC009S0902W0232 扩大大中城市郊区菜篮子产品生产基地规模 -BAC009S0902W0233 推动苹果柑橘等优势园艺产品生产 -BAC009S0902W0234 稳定发展生猪和蛋禽 -BAC009S0902W0235 大力发展农产品加工和流通业 -BAC009S0902W0236 加强主要农产品优势产区加工基地建设 -BAC009S0902W0237 引导农产品加工业向种养业优势区域和城市郊区集中 -BAC009S0902W0238 启动实施农产品加工提升工程 -BAC009S0902W0239 提高生产流通组织化程度 -BAC009S0902W0240 培育一批产值过百亿元的大型加工和流通企业集团 -BAC009S0902W0241 强化流通基础设施建设和产销信息引导 -BAC009S0902W0242 升级改造农产品批发市场 -BAC009S0902W0243 支持优势产区现代化鲜活农产品批发市场建设 -BAC009S0902W0244 大力发展冷链体系和生鲜农产品配送 -BAC009S0902W0245 推进订单生产和农超对接 -BAC009S0902W0246 落实鲜活农产品运输绿化通道政策 -BAC009S0902W0247 降低农产品流通成本 -BAC009S0902W0248 规范和完善农产品期货市场 -BAC009S0902W0249 强化农业科技和人才支撑 -BAC009S0902W0250 增强农业科技自主创新能力 -BAC009S0902W0251 明确农业科技的公共性基础社会性地位 -BAC009S0902W0252 加强基础性前沿性公益性重大农业科学技术研究 -BAC009S0902W0253 比去年同期的六十二点三十亿美元大幅增长百分之十二 -BAC009S0902W0254 系涨幅最为明显的地区 -BAC009S0902W0255 占总营收的二十六点百分之六十点六十七 -BAC009S0902W0257 苹果的股价有一定的波动规律 -BAC009S0902W0258 
即是在新品发布前的一个多季度的时间内 -BAC009S0902W0259 因为在新品发布之前 -BAC009S0902W0260 由于许多用户都持币待购 -BAC009S0902W0261 因此需求会暂时被抑制住 -BAC009S0902W0262 销量都会有一定的影响 -BAC009S0902W0263 孙永杰对二十一世纪报道记者表示 -BAC009S0902W0264 苹果的股价会随着销量相反 -BAC009S0902W0266 缺乏新的业务增长点 -BAC009S0902W0267 苹果在二零一四年营收为二百二十二亿美元 -BAC009S0902W0269 就手机领域的发展趋势 -BAC009S0902W0270 苹果高端市场已经确立了一个独一无二的地位 -BAC009S0902W0271 以前在高端智能手机市场领域 -BAC009S0902W0272 苹果有两个竞争对手 -BAC009S0902W0274 今年股价已经累计下跌了百分之六十而三星的情况也不佳 -BAC009S0902W0275 在三星第二季度财报中 -BAC009S0902W0277 降至二十六点零六万亿韩元 -BAC009S0902W0278 其中手机的销售额下降了七点百分之三 -BAC009S0902W0279 至二十五点五万亿韩元 -BAC009S0902W0280 在安卓手机的总体交付量中 -BAC009S0902W0281 价格高于六百美元的高端手机占比为百分之一 -BAC009S0902W0282 价格高于六百美元的高端手机 -BAC009S0902W0283 在安卓出货量中的占比减少到了百分之六 -BAC009S0902W0285 价格高于六百美元的占比从百分之七十增加到了百分之八十 -BAC009S0902W0287 苹果在高端市场击溃了对手 -BAC009S0902W0288 而这对于未来苹果保持高利润和利润率至关重要 -BAC009S0902W0289 这对苹果是一个利好 -BAC009S0902W0290 意味着只要用户要选择高端手机 -BAC009S0902W0291 在类似印度之类的新兴市场 -BAC009S0902W0292 因此从全球的角度来看 -BAC009S0902W0293 智能手机仍然有增长和爆发的空间 -BAC009S0902W0295 但是作为一个仍在成长没有其他对手的市场来讲 -BAC009S0902W0296 苹果已然可以单点突破 -BAC009S0902W0297 至于新的业务增长点 -BAC009S0902W0298 但是瑞士联合银行分析师估计 -BAC009S0902W0299 较最初的预期减少了一半左右 -BAC009S0902W0302 已经占有了全球智能手表市场的百分之五 -BAC009S0902W0303 云计算和大数据时代 -BAC009S0902W0305 紫光股份曾经出现一连波连续十六个一字涨停的狂飙行市 -BAC009S0902W0306 近日的走势也强于大盘 -BAC009S0902W0307 两个机构专用席位列于买一和卖二的位置 -BAC009S0902W0308 买卖前五名共计净出于该股六十二点九三万元 -BAC009S0902W0309 大盘股仍是毫无作为 -BAC009S0902W0310 题材股继续扮演黑马角色 -BAC009S0902W0311 紫光股份千九十三八在公布拓展云计算市场后 -BAC009S0902W0312 盘中有两千六百八十六万元资金净流入 -BAC009S0902W0313 给孩子买儿童电话手表有必要吗 -BAC009S0902W0314 消费者在听销售人员介绍小天才手表 -BAC009S0902W0315 消费者在听销售人员介绍小天才手表 -BAC009S0902W0316 消费者在听销售人员介绍小天才电话手表 -BAC009S0902W0317 很多家长都在给孩子购置各种学习用品 -BAC009S0902W0318 除了传统的书包文具以及辅导书外 -BAC009S0902W0319 这个儿童电话手表以其强大的定位通话微聊等功能 -BAC009S0902W0320 深受家长和儿童欢迎 -BAC009S0902W0321 很多孩子都以拥有一款电话手表为豪 -BAC009S0902W0322 而不少品牌的电话手表量销售量更是突破百万大关 -BAC009S0902W0323 电话手表对儿童健康安全是否有危险 -BAC009S0902W0324 老师是否允许孩子戴手表上学 -BAC009S0902W0325 电话手表应该如何选购 -BAC009S0902W0326 笔者进行了深度的了解 -BAC009S0902W0327 儿童电话手表到底有多火 -BAC009S0902W0328 年龄或大或小的孩子 -BAC009S0902W0330 都会目不转睛的盯着 -BAC009S0902W0331 或者跟着广告哼起歌曲来 -BAC009S0902W0332 随着产品快速进入家长和孩童的视野 -BAC009S0902W0333 每天的销量让你感受到儿童电话手表的火爆 -BAC009S0902W0334 对于如此火爆的市市场需求 -BAC009S0902W0335 来自广西的苏女士说家长对孩子安全的关心 -BAC009S0902W0336 是电话手表今年大受欢迎的主要原因 -BAC009S0902W0337 在电话手表出现之前 -BAC009S0902W0338 据悉他正在积极进修表演准备进入演艺圈 -BAC009S0902W0339 近日日本媒体曝出惊人消息 -BAC009S0902W0340 称高桥大辅可能在一段时间里出柜 -BAC009S0902W0341 公开自己的同性恋者身份 -BAC009S0902W0342 恐怕又要传来不少女粉丝心碎的声音了 -BAC009S0902W0343 高桥大辅堪称日本花样滑冰男单领域的领军人物 -BAC009S0902W0344 在他的职业生涯里曾在二零一零年拿到世锦赛金牌 -BAC009S0902W0345 温哥华冬奥会拿到铜牌 -BAC009S0902W0346 一二年总决赛拿到金牌 -BAC009S0902W0347 还曾经两次拿到了四大洲锦标赛的男单冠军 -BAC009S0902W0348 表示未来会进入演艺圈发展 -BAC009S0902W0349 颜值颇高的他今年四月远赴美国纽约 -BAC009S0902W0350 高桥大辅丝毫不加掩饰 -BAC009S0902W0351 他经常在社交网站公开美食等照片 -BAC009S0902W0352 看起来在美国过得很开心的样子 -BAC009S0902W0353 过去一直背负着日本花滑界的重压 -BAC009S0902W0354 终于得到了释放的样子 -BAC009S0902W0355 他每周二三天来学校 -BAC009S0902W0356 还有记者爆料说居住在纽约的日本人透露 -BAC009S0902W0357 高桥在当地过着奢华享乐的生活 -BAC009S0902W0358 如果真的想学习的话 -BAC009S0902W0359 就不会刻意选择位于纽约闹市区的这所大学 -BAC009S0902W0360 图片中他们一行人面对镜头尽显搞怪天赋 -BAC009S0902W0361 高桥大辅则是噘着嘴做出索吻的动作 -BAC009S0902W0362 外界认为这是一种另有深意的暗示 -BAC009S0902W0363 而对于他的好友小林尊 -BAC009S0902W0364 被认为日本体育界的相关人士称 -BAC009S0902W0365 但多年来关于他的形婚 -BAC009S0902W0366 实际上是同性恋者的传闻一直未停过 -BAC009S0902W0367 和澳洲鱼雷索普一样 -BAC009S0902W0368 高桥大辅因为其比赛风格的妖娆多变 -BAC009S0902W0369 多年来围绕其性取向的争论一直没有停息 -BAC009S0902W0370 退役前高桥大辅曾与花滑女神浅田真央传出恋情 -BAC009S0902W0371 身为上司而且已婚有儿女的桥本被指责涉嫌性侵 -BAC009S0902W0372 不过两位当事人双双否认性侵的说法 -BAC009S0902W0373 如今和小林尊出双入对 -BAC009S0902W0374 高调参加同性恋者的年度盛事 -BAC009S0902W0375 
有可靠消息称高桥很可能在近期正式宣布出柜 -BAC009S0902W0376 此消息一出迅速引发外界强烈关注 -BAC009S0902W0377 日本网友也是众说纷纭一点儿也不吃惊 -BAC009S0902W0378 看他在冰场上搔首弄姿地表现 -BAC009S0902W0379 高桥大辅应该是他的新欢 -BAC009S0902W0380 难怪他能接受年过半百的桥本的索吻 -BAC009S0902W0381 许多为高桥痴迷多年的女粉丝肯定深受打击 -BAC009S0902W0382 作为日本的花滑王子 -BAC009S0902W0383 这么多年一直要压抑自己的性取向 -BAC009S0902W0384 挺不容易的支持他追属属于自己的真正幸福 -BAC009S0902W0385 据美联社十日报道 -BAC009S0902W0386 一些参赛选手赛后感到胃部不适 -BAC009S0902W0387 而队医怀疑这或许与比赛地水污染有关 -BAC009S0902W0388 美国队官员不排除他们的队员因食物或饮水而生病 -BAC009S0902W0389 近来有关里约水污染问题备受关注 -BAC009S0902W0390 美联社公布的一项独立水质检测显示 -BAC009S0902W0391 在奥运会赛艇和铁人三项公开水域等比赛地 -BAC009S0902W0392 也存在高危病毒危险 -BAC009S0902W0393 该湖区也将是明年奥运会赛艇比赛地 -BAC009S0902W0394 比污染严重的瓜内巴拉湾相比 -BAC009S0902W0395 赛艇比赛所在湖区的水污染问题近年来得到改善 -BAC009S0902W0396 但是上周公布的水质检测显示 -BAC009S0902W0397 湖区水污染仍旧十分严重 -BAC009S0902W0398 在本次赛艇测试赛期间 -BAC009S0902W0399 一些参赛选手也向新华社记者表示 -BAC009S0902W0400 比赛地的湖水比较浑浊 -BAC009S0902W0401 但还是担心水质问题 -BAC009S0902W0402 来自中国的赛艇选手崔帅豪说 -BAC009S0902W0403 比赛地水不是太干净 -BAC009S0902W0404 他自己还将出任影片的男主角 -BAC009S0902W0405 忙碌成本可想而知 -BAC009S0902W0406 外媒发布了更令人兴奋的消息 -BAC009S0902W0407 将在本届美国电影学会影展中进行秘密放映 -BAC009S0902W0408 对方是二十五岁的人妻名模泰舒培 -BAC009S0902W0409 搜狐娱乐讯七月十五日 -BAC009S0902W0410 陈冠希前女友嫩模黄榕在香港书展出席写真宣传活动 -BAC009S0902W0411 身穿白色抹胸的她大秀性感好身材 -BAC009S0902W0412 谈及前男友陈冠希近日被指外貌衰老了不少 -BAC009S0902W0413 黄榕坦言可能他做了太多运动 -BAC009S0902W0414 搜狐娱乐讯日前 -BAC009S0902W0415 众星云集上海出席某商家的开业活动 -BAC009S0902W0416 由潮男陈冠希打头阵 -BAC009S0902W0417 更云集了罗中旭前任 -BAC009S0902W0418 黄宗泽绯闻女友等女星 -BAC009S0902W0419 现场气氛火爆 -BAC009S0902W0420 粉丝们一度失控 -BAC009S0902W0421 陈冠希坦言认为陈奕迅是k歌之王 -BAC009S0902W0422 但由于风格不同 -BAC009S0902W0423 新专辑音乐方面还是坚持做自己 -BAC009S0902W0424 搜狐娱乐讯九月五日 -BAC009S0902W0425 一怒之下把大叔身份证扔在地上 -BAC009S0902W0426 二人发生姓肢体冲突 -BAC009S0902W0427 此视频曝光后 -BAC009S0902W0428 网友纷纷力挺陈冠希 -BAC009S0902W0429 温兆伦许飞欧弟等明星也通过微博表示支持力挺 -BAC009S0902W0430 搜狐娱乐讯北京时间六月二十四日消息 -BAC009S0902W0431 渔船凶案嫌疑借发动机声将同船同事依次杀害 -BAC009S0902W0432 渔船海上爆炸沉没四名渔民漂流三天获救 -BAC009S0902W0433 昨天上午七点五零分 -BAC009S0902W0434 目前正在根据海事部门的要求开往盐城大分港 -BAC009S0902W0435 准备将获救的四人送上岸边医院救治 -BAC009S0902W0436 渔船海上被撞翻仅一人逃生同伴求救却无能为力 -BAC009S0902W0437 出事的渔船被拖到韩榆石桥海边 -BAC009S0902W0438 渔船被其他船撞翻六人死海事部门悬赏五万寻肇事者 -BAC009S0902W0439 快报讯通讯员李欢乐记者王晓宇八月二六日 -BAC009S0902W0440 船上八名船员六人不幸遇难 -BAC009S0902W0441 只有一名船员得以逃生 -BAC009S0902W0442 渝武高速武胜段发生追尾事故已造成六死九伤 -BAC009S0902W0443 记者从广安消防部门获悉 -BAC009S0902W0444 大客车的车头和车身损毁严重 -BAC009S0902W0445 车辆载有数十名乘客 -BAC009S0902W0446 截至九点四零分消防人员撤离时 -BAC009S0902W0447 已造成六人死亡九人受伤 -BAC009S0902W0448 目前记者正赶往武胜县人民医院 -BAC009S0902W0449 渝蓉高速四川段计划明年通车被称最拖沓高速 -BAC009S0902W0450 渝蓉高速四川段因烂尾被称为最拖沓高速 -BAC009S0902W0451 渝蓉高速四川段资金断裂烂尾已修了六年 -BAC009S0902W0452 渣土车右转弯骑车男童被卷入车轮下不幸身亡 -BAC009S0902W0453 肇事车及损伤严重的自行车报料人供图 -BAC009S0902W0454 渣土车挂倒电动车致一死一伤肇事车主逃逸 -BAC009S0902W0455 蚌飞市发生一起惨剧 -BAC009S0902W0456 一对男女骑电动车在通过一个十字路口时 -BAC009S0902W0457 被同方向行驶的一辆渣土车挂倒 -BAC009S0902W0458 骑电动车男子当场死亡 -BAC009S0902W0459 但渣土车司机肇事后不仅没有下车救援 -BAC009S0902W0460 目前当地警方正在追查这名司机 -BAC009S0902W0461 渣土车撞进路边民房女子抱小孩幸运逃生 -BAC009S0902W0462 山水湾小区斜对面的一处工地旁 -BAC009S0902W0463 肇事的大卡车车头仍然卡在工房内 -BAC009S0902W0464 图记者陈斌潇湘晨报长沙讯一零月一一日下午 -BAC009S0902W0465 长沙县湘龙西路一处十字路口 -BAC009S0902W0466 一辆红色的卡车和一辆黄色的渣土车发生碰撞 -BAC009S0902W0467 黄色渣土车一头撞进了路边的工房 -BAC009S0902W0468 被撞废的奔驰昨日二二时左右 -BAC009S0902W0469 省城政务区习友路与怀宁路交叉口 -BAC009S0902W0470 一辆渣土车突然冲向逆向车道 -BAC009S0902W0471 连续撞了五辆小轿车最终才停了下来 -BAC009S0902W0472 其中一辆奔驰轿车被撞出近一零米远 -BAC009S0902W0473 渤海一渔船沉没船上一六人落水一二人失踪 -BAC009S0902W0474 唐山乐亭一船队在渤海与一山东渔船发生纠纷 -BAC009S0902W0475 导致唐山一渔船沉没 -BAC009S0902W0476 但因海上风大浪急影响救援 -BAC009S0902W0477 目前仍未发现失踪船员 -BAC009S0902W0478 渤海垃圾成堆变死海 -BAC009S0902W0479 与韩国西海相连的中国渤海湾由于垃圾堆积 -BAC009S0902W0480 有人忧虑渤海湾的污染会直接影响到韩国西部海域 -BAC009S0902W0481 
渤海失事河北籍渔船已致四人遇难仍有八人失踪 -BAC009S0902W0482 又在船仓内发现四名船员遗体 -BAC009S0902W0483 目前仍有八名失踪人员下落不明 -BAC009S0902W0484 渤海湾溢油事故赔偿案宣判康菲公司被判赔一六八万 -BAC009S0902W0485 温岭倒塌厂房系违章建筑涉事负责人已被控制 -BAC009S0902W0486 据新华社电七月四日一六时许 -BAC009S0902W0487 浙江温岭市大溪镇发生鞋厂厂房倒塌事故 -BAC009S0902W0488 共造成一四人死亡三三人受伤 -BAC009S0902W0489 事故厂房系违章建筑 -BAC009S0902W0490 此前已被列入拆除范围 -BAC009S0902W0491 涉事两企业负责人均已被控制 -BAC009S0902W0492 温岭医院助理殴打女病人五年后提拔为副院长 -BAC009S0902W0493 法晚深度即时记者杜雯雯实习生张明明近日 -BAC009S0902W0494 此关于滕灵方此后晋升为副院长一事 -BAC009S0902W0495 该医院党委书记杨幼萍向晚报记者表示 -BAC009S0903W0121 进而选择此类方式购房 -BAC009S0903W0122 另外要处理公积金异地使用的问题 -BAC009S0903W0123 这对于目前一线城市来说很紧要 -BAC009S0903W0124 很多人受限购政策的影响 -BAC009S0903W0125 难以在周边城市用公积金购房 -BAC009S0903W0126 导致公积金资源闲置的问题出现 -BAC009S0903W0127 中新网房产频道 -BAC009S0903W0128 随着广州住房公积金贷款政策的调整实施 -BAC009S0903W0129 北上广深四个一线城市已经全部放开公积金房贷业 -BAC009S0903W0130 公积金新政加速楼市库存消化至搜狐财经 -BAC009S0903W0131 住建部等三部委联合发文 -BAC009S0903W0132 再次降低公积金贷款的门槛 -BAC009S0903W0133 还清首套房公积金贷款 -BAC009S0903W0134 再次申请公积金贷款购买第二套房的 -BAC009S0903W0135 该政策延续了去年新政以来 -BAC009S0903W0136 也延续了公积金担当扶持楼市主力军的政策选择 -BAC009S0903W0137 从去年三部委发文 -BAC009S0903W0138 公积金对楼市的扶持力度不断加大 -BAC009S0903W0139 相继有一百多个城市出台了公积金新政 -BAC009S0903W0140 公积金贷款利率也数次下调 -BAC009S0903W0141 二套还清十首套比例降至五成 -BAC009S0903W0142 与新政相比 -BAC009S0903W0143 目前公积金政策已经与去年等同了 -BAC009S0903W0144 此次公积金政策大力度调整 -BAC009S0903W0145 主要目的是通过激励改善型住房需求 -BAC009S0903W0146 实现三四线城市去库存 -BAC009S0903W0147 尽管全国商品房销售面积持续回升 -BAC009S0903W0148 但库存压力却难以缓减 -BAC009S0903W0149 全国商品房待售面积比七月末增加了五百万平方米 -BAC009S0903W0150 比去年底增加了七万平方米 -BAC009S0903W0151 库存逆势攀升的根本原因在于供求错配 -BAC009S0903W0152 推动全国成交面积止跌反弹 -BAC009S0903W0153 但供应和库存却主要集中在七个三四线城市 -BAC009S0903W0154 且待售库存单套面积较大 -BAC009S0903W0155 无论是降低二套房公积金首付比例 -BAC009S0903W0156 还是不再区分普通和非普通住房 -BAC009S0903W0157 都意在有针对性地加大三四线城市楼市库存消化力度 -BAC009S0903W0158 只有楼市库存真正消化了 -BAC009S0903W0159 才能提振开发商拿地和开工的积极性 -BAC009S0903W0160 在公积金利率已降至历史低位 -BAC009S0903W0161 站在金九银十即将来临的起点上 -BAC009S0903W0162 再次降低公积金贷款首付比例 -BAC009S0903W0163 目的也是为了夯实楼市回升的基础 -BAC009S0903W0164 尽管去年新政以来 -BAC009S0903W0165 楼市持续三个季度回升 -BAC009S0903W0166 回升势头有转弱的迹象 -BAC009S0903W0167 首先是重点城市楼市成交回落趋势明显 -BAC009S0903W0168 领头羊一线城市分别下降百分之一和百分之七 -BAC009S0903W0169 而重点城市的供应也在七月份下滑了百分之七 -BAC009S0903W0170 五月份更是增加了七百万平方米 -BAC009S0903W0171 银行房贷额度开始紧张 -BAC009S0903W0172 首套房贷利润优惠也开始减少 -BAC009S0903W0173 近期人民币贬值叠加资本外流预期 -BAC009S0903W0174 资金面紧张对楼市的影响开始显现 -BAC009S0903W0175 市场对金九银十的预期也开始谨慎起来 -BAC009S0903W0176 除了去库存和夯实楼市回升基础外 -BAC009S0903W0177 此次公积金政策调整 -BAC009S0903W0178 也在于全面落实分类调控因城施策 -BAC009S0903W0179 纠偏政策一刀切的负面影响 -BAC009S0903W0180 去年新政以来 -BAC009S0903W0181 松绑二套房贷认定标准降低二套房贷首付比例 -BAC009S0903W0182 以及营业税免征期 -BAC009S0903W0183 第一次在公积金上提出差别对待 -BAC009S0903W0184 包括上海广州厦门南京在内的重点城市 -BAC009S0903W0185 以及前几次公积金新政的实施 -BAC009S0903W0186 公积金可贷额度受到严重冲击 -BAC009S0903W0187 着力解决一批影响现代农业发展全局的重大科技问题 -BAC009S0903W0188 加快农业技术引进消化吸收再创新步伐 -BAC009S0903W0189 加强农业科技领域国际合作 -BAC009S0903W0190 调整优化农业科研布局 -BAC009S0903W0191 加强农业科研基地和重点实验室建设 -BAC009S0903W0192 完善农业科技创新体系和现代农业产业技术体系 -BAC009S0903W0193 启动实施农业科技创新能力建设工程 -BAC009S0903W0194 组建一批产业技术创新战略联盟和国家农业科技园区 -BAC009S0903W0195 完善农业科技评价机制 -BAC009S0903W0196 激发农业科技创新活力 -BAC009S0903W0197 大力发展现代农作物种业 -BAC009S0903W0198 实施好转基因生物新品种培育重大专项 -BAC009S0903W0199 加快发展生物育种战略性新兴产业 -BAC009S0903W0200 加快农业新品种新技术转化应用 -BAC009S0903W0201 加强小麦一喷三防喷施叶面肥 -BAC009S0903W0202 加快牲畜水产遗传改良进程 -BAC009S0903W0203 创新农业技术推广机制 -BAC009S0903W0204 大规模开展高产创建 -BAC009S0903W0205 在有条件地区实行整乡整县场推进 -BAC009S0903W0206 力争实现优势产区和主要品种全复盖 -BAC009S0903W0207 壮大农业农村人才队伍 -BAC009S0903W0208 以实施现代农业人才支撑计划为抓手 -BAC009S0903W0209 加大农村劳动力培训阳光工程实施力度 -BAC009S0903W0210 大力发展农业职业培养 -BAC009S0903W0211 加快技能型人才培养 -BAC009S0903W0212 支持高校毕业生和各类优秀人才投身现代农业建设 
-BAC009S0903W0213 鼓励外出务工农农民带技术带资金回乡创业 -BAC009S0903W0214 改善农业基础设备和装备条件 -BAC009S0903W0215 大规模开展高标准农田建设 -BAC009S0903W0216 按照统筹规划分工协作集中投入连片推进的思想 -BAC009S0903W0217 大规模改造中低产田 -BAC009S0903W0218 建设旱涝保收高标准农田 -BAC009S0903W0219 加快大中型灌区排灌泵站配套改造 -BAC009S0903W0220 大力开展小型农田水利建设 -BAC009S0903W0221 增加农田有效灌溉面积 -BAC009S0903W0222 加强新增千亿斤粮食生产能力规划的田间工程建设 -BAC009S0903W0223 完善机耕道农田防护林等设施 -BAC009S0903W0224 推广土壤有机质提升测土配方施肥等培肥地力技术 -BAC009S0903W0225 完善高标准农田建后管护支持政策和制度 -BAC009S0903W0226 延长各类设施使用年限 -BAC009S0903W0227 确保农田综合生产能力长期持续稳定提升 -BAC009S0903W0228 改善养殖业生产条件 -BAC009S0903W0229 加快实施生禽良种工程 -BAC009S0903W0230 支持生禽规模化养殖场小区开展标准化改造和建设 -BAC009S0903W0231 加快草原围栏棚圈和牧区水利建设 -BAC009S0903W0232 配套发展节水高效灌溉词草基地 -BAC009S0903W0233 健全水产良良种体系 -BAC009S0903W0234 开展池塘标准化改造 -BAC009S0903W0235 建设水产健康养殖示范场 -BAC009S0903W0236 加强渔港和渔政执法能力建设 -BAC009S0903W0237 全面落实农机具购置补贴各项管理制度和规定 -BAC009S0903W0238 加快推进水稻栽插收获和玉米收获机械化 -BAC009S0903W0239 重点突破棉花油菜甘蔗收获机械化瓶颈 -BAC009S0903W0240 大力发展高效植保机器 -BAC009S0903W0241 积极推进养殖业园艺业农产品初加工机械化 -BAC009S0903W0242 加快实施保护性耕作工程 -BAC009S0903W0243 提高大型农机具和农药化肥农膜等农资生产水平 -BAC009S0903W0244 加强农业防灾减灾能力建设 -BAC009S0903W0245 提高防汛抗旱减灾能力 -BAC009S0903W0246 加强种子饲草料等急救灾物资储备调运条件建设 -BAC009S0903W0247 推广相应的生产技术和防灾减灾措施大力推进农业标准化 -BAC009S0903W0248 以农兽药残留标准为重点 -BAC009S0903W0249 加快健全农业标准体系 -BAC009S0903W0250 以园艺产品生产品水产品等为重点 -BAC009S0903W0251 推行统一的标准操作规程和技术规范 -BAC009S0903W0252 加强国家级农业标准化整建制推进示范县场建设 -BAC009S0903W0253 市场占有率为百分之五 -BAC009S0903W0254 二零一四年三星期累计销售超过一百二十万块智能手表 -BAC009S0903W0255 这个数据不及苹果的一个季度 -BAC009S0903W0256 因此不能表示苹果没有新的业务增长点 -BAC009S0903W0257 本报记者纪佳鹏北京报道北京时间八月十二日 -BAC009S0903W0258 作为科技股领头羊的苹果股价当天下挫百分之二 -BAC009S0903W0259 十二月二日路透社报道 -BAC009S0903W0260 苹果股票每分钟交易量已超过六百七十万股 -BAC009S0903W0261 这种巨大且异乎寻常的抛售量 -BAC009S0903W0262 瞬间将苹果估价拉低了至少百分之六 -BAC009S0903W0263 使其市值分秒间蒸发近四百亿美元 -BAC009S0903W0264 成为苹果近三个月以来股价下跌最严重的一次 -BAC009S0903W0265 苹果股价一度每分钟跌幅已破百分之三 -BAC009S0903W0266 每股报价报收于一百一十一点二七美元 -BAC009S0903W0267 报收于每股一百一十五点四五美元 -BAC009S0903W0268 对于造成此次异常闪崩的原因目前尚未公布 -BAC009S0903W0269 此举或与摩根士丹利下调苹果股票持股比例有关 -BAC009S0903W0270 同时将苹果持股比例由百分之四下调至百分之三 -BAC009S0903W0271 并建议客户减少对该股票在投资组合中的占比 -BAC009S0903W0272 高频交易也与此次闪崩事件逃脱不了干系 -BAC009S0903W0273 高频交易一直饱受诟病 -BAC009S0903W0274 美国股市九点五十起 -BAC009S0903W0275 超过三百馀种不同类别股票均出现不正常股价波动 -BAC009S0903W0276 当出现此类价格变化时 -BAC009S0903W0277 通常只是算法交易造成的影响 -BAC009S0903W0278 也就是所说的流动性蒸发事实上 -BAC009S0903W0279 流动性从未得到足够的重视 -BAC009S0903W0280 我们当下的股市在流动性方面也表现得支离破碎 -BAC009S0903W0281 苹果领头的股价闪崩原因可能比想象中的更为复杂 -BAC009S0903W0282 现在就下结论将原因推给高频交易 -BAC009S0903W0283 这种做法很容易误导客服 -BAC009S0903W0284 阿里巴巴当日股价下跌一点百分之四 -BAC009S0903W0285 谷歌十点五八分股价也出现一点百分之七十九的最大跌幅 -BAC009S0903W0286 苹果股价闪崩只是正常股票套利的表现 -BAC009S0903W0287 苹果股价相较十月份低点已经上涨约百分之二十五 -BAC009S0903W0288 纳斯达克在此期间只涨了百分之十 -BAC009S0903W0289 选择套现或也是情理之中 -BAC009S0903W0290 每股下滑三点八八美元报收于一点一十五点零五美元 -BAC009S0903W0292 苹果股票每分钟交易量已超过六十七万股 -BAC009S0903W0293 这不仅创下苹果公司自二零一四年 -BAC009S0903W0294 苹果背后那行字应该在每个中国人心里搜狐科技 -BAC009S0903W0295 翻译过来就是加利福尼亚苹果公司设计 -BAC009S0903W0296 按说这只是一个客观表述 -BAC009S0903W0297 对于谋求转型发展怀揣创新型国家梦想的中国来说 -BAC009S0903W0298 这行字值得我们深思 -BAC009S0903W0299 众多跨国品牌在中国都有生产基地 -BAC009S0903W0300 像苹果这样在产品上强调在本国设计的很少 -BAC009S0903W0301 这样的做法当然是企业行为 -BAC009S0903W0302 这行字对于我们来说 -BAC009S0903W0303 很多家长都考虑给孩子配置具有定位功能的智能手机 -BAC009S0903W0304 智能手机特定的上网和游戏功能 -BAC009S0903W0305 注定了它强烈的娱乐性 -BAC009S0903W0306 给孩子配置智能手机 -BAC009S0903W0307 担心会直接影响孩子正常的学习 -BAC009S0903W0308 儿童电话手表除了通话定位等功能外 -BAC009S0903W0309 还针对性的设置了上课禁用等功能 -BAC009S0903W0310 孩子带到学校既不会让孩子分心 -BAC009S0903W0311 又可以让家长了解孩子的位置 -BAC009S0903W0312 是很多父母迫切需要的 -BAC009S0903W0313 对于小天才电话手表上课禁用功能 -BAC009S0903W0314 相关人员表示为了方便和孩子保持联系 -BAC009S0903W0315 之前很多家长会给孩子买手机 -BAC009S0903W0316 影响学习虽说功能手机可以阻止孩子玩游戏 
-BAC009S0903W0317 儿童电话手表正好解决了这两个问题 -BAC009S0903W0318 家长随时和孩子保持联系 -BAC009S0903W0319 我就给自己的孩子也买了一个呢 -BAC009S0903W0320 失孤等影片的上映 -BAC009S0903W0321 也将儿童人身安全的话题推向了妙论的风口浪尖 -BAC009S0903W0322 儿童电话手表的诞生 -BAC009S0903W0323 为孩子多了一份强有力的保障 -BAC009S0903W0324 电话手表正是瞄准了这一需求 -BAC009S0903W0325 加上随身携带的便捷性和流畅的操作体验 -BAC009S0903W0326 在手机平板电脑之外 -BAC009S0903W0327 开扩了一个新的市场 -BAC009S0903W0328 现在三百六十腾讯等大公司都涉足了这一领域 -BAC009S0903W0329 自今年六月电话手表行业兴起起来 -BAC009S0903W0330 整体行业出货量应该不断突破 -BAC009S0903W0331 并将成为新兴的销售热点 -BAC009S0903W0332 科技创新带动了电话手表行业 -BAC009S0903W0333 其实儿童电话手表的火 -BAC009S0903W0334 是火在行业的科技创新 -BAC009S0903W0335 随着国家在科技创新方面的投入和关注度的增加 -BAC009S0903W0336 新兴行业对于创新的热情也不断增加 -BAC009S0903W0337 我们小天才电话手表就是不断创新的成果 -BAC009S0903W0338 意大利选手弗菜戈也说我们在来里约之前 -BAC009S0903W0339 看到了有关这里水污染的报道 -BAC009S0903W0340 对这里的水质比较关心 -BAC009S0903W0341 这个湖虽然没有漂浮的垃圾 -BAC009S0903W0342 但湖水很脏也很浑浊 -BAC009S0903W0343 里约奥组委此前表示 -BAC009S0903W0344 运动员的健康是他们关注的头等大事 -BAC009S0903W0345 无论帆船赛艇还是公开水域 -BAC009S0903W0346 在奥运期间水质都可以保证运动员的健康 -BAC009S0903W0347 二零一五年九月十二日星期六十一点 -BAC009S0903W0348 开幕式举行了庄严的入场仪式 -BAC009S0903W0349 裁判员队伍和参赛代表队依次入场亮相 -BAC009S0903W0350 裁判员代表和运动员代表进行了宣誓 -BAC009S0903W0351 曾春蕾和刘晓彤向各参赛队赠送了签名排球 -BAC009S0903W0352 北京市体育局副局长孙学才宣布比赛开幕 -BAC009S0903W0353 响应北京市振兴三大球战略的号召 -BAC009S0903W0354 促进北京排球事业发展 -BAC009S0903W0355 丰富北京市业馀排球群体活动 -BAC009S0903W0356 激发广大群众对排球的热情 -BAC009S0903W0357 为将其打造成具有影响力的群众性品牌赛事 -BAC009S0903W0358 在社会主义核心价值观的指引下 -BAC009S0903W0359 突出弘扬北京排球文化 -BAC009S0903W0360 组委会在部门设置上调整了人员分工 -BAC009S0903W0361 组委会工作机构共分为四部一室 -BAC009S0903W0362 并且全部采用有经验的工作人员参与竞赛组织工作 -BAC009S0903W0363 在制定竞赛规程方面严格遵循规范化专业化原则 -BAC009S0903W0364 不仅能够将业馀排球与职业排球严格地区分开 -BAC009S0903W0365 而且满足了绝大多数业馀排球爱好者的参赛需求 -BAC009S0903W0366 充分做到公平公正公开 -BAC009S0903W0367 其中国际级裁判员两名 -BAC009S0903W0368 结合业馀排球特点做出细微调整制定而成 -BAC009S0903W0369 营造出良好的比赛氛围 -BAC009S0903W0370 悬挂于场馆醒目位置 -BAC009S0903W0371 增强参赛者的荣誉感与积极性的同时 -BAC009S0903W0372 进一步提升了比赛品质 -BAC009S0903W0373 要将北京市业馀排球联赛打造成群众性品牌赛事 -BAC009S0903W0374 离不开广大媒体的支持 -BAC009S0903W0375 组委会特意举办隆重的开幕式 -BAC009S0903W0376 并邀请京城排球界全部媒体参加报道 -BAC009S0903W0377 并制作了精美的秩序册发给媒体及参赛队 -BAC009S0903W0379 以大球套小球为设计理念 -BAC009S0903W0380 为振兴三大球贡献自己的一份力量 -BAC009S0903W0381 他们的造型由排球的五个经典动作组成 -BAC009S0903W0382 分别是发接传垫扣 -BAC009S0903W0383 颜色则是由代表着运动精神的奥运五环色组成 -BAC009S0903W0384 来自全国各地的业馀排球爱好者纷纷前来踊跃报名 -BAC009S0903W0385 半个月的报名期限未到 -BAC009S0903W0386 二四个参赛名额就已经全部报满 -BAC009S0903W0387 共有三百二十三名业馀排球爱好者报名参加比赛 -BAC009S0903W0388 其中年龄最小的年仅十四岁 -BAC009S0903W0389 最大的已经年过半百 -BAC009S0903W0390 另外还有两名来自加拿大和美国的外籍华侨 -BAC009S0903W0391 由此可见北京市业馀排球联赛的影响力与号召力 -BAC009S0903W0392 在参赛的二四支队伍中 -BAC009S0903W0393 有一些临时组建的球队 -BAC009S0903W0394 但大部分都是常年活跃在业馀排球圈里成熟队球 -BAC009S0903W0395 而且多次参加过业馀排球比赛 -BAC009S0903W0396 相信有这些高水平业馀排球队的参与 -BAC009S0903W0397 这一届北京市业馀排球联赛一定会精彩纷呈 -BAC009S0903W0398 为期五天的比赛全部结束后 -BAC009S0903W0399 将举行隆重的颁奖仪式 -BAC009S0903W0400 从四分之一决赛开始每场比赛评选出一名优秀运动员 -BAC009S0903W0401 为参加联赛的吸引力 -BAC009S0903W0402 提升参赛队的积极性 -BAC009S0903W0403 组委会提高了前三名的含金量 -BAC009S0903W0404 这也是该片首次亮相大荧幕 -BAC009S0903W0405 影片的正式公映要到圣诞节当天 -BAC009S0903W0406 但本月评论界就可以知道该片的真实成色 -BAC009S0903W0407 曾在二零一零年获得空前成功 -BAC009S0903W0408 据香港媒体报道 -BAC009S0903W0409 因参演剧集殭而与陈嘉宝及赖慰玲成为好姐妹 -BAC009S0903W0410 众人一起为寿寿星女庆生 -BAC009S0903W0411 陈嘉宝昨天六月二十三日将大合照上传个人主页 -BAC009S0903W0412 除了看见陈嘉宝及赖慰玲外 -BAC009S0903W0413 亮点正是与陈凯琳互相了解中郑嘉颖也有出席 -BAC009S0903W0414 并做陈凯琳背后的男人 -BAC009S0903W0415 网友纷纷将焦点转移到这对情侣身上 -BAC009S0903W0416 中新网七月二十八日电据香港明报消息 -BAC009S0903W0417 陈凯琳田心妮等出席新剧开机机仪式 -BAC009S0903W0418 谈及此前她曾到横店探班郑嘉颖 -BAC009S0903W0419 因为新剧的厂景和外景推迟了 -BAC009S0903W0420 才有时间去探班 -BAC009S0903W0421 在当地逗留了三四天 -BAC009S0903W0422 自己也有带剧本去看 -BAC009S0903W0423 搜狐娱乐讯北京时间十月二十六日消息 -BAC009S0903W0424 据香港媒体报导 
-BAC009S0903W0425 昨晚张保仔播映大结局故演员齐集饭局以及庆祝 -BAC009S0903W0426 陈展鹏风骚到场 -BAC009S0903W0427 他要赶进厂开工 -BAC009S0903W0428 因此开香槟后要先离场 -BAC009S0903W0429 一直传他跟洪永城不和 -BAC009S0903W0430 两人在台下分枱坐欠交流 -BAC009S0903W0431 公安局的决定书说不对他做出行政处罚 -BAC009S0903W0432 我们才按照正常程序给他转为副院长一职的 -BAC009S0903W0433 温岭鞋厂倒塌事故已一四人遇难鞋厂老板被控制 -BAC009S0903W0434 新京报快讯记者杨锋七月四日下午四时零八分 -BAC009S0903W0435 浙江台州温岭市一一零指挥中心接警称 -BAC009S0903W0436 新京报记者从温岭市政府新闻办获悉 -BAC009S0903W0437 早前通报的五名失联人员已全部找到 -BAC009S0903W0438 死亡人数上升至一四人 -BAC009S0903W0439 涉事企业老板已被警方控制 -BAC009S0903W0440 温州二零位面包师制出二五米蛋糕或申报吉尼斯纪录 -BAC009S0903W0441 前往温州龙湾万达广场游玩的市民 -BAC009S0903W0442 无不发出这样的惊叹 -BAC009S0903W0443 一糕点店派出二零位面包师傅 -BAC009S0903W0444 耗时一四个小时打造的二米五超长蛋糕 -BAC009S0903W0445 吸引众多市民驻足观看 -BAC009S0903W0446 温州二名已婚男为争美女驾豪车互撞四个回合 -BAC009S0903W0447 车子被撞得破烂不堪七月四日凌晨 -BAC009S0903W0448 宝马奔驰连续四次相撞 -BAC009S0903W0449 两车驾驶员一度下车大打出手 -BAC009S0903W0450 起因是为了一名年轻的刘姓美女 -BAC009S0903W0451 经保险公司初步估算 -BAC009S0903W0452 两车损失高达三四十万元 -BAC009S0903W0453 温州七人涉嫌百倍抬杠非法经营期货三二亿被批货 -BAC009S0903W0454 浙江温州一公司安装虚拟交易系统 -BAC009S0903W0455 以一一零倍的杠杆吸引社会公众投资 -BAC009S0903W0456 非法经营期货金额共计人民币三二亿元 -BAC009S0903W0457 七名犯罪嫌疑人因涉嫌非法经营罪被批准逮捕 -BAC009S0903W0458 温州城管掌掴女清洁工已被停职检查 -BAC009S0903W0459 温州天价馒头续店方称顾客要狭索赔三条中华烟 -BAC009S0903W0460 荞麦窝窝头一零月二零日 -BAC009S0903W0461 网络上一张永嘉桥头国际饭店的结帐单十分引人注目 -BAC009S0903W0462 菜单显示该饭店的荞麦窝窝头卖三八元一个 -BAC009S0903W0463 三零馀位食客吃了四五个窝窝头 -BAC009S0903W0464 发现事情并没有这么简单 -BAC009S0903W0465 温州火锅先生后续涉案者父亲写公开道歉信 -BAC009S0903W0466 温州网八月二十七日讯记者项锐见习记者黄梦思 -BAC009S0903W0467 温州一七月大女童接种疫苗抽搐省疾控专家调查 -BAC009S0903W0468 金报讯记者蓝莹九月一一日上午 -BAC009S0903W0469 随即被送到儿童医院进行救治 -BAC009S0903W0470 经过连续三天的抢救 -BAC009S0903W0471 孩子仍处于昏迷阶段 -BAC009S0903W0472 省市区三三级疾控部门专家已介入调查 -BAC009S0903W0473 温州一中学门口氢气罐爆炸卖气球摊贩不治身亡 -BAC009S0903W0474 温州一住持被免政协委员遭准儿媳举报娶妻开路虎 -BAC009S0903W0475 关于中国嵩山少林寺方丈齐永信的举报风波尚未停歇 -BAC009S0903W0476 因准儿媳的举报跌下神坛 -BAC009S0903W0477 位于温州苍南龙港镇水门村的一个仓库发生火灾 -BAC009S0903W0478 记者从消防部门处了解到 -BAC009S0903W0479 这里存放着乙酯和工业酒精等化工品 -BAC009S0903W0480 温州一夫妻非法集资五亿丈夫将赃款送给情妇洗钱 -BAC009S0903W0481 温州一女士洗澡被已婚男多次偷窥二年后才报警 -BAC009S0903W0482 温州网讯洗澡被偷窥却一忍再忍 -BAC009S0903W0483 但愿望总是照不进现实 -BAC009S0903W0484 称一名男子时常偷窥自己洗澡 -BAC009S0903W0485 且时间已长达两年多了 -BAC009S0903W0486 温州一家六口吃毒蘑菇身亡女婴拒吃面躲过死神 -BAC009S0903W0487 温州一家六口误食毒菌五人已死亡 -BAC009S0903W0488 温州永嘉县桥下镇吴山村的潘老伯一家六口 -BAC009S0903W0489 半个月前因误食有毒野生菌中毒 -BAC009S0903W0490 至七月一六日中午已有五人去世 -BAC009S0903W0491 潘老伯二六岁的外孙还在医院重症监护室治疗 -BAC009S0903W0492 仍处于深度昏迷状态 -BAC009S0903W0493 温州一村主任台风夜转移群众其妻子遇难 -BAC009S0903W0494 石柱村泥石流塌方现场 -BAC009S0903W0495 温州一男子在医院放置假炸弹被判处有期徒刑一年 -BAC009S0904W0121 为了解决额度荒的问题 -BAC009S0904W0122 近期广州和南京已经开始启动公转商贷款贴息模式 -BAC009S0904W0123 即由商业银行向市民发放执行公积金利率的贷款 -BAC009S0904W0124 公积金贷款与商业贷款之间的利息差额 -BAC009S0904W0125 由公积金中心向银行支付 -BAC009S0904W0126 重点城市公积金贷额款度也有限制 -BAC009S0904W0127 比如北京和上海家庭最高贷款额度均为一百万元 -BAC009S0904W0128 广州和深圳分别为五万元和七万元 -BAC009S0904W0129 在近期房价持续回升的背景下 -BAC009S0904W0130 多数二线城市和三四线城市 -BAC009S0904W0131 公积金贷款上限能够复盖单套房总价 -BAC009S0904W0132 这些城市公积金贷款买房的比例也比较高 -BAC009S0904W0133 此次政策调整也有较好的针对性 -BAC009S0904W0134 包括此次三部委发布公积金新政 -BAC009S0904W0135 再结合近期限外政策全面松绑 -BAC009S0904W0136 具有实时性合理性和较强的针对性 -BAC009S0904W0137 有助于发挥内需在稳增长中的积极作用 -BAC009S0904W0138 住建部等三部委联合发文 -BAC009S0904W0139 再次降低公积金贷款的门槛 -BAC009S0904W0140 还清首套房公积金贷款 -BAC009S0904W0141 在公积金贷款额度上调后一个月内 -BAC009S0904W0142 北京公积金贷款成交量上涨百分之五 -BAC009S0904W0143 中小户型住房去化速度明显加快 -BAC009S0904W0144 北京公积金贷款额度提高 -BAC009S0904W0145 虽有助于使刚需购房者长期受益 -BAC009S0904W0146 但仍存在七万最高贷款额申请难等落地问题 -BAC009S0904W0147 公积金政策放宽对楼市成交的短期刺激作用将难以持续 -BAC009S0904W0148 各地公积金政策步入频繁调整期 -BAC009S0904W0149 一向严格收紧购房政策的北京也加入此阵营 -BAC009S0904W0150 据中新网房产频道梳理 -BAC009S0904W0151 北京针对公积金的调整次数便达到五次 -BAC009S0904W0152 美丽北京大型绿色公益品牌项目 
-BAC009S0904W0153 调整公积金年度缴存上下限和缴存比例 -BAC009S0904W0154 放宽公积金贷款二套房认定标准 -BAC009S0904W0155 将公积金贷款利率下调 -BAC009S0904W0156 公积金政策的调整从未这么频繁 -BAC009S0904W0157 从一系列公积金政策看来 -BAC009S0904W0158 扶持刚需客群已经成为北京房地产调控的主要方向 -BAC009S0904W0159 未来政策层面将继续保持宽松 -BAC009S0904W0160 在上海易居房地产研究院研究员严跃进看来 -BAC009S0904W0161 这一系列公积金政策的放宽 -BAC009S0904W0162 与目前房地产救市的市场导向相吻合 -BAC009S0904W0163 盘活各地公积金资源 -BAC009S0904W0164 年初选择使用公积金贷款的购房者占比环比增多 -BAC009S0904W0165 伟嘉安捷提供数据显示 -BAC009S0904W0166 七月北京公积金贷款成交量提升了百分之五 -BAC009S0904W0167 贷款需求将在下个月继续释放 -BAC009S0904W0168 北京七月楼市的成交情况 -BAC009S0904W0169 也佐证了公积金政策放宽刺激作用的显现 -BAC009S0904W0170 在总成交中占比环比增加五个百分点 -BAC009S0904W0171 且低于七十平米的小户型住房成交明显上升 -BAC009S0904W0172 北京调整首套房公积金贷款最高额度 -BAC009S0904W0173 伟业我爱我家集团副总裁胡景晖分析 -BAC009S0904W0174 刚需人群观望心理正逐步消散 -BAC009S0904W0175 开始加速进入新房市场 -BAC009S0904W0176 公积金政策的放宽对二手房市场也产生了影响 -BAC009S0904W0177 据伟业我爱我家市场研究院测算 -BAC009S0904W0178 在过去六个月中处于高点 -BAC009S0904W0179 虽然刚需购房者入市积极性有所提升 -BAC009S0904W0180 但不少业内人士认为 -BAC009S0904W0181 这一刺激作用并不会长时间延续 -BAC009S0904W0182 上调公积金贷款额度对市场的刺激是短期的 -BAC009S0904W0183 公积金短期拉动的购房需求有限 -BAC009S0904W0184 更多是原本计划购房的客群享受到了政策利好 -BAC009S0904W0185 原本短期内不考虑购房的客群 -BAC009S0904W0186 在这一政策出台后匆忙购房 -BAC009S0904W0187 加强农产品质量安全监管 -BAC009S0904W0188 建立协调配合检打联动联防联控应急处置机制 -BAC009S0904W0189 实行农产品产地安全分级管理 -BAC009S0904W0190 推动农产品生产加工和流通企业建立诚信制度 -BAC009S0904W0191 提高农业产业化和规模化经营水平 -BAC009S0904W0192 推进农业产业化经营跨越式发展 -BAC009S0904W0193 制定扶持农业产业化龙头企业发展的综合性政策 -BAC009S0904W0194 启动实施农业产业化经营跨越发展行动 -BAC009S0904W0195 按照扶优扶大扶强的原则 -BAC009S0904W0196 依托农产品加工物流等各类农业园区 -BAC009S0904W0197 选建一批农业产业化示范基地 -BAC009S0904W0198 推进龙头企业集群发展 -BAC009S0904W0199 引导龙头企业采取兼并重组参股收购等方式 -BAC009S0904W0200 支持龙头企业跨区域经营 -BAC009S0904W0201 提升产品研发精深加工技术水平和装备能力 -BAC009S0904W0202 鼓励龙头企业采取参股合作等方式 -BAC009S0904W0203 与农户建立紧密型利益联联结关系 -BAC009S0904W0204 强化农民专业合作社组织带动能力 -BAC009S0904W0205 广泛开展示范社建设行动 -BAC009S0904W0206 加大合作社经营管理人员培训培养力度 -BAC009S0904W0207 加强合作社辅导员队伍建设 -BAC009S0904W0208 支持农民专业合作社参加农产品展示展销活动 -BAC009S0904W0209 建立稳定的产销关系 -BAC009S0904W0210 鼓励农民专业合作社开展信用合作 -BAC009S0904W0211 在自愿基础上组建联合社 -BAC009S0904W0212 提高生产经营和市场开拓能力 -BAC009S0904W0213 扶持合作社建设农产品仓储冷藏初加工等设施 -BAC009S0904W0214 发展多种形式的适度规模经营 -BAC009S0904W0215 在依法自愿有偿和加强服务基础上 -BAC009S0904W0216 完善土地承包经营权流转市场 -BAC009S0904W0217 发展多种形式的规模化专业化生产经营 -BAC009S0904W0218 引导土地承包经营权向生产和经营能手集中 -BAC009S0904W0219 大力培育和发展种养大户家庭农牧场 -BAC009S0904W0220 实施一村一品强村富民工程 -BAC009S0904W0221 大力发展农业社会化服务 -BAC009S0904W0222 增强农业公益性服务能力 -BAC009S0904W0223 加快基层农技推广体系改革和建施 -BAC009S0904W0224 健全公益性农业技术推广服务体系 -BAC009S0904W0225 加强农业有害生物监测预警和防控能力建设 -BAC009S0904W0226 加强农业资源和生态环境保护 -BAC009S0904W0227 继续实行最严格的耕地保护制度 -BAC009S0904W0228 确保耕地保有量保持在十亿亩 -BAC009S0904W0229 基本农田不低于十亿亩 -BAC009S0904W0230 科学保护和合理利用水资源 -BAC009S0904W0231 大力发展节水增效农业 -BAC009S0904W0232 继续建设国家级旱作农业示范区 -BAC009S0904W0233 坚持基本草原保护制度 -BAC009S0904W0234 推行禁牧休牧和划区轮牧 -BAC009S0904W0235 实施草原保护重大工程 -BAC009S0904W0236 加大水生生物资源养护力度 -BAC009S0904W0237 强化水生生态修复和建设 -BAC009S0904W0238 加强畜禽遗传资源和农业野生植物资源保护 -BAC009S0904W0239 加强农业生态环境治理 -BAC009S0904W0240 鼓励使用生物农药高效低毒低残留农药和有机肥料 -BAC009S0904W0241 回收再利用农膜和农药包装物 -BAC009S0904W0242 加快规模养殖场粪污处理利用 -BAC009S0904W0243 治理和控制农业面源污染 -BAC009S0904W0244 培育门类丰富层次齐用的综合利用产业 -BAC009S0904W0245 建立秸秆禁烧和综合利用的长效机制 -BAC009S0904W0246 继续实施农村沼气工程 -BAC009S0904W0247 大力推进农村清洁工程建设 -BAC009S0904W0248 清洁水源田园和家园 -BAC009S0904W0249 大力推进农业节能减排 -BAC009S0904W0250 树立绿色低碳发展理念 -BAC009S0904W0251 积极发展资源节约型和环境友好型农业 -BAC009S0904W0252 淘汰报废高耗能老旧农业机械 -BAC009S0904W0253 应该也是个提醒中国再也不能仅仅满足于组装了 -BAC009S0904W0254 我们在科技创新方面的进步非常显着 -BAC009S0904W0255 一项项领先世界的科技成果 -BAC009S0904W0256 不断刷新中国创造的精度高度深度 -BAC009S0904W0257 成为一个个响亮的中国品牌 -BAC009S0904W0258 我们的自主创新能力还不够强 -BAC009S0904W0259 
与世界先进水平相比还有明显差距 -BAC009S0904W0260 特别是企业自主创新方面 -BAC009S0904W0261 具有重大影响的科技产品还不是很多 -BAC009S0904W0262 与世界第二经济大国的地位还不相称 -BAC009S0904W0263 希望中国品牌在国际市场的知名度和影响力越来越大 -BAC009S0904W0264 中国人从来不缺乏创新创造的基因 -BAC009S0904W0265 创新是中华民族最鲜明的禀赋 -BAC009S0904W0266 我们完全有理由树立创新自信 -BAC009S0904W0267 上一次工业革命我们落在了西方发达国家后面很远 -BAC009S0904W0268 面对以网络和数字技术为标志的信息技术发展 -BAC009S0904W0269 我们迎来了赶超发达国家的难得机遇 -BAC009S0904W0270 我国拥有近一四亿人口 -BAC009S0904W0271 手机网民近五点六亿 -BAC009S0904W0272 这样的规模没有任何一个国家可以比拟 -BAC009S0904W0273 他们的消费需求是拉动创新创业的巨大牵引力 -BAC009S0904W0274 规模超大的人才群体更是创新创造无与伦比的重要资源 -BAC009S0904W0275 我国经济发展进入新常态 -BAC009S0904W0276 双目标不仅包括保持中高速增长 -BAC009S0904W0277 还包括迈向中高端水平 -BAC009S0904W0278 我国的经济处在爬坡过坎的重要关口 -BAC009S0904W0279 我们也许不用像以前那样为了追求某个数字赶紧赶慢了 -BAC009S0904W0280 但松一口气的想法是没有出路的 -BAC009S0904W0281 恰恰更需要我们有所作为 -BAC009S0904W0282 就是在创新驱动上下功夫 -BAC009S0904W0283 在转型发展上下功夫 -BAC009S0904W0284 不断提高技术创新对经济发展的贡献率 -BAC009S0904W0285 如果说过去的这些年 -BAC009S0904W0286 我们成为世界工厂是不可逾越的发展阶段 -BAC009S0904W0287 那么未来的五年十年二十年 -BAC009S0904W0288 我们肯定不能再沾沾自喜于世界工厂 -BAC009S0904W0289 也不能一直被贴上中国组装的标签 -BAC009S0904W0290 长期处在产业链的末端 -BAC009S0904W0291 期待着越来越多中国设计的产品不断涌现并享誉国际 -BAC009S0904W0292 未必印在每个产品上 -BAC009S0904W0293 但应刻在每个中国企业家甚至每个中国人心里 -BAC009S0904W0295 中国经营网注有国外媒体报道称 -BAC009S0904W0296 苹果市场价值达到七千亿美元刚刚过去几个月 -BAC009S0904W0297 已经有股票经纪公司预测 -BAC009S0904W0298 那么苹果能突破一万亿大关吗 -BAC009S0904W0299 苹果公司上次发布全新产品是在五年以前 -BAC009S0904W0301 苹果的目标股价也开始相应地上涨 -BAC009S0904W0302 苹果的市值可能将突破万亿美元 -BAC009S0904W0303 现在市面上的电话手表功能最主要有两个通话和定位 -BAC009S0904W0304 儿童电话手表还推出了其他更多人性化的创新功能 -BAC009S0904W0305 对手表的大力普及也起到了至关重要的作用 -BAC009S0904W0306 以小天才电话手表为例 -BAC009S0904W0307 除了能和手机一样接打电话 -BAC009S0904W0308 做到全方位亲子沟通 -BAC009S0904W0310 击掌成为加好友等功能也一应俱全 -BAC009S0904W0311 电话手表就相当于一部简化的智能手机 -BAC009S0904W0312 主要在于将通信和定位的模块大大缩小到方寸之间 -BAC009S0904W0313 置入只有手机几分之一大小的手表表盘 -BAC009S0904W0314 还要保证与手机一样的通话质量呢 -BAC009S0904W0315 这是摆在行业面前最大的技术难题 -BAC009S0904W0316 小天才产品负责人表示 -BAC009S0904W0317 以小天才电话手表为例 -BAC009S0904W0318 公司超百位研发人员历经半年多时间 -BAC009S0904W0319 投入巨资研究经费攻关 -BAC009S0904W0320 最后找到芬兰的高级技术团队 -BAC009S0904W0321 才解决电话手表的内线内置问题 -BAC009S0904W0322 对于这种突破性的天线内置方案 -BAC009S0904W0323 我们进行了极为严谨的测试 -BAC009S0904W0324 确保信号与手机相当才真正投放市场 -BAC009S0904W0325 对于创新成果的实证和检验 -BAC009S0904W0326 电话手表对儿童安全吗 -BAC009S0904W0327 儿童电话手表的辐射对儿童的健康安全是否存在隐患呢 -BAC009S0904W0328 这种说法到底有无科学依据呢 -BAC009S0904W0329 关于手机等产品的辐射问题 -BAC009S0904W0330 任何家用电器只要通电就会产生电磁辐射 -BAC009S0904W0331 大到空调电视机电脑微波炉加湿器 -BAC009S0904W0332 小到吹风机充电器甚至接线板都会产生电磁辐射 -BAC009S0904W0333 虽然电磁辐射无处不在 -BAC009S0904W0334 并非所有的电磁辐射都会对人体产生危害 -BAC009S0904W0335 中国电力科学研究院高级工程师邬雄表示 -BAC009S0904W0336 比如阳光也是一种电磁辐射 -BAC009S0904W0337 根据国际非电离辐射防护委员会制定的标准 -BAC009S0904W0338 北京市业馀排球联赛未来每年都将举办一届 -BAC009S0904W0339 并且会逐渐扩大比赛规模 -BAC009S0904W0340 筹备时间和比赛周期都将延长 -BAC009S0904W0341 参赛队伍数量也会有所提升 -BAC009S0904W0342 明年北京市业馀排球联赛将在中国排球协会备案 -BAC009S0904W0343 北京市排球协会与天津排协已经初步达成合作意向 -BAC009S0904W0344 今后北京与天津两地可能会联合办赛 -BAC009S0904W0345 通过冠军赛季后赛垫场赛等形式 -BAC009S0904W0346 通过未来几年的发展 -BAC009S0904W0347 影响力强的全国性比赛 -BAC009S0904W0348 高清图女排凯旋郎平受热捧 -BAC009S0904W0349 时隔一二年重夺世界杯冠军的中国女排 -BAC009S0904W0350 新队长曾春蕾揭秘了角色转变前后的幕后故事 -BAC009S0904W0351 并且介绍自己是如何通过实战调整状态而渐入佳境的 -BAC009S0904W0352 后两轮死磕俄罗斯和日本更是有红了眼的感觉 -BAC009S0904W0353 回忆起当时临危受命接班队长一职的情况 -BAC009S0904W0354 曾春蕾介绍是在中国女排出发的前一天 -BAC009S0904W0355 主教练郎平训练结束后通知她的 -BAC009S0904W0356 当时确实没有什么心理准备 -BAC009S0904W0357 虽然知道惠若琪的心脏不太好 -BAC009S0904W0358 但是也不好过问太多 -BAC009S0904W0359 结果等到的消息是她不能去世界杯 -BAC009S0904W0360 其实在二零一四年女排大奖赛的总决赛 -BAC009S0904W0361 曾春蕾就曾经临时客串过队长职务 -BAC009S0904W0362 不过和这次在世界杯当队长相比压力明显不同 -BAC009S0904W0363 这位北京姑娘直言在三大赛当队长的感觉很特殊 -BAC009S0904W0364 是心智上的一个考验 -BAC009S0904W0365 刚开始无谓的心理压力很大 -BAC009S0904W0366 
甚至在头一场的比赛还影响到自己的技术发挥 -BAC009S0904W0367 好在队友们相互弥补得非常出色 -BAC009S0904W0368 曾春蕾通过自我调节而让竞技状态渐入佳境 -BAC009S0904W0369 在保障好技术稳定发挥的同时 -BAC009S0904W0370 还能够在情绪上带动队友 -BAC009S0904W0371 谈及当队长的责任感 -BAC009S0904W0372 曾春蕾认为中国女排的困难体现在伤病多 -BAC009S0904W0373 需要不停地告诫自己要淡定下来 -BAC009S0904W0374 毕竟她本人是经历过伦敦奥运会的 -BAC009S0904W0375 当队长的一举一动都会带来情绪上影响 -BAC009S0904W0376 因此一个眼神一个动作 -BAC009S0904W0377 都要给队友们传递乐观和放松的讯号 -BAC009S0904W0378 曾春蕾一记五米线的调整攻打得非常漂亮 -BAC009S0904W0379 评价自己发挥的最好一场其实就是本场比赛 -BAC009S0904W0380 因为和高手过招有种红了眼的感觉 -BAC009S0904W0381 个别球更是像释放怒火一般 -BAC009S0904W0382 桎梏挣脱开了就敢于发挥 -BAC009S0904W0383 由于中国女排的前期准备特别充分 -BAC009S0904W0384 这在曾春蕾看来打俄罗斯很有底 -BAC009S0904W0385 发挥也很从容和淡定 -BAC009S0904W0386 曾春蕾坦言打关键分的状态很忘我 -BAC009S0904W0387 打日本从来都不需要动员 -BAC009S0904W0388 队友彼此之间需要相互鼓励 -BAC009S0904W0389 但更多的是落实在技术环节的细腻方面 -BAC009S0904W0390 因为想要捧起来冠军奖杯的欲望太强烈 -BAC009S0904W0391 直通里约奥运会的目标也近在咫尺 -BAC009S0904W0392 身为大队员就会去提醒大家 -BAC009S0904W0393 将去年输球的原因作为教训反思 -BAC009S0904W0394 对垒日本女排有这样一个小细节 -BAC009S0904W0395 曾春蕾在刘晓彤一传失误后直接说我来 -BAC009S0904W0396 表明队长角色转换完成得还不错 -BAC009S0904W0397 自言就应该去承担更多的任务 -BAC009S0904W0398 曾春蕾保持着清醒的头脑 -BAC009S0904W0399 深知世界杯夺冠是对过去努力的肯定 -BAC009S0904W0400 但更多的是看到了自己的不足 -BAC009S0904W0401 也知道了未来需要努力的方向 -BAC009S0904W0402 视频中国三比一大胜俄罗斯独占女排世界杯榜首 -BAC009S0904W0403 日本二零一五女排世界杯单循环赛战至第十轮 -BAC009S0904W0404 不仅在全球收回十亿美元票房 -BAC009S0904W0405 且获得奥斯卡最佳动画片大奖 -BAC009S0904W0406 皮克斯终于有了拍摄玩具总动员四的计划 -BAC009S0904W0407 这部正在酝酿中的续集敲定了导演 -BAC009S0904W0408 但上台祝酒时都会交足戏 -BAC009S0904W0409 洪永城还主动跟陈展鹏碰杯 -BAC009S0904W0411 她自言最近在拍戏 -BAC009S0904W0412 戏中的角色常常以性感打扮示人 -BAC009S0904W0413 所以自己也很喜欢性感打扮 -BAC009S0904W0414 问及男友郑嘉颖会不会介意这么性感 -BAC009S0904W0416 这个程度是美的 -BAC009S0904W0417 他应该也是喜欢 -BAC009S0904W0418 问及最近是否有跟男友见面 -BAC009S0904W0420 自己也有一段时间没有跟他见面了 -BAC009S0904W0421 两人都是依赖电话沟通 -BAC009S0904W0422 自己也很期待九月中旬和男友见面 -BAC009S0904W0423 并大呼我自己也非常期待他回来 -BAC009S0904W0424 因为很久了很想念他 -BAC009S0904W0425 问及见面后两人怎样庆祝 -BAC009S0904W0426 她表示应该是吃吃饭看电影之类的 -BAC009S0904W0428 是否会请教男友拍戏上的问题 -BAC009S0904W0429 她透露有些不懂的会问男友郑嘉颖 -BAC009S0904W0430 对方给了她很大的帮助 -BAC009S0904W0431 图自网络温州网讯有网友爆料 -BAC009S0904W0432 温州瑞安一驾考考生在科目三考试中突然晕了过去 -BAC009S0904W0433 送到医院时已没有呼吸 -BAC009S0904W0434 现场图温都讯今天下午四时许 -BAC009S0904W0435 看来温州市区电梯也该大整修了 -BAC009S0904W0436 温州一网友造谣苏迪罗登陆期间水库崩塌被拘 -BAC009S0904W0437 澎湃新闻八月一零日从浙江温州平阳警方获悉 -BAC009S0904W0438 因在台风苏迪罗登陆期间在网络散布水库崩塌谣言 -BAC009S0904W0439 温州一路虎店隐瞒新车维修史被判赔三一四万 -BAC009S0904W0440 温州新力虎汽车销售公司展示台 -BAC009S0904W0441 温州一酒店窝头三八元一个三盘消费一七一零元 -BAC009S0904W0442 温州一闲置地块填满垃圾臭味浓烈附近居民不敢开窗 -BAC009S0904W0443 小区外的空置地上填满垃圾近日 -BAC009S0904W0444 与小区只有一河之隔的东边 -BAC009S0904W0445 因惠民路南段从去年开通后 -BAC009S0904W0446 对一块闲置地监管没有跟上 -BAC009S0904W0447 近一年来每天晚上有垃圾倒在这块闲置地上 -BAC009S0904W0448 垃圾刺鼻的臭味害得住户们连窗户都不敢打开 -BAC009S0904W0449 此前温州政协委员连续两届提出要求整治垃圾污染问题 -BAC009S0904W0450 日前本报记者前往实地调查了解 -BAC009S0904W0451 温州三学生为庆生爬上浙江第一高楼玩自拍 -BAC009S0904W0452 再上到楼顶一座高约四零米的铁塔上 -BAC009S0904W0453 并在铁塔上借助自拍杆合影 -BAC009S0904W0454 一则长达一分五七秒的视频在网络上热传 -BAC009S0904W0455 有网友称之为青春任性 -BAC009S0904W0456 温州三家熟食店摊主被捕为求卖相好添加日落黄 -BAC009S0904W0457 本报讯记者范跃红通讯员瓯文为了卖相好 -BAC009S0904W0458 温州两女孩溺水救护车因车多路堵错过救援时间 -BAC009S0904W0459 温州两男子为争女人驾奔驰宝马街头四次对撞 -BAC009S0904W0460 瑞安市商业街和联中路交叉口 -BAC009S0904W0461 一辆宝马和一辆奔驰四次相撞 -BAC009S0904W0462 两车损失高达数十万元 -BAC009S0904W0463 温州企业家卖房建养老院捐给当地却被闲置三年 -BAC009S0904W0464 浙江温州乐清七五岁的企业家虞一杰退休之后 -BAC009S0904W0465 拿出了自己全部的积蓄 -BAC009S0904W0466 还卖了自己在杭州和乐清的房子 -BAC009S0904W0467 但是养老院建成至今已经有三年了 -BAC009S0904W0468 那原因到底在哪里呢 -BAC009S0904W0469 温州体育局官员逼女教练陪酒当地纪委介入调查 -BAC009S0904W0470 以给编制五险一金等为借口 -BAC009S0904W0471 诱逼女教练陪他喝酒吃饭唱歌 -BAC009S0904W0472 并贴出多张聊天记录截图 -BAC009S0904W0473 立即引起众多网友关注 -BAC009S0904W0474 温州六旬老人辗转各地看公厕一二年还债七六万元 -BAC009S0904W0475 
温州网讯我不想死后给后人说闲话 -BAC009S0904W0476 省吃俭用也要把该还的钱尽力还掉 -BAC009S0904W0477 让借给我钱的好心人 -BAC009S0904W0478 这是富林愚老人发自内心的一句话 -BAC009S0904W0479 温州农贸市场现注胶虾业内人称为增加重量 -BAC009S0904W0480 虾里有明显的胶状物质图片来源网友微信日前 -BAC009S0904W0481 回家后发现大虾体内竟然被注射了不明胶状物 -BAC009S0904W0482 瑞安市市场监管局玉海所介入调查 -BAC009S0904W0483 当事水产摊贩已退还郑女士一零零元购虾款 -BAC009S0904W0484 温州化工仓库起火殃及附近河流大量死鱼漂河面 -BAC009S0904W0485 图为几天前村民拍到的河面 -BAC009S0904W0486 温州医生夫妇贩婴被批捕女儿欲捐款替父赎罪 -BAC009S0904W0487 温州医生涉贩卖儿童谎称活婴是死婴骗父母放弃 -BAC009S0904W0488 参考消息网九月二五日报道新报称 -BAC009S0904W0489 继陕西富平妇产科医生张淑侠因贩卖婴儿被判刑之后 -BAC009S0904W0490 中国再现医生涉嫌拐卖婴儿的案例 -BAC009S0904W0491 一对来自浙江温州的医生夫妇涉案被捕 -BAC009S0904W0492 温州瑞安市发生一起违停女司机故意伤害交通协警案件 -BAC009S0904W0493 温州城管协管员掌掴女清洁工被停职 -BAC009S0904W0494 该段视频时长仅有六秒 -BAC009S0904W0495 一名路人疾步上前将男子拦开 -BAC009S0905W0121 公积金贷款额度的提高 -BAC009S0905W0122 确实降低了刚需人群购房成本 -BAC009S0905W0123 对房地产市场的利好影响将是长期的 -BAC009S0905W0124 而这一落地难题也会影响其对刚需市场的支持效力 -BAC009S0905W0125 这一公积金新政实际上仍然存在很多门槛 -BAC009S0905W0126 在住房公积金贷款的申请过程中 -BAC009S0905W0127 有些要与房企具体项目挂钩 -BAC009S0905W0128 在公积金贷款额度上调后一个月内 -BAC009S0905W0129 北京公积金贷款成交量上涨百分之五 -BAC009S0905W0130 金融市场总体平稳鲁指冲高回落 -BAC009S0905W0131 但专家预计短期央行仍可能会积极维稳 -BAC009S0905W0132 汇率较大概率维持双向 -BAC009S0905W0133 相关公司股票走势招商银行 -BAC009S0905W0134 降准降息或再掀收益率的下降潮 -BAC009S0905W0135 双降加上广州公积金贷款新政落地 -BAC009S0905W0136 上周末成为潜在买家们争相咨询看楼的时机 -BAC009S0905W0137 期待岁末能有更多利好出现 -BAC009S0905W0138 第四季度二手住宅成交量将环比增幅在百分之七以内 -BAC009S0905W0139 价格要到明年初才出现上涨 -BAC009S0905W0140 广州日报讯记者林琳上周五 -BAC009S0905W0141 再加上广州公积金贷款新政终于落地 -BAC009S0905W0142 一系列利好消息影响下的首个周末 -BAC009S0905W0143 买家积极咨询看楼 -BAC009S0905W0144 降息消息传出后首日 -BAC009S0905W0145 地铺门店咨询量与七月同期相比约有百分之七左右的增幅 -BAC009S0905W0146 满堂红链家市场研究部高级经理周峰透露 -BAC009S0905W0147 店均电话咨询量比上一个周末增加十一百分之左右 -BAC009S0905W0148 看楼量对比上一周末大概增加百分之七左右 -BAC009S0905W0149 不过他认为这种增幅并不算太明显 -BAC009S0905W0150 搜房网广州二手房电商集团市场部总监罗来平发现 -BAC009S0905W0151 市场上约有两成业主反价 -BAC009S0905W0152 一个天河区的中介人士告诉记者 -BAC009S0905W0153 市民对连续多次降息已经麻木了 -BAC009S0905W0154 公积金贷款新政出台 -BAC009S0905W0155 市场不可能那么快有反应 -BAC009S0905W0156 七月广州二手楼市交投升温的态势已相当明确 -BAC009S0905W0157 按照这一趋势发展下去 -BAC009S0905W0158 再加上央行降息以及公积金新政等利好的叠加效应 -BAC009S0905W0159 有望进一步激活买家在接近年底这段时间的入市积极性 -BAC009S0905W0160 据阳光家缘网站公布数据统计 -BAC009S0905W0161 广州二手住宅市场七月的网签量已达一千套 -BAC009S0905W0162 广州二手住宅市场网签量达一千套 -BAC009S0905W0163 超过五月七千套的水平 -BAC009S0905W0164 目前市场上的低价房源已基本消耗完毕 -BAC009S0905W0165 广州二手房迎来新一轮涨价潮 -BAC009S0905W0166 搜房网广州二手房统计中心数据显示 -BAC009S0905W0167 广州五月二手房均价为一千元每平方米 -BAC009S0905W0168 比月初增长了一百元每平方米 -BAC009S0905W0169 因此判断随着利好政策的实施和成交量的增加 -BAC009S0905W0170 今年的房价还会有上升空间 -BAC009S0905W0171 广州还是在执行严厉的限购政策 -BAC009S0905W0172 我预计市场成交量会有所增加 -BAC009S0905W0173 但增加的幅度不会太大 -BAC009S0905W0174 他预测今年剩馀的两个月中 -BAC009S0905W0175 昨日人民币汇率小幅走弱 -BAC009S0905W0176 人民币中间价 -BAC009S0905W0177 美丽北京大型绿色公益品牌项目 -BAC009S0905W0178 在岸人民币兑美元收盘下跌百分之一 -BAC009S0905W0179 双降后首日在岸人民币由弱转强 -BAC009S0905W0180 人民币成交额减少百分之一 -BAC009S0905W0181 报七千亿美元 -BAC009S0905W0182 上周五的双降政策让市场担忧 -BAC009S0905W0183 投金或在经济增长速度放缓形势下加速外流 -BAC009S0905W0184 投资者担心这将加重人民币所面临的压力 -BAC009S0905W0185 就在上周五双降公布之后 -BAC009S0905W0186 招商银行同业金融部高级分析师刘东亮指出 -BAC009S0905W0187 加快老旧渔船更新改造 -BAC009S0905W0188 不断增强农业可持续发展能力 -BAC009S0905W0189 创建国家现代农业示范区 -BAC009S0905W0190 加大示范区建设力度 -BAC009S0905W0191 加大示范目建设投入力度 -BAC009S0905W0192 努力打造现代农业发展的典型和样板 -BAC009S0905W0193 发挥示范区引领作用 -BAC009S0905W0194 通过产业拉动技术辐射和人员培训等 -BAC009S0905W0195 带动周边地区现代农业加快发展 -BAC009S0905W0196 引导各地鉴借示范区发展现代农业的好做法和好经验 -BAC009S0905W0197 推动创建不同层次特色鲜明的现代农业示范区 -BAC009S0905W0198 按照分类指导突出重点梯次推进的思路 -BAC009S0905W0199 以七区二十三带农业战略格局为核心 -BAC009S0905W0200 着力建设重点推进率先实现和稳步发展三类区域 -BAC009S0905W0201 引领全国现代农业加快发展 -BAC009S0905W0202 重点推进区域 -BAC009S0905W0203 农业生产技术较为成熟 -BAC009S0905W0204 农业生产条件具有良好基础 
-BAC009S0905W0205 承担着主要农产品供给保证的主体功能 -BAC009S0905W0206 加快推进该区域现代农业建设 -BAC009S0905W0207 事关全国农业现代化进程和国家粮食安全大局 -BAC009S0905W0208 继续发挥该区域粮食安全基础保障作用 -BAC009S0905W0209 调动各方发展粮食生产积极性 -BAC009S0905W0210 以建设小麦玉米水稻大豆优势产业带为重点 -BAC009S0905W0211 深入开展粮食稳定增产行动 -BAC009S0905W0212 加强农田水利和高标准农田建设 -BAC009S0905W0213 提高农机装备和作业水平 -BAC009S0905W0214 大力开展高产创建和科技指导服务 -BAC009S0905W0215 推广防灾减灾增产关键技术 -BAC009S0905W0216 加快选育应用优良品种 -BAC009S0905W0217 大幅度提升粮食综合生产能力和现代化生产水平 -BAC009S0905W0218 大力发展粮食精深加工及仓储物流业 -BAC009S0905W0219 完善粮食仓储运输设备 -BAC009S0905W0220 引导龙头企业向优势产区集聚 -BAC009S0905W0221 提高粮食生产综合效益 -BAC009S0905W0222 其他主要农产品优势区 -BAC009S0905W0223 以及蔬菜蚕卓等农产品生产的主体区域 -BAC009S0905W0224 以建设区域内各类农产品优势产业带为重点 -BAC009S0905W0225 提高资源利用率和加工转化率 -BAC009S0905W0226 继续巩固棉油糖水果和蔬菜等产品供给保证地位 -BAC009S0905W0227 着力强化技术装备支撑 -BAC009S0905W0228 提高现代化生产水平 -BAC009S0905W0229 强化出口水产品生产基地功能 -BAC009S0905W0230 加快现代养殖业发展 -BAC009S0905W0231 率先实现区域 -BAC009S0905W0232 该区域交通区位市场和人力资源优势明显 -BAC009S0905W0233 资本技术等现代化生产要素集约化程度高 -BAC009S0905W0234 加快该区域现代农业建设 -BAC009S0905W0235 对于引领全国现代农业加快发展具有重要意义 -BAC009S0905W0236 东部沿海先导农业区 -BAC009S0905W0237 大力发展资本技术密集型农业 -BAC009S0905W0238 保持耕地面积不减少 -BAC009S0905W0239 探索企业化集团化发展模式 -BAC009S0905W0240 大力推进标准化生产和集约化经营 -BAC009S0905W0241 提高信息化优质化和品牌化水平 -BAC009S0905W0242 提升产品的科技含量和附加值 -BAC009S0905W0243 大城市郊区多功能农业区 -BAC009S0905W0244 主要指沿海地区以外的直辖市省会城市等大城市郊区 -BAC009S0905W0245 统筹推进新一轮菜篮子工程建设 -BAC009S0905W0246 合理确定大城市郊区菜篮子产品生产用地保有数量 -BAC009S0905W0247 提高大城市菜篮子产品的自给率 -BAC009S0905W0248 在稳定城市副食品供应保证能力的基础上 -BAC009S0905W0249 全面推进机械化标准化品牌化产业化发展 -BAC009S0905W0250 加快农田基础设备和现代农业装备建设 -BAC009S0905W0251 着力建设国家商品粮供给重点保证区 -BAC009S0905W0252 提升垦区现代农业发展水平 -BAC009S0905W0253 业界首次开始认真讨论苹果市值晋升万亿大关的潜力 -BAC009S0905W0254 苹果股票价格创下历史新高 -BAC009S0905W0255 苹果市值超过七千亿美元 -BAC009S0905W0256 如果按照每股一二七美元的股价来算 -BAC009S0905W0257 那么苹果市价约为七四四十亿美元 -BAC009S0905W0258 这一价格也是目前华尔街给出的最高估值 -BAC009S0905W0261 随着四克网络的在中国的展开 -BAC009S0905W0262 苹果对电动汽车表现出的浓厚兴趣 -BAC009S0905W0263 也能够给股票市场来带更多兴奋 -BAC009S0905W0264 苹果将继续向股东返还现金 -BAC009S0905W0265 四月份或将采取更多的举动 -BAC009S0905W0266 这些力量的结合将会推动苹果的市盈率大幅上正 -BAC009S0905W0267 苹果公司的市价将突破一万亿美金大关 -BAC009S0905W0268 这只是最乐观的估计 -BAC009S0905W0269 苹果在成长为万亿美元市场的巨无霸之前 -BAC009S0905W0270 还有很多阻碍要解决 -BAC009S0905W0271 先是价格昂贵功能鸡肋的特点遭到一众业内人士吐槽 -BAC009S0905W0273 屏幕良品率仅在百分之三十至百分之四十之间 -BAC009S0905W0274 苹果公司现在已将约三百万的原始订单削减了一半 -BAC009S0905W0275 准备和特斯拉一较高下 -BAC009S0905W0276 但相对于传统的汽车制造工业 -BAC009S0905W0277 苹果作为消费数码产品的公司是否具备造车能力 -BAC009S0905W0279 目前大部分华尔街分析师们都对苹果的未来保持乐观 -BAC009S0905W0280 仅有三点百分之四的分析师建议卖出 -BAC009S0905W0281 中国经营网注有国外媒体报道称 -BAC009S0905W0282 苹果市场价值达到七千亿美元刚刚过去几个月 -BAC009S0905W0283 已经有股票经纪公司预测 -BAC009S0905W0284 苹果能否摆脱王者魔咒 -BAC009S0905W0285 苹果晋身道指固属众望所归 -BAC009S0905W0287 而苹果得以顺利跻身道指 -BAC009S0905W0288 亦拜股份去年六月一拆七所赐 -BAC009S0905W0289 却完全不足以彰显编制机构与时并进 -BAC009S0905W0290 苹果固然不会因此而升格 -BAC009S0905W0292 毕竟还有许多人的心愿 -BAC009S0905W0293 老毕于跟苹果押注太阳能一文问过大家 -BAC009S0905W0294 苹果股价在说不准的时间内有望上升三成 -BAC009S0905W0295 是否能令捧场客心满意足 -BAC009S0905W0296 问题焦点若是太阳能 -BAC009S0905W0297 诸位自然不会满足于前面提及的潜在回报 -BAC009S0905W0298 这家市值离万亿美元不远的股王 -BAC009S0905W0299 难不成真能第三期发育 -BAC009S0905W0300 读者若信经济学人 -BAC009S0905W0302 若定苹果第三期发育的立场已呼之欲出 -BAC009S0905W0303 手机辐射的比吸收率最高限值为二瓦特每千克 -BAC009S0905W0304 我国的标准和国际差不多 -BAC009S0905W0306 对生活中的电磁辐射进行了全面健康风险评估 -BAC009S0905W0307 不存在实际健康问题 -BAC009S0905W0308 辐射吸收率在国家的安全标准范围之内 -BAC009S0905W0309 电话手表的辐射主要来自天线 -BAC009S0905W0310 包括外置天线和内置天线 -BAC009S0905W0311 正规厂家生产的电话手表辐射一般符合国家标准 -BAC009S0905W0312 以小天才电话手表为例 -BAC009S0905W0313 根据权威机构检测报告显示 -BAC009S0905W0314 小天才电话手表辐射远小于国家标准二瓦特每千克 -BAC009S0905W0315 只要辐射值小于或等于国家标准值 -BAC009S0905W0316 就是符合国家标准的 -BAC009S0905W0317 小天才负责人介绍说 -BAC009S0905W0318 手机是直接贴着耳朵使用 
-BAC009S0905W0319 而电话手表通话时离头部还有一百零一百一十五厘米的距离 -BAC009S0905W0320 可见电话手表的辐射比手机还小 -BAC009S0905W0321 不排除有一些杂牌的电话手表辐射会超标 -BAC009S0905W0322 建议家长通过正规渠道购买正规厂家生产的产品 -BAC009S0905W0323 电话手表应如何选购 -BAC009S0905W0324 关于儿童电话手表应该如何选购 -BAC009S0905W0325 也是众多家长特别想了解的 -BAC009S0905W0326 除了之前提到的关于辐射的测试报告外 -BAC009S0905W0327 专家提醒相关的产品认证也是消费者必须要关注的 -BAC009S0905W0328 所有在中国境内销售及使用的无线电组件产品 -BAC009S0905W0329 必须取得无线电型号的核准认证 -BAC009S0905W0330 没有该认证的产品属于违法产品 -BAC009S0905W0331 未获得进网许可证的 -BAC009S0905W0332 不得接入公用电信网使用和在国内销售 -BAC009S0905W0333 小天才电话手表等国内几个大品牌都有 -BAC009S0905W0334 这也是选购电话手表要注意关注的 -BAC009S0905W0335 很多家长都在给孩子购置各种学习用 -BAC009S0905W0336 网络安全漏洞挡道车联网阴霾笼罩搜狐科技 -BAC009S0905W0337 对频频的骚扰电话显得无可奈何 -BAC009S0905W0338 由郎平挂帅的中国女排在名古屋赛区 -BAC009S0905W0339 提升战绩为九胜一负反超至榜首位置 -BAC009S0905W0340 只要在明天的最后一战中赢下东道主日本 -BAC009S0905W0341 高清女排力擒俄罗斯夺冠占主动众将喜极而泣 -BAC009S0905W0342 能够赢得比赛真的很开心 -BAC009S0905W0343 对手给我们制造了非常多的困难 -BAC009S0905W0344 我和队友们一起团结努力克服了这些困难 -BAC009S0905W0345 在今天的比赛中曾春蕾首发出场 -BAC009S0905W0346 凭借十三分位列本队和扣球榜第二位 -BAC009S0905W0347 而主教练郎平则在全面性方面对大家做了更多要求 -BAC009S0905W0348 说到今天获胜的原因 -BAC009S0905W0349 作为队长出席新闻发布会的曾春蕾提到了凝聚力三个字 -BAC009S0905W0350 凝聚力一直都是中国女排的传统 -BAC009S0905W0351 它都是女排精神的一部分 -BAC009S0905W0352 当队伍遇到一些困难的时候 -BAC009S0905W0353 我们不需要教练要求就会团结在一起 -BAC009S0905W0354 像这种无形的向心力是在队伍中一直存在的 -BAC009S0905W0355 在今天的比赛中中国女排始终相互鼓励相互扶持 -BAC009S0905W0356 在几度遇险的情况下顽强咬住 -BAC009S0905W0357 无论年轻队员还是老队员都可能在比赛中出现起伏 -BAC009S0905W0358 我们要做的就是相互弥补 -BAC009S0905W0359 今天作为队长我更多是在精神层面上提醒大家 -BAC009S0905W0360 而在技术上年轻队员也弥补了我的不足 -BAC009S0905W0361 这是我们每个人都应该做的 -BAC009S0905W0362 如果能够战而胜之的话 -BAC009S0905W0363 明天还剩最后一场比赛 -BAC009S0905W0364 对我们来讲最重要的就是兢兢业业 -BAC009S0905W0365 大家回去之后将马上投入到对日本的准备中 -BAC009S0905W0366 明天比赛里我们会冷静下来落实到细节 -BAC009S0905W0367 一分分和对手拼到最后 -BAC009S0905W0368 北京时间明天晚间十八点 -BAC009S0905W0369 中国女排将应战日本队 -BAC009S0905W0370 搜狐体育郭健文 -BAC009S0905W0371 女排三零阿根廷朱婷复出扣杀状态神勇 -BAC009S0905W0372 搜狐体育郭健九月一日发自日本冈山今天下午 -BAC009S0905W0373 二零一五年第十二届女排世界杯单循环赛战至第八轮 -BAC009S0905W0374 从而将战绩提升为七胜一负积二十一分 -BAC009S0905W0375 本场比赛朱婷复出担任首发主攻并当选为当场最佳 -BAC009S0905W0376 虽然在比赛中没有得到出场机会 -BAC009S0905W0377 但曾春蕾赛后还是以队长身份出席了新闻发布会 -BAC009S0905W0378 很开心赢得今天的比赛 -BAC009S0905W0379 队伍凭借稳定的整体发挥获得了三零的胜利 -BAC009S0905W0380 曾春蕾表示阿根廷是一支拥有良好防守能力的球队 -BAC009S0905W0381 这一点也值得中国女排学习 -BAC009S0905W0382 中国女排队长坦言不仅是后面的几场比赛 -BAC009S0905W0383 每场较量对球队都很关键 -BAC009S0905W0384 我们球员要做的就是立足于自己 -BAC009S0905W0385 争取把自身水平发挥出来 -BAC009S0905W0386 至于其他球队的比赛结果 -BAC009S0905W0387 阿根廷队队长索萨认为 -BAC009S0905W0388 中国队的快速打法给自己的球队制造了很大的麻烦 -BAC009S0905W0389 像她们这样的亚洲对手速度很快 -BAC009S0905W0390 对我们来说比赛很困难 -BAC009S0905W0391 还有三场非常重要的比赛 -BAC009S0905W0392 希望得到想要的结果 -BAC009S0905W0393 对阵中国这样的球队是非常困难的 -BAC009S0905W0394 令我满意的是球队能够以一个积极的态度进行比赛 -BAC009S0905W0395 以前接触比较多的巴西队速度也很快 -BAC009S0905W0396 我们应该多和亚洲球队比赛来适应这样的打法 -BAC009S0905W0397 接下来中国女排将转战名古屋 -BAC009S0905W0398 从九月四日起迎接多米尼加俄罗斯和日本的挑战 -BAC009S0905W0399 搜狐体育郭健文 -BAC009S0905W0400 广州日报社记者许胚日前 -BAC009S0905W0401 英国人保拉拉德克利夫公开了自己的血液检测结果 -BAC009S0905W0402 以此证明自己并没有使用过违禁药物 -BAC009S0905W0403 在英国议会关于血液兴奋剂的听证会中 -BAC009S0905W0404 将出任玩具总动员四的导演 -BAC009S0905W0405 影片将在二零一七年登陆全国 -BAC009S0905W0406 来源时光网昨日 -BAC009S0905W0407 在英格兰多塞特群的波维顿坦克博物馆 -BAC009S0905W0408 至于有传拍台庆剧很容易获奖 -BAC009S0905W0410 她笑称我不想说我没有信心 -BAC009S0905W0411 很多演员都非常棒 -BAC009S0905W0412 搜狐娱乐讯北京时间七月二十日消息 -BAC009S0905W0413 据香港媒体报导 -BAC009S0905W0417 不到几个月的时间已爱得如此火热了 -BAC009S0905W0418 两人不想恋情变得高调 -BAC009S0905W0419 却多次被身边的人将他们的行踪暴露出来 -BAC009S0905W0420 两人被传媒追问恋情时都要求给予空间 -BAC009S0905W0421 看来他们需要身边的朋友保密他们的行踪 -BAC009S0905W0422 这样做反而更实际 -BAC009S0905W0423 搜狐娱乐讯北京时间六月三十日消息 -BAC009S0905W0424 据香港媒体报道 -BAC009S0905W0425 陈凯琳的心被郑嘉颖成功俘虏 -BAC009S0905W0426 更是郑嘉颖愿意公开承认的女友 
-BAC009S0905W0427 不过二人因给陈嘉宝把生日合照在网上公开才泄露恋情 -BAC009S0905W0428 对此陈凯琳没有怪责陈嘉宝 -BAC009S0905W0429 觉得对方只是分享生日上的喜悦 -BAC009S0905W0430 陈凯琳之前说没交过男友 -BAC009S0905W0431 温州鹿城区宣传部官微做出回应 -BAC009S0905W0432 称涉事男子为某街道协管员 -BAC009S0905W0433 其发现清洁工保洁不到位 -BAC009S0905W0434 因此与清洁工引发争执 -BAC009S0905W0435 进一步导致肢体冲突 -BAC009S0905W0436 目前该协管已经停职 -BAC009S0905W0437 温州多地商户拉横幅求降租导购不少店亏本经营 -BAC009S0905W0438 东越花苑不少商铺都关门转租记者谢国林摄 -BAC009S0905W0439 温州大妈年逾半百冒充女儿成功骗婚多名小鲜肉 -BAC009S0905W0440 该女子已经行骗多地 -BAC009S0905W0441 她一直假冒的林某竟是她的女儿 -BAC009S0905W0442 而且她还是已婚身份 -BAC009S0905W0443 凭着远比真实年龄看起来要年经许多的容貌 -BAC009S0905W0444 雷某一直在河北邢台衡水等地干着游走骗婚的勾当 -BAC009S0905W0445 温州天价窝头事件背后顾客要持持赔三条中华 -BAC009S0905W0446 网络上一张永嘉桥头国际饭店的结帐单十分引人注目 -BAC009S0905W0447 菜单显示该饭店的荞麦窝窝头卖三八元一个 -BAC009S0905W0448 三零馀位食客吃了四五个窝窝头 -BAC009S0905W0449 发现事情并没有这么简单 -BAC009S0905W0450 温州女协管员侮辱环卫工行尸走肉已辞职 -BAC009S0905W0451 温州女协管员发伪辱性文字环卫节一群行尸走肉 -BAC009S0905W0452 温州女婴打疫苗后口吐白沫抽搐昏迷 -BAC009S0905W0453 温州网讯在温医大附属育英儿童医院的重监护室里 -BAC009S0905W0454 才七个月大的女童腾腾化名已昏迷了两天时间 -BAC009S0905W0455 随即被送到儿童医院进行抢救 -BAC009S0905W0456 区市省三级疾控部门专家已介入调查 -BAC009S0905W0457 温州家庭误食毒蘑菇后续小女儿已确诊脑死亡 -BAC009S0905W0458 温州少年峡谷失踪续二零万馀元赔偿款执行到位 -BAC009S0905W0459 金报讯记者蓝莹还记得小温吗 -BAC009S0905W0460 二零一三六二三 -BAC009S0905W0461 温州一四岁少年小温迷失莒溪大峡谷 -BAC009S0905W0462 浙江省史上规模最大的户外救援行动开始了 -BAC009S0905W0463 经过长达四个月的搜救 -BAC009S0905W0464 最终在峡谷上游的石头夹缝下 -BAC009S0905W0465 发现小温残缺的遗骸 -BAC009S0905W0466 温州市场现胶注虾业内不仅增重卖相更好 -BAC009S0905W0467 温州市民郑女士在农贸市场购买了三只大虾 -BAC009S0905W0468 回家后发现大虾体内居然被注射了不明胶状物 -BAC009S0905W0469 生活经验让郑女士起了疑心 -BAC009S0905W0470 她将几只虾的图片通过微博发布 -BAC009S0905W0471 迅速引起了网友以及当地监管部门的关注 -BAC009S0905W0472 温州市域铁路将成为全国第一条城市交通铁路 -BAC009S0905W0475 温州市治堵办的负责人表示 -BAC009S0905W0477 温州开水浇头服务员被批捕涉嫌故意伤害罪 -BAC009S0905W0478 京华时报讯昨天下午 -BAC009S0905W0479 浙江温州鹿城区检察院通报九月六日 -BAC009S0905W0480 开水淋顾客的火锅店服务员朱某被依法批准逮捕 -BAC009S0905W0481 温州惊现注胶虾续苍南再查六公斤注胶大虾 -BAC009S0905W0482 温州一菜场惊现注胶虾追踪 -BAC009S0905W0483 温州昆明出现注胶虾产地均指向广东湛江 -BAC009S0905W0484 浙江温州市一位市民一零零元买回三只斑节虾 -BAC009S0905W0485 在虾体内发现疑似胶状物质七月二十一日 -BAC009S0905W0486 云南昆明市同样发现类似注胶虾 -BAC009S0905W0487 国内两地出现注胶虾踪迹 -BAC009S0905W0488 且产地均指向广东省湛江市 -BAC009S0905W0489 温州景山花木市场发生大火火势已得到基本控制 -BAC009S0905W0490 温州服务员向顾客头上泼开水继而已被批捕 -BAC009S0905W0491 今天九月八日下午 -BAC009S0905W0492 因火锅加水问题与顾客发生争执 -BAC009S0905W0493 为泄愤将开水淋到顾客头上 -BAC009S0905W0494 并将其摁倒在地殴打 -BAC009S0905W0495 火锅店服务员朱某被温州市鹿城区检察院依法批准逮捕 -BAC009S0906W0121 双降会令市场看贬人民币的情绪持续 -BAC009S0906W0122 人民币未来贬值压力依然较大 -BAC009S0906W0123 预计短期央行仍可能会积极维稳 -BAC009S0906W0124 汇率较大概率维持双向波动 -BAC009S0906W0125 公积金松绑接棒释压房价下跌动力趋缓至搜狐财经 -BAC009S0906W0126 上海南昌等城市近期继续松绑了公积金贷款政策 -BAC009S0906W0127 而南昌除了放松首套房界定标准 -BAC009S0906W0128 还降低了首套房公积金首付比例 -BAC009S0906W0129 公积金贷款首付款比例不低于百分之七 -BAC009S0906W0130 上海易居研究院研究院严跃进认为 -BAC009S0906W0131 存销比已经见顶回落 -BAC009S0906W0132 房价下跌压力将趋于缓解 -BAC009S0906W0133 公积金大力度松绑相关商业银行信贷政策 -BAC009S0906W0134 各地对公积金贷款的松绑力度更大 -BAC009S0906W0135 江苏省对省级机关住房公积金政策做出了调整 -BAC009S0906W0136 昆明市住房公积金管理中心出台三项公积金新政 -BAC009S0906W0137 上海市公积金管理中心公布公积金新政 -BAC009S0906W0138 有一套住房并已结清公积金贷款 -BAC009S0906W0139 再次申请公积金贷款购房的 -BAC009S0906W0140 参照首套房贷款政策 -BAC009S0906W0141 中原地产市场研究部统计数据显示截至目前 -BAC009S0906W0142 二套执行认贷不认房 -BAC009S0906W0143 二套首付降比百分之七 -BAC009S0906W0144 南京武汉市放宽第二套房公积金贷款门槛 -BAC009S0906W0145 扬州杭州成都无锡等地 -BAC009S0906W0146 已有一套住房并结清贷款馀额的家庭 -BAC009S0906W0147 再购房执行首套房贷款政策 -BAC009S0906W0148 中原地产分析师张大伟认为 -BAC009S0906W0149 公积金是地方政府可以直接通过政策调整动用的资金 -BAC009S0906W0150 用公积金政策刺激市场是地方政府最习惯的举措 -BAC009S0906W0151 对购房者心理影响也非常大 -BAC009S0906W0152 由于公积金贷款利率相当于市场贷款利率的七折 -BAC009S0906W0153 对需求拉动作用比较大 -BAC009S0906W0154 上海作为一线城市代表 -BAC009S0906W0155 对房地产市场的心理影响比较大 -BAC009S0906W0156 预计还有其他城市将发布同类型松绑政策 -BAC009S0906W0157 房价下跌压力缓解各地救市政策不断 
-BAC009S0906W0158 房企促销力度也在加大 -BAC009S0906W0159 各城市库存压力正在减小 -BAC009S0906W0160 房价下跌压力趋于缓解 -BAC009S0906W0161 上海易居房地产研究院数据显示 -BAC009S0906W0162 同比增长百分之七 -BAC009S0906W0163 这是今年五月份以来库存环比增幅最小的一次 -BAC009S0906W0164 环比增长百分之七 -BAC009S0906W0165 同比减小百分之七 -BAC009S0906W0166 五月份的供求关系是今年前五个月最均衡的一次 -BAC009S0906W0167 存销比见顶的态势基本确立 -BAC009S0906W0168 五个城市新建商品住宅存销比为七个月 -BAC009S0906W0169 该存销比数值为七个月 -BAC009S0906W0170 这直接利好去库存目标的实现 -BAC009S0906W0171 二到五个城市的总体水平看 -BAC009S0906W0172 库存去化周期依然偏大 -BAC009S0906W0173 说明各城市涨价的时机还不成熟 -BAC009S0906W0174 一至七月份大多数城市还是会采取积极降价的策略 -BAC009S0906W0175 房价未来可能会略微有下跌 -BAC009S0906W0176 一线城市由于需求面大 -BAC009S0906W0177 未来住宅价格会企稳回升 -BAC009S0906W0178 一些库存量较大的三四线城市 -BAC009S0906W0179 房价继续下行的可能性仍然比较大 -BAC009S0906W0180 同策咨询研究部总监张宏伟认为 -BAC009S0906W0181 月度市场成交量开始出现环比回升 -BAC009S0906W0182 市场去库存的速度在适度提高 -BAC009S0906W0183 从一线城市及存销比在七个月以下的城市来看 -BAC009S0906W0184 市场基本面有可能会率先好转 -BAC009S0906W0185 年底将出现翘尾行情 -BAC009S0906W0186 但年底出现翘尾行情并不代表楼市已经回暖 -BAC009S0906W0187 示范带动周边地区发展 -BAC009S0906W0188 并在农业走出去方面发挥重要作用 -BAC009S0906W0189 稳步发展区域 -BAC009S0906W0190 主要指草原生态经济区 -BAC009S0906W0191 包括北方干旱半干旱草原地区和青藏高原草原地区 -BAC009S0906W0192 加快该地区域现代农业建设 -BAC009S0906W0193 对于保障全国生态安全具有不可代替的战略作用 -BAC009S0906W0194 牢固树立生产生态有机结合生态优先的基本方针 -BAC009S0906W0195 加强草原生态环境保护和建设 -BAC009S0906W0196 稳步推进退牧还草和游牧民定居工程 -BAC009S0906W0197 加强以节水灌溉饲草地为重点的牧区水利建设 -BAC009S0906W0198 建立草原增加碳汇和生态补偿机制 -BAC009S0906W0199 转变畜牧业发展方式 -BAC009S0906W0200 优化生产布局和畜群结构 -BAC009S0906W0201 提高科学饲养和经营水平 -BAC009S0906W0202 加强农牧互补牧养结合 -BAC009S0906W0203 以最急需最关键最薄弱的环节和领域为重点 -BAC009S0906W0204 组织实施一批重大工程 -BAC009S0906W0205 全面分实现代农业发展的物质基础 -BAC009S0906W0206 一旱涝保收高标准农田建设工程 -BAC009S0906W0207 落实土壤改良地力培肥等措施 -BAC009S0906W0208 加快先进适用耕作技术推广应用 -BAC009S0906W0209 新建旱涝保收高标准农田四亿亩 -BAC009S0906W0210 新增千亿斤粮食生产能力建设工程 -BAC009S0906W0211 棉油糖生产基地建设工程 -BAC009S0906W0212 加强新疆黄淮海地区长江流域棉花生产基地建设 -BAC009S0906W0213 支持南方甘蔗和北方甜菜生产基地建设 -BAC009S0906W0214 着力改善田间基础设施良种科研繁育设施等生产条件 -BAC009S0906W0215 新一轮菜篮子建设工程 -BAC009S0906W0216 加强园艺作物标准园建设 -BAC009S0906W0217 引导建设优质农产品物流配送中心 -BAC009S0906W0218 发展农产品电子商务 -BAC009S0906W0219 健全农作物种质资源和畜禽遗传资源保存体系 -BAC009S0906W0220 建设动植物基因信息库 -BAC009S0906W0221 建立转基因生物安全保障体系 -BAC009S0906W0222 建设国家级农作物育制种基地 -BAC009S0906W0223 完善农作物品种试验和种子检测设施条件 -BAC009S0906W0224 建设水产遗传育种中心和原良种场 -BAC009S0906W0225 渔政渔港建设工程 -BAC009S0906W0226 建设一批大型渔政船 -BAC009S0906W0227 加强渔政基地和管理信息系统建设 -BAC009S0906W0228 动植物保护工程 -BAC009S0906W0229 健全六级动物疫病防控体系 -BAC009S0906W0230 健全兽药质量安全监管和动物防疫技术支撑体系 -BAC009S0906W0231 建设四级农作物病虫疫情监测防控体系 -BAC009S0906W0232 完善监测防控监管等设施设备 -BAC009S0906W0233 农产品质量安全检验检测能力建设工程 -BAC009S0906W0234 改扩建检验检测实验室 -BAC009S0906W0235 建设部级水产品质量安全研究中心 -BAC009S0906W0236 补充建设一批部级专业质检中心 -BAC009S0906W0237 构建全国农产品质量安全监测信息预警平台 -BAC009S0906W0238 乡镇农业公共服务能力建设工程 -BAC009S0906W0239 农业机械化推进工程 -BAC009S0906W0240 加大对秸秆机械化还田和收集打捆机具配套的支持力度 -BAC009S0906W0241 完善农业气象等方面的航空站和作业起降点基础设施 -BAC009S0906W0242 扶持农机服务组织发展 -BAC009S0906W0243 农业信息化建设工程 -BAC009S0906W0244 开展农业物物联网应用示范 -BAC009S0906W0245 加大天然草原退牧还草工程实施力度 -BAC009S0906W0246 加强京津风沙源区草地治理 -BAC009S0906W0247 继续加强三江源等地区草原生态建设 -BAC009S0906W0248 开展草原自然保护区建设和南方草地综合治理 -BAC009S0906W0249 加快实施游牧民定居工程 -BAC009S0906W0250 人工种草五亿亩 -BAC009S0906W0251 新型农村人才培养工程 -BAC009S0906W0252 必须从我国国情和农业发展实际出发 -BAC009S0906W0253 亦不可能跟自然规律抗衡 -BAC009S0906W0254 无止境地重复过去十年的惊人增长 -BAC009S0906W0256 企业于某个领域称王称霸的一刻 -BAC009S0906W0257 往往就是公司陷入灾难的开始 -BAC009S0906W0258 市场给予它的估值却异常克制 -BAC009S0906W0259 以二零一五年度每股八点五美元的盈利预测为准 -BAC009S0906W0260 苹果市盈率仅一五倍 -BAC009S0906W0261 莫说跟其他创意十足的科技股相提并论 -BAC009S0906W0262 比之大市亦有所不如 -BAC009S0906W0263 苹果早晚将步之前过气股王的后尘 -BAC009S0906W0265 不同意的地方多于同意 -BAC009S0906W0266 从随身听到智能电话 -BAC009S0906W0267 苹果的拿手好戏是把市场上原霸主拉下马 
-BAC009S0906W0268 确认消费者喜新厌旧后 -BAC009S0906W0269 快速建立以苹果产品服务为核心的生态系统 -BAC009S0906W0270 透过不断的更新换代 -BAC009S0906W0271 索尼黑莓以至诺基亚 -BAC009S0906W0272 在最风光的时候看不见来自颠复者的威胁 -BAC009S0906W0273 从不可一世到遭对手边缘化 -BAC009S0906W0274 消费者贪新忘旧虽亦可能适用于苹果 -BAC009S0906W0277 对投资者大有参考价值 -BAC009S0906W0278 一九八三至二零零五年 -BAC009S0906W0279 标普五百指数市值冠军宝座 -BAC009S0906W0281 四大天王平均累计回报高达一千二百分之八十二 -BAC009S0906W0282 四倍于标指同期的三十百分之二 -BAC009S0906W0283 四大天王平均回报仅一百分之二十五 -BAC009S0906W0284 明显跑输标普五百指数的一百分之九十九 -BAC009S0906W0286 销售也总有饱和的一天 -BAC009S0906W0287 苹果能否第三期发育 -BAC009S0906W0290 从市场始终不愿给予苹果较高估值可见 -BAC009S0906W0291 管理层眼光得再高一点 -BAC009S0906W0292 苹果有意进军汽车产业 -BAC009S0906W0294 老毕对此说甚有保留 -BAC009S0906W0295 而库克若真有此意 -BAC009S0906W0297 汽车是苹果下一个颠复目标 -BAC009S0906W0298 马斯克乃商界新一代万人迷 -BAC009S0906W0299 人气不逊乔布斯在世之时 -BAC009S0906W0300 三藩市纪事报指此君曾与库克碰头 -BAC009S0906W0302 越多人讲往往越难成事 -BAC009S0906W0303 有黑客在网络上兜售车主信息 -BAC009S0906W0304 雪铁龙车主信息泄露规模或超十万条 -BAC009S0906W0305 该平台上显示的漏洞状态是 -BAC009S0906W0306 漏洞已通知厂商但厂商忽略该漏洞 -BAC009S0906W0307 该公司内部相关人士回应称 -BAC009S0906W0308 东风雪铁龙的客户数据存放在专业数据库中 -BAC009S0906W0309 对数据库设有监控及记录 -BAC009S0906W0310 对用户信息做足了保密工作 -BAC009S0906W0311 有业内人士分析指出 -BAC009S0906W0312 车企在信息安全方面的投入不足已经越来越成为其软肋 -BAC009S0906W0313 其中近一半的漏洞都可能造成网站用户的信息泄露 -BAC009S0906W0314 背后涉及到百万车主的信息安全 -BAC009S0906W0315 而绝大多数漏洞状态都是未联系到厂商或厂商忽略 -BAC009S0906W0316 汽车这个行业缺乏成熟的网络安全管理体系 -BAC009S0906W0317 网络运营人员的安全素质有待提高 -BAC009S0906W0318 很多车企网站是外包给第三方公司开发的 -BAC009S0906W0319 没有交付信息安全公司进行评估 -BAC009S0906W0320 因此更有可能留下信息安全风险 -BAC009S0906W0321 用户隐私遭泄露的问题日益突出 -BAC009S0906W0322 如果许多传统制造行业中的企业一样 -BAC009S0906W0323 车企诚待转化互联网思维以及加强互联网安全管控等 -BAC009S0906W0324 要跟上互联网发展的步伐不太容易 -BAC009S0906W0325 随着互联网快速发展 -BAC009S0906W0326 这类专业人才往往集中在互联网企业 -BAC009S0906W0327 而车企相对缺乏这类人才 -BAC009S0906W0328 网络安全管理体系方面投资非常大 -BAC009S0906W0329 涉及人才软件硬件服务以及管理等方面 -BAC009S0906W0330 互联网企业也是一步步投入不断完善 -BAC009S0906W0331 不同行业在网络安全方面投入比例不一 -BAC009S0906W0332 预计汽车行业在网络安全方面投入往往较少 -BAC009S0906W0333 一些车企为了节约成本 -BAC009S0906W0334 往往将数据库服务器都放在公网上 -BAC009S0906W0335 这样很容易被黑客攻破 -BAC009S0906W0336 一旦发现系统有漏洞 -BAC009S0906W0337 将及时采取主动或被动措施 -BAC009S0906W0338 拉德克利夫认为自己被暗指有问题 -BAC009S0906W0339 但检测结果的异常并不能就证实运动员使用违禁药物 -BAC009S0906W0340 因为导致这项数值波动的原因有很多 -BAC009S0906W0341 包括高原训练或身体过度消耗后立刻接受检测 -BAC009S0906W0342 因此我请求世界反兴奋剂机构回顾前后所有的数据 -BAC009S0906W0343 盘点昆仑决二零一五五大飙血之战搜狐体育 -BAC009S0906W0344 无疑是擂台上最能引爆肾上腺素的震撼一幕 -BAC009S0906W0345 令拳迷记忆犹新的飙血之战不计其数 -BAC009S0906W0346 而这些战斗也成为了圈内久聊不厌的经典谈资 -BAC009S0906W0347 在数百场真枪实弹的巅峰对决中 -BAC009S0906W0348 不乏诸多脍炙人口的飙血之战 -BAC009S0906W0349 十月二十八日与三十一日 -BAC009S0906W0350 下面小编将盘点本年度迄今为止昆仑决五大惨烈血战 -BAC009S0906W0353 北京时间七月二十八日晚 -BAC009S0906W0354 这场对决的惨烈程度超出了所有人的想象 -BAC009S0906W0355 比赛开始后仅仅十馀秒 -BAC009S0906W0356 播求的头部便被对方的肘击割破 -BAC009S0906W0357 打出一道深深的血口 -BAC009S0906W0358 伴随着双方激战的火爆升级 -BAC009S0906W0359 播求头部的伤口进一步扩大 -BAC009S0906W0360 几乎全部被鲜血复盖的半边身体令人触目惊心 -BAC009S0906W0361 双方的肘击对轰场面接连上演 -BAC009S0906W0362 哈亚的肘击刁钻狠辣 -BAC009S0906W0363 直肘反肘交替使用 -BAC009S0906W0364 令人防不胜防播求的肘击则更具王者霸气 -BAC009S0906W0365 以大刀阔斧的摆肘砸肘为主 -BAC009S0906W0366 凶悍直接大开大合 -BAC009S0906W0367 加之其半身浴血的黝黑健美体魄 -BAC009S0906W0368 颇似从地狱中走出的修罗帝王 -BAC009S0906W0369 播求久负盛名的扫腿与冲膝技术开始发威 -BAC009S0906W0370 令对手不再敢贸然近身 -BAC009S0906W0371 不得不暂停比赛进行处理 -BAC009S0906W0372 双方均向对手发起了不遗馀力的猛攻 -BAC009S0906W0373 这场史诗级的双王血战在两大强者最后的对决中 -BAC009S0906W0374 迎来了结束铃声的敲响 -BAC009S0906W0375 哈立以争议性的点数优势宣告获胜 -BAC009S0906W0376 浑身是血的播求由于头部三处动脉破裂失血过多 -BAC009S0906W0377 被立刻送往医院接受紧急输血治疗 -BAC009S0906W0378 也被送往医院进行抢救 -BAC009S0906W0379 对于任何一个目睹了整场比赛过程的人来讲 -BAC009S0906W0380 这场史诗级惊天血战中没有失败者 -BAC009S0906W0383 二零一五年六月七日 -BAC009S0906W0384 昆仑决雄霸山城在重庆江南体育馆重装上阵 -BAC009S0906W0385 面对身高臂展明显占优的对手 -BAC009S0906W0386 雅桑克莱并没有采取矮个子拳手惯用的闪击式打法 
-BAC009S0906W0387 而是王气十足地向对手进行正面逼近 -BAC009S0906W0388 雅桑克莱的优势继续在扩大 -BAC009S0906W0389 标志性的扫腿重击力道沉猛的后手重拳纷纷呼啸而出 -BAC009S0906W0390 在其左扫腿无情踢击之下 -BAC009S0906W0391 祖耶夫的右肋很快便被踢出大片鲜红的淤血斑痕 -BAC009S0906W0392 经验丰富的雅桑克莱开始刻意放缓节奏 -BAC009S0906W0393 对已是强弩之末的对手进行消耗 -BAC009S0906W0394 此时的祖耶夫右眼已经肿胀得完全封闭 -BAC009S0906W0395 只能依靠顽强的意志进行支撑 -BAC009S0906W0396 雅桑克莱的组合拳将祖耶夫重重击倒然而 -BAC009S0906W0397 意志力惊人的白俄罗斯特种兵被没有就此放弃 -BAC009S0906W0398 顽强的意志力博得了对手以及全场观众致意 -BAC009S0906W0399 比赛在两名王者最后的对决中 -BAC009S0906W0400 比赛结果已经无需裁判的裁定 -BAC009S0906W0401 但看两人比赛后的面部状况 -BAC009S0906W0402 夺得了自己在昆仑拳坛上的第二场重要胜利 -BAC009S0906W0404 布拉德皮特新片狂怒接受了宣传媒体拍照 -BAC009S0906W0405 我们可以清晰看到皮特的结婚戒指 -BAC009S0906W0406 今天确定了上映日期二零一七年四月十七日 -BAC009S0906W0407 这是后年春季档的一个黄金上映期 -BAC009S0906W0408 看来郑嘉颖是她的初恋 -BAC009S0906W0409 问到他们在法国拍戏定情的细节 -BAC009S0906W0410 陈凯琳也拒绝回答 -BAC009S0906W0411 但就希望外界多给予他们发展空间 -BAC009S0906W0412 搜狐娱乐讯据香港媒体报道 -BAC009S0906W0413 早前有传媒更拍到陈凯琳直上嘉颖住所短聚 -BAC009S0906W0414 父女恋纸包不住火 -BAC009S0906W0415 两人于异国拍外景晨夕相对 -BAC009S0906W0416 感情一日千里 -BAC009S0906W0418 陈势安两天一夜没洗澡刷牙 -BAC009S0906W0419 猛嗑薄荷喉糖 -BAC009S0906W0420 搜狐娱乐讯据台湾媒体报道 -BAC009S0906W0421 香港女星吴君如与导演陈可辛爱情长跑十八年 -BAC009S0906W0422 虽然没有注册结婚 -BAC009S0906W0423 但两人关系比一般夫妻更加紧密 -BAC009S0906W0424 她日前被媒体目击与陈可辛在大街上逛街血拼 -BAC009S0906W0425 且沿途有说有笑 -BAC009S0906W0426 一路上都十指紧扣 -BAC009S0906W0427 甜蜜恩爱的模样彷彿热恋中的情侣 -BAC009S0906W0428 搜狐娱乐讯据香港媒体报导 -BAC009S0906W0429 一直邀请陈善之担任经理人 -BAC009S0906W0430 并兼任李嘉欣经理人及处理旗下其他艺人的合约事宜 -BAC009S0906W0431 执法人员将王靖苏押解回温州 -BAC009S0906W0432 温州水库沉车案现男女腐尸女方事发前行为古怪 -BAC009S0906W0433 温州沙城街道一民房发生火灾已造成四人死亡 -BAC009S0906W0434 温州沙城街道一民房今晨发生火灾已造成四人死亡 -BAC009S0906W0436 沙城街道七五村永安路一二五号一民房发生火灾 -BAC009S0906W0437 一时一零分火势完全扑灭 -BAC009S0906W0438 火灾造成四人死亡一人受伤 -BAC009S0906W0439 伤者目前在解放军第一一八医院进行治疗 -BAC009S0906W0441 温州惊现最牛菜场温州的状元农贸市场 -BAC009S0906W0443 买菜用支付宝扫码付钱 -BAC009S0906W0444 听说过段时间还要上场智能秤 -BAC009S0906W0445 用智能秤称重将自动生成二维码 -BAC009S0906W0446 用支付宝扫一下就能付款 -BAC009S0906W0447 温州美女学霸将赴非洲支教教当地小学生汉语 -BAC009S0906W0448 麻丽贤等一七位志愿者将远赴非洲支教 -BAC009S0906W0449 温州老人卖房筹四八零零万建养老院赠政府遭闲置 -BAC009S0906W0450 为了实现退休后能建一座养老机构 -BAC009S0906W0451 为更多的老人安度往年的心愿 -BAC009S0906W0452 浙江温州一老人拿出全部积蓄并卖掉两套房子 -BAC009S0906W0453 筹款四八零零万经六年建成养老院 -BAC009S0906W0454 捐给当地慈善部门后却遭闲置三年 -BAC009S0906W0455 温州苍南县看守所民警宿舍楼起火无人员伤亡 -BAC009S0906W0456 八月三日上午一一时左右 -BAC009S0906W0457 温州苍南县看守所一宿舍起火 -BAC009S0906W0458 该市苍南县公安局直属县看守所突发火情 -BAC009S0906W0459 所内民警宿舍楼突发大火 -BAC009S0906W0460 在看守所干警及消防人员的扑救下火势很快被扑灭 -BAC009S0906W0461 温州话到底有多难懂 -BAC009S0906W0463 大家对温州话难懂这事儿略有耳闻 -BAC009S0906W0464 一直被认为是全中国最难学习的方言之一 -BAC009S0906W0465 温州贩卖婴儿大案女医生假称婴儿已死然后卖掉 -BAC009S0906W0466 警方先后解救了一六名婴儿 -BAC009S0906W0467 有六个被送往苍南福利院 -BAC009S0906W0468 图为其中一名被解救的孩子 -BAC009S0906W0469 温州集资诈骗案犯汇给情人四千万小三被诉 -BAC009S0906W0470 二九岁的章某被控洗钱一二二万元 -BAC009S0906W0471 温州鞋业总经理遭追杀凶手行凶过程中被打死 -BAC009S0906W0472 陆续有人从乐清赶往平阳法院 -BAC009S0906W0473 平阳法院内外已聚集了三零零多人等待开庭 -BAC009S0906W0474 温州首家支付宝菜市场启动一周很多摊主不会用 -BAC009S0906W0475 状元农贸市场内挂着支付宝的宣传牌 -BAC009S0906W0476 温州高三男生坠楼身亡事发前无异常刚从家返校 -BAC009S0906W0477 龙湾永强中学一名高三男生从宿舍楼五楼楼顶坠楼身亡 -BAC009S0906W0478 永强中学校长也是坠楼学生的语文老师 -BAC009S0906W0479 印象里他性格是比较开朗的 -BAC009S0906W0480 没有发现近期有异常变化目前 -BAC009S0906W0481 龙湾警方已对此事展开调查 -BAC009S0906W0482 温州高速公路大米遭抢续五名涉案人员已落网 -BAC009S0906W0483 白花花的大米洒了一地 -BAC009S0906W0484 引来周边大批村民哄抢 -BAC009S0906W0485 一场考验道德与良知的大米保卫战悄然打响 -BAC009S0906W0486 温州鹿城警方发布通报称 -BAC009S0906W0487 五名涉嫌参与抢米的犯罪嫌疑人先后被抓获并拘留 -BAC009S0906W0488 民警仍在对其馀涉事人员进行调查 -BAC009S0906W0489 温州高速车祸九二包大米遭哄抢续带头者被拘 -BAC009S0906W0490 一辆货车在金丽温高速温州段发生事故 -BAC009S0906W0491 涉案的其中两名嫌疑人陈某女 -BAC009S0906W0492 永嘉县人谢某女 -BAC009S0906W0493 永嘉县人已被鹿城警方依法行政拘留 -BAC009S0906W0494 港京航班六名乘客推撞地勤四人被判九至一一天监禁 -BAC009S0907W0121 也不代表开发商资金面已经不再紧张 -BAC009S0907W0122 
背后可能蕴含着开发商更多的窘境 -BAC009S0907W0123 本世纪网至本世纪经济报道 -BAC009S0907W0124 上海南昌等城市近期继续松绑了公积金贷款政策 -BAC009S0907W0125 而南昌除了放松首套房界定标准 -BAC009S0907W0126 还降低了首套房公积金首付 -BAC009S0907W0127 国家住房银行箭在弦上 -BAC009S0907W0128 住建部官员发表文章指出 -BAC009S0907W0129 以住房公积金制度为基础 -BAC009S0907W0130 设立国家住房银行条件已经基本成熟 -BAC009S0907W0131 国家住房银行是否箭在弦上 -BAC009S0907W0132 其成立需具备哪些条件 -BAC009S0907W0133 以住房公积金制度为基础 -BAC009S0907W0134 设立政策性住宅金融机构 -BAC009S0907W0135 此机构即是住房银行 -BAC009S0907W0136 设立住房银行的条件已基本成熟 -BAC009S0907W0137 改进住房公积金提取使用监管机制 -BAC009S0907W0138 全国住房公积金七万亿元 -BAC009S0907W0139 住房维修资金约七亿元 -BAC009S0907W0140 如允许每年发行专项金融债券七万亿元 -BAC009S0907W0141 今年资金规模接近七万亿元 -BAC009S0907W0142 明年预计达到七万亿元 -BAC009S0907W0143 可基本满足首套和改善性自住住房的低息贷款需求 -BAC009S0907W0144 三是已有人员和机构 -BAC009S0907W0145 全国共有管理中心一百个 -BAC009S0907W0146 业务网点一千个 -BAC009S0907W0147 从业人员五万人 -BAC009S0907W0148 可充分利用这些机构网点和人员 -BAC009S0907W0149 组建国家住房银行分行和支行 -BAC009S0907W0150 对各地分支机构实行垂直管理 -BAC009S0907W0151 全国住房公积金贷款风险准备金已接近一百亿元 -BAC009S0907W0152 其中五亿元为超额拨备 -BAC009S0907W0153 可转化为住房银行资本金 -BAC009S0907W0154 设立住房银行好处多多 -BAC009S0907W0155 提高家庭购房能力 -BAC009S0907W0156 通过国家住房银行提供低息贷款 -BAC009S0907W0157 可以解决贷款难和贷款贵问题 -BAC009S0907W0158 有效提高家庭购房能力 -BAC009S0907W0159 完善宏观调控机制 -BAC009S0907W0160 可以有效解决商业银行顺周期操作问题 -BAC009S0907W0161 避免房地产市场大起大落 -BAC009S0907W0162 拓展货币政策操作空间 -BAC009S0907W0163 为利率市场化改革创造条件 -BAC009S0907W0164 促进新型城镇化发展 -BAC009S0907W0165 将农民工纳入住房公积金制度 -BAC009S0907W0166 积累在城镇购房首期付款 -BAC009S0907W0167 再由国家住房银行提供低息贷款 -BAC009S0907W0168 后续还款用住房公积金支付 -BAC009S0907W0169 将有效缓解购房能力不足矛盾 -BAC009S0907W0170 提升新型城镇化质量和效益 -BAC009S0907W0171 改进住房公积金管理 -BAC009S0907W0172 根源是体制机制存在弊端 -BAC009S0907W0173 通过设立国家住房银行 -BAC009S0907W0174 可以有效提高资金管理集约化专业化和精细化水平 -BAC009S0907W0175 充分发挥住房公积金作用 -BAC009S0907W0176 住建部官员发表文章指出 -BAC009S0907W0177 以住房公积金制度为基础 -BAC009S0907W0178 设立国家住房银行条件已经基本成熟 -BAC009S0907W0179 国家住房银行是否箭在弦上 -BAC009S0907W0180 其成立需具备哪些条件 -BAC009S0907W0181 备受刚需购房者关注的公积金政策也频繁迎来调整 -BAC009S0907W0182 北京市管国管住房公积金中心先后发布通知 -BAC009S0907W0183 贷款最高额度由五万元升至七万元 -BAC009S0907W0184 公积金贷款总共可少缴利息三十馀万 -BAC009S0907W0185 是对过去住房公积金制度不作为方式的纠正 -BAC009S0907W0186 而随着各地公积金政策的调整 -BAC009S0907W0187 建立健全以工促农以城带乡的长效机制 -BAC009S0907W0188 为现代农业建设取得明显进展提供有力保障 -BAC009S0907W0189 建立农业投入稳定增长机制 -BAC009S0907W0190 按照总量持续增长比例稳步提高的要求 -BAC009S0907W0191 预算内固定资产投资要向重大农业农村建设项目倾斜 -BAC009S0907W0192 耕地占用税税率提高后 -BAC009S0907W0193 新增收入全部用于农业 -BAC009S0907W0194 积极推动土地出让收益用于高标准农田建设 -BAC009S0907W0195 充分发挥中国农业产业发展基金的引导作用 -BAC009S0907W0196 加快农村金融组织产品和服务创新 -BAC009S0907W0197 推动发展村镇银行等农村中小金融机构 -BAC009S0907W0198 引导金融机构发放农业中长期贷款 -BAC009S0907W0199 完善农民专业合作社管理方法 -BAC009S0907W0200 支持其开展信用合作 -BAC009S0907W0201 落实农民专业合作社和农村金融有关税收优惠政策 -BAC009S0907W0202 扶持农业信贷担保组织发展 -BAC009S0907W0203 扩大农村担保品范围 -BAC009S0907W0204 完善农业保险保费补贴政策 -BAC009S0907W0205 健全农业再保险体系 -BAC009S0907W0206 探索完善财政支持下的农业大灾风险分散机制 -BAC009S0907W0207 引导社会资本投入农业 -BAC009S0907W0208 各部门要主动服务三农 -BAC009S0907W0209 积极推动建立城乡要素平等交换关系 -BAC009S0907W0210 鼓励和促进工业与城市资源要素向农业农村配置 -BAC009S0907W0211 调动农民参与农业农村基础设施建设的积极性 -BAC009S0907W0212 通过组织动员和政策引导等多种途径 -BAC009S0907W0213 鼓励各种社会力量与乡村结对帮扶 -BAC009S0907W0214 参与农村产业发展和公共设施建设 -BAC009S0907W0215 努力形成多元化投入新格局 -BAC009S0907W0216 加大农业支持保护力度 -BAC009S0907W0217 坚持和完善农业补贴政策 -BAC009S0907W0218 建立农业补贴政策后评估机制 -BAC009S0907W0219 落实农资综合补贴动态调整机制 -BAC009S0907W0220 研究逐步扩大良种补贴品种和范围 -BAC009S0907W0221 扩大农机具购置补贴规模 -BAC009S0907W0222 加大农机化薄弱环节生产机械补贴力度 -BAC009S0907W0223 加大动物强制免疫补贴力度 -BAC009S0907W0224 逐步完善农业生产关键技术应用与服务支持政策 -BAC009S0907W0225 大幅度增加农业防灾减灾稳产增产关键技术良法补助 -BAC009S0907W0226 坚持和完善渔用柴油补贴政策 -BAC009S0907W0227 继续实施农业种子种苗种畜种禽免税进口优惠政策 -BAC009S0907W0228 建立完善农业生产奖补制度 -BAC009S0907W0229 完善主产区利益补偿机制 -BAC009S0907W0230 提高中央财政对粮食油料生产大县转移支付水平 
-BAC009S0907W0231 继续加大对产粮大县生猪调出大县的奖励力度 -BAC009S0907W0232 规范粮食主产县涉农投资项目地方资金配套 -BAC009S0907W0233 全面取消主产区粮食风险基金地方资金配套 -BAC009S0907W0234 稳步提高粮食主产区县级人均财力水平 -BAC009S0907W0235 全面实施和完善草原生态保护补助奖励政策 -BAC009S0907W0236 扩大草原生态保护面源污染防控生态奖补范围和规模 -BAC009S0907W0237 探索实施生物农药低毒农药使用补助政策 -BAC009S0907W0238 研究建立高耗能老旧农业机械报废回收制度 -BAC009S0907W0239 探索实施报废更新补助 -BAC009S0907W0240 加大对农业科研和技术推广的支持力度 -BAC009S0907W0241 完善现代农业产业技术体系 -BAC009S0907W0242 选择部分农业科研院所予以稳定支持 -BAC009S0907W0243 按照种养规模和服务绩效安排工作经费 -BAC009S0907W0244 加大动物疫病防控经费投入 -BAC009S0907W0245 完善病死动物无害化处理补贴制度 -BAC009S0907W0246 建立和完善农作物病虫害专业化统防统治补助政策 -BAC009S0907W0247 继续向农民免费提供测土配方施肥服务 -BAC009S0907W0248 扩大土壤有机质提升项目实施范围和规模 -BAC009S0907W0249 继续加大农业农村人才培养力度 -BAC009S0907W0250 对大学生涉农创业按规定给予相关政策扶持 -BAC009S0907W0251 完善农产品市场调控机制 -BAC009S0907W0252 稳步提高稻谷小麦最低收购价 -BAC009S0907W0253 没有人提的往往才是真命天子 -BAC009S0907W0254 谁是苹果进军汽车市场的合作伙伴收购对象 -BAC009S0907W0258 这个问题存在于软件捆绑方式 -BAC009S0907W0259 它是软件集成的一种方式 -BAC009S0907W0261 他们很快提供了修复软件 -BAC009S0907W0262 不管是什么时候推出软件和开发一些超前的东西 -BAC009S0907W0263 避免不了出现一些漏洞 -BAC009S0907W0264 我们所做的就是发现漏洞后立即修复 -BAC009S0907W0265 在苹果发布靓丽的第四财季业绩报告后 -BAC009S0907W0266 乔斯维亚克就很少在公众场合露面 -BAC009S0907W0268 促使这家公司获得了创记录的第四财季盈利 -BAC009S0907W0269 苹果正在全力以赴出售尽可能多的智能手机 -BAC009S0907W0270 你必须保证自己了解稳态市场 -BAC009S0907W0271 而不仅仅是早期市场 -BAC009S0907W0272 大尺寸屏幕设备在亚洲很流行 -BAC009S0907W0273 但是在欧洲受欢迎度较低 -BAC009S0907W0274 美国市场刚好介于两者之间 -BAC009S0907W0275 目前这项服务已经达到了一个里程碑 -BAC009S0907W0277 有一百万张信用卡已被激活 -BAC009S0907W0278 其中就包括沃尔玛和百思买 -BAC009S0907W0279 这两家公司目前正在开发自己的移动支付系统 -BAC009S0907W0280 零售商最终都会向消费者妥协 -BAC009S0907W0281 想要成功的零售商将考虑消费者的利益 -BAC009S0907W0282 并接受消费者想要使用的支付方式 -BAC009S0907W0283 乔斯维亚克还谈及了苹果涉足可穿戴设备市场的问题 -BAC009S0907W0286 乔斯维亚克还为苹果平板电脑业务做了辩护 -BAC009S0907W0287 他拿出了数据作为证据截止目前 -BAC009S0907W0290 我们一直都在打造最好的产品 -BAC009S0907W0291 这次我们同样做到了 -BAC009S0907W0294 用户发现系统更新之后 -BAC009S0907W0295 心率测量记录没有之前那么频繁了 -BAC009S0907W0297 不过苹果官方很快澄清了这个事情 -BAC009S0907W0298 根据苹果官方的支持页面显示 -BAC009S0907W0300 不过更新后锻炼和运动手臂的时候不会记录心率 -BAC009S0907W0301 因此用户看到测量记录的记录要比之前少一些 -BAC009S0907W0302 不过这导致了很多新问题 -BAC009S0907W0303 在认证授权系统中对服务器设置权限管理 -BAC009S0907W0304 以及与经销商汽车垂直网站等签署保密协议等 -BAC009S0907W0305 这些措施在一定程度上将可防止用户数据泄露 -BAC009S0907W0306 除了投入大这一因素之外 -BAC009S0907W0307 往往对网络安全意识也不强 -BAC009S0907W0308 毕竟与互联网融合时间不长 -BAC009S0907W0309 上述网络安全人士称 -BAC009S0907W0310 乌云网合伙人邬迪接受第一财经日报记者采访时称 -BAC009S0907W0311 尽管网络安全目前投入成本大 -BAC009S0907W0312 又未直接产生经济效益 -BAC009S0907W0313 但到将来互联网时代 -BAC009S0907W0314 部分传统的车企或许还没有注意到这点 -BAC009S0907W0315 乌云上有不少因联网漏洞可导致车辆被控制 -BAC009S0907W0316 这将会导致行车安全问题 -BAC009S0907W0317 令车企烦恼的不仅是车主信息被泄露这一困扰 -BAC009S0907W0318 随着越来越多车企踊跃加入车联网浪潮中 -BAC009S0907W0319 信息安全隐患也随之而来 -BAC009S0907W0320 负责车辆网络安全问题 -BAC009S0907W0321 现在汽车与网络的联系越来越紧密 -BAC009S0907W0322 以后将能够与周围环境交流 -BAC009S0907W0323 如果车辆被黑客软件侵袭 -BAC009S0907W0324 车辆可能会发生严重的交通事故 -BAC009S0907W0325 比如现在的汽车一般采用了哪些新技术 -BAC009S0907W0326 其中十六家回复发函 -BAC009S0907W0327 在接受调查的这些公司中 -BAC009S0907W0328 有两家表示能够诊断或者反馈黑客入侵后的情况 -BAC009S0907W0329 有一家公司表示能够及时检测黑客入侵 -BAC009S0907W0330 像车上的信息娱乐系统和导航系统 -BAC009S0907W0331 很可能通过联网技术 -BAC009S0907W0332 被恶意软件或者黑客攻击 -BAC009S0907W0333 二十二二零一五 -BAC009S0907W0335 黑客可利用这些漏洞远程打开车门 -BAC009S0907W0336 宝马方面表示已经升级该数字系统 -BAC009S0907W0337 解决信息安全的问题 -BAC009S0907W0339 作为唯一能够入选五大飙血之战的女子比赛 -BAC009S0907W0340 正是得益于我国女子散打名将鄂美蝶的惊艳一击 -BAC009S0907W0341 在当天女子五二千克级自由搏击超级战中 -BAC009S0907W0342 鄂美蝶便毫无保留地将炮火轰向对手 -BAC009S0907W0343 三十三岁的大滨芳美在面对强大的火力下 -BAC009S0907W0344 比赛很快便呈向一边倒的局面第二回合 -BAC009S0907W0345 鄂美蝶继续将自己所学到的新搏击技能尽情展现 -BAC009S0907W0346 在一连串的拳腿风暴过后 -BAC009S0907W0347 终止时间定格在二分二十一秒 -BAC009S0907W0350 二零一五年四月十二 -BAC009S0907W0351 一场众星闪耀的群龙赛事震撼打响 -BAC009S0907W0352 作为此次赛事上唯一一场纯泰式规则的超级战 
-BAC009S0907W0353 两位气质迥异的选手展示出了全然不同的擂台风格 -BAC009S0907W0354 在前两局僵持不下的情况下 -BAC009S0907W0355 面对兵行诡道的波斯弯刀 -BAC009S0907W0356 张春雨选择了加强压迫对手的力度 -BAC009S0907W0357 此举却导致了一次擂台意外的发生 -BAC009S0907W0358 张春雨被对手的一记肘击打破了右侧眉弓 -BAC009S0907W0359 经过场上护理人员的医治后 -BAC009S0907W0360 张春雨在全场观众的喝彩声中再次投入比赛 -BAC009S0907W0361 并向对手发起了凶猛的反扑 -BAC009S0907W0362 双方刺刀见红式的对攻中 -BAC009S0907W0363 伊萨的眉弓同样被张春雨以牙还牙的肘法击破 -BAC009S0907W0364 全面引爆现场观众的激情 -BAC009S0907W0365 比赛在双方互不相让的对攻中落下了帷幕 -BAC009S0907W0366 但对于每一位观赛者来讲 -BAC009S0907W0367 能够亲眼见证这场火爆刺激的的精彩大战 -BAC009S0907W0368 远比单纯的胜负有意义得多 -BAC009S0907W0369 这是一场没有输家的经典比赛 -BAC009S0907W0372 二零一五年二月一日 -BAC009S0907W0373 昆仑决广州站在广州天河体育中心成功打响 -BAC009S0907W0374 多国大神级搏击王者论剑昆仑武道之巅 -BAC009S0907W0375 决赛一如期待般精彩绝伦 -BAC009S0907W0376 马刀抡击式的中距离组合拳法配合高位膝技 -BAC009S0907W0377 打得对手只有招架之功 -BAC009S0907W0378 便将对手的眼部击伤 -BAC009S0907W0379 严重影响卡尔泽塔的实现 -BAC009S0907W0380 令对手无奈放弃比赛 -BAC009S0907W0381 他在二零一五年昆仑决诸神之战决赛圈的表现 -BAC009S0907W0382 将成为无数武迷接下来最大的期待之一 -BAC009S0907W0383 谁也不知道会发生什么 -BAC009S0907W0384 这就是竞技体育的魅力 -BAC009S0907W0385 在昨晚的女子标枪决赛中 -BAC009S0907W0386 然而就是这最后一掷 -BAC009S0907W0387 在昨天比赛的第五投 -BAC009S0907W0388 吕会会在全场观众的加油助威声中爆发 -BAC009S0907W0389 倾尽全力将标枪掷到了六十六米一三 -BAC009S0907W0390 然而就在全场仅剩下莫利托一个人的最后一掷时 -BAC009S0907W0391 虽然留下了巨大的遗憾 -BAC009S0907W0392 不过这依然是吕会会的个人最好成绩 -BAC009S0907W0393 吕会会在走到混合区接受记者采访时止住了泪水 -BAC009S0907W0394 在大赛中投出这样好的成绩我自己都没有想到 -BAC009S0907W0395 其实比赛过程中我也没有多想 -BAC009S0907W0396 就是要一枪一枪地投 -BAC009S0907W0397 比成这样我其实已经很开心了 -BAC009S0907W0398 能在北京获得一枚奖牌我很骄傲 -BAC009S0907W0399 观众们的鼓励也给了我力量 -BAC009S0907W0400 我的泪水主要还是来自于喜悦 -BAC009S0907W0401 要说一点儿没有遗憾和失落是假的 -BAC009S0907W0402 但总的来说还是高兴多于遗憾 -BAC009S0907W0403 文本报记者刘艾林 -BAC009S0907W0404 去年美国队长二寒冬战士就曾在四月登陆 -BAC009S0907W0405 结果创造了相当可观的票房成绩 -BAC009S0907W0406 丛林之书则将在二零一六年四月十五日登场 -BAC009S0907W0408 两人合作长达二十年 -BAC009S0907W0409 不过天下无不散之筵席 -BAC009S0907W0410 原来陈善之最近已离开了百仕活 -BAC009S0907W0411 有传他离开是因黎明不满其在挽留艺人方面没有尽力 -BAC009S0907W0412 搜狐娱乐讯十月九日 -BAC009S0907W0413 表示决定辞职 -BAC009S0907W0414 不与无线续约 -BAC009S0907W0415 他感叹自己在无线十五年都没有机会 -BAC009S0907W0416 眼见后辈爬头 -BAC009S0907W0417 希望出去发展 -BAC009S0907W0418 他直言不想看见自己变作一潭死水 -BAC009S0907W0419 早前演出的舞台剧令他醒觉要出外寻找更多演出机会 -BAC009S0907W0420 因此决定出外闯 -BAC009S0907W0421 虽然未知去向 -BAC009S0907W0422 但坚信有我落脚的地方 -BAC009S0907W0423 我便会到那里 -BAC009S0907W0424 搜狐娱乐讯北京时间七月二十八日消息 -BAC009S0907W0425 据香港媒体报导 -BAC009S0907W0427 陈奕迅双手合十认真地向蛋糕许愿 -BAC009S0907W0428 搜狐娱乐讯据台湾媒体报道 -BAC009S0907W0429 港歌神陈奕迅出道近二十年 -BAC009S0907W0430 曾获美国时代杂志形容为影响香港乐坛风格的人物 -BAC009S0907W0431 并于当日被香港警方拘捕 -BAC009S0907W0432 警方以普通袭击罪对涉事乘客提起诉讼 -BAC009S0907W0433 其中四名被告分别判即时监禁九至一一天 -BAC009S0907W0434 一人被判罚款一五零零元 -BAC009S0907W0435 港京航班延误九小时六名内地乘客推撞地勤被捕 -BAC009S0907W0437 六名内地乘客与地勤发生肢体冲突 -BAC009S0907W0439 将被以普通袭击罪起诉 -BAC009S0907W0440 港商在台遭绑三八天获救后痛哭以为必死 -BAC009S0907W0441 黄煜坤被警方送到附近医院接受检查 -BAC009S0907W0442 惠州公安在金山河捞获一具无头无双手女尸 -BAC009S0907W0443 广东惠州惊爆港商杀情妇碎尸凶案 -BAC009S0907W0444 五零岁港商疑与其工厂的同龄女主管偷情多年 -BAC009S0907W0445 近日再度拒绝女方的逼婚后 -BAC009S0907W0446 遭追讨欠款和抚养费共四零万元人民币 -BAC009S0907W0447 港商疑恼羞成怒将她杀害 -BAC009S0907W0448 并肢解尸体分成多袋抛入河中 -BAC009S0907W0449 港商被骗牵出路边地下钱庄涉案资金四三零零亿 -BAC009S0907W0450 深圳警方查获的一个地下钱庄窝点 -BAC009S0907W0451 由普通商店作为掩护 -BAC009S0907W0452 该商店老板郑晓生红衣者涉嫌暗地里兑换外汇 -BAC009S0907W0453 替人向境外转移资金 -BAC009S0907W0454 港媒关注天价虾店停业破坏青岛形象 -BAC009S0907W0455 参考消息网一零月八日报道港媒称 -BAC009S0907W0456 备受关注的青岛三八元一只大虾事件有最新发展 -BAC009S0907W0457 并责令其立即改正价格违法行为 -BAC009S0907W0458 事发后派出所和物价局都互相踢皮球 -BAC009S0907W0459 批评职能部门没有将消费者放在第一位 -BAC009S0907W0460 港媒关注内地私人美术馆新富人群热衷分享藏品 -BAC009S0907W0461 参考消息网七月二九日报道港媒称 -BAC009S0907W0462 用来保存他们的藏品 -BAC009S0907W0463 其中一些人是近年来国际拍卖会上艺术品的最大买家 -BAC009S0907W0464 港媒关注浙江暖男医生手术室播动画片哄小女孩 -BAC009S0907W0465 参考消息网九月二二日报道港媒称 
-BAC009S0907W0466 网络上热传一组暖男医生哄小萝莉的温情照片 -BAC009S0907W0467 男医生为了安抚即将做手术的小女孩 -BAC009S0907W0468 将小女孩抱在腿上并播放手机中的动画片 -BAC009S0907W0469 港媒关注重雾霾重回华北罕见蓝天只持续两周 -BAC009S0907W0470 参考消息网九月一九日报道港媒称 -BAC009S0907W0471 随着严重雾霾卷土重来 -BAC009S0907W0472 港媒关注马云回应被逼捐花钱比挣钱难 -BAC009S0907W0474 企业应该做好的投资 -BAC009S0907W0475 盲目捐款没有益处 -BAC009S0907W0476 港媒曝水货客扮残疾人在轮椅中藏钻石月入八万 -BAC009S0907W0477 参考消息网七月二九日报道港媒称 -BAC009S0907W0478 香港海关严查水货客 -BAC009S0907W0479 水货集团看中轮椅人士收入不高 -BAC009S0907W0480 以高收入低风险和免缴税等好处利诱对方成为水货客 -BAC009S0907W0481 有走私奢侈品的人士月入高达八万港元 -BAC009S0907W0482 港媒盘点亚洲千禧一代十大富豪九人是中国人 -BAC009S0907W0483 参考消息网七月二二日报道 -BAC009S0907W0484 港媒称假沉香充斥内地多以化学香油制成 -BAC009S0907W0485 高仿沉香多以化学香精等制成 -BAC009S0907W0486 可比黄金的沉香价格每年倍增 -BAC009S0907W0487 港媒称内地中产人数猛增有助稳定企望渐进改革 -BAC009S0907W0488 一个国家稳定的社会结构呈橄榄形 -BAC009S0907W0489 而橄榄形结构是以中产为主的结构 -BAC009S0907W0490 中产阶级在一个国家的现代化中起着稳定作用 -BAC009S0907W0491 是社会稳定的主要力量 -BAC009S0907W0492 港媒称内地为国际市场修改动画片妖怪不能吃唐僧肉 -BAC009S0907W0493 参考消息网一一月一日报道港媒称 -BAC009S0907W0494 中国的动画工作室越来越看重海外市场 -BAC009S0907W0495 港媒称内地人不穷了为何仍爱抢学者抢习惯了 -BAC009S0908W0121 将进一步提振刚需购房者入市信心 -BAC009S0908W0122 加速今年楼市成交复苏回暖 -BAC009S0908W0123 公积金政策利好首套自住住房贷款需求的消息纷至沓来 -BAC009S0908W0124 并已实施 -BAC009S0908W0125 贷款额度上限调整为一百万元 -BAC009S0908W0126 购买一百平方米以上非政策性住房或第二套住房 -BAC009S0908W0127 贷款最高额度仍为一百万元 -BAC009S0908W0128 均规定贷款额度不再依据个人信用等级上浮 -BAC009S0908W0129 并对异地缴存住房公积金等政策作出调整 -BAC009S0908W0130 北京市公积金管理中心明确取消新建商品房评估 -BAC009S0908W0131 国管住房公积金中心则表示取消担保服务费 -BAC009S0908W0132 这一系列公积金门槛放低额度提高的调整 -BAC009S0908W0133 是对过去住房公积金制度不作为方式的纠正 -BAC009S0908W0134 即使去年十一月公积金贷款利率降至百分之七 -BAC009S0908W0135 很多人需要支付大额首付 -BAC009S0908W0136 使用公积金制度的作用和效果没有得到有效的发挥 -BAC009S0908W0137 此番公积金政策调整 -BAC009S0908W0138 将在诸多方面惠及刚需购房者 -BAC009S0908W0139 之前的公积金贷款额只有一百万 -BAC009S0908W0140 而最高额度提升至一百万后 -BAC009S0908W0141 大部分刚需购房者都可以选择公积金贷款 -BAC009S0908W0142 中原地产首席分析师张大伟分析 -BAC009S0908W0143 公积金额贷款额度升至一百万可以节省很多少利息 -BAC009S0908W0144 公积金贷款可少缴三十馀万 -BAC009S0908W0145 而额度最高一百万时 -BAC009S0908W0146 这一数值为二十馀万 -BAC009S0908W0147 这将使更多购房者具备买房支付能力 -BAC009S0908W0148 可以使用公积金贷款的购房者将起码增加百分之七 -BAC009S0908W0149 也将在一定程度上降低刚需购房者支付负担 -BAC009S0908W0150 公积金政策调整对于楼市成交刺激作用已初见瑞尔 -BAC009S0908W0151 链家地产市场研究部数据显示 -BAC009S0908W0152 北京市公积金额度提升后的元旦时期 -BAC009S0908W0153 近郊小户型楼盘及城区部分公房社区客户咨询量上升 -BAC009S0908W0154 而其房源多在一百平方米以下 -BAC009S0908W0155 中原地产市场研究部数据显示 -BAC009S0908W0156 以北京去年纯商品房成交结构为例 -BAC009S0908W0157 一百平均单套总价约一百万元左右 -BAC009S0908W0158 公积金贷款上限调整后 -BAC009S0908W0159 一百万元的贷款额度能满足大部分首套刚需的贷款需求 -BAC009S0908W0160 链家地产市场研究部张旭表示 -BAC009S0908W0161 此番公积金贷款政策调整将进一步提升振刚需 -BAC009S0908W0162 促进楼市预期向好发展 -BAC009S0908W0163 去年已有不少城市对公积金政策进行放松调整 -BAC009S0908W0164 公积金政策调整对购房者心理层面影响较大 -BAC009S0908W0165 将加速今年楼市成交复苏回暖 -BAC009S0908W0166 备受刚需购房者关注的公积金政策也频繁迎来调整 -BAC009S0908W0167 北京市管国管住房公积金中心先后发布通知 -BAC009S0908W0168 通知指出除北上广深一线城市外 -BAC009S0908W0169 对拥有一套住房并已结清相应购房贷款的居民家庭 -BAC009S0908W0170 申请公积金购买第二套住房 -BAC009S0908W0171 最低首付款比例由百分之七降低至百分之五 -BAC009S0908W0172 公积金首付的再次降低实际影响有限 -BAC009S0908W0173 但对购房者预期有积极响应 -BAC009S0908W0174 这将有利于稳定房地产市场 -BAC009S0908W0175 进而对稳定中国经济有正面作用 -BAC009S0908W0176 为进一步完善住房公积金个人住房贷款政策 -BAC009S0908W0177 对拥有一套住房并已结清相应购房贷款的居民家庭 -BAC009S0908W0178 最低首付款比例由百分之七降低至百分之五 -BAC009S0908W0179 该政策对于一线城市并不强制执行 -BAC009S0908W0180 而是北京上海广州深圳可在国家统一政策基础上 -BAC009S0908W0181 易居研究院智库中心研究总监严跃进认为 -BAC009S0908W0182 此次住建部财政部和中央联合发文 -BAC009S0908W0183 反映出政策层面较大的刺激力度 -BAC009S0908W0184 这是自去年以来除降息外 -BAC009S0908W0185 相关部门对公积金贷款政策的第三次放松 -BAC009S0908W0186 美丽北京大型绿色公益品牌项目 -BAC009S0908W0187 完善玉米大豆油菜籽棉花等农产品临时收储政策 -BAC009S0908W0188 完善主要农产品吞吐和调节机制 -BAC009S0908W0189 健全重要农产品储备制度 -BAC009S0908W0190 发挥骨干企业稳定市场的作用 -BAC009S0908W0191 完善生猪棉花食糖边销茶等调控预案 -BAC009S0908W0192 制定鲜活农产品调控办法 -BAC009S0908W0193 
探索建立目标价格为核心的反周期补贴制度 -BAC009S0908W0194 加强农业科技交流合作 -BAC009S0908W0195 提高农业利用外资水平 -BAC009S0908W0196 继续用好国外优惠贷款和赠款 -BAC009S0908W0197 加大先进适用技术装备的引进消化和吸收力度 -BAC009S0908W0198 强化多双边和区域农业磋商谈判和贸易促进 -BAC009S0908W0199 做好涉农国际贸易规定制动工作 -BAC009S0908W0200 进一步强化贸易促进公共服务能力 -BAC009S0908W0201 积极推动优势农产品出口 -BAC009S0908W0202 积极应对国际贸易摩擦 -BAC009S0908W0203 支持行业协会办企业维护合法权益 -BAC009S0908W0204 进一步完善农业产业损害监测预警机制 -BAC009S0908W0205 运用符合世界贸易组织规定的相关措施 -BAC009S0908W0206 灵活有效调控农业产品进出口 -BAC009S0908W0207 积极推动种业农垦等方面改革 -BAC009S0908W0208 发展农村服务业和乡村企业 -BAC009S0908W0209 制定农村二三产业加快发展的鼓励政策 -BAC009S0908W0210 落实和完善有关税收政策 -BAC009S0908W0211 统筹城乡基础设施建和公共服务 -BAC009S0908W0212 逐步建立城乡统一的公共服务制度 -BAC009S0908W0213 积极稳妥推进户籍制度改革 -BAC009S0908W0214 推进省直接管理县市财政体制改革 -BAC009S0908W0215 优先将农业大县纳入改革范围 -BAC009S0908W0216 强化农业法制保障 -BAC009S0908W0217 坚持米袋子省长负责制和菜篮子市长负责制 -BAC009S0908W0218 全面落实耕地和基本农田保护领导干部离任审计制度 -BAC009S0908W0219 各有关部门和地方各级人民政府要围绕规划目标任务 -BAC009S0908W0220 研究落实各项强农惠农富农政策 -BAC009S0908W0221 统筹协调推动重大工程的实施 -BAC009S0908W0222 努力开创我国农业现代化发展新局面 -BAC009S0908W0223 农业农村信息化十二五规划 -BAC009S0908W0224 关于印发十二五规划的通知 -BAC009S0908W0225 中国老龄十二五规划 -BAC009S0908W0226 新农村十二五发展规划 -BAC009S0908W0227 国家林业十二五规划 -BAC009S0908W0228 十二五医药发展规划 -BAC009S0908W0229 老龄事业十二五规划 -BAC009S0908W0230 国务院总理温家宝五日主持召开国务院常务会议 -BAC009S0908W0231 再次听取全国民用核设施综合安全检查情况汇报 -BAC009S0908W0232 核电重启的曙光越来越近 -BAC009S0908W0233 国务院二零一一年五月 -BAC009S0908W0234 相关公司股票走势国海证券 -BAC009S0908W0235 决定对全国核设施进行安全检查 -BAC009S0908W0236 有关部门组织核安全地震海洋等方面专家 -BAC009S0908W0237 用五个多月时间对全国七十台运行在建核电机组 -BAC009S0908W0238 以及所有民用研究堆和核燃燃料循环设施等 -BAC009S0908W0239 进行了综合安全检查 -BAC009S0908W0240 形成了新形势下我国核电发展的建议阶段研究报告 -BAC009S0908W0241 国务院常务会议听取了综合安全检查情况汇报 -BAC009S0908W0242 对进一步深入检查及落实整改措施作了部署 -BAC009S0908W0243 核安全法规标准体系与国际接轨 -BAC009S0908W0244 具备一定的严重事故预防和缓解能力 -BAC009S0908W0245 部分核电厂未制定实施严重事故预防和缓解规程 -BAC009S0908W0246 海啸问题评估和应付基础比较薄弱等 -BAC009S0908W0247 有关部门和企业迅速组织整改 -BAC009S0908W0248 目前已取得阶段性成效 -BAC009S0908W0249 基本原则是预防为主纵深防御 -BAC009S0908W0250 新老并重防结结合 -BAC009S0908W0251 依靠科技持续改进 -BAC009S0908W0252 坚持法治严格监管 -BAC009S0908W0253 比如用户抱怨升级之后设施无法像以前那样工作了 -BAC009S0908W0254 甚至还不如原来的一点零版本系统好用 -BAC009S0908W0255 苹果此举是为了节约用电量 -BAC009S0908W0256 有人给出了解决方法 -BAC009S0908W0258 强制不断的心率测量 -BAC009S0908W0259 只是这种情况下心率传感器会每隔十秒进行一次 -BAC009S0908W0261 苹果表这么火爆微软也该出智能手表吗 -BAC009S0908W0262 刚开始微软因谨慎起见 -BAC009S0908W0264 最近才开始向其他市场推广销售 -BAC009S0908W0265 在谷歌与苹果相机推出智能手表后 -BAC009S0908W0266 微软目前仍局限于健身手环领域 -BAC009S0908W0267 但它的确算不上是智能手表 -BAC009S0908W0268 拥有内部存储空间与完整的应用平台 -BAC009S0908W0269 支持开发者为其编写应用 -BAC009S0908W0270 但它对开发者来说限制太多 -BAC009S0908W0272 微软正在向外界推广一次编写 -BAC009S0908W0273 跨设备使用的通用应用 -BAC009S0908W0274 但至今唯独没有提升智能手表平台 -BAC009S0908W0275 具体如下方视频介绍所示 -BAC009S0908W0277 刚开始微软因谨慎起见 -BAC009S0908W0280 原告当地时间周二在法庭上表示 -BAC009S0908W0281 苹果通过发布不必要的软件升级包 -BAC009S0908W0283 一起针对苹果的集体反垄断案两名原告的律师称 -BAC009S0908W0284 由于苹果要打压竞争对手 -BAC009S0908W0286 但却损害了消费者的利益 -BAC009S0908W0287 这次庭审将持续九天时间 -BAC009S0908W0288 给一桩近十年之久的诉讼一个定论 -BAC009S0908W0292 不过这些政策现在已经被废除 -BAC009S0908W0293 苹果打压了市场竞争 -BAC009S0908W0297 苹果担忧这会蚕食其市场份额 -BAC009S0908W0298 生态链中插入其他公司产品会造成问题 -BAC009S0908W0299 这会危及用户体验和产品质量 -BAC009S0908W0301 价格要么下降要么维持不变 -BAC009S0908W0302 苹果没有危害消费者利益 -BAC009S0908W0303 对频频的骚扰电话显示无可奈何 -BAC009S0908W0304 有黑客在网络上兜售车主信 -BAC009S0908W0305 美的摆稳棋局过冬搜狐科技 -BAC009S0908W0306 白电行业将进入最惨烈的一年 -BAC009S0908W0307 昔日巨头格力美的海尔也将沉浮于其中 -BAC009S0908W0308 从本年度第一份季报来看 -BAC009S0908W0309 三巨头中的格力海尔均出现不同程度下滑 -BAC009S0908W0310 实现净利营收双增长 -BAC009S0908W0311 美的吸取了当年大跃进的教训 -BAC009S0908W0312 一位买家电的朋友晒出一张销量清单 -BAC009S0908W0313 他担心自己马上就要被辞退了 -BAC009S0908W0314 发改委约谈各大空调企业的高管 -BAC009S0908W0315 媒体采访的电话打到各空调企业的市场负责人那里 -BAC009S0908W0316 各公司市场部都在卖场忙活 
-BAC009S0908W0317 今年的促销从三月份就启动了 -BAC009S0908W0318 一位商场场内部人士称 -BAC009S0908W0319 注定是白色家电行业最惨烈的一年 -BAC009S0908W0320 现实的残酷落到报表上 -BAC009S0908W0321 是白电上市企业今年的一季报几乎全部沦陷 -BAC009S0908W0322 两大龙头企业格力和海尔 -BAC009S0908W0323 格力电器一季报营收为二百四十五亿元 -BAC009S0908W0324 同比去年降零点六百分之六 -BAC009S0908W0325 净利润为二十七点七五亿元 -BAC009S0908W0326 同比上升百分之二十三点零六 -BAC009S0908W0327 上一次是金融危机期间的二零零九一季度 -BAC009S0908W0328 另一白电巨头青岛海尔 -BAC009S0908W0329 一季度营收为二十一八点七亿元 -BAC009S0908W0330 净利润为九点七亿元 -BAC009S0908W0331 同比增百分之十三点一一 -BAC009S0908W0332 海信科龙和惠而浦则是营收增 -BAC009S0908W0333 海信科龙一季报营收为六十四点三亿元 -BAC009S0908W0334 净利润出现百分之一的下滑 -BAC009S0908W0335 净利出现七点三百分之一的降幅 -BAC009S0908W0337 实现营收净利双增长 -BAC009S0908W0338 十多天压抑的情感终于爆发 -BAC009S0908W0339 女排姑娘们在日本的最后一夜 -BAC009S0908W0340 大家才安安稳稳地睡了一觉 -BAC009S0908W0341 如果要数一下中国女排谁最红 -BAC009S0908W0342 张晓雅的人气肯定在前三名 -BAC009S0908W0343 她以最帅国手走红网络 -BAC009S0908W0344 网友大呼她帅过林丹 -BAC009S0908W0345 这位英气十足的九零后很有人缘 -BAC009S0908W0346 张晓雅最大的优点是有想法 -BAC009S0908W0347 张晓雅这个娃娃训练很自觉 -BAC009S0908W0348 在球场上的思路比较清楚 -BAC009S0908W0349 是一个在球场上有想法的球员 -BAC009S0908W0350 这个娃娃打球时很有思想 -BAC009S0908W0351 中国最帅的竞走冠军陈定将亮相苏州吴中 -BAC009S0908W0352 一九九二年八月五日出生于云南省保山市龙陵县 -BAC009S0908W0353 这个二十三岁的云南小伙子 -BAC009S0908W0354 取得瑞士卢加诺竞走挑战赛男子二十公里竞走银牌 -BAC009S0908W0355 夺得国际田联竞走世界杯男子二十公里竞走银牌 -BAC009S0908W0356 参加全国竞走大奖赛暨世锦赛选拔赛 -BAC009S0908W0357 以一小时二十一分十一秒成绩获铜牌 -BAC009S0908W0358 并取得世锦赛参赛资格 -BAC009S0908W0359 仰泳选手在比赛中 -BAC009S0908W0360 本次比赛使用最新的仰泳出发壁架 -BAC009S0908W0361 帮助仰泳运动员改善自己的出发技术 -BAC009S0908W0362 欧米茄计时管理委员会成员彼得许尔泽勒介绍说 -BAC009S0908W0363 可以帮助他们在出发时增加自己距离水面的高度 -BAC009S0908W0364 可以防止运动员出发时手部滑落 -BAC009S0908W0365 得到了仰泳选手的广泛好评 -BAC009S0908W0366 这是它第一次在游泳世界杯上亮相 -BAC009S0908W0367 也为背后的关键技术提供开发支持 -BAC009S0908W0368 从而确保高度精准地记录竞赛成绩 -BAC009S0908W0369 新科世界冠军宁泽涛领衔中国队出战 -BAC009S0908W0370 身材傲人颜值爆表的她魅力席卷整个亚洲 -BAC009S0908W0373 现年十八岁的莎宾娜身高达一百八十二厘米 -BAC009S0908W0374 腿长足足十二厘米 -BAC009S0908W0375 去年在亚青赛上亮相后 -BAC009S0908W0376 瞬间成为各国媒体的焦点 -BAC009S0908W0377 成为宅男心目中的排球女神 -BAC009S0908W0378 莎宾娜也凭借兼具清纯和性感气质的漂亮外形走红日本 -BAC009S0908W0379 甚至有日本的大牌经纪公司希望与其签约 -BAC009S0908W0380 做客日本电视台的新闻节目 -BAC009S0908W0381 不少媒体追问她是否有男朋友 -BAC009S0908W0382 莎宾娜透露目前单身理想型是喜欢运动 -BAC009S0908W0383 身材高挑并且不抽烟喝酒的男生 -BAC009S0908W0384 当下想把注意力集中在打球上 -BAC009S0908W0385 暂时不考虑恋爱的问题 -BAC009S0908W0386 这一单身宣言更加激发了日本粉丝对她的痴迷 -BAC009S0908W0387 希望可以见到她本人 -BAC009S0908W0388 该球队在官方博客上 -BAC009S0908W0389 但喜欢欧美音乐爱吃西红柿意大利面 -BAC009S0908W0391 看好她成为日本排球的新女神 -BAC009S0908W0392 美貌和实力并存的选手太稀罕了 -BAC009S0908W0393 莎宾娜已经在今年八月秘密抵达日本 -BAC009S0908W0395 她的母亲在采访中表示莎宾娜为了提升自己的实力 -BAC009S0908W0396 以哈萨克斯坦排协特派选手的方式加盟日本的球队 -BAC009S0908W0397 日本的排球训练是出了名的严厉 -BAC009S0908W0398 对此莎宾娜已经做好了吃苦的心理准备 -BAC009S0908W0399 家人和哈排协也表明了全力支持她的态度 -BAC009S0908W0400 不仅在各国网络社区和比赛中表现活跃 -BAC009S0908W0401 也成为哈萨克斯坦的宣传大使 -BAC009S0908W0402 日本排球界的人士指出 -BAC009S0908W0404 但是由于加朵要为蝙蝠侠大战超人忙碌 -BAC009S0908W0405 档期遇到了不可调和的冲突 -BAC009S0908W0406 因此不得不放弃宾虚的演出 -BAC009S0908W0407 这对她来说也是一个巨大的遗憾 -BAC009S0908W0408 私底下对歌迷亲切和善 -BAC009S0908W0409 最近人在大陆举行巡回演唱会的他 -BAC009S0908W0410 却被曝出在机场大发飙 -BAC009S0908W0411 有网友则晒出当天现场情况 -BAC009S0908W0412 搜狐娱乐讯据香港媒体报道 -BAC009S0908W0413 分享入行二十年的感受 -BAC009S0908W0414 陈奕迅坦言自己一直有情绪病 -BAC009S0908W0415 而且是一个爱哭鬼 -BAC009S0908W0416 常常在看电影和新闻时流泪 -BAC009S0908W0417 不开心时会找太太徐濠所倾诉 -BAC009S0908W0418 搜狐娱乐讯据香港媒体报道 -BAC009S0908W0419 陈奕迅在香港出席品牌活动 -BAC009S0908W0420 现场他透露道近日忙于内地巡演 -BAC009S0908W0421 对于天津爆炸时间 -BAC009S0908W0422 他表示感到伤痛 -BAC009S0908W0423 又透露去年曾在天津举办演唱会 -BAC009S0908W0424 希望送上歌曲今日为受害者打气 -BAC009S0908W0425 也祝福伤者早日康复 -BAC009S0908W0426 搜狐娱乐讯四月三十日 -BAC009S0908W0427 称这二人总是可以把自己逗笑 -BAC009S0908W0428 照片中二人坐在沙发上 -BAC009S0908W0429 谢霆锋戴着帽子 -BAC009S0908W0430 穿着白背心黑色短裤 -BAC009S0908W0431 数万只黄色小鸡散落路上 -BAC009S0908W0432 
村民蜂拥而至捉小鸡 -BAC009S0908W0433 香港明报参考消息网八月二十九日报道港媒称 -BAC009S0908W0434 近日又出现疯抢水果捡漏等事 -BAC009S0908W0435 有内地学者分析背后心态 -BAC009S0908W0436 是因为国民抢习惯了 -BAC009S0908W0437 港媒称马云向浙江商人发出警告永远不要行贿 -BAC009S0908W0438 港媒评助学达人性侵女童案加强监管是关键 -BAC009S0908W0439 资料图王杰图片来源于网络 -BAC009S0908W0440 港媒评中国游客全球爆买旅游幼稚病 -BAC009S0908W0441 参考消息网一零月八日报道国庆长假结束 -BAC009S0908W0442 媒体再次盘点长假期间的各种热点新闻 -BAC009S0908W0443 其中一组中国旅游购物者全面攻陷日本的图片 -BAC009S0908W0444 多家媒体就这组图片中的场景和现象作出评论 -BAC009S0908W0445 并提出多种思考和提示 -BAC009S0908W0446 比如就中国游客热衷日本药品 -BAC009S0908W0447 歧视中国药企改进质量提高信誉改善用户体验 -BAC009S0908W0448 以便提高药品竞争力等等 -BAC009S0908W0449 港媒道士下山被批引发网友广泛讨论 -BAC009S0908W0450 参考消息网七月二零日报道 -BAC009S0908W0451 港媒上海成为亚洲奢华生活最昂贵的城市 -BAC009S0908W0452 参考消息网一零月二九日报道港媒称 -BAC009S0908W0453 上海已成为全亚洲奢华生活最昂贵的城市 -BAC009S0908W0454 垫底的是印度城市孟买 -BAC009S0908W0455 港媒东莞工地连续两次坍塌路面似被吸入地底 -BAC009S0908W0456 东莞常平一地盘两日两度地陷 -BAC009S0908W0457 网上流传的视频可见 -BAC009S0908W0458 地面在几秒内迅速塌陷成一个大坑 -BAC009S0908W0459 恐影响旁边大厦的基地 -BAC009S0908W0460 网络图片参考消息网八月一四日报道港媒称 -BAC009S0908W0461 一三日上午一零时许 -BAC009S0908W0462 东莞常平住宅大厦联邦花园旁边发生大面积地陷 -BAC009S0908W0463 面积达逾三零零平方米 -BAC009S0908W0464 造成一名井下工人死亡 -BAC009S0908W0465 该地盘曾发生地陷事故 -BAC009S0908W0466 现场流出的短片显示 -BAC009S0908W0467 每一次塌陷的区域前已有一个大坑 -BAC009S0908W0468 港媒中国人启动营养革命养生书籍热卖 -BAC009S0908W0469 参考消息网八月一五日报道 -BAC009S0908W0470 港媒中国出现多中心大都市郊区需要更多移民 -BAC009S0908W0471 参考消息网八月二五日报道 -BAC009S0908W0472 港媒中国发布金牌月嫂标准实用性遭质疑 -BAC009S0908W0473 参考消息网七月八日报道 -BAC009S0908W0474 港媒中国城市告别血汗工厂经济转型见成效 -BAC009S0908W0475 参考消息网八月一二日报道 -BAC009S0908W0476 港媒中国患者年底可在线上美国医生咨询病情 -BAC009S0908W0477 参考消息网九月二五日报道港媒称 -BAC009S0908W0478 在中国某个在线医疗平台增设一项新服务之后 -BAC009S0908W0479 中国正在逐步拥抱智能技术和数字至上创业精神 -BAC009S0908W0480 港媒中国成访日第一大客源国还会持续增加 -BAC009S0908W0481 参考消息网八月二日报道外媒称 -BAC009S0908W0482 访日外国游客突破千万 -BAC009S0908W0483 其中上半年中国访日游客接近翻倍 -BAC009S0908W0484 超过韩国成为访日最大客源国 -BAC009S0908W0485 更是扭转日本旅游赤字 -BAC009S0908W0486 港媒中国科学家研究蜈蚣毒液发现新止痛药 -BAC009S0908W0487 蜈蚣资料图参考消息网一零月二二日报道中国科学家称 -BAC009S0908W0488 港媒中式教学不可复制中国学生在哪都能拿高分 -BAC009S0908W0489 参考消息网九月二三日报道港媒称 -BAC009S0908W0490 宣传的重点是中国教育和英国教育之战 -BAC009S0908W0491 港媒中秋赏月航班受热捧部分靠窗座位售罄 -BAC009S0908W0492 参考消息网九月一三日报道港媒称 -BAC009S0908W0493 很多人都已为赏月做准备 -BAC009S0908W0494 如果对一般登高赏月仍未满足 -BAC009S0908W0495 可以考虑一下空中赏月 -BAC009S0912W0121 房地产相关领域问题频发 -BAC009S0912W0122 东地产财经周刊新一年度审计工作报告出炉 -BAC009S0912W0123 审计署审计长刘家义受国务院委托 -BAC009S0912W0124 土地相关的审查成为重点之一 -BAC009S0912W0125 刘家义在报告中指出 -BAC009S0912W0126 共审计二十个省本级和二百个市 -BAC009S0912W0127 二零零八年至二零一五年 -BAC009S0912W0128 这些地区批准建设用地二百万公顷 -BAC009S0912W0129 取得土地出让收入十三万亿元 -BAC009S0912W0130 支出十二万亿元 -BAC009S0912W0131 为经济社会发展提供了重要基础和支持 -BAC009S0912W0132 土地出入收入累计结馀五千亿元 -BAC009S0912W0133 主要是土地出让收入少征三千亿元 -BAC009S0912W0134 一些地方和单位少支付补偿一亿元 -BAC009S0912W0135 编造虚假资料等套取或骗取补偿一亿元 -BAC009S0912W0136 一些地方土地出让收支核算不够规范 -BAC009S0912W0137 减免或返还土地出让收入一亿元 -BAC009S0912W0138 建设用地方面也暴露了不少问题 -BAC009S0912W0139 违规以租代征改变规划条件等用地一万公顷 -BAC009S0912W0140 有一个突破土地或城市规划 -BAC009S0912W0141 还有一个违规扩区一万公顷 -BAC009S0912W0142 虚增耕地质量不达标的分别占百分之十和百分之三十 -BAC009S0912W0143 整治资金被挤占挪用等一亿元 -BAC009S0912W0144 纠正违法用地一万起 -BAC009S0912W0145 制定完善制度一百多项 -BAC009S0912W0146 审计已向有关部门移送重大违法违纪问题三百起 -BAC009S0912W0147 各级政府安排财政资金一亿元 -BAC009S0912W0148 为安居工程建设提供了资金保障 -BAC009S0912W0149 还有一亿元被套取或用于弥补经费不足等 -BAC009S0912W0150 有关地方追回资金或补贴一亿元 -BAC009S0912W0151 清理收回住房二十套 -BAC009S0912W0152 取消一万户家庭的保障资格 -BAC009S0912W0153 审计已向有关部门移送重大违法违纪问题三十起 -BAC009S0912W0154 在对央企的审计也发现了不少问题 -BAC009S0912W0155 中粮集团违规投资四亿元对原培训中心进行改扩建 -BAC009S0912W0156 受土地开发政策和土地规划限制未开发建设 -BAC009S0912W0157 六年土地收入十三万度审计报告中 -BAC009S0912W0158 房地产相关领域问题频发 -BAC009S0912W0159 东地产财经周度审计工作报告出炉 -BAC009S0912W0160 审计署审计长刘家义受国务院委托 -BAC009S0912W0161 羊年置业小调查的调查结果截图 -BAC009S0912W0162 
二初楼市迎来多项利好政策 -BAC009S0912W0163 在多项政策的支持下 -BAC009S0912W0164 今年楼市将走向何方 -BAC009S0912W0165 中新网房产频道推置业小调查 -BAC009S0912W0167 十位网友参与了本次调查 -BAC009S0912W0168 在参与调查的网友中 -BAC009S0912W0169 约六成网友看涨全国的商品房价格 -BAC009S0912W0170 万科获选性价比最高的房企 -BAC009S0912W0171 房价的一涨一跌都牵动着购房者的神经 -BAC009S0912W0172 百分之五的网友认为房价将普遍上涨 -BAC009S0912W0173 百分之五的网友认为房价将普遍下跌 -BAC009S0912W0174 百分之五的网友认为房价走势不好判断 -BAC009S0912W0175 作为楼市政策的风向标 -BAC009S0912W0176 二全国两会或将楼市基调 -BAC009S0912W0177 国务院总理李克强在二政府工作报告中表示 -BAC009S0912W0178 支持居民自住和改善住房需求 -BAC009S0912W0179 促进房地产市场平稳健康发展 -BAC009S0912W0180 这也从宏观层面明确了政府对于房地产市场的态度 -BAC009S0912W0181 在今年两会是否会开启新一轮楼市调控这个问题上 -BAC009S0912W0182 中新网的调查结果显示 -BAC009S0912W0183 百分之五的网友认为不会 -BAC009S0912W0184 百分之五的网友认为会 -BAC009S0912W0185 百分之五的网友认为不好说 -BAC009S0912W0186 楼市政策也深刻影响着房地产行业的走向 -BAC009S0912W0187 抓紧做好故调查处理工作 -BAC009S0912W0188 督促责任单位彻底排查溢油风险点 -BAC009S0912W0189 并重新编报海洋环境影响报告书 -BAC009S0912W0190 彻底查明事故原因 -BAC009S0912W0191 查清事故造成的危害及损失 -BAC009S0912W0192 维护受损各方合法权益 -BAC009S0912W0193 立即部署开展海洋石油勘探开发安全生产检查 -BAC009S0912W0194 全面加强海洋环境监视监测和监督管理 -BAC009S0912W0195 全面准确及时发布事故处置相关信息 -BAC009S0912W0196 抓紧研究完善海洋环境保护的法律法规 -BAC009S0912W0197 入海污染物排放总量下降 -BAC009S0912W0198 力争渤海近岸海域水质总体改善 -BAC009S0912W0199 优化产业结构与布局 -BAC009S0912W0200 切实改变沿海地区重化工比重过大过于集中的状况 -BAC009S0912W0201 严格控制新上石化项目 -BAC009S0912W0202 禁止在可能造成生态严重失衡的地方进行围填海活动 -BAC009S0912W0203 有效控制陆海污染源 -BAC009S0912W0204 坚持海陆统筹河海兼顾 -BAC009S0912W0205 加强入海河流综合治理 -BAC009S0912W0206 合理布局入海排污口 -BAC009S0912W0207 制定更加严格的地方水污染排放标准 -BAC009S0912W0208 努力保护和修复渤海生态系统 -BAC009S0912W0209 加强用水总量控制与调度管理 -BAC009S0912W0210 改善河口和近岸海域生态环境 -BAC009S0912W0211 加强海陆过渡区生态建设 -BAC009S0912W0212 逐步恢复湿地生态功能 -BAC009S0912W0213 在海洋环境敏感区关键区等划定生态红线 -BAC009S0912W0214 有效防范海洋环境灾害 -BAC009S0912W0215 建立渤海海洋环境预警机制和突发事件应对机制 -BAC009S0912W0216 修订完善相关应急预案 -BAC009S0912W0217 强化地方政府和企业的主体意识法制意识 -BAC009S0912W0218 落实海洋环境保护责任 -BAC009S0912W0219 提高公众参与渤海环境保护的积极性和主动性 -BAC009S0912W0220 建立公开透明的信息发布机制 -BAC009S0912W0221 会议讨论进一步加强环境保护工作的意见 -BAC009S0912W0222 强调必须把污染治理和生态保护摆在更加重要的位置 -BAC009S0912W0223 切实解决损害公众健康影响科学发展的突发环境问题 -BAC009S0912W0224 落实节能减排各项任务 -BAC009S0912W0225 凡依法应当进行环评的建设规划和项目 -BAC009S0912W0226 都要严格履行环评程序 -BAC009S0912W0227 环评过程要公开透明 -BAC009S0912W0228 充分征求专家和社会公众意见 -BAC009S0912W0229 要依法追究管理部门责任企业及有关人员的责任 -BAC009S0912W0230 切实加强重金属污染防治 -BAC009S0912W0231 对重点地区行业和企业 -BAC009S0912W0232 妥善处理重金属污染历史遗留问题和突发污染事件 -BAC009S0912W0233 保障人民群众生命健康安全 -BAC009S0912W0234 严格化学品环境管理 -BAC009S0912W0235 对化学品项目布局进行梳理评估 -BAC009S0912W0236 对化学品生产经营企业进行环境隐患排查 -BAC009S0912W0237 对海洋江河湖泊沿岸化工企业进行集中综合整治 -BAC009S0912W0238 落实环境监管责任和安全保障措施 -BAC009S0912W0239 提高化学品生产的环境准入门槛 -BAC009S0912W0240 加强农村环境保护 -BAC009S0912W0241 集中整治存在突出环境问题的村庄和集镇 -BAC009S0912W0242 重点治理农村土壤饮用水水源地污染 -BAC009S0912W0243 推动环保基础设施和服务向农村延伸 -BAC009S0912W0244 引导和帮助农民科学处理垃圾和污水 -BAC009S0912W0245 科学使用农药化肥和农膜 -BAC009S0912W0246 严格农村工矿企业环境监管 -BAC009S0912W0247 坚决防止污染向农村转移 -BAC009S0912W0248 加快建设环境监测预警体系 -BAC009S0912W0249 完善环境事件应急机制 -BAC009S0912W0250 完善环境法律政策体系 -BAC009S0912W0251 针对近期各种环境事件暴露出的问题 -BAC009S0912W0252 抓紧制定和修订相关法律法规 -BAC009S0912W0253 毛利率也只有百分之十四 -BAC009S0912W0254 由此可见苹果现在的业务确实比汽车行业更加赚钱 -BAC009S0912W0256 他表示他肯定会与苹果展开合作 -BAC009S0912W0257 苹果公司一直在秘密从事电汽汽车的研发 -BAC009S0912W0258 并且计划最早在二零二零年推出生产首款车型 -BAC009S0912W0259 苹果已为汽车项目招募了数百名员工 -BAC009S0912W0260 包括电池和机器人技术领域的专家 -BAC009S0912W0261 苹果涉足汽车行业并不是一个好主意 -BAC009S0912W0264 除了苹果上周公布的选定合作伙伴 -BAC009S0912W0269 将会在设备发售稳定的推出与更新 -BAC009S0912W0278 此款健康设备将延迟到明年推出 -BAC009S0912W0279 根据知情人士获得的安吉拉录音手稿 -BAC009S0912W0280 安吉拉要求零售店员工养精蓄锐 -BAC009S0912W0281 为即将到来的购物季 -BAC009S0912W0282 以及中国的春节做准备 -BAC009S0912W0285 后有消息称该款产品将于今年的情人节推出 -BAC009S0912W0286 不过目前看来不大可能 -BAC009S0912W0287 
因为春季的计算方式是从三月二十日到六月三十日 -BAC009S0912W0290 有报道称苹果计划在二零一四年秋季推出其可穿戴设备 -BAC009S0912W0291 该产品将延迟到二零一五年发布 -BAC009S0912W0292 纷至沓来的报道显示 -BAC009S0912W0293 电池的技术难题最终导致了它的延迟推出 -BAC009S0912W0299 包括更换不同尺寸型号和不同的表带 -BAC009S0912W0302 这将是苹果零售店采用的全新模式 -BAC009S0912W0303 如果融入移动互联的新时代 -BAC009S0912W0304 我们凭什么征战全世界 -BAC009S0912W0305 在前段时间的上海家电展上 -BAC009S0912W0306 美的集团总裁方洪波提出了上述三个问题 -BAC009S0912W0307 这是当前所有中国家电企业 -BAC009S0912W0308 都必须要回答的问题 -BAC009S0912W0309 如果不回答这三个课题 -BAC009S0912W0310 企业所有的目标都是空洞的 -BAC009S0912W0311 得出这个结论来自于方洪波对当前形势的判断 -BAC009S0912W0312 中国家电企业现在正面临前所未有的挑战 -BAC009S0912W0313 过去三十年高速发展的前提条件没有了 -BAC009S0912W0316 在全世界的产业格局看 -BAC009S0912W0317 全世界排列的二加三格局 -BAC009S0912W0319 这样一个全球白电的格局短期内是难以撬动的 -BAC009S0912W0320 时代力量正在颠复着家电行业 -BAC009S0912W0321 移动互联以前改变的是软的层面 -BAC009S0912W0322 比如流程的缩短平台化的应用 -BAC009S0912W0323 转型升级应该在十年前就开始了 -BAC009S0912W0324 中国家电企业在世界产业链地位弱小 -BAC009S0912W0325 跟世界产业的差距不是在缩小 -BAC009S0912W0326 这是目前我们中国家电企业面临的具体挑战 -BAC009S0912W0327 这些挑战来自于四面八方 -BAC009S0912W0328 未来给我们的机会和空间是有限的 -BAC009S0912W0330 白电行业将进入最惨烈的一年 -BAC009S0912W0331 昔日巨头格力美的海尔也将沉浮于其中 -BAC009S0912W0332 从本年度第一份季报来看 -BAC009S0912W0333 三巨头中的格力海尔均出现不同程度 -BAC009S0912W0335 下称美的内部的组织架构二点一五年加大了调整力度 -BAC009S0912W0336 七月成立了美的部品事业部 -BAC009S0912W0337 威灵电机将有可能兼并美芝压缩机 -BAC009S0912W0338 这一切并不是说说而已 -BAC009S0912W0339 而是要明确落实在数字上 -BAC009S0912W0340 控制在六零微克立方米左右 -BAC009S0912W0341 这与市民的期望和城市发展的愿景也是一致的 -BAC009S0912W0342 二零一七年二零二二年 -BAC009S0912W0343 我们还将继续加大污染防治力度 -BAC009S0912W0344 这一点对于京津冀一带的居民来说是才最重要的 -BAC009S0912W0345 因为每个人都需要呼吸 -BAC009S0912W0346 场馆建设一简约而不简单 -BAC009S0912W0347 二零二二年北京冬奥会计划使用一二个比赛场馆 -BAC009S0912W0348 总体上以节俭办赛为原则进行规划建设和改造使用 -BAC009S0912W0349 充分利用北京奥运后的丰富遗产 -BAC009S0912W0350 仅有三个场馆需要新建 -BAC009S0912W0351 分别是位于北京市区的国家速滑馆和延庆的二个雪场 -BAC009S0912W0352 其馀场馆改建后均可满足赛事需要 -BAC009S0912W0353 既免去了不必要的花费 -BAC009S0912W0354 每个场馆又高端大气上档次 -BAC009S0912W0355 真可谓是简约而不简单啊 -BAC009S0912W0356 花样滑冰短道速滑项目在首都体育馆进行 -BAC009S0912W0357 冰壶项目在水立方进行 -BAC009S0912W0358 计划明年就将开始动工 -BAC009S0912W0359 速滑馆建成后将设置四百米滑道 -BAC009S0912W0360 设有座位一万两千个 -BAC009S0912W0361 在冬奥会举办之前这里将为专业队伍训练提供场地 -BAC009S0912W0362 我们的奥运健儿将在此努力备战 -BAC009S0912W0363 成为市民体验冰上运动的乐园 -BAC009S0912W0364 张家口市的崇礼县从每年十一月初到第二年四月初 -BAC009S0912W0365 崇礼县发展较成熟的万龙滑雪场和云顶滑雪场 -BAC009S0912W0366 加上一个仍在建的太舞四季滑雪场 -BAC009S0912W0367 均已被纳入二零二二年冬奥会的规划场馆 -BAC009S0912W0368 万龙和云顶滑雪场都将根据赛事要求进行改造和扩建 -BAC009S0912W0369 小海坨山是位于延庆境内的海坨山主峰 -BAC009S0912W0370 此地春秋冬三季有雪 -BAC009S0912W0371 滑雪期从十一月下旬到次年三月中旬 -BAC009S0912W0372 这里常年吸引着众多登山探险运动爱好者 -BAC009S0912W0373 拥有高山滑雪要求的八百米落差 -BAC009S0912W0374 非常适合修建高山雪场 -BAC009S0912W0375 将依托现有山体地形修建临时场地设施 -BAC009S0912W0376 用作雪车雪橇大项和滑雪大项中的高山滑雪比赛场地 -BAC009S0912W0377 全民冰雪季奥运健儿助力 -BAC009S0912W0378 早在申办北京冬奥会的时候 -BAC009S0912W0379 很多人都看到了新的奥运商机 -BAC009S0912W0380 会投资建设一些冰雪主题乐园和冬季项目体验场所 -BAC009S0912W0381 在全民健身成为国家战略的大背景下 -BAC009S0912W0382 观赛便利不出国门看奥运 -BAC009S0912W0383 以往想要见识奥运级别的比赛 -BAC009S0912W0384 冰雪爱好者不得不选择出国 -BAC009S0912W0385 高昂的交通和住宿成本让很多人望而却步 -BAC009S0912W0386 如今在家门口就可以实现这个愿望了 -BAC009S0912W0387 交通住宿花费大大降低 -BAC009S0912W0388 让我们能够来一次说走就走的冬奥之行 -BAC009S0912W0389 在主场为中国健儿加油 -BAC009S0912W0390 该是一件多幸福的事啊 -BAC009S0912W0392 责任编辑冯浩 -BAC009S0912W0393 十月十八日早上九点 -BAC009S0912W0394 各地跑步爱好者齐聚一堂 -BAC009S0912W0395 共同享受奔跑带来的乐趣 -BAC009S0912W0396 经历过北京站和上海站两次比赛 -BAC009S0912W0397 本次沈阳站赛场迎来了许多熟悉的面孔 -BAC009S0912W0398 尤为引人瞩目的莫过于李子成 -BAC009S0912W0399 他更是以三十分十七秒一举夺得奔跑中国三连冠 -BAC009S0912W0400 而十公里女子组由刘庆红以三十四分十秒夺得冠军 -BAC009S0912W0401 海信一汽大众等知名企业和品牌也依旧亮相赛场 -BAC009S0912W0402 以不同方式助力本次比赛胜利进行 -BAC009S0912W0403 近四千名跑步爱好者和其家人朋友齐聚于此 -BAC009S0912W0404 全球范围内的创收达到十一点八亿美元 -BAC009S0912W0405 亚当桑德勒成功卫冕 -BAC009S0912W0406 约翰尼德普紧随其后 -BAC009S0912W0407 但是出于预算考虑 
-BAC009S0912W0408 陈奕迅隔空发表爱的宣言也是啊 -BAC009S0912W0409 例如出入帮忙开门拉椅子 -BAC009S0912W0410 新京报报道思维发散表情与肢体语言丰富 -BAC009S0912W0411 对于疯癫陈奕迅所长的这些设定歌迷早已习惯了 -BAC009S0912W0412 在凭借专辑米闪成为新一轮金曲歌王后 -BAC009S0912W0414 朱祖儿操刀灰色调封面 -BAC009S0912W0415 袁两半一人歌词包办 -BAC009S0912W0416 处于寻找状态中的挣扎 -BAC009S0912W0417 然而准备中三个字卸掉了他的纠结 -BAC009S0912W0418 二十九日晚间举办媒体听歌会 -BAC009S0912W0419 现场试听无条件人生马拉松等六首歌曲 -BAC009S0912W0420 终站是好友谢霆锋的创作 -BAC009S0912W0422 花了三年时间才得到这首歌 -BAC009S0912W0423 被问是否感觉到谢霆锋与王菲恋爱的甜蜜 -BAC009S0912W0428 十九点二十六分 -BAC009S0912W0429 好友陈妍希晒与潘玮柏搞怪合影为他庆生 -BAC009S0912W0430 称潘玮柏生日快乐 -BAC009S0912W0431 港富豪被绑涉及两岸三地绑匪要求赎金用比特币 -BAC009S0912W0432 日前遭人绑架并勒索七零零零万港元 -BAC009S0912W0433 台港警方追查一个月 -BAC009S0912W0434 二十七日深夜终于在云林县一家废弃空屋中救出了黄立坤 -BAC009S0912W0435 获救第一句话就是我以为我活不了了 -BAC009S0912W0436 港报评上海迪尼士不意味着香港迪尼士的没落 -BAC009S0912W0437 参考消息网七月二八日报道 -BAC009S0912W0438 港报内地医院仍控制处方药销售电商盼网售解禁 -BAC009S0912W0439 参考消息网九月一七日报道港媒称 -BAC009S0912W0440 自从中国内地的第一家网上药店一零年前开张以来 -BAC009S0912W0441 大量资本已投入医药企业中 -BAC009S0912W0442 希望能从中国内地日益老龄化的一三亿人口中受益 -BAC009S0912W0443 港校两名内地生酒后街头野战当事人被起底 -BAC009S0912W0444 南都讯记者王睦广发自香港今年四月初 -BAC009S0912W0445 被拍下短片冠以野战之名在网上疯传 -BAC009S0912W0446 二人早前被香港警方以有违公德罪落案起诉 -BAC009S0912W0447 事件中的女方昨日被判一二个月感化令 -BAC009S0912W0448 男方则将于下月庭审 -BAC009S0912W0449 港珠澳大桥又起漂移风波可能进一步影响工期 -BAC009S0912W0450 其人工岛被指移动六七米 -BAC009S0912W0451 这个意外可能进一步影响工期 -BAC009S0912W0452 游乐场大章鱼甩飞游客母亲落地时紧抱儿子 -BAC009S0912W0453 在空中以高速自转带给游客惊险刺激的体验 -BAC009S0912W0454 背部撞断了游乐场场边的三根不锈钢护栏 -BAC009S0912W0455 游学夏令营的无奈花豪华团价格吃喝难保 -BAC009S0912W0457 游客三亚海滩赏月后留二九吨垃圾三百人连夜清理 -BAC009S0912W0458 当海滩上如潮的人群散去 -BAC009S0912W0459 留下的却是被随手丢弃的垃圾 -BAC009S0912W0460 虽然海滩上设置了众多垃圾桶 -BAC009S0912W0461 但赏月人群还是乱扔垃圾 -BAC009S0912W0462 从二八日凌晨四点半至六点半这整整二个小时里 -BAC009S0912W0463 游客三亚游泳致终身残疾向旅行社索赔一九六万 -BAC009S0912W0464 成都男子张呈亮化名旅行时到三亚海滩游泳 -BAC009S0912W0465 下海后却突然失去意识 -BAC009S0912W0466 送医后被查出颈部脊髓损伤 -BAC009S0912W0467 张先生在青羊法院提起诉讼 -BAC009S0912W0468 此案正在进一步审理之中 -BAC009S0912W0469 游客下桥拍照踩死植物水杉栈道仙境拉铁丝网 -BAC009S0912W0470 当植物恢复正常生长后铁丝网将拆除 -BAC009S0912W0471 游客不满小孩超高补票与景区工作人员群殴 -BAC009S0912W0472 一段游客暴打景区员工的视频开始在网上发酵 -BAC009S0912W0473 某景点大门处多名游客与身着穿服的工作人员大打出手 -BAC009S0912W0474 游客乌鲁木齐吃自助被罚二四零零元工商部门介入 -BAC009S0912W0475 剩下了一二零零克食物 -BAC009S0912W0476 被餐厅罚款二四零零元 -BAC009S0912W0477 餐厅返还了游客的二四零零元 -BAC009S0912W0478 物价部门工商部门已介入调查 -BAC009S0912W0479 游客偷走雷峰塔砖块想供奉起来做药给老人喝 -BAC009S0912W0480 游客入住药店被收二零元马桶使用费消协可举报 -BAC009S0912W0481 住酒店还要交二零元马桶费 -BAC009S0912W0482 南京市民张女士化姓去无锡旅游时 -BAC009S0912W0483 通过网站团购了无锡江南丹青度假酒店一间套房 -BAC009S0912W0484 退房结账时却被告知扣了二零元马桶使用费 -BAC009S0912W0485 这让张女士哭笑不得 -BAC009S0912W0486 酒店方承诺退还二零元马桶使用费 -BAC009S0912W0487 酒店行为属于乱收费 -BAC009S0912W0488 消费者可以直接向物价部门和旅游部门举报 -BAC009S0912W0489 现代快报记者赵书伶 -BAC009S0912W0490 游客再曝日照点海鲜太少被围殴当地警方证实 -BAC009S0912W0491 网友先在微博中陈述了悲惨遭遇 -BAC009S0912W0492 据称是当事人之一在派出所通过一个亲戚的微博发的 -BAC009S0912W0493 游客北京游两天遭引导消费近二万元 -BAC009S0912W0494 京华时报讯记者武红利与家人来京旅游 -BAC009S0912W0495 王女士与旅行社签订四天五晚的旅行合同 -BAC009S0913W0121 在最希望国家实施的调控政策这一问题上 -BAC009S0913W0122 有百分之五的网友选择了提高公积金贷款额度 -BAC009S0913W0123 百分之五的网友选择了房贷利率打折优惠 -BAC009S0913W0124 百分之五的网友倾向于房产税的开征 -BAC009S0913W0125 百分之五的网友希望放开一线城市的限购政策 -BAC009S0913W0126 网友的置业目的为首套房自住的占到了百分之六十 -BAC009S0913W0127 改善型二套房比例比约为百分之五 -BAC009S0913W0128 三套以上投资性购房占百分之五 -BAC009S0913W0129 其他目的的占比为百分之五 -BAC009S0913W0130 在商品房性价比的选择上 -BAC009S0913W0131 万科以百分之五的票数获选性价比最高的房企 -BAC009S0913W0132 绿地保利万达分列性价比最高房企的二三四名 -BAC009S0913W0133 选择恒大世茂富力的网友均不足百分之十 -BAC009S0913W0134 有百分之五的网友选择了其他房企 -BAC009S0913W0135 二初楼市迎来多项利好政策 -BAC009S0913W0136 在多项政策的支持下 -BAC009S0913W0137 支持新产业新业态 -BAC009S0913W0138 集中释放用地政策红利 -BAC009S0913W0139 在加大新供用地保障力度方面 -BAC009S0913W0140 新产业发展快地用地集约求且需求大的地区 -BAC009S0913W0141 在鼓励盘活利用现有用地方面 -BAC009S0913W0142 
意见提出对制造业迈向中高端的企业用地 -BAC009S0913W0143 生产性科技及高技术服务业发展用地 -BAC009S0913W0144 建设创业创新平台用地 -BAC009S0913W0145 互联网行动计划实实施用地实行过渡期政策 -BAC009S0913W0146 按新用途新权利类型市场价办理用地手续 -BAC009S0913W0147 支持新产业新业态 -BAC009S0913W0148 由国土资源部联合国家 -BAC009S0913W0149 正式放松外贸外资投资我国房地产相关规定 -BAC009S0913W0150 我国对房地产的行政干预政策陆续退出 -BAC009S0913W0151 放松限外是必然趋势 -BAC009S0913W0152 此举将有利于市场信心的培养 -BAC009S0913W0153 并利好一二线城市的中高端住宅 -BAC009S0913W0154 对于外商投资房地产企业注册资本与投资总额比例 -BAC009S0913W0155 对于实施住房限购政策的城市 -BAC009S0913W0156 境外个人购房应当符合当地政策规定 -BAC009S0913W0157 上海易居研究所副院长杨红旭表示 -BAC009S0913W0158 外资管制放松是大势所趋 -BAC009S0913W0159 随着我国行政干预政策的陆续退出 -BAC009S0913W0160 此前为限制外资炒房 -BAC009S0913W0161 我国出台了一系列限外令 -BAC009S0913W0162 二的向境外投资方出售国内资产征税规定 -BAC009S0913W0163 国家外汇局出台过规定 -BAC009S0913W0164 国家发改委也发出过通知 -BAC009S0913W0165 对于提供给外籍人士的个人住房按揭贷款的外债需求 -BAC009S0913W0166 不予安排中长期外债额度 -BAC009S0913W0167 房地产被视作保值升值的投资标的被炒作 -BAC009S0913W0168 但目前的形势早已改变 -BAC009S0913W0169 此前外资购房主要集中在一线城市和几个热点二线城市 -BAC009S0913W0170 而现在这类城市房价已经很高 -BAC009S0913W0171 即使限外令放开也不会出现外资大规模买房 -BAC009S0913W0172 中原地产市场总监张大伟认为 -BAC009S0913W0173 放松对外限制利用利好一二线城市中高端物业 -BAC009S0913W0174 对于外商房企的注册资本金降低要求 -BAC009S0913W0175 也有利于部分企业的资金周转 -BAC009S0913W0176 正式放松外资投资我国房地产相关规定 -BAC009S0913W0177 允许境外机构在境内设 -BAC009S0913W0178 六部委松绑楼市限外令 -BAC009S0913W0179 外资在华房地产投资购房限制被松绑 -BAC009S0913W0180 允许机构和个人在中国购房 -BAC009S0913W0181 中房指数研究所院长陈晟表示 -BAC009S0913W0182 此举对促进外企在华投资房地产有积极作用 -BAC009S0913W0183 相关公司股票走势鄂尔多斯 -BAC009S0913W0184 内地产投资比例有限 -BAC009S0913W0185 此项政策对中国楼市影响有限 -BAC009S0913W0186 对于实施住房限购政策的城市 -BAC009S0913W0187 为环境保护提供更加完备有效的法制保障 -BAC009S0913W0188 进一步完善环境政策 -BAC009S0913W0189 健全环境执法调协调机制 -BAC009S0913W0190 国务院国资委力挺国企 -BAC009S0913W0191 具备条件的要积极引进战略投资者 -BAC009S0913W0192 推进主营业务整体上市 -BAC009S0913W0193 国资委接二连三对此表态或意味着国企将迎来上市高峰 -BAC009S0913W0194 使国有资本更多地向重要行业和关键领域集中 -BAC009S0913W0195 向具有优势的行业集中 -BAC009S0913W0196 向大企业大集团集中 -BAC009S0913W0197 要吸收民间资本参与国有企业改制重组 -BAC009S0913W0198 发展混合所有制经济 -BAC009S0913W0199 发挥国有大企业引领带动作用 -BAC009S0913W0200 促进各种所有制企业共同发展 -BAC009S0913W0201 至二零一一年六月底 -BAC009S0913W0202 中央企业控股境外上市公司 -BAC009S0913W0203 国资委还将支持企业走出去 -BAC009S0913W0204 逐步实现战略运营管管理全球化 -BAC009S0913W0205 应当经国务院国资委核准 -BAC009S0913W0206 办法五月一日起实施 -BAC009S0913W0207 国务院国资委对央企境外投资的管理法规在逐渐完善 -BAC009S0913W0208 中央企业在境外从事非主业投资 -BAC009S0913W0209 需要向国务院国资委报送申请核准非主业投资的请示 -BAC009S0913W0210 对非主业投资项目的有关决策文件 -BAC009S0913W0211 项目可行性研究报告尽职调查等相关文件 -BAC009S0913W0212 办法还特别提出一些建议 -BAC009S0913W0213 国务院国资委将指导中央企业之间加强境外投资合作 -BAC009S0913W0214 中央走出去的步伐正趋加快 -BAC009S0913W0215 央企在境外含港澳地区营收 -BAC009S0913W0216 利润总额较大 -BAC009S0913W0217 同比较上年同期分别增长百分之三十和百分之二十八 -BAC009S0913W0218 涨幅远超央企整体水平 -BAC009S0913W0219 一方面很多央企已制定了海外战略 -BAC009S0913W0220 一些国家经济出现大的波动 -BAC009S0913W0221 而社会罢工劳资纠纷也时有发生 -BAC009S0913W0222 央企在境外投资面临的问题很多 -BAC009S0913W0223 目前央企境外投资仍处在初级阶段 -BAC009S0913W0224 制定和发布办法 -BAC009S0913W0225 是为了进一步建立健全境外国有资产管理制度 -BAC009S0913W0226 切实加强央企境外投资监管 -BAC009S0913W0227 确保境外国有资产保值增值 -BAC009S0913W0228 更好地适应了新形势的需要 -BAC009S0913W0229 国务院多举措力挺农产品流通 -BAC009S0913W0230 免征蔬菜流通环节的增值税 -BAC009S0913W0231 提出完善农产品流通税收政策 -BAC009S0913W0232 免征蔬菜流通环节增值税加强金融支持 -BAC009S0913W0233 相关公司股票走势农产品 -BAC009S0913W0234 各地要鼓励流通企业跨地区兼并重组和投资合作 -BAC009S0913W0235 以加强产销衔接为重点 -BAC009S0913W0236 加强鲜活农产品流通基础设施建设 -BAC009S0913W0237 创新鲜活农产品的流通模式 -BAC009S0913W0238 提高流通组织化程度 -BAC009S0913W0239 完善流通链条和市场布局 -BAC009S0913W0240 进一步减少流通环节 -BAC009S0913W0241 保障鲜活农产品市场供应和价格稳定 -BAC009S0913W0242 各地要依据城市总体规划和城市网点商业规划 -BAC009S0913W0243 鼓励流通企业跨地区兼并重组和投资合作 -BAC009S0913W0244 要大力推进产销衔接 -BAC009S0913W0245 完善市场监测预警和信息发布机制 -BAC009S0913W0246 建立健全重要农产品储备制度 -BAC009S0913W0247 完善农产品跨区调运调剂机制 -BAC009S0913W0248 各城市要根据消费需求和季节变化 -BAC009S0913W0249 合理确定耐贮蔬菜的流通动态库存数量 
-BAC009S0913W0250 加快鲜活农产品质量安全追溯体系建设 -BAC009S0913W0251 通过投资入股产权回购回租建公建配套等方式 -BAC009S0913W0252 发挥财政资金引导示范作用 -BAC009S0913W0254 这样可以加深苹果和消费者之间的关系 -BAC009S0913W0255 对未来的销量至关重要 -BAC009S0913W0258 为提高苹果零售商店的服务质量 -BAC009S0913W0259 该系统包含一套算法 -BAC009S0913W0260 有媒体援引知情人士消息称 -BAC009S0913W0261 苹果将引入这样一套顾客接待系统 -BAC009S0913W0262 前往苹果零售店的顾客将比餐厅订餐叫号还方便 -BAC009S0913W0263 苹果零售店实行先到先服务的原则 -BAC009S0913W0264 这样难免会出现某个客户的维修问题特别复杂 -BAC009S0913W0265 导致技术支持时间超过了预期分配时间 -BAC009S0913W0266 从而影响接下来的客户无法在指定时间点获得服务 -BAC009S0913W0267 新系统可根据难易程度进行排序 -BAC009S0913W0268 与现在的接待原则不同的是 -BAC009S0913W0269 此时客户可以选择离开苹果零售店 -BAC009S0913W0270 而当预订时间接近时 -BAC009S0913W0271 客户会再次收到短信提醒 -BAC009S0913W0272 客户回到苹果零售店后 -BAC009S0913W0274 以告知客户相关技术人员确切的空闲时间 -BAC009S0913W0275 以及在店内的具体位置 -BAC009S0913W0277 为提高苹果零售商店的服务质量 -BAC009S0913W0278 苹果靠什么颠复移动支付市场 -BAC009S0913W0279 苹果一口气召开了两次新品发布会 -BAC009S0913W0280 就在会场的凳子和垃圾尚未收拾干净的时候 -BAC009S0913W0281 全世界的报道已经蜂拥而至 -BAC009S0913W0282 失望中夹杂着嘲讽的情绪霸占了各模块的头条 -BAC009S0913W0283 科技经济社会金融全都是苹果的消息 -BAC009S0913W0284 害得汪峰也不敢随便表白了 -BAC009S0913W0285 而是统一地认为苹果开了有史以来最烂的发布会 -BAC009S0913W0286 他们推出的产品不仅非常鸡肋 -BAC009S0913W0288 就足以让专家们恶心七七四十九天了 -BAC009S0913W0289 但这些口水式的讨伐并没有影响苹果前进的脚步 -BAC009S0913W0291 证明了其向主流妥协的姿态 -BAC009S0913W0293 自二零零七年乔布斯重新发明手机开始 -BAC009S0913W0294 把它升级成为一款综合性智能终端之后 -BAC009S0913W0295 就开始潜移默化地渗透人类的生活 -BAC009S0913W0296 这种渗透犹如蜘蛛结网细菌繁殖病毒传播 -BAC009S0913W0297 悄无声息又经年累月 -BAC009S0913W0298 我们甚至都没有来得及反抗就被完全征服了 -BAC009S0913W0299 我根本无法想象每天在朋友圈上花两个小时的情景 -BAC009S0913W0300 但现在已经成为了习生活习惯 -BAC009S0913W0301 但新贵移动支付具绝对能更深层次地改变用户的生活 -BAC009S0913W0302 乃至颠复现有的经济形态和支付格局 -BAC009S0913W0303 第一财经日报记者七月十三日从美的内部获悉 -BAC009S0913W0304 已获任美的部品事业部的总裁 -BAC009S0913W0305 而威灵电器七月九日下午也公告透露 -BAC009S0913W0306 于一九九一年加盟美的集团 -BAC009S0913W0307 美芝压缩机已是全球最大空调压缩机企业 -BAC009S0913W0308 占全球空调压缩机市场三分之一的份额 -BAC009S0913W0309 美的将美芝压缩机威灵电机合并 -BAC009S0913W0310 将有助于两大部品业务的研发资源销售渠道共享 -BAC009S0913W0311 美的部品事业部建立后 -BAC009S0913W0312 将成立压缩机开发研究院和微电机开发研究院 -BAC009S0913W0313 以区域为中心建立客户经理负责制制造方面 -BAC009S0913W0314 负责统一管理原电机事业部的各工厂制造系统 -BAC009S0913W0315 原压缩机事业部各工厂保持不变 -BAC009S0913W0316 美的集团公关部的相关人士告诉第一财经日报记者 -BAC009S0913W0317 目前美芝与威灵的合并 -BAC009S0913W0318 仅处于美的集团内部管理架构调整的阶段 -BAC009S0913W0319 还没体现在香港上市公司威灵电器的业务层面 -BAC009S0913W0320 由于向为民已获任威灵电机的董事会主席 -BAC009S0913W0321 威灵电机今后兼并美芝压缩机 -BAC009S0913W0322 美芝压缩机是隶属于美的集团旗下的业务 -BAC009S0913W0323 由于美的集团本身就是威灵电机的大股东 -BAC009S0913W0324 即使今后美芝压缩机被威灵电器兼并 -BAC009S0913W0325 也对美的集团的总体业绩影响不大 -BAC009S0913W0326 而威灵电器二零一四年的营业额约九十二点七三亿港元 -BAC009S0913W0327 同比增长百分之四净利润六点七八亿港元 -BAC009S0913W0328 同比下跌十三六点百分之六 -BAC009S0913W0329 如果威灵电器兼并美芝压缩机 -BAC009S0913W0330 将有利于增加威灵电器的收入和利润 -BAC009S0913W0331 除了威灵电器与美芝压缩机合并成为美的部品事业部之外 -BAC009S0913W0332 美的最近还把洗碗机事业部合并到美的的厨房电器事业部 -BAC009S0913W0333 美的的洗碗机业务以外销为主 -BAC009S0913W0334 业物内士向记者分析说 -BAC009S0913W0335 被合并到美的的厨房电器事业部后 -BAC009S0913W0336 将有助于美的洗碗机开拓国内市场 -BAC009S0913W0337 破坏和颠复是互联网时代的特征 -BAC009S0913W0338 美丽的丁香湖公园成为跑步爱好者的狂欢圣地 -BAC009S0913W0339 剪纸皮影戏等特色节目更是吸引了一批批观众围观 -BAC009S0913W0340 跑友们积极的参与剪纸活动 -BAC009S0913W0341 亲身感受沈阳当地浓郁的民俗文化内蕴 -BAC009S0913W0342 许多跑友争先恐后穿上沈阳花棉袄拍照 -BAC009S0913W0343 并与亲朋好友分享这份快乐 -BAC009S0913W0344 而涂鸦墙上写满了跑友们的目标和愿望 -BAC009S0913W0345 伴随着专业啦啦队的加油声 -BAC009S0913W0346 跑友们在奔跑中国沈阳站的赛道上尽情的展示自己 -BAC009S0913W0347 赛道两边设置了许多专业摄像头 -BAC009S0913W0348 主办方试图记录每一个跑友挥洒激情的每一个瞬间 -BAC009S0913W0349 将这份快乐与跑对跑步的执着传递给身边的好友 -BAC009S0913W0351 同时带动当地人民的奔跑热情 -BAC009S0913W0354 更加多维度的助推跑步事业在中国的发展 -BAC009S0913W0355 服务广大跑步爱好者 -BAC009S0913W0356 奔跑中国系列竞跑赛事将转战广州 -BAC009S0913W0357 中新网成都九月十五日电付敬懿十五日 -BAC009S0913W0358 服务时间约为五十三万小时 -BAC009S0913W0359 自二零一四年十二月五日正式启动志愿者招募工作以来 -BAC009S0913W0360 因为本次赛事比赛周期长赛区跨度大 -BAC009S0913W0361 经过网络测试综合面试专业技能体能测试等环节 
-BAC009S0913W0362 机关企事业单位等社会志愿者三千一百名 -BAC009S0913W0363 为做好志愿者服务工作 -BAC009S0913W0364 邀请专家学者等三十馀人组成志愿者培训导师库 -BAC009S0913W0365 指导各赛区开展志愿服务培训 -BAC009S0913W0366 组委会设计了具有四川特色的志愿者服装 -BAC009S0913W0367 志愿者的那一抹绿并大家亲切地称呼为小青椒 -BAC009S0913W0368 随着赛会推进被越来越多的人所熟知 -BAC009S0913W0369 成为本届残运会志愿服务文化的重要部分 -BAC009S0913W0370 电子科大的小青椒早上六点起床 -BAC009S0913W0371 每天忙碌十三个小时 -BAC009S0913W0372 用他们真挚的微笑和运动员建立起心与心的连接 -BAC009S0913W0373 四川大学的手语志愿者要学习四千个手语动作 -BAC009S0913W0374 而他们熟练掌握的秘笈是一次又一次反复的训练和排练 -BAC009S0913W0375 小青椒用热情和真诚打动了每位运动员 -BAC009S0913W0376 他们每天手牵手肩并肩出入赛场 -BAC009S0913W0377 就像认识多年的朋友和兄弟姐妹一样 -BAC009S0913W0378 湖北运动员的家长给小青椒写来致谢信 -BAC009S0913W0379 也温暖和感动着志愿者 -BAC009S0913W0380 北京时间十月十日 -BAC009S0913W0381 根据韩国乒乓球协会的相关规定 -BAC009S0913W0382 根据国际乒联刚刚公布的最新一期世界排名 -BAC009S0913W0383 而排名第三位的李尚洙 -BAC009S0913W0384 将只参加奥运会团体赛的比赛 -BAC009S0913W0385 此次韩国男团派出了一老带二新的阵容 -BAC009S0913W0386 此次里约奥运会也将会是其第三次征战奥运会比赛 -BAC009S0913W0387 作为经验最为丰富的老大哥 -BAC009S0913W0388 他将尽全力带领队伍取得好成绩 -BAC009S0913W0389 在韩国男队中排名第四 -BAC009S0913W0390 女排三零阿根廷朱婷复出扣杀状态神勇 -BAC009S0913W0391 二零一五年第十二届女排世界杯战至第八轮 -BAC009S0913W0392 中国女排直落三周以三零取胜阿根廷拿到第七胜 -BAC009S0913W0393 早前意外崴伤脚踝的朱婷强势复出 -BAC009S0913W0394 斩获十五分冠全场并且拦网独得四分 -BAC009S0913W0395 伤愈复出找手感一传防守遇考验 -BAC009S0913W0396 本报讯记者李晖经过两天转场 -BAC009S0913W0397 中国女排昨天下午在冈山迎战古巴队 -BAC009S0913W0398 三局比分是二五比一九二五比十和二五比一四 -BAC009S0913W0399 中国女排从第三轮开始便被挤出了三甲 -BAC009S0913W0400 而东道主日本队紧追在中国队之后 -BAC009S0913W0401 若想保住进入前两名的资格 -BAC009S0913W0402 中国队在第二阶段的第三场比赛不仅要保全取九个积分 -BAC009S0913W0403 而且还要尽量在小分上取得优势 -BAC009S0913W0404 福斯只允许先拍一部 -BAC009S0913W0405 另一部要视独立日二的票房而定 -BAC009S0913W0406 影片的上映日期 -BAC009S0913W0407 也从原计划的二零一六年七月一日 -BAC009S0913W0408 潘玮柏以侧颜出镜 -BAC009S0913W0409 与陈妍希分别看向对方 -BAC009S0913W0410 可见两人友谊非同一般 -BAC009S0913W0411 搜狐娱乐据讯据香港媒体报道 -BAC009S0913W0412 陈妍希今天五月二日下午出席公益活动 -BAC009S0913W0413 小洋装更衬托出她的纤细身材 -BAC009S0913W0414 不过她出道以来身材一直是外界关注的焦点 -BAC009S0913W0415 陈妍希一直努力让自己的脸圆圆脸变瘦 -BAC009S0913W0416 今天她出席活动 -BAC009S0913W0417 对着镜头嘟嘴吐舌 -BAC009S0913W0418 当被问到对于被选为棉花糖女孩比较肉感的女生 -BAC009S0913W0419 她笑说我觉得蛮好的啊 -BAC009S0913W0420 搜狐娱乐讯日前 -BAC009S0913W0421 名为娱乐圈八卦的自然自媒体 -BAC009S0913W0422 曝出陈妍希拍戏时突然干呕 -BAC009S0913W0423 并推断其已怀孕 -BAC009S0913W0424 陈妍希公司官方账号发表微博辟谣 -BAC009S0913W0425 否认了陈妍希疑似怀孕的传闻 -BAC009S0913W0426 称陈妍希目前还在剧组拍戏 -BAC009S0913W0427 并感谢了各界对于陈妍希公开与陈晓恋情的祝福 -BAC009S0913W0428 陈妍希在微博发布跳绳视频 -BAC009S0913W0429 并写道每天早上二十零下 -BAC009S0913W0430 中午二十零下 -BAC009S0913W0431 北京地接旅行社有限公司负责人称 -BAC009S0913W0432 向乘客收取的船费属于应收项目 -BAC009S0913W0433 旅行社为了盈利设置购物环节 -BAC009S0913W0434 北京旅游服务热线反馈称 -BAC009S0913W0435 还有待职能部门进一步调查 -BAC009S0913W0436 游客参与不合理低价游将被罚专家怎么判断 -BAC009S0913W0437 关于低价游旅行团因强制购物产生的纠纷事件频出 -BAC009S0913W0438 甚至还出现了一些造成游人身伤害的悲剧 -BAC009S0913W0439 旅游法早已明令禁止 -BAC009S0913W0440 游客参与不合理低价游也将受到受处难执行 -BAC009S0913W0441 京汇佳律师事务所律师邱宝昌表示 -BAC009S0913W0442 消费者根本很难判断什么叫做不合理低价 -BAC009S0913W0444 园中园收费超景区大门票 -BAC009S0913W0445 游客在乌鲁木齐市吃自助餐浪费食物被罚二四零零元 -BAC009S0913W0446 剩下了一二零零克食物 -BAC009S0913W0447 被餐厅罚款二四零零元 -BAC009S0913W0448 此事昨日经网络曝光后 -BAC009S0913W0449 食客该不该如此浪费 -BAC009S0913W0450 餐厅有没有权力罚款 -BAC009S0913W0451 成为了网民争相讨论的话题 -BAC009S0913W0452 游客在公园躲雨遭雷击已脑死亡至今无人负责 -BAC009S0913W0453 信息时报讯记者周伟龙八月一零日下午 -BAC009S0913W0454 六名游客在海珠湖公园凉亭内躲雨 -BAC009S0913W0455 昨日记者从医院了解到 -BAC009S0913W0456 目前黄某已被诊断为脑死亡 -BAC009S0913W0457 记者回访海珠湖公园发现 -BAC009S0913W0458 出事凉亭依然呈现事发时的状态 -BAC009S0913W0459 一旦雷雨天游客在亭内出事 -BAC009S0913W0460 该告示不能成为园方免责的理由 -BAC009S0913W0461 游客在北京动物园内小树间拉吊床摇荡 -BAC009S0913W0462 却要承载一个成年人的体重 -BAC009S0913W0463 一家三口在两棵树间拉起了一张吊床 -BAC009S0913W0464 父亲和孩子轮流上去躺 -BAC009S0913W0465 躺进吊床的父亲还荡起吊床 -BAC009S0913W0466 游客在新加坡买祖母绿回国发现非纯天然 -BAC009S0913W0467 夏先生带太太跟团去新马泰旅游 -BAC009S0913W0468 在新加坡花费三万元购买了纯天然的祖母绿吊坠 -BAC009S0913W0469 
回国后经鉴定发现不是天纯天然的 -BAC009S0913W0470 游客在日照旅游吃海鲜太少被围殴警方都有错 -BAC009S0913W0471 大众网河南游客爆料在日照旅游团因吃海鲜太少被围殴 -BAC009S0913W0472 警方回应都有过错经警方调查 -BAC009S0913W0473 双方均有不同程度受伤 -BAC009S0913W0474 河南籍游客张某某手部受伤及表皮损伤 -BAC009S0913W0475 店主陈某头皮裂创二处 -BAC009S0913W0476 游客在济南景区水池许愿观赏莲被砸成马蜂窝 -BAC009S0913W0477 游客扔硬币许愿观赏莲被砸成马蜂窝 -BAC009S0913W0478 游客在百年老店买到发霉盐水鸭商家主动退款 -BAC009S0913W0479 谢女士购买的盐水鸭外包装 -BAC009S0913W0480 华商报讯记者杨德合买了两个肉夹馍 -BAC009S0913W0481 结果被店员搓走了二零零零元 -BAC009S0913W0482 尽管在民警的协助下 -BAC009S0913W0483 但这也让首次来到陕西游玩的孙女士感到憋屈 -BAC009S0913W0484 游客大铜缸刻字警察喊话故宫刻字者请自首 -BAC009S0913W0485 北京警方已介入调查 -BAC009S0913W0486 目前正在进行一步工作中 -BAC009S0913W0487 游客成都遇连环车祸近千人隧道里死里逃亡 -BAC009S0913W0488 图片由胡先生提供本报讯记者喻莉出门旅游 -BAC009S0913W0489 近千人在隧道里上演生死时速 -BAC009S0913W0490 武汉网友胡琦的一条短信微博引起众人关注 -BAC009S0913W0491 记者联系上胡先生才知虚惊一场 -BAC009S0913W0492 现场有人喊有车要爆炸 -BAC009S0913W0493 事后才了解他们遭遇的只是普通连环车祸 -BAC009S0913W0494 游客打车被找四张同号假钞官方疑遇克隆车 -BAC009S0913W0495 其在出行成都打车时被司机找了四张同号的二元零元假币 -BAC009S0914W0121 境外个人购买应当符合当地政策规定 -BAC009S0914W0122 外资在华房地产投资限制松绑已成大势 -BAC009S0914W0123 在限限制外商投资产业目录中 -BAC009S0914W0124 已经删除了此前针对外商投资房地产的全部限制类条款 -BAC009S0914W0125 放开外资购买房产限制 -BAC009S0914W0126 外资只可以购买商铺写字楼等物业 -BAC009S0914W0127 普通住宅很可能仍将限购 -BAC009S0914W0128 而就昨日六部委松绑楼市限外令的情况来看 -BAC009S0914W0129 对于中国楼市的影响不必过于乐观 -BAC009S0914W0130 取消限外令将促进外企在华投资房地产 -BAC009S0914W0131 对于中国楼市有一定积极作用 -BAC009S0914W0132 特别是在海外热钱有外流预期的情况下 -BAC009S0914W0133 继续限制外资投资中国房地产已经不合时宜 -BAC009S0914W0134 外资占国内地产投资比例有限 -BAC009S0914W0135 此项政策对中国楼市影响有限 -BAC009S0914W0136 中国房地产学会副会长陈国强也认为 -BAC009S0914W0137 外资购房主要集中在一线城市和几个热点二线城市 -BAC009S0914W0138 而现在这类城市的房价已经很高 -BAC009S0914W0139 即使限外令放开也不会出现大规模外资买房 -BAC009S0914W0140 正处于筑底回暖阶段 -BAC009S0914W0141 主要还是依靠中国国内企业投资 -BAC009S0914W0142 虽然一线城市房价已出现反弹 -BAC009S0914W0143 但包括鄂尔多斯温州等地的去库存还是非常困难 -BAC009S0914W0144 七月份各线城市房价分化仍然明显 -BAC009S0914W0145 目前整体的宏观经济还是比较困难的 -BAC009S0914W0146 房地产的投资增速目前不到五百分之 -BAC009S0914W0148 因此开发还要继续坚定的开工和拿地的信心 -BAC009S0914W0149 这种分化情况会更剧烈 -BAC009S0914W0150 但是整体回暖和好转态势已经确定 -BAC009S0914W0151 与前年差不多这种状态 -BAC009S0914W0152 外资在华房地产投资购房限制被松绑 -BAC009S0914W0153 允许机构和个人在中国购 -BAC009S0914W0154 六部委调整房地产市场外资准入和管理政策 -BAC009S0914W0155 为促进房地产市场平稳健康发展 -BAC009S0914W0156 一外商投资房地产企业注册资本与投资总额比例 -BAC009S0914W0157 对于实施住房限购政策的城市 -BAC009S0914W0158 境外个人购房应当符合当地政策规定 -BAC009S0914W0159 优化和改进外商投资房地产管理 -BAC009S0914W0160 除上述政策调整以外 -BAC009S0914W0161 为促进房地产市场平稳健康发展 -BAC009S0914W0162 以及在中国境内工作学习的境外个人 -BAC009S0914W0163 可以购买符合实际需要的自用自住商品房 -BAC009S0914W0164 外商投资房地产企业注册资本与投资总额比例 -BAC009S0914W0165 将依照中外合资经营企业的相关暂行规定 -BAC009S0914W0166 中新网八月二十七日电据商务部官网公布的文件显示 -BAC009S0914W0167 取消外商投资房地产企业 -BAC009S0914W0168 六部门出台新政楼市限外政策放松 -BAC009S0914W0169 这来外资进入我国房地产市场最宽松的政策 -BAC009S0914W0170 这份只有五百多字的通知印发于八月十九日 -BAC009S0914W0171 规定外商投资建立房地产企业 -BAC009S0914W0172 投资总额超过一千万美元含一千万美元的 -BAC009S0914W0173 注册资本金不得低于投资总额的百分之五十 -BAC009S0914W0174 外商投资房地产企业注册资本金未全部缴付的 -BAC009S0914W0175 未取得国有土地使用证的 -BAC009S0914W0176 或开发项目资本金未达到项目投资总额百分之五 -BAC009S0914W0177 不得办理境内境外贷款 -BAC009S0914W0178 外汇管理部门不予批准该企业的外汇借款结汇 -BAC009S0914W0179 不得购买非自用非自住商品房 -BAC009S0914W0180 港澳台地区居民和华侨因生活需要 -BAC009S0914W0181 可在境内限购一定面积的自住商品房 -BAC009S0914W0182 二到二 -BAC009S0914W0183 我国楼市正处在急速上升通道 -BAC009S0914W0184 大量外资希望进入我国市场 -BAC009S0914W0185 面对楼市中急剧增长的投资热情 -BAC009S0914W0186 对购买住房的数量也未做要求 -BAC009S0914W0187 带动和规范民间资本进入农产品流通领域 -BAC009S0914W0188 完善农产品流通税收政策 -BAC009S0914W0189 免征蔬菜流通环节增值税 -BAC009S0914W0190 加大涉农贷款投放力度 -BAC009S0914W0191 可按作价出资入股方式办理理用地手续 -BAC009S0914W0192 但禁止改变用途和性质 -BAC009S0914W0193 严厉打击农产品投机炒作 -BAC009S0914W0194 做好外资并购大型农产品批发市场的安全审查 -BAC009S0914W0195 严格执行鲜活农产品运输绿色通道政策 -BAC009S0914W0196 加快农产品流通标准体系建设 -BAC009S0914W0197 各地各部门加强组织领导 -BAC009S0914W0198 农产品产销对接的经验介绍 
-BAC009S0914W0199 农产品产销合作社简介 -BAC009S0914W0200 海南农产品流通现状 -BAC009S0914W0201 农产品流通加工标准化 -BAC009S0914W0202 中国对农产品流通政策 -BAC009S0914W0203 温家宝主持召开国务院常务会议 -BAC009S0914W0204 研究部署在城市优先发展公共交通 -BAC009S0914W0205 审议通过缺陷汽车产品召回管理条例草案 -BAC009S0914W0206 国务院总理温家宝主持召开国务院常务会议 -BAC009S0914W0207 研究部署在城市优先发展公共交通 -BAC009S0914W0208 审议通过缺陷汽车产品召回管理条例草案 -BAC009S0914W0209 为加快发展中等职业教育 -BAC009S0914W0210 自秋季学期起 -BAC009S0914W0211 多数城市公共交通出行比例偏低 -BAC009S0914W0212 为从根本上缓解交通拥堵出行不便环境污染等矛盾 -BAC009S0914W0213 必须树立公共交通优先发展理念 -BAC009S0914W0214 将公共交通放在城市交通发展的首要位置 -BAC009S0914W0215 加快构建以公共交通为主 -BAC009S0914W0216 同时改善步行自行车出行条件 -BAC009S0914W0217 城市综合交通体系规划应明确公共交通优先发展原则 -BAC009S0914W0218 城市公共交通规划要科学布局线线网 -BAC009S0914W0219 促进城市内外交通便利衔接和城乡公共交通一体化发展 -BAC009S0914W0220 加快基础设施建设 -BAC009S0914W0221 提升公共交通设施装备水平 -BAC009S0914W0222 提高公共交通舒适性 -BAC009S0914W0223 将其纳入旧城改造和新城建设规划 -BAC009S0914W0224 加强公共交通用地综合开开发 -BAC009S0914W0225 对新建公共交通设施用地的地上地下空间 -BAC009S0914W0226 按照市场化原则实施土地综合开发 -BAC009S0914W0227 收益用于公共交通基础设施建设和弥补运营亏损 -BAC009S0914W0228 加大政府投入 -BAC009S0914W0229 城市政府要将公共交通发展资金纳入公共财政体系 -BAC009S0914W0230 对城市公共交通企业实行税收优惠政策 -BAC009S0914W0231 落实对城市公共交通行业的成品油价格补贴政策 -BAC009S0914W0232 对城市轨道交通运营企业实行电价优惠 -BAC009S0914W0233 拓宽投资渠道 -BAC009S0914W0234 吸引和鼓励社会资金参与公共交通基础设施建设和运营 -BAC009S0914W0235 保障公交路权优先 -BAC009S0914W0236 增加划设城市公共交通优先车道 -BAC009S0914W0237 允许机场巴士校车班车使用公共交通优先车道 -BAC009S0914W0238 加强公共交通优先车道的监控和管理 -BAC009S0914W0239 健全安全管理制度 -BAC009S0914W0240 规范技术和产品标准 -BAC009S0914W0241 构建服务质量评价指标体系 -BAC009S0914W0242 规范公共交通重大决策程序 -BAC009S0914W0243 实行线网规划编制公示制度和运营价格听证制度 -BAC009S0914W0244 建立城市公共交通运营成本和服务质量信息公开制度 -BAC009S0914W0245 应当立即停止生产销售进口 -BAC009S0914W0246 由其生产者实施召回 -BAC009S0914W0247 并及时发布产品缺陷及信息 -BAC009S0914W0248 对实施召回的缺陷汽车产品 -BAC009S0914W0249 生产者应当及时采取措施消除缺陷 -BAC009S0914W0250 会议还研究了其他事项 -BAC009S0914W0251 国务院将对各类交易场所清理整顿 -BAC009S0914W0252 国务院近期将开展对各类交易场所的清理整顿工作 -BAC009S0914W0254 而且这也可以看作是苹果利用硬件优势 -BAC009S0914W0255 衍生出软件服务的又一重要举措 -BAC009S0914W0256 又如何和政府银行搞好关系 -BAC009S0914W0257 证明他们真得没有手机用户信息 -BAC009S0914W0258 苹果靠什么颠复移动支付 -BAC009S0914W0259 苹果推出的每一款新产品都不免要引发大讨论 -BAC009S0914W0260 才能显得像个知识分子 -BAC009S0914W0261 不仅树立了良好的品牌形象 -BAC009S0914W0262 也向全世界推广了一种趋之若鹜的文化 -BAC009S0914W0263 他们真得赚了很多钱 -BAC009S0914W0264 这些特质让库克基本上实现了财务自由 -BAC009S0914W0265 这对于一家巨型企业是非常难能可贵的 -BAC009S0914W0266 而土豪和穷鬼做生意的最大区别就是 -BAC009S0914W0267 而是会更加关注产品本身 -BAC009S0914W0268 以及是否能提升他们的历史地位 -BAC009S0914W0270 他们没有必要着急回本 -BAC009S0914W0271 更大的野心在于深刻变革人类的支付习惯 -BAC009S0914W0272 这种状态是苹果颠复现有市场格局的根基 -BAC009S0914W0273 除却土豪式的生意属性之外 -BAC009S0914W0276 系统会使用不同编码来转移用户凭据和支付数据 -BAC009S0914W0277 整个过程基于安全元素芯片 -BAC009S0914W0278 这种芯片不会直接发送用户敏感信息 -BAC009S0914W0279 而是将其转化成唯一的临时编码 -BAC009S0914W0280 可有效降低信息泄漏的风险其次 -BAC009S0914W0284 苹果积累了海量的绑定信息卡用户 -BAC009S0914W0285 这些资源的特点不单单是数目庞大 -BAC009S0914W0286 而且苹果最早一批的用户积累 -BAC009S0914W0287 囊括了大量的优质资源 -BAC009S0914W0288 甚至包括了一些明星意见领袖和政府官员 -BAC009S0914W0290 更是一种文化和习惯的推广者 -BAC009S0914W0291 柯振东入狱期间的囚服都能在淘宝上热卖 -BAC009S0914W0292 要是詹妮弗劳伦斯也能在微博上说这个应用不错 -BAC009S0914W0293 一定会有立竿见影的推广效果 -BAC009S0914W0294 也在所不惜的最后 -BAC009S0914W0296 早在九月九日发布会上 -BAC009S0914W0297 苹果就公布了合作伙伴 -BAC009S0914W0298 包括迪斯尼耐克麦当劳梅西百货公司等巨头企业 -BAC009S0914W0300 从这些零售商的等级来看 -BAC009S0914W0301 库克团队应当是花费了大量精力 -BAC009S0914W0302 移动支付肯定会有井喷式的发展 -BAC009S0914W0303 现阶段管理创新和组织再造比任何的创新都重要 -BAC009S0914W0304 美的美的在二零一四年三月正式发布智慧家庭战略 -BAC009S0914W0305 未来将搭建空气水营养等智能管家平台 -BAC009S0914W0306 事业部制一直是美的快速成长的法宝 -BAC009S0914W0307 一定程度上影响了资源整合的效率 -BAC009S0914W0308 美的已将风扇加湿器等空气类产品 -BAC009S0914W0309 归到家用空调事业部旗下 -BAC009S0914W0310 围绕几大智能管家平台 -BAC009S0914W0311 美的整合事业部精简组织架构 -BAC009S0914W0312 也是顺应互联网时代管理扁平化的趋势 -BAC009S0914W0314 下称美的内部的组织架构二零一五年加大了调整力度 -BAC009S0914W0316 
每日经济新闻记者从美的家用空调事业部了解到 -BAC009S0914W0317 自二零一一年事业部启动自动化升级至今的四年里 -BAC009S0914W0318 工人数量减少近一半 -BAC009S0914W0319 美的家用空调事业部制造副总裁乌守保对记者表示 -BAC009S0914W0320 到二零一八年美的空调营收到达一千亿元规划时 -BAC009S0914W0321 员工数量将减至两万人 -BAC009S0914W0322 虽然投入产生问题以及机器人后期运行维护等 -BAC009S0914W0323 都是家电企业自动化升级需要面临的挑战 -BAC009S0914W0324 自动化是未来唯一出路 -BAC009S0914W0325 四年来机器人代替人工近半 -BAC009S0914W0326 美的家用空调事业提出精品战略 -BAC009S0914W0327 机器人应用也进一步提速 -BAC009S0914W0328 二零一一年美的空调达到五百亿元营收规模时 -BAC009S0914W0329 工人数量超过五万以上 -BAC009S0914W0330 空调业务总营收接近七百亿元 -BAC009S0914W0331 工人数量已经缩减至二点六万人 -BAC009S0914W0332 除了在顺德工厂建成全自动遥控器生产线外 -BAC009S0914W0333 美的空调还在其他地区工厂建有三条全自动生产线 -BAC009S0914W0334 经过前几年自动化生产线升级改造 -BAC009S0914W0335 美的空调工厂的注塑车间 -BAC009S0914W0336 在无开灯照明的情况下也能正常稳定运行 -BAC009S0914W0337 钣金冲压已实现无人运行 -BAC009S0914W0338 而在昨天对阵古巴队的比赛中 -BAC009S0914W0339 中国队教练组还是做出了让朱婷继续休战的抉择 -BAC009S0914W0340 来自北汽女排的主攻手刘晓彤取代朱婷的位置首发出场 -BAC009S0914W0341 除了第一局在开局阶段古巴队一度领先外 -BAC009S0914W0342 比赛的节奏始终被中国队控制在手中 -BAC009S0914W0343 中国队直落三局零封对手 -BAC009S0914W0344 曾春蕾和张常宁均拿到十六分 -BAC009S0914W0345 俄罗斯美国和日本三队均零封对手 -BAC009S0914W0346 此轮战罢后积分榜前四名排位没有任何变化 -BAC009S0914W0347 俄罗斯队十七分居榜首 -BAC009S0914W0348 美国队十六分排第二 -BAC009S0914W0349 日本和中国同积十五分 -BAC009S0914W0350 日本以小分优势暂列第三位 -BAC009S0914W0351 中国队将迎战冈山赛区的第二个对手肯尼亚队 -BAC009S0914W0352 中国女排昨天下午在松本迎战韩国队 -BAC009S0914W0353 主攻手朱婷不慎扭伤脚踝后依然带伤奋战 -BAC009S0914W0354 最终中国队以三比一力战韩国队全取三分 -BAC009S0914W0355 中韩之战中国队首发再次变阵 -BAC009S0914W0356 二传丁霞和主攻刘晏含取代了沈静思和张常宁的位置 -BAC009S0914W0357 张常宁则取代曾春蕾站在接应的位置上 -BAC009S0914W0358 中国队迅速调整阵容 -BAC009S0914W0359 逐渐控制住了局面并连扳两局以二比一优先 -BAC009S0914W0360 关键的第四局一开始中国队便发生了意外 -BAC009S0914W0361 一脸痛苦的朱婷当即被换下场 -BAC009S0914W0362 失去了最稳定的得分手之后 -BAC009S0914W0363 中国队进攻火力明显减弱 -BAC009S0914W0364 而看到了希望的韩国队也趁机拼命反击 -BAC009S0914W0365 当打到一三比一四中国队落后一分时 -BAC009S0914W0366 在场下接受完队医高压包扎后的朱婷请命上场 -BAC009S0914W0367 虽然扣球落地后朱婷依然一瘸一拐 -BAC009S0914W0368 见此情景韩国队的信心受到了打击 -BAC009S0914W0369 尽管也一度以二一比一七领先四分之多 -BAC009S0914W0370 但朱婷与队友们合力打出了一波八比二的高潮 -BAC009S0914W0371 最终中国队以二五比二三拿下第四局 -BAC009S0914W0372 以三比一胜出拿到了宝贵的三个积分 -BAC009S0914W0373 俄俄罗斯队以全胜战绩列积分榜首位 -BAC009S0914W0374 日本与美国同积十十分暂列二三两位 -BAC009S0914W0375 中国和多米尼加同积九分排在第四和第五位 -BAC009S0914W0376 今天中国队将迎战第一阶段的最后一个对手秘鲁队 -BAC009S0914W0377 中国女排三十一日本四夺世界杯冠军直通里约奥运 -BAC009S0914W0378 女排三十一日本进军里约众将欢度欢庆 -BAC009S0914W0379 夺冠的同时摘得本次世界杯的冠军 -BAC009S0914W0380 同时拿到了明年里约奥运会的入场券 -BAC009S0914W0381 是全场得分最高的运动员 -BAC009S0914W0382 也让这位一九九四年出生的河南妹子 -BAC009S0914W0383 逐步成长为中国女排的新核心 -BAC009S0914W0385 在今年的亚锦赛夺冠后 -BAC009S0914W0386 关于中国队过于依赖朱婷的言论不少 -BAC009S0914W0387 本赛季调进张常宁就是郎平为朱婷解压的一个表现 -BAC009S0914W0388 加上惠若琪因伤缺席本届世界杯 -BAC009S0914W0389 张常宁的幼稚嫩显然还不能立即挑起大梁 -BAC009S0914W0390 这支女排的暴露性强攻基本上都是靠朱婷打 -BAC009S0914W0391 郎平也认为这样去打世界高水平的球队是不够的 -BAC009S0914W0392 在目前中国队的阵容中 -BAC009S0914W0393 霸气外露的朱婷是不可或缺的绝对核心 -BAC009S0914W0394 在队长惠若琪缺阵的情况下 -BAC009S0914W0395 她几乎担当起了场上进攻加振奋士气的主力作用 -BAC009S0914W0396 半决赛对阵俄罗斯的比赛中 -BAC009S0914W0397 朱婷全场夺得二十九分 -BAC009S0914W0398 在俄罗斯队的严密拦防下 -BAC009S0914W0399 进攻成功率达到百分之五十六点七六拦网 -BAC009S0914W0400 朱婷得到七分同样全队最高 -BAC009S0914W0401 作为一个主攻手非常不易 -BAC009S0914W0402 与几乎不接一传的科舍列娃相比 -BAC009S0914W0403 朱婷的任务更重效率更高 -BAC009S0914W0404 提前一周至二零一六年六月二十四日 -BAC009S0914W0405 避免和新木乃伊正面较量 -BAC009S0914W0406 来源时光网美国时间本周一 -BAC009S0914W0407 二十世纪福斯影业公布一批新片的档期 -BAC009S0914W0408 晚上二十零下 -BAC009S0914W0409 马甲线啊马甲线 -BAC009S0914W0410 力证自己没有怀孕 -BAC009S0914W0411 网友纷纷调侃道为了辟谣怀孕也是蛮拼的 -BAC009S0914W0412 哈哈哈第一次见人用这种方式证明自己没怀孕 -BAC009S0914W0413 搜狐娱乐讯九月六日 -BAC009S0914W0414 陈妍希晒出一组攀岩照 -BAC009S0914W0415 并称攀岩太难会晃 -BAC009S0914W0416 不抓紧会被撞到地上 -BAC009S0914W0417 希饭快来接住我 -BAC009S0914W0418 陈妍希穿着粉色上衣 -BAC009S0914W0419 头发随意披在脑后 -BAC009S0914W0420 手脚并用努力向往上爬 -BAC009S0914W0421 似乎已过了第三关 -BAC009S0914W0422 如此高难度的动作 
-BAC009S0914W0423 再次身体力行地辟谣怀孕传闻 -BAC009S0914W0424 搜狐娱乐讯近日频频传出陈晓向陈妍希求婚成功的消息 -BAC009S0914W0425 陈妍希回应现在真的很享受快乐恋爱的喜悦 -BAC009S0914W0426 有进一步消息一定会通知大家 -BAC009S0914W0427 中新网七月二十二日电据台湾东森新闻消息 -BAC009S0914W0428 陈妍希曾在新版神鵰侠侣中演小龙女 -BAC009S0914W0429 被网友调侃是小笼包 -BAC009S0914W0430 尽管她努力瘦身 -BAC009S0914W0431 当事网友疑遭遇克隆车 -BAC009S0914W0432 经调查核实相关情况 -BAC009S0914W0433 游客抢订冬奥运旅游团因遭遇订票难住房等 -BAC009S0914W0434 北京冬奥会刚刚申办成功 -BAC009S0914W0435 已经有游客迫不及待想去张家口看看了 -BAC009S0914W0436 游客摔断腿旅游社赔三成因旅游时未尽提示义务 -BAC009S0914W0437 游客日照海鲜店被打受伤警方称言语冲突引发互殴 -BAC009S0914W0438 京华时报讯记者卫张宁昨天上午 -BAC009S0914W0439 自己和家人因点的海鲜较少 -BAC009S0914W0440 并被店主及店员辱骂围殴 -BAC009S0914W0441 当时游客出言不逊在先 -BAC009S0914W0442 并未将游客脱光衣服殴打 -BAC009S0914W0443 日照市公安局官方发布消息 -BAC009S0914W0444 称事件系点餐过程中 -BAC009S0914W0445 双方发生语言冲突后进行互殴 -BAC009S0914W0446 已依法对双方进行处罚 -BAC009S0914W0447 游客晋吉岛乘船颠骨折诉旅社索赔二零馀万元 -BAC009S0914W0448 本来一家人出国旅游挺高兴的 -BAC009S0914W0449 可是我遇见这事还不够添堵的呢 -BAC009S0914W0450 崔先生带家人随团前往泰国晋吉岛游玩 -BAC009S0914W0451 导致崔先生腰部受伤 -BAC009S0914W0452 回国后被确诊为腰部骨折 -BAC009S0914W0453 将接团的两家旅行社起诉至法院 -BAC009S0914W0454 索赔各项损失共计二零馀万元 -BAC009S0914W0455 昌平法院开庭审理了这起案件 -BAC009S0914W0456 游客景区被忽悠八零零克石斛收费一二六零零元 -BAC009S0914W0457 滕女士在云南购买的石斛 -BAC009S0914W0458 游客武夷山就餐麝香肉结账要四八元一两 -BAC009S0914W0459 旅游点餐时与海鲜店主起争执互殴二人被行政拘留 -BAC009S0914W0460 新京报讯记者林斐然近日 -BAC009S0914W0461 有网友反映前往山东日照一海排档点海鲜时 -BAC009S0914W0462 该事件系游客点餐时嫌大排档太脏而引起口角纷 -BAC009S0914W0463 日照市公安局官方微博通报了这一事件的调查情况 -BAC009S0914W0464 双方因互殴均被行政拘留并处罚款 -BAC009S0914W0465 游客爬到峨眉山悬崖边石头上拍照 -BAC009S0914W0466 游客称点海鲜太少被当地媒体老板受伤更重 -BAC009S0914W0467 事情的真相完全不是这样的 -BAC009S0914W0468 大排档老板受伤更严重 -BAC009S0914W0469 起因也完全不是河南游客自己说的那样 -BAC009S0914W0470 希望警方尽快给出公平调查结果 -BAC009S0914W0471 游客称在山东日照只因点海鲜少全家遭殴打恐吓 -BAC009S0914W0472 并最新发微博表示当地警方已介入调查 -BAC009S0914W0473 游客突破八万人限流大关故宫首次提前禁止售票 -BAC009S0914W0474 新京报讯记者黄颖自七月六日进入暑期以来 -BAC009S0914W0475 故宫博物院接待的观众量也日益攀升 -BAC009S0914W0476 屡屡逼近八万人次的限流大关 -BAC009S0914W0477 故宫首次启动了起流起票限流措施 -BAC009S0914W0478 在馀票数量为售后现场关闭售票窗口 -BAC009S0914W0479 游客美签被废因访美停留太久称从没到过欧洲 -BAC009S0914W0480 而被美国海关移民官遣返 -BAC009S0914W0481 游客脚踩烈士铜像拍照四名当事人鞠躬道歉 -BAC009S0914W0482 四人鞠躬道歉据瓜沥人网 -BAC009S0914W0483 游客被黑导游拉进农家宴消费蘑菇炖鸡卖九零零元 -BAC009S0914W0484 其中一道蘑菇炖鸡收费近九零零元 -BAC009S0914W0485 看到该网友的曝光帖后 -BAC009S0914W0486 崂山景区勒令该农家宴停止停止营业 -BAC009S0914W0487 并索偿该游客全部损失 -BAC009S0914W0488 游客西安遭天价玛卡商家四零零零元一价合理 -BAC009S0914W0489 张先生购买的四零零元玛卡 -BAC009S0914W0490 内江人张先生在这次国庆期间 -BAC009S0914W0491 被导游介绍到一家购物点后 -BAC009S0914W0492 他被迫交了四零零元 -BAC009S0914W0493 这一斤玛卡其实价格只有一零零多元 -BAC009S0914W0494 一捧玛卡磨成粉景区商家要四零零零元 -BAC009S0914W0495 游客要退团张家界低价团导游称信不信你走走不了 -BAC009S0915W0121 从房地产的角度来看 -BAC009S0915W0122 这个政策的出台是希望刺激房地产投资 -BAC009S0915W0123 则是希望防止外资流出 -BAC009S0915W0124 国家统计局公布的数据显示 -BAC009S0915W0125 今年一到七月全国房地产开发投资五万亿元 -BAC009S0915W0126 增速比一到六月回落一个百分点 -BAC009S0915W0127 开发商投资增速处于不断下降的状态 -BAC009S0915W0128 市场开发也呈降温态势 -BAC009S0915W0129 此次出台的新政虽然放宽了条件 -BAC009S0915W0130 但对于实施住房限购政策的城市 -BAC009S0915W0131 境外个人购房依然需要符合当地政策规定 -BAC009S0915W0132 境外机构和个人在中国投资购买房地产的限制放松 -BAC009S0915W0133 兰州房地产市场回暖销量增加价格微涨 -BAC009S0915W0134 自二夏季开始 -BAC009S0915W0135 得益于一系列稳定房地产市场的措施 -BAC009S0915W0136 兰州房地产市场销量增加明显 -BAC009S0915W0137 一些楼盘新房价格出现微涨 -BAC009S0915W0138 较上月环比上涨百分之五 -BAC009S0915W0139 这也是该指数连续三个月出现上涨 -BAC009S0915W0140 而在多时间里 -BAC009S0915W0141 兰州新建住宅价格均呈现微降的态势 -BAC009S0915W0142 兰州楼市出现明显的区域分化 -BAC009S0915W0143 兰州市中心城区的一些楼盘 -BAC009S0915W0144 自今年初至今上涨幅度超过了十百分之 -BAC009S0915W0145 可由于中心城区楼盘数量稀少 -BAC009S0915W0146 在兰州雁滩区域的一家楼盘 -BAC009S0915W0147 而在兰州市新开楼盘集中的城郊区域 -BAC009S0915W0148 但房企调价幅度有限 -BAC009S0915W0149 由于商品房供应量充足 -BAC009S0915W0150 多个楼盘仍然采取的是低价走量的策略 -BAC009S0915W0151 在兰州市北岸由广东房企开发的一个大型楼盘里 -BAC009S0915W0152 但房价从七月至今上涨幅度仅为百分之二左右 -BAC009S0915W0153 
今兰州市商品房销售面积同比上涨超过百分之三十 -BAC009S0915W0154 商品房销售额同比上涨超过了百分之四十 -BAC009S0915W0155 许多刚性住房和改善型住房需求得到释放 -BAC009S0915W0156 兰州房地产市场存在持续上涨可能 -BAC009S0915W0157 但由于房地产市场供给仍然不仍然充足 -BAC009S0915W0158 自二夏季开始 -BAC009S0915W0159 得益于一系列稳定房地产市场的措施 -BAC009S0915W0160 兰州房地产市场销量增加明显 -BAC009S0915W0161 而且提供各项衍生的福利性服务 -BAC009S0915W0162 中新网十月二十一日前 -BAC009S0915W0163 北京又一家共享创办公平台落地丰台 -BAC009S0915W0164 借全国大众创业万众创新活动周启动之势 -BAC009S0915W0166 将生活社区与科技园区两种空间组织融合 -BAC009S0915W0167 作为美国新型共享式办公与创新环境的运营品牌 -BAC009S0915W0168 是国际上合作性办公品牌的代表 -BAC009S0915W0169 由此拉开了跨境共享创新生态平台化发展的新时代 -BAC009S0915W0170 而且提供各项行生的福利性服务 -BAC009S0915W0171 帮助创新创业者聚合各方面资源 -BAC009S0915W0172 旨在帮助小型企业降低运运营成本 -BAC009S0915W0174 从创业者真正的需求出发 -BAC009S0915W0175 石榴中心位于丰台区宋家庄交通枢纽商圈 -BAC009S0915W0176 可以北京四环内唯一的国际化共享办公园区 -BAC009S0915W0177 园区总建筑面积一万平方米 -BAC009S0915W0178 其中地上一万平方米 -BAC009S0915W0179 地下一万平方米 -BAC009S0915W0180 由二十二栋企业独栋和二栋二十层的五a级写字楼组成 -BAC009S0915W0181 而且提供各项行生的福利性服务 -BAC009S0915W0182 中新网十月二十日前 -BAC009S0915W0184 关于智能家居你必须懂的五件事 -BAC009S0915W0185 智能家居概念的炒作 -BAC009S0915W0186 这是自媒体时代的胜利 -BAC009S0915W0187 将明确政策界限和工作机制以知以及部门分工 -BAC009S0915W0188 证监会将协同有关部门落实相关工作 -BAC009S0915W0189 公共娱乐场所清理整顿 -BAC009S0915W0190 燃气经经营市场清理整顿 -BAC009S0915W0191 行业协会清理整顿报告 -BAC009S0915W0192 国务院已批准信贷资产证券化继续扩大试点 -BAC009S0915W0193 多方面原因造成今年上半年部分中小企业生产经营困难 -BAC009S0915W0194 但没有出现大范围趋势性的破产倒闭 -BAC009S0915W0195 部分中小企业国内生产成本有所提高 -BAC009S0915W0196 这主要有四方面原因 -BAC009S0915W0197 中小企业经营困难 -BAC009S0915W0198 既是信贷投放回归常态的体现 -BAC009S0915W0199 也是国家淘汰落后产能加快产业升级宏观政策的体现 -BAC009S0915W0200 对于中小企业的支持政策 -BAC009S0915W0201 国务院已经批准信贷资产证券化继续扩大试点 -BAC009S0915W0202 转化成由资产产生的现金流作担保可自由流通的证券 -BAC009S0915W0203 销售给资本市场投资者的一种融资方式 -BAC009S0915W0204 目前我国正在稳步开展中小企业信贷资产证券化试点 -BAC009S0915W0205 为加快发展银行间债券市场 -BAC009S0915W0206 对中小企业发行债务融资工具提供绿色通道 -BAC009S0915W0207 占非金融企业直接债务融资总额之比 -BAC009S0915W0208 有力地支持了中小企业的发展 -BAC009S0915W0209 积极指导支持和鼓励金融机构根据中小企业的特点 -BAC009S0915W0210 研发推出不同的金融创新产品和服务方式 -BAC009S0915W0211 吴显亭称将加强和证监会等相关部门的配合和协作 -BAC009S0915W0212 而针对浙江广东民间借贷丰沛的特点 -BAC009S0915W0213 一定程度上缓解了部分中小企业的融资困难 -BAC009S0915W0214 将在有效防范民间借贷的潜在风险的前提下 -BAC009S0915W0215 发挥好民间借贷在服务中小企业发展中的积极作用 -BAC009S0915W0216 要加强对民间借贷的合理引导 -BAC009S0915W0217 解决中小企业生产经营困难需靠多方面共同努力 -BAC009S0915W0218 听取对中央企业监督检查情况的汇报 -BAC009S0915W0219 中央企业要进一步深化改革 -BAC009S0915W0220 强化企业管理和风险管控 -BAC009S0915W0221 加强依法监管和制度建设 -BAC009S0915W0222 部分中央企业的结构调整还存在一些困难 -BAC009S0915W0223 资源环境面临较大压力有的企业管理水平不高 -BAC009S0915W0224 非主业投资存在不少经营风险 -BAC009S0915W0225 境外资产监管有待加强 -BAC009S0915W0226 中央企业实现营业总收入十六点八亿元 -BAC009S0915W0227 上交税金一万亿元 -BAC009S0915W0228 增长百分之三十净利润一千亿元 -BAC009S0915W0229 二零一一年一月至七月 -BAC009S0915W0230 实现营业总收入十一亿元 -BAC009S0915W0231 同比增加迅速上缴税金三亿元 -BAC009S0915W0232 增长非常迅速 -BAC009S0915W0233 进入世界五百强的企业增加 -BAC009S0915W0234 包括七座以下小客车及摩托车都被列入免费范范围 -BAC009S0915W0235 江苏省交通厅相关负责人昨日对记者表示 -BAC009S0915W0236 今年国庆小长假期间私家车主们就可以免费上路了 -BAC009S0915W0237 免费时段从节假日第一天开始 -BAC009S0915W0238 节假日最后一天结束 -BAC009S0915W0239 普通公路以车辆通过收费站收费车道的时间为准 -BAC009S0915W0240 高速公路以车辆驶离出口收费车车道的时间为准 -BAC009S0915W0241 允许在普通收费公路行驶的摩托车 -BAC009S0915W0242 各地机场高速公路是否实行免费通行 -BAC009S0915W0243 由各省区市人民政府决定 -BAC009S0915W0244 各地机场高速公路是否实行免费通行 -BAC009S0915W0245 由各省区市人民政府决定 -BAC009S0915W0246 比如南京机场高速一到节假日 -BAC009S0915W0247 是南京往南的重要通道 -BAC009S0915W0248 对于江苏的机场高速是否免费 -BAC009S0915W0249 省交通部门称目前未定 -BAC009S0915W0250 但有关负责人认为我想 -BAC009S0915W0251 机场高速最大可能还是免费 -BAC009S0915W0252 另一个让南京市民特别关心的是 -BAC009S0915W0253 或许到二零一六年的时候 -BAC009S0915W0254 零售店就再也卖不出去一个实体钱包了 -BAC009S0915W0255 催生着移动支付技术的大跃进 -BAC009S0915W0256 最关键的两个属性莫过于安全和便捷 -BAC009S0915W0257 而且它们两个之间是非常对立的关系 -BAC009S0915W0258 安全性的提升需要牺牲一定的便携性 -BAC009S0915W0259 究竟哪个特特性更加重要 -BAC009S0915W0260 这也影响着移动支付市场的总体进程和发展方向 
-BAC009S0915W0261 或许是受好莱坞艳照门的影响 -BAC009S0915W0263 重点强调了其安全性 -BAC009S0915W0264 最基本的逻辑就是我们不读取信息 -BAC009S0915W0265 牛师傅总说自己的面没有添加任何防腐剂 -BAC009S0915W0266 任何的电子行为都不免会留下痕迹 -BAC009S0915W0267 移动支付又会产生非常敏感的操作信息 -BAC009S0915W0268 蕴含着巨大商业价值 -BAC009S0915W0269 有哪家支付机构愿意心无旁续地放弃这些金子呢 -BAC009S0915W0270 安全真的是移动支付的第一属性吗 -BAC009S0915W0271 消费者对便捷性的要求可能会更高 -BAC009S0915W0272 按照国内消费者的习惯 -BAC009S0915W0273 他们通常会单独办一张银行卡来绑定移动支付系统 -BAC009S0915W0274 而不是拿着主卡到处刷 -BAC009S0915W0275 移动支付可调用的只能是消费者的小额度的钱财 -BAC009S0915W0276 一般不会给消费者带来巨大损失 -BAC009S0915W0277 消费者会在特定情况下牺牲安全性来提升支付的便捷性 -BAC009S0915W0278 她们宁愿可花五个小时讨论是否买一条裙子 -BAC009S0915W0279 也不愿意花五分钟重新输一定密码 -BAC009S0915W0281 大概十年前就有了这样的说法 -BAC009S0915W0283 也正是看中了中国消费者的消费潜力 -BAC009S0915W0284 华尔街才对阿里巴巴情有独钟 -BAC009S0915W0286 就让业界讨论它会带给中国移动支付市场怎样的影响 -BAC009S0915W0287 苹果要想在中国本土化 -BAC009S0915W0288 最大难点在于如何改变国内的消费习惯 -BAC009S0915W0289 如何说服四大银行一起与之愉快合作 -BAC009S0915W0290 如何重修与中国政府的良好关系 -BAC009S0915W0291 这对于苹果来说不是件容易的事儿 -BAC009S0915W0292 现在是不是也该长点心了吧 -BAC009S0915W0293 国内移动支付需主要有两股力量 -BAC009S0915W0295 前者有长时间的沉淀 -BAC009S0915W0296 银行们对此已深耕多年 -BAC009S0915W0297 而后者则是刚刚涌现的后起之秀 -BAC009S0915W0298 二零一四年春天打车软件补贴大战 -BAC009S0915W0299 两股力量基本上都有一统天下的野心 -BAC009S0915W0300 这三个优势能在短时间内颠复美国移动支付市场的格局 -BAC009S0915W0301 最终促使苹果成为主流标准但中国市场有其特殊性 -BAC009S0915W0302 首先银联和苹果的合作谈判不会顺利 -BAC009S0915W0303 今后所有空调产品还将实现联机运行 -BAC009S0915W0304 这台设备就不会开机运转 -BAC009S0915W0305 这个在美的空调的南沙工厂武汉工厂已全面试点 -BAC009S0915W0306 自动化制造是未来唯一的出路 -BAC009S0915W0307 未来的制造业方向要实现无人化 -BAC009S0915W0308 美的计划在二零一八年 -BAC009S0915W0309 将家用空调事业部员工工人数缩减至两万人 -BAC009S0915W0310 除了四轴或三轴机器人外 -BAC009S0915W0311 今年还将新增二百台 -BAC009S0915W0312 机器人维护成本是挑战 -BAC009S0915W0313 广东东莞顺德等城市已经掀起大量机器换人计划 -BAC009S0915W0314 家电企业机器人智造也正在加速进行 -BAC009S0915W0315 从美的海尔使用机器人操作来看来 -BAC009S0915W0316 机器换人确实能够大大降低企业的用工数量 -BAC009S0915W0317 实现自动化升级也没那么简单 -BAC009S0915W0318 美的集团对项目在一定年限内有投入产出的规定 -BAC009S0915W0319 这对我们来说是个很大的挑战 -BAC009S0915W0320 同时也卡住了自动化的投入 -BAC009S0915W0321 一定年限内的投入产出 -BAC009S0915W0322 我们必须要有衡量标准 -BAC009S0915W0323 美的不能因自动化生产增加制造成本而让用户买单 -BAC009S0915W0324 美的空调进行自动化升级 -BAC009S0915W0325 一定是为了降低制造成本 -BAC009S0915W0326 比如降低人工费用运作费用等 -BAC009S0915W0327 机器人后期维护运行成本及技术也是一个高门槛 -BAC009S0915W0328 因为机器人生产商派遣技术人员不可能长期驻起驻点企业 -BAC009S0915W0329 高工机器人董事长张小飞表示 -BAC009S0915W0330 家电企业自动化升级改造必须进行 -BAC009S0915W0331 但伴随一定的投资风险 -BAC009S0915W0332 除了后期技术维护能力外 -BAC009S0915W0333 对于国内家电企业而言 -BAC009S0915W0334 自动化生产线的柔性改造也是其面临的一大难题 -BAC009S0915W0335 空调产品越来越追求个性化 -BAC009S0915W0336 这需要通过机器人的柔性改变来对此进行处理 -BAC009S0915W0337 家电企业要建立数字化工厂才能真正提升生产效率 -BAC009S0915W0338 她的表现也更加全面 -BAC009S0915W0339 对阵俄罗斯的比赛中 -BAC009S0915W0340 在张常宁一度进行进攻受阻 -BAC009S0915W0341 刘晓彤替补上场打得缩手缩脚的情况下 -BAC009S0915W0342 不断地为中国女排得分 -BAC009S0915W0343 只要中国队需要有人挺身而出 -BAC009S0915W0344 朱婷在中韩之战中一度受伤 -BAC009S0915W0345 但她在中国队遇到困难的时候坚持带伤上阵 -BAC009S0915W0346 最终掠队拿下了比赛 -BAC009S0915W0347 在队长惠若琪因为身体原因无缘世界杯的情况下 -BAC009S0915W0348 朱婷就是中国女排的核心 -BAC009S0915W0349 朱婷再度扮演了场上头脑的角色 -BAC009S0915W0350 队员们也对于她在技术上和心理上都颇为依赖和信服 -BAC009S0915W0351 朱婷扣球拿下一百一十三分 -BAC009S0915W0352 总共贡献了一百四十一分 -BAC009S0915W0353 反超张常宁成为中国队的得分王 -BAC009S0915W0354 让朱婷最佳球员的身份和价值再度彰显 -BAC009S0915W0355 尚不足十八岁的她身高为一米八六 -BAC009S0915W0356 徐建德统领的中国青年队八战全胜夺得冠军 -BAC009S0915W0357 作为主力主攻的朱婷 -BAC009S0915W0358 从而被授予最有价值球员荣誉 -BAC009S0915W0360 当时身披八号战袍的她身高达到了一米九五公分 -BAC009S0915W0361 朱婷斩获了一六七分 -BAC009S0915W0362 与多米尼加的马丁内斯一起摘得最佳得分奖 -BAC009S0915W0363 随后还以百分之五十三点五六的得分率拿到了最佳进攻的大奖 -BAC009S0915W0364 朱婷荣膺最有价值球员 -BAC009S0915W0365 还与巴西队的加比一起入选最佳主攻 -BAC009S0915W0366 当年的整个世青赛上 -BAC009S0915W0367 中国队虽然如愿夺冠 -BAC009S0915W0368 朱婷却是唯一的硕果 -BAC009S0915W0369 去年六月下旬举行的中国国际精英赛北仑站 -BAC009S0915W0371 当时郎平率队三战全胜名列第一 -BAC009S0915W0372 朱婷两场比赛担任首发 
-BAC009S0915W0374 而在今年的香港站上 -BAC009S0915W0375 中国队三比二力克美国队收获分站赛九连胜 -BAC009S0915W0376 赛后主攻朱婷获最有价值球员和最受欢迎球员 -BAC009S0915W0377 主教练郎平获得最佳教练 -BAC009S0915W0378 三场比赛朱婷均有出色表现 -BAC009S0915W0379 朱婷共计拿下二十四分 -BAC009S0915W0380 第二场对阵日本也拿下全队第二高的十二分 -BAC009S0915W0381 获得二十三分荣誉全场得分王 -BAC009S0915W0382 在分站赛总得分榜上 -BAC009S0915W0383 朱婷以一百五十七分领先群芳 -BAC009S0915W0384 其中扣球拿到一百三十二分 -BAC009S0915W0385 扣球成功率五十四点百分之十高居榜首 -BAC009S0915W0386 人们首先会想到她的高度 -BAC009S0915W0387 其一米九五的身高三米二七的扣球高度 -BAC009S0915W0388 在比赛中确实非常有利 -BAC009S0915W0389 朱婷进攻相对比较简单 -BAC009S0915W0390 主要是四号位的高点强攻和六号位的后排进攻 -BAC009S0915W0391 四号位进攻以大斜线为主 -BAC009S0915W0392 她进攻的变化逐渐多了起来 -BAC009S0915W0393 首先是增加了二号位的进攻 -BAC009S0915W0394 即当自己轮转到前排二号位时 -BAC009S0915W0395 临时客串接应在二号位参与强攻 -BAC009S0915W0396 这样既丰富了自己也增加了全队的进攻变化 -BAC009S0915W0397 再就是四号位的进攻除了斜线 -BAC009S0915W0398 还增加了直线直线和斜线之间的所谓二直线 -BAC009S0915W0399 不时还施以非常巧妙的吊球 -BAC009S0915W0400 视频中国三一大胜俄罗斯独占女排世界杯榜首 -BAC009S0915W0401 日本二零一五女排世界杯单循环赛战至第十轮 -BAC009S0915W0402 由郎平挂帅的中国女排在名古屋赛区 -BAC009S0915W0403 提升战绩为九胜一负反超至榜首位置 -BAC009S0915W0404 上周在北美电影市场上遭遇票房惨剧 -BAC009S0915W0405 只以六百四十八万美元的进账排名第八 -BAC009S0915W0406 这部电影的失败并没有影响囧瑟夫的心情 -BAC009S0915W0407 将自导自演一部名为睡魔的科幻大片 -BAC009S0915W0408 让体重维持在四十五公斤左右 -BAC009S0915W0409 但网友的吐槽却一直没有停息 -BAC009S0915W0410 她在台湾出席活动 -BAC009S0915W0411 坦言刚开拍的一个月中 -BAC009S0915W0412 心情低落到崩溃大哭 -BAC009S0915W0413 甚至出现忧郁症状况 -BAC009S0915W0414 搜狐娱乐讯陈妍希传出和陈晓的恋情之后 -BAC009S0915W0415 二人一直鲜少回应 -BAC009S0915W0416 陈妍希回到台北代言悠游卡 -BAC009S0915W0417 外传她可能已经怀孕 -BAC009S0915W0418 但陈妍希在出席活动时 -BAC009S0915W0419 穿高跟鞋快步走 -BAC009S0915W0420 似乎也让传言不攻自破 -BAC009S0915W0421 贵州都市报十月二十九日报道据台湾媒体报道艺人陈妍希认爱小四岁的大陆小生陈晓 -BAC009S0915W0423 两人因合作神雕侠侣擦出爱火 -BAC009S0915W0424 恋情发展备受关注 -BAC009S0915W0425 更在日前爆出交往七个月准备闪婚 -BAC009S0915W0426 连男方在法国包游艇求婚的照片都被网友扒出 -BAC009S0915W0427 她坦承当时很惊喜很感动 -BAC009S0915W0428 男友受访时也首度大方松口确实已经进入求婚阶段 -BAC009S0915W0429 让粉丝听了又惊又喜 -BAC009S0915W0430 搜狐娱乐讯据台湾媒体报道 -BAC009S0915W0431 记者调查湖南张家界国家森林公园低价团问题 -BAC009S0915W0432 四零零元左右的低价两日游在当地非常普遍 -BAC009S0915W0433 这种低价游自称费用全包 -BAC009S0915W0434 原本自费项目变成必须交费项目 -BAC009S0915W0435 导游还诱骗游客加钱走特殊路线 -BAC009S0915W0436 面对游客质疑和退团要求 -BAC009S0915W0437 导游放言此树是我栽 -BAC009S0915W0438 你不可能一分钱不花 -BAC009S0915W0439 游客赴港游买瑞士表半个月停摆旅行社久拖不管 -BAC009S0915W0440 市民刘先生和江西环球国际旅行社的沟通协商再次失败 -BAC009S0915W0441 双方矛盾的焦点是一只瑞士名表 -BAC009S0915W0442 游客踩敦煌千年古城遗址拍照反问踩了会掉吗 -BAC009S0915W0443 现场图一零月五日下午 -BAC009S0915W0444 在甘肃敦煌大方盘城遗址 -BAC009S0915W0445 几位游客轮流翻越护栏 -BAC009S0915W0446 一位游客在拍照中说人家几千年都没有掉下来 -BAC009S0915W0447 踩一下就掉下来了 -BAC009S0915W0448 澎湃新闻在现场看到 -BAC009S0915W0449 遗址附近有多处警示牌写明严禁跨入保护区 -BAC009S0915W0450 游客进店未购物被导游嘲讽官方正在立案处理 -BAC009S0915W0451 游客铜缸刻字秀恩爱故宫已报警 -BAC009S0915W0452 法制晚报讯记者李洁今天傍晚 -BAC009S0915W0453 严厉谴责这一不文明应为 -BAC009S0915W0454 并称故宫博物院已就此事件向公安机关报案 -BAC009S0915W0455 游客青岛遭遇天价虾当地人最多几十元一斤 -BAC009S0915W0456 肖先生在上菜后高兴地拍下图片 -BAC009S0915W0457 当时他还不知道自己会被暗算 -BAC009S0915W0458 游客骑着明孝陵驮碑龟趺拍照市民大煞风景 -BAC009S0915W0459 游客骑在龟趺身上报料人供图 -BAC009S0915W0460 游戏主播花样作死声称天津是他炸的直播被抓游戏室老板因冲突开枪将人射伤致死 -BAC009S0915W0461 一五年后落网 -BAC009S0915W0462 贵港民警追凶未言弃嫌犯一五年后落法网 -BAC009S0915W0463 游戏平台称投千元可收百万数十民上当 -BAC009S0915W0464 信息时报讯记者周伟龙天上不会掉馅饼 -BAC009S0915W0465 数十名市民赶到越秀区一酒家维权 -BAC009S0915W0466 称他们曾在这里被人游说注册了一游戏平台的账户 -BAC009S0915W0467 花费几千元至上万元不等 -BAC009S0915W0468 原以为可以按照游戏规则定期分红提现 -BAC009S0915W0469 孰料从上月底开始平台关闭 -BAC009S0915W0470 随后众人一起到东山派出所报案 -BAC009S0915W0471 有待警方进一步调查 -BAC009S0915W0472 游戏网站频遭攻击每周交二零零零元保护费息事宁人 -BAC009S0915W0473 办案民警检查作案设备金华警方供图昨天 -BAC009S0915W0474 记者从金华市公安局获悉 -BAC009S0915W0475 仅半年就敲诈勒索了五七二万元 -BAC009S0915W0476 该案也被列为公安部督办大案 -BAC009S0915W0477 警方已抓获一五名犯罪嫌疑人 -BAC009S0915W0478 湖北一七二名教师转岗当保安其中有人曾是校长 -BAC009S0915W0479 一身保安制服的他准时站在校门口 -BAC009S0915W0480 手握电动栅栏遥控器 
-BAC009S0915W0481 眼睛警惕地注视着进出校门的车辆和学生 -BAC009S0915W0482 湖北一九岁女护士深夜遭抢劫杀害嫌疑嫌犯已落网 -BAC009S0915W0483 凶手被抓捕归案钟欣摄 -BAC009S0915W0484 湖北二五岁女子从未来例假基因检查是男身 -BAC009S0915W0485 家住汉阳的莎莎化名 -BAC009S0915W0486 近日在医院检查才发现 -BAC009S0915W0487 她的基因竟是个纯爷们 -BAC009S0915W0488 湖北三亿打造亚洲玫瑰基地多个种植园杂草丛生 -BAC009S0915W0489 湖北四名被捅法官脱离危险一女法官尚在哺乳期 -BAC009S0915W0490 经十堰市中级人民法院确认 -BAC009S0915W0491 四名法官系送达法律文书时被刺伤 -BAC009S0915W0492 目前均暂无生命危险 -BAC009S0915W0493 其中一女法官尚在哺乳期 -BAC009S0915W0494 湖北六零后求婚九零后被指责欠款六千万因诈骗取保候审 -BAC009S0915W0495 湖北黄石市一家商场前 -BAC009S0916W0121 真正落地的产品却非常地少 -BAC009S0916W0122 而落地后的产品与客户的期待甚远 -BAC009S0916W0123 这些状况每日均上演发生 -BAC009S0916W0124 大部分的创业者举步艰辛 -BAC009S0916W0125 钱烧完了东西出不来 -BAC009S0916W0126 创业者成了智慧时代的贡品 -BAC009S0916W0127 这不是这个时代的不公平 -BAC009S0916W0128 而是我们对这个时代了解的太少 -BAC009S0916W0129 如果我们懂得多一点智能家居产品市场的法则 -BAC009S0916W0130 我们的路也许会好走得多 -BAC009S0916W0131 一智能产品的安全 -BAC009S0916W0132 连接的最高代价就是安全问题 -BAC009S0916W0133 成千上万的产品通过无线连接 -BAC009S0916W0134 只要一个单品存在安全漏洞 -BAC009S0916W0135 整个系统的安全就会出现问题 -BAC009S0916W0136 产生非常可怕的结果 -BAC009S0916W0137 现阶段市场上落地的产品大多对安全的认知都存在缺陷 -BAC009S0916W0138 普遍认为现在的市场很小且还是单品 -BAC009S0916W0139 不用花那么大的成本去解决安全的问题 -BAC009S0916W0140 可大家必须明白一个道理 -BAC009S0916W0141 当大家习惯安全的问题留以后解决的时候 -BAC009S0916W0142 安全问题立即会成为你的内伤 -BAC009S0916W0143 但综观国内同类企业 -BAC009S0916W0144 以深圳智能锁业代表为例 -BAC009S0916W0145 在安全加解密认证等方面也做足了功夫 -BAC009S0916W0147 软件与硬件都做了深度的对接 -BAC009S0916W0148 把顾客个人资料全部归客户自己保管 -BAC009S0916W0149 企业不接触客户个人资料 -BAC009S0916W0150 许多企业都把取得顾客个人资料当作资本 -BAC009S0916W0151 这是智能家居行业的先例 -BAC009S0916W0152 必须具有高度习惯融合性和耐用性 -BAC009S0916W0153 这决不是八零九零的消费习惯这么单纯的问题 -BAC009S0916W0154 是每个家庭成员体验的统一 -BAC009S0916W0155 也就是每个成员综合体验的最大公约数 -BAC009S0916W0156 以情怀代替体验是非常错误的 -BAC009S0916W0157 产品的核心是客户的体验 -BAC009S0916W0158 顾客体验的核心是真善美 -BAC009S0916W0159 近来看到的许多创新型产品 -BAC009S0916W0160 可使用起来让人啼笑皆非 -BAC009S0916W0161 加解密的措施如同虚设 -BAC009S0916W0162 没有智慧手机的成员无法开门 -BAC009S0916W0163 这是一帮精英自恋情怀的产品 -BAC009S0916W0164 可美国的月亮总是比中国的亮 -BAC009S0916W0165 国内许多媒体或企业都在为其背书 -BAC009S0916W0166 而对国内比它更优秀的产品却集体失声 -BAC009S0916W0167 只要了解一点核桃锁信息的人都能第一时间感受到 -BAC009S0916W0168 一智能家居产品的销售渠道 -BAC009S0916W0169 你要懂既然不是电子产品不是易损品不是玩品 -BAC009S0916W0170 他是家居产品依托互联网技术升级的家居耐用品 -BAC009S0916W0171 这产品的换代周期会较长 -BAC009S0916W0172 购买的机会受时间的制约 -BAC009S0916W0173 而未来借助更多的互联网技术 -BAC009S0916W0174 产品的升级速度一定加快 -BAC009S0916W0175 而智能家居产品的特殊属性决定了销售渠道的模式 -BAC009S0916W0176 他不能按电子产品或传统居家产品的模式去销售 -BAC009S0916W0177 除了做好传统门店的体验销售电商平台销售外 -BAC009S0916W0178 希望智能家居产品企业在短期的高回报率也是不现实的 -BAC009S0916W0179 但可以肯定的他一定是最高成长的企行业 -BAC009S0916W0180 一大数据云计算不是你谈的 -BAC009S0916W0181 好像不谈你就不属于这个时代的人 -BAC009S0916W0182 作用大并不代表每个人 -BAC009S0916W0183 大数据云计算是非常烧钱的 -BAC009S0916W0184 不是一般的企业个人玩得起的 -BAC009S0916W0185 与其厌不其烦的谈论大数据云计算 -BAC009S0916W0186 不如做一款实实在在的好产品 -BAC009S0916W0187 但是却不在国家文件所指的收费公路范围内 -BAC009S0916W0188 而是一条市内快速路 -BAC009S0916W0189 对于这条特殊的隧道 -BAC009S0916W0190 省交通部门表示应该不会特殊 -BAC009S0916W0191 长江隧道估计也顶不住 -BAC009S0916W0192 对于提高重大节假日公路通行能力和服务水平 -BAC009S0916W0193 降低公众假日出行成本具有重要意义 -BAC009S0916W0194 具体工作将由各省区市政府负责统一组织实施 -BAC009S0916W0195 国务院及五部门并没有明确实施时间 -BAC009S0916W0196 着实让不少网友有些着急 -BAC009S0916W0197 免费新规究竟啥时能享受到 -BAC009S0916W0198 记者昨日第一时间从江苏省交通运输厅获悉 -BAC009S0916W0199 就国家方案我省还会进行再研究 -BAC009S0916W0200 具体执行时间由省政府定 -BAC009S0916W0201 今年国庆应该可以实施 -BAC009S0916W0202 可是通过收费站的车有大客车中型客车还有货车 -BAC009S0916W0203 到时候会不会乱成一锅粥 -BAC009S0916W0204 在国务院下发的文件中提及 -BAC009S0916W0205 为确保免费政策实施后车辆有序通行 -BAC009S0916W0206 各地区要对公路收费站现有车道进行全面调查 -BAC009S0916W0207 合理规划和利用现有收费车道和免费专用通道 -BAC009S0916W0208 确保过往车辆分类分车道有序通行 -BAC009S0916W0209 记者昨日从省交通部门了解到 -BAC009S0916W0210 这是一个比较复杂的问题 -BAC009S0916W0211 估计未来系统可能会改造 -BAC009S0916W0212 应该不会开免费车道 -BAC009S0916W0213 如果开了小车免费车道 
-BAC009S0916W0214 有大车或是货车误闯或者闯进去了就不好办了 -BAC009S0916W0215 有关负责人告诉记者 -BAC009S0916W0216 省里会对此进行专门研究讨论 -BAC009S0916W0217 看看山东之前是怎么做的记者了解到 -BAC009S0916W0218 面对上述这些问题山东是怎么免费放行的呢 -BAC009S0916W0219 免费期间收费员还是按照正常放行的 -BAC009S0916W0220 山东潍坊的一位李先生告诉记者 -BAC009S0916W0221 今年大年初一他开车去海南 -BAC009S0916W0222 一路上很多省份的高速公路收费站都是免费放行 -BAC009S0916W0223 到了出口车道再把通行卡收回去 -BAC009S0916W0224 由于山东免费放行的时间不在春运最高峰 -BAC009S0916W0225 大年初一路上都没什么车 -BAC009S0916W0226 所以倒也没产生收费站排队的现象 -BAC009S0916W0227 扩大到四个小长假之后 -BAC009S0916W0228 国务院批准银行系基金公司再扩容 -BAC009S0916W0229 本报记者蔡宗琦中国证券报记者获悉 -BAC009S0916W0230 公募基金管理业务有关工作 -BAC009S0916W0231 积极推动基金产品审核制度改革 -BAC009S0916W0232 鼓励更多资金投资资本市场 -BAC009S0916W0233 先后两批共八家商业银行设立或参股八家基金管理公司 -BAC009S0916W0234 试点基金管理公司发展态势良好 -BAC009S0916W0235 工商银行建设银行和交通银行为首批试点银行 -BAC009S0916W0236 增加机构投资者数量 -BAC009S0916W0237 促进基金行业规范发展 -BAC009S0916W0238 为商业银行探索跨业经营运作积累经验 -BAC009S0916W0239 此举可能将进一步推动金融混业经营 -BAC009S0916W0240 随着对商业银行设立基金管理公司门槛放宽 -BAC009S0916W0241 我国资本市场将迎来更多机构投资者 -BAC009S0916W0242 更加有利于价值投资理念形成 -BAC009S0916W0243 保险资产管理公司如符合有关规定 -BAC009S0916W0244 可以向有关金融监管部门申请 -BAC009S0916W0245 依法开展公募性质的资产管理业务 -BAC009S0916W0246 通知扩大保险资管公司业务范围 -BAC009S0916W0247 这体现出监管部门开放管理的思路 -BAC009S0916W0248 允许各类资产管理公司同台竞技 -BAC009S0916W0249 在遴选优质管理人提升保险资金投资收益率的同时 -BAC009S0916W0250 也通过机构间的竞争促进保险资管公司的转型发展 -BAC009S0916W0251 明确了参股基金管理公司股东 -BAC009S0916W0252 证监会新闻发言人邓给解释 -BAC009S0916W0253 中国的银行居垄断地位 -BAC009S0916W0254 作风向来无耻加强势苹果也因强势出名 -BAC009S0916W0255 可参照中国移动和苹果的合作传闻中 -BAC009S0916W0257 这估计很难让掉进钱眼儿的四大银行接受 -BAC009S0916W0260 但却鲜有人会像苹果的服务付费 -BAC009S0916W0261 更现实的的困难在于 -BAC009S0916W0263 粗估下来大概要七十亿 -BAC009S0916W0264 这还不包括改造过程中的渠道分食 -BAC009S0916W0265 以及给领导们的审批费用 -BAC009S0916W0266 从支付的大环境上看 -BAC009S0916W0268 它依旧要面对政府的刁难 -BAC009S0916W0270 政府失控的可不是什么隐私了 -BAC009S0916W0271 而是实实在在的金融命脉 -BAC009S0916W0272 慈禧太后就因乔致庸创办了票号 -BAC009S0916W0273 害怕其掌握国家金融命脉 -BAC009S0916W0274 而将他软禁十年之久 -BAC009S0916W0275 何况是一个来自美帝的小苹果呢 -BAC009S0916W0276 科幻星系康斯坦丁文 -BAC009S0916W0277 苹果一口气召开了两次新品发布会 -BAC009S0916W0278 就在会场的凳子和垃圾尚未收拾干净的时候 -BAC009S0916W0279 全世界的报道已经蜂拥而至 -BAC009S0916W0280 失望中夹杂着嘲讽的情绪霸占了各模块的头条 -BAC009S0916W0281 据华尔街日报网站报道 -BAC009S0916W0282 在自己全身心的努力和坚持之下 -BAC009S0916W0284 艾维本周四晚在旧金山现代艺术馆向大众表示 -BAC009S0916W0286 主要是因为社会对可穿戴智能手表的期望太高 -BAC009S0916W0287 手腕是配戴轻便型互动设备与休闲设备的理想之处 -BAC009S0916W0288 但不适合那些笨重的解读设备 -BAC009S0916W0289 艾维表示尽管苹果智能手表拥有诸多功能 -BAC009S0916W0290 这种产品的设计仍需考虑文化历史和未来等因素 -BAC009S0916W0291 艾维现为苹果主管设计业务的高级副总裁 -BAC009S0916W0292 帮助设计了苹果多项产品的外观和用户体验 -BAC009S0916W0294 苹果计划于明年初开始销售其智能手表 -BAC009S0916W0295 该公司于上个月简单地宣布了智能手表相关的情况 -BAC009S0916W0296 其将提供三种版本的智能手表 -BAC009S0916W0297 起步价为三百四九美元十 -BAC009S0916W0298 苹果没有透露更昂贵智能手表的具体售价 -BAC009S0916W0299 这些手表将配置不同的表带 -BAC009S0916W0300 以满足不同用户的需求 -BAC009S0916W0301 市场上还有诸多其他制造商也在尝试生产智能手表 -BAC009S0916W0302 但这些厂商的产品都难以进入主流 -BAC009S0916W0303 这是未来的必经之路 -BAC009S0916W0304 美的家用空调事业部总裁吴文新表示 -BAC009S0916W0305 每日经济新闻记者从美的家用空调事业部了解到 -BAC009S0916W0306 自二零一一年事业部启动自动化升级至今的四年里 -BAC009S0916W0307 工人数量减少近一半 -BAC009S0916W0308 美的家用空调事业部制造副总裁乌守保对记者 -BAC009S0916W0309 老板电器的新增量创新需求追求极致搜狐科技 -BAC009S0916W0310 质变中的世界工厂中国正在由中国制造向中国智造蜕变 -BAC009S0916W0311 如何借力拥抱互联网加这一全新变量 -BAC009S0916W0312 如何重新激活内部潜能 -BAC009S0916W0313 便是区别行业龙头企业经营智慧高低的关键时刻 -BAC009S0916W0314 身处传统白色家电领域中的重要一支到厨房电器 -BAC009S0916W0315 多年来保持奇高市占率的老板电器 -BAC009S0916W0316 在成名三十馀年后仍在竭力寻求业态的新鲜化和可能性 -BAC009S0916W0317 能否找到厨电行业下一个未知的增量 -BAC009S0916W0318 也成为老板电器和它的宿敌们能否领跑下半程的关键 -BAC009S0916W0319 阐述老板电器和内部创新外部国际化如何进行破题 -BAC009S0916W0320 老板电器如何看待公司的创新驱动 -BAC009S0916W0321 赵继宏老板电器做厨电已经三十多年了 -BAC009S0916W0322 作为企业理念和产品技术必须要走在时代的前面 -BAC009S0916W0323 现在中国的八十五后和九十后消费人群已经成为消费主体 -BAC009S0916W0324 他们需要的是智能厨房智能家居与家电 -BAC009S0916W0325 
公司为此研发并推出市场的智能产品非常贴近市场 -BAC009S0916W0326 围绕消费者消费者需要什么 -BAC009S0916W0327 我们开发什么的产品研发策略 -BAC009S0916W0328 除了产品功能必须不错之外 -BAC009S0916W0329 以保证持续长久的黏性互动 -BAC009S0916W0330 产品创新其实也是一个双向互动的过程 -BAC009S0916W0331 现在消费者的需求越来越个性化差异化 -BAC009S0916W0332 可以和我们的消费者有很多的互动并提供超值服务 -BAC009S0916W0333 这些都是和消费者增添黏性互动的方式 -BAC009S0916W0334 这个方向的创新以后还有更多的东西可以发挥作用 -BAC009S0916W0335 如今的智能家电更多意义上是智能加上互动 -BAC009S0916W0336 也就是老板电器总结的自动加互动 -BAC009S0916W0338 只要在明天的最后一战中赢下东道主日本 -BAC009S0916W0339 高清女排力擒俄罗斯夺冠占主动众将喜极而泣 -BAC009S0916W0340 今天大家打得都挺好的 -BAC009S0916W0341 我们是一条心在打团结作战 -BAC009S0916W0342 赛后主攻手朱婷对记者说 -BAC009S0916W0343 本场比赛朱婷三七次扣球得到二十一分 -BAC009S0916W0344 此外她还凭借拦网和发球分别拿到七分和一分 -BAC009S0916W0345 我觉得自己的脚伤已经完全恢复了 -BAC009S0916W0346 对弹跳没有什么影响 -BAC009S0916W0347 当在新闻发布会上被问及伤情的时候 -BAC009S0916W0348 在第四轮与韩国队的比赛中 -BAC009S0916W0349 朱婷在第四局比赛中意外崴脚 -BAC009S0916W0350 今天出色的数据也佐证了她身体的康复情况良好 -BAC009S0916W0351 作为队里年龄最大的球员 -BAC009S0916W0352 最终拦网和扣球均得到六这些分 -BAC009S0916W0353 位列球队发球榜首位和拦网榜的第二位 -BAC009S0916W0354 大家今天打得非常出色 -BAC009S0916W0355 能够在这个集体与可爱的队友一起拼杀 -BAC009S0916W0356 我感到非常骄傲和自豪 -BAC009S0916W0357 在赛后发布会上颜妮对记者说 -BAC009S0916W0358 在复盘与俄罗斯一战时 -BAC009S0916W0359 这场比赛前教练给我们布置了很多 -BAC009S0916W0360 作为就是我上场多去贯彻教练意图 -BAC009S0916W0361 颜妮坦言今天俄罗斯表现很好 -BAC009S0916W0362 我们两家有时候比较像 -BAC009S0916W0363 当被问及新老队员相互担当弥补的话题时 -BAC009S0916W0364 颜妮坦言自己的发挥也不是特别稳定 -BAC009S0916W0365 但有起伏应该是正常的 -BAC009S0916W0366 作为老队员我要多承担 -BAC009S0916W0367 用实际行动来弥补不足 -BAC009S0916W0368 搜狐体育郭健文 -BAC009S0916W0369 女排众将手举国旗敬夺冠 -BAC009S0916W0370 拿到了明年里约奥运会的入场券 -BAC009S0916W0371 在接受中央电视台记者采访时朱婷表示 -BAC009S0916W0372 全队上下面对了巨大困难 -BAC009S0916W0373 其中郎平主教练最为辛苦 -BAC009S0916W0374 今晚的比赛中朱婷独得二十七分 -BAC009S0916W0375 再度成为了比赛的得分王 -BAC009S0916W0377 但今天能拿冠军真的是发自肺腑的想哭 -BAC009S0916W0378 面对日本队的魔鬼主场 -BAC009S0916W0379 中国女排表示承受了巨大的压力 -BAC009S0916W0380 朱婷表示我想日本肯定也会拼我们 -BAC009S0916W0381 做了很多很多困难准备 -BAC009S0916W0382 如果输了就不太好说了 -BAC009S0916W0383 但是里面不是淡定的 -BAC009S0916W0384 中国队连续三位主力因伤缺战 -BAC009S0916W0385 大家可能觉得我们这支队伍很苦 -BAC009S0916W0386 但我觉得郎导是最苦的 -BAC009S0916W0387 朱婷表示其实我也想 -BAC009S0916W0388 女排三零阿根廷朱婷复出扣杀状态神勇 -BAC009S0916W0389 全场比赛的焦点是休战三场后重新登场的名将朱婷 -BAC009S0916W0390 拿下全场最高分的朱婷赛后表示 -BAC009S0916W0391 在八月二十六日中国队和韩国队的比赛中 -BAC009S0916W0392 朱婷崴脚之后带伤率队取胜 -BAC009S0916W0393 主教练郎平都没有派她出场 -BAC009S0916W0394 一日晚的中阿之战 -BAC009S0916W0395 重新以首发身份登场的朱婷迅速找回比赛的感觉 -BAC009S0916W0396 赛后被评为当场最佳球员 -BAC009S0916W0397 这也是她在本届世界杯上第二次获得全场最佳 -BAC009S0916W0398 在场上移动很好 -BAC009S0916W0399 朱婷在谈到大家关心的脚伤时说 -BAC009S0916W0400 在冈山的桃太郎体育馆 -BAC009S0916W0401 当地华人团体组织了不少球迷为中国队加油 -BAC009S0916W0402 这样的氛围让朱婷感觉像是主场一样 -BAC009S0916W0403 大家赢球比自己获得最佳还要高兴 -BAC009S0916W0404 这部电影从二零一三年就已经开始筹备了 -BAC009S0916W0405 前后打磨了两年时间才得以完成 -BAC009S0916W0406 与奥斯卡影帝本金斯利同时出现在海报中央 -BAC009S0916W0407 雷诺兹持枪的造型和他在冥界警局里的颇为相似 -BAC009S0916W0408 那些年女神陈妍希近来瘦身有成 -BAC009S0916W0409 不仅摆脱神雕侠侣时期的小笼包名号 -BAC009S0916W0410 日前在大陆真人秀节目秀出两条雪白大长腿 -BAC009S0916W0411 更让粉丝看了鼻血直流 -BAC009S0916W0412 只不过好景不常 -BAC009S0916W0413 她最近又被拍到崩坏实录 -BAC009S0916W0414 乱糟糟的马尾加上宽松衣服的村姑打扮 -BAC009S0916W0415 搜狐娱乐讯名为娱乐圈八卦的自媒体 -BAC009S0916W0416 曝出陈妍希拍戏时突然干呕 -BAC009S0916W0417 推断其已怀孕 -BAC009S0916W0418 应该是月初吧 -BAC009S0916W0419 小笼包身体有反应 -BAC009S0916W0420 她突然就干呕 -BAC009S0916W0421 陈妍希还去医院做了检查 -BAC009S0916W0422 她的团队对她更加关心了 -BAC009S0916W0423 中新网六月十六日电六月十六日是容祖儿的生日 -BAC009S0916W0424 陈妍希晒出与容祖儿合照 -BAC009S0916W0425 并送上真挚祝福 -BAC009S0916W0426 祝可爱的你 -BAC009S0916W0427 每一天都要快乐喔 -BAC009S0916W0428 中新网九月二十五日电据台湾东森新闻报道 -BAC009S0916W0429 陈晓与陈妍希承认恋情 -BAC009S0916W0430 获得粉丝祝福 -BAC009S0916W0431 湖北六小伙温州偷硬币称代表诸葛后人战刘伯温后人 -BAC009S0916W0432 专偷摇摇车里的硬币 -BAC009S0916W0433 运气好时一天能偷几千枚一元硬币 -BAC009S0916W0434 湖北六岁女童被继母虐打下阴撕裂警方已介入 -BAC009S0916W0435 湖北七人冒充福彩工作人员兜售中奖秘籍骗取三零零万 
-BAC009S0916W0436 湖北省黄冈市公安局通报称 -BAC009S0916W0437 打掉一个以传授彩票中奖秘籍为名的特大电信诈骗团伙 -BAC009S0916W0438 破获电信诈骗案二三三起 -BAC009S0916W0439 涉案金额三零零多万元 -BAC009S0916W0440 湖北九岁女童遇害案告破凶手强奸不成推下窗外 -BAC009S0916W0441 湖北五道杠少年捐出二万元政府奖学金 -BAC009S0916W0442 学校里有些同学家里条件不好 -BAC009S0916W0443 但是想让更多需要帮助的水高学子感受到温暖 -BAC009S0916W0444 湖北卷人电梯设计不合理同型号已售四六四八部 -BAC009S0916W0445 事故电梯仍处于拆解状态 -BAC009S0916W0446 新华社记者梁建强摄 -BAC009S0916W0447 湖北吃人同型号电梯全国四六四八部分布三一省市 -BAC009S0916W0448 安良百货商场正常营业 -BAC009S0916W0449 但各楼层的自动扶梯均已关停供图新华 -BAC009S0916W0450 湖北吃人电梯品牌四年被曝光五次 -BAC009S0916W0452 湖北吞人电梯三月份刚检测合格 -BAC009S0916W0453 事故电梯出厂刚满一年 -BAC009S0916W0454 今年三月份经检验为合格 -BAC009S0916W0455 涉事厂家生产的电梯此前曾发生多起事故 -BAC009S0916W0456 目前湖北省质监局已要求全省暂停使用涉事厂家电梯 -BAC009S0916W0457 湖北咬人电梯厂家曾为盖板支架申请专利 -BAC009S0916W0458 湖北荆州吃人电梯盖板设计不合理供图 -BAC009S0916W0459 湖北电梯吃人定性为责任事故 -BAC009S0916W0460 看过湖北电梯吃人视频的不少上海年轻人 -BAC009S0916W0461 在经过商场自动扶梯时会选择跳过视频中的跳板 -BAC009S0916W0462 晨报记者张佳琪晨报讯昨晚九时三零分 -BAC009S0916W0463 湖北省荆州市安监局召开安良百货电梯事故情报通报会 -BAC009S0916W0464 此次事故调查组组长荆州市安监局局长陈观鑫通报称 -BAC009S0916W0465 初步认定这是一起安全生产责任事故 -BAC009S0916W0466 湖北电梯吃人调查报告电梯厂商及商场负主责 -BAC009S0916W0467 二零一五七二六 -BAC009S0916W0468 湖北荆州市安良百货公司事发手扶电梯已被关闭检修 -BAC009S0916W0470 申龙电梯和安良百货公司应对事故负主要责任 -BAC009S0916W0471 湖北飞踢女居民车道办主任被停职 -BAC009S0916W0472 网曝视频截图当街飞踢女群众 -BAC009S0916W0473 大喊我一脚方言 -BAC009S0916W0474 同踹死你的街道办主任 -BAC009S0916W0475 湖北一中学教师体罚学生致重伤被判刑三年 -BAC009S0916W0476 用右脚踢向董某左腹部 -BAC009S0916W0477 董某某所受损伤程度属二重伤二级 -BAC009S0916W0478 残疾等级为六级残疾 -BAC009S0916W0479 梁某某被一审法院以故意伤害罪判处有期徒刑三年 -BAC009S0916W0480 湖北一中学班长失踪坠亡教学楼四小时去向成谜 -BAC009S0916W0481 新洲一名高中新生因为没去教室上晚自习 -BAC009S0916W0482 老师发现后和学生一起寻找 -BAC009S0916W0483 直至晚上一零时左右 -BAC009S0916W0484 一名老师才发现学生坠楼摔落在教学楼前 -BAC009S0916W0485 今日二二日晨凌晨 -BAC009S0916W0486 这名一五岁的花季少年最终送医救治无效死亡 -BAC009S0916W0487 湖北一传销头目归国投案骗取群众资金数亿元 -BAC009S0916W0488 湖北一骗取群众资金数亿元的传销头目近日归国投案 -BAC009S0916W0489 湖北一公司以员工名义贷款数十员工负债千万 -BAC009S0916W0490 阳逻一家公司以数十名员工的名义 -BAC009S0916W0491 向一家金融公司贷款一千多万元 -BAC009S0916W0492 公司承诺贷款本息都由公司负责偿还 -BAC009S0916W0493 公司却遇到了资金困难 -BAC009S0916W0494 存在无法如期还贷的风险 -BAC009S0916W0495 这令被贷款的员工们寝食难安 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/gpu/tensorrt/model_repo_stateful_trt/decoder/1/.gitkeep b/models/audio/speech_recognition/conformer/igie/tools/__init__.py similarity index 100% rename from models/audio/speech_recognition/conformer/igie/wenet/runtime/gpu/tensorrt/model_repo_stateful_trt/decoder/1/.gitkeep rename to models/audio/speech_recognition/conformer/igie/tools/__init__.py diff --git a/models/audio/speech_recognition/conformer/igie/compute_cer.py b/models/audio/speech_recognition/conformer/igie/tools/compute_cer.py similarity index 82% rename from models/audio/speech_recognition/conformer/igie/compute_cer.py rename to models/audio/speech_recognition/conformer/igie/tools/compute_cer.py index 6a7b381e6ebc6ff950226677ce34e25f4b1f4947..a5db08979f4d31a4a2ac9e4ceb0d122537690aac 100644 --- a/models/audio/speech_recognition/conformer/igie/compute_cer.py +++ b/models/audio/speech_recognition/conformer/igie/tools/compute_cer.py @@ -1,22 +1,10 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + import sys import unicodedata import codecs -import argparse remove_tag = True spacelist = [' ', '\t', '\r', '\n'] @@ -278,16 +266,101 @@ def default_cluster(word) : return 'Other' return unicode_names[0] -def get_acc(ref_file, hyp_file): +def usage() : + print("compute-wer.py : compute word error rate (WER) \ + and align recognition results and references.") + print(" usage : python compute-wer.py [--cs={0,1}] \ + [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \ + [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") + +if __name__ == '__main__': + if len(sys.argv) == 1 : + usage() + sys.exit(0) calculator = Calculator() cluster_file = '' ignore_words = set() tochar = False - verbose = 0 + verbose = 1 padding_symbol = ' ' case_sensitive = False max_words_per_line = sys.maxsize split = None + while len(sys.argv) > 3: + a = '--maxw=' + if sys.argv[1].startswith(a): + b = sys.argv[1][len(a):] + del sys.argv[1] + max_words_per_line = int(b) + continue + a = '--rt=' + if sys.argv[1].startswith(a): + b = sys.argv[1][len(a):].lower() + del sys.argv[1] + remove_tag = (b == 'true') or (b != '0') + continue + a = '--cs=' + if sys.argv[1].startswith(a): + b = sys.argv[1][len(a):].lower() + del sys.argv[1] + case_sensitive = (b == 'true') or (b != '0') + continue + a = '--cluster=' + if sys.argv[1].startswith(a): + cluster_file = sys.argv[1][len(a):] + del sys.argv[1] + continue + a = '--splitfile=' + if sys.argv[1].startswith(a): + split_file = sys.argv[1][len(a):] + del sys.argv[1] + split = dict() + with codecs.open(split_file, 'r', 'utf-8') as fh: + for line in fh: # line in unicode + words = line.strip().split() + if len(words) >= 2: + split[words[0]] = words[1:] + continue + a = '--ig=' + if sys.argv[1].startswith(a): + ignore_file = sys.argv[1][len(a):] + del sys.argv[1] + with codecs.open(ignore_file, 'r', 'utf-8') as fh: + for line in fh: # line in unicode + line = line.strip() + if len(line) > 0: + ignore_words.add(line) + continue + a = '--char=' + if sys.argv[1].startswith(a): + b = sys.argv[1][len(a):].lower() + del sys.argv[1] + tochar = (b == 'true') or (b != '0') + continue + a = '--v=' + if sys.argv[1].startswith(a): + b = sys.argv[1][len(a):].lower() + del sys.argv[1] + verbose = 0 + try: + verbose = int(b) + except Exception: + if b == 'true' or b != '0': + verbose = 1 + continue + a = '--padding-symbol=' + if sys.argv[1].startswith(a): + b = sys.argv[1][len(a):].lower() + del sys.argv[1] + if b == 'space': + padding_symbol = ' ' + elif b == 'underline': + padding_symbol = '_' + continue + if True or sys.argv[1].startswith('-'): + # ignore invalid switch + del sys.argv[1] + continue if not case_sensitive: ig = set([w.upper() for w in ignore_words]) @@ -296,6 +369,8 @@ def get_acc(ref_file, hyp_file): default_clusters = {} default_words = {} + ref_file = sys.argv[1] + hyp_file = sys.argv[2] rec_set = {} if split and not case_sensitive: newsplit = dict() @@ -394,13 +469,18 @@ def get_acc(ref_file, hyp_file): lab1 = lab2 rec1 = rec2 + if verbose: + 
print('===================================================' + '========================') + print() + result = calculator.overall() if result['all'] != 0 : wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] else : wer = 0.0 - print('Overall -> %4.2f %%' % wer, end=' ') + print('Overall -> wer %4.2f %% Corr %4.2f %%' % (wer, result['cor']*100/result['all']), end=' ') print('N=%d C=%d S=%d D=%d I=%d' % (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) @@ -447,7 +527,6 @@ def get_acc(ref_file, hyp_file): # general terms, like WEATHER / CAR / ... else : cluster.append(token) - - acc = 100.0 - round(wer, 2) - - return acc \ No newline at end of file + print() + print('=======================================' + '====================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/filter_scp.pl b/models/audio/speech_recognition/conformer/igie/tools/filter_scp.pl similarity index 100% rename from models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/filter_scp.pl rename to models/audio/speech_recognition/conformer/igie/tools/filter_scp.pl diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/make_raw_list.py b/models/audio/speech_recognition/conformer/igie/tools/make_raw_list.py similarity index 100% rename from models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/make_raw_list.py rename to models/audio/speech_recognition/conformer/igie/tools/make_raw_list.py diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/make_shard_list.py b/models/audio/speech_recognition/conformer/igie/tools/make_shard_list.py similarity index 91% rename from models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/make_shard_list.py rename to models/audio/speech_recognition/conformer/igie/tools/make_shard_list.py index 1f7d82829808c9cc181bbc5e0f60cccef8795bae..fcd4bcd7d62ba933cf27c34fc02e18371a6b10a6 100644 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/make_shard_list.py +++ b/models/audio/speech_recognition/conformer/igie/tools/make_shard_list.py @@ -66,16 +66,8 @@ def write_tar_file(data_list, # resample if sample_rate != resample: - if not audio.is_floating_point(): - # normalize the audio before resample - # because resample can't process int audio - audio = audio / (1 << 15) - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - audio = (audio * (1 << 15)).short() - else: - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) + audio = torchaudio.transforms.Resample( + sample_rate, resample)(audio) ts = time.time() f = io.BytesIO() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/text2token.py b/models/audio/speech_recognition/conformer/igie/tools/text2token.py similarity index 100% rename from models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/text2token.py rename to models/audio/speech_recognition/conformer/igie/tools/text2token.py diff --git a/models/audio/speech_recognition/conformer/igie/train.yaml b/models/audio/speech_recognition/conformer/igie/train.yaml deleted file mode 100644 index e1224b6931bb8e16dbe1f34b638779bbb72d2149..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/train.yaml +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 
2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -accum_grad: 4 -cmvn_file: exp/conformer/global_cmvn -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 16 -decoder: transformer -decoder_conf: - attention_heads: 4 - dropout_rate: 0.1 - linear_units: 2048 - num_blocks: 6 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 -encoder: conformer -encoder_conf: - activation_type: swish - attention_dropout_rate: 0.0 - attention_heads: 4 - cnn_module_kernel: 15 - dropout_rate: 0.1 - input_layer: conv2d - linear_units: 2048 - normalize_before: true - num_blocks: 12 - output_size: 256 - pos_enc_layer_type: rel_pos - positional_dropout_rate: 0.1 - selfattention_layer_type: rel_selfattn - use_cnn_module: true -grad_clip: 5 -input_dim: 80 -is_json_cmvn: true -log_interval: 100 -max_epoch: 240 -model_conf: - ctc_weight: 0.3 - length_normalized_loss: false - lsm_weight: 0.1 -optim: adam -optim_conf: - lr: 0.002 -output_dim: 4233 -scheduler: warmuplr -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/.clang-format b/models/audio/speech_recognition/conformer/igie/wenet/.clang-format deleted file mode 100644 index 29333f52be4f383d3a7e1fa8b4cd3680ca007a3b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/.clang-format +++ /dev/null @@ -1,94 +0,0 @@ ---- -Language: Cpp -# BasedOnStyle: Google -AccessModifierOffset: -1 -AlignAfterOpenBracket: Align -AlignConsecutiveAssignments: false -AlignConsecutiveDeclarations: false -AlignEscapedNewlinesLeft: true -AlignOperands: true -AlignTrailingComments: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: false -AllowShortCaseLabelsOnASingleLine: false -AllowShortFunctionsOnASingleLine: All -AllowShortIfStatementsOnASingleLine: true -AllowShortLoopsOnASingleLine: true -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: true -BinPackArguments: true -BinPackParameters: true -BraceWrapping: - AfterClass: false - AfterControlStatement: false - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - BeforeCatch: false - BeforeElse: false - IndentBraces: false -BreakBeforeBinaryOperators: None -BreakBeforeBraces: Attach 
-BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: false -BreakAfterJavaFieldAnnotations: false -BreakStringLiterals: true -ColumnLimit: 80 -CommentPragmas: '^ IWYU pragma:' -ConstructorInitializerAllOnOneLineOrOnePerLine: true -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: true -DisableFormat: false -ExperimentalAutoDetectBinPacking: false -ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] -IncludeCategories: - - Regex: '^<.*\.h>' - Priority: 1 - - Regex: '^<.*' - Priority: 2 - - Regex: '.*' - Priority: 3 -IncludeIsMainRegex: '([-_](test|unittest))?$' -IndentCaseLabels: true -IndentWidth: 2 -IndentWrappedFunctionNames: false -JavaScriptQuotes: Leave -JavaScriptWrapImports: true -KeepEmptyLinesAtTheStartOfBlocks: false -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBlockIndentWidth: 2 -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: false -PenaltyBreakBeforeFirstCallParameter: 1 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakString: 1000 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 200 -PointerAlignment: Left -ReflowComments: true -SortIncludes: true -SpaceAfterCStyleCast: false -SpaceBeforeAssignmentOperators: true -SpaceBeforeParens: ControlStatements -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 2 -SpacesInAngles: false -SpacesInContainerLiterals: true -SpacesInCStyleCastParentheses: false -SpacesInParentheses: false -SpacesInSquareBrackets: false -Standard: Auto -TabWidth: 8 -UseTab: Never -... - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/.flake8 b/models/audio/speech_recognition/conformer/igie/wenet/.flake8 deleted file mode 100644 index 34aa3e3843d2cb87028da016d6d688b77359b2b4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/.flake8 +++ /dev/null @@ -1,15 +0,0 @@ -[flake8] -select = B,C,E,F,P,T4,W,B9 -max-line-length = 80 -# C408 ignored because we like the dict keyword argument syntax -# E501 is not flexible enough, we're using B950 instead -ignore = - E203,E305,E402,E501,E721,E741,F403,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303, - # shebang has extra meaning in fbcode lints, so I think it's not worth trying - # to line this up with executable bit - EXE001, EXE002, - # these ignores are from flake8-bugbear; please fix! - B007,B008,B905 - # these ignores are from flake8-comprehensions; please fix! - C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415 -exclude = compute-wer.py,kaldi_io.py,__torch__,docs/conf.py diff --git a/models/audio/speech_recognition/conformer/igie/wenet/CODE_OF_CONDUCT.md b/models/audio/speech_recognition/conformer/igie/wenet/CODE_OF_CONDUCT.md deleted file mode 100644 index 66c2a4cafb77b81f9d8f7e65a485b841a6a347a9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,76 +0,0 @@ -# Contributor Covenant Code of Conduct - -## Our Pledge - -In the interest of fostering an open and welcoming environment, we as -contributors and maintainers pledge to making participation in our project and -our community a harassment-free experience for everyone, regardless of age, body -size, disability, ethnicity, sex characteristics, gender identity and expression, -level of experience, education, socio-economic status, nationality, personal -appearance, race, religion, or sexual identity and orientation. 
- -## Our Standards - -Examples of behavior that contributes to creating a positive environment -include: - -* Using welcoming and inclusive language -* Being respectful of differing viewpoints and experiences -* Gracefully accepting constructive criticism -* Focusing on what is best for the community -* Showing empathy towards other community members - -Examples of unacceptable behavior by participants include: - -* The use of sexualized language or imagery and unwelcome sexual attention or - advances -* Trolling, insulting/derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or electronic - address, without explicit permission -* Other conduct which could reasonably be considered inappropriate in a - professional setting - -## Our Responsibilities - -Project maintainers are responsible for clarifying the standards of acceptable -behavior and are expected to take appropriate and fair corrective action in -response to any instances of unacceptable behavior. - -Project maintainers have the right and responsibility to remove, edit, or -reject comments, commits, code, wiki edits, issues, and other contributions -that are not aligned to this Code of Conduct, or to ban temporarily or -permanently any contributor for other behaviors that they deem inappropriate, -threatening, offensive, or harmful. - -## Scope - -This Code of Conduct applies both within project spaces and in public spaces -when an individual is representing the project or its community. Examples of -representing a project or community include using an official project e-mail -address, posting via an official social media account, or acting as an appointed -representative at an online or offline event. Representation of a project may be -further defined and clarified by project maintainers. - -## Enforcement - -Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported by contacting the project team at mikelei@mobvoi.com. All -complaints will be reviewed and investigated and will result in a response that -is deemed necessary and appropriate to the circumstances. The project team is -obligated to maintain confidentiality with regard to the reporter of an incident. -Further details of specific enforcement policies may be posted separately. - -Project maintainers who do not follow or enforce the Code of Conduct in good -faith may face temporary or permanent repercussions as determined by other -members of the project's leadership. - -## Attribution - -This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, -available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html - -[homepage]: https://www.contributor-covenant.org - -For answers to common questions about this code of conduct, see -https://www.contributor-covenant.org/faq diff --git a/models/audio/speech_recognition/conformer/igie/wenet/CONTRIBUTING.md b/models/audio/speech_recognition/conformer/igie/wenet/CONTRIBUTING.md deleted file mode 100644 index cb56befc95b19d428b7953851caa69a622f5fb88..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/CONTRIBUTING.md +++ /dev/null @@ -1,36 +0,0 @@ -# Contributing guidelines - -## Pre-commit tidy/linting hook - -You'll need to install flake8 first. - -`pip3 install flake8==3.8.2` - -We use flake8 to perform additional formatting and semantic checking of code. 
-We provide a pre-commit git hook for performing these checks, before a commit -is created: - -```bash -ln -s ../../tools/git-pre-commit .git/hooks/pre-commit -``` - -You have to execute above command in wenet project root directory. -After that, each commit will be checked by flake8. - -If you do not set pre-commit, just run `flake8` in wenet project root directory -and fix all the problems. - -## Github checks - -After a pull request is submitted, some checks will run to check your code style. - -Below is an example where some checks fail. - -![github checks](docs/images/checks.png) - -You need to click the details to see the detailed info like the example below. - -![github checks](docs/images/check_detail.png) - -You have to fix all style problems according to the detailed info. - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/CPPLINT.cfg b/models/audio/speech_recognition/conformer/igie/wenet/CPPLINT.cfg deleted file mode 100644 index d3c898441efaec14fcd356efbefaa0ef3e237b57..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/CPPLINT.cfg +++ /dev/null @@ -1,2 +0,0 @@ -root=runtime/core -filter=-build/c++11 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/LICENSE b/models/audio/speech_recognition/conformer/igie/wenet/LICENSE deleted file mode 100644 index 261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. 
For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/README.md b/models/audio/speech_recognition/conformer/igie/wenet/README.md deleted file mode 100644 index 0afabf07bb2d2c97e060afadbb862bcd060c32d5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/README.md +++ /dev/null @@ -1,119 +0,0 @@ -# WeNet - -[![License](https://img.shields.io/badge/License-Apache%202.0-brightgreen.svg)](https://opensource.org/licenses/Apache-2.0) -[![Python-Version](https://img.shields.io/badge/Python-3.7%7C3.8-brightgreen)](https://github.com/wenet-e2e/wenet) - -[**Roadmap**](https://github.com/wenet-e2e/wenet/issues/1683) -| [**Docs**](https://wenet-e2e.github.io/wenet) -| [**Papers**](https://wenet-e2e.github.io/wenet/papers.html) -| [**Runtime (x86)**](https://github.com/wenet-e2e/wenet/tree/main/runtime/libtorch) -| [**Runtime (android)**](https://github.com/wenet-e2e/wenet/tree/main/runtime/android) -| [**Pretrained Models**](docs/pretrained_models.md) -| [**HuggingFace**](https://huggingface.co/spaces/wenet/wenet_demo) - -**We** share neural **Net** together. - -The main motivation of WeNet is to close the gap between research and production end-to-end (E2E) speech recognition models, -to reduce the effort of productionizing E2E models, and to explore better E2E models for production. - -## :fire: News - -* 2022.12: Horizon X3 pi BPU, see https://github.com/wenet-e2e/wenet/pull/1597, Kunlun Core XPU, see https://github.com/wenet-e2e/wenet/pull/1455, Raspberry Pi, see https://github.com/wenet-e2e/wenet/pull/1477, IOS, see https://github.com/wenet-e2e/wenet/pull/1549. -* 2022.11: TrimTail paper released, see https://arxiv.org/pdf/2211.00522.pdf -* 2022.10: Squeezeformer is supported, see https://github.com/wenet-e2e/wenet/pull/1447. -* 2022.07: RNN-T is supported now, see [rnnt](https://github.com/wenet-e2e/wenet/tree/main/examples/aishell/rnnt) for benchmark. - -## Highlights - -* **Production first and production ready**: The core design principle of WeNet. WeNet provides full stack solutions for speech recognition. - * *Unified solution for streaming and non-streaming ASR*: [U2++ framework](https://arxiv.org/pdf/2203.15455.pdf)--develop, train, and deploy only once. - * *Runtime solution*: built-in server [x86](https://github.com/wenet-e2e/wenet/tree/main/runtime/libtorch) and on-device [android](https://github.com/wenet-e2e/wenet/tree/main/runtime/android) runtime solution. - * *Model exporting solution*: built-in solution to export model to LibTorch/ONNX for inference. - * *LM solution*: built-in production-level [LM solution](docs/lm.md). - * *Other production solutions*: built-in contextual biasing, time stamp, endpoint, and n-best solutions. - -* **Accurate**: WeNet achieves SOTA results on a lot of public speech datasets. -* **Light weight**: WeNet is easy to install, easy to use, well designed, and well documented. - -## Performance Benchmark - -Please see `examples/$dataset/s0/README.md` for benchmark on different speech datasets. - -## Installation(Python Only) - -If you just want to use WeNet as a python package for speech recognition application, -just install it by `pip`, please note python 3.6+ is required. -``` sh -pip3 install wenetruntime -``` - -And please see [doc](runtime/binding/python/README.md) for usage. 
- - -## Installation(Training and Developing) - -- Clone the repo -``` sh -git clone https://github.com/wenet-e2e/wenet.git -``` - -- Install Conda: please see https://docs.conda.io/en/latest/miniconda.html -- Create Conda env: - -``` sh -conda create -n wenet python=3.8 -conda activate wenet -pip3 install -r requirements.txt -conda install pytorch=1.10.0 torchvision torchaudio=0.10.0 cudatoolkit=11.1 -c pytorch -c conda-forge -``` - -- Optionally, if you want to use x86 runtime or language model(LM), -you have to build the runtime as follows. Otherwise, you can just ignore this step. - -``` sh -# runtime build requires cmake 3.14 or above -cd runtime/libtorch -mkdir build && cd build && cmake -DGRAPH_TOOLS=ON .. && cmake --build . -``` - -## Discussion & Communication - -Please visit [Discussions](https://github.com/wenet-e2e/wenet/discussions) for further discussion. - -For Chinese users, you can aslo scan the QR code on the left to follow our offical account of WeNet. -We created a WeChat group for better discussion and quicker response. -Please scan the personal QR code on the right, and the guy is responsible for inviting you to the chat group. - -If you can not access the QR image, please access it on [gitee](https://gitee.com/robin1001/qr/tree/master). - -| | | -| ---- | ---- | - -Or you can directly discuss on [Github Issues](https://github.com/wenet-e2e/wenet/issues). - -## Acknowledge - -1. We borrowed a lot of code from [ESPnet](https://github.com/espnet/espnet) for transformer based modeling. -2. We borrowed a lot of code from [Kaldi](http://kaldi-asr.org/) for WFST based decoding for LM integration. -3. We referred [EESEN](https://github.com/srvk/eesen) for building TLG based graph for LM integration. -4. We referred to [OpenTransformer](https://github.com/ZhengkunTian/OpenTransformer/) for python batch inference of e2e models. - -## Citations - -``` bibtex -@inproceedings{yao2021wenet, - title={WeNet: Production oriented Streaming and Non-streaming End-to-End Speech Recognition Toolkit}, - author={Yao, Zhuoyuan and Wu, Di and Wang, Xiong and Zhang, Binbin and Yu, Fan and Yang, Chao and Peng, Zhendong and Chen, Xiaoyu and Xie, Lei and Lei, Xin}, - booktitle={Proc. 
Interspeech}, - year={2021}, - address={Brno, Czech Republic }, - organization={IEEE} -} - -@article{zhang2022wenet, - title={WeNet 2.0: More Productive End-to-End Speech Recognition Toolkit}, - author={Zhang, Binbin and Wu, Di and Peng, Zhendong and Song, Xingchen and Yao, Zhuoyuan and Lv, Hang and Xie, Lei and Yang, Chao and Pan, Fuping and Niu, Jianwei}, - journal={arXiv preprint arXiv:2203.15455}, - year={2022} -} -``` diff --git a/models/audio/speech_recognition/conformer/igie/wenet/README_CN.md b/models/audio/speech_recognition/conformer/igie/wenet/README_CN.md deleted file mode 100644 index dc7254512f5dad1dfe191fb486a0c5a6c7255bfd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/README_CN.md +++ /dev/null @@ -1,129 +0,0 @@ -# WeNet - -[**English version**](https://github.com/wenet-e2e/wenet/tree/main/README.md) - -[![License](https://img.shields.io/badge/License-Apache%202.0-brightgreen.svg)](https://opensource.org/licenses/Apache-2.0) -[![Python-Version](https://img.shields.io/badge/Python-3.7%7C3.8-brightgreen)](https://github.com/wenet-e2e/wenet) - -[**文档**](https://wenet-e2e.github.io/wenet/) -| [**训练模型教程 1**](https://wenet.org.cn/wenet/tutorial_librispeech.html) -| [**训练模型教程 2**](https://wenet.org.cn/wenet/tutorial_aishell.html) -| [**WeNet 论文**](https://wenet-e2e.github.io/wenet/papers.html) -| [**x86 识别服务**](https://github.com/wenet-e2e/wenet/tree/main/runtime/libtorch) -| [**android 本地识别**](https://github.com/wenet-e2e/wenet/tree/main/runtime/android) - - - -## 核心功能 - -WeNet 是一款面向工业落地应用的语音识别工具包,提供了从语音识别模型的训练到部署的一条龙服务,其主要特点如下: - -* 使用 conformer 网络结构和 CTC/attention loss 联合优化方法,统一的流式/非流式语音识别方案,具有业界一流的识别效果。 -* 提供云上和端上直接部署的方案,最小化模型训练和产品落地之间的工程工作。 -* 框架简洁,模型训练部分完全基于 pytorch 生态,不依赖于 kaldi 等复杂的工具。 -* 详细的注释和文档,非常适合用于学习端到端语音识别的基础知识和实现细节。 -* 支持时间戳,对齐,端点检测,语言模型等相关功能。 - - -## 1分钟 Demo - -**使用预训练模型和 docker 进行语音识别,1分钟(如果网速够快)搭建一个语音识别系统** - -下载官方提供的预训练模型,并启动 docker 服务,加载模型,提供 websocket 协议的语音识别服务。 - -``` sh -wget https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/aishell2/20210618_u2pp_conformer_libtorch.tar.gz -tar -xf 20210618_u2pp_conformer_libtorch.tar.gz -model_dir=$PWD/20210618_u2pp_conformer_libtorch -docker run --rm -it -p 10086:10086 -v $model_dir:/home/wenet/model wenetorg/wenet-mini:latest bash /home/run.sh -``` - -**实时识别** - -使用浏览器打开文件`wenet/runtime/libtorch/web/templates/index.html`,在 `WebSocket URL` 中填入 `ws://127.0.0.1:10086` (若在windows下通过wsl2运行docker, 则使用`ws://localhost:10086`) , 允许浏览器弹出的请求使用麦克风,即可通过麦克风进行实时语音识别。 - -![Runtime web](/docs/images/runtime_web.png) - - -## 训练语音识别模型 - -**配置环境** - -``` sh -git clone https://github.com/wenet-e2e/wenet.git -``` - -- 安装 Conda: https://docs.conda.io/en/latest/miniconda.html -- 创建 Conda 环境: - -``` sh -conda create -n wenet python=3.8 -conda activate wenet -pip3 install -r requirements.txt -conda install pytorch=1.10.0 torchvision torchaudio=0.10.0 cudatoolkit=11.1 -c pytorch -c conda-forge -``` - -**训练模型** - -使用中文 Aishell-1 数据集训练模型 -``` -cd examples/aishell/s0/ -bash run.sh --stage -1 -``` - -细节请阅读 [**训练模型教程**](https://wenet-e2e.github.io/wenet/tutorial_aishell.html) - - -### 新手常见问题 - -1. 请使用具有gpu的机器。确保cuda和torch都已经安装。wenet也支持cpu训练,但是速度非常很慢。 -2. 请使用支持bash的环境。windows的默认cmd是不支持bash的。 -3. run.sh脚本里,`export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"`, 改为自己要用的GPU id,比如你的机器有4张GPU卡,4张卡都用来训练,改为 `export CUDA_VISIBLE_DEVICES="0,1,2,3"` -4. run.sh脚本里,`data=/export/data/asr-data/OpenSLR/33/`设置为自己的目录。请使用绝对路径而不要用相对路径。 -5. 
如果继续训练出错,请先删除实验目录下的 ddp_init文件再试一试。 - - -## 技术支持 - -欢迎在 [Github Issues](https://github.com/wenet-e2e/wenet/issues) 中提交问题。 - -欢迎扫二维码加入微信讨论群,如果群人数较多,请添加右侧个人微信入群。 - -| | | -| ---- | ---- | - -## 致谢 - -WeNet 借鉴了一些优秀的开源项目,包括 - -1. Transformer 建模 [ESPnet](https://github.com/espnet/espnet) -2. WFST 解码 [Kaldi](http://kaldi-asr.org/) -3. TLG 构图 [EESEN](https://github.com/srvk/eesen) -4. Python Batch 推理 [OpenTransformer](https://github.com/ZhengkunTian/OpenTransformer/) - -## 引用 - -``` bibtex -@inproceedings{yao2021wenet, - title={WeNet: Production oriented Streaming and Non-streaming End-to-End Speech Recognition Toolkit}, - author={Yao, Zhuoyuan and Wu, Di and Wang, Xiong and Zhang, Binbin and Yu, Fan and Yang, Chao and Peng, Zhendong and Chen, Xiaoyu and Xie, Lei and Lei, Xin}, - booktitle={Proc. Interspeech}, - year={2021}, - address={Brno, Czech Republic }, - organization={IEEE} -} - -@article{zhang2020unified, - title={Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition}, - author={Zhang, Binbin and Wu, Di and Yao, Zhuoyuan and Wang, Xiong and Yu, Fan and Yang, Chao and Guo, Liyong and Hu, Yaguang and Xie, Lei and Lei, Xin}, - journal={arXiv preprint arXiv:2012.05481}, - year={2020} -} - -@article{wu2021u2++, - title={U2++: Unified Two-pass Bidirectional End-to-end Model for Speech Recognition}, - author={Wu, Di and Zhang, Binbin and Yang, Chao and Peng, Zhendong and Xia, Wenjing and Chen, Xiaoyu and Lei, Xin}, - journal={arXiv preprint arXiv:2106.05642}, - year={2021} -} -``` diff --git a/models/audio/speech_recognition/conformer/igie/wenet/ROADMAP.md b/models/audio/speech_recognition/conformer/igie/wenet/ROADMAP.md deleted file mode 100644 index 4d44b2ab6548b8a74cc20baf0371674839f68cb6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/ROADMAP.md +++ /dev/null @@ -1,49 +0,0 @@ -# WeNet Roadmap - -This roadmap for WeNet. -WeNet is a community-driven project and we love your feedback and proposals on where we should be heading. - -Please open up [issues](https://github.com/wenet-e2e/wenet/issues/) or -[discussion](https://github.com/wenet-e2e/wenet/discussions) on github to write your proposal. -Feel free to volunteer yourself if you are interested in trying out some items(they do not have to be on the list). - - -## WeNet 3.0 (2023.06) - -- [x] ONNX support, see https://github.com/wenet-e2e/wenet/pull/1103 -- [x] RNN-T support, see https://github.com/wenet-e2e/wenet/pull/1261 -- [ ] Self training, streaming -- [ ] Light weight, low latency, on-device model exploration - - [x] TrimTail, see https://github.com/wenet-e2e/wenet/pull/1487/, [paper link](https://arxiv.org/pdf/2211.00522.pdf) -- [ ] Audio-Visual speech recognition -- [ ] OS or Hardware Platforms - - [x] IOS, https://github.com/wenet-e2e/wenet/pull/1549 - - [x] Raspberry Pi, see https://github.com/wenet-e2e/wenet/pull/1477 - - [ ] Harmony OS -- [ ] ASIC XPU - - [x] Horizon X3 pi, BPU, see https://github.com/wenet-e2e/wenet/pull/1597 - - [x] Kunlun XPU, see https://github.com/wenet-e2e/wenet/pull/1455 -- [x] Public Model Hub Support - - [x] HuggingFace, see https://huggingface.co/spaces/wenet/wenet_demo - - [x] ModelScope, see https://modelscope.cn/models/wenet/u2pp_conformer-asr-cn-16k-online/summary - - [x] [Vosk](https://github.com/alphacep/vosk-api/) like models and API for developers. 
- - [x] Models(Chinese/English/Japanese/Korean/French/German/Spanish/Portuguese) - - [x] Chinese - - [x] English - - [x] API(python/c/c++/go/java) - - [x] python - -## WeNet 2.0 (2022.06) - -- [x] U2++ framework for better accuracy -- [x] n-gram + WFST language model solution -- [x] Context biasing(hotword) solution -- [x] Very big data training support with UIO -- [x] More dataset support, including WenetSpeech, GigaSpeech, HKUST and so on. - -## WeNet 1.0 (2021.02) - -- [x] Streaming solution(U2 framework) -- [x] Production runtime solution with `TorchScript` training and `LibTorch` inference. -- [x] Unified streaming and non-streaming model(U2 framework) - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/dataset/dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/dataset.py similarity index 86% rename from models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/dataset/dataset.py rename to models/audio/speech_recognition/conformer/igie/wenet/dataset.py index 6d799b5b5aea2d34546484b3fed5d45e2d5b6aa6..88a8cd15aec2277a36358883b25e929b179165e8 100644 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/dataset/dataset.py +++ b/models/audio/speech_recognition/conformer/igie/wenet/dataset.py @@ -18,8 +18,8 @@ import torch import torch.distributed as dist from torch.utils.data import IterableDataset -import wenet.dataset.processor as processor -from wenet.utils.file_utils import read_lists +import wenet.processor as processor +from wenet.file_utils import read_lists class Processor(IterableDataset): @@ -156,27 +156,13 @@ def Dataset(data_type, if speed_perturb: dataset = Processor(dataset, processor.speed_perturb) - feats_type = conf.get('feats_type', 'fbank') - assert feats_type in ['fbank', 'mfcc'] - if feats_type == 'fbank': - fbank_conf = conf.get('fbank_conf', {}) - dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) - elif feats_type == 'mfcc': - mfcc_conf = conf.get('mfcc_conf', {}) - dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf) + fbank_conf = conf.get('fbank_conf', {}) + dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) spec_aug = conf.get('spec_aug', True) - spec_sub = conf.get('spec_sub', False) - spec_trim = conf.get('spec_trim', False) if spec_aug: spec_aug_conf = conf.get('spec_aug_conf', {}) dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) - if spec_sub: - spec_sub_conf = conf.get('spec_sub_conf', {}) - dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf) - if spec_trim: - spec_trim_conf = conf.get('spec_trim_conf', {}) - dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf) if shuffle: shuffle_conf = conf.get('shuffle_conf', {}) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/Makefile b/models/audio/speech_recognition/conformer/igie/wenet/docs/Makefile deleted file mode 100644 index a025a1f0c74dfe301edb4403f42fde65eb204aa5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/docs/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line, and also -# from the environment for the first two. -SPHINXOPTS ?= -SPHINXBUILD ?= sphinx-build -SPHINXPROJ = Wenet -SOURCEDIR = . -BUILDDIR = _build - -# Put it first so that "make" without argument is like "make help". 
-help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/UIO.md b/models/audio/speech_recognition/conformer/igie/wenet/docs/UIO.md deleted file mode 100644 index dd2555694075893bb29d68018c1b656728ed6aab..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/docs/UIO.md +++ /dev/null @@ -1,177 +0,0 @@ -# UIO for WeNet - -In order to support the model training of industrial tens of millions of hours of speech dataset, the data processing -method UIO (Unified IO) has been updated in WeNet. The document will introduce UIO from the following sections: -Necessity of upgrading IO mothod, System design of UIO, Validation experiments, Usage of UIO, Q&A. - -## Necessity of upgrading IO mothod -The old IO method in WeNet is based on Pytorch's native Dataset. During training, it need to load all training audio -paths and correspondingly labels into the memory at one time, then randomly read data. In the case of industrial-grade -ultra-large-scale data (egs: more than 50,000 hours or 50 million or more audio), this method will cause the training -to fail to run for two reasons: -- Out of memory(OOM): The physical memory of the general machine is difficult to load the training data at one time. -- Slow down reading performance: In the case that the large-scale data memory cannot be used as a file cache, the training -data reading speed is greatly reduced. - -## System design of UIO -### Overall design -Inspired by the following industrial methods(egs: [webdataset](https://github.com/webdataset/webdataset), [TFRecord](https://www.tensorflow.org/tutorials/load_data/tfrecord)), -WeNet redesigned the IO method. Its core idea is to make the audio and labels of multiple small data(such as 1000 pieces), -into compressed packets (tar) and read them based on the IterableDataset of Pytorch. The advantages of this method is: -- Only the index information of the compressed package needs to be maintained in memory, which greatly saves memory and -solves the problem of OOM. -- The on-the-fly decompression is performed in the memory, and the data in the same compressed package is read in -sequence, which solves the problem of slow random reading performance. Different compressed packets can be read randomly -to ensure the global randomness of data. - -The new IO method takes into account both small datasets and large datasets, and provides two data reading methods. -We call it UIO. The overall design of UIO is shown in the figure below: - -![UIO System Design](./images/UIO_system.png) - -Some necessary explanations about the above figure: -- Small IO(raw) supports small datasets, which we call ``raw`` mode. This mode only supports local file reading. -The required documents must be sorted into Kaldi style file: wav.scp and text.(It's the same as before) -- Big IO(shared) supports large datasets, which we call ``shard`` mode. This mode can support both local file -reading and network cloud storage file reading. The required files must be sorted into compressed packages. Audio (wav) -and label (txt) are stored in a single compressed package in sequence. 
-
-### Chain IO
-Inspired by TFRecord's chain IO, UIO also adopts a chain implementation. In practical use, chain IO is more flexible,
-easier to extend and easier to debug. A TFRecord IO example is as follows:
-```python
-def read_dataset(filename, batch_size):
-    dataset = tf.data.TFRecordDataset(filename)
-    dataset = dataset.map(_parse_image_function, num_parallel_calls=tf.data.experimental.AUTOTUNE)
-    dataset = dataset.shuffle(500)
-    dataset = dataset.batch(batch_size, drop_remainder=True)
-    dataset = dataset.repeat()
-    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
-    return dataset
-```
-Following TFRecord IO, the UIO dataflow in WeNet is designed as shown in the figure below:
-
-![UIO dataflow](./images/UIO_dataflow.png)
-
-It includes the following modules:
-- tokenize module: converts the label into the specified modeling unit (e.g., char or BPE).
-
-- filter module: filters out unqualified training data.
-
-- resample module: optionally resamples the training data.
-
-- compute_fbank module: fbank feature extraction.
-
-- spec_augmentation module: feature augmentation.
-
-- shuffle module: shuffles data locally.
-
-- sort module: sorts data locally.
-
-- batch module: organizes multiple samples into a batch.
-
-- padding module: pads the data within the same batch.
-
-In addition, there are several parameters to note. First, the ``shuffle buffer`` and ``sort buffer`` sizes:
-* ``Shuffle buffer``: shuffles the data. It is recommended that this buffer be larger than the number of samples
-contained in a single shard, so that each shuffle effectively mixes data across two shards, which increases the
-randomness of the data (e.g., if each shard contains 1000 samples, you can set the shuffle buffer to 1500).
-* ``Sort buffer``: sorts the data by the number of frames. This operation is very important and can greatly
-improve the training speed.
-
-Second, ``prefetch``:
-``Prefetch`` is used in the PyTorch ``DataLoader`` to pre-read data, at the granularity of the final training batches.
-The default value is 2, i.e., two batches are pre-read by default. In the UIO design, because of the buffers described
-above, the pre-read data may already be sitting in a buffer, so no real pre-read happens; the buffer is only refilled
-on the fly once its data runs out during a later step, and at that moment training blocks on reading data. In short,
-when prefetch is very small, training will occasionally block on IO, so you can set a larger prefetch to avoid this
-problem. (A simplified sketch of this chained-processor design is given after the validation experiments below.)
-
-
-## Validation experiments
-So far, we have verified the accuracy of UIO on the aishell (200 hours) and wenetspeech (10000 hours) datasets.
-### Aishell (``raw`` vs ``shard``)
-
-|IO|CER|
-|:---|:---|
-|Old|4.61|
-|UIO(``Raw``)|4.63|
-|UIO(``Shard``)|4.67|
-
-### WenetSpeech (``shard``)
-
-![UIO WenetSpeech](./images/UIO_wenetspeech_cer.png)
-
-WeNet and ESPnet use a similar model structure and parameter configuration and achieve a similar recognition rate,
-which confirms the correctness of UIO in WeNet. During training we also observed an overall GPU utilization of
-80%-90% with UIO, indicating that the IO reading efficiency is very high.
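-
-As a rough companion to the chain design above, the following is a simplified sketch of the Processor/buffer idea in
-plain PyTorch terms. It is not the actual implementation in WeNet's dataset/processor code, and the buffer size and
-``prefetch_factor`` shown are example values rather than recommendations.
-
-```python
-# Simplified sketch: chained IterableDataset processors with a buffered shuffle.
-import random
-from torch.utils.data import IterableDataset, DataLoader
-
-class Processor(IterableDataset):
-    """Wrap a source dataset with a generator-style transform, one stage per wrapper."""
-    def __init__(self, source, fn, **kwargs):
-        self.source, self.fn, self.kwargs = source, fn, kwargs
-
-    def __iter__(self):
-        # fn consumes an iterator of samples and yields transformed samples
-        return self.fn(iter(self.source), **self.kwargs)
-
-def shuffle(data, buffer_size=1500):
-    """Buffered shuffle: hold up to buffer_size samples, then yield them in random order."""
-    buf = []
-    for sample in data:
-        buf.append(sample)
-        if len(buf) >= buffer_size:
-            random.shuffle(buf)
-            yield from buf
-            buf = []
-    random.shuffle(buf)
-    yield from buf
-
-# Chained usage (raw_dataset and tokenize are placeholders):
-# dataset = Processor(Processor(raw_dataset, tokenize), shuffle, buffer_size=1500)
-# loader = DataLoader(dataset, batch_size=None, num_workers=2, prefetch_factor=4)
-```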
- -## Usage of UIO -For detailed usage of UIO, please refer to the aishell dataset example: -https://github.com/wenet-e2e/wenet/blob/main/examples/aishell/s0/run.sh -At present, all datasets in WeNet have used UIO as the default data preparation. - -There are three parameters related to UIO in the training script train.py: -- ``train_data``(``cv_data``/``test_data``): data.list -- ``data_type``: raw/shard -- ``symbol_table``: specify modeling unit - -For example: -```shell -python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type $data_type \ - --symbol_table $dict \ - --train_data $feat_dir/$train_set/data.list \ - --cv_data $feat_dir/dev/data.list \ - ... -``` -If data_type is ``raw``, the format of data.list is as follows: -``` -{"key": "BAC009S0002W0122", "wav": "/export/data/asr-data/OpenSLR/33/data_aishell/wav/train/S0002/BAC009S0002W0122.wav", "txt": "而对楼市成交抑制作用最大的限购"} -{"key": "BAC009S0002W0123", "wav": "/export/data/asr-data/OpenSLR/33/data_aishell/wav/train/S0002/BAC009S0002W0123.wav", "txt": "也成为地方政府的眼中钉"} -{"key": "BAC009S0002W0124", "wav": "/export/data/asr-data/OpenSLR/33/data_aishell/wav/train/S0002/BAC009S0002W0124.wav", "txt": "自六月底呼和浩特市率先宣布取消限购后"} -``` -Each line is a json serialized string, which contains three fields: ``key``, ``wav`` and ``txt``. - -If data_type is ``shard``, the format of data.list is as follows: -``` -# [option 1: local] -/export/maryland/binbinzhang/code/wenet/examples/aishell/s3/raw_wav/train/shards/shards_000000000.tar.gz -/export/maryland/binbinzhang/code/wenet/examples/aishell/s3/raw_wav/train/shards/shards_000000001.tar.gz -/export/maryland/binbinzhang/code/wenet/examples/aishell/s3/raw_wav/train/shards/shards_000000002.tar.gz - -# [option 2: network(egs: OSS)] -https://examplebucket.oss-cn-hangzhou.aliyuncs.com/exampledir/1.tar.gz -https://examplebucket.oss-cn-hangzhou.aliyuncs.com/exampledir/2.tar.gz -``` - -## Q&A -Q1: How to operate distributed partition of training data? - -A: According to rank and num_workers can segment the data. for example: -```python -class DistributedSampler: - def __init__(self, shuffle=True, partition=True): - self.epoch = -1 - self.update() - self.shuffle = shuffle - self.partition = partition - - def set_epoch(self, epoch): - self.epoch = epoch - - def sample(self, data): - data = list(range(len(data))) - if self.partition: - if self.shuffle: - random.Random(self.epoch).shuffle(data) - data = data[self.rank::self.world_size] - data = data[self.worker_id::self.num_workers] - return data -``` - -Q2: How to deal with unbalanced data? - -A: Use model.join() to handle the imbalance of data allocated on each rank. Please refer [this](https://pytorch.org/tutorials/advanced/generic_join.html#how-does-join-work). \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/conf.py b/models/audio/speech_recognition/conformer/igie/wenet/docs/conf.py deleted file mode 100644 index 49abc10713f4caa97c2d076352a11b7b121b7c29..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/docs/conf.py +++ /dev/null @@ -1,71 +0,0 @@ -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. 
For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) - - -# -- Project information ----------------------------------------------------- - -project = 'Wenet' -copyright = '2020, wenet-team' -author = 'wenet-team' - - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - "nbsphinx", - "sphinx.ext.autodoc", - 'sphinx.ext.napoleon', - 'sphinx.ext.viewcode', - "sphinx.ext.mathjax", - "sphinx.ext.todo", - # "sphinxarg.ext", - "sphinx_markdown_tables", - 'recommonmark', - 'sphinx_rtd_theme', -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -source_suffix = { - '.rst': 'restructuredtext', - '.txt': 'markdown', - '.md': 'markdown', -} - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] - - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# html_theme = 'alabaster' -html_theme = "sphinx_rtd_theme" - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/context.md b/models/audio/speech_recognition/conformer/igie/wenet/docs/context.md deleted file mode 100644 index 881d119ea05e4a591d42cba0d4efd123fc56a80f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/docs/context.md +++ /dev/null @@ -1,138 +0,0 @@ -## Context Biasing - -In the practical application of ASR, the recognition effect of commonly used words is better, but for some unique words, the recognition accuracy may be low. Contextual biasing is the problem of injecting prior knowledge into an ASR system during inference, for example a user’s favorite songs, contacts, apps or location. Conventional ASR systems perform contextual biasing by building an n-gram finite state transducer (FST) from a list of biasing phrases, which is composed on-the-fly with the decoder graph during decoding. This helps to bias the recognition result towards the n-grams contained in the contextual FST, and thus improves accuracy in certain scenarios. - -In WeNet, we compute the biasing scores $P_C(\mathbf y)$, which are interpolated with the base model $P(\mathbf y|\mathbf x)$ using shallow-fusion during beam search, including CTC prefix beam search and CTC WFST beam search. 
- -$$ -\mathbf y^*=\mathrm{arg\,max\,log}P(\mathbf y|\mathbf x)+\lambda\,\mathrm{log}\,P_C(\mathbf y) -$$ - -where, $\lambda$ is a tunable hyperparameter controlling how much the contextual LM influences the overall model score during beam search. - -### Context Graph - -If we want to improve the score of the word "cat", and the biasing score $\lambda\,\mathrm{log}\,P_C(\mathbf y)$ of each character is 0.25. The context graph can be constructed as follow: - -![context graph](images/context_graph.png) - -In the decoding process, when the corresponding prefix is matched, the corresponding score reward will be obtained. In order to avoid artificially boosting prefixes which match early on but do not match the entire phrase, we add a special failure arc which removes the boosted score. - -WeNet records only one state for each prefix, to easily determine the boundary of the matched hot word. That is, only one hot word can be matched at the same time, and only after the hot word matching succeeds or fails can other hot words start matching. - -``` c++ -int ContextGraph::GetNextState(int cur_state, int word_id, float* score, - bool* is_start_boundary, bool* is_end_boundary) { - int next_state = 0; - // Traverse the arcs of current state. - for (fst::ArcIterator aiter(*graph_, cur_state); !aiter.Done(); - aiter.Next()) { - const fst::StdArc& arc = aiter.Value(); - if (arc.ilabel == 0) { - // Record the score of the backoff arc. It might will be covered. - *score = arc.weight.Value(); - } else if (arc.ilabel == word_id) { - // If they match, record the next state and the score. - next_state = arc.nextstate; - *score = arc.weight.Value(); - // Check whether is the boundary of the hot word. - if (cur_state == 0) { - *is_start_boundary = true; - } - if (graph_->Final(arc.nextstate) == fst::StdArc::Weight::One()) { - *is_end_boundary = true; - } - break; - } - } - return next_state; -} -``` - -### CTC Prefix Beam Search - -In the process of CTC prefix beam search, each prefix needs to record the hot word matching information. After appending the current output character, if the prefix changes, call the above function `GetNextState` to update the state and score of the hot word. If it is the start or end of a hot word, it is also necessary to record the position, which are used to insert the start tag and end tag in the result, such as: "The \cat\ is in the bag". - -### CTC WFST Beam Search - -WeNet adopts the Lattice Faster Online Decoder from Kaldi for WFST beam search. We have to modify the `lattice-faster-decoder.cc` to support context biasing. - -WFST beam search decodes in the TLG graph according to the CTC outputs. If we bias the input label of the TLG, we need to compose the context graph with the Token graph. Finally, we decide to bias TLG's output towards the contextual fst. 
We need to modify the `ProcessEmitting` and `ProcessNonemitting` functions as follow: - -```c++ -Elem *e_next = - FindOrAddToken(arc.nextstate, frame + 1, tot_cost, tok, NULL); -// NULL: no change indicator needed - -// ========== Context code BEGIN =========== -bool is_start_boundary = false; -bool is_end_boundary = false; -float context_score = 0; -if (context_graph_) { - if (arc.olabel == 0) { - e_next->val->context_state = tok->context_state; - } else { - e_next->val->context_state = context_graph_->GetNextState( - tok->context_state, arc.olabel, &context_score, - &is_start_boundary, &is_end_boundary); - graph_cost -= context_score; - } -} -// ========== Context code END ========== - -// Add ForwardLink from tok to next_tok (put on head of list -// tok->links) -tok->links = new ForwardLinkT(e_next->val, arc.ilabel, arc.olabel, - graph_cost, ac_cost, is_start_boundary, - is_end_boundary, tok->links); -tok->links->context_score = context_score; -``` - -### Pruning - -The backoff arc will return the accumulated scores to a single ForwardLink. It leads to the cost of that ForwardLink is too large. We have to remove the cost returned by backoff arc before pruning. - -```c++ -void LatticeFasterDecoderTpl::PruneForwardLinks( - int32 frame_plus_one, bool *extra_costs_changed, bool *links_pruned, - BaseFloat delta) { - ... - BaseFloat link_extra_cost = - next_tok->extra_cost + - ((tok->tot_cost + link->acoustic_cost + link->graph_cost) - - next_tok->tot_cost); // difference in brackets is >= 0 - // ========== Context code BEGIN =========== - // graph_cost contains the score of hot word - // link->context_score < 0 means the hot word of the link is returned from backoff arc - if (link->context_score < 0) { - link_extra_cost += link->context_score; - } - // ========== Context code END ========== - // link_exta_cost is the difference in score between the best paths - // through link source state and through link destination state -``` - -### Usage - -1. Specify the `--context_path` to a text file. - - Each line of the file contains a context. - - Each context can be split into words with the symbol_table of the ASR model (It means there is no oov in the context). -2. Specify the `--context_score`, the reward of each word in the context. 
- -```bash -cd /home/wenet/runtime/libtorch -export GLOG_logtostderr=1 -export GLOG_v=2 -wav_path=docker_resource/test.wav -context_path=docker_resource/context.txt -model_dir=docker_resource/model -./build/decoder_main \ - --chunk_size -1 \ - --wav_path $wav_path \ - --model_path $model_dir/final.zip \ - --context_path $context_path \ - --context_score 3 \ - --unit_path $model_dir/units.txt 2>&1 | tee log.txt -``` - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/UIO_dataflow.png b/models/audio/speech_recognition/conformer/igie/wenet/docs/images/UIO_dataflow.png deleted file mode 100644 index dce9f0128b8a6fc610f2d8dc8ec5cface0b1553a..0000000000000000000000000000000000000000 Binary files a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/UIO_dataflow.png and /dev/null differ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/UIO_system.png b/models/audio/speech_recognition/conformer/igie/wenet/docs/images/UIO_system.png deleted file mode 100644 index 1861daf61c6265b08e60fb4661f17654823ad0e4..0000000000000000000000000000000000000000 Binary files a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/UIO_system.png and /dev/null differ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/UIO_wenetspeech_cer.png b/models/audio/speech_recognition/conformer/igie/wenet/docs/images/UIO_wenetspeech_cer.png deleted file mode 100644 index 223e9ccf9422aa4fe567a27c1778d2f61c0c610c..0000000000000000000000000000000000000000 Binary files a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/UIO_wenetspeech_cer.png and /dev/null differ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/check_detail.png b/models/audio/speech_recognition/conformer/igie/wenet/docs/images/check_detail.png deleted file mode 100644 index 45c08be40bfd530d8bddfb091aa7d6340afa1bf2..0000000000000000000000000000000000000000 Binary files a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/check_detail.png and /dev/null differ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/checks.png b/models/audio/speech_recognition/conformer/igie/wenet/docs/images/checks.png deleted file mode 100644 index 14f0124875170bff4d4328f9b3295081db85473e..0000000000000000000000000000000000000000 Binary files a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/checks.png and /dev/null differ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/context_graph.png b/models/audio/speech_recognition/conformer/igie/wenet/docs/images/context_graph.png deleted file mode 100644 index c89cd37f7b92168733aeba538a5260ed25ca39d6..0000000000000000000000000000000000000000 Binary files a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/context_graph.png and /dev/null differ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/lm_system.png b/models/audio/speech_recognition/conformer/igie/wenet/docs/images/lm_system.png deleted file mode 100644 index 76b428d8297b5c4c36d26831aff6692a42318a84..0000000000000000000000000000000000000000 Binary files a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/lm_system.png and /dev/null differ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/runtime_android.gif b/models/audio/speech_recognition/conformer/igie/wenet/docs/images/runtime_android.gif deleted file mode 100644 index 
ab060b4f45cf580c332e41d146c620ec0b212a0f..0000000000000000000000000000000000000000 Binary files a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/runtime_android.gif and /dev/null differ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/runtime_server.gif b/models/audio/speech_recognition/conformer/igie/wenet/docs/images/runtime_server.gif deleted file mode 100644 index f418c3db1c67c90b4cfa6e57bc5e1542c88c5323..0000000000000000000000000000000000000000 Binary files a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/runtime_server.gif and /dev/null differ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/runtime_web.png b/models/audio/speech_recognition/conformer/igie/wenet/docs/images/runtime_web.png deleted file mode 100644 index 1788eadc425643496ef16ac36579a297d7d54ba1..0000000000000000000000000000000000000000 Binary files a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/runtime_web.png and /dev/null differ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/subsampling_overalp.gif b/models/audio/speech_recognition/conformer/igie/wenet/docs/images/subsampling_overalp.gif deleted file mode 100644 index 8bffe67315bd89cc4fbfda2c4505d8918914dce6..0000000000000000000000000000000000000000 Binary files a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/subsampling_overalp.gif and /dev/null differ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/u2.gif b/models/audio/speech_recognition/conformer/igie/wenet/docs/images/u2.gif deleted file mode 100644 index a4a62e8bd1fc385be31040acefd57816469d361a..0000000000000000000000000000000000000000 Binary files a/models/audio/speech_recognition/conformer/igie/wenet/docs/images/u2.gif and /dev/null differ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/index.rst b/models/audio/speech_recognition/conformer/igie/wenet/docs/index.rst deleted file mode 100644 index 9eb920f3c46dc5a13db285d45843c4e8b2303028..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/docs/index.rst +++ /dev/null @@ -1,33 +0,0 @@ -.. Wenet documentation master file, created by - sphinx-quickstart on Thu Dec 3 11:43:53 2020. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Welcome to Wenet's documentation! -================================= - - -Wenet is an tansformer-based end-to-end ASR toolkit. - -.. toctree:: - :maxdepth: 1 - :caption: Tutorial: - - ./python_binding.md - ./papers.md - ./tutorial_librispeech.md - ./tutorial_aishell.md - ./pretrained_models.md - ./lm.md - ./context.md - ./runtime.md - ./jit_in_wenet.md - ./UIO.md - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/jit_in_wenet.md b/models/audio/speech_recognition/conformer/igie/wenet/docs/jit_in_wenet.md deleted file mode 100644 index 650090d9e6f6f19f0081df912f750eee3820a713..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/docs/jit_in_wenet.md +++ /dev/null @@ -1,31 +0,0 @@ -# JIT in WeNet - -We want that our PyTorch model can be directly exported by torch.jit.script method, -which is essential for deploying the model to production. - -See the following resource for how to deploy PyTorch models in production in details. 
-- [INTRODUCTION TO TORCHSCRIPT](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html)
-- [TORCHSCRIPT LANGUAGE REFERENCE](https://pytorch.org/docs/stable/jit_language_reference.html#language-reference)
-- [LOADING A TORCHSCRIPT MODEL IN C++](https://pytorch.org/tutorials/advanced/cpp_export.html)
-- [TorchScript and PyTorch JIT | Deep Dive](https://www.youtube.com/watch?v=2awmrMRf0dA&t=574s)
-- [Research to Production: PyTorch JIT/TorchScript Updates](https://www.youtube.com/watch?v=St3gdHJzic0)
-
-To ensure this, we try to export the model before the training stage.
-If the export fails, we should modify the training code to satisfy the export requirements.
-
-``` python
-# See in wenet/bin/train.py
-script_model = torch.jit.script(model)
-script_model.save(os.path.join(args.model_dir, 'init.zip'))
-```
-
-Two principles should be taken into consideration when we contribute Python code
-to WeNet, especially for subclasses of torch.nn.Module and their forward functions.
-
-1. Know what is allowed and what is disallowed.
-    - [Torch and Tensor Unsupported Attributes](https://pytorch.org/docs/master/jit_unsupported.html#jit-unsupported)
-    - [Python Language Reference Coverage](https://pytorch.org/docs/master/jit_python_reference.html#python-language-reference)
-
-2. Try to use explicit typing as much as possible. You can also do type checking
-enforced by typeguard; see https://typeguard.readthedocs.io/en/latest/userguide.html for details.
-
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/lm.md b/models/audio/speech_recognition/conformer/igie/wenet/docs/lm.md
deleted file mode 100644
index 12acce16f7f5fcb9d5e8e61993eb6164396849b7..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/docs/lm.md
+++ /dev/null
@@ -1,106 +0,0 @@
-# LM for WeNet
-
-WeNet uses an n-gram based statistical language model and the WFST framework to support custom language models.
-The LM is only supported in the WeNet runtime.
-
-## Motivation
-
-Why an n-gram based LM? This may be the first question many people ask.
-Now that RNN- and Transformer-based LMs are in full swing, why does WeNet go backward?
-The reason is simple: productivity.
-The n-gram-based language model has mature and complete training tools,
-can be trained on any amount of corpus, trains very fast, is easy to hotfix,
-and has a wide range of mature applications in real products.
-
-Why WFST? This may be the second question many people ask.
-Both industry and research have been working hard to move away from traditional speech recognition,
-especially its complex decoding technology, so why does WeNet go back to it?
-The reason is also very simple: productivity.
-WFST is a standard and powerful tool in traditional speech recognition.
-Based on this solution, we have mature and complete bug-fix and product solutions;
-for example, we can use the WFST replace operation for class-based personalization such as contact recognition.
-
-Therefore, just like WeNet's design goal "Production first and Production Ready",
-LM support in WeNet also puts productivity first,
-and it draws on many productive tools and solutions accumulated in traditional speech recognition.
-The differences from traditional speech recognition are:
-
-1. The training in WeNet is purely end to end.
-2. As described below, the LM is optional in decoding; you can choose whether to use it according to your needs and application scenarios.
-
-
-## System Design
-
-The whole system is shown in the picture below. There are two ways to generate the N-best.
-
-![LM System Design](./images/lm_system.png)
-
-1. Without LM, we use CTC prefix beam search to generate the N-best.
-2. With LM, we use CTC WFST search to generate the N-best; CTC WFST search is the traditional WFST-based decoder.
-
-There are two main parts to the CTC WFST based search.
-
-The first is building the decoding graph, which composes the model unit T, the lexicon L and the language model G into one unified graph TLG, in which:
-1. T is the model unit used in E2E training. Typically it is a char in Chinese, and a char or BPE token in English.
-2. L is the lexicon, and the lexicon is very simple: we just split a word into its modeling-unit sequence.
-For example, the word "我们" is split into two chars "我 们", and the word "APPLE" is split into five letters "A P P L E".
-There are no phonemes, and there is no need to design pronunciations by hand.
-3. G is the language model, namely the n-gram compiled into the standard WFST representation.
-
-The second is the decoder, which is the same as the traditional decoder and uses the standard Viterbi beam search algorithm in decoding.
-
-## Implementation
-
-WeNet draws on the decoder and related tools in Kaldi to support LM and WFST based decoding.
-For ease of use and to keep WeNet self-contained, we directly migrated the decoding-related code in Kaldi to [this directory](https://github.com/wenet-e2e/wenet/tree/main/runtime/core/kaldi) in the WeNet runtime,
-and modified and organized it according to the following principles:
-1. To minimize changes, the migrated code keeps the same directory structure as the original.
-2. We use GLOG to replace the log system in Kaldi.
-3. We modify the code format to meet the lint requirements of the code style in WeNet.
-
-The core code is https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_wfst_beam_search.cc,
-which wraps the LatticeFasterDecoder in Kaldi, and we use blank frame skipping to speed up decoding.
-
-In addition, WeNet also migrated the related tools for building the decoding graph,
-such as arpa2fst, fstdeterminizestar, fsttablecompose and fstminimizeencoded.
-So all the LM-related tools are built in and can be used out of the box.
-
-
-## Results
-
-We get a consistent gain (3%~10%) on different datasets,
-including aishell, aishell2, and librispeech;
-please go to the corresponding dataset example for the details.
-
-## How to use?
-
-Here is an example from aishell, which shows how to prepare the dictionary, how to train the LM,
-how to build the graph, and how to decode with the runtime.
- -``` sh -# 7.1 Prepare dict -unit_file=$dict -mkdir -p data/local/dict -cp $unit_file data/local/dict/units.txt -tools/fst/prepare_dict.py $unit_file ${data}/resource_aishell/lexicon.txt \ - data/local/dict/lexicon.txt -# 7.2 Train lm -lm=data/local/lm -mkdir -p $lm -tools/filter_scp.pl data/train/text \ - $data/data_aishell/transcript/aishell_transcript_v0.8.txt > $lm/text -local/aishell_train_lms.sh -# 7.3 Build decoding TLG -tools/fst/compile_lexicon_token_fst.sh \ - data/local/dict data/local/tmp data/local/lang -tools/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1; -# 7.4 Decoding with runtime -./tools/decode.sh --nj 16 \ - --beam 15.0 --lattice_beam 7.5 --max_active 7000 \ - --blank_skip_thresh 0.98 --ctc_weight 0.5 --rescoring_weight 1.0 \ - --fst_path data/lang_test/TLG.fst \ - --dict_path data/lang_test/words.txt \ - data/test/wav.scp data/test/text $dir/final.zip \ - data/lang_test/units.txt $dir/lm_with_runtime -``` diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/make.bat b/models/audio/speech_recognition/conformer/igie/wenet/docs/make.bat deleted file mode 100644 index a42274a63310b8672adb4eb1bbd2c170cdc7684a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/docs/make.bat +++ /dev/null @@ -1,35 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=. -set BUILDDIR=_build - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/papers.md b/models/audio/speech_recognition/conformer/igie/wenet/docs/papers.md deleted file mode 100644 index f006314f31324594e729c48ef7df8b0bd5c51afa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/docs/papers.md +++ /dev/null @@ -1,5 +0,0 @@ -## Papers - -* [WeNet: Production Oriented Streaming and Non-streaming End-to-End Speech Recognition Toolkit](https://arxiv.org/pdf/2102.01547.pdf), accepted by InterSpeech 2021. -* [WeNet 2.0: More Productive End-to-End Speech Recognition Toolkit](https://arxiv.org/pdf/2203.15455.pdf), accepted by InterSpeech 2022. - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/pretrained_models.en.md b/models/audio/speech_recognition/conformer/igie/wenet/docs/pretrained_models.en.md deleted file mode 100644 index 1aaac5a1760650d2150decd644ed4ec30433e520..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/docs/pretrained_models.en.md +++ /dev/null @@ -1,26 +0,0 @@ -# Pretrained Models in WeNet - -## Model Types -We provide two types of pretrained model in WeNet to facilitate users with different requirements. - -1. 
**Checkpoint Model**, with suffix **.pt**, the model trained and saved as checkpoint by WeNet python code, you can reproduce our published result with it, or you can use it as checkpoint to continue. - -2. **Runtime Model**, with suffix **.zip**, you can directly use `runtime model` in our [x86](https://github.com/wenet-e2e/wenet/tree/main/runtime/libtorch) or [android](https://github.com/wenet-e2e/wenet/tree/main/runtime/android) runtime, the `runtime model` is export by Pytorch JIT on the `checkpoint model`. And the runtime models has been quantized to reduce the model size and network traffic. - -## Model License - -The pretrained model in WeNet follows the license of it's corresponding dataset. -For example, the pretrained model on LibriSpeech follows `CC BY 4.0`, since it is used as license of the LibriSpeech dataset, see http://openslr.org/12/. - -## Model List - -Here is a list of the pretrained models on different datasets. The model structure, model size, and download link are given. - -| Datasets | Languages | Checkpoint Model | Runtime Model | Contributor | -|--- |--- |--- |--- |--- | -| [aishell](../examples/aishell/s0/README.md) | CN | [Conformer](https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/aishell/20210601_u2%2B%2B_conformer_exp.tar.gz) | [Conformer](https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/aishell/20210601_u2%2B%2B_conformer_libtorch.tar.gz) | | -| [aishell2](../examples/aishell2/s0/README.md) | CN | [Conformer](https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/aishell2/20210618_u2pp_conformer_exp.tar.gz) | [Conformer](https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/aishell2/20210618_u2pp_conformer_libtorch.tar.gz) | | -| [gigaspeech](../examples/gigaspeech/s0/README.md) | EN | [Conformer](https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/gigaspeech/gigaspeech_u2pp_conformer_exp.tar.gz) | [Conformer](https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/gigaspeech/gigaspeech_u2pp_conformer_libtorch.tar.gz) | | -| [librispeech](../examples/librispeech/s0/README.md) | EN | [Conformer](https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/librispeech/20210610_u2pp_conformer_exp.tar.gz) | [Conformer](https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/librispeech/20210610_u2pp_conformer_libtorch.tar.gz) | | -| [multi_cn](../examples/multi_cn/s0/README.md) | CN | [Conformer](https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/multi_cn/20210815_unified_conformer_exp.tar.gz) | [Conformer](https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/multi_cn/20210815_unified_conformer_libtorch.tar.gz) | | -| [wenetspeech](../examples/wenetspeech/s0/README.md) | CN | [Conformer](https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/wenetspeech/wenetspeech_u2pp_conformer_exp.tar.gz) | [Conformer](https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/wenetspeech/wenetspeech_u2pp_conformer_libtorch.tar.gz) | | diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/pretrained_models.md b/models/audio/speech_recognition/conformer/igie/wenet/docs/pretrained_models.md deleted file mode 100644 index ee4fc02e524163ef91221413e9a190b206aaa93f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/docs/pretrained_models.md +++ /dev/null @@ -1,28 +0,0 @@ -# Pretrained Models in WeNet - -## Model Types -We provide two types of pretrained model in WeNet to facilitate users with different requirements. - -1. 
**Checkpoint Model**, with suffix **.pt**, the model trained and saved as checkpoint by WeNet python code, you can reproduce our published result with it, or you can use it as checkpoint to continue. - -2. **Runtime Model**, with suffix **.zip**, you can directly use `runtime model` in our [x86](https://github.com/wenet-e2e/wenet/tree/main/runtime/libtorch) or [android](https://github.com/wenet-e2e/wenet/tree/main/runtime/android) runtime, the `runtime model` is export by Pytorch JIT on the `checkpoint model`. And the runtime models has been quantized to reduce the model size and network traffic. - -## Model License - -The pretrained model in WeNet follows the license of it's corresponding dataset. -For example, the pretrained model on LibriSpeech follows `CC BY 4.0`, since it is used as license of the LibriSpeech dataset, see http://openslr.org/12/. - -## Model List - -Here is a list of the pretrained models on different datasets. - -For non-Chinese users, please visit [Pretrained Models(En)](./pretrained_models.en.md) to download. - -| Datasets | Languages | Checkpoint Model | Runtime Model | Contributor | -|--- |--- |--- |--- |--- | -| [aishell](../examples/aishell/s0/README.md) | CN | [Conformer](https://docs.qq.com/form/page/DZnRkVHlnUk5QaFdC) | [Conformer](https://docs.qq.com/form/page/DZnRkVHlnUk5QaFdC) | | -| [aishell2](../examples/aishell2/s0/README.md) | CN | [Conformer](https://docs.qq.com/form/page/DZnRkVHlnUk5QaFdC) | [Conformer](https://docs.qq.com/form/page/DZnRkVHlnUk5QaFdC) | | -| [gigaspeech](../examples/gigaspeech/s0/README.md) | EN | [Conformer](https://docs.qq.com/form/page/DZnRkVHlnUk5QaFdC) | [Conformer](https://docs.qq.com/form/page/DZnRkVHlnUk5QaFdC) | | -| [librispeech](../examples/librispeech/s0/README.md) | EN | [Conformer](https://docs.qq.com/form/page/DZnRkVHlnUk5QaFdC) | [Conformer](https://docs.qq.com/form/page/DZnRkVHlnUk5QaFdC) | | -| [multi_cn](../examples/multi_cn/s0/README.md) | CN | [Conformer](https://docs.qq.com/form/page/DZnRkVHlnUk5QaFdC) | [Conformer](https://docs.qq.com/form/page/DZnRkVHlnUk5QaFdC) | | -| [wenetspeech](../examples/wenetspeech/s0/README.md) | CN | [Conformer](https://docs.qq.com/form/page/DZnRkVHlnUk5QaFdC) | [Conformer](https://docs.qq.com/form/page/DZnRkVHlnUk5QaFdC) | | diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/python_binding.md b/models/audio/speech_recognition/conformer/igie/wenet/docs/python_binding.md deleted file mode 100644 index b5a86ff853d4fa13540d2926706e0c1113eaee64..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/docs/python_binding.md +++ /dev/null @@ -1,105 +0,0 @@ -# WeNet Python Binding - -This is a python binding of WeNet. - -WeNet is a production first and production ready end-to-end speech recognition toolkit. - -The best things of the binding are: - -1. Multiple languages supports, including English, Chinese. Other languages are in development. -2. Non-streaming and streaming API -3. N-best, contextual biasing, and timestamp supports, which are very important for speech productions. -4. Alignment support. You can get phone level alignments this tool, on developing. - -## Install - -Python 3.6+ is required. - -``` sh -pip3 install wenetruntime -``` - -## Usage - -Note: - -1. For macOS, wenetruntime packed `libtorch.so`, so we can't import torch and wenetruntime at the same time. -2. For Windows and Linux, wenetruntime depends on torch. Please install and import the same version `torch` as wenetruntime. 
- -### Non-streaming Usage - -``` python -import sys -import torch -import wenetruntime as wenet - -wav_file = sys.argv[1] -decoder = wenet.Decoder(lang='chs') -ans = decoder.decode_wav(wav_file) -print(ans) -``` - -You can also specify the following parameter in `wenet.Decoder` - -* `lang` (str): The language you used, `chs` for Chinese, and `en` for English. -* `model_dir` (str): is the `Runtime Model` directory, it contains the following files. - If not provided, official model for specific `lang` will be downloaded automatically. - - * `final.zip`: runtime TorchScript ASR model. - * `units.txt`: modeling units file - * `TLG.fst`: optional, it means decoding with LM when `TLG.fst` is given. - * `words.txt`: optional, word level symbol table for decoding with `TLG.fst` - - Please refer https://github.com/wenet-e2e/wenet/blob/main/docs/pretrained_models.md for the details of `Runtime Model`. - -* `nbest` (int): Output the top-n best result. -* `enable_timestamp` (bool): Whether to enable the word level timestamp. -* `context` (List[str]): a list of context biasing words. -* `context_score` (float): context bonus score. -* `continuous_decoding` (bool): Whether to enable continuous(long) decoding. - -For example: -``` python -decoder = wenet.Decoder(model_dir, - lang='chs', - nbest=5, - enable_timestamp=True, - context=['不忘初心', '牢记使命'], - context_score=3.0) -``` - -### Streaming Usage - -``` python -import sys -import torch -import wave -import wenetruntime as wenet - -test_wav = sys.argv[1] - -with wave.open(test_wav, 'rb') as fin: - assert fin.getnchannels() == 1 - wav = fin.readframes(fin.getnframes()) - -decoder = wenet.Decoder(lang='chs') -# We suppose the wav is 16k, 16bits, and decode every 0.5 seconds -interval = int(0.5 * 16000) * 2 -for i in range(0, len(wav), interval): - last = False if i + interval < len(wav) else True - chunk_wav = wav[i: min(i + interval, len(wav))] - ans = decoder.decode(chunk_wav, last) - print(ans) -``` - -You can use the same parameters as we introduced above to control the behavior of `wenet.Decoder` - - -## Build on Your Local Machine - -``` sh -git clone https://github.com/wenet-e2e/wenet.git -cd wenet/runtime/binding/python -python setup.py install -``` - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/runtime.md b/models/audio/speech_recognition/conformer/igie/wenet/docs/runtime.md deleted file mode 100644 index cf65e7505aaf44055479ed9f8371e2807558ae8d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/docs/runtime.md +++ /dev/null @@ -1,69 +0,0 @@ -# Runtime for WeNet - -WeNet runtime uses [Unified Two Pass (U2)](https://arxiv.org/pdf/2102.01547.pdf) framework for inference. U2 has the following advantages: -* **Unified**: U2 unified the streaming and non-streaming model in a simple way, and our runtime is also unified. Therefore you can easily balance the latency and accuracy by changing chunk_size (described in the following section). -* **Accurate**: U2 achieves better accuracy by CTC joint training. -* **Fast**: Our runtime uses attention rescoring based decoding method described in U2, which is much faster than a traditional autoregressive beam search. -* **Other benefits**: In practice, we find U2 is more stable on long-form speech than standard transformer which usually fails or degrades a lot on long-form speech; and we can easily get the word-level time stamps by CTC spikes in U2. Both of these aspects are favored for industry adoption. 
- -## Platforms Supported - -The WeNet runtime supports the following platforms. - -* Server - * [x86](https://github.com/wenet-e2e/wenet/tree/main/runtime/libtorch) -* Device - * [android](https://github.com/wenet-e2e/wenet/tree/main/runtime/android) - -## Architecture and Implementation - -### Architecture - -The following picture shows how U2 works. - -![U2](images/u2.gif) - -When input is not finished, the input frames $x_t$ are fed into the *Shared Encoder* module frame by frame to get the encoder output $e_t$, then $e_t$ is transformed by the *CTC Activation* module (typically, it's just a linear transform with a log_softmax) to get the CTC prob $y_t$ at current frame, and $y_t$ is further used by the *CTC prefix beam search* module to generate n-best results at current time $t$, and the best result is used as partial result of the U2 system. - -When input is finished at time $T$, the n-best results from the *CTC prefix beam search* module and the encoder output $e_1, e_2, e_3, ..., e_T$ are fed into the *Attention Decoder* module, then the *Attention Decoder* module computes a score for every result. The result with the best score is selected as the final result of U2 system. - -We can group $C$ continuous frames $x_t, x_{t+1}, x_{t+C}$ as one chunk for the *Shared Encoder* module, and $C$ is called chunk_size in the U2 framework. The chunk_size will affect the attention computation in the *Shared Encoder* module. When chunk_size is infinite, it is a non-streaming case. The system gives the best accuracy with infinite latency. When chunk_size is limited (typically less than 1s), it is a streaming case. The system has limited latency and also gives promising accuracy. So the developer can balance the accuracy and latency and setting a proper chunk_size. - -### Interface Design - -We use LibTorch to implement U2 runtime in WeNet, and we export several interfaces in PyTorch python code -by @torch.jit.export (see [asr_model.py](https://github.com/wenet-e2e/wenet/tree/main/wenet/transformer/asr_model.py)), -which are required and used in C++ runtime in [torch_asr_model.cc](https://github.com/wenet-e2e/wenet/tree/main/runtime/libtorch/decoder/torch_asr_model.cc). -Here we just list the interface and give a brief introduction. - -| interface | description | -|----------------------------------|-----------------------------------------| -| subsampling_rate (args) | get the subsampling rate of the model | -| right_context (args) | get the right context of the model | -| sos_symbol (args) | get the sos symbol id of the model | -| eos_symbol (args) | get the eos symbol id of the model | -| forward_encoder_chunk (args) | used for the *Shared Encoder* module | -| ctc_activation (args) | used for the *CTC Activation* module | -| forward_attention_decoder (args) | used for the *Attention Decoder* module | - -### Cache in Details - -For streaming scenario, the *Shared Encoder* module works in an incremental way. The current chunk computation requries the inputs and outputs of all the history chunks. We implement the incremental computation by using caches. Overall, two types of cache are used in our runtime. - -* att_cache: the attention cache of the *Shared Encoder*(Conformer/Transformer) module. -* cnn_cache: the cnn cache of the *Shared Encoder*, which caches the left context for causal CNN computation in Conformer. 
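-
-As a rough illustration of how these two caches are threaded through chunk-by-chunk inference, here is a schematic
-sketch. The tensor shapes, argument names and the `forward_chunk` call shown here are simplified assumptions for
-illustration only; the authoritative version is in the files referenced in the next paragraph.
-
-```python
-# Schematic sketch of chunk-wise encoding with attention/CNN caches carried along.
-import torch
-
-def stream_encode(encoder, feats, chunk_size):
-    """feats: (batch, time, dim); processed chunk by chunk."""
-    att_cache = torch.zeros(0, 0, 0, 0)  # attention key/value cache, grows per chunk
-    cnn_cache = torch.zeros(0, 0, 0, 0)  # left-context cache for the causal CNN
-    offset, outputs = 0, []
-    for start in range(0, feats.size(1), chunk_size):
-        chunk = feats[:, start:start + chunk_size, :]
-        # every call consumes the caches from all previous chunks and returns updated ones
-        ys, att_cache, cnn_cache = encoder.forward_chunk(
-            chunk, offset, -1, att_cache, cnn_cache)
-        offset += ys.size(1)
-        outputs.append(ys)
-    return torch.cat(outputs, dim=1)
-```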
-
-Please see [encoder.py:forward_chunk()](https://github.com/wenet-e2e/wenet/tree/main/wenet/transformer/encoder.py) and [torch_asr_model.cc](https://github.com/wenet-e2e/wenet/tree/main/runtime/libtorch/decoder/torch_asr_model.cc) for the details of the caches.
-
-In practice, CNN is also used for subsampling, so we should handle the CNN cache in subsampling as well.
-However, the subsampling module contains different CNN layers with different left contexts, right contexts and strides, which makes it tricky to implement a CNN cache for subsampling directly.
-In our implementation, we simply overlap the inputs to avoid a subsampling CNN cache.
-It is simple and straightforward, with negligible additional cost, since the subsampling CNN only accounts for a very small fraction of the whole computation.
-The following picture shows how it works, where the blue color marks the overlap between the current inputs and the previous inputs.
-
-![Overlap input for Subsampling CNN](images/subsampling_overalp.gif)
-
-## References
-1. [Sequence Modeling With CTC](https://distill.pub/2017/ctc/)
-2. [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/pdf/1408.2873.pdf)
-3. [Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition](https://arxiv.org/pdf/2012.05481.pdf)
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/tutorial_aishell.md b/models/audio/speech_recognition/conformer/igie/wenet/docs/tutorial_aishell.md
deleted file mode 100644
index ea4e80d32439bbce901a18724d81e1ccc37fd49f..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/docs/tutorial_aishell.md
+++ /dev/null
@@ -1,173 +0,0 @@
-## Tutorial on AIShell
-
-If you meet any problems when going through this tutorial, please feel free to ask in the GitHub [issues](https://github.com/mobvoi/wenet/issues). Thanks for any kind of feedback.
-
-### Setup environment
-
-Please follow [Installation](https://github.com/wenet-e2e/wenet#installation) to install WeNet.
-
-### First Experiment
-
-We provide a recipe `example/aishell/s0/run.sh` on the aishell-1 data.
-
-The recipe is simple, and we suggest you run each stage one by one manually and check the result to understand the whole process.
-
-```
-cd example/aishell/s0
-bash run.sh --stage -1 --stop_stage -1
-bash run.sh --stage 0 --stop_stage 0
-bash run.sh --stage 1 --stop_stage 1
-bash run.sh --stage 2 --stop_stage 2
-bash run.sh --stage 3 --stop_stage 3
-bash run.sh --stage 4 --stop_stage 4
-bash run.sh --stage 5 --stop_stage 5
-bash run.sh --stage 6 --stop_stage 6
-```
-
-You could also just run the whole script:
-```
-bash run.sh --stage -1 --stop_stage 6
-```
-
-
-#### Stage -1: Download data
-
-This stage downloads the aishell-1 data to the local path `$data`. This may take several hours. If you have already downloaded the data, please change the `$data` variable in `run.sh` and start from `--stage 0`.
-Please set an **absolute path** for `$data`, e.g.
`/home/username/asr-data/aishell/`
-
-#### Stage 0: Prepare Training data
-
-In this stage, `local/aishell_data_prep.sh` organizes the original aishell-1 data into two files:
-* **wav.scp** each line records two tab-separated columns: `wav_id` and `wav_path`
-* **text** each line records two tab-separated columns: `wav_id` and `text_label`
-
-**wav.scp**
-```
-BAC009S0002W0122 /export/data/asr-data/OpenSLR/33/data_aishell/wav/train/S0002/BAC009S0002W0122.wav
-BAC009S0002W0123 /export/data/asr-data/OpenSLR/33/data_aishell/wav/train/S0002/BAC009S0002W0123.wav
-BAC009S0002W0124 /export/data/asr-data/OpenSLR/33/data_aishell/wav/train/S0002/BAC009S0002W0124.wav
-BAC009S0002W0125 /export/data/asr-data/OpenSLR/33/data_aishell/wav/train/S0002/BAC009S0002W0125.wav
-...
-```
-
-**text**
-```
-BAC009S0002W0122 而对楼市成交抑制作用最大的限购
-BAC009S0002W0123 也成为地方政府的眼中钉
-BAC009S0002W0124 自六月底呼和浩特市率先宣布取消限购后
-BAC009S0002W0125 各地政府便纷纷跟进
-...
-```
-
-If you want to train using your customized data, just organize the data into two files, `wav.scp` and `text`, and start from `stage 1`.
-
-
-#### Stage 1: Extract optional cmvn features
-
-`example/aishell/s0` uses raw wav as input and uses [TorchAudio](https://pytorch.org/audio/stable/index.html) to extract the features just-in-time in the dataloader. So in this step we just copy the training wav.scp and text file into the `raw_wav/train/` dir.
-
-`tools/compute_cmvn_stats.py` is used to extract global cmvn (cepstral mean and variance normalization) statistics. These statistics will be used to normalize the acoustic features. Setting `cmvn=false` will skip this step.
-
-#### Stage 2: Generate label token dictionary
-
-The dict is a map between label tokens (we use characters for Aishell-1) and the integer indices.
-
-An example dict is as follows:
-```
-<blank> 0
-<unk> 1
-一 2
-丁 3
-...
-龚 4230
-龟 4231
-<sos/eos> 4232
-```
-
-* `<blank>` denotes the blank symbol for CTC.
-* `<unk>` denotes the unknown token; any out-of-vocabulary token will be mapped to it.
-* `<sos/eos>` denotes the start-of-speech and end-of-speech symbols for attention-based encoder decoder training, and they share the same id.
-
-#### Stage 3: Prepare WeNet data format
-
-This stage generates the WeNet required format file `data.list`. Each line in `data.list` is in JSON format and contains the following fields.
-
-1. `key`: key of the utterance
-2. `wav`: audio file path of the utterance
-3. `txt`: normalized transcription of the utterance; the transcription will be tokenized to the model units on the fly at the training stage.
-
-Here is an example of the `data.list`; please see the generated training feature file in `data/train/data.list`.
-
-```
-{"key": "BAC009S0002W0122", "wav": "/export/data/asr-data/OpenSLR/33//data_aishell/wav/train/S0002/BAC009S0002W0122.wav", "txt": "而对楼市成交抑制作用最大的限购"}
-{"key": "BAC009S0002W0123", "wav": "/export/data/asr-data/OpenSLR/33//data_aishell/wav/train/S0002/BAC009S0002W0123.wav", "txt": "也成为地方政府的眼中钉"}
-{"key": "BAC009S0002W0124", "wav": "/export/data/asr-data/OpenSLR/33//data_aishell/wav/train/S0002/BAC009S0002W0124.wav", "txt": "自六月底呼和浩特市率先宣布取消限购后"}
-```
-
-We also design another format for `data.list`, named `shard`, which is for big data training.
-Please see [gigaspeech](https://github.com/wenet-e2e/wenet/tree/main/examples/gigaspeech/s0)(10k hours) or
-[wenetspeech](https://github.com/wenet-e2e/wenet/tree/main/examples/wenetspeech/s0)(10k hours)
-for how to use the `shard` style `data.list` if you want to apply WeNet to big datasets (more than 5k hours).
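-
-Before moving on to training, here is a minimal sketch of how the raw-mode `data.list` above can be generated from
-`wav.scp` and `text`. WeNet ships a helper for this (`tools/make_raw_list.py`, used in the recipes); the sketch below
-only illustrates the JSON-lines format and is not that script.
-
-```python
-# Minimal sketch: join wav.scp and text into a raw-mode data.list (one JSON object per line).
-import json
-
-def make_raw_list(wav_scp, text_file, out_list):
-    texts = {}
-    with open(text_file, encoding="utf-8") as f:
-        for line in f:
-            key, txt = line.strip().split(maxsplit=1)
-            texts[key] = txt
-    with open(wav_scp, encoding="utf-8") as f_wav, \
-         open(out_list, "w", encoding="utf-8") as f_out:
-        for line in f_wav:
-            key, wav = line.strip().split(maxsplit=1)
-            # fields match the example above: key / wav / txt
-            obj = {"key": key, "wav": wav, "txt": texts[key]}
-            f_out.write(json.dumps(obj, ensure_ascii=False) + "\n")
-```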
- -#### Stage 4: Neural Network training - -The NN model is trained in this step. - -- Multi-GPU mode - -If using DDP mode for multi-GPU, we suggest using `dist_backend="nccl"`. If the NCCL does not work, try using `gloo` or use `torch==1.6.0` -Set the GPU ids in CUDA_VISIBLE_DEVICES. For example, set `export CUDA_VISIBLE_DEVICES="0,1,2,3,6,7"` to use card 0,1,2,3,6,7. - -- Resume training - -If your experiment is terminated after running several epochs for some reasons (e.g. the GPU is accidentally used by other people and is out-of-memory ), you could continue the training from a checkpoint model. Just find out the finished epoch in `exp/your_exp/`, set `checkpoint=exp/your_exp/$n.pt` and run the `run.sh --stage 4`. Then the training will continue from the $n+1.pt - -- Config - -The config of neural network structure, optimization parameter, loss parameters, and dataset can be set in a YAML format file. - -In `conf/`, we provide several models like transformer and conformer. see `conf/train_conformer.yaml` for reference. - -- Use Tensorboard - -The training takes several hours. The actual time depends on the number and type of your GPU cards. In an 8-card 2080 Ti machine, it takes about less than one day for 50 epochs. -You could use tensorboard to monitor the loss. - -``` -tensorboard --logdir tensorboard/$your_exp_name/ --port 12598 --bind_all -``` - -#### Stage 5: Recognize wav using the trained model - -This stage shows how to recognize a set of wavs into texts. It also shows how to do the model averaging. - -- Average model - -If `${average_checkpoint}` is set to `true`, the best `${average_num}` models on cross validation set will be averaged to generate a boosted model and used for recognition. - -- Decoding - -Recognition is also called decoding or inference. The function of the NN will be applied on the input acoustic feature sequence to output a sequence of text. - -Four decoding methods are provided in WeNet: - -* `ctc_greedy_search` : encoder + CTC greedy search -* `ctc_prefix_beam_search` : encoder + CTC prefix beam search -* `attention` : encoder + attention-based decoder decoding -* `attention_rescoring` : rescoring the ctc candidates from ctc prefix beam search with encoder output on attention-based decoder. - -In general, attention_rescoring is the best method. Please see [U2 paper](https://arxiv.org/pdf/2012.05481.pdf) for the details of these algorithms. - -`--beam_size` is a tunable parameter, a large beam size may get better results but also cause higher computation cost. - -`--batch_size` can be greater than 1 for "ctc_greedy_search" and "attention" decoding mode, and must be 1 for "ctc_prefix_beam_search" and "attention_rescoring" decoding mode. - -- WER evaluation - -`tools/compute-wer.py` will calculate the word (or char) error rate of the result. If you run the recipe without any change, you may get WER ~= 5%. - - -#### Stage 6: Export the trained model - -`wenet/bin/export_jit.py` will export the trained model using Libtorch. The exported model files can be easily used for inference in other programming languages such as C++. 
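-
-For reference, the export in this stage is the standard TorchScript save/load round trip; the sketch below is only an
-illustration of that mechanism (the path and the `model` variable are placeholders, not the recipe's actual code).
-
-```python
-# Hedged sketch of the TorchScript round trip behind Stage 6.
-import torch
-
-# Export side: script the trained nn.Module and save it as a .zip archive.
-# script_model = torch.jit.script(model)
-# script_model.save("exp/your_exp/final.zip")
-
-# Deployment side: the archive loads without the Python model definition,
-# from Python as below, or from C++ via torch::jit::load("final.zip").
-runtime_model = torch.jit.load("exp/your_exp/final.zip")
-runtime_model.eval()
-```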
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/docs/tutorial_librispeech.md b/models/audio/speech_recognition/conformer/igie/wenet/docs/tutorial_librispeech.md deleted file mode 100644 index 223f3b6a913def973a5ce3feb9b95d73ab9b491d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/docs/tutorial_librispeech.md +++ /dev/null @@ -1,436 +0,0 @@ -## Tutorial on LibriSpeech - -If you meet any problems when going through this tutorial, please feel free to ask in github [issues](https://github.com/mobvoi/wenet/issues). Thanks for any kind of feedback. - -### Setup environment - -Please follow [Installation](https://github.com/wenet-e2e/wenet#installation) to install WeNet. - -### First Experiment - -We provide a recipe `example/librispeech/s0/run.sh` on librispeech data. - -The recipe is simple and we suggest you run each stage one by one manually and check the result to understand the whole process. - -``` -cd example/librispeech/s0 -bash run.sh --stage -1 --stop_stage -1 -bash run.sh --stage 0 --stop_stage 0 -bash run.sh --stage 1 --stop_stage 1 -bash run.sh --stage 2 --stop_stage 2 -bash run.sh --stage 3 --stop_stage 3 -bash run.sh --stage 4 --stop_stage 4 -bash run.sh --stage 5 --stop_stage 5 -bash run.sh --stage 6 --stop_stage 6 -bash run.sh --stage 7 --stop_stage 7 -``` - -You could also just run the whole script -``` -bash run.sh --stage -1 --stop_stage 7 -``` - - -#### Stage -1: Download data - -``` sh -data_url=www.openslr.org/resources/12 -datadir=/export/data/en-asr-data/OpenSLR/ -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - echo "stage -1: Data Download" - for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do - local/download_and_untar.sh ${datadir} ${data_url} ${part} - done -fi -``` - -This stage downloads the librispeech data to the local path `$data`. This may take several hours. If you have already downloaded the data, please change the `$data` variable in `run.sh` and start from `--stage 0`. - -#### Stage 0: Prepare Training data - -``` sh -wave_data=data -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do - # use underscore-separated names in data directories. - local/data_prep_torchaudio.sh ${datadir}/LibriSpeech/${part} $wave_data/${part//-/_} - done -fi -``` - -In this stage, `local/data_prep_torchaudio.sh` organizes the original data into two files: -* **wav.scp** each line records two tab-separated columns : `wav_id` and `wav_path` -* **text** each line records two tab-separated columns : `wav_id` and `text_label` - -**wav.scp** -``` -1867-154075-0014 /export/data/en-asr-data/OpenSLR//LibriSpeech/train-clean-100/1867/154075/1867-154075-0014.flac -1970-26100-0022 /export/data/en-asr-data/OpenSLR//LibriSpeech/train-clean-100/1970/26100/1970-26100-0022.flac -... -``` - -**text** -``` -1867-154075-0014 YOU SHOW HIM THAT IT IS POSSIBLE -1970-26100-0022 DID YOU SEE HIM AT THAT TIME -... -``` - -If you want to train using your customized data, just organize the data into two files `wav.scp` and `text`, and start from `stage 1`. - - -#### Stage 1: Extract optinal cmvn features - -``` sh -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - ### Task dependent. You have to design training and dev sets by yourself. 
- ### But you can utilize Kaldi recipes in most cases - echo "stage 1: Feature Generation" - mkdir -p $wave_data/train_960 - # merge total training data - for set in train_clean_100 train_clean_360 train_other_500; do - for f in `ls $wave_data/$set`; do - cat $wave_data/$set/$f >> $wave_data/train_960/$f - done - done - mkdir -p $wave_data/dev - # merge total dev data - for set in dev_clean dev_other; do - for f in `ls $wave_data/$set`; do - cat $wave_data/$set/$f >> $wave_data/dev/$f - done - done - - tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \ - --in_scp $wave_data/$train_set/wav.scp \ - --out_cmvn $wave_data/$train_set/global_cmvn - -fi -``` - -The librispeech corpus contains 3 subsets for training, namely `train_clean_100`, `train_clean_360`, and `train_other_500`, -so we first merge them to get our final training data. - -`tools/compute_cmvn_stats.py` is used to extract global cmvn(cepstral mean and variance normalization) statistics. These statistics will be used to normalize the acoustic features. Setting `cmvn=false` will skip this step. - -#### Stage 2: Generate label token dictionary - -``` sh -dict=$wave_data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt -bpemodel=$wave_data/lang_char/${train_set}_${bpemode}${nbpe} -echo "dictionary: ${dict}" -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - ### Task dependent. You have to check non-linguistic symbols used in the corpus. - echo "stage 2: Dictionary and Json Data Preparation" - mkdir -p data/lang_char/ - - echo " 0" > ${dict} # 0 will be used for "blank" in CTC - echo " 1" >> ${dict} # must be 1 - - # we borrowed these code and scripts which are related bpe from ESPnet. - cut -f 2- -d" " $wave_data/${train_set}/text > $wave_data/lang_char/input.txt - tools/spm_train --input=$wave_data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 - tools/spm_encode --model=${bpemodel}.model --output_format=piece < $wave_data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict} - num_token=$(cat $dict | wc -l) - echo " $num_token" >> $dict # - wc -l ${dict} -fi -``` - -The model unit of English e2e speech recognition system could be char or BPE(byte-pair-encoding). -Typically, BPE shows better result. So here we use BPE as model unit, -and the BPE is trained by [sentencepiece](https://github.com/google/sentencepiece) tool on the librispeech training data. - -The model unit is defined as a dict in WeNet, which maps the a BPE into integer index. -The librispeech dict is like: - -``` - 0 - 1 -' 2 -▁ 3 -A 4 -▁A 5 -AB 6 -▁AB 7 -▁YOU 4995 -▁YOUNG 4996 -▁YOUR 4997 -▁YOUTH 4998 -Z 4999 -ZZ 5000 - 5001 -``` - -* `` denotes the blank symbol for CTC. -* `` denotes the unknown token, any out-of-vocabulary tokens will be mapped into it. -* `` denotes start-of-speech and end-of-speech symbols for attention based encoder decoder training, and they shares the same id. - -#### Stage 3: Prepare WeNet data format - -``` sh -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # Prepare wenet required data - echo "Prepare data, prepare required format" - for x in dev ${recog_set} $train_set ; do - tools/make_raw_list.py $wave_data/$x/wav.scp $wave_data/$x/text \ - $wave_data/$x/data.list - done - -fi -``` - -This stage generates the WeNet required format file `data.list`. Each line in `data.list` is in json format which contains the following fields. - -1. `key`: key of the utterance -2. 
`wav`: audio file path of the utterance -3. `txt`: normalized transcription of the utterance, the transcription will be tokenized to the model units on-the-fly at the training stage. - -Here is an example of the `data.list`, and please see the generated training feature file in `data/train/data.list`. - -``` -{"key": "1455-134435-0000", "wav": "/mnt/nfs/ptm1/open-data/LibriSpeech/train-clean-100/1455/134435/1455-134435-0000.flac", "txt": "THE GIRL WHO CAME INTO THE WORLD ON THAT NIGHT WHEN JESSE RAN THROUGH THE FIELDS CRYING TO GOD THAT HE BE GIVEN A SON HAD GROWN TO WOMANHOOD ON THE FARM"} -{"key": "1455-134435-0001", "wav": "/mnt/nfs/ptm1/open-data/LibriSpeech/train-clean-100/1455/134435/1455-134435-0001.flac", "txt": "AND WHEN NOT ANGRY SHE WAS OFTEN MOROSE AND SILENT IN WINESBURG IT WAS SAID THAT SHE DRANK HER HUSBAND THE BANKER"} -{"key": "1455-134435-0002", "wav": "/mnt/nfs/ptm1/open-data/LibriSpeech/train-clean-100/1455/134435/1455-134435-0002.flac", "txt": "BUT LOUISE COULD NOT BE MADE HAPPY SHE FLEW INTO HALF INSANE FITS OF TEMPER DURING WHICH SHE WAS SOMETIMES SILENT SOMETIMES NOISY AND QUARRELSOME SHE SWORE AND CRIED OUT IN HER ANGER SHE GOT A KNIFE FROM THE KITCHEN AND THREATENED HER HUSBAND'S LIFE"} -``` - -We aslo design another format for `data.list` named `shard` which is for big data training. -Please see [gigaspeech](https://github.com/wenet-e2e/wenet/tree/main/examples/gigaspeech/s0)(10k hours) or -[wenetspeech](https://github.com/wenet-e2e/wenet/tree/main/examples/wenetspeech/s0)(10k hours) -for how to use `shard` style `data.list` if you want to apply WeNet on big data set(more than 5k). - -#### Stage 4: Neural Network training - -``` sh -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - # Training - mkdir -p $dir - INIT_FILE=$dir/ddp_init - rm -f $INIT_FILE # delete old one before starting - init_method=file://$(readlink -f $INIT_FILE) - echo "$0: init method is $init_method" - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - # Use "nccl" if it works, otherwise use "gloo" - dist_backend="nccl" - cmvn_opts= - $cmvn && cmvn_opts="--cmvn $wave_data/${train_set}/global_cmvn" - # train.py will write $train_config to $dir/train.yaml with model input - # and output dimension, train.yaml will be used for inference or model - # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type raw \ - --symbol_table $dict \ - --train_data $wave_data/$train_set/data.list \ - --cv_data $wave_data/dev/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $num_gpus \ - --ddp.rank $i \ - --ddp.dist_backend $dist_backend \ - --num_workers 1 \ - $cmvn_opts \ - --pin_memory - } & - done - wait -fi -``` - -The NN model is trained in this step. - -- Multi-GPU mode - -If using DDP mode for multi-GPU, we suggest using `dist_backend="nccl"`. If the NCCL does not work, try using `gloo` or use `torch==1.6.0` -Set the GPU ids in CUDA_VISIBLE_DEVICES. For example, set `export CUDA_VISIBLE_DEVICES="0,1,2,3,6,7"` to use card 0,1,2,3,6,7. - -- Resume training - -If your experiment is terminated after running several epochs for some reasons (e.g. the GPU is accidentally used by other people and is out-of-memory ), you could continue the training from a checkpoint model. 
Just find out the finished epoch in `exp/your_exp/`, set `checkpoint=exp/your_exp/$n.pt` and run the `run.sh --stage 4`. Then the training will continue from the $n+1.pt - -- Config - -The config of neural network structure, optimization parameter, loss parameters, and dataset can be set in a YAML format file. - -In `conf/`, we provide several models like transformer and conformer. see `conf/train_conformer.yaml` for reference. - -- Use Tensorboard - -The training takes several hours. The actual time depends on the number and type of your GPU cards. In an 8-card 2080 Ti machine, it takes about less than one day for 50 epochs. -You could use tensorboard to monitor the loss. - -``` -tensorboard --logdir tensorboard/$your_exp_name/ --port 12598 --bind_all -``` - -#### Stage 5: Recognize wav using the trained model - -``` sh -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then -# Test model, please specify the model you want to test by --checkpoint - cmvn_opts= - $cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn" - # TODO, Add model average here - mkdir -p $dir/test - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$dir/avg_${average_num}.pt - echo "do model average and final checkpoint is $decode_checkpoint" - python wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $dir \ - --num ${average_num} \ - --val_best - fi - # Specify decoding_chunk_size if it's a unified dynamic chunk trained model - # -1 for full chunk - decoding_chunk_size= - ctc_weight=0.5 - # Polling GPU id begin with index 0 - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - idx=0 - for test in $recog_set; do - for mode in ${decode_modes}; do - { - { - test_dir=$dir/${test}_${mode} - mkdir -p $test_dir - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1]) - python wenet/bin/recognize.py --gpu $gpu_id \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type raw \ - --test_data $wave_data/$test/data.list \ - --checkpoint $decode_checkpoint \ - --beam_size 10 \ - --batch_size 1 \ - --penalty 0.0 \ - --dict $dict \ - --result_file $test_dir/text_bpe \ - --ctc_weight $ctc_weight \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - - tools/spm_decode --model=${bpemodel}.model --input_format=piece < $test_dir/text_bpe | sed -e "s/▁/ /g" > $test_dir/text - python tools/compute-wer.py --char=1 --v=1 \ - $wave_data/$test/text $test_dir/text > $test_dir/wer - } & - - ((idx+=1)) - if [ $idx -eq $num_gpus ]; then - idx=0 - fi - } - done - done - wait - -fi - -``` - -This stage shows how to recognize a set of wavs into texts. It also shows how to do the model averaging. - -- Average model - -If `${average_checkpoint}` is set to `true`, the best `${average_num}` models on cross validation set will be averaged to generate a boosted model and used for recognition. - -- Decoding - -Recognition is also called decoding or inference. The function of the NN will be applied on the input acoustic feature sequence to output a sequence of text. - -Four decoding methods are provided in WeNet: - -* `ctc_greedy_search` : encoder + CTC greedy search -* `ctc_prefix_beam_search` : encoder + CTC prefix beam search -* `attention` : encoder + attention-based decoder decoding -* `attention_rescoring` : rescoring the ctc candidates from ctc prefix beam search with encoder output on attention-based decoder. - -In general, attention_rescoring is the best method. Please see [U2 paper](https://arxiv.org/pdf/2012.05481.pdf) for the details of these algorithms. 
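Model averaging as described above amounts to an element-wise mean over the parameter tensors of the selected checkpoints. The sketch below assumes checkpoints saved as plain `state_dict` files; unlike `wenet/bin/average_model.py`, it does not pick the best epochs by cross-validation loss, and the file names in the usage comment are hypothetical.

```python
import torch

def average_checkpoints(paths):
    """Element-wise average of the parameter tensors stored in several .pt checkpoints."""
    avg = None
    for path in paths:
        state = torch.load(path, map_location="cpu")  # assumed to be a plain state_dict
        if avg is None:
            avg = {k: v.clone().float() for k, v in state.items()}
        else:
            for k in avg:
                avg[k] += state[k].float()
    return {k: v / len(paths) for k, v in avg.items()}

# hypothetical file names; the recipe writes checkpoints as exp/your_exp/<epoch>.pt
# averaged = average_checkpoints(["exp/your_exp/28.pt", "exp/your_exp/29.pt", "exp/your_exp/30.pt"])
# torch.save(averaged, "exp/your_exp/avg_3.pt")
```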
- -`--beam_size` is a tunable parameter, a large beam size may get better results but also cause higher computation cost. - -`--batch_size` can be greater than 1 for "ctc_greedy_search" and "attention" decoding mode, and must be 1 for "ctc_prefix_beam_search" and "attention_rescoring" decoding mode. - -- WER evaluation - -`tools/compute-wer.py` will calculate the word (or char) error rate of the result. - - -#### Stage 6(Optional): Export the trained model - -``` sh -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # Export the best model you want - python wenet/bin/export_jit.py \ - --config $dir/train.yaml \ - --checkpoint $dir/avg_${average_num}.pt \ - --output_file $dir/final.zip -fi -``` - -`wenet/bin/export_jit.py` will export the trained model using Libtorch. -The exported model files can be easily used for C++ inference in our runtime. -It is required if you want to integrate language model(LM), as shown in Stage 7. - - -#### Stage 7(Optional): Add LM and test it with runtime - - - -``` sh -if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then - lm=data/local/lm - lexicon=data/local/dict/lexicon.txt - mkdir -p $lm - mkdir -p data/local/dict - - # 7.1 Download & format LM - which_lm=3-gram.pruned.1e-7.arpa.gz - if [ ! -e ${lm}/${which_lm} ]; then - wget http://www.openslr.org/resources/11/${which_lm} -P ${lm} - fi - echo "unzip lm($which_lm)..." - gunzip -k ${lm}/${which_lm} -c > ${lm}/lm.arpa - echo "Lm saved as ${lm}/lm.arpa" - - # 7.2 Prepare dict - unit_file=$dict - bpemodel=$bpemodel - # use $dir/words.txt (unit_file) and $dir/train_960_unigram5000 (bpemodel) - # if you download pretrained librispeech conformer model - cp $unit_file data/local/dict/units.txt - if [ ! -e ${lm}/librispeech-lexicon.txt ]; then - wget http://www.openslr.org/resources/11/librispeech-lexicon.txt -P ${lm} - fi - echo "build lexicon..." - tools/fst/prepare_dict.py $unit_file ${lm}/librispeech-lexicon.txt \ - $lexicon $bpemodel.model - echo "lexicon saved as '$lexicon'" - - # 7.3 Build decoding TLG - tools/fst/compile_lexicon_token_fst.sh \ - data/local/dict data/local/tmp data/local/lang - tools/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1; - - # 7.4 Decoding with runtime - fst_dir=data/lang_test - for test in ${recog_set}; do - ./tools/decode.sh --nj 6 \ - --beam 10.0 --lattice_beam 5 --max_active 7000 --blank_skip_thresh 0.98 \ - --ctc_weight 0.5 --rescoring_weight 1.0 --acoustic_scale 1.2 \ - --fst_path $fst_dir/TLG.fst \ - --dict_path $fst_dir/words.txt \ - data/$test/wav.scp data/$test/text $dir/final.zip $fst_dir/units.txt \ - $dir/lm_with_runtime_${test} - tail $dir/lm_with_runtime_${test}/wer - done -fi -``` - -LM is only supported in runtime, you have to build the runtime as shown in [Installation](https://github.com/wenet-e2e/wenet#installation), -and please refer [LM for WeNet](https://wenet-e2e.github.io/wenet/lm.html) for the details of LM design. - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/README.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/README.md deleted file mode 100644 index 75fb1e43a9398ee1826a617882029e09e25f3b93..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/README.md +++ /dev/null @@ -1,146 +0,0 @@ -# Recipe to run Noisy Student Training with LM filter in WeNet - -Noisy Student Training (NST) has recently demonstrated extremely strong performance in Automatic Speech Recognition (ASR). 
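The WER and CER numbers quoted in these recipes come down to a token-level edit distance between reference and hypothesis. Below is a minimal sketch of that computation, not the actual `tools/compute-wer.py` (which also handles tokenization, casing and report formatting); pass character lists instead of word lists to get CER.

```python
def error_rate(ref_tokens: list, hyp_tokens: list) -> float:
    """Levenshtein distance between token sequences divided by the reference length."""
    d = [[0] * (len(hyp_tokens) + 1) for _ in range(len(ref_tokens) + 1)]
    for i in range(len(ref_tokens) + 1):
        d[i][0] = i
    for j in range(len(hyp_tokens) + 1):
        d[0][j] = j
    for i in range(1, len(ref_tokens) + 1):
        for j in range(1, len(hyp_tokens) + 1):
            sub = d[i - 1][j - 1] + (ref_tokens[i - 1] != hyp_tokens[j - 1])
            d[i][j] = min(sub, d[i - 1][j] + 1, d[i][j - 1] + 1)
    return d[-1][-1] / max(len(ref_tokens), 1)

ref = "did you see him at that time".split()
hyp = "did you see him at the time".split()
print(error_rate(ref, hyp))  # one substitution out of 7 words ~= 0.143

# CER on characters (toy example)
print(error_rate(list("你好世界"), list("你好时节")))  # 2 substitutions out of 4 chars = 0.5
```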
- -Here, we provide a recipe to run NST with `LM filter` strategy using AISHELL-1 as supervised data and WenetSpeech as unsupervised data from [this paper](https://arxiv.org/abs/2211.04717), where hypotheses with and without Language Model are generated and CER differences between them are utilized as a filter threshold to improve the ASR performances of non-target domain datas. - -## Table of Contents - -- [Guideline](#guideline) - - [Data preparation](#data-preparation) - - [Initial supervised teacher](#initial-supervised-teacher) - - [Noisy student interations](#noisy-student-interations) -- [Performance Record](#performance-record) - - [Supervised baseline and standard NST](##supervised-baseline-and-standard-nst) - - [Supervised AISHELL-1 and unsupervised 1khr WenetSpeech](#supervised-aishell-1-and-unsupervised-1khr-wenetspeech) - - [Supervised AISHELL-2 and unsupervised 4khr WenetSpeech](#supervised-aishell-2-and-unsupervised-4khr-wenetspeech) -- [Citations](#citations) - -## Guideline - - -First, you have to prepare supervised and unsupervised data for NST. Then in stage 1 of `run.sh`, you will train an initial supervised teacher and generate pseudo labels for unsupervised data. -After that, you can run the noisy student training iteratively in stage 2. The whole pipeline is illustrated in the following picture. - -![plot](local/NST_plot.png) - -### Data preparation - -To run this recipe, you should follow the steps from [WeNet examples](https://github.com/wenet-e2e/wenet/tree/main/examples) to prepare [AISHELL1](https://github.com/wenet-e2e/wenet/tree/main/examples/aishell/s0) and [WenetSpeech](https://github.com/wenet-e2e/wenet/tree/main/examples/wenetspeech/s0) data. -We extract 1khr data from WenetSpeech and data should be prepared and stored in the following format: - -``` -data/ -├── train/ -├──── data_aishell.list -├──── wenet_1khr.list -├──── wav_dir/ -├──── utter_time.json (optional) -├── dev/ -└── test/ - -``` -- Files `*.list` contain paths for all the data shards for training. -- A Json file containing the audio length should be prepared as `utter_time.json` if you want to apply the `speaking rate` filter. -- A wav_dir contains all the audio data (id.wav) and labels (id.txt which is optional) for unsupervised data. - -### Initial supervised teacher - -To train an initial supervised teacher model, run the following command: - -```bash -bash run.sh --stage 1 --stop-stage 1 -``` - -Full arguments are listed below, you can check `run.sh` and `run_nst.sh` for more information about steps in each stage and their arguments. We used `num_split = 60` and generate shards with different cpu for the experiments in our paper which saved us lots of inference time and data shards generation time. - -```bash -bash run.sh --stage 1 --stop-stage 1 --dir exp/conformer_test_fully_supervised --supervised_data_list data_aishell.list --enable_nst 0 --num_split 1 --unsupervised_data_list wenet_1khr.list --dir_split wenet_split_60_test/ --job_num 0 --hypo_name hypothesis_nst0.txt --label 1 --wav_dir data/train/wenet_1k_untar/ --cer_hypo_dir wenet_cer_hypo --cer_label_dir wenet_cer_label --label_file label.txt --cer_hypo_threshold 10 --speak_rate_threshold 0 --utter_time_file utter_time.json --untar_dir data/train/wenet_1khr_untar/ --tar_dir data/train/wenet_1khr_tar/ --out_data_list data/train/wenet_1khr.list -``` -- `dir` contains the training parameters. -- `data_list` contains paths for the training data list. -- `supervised_data_list` contains paths for supervised data shards. 
-- `unsupervised_data_list`contains paths for unsupervised data shards which is used for inference. -- `dir_split` is the directory stores split unsupervised data for parallel computing. -- `out_data_list` is the pseudo label data list file path. -- `enable_nst` indicates whether we train with pseudo label and split data, for initial teacher we set it to 0. -- This recipe uses the default `num_split=1` while we strongly recommend use larger number to decrease the inference and shards generation time. -> **HINTS** If num_split is set to N larger than 1, you need to modify the script in step 4-8 in run_nst.sh to submit N tasks into your own clusters (such as slurm,ngc etc..). -> We strongly recommend to do so since inference and pseudo-data generation is time-consuming. - -### Noisy student interations - -After finishing the initial fully supervised baseline, we now have the mixed list contains both supervised and pseudo data which is `wenet_1khr_nst0.list`. -We will use it as the `data_list` in the training step and the `data_list` for next NST iteration will be generated. - -Here is an example command: - -```bash -bash run.sh --stage 2 --stop-stage 2 --iter_num 2 -``` - -Here we add extra argument `iter_num` for number of NST iterations. Intermediate files are named with `iter_num` as a suffix. -Please check the `run.sh` and `run_nst.sh` scripts for more information about each stage and their arguments. - -## Performance Record - -### Supervised baseline and standard NST -* Non-streaming conformer model with attention rescoring decoder. -* Without filter strategy, first iteration -* Feature info: using FBANK feature, dither, cmvn, online speed perturb -* Training info: lr 0.002, batch size 32, 8 gpu, acc_grad 4, 240 epochs, dither 0.1 -* Decoding info: ctc_weight 0.3, average_num 30 - - -| Supervised | Unsupervised | Test CER | -|--------------------------|--------------|----------| -| AISHELL-1 Only | ---- | 4.85 | -| AISHELL-1+WenetSpeech | ---- | 3.54 | -| AISHELL-1+AISHELL-2 | ---- | 1.01 | -| AISHELL-1 (standard NST) | WenetSpeech | 5.52 | - - - -### Supervised AISHELL-1 and unsupervised 1khr WenetSpeech -* Non-streaming conformer model with attention rescoring decoder. -* Feature info: using FBANK feature -* Training info: lr=0.002, batch_size=32, 8 GPUs, acc_grad=4, 120 epochs, dither=0.1 -* Decoding info: ctc_weight=0.3, average_num=30, pseudo_ratio=0.75 - -| # nst iteration | AISHELL-1 test CER | Pseudo CER| Filtered CER | Filtered hours | -|----------------|--------------------|-----------|--------------|----------------| -| 0 | 4.85 | 47.10 | 25.18 | 323 | -| 1 | 4.86 | 37.02 | 20.93 | 436 | -| 2 | 4.75 | 31.81 | 19.74 | 540 | -| 3 | 4.69 | 28.27 | 17.85 | 592 | -| 4 | 4.48 | 26.64 | 14.76 | 588 | -| 5 | 4.41 | 24.70 | 15.86 | 670 | -| 6 | 4.34 | 23.64 | 15.40 | 669 | -| 7 | 4.31 | 23.79 | 15.75 | 694 | - -### Supervised AISHELL-2 and unsupervised 4khr WenetSpeech -* Non-streaming conformer model with attention rescoring decoder. 
-* Feature info: using FBANK feature -* Training info: lr=0.002, batch_size=32, 8 GPUs, acc_grad=4, 120 epochs, dither=0.1 -* Decoding info: ctc_weight=0.3, average_num=30, pseudo_ratio=0.75 - -| # nst iteration | AISHELL-2 test CER | Pseudo CER | Filtered CER | Filtered hours | -|----------------|--------------------|------------|--------------|----------------| -| 0 | 5.48 | 30.10 | 11.73 | 1637 | -| 1 | 5.09 | 28.31 | 9.39 | 2016 | -| 2 | 4.88 | 25.38 | 9.99 | 2186 | -| 3 | 4.74 | 22.47 | 10.66 | 2528 | -| 4 | 4.73 | 22.23 | 10.43 | 2734 | - - - -## Citations - -``` bibtex - -@article{chen2022NST, - title={Improving Noisy Student Training on Non-target Domain Data for Automatic Speech Recognition}, - author={Chen, Yu and Wen, Ding and Lai, Junjie}, - journal={arXiv preprint arXiv:2203.15455}, - year={2022} -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/conf/train_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/conf/train_conformer.yaml deleted file mode 100644 index 8499de2e97b8ae13e15d7cfb8357ae59bb6b6115..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/conf/train_conformer.yaml +++ /dev/null @@ -1,77 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -dataset_conf: - filter_conf: - max_length: 1200 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 16 - -grad_clip: 5 -accum_grad: 4 -max_epoch: 240 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.002 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/local/NST_plot.png b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/local/NST_plot.png deleted file mode 100644 index c652c62caed741bd52d4d1a1a9cf290477be9223..0000000000000000000000000000000000000000 Binary files a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/local/NST_plot.png and /dev/null differ diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/local/generate_data_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/local/generate_data_list.py deleted file mode 100644 index 684e7cb683a9697e2f0807859ab71c2ac3820d42..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/local/generate_data_list.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import random - -def get_args(): - parser = argparse.ArgumentParser(description='generate data.list file ') - parser.add_argument('--tar_dir', help='path for tar file') - parser.add_argument('--supervised_data_list', - help='path for supervised data list') - parser.add_argument('--pseudo_data_ratio', - type=float, - help='ratio of pseudo data, ' - '0 means none pseudo data, ' - '1 means all using pseudo data.') - parser.add_argument('--out_data_list', help='output path for data list') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - target_dir = args.tar_dir - pseudo_data_list = os.listdir(target_dir) - output_file = args.out_data_list - pseudo_data_ratio = args.pseudo_data_ratio - supervised_path = args.supervised_data_list - with open(supervised_path, "r") as reader: - supervised_data_list = reader.readlines() - pseudo_len = len(pseudo_data_list) - supervised_len = len(supervised_data_list) - random.shuffle(pseudo_data_list) - random.shuffle(supervised_data_list) - - cur_ratio = pseudo_len / (pseudo_len + supervised_len) - if cur_ratio < pseudo_data_ratio: - pseudo_to_super_datio = pseudo_data_ratio / (1 - pseudo_data_ratio) - supervised_len = int(pseudo_len / pseudo_to_super_datio) - elif cur_ratio > pseudo_data_ratio: - super_to_pseudo_datio = (1 - pseudo_data_ratio) / pseudo_data_ratio - pseudo_len = int(supervised_len / super_to_pseudo_datio) - - for i in range(len(pseudo_data_list)): - pseudo_data_list[i] = target_dir + "/" + pseudo_data_list[i] + "\n" - - fused_list = pseudo_data_list[:pseudo_len] + supervised_data_list[:supervised_len] - - with open(output_file, "w") as writer: - for line in fused_list: - writer.write(line) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/local/generate_filtered_pseudo_label.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/local/generate_filtered_pseudo_label.py deleted file mode 100644 index 2a8ee83c32f1c69fcffc796c83cc58e4a6f1eec2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/local/generate_filtered_pseudo_label.py +++ /dev/null @@ -1,215 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
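`local/generate_data_list.py` above keeps the pseudo/supervised mix close to `--pseudo_data_ratio` by truncating whichever side is over-represented before concatenating the two lists. The same arithmetic, restated on made-up shard counts:

```python
def balance_counts(pseudo_len: int, supervised_len: int, pseudo_data_ratio: float):
    """Return (pseudo_keep, supervised_keep) so that pseudo_keep / total is close to the target ratio."""
    cur_ratio = pseudo_len / (pseudo_len + supervised_len)
    if cur_ratio < pseudo_data_ratio:
        # too little pseudo data: keep fewer supervised shards
        supervised_len = int(pseudo_len * (1 - pseudo_data_ratio) / pseudo_data_ratio)
    elif cur_ratio > pseudo_data_ratio:
        # too much pseudo data: keep fewer pseudo shards
        pseudo_len = int(supervised_len * pseudo_data_ratio / (1 - pseudo_data_ratio))
    return pseudo_len, supervised_len

# made-up counts: 1000 pseudo shards, 1200 supervised shards, target ratio 0.75
print(balance_counts(1000, 1200, 0.75))  # -> (1000, 333)
```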
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import tarfile -import time -import json - - -def get_args(): - parser = argparse.ArgumentParser(description='generate filter pseudo label') - parser.add_argument('--dir_num', required=True, help='split directory number') - parser.add_argument('--cer_hypo_dir', required=True, - help='prefix for cer_hypo_dir') - parser.add_argument('--utter_time_file', required=True, - help='the json file that contains audio time infos ') - parser.add_argument('--cer_hypo_threshold', required=True, type=float, - help='the cer-hypo threshold used to filter') - parser.add_argument('--speak_rate_threshold', type=float, - help='the cer threshold we use to filter') - parser.add_argument('--dir', required=True, help='dir for the experiment ') - # output untar and tar - parser.add_argument('--untar_dir', required=True, - help='the output path, ' - 'eg: data/train/wenet_untar_cer_hypo_nst1/') - parser.add_argument('--tar_dir', required=True, - help='the tar file path, ' - 'eg: data/train/wenet_tar_cer_hypo_leq_10_nst1/') - parser.add_argument('--wav_dir', required=True, - help='dir to store wav files, ' - 'eg "data/train/wenet_1k_untar/"') - parser.add_argument('--start_tar_id', default=0 , type=int, - help='the initial tar id (for debugging)') - args = parser.parse_args() - return args - - -def make_tarfile(output_filename, source_dir): - with tarfile.open(output_filename, "w") as tar: - tar.add(source_dir, arcname=os.path.basename(source_dir)) - - -def main(): - args = get_args() - dir_num = args.dir_num - dir_name = args.dir - output_dir = args.untar_dir - cer_hypo_threshold = args.cer_hypo_threshold - speak_rate_threshold = args.speak_rate_threshold - utter_time_file = args.utter_time_file - tar_dir = args.tar_dir - wav_dir = args.wav_dir - start_tar_id = args.start_tar_id - os.makedirs(tar_dir, exist_ok=True) - os.makedirs(output_dir, exist_ok=True) - cer_hypo_name = args.cer_hypo_dir - print("start tar id is", start_tar_id) - print("make dirs") - - utter_time_enable = True - dataset = "wenet" - - utter_time = {} - if utter_time_enable: - - if dataset == "wenet": - print("wenet") - with open(utter_time_file, encoding='utf-8') as fh: - utter_time = json.load(fh) - - if dataset == "aishell2": - aishell2_jason = utter_time_file - print("aishell2") - with open(aishell2_jason, "r", encoding="utf-8") as f: - for line in f: - data = json.loads(line) - data_audio = data["audio_filepath"] - t_id = data_audio.split("/")[-1].split(".")[0] - data_duration = data["duration"] - utter_time[t_id] = data_duration - - print(time.time(), "start time ") - cer_dict = {} - print("dir_num = ", dir_num) - cer_hypo_path = dir_name + "/Hypo_LM_diff10/" + cer_hypo_name - cer_hypo_path = cer_hypo_path + "_" + dir_num + "/wer" - with open(cer_hypo_path, 'r', encoding="utf-8") as reader: - data = reader.readlines() - - for i in range(len(data)): - line = data[i] - if line[:3] == 'utt': - wer_list = data[i + 1].split() - wer_pred_lm = float(wer_list[1]) - n_hypo = int(wer_list[3].split("=")[1]) - - utt_list = line.split() - lab_list = data[i + 2].split() - rec_list = data[i + 3].split() - - utt_id = 
utt_list[1] - pred_no_lm = "".join(lab_list[1:]) - pred_lm = "".join(rec_list[1:]) - prediction = "".join(lab_list[1:]) - - if utter_time_enable: - - utt_time = utter_time[utt_id] - - cer_dict[utt_id] = [pred_no_lm, pred_lm, wer_pred_lm, - utt_time, n_hypo, prediction] - else: - cer_dict[utt_id] = [pred_no_lm, pred_lm, - wer_pred_lm, -1, -1, prediction] - - c = 0 - cer_preds = [] - uttr_len = [] - speak_rates = [] - num_lines = 0 - data_filtered = [] - - for key, item in cer_dict.items(): - - cer_pred = item[2] - speak_rate = item[4] / item[3] # char per second - - if cer_pred <= cer_hypo_threshold and speak_rate > speak_rate_threshold: - - num_lines += 1 - c += 1 - cer_preds.append(cer_pred) - uttr_len.append(item[4]) - speak_rates.append(speak_rate) - pred = item[1] - utt_id = key - filtered_line = [utt_id, pred] - data_filtered.append(filtered_line) - - num_uttr = 1000 - len_data = len(data_filtered) - print("total sentences after filter ") - cur_id = start_tar_id * 1000 - end_id = cur_id + num_uttr - if cur_id < len_data < end_id: - end_id = len_data - tar_id = start_tar_id - - not_exist = [] - while end_id <= len_data: - - tar_s = str(tar_id) - diff = 6 - len(tar_s) - for _ in range(diff): - tar_s = "0" + tar_s - - out_put_dir = output_dir + "dir" + str(dir_num) - out_put_dir = out_put_dir + "_" + "tar" + tar_s + "/" - os.makedirs(out_put_dir, exist_ok=True) - - for i in range(cur_id, end_id): - print("dir:", dir_num, ", " "tar: ", tar_id, - ", ", "progress:", i / len_data) - - t_id, utter = data_filtered[i] - - output_path = out_put_dir + t_id + ".txt" - wav_path = wav_dir + t_id + ".wav" - print(wav_path) - wav_exist = os.path.exists(wav_path) - if wav_exist: - # update .txt - with open(output_path, "w", encoding="utf-8") as writer: - writer.write(utter) - # update .wav - os.system("cp" + " " + wav_path + " " - + out_put_dir + t_id + ".wav") - else: - print(" wav does not exists ! ", wav_path) - not_exist.append(wav_path) - - tar_file_name = tar_dir + "dir" + str(dir_num) + "_" + tar_s + ".tar" - # tar the dir - - make_tarfile(tar_file_name, out_put_dir) - # update index - tar_id += 1 - cur_id += num_uttr - end_id += num_uttr - - if cur_id < len_data < end_id: - end_id = len_data - - print("end, now removing untar files for saving storge space.") - print("rm -rf" + " " + out_put_dir[:-1]) - os.system("rm -rf" + " " + out_put_dir[:-1]) - print("remove done") - - print("There are ", len(not_exist), "wav files not exist") - print(not_exist) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/local/get_wav_labels.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/local/get_wav_labels.py deleted file mode 100644 index fb0c5c2b0f66f274680b95b0e872f04886f2c7ca..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/local/get_wav_labels.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
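The filtering loop above keeps an utterance only when the CER between its with-LM and without-LM hypotheses is below `--cer_hypo_threshold` and its speaking rate (characters per second) is above `--speak_rate_threshold`. A condensed restatement of that predicate; the thresholds shown are the defaults from `run_nst.sh`, and the sample utterance numbers are made up:

```python
def keep_pseudo_label(cer_hypo: float, num_chars: int, duration_s: float,
                      cer_hypo_threshold: float = 10.0,
                      speak_rate_threshold: float = 0.0) -> bool:
    """Accept an utterance when the hypo-vs-hypo CER is small and chars/second is above the floor."""
    speak_rate = num_chars / duration_s
    return cer_hypo <= cer_hypo_threshold and speak_rate > speak_rate_threshold

# made-up utterance: 9.2% CER between LM and no-LM hypotheses, 42 chars spoken in 6.1 s
print(keep_pseudo_label(9.2, 42, 6.1))  # True
```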
-# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse - - -def get_args(): - parser = argparse.ArgumentParser(description='sum up prediction wer') - parser.add_argument('--job_num', type=int, default=8, - help='number of total split dir') - parser.add_argument('--dir_split', required=True, - help='the path to the data_list dir ' - 'eg data/train/wenet1k_good_split_60/') - parser.add_argument('--label', type=int, default=0, - help='if ture, label file will also be considered.') - parser.add_argument('--hypo_name', type=str, required=True, - help='the hypothesis path. eg. /hypothesis_0.txt ') - parser.add_argument('--wav_dir', type=str, required=True, - help='the wav dir path. eg. data/train/wenet_1k_untar/ ') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - data_list_dir = args.dir_split - num_lists = args.job_num - hypo = args.hypo_name - # wav_dir is the directory where your pair of ID.scp - # (the audio file ) and ID.txt (the optional label file ) file stored. - # We assumed that you have generated this dir in data processing steps. - wav_dir = args.wav_dir - label = args.label - - print("data_list_path is", data_list_dir) - print("num_lists is", num_lists) - print("hypo is", hypo) - print("wav_dir is", wav_dir) - - i = num_lists - c = 0 - hypo_path = data_list_dir + "data_sublist" + str(i) + hypo - output_wav = data_list_dir + "data_sublist" + str(i) + "/wav.scp" - output_label = data_list_dir + "data_sublist" + str(i) + "/label.txt" - # bad lines are just for debugging - output_bad_lines = data_list_dir + "data_sublist" + str(i) + "/bad_line.txt" - - with open(hypo_path, 'r', encoding="utf-8") as reader: - hypo_lines = reader.readlines() - - wavs = [] - labels = [] - bad_files = [] - for x in hypo_lines: - c += 1 - file_id = x.split()[0] - - label_path = wav_dir + file_id + ".txt" - wav_path = wav_dir + file_id + ".wav\n" - wav_line = file_id + " " + wav_path - wavs.append(wav_line) - if label: - try: - with open(label_path, 'r', encoding="utf-8") as reader1: - label_line = reader1.readline() - except OSError as e: - bad_files.append(label_path) - - label_line = file_id + " " + label_line + "\n" - labels.append(label_line) - - with open(output_wav, 'w', encoding="utf-8") as writer2: - for wav in wavs: - writer2.write(wav) - with open(output_bad_lines, 'w', encoding="utf-8") as writer4: - for line in bad_files: - writer4.write(line) - if label: - with open(output_label, 'w', encoding="utf-8") as writer3: - for label in labels: - writer3.write(label) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/local/split_data_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/local/split_data_list.py deleted file mode 100644 index 17d507cb79ed1c4e25cdbd3d59a0eeb93000f0d8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/local/split_data_list.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
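`local/get_wav_labels.py` above pairs each utterance id found in a hypothesis file with its `<id>.wav` under `wav_dir` to produce a Kaldi-style `wav.scp`. A stripped-down sketch of that pairing; the paths in the usage comment are hypothetical and only mirror the sublist layout used by the recipe:

```python
import os

def write_wav_scp(hypo_path: str, wav_dir: str, out_scp: str):
    """For every utterance id in a hypothesis file, emit a '<id> <path>.wav' line."""
    with open(hypo_path, encoding="utf-8") as reader, \
         open(out_scp, "w", encoding="utf-8") as writer:
        for line in reader:
            if not line.strip():
                continue
            utt_id = line.split()[0]  # hypothesis lines start with the utterance id
            writer.write(f"{os.path.join(wav_dir, utt_id)}.wav\n".join([utt_id + " ", ""]))

# hypothetical sublist layout mirroring the script above
# write_wav_scp("data/train/wenet_split_60_test/data_sublist0/hypothesis_nst0.txt",
#               "data/train/wenet_1k_untar/",
#               "data/train/wenet_split_60_test/data_sublist0/wav.scp")
```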
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import argparse - - -def get_args(): - parser = argparse.ArgumentParser(description='') - parser.add_argument('--job_nums', type=int, default=8, - help='number of total split jobs') - parser.add_argument('--data_list_path', required=True, - help='the path to the data.list file') - parser.add_argument('--output_dir', required=True, - help='path to output dir, ' - 'eg --output_dir=data/train/aishell_split_60') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - data_list_path = args.data_list_path - num_lists = args.job_nums - output_dir = args.output_dir - - print("data_list_path is", data_list_path) - print("num_lists is", num_lists) - print("output_dir is", output_dir) - os.makedirs(output_dir, exist_ok=True) - - with open(data_list_path, 'r', encoding="utf-8") as reader: - data_list_we = reader.readlines() - - # divide data.list equally - len_d = int(len(data_list_we) / num_lists) - rest_lines = data_list_we[num_lists * len_d:] - rest_len = len(rest_lines) - print("total num of lines", len(data_list_we) , "rest len is", rest_len) - - # generate N sublist - for i in range(num_lists): - print("current dir num", i) - out_put_sub_dir = output_dir + "/" + "data_sublist" + str(i) + "/" - os.makedirs(out_put_sub_dir, exist_ok=True) - output_list = out_put_sub_dir + "data_list" - - with open(output_list, 'w', encoding="utf-8") as writer: - - new_list = data_list_we[i * len_d: (i + 1) * len_d] - if i < rest_len: - new_list.append(rest_lines[i]) - for x in new_list: - # output list - writer.write(x) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/path.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/path.sh deleted file mode 100644 index 5ddca76cc23a90f320dd95fd262c345dc700aa04..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export WENET_DIR=$PWD/../../.. -export BUILD_DIR=${WENET_DIR}/runtime/server/x86/build -export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix -export PATH=$PWD:${BUILD_DIR}:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/run.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/run.sh deleted file mode 100644 index 258f5061f314de6347d9418ccc85035cbf51074d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/run.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
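`local/split_data_list.py` above simply cuts `data.list` into `--job_nums` nearly equal sublists so that each inference job can work on its own chunk in parallel. A compact sketch of the same bookkeeping:

```python
def split_evenly(lines, num_jobs):
    """Divide data.list lines into num_jobs sublists whose sizes differ by at most one."""
    base = len(lines) // num_jobs
    rest = lines[num_jobs * base:]
    sublists = [lines[i * base:(i + 1) * base] for i in range(num_jobs)]
    for i, extra in enumerate(rest):  # hand the leftover lines out one per sublist
        sublists[i].append(extra)
    return sublists

parts = split_evenly([f"utt{i}" for i in range(10)], 3)
print([len(p) for p in parts])  # [4, 3, 3] -- 10 lines spread over 3 jobs
```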
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -iter_num=2 -stage=1 -stop_stage=1 -pseudo_data_ratio=0.75 -dir=exp/conformer_test_fully_supervised -data_list=data_aishell.list -supervised_data_list=data_aishell.list -unsupervised_data_list=wenet_1khr.list -dir_split=wenet_split_60_test/ -out_data_list=data/train/wenet_1khr_nst0.list -num_split=1 -. tools/parse_options.sh || exit 1; - -# Stage 1 trains the initial teacher and generates initial pseudo-labels. -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - echo "******** stage 1 training the intial teacher ********" - bash run_nst.sh --dir $dir \ - --data_list $data_list \ - --supervised_data_list $supervised_data_list \ - --unsupervised_data_list $unsupervised_data_list \ - --dir_split $dir_split\ - --out_data_list $out_data_list \ - --enable_nst 0 \ - --pseudo_data_ratio pseudo_data_ratio \ - --num_split $num_split - -fi - -# Stage 2 trains the nst iterations. -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - - for ((i = 0; i < $iter_num; ++i)); do - { - echo "******** stage 2 training nst iteration number $i ********" - bash run_nst.sh --dir exp/conformer_nst${i+1} \ - --supervised_data_list data_aishell.list \ - --data_list wenet_1khr_nst${i}.list \ - --enable_nst 1 \ - --job_num 0 \ - --num_split $num_split \ - --hypo_name hypothesis_nst${i+1}.txt \ - --untar_dir wenet_1khr_untar_nst${i+1}/ \ - --tar_dir wenet_1khr_tar_nst${i+1}/ \ - --out_data_list wenet_1khr_nst${i+1}.list \ - --pseudo_data_ratio $pseudo_data_ratio - - } - done - -fi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/run_nst.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/run_nst.sh deleted file mode 100644 index 877d55dddc3c8d2f7f43e1acf24d1d8a1785b3c6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/NST/run_nst.sh +++ /dev/null @@ -1,409 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -# This is an augmented version of aishell-1 "run.sh" to make the code compatible with noisy student training - -. ./path.sh || exit 1; - -# Use this to control how many gpu you use, It's 1-gpu training if you specify -# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. 
More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO -stage=1 # start from 0 if you need to start from data preparation -stop_stage=8 - -# here are extra parameters used in NST -cer_out_dir="" -dir="" -supervised_data_list="" -checkpoint= -unsupervised_data_list="" -data_list="" - -hypo_name="" -out_data_list="" -#parameters with default values: -label=0 -average_num=30 -nj=16 -num_split=1 -cer_hypo_threshold=10 -speak_rate_threshold=0 -label_file="label.txt" -utter_time_file="utter_time.json" -enable_nst=1 -job_num=0 -dir_split="wenet_split_60_test/" -hypo_name="hypothesis_nst${job_num}.txt" -wav_dir="data/train/wenet_1k_untar/" -tar_dir="data/train/wenet_1khr_tar/" -untar_dir="data/train/wenet_1khr_untar/" -cer_hypo_dir="wenet_cer_hypo" -cer_label_dir="wenet_cer_label" -pseudo_data_ratio=0.75 - -# The num of machines(nodes) for multi-machine training, 1 is for one machine. -# NFS is required if num_nodes > 1. - -num_nodes=1 - -# The rank of each node or machine, which ranges from 0 to `num_nodes - 1`. -# You should set the node_ranHk=0 on the first machine, set the node_rank=1 -# on the second machine, and so on. -node_rank=0 -dict=data/dict/lang_char.txt - -# data_type can be `raw` or `shard`. Typically, raw is used for small dataset, -# `shard` is used for large dataset which is over 1k hours, and `shard` is -# faster on reading data and training. -data_type=shard -num_utts_per_shard=1000 -train_set=train -train_config=conf/train_conformer.yaml -cmvn=true -average_checkpoint=true -target_pt=80 -decode_checkpoint=$dir/$target_pt.pt - -# here we only use attention_rescoring for NST -decode_modes="attention_rescoring" - -. tools/parse_options.sh || exit 1; - -# print the settings -echo "setting for this run:" -echo "dir is ${dir}" -echo "data list is ${data_list}" -echo "job_num is ${job_num}" -echo "cer_out_dir is ${cer_out_dir}" -echo "average_num is ${average_num}" -echo "checkpoint is ${checkpoint} " -echo "enable_nst is ${enable_nst} " - -# we assumed that you have finished the data pre-process steps from -1 to 3 in aishell1/s0/run.sh . -# You can modify the "--train_data_supervised" to match your supervised data list. -# Here i used wenetspeech as the unsupervised data, you can run the data pre-process steps from -1 to 3 in -# wenetspeech/s0/run.sh ; you can modify "--train_data_supervised" to match your unsupervised data list. -# you can follow this process to generate your own dataset. -# I have also included my code for extracting data in local/... - -# stage 1 is for training -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - echo "********step 1 start time : $now ********" - mkdir -p $dir - # You have to rm `INIT_FILE` manually when you resume or restart a - # multi-machine training. - rm $dir/ddp_init - INIT_FILE=$dir/ddp_init - init_method=file://$(readlink -f $INIT_FILE) - echo "$0: init method is $init_method" - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - # Use "nccl" if it works, otherwise use "gloo" - dist_backend="gloo" - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" - - # the global_cmvn file need to be calculated by combining both supervised/unsupervised datasets, - # and it should be positioned at data/${train_set}/global_cmvn . 
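The comment above notes that `global_cmvn` must be computed over the supervised and unsupervised sets together. If per-set statistics already exist, they can be combined by summing the accumulators. The sketch below assumes the JSON layout that `tools/compute_cmvn_stats.py` typically writes (summed `mean_stat` and `var_stat` plus a `frame_num` count); check the keys against your actual stats file, and note that the paths are hypothetical.

```python
import json

def merge_cmvn_stats(path_a: str, path_b: str, out_path: str):
    """Combine two global CMVN statistics files by summing their accumulators."""
    with open(path_a, encoding="utf-8") as f:
        a = json.load(f)
    with open(path_b, encoding="utf-8") as f:
        b = json.load(f)
    merged = {
        "mean_stat": [x + y for x, y in zip(a["mean_stat"], b["mean_stat"])],
        "var_stat": [x + y for x, y in zip(a["var_stat"], b["var_stat"])],
        "frame_num": a["frame_num"] + b["frame_num"],
    }
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(merged, f)

# hypothetical paths: stats computed separately on the supervised and unsupervised sets
# merge_cmvn_stats("data/train_supervised/global_cmvn",
#                  "data/train_unsupervised/global_cmvn",
#                  "data/train/global_cmvn")
```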
- cmvn_opts= - $cmvn && cp data/${train_set}/global_cmvn $dir/global_cmvn - $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" - - # train.py rewrite $train_config to $dir/train.yaml with model input - # and output dimension, and $dir/train.yaml will be used for inference - # and export. - echo "checkpoint is " ${checkpoint} - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - echo "gpu number $i " - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. - - rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type $data_type \ - --symbol_table $dict \ - --train_data data/$train_set/$data_list \ - --cv_data data/dev/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ - --ddp.dist_backend $dist_backend \ - --num_workers 1 \ - $cmvn_opts \ - --pin_memory - } & - done - wait -fi - -# In stage 2, we get the averaged final checkpoint and calculate the test and dev accuracy -# please make sure your test and valid data.list are in the proper location. -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # Test model, please specify the model you want to test by --checkpoint - # stage 5 we test with aishell dataset, - echo "******** step 2 start time : $now ********" - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$dir/avg_${average_num}.pt - echo "do model average and final checkpoint is $decode_checkpoint" - python wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $dir \ - --num ${average_num} \ - --val_best - fi - - # export model - python wenet/bin/export_jit.py \ - --config $dir/train.yaml \ - --checkpoint $dir/avg_${average_num}.pt \ - --output_file $dir/final.zip \ - --output_quant_file $dir/final_quant.zip - # Please specify decoding_chunk_size for unified streaming and - # non-streaming model. The default value is -1, which is full chunk - # for non-streaming inference. 
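Stage 2 above exports the averaged checkpoint with `wenet/bin/export_jit.py`, so `final.zip` is a TorchScript archive that can be reloaded outside the training code. A minimal check, with a hypothetical experiment directory:

```python
import torch

# hypothetical experiment dir; stage 2 writes $dir/final.zip and $dir/final_quant.zip
model = torch.jit.load("exp/conformer_test_fully_supervised/final.zip", map_location="cpu")
model.eval()
print(type(model))  # a torch.jit.ScriptModule wrapping the trained ASR model
```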
- decoding_chunk_size= - ctc_weight=0.5 - reverse_weight=0.0 - - # test_wer - for mode in ${decode_modes}; do - { - #test_dir=$dir/test_${mode}_${target_pt}pt # for target pt - test_dir=$dir/test_${mode}${average_num}pt # for average pt - mkdir -p $test_dir - python wenet/bin/recognize.py --gpu 0 \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type $data_type \ - --test_data data/test/data.list \ - --checkpoint $decode_checkpoint \ - --beam_size 10 \ - --batch_size 1 \ - --penalty 0.0 \ - --dict $dict \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --result_file $test_dir/text \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - echo "before compute-wer" - python tools/compute-wer.py --char=1 --v=1 \ - data/test/text $test_dir/text > $test_dir/wer - } & - done - -# dev_wer - for mode in ${decode_modes}; do - { - #test_dir=$dir/test_${mode}_${target_pt}pt # for target pt - dev_dir=$dir/dev_${mode}${average_num}pt # for average pt - mkdir -p $dev_dir - python wenet/bin/recognize.py --gpu 0 \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type $data_type \ - --test_data data/dev/data.list \ - --checkpoint $decode_checkpoint \ - --beam_size 10 \ - --batch_size 1 \ - --penalty 0.0 \ - --dict $dict \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --result_file $dev_dir/text \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - echo "before compute-wer" - python tools/compute-wer.py --char=1 --v=1 \ - data/dev/text $dev_dir/text > $dev_dir/wer - } & - done - wait -fi - - -# split the (unsupervised) datalist into N sublists, where N depends on the number of available cpu in your cluster. -# when making inference, we compute N sublist in parallel. -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ] && [ ${enable_nst} -eq 0 ]; then - echo "********step 3 start time : $now ********" - python local/split_data_list.py \ - --job_nums $num_split \ - --data_list_path data/train/$unsupervised_data_list \ - --output_dir data/train/$dir_split - -fi - - -# stage 4 will perform inference without language model on the given sublist(job num) -# here is example usages: -# bash run_nst.sh --stage 4 --stop-stage 4 --job_num $i --dir_split data/train/wenet_4khr_split_60/ -# --hypo_name hypothesis_0.txt --dir exp/conformer_aishell2_wenet4k_nst4 -# You need to specify the "job_num" n (n <= N), "dir_split" which is the dir path for split data -# "hypo_name" is the path for output hypothesis and "dir" is the path where we train and store the model. -# For each gpu, you can run with different job_num to perform data-wise parallel computing. -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - echo "********step 4 start time : $now ********" - # we assume you have run stage 2 so that avg_${average_num}.pt exists - decode_checkpoint=$dir/avg_${average_num}.pt - # Please specify decoding_chunk_size for unified streaming and - # non-streaming model. The default value is -1, which is full chunk - # for non-streaming inference. 
- decoding_chunk_size= - ctc_weight=0.5 - reverse_weight=0.0 - mode="attention_rescoring" - gpu_id=0 - echo "job number ${job_num} " - echo "data_list dir is ${dir_split}" - echo "hypo name is " $hypo_name - echo "dir is ${dir}" - - python wenet/bin/recognize.py --gpu $gpu_id \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type $data_type \ - --test_data data/train/${dir_split}data_sublist${job_num}/data_list \ - --checkpoint $decode_checkpoint \ - --beam_size 10 \ - --batch_size 1 \ - --penalty 0.0 \ - --dict $dict \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --result_file data/train/${dir_split}data_sublist${job_num}/${hypo_name} \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - echo "end time : $now" - -fi - - -# Generate wav.scp file and label.txt file(optional) for each sublist we generated in step 3. -# the wav_dir should be prepared in data processing step as we mentioned. -#You need to specify the "job_num" n (n <= N), "dir_split" which is the dir path for split data, -# "hypo_name" is the path for output hypothesis and "dir" is the path where we train and store the model. -# wav_dir is the directory that stores raw wav file and possible labels. -# if you have label for unsupervised dataset, set label = 1 other wise keep it 0 -# For each gpu or cpu, you can run with different job_num to perform data-wise parallel computing. -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ] && [ ${enable_nst} -eq 0 ]; then - echo "********step 5 start time : $now ********" - python local/get_wav_labels.py \ - --dir_split data/train/${dir_split} \ - --hypo_name /$hypo_name \ - --wav_dir $wav_dir\ - --job_num $job_num \ - --label $label -fi - -# Calculate cer-hypo between hypothesis with and without language model. -# We assumed that you have finished language model -# training using the wenet aishell-1 pipline. (You should have data/lang/words.txt , data/lang/TLG.fst files ready.) -# Here is an exmaple usage: -# bash run_nst.sh --stage 5 --stop-stage 5 --job_num n --dir_split data/train/wenet1k_redo_split_60/ -# --cer_hypo_dir wenet1k_cer_hypo --hypo_name hypothesis_nst.txt --dir exp/conformer_no_filter_redo_nst6 -# You need to specify the "job_num" n (n <= N), "dir_split" which is the dir path for split data -# "hypo_name" is the path for output hypothesis and "dir" is the path where we train and store the model. -# For each gpu, you can run with different job_num to perform data-wise parallel computing. -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - echo "********step 6 start time : $now ********" - chunk_size=-1 - mode="attention_rescoring" - test_dir=$dir/test_${mode}_${job_num} - now=$(date +"%T") - echo "start time : $now" - echo "GPU dir is " $job_num "dir_split is " data/train/${dir_split} - echo "nj is" $nj "hypo_file is" $hypo_name "cer out is" $cer_hypo_dir "lm is 4gram" - echo "dir is " $dir - if [ ! 
-f data/train/${dir_split}data_sublist${job_num}/${hypo_name} ]; then - echo "text file does not exists" - exit 1; - fi - - ./tools/decode.sh --nj 16 \ - --beam 15.0 --lattice_beam 7.5 --max_active 7000 \ - --blank_skip_thresh 0.98 --ctc_weight 0.5 --rescoring_weight 1.0 \ - --chunk_size $chunk_size \ - --fst_path data/lang_test/TLG.fst \ - data/train/${dir_split}data_sublist${job_num}/wav.scp \ - data/train/${dir_split}data_sublist${job_num}/${hypo_name} $dir/final.zip \ - data/lang_test/words.txt $dir/Hypo_LM_diff10/${cer_hypo_dir}_${job_num} - now=$(date +"%T") - echo "end time : $now" -fi - -# (optional, only run this stage if you have true label for unsupervised data.) -# Calculate cer-label between true label and hypothesis with language model. -# You can use the output cer to evaluate NST's performance. -if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ] && [ ${label} -eq 1 ]; then - echo "********step 7 start time : $now ********" - chunk_size=-1 - mode="attention_rescoring" - test_dir=$dir/test_${mode}_${job_num} - now=$(date +"%T") - echo "start time : $now" - echo "GPU dir is " $job_num "dir_split is " data/train/${dir_split} - echo "nj is" $nj "label_file is" $label_file "cer out is" $cer_label_dir "lm is 4gram" - echo "dir is " $dir - echo "label_file " data/train/${dir_split}data_sublist${job_num}/${label_file} - if [ ! -f data/train/${dir_split}data_sublist${job_num}/${label_file} ]; then - echo "text file does not exists" - exit 1; - fi - - ./tools/decode.sh --nj 16 \ - --beam 15.0 --lattice_beam 7.5 --max_active 7000 \ - --blank_skip_thresh 0.98 --ctc_weight 0.5 --rescoring_weight 1.0 \ - --chunk_size $chunk_size \ - --fst_path data/lang_test/TLG.fst \ - data/train/${dir_split}data_sublist${job_num}/wav.scp \ - data/train/${dir_split}data_sublist${job_num}/${label_file} $dir/final.zip \ - data/lang_test/words.txt $dir/Hypo_LM_diff10/${cer_label_dir}_${job_num} - now=$(date +"%T") - echo "end time : $now" -fi - - -if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then - echo "********step 8 start time : $now ********" - python local/generate_filtered_pseudo_label.py \ - --cer_hypo_dir $cer_hypo_dir \ - --untar_dir data/train/$untar_dir \ - --wav_dir $wav_dir \ - --dir_num $job_num \ - --cer_hypo_threshold $cer_hypo_threshold \ - --speak_rate_threshold $speak_rate_threshold \ - --dir $dir \ - --tar_dir data/train/$tar_dir \ - --utter_time_file $utter_time_file - - python local/generate_data_list.py \ - --tar_dir data/train/$tar_dir \ - --out_data_list data/train/$out_data_list \ - --supervised_data_list data/train/$supervised_data_list \ - --pseudo_data_ratio $pseudo_data_ratio - -fi - - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/README.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/README.md deleted file mode 100644 index ff2ec88c7da99adac6f7bbaafb5d445d644cf70e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/README.md +++ /dev/null @@ -1,85 +0,0 @@ -# Performance Record - -## Conformer Result - -* Feature info: using fbank feature, dither, cmvn, online speed perturb -* Training info: lr 0.001, batch size 8, 8 gpu, acc_grad 1, 100 epochs, dither 0.1 -* Training weight info: transducer_weight 0.75, ctc_weight 0.1, attention_weight 0.15, average_num 10 -* Predictor type: lstm - -| decoding mode | CER | -|---------------------------|-------| -| rnnt greedy search | 5.24 | - -* after 165 epochs and avg 30 - -| decoding mode | 
-| decoding mode             | CER   |
-|---------------------------|-------|
-| rnnt greedy search        | 5.02  |
-| ctc prefix beam search    | 5.17  |
-| ctc prefix beam + rescore | 4.48  |
-
-## Conformer Result
-
-* Feature info: using fbank feature, dither, cmvn, online speed perturb
-* Training info: lr 0.001, batch size 20, 8 gpu, acc_grad 1, 140 epochs, dither 0.1
-* Training weight info: transducer_weight 0.4, ctc_weight 0.2, attention_weight 0.4, average_num 10
-* Predictor type: lstm
-* Model link: https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/aishell/20220728_conformer_rnnt_exp.tar.gz
-
-| decoding mode                         | CER   |
-|---------------------------------------|-------|
-| rnnt greedy search                    | 4.88  |
-| rnnt beam search                      | 4.67  |
-| ctc prefix beam search                | 5.02  |
-| ctc prefix beam + rescore             | 4.51  |
-| ctc prefix beam + rnnt&attn rescore   | 4.45  |
-| rnnt prefix beam + rnnt&attn rescore  | 4.49  |
-
-
-## U2++ Conformer Result
-
-* Feature info: using fbank feature, dither, cmvn, online speed perturb
-* Training info: lr 0.001, batch size 4, 32 gpu, acc_grad 1, 360 epochs
-* Training weight info: transducer_weight 0.75, ctc_weight 0.1, reverse_weight 0.15, average_num 30
-* Predictor type: lstm
-
-| decoding mode/chunk size  | full  | 16    |
-|---------------------------|-------|-------|
-| rnnt greedy search        | 5.68  | 6.26  |
-
-## Pretrain
-* Pretrain model: https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/aishell/20210601_u2%2B%2B_conformer_exp.tar.gz
-* Feature info: using fbank feature, dither, cmvn, online speed perturb
-* Training info: lr 0.001, batch size 8, 8 gpu, acc_grad 1, 140 epochs
-* Training weight info: transducer_weight 0.4, ctc_weight 0.2, attention_weight 0.4, reverse_weight 0.3, average_num 30
-* Predictor type: lstm
-
-| decoding mode/chunk size    | full  | 16     |
-|-----------------------------|-------|--------|
-| rnnt greedy search          | 5.21  | 5.73   |
-| rnnt prefix beam            | 5.14  | 5.63   |
-| rnnt prefix beam + rescore  | 4.73  | 5.095  |
-
-
-## Training loss ablation study
-
-note:
-
-- if rnnt is checked, greedy means rnnt greedy search; likewise for beam
-
-- if rnnt is checked, rescoring means rnnt beam & attention rescoring
-
-- if only 'ctc & att' is checked, greedy means ctc greedy search; likewise for beam
-
-- if only 'ctc & att' (AED) is checked, rescoring means ctc beam & attention rescoring
-
-- what if the rnnt model does wenet-style search: coming soon
-
-| rnnt | ctc | att | greedy | beam | rescoring | fusion |
-|------|-----|-----|--------|------|-----------|--------|
-| ✔    | ✔   | ✔   | 4.88   | 4.67 | 4.45      | 4.49   |
-| ✔    | ✔   |     | 5.56   | 5.46 | /         | 5.40   |
-| ✔    |     | ✔   | 5.03   | 4.94 | 4.87      | /      |
-| ✔    |     |     | 5.64   | 5.59 | /         | /      |
-|      | ✔   | ✔   | 4.94   | 4.94 | 4.61      | /      |
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/conf/conformer_rnnt.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/conf/conformer_rnnt.yaml
deleted file mode 100644
index aeab0b180bc4904d32de6d01997e96c3f6ed9efd..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/conf/conformer_rnnt.yaml
+++ /dev/null
@@ -1,100 +0,0 @@
-# network architecture
-# encoder related
-encoder: conformer
-encoder_conf:
-    output_size: 256 # dimension of attention
-    attention_heads: 4
-    linear_units: 2048 # the number of units of position-wise feed forward
-    num_blocks: 12 # the number of encoder blocks
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    attention_dropout_rate: 0.1
-    input_layer: conv2d # encoder input type,
you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: true - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - - -joint_conf: - join_dim: 512 - prejoin_linear: True - postjoin_linear: false - joint_mode: 'add' - activation: 'tanh' - -predictor: rnn -predictor_conf: - embed_size: 256 - output_size: 256 - embed_dropout: 0.1 - hidden_size: 256 - num_layers: 2 - bias: true - rnn_type: 'lstm' - dropout: 0.1 - -decoder: bitransformer -decoder_conf: - attention_heads: 4 - dropout_rate: 0.1 - linear_units: 2048 - num_blocks: 3 - positional_dropout_rate: 0.1 - r_num_blocks: 3 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -# hybrid transducer+ctc+attention -model_conf: - transducer_weight: 0.75 - ctc_weight: 0.1 - attention_weight: 0.15 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 10 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 8 - -grad_clip: 4 -accum_grad: 1 -max_epoch: 140 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/conf/conformer_u2pp_rnnt.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/conf/conformer_u2pp_rnnt.yaml deleted file mode 100644 index 28a80d5f7f9f59be932ccc2cc8900d7ab397cf49..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/conf/conformer_u2pp_rnnt.yaml +++ /dev/null @@ -1,103 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 8 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - - -joint_conf: - join_dim: 512 - prejoin_linear: True - postjoin_linear: false - joint_mode: 'add' - activation: 'tanh' - -predictor: rnn -predictor_conf: - embed_size: 256 - output_size: 256 - embed_dropout: 0.1 - hidden_size: 256 - num_layers: 2 - bias: true - rnn_type: 'lstm' - dropout: 0.1 - -decoder: bitransformer -decoder_conf: - attention_heads: 4 - dropout_rate: 0.1 - linear_units: 2048 - num_blocks: 3 - positional_dropout_rate: 0.1 - r_num_blocks: 3 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -# hybrid transducer+ctc+attention -model_conf: - 
transducer_weight: 0.75 - ctc_weight: 0.1 - attention_weight: 0.15 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 10 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 4 - -grad_clip: 4 -accum_grad: 1 -max_epoch: 130 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/conf/example_embedding_predictor.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/conf/example_embedding_predictor.yaml deleted file mode 100644 index 6d15b2fc03c43c250f5588bb9c50d01b128a4ecd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/conf/example_embedding_predictor.yaml +++ /dev/null @@ -1,95 +0,0 @@ -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: true - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - - -joint_conf: - join_dim: 320 - prejoin_linear: true - postjoin_linear: false - joint_mode: 'add' - activation: 'tanh' - -predictor: embedding -predictor_conf: - embed_size: 320 - embed_dropout: 0.1 - n_head: 4 - history_size: 5 - bias: false - -decoder: bitransformer -decoder_conf: - attention_heads: 4 - dropout_rate: 0.1 - linear_units: 2048 - num_blocks: 3 - positional_dropout_rate: 0.1 - r_num_blocks: 3 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -# hybrid transducer+ctc+attention -model_conf: - transducer_weight: 0.4 - ctc_weight: 0.2 - attention_weight: 0.4 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 10 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 30 - - -grad_clip: 4 -accum_grad: 1 -max_epoch: 500 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.002 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/local/aishell_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/local/aishell_data_prep.sh
deleted file mode 100644
index fb4d5fb0adefb9e3e3ebeaa5ccb1a92562eb77c1..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/local/aishell_data_prep.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/bash
-
-# Copyright 2017 Xingyu Na
-# Apache 2.0
-
-. ./path.sh || exit 1;
-
-if [ $# != 2 ]; then
-  echo "Usage: $0 <audio-path> <text-path>"
-  echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript"
-  exit 1;
-fi
-
-aishell_audio_dir=$1
-aishell_text=$2/aishell_transcript_v0.8.txt
-
-train_dir=data/local/train
-dev_dir=data/local/dev
-test_dir=data/local/test
-tmp_dir=data/local/tmp
-
-mkdir -p $train_dir
-mkdir -p $dev_dir
-mkdir -p $test_dir
-mkdir -p $tmp_dir
-
-# data directory check
-if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then
-  echo "Error: $0 requires two directory arguments"
-  exit 1;
-fi
-
-# find wav audio files for train, dev and test resp.
-find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
-n=`cat $tmp_dir/wav.flist | wc -l`
-[ $n -ne 141925 ] && \
-  echo Warning: expected 141925 data files, found $n
-
-grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
-grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
-grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
-
-rm -r $tmp_dir
-
-# Transcriptions preparation
-for dir in $train_dir $dev_dir $test_dir; do
-  echo Preparing $dir transcriptions
-  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
-  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
-  tools/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt
-  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
-  tools/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
-  sort -u $dir/transcripts.txt > $dir/text
-done
-
-mkdir -p data/train data/dev data/test
-
-for f in wav.scp text; do
-  cp $train_dir/$f data/train/$f || exit 1;
-  cp $dev_dir/$f data/dev/$f || exit 1;
-  cp $test_dir/$f data/test/$f || exit 1;
-done
-
-echo "$0: AISHELL data preparation succeeded"
-exit 0;
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/local/aishell_train_lms.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/local/aishell_train_lms.sh
deleted file mode 100644
index 30ffb7973b3ddec4ef4c0f09c8184837cad768d6..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/local/aishell_train_lms.sh
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/bin/bash
-
-
-# To be run from one directory above this script.
-. ./path.sh
-
-text=data/local/lm/text
-lexicon=data/local/dict/lexicon.txt
-
-for f in "$text" "$lexicon"; do
-  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
-done
-
-# Check SRILM tools
-if ! which ngram-count > /dev/null; then
-  echo "srilm tools are not found, please download and install them from: "
-  echo "http://www.speech.sri.com/projects/srilm/download.html"
-  echo "Then add the tools to your PATH"
-  exit 1
-fi
-
-# This script takes no arguments. It assumes you have already run
-# aishell_data_prep.sh.
-# It takes as input the files -# data/local/lm/text -# data/local/dict/lexicon.txt -dir=data/local/lm -mkdir -p $dir - - -cleantext=$dir/text.no_oov - -cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } - {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ - > $cleantext || exit 1; - -cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ - sort -nr > $dir/word.counts || exit 1; - -# Get counts from acoustic training transcripts, and add one-count -# for each word in the lexicon (but not silence, we don't want it -# in the LM-- we'll add it optionally later). -cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ - cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ - sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; - -cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo ""; echo "" ) > $dir/wordlist - -heldout_sent=10000 # Don't change this if you want result to be comparable with - # kaldi_lm results -mkdir -p $dir -cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $dir/heldout -cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $dir/train - -ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \ - -map-unk "" -kndiscount -interpolate -lm $dir/lm.arpa -ngram -lm $dir/lm.arpa -ppl $dir/heldout diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/local/download_and_untar.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/local/download_and_untar.sh deleted file mode 100644 index 58a278241d75caeba25ba4b17d186912d0d724ec..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/local/download_and_untar.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/bash - -# Copyright 2014 Johns Hopkins University (author: Daniel Povey) -# 2017 Xingyu Na -# Apache 2.0 - -remove_archive=false - -if [ "$1" == --remove-archive ]; then - remove_archive=true - shift -fi - -if [ $# -ne 3 ]; then - echo "Usage: $0 [--remove-archive] " - echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell" - echo "With --remove-archive it will remove the archive after successfully un-tarring it." - echo " can be one of: data_aishell, resource_aishell." -fi - -data=$1 -url=$2 -part=$3 - -if [ ! -d "$data" ]; then - echo "$0: no such directory $data" - exit 1; -fi - -part_ok=false -list="data_aishell resource_aishell" -for x in $list; do - if [ "$part" == $x ]; then part_ok=true; fi -done -if ! $part_ok; then - echo "$0: expected to be one of $list, but got '$part'" - exit 1; -fi - -if [ -z "$url" ]; then - echo "$0: empty URL base." - exit 1; -fi - -if [ -f $data/$part/.complete ]; then - echo "$0: data part $part was already successfully extracted, nothing to do." - exit 0; -fi - -# sizes of the archive files in bytes. -sizes="15582913665 1246920" - -if [ -f $data/$part.tgz ]; then - size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}') - size_ok=false - for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done - if ! $size_ok; then - echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" - echo "does not equal the size of one of the archives." - rm $data/$part.tgz - else - echo "$data/$part.tgz exists and appears to be complete." - fi -fi - -if [ ! -f $data/$part.tgz ]; then - if ! which wget >/dev/null; then - echo "$0: wget is not installed." 
- exit 1; - fi - full_url=$url/$part.tgz - echo "$0: downloading data from $full_url. This may take some time, please be patient." - - cd $data - if ! wget --no-check-certificate $full_url; then - echo "$0: error executing wget $full_url" - exit 1; - fi -fi - -cd $data - -if ! tar -xvzf $part.tgz; then - echo "$0: error un-tarring archive $data/$part.tgz" - exit 1; -fi - -touch $data/$part/.complete - -if [ $part == "data_aishell" ]; then - cd $data/$part/wav - for wav in ./*.tar.gz; do - echo "Extracting wav from $wav" - tar -zxf $wav && rm $wav - done -fi - -echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" - -if $remove_archive; then - echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied." - rm $data/$part.tgz -fi - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/path.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/path.sh deleted file mode 100644 index 73fc1c56602086182f66201870e28d46a0cada55..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export WENET_DIR=$PWD/../../.. -export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build -export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix -export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/run.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/run.sh deleted file mode 100644 index e9a0640240d58c38f914ebb165caa01ca11f857d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/run.sh +++ /dev/null @@ -1,201 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -# 2022 Binbin Zhang(binbizha@qq.com) - -. ./path.sh || exit 1; - -# Use this to control how many gpu you use, It's 1-gpu training if you specify -# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" - -stage=0 # start from 0 if you need to start from data preparation -stop_stage=5 - -# The num of machines(nodes) for multi-machine training, 1 is for one machine. -# NFS is required if num_nodes > 1. -num_nodes=1 - -# The rank of each node or machine, which ranges from 0 to `num_nodes - 1`. -# You should set the node_rank=0 on the first machine, set the node_rank=1 -# on the second machine, and so on. -node_rank=0 -# The aishell dataset location, please change this to your own path -# make sure of using absolute path. DO-NOT-USE relatvie path! -data=/export/data/asr-data/OpenSLR/33/ -data_url=www.openslr.org/resources/33 - -nj=16 -dict=data/dict/lang_char.txt - -# data_type can be `raw` or `shard`. Typically, raw is used for small dataset, -# `shard` is used for large dataset which is over 1k hours, and `shard` is -# faster on reading data and training. -data_type=raw -num_utts_per_shard=1000 - -train_set=train -train_config=conf/conformer_u2pp_rnnt.yaml -cmvn=true -dir=exp/conformer_rnnt -checkpoint= - -# use average_checkpoint will get better result -average_checkpoint=true -decode_checkpoint=$dir/final.pt -average_num=30 -decode_modes="rnnt_beam_search" - -. 
tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - echo "stage -1: Data Download" - local/download_and_untar.sh ${data} ${data_url} data_aishell - local/download_and_untar.sh ${data} ${data_url} resource_aishell -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Data preparation - local/aishell_data_prep.sh ${data}/data_aishell/wav \ - ${data}/data_aishell/transcript -fi - - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # remove the space between the text labels for Mandarin dataset - for x in train dev test; do - cp data/${x}/text data/${x}/text.org - paste -d " " <(cut -f 1 -d" " data/${x}/text.org) \ - <(cut -f 2- -d" " data/${x}/text.org | tr -d " ") \ - > data/${x}/text - rm data/${x}/text.org - done - - tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \ - --in_scp data/${train_set}/wav.scp \ - --out_cmvn data/$train_set/global_cmvn -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - echo "Make a dictionary" - mkdir -p $(dirname $dict) - echo " 0" > ${dict} # 0 is for "blank" in CTC - echo " 1" >> ${dict} # must be 1 - tools/text2token.py -s 1 -n 1 data/train/text | cut -f 2- -d" " \ - | tr " " "\n" | sort | uniq | grep -a -v -e '^\s*$' | \ - awk '{print $0 " " NR+1}' >> ${dict} - num_token=$(cat $dict | wc -l) - echo " $num_token" >> $dict -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - echo "Prepare data, prepare required format" - for x in dev test ${train_set}; do - if [ $data_type == "shard" ]; then - tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \ - --num_threads 16 data/$x/wav.scp data/$x/text \ - $(realpath data/$x/shards) data/$x/data.list - else - tools/make_raw_list.py data/$x/wav.scp data/$x/text \ - data/$x/data.list - fi - done -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - mkdir -p $dir - # You have to rm `INIT_FILE` manually when you resume or restart a - # multi-machine training. - INIT_FILE=$dir/ddp_init - init_method=file://$(readlink -f $INIT_FILE) - echo "$0: init method is $init_method" - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - # Use "nccl" if it works, otherwise use "gloo" - dist_backend="gloo" - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" - cmvn_opts= - $cmvn && cp data/${train_set}/global_cmvn $dir - $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" - - # train.py rewrite $train_config to $dir/train.yaml with model input - # and output dimension, and $dir/train.yaml will be used for inference - # and export. - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type $data_type \ - --symbol_table $dict \ - --train_data data/$train_set/data.list \ - --cv_data data/dev/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ - --ddp.dist_backend $dist_backend \ - --num_workers 1 \ - $cmvn_opts \ - --pin_memory - } & - done - wait -fi - -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # Test model, please specify the model you want to test by --checkpoint - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$dir/avg_${average_num}.pt - echo "do model average and final checkpoint is $decode_checkpoint" - python wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $dir \ - --num ${average_num} \ - --val_best - fi - # Please specify decoding_chunk_size for unified streaming and - # non-streaming model. The default value is -1, which is full chunk - # for non-streaming inference. - decoding_chunk_size= - # only used in rescore mode for weighting different scores - rescore_ctc_weight=0.5 - rescore_transducer_weight=0.5 - rescore_attn_weight=0.5 - # only used in beam search, either pure beam search mode OR beam search inside rescoring - search_ctc_weight=0.3 - search_transducer_weight=0.7 - - reverse_weight=0.0 - for mode in ${decode_modes}; do - { - test_dir=$dir/test_${mode} - mkdir -p $test_dir - python wenet/bin/recognize.py --gpu 0 \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type $data_type \ - --test_data data/test/data.list \ - --checkpoint $decode_checkpoint \ - --beam_size 10 \ - --batch_size 1 \ - --penalty 0.0 \ - --dict $dict \ - --ctc_weight $rescore_ctc_weight \ - --transducer_weight $rescore_transducer_weight \ - --attn_weight $rescore_attn_weight \ - --search_ctc_weight $search_ctc_weight \ - --search_transducer_weight $search_transducer_weight \ - --reverse_weight $reverse_weight \ - --result_file $test_dir/text \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - python tools/compute-wer.py --char=1 --v=1 \ - data/test/text $test_dir/text > $test_dir/wer - } & - done - wait -fi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/alignment.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/alignment.sh deleted file mode 100644 index 64d860bb61761cadca750c9baf91eddb49e56728..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/alignment.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -stage=0 # start from 0 if you need to start from data preparation -stop_stage=0 - -nj=16 -feat_dir=raw_wav -dict=data/dict/lang_char.txt - -dir=exp/ -config=$dir/train.yaml -checkpoint= -checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt -config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml -set= -ali_format=$feat_dir/$set/format.data -ali_format=format.data -ali_result=$dir/ali - -. 
tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - nj=32 - # Prepare required data for ctc alignment - echo "Prepare data, prepare required format" - for x in $set; do - tools/format_data.sh --nj ${nj} \ - --feat-type wav --feat $feat_dir/$x/wav.scp \ - $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp - - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Test model, please specify the model you want to use by --checkpoint - python wenet/bin/alignment_deprecated.py --gpu -1 \ - --config $config \ - --input_file $ali_format \ - --checkpoint $checkpoint \ - --batch_size 1 \ - --dict $dict \ - --result_file $ali_result \ - -fi - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/analyze_dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/analyze_dataset.py deleted file mode 100644 index d4373b065c301972fe0164b6df3591166000acfc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/analyze_dataset.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Analyze Dataset, Duration/TextLength/Speed etc. - -Usage: -. ./path.sh && python3 tools/analyze_dataset.py \ - --data_type "shard" \ - --data_list data/test/data.list \ - --output_dir exp/analyze_test \ - --num_thread 32 -""" - -import os -import json -import math -import time -import numpy -import logging -import librosa -import tarfile -import argparse -import torchaudio -import multiprocessing - -from wenet.utils.file_utils import read_lists -from wenet.dataset.processor import AUDIO_FORMAT_SETS - - -def get_args(): - parser = argparse.ArgumentParser(description='Analyze dataset') - parser.add_argument('--data_type', - default='wav_scp', - choices=['wav_scp', 'raw', 'shard'], - help='dataset type') - parser.add_argument('--output_dir', type=str, - default="exp", help='write info to output dir') - parser.add_argument('--data_list', default=None, - help='used in raw/shard mode') - parser.add_argument('--wav_scp', default=None, - help='used in wav_scp mode') - parser.add_argument('--text', default=None, - help='used in wav_scp mode') - parser.add_argument('--num_thread', type=int, - default=4, help='number of threads') - args = parser.parse_args() - print(args) - return args - - -def analyze(datas, output_file, thread_id): - with open(output_file, "w", encoding='utf8') as f: - for i, data in enumerate(datas): - if type(data['wav']) is numpy.ndarray: - y, sample_rate = data['wav'], data['sample_rate'] - data['wav'] = "None" # NOTE(xcsong): Do not save wav. 
- elif type(data['wav'] is str): - y, sample_rate = librosa.load(data['wav'], sr=16000) - data['dur'] = len(y) / sample_rate - data['txt_length'] = len(data['txt']) - data['speed'] = data['txt_length'] / data['dur'] - # Trim the beginning and ending silence - _, index = librosa.effects.trim(y, top_db=30) - data['leading_sil'] = librosa.get_duration( - y=y[:index[0]], sr=16000) * 1000 if index[0] > 0 else 0 - data['trailing_sil'] = librosa.get_duration( - y=y[index[1]:], sr=16000) * 1000 if index[1] < len(y) else 0 - data_str = json.dumps(data, ensure_ascii=False) - f.write("{}\n".format(data_str)) - if thread_id == 0 and i % 100 == 0: - logging.info("\tThread-{}: processed {}/{}".format( - thread_id, i, len(datas))) - - -def read_tar(file): - try: - with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream: - prev_prefix = None - data = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - data['key'] = prev_prefix - if valid: - yield data - data = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - data['txt'] = file_obj.read().decode( - 'utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load( - file_obj) - # single channel - data['wav'] = waveform.numpy()[0, :] - data['sample_rate'] = sample_rate - else: - data[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning( - 'error: {} when parse {}'.format(ex, name)) - prev_prefix = prefix - # The last data in tar - if prev_prefix is not None: - data['key'] = prev_prefix - yield data - except Exception as ex: - logging.warning( - 'tar_file error: {} when processing {}'.format(ex, file)) - - -def main(): - start_time = time.time() - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.makedirs(args.output_dir, exist_ok=True) - os.makedirs(args.output_dir + "/partition", exist_ok=True) - datas = [[] for i in range(args.num_thread)] - - logging.info("Stage-1: Loading data.list OR wav.scp...") - if args.data_type == "shard": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - total = 0 - for line in lists: - for data in read_tar(line): - datas[total % args.num_thread].append(data) - total = total + 1 - elif args.data_type == "raw": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - for i, line in enumerate(lists): - data = json.loads(line) - datas[i % args.num_thread].append(data) - elif args.data_type == "wav_scp": - assert args.wav_scp is not None - assert args.text is not None - wavs, texts = {}, {} - # wavs - for line in read_lists(args.wav_scp): - line = line.strip().split() - wavs[line[0]] = line[1] - # texts - for line in read_lists(args.text): - line = line.strip().split(maxsplit=1) - texts[line[0]] = line[1] - sorted(wavs) - sorted(texts) - # partition - for i, (key1, key2) in enumerate(zip(wavs, texts)): - assert key1 == key2 - datas[i % args.num_thread].append( - {'key': key1, "wav": wavs[key1], "txt": texts[key1]} - ) - - logging.info("Stage-2: Start Analyze") - # threads - pool = multiprocessing.Pool(processes=args.num_thread) - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - pool.apply_async(analyze, (datas[i], output_file, i)) - pool.close() - 
pool.join() - - logging.info("Stage-3: Sort and Write Result") - datas = [] - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - with open(output_file, "r", encoding='utf8') as f: - for line in f.readlines(): - data = json.loads(line) - datas.append(data) - total_dur = sum([x['dur'] for x in datas]) - total_len = sum([x['txt_length'] for x in datas]) - total_leading_sil = sum([x['leading_sil'] for x in datas]) - total_trailing_sil = sum([x['trailing_sil'] for x in datas]) - num_datas = len(datas) - names = ['key', 'dur', 'txt_length', 'speed', - 'leading_sil', 'trailing_sil'] - units = ['', 's', '', 'char/s', 'ms', 'ms'] - avgs = [0, total_dur / num_datas, total_len / num_datas, - total_len / total_dur, total_leading_sil / num_datas, - total_trailing_sil / num_datas] - stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]), - sum([(x['txt_length'] - avgs[2])**2 for x in datas]), - sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]), - sum([(x['leading_sil'] - avgs[4])**2 for x in datas]), - sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])] - stds = [math.sqrt(x / num_datas) for x in stds] - parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min'] - index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - - with open(args.output_dir + "/analyze_result_brief", - "w", encoding='utf8') as f: - for i, (name, unit, avg, std) in enumerate( - zip(names, units, avgs, stds)): - if name == 'key': - continue - f.write("==================\n") - - datas.sort(key=lambda x: x[name]) - for p, j in zip(parts, index): - f.write("{} {}: {:.3f} {} (wav_id: {})\n".format( - p, name, datas[j][name], unit, datas[j]['key'])) - f.write("avg {}: {:.3f} {}\n".format( - name, avg, unit)) - f.write("std {}: {:.3f}\n".format( - name, std)) - os.system("cat {}".format(args.output_dir + "/analyze_result_brief")) - - datas.sort(key=lambda x: x['dur']) - with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f: - for data in datas: - f.write("{}\n".format(json.dumps(data, ensure_ascii=False))) - - end_time = time.time() - logging.info("Time Cost: {:.3f}s".format(end_time - start_time)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/cmvn_kaldi2json.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/cmvn_kaldi2json.py deleted file mode 100644 index 9966046c95a9d50438c4857b785cb7985182e376..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/cmvn_kaldi2json.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import logging -import sys -import json - -def kaldi2json(kaldi_cmvn_file): - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - cmvn_info = 
{'mean_stat:' : means, - 'var_stat' : variance, - 'frame_num' : count} - return cmvn_info - -if __name__ == '__main__': - with open(sys.argv[2], 'w') as fout: - cmvn = kaldi2json(sys.argv[1]) - fout.write(json.dumps(cmvn)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/combine_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/combine_data.sh deleted file mode 100644 index 8a56c43f1a2a238d78270f94f3d22f1af540e912..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/combine_data.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2014 David Snyder - -# This script combines the data from multiple source directories into -# a single destination directory. - -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information -# about what these directories contain. - -# Begin configuration section. -extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." -skip_fix=false # skip the fix_data_dir.sh in the end -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi - -if [ $# -lt 2 ]; then - echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." - echo "Note, files that don't appear in all source dirs will not be combined," - echo "with the exception of utt2uniq and segments, which are created where necessary." - exit 1 -fi - -dest=$1; -shift; - -first_src=$1; - -rm -r $dest 2>/dev/null -mkdir -p $dest; - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/utt2spk ]; then - echo "$0: no such file $dir/utt2spk" - exit 1; - fi -done - -# Check that frame_shift are compatible, where present together with features. -dir_with_frame_shift= -for dir in $*; do - if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then - if [[ $dir_with_frame_shift ]] && - ! cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then - echo "$0:error: different frame_shift in directories $dir and " \ - "$dir_with_frame_shift. Cannot combine features." - exit 1; - fi - dir_with_frame_shift=$dir - fi -done - -# W.r.t. utt2uniq file the script has different behavior compared to other files -# it is not compulsary for it to exist in src directories, but if it exists in -# even one it should exist in all. We will create the files where necessary -has_utt2uniq=false -for in_dir in $*; do - if [ -f $in_dir/utt2uniq ]; then - has_utt2uniq=true - break - fi -done - -if $has_utt2uniq; then - # we are going to create an utt2uniq file in the destdir - for in_dir in $*; do - if [ ! -f $in_dir/utt2uniq ]; then - # we assume that utt2uniq is a one to one mapping - cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' - else - cat $in_dir/utt2uniq - fi - done | sort -k1 > $dest/utt2uniq - echo "$0: combined utt2uniq" -else - echo "$0 [info]: not combining utt2uniq as it does not exist" -fi -# some of the old scripts might provide utt2uniq as an extrafile, so just remove it -extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") - -# segments are treated similarly to utt2uniq. If it exists in some, but not all -# src directories, then we generate segments where necessary. 
-has_segments=false -for in_dir in $*; do - if [ -f $in_dir/segments ]; then - has_segments=true - break - fi -done - -if $has_segments; then - for in_dir in $*; do - if [ ! -f $in_dir/segments ]; then - echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 - utils/data/get_segments_for_data.sh $in_dir - else - cat $in_dir/segments - fi - done | sort -k1 > $dest/segments - echo "$0: combined segments" -else - echo "$0 [info]: not combining segments as it does not exist" -fi - -for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do - exists_somewhere=false - absent_somewhere=false - for d in $*; do - if [ -f $d/$file ]; then - exists_somewhere=true - else - absent_somewhere=true - fi - done - - if ! $absent_somewhere; then - set -o pipefail - ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; - set +o pipefail - echo "$0: combined $file" - else - if ! $exists_somewhere; then - echo "$0 [info]: not combining $file as it does not exist" - else - echo "$0 [info]: **not combining $file as it does not exist everywhere**" - fi - fi -done - -tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt - -if [[ $dir_with_frame_shift ]]; then - cp $dir_with_frame_shift/frame_shift $dest -fi - -if ! $skip_fix ; then - tools/fix_data_dir.sh $dest || exit 1; -fi - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/compute-cer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/compute-cer.py deleted file mode 100644 index a0a8f8fe1f59251c5d8fefeb62ef469276fc6063..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/compute-cer.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import sys -import unicodedata -import codecs - -remove_tag = True -spacelist = [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - # https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': - sep = '>' - j = i + 1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c == sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: - return '' - chars = [] - i = 0 - T = len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - if x.isalnum(): - for k in x: - new_sentence.append(k) - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i - 1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j - 1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, - 'ins': 0, 'del': 0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = 
result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i={i} , j={j} , \ - error={error}'. - format(i=i, j=j, error=self.space[i][j]['error'])) - return result - - def overall(self) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def cluster(self, data) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [unicodedata.name(char) for char in word] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names) - 1) : - if unicode_names[i] != unicode_names[i + 1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) \ - and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] \ - [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \ - [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose = 1 - padding_symbol = ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose = 0 - try: - verbose = int(b) - except Exception: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol = ' ' - elif b == 'underline': - padding_symbol = '_' - continue - if True or sys.argv[1].startswith('-'): - # ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig = set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, - case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : 
- if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array) == 0: - continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length - len_lab) - space['rec'].append(length - len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end=' ') - else: - print('lab:', end=' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['lab'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end=' ') - else: - print('rec:', end=' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['rec'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===================================================' - '========================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster(k for k in default_clusters[cluster_id]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 
100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif (token[0] == '<' and token[len(token) - 1] == '>' and - cluster_id == ''): - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('=======================================' - '====================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/compute-wer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/compute-wer.py deleted file mode 100644 index a3eefc0dc7b67f252e685da71a5189312e74ef85..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/compute-wer.py +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import re, sys, unicodedata -import codecs - -remove_tag = True -spacelist= [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - #https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - 
j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array)==0: continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) 
- else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===========================================================================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster([ k for k in default_clusters[cluster_id] ]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - 
print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif token[0] == '<' and token[len(token)-1] == '>' and \ - cluster_id == '' : - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('===========================================================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/compute_cmvn_stats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/compute_cmvn_stats.py deleted file mode 100644 index 9c89789c47be0c855939469e86040f10398e9d89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/compute_cmvn_stats.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys -import argparse -import json -import codecs -import yaml - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.utils.data import Dataset, DataLoader - -torchaudio.set_audio_backend("sox_io") - - -class CollateFunc(object): - ''' Collate function for AudioDataset - ''' - - def __init__(self, feat_dim, resample_rate): - self.feat_dim = feat_dim - self.resample_rate = resample_rate - pass - - def __call__(self, batch): - mean_stat = torch.zeros(self.feat_dim) - var_stat = torch.zeros(self.feat_dim) - number = 0 - for item in batch: - value = item[1].strip().split(",") - assert len(value) == 3 or len(value) == 1 - wav_path = value[0] - sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate - resample_rate = sample_rate - # len(value) == 3 means segmented wav.scp, - # len(value) == 1 means original wav.scp - if len(value) == 3: - start_frame = int(float(value[1]) * sample_rate) - end_frame = int(float(value[2]) * sample_rate) - waveform, sample_rate = torchaudio.backend.sox_io_backend.load( - filepath=wav_path, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(item[1]) - - waveform = waveform * (1 << 15) - if self.resample_rate != 0 and self.resample_rate != sample_rate: - resample_rate = self.resample_rate - waveform = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - - mat = kaldi.fbank(waveform, - num_mel_bins=self.feat_dim, - dither=0.0, - energy_floor=0.0, - sample_frequency=resample_rate) - mean_stat += torch.sum(mat, axis=0) - var_stat += torch.sum(torch.square(mat), axis=0) - number += mat.shape[0] - return number, mean_stat, var_stat - - -class AudioDataset(Dataset): - def __init__(self, data_file): - self.items = [] - with codecs.open(data_file, 'r', encoding='utf-8') as f: - for line in f: - arr = line.strip().split() - self.items.append((arr[0], arr[1])) - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - return self.items[idx] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='extract CMVN stats') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for processing') - parser.add_argument('--train_config', - default='', - help='training yaml conf') - parser.add_argument('--in_scp', default=None, help='wav scp file') - 
parser.add_argument('--out_cmvn', - default='global_cmvn', - help='global cmvn file') - - doc = "Print log after every log_interval audios are processed." - parser.add_argument("--log_interval", type=int, default=1000, help=doc) - args = parser.parse_args() - - with open(args.train_config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - resample_rate = 0 - if 'resample_conf' in configs['dataset_conf']: - resample_rate = configs['dataset_conf']['resample_conf']['resample_rate'] - print('using resample and new sample rate is {}'.format(resample_rate)) - - collate_func = CollateFunc(feat_dim, resample_rate) - dataset = AudioDataset(args.in_scp) - batch_size = 20 - data_loader = DataLoader(dataset, - batch_size=batch_size, - shuffle=True, - sampler=None, - num_workers=args.num_workers, - collate_fn=collate_func) - - with torch.no_grad(): - all_number = 0 - all_mean_stat = torch.zeros(feat_dim) - all_var_stat = torch.zeros(feat_dim) - wav_number = 0 - for i, batch in enumerate(data_loader): - number, mean_stat, var_stat = batch - all_mean_stat += mean_stat - all_var_stat += var_stat - all_number += number - wav_number += batch_size - - if wav_number % args.log_interval == 0: - print(f'processed {wav_number} wavs, {all_number} frames', - file=sys.stderr, - flush=True) - - cmvn_info = { - 'mean_stat': list(all_mean_stat.tolist()), - 'var_stat': list(all_var_stat.tolist()), - 'frame_num': all_number - } - - with open(args.out_cmvn, 'w') as fout: - fout.write(json.dumps(cmvn_info)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/compute_fbank_feats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/compute_fbank_feats.py deleted file mode 100644 index 4cc7dae54de6e8b24b14148bd3930d19b4d7b28c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/compute_fbank_feats.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging - -import torchaudio -import torchaudio.compliance.kaldi as kaldi - -import wenet.dataset.kaldi_io as kaldi_io - -# The "sox" backends are deprecated and will be removed in 0.9.0 release. 
-# So here we use sox_io backend -torchaudio.set_audio_backend("sox_io") - - -def parse_opts(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--num_mel_bins', - default=80, - type=int, - help='Number of triangular mel-frequency bins') - parser.add_argument('--frame_length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument('--frame_shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument('--dither', - type=int, - default=0.0, - help='Dithering constant (0.0 means no dither)') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_scp', help='wav scp file') - parser.add_argument('out_ark', help='output ark file') - parser.add_argument('out_scp', help='output scp file') - args = parser.parse_args() - return args - - -# wav format: -def load_wav_scp(wav_scp_file): - wav_list = [] - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_list.append((arr[0], arr[1])) - return wav_list - - -# wav format: -def load_wav_scp_dict(wav_scp_file): - wav_dict = {} - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_dict[arr[0]] = arr[1] - return wav_dict - - -# Segments format: -def load_wav_segments(wav_scp_file, segments_file): - wav_dict = load_wav_scp_dict(wav_scp_file) - audio_list = [] - with open(segments_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - key = arr[0] - wav_file = wav_dict[arr[1]] - start = float(arr[2]) - end = float(arr[3]) - audio_list.append((key, wav_file, start, end)) - return audio_list - - -if __name__ == '__main__': - args = parse_opts() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - if args.segments is None: - audio_list = load_wav_scp(args.wav_scp) - else: - audio_list = load_wav_segments(args.wav_scp, args.segments) - - count = 0 - with open(args.out_ark, 'wb') as ark_fout, \ - open(args.out_scp, 'w', encoding='utf8') as scp_fout: - for item in audio_list: - if len(item) == 2: - key, wav_path = item - waveform, sample_rate = torchaudio.load_wav(wav_path) - else: - assert len(item) == 4 - key, wav_path, start, end = item - sample_rate = torchaudio.info(wav_path).sample_rate - frame_offset = int(start * sample_rate) - num_frames = int((end - start) * sample_rate) - waveform, sample_rate = torchaudio.load_wav( - wav_path, frame_offset, num_frames) - - mat = kaldi.fbank(waveform, - num_mel_bins=args.num_mel_bins, - frame_length=args.frame_length, - frame_shift=args.frame_shift, - dither=args.dither, - energy_floor=0.0, - sample_frequency=sample_rate) - mat = mat.detach().numpy() - kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout) - count += 1 - if count % 10000 == 0: - logging.info('Progress {}/{}'.format(count, len(audio_list))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/copy_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/copy_data_dir.sh deleted file mode 100644 index ee880c4c3ca398a58a4e306467c639b0a76310bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/copy_data_dir.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins University 
(author: Daniel Povey)
-# Apache 2.0
-
-# This script operates on a directory, such as in data/train/,
-# that contains some subset of the following files:
-# feats.scp
-# wav.scp
-# vad.scp
-# spk2utt
-# utt2spk
-# text
-#
-# It copies to another directory, possibly adding a specified prefix or a suffix
-# to the utterance and/or speaker names. Note, the recording-ids stay the same.
-#
-
-
-# begin configuration section
-spk_prefix=
-utt_prefix=
-spk_suffix=
-utt_suffix=
-validate_opts= # should rarely be needed.
-# end configuration section
-
-. utils/parse_options.sh
-
-if [ $# != 2 ]; then
-  echo "Usage: "
-  echo " $0 [options] <srcdir> <destdir>"
-  echo "e.g.:"
-  echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1"
-  echo "Options"
-  echo " --spk-prefix=<prefix> # Prefix for speaker ids, default empty"
-  echo " --utt-prefix=<prefix> # Prefix for utterance ids, default empty"
-  echo " --spk-suffix=<suffix> # Suffix for speaker ids, default empty"
-  echo " --utt-suffix=<suffix> # Suffix for utterance ids, default empty"
-  exit 1;
-fi
-
-
-export LC_ALL=C
-
-srcdir=$1
-destdir=$2
-
-if [ ! -f $srcdir/utt2spk ]; then
-  echo "copy_data_dir.sh: no such file $srcdir/utt2spk"
-  exit 1;
-fi
-
-if [ "$destdir" == "$srcdir" ]; then
-  echo "$0: this script requires <srcdir> and <destdir> to be different."
-  exit 1
-fi
-
-set -e;
-
-mkdir -p $destdir
-
-cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map
-cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map
-
-if [ ! -f $srcdir/utt2uniq ]; then
-  if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then
-    cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq
-  fi
-else
-  cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq
-fi
-
-cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \
-  utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk
-
-utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt
-
-if [ -f $srcdir/feats.scp ]; then
-  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp
-fi
-
-if [ -f $srcdir/vad.scp ]; then
-  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp
-fi
-
-if [ -f $srcdir/segments ]; then
-  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments
-  cp $srcdir/wav.scp $destdir
-else # no segments->wav indexed by utt.
- if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/utt2num_frames ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in frame_shift stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -echo $validate_opts -echo $destdir -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/decode.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/decode.sh deleted file mode 100644 index 1d49b0e48631f4818fb9c464df66904170275a33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/decode.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: binbinzhang@mobvoi.com (Binbin Zhang) -export GLOG_logtostderr=1 -export GLOG_v=2 - -set -e - -nj=1 -chunk_size=-1 -ctc_weight=0.0 -reverse_weight=0.0 -rescoring_weight=1.0 -# For CTC WFST based decoding -fst_path= -dict_path= -acoustic_scale=1.0 -beam=15.0 -lattice_beam=12.0 -min_active=200 -max_active=7000 -blank_skip_thresh=1.0 -length_penalty=0.0 - -. tools/parse_options.sh || exit 1; -if [ $# != 5 ]; then - echo "Usage: $0 [options] " - exit 1; -fi - -if ! which decoder_main > /dev/null; then - echo "decoder_main is not built, please go to runtime/libtorch to build it." - exit 1; -fi - -scp=$1 -label_file=$2 -model_file=$3 -unit_file=$4 -dir=$5 - -mkdir -p $dir/split${nj} - -# Step 1. Split wav.scp -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp" -done -tools/data/split_scp.pl ${scp} ${split_scps} - -# Step 2. Parallel decoding -wfst_decode_opts= -if [ ! 
-z $fst_path ]; then - wfst_decode_opts="--fst_path $fst_path" - wfst_decode_opts="$wfst_decode_opts --beam $beam" - wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path" - wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam" - wfst_decode_opts="$wfst_decode_opts --max_active $max_active" - wfst_decode_opts="$wfst_decode_opts --min_active $min_active" - wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale" - wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh" - wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty" - echo $wfst_decode_opts > $dir/config -fi -for n in $(seq ${nj}); do -{ - decoder_main \ - --rescoring_weight $rescoring_weight \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --chunk_size $chunk_size \ - --wav_scp ${dir}/split${nj}/wav.${n}.scp \ - --model_path $model_file \ - --unit_path $unit_file \ - $wfst_decode_opts \ - --result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log -} & -done -wait - -# Step 3. Merge files -for n in $(seq ${nj}); do - cat ${dir}/split${nj}/${n}.text -done > ${dir}/text -tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf - -# Step 4. Compute WER -python3 tools/compute-wer.py --char=1 --v=1 \ - $label_file $dir/text > $dir/wer diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/feat_to_shape.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/feat_to_shape.sh deleted file mode 100644 index ab6d45c60709dd05a38f8da269d617233d0d39f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/feat_to_shape.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Begin configuration section. -nj=4 -cmd=run.pl -verbose=0 -filetype="" -preprocess_conf="" -# End configuration section. - -help_message=$(cat << EOF -Usage: $0 [options] [] -e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) - -echo "$0 $*" 1>&2 # Print the command line for logging - -. parse_options.sh || exit 1; - -if [ $# -lt 2 ] || [ $# -gt 3 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -scp=$1 -outscp=$2 -data=$(dirname ${scp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${logdir}/feats.${n}.scp" -done - -utils/split_scp.pl ${scp} ${split_scps} - -if [ -n "${preprocess_conf}" ]; then - preprocess_opt="--preprocess-conf ${preprocess_conf}" -else - preprocess_opt="" -fi -if [ -n "${filetype}" ]; then - filetype_opt="--filetype ${filetype}" -else - filetype_opt="" -fi - -${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ - feat-to-len --verbose=${verbose} \ - scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp - -feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -) - -# concatenate the .scp files together. 
-for n in $(seq ${nj}); do - sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp -done > ${outscp} - -rm -f ${logdir}/feats.*.scp 2>/dev/null diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fix_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fix_data_dir.sh deleted file mode 100644 index d1644c1cac4264c78eae7d91b03c4126baf7ec4c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fix_data_dir.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# This script makes sure that only the segments present in -# all of "feats.scp", "wav.scp" [if present], segments [if present] -# text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into -# data-dir/.backup - -cmd="$@" - -utt_extra_files= -spk_extra_files= - -. tools/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: utils/data/fix_data_dir.sh " - echo "e.g.: utils/data/fix_data_dir.sh data/train" - echo "This script helps ensure that the various files in a data directory" - echo "are correctly sorted and filtered, for example removing utterances" - echo "that have no features (if feats.scp is present)" - exit 1 -fi - -data=$1 - -if [ -f $data/images.scp ]; then - image/fix_data_dir.sh $cmd - exit $? -fi - -mkdir -p $data/.backup - -[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; - -[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; - -set -e -o pipefail -u - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted { - file=$1 - sort -k1,1 -u <$file >$file.tmp - if ! cmp -s $file $file.tmp; then - echo "$0: file $1 is not in sorted order or not unique, sorting it" - mv $file.tmp $file - else - rm $file.tmp - fi -} - -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - check_sorted $data/$x - fi -done - - -function filter_file { - filter=$1 - file_to_filter=$2 - cp $file_to_filter ${file_to_filter}.tmp - tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter - if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=$(cat ${file_to_filter}.tmp | wc -l) - length2=$(cat ${file_to_filter} | wc -l) - if [ $length1 -ne $length2 ]; then - echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." - fi - fi - rm $file_to_filter.tmp -} - -function filter_recordings { - # We call this once before the stage when we filter on utterance-id, and once - # after. - - if [ -f $data/segments ]; then - # We have a segments file -> we need to filter this and the file wav.scp, and - # reco2file_and_utt, if it exists, to make sure they have the same list of - # recording-ids. - - if [ ! -f $data/wav.scp ]; then - echo "$0: $data/segments exists but not $data/wav.scp" - exit 1; - fi - awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=$(cat $tmpdir/recordings | wc -l) - [ ! -s $tmpdir/recordings ] && \ - echo "Empty list of recordings (bad file $data/segments)?" 
&& exit 1; - tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp - mv $tmpdir/recordings.tmp $tmpdir/recordings - - - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - filter_file $tmpdir/recordings $data/segments - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - rm $data/segments.tmp - - filter_file $tmpdir/recordings $data/wav.scp - [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur - true - fi -} - -function filter_speakers { - # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... - tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do - f=$data/$s - if [ -f $f ]; then - filter_file $f $tmpdir/speakers - fi - done - - filter_file $tmpdir/speakers $data/spk2utt - tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - - for s in cmvn.scp spk2gender $spk_extra_files; do - f=$data/$s - if [ -f $f ]; then - filter_file $tmpdir/speakers $f - fi - done -} - -function filter_utts { - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - echo "$(cat $tmpdir/utts | wc -l)" - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; - - ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; - - ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ - echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - - if [ -f $data/utt2uniq ]; then - ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ - echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; - fi - - maybe_wav= - maybe_reco2dur= - [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. - [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - - maybe_utt2dur= - if [ -f $data/utt2dur ]; then - cat $data/utt2dur | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1 - maybe_utt2dur=utt2dur.ok - fi - - maybe_utt2num_frames= - if [ -f $data/utt2num_frames ]; then - cat $data/utt2num_frames | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1 - maybe_utt2num_frames=utt2num_frames.ok - fi - - for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do - if [ -f $data/$x ]; then - tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp - echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)" - mv $tmpdir/utts.tmp $tmpdir/utts - # echo "$tmpdir/utts" - fi - done - rm $data/utt2dur.ok 2>/dev/null || true - rm $data/utt2num_frames.ok 2>/dev/null || true - - [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ - rm $tmpdir/utts && exit 1; - - - if [ -f $data/utt2spk ]; then - new_nutts=$(cat $tmpdir/utts | wc -l) - old_nutts=$(cat $data/utt2spk | wc -l) - if [ $new_nutts -ne $old_nutts ]; then - echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" - else - echo "fix_data_dir.sh: kept all $old_nutts utterances." 
- fi - fi - - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then - tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x - fi - fi - done - -} - -filter_recordings -filter_speakers -filter_utts -filter_speakers -filter_recordings - -tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - -echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/flake8_hook.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/flake8_hook.py deleted file mode 100644 index bbe21bf4aa8ab460aca0eba5a24785e4d6b2c39d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -import sys - -from flake8.main import git - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/format_data.sh deleted file mode 100644 index 51f4602dfa0bac7873541c7f621ef4bb9eb29c94..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/format_data.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Mobvoi Corporation (Author: Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -echo "$0 $*" >&2 # Print the command line for logging -. ./path.sh - -nj=1 -cmd=run.pl -nlsyms="" -lang="" -feat="" -feat_type="kaldi" -oov="" -bpecode="" -allow_one_column=false -raw="" -verbose=0 -trans_type=char -filetype="" -preprocess_conf="" -category="" -out="" # If omitted, write in stdout -help_message=$(cat << EOF -Usage: $0 -e.g. $0 data/train data/lang_1char/train_units.txt -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --feat # feat.scp or feat1.scp,feat2.scp,... - --feat-type # kaldi or wav - --oov # Default: - --out # If omitted, write in stdout - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) -. tools/parse_options.sh - -if [ $# != 2 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -dir=$1 -dic=$2 -tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) -#trap 'rm -rf ${tmpdir}' EXIT - -# 1. 
Create scp files for inputs -# These are not necessary for decoding mode, and make it as an option -input= -if [ -n "${feat}" ]; then - _feat_scps=$(echo "${feat}" | tr ',' ' ' ) - read -r -a feat_scps <<< $_feat_scps - num_feats=${#feat_scps[@]} - - for (( i=1; i<=num_feats; i++ )); do - feat=${feat_scps[$((i-1))]} - mkdir -p ${tmpdir}/input_${i} - input+="input_${i} " - cat ${feat} > ${tmpdir}/input_${i}/feat.scp - - # Dump in the "legacy" style JSON format - if [ -n "${filetype}" ]; then - awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ - > ${tmpdir}/input_${i}/filetype.scp - fi - - if [ ${feat_type} == "kaldi" ]; then - tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ - --filetype "${filetype}" \ - --preprocess-conf "${preprocess_conf}" \ - --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp - elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then - if [ -f $dir/segments ]; then - # used for segmented wav.scp - awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur - fi - if [ ! -f $dir/utt2dur ]; then - tools/wav_to_duration.sh --nj ${nj} \ - ${feat} ${tmpdir}/input_${i}/shape.scp - # use the existed utt2dur as shape.scp directly - else - cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp - fi - fi - done -fi - -# 2. Create scp files for outputs -mkdir -p ${tmpdir}/output -if [ -n "${bpecode}" ]; then - if [ "${trans_type}" == "cn_char_en_bpe" ]; then - tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp - else - paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \ - | tools/spm_encode --model=${bpecode} --output_format=piece) \ - > ${tmpdir}/output/token.scp - fi -elif [ -n "${nlsyms}" ]; then - tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -elif [ -n "${raw}" ]; then - cat $dir/text > ${tmpdir}/output/token.scp -else - tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -fi -< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp -odim=$(cat ${dic} | wc -l) -< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp - -cat ${dir}/text > ${tmpdir}/output/text.scp - -# 3. Create scp files for the others -mkdir -p ${tmpdir}/other -if [ -n "${lang}" ]; then - awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp -fi - -if [ -n "${category}" ]; then - awk -v category=${category} '{print $1 " " category}' ${dir}/text \ - > ${tmpdir}/other/category.scp -fi -#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp - -# 4. 
Merge scp files into a one file -opts="" -for intype in ${input} output other; do - if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then - continue - fi - - if [ ${intype} != other ]; then - opts+="--${intype%_*}-scps " - else - opts+="--scps " - fi - - for x in "${tmpdir}/${intype}"/*.scp; do - k=$(basename ${x} .scp) - if [ ${k} = shape ]; then - opts+="shape:${x}:shape " - else - opts+="${k}:${x} " - fi - done -done - -if ${allow_one_column}; then - opts+="--allow-one-column true " -else - opts+="--allow-one-column false " -fi - -if [ -n "${out}" ]; then - opts+="-O ${out}" -fi - -tools/merge_scp2txt.py --verbose ${verbose} ${opts} - -#rm -fr ${tmpdir} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/add_lex_disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/add_lex_disambig.pl deleted file mode 100644 index dd8a25de6e1140a6d19b1e876f2e76f528532edf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/add_lex_disambig.pl +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2013-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. -# With the --pron-probs option, expects the second field -# of each lexicon line to be a pron-prob. -# With the --sil-probs option, expects three additional -# fields after the pron-prob, representing various components -# of the silence probability model. - -$pron_probs = 0; -$sil_probs = 0; -$first_allowed_disambig = 1; - -for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { - if ($ARGV[0] eq "--pron-probs") { - $pron_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--sil-probs") { - $sil_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--first-allowed-disambig") { - $first_allowed_disambig = 0 + $ARGV[1]; - if ($first_allowed_disambig < 1) { - die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; - } - shift @ARGV; - shift @ARGV; - } -} - -if (@ARGV != 2) { - die "Usage: add_lex_disambig.pl [opts] \n" . - "This script adds disambiguation symbols to a lexicon in order to\n" . - "make decoding graphs determinizable; it adds pseudo-phone\n" . - "disambiguation symbols #1, #2 and so on at the ends of phones\n" . - "to ensure that all pronunciations are different, and that none\n" . - "is a prefix of another.\n" . - "It prints to the standard output the number of the largest-numbered" . - "disambiguation symbol that was used.\n" . - "\n" . - "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . 
- " --sil-probs [should be with --pron-probs option]\n" . - " Expect 3 extra fields after the pron-probs, for aspects of\n" . - " the silence probability model\n" . - " --first-allowed-disambig The number of the first disambiguation symbol\n" . - " that this script is allowed to add. By default this is\n" . - " #1, but you can set this to a larger value using this option.\n" . - "e.g.:\n" . - " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { - $p = shift @A; - if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } - } - if ($sil_probs) { - $silp = shift @A; - if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - } - if (!(@A)) { - die "Bad lexicon line $1, no phone in phone list"; - } - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that it exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { shift @A; } # remove pron-prob. - if ($sil_probs) { - shift @A; # Remove silprob - shift @A; # Remove silprob - } - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -# max_disambig will always be the highest-numbered disambiguation symbol that -# has been used so far. -$max_disambig = $first_allowed_disambig - 1; - -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - if ($pron_probs) { - $pron_prob = shift @A; - } - if ($sil_probs) { - $sil_word_prob = shift @A; - $word_sil_correction = shift @A; - $prev_nonsil_correction = shift @A - } - $phnseq = join(" ", @A); - if (!defined $issubseq{$phnseq} - && $count{$phnseq} == 1) { - ; # Do nothing. - } else { - if ($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved_for_the_empty_string{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; - if (!defined $cur_disambig) { - $cur_disambig = $first_allowed_disambig; - } else { - $cur_disambig++; # Get a number that has not been used yet for - # this phone sequence. - } - while (defined $reserved_for_the_empty_string{$cur_disambig}) { - $cur_disambig++; - } - if ($cur_disambig > $max_disambig) { - $max_disambig = $cur_disambig; - } - $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; - $phnseq = $phnseq . " #" . 
$cur_disambig; - } - } - if ($pron_probs) { - if ($sil_probs) { - print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; - } else { - print O "$word\t$pron_prob\t$phnseq\n"; - } - } else { - print O "$word\t$phnseq\n"; - } -} - -print $max_disambig . "\n"; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/compile_lexicon_token_fst.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/compile_lexicon_token_fst.sh deleted file mode 100644 index b67814fe3f3244b14b8e494bfe46c4829c4f8bd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/compile_lexicon_token_fst.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -# Copyright 2015 Yajie Miao (Carnegie Mellon University) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the -# phoneme and character-based lexicons. -set -eo pipefail -. tools/parse_options.sh - -if [ $# -ne 3 ]; then - echo "usage: tools/fst/compile_lexicon_token_fst.sh " - echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" - echo " should contain the following files:" - echo "lexicon.txt units.txt" - echo "options: " - exit 1; -fi - -srcdir=$1 -tmpdir=$2 -dir=$3 -mkdir -p $dir $tmpdir - -[ -f path.sh ] && . ./path.sh - -export LC_ALL=C - -cp $srcdir/units.txt $dir - -# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. -# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. -perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; - -# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. -# Without these symbols, determinization will fail. -ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` -ndisambig=$[$ndisambig+1]; - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list - -# Get the full list of CTC tokens used in FST. These tokens include , the blank , -# the actual model unit, and the disambiguation symbols. -cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list -(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt - -# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, -# so here use ctc_token_fst_compact -tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; - -# Encode the words with indices. 
Will be used in lexicon and language model FST compiling.
-cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk '
-  BEGIN {
-    print "<eps> 0";
-  }
-  {
-    printf("%s %d\n", $1, NR);
-  }
-  END {
-    printf("#0 %d\n", NR+1);
-    printf("<s> %d\n", NR+2);
-    printf("</s> %d\n", NR+3);
-  }' > $dir/words.txt || exit 1;
-
-# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time.
-token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'`
-word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'`
-
-tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \
-  fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \
-  --keep_isymbols=false --keep_osymbols=false | \
-  fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \
-  fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
-
-echo "Lexicon and token FSTs compiling succeeded"
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/ctc_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/ctc_token_fst.py
deleted file mode 100644
index d81644b9cd216177a10a17772781d3293abe084f..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/ctc_token_fst.py
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-print('0 1 <eps> <eps>')
-print('1 1 <blank> <eps>')
-print('2 2 <blank> <eps>')
-print('2 0 <eps> <eps>')
-
-with open(sys.argv[1], 'r', encoding='utf8') as fin:
-    node = 3
-    for entry in fin:
-        fields = entry.strip().split(' ')
-        phone = fields[0]
-        if phone == '<eps>' or phone == '<blank>':
-            continue
-        elif '#' in phone:  # disambiguous phone
-            print('{} {} {} {}'.format(0, 0, '<eps>', phone))
-        else:
-            print('{} {} {} {}'.format(1, node, phone, phone))
-            print('{} {} {} {}'.format(node, node, phone, '<eps>'))
-            print('{} {} {} {}'.format(node, 2, '<eps>', '<eps>'))
-        node += 1
-print('0')
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/ctc_token_fst_compact.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/ctc_token_fst_compact.py
deleted file mode 100644
index d3018d8b14ce25108cb1acc637cecded5d41be13..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/ctc_token_fst_compact.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-print('0 0 <blank> <eps>')
-
-with open(sys.argv[1], 'r', encoding='utf8') as fin:
-    node = 1
-    for entry in fin:
-        fields = entry.strip().split(' ')
-        phone = fields[0]
-        if phone == '<eps>' or phone == '<blank>':
-            continue
-        elif '#' in phone:  # disambiguous phone
-            print('{} {} {} {}'.format(0, 0, '<eps>', phone))
-        else:
-            print('{} {} {} {}'.format(0, node, phone, phone))
-            print('{} {} {} {}'.format(node, node, phone, '<eps>'))
-            print('{} {} {} {}'.format(node, 0, '<eps>', '<eps>'))
-        node += 1
-print('0')
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/ctc_token_fst_corrected.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/ctc_token_fst_corrected.py
deleted file mode 100644
index 81f7079eccb9e6447c46cdfdf6378aca7efe4a09..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/ctc_token_fst_corrected.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-
-def il(n):
return n + 1 - - -def ol(n): - return n + 1 - - -def s(n): - return n - - -if __name__ == "__main__": - with open(sys.argv[1]) as f: - lines = f.readlines() - phone_count = 0 - disambig_count = 0 - for line in lines: - sp = line.split() - phone = sp[0] - if phone == '' or phone == '': - continue - if phone.startswith('#'): - disambig_count += 1 - else: - phone_count += 1 - - # 1. add start state - print('0 0 {} 0'.format(il(0))) - - # 2. 0 -> i, i -> i, i -> 0 - for i in range(1, phone_count + 1): - print('0 {} {} {}'.format(s(i), il(i), ol(i))) - print('{} {} {} 0'.format(s(i), s(i), il(i))) - print('{} 0 {} 0'.format(s(i), il(0))) - - # 3. i -> other phone - for i in range(1, phone_count + 1): - for j in range(1, phone_count + 1): - if i != j: - print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) - - # 4. add disambiguous arcs on every final state - for i in range(0, phone_count + 1): - for j in range(phone_count + 2, phone_count + disambig_count + 2): - print('{} {} {} {}'.format(s(i), s(i), 0, j)) - - # 5. every i is final state - for i in range(0, phone_count + 1): - print(s(i)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/eps2disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/eps2disambig.pl deleted file mode 100644 index e1d84a6bf56703596a0e4552d184f7168f724bcb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/eps2disambig.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - if (/\s+#0\s+/) { - print STDERR "$0: ERROR: LM has word #0, " . - "which is reserved as disambiguation symbol\n"; - exit 1; - } - s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; - print; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/make_lexicon_fst.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/make_lexicon_fst.pl deleted file mode 100644 index f97129c05cb3ba6460be401e92001261acfaf746..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/make_lexicon_fst.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). - -$pron_probs = 0; - -if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { - $pron_probs = 1; - shift @ARGV; -} - -if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; - print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; - print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; - print STDERR " word phone1 phone2 ... phoneN;\n"; - print STDERR "if the --pron-probs option is used, each line is:\n"; - print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; - print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; - print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; - print STDERR "this is your responsibility.\n\n"; - print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; - print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; - print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; - exit(1); -} - -$lexfn = shift @ARGV; -if (@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2) { - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if ($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - -if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. - while () { - @A = split(" ", $_); - @A == 0 && die "Empty lexicon line."; - foreach $a (@A) { - if ($a eq "") { - die "Bad lexicon line $_ ( is forbidden)"; - } - } - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; # so we only print it on the first arc of the word. - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. 
- if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while () { - @A = split(" ", $_); - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. - $s = $ns; - } elsif (!defined($silphone) || $p ne $silphone) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/make_tlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/make_tlg.sh deleted file mode 100644 index 98694e5540968760f0c27eaf30a6668f4c46c50d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/make_tlg.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# - -if [ -f path.sh ]; then . path.sh; fi - -lm_dir=$1 -src_lang=$2 -tgt_lang=$3 - -arpa_lm=${lm_dir}/lm.arpa -[ ! 
-f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -rm -rf $tgt_lang -cp -r $src_lang $tgt_lang - -# Compose the language model to FST -cat $arpa_lm | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v -i '' | \ - grep -v -i '' | \ - arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \ - tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \ - --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst - - -echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic $tgt_lang/G.fst - -# Compose the token, lexicon and language-model FST into the final decoding graph -fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1; -fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1; - -echo "Composing decoding graph TLG.fst succeeded" -#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/prepare_dict.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/prepare_dict.py deleted file mode 100644 index 8a6a3cfe7cfded0c863637deef0bae2f2ede5557..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/prepare_dict.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -# sys.argv[1]: e2e model unit file(lang_char.txt) -# sys.argv[2]: raw lexicon file -# sys.argv[3]: output lexicon file -# sys.argv[4]: bpemodel - -unit_table = set() -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for line in fin: - unit = line.split()[0] - unit_table.add(unit) - - -def contain_oov(units): - for unit in units: - if unit not in unit_table: - return True - return False - - -bpemode = len(sys.argv) > 4 -if bpemode: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.Load(sys.argv[4]) -lexicon_table = set() -with open(sys.argv[2], 'r', encoding='utf8') as fin, \ - open(sys.argv[3], 'w', encoding='utf8') as fout: - for line in fin: - word = line.split()[0] - if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel - continue - elif word == '': - continue - else: - # each word only has one pronunciation for e2e system - if word in lexicon_table: - continue - if bpemode: - # We assume that the lexicon does not contain code-switch, - # i.e. the word contains both English and Chinese. - # see PR https://github.com/wenet-e2e/wenet/pull/1693 - # and Issue https://github.com/wenet-e2e/wenet/issues/1653 - if word.encode('utf8').isalpha(): - pieces = sp.EncodeAsPieces(word) - else: - pieces = word - if contain_oov(pieces): - print( - 'Ignoring words {}, which contains oov unit'.format( - ''.join(word).strip('▁')) - ) - continue - chars = ' '.join( - [p if p in unit_table else '' for p in pieces]) - else: - # ignore words with OOV - if contain_oov(word): - print('Ignoring words {}, which contains oov unit'.format(word)) - continue - # Optional, append ▁ in front of english word - # we assume the model unit of our e2e system is char now. 
-                if word.encode('utf8').isalpha() and '▁' in unit_table:
-                    word = '▁' + word
-                chars = ' '.join(word)  # word is a char list
-            fout.write('{} {}\n'.format(word, chars))
-            lexicon_table.add(word)
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/remove_oovs.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/remove_oovs.pl
deleted file mode 100644
index ac914c3bd9363eded791cdeb309fd05e980c4f2e..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/remove_oovs.pl
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/usr/bin/env perl
-# Copyright 2010-2011 Microsoft Corporation
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-# This script removes lines that contain these OOVs on either the
-# third or fourth fields of the line. It is intended to remove arcs
-# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in).
-
-if ( @ARGV < 1 && @ARGV > 2) {
-  die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n";
-}
-
-$unklist = shift @ARGV;
-open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n";
-while(<S>){
-  @A = split(" ", $_);
-  @A == 1 || die "Bad line in unknown-symbol list: $_";
-  $unk{$A[0]} = 1;
-}
-
-$num_removed = 0;
-while(<>){
-  @A = split(" ", $_);
-  if(defined $unk{$A[2]} || defined $unk{$A[3]}) {
-    $num_removed++;
-  } else {
-    print;
-  }
-}
-print STDERR "remove_oovs.pl: removed $num_removed lines.\n";
-
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/rnnt_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/rnnt_token_fst.py
deleted file mode 100644
index cc6def1703311ab700a4a01f22c1adda32db9b0d..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/rnnt_token_fst.py
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-print('0 0 <blank> <eps>')
-
-with open(sys.argv[1], 'r', encoding='utf8') as fin:
-    for entry in fin:
-        fields = entry.strip().split(' ')
-        phone = fields[0]
-        if phone == '<eps>' or phone == '<blank>':
-            continue
-        elif '#' in phone:  # disambiguous phone
-            print('{} {} {} {}'.format(0, 0, '<eps>', phone))
-        else:
-            print('{} {} {} {}'.format(0, 0, phone, phone))
-print('0')
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/s2eps.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/s2eps.pl
deleted file mode 100644
index ffeeb8eb6af3c4f319f31ebff80be388d8f59e1a..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/fst/s2eps.pl
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env perl
-# Copyright 2010-2011 Microsoft Corporation
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this
file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces and with (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } - if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } - } - print join("\t", @A) . "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/git-pre-commit b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/git-pre-commit deleted file mode 100644 index b6e448ed375a0ddf502ce332685de8a99e88dc08..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/git-pre-commit +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -e - -echo "Running pre-commit flake8" -python tools/flake8_hook.py diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/install_srilm.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/install_srilm.sh deleted file mode 100644 index 4aa113c14722a73fd3d3f84430025d44173c207b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/install_srilm.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2022 Binbin Zhang(binbzha@qq.com) - -current_path=`pwd` -current_dir=`basename "$current_path"` - -if [ "tools" != "$current_dir" ]; then - echo "You should run this script in tools/ directory!!" - exit 1 -fi - -! command -v gawk > /dev/null && \ - echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; - -srilm_url="https://github.com/BitSpeech/SRILM/archive/refs/tags/1.7.3.tar.gz" - -if [ ! -f ./srilm.tar.gz ]; then - if ! wget -O ./srilm.tar.gz "$srilm_url"; then - echo 'There was a problem downloading the file.' - echo 'Check you internet connection and try again.' - exit 1 - fi -fi - -tar -zxvf srilm.tar.gz -mv SRILM-1.7.3 srilm - -# set the SRILM variable in the top-level Makefile to this directory. -cd srilm -cp Makefile tmpf - -cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \ - > Makefile || exit 1 -rm tmpf - -make || exit -cd .. - -( - [ ! -z "${SRILM}" ] && \ - echo >&2 "SRILM variable is aleady defined. Undefining..." && \ - unset SRILM - - [ -f ./env.sh ] && . ./env.sh - - [ ! 
-z "${SRILM}" ] && \ - echo >&2 "SRILM config is already in env.sh" && exit - - wd=`pwd` - wd=`readlink -f $wd || pwd` - - echo "export SRILM=$wd/srilm" - dirs="\${PATH}" - for directory in $(cd srilm && find bin -type d ) ; do - dirs="$dirs:\${SRILM}/$directory" - done - echo "export PATH=$dirs" -) >> env.sh - -echo >&2 "Installation of SRILM finished successfully" -echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/k2/make_hlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/k2/make_hlg.sh deleted file mode 100644 index 18c2268487410824ae11b199cf06f37acd717c88..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/k2/make_hlg.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) - -lexion_dir=$1 -lm_dir=$2 -tgt_dir=$3 - -# k2 and icefall updates very fast. Below commits are veryfied in this script. -# k2 3dc222f981b9fdbc8061b3782c3b385514a2d444, icefall 499ac24ecba64f687ff244c7d66baa5c222ecf0f - -# For k2 installation, please refer to https://github.com/k2-fsa/k2/ -python -c "import k2; print(k2.__file__)" -python -c "import torch; import _k2; print(_k2.__file__)" - -# Prepare necessary icefall scripts -if [ ! -d tools/k2/icefall ]; then - git clone --depth 1 https://github.com/k2-fsa/icefall.git tools/k2/icefall -fi -pip3 install -r tools/k2/icefall/requirements.txt -export PYTHONPATH=`pwd`/tools/k2/icefall:`pwd`/tools/k2/icefall/egs/aishell/ASR/local:$PYTHONPATH - -# 8.1 Prepare char based lang -mkdir -p $tgt_dir -python tools/k2/prepare_char.py $lexion_dir/units.txt $lm_dir/wordlist $tgt_dir -echo "Compile lexicon L.pt L_disambig.pt succeeded" - -# 8.2 Prepare G -mkdir -p data/lm -python -m kaldilm \ - --read-symbol-table="$tgt_dir/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $lm_dir/lm.arpa > data/lm/G_3_gram.fst.txt - -# 8.3 Compile HLG -python tools/k2/icefall/egs/aishell/ASR/local/compile_hlg.py --lang-dir $tgt_dir -echo "Compile decoding graph HLG.pt succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/k2/prepare_char.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/k2/prepare_char.py deleted file mode 100644 index 6e05042c42eb280135f6be7cdb3566b185258b90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/k2/prepare_char.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -""" - -This script generates the following files in the directory sys.argv[3]: - - - lexicon.txt - - lexicon_disambig.txt - - L.pt - - L_disambig.pt - - tokens.txt - - words.txt -""" - -import sys -from pathlib import Path -from typing import Dict, List - -import k2 -import torch -from prepare_lang import ( - Lexicon, - add_disambig_symbols, - add_self_loops, - write_lexicon, - write_mapping, -) - - -def lexicon_to_fst_no_sil( - lexicon: Lexicon, - token2id: Dict[str, int], - word2id: Dict[str, int], - need_self_loops: bool = False, -) -> k2.Fsa: - """Convert a lexicon to an FST (in k2 format). - - Args: - lexicon: - The input lexicon. See also :func:`read_lexicon` - token2id: - A dict mapping tokens to IDs. - word2id: - A dict mapping words to IDs. - need_self_loops: - If True, add self-loop to states with non-epsilon output symbols - on at least one arc out of the state. The input label for this - self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. - Returns: - Return an instance of `k2.Fsa` representing the given lexicon. - """ - loop_state = 0 # words enter and leave from here - next_state = 1 # the next un-allocated state, will be incremented as we go - - arcs = [] - - # The blank symbol is defined in local/train_bpe_model.py - assert token2id[""] == 0 - assert word2id[""] == 0 - - eps = 0 - - for word, pieces in lexicon: - assert len(pieces) > 0, f"{word} has no pronunciations" - cur_state = loop_state - - word = word2id[word] - pieces = [ - token2id[i] if i in token2id else token2id[""] for i in pieces - ] - - for i in range(len(pieces) - 1): - w = word if i == 0 else eps - arcs.append([cur_state, next_state, pieces[i], w, 0]) - - cur_state = next_state - next_state += 1 - - # now for the last piece of this word - i = len(pieces) - 1 - w = word if i == 0 else eps - arcs.append([cur_state, loop_state, pieces[i], w, 0]) - - if need_self_loops: - disambig_token = token2id["#0"] - disambig_word = word2id["#0"] - arcs = add_self_loops( - arcs, - disambig_token=disambig_token, - disambig_word=disambig_word, - ) - - final_state = next_state - arcs.append([loop_state, final_state, -1, -1, 0]) - arcs.append([final_state]) - - arcs = sorted(arcs, key=lambda arc: arc[0]) - arcs = [[str(i) for i in arc] for arc in arcs] - arcs = [" ".join(arc) for arc in arcs] - arcs = "\n".join(arcs) - - fsa = k2.Fsa.from_str(arcs, acceptor=False) - return fsa - - -def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool: - """Check if all the given tokens are in token symbol table. - - Args: - token_sym_table: - Token symbol table that contains all the valid tokens. - tokens: - A list of tokens. - Returns: - Return True if there is any token not in the token_sym_table, - otherwise False. - """ - for tok in tokens: - if tok not in token_sym_table: - return True - return False - - -def generate_lexicon( - token_sym_table: Dict[str, int], words: List[str] -) -> Lexicon: - """Generate a lexicon from a word list and token_sym_table. - - Args: - token_sym_table: - Token symbol table that mapping token to token ids. - words: - A list of strings representing words. - Returns: - Return a dict whose keys are words and values are the corresponding - tokens. 
- """ - lexicon = [] - for word in words: - chars = list(word.strip(" \t")) - if contain_oov(token_sym_table, chars): - continue - lexicon.append((word, chars)) - - # The OOV word is - lexicon.append(("", [""])) - return lexicon - - -def generate_tokens(text_file: str) -> Dict[str, int]: - """Generate tokens from the given text file. - - Args: - text_file: - A file that contains text lines to generate tokens. - Returns: - Return a dict whose keys are tokens and values are token ids ranged - from 0 to len(keys) - 1. - """ - token2id: Dict[str, int] = dict() - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - char, index = line.replace('\n', '').split() - assert char not in token2id - token2id[char] = int(index) - assert token2id[''] == 0 - return token2id - - -def generate_words(text_file: str) -> Dict[str, int]: - """Generate words from the given text file. - - Args: - text_file: - A file that contains text lines to generate words. - Returns: - Return a dict whose keys are words and values are words ids ranged - from 0 to len(keys) - 1. - """ - words = [] - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - word = line.replace('\n', '') - assert word not in words - words.append(word) - words.sort() - - # We put '' '' at begining of word2id - # '#0', '', '' at end of word2id - words = [word for word in words - if word not in ['', '', '#0', '', '']] - words.insert(0, '') - words.insert(1, '') - words.append('#0') - words.append('') - words.append('') - word2id = {j: i for i, j in enumerate(words)} - return word2id - - -def main(): - token2id = generate_tokens(sys.argv[1]) - word2id = generate_words(sys.argv[2]) - tgt_dir = Path(sys.argv[3]) - - words = [word for word in word2id.keys() - if word not in - ["", "!SIL", "", "", "#0", "", ""]] - lexicon = generate_lexicon(token2id, words) - - lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) - next_token_id = max(token2id.values()) + 1 - for i in range(max_disambig + 1): - disambig = f"#{i}" - assert disambig not in token2id - token2id[disambig] = next_token_id - next_token_id += 1 - - write_mapping(tgt_dir / "tokens.txt", token2id) - write_mapping(tgt_dir / "words.txt", word2id) - write_lexicon(tgt_dir / "lexicon.txt", lexicon) - write_lexicon(tgt_dir / "lexicon_disambig.txt", lexicon_disambig) - - L = lexicon_to_fst_no_sil( - lexicon, - token2id=token2id, - word2id=word2id, - ) - L_disambig = lexicon_to_fst_no_sil( - lexicon_disambig, - token2id=token2id, - word2id=word2id, - need_self_loops=True, - ) - torch.save(L.as_dict(), tgt_dir / "L.pt") - torch.save(L_disambig.as_dict(), tgt_dir / "L_disambig.pt") - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/latency_metrics.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/latency_metrics.py deleted file mode 100644 index df2d8eee45f8e2d7c8536f208d44fafaeac3341f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/latency_metrics.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2022 Horizon Inc. (author: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import argparse -import logging -import librosa -import torch -import torchaudio -import yaml - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.font_manager as fm -import torchaudio.compliance.kaldi as kaldi - -from wenet.utils.init_model import init_model -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.mask import make_pad_mask -from wenet.utils.common import replace_duplicates_with_blank - - -def get_args(): - parser = argparse.ArgumentParser( - description='Analyze latency and plot CTC-Spike.') - parser.add_argument('--config', required=True, - type=str, help='configration') - parser.add_argument('--gpu', - type=int, - default=0, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--ckpt', required=True, - type=str, help='model checkpoint') - parser.add_argument('--tag', required=True, - type=str, help='image subtitle') - parser.add_argument('--wavscp', required=True, - type=str, help='wav.scp') - parser.add_argument('--alignment', required=True, - type=str, help='force alignment, generated by Kaldi.') - parser.add_argument('--chunk_size', required=True, - type=int, help='chunk size') - parser.add_argument('--left_chunks', default=-1, - type=int, help='left chunks') - parser.add_argument('--font', required=True, - type=str, help='font file') - parser.add_argument('--dict', required=True, - type=str, help='dict file') - parser.add_argument('--result_dir', required=True, - type=str, help='saving pdf') - parser.add_argument('--model_type', default='ctc', - choices=['ctc', 'transducer'], - help='show latency metrics from ctc models or rnn-t models') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - torch.manual_seed(777) - - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - symbol_table = read_symbol_table(args.dict) - char_dict = {v: k for k, v in symbol_table.items()} - - # 1. Load model - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - - model = init_model(conf) - load_checkpoint(model, args.ckpt) - model = model.eval().to(device) - - subsampling = model.encoder.embed.subsampling_rate - eos = model.eos_symbol() - - with open(args.wavscp, 'r') as fin: - wavs = fin.readlines() - - # 2. 
Forward model (get streaming_timestamps) - timestamps = {} - for idx, wav in enumerate(wavs): - if idx % 100 == 0: - logging.info("processed {}.".format(idx)) - key, wav = wav.strip().split(' ', 1) - waveform, sr = torchaudio.load(wav) - resample_rate = conf['dataset_conf']['resample_conf']['resample_rate'] - waveform = torchaudio.transforms.Resample( - orig_freq=sr, new_freq=resample_rate)(waveform) - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank( - waveform, - num_mel_bins=conf['dataset_conf']['fbank_conf']['num_mel_bins'], - frame_length=conf['dataset_conf']['fbank_conf']['frame_length'], - frame_shift=conf['dataset_conf']['fbank_conf']['frame_shift'], - dither=0.0, energy_floor=0.0, - sample_frequency=resample_rate, - ) - - speech = mat.unsqueeze(0).to(device) - speech_lengths = torch.tensor([mat.size(0)]).to(device) - - # Let's assume batch_size = 1 - encoder_out, encoder_mask = model.encoder( - speech, speech_lengths, args.chunk_size, args.left_chunks) - - maxlen = encoder_out.size(1) # (B, maxlen, encoder_dim) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # CTC greedy search - if args.model_type == 'ctc': - ctc_probs = model.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - topk_prob = topk_prob.view(1, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen) - topk_prob = topk_prob.masked_fill_(mask, 0.0) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - hyps = [replace_duplicates_with_blank(hyp) for hyp in hyps] - scores = [prob.tolist() for prob in topk_prob] - timestamps[key] = [hyps[0], scores[0], wav] - - if args.model_type == 'transducer': - hyps = [] - scores = [] - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = 1 - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, - padding, cache) - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - scores.append(torch.max(joint_out_probs).item()) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or \ - per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - hyps.append(model.blank) - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - timestamps[key] = [hyps, scores, wav] - - # 3. 
Analyze latency - with open(args.alignment, 'r') as fin: - aligns = fin.readlines() - not_found, len_unequal, ignored = 0, 0, 0 - datas = [] - for align in aligns: - key, align = align.strip().split(' ', 1) - if key not in timestamps: - not_found += 1 - continue - fa, st = [], [] # force_alignment, streaming_timestamps - text_fa, text_st = "", "" - for i, token in enumerate(align.split()): - if token != '': - text_fa += token - # NOTE(xcsong): W/O subsample - fa.append(i * 10) - # ignore alignment_errors >= 70ms - frames_fa = len(align.split()) - frames_st = len(timestamps[key][0]) * subsampling - if abs(frames_st - frames_fa) >= 7: - ignored += 1 - continue - for i, token_id in enumerate(timestamps[key][0]): - if token_id != 0: - text_st += char_dict[token_id] - # NOTE(xcsong): W subsample - st.append(i * subsampling * 10) - if len(fa) != len(st): - len_unequal += 1 - continue - # datas[i] = [key, text_fa, text_st, list_of_diff, - # FirstTokenDelay, LastTokenDelay, AvgTokenDelay, - # streaming_timestamps, force_alignment] - datas.append([key, text_fa, text_st, - [a - b for a, b in zip(st, fa)], - st[0] - fa[0], st[-1] - fa[-1], - (sum(st) - sum(fa)) / len(st), - timestamps[key], align.split()]) - - logging.info("not found: {}, length unequal: {}, ignored: {}, \ - valid samples: {}".format(not_found, len_unequal, ignored, len(datas))) - - # 4. Plot and print - num_datas = len(datas) - names = ['FirstTokenDelay', 'LastTokenDelay', 'AvgTokenDelay'] - names_index = [4, 5, 6] - parts = ['max', 'P90', 'P75', 'P50', 'P25', 'min'] - parts_index = [num_datas - 1, int(num_datas * 0.90), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - for name, name_idx in zip(names, names_index): - def f(name_idx=name_idx): - return name_idx - datas.sort(key=lambda x: x[f()]) - logging.info("==========================") - for p, i in zip(parts, parts_index): - data = datas[i] - # i.e., LastTokenDelay P90: 270.000 ms (wav_id: BAC009S0902W0144) - logging.info("{} {}: {:.3f} ms (wav_id: {})".format( - name, p, data[f()], datas[i][0])) - - font = fm.FontProperties(fname=args.font) - plt.rcParams['axes.unicode_minus'] = False - # we will have 2 sub-plots (force-align + streaming timestamps) - # plus one wav-plot - fig, axes = plt.subplots(figsize=(60, 60), nrows=3, ncols=1) - for j in range(2): - if j == 0: - # subplot-0: streaming_timestamps - plt_prefix = args.tag + "_" + name + "_" + p - x = np.arange(len(data[7][0])) * subsampling - hyps, scores = data[7][0], data[7][1] - else: - # subplot-1: force_alignments - plt_prefix = "force_alignment" - x = np.arange(len(data[8])) - hyps = [symbol_table[d] for d in data[8]] - scores = [0.0] * len(data[8]) - axes[j].set_title(plt_prefix, fontsize=30) - for frame, token, prob in zip(x, hyps, scores): - if char_dict[token] != '': - axes[j].bar( - frame, np.exp(prob), - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].text( - frame, np.exp(prob), - '{} {:.3f} {}'.format( - char_dict[token], np.exp(prob), frame), - fontdict=dict(fontsize=24), - fontproperties=font, - ) - else: - axes[j].bar( - frame, 0.01, - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].tick_params(labelsize=25) - - # subplot-2: wav - # wav, hardcode sample_rate to 16000 - samples, sr = librosa.load(data[7][2], sr=16000) - time = np.arange(0, len(samples)) * (1.0 / sr) - axes[-1].plot(time, samples) - - # i.e., RESULT_DIR/LTD_P90_120ms_BAC009S0768W0342.pdf - plt.savefig(args.result_dir + "/" + name + "_" + - p + "_" + str(data[f()]) 
+ "ms" + "_" + data[0] + ".pdf") - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/merge_scp2txt.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/merge_scp2txt.py deleted file mode 100644 index 51f1c42f272f0fd9fec0a7d69ee860d2f1eb6158..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/merge_scp2txt.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -from distutils.util import strtobool -from io import open -import logging -import sys - -PY2 = sys.version_info[0] == 2 -sys.stdin = codecs.getreader('utf-8')(sys.stdin if PY2 else sys.stdin.buffer) -sys.stdout = codecs.getwriter('utf-8')( - sys.stdout if PY2 else sys.stdout.buffer) - - -# Special types: -def shape(x): - """Change str to List[int] - - >>> shape('3,5') - [3, 5] - >>> shape(' [3, 5] ') - [3, 5] - - """ - - # x: ' [3, 5] ' -> '3, 5' - x = x.strip() - if x[0] == '[': - x = x[1:] - if x[-1] == ']': - x = x[:-1] - - return list(map(int, x.split(','))) - - -def get_parser(): - parser = argparse.ArgumentParser( - description='Given each file paths with such format as ' - '::. type> can be omitted and the default ' - 'is "str". e.g. {} ' - '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' - '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' - '--output-scps text:data/text shape:data/utt2text_shape:shape ' - '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--input-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the inputs') - parser.add_argument('--output-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the outputs') - parser.add_argument('--scps', - type=str, - nargs='+', - default=[], - help='The files except for the input and outputs') - parser.add_argument('--verbose', - '-V', - default=1, - type=int, - help='Verbose option') - parser.add_argument('--allow-one-column', - type=strtobool, - default=False, - help='Allow one column in input scp files. ' - 'In this case, the value will be empty string.') - parser.add_argument('--out', - '-O', - type=str, - help='The output filename. 
' - 'If omitted, then output to sys.stdout') - return parser - - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - args.scps = [args.scps] - - # logging info - logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - if args.verbose > 0: - logging.basicConfig(level=logging.INFO, format=logfmt) - else: - logging.basicConfig(level=logging.WARN, format=logfmt) - - inputs = {} - assert (len(args.input_scps) == 1) - for f in args.input_scps[0]: - arr = f.strip().split(':') - inputs[arr[0]] = arr[1] - assert ('feat' in inputs) - assert ('shape' in inputs) - - outputs = {} - assert (len(args.output_scps) == 1) - for f in args.output_scps[0]: - arr = f.strip().split(':') - outputs[arr[0]] = arr[1] - assert ('shape' in outputs) - assert ('text' in outputs) - assert ('token' in outputs) - assert ('tokenid' in outputs) - - files = [ - inputs['feat'], inputs['shape'], outputs['text'], outputs['token'], - outputs['tokenid'], outputs['shape'] - ] - fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape'] - fids = [open(f, 'r', encoding='utf-8') for f in files] - - if args.out is None: - out = sys.stdout - else: - out = open(args.out, 'w', encoding='utf-8') - done = False - while not done: - for i, fid in enumerate(fids): - line = fid.readline() - if line == '': - done = True - break - arr = line.strip().split() - content = ' '.join(arr[1:]) - if i == 0: - out.write('utt:{}'.format(arr[0])) - out.write('\t') - out.write('{}:{}'.format(fields[i], content)) - out.write('\n') - - for f in fids: - f.close() - if args.out is not None: - out.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/onnx2horizonbin.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/onnx2horizonbin.py deleted file mode 100644 index a94b647fb19d1446d4bc506c399c85677dddde9f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/onnx2horizonbin.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. 
specific decoding method: ctc_greedy_search -""" - -import argparse -import copy -import logging -import os -import sys -import random -import torch -import yaml -import numpy as np - -from torch.utils.data import DataLoader - -from wenet.utils.common import remove_duplicates_and_blank -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import to_numpy -from wenet.bin.export_onnx_bpu import export_encoder, export_ctc - - -try: - import hbdk # noqa: F401 - import horizon_nn # noqa: F401 - from horizon_tc_ui import HB_ONNXRuntime -except ImportError: - print('Please install hbdk,horizon_nn,horizon_tc_ui !') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def save_data(tensor, dirs, prefix): - if tensor.requires_grad: - data = tensor.detach().numpy().astype(np.float32) - else: - data = tensor.numpy().astype(np.float32) - os.makedirs(dirs, exist_ok=True) - data.tofile(dirs + "/" + prefix + ".bin") - - -def make_calibration_data(enc, args, conf): - conf['shuffle'] = True - logger.info(conf) - dataset = Dataset( - "shard", args.cali_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - cal_data_dir = os.path.join(args.output_dir, 'cal_data_dir') - for batch_idx, batch in enumerate(dataloader): - if batch_idx >= args.max_samples: - break - if batch_idx % 100 == 0: - logger.info("processed {} samples.".format(batch_idx)) - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - - # Feed forward overlap input step by step - random_high = (num_frames - context) // stride - num_rand = random.randint(0, random_high) - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - if i == num_rand: - save_data(chunk, "{}/chunk".format(cal_data_dir), - prefix + "." 
+ str(i)) - save_data(att_cache, "{}/att_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(cnn_cache, "{}/cnn_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_mask, "{}/att_mask".format(cal_data_dir), - prefix + "." + str(i)) - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - # NOTE(xcsong): It's fast to calibrate ctc.onnx, - # so it's okay to save all chunks - save_data(y, "{}/hidden".format(cal_data_dir), - prefix + "." + str(i)) - - -def check_wer(enc, ctc, args, conf): - conf['shuffle'] = False - dataset = Dataset( - "shard", args.wer_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - char_dict = {v: k for k, v in args.symbol_table.items()} - eos = len(char_dict) - 1 - - enc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_encoder/encoder_quantized_model.onnx") - ctc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_ctc/ctc_quantized_model.onnx") - torch_file = open(args.output_dir + "/torch_text", 'w', encoding="utf-8") - onnx_file = open(args.output_dir + "/onnx_text", 'w', encoding="utf-8") - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - for batch_idx, batch in enumerate(dataloader): - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - - # Feed forward overlap input step by step - torch_out, onnx_out = [], [] - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - # Torch model - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - torch_out.append(ctc.forward(y).transpose(1, 3).squeeze(2)) - # Quantized onnx model - ort_inputs = { - 'chunk': to_numpy(chunk), 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': to_numpy(att_mask)} - ort_outs = enc_session.run_feature( - enc_session.output_names, 
ort_inputs, input_offset=0) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_y = ctc_session.run_feature( - ctc_session.output_names, {'hidden': ort_outs[0]}, input_offset=0) - onnx_out.append(torch.from_numpy( - np.squeeze(onnx_y[0].transpose(0, 3, 2, 1), axis=2))) - - def post_process(list_out, file_obj, keys): - probs = torch.cat(list_out, dim=1) - maxlen = probs.size(1) - topk_prob, topk_index = probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - for i, key in enumerate(keys): - content = '' - for w in hyps[i]: - if w == eos: - break - content += char_dict[w] - file_obj.write('{} {}\n'.format(key, content)) - return key, content - - if len(torch_out) > 0 and len(onnx_out) > 0: - key, content = post_process(torch_out, torch_file, keys) - logger.info('torch: {} {}'.format(key, content)) - key, content = post_process(onnx_out, onnx_file, keys) - logger.info('onnx : {} {}'.format(key, content)) - torch_file.close() - onnx_file.close() - - -def generate_config(enc_session, ctc_session, args): - template = """ -# 模型参数组 -model_parameters: - # 原始Onnx浮点模型文件 - onnx_model: '{}' - # 转换的目标AI芯片架构 - march: 'bernoulli2' - # 模型转换输出的用于上板执行的模型文件的名称前缀 - output_model_file_prefix: '{}' - # 模型转换输出的结果的存放目录 - working_dir: '{}' - # 指定转换后混合异构模型是否保留输出各层的中间结果的能力 - layer_out_dump: False - # 转换过程中日志生成级别 - log_level: 'debug' -# 输入信息参数组 -input_parameters: - # 原始浮点模型的输入节点名称 - input_name: '{}' - # 原始浮点模型的输入数据格式(数量/顺序与input_name一致) - input_type_train: '{}' - # 原始浮点模型的输入数据排布(数量/顺序与input_name一致) - input_layout_train: '{}' - # 原始浮点模型的输入数据尺寸 - input_shape: '{}' - # 网络实际执行时,输入给网络的batch_size 默认值为1 - # input_batch: 1 - # 在模型中添加的输入数据预处理方法 - norm_type: '{}' - # 预处理方法的图像减去的均值; 如果是通道均值,value之间必须用空格分隔 - # mean_value: '' - # 预处理方法的图像缩放比例,如果是通道缩放比例,value之间必须用空格分隔 - # scale_value: '' - # 转换后混合异构模型需要适配的输入数据格式(数量/顺序与input_name一致) - input_type_rt: '{}' - # 输入数据格式的特殊制式 - input_space_and_range: '' - # 转换后混合异构模型需要适配的输入数据排布(数量/顺序与input_name一致) - input_layout_rt: '{}' -# 校准参数组 -calibration_parameters: - # 模型校准使用的标定样本的存放目录 - cal_data_dir: '{}' - # 开启图片校准样本自动处理(skimage read resize到输入节点尺寸) - preprocess_on: False - # 校准使用的算法类型 - calibration_type: '{}' - # max 校准方式的参数 - max_percentile: 1.0 - # 强制指定OP在CPU上运行 - run_on_cpu: '{}' - # 强制指定OP在BPU上运行 - run_on_bpu: '{}' -# 编译参数组 -compiler_parameters: - # 编译策略选择 - compile_mode: 'latency' - # 是否打开编译的debug信息 - debug: False - # 模型运行核心数 - core_num: 1 - # 模型编译的优化等级选择 - optimize_level: 'O3' -""" - output_dir = os.path.realpath(args.output_dir) - cal_data_dir = os.path.join(output_dir, 'cal_data_dir') - os.makedirs(cal_data_dir, exist_ok=True) - enc_dic = enc_session.get_modelmeta().custom_metadata_map - enc_onnx_path = os.path.join(output_dir, 'encoder.onnx') - enc_log_path = os.path.join(output_dir, 'hb_makertbin_output_encoder') - enc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in enc_dic['input_name'].split(';')]) - ctc_dic = ctc_session.get_modelmeta().custom_metadata_map - ctc_onnx_path = os.path.join(output_dir, 'ctc.onnx') - ctc_log_path = os.path.join(output_dir, 'hb_makertbin_output_ctc') - ctc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in ctc_dic['input_name'].split(';')]) - enc_config = template.format( - enc_onnx_path, "encoder", enc_log_path, - enc_dic['input_name'], enc_dic['input_type'], - enc_dic['input_layout_train'], enc_dic['input_shape'], - enc_dic['norm_type'], 
enc_dic['input_type'], enc_dic['input_layout_rt'], - enc_cal_data, args.calibration_type, args.extra_ops_run_on_cpu, "") - ctc_config = template.format( - ctc_onnx_path, "ctc", ctc_log_path, - ctc_dic['input_name'], ctc_dic['input_type'], - ctc_dic['input_layout_train'], ctc_dic['input_shape'], - ctc_dic['norm_type'], ctc_dic['input_type'], ctc_dic['input_layout_rt'], - ctc_cal_data, "default", "", "") - with open(output_dir + "/config_encoder.yaml", "w") as enc_yaml: - enc_yaml.write(enc_config) - with open(output_dir + "/config_ctc.yaml", "w") as ctc_yaml: - ctc_yaml.write(ctc_config) - - -def get_args(): - parser = argparse.ArgumentParser(description='convert onnx to horizon .bin') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - parser.add_argument('--dict', type=str, required=True, help='dict file') - parser.add_argument('--max_samples', type=int, required=True, - help='maximum samples') - parser.add_argument('--cali_datalist', type=str, default=None, - help='make calibration data') - parser.add_argument('--wer_datalist', type=str, default=None, - help='check wer') - parser.add_argument('--wer_text', type=str, default=None, - help='check wer') - parser.add_argument('--bpe_model', default=None, type=str, - help='bpe model for english part') - parser.add_argument('--ln_run_on_bpu', action='store_true', - help='layernorm running on bpu') - parser.add_argument('--extra_ops_run_on_cpu', type=str, default=None, - help='extra operations running on cpu.') - parser.add_argument('--calibration_type', type=str, default='default', - help='kl / max / default.') - return parser - - -if __name__ == '__main__': - random.seed(777) - parser = get_args() - args = parser.parse_args() - # NOTE(xcsong): X3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(conf) - load_checkpoint(model, args.checkpoint) - model.eval() - - symbol_table = read_symbol_table(args.dict) - args.symbol_table = symbol_table - args.feature_size = conf['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - logger.info("Stage-1: Export onnx") - enc, enc_session = export_encoder(model, args) - ctc, ctc_session = export_ctc(model, args) - - conf = copy.deepcopy(conf['dataset_conf']) - conf['filter_conf']['max_length'] = 102400 - conf['filter_conf']['min_length'] = 0 - conf['filter_conf']['token_max_length'] = 102400 - conf['filter_conf']['token_min_length'] = 0 - conf['filter_conf']['max_output_input_ratio'] = 102400 - conf['filter_conf']['min_output_input_ratio'] = 0 - conf['speed_perturb'] = False - conf['spec_aug'] = False - conf['spec_sub'] = False - conf['spec_trim'] = False - conf['shuffle'] = False - conf['sort'] = False - if 'fbank_conf' in conf: - 
conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in conf: - conf['mfcc_conf']['dither'] = 0.0 - conf['batch_conf']['batch_type'] = "static" - conf['batch_conf']['batch_size'] = 1 - - if args.cali_datalist is not None: - logger.info("Stage-2: Generate config") - # FIXME(xcsong): Remove hard code - logger.info("torch version: {}".format(torch.__version__)) - if int(torch.__version__[:4].split('.')[1]) >= 13: - args.extra_ops_run_on_cpu = "/Split;" + \ - "/encoders.0/self_attn/Split;/encoders.1/self_attn/Split;" + \ - "/encoders.2/self_attn/Split;/encoders.3/self_attn/Split;" + \ - "/encoders.4/self_attn/Split;/encoders.5/self_attn/Split;" + \ - "/encoders.6/self_attn/Split;/encoders.7/self_attn/Split;" + \ - "/encoders.8/self_attn/Split;/encoders.9/self_attn/Split;" + \ - "/encoders.10/self_attn/Split;/encoders.11/self_attn/Split;" + \ - "/encoders.0/self_attn/Mul;/encoders.1/self_attn/Mul;" + \ - "/encoders.2/self_attn/Mul;/encoders.3/self_attn/Mul;" + \ - "/encoders.4/self_attn/Mul;/encoders.5/self_attn/Mul;" + \ - "/encoders.6/self_attn/Mul;/encoders.7/self_attn/Mul;" + \ - "/encoders.8/self_attn/Mul;/encoders.9/self_attn/Mul;" + \ - "/encoders.10/self_attn/Mul;/encoders.11/self_attn/Mul;" - else: - args.extra_ops_run_on_cpu = "Split_17;Split_67;Split_209;" + \ - "Split_351;Split_493;Split_635;Split_777;Split_919;Split_1061;" + \ - "Split_1203;Split_1345;Split_1487;Split_1629;" + \ - "Mul_72;Mul_214;Mul_356;Mul_498;Mul_640;Mul_782;" + \ - "Mul_924;Mul_1066;Mul_1208;Mul_1350;Mul_1492;Mul_1634;" - generate_config(enc_session, ctc_session, args) - - logger.info("Stage-3: Make calibration data") - make_calibration_data(enc, args, conf) - - output_dir = os.path.realpath(args.output_dir) - logger.info("Stage-4: Make ctc.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_ctc".format(output_dir) + - " && cd hb_makertbin_log_ctc &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_ctc.yaml") - ) - logger.info("Stage-5: Make encoder.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_encoder ".format(output_dir) + - " && cd hb_makertbin_log_encoder &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_encoder.yaml") - ) - - if args.wer_datalist is not None: - logger.info("Stage-6: Check wer between torch model and quantized onnx") - assert args.wer_text is not None - check_wer(enc, ctc, args, conf) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/torch_text", - args.output_dir + "/torch_wer") - ) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/onnx_text", - args.output_dir + "/onnx_wer") - ) - os.system("tail {} {}".format( - args.output_dir + "/torch_wer", args.output_dir + "/onnx_wer")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/parse_options.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/parse_options.sh deleted file mode 100644 index 34476fdb37a4b14d5fe6e0edbebe97e760d2be5a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except 
in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Parse command-line options. -# To be sourced by another script (as in ". parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0. 
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/perturb_data_dir_speed.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/perturb_data_dir_speed.sh deleted file mode 100644 index 901a4882e6481ae269067b0fe7175dba62c4db9e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/perturb_data_dir_speed.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# 2020 @kamo-naoyuki -# This file was copied from Kaldi and -# I deleted parts related to wav duration -# because we shouldn't use kaldi's command here -# and we don't need the files actually. - -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 2014 Tom Ko -# 2018 Emotech LTD (author: Pawel Swietojanski) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# wav.scp -# spk2utt -# utt2spk -# text -# -# It generates the files which are used for perturbing the speed of the original data. - -export LC_ALL=C -set -euo pipefail - -if [[ $# != 3 ]]; then - echo "Usage: perturb_data_dir_speed.sh " - echo "e.g.:" - echo " $0 0.9 data/train_si284 data/train_si284p" - exit 1 -fi - -factor=$1 -srcdir=$2 -destdir=$3 -label="sp" -spk_prefix="${label}${factor}-" -utt_prefix="${label}${factor}-" - -#check is sox on the path - -! command -v sox &>/dev/null && echo "sox: command not found" && exit 1; - -if [[ ! -f ${srcdir}/utt2spk ]]; then - echo "$0: no such file ${srcdir}/utt2spk" - exit 1; -fi - -if [[ ${destdir} == "${srcdir}" ]]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -mkdir -p "${destdir}" - -<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map" -<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map" -<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map" -if [[ ! 
-f ${srcdir}/utt2uniq ]]; then - <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq" -else - <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq" -fi - - -<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \ - utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk - -utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt - -if [[ -f ${srcdir}/segments ]]; then - - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \ - utils/apply_map.pl -f 2 "${destdir}"/reco_map | \ - awk -v factor="${factor}" \ - '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \ - >"${destdir}"/segments - - utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - if [[ -f ${srcdir}/reco2file_and_channel ]]; then - utils/apply_map.pl -f 1 "${destdir}"/reco_map \ - <"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel - fi - -else # no segments->wav indexed by utterance. - if [[ -f ${srcdir}/wav.scp ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - fi -fi - -if [[ -f ${srcdir}/text ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text -fi -if [[ -f ${srcdir}/spk2gender ]]; then - utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender -fi -if [[ -f ${srcdir}/utt2lang ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang -fi - -rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null -echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}" - -utils/validate_data_dir.sh --no-feats --no-text "${destdir}" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/reduce_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/reduce_data_dir.sh deleted file mode 100644 index 16194dcc7309a646041181a698c53cd4f46e618b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/reduce_data_dir.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# koried, 10/29/2012 - -# Reduce a data set based on a list of turn-ids - -help_message="usage: $0 srcdir turnlist destdir" - -if [ $1 == "--help" ]; then - echo "${help_message}" - exit 0; -fi - -if [ $# != 3 ]; then - echo "${help_message}" - exit 1; -fi - -srcdir=$1 -reclist=$2 -destdir=$3 - -if [ ! 
-f ${srcdir}/utt2spk ]; then -echo "$0: no such file $srcdir/utt2spk" -exit 1; -fi - -function do_filtering { -# assumes the utt2spk and spk2utt files already exist. - [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp - [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text - [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames - [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender - [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp - if [ -f ${srcdir}/segments ]; then - utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments - awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. - [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/reco2file_and_channel ] && \ - utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel - - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm - rm ${destdir}/reco - fi - srcutts=$(wc -l < ${srcdir}/utt2spk) - destutts=$(wc -l < ${destdir}/utt2spk) - echo "Reduced #utt from $srcutts to $destutts" -} - -mkdir -p ${destdir} - -# filter the utt2spk based on the set of recordings -utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk - -utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt -do_filtering; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/remove_longshortdata.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/remove_longshortdata.py deleted file mode 100644 index 7e92f8a424d2d717acf6fc1db5503f79ba38a898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/remove_longshortdata.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='remove too long or too short data in format.data') - parser.add_argument('--data_file', - type=str, - help='input format data') - parser.add_argument('--output_data_file', - type=str, - help='output format data') - parser.add_argument('--min_input_len', type=float, - default=0, - help='minimum input seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--max_input_len', type=float, - default=20, - help='maximum output seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--min_output_len', type=float, - default=0, help='minimum input seq length, in modeling units') - parser.add_argument('--max_output_len', type=float, - default=500, - help='maximum output seq length, in modeling units') - parser.add_argument('--min_output_input_ratio', type=float, default=0.05, - help='minimum output seq length/output seq length ratio') - 
parser.add_argument('--max_output_input_ratio', type=float, default=10, - help='maximum output seq length/output seq length ratio') - args = parser.parse_args() - - data_file = args.data_file - output_data_file = args.output_data_file - min_input_len = args.min_input_len - max_input_len = args.max_input_len - min_output_len = args.min_output_len - max_output_len = args.max_output_len - min_output_input_ratio = args.min_output_input_ratio - max_output_input_ratio = args.max_output_input_ratio - - with open(data_file, 'r') as f, open(output_data_file, 'w') as fout: - for l in f: - l = l.strip() - if l: - items = l.strip().split('\t') - token_shape = items[6] - feature_shape = items[2] - feat_len = float(feature_shape.split(':')[1].split(',')[0]) - token_len = float(token_shape.split(':')[1].split(',')[0]) - condition = [feat_len > min_input_len, - feat_len < max_input_len, - token_len > min_output_len, - token_len < max_output_len, - token_len / feat_len > min_output_input_ratio, - token_len / feat_len < max_output_input_ratio, - ] - if all(condition): - fout.write('{}\n'.format(l)) - continue diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/segment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/segment.py deleted file mode 100644 index a1a7f93a05fbaf42ca09c26c0e5be6a7185f0d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/segment.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2021 Mobvoi Inc. (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='generate segmented wav.scp') - parser.add_argument('--segments', required=True, help='segments file') - parser.add_argument('--input', - required=True, - help='origin wav.scp that not segmented') - parser.add_argument('--output', - required=True, - help='output segmented wav.scp') - wav_dic = {} - args = parser.parse_args() - ori_wav = args.input - segment_file = args.segments - wav_scp = args.output - with open(ori_wav, 'r') as ori: - for l in ori: - item = l.strip().split() - wav_dic[item[0]] = item[1] - with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement: - for l in sgement: - item = l.strip().split() - if item[1] in wav_dic: - item[1] = wav_dic[item[1]] - f.write("{} {},{},{}\n".format(item[0], item[1], item[2], item[3])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/setup_anaconda.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/setup_anaconda.sh deleted file mode 100644 index f53ace9cc4c19994fc79d01e85d70f49d40d673f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/setup_anaconda.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env bash -# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet) -set -euo pipefail - -if [ -z "${PS1:-}" ]; then - PS1=__dummy__ -fi -CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - -if [ $# -gt 4 ]; then - echo "Usage: $0 [output] [conda-env-name] [python-version>]" - exit 1; -elif [ $# -eq 3 ]; then - output_dir="$1" - 
name="$2" - PYTHON_VERSION="$3" -elif [ $# -eq 2 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="" -elif [ $# -eq 1 ]; then - output_dir="$1" - name="" - PYTHON_VERSION="" -elif [ $# -eq 0 ]; then - output_dir=venv - name="" - PYTHON_VERSION="" -fi - -if [ -e activate_python.sh ]; then - echo "Warning: activate_python.sh already exists. It will be overwritten" -fi - -if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then - if [ ! -e miniconda.sh ]; then - wget --tries=3 "${CONDA_URL}" -O miniconda.sh - fi - - bash miniconda.sh -b -p "${output_dir}" -fi - -# shellcheck disable=SC1090 -source "${output_dir}/etc/profile.d/conda.sh" -conda deactivate - -# If the env already exists, skip recreation -if [ -n "${name}" ] && ! conda activate ${name}; then - conda create -yn "${name}" -fi -conda activate ${name} - -if [ -n "${PYTHON_VERSION}" ]; then - conda install -y conda "python=${PYTHON_VERSION}" -else - conda install -y conda -fi - -conda install -y pip setuptools - -cat << EOF > activate_python.sh -#!/usr/bin/env bash -# THIS FILE IS GENERATED BY tools/setup_anaconda.sh -if [ -z "\${PS1:-}" ]; then - PS1=__dummy__ -fi -. $(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name} -EOF diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/sph2wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/sph2wav.sh deleted file mode 100644 index a8f0749e3be2ee69b5831da6699c303510ecbed4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/sph2wav.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -# convert sph scp to segmented wav scp -nj=1 -. tools/parse_options.sh || exit 1; - -inscp=$1 -segments=$2 -outscp=$3 -data=$(dirname ${inscp}) -if [ $# -eq 4 ]; then - logdir=$4 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe -[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; -sox=`which sox` -[ ! 
-x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2); - printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \ - sort > $data/wav_ori.scp || exit 1; - -tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp -sed -i 's/ /,/g' $data/wav_segments.scp -sed -i 's/#/ /g' $data/wav_segments.scp - -rm -f $logdir/wav_*.slice -rm -f $logdir/*.log -split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - mkdir -p ${data}/wavs/${name} - cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \ - -v logdir=$logdir -v name=$name '{ - during=$4-$3 - cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during; - system(cmd) - printf("%s %s/%s.wav\n", $1, data, $1); - }' | \ - sort > ${data}/wavs_${name}.scp || exit 1; -} & -done -wait -cat ${data}/wavs_*.scp > $outscp -rm ${data}/wavs_*.scp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/spk2utt_to_utt2spk.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/spk2utt_to_utt2spk.pl deleted file mode 100644 index 19fb89d501146e360912863d847d6eabb0194511..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/spm_decode b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/spm_decode deleted file mode 100644 index 882b4f966013d7708460f8d41696583ae59f8fa9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/spm_decode +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for decoding") - parser.add_argument("--input", default=None, help="input file to decode") - parser.add_argument("--input_format", choices=["piece", "id"], default="piece") - args = parser.parse_args() - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.input_format == "piece": - def decode(l): - return "".join(sp.DecodePieces(l)) - elif args.input_format == "id": - def decode(l): - return "".join(sp.DecodeIds(l)) - else: - raise NotImplementedError - - def tok2int(tok): - # remap reference-side (represented as <>) to 0 - return int(tok) if tok != "<>" else 0 - - if args.input is None: - h = sys.stdin - else: - h = open(args.input, "r", encoding="utf-8") - for line in h: - print(decode(line.split())) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/spm_encode b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/spm_encode deleted file mode 100644 index 4dd2e1004f9fe393c2d34b43bade881b84a31b1f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/spm_encode +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import contextlib -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for encoding") - parser.add_argument("--inputs", nargs="+", default=['-'], - help="input files to filter/encode") - parser.add_argument("--outputs", nargs="+", default=['-'], - help="path to save encoded outputs") - parser.add_argument("--output_format", choices=["piece", "id"], default="piece") - parser.add_argument("--min-len", type=int, metavar="N", - help="filter sentence pairs with fewer than N tokens") - parser.add_argument("--max-len", type=int, metavar="N", - help="filter sentence pairs with more than N tokens") - args = parser.parse_args() - - assert len(args.inputs) == len(args.outputs), \ - "number of input and output paths should match" - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.output_format == "piece": - def encode(l): - return sp.EncodeAsPieces(l) - elif args.output_format == "id": - def encode(l): - return list(map(str, sp.EncodeAsIds(l))) - else: - raise NotImplementedError - - if args.min_len is not None or args.max_len is not None: - def valid(line): - return ( - (args.min_len is None or len(line) >= args.min_len) and - (args.max_len is None or len(line) <= args.max_len) - ) - else: - def valid(lines): - return True - - with contextlib.ExitStack() as stack: - inputs = [ - stack.enter_context(open(input, "r", encoding="utf-8")) - if input != "-" else sys.stdin - for input in args.inputs - ] - outputs = [ - stack.enter_context(open(output, 
"w", encoding="utf-8")) - if output != "-" else sys.stdout - for output in args.outputs - ] - - stats = { - "num_empty": 0, - "num_filtered": 0, - } - - def encode_line(line): - line = line.strip() - if len(line) > 0: - line = encode(line) - if valid(line): - return line - else: - stats["num_filtered"] += 1 - else: - stats["num_empty"] += 1 - return None - - for i, lines in enumerate(zip(*inputs), start=1): - enc_lines = list(map(encode_line, lines)) - if not any(enc_line is None for enc_line in enc_lines): - for enc_line, output_h in zip(enc_lines, outputs): - print(" ".join(enc_line), file=output_h) - if i % 10000 == 0: - print("processed {} lines".format(i), file=sys.stderr) - - print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) - print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/spm_train b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/spm_train deleted file mode 100644 index 0b247aee0dc5fcaa7b6cf66d89602e896619c9bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/spm_train +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE -import sys - -import sentencepiece as spm - - -if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/subset_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/subset_data_dir.sh deleted file mode 100644 index c35bee62d8710facb8c42a9171ed3caf0171450f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/subset_data_dir.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2010-2011 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - - -# This script operates on a data directory, such as in data/train/. -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data -# for what these directories contain. - -# This script creates a subset of that data, consisting of some specified -# number of utterances. (The selected utterances are distributed evenly -# throughout the file, by the program ./subset_scp.pl). - -# There are six options, none compatible with any other. - -# If you give the --per-spk option, it will attempt to select the supplied -# number of utterances for each speaker (typically you would supply a much -# smaller number in this case). - -# If you give the --speakers option, it selects a subset of n randomly -# selected speakers. - -# If you give the --shortest option, it will give you the n shortest utterances. - -# If you give the --first option, it will just give you the n first utterances. - -# If you give the --last option, it will just give you the n last utterances. - -# If you give the --spk-list or --utt-list option, it reads the -# speakers/utterances to keep from /" (note, -# in this case there is no positional parameter; see usage message.) 
- - -shortest=false -perspk=false -speakers=false -first_opt= -spk_list= -utt_list= - -expect_args=3 -case $1 in - --first|--last) first_opt=$1; shift ;; - --per-spk) perspk=true; shift ;; - --shortest) shortest=true; shift ;; - --speakers) speakers=true; shift ;; - --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; - --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; - --*) echo "$0: invalid option '$1'"; exit 1 -esac - -if [ $# != $expect_args ]; then - echo "Usage:" - echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] " - echo " subset_data_dir.sh [--spk-list ] " - echo " subset_data_dir.sh [--utt-list ] " - echo "By default, randomly selects utterances from the data directory." - echo "With --speakers, randomly selects enough speakers that we have utterances" - echo "With --per-spk, selects utterances per speaker, if available." - echo "With --first, selects the first utterances" - echo "With --last, selects the last utterances" - echo "With --shortest, selects the shortest utterances." - echo "With --spk-list, reads the speakers to keep from " - echo "With --utt-list, reads the utterances to keep from " - exit 1; -fi - -srcdir=$1 -if [[ $spk_list || $utt_list ]]; then - numutt= - destdir=$2 -else - numutt=$2 - destdir=$3 -fi - -export LC_ALL=C - -if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" - exit 1 -fi - -if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then - echo "$0: cannot subset to more utterances than you originally had." - exit 1 -fi - -if $shortest && [ ! -f $srcdir/feats.scp ]; then - echo "$0: you selected --shortest but no feats.scp exist." - exit 1 -fi - -mkdir -p $destdir || exit 1 - -if [[ $spk_list ]]; then - tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; -elif [[ $utt_list ]]; then - tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; -elif $speakers; then - tools/shuffle_list.pl < $srcdir/spk2utt | - awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | - sort > $destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -elif $perspk; then - awk '{ n='$numutt'; printf("%s ",$1); - skip=1; while(n*(skip+1) <= NF-1) { skip++; } - for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); } - printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -else - if $shortest; then - # Select $numutt shortest utterances. - . ./path.sh - feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; - sort -n -k2 $destdir/tmp.len | - awk '{print $1}' | - head -$numutt >$destdir/tmp.uttlist - tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk - rm $destdir/tmp.uttlist $destdir/tmp.len - else - # Select $numutt random utterances. - tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; - fi - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt -fi - -# Perform filtering. utt2spk and spk2utt files already exist by this point. -# Filter by utterance. 
-[ -f $srcdir/feats.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp -[ -f $srcdir/vad.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp -[ -f $srcdir/utt2lang ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang -[ -f $srcdir/utt2dur ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur -[ -f $srcdir/utt2num_frames ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames -[ -f $srcdir/utt2uniq ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq -[ -f $srcdir/wav.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp -[ -f $srcdir/utt2warp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp -[ -f $srcdir/text ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - -# Filter by speaker. -[ -f $srcdir/spk2warp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp -[ -f $srcdir/spk2gender ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender -[ -f $srcdir/cmvn.scp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - -# Filter by recording-id. -if [ -f $srcdir/segments ]; then - tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - # Recording-ids are in segments. - awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco - # The next line overrides the command above for wav.scp, which would be incorrect. - #[ -f $srcdir/wav.scp ] && - # tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp -else - # No segments; recording-ids are in wav.scp. - awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco -fi - -[ -f $srcdir/reco2file_and_channel ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel -[ -f $srcdir/reco2dur ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur - -# Filter the STM file for proper sclite scoring. -# Copy over the comments from STM file. -[ -f $srcdir/stm ] && - (grep "^;;" $srcdir/stm - tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm - -rm $destdir/reco - -# Copy frame_shift if present. -[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir - -srcutts=$(wc -l <$srcdir/utt2spk) -destutts=$(wc -l <$destdir/utt2spk) -echo "$0: reducing #utt from $srcutts to $destutts" -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/subset_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/subset_scp.pl deleted file mode 100644 index 11fddc09a0f4e5fad8e5d63cf65e7e5e627e4af6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/subset_scp.pl +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This program selects a subset of N elements in the scp. - -# By default, it selects them evenly from throughout the scp, in order to avoid -# selecting too many from the same speaker. It prints them on the standard -# output. -# With the option --first, it just selects the N first utterances. -# With the option --last, it just selects the N last utterances. - -# Last modified by JHU & HKUST @2013 - - -$quiet = 0; -$first = 0; -$last = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--quiet") { - shift; - $quiet = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--first") { - shift; - $first = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--last") { - shift; - $last = 1; -} - -if(@ARGV < 2 ) { - die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . - " --quiet causes it to not die if N < num lines in scp.\n" . - " --first and --last make it equivalent to head or tail.\n" . - "See also: filter_scp.pl\n"; -} - -$N = shift @ARGV; -if($N == 0) { - die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; -} -$inscp = shift @ARGV; -open(I, "<$inscp") || die "Opening input scp file $inscp"; - -@F = (); -while() { - push @F, $_; -} -$numlines = @F; -if($N > $numlines) { - if ($quiet) { - $N = $numlines; - } else { - die "You requested from subset_scp.pl more elements than available: $N > $numlines"; - } -} - -sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if ($num_needed > $diff) { - die "select_n: code error"; - } - if ($diff == 1 ) { - if ($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); - } -} - -if ( ! $first && ! $last) { - if ($N > 0) { - select_n(0, $numlines, $N); - } -} else { - if ($first) { # --first option: same as head. - for ($n = 0; $n < $N; $n++) { - print $F[$n]; - } - } else { # --last option: same as tail. - for ($n = @F - $N; $n < @F; $n++) { - print $F[$n]; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/sym2int.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/sym2int.pl deleted file mode 100644 index cec097b6bdaefb5c3452e31fa334f0a7530b9a72..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/sym2int.pl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; - -for($x = 0; $x < 2; $x++) { - if ($ARGV[0] eq "--map-oov") { - shift @ARGV; - $map_oov = shift @ARGV; - if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { - # disallow '-f', the empty string and anything ending in words.txt as the - # OOV symbol because these are likely command-line errors. - die "the --map-oov option requires an argument"; - } - } - if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } - } -} - -$symtab = shift @ARGV; -if (!defined $symtab) { - print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . - "options: [--map-oov ] [-f ]\n" . - "note: can look like 4-5, or 4-, or 5-, or 1.\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up - if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } - $map_oov = $sym2int{$map_oov}; -} - -$num_warning = 0; -$max_warning = 20; - -while (<>) { - @A = split(" ", $_); - @B = (); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ( (!defined $field_begin || $n >= $field_begin) - && (!defined $field_end || $n <= $field_end)) { - $i = $sym2int{$a}; - if (!defined ($i)) { - if (defined $map_oov) { - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $map_oov; - } else { - $pos = $n+1; - die "sym2int.pl: undefined symbol $a (in position $pos)\n"; - } - } - $a = $i; - } - push @B, $a; - } - print join(" ", @B); - print "\n"; -} -if ($num_warning > 0) { - print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; -} - -exit(0); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/utt2spk_to_spk2utt.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/utt2spk_to_spk2utt.pl deleted file mode 100644 index 5086699ff85fdcb8667bb9ab054700c53e35fd0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. - -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/validate_data_dir.sh deleted file mode 100644 index f4b4cbe1410111555d56380078e3d55381e7155a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/validate_data_dir.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/bin/bash - -cmd="$@" - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." - echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! -d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -if [ -f $data/images.scp ]; then - cmd=${cmd/--no-wav/} # remove --no-wav if supplied - image/validate_data_dir.sh $cmd - exit $? -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1; - ! 
awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - tools/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." - exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! 
cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! 
cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! 
cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/validate_dict_dir.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/validate_dict_dir.pl deleted file mode 100644 index 819fca7f03caff91f3f24f0b69876a0bfc0abbe9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/validate_dict_dir.pl +++ /dev/null @@ -1,531 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. -# Copyright 2012 Guoguo Chen -# 2015 Daniel Povey -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for 'dict' directories (e.g. data/local/dict) - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line($.) $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. 
The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n"; - if ($has_invalid_whitespaces) { - print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n"; - return 0; - } else { - print "--> text contains only allowed whitespaces\n"; - } - } else { - print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n"; - } - return 1; -} - - -if(@ARGV != 1) { - die "Usage: validate_dict_dir.pl \n" . - "e.g.: validate_dict_dir.pl data/local/dict\n"; -} - -$dict = shift @ARGV; -$dict =~ s:/$::; - -$exit = 0; -$success = 1; # this is re-set each time we read a file. - -sub set_to_fail { $exit = 1; $success = 0; } - -# Checking silence_phones.txt ------------------------------- -print "Checking $dict/silence_phones.txt ...\n"; -if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} -$idx = 1; -%silence = (); -$crlf = 1; - -print "--> reading $dict/silence_phones.txt\n"; -check_allowed_whitespace(\*S) || set_to_fail(); -while() { - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($silence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; - } else { - $silence{$p} = 1; - } - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. 
- if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(S); -$success == 0 || print "--> $dict/silence_phones.txt is OK\n"; -print "\n"; - -# Checking optional_silence.txt ------------------------------- -print "Checking $dict/optional_silence.txt ...\n"; -if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;} -if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;} -$idx = 1; -$success = 1; -$crlf = 1; -print "--> reading $dict/optional_silence.txt\n"; -check_allowed_whitespace(\*OS) or exit 1; -while() { - chomp; - my @col = split(" ", $_); - if ($idx > 1 or @col > 1) { - set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; - } elsif (!$silence{$col[0]}) { - set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - $idx ++; -} -close(OS); -$success == 0 || print "--> $dict/optional_silence.txt is OK\n"; -print "\n"; - -# Checking nonsilence_phones.txt ------------------------------- -print "Checking $dict/nonsilence_phones.txt ...\n"; -if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;} -$idx = 1; -%nonsilence = (); -$success = 1; -$crlf = 1; -print "--> reading $dict/nonsilence_phones.txt\n"; -check_allowed_whitespace(\*NS) or set_to_fail(); -while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($nonsilence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; - } else { - $nonsilence{$p} = 1; - } - # phones that start with the pound sign/hash may be mistaken for - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. 
- if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(NS); -$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n"; -print "\n"; - -# Checking disjoint ------------------------------- -sub intersect { - my ($a, $b) = @_; - @itset = (); - %itset = (); - foreach(keys %$a) { - if(exists $b->{$_} and !$itset{$_}) { - push(@itset, $_); - $itset{$_} = 1; - } - } - return @itset; -} - -print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n"; -@itset = intersect(\%silence, \%nonsilence); -if(@itset == 0) {print "--> disjoint property is OK.\n";} -else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";} -print "\n"; - - -sub check_lexicon { - my ($lex, $num_prob_cols, $num_skipped_cols) = @_; - print "Checking $lex\n"; - !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail(); - my %seen_line = {}; - $idx = 1; $success = 1; $crlf = 1; - print "--> reading $lex\n"; - check_allowed_whitespace(\*L) or set_to_fail(); - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $lex contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (defined $seen_line{$_}) { - print "--> ERROR: line '$_' of $lex is repeated\n"; - set_to_fail(); - } - $seen_line{$_} = 1; - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $lex does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - $word = shift @col; - if (!defined $word) { - print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail(); - } - if ($word eq "" || $word eq "" || $word eq "" || $word eq "#0") { - print "--> ERROR: lexicon.txt contains forbidden word $word\n"; - set_to_fail(); - } - for ($n = 0; $n < $num_prob_cols; $n++) { - $prob = shift @col; - if (!($prob > 0.0 && $prob <= 1.0)) { - print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n"; - set_to_fail(); - } - } - for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; } - if (@col == 0) { - print "--> ERROR: lexicon.txt contains word $word with empty "; - print "pronunciation.\n"; - set_to_fail(); - } - foreach (0 .. @col-1) { - if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt "; - print "(line $idx)\n"; - set_to_fail(); - } - } - $idx ++; - } - close(L); - $success == 0 || print "--> $lex is OK\n"; - print "\n"; -} - -if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); } -if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); } -if (-f "$dict/lexiconp_silprob.txt") { - # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also - # exist. 
- check_lexicon("$dict/lexiconp_silprob.txt", 2, 2); - if (-f "$dict/silprob.txt") { - !open(SP, "<$dict/silprob.txt") && - print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail(); - $crlf = 1; - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - chomp; my @col = split; - @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail(); - if ($col[0] eq "" || $col[0] eq "overall") { - if (!($col[1] > 0.0 && $col[1] <= 1.0)) { - set_to_fail(); - print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n"; - } - } elsif ($col[0] eq "_s" || $col[0] eq "_n") { - if ($col[1] <= 0.0) { - set_to_fail(); - print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n"; - } - } else { - print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n"; - set_to_fail(); - } - } - close(SP); - } else { - set_to_fail(); - print "--> ERROR: expecting $dict/silprob.txt to exist\n"; - } -} - -if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) { - print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n"; - set_to_fail(); -} - -sub check_lexicon_pair { - my ($lex1, $num_prob_cols1, $num_skipped_cols1, - $lex2, $num_prob_cols2, $num_skipped_cols2) = @_; - # We have checked individual lexicons already. - open(L1, "<$lex1"); open(L2, "<$lex2"); - print "Checking lexicon pair $lex1 and $lex2\n"; - my $line_num = 0; - while() { - $line_num++; - @A = split; - $line_B = ; - if (!defined $line_B) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); last; - } - @B = split(" ", $line_B); - # Check if the word matches. - if ($A[0] ne $B[0]) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - shift @A; shift @B; - for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; } - for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; } - # Check if the pronunciation matches - if (join(" ", @A) ne join(" ", @B)) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - } - $line_B = ; - if (defined $line_B && $exit == 0) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); - } - $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n"; -} - -# If more than one lexicon exist, we have to check if they correspond to each -# other. It could be that the user overwrote one and we need to regenerate the -# other, but we do not know which is which. -if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") { - check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0); -} -if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") { - check_lexicon_pair("$dict/lexiconp.txt", 1, 0, - "$dict/lexiconp_silprob.txt", 2, 2); -} - -# Checking extra_questions.txt ------------------------------- -%distinguished = (); # Keep track of all phone-pairs including nonsilence that - # are distinguished (split apart) by extra_questions.txt, - # as $distinguished{$p1,$p2} = 1. This will be used to - # make sure that we don't have pairs of phones on the same - # line in nonsilence_phones.txt that can never be - # distinguished from each other by questions. 
(If any two - # phones appear on the same line in nonsilence_phones.txt, - # they share a tree root, and since the automatic - # question-building treats all phones that appear on the - # same line of nonsilence_phones.txt as being in the same - # group, we can never distinguish them without resorting to - # questions in extra_questions.txt. -print "Checking $dict/extra_questions.txt ...\n"; -if (-s "$dict/extra_questions.txt") { - if (!open(EX, "<$dict/extra_questions.txt")) { - set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n"; - } - $idx = 1; - $success = 1; - $crlf = 1; - print "--> reading $dict/extra_questions.txt\n"; - check_allowed_whitespace(\*EX) or set_to_fail(); - while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n"; - } - foreach (0 .. @col-1) { - if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n"; - } - $idx ++; - } - %col_hash = (); - foreach $p (@col) { $col_hash{$p} = 1; } - foreach $p1 (@col) { - # Update %distinguished hash. - foreach $p2 (keys %nonsilence) { - if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not - # in this question (and in nonsilence - # phones)... mark p1,p2 as being split apart - $distinguished{$p1,$p2} = 1; - $distinguished{$p2,$p1} = 1; - } - } - } - } - close(EX); - $success == 0 || print "--> $dict/extra_questions.txt is OK\n"; -} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";} - -if (-f "$dict/nonterminals.txt") { - open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt"; - my %nonterminals = (); - my $line_number = 1; - while () { - chop; - my @line = split(" ", $_); - if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) { - print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1; - } - $nonterminals{$line[0]} = 1; - $line_number++; - } - print "--> $dict/nonterminals.txt is OK\n"; -} - - -# check nonsilence_phones.txt again for phone-pairs that are never -# distnguishable. (note: this situation is normal and expected for silence -# phones, so we don't check it.) -if(!open(NS, "<$dict/nonsilence_phones.txt")) { - print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1; -} - -$num_warn_nosplit = 0; -$num_warn_nosplit_limit = 10; -while() { - my @col = split(" ", $_); - foreach $p1 (@col) { - foreach $p2 (@col) { - if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) { - set_to_fail(); - if ($num_warn_nosplit <= $num_warn_nosplit_limit) { - print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n"; - } - if ($num_warn_nosplit == $num_warn_nosplit_limit) { - print "... Not warning any more times about this issue.\n"; - } - if ($num_warn_nosplit == 0) { - print " (note: we started checking for this only recently. 
You can still build a system but\n"; - print " phones $p1 and $p2 will be acoustically indistinguishable).\n"; - } - $num_warn_nosplit++; - } - } - } -} - - -if ($exit == 1) { - print "--> ERROR validating dictionary directory $dict (see detailed error "; - print "messages above)\n\n"; - exit 1; -} else { - print "--> SUCCESS [validating dictionary directory $dict]\n\n"; -} - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/validate_text.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/validate_text.pl deleted file mode 100644 index 7f75cf12f20f6e22948682e8e726e628a72dac69..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/validate_text.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env perl -# -#=============================================================================== -# Copyright 2017 Johns Hopkins University (author: Yenda Trmal ) -# Johns Hopkins University (author: Daniel Povey) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# validation script for data//text -# to be called (preferably) from utils/validate_data_dir.sh -use strict; -use warnings; -use utf8; -use Fcntl qw< SEEK_SET >; - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). 
-sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl \n" . 
- "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/wav2dur.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/wav2dur.py deleted file mode 100644 index 1bcc1b693458b66c0e341e5d6b375cc81e6db8b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/wav2dur.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -import torchaudio -torchaudio.set_audio_backend("sox_io") - -scp = sys.argv[1] -dur_scp = sys.argv[2] - -with open(scp, 'r') as f, open(dur_scp, 'w') as fout: - cnt = 0 - total_duration = 0 - for l in f: - items = l.strip().split() - wav_id = items[0] - fname = items[1] - cnt += 1 - waveform, rate = torchaudio.load(fname) - frames = len(waveform[0]) - duration = frames / float(rate) - total_duration += duration - fout.write('{} {}\n'.format(wav_id, duration)) - print('process {} utts'.format(cnt)) - print('total {} s'.format(total_duration)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/wav_to_duration.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/wav_to_duration.sh deleted file mode 100644 index 51b055c633ac809b6b8d702925dc47875973403d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/wav_to_duration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# split the wav scp, calculate duration and merge -nj=4 -. tools/parse_options.sh || exit 1; - -inscp=$1 -outscp=$2 -data=$(dirname ${inscp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -rm -f $logdir/wav_*.slice -rm -f $logdir/wav_*.shape -split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log -} & -done -wait -cat $logdir/wav_*.shape > $outscp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/websocket/performance-ws.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/websocket/performance-ws.py deleted file mode 100644 index af77dea06bb41297b674b5b6dbfd0266bcff5d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/tools/websocket/performance-ws.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -# coding:utf-8 - -# Copyright (c) 2022 SDCI Co. Ltd (author: veelion) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import json -import time -import asyncio -import argparse -import websockets -import soundfile as sf -import statistics - - -WS_START = json.dumps({ - 'signal': 'start', - 'nbest': 1, - 'continuous_decoding': False, -}) -WS_END = json.dumps({ - 'signal': 'end' -}) - - -async def ws_rec(data, ws_uri): - begin = time.time() - conn = await websockets.connect(ws_uri, ping_timeout=200) - # step 1: send start - await conn.send(WS_START) - ret = await conn.recv() - # step 2: send audio data - await conn.send(data) - # step 3: send end - await conn.send(WS_END) - # step 4: receive result - texts = [] - while 1: - ret = await conn.recv() - ret = json.loads(ret) - if ret['type'] == 'final_result': - nbest = json.loads(ret['nbest']) - text = nbest[0]['sentence'] - texts.append(text) - elif ret['type'] == 'speech_end': - break - # step 5: close - try: - await conn.close() - except Exception as e: - # this except has no effect, just log as debug - # it seems the server does not send close info, maybe - print(e) - time_cost = time.time() - begin - return { - 'text': ''.join(texts), - 'time': time_cost, - } - - -def get_args(): - parser = argparse.ArgumentParser(description='') - parser.add_argument( - '-u', '--ws_uri', required=True, - help="websocket_server_main's uri, e.g. ws://127.0.0.1:10086") - parser.add_argument( - '-w', '--wav_scp', required=True, - help='path to wav_scp_file') - parser.add_argument( - '-t', '--trans', required=True, - help='path to trans_text_file of wavs') - parser.add_argument( - '-s', '--save_to', required=True, - help='path to save transcription') - parser.add_argument( - '-n', '--num_concurrence', type=int, required=True, - help='num of concurrence for query') - args = parser.parse_args() - return args - - -def print_result(info): - length = max([len(k) for k in info]) - for k, v in info.items(): - print(f'\t{k: >{length}} : {v}') - - -async def main(args): - wav_scp = [] - total_duration = 0 - with open(args.wav_scp) as f: - for line in f: - zz = line.strip().split() - assert len(zz) == 2 - data, sr = sf.read(zz[1], dtype='int16') - assert sr == 16000 - duration = (len(data)) / 16000 - total_duration += duration - wav_scp.append((zz[0], data.tobytes())) - print(f'{len(wav_scp) = }, {total_duration = }') - - tasks = [] - failed = 0 - texts = [] - request_times = [] - begin = time.time() - for i, (_uttid, data) in enumerate(wav_scp): - task = asyncio.create_task(ws_rec(data, args.ws_uri)) - tasks.append((_uttid, task)) - if len(tasks) < args.num_concurrence: - continue - print((f'{i=}, start {args.num_concurrence} ' - f'queries @ {time.strftime("%m-%d %H:%M:%S")}')) - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - tasks = [] - print(f'\tdone @ {time.strftime("%m-%d %H:%M:%S")}') - if tasks: - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - request_time = time.time() - begin - rtf = request_time / total_duration - print('For all concurrence:') - print_result({ - 'failed': failed, - 'total_duration': total_duration, - 'request_time': request_time, - 'RTF': rtf, - }) - print('For one request:') - print_result({ - 'mean': statistics.mean(request_times), - 'median': statistics.median(request_times), - 'max_time': max(request_times), - 'min_time': min(request_times), - }) - with 
open(args.save_to, 'w', encoding='utf8') as fsave: - fsave.write(''.join(texts)) - # caculate CER - cmd = (f'python ../compute-wer.py --char=1 --v=1 ' - f'{args.trans} {args.save_to} > ' - f'{args.save_to}-test-{args.num_concurrence}.cer.txt') - print(cmd) - os.system(cmd) - print('done') - - -if __name__ == '__main__': - args = get_args() - asyncio.run(main(args)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/alignment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/alignment.py deleted file mode 100644 index 071691183e5af227e60fe06e4f8d4bf0f33b7f71..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/alignment.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Di Wu) -# 2022 Tinnove Inc (authors: Wei Ren) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader -from textgrid import TextGrid, IntervalTier - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.ctc_util import forced_align -from wenet.utils.common import get_subsample -from wenet.utils.init_model import init_model - - -def generator_textgrid(maxtime, lines, output): - # Download Praat: https://www.fon.hum.uva.nl/praat/ - interval = maxtime / (len(lines) + 1) - margin = 0.0001 - - tg = TextGrid(maxTime=maxtime) - linetier = IntervalTier(name="line", maxTime=maxtime) - - i = 0 - for l in lines: - s, e, w = l.split() - linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w) - - tg.append(linetier) - print("successfully generator {}".format(output)) - tg.write(output) - - -def get_frames_timestamp(alignment): - # convert alignment to a praat format, which is a doing phonetics - # by computer and helps analyzing alignment - timestamp = [] - # get frames level duration for each token - start = 0 - end = 0 - while end < len(alignment): - while end < len(alignment) and alignment[end] == 0: - end += 1 - if end == len(alignment): - timestamp[-1] += alignment[start:] - break - end += 1 - while end < len(alignment) and alignment[end - 1] == alignment[end]: - end += 1 - timestamp.append(alignment[start:end]) - start = end - return timestamp - - -def get_labformat(timestamp, subsample): - begin = 0 - duration = 0 - labformat = [] - for idx, t in enumerate(timestamp): - # 25ms frame_length,10ms hop_length, 1/subsample - subsample = get_subsample(configs) - # time duration - duration = len(t) * 0.01 * subsample - if idx < len(timestamp) - 1: - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[t[-1]])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[t[-1]])) - else: - 
non_blank = 0 - for i in t: - if i != 0: - token = i - break - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[token])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[token])) - begin = begin + duration - return labformat - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='use ctc to generate alignment') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--input_file', required=True, help='format data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--non_lang_syms', - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--result_file', - required=True, - help='alignment result file') - parser.add_argument('--batch_size', type=int, default=1, help='batch size') - parser.add_argument('--gen_praat', - action='store_true', - help='convert alignment to a praat format') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - - args = parser.parse_args() - print(args) - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.batch_size > 1: - logging.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - # Load dict - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 - - symbol_table = read_symbol_table(args.dict) - - # Init dataset and data loader - ali_conf = copy.deepcopy(configs['dataset_conf']) - - ali_conf['filter_conf']['max_length'] = 102400 - ali_conf['filter_conf']['min_length'] = 0 - ali_conf['filter_conf']['token_max_length'] = 102400 - ali_conf['filter_conf']['token_min_length'] = 0 - ali_conf['filter_conf']['max_output_input_ratio'] = 102400 - ali_conf['filter_conf']['min_output_input_ratio'] = 0 - ali_conf['speed_perturb'] = False - ali_conf['spec_aug'] = False - ali_conf['shuffle'] = False - ali_conf['sort'] = False - ali_conf['fbank_conf']['dither'] = 0.0 - ali_conf['batch_conf']['batch_type'] = "static" - ali_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - ali_dataset = Dataset(args.data_type, - args.input_file, - symbol_table, - ali_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w', - encoding='utf-8') as fout: - for batch_idx, batch in enumerate(ali_data_loader): - print("#" * 80) - key, feat, target, feats_length, target_length = batch - print(key) - - feat = feat.to(device) - target = target.to(device) - feats_length = 
feats_length.to(device) - target_length = target_length.to(device) - # Let's assume B = batch_size and N = beam_size - # 1. Encoder - encoder_out, encoder_mask = model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - # print(ctc_probs.size(1)) - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = forced_align(ctc_probs, target) - print(alignment) - fout.write('{} {}\n'.format(key[0], alignment)) - - if args.gen_praat: - timestamp = get_frames_timestamp(alignment) - print(timestamp) - subsample = get_subsample(configs) - labformat = get_labformat(timestamp, subsample) - - lab_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".lab") - with open(lab_path, 'w', encoding='utf-8') as f: - f.writelines(labformat) - - textgrid_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".TextGrid") - generator_textgrid(maxtime=(len(alignment) + 1) * 0.01 * - subsample, - lines=labformat, - output=textgrid_path) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/average_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/average_model.py deleted file mode 100644 index 01efa64b4b458bc931a86a9a304b9f330ce4aaa2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/average_model.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import os -import argparse -import glob - -import yaml -import numpy as np -import torch - - -def get_args(): - parser = argparse.ArgumentParser(description='average model') - parser.add_argument('--dst_model', required=True, help='averaged model') - parser.add_argument('--src_path', - required=True, - help='src model path for average') - parser.add_argument('--val_best', - action="store_true", - help='averaged model') - parser.add_argument('--num', - default=5, - type=int, - help='nums for averaged model') - parser.add_argument('--min_epoch', - default=0, - type=int, - help='min epoch used for averaging model') - parser.add_argument('--max_epoch', - default=65536, - type=int, - help='max epoch used for averaging model') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - checkpoints = [] - val_scores = [] - if args.val_best: - yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) - for y in yamls: - with open(y, 'r') as f: - dic_yaml = yaml.load(f, Loader=yaml.FullLoader) - loss = dic_yaml['cv_loss'] - epoch = dic_yaml['epoch'] - if epoch >= args.min_epoch and epoch <= args.max_epoch: - val_scores += [[epoch, loss]] - val_scores = np.array(val_scores) - sort_idx = np.argsort(val_scores[:, -1]) - sorted_val_scores = val_scores[sort_idx][::1] - print("best val scores = " + str(sorted_val_scores[:args.num, 1])) - print("selected epochs = " + - str(sorted_val_scores[:args.num, 0].astype(np.int64))) - path_list = [ - args.src_path + '/{}.pt'.format(int(epoch)) - for epoch in sorted_val_scores[:args.num, 0] - ] - else: - path_list = glob.glob('{}/[0-9]*.pt'.format(args.src_path)) - path_list = sorted(path_list, key=os.path.getmtime) - path_list = path_list[-args.num:] - print(path_list) - avg = None - num = args.num - assert num == len(path_list) - for path in path_list: - print('Processing {}'.format(path)) - states = torch.load(path, map_location=torch.device('cpu')) - if avg is None: - avg = states - else: - for k in avg.keys(): - avg[k] += states[k] - # average - for k in avg.keys(): - if avg[k] is not None: - # pytorch 1.6 use true_divide instead of /= - avg[k] = torch.true_divide(avg[k], num) - print('Saving to {}'.format(args.dst_model)) - torch.save(avg, args.dst_model) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/export_jit.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/export_jit.py deleted file mode 100644 index b2e5864e8382235c1cc800484ba5031ae22f3bd9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/export_jit.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os - -import torch -import yaml - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_file', default=None, help='output file') - parser.add_argument('--output_quant_file', - default=None, - help='output quantized model file') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - # No need gpu for model export - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - model = init_model(configs) - print(model) - - load_checkpoint(model, args.checkpoint) - # Export jit torch script model - - if args.output_file: - script_model = torch.jit.script(model) - script_model.save(args.output_file) - print('Export model successfully, see {}'.format(args.output_file)) - - # Export quantized jit torch script model - if args.output_quant_file: - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - script_quant_model = torch.jit.script(quantized_model) - script_quant_model.save(args.output_quant_file) - print('Export quantized model successfully, ' - 'see {}'.format(args.output_quant_file)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/export_onnx_bpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/export_onnx_bpu.py deleted file mode 100644 index 6462a69506f10778d08faae5fcf3067ad43d38bd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/export_onnx_bpu.py +++ /dev/null @@ -1,1019 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. 
specific decoding method: ctc_greedy_search -""" - - -from __future__ import print_function - -import os -import sys -import copy -import math -import yaml -import logging -from typing import Tuple - -import torch -import numpy as np - -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import (get_args, to_numpy, - print_input_output_info) - - -try: - import onnx - import onnxruntime -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class BPULayerNorm(torch.nn.Module): - """Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" - def __init__(self, module, chunk_size=8, run_on_bpu=False): - super().__init__() - original = copy.deepcopy(module) - self.hidden = module.weight.size(0) - self.chunk_size = chunk_size - self.run_on_bpu = run_on_bpu - - if self.run_on_bpu: - self.weight = torch.nn.Parameter( - module.weight.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.bias = torch.nn.Parameter( - module.bias.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.negtive = torch.nn.Parameter( - torch.ones((1, self.hidden, 1, chunk_size)) * -1.0) - self.eps = torch.nn.Parameter( - torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps) - self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_1.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_2.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - else: - self.norm = module - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.hidden) - orig_out = module(random_data) - new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) - np.testing.assert_allclose( - to_numpy(orig_out), to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.run_on_bpu: - u = self.mean_conv_1(x) # (1, h, 1, c) - numerator = x + u * self.negtive # (1, h, 1, c) - s = torch.pow(numerator, 2) # (1, h, 1, c) - s = self.mean_conv_2(s) # (1, h, 1, c) - denominator = torch.sqrt(s + self.eps) # (1, h, 1, c) - x = torch.div(numerator, denominator) # (1, h, 1, c) - x = x * self.weight + self.bias - else: - x = x.squeeze(2).transpose(1, 2).contiguous() - x = self.norm(x) - x = x.transpose(1, 2).contiguous().unsqueeze(2) - return x - - -class BPUIdentity(torch.nn.Module): - """Refactor torch.nn.Identity(). - For inserting BPU node whose input == output. - """ - def __init__(self, channels): - super().__init__() - self.channels = channels - self.identity_conv = torch.nn.Conv2d( - channels, channels, 1, groups=channels, bias=False) - torch.nn.init.dirac_( - self.identity_conv.weight.data, groups=channels) - - self.check_equal() - - def check_equal(self): - random_data = torch.randn(1, self.channels, 1, 10) - result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(random_data), to_numpy(result), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Identity with 4-D dataflow, input == output. 
- Args: - x (torch.Tensor): (batch, in_channel, 1, time) - - Returns: - (torch.Tensor): (batch, in_channel, 1, time). - """ - return self.identity_conv(x) - - -class BPULinear(torch.nn.Module): - """Refactor torch.nn.Linear or pointwise_conv""" - def __init__(self, module, is_pointwise_conv=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.weight.size(1) - self.odim = module.weight.size(0) - self.is_pointwise_conv = is_pointwise_conv - - # Modify weight & bias - self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) - if is_pointwise_conv: - # (odim, idim, kernel=1) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(-1)) - else: - # (odim, idim) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(2).unsqueeze(3)) - self.linear.bias = module.bias - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.idim) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = module(random_data) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = original_result.transpose(1, 2) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Linear with 4-D dataflow. - Args: - x (torch.Tensor): (batch, in_channel, 1, time) - Returns: - (torch.Tensor): (batch, out_channel, 1, time). - """ - return self.linear(x) - - -class BPUGlobalCMVN(torch.nn.Module): - """Refactor wenet/transformer/cmvn.py::GlobalCMVN""" - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - self.norm_var = module.norm_var - - # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1) - self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """CMVN with 4-D dataflow. - Args: - x (torch.Tensor): (batch, 1, mel_dim, time) - Returns: - (torch.Tensor): normalized feature with same shape. - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x - - -class BPUConv2dSubsampling8(torch.nn.Module): - """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 - - NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.right_context = module.right_context - self.subsampling_rate = module.subsampling_rate - assert isinstance(module.pos_enc, NoPositionalEncoding) - - # 1. Modify self.conv - # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim) - # to (1, 1, mel_dim, frames) for more efficient computation. - self.conv = module.conv - for idx in [0, 2, 4]: - self.conv[idx].weight = torch.nn.Parameter( - module.conv[idx].weight.transpose(2, 3) - ) - - # 2. 
Modify self.linear - # NOTE(xcsong): Split final projection to meet the requirment of - # maximum kernel_size (7 for XJ3) - self.linear = torch.nn.ModuleList() - odim = module.linear.weight.size(0) # 512, in this case - freq = module.linear.weight.size(1) // odim # 4608 // 512 == 9 - self.odim, self.freq = odim, freq - weight = module.linear.weight.reshape( - odim, odim, freq, 1) # (odim, odim * freq) -> (odim, odim, freq, 1) - self.split_size = [] - num_split = (freq - 1) // 7 + 1 # XJ3 requires kernel_size <= 7 - slice_begin = 0 - for idx in range(num_split): - kernel_size = min(freq, (idx + 1) * 7) - idx * 7 - conv_ele = torch.nn.Conv2d( - odim, odim, (kernel_size, 1), (kernel_size, 1)) - conv_ele.weight = torch.nn.Parameter( - weight[:, :, slice_begin:slice_begin + kernel_size, :] - ) - conv_ele.bias = torch.nn.Parameter( - torch.zeros_like(conv_ele.bias) - ) - self.linear.append(conv_ele) - self.split_size.append(kernel_size) - slice_begin += kernel_size - self.linear[0].bias = torch.nn.Parameter(module.linear.bias) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 67, 80) - mask = torch.zeros(1, 1, 67) - original_result, _, _ = module(random_data, mask) # (1, 8, 512) - random_data = random_data.transpose(1, 2).unsqueeze(0) # (1, 1, 80, 67) - new_result = self.forward(random_data) # (1, 512, 1, 8) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Subsample x with 4-D dataflow. - Args: - x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), - where time' = time // 8. - """ - x = self.conv(x) # (1, odim, freq, time') - x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) - x = torch.split(x, self.split_size, dim=2) - for idx, (x_part, layer) in enumerate(zip(x, self.linear)): - x_out += layer(x_part) - return x_out - - -class BPUMultiHeadedAttention(torch.nn.Module): - """Refactor wenet/transformer/attention.py::MultiHeadedAttention - - NOTE(xcsong): Only support attention_class == MultiHeadedAttention, - we do not consider RelPositionMultiHeadedAttention currently. - """ - def __init__(self, module, chunk_size, left_chunks): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.d_k = module.d_k - self.h = module.h - n_feat = self.d_k * self.h - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.time = chunk_size * (left_chunks + 1) - self.activation = torch.nn.Softmax(dim=-1) - - # 1. Modify self.linear_x - self.linear_q = BPULinear(module.linear_q) - self.linear_k = BPULinear(module.linear_k) - self.linear_v = BPULinear(module.linear_v) - self.linear_out = BPULinear(module.linear_out) - # 2. 
denom - self.register_buffer( - "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k))) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) - mask = torch.ones((1, self.h, self.chunk_size, self.time), - dtype=torch.bool) - cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, - self.d_k * 2) - original_out, original_cache = module( - random_data, random_data, random_data, - mask[:, 0, :, :], torch.empty(0), cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.reshape(1, self.h, self.d_k * 2, - self.chunk_size * self.left_chunks) - new_out, new_cache = self.forward( - random_data, random_data, random_data, mask, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - - def forward( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - mask: torch.Tensor, cache: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. - - Args: - q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). - k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). - v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). - mask (torch.Tensor): Mask tensor, - (#batch, head, chunk_size, cache_t + chunk_size). - cache (torch.Tensor): Cache tensor - (1, head, d_k * 2, cache_t), - where `cache_t == chunk_size * left_chunks`. - - - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: Cache tensor - (1, head, d_k * 2, cache_t + chunk_size) - where `cache_t == chunk_size * left_chunks` - """ - # 1. Forward QKV - q = self.linear_q(q) # (1, d, 1, c) d == size, c == chunk_size - k = self.linear_k(k) # (1, d, 1, c) - v = self.linear_v(v) # (1, d, 1, c) - q = q.view(1, self.h, self.d_k, self.chunk_size) - k = k.view(1, self.h, self.d_k, self.chunk_size) - v = v.view(1, self.h, self.d_k, self.chunk_size) - q = q.transpose(2, 3) # (batch, head, time1, d_k) - k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) - k = torch.cat((k_cache, k), dim=3) - v = torch.cat((v_cache, v), dim=3) - new_cache = torch.cat((k, v), dim=2) - # 2. (Q^T)K - scores = torch.matmul(q, k) * self.denom # (#b, n_head, time1, time2) - # 3. Forward attention - mask = mask.eq(0) - scores = scores.masked_fill(mask, -float('inf')) - attn = self.activation(scores).masked_fill(mask, 0.0) - attn = attn.transpose(2, 3) - x = torch.matmul(v, attn) - x = x.view(1, self.d_k * self.h, 1, self.chunk_size) - x_out = self.linear_out(x) - return x_out, new_cache - - -class BPUConvolution(torch.nn.Module): - """Refactor wenet/transformer/convolution.py::ConvolutionModule - - NOTE(xcsong): Only suport use_layer_norm == False - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.lorder = module.lorder - self.use_layer_norm = False - self.activation = module.activation - channels = module.pointwise_conv1.weight.size(1) - self.channels = channels - kernel_size = module.depthwise_conv.weight.size(2) - assert module.use_layer_norm is False - - # 1. Modify self.pointwise_conv1 - self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) - - # 2. 
Modify self.depthwise_conv - self.depthwise_conv = torch.nn.Conv2d( - channels, channels, (1, kernel_size), - stride=1, groups=channels) - self.depthwise_conv.weight = torch.nn.Parameter( - module.depthwise_conv.weight.unsqueeze(-2)) - self.depthwise_conv.bias = torch.nn.Parameter( - module.depthwise_conv.bias) - - # 3. Modify self.norm, Only support batchnorm2d - self.norm = torch.nn.BatchNorm2d(channels) - self.norm.training = False - self.norm.num_features = module.norm.num_features - self.norm.eps = module.norm.eps - self.norm.momentum = module.norm.momentum - self.norm.weight = torch.nn.Parameter(module.norm.weight) - self.norm.bias = torch.nn.Parameter(module.norm.bias) - self.norm.running_mean = module.norm.running_mean - self.norm.running_var = module.norm.running_var - - # 4. Modify self.pointwise_conv2 - self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) - - # 5. Identity conv, for running `concat` on BPU - self.identity = BPUIdentity(channels) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.channels) - cache = torch.zeros((1, self.channels, self.lorder)) - original_out, original_cache = module(random_data, cache=cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.unsqueeze(2) - new_out, new_cache = self.forward(random_data, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, 1, cache_t). - Returns: - torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). - torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). - """ - # Concat cache - x = torch.cat((self.identity(cache), self.identity(x)), dim=3) - new_cache = x[:, :, :, -self.lorder:] - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, 1, dim) - x = torch.nn.functional.glu(x, dim=1) # (b, channel, 1, dim) - - # Depthwise Conv - x = self.depthwise_conv(x) - x = self.activation(self.norm(x)) - x = self.pointwise_conv2(x) - return x, new_cache - - -class BPUFFN(torch.nn.Module): - """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.activation = module.activation - - # 1. Modify self.w_x - self.w_1 = BPULinear(module.w_1) - self.w_2 = BPULinear(module.w_2) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.w_1.idim) - original_out = module(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_out = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward function. 
- - Args: - xs: input tensor (B, D, 1, L) - Returns: - output tensor, (B, D, 1, L) - """ - return self.w_2(self.activation(self.w_1(x))) - - -class BPUConformerEncoderLayer(torch.nn.Module): - """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.size = module.size - assert module.normalize_before is True - assert module.concat_after is False - - # 1. Modify submodules - self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) - self.self_attn = BPUMultiHeadedAttention( - module.self_attn, chunk_size, left_chunks) - self.conv_module = BPUConvolution(module.conv_module) - self.feed_forward = BPUFFN(module.feed_forward) - - # 2. Modify norms - self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) - self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, ln_run_on_bpu) - self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, - chunk_size, ln_run_on_bpu) - self.norm_conv = BPULayerNorm(module.norm_conv, - chunk_size, ln_run_on_bpu) - self.norm_final = BPULayerNorm(module.norm_final, - chunk_size, ln_run_on_bpu) - - # 3. 4-D ff_scale - self.register_buffer( - "ff_scale", torch.full((1, self.size, 1, 1), module.ff_scale)) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.self_attn.chunk_size - time2 = self.self_attn.time - h, d_k = self.self_attn.h, self.self_attn.d_k - random_x = torch.randn(1, time1, self.size) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) - original_x, _, original_att_cache, original_cnn_cache = module( - random_x, att_mask[:, 0, :, :], torch.empty(0), - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.transpose(1, 2).unsqueeze(2) - att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.unsqueeze(2) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_mask, att_cache, cnn_cache - ) - np.testing.assert_allclose( - to_numpy(original_att_cache), - to_numpy(new_att_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cnn_cache), - to_numpy(new_cnn_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, att_mask: torch.Tensor, - att_cache: torch.Tensor, cnn_cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, size, 1, chunk_size) - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, d_k * 2, cache_t1), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, 1, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: att_cache tensor, - (1, head, d_k * 2, cache_t1 + chunk_size). - torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2). - """ - # 1. ffn_macaron - residual = x - x = self.norm_ff_macron(x) - x = residual + self.ff_scale * self.feed_forward_macaron(x) - - # 2. 
attention - residual = x - x = self.norm_mha(x) - x_att, new_att_cache = self.self_attn( - x, x, x, att_mask, att_cache) - x = residual + x_att - - # 3. convolution - residual = x - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, cnn_cache) - x = residual + x - - # 4. ffn - residual = x - x = self.norm_ff(x) - x = residual + self.ff_scale * self.feed_forward(x) - - # 5. final post-norm - x = self.norm_final(x) - - return x, new_att_cache, new_cnn_cache - - -class BPUConformerEncoder(torch.nn.Module): - """Refactor wenet/transformer/encoder.py::ConformerEncoder - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - output_size = module.output_size() - self._output_size = module.output_size() - self.after_norm = module.after_norm - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.head = module.encoders[0].self_attn.h - self.layers = len(module.encoders) - - # 1. Modify submodules - self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) - self.embed = BPUConv2dSubsampling8(module.embed) - self.encoders = torch.nn.ModuleList() - for layer in module.encoders: - self.encoders.append(BPUConformerEncoderLayer( - layer, chunk_size, left_chunks, ln_run_on_bpu)) - - # 2. Auxiliary conv - self.identity_cnncache = BPUIdentity(output_size) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.encoders[0].self_attn.chunk_size - time2 = self.encoders[0].self_attn.time - layers = self.layers - h, d_k = self.head, self.encoders[0].self_attn.d_k - decoding_window = (self.chunk_size - 1) * \ - module.embed.subsampling_rate + \ - module.embed.right_context + 1 - lorder = self.encoders[0].conv_module.lorder - random_x = torch.randn(1, decoding_window, 80) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) - orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( - random_x, 0, time2 - time1, att_mask=att_mask[:, 0, :, :], - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.unsqueeze(0) - att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_cache, cnn_cache, att_mask - ) - caches = torch.split(new_att_cache, h, dim=1) - caches = [c.transpose(2, 3) for c in caches] - np.testing.assert_allclose( - to_numpy(orig_att_cache), - to_numpy(torch.cat(caches, dim=0)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_cnn_cache), - to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, xs: torch.Tensor, att_cache: torch.Tensor, - cnn_cache: torch.Tensor, att_mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (1, head * elayers, d_k * 2, cache_t1), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (1, hidden-dim, elayers, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, hidden-dim, 1, chunk_size). - torch.Tensor: new attention cache required for next chunk, with - same shape as the original att_cache. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - """ - # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time) - xs = xs.transpose(2, 3) - xs = self.global_cmvn(xs) - # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size) - xs = self.embed(xs) - - att_cache = torch.split(att_cache, self.head, dim=1) - cnn_cache = self.identity_cnncache(cnn_cache) - cnn_cache = torch.split(cnn_cache, 1, dim=2) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - xs, new_att_cache, new_cnn_cache = layer( - xs, att_mask, att_cache=att_cache[i], cnn_cache=cnn_cache[i]) - r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:]) - r_cnn_cache.append(new_cnn_cache) - r_att_cache = torch.cat(r_att_cache, dim=1) - r_cnn_cache = self.identity_cnncache( - torch.cat(r_cnn_cache, dim=2)) - - xs = xs.squeeze(2).transpose(1, 2).contiguous() - xs = self.after_norm(xs) - # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input. - xs = xs.transpose(1, 2).contiguous().unsqueeze(2) # (B, C, 1, T) - - return (xs, r_att_cache, r_cnn_cache) - - -class BPUCTC(torch.nn.Module): - """Refactor wenet/transformer/ctc.py::CTC - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.ctc_lo.weight.size(1) - num_class = module.ctc_lo.weight.size(0) - - # 1. Modify self.ctc_lo, Split final projection to meet the - # requirment of maximum in/out channels (2048 for XJ3) - self.ctc_lo = torch.nn.ModuleList() - self.split_size = [] - num_split = (num_class - 1) // 2048 + 1 - for idx in range(num_split): - out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 - conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) - self.ctc_lo.append(conv_ele) - self.split_size.append(out_channel) - orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) - orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) - for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): - w = w.unsqueeze(2).unsqueeze(3) - self.ctc_lo[i].weight = torch.nn.Parameter(w) - self.ctc_lo[i].bias = torch.nn.Parameter(b) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 100, self.idim) - original_result = module.ctc_lo(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """frame activations, without softmax. 
- - Args: - Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) - Returns: - torch.Tensor: (B, num_class, 1, chunk_size) - """ - out = [] - for i, layer in enumerate(self.ctc_lo): - out.append(layer(x)) - out = torch.cat(out, dim=1) - return out - - -def export_encoder(asr_model, args): - logger.info("Stage-1: export encoder") - decode_window, mel_dim = args.decoding_window, args.feature_size - encoder = BPUConformerEncoder( - asr_model.encoder, args.chunk_size, args.num_decoding_left_chunks, - args.ln_run_on_bpu) - encoder.eval() - encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx') - - logger.info("Stage-1.1: prepare inputs for encoder") - chunk = torch.randn((1, 1, decode_window, mel_dim)) - required_cache_size = encoder.chunk_size * encoder.left_chunks - kv_time = required_cache_size + encoder.chunk_size - hidden, layers = encoder._output_size, len(encoder.encoders) - head = encoder.encoders[0].self_attn.h - d_k = hidden // head - lorder = encoder.encoders[0].conv_module.lorder - att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) - att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros((1, hidden, layers, lorder)) - inputs = (chunk, att_cache, cnn_cache, att_mask) - logger.info("chunk.size(): {} att_cache.size(): {} " - "cnn_cache.size(): {} att_mask.size(): {}".format( - list(chunk.size()), list(att_cache.size()), - list(cnn_cache.size()), list(att_mask.size()))) - - logger.info("Stage-1.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask" - attributes['output_name'] = "output;r_att_cache;r_cnn_cache" - attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap" - attributes['norm_type'] = \ - "no_preprocess;no_preprocess;no_preprocess;no_preprocess" - attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_shape'] = \ - "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( - chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3), - att_cache.size(0), att_cache.size(1), att_cache.size(2), - att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1), - cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0), - att_mask.size(1), att_mask.size(2), att_mask.size(3) - ) - torch.onnx.export( # NOTE(xcsong): only support opset==11 - encoder, inputs, encoder_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=attributes['input_name'].split(';'), - output_names=attributes['output_name'].split(';'), - dynamic_axes=None, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for k in vars(args): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - logger.info('Export onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - logger.info("Stage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - for i in range(10): - logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" - ", att_mask: {}".format( - i, list(torch_chunk.size()), - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), - list(torch_att_mask.size()))) - torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_output = torch.cat(torch_output, dim=-1) - - onnx_output = [] - onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - logger.info("onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," - " att_mask: {}".format( - i, onnx_chunk.shape, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': onnx_att_mask, - } - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_output = np.concatenate(onnx_output, axis=-1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_encoder, pass!") - return encoder, ort_session - - -def export_ctc(asr_model, args): - logger.info("Stage-2: export ctc") - ctc = BPUCTC(asr_model.ctc).eval() - ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx') - - logger.info("Stage-2.1: prepare inputs for ctc") - hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) - - logger.info("Stage-2.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'], attributes['input_type'] = "hidden", "featuremap" - attributes['norm_type'] = "no_preprocess" - attributes['input_layout_train'] = "NCHW" - attributes['input_layout_rt'] = "NCHW" - attributes['input_shape'] = "{}x{}x{}x{}".format( - hidden.size(0), hidden.size(1), hidden.size(2), hidden.size(3), - ) - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=None, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for k in vars(args): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - logger.info('Export onnx_ctc, done! 
see {}'.format(ctc_outpath)) - - logger.info("Stage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_ctc, pass!") - return ctc, ort_session - - -def export_decoder(asr_model, args): - logger.info("Currently, Decoder is not supported.") - - -if __name__ == '__main__': - torch.manual_seed(777) - args = get_args() - args.ln_run_on_bpu = False - # NOTE(xcsong): XJ3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - args.feature_size = configs['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - export_encoder(model, args) - export_ctc(model, args) - export_decoder(model, args) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/export_onnx_cpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/export_onnx_cpu.py deleted file mode 100644 index a8009d2f606f753a5870eb754235d8d55e756b5d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/export_onnx_cpu.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright (c) 2022, Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
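# NOTE: a typical invocation of this script (paths are placeholders, the 16/4 values
# match the streaming mode discussed in export_encoder below) might look like:
#
#   python3 wenet/bin/export_onnx_cpu.py \
#       --config exp/train.yaml \
#       --checkpoint exp/final.pt \
#       --output_dir onnx_cpu \
#       --chunk_size 16 \
#       --num_decoding_left_chunks 4 \
#       --reverse_weight 0.5
#
# encoder.onnx / ctc.onnx / decoder.onnx plus their dynamically quantized
# *.quant.onnx counterparts are written into --output_dir.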
- -from __future__ import print_function - -import argparse -import os -import copy -import sys - -import torch -import yaml -import numpy as np - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - -try: - import onnx - import onnxruntime - from onnxruntime.quantization import quantize_dynamic, QuantType -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - args = parser.parse_args() - return args - - -def to_numpy(tensor): - if tensor.requires_grad: - return tensor.detach().cpu().numpy() - else: - return tensor.cpu().numpy() - - -def print_input_output_info(onnx_model, name, prefix="\t\t"): - input_names = [node.name for node in onnx_model.graph.input] - input_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.input] - output_names = [node.name for node in onnx_model.graph.output] - output_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.output] - print("{}{} inputs : {}".format(prefix, name, input_names)) - print("{}{} input shapes : {}".format(prefix, name, input_shapes)) - print("{}{} outputs: {}".format(prefix, name, output_names)) - print("{}{} output shapes : {}".format(prefix, name, output_shapes)) - - -def export_encoder(asr_model, args): - print("Stage-1: export encoder") - encoder = asr_model.encoder - encoder.forward = encoder.forward_chunk - encoder_outpath = os.path.join(args['output_dir'], 'encoder.onnx') - - print("\tStage-1.1: prepare inputs for encoder") - chunk = torch.randn( - (args['batch'], args['decoding_window'], args['feature_size'])) - offset = 0 - # NOTE(xcsong): The uncertainty of `next_cache_start` only appears - # in the first few chunks, this is caused by dynamic att_cache shape, i,e - # (0, 0, 0, 0) for 1st chunk and (elayers, head, ?, d_k*2) for subsequent - # chunks. One way to ease the ONNX export is to keep `next_cache_start` - # as a fixed value. To do this, for the **first** chunk, if - # left_chunks > 0, we feed real cache & real mask to the model, otherwise - # fake cache & fake mask. In this way, we get: - # 1. 16/-1 mode: next_cache_start == 0 for all chunks - # 2. 16/4 mode: next_cache_start == chunk_size for all chunks - # 3. 16/0 mode: next_cache_start == chunk_size for all chunks - # 4. -1/-1 mode: next_cache_start == 0 for all chunks - # NO MORE DYNAMIC CHANGES!! - # - # NOTE(Mddct): We retain the current design for the convenience of supporting some - # inference frameworks without dynamic shapes. 
If you're interested in all-in-one - # model that supports different chunks please see: - # https://github.com/wenet-e2e/wenet/pull/1174 - - if args['left_chunks'] > 0: # 16/4 - required_cache_size = args['chunk_size'] * args['left_chunks'] - offset = required_cache_size - # Real cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], required_cache_size, - args['output_size'] // args['head'] * 2)) - # Real mask - att_mask = torch.ones( - (args['batch'], 1, required_cache_size + args['chunk_size']), - dtype=torch.bool) - att_mask[:, :, :required_cache_size] = 0 - elif args['left_chunks'] <= 0: # 16/-1, -1/-1, 16/0 - required_cache_size = -1 if args['left_chunks'] < 0 else 0 - # Fake cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], 0, - args['output_size'] // args['head'] * 2)) - # Fake mask - att_mask = torch.ones((0, 0, 0), dtype=torch.bool) - cnn_cache = torch.zeros( - (args['num_blocks'], args['batch'], - args['output_size'], args['cnn_module_kernel'] - 1)) - inputs = (chunk, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - print("\t\tchunk.size(): {}\n".format(chunk.size()), - "\t\toffset: {}\n".format(offset), - "\t\trequired_cache: {}\n".format(required_cache_size), - "\t\tatt_cache.size(): {}\n".format(att_cache.size()), - "\t\tcnn_cache.size(): {}\n".format(cnn_cache.size()), - "\t\tatt_mask.size(): {}\n".format(att_mask.size())) - - print("\tStage-1.2: torch.onnx.export") - dynamic_axes = { - 'chunk': {1: 'T'}, - 'att_cache': {2: 'T_CACHE'}, - 'att_mask': {2: 'T_ADD_T_CACHE'}, - 'output': {1: 'T'}, - 'r_att_cache': {2: 'T_CACHE'}, - } - # NOTE(xcsong): We keep dynamic axes even if in 16/4 mode, this is - # to avoid padding the last chunk (which usually contains less - # frames than required). For users who want static axes, just pop - # out specific axis. - # if args['chunk_size'] > 0: # 16/4, 16/-1, 16/0 - # dynamic_axes.pop('chunk') - # dynamic_axes.pop('output') - # if args['left_chunks'] >= 0: # 16/4, 16/0 - # # NOTE(xsong): since we feed real cache & real mask into the - # # model when left_chunks > 0, the shape of cache will never - # # be changed. - # dynamic_axes.pop('att_cache') - # dynamic_axes.pop('r_att_cache') - torch.onnx.export( - encoder, inputs, encoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=[ - 'chunk', 'offset', 'required_cache_size', - 'att_cache', 'cnn_cache', 'att_mask' - ], - output_names=['output', 'r_att_cache', 'r_cnn_cache'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for (k, v) in args.items(): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - # NOTE(xcsong): to add those metadatas we need to reopen - # the file and resave it. - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - # Dynamic quantization - model_fp32 = encoder_outpath - model_quant = os.path.join(args['output_dir'], 'encoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - print("\tStage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk = copy.deepcopy(chunk) - torch_offset = copy.deepcopy(offset) - torch_required_cache_size = copy.deepcopy(required_cache_size) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - torch_att_mask = copy.deepcopy(att_mask) - for i in range(10): - print("\t\ttorch chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, list(torch_chunk.size()), torch_offset, - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), list(torch_att_mask.size()))) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - torch_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_offset, torch_required_cache_size, - torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_offset += out.size(1) - torch_output = torch.cat(torch_output, dim=1) - - onnx_output = [] - onnx_chunk = to_numpy(chunk) - onnx_offset = np.array((offset)).astype(np.int64) - onnx_required_cache_size = np.array((required_cache_size)).astype(np.int64) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - onnx_att_mask = to_numpy(att_mask) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - print("\t\tonnx chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, onnx_chunk.shape, onnx_offset, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - onnx_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'offset': onnx_offset, - 'required_cache_size': onnx_required_cache_size, - 'att_cache': onnx_att_cache, 'cnn_cache': onnx_cnn_cache, - 'att_mask': onnx_att_mask - } - # NOTE(xcsong): If we use 16/-1, -1/-1 or 16/0 mode, `next_cache_start` - # will be hardcoded to 0 or chunk_size by ONNX, thus - # required_cache_size and att_mask are no more needed and they will - # be removed by ONNX automatically. 
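# NOTE: a rough illustration of the pruning step below, assuming e.g. a 16/-1 export
# where 'required_cache_size' and 'att_mask' were folded away: only the names that
# actually appear in onnx_encoder.graph.input are kept, which is equivalent to
#   ort_inputs = {k: v for k, v in ort_inputs.items() if k in input_names}
# Passing a key that the exported graph does not declare would make onnxruntime
# reject the feed.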
- for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_offset += ort_outs[0].shape[1] - onnx_output = np.concatenate(onnx_output, axis=1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-05) - meta = ort_session.get_modelmeta() - print("\t\tcustom_metadata_map={}".format(meta.custom_metadata_map)) - print("\t\tCheck onnx_encoder, pass!") - - -def export_ctc(asr_model, args): - print("Stage-2: export ctc") - ctc = asr_model.ctc - ctc.forward = ctc.log_softmax - ctc_outpath = os.path.join(args['output_dir'], 'ctc.onnx') - - print("\tStage-2.1: prepare inputs for ctc") - hidden = torch.randn( - (args['batch'], args['chunk_size'] if args['chunk_size'] > 0 else 16, - args['output_size'])) - - print("\tStage-2.2: torch.onnx.export") - dynamic_axes = {'hidden': {1: 'T'}, 'probs': {1: 'T'}} - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for (k, v) in args.items(): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - # Dynamic quantization - model_fp32 = ctc_outpath - model_quant = os.path.join(args['output_dir'], 'ctc.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_ctc, done! see {}'.format(ctc_outpath)) - - print("\tStage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_ctc, pass!") - - -def export_decoder(asr_model, args): - print("Stage-3: export decoder") - decoder = asr_model - # NOTE(lzhin): parameters of encoder will be automatically removed - # since they are not used during rescoring. - decoder.forward = decoder.forward_attention_decoder - decoder_outpath = os.path.join(args['output_dir'], 'decoder.onnx') - - print("\tStage-3.1: prepare inputs for decoder") - # hardcode time->200 nbest->10 len->20, they are dynamic axes. 
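# NOTE: shapes of the dummy decoder inputs built below (the absolute sizes are
# arbitrary stand-ins, only the dynamic axes registered later matter):
#   encoder_out : (1, 200, output_size)  float  - fake encoder output, T=200
#   hyps        : (10, 20)               int64  - 10 n-best hypotheses, 20 tokens each,
#                                                 ids drawn from [0, vocab_size)
#   hyps_lens   : (10,)                  int64  - per-hypothesis lengths in [15, 21)
# The first token of every hypothesis is overwritten with vocab_size - 1, the id WeNet
# reserves for <sos/eos>, so each traced hypothesis starts with a valid sos symbol.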
- encoder_out = torch.randn((1, 200, args['output_size'])) - hyps = torch.randint(low=0, high=args['vocab_size'], - size=[10, 20]) - hyps[:, 0] = args['vocab_size'] - 1 # - hyps_lens = torch.randint(low=15, high=21, size=[10]) - - print("\tStage-3.2: torch.onnx.export") - dynamic_axes = { - 'hyps': {0: 'NBEST', 1: 'L'}, 'hyps_lens': {0: 'NBEST'}, - 'encoder_out': {1: 'T'}, - 'score': {0: 'NBEST', 1: 'L'}, 'r_score': {0: 'NBEST', 1: 'L'} - } - inputs = (hyps, hyps_lens, encoder_out, args['reverse_weight']) - torch.onnx.export( - decoder, inputs, decoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hyps', 'hyps_lens', 'encoder_out', 'reverse_weight'], - output_names=['score', 'r_score'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_decoder = onnx.load(decoder_outpath) - for (k, v) in args.items(): - meta = onnx_decoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_decoder) - onnx.helper.printable_graph(onnx_decoder.graph) - onnx.save(onnx_decoder, decoder_outpath) - print_input_output_info(onnx_decoder, "onnx_decoder") - model_fp32 = decoder_outpath - model_quant = os.path.join(args['output_dir'], 'decoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_decoder, done! see {}'.format( - decoder_outpath)) - - print("\tStage-3.3: check onnx_decoder and torch_decoder") - torch_score, torch_r_score = decoder( - hyps, hyps_lens, encoder_out, args['reverse_weight']) - ort_session = onnxruntime.InferenceSession(decoder_outpath) - input_names = [node.name for node in onnx_decoder.graph.input] - ort_inputs = { - 'hyps': to_numpy(hyps), - 'hyps_lens': to_numpy(hyps_lens), - 'encoder_out': to_numpy(encoder_out), - 'reverse_weight': np.array((args['reverse_weight'])), - } - for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - onnx_output = ort_session.run(None, ort_inputs) - - np.testing.assert_allclose(to_numpy(torch_score), onnx_output[0], - rtol=1e-03, atol=1e-05) - if args['is_bidirectional_decoder'] and args['reverse_weight'] > 0.0: - np.testing.assert_allclose(to_numpy(torch_r_score), onnx_output[1], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_decoder, pass!") - - -def main(): - torch.manual_seed(777) - args = get_args() - output_dir = args.output_dir - os.system("mkdir -p " + output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - arguments = {} - arguments['output_dir'] = output_dir - arguments['batch'] = 1 - arguments['chunk_size'] = args.chunk_size - arguments['left_chunks'] = args.num_decoding_left_chunks - arguments['reverse_weight'] = args.reverse_weight - arguments['output_size'] = configs['encoder_conf']['output_size'] - arguments['num_blocks'] = configs['encoder_conf']['num_blocks'] - arguments['cnn_module_kernel'] = configs['encoder_conf'].get('cnn_module_kernel', 1) - arguments['head'] = configs['encoder_conf']['attention_heads'] - arguments['feature_size'] = configs['input_dim'] - arguments['vocab_size'] = configs['output_dim'] - # NOTE(xcsong): if chunk_size == -1, hardcode to 67 - arguments['decoding_window'] = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 if args.chunk_size > 0 else 67 - arguments['encoder'] = configs['encoder'] - 
arguments['decoder'] = configs['decoder'] - arguments['subsampling_rate'] = model.subsampling_rate() - arguments['right_context'] = model.right_context() - arguments['sos_symbol'] = model.sos_symbol() - arguments['eos_symbol'] = model.eos_symbol() - arguments['is_bidirectional_decoder'] = 1 \ - if model.is_bidirectional_decoder() else 0 - - # NOTE(xcsong): Please note that -1/-1 means non-streaming model! It is - # not a [16/4 16/-1 16/0] all-in-one model and it should not be used in - # streaming mode (i.e., setting chunk_size=16 in `decoder_main`). If you - # want to use 16/-1 or any other streaming mode in `decoder_main`, - # please export onnx in the same config. - if arguments['left_chunks'] > 0: - assert arguments['chunk_size'] > 0 # -1/4 not supported - - export_encoder(model, arguments) - export_ctc(model, arguments) - export_decoder(model, arguments) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/export_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/export_onnx_gpu.py deleted file mode 100644 index 19f810c2804efdf74ff369f780fa3102e2e389fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/export_onnx_gpu.py +++ /dev/null @@ -1,1056 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import os -import sys - -import torch -import yaml -import logging - -import torch.nn.functional as F -from wenet.utils.checkpoint import load_checkpoint -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import BaseEncoder -from wenet.utils.init_model import init_model -from wenet.utils.mask import make_pad_mask - -try: - import onnxruntime -except ImportError: - print('Please install onnxruntime-gpu!') - sys.exit(1) - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class Encoder(torch.nn.Module): - def __init__(self, - encoder: BaseEncoder, - ctc: CTC, - beam_size: int = 10): - super().__init__() - self.encoder = encoder - self.ctc = ctc - self.beam_size = beam_size - - def forward(self, speech: torch.Tensor, - speech_lengths: torch.Tensor,): - """Encoder - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - Returns: - encoder_out: B x T x F - encoder_out_lens: B - ctc_log_probs: B x T x V - beam_log_probs: B x T x beam_size - beam_log_probs_idx: B x T x beam_size - """ - encoder_out, encoder_mask = self.encoder(speech, - speech_lengths, - -1, -1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_log_probs = self.ctc.log_softmax(encoder_out) - encoder_out_lens = encoder_out_lens.int() - beam_log_probs, beam_log_probs_idx = torch.topk( - ctc_log_probs, self.beam_size, dim=2) - return encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx - - -class StreamingEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size, transformer=False): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.transformer = transformer - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoder.encoders): - xs, _, new_att_cache, new_cnn_cache = layer( - xs, masks, pos_emb, - att_cache=att_cache[i], - cnn_cache=cnn_cache[i]) - # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (B, hidden-dim, cache_t2) - r_att_cache.append( - new_att_cache[:, :, next_cache_start:, :].unsqueeze(1)) - if not self.transformer: - r_cnn_cache.append(new_cnn_cache.unsqueeze(1)) - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - if not self.transformer: - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingSqueezeformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.reduce_idx = model.encoder.reduce_idx - self.recover_idx = model.encoder.recover_idx - if self.reduce_idx is None: - self.time_reduce = None - else: - if self.recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - 
recover_exp)) - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - elayers, cache_size = att_cache.size(0), att_cache.size(3) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = att_mask[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.encoder.preln(xs) - for i, layer in enumerate(self.encoder.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append( - (xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.encoder.time_reduction_layer( - xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - if self.encoder.pos_enc_layer_type == "rel_pos_repaired": - pos_emb = 
pos_emb[:, :xs.size(1) * 2 - 1, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.encoder.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(1) - cached_att = cached_att.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1)) - r_cnn_cache.append(cached_cnn) - - chunk_out = xs - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingEfficientConformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - - # Efficient Conformer - self.stride_layer_idx = model.encoder.stride_layer_idx - self.stride = model.encoder.stride - self.num_blocks = model.encoder.num_blocks - self.cnn_module_kernel = model.encoder.cnn_module_kernel - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - chunk_lens (torch.Tensor): - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * 
num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) # (b, ) - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - # Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2) - # Shape(cnn_cache): (elayers, b, outsize, cnn_kernel) - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = chunk_mask.to(torch.bool) - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoder.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - # We propose to double the chunk_size. 
- att_cache_trunc = xs.size(1) + \ - att_cache.size(3) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i][:, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [batch, 1, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID - - # use repeat_interleave to new_att_cache - # new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - new_att_cache = new_att_cache.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :].unsqueeze(1)) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - # shape of r_att_cache: (b, elayers, head, time2, outdim) - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - # shape of r_cnn_cache: (b, elayers, outdim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - chunk_out_lens += 1 - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class Decoder(torch.nn.Module): - def __init__(self, - decoder: TransformerDecoder, - ctc_weight: float = 0.5, - reverse_weight: float = 0.0, - beam_size: int = 10, - decoder_fastertransformer: bool = False): - super().__init__() - self.decoder = decoder - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - self.beam_size = beam_size - self.decoder_fastertransformer = decoder_fastertransformer - - def forward(self, - encoder_out: torch.Tensor, - encoder_lens: torch.Tensor, - hyps_pad_sos_eos: torch.Tensor, - hyps_lens_sos: torch.Tensor, - r_hyps_pad_sos_eos: torch.Tensor, - ctc_score: torch.Tensor): - """Encoder - Args: - encoder_out: B x T x F - encoder_lens: B - hyps_pad_sos_eos: B x beam x (T2+1), - hyps with sos & eos and padded by ignore id - hyps_lens_sos: B x beam, length for each hyp with sos - r_hyps_pad_sos_eos: B 
x beam x (T2+1), - reversed hyps with sos & eos and padded by ignore id - ctc_score: B x beam, ctc score for each hyp - Returns: - decoder_out: B x beam x T2 x V - r_decoder_out: B x beam x T2 x V - best_index: B - """ - B, T, F = encoder_out.shape - bz = self.beam_size - B2 = B * bz - encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F) - encoder_mask = ~make_pad_mask(encoder_lens, T).unsqueeze(1) - encoder_mask = encoder_mask.repeat(1, bz, 1).view(B2, 1, T) - T2 = hyps_pad_sos_eos.shape[2] - 1 - hyps_pad = hyps_pad_sos_eos.view(B2, T2 + 1) - hyps_lens = hyps_lens_sos.view(B2,) - hyps_pad_sos = hyps_pad[:, :-1].contiguous() - hyps_pad_eos = hyps_pad[:, 1:].contiguous() - - r_hyps_pad = r_hyps_pad_sos_eos.view(B2, T2 + 1) - r_hyps_pad_sos = r_hyps_pad[:, :-1].contiguous() - r_hyps_pad_eos = r_hyps_pad[:, 1:].contiguous() - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad_sos, hyps_lens, r_hyps_pad_sos, - self.reverse_weight) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - V = decoder_out.shape[-1] - decoder_out = decoder_out.view(B2, T2, V) - mask = ~make_pad_mask(hyps_lens, T2) # B2 x T2 - # mask index, remove ignore id - index = torch.unsqueeze(hyps_pad_eos * mask, 2) - score = decoder_out.gather(2, index).squeeze(2) # B2 X T2 - # mask padded part - score = score * mask - decoder_out = decoder_out.view(B, bz, T2, V) - if self.reverse_weight > 0: - r_decoder_out = torch.nn.functional.log_softmax( - r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.view(B2, T2, V) - index = torch.unsqueeze(r_hyps_pad_eos * mask, 2) - r_score = r_decoder_out.gather(2, index).squeeze(2) - r_score = r_score * mask - score = score * (1 - self.reverse_weight) + \ - self.reverse_weight * r_score - r_decoder_out = r_decoder_out.view(B, bz, T2, V) - score = torch.sum(score, axis=1) # B2 - score = torch.reshape(score, (B, bz)) + self.ctc_weight * ctc_score - best_index = torch.argmax(score, dim=1) - if self.decoder_fastertransformer: - return decoder_out, best_index - else: - return best_index - - -def to_numpy(tensors): - out = [] - if type(tensors) == torch.tensor: - tensors = [tensors] - for tensor in tensors: - if tensor.requires_grad: - tensor = tensor.detach().cpu().numpy() - else: - tensor = tensor.cpu().numpy() - out.append(tensor) - return out - - -def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True): - for a, b in zip(xlist, blist): - try: - torch.testing.assert_allclose(a, b, rtol=rtol, atol=atol) - except AssertionError as error: - if tolerate_small_mismatch: - print(error) - else: - raise - - -def export_offline_encoder(model, configs, args, logger, encoder_onnx_path): - bz = 32 - seq_len = 100 - beam_size = args.beam_size - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint( - low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - dynamic_axes={ - 'speech': {0: 'B', 1: 'T'}, - 'speech_lengths': {0: 'B'}, - 'encoder_out': {0: 'B', 1: 'T_OUT'}, - 'encoder_out_lens': {0: 'B'}, - 'ctc_log_probs': {0: 'B', 1: 'T_OUT'}, - 'beam_log_probs': {0: 'B', 1: 
'T_OUT'}, - 'beam_log_probs_idx': {0: 'B', 1: 'T_OUT'}, - }, - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - -def export_offline_encoder_static(model, configs, args, logger, encoder_onnx_path): - bz = args.batch_size - seq_len = args.seq_len - beam_size = args.beam_size - - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint(low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - import os - file_name, file_ext = os.path.splitext(encoder_onnx_path) - encoder_onnx_path = file_name + "_bs" + str(bz) + "_seq" + str(seq_len) + "_static.onnx" - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CPUExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - - -def export_online_encoder(model, configs, args, logger, encoder_onnx_path): - decoding_chunk_size = args.decoding_chunk_size - subsampling = model.encoder.embed.subsampling_rate - context = model.encoder.embed.right_context + 1 - decoding_window = (decoding_chunk_size - 1) * subsampling + context - batch_size = 32 - audio_len = decoding_window - feature_size = configs["input_dim"] - output_size = configs["encoder_conf"]["output_size"] - num_layers = configs["encoder_conf"]["num_blocks"] - # in transformer the cnn module will not be available - transformer = False - cnn_module_kernel = configs["encoder_conf"].get("cnn_module_kernel", 1) - 1 - if not cnn_module_kernel: - transformer = True - num_decoding_left_chunks = args.num_decoding_left_chunks - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if configs['encoder'] == 'squeezeformer': - encoder = StreamingSqueezeformerEncoder( - model, required_cache_size, args.beam_size) - elif configs['encoder'] == 'efficientConformer': - encoder = StreamingEfficientConformerEncoder( - model, required_cache_size, args.beam_size) - else: - encoder = StreamingEncoder( - model, required_cache_size, args.beam_size, transformer) - encoder.eval() - - # begin to export encoder - chunk_xs = 
torch.randn(batch_size, audio_len, - feature_size, dtype=torch.float32) - chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len - - offset = torch.arange(0, batch_size).unsqueeze(1) - # (elayers, b, head, cache_t1, d_k * 2) - head = configs["encoder_conf"]["attention_heads"] - d_k = configs["encoder_conf"]["output_size"] // head - att_cache = torch.randn(batch_size, num_layers, head, - required_cache_size, d_k * 2, - dtype=torch.float32) - cnn_cache = torch.randn(batch_size, num_layers, output_size, - cnn_module_kernel, dtype=torch.float32) - - cache_mask = torch.ones( - batch_size, 1, required_cache_size, dtype=torch.float32) - input_names = ['chunk_xs', 'chunk_lens', 'offset', - 'att_cache', 'cnn_cache', 'cache_mask'] - output_names = ['log_probs', 'log_probs_idx', 'chunk_out', - 'chunk_out_lens', 'r_offset', 'r_att_cache', - 'r_cnn_cache', 'r_cache_mask'] - input_tensors = (chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - output_names.pop(6) - - all_names = input_names + output_names - dynamic_axes = {} - for name in all_names: - # only the first dimension is dynamic - # all other dimension is fixed - dynamic_axes[name] = {0: 'B'} - - torch.onnx.export(encoder, - input_tensors, - encoder_onnx_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - verbose=False) - - with torch.no_grad(): - torch_outs = encoder(chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - torch_outs = list(torch_outs).pop(6) - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=["CUDAExecutionProvider"]) - ort_inputs = {} - - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - if transformer: - del ort_inputs['cnn_cache'] - ort_outs = ort_session.run(None, ort_inputs) - test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx streaming encoder succeed!") - onnx_config = { - "subsampling_rate": subsampling, - "context": context, - "decoding_chunk_size": decoding_chunk_size, - "num_decoding_left_chunks": num_decoding_left_chunks, - "beam_size": args.beam_size, - "fp16": args.fp16, - "feat_size": feature_size, - "decoding_window": decoding_window, - "cnn_module_kernel_cache": cnn_module_kernel - } - return onnx_config - - -def export_rescoring_decoder(model, configs, args, - logger, decoder_onnx_path, decoder_fastertransformer): - bz, seq_len = 32, 100 - beam_size = args.beam_size - decoder = Decoder(model.decoder, - model.ctc_weight, - model.reverse_weight, - beam_size, - decoder_fastertransformer) - decoder.eval() - - hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - hyps_lens_sos = torch.randint(low=3, high=seq_len, size=(bz, beam_size), - dtype=torch.int32) - r_hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - - output_size = configs["encoder_conf"]["output_size"] - encoder_out = torch.randn(bz, seq_len, output_size, dtype=torch.float32) - encoder_out_lens = torch.randint( - low=3, high=seq_len, size=(bz,), dtype=torch.int32) - ctc_score = torch.randn(bz, beam_size, dtype=torch.float32) - - input_names = ['encoder_out', 'encoder_out_lens', - 'hyps_pad_sos_eos', 'hyps_lens_sos', - 'r_hyps_pad_sos_eos', 'ctc_score'] - output_names = ['best_index'] - if decoder_fastertransformer: - output_names.insert(0, 'decoder_out') - - 
torch.onnx.export(decoder, - (encoder_out, encoder_out_lens, - hyps_pad_sos_eos, hyps_lens_sos, - r_hyps_pad_sos_eos, ctc_score), - decoder_onnx_path, - export_params=True, - opset_version=13, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes={'encoder_out': {0: 'B', 1: 'T'}, - 'encoder_out_lens': {0: 'B'}, - 'hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'hyps_lens_sos': {0: 'B'}, - 'r_hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'ctc_score': {0: 'B'}, - 'best_index': {0: 'B'}, - }, - verbose=False - ) - with torch.no_grad(): - o0 = decoder(encoder_out, - encoder_out_lens, - hyps_pad_sos_eos, - hyps_lens_sos, - r_hyps_pad_sos_eos, - ctc_score) - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(decoder_onnx_path, - providers=providers) - - input_tensors = [encoder_out, encoder_out_lens, hyps_pad_sos_eos, - hyps_lens_sos, r_hyps_pad_sos_eos, ctc_score] - ort_inputs = {} - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - - # if model.reverse weight == 0, - # the r_hyps_pad will be removed - # from the onnx decoder since it doen't play any role - if model.reverse_weight == 0: - del ort_inputs['r_hyps_pad_sos_eos'] - ort_outs = ort_session.run(None, ort_inputs) - - # check decoder output - if decoder_fastertransformer: - test(to_numpy(o0), ort_outs, rtol=1e-03, atol=1e-05) - else: - test(to_numpy([o0]), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx decoder succeed!") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='export x86_gpu model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--cmvn_file', required=False, default='', type=str, - help='global_cmvn file, default path is in config file') - parser.add_argument('--reverse_weight', default=-1.0, type=float, - required=False, - help='reverse weight for bitransformer,' + - 'default value is in config file') - parser.add_argument('--ctc_weight', default=-1.0, type=float, - required=False, - help='ctc weight, default value is in config file') - parser.add_argument('--batch_size', type=int, default=24, help='encoder batch size') - parser.add_argument('--seq_len', default=512, type=int, required=False, - help="Encoder seq_len") - parser.add_argument('--beam_size', default=10, type=int, required=False, - help="beam size would be ctc output size") - parser.add_argument('--output_onnx_dir', - default="onnx_model", - help='output onnx encoder and decoder directory') - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - # arguments for streaming encoder - parser.add_argument('--streaming', - action='store_true', - help="whether to export streaming encoder, default false") - parser.add_argument('--decoding_chunk_size', - default=16, - type=int, - required=False, - help='the decoding chunk size, <=0 is not supported') - parser.add_argument('--num_decoding_left_chunks', - default=5, - type=int, - required=False, - help="number of left chunks, <= 0 is not supported") - parser.add_argument('--decoder_fastertransformer', - action='store_true', - help='return decoder_out and best_index for ft') - args = parser.parse_args() - - torch.manual_seed(0) - torch.set_printoptions(precision=10) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if 
args.cmvn_file and os.path.exists(args.cmvn_file): - configs['cmvn_file'] = args.cmvn_file - if args.reverse_weight != -1.0 and 'reverse_weight' in configs['model_conf']: - configs['model_conf']['reverse_weight'] = args.reverse_weight - print("Update reverse weight to", args.reverse_weight) - if args.ctc_weight != -1: - print("Update ctc weight to ", args.ctc_weight) - configs['model_conf']['ctc_weight'] = args.ctc_weight - configs["encoder_conf"]["use_dynamic_chunk"] = False - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - - if not os.path.exists(args.output_onnx_dir): - os.mkdir(args.output_onnx_dir) - encoder_onnx_path = os.path.join(args.output_onnx_dir, 'encoder.onnx') - export_enc_func = None - if args.streaming: - assert args.decoding_chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - export_enc_func = export_online_encoder - else: - export_enc_func = export_offline_encoder_static - - onnx_config = export_enc_func( - model, configs, args, logger, encoder_onnx_path) - - decoder_onnx_path = os.path.join(args.output_onnx_dir, 'decoder.onnx') - export_rescoring_decoder(model, configs, args, logger, - decoder_onnx_path, args.decoder_fastertransformer) - - if args.fp16: - try: - import onnxmltools - from onnxmltools.utils.float16_converter import convert_float_to_float16 - except ImportError: - print('Please install onnxmltools!') - sys.exit(1) - encoder_onnx_model = onnxmltools.utils.load_model(encoder_onnx_path) - encoder_onnx_model = convert_float_to_float16(encoder_onnx_model) - encoder_onnx_path = os.path.join( - args.output_onnx_dir, 'encoder_fp16.onnx') - onnxmltools.utils.save_model(encoder_onnx_model, encoder_onnx_path) - decoder_onnx_model = onnxmltools.utils.load_model(decoder_onnx_path) - decoder_onnx_model = convert_float_to_float16(decoder_onnx_model) - decoder_onnx_path = os.path.join( - args.output_onnx_dir, 'decoder_fp16.onnx') - onnxmltools.utils.save_model(decoder_onnx_model, decoder_onnx_path) - # dump configurations - - config_dir = os.path.join(args.output_onnx_dir, "config.yaml") - with open(config_dir, "w") as out: - yaml.dump(onnx_config, out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/recognize.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/recognize.py deleted file mode 100644 index 03b5dfd42cc098efacd20e08756a5300f6477cc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/recognize.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--beam_size', - type=int, - default=10, - help='beam size for search') - parser.add_argument('--penalty', - type=float, - default=0.0, - help='length penalty') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=16, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'attention', 'ctc_greedy_search', - 'ctc_prefix_beam_search', 'attention_rescoring', - 'rnnt_greedy_search', 'rnnt_beam_search', - 'rnnt_beam_attn_rescoring', 'ctc_beam_td_attn_rescoring', - 'hlg_onebest', 'hlg_rescore' - ], - default='attention', - help='decoding mode') - - parser.add_argument('--search_ctc_weight', - type=float, - default=1.0, - help='ctc weight for nbest generation') - parser.add_argument('--search_transducer_weight', - type=float, - default=0.0, - help='transducer weight for nbest generation') - parser.add_argument('--ctc_weight', - type=float, - default=0.0, - help='ctc weight for rescoring weight in \ - attention rescoring decode mode \ - ctc weight for rescoring weight in \ - transducer attention rescore decode mode') - - parser.add_argument('--transducer_weight', - type=float, - default=0.0, - help='transducer weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--attn_weight', - type=float, - default=0.0, - help='attention weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--decoding_chunk_size', - type=int, - default=-1, - help='''decoding chunk size, - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here''') - parser.add_argument('--num_decoding_left_chunks', - type=int, - default=-1, - help='number of left chunks for decoding') - parser.add_argument('--simulate_streaming', - action='store_true', - help='simulate streaming inference') - parser.add_argument('--reverse_weight', - type=float, - default=0.0, - help='''right to left weight for attention rescoring - decode mode''') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--connect_symbol', - default='', - type=str, - help='used to connect the output characters') - - parser.add_argument('--word', - default='', - type=str, - help='word file, only used for hlg decode') - parser.add_argument('--hlg', - default='', - type=str, - help='hlg file, only used for hlg decode') - parser.add_argument('--lm_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--r_decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' - ] and args.batch_size > 1: - logging.fatal( - 'decoding mode {} must be running with batch_size == 1'.format( - args.mode)) - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_sub'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - if 'fbank_conf' in test_conf: - test_conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in test_conf: - test_conf['mfcc_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - # Load dict - char_dict = {v: k for k, v in symbol_table.items()} - eos = len(char_dict) - 1 - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w') as fout: - for batch_idx, 
batch in enumerate(test_data_loader): - keys, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - if args.mode == 'attention': - hyps, _ = model.recognize( - feats, - feats_lengths, - beam_size=args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp.tolist() for hyp in hyps] - elif args.mode == 'ctc_greedy_search': - hyps, _ = model.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_greedy_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_beam_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.beam_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.search_ctc_weight, - transducer_weight=args.search_transducer_weight) - elif args.mode == 'rnnt_beam_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight) - elif args.mode == 'ctc_beam_td_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight, - beam_search_type='ctc') - # ctc_prefix_beam_search and attention_rescoring only return one - # result in List[int], change it to List[List[int]] for compatible - # with other batch decoding mode - elif args.mode == 'ctc_prefix_beam_search': - assert (feats.size(0) == 1) - hyp, _ = model.ctc_prefix_beam_search( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp] - elif args.mode == 'attention_rescoring': - assert (feats.size(0) == 1) - hyp, _ = model.attention_rescoring( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - 
ctc_weight=args.ctc_weight, - simulate_streaming=args.simulate_streaming, - reverse_weight=args.reverse_weight) - hyps = [hyp] - elif args.mode == 'hlg_onebest': - hyps = model.hlg_onebest( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - elif args.mode == 'hlg_rescore': - hyps = model.hlg_rescore( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - lm_scale=args.lm_scale, - decoder_scale=args.decoder_scale, - r_decoder_scale=args.r_decoder_scale, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - for i, key in enumerate(keys): - content = [] - for w in hyps[i]: - if w == eos: - break - content.append(char_dict[w]) - logging.info('{} {}'.format(key, args.connect_symbol.join(content))) - fout.write('{} {}\n'.format(key, args.connect_symbol.join(content))) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/recognize_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/recognize_onnx_gpu.py deleted file mode 100644 index 42f403bf55ac0bc51d9c754574d3479345948122..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/recognize_onnx_gpu.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is for testing exported onnx encoder and decoder from -export_onnx_gpu.py. The exported onnx models only support batch offline ASR inference. -It requires a python wrapped c++ ctc decoder. 
-Please install it by following: -https://github.com/Slyne/ctc_decoder.git -""" -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.common import IGNORE_ID -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.config import override_config - -import onnxruntime as rt -import multiprocessing -import numpy as np - -try: - from swig_decoders import map_batch, \ - ctc_beam_search_decoder_batch, \ - TrieVector, PathTrie -except ImportError: - print('Please install ctc decoders first by refering to\n' + - 'https://github.com/Slyne/ctc_decoder.git') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--encoder_onnx', required=True, help='encoder onnx file') - parser.add_argument('--decoder_onnx', required=True, help='decoder onnx file') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=32, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'ctc_greedy_search', 'ctc_prefix_beam_search', - 'attention_rescoring'], - default='attention_rescoring', - help='decoding mode') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - reverse_weight = configs["model_conf"].get("reverse_weight", 0.0) - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - test_conf['fbank_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - 
- # Init asr model from configs - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - if use_cuda: - EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - else: - EP_list = ['CPUExecutionProvider'] - - encoder_ort_session = rt.InferenceSession(args.encoder_onnx, providers=EP_list) - decoder_ort_session = None - if args.mode == "attention_rescoring": - decoder_ort_session = rt.InferenceSession(args.decoder_onnx, providers=EP_list) - - # Load dict - vocabulary = [] - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - vocabulary.append(arr[0]) - eos = sos = len(char_dict) - 1 - with torch.no_grad(), open(args.result_file, 'w') as fout: - for _, batch in enumerate(test_data_loader): - keys, feats, _, feats_lengths, _ = batch - feats, feats_lengths = feats.numpy(), feats_lengths.numpy() - if args.fp16: - feats = feats.astype(np.float16) - ort_inputs = { - encoder_ort_session.get_inputs()[0].name: feats, - encoder_ort_session.get_inputs()[1].name: feats_lengths} - ort_outs = encoder_ort_session.run(None, ort_inputs) - encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx = ort_outs - beam_size = beam_log_probs.shape[-1] - batch_size = beam_log_probs.shape[0] - num_processes = min(multiprocessing.cpu_count(), batch_size) - if args.mode == 'ctc_greedy_search': - if beam_size != 1: - log_probs_idx = beam_log_probs_idx[:, :, 0] - batch_sents = [] - for idx, seq in enumerate(log_probs_idx): - batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) - hyps = map_batch(batch_sents, vocabulary, num_processes, - True, 0) - elif args.mode in ('ctc_prefix_beam_search', "attention_rescoring"): - batch_log_probs_seq_list = beam_log_probs.tolist() - batch_log_probs_idx_list = beam_log_probs_idx.tolist() - batch_len_list = encoder_out_lens.tolist() - batch_log_probs_seq = [] - batch_log_probs_ids = [] - batch_start = [] # only effective in streaming deployment - batch_root = TrieVector() - root_dict = {} - for i in range(len(batch_len_list)): - num_sent = batch_len_list[i] - batch_log_probs_seq.append( - batch_log_probs_seq_list[i][0:num_sent]) - batch_log_probs_ids.append( - batch_log_probs_idx_list[i][0:num_sent]) - root_dict[i] = PathTrie() - batch_root.append(root_dict[i]) - batch_start.append(True) - score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq, - batch_log_probs_ids, - batch_root, - batch_start, - beam_size, - num_processes, - 0, -2, 0.99999) - if args.mode == 'ctc_prefix_beam_search': - hyps = [] - for cand_hyps in score_hyps: - hyps.append(cand_hyps[0][1]) - hyps = map_batch(hyps, vocabulary, num_processes, False, 0) - if args.mode == 'attention_rescoring': - ctc_score, all_hyps = [], [] - max_len = 0 - for hyps in score_hyps: - cur_len = len(hyps) - if len(hyps) < beam_size: - hyps += (beam_size - cur_len) * [(-float("INF"), (0,))] - cur_ctc_score = [] - for hyp in hyps: - cur_ctc_score.append(hyp[0]) - all_hyps.append(list(hyp[1])) - if len(hyp[1]) > max_len: - max_len = len(hyp[1]) - ctc_score.append(cur_ctc_score) - if args.fp16: - ctc_score = np.array(ctc_score, dtype=np.float16) - else: - ctc_score = np.array(ctc_score, dtype=np.float32) - hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - r_hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32) - k = 0 - for i in 
range(batch_size): - for j in range(beam_size): - cand = all_hyps[k] - l = len(cand) + 2 - hyps_pad_sos_eos[i][j][0:l] = [sos] + cand + [eos] - r_hyps_pad_sos_eos[i][j][0:l] = [sos] + cand[::-1] + [eos] - hyps_lens_sos[i][j] = len(cand) + 1 - k += 1 - decoder_ort_inputs = { - decoder_ort_session.get_inputs()[0].name: encoder_out, - decoder_ort_session.get_inputs()[1].name: encoder_out_lens, - decoder_ort_session.get_inputs()[2].name: hyps_pad_sos_eos, - decoder_ort_session.get_inputs()[3].name: hyps_lens_sos, - decoder_ort_session.get_inputs()[-1].name: ctc_score} - if reverse_weight > 0: - r_hyps_pad_sos_eos_name = decoder_ort_session.get_inputs()[4].name - decoder_ort_inputs[r_hyps_pad_sos_eos_name] = r_hyps_pad_sos_eos - best_index = decoder_ort_session.run(None, decoder_ort_inputs)[0] - best_sents = [] - k = 0 - for idx in best_index: - cur_best_sent = all_hyps[k: k + beam_size][idx] - best_sents.append(cur_best_sent) - k += beam_size - hyps = map_batch(best_sents, vocabulary, num_processes) - - for i, key in enumerate(keys): - content = hyps[i] - logging.info('{} {}'.format(key, content)) - fout.write('{} {}\n'.format(key, content)) - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/train.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/train.py deleted file mode 100644 index 70799b60790b31d73911770891f519f5473e2f4b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/bin/train.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os - -import torch -import torch.distributed as dist -import torch.optim as optim -import yaml -from tensorboardX import SummaryWriter -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import (load_checkpoint, save_checkpoint, - load_trained_modules) -from wenet.utils.executor import Executor -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.scheduler import WarmupLR, NoamHoldAnnealing -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--train_data', required=True, help='train data file') - parser.add_argument('--cv_data', required=True, help='cv data file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--model_dir', required=True, help='save model dir') - parser.add_argument('--checkpoint', help='checkpoint model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='tensorboard log dir') - parser.add_argument('--ddp.rank', - dest='rank', - default=0, - type=int, - help='global rank for distributed training') - parser.add_argument('--ddp.world_size', - dest='world_size', - default=-1, - type=int, - help='''number of total processes/gpus for - distributed training''') - parser.add_argument('--ddp.dist_backend', - dest='dist_backend', - default='nccl', - choices=['nccl', 'gloo'], - help='distributed backend') - parser.add_argument('--ddp.init_method', - dest='init_method', - default=None, - help='ddp init method') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for reading') - parser.add_argument('--pin_memory', - action='store_true', - default=False, - help='Use pinned memory buffers used for reading') - parser.add_argument('--use_amp', - action='store_true', - default=False, - help='Use automatic mixed precision training') - parser.add_argument('--fp16_grad_sync', - action='store_true', - default=False, - help='Use fp16 gradient sync for ddp') - parser.add_argument('--cmvn', default=None, help='global cmvn file') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. 
One symbol per line.") - parser.add_argument('--prefetch', - default=100, - type=int, - help='prefetch number') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument("--enc_init", - default=None, - type=str, - help="Pre-trained model to initialize encoder") - parser.add_argument("--enc_init_mods", - default="encoder.", - type=lambda s: [str(mod) for mod in s.split(",") if s != ""], - help="List of encoder modules \ - to initialize ,separated by a comma") - - - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Set random seed - torch.manual_seed(777) - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - distributed = args.world_size > 1 - if distributed: - logging.info('training on multiple gpus, this gpu {}'.format(args.gpu)) - dist.init_process_group(args.dist_backend, - init_method=args.init_method, - world_size=args.world_size, - rank=args.rank) - - symbol_table = read_symbol_table(args.symbol_table) - - train_conf = configs['dataset_conf'] - cv_conf = copy.deepcopy(train_conf) - cv_conf['speed_perturb'] = False - cv_conf['spec_aug'] = False - cv_conf['spec_sub'] = False - cv_conf['spec_trim'] = False - cv_conf['shuffle'] = False - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - train_dataset = Dataset(args.data_type, args.train_data, symbol_table, - train_conf, args.bpe_model, non_lang_syms, True) - cv_dataset = Dataset(args.data_type, - args.cv_data, - symbol_table, - cv_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - train_data_loader = DataLoader(train_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - cv_data_loader = DataLoader(cv_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - - if 'fbank_conf' in configs['dataset_conf']: - input_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - else: - input_dim = configs['dataset_conf']['mfcc_conf']['num_mel_bins'] - vocab_size = len(symbol_table) - - # Save configs to model_dir/train.yaml for inference and export - configs['input_dim'] = input_dim - configs['output_dim'] = vocab_size - configs['cmvn_file'] = args.cmvn - configs['is_json_cmvn'] = True - if args.rank == 0: - saved_config_path = os.path.join(args.model_dir, 'train.yaml') - with open(saved_config_path, 'w') as fout: - data = yaml.dump(configs) - fout.write(data) - - # Init asr model from configs - model = init_model(configs) - print(model) - num_params = sum(p.numel() for p in model.parameters()) - print('the number of model params: {:,d}'.format(num_params)) - - # !!!IMPORTANT!!! 
- # Try to export the model by script, if fails, we should refine - # the code to satisfy the script export requirements - if args.rank == 0: - script_model = torch.jit.script(model) - script_model.save(os.path.join(args.model_dir, 'init.zip')) - executor = Executor() - # If specify checkpoint, load some info from checkpoint - if args.checkpoint is not None: - infos = load_checkpoint(model, args.checkpoint) - elif args.enc_init is not None: - logging.info('load pretrained encoders: {}'.format(args.enc_init)) - infos = load_trained_modules(model, args) - else: - infos = {} - start_epoch = infos.get('epoch', -1) + 1 - cv_loss = infos.get('cv_loss', 0.0) - step = infos.get('step', -1) - - num_epochs = configs.get('max_epoch', 100) - model_dir = args.model_dir - writer = None - if args.rank == 0: - os.makedirs(model_dir, exist_ok=True) - exp_id = os.path.basename(model_dir) - writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id)) - - if distributed: - assert (torch.cuda.is_available()) - # cuda model is required for nn.parallel.DistributedDataParallel - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, find_unused_parameters=True) - device = torch.device("cuda") - if args.fp16_grad_sync: - from torch.distributed.algorithms.ddp_comm_hooks import ( - default as comm_hooks, - ) - model.register_comm_hook( - state=None, hook=comm_hooks.fp16_compress_hook - ) - else: - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - if configs['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['optim_conf']) - elif configs['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['optim']) - if configs['scheduler'] == 'warmuplr': - scheduler = WarmupLR(optimizer, **configs['scheduler_conf']) - elif configs['scheduler'] == 'NoamHoldAnnealing': - scheduler = NoamHoldAnnealing(optimizer, **configs['scheduler_conf']) - else: - raise ValueError("unknown scheduler: " + configs['scheduler']) - - final_epoch = None - configs['rank'] = args.rank - configs['is_distributed'] = distributed - configs['use_amp'] = args.use_amp - if start_epoch == 0 and args.rank == 0: - save_model_path = os.path.join(model_dir, 'init.pt') - save_checkpoint(model, save_model_path) - - # Start training loop - executor.step = step - scheduler.set_step(step) - # used for pytorch amp mixed precision training - scaler = None - if args.use_amp: - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(start_epoch, num_epochs): - train_dataset.set_epoch(epoch) - configs['epoch'] = epoch - lr = optimizer.param_groups[0]['lr'] - logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr)) - executor.train(model, optimizer, scheduler, train_data_loader, device, - writer, configs, scaler) - total_loss, num_seen_utts = executor.cv(model, cv_data_loader, device, - configs) - cv_loss = total_loss / num_seen_utts - - logging.info('Epoch {} CV info cv_loss {}'.format(epoch, cv_loss)) - if args.rank == 0: - save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch)) - save_checkpoint( - model, save_model_path, { - 'epoch': epoch, - 'lr': lr, - 'cv_loss': cv_loss, - 'step': executor.step - }) - writer.add_scalar('epoch/cv_loss', cv_loss, epoch) - writer.add_scalar('epoch/lr', lr, epoch) - final_epoch = epoch - - if final_epoch is not None and args.rank == 0: - final_model_path = os.path.join(model_dir, 'final.pt') 
- os.remove(final_model_path) if os.path.exists(final_model_path) else None - os.symlink('{}.pt'.format(final_epoch), final_model_path) - writer.close() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/dataset/dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/dataset/dataset.py deleted file mode 100644 index 6d799b5b5aea2d34546484b3fed5d45e2d5b6aa6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/dataset/dataset.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import torch -import torch.distributed as dist -from torch.utils.data import IterableDataset - -import wenet.dataset.processor as processor -from wenet.utils.file_utils import read_lists - - -class Processor(IterableDataset): - def __init__(self, source, f, *args, **kw): - assert callable(f) - self.source = source - self.f = f - self.args = args - self.kw = kw - - def set_epoch(self, epoch): - self.source.set_epoch(epoch) - - def __iter__(self): - """ Return an iterator over the source dataset processed by the - given processor. 
- """ - assert self.source is not None - assert callable(self.f) - return self.f(iter(self.source), *self.args, **self.kw) - - def apply(self, f): - assert callable(f) - return Processor(self, f, *self.args, **self.kw) - - -class DistributedSampler: - def __init__(self, shuffle=True, partition=True): - self.epoch = -1 - self.update() - self.shuffle = shuffle - self.partition = partition - - def update(self): - assert dist.is_available() - if dist.is_initialized(): - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() - else: - self.rank = 0 - self.world_size = 1 - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: - self.worker_id = 0 - self.num_workers = 1 - else: - self.worker_id = worker_info.id - self.num_workers = worker_info.num_workers - return dict(rank=self.rank, - world_size=self.world_size, - worker_id=self.worker_id, - num_workers=self.num_workers) - - def set_epoch(self, epoch): - self.epoch = epoch - - def sample(self, data): - """ Sample data according to rank/world_size/num_workers - - Args: - data(List): input data list - - Returns: - List: data list after sample - """ - data = list(range(len(data))) - # TODO(Binbin Zhang): fix this - # We can not handle uneven data for CV on DDP, so we don't - # sample data by rank, that means every GPU gets the same - # and all the CV data - if self.partition: - if self.shuffle: - random.Random(self.epoch).shuffle(data) - data = data[self.rank::self.world_size] - data = data[self.worker_id::self.num_workers] - return data - - -class DataList(IterableDataset): - def __init__(self, lists, shuffle=True, partition=True): - self.lists = lists - self.sampler = DistributedSampler(shuffle, partition) - - def set_epoch(self, epoch): - self.sampler.set_epoch(epoch) - - def __iter__(self): - sampler_info = self.sampler.update() - indexes = self.sampler.sample(self.lists) - for index in indexes: - # yield dict(src=src) - data = dict(src=self.lists[index]) - data.update(sampler_info) - yield data - - -def Dataset(data_type, - data_list_file, - symbol_table, - conf, - bpe_model=None, - non_lang_syms=None, - partition=True): - """ Construct dataset from arguments - - We have two shuffle stage in the Dataset. The first is global - shuffle at shards tar/raw file level. The second is global shuffle - at training samples level. 
- - Args: - data_type(str): raw/shard - bpe_model(str): model for english bpe part - partition(bool): whether to do data partition in terms of rank - """ - assert data_type in ['raw', 'shard'] - lists = read_lists(data_list_file) - shuffle = conf.get('shuffle', True) - dataset = DataList(lists, shuffle=shuffle, partition=partition) - if data_type == 'shard': - dataset = Processor(dataset, processor.url_opener) - dataset = Processor(dataset, processor.tar_file_and_group) - else: - dataset = Processor(dataset, processor.parse_raw) - - dataset = Processor(dataset, processor.tokenize, symbol_table, bpe_model, - non_lang_syms, conf.get('split_with_space', False)) - filter_conf = conf.get('filter_conf', {}) - dataset = Processor(dataset, processor.filter, **filter_conf) - - resample_conf = conf.get('resample_conf', {}) - dataset = Processor(dataset, processor.resample, **resample_conf) - - speed_perturb = conf.get('speed_perturb', False) - if speed_perturb: - dataset = Processor(dataset, processor.speed_perturb) - - feats_type = conf.get('feats_type', 'fbank') - assert feats_type in ['fbank', 'mfcc'] - if feats_type == 'fbank': - fbank_conf = conf.get('fbank_conf', {}) - dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) - elif feats_type == 'mfcc': - mfcc_conf = conf.get('mfcc_conf', {}) - dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf) - - spec_aug = conf.get('spec_aug', True) - spec_sub = conf.get('spec_sub', False) - spec_trim = conf.get('spec_trim', False) - if spec_aug: - spec_aug_conf = conf.get('spec_aug_conf', {}) - dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) - if spec_sub: - spec_sub_conf = conf.get('spec_sub_conf', {}) - dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf) - if spec_trim: - spec_trim_conf = conf.get('spec_trim_conf', {}) - dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf) - - if shuffle: - shuffle_conf = conf.get('shuffle_conf', {}) - dataset = Processor(dataset, processor.shuffle, **shuffle_conf) - - sort = conf.get('sort', True) - if sort: - sort_conf = conf.get('sort_conf', {}) - dataset = Processor(dataset, processor.sort, **sort_conf) - - batch_conf = conf.get('batch_conf', {}) - dataset = Processor(dataset, processor.batch, **batch_conf) - dataset = Processor(dataset, processor.padding) - return dataset diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/dataset/kaldi_io.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/dataset/kaldi_io.py deleted file mode 100644 index c9bef293c93d882147bb5b738e1fc49a7a19a484..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/dataset/kaldi_io.py +++ /dev/null @@ -1,666 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! 
To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. - """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. 
- """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int_scp(file_or_fd): - """ generator(key,vec) = read_vec_int_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_int_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:vec for key,mat in kaldi_io.read_vec_int_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_int(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! 
- if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Mapping for percentiles in col-headers, - def uint16_to_float(value, min, range): - return np.float32(min + range * 1.52590218966964e-05 * value) - - # Mapping for matrix elements, - def uint8_to_float_v2(vec, p0, p25, p75, p100): - # Split the vector by masks, - mask_0_64 = (vec <= 64); - mask_193_255 = (vec > 192); - mask_65_192 = (~(mask_0_64 | mask_193_255)); - # Sanity check (useful but slow...), - # assert(len(vec) == np.sum(np.hstack([mask_0_64,mask_65_192,mask_193_255]))) - # assert(len(vec) == np.sum(np.any([mask_0_64,mask_65_192,mask_193_255], axis=0))) - # Build the float vector, - ans = np.empty(len(vec), dtype='float32') - ans[mask_0_64] = p0 + (p25 - p0) / 64. * vec[mask_0_64] - ans[mask_65_192] = p25 + (p75 - p25) / 128. * (vec[mask_65_192] - 64) - ans[mask_193_255] = p75 + (p100 - p75) / 63. * (vec[mask_193_255] - 192) - return ans - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.empty((cols,rows), dtype='float32') - for i, col_header in enumerate(col_headers): - col_header_flt = [ uint16_to_float(percentile, globmin, globrange) for percentile in col_header ] - mat[i] = uint8_to_float_v2(data[i], *col_header_flt) - - return mat.T # transpose! col-major -> row-major, - -def write_ark_scp(key, mat, ark_fout, scp_out): - mat_offset = write_mat(ark_fout, mat, key) - scp_line = '{}\t{}:{}'.format(key, ark_fout.name, mat_offset) - scp_out.write(scp_line) - scp_out.write('\n') - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. 
-
-   Example of writing single matrix:
-   kaldi_io.write_mat(filename, mat)
-
-   Example of writing arkfile:
-   with open(ark_file,'w') as f:
-     for key,mat in dict.iteritems():
-       kaldi_io.write_mat(f, mat, key=key)
-  """
-  mat_offset = 0
-  fd = open_or_fd(file_or_fd, mode='wb')
-  if sys.version_info[0] == 3: assert(fd.mode == 'wb')
-  try:
-    if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id),
-    mat_offset = fd.tell()
-    fd.write('\0B'.encode()) # we write binary!
-    # Data-type,
-    if m.dtype == 'float32': fd.write('FM '.encode())
-    elif m.dtype == 'float64': fd.write('DM '.encode())
-    else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype)
-    # Dims,
-    fd.write('\04'.encode())
-    fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows
-    fd.write('\04'.encode())
-    fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols
-    # Data,
-    fd.write(m.tobytes())
-  finally:
-    if fd is not file_or_fd : fd.close()
-  return mat_offset
-
-#################################################
-# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...)
-# Corresponds to: vector<vector<tuple<int,float> > >
-# - outer vector: time axis
-# - inner vector: records at the time
-# - tuple: int = index, float = value
-#
-
-def read_cnet_ark(file_or_fd):
-  """ Alias of function 'read_post_ark()', 'cnet' = confusion network """
-  return read_post_ark(file_or_fd)
-
-def read_post_ark(file_or_fd):
-  """ generator(key,vec<vec<int,float>>) = read_post_ark(file)
-   Returns generator of (key,posterior) tuples, read from ark file.
-   file_or_fd : ark, gzipped ark, pipe or opened file descriptor.
-
-   Iterate the ark:
-   for key,post in kaldi_io.read_post_ark(file):
-     ...
-
-   Read ark to a 'dictionary':
-   d = { key:post for key,post in kaldi_io.read_post_ark(file) }
-  """
-  fd = open_or_fd(file_or_fd)
-  try:
-    key = read_key(fd)
-    while key:
-      post = read_post(fd)
-      yield key, post
-      key = read_key(fd)
-  finally:
-    if fd is not file_or_fd: fd.close()
-
-def read_post(file_or_fd):
-  """ [post] = read_post(file_or_fd)
-   Reads single kaldi 'Posterior' in binary format.
-
-   The 'Posterior' is C++ type 'vector<vector<tuple<int,float> > >',
-   the outer-vector is usually time axis, inner-vector are the records
-   at given time, and the tuple is composed of an 'index' (integer)
-   and a 'float-value'. The 'float-value' can represent a probability
-   or any other numeric value.
-
-   Returns vector of vectors of tuples.
-  """
-  fd = open_or_fd(file_or_fd)
-  ans=[]
-  binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag
-  assert(fd.read(1).decode() == '\4'); # int-size
-  outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins)
-
-  # Loop over 'outer-vector',
-  for i in range(outer_vec_size):
-    assert(fd.read(1).decode() == '\4'); # int-size
-    inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin)
-    data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size)
-    assert(data[0]['size_idx'] == 4)
-    assert(data[0]['size_post'] == 4)
-    ans.append(data[['idx','post']].tolist())
-
-  if fd is not file_or_fd: fd.close()
-  return ans
-
-
-#################################################
-# Kaldi Confusion Network bin begin/end times,
-# (kaldi stores CNs time info separately from the Posterior).
-#
-
-def read_cntime_ark(file_or_fd):
-  """ generator(key,vec<tuple<float,float>>) = read_cntime_ark(file_or_fd)
-   Returns generator of (key,cntime) tuples, read from ark file.
-   file_or_fd : file, gzipped file, pipe or opened file descriptor.
-
-   Iterate the ark:
-   for key,time in kaldi_io.read_cntime_ark(file):
-     ...
-
-   Read ark to a 'dictionary':
-   d = { key:time for key,time in kaldi_io.read_post_ark(file) }
-  """
-  fd = open_or_fd(file_or_fd)
-  try:
-    key = read_key(fd)
-    while key:
-      cntime = read_cntime(fd)
-      yield key, cntime
-      key = read_key(fd)
-  finally:
-    if fd is not file_or_fd : fd.close()
-
-def read_cntime(file_or_fd):
-  """ [cntime] = read_cntime(file_or_fd)
-   Reads single kaldi 'Confusion Network time info', in binary format:
-   C++ type: vector<tuple<float,float> >.
-   (begin/end times of bins at the confusion network).
-
-   Binary layout is ' ...'
-
-   file_or_fd : file, gzipped file, pipe or opened file descriptor.
-
-   Returns vector of tuples.
-  """
-  fd = open_or_fd(file_or_fd)
-  binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary
-
-  assert(fd.read(1).decode() == '\4'); # int-size
-  vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins)
-
-  data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size)
-  assert(data[0]['size_beg'] == 4)
-  assert(data[0]['size_end'] == 4)
-  ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end),
-
-  if fd is not file_or_fd : fd.close()
-  return ans
-
-
-#################################################
-# Segments related,
-#
-
-# Segments as 'Bool vectors' can be handy,
-# - for 'superposing' the segmentations,
-# - for frame-selection in Speaker-ID experiments,
-def read_segments_as_bool_vec(segments_file):
-  """ [ bool_vec ] = read_segments_as_bool_vec(segments_file)
-   using kaldi 'segments' file for 1 wav, format : ' '
-   - t-beg, t-end is in seconds,
-   - assumed 100 frames/second,
-  """
-  segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1)
-  # Sanity checks,
-  assert(len(segs) > 0) # empty segmentation is an error,
-  assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file,
-  # Convert time to frame-indexes,
-  start = np.rint([100 * rec[2] for rec in segs]).astype(int)
-  end = np.rint([100 * rec[3] for rec in segs]).astype(int)
-  # Taken from 'read_lab_to_bool_vec', htk.py,
-  frms = np.repeat(np.r_[np.tile([False,True], len(end)), False],
-                   np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0])
-  assert np.sum(end-start) == np.sum(frms)
-  return frms
-
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/dataset/processor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/dataset/processor.py
deleted file mode 100644
index b4bd07ce674eb3288cd1b13a09085eec48d40845..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/dataset/processor.py
+++ /dev/null
@@ -1,660 +0,0 @@
-# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import json -import random -import re -import tarfile -from subprocess import PIPE, Popen -from urllib.parse import urlparse - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.nn.utils.rnn import pad_sequence - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def url_opener(data): - """ Give url or local file, return file descriptor - Inplace operation. - - Args: - data(Iterable[str]): url or local file list - - Returns: - Iterable[{src, stream}] - """ - for sample in data: - assert 'src' in sample - # TODO(Binbin Zhang): support HTTP - url = sample['src'] - try: - pr = urlparse(url) - # local file - if pr.scheme == '' or pr.scheme == 'file': - stream = open(url, 'rb') - # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP - else: - cmd = f'wget -q -O - {url}' - process = Popen(cmd, shell=True, stdout=PIPE) - sample.update(process=process) - stream = process.stdout - sample.update(stream=stream) - yield sample - except Exception as ex: - logging.warning('Failed to open {}'.format(url)) - - -def tar_file_and_group(data): - """ Expand a stream of open tar files into a stream of tar file contents. - And groups the file with same prefix - - Args: - data: Iterable[{src, stream}] - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'stream' in sample - stream = tarfile.open(fileobj=sample['stream'], mode="r|*") - prev_prefix = None - example = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - example['key'] = prev_prefix - if valid: - yield example - example = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - example['txt'] = file_obj.read().decode('utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load(file_obj) - example['wav'] = waveform - example['sample_rate'] = sample_rate - else: - example[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning('error to parse {}'.format(name)) - prev_prefix = prefix - if prev_prefix is not None: - example['key'] = prev_prefix - yield example - stream.close() - if 'process' in sample: - sample['process'].communicate() - sample['stream'].close() - - -def parse_raw(data): - """ Parse key/wav/txt from json line - - Args: - data: Iterable[str], str is a json line has key/wav/txt - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'src' in sample - json_line = sample['src'] - obj = json.loads(json_line) - assert 'key' in obj - assert 'wav' in obj - assert 'txt' in obj - key = obj['key'] - wav_file = obj['wav'] - txt = obj['txt'] - try: - if 'start' in obj: - assert 'end' in obj - sample_rate = torchaudio.backend.sox_io_backend.info( - wav_file).sample_rate - start_frame = int(obj['start'] * sample_rate) - end_frame = int(obj['end'] * sample_rate) - waveform, _ = torchaudio.backend.sox_io_backend.load( - 
filepath=wav_file, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(wav_file) - example = dict(key=key, - txt=txt, - wav=waveform, - sample_rate=sample_rate) - yield example - except Exception as ex: - logging.warning('Failed to read {}'.format(wav_file)) - - -def filter(data, - max_length=10240, - min_length=10, - token_max_length=200, - token_min_length=1, - min_output_input_ratio=0.0005, - max_output_input_ratio=1): - """ Filter sample according to feature and label length - Inplace operation. - - Args:: - data: Iterable[{key, wav, label, sample_rate}] - max_length: drop utterance which is greater than max_length(10ms) - min_length: drop utterance which is less than min_length(10ms) - token_max_length: drop utterance which is greater than - token_max_length, especially when use char unit for - english modeling - token_min_length: drop utterance which is - less than token_max_length - min_output_input_ratio: minimal ration of - token_length / feats_length(10ms) - max_output_input_ratio: maximum ration of - token_length / feats_length(10ms) - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'label' in sample - # sample['wav'] is torch.Tensor, we have 100 frames every second - num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100 - if num_frames < min_length: - continue - if num_frames > max_length: - continue - if len(sample['label']) < token_min_length: - continue - if len(sample['label']) > token_max_length: - continue - if num_frames != 0: - if len(sample['label']) / num_frames < min_output_input_ratio: - continue - if len(sample['label']) / num_frames > max_output_input_ratio: - continue - yield sample - - -def resample(data, resample_rate=16000): - """ Resample data. - Inplace operation. - - Args: - data: Iterable[{key, wav, label, sample_rate}] - resample_rate: target resample rate - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - if sample_rate != resample_rate: - sample['sample_rate'] = resample_rate - sample['wav'] = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - yield sample - - -def speed_perturb(data, speeds=None): - """ Apply speed perturb to the data. - Inplace operation. 
- - Args: - data: Iterable[{key, wav, label, sample_rate}] - speeds(List[float]): optional speed - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - if speeds is None: - speeds = [0.9, 1.0, 1.1] - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - speed = random.choice(speeds) - if speed != 1.0: - wav, _ = torchaudio.sox_effects.apply_effects_tensor( - waveform, sample_rate, - [['speed', str(speed)], ['rate', str(sample_rate)]]) - sample['wav'] = wav - - yield sample - - -def compute_fbank(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0): - """ Extract fbank - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - energy_floor=0.0, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def compute_mfcc(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0, - num_ceps=40, - high_freq=0.0, - low_freq=20.0): - """ Extract mfcc - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.mfcc(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - num_ceps=num_ceps, - high_freq=high_freq, - low_freq=low_freq, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def __tokenize_by_bpe_model(sp, txt): - tokens = [] - # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - pattern = re.compile(r'([\u4e00-\u9fff])') - # Example: - # txt = "你好 ITS'S OKAY 的" - # chars = ["你", "好", " ITS'S OKAY ", "的"] - chars = pattern.split(txt.upper()) - mix_chars = [w for w in chars if len(w.strip()) > 0] - for ch_or_w in mix_chars: - # ch_or_w is a single CJK charater(i.e., "你"), do nothing. - if pattern.fullmatch(ch_or_w) is not None: - tokens.append(ch_or_w) - # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), - # encode ch_or_w using bpe_model. 
-        else:
-            for p in sp.encode_as_pieces(ch_or_w):
-                tokens.append(p)
-
-    return tokens
-
-
-def tokenize(data,
-             symbol_table,
-             bpe_model=None,
-             non_lang_syms=None,
-             split_with_space=False):
-    """ Decode text to chars or BPE
-        Inplace operation
-
-        Args:
-            data: Iterable[{key, wav, txt, sample_rate}]
-
-        Returns:
-            Iterable[{key, wav, txt, tokens, label, sample_rate}]
-    """
-    if non_lang_syms is not None:
-        non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})")
-    else:
-        non_lang_syms = {}
-        non_lang_syms_pattern = None
-
-    if bpe_model is not None:
-        import sentencepiece as spm
-        sp = spm.SentencePieceProcessor()
-        sp.load(bpe_model)
-    else:
-        sp = None
-
-    for sample in data:
-        assert 'txt' in sample
-        txt = sample['txt'].strip()
-        if non_lang_syms_pattern is not None:
-            parts = non_lang_syms_pattern.split(txt.upper())
-            parts = [w for w in parts if len(w.strip()) > 0]
-        else:
-            parts = [txt]
-
-        label = []
-        tokens = []
-        for part in parts:
-            if part in non_lang_syms:
-                tokens.append(part)
-            else:
-                if bpe_model is not None:
-                    tokens.extend(__tokenize_by_bpe_model(sp, part))
-                else:
-                    if split_with_space:
-                        part = part.split(" ")
-                    for ch in part:
-                        if ch == ' ':
-                            ch = "▁"
-                        tokens.append(ch)
-
-        for ch in tokens:
-            if ch in symbol_table:
-                label.append(symbol_table[ch])
-            elif '<unk>' in symbol_table:
-                label.append(symbol_table['<unk>'])
-
-        sample['tokens'] = tokens
-        sample['label'] = label
-        yield sample
-
-
-def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80):
-    """ Do spec augmentation
-        Inplace operation
-
-        Args:
-            data: Iterable[{key, feat, label}]
-            num_t_mask: number of time mask to apply
-            num_f_mask: number of freq mask to apply
-            max_t: max width of time mask
-            max_f: max width of freq mask
-            max_w: max width of time warp
-
-        Returns
-            Iterable[{key, feat, label}]
-    """
-    for sample in data:
-        assert 'feat' in sample
-        x = sample['feat']
-        assert isinstance(x, torch.Tensor)
-        y = x.clone().detach()
-        max_frames = y.size(0)
-        max_freq = y.size(1)
-        # time mask
-        for i in range(num_t_mask):
-            start = random.randint(0, max_frames - 1)
-            length = random.randint(1, max_t)
-            end = min(max_frames, start + length)
-            y[start:end, :] = 0
-        # freq mask
-        for i in range(num_f_mask):
-            start = random.randint(0, max_freq - 1)
-            length = random.randint(1, max_f)
-            end = min(max_freq, start + length)
-            y[:, start:end] = 0
-        sample['feat'] = y
-        yield sample
-
-
-def spec_sub(data, max_t=20, num_t_sub=3):
-    """ Do spec substitute
-        Inplace operation
-
-        Args:
-            data: Iterable[{key, feat, label}]
-            max_t: max width of time substitute
-            num_t_sub: number of time substitute to apply
-
-        Returns
-            Iterable[{key, feat, label}]
-    """
-    for sample in data:
-        assert 'feat' in sample
-        x = sample['feat']
-        assert isinstance(x, torch.Tensor)
-        y = x.clone().detach()
-        max_frames = y.size(0)
-        for i in range(num_t_sub):
-            start = random.randint(0, max_frames - 1)
-            length = random.randint(1, max_t)
-            end = min(max_frames, start + length)
-            # only substitute the earlier time chosen randomly for current time
-            pos = random.randint(0, start)
-            y[start:end, :] = x[start - pos:end - pos, :]
-        sample['feat'] = y
-        yield sample
-
-
-def spec_trim(data, max_t=20):
-    """ Trim tailing frames. Inplace operation.
- ref: TrimTail [https://arxiv.org/abs/2211.00522] - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of length trimming - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - max_frames = x.size(0) - length = random.randint(1, max_t) - if length < max_frames / 2: - y = x.clone().detach()[:max_frames - length] - sample['feat'] = y - yield sample - - -def shuffle(data, shuffle_size=10000): - """ Local shuffle the data - - Args: - data: Iterable[{key, feat, label}] - shuffle_size: buffer size for shuffle - - Returns: - Iterable[{key, feat, label}] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= shuffle_size: - random.shuffle(buf) - for x in buf: - yield x - buf = [] - # The sample left over - random.shuffle(buf) - for x in buf: - yield x - - -def sort(data, sort_size=500): - """ Sort the data by feature length. - Sort is used after shuffle and before batch, so we can group - utts with similar lengths into a batch, and `sort_size` should - be less than `shuffle_size` - - Args: - data: Iterable[{key, feat, label}] - sort_size: buffer size for sort - - Returns: - Iterable[{key, feat, label}] - """ - - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= sort_size: - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - buf = [] - # The sample left over - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - - -def static_batch(data, batch_size=16): - """ Static batch the data by `batch_size` - - Args: - data: Iterable[{key, feat, label}] - batch_size: batch size - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= batch_size: - yield buf - buf = [] - if len(buf) > 0: - yield buf - - -def dynamic_batch(data, max_frames_in_batch=12000): - """ Dynamic batch the data until the total frames in batch - reach `max_frames_in_batch` - - Args: - data: Iterable[{key, feat, label}] - max_frames_in_batch: max_frames in one batch - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - longest_frames = 0 - for sample in data: - assert 'feat' in sample - assert isinstance(sample['feat'], torch.Tensor) - new_sample_frames = sample['feat'].size(0) - longest_frames = max(longest_frames, new_sample_frames) - frames_after_padding = longest_frames * (len(buf) + 1) - if frames_after_padding > max_frames_in_batch: - yield buf - buf = [sample] - longest_frames = new_sample_frames - else: - buf.append(sample) - if len(buf) > 0: - yield buf - - -def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000): - """ Wrapper for static/dynamic batch - """ - if batch_type == 'static': - return static_batch(data, batch_size) - elif batch_type == 'dynamic': - return dynamic_batch(data, max_frames_in_batch) - else: - logging.fatal('Unsupported batch type {}'.format(batch_type)) - - -def padding(data): - """ Padding the data into training data - - Args: - data: Iterable[List[{key, feat, label}]] - - Returns: - Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] - """ - for sample in data: - assert isinstance(sample, list) - feats_length = torch.tensor([x['feat'].size(0) for x in sample], - dtype=torch.int32) - order = torch.argsort(feats_length, descending=True) - feats_lengths = torch.tensor( - [sample[i]['feat'].size(0) for i in order], dtype=torch.int32) - sorted_feats = [sample[i]['feat'] for i in order] - sorted_keys 
= [sample[i]['key'] for i in order] - sorted_labels = [ - torch.tensor(sample[i]['label'], dtype=torch.int64) for i in order - ] - label_lengths = torch.tensor([x.size(0) for x in sorted_labels], - dtype=torch.int32) - - padded_feats = pad_sequence(sorted_feats, - batch_first=True, - padding_value=0) - - pad = (0, 0, 0, 0) - seq_len= padded_feats.shape[1] - if seq_len < 384: - pad = (0, 0, 0, 384-seq_len) - elif seq_len < 512: - pad = (0, 0, 0, 512-seq_len) - elif seq_len < 640: - pad = (0, 0, 0, 640-seq_len) - elif seq_len < 768: - pad = (0, 0, 0, 768-seq_len) - elif seq_len < 896: - pad = (0, 0, 0, 896-seq_len) - elif seq_len < 1024: - pad = (0, 0, 0, 1024-seq_len) - elif seq_len < 1280: - pad = (0, 0, 0, 1280-seq_len) - padded_feats = torch.nn.functional.pad(padded_feats, pad, mode='constant', value=0) - padding_labels = pad_sequence(sorted_labels, - batch_first=True, - padding_value=-1) - - yield (sorted_keys, padded_feats, padding_labels, feats_lengths, - label_lengths) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/dataset/wav_distortion.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/dataset/wav_distortion.py deleted file mode 100644 index 2917d3cc6cfb801935cb0885d0c42cd88f1833b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/dataset/wav_distortion.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import random -import math - -import torchaudio -import torch -torchaudio.set_audio_backend("sox_io") - - -def db2amp(db): - return pow(10, db / 20) - -def amp2db(amp): - return 20 * math.log10(amp) - -def make_poly_distortion(conf): - """Generate a db-domain ploynomial distortion function - - f(x) = a * x^m * (1-x)^n + x - - Args: - conf: a dict {'a': #int, 'm': #int, 'n': #int} - - Returns: - The ploynomial function, which could be applied on - a float amplitude value - """ - a = conf['a'] - m = conf['m'] - n = conf['n'] - - def poly_distortion(x): - abs_x = abs(x) - if abs_x < 0.000001: - x = x - else: - db_norm = amp2db(abs_x) / 100 + 1 - if db_norm < 0: - db_norm = 0 - db_norm = a * pow(db_norm, m) * pow((1 - db_norm), n) + db_norm - if db_norm > 1: - db_norm = 1 - db = (db_norm - 1) * 100 - amp = db2amp(db) - if amp >= 0.9997: - amp = 0.9997 - if x > 0: - x = amp - else: - x = -amp - return x - return poly_distortion - -def make_quad_distortion(): - return make_poly_distortion({'a' : 1, 'm' : 1, 'n' : 1}) - -# the amplitude are set to max for all non-zero point -def make_max_distortion(conf): - """Generate a max distortion function - - Args: - conf: a dict {'max_db': float } - 'max_db': the maxium value. 
- - Returns: - The max function, which could be applied on - a float amplitude value - """ - max_db = conf['max_db'] - if max_db: - max_amp = db2amp(max_db) # < 0.997 - else: - max_amp = 0.997 - - def max_distortion(x): - if x > 0: - x = max_amp - elif x < 0: - x = -max_amp - else: - x = 0.0 - return x - return max_distortion - - - -def make_amp_mask(db_mask=None): - """Get a amplitude domain mask from db domain mask - - Args: - db_mask: Optional. A list of tuple. if None, using default value. - - Returns: - A list of tuple. The amplitude domain mask - """ - if db_mask is None: - db_mask = [(-110, -95), (-90, -80), (-65, -60), (-50, -30), (-15, 0)] - amp_mask = [(db2amp(db[0]), db2amp(db[1])) for db in db_mask] - return amp_mask - -default_mask = make_amp_mask() - - -def generate_amp_mask(mask_num): - """Generate amplitude domain mask randomly in [-100db, 0db] - - Args: - mask_num: the slot number of the mask - - Returns: - A list of tuple. each tuple defines a slot. - e.g. [(-100, -80), (-65, -60), (-50, -30), (-15, 0)] - for #mask_num = 4 - """ - a = [0] * 2 * mask_num - a[0] = 0 - m = [] - for i in range(1, 2 * mask_num): - a[i] = a[i - 1] + random.uniform(0.5, 1) - max_val = a[2 * mask_num - 1] - for i in range(0, mask_num): - l = ((a[2 * i] - max_val) / max_val) * 100 - r = ((a[2 * i + 1] - max_val) / max_val) * 100 - m.append((l, r)) - return make_amp_mask(m) - - -def make_fence_distortion(conf): - """Generate a fence distortion function - - In this fence-like shape function, the values in mask slots are - set to maxium, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': int,'max_db': float } - 'mask_number': the slot number in mask. - 'max_db': the maxium value. - - Returns: - The fence function, which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - max_db = conf['max_db'] - max_amp = db2amp(max_db) # 0.997 - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def fence_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - return x - - return fence_distortion - -# -def make_jag_distortion(conf): - """Generate a jag distortion function - - In this jag-like shape function, the values in mask slots are - not changed, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': #int} - 'mask_number': the slot number in mask. 
- - Returns: - The jag function,which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def jag_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - return x - - return jag_distortion - -# gaining 20db means amp = amp * 10 -# gaining -20db means amp = amp / 10 -def make_gain_db(conf): - """Generate a db domain gain function - - Args: - conf: a dict {'db': #float} - 'db': the gaining value - - Returns: - The db gain function, which could be applied on - a float amplitude value - """ - db = conf['db'] - - def gain_db(x): - return min(0.997, x * pow(10, db / 20)) - - return gain_db - - -def distort(x, func, rate=0.8): - """Distort a waveform in sample point level - - Args: - x: the origin wavefrom - func: the distort function - rate: sample point-level distort probability - - Returns: - the distorted waveform - """ - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - x[0][i] = func(float(x[0][i])) - return x - -def distort_chain(x, funcs, rate=0.8): - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - for func in funcs: - x[0][i] = func(float(x[0][i])) - return x - -# x is numpy -def distort_wav_conf(x, distort_type, distort_conf, rate=0.1): - if distort_type == 'gain_db': - gain_db = make_gain_db(distort_conf) - x = distort(x, gain_db) - elif distort_type == 'max_distortion': - max_distortion = make_max_distortion(distort_conf) - x = distort(x, max_distortion, rate=rate) - elif distort_type == 'fence_distortion': - fence_distortion = make_fence_distortion(distort_conf) - x = distort(x, fence_distortion, rate=rate) - elif distort_type == 'jag_distortion': - jag_distortion = make_jag_distortion(distort_conf) - x = distort(x, jag_distortion, rate=rate) - elif distort_type == 'poly_distortion': - poly_distortion = make_poly_distortion(distort_conf) - x = distort(x, poly_distortion, rate=rate) - elif distort_type == 'quad_distortion': - quad_distortion = make_quad_distortion() - x = distort(x, quad_distortion, rate=rate) - elif distort_type == 'none_distortion': - pass - else: - print('unsupport type') - return x - -def distort_wav_conf_and_save(distort_type, distort_conf, rate, wav_in, wav_out): - x, sr = torchaudio.load(wav_in) - x = x.detach().numpy() - out = distort_wav_conf(x, distort_type, distort_conf, rate) - torchaudio.save(wav_out, torch.from_numpy(out), sr) - -if __name__ == "__main__": - distort_type = sys.argv[1] - wav_in = sys.argv[2] - wav_out = sys.argv[3] - conf = None - rate = 0.1 - if distort_type == 'new_jag_distortion': - conf = {'mask_number' : 4} - elif distort_type == 'new_fence_distortion': - conf = {'mask_number' : 1, 'max_db' : -30} - elif distort_type == 'poly_distortion': - conf = {'a' : 4, 'm' : 2, "n" : 2} - distort_wav_conf_and_save(distort_type, conf, rate, wav_in, wav_out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/efficient_conformer/attention.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/efficient_conformer/attention.py deleted file mode 100644 index 475131b15af92ffcaf91ad5e2e30d114d4d1a2a3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/efficient_conformer/attention.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple, Optional - -import torch -from torch import nn -import torch.nn.functional as F -from wenet.transformer.attention import MultiHeadedAttention - - -class GroupedRelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: - https://arxiv.org/abs/1901.02860 - https://arxiv.org/abs/2109.01163 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate, group_size=3): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - self.group_size = group_size - self.d_k = n_feat // n_head # for GroupedAttention - self.n_feat = n_feat - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. 
- """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def pad4group(self, Q, K, V, P, mask, group_size: int = 3): - """ - q: (#batch, time1, size) -> (#batch, head, time1, size/head) - k,v: (#batch, time2, size) -> (#batch, head, time2, size/head) - p: (#batch, time2, size) - """ - # Compute Overflows - overflow_Q = Q.size(2) % group_size - overflow_KV = K.size(2) % group_size - - # if-else for ONNX export - # 0 // 0.00000000000000001 = 0 - # 1 // 1.00000000000000001 = 1 - padding_Q = (group_size - overflow_Q) * int( - overflow_Q // (overflow_Q + 0.00000000000000001)) - padding_KV = (group_size - overflow_KV) * int( - overflow_KV // (overflow_KV + 0.00000000000000001)) - - batch_size, _, seq_len_KV, _ = K.size() - - # Input Padding (B, T, D) -> (B, T + P, D) - Q = F.pad(Q, (0, 0, 0, padding_Q), value=0.0) - K = F.pad(K, (0, 0, 0, padding_KV), value=0.0) - V = F.pad(V, (0, 0, 0, padding_KV), value=0.0) - - if mask is not None and mask.size(2) > 0 : # time2 > 0: - mask = mask[:, ::group_size, ::group_size] - - Q = Q.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - K = K.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - V = V.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - # process pos_emb - P_batch_size = P.size(0) - overflow_P = P.size(1) % group_size - padding_P = group_size - overflow_P if overflow_P else 0 - P = F.pad(P, (0, 0, 0, padding_P), value=0.0) - P = P.view(P_batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - return Q, K, V, P, mask, padding_Q - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - padding_q: Optional[int] = None - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - padding_q : for GroupedAttention in efficent conformer - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. 
jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - - # n_feat!=h*d_k may be happened in GroupAttention - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.n_feat) - ) # (batch, time1, d_model) - if padding_q is not None: - # for GroupedAttention in efficent conformer - x = x[:, :x.size(1) - padding_q] - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q = self.linear_q(query) - k = self.linear_k(key) # (#batch, time2, size) - v = self.linear_v(value) - p = self.linear_pos(pos_emb) # (#batch, time2, size) - - batch_size, seq_len_KV, _ = k.size() # seq_len_KV = time2 - - # (#batch, time2, size) -> (#batch, head, time2, size/head) - q = q.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - k = k.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - v = v.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - if cache.size(0) > 0: - # use attention cache - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - new_cache = torch.cat((k, v), dim=-1) - - # May be k and p does not match. eg. time2=18+18/2=27 > mask=36/2=18 - if mask is not None and mask.size(2) > 0: - time2 = mask.size(2) - k = k[:, :, -time2:, :] - v = v[:, :, -time2:, :] - - # q k v p: (batch, head, time1, d_k) - q, k, v, p, mask, padding_q = self.pad4group(q, k, v, p, mask, self.group_size) - - # q_with_bias_u & q_with_bias_v = (batch, head, time1, d_k) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. 
- # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k * self.group_size) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask, padding_q), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/efficient_conformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/efficient_conformer/convolution.py deleted file mode 100644 index 52d6c1c14c0812ab3957a60a135f644833c2ad95..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/efficient_conformer/convolution.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - stride: int = 1): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - stride (int): Stride Convolution, for efficient Conformer - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. 
- # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=stride, # for depthwise_conv in StrideConv - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - self.stride = stride - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - # When export ONNX,the first cache is not None but all-zero, - # cause shape error in residual block, - # eg. cache14 + x9 = 23, 23-7+1=17 != 9 - cache = cache[:, :, -self.lorder:] - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is requried, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - if mask_pad.size(2) != x.size(2): - mask_pad = mask_pad[:, :, ::self.stride] - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/efficient_conformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/efficient_conformer/encoder.py deleted file mode 100644 index dbd37f53cac86be851e2bb194354fd07eb271f11..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/efficient_conformer/encoder.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from EfficientConformer(https://github.com/burchim/EfficientConformer) -# Paper(https://arxiv.org/abs/2109.01163) - -"""Encoder definition.""" -from typing import Tuple, Optional, List, Union - -import torch -import logging -from typeguard import check_argument_types -import torch.nn.functional as F - -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.encoder_layer import ConformerEncoderLayer - -from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 -from wenet.efficient_conformer.convolution import ConvolutionModule -from wenet.efficient_conformer.attention import GroupedRelPositionMultiHeadedAttention -from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer - -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class EfficientConformerEncoder(torch.nn.Module): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - macaron_style: bool = True, - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - stride_layer_idx: Optional[Union[int, List[int]]] = 3, - stride: Optional[Union[int, List[int]]] = 2, - group_layer_idx: Optional[Union[int, List[int], tuple]] = (0, 1, 2, 3), - group_size: int = 3, - stride_kernel: bool = True, - **kwargs - ): - """Construct Efficient Conformer Encoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - macaron_style (bool): Whether to use macaron style for - positionwise layer. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. - stride_layer_idx (list): layer id with StrideConv, start from 0 - stride (list): stride size of each StrideConv in efficient conformer - group_layer_idx (list): layer id with GroupedAttention, start from 0 - group_size (int): group size of every GroupedAttention layer - stride_kernel (bool): default True. True: recompute cnn kernels with stride. 
- """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d2": - subsampling_class = Conv2dSubsampling2 - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - logging.info(f"input_layer = {input_layer}, " - f"subsampling_class = {subsampling_class}") - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - self.input_layer = input_layer - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - activation = get_activation(activation_type) - self.num_blocks = num_blocks - self.attention_heads = attention_heads - self.cnn_module_kernel = cnn_module_kernel - self.global_chunk_size = 0 - self.chunk_feature_map = 0 - - # efficient conformer configs - self.stride_layer_idx = [stride_layer_idx] \ - if type(stride_layer_idx) == int else stride_layer_idx - self.stride = [stride] \ - if type(stride) == int else stride - self.group_layer_idx = [group_layer_idx] \ - if type(group_layer_idx) == int else group_layer_idx - self.grouped_size = group_size # group size of every GroupedAttention layer - - assert len(self.stride) == len(self.stride_layer_idx) - self.cnn_module_kernels = [cnn_module_kernel] # kernel size of each StridedConv - for i in self.stride: - if stride_kernel: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1] // i) - else: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1]) - - logging.info(f"stride_layer_idx= {self.stride_layer_idx}, " - f"stride = {self.stride}, " - f"cnn_module_kernel = {self.cnn_module_kernels}, " - f"group_layer_idx = {self.group_layer_idx}, " - f"grouped_size = {self.grouped_size}") - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - - # encoder definition - index = 0 - layers = [] - for i in range(num_blocks): - # self-attention module definition - if i in self.group_layer_idx: - encoder_selfattn_layer = GroupedRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - self.grouped_size) - else: - if pos_enc_layer_type == "no_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate) - - # conformer module definition - if i in self.stride_layer_idx: - # conformer block with downsampling - convolution_layer_args_stride = ( - output_size, 
self.cnn_module_kernels[index], activation, - cnn_module_norm, causal, True, self.stride[index]) - layers.append(StrideConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_stride) if use_cnn_module else None, - torch.nn.AvgPool1d( - kernel_size=self.stride[index], stride=self.stride[index], - padding=0, ceil_mode=True, - count_include_pad=False), # pointwise_conv_layer - dropout_rate, - normalize_before, - concat_after, - )) - index = index + 1 - else: - # conformer block - convolution_layer_args_normal = ( - output_size, self.cnn_module_kernels[index], activation, - cnn_module_norm, causal) - layers.append(ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_normal) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - )) - - self.encoders = torch.nn.ModuleList(layers) - - def set_global_chunk_size(self, chunk_size): - """Used in ONNX export. - """ - logging.info(f"set global chunk size: {chunk_size}, default is 0.") - self.global_chunk_size = chunk_size - if self.embed.subsampling_rate == 2: - self.chunk_feature_map = 2 * self.global_chunk_size + 1 - elif self.embed.subsampling_rate == 6: - self.chunk_feature_map = 6 * self.global_chunk_size + 5 - elif self.embed.subsampling_rate == 8: - self.chunk_feature_map = 8 * self.global_chunk_size + 7 - else: - self.chunk_feature_map = 4 * self.global_chunk_size + 3 - - def output_size(self) -> int: - return self._output_size - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - index = 0 # traverse stride - for i, layer in enumerate(self.encoders): - # layer return : x, mask, new_att_cache, new_cnn_cache - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if i in self.stride_layer_idx: - masks = masks[:, :, ::self.stride[index]] - chunk_masks = chunk_masks[:, ::self.stride[index], - ::self.stride[index]] - mask_pad = masks - pos_emb = pos_emb[:, ::self.stride[index], :] - index = index + 1 - - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask : mask matrix of self attention - - Returns: - torch.Tensor: output of current input xs - torch.Tensor: subsampling cache required for next chunk computation - List[torch.Tensor]: encoder layers output cache required for next - chunk computation - List[torch.Tensor]: conformer cnn cache - - """ - assert xs.size(0) == 1 - - # using downsampling factor to recover offset - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - chunk_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - chunk_masks = chunk_masks.unsqueeze(1) # (1, 1, xs-time) - - real_len = 0 - if self.global_chunk_size > 0: - # for ONNX decode simulation, padding xs to chunk_size - real_len = xs.size(1) - pad_len = self.chunk_feature_map - real_len - xs = F.pad(xs, (0, 0, 0, pad_len), value=0.0) - chunk_masks = F.pad(chunk_masks, (0, pad_len), value=0.0) - - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, chunk_masks = self.embed(xs, chunk_masks, offset) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - # shape(pos_emb) = (b=1, chunk_size, emb_size=output_size=hidden-dim) - - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) # batchPad (b=1, 1, time=chunk_size) - - if self.global_chunk_size > 0: - # for ONNX decode simulation - pos_emb = self.embed.position_encoding( - offset=max(offset - cache_t1, 0), - size=cache_t1 + self.global_chunk_size) - att_mask[:, :, -self.global_chunk_size:] = chunk_masks - mask_pad = chunk_masks.to(torch.bool) - else: - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(2) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - att_cache_trunc = xs.size(1) + \ - att_cache.size(2) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i:i + 1, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # 
shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [1, batch, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(0) - - # use repeat_interleave to new_att_cache - new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :]) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.global_chunk_size > 0 and real_len: - chunk_real_len = real_len // self.embed.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - # Keeping 1 more timestep can mitigate information leakage - # from the encoder caused by the padding - xs = xs[:, :chunk_real_len + 1, :] - - return xs, r_att_cache, r_cnn_cache - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - use_onnx=False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. - Args: - xs (torch.Tensor): (1, max_len, dim) - decoding_chunk_size (int): decoding chunk size - num_decoding_left_chunks (int): - use_onnx (bool): True for simulating ONNX model inference. 
- """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if use_onnx: - logging.info("Simulating for ONNX runtime ...") - att_cache: torch.Tensor = torch.zeros( - (self.num_blocks, self.attention_heads, required_cache_size, - self.output_size() // self.attention_heads * 2), - device=xs.device) - cnn_cache: torch.Tensor = torch.zeros( - (self.num_blocks, 1, self.output_size(), self.cnn_module_kernel - 1), - device=xs.device) - self.set_global_chunk_size(chunk_size=decoding_chunk_size) - else: - logging.info("Simulating for JIT runtime ...") - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - logging.info(f"-->> frame chunk msg: cur={cur}, " - f"end={end}, num_frames={end-cur}, " - f"decoding_window={decoding_window}") - if use_onnx: - att_mask: torch.Tensor = torch.ones( - (1, 1, required_cache_size + decoding_chunk_size), - dtype=torch.bool, device=xs.device) - if cur == 0: - att_mask[:, :, :required_cache_size] = 0 - else: - att_mask: torch.Tensor = torch.ones( - (0, 0, 0), dtype=torch.bool, device=xs.device) - - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - outputs.append(y) - offset += y.size(1) - - ys = torch.cat(outputs, 1) - masks = torch.ones(1, 1, ys.size(1), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/efficient_conformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/efficient_conformer/encoder_layer.py deleted file mode 100644 index 3a88ec9fca9797664ce89566e6c1d28a8f0ad5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/efficient_conformer/encoder_layer.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple -import torch -from torch import nn - - -class StrideConformerEncoderLayer(nn.Module): - """Encoder layer module. 
- Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - pointwise_conv_layer: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.pointwise_conv_layer = pointwise_conv_layer - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - - # add pointwise_conv for efficient conformer - # pointwise_conv_layer does not change shape - if self.pointwise_conv_layer is not None: - residual = residual.transpose(1, 2) - residual = self.pointwise_conv_layer(residual) - residual = residual.transpose(1, 2) - assert residual.size(0) == x.size(0) - assert residual.size(1) == x.size(1) - assert residual.size(2) == x.size(2) - - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/efficient_conformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/efficient_conformer/subsampling.py deleted file mode 100644 index 98b2c2228eac8e77586110686c48a7b0141458c9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/efficient_conformer/subsampling.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch -from wenet.transformer.subsampling import BaseSubsampling - - -class Conv2dSubsampling2(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU() - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * ((idim - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 2 - # 2 = (3 - 1) * 1 - self.right_context = 2 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 2. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 2. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, :-2:2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/squeezeformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/squeezeformer/attention.py deleted file mode 100644 index 97412badbe8e2c5caec81c0636d15be3f80d6b84..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/squeezeformer/attention.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 Ximalaya Inc. (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -import torch -import torch.nn as nn -from wenet.transformer.attention import MultiHeadedAttention -from typing import Tuple - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- """ - - def __init__(self, n_head, n_feat, dropout_rate, - do_rel_shift=False, adaptive_scale=False, init_weights=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.do_rel_shift = do_rel_shift - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - self.adaptive_scale = adaptive_scale - self.ada_scale = nn.Parameter( - torch.ones([1, 1, n_feat]), requires_grad=adaptive_scale) - self.ada_bias = nn.Parameter( - torch.zeros([1, 1, n_feat]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - input_max = (self.h * self.d_k) ** -0.5 - torch.nn.init.uniform_(self.linear_q.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_q.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_pos.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.bias, -input_max, input_max) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. 
pytorch training - if mask.size(2) > 0: # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - # (batch, head, time1, time2) - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - if self.adaptive_scale: - query = self.ada_scale * query + self.ada_bias - key = self.ada_scale * key + self.ada_bias - value = self.ada_scale * value + self.ada_bias - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - if self.do_rel_shift: - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/squeezeformer/conv2d.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/squeezeformer/conv2d.py deleted file mode 100644 index c230263396392d72f36c56d645338f2d576db898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/squeezeformer/conv2d.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Conv2d Module with Valid Padding""" - -import torch.nn.functional as F -from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional - - -class Conv2dValid(_ConvNd): - """ - Conv2d operator for VALID mode padding. 
- """ - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', # TODO: refine this type - device=None, - dtype=None, - valid_trigx: bool = False, - valid_trigy: bool = False - ) -> None: - factory_kwargs = {'device': device, 'dtype': dtype} - kernel_size_ = _pair(kernel_size) - stride_ = _pair(stride) - padding_ = padding if isinstance(padding, str) else _pair(padding) - dilation_ = _pair(dilation) - super(Conv2dValid, self).__init__( - in_channels, out_channels, kernel_size_, - stride_, padding_, dilation_, False, _pair(0), - groups, bias, padding_mode, **factory_kwargs) - self.valid_trigx = valid_trigx - self.valid_trigy = valid_trigy - - def _conv_forward( - self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): - validx, validy = 0, 0 - if self.valid_trigx: - validx = (input.size(-2) * (self.stride[-2] - 1) - 1 - + self.kernel_size[-2]) // 2 - if self.valid_trigy: - validy = (input.size(-1) * (self.stride[-1] - 1) - 1 - + self.kernel_size[-1]) // 2 - return F.conv2d(input, weight, bias, self.stride, - (validx, validy), self.dilation, self.groups) - - def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight, self.bias) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/squeezeformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/squeezeformer/convolution.py deleted file mode 100644 index 6da2ee8c98ed58fae66d66c892041037f0d6bc3a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/squeezeformer/convolution.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. 
- causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - self.bias = bias - self.channels = channels - self.kernel_size = kernel_size - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, channels]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, channels]), requires_grad=adaptive_scale) - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - if init_weights: - self.init_weights() - - def init_weights(self): - pw_max = self.channels ** -0.5 - dw_max = self.kernel_size ** -0.5 - torch.nn.init.uniform_(self.pointwise_conv1.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv1.bias.data, -pw_max, pw_max) - torch.nn.init.uniform_(self.depthwise_conv.weight.data, -dw_max, dw_max) - if self.bias: - torch.nn.init.uniform_(self.depthwise_conv.bias.data, -dw_max, dw_max) - torch.nn.init.uniform_(self.pointwise_conv2.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv2.bias.data, -pw_max, pw_max) - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - if self.adaptive_scale: - x = self.ada_scale * x + self.ada_bias - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/squeezeformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/squeezeformer/encoder.py deleted file mode 100644 index f13038321ae6c07d484a617aee7d83ed07742510..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/squeezeformer/encoder.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -import torch -import torch.nn as nn -from typing import Tuple, Union, Optional, List -from wenet.squeezeformer.subsampling \ - import DepthwiseConv2dSubsampling4, TimeReductionLayer1D, \ - TimeReductionLayer2D, TimeReductionLayerStream -from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.attention import MultiHeadedAttention -from wenet.squeezeformer.attention import RelPositionMultiHeadedAttention -from wenet.squeezeformer.positionwise_feed_forward \ - import PositionwiseFeedForward -from wenet.squeezeformer.convolution import ConvolutionModule -from wenet.utils.mask import make_pad_mask, add_optional_chunk_mask -from wenet.utils.common import get_activation - - -class SqueezeformerEncoder(nn.Module): - def __init__( - self, - input_size: int = 80, - encoder_dim: int = 256, - output_size: int = 256, - attention_heads: int = 4, - num_blocks: int = 12, - reduce_idx: Optional[Union[int, List[int]]] = 5, - recover_idx: Optional[Union[int, List[int]]] = 11, - feed_forward_expansion_factor: int = 4, - dw_stride: bool = False, - input_dropout_rate: float = 0.1, - pos_enc_layer_type: str = "rel_pos", - time_reduction_layer_type: str = "conv1d", - do_rel_shift: bool = True, - feed_forward_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.1, - cnn_module_kernel: int = 31, - cnn_norm_type: str = "batch_norm", - dropout: float = 0.1, - causal: bool = False, - adaptive_scale: bool = True, - activation_type: str = "swish", - init_weights: bool = True, - global_cmvn: torch.nn.Module = None, - normalize_before: bool = False, - use_dynamic_chunk: bool = False, - concat_after: bool = False, 
- static_chunk_size: int = 0, - use_dynamic_left_chunk: bool = False - ): - """Construct SqueezeformerEncoder - - Args: - input_size to use_dynamic_chunk, see in Transformer BaseEncoder. - encoder_dim (int): The hidden dimension of encoder layer. - output_size (int): The output dimension of final projection layer. - attention_heads (int): Num of attention head in attention module. - num_blocks (int): Num of encoder layers. - reduce_idx Optional[Union[int, List[int]]]: - reduce layer index, from 40ms to 80ms per frame. - recover_idx Optional[Union[int, List[int]]]: - recover layer index, from 80ms to 40ms per frame. - feed_forward_expansion_factor (int): Enlarge coefficient of FFN. - dw_stride (bool): Whether do depthwise convolution - on subsampling module. - input_dropout_rate (float): Dropout rate of input projection layer. - pos_enc_layer_type (str): Self attention type. - time_reduction_layer_type (str): Conv1d or Conv2d reduction layer. - do_rel_shift (bool): Whether to do relative shift - operation on rel-attention module. - cnn_module_kernel (int): Kernel size of CNN module. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - adaptive_scale (bool): Whether to use adaptive scale. - init_weights (bool): Whether to initialize weights. - causal (bool): whether to use causal convolution or not. - """ - super(SqueezeformerEncoder, self).__init__() - self.global_cmvn = global_cmvn - self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ - if type(reduce_idx) == int else reduce_idx - self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \ - if type(recover_idx) == int else recover_idx - self.check_ascending_list() - if reduce_idx is None: - self.time_reduce = None - else: - if recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - self.reduce_stride = 2 - self._output_size = output_size - self.normalize_before = normalize_before - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - self.pos_enc_layer_type = pos_enc_layer_type - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - encoder_dim, - attention_dropout_rate, - do_rel_shift, - adaptive_scale, - init_weights - ) - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - encoder_dim, - encoder_dim * feed_forward_expansion_factor, - feed_forward_dropout_rate, - activation, - adaptive_scale, - init_weights - ) - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = ( - encoder_dim, cnn_module_kernel, activation, - cnn_norm_type, causal, True, adaptive_scale, init_weights) - - self.embed = DepthwiseConv2dSubsampling4( - 1, encoder_dim, - RelPositionalEncoding(encoder_dim, dropout_rate=0.1), - dw_stride, - input_size, - input_dropout_rate, - init_weights - ) - - self.preln = nn.LayerNorm(encoder_dim) - self.encoders = 
torch.nn.ModuleList([SqueezeformerEncoderLayer( - encoder_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - convolution_layer(*convolution_layer_args), - positionwise_layer(*positionwise_layer_args), - normalize_before, - dropout, - concat_after) for _ in range(num_blocks) - ]) - if time_reduction_layer_type == 'conv1d': - time_reduction_layer = TimeReductionLayer1D - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - elif time_reduction_layer_type == 'stream': - time_reduction_layer = TimeReductionLayerStream - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - else: - time_reduction_layer = TimeReductionLayer2D - time_reduction_layer_args = {'encoder_dim': encoder_dim} - - self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args) - self.time_recover_layer = nn.Linear(encoder_dim, encoder_dim) - self.final_proj = None - if output_size != encoder_dim: - self.final_proj = nn.Linear(encoder_dim, output_size) - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - xs_lens = mask_pad.squeeze(1).sum(1) - xs = self.preln(xs) - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - for i, layer in enumerate(self.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, chunk_masks, pos_emb, mask_pad)) - xs, xs_lens, chunk_masks, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_chunk_masks, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - chunk_masks = recover_chunk_masks - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - xs = xs.masked_fill(~mask_pad[:, 0, :].unsqueeze(-1), 0.0) - - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return xs, masks - - def check_ascending_list(self): - if self.reduce_idx is not None: - assert self.reduce_idx == sorted(self.reduce_idx), \ - "reduce_idx should be int or ascending list" - if self.recover_idx is not None: - assert self.recover_idx == sorted(self.recover_idx), \ - "recover_idx should be int or ascending list" - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp 
= exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - recover_exp)) - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.preln(xs) - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, 
recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - if att_mask.size(1) != 0: - xs = xs.masked_fill(~att_mask[:, 0, :].unsqueeze(-1), 0.0) - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(0) - cached_att = cached_att.unsqueeze(3).\ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :]) - r_cnn_cache.append(cached_cnn) - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/squeezeformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/squeezeformer/encoder_layer.py deleted file mode 100644 index 3c6bdd44a20447cea91c0f965c666b844f4264be..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/squeezeformer/encoder_layer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""SqueezeformerEncoderLayer definition.""" - -import torch -import torch.nn as nn -from typing import Optional, Tuple - - -class SqueezeformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward1 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - feed_forward2 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. 
- """ - - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward1: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - feed_forward2: Optional[nn.Module] = None, - normalize_before: bool = False, - dropout_rate: float = 0.1, - concat_after: bool = False, - ): - super(SqueezeformerEncoderLayer, self).__init__() - self.size = size - self.self_attn = self_attn - self.layer_norm1 = nn.LayerNorm(size) - self.ffn1 = feed_forward1 - self.layer_norm2 = nn.LayerNorm(size) - self.conv_module = conv_module - self.layer_norm3 = nn.LayerNorm(size) - self.ffn2 = feed_forward2 - self.layer_norm4 = nn.LayerNorm(size) - self.normalize_before = normalize_before - self.dropout = nn.Dropout(dropout_rate) - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - # self attention module - residual = x - if self.normalize_before: - x = self.layer_norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.layer_norm1(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm2(x) - x = self.ffn1(x) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm2(x) - - # conv module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - residual = x - if self.normalize_before: - x = self.layer_norm3(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm3(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm4(x) - x = self.ffn2(x) - # we do not use dropout here since it is inside feed forward function - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm4(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/squeezeformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/squeezeformer/positionwise_feed_forward.py deleted file mode 100644 index 289062dcf3189f79a5ebb206990160d8665c613c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/squeezeformer/positionwise_feed_forward.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Positionwise feed forward layer definition."""
-
-import torch
-
-
-class PositionwiseFeedForward(torch.nn.Module):
-    """Positionwise feed forward layer.
-
-    Feed-forward layers are applied on each position of the sequence.
-    The output dim is the same as the input dim.
-
-    Args:
-        idim (int): Input dimension.
-        hidden_units (int): The number of hidden units.
-        dropout_rate (float): Dropout rate.
-        activation (torch.nn.Module): Activation function.
-    """
-
-    def __init__(self,
-                 idim: int,
-                 hidden_units: int,
-                 dropout_rate: float,
-                 activation: torch.nn.Module = torch.nn.ReLU(),
-                 adaptive_scale: bool = False,
-                 init_weights: bool = False
-                 ):
-        """Construct a PositionwiseFeedForward object."""
-        super(PositionwiseFeedForward, self).__init__()
-        self.idim = idim
-        self.hidden_units = hidden_units
-        self.w_1 = torch.nn.Linear(idim, hidden_units)
-        self.activation = activation
-        self.dropout = torch.nn.Dropout(dropout_rate)
-        self.w_2 = torch.nn.Linear(hidden_units, idim)
-        self.ada_scale = None
-        self.ada_bias = None
-        self.adaptive_scale = adaptive_scale
-        self.ada_scale = torch.nn.Parameter(
-            torch.ones([1, 1, idim]), requires_grad=adaptive_scale)
-        self.ada_bias = torch.nn.Parameter(
-            torch.zeros([1, 1, idim]), requires_grad=adaptive_scale)
-        if init_weights:
-            self.init_weights()
-
-    def init_weights(self):
-        ffn1_max = self.idim ** -0.5
-        ffn2_max = self.hidden_units ** -0.5
-        torch.nn.init.uniform_(self.w_1.weight.data, -ffn1_max, ffn1_max)
-        torch.nn.init.uniform_(self.w_1.bias.data, -ffn1_max, ffn1_max)
-        torch.nn.init.uniform_(self.w_2.weight.data, -ffn2_max, ffn2_max)
-        torch.nn.init.uniform_(self.w_2.bias.data, -ffn2_max, ffn2_max)
-
-    def forward(self, xs: torch.Tensor) -> torch.Tensor:
-        """Forward function.
-
-        Args:
-            xs: input tensor (B, L, D)
-        Returns:
-            output tensor, (B, L, D)
-        """
-        if self.adaptive_scale:
-            xs = self.ada_scale * xs + self.ada_bias
-        return self.w_2(self.dropout(self.activation(self.w_1(xs))))
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/squeezeformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/squeezeformer/subsampling.py
deleted file mode 100644
index fdb0101d6ebb54c42e710bbb0f35a6f7615ca567..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/squeezeformer/subsampling.py
+++ /dev/null
@@ -1,296 +0,0 @@
-# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -"""DepthwiseConv2dSubsampling4 and TimeReductionLayer definition.""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -from wenet.transformer.subsampling import BaseSubsampling -from typing import Tuple -from wenet.squeezeformer.conv2d import Conv2dValid - - -class DepthwiseConv2dSubsampling4(BaseSubsampling): - """Depthwise Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - pos_enc_class (nn.Module): position encoding class. - dw_stride (int): Whether do depthwise convolution. - input_size (int): filter bank dimension. - - """ - - def __init__( - self, idim: int, odim: int, - pos_enc_class: torch.nn.Module, - dw_stride: bool = False, - input_size: int = 80, - input_dropout_rate: float = 0.1, - init_weights: bool = True - ): - super(DepthwiseConv2dSubsampling4, self).__init__() - self.idim = idim - self.odim = odim - self.pw_conv = nn.Conv2d( - in_channels=idim, out_channels=odim, kernel_size=3, stride=2) - self.act1 = nn.ReLU() - self.dw_conv = nn.Conv2d( - in_channels=odim, out_channels=odim, kernel_size=3, stride=2, - groups=odim if dw_stride else 1 - ) - self.act2 = nn.ReLU() - self.pos_enc = pos_enc_class - self.input_proj = nn.Sequential( - nn.Linear( - odim * (((input_size - 1) // 2 - 1) // 2), odim), - nn.Dropout(p=input_dropout_rate), - ) - if init_weights: - linear_max = (odim * input_size / 4) ** -0.5 - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.weight'], -linear_max, linear_max) - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.bias'], -linear_max, linear_max) - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: int = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.pw_conv(x) - x = self.act1(x) - x = self.dw_conv(x) - x = self.act2(x) - b, c, t, f = x.size() - x = x.permute(0, 2, 1, 3) - x = x.contiguous().view(b, t, c * f) - x, pos_emb = self.pos_enc(x, offset) - x = self.input_proj(x) - return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] - - -class TimeReductionLayer1D(nn.Module): - """ - Modified NeMo, - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. 
- """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 5, stride: int = 2): - super(TimeReductionLayer1D, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - self.padding = max(0, self.kernel_size - self.stride) - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. - if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayer2D(nn.Module): - def __init__( - self, kernel_size: int = 5, stride: int = 2, encoder_dim: int = 256): - super(TimeReductionLayer2D, self).__init__() - self.encoder_dim = encoder_dim - self.kernel_size = kernel_size - self.dw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=(kernel_size, 1), - stride=stride, - valid_trigy=True - ) - self.pw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=1, - stride=1, - valid_trigx=False, - valid_trigy=False, - ) - - self.kernel_size = kernel_size - self.stride = stride - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.encoder_dim ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward( - self, xs: torch.Tensor, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - xs = xs.masked_fill(mask_pad.transpose(1, 2).eq(0), 0.0) - xs = xs.unsqueeze(2) - padding1 = self.kernel_size - self.stride - xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), - mode='constant', value=0.) 
- xs = self.dw_conv(xs.permute(0, 3, 1, 2)) - xs = self.pw_conv(xs).permute(0, 3, 2, 1).squeeze(1).contiguous() - tmp_length = xs.size(1) - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - padding2 = max(0, (xs_lens.max() - tmp_length).data.item()) - batch_size, hidden = xs.size(0), xs.size(-1) - dummy_pad = torch.zeros(batch_size, padding2, hidden, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - mask = mask[:, ::2, ::2] - mask_pad = mask_pad[:, :, ::2] - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayerStream(nn.Module): - """ - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. - """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 1, stride: int = 2): - super(TimeReductionLayerStream, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=0, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. 
- if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transducer/joint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transducer/joint.py deleted file mode 100644 index f7cbaf62ee0bf4ffa127e5bbf4a49a64c2378495..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transducer/joint.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Optional - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation - - -class TransducerJoint(torch.nn.Module): - - def __init__(self, - voca_size: int, - enc_output_size: int, - pred_output_size: int, - join_dim: int, - prejoin_linear: bool = True, - postjoin_linear: bool = False, - joint_mode: str = 'add', - activation: str = "tanh"): - assert check_argument_types() - # TODO(Mddct): concat in future - assert joint_mode in ['add'] - super().__init__() - - self.activatoin = get_activation(activation) - self.prejoin_linear = prejoin_linear - self.postjoin_linear = postjoin_linear - self.joint_mode = joint_mode - - if not self.prejoin_linear and not self.postjoin_linear: - assert enc_output_size == pred_output_size == join_dim - # torchscript compatibility - self.enc_ffn: Optional[nn.Linear] = None - self.pred_ffn: Optional[nn.Linear] = None - if self.prejoin_linear: - self.enc_ffn = nn.Linear(enc_output_size, join_dim) - self.pred_ffn = nn.Linear(pred_output_size, join_dim) - # torchscript compatibility - self.post_ffn: Optional[nn.Linear] = None - if self.postjoin_linear: - self.post_ffn = nn.Linear(join_dim, join_dim) - - self.ffn_out = nn.Linear(join_dim, voca_size) - - def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor): - """ - Args: - enc_out (torch.Tensor): [B, T, E] - pred_out (torch.Tensor): [B, T, P] - Return: - [B,T,U,V] - """ - if (self.prejoin_linear and self.enc_ffn is not None - and self.pred_ffn is not None): - enc_out = self.enc_ffn(enc_out) # [B,T,E] -> [B,T,V] - pred_out = self.pred_ffn(pred_out) - - enc_out = enc_out.unsqueeze(2) # [B,T,V] -> [B,T,1,V] - pred_out = pred_out.unsqueeze(1) # [B,U,V] -> [B,1 U, V] - - # TODO(Mddct): concat joint - _ = self.joint_mode - out = enc_out + pred_out # [B,T,U,V] - - if self.postjoin_linear and self.post_ffn is not None: - out = self.post_ffn(out) - - out = self.activatoin(out) - out = self.ffn_out(out) - return out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transducer/predictor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transducer/predictor.py deleted file mode 100644 index 600e97a9d83646047ec3fc14f3087bd4df761c68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transducer/predictor.py +++ /dev/null @@ -1,482 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation, get_rnn - - -def ApplyPadding(input, padding, pad_value) -> torch.Tensor: - """ - Args: - input: [bs, max_time_step, dim] - padding: [bs, max_time_step] - """ - 
    return padding * pad_value + input * (1 - padding)
-
-
-class PredictorBase(torch.nn.Module):
-
-    # NOTE(Mddct): We can use ABC abstract here, but
-    # keep this class simple enough for now
-    def __init__(self) -> None:
-        super().__init__()
-
-    def init_state(self,
-                   batch_size: int,
-                   device: torch.device,
-                   method: str = "zero") -> List[torch.Tensor]:
-        _, _, _ = batch_size, method, device
-        raise NotImplementedError("this is a base predictor")
-
-    def batch_to_cache(self,
-                       cache: List[torch.Tensor]) -> List[List[torch.Tensor]]:
-        _ = cache
-        raise NotImplementedError("this is a base predictor")
-
-    def cache_to_batch(self,
-                       cache: List[List[torch.Tensor]]) -> List[torch.Tensor]:
-        _ = cache
-        raise NotImplementedError("this is a base predictor")
-
-    def forward(
-        self,
-        input: torch.Tensor,
-        cache: Optional[List[torch.Tensor]] = None,
-    ):
-        _, _, = input, cache
-        raise NotImplementedError("this is a base predictor")
-
-    def forward_step(
-        self, input: torch.Tensor, padding: torch.Tensor,
-        cache: List[torch.Tensor]
-    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
-        _, _, _, = input, padding, cache
-        raise NotImplementedError("this is a base predictor")
-
-
-class RNNPredictor(PredictorBase):
-
-    def __init__(self,
-                 voca_size: int,
-                 embed_size: int,
-                 output_size: int,
-                 embed_dropout: float,
-                 hidden_size: int,
-                 num_layers: int,
-                 bias: bool = True,
-                 rnn_type: str = "lstm",
-                 dropout: float = 0.1) -> None:
-        assert check_argument_types()
-        super().__init__()
-        self.n_layers = num_layers
-        self.hidden_size = hidden_size
-        # disable rnn base out projection
-        self.embed = nn.Embedding(voca_size, embed_size)
-        self.dropout = nn.Dropout(embed_dropout)
-        # NOTE(Mddct): the RNN base from torch does not support layer norm;
-        # will add layer norm and prune value in cell and layer
-        # ref: https://github.com/Mddct/neural-lm/blob/main/models/gru_cell.py
-        self.rnn = get_rnn(rnn_type=rnn_type)(input_size=embed_size,
-                                              hidden_size=hidden_size,
-                                              num_layers=num_layers,
-                                              bias=bias,
-                                              batch_first=True,
-                                              dropout=dropout)
-        self.projection = nn.Linear(hidden_size, output_size)
-
-    def forward(
-        self,
-        input: torch.Tensor,
-        cache: Optional[List[torch.Tensor]] = None,
-    ) -> torch.Tensor:
-        """
-        Args:
-            input (torch.Tensor): [batch, max_time].
-            padding (torch.Tensor): [batch, max_time]
-            cache : rnn predictor cache[0] == state_m
-                    cache[1] == state_c
-        Returns:
-            output: [batch, max_time, output_size]
-        """
-
-        # NOTE(Mddct): we don't use pack input format
-        embed = self.embed(input)  # [batch, max_time, emb_size]
-        embed = self.dropout(embed)
-        states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
-        if cache is None:
-            state = self.init_state(batch_size=input.size(0),
-                                    device=input.device)
-            states = (state[0], state[1])
-        else:
-            assert len(cache) == 2
-            states = (cache[0], cache[1])
-        out, (m, c) = self.rnn(embed, states)
-        out = self.projection(out)
-
-        # NOTE(Mddct): Although we don't use state in transducer
-        # training forward, we need to make it right for the padding value,
-        # so we create forward_step for inference and forward for training
-        _, _ = m, c
-        return out
-
-    def batch_to_cache(self,
-                       cache: List[torch.Tensor]) -> List[List[torch.Tensor]]:
-        """
-        Args:
-            cache: [state_m, state_c]
-                state_ms: [1*n_layers, bs, ...]
-                state_cs: [1*n_layers, bs, ...]
-        Returns:
-            new_cache: [[state_m_1, state_c_1], [state_m_2, state_c_2]...]
- """ - assert len(cache) == 2 - state_ms = cache[0] - state_cs = cache[1] - - assert state_ms.size(1) == state_cs.size(1) - - new_cache: List[List[torch.Tensor]] = [] - for state_m, state_c in zip(torch.split(state_ms, 1, dim=1), - torch.split(state_cs, 1, dim=1)): - new_cache.append([state_m, state_c]) - return new_cache - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[state_m_1, state_c_1], [state_m_1, state_c_1]...] - - Returns: - new_caceh: [state_ms, state_cs], - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - """ - state_ms = torch.cat([states[0] for states in cache], dim=1) - state_cs = torch.cat([states[1] for states in cache], dim=1) - return [state_ms, state_cs] - - def init_state( - self, - batch_size: int, - device: torch.device, - method: str = "zero", - ) -> List[torch.Tensor]: - assert batch_size > 0 - # TODO(Mddct): xavier init method - _ = method - return [ - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device), - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device) - ] - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - """ - assert len(cache) == 2 - state_m, state_c = cache[0], cache[1] - embed = self.embed(input) # [batch, 1, emb_size] - embed = self.dropout(embed) - out, (m, c) = self.rnn(embed, (state_m, state_c)) - - out = self.projection(out) - m = ApplyPadding(m, padding.unsqueeze(0), state_m) - c = ApplyPadding(c, padding.unsqueeze(0), state_c) - - return (out, [m, c]) - - -class EmbeddingPredictor(PredictorBase): - """Embedding predictor - - Described in: - https://arxiv.org/pdf/2109.07513.pdf - - embed-> proj -> layer norm -> swish - """ - - def __init__(self, - voca_size: int, - embed_size: int, - embed_dropout: float, - n_head: int, - history_size: int = 2, - activation: str = "swish", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - - assert check_argument_types() - super().__init__() - # multi head - self.num_heads = n_head - self.embed_size = embed_size - self.context_size = history_size + 1 - self.pos_embed = torch.nn.Linear(embed_size * self.context_size, - self.num_heads, - bias=bias) - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.ffn = nn.Linear(self.embed_size, self.embed_size) - self.norm = nn.LayerNorm(self.embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - _ = method - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device), - ] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] 
- """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - - input = input.unfold(1, self.context_size, 1).permute( - 0, 1, 3, 2) # [bs, seq_len, context_size, embed] - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - # broadcast dot attenton - input_expand = input.unsqueeze( - 2) # [bs, seq_len, 1, context_size, embed] - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - - # [bs, seq_len, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, seq_len, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, seq_len, num_heads, embed] - output = output.sum(dim=2) # [bs, seq_len, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - return output - - def forward_step( - self, - input: torch.Tensor, - padding: torch.Tensor, - cache: List[torch.Tensor], - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input_expand = context_input.unsqueeze(1).unsqueeze( - 2) # [bs, 1, 1, context_size, embed] - - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - # [bs, 1, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, 1, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, 1, num_heads, embed] - output = output.sum(dim=2) # [bs, 1, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - new_cache = context_input[:, 1:, :] - # TODO(Mddct): we need padding new_cache in future - # new_cache = ApplyPadding(history, padding, new_cache) - return (output, [new_cache]) - - -class ConvPredictor(PredictorBase): - - def __init__(self, - voca_size: 
int, - embed_size: int, - embed_dropout: float, - history_size: int = 2, - activation: str = "relu", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - assert check_argument_types() - super().__init__() - - assert history_size >= 0 - self.embed_size = embed_size - self.context_size = history_size + 1 - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.conv = nn.Conv1d(in_channels=embed_size, - out_channels=embed_size, - kernel_size=self.context_size, - padding=0, - groups=embed_size, - bias=bias) - self.norm = nn.LayerNorm(embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - assert method == "zero" - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device) - ] - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] - """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - input = input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - return out - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input = context_input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - - new_cache = context_input[:, 1:, :] - # TODO(Mddct): apply padding in future - return (out, [new_cache]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transducer/search/greedy_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transducer/search/greedy_search.py deleted file mode 100644 index ef7354562b6617b7be33bf32d673117eb1d3d547..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transducer/search/greedy_search.py +++ /dev/null @@ -1,54 
+0,0 @@
-from typing import List
-
-import torch
-
-
-def basic_greedy_search(
-    model: torch.nn.Module,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-    n_steps: int = 64,
-) -> List[List[int]]:
-    # fake padding
-    padding = torch.zeros(1, 1).to(encoder_out.device)
-    # sos
-    pred_input_step = torch.tensor([model.blank]).reshape(1, 1)
-    cache = model.predictor.init_state(1,
-                                       method="zero",
-                                       device=encoder_out.device)
-    new_cache: List[torch.Tensor] = []
-    t = 0
-    hyps = []
-    prev_out_nblk = True
-    pred_out_step = None
-    per_frame_max_noblk = n_steps
-    per_frame_noblk = 0
-    while t < encoder_out_lens:
-        encoder_out_step = encoder_out[:, t:t + 1, :]  # [1, 1, E]
-        if prev_out_nblk:
-            step_outs = model.predictor.forward_step(pred_input_step, padding,
-                                                     cache)  # [1, 1, P]
-            pred_out_step, new_cache = step_outs[0], step_outs[1]
-
-        joint_out_step = model.joint(encoder_out_step,
-                                     pred_out_step)  # [1,1,v]
-        joint_out_probs = joint_out_step.log_softmax(dim=-1)
-
-        joint_out_max = joint_out_probs.argmax(dim=-1).squeeze()  # []
-        if joint_out_max != model.blank:
-            hyps.append(joint_out_max.item())
-            prev_out_nblk = True
-            per_frame_noblk = per_frame_noblk + 1
-            pred_input_step = joint_out_max.reshape(1, 1)
-            # state_m, state_c = clstate_out_m, state_out_c
-            cache = new_cache
-
-        if joint_out_max == model.blank or per_frame_noblk >= per_frame_max_noblk:
-            if joint_out_max == model.blank:
-                prev_out_nblk = False
-            # TODO(Mddct): make t in chunk for streaming
-            # or t shouldn't be too long to predict non-blank
-            t = t + 1
-            per_frame_noblk = 0
-
-    return [hyps]
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transducer/search/prefix_beam_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transducer/search/prefix_beam_search.py
deleted file mode 100644
index f00917717c16a73916586708ebfede54fa02a21f..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transducer/search/prefix_beam_search.py
+++ /dev/null
@@ -1,148 +0,0 @@
-from typing import List, Tuple
-
-import torch
-from wenet.utils.common import log_add
-
-
-class Sequence():
-
-    __slots__ = {'hyp', 'score', 'cache'}
-
-    def __init__(
-        self,
-        hyp: List[torch.Tensor],
-        score,
-        cache: List[torch.Tensor],
-    ):
-        self.hyp = hyp
-        self.score = score
-        self.cache = cache
-
-
-class PrefixBeamSearch():
-
-    def __init__(self, encoder, predictor, joint, ctc, blank):
-        self.encoder = encoder
-        self.predictor = predictor
-        self.joint = joint
-        self.ctc = ctc
-        self.blank = blank
-
-    def forward_decoder_one_step(
-        self, encoder_x: torch.Tensor, pre_t: torch.Tensor,
-        cache: List[torch.Tensor]
-    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
-        padding = torch.zeros(pre_t.size(0), 1, device=encoder_x.device)
-        pre_t, new_cache = self.predictor.forward_step(pre_t.unsqueeze(-1),
-                                                       padding, cache)
-        x = self.joint(encoder_x, pre_t)  # [beam, 1, 1, vocab]
-        x = x.log_softmax(dim=-1)
-        return x, new_cache
-
-    def prefix_beam_search(self,
-                           speech: torch.Tensor,
-                           speech_lengths: torch.Tensor,
-                           decoding_chunk_size: int = -1,
-                           beam_size: int = 5,
-                           num_decoding_left_chunks: int = -1,
-                           simulate_streaming: bool = False,
-                           ctc_weight: float = 0.3,
-                           transducer_weight: float = 0.7):
-        """prefix beam search
-           also see wenet.transducer.transducer.beam_search
-        """
-        assert speech.shape[0] == speech_lengths.shape[0]
-        assert decoding_chunk_size != 0
-        device = speech.device
-        batch_size =
speech.shape[0] - assert batch_size == 1 - - # 1. Encoder - encoder_out, _ = self.encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - - ctc_probs = self.ctc.log_softmax(encoder_out).squeeze(0) - beam_init: List[Sequence] = [] - - # 2. init beam using Sequence to save beam unit - cache = self.predictor.init_state(1, method="zero", device=device) - beam_init.append(Sequence(hyp=[self.blank], score=0.0, cache=cache)) - # 3. start decoding (notice: we use breathwise first searching) - # !!!! In this decoding method: one frame do not output multi units. !!!! - # !!!! Experiments show that this strategy has little impact !!!! - for i in range(maxlen): - # 3.1 building input - # decoder taking the last token to predict the next token - input_hyp = [s.hyp[-1] for s in beam_init] - input_hyp_tensor = torch.tensor(input_hyp, - dtype=torch.int, - device=device) - # building statement from beam - cache_batch = self.predictor.cache_to_batch( - [s.cache for s in beam_init]) - # build score tensor to do torch.add() function - scores = torch.tensor([s.score for s in beam_init]).to(device) - - # 3.2 forward decoder - logp, new_cache = self.forward_decoder_one_step( - encoder_out[:, i, :].unsqueeze(1), - input_hyp_tensor, - cache_batch, - ) # logp: (N, 1, 1, vocab_size) - logp = logp.squeeze(1).squeeze(1) # logp: (N, vocab_size) - new_cache = self.predictor.batch_to_cache(new_cache) - - # 3.3 shallow fusion for transducer score - # and ctc score where we can also add the LM score - logp = torch.log( - torch.add(transducer_weight * torch.exp(logp), - ctc_weight * torch.exp(ctc_probs[i].unsqueeze(0)))) - - # 3.4 first beam prune - top_k_logp, top_k_index = logp.topk(beam_size) # (N, N) - scores = torch.add(scores.unsqueeze(1), top_k_logp) - - # 3.5 generate new beam (N*N) - beam_A = [] - for j in range(len(beam_init)): - # update seq - base_seq = beam_init[j] - for t in range(beam_size): - # blank: only update the score - if top_k_index[j, t] == self.blank: - new_seq = Sequence(hyp=base_seq.hyp.copy(), - score=scores[j, t].item(), - cache=base_seq.cache) - - beam_A.append(new_seq) - # other unit: update hyp score statement and last - else: - hyp_new = base_seq.hyp.copy() - hyp_new.append(top_k_index[j, t].item()) - new_seq = Sequence(hyp=hyp_new, - score=scores[j, t].item(), - cache=new_cache[j]) - beam_A.append(new_seq) - - # 3.6 prefix fusion - fusion_A = [beam_A[0]] - for j in range(1, len(beam_A)): - s1 = beam_A[j] - if_do_append = True - for t in range(len(fusion_A)): - # notice: A_ can not fusion with A - if s1.hyp == fusion_A[t].hyp: - fusion_A[t].score = log_add( - [fusion_A[t].score, s1.score]) - if_do_append = False - break - if if_do_append: - fusion_A.append(s1) - - # 4. 
second pruned - fusion_A.sort(key=lambda x: x.score, reverse=True) - beam_init = fusion_A[:beam_size] - - return beam_init, encoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transducer/transducer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transducer/transducer.py deleted file mode 100644 index 821a0946e621353a18bededbd93a658e83b0e0e2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transducer/transducer.py +++ /dev/null @@ -1,453 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchaudio -from torch import nn -from torch.nn.utils.rnn import pad_sequence -from typeguard import check_argument_types -from wenet.transducer.predictor import PredictorBase -from wenet.transducer.search.greedy_search import basic_greedy_search -from wenet.transducer.search.prefix_beam_search import PrefixBeamSearch -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_blank, add_sos_eos, - reverse_pad_list) - - -class Transducer(ASRModel): - """Transducer-ctc-attention hybrid Encoder-Predictor-Decoder model""" - - def __init__( - self, - vocab_size: int, - blank: int, - encoder: nn.Module, - predictor: PredictorBase, - joint: nn.Module, - attention_decoder: Optional[Union[TransformerDecoder, - BiTransformerDecoder]] = None, - ctc: Optional[CTC] = None, - ctc_weight: float = 0, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - transducer_weight: float = 1.0, - attention_weight: float = 0.0, - ) -> None: - assert check_argument_types() - assert attention_weight + ctc_weight + transducer_weight == 1.0 - super().__init__(vocab_size, encoder, attention_decoder, ctc, - ctc_weight, ignore_id, reverse_weight, lsm_weight, - length_normalized_loss) - - self.blank = blank - self.transducer_weight = transducer_weight - self.attention_decoder_weight = 1 - self.transducer_weight - self.ctc_weight - - self.predictor = predictor - self.joint = joint - self.bs = None - - # Note(Mddct): decoder also means predictor in transducer, - # but here decoder is attention decoder - del self.criterion_att - if attention_decoder is not None: - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + predictor + joint + loss - - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - - # Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - # predictor - ys_in_pad = add_blank(text, self.blank, self.ignore_id) - predictor_out = self.predictor(ys_in_pad) - # joint - joint_out = self.joint(encoder_out, predictor_out) - # NOTE(Mddct): some loss implementation require pad valid is zero - # torch.int32 rnnt_loss required - rnnt_text = text.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - rnnt_text_lengths = text_lengths.to(torch.int32) - encoder_out_lens = encoder_out_lens.to(torch.int32) - loss = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - encoder_out_lens, - rnnt_text_lengths, - blank=self.blank, - reduction="mean") - loss_rnnt = loss - - loss = self.transducer_weight * loss - # optional attention decoder - loss_att: Optional[torch.Tensor] = None - if self.attention_decoder_weight != 0.0 and self.decoder is not None: - loss_att, _ = self._calc_att_loss(encoder_out, encoder_mask, text, - text_lengths) - - # optional ctc - loss_ctc: Optional[torch.Tensor] = None - if self.ctc_weight != 0.0 and self.ctc is not None: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is not None: - loss = loss + self.ctc_weight * loss_ctc.sum() - if loss_att is not None: - loss = loss + self.attention_decoder_weight * loss_att.sum() - # NOTE: 'loss' must be in dict - return { - 'loss': loss, - 'loss_att': loss_att, - 'loss_ctc': loss_ctc, - 'loss_rnnt': loss_rnnt, - } - - def init_bs(self): - if self.bs is None: - self.bs = PrefixBeamSearch(self.encoder, self.predictor, - self.joint, self.ctc, self.blank) - - def _cal_transducer_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_lens: torch.Tensor, - hyps_pad: torch.Tensor, - ): - # ignore id -> blank, add blank at head - hyps_pad_blank = add_blank(hyps_pad, self.blank, self.ignore_id) - xs_in_lens = encoder_mask.squeeze(1).sum(1).int() - - # 1. Forward predictor - predictor_out = self.predictor(hyps_pad_blank) - # 2. Forward joint - joint_out = self.joint(encoder_out, predictor_out) - rnnt_text = hyps_pad.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - # 3. 
Compute transducer loss - loss_td = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - xs_in_lens, - hyps_lens.int(), - blank=self.blank, - reduction='none') - return loss_td * -1 - - def _cal_attn_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_pad: torch.Tensor, - hyps_lens: torch.Tensor, - ): - # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - - # td_score = loss_td * -1 - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - self.reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - return decoder_out, r_decoder_out - - def beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7, - ): - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight in transducer - prefix beam search. - final_prob = ctc_weight * ctc_prob + transducer_weight * transducer_prob - transducer_weight (float): transducer probability weight in - prefix beam search - Returns: - List[List[int]]: best path result - - """ - self.init_bs() - beam, _ = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size, - beam_size, - num_decoding_left_chunks, - simulate_streaming, - ctc_weight, - transducer_weight, - ) - return beam[0].hyp[1:], beam[0].score - - def transducer_attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ctc_weight: float = 0.0, - attn_weight: float = 0.0, - transducer_weight: float = 0.0, - search_ctc_weight: float = 1.0, - search_transducer_weight: float = 0.0, - beam_search_type: str = 'transducer') -> List[List[int]]: - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight using in rescoring. - rescore_prob = ctc_weight * ctc_prob + - transducer_weight * (transducer_loss * -1) + - attn_weight * attn_prob - attn_weight (float): attn probability weight using in rescoring. - transducer_weight (float): transducer probability weight using in - rescoring - search_ctc_weight (float): ctc weight using - in rnnt beam search (seeing in self.beam_search) - search_transducer_weight (float): transducer weight using - in rnnt beam search (seeing in self.beam_search) - Returns: - List[List[int]]: best path result - - """ - - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - self.init_bs() - if beam_search_type == 'transducer': - beam, encoder_out = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - beam_size=beam_size, - num_decoding_left_chunks=num_decoding_left_chunks, - ctc_weight=search_ctc_weight, - transducer_weight=search_transducer_weight, - ) - beam_score = [s.score for s in beam] - hyps = [s.hyp[1:] for s in beam] - - elif beam_search_type == 'ctc': - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, - speech_lengths, - beam_size=beam_size, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) - beam_score = [hyp[1] for hyp in hyps] - hyps = [hyp[0] for hyp in hyps] - assert len(hyps) == beam_size - - # build hyps and encoder output - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - - # 2.1 calculate transducer score - td_score = self._cal_transducer_score( - encoder_out, - encoder_mask, - hyps_lens, - hyps_pad, - ) - # 2.2 calculate attention score - decoder_out, r_decoder_out = self._cal_attn_score( - encoder_out, - encoder_mask, - hyps_pad, - hyps_lens, - ) - - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp)][self.eos] - td_s = td_score[i] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp): - r_score += r_decoder_out[i][len(hyp) - j - 1][w] - r_score += r_decoder_out[i][len(hyp)][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score = score * attn_weight + \ - beam_score[i] * ctc_weight + \ - td_s * transducer_weight - if score > best_score: - best_score = score - best_index = i - - return hyps[best_index], best_score - - def greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, 
- num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - n_steps: int = 64, - ) -> List[List[int]]: - """ greedy search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - # TODO(Mddct): batch decode - assert speech.size(0) == 1 - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - # TODO(Mddct): forward chunk by chunk - _ = simulate_streaming - # Let's assume B = batch_size - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size, - num_decoding_left_chunks, - ) - encoder_out_lens = encoder_mask.squeeze(1).sum() - hyps = basic_greedy_search(self, - encoder_out, - encoder_out_lens, - n_steps=n_steps) - - return hyps - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def forward_predictor_step( - self, xs: torch.Tensor, cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - assert len(cache) == 2 - # fake padding - padding = torch.zeros(1, 1) - return self.predictor.forward_step(xs, padding, cache) - - @torch.jit.export - def forward_joint_step(self, enc_out: torch.Tensor, - pred_out: torch.Tensor) -> torch.Tensor: - return self.joint(enc_out, pred_out) - - @torch.jit.export - def forward_predictor_init_state(self) -> List[torch.Tensor]: - return self.predictor.init_state(1, device=torch.device("cpu")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/asr_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/asr_model.py deleted file mode 100644 index 4288f68472d63ce4bf270c5f377d62fa7408713e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/asr_model.py +++ /dev/null @@ -1,904 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -from collections import defaultdict -from typing import Dict, List, Optional, Tuple - -import torch - -from torch.nn.utils.rnn import pad_sequence - -try: - import k2 - from icefall.utils import get_texts - from icefall.decode import get_lattice, Nbest, one_best_decoding -except ImportError: - print('Failed to import k2 and icefall. \ - Notice that they are necessary for hlg_onebest and hlg_rescore') - -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import TransformerEncoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, - remove_duplicates_and_blank, th_accuracy, - reverse_pad_list) -from wenet.utils.mask import (make_pad_mask, mask_finished_preds, - mask_finished_scores, subsequent_mask) - - -class ASRModel(torch.nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - def __init__( - self, - vocab_size: int, - encoder: TransformerEncoder, - decoder: TransformerDecoder, - ctc: CTC, - ctc_weight: float = 0.5, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - ): - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - - self.encoder = encoder - self.decoder = decoder - self.ctc = ctc - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - # 1. Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # 2a. Attention-decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths) - else: - loss_att = None - - # 2b. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is None: - loss = loss_att - elif loss_att is None: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + (1 - - self.ctc_weight) * loss_att - return {"loss": loss, "loss_att": loss_att, "loss_ctc": loss_ctc} - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, float]: - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # reverse the seq, used for right to left decoder - r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) - r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, - self.ignore_id) - # 1. Forward decoder - decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, - ys_in_pad, ys_in_lens, - r_ys_in_pad, - self.reverse_weight) - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - r_loss_att = torch.tensor(0.0) - if self.reverse_weight > 0.0: - r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * ( - 1 - self.reverse_weight) + r_loss_att * self.reverse_weight - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - return loss_att, acc_att - - def _forward_encoder( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Let's assume B = batch_size - # 1. Encoder - if simulate_streaming and decoding_chunk_size > 0: - encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( - speech, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - else: - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - return encoder_out, encoder_mask - - def recognize( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int = 10, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> torch.Tensor: - """ Apply beam search on attention decoder - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - torch.Tensor: decoding result, (batch, max_result_len) - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = torch.ones([running_size, 1], dtype=torch.long, - device=device).fill_(self.sos) # (B*N, 1) - scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), - dtype=torch.float) - scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) - cache: Optional[List[torch.Tensor]] = None - # 2. Decoder forward step by step - for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: - break - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) - logp, cache = self.decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - # 2.3 Second beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - # Update cache to be consistent with new topk scores / hyps - cache_index = (offset_k_index // beam_size).view(-1) # (B*N) - base_cache_index = (torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) * beam_size).view(-1) # (B*N) - cache_index = base_cache_index + cache_index - cache = [torch.index_select(c, dim=0, index=cache_index) for c in cache] - scores = scores.view(-1, 1) # (B*N, 1) - # 2.4. Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = torch.index_select(top_k_index.view(-1), - dim=-1, - index=best_k_index) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = torch.index_select( - hyps, dim=0, index=best_hyps_index) # (B*N, i) - hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) - - # 3. 
Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_scores, best_index = scores.max(dim=-1) - best_hyps_index = best_index + torch.arange( - batch_size, dtype=torch.long, device=device) * beam_size - best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) - best_hyps = best_hyps[:, 1:] - return best_hyps, best_scores - - def ctc_greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[List[int]]: - """ Apply CTC greedy search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # Let's assume B = batch_size - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - return hyps, scores - - def _ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[List[List[int]], torch.Tensor]: - """ CTC prefix beam search inner implementation - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[List[int]]: nbest results - torch.Tensor: encoder output, (1, max_len, encoder_dim), - it will be used for rescoring in attention rescoring mode - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # For CTC prefix beam search, we only support batch_size=1 - assert batch_size == 1 - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder forward and get CTC score - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - ctc_probs = ctc_probs.squeeze(0) - # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) - cur_hyps = [(tuple(), (0.0, -float('inf')))] - # 2. CTC beam search step by step - for t in range(0, maxlen): - logp = ctc_probs[t] # (vocab_size,) - # key: prefix, value (pb, pnb), default value(-inf, -inf) - next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) - # 2.1 First beam prune: select topk best - top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) - for s in top_k_index: - s = s.item() - ps = logp[s].item() - for prefix, (pb, pnb) in cur_hyps: - last = prefix[-1] if len(prefix) > 0 else None - if s == 0: # blank - n_pb, n_pnb = next_hyps[prefix] - n_pb = log_add([n_pb, pb + ps, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - elif s == last: - # Update *ss -> *s; - n_pb, n_pnb = next_hyps[prefix] - n_pnb = log_add([n_pnb, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - # Update *s-s -> *ss, - is for blank - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - else: - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - - # 2.2 Second beam prune - next_hyps = sorted(next_hyps.items(), - key=lambda x: log_add(list(x[1])), - reverse=True) - cur_hyps = next_hyps[:beam_size] - hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] - return hyps, encoder_out - - def ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[int]: - """ Apply CTC prefix beam search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[int]: CTC prefix beam search nbest results - """ - hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, - beam_size, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) - return hyps[0] - - def attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - ctc_weight: float = 0.0, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ) -> List[int]: - """ Apply attention rescoring decoding, CTC prefix beam search - is applied first to get nbest, then we resoring the nbest on - attention decoder with corresponding encoder out - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - reverse_weight (float): right to left decoder weight - ctc_weight (float): ctc score weight - - Returns: - List[int]: Attention rescoring result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, speech_lengths, beam_size, decoding_chunk_size, - num_decoding_left_chunks, simulate_streaming) - - assert len(hyps) == beam_size - hyps_pad = pad_sequence([ - torch.tensor(hyp[0], device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp[0]): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp[0])][self.eos] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp[0]): - r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] - r_score += r_decoder_out[i][len(hyp[0])][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score += hyp[1] * ctc_weight - if score > best_score: - best_score = score - best_index = i - return hyps[best_index][0], best_score - - def load_hlg_resource_if_necessary(self, hlg, word): - if not hasattr(self, 'hlg'): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.hlg = k2.Fsa.from_dict(torch.load(hlg, map_location=device)) - if not hasattr(self.hlg, "lm_scores"): - self.hlg.lm_scores = self.hlg.scores.clone() - if not hasattr(self, 'word_table'): - self.word_table = {} - with open(word, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - self.word_table[int(arr[1])] = arr[0] - - @torch.no_grad() - def hlg_onebest( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - best_path = one_best_decoding(lattice=lattice, use_double_scores=True) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.no_grad() - def hlg_rescore( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - lm_scale: float = 0, - decoder_scale: float = 0, - r_decoder_scale: float = 0, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - device = speech.device - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - 
search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=100, - use_double_scores=True, - nbest_scale=0.5,) - nbest = nbest.intersect(lattice) - assert hasattr(nbest.fsa, "lm_scores") - assert hasattr(nbest.fsa, "tokens") - assert isinstance(nbest.fsa.tokens, torch.Tensor) - - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.tokens) - tokens = tokens.remove_values_leq(0) - hyps = tokens.tolist() - - # cal attention_score - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out_repeat = [] - tot_scores = nbest.tot_scores() - repeats = [tot_scores[i].shape[0] for i in range(tot_scores.dim0)] - for i in range(len(encoder_out)): - encoder_out_repeat.append(encoder_out[i: i + 1].repeat(repeats[i], 1, 1)) - encoder_out = torch.concat(encoder_out_repeat, dim=0) - encoder_mask = torch.ones(encoder_out.size(0), - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - reverse_weight = 0.5 - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out - - decoder_scores = torch.tensor([sum([decoder_out[i, j, hyps[i][j]] - for j in range(len(hyps[i]))]) - for i in range(len(hyps))], device=device) - r_decoder_scores = [] - for i in range(len(hyps)): - score = 0 - for j in range(len(hyps[i])): - score += r_decoder_out[i, len(hyps[i]) - j - 1, hyps[i][j]] - score += r_decoder_out[i, len(hyps[i]), self.eos] - r_decoder_scores.append(score) - r_decoder_scores = torch.tensor(r_decoder_scores, device=device) - - am_scores = nbest.compute_am_scores() - ngram_lm_scores = nbest.compute_lm_scores() - tot_scores = am_scores.values + lm_scale * ngram_lm_scores.values + \ - decoder_scale * decoder_scores + r_decoder_scale * r_decoder_scores - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.jit.export - def subsampling_rate(self) -> int: - """ Export interface for c++ call, return subsampling_rate of the - model - """ - return self.encoder.embed.subsampling_rate - - @torch.jit.export - def right_context(self) -> int: - """ Export interface for c++ call, return right_context of the model - """ - return self.encoder.embed.right_context - - @torch.jit.export - def sos_symbol(self) -> int: - """ Export interface for c++ call, return sos symbol id of the model - """ - return self.sos - - @torch.jit.export - def eos_symbol(self) -> int: - """ Export interface for c++ call, return eos symbol id of the model - """ - return self.eos - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, give input chunk xs, and return - output from time 0 to current chunk. - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor: - """ Export interface for c++ call, apply linear transform and log - softmax before ctc - Args: - xs (torch.Tensor): encoder output - - Returns: - torch.Tensor: activation before ctc - - """ - return self.ctc.log_softmax(xs) - - @torch.jit.export - def is_bidirectional_decoder(self) -> bool: - """ - Returns: - torch.Tensor: decoder output - """ - if hasattr(self.decoder, 'right_decoder'): - return True - else: - return False - - @torch.jit.export - def forward_attention_decoder( - self, - hyps: torch.Tensor, - hyps_lens: torch.Tensor, - encoder_out: torch.Tensor, - reverse_weight: float = 0, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, forward decoder with multiple - hypothesis from ctc prefix beam search and one encoder output - Args: - hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad sos at the begining - hyps_lens (torch.Tensor): length of each hyp in hyps - encoder_out (torch.Tensor): corresponding encoder output - r_hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad eos at the begining which is used fo right to left decoder - reverse_weight: used for verfing whether used right to left decoder, - > 0 will use. - - Returns: - torch.Tensor: decoder output - """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps - encoder_out = encoder_out.repeat(num_hyps, 1, 1) - encoder_mask = torch.ones(num_hyps, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=encoder_out.device) - - # input for right to left decoder - # this hyps_lens has count token, we need minus it. - r_hyps_lens = hyps_lens - 1 - # this hyps has included token, so it should be - # convert the original hyps. - r_hyps = hyps[:, 1:] - # >>> r_hyps - # >>> tensor([[ 1, 2, 3], - # >>> [ 9, 8, 4], - # >>> [ 2, -1, -1]]) - # >>> r_hyps_lens - # >>> tensor([3, 3, 1]) - - # NOTE(Mddct): `pad_sequence` is not supported by ONNX, it is used - # in `reverse_pad_list` thus we have to refine the below code. 
- # Issue: https://github.com/wenet-e2e/wenet/issues/1113 - # Equal to: - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = torch.max(r_hyps_lens) - index_range = torch.arange(0, max_len, 1).to(encoder_out.device) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - # >>> seq_mask - # >>> tensor([[ True, True, True], - # >>> [ True, True, True], - # >>> [ True, False, False]]) - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - r_hyps = torch.gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = torch.where(seq_mask, r_hyps, self.eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - r_hyps = torch.cat([hyps[:, 0:1], r_hyps], dim=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps, hyps_lens, r_hyps, - reverse_weight) # (num_hyps, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - - # right to left decoder may be not used during decoding process, - # which depends on reverse_weight param. - # r_dccoder_out will be 0.0, if reverse_weight is 0.0 - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - return decoder_out, r_decoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/attention.py deleted file mode 100644 index 6ee5e313edf2e88a844ce004c0f819b0bd3260f6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/attention.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple - -import torch -from torch import nn - - -class MultiHeadedAttention(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, n_head: int, n_feat: int, dropout_rate: float): - """Construct an MultiHeadedAttention object.""" - super().__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward_qkv( - self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor, size - (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor, size - (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor, size - (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. 
- - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - 1.When applying cross attention between decoder and encoder, - the batch padding mask for input is in (#batch, 1, T) shape. - 2.When applying self attention of encoder, - the mask is in (#batch, T, T) shape. - 3.When applying self attention of decoder, - the mask is in (#batch, L, L) shape. - 4.If the different position in decoder see different block - of the encoder, such as Mocha, the passed in mask could be - in (#batch, L, T) shape. But there is no such case in current - Wenet. - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - """ - q, k, v = self.forward_qkv(query, key, value) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. - new_cache = torch.cat((k, v), dim=-1) - - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - return self.forward_attention(v, scores, mask), new_cache - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). 
- zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/cmvn.py deleted file mode 100644 index 3a1e7457fd3788d9a7e031e96517505a65925102..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/cmvn.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -class GlobalCMVN(torch.nn.Module): - def __init__(self, - mean: torch.Tensor, - istd: torch.Tensor, - norm_var: bool = True): - """ - Args: - mean (torch.Tensor): mean stats - istd (torch.Tensor): inverse std, std which is 1.0 / std - """ - super().__init__() - assert mean.shape == istd.shape - self.norm_var = norm_var - # The buffer can be accessed from this module using self.mean - self.register_buffer("mean", mean) - self.register_buffer("istd", istd) - - def forward(self, x: torch.Tensor): - """ - Args: - x (torch.Tensor): (batch, max_len, feat_dim) - - Returns: - (torch.Tensor): normalized feature - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/convolution.py deleted file mode 100644 index 2cf9794e14ea7441ccd30ab52202ac02fb25c2b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/convolution.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). 
- """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. - new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/ctc.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/ctc.py deleted file mode 100644 index 3dfcbaa324ffc26afa9ceaeb75007eb312546326..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/ctc.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -import torch -import torch.nn.functional as F -from typeguard import check_argument_types - - -class CTC(torch.nn.Module): - """CTC module""" - def __init__( - self, - odim: int, - encoder_output_size: int, - dropout_rate: float = 0.0, - reduce: bool = True, - ): - """ Construct CTC module - Args: - odim: dimension of outputs - encoder_output_size: number of encoder projection units - dropout_rate: dropout rate (0.0 ~ 1.0) - reduce: reduce the CTC loss into a scalar - """ - assert check_argument_types() - super().__init__() - eprojs = encoder_output_size - self.dropout_rate = dropout_rate - self.ctc_lo = torch.nn.Linear(eprojs, odim) - - reduction_type = "sum" if reduce else "none" - self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) - - def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, - ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: - """Calculate CTC loss. 
- - Args: - hs_pad: batch of padded hidden state sequences (B, Tmax, D) - hlens: batch of lengths of hidden state sequences (B) - ys_pad: batch of padded character id sequence tensor (B, Lmax) - ys_lens: batch of lengths of character sequence (B) - """ - # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) - ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) - # ys_hat: (B, L, D) -> (L, B, D) - ys_hat = ys_hat.transpose(0, 1) - ys_hat = ys_hat.log_softmax(2) - loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) - # Batch-size average - loss = loss / ys_hat.size(1) - return loss - - def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """log_softmax of frame activations - - Args: - Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) - """ - return F.log_softmax(self.ctc_lo(hs_pad), dim=2) - - def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """argmax of frame activations - - Args: - torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: argmax applied 2d tensor (B, Tmax) - """ - return torch.argmax(self.ctc_lo(hs_pad), dim=2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/decoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/decoder.py deleted file mode 100644 index c31853d9e868c99290b8d597f53d9a680202c82c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/decoder.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Decoder definition.""" -from typing import Tuple, List, Optional - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.decoder_layer import DecoderLayer -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.utils.mask import (subsequent_mask, make_pad_mask) - - -class TransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. 
- concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__() - attention_dim = encoder_output_size - - if input_layer == "embed": - self.embed = torch.nn.Sequential( - torch.nn.Embedding(vocab_size, attention_dim), - PositionalEncoding(attention_dim, positional_dropout_rate), - ) - else: - raise ValueError(f"only 'embed' is supported: {input_layer}") - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) - self.use_output_layer = use_output_layer - self.output_layer = torch.nn.Linear(attention_dim, vocab_size) - self.num_blocks = num_blocks - self.decoders = torch.nn.ModuleList([ - DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(self.num_blocks) - ]) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor = torch.empty(0), - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: not used in transformer decoder, in order to unify api - with bidirectional decoder - reverse_weight: not used in transformer decoder, in order to unify - api with bidirectional decode - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - torch.tensor(0.0), in order to unify api with bidirectional decoder - olens: (batch, ) - """ - tgt = ys_in_pad - maxlen = tgt.size(1) - # tgt_mask: (B, 1, L) - tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) - tgt_mask = tgt_mask.to(tgt.device) - # m: (1, L, L) - m = subsequent_mask(tgt_mask.size(-1), - device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m - x, _ = self.embed(tgt) - for layer in self.decoders: - x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, - memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.use_output_layer: - x = self.output_layer(x) - olens = tgt_mask.sum(1) - return x, torch.tensor(0.0), olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - x, _ = self.embed(tgt) - new_cache = [] - for i, decoder in enumerate(self.decoders): - if cache is None: - c = None - else: - c = cache[i] - x, tgt_mask, memory, memory_mask = decoder(x, - tgt_mask, - memory, - memory_mask, - cache=c) - new_cache.append(x) - if self.normalize_before: - y = self.after_norm(x[:, -1]) - else: - y = x[:, -1] - if self.use_output_layer: - y = torch.log_softmax(self.output_layer(y), dim=-1) - return y, new_cache - - -class BiTransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - r_num_blocks: the number of right to left decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - r_num_blocks: int = 0, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - - assert check_argument_types() - super().__init__() - self.left_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - self.right_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - r_num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor, - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), - used for right to left decoder - reverse_weight: used for right to left decoder - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - r_x: x: decoded token score (right to left decoder) - before softmax (batch, maxlen_out, vocab_size) - if use_output_layer is True, - olens: (batch, ) - """ - l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, - ys_in_lens) - r_x = torch.tensor(0.0) - if reverse_weight > 0.0: - r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, - ys_in_lens) - return l_x, r_x, olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - return self.left_decoder.forward_one_step(memory, memory_mask, tgt, - tgt_mask, cache) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/decoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/decoder_layer.py deleted file mode 100644 index 6b52aa6ab730dc51b18f0787e8236ab10c1e9cad..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/decoder_layer.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoder self-attention layer definition.""" -from typing import Optional, Tuple - -import torch -from torch import nn - - -class DecoderLayer(nn.Module): - """Single decoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn (torch.nn.Module): Inter-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. 
-        dropout_rate (float): Dropout rate.
-        normalize_before (bool):
-            True: use layer_norm before each sub-block.
-            False: use layer_norm after each sub-block.
-        concat_after (bool): Whether to concat attention layer's input
-            and output.
-            True: x -> x + linear(concat(x, att(x)))
-            False: x -> x + att(x)
-    """
-    def __init__(
-        self,
-        size: int,
-        self_attn: nn.Module,
-        src_attn: nn.Module,
-        feed_forward: nn.Module,
-        dropout_rate: float,
-        normalize_before: bool = True,
-        concat_after: bool = False,
-    ):
-        """Construct a DecoderLayer object."""
-        super().__init__()
-        self.size = size
-        self.self_attn = self_attn
-        self.src_attn = src_attn
-        self.feed_forward = feed_forward
-        self.norm1 = nn.LayerNorm(size, eps=1e-5)
-        self.norm2 = nn.LayerNorm(size, eps=1e-5)
-        self.norm3 = nn.LayerNorm(size, eps=1e-5)
-        self.dropout = nn.Dropout(dropout_rate)
-        self.normalize_before = normalize_before
-        self.concat_after = concat_after
-        if self.concat_after:
-            self.concat_linear1 = nn.Linear(size + size, size)
-            self.concat_linear2 = nn.Linear(size + size, size)
-        else:
-            self.concat_linear1 = nn.Identity()
-            self.concat_linear2 = nn.Identity()
-
-    def forward(
-        self,
-        tgt: torch.Tensor,
-        tgt_mask: torch.Tensor,
-        memory: torch.Tensor,
-        memory_mask: torch.Tensor,
-        cache: Optional[torch.Tensor] = None
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Compute decoded features.
-
-        Args:
-            tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
-            tgt_mask (torch.Tensor): Mask for input tensor
-                (#batch, maxlen_out).
-            memory (torch.Tensor): Encoded memory
-                (#batch, maxlen_in, size).
-            memory_mask (torch.Tensor): Encoded memory mask
-                (#batch, maxlen_in).
-            cache (torch.Tensor): cached tensors.
-                (#batch, maxlen_out - 1, size).
-
-        Returns:
-            torch.Tensor: Output tensor (#batch, maxlen_out, size).
-            torch.Tensor: Mask for output tensor (#batch, maxlen_out).
-            torch.Tensor: Encoded memory (#batch, maxlen_in, size).
-            torch.Tensor: Encoded memory mask (#batch, maxlen_in).
- - """ - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - - if cache is None: - tgt_q = tgt - tgt_q_mask = tgt_mask - else: - # compute only the last frame query keeping dim: max_time_out -> 1 - assert cache.shape == ( - tgt.shape[0], - tgt.shape[1] - 1, - self.size, - ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" - tgt_q = tgt[:, -1:, :] - residual = residual[:, -1:, :] - tgt_q_mask = tgt_mask[:, -1:, :] - - if self.concat_after: - tgt_concat = torch.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) - x = residual + self.concat_linear1(tgt_concat) - else: - x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - if self.concat_after: - x_concat = torch.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) - x = residual + self.concat_linear2(x_concat) - else: - x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) - if not self.normalize_before: - x = self.norm2(x) - - residual = x - if self.normalize_before: - x = self.norm3(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm3(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, tgt_mask, memory, memory_mask diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/embedding.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/embedding.py deleted file mode 100644 index 611a927864d93c3ad8357f66c780bf537b2a4d67..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/embedding.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Positonal Encoding Module.""" - -import math -from typing import Tuple, Union - -import torch -import torch.nn.functional as F - -class PositionalEncoding(torch.nn.Module): - """Positional encoding. 
- - :param int d_model: embedding dim - :param float dropout_rate: dropout rate - :param int max_len: maximum input length - - PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) - PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) - """ - def __init__(self, - d_model: int, - dropout_rate: float, - max_len: int = 5000, - reverse: bool = False): - """Construct an PositionalEncoding object.""" - super().__init__() - self.d_model = d_model - self.xscale = math.sqrt(self.d_model) - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - - self.pe = torch.zeros(self.max_len, self.d_model) - position = torch.arange(0, self.max_len, - dtype=torch.float32).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = torch.sin(position * div_term) - self.pe[:, 1::2] = torch.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input. Its shape is (batch, time, ...) - offset (int, torch.tensor): position offset - - Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) - torch.Tensor: for compatibility to RelPositionalEncoding - """ - - self.pe = self.pe.to(x.device) - pos_emb = self.position_encoding(offset, x.size(1), False) - x = x * self.xscale + pos_emb - return self.dropout(x), self.dropout(pos_emb) - - def position_encoding(self, offset: Union[int, torch.Tensor], size: int, - apply_dropout: bool = True) -> torch.Tensor: - """ For getting encoding in a streaming fashion - - Attention!!!!! - we apply dropout only once at the whole utterance level in a none - streaming way, but will call this function several times with - increasing input size in a streaming scenario, so the dropout will - be applied several times. - - Args: - offset (int or torch.tensor): start offset - size (int): required size of position encoding - - Returns: - torch.Tensor: Corresponding encoding - """ - # How to subscript a Union type: - # https://github.com/pytorch/pytorch/issues/69434 - if isinstance(offset, int): - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - else: # for batched streaming decoding on GPU - assert torch.max(offset) + size < self.max_len - index = offset.unsqueeze(1) + \ - torch.arange(0, size).to(offset.device) # B X T - flag = index > 0 - # remove negative offset - index = index * flag - pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model - - if apply_dropout: - pos_emb = self.dropout(pos_emb) - return pos_emb - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding module. - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - """ - def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): - """Initialize class.""" - super().__init__(d_model, dropout_rate, max_len, reverse=True) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Compute positional encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). 
- Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Positional embedding tensor (1, time, `*`). - """ - self.pe = self.pe.to(x.device) - x = x * self.xscale - pos_emb = self.position_encoding(offset, x.size(1), False) - return self.dropout(x), self.dropout(pos_emb) - - -class NoPositionalEncoding(torch.nn.Module): - """ No position encoding - """ - def __init__(self, d_model: int, dropout_rate: float): - super().__init__() - self.d_model = d_model - self.dropout = torch.nn.Dropout(p=dropout_rate) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """ Just return zero vector for interface compatibility - """ - pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) - return self.dropout(x), pos_emb - - def position_encoding( - self, offset: Union[int, torch.Tensor], size: int) -> torch.Tensor: - return torch.zeros(1, size, self.d_model) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/encoder.py deleted file mode 100644 index bb2ec65827548bd1242cb3b367cb3983c2de6119..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/encoder.py +++ /dev/null @@ -1,462 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder definition.""" -from typing import Tuple - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.convolution import ConvolutionModule -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.encoder_layer import TransformerEncoderLayer -from wenet.transformer.encoder_layer import ConformerEncoderLayer -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class BaseEncoder(torch.nn.Module): - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ - Args: - input_size (int): input dim - output_size (int): dimension of attention - attention_heads (int): the number of heads of multi head attention - linear_units (int): the hidden units number of position-wise feed - forward - num_blocks (int): the number of decoder blocks - dropout_rate (float): dropout rate - attention_dropout_rate (float): dropout rate in attention - positional_dropout_rate (float): dropout rate after adding - positional encoding - input_layer (str): input layer type. - optional [linear, conv2d, conv2d6, conv2d8] - pos_enc_layer_type (str): Encoder positional encoding layer type. - opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] - normalize_before (bool): - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after (bool): whether to concat attention layer's input - and output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - static_chunk_size (int): chunk size for static chunk training and - decoding - use_dynamic_chunk (bool): whether use dynamic chunk size for - training or not, You can only use fixed chunk(chunk_size > 0) - or dyanmic chunk size(use_dynamic_chunk = True) - global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module - use_dynamic_left_chunk (bool): whether use dynamic left chunk in - dynamic chunk training - """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks - - -class TransformerEncoder(BaseEncoder): - """Transformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ Construct TransformerEncoder - - See Encoder for the meaning of each parameter. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - self.encoders = torch.nn.ModuleList([ - TransformerEncoderLayer( - output_size, - MultiHeadedAttention(attention_heads, output_size, - attention_dropout_rate), - PositionwiseFeedForward(output_size, linear_units, - dropout_rate), dropout_rate, - normalize_before, concat_after) for _ in range(num_blocks) - ]) - - -class ConformerEncoder(BaseEncoder): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - positionwise_conv_kernel_size: int = 1, - macaron_style: bool = True, - selfattention_layer_type: str = "rel_selfattn", - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - ): - """Construct ConformerEncoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - positionwise_conv_kernel_size (int): Kernel size of positionwise - conv1d layer. - macaron_style (bool): Whether to use macaron style for - positionwise layer. - selfattention_layer_type (str): Encoder attention layer type, - the parameter has no effect now, it's just for configure - compatibility. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, - cnn_module_norm, causal) - - self.encoders = torch.nn.ModuleList([ - ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(num_blocks) - ]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/encoder_layer.py deleted file mode 100644 index 6b4629a6802a90422fa1494f82f46488f2553c16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/encoder_layer.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple - -import torch -from torch import nn - - -class TransformerEncoderLayer(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: torch.nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): just for interface compatibility - to ConformerEncoderLayer - mask_pad (torch.Tensor): does not used in transformer layer, - just for unified api with conformer. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2), not used here, it's for interface - compatibility to ConformerEncoderLayer. - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). - - """ - residual = x - if self.normalize_before: - x = self.norm1(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, cache=att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - return x, mask, new_att_cache, fake_cnn_cache - - -class ConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/label_smoothing_loss.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/label_smoothing_loss.py deleted file mode 100644 index 428fedcb0eb4345cd1361c97008a9afcd94ac171..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/label_smoothing_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Label smoothing module.""" - -import torch -from torch import nn - - -class LabelSmoothingLoss(nn.Module): - """Label-smoothing loss. - - In a standard CE loss, the label's data distribution is: - [0,1,2] -> - [ - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0], - ] - - In the smoothing version CE Loss,some probabilities - are taken from the true label prob (1.0) and are divided - among other labels. - - e.g. 
- smoothing=0.1 - [0,1,2] -> - [ - [0.9, 0.05, 0.05], - [0.05, 0.9, 0.05], - [0.05, 0.05, 0.9], - ] - - Args: - size (int): the number of class - padding_idx (int): padding class id which will be ignored for loss - smoothing (float): smoothing rate (0.0 means the conventional CE) - normalize_length (bool): - normalize loss by sequence length if True - normalize loss by batch size if False - """ - def __init__(self, - size: int, - padding_idx: int, - smoothing: float, - normalize_length: bool = False): - """Construct an LabelSmoothingLoss object.""" - super(LabelSmoothingLoss, self).__init__() - self.criterion = nn.KLDivLoss(reduction="none") - self.padding_idx = padding_idx - self.confidence = 1.0 - smoothing - self.smoothing = smoothing - self.size = size - self.normalize_length = normalize_length - - def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """Compute loss between x and target. - - The model outputs and data labels tensors are flatten to - (batch*seqlen, class) shape and a mask is applied to the - padding part which should not be calculated for loss. - - Args: - x (torch.Tensor): prediction (batch, seqlen, class) - target (torch.Tensor): - target signal masked with self.padding_id (batch, seqlen) - Returns: - loss (torch.Tensor) : The KL loss, scalar float value - """ - assert x.size(2) == self.size - batch_size = x.size(0) - x = x.view(-1, self.size) - target = target.view(-1) - # use zeros_like instead of torch.no_grad() for true_dist, - # since no_grad() can not be exported by JIT - true_dist = torch.zeros_like(x) - true_dist.fill_(self.smoothing / (self.size - 1)) - ignore = target == self.padding_idx # (B,) - total = len(target) - ignore.sum().item() - target = target.masked_fill(ignore, 0) # avoid -1 index - true_dist.scatter_(1, target.unsqueeze(1), self.confidence) - kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) - denom = total if self.normalize_length else batch_size - return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/positionwise_feed_forward.py deleted file mode 100644 index 73ba239e3f1e68f65650961f2c4ee6758729a06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/positionwise_feed_forward.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. 
- dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU()): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/subsampling.py deleted file mode 100644 index 5f2823eedf0e623188d6af6680fa50ca44b47877..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/subsampling.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch - - -class BaseSubsampling(torch.nn.Module): - def __init__(self): - super().__init__() - self.right_context = 0 - self.subsampling_rate = 1 - - def position_encoding(self, offset: Union[int, torch.Tensor], - size: int) -> torch.Tensor: - return self.pos_enc.position_encoding(offset, size) - - -class LinearNoSubsampling(BaseSubsampling): - """Linear transform the input without subsampling - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an linear object.""" - super().__init__() - self.out = torch.nn.Sequential( - torch.nn.Linear(idim, odim), - torch.nn.LayerNorm(odim, eps=1e-5), - torch.nn.Dropout(dropout_rate), - ) - self.pos_enc = pos_enc_class - self.right_context = 0 - self.subsampling_rate = 1 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Input x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: linear input tensor (#batch, time', odim), - where time' = time . - torch.Tensor: linear input mask (#batch, 1, time'), - where time' = time . - - """ - x = self.out(x) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask - - -class Conv2dSubsampling4(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). 
- - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 4. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 4. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] - - -class Conv2dSubsampling6(BaseSubsampling): - """Convolutional 2D subsampling (to 1/6 length). - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (torch.nn.Module): Custom position encoding layer. - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling6 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 5, 3), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), - odim) - self.pos_enc = pos_enc_class - # 10 = (3 - 1) * 1 + (5 - 1) * 2 - self.subsampling_rate = 6 - self.right_context = 10 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 6. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 6. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] - - -class Conv2dSubsampling8(BaseSubsampling): - """Convolutional 2D subsampling (to 1/8 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling8 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear( - odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) - self.pos_enc = pos_enc_class - self.subsampling_rate = 8 - # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 - self.right_context = 14 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 8. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 8. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/swish.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/swish.py deleted file mode 100644 index b4250f5c93104f38958d145572e363256e03fcb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/transformer/swish.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) -# 2020 Northwestern Polytechnical University (Pengcheng Guo) -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Swish() activation function for Conformer.""" - -import torch - - -class Swish(torch.nn.Module): - """Construct an Swish object.""" - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Return Swish activation function.""" - return x * torch.sigmoid(x) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/checkpoint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/checkpoint.py deleted file mode 100644 index 8e0c413c79c34cd667240357d7ef9eab816a885c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/checkpoint.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import re - -import yaml -import torch -from collections import OrderedDict - -import datetime - - -def load_checkpoint(model: torch.nn.Module, path: str) -> dict: - if torch.cuda.is_available(): - logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) - checkpoint = torch.load(path) - else: - logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) - checkpoint = torch.load(path, map_location='cpu') - model.load_state_dict(checkpoint, strict=False) - info_path = re.sub('.pt$', '.yaml', path) - configs = {} - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - return configs - - -def save_checkpoint(model: torch.nn.Module, path: str, infos=None): - ''' - Args: - infos (dict or None): any info you want to save. - ''' - logging.info('Checkpoint: save to checkpoint %s' % path) - if isinstance(model, torch.nn.DataParallel): - state_dict = model.module.state_dict() - elif isinstance(model, torch.nn.parallel.DistributedDataParallel): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save(state_dict, path) - info_path = re.sub('.pt$', '.yaml', path) - if infos is None: - infos = {} - infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') - with open(info_path, 'w') as fout: - data = yaml.dump(infos) - fout.write(data) - - -def filter_modules(model_state_dict, modules): - new_mods = [] - incorrect_mods = [] - mods_model = model_state_dict.keys() - for mod in modules: - if any(key.startswith(mod) for key in mods_model): - new_mods += [mod] - else: - incorrect_mods += [mod] - if incorrect_mods: - logging.warning( - "module(s) %s don't match or (partially match) " - "available modules in model.", - incorrect_mods, - ) - logging.warning("for information, the existing modules in model are:") - logging.warning("%s", mods_model) - - return new_mods - - -def load_trained_modules(model: torch.nn.Module, args: None): - # Load encoder modules with pre-trained model(s). 
- enc_model_path = args.enc_init - enc_modules = args.enc_init_mods - main_state_dict = model.state_dict() - logging.warning("model(s) found for pre-initialization") - if os.path.isfile(enc_model_path): - logging.info('Checkpoint: loading from checkpoint %s for CPU' % - enc_model_path) - model_state_dict = torch.load(enc_model_path, map_location='cpu') - modules = filter_modules(model_state_dict, enc_modules) - partial_state_dict = OrderedDict() - for key, value in model_state_dict.items(): - if any(key.startswith(m) for m in modules): - partial_state_dict[key] = value - main_state_dict.update(partial_state_dict) - else: - logging.warning("model was not found : %s", enc_model_path) - - model.load_state_dict(main_state_dict) - configs = {} - return configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/cmvn.py deleted file mode 100644 index 3101c619f54991c947124f393f3459c317356a2f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/cmvn.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import math - -import numpy as np - - -def _load_json_cmvn(json_cmvn_file): - """ Load the json format cmvn stats file and calculate cmvn - - Args: - json_cmvn_file: cmvn stats file in json format - - Returns: - a numpy array of [means, vars] - """ - with open(json_cmvn_file) as f: - cmvn_stats = json.load(f) - - means = cmvn_stats['mean_stat'] - variance = cmvn_stats['var_stat'] - count = cmvn_stats['frame_num'] - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def _load_kaldi_cmvn(kaldi_cmvn_file): - """ Load the kaldi format cmvn stats file and calculate cmvn - - Args: - kaldi_cmvn_file: kaldi text style global cmvn file, which - is generated by: - compute-cmvn-stats --binary=false scp:feats.scp global_cmvn - - Returns: - a numpy array of [means, vars] - """ - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def load_cmvn(cmvn_file, is_json): - if is_json: - cmvn = _load_json_cmvn(cmvn_file) - else: - cmvn = _load_kaldi_cmvn(cmvn_file) - return cmvn[0], cmvn[1] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/common.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/common.py deleted file mode 100644 index 74238d59aefbf227fe6b811703af17550bc7f8f0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/common.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -"""Unility functions for Transformer.""" - -import math -from typing import List, Tuple - -import torch -from torch.nn.utils.rnn import pad_sequence - -IGNORE_ID = -1 - - -def pad_list(xs: List[torch.Tensor], pad_value: int): - """Perform padding for the list of tensors. - - Args: - xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 
- pad_value (float): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tmax, `*`). - - Examples: - >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) - - """ - n_batch = len(xs) - max_len = max([x.size(0) for x in xs]) - pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) - pad = pad.fill_(pad_value) - for i in range(n_batch): - pad[i, :xs[i].size(0)] = xs[i] - - return pad - - -def add_blank(ys_pad: torch.Tensor, blank: int, - ignore_id: int) -> torch.Tensor: - """ Prepad blank for transducer predictor - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - blank (int): index of - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> blank = 0 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in = add_blank(ys_pad, 0, -1) - >>> ys_in - tensor([[0, 1, 2, 3, 4, 5], - [0, 4, 5, 6, 0, 0], - [0, 7, 8, 9, 0, 0]]) - """ - bs = ys_pad.size(0) - _blank = torch.tensor([blank], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _blank = _blank.repeat(bs).unsqueeze(1) # [bs,1] - out = torch.cat([_blank, ys_pad], dim=1) # [bs, Lmax+1] - return torch.where(out == ignore_id, blank, out) - - -def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, - ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Add and labels. - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - sos (int): index of - eos (int): index of - ignore_id (int): index of padding - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - ys_out (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> sos_id = 10 - >>> eos_id = 11 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) - >>> ys_in - tensor([[10, 1, 2, 3, 4, 5], - [10, 4, 5, 6, 11, 11], - [10, 7, 8, 9, 11, 11]]) - >>> ys_out - tensor([[ 1, 2, 3, 4, 5, 11], - [ 4, 5, 6, 11, -1, -1], - [ 7, 8, 9, 11, -1, -1]]) - """ - _sos = torch.tensor([sos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _eos = torch.tensor([eos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys - ys_in = [torch.cat([_sos, y], dim=0) for y in ys] - ys_out = [torch.cat([y, _eos], dim=0) for y in ys] - return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) - - -def reverse_pad_list(ys_pad: torch.Tensor, - ys_lens: torch.Tensor, - pad_value: float = -1.0) -> torch.Tensor: - """Reverse padding for the list of tensors. - - Args: - ys_pad (tensor): The padded tensor (B, Tokenmax). - ys_lens (tensor): The lens of token seqs (B) - pad_value (int): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tokenmax). - - Examples: - >>> x - tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) - >>> pad_list(x, 0) - tensor([[4, 3, 2, 1], - [7, 6, 5, 0], - [9, 8, 0, 0]]) - - """ - r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) - for y, i in zip(ys_pad, ys_lens)], True, - pad_value) - return r_ys_pad - - -def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, - ignore_label: int) -> float: - """Calculate accuracy. - - Args: - pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 
- pad_targets (LongTensor): Target label tensors (B, Lmax). - ignore_label (int): Ignore label id. - - Returns: - float: Accuracy value (0.0 - 1.0). - - """ - pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), - pad_outputs.size(1)).argmax(2) - mask = pad_targets != ignore_label - numerator = torch.sum( - pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - denominator = torch.sum(mask) - return float(numerator) / float(denominator) - - -def get_rnn(rnn_type: str) -> torch.nn.Module: - assert rnn_type in ["rnn", "lstm", "gru"] - if rnn_type == "rnn": - return torch.nn.RNN - elif rnn_type == "lstm": - return torch.nn.LSTM - else: - return torch.nn.GRU - - -def get_activation(act): - """Return activation function.""" - # Lazy load to avoid unused import - from wenet.transformer.swish import Swish - - activation_funcs = { - "hardtanh": torch.nn.Hardtanh, - "tanh": torch.nn.Tanh, - "relu": torch.nn.ReLU, - "selu": torch.nn.SELU, - "swish": getattr(torch.nn, "SiLU", Swish), - "gelu": torch.nn.GELU - } - - return activation_funcs[act]() - - -def get_subsample(config): - input_layer = config["encoder_conf"]["input_layer"] - assert input_layer in ["conv2d", "conv2d6", "conv2d8"] - if input_layer == "conv2d": - return 4 - elif input_layer == "conv2d6": - return 6 - elif input_layer == "conv2d8": - return 8 - - -def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - if hyp[cur] != 0: - new_hyp.append(hyp[cur]) - prev = cur - while cur < len(hyp) and hyp[cur] == hyp[prev]: - cur += 1 - return new_hyp - - -def replace_duplicates_with_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - new_hyp.append(hyp[cur]) - prev = cur - cur += 1 - while cur < len(hyp) and hyp[cur] == hyp[prev] and hyp[cur] != 0: - new_hyp.append(0) - cur += 1 - return new_hyp - - -def log_add(args: List[int]) -> float: - """ - Stable log add - """ - if all(a == -float('inf') for a in args): - return -float('inf') - a_max = max(args) - lsp = math.log(sum(math.exp(a - a_max) for a in args)) - return a_max + lsp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/config.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/config.py deleted file mode 100644 index 50170ced44534d3ee6532a2f87fcd78c5148f7e7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 Shaoshang Qi -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import copy - -def override_config(configs, override_list): - new_configs = copy.deepcopy(configs) - for item in override_list: - arr = item.split() - if len(arr) != 2: - print(f"the overrive {item} format not correct, skip it") - continue - keys = arr[0].split('.') - s_configs = new_configs - for i, key in enumerate(keys): - if key not in s_configs: - print(f"the overrive {item} format not correct, skip it") - if i == len(keys) - 1: - param_type = type(s_configs[key]) - if param_type != bool: - s_configs[key] = param_type(arr[1]) - else: - s_configs[key] = arr[1] in ['true', 'True'] - print(f"override {arr[0]} with {arr[1]}") - else: - s_configs = s_configs[key] - return new_configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/ctc_util.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/ctc_util.py deleted file mode 100644 index 73b8fb272ac153dd6d05207f352ebcf1ad14890d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/ctc_util.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch - -def insert_blank(label, blank_id=0): - """Insert blank token between every two label token.""" - label = np.expand_dims(label, 1) - blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id - label = np.concatenate([blanks, label], axis=1) - label = label.reshape(-1) - label = np.append(label, label[0]) - return label - -def forced_align(ctc_probs: torch.Tensor, - y: torch.Tensor, - blank_id=0) -> list: - """ctc forced alignment. 
- - Args: - torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) - torch.Tensor y: id sequence tensor 1d tensor (L) - int blank_id: blank symbol index - Returns: - torch.Tensor: alignment result - """ - y_insert_blank = insert_blank(y, blank_id) - - log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) - log_alpha = log_alpha - float('inf') # log of zero - state_path = (torch.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 - ) # state path - - # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] - - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): - if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ - s] == y_insert_blank[s - 2]: - candidates = torch.tensor( - [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) - prev_state = [s, s - 1] - else: - candidates = torch.tensor([ - log_alpha[t - 1, s], - log_alpha[t - 1, s - 1], - log_alpha[t - 1, s - 2], - ]) - prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] - state_path[t, s] = prev_state[torch.argmax(candidates)] - - state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) - - candidates = torch.tensor([ - log_alpha[-1, len(y_insert_blank) - 1], - log_alpha[-1, len(y_insert_blank) - 2] - ]) - prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] - state_seq[-1] = prev_state[torch.argmax(candidates)] - for t in range(ctc_probs.size(0) - 2, -1, -1): - state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] - - output_alignment = [] - for t in range(0, ctc_probs.size(0)): - output_alignment.append(y_insert_blank[state_seq[t, 0]]) - - return output_alignment diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/executor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/executor.py deleted file mode 100644 index dc0b69e6e32055566a0e8c41945f6979276e5672..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/executor.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -from contextlib import nullcontext - -# if your python version < 3.7 use the below one -# from contextlib import suppress as nullcontext -import torch -from torch.nn.utils import clip_grad_norm_ - - -class Executor: - - def __init__(self): - self.step = 0 - - def train(self, model, optimizer, scheduler, data_loader, device, writer, - args, scaler): - ''' Train one epoch - ''' - model.train() - clip = args.get('grad_clip', 50.0) - log_interval = args.get('log_interval', 10) - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - accum_grad = args.get('accum_grad', 1) - is_distributed = args.get('is_distributed', True) - use_amp = args.get('use_amp', False) - logging.info('using accumulate grad, new batch size is {} times' - ' larger than before'.format(accum_grad)) - if use_amp: - assert scaler is not None - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - model_context = model.join - else: - model_context = nullcontext - num_seen_utts = 0 - with model_context(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - context = None - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if is_distributed and batch_idx % accum_grad != 0: - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - with context(): - # autocast context - # The more details about amp can be found in - # https://pytorch.org/docs/stable/notes/amp_examples.html - with torch.cuda.amp.autocast(scaler is not None): - loss_dict = model(feats, feats_lengths, target, - target_lengths) - loss = loss_dict['loss'] / accum_grad - if use_amp: - scaler.scale(loss).backward() - else: - loss.backward() - - num_seen_utts += num_utts - if batch_idx % accum_grad == 0: - if rank == 0 and writer is not None: - writer.add_scalar('train_loss', loss, self.step) - # Use mixed precision training - if use_amp: - scaler.unscale_(optimizer) - grad_norm = clip_grad_norm_(model.parameters(), clip) - # Must invoke scaler.update() if unscale_() is used in - # the iteration to avoid the following error: - # RuntimeError: unscale_() has already been called - # on this optimizer since the last update(). - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). 
- scaler.step(optimizer) - scaler.update() - else: - grad_norm = clip_grad_norm_(model.parameters(), clip) - if torch.isfinite(grad_norm): - optimizer.step() - optimizer.zero_grad() - scheduler.step() - self.step += 1 - if batch_idx % log_interval == 0: - lr = optimizer.param_groups[0]['lr'] - log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, - loss.item() * accum_grad) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'lr {:.8f} rank {}'.format(lr, rank) - logging.debug(log_str) - - def cv(self, model, data_loader, device, args): - ''' Cross validation on - ''' - model.eval() - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - log_interval = args.get('log_interval', 10) - # in order to avoid division by 0 - num_seen_utts = 1 - total_loss = 0.0 - with torch.no_grad(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - loss_dict = model(feats, feats_lengths, target, target_lengths) - loss = loss_dict['loss'] - if torch.isfinite(loss): - num_seen_utts += num_utts - total_loss += loss.item() * num_utts - if batch_idx % log_interval == 0: - log_str = 'CV Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, loss.item()) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'history loss {:.6f}'.format(total_loss / - num_seen_utts) - log_str += ' rank {}'.format(rank) - logging.debug(log_str) - return total_loss, num_seen_utts diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/init_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/init_model.py deleted file mode 100644 index 4a008183ee25cd88b2fa25d93bdc3f9e3a55d31a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/init_model.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -from wenet.transducer.joint import TransducerJoint -from wenet.transducer.predictor import (ConvPredictor, EmbeddingPredictor, - RNNPredictor) -from wenet.transducer.transducer import Transducer -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.cmvn import GlobalCMVN -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.encoder import ConformerEncoder, TransformerEncoder -from wenet.squeezeformer.encoder import SqueezeformerEncoder -from wenet.efficient_conformer.encoder import EfficientConformerEncoder -from wenet.utils.cmvn import load_cmvn - - -def init_model(configs): - if configs['cmvn_file'] is not None: - mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) - global_cmvn = GlobalCMVN( - torch.from_numpy(mean).float(), - torch.from_numpy(istd).float()) - else: - global_cmvn = None - - input_dim = configs['input_dim'] - vocab_size = configs['output_dim'] - - encoder_type = configs.get('encoder', 'conformer') - decoder_type = configs.get('decoder', 'bitransformer') - - if encoder_type == 'conformer': - encoder = ConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'squeezeformer': - encoder = SqueezeformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'efficientConformer': - encoder = EfficientConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf'], - **configs['encoder_conf']['efficient_conf'] - if 'efficient_conf' in - configs['encoder_conf'] else {}) - else: - encoder = TransformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - if decoder_type == 'transformer': - decoder = TransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - else: - assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 - assert configs['decoder_conf']['r_num_blocks'] > 0 - decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - ctc = CTC(vocab_size, encoder.output_size()) - - # Init joint CTC/Attention or Transducer model - if 'predictor' in configs: - predictor_type = configs.get('predictor', 'rnn') - if predictor_type == 'rnn': - predictor = RNNPredictor(vocab_size, **configs['predictor_conf']) - elif predictor_type == 'embedding': - predictor = EmbeddingPredictor(vocab_size, - **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - elif predictor_type == 'conv': - predictor = ConvPredictor(vocab_size, **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - else: - raise NotImplementedError( - "only rnn, embedding and conv type support now") - configs['joint_conf']['enc_output_size'] = configs['encoder_conf'][ - 'output_size'] - configs['joint_conf']['pred_output_size'] = configs['predictor_conf'][ - 'output_size'] - joint = TransducerJoint(vocab_size, **configs['joint_conf']) - model = Transducer(vocab_size=vocab_size, - blank=0, - predictor=predictor, - encoder=encoder, - attention_decoder=decoder, - joint=joint, - ctc=ctc, - **configs['model_conf']) - else: - model = ASRModel(vocab_size=vocab_size, - encoder=encoder, - decoder=decoder, - ctc=ctc, - **configs['model_conf']) - return model diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/mask.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/mask.py deleted file mode 100644 index 2985006ab2bc2d27a9b8adaeb863cc44ca6a0d24..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/mask.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -''' -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. - - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - ret = torch.ones(size, size, device=device, dtype=torch.bool) - return torch.tril(ret) -''' - -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. 
- - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - arange = torch.arange(size, device=device) - mask = arange.expand(size, size) - arange = arange.unsqueeze(-1) - mask = mask <= arange - return mask - - -def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size) with chunk size, - this is for streaming encoder - - Args: - size (int): size of mask - chunk_size (int): size of chunk - num_left_chunks (int): number of left chunks - <0: use full chunk - >=0: use num_left_chunks - device (torch.device): "cpu" or "cuda" or torch.Tensor.device - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_chunk_mask(4, 2) - [[1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]] - """ - ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) - ret[i, start:ending] = True - return ret - - -def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, - use_dynamic_chunk: bool, - use_dynamic_left_chunk: bool, - decoding_chunk_size: int, static_chunk_size: int, - num_decoding_left_chunks: int): - """ Apply optional mask for encoder. - - Args: - xs (torch.Tensor): padded input, (B, L, D), L for max length - mask (torch.Tensor): mask for xs, (B, 1, L) - use_dynamic_chunk (bool): whether to use dynamic chunk or not - use_dynamic_left_chunk (bool): whether to use dynamic left chunk for - training. - decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - static_chunk_size (int): chunk size for static chunk training/decoding - if it's greater than 0, if use_dynamic_chunk is true, - this parameter will be ignored - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. - >=0: use num_decoding_left_chunks - <0: use all left chunks - - Returns: - torch.Tensor: chunk mask of the input xs. - """ - # Whether to use chunk mask or not - if use_dynamic_chunk: - max_len = xs.size(1) - if decoding_chunk_size < 0: - chunk_size = max_len - num_left_chunks = -1 - elif decoding_chunk_size > 0: - chunk_size = decoding_chunk_size - num_left_chunks = num_decoding_left_chunks - else: - # chunk size is either [1, 25] or full context(max_len). - # Since we use 4 times subsampling and allow up to 1s(100 frames) - # delay, the maximum frame is 100 / 4 = 25. 
- chunk_size = torch.randint(1, max_len, (1, )).item() - num_left_chunks = -1 - if chunk_size > max_len // 2: - chunk_size = max_len - else: - chunk_size = chunk_size % 25 + 1 - if use_dynamic_left_chunk: - max_left_chunks = (max_len - 1) // chunk_size - num_left_chunks = torch.randint(0, max_left_chunks, - (1, )).item() - chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - elif static_chunk_size > 0: - num_left_chunks = num_decoding_left_chunks - chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - else: - chunk_masks = masks - return chunk_masks - - -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: Mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - batch_size = lengths.size(0) - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = torch.arange(0, - max_len, - dtype=torch.int64, - device=lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand - return mask - - -def make_non_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """Make mask tensor containing indices of non-padded part. - - The sequences in a batch may have different lengths. To enable - batch computing, padding is need to make all sequence in same - size. To avoid the padding part pass value to context dependent - block such as attention or convolution , this padding part is - masked. - - This pad_mask is used in both encoder and decoder. - - 1 for non-padded part and 0 for padded part. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] - """ - return ~make_pad_mask(lengths) - - -def mask_finished_scores(score: torch.Tensor, - flag: torch.Tensor) -> torch.Tensor: - """ - If a sequence is finished, we only allow one alive branch. This function - aims to give one branch a zero score and the rest -inf score. - - Args: - score (torch.Tensor): A real value array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size, beam_size). 
- """ - beam_size = score.size(-1) - zero_mask = torch.zeros_like(flag, dtype=torch.bool) - if beam_size > 1: - unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])), - dim=1) - finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])), - dim=1) - else: - unfinished = zero_mask - finished = flag - score.masked_fill_(unfinished, -float('inf')) - score.masked_fill_(finished, 0) - return score - - -def mask_finished_preds(pred: torch.Tensor, flag: torch.Tensor, - eos: int) -> torch.Tensor: - """ - If a sequence is finished, all of its branch should be - - Args: - pred (torch.Tensor): A int array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size). - """ - beam_size = pred.size(-1) - finished = flag.repeat([1, beam_size]) - return pred.masked_fill_(finished, eos) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/scheduler.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/scheduler.py deleted file mode 100644 index c418a731dec0041a238787bbba23102dba8db5e5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/scheduler.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -# NeMo(https://github.com/NVIDIA/NeMo) - -from typing import Union - -import math -import warnings -import torch -from torch.optim.lr_scheduler import _LRScheduler - -from typeguard import check_argument_types - - -class WarmupLR(_LRScheduler): - """The WarmupLR scheduler - - This scheduler is almost same as NoamLR Scheduler except for following - difference: - - NoamLR: - lr = optimizer.lr * model_size ** -0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - WarmupLR: - lr = optimizer.lr * warmup_step ** 0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - - Note that the maximum lr equals to optimizer.lr in this scheduler. 
- - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_steps: Union[int, float] = 25000, - last_epoch: int = -1, - ): - assert check_argument_types() - self.warmup_steps = warmup_steps - - # __init__() must be invoked before setting field - # because step() is also invoked in __init__() - super().__init__(optimizer, last_epoch) - - def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" - - def get_lr(self): - step_num = self.last_epoch + 1 - if self.warmup_steps == 0: - return [ - lr * step_num ** -0.5 - for lr in self.base_lrs - ] - else: - return [ - lr - * self.warmup_steps ** 0.5 - * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) - for lr in self.base_lrs - ] - - def set_step(self, step: int): - self.last_epoch = step - - -class WarmupPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1): - assert not (warmup_steps is not None and warmup_ratio is not None),\ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class SquareRootConstantPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use particular number of step or ratio" - assert constant_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. 
- self.max_steps = max_steps - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.constant_lr = 1 / (constant_steps ** 0.5) - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.constant_steps: - return [self.constant_lr for _ in self.base_lrs] - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class WarmupHoldPolicy(WarmupPolicy): - """Variant of WarmupPolicy which maintains high - learning rate for a defined number of steps. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - hold_steps=None, - hold_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (hold_steps is not None and hold_ratio is not None), \ - "Either use particular number of step or ratio" - assert hold_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - self.min_lr = min_lr - self._last_warmup_lr = 0.0 - - # Necessary to duplicate as class attributes are hidden in inner class - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if hold_steps is not None: - self.hold_steps = hold_steps + self.warmup_steps - elif hold_ratio is not None: - self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps - else: - self.hold_steps = 0 - - super().__init__( - optimizer, - warmup_steps=warmup_steps, - warmup_ratio=warmup_ratio, - max_steps=max_steps, - last_epoch=last_epoch, - min_lr=min_lr, - ) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed by the scheduler," - " " "please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup phase - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - # Hold phase - if (step >= self.warmup_steps) and (step < self.hold_steps): - return self.base_lrs - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - -class WarmupAnnealHoldPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - min_lr: Minimum lr to hold the learning rate after decay at. - constant_steps: Number of steps to keep lr constant at. 
- constant_ratio: Ratio of steps to keep lr constant. - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - constant_steps=None, - constant_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use constant_steps or constant_ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup steps - if self.warmup_steps > 0 and step <= self.warmup_steps: - return self._get_warmup_lr(step) - - # Constant steps after warmup and decay - if self.constant_steps > 0 and ( - self.warmup_steps + self.decay_steps) < step <= self.max_steps: - return self._get_constant_lr(step) - - # Min lr after max steps of updates - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_constant_lr(self, step): - return [self.min_lr for _ in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -def _squareroot_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 0.5 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _square_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 2 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _cosine_annealing(initial_lr, step, max_steps, min_lr): - mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) - out_lr = (initial_lr - min_lr) * mult + min_lr - return out_lr - - -def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, - decay_steps, min_lr): - assert max_lr > min_lr - # Use linear warmup for the initial part. - if warmup_steps > 0 and step <= warmup_steps: - return max_lr * float(step) / float(warmup_steps) - - # For any steps larger than `decay_steps`, use `min_lr`. - if step > warmup_steps + decay_steps: - return min_lr - - # If we are done with the warmup period, use the decay style. 
- num_steps_ = step - warmup_steps - decay_steps_ = decay_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = max_lr - min_lr - - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - - return min_lr + coeff * delta_lr - - -def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): - if cycle: - multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) - decay_steps *= multiplier - else: - step = min(step, decay_steps) - p = step / decay_steps - lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) - lr += min_lr - return lr - - -def _noam_hold_annealing(initial_lr, step, warmup_steps, - hold_steps, decay_rate, min_lr): - # hold_steps = total number of steps - # to hold the LR, not the warmup + hold steps. - T_warmup_decay = max(1, warmup_steps ** decay_rate) - T_hold_decay = max(1, (step - hold_steps) ** decay_rate) - lr = (initial_lr * T_warmup_decay) / T_hold_decay - lr = max(lr, min_lr) - return lr - - -class SquareAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _square_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class SquareRootAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _squareroot_annealing(initial_lr=initial_lr, step=step, - max_steps=self.max_steps, min_lr=self.min_lr) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - if self.constant_steps is None or self.constant_steps == 0: - new_lrs = [ - _cosine_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - else: - new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) - return new_lrs - - def _get_warmup_lr(self, step): - if self.constant_steps is None or self.constant_steps == 0: - return super()._get_warmup_lr(step) - else: - # Use linear warmup for the initial part. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_constant_lr(self, step): - # Only called when `constant_steps` > 0. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_linear_warmup_with_cosine_annealing_lr(self, step): - # Cosine Schedule for Megatron LM, - # slightly different warmup schedule + constant LR at the end. 
- new_lrs = [ - _linear_warmup_with_cosine_annealing( - max_lr=self.base_lrs[0], - warmup_steps=self.warmup_steps, - step=step, - decay_steps=self.decay_steps, - min_lr=self.min_lr, - ) - for _ in self.base_lrs - ] - return new_lrs - - -class NoamAnnealing(_LRScheduler): - def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - self._normalize = d_model ** (-0.5) - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = max(1, self.last_epoch) - - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - new_lrs = [self._noam_annealing(initial_lr=initial_lr, step=step) for - initial_lr in self.base_lrs] - return new_lrs - - def _noam_annealing(self, initial_lr, step): - if self.warmup_steps > 0: - mult = self._normalize * min(step ** (-0.5), - step * (self.warmup_steps ** (-1.5))) - else: - mult = self._normalize * step ** (-0.5) - - out_lr = initial_lr * mult - if step > self.warmup_steps: - out_lr = max(out_lr, self.min_lr) - return out_lr - - -class NoamHoldAnnealing(WarmupHoldPolicy): - def __init__(self, optimizer, *, max_steps, decay_rate=0.5, min_lr=0.0, - last_epoch=-1, **kwargs): - """ - From Nemo: - Implementation of the Noam Hold Annealing policy - from the SqueezeFormer paper. - - Unlike NoamAnnealing, the peak learning rate - can be explicitly set for this scheduler. - The schedule first performs linear warmup, - then holds the peak LR, then decays with some schedule for - the remainder of the steps. - Therefore the min-lr is still dependent - on the hyper parameters selected. - - It's schedule is determined by three factors- - - Warmup Steps: Initial stage, where linear warmup - occurs uptil the peak LR is reached. Unlike NoamAnnealing, - the peak LR is explicitly stated here instead of a scaling factor. - - Hold Steps: Intermediate stage, where the peak LR - is maintained for some number of steps. In this region, - the high peak LR allows the model to converge faster - if training is stable. However the high LR - may also cause instability during training. - Should usually be a significant fraction of training - steps (around 30-40% of the entire training steps). - - Decay Steps: Final stage, where the LR rapidly decays - with some scaling rate (set by decay rate). - To attain Noam decay, use 0.5, - for Squeezeformer recommended decay, use 1.0. - The fast decay after prolonged high LR during - hold phase allows for rapid convergence. 
- - References: - - [Squeezeformer: - An Efficient Transformer for Automatic Speech Recognition] - (https://arxiv.org/abs/2206.00888) - - Args: - optimizer: Pytorch compatible Optimizer object. - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - decay_rate: Float value describing the polynomial decay - after the hold period. Default value - of 0.5 corresponds to Noam decay. - min_lr: Minimum learning rate. - """ - self.decay_rate = decay_rate - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - if self.warmup_steps is None or self.warmup_steps == 0: - raise ValueError( - "Noam scheduler cannot be used without warmup steps") - - if self.hold_steps > 0: - hold_steps = self.hold_steps - self.warmup_steps - else: - hold_steps = 0 - - new_lrs = [ - _noam_hold_annealing( - initial_lr, - step=step, - warmup_steps=self.warmup_steps, - hold_steps=hold_steps, - decay_rate=self.decay_rate, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - def set_step(self, step: int): - self.last_epoch = step diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/README.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/README.md deleted file mode 100644 index e7d5ca2567fd21ea84f8a32b5ac31f1451b5baf0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/README.md +++ /dev/null @@ -1,180 +0,0 @@ -# Performance Record - -## Conformer Result - -* Feature info: using fbank feature, dither, cmvn, online speed perturb -* Training info: lr 0.002, batch size 18, 4 gpu, acc_grad 4, 240 epochs, dither 0.1 -* Decoding info: ctc_weight 0.5, average_num 20 -* Git hash: 919f07c4887ac500168ba84b39b535fd8e58918a - -| decoding mode | CER | -|---------------------------|-------| -| attention decoder | 5.18 | -| ctc greedy search | 4.94 | -| ctc prefix beam search | 4.94 | -| attention rescoring | 4.61 | -| LM + attention rescoring | 4.36 | - -## U2++ Conformer Result - -* Feature info: using fbank feature, dither=1.0, cmvn, oneline speed perturb -* Training info: lr 0.001, batch size 16, 8 gpu, acc_grad 1, 360 epochs -* Decoding info: ctc_weight 0.3, reverse_weight 0.5 average_num 30, lm_scale 0.7, decoder_scale 0.1, r_decoder_scale 0.7 -* Git hash: 5a1342312668e7a5abb83aed1e53256819cebf95 - -| decoding mode/chunk size | full | 16 | -|---------------------------|-------|-------| -| ctc greedy search | 5.19 | 5.81 | -| ctc prefix beam search | 5.17 | 5.81 | -| attention rescoring | 4.63 | 5.05 | -| LM + attention rescoring | 4.40 | 4.75 | -| HLG(k2 LM) | 4.81 | 5.27 | -| HLG(k2 LM) + attention rescoring | 4.32 | 4.70 | - -## Unified Conformer Result - -* Feature info: using fbank feature, dither=0, cmvn, oneline speed perturb -* Training info: lr 0.001, batch size 16, 8 gpu, acc_grad 1, 180 epochs, dither 0.0 -* Decoding info: ctc_weight 0.5, average_num 20 -* Git hash: 919f07c4887ac500168ba84b39b535fd8e58918a - -| decoding mode/chunk size | full | 16 | 8 | 4 | -|---------------------------|-------|-------|-------|-------| -| attention decoder | 5.40 | 5.60 | 5.74 | 5.86 | -| ctc greedy search | 5.56 | 6.29 | 6.68 | 7.10 
| -| ctc prefix beam search | 5.57 | 6.30 | 6.67 | 7.10 | -| attention rescoring | 5.05 | 5.45 | 5.69 | 5.91 | -| LM + attention rescoring | 4.73 | 5.08 | 5.22 | 5.38 | - -## U2++ Transformer Result - -* Feature info: using fbank feature, dither, cmvn, online speed perturb. -* Training info: lr 0.001, batch size 26, 8 gpu, acc_grad 1, 360 epochs, dither 0.1 -* Decoding info: ctc_weight 0.2, reverse_weight 0.5, average_num 30 -* Git hash: 65270043fc8c2476d1ab95e7c39f730017a670e0 - -| decoding mode/chunk size | full | 16 | -|---------------------------|-------|-------| -| ctc greedy search | 6.05 | 6.92 | -| ctc prefix beam search | 6.05 | 6.90 | -| attention rescoring | 5.11 | 5.63 | -| LM + attention rescoring | 4.82 | 5.24 | - -## Transformer Result - -* Feature info: using fbank feature, dither, with cmvn, online speed perturb. -* Training info: lr 0.002, batch size 26, 4 gpu, acc_grad 4, 240 epochs, dither 0.1 -* Decoding info: ctc_weight 0.5, average_num 20 -* Git hash: 919f07c4887ac500168ba84b39b535fd8e58918a - -| decoding mode | CER | -|---------------------------|-------| -| attention decoder | 5.69 | -| ctc greedy search | 5.92 | -| ctc prefix beam search | 5.91 | -| attention rescoring | 5.30 | -| LM + attention rescoring | 5.04 | - -## Unified Transformer Result - -* Feature info: using fbank feature, dither=0, with cmvn, online speed perturb. -* Training info: lr 0.002, batch size 16, 4 gpu, acc_grad 1, 240 epochs, dither 0.1 -* Decoding info: ctc_weight 0.5, average_num 20 -* Git hash: 919f07c4887ac500168ba84b39b535fd8e58918a - -| decoding mode/chunk size | full | 16 | 8 | 4 | -|---------------------------|-------|-------|-------|-------| -| attention decoder | 6.04 | 6.35 | 6.45 | 6.70 | -| ctc greedy search | 6.28 | 6.99 | 7.39 | 7.89 | -| ctc prefix beam search | 6.28 | 6.98 | 7.40 | 7.89 | -| attention rescoring | 5.52 | 6.05 | 6.28 | 6.62 | -| LM + attention rescoring | 5.11 | 5.59 | 5.86 | 6.17 | - -## AMP Training Transformer Result - -* Feature info: using fbank feature, dither, cmvn, online speed perturb -* Training info: lr 0.002, batch size, 4 gpus, acc_grad 4, 240 epochs, dither 0.1, warm up steps 25000 -* Decoding info: ctc_weight 0.5, average_num 20 -* Git hash: 1bb4e5a269c535340fae5b0739482fa47733d2c1 - -| decoding mode | CER | -|------------------------|------| -| attention decoder | 5.73 | -| ctc greedy search | 5.92 | -| ctc prefix beam search | 5.92 | -| attention rescoring | 5.31 | - - -## Muilti-machines Training Conformer Result - -* Feature info: using fbank feature, dither, cmvn, online speed perturb -* Training info: lr 0.004, batch size 16, 2 machines, 8\*2=16 gpus, acc_grad 4, 240 epochs, dither 0.1, warm up steps 10000 -* Decoding info: ctc_weight 0.5, average_num 20 -* Git hash: f6b1409023440da1998d31abbcc3826dd40aaf35 - -| decoding mode | CER | -|------------------------|------| -| attention decoder | 4.90 | -| ctc greedy search | 5.07 | -| ctc prefix beam search | 5.06 | -| attention rescoring | 4.65 | - - -## Conformer with/without Position Encoding Result - -* Feature info: using fbank feature, dither, cmvn, online speed perturb -* Training info: lr 0.002, batch size 16, 8 gpu, acc_grad 4, 240 epochs, dither 0.1 -* Decoding info: ctc_weight 0.5, average_num 20 - -| decoding mode | with PE | without PE | -|------------------------|---------|------------| -| attention decoder | 5.18 | 5.73 | -| ctc greedy search | 4.94 | 4.97 | -| ctc prefix beam search | 4.94 | 4.97 | -| attention rescoring | 4.61 | 4.69 | - - -## Efficient Conformer v1 Result - 
-* Feature info: - * using fbank feature, cmvn, speed perturb, dither -* Training info: - * train_u2++_efficonformer_v1.yaml - * 8 gpu, batch size 16, acc_grad 1, 200 epochs - * lr 0.001, warmup_steps 25000 -* Model info: - * Model Params: 48,488,347 - * Downsample rate: 1/4 (conv2d) * 1/2 (efficonformer block) - * encoder_dim 256, output_size 256, head 8, linear_units 2048 - * num_blocks 12, cnn_module_kernel 15, group_size 3 -* Decoding info: - * ctc_weight 0.5, reverse_weight 0.3, average_num 20 - -| decoding mode | full | 18 | 16 | -|------------------------|------|------|------| -| attention decoder | 4.99 | 5.13 | 5.16 | -| ctc prefix beam search | 4.98 | 5.23 | 5.23 | -| attention rescoring | 4.64 | 4.86 | 4.85 | - - -## Efficient Conformer v2 Result - -* Feature info: - * using fbank feature, cmvn, speed perturb, dither -* Training info: - * train_u2++_efficonformer_v2.yaml - * 8 gpu, batch size 16, acc_grad 1, 200 epochs - * lr 0.001, warmup_steps 25000 -* Model info: - * Model Params: 49,354,651 - * Downsample rate: 1/2 (conv2d2) * 1/4 (efficonformer block) - * encoder_dim 256, output_size 256, head 8, linear_units 2048 - * num_blocks 12, cnn_module_kernel 15, group_size 3 -* Decoding info: - * ctc_weight 0.5, reverse_weight 0.3, average_num 20 - -| decoding mode | full | 18 | 16 | -|------------------------|------|------|------| -| attention decoder | 4.87 | 5.03 | 5.07 | -| ctc prefix beam search | 4.97 | 5.18 | 5.20 | -| attention rescoring | 4.56 | 4.75 | 4.77 | diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/UIO_RESULT.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/UIO_RESULT.md deleted file mode 100644 index b18775c5cd73437bcb3d4840eaf6778103b4dbb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/UIO_RESULT.md +++ /dev/null @@ -1,9 +0,0 @@ -# Benchmark on Conformer - -| IO | CER | -|--------------|-------| -| Old | 4.61 | -| UIO(Raw) | 4.63 | -| UIO(Shards) | 4.67 | - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_conformer.yaml deleted file mode 100644 index b8ce511cdaad0f03be4a82708d70290ec9e37c3d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_conformer.yaml +++ /dev/null @@ -1,77 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -dataset_conf: - filter_conf: - max_length: 40960 - 
min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 16 - -grad_clip: 5 -accum_grad: 4 -max_epoch: 240 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.002 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_conformer_no_pos.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_conformer_no_pos.yaml deleted file mode 100644 index a2d5d03f570119d4c54a26917552f92939c83ac1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_conformer_no_pos.yaml +++ /dev/null @@ -1,77 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'no_pos' - selfattention_layer_type: 'rel_selfattn' - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 16 - -grad_clip: 5 -accum_grad: 4 -max_epoch: 240 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.002 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_transformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_transformer.yaml deleted file mode 100644 index b7d7eee83ace095b4c7a09e61fd63776cb50b2d6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_transformer.yaml +++ /dev/null @@ -1,72 +0,0 @@ -# network architecture -# encoder related -encoder: transformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # 
the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder architecture type - normalize_before: true - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 26 - -grad_clip: 5 -accum_grad: 1 -max_epoch: 240 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.002 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_u2++_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_u2++_conformer.yaml deleted file mode 100644 index b4587bce33be458b15490dccbf2f98aaa798959c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_u2++_conformer.yaml +++ /dev/null @@ -1,90 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 8 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - -# decoder related -decoder: bitransformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 3 - r_num_blocks: 3 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 1.0 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - spec_sub: true - spec_sub_conf: - num_t_sub: 3 - max_t: 30 - spec_trim: false - spec_trim_conf: - max_t: 50 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - 
sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 16 - -grad_clip: 5 -accum_grad: 1 -max_epoch: 360 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_u2++_efficonformer_v1.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_u2++_efficonformer_v1.yaml deleted file mode 100644 index 3d0de82dbf23e2c3abaa26eda5178c8ba1452861..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_u2++_efficonformer_v1.yaml +++ /dev/null @@ -1,96 +0,0 @@ -# network architecture -# encoder related -encoder: efficientConformer -encoder_conf: - activation_type: 'swish' - attention_heads: 8 - causal: false - cnn_module_kernel: 15 - cnn_module_norm: 'layer_norm' - dropout_rate: 0.1 - input_layer: conv2d - linear_units: 2048 - normalize_before: true - num_blocks: 12 - output_size: 256 - pos_enc_layer_type: 'rel_pos' - attention_dropout_rate: 0.1 - positional_dropout_rate: 0.1 - use_cnn_module: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false - efficient_conf: - stride_layer_idx: [3] # layer id with StrideConv - stride: [2] # stride size of each StrideConv - group_layer_idx: [0, 1, 2, 3] # layer id with GroupedAttention - group_size: 3 # group size of every GroupedAttention layer - stride_kernel: true # true: recompute cnn kernels with stride - -# decoder related -decoder: bitransformer -decoder_conf: - attention_heads: 8 - dropout_rate: 0.1 - linear_units: 2048 - num_blocks: 3 - r_num_blocks: 3 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -# dataset related -dataset_conf: - batch_conf: - batch_size: 16 - batch_type: 'static' - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 1.0 - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - spec_sub: true - spec_sub_conf: - num_t_sub: 3 - max_t: 30 - spec_trim: false - spec_trim_conf: - max_t: 50 - speed_perturb: true - -grad_clip: 5 -accum_grad: 1 -max_epoch: 200 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_u2++_efficonformer_v1_stream.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_u2++_efficonformer_v1_stream.yaml deleted file mode 100644 index 3b5a99a86276971592e6a35a26557b10fb561cdc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_u2++_efficonformer_v1_stream.yaml +++ /dev/null @@ -1,96 +0,0 @@ -# network architecture -# encoder related -encoder: efficientConformer -encoder_conf: - activation_type: 'swish' - attention_heads: 8 - causal: true - 
cnn_module_kernel: 15 - cnn_module_norm: 'layer_norm' - dropout_rate: 0.1 - input_layer: conv2d - linear_units: 2048 - normalize_before: true - num_blocks: 12 - output_size: 256 - pos_enc_layer_type: 'rel_pos' - attention_dropout_rate: 0.1 - positional_dropout_rate: 0.1 - use_cnn_module: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false - efficient_conf: - stride_layer_idx: [3] # layer id with StrideConv - stride: [2] # stride size of each StrideConv - group_layer_idx: [0, 1, 2, 3] # layer id with GroupedAttention - group_size: 3 # group size of every GroupedAttention layer - stride_kernel: true # true: recompute cnn kernels with stride - -# decoder related -decoder: bitransformer -decoder_conf: - attention_heads: 8 - dropout_rate: 0.1 - linear_units: 2048 - num_blocks: 3 - r_num_blocks: 3 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -# dataset related -dataset_conf: - batch_conf: - batch_size: 16 - batch_type: 'static' - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 1.0 - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - spec_sub: true - spec_sub_conf: - num_t_sub: 3 - max_t: 30 - spec_trim: false - spec_trim_conf: - max_t: 50 - speed_perturb: true - -grad_clip: 5 -accum_grad: 1 -max_epoch: 200 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_u2++_efficonformer_v2.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_u2++_efficonformer_v2.yaml deleted file mode 100644 index c23e1b64da5304e19d8339bc94d10b3cf80b36a8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_u2++_efficonformer_v2.yaml +++ /dev/null @@ -1,98 +0,0 @@ -# network architecture -# encoder related -encoder: efficientConformer -encoder_conf: - activation_type: 'swish' - attention_heads: 8 - causal: false - cnn_module_kernel: 15 - cnn_module_norm: 'layer_norm' - dropout_rate: 0.1 - input_layer: conv2d2 - linear_units: 2048 - normalize_before: true - num_blocks: 12 - output_size: 256 - pos_enc_layer_type: 'rel_pos' - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - use_cnn_module: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false - efficient_conf: - stride_layer_idx: [3, 7] # layer id with StrideConv - stride: [2, 2] # stride size of each StrideConv - group_layer_idx: [3, 7] # layer id with GroupedAttention - group_size: 3 # group size of every GroupedAttention layer - stride_kernel: false # true: recompute cnn kernels with stride - -# decoder related -decoder: bitransformer -decoder_conf: - attention_heads: 8 - dropout_rate: 0.1 - linear_units: 2048 - num_blocks: 3 - positional_dropout_rate: 0.1 - r_num_blocks: 3 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing 
option - length_normalized_loss: false - reverse_weight: 0.3 - -# dataset related -dataset_conf: - batch_conf: - batch_size: 16 - batch_type: 'static' - fbank_conf: - dither: 1.0 - frame_length: 25 - frame_shift: 10 - num_mel_bins: 80 - filter_conf: - max_length: 40960 - min_length: 0 - max_output_input_ratio: 0.1 - min_output_input_ratio: 0.005 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - spec_sub: true - spec_sub_conf: - num_t_sub: 3 - max_t: 30 - spec_trim: false - spec_trim_conf: - max_t: 50 - speed_perturb: true - -grad_clip: 5 -accum_grad: 1 -max_epoch: 200 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_u2++_transformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_u2++_transformer.yaml deleted file mode 100644 index 44b4d4be7f70b7921e2ce67b3e4b8a80f99e9048..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_u2++_transformer.yaml +++ /dev/null @@ -1,79 +0,0 @@ -# network architecture -# encoder related -encoder: transformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder architecture type - normalize_before: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false - -# decoder related -decoder: bitransformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 3 - r_num_blocks: 3 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 1.0 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - spec_trim: false - spec_trim_conf: - max_t: 70 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 26 - -grad_clip: 5 -accum_grad: 1 -max_epoch: 360 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_unified_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_unified_conformer.yaml deleted file mode 100644 index 978d3d91c4d9eef417c60a068647bb5d7db88fe0..0000000000000000000000000000000000000000 --- 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_unified_conformer.yaml +++ /dev/null @@ -1,81 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 16 - -grad_clip: 5 -accum_grad: 1 -max_epoch: 180 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_unified_transformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_unified_transformer.yaml deleted file mode 100644 index 9d7a386872ddfb0859ff9e09c81e185e1a60d7a4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/conf/train_unified_transformer.yaml +++ /dev/null @@ -1,75 +0,0 @@ -# network architecture -# encoder related -encoder: transformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder architecture type - normalize_before: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - 
fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 16 - - -grad_clip: 5 -accum_grad: 1 -max_epoch: 180 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.002 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/local/aishell_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/local/aishell_data_prep.sh deleted file mode 100644 index fb4d5fb0adefb9e3e3ebeaa5ccb1a92562eb77c1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/local/aishell_data_prep.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Xingyu Na -# Apache 2.0 - -. ./path.sh || exit 1; - -if [ $# != 2 ]; then - echo "Usage: $0 " - echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript" - exit 1; -fi - -aishell_audio_dir=$1 -aishell_text=$2/aishell_transcript_v0.8.txt - -train_dir=data/local/train -dev_dir=data/local/dev -test_dir=data/local/test -tmp_dir=data/local/tmp - -mkdir -p $train_dir -mkdir -p $dev_dir -mkdir -p $test_dir -mkdir -p $tmp_dir - -# data directory check -if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then - echo "Error: $0 requires two directory arguments" - exit 1; -fi - -# find wav audio file for train, dev and test resp. -find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist -n=`cat $tmp_dir/wav.flist | wc -l` -[ $n -ne 141925 ] && \ - echo Warning: expected 141925 data data files, found $n - -grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1; -grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1; -grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1; - -rm -r $tmp_dir - -# Transcriptions preparation -for dir in $train_dir $dev_dir $test_dir; do - echo Preparing $dir transcriptions - sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list - paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all - tools/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt - awk '{print $1}' $dir/transcripts.txt > $dir/utt.list - tools/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp - sort -u $dir/transcripts.txt > $dir/text -done - -mkdir -p data/train data/dev data/test - -for f in wav.scp text; do - cp $train_dir/$f data/train/$f || exit 1; - cp $dev_dir/$f data/dev/$f || exit 1; - cp $test_dir/$f data/test/$f || exit 1; -done - -echo "$0: AISHELL data preparation succeeded" -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/local/aishell_train_lms.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/local/aishell_train_lms.sh deleted file mode 100644 index 30ffb7973b3ddec4ef4c0f09c8184837cad768d6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/local/aishell_train_lms.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - - -# To be run from one directory above this script. -. 
./path.sh
-
-text=data/local/lm/text
-lexicon=data/local/dict/lexicon.txt
-
-for f in "$text" "$lexicon"; do
-  [ ! -f $x ] && echo "$0: No such file $f" && exit 1;
-done
-
-# Check SRILM tools
-if ! which ngram-count > /dev/null; then
-  echo "srilm tools are not found, please download it and install it from: "
-  echo "http://www.speech.sri.com/projects/srilm/download.html"
-  echo "Then add the tools to your PATH"
-  exit 1
-fi
-
-# This script takes no arguments. It assumes you have already run
-# aishell_data_prep.sh.
-# It takes as input the files
-# data/local/lm/text
-# data/local/dict/lexicon.txt
-dir=data/local/lm
-mkdir -p $dir
-
-
-cleantext=$dir/text.no_oov
-
-cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
-  {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
-  > $cleantext || exit 1;
-
-cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
-  sort -nr > $dir/word.counts || exit 1;
-
-# Get counts from acoustic training transcripts, and add one-count
-# for each word in the lexicon (but not silence, we don't want it
-# in the LM-- we'll add it optionally later).
-cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
-  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
-  sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
-
-cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
-
-heldout_sent=10000 # Don't change this if you want result to be comparable with
-  # kaldi_lm results
-mkdir -p $dir
-cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | head -$heldout_sent > $dir/heldout
-cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | tail -n +$heldout_sent > $dir/train
-
-ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \
-  -map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.arpa
-ngram -lm $dir/lm.arpa -ppl $dir/heldout
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/local/download_and_untar.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/local/download_and_untar.sh
deleted file mode 100644
index 58a278241d75caeba25ba4b17d186912d0d724ec..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/local/download_and_untar.sh
+++ /dev/null
@@ -1,105 +0,0 @@
-#!/bin/bash
-
-# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
-# 2017 Xingyu Na
-# Apache 2.0
-
-remove_archive=false
-
-if [ "$1" == --remove-archive ]; then
-  remove_archive=true
-  shift
-fi
-
-if [ $# -ne 3 ]; then
-  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
-  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
-  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
-  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
-fi
-
-data=$1
-url=$2
-part=$3
-
-if [ ! -d "$data" ]; then
-  echo "$0: no such directory $data"
-  exit 1;
-fi
-
-part_ok=false
-list="data_aishell resource_aishell"
-for x in $list; do
-  if [ "$part" == $x ]; then part_ok=true; fi
-done
-if ! $part_ok; then
-  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
-  exit 1;
-fi
-
-if [ -z "$url" ]; then
-  echo "$0: empty URL base."
-  exit 1;
-fi
-
-if [ -f $data/$part/.complete ]; then
-  echo "$0: data part $part was already successfully extracted, nothing to do."
-  exit 0;
-fi
-
-# sizes of the archive files in bytes.
-sizes="15582913665 1246920" - -if [ -f $data/$part.tgz ]; then - size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}') - size_ok=false - for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done - if ! $size_ok; then - echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" - echo "does not equal the size of one of the archives." - rm $data/$part.tgz - else - echo "$data/$part.tgz exists and appears to be complete." - fi -fi - -if [ ! -f $data/$part.tgz ]; then - if ! which wget >/dev/null; then - echo "$0: wget is not installed." - exit 1; - fi - full_url=$url/$part.tgz - echo "$0: downloading data from $full_url. This may take some time, please be patient." - - cd $data - if ! wget --no-check-certificate $full_url; then - echo "$0: error executing wget $full_url" - exit 1; - fi -fi - -cd $data - -if ! tar -xvzf $part.tgz; then - echo "$0: error un-tarring archive $data/$part.tgz" - exit 1; -fi - -touch $data/$part/.complete - -if [ $part == "data_aishell" ]; then - cd $data/$part/wav - for wav in ./*.tar.gz; do - echo "Extracting wav from $wav" - tar -zxf $wav && rm $wav - done -fi - -echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" - -if $remove_archive; then - echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied." - rm $data/$part.tgz -fi - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/path.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/path.sh deleted file mode 100644 index ac1ca08baf5d4540b92ed239b8aa7cd613064a8c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export WENET_DIR=$PWD/../../.. -export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build -export OPENFST_BIN=${BUILD_DIR}/../fc_base/openfst-build/src -export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_BIN}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/run.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/run.sh deleted file mode 100644 index 7a4c4e1d0e6326371774ae38d90948f9cfdd5927..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/run.sh +++ /dev/null @@ -1,287 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -# Use this to control how many gpu you use, It's 1-gpu training if you specify -# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO -stage=0 # start from 0 if you need to start from data preparation -stop_stage=5 - -# The num of machines(nodes) for multi-machine training, 1 is for one machine. -# NFS is required if num_nodes > 1. -num_nodes=1 - -# The rank of each node or machine, which ranges from 0 to `num_nodes - 1`. -# You should set the node_rank=0 on the first machine, set the node_rank=1 -# on the second machine, and so on. 
-node_rank=0
-# The aishell dataset location, please change this to your own path
-# make sure of using absolute path. DO-NOT-USE relatvie path!
-data=/export/data/asr-data/OpenSLR/33/
-data_url=www.openslr.org/resources/33
-
-nj=16
-dict=data/dict/lang_char.txt
-
-# data_type can be `raw` or `shard`. Typically, raw is used for small dataset,
-# `shard` is used for large dataset which is over 1k hours, and `shard` is
-# faster on reading data and training.
-data_type=raw
-num_utts_per_shard=1000
-
-train_set=train
-# Optional train_config
-# 1. conf/train_transformer.yaml: Standard transformer
-# 2. conf/train_conformer.yaml: Standard conformer
-# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer
-# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer
-# 5. conf/train_u2++_conformer.yaml: U2++ conformer
-# 6. conf/train_u2++_transformer.yaml: U2++ transformer
-train_config=conf/train_conformer.yaml
-cmvn=true
-dir=exp/conformer
-checkpoint=
-
-# use average_checkpoint will get better result
-average_checkpoint=true
-decode_checkpoint=$dir/final.pt
-average_num=30
-decode_modes="ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring"
-
-. tools/parse_options.sh || exit 1;
-
-if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
-  echo "stage -1: Data Download"
-  local/download_and_untar.sh ${data} ${data_url} data_aishell
-  local/download_and_untar.sh ${data} ${data_url} resource_aishell
-fi
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-  # Data preparation
-  local/aishell_data_prep.sh ${data}/data_aishell/wav ${data}/data_aishell/transcript
-fi
-
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-  # remove the space between the text labels for Mandarin dataset
-  for x in train dev test; do
-    cp data/${x}/text data/${x}/text.org
-    paste -d " " <(cut -f 1 -d" " data/${x}/text.org) \
-      <(cut -f 2- -d" " data/${x}/text.org | tr -d " ") \
-      > data/${x}/text
-    rm data/${x}/text.org
-  done
-
-  tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
-    --in_scp data/${train_set}/wav.scp \
-    --out_cmvn data/$train_set/global_cmvn
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-  echo "Make a dictionary"
-  mkdir -p $(dirname $dict)
-  echo "<blank> 0" > ${dict} # 0 is for "blank" in CTC
-  echo "<unk> 1" >> ${dict} # <unk> must be 1
-  tools/text2token.py -s 1 -n 1 data/train/text | cut -f 2- -d" " \
-    | tr " " "\n" | sort | uniq | grep -a -v -e '^\s*$' | \
-    awk '{print $0 " " NR+1}' >> ${dict}
-  num_token=$(cat $dict | wc -l)
-  echo "<sos/eos> $num_token" >> $dict
-fi
-
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-  echo "Prepare data, prepare required format"
-  for x in dev test ${train_set}; do
-    if [ $data_type == "shard" ]; then
-      tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \
-        --num_threads 16 data/$x/wav.scp data/$x/text \
-        $(realpath data/$x/shards) data/$x/data.list
-    else
-      tools/make_raw_list.py data/$x/wav.scp data/$x/text \
-        data/$x/data.list
-    fi
-  done
-fi
-
-if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
-  mkdir -p $dir
-  # You have to rm `INIT_FILE` manually when you resume or restart a
-  # multi-machine training.
- INIT_FILE=$dir/ddp_init - init_method=file://$(readlink -f $INIT_FILE) - echo "$0: init method is $init_method" - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - # Use "nccl" if it works, otherwise use "gloo" - dist_backend="gloo" - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" - cmvn_opts= - $cmvn && cp data/${train_set}/global_cmvn $dir - $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" - - # train.py rewrite $train_config to $dir/train.yaml with model input - # and output dimension, and $dir/train.yaml will be used for inference - # and export. - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. - rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type $data_type \ - --symbol_table $dict \ - --train_data data/$train_set/data.list \ - --cv_data data/dev/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ - --ddp.dist_backend $dist_backend \ - --num_workers 1 \ - $cmvn_opts \ - --pin_memory - } & - done - wait -fi - -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # Test model, please specify the model you want to test by --checkpoint - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$dir/avg_${average_num}.pt - echo "do model average and final checkpoint is $decode_checkpoint" - python wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $dir \ - --num ${average_num} \ - --val_best - fi - # Please specify decoding_chunk_size for unified streaming and - # non-streaming model. The default value is -1, which is full chunk - # for non-streaming inference. - decoding_chunk_size= - ctc_weight=0.5 - reverse_weight=0.0 - for mode in ${decode_modes}; do - { - test_dir=$dir/test_${mode} - mkdir -p $test_dir - python wenet/bin/recognize.py --gpu 0 \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type $data_type \ - --test_data data/test/data.list \ - --checkpoint $decode_checkpoint \ - --beam_size 10 \ - --batch_size 1 \ - --penalty 0.0 \ - --dict $dict \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --result_file $test_dir/text \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - python tools/compute-wer.py --char=1 --v=1 \ - data/test/text $test_dir/text > $test_dir/wer - } & - done - wait -fi - - -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # Export the best model you want - python wenet/bin/export_jit.py \ - --config $dir/train.yaml \ - --checkpoint $dir/avg_${average_num}.pt \ - --output_file $dir/final.zip \ - --output_quant_file $dir/final_quant.zip -fi - -# Optionally, you can add LM and test it with runtime. 
-if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then - # 7.1 Prepare dict - unit_file=$dict - mkdir -p data/local/dict - cp $unit_file data/local/dict/units.txt - tools/fst/prepare_dict.py $unit_file ${data}/resource_aishell/lexicon.txt \ - data/local/dict/lexicon.txt - # 7.2 Train lm - lm=data/local/lm - mkdir -p $lm - tools/filter_scp.pl data/train/text \ - $data/data_aishell/transcript/aishell_transcript_v0.8.txt > $lm/text - local/aishell_train_lms.sh - # 7.3 Build decoding TLG - tools/fst/compile_lexicon_token_fst.sh \ - data/local/dict data/local/tmp data/local/lang - tools/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1; - # 7.4 Decoding with runtime - chunk_size=-1 - ./tools/decode.sh --nj 16 \ - --beam 15.0 --lattice_beam 7.5 --max_active 7000 \ - --blank_skip_thresh 0.98 --ctc_weight 0.5 --rescoring_weight 1.0 \ - --chunk_size $chunk_size \ - --fst_path data/lang_test/TLG.fst \ - --dict_path data/lang_test/words.txt \ - data/test/wav.scp data/test/text $dir/final.zip \ - data/lang_test/units.txt $dir/lm_with_runtime - # Please see $dir/lm_with_runtime for wer -fi - -# Optionally, you can decode with k2 hlg -if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then - if [ ! -f data/local/lm/lm.arpa ]; then - echo "Please run prepare dict and train lm in Stage 7" || exit 1; - fi - - # 8.1 Build decoding HLG - required="data/local/hlg/HLG.pt data/local/hlg/words.txt" - for f in $required; do - if [ ! -f $f ]; then - tools/k2/make_hlg.sh data/local/dict/ data/local/lm/ data/local/hlg - break - fi - done - - # 8.2 Decode using HLG - decoding_chunk_size= - lm_scale=0.7 - decoder_scale=0.1 - r_decoder_scale=0.7 - for mode in hlg_onebest hlg_rescore; do - { - test_dir=$dir/test_${mode} - mkdir -p $test_dir - python wenet/bin/recognize.py --gpu 0 \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type $data_type \ - --test_data data/test/data.list \ - --checkpoint $decode_checkpoint \ - --beam_size 10 \ - --batch_size 16 \ - --penalty 0.0 \ - --dict $dict \ - --word data/local/hlg/words.txt \ - --hlg data/local/hlg/HLG.pt \ - --lm_scale $lm_scale \ - --decoder_scale $decoder_scale \ - --r_decoder_scale $r_decoder_scale \ - --result_file $test_dir/text \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - python tools/compute-wer.py --char=1 --v=1 \ - data/test/text $test_dir/text > $test_dir/wer - } - done -fi - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/alignment.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/alignment.sh deleted file mode 100644 index 64d860bb61761cadca750c9baf91eddb49e56728..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/alignment.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -stage=0 # start from 0 if you need to start from data preparation -stop_stage=0 - -nj=16 -feat_dir=raw_wav -dict=data/dict/lang_char.txt - -dir=exp/ -config=$dir/train.yaml -checkpoint= -checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt -config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml -set= -ali_format=$feat_dir/$set/format.data -ali_format=format.data -ali_result=$dir/ali - -. 
tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - nj=32 - # Prepare required data for ctc alignment - echo "Prepare data, prepare required format" - for x in $set; do - tools/format_data.sh --nj ${nj} \ - --feat-type wav --feat $feat_dir/$x/wav.scp \ - $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp - - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Test model, please specify the model you want to use by --checkpoint - python wenet/bin/alignment_deprecated.py --gpu -1 \ - --config $config \ - --input_file $ali_format \ - --checkpoint $checkpoint \ - --batch_size 1 \ - --dict $dict \ - --result_file $ali_result \ - -fi - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/analyze_dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/analyze_dataset.py deleted file mode 100644 index d4373b065c301972fe0164b6df3591166000acfc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/analyze_dataset.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Analyze Dataset, Duration/TextLength/Speed etc. - -Usage: -. ./path.sh && python3 tools/analyze_dataset.py \ - --data_type "shard" \ - --data_list data/test/data.list \ - --output_dir exp/analyze_test \ - --num_thread 32 -""" - -import os -import json -import math -import time -import numpy -import logging -import librosa -import tarfile -import argparse -import torchaudio -import multiprocessing - -from wenet.utils.file_utils import read_lists -from wenet.dataset.processor import AUDIO_FORMAT_SETS - - -def get_args(): - parser = argparse.ArgumentParser(description='Analyze dataset') - parser.add_argument('--data_type', - default='wav_scp', - choices=['wav_scp', 'raw', 'shard'], - help='dataset type') - parser.add_argument('--output_dir', type=str, - default="exp", help='write info to output dir') - parser.add_argument('--data_list', default=None, - help='used in raw/shard mode') - parser.add_argument('--wav_scp', default=None, - help='used in wav_scp mode') - parser.add_argument('--text', default=None, - help='used in wav_scp mode') - parser.add_argument('--num_thread', type=int, - default=4, help='number of threads') - args = parser.parse_args() - print(args) - return args - - -def analyze(datas, output_file, thread_id): - with open(output_file, "w", encoding='utf8') as f: - for i, data in enumerate(datas): - if type(data['wav']) is numpy.ndarray: - y, sample_rate = data['wav'], data['sample_rate'] - data['wav'] = "None" # NOTE(xcsong): Do not save wav. 
- elif type(data['wav'] is str): - y, sample_rate = librosa.load(data['wav'], sr=16000) - data['dur'] = len(y) / sample_rate - data['txt_length'] = len(data['txt']) - data['speed'] = data['txt_length'] / data['dur'] - # Trim the beginning and ending silence - _, index = librosa.effects.trim(y, top_db=30) - data['leading_sil'] = librosa.get_duration( - y=y[:index[0]], sr=16000) * 1000 if index[0] > 0 else 0 - data['trailing_sil'] = librosa.get_duration( - y=y[index[1]:], sr=16000) * 1000 if index[1] < len(y) else 0 - data_str = json.dumps(data, ensure_ascii=False) - f.write("{}\n".format(data_str)) - if thread_id == 0 and i % 100 == 0: - logging.info("\tThread-{}: processed {}/{}".format( - thread_id, i, len(datas))) - - -def read_tar(file): - try: - with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream: - prev_prefix = None - data = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - data['key'] = prev_prefix - if valid: - yield data - data = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - data['txt'] = file_obj.read().decode( - 'utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load( - file_obj) - # single channel - data['wav'] = waveform.numpy()[0, :] - data['sample_rate'] = sample_rate - else: - data[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning( - 'error: {} when parse {}'.format(ex, name)) - prev_prefix = prefix - # The last data in tar - if prev_prefix is not None: - data['key'] = prev_prefix - yield data - except Exception as ex: - logging.warning( - 'tar_file error: {} when processing {}'.format(ex, file)) - - -def main(): - start_time = time.time() - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.makedirs(args.output_dir, exist_ok=True) - os.makedirs(args.output_dir + "/partition", exist_ok=True) - datas = [[] for i in range(args.num_thread)] - - logging.info("Stage-1: Loading data.list OR wav.scp...") - if args.data_type == "shard": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - total = 0 - for line in lists: - for data in read_tar(line): - datas[total % args.num_thread].append(data) - total = total + 1 - elif args.data_type == "raw": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - for i, line in enumerate(lists): - data = json.loads(line) - datas[i % args.num_thread].append(data) - elif args.data_type == "wav_scp": - assert args.wav_scp is not None - assert args.text is not None - wavs, texts = {}, {} - # wavs - for line in read_lists(args.wav_scp): - line = line.strip().split() - wavs[line[0]] = line[1] - # texts - for line in read_lists(args.text): - line = line.strip().split(maxsplit=1) - texts[line[0]] = line[1] - sorted(wavs) - sorted(texts) - # partition - for i, (key1, key2) in enumerate(zip(wavs, texts)): - assert key1 == key2 - datas[i % args.num_thread].append( - {'key': key1, "wav": wavs[key1], "txt": texts[key1]} - ) - - logging.info("Stage-2: Start Analyze") - # threads - pool = multiprocessing.Pool(processes=args.num_thread) - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - pool.apply_async(analyze, (datas[i], output_file, i)) - pool.close() - 
pool.join() - - logging.info("Stage-3: Sort and Write Result") - datas = [] - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - with open(output_file, "r", encoding='utf8') as f: - for line in f.readlines(): - data = json.loads(line) - datas.append(data) - total_dur = sum([x['dur'] for x in datas]) - total_len = sum([x['txt_length'] for x in datas]) - total_leading_sil = sum([x['leading_sil'] for x in datas]) - total_trailing_sil = sum([x['trailing_sil'] for x in datas]) - num_datas = len(datas) - names = ['key', 'dur', 'txt_length', 'speed', - 'leading_sil', 'trailing_sil'] - units = ['', 's', '', 'char/s', 'ms', 'ms'] - avgs = [0, total_dur / num_datas, total_len / num_datas, - total_len / total_dur, total_leading_sil / num_datas, - total_trailing_sil / num_datas] - stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]), - sum([(x['txt_length'] - avgs[2])**2 for x in datas]), - sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]), - sum([(x['leading_sil'] - avgs[4])**2 for x in datas]), - sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])] - stds = [math.sqrt(x / num_datas) for x in stds] - parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min'] - index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - - with open(args.output_dir + "/analyze_result_brief", - "w", encoding='utf8') as f: - for i, (name, unit, avg, std) in enumerate( - zip(names, units, avgs, stds)): - if name == 'key': - continue - f.write("==================\n") - - datas.sort(key=lambda x: x[name]) - for p, j in zip(parts, index): - f.write("{} {}: {:.3f} {} (wav_id: {})\n".format( - p, name, datas[j][name], unit, datas[j]['key'])) - f.write("avg {}: {:.3f} {}\n".format( - name, avg, unit)) - f.write("std {}: {:.3f}\n".format( - name, std)) - os.system("cat {}".format(args.output_dir + "/analyze_result_brief")) - - datas.sort(key=lambda x: x['dur']) - with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f: - for data in datas: - f.write("{}\n".format(json.dumps(data, ensure_ascii=False))) - - end_time = time.time() - logging.info("Time Cost: {:.3f}s".format(end_time - start_time)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/cmvn_kaldi2json.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/cmvn_kaldi2json.py deleted file mode 100644 index 9966046c95a9d50438c4857b785cb7985182e376..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/cmvn_kaldi2json.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import logging -import sys -import json - -def kaldi2json(kaldi_cmvn_file): - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - cmvn_info = 
{'mean_stat:' : means, - 'var_stat' : variance, - 'frame_num' : count} - return cmvn_info - -if __name__ == '__main__': - with open(sys.argv[2], 'w') as fout: - cmvn = kaldi2json(sys.argv[1]) - fout.write(json.dumps(cmvn)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/combine_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/combine_data.sh deleted file mode 100644 index 8a56c43f1a2a238d78270f94f3d22f1af540e912..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/combine_data.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2014 David Snyder - -# This script combines the data from multiple source directories into -# a single destination directory. - -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information -# about what these directories contain. - -# Begin configuration section. -extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." -skip_fix=false # skip the fix_data_dir.sh in the end -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi - -if [ $# -lt 2 ]; then - echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." - echo "Note, files that don't appear in all source dirs will not be combined," - echo "with the exception of utt2uniq and segments, which are created where necessary." - exit 1 -fi - -dest=$1; -shift; - -first_src=$1; - -rm -r $dest 2>/dev/null -mkdir -p $dest; - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/utt2spk ]; then - echo "$0: no such file $dir/utt2spk" - exit 1; - fi -done - -# Check that frame_shift are compatible, where present together with features. -dir_with_frame_shift= -for dir in $*; do - if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then - if [[ $dir_with_frame_shift ]] && - ! cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then - echo "$0:error: different frame_shift in directories $dir and " \ - "$dir_with_frame_shift. Cannot combine features." - exit 1; - fi - dir_with_frame_shift=$dir - fi -done - -# W.r.t. utt2uniq file the script has different behavior compared to other files -# it is not compulsary for it to exist in src directories, but if it exists in -# even one it should exist in all. We will create the files where necessary -has_utt2uniq=false -for in_dir in $*; do - if [ -f $in_dir/utt2uniq ]; then - has_utt2uniq=true - break - fi -done - -if $has_utt2uniq; then - # we are going to create an utt2uniq file in the destdir - for in_dir in $*; do - if [ ! -f $in_dir/utt2uniq ]; then - # we assume that utt2uniq is a one to one mapping - cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' - else - cat $in_dir/utt2uniq - fi - done | sort -k1 > $dest/utt2uniq - echo "$0: combined utt2uniq" -else - echo "$0 [info]: not combining utt2uniq as it does not exist" -fi -# some of the old scripts might provide utt2uniq as an extrafile, so just remove it -extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") - -# segments are treated similarly to utt2uniq. If it exists in some, but not all -# src directories, then we generate segments where necessary. 
-has_segments=false -for in_dir in $*; do - if [ -f $in_dir/segments ]; then - has_segments=true - break - fi -done - -if $has_segments; then - for in_dir in $*; do - if [ ! -f $in_dir/segments ]; then - echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 - utils/data/get_segments_for_data.sh $in_dir - else - cat $in_dir/segments - fi - done | sort -k1 > $dest/segments - echo "$0: combined segments" -else - echo "$0 [info]: not combining segments as it does not exist" -fi - -for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do - exists_somewhere=false - absent_somewhere=false - for d in $*; do - if [ -f $d/$file ]; then - exists_somewhere=true - else - absent_somewhere=true - fi - done - - if ! $absent_somewhere; then - set -o pipefail - ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; - set +o pipefail - echo "$0: combined $file" - else - if ! $exists_somewhere; then - echo "$0 [info]: not combining $file as it does not exist" - else - echo "$0 [info]: **not combining $file as it does not exist everywhere**" - fi - fi -done - -tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt - -if [[ $dir_with_frame_shift ]]; then - cp $dir_with_frame_shift/frame_shift $dest -fi - -if ! $skip_fix ; then - tools/fix_data_dir.sh $dest || exit 1; -fi - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/compute-cer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/compute-cer.py deleted file mode 100644 index a0a8f8fe1f59251c5d8fefeb62ef469276fc6063..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/compute-cer.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import sys -import unicodedata -import codecs - -remove_tag = True -spacelist = [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - # https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': - sep = '>' - j = i + 1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c == sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: - return '' - chars = [] - i = 0 - T = len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - if x.isalnum(): - for k in x: - new_sentence.append(k) - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i - 1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j - 1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, - 'ins': 0, 'del': 0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = 
result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i={i} , j={j} , \ - error={error}'. - format(i=i, j=j, error=self.space[i][j]['error'])) - return result - - def overall(self) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def cluster(self, data) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [unicodedata.name(char) for char in word] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names) - 1) : - if unicode_names[i] != unicode_names[i + 1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) \ - and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] \ - [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \ - [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose = 1 - padding_symbol = ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose = 0 - try: - verbose = int(b) - except Exception: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol = ' ' - elif b == 'underline': - padding_symbol = '_' - continue - if True or sys.argv[1].startswith('-'): - # ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig = set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, - case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : 
- if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array) == 0: - continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length - len_lab) - space['rec'].append(length - len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end=' ') - else: - print('lab:', end=' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['lab'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end=' ') - else: - print('rec:', end=' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['rec'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===================================================' - '========================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster(k for k in default_clusters[cluster_id]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 
100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif (token[0] == '<' and token[len(token) - 1] == '>' and - cluster_id == ''): - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('=======================================' - '====================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/compute-wer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/compute-wer.py deleted file mode 100644 index a3eefc0dc7b67f252e685da71a5189312e74ef85..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/compute-wer.py +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import re, sys, unicodedata -import codecs - -remove_tag = True -spacelist= [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - #https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - 
j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array)==0: continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) 
- else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===========================================================================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster([ k for k in default_clusters[cluster_id] ]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - 
print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif token[0] == '<' and token[len(token)-1] == '>' and \ - cluster_id == '' : - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('===========================================================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/compute_cmvn_stats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/compute_cmvn_stats.py deleted file mode 100644 index 9c89789c47be0c855939469e86040f10398e9d89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/compute_cmvn_stats.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys -import argparse -import json -import codecs -import yaml - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.utils.data import Dataset, DataLoader - -torchaudio.set_audio_backend("sox_io") - - -class CollateFunc(object): - ''' Collate function for AudioDataset - ''' - - def __init__(self, feat_dim, resample_rate): - self.feat_dim = feat_dim - self.resample_rate = resample_rate - pass - - def __call__(self, batch): - mean_stat = torch.zeros(self.feat_dim) - var_stat = torch.zeros(self.feat_dim) - number = 0 - for item in batch: - value = item[1].strip().split(",") - assert len(value) == 3 or len(value) == 1 - wav_path = value[0] - sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate - resample_rate = sample_rate - # len(value) == 3 means segmented wav.scp, - # len(value) == 1 means original wav.scp - if len(value) == 3: - start_frame = int(float(value[1]) * sample_rate) - end_frame = int(float(value[2]) * sample_rate) - waveform, sample_rate = torchaudio.backend.sox_io_backend.load( - filepath=wav_path, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(item[1]) - - waveform = waveform * (1 << 15) - if self.resample_rate != 0 and self.resample_rate != sample_rate: - resample_rate = self.resample_rate - waveform = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - - mat = kaldi.fbank(waveform, - num_mel_bins=self.feat_dim, - dither=0.0, - energy_floor=0.0, - sample_frequency=resample_rate) - mean_stat += torch.sum(mat, axis=0) - var_stat += torch.sum(torch.square(mat), axis=0) - number += mat.shape[0] - return number, mean_stat, var_stat - - -class AudioDataset(Dataset): - def __init__(self, data_file): - self.items = [] - with codecs.open(data_file, 'r', encoding='utf-8') as f: - for line in f: - arr = line.strip().split() - self.items.append((arr[0], arr[1])) - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - return self.items[idx] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='extract CMVN stats') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for processing') - parser.add_argument('--train_config', - default='', - help='training yaml conf') - parser.add_argument('--in_scp', default=None, help='wav scp file') - parser.add_argument('--out_cmvn', - 
default='global_cmvn', - help='global cmvn file') - - doc = "Print log after every log_interval audios are processed." - parser.add_argument("--log_interval", type=int, default=1000, help=doc) - args = parser.parse_args() - - with open(args.train_config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - resample_rate = 0 - if 'resample_conf' in configs['dataset_conf']: - resample_rate = configs['dataset_conf']['resample_conf']['resample_rate'] - print('using resample and new sample rate is {}'.format(resample_rate)) - - collate_func = CollateFunc(feat_dim, resample_rate) - dataset = AudioDataset(args.in_scp) - batch_size = 20 - data_loader = DataLoader(dataset, - batch_size=batch_size, - shuffle=True, - sampler=None, - num_workers=args.num_workers, - collate_fn=collate_func) - - with torch.no_grad(): - all_number = 0 - all_mean_stat = torch.zeros(feat_dim) - all_var_stat = torch.zeros(feat_dim) - wav_number = 0 - for i, batch in enumerate(data_loader): - number, mean_stat, var_stat = batch - all_mean_stat += mean_stat - all_var_stat += var_stat - all_number += number - wav_number += batch_size - - if wav_number % args.log_interval == 0: - print(f'processed {wav_number} wavs, {all_number} frames', - file=sys.stderr, - flush=True) - - cmvn_info = { - 'mean_stat': list(all_mean_stat.tolist()), - 'var_stat': list(all_var_stat.tolist()), - 'frame_num': all_number - } - - with open(args.out_cmvn, 'w') as fout: - fout.write(json.dumps(cmvn_info)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/compute_fbank_feats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/compute_fbank_feats.py deleted file mode 100644 index 4cc7dae54de6e8b24b14148bd3930d19b4d7b28c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/compute_fbank_feats.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging - -import torchaudio -import torchaudio.compliance.kaldi as kaldi - -import wenet.dataset.kaldi_io as kaldi_io - -# The "sox" backends are deprecated and will be removed in 0.9.0 release. 
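The deleted compute_cmvn_stats.py above writes only raw accumulators (mean_stat, var_stat, frame_num) to the global_cmvn JSON. A hedged sketch of how such accumulators are conventionally turned into per-dimension mean / inverse-std normalizers follows; the variance floor and the function name are assumptions, not part of the deleted script.

```python
import json
import math

def load_global_cmvn(path, floor=1e-20):
    """Turn accumulated {mean_stat, var_stat, frame_num} into per-dim (mean, istd)."""
    with open(path, 'r') as f:
        stats = json.load(f)
    n = stats['frame_num']
    mean = [m / n for m in stats['mean_stat']]
    # variance = E[x^2] - E[x]^2, floored so the inverse std stays finite
    var = [max(v / n - mu * mu, floor) for v, mu in zip(stats['var_stat'], mean)]
    istd = [1.0 / math.sqrt(v) for v in var]
    return mean, istd

# a feature frame x is then normalized dimension-wise as (x - mean) * istd
```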
-# So here we use sox_io backend -torchaudio.set_audio_backend("sox_io") - - -def parse_opts(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--num_mel_bins', - default=80, - type=int, - help='Number of triangular mel-frequency bins') - parser.add_argument('--frame_length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument('--frame_shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument('--dither', - type=int, - default=0.0, - help='Dithering constant (0.0 means no dither)') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_scp', help='wav scp file') - parser.add_argument('out_ark', help='output ark file') - parser.add_argument('out_scp', help='output scp file') - args = parser.parse_args() - return args - - -# wav format: -def load_wav_scp(wav_scp_file): - wav_list = [] - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_list.append((arr[0], arr[1])) - return wav_list - - -# wav format: -def load_wav_scp_dict(wav_scp_file): - wav_dict = {} - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_dict[arr[0]] = arr[1] - return wav_dict - - -# Segments format: -def load_wav_segments(wav_scp_file, segments_file): - wav_dict = load_wav_scp_dict(wav_scp_file) - audio_list = [] - with open(segments_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - key = arr[0] - wav_file = wav_dict[arr[1]] - start = float(arr[2]) - end = float(arr[3]) - audio_list.append((key, wav_file, start, end)) - return audio_list - - -if __name__ == '__main__': - args = parse_opts() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - if args.segments is None: - audio_list = load_wav_scp(args.wav_scp) - else: - audio_list = load_wav_segments(args.wav_scp, args.segments) - - count = 0 - with open(args.out_ark, 'wb') as ark_fout, \ - open(args.out_scp, 'w', encoding='utf8') as scp_fout: - for item in audio_list: - if len(item) == 2: - key, wav_path = item - waveform, sample_rate = torchaudio.load_wav(wav_path) - else: - assert len(item) == 4 - key, wav_path, start, end = item - sample_rate = torchaudio.info(wav_path).sample_rate - frame_offset = int(start * sample_rate) - num_frames = int((end - start) * sample_rate) - waveform, sample_rate = torchaudio.load_wav( - wav_path, frame_offset, num_frames) - - mat = kaldi.fbank(waveform, - num_mel_bins=args.num_mel_bins, - frame_length=args.frame_length, - frame_shift=args.frame_shift, - dither=args.dither, - energy_floor=0.0, - sample_frequency=sample_rate) - mat = mat.detach().numpy() - kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout) - count += 1 - if count % 10000 == 0: - logging.info('Progress {}/{}'.format(count, len(audio_list))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/copy_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/copy_data_dir.sh deleted file mode 100644 index ee880c4c3ca398a58a4e306467c639b0a76310bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/copy_data_dir.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins University (author: 
Daniel Povey) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# feats.scp -# wav.scp -# vad.scp -# spk2utt -# utt2spk -# text -# -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance and/or speaker names. Note, the recording-ids stay the same. -# - - -# begin configuration section -spk_prefix= -utt_prefix= -spk_suffix= -utt_suffix= -validate_opts= # should rarely be needed. -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" - echo "Options" - echo " --spk-prefix= # Prefix for speaker ids, default empty" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --spk-suffix= # Suffix for speaker ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -srcdir=$1 -destdir=$2 - -if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" - exit 1; -fi - -if [ "$destdir" == "$srcdir" ]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -set -e; - -mkdir -p $destdir - -cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map -cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map - -if [ ! -f $srcdir/utt2uniq ]; then - if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then - cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq - fi -else - cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq -fi - -cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ - utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk - -utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt - -if [ -f $srcdir/feats.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp -fi - -if [ -f $srcdir/vad.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp -fi - -if [ -f $srcdir/segments ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments - cp $srcdir/wav.scp $destdir -else # no segments->wav indexed by utt. 
- if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/utt2num_frames ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in frame_shift stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -echo $validate_opts -echo $destdir -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/decode.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/decode.sh deleted file mode 100644 index 1d49b0e48631f4818fb9c464df66904170275a33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/decode.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: binbinzhang@mobvoi.com (Binbin Zhang) -export GLOG_logtostderr=1 -export GLOG_v=2 - -set -e - -nj=1 -chunk_size=-1 -ctc_weight=0.0 -reverse_weight=0.0 -rescoring_weight=1.0 -# For CTC WFST based decoding -fst_path= -dict_path= -acoustic_scale=1.0 -beam=15.0 -lattice_beam=12.0 -min_active=200 -max_active=7000 -blank_skip_thresh=1.0 -length_penalty=0.0 - -. tools/parse_options.sh || exit 1; -if [ $# != 5 ]; then - echo "Usage: $0 [options] " - exit 1; -fi - -if ! which decoder_main > /dev/null; then - echo "decoder_main is not built, please go to runtime/libtorch to build it." - exit 1; -fi - -scp=$1 -label_file=$2 -model_file=$3 -unit_file=$4 -dir=$5 - -mkdir -p $dir/split${nj} - -# Step 1. Split wav.scp -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp" -done -tools/data/split_scp.pl ${scp} ${split_scps} - -# Step 2. Parallel decoding -wfst_decode_opts= -if [ ! 
-z $fst_path ]; then - wfst_decode_opts="--fst_path $fst_path" - wfst_decode_opts="$wfst_decode_opts --beam $beam" - wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path" - wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam" - wfst_decode_opts="$wfst_decode_opts --max_active $max_active" - wfst_decode_opts="$wfst_decode_opts --min_active $min_active" - wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale" - wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh" - wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty" - echo $wfst_decode_opts > $dir/config -fi -for n in $(seq ${nj}); do -{ - decoder_main \ - --rescoring_weight $rescoring_weight \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --chunk_size $chunk_size \ - --wav_scp ${dir}/split${nj}/wav.${n}.scp \ - --model_path $model_file \ - --unit_path $unit_file \ - $wfst_decode_opts \ - --result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log -} & -done -wait - -# Step 3. Merge files -for n in $(seq ${nj}); do - cat ${dir}/split${nj}/${n}.text -done > ${dir}/text -tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf - -# Step 4. Compute WER -python3 tools/compute-wer.py --char=1 --v=1 \ - $label_file $dir/text > $dir/wer diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/feat_to_shape.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/feat_to_shape.sh deleted file mode 100644 index ab6d45c60709dd05a38f8da269d617233d0d39f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/feat_to_shape.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Begin configuration section. -nj=4 -cmd=run.pl -verbose=0 -filetype="" -preprocess_conf="" -# End configuration section. - -help_message=$(cat << EOF -Usage: $0 [options] [] -e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) - -echo "$0 $*" 1>&2 # Print the command line for logging - -. parse_options.sh || exit 1; - -if [ $# -lt 2 ] || [ $# -gt 3 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -scp=$1 -outscp=$2 -data=$(dirname ${scp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${logdir}/feats.${n}.scp" -done - -utils/split_scp.pl ${scp} ${split_scps} - -if [ -n "${preprocess_conf}" ]; then - preprocess_opt="--preprocess-conf ${preprocess_conf}" -else - preprocess_opt="" -fi -if [ -n "${filetype}" ]; then - filetype_opt="--filetype ${filetype}" -else - filetype_opt="" -fi - -${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ - feat-to-len --verbose=${verbose} \ - scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp - -feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -) - -# concatenate the .scp files together. 
-for n in $(seq ${nj}); do - sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp -done > ${outscp} - -rm -f ${logdir}/feats.*.scp 2>/dev/null diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/filter_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/filter_scp.pl deleted file mode 100644 index b76d37f41be0886470281978bfacf97f6b8ae976..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/filter_scp.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose "n-th" field is an utterance id), printing -# out only those lines whose "n-th" field is in id_list. The index of -# the "n-th" field is 1, by default, but can be changed by using -# the -f switch - -$exclude = 0; -$field = 1; -$shifted = 0; - -do { - $shifted=0; - if ($ARGV[0] eq "--exclude") { - $exclude = 1; - shift @ARGV; - $shifted=1; - } - if ($ARGV[0] eq "-f") { - $field = $ARGV[1]; - shift @ARGV; shift @ARGV; - $shifted=1 - } -} while ($shifted); - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . - "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . - "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . - "only the lines that were *not* in id_list.\n" . - "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . - "If your older scripts (written before Oct 2014) stopped working and you used the\n" . - "-f option, add 1 to the argument.\n" . - "See also: utils/filter_scp.pl .\n"; -} - - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -if ($field == 1) { # Treat this as special case, since it is common. - while(<>) { - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { - print $_; - } - } -} else { - while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { - print $_; - } - } -} - -# tests: -# the following should print "foo 1" -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) -# the following should print "bar 2". 
-# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fix_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fix_data_dir.sh deleted file mode 100644 index d1644c1cac4264c78eae7d91b03c4126baf7ec4c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fix_data_dir.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# This script makes sure that only the segments present in -# all of "feats.scp", "wav.scp" [if present], segments [if present] -# text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into -# data-dir/.backup - -cmd="$@" - -utt_extra_files= -spk_extra_files= - -. tools/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: utils/data/fix_data_dir.sh " - echo "e.g.: utils/data/fix_data_dir.sh data/train" - echo "This script helps ensure that the various files in a data directory" - echo "are correctly sorted and filtered, for example removing utterances" - echo "that have no features (if feats.scp is present)" - exit 1 -fi - -data=$1 - -if [ -f $data/images.scp ]; then - image/fix_data_dir.sh $cmd - exit $? -fi - -mkdir -p $data/.backup - -[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; - -[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; - -set -e -o pipefail -u - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted { - file=$1 - sort -k1,1 -u <$file >$file.tmp - if ! cmp -s $file $file.tmp; then - echo "$0: file $1 is not in sorted order or not unique, sorting it" - mv $file.tmp $file - else - rm $file.tmp - fi -} - -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - check_sorted $data/$x - fi -done - - -function filter_file { - filter=$1 - file_to_filter=$2 - cp $file_to_filter ${file_to_filter}.tmp - tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter - if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=$(cat ${file_to_filter}.tmp | wc -l) - length2=$(cat ${file_to_filter} | wc -l) - if [ $length1 -ne $length2 ]; then - echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." - fi - fi - rm $file_to_filter.tmp -} - -function filter_recordings { - # We call this once before the stage when we filter on utterance-id, and once - # after. - - if [ -f $data/segments ]; then - # We have a segments file -> we need to filter this and the file wav.scp, and - # reco2file_and_utt, if it exists, to make sure they have the same list of - # recording-ids. - - if [ ! -f $data/wav.scp ]; then - echo "$0: $data/segments exists but not $data/wav.scp" - exit 1; - fi - awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=$(cat $tmpdir/recordings | wc -l) - [ ! -s $tmpdir/recordings ] && \ - echo "Empty list of recordings (bad file $data/segments)?" 
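fix_data_dir.sh above repeatedly pipes its tables through tools/filter_scp.pl to keep only the surviving utterance ids. A hedged Python restatement of that filter, with the 1-based field index and the --exclude behaviour described in the deleted Perl, is sketched below; names are illustrative.

```python
import sys

def filter_scp(id_list_path, scp_lines, field=1, exclude=False):
    """Print scp lines whose `field`-th token (1-based) is / is not in the id list."""
    keep = set()
    with open(id_list_path, encoding='utf8') as f:
        for line in f:
            parts = line.split()
            if parts:
                keep.add(parts[0])        # only the first field of the id list matters
    for line in scp_lines:
        parts = line.split()
        hit = len(parts) >= field and parts[field - 1] in keep
        if hit != exclude:                # matches normally, non-matches with exclude=True
            sys.stdout.write(line)

# filter_scp('utts.txt', open('wav.scp', encoding='utf8')) keeps only surviving utterances
```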
&& exit 1; - tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp - mv $tmpdir/recordings.tmp $tmpdir/recordings - - - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - filter_file $tmpdir/recordings $data/segments - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - rm $data/segments.tmp - - filter_file $tmpdir/recordings $data/wav.scp - [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur - true - fi -} - -function filter_speakers { - # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... - tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do - f=$data/$s - if [ -f $f ]; then - filter_file $f $tmpdir/speakers - fi - done - - filter_file $tmpdir/speakers $data/spk2utt - tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - - for s in cmvn.scp spk2gender $spk_extra_files; do - f=$data/$s - if [ -f $f ]; then - filter_file $tmpdir/speakers $f - fi - done -} - -function filter_utts { - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - echo "$(cat $tmpdir/utts | wc -l)" - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; - - ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; - - ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ - echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - - if [ -f $data/utt2uniq ]; then - ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ - echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; - fi - - maybe_wav= - maybe_reco2dur= - [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. - [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - - maybe_utt2dur= - if [ -f $data/utt2dur ]; then - cat $data/utt2dur | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1 - maybe_utt2dur=utt2dur.ok - fi - - maybe_utt2num_frames= - if [ -f $data/utt2num_frames ]; then - cat $data/utt2num_frames | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1 - maybe_utt2num_frames=utt2num_frames.ok - fi - - for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do - if [ -f $data/$x ]; then - tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp - echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)" - mv $tmpdir/utts.tmp $tmpdir/utts - # echo "$tmpdir/utts" - fi - done - rm $data/utt2dur.ok 2>/dev/null || true - rm $data/utt2num_frames.ok 2>/dev/null || true - - [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ - rm $tmpdir/utts && exit 1; - - - if [ -f $data/utt2spk ]; then - new_nutts=$(cat $tmpdir/utts | wc -l) - old_nutts=$(cat $data/utt2spk | wc -l) - if [ $new_nutts -ne $old_nutts ]; then - echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" - else - echo "fix_data_dir.sh: kept all $old_nutts utterances." 
- fi - fi - - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then - tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x - fi - fi - done - -} - -filter_recordings -filter_speakers -filter_utts -filter_speakers -filter_recordings - -tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - -echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/flake8_hook.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/flake8_hook.py deleted file mode 100644 index bbe21bf4aa8ab460aca0eba5a24785e4d6b2c39d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -import sys - -from flake8.main import git - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/format_data.sh deleted file mode 100644 index 51f4602dfa0bac7873541c7f621ef4bb9eb29c94..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/format_data.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Mobvoi Corporation (Author: Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -echo "$0 $*" >&2 # Print the command line for logging -. ./path.sh - -nj=1 -cmd=run.pl -nlsyms="" -lang="" -feat="" -feat_type="kaldi" -oov="" -bpecode="" -allow_one_column=false -raw="" -verbose=0 -trans_type=char -filetype="" -preprocess_conf="" -category="" -out="" # If omitted, write in stdout -help_message=$(cat << EOF -Usage: $0 -e.g. $0 data/train data/lang_1char/train_units.txt -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --feat # feat.scp or feat1.scp,feat2.scp,... - --feat-type # kaldi or wav - --oov # Default: - --out # If omitted, write in stdout - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) -. tools/parse_options.sh - -if [ $# != 2 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -dir=$1 -dic=$2 -tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) -#trap 'rm -rf ${tmpdir}' EXIT - -# 1. 
Create scp files for inputs -# These are not necessary for decoding mode, and make it as an option -input= -if [ -n "${feat}" ]; then - _feat_scps=$(echo "${feat}" | tr ',' ' ' ) - read -r -a feat_scps <<< $_feat_scps - num_feats=${#feat_scps[@]} - - for (( i=1; i<=num_feats; i++ )); do - feat=${feat_scps[$((i-1))]} - mkdir -p ${tmpdir}/input_${i} - input+="input_${i} " - cat ${feat} > ${tmpdir}/input_${i}/feat.scp - - # Dump in the "legacy" style JSON format - if [ -n "${filetype}" ]; then - awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ - > ${tmpdir}/input_${i}/filetype.scp - fi - - if [ ${feat_type} == "kaldi" ]; then - tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ - --filetype "${filetype}" \ - --preprocess-conf "${preprocess_conf}" \ - --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp - elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then - if [ -f $dir/segments ]; then - # used for segmented wav.scp - awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur - fi - if [ ! -f $dir/utt2dur ]; then - tools/wav_to_duration.sh --nj ${nj} \ - ${feat} ${tmpdir}/input_${i}/shape.scp - # use the existed utt2dur as shape.scp directly - else - cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp - fi - fi - done -fi - -# 2. Create scp files for outputs -mkdir -p ${tmpdir}/output -if [ -n "${bpecode}" ]; then - if [ "${trans_type}" == "cn_char_en_bpe" ]; then - tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp - else - paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \ - | tools/spm_encode --model=${bpecode} --output_format=piece) \ - > ${tmpdir}/output/token.scp - fi -elif [ -n "${nlsyms}" ]; then - tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -elif [ -n "${raw}" ]; then - cat $dir/text > ${tmpdir}/output/token.scp -else - tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -fi -< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp -odim=$(cat ${dic} | wc -l) -< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp - -cat ${dir}/text > ${tmpdir}/output/text.scp - -# 3. Create scp files for the others -mkdir -p ${tmpdir}/other -if [ -n "${lang}" ]; then - awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp -fi - -if [ -n "${category}" ]; then - awk -v category=${category} '{print $1 " " category}' ${dir}/text \ - > ${tmpdir}/other/category.scp -fi -#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp - -# 4. 
Merge scp files into a one file -opts="" -for intype in ${input} output other; do - if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then - continue - fi - - if [ ${intype} != other ]; then - opts+="--${intype%_*}-scps " - else - opts+="--scps " - fi - - for x in "${tmpdir}/${intype}"/*.scp; do - k=$(basename ${x} .scp) - if [ ${k} = shape ]; then - opts+="shape:${x}:shape " - else - opts+="${k}:${x} " - fi - done -done - -if ${allow_one_column}; then - opts+="--allow-one-column true " -else - opts+="--allow-one-column false " -fi - -if [ -n "${out}" ]; then - opts+="-O ${out}" -fi - -tools/merge_scp2txt.py --verbose ${verbose} ${opts} - -#rm -fr ${tmpdir} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/add_lex_disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/add_lex_disambig.pl deleted file mode 100644 index dd8a25de6e1140a6d19b1e876f2e76f528532edf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/add_lex_disambig.pl +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2013-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. -# With the --pron-probs option, expects the second field -# of each lexicon line to be a pron-prob. -# With the --sil-probs option, expects three additional -# fields after the pron-prob, representing various components -# of the silence probability model. - -$pron_probs = 0; -$sil_probs = 0; -$first_allowed_disambig = 1; - -for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { - if ($ARGV[0] eq "--pron-probs") { - $pron_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--sil-probs") { - $sil_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--first-allowed-disambig") { - $first_allowed_disambig = 0 + $ARGV[1]; - if ($first_allowed_disambig < 1) { - die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; - } - shift @ARGV; - shift @ARGV; - } -} - -if (@ARGV != 2) { - die "Usage: add_lex_disambig.pl [opts] \n" . - "This script adds disambiguation symbols to a lexicon in order to\n" . - "make decoding graphs determinizable; it adds pseudo-phone\n" . - "disambiguation symbols #1, #2 and so on at the ends of phones\n" . - "to ensure that all pronunciations are different, and that none\n" . - "is a prefix of another.\n" . - "It prints to the standard output the number of the largest-numbered" . - "disambiguation symbol that was used.\n" . - "\n" . - "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . 
- " --sil-probs [should be with --pron-probs option]\n" . - " Expect 3 extra fields after the pron-probs, for aspects of\n" . - " the silence probability model\n" . - " --first-allowed-disambig The number of the first disambiguation symbol\n" . - " that this script is allowed to add. By default this is\n" . - " #1, but you can set this to a larger value using this option.\n" . - "e.g.:\n" . - " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { - $p = shift @A; - if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } - } - if ($sil_probs) { - $silp = shift @A; - if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - } - if (!(@A)) { - die "Bad lexicon line $1, no phone in phone list"; - } - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that it exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { shift @A; } # remove pron-prob. - if ($sil_probs) { - shift @A; # Remove silprob - shift @A; # Remove silprob - } - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -# max_disambig will always be the highest-numbered disambiguation symbol that -# has been used so far. -$max_disambig = $first_allowed_disambig - 1; - -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - if ($pron_probs) { - $pron_prob = shift @A; - } - if ($sil_probs) { - $sil_word_prob = shift @A; - $word_sil_correction = shift @A; - $prev_nonsil_correction = shift @A - } - $phnseq = join(" ", @A); - if (!defined $issubseq{$phnseq} - && $count{$phnseq} == 1) { - ; # Do nothing. - } else { - if ($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved_for_the_empty_string{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; - if (!defined $cur_disambig) { - $cur_disambig = $first_allowed_disambig; - } else { - $cur_disambig++; # Get a number that has not been used yet for - # this phone sequence. - } - while (defined $reserved_for_the_empty_string{$cur_disambig}) { - $cur_disambig++; - } - if ($cur_disambig > $max_disambig) { - $max_disambig = $cur_disambig; - } - $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; - $phnseq = $phnseq . " #" . 
$cur_disambig; - } - } - if ($pron_probs) { - if ($sil_probs) { - print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; - } else { - print O "$word\t$pron_prob\t$phnseq\n"; - } - } else { - print O "$word\t$phnseq\n"; - } -} - -print $max_disambig . "\n"; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/compile_lexicon_token_fst.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/compile_lexicon_token_fst.sh deleted file mode 100644 index b67814fe3f3244b14b8e494bfe46c4829c4f8bd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/compile_lexicon_token_fst.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -# Copyright 2015 Yajie Miao (Carnegie Mellon University) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the -# phoneme and character-based lexicons. -set -eo pipefail -. tools/parse_options.sh - -if [ $# -ne 3 ]; then - echo "usage: tools/fst/compile_lexicon_token_fst.sh " - echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" - echo " should contain the following files:" - echo "lexicon.txt units.txt" - echo "options: " - exit 1; -fi - -srcdir=$1 -tmpdir=$2 -dir=$3 -mkdir -p $dir $tmpdir - -[ -f path.sh ] && . ./path.sh - -export LC_ALL=C - -cp $srcdir/units.txt $dir - -# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. -# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. -perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; - -# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. -# Without these symbols, determinization will fail. -ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` -ndisambig=$[$ndisambig+1]; - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list - -# Get the full list of CTC tokens used in FST. These tokens include , the blank , -# the actual model unit, and the disambiguation symbols. -cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list -(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt - -# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, -# so here use ctc_token_fst_compact -tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; - -# Encode the words with indices. 
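The rule implemented by the deleted add_lex_disambig.pl is: a pronunciation keeps its phone sequence unchanged only if that sequence is unique and is not a proper prefix of another entry; otherwise it is suffixed with #1, #2, ... per repeated sequence. A hedged Python sketch of that core rule follows; the pron-prob/sil-prob fields, the empty-pronunciation case, and --first-allowed-disambig are ignored, and names are illustrative.

```python
from collections import Counter

def add_disambig(lexicon):
    """lexicon: list of (word, [phones]); returns entries with '#N' appended when needed."""
    counts = Counter(tuple(phones) for _, phones in lexicon)
    prefixes = set()
    for _, phones in lexicon:
        for k in range(1, len(phones)):            # every proper left prefix
            prefixes.add(tuple(phones[:k]))
    last_used = {}                                  # phone sequence -> last '#N' issued
    out = []
    for word, phones in lexicon:
        seq = tuple(phones)
        if counts[seq] == 1 and seq not in prefixes:
            out.append((word, list(phones)))        # unique and not a prefix: unchanged
        else:
            n = last_used.get(seq, 0) + 1           # first clash gets '#1', the next '#2', ...
            last_used[seq] = n
            out.append((word, list(phones) + [f'#{n}']))
    return out

# two homophones sharing one phone sequence come back tagged '#1' and '#2',
# which is what keeps the composed lexicon FST determinizable
```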
Will be used in lexicon and language model FST compiling. -cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' - BEGIN { - print " 0"; - } - { - printf("%s %d\n", $1, NR); - } - END { - printf("#0 %d\n", NR+1); - printf(" %d\n", NR+2); - printf(" %d\n", NR+3); - }' > $dir/words.txt || exit 1; - -# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time. -token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` - -tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ - fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; - -echo "Lexicon and token FSTs compiling succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/ctc_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/ctc_token_fst.py deleted file mode 100644 index d81644b9cd216177a10a17772781d3293abe084f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/ctc_token_fst.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 1 ') -print('1 1 ') -print('2 2 ') -print('2 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 3 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(1, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 2, '', '')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/ctc_token_fst_compact.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/ctc_token_fst_compact.py deleted file mode 100644 index d3018d8b14ce25108cb1acc637cecded5d41be13..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/ctc_token_fst_compact.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 1 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 0, '', '')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/ctc_token_fst_corrected.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/ctc_token_fst_corrected.py deleted file mode 100644 index 81f7079eccb9e6447c46cdfdf6378aca7efe4a09..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/ctc_token_fst_corrected.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python - -import sys - - -def il(n): - return n + 1 - - 
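The deleted ctc_token_fst_compact.py above emits the CTC token FST (T.fst) in OpenFst text form; the <eps>/<blank> symbol names used below are assumed from the usual WeNet/Kaldi convention. A hedged sketch of the compact construction for plain units (disambiguation symbols omitted):

```python
def compact_ctc_token_arcs(units):
    """Yield OpenFst text arcs for the compact CTC token FST over plain units."""
    yield '0 0 <blank> <eps>'                  # blank self-loop on the single loop state
    for node, phone in enumerate(units, start=1):
        yield f'0 {node} {phone} {phone}'      # first frame of the unit emits the label
        yield f'{node} {node} {phone} <eps>'   # repeated frames collapse to epsilon
        yield f'{node} 0 <eps> <eps>'          # free return to the loop state
    yield '0'                                  # state 0 is final

# print('\n'.join(compact_ctc_token_arcs(['a', 'b']))) mirrors what the deleted
# ctc_token_fst_compact.py prints for a two-unit token table (disambig symbols aside)
```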
-def ol(n): - return n + 1 - - -def s(n): - return n - - -if __name__ == "__main__": - with open(sys.argv[1]) as f: - lines = f.readlines() - phone_count = 0 - disambig_count = 0 - for line in lines: - sp = line.split() - phone = sp[0] - if phone == '' or phone == '': - continue - if phone.startswith('#'): - disambig_count += 1 - else: - phone_count += 1 - - # 1. add start state - print('0 0 {} 0'.format(il(0))) - - # 2. 0 -> i, i -> i, i -> 0 - for i in range(1, phone_count + 1): - print('0 {} {} {}'.format(s(i), il(i), ol(i))) - print('{} {} {} 0'.format(s(i), s(i), il(i))) - print('{} 0 {} 0'.format(s(i), il(0))) - - # 3. i -> other phone - for i in range(1, phone_count + 1): - for j in range(1, phone_count + 1): - if i != j: - print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) - - # 4. add disambiguous arcs on every final state - for i in range(0, phone_count + 1): - for j in range(phone_count + 2, phone_count + disambig_count + 2): - print('{} {} {} {}'.format(s(i), s(i), 0, j)) - - # 5. every i is final state - for i in range(0, phone_count + 1): - print(s(i)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/eps2disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/eps2disambig.pl deleted file mode 100644 index e1d84a6bf56703596a0e4552d184f7168f724bcb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/eps2disambig.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - if (/\s+#0\s+/) { - print STDERR "$0: ERROR: LM has word #0, " . - "which is reserved as disambiguation symbol\n"; - exit 1; - } - s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; - print; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/make_lexicon_fst.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/make_lexicon_fst.pl deleted file mode 100644 index f97129c05cb3ba6460be401e92001261acfaf746..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/make_lexicon_fst.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). - -$pron_probs = 0; - -if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { - $pron_probs = 1; - shift @ARGV; -} - -if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; - print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; - print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; - print STDERR " word phone1 phone2 ... phoneN;\n"; - print STDERR "if the --pron-probs option is used, each line is:\n"; - print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; - print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; - print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; - print STDERR "this is your responsibility.\n\n"; - print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; - print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; - print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; - exit(1); -} - -$lexfn = shift @ARGV; -if (@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2) { - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if ($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - -if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. - while () { - @A = split(" ", $_); - @A == 0 && die "Empty lexicon line."; - foreach $a (@A) { - if ($a eq "") { - die "Bad lexicon line $_ ( is forbidden)"; - } - } - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; # so we only print it on the first arc of the word. - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. 
- if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while () { - @A = split(" ", $_); - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. - $s = $ns; - } elsif (!defined($silphone) || $p ne $silphone) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/make_tlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/make_tlg.sh deleted file mode 100644 index 98694e5540968760f0c27eaf30a6668f4c46c50d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/make_tlg.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# - -if [ -f path.sh ]; then . path.sh; fi - -lm_dir=$1 -src_lang=$2 -tgt_lang=$3 - -arpa_lm=${lm_dir}/lm.arpa -[ ! 
-f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -rm -rf $tgt_lang -cp -r $src_lang $tgt_lang - -# Compose the language model to FST -cat $arpa_lm | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v -i '' | \ - grep -v -i '' | \ - arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \ - tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \ - --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst - - -echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic $tgt_lang/G.fst - -# Compose the token, lexicon and language-model FST into the final decoding graph -fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1; -fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1; - -echo "Composing decoding graph TLG.fst succeeded" -#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/prepare_dict.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/prepare_dict.py deleted file mode 100644 index 8a6a3cfe7cfded0c863637deef0bae2f2ede5557..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/prepare_dict.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -# sys.argv[1]: e2e model unit file(lang_char.txt) -# sys.argv[2]: raw lexicon file -# sys.argv[3]: output lexicon file -# sys.argv[4]: bpemodel - -unit_table = set() -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for line in fin: - unit = line.split()[0] - unit_table.add(unit) - - -def contain_oov(units): - for unit in units: - if unit not in unit_table: - return True - return False - - -bpemode = len(sys.argv) > 4 -if bpemode: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.Load(sys.argv[4]) -lexicon_table = set() -with open(sys.argv[2], 'r', encoding='utf8') as fin, \ - open(sys.argv[3], 'w', encoding='utf8') as fout: - for line in fin: - word = line.split()[0] - if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel - continue - elif word == '': - continue - else: - # each word only has one pronunciation for e2e system - if word in lexicon_table: - continue - if bpemode: - # We assume that the lexicon does not contain code-switch, - # i.e. the word contains both English and Chinese. - # see PR https://github.com/wenet-e2e/wenet/pull/1693 - # and Issue https://github.com/wenet-e2e/wenet/issues/1653 - if word.encode('utf8').isalpha(): - pieces = sp.EncodeAsPieces(word) - else: - pieces = word - if contain_oov(pieces): - print( - 'Ignoring words {}, which contains oov unit'.format( - ''.join(word).strip('▁')) - ) - continue - chars = ' '.join( - [p if p in unit_table else '' for p in pieces]) - else: - # ignore words with OOV - if contain_oov(word): - print('Ignoring words {}, which contains oov unit'.format(word)) - continue - # Optional, append ▁ in front of english word - # we assume the model unit of our e2e system is char now. 
- if word.encode('utf8').isalpha() and '▁' in unit_table: - word = '▁' + word - chars = ' '.join(word) # word is a char list - fout.write('{} {}\n'.format(word, chars)) - lexicon_table.add(word) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/remove_oovs.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/remove_oovs.pl deleted file mode 100644 index ac914c3bd9363eded791cdeb309fd05e980c4f2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/remove_oovs.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script removes lines that contain these OOVs on either the -# third or fourth fields of the line. It is intended to remove arcs -# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). - -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; -} - -$unklist = shift @ARGV; -open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; -while(){ - @A = split(" ", $_); - @A == 1 || die "Bad line in unknown-symbol list: $_"; - $unk{$A[0]} = 1; -} - -$num_removed = 0; -while(<>){ - @A = split(" ", $_); - if(defined $unk{$A[2]} || defined $unk{$A[3]}) { - $num_removed++; - } else { - print; - } -} -print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/rnnt_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/rnnt_token_fst.py deleted file mode 100644 index cc6def1703311ab700a4a01f22c1adda32db9b0d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/rnnt_token_fst.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, 0, phone, phone)) -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/s2eps.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/s2eps.pl deleted file mode 100644 index ffeeb8eb6af3c4f319f31ebff80be388d8f59e1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/fst/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in 
compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces and with (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } - if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } - } - print join("\t", @A) . "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/git-pre-commit b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/git-pre-commit deleted file mode 100644 index b6e448ed375a0ddf502ce332685de8a99e88dc08..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/git-pre-commit +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -e - -echo "Running pre-commit flake8" -python tools/flake8_hook.py diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/install_srilm.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/install_srilm.sh deleted file mode 100644 index 4aa113c14722a73fd3d3f84430025d44173c207b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/install_srilm.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2022 Binbin Zhang(binbzha@qq.com) - -current_path=`pwd` -current_dir=`basename "$current_path"` - -if [ "tools" != "$current_dir" ]; then - echo "You should run this script in tools/ directory!!" - exit 1 -fi - -! command -v gawk > /dev/null && \ - echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; - -srilm_url="https://github.com/BitSpeech/SRILM/archive/refs/tags/1.7.3.tar.gz" - -if [ ! -f ./srilm.tar.gz ]; then - if ! wget -O ./srilm.tar.gz "$srilm_url"; then - echo 'There was a problem downloading the file.' - echo 'Check you internet connection and try again.' - exit 1 - fi -fi - -tar -zxvf srilm.tar.gz -mv SRILM-1.7.3 srilm - -# set the SRILM variable in the top-level Makefile to this directory. -cd srilm -cp Makefile tmpf - -cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \ - > Makefile || exit 1 -rm tmpf - -make || exit -cd .. - -( - [ ! -z "${SRILM}" ] && \ - echo >&2 "SRILM variable is aleady defined. Undefining..." && \ - unset SRILM - - [ -f ./env.sh ] && . ./env.sh - - [ ! 
-z "${SRILM}" ] && \ - echo >&2 "SRILM config is already in env.sh" && exit - - wd=`pwd` - wd=`readlink -f $wd || pwd` - - echo "export SRILM=$wd/srilm" - dirs="\${PATH}" - for directory in $(cd srilm && find bin -type d ) ; do - dirs="$dirs:\${SRILM}/$directory" - done - echo "export PATH=$dirs" -) >> env.sh - -echo >&2 "Installation of SRILM finished successfully" -echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/k2/make_hlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/k2/make_hlg.sh deleted file mode 100644 index 18c2268487410824ae11b199cf06f37acd717c88..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/k2/make_hlg.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) - -lexion_dir=$1 -lm_dir=$2 -tgt_dir=$3 - -# k2 and icefall updates very fast. Below commits are veryfied in this script. -# k2 3dc222f981b9fdbc8061b3782c3b385514a2d444, icefall 499ac24ecba64f687ff244c7d66baa5c222ecf0f - -# For k2 installation, please refer to https://github.com/k2-fsa/k2/ -python -c "import k2; print(k2.__file__)" -python -c "import torch; import _k2; print(_k2.__file__)" - -# Prepare necessary icefall scripts -if [ ! -d tools/k2/icefall ]; then - git clone --depth 1 https://github.com/k2-fsa/icefall.git tools/k2/icefall -fi -pip3 install -r tools/k2/icefall/requirements.txt -export PYTHONPATH=`pwd`/tools/k2/icefall:`pwd`/tools/k2/icefall/egs/aishell/ASR/local:$PYTHONPATH - -# 8.1 Prepare char based lang -mkdir -p $tgt_dir -python tools/k2/prepare_char.py $lexion_dir/units.txt $lm_dir/wordlist $tgt_dir -echo "Compile lexicon L.pt L_disambig.pt succeeded" - -# 8.2 Prepare G -mkdir -p data/lm -python -m kaldilm \ - --read-symbol-table="$tgt_dir/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $lm_dir/lm.arpa > data/lm/G_3_gram.fst.txt - -# 8.3 Compile HLG -python tools/k2/icefall/egs/aishell/ASR/local/compile_hlg.py --lang-dir $tgt_dir -echo "Compile decoding graph HLG.pt succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/k2/prepare_char.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/k2/prepare_char.py deleted file mode 100644 index 6e05042c42eb280135f6be7cdb3566b185258b90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/k2/prepare_char.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
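# Invoked from tools/k2/make_hlg.sh (step "8.1 Prepare char based lang") as:
#   python tools/k2/prepare_char.py $lexion_dir/units.txt $lm_dir/wordlist $tgt_dir
# i.e. sys.argv[1] is the e2e unit table, sys.argv[2] is the word list, and
# sys.argv[3] is the output lang directory.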
- - -""" - -This script generates the following files in the directory sys.argv[3]: - - - lexicon.txt - - lexicon_disambig.txt - - L.pt - - L_disambig.pt - - tokens.txt - - words.txt -""" - -import sys -from pathlib import Path -from typing import Dict, List - -import k2 -import torch -from prepare_lang import ( - Lexicon, - add_disambig_symbols, - add_self_loops, - write_lexicon, - write_mapping, -) - - -def lexicon_to_fst_no_sil( - lexicon: Lexicon, - token2id: Dict[str, int], - word2id: Dict[str, int], - need_self_loops: bool = False, -) -> k2.Fsa: - """Convert a lexicon to an FST (in k2 format). - - Args: - lexicon: - The input lexicon. See also :func:`read_lexicon` - token2id: - A dict mapping tokens to IDs. - word2id: - A dict mapping words to IDs. - need_self_loops: - If True, add self-loop to states with non-epsilon output symbols - on at least one arc out of the state. The input label for this - self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. - Returns: - Return an instance of `k2.Fsa` representing the given lexicon. - """ - loop_state = 0 # words enter and leave from here - next_state = 1 # the next un-allocated state, will be incremented as we go - - arcs = [] - - # The blank symbol is defined in local/train_bpe_model.py - assert token2id[""] == 0 - assert word2id[""] == 0 - - eps = 0 - - for word, pieces in lexicon: - assert len(pieces) > 0, f"{word} has no pronunciations" - cur_state = loop_state - - word = word2id[word] - pieces = [ - token2id[i] if i in token2id else token2id[""] for i in pieces - ] - - for i in range(len(pieces) - 1): - w = word if i == 0 else eps - arcs.append([cur_state, next_state, pieces[i], w, 0]) - - cur_state = next_state - next_state += 1 - - # now for the last piece of this word - i = len(pieces) - 1 - w = word if i == 0 else eps - arcs.append([cur_state, loop_state, pieces[i], w, 0]) - - if need_self_loops: - disambig_token = token2id["#0"] - disambig_word = word2id["#0"] - arcs = add_self_loops( - arcs, - disambig_token=disambig_token, - disambig_word=disambig_word, - ) - - final_state = next_state - arcs.append([loop_state, final_state, -1, -1, 0]) - arcs.append([final_state]) - - arcs = sorted(arcs, key=lambda arc: arc[0]) - arcs = [[str(i) for i in arc] for arc in arcs] - arcs = [" ".join(arc) for arc in arcs] - arcs = "\n".join(arcs) - - fsa = k2.Fsa.from_str(arcs, acceptor=False) - return fsa - - -def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool: - """Check if all the given tokens are in token symbol table. - - Args: - token_sym_table: - Token symbol table that contains all the valid tokens. - tokens: - A list of tokens. - Returns: - Return True if there is any token not in the token_sym_table, - otherwise False. - """ - for tok in tokens: - if tok not in token_sym_table: - return True - return False - - -def generate_lexicon( - token_sym_table: Dict[str, int], words: List[str] -) -> Lexicon: - """Generate a lexicon from a word list and token_sym_table. - - Args: - token_sym_table: - Token symbol table that mapping token to token ids. - words: - A list of strings representing words. - Returns: - Return a dict whose keys are words and values are the corresponding - tokens. 
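    Example (illustrative): as implemented below, the return value is a list of
    (word, token-list) pairs, e.g.

        generate_lexicon({'你': 2, '好': 3}, ['你好'])  # -> [('你好', ['你', '好'])]

    with a single OOV entry appended at the end.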
- """ - lexicon = [] - for word in words: - chars = list(word.strip(" \t")) - if contain_oov(token_sym_table, chars): - continue - lexicon.append((word, chars)) - - # The OOV word is - lexicon.append(("", [""])) - return lexicon - - -def generate_tokens(text_file: str) -> Dict[str, int]: - """Generate tokens from the given text file. - - Args: - text_file: - A file that contains text lines to generate tokens. - Returns: - Return a dict whose keys are tokens and values are token ids ranged - from 0 to len(keys) - 1. - """ - token2id: Dict[str, int] = dict() - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - char, index = line.replace('\n', '').split() - assert char not in token2id - token2id[char] = int(index) - assert token2id[''] == 0 - return token2id - - -def generate_words(text_file: str) -> Dict[str, int]: - """Generate words from the given text file. - - Args: - text_file: - A file that contains text lines to generate words. - Returns: - Return a dict whose keys are words and values are words ids ranged - from 0 to len(keys) - 1. - """ - words = [] - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - word = line.replace('\n', '') - assert word not in words - words.append(word) - words.sort() - - # We put '' '' at begining of word2id - # '#0', '', '' at end of word2id - words = [word for word in words - if word not in ['', '', '#0', '', '']] - words.insert(0, '') - words.insert(1, '') - words.append('#0') - words.append('') - words.append('') - word2id = {j: i for i, j in enumerate(words)} - return word2id - - -def main(): - token2id = generate_tokens(sys.argv[1]) - word2id = generate_words(sys.argv[2]) - tgt_dir = Path(sys.argv[3]) - - words = [word for word in word2id.keys() - if word not in - ["", "!SIL", "", "", "#0", "", ""]] - lexicon = generate_lexicon(token2id, words) - - lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) - next_token_id = max(token2id.values()) + 1 - for i in range(max_disambig + 1): - disambig = f"#{i}" - assert disambig not in token2id - token2id[disambig] = next_token_id - next_token_id += 1 - - write_mapping(tgt_dir / "tokens.txt", token2id) - write_mapping(tgt_dir / "words.txt", word2id) - write_lexicon(tgt_dir / "lexicon.txt", lexicon) - write_lexicon(tgt_dir / "lexicon_disambig.txt", lexicon_disambig) - - L = lexicon_to_fst_no_sil( - lexicon, - token2id=token2id, - word2id=word2id, - ) - L_disambig = lexicon_to_fst_no_sil( - lexicon_disambig, - token2id=token2id, - word2id=word2id, - need_self_loops=True, - ) - torch.save(L.as_dict(), tgt_dir / "L.pt") - torch.save(L_disambig.as_dict(), tgt_dir / "L_disambig.pt") - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/latency_metrics.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/latency_metrics.py deleted file mode 100644 index df2d8eee45f8e2d7c8536f208d44fafaeac3341f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/latency_metrics.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2022 Horizon Inc. (author: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import argparse -import logging -import librosa -import torch -import torchaudio -import yaml - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.font_manager as fm -import torchaudio.compliance.kaldi as kaldi - -from wenet.utils.init_model import init_model -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.mask import make_pad_mask -from wenet.utils.common import replace_duplicates_with_blank - - -def get_args(): - parser = argparse.ArgumentParser( - description='Analyze latency and plot CTC-Spike.') - parser.add_argument('--config', required=True, - type=str, help='configration') - parser.add_argument('--gpu', - type=int, - default=0, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--ckpt', required=True, - type=str, help='model checkpoint') - parser.add_argument('--tag', required=True, - type=str, help='image subtitle') - parser.add_argument('--wavscp', required=True, - type=str, help='wav.scp') - parser.add_argument('--alignment', required=True, - type=str, help='force alignment, generated by Kaldi.') - parser.add_argument('--chunk_size', required=True, - type=int, help='chunk size') - parser.add_argument('--left_chunks', default=-1, - type=int, help='left chunks') - parser.add_argument('--font', required=True, - type=str, help='font file') - parser.add_argument('--dict', required=True, - type=str, help='dict file') - parser.add_argument('--result_dir', required=True, - type=str, help='saving pdf') - parser.add_argument('--model_type', default='ctc', - choices=['ctc', 'transducer'], - help='show latency metrics from ctc models or rnn-t models') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - torch.manual_seed(777) - - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - symbol_table = read_symbol_table(args.dict) - char_dict = {v: k for k, v in symbol_table.items()} - - # 1. Load model - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - - model = init_model(conf) - load_checkpoint(model, args.ckpt) - model = model.eval().to(device) - - subsampling = model.encoder.embed.subsampling_rate - eos = model.eos_symbol() - - with open(args.wavscp, 'r') as fin: - wavs = fin.readlines() - - # 2. 
Forward model (get streaming_timestamps) - timestamps = {} - for idx, wav in enumerate(wavs): - if idx % 100 == 0: - logging.info("processed {}.".format(idx)) - key, wav = wav.strip().split(' ', 1) - waveform, sr = torchaudio.load(wav) - resample_rate = conf['dataset_conf']['resample_conf']['resample_rate'] - waveform = torchaudio.transforms.Resample( - orig_freq=sr, new_freq=resample_rate)(waveform) - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank( - waveform, - num_mel_bins=conf['dataset_conf']['fbank_conf']['num_mel_bins'], - frame_length=conf['dataset_conf']['fbank_conf']['frame_length'], - frame_shift=conf['dataset_conf']['fbank_conf']['frame_shift'], - dither=0.0, energy_floor=0.0, - sample_frequency=resample_rate, - ) - - speech = mat.unsqueeze(0).to(device) - speech_lengths = torch.tensor([mat.size(0)]).to(device) - - # Let's assume batch_size = 1 - encoder_out, encoder_mask = model.encoder( - speech, speech_lengths, args.chunk_size, args.left_chunks) - - maxlen = encoder_out.size(1) # (B, maxlen, encoder_dim) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # CTC greedy search - if args.model_type == 'ctc': - ctc_probs = model.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - topk_prob = topk_prob.view(1, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen) - topk_prob = topk_prob.masked_fill_(mask, 0.0) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - hyps = [replace_duplicates_with_blank(hyp) for hyp in hyps] - scores = [prob.tolist() for prob in topk_prob] - timestamps[key] = [hyps[0], scores[0], wav] - - if args.model_type == 'transducer': - hyps = [] - scores = [] - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = 1 - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, - padding, cache) - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - scores.append(torch.max(joint_out_probs).item()) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or \ - per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - hyps.append(model.blank) - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - timestamps[key] = [hyps, scores, wav] - - # 3. 
Analyze latency - with open(args.alignment, 'r') as fin: - aligns = fin.readlines() - not_found, len_unequal, ignored = 0, 0, 0 - datas = [] - for align in aligns: - key, align = align.strip().split(' ', 1) - if key not in timestamps: - not_found += 1 - continue - fa, st = [], [] # force_alignment, streaming_timestamps - text_fa, text_st = "", "" - for i, token in enumerate(align.split()): - if token != '': - text_fa += token - # NOTE(xcsong): W/O subsample - fa.append(i * 10) - # ignore alignment_errors >= 70ms - frames_fa = len(align.split()) - frames_st = len(timestamps[key][0]) * subsampling - if abs(frames_st - frames_fa) >= 7: - ignored += 1 - continue - for i, token_id in enumerate(timestamps[key][0]): - if token_id != 0: - text_st += char_dict[token_id] - # NOTE(xcsong): W subsample - st.append(i * subsampling * 10) - if len(fa) != len(st): - len_unequal += 1 - continue - # datas[i] = [key, text_fa, text_st, list_of_diff, - # FirstTokenDelay, LastTokenDelay, AvgTokenDelay, - # streaming_timestamps, force_alignment] - datas.append([key, text_fa, text_st, - [a - b for a, b in zip(st, fa)], - st[0] - fa[0], st[-1] - fa[-1], - (sum(st) - sum(fa)) / len(st), - timestamps[key], align.split()]) - - logging.info("not found: {}, length unequal: {}, ignored: {}, \ - valid samples: {}".format(not_found, len_unequal, ignored, len(datas))) - - # 4. Plot and print - num_datas = len(datas) - names = ['FirstTokenDelay', 'LastTokenDelay', 'AvgTokenDelay'] - names_index = [4, 5, 6] - parts = ['max', 'P90', 'P75', 'P50', 'P25', 'min'] - parts_index = [num_datas - 1, int(num_datas * 0.90), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - for name, name_idx in zip(names, names_index): - def f(name_idx=name_idx): - return name_idx - datas.sort(key=lambda x: x[f()]) - logging.info("==========================") - for p, i in zip(parts, parts_index): - data = datas[i] - # i.e., LastTokenDelay P90: 270.000 ms (wav_id: BAC009S0902W0144) - logging.info("{} {}: {:.3f} ms (wav_id: {})".format( - name, p, data[f()], datas[i][0])) - - font = fm.FontProperties(fname=args.font) - plt.rcParams['axes.unicode_minus'] = False - # we will have 2 sub-plots (force-align + streaming timestamps) - # plus one wav-plot - fig, axes = plt.subplots(figsize=(60, 60), nrows=3, ncols=1) - for j in range(2): - if j == 0: - # subplot-0: streaming_timestamps - plt_prefix = args.tag + "_" + name + "_" + p - x = np.arange(len(data[7][0])) * subsampling - hyps, scores = data[7][0], data[7][1] - else: - # subplot-1: force_alignments - plt_prefix = "force_alignment" - x = np.arange(len(data[8])) - hyps = [symbol_table[d] for d in data[8]] - scores = [0.0] * len(data[8]) - axes[j].set_title(plt_prefix, fontsize=30) - for frame, token, prob in zip(x, hyps, scores): - if char_dict[token] != '': - axes[j].bar( - frame, np.exp(prob), - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].text( - frame, np.exp(prob), - '{} {:.3f} {}'.format( - char_dict[token], np.exp(prob), frame), - fontdict=dict(fontsize=24), - fontproperties=font, - ) - else: - axes[j].bar( - frame, 0.01, - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].tick_params(labelsize=25) - - # subplot-2: wav - # wav, hardcode sample_rate to 16000 - samples, sr = librosa.load(data[7][2], sr=16000) - time = np.arange(0, len(samples)) * (1.0 / sr) - axes[-1].plot(time, samples) - - # i.e., RESULT_DIR/LTD_P90_120ms_BAC009S0768W0342.pdf - plt.savefig(args.result_dir + "/" + name + "_" + - p + "_" + str(data[f()]) 
+ "ms" + "_" + data[0] + ".pdf") - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/make_raw_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/make_raw_list.py deleted file mode 100644 index 2f84f015542bb38da027b8ea61e8638f873cec33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/make_raw_list.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('output_file', help='output list file') - args = parser.parse_args() - - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - if args.segments is not None: - segments_table = {} - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - with open(args.text_file, 'r', encoding='utf8') as fin, \ - open(args.output_file, 'w', encoding='utf8') as fout: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if args.segments is None: - assert key in wav_table - wav = wav_table[key] - line = dict(key=key, wav=wav, txt=txt) - else: - assert key in segments_table - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - line = dict(key=key, wav=wav, txt=txt, start=start, end=end) - json_line = json.dumps(line, ensure_ascii=False) - fout.write(json_line + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/make_shard_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/make_shard_list.py deleted file mode 100644 index 1f7d82829808c9cc181bbc5e0f60cccef8795bae..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/make_shard_list.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import io -import logging -import os -import tarfile -import time -import multiprocessing - -import torch -import torchaudio -import torchaudio.backend.sox_io_backend as sox - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def write_tar_file(data_list, - no_segments, - tar_file, - resample=16000, - index=0, - total=1): - logging.info('Processing {} {}/{}'.format(tar_file, index, total)) - read_time = 0.0 - save_time = 0.0 - write_time = 0.0 - with tarfile.open(tar_file, "w") as tar: - prev_wav = None - for item in data_list: - if no_segments: - key, txt, wav = item - else: - key, txt, wav, start, end = item - - suffix = wav.split('.')[-1] - assert suffix in AUDIO_FORMAT_SETS - if no_segments: - ts = time.time() - with open(wav, 'rb') as fin: - data = fin.read() - read_time += (time.time() - ts) - else: - if wav != prev_wav: - ts = time.time() - waveforms, sample_rate = sox.load(wav, normalize=False) - read_time += (time.time() - ts) - prev_wav = wav - start = int(start * sample_rate) - end = int(end * sample_rate) - audio = waveforms[:1, start:end] - - # resample - if sample_rate != resample: - if not audio.is_floating_point(): - # normalize the audio before resample - # because resample can't process int audio - audio = audio / (1 << 15) - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - audio = (audio * (1 << 15)).short() - else: - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - - ts = time.time() - f = io.BytesIO() - sox.save(f, audio, resample, format="wav", bits_per_sample=16) - # Save to wav for segments file - suffix = "wav" - f.seek(0) - data = f.read() - save_time += (time.time() - ts) - - assert isinstance(txt, str) - ts = time.time() - txt_file = key + '.txt' - txt = txt.encode('utf8') - txt_data = io.BytesIO(txt) - txt_info = tarfile.TarInfo(txt_file) - txt_info.size = len(txt) - tar.addfile(txt_info, txt_data) - - wav_file = key + '.' 
+ suffix - wav_data = io.BytesIO(data) - wav_info = tarfile.TarInfo(wav_file) - wav_info.size = len(data) - tar.addfile(wav_info, wav_data) - write_time += (time.time() - ts) - logging.info('read {} save {} write {}'.format(read_time, save_time, - write_time)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--num_utts_per_shard', - type=int, - default=1000, - help='num utts per shard') - parser.add_argument('--num_threads', - type=int, - default=1, - help='num threads for make shards') - parser.add_argument('--prefix', - default='shards', - help='prefix of shards tar file') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('--resample', - type=int, - default=16000, - help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('shards_dir', help='output shards dir') - parser.add_argument('shards_list', help='output shards list file') - args = parser.parse_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - torch.set_num_threads(1) - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - no_segments = True - segments_table = {} - if args.segments is not None: - no_segments = False - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - data = [] - with open(args.text_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if no_segments: - assert key in wav_table - wav = wav_table[key] - data.append((key, txt, wav)) - else: - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - data.append((key, txt, wav, start, end)) - - num = args.num_utts_per_shard - chunks = [data[i:i + num] for i in range(0, len(data), num)] - os.makedirs(args.shards_dir, exist_ok=True) - - # Using thread pool to speedup - pool = multiprocessing.Pool(processes=args.num_threads) - shards_list = [] - tasks_list = [] - num_chunks = len(chunks) - for i, chunk in enumerate(chunks): - tar_file = os.path.join(args.shards_dir, - '{}_{:09d}.tar'.format(args.prefix, i)) - shards_list.append(tar_file) - pool.apply_async( - write_tar_file, - (chunk, no_segments, tar_file, args.resample, i, num_chunks)) - - pool.close() - pool.join() - - with open(args.shards_list, 'w', encoding='utf8') as fout: - for name in shards_list: - fout.write(name + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/merge_scp2txt.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/merge_scp2txt.py deleted file mode 100644 index 51f1c42f272f0fd9fec0a7d69ee860d2f1eb6158..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/merge_scp2txt.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -from distutils.util import strtobool -from io import open -import logging -import sys - -PY2 = sys.version_info[0] == 2 -sys.stdin = codecs.getreader('utf-8')(sys.stdin if PY2 else 
sys.stdin.buffer) -sys.stdout = codecs.getwriter('utf-8')( - sys.stdout if PY2 else sys.stdout.buffer) - - -# Special types: -def shape(x): - """Change str to List[int] - - >>> shape('3,5') - [3, 5] - >>> shape(' [3, 5] ') - [3, 5] - - """ - - # x: ' [3, 5] ' -> '3, 5' - x = x.strip() - if x[0] == '[': - x = x[1:] - if x[-1] == ']': - x = x[:-1] - - return list(map(int, x.split(','))) - - -def get_parser(): - parser = argparse.ArgumentParser( - description='Given each file paths with such format as ' - '::. type> can be omitted and the default ' - 'is "str". e.g. {} ' - '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' - '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' - '--output-scps text:data/text shape:data/utt2text_shape:shape ' - '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--input-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the inputs') - parser.add_argument('--output-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the outputs') - parser.add_argument('--scps', - type=str, - nargs='+', - default=[], - help='The files except for the input and outputs') - parser.add_argument('--verbose', - '-V', - default=1, - type=int, - help='Verbose option') - parser.add_argument('--allow-one-column', - type=strtobool, - default=False, - help='Allow one column in input scp files. ' - 'In this case, the value will be empty string.') - parser.add_argument('--out', - '-O', - type=str, - help='The output filename. ' - 'If omitted, then output to sys.stdout') - return parser - - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - args.scps = [args.scps] - - # logging info - logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - if args.verbose > 0: - logging.basicConfig(level=logging.INFO, format=logfmt) - else: - logging.basicConfig(level=logging.WARN, format=logfmt) - - inputs = {} - assert (len(args.input_scps) == 1) - for f in args.input_scps[0]: - arr = f.strip().split(':') - inputs[arr[0]] = arr[1] - assert ('feat' in inputs) - assert ('shape' in inputs) - - outputs = {} - assert (len(args.output_scps) == 1) - for f in args.output_scps[0]: - arr = f.strip().split(':') - outputs[arr[0]] = arr[1] - assert ('shape' in outputs) - assert ('text' in outputs) - assert ('token' in outputs) - assert ('tokenid' in outputs) - - files = [ - inputs['feat'], inputs['shape'], outputs['text'], outputs['token'], - outputs['tokenid'], outputs['shape'] - ] - fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape'] - fids = [open(f, 'r', encoding='utf-8') for f in files] - - if args.out is None: - out = sys.stdout - else: - out = open(args.out, 'w', encoding='utf-8') - done = False - while not done: - for i, fid in enumerate(fids): - line = fid.readline() - if line == '': - done = True - break - arr = line.strip().split() - content = ' '.join(arr[1:]) - if i == 0: - out.write('utt:{}'.format(arr[0])) - out.write('\t') - out.write('{}:{}'.format(fields[i], content)) - out.write('\n') - - for f in fids: - f.close() - if args.out is not None: - out.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/onnx2horizonbin.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/onnx2horizonbin.py deleted file mode 100644 index 
a94b647fb19d1446d4bc506c399c85677dddde9f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/onnx2horizonbin.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. specific decoding method: ctc_greedy_search -""" - -import argparse -import copy -import logging -import os -import sys -import random -import torch -import yaml -import numpy as np - -from torch.utils.data import DataLoader - -from wenet.utils.common import remove_duplicates_and_blank -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import to_numpy -from wenet.bin.export_onnx_bpu import export_encoder, export_ctc - - -try: - import hbdk # noqa: F401 - import horizon_nn # noqa: F401 - from horizon_tc_ui import HB_ONNXRuntime -except ImportError: - print('Please install hbdk,horizon_nn,horizon_tc_ui !') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def save_data(tensor, dirs, prefix): - if tensor.requires_grad: - data = tensor.detach().numpy().astype(np.float32) - else: - data = tensor.numpy().astype(np.float32) - os.makedirs(dirs, exist_ok=True) - data.tofile(dirs + "/" + prefix + ".bin") - - -def make_calibration_data(enc, args, conf): - conf['shuffle'] = True - logger.info(conf) - dataset = Dataset( - "shard", args.cali_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - cal_data_dir = os.path.join(args.output_dir, 
'cal_data_dir') - for batch_idx, batch in enumerate(dataloader): - if batch_idx >= args.max_samples: - break - if batch_idx % 100 == 0: - logger.info("processed {} samples.".format(batch_idx)) - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - - # Feed forward overlap input step by step - random_high = (num_frames - context) // stride - num_rand = random.randint(0, random_high) - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - if i == num_rand: - save_data(chunk, "{}/chunk".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_cache, "{}/att_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(cnn_cache, "{}/cnn_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_mask, "{}/att_mask".format(cal_data_dir), - prefix + "." + str(i)) - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - # NOTE(xcsong): It's fast to calibrate ctc.onnx, - # so it's okay to save all chunks - save_data(y, "{}/hidden".format(cal_data_dir), - prefix + "." 
+ str(i)) - - -def check_wer(enc, ctc, args, conf): - conf['shuffle'] = False - dataset = Dataset( - "shard", args.wer_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - char_dict = {v: k for k, v in args.symbol_table.items()} - eos = len(char_dict) - 1 - - enc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_encoder/encoder_quantized_model.onnx") - ctc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_ctc/ctc_quantized_model.onnx") - torch_file = open(args.output_dir + "/torch_text", 'w', encoding="utf-8") - onnx_file = open(args.output_dir + "/onnx_text", 'w', encoding="utf-8") - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - for batch_idx, batch in enumerate(dataloader): - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - - # Feed forward overlap input step by step - torch_out, onnx_out = [], [] - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - # Torch model - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - torch_out.append(ctc.forward(y).transpose(1, 3).squeeze(2)) - # Quantized onnx model - ort_inputs = { - 'chunk': to_numpy(chunk), 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': to_numpy(att_mask)} - ort_outs = enc_session.run_feature( - enc_session.output_names, ort_inputs, input_offset=0) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_y = ctc_session.run_feature( - ctc_session.output_names, {'hidden': ort_outs[0]}, input_offset=0) - onnx_out.append(torch.from_numpy( - np.squeeze(onnx_y[0].transpose(0, 3, 2, 1), axis=2))) - - def post_process(list_out, file_obj, keys): - probs = torch.cat(list_out, dim=1) - maxlen = probs.size(1) - topk_prob, topk_index = probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - hyps = 
[hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - for i, key in enumerate(keys): - content = '' - for w in hyps[i]: - if w == eos: - break - content += char_dict[w] - file_obj.write('{} {}\n'.format(key, content)) - return key, content - - if len(torch_out) > 0 and len(onnx_out) > 0: - key, content = post_process(torch_out, torch_file, keys) - logger.info('torch: {} {}'.format(key, content)) - key, content = post_process(onnx_out, onnx_file, keys) - logger.info('onnx : {} {}'.format(key, content)) - torch_file.close() - onnx_file.close() - - -def generate_config(enc_session, ctc_session, args): - template = """ -# 模型参数组 -model_parameters: - # 原始Onnx浮点模型文件 - onnx_model: '{}' - # 转换的目标AI芯片架构 - march: 'bernoulli2' - # 模型转换输出的用于上板执行的模型文件的名称前缀 - output_model_file_prefix: '{}' - # 模型转换输出的结果的存放目录 - working_dir: '{}' - # 指定转换后混合异构模型是否保留输出各层的中间结果的能力 - layer_out_dump: False - # 转换过程中日志生成级别 - log_level: 'debug' -# 输入信息参数组 -input_parameters: - # 原始浮点模型的输入节点名称 - input_name: '{}' - # 原始浮点模型的输入数据格式(数量/顺序与input_name一致) - input_type_train: '{}' - # 原始浮点模型的输入数据排布(数量/顺序与input_name一致) - input_layout_train: '{}' - # 原始浮点模型的输入数据尺寸 - input_shape: '{}' - # 网络实际执行时,输入给网络的batch_size 默认值为1 - # input_batch: 1 - # 在模型中添加的输入数据预处理方法 - norm_type: '{}' - # 预处理方法的图像减去的均值; 如果是通道均值,value之间必须用空格分隔 - # mean_value: '' - # 预处理方法的图像缩放比例,如果是通道缩放比例,value之间必须用空格分隔 - # scale_value: '' - # 转换后混合异构模型需要适配的输入数据格式(数量/顺序与input_name一致) - input_type_rt: '{}' - # 输入数据格式的特殊制式 - input_space_and_range: '' - # 转换后混合异构模型需要适配的输入数据排布(数量/顺序与input_name一致) - input_layout_rt: '{}' -# 校准参数组 -calibration_parameters: - # 模型校准使用的标定样本的存放目录 - cal_data_dir: '{}' - # 开启图片校准样本自动处理(skimage read resize到输入节点尺寸) - preprocess_on: False - # 校准使用的算法类型 - calibration_type: '{}' - # max 校准方式的参数 - max_percentile: 1.0 - # 强制指定OP在CPU上运行 - run_on_cpu: '{}' - # 强制指定OP在BPU上运行 - run_on_bpu: '{}' -# 编译参数组 -compiler_parameters: - # 编译策略选择 - compile_mode: 'latency' - # 是否打开编译的debug信息 - debug: False - # 模型运行核心数 - core_num: 1 - # 模型编译的优化等级选择 - optimize_level: 'O3' -""" - output_dir = os.path.realpath(args.output_dir) - cal_data_dir = os.path.join(output_dir, 'cal_data_dir') - os.makedirs(cal_data_dir, exist_ok=True) - enc_dic = enc_session.get_modelmeta().custom_metadata_map - enc_onnx_path = os.path.join(output_dir, 'encoder.onnx') - enc_log_path = os.path.join(output_dir, 'hb_makertbin_output_encoder') - enc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in enc_dic['input_name'].split(';')]) - ctc_dic = ctc_session.get_modelmeta().custom_metadata_map - ctc_onnx_path = os.path.join(output_dir, 'ctc.onnx') - ctc_log_path = os.path.join(output_dir, 'hb_makertbin_output_ctc') - ctc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in ctc_dic['input_name'].split(';')]) - enc_config = template.format( - enc_onnx_path, "encoder", enc_log_path, - enc_dic['input_name'], enc_dic['input_type'], - enc_dic['input_layout_train'], enc_dic['input_shape'], - enc_dic['norm_type'], enc_dic['input_type'], enc_dic['input_layout_rt'], - enc_cal_data, args.calibration_type, args.extra_ops_run_on_cpu, "") - ctc_config = template.format( - ctc_onnx_path, "ctc", ctc_log_path, - ctc_dic['input_name'], ctc_dic['input_type'], - ctc_dic['input_layout_train'], ctc_dic['input_shape'], - ctc_dic['norm_type'], ctc_dic['input_type'], ctc_dic['input_layout_rt'], - ctc_cal_data, "default", "", "") - with open(output_dir + "/config_encoder.yaml", "w") as enc_yaml: - enc_yaml.write(enc_config) - with open(output_dir + 
"/config_ctc.yaml", "w") as ctc_yaml: - ctc_yaml.write(ctc_config) - - -def get_args(): - parser = argparse.ArgumentParser(description='convert onnx to horizon .bin') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - parser.add_argument('--dict', type=str, required=True, help='dict file') - parser.add_argument('--max_samples', type=int, required=True, - help='maximum samples') - parser.add_argument('--cali_datalist', type=str, default=None, - help='make calibration data') - parser.add_argument('--wer_datalist', type=str, default=None, - help='check wer') - parser.add_argument('--wer_text', type=str, default=None, - help='check wer') - parser.add_argument('--bpe_model', default=None, type=str, - help='bpe model for english part') - parser.add_argument('--ln_run_on_bpu', action='store_true', - help='layernorm running on bpu') - parser.add_argument('--extra_ops_run_on_cpu', type=str, default=None, - help='extra operations running on cpu.') - parser.add_argument('--calibration_type', type=str, default='default', - help='kl / max / default.') - return parser - - -if __name__ == '__main__': - random.seed(777) - parser = get_args() - args = parser.parse_args() - # NOTE(xcsong): X3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(conf) - load_checkpoint(model, args.checkpoint) - model.eval() - - symbol_table = read_symbol_table(args.dict) - args.symbol_table = symbol_table - args.feature_size = conf['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - logger.info("Stage-1: Export onnx") - enc, enc_session = export_encoder(model, args) - ctc, ctc_session = export_ctc(model, args) - - conf = copy.deepcopy(conf['dataset_conf']) - conf['filter_conf']['max_length'] = 102400 - conf['filter_conf']['min_length'] = 0 - conf['filter_conf']['token_max_length'] = 102400 - conf['filter_conf']['token_min_length'] = 0 - conf['filter_conf']['max_output_input_ratio'] = 102400 - conf['filter_conf']['min_output_input_ratio'] = 0 - conf['speed_perturb'] = False - conf['spec_aug'] = False - conf['spec_sub'] = False - conf['spec_trim'] = False - conf['shuffle'] = False - conf['sort'] = False - if 'fbank_conf' in conf: - conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in conf: - conf['mfcc_conf']['dither'] = 0.0 - conf['batch_conf']['batch_type'] = "static" - conf['batch_conf']['batch_size'] = 1 - - if args.cali_datalist is not None: - logger.info("Stage-2: Generate config") - # FIXME(xcsong): Remove hard code - logger.info("torch version: {}".format(torch.__version__)) - if int(torch.__version__[:4].split('.')[1]) >= 13: - args.extra_ops_run_on_cpu = "/Split;" + \ - "/encoders.0/self_attn/Split;/encoders.1/self_attn/Split;" + \ - 
"/encoders.2/self_attn/Split;/encoders.3/self_attn/Split;" + \ - "/encoders.4/self_attn/Split;/encoders.5/self_attn/Split;" + \ - "/encoders.6/self_attn/Split;/encoders.7/self_attn/Split;" + \ - "/encoders.8/self_attn/Split;/encoders.9/self_attn/Split;" + \ - "/encoders.10/self_attn/Split;/encoders.11/self_attn/Split;" + \ - "/encoders.0/self_attn/Mul;/encoders.1/self_attn/Mul;" + \ - "/encoders.2/self_attn/Mul;/encoders.3/self_attn/Mul;" + \ - "/encoders.4/self_attn/Mul;/encoders.5/self_attn/Mul;" + \ - "/encoders.6/self_attn/Mul;/encoders.7/self_attn/Mul;" + \ - "/encoders.8/self_attn/Mul;/encoders.9/self_attn/Mul;" + \ - "/encoders.10/self_attn/Mul;/encoders.11/self_attn/Mul;" - else: - args.extra_ops_run_on_cpu = "Split_17;Split_67;Split_209;" + \ - "Split_351;Split_493;Split_635;Split_777;Split_919;Split_1061;" + \ - "Split_1203;Split_1345;Split_1487;Split_1629;" + \ - "Mul_72;Mul_214;Mul_356;Mul_498;Mul_640;Mul_782;" + \ - "Mul_924;Mul_1066;Mul_1208;Mul_1350;Mul_1492;Mul_1634;" - generate_config(enc_session, ctc_session, args) - - logger.info("Stage-3: Make calibration data") - make_calibration_data(enc, args, conf) - - output_dir = os.path.realpath(args.output_dir) - logger.info("Stage-4: Make ctc.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_ctc".format(output_dir) + - " && cd hb_makertbin_log_ctc &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_ctc.yaml") - ) - logger.info("Stage-5: Make encoder.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_encoder ".format(output_dir) + - " && cd hb_makertbin_log_encoder &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_encoder.yaml") - ) - - if args.wer_datalist is not None: - logger.info("Stage-6: Check wer between torch model and quantized onnx") - assert args.wer_text is not None - check_wer(enc, ctc, args, conf) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/torch_text", - args.output_dir + "/torch_wer") - ) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/onnx_text", - args.output_dir + "/onnx_wer") - ) - os.system("tail {} {}".format( - args.output_dir + "/torch_wer", args.output_dir + "/onnx_wer")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/parse_options.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/parse_options.sh deleted file mode 100644 index 34476fdb37a4b14d5fe6e0edbebe97e760d2be5a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- - -# Parse command-line options. -# To be sourced by another script (as in ". parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/perturb_data_dir_speed.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/perturb_data_dir_speed.sh deleted file mode 100644 index 901a4882e6481ae269067b0fe7175dba62c4db9e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/perturb_data_dir_speed.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# 2020 @kamo-naoyuki -# This file was copied from Kaldi and -# I deleted parts related to wav duration -# because we shouldn't use kaldi's command here -# and we don't need the files actually. 
-
-# Copyright 2013 Johns Hopkins University (author: Daniel Povey)
-# 2014 Tom Ko
-# 2018 Emotech LTD (author: Pawel Swietojanski)
-# Apache 2.0
-
-# This script operates on a directory, such as in data/train/,
-# that contains some subset of the following files:
-# wav.scp
-# spk2utt
-# utt2spk
-# text
-#
-# It generates the files which are used for perturbing the speed of the original data.
-
-export LC_ALL=C
-set -euo pipefail
-
-if [[ $# != 3 ]]; then
- echo "Usage: perturb_data_dir_speed.sh <warping-factor> <srcdir> <destdir>"
- echo "e.g.:"
- echo " $0 0.9 data/train_si284 data/train_si284p"
- exit 1
-fi
-
-factor=$1
-srcdir=$2
-destdir=$3
-label="sp"
-spk_prefix="${label}${factor}-"
-utt_prefix="${label}${factor}-"
-
-#check is sox on the path
-
-! command -v sox &>/dev/null && echo "sox: command not found" && exit 1;
-
-if [[ ! -f ${srcdir}/utt2spk ]]; then
- echo "$0: no such file ${srcdir}/utt2spk"
- exit 1;
-fi
-
-if [[ ${destdir} == "${srcdir}" ]]; then
- echo "$0: this script requires <srcdir> and <destdir> to be different."
- exit 1
-fi
-
-mkdir -p "${destdir}"
-
-<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map"
-<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map"
-<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map"
-if [[ ! -f ${srcdir}/utt2uniq ]]; then
- <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq"
-else
- <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq"
-fi
-
-
-<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \
- utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk
-
-utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt
-
-if [[ -f ${srcdir}/segments ]]; then
-
- utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \
- utils/apply_map.pl -f 2 "${destdir}"/reco_map | \
- awk -v factor="${factor}" \
- '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \
- >"${destdir}"/segments
-
- utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \
- # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename"
- awk -v factor="${factor}" \
- '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"}
- else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" }
- else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \
- > "${destdir}"/wav.scp
- if [[ -f ${srcdir}/reco2file_and_channel ]]; then
- utils/apply_map.pl -f 1 "${destdir}"/reco_map \
- <"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel
- fi
-
-else # no segments->wav indexed by utterance.
- if [[ -f ${srcdir}/wav.scp ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - fi -fi - -if [[ -f ${srcdir}/text ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text -fi -if [[ -f ${srcdir}/spk2gender ]]; then - utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender -fi -if [[ -f ${srcdir}/utt2lang ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang -fi - -rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null -echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}" - -utils/validate_data_dir.sh --no-feats --no-text "${destdir}" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/reduce_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/reduce_data_dir.sh deleted file mode 100644 index 16194dcc7309a646041181a698c53cd4f46e618b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/reduce_data_dir.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# koried, 10/29/2012 - -# Reduce a data set based on a list of turn-ids - -help_message="usage: $0 srcdir turnlist destdir" - -if [ $1 == "--help" ]; then - echo "${help_message}" - exit 0; -fi - -if [ $# != 3 ]; then - echo "${help_message}" - exit 1; -fi - -srcdir=$1 -reclist=$2 -destdir=$3 - -if [ ! -f ${srcdir}/utt2spk ]; then -echo "$0: no such file $srcdir/utt2spk" -exit 1; -fi - -function do_filtering { -# assumes the utt2spk and spk2utt files already exist. - [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp - [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text - [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames - [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender - [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp - if [ -f ${srcdir}/segments ]; then - utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments - awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. 
- [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/reco2file_and_channel ] && \ - utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel - - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm - rm ${destdir}/reco - fi - srcutts=$(wc -l < ${srcdir}/utt2spk) - destutts=$(wc -l < ${destdir}/utt2spk) - echo "Reduced #utt from $srcutts to $destutts" -} - -mkdir -p ${destdir} - -# filter the utt2spk based on the set of recordings -utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk - -utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt -do_filtering; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/remove_longshortdata.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/remove_longshortdata.py deleted file mode 100644 index 7e92f8a424d2d717acf6fc1db5503f79ba38a898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/remove_longshortdata.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='remove too long or too short data in format.data') - parser.add_argument('--data_file', - type=str, - help='input format data') - parser.add_argument('--output_data_file', - type=str, - help='output format data') - parser.add_argument('--min_input_len', type=float, - default=0, - help='minimum input seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--max_input_len', type=float, - default=20, - help='maximum output seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--min_output_len', type=float, - default=0, help='minimum input seq length, in modeling units') - parser.add_argument('--max_output_len', type=float, - default=500, - help='maximum output seq length, in modeling units') - parser.add_argument('--min_output_input_ratio', type=float, default=0.05, - help='minimum output seq length/output seq length ratio') - parser.add_argument('--max_output_input_ratio', type=float, default=10, - help='maximum output seq length/output seq length ratio') - args = parser.parse_args() - - data_file = args.data_file - output_data_file = args.output_data_file - min_input_len = args.min_input_len - max_input_len = args.max_input_len - min_output_len = args.min_output_len - max_output_len = args.max_output_len - min_output_input_ratio = args.min_output_input_ratio - max_output_input_ratio = args.max_output_input_ratio - - with open(data_file, 'r') as f, open(output_data_file, 'w') as fout: - for l in f: - l = l.strip() - if l: - items = l.strip().split('\t') - token_shape = items[6] - feature_shape = items[2] - feat_len = float(feature_shape.split(':')[1].split(',')[0]) - token_len = float(token_shape.split(':')[1].split(',')[0]) - condition = [feat_len > min_input_len, - feat_len < max_input_len, - token_len > min_output_len, - token_len < max_output_len, - token_len / feat_len > min_output_input_ratio, - token_len / feat_len < max_output_input_ratio, - ] - if all(condition): - fout.write('{}\n'.format(l)) - continue diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/segment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/segment.py deleted file mode 100644 index a1a7f93a05fbaf42ca09c26c0e5be6a7185f0d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/segment.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2021 Mobvoi Inc. (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='generate segmented wav.scp') - parser.add_argument('--segments', required=True, help='segments file') - parser.add_argument('--input', - required=True, - help='origin wav.scp that not segmented') - parser.add_argument('--output', - required=True, - help='output segmented wav.scp') - wav_dic = {} - args = parser.parse_args() - ori_wav = args.input - segment_file = args.segments - wav_scp = args.output - with open(ori_wav, 'r') as ori: - for l in ori: - item = l.strip().split() - wav_dic[item[0]] = item[1] - with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement: - for l in sgement: - item = l.strip().split() - if item[1] in wav_dic: - item[1] = wav_dic[item[1]] - f.write("{} {},{},{}\n".format(item[0], item[1], item[2], item[3])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/setup_anaconda.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/setup_anaconda.sh deleted file mode 100644 index f53ace9cc4c19994fc79d01e85d70f49d40d673f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/setup_anaconda.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env bash -# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet) -set -euo pipefail - -if [ -z "${PS1:-}" ]; then - PS1=__dummy__ -fi -CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - -if [ $# -gt 4 ]; then - echo "Usage: $0 [output] [conda-env-name] [python-version>]" - exit 1; -elif [ $# -eq 3 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="$3" -elif [ $# -eq 2 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="" -elif [ $# -eq 1 ]; then - output_dir="$1" - name="" - PYTHON_VERSION="" -elif [ $# -eq 0 ]; then - output_dir=venv - name="" - PYTHON_VERSION="" -fi - -if [ -e activate_python.sh ]; then - echo "Warning: activate_python.sh already exists. It will be overwritten" -fi - -if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then - if [ ! -e miniconda.sh ]; then - wget --tries=3 "${CONDA_URL}" -O miniconda.sh - fi - - bash miniconda.sh -b -p "${output_dir}" -fi - -# shellcheck disable=SC1090 -source "${output_dir}/etc/profile.d/conda.sh" -conda deactivate - -# If the env already exists, skip recreation -if [ -n "${name}" ] && ! 
conda activate ${name}; then - conda create -yn "${name}" -fi -conda activate ${name} - -if [ -n "${PYTHON_VERSION}" ]; then - conda install -y conda "python=${PYTHON_VERSION}" -else - conda install -y conda -fi - -conda install -y pip setuptools - -cat << EOF > activate_python.sh -#!/usr/bin/env bash -# THIS FILE IS GENERATED BY tools/setup_anaconda.sh -if [ -z "\${PS1:-}" ]; then - PS1=__dummy__ -fi -. $(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name} -EOF diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/sph2wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/sph2wav.sh deleted file mode 100644 index a8f0749e3be2ee69b5831da6699c303510ecbed4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/sph2wav.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -# convert sph scp to segmented wav scp -nj=1 -. tools/parse_options.sh || exit 1; - -inscp=$1 -segments=$2 -outscp=$3 -data=$(dirname ${inscp}) -if [ $# -eq 4 ]; then - logdir=$4 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe -[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; -sox=`which sox` -[ ! 
-x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2); - printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \ - sort > $data/wav_ori.scp || exit 1; - -tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp -sed -i 's/ /,/g' $data/wav_segments.scp -sed -i 's/#/ /g' $data/wav_segments.scp - -rm -f $logdir/wav_*.slice -rm -f $logdir/*.log -split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - mkdir -p ${data}/wavs/${name} - cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \ - -v logdir=$logdir -v name=$name '{ - during=$4-$3 - cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during; - system(cmd) - printf("%s %s/%s.wav\n", $1, data, $1); - }' | \ - sort > ${data}/wavs_${name}.scp || exit 1; -} & -done -wait -cat ${data}/wavs_*.scp > $outscp -rm ${data}/wavs_*.scp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/spk2utt_to_utt2spk.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/spk2utt_to_utt2spk.pl deleted file mode 100644 index 19fb89d501146e360912863d847d6eabb0194511..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/spm_decode b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/spm_decode deleted file mode 100644 index 882b4f966013d7708460f8d41696583ae59f8fa9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/spm_decode +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for decoding") - parser.add_argument("--input", default=None, help="input file to decode") - parser.add_argument("--input_format", choices=["piece", "id"], default="piece") - args = parser.parse_args() - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.input_format == "piece": - def decode(l): - return "".join(sp.DecodePieces(l)) - elif args.input_format == "id": - def decode(l): - return "".join(sp.DecodeIds(l)) - else: - raise NotImplementedError - - def tok2int(tok): - # remap reference-side (represented as <>) to 0 - return int(tok) if tok != "<>" else 0 - - if args.input is None: - h = sys.stdin - else: - h = open(args.input, "r", encoding="utf-8") - for line in h: - print(decode(line.split())) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/spm_encode b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/spm_encode deleted file mode 100644 index 4dd2e1004f9fe393c2d34b43bade881b84a31b1f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/spm_encode +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import contextlib -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for encoding") - parser.add_argument("--inputs", nargs="+", default=['-'], - help="input files to filter/encode") - parser.add_argument("--outputs", nargs="+", default=['-'], - help="path to save encoded outputs") - parser.add_argument("--output_format", choices=["piece", "id"], default="piece") - parser.add_argument("--min-len", type=int, metavar="N", - help="filter sentence pairs with fewer than N tokens") - parser.add_argument("--max-len", type=int, metavar="N", - help="filter sentence pairs with more than N tokens") - args = parser.parse_args() - - assert len(args.inputs) == len(args.outputs), \ - "number of input and output paths should match" - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.output_format == "piece": - def encode(l): - return sp.EncodeAsPieces(l) - elif args.output_format == "id": - def encode(l): - return list(map(str, sp.EncodeAsIds(l))) - else: - raise NotImplementedError - - if args.min_len is not None or args.max_len is not None: - def valid(line): - return ( - (args.min_len is None or len(line) >= args.min_len) and - (args.max_len is None or len(line) <= args.max_len) - ) - else: - def valid(lines): - return True - - with contextlib.ExitStack() as stack: - inputs = [ - stack.enter_context(open(input, "r", encoding="utf-8")) - if input != "-" else sys.stdin - for input in args.inputs - ] - outputs = [ - stack.enter_context(open(output, "w", 
encoding="utf-8")) - if output != "-" else sys.stdout - for output in args.outputs - ] - - stats = { - "num_empty": 0, - "num_filtered": 0, - } - - def encode_line(line): - line = line.strip() - if len(line) > 0: - line = encode(line) - if valid(line): - return line - else: - stats["num_filtered"] += 1 - else: - stats["num_empty"] += 1 - return None - - for i, lines in enumerate(zip(*inputs), start=1): - enc_lines = list(map(encode_line, lines)) - if not any(enc_line is None for enc_line in enc_lines): - for enc_line, output_h in zip(enc_lines, outputs): - print(" ".join(enc_line), file=output_h) - if i % 10000 == 0: - print("processed {} lines".format(i), file=sys.stderr) - - print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) - print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/spm_train b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/spm_train deleted file mode 100644 index 0b247aee0dc5fcaa7b6cf66d89602e896619c9bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/spm_train +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE -import sys - -import sentencepiece as spm - - -if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/subset_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/subset_data_dir.sh deleted file mode 100644 index c35bee62d8710facb8c42a9171ed3caf0171450f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/subset_data_dir.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2010-2011 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - - -# This script operates on a data directory, such as in data/train/. -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data -# for what these directories contain. - -# This script creates a subset of that data, consisting of some specified -# number of utterances. (The selected utterances are distributed evenly -# throughout the file, by the program ./subset_scp.pl). - -# There are six options, none compatible with any other. - -# If you give the --per-spk option, it will attempt to select the supplied -# number of utterances for each speaker (typically you would supply a much -# smaller number in this case). - -# If you give the --speakers option, it selects a subset of n randomly -# selected speakers. - -# If you give the --shortest option, it will give you the n shortest utterances. - -# If you give the --first option, it will just give you the n first utterances. - -# If you give the --last option, it will just give you the n last utterances. - -# If you give the --spk-list or --utt-list option, it reads the -# speakers/utterances to keep from /" (note, -# in this case there is no positional parameter; see usage message.) 
-
-
-shortest=false
-perspk=false
-speakers=false
-first_opt=
-spk_list=
-utt_list=
-
-expect_args=3
-case $1 in
- --first|--last) first_opt=$1; shift ;;
- --per-spk) perspk=true; shift ;;
- --shortest) shortest=true; shift ;;
- --speakers) speakers=true; shift ;;
- --spk-list) shift; spk_list=$1; shift; expect_args=2 ;;
- --utt-list) shift; utt_list=$1; shift; expect_args=2 ;;
- --*) echo "$0: invalid option '$1'"; exit 1
-esac
-
-if [ $# != $expect_args ]; then
- echo "Usage:"
- echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] <num-utt> <srcdir> <destdir>"
- echo " subset_data_dir.sh [--spk-list <speaker-list-file>] <srcdir> <destdir>"
- echo " subset_data_dir.sh [--utt-list <utt-list-file>] <srcdir> <destdir>"
- echo "By default, randomly selects <num-utt> utterances from the data directory."
- echo "With --speakers, randomly selects enough speakers that we have <num-utt> utterances"
- echo "With --per-spk, selects <num-utt> utterances per speaker, if available."
- echo "With --first, selects the first <num-utt> utterances"
- echo "With --last, selects the last <num-utt> utterances"
- echo "With --shortest, selects the shortest utterances."
- echo "With --spk-list, reads the speakers to keep from <speaker-list-file>"
- echo "With --utt-list, reads the utterances to keep from <utt-list-file>"
- exit 1;
-fi
-
-srcdir=$1
-if [[ $spk_list || $utt_list ]]; then
- numutt=
- destdir=$2
-else
- numutt=$2
- destdir=$3
-fi
-
-export LC_ALL=C
-
-if [ ! -f $srcdir/utt2spk ]; then
- echo "$0: no such file $srcdir/utt2spk"
- exit 1
-fi
-
-if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then
- echo "$0: cannot subset to more utterances than you originally had."
- exit 1
-fi
-
-if $shortest && [ ! -f $srcdir/feats.scp ]; then
- echo "$0: you selected --shortest but no feats.scp exist."
- exit 1
-fi
-
-mkdir -p $destdir || exit 1
-
-if [[ $spk_list ]]; then
- tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1;
- tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1;
-elif [[ $utt_list ]]; then
- tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1;
- tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1;
-elif $speakers; then
- tools/shuffle_list.pl < $srcdir/spk2utt |
- awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' |
- sort > $destdir/spk2utt
- tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
-elif $perspk; then
- awk '{ n='$numutt'; printf("%s ",$1);
- skip=1; while(n*(skip+1) <= NF-1) { skip++; }
- for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); }
- printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt
- tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
-else
- if $shortest; then
- # Select $numutt shortest utterances.
- . ./path.sh
- feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1;
- sort -n -k2 $destdir/tmp.len |
- awk '{print $1}' |
- head -$numutt >$destdir/tmp.uttlist
- tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk
- rm $destdir/tmp.uttlist $destdir/tmp.len
- else
- # Select $numutt random utterances.
- tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1;
- fi
- tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt
-fi
-
-# Perform filtering. utt2spk and spk2utt files already exist by this point.
-# Filter by utterance.
-[ -f $srcdir/feats.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp -[ -f $srcdir/vad.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp -[ -f $srcdir/utt2lang ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang -[ -f $srcdir/utt2dur ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur -[ -f $srcdir/utt2num_frames ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames -[ -f $srcdir/utt2uniq ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq -[ -f $srcdir/wav.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp -[ -f $srcdir/utt2warp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp -[ -f $srcdir/text ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - -# Filter by speaker. -[ -f $srcdir/spk2warp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp -[ -f $srcdir/spk2gender ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender -[ -f $srcdir/cmvn.scp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - -# Filter by recording-id. -if [ -f $srcdir/segments ]; then - tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - # Recording-ids are in segments. - awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco - # The next line overrides the command above for wav.scp, which would be incorrect. - #[ -f $srcdir/wav.scp ] && - # tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp -else - # No segments; recording-ids are in wav.scp. - awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco -fi - -[ -f $srcdir/reco2file_and_channel ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel -[ -f $srcdir/reco2dur ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur - -# Filter the STM file for proper sclite scoring. -# Copy over the comments from STM file. -[ -f $srcdir/stm ] && - (grep "^;;" $srcdir/stm - tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm - -rm $destdir/reco - -# Copy frame_shift if present. -[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir - -srcutts=$(wc -l <$srcdir/utt2spk) -destutts=$(wc -l <$destdir/utt2spk) -echo "$0: reducing #utt from $srcutts to $destutts" -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/subset_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/subset_scp.pl deleted file mode 100644 index 11fddc09a0f4e5fad8e5d63cf65e7e5e627e4af6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/subset_scp.pl +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This program selects a subset of N elements in the scp. - -# By default, it selects them evenly from throughout the scp, in order to avoid -# selecting too many from the same speaker. It prints them on the standard -# output. -# With the option --first, it just selects the N first utterances. -# With the option --last, it just selects the N last utterances. - -# Last modified by JHU & HKUST @2013 - - -$quiet = 0; -$first = 0; -$last = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--quiet") { - shift; - $quiet = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--first") { - shift; - $first = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--last") { - shift; - $last = 1; -} - -if(@ARGV < 2 ) { - die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . - " --quiet causes it to not die if N < num lines in scp.\n" . - " --first and --last make it equivalent to head or tail.\n" . - "See also: filter_scp.pl\n"; -} - -$N = shift @ARGV; -if($N == 0) { - die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; -} -$inscp = shift @ARGV; -open(I, "<$inscp") || die "Opening input scp file $inscp"; - -@F = (); -while() { - push @F, $_; -} -$numlines = @F; -if($N > $numlines) { - if ($quiet) { - $N = $numlines; - } else { - die "You requested from subset_scp.pl more elements than available: $N > $numlines"; - } -} - -sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if ($num_needed > $diff) { - die "select_n: code error"; - } - if ($diff == 1 ) { - if ($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); - } -} - -if ( ! $first && ! $last) { - if ($N > 0) { - select_n(0, $numlines, $N); - } -} else { - if ($first) { # --first option: same as head. - for ($n = 0; $n < $N; $n++) { - print $F[$n]; - } - } else { # --last option: same as tail. - for ($n = @F - $N; $n < @F; $n++) { - print $F[$n]; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/sym2int.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/sym2int.pl deleted file mode 100644 index cec097b6bdaefb5c3452e31fa334f0a7530b9a72..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/sym2int.pl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; - -for($x = 0; $x < 2; $x++) { - if ($ARGV[0] eq "--map-oov") { - shift @ARGV; - $map_oov = shift @ARGV; - if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { - # disallow '-f', the empty string and anything ending in words.txt as the - # OOV symbol because these are likely command-line errors. - die "the --map-oov option requires an argument"; - } - } - if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } - } -} - -$symtab = shift @ARGV; -if (!defined $symtab) { - print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . - "options: [--map-oov ] [-f ]\n" . - "note: can look like 4-5, or 4-, or 5-, or 1.\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up - if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } - $map_oov = $sym2int{$map_oov}; -} - -$num_warning = 0; -$max_warning = 20; - -while (<>) { - @A = split(" ", $_); - @B = (); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ( (!defined $field_begin || $n >= $field_begin) - && (!defined $field_end || $n <= $field_end)) { - $i = $sym2int{$a}; - if (!defined ($i)) { - if (defined $map_oov) { - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $map_oov; - } else { - $pos = $n+1; - die "sym2int.pl: undefined symbol $a (in position $pos)\n"; - } - } - $a = $i; - } - push @B, $a; - } - print join(" ", @B); - print "\n"; -} -if ($num_warning > 0) { - print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; -} - -exit(0); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/text2token.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/text2token.py deleted file mode 100644 index 4f4dcc901d436650695f0b80e0cf99e1e99269ee..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/text2token.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan) -# Copyright 2021 Mobvoi Inc. All Rights Reserved. 
(Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -import re -import sys - -is_python2 = sys.version_info[0] == 2 - - -def exist_or_not(i, match_pos): - start_pos = None - end_pos = None - for pos in match_pos: - if pos[0] <= i < pos[1]: - start_pos = pos[0] - end_pos = pos[1] - break - - return start_pos, end_pos - -def seg_char(sent): - pattern = re.compile(r'([\u4e00-\u9fa5])') - chars = pattern.split(sent) - chars = [w for w in chars if len(w.strip()) > 0] - return chars - -def get_parser(): - parser = argparse.ArgumentParser( - description='convert raw text to tokenized text', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--nchar', - '-n', - default=1, - type=int, - help='number of characters to split, i.e., \ - aabb -> a a b b with -n 1 and aa bb with -n 2') - parser.add_argument('--skip-ncols', - '-s', - default=0, - type=int, - help='skip first n columns') - parser.add_argument('--space', - default='', - type=str, - help='space symbol') - parser.add_argument('--bpe-model', - '-m', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--non-lang-syms', - '-l', - default=None, - type=str, - help='list of non-linguistic symobles,' - ' e.g., etc.') - parser.add_argument('text', - type=str, - default=False, - nargs='?', - help='input text') - parser.add_argument('--trans_type', - '-t', - type=str, - default="char", - choices=["char", "phn", "cn_char_en_bpe"], - help="""Transcript type. char/phn. e.g., for TIMIT - FADG0_SI1279 - - If trans_type is char, read from - SI1279.WRD file -> "bricks are an alternative" - Else if trans_type is phn, - read from SI1279.PHN file -> - "sil b r ih sil k s aa r er n aa l - sil t er n ih sil t ih v sil" """) - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - rs = [] - if args.non_lang_syms is not None: - with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f: - nls = [x.rstrip() for x in f.readlines()] - rs = [re.compile(re.escape(x)) for x in nls] - - if args.bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(args.bpe_model) - - if args.text: - f = codecs.open(args.text, encoding="utf-8") - else: - f = codecs.getreader("utf-8")( - sys.stdin if is_python2 else sys.stdin.buffer) - - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer) - line = f.readline() - n = args.nchar - while line: - x = line.split() - print(' '.join(x[:args.skip_ncols]), end=" ") - a = ' '.join(x[args.skip_ncols:]) - - # get all matched positions - match_pos = [] - for r in rs: - i = 0 - while i >= 0: - m = r.search(a, i) - if m: - match_pos.append([m.start(), m.end()]) - i = m.end() - else: - break - - if len(match_pos) > 0: - chars = [] - i = 0 - while i < len(a): - start_pos, end_pos = exist_or_not(i, match_pos) - if start_pos is not None: - chars.append(a[start_pos:end_pos]) - i = end_pos - else: - chars.append(a[i]) - i += 1 - a = chars - - if (args.trans_type == "phn"): - a = a.split(" ") - elif args.trans_type == "cn_char_en_bpe": - b = seg_char(a) - a = [] - for j in b: - # we use "▁" to instead of blanks among english words - # warning: here is "▁", not "_" - for l in j.strip().split("▁"): - if not l.encode('UTF-8').isalpha(): - a.append(l) - else: - for k in sp.encode_as_pieces(l): - a.append(k) - else: - a = [a[j:j + n] for j in range(0, 
len(a), n)] - - a_flat = [] - for z in a: - a_flat.append("".join(z)) - - a_chars = [z.replace(' ', args.space) for z in a_flat] - if (args.trans_type == "phn"): - a_chars = [z.replace("sil", args.space) for z in a_chars] - print(' '.join(a_chars)) - line = f.readline() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/utt2spk_to_spk2utt.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/utt2spk_to_spk2utt.pl deleted file mode 100644 index 5086699ff85fdcb8667bb9ab054700c53e35fd0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. - -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/validate_data_dir.sh deleted file mode 100644 index f4b4cbe1410111555d56380078e3d55381e7155a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/validate_data_dir.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/bin/bash - -cmd="$@" - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." - echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! 
-d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -if [ -f $data/images.scp ]; then - cmd=${cmd/--no-wav/} # remove --no-wav if supplied - image/validate_data_dir.sh $cmd - exit $? -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1; - ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - tools/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." 
- exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! 
cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! 
cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/validate_dict_dir.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/validate_dict_dir.pl deleted file mode 100644 index 819fca7f03caff91f3f24f0b69876a0bfc0abbe9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/validate_dict_dir.pl +++ /dev/null @@ -1,531 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. -# Copyright 2012 Guoguo Chen -# 2015 Daniel Povey -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for 'dict' directories (e.g. data/local/dict) - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line($.) 
$raw_text cannot be interpreted as UTF-8: $decoded_text\n";
-      ;
-    }
-    push @raw_lines, $raw_text;
-    $lineno += 1;
-  }
-
-  if (!$is_utf_compatible) {
-    return (0, @raw_lines);
-  } else {
-    return (1, @unicode_lines);
-  }
-}
-
-# check if the given unicode string contain unicode whitespaces
-# other than the usual four: TAB, LF, CR and SPACE
-sub validate_utf8_whitespaces {
-  my $unicode_lines = shift;
-  use feature 'unicode_strings';
-  for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) {
-    my $current_line = $unicode_lines->[$i];
-    if ((substr $current_line, -1) ne "\n"){
-      print STDERR "$0: The current line (nr. $i) has invalid newline\n";
-      return 1;
-    }
-    my @A = split(" ", $current_line);
-    my $utt_id = $A[0];
-    # we replace TAB, LF, CR, and SPACE
-    # this is to simplify the test
-    if ($current_line =~ /\x{000d}/) {
-      print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n";
-      return 1;
-    }
-    $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g;
-    if ($current_line =~/\s/) {
-      print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n";
-      return 1;
-    }
-  }
-  return 0;
-}
-
-# checks if the text in the file (supplied as the argument) is utf-8 compatible
-# if yes, checks if it contains only allowed whitespaces. If no, then does not
-# do anything. The function seeks to the original position in the file after
-# reading the text.
-sub check_allowed_whitespace {
-  my $file = shift;
-  my $pos = tell($file);
-  (my $is_utf, my @lines) = get_utf8_or_bytestream($file);
-  seek($file, $pos, SEEK_SET);
-  if ($is_utf) {
-    my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines);
-    print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n";
-    if ($has_invalid_whitespaces) {
-      print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n";
-      return 0;
-    } else {
-      print "--> text contains only allowed whitespaces\n";
-    }
-  } else {
-    print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n";
-  }
-  return 1;
-}
-
-
-if(@ARGV != 1) {
-  die "Usage: validate_dict_dir.pl <dict-dir>\n" .
-      "e.g.: validate_dict_dir.pl data/local/dict\n";
-}
-
-$dict = shift @ARGV;
-$dict =~ s:/$::;
-
-$exit = 0;
-$success = 1; # this is re-set each time we read a file.
-
-sub set_to_fail { $exit = 1; $success = 0; }
-
-# Checking silence_phones.txt -------------------------------
-print "Checking $dict/silence_phones.txt ...\n";
-if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;}
-if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;}
-$idx = 1;
-%silence = ();
-$crlf = 1;
-
-print "--> reading $dict/silence_phones.txt\n";
-check_allowed_whitespace(\*S) || set_to_fail();
-while(<S>) {
-  if (! s/\n$//) {
-    print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n";
-    set_to_fail();
-  }
-  if ($crlf == 1 && m/\r/) {
-    print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n";
-    set_to_fail();
-    $crlf = 0;
-  }
-  my @col = split(" ", $_);
-  if (@col == 0) {
-    set_to_fail();
-    print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n";
-  }
-  foreach(0 .. 
@col-1) {
-    my $p = $col[$_];
-    if($silence{$p}) {
-      set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n";
-    } else {
-      $silence{$p} = 1;
-    }
-    # disambiguation symbols; phones ending in _B, _E, _S or _I will cause
-    # problems with word-position-dependent systems, and is obviously
-    # confusable with epsilon.
-    if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>"){
-      set_to_fail();
-      print "--> ERROR: phone \"$p\" has disallowed written form\n";
-    }
-  }
-  $idx ++;
-}
-close(S);
-$success == 0 || print "--> $dict/silence_phones.txt is OK\n";
-print "\n";
-
-# Checking optional_silence.txt -------------------------------
-print "Checking $dict/optional_silence.txt ...\n";
-if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;}
-if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;}
-$idx = 1;
-$success = 1;
-$crlf = 1;
-print "--> reading $dict/optional_silence.txt\n";
-check_allowed_whitespace(\*OS) or exit 1;
-while(<OS>) {
-  chomp;
-  my @col = split(" ", $_);
-  if ($idx > 1 or @col > 1) {
-    set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n";
-  } elsif (!$silence{$col[0]}) {
-    set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n";
-  }
-  if ($crlf == 1 && m/\r/) {
-    print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n";
-    set_to_fail();
-    $crlf = 0;
-  }
-  $idx ++;
-}
-close(OS);
-$success == 0 || print "--> $dict/optional_silence.txt is OK\n";
-print "\n";
-
-# Checking nonsilence_phones.txt -------------------------------
-print "Checking $dict/nonsilence_phones.txt ...\n";
-if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;}
-if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;}
-$idx = 1;
-%nonsilence = ();
-$success = 1;
-$crlf = 1;
-print "--> reading $dict/nonsilence_phones.txt\n";
-check_allowed_whitespace(\*NS) or set_to_fail();
-while(<NS>) {
-  if ($crlf == 1 && m/\r/) {
-    print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n";
-    set_to_fail();
-    $crlf = 0;
-  }
-  if (! s/\n$//) {
-    print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n";
-    set_to_fail();
-  }
-  my @col = split(" ", $_);
-  if (@col == 0) {
-    set_to_fail();
-    print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n";
-  }
-  foreach(0 .. @col-1) {
-    my $p = $col[$_];
-    if($nonsilence{$p}) {
-      set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n";
-    } else {
-      $nonsilence{$p} = 1;
-    }
-    # phones that start with the pound sign/hash may be mistaken for
-    # disambiguation symbols; phones ending in _B, _E, _S or _I will cause
-    # problems with word-position-dependent systems, and is obviously
-    # confusable with epsilon.
-    if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>"){
-      set_to_fail();
-      print "--> ERROR: phone \"$p\" has disallowed written form\n";
-    }
-  }
-  $idx ++;
-}
-close(NS);
-$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n";
-print "\n";
-
-# Checking disjoint -------------------------------
-sub intersect {
-  my ($a, $b) = @_;
-  @itset = ();
-  %itset = ();
-  foreach(keys %$a) {
-    if(exists $b->{$_} and !$itset{$_}) {
-      push(@itset, $_);
-      $itset{$_} = 1;
-    }
-  }
-  return @itset;
-}
-
-print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n";
-@itset = intersect(\%silence, \%nonsilence);
-if(@itset == 0) {print "--> disjoint property is OK.\n";}
-else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";}
-print "\n";
-
-
-sub check_lexicon {
-  my ($lex, $num_prob_cols, $num_skipped_cols) = @_;
-  print "Checking $lex\n";
-  !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail();
-  my %seen_line = {};
-  $idx = 1; $success = 1; $crlf = 1;
-  print "--> reading $lex\n";
-  check_allowed_whitespace(\*L) or set_to_fail();
-  while (<L>) {
-    if ($crlf == 1 && m/\r/) {
-      print "--> ERROR: $lex contains Carriage Return (^M) characters.\n";
-      set_to_fail();
-      $crlf = 0;
-    }
-    if (defined $seen_line{$_}) {
-      print "--> ERROR: line '$_' of $lex is repeated\n";
-      set_to_fail();
-    }
-    $seen_line{$_} = 1;
-    if (! s/\n$//) {
-      print "--> ERROR: last line '$_' of $lex does not end in newline.\n";
-      set_to_fail();
-    }
-    my @col = split(" ", $_);
-    $word = shift @col;
-    if (!defined $word) {
-      print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail();
-    }
-    if ($word eq "<s>" || $word eq "</s>" || $word eq "<eps>" || $word eq "#0") {
-      print "--> ERROR: lexicon.txt contains forbidden word $word\n";
-      set_to_fail();
-    }
-    for ($n = 0; $n < $num_prob_cols; $n++) {
-      $prob = shift @col;
-      if (!($prob > 0.0 && $prob <= 1.0)) {
-        print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n";
-        set_to_fail();
-      }
-    }
-    for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; }
-    if (@col == 0) {
-      print "--> ERROR: lexicon.txt contains word $word with empty ";
-      print "pronunciation.\n";
-      set_to_fail();
-    }
-    foreach (0 .. @col-1) {
-      if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
-        print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt ";
-        print "(line $idx)\n";
-        set_to_fail();
-      }
-    }
-    $idx ++;
-  }
-  close(L);
-  $success == 0 || print "--> $lex is OK\n";
-  print "\n";
-}
-
-if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); }
-if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); }
-if (-f "$dict/lexiconp_silprob.txt") {
-  # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also
-  # exist. 
-  check_lexicon("$dict/lexiconp_silprob.txt", 2, 2);
-  if (-f "$dict/silprob.txt") {
-    !open(SP, "<$dict/silprob.txt") &&
-      print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail();
-    $crlf = 1;
-    while (<SP>) {
-      if ($crlf == 1 && m/\r/) {
-        print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n";
-        set_to_fail();
-        $crlf = 0;
-      }
-      chomp; my @col = split;
-      @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail();
-      if ($col[0] eq "<s>" || $col[0] eq "overall") {
-        if (!($col[1] > 0.0 && $col[1] <= 1.0)) {
-          set_to_fail();
-          print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n";
-        }
-      } elsif ($col[0] eq "</s>_s" || $col[0] eq "</s>_n") {
-        if ($col[1] <= 0.0) {
-          set_to_fail();
-          print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n";
-        }
-      } else {
-        print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n";
-        set_to_fail();
-      }
-    }
-    close(SP);
-  } else {
-    set_to_fail();
-    print "--> ERROR: expecting $dict/silprob.txt to exist\n";
-  }
-}
-
-if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) {
-  print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n";
-  set_to_fail();
-}
-
-sub check_lexicon_pair {
-  my ($lex1, $num_prob_cols1, $num_skipped_cols1,
-      $lex2, $num_prob_cols2, $num_skipped_cols2) = @_;
-  # We have checked individual lexicons already.
-  open(L1, "<$lex1"); open(L2, "<$lex2");
-  print "Checking lexicon pair $lex1 and $lex2\n";
-  my $line_num = 0;
-  while(<L1>) {
-    $line_num++;
-    @A = split;
-    $line_B = <L2>;
-    if (!defined $line_B) {
-      print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
-      set_to_fail(); last;
-    }
-    @B = split(" ", $line_B);
-    # Check if the word matches.
-    if ($A[0] ne $B[0]) {
-      print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n";
-      set_to_fail(); last;
-    }
-    shift @A; shift @B;
-    for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; }
-    for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; }
-    # Check if the pronunciation matches
-    if (join(" ", @A) ne join(" ", @B)) {
-      print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n";
-      set_to_fail(); last;
-    }
-  }
-  $line_B = <L2>;
-  if (defined $line_B && $exit == 0) {
-    print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
-    set_to_fail();
-  }
-  $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n";
-}
-
-# If more than one lexicon exist, we have to check if they correspond to each
-# other. It could be that the user overwrote one and we need to regenerate the
-# other, but we do not know which is which.
-if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") {
-  check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0);
-}
-if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") {
-  check_lexicon_pair("$dict/lexiconp.txt", 1, 0,
-                     "$dict/lexiconp_silprob.txt", 2, 2);
-}
-
-# Checking extra_questions.txt -------------------------------
-%distinguished = (); # Keep track of all phone-pairs including nonsilence that
-                     # are distinguished (split apart) by extra_questions.txt,
-                     # as $distinguished{$p1,$p2} = 1. This will be used to
-                     # make sure that we don't have pairs of phones on the same
-                     # line in nonsilence_phones.txt that can never be
-                     # distinguished from each other by questions. 
(If any two
-                     # phones appear on the same line in nonsilence_phones.txt,
-                     # they share a tree root, and since the automatic
-                     # question-building treats all phones that appear on the
-                     # same line of nonsilence_phones.txt as being in the same
-                     # group, we can never distinguish them without resorting to
-                     # questions in extra_questions.txt.
-print "Checking $dict/extra_questions.txt ...\n";
-if (-s "$dict/extra_questions.txt") {
-  if (!open(EX, "<$dict/extra_questions.txt")) {
-    set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n";
-  }
-  $idx = 1;
-  $success = 1;
-  $crlf = 1;
-  print "--> reading $dict/extra_questions.txt\n";
-  check_allowed_whitespace(\*EX) or set_to_fail();
-  while(<EX>) {
-    if ($crlf == 1 && m/\r/) {
-      print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n";
-      set_to_fail();
-      $crlf = 0;
-    }
-    if (! s/\n$//) {
-      print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n";
-      set_to_fail();
-    }
-    my @col = split(" ", $_);
-    if (@col == 0) {
-      set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n";
-    }
-    foreach (0 .. @col-1) {
-      if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
-        set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n";
-      }
-      $idx ++;
-    }
-    %col_hash = ();
-    foreach $p (@col) { $col_hash{$p} = 1; }
-    foreach $p1 (@col) {
-      # Update %distinguished hash.
-      foreach $p2 (keys %nonsilence) {
-        if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not
-                                       # in this question (and in nonsilence
-                                       # phones)... mark p1,p2 as being split apart
-          $distinguished{$p1,$p2} = 1;
-          $distinguished{$p2,$p1} = 1;
-        }
-      }
-    }
-  }
-  close(EX);
-  $success == 0 || print "--> $dict/extra_questions.txt is OK\n";
-} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";}
-
-if (-f "$dict/nonterminals.txt") {
-  open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt";
-  my %nonterminals = ();
-  my $line_number = 1;
-  while (<NT>) {
-    chop;
-    my @line = split(" ", $_);
-    if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) {
-      print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1;
-    }
-    $nonterminals{$line[0]} = 1;
-    $line_number++;
-  }
-  print "--> $dict/nonterminals.txt is OK\n";
-}
-
-
-# check nonsilence_phones.txt again for phone-pairs that are never
-# distnguishable. (note: this situation is normal and expected for silence
-# phones, so we don't check it.)
-if(!open(NS, "<$dict/nonsilence_phones.txt")) {
-  print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1;
-}
-
-$num_warn_nosplit = 0;
-$num_warn_nosplit_limit = 10;
-while(<NS>) {
-  my @col = split(" ", $_);
-  foreach $p1 (@col) {
-    foreach $p2 (@col) {
-      if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) {
-        set_to_fail();
-        if ($num_warn_nosplit <= $num_warn_nosplit_limit) {
-          print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n";
-        }
-        if ($num_warn_nosplit == $num_warn_nosplit_limit) {
-          print "... Not warning any more times about this issue.\n";
-        }
-        if ($num_warn_nosplit == 0) {
-          print " (note: we started checking for this only recently. 
You can still build a system but\n"; - print " phones $p1 and $p2 will be acoustically indistinguishable).\n"; - } - $num_warn_nosplit++; - } - } - } -} - - -if ($exit == 1) { - print "--> ERROR validating dictionary directory $dict (see detailed error "; - print "messages above)\n\n"; - exit 1; -} else { - print "--> SUCCESS [validating dictionary directory $dict]\n\n"; -} - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/validate_text.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/validate_text.pl deleted file mode 100644 index 7f75cf12f20f6e22948682e8e726e628a72dac69..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/validate_text.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env perl -# -#=============================================================================== -# Copyright 2017 Johns Hopkins University (author: Yenda Trmal ) -# Johns Hopkins University (author: Daniel Povey) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# validation script for data//text -# to be called (preferably) from utils/validate_data_dir.sh -use strict; -use warnings; -use utf8; -use Fcntl qw< SEEK_SET >; - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). 
-sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl \n" . 
- "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/wav2dur.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/wav2dur.py deleted file mode 100644 index 1bcc1b693458b66c0e341e5d6b375cc81e6db8b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/wav2dur.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -import torchaudio -torchaudio.set_audio_backend("sox_io") - -scp = sys.argv[1] -dur_scp = sys.argv[2] - -with open(scp, 'r') as f, open(dur_scp, 'w') as fout: - cnt = 0 - total_duration = 0 - for l in f: - items = l.strip().split() - wav_id = items[0] - fname = items[1] - cnt += 1 - waveform, rate = torchaudio.load(fname) - frames = len(waveform[0]) - duration = frames / float(rate) - total_duration += duration - fout.write('{} {}\n'.format(wav_id, duration)) - print('process {} utts'.format(cnt)) - print('total {} s'.format(total_duration)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/wav_to_duration.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/wav_to_duration.sh deleted file mode 100644 index 51b055c633ac809b6b8d702925dc47875973403d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/wav_to_duration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# split the wav scp, calculate duration and merge -nj=4 -. tools/parse_options.sh || exit 1; - -inscp=$1 -outscp=$2 -data=$(dirname ${inscp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -rm -f $logdir/wav_*.slice -rm -f $logdir/wav_*.shape -split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log -} & -done -wait -cat $logdir/wav_*.shape > $outscp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/websocket/performance-ws.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/websocket/performance-ws.py deleted file mode 100644 index af77dea06bb41297b674b5b6dbfd0266bcff5d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/tools/websocket/performance-ws.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -# coding:utf-8 - -# Copyright (c) 2022 SDCI Co. Ltd (author: veelion) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import json -import time -import asyncio -import argparse -import websockets -import soundfile as sf -import statistics - - -WS_START = json.dumps({ - 'signal': 'start', - 'nbest': 1, - 'continuous_decoding': False, -}) -WS_END = json.dumps({ - 'signal': 'end' -}) - - -async def ws_rec(data, ws_uri): - begin = time.time() - conn = await websockets.connect(ws_uri, ping_timeout=200) - # step 1: send start - await conn.send(WS_START) - ret = await conn.recv() - # step 2: send audio data - await conn.send(data) - # step 3: send end - await conn.send(WS_END) - # step 4: receive result - texts = [] - while 1: - ret = await conn.recv() - ret = json.loads(ret) - if ret['type'] == 'final_result': - nbest = json.loads(ret['nbest']) - text = nbest[0]['sentence'] - texts.append(text) - elif ret['type'] == 'speech_end': - break - # step 5: close - try: - await conn.close() - except Exception as e: - # this except has no effect, just log as debug - # it seems the server does not send close info, maybe - print(e) - time_cost = time.time() - begin - return { - 'text': ''.join(texts), - 'time': time_cost, - } - - -def get_args(): - parser = argparse.ArgumentParser(description='') - parser.add_argument( - '-u', '--ws_uri', required=True, - help="websocket_server_main's uri, e.g. ws://127.0.0.1:10086") - parser.add_argument( - '-w', '--wav_scp', required=True, - help='path to wav_scp_file') - parser.add_argument( - '-t', '--trans', required=True, - help='path to trans_text_file of wavs') - parser.add_argument( - '-s', '--save_to', required=True, - help='path to save transcription') - parser.add_argument( - '-n', '--num_concurrence', type=int, required=True, - help='num of concurrence for query') - args = parser.parse_args() - return args - - -def print_result(info): - length = max([len(k) for k in info]) - for k, v in info.items(): - print(f'\t{k: >{length}} : {v}') - - -async def main(args): - wav_scp = [] - total_duration = 0 - with open(args.wav_scp) as f: - for line in f: - zz = line.strip().split() - assert len(zz) == 2 - data, sr = sf.read(zz[1], dtype='int16') - assert sr == 16000 - duration = (len(data)) / 16000 - total_duration += duration - wav_scp.append((zz[0], data.tobytes())) - print(f'{len(wav_scp) = }, {total_duration = }') - - tasks = [] - failed = 0 - texts = [] - request_times = [] - begin = time.time() - for i, (_uttid, data) in enumerate(wav_scp): - task = asyncio.create_task(ws_rec(data, args.ws_uri)) - tasks.append((_uttid, task)) - if len(tasks) < args.num_concurrence: - continue - print((f'{i=}, start {args.num_concurrence} ' - f'queries @ {time.strftime("%m-%d %H:%M:%S")}')) - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - tasks = [] - print(f'\tdone @ {time.strftime("%m-%d %H:%M:%S")}') - if tasks: - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - request_time = time.time() - begin - rtf = request_time / total_duration - print('For all concurrence:') - print_result({ - 'failed': failed, - 'total_duration': total_duration, - 'request_time': request_time, - 'RTF': rtf, - }) - print('For one request:') - print_result({ - 'mean': statistics.mean(request_times), - 'median': statistics.median(request_times), - 'max_time': max(request_times), - 'min_time': min(request_times), - }) - with open(args.save_to, 'w', encoding='utf8') as fsave: - fsave.write(''.join(texts)) - # caculate CER - cmd = (f'python 
../compute-wer.py --char=1 --v=1 ' - f'{args.trans} {args.save_to} > ' - f'{args.save_to}-test-{args.num_concurrence}.cer.txt') - print(cmd) - os.system(cmd) - print('done') - - -if __name__ == '__main__': - args = get_args() - asyncio.run(main(args)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/alignment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/alignment.py deleted file mode 100644 index 071691183e5af227e60fe06e4f8d4bf0f33b7f71..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/alignment.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Di Wu) -# 2022 Tinnove Inc (authors: Wei Ren) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader -from textgrid import TextGrid, IntervalTier - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.ctc_util import forced_align -from wenet.utils.common import get_subsample -from wenet.utils.init_model import init_model - - -def generator_textgrid(maxtime, lines, output): - # Download Praat: https://www.fon.hum.uva.nl/praat/ - interval = maxtime / (len(lines) + 1) - margin = 0.0001 - - tg = TextGrid(maxTime=maxtime) - linetier = IntervalTier(name="line", maxTime=maxtime) - - i = 0 - for l in lines: - s, e, w = l.split() - linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w) - - tg.append(linetier) - print("successfully generator {}".format(output)) - tg.write(output) - - -def get_frames_timestamp(alignment): - # convert alignment to a praat format, which is a doing phonetics - # by computer and helps analyzing alignment - timestamp = [] - # get frames level duration for each token - start = 0 - end = 0 - while end < len(alignment): - while end < len(alignment) and alignment[end] == 0: - end += 1 - if end == len(alignment): - timestamp[-1] += alignment[start:] - break - end += 1 - while end < len(alignment) and alignment[end - 1] == alignment[end]: - end += 1 - timestamp.append(alignment[start:end]) - start = end - return timestamp - - -def get_labformat(timestamp, subsample): - begin = 0 - duration = 0 - labformat = [] - for idx, t in enumerate(timestamp): - # 25ms frame_length,10ms hop_length, 1/subsample - subsample = get_subsample(configs) - # time duration - duration = len(t) * 0.01 * subsample - if idx < len(timestamp) - 1: - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[t[-1]])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[t[-1]])) - else: - non_blank = 0 - for i in t: - if i != 0: - token = i - break - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - 
char_dict[token])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[token])) - begin = begin + duration - return labformat - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='use ctc to generate alignment') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--input_file', required=True, help='format data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--non_lang_syms', - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--result_file', - required=True, - help='alignment result file') - parser.add_argument('--batch_size', type=int, default=1, help='batch size') - parser.add_argument('--gen_praat', - action='store_true', - help='convert alignment to a praat format') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - - args = parser.parse_args() - print(args) - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.batch_size > 1: - logging.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - # Load dict - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 - - symbol_table = read_symbol_table(args.dict) - - # Init dataset and data loader - ali_conf = copy.deepcopy(configs['dataset_conf']) - - ali_conf['filter_conf']['max_length'] = 102400 - ali_conf['filter_conf']['min_length'] = 0 - ali_conf['filter_conf']['token_max_length'] = 102400 - ali_conf['filter_conf']['token_min_length'] = 0 - ali_conf['filter_conf']['max_output_input_ratio'] = 102400 - ali_conf['filter_conf']['min_output_input_ratio'] = 0 - ali_conf['speed_perturb'] = False - ali_conf['spec_aug'] = False - ali_conf['shuffle'] = False - ali_conf['sort'] = False - ali_conf['fbank_conf']['dither'] = 0.0 - ali_conf['batch_conf']['batch_type'] = "static" - ali_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - ali_dataset = Dataset(args.data_type, - args.input_file, - symbol_table, - ali_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w', - encoding='utf-8') as fout: - for batch_idx, batch in enumerate(ali_data_loader): - print("#" * 80) - key, feat, target, feats_length, target_length = batch - print(key) - - feat = feat.to(device) - target = target.to(device) - feats_length = feats_length.to(device) - target_length = target_length.to(device) - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - # print(ctc_probs.size(1)) - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = forced_align(ctc_probs, target) - print(alignment) - fout.write('{} {}\n'.format(key[0], alignment)) - - if args.gen_praat: - timestamp = get_frames_timestamp(alignment) - print(timestamp) - subsample = get_subsample(configs) - labformat = get_labformat(timestamp, subsample) - - lab_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".lab") - with open(lab_path, 'w', encoding='utf-8') as f: - f.writelines(labformat) - - textgrid_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".TextGrid") - generator_textgrid(maxtime=(len(alignment) + 1) * 0.01 * - subsample, - lines=labformat, - output=textgrid_path) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/average_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/average_model.py deleted file mode 100644 index 01efa64b4b458bc931a86a9a304b9f330ce4aaa2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/average_model.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import os -import argparse -import glob - -import yaml -import numpy as np -import torch - - -def get_args(): - parser = argparse.ArgumentParser(description='average model') - parser.add_argument('--dst_model', required=True, help='averaged model') - parser.add_argument('--src_path', - required=True, - help='src model path for average') - parser.add_argument('--val_best', - action="store_true", - help='averaged model') - parser.add_argument('--num', - default=5, - type=int, - help='nums for averaged model') - parser.add_argument('--min_epoch', - default=0, - type=int, - help='min epoch used for averaging model') - parser.add_argument('--max_epoch', - default=65536, - type=int, - help='max epoch used for averaging model') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - checkpoints = [] - val_scores = [] - if args.val_best: - yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) - for y in yamls: - with open(y, 'r') as f: - dic_yaml = yaml.load(f, Loader=yaml.FullLoader) - loss = dic_yaml['cv_loss'] - epoch = dic_yaml['epoch'] - if epoch >= args.min_epoch and epoch <= args.max_epoch: - val_scores += [[epoch, loss]] - val_scores = np.array(val_scores) - sort_idx = np.argsort(val_scores[:, -1]) - sorted_val_scores = val_scores[sort_idx][::1] - print("best val scores = " + str(sorted_val_scores[:args.num, 1])) - print("selected epochs = " + - str(sorted_val_scores[:args.num, 0].astype(np.int64))) - path_list = [ - args.src_path + '/{}.pt'.format(int(epoch)) - for epoch in sorted_val_scores[:args.num, 0] - ] - else: - path_list = glob.glob('{}/[0-9]*.pt'.format(args.src_path)) - path_list = sorted(path_list, key=os.path.getmtime) - path_list = path_list[-args.num:] - print(path_list) - avg = None - num = args.num - assert num == len(path_list) - for path in path_list: - print('Processing {}'.format(path)) - states = torch.load(path, map_location=torch.device('cpu')) - if avg is None: - avg = states - else: - for k in avg.keys(): - avg[k] += states[k] - # average - for k in avg.keys(): - if avg[k] is not None: - # pytorch 1.6 use true_divide instead of /= - avg[k] = torch.true_divide(avg[k], num) - print('Saving to {}'.format(args.dst_model)) - torch.save(avg, args.dst_model) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/export_jit.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/export_jit.py deleted file mode 100644 index b2e5864e8382235c1cc800484ba5031ae22f3bd9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/export_jit.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os - -import torch -import yaml - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_file', default=None, help='output file') - parser.add_argument('--output_quant_file', - default=None, - help='output quantized model file') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - # No need gpu for model export - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - model = init_model(configs) - print(model) - - load_checkpoint(model, args.checkpoint) - # Export jit torch script model - - if args.output_file: - script_model = torch.jit.script(model) - script_model.save(args.output_file) - print('Export model successfully, see {}'.format(args.output_file)) - - # Export quantized jit torch script model - if args.output_quant_file: - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - script_quant_model = torch.jit.script(quantized_model) - script_quant_model.save(args.output_quant_file) - print('Export quantized model successfully, ' - 'see {}'.format(args.output_quant_file)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/export_onnx_bpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/export_onnx_bpu.py deleted file mode 100644 index 6462a69506f10778d08faae5fcf3067ad43d38bd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/export_onnx_bpu.py +++ /dev/null @@ -1,1019 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. 
specific decoding method: ctc_greedy_search -""" - - -from __future__ import print_function - -import os -import sys -import copy -import math -import yaml -import logging -from typing import Tuple - -import torch -import numpy as np - -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import (get_args, to_numpy, - print_input_output_info) - - -try: - import onnx - import onnxruntime -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class BPULayerNorm(torch.nn.Module): - """Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" - def __init__(self, module, chunk_size=8, run_on_bpu=False): - super().__init__() - original = copy.deepcopy(module) - self.hidden = module.weight.size(0) - self.chunk_size = chunk_size - self.run_on_bpu = run_on_bpu - - if self.run_on_bpu: - self.weight = torch.nn.Parameter( - module.weight.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.bias = torch.nn.Parameter( - module.bias.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.negtive = torch.nn.Parameter( - torch.ones((1, self.hidden, 1, chunk_size)) * -1.0) - self.eps = torch.nn.Parameter( - torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps) - self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_1.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_2.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - else: - self.norm = module - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.hidden) - orig_out = module(random_data) - new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) - np.testing.assert_allclose( - to_numpy(orig_out), to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.run_on_bpu: - u = self.mean_conv_1(x) # (1, h, 1, c) - numerator = x + u * self.negtive # (1, h, 1, c) - s = torch.pow(numerator, 2) # (1, h, 1, c) - s = self.mean_conv_2(s) # (1, h, 1, c) - denominator = torch.sqrt(s + self.eps) # (1, h, 1, c) - x = torch.div(numerator, denominator) # (1, h, 1, c) - x = x * self.weight + self.bias - else: - x = x.squeeze(2).transpose(1, 2).contiguous() - x = self.norm(x) - x = x.transpose(1, 2).contiguous().unsqueeze(2) - return x - - -class BPUIdentity(torch.nn.Module): - """Refactor torch.nn.Identity(). - For inserting BPU node whose input == output. - """ - def __init__(self, channels): - super().__init__() - self.channels = channels - self.identity_conv = torch.nn.Conv2d( - channels, channels, 1, groups=channels, bias=False) - torch.nn.init.dirac_( - self.identity_conv.weight.data, groups=channels) - - self.check_equal() - - def check_equal(self): - random_data = torch.randn(1, self.channels, 1, 10) - result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(random_data), to_numpy(result), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Identity with 4-D dataflow, input == output. 
- Args: - x (torch.Tensor): (batch, in_channel, 1, time) - - Returns: - (torch.Tensor): (batch, in_channel, 1, time). - """ - return self.identity_conv(x) - - -class BPULinear(torch.nn.Module): - """Refactor torch.nn.Linear or pointwise_conv""" - def __init__(self, module, is_pointwise_conv=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.weight.size(1) - self.odim = module.weight.size(0) - self.is_pointwise_conv = is_pointwise_conv - - # Modify weight & bias - self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) - if is_pointwise_conv: - # (odim, idim, kernel=1) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(-1)) - else: - # (odim, idim) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(2).unsqueeze(3)) - self.linear.bias = module.bias - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.idim) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = module(random_data) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = original_result.transpose(1, 2) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Linear with 4-D dataflow. - Args: - x (torch.Tensor): (batch, in_channel, 1, time) - Returns: - (torch.Tensor): (batch, out_channel, 1, time). - """ - return self.linear(x) - - -class BPUGlobalCMVN(torch.nn.Module): - """Refactor wenet/transformer/cmvn.py::GlobalCMVN""" - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - self.norm_var = module.norm_var - - # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1) - self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """CMVN with 4-D dataflow. - Args: - x (torch.Tensor): (batch, 1, mel_dim, time) - Returns: - (torch.Tensor): normalized feature with same shape. - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x - - -class BPUConv2dSubsampling8(torch.nn.Module): - """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 - - NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.right_context = module.right_context - self.subsampling_rate = module.subsampling_rate - assert isinstance(module.pos_enc, NoPositionalEncoding) - - # 1. Modify self.conv - # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim) - # to (1, 1, mel_dim, frames) for more efficient computation. - self.conv = module.conv - for idx in [0, 2, 4]: - self.conv[idx].weight = torch.nn.Parameter( - module.conv[idx].weight.transpose(2, 3) - ) - - # 2. 
Modify self.linear - # NOTE(xcsong): Split final projection to meet the requirment of - # maximum kernel_size (7 for XJ3) - self.linear = torch.nn.ModuleList() - odim = module.linear.weight.size(0) # 512, in this case - freq = module.linear.weight.size(1) // odim # 4608 // 512 == 9 - self.odim, self.freq = odim, freq - weight = module.linear.weight.reshape( - odim, odim, freq, 1) # (odim, odim * freq) -> (odim, odim, freq, 1) - self.split_size = [] - num_split = (freq - 1) // 7 + 1 # XJ3 requires kernel_size <= 7 - slice_begin = 0 - for idx in range(num_split): - kernel_size = min(freq, (idx + 1) * 7) - idx * 7 - conv_ele = torch.nn.Conv2d( - odim, odim, (kernel_size, 1), (kernel_size, 1)) - conv_ele.weight = torch.nn.Parameter( - weight[:, :, slice_begin:slice_begin + kernel_size, :] - ) - conv_ele.bias = torch.nn.Parameter( - torch.zeros_like(conv_ele.bias) - ) - self.linear.append(conv_ele) - self.split_size.append(kernel_size) - slice_begin += kernel_size - self.linear[0].bias = torch.nn.Parameter(module.linear.bias) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 67, 80) - mask = torch.zeros(1, 1, 67) - original_result, _, _ = module(random_data, mask) # (1, 8, 512) - random_data = random_data.transpose(1, 2).unsqueeze(0) # (1, 1, 80, 67) - new_result = self.forward(random_data) # (1, 512, 1, 8) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Subsample x with 4-D dataflow. - Args: - x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), - where time' = time // 8. - """ - x = self.conv(x) # (1, odim, freq, time') - x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) - x = torch.split(x, self.split_size, dim=2) - for idx, (x_part, layer) in enumerate(zip(x, self.linear)): - x_out += layer(x_part) - return x_out - - -class BPUMultiHeadedAttention(torch.nn.Module): - """Refactor wenet/transformer/attention.py::MultiHeadedAttention - - NOTE(xcsong): Only support attention_class == MultiHeadedAttention, - we do not consider RelPositionMultiHeadedAttention currently. - """ - def __init__(self, module, chunk_size, left_chunks): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.d_k = module.d_k - self.h = module.h - n_feat = self.d_k * self.h - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.time = chunk_size * (left_chunks + 1) - self.activation = torch.nn.Softmax(dim=-1) - - # 1. Modify self.linear_x - self.linear_q = BPULinear(module.linear_q) - self.linear_k = BPULinear(module.linear_k) - self.linear_v = BPULinear(module.linear_v) - self.linear_out = BPULinear(module.linear_out) - # 2. 
denom - self.register_buffer( - "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k))) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) - mask = torch.ones((1, self.h, self.chunk_size, self.time), - dtype=torch.bool) - cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, - self.d_k * 2) - original_out, original_cache = module( - random_data, random_data, random_data, - mask[:, 0, :, :], torch.empty(0), cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.reshape(1, self.h, self.d_k * 2, - self.chunk_size * self.left_chunks) - new_out, new_cache = self.forward( - random_data, random_data, random_data, mask, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - - def forward( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - mask: torch.Tensor, cache: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. - - Args: - q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). - k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). - v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). - mask (torch.Tensor): Mask tensor, - (#batch, head, chunk_size, cache_t + chunk_size). - cache (torch.Tensor): Cache tensor - (1, head, d_k * 2, cache_t), - where `cache_t == chunk_size * left_chunks`. - - - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: Cache tensor - (1, head, d_k * 2, cache_t + chunk_size) - where `cache_t == chunk_size * left_chunks` - """ - # 1. Forward QKV - q = self.linear_q(q) # (1, d, 1, c) d == size, c == chunk_size - k = self.linear_k(k) # (1, d, 1, c) - v = self.linear_v(v) # (1, d, 1, c) - q = q.view(1, self.h, self.d_k, self.chunk_size) - k = k.view(1, self.h, self.d_k, self.chunk_size) - v = v.view(1, self.h, self.d_k, self.chunk_size) - q = q.transpose(2, 3) # (batch, head, time1, d_k) - k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) - k = torch.cat((k_cache, k), dim=3) - v = torch.cat((v_cache, v), dim=3) - new_cache = torch.cat((k, v), dim=2) - # 2. (Q^T)K - scores = torch.matmul(q, k) * self.denom # (#b, n_head, time1, time2) - # 3. Forward attention - mask = mask.eq(0) - scores = scores.masked_fill(mask, -float('inf')) - attn = self.activation(scores).masked_fill(mask, 0.0) - attn = attn.transpose(2, 3) - x = torch.matmul(v, attn) - x = x.view(1, self.d_k * self.h, 1, self.chunk_size) - x_out = self.linear_out(x) - return x_out, new_cache - - -class BPUConvolution(torch.nn.Module): - """Refactor wenet/transformer/convolution.py::ConvolutionModule - - NOTE(xcsong): Only suport use_layer_norm == False - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.lorder = module.lorder - self.use_layer_norm = False - self.activation = module.activation - channels = module.pointwise_conv1.weight.size(1) - self.channels = channels - kernel_size = module.depthwise_conv.weight.size(2) - assert module.use_layer_norm is False - - # 1. Modify self.pointwise_conv1 - self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) - - # 2. 
Modify self.depthwise_conv - self.depthwise_conv = torch.nn.Conv2d( - channels, channels, (1, kernel_size), - stride=1, groups=channels) - self.depthwise_conv.weight = torch.nn.Parameter( - module.depthwise_conv.weight.unsqueeze(-2)) - self.depthwise_conv.bias = torch.nn.Parameter( - module.depthwise_conv.bias) - - # 3. Modify self.norm, Only support batchnorm2d - self.norm = torch.nn.BatchNorm2d(channels) - self.norm.training = False - self.norm.num_features = module.norm.num_features - self.norm.eps = module.norm.eps - self.norm.momentum = module.norm.momentum - self.norm.weight = torch.nn.Parameter(module.norm.weight) - self.norm.bias = torch.nn.Parameter(module.norm.bias) - self.norm.running_mean = module.norm.running_mean - self.norm.running_var = module.norm.running_var - - # 4. Modify self.pointwise_conv2 - self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) - - # 5. Identity conv, for running `concat` on BPU - self.identity = BPUIdentity(channels) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.channels) - cache = torch.zeros((1, self.channels, self.lorder)) - original_out, original_cache = module(random_data, cache=cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.unsqueeze(2) - new_out, new_cache = self.forward(random_data, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, 1, cache_t). - Returns: - torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). - torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). - """ - # Concat cache - x = torch.cat((self.identity(cache), self.identity(x)), dim=3) - new_cache = x[:, :, :, -self.lorder:] - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, 1, dim) - x = torch.nn.functional.glu(x, dim=1) # (b, channel, 1, dim) - - # Depthwise Conv - x = self.depthwise_conv(x) - x = self.activation(self.norm(x)) - x = self.pointwise_conv2(x) - return x, new_cache - - -class BPUFFN(torch.nn.Module): - """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.activation = module.activation - - # 1. Modify self.w_x - self.w_1 = BPULinear(module.w_1) - self.w_2 = BPULinear(module.w_2) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.w_1.idim) - original_out = module(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_out = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward function. 
- - Args: - xs: input tensor (B, D, 1, L) - Returns: - output tensor, (B, D, 1, L) - """ - return self.w_2(self.activation(self.w_1(x))) - - -class BPUConformerEncoderLayer(torch.nn.Module): - """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.size = module.size - assert module.normalize_before is True - assert module.concat_after is False - - # 1. Modify submodules - self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) - self.self_attn = BPUMultiHeadedAttention( - module.self_attn, chunk_size, left_chunks) - self.conv_module = BPUConvolution(module.conv_module) - self.feed_forward = BPUFFN(module.feed_forward) - - # 2. Modify norms - self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) - self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, ln_run_on_bpu) - self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, - chunk_size, ln_run_on_bpu) - self.norm_conv = BPULayerNorm(module.norm_conv, - chunk_size, ln_run_on_bpu) - self.norm_final = BPULayerNorm(module.norm_final, - chunk_size, ln_run_on_bpu) - - # 3. 4-D ff_scale - self.register_buffer( - "ff_scale", torch.full((1, self.size, 1, 1), module.ff_scale)) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.self_attn.chunk_size - time2 = self.self_attn.time - h, d_k = self.self_attn.h, self.self_attn.d_k - random_x = torch.randn(1, time1, self.size) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) - original_x, _, original_att_cache, original_cnn_cache = module( - random_x, att_mask[:, 0, :, :], torch.empty(0), - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.transpose(1, 2).unsqueeze(2) - att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.unsqueeze(2) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_mask, att_cache, cnn_cache - ) - np.testing.assert_allclose( - to_numpy(original_att_cache), - to_numpy(new_att_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cnn_cache), - to_numpy(new_cnn_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, att_mask: torch.Tensor, - att_cache: torch.Tensor, cnn_cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, size, 1, chunk_size) - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, d_k * 2, cache_t1), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, 1, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: att_cache tensor, - (1, head, d_k * 2, cache_t1 + chunk_size). - torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2). - """ - # 1. ffn_macaron - residual = x - x = self.norm_ff_macron(x) - x = residual + self.ff_scale * self.feed_forward_macaron(x) - - # 2. 
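# Both `denom` (in BPUMultiHeadedAttention) and `ff_scale` above are stored with
# register_buffer as (1, C, 1, 1) tensors instead of Python floats, presumably so every
# constant stays 4-D like the activations; a buffer also travels with state_dict and .to().
# A minimal standalone version of that pattern (class name is illustrative):
import torch

class ScaledBranch(torch.nn.Module):
    def __init__(self, size: int, scale: float):
        super().__init__()
        # a buffer is saved and exported with the module, but is not a trainable Parameter
        self.register_buffer("scale4d", torch.full((1, size, 1, 1), scale))

    def forward(self, x: torch.Tensor, branch: torch.Tensor) -> torch.Tensor:
        # x, branch: (batch, size, 1, chunk_size)
        return x + self.scale4d * branch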
attention - residual = x - x = self.norm_mha(x) - x_att, new_att_cache = self.self_attn( - x, x, x, att_mask, att_cache) - x = residual + x_att - - # 3. convolution - residual = x - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, cnn_cache) - x = residual + x - - # 4. ffn - residual = x - x = self.norm_ff(x) - x = residual + self.ff_scale * self.feed_forward(x) - - # 5. final post-norm - x = self.norm_final(x) - - return x, new_att_cache, new_cnn_cache - - -class BPUConformerEncoder(torch.nn.Module): - """Refactor wenet/transformer/encoder.py::ConformerEncoder - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - output_size = module.output_size() - self._output_size = module.output_size() - self.after_norm = module.after_norm - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.head = module.encoders[0].self_attn.h - self.layers = len(module.encoders) - - # 1. Modify submodules - self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) - self.embed = BPUConv2dSubsampling8(module.embed) - self.encoders = torch.nn.ModuleList() - for layer in module.encoders: - self.encoders.append(BPUConformerEncoderLayer( - layer, chunk_size, left_chunks, ln_run_on_bpu)) - - # 2. Auxiliary conv - self.identity_cnncache = BPUIdentity(output_size) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.encoders[0].self_attn.chunk_size - time2 = self.encoders[0].self_attn.time - layers = self.layers - h, d_k = self.head, self.encoders[0].self_attn.d_k - decoding_window = (self.chunk_size - 1) * \ - module.embed.subsampling_rate + \ - module.embed.right_context + 1 - lorder = self.encoders[0].conv_module.lorder - random_x = torch.randn(1, decoding_window, 80) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) - orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( - random_x, 0, time2 - time1, att_mask=att_mask[:, 0, :, :], - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.unsqueeze(0) - att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_cache, cnn_cache, att_mask - ) - caches = torch.split(new_att_cache, h, dim=1) - caches = [c.transpose(2, 3) for c in caches] - np.testing.assert_allclose( - to_numpy(orig_att_cache), - to_numpy(torch.cat(caches, dim=0)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_cnn_cache), - to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, xs: torch.Tensor, att_cache: torch.Tensor, - cnn_cache: torch.Tensor, att_mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (1, head * elayers, d_k * 2, cache_t1), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (1, hidden-dim, elayers, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, hidden-dim, 1, chunk_size). - torch.Tensor: new attention cache required for next chunk, with - same shape as the original att_cache. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - """ - # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time) - xs = xs.transpose(2, 3) - xs = self.global_cmvn(xs) - # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size) - xs = self.embed(xs) - - att_cache = torch.split(att_cache, self.head, dim=1) - cnn_cache = self.identity_cnncache(cnn_cache) - cnn_cache = torch.split(cnn_cache, 1, dim=2) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - xs, new_att_cache, new_cnn_cache = layer( - xs, att_mask, att_cache=att_cache[i], cnn_cache=cnn_cache[i]) - r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:]) - r_cnn_cache.append(new_cnn_cache) - r_att_cache = torch.cat(r_att_cache, dim=1) - r_cnn_cache = self.identity_cnncache( - torch.cat(r_cnn_cache, dim=2)) - - xs = xs.squeeze(2).transpose(1, 2).contiguous() - xs = self.after_norm(xs) - # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input. - xs = xs.transpose(1, 2).contiguous().unsqueeze(2) # (B, C, 1, T) - - return (xs, r_att_cache, r_cnn_cache) - - -class BPUCTC(torch.nn.Module): - """Refactor wenet/transformer/ctc.py::CTC - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.ctc_lo.weight.size(1) - num_class = module.ctc_lo.weight.size(0) - - # 1. Modify self.ctc_lo, Split final projection to meet the - # requirment of maximum in/out channels (2048 for XJ3) - self.ctc_lo = torch.nn.ModuleList() - self.split_size = [] - num_split = (num_class - 1) // 2048 + 1 - for idx in range(num_split): - out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 - conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) - self.ctc_lo.append(conv_ele) - self.split_size.append(out_channel) - orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) - orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) - for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): - w = w.unsqueeze(2).unsqueeze(3) - self.ctc_lo[i].weight = torch.nn.Parameter(w) - self.ctc_lo[i].bias = torch.nn.Parameter(b) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 100, self.idim) - original_result = module.ctc_lo(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """frame activations, without softmax. 
- - Args: - Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) - Returns: - torch.Tensor: (B, num_class, 1, chunk_size) - """ - out = [] - for i, layer in enumerate(self.ctc_lo): - out.append(layer(x)) - out = torch.cat(out, dim=1) - return out - - -def export_encoder(asr_model, args): - logger.info("Stage-1: export encoder") - decode_window, mel_dim = args.decoding_window, args.feature_size - encoder = BPUConformerEncoder( - asr_model.encoder, args.chunk_size, args.num_decoding_left_chunks, - args.ln_run_on_bpu) - encoder.eval() - encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx') - - logger.info("Stage-1.1: prepare inputs for encoder") - chunk = torch.randn((1, 1, decode_window, mel_dim)) - required_cache_size = encoder.chunk_size * encoder.left_chunks - kv_time = required_cache_size + encoder.chunk_size - hidden, layers = encoder._output_size, len(encoder.encoders) - head = encoder.encoders[0].self_attn.h - d_k = hidden // head - lorder = encoder.encoders[0].conv_module.lorder - att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) - att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros((1, hidden, layers, lorder)) - inputs = (chunk, att_cache, cnn_cache, att_mask) - logger.info("chunk.size(): {} att_cache.size(): {} " - "cnn_cache.size(): {} att_mask.size(): {}".format( - list(chunk.size()), list(att_cache.size()), - list(cnn_cache.size()), list(att_mask.size()))) - - logger.info("Stage-1.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask" - attributes['output_name'] = "output;r_att_cache;r_cnn_cache" - attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap" - attributes['norm_type'] = \ - "no_preprocess;no_preprocess;no_preprocess;no_preprocess" - attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_shape'] = \ - "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( - chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3), - att_cache.size(0), att_cache.size(1), att_cache.size(2), - att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1), - cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0), - att_mask.size(1), att_mask.size(2), att_mask.size(3) - ) - torch.onnx.export( # NOTE(xcsong): only support opset==11 - encoder, inputs, encoder_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=attributes['input_name'].split(';'), - output_names=attributes['output_name'].split(';'), - dynamic_axes=None, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for k in vars(args): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - logger.info('Export onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - logger.info("Stage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - for i in range(10): - logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" - ", att_mask: {}".format( - i, list(torch_chunk.size()), - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), - list(torch_att_mask.size()))) - torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_output = torch.cat(torch_output, dim=-1) - - onnx_output = [] - onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - logger.info("onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," - " att_mask: {}".format( - i, onnx_chunk.shape, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': onnx_att_mask, - } - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_output = np.concatenate(onnx_output, axis=-1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_encoder, pass!") - return encoder, ort_session - - -def export_ctc(asr_model, args): - logger.info("Stage-2: export ctc") - ctc = BPUCTC(asr_model.ctc).eval() - ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx') - - logger.info("Stage-2.1: prepare inputs for ctc") - hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) - - logger.info("Stage-2.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'], attributes['input_type'] = "hidden", "featuremap" - attributes['norm_type'] = "no_preprocess" - attributes['input_layout_train'] = "NCHW" - attributes['input_layout_rt'] = "NCHW" - attributes['input_shape'] = "{}x{}x{}x{}".format( - hidden.size(0), hidden.size(1), hidden.size(2), hidden.size(3), - ) - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=None, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for k in vars(args): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - logger.info('Export onnx_ctc, done! 
see {}'.format(ctc_outpath))
-
-    logger.info("Stage-2.3: check onnx_ctc and torch_ctc")
-    torch_output = ctc(hidden)
-    ort_session = onnxruntime.InferenceSession(ctc_outpath)
-    onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)})
-
-    np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0],
-                               rtol=1e-03, atol=1e-04)
-    meta = ort_session.get_modelmeta()
-    logger.info("custom_metadata_map={}".format(meta.custom_metadata_map))
-    logger.info("Check onnx_ctc, pass!")
-    return ctc, ort_session
-
-
-def export_decoder(asr_model, args):
-    logger.info("Currently, Decoder is not supported.")
-
-
-if __name__ == '__main__':
-    torch.manual_seed(777)
-    args = get_args()
-    args.ln_run_on_bpu = False
-    # NOTE(xcsong): XJ3 BPU only support static shapes
-    assert args.chunk_size > 0
-    assert args.num_decoding_left_chunks > 0
-    os.system("mkdir -p " + args.output_dir)
-    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
-
-    with open(args.config, 'r') as fin:
-        configs = yaml.load(fin, Loader=yaml.FullLoader)
-
-    model = init_model(configs)
-    load_checkpoint(model, args.checkpoint)
-    model.eval()
-    print(model)
-
-    args.feature_size = configs['input_dim']
-    args.output_size = model.encoder.output_size()
-    args.decoding_window = (args.chunk_size - 1) * \
-        model.encoder.embed.subsampling_rate + \
-        model.encoder.embed.right_context + 1
-
-    export_encoder(model, args)
-    export_ctc(model, args)
-    export_decoder(model, args)
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/export_onnx_cpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/export_onnx_cpu.py
deleted file mode 100644
index a8009d2f606f753a5870eb754235d8d55e756b5d..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/export_onnx_cpu.py
+++ /dev/null
@@ -1,411 +0,0 @@
-# Copyright (c) 2022, Xingchen Song (sxc19@mails.tsinghua.edu.cn)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
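# Both the BPU export script above and the deleted export_onnx_cpu.py below stamp the
# exported graphs with metadata_props and read them back through onnxruntime. A minimal
# standalone version of that round trip (the path and helper name are placeholders):
import onnx
import onnxruntime

def stamp_and_read(onnx_path: str, info: dict) -> dict:
    model = onnx.load(onnx_path)
    for key, value in info.items():
        meta = model.metadata_props.add()
        meta.key, meta.value = str(key), str(value)
    onnx.save(model, onnx_path)                        # re-save with the metadata attached
    sess = onnxruntime.InferenceSession(onnx_path)
    return sess.get_modelmeta().custom_metadata_map    # e.g. {'chunk_size': '16', ...}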
- -from __future__ import print_function - -import argparse -import os -import copy -import sys - -import torch -import yaml -import numpy as np - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - -try: - import onnx - import onnxruntime - from onnxruntime.quantization import quantize_dynamic, QuantType -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - args = parser.parse_args() - return args - - -def to_numpy(tensor): - if tensor.requires_grad: - return tensor.detach().cpu().numpy() - else: - return tensor.cpu().numpy() - - -def print_input_output_info(onnx_model, name, prefix="\t\t"): - input_names = [node.name for node in onnx_model.graph.input] - input_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.input] - output_names = [node.name for node in onnx_model.graph.output] - output_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.output] - print("{}{} inputs : {}".format(prefix, name, input_names)) - print("{}{} input shapes : {}".format(prefix, name, input_shapes)) - print("{}{} outputs: {}".format(prefix, name, output_names)) - print("{}{} output shapes : {}".format(prefix, name, output_shapes)) - - -def export_encoder(asr_model, args): - print("Stage-1: export encoder") - encoder = asr_model.encoder - encoder.forward = encoder.forward_chunk - encoder_outpath = os.path.join(args['output_dir'], 'encoder.onnx') - - print("\tStage-1.1: prepare inputs for encoder") - chunk = torch.randn( - (args['batch'], args['decoding_window'], args['feature_size'])) - offset = 0 - # NOTE(xcsong): The uncertainty of `next_cache_start` only appears - # in the first few chunks, this is caused by dynamic att_cache shape, i,e - # (0, 0, 0, 0) for 1st chunk and (elayers, head, ?, d_k*2) for subsequent - # chunks. One way to ease the ONNX export is to keep `next_cache_start` - # as a fixed value. To do this, for the **first** chunk, if - # left_chunks > 0, we feed real cache & real mask to the model, otherwise - # fake cache & fake mask. In this way, we get: - # 1. 16/-1 mode: next_cache_start == 0 for all chunks - # 2. 16/4 mode: next_cache_start == chunk_size for all chunks - # 3. 16/0 mode: next_cache_start == chunk_size for all chunks - # 4. -1/-1 mode: next_cache_start == 0 for all chunks - # NO MORE DYNAMIC CHANGES!! - # - # NOTE(Mddct): We retain the current design for the convenience of supporting some - # inference frameworks without dynamic shapes. 
If you're interested in all-in-one - # model that supports different chunks please see: - # https://github.com/wenet-e2e/wenet/pull/1174 - - if args['left_chunks'] > 0: # 16/4 - required_cache_size = args['chunk_size'] * args['left_chunks'] - offset = required_cache_size - # Real cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], required_cache_size, - args['output_size'] // args['head'] * 2)) - # Real mask - att_mask = torch.ones( - (args['batch'], 1, required_cache_size + args['chunk_size']), - dtype=torch.bool) - att_mask[:, :, :required_cache_size] = 0 - elif args['left_chunks'] <= 0: # 16/-1, -1/-1, 16/0 - required_cache_size = -1 if args['left_chunks'] < 0 else 0 - # Fake cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], 0, - args['output_size'] // args['head'] * 2)) - # Fake mask - att_mask = torch.ones((0, 0, 0), dtype=torch.bool) - cnn_cache = torch.zeros( - (args['num_blocks'], args['batch'], - args['output_size'], args['cnn_module_kernel'] - 1)) - inputs = (chunk, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - print("\t\tchunk.size(): {}\n".format(chunk.size()), - "\t\toffset: {}\n".format(offset), - "\t\trequired_cache: {}\n".format(required_cache_size), - "\t\tatt_cache.size(): {}\n".format(att_cache.size()), - "\t\tcnn_cache.size(): {}\n".format(cnn_cache.size()), - "\t\tatt_mask.size(): {}\n".format(att_mask.size())) - - print("\tStage-1.2: torch.onnx.export") - dynamic_axes = { - 'chunk': {1: 'T'}, - 'att_cache': {2: 'T_CACHE'}, - 'att_mask': {2: 'T_ADD_T_CACHE'}, - 'output': {1: 'T'}, - 'r_att_cache': {2: 'T_CACHE'}, - } - # NOTE(xcsong): We keep dynamic axes even if in 16/4 mode, this is - # to avoid padding the last chunk (which usually contains less - # frames than required). For users who want static axes, just pop - # out specific axis. - # if args['chunk_size'] > 0: # 16/4, 16/-1, 16/0 - # dynamic_axes.pop('chunk') - # dynamic_axes.pop('output') - # if args['left_chunks'] >= 0: # 16/4, 16/0 - # # NOTE(xsong): since we feed real cache & real mask into the - # # model when left_chunks > 0, the shape of cache will never - # # be changed. - # dynamic_axes.pop('att_cache') - # dynamic_axes.pop('r_att_cache') - torch.onnx.export( - encoder, inputs, encoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=[ - 'chunk', 'offset', 'required_cache_size', - 'att_cache', 'cnn_cache', 'att_mask' - ], - output_names=['output', 'r_att_cache', 'r_cnn_cache'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for (k, v) in args.items(): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - # NOTE(xcsong): to add those metadatas we need to reopen - # the file and resave it. - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - # Dynamic quantization - model_fp32 = encoder_outpath - model_quant = os.path.join(args['output_dir'], 'encoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - print("\tStage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk = copy.deepcopy(chunk) - torch_offset = copy.deepcopy(offset) - torch_required_cache_size = copy.deepcopy(required_cache_size) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - torch_att_mask = copy.deepcopy(att_mask) - for i in range(10): - print("\t\ttorch chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, list(torch_chunk.size()), torch_offset, - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), list(torch_att_mask.size()))) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - torch_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_offset, torch_required_cache_size, - torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_offset += out.size(1) - torch_output = torch.cat(torch_output, dim=1) - - onnx_output = [] - onnx_chunk = to_numpy(chunk) - onnx_offset = np.array((offset)).astype(np.int64) - onnx_required_cache_size = np.array((required_cache_size)).astype(np.int64) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - onnx_att_mask = to_numpy(att_mask) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - print("\t\tonnx chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, onnx_chunk.shape, onnx_offset, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - onnx_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'offset': onnx_offset, - 'required_cache_size': onnx_required_cache_size, - 'att_cache': onnx_att_cache, 'cnn_cache': onnx_cnn_cache, - 'att_mask': onnx_att_mask - } - # NOTE(xcsong): If we use 16/-1, -1/-1 or 16/0 mode, `next_cache_start` - # will be hardcoded to 0 or chunk_size by ONNX, thus - # required_cache_size and att_mask are no more needed and they will - # be removed by ONNX automatically. 
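# Standalone version of the input-pruning idiom used right below: when ONNX folds
# `required_cache_size` / `att_mask` away (16/-1, -1/-1 and 16/0 modes), the feed dict
# has to be trimmed to the inputs the exported graph actually kept. The helper name is
# illustrative; the script itself reads the names from onnx_encoder.graph.input instead.
import onnxruntime

def prune_feed(session: onnxruntime.InferenceSession, feed: dict) -> dict:
    graph_inputs = {node.name for node in session.get_inputs()}
    return {name: value for name, value in feed.items() if name in graph_inputs}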
- for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_offset += ort_outs[0].shape[1] - onnx_output = np.concatenate(onnx_output, axis=1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-05) - meta = ort_session.get_modelmeta() - print("\t\tcustom_metadata_map={}".format(meta.custom_metadata_map)) - print("\t\tCheck onnx_encoder, pass!") - - -def export_ctc(asr_model, args): - print("Stage-2: export ctc") - ctc = asr_model.ctc - ctc.forward = ctc.log_softmax - ctc_outpath = os.path.join(args['output_dir'], 'ctc.onnx') - - print("\tStage-2.1: prepare inputs for ctc") - hidden = torch.randn( - (args['batch'], args['chunk_size'] if args['chunk_size'] > 0 else 16, - args['output_size'])) - - print("\tStage-2.2: torch.onnx.export") - dynamic_axes = {'hidden': {1: 'T'}, 'probs': {1: 'T'}} - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for (k, v) in args.items(): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - # Dynamic quantization - model_fp32 = ctc_outpath - model_quant = os.path.join(args['output_dir'], 'ctc.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_ctc, done! see {}'.format(ctc_outpath)) - - print("\tStage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_ctc, pass!") - - -def export_decoder(asr_model, args): - print("Stage-3: export decoder") - decoder = asr_model - # NOTE(lzhin): parameters of encoder will be automatically removed - # since they are not used during rescoring. - decoder.forward = decoder.forward_attention_decoder - decoder_outpath = os.path.join(args['output_dir'], 'decoder.onnx') - - print("\tStage-3.1: prepare inputs for decoder") - # hardcode time->200 nbest->10 len->20, they are dynamic axes. 
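    # (Since time, nbest and len are declared as dynamic axes ('T', 'NBEST', 'L') in
    # Stage-3.2 below, the concrete 200/10/20 values only fix the shapes of the tracing
    # inputs; the exported decoder accepts other lengths at run time.)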
- encoder_out = torch.randn((1, 200, args['output_size'])) - hyps = torch.randint(low=0, high=args['vocab_size'], - size=[10, 20]) - hyps[:, 0] = args['vocab_size'] - 1 # - hyps_lens = torch.randint(low=15, high=21, size=[10]) - - print("\tStage-3.2: torch.onnx.export") - dynamic_axes = { - 'hyps': {0: 'NBEST', 1: 'L'}, 'hyps_lens': {0: 'NBEST'}, - 'encoder_out': {1: 'T'}, - 'score': {0: 'NBEST', 1: 'L'}, 'r_score': {0: 'NBEST', 1: 'L'} - } - inputs = (hyps, hyps_lens, encoder_out, args['reverse_weight']) - torch.onnx.export( - decoder, inputs, decoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hyps', 'hyps_lens', 'encoder_out', 'reverse_weight'], - output_names=['score', 'r_score'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_decoder = onnx.load(decoder_outpath) - for (k, v) in args.items(): - meta = onnx_decoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_decoder) - onnx.helper.printable_graph(onnx_decoder.graph) - onnx.save(onnx_decoder, decoder_outpath) - print_input_output_info(onnx_decoder, "onnx_decoder") - model_fp32 = decoder_outpath - model_quant = os.path.join(args['output_dir'], 'decoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_decoder, done! see {}'.format( - decoder_outpath)) - - print("\tStage-3.3: check onnx_decoder and torch_decoder") - torch_score, torch_r_score = decoder( - hyps, hyps_lens, encoder_out, args['reverse_weight']) - ort_session = onnxruntime.InferenceSession(decoder_outpath) - input_names = [node.name for node in onnx_decoder.graph.input] - ort_inputs = { - 'hyps': to_numpy(hyps), - 'hyps_lens': to_numpy(hyps_lens), - 'encoder_out': to_numpy(encoder_out), - 'reverse_weight': np.array((args['reverse_weight'])), - } - for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - onnx_output = ort_session.run(None, ort_inputs) - - np.testing.assert_allclose(to_numpy(torch_score), onnx_output[0], - rtol=1e-03, atol=1e-05) - if args['is_bidirectional_decoder'] and args['reverse_weight'] > 0.0: - np.testing.assert_allclose(to_numpy(torch_r_score), onnx_output[1], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_decoder, pass!") - - -def main(): - torch.manual_seed(777) - args = get_args() - output_dir = args.output_dir - os.system("mkdir -p " + output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - arguments = {} - arguments['output_dir'] = output_dir - arguments['batch'] = 1 - arguments['chunk_size'] = args.chunk_size - arguments['left_chunks'] = args.num_decoding_left_chunks - arguments['reverse_weight'] = args.reverse_weight - arguments['output_size'] = configs['encoder_conf']['output_size'] - arguments['num_blocks'] = configs['encoder_conf']['num_blocks'] - arguments['cnn_module_kernel'] = configs['encoder_conf'].get('cnn_module_kernel', 1) - arguments['head'] = configs['encoder_conf']['attention_heads'] - arguments['feature_size'] = configs['input_dim'] - arguments['vocab_size'] = configs['output_dim'] - # NOTE(xcsong): if chunk_size == -1, hardcode to 67 - arguments['decoding_window'] = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 if args.chunk_size > 0 else 67 - arguments['encoder'] = configs['encoder'] - 
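# Worked example of the decoding_window formula above, assuming wenet's Conv2dSubsampling4
# front end (subsampling_rate=4, right_context=6), which is also what makes the
# "hardcode to 67" fallback in the NOTE above consistent:
chunk_size, subsampling_rate, right_context = 16, 4, 6
decoding_window = (chunk_size - 1) * subsampling_rate + right_context + 1
assert decoding_window == 67   # raw feature frames consumed per 16-frame decoded chunk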
arguments['decoder'] = configs['decoder'] - arguments['subsampling_rate'] = model.subsampling_rate() - arguments['right_context'] = model.right_context() - arguments['sos_symbol'] = model.sos_symbol() - arguments['eos_symbol'] = model.eos_symbol() - arguments['is_bidirectional_decoder'] = 1 \ - if model.is_bidirectional_decoder() else 0 - - # NOTE(xcsong): Please note that -1/-1 means non-streaming model! It is - # not a [16/4 16/-1 16/0] all-in-one model and it should not be used in - # streaming mode (i.e., setting chunk_size=16 in `decoder_main`). If you - # want to use 16/-1 or any other streaming mode in `decoder_main`, - # please export onnx in the same config. - if arguments['left_chunks'] > 0: - assert arguments['chunk_size'] > 0 # -1/4 not supported - - export_encoder(model, arguments) - export_ctc(model, arguments) - export_decoder(model, arguments) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/export_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/export_onnx_gpu.py deleted file mode 100644 index 19f810c2804efdf74ff369f780fa3102e2e389fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/export_onnx_gpu.py +++ /dev/null @@ -1,1056 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import os -import sys - -import torch -import yaml -import logging - -import torch.nn.functional as F -from wenet.utils.checkpoint import load_checkpoint -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import BaseEncoder -from wenet.utils.init_model import init_model -from wenet.utils.mask import make_pad_mask - -try: - import onnxruntime -except ImportError: - print('Please install onnxruntime-gpu!') - sys.exit(1) - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class Encoder(torch.nn.Module): - def __init__(self, - encoder: BaseEncoder, - ctc: CTC, - beam_size: int = 10): - super().__init__() - self.encoder = encoder - self.ctc = ctc - self.beam_size = beam_size - - def forward(self, speech: torch.Tensor, - speech_lengths: torch.Tensor,): - """Encoder - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - Returns: - encoder_out: B x T x F - encoder_out_lens: B - ctc_log_probs: B x T x V - beam_log_probs: B x T x beam_size - beam_log_probs_idx: B x T x beam_size - """ - encoder_out, encoder_mask = self.encoder(speech, - speech_lengths, - -1, -1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_log_probs = self.ctc.log_softmax(encoder_out) - encoder_out_lens = encoder_out_lens.int() - beam_log_probs, beam_log_probs_idx = torch.topk( - ctc_log_probs, self.beam_size, dim=2) - return encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx - - -class StreamingEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size, transformer=False): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.transformer = transformer - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoder.encoders): - xs, _, new_att_cache, new_cnn_cache = layer( - xs, masks, pos_emb, - att_cache=att_cache[i], - cnn_cache=cnn_cache[i]) - # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (B, hidden-dim, cache_t2) - r_att_cache.append( - new_att_cache[:, :, next_cache_start:, :].unsqueeze(1)) - if not self.transformer: - r_cnn_cache.append(new_cnn_cache.unsqueeze(1)) - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - if not self.transformer: - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingSqueezeformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.reduce_idx = model.encoder.reduce_idx - self.recover_idx = model.encoder.recover_idx - if self.reduce_idx is None: - self.time_reduce = None - else: - if self.recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - 
recover_exp)) - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - elayers, cache_size = att_cache.size(0), att_cache.size(3) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = att_mask[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.encoder.preln(xs) - for i, layer in enumerate(self.encoder.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append( - (xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.encoder.time_reduction_layer( - xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - if self.encoder.pos_enc_layer_type == "rel_pos_repaired": - pos_emb = 
pos_emb[:, :xs.size(1) * 2 - 1, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.encoder.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(1) - cached_att = cached_att.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1)) - r_cnn_cache.append(cached_cnn) - - chunk_out = xs - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingEfficientConformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - - # Efficient Conformer - self.stride_layer_idx = model.encoder.stride_layer_idx - self.stride = model.encoder.stride - self.num_blocks = model.encoder.num_blocks - self.cnn_module_kernel = model.encoder.cnn_module_kernel - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - chunk_lens (torch.Tensor): - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * 
num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) # (b, ) - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - # Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2) - # Shape(cnn_cache): (elayers, b, outsize, cnn_kernel) - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = chunk_mask.to(torch.bool) - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoder.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - # We propose to double the chunk_size. 
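                # (i.e. when xs plus the factor-subsampled cache would run past the
                # available positional embedding, the oldest `att_cache_trunc` frames of
                # the cache are dropped via the `[:, :, att_cache_trunc:, :]` slice below
                # so that the attended length fits pos_emb again.)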
- att_cache_trunc = xs.size(1) + \ - att_cache.size(3) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i][:, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [batch, 1, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID - - # use repeat_interleave to new_att_cache - # new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - new_att_cache = new_att_cache.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :].unsqueeze(1)) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - # shape of r_att_cache: (b, elayers, head, time2, outdim) - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - # shape of r_cnn_cache: (b, elayers, outdim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - chunk_out_lens += 1 - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class Decoder(torch.nn.Module): - def __init__(self, - decoder: TransformerDecoder, - ctc_weight: float = 0.5, - reverse_weight: float = 0.0, - beam_size: int = 10, - decoder_fastertransformer: bool = False): - super().__init__() - self.decoder = decoder - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - self.beam_size = beam_size - self.decoder_fastertransformer = decoder_fastertransformer - - def forward(self, - encoder_out: torch.Tensor, - encoder_lens: torch.Tensor, - hyps_pad_sos_eos: torch.Tensor, - hyps_lens_sos: torch.Tensor, - r_hyps_pad_sos_eos: torch.Tensor, - ctc_score: torch.Tensor): - """Encoder - Args: - encoder_out: B x T x F - encoder_lens: B - hyps_pad_sos_eos: B x beam x (T2+1), - hyps with sos & eos and padded by ignore id - hyps_lens_sos: B x beam, length for each hyp with sos - r_hyps_pad_sos_eos: B 
x beam x (T2+1), - reversed hyps with sos & eos and padded by ignore id - ctc_score: B x beam, ctc score for each hyp - Returns: - decoder_out: B x beam x T2 x V - r_decoder_out: B x beam x T2 x V - best_index: B - """ - B, T, F = encoder_out.shape - bz = self.beam_size - B2 = B * bz - encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F) - encoder_mask = ~make_pad_mask(encoder_lens, T).unsqueeze(1) - encoder_mask = encoder_mask.repeat(1, bz, 1).view(B2, 1, T) - T2 = hyps_pad_sos_eos.shape[2] - 1 - hyps_pad = hyps_pad_sos_eos.view(B2, T2 + 1) - hyps_lens = hyps_lens_sos.view(B2,) - hyps_pad_sos = hyps_pad[:, :-1].contiguous() - hyps_pad_eos = hyps_pad[:, 1:].contiguous() - - r_hyps_pad = r_hyps_pad_sos_eos.view(B2, T2 + 1) - r_hyps_pad_sos = r_hyps_pad[:, :-1].contiguous() - r_hyps_pad_eos = r_hyps_pad[:, 1:].contiguous() - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad_sos, hyps_lens, r_hyps_pad_sos, - self.reverse_weight) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - V = decoder_out.shape[-1] - decoder_out = decoder_out.view(B2, T2, V) - mask = ~make_pad_mask(hyps_lens, T2) # B2 x T2 - # mask index, remove ignore id - index = torch.unsqueeze(hyps_pad_eos * mask, 2) - score = decoder_out.gather(2, index).squeeze(2) # B2 X T2 - # mask padded part - score = score * mask - decoder_out = decoder_out.view(B, bz, T2, V) - if self.reverse_weight > 0: - r_decoder_out = torch.nn.functional.log_softmax( - r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.view(B2, T2, V) - index = torch.unsqueeze(r_hyps_pad_eos * mask, 2) - r_score = r_decoder_out.gather(2, index).squeeze(2) - r_score = r_score * mask - score = score * (1 - self.reverse_weight) + \ - self.reverse_weight * r_score - r_decoder_out = r_decoder_out.view(B, bz, T2, V) - score = torch.sum(score, axis=1) # B2 - score = torch.reshape(score, (B, bz)) + self.ctc_weight * ctc_score - best_index = torch.argmax(score, dim=1) - if self.decoder_fastertransformer: - return decoder_out, best_index - else: - return best_index - - -def to_numpy(tensors): - out = [] - if type(tensors) == torch.tensor: - tensors = [tensors] - for tensor in tensors: - if tensor.requires_grad: - tensor = tensor.detach().cpu().numpy() - else: - tensor = tensor.cpu().numpy() - out.append(tensor) - return out - - -def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True): - for a, b in zip(xlist, blist): - try: - torch.testing.assert_allclose(a, b, rtol=rtol, atol=atol) - except AssertionError as error: - if tolerate_small_mismatch: - print(error) - else: - raise - - -def export_offline_encoder(model, configs, args, logger, encoder_onnx_path): - bz = 32 - seq_len = 100 - beam_size = args.beam_size - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint( - low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - dynamic_axes={ - 'speech': {0: 'B', 1: 'T'}, - 'speech_lengths': {0: 'B'}, - 'encoder_out': {0: 'B', 1: 'T_OUT'}, - 'encoder_out_lens': {0: 'B'}, - 'ctc_log_probs': {0: 'B', 1: 'T_OUT'}, - 'beam_log_probs': {0: 'B', 1: 
'T_OUT'}, - 'beam_log_probs_idx': {0: 'B', 1: 'T_OUT'}, - }, - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - -def export_offline_encoder_static(model, configs, args, logger, encoder_onnx_path): - bz = args.batch_size - seq_len = args.seq_len - beam_size = args.beam_size - - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint(low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - import os - file_name, file_ext = os.path.splitext(encoder_onnx_path) - encoder_onnx_path = file_name + "_bs" + str(bz) + "_seq" + str(seq_len) + "_static.onnx" - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CPUExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - - -def export_online_encoder(model, configs, args, logger, encoder_onnx_path): - decoding_chunk_size = args.decoding_chunk_size - subsampling = model.encoder.embed.subsampling_rate - context = model.encoder.embed.right_context + 1 - decoding_window = (decoding_chunk_size - 1) * subsampling + context - batch_size = 32 - audio_len = decoding_window - feature_size = configs["input_dim"] - output_size = configs["encoder_conf"]["output_size"] - num_layers = configs["encoder_conf"]["num_blocks"] - # in transformer the cnn module will not be available - transformer = False - cnn_module_kernel = configs["encoder_conf"].get("cnn_module_kernel", 1) - 1 - if not cnn_module_kernel: - transformer = True - num_decoding_left_chunks = args.num_decoding_left_chunks - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if configs['encoder'] == 'squeezeformer': - encoder = StreamingSqueezeformerEncoder( - model, required_cache_size, args.beam_size) - elif configs['encoder'] == 'efficientConformer': - encoder = StreamingEfficientConformerEncoder( - model, required_cache_size, args.beam_size) - else: - encoder = StreamingEncoder( - model, required_cache_size, args.beam_size, transformer) - encoder.eval() - - # begin to export encoder - chunk_xs = 
torch.randn(batch_size, audio_len, - feature_size, dtype=torch.float32) - chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len - - offset = torch.arange(0, batch_size).unsqueeze(1) - # (elayers, b, head, cache_t1, d_k * 2) - head = configs["encoder_conf"]["attention_heads"] - d_k = configs["encoder_conf"]["output_size"] // head - att_cache = torch.randn(batch_size, num_layers, head, - required_cache_size, d_k * 2, - dtype=torch.float32) - cnn_cache = torch.randn(batch_size, num_layers, output_size, - cnn_module_kernel, dtype=torch.float32) - - cache_mask = torch.ones( - batch_size, 1, required_cache_size, dtype=torch.float32) - input_names = ['chunk_xs', 'chunk_lens', 'offset', - 'att_cache', 'cnn_cache', 'cache_mask'] - output_names = ['log_probs', 'log_probs_idx', 'chunk_out', - 'chunk_out_lens', 'r_offset', 'r_att_cache', - 'r_cnn_cache', 'r_cache_mask'] - input_tensors = (chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - output_names.pop(6) - - all_names = input_names + output_names - dynamic_axes = {} - for name in all_names: - # only the first dimension is dynamic - # all other dimension is fixed - dynamic_axes[name] = {0: 'B'} - - torch.onnx.export(encoder, - input_tensors, - encoder_onnx_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - verbose=False) - - with torch.no_grad(): - torch_outs = encoder(chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - torch_outs = list(torch_outs).pop(6) - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=["CUDAExecutionProvider"]) - ort_inputs = {} - - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - if transformer: - del ort_inputs['cnn_cache'] - ort_outs = ort_session.run(None, ort_inputs) - test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx streaming encoder succeed!") - onnx_config = { - "subsampling_rate": subsampling, - "context": context, - "decoding_chunk_size": decoding_chunk_size, - "num_decoding_left_chunks": num_decoding_left_chunks, - "beam_size": args.beam_size, - "fp16": args.fp16, - "feat_size": feature_size, - "decoding_window": decoding_window, - "cnn_module_kernel_cache": cnn_module_kernel - } - return onnx_config - - -def export_rescoring_decoder(model, configs, args, - logger, decoder_onnx_path, decoder_fastertransformer): - bz, seq_len = 32, 100 - beam_size = args.beam_size - decoder = Decoder(model.decoder, - model.ctc_weight, - model.reverse_weight, - beam_size, - decoder_fastertransformer) - decoder.eval() - - hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - hyps_lens_sos = torch.randint(low=3, high=seq_len, size=(bz, beam_size), - dtype=torch.int32) - r_hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - - output_size = configs["encoder_conf"]["output_size"] - encoder_out = torch.randn(bz, seq_len, output_size, dtype=torch.float32) - encoder_out_lens = torch.randint( - low=3, high=seq_len, size=(bz,), dtype=torch.int32) - ctc_score = torch.randn(bz, beam_size, dtype=torch.float32) - - input_names = ['encoder_out', 'encoder_out_lens', - 'hyps_pad_sos_eos', 'hyps_lens_sos', - 'r_hyps_pad_sos_eos', 'ctc_score'] - output_names = ['best_index'] - if decoder_fastertransformer: - output_names.insert(0, 'decoder_out') - - 
torch.onnx.export(decoder, - (encoder_out, encoder_out_lens, - hyps_pad_sos_eos, hyps_lens_sos, - r_hyps_pad_sos_eos, ctc_score), - decoder_onnx_path, - export_params=True, - opset_version=13, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes={'encoder_out': {0: 'B', 1: 'T'}, - 'encoder_out_lens': {0: 'B'}, - 'hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'hyps_lens_sos': {0: 'B'}, - 'r_hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'ctc_score': {0: 'B'}, - 'best_index': {0: 'B'}, - }, - verbose=False - ) - with torch.no_grad(): - o0 = decoder(encoder_out, - encoder_out_lens, - hyps_pad_sos_eos, - hyps_lens_sos, - r_hyps_pad_sos_eos, - ctc_score) - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(decoder_onnx_path, - providers=providers) - - input_tensors = [encoder_out, encoder_out_lens, hyps_pad_sos_eos, - hyps_lens_sos, r_hyps_pad_sos_eos, ctc_score] - ort_inputs = {} - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - - # if model.reverse weight == 0, - # the r_hyps_pad will be removed - # from the onnx decoder since it doen't play any role - if model.reverse_weight == 0: - del ort_inputs['r_hyps_pad_sos_eos'] - ort_outs = ort_session.run(None, ort_inputs) - - # check decoder output - if decoder_fastertransformer: - test(to_numpy(o0), ort_outs, rtol=1e-03, atol=1e-05) - else: - test(to_numpy([o0]), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx decoder succeed!") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='export x86_gpu model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--cmvn_file', required=False, default='', type=str, - help='global_cmvn file, default path is in config file') - parser.add_argument('--reverse_weight', default=-1.0, type=float, - required=False, - help='reverse weight for bitransformer,' + - 'default value is in config file') - parser.add_argument('--ctc_weight', default=-1.0, type=float, - required=False, - help='ctc weight, default value is in config file') - parser.add_argument('--batch_size', type=int, default=24, help='encoder batch size') - parser.add_argument('--seq_len', default=512, type=int, required=False, - help="Encoder seq_len") - parser.add_argument('--beam_size', default=10, type=int, required=False, - help="beam size would be ctc output size") - parser.add_argument('--output_onnx_dir', - default="onnx_model", - help='output onnx encoder and decoder directory') - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - # arguments for streaming encoder - parser.add_argument('--streaming', - action='store_true', - help="whether to export streaming encoder, default false") - parser.add_argument('--decoding_chunk_size', - default=16, - type=int, - required=False, - help='the decoding chunk size, <=0 is not supported') - parser.add_argument('--num_decoding_left_chunks', - default=5, - type=int, - required=False, - help="number of left chunks, <= 0 is not supported") - parser.add_argument('--decoder_fastertransformer', - action='store_true', - help='return decoder_out and best_index for ft') - args = parser.parse_args() - - torch.manual_seed(0) - torch.set_printoptions(precision=10) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if 
args.cmvn_file and os.path.exists(args.cmvn_file): - configs['cmvn_file'] = args.cmvn_file - if args.reverse_weight != -1.0 and 'reverse_weight' in configs['model_conf']: - configs['model_conf']['reverse_weight'] = args.reverse_weight - print("Update reverse weight to", args.reverse_weight) - if args.ctc_weight != -1: - print("Update ctc weight to ", args.ctc_weight) - configs['model_conf']['ctc_weight'] = args.ctc_weight - configs["encoder_conf"]["use_dynamic_chunk"] = False - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - - if not os.path.exists(args.output_onnx_dir): - os.mkdir(args.output_onnx_dir) - encoder_onnx_path = os.path.join(args.output_onnx_dir, 'encoder.onnx') - export_enc_func = None - if args.streaming: - assert args.decoding_chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - export_enc_func = export_online_encoder - else: - export_enc_func = export_offline_encoder_static - - onnx_config = export_enc_func( - model, configs, args, logger, encoder_onnx_path) - - decoder_onnx_path = os.path.join(args.output_onnx_dir, 'decoder.onnx') - export_rescoring_decoder(model, configs, args, logger, - decoder_onnx_path, args.decoder_fastertransformer) - - if args.fp16: - try: - import onnxmltools - from onnxmltools.utils.float16_converter import convert_float_to_float16 - except ImportError: - print('Please install onnxmltools!') - sys.exit(1) - encoder_onnx_model = onnxmltools.utils.load_model(encoder_onnx_path) - encoder_onnx_model = convert_float_to_float16(encoder_onnx_model) - encoder_onnx_path = os.path.join( - args.output_onnx_dir, 'encoder_fp16.onnx') - onnxmltools.utils.save_model(encoder_onnx_model, encoder_onnx_path) - decoder_onnx_model = onnxmltools.utils.load_model(decoder_onnx_path) - decoder_onnx_model = convert_float_to_float16(decoder_onnx_model) - decoder_onnx_path = os.path.join( - args.output_onnx_dir, 'decoder_fp16.onnx') - onnxmltools.utils.save_model(decoder_onnx_model, decoder_onnx_path) - # dump configurations - - config_dir = os.path.join(args.output_onnx_dir, "config.yaml") - with open(config_dir, "w") as out: - yaml.dump(onnx_config, out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/recognize.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/recognize.py deleted file mode 100644 index 03b5dfd42cc098efacd20e08756a5300f6477cc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/recognize.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
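# ---------------------------------------------------------------------------
# [Editor's illustration - not part of the patch above] The export script's
# main block ends by optionally converting both ONNX graphs to FP16 with
# onnxmltools and dumping config.yaml. A minimal parity check of the converted
# encoder could look like the sketch below; the file names, the 80-dim fbank
# feature size and the batch/sequence shapes are assumptions for illustration,
# not values taken from this repository.
import numpy as np
import onnxruntime as ort

fp32_sess = ort.InferenceSession("onnx_model/encoder.onnx",
                                 providers=["CPUExecutionProvider"])
fp16_sess = ort.InferenceSession("onnx_model/encoder_fp16.onnx",
                                 providers=["CPUExecutionProvider"])

speech = np.random.randn(24, 512, 80).astype(np.float32)    # (batch, frames, fbank dim)
lengths = np.full((24,), 512, dtype=np.int32)                # valid frames per utterance

out_fp32 = fp32_sess.run(None, {"speech": speech, "speech_lengths": lengths})
out_fp16 = fp16_sess.run(None, {"speech": speech.astype(np.float16),
                                "speech_lengths": lengths})

# encoder_out is the first declared output; FP16 should stay within a loose tolerance.
np.testing.assert_allclose(out_fp32[0], out_fp16[0].astype(np.float32),
                           rtol=1e-2, atol=1e-2)
# ---------------------------------------------------------------------------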
- -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--beam_size', - type=int, - default=10, - help='beam size for search') - parser.add_argument('--penalty', - type=float, - default=0.0, - help='length penalty') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=16, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'attention', 'ctc_greedy_search', - 'ctc_prefix_beam_search', 'attention_rescoring', - 'rnnt_greedy_search', 'rnnt_beam_search', - 'rnnt_beam_attn_rescoring', 'ctc_beam_td_attn_rescoring', - 'hlg_onebest', 'hlg_rescore' - ], - default='attention', - help='decoding mode') - - parser.add_argument('--search_ctc_weight', - type=float, - default=1.0, - help='ctc weight for nbest generation') - parser.add_argument('--search_transducer_weight', - type=float, - default=0.0, - help='transducer weight for nbest generation') - parser.add_argument('--ctc_weight', - type=float, - default=0.0, - help='ctc weight for rescoring weight in \ - attention rescoring decode mode \ - ctc weight for rescoring weight in \ - transducer attention rescore decode mode') - - parser.add_argument('--transducer_weight', - type=float, - default=0.0, - help='transducer weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--attn_weight', - type=float, - default=0.0, - help='attention weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--decoding_chunk_size', - type=int, - default=-1, - help='''decoding chunk size, - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here''') - parser.add_argument('--num_decoding_left_chunks', - type=int, - default=-1, - help='number of left chunks for decoding') - parser.add_argument('--simulate_streaming', - action='store_true', - help='simulate streaming inference') - parser.add_argument('--reverse_weight', - type=float, - default=0.0, - help='''right to left weight for attention rescoring - decode mode''') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--connect_symbol', - default='', - type=str, - help='used to connect the output characters') - - parser.add_argument('--word', - default='', - type=str, - help='word file, only used for hlg decode') - parser.add_argument('--hlg', - default='', - type=str, - help='hlg file, only used for hlg decode') - parser.add_argument('--lm_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--r_decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' - ] and args.batch_size > 1: - logging.fatal( - 'decoding mode {} must be running with batch_size == 1'.format( - args.mode)) - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_sub'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - if 'fbank_conf' in test_conf: - test_conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in test_conf: - test_conf['mfcc_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - # Load dict - char_dict = {v: k for k, v in symbol_table.items()} - eos = len(char_dict) - 1 - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w') as fout: - for batch_idx, 
batch in enumerate(test_data_loader): - keys, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - if args.mode == 'attention': - hyps, _ = model.recognize( - feats, - feats_lengths, - beam_size=args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp.tolist() for hyp in hyps] - elif args.mode == 'ctc_greedy_search': - hyps, _ = model.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_greedy_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_beam_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.beam_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.search_ctc_weight, - transducer_weight=args.search_transducer_weight) - elif args.mode == 'rnnt_beam_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight) - elif args.mode == 'ctc_beam_td_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight, - beam_search_type='ctc') - # ctc_prefix_beam_search and attention_rescoring only return one - # result in List[int], change it to List[List[int]] for compatible - # with other batch decoding mode - elif args.mode == 'ctc_prefix_beam_search': - assert (feats.size(0) == 1) - hyp, _ = model.ctc_prefix_beam_search( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp] - elif args.mode == 'attention_rescoring': - assert (feats.size(0) == 1) - hyp, _ = model.attention_rescoring( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - 
ctc_weight=args.ctc_weight, - simulate_streaming=args.simulate_streaming, - reverse_weight=args.reverse_weight) - hyps = [hyp] - elif args.mode == 'hlg_onebest': - hyps = model.hlg_onebest( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - elif args.mode == 'hlg_rescore': - hyps = model.hlg_rescore( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - lm_scale=args.lm_scale, - decoder_scale=args.decoder_scale, - r_decoder_scale=args.r_decoder_scale, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - for i, key in enumerate(keys): - content = [] - for w in hyps[i]: - if w == eos: - break - content.append(char_dict[w]) - logging.info('{} {}'.format(key, args.connect_symbol.join(content))) - fout.write('{} {}\n'.format(key, args.connect_symbol.join(content))) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/recognize_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/recognize_onnx_gpu.py deleted file mode 100644 index 42f403bf55ac0bc51d9c754574d3479345948122..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/recognize_onnx_gpu.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is for testing exported onnx encoder and decoder from -export_onnx_gpu.py. The exported onnx models only support batch offline ASR inference. -It requires a python wrapped c++ ctc decoder. 
-Please install it by following: -https://github.com/Slyne/ctc_decoder.git -""" -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.common import IGNORE_ID -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.config import override_config - -import onnxruntime as rt -import multiprocessing -import numpy as np - -try: - from swig_decoders import map_batch, \ - ctc_beam_search_decoder_batch, \ - TrieVector, PathTrie -except ImportError: - print('Please install ctc decoders first by refering to\n' + - 'https://github.com/Slyne/ctc_decoder.git') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--encoder_onnx', required=True, help='encoder onnx file') - parser.add_argument('--decoder_onnx', required=True, help='decoder onnx file') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=32, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'ctc_greedy_search', 'ctc_prefix_beam_search', - 'attention_rescoring'], - default='attention_rescoring', - help='decoding mode') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - reverse_weight = configs["model_conf"].get("reverse_weight", 0.0) - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - test_conf['fbank_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - 
- # Init asr model from configs - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - if use_cuda: - EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - else: - EP_list = ['CPUExecutionProvider'] - - encoder_ort_session = rt.InferenceSession(args.encoder_onnx, providers=EP_list) - decoder_ort_session = None - if args.mode == "attention_rescoring": - decoder_ort_session = rt.InferenceSession(args.decoder_onnx, providers=EP_list) - - # Load dict - vocabulary = [] - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - vocabulary.append(arr[0]) - eos = sos = len(char_dict) - 1 - with torch.no_grad(), open(args.result_file, 'w') as fout: - for _, batch in enumerate(test_data_loader): - keys, feats, _, feats_lengths, _ = batch - feats, feats_lengths = feats.numpy(), feats_lengths.numpy() - if args.fp16: - feats = feats.astype(np.float16) - ort_inputs = { - encoder_ort_session.get_inputs()[0].name: feats, - encoder_ort_session.get_inputs()[1].name: feats_lengths} - ort_outs = encoder_ort_session.run(None, ort_inputs) - encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx = ort_outs - beam_size = beam_log_probs.shape[-1] - batch_size = beam_log_probs.shape[0] - num_processes = min(multiprocessing.cpu_count(), batch_size) - if args.mode == 'ctc_greedy_search': - if beam_size != 1: - log_probs_idx = beam_log_probs_idx[:, :, 0] - batch_sents = [] - for idx, seq in enumerate(log_probs_idx): - batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) - hyps = map_batch(batch_sents, vocabulary, num_processes, - True, 0) - elif args.mode in ('ctc_prefix_beam_search', "attention_rescoring"): - batch_log_probs_seq_list = beam_log_probs.tolist() - batch_log_probs_idx_list = beam_log_probs_idx.tolist() - batch_len_list = encoder_out_lens.tolist() - batch_log_probs_seq = [] - batch_log_probs_ids = [] - batch_start = [] # only effective in streaming deployment - batch_root = TrieVector() - root_dict = {} - for i in range(len(batch_len_list)): - num_sent = batch_len_list[i] - batch_log_probs_seq.append( - batch_log_probs_seq_list[i][0:num_sent]) - batch_log_probs_ids.append( - batch_log_probs_idx_list[i][0:num_sent]) - root_dict[i] = PathTrie() - batch_root.append(root_dict[i]) - batch_start.append(True) - score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq, - batch_log_probs_ids, - batch_root, - batch_start, - beam_size, - num_processes, - 0, -2, 0.99999) - if args.mode == 'ctc_prefix_beam_search': - hyps = [] - for cand_hyps in score_hyps: - hyps.append(cand_hyps[0][1]) - hyps = map_batch(hyps, vocabulary, num_processes, False, 0) - if args.mode == 'attention_rescoring': - ctc_score, all_hyps = [], [] - max_len = 0 - for hyps in score_hyps: - cur_len = len(hyps) - if len(hyps) < beam_size: - hyps += (beam_size - cur_len) * [(-float("INF"), (0,))] - cur_ctc_score = [] - for hyp in hyps: - cur_ctc_score.append(hyp[0]) - all_hyps.append(list(hyp[1])) - if len(hyp[1]) > max_len: - max_len = len(hyp[1]) - ctc_score.append(cur_ctc_score) - if args.fp16: - ctc_score = np.array(ctc_score, dtype=np.float16) - else: - ctc_score = np.array(ctc_score, dtype=np.float32) - hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - r_hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32) - k = 0 - for i in 
range(batch_size): - for j in range(beam_size): - cand = all_hyps[k] - l = len(cand) + 2 - hyps_pad_sos_eos[i][j][0:l] = [sos] + cand + [eos] - r_hyps_pad_sos_eos[i][j][0:l] = [sos] + cand[::-1] + [eos] - hyps_lens_sos[i][j] = len(cand) + 1 - k += 1 - decoder_ort_inputs = { - decoder_ort_session.get_inputs()[0].name: encoder_out, - decoder_ort_session.get_inputs()[1].name: encoder_out_lens, - decoder_ort_session.get_inputs()[2].name: hyps_pad_sos_eos, - decoder_ort_session.get_inputs()[3].name: hyps_lens_sos, - decoder_ort_session.get_inputs()[-1].name: ctc_score} - if reverse_weight > 0: - r_hyps_pad_sos_eos_name = decoder_ort_session.get_inputs()[4].name - decoder_ort_inputs[r_hyps_pad_sos_eos_name] = r_hyps_pad_sos_eos - best_index = decoder_ort_session.run(None, decoder_ort_inputs)[0] - best_sents = [] - k = 0 - for idx in best_index: - cur_best_sent = all_hyps[k: k + beam_size][idx] - best_sents.append(cur_best_sent) - k += beam_size - hyps = map_batch(best_sents, vocabulary, num_processes) - - for i, key in enumerate(keys): - content = hyps[i] - logging.info('{} {}'.format(key, content)) - fout.write('{} {}\n'.format(key, content)) - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/train.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/train.py deleted file mode 100644 index 70799b60790b31d73911770891f519f5473e2f4b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/bin/train.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
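# ---------------------------------------------------------------------------
# [Editor's illustration - not part of the patch above] In the attention-
# rescoring branch of recognize_onnx_gpu.py deleted above, each n-best
# candidate is packed as [sos] + tokens + [eos] and right-padded with
# IGNORE_ID to max_len + 2 before being fed to the decoder graph. A small
# worked example of that layout (the token ids and the sos/eos value are
# made-up placeholders; IGNORE_ID mirrors wenet's -1 padding id):
import numpy as np

IGNORE_ID = -1                 # assumed to match wenet.utils.common.IGNORE_ID
sos = eos = 4232               # assumed to be vocabulary size - 1
cand = [23, 7, 102]            # one CTC beam-search candidate (token ids)
max_len = 5                    # longest candidate in the batch

row = np.full(max_len + 2, IGNORE_ID, dtype=np.int64)
row[:len(cand) + 2] = [sos] + cand + [eos]
# row == [4232, 23, 7, 102, 4232, -1, -1]
hyps_len_sos = len(cand) + 1   # counts sos + tokens, i.e. 4 here
# ---------------------------------------------------------------------------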
- -from __future__ import print_function - -import argparse -import copy -import logging -import os - -import torch -import torch.distributed as dist -import torch.optim as optim -import yaml -from tensorboardX import SummaryWriter -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import (load_checkpoint, save_checkpoint, - load_trained_modules) -from wenet.utils.executor import Executor -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.scheduler import WarmupLR, NoamHoldAnnealing -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--train_data', required=True, help='train data file') - parser.add_argument('--cv_data', required=True, help='cv data file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--model_dir', required=True, help='save model dir') - parser.add_argument('--checkpoint', help='checkpoint model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='tensorboard log dir') - parser.add_argument('--ddp.rank', - dest='rank', - default=0, - type=int, - help='global rank for distributed training') - parser.add_argument('--ddp.world_size', - dest='world_size', - default=-1, - type=int, - help='''number of total processes/gpus for - distributed training''') - parser.add_argument('--ddp.dist_backend', - dest='dist_backend', - default='nccl', - choices=['nccl', 'gloo'], - help='distributed backend') - parser.add_argument('--ddp.init_method', - dest='init_method', - default=None, - help='ddp init method') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for reading') - parser.add_argument('--pin_memory', - action='store_true', - default=False, - help='Use pinned memory buffers used for reading') - parser.add_argument('--use_amp', - action='store_true', - default=False, - help='Use automatic mixed precision training') - parser.add_argument('--fp16_grad_sync', - action='store_true', - default=False, - help='Use fp16 gradient sync for ddp') - parser.add_argument('--cmvn', default=None, help='global cmvn file') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. 
One symbol per line.") - parser.add_argument('--prefetch', - default=100, - type=int, - help='prefetch number') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument("--enc_init", - default=None, - type=str, - help="Pre-trained model to initialize encoder") - parser.add_argument("--enc_init_mods", - default="encoder.", - type=lambda s: [str(mod) for mod in s.split(",") if s != ""], - help="List of encoder modules \ - to initialize ,separated by a comma") - - - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Set random seed - torch.manual_seed(777) - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - distributed = args.world_size > 1 - if distributed: - logging.info('training on multiple gpus, this gpu {}'.format(args.gpu)) - dist.init_process_group(args.dist_backend, - init_method=args.init_method, - world_size=args.world_size, - rank=args.rank) - - symbol_table = read_symbol_table(args.symbol_table) - - train_conf = configs['dataset_conf'] - cv_conf = copy.deepcopy(train_conf) - cv_conf['speed_perturb'] = False - cv_conf['spec_aug'] = False - cv_conf['spec_sub'] = False - cv_conf['spec_trim'] = False - cv_conf['shuffle'] = False - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - train_dataset = Dataset(args.data_type, args.train_data, symbol_table, - train_conf, args.bpe_model, non_lang_syms, True) - cv_dataset = Dataset(args.data_type, - args.cv_data, - symbol_table, - cv_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - train_data_loader = DataLoader(train_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - cv_data_loader = DataLoader(cv_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - - if 'fbank_conf' in configs['dataset_conf']: - input_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - else: - input_dim = configs['dataset_conf']['mfcc_conf']['num_mel_bins'] - vocab_size = len(symbol_table) - - # Save configs to model_dir/train.yaml for inference and export - configs['input_dim'] = input_dim - configs['output_dim'] = vocab_size - configs['cmvn_file'] = args.cmvn - configs['is_json_cmvn'] = True - if args.rank == 0: - saved_config_path = os.path.join(args.model_dir, 'train.yaml') - with open(saved_config_path, 'w') as fout: - data = yaml.dump(configs) - fout.write(data) - - # Init asr model from configs - model = init_model(configs) - print(model) - num_params = sum(p.numel() for p in model.parameters()) - print('the number of model params: {:,d}'.format(num_params)) - - # !!!IMPORTANT!!! 
- # Try to export the model by script, if fails, we should refine - # the code to satisfy the script export requirements - if args.rank == 0: - script_model = torch.jit.script(model) - script_model.save(os.path.join(args.model_dir, 'init.zip')) - executor = Executor() - # If specify checkpoint, load some info from checkpoint - if args.checkpoint is not None: - infos = load_checkpoint(model, args.checkpoint) - elif args.enc_init is not None: - logging.info('load pretrained encoders: {}'.format(args.enc_init)) - infos = load_trained_modules(model, args) - else: - infos = {} - start_epoch = infos.get('epoch', -1) + 1 - cv_loss = infos.get('cv_loss', 0.0) - step = infos.get('step', -1) - - num_epochs = configs.get('max_epoch', 100) - model_dir = args.model_dir - writer = None - if args.rank == 0: - os.makedirs(model_dir, exist_ok=True) - exp_id = os.path.basename(model_dir) - writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id)) - - if distributed: - assert (torch.cuda.is_available()) - # cuda model is required for nn.parallel.DistributedDataParallel - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, find_unused_parameters=True) - device = torch.device("cuda") - if args.fp16_grad_sync: - from torch.distributed.algorithms.ddp_comm_hooks import ( - default as comm_hooks, - ) - model.register_comm_hook( - state=None, hook=comm_hooks.fp16_compress_hook - ) - else: - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - if configs['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['optim_conf']) - elif configs['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['optim']) - if configs['scheduler'] == 'warmuplr': - scheduler = WarmupLR(optimizer, **configs['scheduler_conf']) - elif configs['scheduler'] == 'NoamHoldAnnealing': - scheduler = NoamHoldAnnealing(optimizer, **configs['scheduler_conf']) - else: - raise ValueError("unknown scheduler: " + configs['scheduler']) - - final_epoch = None - configs['rank'] = args.rank - configs['is_distributed'] = distributed - configs['use_amp'] = args.use_amp - if start_epoch == 0 and args.rank == 0: - save_model_path = os.path.join(model_dir, 'init.pt') - save_checkpoint(model, save_model_path) - - # Start training loop - executor.step = step - scheduler.set_step(step) - # used for pytorch amp mixed precision training - scaler = None - if args.use_amp: - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(start_epoch, num_epochs): - train_dataset.set_epoch(epoch) - configs['epoch'] = epoch - lr = optimizer.param_groups[0]['lr'] - logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr)) - executor.train(model, optimizer, scheduler, train_data_loader, device, - writer, configs, scaler) - total_loss, num_seen_utts = executor.cv(model, cv_data_loader, device, - configs) - cv_loss = total_loss / num_seen_utts - - logging.info('Epoch {} CV info cv_loss {}'.format(epoch, cv_loss)) - if args.rank == 0: - save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch)) - save_checkpoint( - model, save_model_path, { - 'epoch': epoch, - 'lr': lr, - 'cv_loss': cv_loss, - 'step': executor.step - }) - writer.add_scalar('epoch/cv_loss', cv_loss, epoch) - writer.add_scalar('epoch/lr', lr, epoch) - final_epoch = epoch - - if final_epoch is not None and args.rank == 0: - final_model_path = os.path.join(model_dir, 'final.pt') 
- os.remove(final_model_path) if os.path.exists(final_model_path) else None - os.symlink('{}.pt'.format(final_epoch), final_model_path) - writer.close() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/dataset/dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/dataset/dataset.py deleted file mode 100644 index 6d799b5b5aea2d34546484b3fed5d45e2d5b6aa6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/dataset/dataset.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import torch -import torch.distributed as dist -from torch.utils.data import IterableDataset - -import wenet.dataset.processor as processor -from wenet.utils.file_utils import read_lists - - -class Processor(IterableDataset): - def __init__(self, source, f, *args, **kw): - assert callable(f) - self.source = source - self.f = f - self.args = args - self.kw = kw - - def set_epoch(self, epoch): - self.source.set_epoch(epoch) - - def __iter__(self): - """ Return an iterator over the source dataset processed by the - given processor. 
- """ - assert self.source is not None - assert callable(self.f) - return self.f(iter(self.source), *self.args, **self.kw) - - def apply(self, f): - assert callable(f) - return Processor(self, f, *self.args, **self.kw) - - -class DistributedSampler: - def __init__(self, shuffle=True, partition=True): - self.epoch = -1 - self.update() - self.shuffle = shuffle - self.partition = partition - - def update(self): - assert dist.is_available() - if dist.is_initialized(): - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() - else: - self.rank = 0 - self.world_size = 1 - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: - self.worker_id = 0 - self.num_workers = 1 - else: - self.worker_id = worker_info.id - self.num_workers = worker_info.num_workers - return dict(rank=self.rank, - world_size=self.world_size, - worker_id=self.worker_id, - num_workers=self.num_workers) - - def set_epoch(self, epoch): - self.epoch = epoch - - def sample(self, data): - """ Sample data according to rank/world_size/num_workers - - Args: - data(List): input data list - - Returns: - List: data list after sample - """ - data = list(range(len(data))) - # TODO(Binbin Zhang): fix this - # We can not handle uneven data for CV on DDP, so we don't - # sample data by rank, that means every GPU gets the same - # and all the CV data - if self.partition: - if self.shuffle: - random.Random(self.epoch).shuffle(data) - data = data[self.rank::self.world_size] - data = data[self.worker_id::self.num_workers] - return data - - -class DataList(IterableDataset): - def __init__(self, lists, shuffle=True, partition=True): - self.lists = lists - self.sampler = DistributedSampler(shuffle, partition) - - def set_epoch(self, epoch): - self.sampler.set_epoch(epoch) - - def __iter__(self): - sampler_info = self.sampler.update() - indexes = self.sampler.sample(self.lists) - for index in indexes: - # yield dict(src=src) - data = dict(src=self.lists[index]) - data.update(sampler_info) - yield data - - -def Dataset(data_type, - data_list_file, - symbol_table, - conf, - bpe_model=None, - non_lang_syms=None, - partition=True): - """ Construct dataset from arguments - - We have two shuffle stage in the Dataset. The first is global - shuffle at shards tar/raw file level. The second is global shuffle - at training samples level. 
- - Args: - data_type(str): raw/shard - bpe_model(str): model for english bpe part - partition(bool): whether to do data partition in terms of rank - """ - assert data_type in ['raw', 'shard'] - lists = read_lists(data_list_file) - shuffle = conf.get('shuffle', True) - dataset = DataList(lists, shuffle=shuffle, partition=partition) - if data_type == 'shard': - dataset = Processor(dataset, processor.url_opener) - dataset = Processor(dataset, processor.tar_file_and_group) - else: - dataset = Processor(dataset, processor.parse_raw) - - dataset = Processor(dataset, processor.tokenize, symbol_table, bpe_model, - non_lang_syms, conf.get('split_with_space', False)) - filter_conf = conf.get('filter_conf', {}) - dataset = Processor(dataset, processor.filter, **filter_conf) - - resample_conf = conf.get('resample_conf', {}) - dataset = Processor(dataset, processor.resample, **resample_conf) - - speed_perturb = conf.get('speed_perturb', False) - if speed_perturb: - dataset = Processor(dataset, processor.speed_perturb) - - feats_type = conf.get('feats_type', 'fbank') - assert feats_type in ['fbank', 'mfcc'] - if feats_type == 'fbank': - fbank_conf = conf.get('fbank_conf', {}) - dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) - elif feats_type == 'mfcc': - mfcc_conf = conf.get('mfcc_conf', {}) - dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf) - - spec_aug = conf.get('spec_aug', True) - spec_sub = conf.get('spec_sub', False) - spec_trim = conf.get('spec_trim', False) - if spec_aug: - spec_aug_conf = conf.get('spec_aug_conf', {}) - dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) - if spec_sub: - spec_sub_conf = conf.get('spec_sub_conf', {}) - dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf) - if spec_trim: - spec_trim_conf = conf.get('spec_trim_conf', {}) - dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf) - - if shuffle: - shuffle_conf = conf.get('shuffle_conf', {}) - dataset = Processor(dataset, processor.shuffle, **shuffle_conf) - - sort = conf.get('sort', True) - if sort: - sort_conf = conf.get('sort_conf', {}) - dataset = Processor(dataset, processor.sort, **sort_conf) - - batch_conf = conf.get('batch_conf', {}) - dataset = Processor(dataset, processor.batch, **batch_conf) - dataset = Processor(dataset, processor.padding) - return dataset diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/dataset/kaldi_io.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/dataset/kaldi_io.py deleted file mode 100644 index c9bef293c93d882147bb5b738e1fc49a7a19a484..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/dataset/kaldi_io.py +++ /dev/null @@ -1,666 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! 
To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. - """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. 
- """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int_scp(file_or_fd): - """ generator(key,vec) = read_vec_int_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_int_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:vec for key,mat in kaldi_io.read_vec_int_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_int(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! 
- if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Mapping for percentiles in col-headers, - def uint16_to_float(value, min, range): - return np.float32(min + range * 1.52590218966964e-05 * value) - - # Mapping for matrix elements, - def uint8_to_float_v2(vec, p0, p25, p75, p100): - # Split the vector by masks, - mask_0_64 = (vec <= 64); - mask_193_255 = (vec > 192); - mask_65_192 = (~(mask_0_64 | mask_193_255)); - # Sanity check (useful but slow...), - # assert(len(vec) == np.sum(np.hstack([mask_0_64,mask_65_192,mask_193_255]))) - # assert(len(vec) == np.sum(np.any([mask_0_64,mask_65_192,mask_193_255], axis=0))) - # Build the float vector, - ans = np.empty(len(vec), dtype='float32') - ans[mask_0_64] = p0 + (p25 - p0) / 64. * vec[mask_0_64] - ans[mask_65_192] = p25 + (p75 - p25) / 128. * (vec[mask_65_192] - 64) - ans[mask_193_255] = p75 + (p100 - p75) / 63. * (vec[mask_193_255] - 192) - return ans - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.empty((cols,rows), dtype='float32') - for i, col_header in enumerate(col_headers): - col_header_flt = [ uint16_to_float(percentile, globmin, globrange) for percentile in col_header ] - mat[i] = uint8_to_float_v2(data[i], *col_header_flt) - - return mat.T # transpose! col-major -> row-major, - -def write_ark_scp(key, mat, ark_fout, scp_out): - mat_offset = write_mat(ark_fout, mat, key) - scp_line = '{}\t{}:{}'.format(key, ark_fout.name, mat_offset) - scp_out.write(scp_line) - scp_out.write('\n') - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. 
- - Example of writing single matrix: - kaldi_io.write_mat(filename, mat) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,mat in dict.iteritems(): - kaldi_io.write_mat(f, mat, key=key) - """ - mat_offset = 0 - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - mat_offset = fd.tell() - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if m.dtype == 'float32': fd.write('FM '.encode()) - elif m.dtype == 'float64': fd.write('DM '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) - # Dims, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols - # Data, - fd.write(m.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - return mat_offset - -################################################# -# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) -# Corresponds to: vector > > -# - outer vector: time axis -# - inner vector: records at the time -# - tuple: int = index, float = value -# - -def read_cnet_ark(file_or_fd): - """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ - return read_post_ark(file_or_fd) - -def read_post_ark(file_or_fd): - """ generator(key,vec>) = read_post_ark(file) - Returns generator of (key,posterior) tuples, read from ark file. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Iterate the ark: - for key,post in kaldi_io.read_post_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - post = read_post(fd) - yield key, post - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_post(file_or_fd): - """ [post] = read_post(file_or_fd) - Reads single kaldi 'Posterior' in binary format. - - The 'Posterior' is C++ type 'vector > >', - the outer-vector is usually time axis, inner-vector are the records - at given time, and the tuple is composed of an 'index' (integer) - and a 'float-value'. The 'float-value' can represent a probability - or any other numeric value. - - Returns vector of vectors of tuples. - """ - fd = open_or_fd(file_or_fd) - ans=[] - binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - # Loop over 'outer-vector', - for i in range(outer_vec_size): - assert(fd.read(1).decode() == '\4'); # int-size - inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) - data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) - assert(data[0]['size_idx'] == 4) - assert(data[0]['size_post'] == 4) - ans.append(data[['idx','post']].tolist()) - - if fd is not file_or_fd: fd.close() - return ans - - -################################################# -# Kaldi Confusion Network bin begin/end times, -# (kaldi stores CNs time info separately from the Posterior). 
-# - -def read_cntime_ark(file_or_fd): - """ generator(key,vec>) = read_cntime_ark(file_or_fd) - Returns generator of (key,cntime) tuples, read from ark file. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Iterate the ark: - for key,time in kaldi_io.read_cntime_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:time for key,time in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - cntime = read_cntime(fd) - yield key, cntime - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_cntime(file_or_fd): - """ [cntime] = read_cntime(file_or_fd) - Reads single kaldi 'Confusion Network time info', in binary format: - C++ type: vector >. - (begin/end times of bins at the confusion network). - - Binary layout is ' ...' - - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Returns vector of tuples. - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary - - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) - assert(data[0]['size_beg'] == 4) - assert(data[0]['size_end'] == 4) - ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), - - if fd is not file_or_fd : fd.close() - return ans - - -################################################# -# Segments related, -# - -# Segments as 'Bool vectors' can be handy, -# - for 'superposing' the segmentations, -# - for frame-selection in Speaker-ID experiments, -def read_segments_as_bool_vec(segments_file): - """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) - using kaldi 'segments' file for 1 wav, format : ' ' - - t-beg, t-end is in seconds, - - assumed 100 frames/second, - """ - segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) - # Sanity checks, - assert(len(segs) > 0) # empty segmentation is an error, - assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, - # Convert time to frame-indexes, - start = np.rint([100 * rec[2] for rec in segs]).astype(int) - end = np.rint([100 * rec[3] for rec in segs]).astype(int) - # Taken from 'read_lab_to_bool_vec', htk.py, - frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], - np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) - assert np.sum(end-start) == np.sum(frms) - return frms - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/dataset/processor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/dataset/processor.py deleted file mode 100644 index b4bd07ce674eb3288cd1b13a09085eec48d40845..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/dataset/processor.py +++ /dev/null @@ -1,660 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import json -import random -import re -import tarfile -from subprocess import PIPE, Popen -from urllib.parse import urlparse - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.nn.utils.rnn import pad_sequence - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def url_opener(data): - """ Give url or local file, return file descriptor - Inplace operation. - - Args: - data(Iterable[str]): url or local file list - - Returns: - Iterable[{src, stream}] - """ - for sample in data: - assert 'src' in sample - # TODO(Binbin Zhang): support HTTP - url = sample['src'] - try: - pr = urlparse(url) - # local file - if pr.scheme == '' or pr.scheme == 'file': - stream = open(url, 'rb') - # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP - else: - cmd = f'wget -q -O - {url}' - process = Popen(cmd, shell=True, stdout=PIPE) - sample.update(process=process) - stream = process.stdout - sample.update(stream=stream) - yield sample - except Exception as ex: - logging.warning('Failed to open {}'.format(url)) - - -def tar_file_and_group(data): - """ Expand a stream of open tar files into a stream of tar file contents. - And groups the file with same prefix - - Args: - data: Iterable[{src, stream}] - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'stream' in sample - stream = tarfile.open(fileobj=sample['stream'], mode="r|*") - prev_prefix = None - example = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - example['key'] = prev_prefix - if valid: - yield example - example = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - example['txt'] = file_obj.read().decode('utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load(file_obj) - example['wav'] = waveform - example['sample_rate'] = sample_rate - else: - example[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning('error to parse {}'.format(name)) - prev_prefix = prefix - if prev_prefix is not None: - example['key'] = prev_prefix - yield example - stream.close() - if 'process' in sample: - sample['process'].communicate() - sample['stream'].close() - - -def parse_raw(data): - """ Parse key/wav/txt from json line - - Args: - data: Iterable[str], str is a json line has key/wav/txt - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'src' in sample - json_line = sample['src'] - obj = json.loads(json_line) - assert 'key' in obj - assert 'wav' in obj - assert 'txt' in obj - key = obj['key'] - wav_file = obj['wav'] - txt = obj['txt'] - try: - if 'start' in obj: - assert 'end' in obj - sample_rate = torchaudio.backend.sox_io_backend.info( - wav_file).sample_rate - start_frame = int(obj['start'] * sample_rate) - end_frame = int(obj['end'] * sample_rate) - waveform, _ = torchaudio.backend.sox_io_backend.load( - 
filepath=wav_file, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(wav_file) - example = dict(key=key, - txt=txt, - wav=waveform, - sample_rate=sample_rate) - yield example - except Exception as ex: - logging.warning('Failed to read {}'.format(wav_file)) - - -def filter(data, - max_length=10240, - min_length=10, - token_max_length=200, - token_min_length=1, - min_output_input_ratio=0.0005, - max_output_input_ratio=1): - """ Filter sample according to feature and label length - Inplace operation. - - Args:: - data: Iterable[{key, wav, label, sample_rate}] - max_length: drop utterance which is greater than max_length(10ms) - min_length: drop utterance which is less than min_length(10ms) - token_max_length: drop utterance which is greater than - token_max_length, especially when use char unit for - english modeling - token_min_length: drop utterance which is - less than token_max_length - min_output_input_ratio: minimal ration of - token_length / feats_length(10ms) - max_output_input_ratio: maximum ration of - token_length / feats_length(10ms) - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'label' in sample - # sample['wav'] is torch.Tensor, we have 100 frames every second - num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100 - if num_frames < min_length: - continue - if num_frames > max_length: - continue - if len(sample['label']) < token_min_length: - continue - if len(sample['label']) > token_max_length: - continue - if num_frames != 0: - if len(sample['label']) / num_frames < min_output_input_ratio: - continue - if len(sample['label']) / num_frames > max_output_input_ratio: - continue - yield sample - - -def resample(data, resample_rate=16000): - """ Resample data. - Inplace operation. - - Args: - data: Iterable[{key, wav, label, sample_rate}] - resample_rate: target resample rate - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - if sample_rate != resample_rate: - sample['sample_rate'] = resample_rate - sample['wav'] = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - yield sample - - -def speed_perturb(data, speeds=None): - """ Apply speed perturb to the data. - Inplace operation. 
- - Args: - data: Iterable[{key, wav, label, sample_rate}] - speeds(List[float]): optional speed - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - if speeds is None: - speeds = [0.9, 1.0, 1.1] - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - speed = random.choice(speeds) - if speed != 1.0: - wav, _ = torchaudio.sox_effects.apply_effects_tensor( - waveform, sample_rate, - [['speed', str(speed)], ['rate', str(sample_rate)]]) - sample['wav'] = wav - - yield sample - - -def compute_fbank(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0): - """ Extract fbank - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - energy_floor=0.0, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def compute_mfcc(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0, - num_ceps=40, - high_freq=0.0, - low_freq=20.0): - """ Extract mfcc - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.mfcc(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - num_ceps=num_ceps, - high_freq=high_freq, - low_freq=low_freq, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def __tokenize_by_bpe_model(sp, txt): - tokens = [] - # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - pattern = re.compile(r'([\u4e00-\u9fff])') - # Example: - # txt = "你好 ITS'S OKAY 的" - # chars = ["你", "好", " ITS'S OKAY ", "的"] - chars = pattern.split(txt.upper()) - mix_chars = [w for w in chars if len(w.strip()) > 0] - for ch_or_w in mix_chars: - # ch_or_w is a single CJK charater(i.e., "你"), do nothing. - if pattern.fullmatch(ch_or_w) is not None: - tokens.append(ch_or_w) - # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), - # encode ch_or_w using bpe_model. 
- else: - for p in sp.encode_as_pieces(ch_or_w): - tokens.append(p) - - return tokens - - -def tokenize(data, - symbol_table, - bpe_model=None, - non_lang_syms=None, - split_with_space=False): - """ Decode text to chars or BPE - Inplace operation - - Args: - data: Iterable[{key, wav, txt, sample_rate}] - - Returns: - Iterable[{key, wav, txt, tokens, label, sample_rate}] - """ - if non_lang_syms is not None: - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - else: - non_lang_syms = {} - non_lang_syms_pattern = None - - if bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(bpe_model) - else: - sp = None - - for sample in data: - assert 'txt' in sample - txt = sample['txt'].strip() - if non_lang_syms_pattern is not None: - parts = non_lang_syms_pattern.split(txt.upper()) - parts = [w for w in parts if len(w.strip()) > 0] - else: - parts = [txt] - - label = [] - tokens = [] - for part in parts: - if part in non_lang_syms: - tokens.append(part) - else: - if bpe_model is not None: - tokens.extend(__tokenize_by_bpe_model(sp, part)) - else: - if split_with_space: - part = part.split(" ") - for ch in part: - if ch == ' ': - ch = "▁" - tokens.append(ch) - - for ch in tokens: - if ch in symbol_table: - label.append(symbol_table[ch]) - elif '' in symbol_table: - label.append(symbol_table['']) - - sample['tokens'] = tokens - sample['label'] = label - yield sample - - -def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80): - """ Do spec augmentation - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - num_t_mask: number of time mask to apply - num_f_mask: number of freq mask to apply - max_t: max width of time mask - max_f: max width of freq mask - max_w: max width of time warp - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - max_freq = y.size(1) - # time mask - for i in range(num_t_mask): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - y[start:end, :] = 0 - # freq mask - for i in range(num_f_mask): - start = random.randint(0, max_freq - 1) - length = random.randint(1, max_f) - end = min(max_freq, start + length) - y[:, start:end] = 0 - sample['feat'] = y - yield sample - - -def spec_sub(data, max_t=20, num_t_sub=3): - """ Do spec substitute - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of time substitute - num_t_sub: number of time substitute to apply - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - for i in range(num_t_sub): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - # only substitute the earlier time chosen randomly for current time - pos = random.randint(0, start) - y[start:end, :] = x[start - pos:end - pos, :] - sample['feat'] = y - yield sample - - -def spec_trim(data, max_t=20): - """ Trim tailing frames. Inplace operation. 
- ref: TrimTail [https://arxiv.org/abs/2211.00522] - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of length trimming - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - max_frames = x.size(0) - length = random.randint(1, max_t) - if length < max_frames / 2: - y = x.clone().detach()[:max_frames - length] - sample['feat'] = y - yield sample - - -def shuffle(data, shuffle_size=10000): - """ Local shuffle the data - - Args: - data: Iterable[{key, feat, label}] - shuffle_size: buffer size for shuffle - - Returns: - Iterable[{key, feat, label}] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= shuffle_size: - random.shuffle(buf) - for x in buf: - yield x - buf = [] - # The sample left over - random.shuffle(buf) - for x in buf: - yield x - - -def sort(data, sort_size=500): - """ Sort the data by feature length. - Sort is used after shuffle and before batch, so we can group - utts with similar lengths into a batch, and `sort_size` should - be less than `shuffle_size` - - Args: - data: Iterable[{key, feat, label}] - sort_size: buffer size for sort - - Returns: - Iterable[{key, feat, label}] - """ - - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= sort_size: - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - buf = [] - # The sample left over - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - - -def static_batch(data, batch_size=16): - """ Static batch the data by `batch_size` - - Args: - data: Iterable[{key, feat, label}] - batch_size: batch size - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= batch_size: - yield buf - buf = [] - if len(buf) > 0: - yield buf - - -def dynamic_batch(data, max_frames_in_batch=12000): - """ Dynamic batch the data until the total frames in batch - reach `max_frames_in_batch` - - Args: - data: Iterable[{key, feat, label}] - max_frames_in_batch: max_frames in one batch - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - longest_frames = 0 - for sample in data: - assert 'feat' in sample - assert isinstance(sample['feat'], torch.Tensor) - new_sample_frames = sample['feat'].size(0) - longest_frames = max(longest_frames, new_sample_frames) - frames_after_padding = longest_frames * (len(buf) + 1) - if frames_after_padding > max_frames_in_batch: - yield buf - buf = [sample] - longest_frames = new_sample_frames - else: - buf.append(sample) - if len(buf) > 0: - yield buf - - -def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000): - """ Wrapper for static/dynamic batch - """ - if batch_type == 'static': - return static_batch(data, batch_size) - elif batch_type == 'dynamic': - return dynamic_batch(data, max_frames_in_batch) - else: - logging.fatal('Unsupported batch type {}'.format(batch_type)) - - -def padding(data): - """ Padding the data into training data - - Args: - data: Iterable[List[{key, feat, label}]] - - Returns: - Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] - """ - for sample in data: - assert isinstance(sample, list) - feats_length = torch.tensor([x['feat'].size(0) for x in sample], - dtype=torch.int32) - order = torch.argsort(feats_length, descending=True) - feats_lengths = torch.tensor( - [sample[i]['feat'].size(0) for i in order], dtype=torch.int32) - sorted_feats = [sample[i]['feat'] for i in order] - sorted_keys 
= [sample[i]['key'] for i in order] - sorted_labels = [ - torch.tensor(sample[i]['label'], dtype=torch.int64) for i in order - ] - label_lengths = torch.tensor([x.size(0) for x in sorted_labels], - dtype=torch.int32) - - padded_feats = pad_sequence(sorted_feats, - batch_first=True, - padding_value=0) - - pad = (0, 0, 0, 0) - seq_len= padded_feats.shape[1] - if seq_len < 384: - pad = (0, 0, 0, 384-seq_len) - elif seq_len < 512: - pad = (0, 0, 0, 512-seq_len) - elif seq_len < 640: - pad = (0, 0, 0, 640-seq_len) - elif seq_len < 768: - pad = (0, 0, 0, 768-seq_len) - elif seq_len < 896: - pad = (0, 0, 0, 896-seq_len) - elif seq_len < 1024: - pad = (0, 0, 0, 1024-seq_len) - elif seq_len < 1280: - pad = (0, 0, 0, 1280-seq_len) - padded_feats = torch.nn.functional.pad(padded_feats, pad, mode='constant', value=0) - padding_labels = pad_sequence(sorted_labels, - batch_first=True, - padding_value=-1) - - yield (sorted_keys, padded_feats, padding_labels, feats_lengths, - label_lengths) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/dataset/wav_distortion.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/dataset/wav_distortion.py deleted file mode 100644 index 2917d3cc6cfb801935cb0885d0c42cd88f1833b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/dataset/wav_distortion.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import random -import math - -import torchaudio -import torch -torchaudio.set_audio_backend("sox_io") - - -def db2amp(db): - return pow(10, db / 20) - -def amp2db(amp): - return 20 * math.log10(amp) - -def make_poly_distortion(conf): - """Generate a db-domain ploynomial distortion function - - f(x) = a * x^m * (1-x)^n + x - - Args: - conf: a dict {'a': #int, 'm': #int, 'n': #int} - - Returns: - The ploynomial function, which could be applied on - a float amplitude value - """ - a = conf['a'] - m = conf['m'] - n = conf['n'] - - def poly_distortion(x): - abs_x = abs(x) - if abs_x < 0.000001: - x = x - else: - db_norm = amp2db(abs_x) / 100 + 1 - if db_norm < 0: - db_norm = 0 - db_norm = a * pow(db_norm, m) * pow((1 - db_norm), n) + db_norm - if db_norm > 1: - db_norm = 1 - db = (db_norm - 1) * 100 - amp = db2amp(db) - if amp >= 0.9997: - amp = 0.9997 - if x > 0: - x = amp - else: - x = -amp - return x - return poly_distortion - -def make_quad_distortion(): - return make_poly_distortion({'a' : 1, 'm' : 1, 'n' : 1}) - -# the amplitude are set to max for all non-zero point -def make_max_distortion(conf): - """Generate a max distortion function - - Args: - conf: a dict {'max_db': float } - 'max_db': the maxium value. 
- - Returns: - The max function, which could be applied on - a float amplitude value - """ - max_db = conf['max_db'] - if max_db: - max_amp = db2amp(max_db) # < 0.997 - else: - max_amp = 0.997 - - def max_distortion(x): - if x > 0: - x = max_amp - elif x < 0: - x = -max_amp - else: - x = 0.0 - return x - return max_distortion - - - -def make_amp_mask(db_mask=None): - """Get a amplitude domain mask from db domain mask - - Args: - db_mask: Optional. A list of tuple. if None, using default value. - - Returns: - A list of tuple. The amplitude domain mask - """ - if db_mask is None: - db_mask = [(-110, -95), (-90, -80), (-65, -60), (-50, -30), (-15, 0)] - amp_mask = [(db2amp(db[0]), db2amp(db[1])) for db in db_mask] - return amp_mask - -default_mask = make_amp_mask() - - -def generate_amp_mask(mask_num): - """Generate amplitude domain mask randomly in [-100db, 0db] - - Args: - mask_num: the slot number of the mask - - Returns: - A list of tuple. each tuple defines a slot. - e.g. [(-100, -80), (-65, -60), (-50, -30), (-15, 0)] - for #mask_num = 4 - """ - a = [0] * 2 * mask_num - a[0] = 0 - m = [] - for i in range(1, 2 * mask_num): - a[i] = a[i - 1] + random.uniform(0.5, 1) - max_val = a[2 * mask_num - 1] - for i in range(0, mask_num): - l = ((a[2 * i] - max_val) / max_val) * 100 - r = ((a[2 * i + 1] - max_val) / max_val) * 100 - m.append((l, r)) - return make_amp_mask(m) - - -def make_fence_distortion(conf): - """Generate a fence distortion function - - In this fence-like shape function, the values in mask slots are - set to maxium, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': int,'max_db': float } - 'mask_number': the slot number in mask. - 'max_db': the maxium value. - - Returns: - The fence function, which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - max_db = conf['max_db'] - max_amp = db2amp(max_db) # 0.997 - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def fence_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - return x - - return fence_distortion - -# -def make_jag_distortion(conf): - """Generate a jag distortion function - - In this jag-like shape function, the values in mask slots are - not changed, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': #int} - 'mask_number': the slot number in mask. 
- - Returns: - The jag function,which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def jag_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - return x - - return jag_distortion - -# gaining 20db means amp = amp * 10 -# gaining -20db means amp = amp / 10 -def make_gain_db(conf): - """Generate a db domain gain function - - Args: - conf: a dict {'db': #float} - 'db': the gaining value - - Returns: - The db gain function, which could be applied on - a float amplitude value - """ - db = conf['db'] - - def gain_db(x): - return min(0.997, x * pow(10, db / 20)) - - return gain_db - - -def distort(x, func, rate=0.8): - """Distort a waveform in sample point level - - Args: - x: the origin wavefrom - func: the distort function - rate: sample point-level distort probability - - Returns: - the distorted waveform - """ - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - x[0][i] = func(float(x[0][i])) - return x - -def distort_chain(x, funcs, rate=0.8): - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - for func in funcs: - x[0][i] = func(float(x[0][i])) - return x - -# x is numpy -def distort_wav_conf(x, distort_type, distort_conf, rate=0.1): - if distort_type == 'gain_db': - gain_db = make_gain_db(distort_conf) - x = distort(x, gain_db) - elif distort_type == 'max_distortion': - max_distortion = make_max_distortion(distort_conf) - x = distort(x, max_distortion, rate=rate) - elif distort_type == 'fence_distortion': - fence_distortion = make_fence_distortion(distort_conf) - x = distort(x, fence_distortion, rate=rate) - elif distort_type == 'jag_distortion': - jag_distortion = make_jag_distortion(distort_conf) - x = distort(x, jag_distortion, rate=rate) - elif distort_type == 'poly_distortion': - poly_distortion = make_poly_distortion(distort_conf) - x = distort(x, poly_distortion, rate=rate) - elif distort_type == 'quad_distortion': - quad_distortion = make_quad_distortion() - x = distort(x, quad_distortion, rate=rate) - elif distort_type == 'none_distortion': - pass - else: - print('unsupport type') - return x - -def distort_wav_conf_and_save(distort_type, distort_conf, rate, wav_in, wav_out): - x, sr = torchaudio.load(wav_in) - x = x.detach().numpy() - out = distort_wav_conf(x, distort_type, distort_conf, rate) - torchaudio.save(wav_out, torch.from_numpy(out), sr) - -if __name__ == "__main__": - distort_type = sys.argv[1] - wav_in = sys.argv[2] - wav_out = sys.argv[3] - conf = None - rate = 0.1 - if distort_type == 'new_jag_distortion': - conf = {'mask_number' : 4} - elif distort_type == 'new_fence_distortion': - conf = {'mask_number' : 1, 'max_db' : -30} - elif distort_type == 'poly_distortion': - conf = {'a' : 4, 'm' : 2, "n" : 2} - distort_wav_conf_and_save(distort_type, conf, rate, wav_in, wav_out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/efficient_conformer/attention.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/efficient_conformer/attention.py deleted file mode 100644 index 475131b15af92ffcaf91ad5e2e30d114d4d1a2a3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/efficient_conformer/attention.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple, Optional - -import torch -from torch import nn -import torch.nn.functional as F -from wenet.transformer.attention import MultiHeadedAttention - - -class GroupedRelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: - https://arxiv.org/abs/1901.02860 - https://arxiv.org/abs/2109.01163 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate, group_size=3): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - self.group_size = group_size - self.d_k = n_feat // n_head # for GroupedAttention - self.n_feat = n_feat - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. 
- """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def pad4group(self, Q, K, V, P, mask, group_size: int = 3): - """ - q: (#batch, time1, size) -> (#batch, head, time1, size/head) - k,v: (#batch, time2, size) -> (#batch, head, time2, size/head) - p: (#batch, time2, size) - """ - # Compute Overflows - overflow_Q = Q.size(2) % group_size - overflow_KV = K.size(2) % group_size - - # if-else for ONNX export - # 0 // 0.00000000000000001 = 0 - # 1 // 1.00000000000000001 = 1 - padding_Q = (group_size - overflow_Q) * int( - overflow_Q // (overflow_Q + 0.00000000000000001)) - padding_KV = (group_size - overflow_KV) * int( - overflow_KV // (overflow_KV + 0.00000000000000001)) - - batch_size, _, seq_len_KV, _ = K.size() - - # Input Padding (B, T, D) -> (B, T + P, D) - Q = F.pad(Q, (0, 0, 0, padding_Q), value=0.0) - K = F.pad(K, (0, 0, 0, padding_KV), value=0.0) - V = F.pad(V, (0, 0, 0, padding_KV), value=0.0) - - if mask is not None and mask.size(2) > 0 : # time2 > 0: - mask = mask[:, ::group_size, ::group_size] - - Q = Q.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - K = K.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - V = V.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - # process pos_emb - P_batch_size = P.size(0) - overflow_P = P.size(1) % group_size - padding_P = group_size - overflow_P if overflow_P else 0 - P = F.pad(P, (0, 0, 0, padding_P), value=0.0) - P = P.view(P_batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - return Q, K, V, P, mask, padding_Q - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - padding_q: Optional[int] = None - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - padding_q : for GroupedAttention in efficent conformer - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. 
jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - - # n_feat!=h*d_k may be happened in GroupAttention - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.n_feat) - ) # (batch, time1, d_model) - if padding_q is not None: - # for GroupedAttention in efficent conformer - x = x[:, :x.size(1) - padding_q] - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q = self.linear_q(query) - k = self.linear_k(key) # (#batch, time2, size) - v = self.linear_v(value) - p = self.linear_pos(pos_emb) # (#batch, time2, size) - - batch_size, seq_len_KV, _ = k.size() # seq_len_KV = time2 - - # (#batch, time2, size) -> (#batch, head, time2, size/head) - q = q.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - k = k.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - v = v.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - if cache.size(0) > 0: - # use attention cache - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - new_cache = torch.cat((k, v), dim=-1) - - # May be k and p does not match. eg. time2=18+18/2=27 > mask=36/2=18 - if mask is not None and mask.size(2) > 0: - time2 = mask.size(2) - k = k[:, :, -time2:, :] - v = v[:, :, -time2:, :] - - # q k v p: (batch, head, time1, d_k) - q, k, v, p, mask, padding_q = self.pad4group(q, k, v, p, mask, self.group_size) - - # q_with_bias_u & q_with_bias_v = (batch, head, time1, d_k) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. 
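# ---------------------------------------------------------------------------
# [Editor's note] Illustrative sketch only; not part of the wenet sources being
# removed above. It shows the "pad the time axis to a multiple of group_size"
# arithmetic that pad4group() relies on: the ONNX-friendly expression
# (group_size - overflow) * int(overflow // (overflow + 1e-17)) evaluates to 0
# when overflow == 0 and to (group_size - overflow) otherwise, so no
# data-dependent if-branch is needed at export time.
# ---------------------------------------------------------------------------
import torch
import torch.nn.functional as F

def pad_time_to_multiple(x: torch.Tensor, group_size: int = 3) -> torch.Tensor:
    """Pad dim 2 (time) of a (batch, head, time, d_k) tensor up to a multiple of group_size."""
    overflow = x.size(2) % group_size
    padding = (group_size - overflow) * int(overflow // (overflow + 1e-17))
    return F.pad(x, (0, 0, 0, padding), value=0.0)

q = torch.randn(1, 4, 10, 64)                  # time1 = 10 is not divisible by 3
print(pad_time_to_multiple(q, 3).shape)        # torch.Size([1, 4, 12, 64])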
- # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k * self.group_size) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask, padding_q), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/efficient_conformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/efficient_conformer/convolution.py deleted file mode 100644 index 52d6c1c14c0812ab3957a60a135f644833c2ad95..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/efficient_conformer/convolution.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - stride: int = 1): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - stride (int): Stride Convolution, for efficient Conformer - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. 
- # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=stride, # for depthwise_conv in StrideConv - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - self.stride = stride - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - # When export ONNX,the first cache is not None but all-zero, - # cause shape error in residual block, - # eg. cache14 + x9 = 23, 23-7+1=17 != 9 - cache = cache[:, :, -self.lorder:] - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is requried, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - if mask_pad.size(2) != x.size(2): - mask_pad = mask_pad[:, :, ::self.stride] - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/efficient_conformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/efficient_conformer/encoder.py deleted file mode 100644 index dbd37f53cac86be851e2bb194354fd07eb271f11..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/efficient_conformer/encoder.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
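# ---------------------------------------------------------------------------
# [Editor's note] Standalone sketch, not part of the deleted sources: it mimics
# the causal-convolution cache handling in ConvolutionModule.forward() above.
# With lorder = kernel_size - 1, the first chunk is left-padded with zeros and
# the last lorder frames of the (padded) input become the cache for the next
# chunk, so each chunk's output length equals its input length and no future
# frames are consumed.
# ---------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F

channels, kernel_size = 8, 15
lorder = kernel_size - 1
depthwise = nn.Conv1d(channels, channels, kernel_size, groups=channels, padding=0)

chunk1 = torch.randn(1, channels, 20)          # (batch, channels, time)
padded = F.pad(chunk1, (lorder, 0))            # zero left context for the first chunk
cache = padded[:, :, -lorder:]                 # keep the last lorder frames for the next chunk
out1 = depthwise(padded)                       # (1, 8, 20)

chunk2 = torch.randn(1, channels, 20)
out2 = depthwise(torch.cat((cache, chunk2), dim=2))
print(out1.shape, out2.shape)                  # both torch.Size([1, 8, 20])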
-# Modified from EfficientConformer(https://github.com/burchim/EfficientConformer) -# Paper(https://arxiv.org/abs/2109.01163) - -"""Encoder definition.""" -from typing import Tuple, Optional, List, Union - -import torch -import logging -from typeguard import check_argument_types -import torch.nn.functional as F - -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.encoder_layer import ConformerEncoderLayer - -from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 -from wenet.efficient_conformer.convolution import ConvolutionModule -from wenet.efficient_conformer.attention import GroupedRelPositionMultiHeadedAttention -from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer - -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class EfficientConformerEncoder(torch.nn.Module): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - macaron_style: bool = True, - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - stride_layer_idx: Optional[Union[int, List[int]]] = 3, - stride: Optional[Union[int, List[int]]] = 2, - group_layer_idx: Optional[Union[int, List[int], tuple]] = (0, 1, 2, 3), - group_size: int = 3, - stride_kernel: bool = True, - **kwargs - ): - """Construct Efficient Conformer Encoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - macaron_style (bool): Whether to use macaron style for - positionwise layer. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. - stride_layer_idx (list): layer id with StrideConv, start from 0 - stride (list): stride size of each StrideConv in efficient conformer - group_layer_idx (list): layer id with GroupedAttention, start from 0 - group_size (int): group size of every GroupedAttention layer - stride_kernel (bool): default True. True: recompute cnn kernels with stride. 
- """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d2": - subsampling_class = Conv2dSubsampling2 - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - logging.info(f"input_layer = {input_layer}, " - f"subsampling_class = {subsampling_class}") - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - self.input_layer = input_layer - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - activation = get_activation(activation_type) - self.num_blocks = num_blocks - self.attention_heads = attention_heads - self.cnn_module_kernel = cnn_module_kernel - self.global_chunk_size = 0 - self.chunk_feature_map = 0 - - # efficient conformer configs - self.stride_layer_idx = [stride_layer_idx] \ - if type(stride_layer_idx) == int else stride_layer_idx - self.stride = [stride] \ - if type(stride) == int else stride - self.group_layer_idx = [group_layer_idx] \ - if type(group_layer_idx) == int else group_layer_idx - self.grouped_size = group_size # group size of every GroupedAttention layer - - assert len(self.stride) == len(self.stride_layer_idx) - self.cnn_module_kernels = [cnn_module_kernel] # kernel size of each StridedConv - for i in self.stride: - if stride_kernel: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1] // i) - else: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1]) - - logging.info(f"stride_layer_idx= {self.stride_layer_idx}, " - f"stride = {self.stride}, " - f"cnn_module_kernel = {self.cnn_module_kernels}, " - f"group_layer_idx = {self.group_layer_idx}, " - f"grouped_size = {self.grouped_size}") - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - - # encoder definition - index = 0 - layers = [] - for i in range(num_blocks): - # self-attention module definition - if i in self.group_layer_idx: - encoder_selfattn_layer = GroupedRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - self.grouped_size) - else: - if pos_enc_layer_type == "no_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate) - - # conformer module definition - if i in self.stride_layer_idx: - # conformer block with downsampling - convolution_layer_args_stride = ( - output_size, 
self.cnn_module_kernels[index], activation, - cnn_module_norm, causal, True, self.stride[index]) - layers.append(StrideConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_stride) if use_cnn_module else None, - torch.nn.AvgPool1d( - kernel_size=self.stride[index], stride=self.stride[index], - padding=0, ceil_mode=True, - count_include_pad=False), # pointwise_conv_layer - dropout_rate, - normalize_before, - concat_after, - )) - index = index + 1 - else: - # conformer block - convolution_layer_args_normal = ( - output_size, self.cnn_module_kernels[index], activation, - cnn_module_norm, causal) - layers.append(ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_normal) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - )) - - self.encoders = torch.nn.ModuleList(layers) - - def set_global_chunk_size(self, chunk_size): - """Used in ONNX export. - """ - logging.info(f"set global chunk size: {chunk_size}, default is 0.") - self.global_chunk_size = chunk_size - if self.embed.subsampling_rate == 2: - self.chunk_feature_map = 2 * self.global_chunk_size + 1 - elif self.embed.subsampling_rate == 6: - self.chunk_feature_map = 6 * self.global_chunk_size + 5 - elif self.embed.subsampling_rate == 8: - self.chunk_feature_map = 8 * self.global_chunk_size + 7 - else: - self.chunk_feature_map = 4 * self.global_chunk_size + 3 - - def output_size(self) -> int: - return self._output_size - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - index = 0 # traverse stride - for i, layer in enumerate(self.encoders): - # layer return : x, mask, new_att_cache, new_cnn_cache - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if i in self.stride_layer_idx: - masks = masks[:, :, ::self.stride[index]] - chunk_masks = chunk_masks[:, ::self.stride[index], - ::self.stride[index]] - mask_pad = masks - pos_emb = pos_emb[:, ::self.stride[index], :] - index = index + 1 - - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask : mask matrix of self attention - - Returns: - torch.Tensor: output of current input xs - torch.Tensor: subsampling cache required for next chunk computation - List[torch.Tensor]: encoder layers output cache required for next - chunk computation - List[torch.Tensor]: conformer cnn cache - - """ - assert xs.size(0) == 1 - - # using downsampling factor to recover offset - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - chunk_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - chunk_masks = chunk_masks.unsqueeze(1) # (1, 1, xs-time) - - real_len = 0 - if self.global_chunk_size > 0: - # for ONNX decode simulation, padding xs to chunk_size - real_len = xs.size(1) - pad_len = self.chunk_feature_map - real_len - xs = F.pad(xs, (0, 0, 0, pad_len), value=0.0) - chunk_masks = F.pad(chunk_masks, (0, pad_len), value=0.0) - - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, chunk_masks = self.embed(xs, chunk_masks, offset) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - # shape(pos_emb) = (b=1, chunk_size, emb_size=output_size=hidden-dim) - - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) # batchPad (b=1, 1, time=chunk_size) - - if self.global_chunk_size > 0: - # for ONNX decode simulation - pos_emb = self.embed.position_encoding( - offset=max(offset - cache_t1, 0), - size=cache_t1 + self.global_chunk_size) - att_mask[:, :, -self.global_chunk_size:] = chunk_masks - mask_pad = chunk_masks.to(torch.bool) - else: - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(2) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - att_cache_trunc = xs.size(1) + \ - att_cache.size(2) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i:i + 1, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # 
shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [1, batch, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(0) - - # use repeat_interleave to new_att_cache - new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :]) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.global_chunk_size > 0 and real_len: - chunk_real_len = real_len // self.embed.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - # Keeping 1 more timestep can mitigate information leakage - # from the encoder caused by the padding - xs = xs[:, :chunk_real_len + 1, :] - - return xs, r_att_cache, r_cnn_cache - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - use_onnx=False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. - Args: - xs (torch.Tensor): (1, max_len, dim) - decoding_chunk_size (int): decoding chunk size - num_decoding_left_chunks (int): - use_onnx (bool): True for simulating ONNX model inference. 
- """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if use_onnx: - logging.info("Simulating for ONNX runtime ...") - att_cache: torch.Tensor = torch.zeros( - (self.num_blocks, self.attention_heads, required_cache_size, - self.output_size() // self.attention_heads * 2), - device=xs.device) - cnn_cache: torch.Tensor = torch.zeros( - (self.num_blocks, 1, self.output_size(), self.cnn_module_kernel - 1), - device=xs.device) - self.set_global_chunk_size(chunk_size=decoding_chunk_size) - else: - logging.info("Simulating for JIT runtime ...") - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - logging.info(f"-->> frame chunk msg: cur={cur}, " - f"end={end}, num_frames={end-cur}, " - f"decoding_window={decoding_window}") - if use_onnx: - att_mask: torch.Tensor = torch.ones( - (1, 1, required_cache_size + decoding_chunk_size), - dtype=torch.bool, device=xs.device) - if cur == 0: - att_mask[:, :, :required_cache_size] = 0 - else: - att_mask: torch.Tensor = torch.ones( - (0, 0, 0), dtype=torch.bool, device=xs.device) - - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - outputs.append(y) - offset += y.size(1) - - ys = torch.cat(outputs, 1) - masks = torch.ones(1, 1, ys.size(1), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/efficient_conformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/efficient_conformer/encoder_layer.py deleted file mode 100644 index 3a88ec9fca9797664ce89566e6c1d28a8f0ad5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/efficient_conformer/encoder_layer.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple -import torch -from torch import nn - - -class StrideConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. 
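# ---------------------------------------------------------------------------
# [Editor's note] Illustrative numbers only, not taken from the deleted sources:
# the chunk scheduling used by forward_chunk_by_chunk() above. Assuming a
# Conv2dSubsampling4 front-end (subsampling rate 4, right context 6) and a
# decoding chunk size of 16, each step reads `decoding_window` feature frames
# and advances by `stride` frames.
# ---------------------------------------------------------------------------
subsampling = 4                # assumed subsampling rate of the front-end
right_context = 6              # assumed right context of the front-end
decoding_chunk_size = 16

context = right_context + 1                                           # add current frame -> 7
stride = subsampling * decoding_chunk_size                            # 64
decoding_window = (decoding_chunk_size - 1) * subsampling + context   # 67

num_frames = 200
starts = list(range(0, num_frames - context + 1, stride))
print(stride, decoding_window, starts)                                # 64 67 [0, 64, 128, 192]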
- self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - pointwise_conv_layer: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.pointwise_conv_layer = pointwise_conv_layer - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - - # add pointwise_conv for efficient conformer - # pointwise_conv_layer does not change shape - if self.pointwise_conv_layer is not None: - residual = residual.transpose(1, 2) - residual = self.pointwise_conv_layer(residual) - residual = residual.transpose(1, 2) - assert residual.size(0) == x.size(0) - assert residual.size(1) == x.size(1) - assert residual.size(2) == x.size(2) - - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/efficient_conformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/efficient_conformer/subsampling.py deleted file mode 100644 index 98b2c2228eac8e77586110686c48a7b0141458c9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/efficient_conformer/subsampling.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch -from wenet.transformer.subsampling import BaseSubsampling - - -class Conv2dSubsampling2(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU() - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * ((idim - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 2 - # 2 = (3 - 1) * 1 - self.right_context = 2 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 2. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 2. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, :-2:2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/squeezeformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/squeezeformer/attention.py deleted file mode 100644 index 97412badbe8e2c5caec81c0636d15be3f80d6b84..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/squeezeformer/attention.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 Ximalaya Inc. (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -import torch -import torch.nn as nn -from wenet.transformer.attention import MultiHeadedAttention -from typing import Tuple - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- """ - - def __init__(self, n_head, n_feat, dropout_rate, - do_rel_shift=False, adaptive_scale=False, init_weights=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.do_rel_shift = do_rel_shift - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - self.adaptive_scale = adaptive_scale - self.ada_scale = nn.Parameter( - torch.ones([1, 1, n_feat]), requires_grad=adaptive_scale) - self.ada_bias = nn.Parameter( - torch.zeros([1, 1, n_feat]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - input_max = (self.h * self.d_k) ** -0.5 - torch.nn.init.uniform_(self.linear_q.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_q.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_pos.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.bias, -input_max, input_max) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. 
pytorch training - if mask.size(2) > 0: # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - # (batch, head, time1, time2) - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - if self.adaptive_scale: - query = self.ada_scale * query + self.ada_bias - key = self.ada_scale * key + self.ada_bias - value = self.ada_scale * value + self.ada_bias - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
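# ---------------------------------------------------------------------------
# [Editor's note] Shape-only sketch with made-up sizes, not part of the deleted
# sources: the Transformer-XL style score used by the relative-position
# attention modules above and below (https://arxiv.org/abs/1901.02860, Sec. 3.3),
#   scores = ((q + u) k^T + (q + v) p^T) / sqrt(d_k),
# where u and v are the learnable pos_bias_u / pos_bias_v and p is the
# positional embedding after linear_pos.
# ---------------------------------------------------------------------------
import math
import torch

batch, head, time1, time2, d_k = 2, 4, 8, 8, 16
q = torch.randn(batch, time1, head, d_k)   # query, transposed to (B, T1, H, d_k)
k = torch.randn(batch, head, time2, d_k)   # key
p = torch.randn(batch, head, time2, d_k)   # projected positional embedding
u = torch.randn(head, d_k)                 # pos_bias_u
v = torch.randn(head, d_k)                 # pos_bias_v

matrix_ac = torch.matmul((q + u).transpose(1, 2), k.transpose(-2, -1))  # (B, H, T1, T2)
matrix_bd = torch.matmul((q + v).transpose(1, 2), p.transpose(-2, -1))  # (B, H, T1, T2)
scores = (matrix_ac + matrix_bd) / math.sqrt(d_k)
print(scores.shape)                        # torch.Size([2, 4, 8, 8])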
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - if self.do_rel_shift: - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/squeezeformer/conv2d.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/squeezeformer/conv2d.py deleted file mode 100644 index c230263396392d72f36c56d645338f2d576db898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/squeezeformer/conv2d.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Conv2d Module with Valid Padding""" - -import torch.nn.functional as F -from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional - - -class Conv2dValid(_ConvNd): - """ - Conv2d operator for VALID mode padding. 
- """ - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', # TODO: refine this type - device=None, - dtype=None, - valid_trigx: bool = False, - valid_trigy: bool = False - ) -> None: - factory_kwargs = {'device': device, 'dtype': dtype} - kernel_size_ = _pair(kernel_size) - stride_ = _pair(stride) - padding_ = padding if isinstance(padding, str) else _pair(padding) - dilation_ = _pair(dilation) - super(Conv2dValid, self).__init__( - in_channels, out_channels, kernel_size_, - stride_, padding_, dilation_, False, _pair(0), - groups, bias, padding_mode, **factory_kwargs) - self.valid_trigx = valid_trigx - self.valid_trigy = valid_trigy - - def _conv_forward( - self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): - validx, validy = 0, 0 - if self.valid_trigx: - validx = (input.size(-2) * (self.stride[-2] - 1) - 1 - + self.kernel_size[-2]) // 2 - if self.valid_trigy: - validy = (input.size(-1) * (self.stride[-1] - 1) - 1 - + self.kernel_size[-1]) // 2 - return F.conv2d(input, weight, bias, self.stride, - (validx, validy), self.dilation, self.groups) - - def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight, self.bias) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/squeezeformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/squeezeformer/convolution.py deleted file mode 100644 index 6da2ee8c98ed58fae66d66c892041037f0d6bc3a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/squeezeformer/convolution.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. 
- causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - self.bias = bias - self.channels = channels - self.kernel_size = kernel_size - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, channels]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, channels]), requires_grad=adaptive_scale) - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - if init_weights: - self.init_weights() - - def init_weights(self): - pw_max = self.channels ** -0.5 - dw_max = self.kernel_size ** -0.5 - torch.nn.init.uniform_(self.pointwise_conv1.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv1.bias.data, -pw_max, pw_max) - torch.nn.init.uniform_(self.depthwise_conv.weight.data, -dw_max, dw_max) - if self.bias: - torch.nn.init.uniform_(self.depthwise_conv.bias.data, -dw_max, dw_max) - torch.nn.init.uniform_(self.pointwise_conv2.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv2.bias.data, -pw_max, pw_max) - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - if self.adaptive_scale: - x = self.ada_scale * x + self.ada_bias - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/squeezeformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/squeezeformer/encoder.py deleted file mode 100644 index f13038321ae6c07d484a617aee7d83ed07742510..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/squeezeformer/encoder.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -import torch -import torch.nn as nn -from typing import Tuple, Union, Optional, List -from wenet.squeezeformer.subsampling \ - import DepthwiseConv2dSubsampling4, TimeReductionLayer1D, \ - TimeReductionLayer2D, TimeReductionLayerStream -from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.attention import MultiHeadedAttention -from wenet.squeezeformer.attention import RelPositionMultiHeadedAttention -from wenet.squeezeformer.positionwise_feed_forward \ - import PositionwiseFeedForward -from wenet.squeezeformer.convolution import ConvolutionModule -from wenet.utils.mask import make_pad_mask, add_optional_chunk_mask -from wenet.utils.common import get_activation - - -class SqueezeformerEncoder(nn.Module): - def __init__( - self, - input_size: int = 80, - encoder_dim: int = 256, - output_size: int = 256, - attention_heads: int = 4, - num_blocks: int = 12, - reduce_idx: Optional[Union[int, List[int]]] = 5, - recover_idx: Optional[Union[int, List[int]]] = 11, - feed_forward_expansion_factor: int = 4, - dw_stride: bool = False, - input_dropout_rate: float = 0.1, - pos_enc_layer_type: str = "rel_pos", - time_reduction_layer_type: str = "conv1d", - do_rel_shift: bool = True, - feed_forward_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.1, - cnn_module_kernel: int = 31, - cnn_norm_type: str = "batch_norm", - dropout: float = 0.1, - causal: bool = False, - adaptive_scale: bool = True, - activation_type: str = "swish", - init_weights: bool = True, - global_cmvn: torch.nn.Module = None, - normalize_before: bool = False, - use_dynamic_chunk: bool = False, - concat_after: bool = False, - 
static_chunk_size: int = 0, - use_dynamic_left_chunk: bool = False - ): - """Construct SqueezeformerEncoder - - Args: - input_size to use_dynamic_chunk, see in Transformer BaseEncoder. - encoder_dim (int): The hidden dimension of encoder layer. - output_size (int): The output dimension of final projection layer. - attention_heads (int): Num of attention head in attention module. - num_blocks (int): Num of encoder layers. - reduce_idx Optional[Union[int, List[int]]]: - reduce layer index, from 40ms to 80ms per frame. - recover_idx Optional[Union[int, List[int]]]: - recover layer index, from 80ms to 40ms per frame. - feed_forward_expansion_factor (int): Enlarge coefficient of FFN. - dw_stride (bool): Whether do depthwise convolution - on subsampling module. - input_dropout_rate (float): Dropout rate of input projection layer. - pos_enc_layer_type (str): Self attention type. - time_reduction_layer_type (str): Conv1d or Conv2d reduction layer. - do_rel_shift (bool): Whether to do relative shift - operation on rel-attention module. - cnn_module_kernel (int): Kernel size of CNN module. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - adaptive_scale (bool): Whether to use adaptive scale. - init_weights (bool): Whether to initialize weights. - causal (bool): whether to use causal convolution or not. - """ - super(SqueezeformerEncoder, self).__init__() - self.global_cmvn = global_cmvn - self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ - if type(reduce_idx) == int else reduce_idx - self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \ - if type(recover_idx) == int else recover_idx - self.check_ascending_list() - if reduce_idx is None: - self.time_reduce = None - else: - if recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - self.reduce_stride = 2 - self._output_size = output_size - self.normalize_before = normalize_before - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - self.pos_enc_layer_type = pos_enc_layer_type - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - encoder_dim, - attention_dropout_rate, - do_rel_shift, - adaptive_scale, - init_weights - ) - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - encoder_dim, - encoder_dim * feed_forward_expansion_factor, - feed_forward_dropout_rate, - activation, - adaptive_scale, - init_weights - ) - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = ( - encoder_dim, cnn_module_kernel, activation, - cnn_norm_type, causal, True, adaptive_scale, init_weights) - - self.embed = DepthwiseConv2dSubsampling4( - 1, encoder_dim, - RelPositionalEncoding(encoder_dim, dropout_rate=0.1), - dw_stride, - input_size, - input_dropout_rate, - init_weights - ) - - self.preln = nn.LayerNorm(encoder_dim) - self.encoders = 
torch.nn.ModuleList([SqueezeformerEncoderLayer( - encoder_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - convolution_layer(*convolution_layer_args), - positionwise_layer(*positionwise_layer_args), - normalize_before, - dropout, - concat_after) for _ in range(num_blocks) - ]) - if time_reduction_layer_type == 'conv1d': - time_reduction_layer = TimeReductionLayer1D - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - elif time_reduction_layer_type == 'stream': - time_reduction_layer = TimeReductionLayerStream - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - else: - time_reduction_layer = TimeReductionLayer2D - time_reduction_layer_args = {'encoder_dim': encoder_dim} - - self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args) - self.time_recover_layer = nn.Linear(encoder_dim, encoder_dim) - self.final_proj = None - if output_size != encoder_dim: - self.final_proj = nn.Linear(encoder_dim, output_size) - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - xs_lens = mask_pad.squeeze(1).sum(1) - xs = self.preln(xs) - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - for i, layer in enumerate(self.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, chunk_masks, pos_emb, mask_pad)) - xs, xs_lens, chunk_masks, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_chunk_masks, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - chunk_masks = recover_chunk_masks - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - xs = xs.masked_fill(~mask_pad[:, 0, :].unsqueeze(-1), 0.0) - - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return xs, masks - - def check_ascending_list(self): - if self.reduce_idx is not None: - assert self.reduce_idx == sorted(self.reduce_idx), \ - "reduce_idx should be int or ascending list" - if self.recover_idx is not None: - assert self.recover_idx == sorted(self.recover_idx), \ - "recover_idx should be int or ascending list" - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp 
= exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - recover_exp)) - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.preln(xs) - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, 
recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - if att_mask.size(1) != 0: - xs = xs.masked_fill(~att_mask[:, 0, :].unsqueeze(-1), 0.0) - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(0) - cached_att = cached_att.unsqueeze(3).\ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :]) - r_cnn_cache.append(cached_cnn) - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/squeezeformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/squeezeformer/encoder_layer.py deleted file mode 100644 index 3c6bdd44a20447cea91c0f965c666b844f4264be..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/squeezeformer/encoder_layer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""SqueezeformerEncoderLayer definition.""" - -import torch -import torch.nn as nn -from typing import Optional, Tuple - - -class SqueezeformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward1 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - feed_forward2 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. 
- """ - - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward1: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - feed_forward2: Optional[nn.Module] = None, - normalize_before: bool = False, - dropout_rate: float = 0.1, - concat_after: bool = False, - ): - super(SqueezeformerEncoderLayer, self).__init__() - self.size = size - self.self_attn = self_attn - self.layer_norm1 = nn.LayerNorm(size) - self.ffn1 = feed_forward1 - self.layer_norm2 = nn.LayerNorm(size) - self.conv_module = conv_module - self.layer_norm3 = nn.LayerNorm(size) - self.ffn2 = feed_forward2 - self.layer_norm4 = nn.LayerNorm(size) - self.normalize_before = normalize_before - self.dropout = nn.Dropout(dropout_rate) - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - # self attention module - residual = x - if self.normalize_before: - x = self.layer_norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.layer_norm1(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm2(x) - x = self.ffn1(x) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm2(x) - - # conv module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - residual = x - if self.normalize_before: - x = self.layer_norm3(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm3(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm4(x) - x = self.ffn2(x) - # we do not use dropout here since it is inside feed forward function - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm4(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/squeezeformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/squeezeformer/positionwise_feed_forward.py deleted file mode 100644 index 289062dcf3189f79a5ebb206990160d8665c613c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/squeezeformer/positionwise_feed_forward.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU(), - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.idim = idim - self.hidden_units = hidden_units - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - self.ada_scale = None - self.ada_bias = None - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, idim]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, idim]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - ffn1_max = self.idim ** -0.5 - ffn2_max = self.hidden_units ** -0.5 - torch.nn.init.uniform_(self.w_1.weight.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_1.bias.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_2.weight.data, -ffn2_max, ffn2_max) - torch.nn.init.uniform_(self.w_2.bias.data, -ffn2_max, ffn2_max) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - if self.adaptive_scale: - xs = self.ada_scale * xs + self.ada_bias - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/squeezeformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/squeezeformer/subsampling.py deleted file mode 100644 index fdb0101d6ebb54c42e710bbb0f35a6f7615ca567..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/squeezeformer/subsampling.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -"""DepthwiseConv2dSubsampling4 and TimeReductionLayer definition.""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -from wenet.transformer.subsampling import BaseSubsampling -from typing import Tuple -from wenet.squeezeformer.conv2d import Conv2dValid - - -class DepthwiseConv2dSubsampling4(BaseSubsampling): - """Depthwise Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - pos_enc_class (nn.Module): position encoding class. - dw_stride (int): Whether do depthwise convolution. - input_size (int): filter bank dimension. - - """ - - def __init__( - self, idim: int, odim: int, - pos_enc_class: torch.nn.Module, - dw_stride: bool = False, - input_size: int = 80, - input_dropout_rate: float = 0.1, - init_weights: bool = True - ): - super(DepthwiseConv2dSubsampling4, self).__init__() - self.idim = idim - self.odim = odim - self.pw_conv = nn.Conv2d( - in_channels=idim, out_channels=odim, kernel_size=3, stride=2) - self.act1 = nn.ReLU() - self.dw_conv = nn.Conv2d( - in_channels=odim, out_channels=odim, kernel_size=3, stride=2, - groups=odim if dw_stride else 1 - ) - self.act2 = nn.ReLU() - self.pos_enc = pos_enc_class - self.input_proj = nn.Sequential( - nn.Linear( - odim * (((input_size - 1) // 2 - 1) // 2), odim), - nn.Dropout(p=input_dropout_rate), - ) - if init_weights: - linear_max = (odim * input_size / 4) ** -0.5 - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.weight'], -linear_max, linear_max) - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.bias'], -linear_max, linear_max) - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: int = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.pw_conv(x) - x = self.act1(x) - x = self.dw_conv(x) - x = self.act2(x) - b, c, t, f = x.size() - x = x.permute(0, 2, 1, 3) - x = x.contiguous().view(b, t, c * f) - x, pos_emb = self.pos_enc(x, offset) - x = self.input_proj(x) - return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] - - -class TimeReductionLayer1D(nn.Module): - """ - Modified NeMo, - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. 
- """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 5, stride: int = 2): - super(TimeReductionLayer1D, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - self.padding = max(0, self.kernel_size - self.stride) - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. - if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayer2D(nn.Module): - def __init__( - self, kernel_size: int = 5, stride: int = 2, encoder_dim: int = 256): - super(TimeReductionLayer2D, self).__init__() - self.encoder_dim = encoder_dim - self.kernel_size = kernel_size - self.dw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=(kernel_size, 1), - stride=stride, - valid_trigy=True - ) - self.pw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=1, - stride=1, - valid_trigx=False, - valid_trigy=False, - ) - - self.kernel_size = kernel_size - self.stride = stride - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.encoder_dim ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward( - self, xs: torch.Tensor, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - xs = xs.masked_fill(mask_pad.transpose(1, 2).eq(0), 0.0) - xs = xs.unsqueeze(2) - padding1 = self.kernel_size - self.stride - xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), - mode='constant', value=0.) 
- xs = self.dw_conv(xs.permute(0, 3, 1, 2)) - xs = self.pw_conv(xs).permute(0, 3, 2, 1).squeeze(1).contiguous() - tmp_length = xs.size(1) - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - padding2 = max(0, (xs_lens.max() - tmp_length).data.item()) - batch_size, hidden = xs.size(0), xs.size(-1) - dummy_pad = torch.zeros(batch_size, padding2, hidden, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - mask = mask[:, ::2, ::2] - mask_pad = mask_pad[:, :, ::2] - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayerStream(nn.Module): - """ - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. - """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 1, stride: int = 2): - super(TimeReductionLayerStream, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=0, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. 
- if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transducer/joint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transducer/joint.py deleted file mode 100644 index f7cbaf62ee0bf4ffa127e5bbf4a49a64c2378495..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transducer/joint.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Optional - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation - - -class TransducerJoint(torch.nn.Module): - - def __init__(self, - voca_size: int, - enc_output_size: int, - pred_output_size: int, - join_dim: int, - prejoin_linear: bool = True, - postjoin_linear: bool = False, - joint_mode: str = 'add', - activation: str = "tanh"): - assert check_argument_types() - # TODO(Mddct): concat in future - assert joint_mode in ['add'] - super().__init__() - - self.activatoin = get_activation(activation) - self.prejoin_linear = prejoin_linear - self.postjoin_linear = postjoin_linear - self.joint_mode = joint_mode - - if not self.prejoin_linear and not self.postjoin_linear: - assert enc_output_size == pred_output_size == join_dim - # torchscript compatibility - self.enc_ffn: Optional[nn.Linear] = None - self.pred_ffn: Optional[nn.Linear] = None - if self.prejoin_linear: - self.enc_ffn = nn.Linear(enc_output_size, join_dim) - self.pred_ffn = nn.Linear(pred_output_size, join_dim) - # torchscript compatibility - self.post_ffn: Optional[nn.Linear] = None - if self.postjoin_linear: - self.post_ffn = nn.Linear(join_dim, join_dim) - - self.ffn_out = nn.Linear(join_dim, voca_size) - - def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor): - """ - Args: - enc_out (torch.Tensor): [B, T, E] - pred_out (torch.Tensor): [B, T, P] - Return: - [B,T,U,V] - """ - if (self.prejoin_linear and self.enc_ffn is not None - and self.pred_ffn is not None): - enc_out = self.enc_ffn(enc_out) # [B,T,E] -> [B,T,V] - pred_out = self.pred_ffn(pred_out) - - enc_out = enc_out.unsqueeze(2) # [B,T,V] -> [B,T,1,V] - pred_out = pred_out.unsqueeze(1) # [B,U,V] -> [B,1 U, V] - - # TODO(Mddct): concat joint - _ = self.joint_mode - out = enc_out + pred_out # [B,T,U,V] - - if self.postjoin_linear and self.post_ffn is not None: - out = self.post_ffn(out) - - out = self.activatoin(out) - out = self.ffn_out(out) - return out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transducer/predictor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transducer/predictor.py deleted file mode 100644 index 600e97a9d83646047ec3fc14f3087bd4df761c68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transducer/predictor.py +++ /dev/null @@ -1,482 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation, get_rnn - - -def ApplyPadding(input, padding, pad_value) -> torch.Tensor: - """ - Args: - input: [bs, max_time_step, dim] - padding: [bs, max_time_step] - """ - return 
padding * pad_value + input * (1 - padding) - - -class PredictorBase(torch.nn.Module): - - # NOTE(Mddct): We can use ABC abstract here, but - # keep this class simple enough for now - def __init__(self) -> None: - super().__init__() - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - _, _, _ = batch_size, method, device - raise NotImplementedError("this is a base precictor") - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ): - _, _, = input, cache - raise NotImplementedError("this is a base precictor") - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - _, _, _, = input, padding, cache - raise NotImplementedError("this is a base precictor") - - -class RNNPredictor(PredictorBase): - - def __init__(self, - voca_size: int, - embed_size: int, - output_size: int, - embed_dropout: float, - hidden_size: int, - num_layers: int, - bias: bool = True, - rnn_type: str = "lstm", - dropout: float = 0.1) -> None: - assert check_argument_types() - super().__init__() - self.n_layers = num_layers - self.hidden_size = hidden_size - # disable rnn base out projection - self.embed = nn.Embedding(voca_size, embed_size) - self.dropout = nn.Dropout(embed_dropout) - # NOTE(Mddct): rnn base from torch not support layer norm - # will add layer norm and prune value in cell and layer - # ref: https://github.com/Mddct/neural-lm/blob/main/models/gru_cell.py - self.rnn = get_rnn(rnn_type=rnn_type)(input_size=embed_size, - hidden_size=hidden_size, - num_layers=num_layers, - bias=bias, - batch_first=True, - dropout=dropout) - self.projection = nn.Linear(hidden_size, output_size) - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> torch.Tensor: - """ - Args: - input (torch.Tensor): [batch, max_time). - padding (torch.Tensor): [batch, max_time] - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - Returns: - output: [batch, max_time, output_size] - """ - - # NOTE(Mddct): we don't use pack input format - embed = self.embed(input) # [batch, max_time, emb_size] - embed = self.dropout(embed) - states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None - if cache is None: - state = self.init_state(batch_size=input.size(0), - device=input.device) - states = (state[0], state[1]) - else: - assert len(cache) == 2 - states = (cache[0], cache[1]) - out, (m, c) = self.rnn(embed, states) - out = self.projection(out) - - # NOTE(Mddct): Although we don't use staate in transducer - # training forward, we need make it right for padding value - # so we create forward_step for infering, forward for training - _, _ = m, c - return out - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache: [state_m, state_c] - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - Returns: - new_cache: [[state_m_1, state_c_1], [state_m_2, state_c_2]...] 
- """ - assert len(cache) == 2 - state_ms = cache[0] - state_cs = cache[1] - - assert state_ms.size(1) == state_cs.size(1) - - new_cache: List[List[torch.Tensor]] = [] - for state_m, state_c in zip(torch.split(state_ms, 1, dim=1), - torch.split(state_cs, 1, dim=1)): - new_cache.append([state_m, state_c]) - return new_cache - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[state_m_1, state_c_1], [state_m_1, state_c_1]...] - - Returns: - new_caceh: [state_ms, state_cs], - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - """ - state_ms = torch.cat([states[0] for states in cache], dim=1) - state_cs = torch.cat([states[1] for states in cache], dim=1) - return [state_ms, state_cs] - - def init_state( - self, - batch_size: int, - device: torch.device, - method: str = "zero", - ) -> List[torch.Tensor]: - assert batch_size > 0 - # TODO(Mddct): xavier init method - _ = method - return [ - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device), - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device) - ] - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - """ - assert len(cache) == 2 - state_m, state_c = cache[0], cache[1] - embed = self.embed(input) # [batch, 1, emb_size] - embed = self.dropout(embed) - out, (m, c) = self.rnn(embed, (state_m, state_c)) - - out = self.projection(out) - m = ApplyPadding(m, padding.unsqueeze(0), state_m) - c = ApplyPadding(c, padding.unsqueeze(0), state_c) - - return (out, [m, c]) - - -class EmbeddingPredictor(PredictorBase): - """Embedding predictor - - Described in: - https://arxiv.org/pdf/2109.07513.pdf - - embed-> proj -> layer norm -> swish - """ - - def __init__(self, - voca_size: int, - embed_size: int, - embed_dropout: float, - n_head: int, - history_size: int = 2, - activation: str = "swish", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - - assert check_argument_types() - super().__init__() - # multi head - self.num_heads = n_head - self.embed_size = embed_size - self.context_size = history_size + 1 - self.pos_embed = torch.nn.Linear(embed_size * self.context_size, - self.num_heads, - bias=bias) - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.ffn = nn.Linear(self.embed_size, self.embed_size) - self.norm = nn.LayerNorm(self.embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - _ = method - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device), - ] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] 
- """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - - input = input.unfold(1, self.context_size, 1).permute( - 0, 1, 3, 2) # [bs, seq_len, context_size, embed] - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - # broadcast dot attenton - input_expand = input.unsqueeze( - 2) # [bs, seq_len, 1, context_size, embed] - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - - # [bs, seq_len, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, seq_len, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, seq_len, num_heads, embed] - output = output.sum(dim=2) # [bs, seq_len, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - return output - - def forward_step( - self, - input: torch.Tensor, - padding: torch.Tensor, - cache: List[torch.Tensor], - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input_expand = context_input.unsqueeze(1).unsqueeze( - 2) # [bs, 1, 1, context_size, embed] - - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - # [bs, 1, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, 1, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, 1, num_heads, embed] - output = output.sum(dim=2) # [bs, 1, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - new_cache = context_input[:, 1:, :] - # TODO(Mddct): we need padding new_cache in future - # new_cache = ApplyPadding(history, padding, new_cache) - return (output, [new_cache]) - - -class ConvPredictor(PredictorBase): - - def __init__(self, - voca_size: 
int, - embed_size: int, - embed_dropout: float, - history_size: int = 2, - activation: str = "relu", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - assert check_argument_types() - super().__init__() - - assert history_size >= 0 - self.embed_size = embed_size - self.context_size = history_size + 1 - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.conv = nn.Conv1d(in_channels=embed_size, - out_channels=embed_size, - kernel_size=self.context_size, - padding=0, - groups=embed_size, - bias=bias) - self.norm = nn.LayerNorm(embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - assert method == "zero" - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device) - ] - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] - """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - input = input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - return out - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input = context_input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - - new_cache = context_input[:, 1:, :] - # TODO(Mddct): apply padding in future - return (out, [new_cache]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transducer/search/greedy_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transducer/search/greedy_search.py deleted file mode 100644 index ef7354562b6617b7be33bf32d673117eb1d3d547..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transducer/search/greedy_search.py +++ /dev/null @@ -1,54 +0,0 @@ 
-from typing import List - -import torch - - -def basic_greedy_search( - model: torch.nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - n_steps: int = 64, -) -> List[List[int]]: - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, - method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = n_steps - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, padding, - cache) # [1, 1, P] - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, - pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - - return [hyps] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transducer/search/prefix_beam_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transducer/search/prefix_beam_search.py deleted file mode 100644 index f00917717c16a73916586708ebfede54fa02a21f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transducer/search/prefix_beam_search.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import List, Tuple - -import torch -from wenet.utils.common import log_add - - -class Sequence(): - - __slots__ = {'hyp', 'score', 'cache'} - - def __init__( - self, - hyp: List[torch.Tensor], - score, - cache: List[torch.Tensor], - ): - self.hyp = hyp - self.score = score - self.cache = cache - - -class PrefixBeamSearch(): - - def __init__(self, encoder, predictor, joint, ctc, blank): - self.encoder = encoder - self.predictor = predictor - self.joint = joint - self.ctc = ctc - self.blank = blank - - def forward_decoder_one_step( - self, encoder_x: torch.Tensor, pre_t: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - padding = torch.zeros(pre_t.size(0), 1, device=encoder_x.device) - pre_t, new_cache = self.predictor.forward_step(pre_t.unsqueeze(-1), - padding, cache) - x = self.joint(encoder_x, pre_t) # [beam, 1, 1, vocab] - x = x.log_softmax(dim=-1) - return x, new_cache - - def prefix_beam_search(self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7): - """prefix beam search - also see wenet.transducer.transducer.beam_search - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - 
assert batch_size == 1 - - # 1. Encoder - encoder_out, _ = self.encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - - ctc_probs = self.ctc.log_softmax(encoder_out).squeeze(0) - beam_init: List[Sequence] = [] - - # 2. init beam using Sequence to save beam unit - cache = self.predictor.init_state(1, method="zero", device=device) - beam_init.append(Sequence(hyp=[self.blank], score=0.0, cache=cache)) - # 3. start decoding (notice: we use breathwise first searching) - # !!!! In this decoding method: one frame do not output multi units. !!!! - # !!!! Experiments show that this strategy has little impact !!!! - for i in range(maxlen): - # 3.1 building input - # decoder taking the last token to predict the next token - input_hyp = [s.hyp[-1] for s in beam_init] - input_hyp_tensor = torch.tensor(input_hyp, - dtype=torch.int, - device=device) - # building statement from beam - cache_batch = self.predictor.cache_to_batch( - [s.cache for s in beam_init]) - # build score tensor to do torch.add() function - scores = torch.tensor([s.score for s in beam_init]).to(device) - - # 3.2 forward decoder - logp, new_cache = self.forward_decoder_one_step( - encoder_out[:, i, :].unsqueeze(1), - input_hyp_tensor, - cache_batch, - ) # logp: (N, 1, 1, vocab_size) - logp = logp.squeeze(1).squeeze(1) # logp: (N, vocab_size) - new_cache = self.predictor.batch_to_cache(new_cache) - - # 3.3 shallow fusion for transducer score - # and ctc score where we can also add the LM score - logp = torch.log( - torch.add(transducer_weight * torch.exp(logp), - ctc_weight * torch.exp(ctc_probs[i].unsqueeze(0)))) - - # 3.4 first beam prune - top_k_logp, top_k_index = logp.topk(beam_size) # (N, N) - scores = torch.add(scores.unsqueeze(1), top_k_logp) - - # 3.5 generate new beam (N*N) - beam_A = [] - for j in range(len(beam_init)): - # update seq - base_seq = beam_init[j] - for t in range(beam_size): - # blank: only update the score - if top_k_index[j, t] == self.blank: - new_seq = Sequence(hyp=base_seq.hyp.copy(), - score=scores[j, t].item(), - cache=base_seq.cache) - - beam_A.append(new_seq) - # other unit: update hyp score statement and last - else: - hyp_new = base_seq.hyp.copy() - hyp_new.append(top_k_index[j, t].item()) - new_seq = Sequence(hyp=hyp_new, - score=scores[j, t].item(), - cache=new_cache[j]) - beam_A.append(new_seq) - - # 3.6 prefix fusion - fusion_A = [beam_A[0]] - for j in range(1, len(beam_A)): - s1 = beam_A[j] - if_do_append = True - for t in range(len(fusion_A)): - # notice: A_ can not fusion with A - if s1.hyp == fusion_A[t].hyp: - fusion_A[t].score = log_add( - [fusion_A[t].score, s1.score]) - if_do_append = False - break - if if_do_append: - fusion_A.append(s1) - - # 4. 
second pruned - fusion_A.sort(key=lambda x: x.score, reverse=True) - beam_init = fusion_A[:beam_size] - - return beam_init, encoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transducer/transducer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transducer/transducer.py deleted file mode 100644 index 821a0946e621353a18bededbd93a658e83b0e0e2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transducer/transducer.py +++ /dev/null @@ -1,453 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchaudio -from torch import nn -from torch.nn.utils.rnn import pad_sequence -from typeguard import check_argument_types -from wenet.transducer.predictor import PredictorBase -from wenet.transducer.search.greedy_search import basic_greedy_search -from wenet.transducer.search.prefix_beam_search import PrefixBeamSearch -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_blank, add_sos_eos, - reverse_pad_list) - - -class Transducer(ASRModel): - """Transducer-ctc-attention hybrid Encoder-Predictor-Decoder model""" - - def __init__( - self, - vocab_size: int, - blank: int, - encoder: nn.Module, - predictor: PredictorBase, - joint: nn.Module, - attention_decoder: Optional[Union[TransformerDecoder, - BiTransformerDecoder]] = None, - ctc: Optional[CTC] = None, - ctc_weight: float = 0, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - transducer_weight: float = 1.0, - attention_weight: float = 0.0, - ) -> None: - assert check_argument_types() - assert attention_weight + ctc_weight + transducer_weight == 1.0 - super().__init__(vocab_size, encoder, attention_decoder, ctc, - ctc_weight, ignore_id, reverse_weight, lsm_weight, - length_normalized_loss) - - self.blank = blank - self.transducer_weight = transducer_weight - self.attention_decoder_weight = 1 - self.transducer_weight - self.ctc_weight - - self.predictor = predictor - self.joint = joint - self.bs = None - - # Note(Mddct): decoder also means predictor in transducer, - # but here decoder is attention decoder - del self.criterion_att - if attention_decoder is not None: - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + predictor + joint + loss - - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - - # Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - # predictor - ys_in_pad = add_blank(text, self.blank, self.ignore_id) - predictor_out = self.predictor(ys_in_pad) - # joint - joint_out = self.joint(encoder_out, predictor_out) - # NOTE(Mddct): some loss implementation require pad valid is zero - # torch.int32 rnnt_loss required - rnnt_text = text.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - rnnt_text_lengths = text_lengths.to(torch.int32) - encoder_out_lens = encoder_out_lens.to(torch.int32) - loss = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - encoder_out_lens, - rnnt_text_lengths, - blank=self.blank, - reduction="mean") - loss_rnnt = loss - - loss = self.transducer_weight * loss - # optional attention decoder - loss_att: Optional[torch.Tensor] = None - if self.attention_decoder_weight != 0.0 and self.decoder is not None: - loss_att, _ = self._calc_att_loss(encoder_out, encoder_mask, text, - text_lengths) - - # optional ctc - loss_ctc: Optional[torch.Tensor] = None - if self.ctc_weight != 0.0 and self.ctc is not None: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is not None: - loss = loss + self.ctc_weight * loss_ctc.sum() - if loss_att is not None: - loss = loss + self.attention_decoder_weight * loss_att.sum() - # NOTE: 'loss' must be in dict - return { - 'loss': loss, - 'loss_att': loss_att, - 'loss_ctc': loss_ctc, - 'loss_rnnt': loss_rnnt, - } - - def init_bs(self): - if self.bs is None: - self.bs = PrefixBeamSearch(self.encoder, self.predictor, - self.joint, self.ctc, self.blank) - - def _cal_transducer_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_lens: torch.Tensor, - hyps_pad: torch.Tensor, - ): - # ignore id -> blank, add blank at head - hyps_pad_blank = add_blank(hyps_pad, self.blank, self.ignore_id) - xs_in_lens = encoder_mask.squeeze(1).sum(1).int() - - # 1. Forward predictor - predictor_out = self.predictor(hyps_pad_blank) - # 2. Forward joint - joint_out = self.joint(encoder_out, predictor_out) - rnnt_text = hyps_pad.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - # 3. 
Compute transducer loss - loss_td = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - xs_in_lens, - hyps_lens.int(), - blank=self.blank, - reduction='none') - return loss_td * -1 - - def _cal_attn_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_pad: torch.Tensor, - hyps_lens: torch.Tensor, - ): - # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - - # td_score = loss_td * -1 - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - self.reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - return decoder_out, r_decoder_out - - def beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7, - ): - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight in transducer - prefix beam search. - final_prob = ctc_weight * ctc_prob + transducer_weight * transducer_prob - transducer_weight (float): transducer probability weight in - prefix beam search - Returns: - List[List[int]]: best path result - - """ - self.init_bs() - beam, _ = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size, - beam_size, - num_decoding_left_chunks, - simulate_streaming, - ctc_weight, - transducer_weight, - ) - return beam[0].hyp[1:], beam[0].score - - def transducer_attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ctc_weight: float = 0.0, - attn_weight: float = 0.0, - transducer_weight: float = 0.0, - search_ctc_weight: float = 1.0, - search_transducer_weight: float = 0.0, - beam_search_type: str = 'transducer') -> List[List[int]]: - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight using in rescoring. - rescore_prob = ctc_weight * ctc_prob + - transducer_weight * (transducer_loss * -1) + - attn_weight * attn_prob - attn_weight (float): attn probability weight using in rescoring. - transducer_weight (float): transducer probability weight using in - rescoring - search_ctc_weight (float): ctc weight using - in rnnt beam search (seeing in self.beam_search) - search_transducer_weight (float): transducer weight using - in rnnt beam search (seeing in self.beam_search) - Returns: - List[List[int]]: best path result - - """ - - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - self.init_bs() - if beam_search_type == 'transducer': - beam, encoder_out = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - beam_size=beam_size, - num_decoding_left_chunks=num_decoding_left_chunks, - ctc_weight=search_ctc_weight, - transducer_weight=search_transducer_weight, - ) - beam_score = [s.score for s in beam] - hyps = [s.hyp[1:] for s in beam] - - elif beam_search_type == 'ctc': - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, - speech_lengths, - beam_size=beam_size, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) - beam_score = [hyp[1] for hyp in hyps] - hyps = [hyp[0] for hyp in hyps] - assert len(hyps) == beam_size - - # build hyps and encoder output - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - - # 2.1 calculate transducer score - td_score = self._cal_transducer_score( - encoder_out, - encoder_mask, - hyps_lens, - hyps_pad, - ) - # 2.2 calculate attention score - decoder_out, r_decoder_out = self._cal_attn_score( - encoder_out, - encoder_mask, - hyps_pad, - hyps_lens, - ) - - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp)][self.eos] - td_s = td_score[i] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp): - r_score += r_decoder_out[i][len(hyp) - j - 1][w] - r_score += r_decoder_out[i][len(hyp)][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score = score * attn_weight + \ - beam_score[i] * ctc_weight + \ - td_s * transducer_weight - if score > best_score: - best_score = score - best_index = i - - return hyps[best_index], best_score - - def greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, 
- num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - n_steps: int = 64, - ) -> List[List[int]]: - """ greedy search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - # TODO(Mddct): batch decode - assert speech.size(0) == 1 - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - # TODO(Mddct): forward chunk by chunk - _ = simulate_streaming - # Let's assume B = batch_size - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size, - num_decoding_left_chunks, - ) - encoder_out_lens = encoder_mask.squeeze(1).sum() - hyps = basic_greedy_search(self, - encoder_out, - encoder_out_lens, - n_steps=n_steps) - - return hyps - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def forward_predictor_step( - self, xs: torch.Tensor, cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - assert len(cache) == 2 - # fake padding - padding = torch.zeros(1, 1) - return self.predictor.forward_step(xs, padding, cache) - - @torch.jit.export - def forward_joint_step(self, enc_out: torch.Tensor, - pred_out: torch.Tensor) -> torch.Tensor: - return self.joint(enc_out, pred_out) - - @torch.jit.export - def forward_predictor_init_state(self) -> List[torch.Tensor]: - return self.predictor.init_state(1, device=torch.device("cpu")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/asr_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/asr_model.py deleted file mode 100644 index 4288f68472d63ce4bf270c5f377d62fa7408713e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/asr_model.py +++ /dev/null @@ -1,904 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
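As a hedged aside on the transducer module removed above: its `@torch.jit.export` hooks (`forward_predictor_init_state`, `forward_predictor_step`, `forward_joint_step`) are exactly the pieces needed for one step of greedy RNN-T decoding outside the training code. A minimal usage sketch, assuming `scripted_model` is the scripted `Transducer`, `enc_frame` is a single encoder frame of shape `(1, 1, encoder_dim)`, and `blank_id` is the model's blank token — all three names are placeholders, not part of the removed file:

```python
import torch

# One greedy RNN-T step built from the exported hooks above (a sketch, not the
# project's runtime; `scripted_model`, `enc_frame`, `blank_id` are assumed).
cache = scripted_model.forward_predictor_init_state()
pred_in = torch.tensor([[blank_id]])                      # decoding starts from the blank symbol
pred_out, cache = scripted_model.forward_predictor_step(pred_in, cache)
joint_out = scripted_model.forward_joint_step(enc_frame, pred_out)
token = joint_out.log_softmax(dim=-1).argmax(dim=-1).squeeze()
# Emit `token` and feed it back as the next pred_in if it is not blank;
# otherwise advance to the next encoder frame (compare basic_greedy_search above).
```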
-# Modified from ESPnet(https://github.com/espnet/espnet) - -from collections import defaultdict -from typing import Dict, List, Optional, Tuple - -import torch - -from torch.nn.utils.rnn import pad_sequence - -try: - import k2 - from icefall.utils import get_texts - from icefall.decode import get_lattice, Nbest, one_best_decoding -except ImportError: - print('Failed to import k2 and icefall. \ - Notice that they are necessary for hlg_onebest and hlg_rescore') - -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import TransformerEncoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, - remove_duplicates_and_blank, th_accuracy, - reverse_pad_list) -from wenet.utils.mask import (make_pad_mask, mask_finished_preds, - mask_finished_scores, subsequent_mask) - - -class ASRModel(torch.nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - def __init__( - self, - vocab_size: int, - encoder: TransformerEncoder, - decoder: TransformerDecoder, - ctc: CTC, - ctc_weight: float = 0.5, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - ): - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - - self.encoder = encoder - self.decoder = decoder - self.ctc = ctc - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - # 1. Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # 2a. Attention-decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths) - else: - loss_att = None - - # 2b. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is None: - loss = loss_att - elif loss_att is None: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + (1 - - self.ctc_weight) * loss_att - return {"loss": loss, "loss_att": loss_att, "loss_ctc": loss_ctc} - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, float]: - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # reverse the seq, used for right to left decoder - r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) - r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, - self.ignore_id) - # 1. Forward decoder - decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, - ys_in_pad, ys_in_lens, - r_ys_in_pad, - self.reverse_weight) - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - r_loss_att = torch.tensor(0.0) - if self.reverse_weight > 0.0: - r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * ( - 1 - self.reverse_weight) + r_loss_att * self.reverse_weight - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - return loss_att, acc_att - - def _forward_encoder( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Let's assume B = batch_size - # 1. Encoder - if simulate_streaming and decoding_chunk_size > 0: - encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( - speech, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - else: - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - return encoder_out, encoder_mask - - def recognize( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int = 10, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> torch.Tensor: - """ Apply beam search on attention decoder - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - torch.Tensor: decoding result, (batch, max_result_len) - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = torch.ones([running_size, 1], dtype=torch.long, - device=device).fill_(self.sos) # (B*N, 1) - scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), - dtype=torch.float) - scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) - cache: Optional[List[torch.Tensor]] = None - # 2. Decoder forward step by step - for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: - break - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) - logp, cache = self.decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - # 2.3 Second beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - # Update cache to be consistent with new topk scores / hyps - cache_index = (offset_k_index // beam_size).view(-1) # (B*N) - base_cache_index = (torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) * beam_size).view(-1) # (B*N) - cache_index = base_cache_index + cache_index - cache = [torch.index_select(c, dim=0, index=cache_index) for c in cache] - scores = scores.view(-1, 1) # (B*N, 1) - # 2.4. Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = torch.index_select(top_k_index.view(-1), - dim=-1, - index=best_k_index) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = torch.index_select( - hyps, dim=0, index=best_hyps_index) # (B*N, i) - hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) - - # 3. 
Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_scores, best_index = scores.max(dim=-1) - best_hyps_index = best_index + torch.arange( - batch_size, dtype=torch.long, device=device) * beam_size - best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) - best_hyps = best_hyps[:, 1:] - return best_hyps, best_scores - - def ctc_greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[List[int]]: - """ Apply CTC greedy search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # Let's assume B = batch_size - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - return hyps, scores - - def _ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[List[List[int]], torch.Tensor]: - """ CTC prefix beam search inner implementation - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[List[int]]: nbest results - torch.Tensor: encoder output, (1, max_len, encoder_dim), - it will be used for rescoring in attention rescoring mode - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # For CTC prefix beam search, we only support batch_size=1 - assert batch_size == 1 - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder forward and get CTC score - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - ctc_probs = ctc_probs.squeeze(0) - # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) - cur_hyps = [(tuple(), (0.0, -float('inf')))] - # 2. CTC beam search step by step - for t in range(0, maxlen): - logp = ctc_probs[t] # (vocab_size,) - # key: prefix, value (pb, pnb), default value(-inf, -inf) - next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) - # 2.1 First beam prune: select topk best - top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) - for s in top_k_index: - s = s.item() - ps = logp[s].item() - for prefix, (pb, pnb) in cur_hyps: - last = prefix[-1] if len(prefix) > 0 else None - if s == 0: # blank - n_pb, n_pnb = next_hyps[prefix] - n_pb = log_add([n_pb, pb + ps, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - elif s == last: - # Update *ss -> *s; - n_pb, n_pnb = next_hyps[prefix] - n_pnb = log_add([n_pnb, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - # Update *s-s -> *ss, - is for blank - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - else: - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - - # 2.2 Second beam prune - next_hyps = sorted(next_hyps.items(), - key=lambda x: log_add(list(x[1])), - reverse=True) - cur_hyps = next_hyps[:beam_size] - hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] - return hyps, encoder_out - - def ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[int]: - """ Apply CTC prefix beam search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[int]: CTC prefix beam search nbest results - """ - hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, - beam_size, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) - return hyps[0] - - def attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - ctc_weight: float = 0.0, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ) -> List[int]: - """ Apply attention rescoring decoding, CTC prefix beam search - is applied first to get nbest, then we resoring the nbest on - attention decoder with corresponding encoder out - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - reverse_weight (float): right to left decoder weight - ctc_weight (float): ctc score weight - - Returns: - List[int]: Attention rescoring result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, speech_lengths, beam_size, decoding_chunk_size, - num_decoding_left_chunks, simulate_streaming) - - assert len(hyps) == beam_size - hyps_pad = pad_sequence([ - torch.tensor(hyp[0], device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
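The `_ctc_prefix_beam_search` above tracks, for every prefix, two log-probabilities: the prefix ending in blank (`pb`) and ending in a non-blank symbol (`pnb`). The following self-contained toy re-implements that recursion without the top-k symbol pruning, purely to make the three update cases easier to follow; the function name and the example input are made up for illustration:

```python
import math
from collections import defaultdict

def log_add(xs):
    """Numerically stable log(sum(exp(x) for x in xs))."""
    m = max(xs)
    if m == -float('inf'):
        return -float('inf')
    return m + math.log(sum(math.exp(x - m) for x in xs))

def toy_ctc_prefix_beam_search(log_probs, beam_size=3, blank=0):
    """log_probs: nested lists of shape (T, vocab) holding per-frame log-probabilities."""
    # prefix -> (log P(prefix, ends in blank), log P(prefix, ends in non-blank))
    cur_hyps = [(tuple(), (0.0, -float('inf')))]
    for frame in log_probs:
        next_hyps = defaultdict(lambda: (-float('inf'), -float('inf')))
        for s, ps in enumerate(frame):      # the real code first prunes to the top-k symbols
            for prefix, (pb, pnb) in cur_hyps:
                last = prefix[-1] if prefix else None
                if s == blank:
                    n_pb, n_pnb = next_hyps[prefix]
                    next_hyps[prefix] = (log_add([n_pb, pb + ps, pnb + ps]), n_pnb)
                elif s == last:
                    # repeated symbol collapses onto the same prefix...
                    n_pb, n_pnb = next_hyps[prefix]
                    next_hyps[prefix] = (n_pb, log_add([n_pnb, pnb + ps]))
                    # ...unless the previous copy ended in blank, which starts a new symbol
                    n_prefix = prefix + (s,)
                    n_pb, n_pnb = next_hyps[n_prefix]
                    next_hyps[n_prefix] = (n_pb, log_add([n_pnb, pb + ps]))
                else:
                    n_prefix = prefix + (s,)
                    n_pb, n_pnb = next_hyps[n_prefix]
                    next_hyps[n_prefix] = (n_pb, log_add([n_pnb, pb + ps, pnb + ps]))
        cur_hyps = sorted(next_hyps.items(),
                          key=lambda kv: log_add(list(kv[1])),
                          reverse=True)[:beam_size]
    return [(list(p), log_add(list(v))) for p, v in cur_hyps]

# e.g. two frames over a 3-symbol vocabulary (index 0 is blank)
frames = [[math.log(0.6), math.log(0.3), math.log(0.1)],
          [math.log(0.5), math.log(0.4), math.log(0.1)]]
print(toy_ctc_prefix_beam_search(frames, beam_size=2))
```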
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp[0]): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp[0])][self.eos] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp[0]): - r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] - r_score += r_decoder_out[i][len(hyp[0])][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score += hyp[1] * ctc_weight - if score > best_score: - best_score = score - best_index = i - return hyps[best_index][0], best_score - - def load_hlg_resource_if_necessary(self, hlg, word): - if not hasattr(self, 'hlg'): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.hlg = k2.Fsa.from_dict(torch.load(hlg, map_location=device)) - if not hasattr(self.hlg, "lm_scores"): - self.hlg.lm_scores = self.hlg.scores.clone() - if not hasattr(self, 'word_table'): - self.word_table = {} - with open(word, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - self.word_table[int(arr[1])] = arr[0] - - @torch.no_grad() - def hlg_onebest( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - best_path = one_best_decoding(lattice=lattice, use_double_scores=True) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.no_grad() - def hlg_rescore( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - lm_scale: float = 0, - decoder_scale: float = 0, - r_decoder_scale: float = 0, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - device = speech.device - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - 
search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=100, - use_double_scores=True, - nbest_scale=0.5,) - nbest = nbest.intersect(lattice) - assert hasattr(nbest.fsa, "lm_scores") - assert hasattr(nbest.fsa, "tokens") - assert isinstance(nbest.fsa.tokens, torch.Tensor) - - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.tokens) - tokens = tokens.remove_values_leq(0) - hyps = tokens.tolist() - - # cal attention_score - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out_repeat = [] - tot_scores = nbest.tot_scores() - repeats = [tot_scores[i].shape[0] for i in range(tot_scores.dim0)] - for i in range(len(encoder_out)): - encoder_out_repeat.append(encoder_out[i: i + 1].repeat(repeats[i], 1, 1)) - encoder_out = torch.concat(encoder_out_repeat, dim=0) - encoder_mask = torch.ones(encoder_out.size(0), - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - reverse_weight = 0.5 - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out - - decoder_scores = torch.tensor([sum([decoder_out[i, j, hyps[i][j]] - for j in range(len(hyps[i]))]) - for i in range(len(hyps))], device=device) - r_decoder_scores = [] - for i in range(len(hyps)): - score = 0 - for j in range(len(hyps[i])): - score += r_decoder_out[i, len(hyps[i]) - j - 1, hyps[i][j]] - score += r_decoder_out[i, len(hyps[i]), self.eos] - r_decoder_scores.append(score) - r_decoder_scores = torch.tensor(r_decoder_scores, device=device) - - am_scores = nbest.compute_am_scores() - ngram_lm_scores = nbest.compute_lm_scores() - tot_scores = am_scores.values + lm_scale * ngram_lm_scores.values + \ - decoder_scale * decoder_scores + r_decoder_scale * r_decoder_scores - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.jit.export - def subsampling_rate(self) -> int: - """ Export interface for c++ call, return subsampling_rate of the - model - """ - return self.encoder.embed.subsampling_rate - - @torch.jit.export - def right_context(self) -> int: - """ Export interface for c++ call, return right_context of the model - """ - return self.encoder.embed.right_context - - @torch.jit.export - def sos_symbol(self) -> int: - """ Export interface for c++ call, return sos symbol id of the model - """ - return self.sos - - @torch.jit.export - def eos_symbol(self) -> int: - """ Export interface for c++ call, return eos symbol id of the model - """ - return self.eos - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, give input chunk xs, and return - output from time 0 to current chunk. - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
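Both `attention_rescoring` and `hlg_rescore` above score a candidate transcript by summing the decoder's log-probability of each reference token and then adding the `<eos>` term at position `len(hyp)`. A tiny runnable toy of that idiom, with made-up shapes and token ids:

```python
import torch

vocab_size, eos = 5, 4                                            # toy values
decoder_out = torch.randn(2, 4, vocab_size).log_softmax(dim=-1)   # (n_hyps, max_len + 1, vocab)
hyps = [[1, 2, 3], [2, 2]]

scores = []
for i, hyp in enumerate(hyps):
    s = sum(decoder_out[i, j, w].item() for j, w in enumerate(hyp))
    s += decoder_out[i, len(hyp), eos].item()                     # close the hypothesis with <eos>
    scores.append(s)
print(scores)                                                     # higher (less negative) is better
```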
- - """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor: - """ Export interface for c++ call, apply linear transform and log - softmax before ctc - Args: - xs (torch.Tensor): encoder output - - Returns: - torch.Tensor: activation before ctc - - """ - return self.ctc.log_softmax(xs) - - @torch.jit.export - def is_bidirectional_decoder(self) -> bool: - """ - Returns: - torch.Tensor: decoder output - """ - if hasattr(self.decoder, 'right_decoder'): - return True - else: - return False - - @torch.jit.export - def forward_attention_decoder( - self, - hyps: torch.Tensor, - hyps_lens: torch.Tensor, - encoder_out: torch.Tensor, - reverse_weight: float = 0, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, forward decoder with multiple - hypothesis from ctc prefix beam search and one encoder output - Args: - hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad sos at the begining - hyps_lens (torch.Tensor): length of each hyp in hyps - encoder_out (torch.Tensor): corresponding encoder output - r_hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad eos at the begining which is used fo right to left decoder - reverse_weight: used for verfing whether used right to left decoder, - > 0 will use. - - Returns: - torch.Tensor: decoder output - """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps - encoder_out = encoder_out.repeat(num_hyps, 1, 1) - encoder_mask = torch.ones(num_hyps, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=encoder_out.device) - - # input for right to left decoder - # this hyps_lens has count token, we need minus it. - r_hyps_lens = hyps_lens - 1 - # this hyps has included token, so it should be - # convert the original hyps. - r_hyps = hyps[:, 1:] - # >>> r_hyps - # >>> tensor([[ 1, 2, 3], - # >>> [ 9, 8, 4], - # >>> [ 2, -1, -1]]) - # >>> r_hyps_lens - # >>> tensor([3, 3, 1]) - - # NOTE(Mddct): `pad_sequence` is not supported by ONNX, it is used - # in `reverse_pad_list` thus we have to refine the below code. 
- # Issue: https://github.com/wenet-e2e/wenet/issues/1113 - # Equal to: - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = torch.max(r_hyps_lens) - index_range = torch.arange(0, max_len, 1).to(encoder_out.device) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - # >>> seq_mask - # >>> tensor([[ True, True, True], - # >>> [ True, True, True], - # >>> [ True, False, False]]) - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - r_hyps = torch.gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = torch.where(seq_mask, r_hyps, self.eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - r_hyps = torch.cat([hyps[:, 0:1], r_hyps], dim=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps, hyps_lens, r_hyps, - reverse_weight) # (num_hyps, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - - # right to left decoder may be not used during decoding process, - # which depends on reverse_weight param. - # r_dccoder_out will be 0.0, if reverse_weight is 0.0 - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - return decoder_out, r_decoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/attention.py deleted file mode 100644 index 6ee5e313edf2e88a844ce004c0f819b0bd3260f6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/attention.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple - -import torch -from torch import nn - - -class MultiHeadedAttention(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
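The `forward_attention_decoder` shown above replaces `reverse_pad_list` (not exportable to ONNX because of `pad_sequence`) with mask and index arithmetic plus `torch.gather`. The doctest-style comments can be reproduced directly; here is a standalone check of that reversal, where `eos = 2` is just a placeholder id:

```python
import torch

r_hyps = torch.tensor([[1, 2, 3], [9, 8, 4], [2, -1, -1]])
r_hyps_lens = torch.tensor([3, 3, 1])
eos = 2                                            # placeholder id for illustration

max_len = int(r_hyps_lens.max())
index_range = torch.arange(0, max_len)
seq_len_expand = r_hyps_lens.unsqueeze(1)
seq_mask = seq_len_expand > index_range            # marks real tokens vs padding
index = (seq_len_expand - 1) - index_range         # reversed positions (negative on padding)
index = index * seq_mask                           # clamp padding positions to index 0
r_hyps_rev = torch.gather(r_hyps, 1, index)
r_hyps_rev = torch.where(seq_mask, r_hyps_rev, torch.full_like(r_hyps_rev, eos))
print(r_hyps_rev)                                  # tensor([[3, 2, 1], [4, 8, 9], [2, 2, 2]])
```

Padding positions are first clamped to index 0 so `gather` stays in range, then overwritten with `eos` by the final `where`, which matches the commented `>>>` traces in the removed code.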
- - """ - def __init__(self, n_head: int, n_feat: int, dropout_rate: float): - """Construct an MultiHeadedAttention object.""" - super().__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward_qkv( - self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor, size - (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor, size - (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor, size - (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. 
- - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - 1.When applying cross attention between decoder and encoder, - the batch padding mask for input is in (#batch, 1, T) shape. - 2.When applying self attention of encoder, - the mask is in (#batch, T, T) shape. - 3.When applying self attention of decoder, - the mask is in (#batch, L, L) shape. - 4.If the different position in decoder see different block - of the encoder, such as Mocha, the passed in mask could be - in (#batch, L, T) shape. But there is no such case in current - Wenet. - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - """ - q, k, v = self.forward_qkv(query, key, value) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. - new_cache = torch.cat((k, v), dim=-1) - - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - return self.forward_attention(v, scores, mask), new_cache - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). 
- zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
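The cache comment above relies on concatenating and splitting zero-sized tensors being no-ops, which is what lets the export path always run the same concat/split code even for the first chunk. A quick standalone check of the two claims in that comment:

```python
import torch

a = torch.ones(1, 2, 0, 4)          # an "empty" KV cache (cache_t == 0)
b = torch.ones(1, 2, 3, 4)          # the freshly computed keys/values
c = torch.cat((a, b), dim=2)        # prepending the empty cache changes nothing
assert torch.equal(b, c)

d = torch.split(a, 2, dim=-1)       # splitting an empty cache yields two equal empty halves
assert torch.equal(d[0], d[1])
```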
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/cmvn.py deleted file mode 100644 index 3a1e7457fd3788d9a7e031e96517505a65925102..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/cmvn.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -class GlobalCMVN(torch.nn.Module): - def __init__(self, - mean: torch.Tensor, - istd: torch.Tensor, - norm_var: bool = True): - """ - Args: - mean (torch.Tensor): mean stats - istd (torch.Tensor): inverse std, std which is 1.0 / std - """ - super().__init__() - assert mean.shape == istd.shape - self.norm_var = norm_var - # The buffer can be accessed from this module using self.mean - self.register_buffer("mean", mean) - self.register_buffer("istd", istd) - - def forward(self, x: torch.Tensor): - """ - Args: - x (torch.Tensor): (batch, max_len, feat_dim) - - Returns: - (torch.Tensor): normalized feature - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/convolution.py deleted file mode 100644 index 2cf9794e14ea7441ccd30ab52202ac02fb25c2b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/convolution.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). 
- """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. - new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/ctc.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/ctc.py deleted file mode 100644 index 3dfcbaa324ffc26afa9ceaeb75007eb312546326..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/ctc.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -import torch -import torch.nn.functional as F -from typeguard import check_argument_types - - -class CTC(torch.nn.Module): - """CTC module""" - def __init__( - self, - odim: int, - encoder_output_size: int, - dropout_rate: float = 0.0, - reduce: bool = True, - ): - """ Construct CTC module - Args: - odim: dimension of outputs - encoder_output_size: number of encoder projection units - dropout_rate: dropout rate (0.0 ~ 1.0) - reduce: reduce the CTC loss into a scalar - """ - assert check_argument_types() - super().__init__() - eprojs = encoder_output_size - self.dropout_rate = dropout_rate - self.ctc_lo = torch.nn.Linear(eprojs, odim) - - reduction_type = "sum" if reduce else "none" - self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) - - def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, - ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: - """Calculate CTC loss. 
- - Args: - hs_pad: batch of padded hidden state sequences (B, Tmax, D) - hlens: batch of lengths of hidden state sequences (B) - ys_pad: batch of padded character id sequence tensor (B, Lmax) - ys_lens: batch of lengths of character sequence (B) - """ - # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) - ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) - # ys_hat: (B, L, D) -> (L, B, D) - ys_hat = ys_hat.transpose(0, 1) - ys_hat = ys_hat.log_softmax(2) - loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) - # Batch-size average - loss = loss / ys_hat.size(1) - return loss - - def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """log_softmax of frame activations - - Args: - Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) - """ - return F.log_softmax(self.ctc_lo(hs_pad), dim=2) - - def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """argmax of frame activations - - Args: - torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: argmax applied 2d tensor (B, Tmax) - """ - return torch.argmax(self.ctc_lo(hs_pad), dim=2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/decoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/decoder.py deleted file mode 100644 index c31853d9e868c99290b8d597f53d9a680202c82c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/decoder.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Decoder definition.""" -from typing import Tuple, List, Optional - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.decoder_layer import DecoderLayer -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.utils.mask import (subsequent_mask, make_pad_mask) - - -class TransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. 
- concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__() - attention_dim = encoder_output_size - - if input_layer == "embed": - self.embed = torch.nn.Sequential( - torch.nn.Embedding(vocab_size, attention_dim), - PositionalEncoding(attention_dim, positional_dropout_rate), - ) - else: - raise ValueError(f"only 'embed' is supported: {input_layer}") - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) - self.use_output_layer = use_output_layer - self.output_layer = torch.nn.Linear(attention_dim, vocab_size) - self.num_blocks = num_blocks - self.decoders = torch.nn.ModuleList([ - DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(self.num_blocks) - ]) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor = torch.empty(0), - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: not used in transformer decoder, in order to unify api - with bidirectional decoder - reverse_weight: not used in transformer decoder, in order to unify - api with bidirectional decode - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - torch.tensor(0.0), in order to unify api with bidirectional decoder - olens: (batch, ) - """ - tgt = ys_in_pad - maxlen = tgt.size(1) - # tgt_mask: (B, 1, L) - tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) - tgt_mask = tgt_mask.to(tgt.device) - # m: (1, L, L) - m = subsequent_mask(tgt_mask.size(-1), - device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m - x, _ = self.embed(tgt) - for layer in self.decoders: - x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, - memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.use_output_layer: - x = self.output_layer(x) - olens = tgt_mask.sum(1) - return x, torch.tensor(0.0), olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - x, _ = self.embed(tgt) - new_cache = [] - for i, decoder in enumerate(self.decoders): - if cache is None: - c = None - else: - c = cache[i] - x, tgt_mask, memory, memory_mask = decoder(x, - tgt_mask, - memory, - memory_mask, - cache=c) - new_cache.append(x) - if self.normalize_before: - y = self.after_norm(x[:, -1]) - else: - y = x[:, -1] - if self.use_output_layer: - y = torch.log_softmax(self.output_layer(y), dim=-1) - return y, new_cache - - -class BiTransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - r_num_blocks: the number of right to left decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - r_num_blocks: int = 0, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - - assert check_argument_types() - super().__init__() - self.left_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - self.right_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - r_num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor, - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), - used for right to left decoder - reverse_weight: used for right to left decoder - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - r_x: x: decoded token score (right to left decoder) - before softmax (batch, maxlen_out, vocab_size) - if use_output_layer is True, - olens: (batch, ) - """ - l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, - ys_in_lens) - r_x = torch.tensor(0.0) - if reverse_weight > 0.0: - r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, - ys_in_lens) - return l_x, r_x, olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - return self.left_decoder.forward_one_step(memory, memory_mask, tgt, - tgt_mask, cache) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/decoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/decoder_layer.py deleted file mode 100644 index 6b52aa6ab730dc51b18f0787e8236ab10c1e9cad..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/decoder_layer.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoder self-attention layer definition.""" -from typing import Optional, Tuple - -import torch -from torch import nn - - -class DecoderLayer(nn.Module): - """Single decoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn (torch.nn.Module): Inter-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. 
- dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's inpu - and output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: nn.Module, - src_attn: nn.Module, - feed_forward: nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an DecoderLayer object.""" - super().__init__() - self.size = size - self.self_attn = self_attn - self.src_attn = src_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.norm3 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear1 = nn.Linear(size + size, size) - self.concat_linear2 = nn.Linear(size + size, size) - else: - self.concat_linear1 = nn.Identity() - self.concat_linear2 = nn.Identity() - - def forward( - self, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - memory: torch.Tensor, - memory_mask: torch.Tensor, - cache: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute decoded features. - - Args: - tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). - tgt_mask (torch.Tensor): Mask for input tensor - (#batch, maxlen_out). - memory (torch.Tensor): Encoded memory - (#batch, maxlen_in, size). - memory_mask (torch.Tensor): Encoded memory mask - (#batch, maxlen_in). - cache (torch.Tensor): cached tensors. - (#batch, maxlen_out - 1, size). - - Returns: - torch.Tensor: Output tensor (#batch, maxlen_out, size). - torch.Tensor: Mask for output tensor (#batch, maxlen_out). - torch.Tensor: Encoded memory (#batch, maxlen_in, size). - torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
- - """ - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - - if cache is None: - tgt_q = tgt - tgt_q_mask = tgt_mask - else: - # compute only the last frame query keeping dim: max_time_out -> 1 - assert cache.shape == ( - tgt.shape[0], - tgt.shape[1] - 1, - self.size, - ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" - tgt_q = tgt[:, -1:, :] - residual = residual[:, -1:, :] - tgt_q_mask = tgt_mask[:, -1:, :] - - if self.concat_after: - tgt_concat = torch.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) - x = residual + self.concat_linear1(tgt_concat) - else: - x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - if self.concat_after: - x_concat = torch.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) - x = residual + self.concat_linear2(x_concat) - else: - x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) - if not self.normalize_before: - x = self.norm2(x) - - residual = x - if self.normalize_before: - x = self.norm3(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm3(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, tgt_mask, memory, memory_mask diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/embedding.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/embedding.py deleted file mode 100644 index 611a927864d93c3ad8357f66c780bf537b2a4d67..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/embedding.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Positonal Encoding Module.""" - -import math -from typing import Tuple, Union - -import torch -import torch.nn.functional as F - -class PositionalEncoding(torch.nn.Module): - """Positional encoding. 
- - :param int d_model: embedding dim - :param float dropout_rate: dropout rate - :param int max_len: maximum input length - - PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) - PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) - """ - def __init__(self, - d_model: int, - dropout_rate: float, - max_len: int = 5000, - reverse: bool = False): - """Construct an PositionalEncoding object.""" - super().__init__() - self.d_model = d_model - self.xscale = math.sqrt(self.d_model) - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - - self.pe = torch.zeros(self.max_len, self.d_model) - position = torch.arange(0, self.max_len, - dtype=torch.float32).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = torch.sin(position * div_term) - self.pe[:, 1::2] = torch.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input. Its shape is (batch, time, ...) - offset (int, torch.tensor): position offset - - Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) - torch.Tensor: for compatibility to RelPositionalEncoding - """ - - self.pe = self.pe.to(x.device) - pos_emb = self.position_encoding(offset, x.size(1), False) - x = x * self.xscale + pos_emb - return self.dropout(x), self.dropout(pos_emb) - - def position_encoding(self, offset: Union[int, torch.Tensor], size: int, - apply_dropout: bool = True) -> torch.Tensor: - """ For getting encoding in a streaming fashion - - Attention!!!!! - we apply dropout only once at the whole utterance level in a none - streaming way, but will call this function several times with - increasing input size in a streaming scenario, so the dropout will - be applied several times. - - Args: - offset (int or torch.tensor): start offset - size (int): required size of position encoding - - Returns: - torch.Tensor: Corresponding encoding - """ - # How to subscript a Union type: - # https://github.com/pytorch/pytorch/issues/69434 - if isinstance(offset, int): - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - else: # for batched streaming decoding on GPU - assert torch.max(offset) + size < self.max_len - index = offset.unsqueeze(1) + \ - torch.arange(0, size).to(offset.device) # B X T - flag = index > 0 - # remove negative offset - index = index * flag - pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model - - if apply_dropout: - pos_emb = self.dropout(pos_emb) - return pos_emb - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding module. - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - """ - def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): - """Initialize class.""" - super().__init__(d_model, dropout_rate, max_len, reverse=True) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Compute positional encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). 
- Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Positional embedding tensor (1, time, `*`). - """ - self.pe = self.pe.to(x.device) - x = x * self.xscale - pos_emb = self.position_encoding(offset, x.size(1), False) - return self.dropout(x), self.dropout(pos_emb) - - -class NoPositionalEncoding(torch.nn.Module): - """ No position encoding - """ - def __init__(self, d_model: int, dropout_rate: float): - super().__init__() - self.d_model = d_model - self.dropout = torch.nn.Dropout(p=dropout_rate) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """ Just return zero vector for interface compatibility - """ - pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) - return self.dropout(x), pos_emb - - def position_encoding( - self, offset: Union[int, torch.Tensor], size: int) -> torch.Tensor: - return torch.zeros(1, size, self.d_model) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/encoder.py deleted file mode 100644 index bb2ec65827548bd1242cb3b367cb3983c2de6119..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/encoder.py +++ /dev/null @@ -1,462 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder definition.""" -from typing import Tuple - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.convolution import ConvolutionModule -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.encoder_layer import TransformerEncoderLayer -from wenet.transformer.encoder_layer import ConformerEncoderLayer -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class BaseEncoder(torch.nn.Module): - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ - Args: - input_size (int): input dim - output_size (int): dimension of attention - attention_heads (int): the number of heads of multi head attention - linear_units (int): the hidden units number of position-wise feed - forward - num_blocks (int): the number of decoder blocks - dropout_rate (float): dropout rate - attention_dropout_rate (float): dropout rate in attention - positional_dropout_rate (float): dropout rate after adding - positional encoding - input_layer (str): input layer type. - optional [linear, conv2d, conv2d6, conv2d8] - pos_enc_layer_type (str): Encoder positional encoding layer type. - opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] - normalize_before (bool): - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after (bool): whether to concat attention layer's input - and output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - static_chunk_size (int): chunk size for static chunk training and - decoding - use_dynamic_chunk (bool): whether use dynamic chunk size for - training or not, You can only use fixed chunk(chunk_size > 0) - or dyanmic chunk size(use_dynamic_chunk = True) - global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module - use_dynamic_left_chunk (bool): whether use dynamic left chunk in - dynamic chunk training - """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks - - -class TransformerEncoder(BaseEncoder): - """Transformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ Construct TransformerEncoder - - See Encoder for the meaning of each parameter. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - self.encoders = torch.nn.ModuleList([ - TransformerEncoderLayer( - output_size, - MultiHeadedAttention(attention_heads, output_size, - attention_dropout_rate), - PositionwiseFeedForward(output_size, linear_units, - dropout_rate), dropout_rate, - normalize_before, concat_after) for _ in range(num_blocks) - ]) - - -class ConformerEncoder(BaseEncoder): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - positionwise_conv_kernel_size: int = 1, - macaron_style: bool = True, - selfattention_layer_type: str = "rel_selfattn", - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - ): - """Construct ConformerEncoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - positionwise_conv_kernel_size (int): Kernel size of positionwise - conv1d layer. - macaron_style (bool): Whether to use macaron style for - positionwise layer. - selfattention_layer_type (str): Encoder attention layer type, - the parameter has no effect now, it's just for configure - compatibility. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, - cnn_module_norm, causal) - - self.encoders = torch.nn.ModuleList([ - ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(num_blocks) - ]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/encoder_layer.py deleted file mode 100644 index 6b4629a6802a90422fa1494f82f46488f2553c16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/encoder_layer.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple - -import torch -from torch import nn - - -class TransformerEncoderLayer(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: torch.nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): just for interface compatibility - to ConformerEncoderLayer - mask_pad (torch.Tensor): does not used in transformer layer, - just for unified api with conformer. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2), not used here, it's for interface - compatibility to ConformerEncoderLayer. - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). - - """ - residual = x - if self.normalize_before: - x = self.norm1(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, cache=att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - return x, mask, new_att_cache, fake_cnn_cache - - -class ConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/label_smoothing_loss.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/label_smoothing_loss.py deleted file mode 100644 index 428fedcb0eb4345cd1361c97008a9afcd94ac171..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/label_smoothing_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Label smoothing module.""" - -import torch -from torch import nn - - -class LabelSmoothingLoss(nn.Module): - """Label-smoothing loss. - - In a standard CE loss, the label's data distribution is: - [0,1,2] -> - [ - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0], - ] - - In the smoothing version CE Loss,some probabilities - are taken from the true label prob (1.0) and are divided - among other labels. - - e.g. 
- smoothing=0.1 - [0,1,2] -> - [ - [0.9, 0.05, 0.05], - [0.05, 0.9, 0.05], - [0.05, 0.05, 0.9], - ] - - Args: - size (int): the number of class - padding_idx (int): padding class id which will be ignored for loss - smoothing (float): smoothing rate (0.0 means the conventional CE) - normalize_length (bool): - normalize loss by sequence length if True - normalize loss by batch size if False - """ - def __init__(self, - size: int, - padding_idx: int, - smoothing: float, - normalize_length: bool = False): - """Construct an LabelSmoothingLoss object.""" - super(LabelSmoothingLoss, self).__init__() - self.criterion = nn.KLDivLoss(reduction="none") - self.padding_idx = padding_idx - self.confidence = 1.0 - smoothing - self.smoothing = smoothing - self.size = size - self.normalize_length = normalize_length - - def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """Compute loss between x and target. - - The model outputs and data labels tensors are flatten to - (batch*seqlen, class) shape and a mask is applied to the - padding part which should not be calculated for loss. - - Args: - x (torch.Tensor): prediction (batch, seqlen, class) - target (torch.Tensor): - target signal masked with self.padding_id (batch, seqlen) - Returns: - loss (torch.Tensor) : The KL loss, scalar float value - """ - assert x.size(2) == self.size - batch_size = x.size(0) - x = x.view(-1, self.size) - target = target.view(-1) - # use zeros_like instead of torch.no_grad() for true_dist, - # since no_grad() can not be exported by JIT - true_dist = torch.zeros_like(x) - true_dist.fill_(self.smoothing / (self.size - 1)) - ignore = target == self.padding_idx # (B,) - total = len(target) - ignore.sum().item() - target = target.masked_fill(ignore, 0) # avoid -1 index - true_dist.scatter_(1, target.unsqueeze(1), self.confidence) - kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) - denom = total if self.normalize_length else batch_size - return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/positionwise_feed_forward.py deleted file mode 100644 index 73ba239e3f1e68f65650961f2c4ee6758729a06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/positionwise_feed_forward.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. 
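For reference, the smoothed target distribution that LabelSmoothingLoss above builds can be reproduced in a few lines (illustrative only, not part of this patch; padding handling omitted):

import torch

size, smoothing = 3, 0.1
target = torch.tensor([0, 1, 2])
true_dist = torch.full((3, size), smoothing / (size - 1))
true_dist.scatter_(1, target.unsqueeze(1), 1.0 - smoothing)
print(true_dist)
# tensor([[0.9000, 0.0500, 0.0500],
#         [0.0500, 0.9000, 0.0500],
#         [0.0500, 0.0500, 0.9000]])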
- dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU()): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/subsampling.py deleted file mode 100644 index 5f2823eedf0e623188d6af6680fa50ca44b47877..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/subsampling.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch - - -class BaseSubsampling(torch.nn.Module): - def __init__(self): - super().__init__() - self.right_context = 0 - self.subsampling_rate = 1 - - def position_encoding(self, offset: Union[int, torch.Tensor], - size: int) -> torch.Tensor: - return self.pos_enc.position_encoding(offset, size) - - -class LinearNoSubsampling(BaseSubsampling): - """Linear transform the input without subsampling - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an linear object.""" - super().__init__() - self.out = torch.nn.Sequential( - torch.nn.Linear(idim, odim), - torch.nn.LayerNorm(odim, eps=1e-5), - torch.nn.Dropout(dropout_rate), - ) - self.pos_enc = pos_enc_class - self.right_context = 0 - self.subsampling_rate = 1 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Input x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: linear input tensor (#batch, time', odim), - where time' = time . - torch.Tensor: linear input mask (#batch, 1, time'), - where time' = time . - - """ - x = self.out(x) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask - - -class Conv2dSubsampling4(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). 
- - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 4. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 4. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] - - -class Conv2dSubsampling6(BaseSubsampling): - """Convolutional 2D subsampling (to 1/6 length). - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (torch.nn.Module): Custom position encoding layer. - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling6 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 5, 3), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), - odim) - self.pos_enc = pos_enc_class - # 10 = (3 - 1) * 1 + (5 - 1) * 2 - self.subsampling_rate = 6 - self.right_context = 10 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 6. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 6. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] - - -class Conv2dSubsampling8(BaseSubsampling): - """Convolutional 2D subsampling (to 1/8 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling8 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear( - odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) - self.pos_enc = pos_enc_class - self.subsampling_rate = 8 - # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 - self.right_context = 14 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 8. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 8. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/swish.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/swish.py deleted file mode 100644 index b4250f5c93104f38958d145572e363256e03fcb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/transformer/swish.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) -# 2020 Northwestern Polytechnical University (Pengcheng Guo) -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Swish() activation function for Conformer.""" - -import torch - - -class Swish(torch.nn.Module): - """Construct an Swish object.""" - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Return Swish activation function.""" - return x * torch.sigmoid(x) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/checkpoint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/checkpoint.py deleted file mode 100644 index 8e0c413c79c34cd667240357d7ef9eab816a885c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/checkpoint.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
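Swish(x) = x * sigmoid(x), as defined in swish.py above, coincides with PyTorch's built-in SiLU; a quick check (illustrative only, not part of this patch):

import torch

x = torch.linspace(-3.0, 3.0, steps=7)
swish = x * torch.sigmoid(x)                        # Swish as defined above
assert torch.allclose(swish, torch.nn.functional.silu(x))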
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import re - -import yaml -import torch -from collections import OrderedDict - -import datetime - - -def load_checkpoint(model: torch.nn.Module, path: str) -> dict: - if torch.cuda.is_available(): - logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) - checkpoint = torch.load(path) - else: - logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) - checkpoint = torch.load(path, map_location='cpu') - model.load_state_dict(checkpoint, strict=False) - info_path = re.sub('.pt$', '.yaml', path) - configs = {} - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - return configs - - -def save_checkpoint(model: torch.nn.Module, path: str, infos=None): - ''' - Args: - infos (dict or None): any info you want to save. - ''' - logging.info('Checkpoint: save to checkpoint %s' % path) - if isinstance(model, torch.nn.DataParallel): - state_dict = model.module.state_dict() - elif isinstance(model, torch.nn.parallel.DistributedDataParallel): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save(state_dict, path) - info_path = re.sub('.pt$', '.yaml', path) - if infos is None: - infos = {} - infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') - with open(info_path, 'w') as fout: - data = yaml.dump(infos) - fout.write(data) - - -def filter_modules(model_state_dict, modules): - new_mods = [] - incorrect_mods = [] - mods_model = model_state_dict.keys() - for mod in modules: - if any(key.startswith(mod) for key in mods_model): - new_mods += [mod] - else: - incorrect_mods += [mod] - if incorrect_mods: - logging.warning( - "module(s) %s don't match or (partially match) " - "available modules in model.", - incorrect_mods, - ) - logging.warning("for information, the existing modules in model are:") - logging.warning("%s", mods_model) - - return new_mods - - -def load_trained_modules(model: torch.nn.Module, args: None): - # Load encoder modules with pre-trained model(s). 
- enc_model_path = args.enc_init - enc_modules = args.enc_init_mods - main_state_dict = model.state_dict() - logging.warning("model(s) found for pre-initialization") - if os.path.isfile(enc_model_path): - logging.info('Checkpoint: loading from checkpoint %s for CPU' % - enc_model_path) - model_state_dict = torch.load(enc_model_path, map_location='cpu') - modules = filter_modules(model_state_dict, enc_modules) - partial_state_dict = OrderedDict() - for key, value in model_state_dict.items(): - if any(key.startswith(m) for m in modules): - partial_state_dict[key] = value - main_state_dict.update(partial_state_dict) - else: - logging.warning("model was not found : %s", enc_model_path) - - model.load_state_dict(main_state_dict) - configs = {} - return configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/cmvn.py deleted file mode 100644 index 3101c619f54991c947124f393f3459c317356a2f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/cmvn.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import math - -import numpy as np - - -def _load_json_cmvn(json_cmvn_file): - """ Load the json format cmvn stats file and calculate cmvn - - Args: - json_cmvn_file: cmvn stats file in json format - - Returns: - a numpy array of [means, vars] - """ - with open(json_cmvn_file) as f: - cmvn_stats = json.load(f) - - means = cmvn_stats['mean_stat'] - variance = cmvn_stats['var_stat'] - count = cmvn_stats['frame_num'] - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def _load_kaldi_cmvn(kaldi_cmvn_file): - """ Load the kaldi format cmvn stats file and calculate cmvn - - Args: - kaldi_cmvn_file: kaldi text style global cmvn file, which - is generated by: - compute-cmvn-stats --binary=false scp:feats.scp global_cmvn - - Returns: - a numpy array of [means, vars] - """ - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def load_cmvn(cmvn_file, is_json): - if is_json: - cmvn = _load_json_cmvn(cmvn_file) - else: - cmvn = _load_kaldi_cmvn(cmvn_file) - return cmvn[0], cmvn[1] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/common.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/common.py deleted file mode 100644 index 74238d59aefbf227fe6b811703af17550bc7f8f0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/common.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -"""Unility functions for Transformer.""" - -import math -from typing import List, Tuple - -import torch -from torch.nn.utils.rnn import pad_sequence - -IGNORE_ID = -1 - - -def pad_list(xs: List[torch.Tensor], pad_value: int): - """Perform padding for the list of tensors. - - Args: - xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 
- pad_value (float): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tmax, `*`). - - Examples: - >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) - - """ - n_batch = len(xs) - max_len = max([x.size(0) for x in xs]) - pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) - pad = pad.fill_(pad_value) - for i in range(n_batch): - pad[i, :xs[i].size(0)] = xs[i] - - return pad - - -def add_blank(ys_pad: torch.Tensor, blank: int, - ignore_id: int) -> torch.Tensor: - """ Prepad blank for transducer predictor - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - blank (int): index of - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> blank = 0 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in = add_blank(ys_pad, 0, -1) - >>> ys_in - tensor([[0, 1, 2, 3, 4, 5], - [0, 4, 5, 6, 0, 0], - [0, 7, 8, 9, 0, 0]]) - """ - bs = ys_pad.size(0) - _blank = torch.tensor([blank], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _blank = _blank.repeat(bs).unsqueeze(1) # [bs,1] - out = torch.cat([_blank, ys_pad], dim=1) # [bs, Lmax+1] - return torch.where(out == ignore_id, blank, out) - - -def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, - ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Add and labels. - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - sos (int): index of - eos (int): index of - ignore_id (int): index of padding - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - ys_out (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> sos_id = 10 - >>> eos_id = 11 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) - >>> ys_in - tensor([[10, 1, 2, 3, 4, 5], - [10, 4, 5, 6, 11, 11], - [10, 7, 8, 9, 11, 11]]) - >>> ys_out - tensor([[ 1, 2, 3, 4, 5, 11], - [ 4, 5, 6, 11, -1, -1], - [ 7, 8, 9, 11, -1, -1]]) - """ - _sos = torch.tensor([sos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _eos = torch.tensor([eos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys - ys_in = [torch.cat([_sos, y], dim=0) for y in ys] - ys_out = [torch.cat([y, _eos], dim=0) for y in ys] - return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) - - -def reverse_pad_list(ys_pad: torch.Tensor, - ys_lens: torch.Tensor, - pad_value: float = -1.0) -> torch.Tensor: - """Reverse padding for the list of tensors. - - Args: - ys_pad (tensor): The padded tensor (B, Tokenmax). - ys_lens (tensor): The lens of token seqs (B) - pad_value (int): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tokenmax). - - Examples: - >>> x - tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) - >>> pad_list(x, 0) - tensor([[4, 3, 2, 1], - [7, 6, 5, 0], - [9, 8, 0, 0]]) - - """ - r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) - for y, i in zip(ys_pad, ys_lens)], True, - pad_value) - return r_ys_pad - - -def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, - ignore_label: int) -> float: - """Calculate accuracy. - - Args: - pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 
- pad_targets (LongTensor): Target label tensors (B, Lmax). - ignore_label (int): Ignore label id. - - Returns: - float: Accuracy value (0.0 - 1.0). - - """ - pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), - pad_outputs.size(1)).argmax(2) - mask = pad_targets != ignore_label - numerator = torch.sum( - pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - denominator = torch.sum(mask) - return float(numerator) / float(denominator) - - -def get_rnn(rnn_type: str) -> torch.nn.Module: - assert rnn_type in ["rnn", "lstm", "gru"] - if rnn_type == "rnn": - return torch.nn.RNN - elif rnn_type == "lstm": - return torch.nn.LSTM - else: - return torch.nn.GRU - - -def get_activation(act): - """Return activation function.""" - # Lazy load to avoid unused import - from wenet.transformer.swish import Swish - - activation_funcs = { - "hardtanh": torch.nn.Hardtanh, - "tanh": torch.nn.Tanh, - "relu": torch.nn.ReLU, - "selu": torch.nn.SELU, - "swish": getattr(torch.nn, "SiLU", Swish), - "gelu": torch.nn.GELU - } - - return activation_funcs[act]() - - -def get_subsample(config): - input_layer = config["encoder_conf"]["input_layer"] - assert input_layer in ["conv2d", "conv2d6", "conv2d8"] - if input_layer == "conv2d": - return 4 - elif input_layer == "conv2d6": - return 6 - elif input_layer == "conv2d8": - return 8 - - -def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - if hyp[cur] != 0: - new_hyp.append(hyp[cur]) - prev = cur - while cur < len(hyp) and hyp[cur] == hyp[prev]: - cur += 1 - return new_hyp - - -def replace_duplicates_with_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - new_hyp.append(hyp[cur]) - prev = cur - cur += 1 - while cur < len(hyp) and hyp[cur] == hyp[prev] and hyp[cur] != 0: - new_hyp.append(0) - cur += 1 - return new_hyp - - -def log_add(args: List[int]) -> float: - """ - Stable log add - """ - if all(a == -float('inf') for a in args): - return -float('inf') - a_max = max(args) - lsp = math.log(sum(math.exp(a - a_max) for a in args)) - return a_max + lsp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/config.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/config.py deleted file mode 100644 index 50170ced44534d3ee6532a2f87fcd78c5148f7e7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 Shaoshang Qi -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
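For intuition about remove_duplicates_and_blank above (greedy CTC decoding): consecutive repeats are collapsed and blanks (id 0) dropped. A standalone collapse with the same effect, illustrative only and not part of this patch:

from typing import List

def ctc_collapse(path: List[int], blank: int = 0) -> List[int]:
    out, prev = [], None
    for tok in path:
        if tok != blank and tok != prev:   # keep first token of each non-blank run
            out.append(tok)
        prev = tok
    return out

print(ctc_collapse([0, 3, 3, 0, 0, 5, 5, 5, 0, 2]))  # [3, 5, 2]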
- - -import copy - -def override_config(configs, override_list): - new_configs = copy.deepcopy(configs) - for item in override_list: - arr = item.split() - if len(arr) != 2: - print(f"the overrive {item} format not correct, skip it") - continue - keys = arr[0].split('.') - s_configs = new_configs - for i, key in enumerate(keys): - if key not in s_configs: - print(f"the overrive {item} format not correct, skip it") - if i == len(keys) - 1: - param_type = type(s_configs[key]) - if param_type != bool: - s_configs[key] = param_type(arr[1]) - else: - s_configs[key] = arr[1] in ['true', 'True'] - print(f"override {arr[0]} with {arr[1]}") - else: - s_configs = s_configs[key] - return new_configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/ctc_util.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/ctc_util.py deleted file mode 100644 index 73b8fb272ac153dd6d05207f352ebcf1ad14890d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/ctc_util.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch - -def insert_blank(label, blank_id=0): - """Insert blank token between every two label token.""" - label = np.expand_dims(label, 1) - blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id - label = np.concatenate([blanks, label], axis=1) - label = label.reshape(-1) - label = np.append(label, label[0]) - return label - -def forced_align(ctc_probs: torch.Tensor, - y: torch.Tensor, - blank_id=0) -> list: - """ctc forced alignment. 
- - Args: - torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) - torch.Tensor y: id sequence tensor 1d tensor (L) - int blank_id: blank symbol index - Returns: - torch.Tensor: alignment result - """ - y_insert_blank = insert_blank(y, blank_id) - - log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) - log_alpha = log_alpha - float('inf') # log of zero - state_path = (torch.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 - ) # state path - - # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] - - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): - if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ - s] == y_insert_blank[s - 2]: - candidates = torch.tensor( - [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) - prev_state = [s, s - 1] - else: - candidates = torch.tensor([ - log_alpha[t - 1, s], - log_alpha[t - 1, s - 1], - log_alpha[t - 1, s - 2], - ]) - prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] - state_path[t, s] = prev_state[torch.argmax(candidates)] - - state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) - - candidates = torch.tensor([ - log_alpha[-1, len(y_insert_blank) - 1], - log_alpha[-1, len(y_insert_blank) - 2] - ]) - prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] - state_seq[-1] = prev_state[torch.argmax(candidates)] - for t in range(ctc_probs.size(0) - 2, -1, -1): - state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] - - output_alignment = [] - for t in range(0, ctc_probs.size(0)): - output_alignment.append(y_insert_blank[state_seq[t, 0]]) - - return output_alignment diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/executor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/executor.py deleted file mode 100644 index dc0b69e6e32055566a0e8c41945f6979276e5672..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/executor.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
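insert_blank above interleaves the blank id with the label sequence, which is exactly the lattice layout that forced_align walks over. A quick check of the expected output (illustrative only, not part of this patch):

import numpy as np

def insert_blank(label, blank_id=0):
    # same interleaving as ctc_util.insert_blank above
    label = np.expand_dims(label, 1)
    blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id
    label = np.concatenate([blanks, label], axis=1).reshape(-1)
    return np.append(label, label[0])    # trailing blank (label[0] is blank_id here)

print(insert_blank(np.array([1, 2, 3])))  # [0 1 0 2 0 3 0]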
- -import logging -from contextlib import nullcontext - -# if your python version < 3.7 use the below one -# from contextlib import suppress as nullcontext -import torch -from torch.nn.utils import clip_grad_norm_ - - -class Executor: - - def __init__(self): - self.step = 0 - - def train(self, model, optimizer, scheduler, data_loader, device, writer, - args, scaler): - ''' Train one epoch - ''' - model.train() - clip = args.get('grad_clip', 50.0) - log_interval = args.get('log_interval', 10) - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - accum_grad = args.get('accum_grad', 1) - is_distributed = args.get('is_distributed', True) - use_amp = args.get('use_amp', False) - logging.info('using accumulate grad, new batch size is {} times' - ' larger than before'.format(accum_grad)) - if use_amp: - assert scaler is not None - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - model_context = model.join - else: - model_context = nullcontext - num_seen_utts = 0 - with model_context(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - context = None - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if is_distributed and batch_idx % accum_grad != 0: - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - with context(): - # autocast context - # The more details about amp can be found in - # https://pytorch.org/docs/stable/notes/amp_examples.html - with torch.cuda.amp.autocast(scaler is not None): - loss_dict = model(feats, feats_lengths, target, - target_lengths) - loss = loss_dict['loss'] / accum_grad - if use_amp: - scaler.scale(loss).backward() - else: - loss.backward() - - num_seen_utts += num_utts - if batch_idx % accum_grad == 0: - if rank == 0 and writer is not None: - writer.add_scalar('train_loss', loss, self.step) - # Use mixed precision training - if use_amp: - scaler.unscale_(optimizer) - grad_norm = clip_grad_norm_(model.parameters(), clip) - # Must invoke scaler.update() if unscale_() is used in - # the iteration to avoid the following error: - # RuntimeError: unscale_() has already been called - # on this optimizer since the last update(). - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). 
- scaler.step(optimizer) - scaler.update() - else: - grad_norm = clip_grad_norm_(model.parameters(), clip) - if torch.isfinite(grad_norm): - optimizer.step() - optimizer.zero_grad() - scheduler.step() - self.step += 1 - if batch_idx % log_interval == 0: - lr = optimizer.param_groups[0]['lr'] - log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, - loss.item() * accum_grad) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'lr {:.8f} rank {}'.format(lr, rank) - logging.debug(log_str) - - def cv(self, model, data_loader, device, args): - ''' Cross validation on - ''' - model.eval() - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - log_interval = args.get('log_interval', 10) - # in order to avoid division by 0 - num_seen_utts = 1 - total_loss = 0.0 - with torch.no_grad(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - loss_dict = model(feats, feats_lengths, target, target_lengths) - loss = loss_dict['loss'] - if torch.isfinite(loss): - num_seen_utts += num_utts - total_loss += loss.item() * num_utts - if batch_idx % log_interval == 0: - log_str = 'CV Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, loss.item()) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'history loss {:.6f}'.format(total_loss / - num_seen_utts) - log_str += ' rank {}'.format(rank) - logging.debug(log_str) - return total_loss, num_seen_utts diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/file_utils.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/file_utils.py deleted file mode 100644 index 7b7e516cc61f759267f4ef09309ff0b45110a0c1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/file_utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re - - -def read_lists(list_file): - lists = [] - with open(list_file, 'r', encoding='utf8') as fin: - for line in fin: - lists.append(line.strip()) - return lists - - -def read_non_lang_symbols(non_lang_sym_path): - """read non-linguistic symbol from file. - - The file format is like below: - - {NOISE}\n - {BRK}\n - ... - - - Args: - non_lang_sym_path: non-linguistic symbol file path, None means no any - syms. 
- - """ - if non_lang_sym_path is None: - return None - else: - syms = read_lists(non_lang_sym_path) - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - for sym in syms: - if non_lang_syms_pattern.fullmatch(sym) is None: - class BadSymbolFormat(Exception): - pass - raise BadSymbolFormat( - "Non-linguistic symbols should be " - "formatted in {xxx}//[xxx], consider" - " modify '%s' to meet the requirment. " - "More details can be found in discussions here : " - "https://github.com/wenet-e2e/wenet/pull/819" % (sym)) - return syms - - -def read_symbol_table(symbol_table_file): - symbol_table = {} - with open(symbol_table_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - symbol_table[arr[0]] = int(arr[1]) - return symbol_table diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/init_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/init_model.py deleted file mode 100644 index 4a008183ee25cd88b2fa25d93bdc3f9e3a55d31a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/init_model.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
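The pattern used by read_non_lang_symbols above accepts symbols wrapped in {}, [] or <> and rejects anything else. A quick check (illustrative only, not part of this patch):

import re

non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})")
for sym in ["{NOISE}", "[LAUGH]", "<unk>", "NOISE"]:
    print(sym, bool(non_lang_syms_pattern.fullmatch(sym)))
# {NOISE} True / [LAUGH] True / <unk> True / NOISE False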
- -import torch -from wenet.transducer.joint import TransducerJoint -from wenet.transducer.predictor import (ConvPredictor, EmbeddingPredictor, - RNNPredictor) -from wenet.transducer.transducer import Transducer -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.cmvn import GlobalCMVN -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.encoder import ConformerEncoder, TransformerEncoder -from wenet.squeezeformer.encoder import SqueezeformerEncoder -from wenet.efficient_conformer.encoder import EfficientConformerEncoder -from wenet.utils.cmvn import load_cmvn - - -def init_model(configs): - if configs['cmvn_file'] is not None: - mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) - global_cmvn = GlobalCMVN( - torch.from_numpy(mean).float(), - torch.from_numpy(istd).float()) - else: - global_cmvn = None - - input_dim = configs['input_dim'] - vocab_size = configs['output_dim'] - - encoder_type = configs.get('encoder', 'conformer') - decoder_type = configs.get('decoder', 'bitransformer') - - if encoder_type == 'conformer': - encoder = ConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'squeezeformer': - encoder = SqueezeformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'efficientConformer': - encoder = EfficientConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf'], - **configs['encoder_conf']['efficient_conf'] - if 'efficient_conf' in - configs['encoder_conf'] else {}) - else: - encoder = TransformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - if decoder_type == 'transformer': - decoder = TransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - else: - assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 - assert configs['decoder_conf']['r_num_blocks'] > 0 - decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - ctc = CTC(vocab_size, encoder.output_size()) - - # Init joint CTC/Attention or Transducer model - if 'predictor' in configs: - predictor_type = configs.get('predictor', 'rnn') - if predictor_type == 'rnn': - predictor = RNNPredictor(vocab_size, **configs['predictor_conf']) - elif predictor_type == 'embedding': - predictor = EmbeddingPredictor(vocab_size, - **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - elif predictor_type == 'conv': - predictor = ConvPredictor(vocab_size, **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - else: - raise NotImplementedError( - "only rnn, embedding and conv type support now") - configs['joint_conf']['enc_output_size'] = configs['encoder_conf'][ - 'output_size'] - configs['joint_conf']['pred_output_size'] = configs['predictor_conf'][ - 'output_size'] - joint = TransducerJoint(vocab_size, **configs['joint_conf']) - model = Transducer(vocab_size=vocab_size, - blank=0, - predictor=predictor, - encoder=encoder, - attention_decoder=decoder, - joint=joint, - ctc=ctc, - **configs['model_conf']) - else: - model = ASRModel(vocab_size=vocab_size, - encoder=encoder, - decoder=decoder, - ctc=ctc, - **configs['model_conf']) - return model diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/mask.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/mask.py deleted file mode 100644 index 2985006ab2bc2d27a9b8adaeb863cc44ca6a0d24..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/mask.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -''' -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. - - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - ret = torch.ones(size, size, device=device, dtype=torch.bool) - return torch.tril(ret) -''' - -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. 
- - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - arange = torch.arange(size, device=device) - mask = arange.expand(size, size) - arange = arange.unsqueeze(-1) - mask = mask <= arange - return mask - - -def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size) with chunk size, - this is for streaming encoder - - Args: - size (int): size of mask - chunk_size (int): size of chunk - num_left_chunks (int): number of left chunks - <0: use full chunk - >=0: use num_left_chunks - device (torch.device): "cpu" or "cuda" or torch.Tensor.device - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_chunk_mask(4, 2) - [[1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]] - """ - ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) - ret[i, start:ending] = True - return ret - - -def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, - use_dynamic_chunk: bool, - use_dynamic_left_chunk: bool, - decoding_chunk_size: int, static_chunk_size: int, - num_decoding_left_chunks: int): - """ Apply optional mask for encoder. - - Args: - xs (torch.Tensor): padded input, (B, L, D), L for max length - mask (torch.Tensor): mask for xs, (B, 1, L) - use_dynamic_chunk (bool): whether to use dynamic chunk or not - use_dynamic_left_chunk (bool): whether to use dynamic left chunk for - training. - decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - static_chunk_size (int): chunk size for static chunk training/decoding - if it's greater than 0, if use_dynamic_chunk is true, - this parameter will be ignored - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. - >=0: use num_decoding_left_chunks - <0: use all left chunks - - Returns: - torch.Tensor: chunk mask of the input xs. - """ - # Whether to use chunk mask or not - if use_dynamic_chunk: - max_len = xs.size(1) - if decoding_chunk_size < 0: - chunk_size = max_len - num_left_chunks = -1 - elif decoding_chunk_size > 0: - chunk_size = decoding_chunk_size - num_left_chunks = num_decoding_left_chunks - else: - # chunk size is either [1, 25] or full context(max_len). - # Since we use 4 times subsampling and allow up to 1s(100 frames) - # delay, the maximum frame is 100 / 4 = 25. 
- chunk_size = torch.randint(1, max_len, (1, )).item() - num_left_chunks = -1 - if chunk_size > max_len // 2: - chunk_size = max_len - else: - chunk_size = chunk_size % 25 + 1 - if use_dynamic_left_chunk: - max_left_chunks = (max_len - 1) // chunk_size - num_left_chunks = torch.randint(0, max_left_chunks, - (1, )).item() - chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - elif static_chunk_size > 0: - num_left_chunks = num_decoding_left_chunks - chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - else: - chunk_masks = masks - return chunk_masks - - -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: Mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - batch_size = lengths.size(0) - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = torch.arange(0, - max_len, - dtype=torch.int64, - device=lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand - return mask - - -def make_non_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """Make mask tensor containing indices of non-padded part. - - The sequences in a batch may have different lengths. To enable - batch computing, padding is need to make all sequence in same - size. To avoid the padding part pass value to context dependent - block such as attention or convolution , this padding part is - masked. - - This pad_mask is used in both encoder and decoder. - - 1 for non-padded part and 0 for padded part. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] - """ - return ~make_pad_mask(lengths) - - -def mask_finished_scores(score: torch.Tensor, - flag: torch.Tensor) -> torch.Tensor: - """ - If a sequence is finished, we only allow one alive branch. This function - aims to give one branch a zero score and the rest -inf score. - - Args: - score (torch.Tensor): A real value array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size, beam_size). 
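The arange/comparison form of subsequent_mask above produces exactly the lower-triangular mask of the commented-out torch.tril variant; the rewrite presumably just keeps the graph ONNX-export friendly. A quick equivalence check, illustrative only and not part of this patch:

import torch

size = 4
arange = torch.arange(size)
mask_cmp = arange.expand(size, size) <= arange.unsqueeze(-1)      # comparison-based
mask_tril = torch.tril(torch.ones(size, size, dtype=torch.bool))  # tril-based
assert torch.equal(mask_cmp, mask_tril)
print(mask_cmp.int())   # lower-triangular 0/1 mask, diagonal included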
- """ - beam_size = score.size(-1) - zero_mask = torch.zeros_like(flag, dtype=torch.bool) - if beam_size > 1: - unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])), - dim=1) - finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])), - dim=1) - else: - unfinished = zero_mask - finished = flag - score.masked_fill_(unfinished, -float('inf')) - score.masked_fill_(finished, 0) - return score - - -def mask_finished_preds(pred: torch.Tensor, flag: torch.Tensor, - eos: int) -> torch.Tensor: - """ - If a sequence is finished, all of its branch should be - - Args: - pred (torch.Tensor): A int array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size). - """ - beam_size = pred.size(-1) - finished = flag.repeat([1, beam_size]) - return pred.masked_fill_(finished, eos) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/scheduler.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/scheduler.py deleted file mode 100644 index c418a731dec0041a238787bbba23102dba8db5e5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/s0/wenet/utils/scheduler.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -# NeMo(https://github.com/NVIDIA/NeMo) - -from typing import Union - -import math -import warnings -import torch -from torch.optim.lr_scheduler import _LRScheduler - -from typeguard import check_argument_types - - -class WarmupLR(_LRScheduler): - """The WarmupLR scheduler - - This scheduler is almost same as NoamLR Scheduler except for following - difference: - - NoamLR: - lr = optimizer.lr * model_size ** -0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - WarmupLR: - lr = optimizer.lr * warmup_step ** 0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - - Note that the maximum lr equals to optimizer.lr in this scheduler. 
- - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_steps: Union[int, float] = 25000, - last_epoch: int = -1, - ): - assert check_argument_types() - self.warmup_steps = warmup_steps - - # __init__() must be invoked before setting field - # because step() is also invoked in __init__() - super().__init__(optimizer, last_epoch) - - def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" - - def get_lr(self): - step_num = self.last_epoch + 1 - if self.warmup_steps == 0: - return [ - lr * step_num ** -0.5 - for lr in self.base_lrs - ] - else: - return [ - lr - * self.warmup_steps ** 0.5 - * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) - for lr in self.base_lrs - ] - - def set_step(self, step: int): - self.last_epoch = step - - -class WarmupPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1): - assert not (warmup_steps is not None and warmup_ratio is not None),\ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class SquareRootConstantPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use particular number of step or ratio" - assert constant_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. 
- self.max_steps = max_steps - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.constant_lr = 1 / (constant_steps ** 0.5) - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.constant_steps: - return [self.constant_lr for _ in self.base_lrs] - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class WarmupHoldPolicy(WarmupPolicy): - """Variant of WarmupPolicy which maintains high - learning rate for a defined number of steps. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - hold_steps=None, - hold_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (hold_steps is not None and hold_ratio is not None), \ - "Either use particular number of step or ratio" - assert hold_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - self.min_lr = min_lr - self._last_warmup_lr = 0.0 - - # Necessary to duplicate as class attributes are hidden in inner class - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if hold_steps is not None: - self.hold_steps = hold_steps + self.warmup_steps - elif hold_ratio is not None: - self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps - else: - self.hold_steps = 0 - - super().__init__( - optimizer, - warmup_steps=warmup_steps, - warmup_ratio=warmup_ratio, - max_steps=max_steps, - last_epoch=last_epoch, - min_lr=min_lr, - ) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed by the scheduler," - " " "please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup phase - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - # Hold phase - if (step >= self.warmup_steps) and (step < self.hold_steps): - return self.base_lrs - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - -class WarmupAnnealHoldPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - min_lr: Minimum lr to hold the learning rate after decay at. - constant_steps: Number of steps to keep lr constant at. 
- constant_ratio: Ratio of steps to keep lr constant. - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - constant_steps=None, - constant_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use constant_steps or constant_ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup steps - if self.warmup_steps > 0 and step <= self.warmup_steps: - return self._get_warmup_lr(step) - - # Constant steps after warmup and decay - if self.constant_steps > 0 and ( - self.warmup_steps + self.decay_steps) < step <= self.max_steps: - return self._get_constant_lr(step) - - # Min lr after max steps of updates - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_constant_lr(self, step): - return [self.min_lr for _ in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -def _squareroot_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 0.5 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _square_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 2 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _cosine_annealing(initial_lr, step, max_steps, min_lr): - mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) - out_lr = (initial_lr - min_lr) * mult + min_lr - return out_lr - - -def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, - decay_steps, min_lr): - assert max_lr > min_lr - # Use linear warmup for the initial part. - if warmup_steps > 0 and step <= warmup_steps: - return max_lr * float(step) / float(warmup_steps) - - # For any steps larger than `decay_steps`, use `min_lr`. - if step > warmup_steps + decay_steps: - return min_lr - - # If we are done with the warmup period, use the decay style. 
- num_steps_ = step - warmup_steps - decay_steps_ = decay_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = max_lr - min_lr - - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - - return min_lr + coeff * delta_lr - - -def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): - if cycle: - multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) - decay_steps *= multiplier - else: - step = min(step, decay_steps) - p = step / decay_steps - lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) - lr += min_lr - return lr - - -def _noam_hold_annealing(initial_lr, step, warmup_steps, - hold_steps, decay_rate, min_lr): - # hold_steps = total number of steps - # to hold the LR, not the warmup + hold steps. - T_warmup_decay = max(1, warmup_steps ** decay_rate) - T_hold_decay = max(1, (step - hold_steps) ** decay_rate) - lr = (initial_lr * T_warmup_decay) / T_hold_decay - lr = max(lr, min_lr) - return lr - - -class SquareAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _square_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class SquareRootAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _squareroot_annealing(initial_lr=initial_lr, step=step, - max_steps=self.max_steps, min_lr=self.min_lr) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - if self.constant_steps is None or self.constant_steps == 0: - new_lrs = [ - _cosine_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - else: - new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) - return new_lrs - - def _get_warmup_lr(self, step): - if self.constant_steps is None or self.constant_steps == 0: - return super()._get_warmup_lr(step) - else: - # Use linear warmup for the initial part. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_constant_lr(self, step): - # Only called when `constant_steps` > 0. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_linear_warmup_with_cosine_annealing_lr(self, step): - # Cosine Schedule for Megatron LM, - # slightly different warmup schedule + constant LR at the end. 
- new_lrs = [ - _linear_warmup_with_cosine_annealing( - max_lr=self.base_lrs[0], - warmup_steps=self.warmup_steps, - step=step, - decay_steps=self.decay_steps, - min_lr=self.min_lr, - ) - for _ in self.base_lrs - ] - return new_lrs - - -class NoamAnnealing(_LRScheduler): - def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - self._normalize = d_model ** (-0.5) - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = max(1, self.last_epoch) - - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - new_lrs = [self._noam_annealing(initial_lr=initial_lr, step=step) for - initial_lr in self.base_lrs] - return new_lrs - - def _noam_annealing(self, initial_lr, step): - if self.warmup_steps > 0: - mult = self._normalize * min(step ** (-0.5), - step * (self.warmup_steps ** (-1.5))) - else: - mult = self._normalize * step ** (-0.5) - - out_lr = initial_lr * mult - if step > self.warmup_steps: - out_lr = max(out_lr, self.min_lr) - return out_lr - - -class NoamHoldAnnealing(WarmupHoldPolicy): - def __init__(self, optimizer, *, max_steps, decay_rate=0.5, min_lr=0.0, - last_epoch=-1, **kwargs): - """ - From Nemo: - Implementation of the Noam Hold Annealing policy - from the SqueezeFormer paper. - - Unlike NoamAnnealing, the peak learning rate - can be explicitly set for this scheduler. - The schedule first performs linear warmup, - then holds the peak LR, then decays with some schedule for - the remainder of the steps. - Therefore the min-lr is still dependent - on the hyper parameters selected. - - It's schedule is determined by three factors- - - Warmup Steps: Initial stage, where linear warmup - occurs uptil the peak LR is reached. Unlike NoamAnnealing, - the peak LR is explicitly stated here instead of a scaling factor. - - Hold Steps: Intermediate stage, where the peak LR - is maintained for some number of steps. In this region, - the high peak LR allows the model to converge faster - if training is stable. However the high LR - may also cause instability during training. - Should usually be a significant fraction of training - steps (around 30-40% of the entire training steps). - - Decay Steps: Final stage, where the LR rapidly decays - with some scaling rate (set by decay rate). - To attain Noam decay, use 0.5, - for Squeezeformer recommended decay, use 1.0. - The fast decay after prolonged high LR during - hold phase allows for rapid convergence. 
- - References: - - [Squeezeformer: - An Efficient Transformer for Automatic Speech Recognition] - (https://arxiv.org/abs/2206.00888) - - Args: - optimizer: Pytorch compatible Optimizer object. - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - decay_rate: Float value describing the polynomial decay - after the hold period. Default value - of 0.5 corresponds to Noam decay. - min_lr: Minimum learning rate. - """ - self.decay_rate = decay_rate - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - if self.warmup_steps is None or self.warmup_steps == 0: - raise ValueError( - "Noam scheduler cannot be used without warmup steps") - - if self.hold_steps > 0: - hold_steps = self.hold_steps - self.warmup_steps - else: - hold_steps = 0 - - new_lrs = [ - _noam_hold_annealing( - initial_lr, - step=step, - warmup_steps=self.warmup_steps, - hold_steps=hold_steps, - decay_rate=self.decay_rate, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - def set_step(self, step: int): - self.last_epoch = step diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/README.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/README.md deleted file mode 100644 index c74fbeadc6d1d6edd04147be7490257a2b3e5e9c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# Performance Record - -## U2++ Conformer Result - -* Feature info: using fbank feature, dither, cmvn, oneline speed perturb -* Training info: lr 0.001, dynamic batch with max_frames_in_batch 15000, 4 gpu, acc_grad 1, 130 epochs -* Training weight info: transducer_weight 0.75, ctc_weight 0.1, reverse_weight 0.30, average_num 30 -* Predictor type: lstm - -| decoding mode/chunk size | full | 16 | -|---------------------------|-------|-------| -| rnnt greedy search | 6.44 | 7.09 | - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/conf/conformer_rnnt.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/conf/conformer_rnnt.yaml deleted file mode 100644 index aeab0b180bc4904d32de6d01997e96c3f6ed9efd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/conf/conformer_rnnt.yaml +++ /dev/null @@ -1,100 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: true - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - - -joint_conf: - join_dim: 512 - prejoin_linear: True - postjoin_linear: false - joint_mode: 'add' - activation: 'tanh' - -predictor: rnn -predictor_conf: - embed_size: 
256 - output_size: 256 - embed_dropout: 0.1 - hidden_size: 256 - num_layers: 2 - bias: true - rnn_type: 'lstm' - dropout: 0.1 - -decoder: bitransformer -decoder_conf: - attention_heads: 4 - dropout_rate: 0.1 - linear_units: 2048 - num_blocks: 3 - positional_dropout_rate: 0.1 - r_num_blocks: 3 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -# hybrid transducer+ctc+attention -model_conf: - transducer_weight: 0.75 - ctc_weight: 0.1 - attention_weight: 0.15 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 10 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 8 - -grad_clip: 4 -accum_grad: 1 -max_epoch: 140 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/conf/conformer_u2pp_rnnt.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/conf/conformer_u2pp_rnnt.yaml deleted file mode 100644 index cfb4b18b6ed9ff1a19cecde760f5fa0d53b5a2c2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/conf/conformer_u2pp_rnnt.yaml +++ /dev/null @@ -1,103 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 8 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - - -joint_conf: - join_dim: 512 - prejoin_linear: True - postjoin_linear: false - joint_mode: 'add' - activation: 'tanh' - -predictor: rnn -predictor_conf: - embed_size: 256 - output_size: 256 - embed_dropout: 0.1 - hidden_size: 256 - num_layers: 2 - bias: true - rnn_type: 'lstm' - dropout: 0.1 - -decoder: bitransformer -decoder_conf: - attention_heads: 4 - dropout_rate: 0.1 - linear_units: 2048 - num_blocks: 3 - positional_dropout_rate: 0.1 - r_num_blocks: 3 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -# hybrid transducer+ctc+attention -model_conf: - transducer_weight: 0.75 - ctc_weight: 0.1 - attention_weight: 0.15 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 10 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - 
frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'dynamic' # static or dynamic - max_frames_in_batch: 15000 - -grad_clip: 4 -accum_grad: 1 -max_epoch: 130 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/local/prepare_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/local/prepare_data.sh deleted file mode 100644 index c1586b87856804bce4a23609f696417deb7d4e79..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/local/prepare_data.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG) -# 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU) -# Apache 2.0 - -# transform raw AISHELL-2 data to kaldi format - -if [ $# != 3 ]; then - echo "prepare_data.sh " - echo " e.g prepare_data.sh /data/AISHELL-2/iOS/train data/local/train data/train" - exit 1; -fi - -corpus=$1 -tmp=$2 -dir=$3 - -echo "prepare_data.sh: Preparing data in $corpus" - -mkdir -p $tmp -mkdir -p $dir - -# corpus check -if [ ! -d $corpus ] || [ ! -f $corpus/wav.scp ] || [ ! -f $corpus/trans.txt ]; then - echo "Error: $0 requires wav.scp and trans.txt under $corpus directory." - exit 1; -fi - -# validate utt-key list -awk '{print $1}' $corpus/wav.scp > $tmp/wav_utt.list -awk '{print $1}' $corpus/trans.txt > $tmp/trans_utt.list -tools/filter_scp.pl -f 1 $tmp/wav_utt.list $tmp/trans_utt.list > $tmp/utt.list - -# wav.scp -awk -F'\t' -v path_prefix=$corpus '{printf("%s\t%s/%s\n",$1,path_prefix,$2)}' $corpus/wav.scp > $tmp/tmp_wav.scp -tools/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_wav.scp | sort -k 1 | uniq > $tmp/wav.scp - -# text -tools/filter_scp.pl -f 1 $tmp/utt.list $corpus/trans.txt | sort -k 1 | uniq > $tmp/trans.txt -dos2unix < $tmp/trans.txt | \ - tools/filter_scp.pl -f 1 $tmp/utt.list - | \ - sort -k 1 | uniq | tr '[a-z]' '[A-Z]' | \ - sed 's/A/A/g' | sed 's/T/T/g' | sed 's/M/M/g' | sed 's/𫚉//g' | sed 's/𫖯/頫/g' | \ - sed 's/[()]//g' | sed "s/\([^A-Z]\)'/\1/g" > $tmp/text - -# copy prepared resources from tmp_dir to target dir -mkdir -p $dir -for f in wav.scp text; do - cp $tmp/$f $dir/$f || exit 1; -done - -echo "local/prepare_data.sh succeeded" -exit 0; - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/local/train_lms.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/local/train_lms.sh deleted file mode 100644 index 2e2d0dbeb79c54d707add6d20269fe9b89e69d8e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/local/train_lms.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - - -# To be run from one directory above this script. -. ./path.sh - -text=data/local/lm/text -lexicon=data/local/dict/lexicon.txt - -. tools/parse_options.sh - -for f in "$text" "$lexicon"; do - [ ! -f $x ] && echo "$0: No such file $f" && exit 1; -done - -# Check SRILM tools -if ! 
which ngram-count > /dev/null; then
-   echo "srilm tools are not found, please download it and install it from: "
-   echo "http://www.speech.sri.com/projects/srilm/download.html"
-   echo "Then add the tools to your PATH"
-   exit 1
-fi
-
-dir=data/local/lm
-mkdir -p $dir
-
-cleantext=$dir/text.no_oov
-
-cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
-  {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \
-  > $cleantext || exit 1;
-
-cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
-  sort -nr > $dir/word.counts || exit 1;
-
-# Get counts from acoustic training transcripts, and add one-count
-# for each word in the lexicon (but not silence, we don't want it
-# in the LM-- we'll add it optionally later).
-cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
-  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
-  sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
-
-cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
-
-heldout_sent=10000 # Don't change this if you want result to be comparable with
-    # kaldi_lm results
-mkdir -p $dir
-cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
-  head -$heldout_sent > $dir/heldout
-cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
-  tail -n +$heldout_sent > $dir/train
-
-ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \
-  -map-unk "" -kndiscount -interpolate -lm $dir/lm.arpa
-ngram -lm $dir/lm.arpa -ppl $dir/heldout
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/local/word_segmentation.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/local/word_segmentation.py
deleted file mode 100644
index 117686dd3e826ebc63abc22779c913a2cb9f78d2..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/local/word_segmentation.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/usr/bin/env python
-# encoding=utf-8
-# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
-#           2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
-# Apache 2.0
-
-from __future__ import print_function
-import sys
-import jieba
-
-if len(sys.argv) < 3:
-    sys.stderr.write(
-        "word_segmentation.py <vocab> <trans>\n")
-    exit(1)
-
-vocab_file = sys.argv[1]
-trans_file = sys.argv[2]
-
-jieba.set_dictionary(vocab_file)
-for line in open(trans_file, 'r', encoding='utf8'):
-    key, trans = line.strip().split(' ', 1)
-    words = jieba.cut(trans,
-                      HMM=False)  # turn off new word discovery (HMM-based)
-    new_line = key + '\t' + " ".join(words)
-    print(new_line)
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/path.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/path.sh
deleted file mode 100644
index 8d4c9092e217d2fbd4e4cada2edbe0da768358eb..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/path.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-export WENET_DIR=$PWD/../../..
-export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build -export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix -export PATH=$PWD:${BUILD_DIR}:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/run.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/run.sh deleted file mode 100644 index 8102c1a888ab42f0b391228314696c210f85b186..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/run.sh +++ /dev/null @@ -1,191 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -# 2022 burkliu(boji123@aliyun.com) - -. ./path.sh || exit 1; - -# Use this to control how many gpu you use, It's 1-gpu training if you specify -# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0,1,2,3" - -stage=0 # start from 0 if you need to start from data preparation -stop_stage=5 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training -num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 - -# modify this to your AISHELL-2 data path -# Note: the evaluation data (dev & test) is available at AISHELL. -# Please download it from http://aishell-eval.oss-cn-beijing.aliyuncs.com/TEST%26DEV%20DATA.zip -train_set=/cfs/share/corpus/aishell-2/AISHELL-2/iOS/data -dev_set=/cfs/share/corpus/aishell-2/AISHELL-DEV-TEST-SET/iOS/dev -test_set=/cfs/share/corpus/aishell-2/AISHELL-DEV-TEST-SET/iOS/test - -nj=16 -dict=data/dict/lang_char.txt - -train_set=train -train_config=conf/conformer_u2pp_rnnt.yaml -cmvn=true -dir=exp/`basename ${train_config%.*}` -checkpoint= - -# use average_checkpoint will get better result -average_checkpoint=true -decode_checkpoint=$dir/final.pt -average_num=30 -decode_modes="rnnt_beam_search" - -# Specify decoding_chunk_size if it's a unified dynamic chunk trained model -# -1 for full chunk -decoding_chunk_size=-1 -# only used in rescore mode for weighting different scores -rescore_ctc_weight=0.5 -rescore_transducer_weight=0.5 -rescore_attn_weight=0.5 -# only used in beam search, either pure beam search mode OR beam search inside rescoring -search_ctc_weight=0.3 -search_transducer_weight=0.7 - -. 
tools/parse_options.sh || exit 1;
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-  # Data preparation
-  local/prepare_data.sh ${train_set} data/local/${train_set} data/${train_set} || exit 1;
-  local/prepare_data.sh ${dev_set} data/local/dev data/dev || exit 1;
-  local/prepare_data.sh ${test_set} data/local/test data/test || exit 1;
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-  # remove the space between the text labels for Mandarin dataset
-  for x in ${train_set} dev test; do
-    cp data/${x}/text data/${x}/text.org
-    paste -d " " <(cut -f 1 data/${x}/text.org) <(cut -f 2- data/${x}/text.org \
-      | tr 'a-z' 'A-Z' | sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' | tr -d " ") \
-      > data/${x}/text
-    rm data/${x}/text.org
-  done
-
-  tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
-    --in_scp data/${train_set}/wav.scp \
-    --out_cmvn data/$train_set/global_cmvn
-
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-  # Make train dict
-  echo "Make a dictionary"
-  mkdir -p $(dirname $dict)
-  echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
-  echo "<unk> 1" >> ${dict} # <unk> must be 1
-  tools/text2token.py -s 1 -n 1 data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \
-    | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict}
-  num_token=$(cat $dict | wc -l)
-  echo "<sos/eos> $num_token" >> $dict # <eos>
-fi
-
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-  # Prepare wenet required data
-  echo "Prepare data, prepare required format"
-  for x in dev test ${train_set}; do
-    tools/make_raw_list.py data/$x/wav.scp data/$x/text data/$x/data.list
-  done
-fi
-
-if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
-  # Training
-  mkdir -p $dir
-  INIT_FILE=$dir/ddp_init
-  # You had better rm it manually before you start run.sh on first node.
-  # rm -f $INIT_FILE # delete old one before starting
-  init_method=file://$(readlink -f $INIT_FILE)
-  echo "$0: init method is $init_method"
-  # The number of gpus runing on each node/machine
-  num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
-  # Use "nccl" if it works, otherwise use "gloo"
-  dist_backend="gloo"
-  #dist_backend="nccl"
-  # The total number of processes/gpus, so that the master knows
-  # how many workers to wait for.
-  # More details about ddp can be found in
-  # https://pytorch.org/tutorials/intermediate/dist_tuto.html
-  world_size=`expr $num_gpus \* $num_nodes`
-  echo "total gpus is: $world_size"
-  cmvn_opts=
-  $cmvn && cp data/${train_set}/global_cmvn $dir
-  $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
-  # train.py will write $train_config to $dir/train.yaml with model input
-  # and output dimension, train.yaml will be used for inference or model
-  # export later
-  for ((i = 0; i < $num_gpus; ++i)); do
-  {
-    gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
-    # Rank of each gpu/process used for knowing whether it is
-    # the master of a worker.
- rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type raw \ - --symbol_table $dict \ - --train_data data/$train_set/data.list \ - --cv_data data/dev/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ - --ddp.dist_backend $dist_backend \ - --num_workers 4 \ - $cmvn_opts \ - 2>&1 | tee -a $dir/train.log || exit 1; - } & - done - wait -fi - -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # Test model, please specify the model you want to test by --checkpoint - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$dir/avg_${average_num}.pt - echo "do model average and final checkpoint is $decode_checkpoint" - python wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $dir \ - --num ${average_num} \ - --val_best \ - 2>&1 | tee -a $dir/average.log || exit 1; - fi - - for mode in ${decode_modes}; do - { - test_dir=$dir/test_${mode}_chunk_${decoding_chunk_size} - mkdir -p $test_dir - python wenet/bin/recognize.py --gpu 0 \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type raw \ - --test_data data/test/data.list \ - --checkpoint $decode_checkpoint \ - --beam_size 10 \ - --batch_size 1 \ - --penalty 0.0 \ - --dict $dict \ - --ctc_weight $rescore_ctc_weight \ - --transducer_weight $rescore_transducer_weight \ - --attn_weight $rescore_attn_weight \ - --search_ctc_weight $search_ctc_weight \ - --search_transducer_weight $search_transducer_weight \ - --result_file $test_dir/text \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - python tools/compute-wer.py --char=1 --v=1 \ - data/test/text $test_dir/text > $test_dir/wer - } & - done - wait -fi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/alignment.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/alignment.sh deleted file mode 100644 index 64d860bb61761cadca750c9baf91eddb49e56728..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/alignment.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -stage=0 # start from 0 if you need to start from data preparation -stop_stage=0 - -nj=16 -feat_dir=raw_wav -dict=data/dict/lang_char.txt - -dir=exp/ -config=$dir/train.yaml -checkpoint= -checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt -config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml -set= -ali_format=$feat_dir/$set/format.data -ali_format=format.data -ali_result=$dir/ali - -. 
tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - nj=32 - # Prepare required data for ctc alignment - echo "Prepare data, prepare required format" - for x in $set; do - tools/format_data.sh --nj ${nj} \ - --feat-type wav --feat $feat_dir/$x/wav.scp \ - $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp - - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Test model, please specify the model you want to use by --checkpoint - python wenet/bin/alignment_deprecated.py --gpu -1 \ - --config $config \ - --input_file $ali_format \ - --checkpoint $checkpoint \ - --batch_size 1 \ - --dict $dict \ - --result_file $ali_result \ - -fi - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/analyze_dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/analyze_dataset.py deleted file mode 100644 index d4373b065c301972fe0164b6df3591166000acfc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/analyze_dataset.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Analyze Dataset, Duration/TextLength/Speed etc. - -Usage: -. ./path.sh && python3 tools/analyze_dataset.py \ - --data_type "shard" \ - --data_list data/test/data.list \ - --output_dir exp/analyze_test \ - --num_thread 32 -""" - -import os -import json -import math -import time -import numpy -import logging -import librosa -import tarfile -import argparse -import torchaudio -import multiprocessing - -from wenet.utils.file_utils import read_lists -from wenet.dataset.processor import AUDIO_FORMAT_SETS - - -def get_args(): - parser = argparse.ArgumentParser(description='Analyze dataset') - parser.add_argument('--data_type', - default='wav_scp', - choices=['wav_scp', 'raw', 'shard'], - help='dataset type') - parser.add_argument('--output_dir', type=str, - default="exp", help='write info to output dir') - parser.add_argument('--data_list', default=None, - help='used in raw/shard mode') - parser.add_argument('--wav_scp', default=None, - help='used in wav_scp mode') - parser.add_argument('--text', default=None, - help='used in wav_scp mode') - parser.add_argument('--num_thread', type=int, - default=4, help='number of threads') - args = parser.parse_args() - print(args) - return args - - -def analyze(datas, output_file, thread_id): - with open(output_file, "w", encoding='utf8') as f: - for i, data in enumerate(datas): - if type(data['wav']) is numpy.ndarray: - y, sample_rate = data['wav'], data['sample_rate'] - data['wav'] = "None" # NOTE(xcsong): Do not save wav. 
- elif type(data['wav'] is str): - y, sample_rate = librosa.load(data['wav'], sr=16000) - data['dur'] = len(y) / sample_rate - data['txt_length'] = len(data['txt']) - data['speed'] = data['txt_length'] / data['dur'] - # Trim the beginning and ending silence - _, index = librosa.effects.trim(y, top_db=30) - data['leading_sil'] = librosa.get_duration( - y=y[:index[0]], sr=16000) * 1000 if index[0] > 0 else 0 - data['trailing_sil'] = librosa.get_duration( - y=y[index[1]:], sr=16000) * 1000 if index[1] < len(y) else 0 - data_str = json.dumps(data, ensure_ascii=False) - f.write("{}\n".format(data_str)) - if thread_id == 0 and i % 100 == 0: - logging.info("\tThread-{}: processed {}/{}".format( - thread_id, i, len(datas))) - - -def read_tar(file): - try: - with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream: - prev_prefix = None - data = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - data['key'] = prev_prefix - if valid: - yield data - data = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - data['txt'] = file_obj.read().decode( - 'utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load( - file_obj) - # single channel - data['wav'] = waveform.numpy()[0, :] - data['sample_rate'] = sample_rate - else: - data[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning( - 'error: {} when parse {}'.format(ex, name)) - prev_prefix = prefix - # The last data in tar - if prev_prefix is not None: - data['key'] = prev_prefix - yield data - except Exception as ex: - logging.warning( - 'tar_file error: {} when processing {}'.format(ex, file)) - - -def main(): - start_time = time.time() - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.makedirs(args.output_dir, exist_ok=True) - os.makedirs(args.output_dir + "/partition", exist_ok=True) - datas = [[] for i in range(args.num_thread)] - - logging.info("Stage-1: Loading data.list OR wav.scp...") - if args.data_type == "shard": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - total = 0 - for line in lists: - for data in read_tar(line): - datas[total % args.num_thread].append(data) - total = total + 1 - elif args.data_type == "raw": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - for i, line in enumerate(lists): - data = json.loads(line) - datas[i % args.num_thread].append(data) - elif args.data_type == "wav_scp": - assert args.wav_scp is not None - assert args.text is not None - wavs, texts = {}, {} - # wavs - for line in read_lists(args.wav_scp): - line = line.strip().split() - wavs[line[0]] = line[1] - # texts - for line in read_lists(args.text): - line = line.strip().split(maxsplit=1) - texts[line[0]] = line[1] - sorted(wavs) - sorted(texts) - # partition - for i, (key1, key2) in enumerate(zip(wavs, texts)): - assert key1 == key2 - datas[i % args.num_thread].append( - {'key': key1, "wav": wavs[key1], "txt": texts[key1]} - ) - - logging.info("Stage-2: Start Analyze") - # threads - pool = multiprocessing.Pool(processes=args.num_thread) - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - pool.apply_async(analyze, (datas[i], output_file, i)) - pool.close() - 
pool.join() - - logging.info("Stage-3: Sort and Write Result") - datas = [] - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - with open(output_file, "r", encoding='utf8') as f: - for line in f.readlines(): - data = json.loads(line) - datas.append(data) - total_dur = sum([x['dur'] for x in datas]) - total_len = sum([x['txt_length'] for x in datas]) - total_leading_sil = sum([x['leading_sil'] for x in datas]) - total_trailing_sil = sum([x['trailing_sil'] for x in datas]) - num_datas = len(datas) - names = ['key', 'dur', 'txt_length', 'speed', - 'leading_sil', 'trailing_sil'] - units = ['', 's', '', 'char/s', 'ms', 'ms'] - avgs = [0, total_dur / num_datas, total_len / num_datas, - total_len / total_dur, total_leading_sil / num_datas, - total_trailing_sil / num_datas] - stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]), - sum([(x['txt_length'] - avgs[2])**2 for x in datas]), - sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]), - sum([(x['leading_sil'] - avgs[4])**2 for x in datas]), - sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])] - stds = [math.sqrt(x / num_datas) for x in stds] - parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min'] - index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - - with open(args.output_dir + "/analyze_result_brief", - "w", encoding='utf8') as f: - for i, (name, unit, avg, std) in enumerate( - zip(names, units, avgs, stds)): - if name == 'key': - continue - f.write("==================\n") - - datas.sort(key=lambda x: x[name]) - for p, j in zip(parts, index): - f.write("{} {}: {:.3f} {} (wav_id: {})\n".format( - p, name, datas[j][name], unit, datas[j]['key'])) - f.write("avg {}: {:.3f} {}\n".format( - name, avg, unit)) - f.write("std {}: {:.3f}\n".format( - name, std)) - os.system("cat {}".format(args.output_dir + "/analyze_result_brief")) - - datas.sort(key=lambda x: x['dur']) - with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f: - for data in datas: - f.write("{}\n".format(json.dumps(data, ensure_ascii=False))) - - end_time = time.time() - logging.info("Time Cost: {:.3f}s".format(end_time - start_time)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/cmvn_kaldi2json.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/cmvn_kaldi2json.py deleted file mode 100644 index 9966046c95a9d50438c4857b785cb7985182e376..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/cmvn_kaldi2json.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import logging -import sys -import json - -def kaldi2json(kaldi_cmvn_file): - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - cmvn_info = 
{'mean_stat:' : means, - 'var_stat' : variance, - 'frame_num' : count} - return cmvn_info - -if __name__ == '__main__': - with open(sys.argv[2], 'w') as fout: - cmvn = kaldi2json(sys.argv[1]) - fout.write(json.dumps(cmvn)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/combine_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/combine_data.sh deleted file mode 100644 index 8a56c43f1a2a238d78270f94f3d22f1af540e912..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/combine_data.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2014 David Snyder - -# This script combines the data from multiple source directories into -# a single destination directory. - -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information -# about what these directories contain. - -# Begin configuration section. -extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." -skip_fix=false # skip the fix_data_dir.sh in the end -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi - -if [ $# -lt 2 ]; then - echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." - echo "Note, files that don't appear in all source dirs will not be combined," - echo "with the exception of utt2uniq and segments, which are created where necessary." - exit 1 -fi - -dest=$1; -shift; - -first_src=$1; - -rm -r $dest 2>/dev/null -mkdir -p $dest; - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/utt2spk ]; then - echo "$0: no such file $dir/utt2spk" - exit 1; - fi -done - -# Check that frame_shift are compatible, where present together with features. -dir_with_frame_shift= -for dir in $*; do - if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then - if [[ $dir_with_frame_shift ]] && - ! cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then - echo "$0:error: different frame_shift in directories $dir and " \ - "$dir_with_frame_shift. Cannot combine features." - exit 1; - fi - dir_with_frame_shift=$dir - fi -done - -# W.r.t. utt2uniq file the script has different behavior compared to other files -# it is not compulsary for it to exist in src directories, but if it exists in -# even one it should exist in all. We will create the files where necessary -has_utt2uniq=false -for in_dir in $*; do - if [ -f $in_dir/utt2uniq ]; then - has_utt2uniq=true - break - fi -done - -if $has_utt2uniq; then - # we are going to create an utt2uniq file in the destdir - for in_dir in $*; do - if [ ! -f $in_dir/utt2uniq ]; then - # we assume that utt2uniq is a one to one mapping - cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' - else - cat $in_dir/utt2uniq - fi - done | sort -k1 > $dest/utt2uniq - echo "$0: combined utt2uniq" -else - echo "$0 [info]: not combining utt2uniq as it does not exist" -fi -# some of the old scripts might provide utt2uniq as an extrafile, so just remove it -extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") - -# segments are treated similarly to utt2uniq. If it exists in some, but not all -# src directories, then we generate segments where necessary. 
-has_segments=false -for in_dir in $*; do - if [ -f $in_dir/segments ]; then - has_segments=true - break - fi -done - -if $has_segments; then - for in_dir in $*; do - if [ ! -f $in_dir/segments ]; then - echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 - utils/data/get_segments_for_data.sh $in_dir - else - cat $in_dir/segments - fi - done | sort -k1 > $dest/segments - echo "$0: combined segments" -else - echo "$0 [info]: not combining segments as it does not exist" -fi - -for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do - exists_somewhere=false - absent_somewhere=false - for d in $*; do - if [ -f $d/$file ]; then - exists_somewhere=true - else - absent_somewhere=true - fi - done - - if ! $absent_somewhere; then - set -o pipefail - ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; - set +o pipefail - echo "$0: combined $file" - else - if ! $exists_somewhere; then - echo "$0 [info]: not combining $file as it does not exist" - else - echo "$0 [info]: **not combining $file as it does not exist everywhere**" - fi - fi -done - -tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt - -if [[ $dir_with_frame_shift ]]; then - cp $dir_with_frame_shift/frame_shift $dest -fi - -if ! $skip_fix ; then - tools/fix_data_dir.sh $dest || exit 1; -fi - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/compute-cer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/compute-cer.py deleted file mode 100644 index a0a8f8fe1f59251c5d8fefeb62ef469276fc6063..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/compute-cer.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import sys -import unicodedata -import codecs - -remove_tag = True -spacelist = [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - # https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': - sep = '>' - j = i + 1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c == sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: - return '' - chars = [] - i = 0 - T = len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - if x.isalnum(): - for k in x: - new_sentence.append(k) - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i - 1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j - 1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, - 'ins': 0, 'del': 0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = 
result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i={i} , j={j} , \ - error={error}'. - format(i=i, j=j, error=self.space[i][j]['error'])) - return result - - def overall(self) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def cluster(self, data) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [unicodedata.name(char) for char in word] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names) - 1) : - if unicode_names[i] != unicode_names[i + 1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) \ - and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] \ - [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \ - [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose = 1 - padding_symbol = ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose = 0 - try: - verbose = int(b) - except Exception: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol = ' ' - elif b == 'underline': - padding_symbol = '_' - continue - if True or sys.argv[1].startswith('-'): - # ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig = set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, - case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : 
- if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array) == 0: - continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length - len_lab) - space['rec'].append(length - len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end=' ') - else: - print('lab:', end=' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['lab'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end=' ') - else: - print('rec:', end=' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['rec'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===================================================' - '========================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster(k for k in default_clusters[cluster_id]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 
100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif (token[0] == '<' and token[len(token) - 1] == '>' and - cluster_id == ''): - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('=======================================' - '====================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/compute-wer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/compute-wer.py deleted file mode 100644 index a3eefc0dc7b67f252e685da71a5189312e74ef85..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/compute-wer.py +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import re, sys, unicodedata -import codecs - -remove_tag = True -spacelist= [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - #https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - 
j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array)==0: continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) 
- else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===========================================================================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster([ k for k in default_clusters[cluster_id] ]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - 
print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif token[0] == '<' and token[len(token)-1] == '>' and \ - cluster_id == '' : - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('===========================================================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/compute_cmvn_stats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/compute_cmvn_stats.py deleted file mode 100644 index 9c89789c47be0c855939469e86040f10398e9d89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/compute_cmvn_stats.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys -import argparse -import json -import codecs -import yaml - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.utils.data import Dataset, DataLoader - -torchaudio.set_audio_backend("sox_io") - - -class CollateFunc(object): - ''' Collate function for AudioDataset - ''' - - def __init__(self, feat_dim, resample_rate): - self.feat_dim = feat_dim - self.resample_rate = resample_rate - pass - - def __call__(self, batch): - mean_stat = torch.zeros(self.feat_dim) - var_stat = torch.zeros(self.feat_dim) - number = 0 - for item in batch: - value = item[1].strip().split(",") - assert len(value) == 3 or len(value) == 1 - wav_path = value[0] - sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate - resample_rate = sample_rate - # len(value) == 3 means segmented wav.scp, - # len(value) == 1 means original wav.scp - if len(value) == 3: - start_frame = int(float(value[1]) * sample_rate) - end_frame = int(float(value[2]) * sample_rate) - waveform, sample_rate = torchaudio.backend.sox_io_backend.load( - filepath=wav_path, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(item[1]) - - waveform = waveform * (1 << 15) - if self.resample_rate != 0 and self.resample_rate != sample_rate: - resample_rate = self.resample_rate - waveform = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - - mat = kaldi.fbank(waveform, - num_mel_bins=self.feat_dim, - dither=0.0, - energy_floor=0.0, - sample_frequency=resample_rate) - mean_stat += torch.sum(mat, axis=0) - var_stat += torch.sum(torch.square(mat), axis=0) - number += mat.shape[0] - return number, mean_stat, var_stat - - -class AudioDataset(Dataset): - def __init__(self, data_file): - self.items = [] - with codecs.open(data_file, 'r', encoding='utf-8') as f: - for line in f: - arr = line.strip().split() - self.items.append((arr[0], arr[1])) - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - return self.items[idx] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='extract CMVN stats') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for processing') - parser.add_argument('--train_config', - default='', - help='training yaml conf') - parser.add_argument('--in_scp', default=None, help='wav scp file') - 
parser.add_argument('--out_cmvn', - default='global_cmvn', - help='global cmvn file') - - doc = "Print log after every log_interval audios are processed." - parser.add_argument("--log_interval", type=int, default=1000, help=doc) - args = parser.parse_args() - - with open(args.train_config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - resample_rate = 0 - if 'resample_conf' in configs['dataset_conf']: - resample_rate = configs['dataset_conf']['resample_conf']['resample_rate'] - print('using resample and new sample rate is {}'.format(resample_rate)) - - collate_func = CollateFunc(feat_dim, resample_rate) - dataset = AudioDataset(args.in_scp) - batch_size = 20 - data_loader = DataLoader(dataset, - batch_size=batch_size, - shuffle=True, - sampler=None, - num_workers=args.num_workers, - collate_fn=collate_func) - - with torch.no_grad(): - all_number = 0 - all_mean_stat = torch.zeros(feat_dim) - all_var_stat = torch.zeros(feat_dim) - wav_number = 0 - for i, batch in enumerate(data_loader): - number, mean_stat, var_stat = batch - all_mean_stat += mean_stat - all_var_stat += var_stat - all_number += number - wav_number += batch_size - - if wav_number % args.log_interval == 0: - print(f'processed {wav_number} wavs, {all_number} frames', - file=sys.stderr, - flush=True) - - cmvn_info = { - 'mean_stat': list(all_mean_stat.tolist()), - 'var_stat': list(all_var_stat.tolist()), - 'frame_num': all_number - } - - with open(args.out_cmvn, 'w') as fout: - fout.write(json.dumps(cmvn_info)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/compute_fbank_feats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/compute_fbank_feats.py deleted file mode 100644 index 4cc7dae54de6e8b24b14148bd3930d19b4d7b28c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/compute_fbank_feats.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging - -import torchaudio -import torchaudio.compliance.kaldi as kaldi - -import wenet.dataset.kaldi_io as kaldi_io - -# The "sox" backends are deprecated and will be removed in 0.9.0 release. 
-# So here we use sox_io backend -torchaudio.set_audio_backend("sox_io") - - -def parse_opts(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--num_mel_bins', - default=80, - type=int, - help='Number of triangular mel-frequency bins') - parser.add_argument('--frame_length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument('--frame_shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument('--dither', - type=int, - default=0.0, - help='Dithering constant (0.0 means no dither)') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_scp', help='wav scp file') - parser.add_argument('out_ark', help='output ark file') - parser.add_argument('out_scp', help='output scp file') - args = parser.parse_args() - return args - - -# wav format: -def load_wav_scp(wav_scp_file): - wav_list = [] - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_list.append((arr[0], arr[1])) - return wav_list - - -# wav format: -def load_wav_scp_dict(wav_scp_file): - wav_dict = {} - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_dict[arr[0]] = arr[1] - return wav_dict - - -# Segments format: -def load_wav_segments(wav_scp_file, segments_file): - wav_dict = load_wav_scp_dict(wav_scp_file) - audio_list = [] - with open(segments_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - key = arr[0] - wav_file = wav_dict[arr[1]] - start = float(arr[2]) - end = float(arr[3]) - audio_list.append((key, wav_file, start, end)) - return audio_list - - -if __name__ == '__main__': - args = parse_opts() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - if args.segments is None: - audio_list = load_wav_scp(args.wav_scp) - else: - audio_list = load_wav_segments(args.wav_scp, args.segments) - - count = 0 - with open(args.out_ark, 'wb') as ark_fout, \ - open(args.out_scp, 'w', encoding='utf8') as scp_fout: - for item in audio_list: - if len(item) == 2: - key, wav_path = item - waveform, sample_rate = torchaudio.load_wav(wav_path) - else: - assert len(item) == 4 - key, wav_path, start, end = item - sample_rate = torchaudio.info(wav_path).sample_rate - frame_offset = int(start * sample_rate) - num_frames = int((end - start) * sample_rate) - waveform, sample_rate = torchaudio.load_wav( - wav_path, frame_offset, num_frames) - - mat = kaldi.fbank(waveform, - num_mel_bins=args.num_mel_bins, - frame_length=args.frame_length, - frame_shift=args.frame_shift, - dither=args.dither, - energy_floor=0.0, - sample_frequency=sample_rate) - mat = mat.detach().numpy() - kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout) - count += 1 - if count % 10000 == 0: - logging.info('Progress {}/{}'.format(count, len(audio_list))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/copy_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/copy_data_dir.sh deleted file mode 100644 index ee880c4c3ca398a58a4e306467c639b0a76310bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/copy_data_dir.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins University 
(author: Daniel Povey) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# feats.scp -# wav.scp -# vad.scp -# spk2utt -# utt2spk -# text -# -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance and/or speaker names. Note, the recording-ids stay the same. -# - - -# begin configuration section -spk_prefix= -utt_prefix= -spk_suffix= -utt_suffix= -validate_opts= # should rarely be needed. -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" - echo "Options" - echo " --spk-prefix= # Prefix for speaker ids, default empty" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --spk-suffix= # Suffix for speaker ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -srcdir=$1 -destdir=$2 - -if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" - exit 1; -fi - -if [ "$destdir" == "$srcdir" ]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -set -e; - -mkdir -p $destdir - -cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map -cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map - -if [ ! -f $srcdir/utt2uniq ]; then - if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then - cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq - fi -else - cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq -fi - -cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ - utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk - -utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt - -if [ -f $srcdir/feats.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp -fi - -if [ -f $srcdir/vad.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp -fi - -if [ -f $srcdir/segments ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments - cp $srcdir/wav.scp $destdir -else # no segments->wav indexed by utt. 
- if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/utt2num_frames ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in frame_shift stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -echo $validate_opts -echo $destdir -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/decode.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/decode.sh deleted file mode 100644 index 1d49b0e48631f4818fb9c464df66904170275a33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/decode.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: binbinzhang@mobvoi.com (Binbin Zhang) -export GLOG_logtostderr=1 -export GLOG_v=2 - -set -e - -nj=1 -chunk_size=-1 -ctc_weight=0.0 -reverse_weight=0.0 -rescoring_weight=1.0 -# For CTC WFST based decoding -fst_path= -dict_path= -acoustic_scale=1.0 -beam=15.0 -lattice_beam=12.0 -min_active=200 -max_active=7000 -blank_skip_thresh=1.0 -length_penalty=0.0 - -. tools/parse_options.sh || exit 1; -if [ $# != 5 ]; then - echo "Usage: $0 [options] " - exit 1; -fi - -if ! which decoder_main > /dev/null; then - echo "decoder_main is not built, please go to runtime/libtorch to build it." - exit 1; -fi - -scp=$1 -label_file=$2 -model_file=$3 -unit_file=$4 -dir=$5 - -mkdir -p $dir/split${nj} - -# Step 1. Split wav.scp -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp" -done -tools/data/split_scp.pl ${scp} ${split_scps} - -# Step 2. Parallel decoding -wfst_decode_opts= -if [ ! 
-z $fst_path ]; then - wfst_decode_opts="--fst_path $fst_path" - wfst_decode_opts="$wfst_decode_opts --beam $beam" - wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path" - wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam" - wfst_decode_opts="$wfst_decode_opts --max_active $max_active" - wfst_decode_opts="$wfst_decode_opts --min_active $min_active" - wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale" - wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh" - wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty" - echo $wfst_decode_opts > $dir/config -fi -for n in $(seq ${nj}); do -{ - decoder_main \ - --rescoring_weight $rescoring_weight \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --chunk_size $chunk_size \ - --wav_scp ${dir}/split${nj}/wav.${n}.scp \ - --model_path $model_file \ - --unit_path $unit_file \ - $wfst_decode_opts \ - --result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log -} & -done -wait - -# Step 3. Merge files -for n in $(seq ${nj}); do - cat ${dir}/split${nj}/${n}.text -done > ${dir}/text -tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf - -# Step 4. Compute WER -python3 tools/compute-wer.py --char=1 --v=1 \ - $label_file $dir/text > $dir/wer diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/feat_to_shape.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/feat_to_shape.sh deleted file mode 100644 index ab6d45c60709dd05a38f8da269d617233d0d39f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/feat_to_shape.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Begin configuration section. -nj=4 -cmd=run.pl -verbose=0 -filetype="" -preprocess_conf="" -# End configuration section. - -help_message=$(cat << EOF -Usage: $0 [options] [] -e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) - -echo "$0 $*" 1>&2 # Print the command line for logging - -. parse_options.sh || exit 1; - -if [ $# -lt 2 ] || [ $# -gt 3 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -scp=$1 -outscp=$2 -data=$(dirname ${scp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${logdir}/feats.${n}.scp" -done - -utils/split_scp.pl ${scp} ${split_scps} - -if [ -n "${preprocess_conf}" ]; then - preprocess_opt="--preprocess-conf ${preprocess_conf}" -else - preprocess_opt="" -fi -if [ -n "${filetype}" ]; then - filetype_opt="--filetype ${filetype}" -else - filetype_opt="" -fi - -${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ - feat-to-len --verbose=${verbose} \ - scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp - -feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -) - -# concatenate the .scp files together. 
-for n in $(seq ${nj}); do - sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp -done > ${outscp} - -rm -f ${logdir}/feats.*.scp 2>/dev/null diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/filter_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/filter_scp.pl deleted file mode 100644 index b76d37f41be0886470281978bfacf97f6b8ae976..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/filter_scp.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose "n-th" field is an utterance id), printing -# out only those lines whose "n-th" field is in id_list. The index of -# the "n-th" field is 1, by default, but can be changed by using -# the -f switch - -$exclude = 0; -$field = 1; -$shifted = 0; - -do { - $shifted=0; - if ($ARGV[0] eq "--exclude") { - $exclude = 1; - shift @ARGV; - $shifted=1; - } - if ($ARGV[0] eq "-f") { - $field = $ARGV[1]; - shift @ARGV; shift @ARGV; - $shifted=1 - } -} while ($shifted); - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . - "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . - "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . - "only the lines that were *not* in id_list.\n" . - "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . - "If your older scripts (written before Oct 2014) stopped working and you used the\n" . - "-f option, add 1 to the argument.\n" . - "See also: utils/filter_scp.pl .\n"; -} - - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -if ($field == 1) { # Treat this as special case, since it is common. - while(<>) { - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { - print $_; - } - } -} else { - while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { - print $_; - } - } -} - -# tests: -# the following should print "foo 1" -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) -# the following should print "bar 2". 
-# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fix_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fix_data_dir.sh deleted file mode 100644 index d1644c1cac4264c78eae7d91b03c4126baf7ec4c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fix_data_dir.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# This script makes sure that only the segments present in -# all of "feats.scp", "wav.scp" [if present], segments [if present] -# text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into -# data-dir/.backup - -cmd="$@" - -utt_extra_files= -spk_extra_files= - -. tools/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: utils/data/fix_data_dir.sh " - echo "e.g.: utils/data/fix_data_dir.sh data/train" - echo "This script helps ensure that the various files in a data directory" - echo "are correctly sorted and filtered, for example removing utterances" - echo "that have no features (if feats.scp is present)" - exit 1 -fi - -data=$1 - -if [ -f $data/images.scp ]; then - image/fix_data_dir.sh $cmd - exit $? -fi - -mkdir -p $data/.backup - -[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; - -[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; - -set -e -o pipefail -u - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted { - file=$1 - sort -k1,1 -u <$file >$file.tmp - if ! cmp -s $file $file.tmp; then - echo "$0: file $1 is not in sorted order or not unique, sorting it" - mv $file.tmp $file - else - rm $file.tmp - fi -} - -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - check_sorted $data/$x - fi -done - - -function filter_file { - filter=$1 - file_to_filter=$2 - cp $file_to_filter ${file_to_filter}.tmp - tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter - if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=$(cat ${file_to_filter}.tmp | wc -l) - length2=$(cat ${file_to_filter} | wc -l) - if [ $length1 -ne $length2 ]; then - echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." - fi - fi - rm $file_to_filter.tmp -} - -function filter_recordings { - # We call this once before the stage when we filter on utterance-id, and once - # after. - - if [ -f $data/segments ]; then - # We have a segments file -> we need to filter this and the file wav.scp, and - # reco2file_and_utt, if it exists, to make sure they have the same list of - # recording-ids. - - if [ ! -f $data/wav.scp ]; then - echo "$0: $data/segments exists but not $data/wav.scp" - exit 1; - fi - awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=$(cat $tmpdir/recordings | wc -l) - [ ! -s $tmpdir/recordings ] && \ - echo "Empty list of recordings (bad file $data/segments)?" 
&& exit 1; - tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp - mv $tmpdir/recordings.tmp $tmpdir/recordings - - - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - filter_file $tmpdir/recordings $data/segments - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - rm $data/segments.tmp - - filter_file $tmpdir/recordings $data/wav.scp - [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur - true - fi -} - -function filter_speakers { - # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... - tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do - f=$data/$s - if [ -f $f ]; then - filter_file $f $tmpdir/speakers - fi - done - - filter_file $tmpdir/speakers $data/spk2utt - tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - - for s in cmvn.scp spk2gender $spk_extra_files; do - f=$data/$s - if [ -f $f ]; then - filter_file $tmpdir/speakers $f - fi - done -} - -function filter_utts { - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - echo "$(cat $tmpdir/utts | wc -l)" - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; - - ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; - - ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ - echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - - if [ -f $data/utt2uniq ]; then - ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ - echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; - fi - - maybe_wav= - maybe_reco2dur= - [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. - [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - - maybe_utt2dur= - if [ -f $data/utt2dur ]; then - cat $data/utt2dur | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1 - maybe_utt2dur=utt2dur.ok - fi - - maybe_utt2num_frames= - if [ -f $data/utt2num_frames ]; then - cat $data/utt2num_frames | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1 - maybe_utt2num_frames=utt2num_frames.ok - fi - - for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do - if [ -f $data/$x ]; then - tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp - echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)" - mv $tmpdir/utts.tmp $tmpdir/utts - # echo "$tmpdir/utts" - fi - done - rm $data/utt2dur.ok 2>/dev/null || true - rm $data/utt2num_frames.ok 2>/dev/null || true - - [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ - rm $tmpdir/utts && exit 1; - - - if [ -f $data/utt2spk ]; then - new_nutts=$(cat $tmpdir/utts | wc -l) - old_nutts=$(cat $data/utt2spk | wc -l) - if [ $new_nutts -ne $old_nutts ]; then - echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" - else - echo "fix_data_dir.sh: kept all $old_nutts utterances." 
- fi - fi - - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then - tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x - fi - fi - done - -} - -filter_recordings -filter_speakers -filter_utts -filter_speakers -filter_recordings - -tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - -echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/flake8_hook.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/flake8_hook.py deleted file mode 100644 index bbe21bf4aa8ab460aca0eba5a24785e4d6b2c39d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -import sys - -from flake8.main import git - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/format_data.sh deleted file mode 100644 index 51f4602dfa0bac7873541c7f621ef4bb9eb29c94..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/format_data.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Mobvoi Corporation (Author: Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -echo "$0 $*" >&2 # Print the command line for logging -. ./path.sh - -nj=1 -cmd=run.pl -nlsyms="" -lang="" -feat="" -feat_type="kaldi" -oov="" -bpecode="" -allow_one_column=false -raw="" -verbose=0 -trans_type=char -filetype="" -preprocess_conf="" -category="" -out="" # If omitted, write in stdout -help_message=$(cat << EOF -Usage: $0 -e.g. $0 data/train data/lang_1char/train_units.txt -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --feat # feat.scp or feat1.scp,feat2.scp,... - --feat-type # kaldi or wav - --oov # Default: - --out # If omitted, write in stdout - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) -. tools/parse_options.sh - -if [ $# != 2 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -dir=$1 -dic=$2 -tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) -#trap 'rm -rf ${tmpdir}' EXIT - -# 1. 
Create scp files for inputs -# These are not necessary for decoding mode, and make it as an option -input= -if [ -n "${feat}" ]; then - _feat_scps=$(echo "${feat}" | tr ',' ' ' ) - read -r -a feat_scps <<< $_feat_scps - num_feats=${#feat_scps[@]} - - for (( i=1; i<=num_feats; i++ )); do - feat=${feat_scps[$((i-1))]} - mkdir -p ${tmpdir}/input_${i} - input+="input_${i} " - cat ${feat} > ${tmpdir}/input_${i}/feat.scp - - # Dump in the "legacy" style JSON format - if [ -n "${filetype}" ]; then - awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ - > ${tmpdir}/input_${i}/filetype.scp - fi - - if [ ${feat_type} == "kaldi" ]; then - tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ - --filetype "${filetype}" \ - --preprocess-conf "${preprocess_conf}" \ - --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp - elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then - if [ -f $dir/segments ]; then - # used for segmented wav.scp - awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur - fi - if [ ! -f $dir/utt2dur ]; then - tools/wav_to_duration.sh --nj ${nj} \ - ${feat} ${tmpdir}/input_${i}/shape.scp - # use the existed utt2dur as shape.scp directly - else - cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp - fi - fi - done -fi - -# 2. Create scp files for outputs -mkdir -p ${tmpdir}/output -if [ -n "${bpecode}" ]; then - if [ "${trans_type}" == "cn_char_en_bpe" ]; then - tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp - else - paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \ - | tools/spm_encode --model=${bpecode} --output_format=piece) \ - > ${tmpdir}/output/token.scp - fi -elif [ -n "${nlsyms}" ]; then - tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -elif [ -n "${raw}" ]; then - cat $dir/text > ${tmpdir}/output/token.scp -else - tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -fi -< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp -odim=$(cat ${dic} | wc -l) -< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp - -cat ${dir}/text > ${tmpdir}/output/text.scp - -# 3. Create scp files for the others -mkdir -p ${tmpdir}/other -if [ -n "${lang}" ]; then - awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp -fi - -if [ -n "${category}" ]; then - awk -v category=${category} '{print $1 " " category}' ${dir}/text \ - > ${tmpdir}/other/category.scp -fi -#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp - -# 4. 
Merge scp files into a one file -opts="" -for intype in ${input} output other; do - if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then - continue - fi - - if [ ${intype} != other ]; then - opts+="--${intype%_*}-scps " - else - opts+="--scps " - fi - - for x in "${tmpdir}/${intype}"/*.scp; do - k=$(basename ${x} .scp) - if [ ${k} = shape ]; then - opts+="shape:${x}:shape " - else - opts+="${k}:${x} " - fi - done -done - -if ${allow_one_column}; then - opts+="--allow-one-column true " -else - opts+="--allow-one-column false " -fi - -if [ -n "${out}" ]; then - opts+="-O ${out}" -fi - -tools/merge_scp2txt.py --verbose ${verbose} ${opts} - -#rm -fr ${tmpdir} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/add_lex_disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/add_lex_disambig.pl deleted file mode 100644 index dd8a25de6e1140a6d19b1e876f2e76f528532edf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/add_lex_disambig.pl +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2013-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. -# With the --pron-probs option, expects the second field -# of each lexicon line to be a pron-prob. -# With the --sil-probs option, expects three additional -# fields after the pron-prob, representing various components -# of the silence probability model. - -$pron_probs = 0; -$sil_probs = 0; -$first_allowed_disambig = 1; - -for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { - if ($ARGV[0] eq "--pron-probs") { - $pron_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--sil-probs") { - $sil_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--first-allowed-disambig") { - $first_allowed_disambig = 0 + $ARGV[1]; - if ($first_allowed_disambig < 1) { - die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; - } - shift @ARGV; - shift @ARGV; - } -} - -if (@ARGV != 2) { - die "Usage: add_lex_disambig.pl [opts] \n" . - "This script adds disambiguation symbols to a lexicon in order to\n" . - "make decoding graphs determinizable; it adds pseudo-phone\n" . - "disambiguation symbols #1, #2 and so on at the ends of phones\n" . - "to ensure that all pronunciations are different, and that none\n" . - "is a prefix of another.\n" . - "It prints to the standard output the number of the largest-numbered" . - "disambiguation symbol that was used.\n" . - "\n" . - "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . 
- " --sil-probs [should be with --pron-probs option]\n" . - " Expect 3 extra fields after the pron-probs, for aspects of\n" . - " the silence probability model\n" . - " --first-allowed-disambig The number of the first disambiguation symbol\n" . - " that this script is allowed to add. By default this is\n" . - " #1, but you can set this to a larger value using this option.\n" . - "e.g.:\n" . - " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { - $p = shift @A; - if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } - } - if ($sil_probs) { - $silp = shift @A; - if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - } - if (!(@A)) { - die "Bad lexicon line $1, no phone in phone list"; - } - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that it exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { shift @A; } # remove pron-prob. - if ($sil_probs) { - shift @A; # Remove silprob - shift @A; # Remove silprob - } - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -# max_disambig will always be the highest-numbered disambiguation symbol that -# has been used so far. -$max_disambig = $first_allowed_disambig - 1; - -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - if ($pron_probs) { - $pron_prob = shift @A; - } - if ($sil_probs) { - $sil_word_prob = shift @A; - $word_sil_correction = shift @A; - $prev_nonsil_correction = shift @A - } - $phnseq = join(" ", @A); - if (!defined $issubseq{$phnseq} - && $count{$phnseq} == 1) { - ; # Do nothing. - } else { - if ($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved_for_the_empty_string{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; - if (!defined $cur_disambig) { - $cur_disambig = $first_allowed_disambig; - } else { - $cur_disambig++; # Get a number that has not been used yet for - # this phone sequence. - } - while (defined $reserved_for_the_empty_string{$cur_disambig}) { - $cur_disambig++; - } - if ($cur_disambig > $max_disambig) { - $max_disambig = $cur_disambig; - } - $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; - $phnseq = $phnseq . " #" . 
$cur_disambig; - } - } - if ($pron_probs) { - if ($sil_probs) { - print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; - } else { - print O "$word\t$pron_prob\t$phnseq\n"; - } - } else { - print O "$word\t$phnseq\n"; - } -} - -print $max_disambig . "\n"; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/compile_lexicon_token_fst.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/compile_lexicon_token_fst.sh deleted file mode 100644 index b67814fe3f3244b14b8e494bfe46c4829c4f8bd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/compile_lexicon_token_fst.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -# Copyright 2015 Yajie Miao (Carnegie Mellon University) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the -# phoneme and character-based lexicons. -set -eo pipefail -. tools/parse_options.sh - -if [ $# -ne 3 ]; then - echo "usage: tools/fst/compile_lexicon_token_fst.sh " - echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" - echo " should contain the following files:" - echo "lexicon.txt units.txt" - echo "options: " - exit 1; -fi - -srcdir=$1 -tmpdir=$2 -dir=$3 -mkdir -p $dir $tmpdir - -[ -f path.sh ] && . ./path.sh - -export LC_ALL=C - -cp $srcdir/units.txt $dir - -# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. -# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. -perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; - -# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. -# Without these symbols, determinization will fail. -ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` -ndisambig=$[$ndisambig+1]; - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list - -# Get the full list of CTC tokens used in FST. These tokens include , the blank , -# the actual model unit, and the disambiguation symbols. -cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list -(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt - -# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, -# so here use ctc_token_fst_compact -tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; - -# Encode the words with indices. 
Will be used in lexicon and language model FST compiling.
-cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk '
-  BEGIN {
-    print "<eps> 0";
-  }
-  {
-    printf("%s %d\n", $1, NR);
-  }
-  END {
-    printf("#0 %d\n", NR+1);
-    printf("<s> %d\n", NR+2);
-    printf("</s> %d\n", NR+3);
-  }' > $dir/words.txt || exit 1;
-
-# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time.
-token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'`
-word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'`
-
-tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \
-  fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \
-  --keep_isymbols=false --keep_osymbols=false | \
-  fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \
-  fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
-
-echo "Lexicon and token FSTs compiling succeeded"
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/ctc_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/ctc_token_fst.py
deleted file mode 100644
index d81644b9cd216177a10a17772781d3293abe084f..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/ctc_token_fst.py
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-print('0 1 <eps> <eps>')
-print('1 1 <blank> <eps>')
-print('2 2 <blank> <eps>')
-print('2 0 <eps> <eps>')
-
-with open(sys.argv[1], 'r', encoding='utf8') as fin:
-    node = 3
-    for entry in fin:
-        fields = entry.strip().split(' ')
-        phone = fields[0]
-        if phone == '<eps>' or phone == '<blank>':
-            continue
-        elif '#' in phone:  # disambiguous phone
-            print('{} {} {} {}'.format(0, 0, '<eps>', phone))
-        else:
-            print('{} {} {} {}'.format(1, node, phone, phone))
-            print('{} {} {} {}'.format(node, node, phone, '<eps>'))
-            print('{} {} {} {}'.format(node, 2, '<eps>', '<eps>'))
-        node += 1
-print('0')
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/ctc_token_fst_compact.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/ctc_token_fst_compact.py
deleted file mode 100644
index d3018d8b14ce25108cb1acc637cecded5d41be13..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/ctc_token_fst_compact.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-print('0 0 <blank> <eps>')
-
-with open(sys.argv[1], 'r', encoding='utf8') as fin:
-    node = 1
-    for entry in fin:
-        fields = entry.strip().split(' ')
-        phone = fields[0]
-        if phone == '<eps>' or phone == '<blank>':
-            continue
-        elif '#' in phone:  # disambiguous phone
-            print('{} {} {} {}'.format(0, 0, '<eps>', phone))
-        else:
-            print('{} {} {} {}'.format(0, node, phone, phone))
-            print('{} {} {} {}'.format(node, node, phone, '<eps>'))
-            print('{} {} {} {}'.format(node, 0, '<eps>', '<eps>'))
-        node += 1
-print('0')
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/ctc_token_fst_corrected.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/ctc_token_fst_corrected.py
deleted file mode 100644
index 81f7079eccb9e6447c46cdfdf6378aca7efe4a09..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/ctc_token_fst_corrected.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-
-def 
il(n): - return n + 1 - - -def ol(n): - return n + 1 - - -def s(n): - return n - - -if __name__ == "__main__": - with open(sys.argv[1]) as f: - lines = f.readlines() - phone_count = 0 - disambig_count = 0 - for line in lines: - sp = line.split() - phone = sp[0] - if phone == '' or phone == '': - continue - if phone.startswith('#'): - disambig_count += 1 - else: - phone_count += 1 - - # 1. add start state - print('0 0 {} 0'.format(il(0))) - - # 2. 0 -> i, i -> i, i -> 0 - for i in range(1, phone_count + 1): - print('0 {} {} {}'.format(s(i), il(i), ol(i))) - print('{} {} {} 0'.format(s(i), s(i), il(i))) - print('{} 0 {} 0'.format(s(i), il(0))) - - # 3. i -> other phone - for i in range(1, phone_count + 1): - for j in range(1, phone_count + 1): - if i != j: - print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) - - # 4. add disambiguous arcs on every final state - for i in range(0, phone_count + 1): - for j in range(phone_count + 2, phone_count + disambig_count + 2): - print('{} {} {} {}'.format(s(i), s(i), 0, j)) - - # 5. every i is final state - for i in range(0, phone_count + 1): - print(s(i)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/eps2disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/eps2disambig.pl deleted file mode 100644 index e1d84a6bf56703596a0e4552d184f7168f724bcb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/eps2disambig.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - if (/\s+#0\s+/) { - print STDERR "$0: ERROR: LM has word #0, " . - "which is reserved as disambiguation symbol\n"; - exit 1; - } - s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; - print; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/make_lexicon_fst.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/make_lexicon_fst.pl deleted file mode 100644 index f97129c05cb3ba6460be401e92001261acfaf746..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/make_lexicon_fst.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). - -$pron_probs = 0; - -if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { - $pron_probs = 1; - shift @ARGV; -} - -if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; - print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; - print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; - print STDERR " word phone1 phone2 ... phoneN;\n"; - print STDERR "if the --pron-probs option is used, each line is:\n"; - print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; - print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; - print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; - print STDERR "this is your responsibility.\n\n"; - print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; - print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; - print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; - exit(1); -} - -$lexfn = shift @ARGV; -if (@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2) { - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if ($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - -if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. - while () { - @A = split(" ", $_); - @A == 0 && die "Empty lexicon line."; - foreach $a (@A) { - if ($a eq "") { - die "Bad lexicon line $_ ( is forbidden)"; - } - } - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; # so we only print it on the first arc of the word. - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. 
- if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while () { - @A = split(" ", $_); - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. - $s = $ns; - } elsif (!defined($silphone) || $p ne $silphone) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/make_tlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/make_tlg.sh deleted file mode 100644 index 98694e5540968760f0c27eaf30a6668f4c46c50d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/make_tlg.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# - -if [ -f path.sh ]; then . path.sh; fi - -lm_dir=$1 -src_lang=$2 -tgt_lang=$3 - -arpa_lm=${lm_dir}/lm.arpa -[ ! 
-f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -rm -rf $tgt_lang -cp -r $src_lang $tgt_lang - -# Compose the language model to FST -cat $arpa_lm | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v -i '' | \ - grep -v -i '' | \ - arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \ - tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \ - --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst - - -echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic $tgt_lang/G.fst - -# Compose the token, lexicon and language-model FST into the final decoding graph -fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1; -fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1; - -echo "Composing decoding graph TLG.fst succeeded" -#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/prepare_dict.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/prepare_dict.py deleted file mode 100644 index 8a6a3cfe7cfded0c863637deef0bae2f2ede5557..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/prepare_dict.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -# sys.argv[1]: e2e model unit file(lang_char.txt) -# sys.argv[2]: raw lexicon file -# sys.argv[3]: output lexicon file -# sys.argv[4]: bpemodel - -unit_table = set() -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for line in fin: - unit = line.split()[0] - unit_table.add(unit) - - -def contain_oov(units): - for unit in units: - if unit not in unit_table: - return True - return False - - -bpemode = len(sys.argv) > 4 -if bpemode: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.Load(sys.argv[4]) -lexicon_table = set() -with open(sys.argv[2], 'r', encoding='utf8') as fin, \ - open(sys.argv[3], 'w', encoding='utf8') as fout: - for line in fin: - word = line.split()[0] - if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel - continue - elif word == '': - continue - else: - # each word only has one pronunciation for e2e system - if word in lexicon_table: - continue - if bpemode: - # We assume that the lexicon does not contain code-switch, - # i.e. the word contains both English and Chinese. - # see PR https://github.com/wenet-e2e/wenet/pull/1693 - # and Issue https://github.com/wenet-e2e/wenet/issues/1653 - if word.encode('utf8').isalpha(): - pieces = sp.EncodeAsPieces(word) - else: - pieces = word - if contain_oov(pieces): - print( - 'Ignoring words {}, which contains oov unit'.format( - ''.join(word).strip('▁')) - ) - continue - chars = ' '.join( - [p if p in unit_table else '' for p in pieces]) - else: - # ignore words with OOV - if contain_oov(word): - print('Ignoring words {}, which contains oov unit'.format(word)) - continue - # Optional, append ▁ in front of english word - # we assume the model unit of our e2e system is char now. 
- if word.encode('utf8').isalpha() and '▁' in unit_table: - word = '▁' + word - chars = ' '.join(word) # word is a char list - fout.write('{} {}\n'.format(word, chars)) - lexicon_table.add(word) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/remove_oovs.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/remove_oovs.pl deleted file mode 100644 index ac914c3bd9363eded791cdeb309fd05e980c4f2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/remove_oovs.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script removes lines that contain these OOVs on either the -# third or fourth fields of the line. It is intended to remove arcs -# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). - -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; -} - -$unklist = shift @ARGV; -open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; -while(){ - @A = split(" ", $_); - @A == 1 || die "Bad line in unknown-symbol list: $_"; - $unk{$A[0]} = 1; -} - -$num_removed = 0; -while(<>){ - @A = split(" ", $_); - if(defined $unk{$A[2]} || defined $unk{$A[3]}) { - $num_removed++; - } else { - print; - } -} -print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/rnnt_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/rnnt_token_fst.py deleted file mode 100644 index cc6def1703311ab700a4a01f22c1adda32db9b0d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/rnnt_token_fst.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, 0, phone, phone)) -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/s2eps.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/s2eps.pl deleted file mode 100644 index ffeeb8eb6af3c4f319f31ebff80be388d8f59e1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/fst/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not 
use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces and with (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } - if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } - } - print join("\t", @A) . "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/git-pre-commit b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/git-pre-commit deleted file mode 100644 index b6e448ed375a0ddf502ce332685de8a99e88dc08..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/git-pre-commit +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -e - -echo "Running pre-commit flake8" -python tools/flake8_hook.py diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/install_srilm.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/install_srilm.sh deleted file mode 100644 index 4aa113c14722a73fd3d3f84430025d44173c207b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/install_srilm.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2022 Binbin Zhang(binbzha@qq.com) - -current_path=`pwd` -current_dir=`basename "$current_path"` - -if [ "tools" != "$current_dir" ]; then - echo "You should run this script in tools/ directory!!" - exit 1 -fi - -! command -v gawk > /dev/null && \ - echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; - -srilm_url="https://github.com/BitSpeech/SRILM/archive/refs/tags/1.7.3.tar.gz" - -if [ ! -f ./srilm.tar.gz ]; then - if ! wget -O ./srilm.tar.gz "$srilm_url"; then - echo 'There was a problem downloading the file.' - echo 'Check you internet connection and try again.' - exit 1 - fi -fi - -tar -zxvf srilm.tar.gz -mv SRILM-1.7.3 srilm - -# set the SRILM variable in the top-level Makefile to this directory. -cd srilm -cp Makefile tmpf - -cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \ - > Makefile || exit 1 -rm tmpf - -make || exit -cd .. - -( - [ ! -z "${SRILM}" ] && \ - echo >&2 "SRILM variable is aleady defined. Undefining..." && \ - unset SRILM - - [ -f ./env.sh ] && . ./env.sh - - [ ! 
-z "${SRILM}" ] && \ - echo >&2 "SRILM config is already in env.sh" && exit - - wd=`pwd` - wd=`readlink -f $wd || pwd` - - echo "export SRILM=$wd/srilm" - dirs="\${PATH}" - for directory in $(cd srilm && find bin -type d ) ; do - dirs="$dirs:\${SRILM}/$directory" - done - echo "export PATH=$dirs" -) >> env.sh - -echo >&2 "Installation of SRILM finished successfully" -echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/k2/make_hlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/k2/make_hlg.sh deleted file mode 100644 index 18c2268487410824ae11b199cf06f37acd717c88..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/k2/make_hlg.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) - -lexion_dir=$1 -lm_dir=$2 -tgt_dir=$3 - -# k2 and icefall updates very fast. Below commits are veryfied in this script. -# k2 3dc222f981b9fdbc8061b3782c3b385514a2d444, icefall 499ac24ecba64f687ff244c7d66baa5c222ecf0f - -# For k2 installation, please refer to https://github.com/k2-fsa/k2/ -python -c "import k2; print(k2.__file__)" -python -c "import torch; import _k2; print(_k2.__file__)" - -# Prepare necessary icefall scripts -if [ ! -d tools/k2/icefall ]; then - git clone --depth 1 https://github.com/k2-fsa/icefall.git tools/k2/icefall -fi -pip3 install -r tools/k2/icefall/requirements.txt -export PYTHONPATH=`pwd`/tools/k2/icefall:`pwd`/tools/k2/icefall/egs/aishell/ASR/local:$PYTHONPATH - -# 8.1 Prepare char based lang -mkdir -p $tgt_dir -python tools/k2/prepare_char.py $lexion_dir/units.txt $lm_dir/wordlist $tgt_dir -echo "Compile lexicon L.pt L_disambig.pt succeeded" - -# 8.2 Prepare G -mkdir -p data/lm -python -m kaldilm \ - --read-symbol-table="$tgt_dir/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $lm_dir/lm.arpa > data/lm/G_3_gram.fst.txt - -# 8.3 Compile HLG -python tools/k2/icefall/egs/aishell/ASR/local/compile_hlg.py --lang-dir $tgt_dir -echo "Compile decoding graph HLG.pt succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/k2/prepare_char.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/k2/prepare_char.py deleted file mode 100644 index 6e05042c42eb280135f6be7cdb3566b185258b90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/k2/prepare_char.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -""" - -This script generates the following files in the directory sys.argv[3]: - - - lexicon.txt - - lexicon_disambig.txt - - L.pt - - L_disambig.pt - - tokens.txt - - words.txt -""" - -import sys -from pathlib import Path -from typing import Dict, List - -import k2 -import torch -from prepare_lang import ( - Lexicon, - add_disambig_symbols, - add_self_loops, - write_lexicon, - write_mapping, -) - - -def lexicon_to_fst_no_sil( - lexicon: Lexicon, - token2id: Dict[str, int], - word2id: Dict[str, int], - need_self_loops: bool = False, -) -> k2.Fsa: - """Convert a lexicon to an FST (in k2 format). - - Args: - lexicon: - The input lexicon. See also :func:`read_lexicon` - token2id: - A dict mapping tokens to IDs. - word2id: - A dict mapping words to IDs. - need_self_loops: - If True, add self-loop to states with non-epsilon output symbols - on at least one arc out of the state. The input label for this - self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. - Returns: - Return an instance of `k2.Fsa` representing the given lexicon. - """ - loop_state = 0 # words enter and leave from here - next_state = 1 # the next un-allocated state, will be incremented as we go - - arcs = [] - - # The blank symbol is defined in local/train_bpe_model.py - assert token2id[""] == 0 - assert word2id[""] == 0 - - eps = 0 - - for word, pieces in lexicon: - assert len(pieces) > 0, f"{word} has no pronunciations" - cur_state = loop_state - - word = word2id[word] - pieces = [ - token2id[i] if i in token2id else token2id[""] for i in pieces - ] - - for i in range(len(pieces) - 1): - w = word if i == 0 else eps - arcs.append([cur_state, next_state, pieces[i], w, 0]) - - cur_state = next_state - next_state += 1 - - # now for the last piece of this word - i = len(pieces) - 1 - w = word if i == 0 else eps - arcs.append([cur_state, loop_state, pieces[i], w, 0]) - - if need_self_loops: - disambig_token = token2id["#0"] - disambig_word = word2id["#0"] - arcs = add_self_loops( - arcs, - disambig_token=disambig_token, - disambig_word=disambig_word, - ) - - final_state = next_state - arcs.append([loop_state, final_state, -1, -1, 0]) - arcs.append([final_state]) - - arcs = sorted(arcs, key=lambda arc: arc[0]) - arcs = [[str(i) for i in arc] for arc in arcs] - arcs = [" ".join(arc) for arc in arcs] - arcs = "\n".join(arcs) - - fsa = k2.Fsa.from_str(arcs, acceptor=False) - return fsa - - -def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool: - """Check if all the given tokens are in token symbol table. - - Args: - token_sym_table: - Token symbol table that contains all the valid tokens. - tokens: - A list of tokens. - Returns: - Return True if there is any token not in the token_sym_table, - otherwise False. - """ - for tok in tokens: - if tok not in token_sym_table: - return True - return False - - -def generate_lexicon( - token_sym_table: Dict[str, int], words: List[str] -) -> Lexicon: - """Generate a lexicon from a word list and token_sym_table. - - Args: - token_sym_table: - Token symbol table that mapping token to token ids. - words: - A list of strings representing words. - Returns: - Return a dict whose keys are words and values are the corresponding - tokens. 
- """ - lexicon = [] - for word in words: - chars = list(word.strip(" \t")) - if contain_oov(token_sym_table, chars): - continue - lexicon.append((word, chars)) - - # The OOV word is - lexicon.append(("", [""])) - return lexicon - - -def generate_tokens(text_file: str) -> Dict[str, int]: - """Generate tokens from the given text file. - - Args: - text_file: - A file that contains text lines to generate tokens. - Returns: - Return a dict whose keys are tokens and values are token ids ranged - from 0 to len(keys) - 1. - """ - token2id: Dict[str, int] = dict() - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - char, index = line.replace('\n', '').split() - assert char not in token2id - token2id[char] = int(index) - assert token2id[''] == 0 - return token2id - - -def generate_words(text_file: str) -> Dict[str, int]: - """Generate words from the given text file. - - Args: - text_file: - A file that contains text lines to generate words. - Returns: - Return a dict whose keys are words and values are words ids ranged - from 0 to len(keys) - 1. - """ - words = [] - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - word = line.replace('\n', '') - assert word not in words - words.append(word) - words.sort() - - # We put '' '' at begining of word2id - # '#0', '', '' at end of word2id - words = [word for word in words - if word not in ['', '', '#0', '', '']] - words.insert(0, '') - words.insert(1, '') - words.append('#0') - words.append('') - words.append('') - word2id = {j: i for i, j in enumerate(words)} - return word2id - - -def main(): - token2id = generate_tokens(sys.argv[1]) - word2id = generate_words(sys.argv[2]) - tgt_dir = Path(sys.argv[3]) - - words = [word for word in word2id.keys() - if word not in - ["", "!SIL", "", "", "#0", "", ""]] - lexicon = generate_lexicon(token2id, words) - - lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) - next_token_id = max(token2id.values()) + 1 - for i in range(max_disambig + 1): - disambig = f"#{i}" - assert disambig not in token2id - token2id[disambig] = next_token_id - next_token_id += 1 - - write_mapping(tgt_dir / "tokens.txt", token2id) - write_mapping(tgt_dir / "words.txt", word2id) - write_lexicon(tgt_dir / "lexicon.txt", lexicon) - write_lexicon(tgt_dir / "lexicon_disambig.txt", lexicon_disambig) - - L = lexicon_to_fst_no_sil( - lexicon, - token2id=token2id, - word2id=word2id, - ) - L_disambig = lexicon_to_fst_no_sil( - lexicon_disambig, - token2id=token2id, - word2id=word2id, - need_self_loops=True, - ) - torch.save(L.as_dict(), tgt_dir / "L.pt") - torch.save(L_disambig.as_dict(), tgt_dir / "L_disambig.pt") - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/latency_metrics.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/latency_metrics.py deleted file mode 100644 index df2d8eee45f8e2d7c8536f208d44fafaeac3341f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/latency_metrics.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2022 Horizon Inc. (author: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import argparse -import logging -import librosa -import torch -import torchaudio -import yaml - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.font_manager as fm -import torchaudio.compliance.kaldi as kaldi - -from wenet.utils.init_model import init_model -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.mask import make_pad_mask -from wenet.utils.common import replace_duplicates_with_blank - - -def get_args(): - parser = argparse.ArgumentParser( - description='Analyze latency and plot CTC-Spike.') - parser.add_argument('--config', required=True, - type=str, help='configration') - parser.add_argument('--gpu', - type=int, - default=0, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--ckpt', required=True, - type=str, help='model checkpoint') - parser.add_argument('--tag', required=True, - type=str, help='image subtitle') - parser.add_argument('--wavscp', required=True, - type=str, help='wav.scp') - parser.add_argument('--alignment', required=True, - type=str, help='force alignment, generated by Kaldi.') - parser.add_argument('--chunk_size', required=True, - type=int, help='chunk size') - parser.add_argument('--left_chunks', default=-1, - type=int, help='left chunks') - parser.add_argument('--font', required=True, - type=str, help='font file') - parser.add_argument('--dict', required=True, - type=str, help='dict file') - parser.add_argument('--result_dir', required=True, - type=str, help='saving pdf') - parser.add_argument('--model_type', default='ctc', - choices=['ctc', 'transducer'], - help='show latency metrics from ctc models or rnn-t models') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - torch.manual_seed(777) - - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - symbol_table = read_symbol_table(args.dict) - char_dict = {v: k for k, v in symbol_table.items()} - - # 1. Load model - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - - model = init_model(conf) - load_checkpoint(model, args.ckpt) - model = model.eval().to(device) - - subsampling = model.encoder.embed.subsampling_rate - eos = model.eos_symbol() - - with open(args.wavscp, 'r') as fin: - wavs = fin.readlines() - - # 2. 
Forward model (get streaming_timestamps) - timestamps = {} - for idx, wav in enumerate(wavs): - if idx % 100 == 0: - logging.info("processed {}.".format(idx)) - key, wav = wav.strip().split(' ', 1) - waveform, sr = torchaudio.load(wav) - resample_rate = conf['dataset_conf']['resample_conf']['resample_rate'] - waveform = torchaudio.transforms.Resample( - orig_freq=sr, new_freq=resample_rate)(waveform) - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank( - waveform, - num_mel_bins=conf['dataset_conf']['fbank_conf']['num_mel_bins'], - frame_length=conf['dataset_conf']['fbank_conf']['frame_length'], - frame_shift=conf['dataset_conf']['fbank_conf']['frame_shift'], - dither=0.0, energy_floor=0.0, - sample_frequency=resample_rate, - ) - - speech = mat.unsqueeze(0).to(device) - speech_lengths = torch.tensor([mat.size(0)]).to(device) - - # Let's assume batch_size = 1 - encoder_out, encoder_mask = model.encoder( - speech, speech_lengths, args.chunk_size, args.left_chunks) - - maxlen = encoder_out.size(1) # (B, maxlen, encoder_dim) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # CTC greedy search - if args.model_type == 'ctc': - ctc_probs = model.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - topk_prob = topk_prob.view(1, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen) - topk_prob = topk_prob.masked_fill_(mask, 0.0) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - hyps = [replace_duplicates_with_blank(hyp) for hyp in hyps] - scores = [prob.tolist() for prob in topk_prob] - timestamps[key] = [hyps[0], scores[0], wav] - - if args.model_type == 'transducer': - hyps = [] - scores = [] - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = 1 - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, - padding, cache) - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - scores.append(torch.max(joint_out_probs).item()) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or \ - per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - hyps.append(model.blank) - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - timestamps[key] = [hyps, scores, wav] - - # 3. 
Analyze latency - with open(args.alignment, 'r') as fin: - aligns = fin.readlines() - not_found, len_unequal, ignored = 0, 0, 0 - datas = [] - for align in aligns: - key, align = align.strip().split(' ', 1) - if key not in timestamps: - not_found += 1 - continue - fa, st = [], [] # force_alignment, streaming_timestamps - text_fa, text_st = "", "" - for i, token in enumerate(align.split()): - if token != '': - text_fa += token - # NOTE(xcsong): W/O subsample - fa.append(i * 10) - # ignore alignment_errors >= 70ms - frames_fa = len(align.split()) - frames_st = len(timestamps[key][0]) * subsampling - if abs(frames_st - frames_fa) >= 7: - ignored += 1 - continue - for i, token_id in enumerate(timestamps[key][0]): - if token_id != 0: - text_st += char_dict[token_id] - # NOTE(xcsong): W subsample - st.append(i * subsampling * 10) - if len(fa) != len(st): - len_unequal += 1 - continue - # datas[i] = [key, text_fa, text_st, list_of_diff, - # FirstTokenDelay, LastTokenDelay, AvgTokenDelay, - # streaming_timestamps, force_alignment] - datas.append([key, text_fa, text_st, - [a - b for a, b in zip(st, fa)], - st[0] - fa[0], st[-1] - fa[-1], - (sum(st) - sum(fa)) / len(st), - timestamps[key], align.split()]) - - logging.info("not found: {}, length unequal: {}, ignored: {}, \ - valid samples: {}".format(not_found, len_unequal, ignored, len(datas))) - - # 4. Plot and print - num_datas = len(datas) - names = ['FirstTokenDelay', 'LastTokenDelay', 'AvgTokenDelay'] - names_index = [4, 5, 6] - parts = ['max', 'P90', 'P75', 'P50', 'P25', 'min'] - parts_index = [num_datas - 1, int(num_datas * 0.90), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - for name, name_idx in zip(names, names_index): - def f(name_idx=name_idx): - return name_idx - datas.sort(key=lambda x: x[f()]) - logging.info("==========================") - for p, i in zip(parts, parts_index): - data = datas[i] - # i.e., LastTokenDelay P90: 270.000 ms (wav_id: BAC009S0902W0144) - logging.info("{} {}: {:.3f} ms (wav_id: {})".format( - name, p, data[f()], datas[i][0])) - - font = fm.FontProperties(fname=args.font) - plt.rcParams['axes.unicode_minus'] = False - # we will have 2 sub-plots (force-align + streaming timestamps) - # plus one wav-plot - fig, axes = plt.subplots(figsize=(60, 60), nrows=3, ncols=1) - for j in range(2): - if j == 0: - # subplot-0: streaming_timestamps - plt_prefix = args.tag + "_" + name + "_" + p - x = np.arange(len(data[7][0])) * subsampling - hyps, scores = data[7][0], data[7][1] - else: - # subplot-1: force_alignments - plt_prefix = "force_alignment" - x = np.arange(len(data[8])) - hyps = [symbol_table[d] for d in data[8]] - scores = [0.0] * len(data[8]) - axes[j].set_title(plt_prefix, fontsize=30) - for frame, token, prob in zip(x, hyps, scores): - if char_dict[token] != '': - axes[j].bar( - frame, np.exp(prob), - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].text( - frame, np.exp(prob), - '{} {:.3f} {}'.format( - char_dict[token], np.exp(prob), frame), - fontdict=dict(fontsize=24), - fontproperties=font, - ) - else: - axes[j].bar( - frame, 0.01, - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].tick_params(labelsize=25) - - # subplot-2: wav - # wav, hardcode sample_rate to 16000 - samples, sr = librosa.load(data[7][2], sr=16000) - time = np.arange(0, len(samples)) * (1.0 / sr) - axes[-1].plot(time, samples) - - # i.e., RESULT_DIR/LTD_P90_120ms_BAC009S0768W0342.pdf - plt.savefig(args.result_dir + "/" + name + "_" + - p + "_" + str(data[f()]) 
+ "ms" + "_" + data[0] + ".pdf") - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/make_raw_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/make_raw_list.py deleted file mode 100644 index 2f84f015542bb38da027b8ea61e8638f873cec33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/make_raw_list.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('output_file', help='output list file') - args = parser.parse_args() - - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - if args.segments is not None: - segments_table = {} - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - with open(args.text_file, 'r', encoding='utf8') as fin, \ - open(args.output_file, 'w', encoding='utf8') as fout: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if args.segments is None: - assert key in wav_table - wav = wav_table[key] - line = dict(key=key, wav=wav, txt=txt) - else: - assert key in segments_table - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - line = dict(key=key, wav=wav, txt=txt, start=start, end=end) - json_line = json.dumps(line, ensure_ascii=False) - fout.write(json_line + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/make_shard_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/make_shard_list.py deleted file mode 100644 index 1f7d82829808c9cc181bbc5e0f60cccef8795bae..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/make_shard_list.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import io -import logging -import os -import tarfile -import time -import multiprocessing - -import torch -import torchaudio -import torchaudio.backend.sox_io_backend as sox - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def write_tar_file(data_list, - no_segments, - tar_file, - resample=16000, - index=0, - total=1): - logging.info('Processing {} {}/{}'.format(tar_file, index, total)) - read_time = 0.0 - save_time = 0.0 - write_time = 0.0 - with tarfile.open(tar_file, "w") as tar: - prev_wav = None - for item in data_list: - if no_segments: - key, txt, wav = item - else: - key, txt, wav, start, end = item - - suffix = wav.split('.')[-1] - assert suffix in AUDIO_FORMAT_SETS - if no_segments: - ts = time.time() - with open(wav, 'rb') as fin: - data = fin.read() - read_time += (time.time() - ts) - else: - if wav != prev_wav: - ts = time.time() - waveforms, sample_rate = sox.load(wav, normalize=False) - read_time += (time.time() - ts) - prev_wav = wav - start = int(start * sample_rate) - end = int(end * sample_rate) - audio = waveforms[:1, start:end] - - # resample - if sample_rate != resample: - if not audio.is_floating_point(): - # normalize the audio before resample - # because resample can't process int audio - audio = audio / (1 << 15) - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - audio = (audio * (1 << 15)).short() - else: - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - - ts = time.time() - f = io.BytesIO() - sox.save(f, audio, resample, format="wav", bits_per_sample=16) - # Save to wav for segments file - suffix = "wav" - f.seek(0) - data = f.read() - save_time += (time.time() - ts) - - assert isinstance(txt, str) - ts = time.time() - txt_file = key + '.txt' - txt = txt.encode('utf8') - txt_data = io.BytesIO(txt) - txt_info = tarfile.TarInfo(txt_file) - txt_info.size = len(txt) - tar.addfile(txt_info, txt_data) - - wav_file = key + '.' 
+ suffix - wav_data = io.BytesIO(data) - wav_info = tarfile.TarInfo(wav_file) - wav_info.size = len(data) - tar.addfile(wav_info, wav_data) - write_time += (time.time() - ts) - logging.info('read {} save {} write {}'.format(read_time, save_time, - write_time)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--num_utts_per_shard', - type=int, - default=1000, - help='num utts per shard') - parser.add_argument('--num_threads', - type=int, - default=1, - help='num threads for make shards') - parser.add_argument('--prefix', - default='shards', - help='prefix of shards tar file') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('--resample', - type=int, - default=16000, - help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('shards_dir', help='output shards dir') - parser.add_argument('shards_list', help='output shards list file') - args = parser.parse_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - torch.set_num_threads(1) - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - no_segments = True - segments_table = {} - if args.segments is not None: - no_segments = False - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - data = [] - with open(args.text_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if no_segments: - assert key in wav_table - wav = wav_table[key] - data.append((key, txt, wav)) - else: - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - data.append((key, txt, wav, start, end)) - - num = args.num_utts_per_shard - chunks = [data[i:i + num] for i in range(0, len(data), num)] - os.makedirs(args.shards_dir, exist_ok=True) - - # Using thread pool to speedup - pool = multiprocessing.Pool(processes=args.num_threads) - shards_list = [] - tasks_list = [] - num_chunks = len(chunks) - for i, chunk in enumerate(chunks): - tar_file = os.path.join(args.shards_dir, - '{}_{:09d}.tar'.format(args.prefix, i)) - shards_list.append(tar_file) - pool.apply_async( - write_tar_file, - (chunk, no_segments, tar_file, args.resample, i, num_chunks)) - - pool.close() - pool.join() - - with open(args.shards_list, 'w', encoding='utf8') as fout: - for name in shards_list: - fout.write(name + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/merge_scp2txt.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/merge_scp2txt.py deleted file mode 100644 index 51f1c42f272f0fd9fec0a7d69ee860d2f1eb6158..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/merge_scp2txt.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -from distutils.util import strtobool -from io import open -import logging -import sys - -PY2 = sys.version_info[0] == 2 -sys.stdin = codecs.getreader('utf-8')(sys.stdin if 
PY2 else sys.stdin.buffer) -sys.stdout = codecs.getwriter('utf-8')( - sys.stdout if PY2 else sys.stdout.buffer) - - -# Special types: -def shape(x): - """Change str to List[int] - - >>> shape('3,5') - [3, 5] - >>> shape(' [3, 5] ') - [3, 5] - - """ - - # x: ' [3, 5] ' -> '3, 5' - x = x.strip() - if x[0] == '[': - x = x[1:] - if x[-1] == ']': - x = x[:-1] - - return list(map(int, x.split(','))) - - -def get_parser(): - parser = argparse.ArgumentParser( - description='Given each file paths with such format as ' - '::. type> can be omitted and the default ' - 'is "str". e.g. {} ' - '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' - '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' - '--output-scps text:data/text shape:data/utt2text_shape:shape ' - '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--input-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the inputs') - parser.add_argument('--output-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the outputs') - parser.add_argument('--scps', - type=str, - nargs='+', - default=[], - help='The files except for the input and outputs') - parser.add_argument('--verbose', - '-V', - default=1, - type=int, - help='Verbose option') - parser.add_argument('--allow-one-column', - type=strtobool, - default=False, - help='Allow one column in input scp files. ' - 'In this case, the value will be empty string.') - parser.add_argument('--out', - '-O', - type=str, - help='The output filename. ' - 'If omitted, then output to sys.stdout') - return parser - - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - args.scps = [args.scps] - - # logging info - logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - if args.verbose > 0: - logging.basicConfig(level=logging.INFO, format=logfmt) - else: - logging.basicConfig(level=logging.WARN, format=logfmt) - - inputs = {} - assert (len(args.input_scps) == 1) - for f in args.input_scps[0]: - arr = f.strip().split(':') - inputs[arr[0]] = arr[1] - assert ('feat' in inputs) - assert ('shape' in inputs) - - outputs = {} - assert (len(args.output_scps) == 1) - for f in args.output_scps[0]: - arr = f.strip().split(':') - outputs[arr[0]] = arr[1] - assert ('shape' in outputs) - assert ('text' in outputs) - assert ('token' in outputs) - assert ('tokenid' in outputs) - - files = [ - inputs['feat'], inputs['shape'], outputs['text'], outputs['token'], - outputs['tokenid'], outputs['shape'] - ] - fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape'] - fids = [open(f, 'r', encoding='utf-8') for f in files] - - if args.out is None: - out = sys.stdout - else: - out = open(args.out, 'w', encoding='utf-8') - done = False - while not done: - for i, fid in enumerate(fids): - line = fid.readline() - if line == '': - done = True - break - arr = line.strip().split() - content = ' '.join(arr[1:]) - if i == 0: - out.write('utt:{}'.format(arr[0])) - out.write('\t') - out.write('{}:{}'.format(fields[i], content)) - out.write('\n') - - for f in fids: - f.close() - if args.out is not None: - out.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/onnx2horizonbin.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/onnx2horizonbin.py deleted file mode 100644 index 
a94b647fb19d1446d4bc506c399c85677dddde9f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/onnx2horizonbin.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. specific decoding method: ctc_greedy_search -""" - -import argparse -import copy -import logging -import os -import sys -import random -import torch -import yaml -import numpy as np - -from torch.utils.data import DataLoader - -from wenet.utils.common import remove_duplicates_and_blank -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import to_numpy -from wenet.bin.export_onnx_bpu import export_encoder, export_ctc - - -try: - import hbdk # noqa: F401 - import horizon_nn # noqa: F401 - from horizon_tc_ui import HB_ONNXRuntime -except ImportError: - print('Please install hbdk,horizon_nn,horizon_tc_ui !') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def save_data(tensor, dirs, prefix): - if tensor.requires_grad: - data = tensor.detach().numpy().astype(np.float32) - else: - data = tensor.numpy().astype(np.float32) - os.makedirs(dirs, exist_ok=True) - data.tofile(dirs + "/" + prefix + ".bin") - - -def make_calibration_data(enc, args, conf): - conf['shuffle'] = True - logger.info(conf) - dataset = Dataset( - "shard", args.cali_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - cal_data_dir = os.path.join(args.output_dir, 
'cal_data_dir') - for batch_idx, batch in enumerate(dataloader): - if batch_idx >= args.max_samples: - break - if batch_idx % 100 == 0: - logger.info("processed {} samples.".format(batch_idx)) - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - - # Feed forward overlap input step by step - random_high = (num_frames - context) // stride - num_rand = random.randint(0, random_high) - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - if i == num_rand: - save_data(chunk, "{}/chunk".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_cache, "{}/att_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(cnn_cache, "{}/cnn_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_mask, "{}/att_mask".format(cal_data_dir), - prefix + "." + str(i)) - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - # NOTE(xcsong): It's fast to calibrate ctc.onnx, - # so it's okay to save all chunks - save_data(y, "{}/hidden".format(cal_data_dir), - prefix + "." 
+ str(i)) - - -def check_wer(enc, ctc, args, conf): - conf['shuffle'] = False - dataset = Dataset( - "shard", args.wer_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - char_dict = {v: k for k, v in args.symbol_table.items()} - eos = len(char_dict) - 1 - - enc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_encoder/encoder_quantized_model.onnx") - ctc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_ctc/ctc_quantized_model.onnx") - torch_file = open(args.output_dir + "/torch_text", 'w', encoding="utf-8") - onnx_file = open(args.output_dir + "/onnx_text", 'w', encoding="utf-8") - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - for batch_idx, batch in enumerate(dataloader): - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - - # Feed forward overlap input step by step - torch_out, onnx_out = [], [] - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - # Torch model - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - torch_out.append(ctc.forward(y).transpose(1, 3).squeeze(2)) - # Quantized onnx model - ort_inputs = { - 'chunk': to_numpy(chunk), 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': to_numpy(att_mask)} - ort_outs = enc_session.run_feature( - enc_session.output_names, ort_inputs, input_offset=0) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_y = ctc_session.run_feature( - ctc_session.output_names, {'hidden': ort_outs[0]}, input_offset=0) - onnx_out.append(torch.from_numpy( - np.squeeze(onnx_y[0].transpose(0, 3, 2, 1), axis=2))) - - def post_process(list_out, file_obj, keys): - probs = torch.cat(list_out, dim=1) - maxlen = probs.size(1) - topk_prob, topk_index = probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - hyps = 
[hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - for i, key in enumerate(keys): - content = '' - for w in hyps[i]: - if w == eos: - break - content += char_dict[w] - file_obj.write('{} {}\n'.format(key, content)) - return key, content - - if len(torch_out) > 0 and len(onnx_out) > 0: - key, content = post_process(torch_out, torch_file, keys) - logger.info('torch: {} {}'.format(key, content)) - key, content = post_process(onnx_out, onnx_file, keys) - logger.info('onnx : {} {}'.format(key, content)) - torch_file.close() - onnx_file.close() - - -def generate_config(enc_session, ctc_session, args): - template = """ -# 模型参数组 -model_parameters: - # 原始Onnx浮点模型文件 - onnx_model: '{}' - # 转换的目标AI芯片架构 - march: 'bernoulli2' - # 模型转换输出的用于上板执行的模型文件的名称前缀 - output_model_file_prefix: '{}' - # 模型转换输出的结果的存放目录 - working_dir: '{}' - # 指定转换后混合异构模型是否保留输出各层的中间结果的能力 - layer_out_dump: False - # 转换过程中日志生成级别 - log_level: 'debug' -# 输入信息参数组 -input_parameters: - # 原始浮点模型的输入节点名称 - input_name: '{}' - # 原始浮点模型的输入数据格式(数量/顺序与input_name一致) - input_type_train: '{}' - # 原始浮点模型的输入数据排布(数量/顺序与input_name一致) - input_layout_train: '{}' - # 原始浮点模型的输入数据尺寸 - input_shape: '{}' - # 网络实际执行时,输入给网络的batch_size 默认值为1 - # input_batch: 1 - # 在模型中添加的输入数据预处理方法 - norm_type: '{}' - # 预处理方法的图像减去的均值; 如果是通道均值,value之间必须用空格分隔 - # mean_value: '' - # 预处理方法的图像缩放比例,如果是通道缩放比例,value之间必须用空格分隔 - # scale_value: '' - # 转换后混合异构模型需要适配的输入数据格式(数量/顺序与input_name一致) - input_type_rt: '{}' - # 输入数据格式的特殊制式 - input_space_and_range: '' - # 转换后混合异构模型需要适配的输入数据排布(数量/顺序与input_name一致) - input_layout_rt: '{}' -# 校准参数组 -calibration_parameters: - # 模型校准使用的标定样本的存放目录 - cal_data_dir: '{}' - # 开启图片校准样本自动处理(skimage read resize到输入节点尺寸) - preprocess_on: False - # 校准使用的算法类型 - calibration_type: '{}' - # max 校准方式的参数 - max_percentile: 1.0 - # 强制指定OP在CPU上运行 - run_on_cpu: '{}' - # 强制指定OP在BPU上运行 - run_on_bpu: '{}' -# 编译参数组 -compiler_parameters: - # 编译策略选择 - compile_mode: 'latency' - # 是否打开编译的debug信息 - debug: False - # 模型运行核心数 - core_num: 1 - # 模型编译的优化等级选择 - optimize_level: 'O3' -""" - output_dir = os.path.realpath(args.output_dir) - cal_data_dir = os.path.join(output_dir, 'cal_data_dir') - os.makedirs(cal_data_dir, exist_ok=True) - enc_dic = enc_session.get_modelmeta().custom_metadata_map - enc_onnx_path = os.path.join(output_dir, 'encoder.onnx') - enc_log_path = os.path.join(output_dir, 'hb_makertbin_output_encoder') - enc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in enc_dic['input_name'].split(';')]) - ctc_dic = ctc_session.get_modelmeta().custom_metadata_map - ctc_onnx_path = os.path.join(output_dir, 'ctc.onnx') - ctc_log_path = os.path.join(output_dir, 'hb_makertbin_output_ctc') - ctc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in ctc_dic['input_name'].split(';')]) - enc_config = template.format( - enc_onnx_path, "encoder", enc_log_path, - enc_dic['input_name'], enc_dic['input_type'], - enc_dic['input_layout_train'], enc_dic['input_shape'], - enc_dic['norm_type'], enc_dic['input_type'], enc_dic['input_layout_rt'], - enc_cal_data, args.calibration_type, args.extra_ops_run_on_cpu, "") - ctc_config = template.format( - ctc_onnx_path, "ctc", ctc_log_path, - ctc_dic['input_name'], ctc_dic['input_type'], - ctc_dic['input_layout_train'], ctc_dic['input_shape'], - ctc_dic['norm_type'], ctc_dic['input_type'], ctc_dic['input_layout_rt'], - ctc_cal_data, "default", "", "") - with open(output_dir + "/config_encoder.yaml", "w") as enc_yaml: - enc_yaml.write(enc_config) - with open(output_dir + 
"/config_ctc.yaml", "w") as ctc_yaml: - ctc_yaml.write(ctc_config) - - -def get_args(): - parser = argparse.ArgumentParser(description='convert onnx to horizon .bin') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - parser.add_argument('--dict', type=str, required=True, help='dict file') - parser.add_argument('--max_samples', type=int, required=True, - help='maximum samples') - parser.add_argument('--cali_datalist', type=str, default=None, - help='make calibration data') - parser.add_argument('--wer_datalist', type=str, default=None, - help='check wer') - parser.add_argument('--wer_text', type=str, default=None, - help='check wer') - parser.add_argument('--bpe_model', default=None, type=str, - help='bpe model for english part') - parser.add_argument('--ln_run_on_bpu', action='store_true', - help='layernorm running on bpu') - parser.add_argument('--extra_ops_run_on_cpu', type=str, default=None, - help='extra operations running on cpu.') - parser.add_argument('--calibration_type', type=str, default='default', - help='kl / max / default.') - return parser - - -if __name__ == '__main__': - random.seed(777) - parser = get_args() - args = parser.parse_args() - # NOTE(xcsong): X3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(conf) - load_checkpoint(model, args.checkpoint) - model.eval() - - symbol_table = read_symbol_table(args.dict) - args.symbol_table = symbol_table - args.feature_size = conf['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - logger.info("Stage-1: Export onnx") - enc, enc_session = export_encoder(model, args) - ctc, ctc_session = export_ctc(model, args) - - conf = copy.deepcopy(conf['dataset_conf']) - conf['filter_conf']['max_length'] = 102400 - conf['filter_conf']['min_length'] = 0 - conf['filter_conf']['token_max_length'] = 102400 - conf['filter_conf']['token_min_length'] = 0 - conf['filter_conf']['max_output_input_ratio'] = 102400 - conf['filter_conf']['min_output_input_ratio'] = 0 - conf['speed_perturb'] = False - conf['spec_aug'] = False - conf['spec_sub'] = False - conf['spec_trim'] = False - conf['shuffle'] = False - conf['sort'] = False - if 'fbank_conf' in conf: - conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in conf: - conf['mfcc_conf']['dither'] = 0.0 - conf['batch_conf']['batch_type'] = "static" - conf['batch_conf']['batch_size'] = 1 - - if args.cali_datalist is not None: - logger.info("Stage-2: Generate config") - # FIXME(xcsong): Remove hard code - logger.info("torch version: {}".format(torch.__version__)) - if int(torch.__version__[:4].split('.')[1]) >= 13: - args.extra_ops_run_on_cpu = "/Split;" + \ - "/encoders.0/self_attn/Split;/encoders.1/self_attn/Split;" + \ - 
"/encoders.2/self_attn/Split;/encoders.3/self_attn/Split;" + \ - "/encoders.4/self_attn/Split;/encoders.5/self_attn/Split;" + \ - "/encoders.6/self_attn/Split;/encoders.7/self_attn/Split;" + \ - "/encoders.8/self_attn/Split;/encoders.9/self_attn/Split;" + \ - "/encoders.10/self_attn/Split;/encoders.11/self_attn/Split;" + \ - "/encoders.0/self_attn/Mul;/encoders.1/self_attn/Mul;" + \ - "/encoders.2/self_attn/Mul;/encoders.3/self_attn/Mul;" + \ - "/encoders.4/self_attn/Mul;/encoders.5/self_attn/Mul;" + \ - "/encoders.6/self_attn/Mul;/encoders.7/self_attn/Mul;" + \ - "/encoders.8/self_attn/Mul;/encoders.9/self_attn/Mul;" + \ - "/encoders.10/self_attn/Mul;/encoders.11/self_attn/Mul;" - else: - args.extra_ops_run_on_cpu = "Split_17;Split_67;Split_209;" + \ - "Split_351;Split_493;Split_635;Split_777;Split_919;Split_1061;" + \ - "Split_1203;Split_1345;Split_1487;Split_1629;" + \ - "Mul_72;Mul_214;Mul_356;Mul_498;Mul_640;Mul_782;" + \ - "Mul_924;Mul_1066;Mul_1208;Mul_1350;Mul_1492;Mul_1634;" - generate_config(enc_session, ctc_session, args) - - logger.info("Stage-3: Make calibration data") - make_calibration_data(enc, args, conf) - - output_dir = os.path.realpath(args.output_dir) - logger.info("Stage-4: Make ctc.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_ctc".format(output_dir) + - " && cd hb_makertbin_log_ctc &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_ctc.yaml") - ) - logger.info("Stage-5: Make encoder.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_encoder ".format(output_dir) + - " && cd hb_makertbin_log_encoder &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_encoder.yaml") - ) - - if args.wer_datalist is not None: - logger.info("Stage-6: Check wer between torch model and quantized onnx") - assert args.wer_text is not None - check_wer(enc, ctc, args, conf) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/torch_text", - args.output_dir + "/torch_wer") - ) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/onnx_text", - args.output_dir + "/onnx_wer") - ) - os.system("tail {} {}".format( - args.output_dir + "/torch_wer", args.output_dir + "/onnx_wer")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/parse_options.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/parse_options.sh deleted file mode 100644 index 34476fdb37a4b14d5fe6e0edbebe97e760d2be5a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- - -# Parse command-line options. -# To be sourced by another script (as in ". parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/perturb_data_dir_speed.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/perturb_data_dir_speed.sh deleted file mode 100644 index 901a4882e6481ae269067b0fe7175dba62c4db9e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/perturb_data_dir_speed.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# 2020 @kamo-naoyuki -# This file was copied from Kaldi and -# I deleted parts related to wav duration -# because we shouldn't use kaldi's command here -# and we don't need the files actually. 
- -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 2014 Tom Ko -# 2018 Emotech LTD (author: Pawel Swietojanski) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# wav.scp -# spk2utt -# utt2spk -# text -# -# It generates the files which are used for perturbing the speed of the original data. - -export LC_ALL=C -set -euo pipefail - -if [[ $# != 3 ]]; then - echo "Usage: perturb_data_dir_speed.sh " - echo "e.g.:" - echo " $0 0.9 data/train_si284 data/train_si284p" - exit 1 -fi - -factor=$1 -srcdir=$2 -destdir=$3 -label="sp" -spk_prefix="${label}${factor}-" -utt_prefix="${label}${factor}-" - -#check is sox on the path - -! command -v sox &>/dev/null && echo "sox: command not found" && exit 1; - -if [[ ! -f ${srcdir}/utt2spk ]]; then - echo "$0: no such file ${srcdir}/utt2spk" - exit 1; -fi - -if [[ ${destdir} == "${srcdir}" ]]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -mkdir -p "${destdir}" - -<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map" -<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map" -<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map" -if [[ ! -f ${srcdir}/utt2uniq ]]; then - <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq" -else - <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq" -fi - - -<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \ - utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk - -utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt - -if [[ -f ${srcdir}/segments ]]; then - - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \ - utils/apply_map.pl -f 2 "${destdir}"/reco_map | \ - awk -v factor="${factor}" \ - '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \ - >"${destdir}"/segments - - utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - if [[ -f ${srcdir}/reco2file_and_channel ]]; then - utils/apply_map.pl -f 1 "${destdir}"/reco_map \ - <"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel - fi - -else # no segments->wav indexed by utterance. 
- if [[ -f ${srcdir}/wav.scp ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - fi -fi - -if [[ -f ${srcdir}/text ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text -fi -if [[ -f ${srcdir}/spk2gender ]]; then - utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender -fi -if [[ -f ${srcdir}/utt2lang ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang -fi - -rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null -echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}" - -utils/validate_data_dir.sh --no-feats --no-text "${destdir}" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/reduce_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/reduce_data_dir.sh deleted file mode 100644 index 16194dcc7309a646041181a698c53cd4f46e618b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/reduce_data_dir.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# koried, 10/29/2012 - -# Reduce a data set based on a list of turn-ids - -help_message="usage: $0 srcdir turnlist destdir" - -if [ $1 == "--help" ]; then - echo "${help_message}" - exit 0; -fi - -if [ $# != 3 ]; then - echo "${help_message}" - exit 1; -fi - -srcdir=$1 -reclist=$2 -destdir=$3 - -if [ ! -f ${srcdir}/utt2spk ]; then -echo "$0: no such file $srcdir/utt2spk" -exit 1; -fi - -function do_filtering { -# assumes the utt2spk and spk2utt files already exist. - [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp - [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text - [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames - [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender - [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp - if [ -f ${srcdir}/segments ]; then - utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments - awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. 
- [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/reco2file_and_channel ] && \ - utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel - - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm - rm ${destdir}/reco - fi - srcutts=$(wc -l < ${srcdir}/utt2spk) - destutts=$(wc -l < ${destdir}/utt2spk) - echo "Reduced #utt from $srcutts to $destutts" -} - -mkdir -p ${destdir} - -# filter the utt2spk based on the set of recordings -utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk - -utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt -do_filtering; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/remove_longshortdata.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/remove_longshortdata.py deleted file mode 100644 index 7e92f8a424d2d717acf6fc1db5503f79ba38a898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/remove_longshortdata.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='remove too long or too short data in format.data') - parser.add_argument('--data_file', - type=str, - help='input format data') - parser.add_argument('--output_data_file', - type=str, - help='output format data') - parser.add_argument('--min_input_len', type=float, - default=0, - help='minimum input seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--max_input_len', type=float, - default=20, - help='maximum output seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--min_output_len', type=float, - default=0, help='minimum input seq length, in modeling units') - parser.add_argument('--max_output_len', type=float, - default=500, - help='maximum output seq length, in modeling units') - parser.add_argument('--min_output_input_ratio', type=float, default=0.05, - help='minimum output seq length/output seq length ratio') - parser.add_argument('--max_output_input_ratio', type=float, default=10, - help='maximum output seq length/output seq length ratio') - args = parser.parse_args() - - data_file = args.data_file - output_data_file = args.output_data_file - min_input_len = args.min_input_len - max_input_len = args.max_input_len - min_output_len = args.min_output_len - max_output_len = args.max_output_len - min_output_input_ratio = args.min_output_input_ratio - max_output_input_ratio = args.max_output_input_ratio - - with open(data_file, 'r') as f, open(output_data_file, 'w') as fout: - for l in f: - l = l.strip() - if l: - items = l.strip().split('\t') - token_shape = items[6] - feature_shape = items[2] - feat_len = float(feature_shape.split(':')[1].split(',')[0]) - token_len = float(token_shape.split(':')[1].split(',')[0]) - condition = [feat_len > min_input_len, - feat_len < max_input_len, - token_len > min_output_len, - token_len < max_output_len, - token_len / feat_len > min_output_input_ratio, - token_len / feat_len < max_output_input_ratio, - ] - if all(condition): - fout.write('{}\n'.format(l)) - continue diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/segment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/segment.py deleted file mode 100644 index a1a7f93a05fbaf42ca09c26c0e5be6a7185f0d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/segment.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2021 Mobvoi Inc. (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='generate segmented wav.scp') - parser.add_argument('--segments', required=True, help='segments file') - parser.add_argument('--input', - required=True, - help='origin wav.scp that not segmented') - parser.add_argument('--output', - required=True, - help='output segmented wav.scp') - wav_dic = {} - args = parser.parse_args() - ori_wav = args.input - segment_file = args.segments - wav_scp = args.output - with open(ori_wav, 'r') as ori: - for l in ori: - item = l.strip().split() - wav_dic[item[0]] = item[1] - with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement: - for l in sgement: - item = l.strip().split() - if item[1] in wav_dic: - item[1] = wav_dic[item[1]] - f.write("{} {},{},{}\n".format(item[0], item[1], item[2], item[3])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/setup_anaconda.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/setup_anaconda.sh deleted file mode 100644 index f53ace9cc4c19994fc79d01e85d70f49d40d673f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/setup_anaconda.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env bash -# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet) -set -euo pipefail - -if [ -z "${PS1:-}" ]; then - PS1=__dummy__ -fi -CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - -if [ $# -gt 4 ]; then - echo "Usage: $0 [output] [conda-env-name] [python-version>]" - exit 1; -elif [ $# -eq 3 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="$3" -elif [ $# -eq 2 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="" -elif [ $# -eq 1 ]; then - output_dir="$1" - name="" - PYTHON_VERSION="" -elif [ $# -eq 0 ]; then - output_dir=venv - name="" - PYTHON_VERSION="" -fi - -if [ -e activate_python.sh ]; then - echo "Warning: activate_python.sh already exists. It will be overwritten" -fi - -if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then - if [ ! -e miniconda.sh ]; then - wget --tries=3 "${CONDA_URL}" -O miniconda.sh - fi - - bash miniconda.sh -b -p "${output_dir}" -fi - -# shellcheck disable=SC1090 -source "${output_dir}/etc/profile.d/conda.sh" -conda deactivate - -# If the env already exists, skip recreation -if [ -n "${name}" ] && ! 
conda activate ${name}; then - conda create -yn "${name}" -fi -conda activate ${name} - -if [ -n "${PYTHON_VERSION}" ]; then - conda install -y conda "python=${PYTHON_VERSION}" -else - conda install -y conda -fi - -conda install -y pip setuptools - -cat << EOF > activate_python.sh -#!/usr/bin/env bash -# THIS FILE IS GENERATED BY tools/setup_anaconda.sh -if [ -z "\${PS1:-}" ]; then - PS1=__dummy__ -fi -. $(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name} -EOF diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/sph2wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/sph2wav.sh deleted file mode 100644 index a8f0749e3be2ee69b5831da6699c303510ecbed4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/sph2wav.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -# convert sph scp to segmented wav scp -nj=1 -. tools/parse_options.sh || exit 1; - -inscp=$1 -segments=$2 -outscp=$3 -data=$(dirname ${inscp}) -if [ $# -eq 4 ]; then - logdir=$4 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe -[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; -sox=`which sox` -[ ! 
-x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2); - printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \ - sort > $data/wav_ori.scp || exit 1; - -tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp -sed -i 's/ /,/g' $data/wav_segments.scp -sed -i 's/#/ /g' $data/wav_segments.scp - -rm -f $logdir/wav_*.slice -rm -f $logdir/*.log -split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - mkdir -p ${data}/wavs/${name} - cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \ - -v logdir=$logdir -v name=$name '{ - during=$4-$3 - cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during; - system(cmd) - printf("%s %s/%s.wav\n", $1, data, $1); - }' | \ - sort > ${data}/wavs_${name}.scp || exit 1; -} & -done -wait -cat ${data}/wavs_*.scp > $outscp -rm ${data}/wavs_*.scp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/spk2utt_to_utt2spk.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/spk2utt_to_utt2spk.pl deleted file mode 100644 index 19fb89d501146e360912863d847d6eabb0194511..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/spm_decode b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/spm_decode deleted file mode 100644 index 882b4f966013d7708460f8d41696583ae59f8fa9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/spm_decode +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for decoding") - parser.add_argument("--input", default=None, help="input file to decode") - parser.add_argument("--input_format", choices=["piece", "id"], default="piece") - args = parser.parse_args() - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.input_format == "piece": - def decode(l): - return "".join(sp.DecodePieces(l)) - elif args.input_format == "id": - def decode(l): - return "".join(sp.DecodeIds(l)) - else: - raise NotImplementedError - - def tok2int(tok): - # remap reference-side (represented as <>) to 0 - return int(tok) if tok != "<>" else 0 - - if args.input is None: - h = sys.stdin - else: - h = open(args.input, "r", encoding="utf-8") - for line in h: - print(decode(line.split())) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/spm_encode b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/spm_encode deleted file mode 100644 index 4dd2e1004f9fe393c2d34b43bade881b84a31b1f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/spm_encode +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import contextlib -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for encoding") - parser.add_argument("--inputs", nargs="+", default=['-'], - help="input files to filter/encode") - parser.add_argument("--outputs", nargs="+", default=['-'], - help="path to save encoded outputs") - parser.add_argument("--output_format", choices=["piece", "id"], default="piece") - parser.add_argument("--min-len", type=int, metavar="N", - help="filter sentence pairs with fewer than N tokens") - parser.add_argument("--max-len", type=int, metavar="N", - help="filter sentence pairs with more than N tokens") - args = parser.parse_args() - - assert len(args.inputs) == len(args.outputs), \ - "number of input and output paths should match" - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.output_format == "piece": - def encode(l): - return sp.EncodeAsPieces(l) - elif args.output_format == "id": - def encode(l): - return list(map(str, sp.EncodeAsIds(l))) - else: - raise NotImplementedError - - if args.min_len is not None or args.max_len is not None: - def valid(line): - return ( - (args.min_len is None or len(line) >= args.min_len) and - (args.max_len is None or len(line) <= args.max_len) - ) - else: - def valid(lines): - return True - - with contextlib.ExitStack() as stack: - inputs = [ - stack.enter_context(open(input, "r", encoding="utf-8")) - if input != "-" else sys.stdin - for input in args.inputs - ] - outputs = [ - 
stack.enter_context(open(output, "w", encoding="utf-8")) - if output != "-" else sys.stdout - for output in args.outputs - ] - - stats = { - "num_empty": 0, - "num_filtered": 0, - } - - def encode_line(line): - line = line.strip() - if len(line) > 0: - line = encode(line) - if valid(line): - return line - else: - stats["num_filtered"] += 1 - else: - stats["num_empty"] += 1 - return None - - for i, lines in enumerate(zip(*inputs), start=1): - enc_lines = list(map(encode_line, lines)) - if not any(enc_line is None for enc_line in enc_lines): - for enc_line, output_h in zip(enc_lines, outputs): - print(" ".join(enc_line), file=output_h) - if i % 10000 == 0: - print("processed {} lines".format(i), file=sys.stderr) - - print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) - print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/spm_train b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/spm_train deleted file mode 100644 index 0b247aee0dc5fcaa7b6cf66d89602e896619c9bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/spm_train +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE -import sys - -import sentencepiece as spm - - -if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/subset_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/subset_data_dir.sh deleted file mode 100644 index c35bee62d8710facb8c42a9171ed3caf0171450f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/subset_data_dir.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2010-2011 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - - -# This script operates on a data directory, such as in data/train/. -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data -# for what these directories contain. - -# This script creates a subset of that data, consisting of some specified -# number of utterances. (The selected utterances are distributed evenly -# throughout the file, by the program ./subset_scp.pl). - -# There are six options, none compatible with any other. - -# If you give the --per-spk option, it will attempt to select the supplied -# number of utterances for each speaker (typically you would supply a much -# smaller number in this case). - -# If you give the --speakers option, it selects a subset of n randomly -# selected speakers. - -# If you give the --shortest option, it will give you the n shortest utterances. - -# If you give the --first option, it will just give you the n first utterances. - -# If you give the --last option, it will just give you the n last utterances. - -# If you give the --spk-list or --utt-list option, it reads the -# speakers/utterances to keep from /" (note, -# in this case there is no positional parameter; see usage message.) 
- - -shortest=false -perspk=false -speakers=false -first_opt= -spk_list= -utt_list= - -expect_args=3 -case $1 in - --first|--last) first_opt=$1; shift ;; - --per-spk) perspk=true; shift ;; - --shortest) shortest=true; shift ;; - --speakers) speakers=true; shift ;; - --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; - --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; - --*) echo "$0: invalid option '$1'"; exit 1 -esac - -if [ $# != $expect_args ]; then - echo "Usage:" - echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] " - echo " subset_data_dir.sh [--spk-list ] " - echo " subset_data_dir.sh [--utt-list ] " - echo "By default, randomly selects utterances from the data directory." - echo "With --speakers, randomly selects enough speakers that we have utterances" - echo "With --per-spk, selects utterances per speaker, if available." - echo "With --first, selects the first utterances" - echo "With --last, selects the last utterances" - echo "With --shortest, selects the shortest utterances." - echo "With --spk-list, reads the speakers to keep from " - echo "With --utt-list, reads the utterances to keep from " - exit 1; -fi - -srcdir=$1 -if [[ $spk_list || $utt_list ]]; then - numutt= - destdir=$2 -else - numutt=$2 - destdir=$3 -fi - -export LC_ALL=C - -if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" - exit 1 -fi - -if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then - echo "$0: cannot subset to more utterances than you originally had." - exit 1 -fi - -if $shortest && [ ! -f $srcdir/feats.scp ]; then - echo "$0: you selected --shortest but no feats.scp exist." - exit 1 -fi - -mkdir -p $destdir || exit 1 - -if [[ $spk_list ]]; then - tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; -elif [[ $utt_list ]]; then - tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; -elif $speakers; then - tools/shuffle_list.pl < $srcdir/spk2utt | - awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | - sort > $destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -elif $perspk; then - awk '{ n='$numutt'; printf("%s ",$1); - skip=1; while(n*(skip+1) <= NF-1) { skip++; } - for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); } - printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -else - if $shortest; then - # Select $numutt shortest utterances. - . ./path.sh - feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; - sort -n -k2 $destdir/tmp.len | - awk '{print $1}' | - head -$numutt >$destdir/tmp.uttlist - tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk - rm $destdir/tmp.uttlist $destdir/tmp.len - else - # Select $numutt random utterances. - tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; - fi - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt -fi - -# Perform filtering. utt2spk and spk2utt files already exist by this point. -# Filter by utterance. 
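The filtering block that follows invokes tools/filter_scp.pl repeatedly; in essence it keeps only the lines of an scp-style file whose first field (the utterance or recording id) appears in a given id list. A hedged Python sketch of that behaviour — the file names in the usage comment are illustrative:

```python
# Sketch of the filter_scp.pl behaviour used in the filtering block below:
# keep only scp lines whose first field appears in an id-list file.
import sys

def filter_scp(id_list_path, scp_path):
    with open(id_list_path, encoding="utf-8") as f:
        keep = {line.split(maxsplit=1)[0] for line in f if line.strip()}
    with open(scp_path, encoding="utf-8") as f:
        for line in f:
            fields = line.split(maxsplit=1)
            if fields and fields[0] in keep:
                sys.stdout.write(line)

if __name__ == "__main__":
    # e.g. python filter_scp.py dest/utt2spk src/feats.scp > dest/feats.scp
    filter_scp(sys.argv[1], sys.argv[2])
```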
-[ -f $srcdir/feats.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp -[ -f $srcdir/vad.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp -[ -f $srcdir/utt2lang ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang -[ -f $srcdir/utt2dur ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur -[ -f $srcdir/utt2num_frames ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames -[ -f $srcdir/utt2uniq ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq -[ -f $srcdir/wav.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp -[ -f $srcdir/utt2warp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp -[ -f $srcdir/text ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - -# Filter by speaker. -[ -f $srcdir/spk2warp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp -[ -f $srcdir/spk2gender ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender -[ -f $srcdir/cmvn.scp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - -# Filter by recording-id. -if [ -f $srcdir/segments ]; then - tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - # Recording-ids are in segments. - awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco - # The next line overrides the command above for wav.scp, which would be incorrect. - #[ -f $srcdir/wav.scp ] && - # tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp -else - # No segments; recording-ids are in wav.scp. - awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco -fi - -[ -f $srcdir/reco2file_and_channel ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel -[ -f $srcdir/reco2dur ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur - -# Filter the STM file for proper sclite scoring. -# Copy over the comments from STM file. -[ -f $srcdir/stm ] && - (grep "^;;" $srcdir/stm - tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm - -rm $destdir/reco - -# Copy frame_shift if present. -[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir - -srcutts=$(wc -l <$srcdir/utt2spk) -destutts=$(wc -l <$destdir/utt2spk) -echo "$0: reducing #utt from $srcutts to $destutts" -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/subset_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/subset_scp.pl deleted file mode 100644 index 11fddc09a0f4e5fad8e5d63cf65e7e5e627e4af6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/subset_scp.pl +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This program selects a subset of N elements in the scp. - -# By default, it selects them evenly from throughout the scp, in order to avoid -# selecting too many from the same speaker. It prints them on the standard -# output. -# With the option --first, it just selects the N first utterances. -# With the option --last, it just selects the N last utterances. - -# Last modified by JHU & HKUST @2013 - - -$quiet = 0; -$first = 0; -$last = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--quiet") { - shift; - $quiet = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--first") { - shift; - $first = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--last") { - shift; - $last = 1; -} - -if(@ARGV < 2 ) { - die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . - " --quiet causes it to not die if N < num lines in scp.\n" . - " --first and --last make it equivalent to head or tail.\n" . - "See also: filter_scp.pl\n"; -} - -$N = shift @ARGV; -if($N == 0) { - die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; -} -$inscp = shift @ARGV; -open(I, "<$inscp") || die "Opening input scp file $inscp"; - -@F = (); -while() { - push @F, $_; -} -$numlines = @F; -if($N > $numlines) { - if ($quiet) { - $N = $numlines; - } else { - die "You requested from subset_scp.pl more elements than available: $N > $numlines"; - } -} - -sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if ($num_needed > $diff) { - die "select_n: code error"; - } - if ($diff == 1 ) { - if ($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); - } -} - -if ( ! $first && ! $last) { - if ($N > 0) { - select_n(0, $numlines, $N); - } -} else { - if ($first) { # --first option: same as head. - for ($n = 0; $n < $N; $n++) { - print $F[$n]; - } - } else { # --last option: same as tail. - for ($n = @F - $N; $n < @F; $n++) { - print $F[$n]; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/sym2int.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/sym2int.pl deleted file mode 100644 index cec097b6bdaefb5c3452e31fa334f0a7530b9a72..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/sym2int.pl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; - -for($x = 0; $x < 2; $x++) { - if ($ARGV[0] eq "--map-oov") { - shift @ARGV; - $map_oov = shift @ARGV; - if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { - # disallow '-f', the empty string and anything ending in words.txt as the - # OOV symbol because these are likely command-line errors. - die "the --map-oov option requires an argument"; - } - } - if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } - } -} - -$symtab = shift @ARGV; -if (!defined $symtab) { - print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . - "options: [--map-oov ] [-f ]\n" . - "note: can look like 4-5, or 4-, or 5-, or 1.\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up - if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } - $map_oov = $sym2int{$map_oov}; -} - -$num_warning = 0; -$max_warning = 20; - -while (<>) { - @A = split(" ", $_); - @B = (); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ( (!defined $field_begin || $n >= $field_begin) - && (!defined $field_end || $n <= $field_end)) { - $i = $sym2int{$a}; - if (!defined ($i)) { - if (defined $map_oov) { - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $map_oov; - } else { - $pos = $n+1; - die "sym2int.pl: undefined symbol $a (in position $pos)\n"; - } - } - $a = $i; - } - push @B, $a; - } - print join(" ", @B); - print "\n"; -} -if ($num_warning > 0) { - print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; -} - -exit(0); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/text2token.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/text2token.py deleted file mode 100644 index 4f4dcc901d436650695f0b80e0cf99e1e99269ee..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/text2token.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan) -# Copyright 2021 Mobvoi Inc. All Rights Reserved. 
(Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -import re -import sys - -is_python2 = sys.version_info[0] == 2 - - -def exist_or_not(i, match_pos): - start_pos = None - end_pos = None - for pos in match_pos: - if pos[0] <= i < pos[1]: - start_pos = pos[0] - end_pos = pos[1] - break - - return start_pos, end_pos - -def seg_char(sent): - pattern = re.compile(r'([\u4e00-\u9fa5])') - chars = pattern.split(sent) - chars = [w for w in chars if len(w.strip()) > 0] - return chars - -def get_parser(): - parser = argparse.ArgumentParser( - description='convert raw text to tokenized text', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--nchar', - '-n', - default=1, - type=int, - help='number of characters to split, i.e., \ - aabb -> a a b b with -n 1 and aa bb with -n 2') - parser.add_argument('--skip-ncols', - '-s', - default=0, - type=int, - help='skip first n columns') - parser.add_argument('--space', - default='', - type=str, - help='space symbol') - parser.add_argument('--bpe-model', - '-m', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--non-lang-syms', - '-l', - default=None, - type=str, - help='list of non-linguistic symobles,' - ' e.g., etc.') - parser.add_argument('text', - type=str, - default=False, - nargs='?', - help='input text') - parser.add_argument('--trans_type', - '-t', - type=str, - default="char", - choices=["char", "phn", "cn_char_en_bpe"], - help="""Transcript type. char/phn. e.g., for TIMIT - FADG0_SI1279 - - If trans_type is char, read from - SI1279.WRD file -> "bricks are an alternative" - Else if trans_type is phn, - read from SI1279.PHN file -> - "sil b r ih sil k s aa r er n aa l - sil t er n ih sil t ih v sil" """) - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - rs = [] - if args.non_lang_syms is not None: - with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f: - nls = [x.rstrip() for x in f.readlines()] - rs = [re.compile(re.escape(x)) for x in nls] - - if args.bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(args.bpe_model) - - if args.text: - f = codecs.open(args.text, encoding="utf-8") - else: - f = codecs.getreader("utf-8")( - sys.stdin if is_python2 else sys.stdin.buffer) - - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer) - line = f.readline() - n = args.nchar - while line: - x = line.split() - print(' '.join(x[:args.skip_ncols]), end=" ") - a = ' '.join(x[args.skip_ncols:]) - - # get all matched positions - match_pos = [] - for r in rs: - i = 0 - while i >= 0: - m = r.search(a, i) - if m: - match_pos.append([m.start(), m.end()]) - i = m.end() - else: - break - - if len(match_pos) > 0: - chars = [] - i = 0 - while i < len(a): - start_pos, end_pos = exist_or_not(i, match_pos) - if start_pos is not None: - chars.append(a[start_pos:end_pos]) - i = end_pos - else: - chars.append(a[i]) - i += 1 - a = chars - - if (args.trans_type == "phn"): - a = a.split(" ") - elif args.trans_type == "cn_char_en_bpe": - b = seg_char(a) - a = [] - for j in b: - # we use "▁" to instead of blanks among english words - # warning: here is "▁", not "_" - for l in j.strip().split("▁"): - if not l.encode('UTF-8').isalpha(): - a.append(l) - else: - for k in sp.encode_as_pieces(l): - a.append(k) - else: - a = [a[j:j + n] for j in range(0, 
len(a), n)] - - a_flat = [] - for z in a: - a_flat.append("".join(z)) - - a_chars = [z.replace(' ', args.space) for z in a_flat] - if (args.trans_type == "phn"): - a_chars = [z.replace("sil", args.space) for z in a_chars] - print(' '.join(a_chars)) - line = f.readline() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/utt2spk_to_spk2utt.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/utt2spk_to_spk2utt.pl deleted file mode 100644 index 5086699ff85fdcb8667bb9ab054700c53e35fd0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. - -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/validate_data_dir.sh deleted file mode 100644 index f4b4cbe1410111555d56380078e3d55381e7155a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/validate_data_dir.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/bin/bash - -cmd="$@" - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." - echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! 
-d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -if [ -f $data/images.scp ]; then - cmd=${cmd/--no-wav/} # remove --no-wav if supplied - image/validate_data_dir.sh $cmd - exit $? -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1; - ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - tools/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." 
- exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! 
cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! 
cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/validate_dict_dir.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/validate_dict_dir.pl deleted file mode 100644 index 819fca7f03caff91f3f24f0b69876a0bfc0abbe9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/validate_dict_dir.pl +++ /dev/null @@ -1,531 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. -# Copyright 2012 Guoguo Chen -# 2015 Daniel Povey -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for 'dict' directories (e.g. data/local/dict) - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line($.) 
$raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n"; - if ($has_invalid_whitespaces) { - print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n"; - return 0; - } else { - print "--> text contains only allowed whitespaces\n"; - } - } else { - print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n"; - } - return 1; -} - - -if(@ARGV != 1) { - die "Usage: validate_dict_dir.pl \n" . - "e.g.: validate_dict_dir.pl data/local/dict\n"; -} - -$dict = shift @ARGV; -$dict =~ s:/$::; - -$exit = 0; -$success = 1; # this is re-set each time we read a file. - -sub set_to_fail { $exit = 1; $success = 0; } - -# Checking silence_phones.txt ------------------------------- -print "Checking $dict/silence_phones.txt ...\n"; -if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} -$idx = 1; -%silence = (); -$crlf = 1; - -print "--> reading $dict/silence_phones.txt\n"; -check_allowed_whitespace(\*S) || set_to_fail(); -while() { - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; - } - foreach(0 .. 
@col-1) { - my $p = $col[$_]; - if($silence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; - } else { - $silence{$p} = 1; - } - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. - if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(S); -$success == 0 || print "--> $dict/silence_phones.txt is OK\n"; -print "\n"; - -# Checking optional_silence.txt ------------------------------- -print "Checking $dict/optional_silence.txt ...\n"; -if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;} -if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;} -$idx = 1; -$success = 1; -$crlf = 1; -print "--> reading $dict/optional_silence.txt\n"; -check_allowed_whitespace(\*OS) or exit 1; -while() { - chomp; - my @col = split(" ", $_); - if ($idx > 1 or @col > 1) { - set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; - } elsif (!$silence{$col[0]}) { - set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - $idx ++; -} -close(OS); -$success == 0 || print "--> $dict/optional_silence.txt is OK\n"; -print "\n"; - -# Checking nonsilence_phones.txt ------------------------------- -print "Checking $dict/nonsilence_phones.txt ...\n"; -if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;} -$idx = 1; -%nonsilence = (); -$success = 1; -$crlf = 1; -print "--> reading $dict/nonsilence_phones.txt\n"; -check_allowed_whitespace(\*NS) or set_to_fail(); -while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($nonsilence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; - } else { - $nonsilence{$p} = 1; - } - # phones that start with the pound sign/hash may be mistaken for - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. 
- if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(NS); -$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n"; -print "\n"; - -# Checking disjoint ------------------------------- -sub intersect { - my ($a, $b) = @_; - @itset = (); - %itset = (); - foreach(keys %$a) { - if(exists $b->{$_} and !$itset{$_}) { - push(@itset, $_); - $itset{$_} = 1; - } - } - return @itset; -} - -print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n"; -@itset = intersect(\%silence, \%nonsilence); -if(@itset == 0) {print "--> disjoint property is OK.\n";} -else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";} -print "\n"; - - -sub check_lexicon { - my ($lex, $num_prob_cols, $num_skipped_cols) = @_; - print "Checking $lex\n"; - !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail(); - my %seen_line = {}; - $idx = 1; $success = 1; $crlf = 1; - print "--> reading $lex\n"; - check_allowed_whitespace(\*L) or set_to_fail(); - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $lex contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (defined $seen_line{$_}) { - print "--> ERROR: line '$_' of $lex is repeated\n"; - set_to_fail(); - } - $seen_line{$_} = 1; - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $lex does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - $word = shift @col; - if (!defined $word) { - print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail(); - } - if ($word eq "" || $word eq "" || $word eq "" || $word eq "#0") { - print "--> ERROR: lexicon.txt contains forbidden word $word\n"; - set_to_fail(); - } - for ($n = 0; $n < $num_prob_cols; $n++) { - $prob = shift @col; - if (!($prob > 0.0 && $prob <= 1.0)) { - print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n"; - set_to_fail(); - } - } - for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; } - if (@col == 0) { - print "--> ERROR: lexicon.txt contains word $word with empty "; - print "pronunciation.\n"; - set_to_fail(); - } - foreach (0 .. @col-1) { - if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt "; - print "(line $idx)\n"; - set_to_fail(); - } - } - $idx ++; - } - close(L); - $success == 0 || print "--> $lex is OK\n"; - print "\n"; -} - -if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); } -if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); } -if (-f "$dict/lexiconp_silprob.txt") { - # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also - # exist. 
- check_lexicon("$dict/lexiconp_silprob.txt", 2, 2); - if (-f "$dict/silprob.txt") { - !open(SP, "<$dict/silprob.txt") && - print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail(); - $crlf = 1; - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - chomp; my @col = split; - @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail(); - if ($col[0] eq "" || $col[0] eq "overall") { - if (!($col[1] > 0.0 && $col[1] <= 1.0)) { - set_to_fail(); - print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n"; - } - } elsif ($col[0] eq "_s" || $col[0] eq "_n") { - if ($col[1] <= 0.0) { - set_to_fail(); - print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n"; - } - } else { - print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n"; - set_to_fail(); - } - } - close(SP); - } else { - set_to_fail(); - print "--> ERROR: expecting $dict/silprob.txt to exist\n"; - } -} - -if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) { - print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n"; - set_to_fail(); -} - -sub check_lexicon_pair { - my ($lex1, $num_prob_cols1, $num_skipped_cols1, - $lex2, $num_prob_cols2, $num_skipped_cols2) = @_; - # We have checked individual lexicons already. - open(L1, "<$lex1"); open(L2, "<$lex2"); - print "Checking lexicon pair $lex1 and $lex2\n"; - my $line_num = 0; - while() { - $line_num++; - @A = split; - $line_B = ; - if (!defined $line_B) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); last; - } - @B = split(" ", $line_B); - # Check if the word matches. - if ($A[0] ne $B[0]) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - shift @A; shift @B; - for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; } - for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; } - # Check if the pronunciation matches - if (join(" ", @A) ne join(" ", @B)) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - } - $line_B = ; - if (defined $line_B && $exit == 0) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); - } - $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n"; -} - -# If more than one lexicon exist, we have to check if they correspond to each -# other. It could be that the user overwrote one and we need to regenerate the -# other, but we do not know which is which. -if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") { - check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0); -} -if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") { - check_lexicon_pair("$dict/lexiconp.txt", 1, 0, - "$dict/lexiconp_silprob.txt", 2, 2); -} - -# Checking extra_questions.txt ------------------------------- -%distinguished = (); # Keep track of all phone-pairs including nonsilence that - # are distinguished (split apart) by extra_questions.txt, - # as $distinguished{$p1,$p2} = 1. This will be used to - # make sure that we don't have pairs of phones on the same - # line in nonsilence_phones.txt that can never be - # distinguished from each other by questions. 
(If any two - # phones appear on the same line in nonsilence_phones.txt, - # they share a tree root, and since the automatic - # question-building treats all phones that appear on the - # same line of nonsilence_phones.txt as being in the same - # group, we can never distinguish them without resorting to - # questions in extra_questions.txt. -print "Checking $dict/extra_questions.txt ...\n"; -if (-s "$dict/extra_questions.txt") { - if (!open(EX, "<$dict/extra_questions.txt")) { - set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n"; - } - $idx = 1; - $success = 1; - $crlf = 1; - print "--> reading $dict/extra_questions.txt\n"; - check_allowed_whitespace(\*EX) or set_to_fail(); - while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n"; - } - foreach (0 .. @col-1) { - if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n"; - } - $idx ++; - } - %col_hash = (); - foreach $p (@col) { $col_hash{$p} = 1; } - foreach $p1 (@col) { - # Update %distinguished hash. - foreach $p2 (keys %nonsilence) { - if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not - # in this question (and in nonsilence - # phones)... mark p1,p2 as being split apart - $distinguished{$p1,$p2} = 1; - $distinguished{$p2,$p1} = 1; - } - } - } - } - close(EX); - $success == 0 || print "--> $dict/extra_questions.txt is OK\n"; -} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";} - -if (-f "$dict/nonterminals.txt") { - open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt"; - my %nonterminals = (); - my $line_number = 1; - while () { - chop; - my @line = split(" ", $_); - if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) { - print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1; - } - $nonterminals{$line[0]} = 1; - $line_number++; - } - print "--> $dict/nonterminals.txt is OK\n"; -} - - -# check nonsilence_phones.txt again for phone-pairs that are never -# distnguishable. (note: this situation is normal and expected for silence -# phones, so we don't check it.) -if(!open(NS, "<$dict/nonsilence_phones.txt")) { - print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1; -} - -$num_warn_nosplit = 0; -$num_warn_nosplit_limit = 10; -while() { - my @col = split(" ", $_); - foreach $p1 (@col) { - foreach $p2 (@col) { - if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) { - set_to_fail(); - if ($num_warn_nosplit <= $num_warn_nosplit_limit) { - print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n"; - } - if ($num_warn_nosplit == $num_warn_nosplit_limit) { - print "... Not warning any more times about this issue.\n"; - } - if ($num_warn_nosplit == 0) { - print " (note: we started checking for this only recently. 
You can still build a system but\n"; - print " phones $p1 and $p2 will be acoustically indistinguishable).\n"; - } - $num_warn_nosplit++; - } - } - } -} - - -if ($exit == 1) { - print "--> ERROR validating dictionary directory $dict (see detailed error "; - print "messages above)\n\n"; - exit 1; -} else { - print "--> SUCCESS [validating dictionary directory $dict]\n\n"; -} - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/validate_text.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/validate_text.pl deleted file mode 100644 index 7f75cf12f20f6e22948682e8e726e628a72dac69..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/validate_text.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env perl -# -#=============================================================================== -# Copyright 2017 Johns Hopkins University (author: Yenda Trmal ) -# Johns Hopkins University (author: Daniel Povey) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# validation script for data//text -# to be called (preferably) from utils/validate_data_dir.sh -use strict; -use warnings; -use utf8; -use Fcntl qw< SEEK_SET >; - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). 
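The comment above, and the Perl sub that follows, describe a decode-or-fall-back strategy: treat the file as UTF-8 if every line decodes cleanly, otherwise fall back to a 1-byte encoding so that string lengths (and thus output formatting) stay sensible. A small Python sketch of the same idea; the function name mirrors the Perl sub and the path argument is a placeholder:

```python
# Same idea as the Perl get_utf8_or_bytestream below: try UTF-8 first,
# otherwise fall back to a 1-byte-per-character interpretation of the bytes.
def get_utf8_or_bytestream(path):
    with open(path, "rb") as f:
        raw = f.read()
    try:
        return True, raw.decode("utf-8").splitlines(keepends=True)
    except UnicodeDecodeError:
        return False, raw.decode("latin-1").splitlines(keepends=True)

if __name__ == "__main__":
    is_utf8, lines = get_utf8_or_bytestream("data/train/text")  # placeholder path
    print("utf-8:", is_utf8, "lines:", len(lines))
```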
-sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl \n" . 
- "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/wav2dur.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/wav2dur.py deleted file mode 100644 index 1bcc1b693458b66c0e341e5d6b375cc81e6db8b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/wav2dur.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -import torchaudio -torchaudio.set_audio_backend("sox_io") - -scp = sys.argv[1] -dur_scp = sys.argv[2] - -with open(scp, 'r') as f, open(dur_scp, 'w') as fout: - cnt = 0 - total_duration = 0 - for l in f: - items = l.strip().split() - wav_id = items[0] - fname = items[1] - cnt += 1 - waveform, rate = torchaudio.load(fname) - frames = len(waveform[0]) - duration = frames / float(rate) - total_duration += duration - fout.write('{} {}\n'.format(wav_id, duration)) - print('process {} utts'.format(cnt)) - print('total {} s'.format(total_duration)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/wav_to_duration.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/wav_to_duration.sh deleted file mode 100644 index 51b055c633ac809b6b8d702925dc47875973403d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/wav_to_duration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# split the wav scp, calculate duration and merge -nj=4 -. tools/parse_options.sh || exit 1; - -inscp=$1 -outscp=$2 -data=$(dirname ${inscp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -rm -f $logdir/wav_*.slice -rm -f $logdir/wav_*.shape -split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log -} & -done -wait -cat $logdir/wav_*.shape > $outscp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/websocket/performance-ws.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/websocket/performance-ws.py deleted file mode 100644 index af77dea06bb41297b674b5b6dbfd0266bcff5d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/tools/websocket/performance-ws.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -# coding:utf-8 - -# Copyright (c) 2022 SDCI Co. Ltd (author: veelion) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import json -import time -import asyncio -import argparse -import websockets -import soundfile as sf -import statistics - - -WS_START = json.dumps({ - 'signal': 'start', - 'nbest': 1, - 'continuous_decoding': False, -}) -WS_END = json.dumps({ - 'signal': 'end' -}) - - -async def ws_rec(data, ws_uri): - begin = time.time() - conn = await websockets.connect(ws_uri, ping_timeout=200) - # step 1: send start - await conn.send(WS_START) - ret = await conn.recv() - # step 2: send audio data - await conn.send(data) - # step 3: send end - await conn.send(WS_END) - # step 4: receive result - texts = [] - while 1: - ret = await conn.recv() - ret = json.loads(ret) - if ret['type'] == 'final_result': - nbest = json.loads(ret['nbest']) - text = nbest[0]['sentence'] - texts.append(text) - elif ret['type'] == 'speech_end': - break - # step 5: close - try: - await conn.close() - except Exception as e: - # this except has no effect, just log as debug - # it seems the server does not send close info, maybe - print(e) - time_cost = time.time() - begin - return { - 'text': ''.join(texts), - 'time': time_cost, - } - - -def get_args(): - parser = argparse.ArgumentParser(description='') - parser.add_argument( - '-u', '--ws_uri', required=True, - help="websocket_server_main's uri, e.g. ws://127.0.0.1:10086") - parser.add_argument( - '-w', '--wav_scp', required=True, - help='path to wav_scp_file') - parser.add_argument( - '-t', '--trans', required=True, - help='path to trans_text_file of wavs') - parser.add_argument( - '-s', '--save_to', required=True, - help='path to save transcription') - parser.add_argument( - '-n', '--num_concurrence', type=int, required=True, - help='num of concurrence for query') - args = parser.parse_args() - return args - - -def print_result(info): - length = max([len(k) for k in info]) - for k, v in info.items(): - print(f'\t{k: >{length}} : {v}') - - -async def main(args): - wav_scp = [] - total_duration = 0 - with open(args.wav_scp) as f: - for line in f: - zz = line.strip().split() - assert len(zz) == 2 - data, sr = sf.read(zz[1], dtype='int16') - assert sr == 16000 - duration = (len(data)) / 16000 - total_duration += duration - wav_scp.append((zz[0], data.tobytes())) - print(f'{len(wav_scp) = }, {total_duration = }') - - tasks = [] - failed = 0 - texts = [] - request_times = [] - begin = time.time() - for i, (_uttid, data) in enumerate(wav_scp): - task = asyncio.create_task(ws_rec(data, args.ws_uri)) - tasks.append((_uttid, task)) - if len(tasks) < args.num_concurrence: - continue - print((f'{i=}, start {args.num_concurrence} ' - f'queries @ {time.strftime("%m-%d %H:%M:%S")}')) - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - tasks = [] - print(f'\tdone @ {time.strftime("%m-%d %H:%M:%S")}') - if tasks: - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - request_time = time.time() - begin - rtf = request_time / total_duration - print('For all concurrence:') - print_result({ - 'failed': failed, - 'total_duration': total_duration, - 'request_time': request_time, - 'RTF': rtf, - }) - print('For one request:') - print_result({ - 'mean': statistics.mean(request_times), - 'median': statistics.median(request_times), - 'max_time': max(request_times), - 'min_time': min(request_times), - }) - with 
open(args.save_to, 'w', encoding='utf8') as fsave: - fsave.write(''.join(texts)) - # caculate CER - cmd = (f'python ../compute-wer.py --char=1 --v=1 ' - f'{args.trans} {args.save_to} > ' - f'{args.save_to}-test-{args.num_concurrence}.cer.txt') - print(cmd) - os.system(cmd) - print('done') - - -if __name__ == '__main__': - args = get_args() - asyncio.run(main(args)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/alignment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/alignment.py deleted file mode 100644 index 071691183e5af227e60fe06e4f8d4bf0f33b7f71..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/alignment.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Di Wu) -# 2022 Tinnove Inc (authors: Wei Ren) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader -from textgrid import TextGrid, IntervalTier - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.ctc_util import forced_align -from wenet.utils.common import get_subsample -from wenet.utils.init_model import init_model - - -def generator_textgrid(maxtime, lines, output): - # Download Praat: https://www.fon.hum.uva.nl/praat/ - interval = maxtime / (len(lines) + 1) - margin = 0.0001 - - tg = TextGrid(maxTime=maxtime) - linetier = IntervalTier(name="line", maxTime=maxtime) - - i = 0 - for l in lines: - s, e, w = l.split() - linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w) - - tg.append(linetier) - print("successfully generator {}".format(output)) - tg.write(output) - - -def get_frames_timestamp(alignment): - # convert alignment to a praat format, which is a doing phonetics - # by computer and helps analyzing alignment - timestamp = [] - # get frames level duration for each token - start = 0 - end = 0 - while end < len(alignment): - while end < len(alignment) and alignment[end] == 0: - end += 1 - if end == len(alignment): - timestamp[-1] += alignment[start:] - break - end += 1 - while end < len(alignment) and alignment[end - 1] == alignment[end]: - end += 1 - timestamp.append(alignment[start:end]) - start = end - return timestamp - - -def get_labformat(timestamp, subsample): - begin = 0 - duration = 0 - labformat = [] - for idx, t in enumerate(timestamp): - # 25ms frame_length,10ms hop_length, 1/subsample - subsample = get_subsample(configs) - # time duration - duration = len(t) * 0.01 * subsample - if idx < len(timestamp) - 1: - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[t[-1]])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[t[-1]])) - else: - 
non_blank = 0 - for i in t: - if i != 0: - token = i - break - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[token])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[token])) - begin = begin + duration - return labformat - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='use ctc to generate alignment') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--input_file', required=True, help='format data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--non_lang_syms', - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--result_file', - required=True, - help='alignment result file') - parser.add_argument('--batch_size', type=int, default=1, help='batch size') - parser.add_argument('--gen_praat', - action='store_true', - help='convert alignment to a praat format') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - - args = parser.parse_args() - print(args) - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.batch_size > 1: - logging.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - # Load dict - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 - - symbol_table = read_symbol_table(args.dict) - - # Init dataset and data loader - ali_conf = copy.deepcopy(configs['dataset_conf']) - - ali_conf['filter_conf']['max_length'] = 102400 - ali_conf['filter_conf']['min_length'] = 0 - ali_conf['filter_conf']['token_max_length'] = 102400 - ali_conf['filter_conf']['token_min_length'] = 0 - ali_conf['filter_conf']['max_output_input_ratio'] = 102400 - ali_conf['filter_conf']['min_output_input_ratio'] = 0 - ali_conf['speed_perturb'] = False - ali_conf['spec_aug'] = False - ali_conf['shuffle'] = False - ali_conf['sort'] = False - ali_conf['fbank_conf']['dither'] = 0.0 - ali_conf['batch_conf']['batch_type'] = "static" - ali_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - ali_dataset = Dataset(args.data_type, - args.input_file, - symbol_table, - ali_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w', - encoding='utf-8') as fout: - for batch_idx, batch in enumerate(ali_data_loader): - print("#" * 80) - key, feat, target, feats_length, target_length = batch - print(key) - - feat = feat.to(device) - target = target.to(device) - feats_length = 
feats_length.to(device) - target_length = target_length.to(device) - # Let's assume B = batch_size and N = beam_size - # 1. Encoder - encoder_out, encoder_mask = model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - # print(ctc_probs.size(1)) - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = forced_align(ctc_probs, target) - print(alignment) - fout.write('{} {}\n'.format(key[0], alignment)) - - if args.gen_praat: - timestamp = get_frames_timestamp(alignment) - print(timestamp) - subsample = get_subsample(configs) - labformat = get_labformat(timestamp, subsample) - - lab_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".lab") - with open(lab_path, 'w', encoding='utf-8') as f: - f.writelines(labformat) - - textgrid_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".TextGrid") - generator_textgrid(maxtime=(len(alignment) + 1) * 0.01 * - subsample, - lines=labformat, - output=textgrid_path) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/average_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/average_model.py deleted file mode 100644 index 01efa64b4b458bc931a86a9a304b9f330ce4aaa2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/average_model.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import os -import argparse -import glob - -import yaml -import numpy as np -import torch - - -def get_args(): - parser = argparse.ArgumentParser(description='average model') - parser.add_argument('--dst_model', required=True, help='averaged model') - parser.add_argument('--src_path', - required=True, - help='src model path for average') - parser.add_argument('--val_best', - action="store_true", - help='averaged model') - parser.add_argument('--num', - default=5, - type=int, - help='nums for averaged model') - parser.add_argument('--min_epoch', - default=0, - type=int, - help='min epoch used for averaging model') - parser.add_argument('--max_epoch', - default=65536, - type=int, - help='max epoch used for averaging model') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - checkpoints = [] - val_scores = [] - if args.val_best: - yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) - for y in yamls: - with open(y, 'r') as f: - dic_yaml = yaml.load(f, Loader=yaml.FullLoader) - loss = dic_yaml['cv_loss'] - epoch = dic_yaml['epoch'] - if epoch >= args.min_epoch and epoch <= args.max_epoch: - val_scores += [[epoch, loss]] - val_scores = np.array(val_scores) - sort_idx = np.argsort(val_scores[:, -1]) - sorted_val_scores = val_scores[sort_idx][::1] - print("best val scores = " + str(sorted_val_scores[:args.num, 1])) - print("selected epochs = " + - str(sorted_val_scores[:args.num, 0].astype(np.int64))) - path_list = [ - args.src_path + '/{}.pt'.format(int(epoch)) - for epoch in sorted_val_scores[:args.num, 0] - ] - else: - path_list = glob.glob('{}/[0-9]*.pt'.format(args.src_path)) - path_list = sorted(path_list, key=os.path.getmtime) - path_list = path_list[-args.num:] - print(path_list) - avg = None - num = args.num - assert num == len(path_list) - for path in path_list: - print('Processing {}'.format(path)) - states = torch.load(path, map_location=torch.device('cpu')) - if avg is None: - avg = states - else: - for k in avg.keys(): - avg[k] += states[k] - # average - for k in avg.keys(): - if avg[k] is not None: - # pytorch 1.6 use true_divide instead of /= - avg[k] = torch.true_divide(avg[k], num) - print('Saving to {}'.format(args.dst_model)) - torch.save(avg, args.dst_model) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/export_jit.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/export_jit.py deleted file mode 100644 index b2e5864e8382235c1cc800484ba5031ae22f3bd9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/export_jit.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os - -import torch -import yaml - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_file', default=None, help='output file') - parser.add_argument('--output_quant_file', - default=None, - help='output quantized model file') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - # No need gpu for model export - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - model = init_model(configs) - print(model) - - load_checkpoint(model, args.checkpoint) - # Export jit torch script model - - if args.output_file: - script_model = torch.jit.script(model) - script_model.save(args.output_file) - print('Export model successfully, see {}'.format(args.output_file)) - - # Export quantized jit torch script model - if args.output_quant_file: - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - script_quant_model = torch.jit.script(quantized_model) - script_quant_model.save(args.output_quant_file) - print('Export quantized model successfully, ' - 'see {}'.format(args.output_quant_file)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/export_onnx_bpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/export_onnx_bpu.py deleted file mode 100644 index 6462a69506f10778d08faae5fcf3067ad43d38bd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/export_onnx_bpu.py +++ /dev/null @@ -1,1019 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. 
specific decoding method: ctc_greedy_search -""" - - -from __future__ import print_function - -import os -import sys -import copy -import math -import yaml -import logging -from typing import Tuple - -import torch -import numpy as np - -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import (get_args, to_numpy, - print_input_output_info) - - -try: - import onnx - import onnxruntime -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class BPULayerNorm(torch.nn.Module): - """Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" - def __init__(self, module, chunk_size=8, run_on_bpu=False): - super().__init__() - original = copy.deepcopy(module) - self.hidden = module.weight.size(0) - self.chunk_size = chunk_size - self.run_on_bpu = run_on_bpu - - if self.run_on_bpu: - self.weight = torch.nn.Parameter( - module.weight.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.bias = torch.nn.Parameter( - module.bias.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.negtive = torch.nn.Parameter( - torch.ones((1, self.hidden, 1, chunk_size)) * -1.0) - self.eps = torch.nn.Parameter( - torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps) - self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_1.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_2.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - else: - self.norm = module - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.hidden) - orig_out = module(random_data) - new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) - np.testing.assert_allclose( - to_numpy(orig_out), to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.run_on_bpu: - u = self.mean_conv_1(x) # (1, h, 1, c) - numerator = x + u * self.negtive # (1, h, 1, c) - s = torch.pow(numerator, 2) # (1, h, 1, c) - s = self.mean_conv_2(s) # (1, h, 1, c) - denominator = torch.sqrt(s + self.eps) # (1, h, 1, c) - x = torch.div(numerator, denominator) # (1, h, 1, c) - x = x * self.weight + self.bias - else: - x = x.squeeze(2).transpose(1, 2).contiguous() - x = self.norm(x) - x = x.transpose(1, 2).contiguous().unsqueeze(2) - return x - - -class BPUIdentity(torch.nn.Module): - """Refactor torch.nn.Identity(). - For inserting BPU node whose input == output. - """ - def __init__(self, channels): - super().__init__() - self.channels = channels - self.identity_conv = torch.nn.Conv2d( - channels, channels, 1, groups=channels, bias=False) - torch.nn.init.dirac_( - self.identity_conv.weight.data, groups=channels) - - self.check_equal() - - def check_equal(self): - random_data = torch.randn(1, self.channels, 1, 10) - result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(random_data), to_numpy(result), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Identity with 4-D dataflow, input == output. 
- Args: - x (torch.Tensor): (batch, in_channel, 1, time) - - Returns: - (torch.Tensor): (batch, in_channel, 1, time). - """ - return self.identity_conv(x) - - -class BPULinear(torch.nn.Module): - """Refactor torch.nn.Linear or pointwise_conv""" - def __init__(self, module, is_pointwise_conv=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.weight.size(1) - self.odim = module.weight.size(0) - self.is_pointwise_conv = is_pointwise_conv - - # Modify weight & bias - self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) - if is_pointwise_conv: - # (odim, idim, kernel=1) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(-1)) - else: - # (odim, idim) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(2).unsqueeze(3)) - self.linear.bias = module.bias - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.idim) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = module(random_data) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = original_result.transpose(1, 2) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Linear with 4-D dataflow. - Args: - x (torch.Tensor): (batch, in_channel, 1, time) - Returns: - (torch.Tensor): (batch, out_channel, 1, time). - """ - return self.linear(x) - - -class BPUGlobalCMVN(torch.nn.Module): - """Refactor wenet/transformer/cmvn.py::GlobalCMVN""" - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - self.norm_var = module.norm_var - - # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1) - self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """CMVN with 4-D dataflow. - Args: - x (torch.Tensor): (batch, 1, mel_dim, time) - Returns: - (torch.Tensor): normalized feature with same shape. - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x - - -class BPUConv2dSubsampling8(torch.nn.Module): - """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 - - NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.right_context = module.right_context - self.subsampling_rate = module.subsampling_rate - assert isinstance(module.pos_enc, NoPositionalEncoding) - - # 1. Modify self.conv - # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim) - # to (1, 1, mel_dim, frames) for more efficient computation. - self.conv = module.conv - for idx in [0, 2, 4]: - self.conv[idx].weight = torch.nn.Parameter( - module.conv[idx].weight.transpose(2, 3) - ) - - # 2. 
Modify self.linear - # NOTE(xcsong): Split final projection to meet the requirment of - # maximum kernel_size (7 for XJ3) - self.linear = torch.nn.ModuleList() - odim = module.linear.weight.size(0) # 512, in this case - freq = module.linear.weight.size(1) // odim # 4608 // 512 == 9 - self.odim, self.freq = odim, freq - weight = module.linear.weight.reshape( - odim, odim, freq, 1) # (odim, odim * freq) -> (odim, odim, freq, 1) - self.split_size = [] - num_split = (freq - 1) // 7 + 1 # XJ3 requires kernel_size <= 7 - slice_begin = 0 - for idx in range(num_split): - kernel_size = min(freq, (idx + 1) * 7) - idx * 7 - conv_ele = torch.nn.Conv2d( - odim, odim, (kernel_size, 1), (kernel_size, 1)) - conv_ele.weight = torch.nn.Parameter( - weight[:, :, slice_begin:slice_begin + kernel_size, :] - ) - conv_ele.bias = torch.nn.Parameter( - torch.zeros_like(conv_ele.bias) - ) - self.linear.append(conv_ele) - self.split_size.append(kernel_size) - slice_begin += kernel_size - self.linear[0].bias = torch.nn.Parameter(module.linear.bias) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 67, 80) - mask = torch.zeros(1, 1, 67) - original_result, _, _ = module(random_data, mask) # (1, 8, 512) - random_data = random_data.transpose(1, 2).unsqueeze(0) # (1, 1, 80, 67) - new_result = self.forward(random_data) # (1, 512, 1, 8) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Subsample x with 4-D dataflow. - Args: - x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), - where time' = time // 8. - """ - x = self.conv(x) # (1, odim, freq, time') - x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) - x = torch.split(x, self.split_size, dim=2) - for idx, (x_part, layer) in enumerate(zip(x, self.linear)): - x_out += layer(x_part) - return x_out - - -class BPUMultiHeadedAttention(torch.nn.Module): - """Refactor wenet/transformer/attention.py::MultiHeadedAttention - - NOTE(xcsong): Only support attention_class == MultiHeadedAttention, - we do not consider RelPositionMultiHeadedAttention currently. - """ - def __init__(self, module, chunk_size, left_chunks): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.d_k = module.d_k - self.h = module.h - n_feat = self.d_k * self.h - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.time = chunk_size * (left_chunks + 1) - self.activation = torch.nn.Softmax(dim=-1) - - # 1. Modify self.linear_x - self.linear_q = BPULinear(module.linear_q) - self.linear_k = BPULinear(module.linear_k) - self.linear_v = BPULinear(module.linear_v) - self.linear_out = BPULinear(module.linear_out) - # 2. 
denom - self.register_buffer( - "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k))) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) - mask = torch.ones((1, self.h, self.chunk_size, self.time), - dtype=torch.bool) - cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, - self.d_k * 2) - original_out, original_cache = module( - random_data, random_data, random_data, - mask[:, 0, :, :], torch.empty(0), cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.reshape(1, self.h, self.d_k * 2, - self.chunk_size * self.left_chunks) - new_out, new_cache = self.forward( - random_data, random_data, random_data, mask, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - - def forward( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - mask: torch.Tensor, cache: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. - - Args: - q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). - k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). - v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). - mask (torch.Tensor): Mask tensor, - (#batch, head, chunk_size, cache_t + chunk_size). - cache (torch.Tensor): Cache tensor - (1, head, d_k * 2, cache_t), - where `cache_t == chunk_size * left_chunks`. - - - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: Cache tensor - (1, head, d_k * 2, cache_t + chunk_size) - where `cache_t == chunk_size * left_chunks` - """ - # 1. Forward QKV - q = self.linear_q(q) # (1, d, 1, c) d == size, c == chunk_size - k = self.linear_k(k) # (1, d, 1, c) - v = self.linear_v(v) # (1, d, 1, c) - q = q.view(1, self.h, self.d_k, self.chunk_size) - k = k.view(1, self.h, self.d_k, self.chunk_size) - v = v.view(1, self.h, self.d_k, self.chunk_size) - q = q.transpose(2, 3) # (batch, head, time1, d_k) - k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) - k = torch.cat((k_cache, k), dim=3) - v = torch.cat((v_cache, v), dim=3) - new_cache = torch.cat((k, v), dim=2) - # 2. (Q^T)K - scores = torch.matmul(q, k) * self.denom # (#b, n_head, time1, time2) - # 3. Forward attention - mask = mask.eq(0) - scores = scores.masked_fill(mask, -float('inf')) - attn = self.activation(scores).masked_fill(mask, 0.0) - attn = attn.transpose(2, 3) - x = torch.matmul(v, attn) - x = x.view(1, self.d_k * self.h, 1, self.chunk_size) - x_out = self.linear_out(x) - return x_out, new_cache - - -class BPUConvolution(torch.nn.Module): - """Refactor wenet/transformer/convolution.py::ConvolutionModule - - NOTE(xcsong): Only suport use_layer_norm == False - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.lorder = module.lorder - self.use_layer_norm = False - self.activation = module.activation - channels = module.pointwise_conv1.weight.size(1) - self.channels = channels - kernel_size = module.depthwise_conv.weight.size(2) - assert module.use_layer_norm is False - - # 1. Modify self.pointwise_conv1 - self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) - - # 2. 
Modify self.depthwise_conv - self.depthwise_conv = torch.nn.Conv2d( - channels, channels, (1, kernel_size), - stride=1, groups=channels) - self.depthwise_conv.weight = torch.nn.Parameter( - module.depthwise_conv.weight.unsqueeze(-2)) - self.depthwise_conv.bias = torch.nn.Parameter( - module.depthwise_conv.bias) - - # 3. Modify self.norm, Only support batchnorm2d - self.norm = torch.nn.BatchNorm2d(channels) - self.norm.training = False - self.norm.num_features = module.norm.num_features - self.norm.eps = module.norm.eps - self.norm.momentum = module.norm.momentum - self.norm.weight = torch.nn.Parameter(module.norm.weight) - self.norm.bias = torch.nn.Parameter(module.norm.bias) - self.norm.running_mean = module.norm.running_mean - self.norm.running_var = module.norm.running_var - - # 4. Modify self.pointwise_conv2 - self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) - - # 5. Identity conv, for running `concat` on BPU - self.identity = BPUIdentity(channels) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.channels) - cache = torch.zeros((1, self.channels, self.lorder)) - original_out, original_cache = module(random_data, cache=cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.unsqueeze(2) - new_out, new_cache = self.forward(random_data, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, 1, cache_t). - Returns: - torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). - torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). - """ - # Concat cache - x = torch.cat((self.identity(cache), self.identity(x)), dim=3) - new_cache = x[:, :, :, -self.lorder:] - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, 1, dim) - x = torch.nn.functional.glu(x, dim=1) # (b, channel, 1, dim) - - # Depthwise Conv - x = self.depthwise_conv(x) - x = self.activation(self.norm(x)) - x = self.pointwise_conv2(x) - return x, new_cache - - -class BPUFFN(torch.nn.Module): - """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.activation = module.activation - - # 1. Modify self.w_x - self.w_1 = BPULinear(module.w_1) - self.w_2 = BPULinear(module.w_2) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.w_1.idim) - original_out = module(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_out = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward function. 
- - Args: - xs: input tensor (B, D, 1, L) - Returns: - output tensor, (B, D, 1, L) - """ - return self.w_2(self.activation(self.w_1(x))) - - -class BPUConformerEncoderLayer(torch.nn.Module): - """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.size = module.size - assert module.normalize_before is True - assert module.concat_after is False - - # 1. Modify submodules - self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) - self.self_attn = BPUMultiHeadedAttention( - module.self_attn, chunk_size, left_chunks) - self.conv_module = BPUConvolution(module.conv_module) - self.feed_forward = BPUFFN(module.feed_forward) - - # 2. Modify norms - self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) - self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, ln_run_on_bpu) - self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, - chunk_size, ln_run_on_bpu) - self.norm_conv = BPULayerNorm(module.norm_conv, - chunk_size, ln_run_on_bpu) - self.norm_final = BPULayerNorm(module.norm_final, - chunk_size, ln_run_on_bpu) - - # 3. 4-D ff_scale - self.register_buffer( - "ff_scale", torch.full((1, self.size, 1, 1), module.ff_scale)) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.self_attn.chunk_size - time2 = self.self_attn.time - h, d_k = self.self_attn.h, self.self_attn.d_k - random_x = torch.randn(1, time1, self.size) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) - original_x, _, original_att_cache, original_cnn_cache = module( - random_x, att_mask[:, 0, :, :], torch.empty(0), - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.transpose(1, 2).unsqueeze(2) - att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.unsqueeze(2) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_mask, att_cache, cnn_cache - ) - np.testing.assert_allclose( - to_numpy(original_att_cache), - to_numpy(new_att_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cnn_cache), - to_numpy(new_cnn_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, att_mask: torch.Tensor, - att_cache: torch.Tensor, cnn_cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, size, 1, chunk_size) - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, d_k * 2, cache_t1), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, 1, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: att_cache tensor, - (1, head, d_k * 2, cache_t1 + chunk_size). - torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2). - """ - # 1. ffn_macaron - residual = x - x = self.norm_ff_macron(x) - x = residual + self.ff_scale * self.feed_forward_macaron(x) - - # 2. 
attention - residual = x - x = self.norm_mha(x) - x_att, new_att_cache = self.self_attn( - x, x, x, att_mask, att_cache) - x = residual + x_att - - # 3. convolution - residual = x - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, cnn_cache) - x = residual + x - - # 4. ffn - residual = x - x = self.norm_ff(x) - x = residual + self.ff_scale * self.feed_forward(x) - - # 5. final post-norm - x = self.norm_final(x) - - return x, new_att_cache, new_cnn_cache - - -class BPUConformerEncoder(torch.nn.Module): - """Refactor wenet/transformer/encoder.py::ConformerEncoder - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - output_size = module.output_size() - self._output_size = module.output_size() - self.after_norm = module.after_norm - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.head = module.encoders[0].self_attn.h - self.layers = len(module.encoders) - - # 1. Modify submodules - self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) - self.embed = BPUConv2dSubsampling8(module.embed) - self.encoders = torch.nn.ModuleList() - for layer in module.encoders: - self.encoders.append(BPUConformerEncoderLayer( - layer, chunk_size, left_chunks, ln_run_on_bpu)) - - # 2. Auxiliary conv - self.identity_cnncache = BPUIdentity(output_size) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.encoders[0].self_attn.chunk_size - time2 = self.encoders[0].self_attn.time - layers = self.layers - h, d_k = self.head, self.encoders[0].self_attn.d_k - decoding_window = (self.chunk_size - 1) * \ - module.embed.subsampling_rate + \ - module.embed.right_context + 1 - lorder = self.encoders[0].conv_module.lorder - random_x = torch.randn(1, decoding_window, 80) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) - orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( - random_x, 0, time2 - time1, att_mask=att_mask[:, 0, :, :], - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.unsqueeze(0) - att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_cache, cnn_cache, att_mask - ) - caches = torch.split(new_att_cache, h, dim=1) - caches = [c.transpose(2, 3) for c in caches] - np.testing.assert_allclose( - to_numpy(orig_att_cache), - to_numpy(torch.cat(caches, dim=0)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_cnn_cache), - to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, xs: torch.Tensor, att_cache: torch.Tensor, - cnn_cache: torch.Tensor, att_mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (1, head * elayers, d_k * 2, cache_t1), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (1, hidden-dim, elayers, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, hidden-dim, 1, chunk_size). - torch.Tensor: new attention cache required for next chunk, with - same shape as the original att_cache. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - """ - # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time) - xs = xs.transpose(2, 3) - xs = self.global_cmvn(xs) - # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size) - xs = self.embed(xs) - - att_cache = torch.split(att_cache, self.head, dim=1) - cnn_cache = self.identity_cnncache(cnn_cache) - cnn_cache = torch.split(cnn_cache, 1, dim=2) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - xs, new_att_cache, new_cnn_cache = layer( - xs, att_mask, att_cache=att_cache[i], cnn_cache=cnn_cache[i]) - r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:]) - r_cnn_cache.append(new_cnn_cache) - r_att_cache = torch.cat(r_att_cache, dim=1) - r_cnn_cache = self.identity_cnncache( - torch.cat(r_cnn_cache, dim=2)) - - xs = xs.squeeze(2).transpose(1, 2).contiguous() - xs = self.after_norm(xs) - # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input. - xs = xs.transpose(1, 2).contiguous().unsqueeze(2) # (B, C, 1, T) - - return (xs, r_att_cache, r_cnn_cache) - - -class BPUCTC(torch.nn.Module): - """Refactor wenet/transformer/ctc.py::CTC - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.ctc_lo.weight.size(1) - num_class = module.ctc_lo.weight.size(0) - - # 1. Modify self.ctc_lo, Split final projection to meet the - # requirment of maximum in/out channels (2048 for XJ3) - self.ctc_lo = torch.nn.ModuleList() - self.split_size = [] - num_split = (num_class - 1) // 2048 + 1 - for idx in range(num_split): - out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 - conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) - self.ctc_lo.append(conv_ele) - self.split_size.append(out_channel) - orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) - orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) - for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): - w = w.unsqueeze(2).unsqueeze(3) - self.ctc_lo[i].weight = torch.nn.Parameter(w) - self.ctc_lo[i].bias = torch.nn.Parameter(b) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 100, self.idim) - original_result = module.ctc_lo(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """frame activations, without softmax. 
- - Args: - Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) - Returns: - torch.Tensor: (B, num_class, 1, chunk_size) - """ - out = [] - for i, layer in enumerate(self.ctc_lo): - out.append(layer(x)) - out = torch.cat(out, dim=1) - return out - - -def export_encoder(asr_model, args): - logger.info("Stage-1: export encoder") - decode_window, mel_dim = args.decoding_window, args.feature_size - encoder = BPUConformerEncoder( - asr_model.encoder, args.chunk_size, args.num_decoding_left_chunks, - args.ln_run_on_bpu) - encoder.eval() - encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx') - - logger.info("Stage-1.1: prepare inputs for encoder") - chunk = torch.randn((1, 1, decode_window, mel_dim)) - required_cache_size = encoder.chunk_size * encoder.left_chunks - kv_time = required_cache_size + encoder.chunk_size - hidden, layers = encoder._output_size, len(encoder.encoders) - head = encoder.encoders[0].self_attn.h - d_k = hidden // head - lorder = encoder.encoders[0].conv_module.lorder - att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) - att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros((1, hidden, layers, lorder)) - inputs = (chunk, att_cache, cnn_cache, att_mask) - logger.info("chunk.size(): {} att_cache.size(): {} " - "cnn_cache.size(): {} att_mask.size(): {}".format( - list(chunk.size()), list(att_cache.size()), - list(cnn_cache.size()), list(att_mask.size()))) - - logger.info("Stage-1.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask" - attributes['output_name'] = "output;r_att_cache;r_cnn_cache" - attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap" - attributes['norm_type'] = \ - "no_preprocess;no_preprocess;no_preprocess;no_preprocess" - attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_shape'] = \ - "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( - chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3), - att_cache.size(0), att_cache.size(1), att_cache.size(2), - att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1), - cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0), - att_mask.size(1), att_mask.size(2), att_mask.size(3) - ) - torch.onnx.export( # NOTE(xcsong): only support opset==11 - encoder, inputs, encoder_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=attributes['input_name'].split(';'), - output_names=attributes['output_name'].split(';'), - dynamic_axes=None, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for k in vars(args): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - logger.info('Export onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - logger.info("Stage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - for i in range(10): - logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" - ", att_mask: {}".format( - i, list(torch_chunk.size()), - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), - list(torch_att_mask.size()))) - torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_output = torch.cat(torch_output, dim=-1) - - onnx_output = [] - onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - logger.info("onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," - " att_mask: {}".format( - i, onnx_chunk.shape, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': onnx_att_mask, - } - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_output = np.concatenate(onnx_output, axis=-1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_encoder, pass!") - return encoder, ort_session - - -def export_ctc(asr_model, args): - logger.info("Stage-2: export ctc") - ctc = BPUCTC(asr_model.ctc).eval() - ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx') - - logger.info("Stage-2.1: prepare inputs for ctc") - hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) - - logger.info("Stage-2.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'], attributes['input_type'] = "hidden", "featuremap" - attributes['norm_type'] = "no_preprocess" - attributes['input_layout_train'] = "NCHW" - attributes['input_layout_rt'] = "NCHW" - attributes['input_shape'] = "{}x{}x{}x{}".format( - hidden.size(0), hidden.size(1), hidden.size(2), hidden.size(3), - ) - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=None, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for k in vars(args): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - logger.info('Export onnx_ctc, done! 
see {}'.format(ctc_outpath)) - - logger.info("Stage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_ctc, pass!") - return ctc, ort_session - - -def export_decoder(asr_model, args): - logger.info("Currently, Decoder is not supported.") - - -if __name__ == '__main__': - torch.manual_seed(777) - args = get_args() - args.ln_run_on_bpu = False - # NOTE(xcsong): XJ3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - args.feature_size = configs['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - export_encoder(model, args) - export_ctc(model, args) - export_decoder(model, args) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/export_onnx_cpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/export_onnx_cpu.py deleted file mode 100644 index a8009d2f606f753a5870eb754235d8d55e756b5d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/export_onnx_cpu.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright (c) 2022, Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os -import copy -import sys - -import torch -import yaml -import numpy as np - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - -try: - import onnx - import onnxruntime - from onnxruntime.quantization import quantize_dynamic, QuantType -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - args = parser.parse_args() - return args - - -def to_numpy(tensor): - if tensor.requires_grad: - return tensor.detach().cpu().numpy() - else: - return tensor.cpu().numpy() - - -def print_input_output_info(onnx_model, name, prefix="\t\t"): - input_names = [node.name for node in onnx_model.graph.input] - input_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.input] - output_names = [node.name for node in onnx_model.graph.output] - output_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.output] - print("{}{} inputs : {}".format(prefix, name, input_names)) - print("{}{} input shapes : {}".format(prefix, name, input_shapes)) - print("{}{} outputs: {}".format(prefix, name, output_names)) - print("{}{} output shapes : {}".format(prefix, name, output_shapes)) - - -def export_encoder(asr_model, args): - print("Stage-1: export encoder") - encoder = asr_model.encoder - encoder.forward = encoder.forward_chunk - encoder_outpath = os.path.join(args['output_dir'], 'encoder.onnx') - - print("\tStage-1.1: prepare inputs for encoder") - chunk = torch.randn( - (args['batch'], args['decoding_window'], args['feature_size'])) - offset = 0 - # NOTE(xcsong): The uncertainty of `next_cache_start` only appears - # in the first few chunks, this is caused by dynamic att_cache shape, i,e - # (0, 0, 0, 0) for 1st chunk and (elayers, head, ?, d_k*2) for subsequent - # chunks. One way to ease the ONNX export is to keep `next_cache_start` - # as a fixed value. To do this, for the **first** chunk, if - # left_chunks > 0, we feed real cache & real mask to the model, otherwise - # fake cache & fake mask. In this way, we get: - # 1. 16/-1 mode: next_cache_start == 0 for all chunks - # 2. 16/4 mode: next_cache_start == chunk_size for all chunks - # 3. 16/0 mode: next_cache_start == chunk_size for all chunks - # 4. -1/-1 mode: next_cache_start == 0 for all chunks - # NO MORE DYNAMIC CHANGES!! - # - # NOTE(Mddct): We retain the current design for the convenience of supporting some - # inference frameworks without dynamic shapes. 
If you're interested in all-in-one - # model that supports different chunks please see: - # https://github.com/wenet-e2e/wenet/pull/1174 - - if args['left_chunks'] > 0: # 16/4 - required_cache_size = args['chunk_size'] * args['left_chunks'] - offset = required_cache_size - # Real cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], required_cache_size, - args['output_size'] // args['head'] * 2)) - # Real mask - att_mask = torch.ones( - (args['batch'], 1, required_cache_size + args['chunk_size']), - dtype=torch.bool) - att_mask[:, :, :required_cache_size] = 0 - elif args['left_chunks'] <= 0: # 16/-1, -1/-1, 16/0 - required_cache_size = -1 if args['left_chunks'] < 0 else 0 - # Fake cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], 0, - args['output_size'] // args['head'] * 2)) - # Fake mask - att_mask = torch.ones((0, 0, 0), dtype=torch.bool) - cnn_cache = torch.zeros( - (args['num_blocks'], args['batch'], - args['output_size'], args['cnn_module_kernel'] - 1)) - inputs = (chunk, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - print("\t\tchunk.size(): {}\n".format(chunk.size()), - "\t\toffset: {}\n".format(offset), - "\t\trequired_cache: {}\n".format(required_cache_size), - "\t\tatt_cache.size(): {}\n".format(att_cache.size()), - "\t\tcnn_cache.size(): {}\n".format(cnn_cache.size()), - "\t\tatt_mask.size(): {}\n".format(att_mask.size())) - - print("\tStage-1.2: torch.onnx.export") - dynamic_axes = { - 'chunk': {1: 'T'}, - 'att_cache': {2: 'T_CACHE'}, - 'att_mask': {2: 'T_ADD_T_CACHE'}, - 'output': {1: 'T'}, - 'r_att_cache': {2: 'T_CACHE'}, - } - # NOTE(xcsong): We keep dynamic axes even if in 16/4 mode, this is - # to avoid padding the last chunk (which usually contains less - # frames than required). For users who want static axes, just pop - # out specific axis. - # if args['chunk_size'] > 0: # 16/4, 16/-1, 16/0 - # dynamic_axes.pop('chunk') - # dynamic_axes.pop('output') - # if args['left_chunks'] >= 0: # 16/4, 16/0 - # # NOTE(xsong): since we feed real cache & real mask into the - # # model when left_chunks > 0, the shape of cache will never - # # be changed. - # dynamic_axes.pop('att_cache') - # dynamic_axes.pop('r_att_cache') - torch.onnx.export( - encoder, inputs, encoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=[ - 'chunk', 'offset', 'required_cache_size', - 'att_cache', 'cnn_cache', 'att_mask' - ], - output_names=['output', 'r_att_cache', 'r_cnn_cache'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for (k, v) in args.items(): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - # NOTE(xcsong): to add those metadatas we need to reopen - # the file and resave it. - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - # Dynamic quantization - model_fp32 = encoder_outpath - model_quant = os.path.join(args['output_dir'], 'encoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - print("\tStage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk = copy.deepcopy(chunk) - torch_offset = copy.deepcopy(offset) - torch_required_cache_size = copy.deepcopy(required_cache_size) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - torch_att_mask = copy.deepcopy(att_mask) - for i in range(10): - print("\t\ttorch chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, list(torch_chunk.size()), torch_offset, - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), list(torch_att_mask.size()))) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - torch_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_offset, torch_required_cache_size, - torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_offset += out.size(1) - torch_output = torch.cat(torch_output, dim=1) - - onnx_output = [] - onnx_chunk = to_numpy(chunk) - onnx_offset = np.array((offset)).astype(np.int64) - onnx_required_cache_size = np.array((required_cache_size)).astype(np.int64) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - onnx_att_mask = to_numpy(att_mask) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - print("\t\tonnx chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, onnx_chunk.shape, onnx_offset, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - onnx_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'offset': onnx_offset, - 'required_cache_size': onnx_required_cache_size, - 'att_cache': onnx_att_cache, 'cnn_cache': onnx_cnn_cache, - 'att_mask': onnx_att_mask - } - # NOTE(xcsong): If we use 16/-1, -1/-1 or 16/0 mode, `next_cache_start` - # will be hardcoded to 0 or chunk_size by ONNX, thus - # required_cache_size and att_mask are no more needed and they will - # be removed by ONNX automatically. 
- for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_offset += ort_outs[0].shape[1] - onnx_output = np.concatenate(onnx_output, axis=1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-05) - meta = ort_session.get_modelmeta() - print("\t\tcustom_metadata_map={}".format(meta.custom_metadata_map)) - print("\t\tCheck onnx_encoder, pass!") - - -def export_ctc(asr_model, args): - print("Stage-2: export ctc") - ctc = asr_model.ctc - ctc.forward = ctc.log_softmax - ctc_outpath = os.path.join(args['output_dir'], 'ctc.onnx') - - print("\tStage-2.1: prepare inputs for ctc") - hidden = torch.randn( - (args['batch'], args['chunk_size'] if args['chunk_size'] > 0 else 16, - args['output_size'])) - - print("\tStage-2.2: torch.onnx.export") - dynamic_axes = {'hidden': {1: 'T'}, 'probs': {1: 'T'}} - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for (k, v) in args.items(): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - # Dynamic quantization - model_fp32 = ctc_outpath - model_quant = os.path.join(args['output_dir'], 'ctc.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_ctc, done! see {}'.format(ctc_outpath)) - - print("\tStage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_ctc, pass!") - - -def export_decoder(asr_model, args): - print("Stage-3: export decoder") - decoder = asr_model - # NOTE(lzhin): parameters of encoder will be automatically removed - # since they are not used during rescoring. - decoder.forward = decoder.forward_attention_decoder - decoder_outpath = os.path.join(args['output_dir'], 'decoder.onnx') - - print("\tStage-3.1: prepare inputs for decoder") - # hardcode time->200 nbest->10 len->20, they are dynamic axes. 
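# ---------------------------------------------------------------------------
# Editorial sketch (not part of the original wenet sources): the decoder
# export below traces with concrete sizes (time->200, nbest->10, len->20) but
# declares those dimensions dynamic via `dynamic_axes`, so the exported graph
# accepts any size along the named axes. A toy module exported the same way,
# with hypothetical file and tensor names:
import torch

class _Toy(torch.nn.Module):
    def forward(self, x):            # x: (nbest, length, feat)
        return x.sum(dim=-1)         # y: (nbest, length)

torch.onnx.export(
    _Toy(), (torch.randn(10, 20, 4),), "toy_dynamic_axes.onnx",
    opset_version=13, input_names=["x"], output_names=["y"],
    dynamic_axes={"x": {0: "NBEST", 1: "L"}, "y": {0: "NBEST", 1: "L"}})
# ---------------------------------------------------------------------------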
- encoder_out = torch.randn((1, 200, args['output_size'])) - hyps = torch.randint(low=0, high=args['vocab_size'], - size=[10, 20]) - hyps[:, 0] = args['vocab_size'] - 1 # - hyps_lens = torch.randint(low=15, high=21, size=[10]) - - print("\tStage-3.2: torch.onnx.export") - dynamic_axes = { - 'hyps': {0: 'NBEST', 1: 'L'}, 'hyps_lens': {0: 'NBEST'}, - 'encoder_out': {1: 'T'}, - 'score': {0: 'NBEST', 1: 'L'}, 'r_score': {0: 'NBEST', 1: 'L'} - } - inputs = (hyps, hyps_lens, encoder_out, args['reverse_weight']) - torch.onnx.export( - decoder, inputs, decoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hyps', 'hyps_lens', 'encoder_out', 'reverse_weight'], - output_names=['score', 'r_score'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_decoder = onnx.load(decoder_outpath) - for (k, v) in args.items(): - meta = onnx_decoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_decoder) - onnx.helper.printable_graph(onnx_decoder.graph) - onnx.save(onnx_decoder, decoder_outpath) - print_input_output_info(onnx_decoder, "onnx_decoder") - model_fp32 = decoder_outpath - model_quant = os.path.join(args['output_dir'], 'decoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_decoder, done! see {}'.format( - decoder_outpath)) - - print("\tStage-3.3: check onnx_decoder and torch_decoder") - torch_score, torch_r_score = decoder( - hyps, hyps_lens, encoder_out, args['reverse_weight']) - ort_session = onnxruntime.InferenceSession(decoder_outpath) - input_names = [node.name for node in onnx_decoder.graph.input] - ort_inputs = { - 'hyps': to_numpy(hyps), - 'hyps_lens': to_numpy(hyps_lens), - 'encoder_out': to_numpy(encoder_out), - 'reverse_weight': np.array((args['reverse_weight'])), - } - for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - onnx_output = ort_session.run(None, ort_inputs) - - np.testing.assert_allclose(to_numpy(torch_score), onnx_output[0], - rtol=1e-03, atol=1e-05) - if args['is_bidirectional_decoder'] and args['reverse_weight'] > 0.0: - np.testing.assert_allclose(to_numpy(torch_r_score), onnx_output[1], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_decoder, pass!") - - -def main(): - torch.manual_seed(777) - args = get_args() - output_dir = args.output_dir - os.system("mkdir -p " + output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - arguments = {} - arguments['output_dir'] = output_dir - arguments['batch'] = 1 - arguments['chunk_size'] = args.chunk_size - arguments['left_chunks'] = args.num_decoding_left_chunks - arguments['reverse_weight'] = args.reverse_weight - arguments['output_size'] = configs['encoder_conf']['output_size'] - arguments['num_blocks'] = configs['encoder_conf']['num_blocks'] - arguments['cnn_module_kernel'] = configs['encoder_conf'].get('cnn_module_kernel', 1) - arguments['head'] = configs['encoder_conf']['attention_heads'] - arguments['feature_size'] = configs['input_dim'] - arguments['vocab_size'] = configs['output_dim'] - # NOTE(xcsong): if chunk_size == -1, hardcode to 67 - arguments['decoding_window'] = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 if args.chunk_size > 0 else 67 - arguments['encoder'] = configs['encoder'] - 
arguments['decoder'] = configs['decoder'] - arguments['subsampling_rate'] = model.subsampling_rate() - arguments['right_context'] = model.right_context() - arguments['sos_symbol'] = model.sos_symbol() - arguments['eos_symbol'] = model.eos_symbol() - arguments['is_bidirectional_decoder'] = 1 \ - if model.is_bidirectional_decoder() else 0 - - # NOTE(xcsong): Please note that -1/-1 means non-streaming model! It is - # not a [16/4 16/-1 16/0] all-in-one model and it should not be used in - # streaming mode (i.e., setting chunk_size=16 in `decoder_main`). If you - # want to use 16/-1 or any other streaming mode in `decoder_main`, - # please export onnx in the same config. - if arguments['left_chunks'] > 0: - assert arguments['chunk_size'] > 0 # -1/4 not supported - - export_encoder(model, arguments) - export_ctc(model, arguments) - export_decoder(model, arguments) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/export_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/export_onnx_gpu.py deleted file mode 100644 index 19f810c2804efdf74ff369f780fa3102e2e389fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/export_onnx_gpu.py +++ /dev/null @@ -1,1056 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import os -import sys - -import torch -import yaml -import logging - -import torch.nn.functional as F -from wenet.utils.checkpoint import load_checkpoint -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import BaseEncoder -from wenet.utils.init_model import init_model -from wenet.utils.mask import make_pad_mask - -try: - import onnxruntime -except ImportError: - print('Please install onnxruntime-gpu!') - sys.exit(1) - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class Encoder(torch.nn.Module): - def __init__(self, - encoder: BaseEncoder, - ctc: CTC, - beam_size: int = 10): - super().__init__() - self.encoder = encoder - self.ctc = ctc - self.beam_size = beam_size - - def forward(self, speech: torch.Tensor, - speech_lengths: torch.Tensor,): - """Encoder - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - Returns: - encoder_out: B x T x F - encoder_out_lens: B - ctc_log_probs: B x T x V - beam_log_probs: B x T x beam_size - beam_log_probs_idx: B x T x beam_size - """ - encoder_out, encoder_mask = self.encoder(speech, - speech_lengths, - -1, -1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_log_probs = self.ctc.log_softmax(encoder_out) - encoder_out_lens = encoder_out_lens.int() - beam_log_probs, beam_log_probs_idx = torch.topk( - ctc_log_probs, self.beam_size, dim=2) - return encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx - - -class StreamingEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size, transformer=False): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.transformer = transformer - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoder.encoders): - xs, _, new_att_cache, new_cnn_cache = layer( - xs, masks, pos_emb, - att_cache=att_cache[i], - cnn_cache=cnn_cache[i]) - # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (B, hidden-dim, cache_t2) - r_att_cache.append( - new_att_cache[:, :, next_cache_start:, :].unsqueeze(1)) - if not self.transformer: - r_cnn_cache.append(new_cnn_cache.unsqueeze(1)) - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - if not self.transformer: - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingSqueezeformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.reduce_idx = model.encoder.reduce_idx - self.recover_idx = model.encoder.recover_idx - if self.reduce_idx is None: - self.time_reduce = None - else: - if self.recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - 
recover_exp)) - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - elayers, cache_size = att_cache.size(0), att_cache.size(3) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = att_mask[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.encoder.preln(xs) - for i, layer in enumerate(self.encoder.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append( - (xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.encoder.time_reduction_layer( - xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - if self.encoder.pos_enc_layer_type == "rel_pos_repaired": - pos_emb = 
pos_emb[:, :xs.size(1) * 2 - 1, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.encoder.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(1) - cached_att = cached_att.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1)) - r_cnn_cache.append(cached_cnn) - - chunk_out = xs - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingEfficientConformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - - # Efficient Conformer - self.stride_layer_idx = model.encoder.stride_layer_idx - self.stride = model.encoder.stride - self.num_blocks = model.encoder.num_blocks - self.cnn_module_kernel = model.encoder.cnn_module_kernel - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - chunk_lens (torch.Tensor): - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * 
num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) # (b, ) - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - # Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2) - # Shape(cnn_cache): (elayers, b, outsize, cnn_kernel) - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = chunk_mask.to(torch.bool) - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoder.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - # We propose to double the chunk_size. 
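# ---------------------------------------------------------------------------
# Editorial sketch (not part of the original wenet sources): the docstrings
# above describe the streaming cache layout. For the first chunk of a request
# the caches start empty (all zeros) and the chunk itself is sized from the
# subsampling context. The hyper-parameters below are illustrative values
# only (4x conv subsampling with right_context 6, 80-dim fbank features),
# not values read from a specific config:
import torch

b, elayers, head, hidden_dim = 4, 12, 4, 256
chunk_size, left_chunks = 16, 4
subsampling_rate, right_context = 4, 6
cnn_kernel, feat_dim = 15, 80

required_cache_size = chunk_size * left_chunks                 # cache_t1
d_k = hidden_dim // head
decoding_window = (chunk_size - 1) * subsampling_rate + right_context + 1  # = 67

chunk_xs = torch.randn(b, decoding_window, feat_dim)           # (b, time, mel-dim)
att_cache = torch.zeros(b, elayers, head, required_cache_size, d_k * 2)
cnn_cache = torch.zeros(b, elayers, hidden_dim, cnn_kernel - 1)  # cache_t2
cache_mask = torch.zeros(b, 1, required_cache_size)            # nothing cached yet
offset = torch.zeros(b, 1, dtype=torch.int64)                  # per-request offset
# ---------------------------------------------------------------------------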
- att_cache_trunc = xs.size(1) + \ - att_cache.size(3) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i][:, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [batch, 1, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID - - # use repeat_interleave to new_att_cache - # new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - new_att_cache = new_att_cache.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :].unsqueeze(1)) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - # shape of r_att_cache: (b, elayers, head, time2, outdim) - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - # shape of r_cnn_cache: (b, elayers, outdim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - chunk_out_lens += 1 - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class Decoder(torch.nn.Module): - def __init__(self, - decoder: TransformerDecoder, - ctc_weight: float = 0.5, - reverse_weight: float = 0.0, - beam_size: int = 10, - decoder_fastertransformer: bool = False): - super().__init__() - self.decoder = decoder - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - self.beam_size = beam_size - self.decoder_fastertransformer = decoder_fastertransformer - - def forward(self, - encoder_out: torch.Tensor, - encoder_lens: torch.Tensor, - hyps_pad_sos_eos: torch.Tensor, - hyps_lens_sos: torch.Tensor, - r_hyps_pad_sos_eos: torch.Tensor, - ctc_score: torch.Tensor): - """Encoder - Args: - encoder_out: B x T x F - encoder_lens: B - hyps_pad_sos_eos: B x beam x (T2+1), - hyps with sos & eos and padded by ignore id - hyps_lens_sos: B x beam, length for each hyp with sos - r_hyps_pad_sos_eos: B 
x beam x (T2+1), - reversed hyps with sos & eos and padded by ignore id - ctc_score: B x beam, ctc score for each hyp - Returns: - decoder_out: B x beam x T2 x V - r_decoder_out: B x beam x T2 x V - best_index: B - """ - B, T, F = encoder_out.shape - bz = self.beam_size - B2 = B * bz - encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F) - encoder_mask = ~make_pad_mask(encoder_lens, T).unsqueeze(1) - encoder_mask = encoder_mask.repeat(1, bz, 1).view(B2, 1, T) - T2 = hyps_pad_sos_eos.shape[2] - 1 - hyps_pad = hyps_pad_sos_eos.view(B2, T2 + 1) - hyps_lens = hyps_lens_sos.view(B2,) - hyps_pad_sos = hyps_pad[:, :-1].contiguous() - hyps_pad_eos = hyps_pad[:, 1:].contiguous() - - r_hyps_pad = r_hyps_pad_sos_eos.view(B2, T2 + 1) - r_hyps_pad_sos = r_hyps_pad[:, :-1].contiguous() - r_hyps_pad_eos = r_hyps_pad[:, 1:].contiguous() - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad_sos, hyps_lens, r_hyps_pad_sos, - self.reverse_weight) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - V = decoder_out.shape[-1] - decoder_out = decoder_out.view(B2, T2, V) - mask = ~make_pad_mask(hyps_lens, T2) # B2 x T2 - # mask index, remove ignore id - index = torch.unsqueeze(hyps_pad_eos * mask, 2) - score = decoder_out.gather(2, index).squeeze(2) # B2 X T2 - # mask padded part - score = score * mask - decoder_out = decoder_out.view(B, bz, T2, V) - if self.reverse_weight > 0: - r_decoder_out = torch.nn.functional.log_softmax( - r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.view(B2, T2, V) - index = torch.unsqueeze(r_hyps_pad_eos * mask, 2) - r_score = r_decoder_out.gather(2, index).squeeze(2) - r_score = r_score * mask - score = score * (1 - self.reverse_weight) + \ - self.reverse_weight * r_score - r_decoder_out = r_decoder_out.view(B, bz, T2, V) - score = torch.sum(score, axis=1) # B2 - score = torch.reshape(score, (B, bz)) + self.ctc_weight * ctc_score - best_index = torch.argmax(score, dim=1) - if self.decoder_fastertransformer: - return decoder_out, best_index - else: - return best_index - - -def to_numpy(tensors): - out = [] - if type(tensors) == torch.tensor: - tensors = [tensors] - for tensor in tensors: - if tensor.requires_grad: - tensor = tensor.detach().cpu().numpy() - else: - tensor = tensor.cpu().numpy() - out.append(tensor) - return out - - -def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True): - for a, b in zip(xlist, blist): - try: - torch.testing.assert_allclose(a, b, rtol=rtol, atol=atol) - except AssertionError as error: - if tolerate_small_mismatch: - print(error) - else: - raise - - -def export_offline_encoder(model, configs, args, logger, encoder_onnx_path): - bz = 32 - seq_len = 100 - beam_size = args.beam_size - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint( - low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - dynamic_axes={ - 'speech': {0: 'B', 1: 'T'}, - 'speech_lengths': {0: 'B'}, - 'encoder_out': {0: 'B', 1: 'T_OUT'}, - 'encoder_out_lens': {0: 'B'}, - 'ctc_log_probs': {0: 'B', 1: 'T_OUT'}, - 'beam_log_probs': {0: 'B', 1: 
'T_OUT'}, - 'beam_log_probs_idx': {0: 'B', 1: 'T_OUT'}, - }, - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - -def export_offline_encoder_static(model, configs, args, logger, encoder_onnx_path): - bz = args.batch_size - seq_len = args.seq_len - beam_size = args.beam_size - - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint(low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - import os - file_name, file_ext = os.path.splitext(encoder_onnx_path) - encoder_onnx_path = file_name + "_bs" + str(bz) + "_seq" + str(seq_len) + "_static.onnx" - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CPUExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - - -def export_online_encoder(model, configs, args, logger, encoder_onnx_path): - decoding_chunk_size = args.decoding_chunk_size - subsampling = model.encoder.embed.subsampling_rate - context = model.encoder.embed.right_context + 1 - decoding_window = (decoding_chunk_size - 1) * subsampling + context - batch_size = 32 - audio_len = decoding_window - feature_size = configs["input_dim"] - output_size = configs["encoder_conf"]["output_size"] - num_layers = configs["encoder_conf"]["num_blocks"] - # in transformer the cnn module will not be available - transformer = False - cnn_module_kernel = configs["encoder_conf"].get("cnn_module_kernel", 1) - 1 - if not cnn_module_kernel: - transformer = True - num_decoding_left_chunks = args.num_decoding_left_chunks - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if configs['encoder'] == 'squeezeformer': - encoder = StreamingSqueezeformerEncoder( - model, required_cache_size, args.beam_size) - elif configs['encoder'] == 'efficientConformer': - encoder = StreamingEfficientConformerEncoder( - model, required_cache_size, args.beam_size) - else: - encoder = StreamingEncoder( - model, required_cache_size, args.beam_size, transformer) - encoder.eval() - - # begin to export encoder - chunk_xs = 
torch.randn(batch_size, audio_len, - feature_size, dtype=torch.float32) - chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len - - offset = torch.arange(0, batch_size).unsqueeze(1) - # (elayers, b, head, cache_t1, d_k * 2) - head = configs["encoder_conf"]["attention_heads"] - d_k = configs["encoder_conf"]["output_size"] // head - att_cache = torch.randn(batch_size, num_layers, head, - required_cache_size, d_k * 2, - dtype=torch.float32) - cnn_cache = torch.randn(batch_size, num_layers, output_size, - cnn_module_kernel, dtype=torch.float32) - - cache_mask = torch.ones( - batch_size, 1, required_cache_size, dtype=torch.float32) - input_names = ['chunk_xs', 'chunk_lens', 'offset', - 'att_cache', 'cnn_cache', 'cache_mask'] - output_names = ['log_probs', 'log_probs_idx', 'chunk_out', - 'chunk_out_lens', 'r_offset', 'r_att_cache', - 'r_cnn_cache', 'r_cache_mask'] - input_tensors = (chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - output_names.pop(6) - - all_names = input_names + output_names - dynamic_axes = {} - for name in all_names: - # only the first dimension is dynamic - # all other dimension is fixed - dynamic_axes[name] = {0: 'B'} - - torch.onnx.export(encoder, - input_tensors, - encoder_onnx_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - verbose=False) - - with torch.no_grad(): - torch_outs = encoder(chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - torch_outs = list(torch_outs).pop(6) - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=["CUDAExecutionProvider"]) - ort_inputs = {} - - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - if transformer: - del ort_inputs['cnn_cache'] - ort_outs = ort_session.run(None, ort_inputs) - test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx streaming encoder succeed!") - onnx_config = { - "subsampling_rate": subsampling, - "context": context, - "decoding_chunk_size": decoding_chunk_size, - "num_decoding_left_chunks": num_decoding_left_chunks, - "beam_size": args.beam_size, - "fp16": args.fp16, - "feat_size": feature_size, - "decoding_window": decoding_window, - "cnn_module_kernel_cache": cnn_module_kernel - } - return onnx_config - - -def export_rescoring_decoder(model, configs, args, - logger, decoder_onnx_path, decoder_fastertransformer): - bz, seq_len = 32, 100 - beam_size = args.beam_size - decoder = Decoder(model.decoder, - model.ctc_weight, - model.reverse_weight, - beam_size, - decoder_fastertransformer) - decoder.eval() - - hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - hyps_lens_sos = torch.randint(low=3, high=seq_len, size=(bz, beam_size), - dtype=torch.int32) - r_hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - - output_size = configs["encoder_conf"]["output_size"] - encoder_out = torch.randn(bz, seq_len, output_size, dtype=torch.float32) - encoder_out_lens = torch.randint( - low=3, high=seq_len, size=(bz,), dtype=torch.int32) - ctc_score = torch.randn(bz, beam_size, dtype=torch.float32) - - input_names = ['encoder_out', 'encoder_out_lens', - 'hyps_pad_sos_eos', 'hyps_lens_sos', - 'r_hyps_pad_sos_eos', 'ctc_score'] - output_names = ['best_index'] - if decoder_fastertransformer: - output_names.insert(0, 'decoder_out') - - 
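# ---------------------------------------------------------------------------
# Editorial sketch (not part of the original wenet sources): the Decoder
# wrapper defined earlier ranks the beam hypotheses by combining left-to-right
# and right-to-left attention scores with the CTC prefix score. Stripped of
# the padding and gather bookkeeping, the final selection reduces to the
# weighted sum below (function name and weights are illustrative):
import torch

def pick_best(attn_score, r_attn_score, ctc_score,
              reverse_weight=0.3, ctc_weight=0.5):
    """attn_score, r_attn_score, ctc_score: (batch, beam) summed log-probs."""
    score = (1.0 - reverse_weight) * attn_score + reverse_weight * r_attn_score
    score = score + ctc_weight * ctc_score
    return torch.argmax(score, dim=1)        # best hypothesis index per utterance
# ---------------------------------------------------------------------------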
torch.onnx.export(decoder, - (encoder_out, encoder_out_lens, - hyps_pad_sos_eos, hyps_lens_sos, - r_hyps_pad_sos_eos, ctc_score), - decoder_onnx_path, - export_params=True, - opset_version=13, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes={'encoder_out': {0: 'B', 1: 'T'}, - 'encoder_out_lens': {0: 'B'}, - 'hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'hyps_lens_sos': {0: 'B'}, - 'r_hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'ctc_score': {0: 'B'}, - 'best_index': {0: 'B'}, - }, - verbose=False - ) - with torch.no_grad(): - o0 = decoder(encoder_out, - encoder_out_lens, - hyps_pad_sos_eos, - hyps_lens_sos, - r_hyps_pad_sos_eos, - ctc_score) - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(decoder_onnx_path, - providers=providers) - - input_tensors = [encoder_out, encoder_out_lens, hyps_pad_sos_eos, - hyps_lens_sos, r_hyps_pad_sos_eos, ctc_score] - ort_inputs = {} - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - - # if model.reverse weight == 0, - # the r_hyps_pad will be removed - # from the onnx decoder since it doen't play any role - if model.reverse_weight == 0: - del ort_inputs['r_hyps_pad_sos_eos'] - ort_outs = ort_session.run(None, ort_inputs) - - # check decoder output - if decoder_fastertransformer: - test(to_numpy(o0), ort_outs, rtol=1e-03, atol=1e-05) - else: - test(to_numpy([o0]), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx decoder succeed!") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='export x86_gpu model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--cmvn_file', required=False, default='', type=str, - help='global_cmvn file, default path is in config file') - parser.add_argument('--reverse_weight', default=-1.0, type=float, - required=False, - help='reverse weight for bitransformer,' + - 'default value is in config file') - parser.add_argument('--ctc_weight', default=-1.0, type=float, - required=False, - help='ctc weight, default value is in config file') - parser.add_argument('--batch_size', type=int, default=24, help='encoder batch size') - parser.add_argument('--seq_len', default=512, type=int, required=False, - help="Encoder seq_len") - parser.add_argument('--beam_size', default=10, type=int, required=False, - help="beam size would be ctc output size") - parser.add_argument('--output_onnx_dir', - default="onnx_model", - help='output onnx encoder and decoder directory') - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - # arguments for streaming encoder - parser.add_argument('--streaming', - action='store_true', - help="whether to export streaming encoder, default false") - parser.add_argument('--decoding_chunk_size', - default=16, - type=int, - required=False, - help='the decoding chunk size, <=0 is not supported') - parser.add_argument('--num_decoding_left_chunks', - default=5, - type=int, - required=False, - help="number of left chunks, <= 0 is not supported") - parser.add_argument('--decoder_fastertransformer', - action='store_true', - help='return decoder_out and best_index for ft') - args = parser.parse_args() - - torch.manual_seed(0) - torch.set_printoptions(precision=10) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if 
args.cmvn_file and os.path.exists(args.cmvn_file): - configs['cmvn_file'] = args.cmvn_file - if args.reverse_weight != -1.0 and 'reverse_weight' in configs['model_conf']: - configs['model_conf']['reverse_weight'] = args.reverse_weight - print("Update reverse weight to", args.reverse_weight) - if args.ctc_weight != -1: - print("Update ctc weight to ", args.ctc_weight) - configs['model_conf']['ctc_weight'] = args.ctc_weight - configs["encoder_conf"]["use_dynamic_chunk"] = False - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - - if not os.path.exists(args.output_onnx_dir): - os.mkdir(args.output_onnx_dir) - encoder_onnx_path = os.path.join(args.output_onnx_dir, 'encoder.onnx') - export_enc_func = None - if args.streaming: - assert args.decoding_chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - export_enc_func = export_online_encoder - else: - export_enc_func = export_offline_encoder_static - - onnx_config = export_enc_func( - model, configs, args, logger, encoder_onnx_path) - - decoder_onnx_path = os.path.join(args.output_onnx_dir, 'decoder.onnx') - export_rescoring_decoder(model, configs, args, logger, - decoder_onnx_path, args.decoder_fastertransformer) - - if args.fp16: - try: - import onnxmltools - from onnxmltools.utils.float16_converter import convert_float_to_float16 - except ImportError: - print('Please install onnxmltools!') - sys.exit(1) - encoder_onnx_model = onnxmltools.utils.load_model(encoder_onnx_path) - encoder_onnx_model = convert_float_to_float16(encoder_onnx_model) - encoder_onnx_path = os.path.join( - args.output_onnx_dir, 'encoder_fp16.onnx') - onnxmltools.utils.save_model(encoder_onnx_model, encoder_onnx_path) - decoder_onnx_model = onnxmltools.utils.load_model(decoder_onnx_path) - decoder_onnx_model = convert_float_to_float16(decoder_onnx_model) - decoder_onnx_path = os.path.join( - args.output_onnx_dir, 'decoder_fp16.onnx') - onnxmltools.utils.save_model(decoder_onnx_model, decoder_onnx_path) - # dump configurations - - config_dir = os.path.join(args.output_onnx_dir, "config.yaml") - with open(config_dir, "w") as out: - yaml.dump(onnx_config, out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/recognize.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/recognize.py deleted file mode 100644 index 03b5dfd42cc098efacd20e08756a5300f6477cc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/recognize.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--beam_size', - type=int, - default=10, - help='beam size for search') - parser.add_argument('--penalty', - type=float, - default=0.0, - help='length penalty') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=16, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'attention', 'ctc_greedy_search', - 'ctc_prefix_beam_search', 'attention_rescoring', - 'rnnt_greedy_search', 'rnnt_beam_search', - 'rnnt_beam_attn_rescoring', 'ctc_beam_td_attn_rescoring', - 'hlg_onebest', 'hlg_rescore' - ], - default='attention', - help='decoding mode') - - parser.add_argument('--search_ctc_weight', - type=float, - default=1.0, - help='ctc weight for nbest generation') - parser.add_argument('--search_transducer_weight', - type=float, - default=0.0, - help='transducer weight for nbest generation') - parser.add_argument('--ctc_weight', - type=float, - default=0.0, - help='ctc weight for rescoring weight in \ - attention rescoring decode mode \ - ctc weight for rescoring weight in \ - transducer attention rescore decode mode') - - parser.add_argument('--transducer_weight', - type=float, - default=0.0, - help='transducer weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--attn_weight', - type=float, - default=0.0, - help='attention weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--decoding_chunk_size', - type=int, - default=-1, - help='''decoding chunk size, - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here''') - parser.add_argument('--num_decoding_left_chunks', - type=int, - default=-1, - help='number of left chunks for decoding') - parser.add_argument('--simulate_streaming', - action='store_true', - help='simulate streaming inference') - parser.add_argument('--reverse_weight', - type=float, - default=0.0, - help='''right to left weight for attention rescoring - decode mode''') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--connect_symbol', - default='', - type=str, - help='used to connect the output characters') - - parser.add_argument('--word', - default='', - type=str, - help='word file, only used for hlg decode') - parser.add_argument('--hlg', - default='', - type=str, - help='hlg file, only used for hlg decode') - parser.add_argument('--lm_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--r_decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' - ] and args.batch_size > 1: - logging.fatal( - 'decoding mode {} must be running with batch_size == 1'.format( - args.mode)) - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_sub'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - if 'fbank_conf' in test_conf: - test_conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in test_conf: - test_conf['mfcc_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - # Load dict - char_dict = {v: k for k, v in symbol_table.items()} - eos = len(char_dict) - 1 - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w') as fout: - for batch_idx, 
batch in enumerate(test_data_loader): - keys, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - if args.mode == 'attention': - hyps, _ = model.recognize( - feats, - feats_lengths, - beam_size=args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp.tolist() for hyp in hyps] - elif args.mode == 'ctc_greedy_search': - hyps, _ = model.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_greedy_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_beam_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.beam_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.search_ctc_weight, - transducer_weight=args.search_transducer_weight) - elif args.mode == 'rnnt_beam_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight) - elif args.mode == 'ctc_beam_td_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight, - beam_search_type='ctc') - # ctc_prefix_beam_search and attention_rescoring only return one - # result in List[int], change it to List[List[int]] for compatible - # with other batch decoding mode - elif args.mode == 'ctc_prefix_beam_search': - assert (feats.size(0) == 1) - hyp, _ = model.ctc_prefix_beam_search( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp] - elif args.mode == 'attention_rescoring': - assert (feats.size(0) == 1) - hyp, _ = model.attention_rescoring( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - 
ctc_weight=args.ctc_weight, - simulate_streaming=args.simulate_streaming, - reverse_weight=args.reverse_weight) - hyps = [hyp] - elif args.mode == 'hlg_onebest': - hyps = model.hlg_onebest( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - elif args.mode == 'hlg_rescore': - hyps = model.hlg_rescore( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - lm_scale=args.lm_scale, - decoder_scale=args.decoder_scale, - r_decoder_scale=args.r_decoder_scale, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - for i, key in enumerate(keys): - content = [] - for w in hyps[i]: - if w == eos: - break - content.append(char_dict[w]) - logging.info('{} {}'.format(key, args.connect_symbol.join(content))) - fout.write('{} {}\n'.format(key, args.connect_symbol.join(content))) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/recognize_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/recognize_onnx_gpu.py deleted file mode 100644 index 42f403bf55ac0bc51d9c754574d3479345948122..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/recognize_onnx_gpu.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is for testing exported onnx encoder and decoder from -export_onnx_gpu.py. The exported onnx models only support batch offline ASR inference. -It requires a python wrapped c++ ctc decoder. 
-Please install it by following: -https://github.com/Slyne/ctc_decoder.git -""" -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.common import IGNORE_ID -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.config import override_config - -import onnxruntime as rt -import multiprocessing -import numpy as np - -try: - from swig_decoders import map_batch, \ - ctc_beam_search_decoder_batch, \ - TrieVector, PathTrie -except ImportError: - print('Please install ctc decoders first by refering to\n' + - 'https://github.com/Slyne/ctc_decoder.git') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--encoder_onnx', required=True, help='encoder onnx file') - parser.add_argument('--decoder_onnx', required=True, help='decoder onnx file') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=32, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'ctc_greedy_search', 'ctc_prefix_beam_search', - 'attention_rescoring'], - default='attention_rescoring', - help='decoding mode') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - reverse_weight = configs["model_conf"].get("reverse_weight", 0.0) - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - test_conf['fbank_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - 
- # Init asr model from configs - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - if use_cuda: - EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - else: - EP_list = ['CPUExecutionProvider'] - - encoder_ort_session = rt.InferenceSession(args.encoder_onnx, providers=EP_list) - decoder_ort_session = None - if args.mode == "attention_rescoring": - decoder_ort_session = rt.InferenceSession(args.decoder_onnx, providers=EP_list) - - # Load dict - vocabulary = [] - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - vocabulary.append(arr[0]) - eos = sos = len(char_dict) - 1 - with torch.no_grad(), open(args.result_file, 'w') as fout: - for _, batch in enumerate(test_data_loader): - keys, feats, _, feats_lengths, _ = batch - feats, feats_lengths = feats.numpy(), feats_lengths.numpy() - if args.fp16: - feats = feats.astype(np.float16) - ort_inputs = { - encoder_ort_session.get_inputs()[0].name: feats, - encoder_ort_session.get_inputs()[1].name: feats_lengths} - ort_outs = encoder_ort_session.run(None, ort_inputs) - encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx = ort_outs - beam_size = beam_log_probs.shape[-1] - batch_size = beam_log_probs.shape[0] - num_processes = min(multiprocessing.cpu_count(), batch_size) - if args.mode == 'ctc_greedy_search': - if beam_size != 1: - log_probs_idx = beam_log_probs_idx[:, :, 0] - batch_sents = [] - for idx, seq in enumerate(log_probs_idx): - batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) - hyps = map_batch(batch_sents, vocabulary, num_processes, - True, 0) - elif args.mode in ('ctc_prefix_beam_search', "attention_rescoring"): - batch_log_probs_seq_list = beam_log_probs.tolist() - batch_log_probs_idx_list = beam_log_probs_idx.tolist() - batch_len_list = encoder_out_lens.tolist() - batch_log_probs_seq = [] - batch_log_probs_ids = [] - batch_start = [] # only effective in streaming deployment - batch_root = TrieVector() - root_dict = {} - for i in range(len(batch_len_list)): - num_sent = batch_len_list[i] - batch_log_probs_seq.append( - batch_log_probs_seq_list[i][0:num_sent]) - batch_log_probs_ids.append( - batch_log_probs_idx_list[i][0:num_sent]) - root_dict[i] = PathTrie() - batch_root.append(root_dict[i]) - batch_start.append(True) - score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq, - batch_log_probs_ids, - batch_root, - batch_start, - beam_size, - num_processes, - 0, -2, 0.99999) - if args.mode == 'ctc_prefix_beam_search': - hyps = [] - for cand_hyps in score_hyps: - hyps.append(cand_hyps[0][1]) - hyps = map_batch(hyps, vocabulary, num_processes, False, 0) - if args.mode == 'attention_rescoring': - ctc_score, all_hyps = [], [] - max_len = 0 - for hyps in score_hyps: - cur_len = len(hyps) - if len(hyps) < beam_size: - hyps += (beam_size - cur_len) * [(-float("INF"), (0,))] - cur_ctc_score = [] - for hyp in hyps: - cur_ctc_score.append(hyp[0]) - all_hyps.append(list(hyp[1])) - if len(hyp[1]) > max_len: - max_len = len(hyp[1]) - ctc_score.append(cur_ctc_score) - if args.fp16: - ctc_score = np.array(ctc_score, dtype=np.float16) - else: - ctc_score = np.array(ctc_score, dtype=np.float32) - hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - r_hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32) - k = 0 - for i in 
range(batch_size): - for j in range(beam_size): - cand = all_hyps[k] - l = len(cand) + 2 - hyps_pad_sos_eos[i][j][0:l] = [sos] + cand + [eos] - r_hyps_pad_sos_eos[i][j][0:l] = [sos] + cand[::-1] + [eos] - hyps_lens_sos[i][j] = len(cand) + 1 - k += 1 - decoder_ort_inputs = { - decoder_ort_session.get_inputs()[0].name: encoder_out, - decoder_ort_session.get_inputs()[1].name: encoder_out_lens, - decoder_ort_session.get_inputs()[2].name: hyps_pad_sos_eos, - decoder_ort_session.get_inputs()[3].name: hyps_lens_sos, - decoder_ort_session.get_inputs()[-1].name: ctc_score} - if reverse_weight > 0: - r_hyps_pad_sos_eos_name = decoder_ort_session.get_inputs()[4].name - decoder_ort_inputs[r_hyps_pad_sos_eos_name] = r_hyps_pad_sos_eos - best_index = decoder_ort_session.run(None, decoder_ort_inputs)[0] - best_sents = [] - k = 0 - for idx in best_index: - cur_best_sent = all_hyps[k: k + beam_size][idx] - best_sents.append(cur_best_sent) - k += beam_size - hyps = map_batch(best_sents, vocabulary, num_processes) - - for i, key in enumerate(keys): - content = hyps[i] - logging.info('{} {}'.format(key, content)) - fout.write('{} {}\n'.format(key, content)) - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/train.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/train.py deleted file mode 100644 index 70799b60790b31d73911770891f519f5473e2f4b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/bin/train.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os - -import torch -import torch.distributed as dist -import torch.optim as optim -import yaml -from tensorboardX import SummaryWriter -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import (load_checkpoint, save_checkpoint, - load_trained_modules) -from wenet.utils.executor import Executor -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.scheduler import WarmupLR, NoamHoldAnnealing -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--train_data', required=True, help='train data file') - parser.add_argument('--cv_data', required=True, help='cv data file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--model_dir', required=True, help='save model dir') - parser.add_argument('--checkpoint', help='checkpoint model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='tensorboard log dir') - parser.add_argument('--ddp.rank', - dest='rank', - default=0, - type=int, - help='global rank for distributed training') - parser.add_argument('--ddp.world_size', - dest='world_size', - default=-1, - type=int, - help='''number of total processes/gpus for - distributed training''') - parser.add_argument('--ddp.dist_backend', - dest='dist_backend', - default='nccl', - choices=['nccl', 'gloo'], - help='distributed backend') - parser.add_argument('--ddp.init_method', - dest='init_method', - default=None, - help='ddp init method') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for reading') - parser.add_argument('--pin_memory', - action='store_true', - default=False, - help='Use pinned memory buffers used for reading') - parser.add_argument('--use_amp', - action='store_true', - default=False, - help='Use automatic mixed precision training') - parser.add_argument('--fp16_grad_sync', - action='store_true', - default=False, - help='Use fp16 gradient sync for ddp') - parser.add_argument('--cmvn', default=None, help='global cmvn file') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. 
One symbol per line.") - parser.add_argument('--prefetch', - default=100, - type=int, - help='prefetch number') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument("--enc_init", - default=None, - type=str, - help="Pre-trained model to initialize encoder") - parser.add_argument("--enc_init_mods", - default="encoder.", - type=lambda s: [str(mod) for mod in s.split(",") if s != ""], - help="List of encoder modules \ - to initialize ,separated by a comma") - - - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Set random seed - torch.manual_seed(777) - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - distributed = args.world_size > 1 - if distributed: - logging.info('training on multiple gpus, this gpu {}'.format(args.gpu)) - dist.init_process_group(args.dist_backend, - init_method=args.init_method, - world_size=args.world_size, - rank=args.rank) - - symbol_table = read_symbol_table(args.symbol_table) - - train_conf = configs['dataset_conf'] - cv_conf = copy.deepcopy(train_conf) - cv_conf['speed_perturb'] = False - cv_conf['spec_aug'] = False - cv_conf['spec_sub'] = False - cv_conf['spec_trim'] = False - cv_conf['shuffle'] = False - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - train_dataset = Dataset(args.data_type, args.train_data, symbol_table, - train_conf, args.bpe_model, non_lang_syms, True) - cv_dataset = Dataset(args.data_type, - args.cv_data, - symbol_table, - cv_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - train_data_loader = DataLoader(train_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - cv_data_loader = DataLoader(cv_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - - if 'fbank_conf' in configs['dataset_conf']: - input_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - else: - input_dim = configs['dataset_conf']['mfcc_conf']['num_mel_bins'] - vocab_size = len(symbol_table) - - # Save configs to model_dir/train.yaml for inference and export - configs['input_dim'] = input_dim - configs['output_dim'] = vocab_size - configs['cmvn_file'] = args.cmvn - configs['is_json_cmvn'] = True - if args.rank == 0: - saved_config_path = os.path.join(args.model_dir, 'train.yaml') - with open(saved_config_path, 'w') as fout: - data = yaml.dump(configs) - fout.write(data) - - # Init asr model from configs - model = init_model(configs) - print(model) - num_params = sum(p.numel() for p in model.parameters()) - print('the number of model params: {:,d}'.format(num_params)) - - # !!!IMPORTANT!!! 
- # Try to export the model by script, if fails, we should refine - # the code to satisfy the script export requirements - if args.rank == 0: - script_model = torch.jit.script(model) - script_model.save(os.path.join(args.model_dir, 'init.zip')) - executor = Executor() - # If specify checkpoint, load some info from checkpoint - if args.checkpoint is not None: - infos = load_checkpoint(model, args.checkpoint) - elif args.enc_init is not None: - logging.info('load pretrained encoders: {}'.format(args.enc_init)) - infos = load_trained_modules(model, args) - else: - infos = {} - start_epoch = infos.get('epoch', -1) + 1 - cv_loss = infos.get('cv_loss', 0.0) - step = infos.get('step', -1) - - num_epochs = configs.get('max_epoch', 100) - model_dir = args.model_dir - writer = None - if args.rank == 0: - os.makedirs(model_dir, exist_ok=True) - exp_id = os.path.basename(model_dir) - writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id)) - - if distributed: - assert (torch.cuda.is_available()) - # cuda model is required for nn.parallel.DistributedDataParallel - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, find_unused_parameters=True) - device = torch.device("cuda") - if args.fp16_grad_sync: - from torch.distributed.algorithms.ddp_comm_hooks import ( - default as comm_hooks, - ) - model.register_comm_hook( - state=None, hook=comm_hooks.fp16_compress_hook - ) - else: - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - if configs['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['optim_conf']) - elif configs['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['optim']) - if configs['scheduler'] == 'warmuplr': - scheduler = WarmupLR(optimizer, **configs['scheduler_conf']) - elif configs['scheduler'] == 'NoamHoldAnnealing': - scheduler = NoamHoldAnnealing(optimizer, **configs['scheduler_conf']) - else: - raise ValueError("unknown scheduler: " + configs['scheduler']) - - final_epoch = None - configs['rank'] = args.rank - configs['is_distributed'] = distributed - configs['use_amp'] = args.use_amp - if start_epoch == 0 and args.rank == 0: - save_model_path = os.path.join(model_dir, 'init.pt') - save_checkpoint(model, save_model_path) - - # Start training loop - executor.step = step - scheduler.set_step(step) - # used for pytorch amp mixed precision training - scaler = None - if args.use_amp: - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(start_epoch, num_epochs): - train_dataset.set_epoch(epoch) - configs['epoch'] = epoch - lr = optimizer.param_groups[0]['lr'] - logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr)) - executor.train(model, optimizer, scheduler, train_data_loader, device, - writer, configs, scaler) - total_loss, num_seen_utts = executor.cv(model, cv_data_loader, device, - configs) - cv_loss = total_loss / num_seen_utts - - logging.info('Epoch {} CV info cv_loss {}'.format(epoch, cv_loss)) - if args.rank == 0: - save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch)) - save_checkpoint( - model, save_model_path, { - 'epoch': epoch, - 'lr': lr, - 'cv_loss': cv_loss, - 'step': executor.step - }) - writer.add_scalar('epoch/cv_loss', cv_loss, epoch) - writer.add_scalar('epoch/lr', lr, epoch) - final_epoch = epoch - - if final_epoch is not None and args.rank == 0: - final_model_path = os.path.join(model_dir, 'final.pt') 
- os.remove(final_model_path) if os.path.exists(final_model_path) else None - os.symlink('{}.pt'.format(final_epoch), final_model_path) - writer.close() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/dataset/kaldi_io.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/dataset/kaldi_io.py deleted file mode 100644 index c9bef293c93d882147bb5b738e1fc49a7a19a484..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/dataset/kaldi_io.py +++ /dev/null @@ -1,666 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. - """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' 
% (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. - """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int_scp(file_or_fd): - """ generator(key,vec) = read_vec_int_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_int_scp(file): - ... 
- - Read scp to a 'dictionary': - d = { key:vec for key,mat in kaldi_io.read_vec_int_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_int(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! - # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. 
- - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. 
- - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! - if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Mapping for percentiles in col-headers, - def uint16_to_float(value, min, range): - return np.float32(min + range * 1.52590218966964e-05 * value) - - # Mapping for matrix elements, - def uint8_to_float_v2(vec, p0, p25, p75, p100): - # Split the vector by masks, - mask_0_64 = (vec <= 64); - mask_193_255 = (vec > 192); - mask_65_192 = (~(mask_0_64 | mask_193_255)); - # Sanity check (useful but slow...), - # assert(len(vec) == np.sum(np.hstack([mask_0_64,mask_65_192,mask_193_255]))) - # assert(len(vec) == np.sum(np.any([mask_0_64,mask_65_192,mask_193_255], axis=0))) - # Build the float vector, - ans = np.empty(len(vec), dtype='float32') - ans[mask_0_64] = p0 + (p25 - p0) / 64. * vec[mask_0_64] - ans[mask_65_192] = p25 + (p75 - p25) / 128. * (vec[mask_65_192] - 64) - ans[mask_193_255] = p75 + (p100 - p75) / 63. 
* (vec[mask_193_255] - 192) - return ans - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.empty((cols,rows), dtype='float32') - for i, col_header in enumerate(col_headers): - col_header_flt = [ uint16_to_float(percentile, globmin, globrange) for percentile in col_header ] - mat[i] = uint8_to_float_v2(data[i], *col_header_flt) - - return mat.T # transpose! col-major -> row-major, - -def write_ark_scp(key, mat, ark_fout, scp_out): - mat_offset = write_mat(ark_fout, mat, key) - scp_line = '{}\t{}:{}'.format(key, ark_fout.name, mat_offset) - scp_out.write(scp_line) - scp_out.write('\n') - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. - - Example of writing single matrix: - kaldi_io.write_mat(filename, mat) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,mat in dict.iteritems(): - kaldi_io.write_mat(f, mat, key=key) - """ - mat_offset = 0 - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - mat_offset = fd.tell() - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if m.dtype == 'float32': fd.write('FM '.encode()) - elif m.dtype == 'float64': fd.write('DM '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) - # Dims, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols - # Data, - fd.write(m.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - return mat_offset - -################################################# -# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) -# Corresponds to: vector > > -# - outer vector: time axis -# - inner vector: records at the time -# - tuple: int = index, float = value -# - -def read_cnet_ark(file_or_fd): - """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ - return read_post_ark(file_or_fd) - -def read_post_ark(file_or_fd): - """ generator(key,vec>) = read_post_ark(file) - Returns generator of (key,posterior) tuples, read from ark file. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Iterate the ark: - for key,post in kaldi_io.read_post_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - post = read_post(fd) - yield key, post - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_post(file_or_fd): - """ [post] = read_post(file_or_fd) - Reads single kaldi 'Posterior' in binary format. 
- - The 'Posterior' is C++ type 'vector > >', - the outer-vector is usually time axis, inner-vector are the records - at given time, and the tuple is composed of an 'index' (integer) - and a 'float-value'. The 'float-value' can represent a probability - or any other numeric value. - - Returns vector of vectors of tuples. - """ - fd = open_or_fd(file_or_fd) - ans=[] - binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - # Loop over 'outer-vector', - for i in range(outer_vec_size): - assert(fd.read(1).decode() == '\4'); # int-size - inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) - data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) - assert(data[0]['size_idx'] == 4) - assert(data[0]['size_post'] == 4) - ans.append(data[['idx','post']].tolist()) - - if fd is not file_or_fd: fd.close() - return ans - - -################################################# -# Kaldi Confusion Network bin begin/end times, -# (kaldi stores CNs time info separately from the Posterior). -# - -def read_cntime_ark(file_or_fd): - """ generator(key,vec>) = read_cntime_ark(file_or_fd) - Returns generator of (key,cntime) tuples, read from ark file. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Iterate the ark: - for key,time in kaldi_io.read_cntime_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:time for key,time in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - cntime = read_cntime(fd) - yield key, cntime - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_cntime(file_or_fd): - """ [cntime] = read_cntime(file_or_fd) - Reads single kaldi 'Confusion Network time info', in binary format: - C++ type: vector >. - (begin/end times of bins at the confusion network). - - Binary layout is ' ...' - - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Returns vector of tuples. 
- """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary - - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) - assert(data[0]['size_beg'] == 4) - assert(data[0]['size_end'] == 4) - ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), - - if fd is not file_or_fd : fd.close() - return ans - - -################################################# -# Segments related, -# - -# Segments as 'Bool vectors' can be handy, -# - for 'superposing' the segmentations, -# - for frame-selection in Speaker-ID experiments, -def read_segments_as_bool_vec(segments_file): - """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) - using kaldi 'segments' file for 1 wav, format : ' ' - - t-beg, t-end is in seconds, - - assumed 100 frames/second, - """ - segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) - # Sanity checks, - assert(len(segs) > 0) # empty segmentation is an error, - assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, - # Convert time to frame-indexes, - start = np.rint([100 * rec[2] for rec in segs]).astype(int) - end = np.rint([100 * rec[3] for rec in segs]).astype(int) - # Taken from 'read_lab_to_bool_vec', htk.py, - frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], - np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) - assert np.sum(end-start) == np.sum(frms) - return frms - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/dataset/wav_distortion.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/dataset/wav_distortion.py deleted file mode 100644 index 2917d3cc6cfb801935cb0885d0c42cd88f1833b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/dataset/wav_distortion.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import sys -import random -import math - -import torchaudio -import torch -torchaudio.set_audio_backend("sox_io") - - -def db2amp(db): - return pow(10, db / 20) - -def amp2db(amp): - return 20 * math.log10(amp) - -def make_poly_distortion(conf): - """Generate a db-domain ploynomial distortion function - - f(x) = a * x^m * (1-x)^n + x - - Args: - conf: a dict {'a': #int, 'm': #int, 'n': #int} - - Returns: - The ploynomial function, which could be applied on - a float amplitude value - """ - a = conf['a'] - m = conf['m'] - n = conf['n'] - - def poly_distortion(x): - abs_x = abs(x) - if abs_x < 0.000001: - x = x - else: - db_norm = amp2db(abs_x) / 100 + 1 - if db_norm < 0: - db_norm = 0 - db_norm = a * pow(db_norm, m) * pow((1 - db_norm), n) + db_norm - if db_norm > 1: - db_norm = 1 - db = (db_norm - 1) * 100 - amp = db2amp(db) - if amp >= 0.9997: - amp = 0.9997 - if x > 0: - x = amp - else: - x = -amp - return x - return poly_distortion - -def make_quad_distortion(): - return make_poly_distortion({'a' : 1, 'm' : 1, 'n' : 1}) - -# the amplitude are set to max for all non-zero point -def make_max_distortion(conf): - """Generate a max distortion function - - Args: - conf: a dict {'max_db': float } - 'max_db': the maxium value. - - Returns: - The max function, which could be applied on - a float amplitude value - """ - max_db = conf['max_db'] - if max_db: - max_amp = db2amp(max_db) # < 0.997 - else: - max_amp = 0.997 - - def max_distortion(x): - if x > 0: - x = max_amp - elif x < 0: - x = -max_amp - else: - x = 0.0 - return x - return max_distortion - - - -def make_amp_mask(db_mask=None): - """Get a amplitude domain mask from db domain mask - - Args: - db_mask: Optional. A list of tuple. if None, using default value. - - Returns: - A list of tuple. The amplitude domain mask - """ - if db_mask is None: - db_mask = [(-110, -95), (-90, -80), (-65, -60), (-50, -30), (-15, 0)] - amp_mask = [(db2amp(db[0]), db2amp(db[1])) for db in db_mask] - return amp_mask - -default_mask = make_amp_mask() - - -def generate_amp_mask(mask_num): - """Generate amplitude domain mask randomly in [-100db, 0db] - - Args: - mask_num: the slot number of the mask - - Returns: - A list of tuple. each tuple defines a slot. - e.g. [(-100, -80), (-65, -60), (-50, -30), (-15, 0)] - for #mask_num = 4 - """ - a = [0] * 2 * mask_num - a[0] = 0 - m = [] - for i in range(1, 2 * mask_num): - a[i] = a[i - 1] + random.uniform(0.5, 1) - max_val = a[2 * mask_num - 1] - for i in range(0, mask_num): - l = ((a[2 * i] - max_val) / max_val) * 100 - r = ((a[2 * i + 1] - max_val) / max_val) * 100 - m.append((l, r)) - return make_amp_mask(m) - - -def make_fence_distortion(conf): - """Generate a fence distortion function - - In this fence-like shape function, the values in mask slots are - set to maxium, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': int,'max_db': float } - 'mask_number': the slot number in mask. - 'max_db': the maxium value. 
- - Returns: - The fence function, which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - max_db = conf['max_db'] - max_amp = db2amp(max_db) # 0.997 - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def fence_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - return x - - return fence_distortion - -# -def make_jag_distortion(conf): - """Generate a jag distortion function - - In this jag-like shape function, the values in mask slots are - not changed, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': #int} - 'mask_number': the slot number in mask. - - Returns: - The jag function,which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def jag_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - return x - - return jag_distortion - -# gaining 20db means amp = amp * 10 -# gaining -20db means amp = amp / 10 -def make_gain_db(conf): - """Generate a db domain gain function - - Args: - conf: a dict {'db': #float} - 'db': the gaining value - - Returns: - The db gain function, which could be applied on - a float amplitude value - """ - db = conf['db'] - - def gain_db(x): - return min(0.997, x * pow(10, db / 20)) - - return gain_db - - -def distort(x, func, rate=0.8): - """Distort a waveform in sample point level - - Args: - x: the origin wavefrom - func: the distort function - rate: sample point-level distort probability - - Returns: - the distorted waveform - """ - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - x[0][i] = func(float(x[0][i])) - return x - -def distort_chain(x, funcs, rate=0.8): - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - for func in funcs: - x[0][i] = func(float(x[0][i])) - return x - -# x is numpy -def distort_wav_conf(x, distort_type, distort_conf, rate=0.1): - if distort_type == 'gain_db': - gain_db = make_gain_db(distort_conf) - x = distort(x, gain_db) - elif distort_type == 'max_distortion': - max_distortion = make_max_distortion(distort_conf) - x = distort(x, max_distortion, rate=rate) - elif distort_type == 'fence_distortion': - fence_distortion = make_fence_distortion(distort_conf) - x = distort(x, fence_distortion, rate=rate) - elif distort_type == 'jag_distortion': - jag_distortion = make_jag_distortion(distort_conf) - x = distort(x, jag_distortion, rate=rate) - elif distort_type == 'poly_distortion': - poly_distortion = make_poly_distortion(distort_conf) - x = distort(x, 
poly_distortion, rate=rate) - elif distort_type == 'quad_distortion': - quad_distortion = make_quad_distortion() - x = distort(x, quad_distortion, rate=rate) - elif distort_type == 'none_distortion': - pass - else: - print('unsupport type') - return x - -def distort_wav_conf_and_save(distort_type, distort_conf, rate, wav_in, wav_out): - x, sr = torchaudio.load(wav_in) - x = x.detach().numpy() - out = distort_wav_conf(x, distort_type, distort_conf, rate) - torchaudio.save(wav_out, torch.from_numpy(out), sr) - -if __name__ == "__main__": - distort_type = sys.argv[1] - wav_in = sys.argv[2] - wav_out = sys.argv[3] - conf = None - rate = 0.1 - if distort_type == 'new_jag_distortion': - conf = {'mask_number' : 4} - elif distort_type == 'new_fence_distortion': - conf = {'mask_number' : 1, 'max_db' : -30} - elif distort_type == 'poly_distortion': - conf = {'a' : 4, 'm' : 2, "n" : 2} - distort_wav_conf_and_save(distort_type, conf, rate, wav_in, wav_out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/efficient_conformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/efficient_conformer/attention.py deleted file mode 100644 index 475131b15af92ffcaf91ad5e2e30d114d4d1a2a3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/efficient_conformer/attention.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple, Optional - -import torch -from torch import nn -import torch.nn.functional as F -from wenet.transformer.attention import MultiHeadedAttention - - -class GroupedRelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: - https://arxiv.org/abs/1901.02860 - https://arxiv.org/abs/2109.01163 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- """ - def __init__(self, n_head, n_feat, dropout_rate, group_size=3): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - self.group_size = group_size - self.d_k = n_feat // n_head # for GroupedAttention - self.n_feat = n_feat - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def pad4group(self, Q, K, V, P, mask, group_size: int = 3): - """ - q: (#batch, time1, size) -> (#batch, head, time1, size/head) - k,v: (#batch, time2, size) -> (#batch, head, time2, size/head) - p: (#batch, time2, size) - """ - # Compute Overflows - overflow_Q = Q.size(2) % group_size - overflow_KV = K.size(2) % group_size - - # if-else for ONNX export - # 0 // 0.00000000000000001 = 0 - # 1 // 1.00000000000000001 = 1 - padding_Q = (group_size - overflow_Q) * int( - overflow_Q // (overflow_Q + 0.00000000000000001)) - padding_KV = (group_size - overflow_KV) * int( - overflow_KV // (overflow_KV + 0.00000000000000001)) - - batch_size, _, seq_len_KV, _ = K.size() - - # Input Padding (B, T, D) -> (B, T + P, D) - Q = F.pad(Q, (0, 0, 0, padding_Q), value=0.0) - K = F.pad(K, (0, 0, 0, padding_KV), value=0.0) - V = F.pad(V, (0, 0, 0, padding_KV), value=0.0) - - if mask is not None and mask.size(2) > 0 : # time2 > 0: - mask = mask[:, ::group_size, ::group_size] - - Q = Q.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - K = K.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - V = V.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - # process pos_emb - P_batch_size = P.size(0) - overflow_P = P.size(1) % group_size - padding_P = group_size - overflow_P if overflow_P else 0 - P = F.pad(P, (0, 0, 0, padding_P), value=0.0) - P = P.view(P_batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - return Q, K, V, P, mask, padding_Q - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - padding_q: Optional[int] = None - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). 
- mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - padding_q : for GroupedAttention in efficent conformer - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - - # n_feat!=h*d_k may be happened in GroupAttention - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.n_feat) - ) # (batch, time1, d_model) - if padding_q is not None: - # for GroupedAttention in efficent conformer - x = x[:, :x.size(1) - padding_q] - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q = self.linear_q(query) - k = self.linear_k(key) # (#batch, time2, size) - v = self.linear_v(value) - p = self.linear_pos(pos_emb) # (#batch, time2, size) - - batch_size, seq_len_KV, _ = k.size() # seq_len_KV = time2 - - # (#batch, time2, size) -> (#batch, head, time2, size/head) - q = q.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - k = k.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - v = v.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - if cache.size(0) > 0: - # use attention cache - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - new_cache = torch.cat((k, v), dim=-1) - - # May be k and p does not match. eg. 
time2=18+18/2=27 > mask=36/2=18 - if mask is not None and mask.size(2) > 0: - time2 = mask.size(2) - k = k[:, :, -time2:, :] - v = v[:, :, -time2:, :] - - # q k v p: (batch, head, time1, d_k) - q, k, v, p, mask, padding_q = self.pad4group(q, k, v, p, mask, self.group_size) - - # q_with_bias_u & q_with_bias_v = (batch, head, time1, d_k) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k * self.group_size) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask, padding_q), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/efficient_conformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/efficient_conformer/convolution.py deleted file mode 100644 index 52d6c1c14c0812ab3957a60a135f644833c2ad95..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/efficient_conformer/convolution.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - stride: int = 1): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - stride (int): Stride Convolution, for efficient Conformer - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. 
- # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=stride, # for depthwise_conv in StrideConv - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - self.stride = stride - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - # When export ONNX,the first cache is not None but all-zero, - # cause shape error in residual block, - # eg. cache14 + x9 = 23, 23-7+1=17 != 9 - cache = cache[:, :, -self.lorder:] - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is requried, - # However, for JIT export, here we just fake one tensor instead of - # None. 
-            new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
-
-        # GLU mechanism
-        x = self.pointwise_conv1(x) # (batch, 2*channel, dim)
-        x = nn.functional.glu(x, dim=1) # (batch, channel, dim)
-
-        # 1D Depthwise Conv
-        x = self.depthwise_conv(x)
-        if self.use_layer_norm:
-            x = x.transpose(1, 2)
-        x = self.activation(self.norm(x))
-        if self.use_layer_norm:
-            x = x.transpose(1, 2)
-        x = self.pointwise_conv2(x)
-        # mask batch padding
-        if mask_pad.size(2) > 0: # time > 0
-            if mask_pad.size(2) != x.size(2):
-                mask_pad = mask_pad[:, :, ::self.stride]
-            x.masked_fill_(~mask_pad, 0.0)
-
-        return x.transpose(1, 2), new_cache
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/efficient_conformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/efficient_conformer/encoder.py
deleted file mode 100644
index dbd37f53cac86be851e2bb194354fd07eb271f11..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/efficient_conformer/encoder.py
+++ /dev/null
@@ -1,574 +0,0 @@
-# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
-# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
-# 2022 58.com(Wuba) Inc AI Lab.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Modified from EfficientConformer(https://github.com/burchim/EfficientConformer) -# Paper(https://arxiv.org/abs/2109.01163) - -"""Encoder definition.""" -from typing import Tuple, Optional, List, Union - -import torch -import logging -from typeguard import check_argument_types -import torch.nn.functional as F - -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.encoder_layer import ConformerEncoderLayer - -from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 -from wenet.efficient_conformer.convolution import ConvolutionModule -from wenet.efficient_conformer.attention import GroupedRelPositionMultiHeadedAttention -from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer - -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class EfficientConformerEncoder(torch.nn.Module): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - macaron_style: bool = True, - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - stride_layer_idx: Optional[Union[int, List[int]]] = 3, - stride: Optional[Union[int, List[int]]] = 2, - group_layer_idx: Optional[Union[int, List[int], tuple]] = (0, 1, 2, 3), - group_size: int = 3, - stride_kernel: bool = True, - **kwargs - ): - """Construct Efficient Conformer Encoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - macaron_style (bool): Whether to use macaron style for - positionwise layer. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. - stride_layer_idx (list): layer id with StrideConv, start from 0 - stride (list): stride size of each StrideConv in efficient conformer - group_layer_idx (list): layer id with GroupedAttention, start from 0 - group_size (int): group size of every GroupedAttention layer - stride_kernel (bool): default True. True: recompute cnn kernels with stride. 
- """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d2": - subsampling_class = Conv2dSubsampling2 - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - logging.info(f"input_layer = {input_layer}, " - f"subsampling_class = {subsampling_class}") - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - self.input_layer = input_layer - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - activation = get_activation(activation_type) - self.num_blocks = num_blocks - self.attention_heads = attention_heads - self.cnn_module_kernel = cnn_module_kernel - self.global_chunk_size = 0 - self.chunk_feature_map = 0 - - # efficient conformer configs - self.stride_layer_idx = [stride_layer_idx] \ - if type(stride_layer_idx) == int else stride_layer_idx - self.stride = [stride] \ - if type(stride) == int else stride - self.group_layer_idx = [group_layer_idx] \ - if type(group_layer_idx) == int else group_layer_idx - self.grouped_size = group_size # group size of every GroupedAttention layer - - assert len(self.stride) == len(self.stride_layer_idx) - self.cnn_module_kernels = [cnn_module_kernel] # kernel size of each StridedConv - for i in self.stride: - if stride_kernel: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1] // i) - else: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1]) - - logging.info(f"stride_layer_idx= {self.stride_layer_idx}, " - f"stride = {self.stride}, " - f"cnn_module_kernel = {self.cnn_module_kernels}, " - f"group_layer_idx = {self.group_layer_idx}, " - f"grouped_size = {self.grouped_size}") - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - - # encoder definition - index = 0 - layers = [] - for i in range(num_blocks): - # self-attention module definition - if i in self.group_layer_idx: - encoder_selfattn_layer = GroupedRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - self.grouped_size) - else: - if pos_enc_layer_type == "no_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate) - - # conformer module definition - if i in self.stride_layer_idx: - # conformer block with downsampling - convolution_layer_args_stride = ( - output_size, 
self.cnn_module_kernels[index], activation, - cnn_module_norm, causal, True, self.stride[index]) - layers.append(StrideConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_stride) if use_cnn_module else None, - torch.nn.AvgPool1d( - kernel_size=self.stride[index], stride=self.stride[index], - padding=0, ceil_mode=True, - count_include_pad=False), # pointwise_conv_layer - dropout_rate, - normalize_before, - concat_after, - )) - index = index + 1 - else: - # conformer block - convolution_layer_args_normal = ( - output_size, self.cnn_module_kernels[index], activation, - cnn_module_norm, causal) - layers.append(ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_normal) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - )) - - self.encoders = torch.nn.ModuleList(layers) - - def set_global_chunk_size(self, chunk_size): - """Used in ONNX export. - """ - logging.info(f"set global chunk size: {chunk_size}, default is 0.") - self.global_chunk_size = chunk_size - if self.embed.subsampling_rate == 2: - self.chunk_feature_map = 2 * self.global_chunk_size + 1 - elif self.embed.subsampling_rate == 6: - self.chunk_feature_map = 6 * self.global_chunk_size + 5 - elif self.embed.subsampling_rate == 8: - self.chunk_feature_map = 8 * self.global_chunk_size + 7 - else: - self.chunk_feature_map = 4 * self.global_chunk_size + 3 - - def output_size(self) -> int: - return self._output_size - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - index = 0 # traverse stride - for i, layer in enumerate(self.encoders): - # layer return : x, mask, new_att_cache, new_cnn_cache - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if i in self.stride_layer_idx: - masks = masks[:, :, ::self.stride[index]] - chunk_masks = chunk_masks[:, ::self.stride[index], - ::self.stride[index]] - mask_pad = masks - pos_emb = pos_emb[:, ::self.stride[index], :] - index = index + 1 - - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask : mask matrix of self attention - - Returns: - torch.Tensor: output of current input xs - torch.Tensor: subsampling cache required for next chunk computation - List[torch.Tensor]: encoder layers output cache required for next - chunk computation - List[torch.Tensor]: conformer cnn cache - - """ - assert xs.size(0) == 1 - - # using downsampling factor to recover offset - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - chunk_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - chunk_masks = chunk_masks.unsqueeze(1) # (1, 1, xs-time) - - real_len = 0 - if self.global_chunk_size > 0: - # for ONNX decode simulation, padding xs to chunk_size - real_len = xs.size(1) - pad_len = self.chunk_feature_map - real_len - xs = F.pad(xs, (0, 0, 0, pad_len), value=0.0) - chunk_masks = F.pad(chunk_masks, (0, pad_len), value=0.0) - - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, chunk_masks = self.embed(xs, chunk_masks, offset) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - # shape(pos_emb) = (b=1, chunk_size, emb_size=output_size=hidden-dim) - - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) # batchPad (b=1, 1, time=chunk_size) - - if self.global_chunk_size > 0: - # for ONNX decode simulation - pos_emb = self.embed.position_encoding( - offset=max(offset - cache_t1, 0), - size=cache_t1 + self.global_chunk_size) - att_mask[:, :, -self.global_chunk_size:] = chunk_masks - mask_pad = chunk_masks.to(torch.bool) - else: - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(2) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - att_cache_trunc = xs.size(1) + \ - att_cache.size(2) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i:i + 1, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # 
shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [1, batch, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(0) - - # use repeat_interleave to new_att_cache - new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :]) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.global_chunk_size > 0 and real_len: - chunk_real_len = real_len // self.embed.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - # Keeping 1 more timestep can mitigate information leakage - # from the encoder caused by the padding - xs = xs[:, :chunk_real_len + 1, :] - - return xs, r_att_cache, r_cnn_cache - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - use_onnx=False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. - Args: - xs (torch.Tensor): (1, max_len, dim) - decoding_chunk_size (int): decoding chunk size - num_decoding_left_chunks (int): - use_onnx (bool): True for simulating ONNX model inference. 
- """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if use_onnx: - logging.info("Simulating for ONNX runtime ...") - att_cache: torch.Tensor = torch.zeros( - (self.num_blocks, self.attention_heads, required_cache_size, - self.output_size() // self.attention_heads * 2), - device=xs.device) - cnn_cache: torch.Tensor = torch.zeros( - (self.num_blocks, 1, self.output_size(), self.cnn_module_kernel - 1), - device=xs.device) - self.set_global_chunk_size(chunk_size=decoding_chunk_size) - else: - logging.info("Simulating for JIT runtime ...") - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - logging.info(f"-->> frame chunk msg: cur={cur}, " - f"end={end}, num_frames={end-cur}, " - f"decoding_window={decoding_window}") - if use_onnx: - att_mask: torch.Tensor = torch.ones( - (1, 1, required_cache_size + decoding_chunk_size), - dtype=torch.bool, device=xs.device) - if cur == 0: - att_mask[:, :, :required_cache_size] = 0 - else: - att_mask: torch.Tensor = torch.ones( - (0, 0, 0), dtype=torch.bool, device=xs.device) - - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - outputs.append(y) - offset += y.size(1) - - ys = torch.cat(outputs, 1) - masks = torch.ones(1, 1, ys.size(1), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/efficient_conformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/efficient_conformer/encoder_layer.py deleted file mode 100644 index 3a88ec9fca9797664ce89566e6c1d28a8f0ad5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/efficient_conformer/encoder_layer.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple -import torch -from torch import nn - - -class StrideConformerEncoderLayer(nn.Module): - """Encoder layer module. 
- Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - pointwise_conv_layer: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.pointwise_conv_layer = pointwise_conv_layer - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - - # add pointwise_conv for efficient conformer - # pointwise_conv_layer does not change shape - if self.pointwise_conv_layer is not None: - residual = residual.transpose(1, 2) - residual = self.pointwise_conv_layer(residual) - residual = residual.transpose(1, 2) - assert residual.size(0) == x.size(0) - assert residual.size(1) == x.size(1) - assert residual.size(2) == x.size(2) - - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/efficient_conformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/efficient_conformer/subsampling.py deleted file mode 100644 index 98b2c2228eac8e77586110686c48a7b0141458c9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/efficient_conformer/subsampling.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch -from wenet.transformer.subsampling import BaseSubsampling - - -class Conv2dSubsampling2(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
-
-    """
-    def __init__(self, idim: int, odim: int, dropout_rate: float,
-                 pos_enc_class: torch.nn.Module):
-        """Construct an Conv2dSubsampling4 object."""
-        super().__init__()
-        self.conv = torch.nn.Sequential(
-            torch.nn.Conv2d(1, odim, 3, 2),
-            torch.nn.ReLU()
-        )
-        self.out = torch.nn.Sequential(
-            torch.nn.Linear(odim * ((idim - 1) // 2), odim))
-        self.pos_enc = pos_enc_class
-        # The right context for every conv layer is computed by:
-        # (kernel_size - 1) * frame_rate_of_this_layer
-        self.subsampling_rate = 2
-        # 2 = (3 - 1) * 1
-        self.right_context = 2
-
-    def forward(
-            self,
-            x: torch.Tensor,
-            x_mask: torch.Tensor,
-            offset: Union[int, torch.Tensor] = 0
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Subsample x.
-
-        Args:
-            x (torch.Tensor): Input tensor (#batch, time, idim).
-            x_mask (torch.Tensor): Input mask (#batch, 1, time).
-
-        Returns:
-            torch.Tensor: Subsampled tensor (#batch, time', odim),
-                where time' = time // 2.
-            torch.Tensor: Subsampled mask (#batch, 1, time'),
-                where time' = time // 2.
-            torch.Tensor: positional encoding
-
-        """
-        x = x.unsqueeze(1) # (b, c=1, t, f)
-        x = self.conv(x)
-        b, c, t, f = x.size()
-        x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
-        x, pos_emb = self.pos_enc(x, offset)
-        return x, pos_emb, x_mask[:, :, :-2:2]
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/squeezeformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/squeezeformer/attention.py
deleted file mode 100644
index 97412badbe8e2c5caec81c0636d15be3f80d6b84..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/squeezeformer/attention.py
+++ /dev/null
@@ -1,222 +0,0 @@
-# Copyright (c) 2019 Shigeki Karita
-# 2020 Mobvoi Inc (Binbin Zhang)
-# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
-# 2022 Ximalaya Inc. (Yuguang Yang)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Multi-Head Attention layer definition."""
-
-import math
-import torch
-import torch.nn as nn
-from wenet.transformer.attention import MultiHeadedAttention
-from typing import Tuple
-
-
-class RelPositionMultiHeadedAttention(MultiHeadedAttention):
-    """Multi-Head Attention layer with relative position encoding.
-    Paper: https://arxiv.org/abs/1901.02860
-    Args:
-        n_head (int): The number of heads.
-        n_feat (int): The number of features.
-        dropout_rate (float): Dropout rate.
- """ - - def __init__(self, n_head, n_feat, dropout_rate, - do_rel_shift=False, adaptive_scale=False, init_weights=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.do_rel_shift = do_rel_shift - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - self.adaptive_scale = adaptive_scale - self.ada_scale = nn.Parameter( - torch.ones([1, 1, n_feat]), requires_grad=adaptive_scale) - self.ada_bias = nn.Parameter( - torch.zeros([1, 1, n_feat]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - input_max = (self.h * self.d_k) ** -0.5 - torch.nn.init.uniform_(self.linear_q.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_q.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_pos.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.bias, -input_max, input_max) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. 
pytorch training - if mask.size(2) > 0: # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - # (batch, head, time1, time2) - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - if self.adaptive_scale: - query = self.ada_scale * query + self.ada_bias - key = self.ada_scale * key + self.ada_bias - value = self.ada_scale * value + self.ada_bias - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - if self.do_rel_shift: - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/squeezeformer/conv2d.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/squeezeformer/conv2d.py deleted file mode 100644 index c230263396392d72f36c56d645338f2d576db898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/squeezeformer/conv2d.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Conv2d Module with Valid Padding""" - -import torch.nn.functional as F -from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional - - -class Conv2dValid(_ConvNd): - """ - Conv2d operator for VALID mode padding. 
- """ - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', # TODO: refine this type - device=None, - dtype=None, - valid_trigx: bool = False, - valid_trigy: bool = False - ) -> None: - factory_kwargs = {'device': device, 'dtype': dtype} - kernel_size_ = _pair(kernel_size) - stride_ = _pair(stride) - padding_ = padding if isinstance(padding, str) else _pair(padding) - dilation_ = _pair(dilation) - super(Conv2dValid, self).__init__( - in_channels, out_channels, kernel_size_, - stride_, padding_, dilation_, False, _pair(0), - groups, bias, padding_mode, **factory_kwargs) - self.valid_trigx = valid_trigx - self.valid_trigy = valid_trigy - - def _conv_forward( - self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): - validx, validy = 0, 0 - if self.valid_trigx: - validx = (input.size(-2) * (self.stride[-2] - 1) - 1 - + self.kernel_size[-2]) // 2 - if self.valid_trigy: - validy = (input.size(-1) * (self.stride[-1] - 1) - 1 - + self.kernel_size[-1]) // 2 - return F.conv2d(input, weight, bias, self.stride, - (validx, validy), self.dilation, self.groups) - - def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight, self.bias) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/squeezeformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/squeezeformer/convolution.py deleted file mode 100644 index 6da2ee8c98ed58fae66d66c892041037f0d6bc3a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/squeezeformer/convolution.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. 
- causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - self.bias = bias - self.channels = channels - self.kernel_size = kernel_size - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, channels]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, channels]), requires_grad=adaptive_scale) - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - if init_weights: - self.init_weights() - - def init_weights(self): - pw_max = self.channels ** -0.5 - dw_max = self.kernel_size ** -0.5 - torch.nn.init.uniform_(self.pointwise_conv1.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv1.bias.data, -pw_max, pw_max) - torch.nn.init.uniform_(self.depthwise_conv.weight.data, -dw_max, dw_max) - if self.bias: - torch.nn.init.uniform_(self.depthwise_conv.bias.data, -dw_max, dw_max) - torch.nn.init.uniform_(self.pointwise_conv2.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv2.bias.data, -pw_max, pw_max) - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - if self.adaptive_scale: - x = self.ada_scale * x + self.ada_bias - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/squeezeformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/squeezeformer/encoder.py deleted file mode 100644 index f13038321ae6c07d484a617aee7d83ed07742510..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/squeezeformer/encoder.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -import torch -import torch.nn as nn -from typing import Tuple, Union, Optional, List -from wenet.squeezeformer.subsampling \ - import DepthwiseConv2dSubsampling4, TimeReductionLayer1D, \ - TimeReductionLayer2D, TimeReductionLayerStream -from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.attention import MultiHeadedAttention -from wenet.squeezeformer.attention import RelPositionMultiHeadedAttention -from wenet.squeezeformer.positionwise_feed_forward \ - import PositionwiseFeedForward -from wenet.squeezeformer.convolution import ConvolutionModule -from wenet.utils.mask import make_pad_mask, add_optional_chunk_mask -from wenet.utils.common import get_activation - - -class SqueezeformerEncoder(nn.Module): - def __init__( - self, - input_size: int = 80, - encoder_dim: int = 256, - output_size: int = 256, - attention_heads: int = 4, - num_blocks: int = 12, - reduce_idx: Optional[Union[int, List[int]]] = 5, - recover_idx: Optional[Union[int, List[int]]] = 11, - feed_forward_expansion_factor: int = 4, - dw_stride: bool = False, - input_dropout_rate: float = 0.1, - pos_enc_layer_type: str = "rel_pos", - time_reduction_layer_type: str = "conv1d", - do_rel_shift: bool = True, - feed_forward_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.1, - cnn_module_kernel: int = 31, - cnn_norm_type: str = "batch_norm", - dropout: float = 0.1, - causal: bool = False, - adaptive_scale: bool = True, - activation_type: str = "swish", - init_weights: bool = True, - global_cmvn: torch.nn.Module = None, - normalize_before: bool = False, - use_dynamic_chunk: bool = False, - concat_after: bool = 
False, - static_chunk_size: int = 0, - use_dynamic_left_chunk: bool = False - ): - """Construct SqueezeformerEncoder - - Args: - input_size to use_dynamic_chunk, see in Transformer BaseEncoder. - encoder_dim (int): The hidden dimension of encoder layer. - output_size (int): The output dimension of final projection layer. - attention_heads (int): Num of attention head in attention module. - num_blocks (int): Num of encoder layers. - reduce_idx Optional[Union[int, List[int]]]: - reduce layer index, from 40ms to 80ms per frame. - recover_idx Optional[Union[int, List[int]]]: - recover layer index, from 80ms to 40ms per frame. - feed_forward_expansion_factor (int): Enlarge coefficient of FFN. - dw_stride (bool): Whether do depthwise convolution - on subsampling module. - input_dropout_rate (float): Dropout rate of input projection layer. - pos_enc_layer_type (str): Self attention type. - time_reduction_layer_type (str): Conv1d or Conv2d reduction layer. - do_rel_shift (bool): Whether to do relative shift - operation on rel-attention module. - cnn_module_kernel (int): Kernel size of CNN module. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - adaptive_scale (bool): Whether to use adaptive scale. - init_weights (bool): Whether to initialize weights. - causal (bool): whether to use causal convolution or not. - """ - super(SqueezeformerEncoder, self).__init__() - self.global_cmvn = global_cmvn - self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ - if type(reduce_idx) == int else reduce_idx - self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \ - if type(recover_idx) == int else recover_idx - self.check_ascending_list() - if reduce_idx is None: - self.time_reduce = None - else: - if recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - self.reduce_stride = 2 - self._output_size = output_size - self.normalize_before = normalize_before - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - self.pos_enc_layer_type = pos_enc_layer_type - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - encoder_dim, - attention_dropout_rate, - do_rel_shift, - adaptive_scale, - init_weights - ) - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - encoder_dim, - encoder_dim * feed_forward_expansion_factor, - feed_forward_dropout_rate, - activation, - adaptive_scale, - init_weights - ) - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = ( - encoder_dim, cnn_module_kernel, activation, - cnn_norm_type, causal, True, adaptive_scale, init_weights) - - self.embed = DepthwiseConv2dSubsampling4( - 1, encoder_dim, - RelPositionalEncoding(encoder_dim, dropout_rate=0.1), - dw_stride, - input_size, - input_dropout_rate, - init_weights - ) - - self.preln = nn.LayerNorm(encoder_dim) - self.encoders = 
torch.nn.ModuleList([SqueezeformerEncoderLayer( - encoder_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - convolution_layer(*convolution_layer_args), - positionwise_layer(*positionwise_layer_args), - normalize_before, - dropout, - concat_after) for _ in range(num_blocks) - ]) - if time_reduction_layer_type == 'conv1d': - time_reduction_layer = TimeReductionLayer1D - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - elif time_reduction_layer_type == 'stream': - time_reduction_layer = TimeReductionLayerStream - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - else: - time_reduction_layer = TimeReductionLayer2D - time_reduction_layer_args = {'encoder_dim': encoder_dim} - - self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args) - self.time_recover_layer = nn.Linear(encoder_dim, encoder_dim) - self.final_proj = None - if output_size != encoder_dim: - self.final_proj = nn.Linear(encoder_dim, output_size) - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - xs_lens = mask_pad.squeeze(1).sum(1) - xs = self.preln(xs) - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - for i, layer in enumerate(self.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, chunk_masks, pos_emb, mask_pad)) - xs, xs_lens, chunk_masks, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_chunk_masks, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - chunk_masks = recover_chunk_masks - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - xs = xs.masked_fill(~mask_pad[:, 0, :].unsqueeze(-1), 0.0) - - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return xs, masks - - def check_ascending_list(self): - if self.reduce_idx is not None: - assert self.reduce_idx == sorted(self.reduce_idx), \ - "reduce_idx should be int or ascending list" - if self.recover_idx is not None: - assert self.recover_idx == sorted(self.recover_idx), \ - "recover_idx should be int or ascending list" - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp 
= exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - recover_exp)) - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.preln(xs) - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, 
recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - if att_mask.size(1) != 0: - xs = xs.masked_fill(~att_mask[:, 0, :].unsqueeze(-1), 0.0) - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(0) - cached_att = cached_att.unsqueeze(3).\ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :]) - r_cnn_cache.append(cached_cnn) - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/squeezeformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/squeezeformer/encoder_layer.py deleted file mode 100644 index 3c6bdd44a20447cea91c0f965c666b844f4264be..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/squeezeformer/encoder_layer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""SqueezeformerEncoderLayer definition.""" - -import torch -import torch.nn as nn -from typing import Optional, Tuple - - -class SqueezeformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward1 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - feed_forward2 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. 
- """ - - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward1: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - feed_forward2: Optional[nn.Module] = None, - normalize_before: bool = False, - dropout_rate: float = 0.1, - concat_after: bool = False, - ): - super(SqueezeformerEncoderLayer, self).__init__() - self.size = size - self.self_attn = self_attn - self.layer_norm1 = nn.LayerNorm(size) - self.ffn1 = feed_forward1 - self.layer_norm2 = nn.LayerNorm(size) - self.conv_module = conv_module - self.layer_norm3 = nn.LayerNorm(size) - self.ffn2 = feed_forward2 - self.layer_norm4 = nn.LayerNorm(size) - self.normalize_before = normalize_before - self.dropout = nn.Dropout(dropout_rate) - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - # self attention module - residual = x - if self.normalize_before: - x = self.layer_norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.layer_norm1(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm2(x) - x = self.ffn1(x) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm2(x) - - # conv module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - residual = x - if self.normalize_before: - x = self.layer_norm3(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm3(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm4(x) - x = self.ffn2(x) - # we do not use dropout here since it is inside feed forward function - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm4(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/squeezeformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/squeezeformer/positionwise_feed_forward.py deleted file mode 100644 index 289062dcf3189f79a5ebb206990160d8665c613c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/squeezeformer/positionwise_feed_forward.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU(), - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.idim = idim - self.hidden_units = hidden_units - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - self.ada_scale = None - self.ada_bias = None - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, idim]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, idim]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - ffn1_max = self.idim ** -0.5 - ffn2_max = self.hidden_units ** -0.5 - torch.nn.init.uniform_(self.w_1.weight.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_1.bias.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_2.weight.data, -ffn2_max, ffn2_max) - torch.nn.init.uniform_(self.w_2.bias.data, -ffn2_max, ffn2_max) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - if self.adaptive_scale: - xs = self.ada_scale * xs + self.ada_bias - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/squeezeformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/squeezeformer/subsampling.py deleted file mode 100644 index fdb0101d6ebb54c42e710bbb0f35a6f7615ca567..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/squeezeformer/subsampling.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -"""DepthwiseConv2dSubsampling4 and TimeReductionLayer definition.""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -from wenet.transformer.subsampling import BaseSubsampling -from typing import Tuple -from wenet.squeezeformer.conv2d import Conv2dValid - - -class DepthwiseConv2dSubsampling4(BaseSubsampling): - """Depthwise Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - pos_enc_class (nn.Module): position encoding class. - dw_stride (int): Whether do depthwise convolution. - input_size (int): filter bank dimension. - - """ - - def __init__( - self, idim: int, odim: int, - pos_enc_class: torch.nn.Module, - dw_stride: bool = False, - input_size: int = 80, - input_dropout_rate: float = 0.1, - init_weights: bool = True - ): - super(DepthwiseConv2dSubsampling4, self).__init__() - self.idim = idim - self.odim = odim - self.pw_conv = nn.Conv2d( - in_channels=idim, out_channels=odim, kernel_size=3, stride=2) - self.act1 = nn.ReLU() - self.dw_conv = nn.Conv2d( - in_channels=odim, out_channels=odim, kernel_size=3, stride=2, - groups=odim if dw_stride else 1 - ) - self.act2 = nn.ReLU() - self.pos_enc = pos_enc_class - self.input_proj = nn.Sequential( - nn.Linear( - odim * (((input_size - 1) // 2 - 1) // 2), odim), - nn.Dropout(p=input_dropout_rate), - ) - if init_weights: - linear_max = (odim * input_size / 4) ** -0.5 - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.weight'], -linear_max, linear_max) - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.bias'], -linear_max, linear_max) - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: int = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.pw_conv(x) - x = self.act1(x) - x = self.dw_conv(x) - x = self.act2(x) - b, c, t, f = x.size() - x = x.permute(0, 2, 1, 3) - x = x.contiguous().view(b, t, c * f) - x, pos_emb = self.pos_enc(x, offset) - x = self.input_proj(x) - return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] - - -class TimeReductionLayer1D(nn.Module): - """ - Modified NeMo, - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. 
- """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 5, stride: int = 2): - super(TimeReductionLayer1D, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - self.padding = max(0, self.kernel_size - self.stride) - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. - if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayer2D(nn.Module): - def __init__( - self, kernel_size: int = 5, stride: int = 2, encoder_dim: int = 256): - super(TimeReductionLayer2D, self).__init__() - self.encoder_dim = encoder_dim - self.kernel_size = kernel_size - self.dw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=(kernel_size, 1), - stride=stride, - valid_trigy=True - ) - self.pw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=1, - stride=1, - valid_trigx=False, - valid_trigy=False, - ) - - self.kernel_size = kernel_size - self.stride = stride - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.encoder_dim ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward( - self, xs: torch.Tensor, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - xs = xs.masked_fill(mask_pad.transpose(1, 2).eq(0), 0.0) - xs = xs.unsqueeze(2) - padding1 = self.kernel_size - self.stride - xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), - mode='constant', value=0.) 
- xs = self.dw_conv(xs.permute(0, 3, 1, 2)) - xs = self.pw_conv(xs).permute(0, 3, 2, 1).squeeze(1).contiguous() - tmp_length = xs.size(1) - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - padding2 = max(0, (xs_lens.max() - tmp_length).data.item()) - batch_size, hidden = xs.size(0), xs.size(-1) - dummy_pad = torch.zeros(batch_size, padding2, hidden, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - mask = mask[:, ::2, ::2] - mask_pad = mask_pad[:, :, ::2] - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayerStream(nn.Module): - """ - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. - """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 1, stride: int = 2): - super(TimeReductionLayerStream, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=0, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. 
- if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transducer/joint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transducer/joint.py deleted file mode 100644 index f7cbaf62ee0bf4ffa127e5bbf4a49a64c2378495..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transducer/joint.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Optional - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation - - -class TransducerJoint(torch.nn.Module): - - def __init__(self, - voca_size: int, - enc_output_size: int, - pred_output_size: int, - join_dim: int, - prejoin_linear: bool = True, - postjoin_linear: bool = False, - joint_mode: str = 'add', - activation: str = "tanh"): - assert check_argument_types() - # TODO(Mddct): concat in future - assert joint_mode in ['add'] - super().__init__() - - self.activatoin = get_activation(activation) - self.prejoin_linear = prejoin_linear - self.postjoin_linear = postjoin_linear - self.joint_mode = joint_mode - - if not self.prejoin_linear and not self.postjoin_linear: - assert enc_output_size == pred_output_size == join_dim - # torchscript compatibility - self.enc_ffn: Optional[nn.Linear] = None - self.pred_ffn: Optional[nn.Linear] = None - if self.prejoin_linear: - self.enc_ffn = nn.Linear(enc_output_size, join_dim) - self.pred_ffn = nn.Linear(pred_output_size, join_dim) - # torchscript compatibility - self.post_ffn: Optional[nn.Linear] = None - if self.postjoin_linear: - self.post_ffn = nn.Linear(join_dim, join_dim) - - self.ffn_out = nn.Linear(join_dim, voca_size) - - def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor): - """ - Args: - enc_out (torch.Tensor): [B, T, E] - pred_out (torch.Tensor): [B, T, P] - Return: - [B,T,U,V] - """ - if (self.prejoin_linear and self.enc_ffn is not None - and self.pred_ffn is not None): - enc_out = self.enc_ffn(enc_out) # [B,T,E] -> [B,T,V] - pred_out = self.pred_ffn(pred_out) - - enc_out = enc_out.unsqueeze(2) # [B,T,V] -> [B,T,1,V] - pred_out = pred_out.unsqueeze(1) # [B,U,V] -> [B,1 U, V] - - # TODO(Mddct): concat joint - _ = self.joint_mode - out = enc_out + pred_out # [B,T,U,V] - - if self.postjoin_linear and self.post_ffn is not None: - out = self.post_ffn(out) - - out = self.activatoin(out) - out = self.ffn_out(out) - return out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transducer/predictor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transducer/predictor.py deleted file mode 100644 index 600e97a9d83646047ec3fc14f3087bd4df761c68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transducer/predictor.py +++ /dev/null @@ -1,482 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation, get_rnn - - -def ApplyPadding(input, padding, pad_value) -> torch.Tensor: - """ - Args: - input: [bs, max_time_step, dim] - padding: [bs, max_time_step] - 
""" - return padding * pad_value + input * (1 - padding) - - -class PredictorBase(torch.nn.Module): - - # NOTE(Mddct): We can use ABC abstract here, but - # keep this class simple enough for now - def __init__(self) -> None: - super().__init__() - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - _, _, _ = batch_size, method, device - raise NotImplementedError("this is a base precictor") - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ): - _, _, = input, cache - raise NotImplementedError("this is a base precictor") - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - _, _, _, = input, padding, cache - raise NotImplementedError("this is a base precictor") - - -class RNNPredictor(PredictorBase): - - def __init__(self, - voca_size: int, - embed_size: int, - output_size: int, - embed_dropout: float, - hidden_size: int, - num_layers: int, - bias: bool = True, - rnn_type: str = "lstm", - dropout: float = 0.1) -> None: - assert check_argument_types() - super().__init__() - self.n_layers = num_layers - self.hidden_size = hidden_size - # disable rnn base out projection - self.embed = nn.Embedding(voca_size, embed_size) - self.dropout = nn.Dropout(embed_dropout) - # NOTE(Mddct): rnn base from torch not support layer norm - # will add layer norm and prune value in cell and layer - # ref: https://github.com/Mddct/neural-lm/blob/main/models/gru_cell.py - self.rnn = get_rnn(rnn_type=rnn_type)(input_size=embed_size, - hidden_size=hidden_size, - num_layers=num_layers, - bias=bias, - batch_first=True, - dropout=dropout) - self.projection = nn.Linear(hidden_size, output_size) - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> torch.Tensor: - """ - Args: - input (torch.Tensor): [batch, max_time). - padding (torch.Tensor): [batch, max_time] - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - Returns: - output: [batch, max_time, output_size] - """ - - # NOTE(Mddct): we don't use pack input format - embed = self.embed(input) # [batch, max_time, emb_size] - embed = self.dropout(embed) - states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None - if cache is None: - state = self.init_state(batch_size=input.size(0), - device=input.device) - states = (state[0], state[1]) - else: - assert len(cache) == 2 - states = (cache[0], cache[1]) - out, (m, c) = self.rnn(embed, states) - out = self.projection(out) - - # NOTE(Mddct): Although we don't use staate in transducer - # training forward, we need make it right for padding value - # so we create forward_step for infering, forward for training - _, _ = m, c - return out - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache: [state_m, state_c] - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - Returns: - new_cache: [[state_m_1, state_c_1], [state_m_2, state_c_2]...] 
- """ - assert len(cache) == 2 - state_ms = cache[0] - state_cs = cache[1] - - assert state_ms.size(1) == state_cs.size(1) - - new_cache: List[List[torch.Tensor]] = [] - for state_m, state_c in zip(torch.split(state_ms, 1, dim=1), - torch.split(state_cs, 1, dim=1)): - new_cache.append([state_m, state_c]) - return new_cache - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[state_m_1, state_c_1], [state_m_1, state_c_1]...] - - Returns: - new_caceh: [state_ms, state_cs], - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - """ - state_ms = torch.cat([states[0] for states in cache], dim=1) - state_cs = torch.cat([states[1] for states in cache], dim=1) - return [state_ms, state_cs] - - def init_state( - self, - batch_size: int, - device: torch.device, - method: str = "zero", - ) -> List[torch.Tensor]: - assert batch_size > 0 - # TODO(Mddct): xavier init method - _ = method - return [ - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device), - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device) - ] - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - """ - assert len(cache) == 2 - state_m, state_c = cache[0], cache[1] - embed = self.embed(input) # [batch, 1, emb_size] - embed = self.dropout(embed) - out, (m, c) = self.rnn(embed, (state_m, state_c)) - - out = self.projection(out) - m = ApplyPadding(m, padding.unsqueeze(0), state_m) - c = ApplyPadding(c, padding.unsqueeze(0), state_c) - - return (out, [m, c]) - - -class EmbeddingPredictor(PredictorBase): - """Embedding predictor - - Described in: - https://arxiv.org/pdf/2109.07513.pdf - - embed-> proj -> layer norm -> swish - """ - - def __init__(self, - voca_size: int, - embed_size: int, - embed_dropout: float, - n_head: int, - history_size: int = 2, - activation: str = "swish", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - - assert check_argument_types() - super().__init__() - # multi head - self.num_heads = n_head - self.embed_size = embed_size - self.context_size = history_size + 1 - self.pos_embed = torch.nn.Linear(embed_size * self.context_size, - self.num_heads, - bias=bias) - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.ffn = nn.Linear(self.embed_size, self.embed_size) - self.norm = nn.LayerNorm(self.embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - _ = method - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device), - ] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] 
- """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - - input = input.unfold(1, self.context_size, 1).permute( - 0, 1, 3, 2) # [bs, seq_len, context_size, embed] - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - # broadcast dot attenton - input_expand = input.unsqueeze( - 2) # [bs, seq_len, 1, context_size, embed] - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - - # [bs, seq_len, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, seq_len, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, seq_len, num_heads, embed] - output = output.sum(dim=2) # [bs, seq_len, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - return output - - def forward_step( - self, - input: torch.Tensor, - padding: torch.Tensor, - cache: List[torch.Tensor], - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input_expand = context_input.unsqueeze(1).unsqueeze( - 2) # [bs, 1, 1, context_size, embed] - - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - # [bs, 1, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, 1, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, 1, num_heads, embed] - output = output.sum(dim=2) # [bs, 1, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - new_cache = context_input[:, 1:, :] - # TODO(Mddct): we need padding new_cache in future - # new_cache = ApplyPadding(history, padding, new_cache) - return (output, [new_cache]) - - -class ConvPredictor(PredictorBase): - - def __init__(self, - voca_size: 
int, - embed_size: int, - embed_dropout: float, - history_size: int = 2, - activation: str = "relu", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - assert check_argument_types() - super().__init__() - - assert history_size >= 0 - self.embed_size = embed_size - self.context_size = history_size + 1 - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.conv = nn.Conv1d(in_channels=embed_size, - out_channels=embed_size, - kernel_size=self.context_size, - padding=0, - groups=embed_size, - bias=bias) - self.norm = nn.LayerNorm(embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - assert method == "zero" - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device) - ] - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] - """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - input = input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - return out - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input = context_input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - - new_cache = context_input[:, 1:, :] - # TODO(Mddct): apply padding in future - return (out, [new_cache]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transducer/search/greedy_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transducer/search/greedy_search.py deleted file mode 100644 index ef7354562b6617b7be33bf32d673117eb1d3d547..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transducer/search/greedy_search.py +++ /dev/null @@ 
-1,54 +0,0 @@ -from typing import List - -import torch - - -def basic_greedy_search( - model: torch.nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - n_steps: int = 64, -) -> List[List[int]]: - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, - method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = n_steps - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, padding, - cache) # [1, 1, P] - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, - pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - - return [hyps] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transducer/search/prefix_beam_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transducer/search/prefix_beam_search.py deleted file mode 100644 index f00917717c16a73916586708ebfede54fa02a21f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transducer/search/prefix_beam_search.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import List, Tuple - -import torch -from wenet.utils.common import log_add - - -class Sequence(): - - __slots__ = {'hyp', 'score', 'cache'} - - def __init__( - self, - hyp: List[torch.Tensor], - score, - cache: List[torch.Tensor], - ): - self.hyp = hyp - self.score = score - self.cache = cache - - -class PrefixBeamSearch(): - - def __init__(self, encoder, predictor, joint, ctc, blank): - self.encoder = encoder - self.predictor = predictor - self.joint = joint - self.ctc = ctc - self.blank = blank - - def forward_decoder_one_step( - self, encoder_x: torch.Tensor, pre_t: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - padding = torch.zeros(pre_t.size(0), 1, device=encoder_x.device) - pre_t, new_cache = self.predictor.forward_step(pre_t.unsqueeze(-1), - padding, cache) - x = self.joint(encoder_x, pre_t) # [beam, 1, 1, vocab] - x = x.log_softmax(dim=-1) - return x, new_cache - - def prefix_beam_search(self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7): - """prefix beam search - also see wenet.transducer.transducer.beam_search - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - 
batch_size = speech.shape[0] - assert batch_size == 1 - - # 1. Encoder - encoder_out, _ = self.encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - - ctc_probs = self.ctc.log_softmax(encoder_out).squeeze(0) - beam_init: List[Sequence] = [] - - # 2. init beam using Sequence to save beam unit - cache = self.predictor.init_state(1, method="zero", device=device) - beam_init.append(Sequence(hyp=[self.blank], score=0.0, cache=cache)) - # 3. start decoding (notice: we use breathwise first searching) - # !!!! In this decoding method: one frame do not output multi units. !!!! - # !!!! Experiments show that this strategy has little impact !!!! - for i in range(maxlen): - # 3.1 building input - # decoder taking the last token to predict the next token - input_hyp = [s.hyp[-1] for s in beam_init] - input_hyp_tensor = torch.tensor(input_hyp, - dtype=torch.int, - device=device) - # building statement from beam - cache_batch = self.predictor.cache_to_batch( - [s.cache for s in beam_init]) - # build score tensor to do torch.add() function - scores = torch.tensor([s.score for s in beam_init]).to(device) - - # 3.2 forward decoder - logp, new_cache = self.forward_decoder_one_step( - encoder_out[:, i, :].unsqueeze(1), - input_hyp_tensor, - cache_batch, - ) # logp: (N, 1, 1, vocab_size) - logp = logp.squeeze(1).squeeze(1) # logp: (N, vocab_size) - new_cache = self.predictor.batch_to_cache(new_cache) - - # 3.3 shallow fusion for transducer score - # and ctc score where we can also add the LM score - logp = torch.log( - torch.add(transducer_weight * torch.exp(logp), - ctc_weight * torch.exp(ctc_probs[i].unsqueeze(0)))) - - # 3.4 first beam prune - top_k_logp, top_k_index = logp.topk(beam_size) # (N, N) - scores = torch.add(scores.unsqueeze(1), top_k_logp) - - # 3.5 generate new beam (N*N) - beam_A = [] - for j in range(len(beam_init)): - # update seq - base_seq = beam_init[j] - for t in range(beam_size): - # blank: only update the score - if top_k_index[j, t] == self.blank: - new_seq = Sequence(hyp=base_seq.hyp.copy(), - score=scores[j, t].item(), - cache=base_seq.cache) - - beam_A.append(new_seq) - # other unit: update hyp score statement and last - else: - hyp_new = base_seq.hyp.copy() - hyp_new.append(top_k_index[j, t].item()) - new_seq = Sequence(hyp=hyp_new, - score=scores[j, t].item(), - cache=new_cache[j]) - beam_A.append(new_seq) - - # 3.6 prefix fusion - fusion_A = [beam_A[0]] - for j in range(1, len(beam_A)): - s1 = beam_A[j] - if_do_append = True - for t in range(len(fusion_A)): - # notice: A_ can not fusion with A - if s1.hyp == fusion_A[t].hyp: - fusion_A[t].score = log_add( - [fusion_A[t].score, s1.score]) - if_do_append = False - break - if if_do_append: - fusion_A.append(s1) - - # 4. 
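# Editor's note: standalone sketch (random logits) of the shallow fusion step
# above: transducer and CTC log-probabilities are mixed in the probability
# domain and the result is taken back to the log domain.
import torch

transducer_weight, ctc_weight = 0.7, 0.3
logp_transducer = torch.log_softmax(torch.randn(1, 5), dim=-1)   # (N, vocab_size)
logp_ctc = torch.log_softmax(torch.randn(1, 5), dim=-1)          # (1, vocab_size)
fused = torch.log(transducer_weight * torch.exp(logp_transducer)
                  + ctc_weight * torch.exp(logp_ctc))
# because the weights sum to 1, the fused scores are still valid log-probabilities
assert torch.allclose(fused.exp().sum(dim=-1), torch.tensor(1.0), atol=1e-5)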
second pruned - fusion_A.sort(key=lambda x: x.score, reverse=True) - beam_init = fusion_A[:beam_size] - - return beam_init, encoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transducer/transducer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transducer/transducer.py deleted file mode 100644 index 821a0946e621353a18bededbd93a658e83b0e0e2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transducer/transducer.py +++ /dev/null @@ -1,453 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchaudio -from torch import nn -from torch.nn.utils.rnn import pad_sequence -from typeguard import check_argument_types -from wenet.transducer.predictor import PredictorBase -from wenet.transducer.search.greedy_search import basic_greedy_search -from wenet.transducer.search.prefix_beam_search import PrefixBeamSearch -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_blank, add_sos_eos, - reverse_pad_list) - - -class Transducer(ASRModel): - """Transducer-ctc-attention hybrid Encoder-Predictor-Decoder model""" - - def __init__( - self, - vocab_size: int, - blank: int, - encoder: nn.Module, - predictor: PredictorBase, - joint: nn.Module, - attention_decoder: Optional[Union[TransformerDecoder, - BiTransformerDecoder]] = None, - ctc: Optional[CTC] = None, - ctc_weight: float = 0, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - transducer_weight: float = 1.0, - attention_weight: float = 0.0, - ) -> None: - assert check_argument_types() - assert attention_weight + ctc_weight + transducer_weight == 1.0 - super().__init__(vocab_size, encoder, attention_decoder, ctc, - ctc_weight, ignore_id, reverse_weight, lsm_weight, - length_normalized_loss) - - self.blank = blank - self.transducer_weight = transducer_weight - self.attention_decoder_weight = 1 - self.transducer_weight - self.ctc_weight - - self.predictor = predictor - self.joint = joint - self.bs = None - - # Note(Mddct): decoder also means predictor in transducer, - # but here decoder is attention decoder - del self.criterion_att - if attention_decoder is not None: - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + predictor + joint + loss - - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - - # Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - # predictor - ys_in_pad = add_blank(text, self.blank, self.ignore_id) - predictor_out = self.predictor(ys_in_pad) - # joint - joint_out = self.joint(encoder_out, predictor_out) - # NOTE(Mddct): some loss implementation require pad valid is zero - # torch.int32 rnnt_loss required - rnnt_text = text.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - rnnt_text_lengths = text_lengths.to(torch.int32) - encoder_out_lens = encoder_out_lens.to(torch.int32) - loss = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - encoder_out_lens, - rnnt_text_lengths, - blank=self.blank, - reduction="mean") - loss_rnnt = loss - - loss = self.transducer_weight * loss - # optional attention decoder - loss_att: Optional[torch.Tensor] = None - if self.attention_decoder_weight != 0.0 and self.decoder is not None: - loss_att, _ = self._calc_att_loss(encoder_out, encoder_mask, text, - text_lengths) - - # optional ctc - loss_ctc: Optional[torch.Tensor] = None - if self.ctc_weight != 0.0 and self.ctc is not None: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is not None: - loss = loss + self.ctc_weight * loss_ctc.sum() - if loss_att is not None: - loss = loss + self.attention_decoder_weight * loss_att.sum() - # NOTE: 'loss' must be in dict - return { - 'loss': loss, - 'loss_att': loss_att, - 'loss_ctc': loss_ctc, - 'loss_rnnt': loss_rnnt, - } - - def init_bs(self): - if self.bs is None: - self.bs = PrefixBeamSearch(self.encoder, self.predictor, - self.joint, self.ctc, self.blank) - - def _cal_transducer_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_lens: torch.Tensor, - hyps_pad: torch.Tensor, - ): - # ignore id -> blank, add blank at head - hyps_pad_blank = add_blank(hyps_pad, self.blank, self.ignore_id) - xs_in_lens = encoder_mask.squeeze(1).sum(1).int() - - # 1. Forward predictor - predictor_out = self.predictor(hyps_pad_blank) - # 2. Forward joint - joint_out = self.joint(encoder_out, predictor_out) - rnnt_text = hyps_pad.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - # 3. 
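# Editor's note: minimal sketch of the torchaudio.functional.rnnt_loss call used
# above, with random data and assumed sizes; it requires a torchaudio build that
# ships rnnt_loss. The joint output has shape (B, T, U + 1, V): T encoder frames,
# U target tokens plus one extra blank step, V vocabulary entries.
import torch
import torchaudio

B, T, U, V, blank = 2, 6, 3, 10, 0
joint_out = torch.randn(B, T, U + 1, V)                       # joint(encoder_out, predictor_out)
targets = torch.randint(1, V, (B, U), dtype=torch.int32)      # blank never appears in targets
logit_lengths = torch.full((B,), T, dtype=torch.int32)
target_lengths = torch.full((B,), U, dtype=torch.int32)
loss = torchaudio.functional.rnnt_loss(joint_out, targets, logit_lengths,
                                        target_lengths, blank=blank,
                                        reduction="mean")
print(loss.item())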
Compute transducer loss - loss_td = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - xs_in_lens, - hyps_lens.int(), - blank=self.blank, - reduction='none') - return loss_td * -1 - - def _cal_attn_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_pad: torch.Tensor, - hyps_lens: torch.Tensor, - ): - # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - - # td_score = loss_td * -1 - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - self.reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - return decoder_out, r_decoder_out - - def beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7, - ): - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight in transducer - prefix beam search. - final_prob = ctc_weight * ctc_prob + transducer_weight * transducer_prob - transducer_weight (float): transducer probability weight in - prefix beam search - Returns: - List[List[int]]: best path result - - """ - self.init_bs() - beam, _ = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size, - beam_size, - num_decoding_left_chunks, - simulate_streaming, - ctc_weight, - transducer_weight, - ) - return beam[0].hyp[1:], beam[0].score - - def transducer_attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ctc_weight: float = 0.0, - attn_weight: float = 0.0, - transducer_weight: float = 0.0, - search_ctc_weight: float = 1.0, - search_transducer_weight: float = 0.0, - beam_search_type: str = 'transducer') -> List[List[int]]: - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight using in rescoring. - rescore_prob = ctc_weight * ctc_prob + - transducer_weight * (transducer_loss * -1) + - attn_weight * attn_prob - attn_weight (float): attn probability weight using in rescoring. - transducer_weight (float): transducer probability weight using in - rescoring - search_ctc_weight (float): ctc weight using - in rnnt beam search (seeing in self.beam_search) - search_transducer_weight (float): transducer weight using - in rnnt beam search (seeing in self.beam_search) - Returns: - List[List[int]]: best path result - - """ - - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - self.init_bs() - if beam_search_type == 'transducer': - beam, encoder_out = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - beam_size=beam_size, - num_decoding_left_chunks=num_decoding_left_chunks, - ctc_weight=search_ctc_weight, - transducer_weight=search_transducer_weight, - ) - beam_score = [s.score for s in beam] - hyps = [s.hyp[1:] for s in beam] - - elif beam_search_type == 'ctc': - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, - speech_lengths, - beam_size=beam_size, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) - beam_score = [hyp[1] for hyp in hyps] - hyps = [hyp[0] for hyp in hyps] - assert len(hyps) == beam_size - - # build hyps and encoder output - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - - # 2.1 calculate transducer score - td_score = self._cal_transducer_score( - encoder_out, - encoder_mask, - hyps_lens, - hyps_pad, - ) - # 2.2 calculate attention score - decoder_out, r_decoder_out = self._cal_attn_score( - encoder_out, - encoder_mask, - hyps_pad, - hyps_lens, - ) - - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp)][self.eos] - td_s = td_score[i] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp): - r_score += r_decoder_out[i][len(hyp) - j - 1][w] - r_score += r_decoder_out[i][len(hyp)][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score = score * attn_weight + \ - beam_score[i] * ctc_weight + \ - td_s * transducer_weight - if score > best_score: - best_score = score - best_index = i - - return hyps[best_index], best_score - - def greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, 
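# Editor's note: toy numbers (entirely made up) showing how the rescoring
# weights above combine the attention, CTC/beam and transducer scores of a
# single hypothesis before the best one is picked.
attn_weight, ctc_weight, transducer_weight = 0.5, 0.3, 0.2
attn_score, beam_score, td_score = -12.4, -15.1, -9.8   # hypothetical log-scores
rescore = (attn_score * attn_weight
           + beam_score * ctc_weight
           + td_score * transducer_weight)
print(rescore)  # ≈ -12.69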
- num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - n_steps: int = 64, - ) -> List[List[int]]: - """ greedy search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - # TODO(Mddct): batch decode - assert speech.size(0) == 1 - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - # TODO(Mddct): forward chunk by chunk - _ = simulate_streaming - # Let's assume B = batch_size - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size, - num_decoding_left_chunks, - ) - encoder_out_lens = encoder_mask.squeeze(1).sum() - hyps = basic_greedy_search(self, - encoder_out, - encoder_out_lens, - n_steps=n_steps) - - return hyps - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def forward_predictor_step( - self, xs: torch.Tensor, cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - assert len(cache) == 2 - # fake padding - padding = torch.zeros(1, 1) - return self.predictor.forward_step(xs, padding, cache) - - @torch.jit.export - def forward_joint_step(self, enc_out: torch.Tensor, - pred_out: torch.Tensor) -> torch.Tensor: - return self.joint(enc_out, pred_out) - - @torch.jit.export - def forward_predictor_init_state(self) -> List[torch.Tensor]: - return self.predictor.init_state(1, device=torch.device("cpu")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/asr_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/asr_model.py deleted file mode 100644 index 4288f68472d63ce4bf270c5f377d62fa7408713e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/asr_model.py +++ /dev/null @@ -1,904 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -from collections import defaultdict -from typing import Dict, List, Optional, Tuple - -import torch - -from torch.nn.utils.rnn import pad_sequence - -try: - import k2 - from icefall.utils import get_texts - from icefall.decode import get_lattice, Nbest, one_best_decoding -except ImportError: - print('Failed to import k2 and icefall. \ - Notice that they are necessary for hlg_onebest and hlg_rescore') - -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import TransformerEncoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, - remove_duplicates_and_blank, th_accuracy, - reverse_pad_list) -from wenet.utils.mask import (make_pad_mask, mask_finished_preds, - mask_finished_scores, subsequent_mask) - - -class ASRModel(torch.nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - def __init__( - self, - vocab_size: int, - encoder: TransformerEncoder, - decoder: TransformerDecoder, - ctc: CTC, - ctc_weight: float = 0.5, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - ): - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - - self.encoder = encoder - self.decoder = decoder - self.ctc = ctc - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - # 1. Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # 2a. Attention-decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths) - else: - loss_att = None - - # 2b. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is None: - loss = loss_att - elif loss_att is None: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + (1 - - self.ctc_weight) * loss_att - return {"loss": loss, "loss_att": loss_att, "loss_ctc": loss_ctc} - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, float]: - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # reverse the seq, used for right to left decoder - r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) - r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, - self.ignore_id) - # 1. Forward decoder - decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, - ys_in_pad, ys_in_lens, - r_ys_in_pad, - self.reverse_weight) - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - r_loss_att = torch.tensor(0.0) - if self.reverse_weight > 0.0: - r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * ( - 1 - self.reverse_weight) + r_loss_att * self.reverse_weight - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - return loss_att, acc_att - - def _forward_encoder( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Let's assume B = batch_size - # 1. Encoder - if simulate_streaming and decoding_chunk_size > 0: - encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( - speech, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - else: - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - return encoder_out, encoder_mask - - def recognize( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int = 10, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> torch.Tensor: - """ Apply beam search on attention decoder - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - torch.Tensor: decoding result, (batch, max_result_len) - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - - # Let's assume B = batch_size and N = beam_size - # 1. 
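# Editor's note: tiny sketch (made-up loss values) of the two interpolations in
# the deleted ASRModel code above: the CTC/attention hybrid loss in forward()
# and the left-to-right / right-to-left mix in _calc_att_loss().
import torch

ctc_weight, reverse_weight = 0.3, 0.3
loss_ctc, loss_att = torch.tensor(42.0), torch.tensor(30.0)
loss = ctc_weight * loss_ctc + (1 - ctc_weight) * loss_att                 # ≈ 33.6
l2r_att, r2l_att = torch.tensor(30.0), torch.tensor(31.5)
loss_att_bi = l2r_att * (1 - reverse_weight) + r2l_att * reverse_weight    # ≈ 30.45
print(loss.item(), loss_att_bi.item())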
Encoder - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = torch.ones([running_size, 1], dtype=torch.long, - device=device).fill_(self.sos) # (B*N, 1) - scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), - dtype=torch.float) - scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) - cache: Optional[List[torch.Tensor]] = None - # 2. Decoder forward step by step - for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: - break - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) - logp, cache = self.decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - # 2.3 Second beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - # Update cache to be consistent with new topk scores / hyps - cache_index = (offset_k_index // beam_size).view(-1) # (B*N) - base_cache_index = (torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) * beam_size).view(-1) # (B*N) - cache_index = base_cache_index + cache_index - cache = [torch.index_select(c, dim=0, index=cache_index) for c in cache] - scores = scores.view(-1, 1) # (B*N, 1) - # 2.4. Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = torch.index_select(top_k_index.view(-1), - dim=-1, - index=best_k_index) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = torch.index_select( - hyps, dim=0, index=best_hyps_index) # (B*N, i) - hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) - - # 3. 
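# Editor's note: standalone sketch (random scores, small sizes) of the two-stage
# beam pruning in recognize() above: a per-beam top-k over the vocabulary, then
# a global top-k over the flattened beam_size * beam_size candidates, followed
# by index arithmetic to recover the chosen token and its parent beam.
import torch

batch_size, beam_size, vocab = 1, 3, 7
logp = torch.log_softmax(torch.randn(batch_size * beam_size, vocab), dim=-1)
scores = torch.zeros(batch_size * beam_size, 1)              # running beam scores

top_k_logp, top_k_index = logp.topk(beam_size)               # (B*N, N): first prune
cand = (scores + top_k_logp).view(batch_size, beam_size * beam_size)
cand_scores, offset_k_index = cand.topk(k=beam_size)         # (B, N): second prune

base_k_index = torch.arange(batch_size).view(-1, 1) * beam_size * beam_size
best_k_index = (base_k_index + offset_k_index).view(-1)      # (B*N,)
best_k_pred = top_k_index.view(-1)[best_k_index]             # surviving token ids
parent_beam = best_k_index // beam_size                      # which beam they extend
print(best_k_pred.tolist(), parent_beam.tolist())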
Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_scores, best_index = scores.max(dim=-1) - best_hyps_index = best_index + torch.arange( - batch_size, dtype=torch.long, device=device) * beam_size - best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) - best_hyps = best_hyps[:, 1:] - return best_hyps, best_scores - - def ctc_greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[List[int]]: - """ Apply CTC greedy search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # Let's assume B = batch_size - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - return hyps, scores - - def _ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[List[List[int]], torch.Tensor]: - """ CTC prefix beam search inner implementation - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[List[int]]: nbest results - torch.Tensor: encoder output, (1, max_len, encoder_dim), - it will be used for rescoring in attention rescoring mode - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # For CTC prefix beam search, we only support batch_size=1 - assert batch_size == 1 - # Let's assume B = batch_size and N = beam_size - # 1. 
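# Editor's note: minimal sketch (not the wenet helper itself) of what
# remove_duplicates_and_blank does after the frame-wise argmax in
# ctc_greedy_search above: merge repeated symbols, then drop blanks (id 0 here).
def collapse_ctc(path, blank=0):
    out, prev = [], None
    for s in path:
        if s != prev and s != blank:
            out.append(s)
        prev = s
    return out

print(collapse_ctc([0, 5, 5, 0, 0, 7, 7, 7, 0, 5]))  # [5, 7, 5]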
Encoder forward and get CTC score - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - ctc_probs = ctc_probs.squeeze(0) - # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) - cur_hyps = [(tuple(), (0.0, -float('inf')))] - # 2. CTC beam search step by step - for t in range(0, maxlen): - logp = ctc_probs[t] # (vocab_size,) - # key: prefix, value (pb, pnb), default value(-inf, -inf) - next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) - # 2.1 First beam prune: select topk best - top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) - for s in top_k_index: - s = s.item() - ps = logp[s].item() - for prefix, (pb, pnb) in cur_hyps: - last = prefix[-1] if len(prefix) > 0 else None - if s == 0: # blank - n_pb, n_pnb = next_hyps[prefix] - n_pb = log_add([n_pb, pb + ps, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - elif s == last: - # Update *ss -> *s; - n_pb, n_pnb = next_hyps[prefix] - n_pnb = log_add([n_pnb, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - # Update *s-s -> *ss, - is for blank - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - else: - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - - # 2.2 Second beam prune - next_hyps = sorted(next_hyps.items(), - key=lambda x: log_add(list(x[1])), - reverse=True) - cur_hyps = next_hyps[:beam_size] - hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] - return hyps, encoder_out - - def ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[int]: - """ Apply CTC prefix beam search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[int]: CTC prefix beam search nbest results - """ - hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, - beam_size, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) - return hyps[0] - - def attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - ctc_weight: float = 0.0, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ) -> List[int]: - """ Apply attention rescoring decoding, CTC prefix beam search - is applied first to get nbest, then we resoring the nbest on - attention decoder with corresponding encoder out - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
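# Editor's note: sketch of the log-domain addition used throughout the CTC
# prefix beam search above; wenet's log_add behaves like a numerically stable
# log(sum(exp(...))) over a list of scores.
import math

def log_add(args):
    m = max(args)
    if m == -float('inf'):
        return -float('inf')
    return m + math.log(sum(math.exp(a - m) for a in args))

pb, pnb, ps = -1.2, -float('inf'), -0.3   # blank/non-blank prefix scores, symbol log-prob
new_pb = log_add([pb + ps, pnb + ps])     # a blank can extend either ending
print(new_pb)                             # -1.5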
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - reverse_weight (float): right to left decoder weight - ctc_weight (float): ctc score weight - - Returns: - List[int]: Attention rescoring result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, speech_lengths, beam_size, decoding_chunk_size, - num_decoding_left_chunks, simulate_streaming) - - assert len(hyps) == beam_size - hyps_pad = pad_sequence([ - torch.tensor(hyp[0], device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp[0]): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp[0])][self.eos] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp[0]): - r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] - r_score += r_decoder_out[i][len(hyp[0])][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score += hyp[1] * ctc_weight - if score > best_score: - best_score = score - best_index = i - return hyps[best_index][0], best_score - - def load_hlg_resource_if_necessary(self, hlg, word): - if not hasattr(self, 'hlg'): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.hlg = k2.Fsa.from_dict(torch.load(hlg, map_location=device)) - if not hasattr(self.hlg, "lm_scores"): - self.hlg.lm_scores = self.hlg.scores.clone() - if not hasattr(self, 'word_table'): - self.word_table = {} - with open(word, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - self.word_table[int(arr[1])] = arr[0] - - @torch.no_grad() - def hlg_onebest( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - best_path = one_best_decoding(lattice=lattice, use_double_scores=True) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.no_grad() - def hlg_rescore( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - lm_scale: float = 0, - decoder_scale: float = 0, - r_decoder_scale: float = 0, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - device = speech.device - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - 
search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=100, - use_double_scores=True, - nbest_scale=0.5,) - nbest = nbest.intersect(lattice) - assert hasattr(nbest.fsa, "lm_scores") - assert hasattr(nbest.fsa, "tokens") - assert isinstance(nbest.fsa.tokens, torch.Tensor) - - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.tokens) - tokens = tokens.remove_values_leq(0) - hyps = tokens.tolist() - - # cal attention_score - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out_repeat = [] - tot_scores = nbest.tot_scores() - repeats = [tot_scores[i].shape[0] for i in range(tot_scores.dim0)] - for i in range(len(encoder_out)): - encoder_out_repeat.append(encoder_out[i: i + 1].repeat(repeats[i], 1, 1)) - encoder_out = torch.concat(encoder_out_repeat, dim=0) - encoder_mask = torch.ones(encoder_out.size(0), - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - reverse_weight = 0.5 - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out - - decoder_scores = torch.tensor([sum([decoder_out[i, j, hyps[i][j]] - for j in range(len(hyps[i]))]) - for i in range(len(hyps))], device=device) - r_decoder_scores = [] - for i in range(len(hyps)): - score = 0 - for j in range(len(hyps[i])): - score += r_decoder_out[i, len(hyps[i]) - j - 1, hyps[i][j]] - score += r_decoder_out[i, len(hyps[i]), self.eos] - r_decoder_scores.append(score) - r_decoder_scores = torch.tensor(r_decoder_scores, device=device) - - am_scores = nbest.compute_am_scores() - ngram_lm_scores = nbest.compute_lm_scores() - tot_scores = am_scores.values + lm_scale * ngram_lm_scores.values + \ - decoder_scale * decoder_scores + r_decoder_scale * r_decoder_scores - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.jit.export - def subsampling_rate(self) -> int: - """ Export interface for c++ call, return subsampling_rate of the - model - """ - return self.encoder.embed.subsampling_rate - - @torch.jit.export - def right_context(self) -> int: - """ Export interface for c++ call, return right_context of the model - """ - return self.encoder.embed.right_context - - @torch.jit.export - def sos_symbol(self) -> int: - """ Export interface for c++ call, return sos symbol id of the model - """ - return self.sos - - @torch.jit.export - def eos_symbol(self) -> int: - """ Export interface for c++ call, return eos symbol id of the model - """ - return self.eos - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, give input chunk xs, and return - output from time 0 to current chunk. - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor: - """ Export interface for c++ call, apply linear transform and log - softmax before ctc - Args: - xs (torch.Tensor): encoder output - - Returns: - torch.Tensor: activation before ctc - - """ - return self.ctc.log_softmax(xs) - - @torch.jit.export - def is_bidirectional_decoder(self) -> bool: - """ - Returns: - torch.Tensor: decoder output - """ - if hasattr(self.decoder, 'right_decoder'): - return True - else: - return False - - @torch.jit.export - def forward_attention_decoder( - self, - hyps: torch.Tensor, - hyps_lens: torch.Tensor, - encoder_out: torch.Tensor, - reverse_weight: float = 0, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, forward decoder with multiple - hypothesis from ctc prefix beam search and one encoder output - Args: - hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad sos at the begining - hyps_lens (torch.Tensor): length of each hyp in hyps - encoder_out (torch.Tensor): corresponding encoder output - r_hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad eos at the begining which is used fo right to left decoder - reverse_weight: used for verfing whether used right to left decoder, - > 0 will use. - - Returns: - torch.Tensor: decoder output - """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps - encoder_out = encoder_out.repeat(num_hyps, 1, 1) - encoder_mask = torch.ones(num_hyps, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=encoder_out.device) - - # input for right to left decoder - # this hyps_lens has count token, we need minus it. - r_hyps_lens = hyps_lens - 1 - # this hyps has included token, so it should be - # convert the original hyps. - r_hyps = hyps[:, 1:] - # >>> r_hyps - # >>> tensor([[ 1, 2, 3], - # >>> [ 9, 8, 4], - # >>> [ 2, -1, -1]]) - # >>> r_hyps_lens - # >>> tensor([3, 3, 1]) - - # NOTE(Mddct): `pad_sequence` is not supported by ONNX, it is used - # in `reverse_pad_list` thus we have to refine the below code. 
- # Issue: https://github.com/wenet-e2e/wenet/issues/1113 - # Equal to: - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = torch.max(r_hyps_lens) - index_range = torch.arange(0, max_len, 1).to(encoder_out.device) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - # >>> seq_mask - # >>> tensor([[ True, True, True], - # >>> [ True, True, True], - # >>> [ True, False, False]]) - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - r_hyps = torch.gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = torch.where(seq_mask, r_hyps, self.eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - r_hyps = torch.cat([hyps[:, 0:1], r_hyps], dim=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps, hyps_lens, r_hyps, - reverse_weight) # (num_hyps, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - - # right to left decoder may be not used during decoding process, - # which depends on reverse_weight param. - # r_dccoder_out will be 0.0, if reverse_weight is 0.0 - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - return decoder_out, r_decoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/attention.py deleted file mode 100644 index 6ee5e313edf2e88a844ce004c0f819b0bd3260f6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/attention.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple - -import torch -from torch import nn - - -class MultiHeadedAttention(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
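# Editor's note: runnable sketch of the ONNX-friendly reversal used in
# forward_attention_decoder above (arange + gather + where instead of
# reverse_pad_list/pad_sequence), reproducing the tensors from the inline
# comments; eos=10 is just a placeholder value for this example.
import torch

eos = 10
r_hyps = torch.tensor([[1, 2, 3], [9, 8, 4], [2, -1, -1]])
r_hyps_lens = torch.tensor([3, 3, 1])

max_len = torch.max(r_hyps_lens)
index_range = torch.arange(0, max_len, 1)
seq_len_expand = r_hyps_lens.unsqueeze(1)
seq_mask = seq_len_expand > index_range            # valid (non-padding) positions
index = (seq_len_expand - 1) - index_range         # reversed position of each slot
index = index * seq_mask                           # clamp padding slots to index 0
reversed_hyps = torch.gather(r_hyps, 1, index)
reversed_hyps = torch.where(seq_mask, reversed_hyps, torch.full_like(reversed_hyps, eos))
print(reversed_hyps)  # tensor([[ 3,  2,  1], [ 4,  8,  9], [ 2, 10, 10]])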
- - """ - def __init__(self, n_head: int, n_feat: int, dropout_rate: float): - """Construct an MultiHeadedAttention object.""" - super().__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward_qkv( - self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor, size - (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor, size - (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor, size - (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. 
- - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - 1.When applying cross attention between decoder and encoder, - the batch padding mask for input is in (#batch, 1, T) shape. - 2.When applying self attention of encoder, - the mask is in (#batch, T, T) shape. - 3.When applying self attention of decoder, - the mask is in (#batch, L, L) shape. - 4.If the different position in decoder see different block - of the encoder, such as Mocha, the passed in mask could be - in (#batch, L, T) shape. But there is no such case in current - Wenet. - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - """ - q, k, v = self.forward_qkv(query, key, value) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. - new_cache = torch.cat((k, v), dim=-1) - - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - return self.forward_attention(v, scores, mask), new_cache - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). 
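# Editor's note: runnable illustration of the export trick described in the
# comments above -- concatenating (and splitting) a zero-length cache tensor is
# a no-op, so the ONNX path can always take the "split and concat" branch even
# for the first chunk.
import torch

a = torch.ones((1, 2, 0, 4))         # empty KV cache fed for the first chunk
b = torch.ones((1, 2, 3, 4))         # freshly computed keys (or values)
c = torch.cat((a, b), dim=2)
assert torch.equal(b, c)             # the empty cache changed nothing

key_cache, value_cache = torch.split(torch.zeros((1, 2, 0, 8)), 4, dim=-1)
assert key_cache.shape == value_cache.shape == (1, 2, 0, 4)   # splitting is safe too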
- zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
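        # [Editor's note: annotation, not part of the original wenet source or of this diff]
        # Cache layout assumed by the split above and the concat that follows: the last
        # dimension packs the key and value halves side by side (d_k * 2), so
        # torch.split(cache, cache.size(-1) // 2, dim=-1) recovers (key_cache, value_cache),
        # and new_cache re-packs the updated k / v the same way for the next chunk.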
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/cmvn.py deleted file mode 100644 index 3a1e7457fd3788d9a7e031e96517505a65925102..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/cmvn.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -class GlobalCMVN(torch.nn.Module): - def __init__(self, - mean: torch.Tensor, - istd: torch.Tensor, - norm_var: bool = True): - """ - Args: - mean (torch.Tensor): mean stats - istd (torch.Tensor): inverse std, std which is 1.0 / std - """ - super().__init__() - assert mean.shape == istd.shape - self.norm_var = norm_var - # The buffer can be accessed from this module using self.mean - self.register_buffer("mean", mean) - self.register_buffer("istd", istd) - - def forward(self, x: torch.Tensor): - """ - Args: - x (torch.Tensor): (batch, max_len, feat_dim) - - Returns: - (torch.Tensor): normalized feature - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/convolution.py deleted file mode 100644 index 2cf9794e14ea7441ccd30ab52202ac02fb25c2b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/convolution.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
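# [Editor's note] Minimal illustrative sketch, not part of the deleted files and not
# applied by this diff: what the GlobalCMVN module shown above computes. The feature
# dimension and the mean / istd tensors below are made-up placeholders; in wenet they
# come from the global_cmvn stats file.
import torch

feat_dim = 80
mean = torch.zeros(feat_dim)           # per-dimension mean (placeholder values)
istd = torch.ones(feat_dim)            # per-dimension 1.0 / std (placeholder values)

fbank = torch.randn(4, 100, feat_dim)  # (batch, frames, feat_dim)
normalized = (fbank - mean) * istd     # mirrors GlobalCMVN.forward with norm_var=True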
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). 
- """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. - new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/ctc.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/ctc.py deleted file mode 100644 index 3dfcbaa324ffc26afa9ceaeb75007eb312546326..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/ctc.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -import torch -import torch.nn.functional as F -from typeguard import check_argument_types - - -class CTC(torch.nn.Module): - """CTC module""" - def __init__( - self, - odim: int, - encoder_output_size: int, - dropout_rate: float = 0.0, - reduce: bool = True, - ): - """ Construct CTC module - Args: - odim: dimension of outputs - encoder_output_size: number of encoder projection units - dropout_rate: dropout rate (0.0 ~ 1.0) - reduce: reduce the CTC loss into a scalar - """ - assert check_argument_types() - super().__init__() - eprojs = encoder_output_size - self.dropout_rate = dropout_rate - self.ctc_lo = torch.nn.Linear(eprojs, odim) - - reduction_type = "sum" if reduce else "none" - self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) - - def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, - ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: - """Calculate CTC loss. 
- - Args: - hs_pad: batch of padded hidden state sequences (B, Tmax, D) - hlens: batch of lengths of hidden state sequences (B) - ys_pad: batch of padded character id sequence tensor (B, Lmax) - ys_lens: batch of lengths of character sequence (B) - """ - # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) - ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) - # ys_hat: (B, L, D) -> (L, B, D) - ys_hat = ys_hat.transpose(0, 1) - ys_hat = ys_hat.log_softmax(2) - loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) - # Batch-size average - loss = loss / ys_hat.size(1) - return loss - - def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """log_softmax of frame activations - - Args: - Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) - """ - return F.log_softmax(self.ctc_lo(hs_pad), dim=2) - - def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """argmax of frame activations - - Args: - torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: argmax applied 2d tensor (B, Tmax) - """ - return torch.argmax(self.ctc_lo(hs_pad), dim=2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/decoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/decoder.py deleted file mode 100644 index c31853d9e868c99290b8d597f53d9a680202c82c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/decoder.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Decoder definition.""" -from typing import Tuple, List, Optional - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.decoder_layer import DecoderLayer -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.utils.mask import (subsequent_mask, make_pad_mask) - - -class TransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. 
- concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__() - attention_dim = encoder_output_size - - if input_layer == "embed": - self.embed = torch.nn.Sequential( - torch.nn.Embedding(vocab_size, attention_dim), - PositionalEncoding(attention_dim, positional_dropout_rate), - ) - else: - raise ValueError(f"only 'embed' is supported: {input_layer}") - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) - self.use_output_layer = use_output_layer - self.output_layer = torch.nn.Linear(attention_dim, vocab_size) - self.num_blocks = num_blocks - self.decoders = torch.nn.ModuleList([ - DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(self.num_blocks) - ]) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor = torch.empty(0), - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: not used in transformer decoder, in order to unify api - with bidirectional decoder - reverse_weight: not used in transformer decoder, in order to unify - api with bidirectional decode - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - torch.tensor(0.0), in order to unify api with bidirectional decoder - olens: (batch, ) - """ - tgt = ys_in_pad - maxlen = tgt.size(1) - # tgt_mask: (B, 1, L) - tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) - tgt_mask = tgt_mask.to(tgt.device) - # m: (1, L, L) - m = subsequent_mask(tgt_mask.size(-1), - device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m - x, _ = self.embed(tgt) - for layer in self.decoders: - x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, - memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.use_output_layer: - x = self.output_layer(x) - olens = tgt_mask.sum(1) - return x, torch.tensor(0.0), olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - x, _ = self.embed(tgt) - new_cache = [] - for i, decoder in enumerate(self.decoders): - if cache is None: - c = None - else: - c = cache[i] - x, tgt_mask, memory, memory_mask = decoder(x, - tgt_mask, - memory, - memory_mask, - cache=c) - new_cache.append(x) - if self.normalize_before: - y = self.after_norm(x[:, -1]) - else: - y = x[:, -1] - if self.use_output_layer: - y = torch.log_softmax(self.output_layer(y), dim=-1) - return y, new_cache - - -class BiTransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - r_num_blocks: the number of right to left decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - r_num_blocks: int = 0, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - - assert check_argument_types() - super().__init__() - self.left_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - self.right_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - r_num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor, - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), - used for right to left decoder - reverse_weight: used for right to left decoder - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - r_x: x: decoded token score (right to left decoder) - before softmax (batch, maxlen_out, vocab_size) - if use_output_layer is True, - olens: (batch, ) - """ - l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, - ys_in_lens) - r_x = torch.tensor(0.0) - if reverse_weight > 0.0: - r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, - ys_in_lens) - return l_x, r_x, olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - return self.left_decoder.forward_one_step(memory, memory_mask, tgt, - tgt_mask, cache) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/decoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/decoder_layer.py deleted file mode 100644 index 6b52aa6ab730dc51b18f0787e8236ab10c1e9cad..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/decoder_layer.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoder self-attention layer definition.""" -from typing import Optional, Tuple - -import torch -from torch import nn - - -class DecoderLayer(nn.Module): - """Single decoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn (torch.nn.Module): Inter-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. 
- dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's inpu - and output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: nn.Module, - src_attn: nn.Module, - feed_forward: nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an DecoderLayer object.""" - super().__init__() - self.size = size - self.self_attn = self_attn - self.src_attn = src_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.norm3 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear1 = nn.Linear(size + size, size) - self.concat_linear2 = nn.Linear(size + size, size) - else: - self.concat_linear1 = nn.Identity() - self.concat_linear2 = nn.Identity() - - def forward( - self, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - memory: torch.Tensor, - memory_mask: torch.Tensor, - cache: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute decoded features. - - Args: - tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). - tgt_mask (torch.Tensor): Mask for input tensor - (#batch, maxlen_out). - memory (torch.Tensor): Encoded memory - (#batch, maxlen_in, size). - memory_mask (torch.Tensor): Encoded memory mask - (#batch, maxlen_in). - cache (torch.Tensor): cached tensors. - (#batch, maxlen_out - 1, size). - - Returns: - torch.Tensor: Output tensor (#batch, maxlen_out, size). - torch.Tensor: Mask for output tensor (#batch, maxlen_out). - torch.Tensor: Encoded memory (#batch, maxlen_in, size). - torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
- - """ - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - - if cache is None: - tgt_q = tgt - tgt_q_mask = tgt_mask - else: - # compute only the last frame query keeping dim: max_time_out -> 1 - assert cache.shape == ( - tgt.shape[0], - tgt.shape[1] - 1, - self.size, - ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" - tgt_q = tgt[:, -1:, :] - residual = residual[:, -1:, :] - tgt_q_mask = tgt_mask[:, -1:, :] - - if self.concat_after: - tgt_concat = torch.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) - x = residual + self.concat_linear1(tgt_concat) - else: - x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - if self.concat_after: - x_concat = torch.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) - x = residual + self.concat_linear2(x_concat) - else: - x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) - if not self.normalize_before: - x = self.norm2(x) - - residual = x - if self.normalize_before: - x = self.norm3(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm3(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, tgt_mask, memory, memory_mask diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/embedding.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/embedding.py deleted file mode 100644 index 611a927864d93c3ad8357f66c780bf537b2a4d67..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/embedding.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Positonal Encoding Module.""" - -import math -from typing import Tuple, Union - -import torch -import torch.nn.functional as F - -class PositionalEncoding(torch.nn.Module): - """Positional encoding. 
- - :param int d_model: embedding dim - :param float dropout_rate: dropout rate - :param int max_len: maximum input length - - PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) - PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) - """ - def __init__(self, - d_model: int, - dropout_rate: float, - max_len: int = 5000, - reverse: bool = False): - """Construct an PositionalEncoding object.""" - super().__init__() - self.d_model = d_model - self.xscale = math.sqrt(self.d_model) - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - - self.pe = torch.zeros(self.max_len, self.d_model) - position = torch.arange(0, self.max_len, - dtype=torch.float32).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = torch.sin(position * div_term) - self.pe[:, 1::2] = torch.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input. Its shape is (batch, time, ...) - offset (int, torch.tensor): position offset - - Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) - torch.Tensor: for compatibility to RelPositionalEncoding - """ - - self.pe = self.pe.to(x.device) - pos_emb = self.position_encoding(offset, x.size(1), False) - x = x * self.xscale + pos_emb - return self.dropout(x), self.dropout(pos_emb) - - def position_encoding(self, offset: Union[int, torch.Tensor], size: int, - apply_dropout: bool = True) -> torch.Tensor: - """ For getting encoding in a streaming fashion - - Attention!!!!! - we apply dropout only once at the whole utterance level in a none - streaming way, but will call this function several times with - increasing input size in a streaming scenario, so the dropout will - be applied several times. - - Args: - offset (int or torch.tensor): start offset - size (int): required size of position encoding - - Returns: - torch.Tensor: Corresponding encoding - """ - # How to subscript a Union type: - # https://github.com/pytorch/pytorch/issues/69434 - if isinstance(offset, int): - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - else: # for batched streaming decoding on GPU - assert torch.max(offset) + size < self.max_len - index = offset.unsqueeze(1) + \ - torch.arange(0, size).to(offset.device) # B X T - flag = index > 0 - # remove negative offset - index = index * flag - pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model - - if apply_dropout: - pos_emb = self.dropout(pos_emb) - return pos_emb - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding module. - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - """ - def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): - """Initialize class.""" - super().__init__(d_model, dropout_rate, max_len, reverse=True) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Compute positional encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). 
- Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Positional embedding tensor (1, time, `*`). - """ - self.pe = self.pe.to(x.device) - x = x * self.xscale - pos_emb = self.position_encoding(offset, x.size(1), False) - return self.dropout(x), self.dropout(pos_emb) - - -class NoPositionalEncoding(torch.nn.Module): - """ No position encoding - """ - def __init__(self, d_model: int, dropout_rate: float): - super().__init__() - self.d_model = d_model - self.dropout = torch.nn.Dropout(p=dropout_rate) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """ Just return zero vector for interface compatibility - """ - pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) - return self.dropout(x), pos_emb - - def position_encoding( - self, offset: Union[int, torch.Tensor], size: int) -> torch.Tensor: - return torch.zeros(1, size, self.d_model) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/encoder.py deleted file mode 100644 index bb2ec65827548bd1242cb3b367cb3983c2de6119..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/encoder.py +++ /dev/null @@ -1,462 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
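# [Editor's note] Standalone sketch, not part of the deleted files and not applied by
# this diff: the sinusoidal table built by the PositionalEncoding module shown above,
# i.e. PE(pos, 2i) = sin(pos / 10000^(2i/d_model)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model)).
# d_model and max_len are chosen here only for illustration.
import math
import torch

d_model, max_len = 256, 5000
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32)
                     * -(math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0)  # (1, max_len, d_model); sliced as pe[:, offset:offset + size] at runtime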
-# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder definition.""" -from typing import Tuple - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.convolution import ConvolutionModule -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.encoder_layer import TransformerEncoderLayer -from wenet.transformer.encoder_layer import ConformerEncoderLayer -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class BaseEncoder(torch.nn.Module): - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ - Args: - input_size (int): input dim - output_size (int): dimension of attention - attention_heads (int): the number of heads of multi head attention - linear_units (int): the hidden units number of position-wise feed - forward - num_blocks (int): the number of decoder blocks - dropout_rate (float): dropout rate - attention_dropout_rate (float): dropout rate in attention - positional_dropout_rate (float): dropout rate after adding - positional encoding - input_layer (str): input layer type. - optional [linear, conv2d, conv2d6, conv2d8] - pos_enc_layer_type (str): Encoder positional encoding layer type. - opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] - normalize_before (bool): - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after (bool): whether to concat attention layer's input - and output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - static_chunk_size (int): chunk size for static chunk training and - decoding - use_dynamic_chunk (bool): whether use dynamic chunk size for - training or not, You can only use fixed chunk(chunk_size > 0) - or dyanmic chunk size(use_dynamic_chunk = True) - global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module - use_dynamic_left_chunk (bool): whether use dynamic left chunk in - dynamic chunk training - """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks - - -class TransformerEncoder(BaseEncoder): - """Transformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ Construct TransformerEncoder - - See Encoder for the meaning of each parameter. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - self.encoders = torch.nn.ModuleList([ - TransformerEncoderLayer( - output_size, - MultiHeadedAttention(attention_heads, output_size, - attention_dropout_rate), - PositionwiseFeedForward(output_size, linear_units, - dropout_rate), dropout_rate, - normalize_before, concat_after) for _ in range(num_blocks) - ]) - - -class ConformerEncoder(BaseEncoder): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - positionwise_conv_kernel_size: int = 1, - macaron_style: bool = True, - selfattention_layer_type: str = "rel_selfattn", - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - ): - """Construct ConformerEncoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - positionwise_conv_kernel_size (int): Kernel size of positionwise - conv1d layer. - macaron_style (bool): Whether to use macaron style for - positionwise layer. - selfattention_layer_type (str): Encoder attention layer type, - the parameter has no effect now, it's just for configure - compatibility. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, - cnn_module_norm, causal) - - self.encoders = torch.nn.ModuleList([ - ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(num_blocks) - ]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/encoder_layer.py deleted file mode 100644 index 6b4629a6802a90422fa1494f82f46488f2553c16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/encoder_layer.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple - -import torch -from torch import nn - - -class TransformerEncoderLayer(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: torch.nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): just for interface compatibility - to ConformerEncoderLayer - mask_pad (torch.Tensor): does not used in transformer layer, - just for unified api with conformer. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2), not used here, it's for interface - compatibility to ConformerEncoderLayer. - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). - - """ - residual = x - if self.normalize_before: - x = self.norm1(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, cache=att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - return x, mask, new_att_cache, fake_cnn_cache - - -class ConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/label_smoothing_loss.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/label_smoothing_loss.py deleted file mode 100644 index 428fedcb0eb4345cd1361c97008a9afcd94ac171..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/label_smoothing_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Label smoothing module.""" - -import torch -from torch import nn - - -class LabelSmoothingLoss(nn.Module): - """Label-smoothing loss. - - In a standard CE loss, the label's data distribution is: - [0,1,2] -> - [ - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0], - ] - - In the smoothing version CE Loss,some probabilities - are taken from the true label prob (1.0) and are divided - among other labels. - - e.g. 
- smoothing=0.1 - [0,1,2] -> - [ - [0.9, 0.05, 0.05], - [0.05, 0.9, 0.05], - [0.05, 0.05, 0.9], - ] - - Args: - size (int): the number of class - padding_idx (int): padding class id which will be ignored for loss - smoothing (float): smoothing rate (0.0 means the conventional CE) - normalize_length (bool): - normalize loss by sequence length if True - normalize loss by batch size if False - """ - def __init__(self, - size: int, - padding_idx: int, - smoothing: float, - normalize_length: bool = False): - """Construct an LabelSmoothingLoss object.""" - super(LabelSmoothingLoss, self).__init__() - self.criterion = nn.KLDivLoss(reduction="none") - self.padding_idx = padding_idx - self.confidence = 1.0 - smoothing - self.smoothing = smoothing - self.size = size - self.normalize_length = normalize_length - - def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """Compute loss between x and target. - - The model outputs and data labels tensors are flatten to - (batch*seqlen, class) shape and a mask is applied to the - padding part which should not be calculated for loss. - - Args: - x (torch.Tensor): prediction (batch, seqlen, class) - target (torch.Tensor): - target signal masked with self.padding_id (batch, seqlen) - Returns: - loss (torch.Tensor) : The KL loss, scalar float value - """ - assert x.size(2) == self.size - batch_size = x.size(0) - x = x.view(-1, self.size) - target = target.view(-1) - # use zeros_like instead of torch.no_grad() for true_dist, - # since no_grad() can not be exported by JIT - true_dist = torch.zeros_like(x) - true_dist.fill_(self.smoothing / (self.size - 1)) - ignore = target == self.padding_idx # (B,) - total = len(target) - ignore.sum().item() - target = target.masked_fill(ignore, 0) # avoid -1 index - true_dist.scatter_(1, target.unsqueeze(1), self.confidence) - kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) - denom = total if self.normalize_length else batch_size - return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/positionwise_feed_forward.py deleted file mode 100644 index 73ba239e3f1e68f65650961f2c4ee6758729a06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/positionwise_feed_forward.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. 
- dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU()): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/subsampling.py deleted file mode 100644 index 5f2823eedf0e623188d6af6680fa50ca44b47877..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/subsampling.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch - - -class BaseSubsampling(torch.nn.Module): - def __init__(self): - super().__init__() - self.right_context = 0 - self.subsampling_rate = 1 - - def position_encoding(self, offset: Union[int, torch.Tensor], - size: int) -> torch.Tensor: - return self.pos_enc.position_encoding(offset, size) - - -class LinearNoSubsampling(BaseSubsampling): - """Linear transform the input without subsampling - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an linear object.""" - super().__init__() - self.out = torch.nn.Sequential( - torch.nn.Linear(idim, odim), - torch.nn.LayerNorm(odim, eps=1e-5), - torch.nn.Dropout(dropout_rate), - ) - self.pos_enc = pos_enc_class - self.right_context = 0 - self.subsampling_rate = 1 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Input x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: linear input tensor (#batch, time', odim), - where time' = time . - torch.Tensor: linear input mask (#batch, 1, time'), - where time' = time . - - """ - x = self.out(x) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask - - -class Conv2dSubsampling4(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). 
- - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 4. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 4. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] - - -class Conv2dSubsampling6(BaseSubsampling): - """Convolutional 2D subsampling (to 1/6 length). - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (torch.nn.Module): Custom position encoding layer. - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling6 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 5, 3), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), - odim) - self.pos_enc = pos_enc_class - # 10 = (3 - 1) * 1 + (5 - 1) * 2 - self.subsampling_rate = 6 - self.right_context = 10 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 6. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 6. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] - - -class Conv2dSubsampling8(BaseSubsampling): - """Convolutional 2D subsampling (to 1/8 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling8 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear( - odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) - self.pos_enc = pos_enc_class - self.subsampling_rate = 8 - # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 - self.right_context = 14 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 8. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 8. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/swish.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/swish.py deleted file mode 100644 index b4250f5c93104f38958d145572e363256e03fcb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/transformer/swish.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) -# 2020 Northwestern Polytechnical University (Pengcheng Guo) -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Swish() activation function for Conformer.""" - -import torch - - -class Swish(torch.nn.Module): - """Construct an Swish object.""" - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Return Swish activation function.""" - return x * torch.sigmoid(x) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/checkpoint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/checkpoint.py deleted file mode 100644 index 8e0c413c79c34cd667240357d7ef9eab816a885c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/checkpoint.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import re - -import yaml -import torch -from collections import OrderedDict - -import datetime - - -def load_checkpoint(model: torch.nn.Module, path: str) -> dict: - if torch.cuda.is_available(): - logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) - checkpoint = torch.load(path) - else: - logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) - checkpoint = torch.load(path, map_location='cpu') - model.load_state_dict(checkpoint, strict=False) - info_path = re.sub('.pt$', '.yaml', path) - configs = {} - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - return configs - - -def save_checkpoint(model: torch.nn.Module, path: str, infos=None): - ''' - Args: - infos (dict or None): any info you want to save. - ''' - logging.info('Checkpoint: save to checkpoint %s' % path) - if isinstance(model, torch.nn.DataParallel): - state_dict = model.module.state_dict() - elif isinstance(model, torch.nn.parallel.DistributedDataParallel): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save(state_dict, path) - info_path = re.sub('.pt$', '.yaml', path) - if infos is None: - infos = {} - infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') - with open(info_path, 'w') as fout: - data = yaml.dump(infos) - fout.write(data) - - -def filter_modules(model_state_dict, modules): - new_mods = [] - incorrect_mods = [] - mods_model = model_state_dict.keys() - for mod in modules: - if any(key.startswith(mod) for key in mods_model): - new_mods += [mod] - else: - incorrect_mods += [mod] - if incorrect_mods: - logging.warning( - "module(s) %s don't match or (partially match) " - "available modules in model.", - incorrect_mods, - ) - logging.warning("for information, the existing modules in model are:") - logging.warning("%s", mods_model) - - return new_mods - - -def load_trained_modules(model: torch.nn.Module, args: None): - # Load encoder modules with pre-trained model(s). 
- enc_model_path = args.enc_init - enc_modules = args.enc_init_mods - main_state_dict = model.state_dict() - logging.warning("model(s) found for pre-initialization") - if os.path.isfile(enc_model_path): - logging.info('Checkpoint: loading from checkpoint %s for CPU' % - enc_model_path) - model_state_dict = torch.load(enc_model_path, map_location='cpu') - modules = filter_modules(model_state_dict, enc_modules) - partial_state_dict = OrderedDict() - for key, value in model_state_dict.items(): - if any(key.startswith(m) for m in modules): - partial_state_dict[key] = value - main_state_dict.update(partial_state_dict) - else: - logging.warning("model was not found : %s", enc_model_path) - - model.load_state_dict(main_state_dict) - configs = {} - return configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/cmvn.py deleted file mode 100644 index 3101c619f54991c947124f393f3459c317356a2f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/cmvn.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import math - -import numpy as np - - -def _load_json_cmvn(json_cmvn_file): - """ Load the json format cmvn stats file and calculate cmvn - - Args: - json_cmvn_file: cmvn stats file in json format - - Returns: - a numpy array of [means, vars] - """ - with open(json_cmvn_file) as f: - cmvn_stats = json.load(f) - - means = cmvn_stats['mean_stat'] - variance = cmvn_stats['var_stat'] - count = cmvn_stats['frame_num'] - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def _load_kaldi_cmvn(kaldi_cmvn_file): - """ Load the kaldi format cmvn stats file and calculate cmvn - - Args: - kaldi_cmvn_file: kaldi text style global cmvn file, which - is generated by: - compute-cmvn-stats --binary=false scp:feats.scp global_cmvn - - Returns: - a numpy array of [means, vars] - """ - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def load_cmvn(cmvn_file, is_json): - if is_json: - cmvn = _load_json_cmvn(cmvn_file) - else: - cmvn = _load_kaldi_cmvn(cmvn_file) - return cmvn[0], cmvn[1] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/common.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/common.py deleted file mode 100644 index 74238d59aefbf227fe6b811703af17550bc7f8f0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/common.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -"""Unility functions for Transformer.""" - -import math -from typing import List, Tuple - -import torch -from torch.nn.utils.rnn import pad_sequence - -IGNORE_ID = -1 - - -def pad_list(xs: List[torch.Tensor], pad_value: int): - """Perform padding for the list of tensors. - - Args: - xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 
- pad_value (float): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tmax, `*`). - - Examples: - >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) - - """ - n_batch = len(xs) - max_len = max([x.size(0) for x in xs]) - pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) - pad = pad.fill_(pad_value) - for i in range(n_batch): - pad[i, :xs[i].size(0)] = xs[i] - - return pad - - -def add_blank(ys_pad: torch.Tensor, blank: int, - ignore_id: int) -> torch.Tensor: - """ Prepad blank for transducer predictor - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - blank (int): index of - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> blank = 0 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in = add_blank(ys_pad, 0, -1) - >>> ys_in - tensor([[0, 1, 2, 3, 4, 5], - [0, 4, 5, 6, 0, 0], - [0, 7, 8, 9, 0, 0]]) - """ - bs = ys_pad.size(0) - _blank = torch.tensor([blank], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _blank = _blank.repeat(bs).unsqueeze(1) # [bs,1] - out = torch.cat([_blank, ys_pad], dim=1) # [bs, Lmax+1] - return torch.where(out == ignore_id, blank, out) - - -def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, - ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Add and labels. - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - sos (int): index of - eos (int): index of - ignore_id (int): index of padding - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - ys_out (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> sos_id = 10 - >>> eos_id = 11 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) - >>> ys_in - tensor([[10, 1, 2, 3, 4, 5], - [10, 4, 5, 6, 11, 11], - [10, 7, 8, 9, 11, 11]]) - >>> ys_out - tensor([[ 1, 2, 3, 4, 5, 11], - [ 4, 5, 6, 11, -1, -1], - [ 7, 8, 9, 11, -1, -1]]) - """ - _sos = torch.tensor([sos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _eos = torch.tensor([eos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys - ys_in = [torch.cat([_sos, y], dim=0) for y in ys] - ys_out = [torch.cat([y, _eos], dim=0) for y in ys] - return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) - - -def reverse_pad_list(ys_pad: torch.Tensor, - ys_lens: torch.Tensor, - pad_value: float = -1.0) -> torch.Tensor: - """Reverse padding for the list of tensors. - - Args: - ys_pad (tensor): The padded tensor (B, Tokenmax). - ys_lens (tensor): The lens of token seqs (B) - pad_value (int): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tokenmax). - - Examples: - >>> x - tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) - >>> pad_list(x, 0) - tensor([[4, 3, 2, 1], - [7, 6, 5, 0], - [9, 8, 0, 0]]) - - """ - r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) - for y, i in zip(ys_pad, ys_lens)], True, - pad_value) - return r_ys_pad - - -def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, - ignore_label: int) -> float: - """Calculate accuracy. - - Args: - pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 
- pad_targets (LongTensor): Target label tensors (B, Lmax). - ignore_label (int): Ignore label id. - - Returns: - float: Accuracy value (0.0 - 1.0). - - """ - pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), - pad_outputs.size(1)).argmax(2) - mask = pad_targets != ignore_label - numerator = torch.sum( - pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - denominator = torch.sum(mask) - return float(numerator) / float(denominator) - - -def get_rnn(rnn_type: str) -> torch.nn.Module: - assert rnn_type in ["rnn", "lstm", "gru"] - if rnn_type == "rnn": - return torch.nn.RNN - elif rnn_type == "lstm": - return torch.nn.LSTM - else: - return torch.nn.GRU - - -def get_activation(act): - """Return activation function.""" - # Lazy load to avoid unused import - from wenet.transformer.swish import Swish - - activation_funcs = { - "hardtanh": torch.nn.Hardtanh, - "tanh": torch.nn.Tanh, - "relu": torch.nn.ReLU, - "selu": torch.nn.SELU, - "swish": getattr(torch.nn, "SiLU", Swish), - "gelu": torch.nn.GELU - } - - return activation_funcs[act]() - - -def get_subsample(config): - input_layer = config["encoder_conf"]["input_layer"] - assert input_layer in ["conv2d", "conv2d6", "conv2d8"] - if input_layer == "conv2d": - return 4 - elif input_layer == "conv2d6": - return 6 - elif input_layer == "conv2d8": - return 8 - - -def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - if hyp[cur] != 0: - new_hyp.append(hyp[cur]) - prev = cur - while cur < len(hyp) and hyp[cur] == hyp[prev]: - cur += 1 - return new_hyp - - -def replace_duplicates_with_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - new_hyp.append(hyp[cur]) - prev = cur - cur += 1 - while cur < len(hyp) and hyp[cur] == hyp[prev] and hyp[cur] != 0: - new_hyp.append(0) - cur += 1 - return new_hyp - - -def log_add(args: List[int]) -> float: - """ - Stable log add - """ - if all(a == -float('inf') for a in args): - return -float('inf') - a_max = max(args) - lsp = math.log(sum(math.exp(a - a_max) for a in args)) - return a_max + lsp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/config.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/config.py deleted file mode 100644 index 50170ced44534d3ee6532a2f87fcd78c5148f7e7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 Shaoshang Qi -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import copy - -def override_config(configs, override_list): - new_configs = copy.deepcopy(configs) - for item in override_list: - arr = item.split() - if len(arr) != 2: - print(f"the overrive {item} format not correct, skip it") - continue - keys = arr[0].split('.') - s_configs = new_configs - for i, key in enumerate(keys): - if key not in s_configs: - print(f"the overrive {item} format not correct, skip it") - if i == len(keys) - 1: - param_type = type(s_configs[key]) - if param_type != bool: - s_configs[key] = param_type(arr[1]) - else: - s_configs[key] = arr[1] in ['true', 'True'] - print(f"override {arr[0]} with {arr[1]}") - else: - s_configs = s_configs[key] - return new_configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/ctc_util.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/ctc_util.py deleted file mode 100644 index 73b8fb272ac153dd6d05207f352ebcf1ad14890d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/ctc_util.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch - -def insert_blank(label, blank_id=0): - """Insert blank token between every two label token.""" - label = np.expand_dims(label, 1) - blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id - label = np.concatenate([blanks, label], axis=1) - label = label.reshape(-1) - label = np.append(label, label[0]) - return label - -def forced_align(ctc_probs: torch.Tensor, - y: torch.Tensor, - blank_id=0) -> list: - """ctc forced alignment. 
- - Args: - torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) - torch.Tensor y: id sequence tensor 1d tensor (L) - int blank_id: blank symbol index - Returns: - torch.Tensor: alignment result - """ - y_insert_blank = insert_blank(y, blank_id) - - log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) - log_alpha = log_alpha - float('inf') # log of zero - state_path = (torch.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 - ) # state path - - # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] - - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): - if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ - s] == y_insert_blank[s - 2]: - candidates = torch.tensor( - [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) - prev_state = [s, s - 1] - else: - candidates = torch.tensor([ - log_alpha[t - 1, s], - log_alpha[t - 1, s - 1], - log_alpha[t - 1, s - 2], - ]) - prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] - state_path[t, s] = prev_state[torch.argmax(candidates)] - - state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) - - candidates = torch.tensor([ - log_alpha[-1, len(y_insert_blank) - 1], - log_alpha[-1, len(y_insert_blank) - 2] - ]) - prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] - state_seq[-1] = prev_state[torch.argmax(candidates)] - for t in range(ctc_probs.size(0) - 2, -1, -1): - state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] - - output_alignment = [] - for t in range(0, ctc_probs.size(0)): - output_alignment.append(y_insert_blank[state_seq[t, 0]]) - - return output_alignment diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/executor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/executor.py deleted file mode 100644 index dc0b69e6e32055566a0e8c41945f6979276e5672..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/executor.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -from contextlib import nullcontext - -# if your python version < 3.7 use the below one -# from contextlib import suppress as nullcontext -import torch -from torch.nn.utils import clip_grad_norm_ - - -class Executor: - - def __init__(self): - self.step = 0 - - def train(self, model, optimizer, scheduler, data_loader, device, writer, - args, scaler): - ''' Train one epoch - ''' - model.train() - clip = args.get('grad_clip', 50.0) - log_interval = args.get('log_interval', 10) - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - accum_grad = args.get('accum_grad', 1) - is_distributed = args.get('is_distributed', True) - use_amp = args.get('use_amp', False) - logging.info('using accumulate grad, new batch size is {} times' - ' larger than before'.format(accum_grad)) - if use_amp: - assert scaler is not None - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - model_context = model.join - else: - model_context = nullcontext - num_seen_utts = 0 - with model_context(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - context = None - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if is_distributed and batch_idx % accum_grad != 0: - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - with context(): - # autocast context - # The more details about amp can be found in - # https://pytorch.org/docs/stable/notes/amp_examples.html - with torch.cuda.amp.autocast(scaler is not None): - loss_dict = model(feats, feats_lengths, target, - target_lengths) - loss = loss_dict['loss'] / accum_grad - if use_amp: - scaler.scale(loss).backward() - else: - loss.backward() - - num_seen_utts += num_utts - if batch_idx % accum_grad == 0: - if rank == 0 and writer is not None: - writer.add_scalar('train_loss', loss, self.step) - # Use mixed precision training - if use_amp: - scaler.unscale_(optimizer) - grad_norm = clip_grad_norm_(model.parameters(), clip) - # Must invoke scaler.update() if unscale_() is used in - # the iteration to avoid the following error: - # RuntimeError: unscale_() has already been called - # on this optimizer since the last update(). - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). 
- scaler.step(optimizer) - scaler.update() - else: - grad_norm = clip_grad_norm_(model.parameters(), clip) - if torch.isfinite(grad_norm): - optimizer.step() - optimizer.zero_grad() - scheduler.step() - self.step += 1 - if batch_idx % log_interval == 0: - lr = optimizer.param_groups[0]['lr'] - log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, - loss.item() * accum_grad) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'lr {:.8f} rank {}'.format(lr, rank) - logging.debug(log_str) - - def cv(self, model, data_loader, device, args): - ''' Cross validation on - ''' - model.eval() - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - log_interval = args.get('log_interval', 10) - # in order to avoid division by 0 - num_seen_utts = 1 - total_loss = 0.0 - with torch.no_grad(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - loss_dict = model(feats, feats_lengths, target, target_lengths) - loss = loss_dict['loss'] - if torch.isfinite(loss): - num_seen_utts += num_utts - total_loss += loss.item() * num_utts - if batch_idx % log_interval == 0: - log_str = 'CV Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, loss.item()) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'history loss {:.6f}'.format(total_loss / - num_seen_utts) - log_str += ' rank {}'.format(rank) - logging.debug(log_str) - return total_loss, num_seen_utts diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/file_utils.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/file_utils.py deleted file mode 100644 index 7b7e516cc61f759267f4ef09309ff0b45110a0c1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/file_utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re - - -def read_lists(list_file): - lists = [] - with open(list_file, 'r', encoding='utf8') as fin: - for line in fin: - lists.append(line.strip()) - return lists - - -def read_non_lang_symbols(non_lang_sym_path): - """read non-linguistic symbol from file. - - The file format is like below: - - {NOISE}\n - {BRK}\n - ... - - - Args: - non_lang_sym_path: non-linguistic symbol file path, None means no any - syms. 
- - """ - if non_lang_sym_path is None: - return None - else: - syms = read_lists(non_lang_sym_path) - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - for sym in syms: - if non_lang_syms_pattern.fullmatch(sym) is None: - class BadSymbolFormat(Exception): - pass - raise BadSymbolFormat( - "Non-linguistic symbols should be " - "formatted in {xxx}//[xxx], consider" - " modify '%s' to meet the requirment. " - "More details can be found in discussions here : " - "https://github.com/wenet-e2e/wenet/pull/819" % (sym)) - return syms - - -def read_symbol_table(symbol_table_file): - symbol_table = {} - with open(symbol_table_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - symbol_table[arr[0]] = int(arr[1]) - return symbol_table diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/init_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/init_model.py deleted file mode 100644 index 4a008183ee25cd88b2fa25d93bdc3f9e3a55d31a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/init_model.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -from wenet.transducer.joint import TransducerJoint -from wenet.transducer.predictor import (ConvPredictor, EmbeddingPredictor, - RNNPredictor) -from wenet.transducer.transducer import Transducer -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.cmvn import GlobalCMVN -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.encoder import ConformerEncoder, TransformerEncoder -from wenet.squeezeformer.encoder import SqueezeformerEncoder -from wenet.efficient_conformer.encoder import EfficientConformerEncoder -from wenet.utils.cmvn import load_cmvn - - -def init_model(configs): - if configs['cmvn_file'] is not None: - mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) - global_cmvn = GlobalCMVN( - torch.from_numpy(mean).float(), - torch.from_numpy(istd).float()) - else: - global_cmvn = None - - input_dim = configs['input_dim'] - vocab_size = configs['output_dim'] - - encoder_type = configs.get('encoder', 'conformer') - decoder_type = configs.get('decoder', 'bitransformer') - - if encoder_type == 'conformer': - encoder = ConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'squeezeformer': - encoder = SqueezeformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'efficientConformer': - encoder = EfficientConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf'], - **configs['encoder_conf']['efficient_conf'] - if 'efficient_conf' in - configs['encoder_conf'] else {}) - else: - encoder = TransformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - if decoder_type == 'transformer': - decoder = TransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - else: - assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 - assert configs['decoder_conf']['r_num_blocks'] > 0 - decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - ctc = CTC(vocab_size, encoder.output_size()) - - # Init joint CTC/Attention or Transducer model - if 'predictor' in configs: - predictor_type = configs.get('predictor', 'rnn') - if predictor_type == 'rnn': - predictor = RNNPredictor(vocab_size, **configs['predictor_conf']) - elif predictor_type == 'embedding': - predictor = EmbeddingPredictor(vocab_size, - **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - elif predictor_type == 'conv': - predictor = ConvPredictor(vocab_size, **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - else: - raise NotImplementedError( - "only rnn, embedding and conv type support now") - configs['joint_conf']['enc_output_size'] = configs['encoder_conf'][ - 'output_size'] - configs['joint_conf']['pred_output_size'] = configs['predictor_conf'][ - 'output_size'] - joint = TransducerJoint(vocab_size, **configs['joint_conf']) - model = Transducer(vocab_size=vocab_size, - blank=0, - predictor=predictor, - encoder=encoder, - attention_decoder=decoder, - joint=joint, - ctc=ctc, - **configs['model_conf']) - else: - model = ASRModel(vocab_size=vocab_size, - encoder=encoder, - decoder=decoder, - ctc=ctc, - **configs['model_conf']) - return model diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/mask.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/mask.py deleted file mode 100644 index 2985006ab2bc2d27a9b8adaeb863cc44ca6a0d24..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/mask.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -''' -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. - - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - ret = torch.ones(size, size, device=device, dtype=torch.bool) - return torch.tril(ret) -''' - -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. 
- - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - arange = torch.arange(size, device=device) - mask = arange.expand(size, size) - arange = arange.unsqueeze(-1) - mask = mask <= arange - return mask - - -def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size) with chunk size, - this is for streaming encoder - - Args: - size (int): size of mask - chunk_size (int): size of chunk - num_left_chunks (int): number of left chunks - <0: use full chunk - >=0: use num_left_chunks - device (torch.device): "cpu" or "cuda" or torch.Tensor.device - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_chunk_mask(4, 2) - [[1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]] - """ - ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) - ret[i, start:ending] = True - return ret - - -def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, - use_dynamic_chunk: bool, - use_dynamic_left_chunk: bool, - decoding_chunk_size: int, static_chunk_size: int, - num_decoding_left_chunks: int): - """ Apply optional mask for encoder. - - Args: - xs (torch.Tensor): padded input, (B, L, D), L for max length - mask (torch.Tensor): mask for xs, (B, 1, L) - use_dynamic_chunk (bool): whether to use dynamic chunk or not - use_dynamic_left_chunk (bool): whether to use dynamic left chunk for - training. - decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - static_chunk_size (int): chunk size for static chunk training/decoding - if it's greater than 0, if use_dynamic_chunk is true, - this parameter will be ignored - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. - >=0: use num_decoding_left_chunks - <0: use all left chunks - - Returns: - torch.Tensor: chunk mask of the input xs. - """ - # Whether to use chunk mask or not - if use_dynamic_chunk: - max_len = xs.size(1) - if decoding_chunk_size < 0: - chunk_size = max_len - num_left_chunks = -1 - elif decoding_chunk_size > 0: - chunk_size = decoding_chunk_size - num_left_chunks = num_decoding_left_chunks - else: - # chunk size is either [1, 25] or full context(max_len). - # Since we use 4 times subsampling and allow up to 1s(100 frames) - # delay, the maximum frame is 100 / 4 = 25. 
- chunk_size = torch.randint(1, max_len, (1, )).item() - num_left_chunks = -1 - if chunk_size > max_len // 2: - chunk_size = max_len - else: - chunk_size = chunk_size % 25 + 1 - if use_dynamic_left_chunk: - max_left_chunks = (max_len - 1) // chunk_size - num_left_chunks = torch.randint(0, max_left_chunks, - (1, )).item() - chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - elif static_chunk_size > 0: - num_left_chunks = num_decoding_left_chunks - chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - else: - chunk_masks = masks - return chunk_masks - - -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: Mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - batch_size = lengths.size(0) - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = torch.arange(0, - max_len, - dtype=torch.int64, - device=lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand - return mask - - -def make_non_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """Make mask tensor containing indices of non-padded part. - - The sequences in a batch may have different lengths. To enable - batch computing, padding is need to make all sequence in same - size. To avoid the padding part pass value to context dependent - block such as attention or convolution , this padding part is - masked. - - This pad_mask is used in both encoder and decoder. - - 1 for non-padded part and 0 for padded part. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] - """ - return ~make_pad_mask(lengths) - - -def mask_finished_scores(score: torch.Tensor, - flag: torch.Tensor) -> torch.Tensor: - """ - If a sequence is finished, we only allow one alive branch. This function - aims to give one branch a zero score and the rest -inf score. - - Args: - score (torch.Tensor): A real value array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size, beam_size). 
- """ - beam_size = score.size(-1) - zero_mask = torch.zeros_like(flag, dtype=torch.bool) - if beam_size > 1: - unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])), - dim=1) - finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])), - dim=1) - else: - unfinished = zero_mask - finished = flag - score.masked_fill_(unfinished, -float('inf')) - score.masked_fill_(finished, 0) - return score - - -def mask_finished_preds(pred: torch.Tensor, flag: torch.Tensor, - eos: int) -> torch.Tensor: - """ - If a sequence is finished, all of its branch should be - - Args: - pred (torch.Tensor): A int array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size). - """ - beam_size = pred.size(-1) - finished = flag.repeat([1, beam_size]) - return pred.masked_fill_(finished, eos) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/scheduler.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/scheduler.py deleted file mode 100644 index c418a731dec0041a238787bbba23102dba8db5e5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/utils/scheduler.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -# NeMo(https://github.com/NVIDIA/NeMo) - -from typing import Union - -import math -import warnings -import torch -from torch.optim.lr_scheduler import _LRScheduler - -from typeguard import check_argument_types - - -class WarmupLR(_LRScheduler): - """The WarmupLR scheduler - - This scheduler is almost same as NoamLR Scheduler except for following - difference: - - NoamLR: - lr = optimizer.lr * model_size ** -0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - WarmupLR: - lr = optimizer.lr * warmup_step ** 0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - - Note that the maximum lr equals to optimizer.lr in this scheduler. 
- - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_steps: Union[int, float] = 25000, - last_epoch: int = -1, - ): - assert check_argument_types() - self.warmup_steps = warmup_steps - - # __init__() must be invoked before setting field - # because step() is also invoked in __init__() - super().__init__(optimizer, last_epoch) - - def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" - - def get_lr(self): - step_num = self.last_epoch + 1 - if self.warmup_steps == 0: - return [ - lr * step_num ** -0.5 - for lr in self.base_lrs - ] - else: - return [ - lr - * self.warmup_steps ** 0.5 - * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) - for lr in self.base_lrs - ] - - def set_step(self, step: int): - self.last_epoch = step - - -class WarmupPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1): - assert not (warmup_steps is not None and warmup_ratio is not None),\ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class SquareRootConstantPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use particular number of step or ratio" - assert constant_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. 
- self.max_steps = max_steps - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.constant_lr = 1 / (constant_steps ** 0.5) - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.constant_steps: - return [self.constant_lr for _ in self.base_lrs] - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class WarmupHoldPolicy(WarmupPolicy): - """Variant of WarmupPolicy which maintains high - learning rate for a defined number of steps. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - hold_steps=None, - hold_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (hold_steps is not None and hold_ratio is not None), \ - "Either use particular number of step or ratio" - assert hold_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - self.min_lr = min_lr - self._last_warmup_lr = 0.0 - - # Necessary to duplicate as class attributes are hidden in inner class - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if hold_steps is not None: - self.hold_steps = hold_steps + self.warmup_steps - elif hold_ratio is not None: - self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps - else: - self.hold_steps = 0 - - super().__init__( - optimizer, - warmup_steps=warmup_steps, - warmup_ratio=warmup_ratio, - max_steps=max_steps, - last_epoch=last_epoch, - min_lr=min_lr, - ) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed by the scheduler," - " " "please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup phase - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - # Hold phase - if (step >= self.warmup_steps) and (step < self.hold_steps): - return self.base_lrs - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - -class WarmupAnnealHoldPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - min_lr: Minimum lr to hold the learning rate after decay at. - constant_steps: Number of steps to keep lr constant at. 
- constant_ratio: Ratio of steps to keep lr constant. - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - constant_steps=None, - constant_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use constant_steps or constant_ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup steps - if self.warmup_steps > 0 and step <= self.warmup_steps: - return self._get_warmup_lr(step) - - # Constant steps after warmup and decay - if self.constant_steps > 0 and ( - self.warmup_steps + self.decay_steps) < step <= self.max_steps: - return self._get_constant_lr(step) - - # Min lr after max steps of updates - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_constant_lr(self, step): - return [self.min_lr for _ in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -def _squareroot_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 0.5 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _square_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 2 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _cosine_annealing(initial_lr, step, max_steps, min_lr): - mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) - out_lr = (initial_lr - min_lr) * mult + min_lr - return out_lr - - -def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, - decay_steps, min_lr): - assert max_lr > min_lr - # Use linear warmup for the initial part. - if warmup_steps > 0 and step <= warmup_steps: - return max_lr * float(step) / float(warmup_steps) - - # For any steps larger than `decay_steps`, use `min_lr`. - if step > warmup_steps + decay_steps: - return min_lr - - # If we are done with the warmup period, use the decay style. 
- num_steps_ = step - warmup_steps - decay_steps_ = decay_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = max_lr - min_lr - - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - - return min_lr + coeff * delta_lr - - -def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): - if cycle: - multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) - decay_steps *= multiplier - else: - step = min(step, decay_steps) - p = step / decay_steps - lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) - lr += min_lr - return lr - - -def _noam_hold_annealing(initial_lr, step, warmup_steps, - hold_steps, decay_rate, min_lr): - # hold_steps = total number of steps - # to hold the LR, not the warmup + hold steps. - T_warmup_decay = max(1, warmup_steps ** decay_rate) - T_hold_decay = max(1, (step - hold_steps) ** decay_rate) - lr = (initial_lr * T_warmup_decay) / T_hold_decay - lr = max(lr, min_lr) - return lr - - -class SquareAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _square_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class SquareRootAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _squareroot_annealing(initial_lr=initial_lr, step=step, - max_steps=self.max_steps, min_lr=self.min_lr) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - if self.constant_steps is None or self.constant_steps == 0: - new_lrs = [ - _cosine_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - else: - new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) - return new_lrs - - def _get_warmup_lr(self, step): - if self.constant_steps is None or self.constant_steps == 0: - return super()._get_warmup_lr(step) - else: - # Use linear warmup for the initial part. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_constant_lr(self, step): - # Only called when `constant_steps` > 0. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_linear_warmup_with_cosine_annealing_lr(self, step): - # Cosine Schedule for Megatron LM, - # slightly different warmup schedule + constant LR at the end. 
- new_lrs = [ - _linear_warmup_with_cosine_annealing( - max_lr=self.base_lrs[0], - warmup_steps=self.warmup_steps, - step=step, - decay_steps=self.decay_steps, - min_lr=self.min_lr, - ) - for _ in self.base_lrs - ] - return new_lrs - - -class NoamAnnealing(_LRScheduler): - def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - self._normalize = d_model ** (-0.5) - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = max(1, self.last_epoch) - - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - new_lrs = [self._noam_annealing(initial_lr=initial_lr, step=step) for - initial_lr in self.base_lrs] - return new_lrs - - def _noam_annealing(self, initial_lr, step): - if self.warmup_steps > 0: - mult = self._normalize * min(step ** (-0.5), - step * (self.warmup_steps ** (-1.5))) - else: - mult = self._normalize * step ** (-0.5) - - out_lr = initial_lr * mult - if step > self.warmup_steps: - out_lr = max(out_lr, self.min_lr) - return out_lr - - -class NoamHoldAnnealing(WarmupHoldPolicy): - def __init__(self, optimizer, *, max_steps, decay_rate=0.5, min_lr=0.0, - last_epoch=-1, **kwargs): - """ - From Nemo: - Implementation of the Noam Hold Annealing policy - from the SqueezeFormer paper. - - Unlike NoamAnnealing, the peak learning rate - can be explicitly set for this scheduler. - The schedule first performs linear warmup, - then holds the peak LR, then decays with some schedule for - the remainder of the steps. - Therefore the min-lr is still dependent - on the hyper parameters selected. - - It's schedule is determined by three factors- - - Warmup Steps: Initial stage, where linear warmup - occurs uptil the peak LR is reached. Unlike NoamAnnealing, - the peak LR is explicitly stated here instead of a scaling factor. - - Hold Steps: Intermediate stage, where the peak LR - is maintained for some number of steps. In this region, - the high peak LR allows the model to converge faster - if training is stable. However the high LR - may also cause instability during training. - Should usually be a significant fraction of training - steps (around 30-40% of the entire training steps). - - Decay Steps: Final stage, where the LR rapidly decays - with some scaling rate (set by decay rate). - To attain Noam decay, use 0.5, - for Squeezeformer recommended decay, use 1.0. - The fast decay after prolonged high LR during - hold phase allows for rapid convergence. 
- - References: - - [Squeezeformer: - An Efficient Transformer for Automatic Speech Recognition] - (https://arxiv.org/abs/2206.00888) - - Args: - optimizer: Pytorch compatible Optimizer object. - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - decay_rate: Float value describing the polynomial decay - after the hold period. Default value - of 0.5 corresponds to Noam decay. - min_lr: Minimum learning rate. - """ - self.decay_rate = decay_rate - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - if self.warmup_steps is None or self.warmup_steps == 0: - raise ValueError( - "Noam scheduler cannot be used without warmup steps") - - if self.hold_steps > 0: - hold_steps = self.hold_steps - self.warmup_steps - else: - hold_steps = 0 - - new_lrs = [ - _noam_hold_annealing( - initial_lr, - step=step, - warmup_steps=self.warmup_steps, - hold_steps=hold_steps, - decay_rate=self.decay_rate, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - def set_step(self, step: int): - self.last_epoch = step diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/README.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/README.md deleted file mode 100644 index ef98864f6b07e6fb0c3b0e7bef57f06d16f2ef79..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/README.md +++ /dev/null @@ -1,59 +0,0 @@ -# Performance Record - -## U2++ Conformer Result - -* Feature info: using fbank feature, with cmvn, no speed perturb, dither -* Training info: lr 0.001, batch size 32, 8 gpus, acc_grad 1, 240 epochs, dither 1.0 -* Decoding info: ctc_weight 0.1, reverse_weight 0.4, average_num 30 -* Git hash: 5a1342312668e7a5abb83aed1e53256819cebf95 - -| decoding mode/chunk size | full | 16 | -|---------------------------|-------|-------| -| ctc greedy search | 6.18 | 6.79 | -| ctc prefix beam search | 6.20 | 6.80 | -| attention rescoring | 5.39 | 5.78 | -| LM + attention rescoring | 5.35 | 5.73 | - -## U2++ Transformer Result - -* Feature info: using fbank feature, with cmvn, no speed perturb -* Training info: lr 0.002, batch size 22, 8 gpus, acc_grad 1, 240 epochs, dither 0.0 -* Decoding info: ctc_weight 0.1, reverse_weight 0.5, average_num 30 -* Git hash: 5a1342312668e7a5abb83aed1e53256819cebf95 - -| decoding mode/chunk size | full | 16 | -|---------------------------|-------|-------| -| ctc greedy search | 7.35 | 8.23 | -| ctc prefix beam search | 7.36 | 8.23 | -| attention rescoring | 6.09 | 6.70 | -| LM + attention rescoring | 6.07 | 6.55 | - -## Unified Conformer Result - -* Feature info: using fbank feature, with cmvn, no speed perturb. 
-* Training info: lr 0.002, batch size 16, 8 gpus, acc_grad 1, 120 epochs, dither 1.0 -* Decoding info: ctc_weight 0.5, average_num 20 -* Git hash: 14d38085a8d966cf9e9577ffafc51d578dce954f - -| decoding mode/chunk size | full | 16 | 8 | 4 | -|---------------------------|-------|-------|-------|-------| -| attention decoder | 6.23 | 6.42 | 6.58 | 7.20 | -| ctc greedy search | 6.98 | 7.75 | 8.21 | 9.91 | -| ctc prefix beam search | 7.02 | 7.76 | 8.21 | 9.93 | -| attention rescoring | 6.08 | 6.46 | 6.72 | 7.79 | -| LM + attention rescoring | 5.87 | 6.37 | 6.47 | 6.61 | - -## Unified Transformer Result - -* Feature info: using fbank feature, with cmvn, no speed perturb. -* Training info: lr 0.002, batch size 22, 8 gpus, acc_grad 1, 180 epochs, dither 0.0 -* Decoding info: ctc_weight 0.5, average_num 30 -* Git hash: 14d38085a8d966cf9e9577ffafc51d578dce954f - -| decoding mode/chunk size | full | 16 | 8 | 4 | -|---------------------------|-------|-------|-------|-------| -| attention decoder | 6.71 | 7.08 | 7.17 | 7.40 | -| ctc greedy search | 7.84 | 8.68 | 8.98 | 9.46 | -| ctc prefix beam search | 7.86 | 8.68 | 8.98 | 9.45 | -| attention rescoring | 6.71 | 7.31 | 7.51 | 7.85 | -| LM + attention rescoring | 6.35 | 7.02 | 7.24 | 7.52 | diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/conf/train_u2++_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/conf/train_u2++_conformer.yaml deleted file mode 100644 index 2680893cf5b8707241908469697a35ce3f5acb3e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/conf/train_u2++_conformer.yaml +++ /dev/null @@ -1,84 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 8 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - -# decoder related -decoder: bitransformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 3 - r_num_blocks: 3 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -# dataset related -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 16 - -grad_clip: 5 -accum_grad: 1 -max_epoch: 240 -log_interval: 100 - 
-optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/conf/train_u2++_transformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/conf/train_u2++_transformer.yaml deleted file mode 100644 index 391c9a65af89fed3d038ff15e6e9bc08b5493142..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/conf/train_u2++_transformer.yaml +++ /dev/null @@ -1,90 +0,0 @@ -# network architecture -# encoder related -encoder: transformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder architecture type - normalize_before: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false - -# decoder related -decoder: bitransformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 3 - r_num_blocks: 3 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -# feature extraction -collate_conf: - # waveform level config - wav_distortion_conf: - wav_dither: 1.0 - wav_distortion_rate: 0.0 - distortion_methods: [] - speed_perturb: false - feature_extraction_conf: - feature_type: 'fbank' - mel_bins: 80 - frame_shift: 10 - -# dataset related -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 16 - -grad_clip: 5 -accum_grad: 1 -max_epoch: 240 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.002 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/conf/train_unified_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/conf/train_unified_conformer.yaml deleted file mode 100644 index 9c907d97a981662f7b1d87e09fbef14c8d1f5bb5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/conf/train_unified_conformer.yaml +++ /dev/null @@ -1,82 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - 
activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -# dataset related -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 16 - -grad_clip: 5 -accum_grad: 1 -max_epoch: 120 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.002 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/conf/train_unified_transformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/conf/train_unified_transformer.yaml deleted file mode 100644 index ecd9f8cfff3677c5e0c56f996923a849fa38346b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/conf/train_unified_transformer.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# network architecture -# encoder related -encoder: transformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder architecture type - normalize_before: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -# feature extraction -collate_conf: - # waveform level config - wav_distortion_conf: - wav_dither: 0.0 - wav_distortion_rate: 0.0 - distortion_methods: [] - speed_perturb: false - feature_extraction_conf: - feature_type: 'fbank' - mel_bins: 80 - frame_shift: 10 - -# dataset related -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' 
# static or dynamic
- batch_size: 16
-
-grad_clip: 5
-accum_grad: 1
-max_epoch: 130
-log_interval: 100
-
-optim: adam
-optim_conf:
- lr: 0.002
-scheduler: warmuplr # pytorch v1.1.0+ required
-scheduler_conf:
- warmup_steps: 25000
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/local/prepare_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/local/prepare_data.sh
deleted file mode 100644
index c1586b87856804bce4a23609f696417deb7d4e79..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/local/prepare_data.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
-# 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
-# Apache 2.0
-
-# transform raw AISHELL-2 data to kaldi format
-
-if [ $# != 3 ]; then
- echo "prepare_data.sh <corpus-data-dir> <tmp-dir> <output-dir>"
- echo " e.g prepare_data.sh /data/AISHELL-2/iOS/train data/local/train data/train"
- exit 1;
-fi
-
-corpus=$1
-tmp=$2
-dir=$3
-
-echo "prepare_data.sh: Preparing data in $corpus"
-
-mkdir -p $tmp
-mkdir -p $dir
-
-# corpus check
-if [ ! -d $corpus ] || [ ! -f $corpus/wav.scp ] || [ ! -f $corpus/trans.txt ]; then
- echo "Error: $0 requires wav.scp and trans.txt under $corpus directory."
- exit 1;
-fi
-
-# validate utt-key list
-awk '{print $1}' $corpus/wav.scp > $tmp/wav_utt.list
-awk '{print $1}' $corpus/trans.txt > $tmp/trans_utt.list
-tools/filter_scp.pl -f 1 $tmp/wav_utt.list $tmp/trans_utt.list > $tmp/utt.list
-
-# wav.scp
-awk -F'\t' -v path_prefix=$corpus '{printf("%s\t%s/%s\n",$1,path_prefix,$2)}' $corpus/wav.scp > $tmp/tmp_wav.scp
-tools/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_wav.scp | sort -k 1 | uniq > $tmp/wav.scp
-
-# text
-tools/filter_scp.pl -f 1 $tmp/utt.list $corpus/trans.txt | sort -k 1 | uniq > $tmp/trans.txt
-dos2unix < $tmp/trans.txt | \
- tools/filter_scp.pl -f 1 $tmp/utt.list - | \
- sort -k 1 | uniq | tr '[a-z]' '[A-Z]' | \
- sed 's/Ａ/A/g' | sed 's/Ｔ/T/g' | sed 's/Ｍ/M/g' | sed 's/𫚉//g' | sed 's/𫖯/頫/g' | \
- sed 's/[()]//g' | sed "s/\([^A-Z]\)'/\1/g" > $tmp/text
-
-# copy prepared resources from tmp_dir to target dir
-mkdir -p $dir
-for f in wav.scp text; do
- cp $tmp/$f $dir/$f || exit 1;
-done
-
-echo "local/prepare_data.sh succeeded"
-exit 0;
-
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/local/train_lms.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/local/train_lms.sh
deleted file mode 100644
index 2e2d0dbeb79c54d707add6d20269fe9b89e69d8e..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/local/train_lms.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-
-
-# To be run from one directory above this script.
-. ./path.sh
-
-text=data/local/lm/text
-lexicon=data/local/dict/lexicon.txt
-
-. tools/parse_options.sh
-
-for f in "$text" "$lexicon"; do
- [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
-done
-
-# Check SRILM tools
-if ! 
which ngram-count > /dev/null; then
- echo "srilm tools are not found, please download it and install it from: "
- echo "http://www.speech.sri.com/projects/srilm/download.html"
- echo "Then add the tools to your PATH"
- exit 1
-fi
-
-dir=data/local/lm
-mkdir -p $dir
-
-cleantext=$dir/text.no_oov
-
-cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
- {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
- > $cleantext || exit 1;
-
-cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
- sort -nr > $dir/word.counts || exit 1;
-
-# Get counts from acoustic training transcripts, and add one-count
-# for each word in the lexicon (but not silence, we don't want it
-# in the LM-- we'll add it optionally later).
-cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
- cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
- sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
-
-cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
-
-heldout_sent=10000 # Don't change this if you want result to be comparable with
- # kaldi_lm results
-mkdir -p $dir
-cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
- head -$heldout_sent > $dir/heldout
-cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
- tail -n +$heldout_sent > $dir/train
-
-ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \
- -map-unk "<unk>" -kndiscount -interpolate -lm $dir/lm.arpa
-ngram -lm $dir/lm.arpa -ppl $dir/heldout
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/local/word_segmentation.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/local/word_segmentation.py
deleted file mode 100644
index 117686dd3e826ebc63abc22779c913a2cb9f78d2..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/local/word_segmentation.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/usr/bin/env python
-# encoding=utf-8
-# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
-# 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
-# Apache 2.0
-
-from __future__ import print_function
-import sys
-import jieba
-
-if len(sys.argv) < 3:
- sys.stderr.write(
- "word_segmentation.py <vocab_file> <trans_file>\n")
- exit(1)
-
-vocab_file = sys.argv[1]
-trans_file = sys.argv[2]
-
-jieba.set_dictionary(vocab_file)
-for line in open(trans_file, 'r', encoding='utf8'):
- key, trans = line.strip().split(' ', 1)
- words = jieba.cut(trans,
- HMM=False) # turn off new word discovery (HMM-based)
- new_line = key + '\t' + " ".join(words)
- print(new_line)
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/path.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/path.sh
deleted file mode 100644
index ac1ca08baf5d4540b92ed239b8aa7cd613064a8c..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/path.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-export WENET_DIR=$PWD/../../..
-export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build -export OPENFST_BIN=${BUILD_DIR}/../fc_base/openfst-build/src -export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_BIN}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/run.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/run.sh deleted file mode 100644 index 7a40f4223c377fb2f9d66b8a2ad96bcf86cd8506..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/run.sh +++ /dev/null @@ -1,232 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -# Use this to control how many gpu you use, It's 1-gpu training if you specify -# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO -stage=0 # start from 0 if you need to start from data preparation -stop_stage=6 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training -num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 - -# modify this to your AISHELL-2 data path -# Note: the evaluation data (dev & test) is available at AISHELL. -# Please download it from http://aishell-eval.oss-cn-beijing.aliyuncs.com/TEST%26DEV%20DATA.zip -trn_set=/mnt/nfs/ptm1/open-data/AISHELL-2/iOS/data -dev_set=/mnt/nfs/ptm1/open-data/AISHELL-DEV-TEST-SET/iOS/dev -tst_set=/mnt/nfs/ptm1/open-data/AISHELL-DEV-TEST-SET/iOS/test - -nj=16 -dict=data/dict/lang_char.txt - -train_set=train -# Optional train_config -# 1. conf/train_transformer.yaml: Standard transformer -# 2. conf/train_conformer.yaml: Standard conformer -# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer -# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer -train_config=conf/train_unified_transformer.yaml -cmvn=true -dir=exp/transformer -checkpoint= - -# use average_checkpoint will get better result -average_checkpoint=true -decode_checkpoint=$dir/final.pt -average_num=30 -decode_modes="ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring" - -. 
tools/parse_options.sh || exit 1;
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
- # Data preparation
- local/prepare_data.sh ${trn_set} data/local/${train_set} data/${train_set} || exit 1;
- local/prepare_data.sh ${dev_set} data/local/dev data/dev || exit 1;
- local/prepare_data.sh ${tst_set} data/local/test data/test || exit 1;
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
- # remove the space between the text labels for Mandarin dataset
- for x in ${train_set} dev test; do
- cp data/${x}/text data/${x}/text.org
- paste -d " " <(cut -f 1 data/${x}/text.org) <(cut -f 2- data/${x}/text.org \
- | tr 'a-z' 'A-Z' | sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' | tr -d " ") \
- > data/${x}/text
- rm data/${x}/text.org
- done
-
- tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
- --in_scp data/${train_set}/wav.scp \
- --out_cmvn data/$train_set/global_cmvn
-
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
- # Make train dict
- echo "Make a dictionary"
- mkdir -p $(dirname $dict)
- echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
- echo "<unk> 1" >> ${dict} # <unk> must be 1
- tools/text2token.py -s 1 -n 1 data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \
- | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict}
- num_token=$(cat $dict | wc -l)
- echo "<sos/eos> $num_token" >> $dict # <eos>
-fi
-
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
- # Prepare wenet required data
- echo "Prepare data, prepare required format"
- for x in dev test ${train_set}; do
- tools/make_raw_list.py data/$x/wav.scp data/$x/text data/$x/data.list
- done
-fi
-
-if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
- # Training
- mkdir -p $dir
- INIT_FILE=$dir/ddp_init
- # You had better rm it manually before you start run.sh on first node.
- # rm -f $INIT_FILE # delete old one before starting
- init_method=file://$(readlink -f $INIT_FILE)
- echo "$0: init method is $init_method"
- # The number of gpus running on each node/machine
- num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
- # Use "nccl" if it works, otherwise use "gloo"
- dist_backend="gloo"
- # The total number of processes/gpus, so that the master knows
- # how many workers to wait for.
- # More details about ddp can be found in
- # https://pytorch.org/tutorials/intermediate/dist_tuto.html
- world_size=`expr $num_gpus \* $num_nodes`
- echo "total gpus is: $world_size"
- cmvn_opts=
- $cmvn && cp data/${train_set}/global_cmvn $dir
- $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
- # train.py will write $train_config to $dir/train.yaml with model input
- # and output dimension, train.yaml will be used for inference or model
- # export later
- for ((i = 0; i < $num_gpus; ++i)); do
- {
- gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
- # Rank of each gpu/process used for knowing whether it is
- # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type raw \ - --symbol_table $dict \ - --train_data data/$train_set/data.list \ - --cv_data data/dev/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ - --ddp.dist_backend $dist_backend \ - --num_workers 2 \ - $cmvn_opts - } & - done - wait -fi - -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # Test model, please specify the model you want to test by --checkpoint - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$dir/avg_${average_num}.pt - echo "do model average and final checkpoint is $decode_checkpoint" - python wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $dir \ - --num ${average_num} \ - --val_best - fi - # Specify decoding_chunk_size if it's a unified dynamic chunk trained model - # -1 for full chunk - decoding_chunk_size= - ctc_weight=0.5 - for mode in ${decode_modes}; do - { - test_dir=$dir/test_${mode} - mkdir -p $test_dir - python wenet/bin/recognize.py --gpu 0 \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type raw \ - --test_data data/test/data.list \ - --checkpoint $decode_checkpoint \ - --beam_size 10 \ - --batch_size 1 \ - --penalty 0.0 \ - --dict $dict \ - --ctc_weight $ctc_weight \ - --result_file $test_dir/text \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - python tools/compute-wer.py --char=1 --v=1 \ - data/test/text $test_dir/text > $test_dir/wer - } & - done - wait - -fi - -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # Export the best model you want - python wenet/bin/export_jit.py \ - --config $dir/train.yaml \ - --checkpoint $dir/avg_${average_num}.pt \ - --output_file $dir/final.zip \ - --output_quant_file $dir/final_quant.zip -fi - -# Optionally, you can add LM and test it with runtime. -if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then - # 7.1 Prepare dict - unit_file=$dict - download_dir=data/local/DaCiDian - git clone https://github.com/aishell-foundation/DaCiDian.git $download_dir - mkdir -p data/local/dict - cp $unit_file data/local/dict/units.txt - tools/fst/prepare_dict.py $unit_file $download_dir/word_to_pinyin.txt \ - data/local/dict/lexicon.txt - # 7.2 Segment text - pip3 install jieba - lm=data/local/lm - mkdir -p $lm - awk '{print $1}' data/local/dict/lexicon.txt | \ - awk '{print $1,99}' > $lm/word_seg_vocab.txt - python local/word_segmentation.py $lm/word_seg_vocab.txt \ - data/train/text > $lm/text - # 7.3 Train lm - local/train_lms.sh - # 7.4 Build decoding TLG - tools/fst/compile_lexicon_token_fst.sh \ - data/local/dict data/local/tmp data/local/lang - tools/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1; - # 7.5 Decoding with runtime - # reverse_weight only works for u2++ model and only left to right decoder is used when it is set to 0.0. 
- reverse_weight=0.0 - chunk_size=-1 - ./tools/decode.sh --nj 16 --chunk_size $chunk_size\ - --beam 15.0 --lattice_beam 7.5 --max_active 7000 --blank_skip_thresh 0.98 \ - --ctc_weight 0.3 --rescoring_weight 1.0 --reverse_weight $reverse_weight\ - --fst_path data/lang_test/TLG.fst \ - --dict_path data/lang_test/words.txt \ - data/test/wav.scp data/test/text $dir/final.zip data/lang_test/units.txt \ - $dir/lm_with_runtime - # See $dir/lm_with_runtime for wer - tail $dir/lm_with_runtime/wer -fi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/alignment.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/alignment.sh deleted file mode 100644 index 64d860bb61761cadca750c9baf91eddb49e56728..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/alignment.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -stage=0 # start from 0 if you need to start from data preparation -stop_stage=0 - -nj=16 -feat_dir=raw_wav -dict=data/dict/lang_char.txt - -dir=exp/ -config=$dir/train.yaml -checkpoint= -checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt -config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml -set= -ali_format=$feat_dir/$set/format.data -ali_format=format.data -ali_result=$dir/ali - -. tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - nj=32 - # Prepare required data for ctc alignment - echo "Prepare data, prepare required format" - for x in $set; do - tools/format_data.sh --nj ${nj} \ - --feat-type wav --feat $feat_dir/$x/wav.scp \ - $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp - - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Test model, please specify the model you want to use by --checkpoint - python wenet/bin/alignment_deprecated.py --gpu -1 \ - --config $config \ - --input_file $ali_format \ - --checkpoint $checkpoint \ - --batch_size 1 \ - --dict $dict \ - --result_file $ali_result \ - -fi - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/analyze_dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/analyze_dataset.py deleted file mode 100644 index d4373b065c301972fe0164b6df3591166000acfc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/analyze_dataset.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Analyze Dataset, Duration/TextLength/Speed etc. - -Usage: -. 
./path.sh && python3 tools/analyze_dataset.py \ - --data_type "shard" \ - --data_list data/test/data.list \ - --output_dir exp/analyze_test \ - --num_thread 32 -""" - -import os -import json -import math -import time -import numpy -import logging -import librosa -import tarfile -import argparse -import torchaudio -import multiprocessing - -from wenet.utils.file_utils import read_lists -from wenet.dataset.processor import AUDIO_FORMAT_SETS - - -def get_args(): - parser = argparse.ArgumentParser(description='Analyze dataset') - parser.add_argument('--data_type', - default='wav_scp', - choices=['wav_scp', 'raw', 'shard'], - help='dataset type') - parser.add_argument('--output_dir', type=str, - default="exp", help='write info to output dir') - parser.add_argument('--data_list', default=None, - help='used in raw/shard mode') - parser.add_argument('--wav_scp', default=None, - help='used in wav_scp mode') - parser.add_argument('--text', default=None, - help='used in wav_scp mode') - parser.add_argument('--num_thread', type=int, - default=4, help='number of threads') - args = parser.parse_args() - print(args) - return args - - -def analyze(datas, output_file, thread_id): - with open(output_file, "w", encoding='utf8') as f: - for i, data in enumerate(datas): - if type(data['wav']) is numpy.ndarray: - y, sample_rate = data['wav'], data['sample_rate'] - data['wav'] = "None" # NOTE(xcsong): Do not save wav. - elif type(data['wav'] is str): - y, sample_rate = librosa.load(data['wav'], sr=16000) - data['dur'] = len(y) / sample_rate - data['txt_length'] = len(data['txt']) - data['speed'] = data['txt_length'] / data['dur'] - # Trim the beginning and ending silence - _, index = librosa.effects.trim(y, top_db=30) - data['leading_sil'] = librosa.get_duration( - y=y[:index[0]], sr=16000) * 1000 if index[0] > 0 else 0 - data['trailing_sil'] = librosa.get_duration( - y=y[index[1]:], sr=16000) * 1000 if index[1] < len(y) else 0 - data_str = json.dumps(data, ensure_ascii=False) - f.write("{}\n".format(data_str)) - if thread_id == 0 and i % 100 == 0: - logging.info("\tThread-{}: processed {}/{}".format( - thread_id, i, len(datas))) - - -def read_tar(file): - try: - with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream: - prev_prefix = None - data = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - data['key'] = prev_prefix - if valid: - yield data - data = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - data['txt'] = file_obj.read().decode( - 'utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load( - file_obj) - # single channel - data['wav'] = waveform.numpy()[0, :] - data['sample_rate'] = sample_rate - else: - data[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning( - 'error: {} when parse {}'.format(ex, name)) - prev_prefix = prefix - # The last data in tar - if prev_prefix is not None: - data['key'] = prev_prefix - yield data - except Exception as ex: - logging.warning( - 'tar_file error: {} when processing {}'.format(ex, file)) - - -def main(): - start_time = time.time() - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.makedirs(args.output_dir, exist_ok=True) - os.makedirs(args.output_dir + "/partition", exist_ok=True) - datas = [[] for i in 
range(args.num_thread)] - - logging.info("Stage-1: Loading data.list OR wav.scp...") - if args.data_type == "shard": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - total = 0 - for line in lists: - for data in read_tar(line): - datas[total % args.num_thread].append(data) - total = total + 1 - elif args.data_type == "raw": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - for i, line in enumerate(lists): - data = json.loads(line) - datas[i % args.num_thread].append(data) - elif args.data_type == "wav_scp": - assert args.wav_scp is not None - assert args.text is not None - wavs, texts = {}, {} - # wavs - for line in read_lists(args.wav_scp): - line = line.strip().split() - wavs[line[0]] = line[1] - # texts - for line in read_lists(args.text): - line = line.strip().split(maxsplit=1) - texts[line[0]] = line[1] - sorted(wavs) - sorted(texts) - # partition - for i, (key1, key2) in enumerate(zip(wavs, texts)): - assert key1 == key2 - datas[i % args.num_thread].append( - {'key': key1, "wav": wavs[key1], "txt": texts[key1]} - ) - - logging.info("Stage-2: Start Analyze") - # threads - pool = multiprocessing.Pool(processes=args.num_thread) - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - pool.apply_async(analyze, (datas[i], output_file, i)) - pool.close() - pool.join() - - logging.info("Stage-3: Sort and Write Result") - datas = [] - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - with open(output_file, "r", encoding='utf8') as f: - for line in f.readlines(): - data = json.loads(line) - datas.append(data) - total_dur = sum([x['dur'] for x in datas]) - total_len = sum([x['txt_length'] for x in datas]) - total_leading_sil = sum([x['leading_sil'] for x in datas]) - total_trailing_sil = sum([x['trailing_sil'] for x in datas]) - num_datas = len(datas) - names = ['key', 'dur', 'txt_length', 'speed', - 'leading_sil', 'trailing_sil'] - units = ['', 's', '', 'char/s', 'ms', 'ms'] - avgs = [0, total_dur / num_datas, total_len / num_datas, - total_len / total_dur, total_leading_sil / num_datas, - total_trailing_sil / num_datas] - stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]), - sum([(x['txt_length'] - avgs[2])**2 for x in datas]), - sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]), - sum([(x['leading_sil'] - avgs[4])**2 for x in datas]), - sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])] - stds = [math.sqrt(x / num_datas) for x in stds] - parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min'] - index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - - with open(args.output_dir + "/analyze_result_brief", - "w", encoding='utf8') as f: - for i, (name, unit, avg, std) in enumerate( - zip(names, units, avgs, stds)): - if name == 'key': - continue - f.write("==================\n") - - datas.sort(key=lambda x: x[name]) - for p, j in zip(parts, index): - f.write("{} {}: {:.3f} {} (wav_id: {})\n".format( - p, name, datas[j][name], unit, datas[j]['key'])) - f.write("avg {}: {:.3f} {}\n".format( - name, avg, unit)) - f.write("std {}: {:.3f}\n".format( - name, std)) - os.system("cat {}".format(args.output_dir + "/analyze_result_brief")) - - datas.sort(key=lambda x: x['dur']) - with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f: - for data in datas: - f.write("{}\n".format(json.dumps(data, 
ensure_ascii=False))) - - end_time = time.time() - logging.info("Time Cost: {:.3f}s".format(end_time - start_time)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/cmvn_kaldi2json.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/cmvn_kaldi2json.py deleted file mode 100644 index 9966046c95a9d50438c4857b785cb7985182e376..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/cmvn_kaldi2json.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import logging -import sys -import json - -def kaldi2json(kaldi_cmvn_file): - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - cmvn_info = {'mean_stat:' : means, - 'var_stat' : variance, - 'frame_num' : count} - return cmvn_info - -if __name__ == '__main__': - with open(sys.argv[2], 'w') as fout: - cmvn = kaldi2json(sys.argv[1]) - fout.write(json.dumps(cmvn)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/combine_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/combine_data.sh deleted file mode 100644 index 8a56c43f1a2a238d78270f94f3d22f1af540e912..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/combine_data.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2014 David Snyder - -# This script combines the data from multiple source directories into -# a single destination directory. - -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information -# about what these directories contain. - -# Begin configuration section. -extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." -skip_fix=false # skip the fix_data_dir.sh in the end -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi - -if [ $# -lt 2 ]; then - echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." - echo "Note, files that don't appear in all source dirs will not be combined," - echo "with the exception of utt2uniq and segments, which are created where necessary." - exit 1 -fi - -dest=$1; -shift; - -first_src=$1; - -rm -r $dest 2>/dev/null -mkdir -p $dest; - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/utt2spk ]; then - echo "$0: no such file $dir/utt2spk" - exit 1; - fi -done - -# Check that frame_shift are compatible, where present together with features. -dir_with_frame_shift= -for dir in $*; do - if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then - if [[ $dir_with_frame_shift ]] && - ! 
cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then - echo "$0:error: different frame_shift in directories $dir and " \ - "$dir_with_frame_shift. Cannot combine features." - exit 1; - fi - dir_with_frame_shift=$dir - fi -done - -# W.r.t. utt2uniq file the script has different behavior compared to other files -# it is not compulsary for it to exist in src directories, but if it exists in -# even one it should exist in all. We will create the files where necessary -has_utt2uniq=false -for in_dir in $*; do - if [ -f $in_dir/utt2uniq ]; then - has_utt2uniq=true - break - fi -done - -if $has_utt2uniq; then - # we are going to create an utt2uniq file in the destdir - for in_dir in $*; do - if [ ! -f $in_dir/utt2uniq ]; then - # we assume that utt2uniq is a one to one mapping - cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' - else - cat $in_dir/utt2uniq - fi - done | sort -k1 > $dest/utt2uniq - echo "$0: combined utt2uniq" -else - echo "$0 [info]: not combining utt2uniq as it does not exist" -fi -# some of the old scripts might provide utt2uniq as an extrafile, so just remove it -extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") - -# segments are treated similarly to utt2uniq. If it exists in some, but not all -# src directories, then we generate segments where necessary. -has_segments=false -for in_dir in $*; do - if [ -f $in_dir/segments ]; then - has_segments=true - break - fi -done - -if $has_segments; then - for in_dir in $*; do - if [ ! -f $in_dir/segments ]; then - echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 - utils/data/get_segments_for_data.sh $in_dir - else - cat $in_dir/segments - fi - done | sort -k1 > $dest/segments - echo "$0: combined segments" -else - echo "$0 [info]: not combining segments as it does not exist" -fi - -for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do - exists_somewhere=false - absent_somewhere=false - for d in $*; do - if [ -f $d/$file ]; then - exists_somewhere=true - else - absent_somewhere=true - fi - done - - if ! $absent_somewhere; then - set -o pipefail - ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; - set +o pipefail - echo "$0: combined $file" - else - if ! $exists_somewhere; then - echo "$0 [info]: not combining $file as it does not exist" - else - echo "$0 [info]: **not combining $file as it does not exist everywhere**" - fi - fi -done - -tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt - -if [[ $dir_with_frame_shift ]]; then - cp $dir_with_frame_shift/frame_shift $dest -fi - -if ! 
$skip_fix ; then - tools/fix_data_dir.sh $dest || exit 1; -fi - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/compute-cer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/compute-cer.py deleted file mode 100644 index a0a8f8fe1f59251c5d8fefeb62ef469276fc6063..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/compute-cer.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import sys -import unicodedata -import codecs - -remove_tag = True -spacelist = [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - # https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. - sep = ' ' - if char == '<': - sep = '>' - j = i + 1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c == sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: - return '' - chars = [] - i = 0 - T = len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - if x.isalnum(): - for k in x: - new_sentence.append(k) - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i - 1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = 
dist - min_error = error - dist = self.space[i][j - 1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, - 'ins': 0, 'del': 0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i={i} , j={j} , \ - error={error}'. 
- format(i=i, j=j, error=self.space[i][j]['error'])) - return result - - def overall(self) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def cluster(self, data) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [unicodedata.name(char) for char in word] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names) - 1) : - if unicode_names[i] != unicode_names[i + 1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) \ - and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] \ - [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \ - [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose = 1 - padding_symbol = ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose = 0 - try: - verbose = int(b) - except Exception: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol = ' ' - elif b == 'underline': - padding_symbol = '_' - continue - if True or sys.argv[1].startswith('-'): - # ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig = set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, - case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : 
- if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array) == 0: - continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length - len_lab) - space['rec'].append(length - len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end=' ') - else: - print('lab:', end=' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['lab'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end=' ') - else: - print('rec:', end=' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['rec'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===================================================' - '========================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster(k for k in default_clusters[cluster_id]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 
100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif (token[0] == '<' and token[len(token) - 1] == '>' and - cluster_id == ''): - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('=======================================' - '====================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/compute-wer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/compute-wer.py deleted file mode 100644 index a3eefc0dc7b67f252e685da71a5189312e74ef85..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/compute-wer.py +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import re, sys, unicodedata -import codecs - -remove_tag = True -spacelist= [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - #https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - 
j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array)==0: continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) 
- else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===========================================================================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster([ k for k in default_clusters[cluster_id] ]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - 
print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif token[0] == '<' and token[len(token)-1] == '>' and \ - cluster_id == '' : - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('===========================================================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/compute_cmvn_stats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/compute_cmvn_stats.py deleted file mode 100644 index 9c89789c47be0c855939469e86040f10398e9d89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/compute_cmvn_stats.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys -import argparse -import json -import codecs -import yaml - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.utils.data import Dataset, DataLoader - -torchaudio.set_audio_backend("sox_io") - - -class CollateFunc(object): - ''' Collate function for AudioDataset - ''' - - def __init__(self, feat_dim, resample_rate): - self.feat_dim = feat_dim - self.resample_rate = resample_rate - pass - - def __call__(self, batch): - mean_stat = torch.zeros(self.feat_dim) - var_stat = torch.zeros(self.feat_dim) - number = 0 - for item in batch: - value = item[1].strip().split(",") - assert len(value) == 3 or len(value) == 1 - wav_path = value[0] - sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate - resample_rate = sample_rate - # len(value) == 3 means segmented wav.scp, - # len(value) == 1 means original wav.scp - if len(value) == 3: - start_frame = int(float(value[1]) * sample_rate) - end_frame = int(float(value[2]) * sample_rate) - waveform, sample_rate = torchaudio.backend.sox_io_backend.load( - filepath=wav_path, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(item[1]) - - waveform = waveform * (1 << 15) - if self.resample_rate != 0 and self.resample_rate != sample_rate: - resample_rate = self.resample_rate - waveform = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - - mat = kaldi.fbank(waveform, - num_mel_bins=self.feat_dim, - dither=0.0, - energy_floor=0.0, - sample_frequency=resample_rate) - mean_stat += torch.sum(mat, axis=0) - var_stat += torch.sum(torch.square(mat), axis=0) - number += mat.shape[0] - return number, mean_stat, var_stat - - -class AudioDataset(Dataset): - def __init__(self, data_file): - self.items = [] - with codecs.open(data_file, 'r', encoding='utf-8') as f: - for line in f: - arr = line.strip().split() - self.items.append((arr[0], arr[1])) - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - return self.items[idx] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='extract CMVN stats') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for processing') - parser.add_argument('--train_config', - default='', - help='training yaml conf') - parser.add_argument('--in_scp', default=None, help='wav scp file') - parser.add_argument('--out_cmvn', - 
default='global_cmvn', - help='global cmvn file') - - doc = "Print log after every log_interval audios are processed." - parser.add_argument("--log_interval", type=int, default=1000, help=doc) - args = parser.parse_args() - - with open(args.train_config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - resample_rate = 0 - if 'resample_conf' in configs['dataset_conf']: - resample_rate = configs['dataset_conf']['resample_conf']['resample_rate'] - print('using resample and new sample rate is {}'.format(resample_rate)) - - collate_func = CollateFunc(feat_dim, resample_rate) - dataset = AudioDataset(args.in_scp) - batch_size = 20 - data_loader = DataLoader(dataset, - batch_size=batch_size, - shuffle=True, - sampler=None, - num_workers=args.num_workers, - collate_fn=collate_func) - - with torch.no_grad(): - all_number = 0 - all_mean_stat = torch.zeros(feat_dim) - all_var_stat = torch.zeros(feat_dim) - wav_number = 0 - for i, batch in enumerate(data_loader): - number, mean_stat, var_stat = batch - all_mean_stat += mean_stat - all_var_stat += var_stat - all_number += number - wav_number += batch_size - - if wav_number % args.log_interval == 0: - print(f'processed {wav_number} wavs, {all_number} frames', - file=sys.stderr, - flush=True) - - cmvn_info = { - 'mean_stat': list(all_mean_stat.tolist()), - 'var_stat': list(all_var_stat.tolist()), - 'frame_num': all_number - } - - with open(args.out_cmvn, 'w') as fout: - fout.write(json.dumps(cmvn_info)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/compute_fbank_feats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/compute_fbank_feats.py deleted file mode 100644 index 4cc7dae54de6e8b24b14148bd3930d19b4d7b28c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/compute_fbank_feats.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging - -import torchaudio -import torchaudio.compliance.kaldi as kaldi - -import wenet.dataset.kaldi_io as kaldi_io - -# The "sox" backends are deprecated and will be removed in 0.9.0 release. 
-# So here we use sox_io backend -torchaudio.set_audio_backend("sox_io") - - -def parse_opts(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--num_mel_bins', - default=80, - type=int, - help='Number of triangular mel-frequency bins') - parser.add_argument('--frame_length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument('--frame_shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument('--dither', - type=int, - default=0.0, - help='Dithering constant (0.0 means no dither)') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_scp', help='wav scp file') - parser.add_argument('out_ark', help='output ark file') - parser.add_argument('out_scp', help='output scp file') - args = parser.parse_args() - return args - - -# wav format: -def load_wav_scp(wav_scp_file): - wav_list = [] - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_list.append((arr[0], arr[1])) - return wav_list - - -# wav format: -def load_wav_scp_dict(wav_scp_file): - wav_dict = {} - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_dict[arr[0]] = arr[1] - return wav_dict - - -# Segments format: -def load_wav_segments(wav_scp_file, segments_file): - wav_dict = load_wav_scp_dict(wav_scp_file) - audio_list = [] - with open(segments_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - key = arr[0] - wav_file = wav_dict[arr[1]] - start = float(arr[2]) - end = float(arr[3]) - audio_list.append((key, wav_file, start, end)) - return audio_list - - -if __name__ == '__main__': - args = parse_opts() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - if args.segments is None: - audio_list = load_wav_scp(args.wav_scp) - else: - audio_list = load_wav_segments(args.wav_scp, args.segments) - - count = 0 - with open(args.out_ark, 'wb') as ark_fout, \ - open(args.out_scp, 'w', encoding='utf8') as scp_fout: - for item in audio_list: - if len(item) == 2: - key, wav_path = item - waveform, sample_rate = torchaudio.load_wav(wav_path) - else: - assert len(item) == 4 - key, wav_path, start, end = item - sample_rate = torchaudio.info(wav_path).sample_rate - frame_offset = int(start * sample_rate) - num_frames = int((end - start) * sample_rate) - waveform, sample_rate = torchaudio.load_wav( - wav_path, frame_offset, num_frames) - - mat = kaldi.fbank(waveform, - num_mel_bins=args.num_mel_bins, - frame_length=args.frame_length, - frame_shift=args.frame_shift, - dither=args.dither, - energy_floor=0.0, - sample_frequency=sample_rate) - mat = mat.detach().numpy() - kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout) - count += 1 - if count % 10000 == 0: - logging.info('Progress {}/{}'.format(count, len(audio_list))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/copy_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/copy_data_dir.sh deleted file mode 100644 index ee880c4c3ca398a58a4e306467c639b0a76310bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/copy_data_dir.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins University (author: 
Daniel Povey) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# feats.scp -# wav.scp -# vad.scp -# spk2utt -# utt2spk -# text -# -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance and/or speaker names. Note, the recording-ids stay the same. -# - - -# begin configuration section -spk_prefix= -utt_prefix= -spk_suffix= -utt_suffix= -validate_opts= # should rarely be needed. -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" - echo "Options" - echo " --spk-prefix= # Prefix for speaker ids, default empty" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --spk-suffix= # Suffix for speaker ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -srcdir=$1 -destdir=$2 - -if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" - exit 1; -fi - -if [ "$destdir" == "$srcdir" ]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -set -e; - -mkdir -p $destdir - -cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map -cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map - -if [ ! -f $srcdir/utt2uniq ]; then - if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then - cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq - fi -else - cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq -fi - -cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ - utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk - -utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt - -if [ -f $srcdir/feats.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp -fi - -if [ -f $srcdir/vad.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp -fi - -if [ -f $srcdir/segments ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments - cp $srcdir/wav.scp $destdir -else # no segments->wav indexed by utt. 
- if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/utt2num_frames ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in frame_shift stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -echo $validate_opts -echo $destdir -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/decode.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/decode.sh deleted file mode 100644 index 1d49b0e48631f4818fb9c464df66904170275a33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/decode.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: binbinzhang@mobvoi.com (Binbin Zhang) -export GLOG_logtostderr=1 -export GLOG_v=2 - -set -e - -nj=1 -chunk_size=-1 -ctc_weight=0.0 -reverse_weight=0.0 -rescoring_weight=1.0 -# For CTC WFST based decoding -fst_path= -dict_path= -acoustic_scale=1.0 -beam=15.0 -lattice_beam=12.0 -min_active=200 -max_active=7000 -blank_skip_thresh=1.0 -length_penalty=0.0 - -. tools/parse_options.sh || exit 1; -if [ $# != 5 ]; then - echo "Usage: $0 [options] " - exit 1; -fi - -if ! which decoder_main > /dev/null; then - echo "decoder_main is not built, please go to runtime/libtorch to build it." - exit 1; -fi - -scp=$1 -label_file=$2 -model_file=$3 -unit_file=$4 -dir=$5 - -mkdir -p $dir/split${nj} - -# Step 1. Split wav.scp -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp" -done -tools/data/split_scp.pl ${scp} ${split_scps} - -# Step 2. Parallel decoding -wfst_decode_opts= -if [ ! 
-z $fst_path ]; then - wfst_decode_opts="--fst_path $fst_path" - wfst_decode_opts="$wfst_decode_opts --beam $beam" - wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path" - wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam" - wfst_decode_opts="$wfst_decode_opts --max_active $max_active" - wfst_decode_opts="$wfst_decode_opts --min_active $min_active" - wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale" - wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh" - wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty" - echo $wfst_decode_opts > $dir/config -fi -for n in $(seq ${nj}); do -{ - decoder_main \ - --rescoring_weight $rescoring_weight \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --chunk_size $chunk_size \ - --wav_scp ${dir}/split${nj}/wav.${n}.scp \ - --model_path $model_file \ - --unit_path $unit_file \ - $wfst_decode_opts \ - --result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log -} & -done -wait - -# Step 3. Merge files -for n in $(seq ${nj}); do - cat ${dir}/split${nj}/${n}.text -done > ${dir}/text -tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf - -# Step 4. Compute WER -python3 tools/compute-wer.py --char=1 --v=1 \ - $label_file $dir/text > $dir/wer diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/feat_to_shape.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/feat_to_shape.sh deleted file mode 100644 index ab6d45c60709dd05a38f8da269d617233d0d39f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/feat_to_shape.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Begin configuration section. -nj=4 -cmd=run.pl -verbose=0 -filetype="" -preprocess_conf="" -# End configuration section. - -help_message=$(cat << EOF -Usage: $0 [options] [] -e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) - -echo "$0 $*" 1>&2 # Print the command line for logging - -. parse_options.sh || exit 1; - -if [ $# -lt 2 ] || [ $# -gt 3 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -scp=$1 -outscp=$2 -data=$(dirname ${scp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${logdir}/feats.${n}.scp" -done - -utils/split_scp.pl ${scp} ${split_scps} - -if [ -n "${preprocess_conf}" ]; then - preprocess_opt="--preprocess-conf ${preprocess_conf}" -else - preprocess_opt="" -fi -if [ -n "${filetype}" ]; then - filetype_opt="--filetype ${filetype}" -else - filetype_opt="" -fi - -${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ - feat-to-len --verbose=${verbose} \ - scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp - -feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -) - -# concatenate the .scp files together. 
-for n in $(seq ${nj}); do - sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp -done > ${outscp} - -rm -f ${logdir}/feats.*.scp 2>/dev/null diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/filter_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/filter_scp.pl deleted file mode 100644 index b76d37f41be0886470281978bfacf97f6b8ae976..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/filter_scp.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose "n-th" field is an utterance id), printing -# out only those lines whose "n-th" field is in id_list. The index of -# the "n-th" field is 1, by default, but can be changed by using -# the -f switch - -$exclude = 0; -$field = 1; -$shifted = 0; - -do { - $shifted=0; - if ($ARGV[0] eq "--exclude") { - $exclude = 1; - shift @ARGV; - $shifted=1; - } - if ($ARGV[0] eq "-f") { - $field = $ARGV[1]; - shift @ARGV; shift @ARGV; - $shifted=1 - } -} while ($shifted); - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . - "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . - "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . - "only the lines that were *not* in id_list.\n" . - "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . - "If your older scripts (written before Oct 2014) stopped working and you used the\n" . - "-f option, add 1 to the argument.\n" . - "See also: utils/filter_scp.pl .\n"; -} - - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -if ($field == 1) { # Treat this as special case, since it is common. - while(<>) { - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { - print $_; - } - } -} else { - while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { - print $_; - } - } -} - -# tests: -# the following should print "foo 1" -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) -# the following should print "bar 2". 
-# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fix_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fix_data_dir.sh deleted file mode 100644 index d1644c1cac4264c78eae7d91b03c4126baf7ec4c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fix_data_dir.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# This script makes sure that only the segments present in -# all of "feats.scp", "wav.scp" [if present], segments [if present] -# text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into -# data-dir/.backup - -cmd="$@" - -utt_extra_files= -spk_extra_files= - -. tools/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: utils/data/fix_data_dir.sh " - echo "e.g.: utils/data/fix_data_dir.sh data/train" - echo "This script helps ensure that the various files in a data directory" - echo "are correctly sorted and filtered, for example removing utterances" - echo "that have no features (if feats.scp is present)" - exit 1 -fi - -data=$1 - -if [ -f $data/images.scp ]; then - image/fix_data_dir.sh $cmd - exit $? -fi - -mkdir -p $data/.backup - -[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; - -[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; - -set -e -o pipefail -u - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted { - file=$1 - sort -k1,1 -u <$file >$file.tmp - if ! cmp -s $file $file.tmp; then - echo "$0: file $1 is not in sorted order or not unique, sorting it" - mv $file.tmp $file - else - rm $file.tmp - fi -} - -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - check_sorted $data/$x - fi -done - - -function filter_file { - filter=$1 - file_to_filter=$2 - cp $file_to_filter ${file_to_filter}.tmp - tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter - if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=$(cat ${file_to_filter}.tmp | wc -l) - length2=$(cat ${file_to_filter} | wc -l) - if [ $length1 -ne $length2 ]; then - echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." - fi - fi - rm $file_to_filter.tmp -} - -function filter_recordings { - # We call this once before the stage when we filter on utterance-id, and once - # after. - - if [ -f $data/segments ]; then - # We have a segments file -> we need to filter this and the file wav.scp, and - # reco2file_and_utt, if it exists, to make sure they have the same list of - # recording-ids. - - if [ ! -f $data/wav.scp ]; then - echo "$0: $data/segments exists but not $data/wav.scp" - exit 1; - fi - awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=$(cat $tmpdir/recordings | wc -l) - [ ! -s $tmpdir/recordings ] && \ - echo "Empty list of recordings (bad file $data/segments)?" 
&& exit 1; - tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp - mv $tmpdir/recordings.tmp $tmpdir/recordings - - - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - filter_file $tmpdir/recordings $data/segments - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - rm $data/segments.tmp - - filter_file $tmpdir/recordings $data/wav.scp - [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur - true - fi -} - -function filter_speakers { - # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... - tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do - f=$data/$s - if [ -f $f ]; then - filter_file $f $tmpdir/speakers - fi - done - - filter_file $tmpdir/speakers $data/spk2utt - tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - - for s in cmvn.scp spk2gender $spk_extra_files; do - f=$data/$s - if [ -f $f ]; then - filter_file $tmpdir/speakers $f - fi - done -} - -function filter_utts { - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - echo "$(cat $tmpdir/utts | wc -l)" - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; - - ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; - - ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ - echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - - if [ -f $data/utt2uniq ]; then - ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ - echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; - fi - - maybe_wav= - maybe_reco2dur= - [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. - [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - - maybe_utt2dur= - if [ -f $data/utt2dur ]; then - cat $data/utt2dur | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1 - maybe_utt2dur=utt2dur.ok - fi - - maybe_utt2num_frames= - if [ -f $data/utt2num_frames ]; then - cat $data/utt2num_frames | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1 - maybe_utt2num_frames=utt2num_frames.ok - fi - - for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do - if [ -f $data/$x ]; then - tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp - echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)" - mv $tmpdir/utts.tmp $tmpdir/utts - # echo "$tmpdir/utts" - fi - done - rm $data/utt2dur.ok 2>/dev/null || true - rm $data/utt2num_frames.ok 2>/dev/null || true - - [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ - rm $tmpdir/utts && exit 1; - - - if [ -f $data/utt2spk ]; then - new_nutts=$(cat $tmpdir/utts | wc -l) - old_nutts=$(cat $data/utt2spk | wc -l) - if [ $new_nutts -ne $old_nutts ]; then - echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" - else - echo "fix_data_dir.sh: kept all $old_nutts utterances." 
- fi - fi - - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then - tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x - fi - fi - done - -} - -filter_recordings -filter_speakers -filter_utts -filter_speakers -filter_recordings - -tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - -echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/flake8_hook.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/flake8_hook.py deleted file mode 100644 index bbe21bf4aa8ab460aca0eba5a24785e4d6b2c39d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -import sys - -from flake8.main import git - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/format_data.sh deleted file mode 100644 index 51f4602dfa0bac7873541c7f621ef4bb9eb29c94..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/format_data.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Mobvoi Corporation (Author: Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -echo "$0 $*" >&2 # Print the command line for logging -. ./path.sh - -nj=1 -cmd=run.pl -nlsyms="" -lang="" -feat="" -feat_type="kaldi" -oov="" -bpecode="" -allow_one_column=false -raw="" -verbose=0 -trans_type=char -filetype="" -preprocess_conf="" -category="" -out="" # If omitted, write in stdout -help_message=$(cat << EOF -Usage: $0 -e.g. $0 data/train data/lang_1char/train_units.txt -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --feat # feat.scp or feat1.scp,feat2.scp,... - --feat-type # kaldi or wav - --oov # Default: - --out # If omitted, write in stdout - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) -. tools/parse_options.sh - -if [ $# != 2 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -dir=$1 -dic=$2 -tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) -#trap 'rm -rf ${tmpdir}' EXIT - -# 1. 
Create scp files for inputs -# These are not necessary for decoding mode, and make it as an option -input= -if [ -n "${feat}" ]; then - _feat_scps=$(echo "${feat}" | tr ',' ' ' ) - read -r -a feat_scps <<< $_feat_scps - num_feats=${#feat_scps[@]} - - for (( i=1; i<=num_feats; i++ )); do - feat=${feat_scps[$((i-1))]} - mkdir -p ${tmpdir}/input_${i} - input+="input_${i} " - cat ${feat} > ${tmpdir}/input_${i}/feat.scp - - # Dump in the "legacy" style JSON format - if [ -n "${filetype}" ]; then - awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ - > ${tmpdir}/input_${i}/filetype.scp - fi - - if [ ${feat_type} == "kaldi" ]; then - tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ - --filetype "${filetype}" \ - --preprocess-conf "${preprocess_conf}" \ - --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp - elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then - if [ -f $dir/segments ]; then - # used for segmented wav.scp - awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur - fi - if [ ! -f $dir/utt2dur ]; then - tools/wav_to_duration.sh --nj ${nj} \ - ${feat} ${tmpdir}/input_${i}/shape.scp - # use the existed utt2dur as shape.scp directly - else - cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp - fi - fi - done -fi - -# 2. Create scp files for outputs -mkdir -p ${tmpdir}/output -if [ -n "${bpecode}" ]; then - if [ "${trans_type}" == "cn_char_en_bpe" ]; then - tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp - else - paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \ - | tools/spm_encode --model=${bpecode} --output_format=piece) \ - > ${tmpdir}/output/token.scp - fi -elif [ -n "${nlsyms}" ]; then - tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -elif [ -n "${raw}" ]; then - cat $dir/text > ${tmpdir}/output/token.scp -else - tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -fi -< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp -odim=$(cat ${dic} | wc -l) -< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp - -cat ${dir}/text > ${tmpdir}/output/text.scp - -# 3. Create scp files for the others -mkdir -p ${tmpdir}/other -if [ -n "${lang}" ]; then - awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp -fi - -if [ -n "${category}" ]; then - awk -v category=${category} '{print $1 " " category}' ${dir}/text \ - > ${tmpdir}/other/category.scp -fi -#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp - -# 4. 
Merge scp files into a one file -opts="" -for intype in ${input} output other; do - if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then - continue - fi - - if [ ${intype} != other ]; then - opts+="--${intype%_*}-scps " - else - opts+="--scps " - fi - - for x in "${tmpdir}/${intype}"/*.scp; do - k=$(basename ${x} .scp) - if [ ${k} = shape ]; then - opts+="shape:${x}:shape " - else - opts+="${k}:${x} " - fi - done -done - -if ${allow_one_column}; then - opts+="--allow-one-column true " -else - opts+="--allow-one-column false " -fi - -if [ -n "${out}" ]; then - opts+="-O ${out}" -fi - -tools/merge_scp2txt.py --verbose ${verbose} ${opts} - -#rm -fr ${tmpdir} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/add_lex_disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/add_lex_disambig.pl deleted file mode 100644 index dd8a25de6e1140a6d19b1e876f2e76f528532edf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/add_lex_disambig.pl +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2013-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. -# With the --pron-probs option, expects the second field -# of each lexicon line to be a pron-prob. -# With the --sil-probs option, expects three additional -# fields after the pron-prob, representing various components -# of the silence probability model. - -$pron_probs = 0; -$sil_probs = 0; -$first_allowed_disambig = 1; - -for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { - if ($ARGV[0] eq "--pron-probs") { - $pron_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--sil-probs") { - $sil_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--first-allowed-disambig") { - $first_allowed_disambig = 0 + $ARGV[1]; - if ($first_allowed_disambig < 1) { - die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; - } - shift @ARGV; - shift @ARGV; - } -} - -if (@ARGV != 2) { - die "Usage: add_lex_disambig.pl [opts] \n" . - "This script adds disambiguation symbols to a lexicon in order to\n" . - "make decoding graphs determinizable; it adds pseudo-phone\n" . - "disambiguation symbols #1, #2 and so on at the ends of phones\n" . - "to ensure that all pronunciations are different, and that none\n" . - "is a prefix of another.\n" . - "It prints to the standard output the number of the largest-numbered" . - "disambiguation symbol that was used.\n" . - "\n" . - "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . 
- " --sil-probs [should be with --pron-probs option]\n" . - " Expect 3 extra fields after the pron-probs, for aspects of\n" . - " the silence probability model\n" . - " --first-allowed-disambig The number of the first disambiguation symbol\n" . - " that this script is allowed to add. By default this is\n" . - " #1, but you can set this to a larger value using this option.\n" . - "e.g.:\n" . - " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { - $p = shift @A; - if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } - } - if ($sil_probs) { - $silp = shift @A; - if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - } - if (!(@A)) { - die "Bad lexicon line $1, no phone in phone list"; - } - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that it exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { shift @A; } # remove pron-prob. - if ($sil_probs) { - shift @A; # Remove silprob - shift @A; # Remove silprob - } - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -# max_disambig will always be the highest-numbered disambiguation symbol that -# has been used so far. -$max_disambig = $first_allowed_disambig - 1; - -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - if ($pron_probs) { - $pron_prob = shift @A; - } - if ($sil_probs) { - $sil_word_prob = shift @A; - $word_sil_correction = shift @A; - $prev_nonsil_correction = shift @A - } - $phnseq = join(" ", @A); - if (!defined $issubseq{$phnseq} - && $count{$phnseq} == 1) { - ; # Do nothing. - } else { - if ($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved_for_the_empty_string{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; - if (!defined $cur_disambig) { - $cur_disambig = $first_allowed_disambig; - } else { - $cur_disambig++; # Get a number that has not been used yet for - # this phone sequence. - } - while (defined $reserved_for_the_empty_string{$cur_disambig}) { - $cur_disambig++; - } - if ($cur_disambig > $max_disambig) { - $max_disambig = $cur_disambig; - } - $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; - $phnseq = $phnseq . " #" . 
$cur_disambig; - } - } - if ($pron_probs) { - if ($sil_probs) { - print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; - } else { - print O "$word\t$pron_prob\t$phnseq\n"; - } - } else { - print O "$word\t$phnseq\n"; - } -} - -print $max_disambig . "\n"; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/compile_lexicon_token_fst.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/compile_lexicon_token_fst.sh deleted file mode 100644 index b67814fe3f3244b14b8e494bfe46c4829c4f8bd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/compile_lexicon_token_fst.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -# Copyright 2015 Yajie Miao (Carnegie Mellon University) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the -# phoneme and character-based lexicons. -set -eo pipefail -. tools/parse_options.sh - -if [ $# -ne 3 ]; then - echo "usage: tools/fst/compile_lexicon_token_fst.sh " - echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" - echo " should contain the following files:" - echo "lexicon.txt units.txt" - echo "options: " - exit 1; -fi - -srcdir=$1 -tmpdir=$2 -dir=$3 -mkdir -p $dir $tmpdir - -[ -f path.sh ] && . ./path.sh - -export LC_ALL=C - -cp $srcdir/units.txt $dir - -# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. -# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. -perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; - -# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. -# Without these symbols, determinization will fail. -ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` -ndisambig=$[$ndisambig+1]; - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list - -# Get the full list of CTC tokens used in FST. These tokens include , the blank , -# the actual model unit, and the disambiguation symbols. -cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list -(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt - -# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, -# so here use ctc_token_fst_compact -tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; - -# Encode the words with indices. 
Will be used in lexicon and language model FST compiling. -cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' - BEGIN { - print " 0"; - } - { - printf("%s %d\n", $1, NR); - } - END { - printf("#0 %d\n", NR+1); - printf(" %d\n", NR+2); - printf(" %d\n", NR+3); - }' > $dir/words.txt || exit 1; - -# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time. -token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` - -tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ - fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; - -echo "Lexicon and token FSTs compiling succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/ctc_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/ctc_token_fst.py deleted file mode 100644 index d81644b9cd216177a10a17772781d3293abe084f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/ctc_token_fst.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 1 ') -print('1 1 ') -print('2 2 ') -print('2 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 3 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(1, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 2, '', '')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/ctc_token_fst_compact.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/ctc_token_fst_compact.py deleted file mode 100644 index d3018d8b14ce25108cb1acc637cecded5d41be13..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/ctc_token_fst_compact.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 1 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 0, '', '')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/ctc_token_fst_corrected.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/ctc_token_fst_corrected.py deleted file mode 100644 index 81f7079eccb9e6447c46cdfdf6378aca7efe4a09..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/ctc_token_fst_corrected.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python - -import sys - - -def il(n): - return n 
+ 1 - - -def ol(n): - return n + 1 - - -def s(n): - return n - - -if __name__ == "__main__": - with open(sys.argv[1]) as f: - lines = f.readlines() - phone_count = 0 - disambig_count = 0 - for line in lines: - sp = line.split() - phone = sp[0] - if phone == '' or phone == '': - continue - if phone.startswith('#'): - disambig_count += 1 - else: - phone_count += 1 - - # 1. add start state - print('0 0 {} 0'.format(il(0))) - - # 2. 0 -> i, i -> i, i -> 0 - for i in range(1, phone_count + 1): - print('0 {} {} {}'.format(s(i), il(i), ol(i))) - print('{} {} {} 0'.format(s(i), s(i), il(i))) - print('{} 0 {} 0'.format(s(i), il(0))) - - # 3. i -> other phone - for i in range(1, phone_count + 1): - for j in range(1, phone_count + 1): - if i != j: - print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) - - # 4. add disambiguous arcs on every final state - for i in range(0, phone_count + 1): - for j in range(phone_count + 2, phone_count + disambig_count + 2): - print('{} {} {} {}'.format(s(i), s(i), 0, j)) - - # 5. every i is final state - for i in range(0, phone_count + 1): - print(s(i)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/eps2disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/eps2disambig.pl deleted file mode 100644 index e1d84a6bf56703596a0e4552d184f7168f724bcb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/eps2disambig.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - if (/\s+#0\s+/) { - print STDERR "$0: ERROR: LM has word #0, " . - "which is reserved as disambiguation symbol\n"; - exit 1; - } - s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; - print; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/make_lexicon_fst.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/make_lexicon_fst.pl deleted file mode 100644 index f97129c05cb3ba6460be401e92001261acfaf746..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/make_lexicon_fst.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). - -$pron_probs = 0; - -if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { - $pron_probs = 1; - shift @ARGV; -} - -if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; - print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; - print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; - print STDERR " word phone1 phone2 ... phoneN;\n"; - print STDERR "if the --pron-probs option is used, each line is:\n"; - print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; - print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; - print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; - print STDERR "this is your responsibility.\n\n"; - print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; - print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; - print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; - exit(1); -} - -$lexfn = shift @ARGV; -if (@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2) { - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if ($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - -if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. - while () { - @A = split(" ", $_); - @A == 0 && die "Empty lexicon line."; - foreach $a (@A) { - if ($a eq "") { - die "Bad lexicon line $_ ( is forbidden)"; - } - } - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; # so we only print it on the first arc of the word. - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. 
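The no-silence branch of `make_lexicon_fst.pl` above turns every lexicon entry into a chain of arcs that starts and ends at a single loop state: the word label rides on the first arc, the remaining arcs output epsilon, and an optional pronunciation probability becomes a `-log` cost attached to that first arc, with the loop state doubling as the final state. A rough Python transliteration of just that branch (the function name and the `<eps>` label are illustrative; empty pronunciations and the silence-probability branch that continues below are left out):

```python
import math

def lexicon_to_text_fst(lines, pron_probs=False, eps="<eps>"):
    """Emit an AT&T text-format lexicon FST with a single loop/final state 0."""
    loop_state, next_state = 0, 1
    arcs = []
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        word, phones = fields[0], fields[1:]
        cost = -math.log(float(phones.pop(0))) if pron_probs else 0.0
        src, out_label = loop_state, word
        for i, phone in enumerate(phones):
            if i == len(phones) - 1:
                dst = loop_state          # last phone closes the loop
            else:
                dst = next_state
                next_state += 1
            arc = f"{src}\t{dst}\t{phone}\t{out_label}"
            if cost != 0.0:
                arc += f"\t{cost}"        # pron-prob cost only on the first arc
            arcs.append(arc)
            src, out_label, cost = dst, eps, 0.0
    arcs.append(f"{loop_state}\t0")       # loop state is final with weight 0
    return "\n".join(arcs)
```

Feeding `["hello h eh l ow", "world w er l d"]` through this sketch should print essentially the same arc layout the Perl script emits before it is piped into `fstcompile`.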
- if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while () { - @A = split(" ", $_); - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. - $s = $ns; - } elsif (!defined($silphone) || $p ne $silphone) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/make_tlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/make_tlg.sh deleted file mode 100644 index 98694e5540968760f0c27eaf30a6668f4c46c50d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/make_tlg.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# - -if [ -f path.sh ]; then . path.sh; fi - -lm_dir=$1 -src_lang=$2 -tgt_lang=$3 - -arpa_lm=${lm_dir}/lm.arpa -[ ! 
-f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -rm -rf $tgt_lang -cp -r $src_lang $tgt_lang - -# Compose the language model to FST -cat $arpa_lm | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v -i '' | \ - grep -v -i '' | \ - arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \ - tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \ - --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst - - -echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic $tgt_lang/G.fst - -# Compose the token, lexicon and language-model FST into the final decoding graph -fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1; -fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1; - -echo "Composing decoding graph TLG.fst succeeded" -#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/prepare_dict.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/prepare_dict.py deleted file mode 100644 index 8a6a3cfe7cfded0c863637deef0bae2f2ede5557..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/prepare_dict.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -# sys.argv[1]: e2e model unit file(lang_char.txt) -# sys.argv[2]: raw lexicon file -# sys.argv[3]: output lexicon file -# sys.argv[4]: bpemodel - -unit_table = set() -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for line in fin: - unit = line.split()[0] - unit_table.add(unit) - - -def contain_oov(units): - for unit in units: - if unit not in unit_table: - return True - return False - - -bpemode = len(sys.argv) > 4 -if bpemode: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.Load(sys.argv[4]) -lexicon_table = set() -with open(sys.argv[2], 'r', encoding='utf8') as fin, \ - open(sys.argv[3], 'w', encoding='utf8') as fout: - for line in fin: - word = line.split()[0] - if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel - continue - elif word == '': - continue - else: - # each word only has one pronunciation for e2e system - if word in lexicon_table: - continue - if bpemode: - # We assume that the lexicon does not contain code-switch, - # i.e. the word contains both English and Chinese. - # see PR https://github.com/wenet-e2e/wenet/pull/1693 - # and Issue https://github.com/wenet-e2e/wenet/issues/1653 - if word.encode('utf8').isalpha(): - pieces = sp.EncodeAsPieces(word) - else: - pieces = word - if contain_oov(pieces): - print( - 'Ignoring words {}, which contains oov unit'.format( - ''.join(word).strip('▁')) - ) - continue - chars = ' '.join( - [p if p in unit_table else '' for p in pieces]) - else: - # ignore words with OOV - if contain_oov(word): - print('Ignoring words {}, which contains oov unit'.format(word)) - continue - # Optional, append ▁ in front of english word - # we assume the model unit of our e2e system is char now. 
- if word.encode('utf8').isalpha() and '▁' in unit_table: - word = '▁' + word - chars = ' '.join(word) # word is a char list - fout.write('{} {}\n'.format(word, chars)) - lexicon_table.add(word) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/remove_oovs.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/remove_oovs.pl deleted file mode 100644 index ac914c3bd9363eded791cdeb309fd05e980c4f2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/remove_oovs.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script removes lines that contain these OOVs on either the -# third or fourth fields of the line. It is intended to remove arcs -# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). - -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; -} - -$unklist = shift @ARGV; -open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; -while(){ - @A = split(" ", $_); - @A == 1 || die "Bad line in unknown-symbol list: $_"; - $unk{$A[0]} = 1; -} - -$num_removed = 0; -while(<>){ - @A = split(" ", $_); - if(defined $unk{$A[2]} || defined $unk{$A[3]}) { - $num_removed++; - } else { - print; - } -} -print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/rnnt_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/rnnt_token_fst.py deleted file mode 100644 index cc6def1703311ab700a4a01f22c1adda32db9b0d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/rnnt_token_fst.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, 0, phone, phone)) -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/s2eps.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/s2eps.pl deleted file mode 100644 index ffeeb8eb6af3c4f319f31ebff80be388d8f59e1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/fst/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file 
except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces and with (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } - if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } - } - print join("\t", @A) . "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/git-pre-commit b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/git-pre-commit deleted file mode 100644 index b6e448ed375a0ddf502ce332685de8a99e88dc08..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/git-pre-commit +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -e - -echo "Running pre-commit flake8" -python tools/flake8_hook.py diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/install_srilm.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/install_srilm.sh deleted file mode 100644 index 4aa113c14722a73fd3d3f84430025d44173c207b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/install_srilm.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2022 Binbin Zhang(binbzha@qq.com) - -current_path=`pwd` -current_dir=`basename "$current_path"` - -if [ "tools" != "$current_dir" ]; then - echo "You should run this script in tools/ directory!!" - exit 1 -fi - -! command -v gawk > /dev/null && \ - echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; - -srilm_url="https://github.com/BitSpeech/SRILM/archive/refs/tags/1.7.3.tar.gz" - -if [ ! -f ./srilm.tar.gz ]; then - if ! wget -O ./srilm.tar.gz "$srilm_url"; then - echo 'There was a problem downloading the file.' - echo 'Check you internet connection and try again.' - exit 1 - fi -fi - -tar -zxvf srilm.tar.gz -mv SRILM-1.7.3 srilm - -# set the SRILM variable in the top-level Makefile to this directory. -cd srilm -cp Makefile tmpf - -cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \ - > Makefile || exit 1 -rm tmpf - -make || exit -cd .. - -( - [ ! -z "${SRILM}" ] && \ - echo >&2 "SRILM variable is aleady defined. Undefining..." && \ - unset SRILM - - [ -f ./env.sh ] && . ./env.sh - - [ ! 
-z "${SRILM}" ] && \ - echo >&2 "SRILM config is already in env.sh" && exit - - wd=`pwd` - wd=`readlink -f $wd || pwd` - - echo "export SRILM=$wd/srilm" - dirs="\${PATH}" - for directory in $(cd srilm && find bin -type d ) ; do - dirs="$dirs:\${SRILM}/$directory" - done - echo "export PATH=$dirs" -) >> env.sh - -echo >&2 "Installation of SRILM finished successfully" -echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/k2/make_hlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/k2/make_hlg.sh deleted file mode 100644 index 18c2268487410824ae11b199cf06f37acd717c88..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/k2/make_hlg.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) - -lexion_dir=$1 -lm_dir=$2 -tgt_dir=$3 - -# k2 and icefall updates very fast. Below commits are veryfied in this script. -# k2 3dc222f981b9fdbc8061b3782c3b385514a2d444, icefall 499ac24ecba64f687ff244c7d66baa5c222ecf0f - -# For k2 installation, please refer to https://github.com/k2-fsa/k2/ -python -c "import k2; print(k2.__file__)" -python -c "import torch; import _k2; print(_k2.__file__)" - -# Prepare necessary icefall scripts -if [ ! -d tools/k2/icefall ]; then - git clone --depth 1 https://github.com/k2-fsa/icefall.git tools/k2/icefall -fi -pip3 install -r tools/k2/icefall/requirements.txt -export PYTHONPATH=`pwd`/tools/k2/icefall:`pwd`/tools/k2/icefall/egs/aishell/ASR/local:$PYTHONPATH - -# 8.1 Prepare char based lang -mkdir -p $tgt_dir -python tools/k2/prepare_char.py $lexion_dir/units.txt $lm_dir/wordlist $tgt_dir -echo "Compile lexicon L.pt L_disambig.pt succeeded" - -# 8.2 Prepare G -mkdir -p data/lm -python -m kaldilm \ - --read-symbol-table="$tgt_dir/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $lm_dir/lm.arpa > data/lm/G_3_gram.fst.txt - -# 8.3 Compile HLG -python tools/k2/icefall/egs/aishell/ASR/local/compile_hlg.py --lang-dir $tgt_dir -echo "Compile decoding graph HLG.pt succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/k2/prepare_char.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/k2/prepare_char.py deleted file mode 100644 index 6e05042c42eb280135f6be7cdb3566b185258b90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/k2/prepare_char.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -""" - -This script generates the following files in the directory sys.argv[3]: - - - lexicon.txt - - lexicon_disambig.txt - - L.pt - - L_disambig.pt - - tokens.txt - - words.txt -""" - -import sys -from pathlib import Path -from typing import Dict, List - -import k2 -import torch -from prepare_lang import ( - Lexicon, - add_disambig_symbols, - add_self_loops, - write_lexicon, - write_mapping, -) - - -def lexicon_to_fst_no_sil( - lexicon: Lexicon, - token2id: Dict[str, int], - word2id: Dict[str, int], - need_self_loops: bool = False, -) -> k2.Fsa: - """Convert a lexicon to an FST (in k2 format). - - Args: - lexicon: - The input lexicon. See also :func:`read_lexicon` - token2id: - A dict mapping tokens to IDs. - word2id: - A dict mapping words to IDs. - need_self_loops: - If True, add self-loop to states with non-epsilon output symbols - on at least one arc out of the state. The input label for this - self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. - Returns: - Return an instance of `k2.Fsa` representing the given lexicon. - """ - loop_state = 0 # words enter and leave from here - next_state = 1 # the next un-allocated state, will be incremented as we go - - arcs = [] - - # The blank symbol is defined in local/train_bpe_model.py - assert token2id[""] == 0 - assert word2id[""] == 0 - - eps = 0 - - for word, pieces in lexicon: - assert len(pieces) > 0, f"{word} has no pronunciations" - cur_state = loop_state - - word = word2id[word] - pieces = [ - token2id[i] if i in token2id else token2id[""] for i in pieces - ] - - for i in range(len(pieces) - 1): - w = word if i == 0 else eps - arcs.append([cur_state, next_state, pieces[i], w, 0]) - - cur_state = next_state - next_state += 1 - - # now for the last piece of this word - i = len(pieces) - 1 - w = word if i == 0 else eps - arcs.append([cur_state, loop_state, pieces[i], w, 0]) - - if need_self_loops: - disambig_token = token2id["#0"] - disambig_word = word2id["#0"] - arcs = add_self_loops( - arcs, - disambig_token=disambig_token, - disambig_word=disambig_word, - ) - - final_state = next_state - arcs.append([loop_state, final_state, -1, -1, 0]) - arcs.append([final_state]) - - arcs = sorted(arcs, key=lambda arc: arc[0]) - arcs = [[str(i) for i in arc] for arc in arcs] - arcs = [" ".join(arc) for arc in arcs] - arcs = "\n".join(arcs) - - fsa = k2.Fsa.from_str(arcs, acceptor=False) - return fsa - - -def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool: - """Check if all the given tokens are in token symbol table. - - Args: - token_sym_table: - Token symbol table that contains all the valid tokens. - tokens: - A list of tokens. - Returns: - Return True if there is any token not in the token_sym_table, - otherwise False. - """ - for tok in tokens: - if tok not in token_sym_table: - return True - return False - - -def generate_lexicon( - token_sym_table: Dict[str, int], words: List[str] -) -> Lexicon: - """Generate a lexicon from a word list and token_sym_table. - - Args: - token_sym_table: - Token symbol table that mapping token to token ids. - words: - A list of strings representing words. - Returns: - Return a dict whose keys are words and values are the corresponding - tokens. 
- """ - lexicon = [] - for word in words: - chars = list(word.strip(" \t")) - if contain_oov(token_sym_table, chars): - continue - lexicon.append((word, chars)) - - # The OOV word is - lexicon.append(("", [""])) - return lexicon - - -def generate_tokens(text_file: str) -> Dict[str, int]: - """Generate tokens from the given text file. - - Args: - text_file: - A file that contains text lines to generate tokens. - Returns: - Return a dict whose keys are tokens and values are token ids ranged - from 0 to len(keys) - 1. - """ - token2id: Dict[str, int] = dict() - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - char, index = line.replace('\n', '').split() - assert char not in token2id - token2id[char] = int(index) - assert token2id[''] == 0 - return token2id - - -def generate_words(text_file: str) -> Dict[str, int]: - """Generate words from the given text file. - - Args: - text_file: - A file that contains text lines to generate words. - Returns: - Return a dict whose keys are words and values are words ids ranged - from 0 to len(keys) - 1. - """ - words = [] - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - word = line.replace('\n', '') - assert word not in words - words.append(word) - words.sort() - - # We put '' '' at begining of word2id - # '#0', '', '' at end of word2id - words = [word for word in words - if word not in ['', '', '#0', '', '']] - words.insert(0, '') - words.insert(1, '') - words.append('#0') - words.append('') - words.append('') - word2id = {j: i for i, j in enumerate(words)} - return word2id - - -def main(): - token2id = generate_tokens(sys.argv[1]) - word2id = generate_words(sys.argv[2]) - tgt_dir = Path(sys.argv[3]) - - words = [word for word in word2id.keys() - if word not in - ["", "!SIL", "", "", "#0", "", ""]] - lexicon = generate_lexicon(token2id, words) - - lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) - next_token_id = max(token2id.values()) + 1 - for i in range(max_disambig + 1): - disambig = f"#{i}" - assert disambig not in token2id - token2id[disambig] = next_token_id - next_token_id += 1 - - write_mapping(tgt_dir / "tokens.txt", token2id) - write_mapping(tgt_dir / "words.txt", word2id) - write_lexicon(tgt_dir / "lexicon.txt", lexicon) - write_lexicon(tgt_dir / "lexicon_disambig.txt", lexicon_disambig) - - L = lexicon_to_fst_no_sil( - lexicon, - token2id=token2id, - word2id=word2id, - ) - L_disambig = lexicon_to_fst_no_sil( - lexicon_disambig, - token2id=token2id, - word2id=word2id, - need_self_loops=True, - ) - torch.save(L.as_dict(), tgt_dir / "L.pt") - torch.save(L_disambig.as_dict(), tgt_dir / "L_disambig.pt") - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/latency_metrics.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/latency_metrics.py deleted file mode 100644 index df2d8eee45f8e2d7c8536f208d44fafaeac3341f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/latency_metrics.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2022 Horizon Inc. (author: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import argparse -import logging -import librosa -import torch -import torchaudio -import yaml - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.font_manager as fm -import torchaudio.compliance.kaldi as kaldi - -from wenet.utils.init_model import init_model -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.mask import make_pad_mask -from wenet.utils.common import replace_duplicates_with_blank - - -def get_args(): - parser = argparse.ArgumentParser( - description='Analyze latency and plot CTC-Spike.') - parser.add_argument('--config', required=True, - type=str, help='configration') - parser.add_argument('--gpu', - type=int, - default=0, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--ckpt', required=True, - type=str, help='model checkpoint') - parser.add_argument('--tag', required=True, - type=str, help='image subtitle') - parser.add_argument('--wavscp', required=True, - type=str, help='wav.scp') - parser.add_argument('--alignment', required=True, - type=str, help='force alignment, generated by Kaldi.') - parser.add_argument('--chunk_size', required=True, - type=int, help='chunk size') - parser.add_argument('--left_chunks', default=-1, - type=int, help='left chunks') - parser.add_argument('--font', required=True, - type=str, help='font file') - parser.add_argument('--dict', required=True, - type=str, help='dict file') - parser.add_argument('--result_dir', required=True, - type=str, help='saving pdf') - parser.add_argument('--model_type', default='ctc', - choices=['ctc', 'transducer'], - help='show latency metrics from ctc models or rnn-t models') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - torch.manual_seed(777) - - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - symbol_table = read_symbol_table(args.dict) - char_dict = {v: k for k, v in symbol_table.items()} - - # 1. Load model - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - - model = init_model(conf) - load_checkpoint(model, args.ckpt) - model = model.eval().to(device) - - subsampling = model.encoder.embed.subsampling_rate - eos = model.eos_symbol() - - with open(args.wavscp, 'r') as fin: - wavs = fin.readlines() - - # 2. 
Forward model (get streaming_timestamps) - timestamps = {} - for idx, wav in enumerate(wavs): - if idx % 100 == 0: - logging.info("processed {}.".format(idx)) - key, wav = wav.strip().split(' ', 1) - waveform, sr = torchaudio.load(wav) - resample_rate = conf['dataset_conf']['resample_conf']['resample_rate'] - waveform = torchaudio.transforms.Resample( - orig_freq=sr, new_freq=resample_rate)(waveform) - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank( - waveform, - num_mel_bins=conf['dataset_conf']['fbank_conf']['num_mel_bins'], - frame_length=conf['dataset_conf']['fbank_conf']['frame_length'], - frame_shift=conf['dataset_conf']['fbank_conf']['frame_shift'], - dither=0.0, energy_floor=0.0, - sample_frequency=resample_rate, - ) - - speech = mat.unsqueeze(0).to(device) - speech_lengths = torch.tensor([mat.size(0)]).to(device) - - # Let's assume batch_size = 1 - encoder_out, encoder_mask = model.encoder( - speech, speech_lengths, args.chunk_size, args.left_chunks) - - maxlen = encoder_out.size(1) # (B, maxlen, encoder_dim) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # CTC greedy search - if args.model_type == 'ctc': - ctc_probs = model.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - topk_prob = topk_prob.view(1, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen) - topk_prob = topk_prob.masked_fill_(mask, 0.0) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - hyps = [replace_duplicates_with_blank(hyp) for hyp in hyps] - scores = [prob.tolist() for prob in topk_prob] - timestamps[key] = [hyps[0], scores[0], wav] - - if args.model_type == 'transducer': - hyps = [] - scores = [] - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = 1 - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, - padding, cache) - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - scores.append(torch.max(joint_out_probs).item()) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or \ - per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - hyps.append(model.blank) - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - timestamps[key] = [hyps, scores, wav] - - # 3. 
Analyze latency - with open(args.alignment, 'r') as fin: - aligns = fin.readlines() - not_found, len_unequal, ignored = 0, 0, 0 - datas = [] - for align in aligns: - key, align = align.strip().split(' ', 1) - if key not in timestamps: - not_found += 1 - continue - fa, st = [], [] # force_alignment, streaming_timestamps - text_fa, text_st = "", "" - for i, token in enumerate(align.split()): - if token != '': - text_fa += token - # NOTE(xcsong): W/O subsample - fa.append(i * 10) - # ignore alignment_errors >= 70ms - frames_fa = len(align.split()) - frames_st = len(timestamps[key][0]) * subsampling - if abs(frames_st - frames_fa) >= 7: - ignored += 1 - continue - for i, token_id in enumerate(timestamps[key][0]): - if token_id != 0: - text_st += char_dict[token_id] - # NOTE(xcsong): W subsample - st.append(i * subsampling * 10) - if len(fa) != len(st): - len_unequal += 1 - continue - # datas[i] = [key, text_fa, text_st, list_of_diff, - # FirstTokenDelay, LastTokenDelay, AvgTokenDelay, - # streaming_timestamps, force_alignment] - datas.append([key, text_fa, text_st, - [a - b for a, b in zip(st, fa)], - st[0] - fa[0], st[-1] - fa[-1], - (sum(st) - sum(fa)) / len(st), - timestamps[key], align.split()]) - - logging.info("not found: {}, length unequal: {}, ignored: {}, \ - valid samples: {}".format(not_found, len_unequal, ignored, len(datas))) - - # 4. Plot and print - num_datas = len(datas) - names = ['FirstTokenDelay', 'LastTokenDelay', 'AvgTokenDelay'] - names_index = [4, 5, 6] - parts = ['max', 'P90', 'P75', 'P50', 'P25', 'min'] - parts_index = [num_datas - 1, int(num_datas * 0.90), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - for name, name_idx in zip(names, names_index): - def f(name_idx=name_idx): - return name_idx - datas.sort(key=lambda x: x[f()]) - logging.info("==========================") - for p, i in zip(parts, parts_index): - data = datas[i] - # i.e., LastTokenDelay P90: 270.000 ms (wav_id: BAC009S0902W0144) - logging.info("{} {}: {:.3f} ms (wav_id: {})".format( - name, p, data[f()], datas[i][0])) - - font = fm.FontProperties(fname=args.font) - plt.rcParams['axes.unicode_minus'] = False - # we will have 2 sub-plots (force-align + streaming timestamps) - # plus one wav-plot - fig, axes = plt.subplots(figsize=(60, 60), nrows=3, ncols=1) - for j in range(2): - if j == 0: - # subplot-0: streaming_timestamps - plt_prefix = args.tag + "_" + name + "_" + p - x = np.arange(len(data[7][0])) * subsampling - hyps, scores = data[7][0], data[7][1] - else: - # subplot-1: force_alignments - plt_prefix = "force_alignment" - x = np.arange(len(data[8])) - hyps = [symbol_table[d] for d in data[8]] - scores = [0.0] * len(data[8]) - axes[j].set_title(plt_prefix, fontsize=30) - for frame, token, prob in zip(x, hyps, scores): - if char_dict[token] != '': - axes[j].bar( - frame, np.exp(prob), - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].text( - frame, np.exp(prob), - '{} {:.3f} {}'.format( - char_dict[token], np.exp(prob), frame), - fontdict=dict(fontsize=24), - fontproperties=font, - ) - else: - axes[j].bar( - frame, 0.01, - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].tick_params(labelsize=25) - - # subplot-2: wav - # wav, hardcode sample_rate to 16000 - samples, sr = librosa.load(data[7][2], sr=16000) - time = np.arange(0, len(samples)) * (1.0 / sr) - axes[-1].plot(time, samples) - - # i.e., RESULT_DIR/LTD_P90_120ms_BAC009S0768W0342.pdf - plt.savefig(args.result_dir + "/" + name + "_" + - p + "_" + str(data[f()]) 
+ "ms" + "_" + data[0] + ".pdf") - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/make_raw_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/make_raw_list.py deleted file mode 100644 index 2f84f015542bb38da027b8ea61e8638f873cec33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/make_raw_list.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('output_file', help='output list file') - args = parser.parse_args() - - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - if args.segments is not None: - segments_table = {} - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - with open(args.text_file, 'r', encoding='utf8') as fin, \ - open(args.output_file, 'w', encoding='utf8') as fout: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if args.segments is None: - assert key in wav_table - wav = wav_table[key] - line = dict(key=key, wav=wav, txt=txt) - else: - assert key in segments_table - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - line = dict(key=key, wav=wav, txt=txt, start=start, end=end) - json_line = json.dumps(line, ensure_ascii=False) - fout.write(json_line + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/make_shard_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/make_shard_list.py deleted file mode 100644 index 1f7d82829808c9cc181bbc5e0f60cccef8795bae..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/make_shard_list.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
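> Editor's note: the removed latency_metrics.py above (and the WER check in onnx2horizonbin.py further down) decodes the CTC head greedily: per-frame arg-max, padded frames masked out, then repeats collapsed and blanks dropped (`remove_duplicates_and_blank`). The latency script keeps the frame-aligned variant (`replace_duplicates_with_blank`) instead, so CTC spike positions are preserved for timing analysis. A minimal PyTorch sketch of the collapse-and-drop variant, assuming blank id 0:

```python
import torch


def ctc_greedy_decode(ctc_log_probs: torch.Tensor, lengths: torch.Tensor, blank: int = 0):
    """Greedy CTC decoding.

    ctc_log_probs: (B, T, V) log-probabilities from the CTC head.
    lengths:       (B,) valid frame counts per utterance.
    Returns a list of token-id lists with repeats collapsed and blanks removed.
    """
    best = ctc_log_probs.argmax(dim=-1)  # (B, T) frame-wise best token
    results = []
    for hyp, n in zip(best.tolist(), lengths.tolist()):
        hyp = hyp[:n]  # drop padded frames
        collapsed, prev = [], None
        for tok in hyp:
            if tok != blank and tok != prev:  # collapse repeats, skip blanks
                collapsed.append(tok)
            prev = tok
        results.append(collapsed)
    return results


if __name__ == "__main__":
    probs = torch.randn(2, 6, 10).log_softmax(dim=-1)  # fake CTC output for the demo
    print(ctc_greedy_decode(probs, torch.tensor([6, 4])))
```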
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import io -import logging -import os -import tarfile -import time -import multiprocessing - -import torch -import torchaudio -import torchaudio.backend.sox_io_backend as sox - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def write_tar_file(data_list, - no_segments, - tar_file, - resample=16000, - index=0, - total=1): - logging.info('Processing {} {}/{}'.format(tar_file, index, total)) - read_time = 0.0 - save_time = 0.0 - write_time = 0.0 - with tarfile.open(tar_file, "w") as tar: - prev_wav = None - for item in data_list: - if no_segments: - key, txt, wav = item - else: - key, txt, wav, start, end = item - - suffix = wav.split('.')[-1] - assert suffix in AUDIO_FORMAT_SETS - if no_segments: - ts = time.time() - with open(wav, 'rb') as fin: - data = fin.read() - read_time += (time.time() - ts) - else: - if wav != prev_wav: - ts = time.time() - waveforms, sample_rate = sox.load(wav, normalize=False) - read_time += (time.time() - ts) - prev_wav = wav - start = int(start * sample_rate) - end = int(end * sample_rate) - audio = waveforms[:1, start:end] - - # resample - if sample_rate != resample: - if not audio.is_floating_point(): - # normalize the audio before resample - # because resample can't process int audio - audio = audio / (1 << 15) - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - audio = (audio * (1 << 15)).short() - else: - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - - ts = time.time() - f = io.BytesIO() - sox.save(f, audio, resample, format="wav", bits_per_sample=16) - # Save to wav for segments file - suffix = "wav" - f.seek(0) - data = f.read() - save_time += (time.time() - ts) - - assert isinstance(txt, str) - ts = time.time() - txt_file = key + '.txt' - txt = txt.encode('utf8') - txt_data = io.BytesIO(txt) - txt_info = tarfile.TarInfo(txt_file) - txt_info.size = len(txt) - tar.addfile(txt_info, txt_data) - - wav_file = key + '.' 
+ suffix - wav_data = io.BytesIO(data) - wav_info = tarfile.TarInfo(wav_file) - wav_info.size = len(data) - tar.addfile(wav_info, wav_data) - write_time += (time.time() - ts) - logging.info('read {} save {} write {}'.format(read_time, save_time, - write_time)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--num_utts_per_shard', - type=int, - default=1000, - help='num utts per shard') - parser.add_argument('--num_threads', - type=int, - default=1, - help='num threads for make shards') - parser.add_argument('--prefix', - default='shards', - help='prefix of shards tar file') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('--resample', - type=int, - default=16000, - help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('shards_dir', help='output shards dir') - parser.add_argument('shards_list', help='output shards list file') - args = parser.parse_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - torch.set_num_threads(1) - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - no_segments = True - segments_table = {} - if args.segments is not None: - no_segments = False - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - data = [] - with open(args.text_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if no_segments: - assert key in wav_table - wav = wav_table[key] - data.append((key, txt, wav)) - else: - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - data.append((key, txt, wav, start, end)) - - num = args.num_utts_per_shard - chunks = [data[i:i + num] for i in range(0, len(data), num)] - os.makedirs(args.shards_dir, exist_ok=True) - - # Using thread pool to speedup - pool = multiprocessing.Pool(processes=args.num_threads) - shards_list = [] - tasks_list = [] - num_chunks = len(chunks) - for i, chunk in enumerate(chunks): - tar_file = os.path.join(args.shards_dir, - '{}_{:09d}.tar'.format(args.prefix, i)) - shards_list.append(tar_file) - pool.apply_async( - write_tar_file, - (chunk, no_segments, tar_file, args.resample, i, num_chunks)) - - pool.close() - pool.join() - - with open(args.shards_list, 'w', encoding='utf8') as fout: - for name in shards_list: - fout.write(name + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/merge_scp2txt.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/merge_scp2txt.py deleted file mode 100644 index 51f1c42f272f0fd9fec0a7d69ee860d2f1eb6158..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/merge_scp2txt.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -from distutils.util import strtobool -from io import open -import logging -import sys - -PY2 = sys.version_info[0] == 2 -sys.stdin = codecs.getreader('utf-8')(sys.stdin if PY2 
else sys.stdin.buffer) -sys.stdout = codecs.getwriter('utf-8')( - sys.stdout if PY2 else sys.stdout.buffer) - - -# Special types: -def shape(x): - """Change str to List[int] - - >>> shape('3,5') - [3, 5] - >>> shape(' [3, 5] ') - [3, 5] - - """ - - # x: ' [3, 5] ' -> '3, 5' - x = x.strip() - if x[0] == '[': - x = x[1:] - if x[-1] == ']': - x = x[:-1] - - return list(map(int, x.split(','))) - - -def get_parser(): - parser = argparse.ArgumentParser( - description='Given each file paths with such format as ' - '::. type> can be omitted and the default ' - 'is "str". e.g. {} ' - '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' - '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' - '--output-scps text:data/text shape:data/utt2text_shape:shape ' - '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--input-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the inputs') - parser.add_argument('--output-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the outputs') - parser.add_argument('--scps', - type=str, - nargs='+', - default=[], - help='The files except for the input and outputs') - parser.add_argument('--verbose', - '-V', - default=1, - type=int, - help='Verbose option') - parser.add_argument('--allow-one-column', - type=strtobool, - default=False, - help='Allow one column in input scp files. ' - 'In this case, the value will be empty string.') - parser.add_argument('--out', - '-O', - type=str, - help='The output filename. ' - 'If omitted, then output to sys.stdout') - return parser - - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - args.scps = [args.scps] - - # logging info - logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - if args.verbose > 0: - logging.basicConfig(level=logging.INFO, format=logfmt) - else: - logging.basicConfig(level=logging.WARN, format=logfmt) - - inputs = {} - assert (len(args.input_scps) == 1) - for f in args.input_scps[0]: - arr = f.strip().split(':') - inputs[arr[0]] = arr[1] - assert ('feat' in inputs) - assert ('shape' in inputs) - - outputs = {} - assert (len(args.output_scps) == 1) - for f in args.output_scps[0]: - arr = f.strip().split(':') - outputs[arr[0]] = arr[1] - assert ('shape' in outputs) - assert ('text' in outputs) - assert ('token' in outputs) - assert ('tokenid' in outputs) - - files = [ - inputs['feat'], inputs['shape'], outputs['text'], outputs['token'], - outputs['tokenid'], outputs['shape'] - ] - fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape'] - fids = [open(f, 'r', encoding='utf-8') for f in files] - - if args.out is None: - out = sys.stdout - else: - out = open(args.out, 'w', encoding='utf-8') - done = False - while not done: - for i, fid in enumerate(fids): - line = fid.readline() - if line == '': - done = True - break - arr = line.strip().split() - content = ' '.join(arr[1:]) - if i == 0: - out.write('utt:{}'.format(arr[0])) - out.write('\t') - out.write('{}:{}'.format(fields[i], content)) - out.write('\n') - - for f in fids: - f.close() - if args.out is not None: - out.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/onnx2horizonbin.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/onnx2horizonbin.py deleted file mode 100644 index 
a94b647fb19d1446d4bc506c399c85677dddde9f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/onnx2horizonbin.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. specific decoding method: ctc_greedy_search -""" - -import argparse -import copy -import logging -import os -import sys -import random -import torch -import yaml -import numpy as np - -from torch.utils.data import DataLoader - -from wenet.utils.common import remove_duplicates_and_blank -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import to_numpy -from wenet.bin.export_onnx_bpu import export_encoder, export_ctc - - -try: - import hbdk # noqa: F401 - import horizon_nn # noqa: F401 - from horizon_tc_ui import HB_ONNXRuntime -except ImportError: - print('Please install hbdk,horizon_nn,horizon_tc_ui !') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def save_data(tensor, dirs, prefix): - if tensor.requires_grad: - data = tensor.detach().numpy().astype(np.float32) - else: - data = tensor.numpy().astype(np.float32) - os.makedirs(dirs, exist_ok=True) - data.tofile(dirs + "/" + prefix + ".bin") - - -def make_calibration_data(enc, args, conf): - conf['shuffle'] = True - logger.info(conf) - dataset = Dataset( - "shard", args.cali_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - cal_data_dir = os.path.join(args.output_dir, 
'cal_data_dir') - for batch_idx, batch in enumerate(dataloader): - if batch_idx >= args.max_samples: - break - if batch_idx % 100 == 0: - logger.info("processed {} samples.".format(batch_idx)) - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - - # Feed forward overlap input step by step - random_high = (num_frames - context) // stride - num_rand = random.randint(0, random_high) - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - if i == num_rand: - save_data(chunk, "{}/chunk".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_cache, "{}/att_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(cnn_cache, "{}/cnn_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_mask, "{}/att_mask".format(cal_data_dir), - prefix + "." + str(i)) - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - # NOTE(xcsong): It's fast to calibrate ctc.onnx, - # so it's okay to save all chunks - save_data(y, "{}/hidden".format(cal_data_dir), - prefix + "." 
+ str(i)) - - -def check_wer(enc, ctc, args, conf): - conf['shuffle'] = False - dataset = Dataset( - "shard", args.wer_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - char_dict = {v: k for k, v in args.symbol_table.items()} - eos = len(char_dict) - 1 - - enc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_encoder/encoder_quantized_model.onnx") - ctc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_ctc/ctc_quantized_model.onnx") - torch_file = open(args.output_dir + "/torch_text", 'w', encoding="utf-8") - onnx_file = open(args.output_dir + "/onnx_text", 'w', encoding="utf-8") - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - for batch_idx, batch in enumerate(dataloader): - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - - # Feed forward overlap input step by step - torch_out, onnx_out = [], [] - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - # Torch model - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - torch_out.append(ctc.forward(y).transpose(1, 3).squeeze(2)) - # Quantized onnx model - ort_inputs = { - 'chunk': to_numpy(chunk), 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': to_numpy(att_mask)} - ort_outs = enc_session.run_feature( - enc_session.output_names, ort_inputs, input_offset=0) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_y = ctc_session.run_feature( - ctc_session.output_names, {'hidden': ort_outs[0]}, input_offset=0) - onnx_out.append(torch.from_numpy( - np.squeeze(onnx_y[0].transpose(0, 3, 2, 1), axis=2))) - - def post_process(list_out, file_obj, keys): - probs = torch.cat(list_out, dim=1) - maxlen = probs.size(1) - topk_prob, topk_index = probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - hyps = 
[hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - for i, key in enumerate(keys): - content = '' - for w in hyps[i]: - if w == eos: - break - content += char_dict[w] - file_obj.write('{} {}\n'.format(key, content)) - return key, content - - if len(torch_out) > 0 and len(onnx_out) > 0: - key, content = post_process(torch_out, torch_file, keys) - logger.info('torch: {} {}'.format(key, content)) - key, content = post_process(onnx_out, onnx_file, keys) - logger.info('onnx : {} {}'.format(key, content)) - torch_file.close() - onnx_file.close() - - -def generate_config(enc_session, ctc_session, args): - template = """ -# 模型参数组 -model_parameters: - # 原始Onnx浮点模型文件 - onnx_model: '{}' - # 转换的目标AI芯片架构 - march: 'bernoulli2' - # 模型转换输出的用于上板执行的模型文件的名称前缀 - output_model_file_prefix: '{}' - # 模型转换输出的结果的存放目录 - working_dir: '{}' - # 指定转换后混合异构模型是否保留输出各层的中间结果的能力 - layer_out_dump: False - # 转换过程中日志生成级别 - log_level: 'debug' -# 输入信息参数组 -input_parameters: - # 原始浮点模型的输入节点名称 - input_name: '{}' - # 原始浮点模型的输入数据格式(数量/顺序与input_name一致) - input_type_train: '{}' - # 原始浮点模型的输入数据排布(数量/顺序与input_name一致) - input_layout_train: '{}' - # 原始浮点模型的输入数据尺寸 - input_shape: '{}' - # 网络实际执行时,输入给网络的batch_size 默认值为1 - # input_batch: 1 - # 在模型中添加的输入数据预处理方法 - norm_type: '{}' - # 预处理方法的图像减去的均值; 如果是通道均值,value之间必须用空格分隔 - # mean_value: '' - # 预处理方法的图像缩放比例,如果是通道缩放比例,value之间必须用空格分隔 - # scale_value: '' - # 转换后混合异构模型需要适配的输入数据格式(数量/顺序与input_name一致) - input_type_rt: '{}' - # 输入数据格式的特殊制式 - input_space_and_range: '' - # 转换后混合异构模型需要适配的输入数据排布(数量/顺序与input_name一致) - input_layout_rt: '{}' -# 校准参数组 -calibration_parameters: - # 模型校准使用的标定样本的存放目录 - cal_data_dir: '{}' - # 开启图片校准样本自动处理(skimage read resize到输入节点尺寸) - preprocess_on: False - # 校准使用的算法类型 - calibration_type: '{}' - # max 校准方式的参数 - max_percentile: 1.0 - # 强制指定OP在CPU上运行 - run_on_cpu: '{}' - # 强制指定OP在BPU上运行 - run_on_bpu: '{}' -# 编译参数组 -compiler_parameters: - # 编译策略选择 - compile_mode: 'latency' - # 是否打开编译的debug信息 - debug: False - # 模型运行核心数 - core_num: 1 - # 模型编译的优化等级选择 - optimize_level: 'O3' -""" - output_dir = os.path.realpath(args.output_dir) - cal_data_dir = os.path.join(output_dir, 'cal_data_dir') - os.makedirs(cal_data_dir, exist_ok=True) - enc_dic = enc_session.get_modelmeta().custom_metadata_map - enc_onnx_path = os.path.join(output_dir, 'encoder.onnx') - enc_log_path = os.path.join(output_dir, 'hb_makertbin_output_encoder') - enc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in enc_dic['input_name'].split(';')]) - ctc_dic = ctc_session.get_modelmeta().custom_metadata_map - ctc_onnx_path = os.path.join(output_dir, 'ctc.onnx') - ctc_log_path = os.path.join(output_dir, 'hb_makertbin_output_ctc') - ctc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in ctc_dic['input_name'].split(';')]) - enc_config = template.format( - enc_onnx_path, "encoder", enc_log_path, - enc_dic['input_name'], enc_dic['input_type'], - enc_dic['input_layout_train'], enc_dic['input_shape'], - enc_dic['norm_type'], enc_dic['input_type'], enc_dic['input_layout_rt'], - enc_cal_data, args.calibration_type, args.extra_ops_run_on_cpu, "") - ctc_config = template.format( - ctc_onnx_path, "ctc", ctc_log_path, - ctc_dic['input_name'], ctc_dic['input_type'], - ctc_dic['input_layout_train'], ctc_dic['input_shape'], - ctc_dic['norm_type'], ctc_dic['input_type'], ctc_dic['input_layout_rt'], - ctc_cal_data, "default", "", "") - with open(output_dir + "/config_encoder.yaml", "w") as enc_yaml: - enc_yaml.write(enc_config) - with open(output_dir + 
"/config_ctc.yaml", "w") as ctc_yaml: - ctc_yaml.write(ctc_config) - - -def get_args(): - parser = argparse.ArgumentParser(description='convert onnx to horizon .bin') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - parser.add_argument('--dict', type=str, required=True, help='dict file') - parser.add_argument('--max_samples', type=int, required=True, - help='maximum samples') - parser.add_argument('--cali_datalist', type=str, default=None, - help='make calibration data') - parser.add_argument('--wer_datalist', type=str, default=None, - help='check wer') - parser.add_argument('--wer_text', type=str, default=None, - help='check wer') - parser.add_argument('--bpe_model', default=None, type=str, - help='bpe model for english part') - parser.add_argument('--ln_run_on_bpu', action='store_true', - help='layernorm running on bpu') - parser.add_argument('--extra_ops_run_on_cpu', type=str, default=None, - help='extra operations running on cpu.') - parser.add_argument('--calibration_type', type=str, default='default', - help='kl / max / default.') - return parser - - -if __name__ == '__main__': - random.seed(777) - parser = get_args() - args = parser.parse_args() - # NOTE(xcsong): X3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(conf) - load_checkpoint(model, args.checkpoint) - model.eval() - - symbol_table = read_symbol_table(args.dict) - args.symbol_table = symbol_table - args.feature_size = conf['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - logger.info("Stage-1: Export onnx") - enc, enc_session = export_encoder(model, args) - ctc, ctc_session = export_ctc(model, args) - - conf = copy.deepcopy(conf['dataset_conf']) - conf['filter_conf']['max_length'] = 102400 - conf['filter_conf']['min_length'] = 0 - conf['filter_conf']['token_max_length'] = 102400 - conf['filter_conf']['token_min_length'] = 0 - conf['filter_conf']['max_output_input_ratio'] = 102400 - conf['filter_conf']['min_output_input_ratio'] = 0 - conf['speed_perturb'] = False - conf['spec_aug'] = False - conf['spec_sub'] = False - conf['spec_trim'] = False - conf['shuffle'] = False - conf['sort'] = False - if 'fbank_conf' in conf: - conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in conf: - conf['mfcc_conf']['dither'] = 0.0 - conf['batch_conf']['batch_type'] = "static" - conf['batch_conf']['batch_size'] = 1 - - if args.cali_datalist is not None: - logger.info("Stage-2: Generate config") - # FIXME(xcsong): Remove hard code - logger.info("torch version: {}".format(torch.__version__)) - if int(torch.__version__[:4].split('.')[1]) >= 13: - args.extra_ops_run_on_cpu = "/Split;" + \ - "/encoders.0/self_attn/Split;/encoders.1/self_attn/Split;" + \ - 
"/encoders.2/self_attn/Split;/encoders.3/self_attn/Split;" + \ - "/encoders.4/self_attn/Split;/encoders.5/self_attn/Split;" + \ - "/encoders.6/self_attn/Split;/encoders.7/self_attn/Split;" + \ - "/encoders.8/self_attn/Split;/encoders.9/self_attn/Split;" + \ - "/encoders.10/self_attn/Split;/encoders.11/self_attn/Split;" + \ - "/encoders.0/self_attn/Mul;/encoders.1/self_attn/Mul;" + \ - "/encoders.2/self_attn/Mul;/encoders.3/self_attn/Mul;" + \ - "/encoders.4/self_attn/Mul;/encoders.5/self_attn/Mul;" + \ - "/encoders.6/self_attn/Mul;/encoders.7/self_attn/Mul;" + \ - "/encoders.8/self_attn/Mul;/encoders.9/self_attn/Mul;" + \ - "/encoders.10/self_attn/Mul;/encoders.11/self_attn/Mul;" - else: - args.extra_ops_run_on_cpu = "Split_17;Split_67;Split_209;" + \ - "Split_351;Split_493;Split_635;Split_777;Split_919;Split_1061;" + \ - "Split_1203;Split_1345;Split_1487;Split_1629;" + \ - "Mul_72;Mul_214;Mul_356;Mul_498;Mul_640;Mul_782;" + \ - "Mul_924;Mul_1066;Mul_1208;Mul_1350;Mul_1492;Mul_1634;" - generate_config(enc_session, ctc_session, args) - - logger.info("Stage-3: Make calibration data") - make_calibration_data(enc, args, conf) - - output_dir = os.path.realpath(args.output_dir) - logger.info("Stage-4: Make ctc.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_ctc".format(output_dir) + - " && cd hb_makertbin_log_ctc &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_ctc.yaml") - ) - logger.info("Stage-5: Make encoder.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_encoder ".format(output_dir) + - " && cd hb_makertbin_log_encoder &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_encoder.yaml") - ) - - if args.wer_datalist is not None: - logger.info("Stage-6: Check wer between torch model and quantized onnx") - assert args.wer_text is not None - check_wer(enc, ctc, args, conf) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/torch_text", - args.output_dir + "/torch_wer") - ) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/onnx_text", - args.output_dir + "/onnx_wer") - ) - os.system("tail {} {}".format( - args.output_dir + "/torch_wer", args.output_dir + "/onnx_wer")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/parse_options.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/parse_options.sh deleted file mode 100644 index 34476fdb37a4b14d5fe6e0edbebe97e760d2be5a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- - -# Parse command-line options. -# To be sourced by another script (as in ". parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/perturb_data_dir_speed.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/perturb_data_dir_speed.sh deleted file mode 100644 index 901a4882e6481ae269067b0fe7175dba62c4db9e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/perturb_data_dir_speed.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# 2020 @kamo-naoyuki -# This file was copied from Kaldi and -# I deleted parts related to wav duration -# because we shouldn't use kaldi's command here -# and we don't need the files actually. 
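> Editor's note: parse_options.sh, removed above, implements the Kaldi convention its header comments describe: each leading `--option-name value` pair sets an already-declared shell variable `option_name`, unknown option names abort, and options whose default is boolean must receive `true`/`false`. For readers who do not want to trace the shell, a rough Python equivalent of the same contract (hypothetical helper, not part of the repository):

```python
def parse_kaldi_options(argv, defaults):
    """Consume leading '--name value' pairs in the style of tools/parse_options.sh.

    defaults: dict of known options and their default values (booleans stay booleans).
    Returns (options, remaining_positional_args).
    """
    opts = dict(defaults)
    i = 0
    while i < len(argv) and argv[i].startswith("--"):
        name = argv[i][2:].replace("-", "_")
        if name not in opts:
            raise SystemExit(f"invalid option {argv[i]}")
        if i + 1 >= len(argv):
            raise SystemExit(f"missing value for {argv[i]}")
        value = argv[i + 1]
        if isinstance(opts[name], bool):  # boolean options must be 'true' or 'false'
            if value not in ("true", "false"):
                raise SystemExit(f'expected "true" or "false": {argv[i]} {value}')
            value = value == "true"
        opts[name] = value
        i += 2
    return opts, argv[i:]


# Example: parse_kaldi_options(["--nj", "4", "--stage", "2", "data/train"],
#                              {"nj": "1", "stage": "0"})
```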
- -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 2014 Tom Ko -# 2018 Emotech LTD (author: Pawel Swietojanski) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# wav.scp -# spk2utt -# utt2spk -# text -# -# It generates the files which are used for perturbing the speed of the original data. - -export LC_ALL=C -set -euo pipefail - -if [[ $# != 3 ]]; then - echo "Usage: perturb_data_dir_speed.sh " - echo "e.g.:" - echo " $0 0.9 data/train_si284 data/train_si284p" - exit 1 -fi - -factor=$1 -srcdir=$2 -destdir=$3 -label="sp" -spk_prefix="${label}${factor}-" -utt_prefix="${label}${factor}-" - -#check is sox on the path - -! command -v sox &>/dev/null && echo "sox: command not found" && exit 1; - -if [[ ! -f ${srcdir}/utt2spk ]]; then - echo "$0: no such file ${srcdir}/utt2spk" - exit 1; -fi - -if [[ ${destdir} == "${srcdir}" ]]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -mkdir -p "${destdir}" - -<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map" -<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map" -<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map" -if [[ ! -f ${srcdir}/utt2uniq ]]; then - <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq" -else - <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq" -fi - - -<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \ - utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk - -utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt - -if [[ -f ${srcdir}/segments ]]; then - - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \ - utils/apply_map.pl -f 2 "${destdir}"/reco_map | \ - awk -v factor="${factor}" \ - '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \ - >"${destdir}"/segments - - utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - if [[ -f ${srcdir}/reco2file_and_channel ]]; then - utils/apply_map.pl -f 1 "${destdir}"/reco_map \ - <"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel - fi - -else # no segments->wav indexed by utterance. 
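> Editor's note: perturb_data_dir_speed.sh, whose deletion hunk continues below, builds a speed-perturbed copy of a Kaldi-style data directory: utterance and speaker ids get an `sp<factor>-` prefix, `wav.scp` entries are piped through `sox ... speed <factor>`, and `segments` start/end times are divided by the factor. A small illustrative sketch of those three transformations for the simple plain-wav-path case only (hypothetical function, not the removed script, which also handles piped and offset rxfilenames):

```python
def perturb_entries(factor, utt_id, wav_path, seg_start=None, seg_end=None):
    """Return the speed-perturbed wav.scp entry and, optionally, scaled segment times."""
    new_id = f"sp{factor}-{utt_id}"  # e.g. sp0.9-UTT001
    # Pipe the original recording through sox to change speed (and hence duration).
    wav_entry = f"{new_id} sox {wav_path} -t wav - speed {factor} |"
    if seg_start is None:
        return wav_entry, None
    # Segment timestamps scale by 1/factor, as in the deleted script's awk step.
    segment = (new_id, round(seg_start / factor, 2), round(seg_end / factor, 2))
    return wav_entry, segment


if __name__ == "__main__":
    print(perturb_entries(0.9, "UTT001", "/data/wav/UTT001.wav", 1.20, 3.60))
```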
- if [[ -f ${srcdir}/wav.scp ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - fi -fi - -if [[ -f ${srcdir}/text ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text -fi -if [[ -f ${srcdir}/spk2gender ]]; then - utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender -fi -if [[ -f ${srcdir}/utt2lang ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang -fi - -rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null -echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}" - -utils/validate_data_dir.sh --no-feats --no-text "${destdir}" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/reduce_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/reduce_data_dir.sh deleted file mode 100644 index 16194dcc7309a646041181a698c53cd4f46e618b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/reduce_data_dir.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# koried, 10/29/2012 - -# Reduce a data set based on a list of turn-ids - -help_message="usage: $0 srcdir turnlist destdir" - -if [ $1 == "--help" ]; then - echo "${help_message}" - exit 0; -fi - -if [ $# != 3 ]; then - echo "${help_message}" - exit 1; -fi - -srcdir=$1 -reclist=$2 -destdir=$3 - -if [ ! -f ${srcdir}/utt2spk ]; then -echo "$0: no such file $srcdir/utt2spk" -exit 1; -fi - -function do_filtering { -# assumes the utt2spk and spk2utt files already exist. - [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp - [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text - [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames - [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender - [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp - if [ -f ${srcdir}/segments ]; then - utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments - awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. 
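> Editor's note: the reduce_data_dir.sh script being removed here trims a data directory to a given utterance list by repeatedly calling `utils/filter_scp.pl`: keep only the lines of `wav.scp`, `text`, `feats.scp`, `segments`, and so on whose first whitespace-separated field is in the kept set. The same filtering step in a few lines of Python (hypothetical helper and paths, shown only to make the data-dir convention concrete):

```python
def filter_scp(keep_keys_file: str, src_scp: str, dst_scp: str) -> None:
    """Keep only lines of src_scp whose first field is listed in keep_keys_file."""
    with open(keep_keys_file, encoding="utf-8") as f:
        keep = {line.split()[0] for line in f if line.strip()}
    with open(src_scp, encoding="utf-8") as fin, open(dst_scp, "w", encoding="utf-8") as fout:
        for line in fin:
            if line.strip() and line.split()[0] in keep:
                fout.write(line)


# e.g. filter_scp("reduced/utt2spk", "data/train/wav.scp", "reduced/wav.scp")
```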
- [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/reco2file_and_channel ] && \ - utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel - - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm - rm ${destdir}/reco - fi - srcutts=$(wc -l < ${srcdir}/utt2spk) - destutts=$(wc -l < ${destdir}/utt2spk) - echo "Reduced #utt from $srcutts to $destutts" -} - -mkdir -p ${destdir} - -# filter the utt2spk based on the set of recordings -utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk - -utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt -do_filtering; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/remove_longshortdata.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/remove_longshortdata.py deleted file mode 100644 index 7e92f8a424d2d717acf6fc1db5503f79ba38a898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/remove_longshortdata.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='remove too long or too short data in format.data') - parser.add_argument('--data_file', - type=str, - help='input format data') - parser.add_argument('--output_data_file', - type=str, - help='output format data') - parser.add_argument('--min_input_len', type=float, - default=0, - help='minimum input seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--max_input_len', type=float, - default=20, - help='maximum output seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--min_output_len', type=float, - default=0, help='minimum input seq length, in modeling units') - parser.add_argument('--max_output_len', type=float, - default=500, - help='maximum output seq length, in modeling units') - parser.add_argument('--min_output_input_ratio', type=float, default=0.05, - help='minimum output seq length/output seq length ratio') - parser.add_argument('--max_output_input_ratio', type=float, default=10, - help='maximum output seq length/output seq length ratio') - args = parser.parse_args() - - data_file = args.data_file - output_data_file = args.output_data_file - min_input_len = args.min_input_len - max_input_len = args.max_input_len - min_output_len = args.min_output_len - max_output_len = args.max_output_len - min_output_input_ratio = args.min_output_input_ratio - max_output_input_ratio = args.max_output_input_ratio - - with open(data_file, 'r') as f, open(output_data_file, 'w') as fout: - for l in f: - l = l.strip() - if l: - items = l.strip().split('\t') - token_shape = items[6] - feature_shape = items[2] - feat_len = float(feature_shape.split(':')[1].split(',')[0]) - token_len = float(token_shape.split(':')[1].split(',')[0]) - condition = [feat_len > min_input_len, - feat_len < max_input_len, - token_len > min_output_len, - token_len < max_output_len, - token_len / feat_len > min_output_input_ratio, - token_len / feat_len < max_output_input_ratio, - ] - if all(condition): - fout.write('{}\n'.format(l)) - continue diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/segment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/segment.py deleted file mode 100644 index a1a7f93a05fbaf42ca09c26c0e5be6a7185f0d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/segment.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2021 Mobvoi Inc. (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='generate segmented wav.scp') - parser.add_argument('--segments', required=True, help='segments file') - parser.add_argument('--input', - required=True, - help='origin wav.scp that not segmented') - parser.add_argument('--output', - required=True, - help='output segmented wav.scp') - wav_dic = {} - args = parser.parse_args() - ori_wav = args.input - segment_file = args.segments - wav_scp = args.output - with open(ori_wav, 'r') as ori: - for l in ori: - item = l.strip().split() - wav_dic[item[0]] = item[1] - with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement: - for l in sgement: - item = l.strip().split() - if item[1] in wav_dic: - item[1] = wav_dic[item[1]] - f.write("{} {},{},{}\n".format(item[0], item[1], item[2], item[3])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/setup_anaconda.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/setup_anaconda.sh deleted file mode 100644 index f53ace9cc4c19994fc79d01e85d70f49d40d673f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/setup_anaconda.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env bash -# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet) -set -euo pipefail - -if [ -z "${PS1:-}" ]; then - PS1=__dummy__ -fi -CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - -if [ $# -gt 4 ]; then - echo "Usage: $0 [output] [conda-env-name] [python-version>]" - exit 1; -elif [ $# -eq 3 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="$3" -elif [ $# -eq 2 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="" -elif [ $# -eq 1 ]; then - output_dir="$1" - name="" - PYTHON_VERSION="" -elif [ $# -eq 0 ]; then - output_dir=venv - name="" - PYTHON_VERSION="" -fi - -if [ -e activate_python.sh ]; then - echo "Warning: activate_python.sh already exists. It will be overwritten" -fi - -if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then - if [ ! -e miniconda.sh ]; then - wget --tries=3 "${CONDA_URL}" -O miniconda.sh - fi - - bash miniconda.sh -b -p "${output_dir}" -fi - -# shellcheck disable=SC1090 -source "${output_dir}/etc/profile.d/conda.sh" -conda deactivate - -# If the env already exists, skip recreation -if [ -n "${name}" ] && ! 
conda activate ${name}; then - conda create -yn "${name}" -fi -conda activate ${name} - -if [ -n "${PYTHON_VERSION}" ]; then - conda install -y conda "python=${PYTHON_VERSION}" -else - conda install -y conda -fi - -conda install -y pip setuptools - -cat << EOF > activate_python.sh -#!/usr/bin/env bash -# THIS FILE IS GENERATED BY tools/setup_anaconda.sh -if [ -z "\${PS1:-}" ]; then - PS1=__dummy__ -fi -. $(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name} -EOF diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/sph2wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/sph2wav.sh deleted file mode 100644 index a8f0749e3be2ee69b5831da6699c303510ecbed4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/sph2wav.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -# convert sph scp to segmented wav scp -nj=1 -. tools/parse_options.sh || exit 1; - -inscp=$1 -segments=$2 -outscp=$3 -data=$(dirname ${inscp}) -if [ $# -eq 4 ]; then - logdir=$4 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe -[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; -sox=`which sox` -[ ! 
-x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2); - printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \ - sort > $data/wav_ori.scp || exit 1; - -tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp -sed -i 's/ /,/g' $data/wav_segments.scp -sed -i 's/#/ /g' $data/wav_segments.scp - -rm -f $logdir/wav_*.slice -rm -f $logdir/*.log -split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - mkdir -p ${data}/wavs/${name} - cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \ - -v logdir=$logdir -v name=$name '{ - during=$4-$3 - cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during; - system(cmd) - printf("%s %s/%s.wav\n", $1, data, $1); - }' | \ - sort > ${data}/wavs_${name}.scp || exit 1; -} & -done -wait -cat ${data}/wavs_*.scp > $outscp -rm ${data}/wavs_*.scp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/spk2utt_to_utt2spk.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/spk2utt_to_utt2spk.pl deleted file mode 100644 index 19fb89d501146e360912863d847d6eabb0194511..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/spm_decode b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/spm_decode deleted file mode 100644 index 882b4f966013d7708460f8d41696583ae59f8fa9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/spm_decode +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for decoding") - parser.add_argument("--input", default=None, help="input file to decode") - parser.add_argument("--input_format", choices=["piece", "id"], default="piece") - args = parser.parse_args() - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.input_format == "piece": - def decode(l): - return "".join(sp.DecodePieces(l)) - elif args.input_format == "id": - def decode(l): - return "".join(sp.DecodeIds(l)) - else: - raise NotImplementedError - - def tok2int(tok): - # remap reference-side (represented as <>) to 0 - return int(tok) if tok != "<>" else 0 - - if args.input is None: - h = sys.stdin - else: - h = open(args.input, "r", encoding="utf-8") - for line in h: - print(decode(line.split())) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/spm_encode b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/spm_encode deleted file mode 100644 index 4dd2e1004f9fe393c2d34b43bade881b84a31b1f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/spm_encode +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import contextlib -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for encoding") - parser.add_argument("--inputs", nargs="+", default=['-'], - help="input files to filter/encode") - parser.add_argument("--outputs", nargs="+", default=['-'], - help="path to save encoded outputs") - parser.add_argument("--output_format", choices=["piece", "id"], default="piece") - parser.add_argument("--min-len", type=int, metavar="N", - help="filter sentence pairs with fewer than N tokens") - parser.add_argument("--max-len", type=int, metavar="N", - help="filter sentence pairs with more than N tokens") - args = parser.parse_args() - - assert len(args.inputs) == len(args.outputs), \ - "number of input and output paths should match" - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.output_format == "piece": - def encode(l): - return sp.EncodeAsPieces(l) - elif args.output_format == "id": - def encode(l): - return list(map(str, sp.EncodeAsIds(l))) - else: - raise NotImplementedError - - if args.min_len is not None or args.max_len is not None: - def valid(line): - return ( - (args.min_len is None or len(line) >= args.min_len) and - (args.max_len is None or len(line) <= args.max_len) - ) - else: - def valid(lines): - return True - - with contextlib.ExitStack() as stack: - inputs = [ - stack.enter_context(open(input, "r", encoding="utf-8")) - if input != "-" else sys.stdin - for input in args.inputs - ] - outputs = [ - stack.enter_context(open(output, "w", 
encoding="utf-8")) - if output != "-" else sys.stdout - for output in args.outputs - ] - - stats = { - "num_empty": 0, - "num_filtered": 0, - } - - def encode_line(line): - line = line.strip() - if len(line) > 0: - line = encode(line) - if valid(line): - return line - else: - stats["num_filtered"] += 1 - else: - stats["num_empty"] += 1 - return None - - for i, lines in enumerate(zip(*inputs), start=1): - enc_lines = list(map(encode_line, lines)) - if not any(enc_line is None for enc_line in enc_lines): - for enc_line, output_h in zip(enc_lines, outputs): - print(" ".join(enc_line), file=output_h) - if i % 10000 == 0: - print("processed {} lines".format(i), file=sys.stderr) - - print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) - print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/spm_train b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/spm_train deleted file mode 100644 index 0b247aee0dc5fcaa7b6cf66d89602e896619c9bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/spm_train +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE -import sys - -import sentencepiece as spm - - -if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/subset_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/subset_data_dir.sh deleted file mode 100644 index c35bee62d8710facb8c42a9171ed3caf0171450f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/subset_data_dir.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2010-2011 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - - -# This script operates on a data directory, such as in data/train/. -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data -# for what these directories contain. - -# This script creates a subset of that data, consisting of some specified -# number of utterances. (The selected utterances are distributed evenly -# throughout the file, by the program ./subset_scp.pl). - -# There are six options, none compatible with any other. - -# If you give the --per-spk option, it will attempt to select the supplied -# number of utterances for each speaker (typically you would supply a much -# smaller number in this case). - -# If you give the --speakers option, it selects a subset of n randomly -# selected speakers. - -# If you give the --shortest option, it will give you the n shortest utterances. - -# If you give the --first option, it will just give you the n first utterances. - -# If you give the --last option, it will just give you the n last utterances. - -# If you give the --spk-list or --utt-list option, it reads the -# speakers/utterances to keep from /" (note, -# in this case there is no positional parameter; see usage message.) 
- - -shortest=false -perspk=false -speakers=false -first_opt= -spk_list= -utt_list= - -expect_args=3 -case $1 in - --first|--last) first_opt=$1; shift ;; - --per-spk) perspk=true; shift ;; - --shortest) shortest=true; shift ;; - --speakers) speakers=true; shift ;; - --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; - --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; - --*) echo "$0: invalid option '$1'"; exit 1 -esac - -if [ $# != $expect_args ]; then - echo "Usage:" - echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] " - echo " subset_data_dir.sh [--spk-list ] " - echo " subset_data_dir.sh [--utt-list ] " - echo "By default, randomly selects utterances from the data directory." - echo "With --speakers, randomly selects enough speakers that we have utterances" - echo "With --per-spk, selects utterances per speaker, if available." - echo "With --first, selects the first utterances" - echo "With --last, selects the last utterances" - echo "With --shortest, selects the shortest utterances." - echo "With --spk-list, reads the speakers to keep from " - echo "With --utt-list, reads the utterances to keep from " - exit 1; -fi - -srcdir=$1 -if [[ $spk_list || $utt_list ]]; then - numutt= - destdir=$2 -else - numutt=$2 - destdir=$3 -fi - -export LC_ALL=C - -if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" - exit 1 -fi - -if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then - echo "$0: cannot subset to more utterances than you originally had." - exit 1 -fi - -if $shortest && [ ! -f $srcdir/feats.scp ]; then - echo "$0: you selected --shortest but no feats.scp exist." - exit 1 -fi - -mkdir -p $destdir || exit 1 - -if [[ $spk_list ]]; then - tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; -elif [[ $utt_list ]]; then - tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; -elif $speakers; then - tools/shuffle_list.pl < $srcdir/spk2utt | - awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | - sort > $destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -elif $perspk; then - awk '{ n='$numutt'; printf("%s ",$1); - skip=1; while(n*(skip+1) <= NF-1) { skip++; } - for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); } - printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -else - if $shortest; then - # Select $numutt shortest utterances. - . ./path.sh - feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; - sort -n -k2 $destdir/tmp.len | - awk '{print $1}' | - head -$numutt >$destdir/tmp.uttlist - tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk - rm $destdir/tmp.uttlist $destdir/tmp.len - else - # Select $numutt random utterances. - tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; - fi - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt -fi - -# Perform filtering. utt2spk and spk2utt files already exist by this point. -# Filter by utterance. 
-[ -f $srcdir/feats.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp -[ -f $srcdir/vad.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp -[ -f $srcdir/utt2lang ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang -[ -f $srcdir/utt2dur ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur -[ -f $srcdir/utt2num_frames ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames -[ -f $srcdir/utt2uniq ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq -[ -f $srcdir/wav.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp -[ -f $srcdir/utt2warp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp -[ -f $srcdir/text ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - -# Filter by speaker. -[ -f $srcdir/spk2warp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp -[ -f $srcdir/spk2gender ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender -[ -f $srcdir/cmvn.scp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - -# Filter by recording-id. -if [ -f $srcdir/segments ]; then - tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - # Recording-ids are in segments. - awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco - # The next line overrides the command above for wav.scp, which would be incorrect. - #[ -f $srcdir/wav.scp ] && - # tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp -else - # No segments; recording-ids are in wav.scp. - awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco -fi - -[ -f $srcdir/reco2file_and_channel ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel -[ -f $srcdir/reco2dur ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur - -# Filter the STM file for proper sclite scoring. -# Copy over the comments from STM file. -[ -f $srcdir/stm ] && - (grep "^;;" $srcdir/stm - tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm - -rm $destdir/reco - -# Copy frame_shift if present. -[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir - -srcutts=$(wc -l <$srcdir/utt2spk) -destutts=$(wc -l <$destdir/utt2spk) -echo "$0: reducing #utt from $srcutts to $destutts" -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/subset_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/subset_scp.pl deleted file mode 100644 index 11fddc09a0f4e5fad8e5d63cf65e7e5e627e4af6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/subset_scp.pl +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This program selects a subset of N elements in the scp. - -# By default, it selects them evenly from throughout the scp, in order to avoid -# selecting too many from the same speaker. It prints them on the standard -# output. -# With the option --first, it just selects the N first utterances. -# With the option --last, it just selects the N last utterances. - -# Last modified by JHU & HKUST @2013 - - -$quiet = 0; -$first = 0; -$last = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--quiet") { - shift; - $quiet = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--first") { - shift; - $first = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--last") { - shift; - $last = 1; -} - -if(@ARGV < 2 ) { - die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . - " --quiet causes it to not die if N < num lines in scp.\n" . - " --first and --last make it equivalent to head or tail.\n" . - "See also: filter_scp.pl\n"; -} - -$N = shift @ARGV; -if($N == 0) { - die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; -} -$inscp = shift @ARGV; -open(I, "<$inscp") || die "Opening input scp file $inscp"; - -@F = (); -while() { - push @F, $_; -} -$numlines = @F; -if($N > $numlines) { - if ($quiet) { - $N = $numlines; - } else { - die "You requested from subset_scp.pl more elements than available: $N > $numlines"; - } -} - -sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if ($num_needed > $diff) { - die "select_n: code error"; - } - if ($diff == 1 ) { - if ($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); - } -} - -if ( ! $first && ! $last) { - if ($N > 0) { - select_n(0, $numlines, $N); - } -} else { - if ($first) { # --first option: same as head. - for ($n = 0; $n < $N; $n++) { - print $F[$n]; - } - } else { # --last option: same as tail. - for ($n = @F - $N; $n < @F; $n++) { - print $F[$n]; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/sym2int.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/sym2int.pl deleted file mode 100644 index cec097b6bdaefb5c3452e31fa334f0a7530b9a72..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/sym2int.pl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; - -for($x = 0; $x < 2; $x++) { - if ($ARGV[0] eq "--map-oov") { - shift @ARGV; - $map_oov = shift @ARGV; - if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { - # disallow '-f', the empty string and anything ending in words.txt as the - # OOV symbol because these are likely command-line errors. - die "the --map-oov option requires an argument"; - } - } - if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } - } -} - -$symtab = shift @ARGV; -if (!defined $symtab) { - print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . - "options: [--map-oov ] [-f ]\n" . - "note: can look like 4-5, or 4-, or 5-, or 1.\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up - if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } - $map_oov = $sym2int{$map_oov}; -} - -$num_warning = 0; -$max_warning = 20; - -while (<>) { - @A = split(" ", $_); - @B = (); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ( (!defined $field_begin || $n >= $field_begin) - && (!defined $field_end || $n <= $field_end)) { - $i = $sym2int{$a}; - if (!defined ($i)) { - if (defined $map_oov) { - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $map_oov; - } else { - $pos = $n+1; - die "sym2int.pl: undefined symbol $a (in position $pos)\n"; - } - } - $a = $i; - } - push @B, $a; - } - print join(" ", @B); - print "\n"; -} -if ($num_warning > 0) { - print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; -} - -exit(0); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/text2token.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/text2token.py deleted file mode 100644 index 4f4dcc901d436650695f0b80e0cf99e1e99269ee..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/text2token.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan) -# Copyright 2021 Mobvoi Inc. All Rights Reserved. 
(Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -import re -import sys - -is_python2 = sys.version_info[0] == 2 - - -def exist_or_not(i, match_pos): - start_pos = None - end_pos = None - for pos in match_pos: - if pos[0] <= i < pos[1]: - start_pos = pos[0] - end_pos = pos[1] - break - - return start_pos, end_pos - -def seg_char(sent): - pattern = re.compile(r'([\u4e00-\u9fa5])') - chars = pattern.split(sent) - chars = [w for w in chars if len(w.strip()) > 0] - return chars - -def get_parser(): - parser = argparse.ArgumentParser( - description='convert raw text to tokenized text', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--nchar', - '-n', - default=1, - type=int, - help='number of characters to split, i.e., \ - aabb -> a a b b with -n 1 and aa bb with -n 2') - parser.add_argument('--skip-ncols', - '-s', - default=0, - type=int, - help='skip first n columns') - parser.add_argument('--space', - default='', - type=str, - help='space symbol') - parser.add_argument('--bpe-model', - '-m', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--non-lang-syms', - '-l', - default=None, - type=str, - help='list of non-linguistic symobles,' - ' e.g., etc.') - parser.add_argument('text', - type=str, - default=False, - nargs='?', - help='input text') - parser.add_argument('--trans_type', - '-t', - type=str, - default="char", - choices=["char", "phn", "cn_char_en_bpe"], - help="""Transcript type. char/phn. e.g., for TIMIT - FADG0_SI1279 - - If trans_type is char, read from - SI1279.WRD file -> "bricks are an alternative" - Else if trans_type is phn, - read from SI1279.PHN file -> - "sil b r ih sil k s aa r er n aa l - sil t er n ih sil t ih v sil" """) - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - rs = [] - if args.non_lang_syms is not None: - with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f: - nls = [x.rstrip() for x in f.readlines()] - rs = [re.compile(re.escape(x)) for x in nls] - - if args.bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(args.bpe_model) - - if args.text: - f = codecs.open(args.text, encoding="utf-8") - else: - f = codecs.getreader("utf-8")( - sys.stdin if is_python2 else sys.stdin.buffer) - - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer) - line = f.readline() - n = args.nchar - while line: - x = line.split() - print(' '.join(x[:args.skip_ncols]), end=" ") - a = ' '.join(x[args.skip_ncols:]) - - # get all matched positions - match_pos = [] - for r in rs: - i = 0 - while i >= 0: - m = r.search(a, i) - if m: - match_pos.append([m.start(), m.end()]) - i = m.end() - else: - break - - if len(match_pos) > 0: - chars = [] - i = 0 - while i < len(a): - start_pos, end_pos = exist_or_not(i, match_pos) - if start_pos is not None: - chars.append(a[start_pos:end_pos]) - i = end_pos - else: - chars.append(a[i]) - i += 1 - a = chars - - if (args.trans_type == "phn"): - a = a.split(" ") - elif args.trans_type == "cn_char_en_bpe": - b = seg_char(a) - a = [] - for j in b: - # we use "▁" to instead of blanks among english words - # warning: here is "▁", not "_" - for l in j.strip().split("▁"): - if not l.encode('UTF-8').isalpha(): - a.append(l) - else: - for k in sp.encode_as_pieces(l): - a.append(k) - else: - a = [a[j:j + n] for j in range(0, 
len(a), n)] - - a_flat = [] - for z in a: - a_flat.append("".join(z)) - - a_chars = [z.replace(' ', args.space) for z in a_flat] - if (args.trans_type == "phn"): - a_chars = [z.replace("sil", args.space) for z in a_chars] - print(' '.join(a_chars)) - line = f.readline() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/utt2spk_to_spk2utt.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/utt2spk_to_spk2utt.pl deleted file mode 100644 index 5086699ff85fdcb8667bb9ab054700c53e35fd0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. - -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/validate_data_dir.sh deleted file mode 100644 index f4b4cbe1410111555d56380078e3d55381e7155a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/validate_data_dir.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/bin/bash - -cmd="$@" - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." - echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! 
-d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -if [ -f $data/images.scp ]; then - cmd=${cmd/--no-wav/} # remove --no-wav if supplied - image/validate_data_dir.sh $cmd - exit $? -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1; - ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - tools/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." 
- exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! 
cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! 
cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/validate_dict_dir.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/validate_dict_dir.pl deleted file mode 100644 index 819fca7f03caff91f3f24f0b69876a0bfc0abbe9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/validate_dict_dir.pl +++ /dev/null @@ -1,531 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. -# Copyright 2012 Guoguo Chen -# 2015 Daniel Povey -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for 'dict' directories (e.g. data/local/dict) - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line($.) 
$raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n"; - if ($has_invalid_whitespaces) { - print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n"; - return 0; - } else { - print "--> text contains only allowed whitespaces\n"; - } - } else { - print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n"; - } - return 1; -} - - -if(@ARGV != 1) { - die "Usage: validate_dict_dir.pl \n" . - "e.g.: validate_dict_dir.pl data/local/dict\n"; -} - -$dict = shift @ARGV; -$dict =~ s:/$::; - -$exit = 0; -$success = 1; # this is re-set each time we read a file. - -sub set_to_fail { $exit = 1; $success = 0; } - -# Checking silence_phones.txt ------------------------------- -print "Checking $dict/silence_phones.txt ...\n"; -if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} -$idx = 1; -%silence = (); -$crlf = 1; - -print "--> reading $dict/silence_phones.txt\n"; -check_allowed_whitespace(\*S) || set_to_fail(); -while() { - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; - } - foreach(0 .. 
@col-1) { - my $p = $col[$_]; - if($silence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; - } else { - $silence{$p} = 1; - } - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. - if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(S); -$success == 0 || print "--> $dict/silence_phones.txt is OK\n"; -print "\n"; - -# Checking optional_silence.txt ------------------------------- -print "Checking $dict/optional_silence.txt ...\n"; -if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;} -if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;} -$idx = 1; -$success = 1; -$crlf = 1; -print "--> reading $dict/optional_silence.txt\n"; -check_allowed_whitespace(\*OS) or exit 1; -while() { - chomp; - my @col = split(" ", $_); - if ($idx > 1 or @col > 1) { - set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; - } elsif (!$silence{$col[0]}) { - set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - $idx ++; -} -close(OS); -$success == 0 || print "--> $dict/optional_silence.txt is OK\n"; -print "\n"; - -# Checking nonsilence_phones.txt ------------------------------- -print "Checking $dict/nonsilence_phones.txt ...\n"; -if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;} -$idx = 1; -%nonsilence = (); -$success = 1; -$crlf = 1; -print "--> reading $dict/nonsilence_phones.txt\n"; -check_allowed_whitespace(\*NS) or set_to_fail(); -while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($nonsilence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; - } else { - $nonsilence{$p} = 1; - } - # phones that start with the pound sign/hash may be mistaken for - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. 
- if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(NS); -$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n"; -print "\n"; - -# Checking disjoint ------------------------------- -sub intersect { - my ($a, $b) = @_; - @itset = (); - %itset = (); - foreach(keys %$a) { - if(exists $b->{$_} and !$itset{$_}) { - push(@itset, $_); - $itset{$_} = 1; - } - } - return @itset; -} - -print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n"; -@itset = intersect(\%silence, \%nonsilence); -if(@itset == 0) {print "--> disjoint property is OK.\n";} -else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";} -print "\n"; - - -sub check_lexicon { - my ($lex, $num_prob_cols, $num_skipped_cols) = @_; - print "Checking $lex\n"; - !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail(); - my %seen_line = {}; - $idx = 1; $success = 1; $crlf = 1; - print "--> reading $lex\n"; - check_allowed_whitespace(\*L) or set_to_fail(); - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $lex contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (defined $seen_line{$_}) { - print "--> ERROR: line '$_' of $lex is repeated\n"; - set_to_fail(); - } - $seen_line{$_} = 1; - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $lex does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - $word = shift @col; - if (!defined $word) { - print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail(); - } - if ($word eq "" || $word eq "" || $word eq "" || $word eq "#0") { - print "--> ERROR: lexicon.txt contains forbidden word $word\n"; - set_to_fail(); - } - for ($n = 0; $n < $num_prob_cols; $n++) { - $prob = shift @col; - if (!($prob > 0.0 && $prob <= 1.0)) { - print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n"; - set_to_fail(); - } - } - for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; } - if (@col == 0) { - print "--> ERROR: lexicon.txt contains word $word with empty "; - print "pronunciation.\n"; - set_to_fail(); - } - foreach (0 .. @col-1) { - if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt "; - print "(line $idx)\n"; - set_to_fail(); - } - } - $idx ++; - } - close(L); - $success == 0 || print "--> $lex is OK\n"; - print "\n"; -} - -if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); } -if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); } -if (-f "$dict/lexiconp_silprob.txt") { - # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also - # exist. 
- check_lexicon("$dict/lexiconp_silprob.txt", 2, 2); - if (-f "$dict/silprob.txt") { - !open(SP, "<$dict/silprob.txt") && - print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail(); - $crlf = 1; - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - chomp; my @col = split; - @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail(); - if ($col[0] eq "" || $col[0] eq "overall") { - if (!($col[1] > 0.0 && $col[1] <= 1.0)) { - set_to_fail(); - print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n"; - } - } elsif ($col[0] eq "_s" || $col[0] eq "_n") { - if ($col[1] <= 0.0) { - set_to_fail(); - print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n"; - } - } else { - print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n"; - set_to_fail(); - } - } - close(SP); - } else { - set_to_fail(); - print "--> ERROR: expecting $dict/silprob.txt to exist\n"; - } -} - -if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) { - print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n"; - set_to_fail(); -} - -sub check_lexicon_pair { - my ($lex1, $num_prob_cols1, $num_skipped_cols1, - $lex2, $num_prob_cols2, $num_skipped_cols2) = @_; - # We have checked individual lexicons already. - open(L1, "<$lex1"); open(L2, "<$lex2"); - print "Checking lexicon pair $lex1 and $lex2\n"; - my $line_num = 0; - while() { - $line_num++; - @A = split; - $line_B = ; - if (!defined $line_B) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); last; - } - @B = split(" ", $line_B); - # Check if the word matches. - if ($A[0] ne $B[0]) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - shift @A; shift @B; - for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; } - for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; } - # Check if the pronunciation matches - if (join(" ", @A) ne join(" ", @B)) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - } - $line_B = ; - if (defined $line_B && $exit == 0) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); - } - $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n"; -} - -# If more than one lexicon exist, we have to check if they correspond to each -# other. It could be that the user overwrote one and we need to regenerate the -# other, but we do not know which is which. -if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") { - check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0); -} -if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") { - check_lexicon_pair("$dict/lexiconp.txt", 1, 0, - "$dict/lexiconp_silprob.txt", 2, 2); -} - -# Checking extra_questions.txt ------------------------------- -%distinguished = (); # Keep track of all phone-pairs including nonsilence that - # are distinguished (split apart) by extra_questions.txt, - # as $distinguished{$p1,$p2} = 1. This will be used to - # make sure that we don't have pairs of phones on the same - # line in nonsilence_phones.txt that can never be - # distinguished from each other by questions. 
(If any two - # phones appear on the same line in nonsilence_phones.txt, - # they share a tree root, and since the automatic - # question-building treats all phones that appear on the - # same line of nonsilence_phones.txt as being in the same - # group, we can never distinguish them without resorting to - # questions in extra_questions.txt. -print "Checking $dict/extra_questions.txt ...\n"; -if (-s "$dict/extra_questions.txt") { - if (!open(EX, "<$dict/extra_questions.txt")) { - set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n"; - } - $idx = 1; - $success = 1; - $crlf = 1; - print "--> reading $dict/extra_questions.txt\n"; - check_allowed_whitespace(\*EX) or set_to_fail(); - while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n"; - } - foreach (0 .. @col-1) { - if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n"; - } - $idx ++; - } - %col_hash = (); - foreach $p (@col) { $col_hash{$p} = 1; } - foreach $p1 (@col) { - # Update %distinguished hash. - foreach $p2 (keys %nonsilence) { - if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not - # in this question (and in nonsilence - # phones)... mark p1,p2 as being split apart - $distinguished{$p1,$p2} = 1; - $distinguished{$p2,$p1} = 1; - } - } - } - } - close(EX); - $success == 0 || print "--> $dict/extra_questions.txt is OK\n"; -} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";} - -if (-f "$dict/nonterminals.txt") { - open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt"; - my %nonterminals = (); - my $line_number = 1; - while () { - chop; - my @line = split(" ", $_); - if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) { - print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1; - } - $nonterminals{$line[0]} = 1; - $line_number++; - } - print "--> $dict/nonterminals.txt is OK\n"; -} - - -# check nonsilence_phones.txt again for phone-pairs that are never -# distnguishable. (note: this situation is normal and expected for silence -# phones, so we don't check it.) -if(!open(NS, "<$dict/nonsilence_phones.txt")) { - print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1; -} - -$num_warn_nosplit = 0; -$num_warn_nosplit_limit = 10; -while() { - my @col = split(" ", $_); - foreach $p1 (@col) { - foreach $p2 (@col) { - if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) { - set_to_fail(); - if ($num_warn_nosplit <= $num_warn_nosplit_limit) { - print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n"; - } - if ($num_warn_nosplit == $num_warn_nosplit_limit) { - print "... Not warning any more times about this issue.\n"; - } - if ($num_warn_nosplit == 0) { - print " (note: we started checking for this only recently. 
You can still build a system but\n"; - print " phones $p1 and $p2 will be acoustically indistinguishable).\n"; - } - $num_warn_nosplit++; - } - } - } -} - - -if ($exit == 1) { - print "--> ERROR validating dictionary directory $dict (see detailed error "; - print "messages above)\n\n"; - exit 1; -} else { - print "--> SUCCESS [validating dictionary directory $dict]\n\n"; -} - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/validate_text.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/validate_text.pl deleted file mode 100644 index 7f75cf12f20f6e22948682e8e726e628a72dac69..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/validate_text.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env perl -# -#=============================================================================== -# Copyright 2017 Johns Hopkins University (author: Yenda Trmal ) -# Johns Hopkins University (author: Daniel Povey) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# validation script for data//text -# to be called (preferably) from utils/validate_data_dir.sh -use strict; -use warnings; -use utf8; -use Fcntl qw< SEEK_SET >; - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). 
-sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl <text-file>\n" . 
- "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/wav2dur.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/wav2dur.py deleted file mode 100644 index 1bcc1b693458b66c0e341e5d6b375cc81e6db8b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/wav2dur.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -import torchaudio -torchaudio.set_audio_backend("sox_io") - -scp = sys.argv[1] -dur_scp = sys.argv[2] - -with open(scp, 'r') as f, open(dur_scp, 'w') as fout: - cnt = 0 - total_duration = 0 - for l in f: - items = l.strip().split() - wav_id = items[0] - fname = items[1] - cnt += 1 - waveform, rate = torchaudio.load(fname) - frames = len(waveform[0]) - duration = frames / float(rate) - total_duration += duration - fout.write('{} {}\n'.format(wav_id, duration)) - print('process {} utts'.format(cnt)) - print('total {} s'.format(total_duration)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/wav_to_duration.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/wav_to_duration.sh deleted file mode 100644 index 51b055c633ac809b6b8d702925dc47875973403d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/wav_to_duration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# split the wav scp, calculate duration and merge -nj=4 -. tools/parse_options.sh || exit 1; - -inscp=$1 -outscp=$2 -data=$(dirname ${inscp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -rm -f $logdir/wav_*.slice -rm -f $logdir/wav_*.shape -split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log -} & -done -wait -cat $logdir/wav_*.shape > $outscp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/websocket/performance-ws.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/websocket/performance-ws.py deleted file mode 100644 index af77dea06bb41297b674b5b6dbfd0266bcff5d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/tools/websocket/performance-ws.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -# coding:utf-8 - -# Copyright (c) 2022 SDCI Co. Ltd (author: veelion) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import json -import time -import asyncio -import argparse -import websockets -import soundfile as sf -import statistics - - -WS_START = json.dumps({ - 'signal': 'start', - 'nbest': 1, - 'continuous_decoding': False, -}) -WS_END = json.dumps({ - 'signal': 'end' -}) - - -async def ws_rec(data, ws_uri): - begin = time.time() - conn = await websockets.connect(ws_uri, ping_timeout=200) - # step 1: send start - await conn.send(WS_START) - ret = await conn.recv() - # step 2: send audio data - await conn.send(data) - # step 3: send end - await conn.send(WS_END) - # step 4: receive result - texts = [] - while 1: - ret = await conn.recv() - ret = json.loads(ret) - if ret['type'] == 'final_result': - nbest = json.loads(ret['nbest']) - text = nbest[0]['sentence'] - texts.append(text) - elif ret['type'] == 'speech_end': - break - # step 5: close - try: - await conn.close() - except Exception as e: - # this except has no effect, just log as debug - # it seems the server does not send close info, maybe - print(e) - time_cost = time.time() - begin - return { - 'text': ''.join(texts), - 'time': time_cost, - } - - -def get_args(): - parser = argparse.ArgumentParser(description='') - parser.add_argument( - '-u', '--ws_uri', required=True, - help="websocket_server_main's uri, e.g. ws://127.0.0.1:10086") - parser.add_argument( - '-w', '--wav_scp', required=True, - help='path to wav_scp_file') - parser.add_argument( - '-t', '--trans', required=True, - help='path to trans_text_file of wavs') - parser.add_argument( - '-s', '--save_to', required=True, - help='path to save transcription') - parser.add_argument( - '-n', '--num_concurrence', type=int, required=True, - help='num of concurrence for query') - args = parser.parse_args() - return args - - -def print_result(info): - length = max([len(k) for k in info]) - for k, v in info.items(): - print(f'\t{k: >{length}} : {v}') - - -async def main(args): - wav_scp = [] - total_duration = 0 - with open(args.wav_scp) as f: - for line in f: - zz = line.strip().split() - assert len(zz) == 2 - data, sr = sf.read(zz[1], dtype='int16') - assert sr == 16000 - duration = (len(data)) / 16000 - total_duration += duration - wav_scp.append((zz[0], data.tobytes())) - print(f'{len(wav_scp) = }, {total_duration = }') - - tasks = [] - failed = 0 - texts = [] - request_times = [] - begin = time.time() - for i, (_uttid, data) in enumerate(wav_scp): - task = asyncio.create_task(ws_rec(data, args.ws_uri)) - tasks.append((_uttid, task)) - if len(tasks) < args.num_concurrence: - continue - print((f'{i=}, start {args.num_concurrence} ' - f'queries @ {time.strftime("%m-%d %H:%M:%S")}')) - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - tasks = [] - print(f'\tdone @ {time.strftime("%m-%d %H:%M:%S")}') - if tasks: - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - request_time = time.time() - begin - rtf = request_time / total_duration - print('For all concurrence:') - print_result({ - 'failed': failed, - 'total_duration': total_duration, - 'request_time': request_time, - 'RTF': rtf, - }) - print('For one request:') - print_result({ - 'mean': statistics.mean(request_times), - 'median': statistics.median(request_times), - 'max_time': max(request_times), - 'min_time': min(request_times), - }) - with 
open(args.save_to, 'w', encoding='utf8') as fsave: - fsave.write(''.join(texts)) - # caculate CER - cmd = (f'python ../compute-wer.py --char=1 --v=1 ' - f'{args.trans} {args.save_to} > ' - f'{args.save_to}-test-{args.num_concurrence}.cer.txt') - print(cmd) - os.system(cmd) - print('done') - - -if __name__ == '__main__': - args = get_args() - asyncio.run(main(args)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/alignment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/alignment.py deleted file mode 100644 index 071691183e5af227e60fe06e4f8d4bf0f33b7f71..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/alignment.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Di Wu) -# 2022 Tinnove Inc (authors: Wei Ren) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader -from textgrid import TextGrid, IntervalTier - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.ctc_util import forced_align -from wenet.utils.common import get_subsample -from wenet.utils.init_model import init_model - - -def generator_textgrid(maxtime, lines, output): - # Download Praat: https://www.fon.hum.uva.nl/praat/ - interval = maxtime / (len(lines) + 1) - margin = 0.0001 - - tg = TextGrid(maxTime=maxtime) - linetier = IntervalTier(name="line", maxTime=maxtime) - - i = 0 - for l in lines: - s, e, w = l.split() - linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w) - - tg.append(linetier) - print("successfully generator {}".format(output)) - tg.write(output) - - -def get_frames_timestamp(alignment): - # convert alignment to a praat format, which is a doing phonetics - # by computer and helps analyzing alignment - timestamp = [] - # get frames level duration for each token - start = 0 - end = 0 - while end < len(alignment): - while end < len(alignment) and alignment[end] == 0: - end += 1 - if end == len(alignment): - timestamp[-1] += alignment[start:] - break - end += 1 - while end < len(alignment) and alignment[end - 1] == alignment[end]: - end += 1 - timestamp.append(alignment[start:end]) - start = end - return timestamp - - -def get_labformat(timestamp, subsample): - begin = 0 - duration = 0 - labformat = [] - for idx, t in enumerate(timestamp): - # 25ms frame_length,10ms hop_length, 1/subsample - subsample = get_subsample(configs) - # time duration - duration = len(t) * 0.01 * subsample - if idx < len(timestamp) - 1: - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[t[-1]])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[t[-1]])) - else: - non_blank = 
0 - for i in t: - if i != 0: - token = i - break - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[token])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[token])) - begin = begin + duration - return labformat - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='use ctc to generate alignment') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--input_file', required=True, help='format data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--non_lang_syms', - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--result_file', - required=True, - help='alignment result file') - parser.add_argument('--batch_size', type=int, default=1, help='batch size') - parser.add_argument('--gen_praat', - action='store_true', - help='convert alignment to a praat format') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - - args = parser.parse_args() - print(args) - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.batch_size > 1: - logging.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - # Load dict - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 - - symbol_table = read_symbol_table(args.dict) - - # Init dataset and data loader - ali_conf = copy.deepcopy(configs['dataset_conf']) - - ali_conf['filter_conf']['max_length'] = 102400 - ali_conf['filter_conf']['min_length'] = 0 - ali_conf['filter_conf']['token_max_length'] = 102400 - ali_conf['filter_conf']['token_min_length'] = 0 - ali_conf['filter_conf']['max_output_input_ratio'] = 102400 - ali_conf['filter_conf']['min_output_input_ratio'] = 0 - ali_conf['speed_perturb'] = False - ali_conf['spec_aug'] = False - ali_conf['shuffle'] = False - ali_conf['sort'] = False - ali_conf['fbank_conf']['dither'] = 0.0 - ali_conf['batch_conf']['batch_type'] = "static" - ali_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - ali_dataset = Dataset(args.data_type, - args.input_file, - symbol_table, - ali_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w', - encoding='utf-8') as fout: - for batch_idx, batch in enumerate(ali_data_loader): - print("#" * 80) - key, feat, target, feats_length, target_length = batch - print(key) - - feat = feat.to(device) - target = target.to(device) - feats_length = feats_length.to(device) - 
target_length = target_length.to(device) - # Let's assume B = batch_size and N = beam_size - # 1. Encoder - encoder_out, encoder_mask = model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - # print(ctc_probs.size(1)) - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = forced_align(ctc_probs, target) - print(alignment) - fout.write('{} {}\n'.format(key[0], alignment)) - - if args.gen_praat: - timestamp = get_frames_timestamp(alignment) - print(timestamp) - subsample = get_subsample(configs) - labformat = get_labformat(timestamp, subsample) - - lab_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".lab") - with open(lab_path, 'w', encoding='utf-8') as f: - f.writelines(labformat) - - textgrid_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".TextGrid") - generator_textgrid(maxtime=(len(alignment) + 1) * 0.01 * - subsample, - lines=labformat, - output=textgrid_path) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/average_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/average_model.py deleted file mode 100644 index 01efa64b4b458bc931a86a9a304b9f330ce4aaa2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/average_model.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import os -import argparse -import glob - -import yaml -import numpy as np -import torch - - -def get_args(): - parser = argparse.ArgumentParser(description='average model') - parser.add_argument('--dst_model', required=True, help='averaged model') - parser.add_argument('--src_path', - required=True, - help='src model path for average') - parser.add_argument('--val_best', - action="store_true", - help='averaged model') - parser.add_argument('--num', - default=5, - type=int, - help='nums for averaged model') - parser.add_argument('--min_epoch', - default=0, - type=int, - help='min epoch used for averaging model') - parser.add_argument('--max_epoch', - default=65536, - type=int, - help='max epoch used for averaging model') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - checkpoints = [] - val_scores = [] - if args.val_best: - yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) - for y in yamls: - with open(y, 'r') as f: - dic_yaml = yaml.load(f, Loader=yaml.FullLoader) - loss = dic_yaml['cv_loss'] - epoch = dic_yaml['epoch'] - if epoch >= args.min_epoch and epoch <= args.max_epoch: - val_scores += [[epoch, loss]] - val_scores = np.array(val_scores) - sort_idx = np.argsort(val_scores[:, -1]) - sorted_val_scores = val_scores[sort_idx][::1] - print("best val scores = " + str(sorted_val_scores[:args.num, 1])) - print("selected epochs = " + - str(sorted_val_scores[:args.num, 0].astype(np.int64))) - path_list = [ - args.src_path + '/{}.pt'.format(int(epoch)) - for epoch in sorted_val_scores[:args.num, 0] - ] - else: - path_list = glob.glob('{}/[0-9]*.pt'.format(args.src_path)) - path_list = sorted(path_list, key=os.path.getmtime) - path_list = path_list[-args.num:] - print(path_list) - avg = None - num = args.num - assert num == len(path_list) - for path in path_list: - print('Processing {}'.format(path)) - states = torch.load(path, map_location=torch.device('cpu')) - if avg is None: - avg = states - else: - for k in avg.keys(): - avg[k] += states[k] - # average - for k in avg.keys(): - if avg[k] is not None: - # pytorch 1.6 use true_divide instead of /= - avg[k] = torch.true_divide(avg[k], num) - print('Saving to {}'.format(args.dst_model)) - torch.save(avg, args.dst_model) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/export_jit.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/export_jit.py deleted file mode 100644 index b2e5864e8382235c1cc800484ba5031ae22f3bd9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/export_jit.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os - -import torch -import yaml - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_file', default=None, help='output file') - parser.add_argument('--output_quant_file', - default=None, - help='output quantized model file') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - # No need gpu for model export - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - model = init_model(configs) - print(model) - - load_checkpoint(model, args.checkpoint) - # Export jit torch script model - - if args.output_file: - script_model = torch.jit.script(model) - script_model.save(args.output_file) - print('Export model successfully, see {}'.format(args.output_file)) - - # Export quantized jit torch script model - if args.output_quant_file: - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - script_quant_model = torch.jit.script(quantized_model) - script_quant_model.save(args.output_quant_file) - print('Export quantized model successfully, ' - 'see {}'.format(args.output_quant_file)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/export_onnx_bpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/export_onnx_bpu.py deleted file mode 100644 index 6462a69506f10778d08faae5fcf3067ad43d38bd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/export_onnx_bpu.py +++ /dev/null @@ -1,1019 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. 
specific decoding method: ctc_greedy_search -""" - - -from __future__ import print_function - -import os -import sys -import copy -import math -import yaml -import logging -from typing import Tuple - -import torch -import numpy as np - -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import (get_args, to_numpy, - print_input_output_info) - - -try: - import onnx - import onnxruntime -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class BPULayerNorm(torch.nn.Module): - """Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" - def __init__(self, module, chunk_size=8, run_on_bpu=False): - super().__init__() - original = copy.deepcopy(module) - self.hidden = module.weight.size(0) - self.chunk_size = chunk_size - self.run_on_bpu = run_on_bpu - - if self.run_on_bpu: - self.weight = torch.nn.Parameter( - module.weight.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.bias = torch.nn.Parameter( - module.bias.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.negtive = torch.nn.Parameter( - torch.ones((1, self.hidden, 1, chunk_size)) * -1.0) - self.eps = torch.nn.Parameter( - torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps) - self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_1.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_2.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - else: - self.norm = module - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.hidden) - orig_out = module(random_data) - new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) - np.testing.assert_allclose( - to_numpy(orig_out), to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.run_on_bpu: - u = self.mean_conv_1(x) # (1, h, 1, c) - numerator = x + u * self.negtive # (1, h, 1, c) - s = torch.pow(numerator, 2) # (1, h, 1, c) - s = self.mean_conv_2(s) # (1, h, 1, c) - denominator = torch.sqrt(s + self.eps) # (1, h, 1, c) - x = torch.div(numerator, denominator) # (1, h, 1, c) - x = x * self.weight + self.bias - else: - x = x.squeeze(2).transpose(1, 2).contiguous() - x = self.norm(x) - x = x.transpose(1, 2).contiguous().unsqueeze(2) - return x - - -class BPUIdentity(torch.nn.Module): - """Refactor torch.nn.Identity(). - For inserting BPU node whose input == output. - """ - def __init__(self, channels): - super().__init__() - self.channels = channels - self.identity_conv = torch.nn.Conv2d( - channels, channels, 1, groups=channels, bias=False) - torch.nn.init.dirac_( - self.identity_conv.weight.data, groups=channels) - - self.check_equal() - - def check_equal(self): - random_data = torch.randn(1, self.channels, 1, 10) - result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(random_data), to_numpy(result), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Identity with 4-D dataflow, input == output. 
- Args: - x (torch.Tensor): (batch, in_channel, 1, time) - - Returns: - (torch.Tensor): (batch, in_channel, 1, time). - """ - return self.identity_conv(x) - - -class BPULinear(torch.nn.Module): - """Refactor torch.nn.Linear or pointwise_conv""" - def __init__(self, module, is_pointwise_conv=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.weight.size(1) - self.odim = module.weight.size(0) - self.is_pointwise_conv = is_pointwise_conv - - # Modify weight & bias - self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) - if is_pointwise_conv: - # (odim, idim, kernel=1) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(-1)) - else: - # (odim, idim) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(2).unsqueeze(3)) - self.linear.bias = module.bias - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.idim) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = module(random_data) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = original_result.transpose(1, 2) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Linear with 4-D dataflow. - Args: - x (torch.Tensor): (batch, in_channel, 1, time) - Returns: - (torch.Tensor): (batch, out_channel, 1, time). - """ - return self.linear(x) - - -class BPUGlobalCMVN(torch.nn.Module): - """Refactor wenet/transformer/cmvn.py::GlobalCMVN""" - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - self.norm_var = module.norm_var - - # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1) - self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """CMVN with 4-D dataflow. - Args: - x (torch.Tensor): (batch, 1, mel_dim, time) - Returns: - (torch.Tensor): normalized feature with same shape. - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x - - -class BPUConv2dSubsampling8(torch.nn.Module): - """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 - - NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.right_context = module.right_context - self.subsampling_rate = module.subsampling_rate - assert isinstance(module.pos_enc, NoPositionalEncoding) - - # 1. Modify self.conv - # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim) - # to (1, 1, mel_dim, frames) for more efficient computation. - self.conv = module.conv - for idx in [0, 2, 4]: - self.conv[idx].weight = torch.nn.Parameter( - module.conv[idx].weight.transpose(2, 3) - ) - - # 2. 
Modify self.linear - # NOTE(xcsong): Split final projection to meet the requirment of - # maximum kernel_size (7 for XJ3) - self.linear = torch.nn.ModuleList() - odim = module.linear.weight.size(0) # 512, in this case - freq = module.linear.weight.size(1) // odim # 4608 // 512 == 9 - self.odim, self.freq = odim, freq - weight = module.linear.weight.reshape( - odim, odim, freq, 1) # (odim, odim * freq) -> (odim, odim, freq, 1) - self.split_size = [] - num_split = (freq - 1) // 7 + 1 # XJ3 requires kernel_size <= 7 - slice_begin = 0 - for idx in range(num_split): - kernel_size = min(freq, (idx + 1) * 7) - idx * 7 - conv_ele = torch.nn.Conv2d( - odim, odim, (kernel_size, 1), (kernel_size, 1)) - conv_ele.weight = torch.nn.Parameter( - weight[:, :, slice_begin:slice_begin + kernel_size, :] - ) - conv_ele.bias = torch.nn.Parameter( - torch.zeros_like(conv_ele.bias) - ) - self.linear.append(conv_ele) - self.split_size.append(kernel_size) - slice_begin += kernel_size - self.linear[0].bias = torch.nn.Parameter(module.linear.bias) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 67, 80) - mask = torch.zeros(1, 1, 67) - original_result, _, _ = module(random_data, mask) # (1, 8, 512) - random_data = random_data.transpose(1, 2).unsqueeze(0) # (1, 1, 80, 67) - new_result = self.forward(random_data) # (1, 512, 1, 8) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Subsample x with 4-D dataflow. - Args: - x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), - where time' = time // 8. - """ - x = self.conv(x) # (1, odim, freq, time') - x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) - x = torch.split(x, self.split_size, dim=2) - for idx, (x_part, layer) in enumerate(zip(x, self.linear)): - x_out += layer(x_part) - return x_out - - -class BPUMultiHeadedAttention(torch.nn.Module): - """Refactor wenet/transformer/attention.py::MultiHeadedAttention - - NOTE(xcsong): Only support attention_class == MultiHeadedAttention, - we do not consider RelPositionMultiHeadedAttention currently. - """ - def __init__(self, module, chunk_size, left_chunks): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.d_k = module.d_k - self.h = module.h - n_feat = self.d_k * self.h - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.time = chunk_size * (left_chunks + 1) - self.activation = torch.nn.Softmax(dim=-1) - - # 1. Modify self.linear_x - self.linear_q = BPULinear(module.linear_q) - self.linear_k = BPULinear(module.linear_k) - self.linear_v = BPULinear(module.linear_v) - self.linear_out = BPULinear(module.linear_out) - # 2. 
denom - self.register_buffer( - "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k))) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) - mask = torch.ones((1, self.h, self.chunk_size, self.time), - dtype=torch.bool) - cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, - self.d_k * 2) - original_out, original_cache = module( - random_data, random_data, random_data, - mask[:, 0, :, :], torch.empty(0), cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.reshape(1, self.h, self.d_k * 2, - self.chunk_size * self.left_chunks) - new_out, new_cache = self.forward( - random_data, random_data, random_data, mask, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - - def forward( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - mask: torch.Tensor, cache: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. - - Args: - q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). - k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). - v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). - mask (torch.Tensor): Mask tensor, - (#batch, head, chunk_size, cache_t + chunk_size). - cache (torch.Tensor): Cache tensor - (1, head, d_k * 2, cache_t), - where `cache_t == chunk_size * left_chunks`. - - - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: Cache tensor - (1, head, d_k * 2, cache_t + chunk_size) - where `cache_t == chunk_size * left_chunks` - """ - # 1. Forward QKV - q = self.linear_q(q) # (1, d, 1, c) d == size, c == chunk_size - k = self.linear_k(k) # (1, d, 1, c) - v = self.linear_v(v) # (1, d, 1, c) - q = q.view(1, self.h, self.d_k, self.chunk_size) - k = k.view(1, self.h, self.d_k, self.chunk_size) - v = v.view(1, self.h, self.d_k, self.chunk_size) - q = q.transpose(2, 3) # (batch, head, time1, d_k) - k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) - k = torch.cat((k_cache, k), dim=3) - v = torch.cat((v_cache, v), dim=3) - new_cache = torch.cat((k, v), dim=2) - # 2. (Q^T)K - scores = torch.matmul(q, k) * self.denom # (#b, n_head, time1, time2) - # 3. Forward attention - mask = mask.eq(0) - scores = scores.masked_fill(mask, -float('inf')) - attn = self.activation(scores).masked_fill(mask, 0.0) - attn = attn.transpose(2, 3) - x = torch.matmul(v, attn) - x = x.view(1, self.d_k * self.h, 1, self.chunk_size) - x_out = self.linear_out(x) - return x_out, new_cache - - -class BPUConvolution(torch.nn.Module): - """Refactor wenet/transformer/convolution.py::ConvolutionModule - - NOTE(xcsong): Only suport use_layer_norm == False - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.lorder = module.lorder - self.use_layer_norm = False - self.activation = module.activation - channels = module.pointwise_conv1.weight.size(1) - self.channels = channels - kernel_size = module.depthwise_conv.weight.size(2) - assert module.use_layer_norm is False - - # 1. Modify self.pointwise_conv1 - self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) - - # 2. 
Modify self.depthwise_conv - self.depthwise_conv = torch.nn.Conv2d( - channels, channels, (1, kernel_size), - stride=1, groups=channels) - self.depthwise_conv.weight = torch.nn.Parameter( - module.depthwise_conv.weight.unsqueeze(-2)) - self.depthwise_conv.bias = torch.nn.Parameter( - module.depthwise_conv.bias) - - # 3. Modify self.norm, Only support batchnorm2d - self.norm = torch.nn.BatchNorm2d(channels) - self.norm.training = False - self.norm.num_features = module.norm.num_features - self.norm.eps = module.norm.eps - self.norm.momentum = module.norm.momentum - self.norm.weight = torch.nn.Parameter(module.norm.weight) - self.norm.bias = torch.nn.Parameter(module.norm.bias) - self.norm.running_mean = module.norm.running_mean - self.norm.running_var = module.norm.running_var - - # 4. Modify self.pointwise_conv2 - self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) - - # 5. Identity conv, for running `concat` on BPU - self.identity = BPUIdentity(channels) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.channels) - cache = torch.zeros((1, self.channels, self.lorder)) - original_out, original_cache = module(random_data, cache=cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.unsqueeze(2) - new_out, new_cache = self.forward(random_data, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, 1, cache_t). - Returns: - torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). - torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). - """ - # Concat cache - x = torch.cat((self.identity(cache), self.identity(x)), dim=3) - new_cache = x[:, :, :, -self.lorder:] - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, 1, dim) - x = torch.nn.functional.glu(x, dim=1) # (b, channel, 1, dim) - - # Depthwise Conv - x = self.depthwise_conv(x) - x = self.activation(self.norm(x)) - x = self.pointwise_conv2(x) - return x, new_cache - - -class BPUFFN(torch.nn.Module): - """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.activation = module.activation - - # 1. Modify self.w_x - self.w_1 = BPULinear(module.w_1) - self.w_2 = BPULinear(module.w_2) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.w_1.idim) - original_out = module(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_out = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward function. 
- - Args: - xs: input tensor (B, D, 1, L) - Returns: - output tensor, (B, D, 1, L) - """ - return self.w_2(self.activation(self.w_1(x))) - - -class BPUConformerEncoderLayer(torch.nn.Module): - """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.size = module.size - assert module.normalize_before is True - assert module.concat_after is False - - # 1. Modify submodules - self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) - self.self_attn = BPUMultiHeadedAttention( - module.self_attn, chunk_size, left_chunks) - self.conv_module = BPUConvolution(module.conv_module) - self.feed_forward = BPUFFN(module.feed_forward) - - # 2. Modify norms - self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) - self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, ln_run_on_bpu) - self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, - chunk_size, ln_run_on_bpu) - self.norm_conv = BPULayerNorm(module.norm_conv, - chunk_size, ln_run_on_bpu) - self.norm_final = BPULayerNorm(module.norm_final, - chunk_size, ln_run_on_bpu) - - # 3. 4-D ff_scale - self.register_buffer( - "ff_scale", torch.full((1, self.size, 1, 1), module.ff_scale)) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.self_attn.chunk_size - time2 = self.self_attn.time - h, d_k = self.self_attn.h, self.self_attn.d_k - random_x = torch.randn(1, time1, self.size) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) - original_x, _, original_att_cache, original_cnn_cache = module( - random_x, att_mask[:, 0, :, :], torch.empty(0), - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.transpose(1, 2).unsqueeze(2) - att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.unsqueeze(2) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_mask, att_cache, cnn_cache - ) - np.testing.assert_allclose( - to_numpy(original_att_cache), - to_numpy(new_att_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cnn_cache), - to_numpy(new_cnn_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, att_mask: torch.Tensor, - att_cache: torch.Tensor, cnn_cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, size, 1, chunk_size) - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, d_k * 2, cache_t1), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, 1, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: att_cache tensor, - (1, head, d_k * 2, cache_t1 + chunk_size). - torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2). - """ - # 1. ffn_macaron - residual = x - x = self.norm_ff_macron(x) - x = residual + self.ff_scale * self.feed_forward_macaron(x) - - # 2. 
attention - residual = x - x = self.norm_mha(x) - x_att, new_att_cache = self.self_attn( - x, x, x, att_mask, att_cache) - x = residual + x_att - - # 3. convolution - residual = x - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, cnn_cache) - x = residual + x - - # 4. ffn - residual = x - x = self.norm_ff(x) - x = residual + self.ff_scale * self.feed_forward(x) - - # 5. final post-norm - x = self.norm_final(x) - - return x, new_att_cache, new_cnn_cache - - -class BPUConformerEncoder(torch.nn.Module): - """Refactor wenet/transformer/encoder.py::ConformerEncoder - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - output_size = module.output_size() - self._output_size = module.output_size() - self.after_norm = module.after_norm - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.head = module.encoders[0].self_attn.h - self.layers = len(module.encoders) - - # 1. Modify submodules - self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) - self.embed = BPUConv2dSubsampling8(module.embed) - self.encoders = torch.nn.ModuleList() - for layer in module.encoders: - self.encoders.append(BPUConformerEncoderLayer( - layer, chunk_size, left_chunks, ln_run_on_bpu)) - - # 2. Auxiliary conv - self.identity_cnncache = BPUIdentity(output_size) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.encoders[0].self_attn.chunk_size - time2 = self.encoders[0].self_attn.time - layers = self.layers - h, d_k = self.head, self.encoders[0].self_attn.d_k - decoding_window = (self.chunk_size - 1) * \ - module.embed.subsampling_rate + \ - module.embed.right_context + 1 - lorder = self.encoders[0].conv_module.lorder - random_x = torch.randn(1, decoding_window, 80) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) - orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( - random_x, 0, time2 - time1, att_mask=att_mask[:, 0, :, :], - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.unsqueeze(0) - att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_cache, cnn_cache, att_mask - ) - caches = torch.split(new_att_cache, h, dim=1) - caches = [c.transpose(2, 3) for c in caches] - np.testing.assert_allclose( - to_numpy(orig_att_cache), - to_numpy(torch.cat(caches, dim=0)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_cnn_cache), - to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, xs: torch.Tensor, att_cache: torch.Tensor, - cnn_cache: torch.Tensor, att_mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (1, head * elayers, d_k * 2, cache_t1), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (1, hidden-dim, elayers, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, hidden-dim, 1, chunk_size). - torch.Tensor: new attention cache required for next chunk, with - same shape as the original att_cache. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - """ - # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time) - xs = xs.transpose(2, 3) - xs = self.global_cmvn(xs) - # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size) - xs = self.embed(xs) - - att_cache = torch.split(att_cache, self.head, dim=1) - cnn_cache = self.identity_cnncache(cnn_cache) - cnn_cache = torch.split(cnn_cache, 1, dim=2) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - xs, new_att_cache, new_cnn_cache = layer( - xs, att_mask, att_cache=att_cache[i], cnn_cache=cnn_cache[i]) - r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:]) - r_cnn_cache.append(new_cnn_cache) - r_att_cache = torch.cat(r_att_cache, dim=1) - r_cnn_cache = self.identity_cnncache( - torch.cat(r_cnn_cache, dim=2)) - - xs = xs.squeeze(2).transpose(1, 2).contiguous() - xs = self.after_norm(xs) - # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input. - xs = xs.transpose(1, 2).contiguous().unsqueeze(2) # (B, C, 1, T) - - return (xs, r_att_cache, r_cnn_cache) - - -class BPUCTC(torch.nn.Module): - """Refactor wenet/transformer/ctc.py::CTC - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.ctc_lo.weight.size(1) - num_class = module.ctc_lo.weight.size(0) - - # 1. Modify self.ctc_lo, Split final projection to meet the - # requirment of maximum in/out channels (2048 for XJ3) - self.ctc_lo = torch.nn.ModuleList() - self.split_size = [] - num_split = (num_class - 1) // 2048 + 1 - for idx in range(num_split): - out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 - conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) - self.ctc_lo.append(conv_ele) - self.split_size.append(out_channel) - orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) - orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) - for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): - w = w.unsqueeze(2).unsqueeze(3) - self.ctc_lo[i].weight = torch.nn.Parameter(w) - self.ctc_lo[i].bias = torch.nn.Parameter(b) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 100, self.idim) - original_result = module.ctc_lo(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """frame activations, without softmax. 
- - Args: - Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) - Returns: - torch.Tensor: (B, num_class, 1, chunk_size) - """ - out = [] - for i, layer in enumerate(self.ctc_lo): - out.append(layer(x)) - out = torch.cat(out, dim=1) - return out - - -def export_encoder(asr_model, args): - logger.info("Stage-1: export encoder") - decode_window, mel_dim = args.decoding_window, args.feature_size - encoder = BPUConformerEncoder( - asr_model.encoder, args.chunk_size, args.num_decoding_left_chunks, - args.ln_run_on_bpu) - encoder.eval() - encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx') - - logger.info("Stage-1.1: prepare inputs for encoder") - chunk = torch.randn((1, 1, decode_window, mel_dim)) - required_cache_size = encoder.chunk_size * encoder.left_chunks - kv_time = required_cache_size + encoder.chunk_size - hidden, layers = encoder._output_size, len(encoder.encoders) - head = encoder.encoders[0].self_attn.h - d_k = hidden // head - lorder = encoder.encoders[0].conv_module.lorder - att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) - att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros((1, hidden, layers, lorder)) - inputs = (chunk, att_cache, cnn_cache, att_mask) - logger.info("chunk.size(): {} att_cache.size(): {} " - "cnn_cache.size(): {} att_mask.size(): {}".format( - list(chunk.size()), list(att_cache.size()), - list(cnn_cache.size()), list(att_mask.size()))) - - logger.info("Stage-1.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask" - attributes['output_name'] = "output;r_att_cache;r_cnn_cache" - attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap" - attributes['norm_type'] = \ - "no_preprocess;no_preprocess;no_preprocess;no_preprocess" - attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_shape'] = \ - "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( - chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3), - att_cache.size(0), att_cache.size(1), att_cache.size(2), - att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1), - cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0), - att_mask.size(1), att_mask.size(2), att_mask.size(3) - ) - torch.onnx.export( # NOTE(xcsong): only support opset==11 - encoder, inputs, encoder_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=attributes['input_name'].split(';'), - output_names=attributes['output_name'].split(';'), - dynamic_axes=None, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for k in vars(args): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - logger.info('Export onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - logger.info("Stage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - for i in range(10): - logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" - ", att_mask: {}".format( - i, list(torch_chunk.size()), - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), - list(torch_att_mask.size()))) - torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_output = torch.cat(torch_output, dim=-1) - - onnx_output = [] - onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - logger.info("onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," - " att_mask: {}".format( - i, onnx_chunk.shape, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': onnx_att_mask, - } - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_output = np.concatenate(onnx_output, axis=-1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_encoder, pass!") - return encoder, ort_session - - -def export_ctc(asr_model, args): - logger.info("Stage-2: export ctc") - ctc = BPUCTC(asr_model.ctc).eval() - ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx') - - logger.info("Stage-2.1: prepare inputs for ctc") - hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) - - logger.info("Stage-2.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'], attributes['input_type'] = "hidden", "featuremap" - attributes['norm_type'] = "no_preprocess" - attributes['input_layout_train'] = "NCHW" - attributes['input_layout_rt'] = "NCHW" - attributes['input_shape'] = "{}x{}x{}x{}".format( - hidden.size(0), hidden.size(1), hidden.size(2), hidden.size(3), - ) - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=None, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for k in vars(args): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - logger.info('Export onnx_ctc, done! 
see {}'.format(ctc_outpath)) - - logger.info("Stage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_ctc, pass!") - return ctc, ort_session - - -def export_decoder(asr_model, args): - logger.info("Currently, Decoder is not supported.") - - -if __name__ == '__main__': - torch.manual_seed(777) - args = get_args() - args.ln_run_on_bpu = False - # NOTE(xcsong): XJ3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - args.feature_size = configs['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - export_encoder(model, args) - export_ctc(model, args) - export_decoder(model, args) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/export_onnx_cpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/export_onnx_cpu.py deleted file mode 100644 index a8009d2f606f753a5870eb754235d8d55e756b5d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/export_onnx_cpu.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright (c) 2022, Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
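The export entry point above computes `decoding_window = (chunk_size - 1) * subsampling_rate + right_context + 1`, i.e. the number of input feature frames needed to produce one chunk of encoder frames. A minimal sketch of that arithmetic, assuming the `subsampling_rate = 4` and `right_context = 6` values typical of WeNet's Conv2d subsampling (the real scripts read both from `model.encoder.embed`):

```python
def decoding_window(chunk_size: int, subsampling_rate: int = 4, right_context: int = 6) -> int:
    # Input feature frames required to produce `chunk_size` subsampled encoder frames.
    return (chunk_size - 1) * subsampling_rate + right_context + 1

print(decoding_window(16))  # 67 -- the same value hardcoded later when chunk_size == -1
```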
- -from __future__ import print_function - -import argparse -import os -import copy -import sys - -import torch -import yaml -import numpy as np - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - -try: - import onnx - import onnxruntime - from onnxruntime.quantization import quantize_dynamic, QuantType -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - args = parser.parse_args() - return args - - -def to_numpy(tensor): - if tensor.requires_grad: - return tensor.detach().cpu().numpy() - else: - return tensor.cpu().numpy() - - -def print_input_output_info(onnx_model, name, prefix="\t\t"): - input_names = [node.name for node in onnx_model.graph.input] - input_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.input] - output_names = [node.name for node in onnx_model.graph.output] - output_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.output] - print("{}{} inputs : {}".format(prefix, name, input_names)) - print("{}{} input shapes : {}".format(prefix, name, input_shapes)) - print("{}{} outputs: {}".format(prefix, name, output_names)) - print("{}{} output shapes : {}".format(prefix, name, output_shapes)) - - -def export_encoder(asr_model, args): - print("Stage-1: export encoder") - encoder = asr_model.encoder - encoder.forward = encoder.forward_chunk - encoder_outpath = os.path.join(args['output_dir'], 'encoder.onnx') - - print("\tStage-1.1: prepare inputs for encoder") - chunk = torch.randn( - (args['batch'], args['decoding_window'], args['feature_size'])) - offset = 0 - # NOTE(xcsong): The uncertainty of `next_cache_start` only appears - # in the first few chunks, this is caused by dynamic att_cache shape, i,e - # (0, 0, 0, 0) for 1st chunk and (elayers, head, ?, d_k*2) for subsequent - # chunks. One way to ease the ONNX export is to keep `next_cache_start` - # as a fixed value. To do this, for the **first** chunk, if - # left_chunks > 0, we feed real cache & real mask to the model, otherwise - # fake cache & fake mask. In this way, we get: - # 1. 16/-1 mode: next_cache_start == 0 for all chunks - # 2. 16/4 mode: next_cache_start == chunk_size for all chunks - # 3. 16/0 mode: next_cache_start == chunk_size for all chunks - # 4. -1/-1 mode: next_cache_start == 0 for all chunks - # NO MORE DYNAMIC CHANGES!! - # - # NOTE(Mddct): We retain the current design for the convenience of supporting some - # inference frameworks without dynamic shapes. 
If you're interested in all-in-one - # model that supports different chunks please see: - # https://github.com/wenet-e2e/wenet/pull/1174 - - if args['left_chunks'] > 0: # 16/4 - required_cache_size = args['chunk_size'] * args['left_chunks'] - offset = required_cache_size - # Real cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], required_cache_size, - args['output_size'] // args['head'] * 2)) - # Real mask - att_mask = torch.ones( - (args['batch'], 1, required_cache_size + args['chunk_size']), - dtype=torch.bool) - att_mask[:, :, :required_cache_size] = 0 - elif args['left_chunks'] <= 0: # 16/-1, -1/-1, 16/0 - required_cache_size = -1 if args['left_chunks'] < 0 else 0 - # Fake cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], 0, - args['output_size'] // args['head'] * 2)) - # Fake mask - att_mask = torch.ones((0, 0, 0), dtype=torch.bool) - cnn_cache = torch.zeros( - (args['num_blocks'], args['batch'], - args['output_size'], args['cnn_module_kernel'] - 1)) - inputs = (chunk, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - print("\t\tchunk.size(): {}\n".format(chunk.size()), - "\t\toffset: {}\n".format(offset), - "\t\trequired_cache: {}\n".format(required_cache_size), - "\t\tatt_cache.size(): {}\n".format(att_cache.size()), - "\t\tcnn_cache.size(): {}\n".format(cnn_cache.size()), - "\t\tatt_mask.size(): {}\n".format(att_mask.size())) - - print("\tStage-1.2: torch.onnx.export") - dynamic_axes = { - 'chunk': {1: 'T'}, - 'att_cache': {2: 'T_CACHE'}, - 'att_mask': {2: 'T_ADD_T_CACHE'}, - 'output': {1: 'T'}, - 'r_att_cache': {2: 'T_CACHE'}, - } - # NOTE(xcsong): We keep dynamic axes even if in 16/4 mode, this is - # to avoid padding the last chunk (which usually contains less - # frames than required). For users who want static axes, just pop - # out specific axis. - # if args['chunk_size'] > 0: # 16/4, 16/-1, 16/0 - # dynamic_axes.pop('chunk') - # dynamic_axes.pop('output') - # if args['left_chunks'] >= 0: # 16/4, 16/0 - # # NOTE(xsong): since we feed real cache & real mask into the - # # model when left_chunks > 0, the shape of cache will never - # # be changed. - # dynamic_axes.pop('att_cache') - # dynamic_axes.pop('r_att_cache') - torch.onnx.export( - encoder, inputs, encoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=[ - 'chunk', 'offset', 'required_cache_size', - 'att_cache', 'cnn_cache', 'att_mask' - ], - output_names=['output', 'r_att_cache', 'r_cnn_cache'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for (k, v) in args.items(): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - # NOTE(xcsong): to add those metadatas we need to reopen - # the file and resave it. - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - # Dynamic quantization - model_fp32 = encoder_outpath - model_quant = os.path.join(args['output_dir'], 'encoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - print("\tStage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk = copy.deepcopy(chunk) - torch_offset = copy.deepcopy(offset) - torch_required_cache_size = copy.deepcopy(required_cache_size) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - torch_att_mask = copy.deepcopy(att_mask) - for i in range(10): - print("\t\ttorch chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, list(torch_chunk.size()), torch_offset, - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), list(torch_att_mask.size()))) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - torch_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_offset, torch_required_cache_size, - torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_offset += out.size(1) - torch_output = torch.cat(torch_output, dim=1) - - onnx_output = [] - onnx_chunk = to_numpy(chunk) - onnx_offset = np.array((offset)).astype(np.int64) - onnx_required_cache_size = np.array((required_cache_size)).astype(np.int64) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - onnx_att_mask = to_numpy(att_mask) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - print("\t\tonnx chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, onnx_chunk.shape, onnx_offset, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - onnx_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'offset': onnx_offset, - 'required_cache_size': onnx_required_cache_size, - 'att_cache': onnx_att_cache, 'cnn_cache': onnx_cnn_cache, - 'att_mask': onnx_att_mask - } - # NOTE(xcsong): If we use 16/-1, -1/-1 or 16/0 mode, `next_cache_start` - # will be hardcoded to 0 or chunk_size by ONNX, thus - # required_cache_size and att_mask are no more needed and they will - # be removed by ONNX automatically. 
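As the note above says, inputs that a given export mode never uses (`required_cache_size` and `att_mask` outside the 16/4 mode) are folded away, so the feed dictionary must be restricted to whatever the saved graph still declares. The loop below does this via the graph's input list; an equivalent standalone sketch using the `onnxruntime` session API (the helper name `filter_feeds` is hypothetical):

```python
import onnxruntime

def filter_feeds(session: onnxruntime.InferenceSession, feeds: dict) -> dict:
    # Keep only the entries the exported graph still declares as inputs,
    # so session.run() does not raise on inputs removed during export.
    graph_inputs = {inp.name for inp in session.get_inputs()}
    return {name: value for name, value in feeds.items() if name in graph_inputs}
```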
- for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_offset += ort_outs[0].shape[1] - onnx_output = np.concatenate(onnx_output, axis=1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-05) - meta = ort_session.get_modelmeta() - print("\t\tcustom_metadata_map={}".format(meta.custom_metadata_map)) - print("\t\tCheck onnx_encoder, pass!") - - -def export_ctc(asr_model, args): - print("Stage-2: export ctc") - ctc = asr_model.ctc - ctc.forward = ctc.log_softmax - ctc_outpath = os.path.join(args['output_dir'], 'ctc.onnx') - - print("\tStage-2.1: prepare inputs for ctc") - hidden = torch.randn( - (args['batch'], args['chunk_size'] if args['chunk_size'] > 0 else 16, - args['output_size'])) - - print("\tStage-2.2: torch.onnx.export") - dynamic_axes = {'hidden': {1: 'T'}, 'probs': {1: 'T'}} - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for (k, v) in args.items(): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - # Dynamic quantization - model_fp32 = ctc_outpath - model_quant = os.path.join(args['output_dir'], 'ctc.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_ctc, done! see {}'.format(ctc_outpath)) - - print("\tStage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_ctc, pass!") - - -def export_decoder(asr_model, args): - print("Stage-3: export decoder") - decoder = asr_model - # NOTE(lzhin): parameters of encoder will be automatically removed - # since they are not used during rescoring. - decoder.forward = decoder.forward_attention_decoder - decoder_outpath = os.path.join(args['output_dir'], 'decoder.onnx') - - print("\tStage-3.1: prepare inputs for decoder") - # hardcode time->200 nbest->10 len->20, they are dynamic axes. 
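Each of the three exports above first rebinds `forward` to the method it actually wants traced (`forward_chunk`, `log_softmax`, `forward_attention_decoder`); `torch.onnx.export` then traces that method through the module's normal call path. A minimal sketch of the pattern on a toy module (module, method, and file names are illustrative only):

```python
import torch
import torch.nn.functional as F

class Toy(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(4, 8)

    def forward(self, x, lengths):        # training-time signature
        return self.proj(x), lengths

    def export_forward(self, x):          # export-friendly signature
        return F.log_softmax(self.proj(x), dim=-1)

model = Toy().eval()
model.forward = model.export_forward      # instance attribute shadows the class method
torch.onnx.export(model, (torch.randn(1, 4),), "toy.onnx",
                  opset_version=13, input_names=["x"], output_names=["y"])
```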
- encoder_out = torch.randn((1, 200, args['output_size'])) - hyps = torch.randint(low=0, high=args['vocab_size'], - size=[10, 20]) - hyps[:, 0] = args['vocab_size'] - 1 # - hyps_lens = torch.randint(low=15, high=21, size=[10]) - - print("\tStage-3.2: torch.onnx.export") - dynamic_axes = { - 'hyps': {0: 'NBEST', 1: 'L'}, 'hyps_lens': {0: 'NBEST'}, - 'encoder_out': {1: 'T'}, - 'score': {0: 'NBEST', 1: 'L'}, 'r_score': {0: 'NBEST', 1: 'L'} - } - inputs = (hyps, hyps_lens, encoder_out, args['reverse_weight']) - torch.onnx.export( - decoder, inputs, decoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hyps', 'hyps_lens', 'encoder_out', 'reverse_weight'], - output_names=['score', 'r_score'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_decoder = onnx.load(decoder_outpath) - for (k, v) in args.items(): - meta = onnx_decoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_decoder) - onnx.helper.printable_graph(onnx_decoder.graph) - onnx.save(onnx_decoder, decoder_outpath) - print_input_output_info(onnx_decoder, "onnx_decoder") - model_fp32 = decoder_outpath - model_quant = os.path.join(args['output_dir'], 'decoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_decoder, done! see {}'.format( - decoder_outpath)) - - print("\tStage-3.3: check onnx_decoder and torch_decoder") - torch_score, torch_r_score = decoder( - hyps, hyps_lens, encoder_out, args['reverse_weight']) - ort_session = onnxruntime.InferenceSession(decoder_outpath) - input_names = [node.name for node in onnx_decoder.graph.input] - ort_inputs = { - 'hyps': to_numpy(hyps), - 'hyps_lens': to_numpy(hyps_lens), - 'encoder_out': to_numpy(encoder_out), - 'reverse_weight': np.array((args['reverse_weight'])), - } - for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - onnx_output = ort_session.run(None, ort_inputs) - - np.testing.assert_allclose(to_numpy(torch_score), onnx_output[0], - rtol=1e-03, atol=1e-05) - if args['is_bidirectional_decoder'] and args['reverse_weight'] > 0.0: - np.testing.assert_allclose(to_numpy(torch_r_score), onnx_output[1], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_decoder, pass!") - - -def main(): - torch.manual_seed(777) - args = get_args() - output_dir = args.output_dir - os.system("mkdir -p " + output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - arguments = {} - arguments['output_dir'] = output_dir - arguments['batch'] = 1 - arguments['chunk_size'] = args.chunk_size - arguments['left_chunks'] = args.num_decoding_left_chunks - arguments['reverse_weight'] = args.reverse_weight - arguments['output_size'] = configs['encoder_conf']['output_size'] - arguments['num_blocks'] = configs['encoder_conf']['num_blocks'] - arguments['cnn_module_kernel'] = configs['encoder_conf'].get('cnn_module_kernel', 1) - arguments['head'] = configs['encoder_conf']['attention_heads'] - arguments['feature_size'] = configs['input_dim'] - arguments['vocab_size'] = configs['output_dim'] - # NOTE(xcsong): if chunk_size == -1, hardcode to 67 - arguments['decoding_window'] = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 if args.chunk_size > 0 else 67 - arguments['encoder'] = configs['encoder'] - 
arguments['decoder'] = configs['decoder'] - arguments['subsampling_rate'] = model.subsampling_rate() - arguments['right_context'] = model.right_context() - arguments['sos_symbol'] = model.sos_symbol() - arguments['eos_symbol'] = model.eos_symbol() - arguments['is_bidirectional_decoder'] = 1 \ - if model.is_bidirectional_decoder() else 0 - - # NOTE(xcsong): Please note that -1/-1 means non-streaming model! It is - # not a [16/4 16/-1 16/0] all-in-one model and it should not be used in - # streaming mode (i.e., setting chunk_size=16 in `decoder_main`). If you - # want to use 16/-1 or any other streaming mode in `decoder_main`, - # please export onnx in the same config. - if arguments['left_chunks'] > 0: - assert arguments['chunk_size'] > 0 # -1/4 not supported - - export_encoder(model, arguments) - export_ctc(model, arguments) - export_decoder(model, arguments) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/export_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/export_onnx_gpu.py deleted file mode 100644 index 19f810c2804efdf74ff369f780fa3102e2e389fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/export_onnx_gpu.py +++ /dev/null @@ -1,1056 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import os -import sys - -import torch -import yaml -import logging - -import torch.nn.functional as F -from wenet.utils.checkpoint import load_checkpoint -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import BaseEncoder -from wenet.utils.init_model import init_model -from wenet.utils.mask import make_pad_mask - -try: - import onnxruntime -except ImportError: - print('Please install onnxruntime-gpu!') - sys.exit(1) - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class Encoder(torch.nn.Module): - def __init__(self, - encoder: BaseEncoder, - ctc: CTC, - beam_size: int = 10): - super().__init__() - self.encoder = encoder - self.ctc = ctc - self.beam_size = beam_size - - def forward(self, speech: torch.Tensor, - speech_lengths: torch.Tensor,): - """Encoder - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - Returns: - encoder_out: B x T x F - encoder_out_lens: B - ctc_log_probs: B x T x V - beam_log_probs: B x T x beam_size - beam_log_probs_idx: B x T x beam_size - """ - encoder_out, encoder_mask = self.encoder(speech, - speech_lengths, - -1, -1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_log_probs = self.ctc.log_softmax(encoder_out) - encoder_out_lens = encoder_out_lens.int() - beam_log_probs, beam_log_probs_idx = torch.topk( - ctc_log_probs, self.beam_size, dim=2) - return encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx - - -class StreamingEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size, transformer=False): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.transformer = transformer - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoder.encoders): - xs, _, new_att_cache, new_cnn_cache = layer( - xs, masks, pos_emb, - att_cache=att_cache[i], - cnn_cache=cnn_cache[i]) - # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (B, hidden-dim, cache_t2) - r_att_cache.append( - new_att_cache[:, :, next_cache_start:, :].unsqueeze(1)) - if not self.transformer: - r_cnn_cache.append(new_cnn_cache.unsqueeze(1)) - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - if not self.transformer: - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingSqueezeformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.reduce_idx = model.encoder.reduce_idx - self.recover_idx = model.encoder.recover_idx - if self.reduce_idx is None: - self.time_reduce = None - else: - if self.recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - 
recover_exp)) - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - elayers, cache_size = att_cache.size(0), att_cache.size(3) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = att_mask[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.encoder.preln(xs) - for i, layer in enumerate(self.encoder.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append( - (xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.encoder.time_reduction_layer( - xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - if self.encoder.pos_enc_layer_type == "rel_pos_repaired": - pos_emb = 
pos_emb[:, :xs.size(1) * 2 - 1, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.encoder.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(1) - cached_att = cached_att.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1)) - r_cnn_cache.append(cached_cnn) - - chunk_out = xs - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingEfficientConformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - - # Efficient Conformer - self.stride_layer_idx = model.encoder.stride_layer_idx - self.stride = model.encoder.stride - self.num_blocks = model.encoder.num_blocks - self.cnn_module_kernel = model.encoder.cnn_module_kernel - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - chunk_lens (torch.Tensor): - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * 
num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) # (b, ) - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - # Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2) - # Shape(cnn_cache): (elayers, b, outsize, cnn_kernel) - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = chunk_mask.to(torch.bool) - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoder.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - # We propose to double the chunk_size. 
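The cache slicing in this loop depends on `calculate_downsampling_factor`, which compounds the stride of every stride layer that sits below the current one. A worked example with assumed settings (two stride layers, stride 2 each; real values come from the encoder config):

```python
def downsampling_factor(layer: int, stride_layer_idx=(3, 7), stride=(2, 2)) -> int:
    # Product of the strides of all stride layers strictly below `layer`.
    factor = 1
    for idx, stride_idx in enumerate(stride_layer_idx):
        if layer > stride_idx:
            factor *= stride[idx]
    return factor

print([downsampling_factor(i) for i in range(12)])
# [1, 1, 1, 1, 2, 2, 2, 2, 4, 4, 4, 4]
```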
- att_cache_trunc = xs.size(1) + \ - att_cache.size(3) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i][:, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [batch, 1, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID - - # use repeat_interleave to new_att_cache - # new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - new_att_cache = new_att_cache.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :].unsqueeze(1)) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - # shape of r_att_cache: (b, elayers, head, time2, outdim) - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - # shape of r_cnn_cache: (b, elayers, outdim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - chunk_out_lens += 1 - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class Decoder(torch.nn.Module): - def __init__(self, - decoder: TransformerDecoder, - ctc_weight: float = 0.5, - reverse_weight: float = 0.0, - beam_size: int = 10, - decoder_fastertransformer: bool = False): - super().__init__() - self.decoder = decoder - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - self.beam_size = beam_size - self.decoder_fastertransformer = decoder_fastertransformer - - def forward(self, - encoder_out: torch.Tensor, - encoder_lens: torch.Tensor, - hyps_pad_sos_eos: torch.Tensor, - hyps_lens_sos: torch.Tensor, - r_hyps_pad_sos_eos: torch.Tensor, - ctc_score: torch.Tensor): - """Encoder - Args: - encoder_out: B x T x F - encoder_lens: B - hyps_pad_sos_eos: B x beam x (T2+1), - hyps with sos & eos and padded by ignore id - hyps_lens_sos: B x beam, length for each hyp with sos - r_hyps_pad_sos_eos: B 
x beam x (T2+1), - reversed hyps with sos & eos and padded by ignore id - ctc_score: B x beam, ctc score for each hyp - Returns: - decoder_out: B x beam x T2 x V - r_decoder_out: B x beam x T2 x V - best_index: B - """ - B, T, F = encoder_out.shape - bz = self.beam_size - B2 = B * bz - encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F) - encoder_mask = ~make_pad_mask(encoder_lens, T).unsqueeze(1) - encoder_mask = encoder_mask.repeat(1, bz, 1).view(B2, 1, T) - T2 = hyps_pad_sos_eos.shape[2] - 1 - hyps_pad = hyps_pad_sos_eos.view(B2, T2 + 1) - hyps_lens = hyps_lens_sos.view(B2,) - hyps_pad_sos = hyps_pad[:, :-1].contiguous() - hyps_pad_eos = hyps_pad[:, 1:].contiguous() - - r_hyps_pad = r_hyps_pad_sos_eos.view(B2, T2 + 1) - r_hyps_pad_sos = r_hyps_pad[:, :-1].contiguous() - r_hyps_pad_eos = r_hyps_pad[:, 1:].contiguous() - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad_sos, hyps_lens, r_hyps_pad_sos, - self.reverse_weight) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - V = decoder_out.shape[-1] - decoder_out = decoder_out.view(B2, T2, V) - mask = ~make_pad_mask(hyps_lens, T2) # B2 x T2 - # mask index, remove ignore id - index = torch.unsqueeze(hyps_pad_eos * mask, 2) - score = decoder_out.gather(2, index).squeeze(2) # B2 X T2 - # mask padded part - score = score * mask - decoder_out = decoder_out.view(B, bz, T2, V) - if self.reverse_weight > 0: - r_decoder_out = torch.nn.functional.log_softmax( - r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.view(B2, T2, V) - index = torch.unsqueeze(r_hyps_pad_eos * mask, 2) - r_score = r_decoder_out.gather(2, index).squeeze(2) - r_score = r_score * mask - score = score * (1 - self.reverse_weight) + \ - self.reverse_weight * r_score - r_decoder_out = r_decoder_out.view(B, bz, T2, V) - score = torch.sum(score, axis=1) # B2 - score = torch.reshape(score, (B, bz)) + self.ctc_weight * ctc_score - best_index = torch.argmax(score, dim=1) - if self.decoder_fastertransformer: - return decoder_out, best_index - else: - return best_index - - -def to_numpy(tensors): - out = [] - if type(tensors) == torch.tensor: - tensors = [tensors] - for tensor in tensors: - if tensor.requires_grad: - tensor = tensor.detach().cpu().numpy() - else: - tensor = tensor.cpu().numpy() - out.append(tensor) - return out - - -def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True): - for a, b in zip(xlist, blist): - try: - torch.testing.assert_allclose(a, b, rtol=rtol, atol=atol) - except AssertionError as error: - if tolerate_small_mismatch: - print(error) - else: - raise - - -def export_offline_encoder(model, configs, args, logger, encoder_onnx_path): - bz = 32 - seq_len = 100 - beam_size = args.beam_size - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint( - low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - dynamic_axes={ - 'speech': {0: 'B', 1: 'T'}, - 'speech_lengths': {0: 'B'}, - 'encoder_out': {0: 'B', 1: 'T_OUT'}, - 'encoder_out_lens': {0: 'B'}, - 'ctc_log_probs': {0: 'B', 1: 'T_OUT'}, - 'beam_log_probs': {0: 'B', 1: 
'T_OUT'}, - 'beam_log_probs_idx': {0: 'B', 1: 'T_OUT'}, - }, - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - -def export_offline_encoder_static(model, configs, args, logger, encoder_onnx_path): - bz = args.batch_size - seq_len = args.seq_len - beam_size = args.beam_size - - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint(low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - import os - file_name, file_ext = os.path.splitext(encoder_onnx_path) - encoder_onnx_path = file_name + "_bs" + str(bz) + "_seq" + str(seq_len) + "_static.onnx" - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CPUExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - - -def export_online_encoder(model, configs, args, logger, encoder_onnx_path): - decoding_chunk_size = args.decoding_chunk_size - subsampling = model.encoder.embed.subsampling_rate - context = model.encoder.embed.right_context + 1 - decoding_window = (decoding_chunk_size - 1) * subsampling + context - batch_size = 32 - audio_len = decoding_window - feature_size = configs["input_dim"] - output_size = configs["encoder_conf"]["output_size"] - num_layers = configs["encoder_conf"]["num_blocks"] - # in transformer the cnn module will not be available - transformer = False - cnn_module_kernel = configs["encoder_conf"].get("cnn_module_kernel", 1) - 1 - if not cnn_module_kernel: - transformer = True - num_decoding_left_chunks = args.num_decoding_left_chunks - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if configs['encoder'] == 'squeezeformer': - encoder = StreamingSqueezeformerEncoder( - model, required_cache_size, args.beam_size) - elif configs['encoder'] == 'efficientConformer': - encoder = StreamingEfficientConformerEncoder( - model, required_cache_size, args.beam_size) - else: - encoder = StreamingEncoder( - model, required_cache_size, args.beam_size, transformer) - encoder.eval() - - # begin to export encoder - chunk_xs = 
torch.randn(batch_size, audio_len, - feature_size, dtype=torch.float32) - chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len - - offset = torch.arange(0, batch_size).unsqueeze(1) - # (elayers, b, head, cache_t1, d_k * 2) - head = configs["encoder_conf"]["attention_heads"] - d_k = configs["encoder_conf"]["output_size"] // head - att_cache = torch.randn(batch_size, num_layers, head, - required_cache_size, d_k * 2, - dtype=torch.float32) - cnn_cache = torch.randn(batch_size, num_layers, output_size, - cnn_module_kernel, dtype=torch.float32) - - cache_mask = torch.ones( - batch_size, 1, required_cache_size, dtype=torch.float32) - input_names = ['chunk_xs', 'chunk_lens', 'offset', - 'att_cache', 'cnn_cache', 'cache_mask'] - output_names = ['log_probs', 'log_probs_idx', 'chunk_out', - 'chunk_out_lens', 'r_offset', 'r_att_cache', - 'r_cnn_cache', 'r_cache_mask'] - input_tensors = (chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - output_names.pop(6) - - all_names = input_names + output_names - dynamic_axes = {} - for name in all_names: - # only the first dimension is dynamic - # all other dimension is fixed - dynamic_axes[name] = {0: 'B'} - - torch.onnx.export(encoder, - input_tensors, - encoder_onnx_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - verbose=False) - - with torch.no_grad(): - torch_outs = encoder(chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - torch_outs = list(torch_outs).pop(6) - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=["CUDAExecutionProvider"]) - ort_inputs = {} - - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - if transformer: - del ort_inputs['cnn_cache'] - ort_outs = ort_session.run(None, ort_inputs) - test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx streaming encoder succeed!") - onnx_config = { - "subsampling_rate": subsampling, - "context": context, - "decoding_chunk_size": decoding_chunk_size, - "num_decoding_left_chunks": num_decoding_left_chunks, - "beam_size": args.beam_size, - "fp16": args.fp16, - "feat_size": feature_size, - "decoding_window": decoding_window, - "cnn_module_kernel_cache": cnn_module_kernel - } - return onnx_config - - -def export_rescoring_decoder(model, configs, args, - logger, decoder_onnx_path, decoder_fastertransformer): - bz, seq_len = 32, 100 - beam_size = args.beam_size - decoder = Decoder(model.decoder, - model.ctc_weight, - model.reverse_weight, - beam_size, - decoder_fastertransformer) - decoder.eval() - - hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - hyps_lens_sos = torch.randint(low=3, high=seq_len, size=(bz, beam_size), - dtype=torch.int32) - r_hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - - output_size = configs["encoder_conf"]["output_size"] - encoder_out = torch.randn(bz, seq_len, output_size, dtype=torch.float32) - encoder_out_lens = torch.randint( - low=3, high=seq_len, size=(bz,), dtype=torch.int32) - ctc_score = torch.randn(bz, beam_size, dtype=torch.float32) - - input_names = ['encoder_out', 'encoder_out_lens', - 'hyps_pad_sos_eos', 'hyps_lens_sos', - 'r_hyps_pad_sos_eos', 'ctc_score'] - output_names = ['best_index'] - if decoder_fastertransformer: - output_names.insert(0, 'decoder_out') - - 
torch.onnx.export(decoder, - (encoder_out, encoder_out_lens, - hyps_pad_sos_eos, hyps_lens_sos, - r_hyps_pad_sos_eos, ctc_score), - decoder_onnx_path, - export_params=True, - opset_version=13, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes={'encoder_out': {0: 'B', 1: 'T'}, - 'encoder_out_lens': {0: 'B'}, - 'hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'hyps_lens_sos': {0: 'B'}, - 'r_hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'ctc_score': {0: 'B'}, - 'best_index': {0: 'B'}, - }, - verbose=False - ) - with torch.no_grad(): - o0 = decoder(encoder_out, - encoder_out_lens, - hyps_pad_sos_eos, - hyps_lens_sos, - r_hyps_pad_sos_eos, - ctc_score) - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(decoder_onnx_path, - providers=providers) - - input_tensors = [encoder_out, encoder_out_lens, hyps_pad_sos_eos, - hyps_lens_sos, r_hyps_pad_sos_eos, ctc_score] - ort_inputs = {} - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - - # if model.reverse weight == 0, - # the r_hyps_pad will be removed - # from the onnx decoder since it doen't play any role - if model.reverse_weight == 0: - del ort_inputs['r_hyps_pad_sos_eos'] - ort_outs = ort_session.run(None, ort_inputs) - - # check decoder output - if decoder_fastertransformer: - test(to_numpy(o0), ort_outs, rtol=1e-03, atol=1e-05) - else: - test(to_numpy([o0]), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx decoder succeed!") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='export x86_gpu model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--cmvn_file', required=False, default='', type=str, - help='global_cmvn file, default path is in config file') - parser.add_argument('--reverse_weight', default=-1.0, type=float, - required=False, - help='reverse weight for bitransformer,' + - 'default value is in config file') - parser.add_argument('--ctc_weight', default=-1.0, type=float, - required=False, - help='ctc weight, default value is in config file') - parser.add_argument('--batch_size', type=int, default=24, help='encoder batch size') - parser.add_argument('--seq_len', default=512, type=int, required=False, - help="Encoder seq_len") - parser.add_argument('--beam_size', default=10, type=int, required=False, - help="beam size would be ctc output size") - parser.add_argument('--output_onnx_dir', - default="onnx_model", - help='output onnx encoder and decoder directory') - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - # arguments for streaming encoder - parser.add_argument('--streaming', - action='store_true', - help="whether to export streaming encoder, default false") - parser.add_argument('--decoding_chunk_size', - default=16, - type=int, - required=False, - help='the decoding chunk size, <=0 is not supported') - parser.add_argument('--num_decoding_left_chunks', - default=5, - type=int, - required=False, - help="number of left chunks, <= 0 is not supported") - parser.add_argument('--decoder_fastertransformer', - action='store_true', - help='return decoder_out and best_index for ft') - args = parser.parse_args() - - torch.manual_seed(0) - torch.set_printoptions(precision=10) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if 
args.cmvn_file and os.path.exists(args.cmvn_file): - configs['cmvn_file'] = args.cmvn_file - if args.reverse_weight != -1.0 and 'reverse_weight' in configs['model_conf']: - configs['model_conf']['reverse_weight'] = args.reverse_weight - print("Update reverse weight to", args.reverse_weight) - if args.ctc_weight != -1: - print("Update ctc weight to ", args.ctc_weight) - configs['model_conf']['ctc_weight'] = args.ctc_weight - configs["encoder_conf"]["use_dynamic_chunk"] = False - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - - if not os.path.exists(args.output_onnx_dir): - os.mkdir(args.output_onnx_dir) - encoder_onnx_path = os.path.join(args.output_onnx_dir, 'encoder.onnx') - export_enc_func = None - if args.streaming: - assert args.decoding_chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - export_enc_func = export_online_encoder - else: - export_enc_func = export_offline_encoder_static - - onnx_config = export_enc_func( - model, configs, args, logger, encoder_onnx_path) - - decoder_onnx_path = os.path.join(args.output_onnx_dir, 'decoder.onnx') - export_rescoring_decoder(model, configs, args, logger, - decoder_onnx_path, args.decoder_fastertransformer) - - if args.fp16: - try: - import onnxmltools - from onnxmltools.utils.float16_converter import convert_float_to_float16 - except ImportError: - print('Please install onnxmltools!') - sys.exit(1) - encoder_onnx_model = onnxmltools.utils.load_model(encoder_onnx_path) - encoder_onnx_model = convert_float_to_float16(encoder_onnx_model) - encoder_onnx_path = os.path.join( - args.output_onnx_dir, 'encoder_fp16.onnx') - onnxmltools.utils.save_model(encoder_onnx_model, encoder_onnx_path) - decoder_onnx_model = onnxmltools.utils.load_model(decoder_onnx_path) - decoder_onnx_model = convert_float_to_float16(decoder_onnx_model) - decoder_onnx_path = os.path.join( - args.output_onnx_dir, 'decoder_fp16.onnx') - onnxmltools.utils.save_model(decoder_onnx_model, decoder_onnx_path) - # dump configurations - - config_dir = os.path.join(args.output_onnx_dir, "config.yaml") - with open(config_dir, "w") as out: - yaml.dump(onnx_config, out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/recognize.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/recognize.py deleted file mode 100644 index 03b5dfd42cc098efacd20e08756a5300f6477cc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/recognize.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
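# --- Editorial sketch, not part of the original sources in this patch ---
# The export_onnx_gpu.py script above ends by dumping its export settings to
# <output_onnx_dir>/config.yaml next to the exported encoder/decoder; for the
# streaming branch the dumped onnx_config holds subsampling_rate, context,
# decoding_chunk_size, num_decoding_left_chunks, beam_size, fp16, feat_size,
# decoding_window and cnn_module_kernel_cache. A minimal sketch of reading the
# file back at inference time; the helper name is hypothetical, the default
# directory matches the script's --output_onnx_dir default.
import os
import yaml

def load_onnx_export_config(output_onnx_dir="onnx_model"):
    """Load the config.yaml written by export_onnx_gpu.py (illustrative)."""
    with open(os.path.join(output_onnx_dir, "config.yaml"), "r") as fin:
        return yaml.load(fin, Loader=yaml.FullLoader)

# Usage: cfg = load_onnx_export_config(); print(cfg["beam_size"], cfg["fp16"])
# --- End of editorial sketch ---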
- -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--beam_size', - type=int, - default=10, - help='beam size for search') - parser.add_argument('--penalty', - type=float, - default=0.0, - help='length penalty') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=16, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'attention', 'ctc_greedy_search', - 'ctc_prefix_beam_search', 'attention_rescoring', - 'rnnt_greedy_search', 'rnnt_beam_search', - 'rnnt_beam_attn_rescoring', 'ctc_beam_td_attn_rescoring', - 'hlg_onebest', 'hlg_rescore' - ], - default='attention', - help='decoding mode') - - parser.add_argument('--search_ctc_weight', - type=float, - default=1.0, - help='ctc weight for nbest generation') - parser.add_argument('--search_transducer_weight', - type=float, - default=0.0, - help='transducer weight for nbest generation') - parser.add_argument('--ctc_weight', - type=float, - default=0.0, - help='ctc weight for rescoring weight in \ - attention rescoring decode mode \ - ctc weight for rescoring weight in \ - transducer attention rescore decode mode') - - parser.add_argument('--transducer_weight', - type=float, - default=0.0, - help='transducer weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--attn_weight', - type=float, - default=0.0, - help='attention weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--decoding_chunk_size', - type=int, - default=-1, - help='''decoding chunk size, - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here''') - parser.add_argument('--num_decoding_left_chunks', - type=int, - default=-1, - help='number of left chunks for decoding') - parser.add_argument('--simulate_streaming', - action='store_true', - help='simulate streaming inference') - parser.add_argument('--reverse_weight', - type=float, - default=0.0, - help='''right to left weight for attention rescoring - decode mode''') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--connect_symbol', - default='', - type=str, - help='used to connect the output characters') - - parser.add_argument('--word', - default='', - type=str, - help='word file, only used for hlg decode') - parser.add_argument('--hlg', - default='', - type=str, - help='hlg file, only used for hlg decode') - parser.add_argument('--lm_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--r_decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' - ] and args.batch_size > 1: - logging.fatal( - 'decoding mode {} must be running with batch_size == 1'.format( - args.mode)) - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_sub'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - if 'fbank_conf' in test_conf: - test_conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in test_conf: - test_conf['mfcc_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - # Load dict - char_dict = {v: k for k, v in symbol_table.items()} - eos = len(char_dict) - 1 - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w') as fout: - for batch_idx, 
batch in enumerate(test_data_loader): - keys, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - if args.mode == 'attention': - hyps, _ = model.recognize( - feats, - feats_lengths, - beam_size=args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp.tolist() for hyp in hyps] - elif args.mode == 'ctc_greedy_search': - hyps, _ = model.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_greedy_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_beam_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.beam_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.search_ctc_weight, - transducer_weight=args.search_transducer_weight) - elif args.mode == 'rnnt_beam_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight) - elif args.mode == 'ctc_beam_td_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight, - beam_search_type='ctc') - # ctc_prefix_beam_search and attention_rescoring only return one - # result in List[int], change it to List[List[int]] for compatible - # with other batch decoding mode - elif args.mode == 'ctc_prefix_beam_search': - assert (feats.size(0) == 1) - hyp, _ = model.ctc_prefix_beam_search( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp] - elif args.mode == 'attention_rescoring': - assert (feats.size(0) == 1) - hyp, _ = model.attention_rescoring( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - 
ctc_weight=args.ctc_weight, - simulate_streaming=args.simulate_streaming, - reverse_weight=args.reverse_weight) - hyps = [hyp] - elif args.mode == 'hlg_onebest': - hyps = model.hlg_onebest( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - elif args.mode == 'hlg_rescore': - hyps = model.hlg_rescore( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - lm_scale=args.lm_scale, - decoder_scale=args.decoder_scale, - r_decoder_scale=args.r_decoder_scale, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - for i, key in enumerate(keys): - content = [] - for w in hyps[i]: - if w == eos: - break - content.append(char_dict[w]) - logging.info('{} {}'.format(key, args.connect_symbol.join(content))) - fout.write('{} {}\n'.format(key, args.connect_symbol.join(content))) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/recognize_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/recognize_onnx_gpu.py deleted file mode 100644 index 42f403bf55ac0bc51d9c754574d3479345948122..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/recognize_onnx_gpu.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is for testing exported onnx encoder and decoder from -export_onnx_gpu.py. The exported onnx models only support batch offline ASR inference. -It requires a python wrapped c++ ctc decoder. 
-Please install it by following: -https://github.com/Slyne/ctc_decoder.git -""" -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.common import IGNORE_ID -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.config import override_config - -import onnxruntime as rt -import multiprocessing -import numpy as np - -try: - from swig_decoders import map_batch, \ - ctc_beam_search_decoder_batch, \ - TrieVector, PathTrie -except ImportError: - print('Please install ctc decoders first by refering to\n' + - 'https://github.com/Slyne/ctc_decoder.git') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--encoder_onnx', required=True, help='encoder onnx file') - parser.add_argument('--decoder_onnx', required=True, help='decoder onnx file') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=32, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'ctc_greedy_search', 'ctc_prefix_beam_search', - 'attention_rescoring'], - default='attention_rescoring', - help='decoding mode') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - reverse_weight = configs["model_conf"].get("reverse_weight", 0.0) - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - test_conf['fbank_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - 
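# --- Editorial sketch, not part of the original sources in this patch ---
# The DataLoader above is created with batch_size=None because batching and
# padding already happen inside the wenet Dataset pipeline (see the
# processor.batch / processor.padding stages of dataset.py later in this
# patch); the loader simply iterates pre-collated batches. A hypothetical
# helper to shape-check one batch; the five-element batch layout matches the
# unpacking used in the decoding loop below.
def peek_one_batch(loader):
    """Return (num_keys, feats shape, true lengths) of the first batch."""
    for keys, feats, _, feats_lengths, _ in loader:
        # feats:          (B, T, feat_dim) fbank features, padded along T
        # feats_lengths:  (B,) real frame counts before padding
        return len(keys), tuple(feats.shape), feats_lengths.tolist()
# --- End of editorial sketch ---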
- # Init asr model from configs - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - if use_cuda: - EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - else: - EP_list = ['CPUExecutionProvider'] - - encoder_ort_session = rt.InferenceSession(args.encoder_onnx, providers=EP_list) - decoder_ort_session = None - if args.mode == "attention_rescoring": - decoder_ort_session = rt.InferenceSession(args.decoder_onnx, providers=EP_list) - - # Load dict - vocabulary = [] - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - vocabulary.append(arr[0]) - eos = sos = len(char_dict) - 1 - with torch.no_grad(), open(args.result_file, 'w') as fout: - for _, batch in enumerate(test_data_loader): - keys, feats, _, feats_lengths, _ = batch - feats, feats_lengths = feats.numpy(), feats_lengths.numpy() - if args.fp16: - feats = feats.astype(np.float16) - ort_inputs = { - encoder_ort_session.get_inputs()[0].name: feats, - encoder_ort_session.get_inputs()[1].name: feats_lengths} - ort_outs = encoder_ort_session.run(None, ort_inputs) - encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx = ort_outs - beam_size = beam_log_probs.shape[-1] - batch_size = beam_log_probs.shape[0] - num_processes = min(multiprocessing.cpu_count(), batch_size) - if args.mode == 'ctc_greedy_search': - if beam_size != 1: - log_probs_idx = beam_log_probs_idx[:, :, 0] - batch_sents = [] - for idx, seq in enumerate(log_probs_idx): - batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) - hyps = map_batch(batch_sents, vocabulary, num_processes, - True, 0) - elif args.mode in ('ctc_prefix_beam_search', "attention_rescoring"): - batch_log_probs_seq_list = beam_log_probs.tolist() - batch_log_probs_idx_list = beam_log_probs_idx.tolist() - batch_len_list = encoder_out_lens.tolist() - batch_log_probs_seq = [] - batch_log_probs_ids = [] - batch_start = [] # only effective in streaming deployment - batch_root = TrieVector() - root_dict = {} - for i in range(len(batch_len_list)): - num_sent = batch_len_list[i] - batch_log_probs_seq.append( - batch_log_probs_seq_list[i][0:num_sent]) - batch_log_probs_ids.append( - batch_log_probs_idx_list[i][0:num_sent]) - root_dict[i] = PathTrie() - batch_root.append(root_dict[i]) - batch_start.append(True) - score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq, - batch_log_probs_ids, - batch_root, - batch_start, - beam_size, - num_processes, - 0, -2, 0.99999) - if args.mode == 'ctc_prefix_beam_search': - hyps = [] - for cand_hyps in score_hyps: - hyps.append(cand_hyps[0][1]) - hyps = map_batch(hyps, vocabulary, num_processes, False, 0) - if args.mode == 'attention_rescoring': - ctc_score, all_hyps = [], [] - max_len = 0 - for hyps in score_hyps: - cur_len = len(hyps) - if len(hyps) < beam_size: - hyps += (beam_size - cur_len) * [(-float("INF"), (0,))] - cur_ctc_score = [] - for hyp in hyps: - cur_ctc_score.append(hyp[0]) - all_hyps.append(list(hyp[1])) - if len(hyp[1]) > max_len: - max_len = len(hyp[1]) - ctc_score.append(cur_ctc_score) - if args.fp16: - ctc_score = np.array(ctc_score, dtype=np.float16) - else: - ctc_score = np.array(ctc_score, dtype=np.float32) - hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - r_hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32) - k = 0 - for i in 
range(batch_size): - for j in range(beam_size): - cand = all_hyps[k] - l = len(cand) + 2 - hyps_pad_sos_eos[i][j][0:l] = [sos] + cand + [eos] - r_hyps_pad_sos_eos[i][j][0:l] = [sos] + cand[::-1] + [eos] - hyps_lens_sos[i][j] = len(cand) + 1 - k += 1 - decoder_ort_inputs = { - decoder_ort_session.get_inputs()[0].name: encoder_out, - decoder_ort_session.get_inputs()[1].name: encoder_out_lens, - decoder_ort_session.get_inputs()[2].name: hyps_pad_sos_eos, - decoder_ort_session.get_inputs()[3].name: hyps_lens_sos, - decoder_ort_session.get_inputs()[-1].name: ctc_score} - if reverse_weight > 0: - r_hyps_pad_sos_eos_name = decoder_ort_session.get_inputs()[4].name - decoder_ort_inputs[r_hyps_pad_sos_eos_name] = r_hyps_pad_sos_eos - best_index = decoder_ort_session.run(None, decoder_ort_inputs)[0] - best_sents = [] - k = 0 - for idx in best_index: - cur_best_sent = all_hyps[k: k + beam_size][idx] - best_sents.append(cur_best_sent) - k += beam_size - hyps = map_batch(best_sents, vocabulary, num_processes) - - for i, key in enumerate(keys): - content = hyps[i] - logging.info('{} {}'.format(key, content)) - fout.write('{} {}\n'.format(key, content)) - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/train.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/train.py deleted file mode 100644 index 70799b60790b31d73911770891f519f5473e2f4b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/bin/train.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
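# --- Editorial sketch, not part of the original sources in this patch ---
# In the attention_rescoring branch of recognize_onnx_gpu.py above, every
# n-best candidate is packed as [sos] + tokens + [eos] (reversed tokens for
# r_hyps_pad_sos_eos), right-padded with IGNORE_ID to max_len + 2, while
# hyps_lens_sos keeps len(tokens) + 1. A small self-contained illustration
# with toy token ids; the SOS/EOS/IGNORE_ID values here are placeholders,
# not the ids taken from the real dict.
import numpy as np

SOS = EOS = 4232   # placeholder; the script uses len(char_dict) - 1
IGNORE_ID = -1     # placeholder; the script imports IGNORE_ID from wenet.utils.common

def pad_hyps(all_hyps, batch_size, beam_size):
    """Pack flat n-best token lists into the rescoring decoder's padded layout."""
    max_len = max(len(h) for h in all_hyps)
    hyps_pad_sos_eos = np.full((batch_size, beam_size, max_len + 2), IGNORE_ID, dtype=np.int64)
    r_hyps_pad_sos_eos = np.full_like(hyps_pad_sos_eos, IGNORE_ID)
    hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32)
    k = 0
    for i in range(batch_size):
        for j in range(beam_size):
            cand = all_hyps[k]
            hyps_pad_sos_eos[i, j, :len(cand) + 2] = [SOS] + cand + [EOS]
            r_hyps_pad_sos_eos[i, j, :len(cand) + 2] = [SOS] + cand[::-1] + [EOS]
            hyps_lens_sos[i, j] = len(cand) + 1
            k += 1
    return hyps_pad_sos_eos, r_hyps_pad_sos_eos, hyps_lens_sos

# Two utterances with a beam of 2:
#   hyps, r_hyps, lens = pad_hyps([[10, 11], [10], [7, 8, 9], [7]], 2, 2)
#   hyps[0, 0] -> [SOS, 10, 11, EOS, IGNORE_ID];  lens -> [[3, 2], [4, 2]]
# --- End of editorial sketch ---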
- -from __future__ import print_function - -import argparse -import copy -import logging -import os - -import torch -import torch.distributed as dist -import torch.optim as optim -import yaml -from tensorboardX import SummaryWriter -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import (load_checkpoint, save_checkpoint, - load_trained_modules) -from wenet.utils.executor import Executor -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.scheduler import WarmupLR, NoamHoldAnnealing -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--train_data', required=True, help='train data file') - parser.add_argument('--cv_data', required=True, help='cv data file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--model_dir', required=True, help='save model dir') - parser.add_argument('--checkpoint', help='checkpoint model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='tensorboard log dir') - parser.add_argument('--ddp.rank', - dest='rank', - default=0, - type=int, - help='global rank for distributed training') - parser.add_argument('--ddp.world_size', - dest='world_size', - default=-1, - type=int, - help='''number of total processes/gpus for - distributed training''') - parser.add_argument('--ddp.dist_backend', - dest='dist_backend', - default='nccl', - choices=['nccl', 'gloo'], - help='distributed backend') - parser.add_argument('--ddp.init_method', - dest='init_method', - default=None, - help='ddp init method') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for reading') - parser.add_argument('--pin_memory', - action='store_true', - default=False, - help='Use pinned memory buffers used for reading') - parser.add_argument('--use_amp', - action='store_true', - default=False, - help='Use automatic mixed precision training') - parser.add_argument('--fp16_grad_sync', - action='store_true', - default=False, - help='Use fp16 gradient sync for ddp') - parser.add_argument('--cmvn', default=None, help='global cmvn file') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. 
One symbol per line.") - parser.add_argument('--prefetch', - default=100, - type=int, - help='prefetch number') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument("--enc_init", - default=None, - type=str, - help="Pre-trained model to initialize encoder") - parser.add_argument("--enc_init_mods", - default="encoder.", - type=lambda s: [str(mod) for mod in s.split(",") if s != ""], - help="List of encoder modules \ - to initialize ,separated by a comma") - - - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Set random seed - torch.manual_seed(777) - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - distributed = args.world_size > 1 - if distributed: - logging.info('training on multiple gpus, this gpu {}'.format(args.gpu)) - dist.init_process_group(args.dist_backend, - init_method=args.init_method, - world_size=args.world_size, - rank=args.rank) - - symbol_table = read_symbol_table(args.symbol_table) - - train_conf = configs['dataset_conf'] - cv_conf = copy.deepcopy(train_conf) - cv_conf['speed_perturb'] = False - cv_conf['spec_aug'] = False - cv_conf['spec_sub'] = False - cv_conf['spec_trim'] = False - cv_conf['shuffle'] = False - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - train_dataset = Dataset(args.data_type, args.train_data, symbol_table, - train_conf, args.bpe_model, non_lang_syms, True) - cv_dataset = Dataset(args.data_type, - args.cv_data, - symbol_table, - cv_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - train_data_loader = DataLoader(train_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - cv_data_loader = DataLoader(cv_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - - if 'fbank_conf' in configs['dataset_conf']: - input_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - else: - input_dim = configs['dataset_conf']['mfcc_conf']['num_mel_bins'] - vocab_size = len(symbol_table) - - # Save configs to model_dir/train.yaml for inference and export - configs['input_dim'] = input_dim - configs['output_dim'] = vocab_size - configs['cmvn_file'] = args.cmvn - configs['is_json_cmvn'] = True - if args.rank == 0: - saved_config_path = os.path.join(args.model_dir, 'train.yaml') - with open(saved_config_path, 'w') as fout: - data = yaml.dump(configs) - fout.write(data) - - # Init asr model from configs - model = init_model(configs) - print(model) - num_params = sum(p.numel() for p in model.parameters()) - print('the number of model params: {:,d}'.format(num_params)) - - # !!!IMPORTANT!!! 
- # Try to export the model by script, if fails, we should refine - # the code to satisfy the script export requirements - if args.rank == 0: - script_model = torch.jit.script(model) - script_model.save(os.path.join(args.model_dir, 'init.zip')) - executor = Executor() - # If specify checkpoint, load some info from checkpoint - if args.checkpoint is not None: - infos = load_checkpoint(model, args.checkpoint) - elif args.enc_init is not None: - logging.info('load pretrained encoders: {}'.format(args.enc_init)) - infos = load_trained_modules(model, args) - else: - infos = {} - start_epoch = infos.get('epoch', -1) + 1 - cv_loss = infos.get('cv_loss', 0.0) - step = infos.get('step', -1) - - num_epochs = configs.get('max_epoch', 100) - model_dir = args.model_dir - writer = None - if args.rank == 0: - os.makedirs(model_dir, exist_ok=True) - exp_id = os.path.basename(model_dir) - writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id)) - - if distributed: - assert (torch.cuda.is_available()) - # cuda model is required for nn.parallel.DistributedDataParallel - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, find_unused_parameters=True) - device = torch.device("cuda") - if args.fp16_grad_sync: - from torch.distributed.algorithms.ddp_comm_hooks import ( - default as comm_hooks, - ) - model.register_comm_hook( - state=None, hook=comm_hooks.fp16_compress_hook - ) - else: - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - if configs['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['optim_conf']) - elif configs['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['optim']) - if configs['scheduler'] == 'warmuplr': - scheduler = WarmupLR(optimizer, **configs['scheduler_conf']) - elif configs['scheduler'] == 'NoamHoldAnnealing': - scheduler = NoamHoldAnnealing(optimizer, **configs['scheduler_conf']) - else: - raise ValueError("unknown scheduler: " + configs['scheduler']) - - final_epoch = None - configs['rank'] = args.rank - configs['is_distributed'] = distributed - configs['use_amp'] = args.use_amp - if start_epoch == 0 and args.rank == 0: - save_model_path = os.path.join(model_dir, 'init.pt') - save_checkpoint(model, save_model_path) - - # Start training loop - executor.step = step - scheduler.set_step(step) - # used for pytorch amp mixed precision training - scaler = None - if args.use_amp: - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(start_epoch, num_epochs): - train_dataset.set_epoch(epoch) - configs['epoch'] = epoch - lr = optimizer.param_groups[0]['lr'] - logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr)) - executor.train(model, optimizer, scheduler, train_data_loader, device, - writer, configs, scaler) - total_loss, num_seen_utts = executor.cv(model, cv_data_loader, device, - configs) - cv_loss = total_loss / num_seen_utts - - logging.info('Epoch {} CV info cv_loss {}'.format(epoch, cv_loss)) - if args.rank == 0: - save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch)) - save_checkpoint( - model, save_model_path, { - 'epoch': epoch, - 'lr': lr, - 'cv_loss': cv_loss, - 'step': executor.step - }) - writer.add_scalar('epoch/cv_loss', cv_loss, epoch) - writer.add_scalar('epoch/lr', lr, epoch) - final_epoch = epoch - - if final_epoch is not None and args.rank == 0: - final_model_path = os.path.join(model_dir, 'final.pt') 
- os.remove(final_model_path) if os.path.exists(final_model_path) else None - os.symlink('{}.pt'.format(final_epoch), final_model_path) - writer.close() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/dataset/dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/dataset/dataset.py deleted file mode 100644 index 6d799b5b5aea2d34546484b3fed5d45e2d5b6aa6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/dataset/dataset.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import torch -import torch.distributed as dist -from torch.utils.data import IterableDataset - -import wenet.dataset.processor as processor -from wenet.utils.file_utils import read_lists - - -class Processor(IterableDataset): - def __init__(self, source, f, *args, **kw): - assert callable(f) - self.source = source - self.f = f - self.args = args - self.kw = kw - - def set_epoch(self, epoch): - self.source.set_epoch(epoch) - - def __iter__(self): - """ Return an iterator over the source dataset processed by the - given processor. 
- """ - assert self.source is not None - assert callable(self.f) - return self.f(iter(self.source), *self.args, **self.kw) - - def apply(self, f): - assert callable(f) - return Processor(self, f, *self.args, **self.kw) - - -class DistributedSampler: - def __init__(self, shuffle=True, partition=True): - self.epoch = -1 - self.update() - self.shuffle = shuffle - self.partition = partition - - def update(self): - assert dist.is_available() - if dist.is_initialized(): - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() - else: - self.rank = 0 - self.world_size = 1 - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: - self.worker_id = 0 - self.num_workers = 1 - else: - self.worker_id = worker_info.id - self.num_workers = worker_info.num_workers - return dict(rank=self.rank, - world_size=self.world_size, - worker_id=self.worker_id, - num_workers=self.num_workers) - - def set_epoch(self, epoch): - self.epoch = epoch - - def sample(self, data): - """ Sample data according to rank/world_size/num_workers - - Args: - data(List): input data list - - Returns: - List: data list after sample - """ - data = list(range(len(data))) - # TODO(Binbin Zhang): fix this - # We can not handle uneven data for CV on DDP, so we don't - # sample data by rank, that means every GPU gets the same - # and all the CV data - if self.partition: - if self.shuffle: - random.Random(self.epoch).shuffle(data) - data = data[self.rank::self.world_size] - data = data[self.worker_id::self.num_workers] - return data - - -class DataList(IterableDataset): - def __init__(self, lists, shuffle=True, partition=True): - self.lists = lists - self.sampler = DistributedSampler(shuffle, partition) - - def set_epoch(self, epoch): - self.sampler.set_epoch(epoch) - - def __iter__(self): - sampler_info = self.sampler.update() - indexes = self.sampler.sample(self.lists) - for index in indexes: - # yield dict(src=src) - data = dict(src=self.lists[index]) - data.update(sampler_info) - yield data - - -def Dataset(data_type, - data_list_file, - symbol_table, - conf, - bpe_model=None, - non_lang_syms=None, - partition=True): - """ Construct dataset from arguments - - We have two shuffle stage in the Dataset. The first is global - shuffle at shards tar/raw file level. The second is global shuffle - at training samples level. 
- - Args: - data_type(str): raw/shard - bpe_model(str): model for english bpe part - partition(bool): whether to do data partition in terms of rank - """ - assert data_type in ['raw', 'shard'] - lists = read_lists(data_list_file) - shuffle = conf.get('shuffle', True) - dataset = DataList(lists, shuffle=shuffle, partition=partition) - if data_type == 'shard': - dataset = Processor(dataset, processor.url_opener) - dataset = Processor(dataset, processor.tar_file_and_group) - else: - dataset = Processor(dataset, processor.parse_raw) - - dataset = Processor(dataset, processor.tokenize, symbol_table, bpe_model, - non_lang_syms, conf.get('split_with_space', False)) - filter_conf = conf.get('filter_conf', {}) - dataset = Processor(dataset, processor.filter, **filter_conf) - - resample_conf = conf.get('resample_conf', {}) - dataset = Processor(dataset, processor.resample, **resample_conf) - - speed_perturb = conf.get('speed_perturb', False) - if speed_perturb: - dataset = Processor(dataset, processor.speed_perturb) - - feats_type = conf.get('feats_type', 'fbank') - assert feats_type in ['fbank', 'mfcc'] - if feats_type == 'fbank': - fbank_conf = conf.get('fbank_conf', {}) - dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) - elif feats_type == 'mfcc': - mfcc_conf = conf.get('mfcc_conf', {}) - dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf) - - spec_aug = conf.get('spec_aug', True) - spec_sub = conf.get('spec_sub', False) - spec_trim = conf.get('spec_trim', False) - if spec_aug: - spec_aug_conf = conf.get('spec_aug_conf', {}) - dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) - if spec_sub: - spec_sub_conf = conf.get('spec_sub_conf', {}) - dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf) - if spec_trim: - spec_trim_conf = conf.get('spec_trim_conf', {}) - dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf) - - if shuffle: - shuffle_conf = conf.get('shuffle_conf', {}) - dataset = Processor(dataset, processor.shuffle, **shuffle_conf) - - sort = conf.get('sort', True) - if sort: - sort_conf = conf.get('sort_conf', {}) - dataset = Processor(dataset, processor.sort, **sort_conf) - - batch_conf = conf.get('batch_conf', {}) - dataset = Processor(dataset, processor.batch, **batch_conf) - dataset = Processor(dataset, processor.padding) - return dataset diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/dataset/kaldi_io.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/dataset/kaldi_io.py deleted file mode 100644 index c9bef293c93d882147bb5b738e1fc49a7a19a484..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/dataset/kaldi_io.py +++ /dev/null @@ -1,666 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! 
To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. - """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. 
- """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int_scp(file_or_fd): - """ generator(key,vec) = read_vec_int_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_int_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:vec for key,mat in kaldi_io.read_vec_int_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_int(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! 
- if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Mapping for percentiles in col-headers, - def uint16_to_float(value, min, range): - return np.float32(min + range * 1.52590218966964e-05 * value) - - # Mapping for matrix elements, - def uint8_to_float_v2(vec, p0, p25, p75, p100): - # Split the vector by masks, - mask_0_64 = (vec <= 64); - mask_193_255 = (vec > 192); - mask_65_192 = (~(mask_0_64 | mask_193_255)); - # Sanity check (useful but slow...), - # assert(len(vec) == np.sum(np.hstack([mask_0_64,mask_65_192,mask_193_255]))) - # assert(len(vec) == np.sum(np.any([mask_0_64,mask_65_192,mask_193_255], axis=0))) - # Build the float vector, - ans = np.empty(len(vec), dtype='float32') - ans[mask_0_64] = p0 + (p25 - p0) / 64. * vec[mask_0_64] - ans[mask_65_192] = p25 + (p75 - p25) / 128. * (vec[mask_65_192] - 64) - ans[mask_193_255] = p75 + (p100 - p75) / 63. * (vec[mask_193_255] - 192) - return ans - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.empty((cols,rows), dtype='float32') - for i, col_header in enumerate(col_headers): - col_header_flt = [ uint16_to_float(percentile, globmin, globrange) for percentile in col_header ] - mat[i] = uint8_to_float_v2(data[i], *col_header_flt) - - return mat.T # transpose! col-major -> row-major, - -def write_ark_scp(key, mat, ark_fout, scp_out): - mat_offset = write_mat(ark_fout, mat, key) - scp_line = '{}\t{}:{}'.format(key, ark_fout.name, mat_offset) - scp_out.write(scp_line) - scp_out.write('\n') - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. 
- - Example of writing single matrix: - kaldi_io.write_mat(filename, mat) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,mat in dict.iteritems(): - kaldi_io.write_mat(f, mat, key=key) - """ - mat_offset = 0 - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - mat_offset = fd.tell() - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if m.dtype == 'float32': fd.write('FM '.encode()) - elif m.dtype == 'float64': fd.write('DM '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) - # Dims, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols - # Data, - fd.write(m.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - return mat_offset - -################################################# -# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) -# Corresponds to: vector > > -# - outer vector: time axis -# - inner vector: records at the time -# - tuple: int = index, float = value -# - -def read_cnet_ark(file_or_fd): - """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ - return read_post_ark(file_or_fd) - -def read_post_ark(file_or_fd): - """ generator(key,vec>) = read_post_ark(file) - Returns generator of (key,posterior) tuples, read from ark file. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Iterate the ark: - for key,post in kaldi_io.read_post_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - post = read_post(fd) - yield key, post - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_post(file_or_fd): - """ [post] = read_post(file_or_fd) - Reads single kaldi 'Posterior' in binary format. - - The 'Posterior' is C++ type 'vector > >', - the outer-vector is usually time axis, inner-vector are the records - at given time, and the tuple is composed of an 'index' (integer) - and a 'float-value'. The 'float-value' can represent a probability - or any other numeric value. - - Returns vector of vectors of tuples. - """ - fd = open_or_fd(file_or_fd) - ans=[] - binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - # Loop over 'outer-vector', - for i in range(outer_vec_size): - assert(fd.read(1).decode() == '\4'); # int-size - inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) - data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) - assert(data[0]['size_idx'] == 4) - assert(data[0]['size_post'] == 4) - ans.append(data[['idx','post']].tolist()) - - if fd is not file_or_fd: fd.close() - return ans - - -################################################# -# Kaldi Confusion Network bin begin/end times, -# (kaldi stores CNs time info separately from the Posterior). 
-# - -def read_cntime_ark(file_or_fd): - """ generator(key,vec>) = read_cntime_ark(file_or_fd) - Returns generator of (key,cntime) tuples, read from ark file. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Iterate the ark: - for key,time in kaldi_io.read_cntime_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:time for key,time in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - cntime = read_cntime(fd) - yield key, cntime - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_cntime(file_or_fd): - """ [cntime] = read_cntime(file_or_fd) - Reads single kaldi 'Confusion Network time info', in binary format: - C++ type: vector >. - (begin/end times of bins at the confusion network). - - Binary layout is ' ...' - - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Returns vector of tuples. - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary - - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) - assert(data[0]['size_beg'] == 4) - assert(data[0]['size_end'] == 4) - ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), - - if fd is not file_or_fd : fd.close() - return ans - - -################################################# -# Segments related, -# - -# Segments as 'Bool vectors' can be handy, -# - for 'superposing' the segmentations, -# - for frame-selection in Speaker-ID experiments, -def read_segments_as_bool_vec(segments_file): - """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) - using kaldi 'segments' file for 1 wav, format : ' ' - - t-beg, t-end is in seconds, - - assumed 100 frames/second, - """ - segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) - # Sanity checks, - assert(len(segs) > 0) # empty segmentation is an error, - assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, - # Convert time to frame-indexes, - start = np.rint([100 * rec[2] for rec in segs]).astype(int) - end = np.rint([100 * rec[3] for rec in segs]).astype(int) - # Taken from 'read_lab_to_bool_vec', htk.py, - frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], - np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) - assert np.sum(end-start) == np.sum(frms) - return frms - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/dataset/processor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/dataset/processor.py deleted file mode 100644 index b4bd07ce674eb3288cd1b13a09085eec48d40845..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/dataset/processor.py +++ /dev/null @@ -1,660 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import json -import random -import re -import tarfile -from subprocess import PIPE, Popen -from urllib.parse import urlparse - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.nn.utils.rnn import pad_sequence - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def url_opener(data): - """ Give url or local file, return file descriptor - Inplace operation. - - Args: - data(Iterable[str]): url or local file list - - Returns: - Iterable[{src, stream}] - """ - for sample in data: - assert 'src' in sample - # TODO(Binbin Zhang): support HTTP - url = sample['src'] - try: - pr = urlparse(url) - # local file - if pr.scheme == '' or pr.scheme == 'file': - stream = open(url, 'rb') - # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP - else: - cmd = f'wget -q -O - {url}' - process = Popen(cmd, shell=True, stdout=PIPE) - sample.update(process=process) - stream = process.stdout - sample.update(stream=stream) - yield sample - except Exception as ex: - logging.warning('Failed to open {}'.format(url)) - - -def tar_file_and_group(data): - """ Expand a stream of open tar files into a stream of tar file contents. - And groups the file with same prefix - - Args: - data: Iterable[{src, stream}] - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'stream' in sample - stream = tarfile.open(fileobj=sample['stream'], mode="r|*") - prev_prefix = None - example = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - example['key'] = prev_prefix - if valid: - yield example - example = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - example['txt'] = file_obj.read().decode('utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load(file_obj) - example['wav'] = waveform - example['sample_rate'] = sample_rate - else: - example[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning('error to parse {}'.format(name)) - prev_prefix = prefix - if prev_prefix is not None: - example['key'] = prev_prefix - yield example - stream.close() - if 'process' in sample: - sample['process'].communicate() - sample['stream'].close() - - -def parse_raw(data): - """ Parse key/wav/txt from json line - - Args: - data: Iterable[str], str is a json line has key/wav/txt - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'src' in sample - json_line = sample['src'] - obj = json.loads(json_line) - assert 'key' in obj - assert 'wav' in obj - assert 'txt' in obj - key = obj['key'] - wav_file = obj['wav'] - txt = obj['txt'] - try: - if 'start' in obj: - assert 'end' in obj - sample_rate = torchaudio.backend.sox_io_backend.info( - wav_file).sample_rate - start_frame = int(obj['start'] * sample_rate) - end_frame = int(obj['end'] * sample_rate) - waveform, _ = torchaudio.backend.sox_io_backend.load( - 
filepath=wav_file, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(wav_file) - example = dict(key=key, - txt=txt, - wav=waveform, - sample_rate=sample_rate) - yield example - except Exception as ex: - logging.warning('Failed to read {}'.format(wav_file)) - - -def filter(data, - max_length=10240, - min_length=10, - token_max_length=200, - token_min_length=1, - min_output_input_ratio=0.0005, - max_output_input_ratio=1): - """ Filter sample according to feature and label length - Inplace operation. - - Args:: - data: Iterable[{key, wav, label, sample_rate}] - max_length: drop utterance which is greater than max_length(10ms) - min_length: drop utterance which is less than min_length(10ms) - token_max_length: drop utterance which is greater than - token_max_length, especially when use char unit for - english modeling - token_min_length: drop utterance which is - less than token_max_length - min_output_input_ratio: minimal ration of - token_length / feats_length(10ms) - max_output_input_ratio: maximum ration of - token_length / feats_length(10ms) - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'label' in sample - # sample['wav'] is torch.Tensor, we have 100 frames every second - num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100 - if num_frames < min_length: - continue - if num_frames > max_length: - continue - if len(sample['label']) < token_min_length: - continue - if len(sample['label']) > token_max_length: - continue - if num_frames != 0: - if len(sample['label']) / num_frames < min_output_input_ratio: - continue - if len(sample['label']) / num_frames > max_output_input_ratio: - continue - yield sample - - -def resample(data, resample_rate=16000): - """ Resample data. - Inplace operation. - - Args: - data: Iterable[{key, wav, label, sample_rate}] - resample_rate: target resample rate - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - if sample_rate != resample_rate: - sample['sample_rate'] = resample_rate - sample['wav'] = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - yield sample - - -def speed_perturb(data, speeds=None): - """ Apply speed perturb to the data. - Inplace operation. 
- - Args: - data: Iterable[{key, wav, label, sample_rate}] - speeds(List[float]): optional speed - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - if speeds is None: - speeds = [0.9, 1.0, 1.1] - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - speed = random.choice(speeds) - if speed != 1.0: - wav, _ = torchaudio.sox_effects.apply_effects_tensor( - waveform, sample_rate, - [['speed', str(speed)], ['rate', str(sample_rate)]]) - sample['wav'] = wav - - yield sample - - -def compute_fbank(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0): - """ Extract fbank - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - energy_floor=0.0, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def compute_mfcc(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0, - num_ceps=40, - high_freq=0.0, - low_freq=20.0): - """ Extract mfcc - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.mfcc(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - num_ceps=num_ceps, - high_freq=high_freq, - low_freq=low_freq, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def __tokenize_by_bpe_model(sp, txt): - tokens = [] - # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - pattern = re.compile(r'([\u4e00-\u9fff])') - # Example: - # txt = "你好 ITS'S OKAY 的" - # chars = ["你", "好", " ITS'S OKAY ", "的"] - chars = pattern.split(txt.upper()) - mix_chars = [w for w in chars if len(w.strip()) > 0] - for ch_or_w in mix_chars: - # ch_or_w is a single CJK charater(i.e., "你"), do nothing. - if pattern.fullmatch(ch_or_w) is not None: - tokens.append(ch_or_w) - # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), - # encode ch_or_w using bpe_model. 
- else: - for p in sp.encode_as_pieces(ch_or_w): - tokens.append(p) - - return tokens - - -def tokenize(data, - symbol_table, - bpe_model=None, - non_lang_syms=None, - split_with_space=False): - """ Decode text to chars or BPE - Inplace operation - - Args: - data: Iterable[{key, wav, txt, sample_rate}] - - Returns: - Iterable[{key, wav, txt, tokens, label, sample_rate}] - """ - if non_lang_syms is not None: - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - else: - non_lang_syms = {} - non_lang_syms_pattern = None - - if bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(bpe_model) - else: - sp = None - - for sample in data: - assert 'txt' in sample - txt = sample['txt'].strip() - if non_lang_syms_pattern is not None: - parts = non_lang_syms_pattern.split(txt.upper()) - parts = [w for w in parts if len(w.strip()) > 0] - else: - parts = [txt] - - label = [] - tokens = [] - for part in parts: - if part in non_lang_syms: - tokens.append(part) - else: - if bpe_model is not None: - tokens.extend(__tokenize_by_bpe_model(sp, part)) - else: - if split_with_space: - part = part.split(" ") - for ch in part: - if ch == ' ': - ch = "▁" - tokens.append(ch) - - for ch in tokens: - if ch in symbol_table: - label.append(symbol_table[ch]) - elif '' in symbol_table: - label.append(symbol_table['']) - - sample['tokens'] = tokens - sample['label'] = label - yield sample - - -def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80): - """ Do spec augmentation - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - num_t_mask: number of time mask to apply - num_f_mask: number of freq mask to apply - max_t: max width of time mask - max_f: max width of freq mask - max_w: max width of time warp - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - max_freq = y.size(1) - # time mask - for i in range(num_t_mask): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - y[start:end, :] = 0 - # freq mask - for i in range(num_f_mask): - start = random.randint(0, max_freq - 1) - length = random.randint(1, max_f) - end = min(max_freq, start + length) - y[:, start:end] = 0 - sample['feat'] = y - yield sample - - -def spec_sub(data, max_t=20, num_t_sub=3): - """ Do spec substitute - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of time substitute - num_t_sub: number of time substitute to apply - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - for i in range(num_t_sub): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - # only substitute the earlier time chosen randomly for current time - pos = random.randint(0, start) - y[start:end, :] = x[start - pos:end - pos, :] - sample['feat'] = y - yield sample - - -def spec_trim(data, max_t=20): - """ Trim tailing frames. Inplace operation. 
- ref: TrimTail [https://arxiv.org/abs/2211.00522] - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of length trimming - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - max_frames = x.size(0) - length = random.randint(1, max_t) - if length < max_frames / 2: - y = x.clone().detach()[:max_frames - length] - sample['feat'] = y - yield sample - - -def shuffle(data, shuffle_size=10000): - """ Local shuffle the data - - Args: - data: Iterable[{key, feat, label}] - shuffle_size: buffer size for shuffle - - Returns: - Iterable[{key, feat, label}] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= shuffle_size: - random.shuffle(buf) - for x in buf: - yield x - buf = [] - # The sample left over - random.shuffle(buf) - for x in buf: - yield x - - -def sort(data, sort_size=500): - """ Sort the data by feature length. - Sort is used after shuffle and before batch, so we can group - utts with similar lengths into a batch, and `sort_size` should - be less than `shuffle_size` - - Args: - data: Iterable[{key, feat, label}] - sort_size: buffer size for sort - - Returns: - Iterable[{key, feat, label}] - """ - - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= sort_size: - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - buf = [] - # The sample left over - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - - -def static_batch(data, batch_size=16): - """ Static batch the data by `batch_size` - - Args: - data: Iterable[{key, feat, label}] - batch_size: batch size - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= batch_size: - yield buf - buf = [] - if len(buf) > 0: - yield buf - - -def dynamic_batch(data, max_frames_in_batch=12000): - """ Dynamic batch the data until the total frames in batch - reach `max_frames_in_batch` - - Args: - data: Iterable[{key, feat, label}] - max_frames_in_batch: max_frames in one batch - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - longest_frames = 0 - for sample in data: - assert 'feat' in sample - assert isinstance(sample['feat'], torch.Tensor) - new_sample_frames = sample['feat'].size(0) - longest_frames = max(longest_frames, new_sample_frames) - frames_after_padding = longest_frames * (len(buf) + 1) - if frames_after_padding > max_frames_in_batch: - yield buf - buf = [sample] - longest_frames = new_sample_frames - else: - buf.append(sample) - if len(buf) > 0: - yield buf - - -def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000): - """ Wrapper for static/dynamic batch - """ - if batch_type == 'static': - return static_batch(data, batch_size) - elif batch_type == 'dynamic': - return dynamic_batch(data, max_frames_in_batch) - else: - logging.fatal('Unsupported batch type {}'.format(batch_type)) - - -def padding(data): - """ Padding the data into training data - - Args: - data: Iterable[List[{key, feat, label}]] - - Returns: - Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] - """ - for sample in data: - assert isinstance(sample, list) - feats_length = torch.tensor([x['feat'].size(0) for x in sample], - dtype=torch.int32) - order = torch.argsort(feats_length, descending=True) - feats_lengths = torch.tensor( - [sample[i]['feat'].size(0) for i in order], dtype=torch.int32) - sorted_feats = [sample[i]['feat'] for i in order] - sorted_keys 
= [sample[i]['key'] for i in order] - sorted_labels = [ - torch.tensor(sample[i]['label'], dtype=torch.int64) for i in order - ] - label_lengths = torch.tensor([x.size(0) for x in sorted_labels], - dtype=torch.int32) - - padded_feats = pad_sequence(sorted_feats, - batch_first=True, - padding_value=0) - - pad = (0, 0, 0, 0) - seq_len= padded_feats.shape[1] - if seq_len < 384: - pad = (0, 0, 0, 384-seq_len) - elif seq_len < 512: - pad = (0, 0, 0, 512-seq_len) - elif seq_len < 640: - pad = (0, 0, 0, 640-seq_len) - elif seq_len < 768: - pad = (0, 0, 0, 768-seq_len) - elif seq_len < 896: - pad = (0, 0, 0, 896-seq_len) - elif seq_len < 1024: - pad = (0, 0, 0, 1024-seq_len) - elif seq_len < 1280: - pad = (0, 0, 0, 1280-seq_len) - padded_feats = torch.nn.functional.pad(padded_feats, pad, mode='constant', value=0) - padding_labels = pad_sequence(sorted_labels, - batch_first=True, - padding_value=-1) - - yield (sorted_keys, padded_feats, padding_labels, feats_lengths, - label_lengths) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/dataset/wav_distortion.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/dataset/wav_distortion.py deleted file mode 100644 index 2917d3cc6cfb801935cb0885d0c42cd88f1833b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/dataset/wav_distortion.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import random -import math - -import torchaudio -import torch -torchaudio.set_audio_backend("sox_io") - - -def db2amp(db): - return pow(10, db / 20) - -def amp2db(amp): - return 20 * math.log10(amp) - -def make_poly_distortion(conf): - """Generate a db-domain ploynomial distortion function - - f(x) = a * x^m * (1-x)^n + x - - Args: - conf: a dict {'a': #int, 'm': #int, 'n': #int} - - Returns: - The ploynomial function, which could be applied on - a float amplitude value - """ - a = conf['a'] - m = conf['m'] - n = conf['n'] - - def poly_distortion(x): - abs_x = abs(x) - if abs_x < 0.000001: - x = x - else: - db_norm = amp2db(abs_x) / 100 + 1 - if db_norm < 0: - db_norm = 0 - db_norm = a * pow(db_norm, m) * pow((1 - db_norm), n) + db_norm - if db_norm > 1: - db_norm = 1 - db = (db_norm - 1) * 100 - amp = db2amp(db) - if amp >= 0.9997: - amp = 0.9997 - if x > 0: - x = amp - else: - x = -amp - return x - return poly_distortion - -def make_quad_distortion(): - return make_poly_distortion({'a' : 1, 'm' : 1, 'n' : 1}) - -# the amplitude are set to max for all non-zero point -def make_max_distortion(conf): - """Generate a max distortion function - - Args: - conf: a dict {'max_db': float } - 'max_db': the maxium value. 
- - Returns: - The max function, which could be applied on - a float amplitude value - """ - max_db = conf['max_db'] - if max_db: - max_amp = db2amp(max_db) # < 0.997 - else: - max_amp = 0.997 - - def max_distortion(x): - if x > 0: - x = max_amp - elif x < 0: - x = -max_amp - else: - x = 0.0 - return x - return max_distortion - - - -def make_amp_mask(db_mask=None): - """Get a amplitude domain mask from db domain mask - - Args: - db_mask: Optional. A list of tuple. if None, using default value. - - Returns: - A list of tuple. The amplitude domain mask - """ - if db_mask is None: - db_mask = [(-110, -95), (-90, -80), (-65, -60), (-50, -30), (-15, 0)] - amp_mask = [(db2amp(db[0]), db2amp(db[1])) for db in db_mask] - return amp_mask - -default_mask = make_amp_mask() - - -def generate_amp_mask(mask_num): - """Generate amplitude domain mask randomly in [-100db, 0db] - - Args: - mask_num: the slot number of the mask - - Returns: - A list of tuple. each tuple defines a slot. - e.g. [(-100, -80), (-65, -60), (-50, -30), (-15, 0)] - for #mask_num = 4 - """ - a = [0] * 2 * mask_num - a[0] = 0 - m = [] - for i in range(1, 2 * mask_num): - a[i] = a[i - 1] + random.uniform(0.5, 1) - max_val = a[2 * mask_num - 1] - for i in range(0, mask_num): - l = ((a[2 * i] - max_val) / max_val) * 100 - r = ((a[2 * i + 1] - max_val) / max_val) * 100 - m.append((l, r)) - return make_amp_mask(m) - - -def make_fence_distortion(conf): - """Generate a fence distortion function - - In this fence-like shape function, the values in mask slots are - set to maxium, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': int,'max_db': float } - 'mask_number': the slot number in mask. - 'max_db': the maxium value. - - Returns: - The fence function, which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - max_db = conf['max_db'] - max_amp = db2amp(max_db) # 0.997 - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def fence_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - return x - - return fence_distortion - -# -def make_jag_distortion(conf): - """Generate a jag distortion function - - In this jag-like shape function, the values in mask slots are - not changed, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': #int} - 'mask_number': the slot number in mask. 
- - Returns: - The jag function,which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def jag_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - return x - - return jag_distortion - -# gaining 20db means amp = amp * 10 -# gaining -20db means amp = amp / 10 -def make_gain_db(conf): - """Generate a db domain gain function - - Args: - conf: a dict {'db': #float} - 'db': the gaining value - - Returns: - The db gain function, which could be applied on - a float amplitude value - """ - db = conf['db'] - - def gain_db(x): - return min(0.997, x * pow(10, db / 20)) - - return gain_db - - -def distort(x, func, rate=0.8): - """Distort a waveform in sample point level - - Args: - x: the origin wavefrom - func: the distort function - rate: sample point-level distort probability - - Returns: - the distorted waveform - """ - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - x[0][i] = func(float(x[0][i])) - return x - -def distort_chain(x, funcs, rate=0.8): - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - for func in funcs: - x[0][i] = func(float(x[0][i])) - return x - -# x is numpy -def distort_wav_conf(x, distort_type, distort_conf, rate=0.1): - if distort_type == 'gain_db': - gain_db = make_gain_db(distort_conf) - x = distort(x, gain_db) - elif distort_type == 'max_distortion': - max_distortion = make_max_distortion(distort_conf) - x = distort(x, max_distortion, rate=rate) - elif distort_type == 'fence_distortion': - fence_distortion = make_fence_distortion(distort_conf) - x = distort(x, fence_distortion, rate=rate) - elif distort_type == 'jag_distortion': - jag_distortion = make_jag_distortion(distort_conf) - x = distort(x, jag_distortion, rate=rate) - elif distort_type == 'poly_distortion': - poly_distortion = make_poly_distortion(distort_conf) - x = distort(x, poly_distortion, rate=rate) - elif distort_type == 'quad_distortion': - quad_distortion = make_quad_distortion() - x = distort(x, quad_distortion, rate=rate) - elif distort_type == 'none_distortion': - pass - else: - print('unsupport type') - return x - -def distort_wav_conf_and_save(distort_type, distort_conf, rate, wav_in, wav_out): - x, sr = torchaudio.load(wav_in) - x = x.detach().numpy() - out = distort_wav_conf(x, distort_type, distort_conf, rate) - torchaudio.save(wav_out, torch.from_numpy(out), sr) - -if __name__ == "__main__": - distort_type = sys.argv[1] - wav_in = sys.argv[2] - wav_out = sys.argv[3] - conf = None - rate = 0.1 - if distort_type == 'new_jag_distortion': - conf = {'mask_number' : 4} - elif distort_type == 'new_fence_distortion': - conf = {'mask_number' : 1, 'max_db' : -30} - elif distort_type == 'poly_distortion': - conf = {'a' : 4, 'm' : 2, "n" : 2} - distort_wav_conf_and_save(distort_type, conf, rate, wav_in, wav_out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/efficient_conformer/attention.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/efficient_conformer/attention.py deleted file mode 100644 index 475131b15af92ffcaf91ad5e2e30d114d4d1a2a3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/efficient_conformer/attention.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple, Optional - -import torch -from torch import nn -import torch.nn.functional as F -from wenet.transformer.attention import MultiHeadedAttention - - -class GroupedRelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: - https://arxiv.org/abs/1901.02860 - https://arxiv.org/abs/2109.01163 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate, group_size=3): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - self.group_size = group_size - self.d_k = n_feat // n_head # for GroupedAttention - self.n_feat = n_feat - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. 
- """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def pad4group(self, Q, K, V, P, mask, group_size: int = 3): - """ - q: (#batch, time1, size) -> (#batch, head, time1, size/head) - k,v: (#batch, time2, size) -> (#batch, head, time2, size/head) - p: (#batch, time2, size) - """ - # Compute Overflows - overflow_Q = Q.size(2) % group_size - overflow_KV = K.size(2) % group_size - - # if-else for ONNX export - # 0 // 0.00000000000000001 = 0 - # 1 // 1.00000000000000001 = 1 - padding_Q = (group_size - overflow_Q) * int( - overflow_Q // (overflow_Q + 0.00000000000000001)) - padding_KV = (group_size - overflow_KV) * int( - overflow_KV // (overflow_KV + 0.00000000000000001)) - - batch_size, _, seq_len_KV, _ = K.size() - - # Input Padding (B, T, D) -> (B, T + P, D) - Q = F.pad(Q, (0, 0, 0, padding_Q), value=0.0) - K = F.pad(K, (0, 0, 0, padding_KV), value=0.0) - V = F.pad(V, (0, 0, 0, padding_KV), value=0.0) - - if mask is not None and mask.size(2) > 0 : # time2 > 0: - mask = mask[:, ::group_size, ::group_size] - - Q = Q.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - K = K.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - V = V.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - # process pos_emb - P_batch_size = P.size(0) - overflow_P = P.size(1) % group_size - padding_P = group_size - overflow_P if overflow_P else 0 - P = F.pad(P, (0, 0, 0, padding_P), value=0.0) - P = P.view(P_batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - return Q, K, V, P, mask, padding_Q - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - padding_q: Optional[int] = None - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - padding_q : for GroupedAttention in efficent conformer - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. 
jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - - # n_feat!=h*d_k may be happened in GroupAttention - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.n_feat) - ) # (batch, time1, d_model) - if padding_q is not None: - # for GroupedAttention in efficent conformer - x = x[:, :x.size(1) - padding_q] - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q = self.linear_q(query) - k = self.linear_k(key) # (#batch, time2, size) - v = self.linear_v(value) - p = self.linear_pos(pos_emb) # (#batch, time2, size) - - batch_size, seq_len_KV, _ = k.size() # seq_len_KV = time2 - - # (#batch, time2, size) -> (#batch, head, time2, size/head) - q = q.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - k = k.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - v = v.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - if cache.size(0) > 0: - # use attention cache - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - new_cache = torch.cat((k, v), dim=-1) - - # May be k and p does not match. eg. time2=18+18/2=27 > mask=36/2=18 - if mask is not None and mask.size(2) > 0: - time2 = mask.size(2) - k = k[:, :, -time2:, :] - v = v[:, :, -time2:, :] - - # q k v p: (batch, head, time1, d_k) - q, k, v, p, mask, padding_q = self.pad4group(q, k, v, p, mask, self.group_size) - - # q_with_bias_u & q_with_bias_v = (batch, head, time1, d_k) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. 
- # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k * self.group_size) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask, padding_q), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/efficient_conformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/efficient_conformer/convolution.py deleted file mode 100644 index 52d6c1c14c0812ab3957a60a135f644833c2ad95..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/efficient_conformer/convolution.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - stride: int = 1): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - stride (int): Stride Convolution, for efficient Conformer - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. 
- # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=stride, # for depthwise_conv in StrideConv - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - self.stride = stride - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - # When export ONNX,the first cache is not None but all-zero, - # cause shape error in residual block, - # eg. cache14 + x9 = 23, 23-7+1=17 != 9 - cache = cache[:, :, -self.lorder:] - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is requried, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - if mask_pad.size(2) != x.size(2): - mask_pad = mask_pad[:, :, ::self.stride] - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/efficient_conformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/efficient_conformer/encoder.py deleted file mode 100644 index dbd37f53cac86be851e2bb194354fd07eb271f11..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/efficient_conformer/encoder.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from EfficientConformer(https://github.com/burchim/EfficientConformer) -# Paper(https://arxiv.org/abs/2109.01163) - -"""Encoder definition.""" -from typing import Tuple, Optional, List, Union - -import torch -import logging -from typeguard import check_argument_types -import torch.nn.functional as F - -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.encoder_layer import ConformerEncoderLayer - -from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 -from wenet.efficient_conformer.convolution import ConvolutionModule -from wenet.efficient_conformer.attention import GroupedRelPositionMultiHeadedAttention -from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer - -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class EfficientConformerEncoder(torch.nn.Module): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - macaron_style: bool = True, - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - stride_layer_idx: Optional[Union[int, List[int]]] = 3, - stride: Optional[Union[int, List[int]]] = 2, - group_layer_idx: Optional[Union[int, List[int], tuple]] = (0, 1, 2, 3), - group_size: int = 3, - stride_kernel: bool = True, - **kwargs - ): - """Construct Efficient Conformer Encoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - macaron_style (bool): Whether to use macaron style for - positionwise layer. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. - stride_layer_idx (list): layer id with StrideConv, start from 0 - stride (list): stride size of each StrideConv in efficient conformer - group_layer_idx (list): layer id with GroupedAttention, start from 0 - group_size (int): group size of every GroupedAttention layer - stride_kernel (bool): default True. True: recompute cnn kernels with stride. 
- """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d2": - subsampling_class = Conv2dSubsampling2 - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - logging.info(f"input_layer = {input_layer}, " - f"subsampling_class = {subsampling_class}") - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - self.input_layer = input_layer - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - activation = get_activation(activation_type) - self.num_blocks = num_blocks - self.attention_heads = attention_heads - self.cnn_module_kernel = cnn_module_kernel - self.global_chunk_size = 0 - self.chunk_feature_map = 0 - - # efficient conformer configs - self.stride_layer_idx = [stride_layer_idx] \ - if type(stride_layer_idx) == int else stride_layer_idx - self.stride = [stride] \ - if type(stride) == int else stride - self.group_layer_idx = [group_layer_idx] \ - if type(group_layer_idx) == int else group_layer_idx - self.grouped_size = group_size # group size of every GroupedAttention layer - - assert len(self.stride) == len(self.stride_layer_idx) - self.cnn_module_kernels = [cnn_module_kernel] # kernel size of each StridedConv - for i in self.stride: - if stride_kernel: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1] // i) - else: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1]) - - logging.info(f"stride_layer_idx= {self.stride_layer_idx}, " - f"stride = {self.stride}, " - f"cnn_module_kernel = {self.cnn_module_kernels}, " - f"group_layer_idx = {self.group_layer_idx}, " - f"grouped_size = {self.grouped_size}") - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - - # encoder definition - index = 0 - layers = [] - for i in range(num_blocks): - # self-attention module definition - if i in self.group_layer_idx: - encoder_selfattn_layer = GroupedRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - self.grouped_size) - else: - if pos_enc_layer_type == "no_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate) - - # conformer module definition - if i in self.stride_layer_idx: - # conformer block with downsampling - convolution_layer_args_stride = ( - output_size, 
self.cnn_module_kernels[index], activation, - cnn_module_norm, causal, True, self.stride[index]) - layers.append(StrideConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_stride) if use_cnn_module else None, - torch.nn.AvgPool1d( - kernel_size=self.stride[index], stride=self.stride[index], - padding=0, ceil_mode=True, - count_include_pad=False), # pointwise_conv_layer - dropout_rate, - normalize_before, - concat_after, - )) - index = index + 1 - else: - # conformer block - convolution_layer_args_normal = ( - output_size, self.cnn_module_kernels[index], activation, - cnn_module_norm, causal) - layers.append(ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_normal) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - )) - - self.encoders = torch.nn.ModuleList(layers) - - def set_global_chunk_size(self, chunk_size): - """Used in ONNX export. - """ - logging.info(f"set global chunk size: {chunk_size}, default is 0.") - self.global_chunk_size = chunk_size - if self.embed.subsampling_rate == 2: - self.chunk_feature_map = 2 * self.global_chunk_size + 1 - elif self.embed.subsampling_rate == 6: - self.chunk_feature_map = 6 * self.global_chunk_size + 5 - elif self.embed.subsampling_rate == 8: - self.chunk_feature_map = 8 * self.global_chunk_size + 7 - else: - self.chunk_feature_map = 4 * self.global_chunk_size + 3 - - def output_size(self) -> int: - return self._output_size - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - index = 0 # traverse stride - for i, layer in enumerate(self.encoders): - # layer return : x, mask, new_att_cache, new_cnn_cache - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if i in self.stride_layer_idx: - masks = masks[:, :, ::self.stride[index]] - chunk_masks = chunk_masks[:, ::self.stride[index], - ::self.stride[index]] - mask_pad = masks - pos_emb = pos_emb[:, ::self.stride[index], :] - index = index + 1 - - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask : mask matrix of self attention - - Returns: - torch.Tensor: output of current input xs - torch.Tensor: subsampling cache required for next chunk computation - List[torch.Tensor]: encoder layers output cache required for next - chunk computation - List[torch.Tensor]: conformer cnn cache - - """ - assert xs.size(0) == 1 - - # using downsampling factor to recover offset - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - chunk_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - chunk_masks = chunk_masks.unsqueeze(1) # (1, 1, xs-time) - - real_len = 0 - if self.global_chunk_size > 0: - # for ONNX decode simulation, padding xs to chunk_size - real_len = xs.size(1) - pad_len = self.chunk_feature_map - real_len - xs = F.pad(xs, (0, 0, 0, pad_len), value=0.0) - chunk_masks = F.pad(chunk_masks, (0, pad_len), value=0.0) - - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, chunk_masks = self.embed(xs, chunk_masks, offset) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - # shape(pos_emb) = (b=1, chunk_size, emb_size=output_size=hidden-dim) - - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) # batchPad (b=1, 1, time=chunk_size) - - if self.global_chunk_size > 0: - # for ONNX decode simulation - pos_emb = self.embed.position_encoding( - offset=max(offset - cache_t1, 0), - size=cache_t1 + self.global_chunk_size) - att_mask[:, :, -self.global_chunk_size:] = chunk_masks - mask_pad = chunk_masks.to(torch.bool) - else: - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(2) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - att_cache_trunc = xs.size(1) + \ - att_cache.size(2) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i:i + 1, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # 
shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [1, batch, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(0) - - # use repeat_interleave to new_att_cache - new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :]) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.global_chunk_size > 0 and real_len: - chunk_real_len = real_len // self.embed.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - # Keeping 1 more timestep can mitigate information leakage - # from the encoder caused by the padding - xs = xs[:, :chunk_real_len + 1, :] - - return xs, r_att_cache, r_cnn_cache - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - use_onnx=False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. - Args: - xs (torch.Tensor): (1, max_len, dim) - decoding_chunk_size (int): decoding chunk size - num_decoding_left_chunks (int): - use_onnx (bool): True for simulating ONNX model inference. 
- """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if use_onnx: - logging.info("Simulating for ONNX runtime ...") - att_cache: torch.Tensor = torch.zeros( - (self.num_blocks, self.attention_heads, required_cache_size, - self.output_size() // self.attention_heads * 2), - device=xs.device) - cnn_cache: torch.Tensor = torch.zeros( - (self.num_blocks, 1, self.output_size(), self.cnn_module_kernel - 1), - device=xs.device) - self.set_global_chunk_size(chunk_size=decoding_chunk_size) - else: - logging.info("Simulating for JIT runtime ...") - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - logging.info(f"-->> frame chunk msg: cur={cur}, " - f"end={end}, num_frames={end-cur}, " - f"decoding_window={decoding_window}") - if use_onnx: - att_mask: torch.Tensor = torch.ones( - (1, 1, required_cache_size + decoding_chunk_size), - dtype=torch.bool, device=xs.device) - if cur == 0: - att_mask[:, :, :required_cache_size] = 0 - else: - att_mask: torch.Tensor = torch.ones( - (0, 0, 0), dtype=torch.bool, device=xs.device) - - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - outputs.append(y) - offset += y.size(1) - - ys = torch.cat(outputs, 1) - masks = torch.ones(1, 1, ys.size(1), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/efficient_conformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/efficient_conformer/encoder_layer.py deleted file mode 100644 index 3a88ec9fca9797664ce89566e6c1d28a8f0ad5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/efficient_conformer/encoder_layer.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple -import torch -from torch import nn - - -class StrideConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. 
- self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - pointwise_conv_layer: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.pointwise_conv_layer = pointwise_conv_layer - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - - # add pointwise_conv for efficient conformer - # pointwise_conv_layer does not change shape - if self.pointwise_conv_layer is not None: - residual = residual.transpose(1, 2) - residual = self.pointwise_conv_layer(residual) - residual = residual.transpose(1, 2) - assert residual.size(0) == x.size(0) - assert residual.size(1) == x.size(1) - assert residual.size(2) == x.size(2) - - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/efficient_conformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/efficient_conformer/subsampling.py deleted file mode 100644 index 98b2c2228eac8e77586110686c48a7b0141458c9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/efficient_conformer/subsampling.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch -from wenet.transformer.subsampling import BaseSubsampling - - -class Conv2dSubsampling2(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU() - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * ((idim - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 2 - # 2 = (3 - 1) * 1 - self.right_context = 2 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 2. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 2. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, :-2:2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/squeezeformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/squeezeformer/attention.py deleted file mode 100644 index 97412badbe8e2c5caec81c0636d15be3f80d6b84..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/squeezeformer/attention.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 Ximalaya Inc. (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -import torch -import torch.nn as nn -from wenet.transformer.attention import MultiHeadedAttention -from typing import Tuple - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- """ - - def __init__(self, n_head, n_feat, dropout_rate, - do_rel_shift=False, adaptive_scale=False, init_weights=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.do_rel_shift = do_rel_shift - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - self.adaptive_scale = adaptive_scale - self.ada_scale = nn.Parameter( - torch.ones([1, 1, n_feat]), requires_grad=adaptive_scale) - self.ada_bias = nn.Parameter( - torch.zeros([1, 1, n_feat]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - input_max = (self.h * self.d_k) ** -0.5 - torch.nn.init.uniform_(self.linear_q.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_q.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_pos.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.bias, -input_max, input_max) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. 
pytorch training - if mask.size(2) > 0: # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - # (batch, head, time1, time2) - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - if self.adaptive_scale: - query = self.ada_scale * query + self.ada_bias - key = self.ada_scale * key + self.ada_bias - value = self.ada_scale * value + self.ada_bias - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - if self.do_rel_shift: - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/squeezeformer/conv2d.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/squeezeformer/conv2d.py deleted file mode 100644 index c230263396392d72f36c56d645338f2d576db898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/squeezeformer/conv2d.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Conv2d Module with Valid Padding""" - -import torch.nn.functional as F -from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional - - -class Conv2dValid(_ConvNd): - """ - Conv2d operator for VALID mode padding. 
- """ - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', # TODO: refine this type - device=None, - dtype=None, - valid_trigx: bool = False, - valid_trigy: bool = False - ) -> None: - factory_kwargs = {'device': device, 'dtype': dtype} - kernel_size_ = _pair(kernel_size) - stride_ = _pair(stride) - padding_ = padding if isinstance(padding, str) else _pair(padding) - dilation_ = _pair(dilation) - super(Conv2dValid, self).__init__( - in_channels, out_channels, kernel_size_, - stride_, padding_, dilation_, False, _pair(0), - groups, bias, padding_mode, **factory_kwargs) - self.valid_trigx = valid_trigx - self.valid_trigy = valid_trigy - - def _conv_forward( - self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): - validx, validy = 0, 0 - if self.valid_trigx: - validx = (input.size(-2) * (self.stride[-2] - 1) - 1 - + self.kernel_size[-2]) // 2 - if self.valid_trigy: - validy = (input.size(-1) * (self.stride[-1] - 1) - 1 - + self.kernel_size[-1]) // 2 - return F.conv2d(input, weight, bias, self.stride, - (validx, validy), self.dilation, self.groups) - - def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight, self.bias) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/squeezeformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/squeezeformer/convolution.py deleted file mode 100644 index 6da2ee8c98ed58fae66d66c892041037f0d6bc3a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/squeezeformer/convolution.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. 
- causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - self.bias = bias - self.channels = channels - self.kernel_size = kernel_size - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, channels]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, channels]), requires_grad=adaptive_scale) - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - if init_weights: - self.init_weights() - - def init_weights(self): - pw_max = self.channels ** -0.5 - dw_max = self.kernel_size ** -0.5 - torch.nn.init.uniform_(self.pointwise_conv1.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv1.bias.data, -pw_max, pw_max) - torch.nn.init.uniform_(self.depthwise_conv.weight.data, -dw_max, dw_max) - if self.bias: - torch.nn.init.uniform_(self.depthwise_conv.bias.data, -dw_max, dw_max) - torch.nn.init.uniform_(self.pointwise_conv2.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv2.bias.data, -pw_max, pw_max) - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - if self.adaptive_scale: - x = self.ada_scale * x + self.ada_bias - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/squeezeformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/squeezeformer/encoder.py deleted file mode 100644 index f13038321ae6c07d484a617aee7d83ed07742510..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/squeezeformer/encoder.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -import torch -import torch.nn as nn -from typing import Tuple, Union, Optional, List -from wenet.squeezeformer.subsampling \ - import DepthwiseConv2dSubsampling4, TimeReductionLayer1D, \ - TimeReductionLayer2D, TimeReductionLayerStream -from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.attention import MultiHeadedAttention -from wenet.squeezeformer.attention import RelPositionMultiHeadedAttention -from wenet.squeezeformer.positionwise_feed_forward \ - import PositionwiseFeedForward -from wenet.squeezeformer.convolution import ConvolutionModule -from wenet.utils.mask import make_pad_mask, add_optional_chunk_mask -from wenet.utils.common import get_activation - - -class SqueezeformerEncoder(nn.Module): - def __init__( - self, - input_size: int = 80, - encoder_dim: int = 256, - output_size: int = 256, - attention_heads: int = 4, - num_blocks: int = 12, - reduce_idx: Optional[Union[int, List[int]]] = 5, - recover_idx: Optional[Union[int, List[int]]] = 11, - feed_forward_expansion_factor: int = 4, - dw_stride: bool = False, - input_dropout_rate: float = 0.1, - pos_enc_layer_type: str = "rel_pos", - time_reduction_layer_type: str = "conv1d", - do_rel_shift: bool = True, - feed_forward_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.1, - cnn_module_kernel: int = 31, - cnn_norm_type: str = "batch_norm", - dropout: float = 0.1, - causal: bool = False, - adaptive_scale: bool = True, - activation_type: str = "swish", - init_weights: bool = True, - global_cmvn: torch.nn.Module = None, - normalize_before: bool = False, - use_dynamic_chunk: bool = False, - concat_after: bool = False, - 
static_chunk_size: int = 0, - use_dynamic_left_chunk: bool = False - ): - """Construct SqueezeformerEncoder - - Args: - input_size to use_dynamic_chunk, see in Transformer BaseEncoder. - encoder_dim (int): The hidden dimension of encoder layer. - output_size (int): The output dimension of final projection layer. - attention_heads (int): Num of attention head in attention module. - num_blocks (int): Num of encoder layers. - reduce_idx Optional[Union[int, List[int]]]: - reduce layer index, from 40ms to 80ms per frame. - recover_idx Optional[Union[int, List[int]]]: - recover layer index, from 80ms to 40ms per frame. - feed_forward_expansion_factor (int): Enlarge coefficient of FFN. - dw_stride (bool): Whether do depthwise convolution - on subsampling module. - input_dropout_rate (float): Dropout rate of input projection layer. - pos_enc_layer_type (str): Self attention type. - time_reduction_layer_type (str): Conv1d or Conv2d reduction layer. - do_rel_shift (bool): Whether to do relative shift - operation on rel-attention module. - cnn_module_kernel (int): Kernel size of CNN module. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - adaptive_scale (bool): Whether to use adaptive scale. - init_weights (bool): Whether to initialize weights. - causal (bool): whether to use causal convolution or not. - """ - super(SqueezeformerEncoder, self).__init__() - self.global_cmvn = global_cmvn - self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ - if type(reduce_idx) == int else reduce_idx - self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \ - if type(recover_idx) == int else recover_idx - self.check_ascending_list() - if reduce_idx is None: - self.time_reduce = None - else: - if recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - self.reduce_stride = 2 - self._output_size = output_size - self.normalize_before = normalize_before - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - self.pos_enc_layer_type = pos_enc_layer_type - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - encoder_dim, - attention_dropout_rate, - do_rel_shift, - adaptive_scale, - init_weights - ) - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - encoder_dim, - encoder_dim * feed_forward_expansion_factor, - feed_forward_dropout_rate, - activation, - adaptive_scale, - init_weights - ) - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = ( - encoder_dim, cnn_module_kernel, activation, - cnn_norm_type, causal, True, adaptive_scale, init_weights) - - self.embed = DepthwiseConv2dSubsampling4( - 1, encoder_dim, - RelPositionalEncoding(encoder_dim, dropout_rate=0.1), - dw_stride, - input_size, - input_dropout_rate, - init_weights - ) - - self.preln = nn.LayerNorm(encoder_dim) - self.encoders = 
torch.nn.ModuleList([SqueezeformerEncoderLayer( - encoder_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - convolution_layer(*convolution_layer_args), - positionwise_layer(*positionwise_layer_args), - normalize_before, - dropout, - concat_after) for _ in range(num_blocks) - ]) - if time_reduction_layer_type == 'conv1d': - time_reduction_layer = TimeReductionLayer1D - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - elif time_reduction_layer_type == 'stream': - time_reduction_layer = TimeReductionLayerStream - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - else: - time_reduction_layer = TimeReductionLayer2D - time_reduction_layer_args = {'encoder_dim': encoder_dim} - - self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args) - self.time_recover_layer = nn.Linear(encoder_dim, encoder_dim) - self.final_proj = None - if output_size != encoder_dim: - self.final_proj = nn.Linear(encoder_dim, output_size) - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - xs_lens = mask_pad.squeeze(1).sum(1) - xs = self.preln(xs) - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - for i, layer in enumerate(self.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, chunk_masks, pos_emb, mask_pad)) - xs, xs_lens, chunk_masks, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_chunk_masks, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - chunk_masks = recover_chunk_masks - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - xs = xs.masked_fill(~mask_pad[:, 0, :].unsqueeze(-1), 0.0) - - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return xs, masks - - def check_ascending_list(self): - if self.reduce_idx is not None: - assert self.reduce_idx == sorted(self.reduce_idx), \ - "reduce_idx should be int or ascending list" - if self.recover_idx is not None: - assert self.recover_idx == sorted(self.recover_idx), \ - "recover_idx should be int or ascending list" - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp 
= exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - recover_exp)) - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.preln(xs) - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, 
recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - if att_mask.size(1) != 0: - xs = xs.masked_fill(~att_mask[:, 0, :].unsqueeze(-1), 0.0) - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(0) - cached_att = cached_att.unsqueeze(3).\ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :]) - r_cnn_cache.append(cached_cnn) - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/squeezeformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/squeezeformer/encoder_layer.py deleted file mode 100644 index 3c6bdd44a20447cea91c0f965c666b844f4264be..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/squeezeformer/encoder_layer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""SqueezeformerEncoderLayer definition.""" - -import torch -import torch.nn as nn -from typing import Optional, Tuple - - -class SqueezeformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward1 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - feed_forward2 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. 
- """ - - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward1: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - feed_forward2: Optional[nn.Module] = None, - normalize_before: bool = False, - dropout_rate: float = 0.1, - concat_after: bool = False, - ): - super(SqueezeformerEncoderLayer, self).__init__() - self.size = size - self.self_attn = self_attn - self.layer_norm1 = nn.LayerNorm(size) - self.ffn1 = feed_forward1 - self.layer_norm2 = nn.LayerNorm(size) - self.conv_module = conv_module - self.layer_norm3 = nn.LayerNorm(size) - self.ffn2 = feed_forward2 - self.layer_norm4 = nn.LayerNorm(size) - self.normalize_before = normalize_before - self.dropout = nn.Dropout(dropout_rate) - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - # self attention module - residual = x - if self.normalize_before: - x = self.layer_norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.layer_norm1(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm2(x) - x = self.ffn1(x) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm2(x) - - # conv module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - residual = x - if self.normalize_before: - x = self.layer_norm3(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm3(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm4(x) - x = self.ffn2(x) - # we do not use dropout here since it is inside feed forward function - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm4(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/squeezeformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/squeezeformer/positionwise_feed_forward.py deleted file mode 100644 index 289062dcf3189f79a5ebb206990160d8665c613c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/squeezeformer/positionwise_feed_forward.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU(), - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.idim = idim - self.hidden_units = hidden_units - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - self.ada_scale = None - self.ada_bias = None - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, idim]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, idim]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - ffn1_max = self.idim ** -0.5 - ffn2_max = self.hidden_units ** -0.5 - torch.nn.init.uniform_(self.w_1.weight.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_1.bias.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_2.weight.data, -ffn2_max, ffn2_max) - torch.nn.init.uniform_(self.w_2.bias.data, -ffn2_max, ffn2_max) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - if self.adaptive_scale: - xs = self.ada_scale * xs + self.ada_bias - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/squeezeformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/squeezeformer/subsampling.py deleted file mode 100644 index fdb0101d6ebb54c42e710bbb0f35a6f7615ca567..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/squeezeformer/subsampling.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -"""DepthwiseConv2dSubsampling4 and TimeReductionLayer definition.""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -from wenet.transformer.subsampling import BaseSubsampling -from typing import Tuple -from wenet.squeezeformer.conv2d import Conv2dValid - - -class DepthwiseConv2dSubsampling4(BaseSubsampling): - """Depthwise Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - pos_enc_class (nn.Module): position encoding class. - dw_stride (int): Whether do depthwise convolution. - input_size (int): filter bank dimension. - - """ - - def __init__( - self, idim: int, odim: int, - pos_enc_class: torch.nn.Module, - dw_stride: bool = False, - input_size: int = 80, - input_dropout_rate: float = 0.1, - init_weights: bool = True - ): - super(DepthwiseConv2dSubsampling4, self).__init__() - self.idim = idim - self.odim = odim - self.pw_conv = nn.Conv2d( - in_channels=idim, out_channels=odim, kernel_size=3, stride=2) - self.act1 = nn.ReLU() - self.dw_conv = nn.Conv2d( - in_channels=odim, out_channels=odim, kernel_size=3, stride=2, - groups=odim if dw_stride else 1 - ) - self.act2 = nn.ReLU() - self.pos_enc = pos_enc_class - self.input_proj = nn.Sequential( - nn.Linear( - odim * (((input_size - 1) // 2 - 1) // 2), odim), - nn.Dropout(p=input_dropout_rate), - ) - if init_weights: - linear_max = (odim * input_size / 4) ** -0.5 - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.weight'], -linear_max, linear_max) - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.bias'], -linear_max, linear_max) - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: int = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.pw_conv(x) - x = self.act1(x) - x = self.dw_conv(x) - x = self.act2(x) - b, c, t, f = x.size() - x = x.permute(0, 2, 1, 3) - x = x.contiguous().view(b, t, c * f) - x, pos_emb = self.pos_enc(x, offset) - x = self.input_proj(x) - return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] - - -class TimeReductionLayer1D(nn.Module): - """ - Modified NeMo, - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. 
- """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 5, stride: int = 2): - super(TimeReductionLayer1D, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - self.padding = max(0, self.kernel_size - self.stride) - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. - if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayer2D(nn.Module): - def __init__( - self, kernel_size: int = 5, stride: int = 2, encoder_dim: int = 256): - super(TimeReductionLayer2D, self).__init__() - self.encoder_dim = encoder_dim - self.kernel_size = kernel_size - self.dw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=(kernel_size, 1), - stride=stride, - valid_trigy=True - ) - self.pw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=1, - stride=1, - valid_trigx=False, - valid_trigy=False, - ) - - self.kernel_size = kernel_size - self.stride = stride - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.encoder_dim ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward( - self, xs: torch.Tensor, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - xs = xs.masked_fill(mask_pad.transpose(1, 2).eq(0), 0.0) - xs = xs.unsqueeze(2) - padding1 = self.kernel_size - self.stride - xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), - mode='constant', value=0.) 
- xs = self.dw_conv(xs.permute(0, 3, 1, 2)) - xs = self.pw_conv(xs).permute(0, 3, 2, 1).squeeze(1).contiguous() - tmp_length = xs.size(1) - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - padding2 = max(0, (xs_lens.max() - tmp_length).data.item()) - batch_size, hidden = xs.size(0), xs.size(-1) - dummy_pad = torch.zeros(batch_size, padding2, hidden, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - mask = mask[:, ::2, ::2] - mask_pad = mask_pad[:, :, ::2] - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayerStream(nn.Module): - """ - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. - """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 1, stride: int = 2): - super(TimeReductionLayerStream, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=0, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. 
- if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transducer/joint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transducer/joint.py deleted file mode 100644 index f7cbaf62ee0bf4ffa127e5bbf4a49a64c2378495..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transducer/joint.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Optional - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation - - -class TransducerJoint(torch.nn.Module): - - def __init__(self, - voca_size: int, - enc_output_size: int, - pred_output_size: int, - join_dim: int, - prejoin_linear: bool = True, - postjoin_linear: bool = False, - joint_mode: str = 'add', - activation: str = "tanh"): - assert check_argument_types() - # TODO(Mddct): concat in future - assert joint_mode in ['add'] - super().__init__() - - self.activatoin = get_activation(activation) - self.prejoin_linear = prejoin_linear - self.postjoin_linear = postjoin_linear - self.joint_mode = joint_mode - - if not self.prejoin_linear and not self.postjoin_linear: - assert enc_output_size == pred_output_size == join_dim - # torchscript compatibility - self.enc_ffn: Optional[nn.Linear] = None - self.pred_ffn: Optional[nn.Linear] = None - if self.prejoin_linear: - self.enc_ffn = nn.Linear(enc_output_size, join_dim) - self.pred_ffn = nn.Linear(pred_output_size, join_dim) - # torchscript compatibility - self.post_ffn: Optional[nn.Linear] = None - if self.postjoin_linear: - self.post_ffn = nn.Linear(join_dim, join_dim) - - self.ffn_out = nn.Linear(join_dim, voca_size) - - def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor): - """ - Args: - enc_out (torch.Tensor): [B, T, E] - pred_out (torch.Tensor): [B, T, P] - Return: - [B,T,U,V] - """ - if (self.prejoin_linear and self.enc_ffn is not None - and self.pred_ffn is not None): - enc_out = self.enc_ffn(enc_out) # [B,T,E] -> [B,T,V] - pred_out = self.pred_ffn(pred_out) - - enc_out = enc_out.unsqueeze(2) # [B,T,V] -> [B,T,1,V] - pred_out = pred_out.unsqueeze(1) # [B,U,V] -> [B,1 U, V] - - # TODO(Mddct): concat joint - _ = self.joint_mode - out = enc_out + pred_out # [B,T,U,V] - - if self.postjoin_linear and self.post_ffn is not None: - out = self.post_ffn(out) - - out = self.activatoin(out) - out = self.ffn_out(out) - return out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transducer/predictor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transducer/predictor.py deleted file mode 100644 index 600e97a9d83646047ec3fc14f3087bd4df761c68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transducer/predictor.py +++ /dev/null @@ -1,482 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation, get_rnn - - -def ApplyPadding(input, padding, pad_value) -> torch.Tensor: - """ - Args: - input: [bs, max_time_step, dim] - padding: [bs, max_time_step] - """ - return 
padding * pad_value + input * (1 - padding) - - -class PredictorBase(torch.nn.Module): - - # NOTE(Mddct): We can use ABC abstract here, but - # keep this class simple enough for now - def __init__(self) -> None: - super().__init__() - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - _, _, _ = batch_size, method, device - raise NotImplementedError("this is a base precictor") - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ): - _, _, = input, cache - raise NotImplementedError("this is a base precictor") - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - _, _, _, = input, padding, cache - raise NotImplementedError("this is a base precictor") - - -class RNNPredictor(PredictorBase): - - def __init__(self, - voca_size: int, - embed_size: int, - output_size: int, - embed_dropout: float, - hidden_size: int, - num_layers: int, - bias: bool = True, - rnn_type: str = "lstm", - dropout: float = 0.1) -> None: - assert check_argument_types() - super().__init__() - self.n_layers = num_layers - self.hidden_size = hidden_size - # disable rnn base out projection - self.embed = nn.Embedding(voca_size, embed_size) - self.dropout = nn.Dropout(embed_dropout) - # NOTE(Mddct): rnn base from torch not support layer norm - # will add layer norm and prune value in cell and layer - # ref: https://github.com/Mddct/neural-lm/blob/main/models/gru_cell.py - self.rnn = get_rnn(rnn_type=rnn_type)(input_size=embed_size, - hidden_size=hidden_size, - num_layers=num_layers, - bias=bias, - batch_first=True, - dropout=dropout) - self.projection = nn.Linear(hidden_size, output_size) - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> torch.Tensor: - """ - Args: - input (torch.Tensor): [batch, max_time). - padding (torch.Tensor): [batch, max_time] - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - Returns: - output: [batch, max_time, output_size] - """ - - # NOTE(Mddct): we don't use pack input format - embed = self.embed(input) # [batch, max_time, emb_size] - embed = self.dropout(embed) - states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None - if cache is None: - state = self.init_state(batch_size=input.size(0), - device=input.device) - states = (state[0], state[1]) - else: - assert len(cache) == 2 - states = (cache[0], cache[1]) - out, (m, c) = self.rnn(embed, states) - out = self.projection(out) - - # NOTE(Mddct): Although we don't use staate in transducer - # training forward, we need make it right for padding value - # so we create forward_step for infering, forward for training - _, _ = m, c - return out - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache: [state_m, state_c] - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - Returns: - new_cache: [[state_m_1, state_c_1], [state_m_2, state_c_2]...] 
- """ - assert len(cache) == 2 - state_ms = cache[0] - state_cs = cache[1] - - assert state_ms.size(1) == state_cs.size(1) - - new_cache: List[List[torch.Tensor]] = [] - for state_m, state_c in zip(torch.split(state_ms, 1, dim=1), - torch.split(state_cs, 1, dim=1)): - new_cache.append([state_m, state_c]) - return new_cache - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[state_m_1, state_c_1], [state_m_1, state_c_1]...] - - Returns: - new_caceh: [state_ms, state_cs], - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - """ - state_ms = torch.cat([states[0] for states in cache], dim=1) - state_cs = torch.cat([states[1] for states in cache], dim=1) - return [state_ms, state_cs] - - def init_state( - self, - batch_size: int, - device: torch.device, - method: str = "zero", - ) -> List[torch.Tensor]: - assert batch_size > 0 - # TODO(Mddct): xavier init method - _ = method - return [ - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device), - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device) - ] - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - """ - assert len(cache) == 2 - state_m, state_c = cache[0], cache[1] - embed = self.embed(input) # [batch, 1, emb_size] - embed = self.dropout(embed) - out, (m, c) = self.rnn(embed, (state_m, state_c)) - - out = self.projection(out) - m = ApplyPadding(m, padding.unsqueeze(0), state_m) - c = ApplyPadding(c, padding.unsqueeze(0), state_c) - - return (out, [m, c]) - - -class EmbeddingPredictor(PredictorBase): - """Embedding predictor - - Described in: - https://arxiv.org/pdf/2109.07513.pdf - - embed-> proj -> layer norm -> swish - """ - - def __init__(self, - voca_size: int, - embed_size: int, - embed_dropout: float, - n_head: int, - history_size: int = 2, - activation: str = "swish", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - - assert check_argument_types() - super().__init__() - # multi head - self.num_heads = n_head - self.embed_size = embed_size - self.context_size = history_size + 1 - self.pos_embed = torch.nn.Linear(embed_size * self.context_size, - self.num_heads, - bias=bias) - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.ffn = nn.Linear(self.embed_size, self.embed_size) - self.norm = nn.LayerNorm(self.embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - _ = method - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device), - ] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] 
- """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - - input = input.unfold(1, self.context_size, 1).permute( - 0, 1, 3, 2) # [bs, seq_len, context_size, embed] - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - # broadcast dot attenton - input_expand = input.unsqueeze( - 2) # [bs, seq_len, 1, context_size, embed] - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - - # [bs, seq_len, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, seq_len, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, seq_len, num_heads, embed] - output = output.sum(dim=2) # [bs, seq_len, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - return output - - def forward_step( - self, - input: torch.Tensor, - padding: torch.Tensor, - cache: List[torch.Tensor], - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input_expand = context_input.unsqueeze(1).unsqueeze( - 2) # [bs, 1, 1, context_size, embed] - - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - # [bs, 1, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, 1, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, 1, num_heads, embed] - output = output.sum(dim=2) # [bs, 1, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - new_cache = context_input[:, 1:, :] - # TODO(Mddct): we need padding new_cache in future - # new_cache = ApplyPadding(history, padding, new_cache) - return (output, [new_cache]) - - -class ConvPredictor(PredictorBase): - - def __init__(self, - voca_size: 
int, - embed_size: int, - embed_dropout: float, - history_size: int = 2, - activation: str = "relu", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - assert check_argument_types() - super().__init__() - - assert history_size >= 0 - self.embed_size = embed_size - self.context_size = history_size + 1 - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.conv = nn.Conv1d(in_channels=embed_size, - out_channels=embed_size, - kernel_size=self.context_size, - padding=0, - groups=embed_size, - bias=bias) - self.norm = nn.LayerNorm(embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - assert method == "zero" - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device) - ] - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] - """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - input = input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - return out - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input = context_input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - - new_cache = context_input[:, 1:, :] - # TODO(Mddct): apply padding in future - return (out, [new_cache]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transducer/search/greedy_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transducer/search/greedy_search.py deleted file mode 100644 index ef7354562b6617b7be33bf32d673117eb1d3d547..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transducer/search/greedy_search.py +++ /dev/null @@ -1,54 +0,0 
@@ -from typing import List - -import torch - - -def basic_greedy_search( - model: torch.nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - n_steps: int = 64, -) -> List[List[int]]: - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, - method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = n_steps - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, padding, - cache) # [1, 1, P] - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, - pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - - return [hyps] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transducer/search/prefix_beam_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transducer/search/prefix_beam_search.py deleted file mode 100644 index f00917717c16a73916586708ebfede54fa02a21f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transducer/search/prefix_beam_search.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import List, Tuple - -import torch -from wenet.utils.common import log_add - - -class Sequence(): - - __slots__ = {'hyp', 'score', 'cache'} - - def __init__( - self, - hyp: List[torch.Tensor], - score, - cache: List[torch.Tensor], - ): - self.hyp = hyp - self.score = score - self.cache = cache - - -class PrefixBeamSearch(): - - def __init__(self, encoder, predictor, joint, ctc, blank): - self.encoder = encoder - self.predictor = predictor - self.joint = joint - self.ctc = ctc - self.blank = blank - - def forward_decoder_one_step( - self, encoder_x: torch.Tensor, pre_t: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - padding = torch.zeros(pre_t.size(0), 1, device=encoder_x.device) - pre_t, new_cache = self.predictor.forward_step(pre_t.unsqueeze(-1), - padding, cache) - x = self.joint(encoder_x, pre_t) # [beam, 1, 1, vocab] - x = x.log_softmax(dim=-1) - return x, new_cache - - def prefix_beam_search(self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7): - """prefix beam search - also see wenet.transducer.transducer.beam_search - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = 
speech.shape[0] - assert batch_size == 1 - - # 1. Encoder - encoder_out, _ = self.encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - - ctc_probs = self.ctc.log_softmax(encoder_out).squeeze(0) - beam_init: List[Sequence] = [] - - # 2. init beam using Sequence to save beam unit - cache = self.predictor.init_state(1, method="zero", device=device) - beam_init.append(Sequence(hyp=[self.blank], score=0.0, cache=cache)) - # 3. start decoding (notice: we use breathwise first searching) - # !!!! In this decoding method: one frame do not output multi units. !!!! - # !!!! Experiments show that this strategy has little impact !!!! - for i in range(maxlen): - # 3.1 building input - # decoder taking the last token to predict the next token - input_hyp = [s.hyp[-1] for s in beam_init] - input_hyp_tensor = torch.tensor(input_hyp, - dtype=torch.int, - device=device) - # building statement from beam - cache_batch = self.predictor.cache_to_batch( - [s.cache for s in beam_init]) - # build score tensor to do torch.add() function - scores = torch.tensor([s.score for s in beam_init]).to(device) - - # 3.2 forward decoder - logp, new_cache = self.forward_decoder_one_step( - encoder_out[:, i, :].unsqueeze(1), - input_hyp_tensor, - cache_batch, - ) # logp: (N, 1, 1, vocab_size) - logp = logp.squeeze(1).squeeze(1) # logp: (N, vocab_size) - new_cache = self.predictor.batch_to_cache(new_cache) - - # 3.3 shallow fusion for transducer score - # and ctc score where we can also add the LM score - logp = torch.log( - torch.add(transducer_weight * torch.exp(logp), - ctc_weight * torch.exp(ctc_probs[i].unsqueeze(0)))) - - # 3.4 first beam prune - top_k_logp, top_k_index = logp.topk(beam_size) # (N, N) - scores = torch.add(scores.unsqueeze(1), top_k_logp) - - # 3.5 generate new beam (N*N) - beam_A = [] - for j in range(len(beam_init)): - # update seq - base_seq = beam_init[j] - for t in range(beam_size): - # blank: only update the score - if top_k_index[j, t] == self.blank: - new_seq = Sequence(hyp=base_seq.hyp.copy(), - score=scores[j, t].item(), - cache=base_seq.cache) - - beam_A.append(new_seq) - # other unit: update hyp score statement and last - else: - hyp_new = base_seq.hyp.copy() - hyp_new.append(top_k_index[j, t].item()) - new_seq = Sequence(hyp=hyp_new, - score=scores[j, t].item(), - cache=new_cache[j]) - beam_A.append(new_seq) - - # 3.6 prefix fusion - fusion_A = [beam_A[0]] - for j in range(1, len(beam_A)): - s1 = beam_A[j] - if_do_append = True - for t in range(len(fusion_A)): - # notice: A_ can not fusion with A - if s1.hyp == fusion_A[t].hyp: - fusion_A[t].score = log_add( - [fusion_A[t].score, s1.score]) - if_do_append = False - break - if if_do_append: - fusion_A.append(s1) - - # 4. 
second pruned - fusion_A.sort(key=lambda x: x.score, reverse=True) - beam_init = fusion_A[:beam_size] - - return beam_init, encoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transducer/transducer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transducer/transducer.py deleted file mode 100644 index 821a0946e621353a18bededbd93a658e83b0e0e2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transducer/transducer.py +++ /dev/null @@ -1,453 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchaudio -from torch import nn -from torch.nn.utils.rnn import pad_sequence -from typeguard import check_argument_types -from wenet.transducer.predictor import PredictorBase -from wenet.transducer.search.greedy_search import basic_greedy_search -from wenet.transducer.search.prefix_beam_search import PrefixBeamSearch -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_blank, add_sos_eos, - reverse_pad_list) - - -class Transducer(ASRModel): - """Transducer-ctc-attention hybrid Encoder-Predictor-Decoder model""" - - def __init__( - self, - vocab_size: int, - blank: int, - encoder: nn.Module, - predictor: PredictorBase, - joint: nn.Module, - attention_decoder: Optional[Union[TransformerDecoder, - BiTransformerDecoder]] = None, - ctc: Optional[CTC] = None, - ctc_weight: float = 0, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - transducer_weight: float = 1.0, - attention_weight: float = 0.0, - ) -> None: - assert check_argument_types() - assert attention_weight + ctc_weight + transducer_weight == 1.0 - super().__init__(vocab_size, encoder, attention_decoder, ctc, - ctc_weight, ignore_id, reverse_weight, lsm_weight, - length_normalized_loss) - - self.blank = blank - self.transducer_weight = transducer_weight - self.attention_decoder_weight = 1 - self.transducer_weight - self.ctc_weight - - self.predictor = predictor - self.joint = joint - self.bs = None - - # Note(Mddct): decoder also means predictor in transducer, - # but here decoder is attention decoder - del self.criterion_att - if attention_decoder is not None: - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + predictor + joint + loss - - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - - # Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - # predictor - ys_in_pad = add_blank(text, self.blank, self.ignore_id) - predictor_out = self.predictor(ys_in_pad) - # joint - joint_out = self.joint(encoder_out, predictor_out) - # NOTE(Mddct): some loss implementation require pad valid is zero - # torch.int32 rnnt_loss required - rnnt_text = text.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - rnnt_text_lengths = text_lengths.to(torch.int32) - encoder_out_lens = encoder_out_lens.to(torch.int32) - loss = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - encoder_out_lens, - rnnt_text_lengths, - blank=self.blank, - reduction="mean") - loss_rnnt = loss - - loss = self.transducer_weight * loss - # optional attention decoder - loss_att: Optional[torch.Tensor] = None - if self.attention_decoder_weight != 0.0 and self.decoder is not None: - loss_att, _ = self._calc_att_loss(encoder_out, encoder_mask, text, - text_lengths) - - # optional ctc - loss_ctc: Optional[torch.Tensor] = None - if self.ctc_weight != 0.0 and self.ctc is not None: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is not None: - loss = loss + self.ctc_weight * loss_ctc.sum() - if loss_att is not None: - loss = loss + self.attention_decoder_weight * loss_att.sum() - # NOTE: 'loss' must be in dict - return { - 'loss': loss, - 'loss_att': loss_att, - 'loss_ctc': loss_ctc, - 'loss_rnnt': loss_rnnt, - } - - def init_bs(self): - if self.bs is None: - self.bs = PrefixBeamSearch(self.encoder, self.predictor, - self.joint, self.ctc, self.blank) - - def _cal_transducer_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_lens: torch.Tensor, - hyps_pad: torch.Tensor, - ): - # ignore id -> blank, add blank at head - hyps_pad_blank = add_blank(hyps_pad, self.blank, self.ignore_id) - xs_in_lens = encoder_mask.squeeze(1).sum(1).int() - - # 1. Forward predictor - predictor_out = self.predictor(hyps_pad_blank) - # 2. Forward joint - joint_out = self.joint(encoder_out, predictor_out) - rnnt_text = hyps_pad.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - # 3. 
Compute transducer loss - loss_td = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - xs_in_lens, - hyps_lens.int(), - blank=self.blank, - reduction='none') - return loss_td * -1 - - def _cal_attn_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_pad: torch.Tensor, - hyps_lens: torch.Tensor, - ): - # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - - # td_score = loss_td * -1 - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - self.reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - return decoder_out, r_decoder_out - - def beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7, - ): - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight in transducer - prefix beam search. - final_prob = ctc_weight * ctc_prob + transducer_weight * transducer_prob - transducer_weight (float): transducer probability weight in - prefix beam search - Returns: - List[List[int]]: best path result - - """ - self.init_bs() - beam, _ = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size, - beam_size, - num_decoding_left_chunks, - simulate_streaming, - ctc_weight, - transducer_weight, - ) - return beam[0].hyp[1:], beam[0].score - - def transducer_attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ctc_weight: float = 0.0, - attn_weight: float = 0.0, - transducer_weight: float = 0.0, - search_ctc_weight: float = 1.0, - search_transducer_weight: float = 0.0, - beam_search_type: str = 'transducer') -> List[List[int]]: - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight using in rescoring. - rescore_prob = ctc_weight * ctc_prob + - transducer_weight * (transducer_loss * -1) + - attn_weight * attn_prob - attn_weight (float): attn probability weight using in rescoring. - transducer_weight (float): transducer probability weight using in - rescoring - search_ctc_weight (float): ctc weight using - in rnnt beam search (seeing in self.beam_search) - search_transducer_weight (float): transducer weight using - in rnnt beam search (seeing in self.beam_search) - Returns: - List[List[int]]: best path result - - """ - - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - self.init_bs() - if beam_search_type == 'transducer': - beam, encoder_out = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - beam_size=beam_size, - num_decoding_left_chunks=num_decoding_left_chunks, - ctc_weight=search_ctc_weight, - transducer_weight=search_transducer_weight, - ) - beam_score = [s.score for s in beam] - hyps = [s.hyp[1:] for s in beam] - - elif beam_search_type == 'ctc': - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, - speech_lengths, - beam_size=beam_size, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) - beam_score = [hyp[1] for hyp in hyps] - hyps = [hyp[0] for hyp in hyps] - assert len(hyps) == beam_size - - # build hyps and encoder output - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - - # 2.1 calculate transducer score - td_score = self._cal_transducer_score( - encoder_out, - encoder_mask, - hyps_lens, - hyps_pad, - ) - # 2.2 calculate attention score - decoder_out, r_decoder_out = self._cal_attn_score( - encoder_out, - encoder_mask, - hyps_pad, - hyps_lens, - ) - - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp)][self.eos] - td_s = td_score[i] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp): - r_score += r_decoder_out[i][len(hyp) - j - 1][w] - r_score += r_decoder_out[i][len(hyp)][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score = score * attn_weight + \ - beam_score[i] * ctc_weight + \ - td_s * transducer_weight - if score > best_score: - best_score = score - best_index = i - - return hyps[best_index], best_score - - def greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, 
- num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - n_steps: int = 64, - ) -> List[List[int]]: - """ greedy search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - # TODO(Mddct): batch decode - assert speech.size(0) == 1 - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - # TODO(Mddct): forward chunk by chunk - _ = simulate_streaming - # Let's assume B = batch_size - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size, - num_decoding_left_chunks, - ) - encoder_out_lens = encoder_mask.squeeze(1).sum() - hyps = basic_greedy_search(self, - encoder_out, - encoder_out_lens, - n_steps=n_steps) - - return hyps - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def forward_predictor_step( - self, xs: torch.Tensor, cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - assert len(cache) == 2 - # fake padding - padding = torch.zeros(1, 1) - return self.predictor.forward_step(xs, padding, cache) - - @torch.jit.export - def forward_joint_step(self, enc_out: torch.Tensor, - pred_out: torch.Tensor) -> torch.Tensor: - return self.joint(enc_out, pred_out) - - @torch.jit.export - def forward_predictor_init_state(self) -> List[torch.Tensor]: - return self.predictor.init_state(1, device=torch.device("cpu")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/asr_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/asr_model.py deleted file mode 100644 index 4288f68472d63ce4bf270c5f377d62fa7408713e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/asr_model.py +++ /dev/null @@ -1,904 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -from collections import defaultdict -from typing import Dict, List, Optional, Tuple - -import torch - -from torch.nn.utils.rnn import pad_sequence - -try: - import k2 - from icefall.utils import get_texts - from icefall.decode import get_lattice, Nbest, one_best_decoding -except ImportError: - print('Failed to import k2 and icefall. \ - Notice that they are necessary for hlg_onebest and hlg_rescore') - -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import TransformerEncoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, - remove_duplicates_and_blank, th_accuracy, - reverse_pad_list) -from wenet.utils.mask import (make_pad_mask, mask_finished_preds, - mask_finished_scores, subsequent_mask) - - -class ASRModel(torch.nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - def __init__( - self, - vocab_size: int, - encoder: TransformerEncoder, - decoder: TransformerDecoder, - ctc: CTC, - ctc_weight: float = 0.5, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - ): - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - - self.encoder = encoder - self.decoder = decoder - self.ctc = ctc - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - # 1. Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # 2a. Attention-decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths) - else: - loss_att = None - - # 2b. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is None: - loss = loss_att - elif loss_att is None: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + (1 - - self.ctc_weight) * loss_att - return {"loss": loss, "loss_att": loss_att, "loss_ctc": loss_ctc} - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, float]: - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # reverse the seq, used for right to left decoder - r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) - r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, - self.ignore_id) - # 1. Forward decoder - decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, - ys_in_pad, ys_in_lens, - r_ys_in_pad, - self.reverse_weight) - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - r_loss_att = torch.tensor(0.0) - if self.reverse_weight > 0.0: - r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * ( - 1 - self.reverse_weight) + r_loss_att * self.reverse_weight - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - return loss_att, acc_att - - def _forward_encoder( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Let's assume B = batch_size - # 1. Encoder - if simulate_streaming and decoding_chunk_size > 0: - encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( - speech, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - else: - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - return encoder_out, encoder_mask - - def recognize( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int = 10, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> torch.Tensor: - """ Apply beam search on attention decoder - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - torch.Tensor: decoding result, (batch, max_result_len) - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = torch.ones([running_size, 1], dtype=torch.long, - device=device).fill_(self.sos) # (B*N, 1) - scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), - dtype=torch.float) - scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) - cache: Optional[List[torch.Tensor]] = None - # 2. Decoder forward step by step - for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: - break - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) - logp, cache = self.decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - # 2.3 Second beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - # Update cache to be consistent with new topk scores / hyps - cache_index = (offset_k_index // beam_size).view(-1) # (B*N) - base_cache_index = (torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) * beam_size).view(-1) # (B*N) - cache_index = base_cache_index + cache_index - cache = [torch.index_select(c, dim=0, index=cache_index) for c in cache] - scores = scores.view(-1, 1) # (B*N, 1) - # 2.4. Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = torch.index_select(top_k_index.view(-1), - dim=-1, - index=best_k_index) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = torch.index_select( - hyps, dim=0, index=best_hyps_index) # (B*N, i) - hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) - - # 3. 
Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_scores, best_index = scores.max(dim=-1) - best_hyps_index = best_index + torch.arange( - batch_size, dtype=torch.long, device=device) * beam_size - best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) - best_hyps = best_hyps[:, 1:] - return best_hyps, best_scores - - def ctc_greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[List[int]]: - """ Apply CTC greedy search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # Let's assume B = batch_size - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - return hyps, scores - - def _ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[List[List[int]], torch.Tensor]: - """ CTC prefix beam search inner implementation - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[List[int]]: nbest results - torch.Tensor: encoder output, (1, max_len, encoder_dim), - it will be used for rescoring in attention rescoring mode - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # For CTC prefix beam search, we only support batch_size=1 - assert batch_size == 1 - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder forward and get CTC score - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - ctc_probs = ctc_probs.squeeze(0) - # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) - cur_hyps = [(tuple(), (0.0, -float('inf')))] - # 2. CTC beam search step by step - for t in range(0, maxlen): - logp = ctc_probs[t] # (vocab_size,) - # key: prefix, value (pb, pnb), default value(-inf, -inf) - next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) - # 2.1 First beam prune: select topk best - top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) - for s in top_k_index: - s = s.item() - ps = logp[s].item() - for prefix, (pb, pnb) in cur_hyps: - last = prefix[-1] if len(prefix) > 0 else None - if s == 0: # blank - n_pb, n_pnb = next_hyps[prefix] - n_pb = log_add([n_pb, pb + ps, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - elif s == last: - # Update *ss -> *s; - n_pb, n_pnb = next_hyps[prefix] - n_pnb = log_add([n_pnb, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - # Update *s-s -> *ss, - is for blank - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - else: - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - - # 2.2 Second beam prune - next_hyps = sorted(next_hyps.items(), - key=lambda x: log_add(list(x[1])), - reverse=True) - cur_hyps = next_hyps[:beam_size] - hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] - return hyps, encoder_out - - def ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[int]: - """ Apply CTC prefix beam search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[int]: CTC prefix beam search nbest results - """ - hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, - beam_size, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) - return hyps[0] - - def attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - ctc_weight: float = 0.0, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ) -> List[int]: - """ Apply attention rescoring decoding, CTC prefix beam search - is applied first to get nbest, then we resoring the nbest on - attention decoder with corresponding encoder out - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - reverse_weight (float): right to left decoder weight - ctc_weight (float): ctc score weight - - Returns: - List[int]: Attention rescoring result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, speech_lengths, beam_size, decoding_chunk_size, - num_decoding_left_chunks, simulate_streaming) - - assert len(hyps) == beam_size - hyps_pad = pad_sequence([ - torch.tensor(hyp[0], device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp[0]): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp[0])][self.eos] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp[0]): - r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] - r_score += r_decoder_out[i][len(hyp[0])][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score += hyp[1] * ctc_weight - if score > best_score: - best_score = score - best_index = i - return hyps[best_index][0], best_score - - def load_hlg_resource_if_necessary(self, hlg, word): - if not hasattr(self, 'hlg'): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.hlg = k2.Fsa.from_dict(torch.load(hlg, map_location=device)) - if not hasattr(self.hlg, "lm_scores"): - self.hlg.lm_scores = self.hlg.scores.clone() - if not hasattr(self, 'word_table'): - self.word_table = {} - with open(word, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - self.word_table[int(arr[1])] = arr[0] - - @torch.no_grad() - def hlg_onebest( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - best_path = one_best_decoding(lattice=lattice, use_double_scores=True) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.no_grad() - def hlg_rescore( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - lm_scale: float = 0, - decoder_scale: float = 0, - r_decoder_scale: float = 0, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - device = speech.device - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - 
search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=100, - use_double_scores=True, - nbest_scale=0.5,) - nbest = nbest.intersect(lattice) - assert hasattr(nbest.fsa, "lm_scores") - assert hasattr(nbest.fsa, "tokens") - assert isinstance(nbest.fsa.tokens, torch.Tensor) - - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.tokens) - tokens = tokens.remove_values_leq(0) - hyps = tokens.tolist() - - # cal attention_score - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out_repeat = [] - tot_scores = nbest.tot_scores() - repeats = [tot_scores[i].shape[0] for i in range(tot_scores.dim0)] - for i in range(len(encoder_out)): - encoder_out_repeat.append(encoder_out[i: i + 1].repeat(repeats[i], 1, 1)) - encoder_out = torch.concat(encoder_out_repeat, dim=0) - encoder_mask = torch.ones(encoder_out.size(0), - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - reverse_weight = 0.5 - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out - - decoder_scores = torch.tensor([sum([decoder_out[i, j, hyps[i][j]] - for j in range(len(hyps[i]))]) - for i in range(len(hyps))], device=device) - r_decoder_scores = [] - for i in range(len(hyps)): - score = 0 - for j in range(len(hyps[i])): - score += r_decoder_out[i, len(hyps[i]) - j - 1, hyps[i][j]] - score += r_decoder_out[i, len(hyps[i]), self.eos] - r_decoder_scores.append(score) - r_decoder_scores = torch.tensor(r_decoder_scores, device=device) - - am_scores = nbest.compute_am_scores() - ngram_lm_scores = nbest.compute_lm_scores() - tot_scores = am_scores.values + lm_scale * ngram_lm_scores.values + \ - decoder_scale * decoder_scores + r_decoder_scale * r_decoder_scores - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.jit.export - def subsampling_rate(self) -> int: - """ Export interface for c++ call, return subsampling_rate of the - model - """ - return self.encoder.embed.subsampling_rate - - @torch.jit.export - def right_context(self) -> int: - """ Export interface for c++ call, return right_context of the model - """ - return self.encoder.embed.right_context - - @torch.jit.export - def sos_symbol(self) -> int: - """ Export interface for c++ call, return sos symbol id of the model - """ - return self.sos - - @torch.jit.export - def eos_symbol(self) -> int: - """ Export interface for c++ call, return eos symbol id of the model - """ - return self.eos - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, give input chunk xs, and return - output from time 0 to current chunk. - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor: - """ Export interface for c++ call, apply linear transform and log - softmax before ctc - Args: - xs (torch.Tensor): encoder output - - Returns: - torch.Tensor: activation before ctc - - """ - return self.ctc.log_softmax(xs) - - @torch.jit.export - def is_bidirectional_decoder(self) -> bool: - """ - Returns: - torch.Tensor: decoder output - """ - if hasattr(self.decoder, 'right_decoder'): - return True - else: - return False - - @torch.jit.export - def forward_attention_decoder( - self, - hyps: torch.Tensor, - hyps_lens: torch.Tensor, - encoder_out: torch.Tensor, - reverse_weight: float = 0, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, forward decoder with multiple - hypothesis from ctc prefix beam search and one encoder output - Args: - hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad sos at the begining - hyps_lens (torch.Tensor): length of each hyp in hyps - encoder_out (torch.Tensor): corresponding encoder output - r_hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad eos at the begining which is used fo right to left decoder - reverse_weight: used for verfing whether used right to left decoder, - > 0 will use. - - Returns: - torch.Tensor: decoder output - """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps - encoder_out = encoder_out.repeat(num_hyps, 1, 1) - encoder_mask = torch.ones(num_hyps, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=encoder_out.device) - - # input for right to left decoder - # this hyps_lens has count token, we need minus it. - r_hyps_lens = hyps_lens - 1 - # this hyps has included token, so it should be - # convert the original hyps. - r_hyps = hyps[:, 1:] - # >>> r_hyps - # >>> tensor([[ 1, 2, 3], - # >>> [ 9, 8, 4], - # >>> [ 2, -1, -1]]) - # >>> r_hyps_lens - # >>> tensor([3, 3, 1]) - - # NOTE(Mddct): `pad_sequence` is not supported by ONNX, it is used - # in `reverse_pad_list` thus we have to refine the below code. 
- # Issue: https://github.com/wenet-e2e/wenet/issues/1113 - # Equal to: - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = torch.max(r_hyps_lens) - index_range = torch.arange(0, max_len, 1).to(encoder_out.device) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - # >>> seq_mask - # >>> tensor([[ True, True, True], - # >>> [ True, True, True], - # >>> [ True, False, False]]) - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - r_hyps = torch.gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = torch.where(seq_mask, r_hyps, self.eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - r_hyps = torch.cat([hyps[:, 0:1], r_hyps], dim=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps, hyps_lens, r_hyps, - reverse_weight) # (num_hyps, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - - # right to left decoder may be not used during decoding process, - # which depends on reverse_weight param. - # r_dccoder_out will be 0.0, if reverse_weight is 0.0 - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - return decoder_out, r_decoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/attention.py deleted file mode 100644 index 6ee5e313edf2e88a844ce004c0f819b0bd3260f6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/attention.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple - -import torch -from torch import nn - - -class MultiHeadedAttention(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, n_head: int, n_feat: int, dropout_rate: float): - """Construct an MultiHeadedAttention object.""" - super().__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward_qkv( - self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor, size - (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor, size - (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor, size - (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. 
- - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - 1.When applying cross attention between decoder and encoder, - the batch padding mask for input is in (#batch, 1, T) shape. - 2.When applying self attention of encoder, - the mask is in (#batch, T, T) shape. - 3.When applying self attention of decoder, - the mask is in (#batch, L, L) shape. - 4.If the different position in decoder see different block - of the encoder, such as Mocha, the passed in mask could be - in (#batch, L, T) shape. But there is no such case in current - Wenet. - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - """ - q, k, v = self.forward_qkv(query, key, value) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. - new_cache = torch.cat((k, v), dim=-1) - - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - return self.forward_attention(v, scores, mask), new_cache - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). 
- zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/cmvn.py deleted file mode 100644 index 3a1e7457fd3788d9a7e031e96517505a65925102..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/cmvn.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -class GlobalCMVN(torch.nn.Module): - def __init__(self, - mean: torch.Tensor, - istd: torch.Tensor, - norm_var: bool = True): - """ - Args: - mean (torch.Tensor): mean stats - istd (torch.Tensor): inverse std, std which is 1.0 / std - """ - super().__init__() - assert mean.shape == istd.shape - self.norm_var = norm_var - # The buffer can be accessed from this module using self.mean - self.register_buffer("mean", mean) - self.register_buffer("istd", istd) - - def forward(self, x: torch.Tensor): - """ - Args: - x (torch.Tensor): (batch, max_len, feat_dim) - - Returns: - (torch.Tensor): normalized feature - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/convolution.py deleted file mode 100644 index 2cf9794e14ea7441ccd30ab52202ac02fb25c2b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/convolution.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). 
- """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. - new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/ctc.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/ctc.py deleted file mode 100644 index 3dfcbaa324ffc26afa9ceaeb75007eb312546326..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/ctc.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -import torch -import torch.nn.functional as F -from typeguard import check_argument_types - - -class CTC(torch.nn.Module): - """CTC module""" - def __init__( - self, - odim: int, - encoder_output_size: int, - dropout_rate: float = 0.0, - reduce: bool = True, - ): - """ Construct CTC module - Args: - odim: dimension of outputs - encoder_output_size: number of encoder projection units - dropout_rate: dropout rate (0.0 ~ 1.0) - reduce: reduce the CTC loss into a scalar - """ - assert check_argument_types() - super().__init__() - eprojs = encoder_output_size - self.dropout_rate = dropout_rate - self.ctc_lo = torch.nn.Linear(eprojs, odim) - - reduction_type = "sum" if reduce else "none" - self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) - - def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, - ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: - """Calculate CTC loss. 
- - Args: - hs_pad: batch of padded hidden state sequences (B, Tmax, D) - hlens: batch of lengths of hidden state sequences (B) - ys_pad: batch of padded character id sequence tensor (B, Lmax) - ys_lens: batch of lengths of character sequence (B) - """ - # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) - ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) - # ys_hat: (B, L, D) -> (L, B, D) - ys_hat = ys_hat.transpose(0, 1) - ys_hat = ys_hat.log_softmax(2) - loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) - # Batch-size average - loss = loss / ys_hat.size(1) - return loss - - def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """log_softmax of frame activations - - Args: - Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) - """ - return F.log_softmax(self.ctc_lo(hs_pad), dim=2) - - def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """argmax of frame activations - - Args: - torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: argmax applied 2d tensor (B, Tmax) - """ - return torch.argmax(self.ctc_lo(hs_pad), dim=2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/decoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/decoder.py deleted file mode 100644 index c31853d9e868c99290b8d597f53d9a680202c82c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/decoder.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Decoder definition.""" -from typing import Tuple, List, Optional - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.decoder_layer import DecoderLayer -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.utils.mask import (subsequent_mask, make_pad_mask) - - -class TransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. 
- concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__() - attention_dim = encoder_output_size - - if input_layer == "embed": - self.embed = torch.nn.Sequential( - torch.nn.Embedding(vocab_size, attention_dim), - PositionalEncoding(attention_dim, positional_dropout_rate), - ) - else: - raise ValueError(f"only 'embed' is supported: {input_layer}") - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) - self.use_output_layer = use_output_layer - self.output_layer = torch.nn.Linear(attention_dim, vocab_size) - self.num_blocks = num_blocks - self.decoders = torch.nn.ModuleList([ - DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(self.num_blocks) - ]) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor = torch.empty(0), - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: not used in transformer decoder, in order to unify api - with bidirectional decoder - reverse_weight: not used in transformer decoder, in order to unify - api with bidirectional decode - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - torch.tensor(0.0), in order to unify api with bidirectional decoder - olens: (batch, ) - """ - tgt = ys_in_pad - maxlen = tgt.size(1) - # tgt_mask: (B, 1, L) - tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) - tgt_mask = tgt_mask.to(tgt.device) - # m: (1, L, L) - m = subsequent_mask(tgt_mask.size(-1), - device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m - x, _ = self.embed(tgt) - for layer in self.decoders: - x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, - memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.use_output_layer: - x = self.output_layer(x) - olens = tgt_mask.sum(1) - return x, torch.tensor(0.0), olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - x, _ = self.embed(tgt) - new_cache = [] - for i, decoder in enumerate(self.decoders): - if cache is None: - c = None - else: - c = cache[i] - x, tgt_mask, memory, memory_mask = decoder(x, - tgt_mask, - memory, - memory_mask, - cache=c) - new_cache.append(x) - if self.normalize_before: - y = self.after_norm(x[:, -1]) - else: - y = x[:, -1] - if self.use_output_layer: - y = torch.log_softmax(self.output_layer(y), dim=-1) - return y, new_cache - - -class BiTransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - r_num_blocks: the number of right to left decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - r_num_blocks: int = 0, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - - assert check_argument_types() - super().__init__() - self.left_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - self.right_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - r_num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor, - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), - used for right to left decoder - reverse_weight: used for right to left decoder - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - r_x: x: decoded token score (right to left decoder) - before softmax (batch, maxlen_out, vocab_size) - if use_output_layer is True, - olens: (batch, ) - """ - l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, - ys_in_lens) - r_x = torch.tensor(0.0) - if reverse_weight > 0.0: - r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, - ys_in_lens) - return l_x, r_x, olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - return self.left_decoder.forward_one_step(memory, memory_mask, tgt, - tgt_mask, cache) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/decoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/decoder_layer.py deleted file mode 100644 index 6b52aa6ab730dc51b18f0787e8236ab10c1e9cad..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/decoder_layer.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoder self-attention layer definition.""" -from typing import Optional, Tuple - -import torch -from torch import nn - - -class DecoderLayer(nn.Module): - """Single decoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn (torch.nn.Module): Inter-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. 
- dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's inpu - and output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: nn.Module, - src_attn: nn.Module, - feed_forward: nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an DecoderLayer object.""" - super().__init__() - self.size = size - self.self_attn = self_attn - self.src_attn = src_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.norm3 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear1 = nn.Linear(size + size, size) - self.concat_linear2 = nn.Linear(size + size, size) - else: - self.concat_linear1 = nn.Identity() - self.concat_linear2 = nn.Identity() - - def forward( - self, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - memory: torch.Tensor, - memory_mask: torch.Tensor, - cache: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute decoded features. - - Args: - tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). - tgt_mask (torch.Tensor): Mask for input tensor - (#batch, maxlen_out). - memory (torch.Tensor): Encoded memory - (#batch, maxlen_in, size). - memory_mask (torch.Tensor): Encoded memory mask - (#batch, maxlen_in). - cache (torch.Tensor): cached tensors. - (#batch, maxlen_out - 1, size). - - Returns: - torch.Tensor: Output tensor (#batch, maxlen_out, size). - torch.Tensor: Mask for output tensor (#batch, maxlen_out). - torch.Tensor: Encoded memory (#batch, maxlen_in, size). - torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
- - """ - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - - if cache is None: - tgt_q = tgt - tgt_q_mask = tgt_mask - else: - # compute only the last frame query keeping dim: max_time_out -> 1 - assert cache.shape == ( - tgt.shape[0], - tgt.shape[1] - 1, - self.size, - ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" - tgt_q = tgt[:, -1:, :] - residual = residual[:, -1:, :] - tgt_q_mask = tgt_mask[:, -1:, :] - - if self.concat_after: - tgt_concat = torch.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) - x = residual + self.concat_linear1(tgt_concat) - else: - x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - if self.concat_after: - x_concat = torch.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) - x = residual + self.concat_linear2(x_concat) - else: - x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) - if not self.normalize_before: - x = self.norm2(x) - - residual = x - if self.normalize_before: - x = self.norm3(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm3(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, tgt_mask, memory, memory_mask diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/embedding.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/embedding.py deleted file mode 100644 index 611a927864d93c3ad8357f66c780bf537b2a4d67..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/embedding.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Positonal Encoding Module.""" - -import math -from typing import Tuple, Union - -import torch -import torch.nn.functional as F - -class PositionalEncoding(torch.nn.Module): - """Positional encoding. 
- - :param int d_model: embedding dim - :param float dropout_rate: dropout rate - :param int max_len: maximum input length - - PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) - PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) - """ - def __init__(self, - d_model: int, - dropout_rate: float, - max_len: int = 5000, - reverse: bool = False): - """Construct an PositionalEncoding object.""" - super().__init__() - self.d_model = d_model - self.xscale = math.sqrt(self.d_model) - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - - self.pe = torch.zeros(self.max_len, self.d_model) - position = torch.arange(0, self.max_len, - dtype=torch.float32).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = torch.sin(position * div_term) - self.pe[:, 1::2] = torch.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input. Its shape is (batch, time, ...) - offset (int, torch.tensor): position offset - - Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) - torch.Tensor: for compatibility to RelPositionalEncoding - """ - - self.pe = self.pe.to(x.device) - pos_emb = self.position_encoding(offset, x.size(1), False) - x = x * self.xscale + pos_emb - return self.dropout(x), self.dropout(pos_emb) - - def position_encoding(self, offset: Union[int, torch.Tensor], size: int, - apply_dropout: bool = True) -> torch.Tensor: - """ For getting encoding in a streaming fashion - - Attention!!!!! - we apply dropout only once at the whole utterance level in a none - streaming way, but will call this function several times with - increasing input size in a streaming scenario, so the dropout will - be applied several times. - - Args: - offset (int or torch.tensor): start offset - size (int): required size of position encoding - - Returns: - torch.Tensor: Corresponding encoding - """ - # How to subscript a Union type: - # https://github.com/pytorch/pytorch/issues/69434 - if isinstance(offset, int): - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - else: # for batched streaming decoding on GPU - assert torch.max(offset) + size < self.max_len - index = offset.unsqueeze(1) + \ - torch.arange(0, size).to(offset.device) # B X T - flag = index > 0 - # remove negative offset - index = index * flag - pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model - - if apply_dropout: - pos_emb = self.dropout(pos_emb) - return pos_emb - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding module. - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - """ - def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): - """Initialize class.""" - super().__init__(d_model, dropout_rate, max_len, reverse=True) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Compute positional encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). 
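The sin/cos table built above follows PE(pos, 2i) = sin(pos / 10000^(2i/d_model)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model)). A tiny standalone instantiation (d_model=4, first few positions), not taken from the repository:

```python
import math
import torch

d_model, max_len = 4, 3
position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32)
                     * -(math.log(10000.0) / d_model))   # [1.0, 0.01]
pe = torch.zeros(max_len, d_model)
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
print(pe[1])  # position 1: [sin(1), cos(1), sin(0.01), cos(0.01)]
```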
- Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Positional embedding tensor (1, time, `*`). - """ - self.pe = self.pe.to(x.device) - x = x * self.xscale - pos_emb = self.position_encoding(offset, x.size(1), False) - return self.dropout(x), self.dropout(pos_emb) - - -class NoPositionalEncoding(torch.nn.Module): - """ No position encoding - """ - def __init__(self, d_model: int, dropout_rate: float): - super().__init__() - self.d_model = d_model - self.dropout = torch.nn.Dropout(p=dropout_rate) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """ Just return zero vector for interface compatibility - """ - pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) - return self.dropout(x), pos_emb - - def position_encoding( - self, offset: Union[int, torch.Tensor], size: int) -> torch.Tensor: - return torch.zeros(1, size, self.d_model) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/encoder.py deleted file mode 100644 index bb2ec65827548bd1242cb3b367cb3983c2de6119..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/encoder.py +++ /dev/null @@ -1,462 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder definition.""" -from typing import Tuple - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.convolution import ConvolutionModule -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.encoder_layer import TransformerEncoderLayer -from wenet.transformer.encoder_layer import ConformerEncoderLayer -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class BaseEncoder(torch.nn.Module): - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ - Args: - input_size (int): input dim - output_size (int): dimension of attention - attention_heads (int): the number of heads of multi head attention - linear_units (int): the hidden units number of position-wise feed - forward - num_blocks (int): the number of decoder blocks - dropout_rate (float): dropout rate - attention_dropout_rate (float): dropout rate in attention - positional_dropout_rate (float): dropout rate after adding - positional encoding - input_layer (str): input layer type. - optional [linear, conv2d, conv2d6, conv2d8] - pos_enc_layer_type (str): Encoder positional encoding layer type. - opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] - normalize_before (bool): - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after (bool): whether to concat attention layer's input - and output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - static_chunk_size (int): chunk size for static chunk training and - decoding - use_dynamic_chunk (bool): whether use dynamic chunk size for - training or not, You can only use fixed chunk(chunk_size > 0) - or dyanmic chunk size(use_dynamic_chunk = True) - global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module - use_dynamic_left_chunk (bool): whether use dynamic left chunk in - dynamic chunk training - """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks - - -class TransformerEncoder(BaseEncoder): - """Transformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ Construct TransformerEncoder - - See Encoder for the meaning of each parameter. 
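A worked instance of the windowing arithmetic in forward_chunk_by_chunk above, assuming the Conv2dSubsampling4 front end (subsampling_rate=4, right_context=6), decoding_chunk_size=16 and 200 input frames; the numbers are illustrative only:

```python
subsampling, right_context = 4, 6
decoding_chunk_size, num_frames = 16, 200
context = right_context + 1                                          # 7, current frame included
stride = subsampling * decoding_chunk_size                           # 64 frames advanced per chunk
decoding_window = (decoding_chunk_size - 1) * subsampling + context  # 67 frames fed per chunk
windows = [(cur, min(cur + decoding_window, num_frames))
           for cur in range(0, num_frames - context + 1, stride)]
print(stride, decoding_window, windows)
# 64 67 [(0, 67), (64, 131), (128, 195), (192, 200)]
```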
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - self.encoders = torch.nn.ModuleList([ - TransformerEncoderLayer( - output_size, - MultiHeadedAttention(attention_heads, output_size, - attention_dropout_rate), - PositionwiseFeedForward(output_size, linear_units, - dropout_rate), dropout_rate, - normalize_before, concat_after) for _ in range(num_blocks) - ]) - - -class ConformerEncoder(BaseEncoder): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - positionwise_conv_kernel_size: int = 1, - macaron_style: bool = True, - selfattention_layer_type: str = "rel_selfattn", - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - ): - """Construct ConformerEncoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - positionwise_conv_kernel_size (int): Kernel size of positionwise - conv1d layer. - macaron_style (bool): Whether to use macaron style for - positionwise layer. - selfattention_layer_type (str): Encoder attention layer type, - the parameter has no effect now, it's just for configure - compatibility. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, - cnn_module_norm, causal) - - self.encoders = torch.nn.ModuleList([ - ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(num_blocks) - ]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/encoder_layer.py deleted file mode 100644 index 6b4629a6802a90422fa1494f82f46488f2553c16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/encoder_layer.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple - -import torch -from torch import nn - - -class TransformerEncoderLayer(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: torch.nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): just for interface compatibility - to ConformerEncoderLayer - mask_pad (torch.Tensor): does not used in transformer layer, - just for unified api with conformer. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2), not used here, it's for interface - compatibility to ConformerEncoderLayer. - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). - - """ - residual = x - if self.normalize_before: - x = self.norm1(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, cache=att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - return x, mask, new_att_cache, fake_cnn_cache - - -class ConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/label_smoothing_loss.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/label_smoothing_loss.py deleted file mode 100644 index 428fedcb0eb4345cd1361c97008a9afcd94ac171..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/label_smoothing_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Label smoothing module.""" - -import torch -from torch import nn - - -class LabelSmoothingLoss(nn.Module): - """Label-smoothing loss. - - In a standard CE loss, the label's data distribution is: - [0,1,2] -> - [ - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0], - ] - - In the smoothing version CE Loss,some probabilities - are taken from the true label prob (1.0) and are divided - among other labels. - - e.g. 
- smoothing=0.1 - [0,1,2] -> - [ - [0.9, 0.05, 0.05], - [0.05, 0.9, 0.05], - [0.05, 0.05, 0.9], - ] - - Args: - size (int): the number of class - padding_idx (int): padding class id which will be ignored for loss - smoothing (float): smoothing rate (0.0 means the conventional CE) - normalize_length (bool): - normalize loss by sequence length if True - normalize loss by batch size if False - """ - def __init__(self, - size: int, - padding_idx: int, - smoothing: float, - normalize_length: bool = False): - """Construct an LabelSmoothingLoss object.""" - super(LabelSmoothingLoss, self).__init__() - self.criterion = nn.KLDivLoss(reduction="none") - self.padding_idx = padding_idx - self.confidence = 1.0 - smoothing - self.smoothing = smoothing - self.size = size - self.normalize_length = normalize_length - - def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """Compute loss between x and target. - - The model outputs and data labels tensors are flatten to - (batch*seqlen, class) shape and a mask is applied to the - padding part which should not be calculated for loss. - - Args: - x (torch.Tensor): prediction (batch, seqlen, class) - target (torch.Tensor): - target signal masked with self.padding_id (batch, seqlen) - Returns: - loss (torch.Tensor) : The KL loss, scalar float value - """ - assert x.size(2) == self.size - batch_size = x.size(0) - x = x.view(-1, self.size) - target = target.view(-1) - # use zeros_like instead of torch.no_grad() for true_dist, - # since no_grad() can not be exported by JIT - true_dist = torch.zeros_like(x) - true_dist.fill_(self.smoothing / (self.size - 1)) - ignore = target == self.padding_idx # (B,) - total = len(target) - ignore.sum().item() - target = target.masked_fill(ignore, 0) # avoid -1 index - true_dist.scatter_(1, target.unsqueeze(1), self.confidence) - kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) - denom = total if self.normalize_length else batch_size - return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/positionwise_feed_forward.py deleted file mode 100644 index 73ba239e3f1e68f65650961f2c4ee6758729a06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/positionwise_feed_forward.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. 
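The smoothed target distribution that LabelSmoothingLoss.forward builds above can be reproduced in isolation; a quick check with size=3 and smoothing=0.1, matching the example in the docstring:

```python
import torch

size, smoothing = 3, 0.1
confidence = 1.0 - smoothing
target = torch.tensor([0, 1, 2])
true_dist = torch.full((target.size(0), size), smoothing / (size - 1))
true_dist.scatter_(1, target.unsqueeze(1), confidence)
print(true_dist)
# tensor([[0.9000, 0.0500, 0.0500],
#         [0.0500, 0.9000, 0.0500],
#         [0.0500, 0.0500, 0.9000]])
```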
- dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU()): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/subsampling.py deleted file mode 100644 index 5f2823eedf0e623188d6af6680fa50ca44b47877..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/subsampling.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch - - -class BaseSubsampling(torch.nn.Module): - def __init__(self): - super().__init__() - self.right_context = 0 - self.subsampling_rate = 1 - - def position_encoding(self, offset: Union[int, torch.Tensor], - size: int) -> torch.Tensor: - return self.pos_enc.position_encoding(offset, size) - - -class LinearNoSubsampling(BaseSubsampling): - """Linear transform the input without subsampling - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an linear object.""" - super().__init__() - self.out = torch.nn.Sequential( - torch.nn.Linear(idim, odim), - torch.nn.LayerNorm(odim, eps=1e-5), - torch.nn.Dropout(dropout_rate), - ) - self.pos_enc = pos_enc_class - self.right_context = 0 - self.subsampling_rate = 1 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Input x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: linear input tensor (#batch, time', odim), - where time' = time . - torch.Tensor: linear input mask (#batch, 1, time'), - where time' = time . - - """ - x = self.out(x) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask - - -class Conv2dSubsampling4(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). 
- - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 4. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 4. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] - - -class Conv2dSubsampling6(BaseSubsampling): - """Convolutional 2D subsampling (to 1/6 length). - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (torch.nn.Module): Custom position encoding layer. - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling6 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 5, 3), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), - odim) - self.pos_enc = pos_enc_class - # 10 = (3 - 1) * 1 + (5 - 1) * 2 - self.subsampling_rate = 6 - self.right_context = 10 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 6. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 6. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] - - -class Conv2dSubsampling8(BaseSubsampling): - """Convolutional 2D subsampling (to 1/8 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
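A shape check for the Conv2dSubsampling4 front end defined above, with illustrative idim/odim/T values: two stride-2 convolutions reduce 100 input frames to 24, and the flattened channel-by-feature size matches the odim * (((idim - 1) // 2 - 1) // 2) term used for the linear projection:

```python
import torch

idim, odim, T = 80, 256, 100
conv = torch.nn.Sequential(
    torch.nn.Conv2d(1, odim, 3, 2), torch.nn.ReLU(),
    torch.nn.Conv2d(odim, odim, 3, 2), torch.nn.ReLU(),
)
x = torch.randn(1, 1, T, idim)                    # (b, c=1, t, f)
y = conv(x)
print(y.shape)                                    # torch.Size([1, 256, 24, 19])
print(odim * (((idim - 1) // 2 - 1) // 2))        # 4864 = 256 * 19, Linear input size
```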
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling8 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear( - odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) - self.pos_enc = pos_enc_class - self.subsampling_rate = 8 - # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 - self.right_context = 14 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 8. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 8. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/swish.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/swish.py deleted file mode 100644 index b4250f5c93104f38958d145572e363256e03fcb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/transformer/swish.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) -# 2020 Northwestern Polytechnical University (Pengcheng Guo) -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Swish() activation function for Conformer.""" - -import torch - - -class Swish(torch.nn.Module): - """Construct an Swish object.""" - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Return Swish activation function.""" - return x * torch.sigmoid(x) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/checkpoint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/checkpoint.py deleted file mode 100644 index 8e0c413c79c34cd667240357d7ef9eab816a885c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/checkpoint.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import re - -import yaml -import torch -from collections import OrderedDict - -import datetime - - -def load_checkpoint(model: torch.nn.Module, path: str) -> dict: - if torch.cuda.is_available(): - logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) - checkpoint = torch.load(path) - else: - logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) - checkpoint = torch.load(path, map_location='cpu') - model.load_state_dict(checkpoint, strict=False) - info_path = re.sub('.pt$', '.yaml', path) - configs = {} - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - return configs - - -def save_checkpoint(model: torch.nn.Module, path: str, infos=None): - ''' - Args: - infos (dict or None): any info you want to save. - ''' - logging.info('Checkpoint: save to checkpoint %s' % path) - if isinstance(model, torch.nn.DataParallel): - state_dict = model.module.state_dict() - elif isinstance(model, torch.nn.parallel.DistributedDataParallel): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save(state_dict, path) - info_path = re.sub('.pt$', '.yaml', path) - if infos is None: - infos = {} - infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') - with open(info_path, 'w') as fout: - data = yaml.dump(infos) - fout.write(data) - - -def filter_modules(model_state_dict, modules): - new_mods = [] - incorrect_mods = [] - mods_model = model_state_dict.keys() - for mod in modules: - if any(key.startswith(mod) for key in mods_model): - new_mods += [mod] - else: - incorrect_mods += [mod] - if incorrect_mods: - logging.warning( - "module(s) %s don't match or (partially match) " - "available modules in model.", - incorrect_mods, - ) - logging.warning("for information, the existing modules in model are:") - logging.warning("%s", mods_model) - - return new_mods - - -def load_trained_modules(model: torch.nn.Module, args: None): - # Load encoder modules with pre-trained model(s). 
- enc_model_path = args.enc_init - enc_modules = args.enc_init_mods - main_state_dict = model.state_dict() - logging.warning("model(s) found for pre-initialization") - if os.path.isfile(enc_model_path): - logging.info('Checkpoint: loading from checkpoint %s for CPU' % - enc_model_path) - model_state_dict = torch.load(enc_model_path, map_location='cpu') - modules = filter_modules(model_state_dict, enc_modules) - partial_state_dict = OrderedDict() - for key, value in model_state_dict.items(): - if any(key.startswith(m) for m in modules): - partial_state_dict[key] = value - main_state_dict.update(partial_state_dict) - else: - logging.warning("model was not found : %s", enc_model_path) - - model.load_state_dict(main_state_dict) - configs = {} - return configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/cmvn.py deleted file mode 100644 index 3101c619f54991c947124f393f3459c317356a2f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/cmvn.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import math - -import numpy as np - - -def _load_json_cmvn(json_cmvn_file): - """ Load the json format cmvn stats file and calculate cmvn - - Args: - json_cmvn_file: cmvn stats file in json format - - Returns: - a numpy array of [means, vars] - """ - with open(json_cmvn_file) as f: - cmvn_stats = json.load(f) - - means = cmvn_stats['mean_stat'] - variance = cmvn_stats['var_stat'] - count = cmvn_stats['frame_num'] - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def _load_kaldi_cmvn(kaldi_cmvn_file): - """ Load the kaldi format cmvn stats file and calculate cmvn - - Args: - kaldi_cmvn_file: kaldi text style global cmvn file, which - is generated by: - compute-cmvn-stats --binary=false scp:feats.scp global_cmvn - - Returns: - a numpy array of [means, vars] - """ - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def load_cmvn(cmvn_file, is_json): - if is_json: - cmvn = _load_json_cmvn(cmvn_file) - else: - cmvn = _load_kaldi_cmvn(cmvn_file) - return cmvn[0], cmvn[1] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/common.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/common.py deleted file mode 100644 index 74238d59aefbf227fe6b811703af17550bc7f8f0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/common.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -"""Unility functions for Transformer.""" - -import math -from typing import List, Tuple - -import torch -from torch.nn.utils.rnn import pad_sequence - -IGNORE_ID = -1 - - -def pad_list(xs: List[torch.Tensor], pad_value: int): - """Perform padding for the list of tensors. - - Args: - xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 
- pad_value (float): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tmax, `*`). - - Examples: - >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) - - """ - n_batch = len(xs) - max_len = max([x.size(0) for x in xs]) - pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) - pad = pad.fill_(pad_value) - for i in range(n_batch): - pad[i, :xs[i].size(0)] = xs[i] - - return pad - - -def add_blank(ys_pad: torch.Tensor, blank: int, - ignore_id: int) -> torch.Tensor: - """ Prepad blank for transducer predictor - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - blank (int): index of - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> blank = 0 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in = add_blank(ys_pad, 0, -1) - >>> ys_in - tensor([[0, 1, 2, 3, 4, 5], - [0, 4, 5, 6, 0, 0], - [0, 7, 8, 9, 0, 0]]) - """ - bs = ys_pad.size(0) - _blank = torch.tensor([blank], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _blank = _blank.repeat(bs).unsqueeze(1) # [bs,1] - out = torch.cat([_blank, ys_pad], dim=1) # [bs, Lmax+1] - return torch.where(out == ignore_id, blank, out) - - -def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, - ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Add and labels. - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - sos (int): index of - eos (int): index of - ignore_id (int): index of padding - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - ys_out (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> sos_id = 10 - >>> eos_id = 11 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) - >>> ys_in - tensor([[10, 1, 2, 3, 4, 5], - [10, 4, 5, 6, 11, 11], - [10, 7, 8, 9, 11, 11]]) - >>> ys_out - tensor([[ 1, 2, 3, 4, 5, 11], - [ 4, 5, 6, 11, -1, -1], - [ 7, 8, 9, 11, -1, -1]]) - """ - _sos = torch.tensor([sos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _eos = torch.tensor([eos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys - ys_in = [torch.cat([_sos, y], dim=0) for y in ys] - ys_out = [torch.cat([y, _eos], dim=0) for y in ys] - return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) - - -def reverse_pad_list(ys_pad: torch.Tensor, - ys_lens: torch.Tensor, - pad_value: float = -1.0) -> torch.Tensor: - """Reverse padding for the list of tensors. - - Args: - ys_pad (tensor): The padded tensor (B, Tokenmax). - ys_lens (tensor): The lens of token seqs (B) - pad_value (int): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tokenmax). - - Examples: - >>> x - tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) - >>> pad_list(x, 0) - tensor([[4, 3, 2, 1], - [7, 6, 5, 0], - [9, 8, 0, 0]]) - - """ - r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) - for y, i in zip(ys_pad, ys_lens)], True, - pad_value) - return r_ys_pad - - -def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, - ignore_label: int) -> float: - """Calculate accuracy. - - Args: - pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 
- pad_targets (LongTensor): Target label tensors (B, Lmax). - ignore_label (int): Ignore label id. - - Returns: - float: Accuracy value (0.0 - 1.0). - - """ - pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), - pad_outputs.size(1)).argmax(2) - mask = pad_targets != ignore_label - numerator = torch.sum( - pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - denominator = torch.sum(mask) - return float(numerator) / float(denominator) - - -def get_rnn(rnn_type: str) -> torch.nn.Module: - assert rnn_type in ["rnn", "lstm", "gru"] - if rnn_type == "rnn": - return torch.nn.RNN - elif rnn_type == "lstm": - return torch.nn.LSTM - else: - return torch.nn.GRU - - -def get_activation(act): - """Return activation function.""" - # Lazy load to avoid unused import - from wenet.transformer.swish import Swish - - activation_funcs = { - "hardtanh": torch.nn.Hardtanh, - "tanh": torch.nn.Tanh, - "relu": torch.nn.ReLU, - "selu": torch.nn.SELU, - "swish": getattr(torch.nn, "SiLU", Swish), - "gelu": torch.nn.GELU - } - - return activation_funcs[act]() - - -def get_subsample(config): - input_layer = config["encoder_conf"]["input_layer"] - assert input_layer in ["conv2d", "conv2d6", "conv2d8"] - if input_layer == "conv2d": - return 4 - elif input_layer == "conv2d6": - return 6 - elif input_layer == "conv2d8": - return 8 - - -def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - if hyp[cur] != 0: - new_hyp.append(hyp[cur]) - prev = cur - while cur < len(hyp) and hyp[cur] == hyp[prev]: - cur += 1 - return new_hyp - - -def replace_duplicates_with_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - new_hyp.append(hyp[cur]) - prev = cur - cur += 1 - while cur < len(hyp) and hyp[cur] == hyp[prev] and hyp[cur] != 0: - new_hyp.append(0) - cur += 1 - return new_hyp - - -def log_add(args: List[int]) -> float: - """ - Stable log add - """ - if all(a == -float('inf') for a in args): - return -float('inf') - a_max = max(args) - lsp = math.log(sum(math.exp(a - a_max) for a in args)) - return a_max + lsp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/config.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/config.py deleted file mode 100644 index 50170ced44534d3ee6532a2f87fcd78c5148f7e7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 Shaoshang Qi -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import copy - -def override_config(configs, override_list): - new_configs = copy.deepcopy(configs) - for item in override_list: - arr = item.split() - if len(arr) != 2: - print(f"the overrive {item} format not correct, skip it") - continue - keys = arr[0].split('.') - s_configs = new_configs - for i, key in enumerate(keys): - if key not in s_configs: - print(f"the overrive {item} format not correct, skip it") - if i == len(keys) - 1: - param_type = type(s_configs[key]) - if param_type != bool: - s_configs[key] = param_type(arr[1]) - else: - s_configs[key] = arr[1] in ['true', 'True'] - print(f"override {arr[0]} with {arr[1]}") - else: - s_configs = s_configs[key] - return new_configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/ctc_util.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/ctc_util.py deleted file mode 100644 index 73b8fb272ac153dd6d05207f352ebcf1ad14890d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/ctc_util.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch - -def insert_blank(label, blank_id=0): - """Insert blank token between every two label token.""" - label = np.expand_dims(label, 1) - blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id - label = np.concatenate([blanks, label], axis=1) - label = label.reshape(-1) - label = np.append(label, label[0]) - return label - -def forced_align(ctc_probs: torch.Tensor, - y: torch.Tensor, - blank_id=0) -> list: - """ctc forced alignment. 
- - Args: - torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) - torch.Tensor y: id sequence tensor 1d tensor (L) - int blank_id: blank symbol index - Returns: - torch.Tensor: alignment result - """ - y_insert_blank = insert_blank(y, blank_id) - - log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) - log_alpha = log_alpha - float('inf') # log of zero - state_path = (torch.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 - ) # state path - - # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] - - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): - if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ - s] == y_insert_blank[s - 2]: - candidates = torch.tensor( - [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) - prev_state = [s, s - 1] - else: - candidates = torch.tensor([ - log_alpha[t - 1, s], - log_alpha[t - 1, s - 1], - log_alpha[t - 1, s - 2], - ]) - prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] - state_path[t, s] = prev_state[torch.argmax(candidates)] - - state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) - - candidates = torch.tensor([ - log_alpha[-1, len(y_insert_blank) - 1], - log_alpha[-1, len(y_insert_blank) - 2] - ]) - prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] - state_seq[-1] = prev_state[torch.argmax(candidates)] - for t in range(ctc_probs.size(0) - 2, -1, -1): - state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] - - output_alignment = [] - for t in range(0, ctc_probs.size(0)): - output_alignment.append(y_insert_blank[state_seq[t, 0]]) - - return output_alignment diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/executor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/executor.py deleted file mode 100644 index dc0b69e6e32055566a0e8c41945f6979276e5672..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/executor.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -from contextlib import nullcontext - -# if your python version < 3.7 use the below one -# from contextlib import suppress as nullcontext -import torch -from torch.nn.utils import clip_grad_norm_ - - -class Executor: - - def __init__(self): - self.step = 0 - - def train(self, model, optimizer, scheduler, data_loader, device, writer, - args, scaler): - ''' Train one epoch - ''' - model.train() - clip = args.get('grad_clip', 50.0) - log_interval = args.get('log_interval', 10) - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - accum_grad = args.get('accum_grad', 1) - is_distributed = args.get('is_distributed', True) - use_amp = args.get('use_amp', False) - logging.info('using accumulate grad, new batch size is {} times' - ' larger than before'.format(accum_grad)) - if use_amp: - assert scaler is not None - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - model_context = model.join - else: - model_context = nullcontext - num_seen_utts = 0 - with model_context(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - context = None - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if is_distributed and batch_idx % accum_grad != 0: - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - with context(): - # autocast context - # The more details about amp can be found in - # https://pytorch.org/docs/stable/notes/amp_examples.html - with torch.cuda.amp.autocast(scaler is not None): - loss_dict = model(feats, feats_lengths, target, - target_lengths) - loss = loss_dict['loss'] / accum_grad - if use_amp: - scaler.scale(loss).backward() - else: - loss.backward() - - num_seen_utts += num_utts - if batch_idx % accum_grad == 0: - if rank == 0 and writer is not None: - writer.add_scalar('train_loss', loss, self.step) - # Use mixed precision training - if use_amp: - scaler.unscale_(optimizer) - grad_norm = clip_grad_norm_(model.parameters(), clip) - # Must invoke scaler.update() if unscale_() is used in - # the iteration to avoid the following error: - # RuntimeError: unscale_() has already been called - # on this optimizer since the last update(). - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). 
- scaler.step(optimizer) - scaler.update() - else: - grad_norm = clip_grad_norm_(model.parameters(), clip) - if torch.isfinite(grad_norm): - optimizer.step() - optimizer.zero_grad() - scheduler.step() - self.step += 1 - if batch_idx % log_interval == 0: - lr = optimizer.param_groups[0]['lr'] - log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, - loss.item() * accum_grad) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'lr {:.8f} rank {}'.format(lr, rank) - logging.debug(log_str) - - def cv(self, model, data_loader, device, args): - ''' Cross validation on - ''' - model.eval() - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - log_interval = args.get('log_interval', 10) - # in order to avoid division by 0 - num_seen_utts = 1 - total_loss = 0.0 - with torch.no_grad(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - loss_dict = model(feats, feats_lengths, target, target_lengths) - loss = loss_dict['loss'] - if torch.isfinite(loss): - num_seen_utts += num_utts - total_loss += loss.item() * num_utts - if batch_idx % log_interval == 0: - log_str = 'CV Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, loss.item()) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'history loss {:.6f}'.format(total_loss / - num_seen_utts) - log_str += ' rank {}'.format(rank) - logging.debug(log_str) - return total_loss, num_seen_utts diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/file_utils.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/file_utils.py deleted file mode 100644 index 7b7e516cc61f759267f4ef09309ff0b45110a0c1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/file_utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re - - -def read_lists(list_file): - lists = [] - with open(list_file, 'r', encoding='utf8') as fin: - for line in fin: - lists.append(line.strip()) - return lists - - -def read_non_lang_symbols(non_lang_sym_path): - """read non-linguistic symbol from file. - - The file format is like below: - - {NOISE}\n - {BRK}\n - ... - - - Args: - non_lang_sym_path: non-linguistic symbol file path, None means no any - syms. 
- - """ - if non_lang_sym_path is None: - return None - else: - syms = read_lists(non_lang_sym_path) - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - for sym in syms: - if non_lang_syms_pattern.fullmatch(sym) is None: - class BadSymbolFormat(Exception): - pass - raise BadSymbolFormat( - "Non-linguistic symbols should be " - "formatted in {xxx}//[xxx], consider" - " modify '%s' to meet the requirment. " - "More details can be found in discussions here : " - "https://github.com/wenet-e2e/wenet/pull/819" % (sym)) - return syms - - -def read_symbol_table(symbol_table_file): - symbol_table = {} - with open(symbol_table_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - symbol_table[arr[0]] = int(arr[1]) - return symbol_table diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/init_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/init_model.py deleted file mode 100644 index 4a008183ee25cd88b2fa25d93bdc3f9e3a55d31a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/init_model.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -from wenet.transducer.joint import TransducerJoint -from wenet.transducer.predictor import (ConvPredictor, EmbeddingPredictor, - RNNPredictor) -from wenet.transducer.transducer import Transducer -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.cmvn import GlobalCMVN -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.encoder import ConformerEncoder, TransformerEncoder -from wenet.squeezeformer.encoder import SqueezeformerEncoder -from wenet.efficient_conformer.encoder import EfficientConformerEncoder -from wenet.utils.cmvn import load_cmvn - - -def init_model(configs): - if configs['cmvn_file'] is not None: - mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) - global_cmvn = GlobalCMVN( - torch.from_numpy(mean).float(), - torch.from_numpy(istd).float()) - else: - global_cmvn = None - - input_dim = configs['input_dim'] - vocab_size = configs['output_dim'] - - encoder_type = configs.get('encoder', 'conformer') - decoder_type = configs.get('decoder', 'bitransformer') - - if encoder_type == 'conformer': - encoder = ConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'squeezeformer': - encoder = SqueezeformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'efficientConformer': - encoder = EfficientConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf'], - **configs['encoder_conf']['efficient_conf'] - if 'efficient_conf' in - configs['encoder_conf'] else {}) - else: - encoder = TransformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - if decoder_type == 'transformer': - decoder = TransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - else: - assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 - assert configs['decoder_conf']['r_num_blocks'] > 0 - decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - ctc = CTC(vocab_size, encoder.output_size()) - - # Init joint CTC/Attention or Transducer model - if 'predictor' in configs: - predictor_type = configs.get('predictor', 'rnn') - if predictor_type == 'rnn': - predictor = RNNPredictor(vocab_size, **configs['predictor_conf']) - elif predictor_type == 'embedding': - predictor = EmbeddingPredictor(vocab_size, - **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - elif predictor_type == 'conv': - predictor = ConvPredictor(vocab_size, **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - else: - raise NotImplementedError( - "only rnn, embedding and conv type support now") - configs['joint_conf']['enc_output_size'] = configs['encoder_conf'][ - 'output_size'] - configs['joint_conf']['pred_output_size'] = configs['predictor_conf'][ - 'output_size'] - joint = TransducerJoint(vocab_size, **configs['joint_conf']) - model = Transducer(vocab_size=vocab_size, - blank=0, - predictor=predictor, - encoder=encoder, - attention_decoder=decoder, - joint=joint, - ctc=ctc, - **configs['model_conf']) - else: - model = ASRModel(vocab_size=vocab_size, - encoder=encoder, - decoder=decoder, - ctc=ctc, - **configs['model_conf']) - return model diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/mask.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/mask.py deleted file mode 100644 index 2985006ab2bc2d27a9b8adaeb863cc44ca6a0d24..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/mask.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -''' -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. - - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - ret = torch.ones(size, size, device=device, dtype=torch.bool) - return torch.tril(ret) -''' - -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. 
- - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - arange = torch.arange(size, device=device) - mask = arange.expand(size, size) - arange = arange.unsqueeze(-1) - mask = mask <= arange - return mask - - -def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size) with chunk size, - this is for streaming encoder - - Args: - size (int): size of mask - chunk_size (int): size of chunk - num_left_chunks (int): number of left chunks - <0: use full chunk - >=0: use num_left_chunks - device (torch.device): "cpu" or "cuda" or torch.Tensor.device - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_chunk_mask(4, 2) - [[1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]] - """ - ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) - ret[i, start:ending] = True - return ret - - -def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, - use_dynamic_chunk: bool, - use_dynamic_left_chunk: bool, - decoding_chunk_size: int, static_chunk_size: int, - num_decoding_left_chunks: int): - """ Apply optional mask for encoder. - - Args: - xs (torch.Tensor): padded input, (B, L, D), L for max length - mask (torch.Tensor): mask for xs, (B, 1, L) - use_dynamic_chunk (bool): whether to use dynamic chunk or not - use_dynamic_left_chunk (bool): whether to use dynamic left chunk for - training. - decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - static_chunk_size (int): chunk size for static chunk training/decoding - if it's greater than 0, if use_dynamic_chunk is true, - this parameter will be ignored - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. - >=0: use num_decoding_left_chunks - <0: use all left chunks - - Returns: - torch.Tensor: chunk mask of the input xs. - """ - # Whether to use chunk mask or not - if use_dynamic_chunk: - max_len = xs.size(1) - if decoding_chunk_size < 0: - chunk_size = max_len - num_left_chunks = -1 - elif decoding_chunk_size > 0: - chunk_size = decoding_chunk_size - num_left_chunks = num_decoding_left_chunks - else: - # chunk size is either [1, 25] or full context(max_len). - # Since we use 4 times subsampling and allow up to 1s(100 frames) - # delay, the maximum frame is 100 / 4 = 25. 
- chunk_size = torch.randint(1, max_len, (1, )).item() - num_left_chunks = -1 - if chunk_size > max_len // 2: - chunk_size = max_len - else: - chunk_size = chunk_size % 25 + 1 - if use_dynamic_left_chunk: - max_left_chunks = (max_len - 1) // chunk_size - num_left_chunks = torch.randint(0, max_left_chunks, - (1, )).item() - chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - elif static_chunk_size > 0: - num_left_chunks = num_decoding_left_chunks - chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - else: - chunk_masks = masks - return chunk_masks - - -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: Mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - batch_size = lengths.size(0) - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = torch.arange(0, - max_len, - dtype=torch.int64, - device=lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand - return mask - - -def make_non_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """Make mask tensor containing indices of non-padded part. - - The sequences in a batch may have different lengths. To enable - batch computing, padding is need to make all sequence in same - size. To avoid the padding part pass value to context dependent - block such as attention or convolution , this padding part is - masked. - - This pad_mask is used in both encoder and decoder. - - 1 for non-padded part and 0 for padded part. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] - """ - return ~make_pad_mask(lengths) - - -def mask_finished_scores(score: torch.Tensor, - flag: torch.Tensor) -> torch.Tensor: - """ - If a sequence is finished, we only allow one alive branch. This function - aims to give one branch a zero score and the rest -inf score. - - Args: - score (torch.Tensor): A real value array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size, beam_size). 
- """ - beam_size = score.size(-1) - zero_mask = torch.zeros_like(flag, dtype=torch.bool) - if beam_size > 1: - unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])), - dim=1) - finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])), - dim=1) - else: - unfinished = zero_mask - finished = flag - score.masked_fill_(unfinished, -float('inf')) - score.masked_fill_(finished, 0) - return score - - -def mask_finished_preds(pred: torch.Tensor, flag: torch.Tensor, - eos: int) -> torch.Tensor: - """ - If a sequence is finished, all of its branch should be - - Args: - pred (torch.Tensor): A int array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size). - """ - beam_size = pred.size(-1) - finished = flag.repeat([1, beam_size]) - return pred.masked_fill_(finished, eos) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/scheduler.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/scheduler.py deleted file mode 100644 index c418a731dec0041a238787bbba23102dba8db5e5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/s0/wenet/utils/scheduler.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -# NeMo(https://github.com/NVIDIA/NeMo) - -from typing import Union - -import math -import warnings -import torch -from torch.optim.lr_scheduler import _LRScheduler - -from typeguard import check_argument_types - - -class WarmupLR(_LRScheduler): - """The WarmupLR scheduler - - This scheduler is almost same as NoamLR Scheduler except for following - difference: - - NoamLR: - lr = optimizer.lr * model_size ** -0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - WarmupLR: - lr = optimizer.lr * warmup_step ** 0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - - Note that the maximum lr equals to optimizer.lr in this scheduler. 
- - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_steps: Union[int, float] = 25000, - last_epoch: int = -1, - ): - assert check_argument_types() - self.warmup_steps = warmup_steps - - # __init__() must be invoked before setting field - # because step() is also invoked in __init__() - super().__init__(optimizer, last_epoch) - - def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" - - def get_lr(self): - step_num = self.last_epoch + 1 - if self.warmup_steps == 0: - return [ - lr * step_num ** -0.5 - for lr in self.base_lrs - ] - else: - return [ - lr - * self.warmup_steps ** 0.5 - * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) - for lr in self.base_lrs - ] - - def set_step(self, step: int): - self.last_epoch = step - - -class WarmupPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1): - assert not (warmup_steps is not None and warmup_ratio is not None),\ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class SquareRootConstantPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use particular number of step or ratio" - assert constant_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. 
- self.max_steps = max_steps - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.constant_lr = 1 / (constant_steps ** 0.5) - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.constant_steps: - return [self.constant_lr for _ in self.base_lrs] - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class WarmupHoldPolicy(WarmupPolicy): - """Variant of WarmupPolicy which maintains high - learning rate for a defined number of steps. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - hold_steps=None, - hold_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (hold_steps is not None and hold_ratio is not None), \ - "Either use particular number of step or ratio" - assert hold_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - self.min_lr = min_lr - self._last_warmup_lr = 0.0 - - # Necessary to duplicate as class attributes are hidden in inner class - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if hold_steps is not None: - self.hold_steps = hold_steps + self.warmup_steps - elif hold_ratio is not None: - self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps - else: - self.hold_steps = 0 - - super().__init__( - optimizer, - warmup_steps=warmup_steps, - warmup_ratio=warmup_ratio, - max_steps=max_steps, - last_epoch=last_epoch, - min_lr=min_lr, - ) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed by the scheduler," - " " "please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup phase - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - # Hold phase - if (step >= self.warmup_steps) and (step < self.hold_steps): - return self.base_lrs - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - -class WarmupAnnealHoldPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - min_lr: Minimum lr to hold the learning rate after decay at. - constant_steps: Number of steps to keep lr constant at. 
- constant_ratio: Ratio of steps to keep lr constant. - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - constant_steps=None, - constant_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use constant_steps or constant_ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup steps - if self.warmup_steps > 0 and step <= self.warmup_steps: - return self._get_warmup_lr(step) - - # Constant steps after warmup and decay - if self.constant_steps > 0 and ( - self.warmup_steps + self.decay_steps) < step <= self.max_steps: - return self._get_constant_lr(step) - - # Min lr after max steps of updates - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_constant_lr(self, step): - return [self.min_lr for _ in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -def _squareroot_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 0.5 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _square_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 2 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _cosine_annealing(initial_lr, step, max_steps, min_lr): - mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) - out_lr = (initial_lr - min_lr) * mult + min_lr - return out_lr - - -def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, - decay_steps, min_lr): - assert max_lr > min_lr - # Use linear warmup for the initial part. - if warmup_steps > 0 and step <= warmup_steps: - return max_lr * float(step) / float(warmup_steps) - - # For any steps larger than `decay_steps`, use `min_lr`. - if step > warmup_steps + decay_steps: - return min_lr - - # If we are done with the warmup period, use the decay style. 
- num_steps_ = step - warmup_steps - decay_steps_ = decay_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = max_lr - min_lr - - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - - return min_lr + coeff * delta_lr - - -def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): - if cycle: - multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) - decay_steps *= multiplier - else: - step = min(step, decay_steps) - p = step / decay_steps - lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) - lr += min_lr - return lr - - -def _noam_hold_annealing(initial_lr, step, warmup_steps, - hold_steps, decay_rate, min_lr): - # hold_steps = total number of steps - # to hold the LR, not the warmup + hold steps. - T_warmup_decay = max(1, warmup_steps ** decay_rate) - T_hold_decay = max(1, (step - hold_steps) ** decay_rate) - lr = (initial_lr * T_warmup_decay) / T_hold_decay - lr = max(lr, min_lr) - return lr - - -class SquareAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _square_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class SquareRootAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _squareroot_annealing(initial_lr=initial_lr, step=step, - max_steps=self.max_steps, min_lr=self.min_lr) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - if self.constant_steps is None or self.constant_steps == 0: - new_lrs = [ - _cosine_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - else: - new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) - return new_lrs - - def _get_warmup_lr(self, step): - if self.constant_steps is None or self.constant_steps == 0: - return super()._get_warmup_lr(step) - else: - # Use linear warmup for the initial part. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_constant_lr(self, step): - # Only called when `constant_steps` > 0. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_linear_warmup_with_cosine_annealing_lr(self, step): - # Cosine Schedule for Megatron LM, - # slightly different warmup schedule + constant LR at the end. 
- new_lrs = [ - _linear_warmup_with_cosine_annealing( - max_lr=self.base_lrs[0], - warmup_steps=self.warmup_steps, - step=step, - decay_steps=self.decay_steps, - min_lr=self.min_lr, - ) - for _ in self.base_lrs - ] - return new_lrs - - -class NoamAnnealing(_LRScheduler): - def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - self._normalize = d_model ** (-0.5) - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = max(1, self.last_epoch) - - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - new_lrs = [self._noam_annealing(initial_lr=initial_lr, step=step) for - initial_lr in self.base_lrs] - return new_lrs - - def _noam_annealing(self, initial_lr, step): - if self.warmup_steps > 0: - mult = self._normalize * min(step ** (-0.5), - step * (self.warmup_steps ** (-1.5))) - else: - mult = self._normalize * step ** (-0.5) - - out_lr = initial_lr * mult - if step > self.warmup_steps: - out_lr = max(out_lr, self.min_lr) - return out_lr - - -class NoamHoldAnnealing(WarmupHoldPolicy): - def __init__(self, optimizer, *, max_steps, decay_rate=0.5, min_lr=0.0, - last_epoch=-1, **kwargs): - """ - From Nemo: - Implementation of the Noam Hold Annealing policy - from the SqueezeFormer paper. - - Unlike NoamAnnealing, the peak learning rate - can be explicitly set for this scheduler. - The schedule first performs linear warmup, - then holds the peak LR, then decays with some schedule for - the remainder of the steps. - Therefore the min-lr is still dependent - on the hyper parameters selected. - - It's schedule is determined by three factors- - - Warmup Steps: Initial stage, where linear warmup - occurs uptil the peak LR is reached. Unlike NoamAnnealing, - the peak LR is explicitly stated here instead of a scaling factor. - - Hold Steps: Intermediate stage, where the peak LR - is maintained for some number of steps. In this region, - the high peak LR allows the model to converge faster - if training is stable. However the high LR - may also cause instability during training. - Should usually be a significant fraction of training - steps (around 30-40% of the entire training steps). - - Decay Steps: Final stage, where the LR rapidly decays - with some scaling rate (set by decay rate). - To attain Noam decay, use 0.5, - for Squeezeformer recommended decay, use 1.0. - The fast decay after prolonged high LR during - hold phase allows for rapid convergence. 
- - References: - - [Squeezeformer: - An Efficient Transformer for Automatic Speech Recognition] - (https://arxiv.org/abs/2206.00888) - - Args: - optimizer: Pytorch compatible Optimizer object. - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - decay_rate: Float value describing the polynomial decay - after the hold period. Default value - of 0.5 corresponds to Noam decay. - min_lr: Minimum learning rate. - """ - self.decay_rate = decay_rate - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - if self.warmup_steps is None or self.warmup_steps == 0: - raise ValueError( - "Noam scheduler cannot be used without warmup steps") - - if self.hold_steps > 0: - hold_steps = self.hold_steps - self.warmup_steps - else: - hold_steps = 0 - - new_lrs = [ - _noam_hold_annealing( - initial_lr, - step=step, - warmup_steps=self.warmup_steps, - hold_steps=hold_steps, - decay_rate=self.decay_rate, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - def set_step(self, step: int): - self.last_epoch = step diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/README.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/README.md deleted file mode 100644 index 3d3fde55eb11dd503ac4e545ab535bf4670fe294..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/README.md +++ /dev/null @@ -1,12 +0,0 @@ -# Performance Record - -## Conformer Result - -* Feature info: using fbank feature, cmvn, without speed perturb (not supported segments yet) -* Training info: lr 0.001, max_frames_in_batch 15000, 8 gpu, acc_grad 4, 100 epochs -* Decoding info: ctc_weight 0.5, average_num 30 - - -| decoding mode | Test WER | -|---------------------|----------| -| attention rescoring | 32.58% | diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/conf/train_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/conf/train_conformer.yaml deleted file mode 100644 index dcd115b6308ba2a8073c9ad44213dfb7e5bde2fe..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/conf/train_conformer.yaml +++ /dev/null @@ -1,77 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 
0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -dataset_conf: - filter_conf: - max_length: 4096 - min_length: 10 - token_max_length: 200 - token_min_length: 1 - #resample_conf: - # resample_rate: 16000 - speed_perturb: false - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.0 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'dynamic' # static or dynamic - max_frames_in_batch: 15000 - -grad_clip: 5 -accum_grad: 4 -max_epoch: 100 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 1000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/aishell4_process_textgrid.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/aishell4_process_textgrid.py deleted file mode 100644 index c4fdc54347d27d27440494b4e8b62dcce122b0e3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/aishell4_process_textgrid.py +++ /dev/null @@ -1,109 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Process the textgrid files -""" -import argparse -import codecs -from pathlib import Path -import textgrid - - -class Segment(object): - def __init__(self, uttid, spkr, stime, etime, text): - self.uttid = uttid - self.spkr = spkr - self.stime = round(stime, 2) - self.etime = round(etime, 2) - self.text = text - - -def get_args(): - parser = argparse.ArgumentParser(description="process the textgrid files") - parser.add_argument("--path", type=str, required=True, help="Data path") - args = parser.parse_args() - return args - - -def main(args): - wav_scp = codecs.open(Path(args.path) / "wav.scp", "r", "utf-8") - textgrid_flist = codecs.open( - Path(args.path) / "textgrid.flist", "r", "utf-8") - # get the path of textgrid file for each utterance - utt2textgrid = {} - for line in textgrid_flist: - path = Path(line.strip()) - # the name of textgrid file is different between training and test set - if "train" in path.parts: - uttid = "%s_%s" % (path.parts[-2], path.stem) - else: - uttid = path.stem - utt2textgrid[uttid] = path - # parse the textgrid file for each utterance - all_segments = [] - for line in wav_scp: - uttid = line.strip().split(" ")[0] - if uttid not in utt2textgrid: - print("%s doesn't have transcription" % uttid) - continue - segments = [] - tg = textgrid.TextGrid.fromFile(utt2textgrid[uttid]) - for i in range(tg.__len__()): - for j in range(tg[i].__len__()): - if tg[i][j].mark.strip(): - segments.append( - Segment( - uttid, - tg[i].name, - tg[i][j].minTime, - tg[i][j].maxTime, - tg[i][j].mark.strip(), - )) - - segments = sorted(segments, key=lambda x: x.stime) - all_segments += segments - - wav_scp.close() - textgrid_flist.close() - - segments_file = codecs.open(Path(args.path) / "segments_all", "w", "utf-8") - utt2spk_file = codecs.open(Path(args.path) / "utt2spk_all", "w", "utf-8") - text_file = codecs.open(Path(args.path) / "text_all", "w", "utf-8") - utt2dur_file = codecs.open(Path(args.path) / "utt2dur_all", "w", "utf-8") - - for i in range(len(all_segments)): - utt_name = "%s-%s-%07d-%07d" % ( - all_segments[i].uttid, - all_segments[i].spkr, - all_segments[i].stime * 100, - all_segments[i].etime * 100, - ) - - 
segments_file.write("%s %s %.2f %.2f\n" % ( - utt_name, - all_segments[i].uttid, - all_segments[i].stime, - all_segments[i].etime, - )) - utt2spk_file.write( - "%s %s-%s\n" % - (utt_name, all_segments[i].uttid, all_segments[i].spkr)) - text_file.write("%s %s\n" % (utt_name, all_segments[i].text)) - utt2dur_file.write( - "%s %.2f\n" % - (utt_name, all_segments[i].etime - all_segments[i].stime)) - if len(all_segments[i].text) / (all_segments[i].etime - - all_segments[i].stime) > 100: - print(utt_name) - print( - len(all_segments[i].text) / - (all_segments[i].etime - all_segments[i].stime)) - - segments_file.close() - utt2spk_file.close() - text_file.close() - utt2dur_file.close() - - -if __name__ == "__main__": - args = get_args() - main(args) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/apply_map.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/apply_map.pl deleted file mode 100644 index 725d3463a0098f58210a4b71d5c49f3be38fe23b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/apply_map.pl +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0. - -# This program is a bit like ./sym2int.pl in that it applies a map -# to things in a file, but it's a bit more general in that it doesn't -# assume the things being mapped to are single tokens, they could -# be sequences of tokens. See the usage message. - - -$permissive = 0; - -for ($x = 0; $x <= 2; $x++) { - - if (@ARGV > 0 && $ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } - } - - if (@ARGV > 0 && $ARGV[0] eq '--permissive') { - shift @ARGV; - # Mapping is optional (missing key is printed to output) - $permissive = 1; - } -} - -if(@ARGV != 1) { - print STDERR "Invalid usage: " . join(" ", @ARGV) . "\n"; - print STDERR <<'EOF'; -Usage: apply_map.pl [options] map <input >output - options: [-f <field-range>] [--permissive] - This applies a map to some specified fields of some input text: - For each line in the map file: the first field is the thing we - map from, and the remaining fields are the sequence we map it to. - The -f (field-range) option says which fields of the input file the map - map should apply to. - If the --permissive option is supplied, fields which are not present - in the map will be left as they were. - Applies the map 'map' to all input text, where each line of the map - is interpreted as a map from the first field to the list of the other fields - Note: <field-range> can look like 4-5, or 4-, or 5-, or 1, it means the field - range in the input to apply the map to. 
- e.g.: echo A B | apply_map.pl a.txt - where a.txt is: - A a1 a2 - B b - will produce: - a1 a2 b -EOF - exit(1); -} - -($map_file) = @ARGV; -open(M, "<$map_file") || die "Error opening map file $map_file: $!"; - -while (<M>) { - @A = split(" ", $_); - @A >= 1 || die "apply_map.pl: empty line."; - $i = shift @A; - $o = join(" ", @A); - $map{$i} = $o; -} - -while(<STDIN>) { - @A = split(" ", $_); - for ($x = 0; $x < @A; $x++) { - if ( (!defined $field_begin || $x >= $field_begin) - && (!defined $field_end || $x <= $field_end)) { - $a = $A[$x]; - if (!defined $map{$a}) { - if (!$permissive) { - die "apply_map.pl: undefined key $a in $map_file\n"; - } else { - print STDERR "apply_map.pl: warning! missing key $a in $map_file\n"; - } - } else { - $A[$x] = $map{$a}; - } - } - } - print join(" ", @A) . "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/copy_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/copy_data_dir.sh deleted file mode 100644 index 6feb77fdf29beabc086f572f0c5d68ffde9581fd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/copy_data_dir.sh +++ /dev/null @@ -1,151 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# feats.scp -# wav.scp -# vad.scp -# spk2utt -# utt2spk -# text -# -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance and/or speaker names. Note, the recording-ids stay the same. -# - - -# begin configuration section -spk_prefix= -utt_prefix= -spk_suffix= -utt_suffix= -validate_opts= # should rarely be needed. -# end configuration section - -. local/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] <srcdir> <destdir>" - echo "e.g.:" - echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" - echo "Options" - echo " --spk-prefix=<prefix> # Prefix for speaker ids, default empty" - echo " --utt-prefix=<prefix> # Prefix for utterance ids, default empty" - echo " --spk-suffix=<suffix> # Suffix for speaker ids, default empty" - echo " --utt-suffix=<suffix> # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -srcdir=$1 -destdir=$2 - -if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" - exit 1; -fi - -if [ "$destdir" == "$srcdir" ]; then - echo "$0: this script requires <srcdir> and <destdir> to be different." - exit 1 -fi - -set -e; - -mkdir -p $destdir - -cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map -cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map - -if [ ! -f $srcdir/utt2uniq ]; then - if [[ ! -z $utt_prefix || ! 
-z $utt_suffix ]]; then - cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq - fi -else - cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq -fi - -cat $srcdir/utt2spk | local/apply_map.pl -f 1 $destdir/utt_map | \ - local/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk - -local/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt - -if [ -f $srcdir/feats.scp ]; then - local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp -fi - -if [ -f $srcdir/vad.scp ]; then - local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp -fi - -if [ -f $srcdir/segments ]; then - local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments - cp $srcdir/wav.scp $destdir -else # no segments->wav indexed by utt. - if [ -f $srcdir/wav.scp ]; then - local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/text.tc ]; then - local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text.tc >$destdir/text.tc -fi -if [ -f $srcdir/text.lc ]; then - local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text.lc >$destdir/text.lc -fi -if [ -f $srcdir/text.lc.rm ]; then - local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text.lc.rm >$destdir/text.lc.rm -fi -if [ -f $srcdir/utt2dur ]; then - local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - local/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - local/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - local/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! 
-f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -local/validate_data_dir.sh $validate_opts $destdir diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/download_and_untar.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/download_and_untar.sh deleted file mode 100644 index bd3ceae5679fd97693d4053d1fc01fef4ad64cda..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/download_and_untar.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/bin/bash - -if [ $# -ne 3 ]; then - echo "Usage: $0 " - echo "e.g.: $0 /home/data/aishell4 https://www.openslr.org/resources/111 train_L" - echo " can be one of: train_L, train_M, train_S, test." -fi - -data=$1 -url=$2 -part=$3 - -if [ ! -d "$data" ]; then - echo "$0: no such directory $data" - exit 1; -fi - -part_ok=false -list="train_L train_M train_S test" -for x in $list; do - if [ "$part" == $x ]; then part_ok=true; fi -done -if ! $part_ok; then - echo "$0: expected to be one of $list, but got '$part'" - exit 1; -fi - -if [ -z "$url" ]; then - echo "$0: empty URL base." - exit 1; -fi - -if [ -f $data/$part/.complete ]; then - echo "$0: data part $part was already successfully extracted, nothing to do." - exit 0; -fi - -if [ -f $data/$part.tar.gz ]; then - echo "$0: removing existing file $data/$part.tar.gz" - rm $data/$part.tar.gz -fi - -if [ ! -f $data/$part.tar.gz ]; then - if ! which wget >/dev/null; then - echo "$0: wget is not installed." - exit 1; - fi - full_url=$url/$part.tar.gz - echo "$0: downloading data from $full_url. This may take some time, please be patient." - - cd $data - if ! wget --no-check-certificate $full_url; then - echo "$0: error executing wget $full_url" - exit 1; - fi -fi - -cd $data - -if ! tar -xvzf $part.tar.gz; then - echo "$0: error un-tarring archive $data/$part.tgz" - exit 1; -fi - -touch $data/$part/.complete - -echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/filter_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/filter_scp.pl deleted file mode 100644 index b76d37f41be0886470281978bfacf97f6b8ae976..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/filter_scp.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose "n-th" field is an utterance id), printing -# out only those lines whose "n-th" field is in id_list. 
The index of -# the "n-th" field is 1, by default, but can be changed by using -# the -f switch - -$exclude = 0; -$field = 1; -$shifted = 0; - -do { - $shifted=0; - if ($ARGV[0] eq "--exclude") { - $exclude = 1; - shift @ARGV; - $shifted=1; - } - if ($ARGV[0] eq "-f") { - $field = $ARGV[1]; - shift @ARGV; shift @ARGV; - $shifted=1 - } -} while ($shifted); - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . - "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . - "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . - "only the lines that were *not* in id_list.\n" . - "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . - "If your older scripts (written before Oct 2014) stopped working and you used the\n" . - "-f option, add 1 to the argument.\n" . - "See also: utils/filter_scp.pl .\n"; -} - - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -if ($field == 1) { # Treat this as special case, since it is common. - while(<>) { - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { - print $_; - } - } -} else { - while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { - print $_; - } - } -} - -# tests: -# the following should print "foo 1" -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) -# the following should print "bar 2". -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/prepare_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/prepare_data.sh deleted file mode 100644 index bfa73d46fea1a0b71d3f8734178a23249480b7fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/prepare_data.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash - -. ./path.sh || exit 1; - -if [ $# != 1 ]; then - echo "Usage: $0 " - echo " $0 /home/data/aishell4" - exit 1; -fi - -aishell4_source_dir=$1 -train_dir=data/local/aishell4_train -test_dir=data/local/aishell4_test - -mkdir -p $train_dir -mkdir -p $test_dir - -# data directory check -if [ ! -d $aishell_audio_dir ] || [ ! 
-f $aishell_text ]; then - echo "Error: $0 requires two directory arguments" - exit 1; -fi - -for room_name in "train_L" "train_M" "train_S" "test"; do - if [ -f ${aishell4_source_dir}/$room_name/wav_list.txt ];then - rm ${aishell4_source_dir}/$room_name/wav_list.txt - fi - FILES="$PWD/${aishell4_source_dir}/$room_name/wav/*" - for f in $FILES; do - echo "$f" >> ${aishell4_source_dir}/$room_name/wav_list.txt - done - if [ -f ${aishell4_source_dir}/$room_name/TextGrid_list.txt ];then - rm ${aishell4_source_dir}/$room_name/TextGrid_list.txt - fi - FILES="$PWD/${aishell4_source_dir}/$room_name/TextGrid/*.TextGrid" - for f in $FILES; do - echo "$f" >> ${aishell4_source_dir}/$room_name/TextGrid_list.txt - done -done - -mkdir -p ${aishell4_source_dir}/full_train -for r in train_L train_M train_S ; do - cat ${aishell4_source_dir}/$r/TextGrid_list.txt >> ${aishell4_source_dir}/full_train/textgrid.flist - cat ${aishell4_source_dir}/$r/wav_list.txt >> ${aishell4_source_dir}/full_train/wav.flist -done - -wav_list_aishell4=${aishell4_source_dir}/full_train/wav.flist -text_grid_aishell4=${aishell4_source_dir}/full_train/textgrid.flist - -# process train set -sed -e 's/\.wav//' $train_dir/wav.flist | awk -F '/' '{print $NF}' > $train_dir/utt.list -paste -d' ' $train_dir/utt.list $train_dir/wav.flist | sort -u > $train_dir/wav.scp -python local/aishell4_process_textgrid.py --path $train_dir -cat $train_dir/text_all | local/text_normalize.pl | local/text_format.pl | sort -u > $train_dir/text -local/filter_scp.pl -f 1 $train_dir/text $train_dir/utt2spk_all | sort -u > $train_dir/utt2spk -local/utt2spk_to_spk2utt.pl $train_dir/utt2spk > $train_dir/spk2utt -local/filter_scp.pl -f 1 $train_dir/text $train_dir/segments_all | sort -u > $train_dir/segments - -# process test set -sed -e 's/\.wav//' $test_dir/wav.flist | awk -F '/' '{print $NF}' > $test_dir/utt.list -paste -d' ' $test_dir/utt.list $test_dir/wav.flist |sort -u > $test_dir/wav.scp -python local/aishell4_process_textgrid.py --path $test_dir -cat $test_dir/text_all | local/text_normalize.pl | local/text_format.pl | sort -u > $test_dir/text -local/filter_scp.pl -f 1 $test_dir/text $test_dir/utt2spk_all | sort -u > $test_dir/utt2spk -local/utt2spk_to_spk2utt.pl $test_dir/utt2spk > $test_dir/spk2utt -local/filter_scp.pl -f 1 $test_dir/text $test_dir/segments_all | sort -u > $test_dir/segments - -local/copy_data_dir.sh --utt-prefix Aishell4- --spk-prefix Aishell4- \ - $train_dir data/aishell4_train -local/copy_data_dir.sh --utt-prefix Aishell4- --spk-prefix Aishell4- \ - $test_dir data/aishell4_test - -echo "$0: AISHELL4 data preparation succeeded" -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/spk2utt_to_utt2spk.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/spk2utt_to_utt2spk.pl deleted file mode 100644 index 19fb89d501146e360912863d847d6eabb0194511..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/text_format.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/text_format.pl deleted file mode 100644 index c1ff896d017fe5d99c69ff8161a7e7070c3442a6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/text_format.pl +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright Chao Weng - -# normalizations for hkust trascript -# see the docs/trans-guidelines.pdf for details - -while () { - @A = split(" ", $_); - if (@A == 1) { - next; - } - print $_ -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/text_normalize.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/text_normalize.pl deleted file mode 100644 index 046903d02dd9f2ccd51215250c5b7797e207b61a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/text_normalize.pl +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright Chao Weng - -# normalizations for hkust trascript -# see the docs/trans-guidelines.pdf for details - -while () { - @A = split(" ", $_); - print "$A[0] "; - for ($n = 1; $n < @A; $n++) { - $tmp = $A[$n]; - if ($tmp =~ //) {$tmp =~ s:::g;} - if ($tmp =~ /<%>/) {$tmp =~ s:<%>::g;} - if ($tmp =~ /<->/) {$tmp =~ s:<->::g;} - if ($tmp =~ /<\$>/) {$tmp =~ s:<\$>::g;} - if ($tmp =~ /<#>/) {$tmp =~ s:<#>::g;} - if ($tmp =~ /<_>/) {$tmp =~ s:<_>::g;} - if ($tmp =~ //) {$tmp =~ s:::g;} - if ($tmp =~ /`/) {$tmp =~ s:`::g;} - if ($tmp =~ /&/) {$tmp =~ s:&::g;} - if ($tmp =~ /,/) {$tmp =~ s:,::g;} - if ($tmp =~ /[a-zA-Z]/) {$tmp=uc($tmp);} - if ($tmp =~ /A/) {$tmp =~ s:A:A:g;} - if ($tmp =~ /a/) {$tmp =~ s:a:A:g;} - if ($tmp =~ /b/) {$tmp =~ s:b:B:g;} - if ($tmp =~ /c/) {$tmp =~ s:c:C:g;} - if ($tmp =~ /k/) {$tmp =~ s:k:K:g;} - if ($tmp =~ /t/) {$tmp =~ s:t:T:g;} - if ($tmp =~ /,/) {$tmp =~ s:,::g;} - if ($tmp =~ /丶/) {$tmp =~ s:丶::g;} - if ($tmp =~ /。/) {$tmp =~ s:。::g;} - if ($tmp =~ /、/) {$tmp =~ s:、::g;} - if ($tmp =~ /?/) {$tmp =~ s:?::g;} - if ($tmp =~ /·/) {$tmp =~ s:·::g;} - if ($tmp =~ /\*/) {$tmp =~ s:\*::g;} - if ($tmp =~ /!/) {$tmp =~ s:!::g;} - if ($tmp =~ /\$/) {$tmp =~ s:\$::g;} - if ($tmp =~ /\+/) {$tmp =~ s:\+::g;} - if ($tmp =~ /-/) {$tmp =~ s:-::g;} - if ($tmp =~ /\\/) {$tmp =~ s:\\::g;} - if ($tmp =~ /\?/) {$tmp =~ s:\?::g;} - if ($tmp =~ /¥/) {$tmp =~ s:¥::g;} - if ($tmp =~ /%/) {$tmp =~ s:%::g;} - if ($tmp =~ /\./) {$tmp =~ s:\.::g;} - if ($tmp =~ / 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) 
= @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/validate_data_dir.sh deleted file mode 100644 index 22a01fcdab1088bf42fdcbf9c3de2029a5a66d4f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/validate_data_dir.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/usr/bin/env bash - -cmd="$@" - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." - echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! -d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -if [ -f $data/images.scp ]; then - cmd=${cmd/--no-wav/} # remove --no-wav if supplied - image/validate_data_dir.sh $cmd - exit $? -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1; - ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(local/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! 
-f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - local/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." - exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! 
cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! 
cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! 
cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/validate_text.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/validate_text.pl deleted file mode 100644 index 7f75cf12f20f6e22948682e8e726e628a72dac69..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/local/validate_text.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env perl -# -#=============================================================================== -# Copyright 2017 Johns Hopkins University (author: Yenda Trmal ) -# Johns Hopkins University (author: Daniel Povey) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# validation script for data//text -# to be called (preferably) from utils/validate_data_dir.sh -use strict; -use warnings; -use utf8; -use Fcntl qw< SEEK_SET >; - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). 
-sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl \n" . - "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/path.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/path.sh deleted file mode 100644 index 73fc1c56602086182f66201870e28d46a0cada55..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export WENET_DIR=$PWD/../../.. 
-export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build -export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix -export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/run.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/run.sh deleted file mode 100644 index dc9aef9ab77ef32c9b13d6bfd13818c795d2def5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/run.sh +++ /dev/null @@ -1,199 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -# Use this to control how many gpu you use, It's 1-gpu training if you specify -# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO -stage=0 # start from 0 if you need to start from data preparation -stop_stage=6 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training -num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 -num_utts_per_shard=1000 -data_url=https://www.openslr.org/resources/111 -data_source=/home/work_nfs5_ssd/yhliang/data/aishell4 -# modify this to your AISHELL-4 data path - -nj=16 -dict=data/dict/lang_char.txt - -train_set=aishell4_train -dev_set=aishell4_test -test_sets=aishell4_test - -train_config=conf/train_conformer.yaml -cmvn=true -dir=exp/conformer -checkpoint= - -# use average_checkpoint will get better result -average_checkpoint=true -decode_checkpoint=$dir/final.pt -average_num=30 -decode_modes="attention_rescoring" - -. 
tools/parse_options.sh || exit 1; -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - echo "stage -1: Data Download" - local/download_and_untar.sh ${data_source} ${data_url} train_L - local/download_and_untar.sh ${data_source} ${data_url} train_M - local/download_and_untar.sh ${data_source} ${data_url} train_S - local/download_and_untar.sh ${data_source} ${data_url} test -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Data preparation - local/prepare_data.sh ${data_source} || exit 1; -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # remove the space between the text labels for Mandarin dataset - for x in ${train_set} ${test_sets}; do - cp data/${x}/text data/${x}/text.org - paste -d " " <(cut -d " " -f 1 data/${x}/text.org) <(cut -d " " -f 2 data/${x}/text.org \ - | tr 'a-z' 'A-Z' | sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' | tr -d " ") > data/${x}/text - rm data/${x}/text.org - done - - tools/compute_cmvn_stats.py --num_workers 32 --train_config $train_config \ - --in_scp data/${train_set}/wav.scp \ - --out_cmvn data/$train_set/global_cmvn - -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # Make train dict - echo "Make a dictionary" - mkdir -p $(dirname $dict) - echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC - echo "<unk> 1" >> ${dict} # <unk> must be 1 - tools/text2token.py -s 1 -n 1 data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \ - | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict} - num_token=$(cat $dict | wc -l) - echo "<sos/eos> $num_token" >> $dict # -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # Prepare wenet required data - echo "Prepare data, prepare required format" - for x in $train_set ${test_sets}; do - tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \ - --num_threads 32 --segments data/$x/segments \ - data/$x/wav.scp data/$x/text $(realpath data/$x/shards) data/$x/data.list - done -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - # Training - mkdir -p $dir - INIT_FILE=$dir/ddp_init - # You had better rm it manually before you start run.sh on first node. - # rm -f $INIT_FILE # delete old one before starting - init_method=file://$(readlink -f $INIT_FILE) - echo "$0: init method is $init_method" - # The number of gpus runing on each node/machine - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - # Use "nccl" if it works, otherwise use "gloo" - dist_backend="gloo" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" - cmvn_opts= - $cmvn && cp data/${train_set}/global_cmvn $dir - $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" - # train.py will write $train_config to $dir/train.yaml with model input - # and output dimension, train.yaml will be used for inference or model - # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type shard \ - --symbol_table $dict \ - --train_data data/$train_set/data.list \ - --cv_data data/${dev_set}/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ - --ddp.dist_backend $dist_backend \ - --num_workers 1 \ - $cmvn_opts - } - done - wait -fi - -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # Test model, please specify the model you want to test by --checkpoint - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$dir/avg_${average_num}.pt - echo "do model average and final checkpoint is $decode_checkpoint" - python wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $dir \ - --num ${average_num} - fi - # Specify decoding_chunk_size if it's a unified dynamic chunk trained model - # -1 for full chunk - decoding_chunk_size= - ctc_weight=0.5 - for mode in ${decode_modes}; do - { - for test_set in ${test_sets}; do - { - test_dir=$dir/test_${mode} - mkdir -p $test_dir - python wenet/bin/recognize.py --gpu $(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f1) \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type shard \ - --test_data data/${test_set}/data.list \ - --checkpoint $decode_checkpoint \ - --beam_size 10 \ - --batch_size 1 \ - --penalty 0.0 \ - --dict $dict \ - --ctc_weight $ctc_weight \ - --result_file $test_dir/text \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - python tools/compute-wer.py --char=1 --v=1 \ - data/${test_set}/text $test_dir/text > $test_dir/wer - } & - done - } - done - wait - -fi - -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # Export the best model you want - python wenet/bin/export_jit.py \ - --config $dir/train.yaml \ - --checkpoint $dir/avg_${average_num}.pt \ - --output_file $dir/final.zip \ - --output_quant_file $dir/final_quant.zip -fi - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/alignment.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/alignment.sh deleted file mode 100644 index 64d860bb61761cadca750c9baf91eddb49e56728..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/alignment.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -stage=0 # start from 0 if you need to start from data preparation -stop_stage=0 - -nj=16 -feat_dir=raw_wav -dict=data/dict/lang_char.txt - -dir=exp/ -config=$dir/train.yaml -checkpoint= -checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt -config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml -set= -ali_format=$feat_dir/$set/format.data -ali_format=format.data -ali_result=$dir/ali - -. 
tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - nj=32 - # Prepare required data for ctc alignment - echo "Prepare data, prepare required format" - for x in $set; do - tools/format_data.sh --nj ${nj} \ - --feat-type wav --feat $feat_dir/$x/wav.scp \ - $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp - - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Test model, please specify the model you want to use by --checkpoint - python wenet/bin/alignment_deprecated.py --gpu -1 \ - --config $config \ - --input_file $ali_format \ - --checkpoint $checkpoint \ - --batch_size 1 \ - --dict $dict \ - --result_file $ali_result \ - -fi - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/analyze_dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/analyze_dataset.py deleted file mode 100644 index d4373b065c301972fe0164b6df3591166000acfc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/analyze_dataset.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Analyze Dataset, Duration/TextLength/Speed etc. - -Usage: -. ./path.sh && python3 tools/analyze_dataset.py \ - --data_type "shard" \ - --data_list data/test/data.list \ - --output_dir exp/analyze_test \ - --num_thread 32 -""" - -import os -import json -import math -import time -import numpy -import logging -import librosa -import tarfile -import argparse -import torchaudio -import multiprocessing - -from wenet.utils.file_utils import read_lists -from wenet.dataset.processor import AUDIO_FORMAT_SETS - - -def get_args(): - parser = argparse.ArgumentParser(description='Analyze dataset') - parser.add_argument('--data_type', - default='wav_scp', - choices=['wav_scp', 'raw', 'shard'], - help='dataset type') - parser.add_argument('--output_dir', type=str, - default="exp", help='write info to output dir') - parser.add_argument('--data_list', default=None, - help='used in raw/shard mode') - parser.add_argument('--wav_scp', default=None, - help='used in wav_scp mode') - parser.add_argument('--text', default=None, - help='used in wav_scp mode') - parser.add_argument('--num_thread', type=int, - default=4, help='number of threads') - args = parser.parse_args() - print(args) - return args - - -def analyze(datas, output_file, thread_id): - with open(output_file, "w", encoding='utf8') as f: - for i, data in enumerate(datas): - if type(data['wav']) is numpy.ndarray: - y, sample_rate = data['wav'], data['sample_rate'] - data['wav'] = "None" # NOTE(xcsong): Do not save wav. 
- elif type(data['wav'] is str): - y, sample_rate = librosa.load(data['wav'], sr=16000) - data['dur'] = len(y) / sample_rate - data['txt_length'] = len(data['txt']) - data['speed'] = data['txt_length'] / data['dur'] - # Trim the beginning and ending silence - _, index = librosa.effects.trim(y, top_db=30) - data['leading_sil'] = librosa.get_duration( - y=y[:index[0]], sr=16000) * 1000 if index[0] > 0 else 0 - data['trailing_sil'] = librosa.get_duration( - y=y[index[1]:], sr=16000) * 1000 if index[1] < len(y) else 0 - data_str = json.dumps(data, ensure_ascii=False) - f.write("{}\n".format(data_str)) - if thread_id == 0 and i % 100 == 0: - logging.info("\tThread-{}: processed {}/{}".format( - thread_id, i, len(datas))) - - -def read_tar(file): - try: - with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream: - prev_prefix = None - data = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - data['key'] = prev_prefix - if valid: - yield data - data = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - data['txt'] = file_obj.read().decode( - 'utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load( - file_obj) - # single channel - data['wav'] = waveform.numpy()[0, :] - data['sample_rate'] = sample_rate - else: - data[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning( - 'error: {} when parse {}'.format(ex, name)) - prev_prefix = prefix - # The last data in tar - if prev_prefix is not None: - data['key'] = prev_prefix - yield data - except Exception as ex: - logging.warning( - 'tar_file error: {} when processing {}'.format(ex, file)) - - -def main(): - start_time = time.time() - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.makedirs(args.output_dir, exist_ok=True) - os.makedirs(args.output_dir + "/partition", exist_ok=True) - datas = [[] for i in range(args.num_thread)] - - logging.info("Stage-1: Loading data.list OR wav.scp...") - if args.data_type == "shard": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - total = 0 - for line in lists: - for data in read_tar(line): - datas[total % args.num_thread].append(data) - total = total + 1 - elif args.data_type == "raw": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - for i, line in enumerate(lists): - data = json.loads(line) - datas[i % args.num_thread].append(data) - elif args.data_type == "wav_scp": - assert args.wav_scp is not None - assert args.text is not None - wavs, texts = {}, {} - # wavs - for line in read_lists(args.wav_scp): - line = line.strip().split() - wavs[line[0]] = line[1] - # texts - for line in read_lists(args.text): - line = line.strip().split(maxsplit=1) - texts[line[0]] = line[1] - sorted(wavs) - sorted(texts) - # partition - for i, (key1, key2) in enumerate(zip(wavs, texts)): - assert key1 == key2 - datas[i % args.num_thread].append( - {'key': key1, "wav": wavs[key1], "txt": texts[key1]} - ) - - logging.info("Stage-2: Start Analyze") - # threads - pool = multiprocessing.Pool(processes=args.num_thread) - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - pool.apply_async(analyze, (datas[i], output_file, i)) - pool.close() - 
pool.join() - - logging.info("Stage-3: Sort and Write Result") - datas = [] - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - with open(output_file, "r", encoding='utf8') as f: - for line in f.readlines(): - data = json.loads(line) - datas.append(data) - total_dur = sum([x['dur'] for x in datas]) - total_len = sum([x['txt_length'] for x in datas]) - total_leading_sil = sum([x['leading_sil'] for x in datas]) - total_trailing_sil = sum([x['trailing_sil'] for x in datas]) - num_datas = len(datas) - names = ['key', 'dur', 'txt_length', 'speed', - 'leading_sil', 'trailing_sil'] - units = ['', 's', '', 'char/s', 'ms', 'ms'] - avgs = [0, total_dur / num_datas, total_len / num_datas, - total_len / total_dur, total_leading_sil / num_datas, - total_trailing_sil / num_datas] - stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]), - sum([(x['txt_length'] - avgs[2])**2 for x in datas]), - sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]), - sum([(x['leading_sil'] - avgs[4])**2 for x in datas]), - sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])] - stds = [math.sqrt(x / num_datas) for x in stds] - parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min'] - index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - - with open(args.output_dir + "/analyze_result_brief", - "w", encoding='utf8') as f: - for i, (name, unit, avg, std) in enumerate( - zip(names, units, avgs, stds)): - if name == 'key': - continue - f.write("==================\n") - - datas.sort(key=lambda x: x[name]) - for p, j in zip(parts, index): - f.write("{} {}: {:.3f} {} (wav_id: {})\n".format( - p, name, datas[j][name], unit, datas[j]['key'])) - f.write("avg {}: {:.3f} {}\n".format( - name, avg, unit)) - f.write("std {}: {:.3f}\n".format( - name, std)) - os.system("cat {}".format(args.output_dir + "/analyze_result_brief")) - - datas.sort(key=lambda x: x['dur']) - with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f: - for data in datas: - f.write("{}\n".format(json.dumps(data, ensure_ascii=False))) - - end_time = time.time() - logging.info("Time Cost: {:.3f}s".format(end_time - start_time)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/cmvn_kaldi2json.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/cmvn_kaldi2json.py deleted file mode 100644 index 9966046c95a9d50438c4857b785cb7985182e376..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/cmvn_kaldi2json.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import logging -import sys -import json - -def kaldi2json(kaldi_cmvn_file): - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - cmvn_info = 
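Stage-3 above sorts the collected records by each field and reports max/P99/P75/P50/P25/min together with mean and standard deviation. A condensed sketch of that summary for a single field, with made-up durations:

```python
# Percentile/mean/std summary in the style of analyze_result_brief,
# shown for one field ("dur") with hypothetical values in seconds.
import math

durs = sorted([1.9, 2.4, 3.1, 3.3, 4.0, 4.8, 5.5, 6.2, 7.0, 9.6])
n = len(durs)
mean = sum(durs) / n
std = math.sqrt(sum((d - mean) ** 2 for d in durs) / n)
for name, idx in [("max", n - 1), ("P99", int(n * 0.99)), ("P75", int(n * 0.75)),
                  ("P50", int(n * 0.50)), ("P25", int(n * 0.25)), ("min", 0)]:
    print(f"{name} dur: {durs[idx]:.3f} s")
print(f"avg dur: {mean:.3f} s  std: {std:.3f}")
```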
{'mean_stat:' : means, - 'var_stat' : variance, - 'frame_num' : count} - return cmvn_info - -if __name__ == '__main__': - with open(sys.argv[2], 'w') as fout: - cmvn = kaldi2json(sys.argv[1]) - fout.write(json.dumps(cmvn)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/combine_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/combine_data.sh deleted file mode 100644 index 8a56c43f1a2a238d78270f94f3d22f1af540e912..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/combine_data.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2014 David Snyder - -# This script combines the data from multiple source directories into -# a single destination directory. - -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information -# about what these directories contain. - -# Begin configuration section. -extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." -skip_fix=false # skip the fix_data_dir.sh in the end -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi - -if [ $# -lt 2 ]; then - echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." - echo "Note, files that don't appear in all source dirs will not be combined," - echo "with the exception of utt2uniq and segments, which are created where necessary." - exit 1 -fi - -dest=$1; -shift; - -first_src=$1; - -rm -r $dest 2>/dev/null -mkdir -p $dest; - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/utt2spk ]; then - echo "$0: no such file $dir/utt2spk" - exit 1; - fi -done - -# Check that frame_shift are compatible, where present together with features. -dir_with_frame_shift= -for dir in $*; do - if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then - if [[ $dir_with_frame_shift ]] && - ! cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then - echo "$0:error: different frame_shift in directories $dir and " \ - "$dir_with_frame_shift. Cannot combine features." - exit 1; - fi - dir_with_frame_shift=$dir - fi -done - -# W.r.t. utt2uniq file the script has different behavior compared to other files -# it is not compulsary for it to exist in src directories, but if it exists in -# even one it should exist in all. We will create the files where necessary -has_utt2uniq=false -for in_dir in $*; do - if [ -f $in_dir/utt2uniq ]; then - has_utt2uniq=true - break - fi -done - -if $has_utt2uniq; then - # we are going to create an utt2uniq file in the destdir - for in_dir in $*; do - if [ ! -f $in_dir/utt2uniq ]; then - # we assume that utt2uniq is a one to one mapping - cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' - else - cat $in_dir/utt2uniq - fi - done | sort -k1 > $dest/utt2uniq - echo "$0: combined utt2uniq" -else - echo "$0 [info]: not combining utt2uniq as it does not exist" -fi -# some of the old scripts might provide utt2uniq as an extrafile, so just remove it -extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") - -# segments are treated similarly to utt2uniq. If it exists in some, but not all -# src directories, then we generate segments where necessary. 
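The converter above emits accumulated statistics (per-dimension sums, sums of squares, and a frame count). One common way such stats are consumed downstream is to turn them into a mean/inverse-std normalization; a sketch with made-up two-dimensional stats, not a claim about the exact wenet code path:

```python
# Turning accumulated CMVN stats into per-dimension mean and inverse std,
# then normalizing a (frames, dims) feature matrix. Values are illustrative.
import numpy as np

cmvn = {"mean_stat": [100.0, 200.0], "var_stat": [1100.0, 4400.0], "frame_num": 10.0}

mean = np.array(cmvn["mean_stat"]) / cmvn["frame_num"]           # [10, 20]
var = np.array(cmvn["var_stat"]) / cmvn["frame_num"] - mean**2   # [10, 40]
istd = 1.0 / np.sqrt(np.maximum(var, 1e-20))                     # variance floor

feats = np.random.randn(5, 2) * np.sqrt(var) + mean              # fake fbank frames
normalized = (feats - mean) * istd
print(normalized.shape)
```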
-has_segments=false -for in_dir in $*; do - if [ -f $in_dir/segments ]; then - has_segments=true - break - fi -done - -if $has_segments; then - for in_dir in $*; do - if [ ! -f $in_dir/segments ]; then - echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 - utils/data/get_segments_for_data.sh $in_dir - else - cat $in_dir/segments - fi - done | sort -k1 > $dest/segments - echo "$0: combined segments" -else - echo "$0 [info]: not combining segments as it does not exist" -fi - -for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do - exists_somewhere=false - absent_somewhere=false - for d in $*; do - if [ -f $d/$file ]; then - exists_somewhere=true - else - absent_somewhere=true - fi - done - - if ! $absent_somewhere; then - set -o pipefail - ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; - set +o pipefail - echo "$0: combined $file" - else - if ! $exists_somewhere; then - echo "$0 [info]: not combining $file as it does not exist" - else - echo "$0 [info]: **not combining $file as it does not exist everywhere**" - fi - fi -done - -tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt - -if [[ $dir_with_frame_shift ]]; then - cp $dir_with_frame_shift/frame_shift $dest -fi - -if ! $skip_fix ; then - tools/fix_data_dir.sh $dest || exit 1; -fi - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/compute-cer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/compute-cer.py deleted file mode 100644 index a0a8f8fe1f59251c5d8fefeb62ef469276fc6063..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/compute-cer.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import sys -import unicodedata -import codecs - -remove_tag = True -spacelist = [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - # https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
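combine_data.sh above regenerates `spk2utt` from the merged `utt2spk` via `tools/utt2spk_to_spk2utt.pl`. A minimal Python sketch of that inversion (utterance and speaker ids are hypothetical):

```python
# utt2spk -> spk2utt: group utterance ids under their speaker id.
from collections import defaultdict

def utt2spk_to_spk2utt(utt2spk_lines):
    spk2utt = defaultdict(list)
    for line in utt2spk_lines:
        parts = line.split()
        if len(parts) == 2:
            utt, spk = parts
            spk2utt[spk].append(utt)
    return dict(spk2utt)

print(utt2spk_to_spk2utt(["utt001 spkA", "utt002 spkA", "utt003 spkB"]))
# {'spkA': ['utt001', 'utt002'], 'spkB': ['utt003']}
```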
- sep = ' ' - if char == '<': - sep = '>' - j = i + 1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c == sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: - return '' - chars = [] - i = 0 - T = len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - if x.isalnum(): - for k in x: - new_sentence.append(k) - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i - 1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j - 1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, - 'ins': 0, 'del': 0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = 
result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i={i} , j={j} , \ - error={error}'. - format(i=i, j=j, error=self.space[i][j]['error'])) - return result - - def overall(self) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def cluster(self, data) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [unicodedata.name(char) for char in word] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names) - 1) : - if unicode_names[i] != unicode_names[i + 1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) \ - and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] \ - [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \ - [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose = 1 - padding_symbol = ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose = 0 - try: - verbose = int(b) - except Exception: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol = ' ' - elif b == 'underline': - padding_symbol = '_' - continue - if True or sys.argv[1].startswith('-'): - # ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig = set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, - case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : 
- if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array) == 0: - continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length - len_lab) - space['rec'].append(length - len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end=' ') - else: - print('lab:', end=' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['lab'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end=' ') - else: - print('rec:', end=' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['rec'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===================================================' - '========================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster(k for k in default_clusters[cluster_id]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 
100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif (token[0] == '<' and token[len(token) - 1] == '>' and - cluster_id == ''): - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('=======================================' - '====================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/compute-wer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/compute-wer.py deleted file mode 100644 index a3eefc0dc7b67f252e685da71a5189312e74ef85..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/compute-wer.py +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import re, sys, unicodedata -import codecs - -remove_tag = True -spacelist= [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - #https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
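The `Calculator` above aligns reference and hypothesis tokens with a dynamic-programming edit distance and tallies correct/substitution/deletion/insertion counts; the CER reported for this model is (S + D + I) / N over reference characters. A bare-bones sketch that computes only the distance, without the alignment backtrace or per-token statistics:

```python
# Character error rate via Levenshtein distance:
# CER = (substitutions + deletions + insertions) / reference length.
def cer(ref: str, hyp: str) -> float:
    r, h = list(ref), list(hyp)
    d = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
    for i in range(len(r) + 1):
        d[i][0] = i
    for j in range(len(h) + 1):
        d[0][j] = j
    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            cost = 0 if r[i - 1] == h[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # correct / substitution
    return d[len(r)][len(h)] / max(len(r), 1)

print(cer("今天天气很好", "今天天很好"))  # 1 deletion over 6 chars -> ~0.167
```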
- sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - 
j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array)==0: continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) 
- else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===========================================================================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster([ k for k in default_clusters[cluster_id] ]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - 
print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif token[0] == '<' and token[len(token)-1] == '>' and \ - cluster_id == '' : - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('===========================================================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/compute_cmvn_stats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/compute_cmvn_stats.py deleted file mode 100644 index 9c89789c47be0c855939469e86040f10398e9d89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/compute_cmvn_stats.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys -import argparse -import json -import codecs -import yaml - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.utils.data import Dataset, DataLoader - -torchaudio.set_audio_backend("sox_io") - - -class CollateFunc(object): - ''' Collate function for AudioDataset - ''' - - def __init__(self, feat_dim, resample_rate): - self.feat_dim = feat_dim - self.resample_rate = resample_rate - pass - - def __call__(self, batch): - mean_stat = torch.zeros(self.feat_dim) - var_stat = torch.zeros(self.feat_dim) - number = 0 - for item in batch: - value = item[1].strip().split(",") - assert len(value) == 3 or len(value) == 1 - wav_path = value[0] - sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate - resample_rate = sample_rate - # len(value) == 3 means segmented wav.scp, - # len(value) == 1 means original wav.scp - if len(value) == 3: - start_frame = int(float(value[1]) * sample_rate) - end_frame = int(float(value[2]) * sample_rate) - waveform, sample_rate = torchaudio.backend.sox_io_backend.load( - filepath=wav_path, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(item[1]) - - waveform = waveform * (1 << 15) - if self.resample_rate != 0 and self.resample_rate != sample_rate: - resample_rate = self.resample_rate - waveform = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - - mat = kaldi.fbank(waveform, - num_mel_bins=self.feat_dim, - dither=0.0, - energy_floor=0.0, - sample_frequency=resample_rate) - mean_stat += torch.sum(mat, axis=0) - var_stat += torch.sum(torch.square(mat), axis=0) - number += mat.shape[0] - return number, mean_stat, var_stat - - -class AudioDataset(Dataset): - def __init__(self, data_file): - self.items = [] - with codecs.open(data_file, 'r', encoding='utf-8') as f: - for line in f: - arr = line.strip().split() - self.items.append((arr[0], arr[1])) - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - return self.items[idx] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='extract CMVN stats') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for processing') - parser.add_argument('--train_config', - default='', - help='training yaml conf') - parser.add_argument('--in_scp', default=None, help='wav scp file') - parser.add_argument('--out_cmvn', - 
default='global_cmvn', - help='global cmvn file') - - doc = "Print log after every log_interval audios are processed." - parser.add_argument("--log_interval", type=int, default=1000, help=doc) - args = parser.parse_args() - - with open(args.train_config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - resample_rate = 0 - if 'resample_conf' in configs['dataset_conf']: - resample_rate = configs['dataset_conf']['resample_conf']['resample_rate'] - print('using resample and new sample rate is {}'.format(resample_rate)) - - collate_func = CollateFunc(feat_dim, resample_rate) - dataset = AudioDataset(args.in_scp) - batch_size = 20 - data_loader = DataLoader(dataset, - batch_size=batch_size, - shuffle=True, - sampler=None, - num_workers=args.num_workers, - collate_fn=collate_func) - - with torch.no_grad(): - all_number = 0 - all_mean_stat = torch.zeros(feat_dim) - all_var_stat = torch.zeros(feat_dim) - wav_number = 0 - for i, batch in enumerate(data_loader): - number, mean_stat, var_stat = batch - all_mean_stat += mean_stat - all_var_stat += var_stat - all_number += number - wav_number += batch_size - - if wav_number % args.log_interval == 0: - print(f'processed {wav_number} wavs, {all_number} frames', - file=sys.stderr, - flush=True) - - cmvn_info = { - 'mean_stat': list(all_mean_stat.tolist()), - 'var_stat': list(all_var_stat.tolist()), - 'frame_num': all_number - } - - with open(args.out_cmvn, 'w') as fout: - fout.write(json.dumps(cmvn_info)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/compute_fbank_feats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/compute_fbank_feats.py deleted file mode 100644 index 4cc7dae54de6e8b24b14148bd3930d19b4d7b28c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/compute_fbank_feats.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging - -import torchaudio -import torchaudio.compliance.kaldi as kaldi - -import wenet.dataset.kaldi_io as kaldi_io - -# The "sox" backends are deprecated and will be removed in 0.9.0 release. 
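Both tools above build 80-dim fbank features with `torchaudio.compliance.kaldi.fbank`, and the CMVN tool first scales float waveforms by `1 << 15` to the 16-bit sample range. A self-contained sketch on a synthetic waveform (assumes torchaudio is installed; no wav.scp needed):

```python
# Kaldi-compatible fbank features from a synthetic 1-second tone.
import math
import torch
import torchaudio.compliance.kaldi as kaldi

sr = 16000
t = torch.arange(sr, dtype=torch.float32) / sr
waveform = (0.1 * torch.sin(2 * math.pi * 440.0 * t)).unsqueeze(0)  # (channels, samples)

mat = kaldi.fbank(waveform * (1 << 15),   # scale to 16-bit range, as above
                  num_mel_bins=80,
                  frame_length=25,
                  frame_shift=10,
                  dither=0.0,
                  energy_floor=0.0,
                  sample_frequency=sr)
print(mat.shape)  # (num_frames, 80)
```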
-# So here we use sox_io backend -torchaudio.set_audio_backend("sox_io") - - -def parse_opts(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--num_mel_bins', - default=80, - type=int, - help='Number of triangular mel-frequency bins') - parser.add_argument('--frame_length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument('--frame_shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument('--dither', - type=int, - default=0.0, - help='Dithering constant (0.0 means no dither)') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_scp', help='wav scp file') - parser.add_argument('out_ark', help='output ark file') - parser.add_argument('out_scp', help='output scp file') - args = parser.parse_args() - return args - - -# wav format: -def load_wav_scp(wav_scp_file): - wav_list = [] - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_list.append((arr[0], arr[1])) - return wav_list - - -# wav format: -def load_wav_scp_dict(wav_scp_file): - wav_dict = {} - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_dict[arr[0]] = arr[1] - return wav_dict - - -# Segments format: -def load_wav_segments(wav_scp_file, segments_file): - wav_dict = load_wav_scp_dict(wav_scp_file) - audio_list = [] - with open(segments_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - key = arr[0] - wav_file = wav_dict[arr[1]] - start = float(arr[2]) - end = float(arr[3]) - audio_list.append((key, wav_file, start, end)) - return audio_list - - -if __name__ == '__main__': - args = parse_opts() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - if args.segments is None: - audio_list = load_wav_scp(args.wav_scp) - else: - audio_list = load_wav_segments(args.wav_scp, args.segments) - - count = 0 - with open(args.out_ark, 'wb') as ark_fout, \ - open(args.out_scp, 'w', encoding='utf8') as scp_fout: - for item in audio_list: - if len(item) == 2: - key, wav_path = item - waveform, sample_rate = torchaudio.load_wav(wav_path) - else: - assert len(item) == 4 - key, wav_path, start, end = item - sample_rate = torchaudio.info(wav_path).sample_rate - frame_offset = int(start * sample_rate) - num_frames = int((end - start) * sample_rate) - waveform, sample_rate = torchaudio.load_wav( - wav_path, frame_offset, num_frames) - - mat = kaldi.fbank(waveform, - num_mel_bins=args.num_mel_bins, - frame_length=args.frame_length, - frame_shift=args.frame_shift, - dither=args.dither, - energy_floor=0.0, - sample_frequency=sample_rate) - mat = mat.detach().numpy() - kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout) - count += 1 - if count % 10000 == 0: - logging.info('Progress {}/{}'.format(count, len(audio_list))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/copy_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/copy_data_dir.sh deleted file mode 100644 index ee880c4c3ca398a58a4e306467c639b0a76310bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/copy_data_dir.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins University (author: 
Daniel Povey) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# feats.scp -# wav.scp -# vad.scp -# spk2utt -# utt2spk -# text -# -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance and/or speaker names. Note, the recording-ids stay the same. -# - - -# begin configuration section -spk_prefix= -utt_prefix= -spk_suffix= -utt_suffix= -validate_opts= # should rarely be needed. -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" - echo "Options" - echo " --spk-prefix= # Prefix for speaker ids, default empty" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --spk-suffix= # Suffix for speaker ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -srcdir=$1 -destdir=$2 - -if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" - exit 1; -fi - -if [ "$destdir" == "$srcdir" ]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -set -e; - -mkdir -p $destdir - -cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map -cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map - -if [ ! -f $srcdir/utt2uniq ]; then - if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then - cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq - fi -else - cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq -fi - -cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ - utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk - -utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt - -if [ -f $srcdir/feats.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp -fi - -if [ -f $srcdir/vad.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp -fi - -if [ -f $srcdir/segments ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments - cp $srcdir/wav.scp $destdir -else # no segments->wav indexed by utt. 
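The core of copy_data_dir.sh above is an id map (`utt_map` / `spk_map`) built from a prefix or suffix and applied to the first field of each per-utterance file. A small sketch of that renaming (the `1-` prefix and file contents are hypothetical):

```python
# Build old-id -> new-id map and rewrite field 1, as utt_map/apply_map.pl do.
def build_id_map(ids, prefix="", suffix=""):
    return {i: f"{prefix}{i}{suffix}" for i in ids}

def apply_map_field1(lines, id_map):
    out = []
    for line in lines:
        key, _, rest = line.partition(" ")
        out.append(f"{id_map.get(key, key)} {rest}".rstrip())
    return out

utt_map = build_id_map(["utt001", "utt002"], prefix="1-")
print(apply_map_field1(["utt001 /wavs/a.wav", "utt002 /wavs/b.wav"], utt_map))
# ['1-utt001 /wavs/a.wav', '1-utt002 /wavs/b.wav']
```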
- if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/utt2num_frames ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in frame_shift stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -echo $validate_opts -echo $destdir -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/decode.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/decode.sh deleted file mode 100644 index 1d49b0e48631f4818fb9c464df66904170275a33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/decode.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: binbinzhang@mobvoi.com (Binbin Zhang) -export GLOG_logtostderr=1 -export GLOG_v=2 - -set -e - -nj=1 -chunk_size=-1 -ctc_weight=0.0 -reverse_weight=0.0 -rescoring_weight=1.0 -# For CTC WFST based decoding -fst_path= -dict_path= -acoustic_scale=1.0 -beam=15.0 -lattice_beam=12.0 -min_active=200 -max_active=7000 -blank_skip_thresh=1.0 -length_penalty=0.0 - -. tools/parse_options.sh || exit 1; -if [ $# != 5 ]; then - echo "Usage: $0 [options] " - exit 1; -fi - -if ! which decoder_main > /dev/null; then - echo "decoder_main is not built, please go to runtime/libtorch to build it." - exit 1; -fi - -scp=$1 -label_file=$2 -model_file=$3 -unit_file=$4 -dir=$5 - -mkdir -p $dir/split${nj} - -# Step 1. Split wav.scp -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp" -done -tools/data/split_scp.pl ${scp} ${split_scps} - -# Step 2. Parallel decoding -wfst_decode_opts= -if [ ! 
-z $fst_path ]; then - wfst_decode_opts="--fst_path $fst_path" - wfst_decode_opts="$wfst_decode_opts --beam $beam" - wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path" - wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam" - wfst_decode_opts="$wfst_decode_opts --max_active $max_active" - wfst_decode_opts="$wfst_decode_opts --min_active $min_active" - wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale" - wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh" - wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty" - echo $wfst_decode_opts > $dir/config -fi -for n in $(seq ${nj}); do -{ - decoder_main \ - --rescoring_weight $rescoring_weight \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --chunk_size $chunk_size \ - --wav_scp ${dir}/split${nj}/wav.${n}.scp \ - --model_path $model_file \ - --unit_path $unit_file \ - $wfst_decode_opts \ - --result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log -} & -done -wait - -# Step 3. Merge files -for n in $(seq ${nj}); do - cat ${dir}/split${nj}/${n}.text -done > ${dir}/text -tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf - -# Step 4. Compute WER -python3 tools/compute-wer.py --char=1 --v=1 \ - $label_file $dir/text > $dir/wer diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/feat_to_shape.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/feat_to_shape.sh deleted file mode 100644 index ab6d45c60709dd05a38f8da269d617233d0d39f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/feat_to_shape.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Begin configuration section. -nj=4 -cmd=run.pl -verbose=0 -filetype="" -preprocess_conf="" -# End configuration section. - -help_message=$(cat << EOF -Usage: $0 [options] [] -e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) - -echo "$0 $*" 1>&2 # Print the command line for logging - -. parse_options.sh || exit 1; - -if [ $# -lt 2 ] || [ $# -gt 3 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -scp=$1 -outscp=$2 -data=$(dirname ${scp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${logdir}/feats.${n}.scp" -done - -utils/split_scp.pl ${scp} ${split_scps} - -if [ -n "${preprocess_conf}" ]; then - preprocess_opt="--preprocess-conf ${preprocess_conf}" -else - preprocess_opt="" -fi -if [ -n "${filetype}" ]; then - filetype_opt="--filetype ${filetype}" -else - filetype_opt="" -fi - -${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ - feat-to-len --verbose=${verbose} \ - scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp - -feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -) - -# concatenate the .scp files together. 
-for n in $(seq ${nj}); do - sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp -done > ${outscp} - -rm -f ${logdir}/feats.*.scp 2>/dev/null diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/filter_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/filter_scp.pl deleted file mode 100644 index b76d37f41be0886470281978bfacf97f6b8ae976..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/filter_scp.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose "n-th" field is an utterance id), printing -# out only those lines whose "n-th" field is in id_list. The index of -# the "n-th" field is 1, by default, but can be changed by using -# the -f switch - -$exclude = 0; -$field = 1; -$shifted = 0; - -do { - $shifted=0; - if ($ARGV[0] eq "--exclude") { - $exclude = 1; - shift @ARGV; - $shifted=1; - } - if ($ARGV[0] eq "-f") { - $field = $ARGV[1]; - shift @ARGV; shift @ARGV; - $shifted=1 - } -} while ($shifted); - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . - "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . - "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . - "only the lines that were *not* in id_list.\n" . - "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . - "If your older scripts (written before Oct 2014) stopped working and you used the\n" . - "-f option, add 1 to the argument.\n" . - "See also: utils/filter_scp.pl .\n"; -} - - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -if ($field == 1) { # Treat this as special case, since it is common. - while(<>) { - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { - print $_; - } - } -} else { - while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { - print $_; - } - } -} - -# tests: -# the following should print "foo 1" -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) -# the following should print "bar 2". 
-# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fix_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fix_data_dir.sh deleted file mode 100644 index d1644c1cac4264c78eae7d91b03c4126baf7ec4c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fix_data_dir.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# This script makes sure that only the segments present in -# all of "feats.scp", "wav.scp" [if present], segments [if present] -# text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into -# data-dir/.backup - -cmd="$@" - -utt_extra_files= -spk_extra_files= - -. tools/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: utils/data/fix_data_dir.sh " - echo "e.g.: utils/data/fix_data_dir.sh data/train" - echo "This script helps ensure that the various files in a data directory" - echo "are correctly sorted and filtered, for example removing utterances" - echo "that have no features (if feats.scp is present)" - exit 1 -fi - -data=$1 - -if [ -f $data/images.scp ]; then - image/fix_data_dir.sh $cmd - exit $? -fi - -mkdir -p $data/.backup - -[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; - -[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; - -set -e -o pipefail -u - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted { - file=$1 - sort -k1,1 -u <$file >$file.tmp - if ! cmp -s $file $file.tmp; then - echo "$0: file $1 is not in sorted order or not unique, sorting it" - mv $file.tmp $file - else - rm $file.tmp - fi -} - -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - check_sorted $data/$x - fi -done - - -function filter_file { - filter=$1 - file_to_filter=$2 - cp $file_to_filter ${file_to_filter}.tmp - tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter - if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=$(cat ${file_to_filter}.tmp | wc -l) - length2=$(cat ${file_to_filter} | wc -l) - if [ $length1 -ne $length2 ]; then - echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." - fi - fi - rm $file_to_filter.tmp -} - -function filter_recordings { - # We call this once before the stage when we filter on utterance-id, and once - # after. - - if [ -f $data/segments ]; then - # We have a segments file -> we need to filter this and the file wav.scp, and - # reco2file_and_utt, if it exists, to make sure they have the same list of - # recording-ids. - - if [ ! -f $data/wav.scp ]; then - echo "$0: $data/segments exists but not $data/wav.scp" - exit 1; - fi - awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=$(cat $tmpdir/recordings | wc -l) - [ ! -s $tmpdir/recordings ] && \ - echo "Empty list of recordings (bad file $data/segments)?" 
&& exit 1; - tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp - mv $tmpdir/recordings.tmp $tmpdir/recordings - - - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - filter_file $tmpdir/recordings $data/segments - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - rm $data/segments.tmp - - filter_file $tmpdir/recordings $data/wav.scp - [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur - true - fi -} - -function filter_speakers { - # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... - tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do - f=$data/$s - if [ -f $f ]; then - filter_file $f $tmpdir/speakers - fi - done - - filter_file $tmpdir/speakers $data/spk2utt - tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - - for s in cmvn.scp spk2gender $spk_extra_files; do - f=$data/$s - if [ -f $f ]; then - filter_file $tmpdir/speakers $f - fi - done -} - -function filter_utts { - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - echo "$(cat $tmpdir/utts | wc -l)" - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; - - ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; - - ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ - echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - - if [ -f $data/utt2uniq ]; then - ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ - echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; - fi - - maybe_wav= - maybe_reco2dur= - [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. - [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - - maybe_utt2dur= - if [ -f $data/utt2dur ]; then - cat $data/utt2dur | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1 - maybe_utt2dur=utt2dur.ok - fi - - maybe_utt2num_frames= - if [ -f $data/utt2num_frames ]; then - cat $data/utt2num_frames | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1 - maybe_utt2num_frames=utt2num_frames.ok - fi - - for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do - if [ -f $data/$x ]; then - tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp - echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)" - mv $tmpdir/utts.tmp $tmpdir/utts - # echo "$tmpdir/utts" - fi - done - rm $data/utt2dur.ok 2>/dev/null || true - rm $data/utt2num_frames.ok 2>/dev/null || true - - [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ - rm $tmpdir/utts && exit 1; - - - if [ -f $data/utt2spk ]; then - new_nutts=$(cat $tmpdir/utts | wc -l) - old_nutts=$(cat $data/utt2spk | wc -l) - if [ $new_nutts -ne $old_nutts ]; then - echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" - else - echo "fix_data_dir.sh: kept all $old_nutts utterances." 
- fi - fi - - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then - tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x - fi - fi - done - -} - -filter_recordings -filter_speakers -filter_utts -filter_speakers -filter_recordings - -tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - -echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/flake8_hook.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/flake8_hook.py deleted file mode 100644 index bbe21bf4aa8ab460aca0eba5a24785e4d6b2c39d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -import sys - -from flake8.main import git - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/format_data.sh deleted file mode 100644 index 51f4602dfa0bac7873541c7f621ef4bb9eb29c94..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/format_data.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Mobvoi Corporation (Author: Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -echo "$0 $*" >&2 # Print the command line for logging -. ./path.sh - -nj=1 -cmd=run.pl -nlsyms="" -lang="" -feat="" -feat_type="kaldi" -oov="" -bpecode="" -allow_one_column=false -raw="" -verbose=0 -trans_type=char -filetype="" -preprocess_conf="" -category="" -out="" # If omitted, write in stdout -help_message=$(cat << EOF -Usage: $0 -e.g. $0 data/train data/lang_1char/train_units.txt -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --feat # feat.scp or feat1.scp,feat2.scp,... - --feat-type # kaldi or wav - --oov # Default: - --out # If omitted, write in stdout - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) -. tools/parse_options.sh - -if [ $# != 2 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -dir=$1 -dic=$2 -tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) -#trap 'rm -rf ${tmpdir}' EXIT - -# 1. 
Create scp files for inputs -# These are not necessary for decoding mode, and make it as an option -input= -if [ -n "${feat}" ]; then - _feat_scps=$(echo "${feat}" | tr ',' ' ' ) - read -r -a feat_scps <<< $_feat_scps - num_feats=${#feat_scps[@]} - - for (( i=1; i<=num_feats; i++ )); do - feat=${feat_scps[$((i-1))]} - mkdir -p ${tmpdir}/input_${i} - input+="input_${i} " - cat ${feat} > ${tmpdir}/input_${i}/feat.scp - - # Dump in the "legacy" style JSON format - if [ -n "${filetype}" ]; then - awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ - > ${tmpdir}/input_${i}/filetype.scp - fi - - if [ ${feat_type} == "kaldi" ]; then - tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ - --filetype "${filetype}" \ - --preprocess-conf "${preprocess_conf}" \ - --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp - elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then - if [ -f $dir/segments ]; then - # used for segmented wav.scp - awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur - fi - if [ ! -f $dir/utt2dur ]; then - tools/wav_to_duration.sh --nj ${nj} \ - ${feat} ${tmpdir}/input_${i}/shape.scp - # use the existed utt2dur as shape.scp directly - else - cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp - fi - fi - done -fi - -# 2. Create scp files for outputs -mkdir -p ${tmpdir}/output -if [ -n "${bpecode}" ]; then - if [ "${trans_type}" == "cn_char_en_bpe" ]; then - tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp - else - paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \ - | tools/spm_encode --model=${bpecode} --output_format=piece) \ - > ${tmpdir}/output/token.scp - fi -elif [ -n "${nlsyms}" ]; then - tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -elif [ -n "${raw}" ]; then - cat $dir/text > ${tmpdir}/output/token.scp -else - tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -fi -< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp -odim=$(cat ${dic} | wc -l) -< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp - -cat ${dir}/text > ${tmpdir}/output/text.scp - -# 3. Create scp files for the others -mkdir -p ${tmpdir}/other -if [ -n "${lang}" ]; then - awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp -fi - -if [ -n "${category}" ]; then - awk -v category=${category} '{print $1 " " category}' ${dir}/text \ - > ${tmpdir}/other/category.scp -fi -#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp - -# 4. 
Merge scp files into a one file -opts="" -for intype in ${input} output other; do - if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then - continue - fi - - if [ ${intype} != other ]; then - opts+="--${intype%_*}-scps " - else - opts+="--scps " - fi - - for x in "${tmpdir}/${intype}"/*.scp; do - k=$(basename ${x} .scp) - if [ ${k} = shape ]; then - opts+="shape:${x}:shape " - else - opts+="${k}:${x} " - fi - done -done - -if ${allow_one_column}; then - opts+="--allow-one-column true " -else - opts+="--allow-one-column false " -fi - -if [ -n "${out}" ]; then - opts+="-O ${out}" -fi - -tools/merge_scp2txt.py --verbose ${verbose} ${opts} - -#rm -fr ${tmpdir} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/add_lex_disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/add_lex_disambig.pl deleted file mode 100644 index dd8a25de6e1140a6d19b1e876f2e76f528532edf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/add_lex_disambig.pl +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2013-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. -# With the --pron-probs option, expects the second field -# of each lexicon line to be a pron-prob. -# With the --sil-probs option, expects three additional -# fields after the pron-prob, representing various components -# of the silence probability model. - -$pron_probs = 0; -$sil_probs = 0; -$first_allowed_disambig = 1; - -for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { - if ($ARGV[0] eq "--pron-probs") { - $pron_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--sil-probs") { - $sil_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--first-allowed-disambig") { - $first_allowed_disambig = 0 + $ARGV[1]; - if ($first_allowed_disambig < 1) { - die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; - } - shift @ARGV; - shift @ARGV; - } -} - -if (@ARGV != 2) { - die "Usage: add_lex_disambig.pl [opts] \n" . - "This script adds disambiguation symbols to a lexicon in order to\n" . - "make decoding graphs determinizable; it adds pseudo-phone\n" . - "disambiguation symbols #1, #2 and so on at the ends of phones\n" . - "to ensure that all pronunciations are different, and that none\n" . - "is a prefix of another.\n" . - "It prints to the standard output the number of the largest-numbered" . - "disambiguation symbol that was used.\n" . - "\n" . - "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . 
- " --sil-probs [should be with --pron-probs option]\n" . - " Expect 3 extra fields after the pron-probs, for aspects of\n" . - " the silence probability model\n" . - " --first-allowed-disambig The number of the first disambiguation symbol\n" . - " that this script is allowed to add. By default this is\n" . - " #1, but you can set this to a larger value using this option.\n" . - "e.g.:\n" . - " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { - $p = shift @A; - if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } - } - if ($sil_probs) { - $silp = shift @A; - if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - } - if (!(@A)) { - die "Bad lexicon line $1, no phone in phone list"; - } - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that it exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { shift @A; } # remove pron-prob. - if ($sil_probs) { - shift @A; # Remove silprob - shift @A; # Remove silprob - } - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -# max_disambig will always be the highest-numbered disambiguation symbol that -# has been used so far. -$max_disambig = $first_allowed_disambig - 1; - -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - if ($pron_probs) { - $pron_prob = shift @A; - } - if ($sil_probs) { - $sil_word_prob = shift @A; - $word_sil_correction = shift @A; - $prev_nonsil_correction = shift @A - } - $phnseq = join(" ", @A); - if (!defined $issubseq{$phnseq} - && $count{$phnseq} == 1) { - ; # Do nothing. - } else { - if ($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved_for_the_empty_string{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; - if (!defined $cur_disambig) { - $cur_disambig = $first_allowed_disambig; - } else { - $cur_disambig++; # Get a number that has not been used yet for - # this phone sequence. - } - while (defined $reserved_for_the_empty_string{$cur_disambig}) { - $cur_disambig++; - } - if ($cur_disambig > $max_disambig) { - $max_disambig = $cur_disambig; - } - $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; - $phnseq = $phnseq . " #" . 
$cur_disambig; - } - } - if ($pron_probs) { - if ($sil_probs) { - print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; - } else { - print O "$word\t$pron_prob\t$phnseq\n"; - } - } else { - print O "$word\t$phnseq\n"; - } -} - -print $max_disambig . "\n"; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/compile_lexicon_token_fst.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/compile_lexicon_token_fst.sh deleted file mode 100644 index b67814fe3f3244b14b8e494bfe46c4829c4f8bd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/compile_lexicon_token_fst.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -# Copyright 2015 Yajie Miao (Carnegie Mellon University) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the -# phoneme and character-based lexicons. -set -eo pipefail -. tools/parse_options.sh - -if [ $# -ne 3 ]; then - echo "usage: tools/fst/compile_lexicon_token_fst.sh " - echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" - echo " should contain the following files:" - echo "lexicon.txt units.txt" - echo "options: " - exit 1; -fi - -srcdir=$1 -tmpdir=$2 -dir=$3 -mkdir -p $dir $tmpdir - -[ -f path.sh ] && . ./path.sh - -export LC_ALL=C - -cp $srcdir/units.txt $dir - -# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. -# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. -perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; - -# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. -# Without these symbols, determinization will fail. -ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` -ndisambig=$[$ndisambig+1]; - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list - -# Get the full list of CTC tokens used in FST. These tokens include , the blank , -# the actual model unit, and the disambiguation symbols. -cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list -(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt - -# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, -# so here use ctc_token_fst_compact -tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; - -# Encode the words with indices. 
Will be used in lexicon and language model FST compiling. -cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' - BEGIN { - print "<eps> 0"; - } - { - printf("%s %d\n", $1, NR); - } - END { - printf("#0 %d\n", NR+1); - printf("<s> %d\n", NR+2); - printf("</s> %d\n", NR+3); - }' > $dir/words.txt || exit 1; - -# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time. -token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` - -tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ - fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; - -echo "Lexicon and token FSTs compiling succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/ctc_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/ctc_token_fst.py deleted file mode 100644 index d81644b9cd216177a10a17772781d3293abe084f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/ctc_token_fst.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 1 <eps> <eps>') -print('1 1 <blank> <eps>') -print('2 2 <blank> <eps>') -print('2 0 <eps> <eps>') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 3 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '<eps>' or phone == '<blank>': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '<eps>', phone)) - else: - print('{} {} {} {}'.format(1, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '<eps>')) - print('{} {} {} {}'.format(node, 2, '<eps>', '<eps>')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/ctc_token_fst_compact.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/ctc_token_fst_compact.py deleted file mode 100644 index d3018d8b14ce25108cb1acc637cecded5d41be13..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/ctc_token_fst_compact.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 <blank> <eps>') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 1 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '<eps>' or phone == '<blank>': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '<eps>', phone)) - else: - print('{} {} {} {}'.format(0, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '<eps>')) - print('{} {} {} {}'.format(node, 0, '<eps>', '<eps>')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/ctc_token_fst_corrected.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/ctc_token_fst_corrected.py deleted file mode 100644 index 81f7079eccb9e6447c46cdfdf6378aca7efe4a09..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/ctc_token_fst_corrected.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python - -import sys - - -def il(n): - return n
+ 1 - - -def ol(n): - return n + 1 - - -def s(n): - return n - - -if __name__ == "__main__": - with open(sys.argv[1]) as f: - lines = f.readlines() - phone_count = 0 - disambig_count = 0 - for line in lines: - sp = line.split() - phone = sp[0] - if phone == '' or phone == '': - continue - if phone.startswith('#'): - disambig_count += 1 - else: - phone_count += 1 - - # 1. add start state - print('0 0 {} 0'.format(il(0))) - - # 2. 0 -> i, i -> i, i -> 0 - for i in range(1, phone_count + 1): - print('0 {} {} {}'.format(s(i), il(i), ol(i))) - print('{} {} {} 0'.format(s(i), s(i), il(i))) - print('{} 0 {} 0'.format(s(i), il(0))) - - # 3. i -> other phone - for i in range(1, phone_count + 1): - for j in range(1, phone_count + 1): - if i != j: - print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) - - # 4. add disambiguous arcs on every final state - for i in range(0, phone_count + 1): - for j in range(phone_count + 2, phone_count + disambig_count + 2): - print('{} {} {} {}'.format(s(i), s(i), 0, j)) - - # 5. every i is final state - for i in range(0, phone_count + 1): - print(s(i)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/eps2disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/eps2disambig.pl deleted file mode 100644 index e1d84a6bf56703596a0e4552d184f7168f724bcb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/eps2disambig.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - if (/\s+#0\s+/) { - print STDERR "$0: ERROR: LM has word #0, " . - "which is reserved as disambiguation symbol\n"; - exit 1; - } - s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; - print; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/make_lexicon_fst.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/make_lexicon_fst.pl deleted file mode 100644 index f97129c05cb3ba6460be401e92001261acfaf746..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/make_lexicon_fst.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). - -$pron_probs = 0; - -if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { - $pron_probs = 1; - shift @ARGV; -} - -if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; - print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; - print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; - print STDERR " word phone1 phone2 ... phoneN;\n"; - print STDERR "if the --pron-probs option is used, each line is:\n"; - print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; - print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; - print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; - print STDERR "this is your responsibility.\n\n"; - print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; - print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; - print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; - exit(1); -} - -$lexfn = shift @ARGV; -if (@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2) { - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if ($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - -if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. - while () { - @A = split(" ", $_); - @A == 0 && die "Empty lexicon line."; - foreach $a (@A) { - if ($a eq "") { - die "Bad lexicon line $_ ( is forbidden)"; - } - } - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; # so we only print it on the first arc of the word. - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. 
- if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while () { - @A = split(" ", $_); - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. - $s = $ns; - } elsif (!defined($silphone) || $p ne $silphone) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/make_tlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/make_tlg.sh deleted file mode 100644 index 98694e5540968760f0c27eaf30a6668f4c46c50d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/make_tlg.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# - -if [ -f path.sh ]; then . path.sh; fi - -lm_dir=$1 -src_lang=$2 -tgt_lang=$3 - -arpa_lm=${lm_dir}/lm.arpa -[ ! 
-f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -rm -rf $tgt_lang -cp -r $src_lang $tgt_lang - -# Compose the language model to FST -cat $arpa_lm | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v -i '' | \ - grep -v -i '' | \ - arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \ - tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \ - --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst - - -echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic $tgt_lang/G.fst - -# Compose the token, lexicon and language-model FST into the final decoding graph -fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1; -fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1; - -echo "Composing decoding graph TLG.fst succeeded" -#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/prepare_dict.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/prepare_dict.py deleted file mode 100644 index 8a6a3cfe7cfded0c863637deef0bae2f2ede5557..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/prepare_dict.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -# sys.argv[1]: e2e model unit file(lang_char.txt) -# sys.argv[2]: raw lexicon file -# sys.argv[3]: output lexicon file -# sys.argv[4]: bpemodel - -unit_table = set() -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for line in fin: - unit = line.split()[0] - unit_table.add(unit) - - -def contain_oov(units): - for unit in units: - if unit not in unit_table: - return True - return False - - -bpemode = len(sys.argv) > 4 -if bpemode: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.Load(sys.argv[4]) -lexicon_table = set() -with open(sys.argv[2], 'r', encoding='utf8') as fin, \ - open(sys.argv[3], 'w', encoding='utf8') as fout: - for line in fin: - word = line.split()[0] - if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel - continue - elif word == '': - continue - else: - # each word only has one pronunciation for e2e system - if word in lexicon_table: - continue - if bpemode: - # We assume that the lexicon does not contain code-switch, - # i.e. the word contains both English and Chinese. - # see PR https://github.com/wenet-e2e/wenet/pull/1693 - # and Issue https://github.com/wenet-e2e/wenet/issues/1653 - if word.encode('utf8').isalpha(): - pieces = sp.EncodeAsPieces(word) - else: - pieces = word - if contain_oov(pieces): - print( - 'Ignoring words {}, which contains oov unit'.format( - ''.join(word).strip('▁')) - ) - continue - chars = ' '.join( - [p if p in unit_table else '' for p in pieces]) - else: - # ignore words with OOV - if contain_oov(word): - print('Ignoring words {}, which contains oov unit'.format(word)) - continue - # Optional, append ▁ in front of english word - # we assume the model unit of our e2e system is char now. 
- if word.encode('utf8').isalpha() and '▁' in unit_table: - word = '▁' + word - chars = ' '.join(word) # word is a char list - fout.write('{} {}\n'.format(word, chars)) - lexicon_table.add(word) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/remove_oovs.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/remove_oovs.pl deleted file mode 100644 index ac914c3bd9363eded791cdeb309fd05e980c4f2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/remove_oovs.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script removes lines that contain these OOVs on either the -# third or fourth fields of the line. It is intended to remove arcs -# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). - -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; -} - -$unklist = shift @ARGV; -open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; -while(<S>){ - @A = split(" ", $_); - @A == 1 || die "Bad line in unknown-symbol list: $_"; - $unk{$A[0]} = 1; -} - -$num_removed = 0; -while(<>){ - @A = split(" ", $_); - if(defined $unk{$A[2]} || defined $unk{$A[3]}) { - $num_removed++; - } else { - print; - } -} -print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/rnnt_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/rnnt_token_fst.py deleted file mode 100644 index cc6def1703311ab700a4a01f22c1adda32db9b0d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/rnnt_token_fst.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 <blank> <eps>') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '<eps>' or phone == '<blank>': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '<eps>', phone)) - else: - print('{} {} {} {}'.format(0, 0, phone, phone)) -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/s2eps.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/s2eps.pl deleted file mode 100644 index ffeeb8eb6af3c4f319f31ebff80be388d8f59e1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/fst/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file
except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces and with (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } - if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } - } - print join("\t", @A) . "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/git-pre-commit b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/git-pre-commit deleted file mode 100644 index b6e448ed375a0ddf502ce332685de8a99e88dc08..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/git-pre-commit +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -e - -echo "Running pre-commit flake8" -python tools/flake8_hook.py diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/install_srilm.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/install_srilm.sh deleted file mode 100644 index 4aa113c14722a73fd3d3f84430025d44173c207b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/install_srilm.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2022 Binbin Zhang(binbzha@qq.com) - -current_path=`pwd` -current_dir=`basename "$current_path"` - -if [ "tools" != "$current_dir" ]; then - echo "You should run this script in tools/ directory!!" - exit 1 -fi - -! command -v gawk > /dev/null && \ - echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; - -srilm_url="https://github.com/BitSpeech/SRILM/archive/refs/tags/1.7.3.tar.gz" - -if [ ! -f ./srilm.tar.gz ]; then - if ! wget -O ./srilm.tar.gz "$srilm_url"; then - echo 'There was a problem downloading the file.' - echo 'Check you internet connection and try again.' - exit 1 - fi -fi - -tar -zxvf srilm.tar.gz -mv SRILM-1.7.3 srilm - -# set the SRILM variable in the top-level Makefile to this directory. -cd srilm -cp Makefile tmpf - -cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \ - > Makefile || exit 1 -rm tmpf - -make || exit -cd .. - -( - [ ! -z "${SRILM}" ] && \ - echo >&2 "SRILM variable is aleady defined. Undefining..." && \ - unset SRILM - - [ -f ./env.sh ] && . ./env.sh - - [ ! 
-z "${SRILM}" ] && \ - echo >&2 "SRILM config is already in env.sh" && exit - - wd=`pwd` - wd=`readlink -f $wd || pwd` - - echo "export SRILM=$wd/srilm" - dirs="\${PATH}" - for directory in $(cd srilm && find bin -type d ) ; do - dirs="$dirs:\${SRILM}/$directory" - done - echo "export PATH=$dirs" -) >> env.sh - -echo >&2 "Installation of SRILM finished successfully" -echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/k2/make_hlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/k2/make_hlg.sh deleted file mode 100644 index 18c2268487410824ae11b199cf06f37acd717c88..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/k2/make_hlg.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) - -lexion_dir=$1 -lm_dir=$2 -tgt_dir=$3 - -# k2 and icefall updates very fast. Below commits are veryfied in this script. -# k2 3dc222f981b9fdbc8061b3782c3b385514a2d444, icefall 499ac24ecba64f687ff244c7d66baa5c222ecf0f - -# For k2 installation, please refer to https://github.com/k2-fsa/k2/ -python -c "import k2; print(k2.__file__)" -python -c "import torch; import _k2; print(_k2.__file__)" - -# Prepare necessary icefall scripts -if [ ! -d tools/k2/icefall ]; then - git clone --depth 1 https://github.com/k2-fsa/icefall.git tools/k2/icefall -fi -pip3 install -r tools/k2/icefall/requirements.txt -export PYTHONPATH=`pwd`/tools/k2/icefall:`pwd`/tools/k2/icefall/egs/aishell/ASR/local:$PYTHONPATH - -# 8.1 Prepare char based lang -mkdir -p $tgt_dir -python tools/k2/prepare_char.py $lexion_dir/units.txt $lm_dir/wordlist $tgt_dir -echo "Compile lexicon L.pt L_disambig.pt succeeded" - -# 8.2 Prepare G -mkdir -p data/lm -python -m kaldilm \ - --read-symbol-table="$tgt_dir/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $lm_dir/lm.arpa > data/lm/G_3_gram.fst.txt - -# 8.3 Compile HLG -python tools/k2/icefall/egs/aishell/ASR/local/compile_hlg.py --lang-dir $tgt_dir -echo "Compile decoding graph HLG.pt succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/k2/prepare_char.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/k2/prepare_char.py deleted file mode 100644 index 6e05042c42eb280135f6be7cdb3566b185258b90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/k2/prepare_char.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -""" - -This script generates the following files in the directory sys.argv[3]: - - - lexicon.txt - - lexicon_disambig.txt - - L.pt - - L_disambig.pt - - tokens.txt - - words.txt -""" - -import sys -from pathlib import Path -from typing import Dict, List - -import k2 -import torch -from prepare_lang import ( - Lexicon, - add_disambig_symbols, - add_self_loops, - write_lexicon, - write_mapping, -) - - -def lexicon_to_fst_no_sil( - lexicon: Lexicon, - token2id: Dict[str, int], - word2id: Dict[str, int], - need_self_loops: bool = False, -) -> k2.Fsa: - """Convert a lexicon to an FST (in k2 format). - - Args: - lexicon: - The input lexicon. See also :func:`read_lexicon` - token2id: - A dict mapping tokens to IDs. - word2id: - A dict mapping words to IDs. - need_self_loops: - If True, add self-loop to states with non-epsilon output symbols - on at least one arc out of the state. The input label for this - self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. - Returns: - Return an instance of `k2.Fsa` representing the given lexicon. - """ - loop_state = 0 # words enter and leave from here - next_state = 1 # the next un-allocated state, will be incremented as we go - - arcs = [] - - # The blank symbol is defined in local/train_bpe_model.py - assert token2id[""] == 0 - assert word2id[""] == 0 - - eps = 0 - - for word, pieces in lexicon: - assert len(pieces) > 0, f"{word} has no pronunciations" - cur_state = loop_state - - word = word2id[word] - pieces = [ - token2id[i] if i in token2id else token2id[""] for i in pieces - ] - - for i in range(len(pieces) - 1): - w = word if i == 0 else eps - arcs.append([cur_state, next_state, pieces[i], w, 0]) - - cur_state = next_state - next_state += 1 - - # now for the last piece of this word - i = len(pieces) - 1 - w = word if i == 0 else eps - arcs.append([cur_state, loop_state, pieces[i], w, 0]) - - if need_self_loops: - disambig_token = token2id["#0"] - disambig_word = word2id["#0"] - arcs = add_self_loops( - arcs, - disambig_token=disambig_token, - disambig_word=disambig_word, - ) - - final_state = next_state - arcs.append([loop_state, final_state, -1, -1, 0]) - arcs.append([final_state]) - - arcs = sorted(arcs, key=lambda arc: arc[0]) - arcs = [[str(i) for i in arc] for arc in arcs] - arcs = [" ".join(arc) for arc in arcs] - arcs = "\n".join(arcs) - - fsa = k2.Fsa.from_str(arcs, acceptor=False) - return fsa - - -def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool: - """Check if all the given tokens are in token symbol table. - - Args: - token_sym_table: - Token symbol table that contains all the valid tokens. - tokens: - A list of tokens. - Returns: - Return True if there is any token not in the token_sym_table, - otherwise False. - """ - for tok in tokens: - if tok not in token_sym_table: - return True - return False - - -def generate_lexicon( - token_sym_table: Dict[str, int], words: List[str] -) -> Lexicon: - """Generate a lexicon from a word list and token_sym_table. - - Args: - token_sym_table: - Token symbol table that mapping token to token ids. - words: - A list of strings representing words. - Returns: - Return a dict whose keys are words and values are the corresponding - tokens. 
- """ - lexicon = [] - for word in words: - chars = list(word.strip(" \t")) - if contain_oov(token_sym_table, chars): - continue - lexicon.append((word, chars)) - - # The OOV word is - lexicon.append(("", [""])) - return lexicon - - -def generate_tokens(text_file: str) -> Dict[str, int]: - """Generate tokens from the given text file. - - Args: - text_file: - A file that contains text lines to generate tokens. - Returns: - Return a dict whose keys are tokens and values are token ids ranged - from 0 to len(keys) - 1. - """ - token2id: Dict[str, int] = dict() - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - char, index = line.replace('\n', '').split() - assert char not in token2id - token2id[char] = int(index) - assert token2id[''] == 0 - return token2id - - -def generate_words(text_file: str) -> Dict[str, int]: - """Generate words from the given text file. - - Args: - text_file: - A file that contains text lines to generate words. - Returns: - Return a dict whose keys are words and values are words ids ranged - from 0 to len(keys) - 1. - """ - words = [] - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - word = line.replace('\n', '') - assert word not in words - words.append(word) - words.sort() - - # We put '' '' at begining of word2id - # '#0', '', '' at end of word2id - words = [word for word in words - if word not in ['', '', '#0', '', '']] - words.insert(0, '') - words.insert(1, '') - words.append('#0') - words.append('') - words.append('') - word2id = {j: i for i, j in enumerate(words)} - return word2id - - -def main(): - token2id = generate_tokens(sys.argv[1]) - word2id = generate_words(sys.argv[2]) - tgt_dir = Path(sys.argv[3]) - - words = [word for word in word2id.keys() - if word not in - ["", "!SIL", "", "", "#0", "", ""]] - lexicon = generate_lexicon(token2id, words) - - lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) - next_token_id = max(token2id.values()) + 1 - for i in range(max_disambig + 1): - disambig = f"#{i}" - assert disambig not in token2id - token2id[disambig] = next_token_id - next_token_id += 1 - - write_mapping(tgt_dir / "tokens.txt", token2id) - write_mapping(tgt_dir / "words.txt", word2id) - write_lexicon(tgt_dir / "lexicon.txt", lexicon) - write_lexicon(tgt_dir / "lexicon_disambig.txt", lexicon_disambig) - - L = lexicon_to_fst_no_sil( - lexicon, - token2id=token2id, - word2id=word2id, - ) - L_disambig = lexicon_to_fst_no_sil( - lexicon_disambig, - token2id=token2id, - word2id=word2id, - need_self_loops=True, - ) - torch.save(L.as_dict(), tgt_dir / "L.pt") - torch.save(L_disambig.as_dict(), tgt_dir / "L_disambig.pt") - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/latency_metrics.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/latency_metrics.py deleted file mode 100644 index df2d8eee45f8e2d7c8536f208d44fafaeac3341f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/latency_metrics.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2022 Horizon Inc. (author: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import argparse -import logging -import librosa -import torch -import torchaudio -import yaml - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.font_manager as fm -import torchaudio.compliance.kaldi as kaldi - -from wenet.utils.init_model import init_model -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.mask import make_pad_mask -from wenet.utils.common import replace_duplicates_with_blank - - -def get_args(): - parser = argparse.ArgumentParser( - description='Analyze latency and plot CTC-Spike.') - parser.add_argument('--config', required=True, - type=str, help='configration') - parser.add_argument('--gpu', - type=int, - default=0, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--ckpt', required=True, - type=str, help='model checkpoint') - parser.add_argument('--tag', required=True, - type=str, help='image subtitle') - parser.add_argument('--wavscp', required=True, - type=str, help='wav.scp') - parser.add_argument('--alignment', required=True, - type=str, help='force alignment, generated by Kaldi.') - parser.add_argument('--chunk_size', required=True, - type=int, help='chunk size') - parser.add_argument('--left_chunks', default=-1, - type=int, help='left chunks') - parser.add_argument('--font', required=True, - type=str, help='font file') - parser.add_argument('--dict', required=True, - type=str, help='dict file') - parser.add_argument('--result_dir', required=True, - type=str, help='saving pdf') - parser.add_argument('--model_type', default='ctc', - choices=['ctc', 'transducer'], - help='show latency metrics from ctc models or rnn-t models') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - torch.manual_seed(777) - - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - symbol_table = read_symbol_table(args.dict) - char_dict = {v: k for k, v in symbol_table.items()} - - # 1. Load model - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - - model = init_model(conf) - load_checkpoint(model, args.ckpt) - model = model.eval().to(device) - - subsampling = model.encoder.embed.subsampling_rate - eos = model.eos_symbol() - - with open(args.wavscp, 'r') as fin: - wavs = fin.readlines() - - # 2. 
Forward model (get streaming_timestamps) - timestamps = {} - for idx, wav in enumerate(wavs): - if idx % 100 == 0: - logging.info("processed {}.".format(idx)) - key, wav = wav.strip().split(' ', 1) - waveform, sr = torchaudio.load(wav) - resample_rate = conf['dataset_conf']['resample_conf']['resample_rate'] - waveform = torchaudio.transforms.Resample( - orig_freq=sr, new_freq=resample_rate)(waveform) - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank( - waveform, - num_mel_bins=conf['dataset_conf']['fbank_conf']['num_mel_bins'], - frame_length=conf['dataset_conf']['fbank_conf']['frame_length'], - frame_shift=conf['dataset_conf']['fbank_conf']['frame_shift'], - dither=0.0, energy_floor=0.0, - sample_frequency=resample_rate, - ) - - speech = mat.unsqueeze(0).to(device) - speech_lengths = torch.tensor([mat.size(0)]).to(device) - - # Let's assume batch_size = 1 - encoder_out, encoder_mask = model.encoder( - speech, speech_lengths, args.chunk_size, args.left_chunks) - - maxlen = encoder_out.size(1) # (B, maxlen, encoder_dim) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # CTC greedy search - if args.model_type == 'ctc': - ctc_probs = model.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - topk_prob = topk_prob.view(1, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen) - topk_prob = topk_prob.masked_fill_(mask, 0.0) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - hyps = [replace_duplicates_with_blank(hyp) for hyp in hyps] - scores = [prob.tolist() for prob in topk_prob] - timestamps[key] = [hyps[0], scores[0], wav] - - if args.model_type == 'transducer': - hyps = [] - scores = [] - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = 1 - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, - padding, cache) - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - scores.append(torch.max(joint_out_probs).item()) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or \ - per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - hyps.append(model.blank) - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - timestamps[key] = [hyps, scores, wav] - - # 3. 
Analyze latency - with open(args.alignment, 'r') as fin: - aligns = fin.readlines() - not_found, len_unequal, ignored = 0, 0, 0 - datas = [] - for align in aligns: - key, align = align.strip().split(' ', 1) - if key not in timestamps: - not_found += 1 - continue - fa, st = [], [] # force_alignment, streaming_timestamps - text_fa, text_st = "", "" - for i, token in enumerate(align.split()): - if token != '': - text_fa += token - # NOTE(xcsong): W/O subsample - fa.append(i * 10) - # ignore alignment_errors >= 70ms - frames_fa = len(align.split()) - frames_st = len(timestamps[key][0]) * subsampling - if abs(frames_st - frames_fa) >= 7: - ignored += 1 - continue - for i, token_id in enumerate(timestamps[key][0]): - if token_id != 0: - text_st += char_dict[token_id] - # NOTE(xcsong): W subsample - st.append(i * subsampling * 10) - if len(fa) != len(st): - len_unequal += 1 - continue - # datas[i] = [key, text_fa, text_st, list_of_diff, - # FirstTokenDelay, LastTokenDelay, AvgTokenDelay, - # streaming_timestamps, force_alignment] - datas.append([key, text_fa, text_st, - [a - b for a, b in zip(st, fa)], - st[0] - fa[0], st[-1] - fa[-1], - (sum(st) - sum(fa)) / len(st), - timestamps[key], align.split()]) - - logging.info("not found: {}, length unequal: {}, ignored: {}, \ - valid samples: {}".format(not_found, len_unequal, ignored, len(datas))) - - # 4. Plot and print - num_datas = len(datas) - names = ['FirstTokenDelay', 'LastTokenDelay', 'AvgTokenDelay'] - names_index = [4, 5, 6] - parts = ['max', 'P90', 'P75', 'P50', 'P25', 'min'] - parts_index = [num_datas - 1, int(num_datas * 0.90), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - for name, name_idx in zip(names, names_index): - def f(name_idx=name_idx): - return name_idx - datas.sort(key=lambda x: x[f()]) - logging.info("==========================") - for p, i in zip(parts, parts_index): - data = datas[i] - # i.e., LastTokenDelay P90: 270.000 ms (wav_id: BAC009S0902W0144) - logging.info("{} {}: {:.3f} ms (wav_id: {})".format( - name, p, data[f()], datas[i][0])) - - font = fm.FontProperties(fname=args.font) - plt.rcParams['axes.unicode_minus'] = False - # we will have 2 sub-plots (force-align + streaming timestamps) - # plus one wav-plot - fig, axes = plt.subplots(figsize=(60, 60), nrows=3, ncols=1) - for j in range(2): - if j == 0: - # subplot-0: streaming_timestamps - plt_prefix = args.tag + "_" + name + "_" + p - x = np.arange(len(data[7][0])) * subsampling - hyps, scores = data[7][0], data[7][1] - else: - # subplot-1: force_alignments - plt_prefix = "force_alignment" - x = np.arange(len(data[8])) - hyps = [symbol_table[d] for d in data[8]] - scores = [0.0] * len(data[8]) - axes[j].set_title(plt_prefix, fontsize=30) - for frame, token, prob in zip(x, hyps, scores): - if char_dict[token] != '': - axes[j].bar( - frame, np.exp(prob), - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].text( - frame, np.exp(prob), - '{} {:.3f} {}'.format( - char_dict[token], np.exp(prob), frame), - fontdict=dict(fontsize=24), - fontproperties=font, - ) - else: - axes[j].bar( - frame, 0.01, - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].tick_params(labelsize=25) - - # subplot-2: wav - # wav, hardcode sample_rate to 16000 - samples, sr = librosa.load(data[7][2], sr=16000) - time = np.arange(0, len(samples)) * (1.0 / sr) - axes[-1].plot(time, samples) - - # i.e., RESULT_DIR/LTD_P90_120ms_BAC009S0768W0342.pdf - plt.savefig(args.result_dir + "/" + name + "_" + - p + "_" + str(data[f()]) 
+ "ms" + "_" + data[0] + ".pdf") - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/make_raw_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/make_raw_list.py deleted file mode 100644 index 2f84f015542bb38da027b8ea61e8638f873cec33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/make_raw_list.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('output_file', help='output list file') - args = parser.parse_args() - - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - if args.segments is not None: - segments_table = {} - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - with open(args.text_file, 'r', encoding='utf8') as fin, \ - open(args.output_file, 'w', encoding='utf8') as fout: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if args.segments is None: - assert key in wav_table - wav = wav_table[key] - line = dict(key=key, wav=wav, txt=txt) - else: - assert key in segments_table - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - line = dict(key=key, wav=wav, txt=txt, start=start, end=end) - json_line = json.dumps(line, ensure_ascii=False) - fout.write(json_line + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/make_shard_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/make_shard_list.py deleted file mode 100644 index 1f7d82829808c9cc181bbc5e0f60cccef8795bae..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/make_shard_list.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import io -import logging -import os -import tarfile -import time -import multiprocessing - -import torch -import torchaudio -import torchaudio.backend.sox_io_backend as sox - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def write_tar_file(data_list, - no_segments, - tar_file, - resample=16000, - index=0, - total=1): - logging.info('Processing {} {}/{}'.format(tar_file, index, total)) - read_time = 0.0 - save_time = 0.0 - write_time = 0.0 - with tarfile.open(tar_file, "w") as tar: - prev_wav = None - for item in data_list: - if no_segments: - key, txt, wav = item - else: - key, txt, wav, start, end = item - - suffix = wav.split('.')[-1] - assert suffix in AUDIO_FORMAT_SETS - if no_segments: - ts = time.time() - with open(wav, 'rb') as fin: - data = fin.read() - read_time += (time.time() - ts) - else: - if wav != prev_wav: - ts = time.time() - waveforms, sample_rate = sox.load(wav, normalize=False) - read_time += (time.time() - ts) - prev_wav = wav - start = int(start * sample_rate) - end = int(end * sample_rate) - audio = waveforms[:1, start:end] - - # resample - if sample_rate != resample: - if not audio.is_floating_point(): - # normalize the audio before resample - # because resample can't process int audio - audio = audio / (1 << 15) - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - audio = (audio * (1 << 15)).short() - else: - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - - ts = time.time() - f = io.BytesIO() - sox.save(f, audio, resample, format="wav", bits_per_sample=16) - # Save to wav for segments file - suffix = "wav" - f.seek(0) - data = f.read() - save_time += (time.time() - ts) - - assert isinstance(txt, str) - ts = time.time() - txt_file = key + '.txt' - txt = txt.encode('utf8') - txt_data = io.BytesIO(txt) - txt_info = tarfile.TarInfo(txt_file) - txt_info.size = len(txt) - tar.addfile(txt_info, txt_data) - - wav_file = key + '.' 
+ suffix - wav_data = io.BytesIO(data) - wav_info = tarfile.TarInfo(wav_file) - wav_info.size = len(data) - tar.addfile(wav_info, wav_data) - write_time += (time.time() - ts) - logging.info('read {} save {} write {}'.format(read_time, save_time, - write_time)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--num_utts_per_shard', - type=int, - default=1000, - help='num utts per shard') - parser.add_argument('--num_threads', - type=int, - default=1, - help='num threads for make shards') - parser.add_argument('--prefix', - default='shards', - help='prefix of shards tar file') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('--resample', - type=int, - default=16000, - help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('shards_dir', help='output shards dir') - parser.add_argument('shards_list', help='output shards list file') - args = parser.parse_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - torch.set_num_threads(1) - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - no_segments = True - segments_table = {} - if args.segments is not None: - no_segments = False - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - data = [] - with open(args.text_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if no_segments: - assert key in wav_table - wav = wav_table[key] - data.append((key, txt, wav)) - else: - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - data.append((key, txt, wav, start, end)) - - num = args.num_utts_per_shard - chunks = [data[i:i + num] for i in range(0, len(data), num)] - os.makedirs(args.shards_dir, exist_ok=True) - - # Using thread pool to speedup - pool = multiprocessing.Pool(processes=args.num_threads) - shards_list = [] - tasks_list = [] - num_chunks = len(chunks) - for i, chunk in enumerate(chunks): - tar_file = os.path.join(args.shards_dir, - '{}_{:09d}.tar'.format(args.prefix, i)) - shards_list.append(tar_file) - pool.apply_async( - write_tar_file, - (chunk, no_segments, tar_file, args.resample, i, num_chunks)) - - pool.close() - pool.join() - - with open(args.shards_list, 'w', encoding='utf8') as fout: - for name in shards_list: - fout.write(name + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/merge_scp2txt.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/merge_scp2txt.py deleted file mode 100644 index 51f1c42f272f0fd9fec0a7d69ee860d2f1eb6158..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/merge_scp2txt.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -from distutils.util import strtobool -from io import open -import logging -import sys - -PY2 = sys.version_info[0] == 2 -sys.stdin = codecs.getreader('utf-8')(sys.stdin if PY2 
else sys.stdin.buffer) -sys.stdout = codecs.getwriter('utf-8')( - sys.stdout if PY2 else sys.stdout.buffer) - - -# Special types: -def shape(x): - """Change str to List[int] - - >>> shape('3,5') - [3, 5] - >>> shape(' [3, 5] ') - [3, 5] - - """ - - # x: ' [3, 5] ' -> '3, 5' - x = x.strip() - if x[0] == '[': - x = x[1:] - if x[-1] == ']': - x = x[:-1] - - return list(map(int, x.split(','))) - - -def get_parser(): - parser = argparse.ArgumentParser( - description='Given each file paths with such format as ' - '::. type> can be omitted and the default ' - 'is "str". e.g. {} ' - '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' - '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' - '--output-scps text:data/text shape:data/utt2text_shape:shape ' - '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--input-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the inputs') - parser.add_argument('--output-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the outputs') - parser.add_argument('--scps', - type=str, - nargs='+', - default=[], - help='The files except for the input and outputs') - parser.add_argument('--verbose', - '-V', - default=1, - type=int, - help='Verbose option') - parser.add_argument('--allow-one-column', - type=strtobool, - default=False, - help='Allow one column in input scp files. ' - 'In this case, the value will be empty string.') - parser.add_argument('--out', - '-O', - type=str, - help='The output filename. ' - 'If omitted, then output to sys.stdout') - return parser - - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - args.scps = [args.scps] - - # logging info - logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - if args.verbose > 0: - logging.basicConfig(level=logging.INFO, format=logfmt) - else: - logging.basicConfig(level=logging.WARN, format=logfmt) - - inputs = {} - assert (len(args.input_scps) == 1) - for f in args.input_scps[0]: - arr = f.strip().split(':') - inputs[arr[0]] = arr[1] - assert ('feat' in inputs) - assert ('shape' in inputs) - - outputs = {} - assert (len(args.output_scps) == 1) - for f in args.output_scps[0]: - arr = f.strip().split(':') - outputs[arr[0]] = arr[1] - assert ('shape' in outputs) - assert ('text' in outputs) - assert ('token' in outputs) - assert ('tokenid' in outputs) - - files = [ - inputs['feat'], inputs['shape'], outputs['text'], outputs['token'], - outputs['tokenid'], outputs['shape'] - ] - fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape'] - fids = [open(f, 'r', encoding='utf-8') for f in files] - - if args.out is None: - out = sys.stdout - else: - out = open(args.out, 'w', encoding='utf-8') - done = False - while not done: - for i, fid in enumerate(fids): - line = fid.readline() - if line == '': - done = True - break - arr = line.strip().split() - content = ' '.join(arr[1:]) - if i == 0: - out.write('utt:{}'.format(arr[0])) - out.write('\t') - out.write('{}:{}'.format(fields[i], content)) - out.write('\n') - - for f in fids: - f.close() - if args.out is not None: - out.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/onnx2horizonbin.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/onnx2horizonbin.py deleted file mode 100644 index 
a94b647fb19d1446d4bc506c399c85677dddde9f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/onnx2horizonbin.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. specific decoding method: ctc_greedy_search -""" - -import argparse -import copy -import logging -import os -import sys -import random -import torch -import yaml -import numpy as np - -from torch.utils.data import DataLoader - -from wenet.utils.common import remove_duplicates_and_blank -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import to_numpy -from wenet.bin.export_onnx_bpu import export_encoder, export_ctc - - -try: - import hbdk # noqa: F401 - import horizon_nn # noqa: F401 - from horizon_tc_ui import HB_ONNXRuntime -except ImportError: - print('Please install hbdk,horizon_nn,horizon_tc_ui !') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def save_data(tensor, dirs, prefix): - if tensor.requires_grad: - data = tensor.detach().numpy().astype(np.float32) - else: - data = tensor.numpy().astype(np.float32) - os.makedirs(dirs, exist_ok=True) - data.tofile(dirs + "/" + prefix + ".bin") - - -def make_calibration_data(enc, args, conf): - conf['shuffle'] = True - logger.info(conf) - dataset = Dataset( - "shard", args.cali_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - cal_data_dir = os.path.join(args.output_dir, 
'cal_data_dir') - for batch_idx, batch in enumerate(dataloader): - if batch_idx >= args.max_samples: - break - if batch_idx % 100 == 0: - logger.info("processed {} samples.".format(batch_idx)) - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - - # Feed forward overlap input step by step - random_high = (num_frames - context) // stride - num_rand = random.randint(0, random_high) - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - if i == num_rand: - save_data(chunk, "{}/chunk".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_cache, "{}/att_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(cnn_cache, "{}/cnn_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_mask, "{}/att_mask".format(cal_data_dir), - prefix + "." + str(i)) - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - # NOTE(xcsong): It's fast to calibrate ctc.onnx, - # so it's okay to save all chunks - save_data(y, "{}/hidden".format(cal_data_dir), - prefix + "." 
+ str(i)) - - -def check_wer(enc, ctc, args, conf): - conf['shuffle'] = False - dataset = Dataset( - "shard", args.wer_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - char_dict = {v: k for k, v in args.symbol_table.items()} - eos = len(char_dict) - 1 - - enc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_encoder/encoder_quantized_model.onnx") - ctc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_ctc/ctc_quantized_model.onnx") - torch_file = open(args.output_dir + "/torch_text", 'w', encoding="utf-8") - onnx_file = open(args.output_dir + "/onnx_text", 'w', encoding="utf-8") - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - for batch_idx, batch in enumerate(dataloader): - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - - # Feed forward overlap input step by step - torch_out, onnx_out = [], [] - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - # Torch model - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - torch_out.append(ctc.forward(y).transpose(1, 3).squeeze(2)) - # Quantized onnx model - ort_inputs = { - 'chunk': to_numpy(chunk), 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': to_numpy(att_mask)} - ort_outs = enc_session.run_feature( - enc_session.output_names, ort_inputs, input_offset=0) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_y = ctc_session.run_feature( - ctc_session.output_names, {'hidden': ort_outs[0]}, input_offset=0) - onnx_out.append(torch.from_numpy( - np.squeeze(onnx_y[0].transpose(0, 3, 2, 1), axis=2))) - - def post_process(list_out, file_obj, keys): - probs = torch.cat(list_out, dim=1) - maxlen = probs.size(1) - topk_prob, topk_index = probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - hyps = 
[hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - for i, key in enumerate(keys): - content = '' - for w in hyps[i]: - if w == eos: - break - content += char_dict[w] - file_obj.write('{} {}\n'.format(key, content)) - return key, content - - if len(torch_out) > 0 and len(onnx_out) > 0: - key, content = post_process(torch_out, torch_file, keys) - logger.info('torch: {} {}'.format(key, content)) - key, content = post_process(onnx_out, onnx_file, keys) - logger.info('onnx : {} {}'.format(key, content)) - torch_file.close() - onnx_file.close() - - -def generate_config(enc_session, ctc_session, args): - template = """ -# 模型参数组 -model_parameters: - # 原始Onnx浮点模型文件 - onnx_model: '{}' - # 转换的目标AI芯片架构 - march: 'bernoulli2' - # 模型转换输出的用于上板执行的模型文件的名称前缀 - output_model_file_prefix: '{}' - # 模型转换输出的结果的存放目录 - working_dir: '{}' - # 指定转换后混合异构模型是否保留输出各层的中间结果的能力 - layer_out_dump: False - # 转换过程中日志生成级别 - log_level: 'debug' -# 输入信息参数组 -input_parameters: - # 原始浮点模型的输入节点名称 - input_name: '{}' - # 原始浮点模型的输入数据格式(数量/顺序与input_name一致) - input_type_train: '{}' - # 原始浮点模型的输入数据排布(数量/顺序与input_name一致) - input_layout_train: '{}' - # 原始浮点模型的输入数据尺寸 - input_shape: '{}' - # 网络实际执行时,输入给网络的batch_size 默认值为1 - # input_batch: 1 - # 在模型中添加的输入数据预处理方法 - norm_type: '{}' - # 预处理方法的图像减去的均值; 如果是通道均值,value之间必须用空格分隔 - # mean_value: '' - # 预处理方法的图像缩放比例,如果是通道缩放比例,value之间必须用空格分隔 - # scale_value: '' - # 转换后混合异构模型需要适配的输入数据格式(数量/顺序与input_name一致) - input_type_rt: '{}' - # 输入数据格式的特殊制式 - input_space_and_range: '' - # 转换后混合异构模型需要适配的输入数据排布(数量/顺序与input_name一致) - input_layout_rt: '{}' -# 校准参数组 -calibration_parameters: - # 模型校准使用的标定样本的存放目录 - cal_data_dir: '{}' - # 开启图片校准样本自动处理(skimage read resize到输入节点尺寸) - preprocess_on: False - # 校准使用的算法类型 - calibration_type: '{}' - # max 校准方式的参数 - max_percentile: 1.0 - # 强制指定OP在CPU上运行 - run_on_cpu: '{}' - # 强制指定OP在BPU上运行 - run_on_bpu: '{}' -# 编译参数组 -compiler_parameters: - # 编译策略选择 - compile_mode: 'latency' - # 是否打开编译的debug信息 - debug: False - # 模型运行核心数 - core_num: 1 - # 模型编译的优化等级选择 - optimize_level: 'O3' -""" - output_dir = os.path.realpath(args.output_dir) - cal_data_dir = os.path.join(output_dir, 'cal_data_dir') - os.makedirs(cal_data_dir, exist_ok=True) - enc_dic = enc_session.get_modelmeta().custom_metadata_map - enc_onnx_path = os.path.join(output_dir, 'encoder.onnx') - enc_log_path = os.path.join(output_dir, 'hb_makertbin_output_encoder') - enc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in enc_dic['input_name'].split(';')]) - ctc_dic = ctc_session.get_modelmeta().custom_metadata_map - ctc_onnx_path = os.path.join(output_dir, 'ctc.onnx') - ctc_log_path = os.path.join(output_dir, 'hb_makertbin_output_ctc') - ctc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in ctc_dic['input_name'].split(';')]) - enc_config = template.format( - enc_onnx_path, "encoder", enc_log_path, - enc_dic['input_name'], enc_dic['input_type'], - enc_dic['input_layout_train'], enc_dic['input_shape'], - enc_dic['norm_type'], enc_dic['input_type'], enc_dic['input_layout_rt'], - enc_cal_data, args.calibration_type, args.extra_ops_run_on_cpu, "") - ctc_config = template.format( - ctc_onnx_path, "ctc", ctc_log_path, - ctc_dic['input_name'], ctc_dic['input_type'], - ctc_dic['input_layout_train'], ctc_dic['input_shape'], - ctc_dic['norm_type'], ctc_dic['input_type'], ctc_dic['input_layout_rt'], - ctc_cal_data, "default", "", "") - with open(output_dir + "/config_encoder.yaml", "w") as enc_yaml: - enc_yaml.write(enc_config) - with open(output_dir + 
"/config_ctc.yaml", "w") as ctc_yaml: - ctc_yaml.write(ctc_config) - - -def get_args(): - parser = argparse.ArgumentParser(description='convert onnx to horizon .bin') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - parser.add_argument('--dict', type=str, required=True, help='dict file') - parser.add_argument('--max_samples', type=int, required=True, - help='maximum samples') - parser.add_argument('--cali_datalist', type=str, default=None, - help='make calibration data') - parser.add_argument('--wer_datalist', type=str, default=None, - help='check wer') - parser.add_argument('--wer_text', type=str, default=None, - help='check wer') - parser.add_argument('--bpe_model', default=None, type=str, - help='bpe model for english part') - parser.add_argument('--ln_run_on_bpu', action='store_true', - help='layernorm running on bpu') - parser.add_argument('--extra_ops_run_on_cpu', type=str, default=None, - help='extra operations running on cpu.') - parser.add_argument('--calibration_type', type=str, default='default', - help='kl / max / default.') - return parser - - -if __name__ == '__main__': - random.seed(777) - parser = get_args() - args = parser.parse_args() - # NOTE(xcsong): X3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(conf) - load_checkpoint(model, args.checkpoint) - model.eval() - - symbol_table = read_symbol_table(args.dict) - args.symbol_table = symbol_table - args.feature_size = conf['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - logger.info("Stage-1: Export onnx") - enc, enc_session = export_encoder(model, args) - ctc, ctc_session = export_ctc(model, args) - - conf = copy.deepcopy(conf['dataset_conf']) - conf['filter_conf']['max_length'] = 102400 - conf['filter_conf']['min_length'] = 0 - conf['filter_conf']['token_max_length'] = 102400 - conf['filter_conf']['token_min_length'] = 0 - conf['filter_conf']['max_output_input_ratio'] = 102400 - conf['filter_conf']['min_output_input_ratio'] = 0 - conf['speed_perturb'] = False - conf['spec_aug'] = False - conf['spec_sub'] = False - conf['spec_trim'] = False - conf['shuffle'] = False - conf['sort'] = False - if 'fbank_conf' in conf: - conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in conf: - conf['mfcc_conf']['dither'] = 0.0 - conf['batch_conf']['batch_type'] = "static" - conf['batch_conf']['batch_size'] = 1 - - if args.cali_datalist is not None: - logger.info("Stage-2: Generate config") - # FIXME(xcsong): Remove hard code - logger.info("torch version: {}".format(torch.__version__)) - if int(torch.__version__[:4].split('.')[1]) >= 13: - args.extra_ops_run_on_cpu = "/Split;" + \ - "/encoders.0/self_attn/Split;/encoders.1/self_attn/Split;" + \ - 
"/encoders.2/self_attn/Split;/encoders.3/self_attn/Split;" + \ - "/encoders.4/self_attn/Split;/encoders.5/self_attn/Split;" + \ - "/encoders.6/self_attn/Split;/encoders.7/self_attn/Split;" + \ - "/encoders.8/self_attn/Split;/encoders.9/self_attn/Split;" + \ - "/encoders.10/self_attn/Split;/encoders.11/self_attn/Split;" + \ - "/encoders.0/self_attn/Mul;/encoders.1/self_attn/Mul;" + \ - "/encoders.2/self_attn/Mul;/encoders.3/self_attn/Mul;" + \ - "/encoders.4/self_attn/Mul;/encoders.5/self_attn/Mul;" + \ - "/encoders.6/self_attn/Mul;/encoders.7/self_attn/Mul;" + \ - "/encoders.8/self_attn/Mul;/encoders.9/self_attn/Mul;" + \ - "/encoders.10/self_attn/Mul;/encoders.11/self_attn/Mul;" - else: - args.extra_ops_run_on_cpu = "Split_17;Split_67;Split_209;" + \ - "Split_351;Split_493;Split_635;Split_777;Split_919;Split_1061;" + \ - "Split_1203;Split_1345;Split_1487;Split_1629;" + \ - "Mul_72;Mul_214;Mul_356;Mul_498;Mul_640;Mul_782;" + \ - "Mul_924;Mul_1066;Mul_1208;Mul_1350;Mul_1492;Mul_1634;" - generate_config(enc_session, ctc_session, args) - - logger.info("Stage-3: Make calibration data") - make_calibration_data(enc, args, conf) - - output_dir = os.path.realpath(args.output_dir) - logger.info("Stage-4: Make ctc.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_ctc".format(output_dir) + - " && cd hb_makertbin_log_ctc &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_ctc.yaml") - ) - logger.info("Stage-5: Make encoder.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_encoder ".format(output_dir) + - " && cd hb_makertbin_log_encoder &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_encoder.yaml") - ) - - if args.wer_datalist is not None: - logger.info("Stage-6: Check wer between torch model and quantized onnx") - assert args.wer_text is not None - check_wer(enc, ctc, args, conf) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/torch_text", - args.output_dir + "/torch_wer") - ) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/onnx_text", - args.output_dir + "/onnx_wer") - ) - os.system("tail {} {}".format( - args.output_dir + "/torch_wer", args.output_dir + "/onnx_wer")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/parse_options.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/parse_options.sh deleted file mode 100644 index 34476fdb37a4b14d5fe6e0edbebe97e760d2be5a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- - -# Parse command-line options. -# To be sourced by another script (as in ". parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/perturb_data_dir_speed.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/perturb_data_dir_speed.sh deleted file mode 100644 index 901a4882e6481ae269067b0fe7175dba62c4db9e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/perturb_data_dir_speed.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# 2020 @kamo-naoyuki -# This file was copied from Kaldi and -# I deleted parts related to wav duration -# because we shouldn't use kaldi's command here -# and we don't need the files actually. 
- -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 2014 Tom Ko -# 2018 Emotech LTD (author: Pawel Swietojanski) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# wav.scp -# spk2utt -# utt2spk -# text -# -# It generates the files which are used for perturbing the speed of the original data. - -export LC_ALL=C -set -euo pipefail - -if [[ $# != 3 ]]; then - echo "Usage: perturb_data_dir_speed.sh " - echo "e.g.:" - echo " $0 0.9 data/train_si284 data/train_si284p" - exit 1 -fi - -factor=$1 -srcdir=$2 -destdir=$3 -label="sp" -spk_prefix="${label}${factor}-" -utt_prefix="${label}${factor}-" - -#check is sox on the path - -! command -v sox &>/dev/null && echo "sox: command not found" && exit 1; - -if [[ ! -f ${srcdir}/utt2spk ]]; then - echo "$0: no such file ${srcdir}/utt2spk" - exit 1; -fi - -if [[ ${destdir} == "${srcdir}" ]]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -mkdir -p "${destdir}" - -<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map" -<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map" -<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map" -if [[ ! -f ${srcdir}/utt2uniq ]]; then - <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq" -else - <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq" -fi - - -<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \ - utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk - -utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt - -if [[ -f ${srcdir}/segments ]]; then - - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \ - utils/apply_map.pl -f 2 "${destdir}"/reco_map | \ - awk -v factor="${factor}" \ - '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \ - >"${destdir}"/segments - - utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - if [[ -f ${srcdir}/reco2file_and_channel ]]; then - utils/apply_map.pl -f 1 "${destdir}"/reco_map \ - <"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel - fi - -else # no segments->wav indexed by utterance. 
- if [[ -f ${srcdir}/wav.scp ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - fi -fi - -if [[ -f ${srcdir}/text ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text -fi -if [[ -f ${srcdir}/spk2gender ]]; then - utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender -fi -if [[ -f ${srcdir}/utt2lang ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang -fi - -rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null -echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}" - -utils/validate_data_dir.sh --no-feats --no-text "${destdir}" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/reduce_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/reduce_data_dir.sh deleted file mode 100644 index 16194dcc7309a646041181a698c53cd4f46e618b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/reduce_data_dir.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# koried, 10/29/2012 - -# Reduce a data set based on a list of turn-ids - -help_message="usage: $0 srcdir turnlist destdir" - -if [ $1 == "--help" ]; then - echo "${help_message}" - exit 0; -fi - -if [ $# != 3 ]; then - echo "${help_message}" - exit 1; -fi - -srcdir=$1 -reclist=$2 -destdir=$3 - -if [ ! -f ${srcdir}/utt2spk ]; then -echo "$0: no such file $srcdir/utt2spk" -exit 1; -fi - -function do_filtering { -# assumes the utt2spk and spk2utt files already exist. - [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp - [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text - [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames - [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender - [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp - if [ -f ${srcdir}/segments ]; then - utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments - awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. 
- [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/reco2file_and_channel ] && \ - utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel - - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm - rm ${destdir}/reco - fi - srcutts=$(wc -l < ${srcdir}/utt2spk) - destutts=$(wc -l < ${destdir}/utt2spk) - echo "Reduced #utt from $srcutts to $destutts" -} - -mkdir -p ${destdir} - -# filter the utt2spk based on the set of recordings -utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk - -utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt -do_filtering; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/remove_longshortdata.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/remove_longshortdata.py deleted file mode 100644 index 7e92f8a424d2d717acf6fc1db5503f79ba38a898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/remove_longshortdata.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='remove too long or too short data in format.data') - parser.add_argument('--data_file', - type=str, - help='input format data') - parser.add_argument('--output_data_file', - type=str, - help='output format data') - parser.add_argument('--min_input_len', type=float, - default=0, - help='minimum input seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--max_input_len', type=float, - default=20, - help='maximum output seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--min_output_len', type=float, - default=0, help='minimum input seq length, in modeling units') - parser.add_argument('--max_output_len', type=float, - default=500, - help='maximum output seq length, in modeling units') - parser.add_argument('--min_output_input_ratio', type=float, default=0.05, - help='minimum output seq length/output seq length ratio') - parser.add_argument('--max_output_input_ratio', type=float, default=10, - help='maximum output seq length/output seq length ratio') - args = parser.parse_args() - - data_file = args.data_file - output_data_file = args.output_data_file - min_input_len = args.min_input_len - max_input_len = args.max_input_len - min_output_len = args.min_output_len - max_output_len = args.max_output_len - min_output_input_ratio = args.min_output_input_ratio - max_output_input_ratio = args.max_output_input_ratio - - with open(data_file, 'r') as f, open(output_data_file, 'w') as fout: - for l in f: - l = l.strip() - if l: - items = l.strip().split('\t') - token_shape = items[6] - feature_shape = items[2] - feat_len = float(feature_shape.split(':')[1].split(',')[0]) - token_len = float(token_shape.split(':')[1].split(',')[0]) - condition = [feat_len > min_input_len, - feat_len < max_input_len, - token_len > min_output_len, - token_len < max_output_len, - token_len / feat_len > min_output_input_ratio, - token_len / feat_len < max_output_input_ratio, - ] - if all(condition): - fout.write('{}\n'.format(l)) - continue diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/segment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/segment.py deleted file mode 100644 index a1a7f93a05fbaf42ca09c26c0e5be6a7185f0d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/segment.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2021 Mobvoi Inc. (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='generate segmented wav.scp') - parser.add_argument('--segments', required=True, help='segments file') - parser.add_argument('--input', - required=True, - help='origin wav.scp that not segmented') - parser.add_argument('--output', - required=True, - help='output segmented wav.scp') - wav_dic = {} - args = parser.parse_args() - ori_wav = args.input - segment_file = args.segments - wav_scp = args.output - with open(ori_wav, 'r') as ori: - for l in ori: - item = l.strip().split() - wav_dic[item[0]] = item[1] - with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement: - for l in sgement: - item = l.strip().split() - if item[1] in wav_dic: - item[1] = wav_dic[item[1]] - f.write("{} {},{},{}\n".format(item[0], item[1], item[2], item[3])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/setup_anaconda.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/setup_anaconda.sh deleted file mode 100644 index f53ace9cc4c19994fc79d01e85d70f49d40d673f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/setup_anaconda.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env bash -# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet) -set -euo pipefail - -if [ -z "${PS1:-}" ]; then - PS1=__dummy__ -fi -CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - -if [ $# -gt 4 ]; then - echo "Usage: $0 [output] [conda-env-name] [python-version>]" - exit 1; -elif [ $# -eq 3 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="$3" -elif [ $# -eq 2 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="" -elif [ $# -eq 1 ]; then - output_dir="$1" - name="" - PYTHON_VERSION="" -elif [ $# -eq 0 ]; then - output_dir=venv - name="" - PYTHON_VERSION="" -fi - -if [ -e activate_python.sh ]; then - echo "Warning: activate_python.sh already exists. It will be overwritten" -fi - -if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then - if [ ! -e miniconda.sh ]; then - wget --tries=3 "${CONDA_URL}" -O miniconda.sh - fi - - bash miniconda.sh -b -p "${output_dir}" -fi - -# shellcheck disable=SC1090 -source "${output_dir}/etc/profile.d/conda.sh" -conda deactivate - -# If the env already exists, skip recreation -if [ -n "${name}" ] && ! 
conda activate ${name}; then - conda create -yn "${name}" -fi -conda activate ${name} - -if [ -n "${PYTHON_VERSION}" ]; then - conda install -y conda "python=${PYTHON_VERSION}" -else - conda install -y conda -fi - -conda install -y pip setuptools - -cat << EOF > activate_python.sh -#!/usr/bin/env bash -# THIS FILE IS GENERATED BY tools/setup_anaconda.sh -if [ -z "\${PS1:-}" ]; then - PS1=__dummy__ -fi -. $(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name} -EOF diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/sph2wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/sph2wav.sh deleted file mode 100644 index a8f0749e3be2ee69b5831da6699c303510ecbed4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/sph2wav.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -# convert sph scp to segmented wav scp -nj=1 -. tools/parse_options.sh || exit 1; - -inscp=$1 -segments=$2 -outscp=$3 -data=$(dirname ${inscp}) -if [ $# -eq 4 ]; then - logdir=$4 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe -[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; -sox=`which sox` -[ ! 
-x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2); - printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \ - sort > $data/wav_ori.scp || exit 1; - -tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp -sed -i 's/ /,/g' $data/wav_segments.scp -sed -i 's/#/ /g' $data/wav_segments.scp - -rm -f $logdir/wav_*.slice -rm -f $logdir/*.log -split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - mkdir -p ${data}/wavs/${name} - cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \ - -v logdir=$logdir -v name=$name '{ - during=$4-$3 - cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during; - system(cmd) - printf("%s %s/%s.wav\n", $1, data, $1); - }' | \ - sort > ${data}/wavs_${name}.scp || exit 1; -} & -done -wait -cat ${data}/wavs_*.scp > $outscp -rm ${data}/wavs_*.scp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/spk2utt_to_utt2spk.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/spk2utt_to_utt2spk.pl deleted file mode 100644 index 19fb89d501146e360912863d847d6eabb0194511..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/spm_decode b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/spm_decode deleted file mode 100644 index 882b4f966013d7708460f8d41696583ae59f8fa9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/spm_decode +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for decoding") - parser.add_argument("--input", default=None, help="input file to decode") - parser.add_argument("--input_format", choices=["piece", "id"], default="piece") - args = parser.parse_args() - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.input_format == "piece": - def decode(l): - return "".join(sp.DecodePieces(l)) - elif args.input_format == "id": - def decode(l): - return "".join(sp.DecodeIds(l)) - else: - raise NotImplementedError - - def tok2int(tok): - # remap reference-side (represented as <>) to 0 - return int(tok) if tok != "<>" else 0 - - if args.input is None: - h = sys.stdin - else: - h = open(args.input, "r", encoding="utf-8") - for line in h: - print(decode(line.split())) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/spm_encode b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/spm_encode deleted file mode 100644 index 4dd2e1004f9fe393c2d34b43bade881b84a31b1f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/spm_encode +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import contextlib -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for encoding") - parser.add_argument("--inputs", nargs="+", default=['-'], - help="input files to filter/encode") - parser.add_argument("--outputs", nargs="+", default=['-'], - help="path to save encoded outputs") - parser.add_argument("--output_format", choices=["piece", "id"], default="piece") - parser.add_argument("--min-len", type=int, metavar="N", - help="filter sentence pairs with fewer than N tokens") - parser.add_argument("--max-len", type=int, metavar="N", - help="filter sentence pairs with more than N tokens") - args = parser.parse_args() - - assert len(args.inputs) == len(args.outputs), \ - "number of input and output paths should match" - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.output_format == "piece": - def encode(l): - return sp.EncodeAsPieces(l) - elif args.output_format == "id": - def encode(l): - return list(map(str, sp.EncodeAsIds(l))) - else: - raise NotImplementedError - - if args.min_len is not None or args.max_len is not None: - def valid(line): - return ( - (args.min_len is None or len(line) >= args.min_len) and - (args.max_len is None or len(line) <= args.max_len) - ) - else: - def valid(lines): - return True - - with contextlib.ExitStack() as stack: - inputs = [ - stack.enter_context(open(input, "r", encoding="utf-8")) - if input != "-" else sys.stdin - for input in args.inputs - ] - outputs = [ - stack.enter_context(open(output, "w", 
encoding="utf-8")) - if output != "-" else sys.stdout - for output in args.outputs - ] - - stats = { - "num_empty": 0, - "num_filtered": 0, - } - - def encode_line(line): - line = line.strip() - if len(line) > 0: - line = encode(line) - if valid(line): - return line - else: - stats["num_filtered"] += 1 - else: - stats["num_empty"] += 1 - return None - - for i, lines in enumerate(zip(*inputs), start=1): - enc_lines = list(map(encode_line, lines)) - if not any(enc_line is None for enc_line in enc_lines): - for enc_line, output_h in zip(enc_lines, outputs): - print(" ".join(enc_line), file=output_h) - if i % 10000 == 0: - print("processed {} lines".format(i), file=sys.stderr) - - print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) - print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/spm_train b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/spm_train deleted file mode 100644 index 0b247aee0dc5fcaa7b6cf66d89602e896619c9bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/spm_train +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE -import sys - -import sentencepiece as spm - - -if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/subset_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/subset_data_dir.sh deleted file mode 100644 index c35bee62d8710facb8c42a9171ed3caf0171450f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/subset_data_dir.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2010-2011 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - - -# This script operates on a data directory, such as in data/train/. -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data -# for what these directories contain. - -# This script creates a subset of that data, consisting of some specified -# number of utterances. (The selected utterances are distributed evenly -# throughout the file, by the program ./subset_scp.pl). - -# There are six options, none compatible with any other. - -# If you give the --per-spk option, it will attempt to select the supplied -# number of utterances for each speaker (typically you would supply a much -# smaller number in this case). - -# If you give the --speakers option, it selects a subset of n randomly -# selected speakers. - -# If you give the --shortest option, it will give you the n shortest utterances. - -# If you give the --first option, it will just give you the n first utterances. - -# If you give the --last option, it will just give you the n last utterances. - -# If you give the --spk-list or --utt-list option, it reads the -# speakers/utterances to keep from /" (note, -# in this case there is no positional parameter; see usage message.) 
- - -shortest=false -perspk=false -speakers=false -first_opt= -spk_list= -utt_list= - -expect_args=3 -case $1 in - --first|--last) first_opt=$1; shift ;; - --per-spk) perspk=true; shift ;; - --shortest) shortest=true; shift ;; - --speakers) speakers=true; shift ;; - --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; - --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; - --*) echo "$0: invalid option '$1'"; exit 1 -esac - -if [ $# != $expect_args ]; then - echo "Usage:" - echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] " - echo " subset_data_dir.sh [--spk-list ] " - echo " subset_data_dir.sh [--utt-list ] " - echo "By default, randomly selects utterances from the data directory." - echo "With --speakers, randomly selects enough speakers that we have utterances" - echo "With --per-spk, selects utterances per speaker, if available." - echo "With --first, selects the first utterances" - echo "With --last, selects the last utterances" - echo "With --shortest, selects the shortest utterances." - echo "With --spk-list, reads the speakers to keep from " - echo "With --utt-list, reads the utterances to keep from " - exit 1; -fi - -srcdir=$1 -if [[ $spk_list || $utt_list ]]; then - numutt= - destdir=$2 -else - numutt=$2 - destdir=$3 -fi - -export LC_ALL=C - -if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" - exit 1 -fi - -if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then - echo "$0: cannot subset to more utterances than you originally had." - exit 1 -fi - -if $shortest && [ ! -f $srcdir/feats.scp ]; then - echo "$0: you selected --shortest but no feats.scp exist." - exit 1 -fi - -mkdir -p $destdir || exit 1 - -if [[ $spk_list ]]; then - tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; -elif [[ $utt_list ]]; then - tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; -elif $speakers; then - tools/shuffle_list.pl < $srcdir/spk2utt | - awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | - sort > $destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -elif $perspk; then - awk '{ n='$numutt'; printf("%s ",$1); - skip=1; while(n*(skip+1) <= NF-1) { skip++; } - for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); } - printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -else - if $shortest; then - # Select $numutt shortest utterances. - . ./path.sh - feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; - sort -n -k2 $destdir/tmp.len | - awk '{print $1}' | - head -$numutt >$destdir/tmp.uttlist - tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk - rm $destdir/tmp.uttlist $destdir/tmp.len - else - # Select $numutt random utterances. - tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; - fi - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt -fi - -# Perform filtering. utt2spk and spk2utt files already exist by this point. -# Filter by utterance. 
-[ -f $srcdir/feats.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp -[ -f $srcdir/vad.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp -[ -f $srcdir/utt2lang ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang -[ -f $srcdir/utt2dur ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur -[ -f $srcdir/utt2num_frames ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames -[ -f $srcdir/utt2uniq ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq -[ -f $srcdir/wav.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp -[ -f $srcdir/utt2warp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp -[ -f $srcdir/text ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - -# Filter by speaker. -[ -f $srcdir/spk2warp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp -[ -f $srcdir/spk2gender ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender -[ -f $srcdir/cmvn.scp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - -# Filter by recording-id. -if [ -f $srcdir/segments ]; then - tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - # Recording-ids are in segments. - awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco - # The next line overrides the command above for wav.scp, which would be incorrect. - #[ -f $srcdir/wav.scp ] && - # tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp -else - # No segments; recording-ids are in wav.scp. - awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco -fi - -[ -f $srcdir/reco2file_and_channel ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel -[ -f $srcdir/reco2dur ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur - -# Filter the STM file for proper sclite scoring. -# Copy over the comments from STM file. -[ -f $srcdir/stm ] && - (grep "^;;" $srcdir/stm - tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm - -rm $destdir/reco - -# Copy frame_shift if present. -[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir - -srcutts=$(wc -l <$srcdir/utt2spk) -destutts=$(wc -l <$destdir/utt2spk) -echo "$0: reducing #utt from $srcutts to $destutts" -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/subset_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/subset_scp.pl deleted file mode 100644 index 11fddc09a0f4e5fad8e5d63cf65e7e5e627e4af6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/subset_scp.pl +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This program selects a subset of N elements in the scp. - -# By default, it selects them evenly from throughout the scp, in order to avoid -# selecting too many from the same speaker. It prints them on the standard -# output. -# With the option --first, it just selects the N first utterances. -# With the option --last, it just selects the N last utterances. - -# Last modified by JHU & HKUST @2013 - - -$quiet = 0; -$first = 0; -$last = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--quiet") { - shift; - $quiet = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--first") { - shift; - $first = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--last") { - shift; - $last = 1; -} - -if(@ARGV < 2 ) { - die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . - " --quiet causes it to not die if N < num lines in scp.\n" . - " --first and --last make it equivalent to head or tail.\n" . - "See also: filter_scp.pl\n"; -} - -$N = shift @ARGV; -if($N == 0) { - die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; -} -$inscp = shift @ARGV; -open(I, "<$inscp") || die "Opening input scp file $inscp"; - -@F = (); -while() { - push @F, $_; -} -$numlines = @F; -if($N > $numlines) { - if ($quiet) { - $N = $numlines; - } else { - die "You requested from subset_scp.pl more elements than available: $N > $numlines"; - } -} - -sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if ($num_needed > $diff) { - die "select_n: code error"; - } - if ($diff == 1 ) { - if ($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); - } -} - -if ( ! $first && ! $last) { - if ($N > 0) { - select_n(0, $numlines, $N); - } -} else { - if ($first) { # --first option: same as head. - for ($n = 0; $n < $N; $n++) { - print $F[$n]; - } - } else { # --last option: same as tail. - for ($n = @F - $N; $n < @F; $n++) { - print $F[$n]; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/sym2int.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/sym2int.pl deleted file mode 100644 index cec097b6bdaefb5c3452e31fa334f0a7530b9a72..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/sym2int.pl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; - -for($x = 0; $x < 2; $x++) { - if ($ARGV[0] eq "--map-oov") { - shift @ARGV; - $map_oov = shift @ARGV; - if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { - # disallow '-f', the empty string and anything ending in words.txt as the - # OOV symbol because these are likely command-line errors. - die "the --map-oov option requires an argument"; - } - } - if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } - } -} - -$symtab = shift @ARGV; -if (!defined $symtab) { - print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . - "options: [--map-oov ] [-f ]\n" . - "note: can look like 4-5, or 4-, or 5-, or 1.\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up - if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } - $map_oov = $sym2int{$map_oov}; -} - -$num_warning = 0; -$max_warning = 20; - -while (<>) { - @A = split(" ", $_); - @B = (); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ( (!defined $field_begin || $n >= $field_begin) - && (!defined $field_end || $n <= $field_end)) { - $i = $sym2int{$a}; - if (!defined ($i)) { - if (defined $map_oov) { - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $map_oov; - } else { - $pos = $n+1; - die "sym2int.pl: undefined symbol $a (in position $pos)\n"; - } - } - $a = $i; - } - push @B, $a; - } - print join(" ", @B); - print "\n"; -} -if ($num_warning > 0) { - print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; -} - -exit(0); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/text2token.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/text2token.py deleted file mode 100644 index 4f4dcc901d436650695f0b80e0cf99e1e99269ee..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/text2token.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan) -# Copyright 2021 Mobvoi Inc. All Rights Reserved. 
(Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -import re -import sys - -is_python2 = sys.version_info[0] == 2 - - -def exist_or_not(i, match_pos): - start_pos = None - end_pos = None - for pos in match_pos: - if pos[0] <= i < pos[1]: - start_pos = pos[0] - end_pos = pos[1] - break - - return start_pos, end_pos - -def seg_char(sent): - pattern = re.compile(r'([\u4e00-\u9fa5])') - chars = pattern.split(sent) - chars = [w for w in chars if len(w.strip()) > 0] - return chars - -def get_parser(): - parser = argparse.ArgumentParser( - description='convert raw text to tokenized text', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--nchar', - '-n', - default=1, - type=int, - help='number of characters to split, i.e., \ - aabb -> a a b b with -n 1 and aa bb with -n 2') - parser.add_argument('--skip-ncols', - '-s', - default=0, - type=int, - help='skip first n columns') - parser.add_argument('--space', - default='', - type=str, - help='space symbol') - parser.add_argument('--bpe-model', - '-m', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--non-lang-syms', - '-l', - default=None, - type=str, - help='list of non-linguistic symobles,' - ' e.g., etc.') - parser.add_argument('text', - type=str, - default=False, - nargs='?', - help='input text') - parser.add_argument('--trans_type', - '-t', - type=str, - default="char", - choices=["char", "phn", "cn_char_en_bpe"], - help="""Transcript type. char/phn. e.g., for TIMIT - FADG0_SI1279 - - If trans_type is char, read from - SI1279.WRD file -> "bricks are an alternative" - Else if trans_type is phn, - read from SI1279.PHN file -> - "sil b r ih sil k s aa r er n aa l - sil t er n ih sil t ih v sil" """) - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - rs = [] - if args.non_lang_syms is not None: - with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f: - nls = [x.rstrip() for x in f.readlines()] - rs = [re.compile(re.escape(x)) for x in nls] - - if args.bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(args.bpe_model) - - if args.text: - f = codecs.open(args.text, encoding="utf-8") - else: - f = codecs.getreader("utf-8")( - sys.stdin if is_python2 else sys.stdin.buffer) - - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer) - line = f.readline() - n = args.nchar - while line: - x = line.split() - print(' '.join(x[:args.skip_ncols]), end=" ") - a = ' '.join(x[args.skip_ncols:]) - - # get all matched positions - match_pos = [] - for r in rs: - i = 0 - while i >= 0: - m = r.search(a, i) - if m: - match_pos.append([m.start(), m.end()]) - i = m.end() - else: - break - - if len(match_pos) > 0: - chars = [] - i = 0 - while i < len(a): - start_pos, end_pos = exist_or_not(i, match_pos) - if start_pos is not None: - chars.append(a[start_pos:end_pos]) - i = end_pos - else: - chars.append(a[i]) - i += 1 - a = chars - - if (args.trans_type == "phn"): - a = a.split(" ") - elif args.trans_type == "cn_char_en_bpe": - b = seg_char(a) - a = [] - for j in b: - # we use "▁" to instead of blanks among english words - # warning: here is "▁", not "_" - for l in j.strip().split("▁"): - if not l.encode('UTF-8').isalpha(): - a.append(l) - else: - for k in sp.encode_as_pieces(l): - a.append(k) - else: - a = [a[j:j + n] for j in range(0, 
len(a), n)] - - a_flat = [] - for z in a: - a_flat.append("".join(z)) - - a_chars = [z.replace(' ', args.space) for z in a_flat] - if (args.trans_type == "phn"): - a_chars = [z.replace("sil", args.space) for z in a_chars] - print(' '.join(a_chars)) - line = f.readline() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/utt2spk_to_spk2utt.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/utt2spk_to_spk2utt.pl deleted file mode 100644 index 5086699ff85fdcb8667bb9ab054700c53e35fd0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. - -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/validate_data_dir.sh deleted file mode 100644 index f4b4cbe1410111555d56380078e3d55381e7155a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/validate_data_dir.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/bin/bash - -cmd="$@" - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." - echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! 
-d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -if [ -f $data/images.scp ]; then - cmd=${cmd/--no-wav/} # remove --no-wav if supplied - image/validate_data_dir.sh $cmd - exit $? -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1; - ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - tools/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." 
- exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! 
cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! 
cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/validate_dict_dir.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/validate_dict_dir.pl deleted file mode 100644 index 819fca7f03caff91f3f24f0b69876a0bfc0abbe9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/validate_dict_dir.pl +++ /dev/null @@ -1,531 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. -# Copyright 2012 Guoguo Chen -# 2015 Daniel Povey -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for 'dict' directories (e.g. data/local/dict) - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line($.) 
$raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n"; - if ($has_invalid_whitespaces) { - print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n"; - return 0; - } else { - print "--> text contains only allowed whitespaces\n"; - } - } else { - print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n"; - } - return 1; -} - - -if(@ARGV != 1) { - die "Usage: validate_dict_dir.pl \n" . - "e.g.: validate_dict_dir.pl data/local/dict\n"; -} - -$dict = shift @ARGV; -$dict =~ s:/$::; - -$exit = 0; -$success = 1; # this is re-set each time we read a file. - -sub set_to_fail { $exit = 1; $success = 0; } - -# Checking silence_phones.txt ------------------------------- -print "Checking $dict/silence_phones.txt ...\n"; -if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} -$idx = 1; -%silence = (); -$crlf = 1; - -print "--> reading $dict/silence_phones.txt\n"; -check_allowed_whitespace(\*S) || set_to_fail(); -while() { - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; - } - foreach(0 .. 
@col-1) { - my $p = $col[$_]; - if($silence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; - } else { - $silence{$p} = 1; - } - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. - if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(S); -$success == 0 || print "--> $dict/silence_phones.txt is OK\n"; -print "\n"; - -# Checking optional_silence.txt ------------------------------- -print "Checking $dict/optional_silence.txt ...\n"; -if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;} -if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;} -$idx = 1; -$success = 1; -$crlf = 1; -print "--> reading $dict/optional_silence.txt\n"; -check_allowed_whitespace(\*OS) or exit 1; -while() { - chomp; - my @col = split(" ", $_); - if ($idx > 1 or @col > 1) { - set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; - } elsif (!$silence{$col[0]}) { - set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - $idx ++; -} -close(OS); -$success == 0 || print "--> $dict/optional_silence.txt is OK\n"; -print "\n"; - -# Checking nonsilence_phones.txt ------------------------------- -print "Checking $dict/nonsilence_phones.txt ...\n"; -if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;} -$idx = 1; -%nonsilence = (); -$success = 1; -$crlf = 1; -print "--> reading $dict/nonsilence_phones.txt\n"; -check_allowed_whitespace(\*NS) or set_to_fail(); -while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($nonsilence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; - } else { - $nonsilence{$p} = 1; - } - # phones that start with the pound sign/hash may be mistaken for - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. 
- if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(NS); -$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n"; -print "\n"; - -# Checking disjoint ------------------------------- -sub intersect { - my ($a, $b) = @_; - @itset = (); - %itset = (); - foreach(keys %$a) { - if(exists $b->{$_} and !$itset{$_}) { - push(@itset, $_); - $itset{$_} = 1; - } - } - return @itset; -} - -print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n"; -@itset = intersect(\%silence, \%nonsilence); -if(@itset == 0) {print "--> disjoint property is OK.\n";} -else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";} -print "\n"; - - -sub check_lexicon { - my ($lex, $num_prob_cols, $num_skipped_cols) = @_; - print "Checking $lex\n"; - !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail(); - my %seen_line = {}; - $idx = 1; $success = 1; $crlf = 1; - print "--> reading $lex\n"; - check_allowed_whitespace(\*L) or set_to_fail(); - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $lex contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (defined $seen_line{$_}) { - print "--> ERROR: line '$_' of $lex is repeated\n"; - set_to_fail(); - } - $seen_line{$_} = 1; - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $lex does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - $word = shift @col; - if (!defined $word) { - print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail(); - } - if ($word eq "" || $word eq "" || $word eq "" || $word eq "#0") { - print "--> ERROR: lexicon.txt contains forbidden word $word\n"; - set_to_fail(); - } - for ($n = 0; $n < $num_prob_cols; $n++) { - $prob = shift @col; - if (!($prob > 0.0 && $prob <= 1.0)) { - print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n"; - set_to_fail(); - } - } - for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; } - if (@col == 0) { - print "--> ERROR: lexicon.txt contains word $word with empty "; - print "pronunciation.\n"; - set_to_fail(); - } - foreach (0 .. @col-1) { - if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt "; - print "(line $idx)\n"; - set_to_fail(); - } - } - $idx ++; - } - close(L); - $success == 0 || print "--> $lex is OK\n"; - print "\n"; -} - -if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); } -if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); } -if (-f "$dict/lexiconp_silprob.txt") { - # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also - # exist. 
- check_lexicon("$dict/lexiconp_silprob.txt", 2, 2); - if (-f "$dict/silprob.txt") { - !open(SP, "<$dict/silprob.txt") && - print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail(); - $crlf = 1; - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - chomp; my @col = split; - @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail(); - if ($col[0] eq "" || $col[0] eq "overall") { - if (!($col[1] > 0.0 && $col[1] <= 1.0)) { - set_to_fail(); - print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n"; - } - } elsif ($col[0] eq "_s" || $col[0] eq "_n") { - if ($col[1] <= 0.0) { - set_to_fail(); - print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n"; - } - } else { - print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n"; - set_to_fail(); - } - } - close(SP); - } else { - set_to_fail(); - print "--> ERROR: expecting $dict/silprob.txt to exist\n"; - } -} - -if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) { - print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n"; - set_to_fail(); -} - -sub check_lexicon_pair { - my ($lex1, $num_prob_cols1, $num_skipped_cols1, - $lex2, $num_prob_cols2, $num_skipped_cols2) = @_; - # We have checked individual lexicons already. - open(L1, "<$lex1"); open(L2, "<$lex2"); - print "Checking lexicon pair $lex1 and $lex2\n"; - my $line_num = 0; - while() { - $line_num++; - @A = split; - $line_B = ; - if (!defined $line_B) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); last; - } - @B = split(" ", $line_B); - # Check if the word matches. - if ($A[0] ne $B[0]) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - shift @A; shift @B; - for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; } - for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; } - # Check if the pronunciation matches - if (join(" ", @A) ne join(" ", @B)) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - } - $line_B = ; - if (defined $line_B && $exit == 0) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); - } - $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n"; -} - -# If more than one lexicon exist, we have to check if they correspond to each -# other. It could be that the user overwrote one and we need to regenerate the -# other, but we do not know which is which. -if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") { - check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0); -} -if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") { - check_lexicon_pair("$dict/lexiconp.txt", 1, 0, - "$dict/lexiconp_silprob.txt", 2, 2); -} - -# Checking extra_questions.txt ------------------------------- -%distinguished = (); # Keep track of all phone-pairs including nonsilence that - # are distinguished (split apart) by extra_questions.txt, - # as $distinguished{$p1,$p2} = 1. This will be used to - # make sure that we don't have pairs of phones on the same - # line in nonsilence_phones.txt that can never be - # distinguished from each other by questions. 
(If any two - # phones appear on the same line in nonsilence_phones.txt, - # they share a tree root, and since the automatic - # question-building treats all phones that appear on the - # same line of nonsilence_phones.txt as being in the same - # group, we can never distinguish them without resorting to - # questions in extra_questions.txt. -print "Checking $dict/extra_questions.txt ...\n"; -if (-s "$dict/extra_questions.txt") { - if (!open(EX, "<$dict/extra_questions.txt")) { - set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n"; - } - $idx = 1; - $success = 1; - $crlf = 1; - print "--> reading $dict/extra_questions.txt\n"; - check_allowed_whitespace(\*EX) or set_to_fail(); - while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n"; - } - foreach (0 .. @col-1) { - if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n"; - } - $idx ++; - } - %col_hash = (); - foreach $p (@col) { $col_hash{$p} = 1; } - foreach $p1 (@col) { - # Update %distinguished hash. - foreach $p2 (keys %nonsilence) { - if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not - # in this question (and in nonsilence - # phones)... mark p1,p2 as being split apart - $distinguished{$p1,$p2} = 1; - $distinguished{$p2,$p1} = 1; - } - } - } - } - close(EX); - $success == 0 || print "--> $dict/extra_questions.txt is OK\n"; -} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";} - -if (-f "$dict/nonterminals.txt") { - open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt"; - my %nonterminals = (); - my $line_number = 1; - while () { - chop; - my @line = split(" ", $_); - if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) { - print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1; - } - $nonterminals{$line[0]} = 1; - $line_number++; - } - print "--> $dict/nonterminals.txt is OK\n"; -} - - -# check nonsilence_phones.txt again for phone-pairs that are never -# distnguishable. (note: this situation is normal and expected for silence -# phones, so we don't check it.) -if(!open(NS, "<$dict/nonsilence_phones.txt")) { - print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1; -} - -$num_warn_nosplit = 0; -$num_warn_nosplit_limit = 10; -while() { - my @col = split(" ", $_); - foreach $p1 (@col) { - foreach $p2 (@col) { - if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) { - set_to_fail(); - if ($num_warn_nosplit <= $num_warn_nosplit_limit) { - print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n"; - } - if ($num_warn_nosplit == $num_warn_nosplit_limit) { - print "... Not warning any more times about this issue.\n"; - } - if ($num_warn_nosplit == 0) { - print " (note: we started checking for this only recently. 
You can still build a system but\n"; - print " phones $p1 and $p2 will be acoustically indistinguishable).\n"; - } - $num_warn_nosplit++; - } - } - } -} - - -if ($exit == 1) { - print "--> ERROR validating dictionary directory $dict (see detailed error "; - print "messages above)\n\n"; - exit 1; -} else { - print "--> SUCCESS [validating dictionary directory $dict]\n\n"; -} - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/validate_text.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/validate_text.pl deleted file mode 100644 index 7f75cf12f20f6e22948682e8e726e628a72dac69..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/validate_text.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env perl -# -#=============================================================================== -# Copyright 2017 Johns Hopkins University (author: Yenda Trmal ) -# Johns Hopkins University (author: Daniel Povey) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# validation script for data//text -# to be called (preferably) from utils/validate_data_dir.sh -use strict; -use warnings; -use utf8; -use Fcntl qw< SEEK_SET >; - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). 
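For readers who do not follow the Perl below, the helper described in the comment above boils down to this illustrative Python sketch (not part of the original script): decode every line as UTF-8 if possible, otherwise fall back to the raw bytes.

```python
def get_utf8_or_bytestream(path: str):
    """Illustrative analogue: (True, unicode_lines) if the whole file is
    valid UTF-8, else (False, raw_byte_lines)."""
    with open(path, "rb") as f:
        raw_lines = f.readlines()
    try:
        return True, [line.decode("utf-8") for line in raw_lines]
    except UnicodeDecodeError:
        return False, raw_lines
```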
-sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl \n" . 
- "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/wav2dur.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/wav2dur.py deleted file mode 100644 index 1bcc1b693458b66c0e341e5d6b375cc81e6db8b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/wav2dur.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -import torchaudio -torchaudio.set_audio_backend("sox_io") - -scp = sys.argv[1] -dur_scp = sys.argv[2] - -with open(scp, 'r') as f, open(dur_scp, 'w') as fout: - cnt = 0 - total_duration = 0 - for l in f: - items = l.strip().split() - wav_id = items[0] - fname = items[1] - cnt += 1 - waveform, rate = torchaudio.load(fname) - frames = len(waveform[0]) - duration = frames / float(rate) - total_duration += duration - fout.write('{} {}\n'.format(wav_id, duration)) - print('process {} utts'.format(cnt)) - print('total {} s'.format(total_duration)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/wav_to_duration.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/wav_to_duration.sh deleted file mode 100644 index 51b055c633ac809b6b8d702925dc47875973403d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/wav_to_duration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# split the wav scp, calculate duration and merge -nj=4 -. tools/parse_options.sh || exit 1; - -inscp=$1 -outscp=$2 -data=$(dirname ${inscp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -rm -f $logdir/wav_*.slice -rm -f $logdir/wav_*.shape -split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log -} & -done -wait -cat $logdir/wav_*.shape > $outscp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/websocket/performance-ws.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/websocket/performance-ws.py deleted file mode 100644 index af77dea06bb41297b674b5b6dbfd0266bcff5d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/tools/websocket/performance-ws.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -# coding:utf-8 - -# Copyright (c) 2022 SDCI Co. Ltd (author: veelion) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import json -import time -import asyncio -import argparse -import websockets -import soundfile as sf -import statistics - - -WS_START = json.dumps({ - 'signal': 'start', - 'nbest': 1, - 'continuous_decoding': False, -}) -WS_END = json.dumps({ - 'signal': 'end' -}) - - -async def ws_rec(data, ws_uri): - begin = time.time() - conn = await websockets.connect(ws_uri, ping_timeout=200) - # step 1: send start - await conn.send(WS_START) - ret = await conn.recv() - # step 2: send audio data - await conn.send(data) - # step 3: send end - await conn.send(WS_END) - # step 4: receive result - texts = [] - while 1: - ret = await conn.recv() - ret = json.loads(ret) - if ret['type'] == 'final_result': - nbest = json.loads(ret['nbest']) - text = nbest[0]['sentence'] - texts.append(text) - elif ret['type'] == 'speech_end': - break - # step 5: close - try: - await conn.close() - except Exception as e: - # this except has no effect, just log as debug - # it seems the server does not send close info, maybe - print(e) - time_cost = time.time() - begin - return { - 'text': ''.join(texts), - 'time': time_cost, - } - - -def get_args(): - parser = argparse.ArgumentParser(description='') - parser.add_argument( - '-u', '--ws_uri', required=True, - help="websocket_server_main's uri, e.g. ws://127.0.0.1:10086") - parser.add_argument( - '-w', '--wav_scp', required=True, - help='path to wav_scp_file') - parser.add_argument( - '-t', '--trans', required=True, - help='path to trans_text_file of wavs') - parser.add_argument( - '-s', '--save_to', required=True, - help='path to save transcription') - parser.add_argument( - '-n', '--num_concurrence', type=int, required=True, - help='num of concurrence for query') - args = parser.parse_args() - return args - - -def print_result(info): - length = max([len(k) for k in info]) - for k, v in info.items(): - print(f'\t{k: >{length}} : {v}') - - -async def main(args): - wav_scp = [] - total_duration = 0 - with open(args.wav_scp) as f: - for line in f: - zz = line.strip().split() - assert len(zz) == 2 - data, sr = sf.read(zz[1], dtype='int16') - assert sr == 16000 - duration = (len(data)) / 16000 - total_duration += duration - wav_scp.append((zz[0], data.tobytes())) - print(f'{len(wav_scp) = }, {total_duration = }') - - tasks = [] - failed = 0 - texts = [] - request_times = [] - begin = time.time() - for i, (_uttid, data) in enumerate(wav_scp): - task = asyncio.create_task(ws_rec(data, args.ws_uri)) - tasks.append((_uttid, task)) - if len(tasks) < args.num_concurrence: - continue - print((f'{i=}, start {args.num_concurrence} ' - f'queries @ {time.strftime("%m-%d %H:%M:%S")}')) - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - tasks = [] - print(f'\tdone @ {time.strftime("%m-%d %H:%M:%S")}') - if tasks: - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - request_time = time.time() - begin - rtf = request_time / total_duration - print('For all concurrence:') - print_result({ - 'failed': failed, - 'total_duration': total_duration, - 'request_time': request_time, - 'RTF': rtf, - }) - print('For one request:') - print_result({ - 'mean': statistics.mean(request_times), - 'median': statistics.median(request_times), - 'max_time': max(request_times), - 'min_time': min(request_times), - }) - with 
open(args.save_to, 'w', encoding='utf8') as fsave: - fsave.write(''.join(texts)) - # caculate CER - cmd = (f'python ../compute-wer.py --char=1 --v=1 ' - f'{args.trans} {args.save_to} > ' - f'{args.save_to}-test-{args.num_concurrence}.cer.txt') - print(cmd) - os.system(cmd) - print('done') - - -if __name__ == '__main__': - args = get_args() - asyncio.run(main(args)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/alignment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/alignment.py deleted file mode 100644 index 071691183e5af227e60fe06e4f8d4bf0f33b7f71..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/alignment.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Di Wu) -# 2022 Tinnove Inc (authors: Wei Ren) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader -from textgrid import TextGrid, IntervalTier - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.ctc_util import forced_align -from wenet.utils.common import get_subsample -from wenet.utils.init_model import init_model - - -def generator_textgrid(maxtime, lines, output): - # Download Praat: https://www.fon.hum.uva.nl/praat/ - interval = maxtime / (len(lines) + 1) - margin = 0.0001 - - tg = TextGrid(maxTime=maxtime) - linetier = IntervalTier(name="line", maxTime=maxtime) - - i = 0 - for l in lines: - s, e, w = l.split() - linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w) - - tg.append(linetier) - print("successfully generator {}".format(output)) - tg.write(output) - - -def get_frames_timestamp(alignment): - # convert alignment to a praat format, which is a doing phonetics - # by computer and helps analyzing alignment - timestamp = [] - # get frames level duration for each token - start = 0 - end = 0 - while end < len(alignment): - while end < len(alignment) and alignment[end] == 0: - end += 1 - if end == len(alignment): - timestamp[-1] += alignment[start:] - break - end += 1 - while end < len(alignment) and alignment[end - 1] == alignment[end]: - end += 1 - timestamp.append(alignment[start:end]) - start = end - return timestamp - - -def get_labformat(timestamp, subsample): - begin = 0 - duration = 0 - labformat = [] - for idx, t in enumerate(timestamp): - # 25ms frame_length,10ms hop_length, 1/subsample - subsample = get_subsample(configs) - # time duration - duration = len(t) * 0.01 * subsample - if idx < len(timestamp) - 1: - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[t[-1]])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[t[-1]])) - else: - non_blank = 
0 - for i in t: - if i != 0: - token = i - break - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[token])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[token])) - begin = begin + duration - return labformat - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='use ctc to generate alignment') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--input_file', required=True, help='format data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--non_lang_syms', - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--result_file', - required=True, - help='alignment result file') - parser.add_argument('--batch_size', type=int, default=1, help='batch size') - parser.add_argument('--gen_praat', - action='store_true', - help='convert alignment to a praat format') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - - args = parser.parse_args() - print(args) - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.batch_size > 1: - logging.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - # Load dict - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 - - symbol_table = read_symbol_table(args.dict) - - # Init dataset and data loader - ali_conf = copy.deepcopy(configs['dataset_conf']) - - ali_conf['filter_conf']['max_length'] = 102400 - ali_conf['filter_conf']['min_length'] = 0 - ali_conf['filter_conf']['token_max_length'] = 102400 - ali_conf['filter_conf']['token_min_length'] = 0 - ali_conf['filter_conf']['max_output_input_ratio'] = 102400 - ali_conf['filter_conf']['min_output_input_ratio'] = 0 - ali_conf['speed_perturb'] = False - ali_conf['spec_aug'] = False - ali_conf['shuffle'] = False - ali_conf['sort'] = False - ali_conf['fbank_conf']['dither'] = 0.0 - ali_conf['batch_conf']['batch_type'] = "static" - ali_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - ali_dataset = Dataset(args.data_type, - args.input_file, - symbol_table, - ali_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w', - encoding='utf-8') as fout: - for batch_idx, batch in enumerate(ali_data_loader): - print("#" * 80) - key, feat, target, feats_length, target_length = batch - print(key) - - feat = feat.to(device) - target = target.to(device) - feats_length = feats_length.to(device) - 
target_length = target_length.to(device) - # Let's assume B = batch_size and N = beam_size - # 1. Encoder - encoder_out, encoder_mask = model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - # print(ctc_probs.size(1)) - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = forced_align(ctc_probs, target) - print(alignment) - fout.write('{} {}\n'.format(key[0], alignment)) - - if args.gen_praat: - timestamp = get_frames_timestamp(alignment) - print(timestamp) - subsample = get_subsample(configs) - labformat = get_labformat(timestamp, subsample) - - lab_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".lab") - with open(lab_path, 'w', encoding='utf-8') as f: - f.writelines(labformat) - - textgrid_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".TextGrid") - generator_textgrid(maxtime=(len(alignment) + 1) * 0.01 * - subsample, - lines=labformat, - output=textgrid_path) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/average_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/average_model.py deleted file mode 100644 index 01efa64b4b458bc931a86a9a304b9f330ce4aaa2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/average_model.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
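To make the frame-to-time conversion in alignment.py above concrete: each output frame spans 10 ms times the subsampling factor, so a frame-level CTC alignment (0 = blank) collapses into (begin, end, token) segments roughly as in this illustrative sketch; the alignment values and the subsample factor are made up.

```python
def alignment_to_segments(alignment, subsample=4, frame_shift=0.01):
    # Each segment is a run of blanks followed by a run of one repeated token,
    # mirroring get_frames_timestamp/get_labformat in alignment.py above.
    segments, begin, start = [], 0.0, 0
    while start < len(alignment):
        end = start
        while end < len(alignment) and alignment[end] == 0:    # leading blanks
            end += 1
        if end < len(alignment):
            end += 1
            while end < len(alignment) and alignment[end] == alignment[end - 1]:
                end += 1                                        # token repeats
        duration = (end - start) * frame_shift * subsample
        token = next((t for t in alignment[start:end] if t != 0), 0)
        segments.append((round(begin, 2), round(begin + duration, 2), token))
        begin += duration
        start = end
    return segments

print(alignment_to_segments([0, 0, 5, 5, 0, 7, 7]))
# [(0.0, 0.16, 5), (0.16, 0.28, 7)]
```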
- - -import os -import argparse -import glob - -import yaml -import numpy as np -import torch - - -def get_args(): - parser = argparse.ArgumentParser(description='average model') - parser.add_argument('--dst_model', required=True, help='averaged model') - parser.add_argument('--src_path', - required=True, - help='src model path for average') - parser.add_argument('--val_best', - action="store_true", - help='averaged model') - parser.add_argument('--num', - default=5, - type=int, - help='nums for averaged model') - parser.add_argument('--min_epoch', - default=0, - type=int, - help='min epoch used for averaging model') - parser.add_argument('--max_epoch', - default=65536, - type=int, - help='max epoch used for averaging model') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - checkpoints = [] - val_scores = [] - if args.val_best: - yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) - for y in yamls: - with open(y, 'r') as f: - dic_yaml = yaml.load(f, Loader=yaml.FullLoader) - loss = dic_yaml['cv_loss'] - epoch = dic_yaml['epoch'] - if epoch >= args.min_epoch and epoch <= args.max_epoch: - val_scores += [[epoch, loss]] - val_scores = np.array(val_scores) - sort_idx = np.argsort(val_scores[:, -1]) - sorted_val_scores = val_scores[sort_idx][::1] - print("best val scores = " + str(sorted_val_scores[:args.num, 1])) - print("selected epochs = " + - str(sorted_val_scores[:args.num, 0].astype(np.int64))) - path_list = [ - args.src_path + '/{}.pt'.format(int(epoch)) - for epoch in sorted_val_scores[:args.num, 0] - ] - else: - path_list = glob.glob('{}/[0-9]*.pt'.format(args.src_path)) - path_list = sorted(path_list, key=os.path.getmtime) - path_list = path_list[-args.num:] - print(path_list) - avg = None - num = args.num - assert num == len(path_list) - for path in path_list: - print('Processing {}'.format(path)) - states = torch.load(path, map_location=torch.device('cpu')) - if avg is None: - avg = states - else: - for k in avg.keys(): - avg[k] += states[k] - # average - for k in avg.keys(): - if avg[k] is not None: - # pytorch 1.6 use true_divide instead of /= - avg[k] = torch.true_divide(avg[k], num) - print('Saving to {}'.format(args.dst_model)) - torch.save(avg, args.dst_model) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/export_jit.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/export_jit.py deleted file mode 100644 index b2e5864e8382235c1cc800484ba5031ae22f3bd9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/export_jit.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
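The core of average_model.py above is a running element-wise sum over the selected checkpoints' state dicts followed by a division by their count; a minimal sketch of that step, assuming each path holds a plain state_dict saved with torch.save:

```python
import torch

def average_state_dicts(paths):
    # Illustrative condensation of the averaging loop in average_model.py above.
    avg = None
    for path in paths:
        states = torch.load(path, map_location="cpu")
        if avg is None:
            avg = {k: v.clone() for k, v in states.items()}
        else:
            for k in avg:
                avg[k] += states[k]
    return {k: torch.true_divide(v, len(paths)) for k, v in avg.items()}
```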
- -from __future__ import print_function - -import argparse -import os - -import torch -import yaml - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_file', default=None, help='output file') - parser.add_argument('--output_quant_file', - default=None, - help='output quantized model file') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - # No need gpu for model export - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - model = init_model(configs) - print(model) - - load_checkpoint(model, args.checkpoint) - # Export jit torch script model - - if args.output_file: - script_model = torch.jit.script(model) - script_model.save(args.output_file) - print('Export model successfully, see {}'.format(args.output_file)) - - # Export quantized jit torch script model - if args.output_quant_file: - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - script_quant_model = torch.jit.script(quantized_model) - script_quant_model.save(args.output_quant_file) - print('Export quantized model successfully, ' - 'see {}'.format(args.output_quant_file)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/export_onnx_bpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/export_onnx_bpu.py deleted file mode 100644 index 6462a69506f10778d08faae5fcf3067ad43d38bd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/export_onnx_bpu.py +++ /dev/null @@ -1,1019 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. 
specific decoding method: ctc_greedy_search -""" - - -from __future__ import print_function - -import os -import sys -import copy -import math -import yaml -import logging -from typing import Tuple - -import torch -import numpy as np - -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import (get_args, to_numpy, - print_input_output_info) - - -try: - import onnx - import onnxruntime -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class BPULayerNorm(torch.nn.Module): - """Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" - def __init__(self, module, chunk_size=8, run_on_bpu=False): - super().__init__() - original = copy.deepcopy(module) - self.hidden = module.weight.size(0) - self.chunk_size = chunk_size - self.run_on_bpu = run_on_bpu - - if self.run_on_bpu: - self.weight = torch.nn.Parameter( - module.weight.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.bias = torch.nn.Parameter( - module.bias.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.negtive = torch.nn.Parameter( - torch.ones((1, self.hidden, 1, chunk_size)) * -1.0) - self.eps = torch.nn.Parameter( - torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps) - self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_1.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_2.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - else: - self.norm = module - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.hidden) - orig_out = module(random_data) - new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) - np.testing.assert_allclose( - to_numpy(orig_out), to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.run_on_bpu: - u = self.mean_conv_1(x) # (1, h, 1, c) - numerator = x + u * self.negtive # (1, h, 1, c) - s = torch.pow(numerator, 2) # (1, h, 1, c) - s = self.mean_conv_2(s) # (1, h, 1, c) - denominator = torch.sqrt(s + self.eps) # (1, h, 1, c) - x = torch.div(numerator, denominator) # (1, h, 1, c) - x = x * self.weight + self.bias - else: - x = x.squeeze(2).transpose(1, 2).contiguous() - x = self.norm(x) - x = x.transpose(1, 2).contiguous().unsqueeze(2) - return x - - -class BPUIdentity(torch.nn.Module): - """Refactor torch.nn.Identity(). - For inserting BPU node whose input == output. - """ - def __init__(self, channels): - super().__init__() - self.channels = channels - self.identity_conv = torch.nn.Conv2d( - channels, channels, 1, groups=channels, bias=False) - torch.nn.init.dirac_( - self.identity_conv.weight.data, groups=channels) - - self.check_equal() - - def check_equal(self): - random_data = torch.randn(1, self.channels, 1, 10) - result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(random_data), to_numpy(result), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Identity with 4-D dataflow, input == output. 
- Args: - x (torch.Tensor): (batch, in_channel, 1, time) - - Returns: - (torch.Tensor): (batch, in_channel, 1, time). - """ - return self.identity_conv(x) - - -class BPULinear(torch.nn.Module): - """Refactor torch.nn.Linear or pointwise_conv""" - def __init__(self, module, is_pointwise_conv=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.weight.size(1) - self.odim = module.weight.size(0) - self.is_pointwise_conv = is_pointwise_conv - - # Modify weight & bias - self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) - if is_pointwise_conv: - # (odim, idim, kernel=1) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(-1)) - else: - # (odim, idim) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(2).unsqueeze(3)) - self.linear.bias = module.bias - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.idim) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = module(random_data) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = original_result.transpose(1, 2) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Linear with 4-D dataflow. - Args: - x (torch.Tensor): (batch, in_channel, 1, time) - Returns: - (torch.Tensor): (batch, out_channel, 1, time). - """ - return self.linear(x) - - -class BPUGlobalCMVN(torch.nn.Module): - """Refactor wenet/transformer/cmvn.py::GlobalCMVN""" - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - self.norm_var = module.norm_var - - # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1) - self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """CMVN with 4-D dataflow. - Args: - x (torch.Tensor): (batch, 1, mel_dim, time) - Returns: - (torch.Tensor): normalized feature with same shape. - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x - - -class BPUConv2dSubsampling8(torch.nn.Module): - """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 - - NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.right_context = module.right_context - self.subsampling_rate = module.subsampling_rate - assert isinstance(module.pos_enc, NoPositionalEncoding) - - # 1. Modify self.conv - # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim) - # to (1, 1, mel_dim, frames) for more efficient computation. - self.conv = module.conv - for idx in [0, 2, 4]: - self.conv[idx].weight = torch.nn.Parameter( - module.conv[idx].weight.transpose(2, 3) - ) - - # 2. 
Modify self.linear - # NOTE(xcsong): Split final projection to meet the requirment of - # maximum kernel_size (7 for XJ3) - self.linear = torch.nn.ModuleList() - odim = module.linear.weight.size(0) # 512, in this case - freq = module.linear.weight.size(1) // odim # 4608 // 512 == 9 - self.odim, self.freq = odim, freq - weight = module.linear.weight.reshape( - odim, odim, freq, 1) # (odim, odim * freq) -> (odim, odim, freq, 1) - self.split_size = [] - num_split = (freq - 1) // 7 + 1 # XJ3 requires kernel_size <= 7 - slice_begin = 0 - for idx in range(num_split): - kernel_size = min(freq, (idx + 1) * 7) - idx * 7 - conv_ele = torch.nn.Conv2d( - odim, odim, (kernel_size, 1), (kernel_size, 1)) - conv_ele.weight = torch.nn.Parameter( - weight[:, :, slice_begin:slice_begin + kernel_size, :] - ) - conv_ele.bias = torch.nn.Parameter( - torch.zeros_like(conv_ele.bias) - ) - self.linear.append(conv_ele) - self.split_size.append(kernel_size) - slice_begin += kernel_size - self.linear[0].bias = torch.nn.Parameter(module.linear.bias) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 67, 80) - mask = torch.zeros(1, 1, 67) - original_result, _, _ = module(random_data, mask) # (1, 8, 512) - random_data = random_data.transpose(1, 2).unsqueeze(0) # (1, 1, 80, 67) - new_result = self.forward(random_data) # (1, 512, 1, 8) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Subsample x with 4-D dataflow. - Args: - x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), - where time' = time // 8. - """ - x = self.conv(x) # (1, odim, freq, time') - x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) - x = torch.split(x, self.split_size, dim=2) - for idx, (x_part, layer) in enumerate(zip(x, self.linear)): - x_out += layer(x_part) - return x_out - - -class BPUMultiHeadedAttention(torch.nn.Module): - """Refactor wenet/transformer/attention.py::MultiHeadedAttention - - NOTE(xcsong): Only support attention_class == MultiHeadedAttention, - we do not consider RelPositionMultiHeadedAttention currently. - """ - def __init__(self, module, chunk_size, left_chunks): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.d_k = module.d_k - self.h = module.h - n_feat = self.d_k * self.h - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.time = chunk_size * (left_chunks + 1) - self.activation = torch.nn.Softmax(dim=-1) - - # 1. Modify self.linear_x - self.linear_q = BPULinear(module.linear_q) - self.linear_k = BPULinear(module.linear_k) - self.linear_v = BPULinear(module.linear_v) - self.linear_out = BPULinear(module.linear_out) - # 2. 
denom - self.register_buffer( - "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k))) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) - mask = torch.ones((1, self.h, self.chunk_size, self.time), - dtype=torch.bool) - cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, - self.d_k * 2) - original_out, original_cache = module( - random_data, random_data, random_data, - mask[:, 0, :, :], torch.empty(0), cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.reshape(1, self.h, self.d_k * 2, - self.chunk_size * self.left_chunks) - new_out, new_cache = self.forward( - random_data, random_data, random_data, mask, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - - def forward( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - mask: torch.Tensor, cache: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. - - Args: - q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). - k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). - v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). - mask (torch.Tensor): Mask tensor, - (#batch, head, chunk_size, cache_t + chunk_size). - cache (torch.Tensor): Cache tensor - (1, head, d_k * 2, cache_t), - where `cache_t == chunk_size * left_chunks`. - - - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: Cache tensor - (1, head, d_k * 2, cache_t + chunk_size) - where `cache_t == chunk_size * left_chunks` - """ - # 1. Forward QKV - q = self.linear_q(q) # (1, d, 1, c) d == size, c == chunk_size - k = self.linear_k(k) # (1, d, 1, c) - v = self.linear_v(v) # (1, d, 1, c) - q = q.view(1, self.h, self.d_k, self.chunk_size) - k = k.view(1, self.h, self.d_k, self.chunk_size) - v = v.view(1, self.h, self.d_k, self.chunk_size) - q = q.transpose(2, 3) # (batch, head, time1, d_k) - k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) - k = torch.cat((k_cache, k), dim=3) - v = torch.cat((v_cache, v), dim=3) - new_cache = torch.cat((k, v), dim=2) - # 2. (Q^T)K - scores = torch.matmul(q, k) * self.denom # (#b, n_head, time1, time2) - # 3. Forward attention - mask = mask.eq(0) - scores = scores.masked_fill(mask, -float('inf')) - attn = self.activation(scores).masked_fill(mask, 0.0) - attn = attn.transpose(2, 3) - x = torch.matmul(v, attn) - x = x.view(1, self.d_k * self.h, 1, self.chunk_size) - x_out = self.linear_out(x) - return x_out, new_cache - - -class BPUConvolution(torch.nn.Module): - """Refactor wenet/transformer/convolution.py::ConvolutionModule - - NOTE(xcsong): Only suport use_layer_norm == False - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.lorder = module.lorder - self.use_layer_norm = False - self.activation = module.activation - channels = module.pointwise_conv1.weight.size(1) - self.channels = channels - kernel_size = module.depthwise_conv.weight.size(2) - assert module.use_layer_norm is False - - # 1. Modify self.pointwise_conv1 - self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) - - # 2. 
Modify self.depthwise_conv - self.depthwise_conv = torch.nn.Conv2d( - channels, channels, (1, kernel_size), - stride=1, groups=channels) - self.depthwise_conv.weight = torch.nn.Parameter( - module.depthwise_conv.weight.unsqueeze(-2)) - self.depthwise_conv.bias = torch.nn.Parameter( - module.depthwise_conv.bias) - - # 3. Modify self.norm, Only support batchnorm2d - self.norm = torch.nn.BatchNorm2d(channels) - self.norm.training = False - self.norm.num_features = module.norm.num_features - self.norm.eps = module.norm.eps - self.norm.momentum = module.norm.momentum - self.norm.weight = torch.nn.Parameter(module.norm.weight) - self.norm.bias = torch.nn.Parameter(module.norm.bias) - self.norm.running_mean = module.norm.running_mean - self.norm.running_var = module.norm.running_var - - # 4. Modify self.pointwise_conv2 - self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) - - # 5. Identity conv, for running `concat` on BPU - self.identity = BPUIdentity(channels) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.channels) - cache = torch.zeros((1, self.channels, self.lorder)) - original_out, original_cache = module(random_data, cache=cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.unsqueeze(2) - new_out, new_cache = self.forward(random_data, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, 1, cache_t). - Returns: - torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). - torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). - """ - # Concat cache - x = torch.cat((self.identity(cache), self.identity(x)), dim=3) - new_cache = x[:, :, :, -self.lorder:] - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, 1, dim) - x = torch.nn.functional.glu(x, dim=1) # (b, channel, 1, dim) - - # Depthwise Conv - x = self.depthwise_conv(x) - x = self.activation(self.norm(x)) - x = self.pointwise_conv2(x) - return x, new_cache - - -class BPUFFN(torch.nn.Module): - """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.activation = module.activation - - # 1. Modify self.w_x - self.w_1 = BPULinear(module.w_1) - self.w_2 = BPULinear(module.w_2) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.w_1.idim) - original_out = module(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_out = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward function. 
- - Args: - xs: input tensor (B, D, 1, L) - Returns: - output tensor, (B, D, 1, L) - """ - return self.w_2(self.activation(self.w_1(x))) - - -class BPUConformerEncoderLayer(torch.nn.Module): - """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.size = module.size - assert module.normalize_before is True - assert module.concat_after is False - - # 1. Modify submodules - self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) - self.self_attn = BPUMultiHeadedAttention( - module.self_attn, chunk_size, left_chunks) - self.conv_module = BPUConvolution(module.conv_module) - self.feed_forward = BPUFFN(module.feed_forward) - - # 2. Modify norms - self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) - self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, ln_run_on_bpu) - self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, - chunk_size, ln_run_on_bpu) - self.norm_conv = BPULayerNorm(module.norm_conv, - chunk_size, ln_run_on_bpu) - self.norm_final = BPULayerNorm(module.norm_final, - chunk_size, ln_run_on_bpu) - - # 3. 4-D ff_scale - self.register_buffer( - "ff_scale", torch.full((1, self.size, 1, 1), module.ff_scale)) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.self_attn.chunk_size - time2 = self.self_attn.time - h, d_k = self.self_attn.h, self.self_attn.d_k - random_x = torch.randn(1, time1, self.size) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) - original_x, _, original_att_cache, original_cnn_cache = module( - random_x, att_mask[:, 0, :, :], torch.empty(0), - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.transpose(1, 2).unsqueeze(2) - att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.unsqueeze(2) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_mask, att_cache, cnn_cache - ) - np.testing.assert_allclose( - to_numpy(original_att_cache), - to_numpy(new_att_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cnn_cache), - to_numpy(new_cnn_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, att_mask: torch.Tensor, - att_cache: torch.Tensor, cnn_cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, size, 1, chunk_size) - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, d_k * 2, cache_t1), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, 1, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: att_cache tensor, - (1, head, d_k * 2, cache_t1 + chunk_size). - torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2). - """ - # 1. ffn_macaron - residual = x - x = self.norm_ff_macron(x) - x = residual + self.ff_scale * self.feed_forward_macaron(x) - - # 2. 
attention - residual = x - x = self.norm_mha(x) - x_att, new_att_cache = self.self_attn( - x, x, x, att_mask, att_cache) - x = residual + x_att - - # 3. convolution - residual = x - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, cnn_cache) - x = residual + x - - # 4. ffn - residual = x - x = self.norm_ff(x) - x = residual + self.ff_scale * self.feed_forward(x) - - # 5. final post-norm - x = self.norm_final(x) - - return x, new_att_cache, new_cnn_cache - - -class BPUConformerEncoder(torch.nn.Module): - """Refactor wenet/transformer/encoder.py::ConformerEncoder - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - output_size = module.output_size() - self._output_size = module.output_size() - self.after_norm = module.after_norm - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.head = module.encoders[0].self_attn.h - self.layers = len(module.encoders) - - # 1. Modify submodules - self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) - self.embed = BPUConv2dSubsampling8(module.embed) - self.encoders = torch.nn.ModuleList() - for layer in module.encoders: - self.encoders.append(BPUConformerEncoderLayer( - layer, chunk_size, left_chunks, ln_run_on_bpu)) - - # 2. Auxiliary conv - self.identity_cnncache = BPUIdentity(output_size) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.encoders[0].self_attn.chunk_size - time2 = self.encoders[0].self_attn.time - layers = self.layers - h, d_k = self.head, self.encoders[0].self_attn.d_k - decoding_window = (self.chunk_size - 1) * \ - module.embed.subsampling_rate + \ - module.embed.right_context + 1 - lorder = self.encoders[0].conv_module.lorder - random_x = torch.randn(1, decoding_window, 80) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) - orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( - random_x, 0, time2 - time1, att_mask=att_mask[:, 0, :, :], - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.unsqueeze(0) - att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_cache, cnn_cache, att_mask - ) - caches = torch.split(new_att_cache, h, dim=1) - caches = [c.transpose(2, 3) for c in caches] - np.testing.assert_allclose( - to_numpy(orig_att_cache), - to_numpy(torch.cat(caches, dim=0)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_cnn_cache), - to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, xs: torch.Tensor, att_cache: torch.Tensor, - cnn_cache: torch.Tensor, att_mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (1, head * elayers, d_k * 2, cache_t1), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (1, hidden-dim, elayers, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, hidden-dim, 1, chunk_size). - torch.Tensor: new attention cache required for next chunk, with - same shape as the original att_cache. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - """ - # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time) - xs = xs.transpose(2, 3) - xs = self.global_cmvn(xs) - # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size) - xs = self.embed(xs) - - att_cache = torch.split(att_cache, self.head, dim=1) - cnn_cache = self.identity_cnncache(cnn_cache) - cnn_cache = torch.split(cnn_cache, 1, dim=2) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - xs, new_att_cache, new_cnn_cache = layer( - xs, att_mask, att_cache=att_cache[i], cnn_cache=cnn_cache[i]) - r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:]) - r_cnn_cache.append(new_cnn_cache) - r_att_cache = torch.cat(r_att_cache, dim=1) - r_cnn_cache = self.identity_cnncache( - torch.cat(r_cnn_cache, dim=2)) - - xs = xs.squeeze(2).transpose(1, 2).contiguous() - xs = self.after_norm(xs) - # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input. - xs = xs.transpose(1, 2).contiguous().unsqueeze(2) # (B, C, 1, T) - - return (xs, r_att_cache, r_cnn_cache) - - -class BPUCTC(torch.nn.Module): - """Refactor wenet/transformer/ctc.py::CTC - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.ctc_lo.weight.size(1) - num_class = module.ctc_lo.weight.size(0) - - # 1. Modify self.ctc_lo, Split final projection to meet the - # requirment of maximum in/out channels (2048 for XJ3) - self.ctc_lo = torch.nn.ModuleList() - self.split_size = [] - num_split = (num_class - 1) // 2048 + 1 - for idx in range(num_split): - out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 - conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) - self.ctc_lo.append(conv_ele) - self.split_size.append(out_channel) - orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) - orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) - for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): - w = w.unsqueeze(2).unsqueeze(3) - self.ctc_lo[i].weight = torch.nn.Parameter(w) - self.ctc_lo[i].bias = torch.nn.Parameter(b) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 100, self.idim) - original_result = module.ctc_lo(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """frame activations, without softmax. 
- - Args: - Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) - Returns: - torch.Tensor: (B, num_class, 1, chunk_size) - """ - out = [] - for i, layer in enumerate(self.ctc_lo): - out.append(layer(x)) - out = torch.cat(out, dim=1) - return out - - -def export_encoder(asr_model, args): - logger.info("Stage-1: export encoder") - decode_window, mel_dim = args.decoding_window, args.feature_size - encoder = BPUConformerEncoder( - asr_model.encoder, args.chunk_size, args.num_decoding_left_chunks, - args.ln_run_on_bpu) - encoder.eval() - encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx') - - logger.info("Stage-1.1: prepare inputs for encoder") - chunk = torch.randn((1, 1, decode_window, mel_dim)) - required_cache_size = encoder.chunk_size * encoder.left_chunks - kv_time = required_cache_size + encoder.chunk_size - hidden, layers = encoder._output_size, len(encoder.encoders) - head = encoder.encoders[0].self_attn.h - d_k = hidden // head - lorder = encoder.encoders[0].conv_module.lorder - att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) - att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros((1, hidden, layers, lorder)) - inputs = (chunk, att_cache, cnn_cache, att_mask) - logger.info("chunk.size(): {} att_cache.size(): {} " - "cnn_cache.size(): {} att_mask.size(): {}".format( - list(chunk.size()), list(att_cache.size()), - list(cnn_cache.size()), list(att_mask.size()))) - - logger.info("Stage-1.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask" - attributes['output_name'] = "output;r_att_cache;r_cnn_cache" - attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap" - attributes['norm_type'] = \ - "no_preprocess;no_preprocess;no_preprocess;no_preprocess" - attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_shape'] = \ - "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( - chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3), - att_cache.size(0), att_cache.size(1), att_cache.size(2), - att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1), - cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0), - att_mask.size(1), att_mask.size(2), att_mask.size(3) - ) - torch.onnx.export( # NOTE(xcsong): only support opset==11 - encoder, inputs, encoder_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=attributes['input_name'].split(';'), - output_names=attributes['output_name'].split(';'), - dynamic_axes=None, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for k in vars(args): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - logger.info('Export onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - logger.info("Stage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - for i in range(10): - logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" - ", att_mask: {}".format( - i, list(torch_chunk.size()), - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), - list(torch_att_mask.size()))) - torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_output = torch.cat(torch_output, dim=-1) - - onnx_output = [] - onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - logger.info("onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," - " att_mask: {}".format( - i, onnx_chunk.shape, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': onnx_att_mask, - } - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_output = np.concatenate(onnx_output, axis=-1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_encoder, pass!") - return encoder, ort_session - - -def export_ctc(asr_model, args): - logger.info("Stage-2: export ctc") - ctc = BPUCTC(asr_model.ctc).eval() - ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx') - - logger.info("Stage-2.1: prepare inputs for ctc") - hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) - - logger.info("Stage-2.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'], attributes['input_type'] = "hidden", "featuremap" - attributes['norm_type'] = "no_preprocess" - attributes['input_layout_train'] = "NCHW" - attributes['input_layout_rt'] = "NCHW" - attributes['input_shape'] = "{}x{}x{}x{}".format( - hidden.size(0), hidden.size(1), hidden.size(2), hidden.size(3), - ) - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=None, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for k in vars(args): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - logger.info('Export onnx_ctc, done! 
see {}'.format(ctc_outpath)) - - logger.info("Stage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_ctc, pass!") - return ctc, ort_session - - -def export_decoder(asr_model, args): - logger.info("Currently, Decoder is not supported.") - - -if __name__ == '__main__': - torch.manual_seed(777) - args = get_args() - args.ln_run_on_bpu = False - # NOTE(xcsong): XJ3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - args.feature_size = configs['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - export_encoder(model, args) - export_ctc(model, args) - export_decoder(model, args) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/export_onnx_cpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/export_onnx_cpu.py deleted file mode 100644 index a8009d2f606f753a5870eb754235d8d55e756b5d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/export_onnx_cpu.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright (c) 2022, Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
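The `BPUCTC` refactor shown above splits the CTC output projection into several 1x1 `Conv2d` pieces so that no single layer exceeds the 2048 in/out channel limit assumed for the XJ3 BPU, then copies the matching slices of the original weight and bias into each piece. A minimal, self-contained sketch of that splitting idea follows; the `split_projection` helper, the 2048 limit, and the 256/4233 dimensions are illustrative assumptions, not part of the original export scripts.

```python
import torch

def split_projection(linear: torch.nn.Linear, max_out: int = 2048) -> torch.nn.ModuleList:
    """Split a (num_class x idim) linear projection into 1x1 Conv2d pieces
    whose output channels each stay at or below max_out."""
    num_class, idim = linear.weight.shape
    convs = torch.nn.ModuleList()
    start = 0
    while start < num_class:
        out_ch = min(max_out, num_class - start)
        conv = torch.nn.Conv2d(idim, out_ch, kernel_size=1)
        # Copy the corresponding slice of the original weight and bias.
        with torch.no_grad():
            conv.weight.copy_(linear.weight[start:start + out_ch].unsqueeze(2).unsqueeze(3))
            conv.bias.copy_(linear.bias[start:start + out_ch])
        convs.append(conv)
        start += out_ch
    return convs

# On NCHW input (B, idim, 1, T), concatenating the conv outputs along the
# channel axis reproduces the original projection, mirroring the equivalence
# check performed by BPUCTC.check_equal.
linear = torch.nn.Linear(256, 4233)   # illustrative hidden dim / vocab size
convs = split_projection(linear)
x = torch.randn(1, 100, 256)          # (B, T, idim)
ref = linear(x)
x_nchw = x.transpose(1, 2).unsqueeze(2)             # (B, idim, 1, T)
out = torch.cat([c(x_nchw) for c in convs], dim=1)  # (B, num_class, 1, T)
assert torch.allclose(ref, out.squeeze(2).transpose(1, 2), atol=1e-4)
```

Because each slice carries the corresponding rows of the original weight and bias, the concatenated logits are numerically the same as the unsplit projection, so downstream CTC decoding is unaffected by the split.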
- -from __future__ import print_function - -import argparse -import os -import copy -import sys - -import torch -import yaml -import numpy as np - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - -try: - import onnx - import onnxruntime - from onnxruntime.quantization import quantize_dynamic, QuantType -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - args = parser.parse_args() - return args - - -def to_numpy(tensor): - if tensor.requires_grad: - return tensor.detach().cpu().numpy() - else: - return tensor.cpu().numpy() - - -def print_input_output_info(onnx_model, name, prefix="\t\t"): - input_names = [node.name for node in onnx_model.graph.input] - input_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.input] - output_names = [node.name for node in onnx_model.graph.output] - output_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.output] - print("{}{} inputs : {}".format(prefix, name, input_names)) - print("{}{} input shapes : {}".format(prefix, name, input_shapes)) - print("{}{} outputs: {}".format(prefix, name, output_names)) - print("{}{} output shapes : {}".format(prefix, name, output_shapes)) - - -def export_encoder(asr_model, args): - print("Stage-1: export encoder") - encoder = asr_model.encoder - encoder.forward = encoder.forward_chunk - encoder_outpath = os.path.join(args['output_dir'], 'encoder.onnx') - - print("\tStage-1.1: prepare inputs for encoder") - chunk = torch.randn( - (args['batch'], args['decoding_window'], args['feature_size'])) - offset = 0 - # NOTE(xcsong): The uncertainty of `next_cache_start` only appears - # in the first few chunks, this is caused by dynamic att_cache shape, i,e - # (0, 0, 0, 0) for 1st chunk and (elayers, head, ?, d_k*2) for subsequent - # chunks. One way to ease the ONNX export is to keep `next_cache_start` - # as a fixed value. To do this, for the **first** chunk, if - # left_chunks > 0, we feed real cache & real mask to the model, otherwise - # fake cache & fake mask. In this way, we get: - # 1. 16/-1 mode: next_cache_start == 0 for all chunks - # 2. 16/4 mode: next_cache_start == chunk_size for all chunks - # 3. 16/0 mode: next_cache_start == chunk_size for all chunks - # 4. -1/-1 mode: next_cache_start == 0 for all chunks - # NO MORE DYNAMIC CHANGES!! - # - # NOTE(Mddct): We retain the current design for the convenience of supporting some - # inference frameworks without dynamic shapes. 
If you're interested in all-in-one - # model that supports different chunks please see: - # https://github.com/wenet-e2e/wenet/pull/1174 - - if args['left_chunks'] > 0: # 16/4 - required_cache_size = args['chunk_size'] * args['left_chunks'] - offset = required_cache_size - # Real cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], required_cache_size, - args['output_size'] // args['head'] * 2)) - # Real mask - att_mask = torch.ones( - (args['batch'], 1, required_cache_size + args['chunk_size']), - dtype=torch.bool) - att_mask[:, :, :required_cache_size] = 0 - elif args['left_chunks'] <= 0: # 16/-1, -1/-1, 16/0 - required_cache_size = -1 if args['left_chunks'] < 0 else 0 - # Fake cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], 0, - args['output_size'] // args['head'] * 2)) - # Fake mask - att_mask = torch.ones((0, 0, 0), dtype=torch.bool) - cnn_cache = torch.zeros( - (args['num_blocks'], args['batch'], - args['output_size'], args['cnn_module_kernel'] - 1)) - inputs = (chunk, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - print("\t\tchunk.size(): {}\n".format(chunk.size()), - "\t\toffset: {}\n".format(offset), - "\t\trequired_cache: {}\n".format(required_cache_size), - "\t\tatt_cache.size(): {}\n".format(att_cache.size()), - "\t\tcnn_cache.size(): {}\n".format(cnn_cache.size()), - "\t\tatt_mask.size(): {}\n".format(att_mask.size())) - - print("\tStage-1.2: torch.onnx.export") - dynamic_axes = { - 'chunk': {1: 'T'}, - 'att_cache': {2: 'T_CACHE'}, - 'att_mask': {2: 'T_ADD_T_CACHE'}, - 'output': {1: 'T'}, - 'r_att_cache': {2: 'T_CACHE'}, - } - # NOTE(xcsong): We keep dynamic axes even if in 16/4 mode, this is - # to avoid padding the last chunk (which usually contains less - # frames than required). For users who want static axes, just pop - # out specific axis. - # if args['chunk_size'] > 0: # 16/4, 16/-1, 16/0 - # dynamic_axes.pop('chunk') - # dynamic_axes.pop('output') - # if args['left_chunks'] >= 0: # 16/4, 16/0 - # # NOTE(xsong): since we feed real cache & real mask into the - # # model when left_chunks > 0, the shape of cache will never - # # be changed. - # dynamic_axes.pop('att_cache') - # dynamic_axes.pop('r_att_cache') - torch.onnx.export( - encoder, inputs, encoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=[ - 'chunk', 'offset', 'required_cache_size', - 'att_cache', 'cnn_cache', 'att_mask' - ], - output_names=['output', 'r_att_cache', 'r_cnn_cache'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for (k, v) in args.items(): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - # NOTE(xcsong): to add those metadatas we need to reopen - # the file and resave it. - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - # Dynamic quantization - model_fp32 = encoder_outpath - model_quant = os.path.join(args['output_dir'], 'encoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - print("\tStage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk = copy.deepcopy(chunk) - torch_offset = copy.deepcopy(offset) - torch_required_cache_size = copy.deepcopy(required_cache_size) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - torch_att_mask = copy.deepcopy(att_mask) - for i in range(10): - print("\t\ttorch chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, list(torch_chunk.size()), torch_offset, - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), list(torch_att_mask.size()))) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - torch_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_offset, torch_required_cache_size, - torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_offset += out.size(1) - torch_output = torch.cat(torch_output, dim=1) - - onnx_output = [] - onnx_chunk = to_numpy(chunk) - onnx_offset = np.array((offset)).astype(np.int64) - onnx_required_cache_size = np.array((required_cache_size)).astype(np.int64) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - onnx_att_mask = to_numpy(att_mask) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - print("\t\tonnx chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, onnx_chunk.shape, onnx_offset, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - onnx_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'offset': onnx_offset, - 'required_cache_size': onnx_required_cache_size, - 'att_cache': onnx_att_cache, 'cnn_cache': onnx_cnn_cache, - 'att_mask': onnx_att_mask - } - # NOTE(xcsong): If we use 16/-1, -1/-1 or 16/0 mode, `next_cache_start` - # will be hardcoded to 0 or chunk_size by ONNX, thus - # required_cache_size and att_mask are no more needed and they will - # be removed by ONNX automatically. 
- for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_offset += ort_outs[0].shape[1] - onnx_output = np.concatenate(onnx_output, axis=1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-05) - meta = ort_session.get_modelmeta() - print("\t\tcustom_metadata_map={}".format(meta.custom_metadata_map)) - print("\t\tCheck onnx_encoder, pass!") - - -def export_ctc(asr_model, args): - print("Stage-2: export ctc") - ctc = asr_model.ctc - ctc.forward = ctc.log_softmax - ctc_outpath = os.path.join(args['output_dir'], 'ctc.onnx') - - print("\tStage-2.1: prepare inputs for ctc") - hidden = torch.randn( - (args['batch'], args['chunk_size'] if args['chunk_size'] > 0 else 16, - args['output_size'])) - - print("\tStage-2.2: torch.onnx.export") - dynamic_axes = {'hidden': {1: 'T'}, 'probs': {1: 'T'}} - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for (k, v) in args.items(): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - # Dynamic quantization - model_fp32 = ctc_outpath - model_quant = os.path.join(args['output_dir'], 'ctc.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_ctc, done! see {}'.format(ctc_outpath)) - - print("\tStage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_ctc, pass!") - - -def export_decoder(asr_model, args): - print("Stage-3: export decoder") - decoder = asr_model - # NOTE(lzhin): parameters of encoder will be automatically removed - # since they are not used during rescoring. - decoder.forward = decoder.forward_attention_decoder - decoder_outpath = os.path.join(args['output_dir'], 'decoder.onnx') - - print("\tStage-3.1: prepare inputs for decoder") - # hardcode time->200 nbest->10 len->20, they are dynamic axes. 
- encoder_out = torch.randn((1, 200, args['output_size'])) - hyps = torch.randint(low=0, high=args['vocab_size'], - size=[10, 20]) - hyps[:, 0] = args['vocab_size'] - 1 # - hyps_lens = torch.randint(low=15, high=21, size=[10]) - - print("\tStage-3.2: torch.onnx.export") - dynamic_axes = { - 'hyps': {0: 'NBEST', 1: 'L'}, 'hyps_lens': {0: 'NBEST'}, - 'encoder_out': {1: 'T'}, - 'score': {0: 'NBEST', 1: 'L'}, 'r_score': {0: 'NBEST', 1: 'L'} - } - inputs = (hyps, hyps_lens, encoder_out, args['reverse_weight']) - torch.onnx.export( - decoder, inputs, decoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hyps', 'hyps_lens', 'encoder_out', 'reverse_weight'], - output_names=['score', 'r_score'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_decoder = onnx.load(decoder_outpath) - for (k, v) in args.items(): - meta = onnx_decoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_decoder) - onnx.helper.printable_graph(onnx_decoder.graph) - onnx.save(onnx_decoder, decoder_outpath) - print_input_output_info(onnx_decoder, "onnx_decoder") - model_fp32 = decoder_outpath - model_quant = os.path.join(args['output_dir'], 'decoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_decoder, done! see {}'.format( - decoder_outpath)) - - print("\tStage-3.3: check onnx_decoder and torch_decoder") - torch_score, torch_r_score = decoder( - hyps, hyps_lens, encoder_out, args['reverse_weight']) - ort_session = onnxruntime.InferenceSession(decoder_outpath) - input_names = [node.name for node in onnx_decoder.graph.input] - ort_inputs = { - 'hyps': to_numpy(hyps), - 'hyps_lens': to_numpy(hyps_lens), - 'encoder_out': to_numpy(encoder_out), - 'reverse_weight': np.array((args['reverse_weight'])), - } - for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - onnx_output = ort_session.run(None, ort_inputs) - - np.testing.assert_allclose(to_numpy(torch_score), onnx_output[0], - rtol=1e-03, atol=1e-05) - if args['is_bidirectional_decoder'] and args['reverse_weight'] > 0.0: - np.testing.assert_allclose(to_numpy(torch_r_score), onnx_output[1], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_decoder, pass!") - - -def main(): - torch.manual_seed(777) - args = get_args() - output_dir = args.output_dir - os.system("mkdir -p " + output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - arguments = {} - arguments['output_dir'] = output_dir - arguments['batch'] = 1 - arguments['chunk_size'] = args.chunk_size - arguments['left_chunks'] = args.num_decoding_left_chunks - arguments['reverse_weight'] = args.reverse_weight - arguments['output_size'] = configs['encoder_conf']['output_size'] - arguments['num_blocks'] = configs['encoder_conf']['num_blocks'] - arguments['cnn_module_kernel'] = configs['encoder_conf'].get('cnn_module_kernel', 1) - arguments['head'] = configs['encoder_conf']['attention_heads'] - arguments['feature_size'] = configs['input_dim'] - arguments['vocab_size'] = configs['output_dim'] - # NOTE(xcsong): if chunk_size == -1, hardcode to 67 - arguments['decoding_window'] = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 if args.chunk_size > 0 else 67 - arguments['encoder'] = configs['encoder'] - 
arguments['decoder'] = configs['decoder'] - arguments['subsampling_rate'] = model.subsampling_rate() - arguments['right_context'] = model.right_context() - arguments['sos_symbol'] = model.sos_symbol() - arguments['eos_symbol'] = model.eos_symbol() - arguments['is_bidirectional_decoder'] = 1 \ - if model.is_bidirectional_decoder() else 0 - - # NOTE(xcsong): Please note that -1/-1 means non-streaming model! It is - # not a [16/4 16/-1 16/0] all-in-one model and it should not be used in - # streaming mode (i.e., setting chunk_size=16 in `decoder_main`). If you - # want to use 16/-1 or any other streaming mode in `decoder_main`, - # please export onnx in the same config. - if arguments['left_chunks'] > 0: - assert arguments['chunk_size'] > 0 # -1/4 not supported - - export_encoder(model, arguments) - export_ctc(model, arguments) - export_decoder(model, arguments) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/export_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/export_onnx_gpu.py deleted file mode 100644 index 19f810c2804efdf74ff369f780fa3102e2e389fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/export_onnx_gpu.py +++ /dev/null @@ -1,1056 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import os -import sys - -import torch -import yaml -import logging - -import torch.nn.functional as F -from wenet.utils.checkpoint import load_checkpoint -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import BaseEncoder -from wenet.utils.init_model import init_model -from wenet.utils.mask import make_pad_mask - -try: - import onnxruntime -except ImportError: - print('Please install onnxruntime-gpu!') - sys.exit(1) - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class Encoder(torch.nn.Module): - def __init__(self, - encoder: BaseEncoder, - ctc: CTC, - beam_size: int = 10): - super().__init__() - self.encoder = encoder - self.ctc = ctc - self.beam_size = beam_size - - def forward(self, speech: torch.Tensor, - speech_lengths: torch.Tensor,): - """Encoder - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - Returns: - encoder_out: B x T x F - encoder_out_lens: B - ctc_log_probs: B x T x V - beam_log_probs: B x T x beam_size - beam_log_probs_idx: B x T x beam_size - """ - encoder_out, encoder_mask = self.encoder(speech, - speech_lengths, - -1, -1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_log_probs = self.ctc.log_softmax(encoder_out) - encoder_out_lens = encoder_out_lens.int() - beam_log_probs, beam_log_probs_idx = torch.topk( - ctc_log_probs, self.beam_size, dim=2) - return encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx - - -class StreamingEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size, transformer=False): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.transformer = transformer - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoder.encoders): - xs, _, new_att_cache, new_cnn_cache = layer( - xs, masks, pos_emb, - att_cache=att_cache[i], - cnn_cache=cnn_cache[i]) - # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (B, hidden-dim, cache_t2) - r_att_cache.append( - new_att_cache[:, :, next_cache_start:, :].unsqueeze(1)) - if not self.transformer: - r_cnn_cache.append(new_cnn_cache.unsqueeze(1)) - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - if not self.transformer: - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingSqueezeformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.reduce_idx = model.encoder.reduce_idx - self.recover_idx = model.encoder.recover_idx - if self.reduce_idx is None: - self.time_reduce = None - else: - if self.recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - 
recover_exp)) - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - elayers, cache_size = att_cache.size(0), att_cache.size(3) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = att_mask[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.encoder.preln(xs) - for i, layer in enumerate(self.encoder.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append( - (xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.encoder.time_reduction_layer( - xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - if self.encoder.pos_enc_layer_type == "rel_pos_repaired": - pos_emb = 
pos_emb[:, :xs.size(1) * 2 - 1, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.encoder.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(1) - cached_att = cached_att.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1)) - r_cnn_cache.append(cached_cnn) - - chunk_out = xs - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingEfficientConformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - - # Efficient Conformer - self.stride_layer_idx = model.encoder.stride_layer_idx - self.stride = model.encoder.stride - self.num_blocks = model.encoder.num_blocks - self.cnn_module_kernel = model.encoder.cnn_module_kernel - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - chunk_lens (torch.Tensor): - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * 
num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) # (b, ) - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - # Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2) - # Shape(cnn_cache): (elayers, b, outsize, cnn_kernel) - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = chunk_mask.to(torch.bool) - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoder.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - # We propose to double the chunk_size. 
- att_cache_trunc = xs.size(1) + \ - att_cache.size(3) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i][:, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [batch, 1, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID - - # use repeat_interleave to new_att_cache - # new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - new_att_cache = new_att_cache.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :].unsqueeze(1)) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - # shape of r_att_cache: (b, elayers, head, time2, outdim) - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - # shape of r_cnn_cache: (b, elayers, outdim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - chunk_out_lens += 1 - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class Decoder(torch.nn.Module): - def __init__(self, - decoder: TransformerDecoder, - ctc_weight: float = 0.5, - reverse_weight: float = 0.0, - beam_size: int = 10, - decoder_fastertransformer: bool = False): - super().__init__() - self.decoder = decoder - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - self.beam_size = beam_size - self.decoder_fastertransformer = decoder_fastertransformer - - def forward(self, - encoder_out: torch.Tensor, - encoder_lens: torch.Tensor, - hyps_pad_sos_eos: torch.Tensor, - hyps_lens_sos: torch.Tensor, - r_hyps_pad_sos_eos: torch.Tensor, - ctc_score: torch.Tensor): - """Encoder - Args: - encoder_out: B x T x F - encoder_lens: B - hyps_pad_sos_eos: B x beam x (T2+1), - hyps with sos & eos and padded by ignore id - hyps_lens_sos: B x beam, length for each hyp with sos - r_hyps_pad_sos_eos: B 
x beam x (T2+1), - reversed hyps with sos & eos and padded by ignore id - ctc_score: B x beam, ctc score for each hyp - Returns: - decoder_out: B x beam x T2 x V - r_decoder_out: B x beam x T2 x V - best_index: B - """ - B, T, F = encoder_out.shape - bz = self.beam_size - B2 = B * bz - encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F) - encoder_mask = ~make_pad_mask(encoder_lens, T).unsqueeze(1) - encoder_mask = encoder_mask.repeat(1, bz, 1).view(B2, 1, T) - T2 = hyps_pad_sos_eos.shape[2] - 1 - hyps_pad = hyps_pad_sos_eos.view(B2, T2 + 1) - hyps_lens = hyps_lens_sos.view(B2,) - hyps_pad_sos = hyps_pad[:, :-1].contiguous() - hyps_pad_eos = hyps_pad[:, 1:].contiguous() - - r_hyps_pad = r_hyps_pad_sos_eos.view(B2, T2 + 1) - r_hyps_pad_sos = r_hyps_pad[:, :-1].contiguous() - r_hyps_pad_eos = r_hyps_pad[:, 1:].contiguous() - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad_sos, hyps_lens, r_hyps_pad_sos, - self.reverse_weight) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - V = decoder_out.shape[-1] - decoder_out = decoder_out.view(B2, T2, V) - mask = ~make_pad_mask(hyps_lens, T2) # B2 x T2 - # mask index, remove ignore id - index = torch.unsqueeze(hyps_pad_eos * mask, 2) - score = decoder_out.gather(2, index).squeeze(2) # B2 X T2 - # mask padded part - score = score * mask - decoder_out = decoder_out.view(B, bz, T2, V) - if self.reverse_weight > 0: - r_decoder_out = torch.nn.functional.log_softmax( - r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.view(B2, T2, V) - index = torch.unsqueeze(r_hyps_pad_eos * mask, 2) - r_score = r_decoder_out.gather(2, index).squeeze(2) - r_score = r_score * mask - score = score * (1 - self.reverse_weight) + \ - self.reverse_weight * r_score - r_decoder_out = r_decoder_out.view(B, bz, T2, V) - score = torch.sum(score, axis=1) # B2 - score = torch.reshape(score, (B, bz)) + self.ctc_weight * ctc_score - best_index = torch.argmax(score, dim=1) - if self.decoder_fastertransformer: - return decoder_out, best_index - else: - return best_index - - -def to_numpy(tensors): - out = [] - if type(tensors) == torch.tensor: - tensors = [tensors] - for tensor in tensors: - if tensor.requires_grad: - tensor = tensor.detach().cpu().numpy() - else: - tensor = tensor.cpu().numpy() - out.append(tensor) - return out - - -def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True): - for a, b in zip(xlist, blist): - try: - torch.testing.assert_allclose(a, b, rtol=rtol, atol=atol) - except AssertionError as error: - if tolerate_small_mismatch: - print(error) - else: - raise - - -def export_offline_encoder(model, configs, args, logger, encoder_onnx_path): - bz = 32 - seq_len = 100 - beam_size = args.beam_size - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint( - low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - dynamic_axes={ - 'speech': {0: 'B', 1: 'T'}, - 'speech_lengths': {0: 'B'}, - 'encoder_out': {0: 'B', 1: 'T_OUT'}, - 'encoder_out_lens': {0: 'B'}, - 'ctc_log_probs': {0: 'B', 1: 'T_OUT'}, - 'beam_log_probs': {0: 'B', 1: 
'T_OUT'}, - 'beam_log_probs_idx': {0: 'B', 1: 'T_OUT'}, - }, - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - -def export_offline_encoder_static(model, configs, args, logger, encoder_onnx_path): - bz = args.batch_size - seq_len = args.seq_len - beam_size = args.beam_size - - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint(low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - import os - file_name, file_ext = os.path.splitext(encoder_onnx_path) - encoder_onnx_path = file_name + "_bs" + str(bz) + "_seq" + str(seq_len) + "_static.onnx" - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CPUExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - - -def export_online_encoder(model, configs, args, logger, encoder_onnx_path): - decoding_chunk_size = args.decoding_chunk_size - subsampling = model.encoder.embed.subsampling_rate - context = model.encoder.embed.right_context + 1 - decoding_window = (decoding_chunk_size - 1) * subsampling + context - batch_size = 32 - audio_len = decoding_window - feature_size = configs["input_dim"] - output_size = configs["encoder_conf"]["output_size"] - num_layers = configs["encoder_conf"]["num_blocks"] - # in transformer the cnn module will not be available - transformer = False - cnn_module_kernel = configs["encoder_conf"].get("cnn_module_kernel", 1) - 1 - if not cnn_module_kernel: - transformer = True - num_decoding_left_chunks = args.num_decoding_left_chunks - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if configs['encoder'] == 'squeezeformer': - encoder = StreamingSqueezeformerEncoder( - model, required_cache_size, args.beam_size) - elif configs['encoder'] == 'efficientConformer': - encoder = StreamingEfficientConformerEncoder( - model, required_cache_size, args.beam_size) - else: - encoder = StreamingEncoder( - model, required_cache_size, args.beam_size, transformer) - encoder.eval() - - # begin to export encoder - chunk_xs = 
torch.randn(batch_size, audio_len, - feature_size, dtype=torch.float32) - chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len - - offset = torch.arange(0, batch_size).unsqueeze(1) - # (elayers, b, head, cache_t1, d_k * 2) - head = configs["encoder_conf"]["attention_heads"] - d_k = configs["encoder_conf"]["output_size"] // head - att_cache = torch.randn(batch_size, num_layers, head, - required_cache_size, d_k * 2, - dtype=torch.float32) - cnn_cache = torch.randn(batch_size, num_layers, output_size, - cnn_module_kernel, dtype=torch.float32) - - cache_mask = torch.ones( - batch_size, 1, required_cache_size, dtype=torch.float32) - input_names = ['chunk_xs', 'chunk_lens', 'offset', - 'att_cache', 'cnn_cache', 'cache_mask'] - output_names = ['log_probs', 'log_probs_idx', 'chunk_out', - 'chunk_out_lens', 'r_offset', 'r_att_cache', - 'r_cnn_cache', 'r_cache_mask'] - input_tensors = (chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - output_names.pop(6) - - all_names = input_names + output_names - dynamic_axes = {} - for name in all_names: - # only the first dimension is dynamic - # all other dimension is fixed - dynamic_axes[name] = {0: 'B'} - - torch.onnx.export(encoder, - input_tensors, - encoder_onnx_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - verbose=False) - - with torch.no_grad(): - torch_outs = encoder(chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - torch_outs = list(torch_outs).pop(6) - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=["CUDAExecutionProvider"]) - ort_inputs = {} - - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - if transformer: - del ort_inputs['cnn_cache'] - ort_outs = ort_session.run(None, ort_inputs) - test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx streaming encoder succeed!") - onnx_config = { - "subsampling_rate": subsampling, - "context": context, - "decoding_chunk_size": decoding_chunk_size, - "num_decoding_left_chunks": num_decoding_left_chunks, - "beam_size": args.beam_size, - "fp16": args.fp16, - "feat_size": feature_size, - "decoding_window": decoding_window, - "cnn_module_kernel_cache": cnn_module_kernel - } - return onnx_config - - -def export_rescoring_decoder(model, configs, args, - logger, decoder_onnx_path, decoder_fastertransformer): - bz, seq_len = 32, 100 - beam_size = args.beam_size - decoder = Decoder(model.decoder, - model.ctc_weight, - model.reverse_weight, - beam_size, - decoder_fastertransformer) - decoder.eval() - - hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - hyps_lens_sos = torch.randint(low=3, high=seq_len, size=(bz, beam_size), - dtype=torch.int32) - r_hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - - output_size = configs["encoder_conf"]["output_size"] - encoder_out = torch.randn(bz, seq_len, output_size, dtype=torch.float32) - encoder_out_lens = torch.randint( - low=3, high=seq_len, size=(bz,), dtype=torch.int32) - ctc_score = torch.randn(bz, beam_size, dtype=torch.float32) - - input_names = ['encoder_out', 'encoder_out_lens', - 'hyps_pad_sos_eos', 'hyps_lens_sos', - 'r_hyps_pad_sos_eos', 'ctc_score'] - output_names = ['best_index'] - if decoder_fastertransformer: - output_names.insert(0, 'decoder_out') - - 
torch.onnx.export(decoder, - (encoder_out, encoder_out_lens, - hyps_pad_sos_eos, hyps_lens_sos, - r_hyps_pad_sos_eos, ctc_score), - decoder_onnx_path, - export_params=True, - opset_version=13, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes={'encoder_out': {0: 'B', 1: 'T'}, - 'encoder_out_lens': {0: 'B'}, - 'hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'hyps_lens_sos': {0: 'B'}, - 'r_hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'ctc_score': {0: 'B'}, - 'best_index': {0: 'B'}, - }, - verbose=False - ) - with torch.no_grad(): - o0 = decoder(encoder_out, - encoder_out_lens, - hyps_pad_sos_eos, - hyps_lens_sos, - r_hyps_pad_sos_eos, - ctc_score) - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(decoder_onnx_path, - providers=providers) - - input_tensors = [encoder_out, encoder_out_lens, hyps_pad_sos_eos, - hyps_lens_sos, r_hyps_pad_sos_eos, ctc_score] - ort_inputs = {} - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - - # if model.reverse weight == 0, - # the r_hyps_pad will be removed - # from the onnx decoder since it doen't play any role - if model.reverse_weight == 0: - del ort_inputs['r_hyps_pad_sos_eos'] - ort_outs = ort_session.run(None, ort_inputs) - - # check decoder output - if decoder_fastertransformer: - test(to_numpy(o0), ort_outs, rtol=1e-03, atol=1e-05) - else: - test(to_numpy([o0]), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx decoder succeed!") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='export x86_gpu model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--cmvn_file', required=False, default='', type=str, - help='global_cmvn file, default path is in config file') - parser.add_argument('--reverse_weight', default=-1.0, type=float, - required=False, - help='reverse weight for bitransformer,' + - 'default value is in config file') - parser.add_argument('--ctc_weight', default=-1.0, type=float, - required=False, - help='ctc weight, default value is in config file') - parser.add_argument('--batch_size', type=int, default=24, help='encoder batch size') - parser.add_argument('--seq_len', default=512, type=int, required=False, - help="Encoder seq_len") - parser.add_argument('--beam_size', default=10, type=int, required=False, - help="beam size would be ctc output size") - parser.add_argument('--output_onnx_dir', - default="onnx_model", - help='output onnx encoder and decoder directory') - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - # arguments for streaming encoder - parser.add_argument('--streaming', - action='store_true', - help="whether to export streaming encoder, default false") - parser.add_argument('--decoding_chunk_size', - default=16, - type=int, - required=False, - help='the decoding chunk size, <=0 is not supported') - parser.add_argument('--num_decoding_left_chunks', - default=5, - type=int, - required=False, - help="number of left chunks, <= 0 is not supported") - parser.add_argument('--decoder_fastertransformer', - action='store_true', - help='return decoder_out and best_index for ft') - args = parser.parse_args() - - torch.manual_seed(0) - torch.set_printoptions(precision=10) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if 
args.cmvn_file and os.path.exists(args.cmvn_file): - configs['cmvn_file'] = args.cmvn_file - if args.reverse_weight != -1.0 and 'reverse_weight' in configs['model_conf']: - configs['model_conf']['reverse_weight'] = args.reverse_weight - print("Update reverse weight to", args.reverse_weight) - if args.ctc_weight != -1: - print("Update ctc weight to ", args.ctc_weight) - configs['model_conf']['ctc_weight'] = args.ctc_weight - configs["encoder_conf"]["use_dynamic_chunk"] = False - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - - if not os.path.exists(args.output_onnx_dir): - os.mkdir(args.output_onnx_dir) - encoder_onnx_path = os.path.join(args.output_onnx_dir, 'encoder.onnx') - export_enc_func = None - if args.streaming: - assert args.decoding_chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - export_enc_func = export_online_encoder - else: - export_enc_func = export_offline_encoder_static - - onnx_config = export_enc_func( - model, configs, args, logger, encoder_onnx_path) - - decoder_onnx_path = os.path.join(args.output_onnx_dir, 'decoder.onnx') - export_rescoring_decoder(model, configs, args, logger, - decoder_onnx_path, args.decoder_fastertransformer) - - if args.fp16: - try: - import onnxmltools - from onnxmltools.utils.float16_converter import convert_float_to_float16 - except ImportError: - print('Please install onnxmltools!') - sys.exit(1) - encoder_onnx_model = onnxmltools.utils.load_model(encoder_onnx_path) - encoder_onnx_model = convert_float_to_float16(encoder_onnx_model) - encoder_onnx_path = os.path.join( - args.output_onnx_dir, 'encoder_fp16.onnx') - onnxmltools.utils.save_model(encoder_onnx_model, encoder_onnx_path) - decoder_onnx_model = onnxmltools.utils.load_model(decoder_onnx_path) - decoder_onnx_model = convert_float_to_float16(decoder_onnx_model) - decoder_onnx_path = os.path.join( - args.output_onnx_dir, 'decoder_fp16.onnx') - onnxmltools.utils.save_model(decoder_onnx_model, decoder_onnx_path) - # dump configurations - - config_dir = os.path.join(args.output_onnx_dir, "config.yaml") - with open(config_dir, "w") as out: - yaml.dump(onnx_config, out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/recognize.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/recognize.py deleted file mode 100644 index 03b5dfd42cc098efacd20e08756a5300f6477cc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/recognize.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--beam_size', - type=int, - default=10, - help='beam size for search') - parser.add_argument('--penalty', - type=float, - default=0.0, - help='length penalty') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=16, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'attention', 'ctc_greedy_search', - 'ctc_prefix_beam_search', 'attention_rescoring', - 'rnnt_greedy_search', 'rnnt_beam_search', - 'rnnt_beam_attn_rescoring', 'ctc_beam_td_attn_rescoring', - 'hlg_onebest', 'hlg_rescore' - ], - default='attention', - help='decoding mode') - - parser.add_argument('--search_ctc_weight', - type=float, - default=1.0, - help='ctc weight for nbest generation') - parser.add_argument('--search_transducer_weight', - type=float, - default=0.0, - help='transducer weight for nbest generation') - parser.add_argument('--ctc_weight', - type=float, - default=0.0, - help='ctc weight for rescoring weight in \ - attention rescoring decode mode \ - ctc weight for rescoring weight in \ - transducer attention rescore decode mode') - - parser.add_argument('--transducer_weight', - type=float, - default=0.0, - help='transducer weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--attn_weight', - type=float, - default=0.0, - help='attention weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--decoding_chunk_size', - type=int, - default=-1, - help='''decoding chunk size, - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here''') - parser.add_argument('--num_decoding_left_chunks', - type=int, - default=-1, - help='number of left chunks for decoding') - parser.add_argument('--simulate_streaming', - action='store_true', - help='simulate streaming inference') - parser.add_argument('--reverse_weight', - type=float, - default=0.0, - help='''right to left weight for attention rescoring - decode mode''') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--connect_symbol', - default='', - type=str, - help='used to connect the output characters') - - parser.add_argument('--word', - default='', - type=str, - help='word file, only used for hlg decode') - parser.add_argument('--hlg', - default='', - type=str, - help='hlg file, only used for hlg decode') - parser.add_argument('--lm_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--r_decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' - ] and args.batch_size > 1: - logging.fatal( - 'decoding mode {} must be running with batch_size == 1'.format( - args.mode)) - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_sub'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - if 'fbank_conf' in test_conf: - test_conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in test_conf: - test_conf['mfcc_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - # Load dict - char_dict = {v: k for k, v in symbol_table.items()} - eos = len(char_dict) - 1 - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w') as fout: - for batch_idx, 
batch in enumerate(test_data_loader): - keys, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - if args.mode == 'attention': - hyps, _ = model.recognize( - feats, - feats_lengths, - beam_size=args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp.tolist() for hyp in hyps] - elif args.mode == 'ctc_greedy_search': - hyps, _ = model.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_greedy_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_beam_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.beam_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.search_ctc_weight, - transducer_weight=args.search_transducer_weight) - elif args.mode == 'rnnt_beam_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight) - elif args.mode == 'ctc_beam_td_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight, - beam_search_type='ctc') - # ctc_prefix_beam_search and attention_rescoring only return one - # result in List[int], change it to List[List[int]] for compatible - # with other batch decoding mode - elif args.mode == 'ctc_prefix_beam_search': - assert (feats.size(0) == 1) - hyp, _ = model.ctc_prefix_beam_search( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp] - elif args.mode == 'attention_rescoring': - assert (feats.size(0) == 1) - hyp, _ = model.attention_rescoring( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - 
ctc_weight=args.ctc_weight, - simulate_streaming=args.simulate_streaming, - reverse_weight=args.reverse_weight) - hyps = [hyp] - elif args.mode == 'hlg_onebest': - hyps = model.hlg_onebest( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - elif args.mode == 'hlg_rescore': - hyps = model.hlg_rescore( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - lm_scale=args.lm_scale, - decoder_scale=args.decoder_scale, - r_decoder_scale=args.r_decoder_scale, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - for i, key in enumerate(keys): - content = [] - for w in hyps[i]: - if w == eos: - break - content.append(char_dict[w]) - logging.info('{} {}'.format(key, args.connect_symbol.join(content))) - fout.write('{} {}\n'.format(key, args.connect_symbol.join(content))) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/recognize_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/recognize_onnx_gpu.py deleted file mode 100644 index 42f403bf55ac0bc51d9c754574d3479345948122..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/recognize_onnx_gpu.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is for testing exported onnx encoder and decoder from -export_onnx_gpu.py. The exported onnx models only support batch offline ASR inference. -It requires a python wrapped c++ ctc decoder. 
-Please install it by following: -https://github.com/Slyne/ctc_decoder.git -""" -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.common import IGNORE_ID -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.config import override_config - -import onnxruntime as rt -import multiprocessing -import numpy as np - -try: - from swig_decoders import map_batch, \ - ctc_beam_search_decoder_batch, \ - TrieVector, PathTrie -except ImportError: - print('Please install ctc decoders first by refering to\n' + - 'https://github.com/Slyne/ctc_decoder.git') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--encoder_onnx', required=True, help='encoder onnx file') - parser.add_argument('--decoder_onnx', required=True, help='decoder onnx file') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=32, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'ctc_greedy_search', 'ctc_prefix_beam_search', - 'attention_rescoring'], - default='attention_rescoring', - help='decoding mode') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - reverse_weight = configs["model_conf"].get("reverse_weight", 0.0) - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - test_conf['fbank_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - 
- # Init asr model from configs - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - if use_cuda: - EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - else: - EP_list = ['CPUExecutionProvider'] - - encoder_ort_session = rt.InferenceSession(args.encoder_onnx, providers=EP_list) - decoder_ort_session = None - if args.mode == "attention_rescoring": - decoder_ort_session = rt.InferenceSession(args.decoder_onnx, providers=EP_list) - - # Load dict - vocabulary = [] - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - vocabulary.append(arr[0]) - eos = sos = len(char_dict) - 1 - with torch.no_grad(), open(args.result_file, 'w') as fout: - for _, batch in enumerate(test_data_loader): - keys, feats, _, feats_lengths, _ = batch - feats, feats_lengths = feats.numpy(), feats_lengths.numpy() - if args.fp16: - feats = feats.astype(np.float16) - ort_inputs = { - encoder_ort_session.get_inputs()[0].name: feats, - encoder_ort_session.get_inputs()[1].name: feats_lengths} - ort_outs = encoder_ort_session.run(None, ort_inputs) - encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx = ort_outs - beam_size = beam_log_probs.shape[-1] - batch_size = beam_log_probs.shape[0] - num_processes = min(multiprocessing.cpu_count(), batch_size) - if args.mode == 'ctc_greedy_search': - if beam_size != 1: - log_probs_idx = beam_log_probs_idx[:, :, 0] - batch_sents = [] - for idx, seq in enumerate(log_probs_idx): - batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) - hyps = map_batch(batch_sents, vocabulary, num_processes, - True, 0) - elif args.mode in ('ctc_prefix_beam_search', "attention_rescoring"): - batch_log_probs_seq_list = beam_log_probs.tolist() - batch_log_probs_idx_list = beam_log_probs_idx.tolist() - batch_len_list = encoder_out_lens.tolist() - batch_log_probs_seq = [] - batch_log_probs_ids = [] - batch_start = [] # only effective in streaming deployment - batch_root = TrieVector() - root_dict = {} - for i in range(len(batch_len_list)): - num_sent = batch_len_list[i] - batch_log_probs_seq.append( - batch_log_probs_seq_list[i][0:num_sent]) - batch_log_probs_ids.append( - batch_log_probs_idx_list[i][0:num_sent]) - root_dict[i] = PathTrie() - batch_root.append(root_dict[i]) - batch_start.append(True) - score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq, - batch_log_probs_ids, - batch_root, - batch_start, - beam_size, - num_processes, - 0, -2, 0.99999) - if args.mode == 'ctc_prefix_beam_search': - hyps = [] - for cand_hyps in score_hyps: - hyps.append(cand_hyps[0][1]) - hyps = map_batch(hyps, vocabulary, num_processes, False, 0) - if args.mode == 'attention_rescoring': - ctc_score, all_hyps = [], [] - max_len = 0 - for hyps in score_hyps: - cur_len = len(hyps) - if len(hyps) < beam_size: - hyps += (beam_size - cur_len) * [(-float("INF"), (0,))] - cur_ctc_score = [] - for hyp in hyps: - cur_ctc_score.append(hyp[0]) - all_hyps.append(list(hyp[1])) - if len(hyp[1]) > max_len: - max_len = len(hyp[1]) - ctc_score.append(cur_ctc_score) - if args.fp16: - ctc_score = np.array(ctc_score, dtype=np.float16) - else: - ctc_score = np.array(ctc_score, dtype=np.float32) - hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - r_hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32) - k = 0 - for i in 
range(batch_size): - for j in range(beam_size): - cand = all_hyps[k] - l = len(cand) + 2 - hyps_pad_sos_eos[i][j][0:l] = [sos] + cand + [eos] - r_hyps_pad_sos_eos[i][j][0:l] = [sos] + cand[::-1] + [eos] - hyps_lens_sos[i][j] = len(cand) + 1 - k += 1 - decoder_ort_inputs = { - decoder_ort_session.get_inputs()[0].name: encoder_out, - decoder_ort_session.get_inputs()[1].name: encoder_out_lens, - decoder_ort_session.get_inputs()[2].name: hyps_pad_sos_eos, - decoder_ort_session.get_inputs()[3].name: hyps_lens_sos, - decoder_ort_session.get_inputs()[-1].name: ctc_score} - if reverse_weight > 0: - r_hyps_pad_sos_eos_name = decoder_ort_session.get_inputs()[4].name - decoder_ort_inputs[r_hyps_pad_sos_eos_name] = r_hyps_pad_sos_eos - best_index = decoder_ort_session.run(None, decoder_ort_inputs)[0] - best_sents = [] - k = 0 - for idx in best_index: - cur_best_sent = all_hyps[k: k + beam_size][idx] - best_sents.append(cur_best_sent) - k += beam_size - hyps = map_batch(best_sents, vocabulary, num_processes) - - for i, key in enumerate(keys): - content = hyps[i] - logging.info('{} {}'.format(key, content)) - fout.write('{} {}\n'.format(key, content)) - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/train.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/train.py deleted file mode 100644 index 70799b60790b31d73911770891f519f5473e2f4b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/bin/train.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os - -import torch -import torch.distributed as dist -import torch.optim as optim -import yaml -from tensorboardX import SummaryWriter -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import (load_checkpoint, save_checkpoint, - load_trained_modules) -from wenet.utils.executor import Executor -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.scheduler import WarmupLR, NoamHoldAnnealing -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--train_data', required=True, help='train data file') - parser.add_argument('--cv_data', required=True, help='cv data file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--model_dir', required=True, help='save model dir') - parser.add_argument('--checkpoint', help='checkpoint model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='tensorboard log dir') - parser.add_argument('--ddp.rank', - dest='rank', - default=0, - type=int, - help='global rank for distributed training') - parser.add_argument('--ddp.world_size', - dest='world_size', - default=-1, - type=int, - help='''number of total processes/gpus for - distributed training''') - parser.add_argument('--ddp.dist_backend', - dest='dist_backend', - default='nccl', - choices=['nccl', 'gloo'], - help='distributed backend') - parser.add_argument('--ddp.init_method', - dest='init_method', - default=None, - help='ddp init method') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for reading') - parser.add_argument('--pin_memory', - action='store_true', - default=False, - help='Use pinned memory buffers used for reading') - parser.add_argument('--use_amp', - action='store_true', - default=False, - help='Use automatic mixed precision training') - parser.add_argument('--fp16_grad_sync', - action='store_true', - default=False, - help='Use fp16 gradient sync for ddp') - parser.add_argument('--cmvn', default=None, help='global cmvn file') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. 
One symbol per line.") - parser.add_argument('--prefetch', - default=100, - type=int, - help='prefetch number') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument("--enc_init", - default=None, - type=str, - help="Pre-trained model to initialize encoder") - parser.add_argument("--enc_init_mods", - default="encoder.", - type=lambda s: [str(mod) for mod in s.split(",") if s != ""], - help="List of encoder modules \ - to initialize ,separated by a comma") - - - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Set random seed - torch.manual_seed(777) - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - distributed = args.world_size > 1 - if distributed: - logging.info('training on multiple gpus, this gpu {}'.format(args.gpu)) - dist.init_process_group(args.dist_backend, - init_method=args.init_method, - world_size=args.world_size, - rank=args.rank) - - symbol_table = read_symbol_table(args.symbol_table) - - train_conf = configs['dataset_conf'] - cv_conf = copy.deepcopy(train_conf) - cv_conf['speed_perturb'] = False - cv_conf['spec_aug'] = False - cv_conf['spec_sub'] = False - cv_conf['spec_trim'] = False - cv_conf['shuffle'] = False - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - train_dataset = Dataset(args.data_type, args.train_data, symbol_table, - train_conf, args.bpe_model, non_lang_syms, True) - cv_dataset = Dataset(args.data_type, - args.cv_data, - symbol_table, - cv_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - train_data_loader = DataLoader(train_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - cv_data_loader = DataLoader(cv_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - - if 'fbank_conf' in configs['dataset_conf']: - input_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - else: - input_dim = configs['dataset_conf']['mfcc_conf']['num_mel_bins'] - vocab_size = len(symbol_table) - - # Save configs to model_dir/train.yaml for inference and export - configs['input_dim'] = input_dim - configs['output_dim'] = vocab_size - configs['cmvn_file'] = args.cmvn - configs['is_json_cmvn'] = True - if args.rank == 0: - saved_config_path = os.path.join(args.model_dir, 'train.yaml') - with open(saved_config_path, 'w') as fout: - data = yaml.dump(configs) - fout.write(data) - - # Init asr model from configs - model = init_model(configs) - print(model) - num_params = sum(p.numel() for p in model.parameters()) - print('the number of model params: {:,d}'.format(num_params)) - - # !!!IMPORTANT!!! 
- # Try to export the model by script, if fails, we should refine - # the code to satisfy the script export requirements - if args.rank == 0: - script_model = torch.jit.script(model) - script_model.save(os.path.join(args.model_dir, 'init.zip')) - executor = Executor() - # If specify checkpoint, load some info from checkpoint - if args.checkpoint is not None: - infos = load_checkpoint(model, args.checkpoint) - elif args.enc_init is not None: - logging.info('load pretrained encoders: {}'.format(args.enc_init)) - infos = load_trained_modules(model, args) - else: - infos = {} - start_epoch = infos.get('epoch', -1) + 1 - cv_loss = infos.get('cv_loss', 0.0) - step = infos.get('step', -1) - - num_epochs = configs.get('max_epoch', 100) - model_dir = args.model_dir - writer = None - if args.rank == 0: - os.makedirs(model_dir, exist_ok=True) - exp_id = os.path.basename(model_dir) - writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id)) - - if distributed: - assert (torch.cuda.is_available()) - # cuda model is required for nn.parallel.DistributedDataParallel - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, find_unused_parameters=True) - device = torch.device("cuda") - if args.fp16_grad_sync: - from torch.distributed.algorithms.ddp_comm_hooks import ( - default as comm_hooks, - ) - model.register_comm_hook( - state=None, hook=comm_hooks.fp16_compress_hook - ) - else: - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - if configs['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['optim_conf']) - elif configs['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['optim']) - if configs['scheduler'] == 'warmuplr': - scheduler = WarmupLR(optimizer, **configs['scheduler_conf']) - elif configs['scheduler'] == 'NoamHoldAnnealing': - scheduler = NoamHoldAnnealing(optimizer, **configs['scheduler_conf']) - else: - raise ValueError("unknown scheduler: " + configs['scheduler']) - - final_epoch = None - configs['rank'] = args.rank - configs['is_distributed'] = distributed - configs['use_amp'] = args.use_amp - if start_epoch == 0 and args.rank == 0: - save_model_path = os.path.join(model_dir, 'init.pt') - save_checkpoint(model, save_model_path) - - # Start training loop - executor.step = step - scheduler.set_step(step) - # used for pytorch amp mixed precision training - scaler = None - if args.use_amp: - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(start_epoch, num_epochs): - train_dataset.set_epoch(epoch) - configs['epoch'] = epoch - lr = optimizer.param_groups[0]['lr'] - logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr)) - executor.train(model, optimizer, scheduler, train_data_loader, device, - writer, configs, scaler) - total_loss, num_seen_utts = executor.cv(model, cv_data_loader, device, - configs) - cv_loss = total_loss / num_seen_utts - - logging.info('Epoch {} CV info cv_loss {}'.format(epoch, cv_loss)) - if args.rank == 0: - save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch)) - save_checkpoint( - model, save_model_path, { - 'epoch': epoch, - 'lr': lr, - 'cv_loss': cv_loss, - 'step': executor.step - }) - writer.add_scalar('epoch/cv_loss', cv_loss, epoch) - writer.add_scalar('epoch/lr', lr, epoch) - final_epoch = epoch - - if final_epoch is not None and args.rank == 0: - final_model_path = os.path.join(model_dir, 'final.pt') 
- os.remove(final_model_path) if os.path.exists(final_model_path) else None - os.symlink('{}.pt'.format(final_epoch), final_model_path) - writer.close() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/dataset/dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/dataset/dataset.py deleted file mode 100644 index 6d799b5b5aea2d34546484b3fed5d45e2d5b6aa6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/dataset/dataset.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import torch -import torch.distributed as dist -from torch.utils.data import IterableDataset - -import wenet.dataset.processor as processor -from wenet.utils.file_utils import read_lists - - -class Processor(IterableDataset): - def __init__(self, source, f, *args, **kw): - assert callable(f) - self.source = source - self.f = f - self.args = args - self.kw = kw - - def set_epoch(self, epoch): - self.source.set_epoch(epoch) - - def __iter__(self): - """ Return an iterator over the source dataset processed by the - given processor. 
- """ - assert self.source is not None - assert callable(self.f) - return self.f(iter(self.source), *self.args, **self.kw) - - def apply(self, f): - assert callable(f) - return Processor(self, f, *self.args, **self.kw) - - -class DistributedSampler: - def __init__(self, shuffle=True, partition=True): - self.epoch = -1 - self.update() - self.shuffle = shuffle - self.partition = partition - - def update(self): - assert dist.is_available() - if dist.is_initialized(): - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() - else: - self.rank = 0 - self.world_size = 1 - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: - self.worker_id = 0 - self.num_workers = 1 - else: - self.worker_id = worker_info.id - self.num_workers = worker_info.num_workers - return dict(rank=self.rank, - world_size=self.world_size, - worker_id=self.worker_id, - num_workers=self.num_workers) - - def set_epoch(self, epoch): - self.epoch = epoch - - def sample(self, data): - """ Sample data according to rank/world_size/num_workers - - Args: - data(List): input data list - - Returns: - List: data list after sample - """ - data = list(range(len(data))) - # TODO(Binbin Zhang): fix this - # We can not handle uneven data for CV on DDP, so we don't - # sample data by rank, that means every GPU gets the same - # and all the CV data - if self.partition: - if self.shuffle: - random.Random(self.epoch).shuffle(data) - data = data[self.rank::self.world_size] - data = data[self.worker_id::self.num_workers] - return data - - -class DataList(IterableDataset): - def __init__(self, lists, shuffle=True, partition=True): - self.lists = lists - self.sampler = DistributedSampler(shuffle, partition) - - def set_epoch(self, epoch): - self.sampler.set_epoch(epoch) - - def __iter__(self): - sampler_info = self.sampler.update() - indexes = self.sampler.sample(self.lists) - for index in indexes: - # yield dict(src=src) - data = dict(src=self.lists[index]) - data.update(sampler_info) - yield data - - -def Dataset(data_type, - data_list_file, - symbol_table, - conf, - bpe_model=None, - non_lang_syms=None, - partition=True): - """ Construct dataset from arguments - - We have two shuffle stage in the Dataset. The first is global - shuffle at shards tar/raw file level. The second is global shuffle - at training samples level. 
- - Args: - data_type(str): raw/shard - bpe_model(str): model for english bpe part - partition(bool): whether to do data partition in terms of rank - """ - assert data_type in ['raw', 'shard'] - lists = read_lists(data_list_file) - shuffle = conf.get('shuffle', True) - dataset = DataList(lists, shuffle=shuffle, partition=partition) - if data_type == 'shard': - dataset = Processor(dataset, processor.url_opener) - dataset = Processor(dataset, processor.tar_file_and_group) - else: - dataset = Processor(dataset, processor.parse_raw) - - dataset = Processor(dataset, processor.tokenize, symbol_table, bpe_model, - non_lang_syms, conf.get('split_with_space', False)) - filter_conf = conf.get('filter_conf', {}) - dataset = Processor(dataset, processor.filter, **filter_conf) - - resample_conf = conf.get('resample_conf', {}) - dataset = Processor(dataset, processor.resample, **resample_conf) - - speed_perturb = conf.get('speed_perturb', False) - if speed_perturb: - dataset = Processor(dataset, processor.speed_perturb) - - feats_type = conf.get('feats_type', 'fbank') - assert feats_type in ['fbank', 'mfcc'] - if feats_type == 'fbank': - fbank_conf = conf.get('fbank_conf', {}) - dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) - elif feats_type == 'mfcc': - mfcc_conf = conf.get('mfcc_conf', {}) - dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf) - - spec_aug = conf.get('spec_aug', True) - spec_sub = conf.get('spec_sub', False) - spec_trim = conf.get('spec_trim', False) - if spec_aug: - spec_aug_conf = conf.get('spec_aug_conf', {}) - dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) - if spec_sub: - spec_sub_conf = conf.get('spec_sub_conf', {}) - dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf) - if spec_trim: - spec_trim_conf = conf.get('spec_trim_conf', {}) - dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf) - - if shuffle: - shuffle_conf = conf.get('shuffle_conf', {}) - dataset = Processor(dataset, processor.shuffle, **shuffle_conf) - - sort = conf.get('sort', True) - if sort: - sort_conf = conf.get('sort_conf', {}) - dataset = Processor(dataset, processor.sort, **sort_conf) - - batch_conf = conf.get('batch_conf', {}) - dataset = Processor(dataset, processor.batch, **batch_conf) - dataset = Processor(dataset, processor.padding) - return dataset diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/dataset/kaldi_io.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/dataset/kaldi_io.py deleted file mode 100644 index c9bef293c93d882147bb5b738e1fc49a7a19a484..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/dataset/kaldi_io.py +++ /dev/null @@ -1,666 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! 
To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. - """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. 
- """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int_scp(file_or_fd): - """ generator(key,vec) = read_vec_int_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_int_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:vec for key,mat in kaldi_io.read_vec_int_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_int(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! 
- if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Mapping for percentiles in col-headers, - def uint16_to_float(value, min, range): - return np.float32(min + range * 1.52590218966964e-05 * value) - - # Mapping for matrix elements, - def uint8_to_float_v2(vec, p0, p25, p75, p100): - # Split the vector by masks, - mask_0_64 = (vec <= 64); - mask_193_255 = (vec > 192); - mask_65_192 = (~(mask_0_64 | mask_193_255)); - # Sanity check (useful but slow...), - # assert(len(vec) == np.sum(np.hstack([mask_0_64,mask_65_192,mask_193_255]))) - # assert(len(vec) == np.sum(np.any([mask_0_64,mask_65_192,mask_193_255], axis=0))) - # Build the float vector, - ans = np.empty(len(vec), dtype='float32') - ans[mask_0_64] = p0 + (p25 - p0) / 64. * vec[mask_0_64] - ans[mask_65_192] = p25 + (p75 - p25) / 128. * (vec[mask_65_192] - 64) - ans[mask_193_255] = p75 + (p100 - p75) / 63. * (vec[mask_193_255] - 192) - return ans - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.empty((cols,rows), dtype='float32') - for i, col_header in enumerate(col_headers): - col_header_flt = [ uint16_to_float(percentile, globmin, globrange) for percentile in col_header ] - mat[i] = uint8_to_float_v2(data[i], *col_header_flt) - - return mat.T # transpose! col-major -> row-major, - -def write_ark_scp(key, mat, ark_fout, scp_out): - mat_offset = write_mat(ark_fout, mat, key) - scp_line = '{}\t{}:{}'.format(key, ark_fout.name, mat_offset) - scp_out.write(scp_line) - scp_out.write('\n') - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. 
- - Example of writing single matrix: - kaldi_io.write_mat(filename, mat) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,mat in dict.iteritems(): - kaldi_io.write_mat(f, mat, key=key) - """ - mat_offset = 0 - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - mat_offset = fd.tell() - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if m.dtype == 'float32': fd.write('FM '.encode()) - elif m.dtype == 'float64': fd.write('DM '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) - # Dims, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols - # Data, - fd.write(m.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - return mat_offset - -################################################# -# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) -# Corresponds to: vector > > -# - outer vector: time axis -# - inner vector: records at the time -# - tuple: int = index, float = value -# - -def read_cnet_ark(file_or_fd): - """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ - return read_post_ark(file_or_fd) - -def read_post_ark(file_or_fd): - """ generator(key,vec>) = read_post_ark(file) - Returns generator of (key,posterior) tuples, read from ark file. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Iterate the ark: - for key,post in kaldi_io.read_post_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - post = read_post(fd) - yield key, post - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_post(file_or_fd): - """ [post] = read_post(file_or_fd) - Reads single kaldi 'Posterior' in binary format. - - The 'Posterior' is C++ type 'vector > >', - the outer-vector is usually time axis, inner-vector are the records - at given time, and the tuple is composed of an 'index' (integer) - and a 'float-value'. The 'float-value' can represent a probability - or any other numeric value. - - Returns vector of vectors of tuples. - """ - fd = open_or_fd(file_or_fd) - ans=[] - binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - # Loop over 'outer-vector', - for i in range(outer_vec_size): - assert(fd.read(1).decode() == '\4'); # int-size - inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) - data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) - assert(data[0]['size_idx'] == 4) - assert(data[0]['size_post'] == 4) - ans.append(data[['idx','post']].tolist()) - - if fd is not file_or_fd: fd.close() - return ans - - -################################################# -# Kaldi Confusion Network bin begin/end times, -# (kaldi stores CNs time info separately from the Posterior). 
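# A minimal usage sketch for the binary matrix writer/reader defined above,
# assuming the module is importable as `kaldi_io` (the name its own docstrings
# use) and that NumPy is available; the file name 'feats.ark' is illustrative.
import numpy as np
import kaldi_io

feats = {'utt1': np.random.rand(100, 80).astype('float32'),
         'utt2': np.random.rand(120, 80).astype('float32')}
# write_mat() expects a binary file descriptor, so open the ark with 'wb'.
with open('feats.ark', 'wb') as f:
    for key, mat in feats.items():
        kaldi_io.write_mat(f, mat, key=key)
# Read the ark back into a dict of (utterance-id -> float32 matrix).
recovered = {key: mat for key, mat in kaldi_io.read_mat_ark('feats.ark')}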
-# - -def read_cntime_ark(file_or_fd): - """ generator(key,vec>) = read_cntime_ark(file_or_fd) - Returns generator of (key,cntime) tuples, read from ark file. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Iterate the ark: - for key,time in kaldi_io.read_cntime_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:time for key,time in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - cntime = read_cntime(fd) - yield key, cntime - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_cntime(file_or_fd): - """ [cntime] = read_cntime(file_or_fd) - Reads single kaldi 'Confusion Network time info', in binary format: - C++ type: vector >. - (begin/end times of bins at the confusion network). - - Binary layout is ' ...' - - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Returns vector of tuples. - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary - - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) - assert(data[0]['size_beg'] == 4) - assert(data[0]['size_end'] == 4) - ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), - - if fd is not file_or_fd : fd.close() - return ans - - -################################################# -# Segments related, -# - -# Segments as 'Bool vectors' can be handy, -# - for 'superposing' the segmentations, -# - for frame-selection in Speaker-ID experiments, -def read_segments_as_bool_vec(segments_file): - """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) - using kaldi 'segments' file for 1 wav, format : ' ' - - t-beg, t-end is in seconds, - - assumed 100 frames/second, - """ - segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) - # Sanity checks, - assert(len(segs) > 0) # empty segmentation is an error, - assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, - # Convert time to frame-indexes, - start = np.rint([100 * rec[2] for rec in segs]).astype(int) - end = np.rint([100 * rec[3] for rec in segs]).astype(int) - # Taken from 'read_lab_to_bool_vec', htk.py, - frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], - np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) - assert np.sum(end-start) == np.sum(frms) - return frms - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/dataset/processor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/dataset/processor.py deleted file mode 100644 index b4bd07ce674eb3288cd1b13a09085eec48d40845..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/dataset/processor.py +++ /dev/null @@ -1,660 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import json -import random -import re -import tarfile -from subprocess import PIPE, Popen -from urllib.parse import urlparse - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.nn.utils.rnn import pad_sequence - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def url_opener(data): - """ Give url or local file, return file descriptor - Inplace operation. - - Args: - data(Iterable[str]): url or local file list - - Returns: - Iterable[{src, stream}] - """ - for sample in data: - assert 'src' in sample - # TODO(Binbin Zhang): support HTTP - url = sample['src'] - try: - pr = urlparse(url) - # local file - if pr.scheme == '' or pr.scheme == 'file': - stream = open(url, 'rb') - # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP - else: - cmd = f'wget -q -O - {url}' - process = Popen(cmd, shell=True, stdout=PIPE) - sample.update(process=process) - stream = process.stdout - sample.update(stream=stream) - yield sample - except Exception as ex: - logging.warning('Failed to open {}'.format(url)) - - -def tar_file_and_group(data): - """ Expand a stream of open tar files into a stream of tar file contents. - And groups the file with same prefix - - Args: - data: Iterable[{src, stream}] - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'stream' in sample - stream = tarfile.open(fileobj=sample['stream'], mode="r|*") - prev_prefix = None - example = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - example['key'] = prev_prefix - if valid: - yield example - example = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - example['txt'] = file_obj.read().decode('utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load(file_obj) - example['wav'] = waveform - example['sample_rate'] = sample_rate - else: - example[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning('error to parse {}'.format(name)) - prev_prefix = prefix - if prev_prefix is not None: - example['key'] = prev_prefix - yield example - stream.close() - if 'process' in sample: - sample['process'].communicate() - sample['stream'].close() - - -def parse_raw(data): - """ Parse key/wav/txt from json line - - Args: - data: Iterable[str], str is a json line has key/wav/txt - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'src' in sample - json_line = sample['src'] - obj = json.loads(json_line) - assert 'key' in obj - assert 'wav' in obj - assert 'txt' in obj - key = obj['key'] - wav_file = obj['wav'] - txt = obj['txt'] - try: - if 'start' in obj: - assert 'end' in obj - sample_rate = torchaudio.backend.sox_io_backend.info( - wav_file).sample_rate - start_frame = int(obj['start'] * sample_rate) - end_frame = int(obj['end'] * sample_rate) - waveform, _ = torchaudio.backend.sox_io_backend.load( - 
filepath=wav_file, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(wav_file) - example = dict(key=key, - txt=txt, - wav=waveform, - sample_rate=sample_rate) - yield example - except Exception as ex: - logging.warning('Failed to read {}'.format(wav_file)) - - -def filter(data, - max_length=10240, - min_length=10, - token_max_length=200, - token_min_length=1, - min_output_input_ratio=0.0005, - max_output_input_ratio=1): - """ Filter sample according to feature and label length - Inplace operation. - - Args:: - data: Iterable[{key, wav, label, sample_rate}] - max_length: drop utterance which is greater than max_length(10ms) - min_length: drop utterance which is less than min_length(10ms) - token_max_length: drop utterance which is greater than - token_max_length, especially when use char unit for - english modeling - token_min_length: drop utterance which is - less than token_max_length - min_output_input_ratio: minimal ration of - token_length / feats_length(10ms) - max_output_input_ratio: maximum ration of - token_length / feats_length(10ms) - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'label' in sample - # sample['wav'] is torch.Tensor, we have 100 frames every second - num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100 - if num_frames < min_length: - continue - if num_frames > max_length: - continue - if len(sample['label']) < token_min_length: - continue - if len(sample['label']) > token_max_length: - continue - if num_frames != 0: - if len(sample['label']) / num_frames < min_output_input_ratio: - continue - if len(sample['label']) / num_frames > max_output_input_ratio: - continue - yield sample - - -def resample(data, resample_rate=16000): - """ Resample data. - Inplace operation. - - Args: - data: Iterable[{key, wav, label, sample_rate}] - resample_rate: target resample rate - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - if sample_rate != resample_rate: - sample['sample_rate'] = resample_rate - sample['wav'] = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - yield sample - - -def speed_perturb(data, speeds=None): - """ Apply speed perturb to the data. - Inplace operation. 
- - Args: - data: Iterable[{key, wav, label, sample_rate}] - speeds(List[float]): optional speed - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - if speeds is None: - speeds = [0.9, 1.0, 1.1] - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - speed = random.choice(speeds) - if speed != 1.0: - wav, _ = torchaudio.sox_effects.apply_effects_tensor( - waveform, sample_rate, - [['speed', str(speed)], ['rate', str(sample_rate)]]) - sample['wav'] = wav - - yield sample - - -def compute_fbank(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0): - """ Extract fbank - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - energy_floor=0.0, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def compute_mfcc(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0, - num_ceps=40, - high_freq=0.0, - low_freq=20.0): - """ Extract mfcc - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.mfcc(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - num_ceps=num_ceps, - high_freq=high_freq, - low_freq=low_freq, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def __tokenize_by_bpe_model(sp, txt): - tokens = [] - # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - pattern = re.compile(r'([\u4e00-\u9fff])') - # Example: - # txt = "你好 ITS'S OKAY 的" - # chars = ["你", "好", " ITS'S OKAY ", "的"] - chars = pattern.split(txt.upper()) - mix_chars = [w for w in chars if len(w.strip()) > 0] - for ch_or_w in mix_chars: - # ch_or_w is a single CJK charater(i.e., "你"), do nothing. - if pattern.fullmatch(ch_or_w) is not None: - tokens.append(ch_or_w) - # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), - # encode ch_or_w using bpe_model. 
- else: - for p in sp.encode_as_pieces(ch_or_w): - tokens.append(p) - - return tokens - - -def tokenize(data, - symbol_table, - bpe_model=None, - non_lang_syms=None, - split_with_space=False): - """ Decode text to chars or BPE - Inplace operation - - Args: - data: Iterable[{key, wav, txt, sample_rate}] - - Returns: - Iterable[{key, wav, txt, tokens, label, sample_rate}] - """ - if non_lang_syms is not None: - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - else: - non_lang_syms = {} - non_lang_syms_pattern = None - - if bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(bpe_model) - else: - sp = None - - for sample in data: - assert 'txt' in sample - txt = sample['txt'].strip() - if non_lang_syms_pattern is not None: - parts = non_lang_syms_pattern.split(txt.upper()) - parts = [w for w in parts if len(w.strip()) > 0] - else: - parts = [txt] - - label = [] - tokens = [] - for part in parts: - if part in non_lang_syms: - tokens.append(part) - else: - if bpe_model is not None: - tokens.extend(__tokenize_by_bpe_model(sp, part)) - else: - if split_with_space: - part = part.split(" ") - for ch in part: - if ch == ' ': - ch = "▁" - tokens.append(ch) - - for ch in tokens: - if ch in symbol_table: - label.append(symbol_table[ch]) - elif '' in symbol_table: - label.append(symbol_table['']) - - sample['tokens'] = tokens - sample['label'] = label - yield sample - - -def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80): - """ Do spec augmentation - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - num_t_mask: number of time mask to apply - num_f_mask: number of freq mask to apply - max_t: max width of time mask - max_f: max width of freq mask - max_w: max width of time warp - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - max_freq = y.size(1) - # time mask - for i in range(num_t_mask): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - y[start:end, :] = 0 - # freq mask - for i in range(num_f_mask): - start = random.randint(0, max_freq - 1) - length = random.randint(1, max_f) - end = min(max_freq, start + length) - y[:, start:end] = 0 - sample['feat'] = y - yield sample - - -def spec_sub(data, max_t=20, num_t_sub=3): - """ Do spec substitute - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of time substitute - num_t_sub: number of time substitute to apply - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - for i in range(num_t_sub): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - # only substitute the earlier time chosen randomly for current time - pos = random.randint(0, start) - y[start:end, :] = x[start - pos:end - pos, :] - sample['feat'] = y - yield sample - - -def spec_trim(data, max_t=20): - """ Trim tailing frames. Inplace operation. 
- ref: TrimTail [https://arxiv.org/abs/2211.00522] - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of length trimming - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - max_frames = x.size(0) - length = random.randint(1, max_t) - if length < max_frames / 2: - y = x.clone().detach()[:max_frames - length] - sample['feat'] = y - yield sample - - -def shuffle(data, shuffle_size=10000): - """ Local shuffle the data - - Args: - data: Iterable[{key, feat, label}] - shuffle_size: buffer size for shuffle - - Returns: - Iterable[{key, feat, label}] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= shuffle_size: - random.shuffle(buf) - for x in buf: - yield x - buf = [] - # The sample left over - random.shuffle(buf) - for x in buf: - yield x - - -def sort(data, sort_size=500): - """ Sort the data by feature length. - Sort is used after shuffle and before batch, so we can group - utts with similar lengths into a batch, and `sort_size` should - be less than `shuffle_size` - - Args: - data: Iterable[{key, feat, label}] - sort_size: buffer size for sort - - Returns: - Iterable[{key, feat, label}] - """ - - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= sort_size: - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - buf = [] - # The sample left over - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - - -def static_batch(data, batch_size=16): - """ Static batch the data by `batch_size` - - Args: - data: Iterable[{key, feat, label}] - batch_size: batch size - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= batch_size: - yield buf - buf = [] - if len(buf) > 0: - yield buf - - -def dynamic_batch(data, max_frames_in_batch=12000): - """ Dynamic batch the data until the total frames in batch - reach `max_frames_in_batch` - - Args: - data: Iterable[{key, feat, label}] - max_frames_in_batch: max_frames in one batch - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - longest_frames = 0 - for sample in data: - assert 'feat' in sample - assert isinstance(sample['feat'], torch.Tensor) - new_sample_frames = sample['feat'].size(0) - longest_frames = max(longest_frames, new_sample_frames) - frames_after_padding = longest_frames * (len(buf) + 1) - if frames_after_padding > max_frames_in_batch: - yield buf - buf = [sample] - longest_frames = new_sample_frames - else: - buf.append(sample) - if len(buf) > 0: - yield buf - - -def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000): - """ Wrapper for static/dynamic batch - """ - if batch_type == 'static': - return static_batch(data, batch_size) - elif batch_type == 'dynamic': - return dynamic_batch(data, max_frames_in_batch) - else: - logging.fatal('Unsupported batch type {}'.format(batch_type)) - - -def padding(data): - """ Padding the data into training data - - Args: - data: Iterable[List[{key, feat, label}]] - - Returns: - Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] - """ - for sample in data: - assert isinstance(sample, list) - feats_length = torch.tensor([x['feat'].size(0) for x in sample], - dtype=torch.int32) - order = torch.argsort(feats_length, descending=True) - feats_lengths = torch.tensor( - [sample[i]['feat'].size(0) for i in order], dtype=torch.int32) - sorted_feats = [sample[i]['feat'] for i in order] - sorted_keys 
= [sample[i]['key'] for i in order] - sorted_labels = [ - torch.tensor(sample[i]['label'], dtype=torch.int64) for i in order - ] - label_lengths = torch.tensor([x.size(0) for x in sorted_labels], - dtype=torch.int32) - - padded_feats = pad_sequence(sorted_feats, - batch_first=True, - padding_value=0) - - pad = (0, 0, 0, 0) - seq_len= padded_feats.shape[1] - if seq_len < 384: - pad = (0, 0, 0, 384-seq_len) - elif seq_len < 512: - pad = (0, 0, 0, 512-seq_len) - elif seq_len < 640: - pad = (0, 0, 0, 640-seq_len) - elif seq_len < 768: - pad = (0, 0, 0, 768-seq_len) - elif seq_len < 896: - pad = (0, 0, 0, 896-seq_len) - elif seq_len < 1024: - pad = (0, 0, 0, 1024-seq_len) - elif seq_len < 1280: - pad = (0, 0, 0, 1280-seq_len) - padded_feats = torch.nn.functional.pad(padded_feats, pad, mode='constant', value=0) - padding_labels = pad_sequence(sorted_labels, - batch_first=True, - padding_value=-1) - - yield (sorted_keys, padded_feats, padding_labels, feats_lengths, - label_lengths) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/dataset/wav_distortion.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/dataset/wav_distortion.py deleted file mode 100644 index 2917d3cc6cfb801935cb0885d0c42cd88f1833b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/dataset/wav_distortion.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import random -import math - -import torchaudio -import torch -torchaudio.set_audio_backend("sox_io") - - -def db2amp(db): - return pow(10, db / 20) - -def amp2db(amp): - return 20 * math.log10(amp) - -def make_poly_distortion(conf): - """Generate a db-domain ploynomial distortion function - - f(x) = a * x^m * (1-x)^n + x - - Args: - conf: a dict {'a': #int, 'm': #int, 'n': #int} - - Returns: - The ploynomial function, which could be applied on - a float amplitude value - """ - a = conf['a'] - m = conf['m'] - n = conf['n'] - - def poly_distortion(x): - abs_x = abs(x) - if abs_x < 0.000001: - x = x - else: - db_norm = amp2db(abs_x) / 100 + 1 - if db_norm < 0: - db_norm = 0 - db_norm = a * pow(db_norm, m) * pow((1 - db_norm), n) + db_norm - if db_norm > 1: - db_norm = 1 - db = (db_norm - 1) * 100 - amp = db2amp(db) - if amp >= 0.9997: - amp = 0.9997 - if x > 0: - x = amp - else: - x = -amp - return x - return poly_distortion - -def make_quad_distortion(): - return make_poly_distortion({'a' : 1, 'm' : 1, 'n' : 1}) - -# the amplitude are set to max for all non-zero point -def make_max_distortion(conf): - """Generate a max distortion function - - Args: - conf: a dict {'max_db': float } - 'max_db': the maxium value. 
- - Returns: - The max function, which could be applied on - a float amplitude value - """ - max_db = conf['max_db'] - if max_db: - max_amp = db2amp(max_db) # < 0.997 - else: - max_amp = 0.997 - - def max_distortion(x): - if x > 0: - x = max_amp - elif x < 0: - x = -max_amp - else: - x = 0.0 - return x - return max_distortion - - - -def make_amp_mask(db_mask=None): - """Get a amplitude domain mask from db domain mask - - Args: - db_mask: Optional. A list of tuple. if None, using default value. - - Returns: - A list of tuple. The amplitude domain mask - """ - if db_mask is None: - db_mask = [(-110, -95), (-90, -80), (-65, -60), (-50, -30), (-15, 0)] - amp_mask = [(db2amp(db[0]), db2amp(db[1])) for db in db_mask] - return amp_mask - -default_mask = make_amp_mask() - - -def generate_amp_mask(mask_num): - """Generate amplitude domain mask randomly in [-100db, 0db] - - Args: - mask_num: the slot number of the mask - - Returns: - A list of tuple. each tuple defines a slot. - e.g. [(-100, -80), (-65, -60), (-50, -30), (-15, 0)] - for #mask_num = 4 - """ - a = [0] * 2 * mask_num - a[0] = 0 - m = [] - for i in range(1, 2 * mask_num): - a[i] = a[i - 1] + random.uniform(0.5, 1) - max_val = a[2 * mask_num - 1] - for i in range(0, mask_num): - l = ((a[2 * i] - max_val) / max_val) * 100 - r = ((a[2 * i + 1] - max_val) / max_val) * 100 - m.append((l, r)) - return make_amp_mask(m) - - -def make_fence_distortion(conf): - """Generate a fence distortion function - - In this fence-like shape function, the values in mask slots are - set to maxium, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': int,'max_db': float } - 'mask_number': the slot number in mask. - 'max_db': the maxium value. - - Returns: - The fence function, which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - max_db = conf['max_db'] - max_amp = db2amp(max_db) # 0.997 - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def fence_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - return x - - return fence_distortion - -# -def make_jag_distortion(conf): - """Generate a jag distortion function - - In this jag-like shape function, the values in mask slots are - not changed, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': #int} - 'mask_number': the slot number in mask. 
- - Returns: - The jag function,which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def jag_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - return x - - return jag_distortion - -# gaining 20db means amp = amp * 10 -# gaining -20db means amp = amp / 10 -def make_gain_db(conf): - """Generate a db domain gain function - - Args: - conf: a dict {'db': #float} - 'db': the gaining value - - Returns: - The db gain function, which could be applied on - a float amplitude value - """ - db = conf['db'] - - def gain_db(x): - return min(0.997, x * pow(10, db / 20)) - - return gain_db - - -def distort(x, func, rate=0.8): - """Distort a waveform in sample point level - - Args: - x: the origin wavefrom - func: the distort function - rate: sample point-level distort probability - - Returns: - the distorted waveform - """ - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - x[0][i] = func(float(x[0][i])) - return x - -def distort_chain(x, funcs, rate=0.8): - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - for func in funcs: - x[0][i] = func(float(x[0][i])) - return x - -# x is numpy -def distort_wav_conf(x, distort_type, distort_conf, rate=0.1): - if distort_type == 'gain_db': - gain_db = make_gain_db(distort_conf) - x = distort(x, gain_db) - elif distort_type == 'max_distortion': - max_distortion = make_max_distortion(distort_conf) - x = distort(x, max_distortion, rate=rate) - elif distort_type == 'fence_distortion': - fence_distortion = make_fence_distortion(distort_conf) - x = distort(x, fence_distortion, rate=rate) - elif distort_type == 'jag_distortion': - jag_distortion = make_jag_distortion(distort_conf) - x = distort(x, jag_distortion, rate=rate) - elif distort_type == 'poly_distortion': - poly_distortion = make_poly_distortion(distort_conf) - x = distort(x, poly_distortion, rate=rate) - elif distort_type == 'quad_distortion': - quad_distortion = make_quad_distortion() - x = distort(x, quad_distortion, rate=rate) - elif distort_type == 'none_distortion': - pass - else: - print('unsupport type') - return x - -def distort_wav_conf_and_save(distort_type, distort_conf, rate, wav_in, wav_out): - x, sr = torchaudio.load(wav_in) - x = x.detach().numpy() - out = distort_wav_conf(x, distort_type, distort_conf, rate) - torchaudio.save(wav_out, torch.from_numpy(out), sr) - -if __name__ == "__main__": - distort_type = sys.argv[1] - wav_in = sys.argv[2] - wav_out = sys.argv[3] - conf = None - rate = 0.1 - if distort_type == 'new_jag_distortion': - conf = {'mask_number' : 4} - elif distort_type == 'new_fence_distortion': - conf = {'mask_number' : 1, 'max_db' : -30} - elif distort_type == 'poly_distortion': - conf = {'a' : 4, 'm' : 2, "n" : 2} - distort_wav_conf_and_save(distort_type, conf, rate, wav_in, wav_out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/efficient_conformer/attention.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/efficient_conformer/attention.py deleted file mode 100644 index 475131b15af92ffcaf91ad5e2e30d114d4d1a2a3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/efficient_conformer/attention.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple, Optional - -import torch -from torch import nn -import torch.nn.functional as F -from wenet.transformer.attention import MultiHeadedAttention - - -class GroupedRelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: - https://arxiv.org/abs/1901.02860 - https://arxiv.org/abs/2109.01163 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate, group_size=3): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - self.group_size = group_size - self.d_k = n_feat // n_head # for GroupedAttention - self.n_feat = n_feat - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. 
- """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def pad4group(self, Q, K, V, P, mask, group_size: int = 3): - """ - q: (#batch, time1, size) -> (#batch, head, time1, size/head) - k,v: (#batch, time2, size) -> (#batch, head, time2, size/head) - p: (#batch, time2, size) - """ - # Compute Overflows - overflow_Q = Q.size(2) % group_size - overflow_KV = K.size(2) % group_size - - # if-else for ONNX export - # 0 // 0.00000000000000001 = 0 - # 1 // 1.00000000000000001 = 1 - padding_Q = (group_size - overflow_Q) * int( - overflow_Q // (overflow_Q + 0.00000000000000001)) - padding_KV = (group_size - overflow_KV) * int( - overflow_KV // (overflow_KV + 0.00000000000000001)) - - batch_size, _, seq_len_KV, _ = K.size() - - # Input Padding (B, T, D) -> (B, T + P, D) - Q = F.pad(Q, (0, 0, 0, padding_Q), value=0.0) - K = F.pad(K, (0, 0, 0, padding_KV), value=0.0) - V = F.pad(V, (0, 0, 0, padding_KV), value=0.0) - - if mask is not None and mask.size(2) > 0 : # time2 > 0: - mask = mask[:, ::group_size, ::group_size] - - Q = Q.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - K = K.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - V = V.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - # process pos_emb - P_batch_size = P.size(0) - overflow_P = P.size(1) % group_size - padding_P = group_size - overflow_P if overflow_P else 0 - P = F.pad(P, (0, 0, 0, padding_P), value=0.0) - P = P.view(P_batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - return Q, K, V, P, mask, padding_Q - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - padding_q: Optional[int] = None - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - padding_q : for GroupedAttention in efficent conformer - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. 
jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - - # n_feat!=h*d_k may be happened in GroupAttention - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.n_feat) - ) # (batch, time1, d_model) - if padding_q is not None: - # for GroupedAttention in efficent conformer - x = x[:, :x.size(1) - padding_q] - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q = self.linear_q(query) - k = self.linear_k(key) # (#batch, time2, size) - v = self.linear_v(value) - p = self.linear_pos(pos_emb) # (#batch, time2, size) - - batch_size, seq_len_KV, _ = k.size() # seq_len_KV = time2 - - # (#batch, time2, size) -> (#batch, head, time2, size/head) - q = q.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - k = k.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - v = v.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - if cache.size(0) > 0: - # use attention cache - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - new_cache = torch.cat((k, v), dim=-1) - - # May be k and p does not match. eg. time2=18+18/2=27 > mask=36/2=18 - if mask is not None and mask.size(2) > 0: - time2 = mask.size(2) - k = k[:, :, -time2:, :] - v = v[:, :, -time2:, :] - - # q k v p: (batch, head, time1, d_k) - q, k, v, p, mask, padding_q = self.pad4group(q, k, v, p, mask, self.group_size) - - # q_with_bias_u & q_with_bias_v = (batch, head, time1, d_k) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. 
- # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k * self.group_size) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask, padding_q), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/efficient_conformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/efficient_conformer/convolution.py deleted file mode 100644 index 52d6c1c14c0812ab3957a60a135f644833c2ad95..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/efficient_conformer/convolution.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - stride: int = 1): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - stride (int): Stride Convolution, for efficient Conformer - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. 
- # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=stride, # for depthwise_conv in StrideConv - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - self.stride = stride - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - # When export ONNX,the first cache is not None but all-zero, - # cause shape error in residual block, - # eg. cache14 + x9 = 23, 23-7+1=17 != 9 - cache = cache[:, :, -self.lorder:] - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is requried, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - if mask_pad.size(2) != x.size(2): - mask_pad = mask_pad[:, :, ::self.stride] - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/efficient_conformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/efficient_conformer/encoder.py deleted file mode 100644 index dbd37f53cac86be851e2bb194354fd07eb271f11..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/efficient_conformer/encoder.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from EfficientConformer(https://github.com/burchim/EfficientConformer) -# Paper(https://arxiv.org/abs/2109.01163) - -"""Encoder definition.""" -from typing import Tuple, Optional, List, Union - -import torch -import logging -from typeguard import check_argument_types -import torch.nn.functional as F - -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.encoder_layer import ConformerEncoderLayer - -from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 -from wenet.efficient_conformer.convolution import ConvolutionModule -from wenet.efficient_conformer.attention import GroupedRelPositionMultiHeadedAttention -from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer - -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class EfficientConformerEncoder(torch.nn.Module): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - macaron_style: bool = True, - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - stride_layer_idx: Optional[Union[int, List[int]]] = 3, - stride: Optional[Union[int, List[int]]] = 2, - group_layer_idx: Optional[Union[int, List[int], tuple]] = (0, 1, 2, 3), - group_size: int = 3, - stride_kernel: bool = True, - **kwargs - ): - """Construct Efficient Conformer Encoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - macaron_style (bool): Whether to use macaron style for - positionwise layer. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. - stride_layer_idx (list): layer id with StrideConv, start from 0 - stride (list): stride size of each StrideConv in efficient conformer - group_layer_idx (list): layer id with GroupedAttention, start from 0 - group_size (int): group size of every GroupedAttention layer - stride_kernel (bool): default True. True: recompute cnn kernels with stride. 
- """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d2": - subsampling_class = Conv2dSubsampling2 - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - logging.info(f"input_layer = {input_layer}, " - f"subsampling_class = {subsampling_class}") - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - self.input_layer = input_layer - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - activation = get_activation(activation_type) - self.num_blocks = num_blocks - self.attention_heads = attention_heads - self.cnn_module_kernel = cnn_module_kernel - self.global_chunk_size = 0 - self.chunk_feature_map = 0 - - # efficient conformer configs - self.stride_layer_idx = [stride_layer_idx] \ - if type(stride_layer_idx) == int else stride_layer_idx - self.stride = [stride] \ - if type(stride) == int else stride - self.group_layer_idx = [group_layer_idx] \ - if type(group_layer_idx) == int else group_layer_idx - self.grouped_size = group_size # group size of every GroupedAttention layer - - assert len(self.stride) == len(self.stride_layer_idx) - self.cnn_module_kernels = [cnn_module_kernel] # kernel size of each StridedConv - for i in self.stride: - if stride_kernel: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1] // i) - else: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1]) - - logging.info(f"stride_layer_idx= {self.stride_layer_idx}, " - f"stride = {self.stride}, " - f"cnn_module_kernel = {self.cnn_module_kernels}, " - f"group_layer_idx = {self.group_layer_idx}, " - f"grouped_size = {self.grouped_size}") - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - - # encoder definition - index = 0 - layers = [] - for i in range(num_blocks): - # self-attention module definition - if i in self.group_layer_idx: - encoder_selfattn_layer = GroupedRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - self.grouped_size) - else: - if pos_enc_layer_type == "no_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate) - - # conformer module definition - if i in self.stride_layer_idx: - # conformer block with downsampling - convolution_layer_args_stride = ( - output_size, 
self.cnn_module_kernels[index], activation, - cnn_module_norm, causal, True, self.stride[index]) - layers.append(StrideConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_stride) if use_cnn_module else None, - torch.nn.AvgPool1d( - kernel_size=self.stride[index], stride=self.stride[index], - padding=0, ceil_mode=True, - count_include_pad=False), # pointwise_conv_layer - dropout_rate, - normalize_before, - concat_after, - )) - index = index + 1 - else: - # conformer block - convolution_layer_args_normal = ( - output_size, self.cnn_module_kernels[index], activation, - cnn_module_norm, causal) - layers.append(ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_normal) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - )) - - self.encoders = torch.nn.ModuleList(layers) - - def set_global_chunk_size(self, chunk_size): - """Used in ONNX export. - """ - logging.info(f"set global chunk size: {chunk_size}, default is 0.") - self.global_chunk_size = chunk_size - if self.embed.subsampling_rate == 2: - self.chunk_feature_map = 2 * self.global_chunk_size + 1 - elif self.embed.subsampling_rate == 6: - self.chunk_feature_map = 6 * self.global_chunk_size + 5 - elif self.embed.subsampling_rate == 8: - self.chunk_feature_map = 8 * self.global_chunk_size + 7 - else: - self.chunk_feature_map = 4 * self.global_chunk_size + 3 - - def output_size(self) -> int: - return self._output_size - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - index = 0 # traverse stride - for i, layer in enumerate(self.encoders): - # layer return : x, mask, new_att_cache, new_cnn_cache - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if i in self.stride_layer_idx: - masks = masks[:, :, ::self.stride[index]] - chunk_masks = chunk_masks[:, ::self.stride[index], - ::self.stride[index]] - mask_pad = masks - pos_emb = pos_emb[:, ::self.stride[index], :] - index = index + 1 - - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask : mask matrix of self attention - - Returns: - torch.Tensor: output of current input xs - torch.Tensor: subsampling cache required for next chunk computation - List[torch.Tensor]: encoder layers output cache required for next - chunk computation - List[torch.Tensor]: conformer cnn cache - - """ - assert xs.size(0) == 1 - - # using downsampling factor to recover offset - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - chunk_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - chunk_masks = chunk_masks.unsqueeze(1) # (1, 1, xs-time) - - real_len = 0 - if self.global_chunk_size > 0: - # for ONNX decode simulation, padding xs to chunk_size - real_len = xs.size(1) - pad_len = self.chunk_feature_map - real_len - xs = F.pad(xs, (0, 0, 0, pad_len), value=0.0) - chunk_masks = F.pad(chunk_masks, (0, pad_len), value=0.0) - - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, chunk_masks = self.embed(xs, chunk_masks, offset) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - # shape(pos_emb) = (b=1, chunk_size, emb_size=output_size=hidden-dim) - - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) # batchPad (b=1, 1, time=chunk_size) - - if self.global_chunk_size > 0: - # for ONNX decode simulation - pos_emb = self.embed.position_encoding( - offset=max(offset - cache_t1, 0), - size=cache_t1 + self.global_chunk_size) - att_mask[:, :, -self.global_chunk_size:] = chunk_masks - mask_pad = chunk_masks.to(torch.bool) - else: - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(2) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - att_cache_trunc = xs.size(1) + \ - att_cache.size(2) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i:i + 1, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # 
shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [1, batch, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(0) - - # use repeat_interleave to new_att_cache - new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :]) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.global_chunk_size > 0 and real_len: - chunk_real_len = real_len // self.embed.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - # Keeping 1 more timestep can mitigate information leakage - # from the encoder caused by the padding - xs = xs[:, :chunk_real_len + 1, :] - - return xs, r_att_cache, r_cnn_cache - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - use_onnx=False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. - Args: - xs (torch.Tensor): (1, max_len, dim) - decoding_chunk_size (int): decoding chunk size - num_decoding_left_chunks (int): - use_onnx (bool): True for simulating ONNX model inference. 
- """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if use_onnx: - logging.info("Simulating for ONNX runtime ...") - att_cache: torch.Tensor = torch.zeros( - (self.num_blocks, self.attention_heads, required_cache_size, - self.output_size() // self.attention_heads * 2), - device=xs.device) - cnn_cache: torch.Tensor = torch.zeros( - (self.num_blocks, 1, self.output_size(), self.cnn_module_kernel - 1), - device=xs.device) - self.set_global_chunk_size(chunk_size=decoding_chunk_size) - else: - logging.info("Simulating for JIT runtime ...") - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - logging.info(f"-->> frame chunk msg: cur={cur}, " - f"end={end}, num_frames={end-cur}, " - f"decoding_window={decoding_window}") - if use_onnx: - att_mask: torch.Tensor = torch.ones( - (1, 1, required_cache_size + decoding_chunk_size), - dtype=torch.bool, device=xs.device) - if cur == 0: - att_mask[:, :, :required_cache_size] = 0 - else: - att_mask: torch.Tensor = torch.ones( - (0, 0, 0), dtype=torch.bool, device=xs.device) - - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - outputs.append(y) - offset += y.size(1) - - ys = torch.cat(outputs, 1) - masks = torch.ones(1, 1, ys.size(1), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/efficient_conformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/efficient_conformer/encoder_layer.py deleted file mode 100644 index 3a88ec9fca9797664ce89566e6c1d28a8f0ad5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/efficient_conformer/encoder_layer.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple -import torch -from torch import nn - - -class StrideConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. 
- self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - pointwise_conv_layer: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.pointwise_conv_layer = pointwise_conv_layer - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - - # add pointwise_conv for efficient conformer - # pointwise_conv_layer does not change shape - if self.pointwise_conv_layer is not None: - residual = residual.transpose(1, 2) - residual = self.pointwise_conv_layer(residual) - residual = residual.transpose(1, 2) - assert residual.size(0) == x.size(0) - assert residual.size(1) == x.size(1) - assert residual.size(2) == x.size(2) - - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/efficient_conformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/efficient_conformer/subsampling.py deleted file mode 100644 index 98b2c2228eac8e77586110686c48a7b0141458c9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/efficient_conformer/subsampling.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch -from wenet.transformer.subsampling import BaseSubsampling - - -class Conv2dSubsampling2(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU() - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * ((idim - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 2 - # 2 = (3 - 1) * 1 - self.right_context = 2 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 2. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 2. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, :-2:2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/squeezeformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/squeezeformer/attention.py deleted file mode 100644 index 97412badbe8e2c5caec81c0636d15be3f80d6b84..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/squeezeformer/attention.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 Ximalaya Inc. (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -import torch -import torch.nn as nn -from wenet.transformer.attention import MultiHeadedAttention -from typing import Tuple - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- """ - - def __init__(self, n_head, n_feat, dropout_rate, - do_rel_shift=False, adaptive_scale=False, init_weights=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.do_rel_shift = do_rel_shift - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - self.adaptive_scale = adaptive_scale - self.ada_scale = nn.Parameter( - torch.ones([1, 1, n_feat]), requires_grad=adaptive_scale) - self.ada_bias = nn.Parameter( - torch.zeros([1, 1, n_feat]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - input_max = (self.h * self.d_k) ** -0.5 - torch.nn.init.uniform_(self.linear_q.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_q.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_pos.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.bias, -input_max, input_max) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. 
pytorch training - if mask.size(2) > 0: # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - # (batch, head, time1, time2) - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - if self.adaptive_scale: - query = self.ada_scale * query + self.ada_bias - key = self.ada_scale * key + self.ada_bias - value = self.ada_scale * value + self.ada_bias - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - if self.do_rel_shift: - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/squeezeformer/conv2d.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/squeezeformer/conv2d.py deleted file mode 100644 index c230263396392d72f36c56d645338f2d576db898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/squeezeformer/conv2d.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Conv2d Module with Valid Padding""" - -import torch.nn.functional as F -from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional - - -class Conv2dValid(_ConvNd): - """ - Conv2d operator for VALID mode padding. 
- """ - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', # TODO: refine this type - device=None, - dtype=None, - valid_trigx: bool = False, - valid_trigy: bool = False - ) -> None: - factory_kwargs = {'device': device, 'dtype': dtype} - kernel_size_ = _pair(kernel_size) - stride_ = _pair(stride) - padding_ = padding if isinstance(padding, str) else _pair(padding) - dilation_ = _pair(dilation) - super(Conv2dValid, self).__init__( - in_channels, out_channels, kernel_size_, - stride_, padding_, dilation_, False, _pair(0), - groups, bias, padding_mode, **factory_kwargs) - self.valid_trigx = valid_trigx - self.valid_trigy = valid_trigy - - def _conv_forward( - self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): - validx, validy = 0, 0 - if self.valid_trigx: - validx = (input.size(-2) * (self.stride[-2] - 1) - 1 - + self.kernel_size[-2]) // 2 - if self.valid_trigy: - validy = (input.size(-1) * (self.stride[-1] - 1) - 1 - + self.kernel_size[-1]) // 2 - return F.conv2d(input, weight, bias, self.stride, - (validx, validy), self.dilation, self.groups) - - def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight, self.bias) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/squeezeformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/squeezeformer/convolution.py deleted file mode 100644 index 6da2ee8c98ed58fae66d66c892041037f0d6bc3a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/squeezeformer/convolution.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. 
- causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - self.bias = bias - self.channels = channels - self.kernel_size = kernel_size - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, channels]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, channels]), requires_grad=adaptive_scale) - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - if init_weights: - self.init_weights() - - def init_weights(self): - pw_max = self.channels ** -0.5 - dw_max = self.kernel_size ** -0.5 - torch.nn.init.uniform_(self.pointwise_conv1.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv1.bias.data, -pw_max, pw_max) - torch.nn.init.uniform_(self.depthwise_conv.weight.data, -dw_max, dw_max) - if self.bias: - torch.nn.init.uniform_(self.depthwise_conv.bias.data, -dw_max, dw_max) - torch.nn.init.uniform_(self.pointwise_conv2.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv2.bias.data, -pw_max, pw_max) - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - if self.adaptive_scale: - x = self.ada_scale * x + self.ada_bias - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/squeezeformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/squeezeformer/encoder.py deleted file mode 100644 index f13038321ae6c07d484a617aee7d83ed07742510..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/squeezeformer/encoder.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -import torch -import torch.nn as nn -from typing import Tuple, Union, Optional, List -from wenet.squeezeformer.subsampling \ - import DepthwiseConv2dSubsampling4, TimeReductionLayer1D, \ - TimeReductionLayer2D, TimeReductionLayerStream -from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.attention import MultiHeadedAttention -from wenet.squeezeformer.attention import RelPositionMultiHeadedAttention -from wenet.squeezeformer.positionwise_feed_forward \ - import PositionwiseFeedForward -from wenet.squeezeformer.convolution import ConvolutionModule -from wenet.utils.mask import make_pad_mask, add_optional_chunk_mask -from wenet.utils.common import get_activation - - -class SqueezeformerEncoder(nn.Module): - def __init__( - self, - input_size: int = 80, - encoder_dim: int = 256, - output_size: int = 256, - attention_heads: int = 4, - num_blocks: int = 12, - reduce_idx: Optional[Union[int, List[int]]] = 5, - recover_idx: Optional[Union[int, List[int]]] = 11, - feed_forward_expansion_factor: int = 4, - dw_stride: bool = False, - input_dropout_rate: float = 0.1, - pos_enc_layer_type: str = "rel_pos", - time_reduction_layer_type: str = "conv1d", - do_rel_shift: bool = True, - feed_forward_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.1, - cnn_module_kernel: int = 31, - cnn_norm_type: str = "batch_norm", - dropout: float = 0.1, - causal: bool = False, - adaptive_scale: bool = True, - activation_type: str = "swish", - init_weights: bool = True, - global_cmvn: torch.nn.Module = None, - normalize_before: bool = False, - use_dynamic_chunk: bool = False, - concat_after: bool = False, - 
static_chunk_size: int = 0, - use_dynamic_left_chunk: bool = False - ): - """Construct SqueezeformerEncoder - - Args: - input_size to use_dynamic_chunk, see in Transformer BaseEncoder. - encoder_dim (int): The hidden dimension of encoder layer. - output_size (int): The output dimension of final projection layer. - attention_heads (int): Num of attention head in attention module. - num_blocks (int): Num of encoder layers. - reduce_idx Optional[Union[int, List[int]]]: - reduce layer index, from 40ms to 80ms per frame. - recover_idx Optional[Union[int, List[int]]]: - recover layer index, from 80ms to 40ms per frame. - feed_forward_expansion_factor (int): Enlarge coefficient of FFN. - dw_stride (bool): Whether do depthwise convolution - on subsampling module. - input_dropout_rate (float): Dropout rate of input projection layer. - pos_enc_layer_type (str): Self attention type. - time_reduction_layer_type (str): Conv1d or Conv2d reduction layer. - do_rel_shift (bool): Whether to do relative shift - operation on rel-attention module. - cnn_module_kernel (int): Kernel size of CNN module. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - adaptive_scale (bool): Whether to use adaptive scale. - init_weights (bool): Whether to initialize weights. - causal (bool): whether to use causal convolution or not. - """ - super(SqueezeformerEncoder, self).__init__() - self.global_cmvn = global_cmvn - self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ - if type(reduce_idx) == int else reduce_idx - self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \ - if type(recover_idx) == int else recover_idx - self.check_ascending_list() - if reduce_idx is None: - self.time_reduce = None - else: - if recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - self.reduce_stride = 2 - self._output_size = output_size - self.normalize_before = normalize_before - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - self.pos_enc_layer_type = pos_enc_layer_type - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - encoder_dim, - attention_dropout_rate, - do_rel_shift, - adaptive_scale, - init_weights - ) - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - encoder_dim, - encoder_dim * feed_forward_expansion_factor, - feed_forward_dropout_rate, - activation, - adaptive_scale, - init_weights - ) - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = ( - encoder_dim, cnn_module_kernel, activation, - cnn_norm_type, causal, True, adaptive_scale, init_weights) - - self.embed = DepthwiseConv2dSubsampling4( - 1, encoder_dim, - RelPositionalEncoding(encoder_dim, dropout_rate=0.1), - dw_stride, - input_size, - input_dropout_rate, - init_weights - ) - - self.preln = nn.LayerNorm(encoder_dim) - self.encoders = 
torch.nn.ModuleList([SqueezeformerEncoderLayer( - encoder_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - convolution_layer(*convolution_layer_args), - positionwise_layer(*positionwise_layer_args), - normalize_before, - dropout, - concat_after) for _ in range(num_blocks) - ]) - if time_reduction_layer_type == 'conv1d': - time_reduction_layer = TimeReductionLayer1D - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - elif time_reduction_layer_type == 'stream': - time_reduction_layer = TimeReductionLayerStream - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - else: - time_reduction_layer = TimeReductionLayer2D - time_reduction_layer_args = {'encoder_dim': encoder_dim} - - self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args) - self.time_recover_layer = nn.Linear(encoder_dim, encoder_dim) - self.final_proj = None - if output_size != encoder_dim: - self.final_proj = nn.Linear(encoder_dim, output_size) - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - xs_lens = mask_pad.squeeze(1).sum(1) - xs = self.preln(xs) - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - for i, layer in enumerate(self.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, chunk_masks, pos_emb, mask_pad)) - xs, xs_lens, chunk_masks, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_chunk_masks, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - chunk_masks = recover_chunk_masks - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - xs = xs.masked_fill(~mask_pad[:, 0, :].unsqueeze(-1), 0.0) - - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return xs, masks - - def check_ascending_list(self): - if self.reduce_idx is not None: - assert self.reduce_idx == sorted(self.reduce_idx), \ - "reduce_idx should be int or ascending list" - if self.recover_idx is not None: - assert self.recover_idx == sorted(self.recover_idx), \ - "recover_idx should be int or ascending list" - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp 
= exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - recover_exp)) - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.preln(xs) - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, 
recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - if att_mask.size(1) != 0: - xs = xs.masked_fill(~att_mask[:, 0, :].unsqueeze(-1), 0.0) - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(0) - cached_att = cached_att.unsqueeze(3).\ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :]) - r_cnn_cache.append(cached_cnn) - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/squeezeformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/squeezeformer/encoder_layer.py deleted file mode 100644 index 3c6bdd44a20447cea91c0f965c666b844f4264be..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/squeezeformer/encoder_layer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""SqueezeformerEncoderLayer definition.""" - -import torch -import torch.nn as nn -from typing import Optional, Tuple - - -class SqueezeformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward1 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - feed_forward2 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. 
- """ - - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward1: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - feed_forward2: Optional[nn.Module] = None, - normalize_before: bool = False, - dropout_rate: float = 0.1, - concat_after: bool = False, - ): - super(SqueezeformerEncoderLayer, self).__init__() - self.size = size - self.self_attn = self_attn - self.layer_norm1 = nn.LayerNorm(size) - self.ffn1 = feed_forward1 - self.layer_norm2 = nn.LayerNorm(size) - self.conv_module = conv_module - self.layer_norm3 = nn.LayerNorm(size) - self.ffn2 = feed_forward2 - self.layer_norm4 = nn.LayerNorm(size) - self.normalize_before = normalize_before - self.dropout = nn.Dropout(dropout_rate) - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - # self attention module - residual = x - if self.normalize_before: - x = self.layer_norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.layer_norm1(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm2(x) - x = self.ffn1(x) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm2(x) - - # conv module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - residual = x - if self.normalize_before: - x = self.layer_norm3(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm3(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm4(x) - x = self.ffn2(x) - # we do not use dropout here since it is inside feed forward function - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm4(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/squeezeformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/squeezeformer/positionwise_feed_forward.py deleted file mode 100644 index 289062dcf3189f79a5ebb206990160d8665c613c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/squeezeformer/positionwise_feed_forward.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU(), - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.idim = idim - self.hidden_units = hidden_units - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - self.ada_scale = None - self.ada_bias = None - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, idim]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, idim]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - ffn1_max = self.idim ** -0.5 - ffn2_max = self.hidden_units ** -0.5 - torch.nn.init.uniform_(self.w_1.weight.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_1.bias.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_2.weight.data, -ffn2_max, ffn2_max) - torch.nn.init.uniform_(self.w_2.bias.data, -ffn2_max, ffn2_max) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - if self.adaptive_scale: - xs = self.ada_scale * xs + self.ada_bias - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/squeezeformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/squeezeformer/subsampling.py deleted file mode 100644 index fdb0101d6ebb54c42e710bbb0f35a6f7615ca567..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/squeezeformer/subsampling.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -"""DepthwiseConv2dSubsampling4 and TimeReductionLayer definition.""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -from wenet.transformer.subsampling import BaseSubsampling -from typing import Tuple -from wenet.squeezeformer.conv2d import Conv2dValid - - -class DepthwiseConv2dSubsampling4(BaseSubsampling): - """Depthwise Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - pos_enc_class (nn.Module): position encoding class. - dw_stride (int): Whether do depthwise convolution. - input_size (int): filter bank dimension. - - """ - - def __init__( - self, idim: int, odim: int, - pos_enc_class: torch.nn.Module, - dw_stride: bool = False, - input_size: int = 80, - input_dropout_rate: float = 0.1, - init_weights: bool = True - ): - super(DepthwiseConv2dSubsampling4, self).__init__() - self.idim = idim - self.odim = odim - self.pw_conv = nn.Conv2d( - in_channels=idim, out_channels=odim, kernel_size=3, stride=2) - self.act1 = nn.ReLU() - self.dw_conv = nn.Conv2d( - in_channels=odim, out_channels=odim, kernel_size=3, stride=2, - groups=odim if dw_stride else 1 - ) - self.act2 = nn.ReLU() - self.pos_enc = pos_enc_class - self.input_proj = nn.Sequential( - nn.Linear( - odim * (((input_size - 1) // 2 - 1) // 2), odim), - nn.Dropout(p=input_dropout_rate), - ) - if init_weights: - linear_max = (odim * input_size / 4) ** -0.5 - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.weight'], -linear_max, linear_max) - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.bias'], -linear_max, linear_max) - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: int = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.pw_conv(x) - x = self.act1(x) - x = self.dw_conv(x) - x = self.act2(x) - b, c, t, f = x.size() - x = x.permute(0, 2, 1, 3) - x = x.contiguous().view(b, t, c * f) - x, pos_emb = self.pos_enc(x, offset) - x = self.input_proj(x) - return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] - - -class TimeReductionLayer1D(nn.Module): - """ - Modified NeMo, - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. 
- """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 5, stride: int = 2): - super(TimeReductionLayer1D, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - self.padding = max(0, self.kernel_size - self.stride) - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. - if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayer2D(nn.Module): - def __init__( - self, kernel_size: int = 5, stride: int = 2, encoder_dim: int = 256): - super(TimeReductionLayer2D, self).__init__() - self.encoder_dim = encoder_dim - self.kernel_size = kernel_size - self.dw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=(kernel_size, 1), - stride=stride, - valid_trigy=True - ) - self.pw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=1, - stride=1, - valid_trigx=False, - valid_trigy=False, - ) - - self.kernel_size = kernel_size - self.stride = stride - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.encoder_dim ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward( - self, xs: torch.Tensor, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - xs = xs.masked_fill(mask_pad.transpose(1, 2).eq(0), 0.0) - xs = xs.unsqueeze(2) - padding1 = self.kernel_size - self.stride - xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), - mode='constant', value=0.) 
- xs = self.dw_conv(xs.permute(0, 3, 1, 2)) - xs = self.pw_conv(xs).permute(0, 3, 2, 1).squeeze(1).contiguous() - tmp_length = xs.size(1) - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - padding2 = max(0, (xs_lens.max() - tmp_length).data.item()) - batch_size, hidden = xs.size(0), xs.size(-1) - dummy_pad = torch.zeros(batch_size, padding2, hidden, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - mask = mask[:, ::2, ::2] - mask_pad = mask_pad[:, :, ::2] - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayerStream(nn.Module): - """ - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. - """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 1, stride: int = 2): - super(TimeReductionLayerStream, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=0, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. 
- if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transducer/joint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transducer/joint.py deleted file mode 100644 index f7cbaf62ee0bf4ffa127e5bbf4a49a64c2378495..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transducer/joint.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Optional - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation - - -class TransducerJoint(torch.nn.Module): - - def __init__(self, - voca_size: int, - enc_output_size: int, - pred_output_size: int, - join_dim: int, - prejoin_linear: bool = True, - postjoin_linear: bool = False, - joint_mode: str = 'add', - activation: str = "tanh"): - assert check_argument_types() - # TODO(Mddct): concat in future - assert joint_mode in ['add'] - super().__init__() - - self.activatoin = get_activation(activation) - self.prejoin_linear = prejoin_linear - self.postjoin_linear = postjoin_linear - self.joint_mode = joint_mode - - if not self.prejoin_linear and not self.postjoin_linear: - assert enc_output_size == pred_output_size == join_dim - # torchscript compatibility - self.enc_ffn: Optional[nn.Linear] = None - self.pred_ffn: Optional[nn.Linear] = None - if self.prejoin_linear: - self.enc_ffn = nn.Linear(enc_output_size, join_dim) - self.pred_ffn = nn.Linear(pred_output_size, join_dim) - # torchscript compatibility - self.post_ffn: Optional[nn.Linear] = None - if self.postjoin_linear: - self.post_ffn = nn.Linear(join_dim, join_dim) - - self.ffn_out = nn.Linear(join_dim, voca_size) - - def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor): - """ - Args: - enc_out (torch.Tensor): [B, T, E] - pred_out (torch.Tensor): [B, T, P] - Return: - [B,T,U,V] - """ - if (self.prejoin_linear and self.enc_ffn is not None - and self.pred_ffn is not None): - enc_out = self.enc_ffn(enc_out) # [B,T,E] -> [B,T,V] - pred_out = self.pred_ffn(pred_out) - - enc_out = enc_out.unsqueeze(2) # [B,T,V] -> [B,T,1,V] - pred_out = pred_out.unsqueeze(1) # [B,U,V] -> [B,1 U, V] - - # TODO(Mddct): concat joint - _ = self.joint_mode - out = enc_out + pred_out # [B,T,U,V] - - if self.postjoin_linear and self.post_ffn is not None: - out = self.post_ffn(out) - - out = self.activatoin(out) - out = self.ffn_out(out) - return out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transducer/predictor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transducer/predictor.py deleted file mode 100644 index 600e97a9d83646047ec3fc14f3087bd4df761c68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transducer/predictor.py +++ /dev/null @@ -1,482 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation, get_rnn - - -def ApplyPadding(input, padding, pad_value) -> torch.Tensor: - """ - Args: - input: [bs, max_time_step, dim] - padding: [bs, max_time_step] - """ - return 
padding * pad_value + input * (1 - padding) - - -class PredictorBase(torch.nn.Module): - - # NOTE(Mddct): We can use ABC abstract here, but - # keep this class simple enough for now - def __init__(self) -> None: - super().__init__() - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - _, _, _ = batch_size, method, device - raise NotImplementedError("this is a base precictor") - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ): - _, _, = input, cache - raise NotImplementedError("this is a base precictor") - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - _, _, _, = input, padding, cache - raise NotImplementedError("this is a base precictor") - - -class RNNPredictor(PredictorBase): - - def __init__(self, - voca_size: int, - embed_size: int, - output_size: int, - embed_dropout: float, - hidden_size: int, - num_layers: int, - bias: bool = True, - rnn_type: str = "lstm", - dropout: float = 0.1) -> None: - assert check_argument_types() - super().__init__() - self.n_layers = num_layers - self.hidden_size = hidden_size - # disable rnn base out projection - self.embed = nn.Embedding(voca_size, embed_size) - self.dropout = nn.Dropout(embed_dropout) - # NOTE(Mddct): rnn base from torch not support layer norm - # will add layer norm and prune value in cell and layer - # ref: https://github.com/Mddct/neural-lm/blob/main/models/gru_cell.py - self.rnn = get_rnn(rnn_type=rnn_type)(input_size=embed_size, - hidden_size=hidden_size, - num_layers=num_layers, - bias=bias, - batch_first=True, - dropout=dropout) - self.projection = nn.Linear(hidden_size, output_size) - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> torch.Tensor: - """ - Args: - input (torch.Tensor): [batch, max_time). - padding (torch.Tensor): [batch, max_time] - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - Returns: - output: [batch, max_time, output_size] - """ - - # NOTE(Mddct): we don't use pack input format - embed = self.embed(input) # [batch, max_time, emb_size] - embed = self.dropout(embed) - states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None - if cache is None: - state = self.init_state(batch_size=input.size(0), - device=input.device) - states = (state[0], state[1]) - else: - assert len(cache) == 2 - states = (cache[0], cache[1]) - out, (m, c) = self.rnn(embed, states) - out = self.projection(out) - - # NOTE(Mddct): Although we don't use staate in transducer - # training forward, we need make it right for padding value - # so we create forward_step for infering, forward for training - _, _ = m, c - return out - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache: [state_m, state_c] - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - Returns: - new_cache: [[state_m_1, state_c_1], [state_m_2, state_c_2]...] 
- """ - assert len(cache) == 2 - state_ms = cache[0] - state_cs = cache[1] - - assert state_ms.size(1) == state_cs.size(1) - - new_cache: List[List[torch.Tensor]] = [] - for state_m, state_c in zip(torch.split(state_ms, 1, dim=1), - torch.split(state_cs, 1, dim=1)): - new_cache.append([state_m, state_c]) - return new_cache - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[state_m_1, state_c_1], [state_m_1, state_c_1]...] - - Returns: - new_caceh: [state_ms, state_cs], - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - """ - state_ms = torch.cat([states[0] for states in cache], dim=1) - state_cs = torch.cat([states[1] for states in cache], dim=1) - return [state_ms, state_cs] - - def init_state( - self, - batch_size: int, - device: torch.device, - method: str = "zero", - ) -> List[torch.Tensor]: - assert batch_size > 0 - # TODO(Mddct): xavier init method - _ = method - return [ - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device), - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device) - ] - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - """ - assert len(cache) == 2 - state_m, state_c = cache[0], cache[1] - embed = self.embed(input) # [batch, 1, emb_size] - embed = self.dropout(embed) - out, (m, c) = self.rnn(embed, (state_m, state_c)) - - out = self.projection(out) - m = ApplyPadding(m, padding.unsqueeze(0), state_m) - c = ApplyPadding(c, padding.unsqueeze(0), state_c) - - return (out, [m, c]) - - -class EmbeddingPredictor(PredictorBase): - """Embedding predictor - - Described in: - https://arxiv.org/pdf/2109.07513.pdf - - embed-> proj -> layer norm -> swish - """ - - def __init__(self, - voca_size: int, - embed_size: int, - embed_dropout: float, - n_head: int, - history_size: int = 2, - activation: str = "swish", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - - assert check_argument_types() - super().__init__() - # multi head - self.num_heads = n_head - self.embed_size = embed_size - self.context_size = history_size + 1 - self.pos_embed = torch.nn.Linear(embed_size * self.context_size, - self.num_heads, - bias=bias) - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.ffn = nn.Linear(self.embed_size, self.embed_size) - self.norm = nn.LayerNorm(self.embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - _ = method - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device), - ] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] 
- """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - - input = input.unfold(1, self.context_size, 1).permute( - 0, 1, 3, 2) # [bs, seq_len, context_size, embed] - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - # broadcast dot attenton - input_expand = input.unsqueeze( - 2) # [bs, seq_len, 1, context_size, embed] - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - - # [bs, seq_len, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, seq_len, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, seq_len, num_heads, embed] - output = output.sum(dim=2) # [bs, seq_len, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - return output - - def forward_step( - self, - input: torch.Tensor, - padding: torch.Tensor, - cache: List[torch.Tensor], - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input_expand = context_input.unsqueeze(1).unsqueeze( - 2) # [bs, 1, 1, context_size, embed] - - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - # [bs, 1, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, 1, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, 1, num_heads, embed] - output = output.sum(dim=2) # [bs, 1, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - new_cache = context_input[:, 1:, :] - # TODO(Mddct): we need padding new_cache in future - # new_cache = ApplyPadding(history, padding, new_cache) - return (output, [new_cache]) - - -class ConvPredictor(PredictorBase): - - def __init__(self, - voca_size: 
int, - embed_size: int, - embed_dropout: float, - history_size: int = 2, - activation: str = "relu", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - assert check_argument_types() - super().__init__() - - assert history_size >= 0 - self.embed_size = embed_size - self.context_size = history_size + 1 - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.conv = nn.Conv1d(in_channels=embed_size, - out_channels=embed_size, - kernel_size=self.context_size, - padding=0, - groups=embed_size, - bias=bias) - self.norm = nn.LayerNorm(embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - assert method == "zero" - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device) - ] - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] - """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - input = input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - return out - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input = context_input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - - new_cache = context_input[:, 1:, :] - # TODO(Mddct): apply padding in future - return (out, [new_cache]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transducer/search/greedy_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transducer/search/greedy_search.py deleted file mode 100644 index ef7354562b6617b7be33bf32d673117eb1d3d547..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transducer/search/greedy_search.py +++ /dev/null @@ -1,54 +0,0 
@@ -from typing import List - -import torch - - -def basic_greedy_search( - model: torch.nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - n_steps: int = 64, -) -> List[List[int]]: - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, - method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = n_steps - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, padding, - cache) # [1, 1, P] - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, - pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - - return [hyps] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transducer/search/prefix_beam_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transducer/search/prefix_beam_search.py deleted file mode 100644 index f00917717c16a73916586708ebfede54fa02a21f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transducer/search/prefix_beam_search.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import List, Tuple - -import torch -from wenet.utils.common import log_add - - -class Sequence(): - - __slots__ = {'hyp', 'score', 'cache'} - - def __init__( - self, - hyp: List[torch.Tensor], - score, - cache: List[torch.Tensor], - ): - self.hyp = hyp - self.score = score - self.cache = cache - - -class PrefixBeamSearch(): - - def __init__(self, encoder, predictor, joint, ctc, blank): - self.encoder = encoder - self.predictor = predictor - self.joint = joint - self.ctc = ctc - self.blank = blank - - def forward_decoder_one_step( - self, encoder_x: torch.Tensor, pre_t: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - padding = torch.zeros(pre_t.size(0), 1, device=encoder_x.device) - pre_t, new_cache = self.predictor.forward_step(pre_t.unsqueeze(-1), - padding, cache) - x = self.joint(encoder_x, pre_t) # [beam, 1, 1, vocab] - x = x.log_softmax(dim=-1) - return x, new_cache - - def prefix_beam_search(self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7): - """prefix beam search - also see wenet.transducer.transducer.beam_search - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = 
speech.shape[0] - assert batch_size == 1 - - # 1. Encoder - encoder_out, _ = self.encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - - ctc_probs = self.ctc.log_softmax(encoder_out).squeeze(0) - beam_init: List[Sequence] = [] - - # 2. init beam using Sequence to save beam unit - cache = self.predictor.init_state(1, method="zero", device=device) - beam_init.append(Sequence(hyp=[self.blank], score=0.0, cache=cache)) - # 3. start decoding (notice: we use breathwise first searching) - # !!!! In this decoding method: one frame do not output multi units. !!!! - # !!!! Experiments show that this strategy has little impact !!!! - for i in range(maxlen): - # 3.1 building input - # decoder taking the last token to predict the next token - input_hyp = [s.hyp[-1] for s in beam_init] - input_hyp_tensor = torch.tensor(input_hyp, - dtype=torch.int, - device=device) - # building statement from beam - cache_batch = self.predictor.cache_to_batch( - [s.cache for s in beam_init]) - # build score tensor to do torch.add() function - scores = torch.tensor([s.score for s in beam_init]).to(device) - - # 3.2 forward decoder - logp, new_cache = self.forward_decoder_one_step( - encoder_out[:, i, :].unsqueeze(1), - input_hyp_tensor, - cache_batch, - ) # logp: (N, 1, 1, vocab_size) - logp = logp.squeeze(1).squeeze(1) # logp: (N, vocab_size) - new_cache = self.predictor.batch_to_cache(new_cache) - - # 3.3 shallow fusion for transducer score - # and ctc score where we can also add the LM score - logp = torch.log( - torch.add(transducer_weight * torch.exp(logp), - ctc_weight * torch.exp(ctc_probs[i].unsqueeze(0)))) - - # 3.4 first beam prune - top_k_logp, top_k_index = logp.topk(beam_size) # (N, N) - scores = torch.add(scores.unsqueeze(1), top_k_logp) - - # 3.5 generate new beam (N*N) - beam_A = [] - for j in range(len(beam_init)): - # update seq - base_seq = beam_init[j] - for t in range(beam_size): - # blank: only update the score - if top_k_index[j, t] == self.blank: - new_seq = Sequence(hyp=base_seq.hyp.copy(), - score=scores[j, t].item(), - cache=base_seq.cache) - - beam_A.append(new_seq) - # other unit: update hyp score statement and last - else: - hyp_new = base_seq.hyp.copy() - hyp_new.append(top_k_index[j, t].item()) - new_seq = Sequence(hyp=hyp_new, - score=scores[j, t].item(), - cache=new_cache[j]) - beam_A.append(new_seq) - - # 3.6 prefix fusion - fusion_A = [beam_A[0]] - for j in range(1, len(beam_A)): - s1 = beam_A[j] - if_do_append = True - for t in range(len(fusion_A)): - # notice: A_ can not fusion with A - if s1.hyp == fusion_A[t].hyp: - fusion_A[t].score = log_add( - [fusion_A[t].score, s1.score]) - if_do_append = False - break - if if_do_append: - fusion_A.append(s1) - - # 4. 
second pruned - fusion_A.sort(key=lambda x: x.score, reverse=True) - beam_init = fusion_A[:beam_size] - - return beam_init, encoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transducer/transducer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transducer/transducer.py deleted file mode 100644 index 821a0946e621353a18bededbd93a658e83b0e0e2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transducer/transducer.py +++ /dev/null @@ -1,453 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchaudio -from torch import nn -from torch.nn.utils.rnn import pad_sequence -from typeguard import check_argument_types -from wenet.transducer.predictor import PredictorBase -from wenet.transducer.search.greedy_search import basic_greedy_search -from wenet.transducer.search.prefix_beam_search import PrefixBeamSearch -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_blank, add_sos_eos, - reverse_pad_list) - - -class Transducer(ASRModel): - """Transducer-ctc-attention hybrid Encoder-Predictor-Decoder model""" - - def __init__( - self, - vocab_size: int, - blank: int, - encoder: nn.Module, - predictor: PredictorBase, - joint: nn.Module, - attention_decoder: Optional[Union[TransformerDecoder, - BiTransformerDecoder]] = None, - ctc: Optional[CTC] = None, - ctc_weight: float = 0, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - transducer_weight: float = 1.0, - attention_weight: float = 0.0, - ) -> None: - assert check_argument_types() - assert attention_weight + ctc_weight + transducer_weight == 1.0 - super().__init__(vocab_size, encoder, attention_decoder, ctc, - ctc_weight, ignore_id, reverse_weight, lsm_weight, - length_normalized_loss) - - self.blank = blank - self.transducer_weight = transducer_weight - self.attention_decoder_weight = 1 - self.transducer_weight - self.ctc_weight - - self.predictor = predictor - self.joint = joint - self.bs = None - - # Note(Mddct): decoder also means predictor in transducer, - # but here decoder is attention decoder - del self.criterion_att - if attention_decoder is not None: - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + predictor + joint + loss - - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - - # Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - # predictor - ys_in_pad = add_blank(text, self.blank, self.ignore_id) - predictor_out = self.predictor(ys_in_pad) - # joint - joint_out = self.joint(encoder_out, predictor_out) - # NOTE(Mddct): some loss implementation require pad valid is zero - # torch.int32 rnnt_loss required - rnnt_text = text.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - rnnt_text_lengths = text_lengths.to(torch.int32) - encoder_out_lens = encoder_out_lens.to(torch.int32) - loss = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - encoder_out_lens, - rnnt_text_lengths, - blank=self.blank, - reduction="mean") - loss_rnnt = loss - - loss = self.transducer_weight * loss - # optional attention decoder - loss_att: Optional[torch.Tensor] = None - if self.attention_decoder_weight != 0.0 and self.decoder is not None: - loss_att, _ = self._calc_att_loss(encoder_out, encoder_mask, text, - text_lengths) - - # optional ctc - loss_ctc: Optional[torch.Tensor] = None - if self.ctc_weight != 0.0 and self.ctc is not None: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is not None: - loss = loss + self.ctc_weight * loss_ctc.sum() - if loss_att is not None: - loss = loss + self.attention_decoder_weight * loss_att.sum() - # NOTE: 'loss' must be in dict - return { - 'loss': loss, - 'loss_att': loss_att, - 'loss_ctc': loss_ctc, - 'loss_rnnt': loss_rnnt, - } - - def init_bs(self): - if self.bs is None: - self.bs = PrefixBeamSearch(self.encoder, self.predictor, - self.joint, self.ctc, self.blank) - - def _cal_transducer_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_lens: torch.Tensor, - hyps_pad: torch.Tensor, - ): - # ignore id -> blank, add blank at head - hyps_pad_blank = add_blank(hyps_pad, self.blank, self.ignore_id) - xs_in_lens = encoder_mask.squeeze(1).sum(1).int() - - # 1. Forward predictor - predictor_out = self.predictor(hyps_pad_blank) - # 2. Forward joint - joint_out = self.joint(encoder_out, predictor_out) - rnnt_text = hyps_pad.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - # 3. 
Compute transducer loss - loss_td = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - xs_in_lens, - hyps_lens.int(), - blank=self.blank, - reduction='none') - return loss_td * -1 - - def _cal_attn_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_pad: torch.Tensor, - hyps_lens: torch.Tensor, - ): - # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - - # td_score = loss_td * -1 - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - self.reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - return decoder_out, r_decoder_out - - def beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7, - ): - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight in transducer - prefix beam search. - final_prob = ctc_weight * ctc_prob + transducer_weight * transducer_prob - transducer_weight (float): transducer probability weight in - prefix beam search - Returns: - List[List[int]]: best path result - - """ - self.init_bs() - beam, _ = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size, - beam_size, - num_decoding_left_chunks, - simulate_streaming, - ctc_weight, - transducer_weight, - ) - return beam[0].hyp[1:], beam[0].score - - def transducer_attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ctc_weight: float = 0.0, - attn_weight: float = 0.0, - transducer_weight: float = 0.0, - search_ctc_weight: float = 1.0, - search_transducer_weight: float = 0.0, - beam_search_type: str = 'transducer') -> List[List[int]]: - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight using in rescoring. - rescore_prob = ctc_weight * ctc_prob + - transducer_weight * (transducer_loss * -1) + - attn_weight * attn_prob - attn_weight (float): attn probability weight using in rescoring. - transducer_weight (float): transducer probability weight using in - rescoring - search_ctc_weight (float): ctc weight using - in rnnt beam search (seeing in self.beam_search) - search_transducer_weight (float): transducer weight using - in rnnt beam search (seeing in self.beam_search) - Returns: - List[List[int]]: best path result - - """ - - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - self.init_bs() - if beam_search_type == 'transducer': - beam, encoder_out = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - beam_size=beam_size, - num_decoding_left_chunks=num_decoding_left_chunks, - ctc_weight=search_ctc_weight, - transducer_weight=search_transducer_weight, - ) - beam_score = [s.score for s in beam] - hyps = [s.hyp[1:] for s in beam] - - elif beam_search_type == 'ctc': - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, - speech_lengths, - beam_size=beam_size, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) - beam_score = [hyp[1] for hyp in hyps] - hyps = [hyp[0] for hyp in hyps] - assert len(hyps) == beam_size - - # build hyps and encoder output - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - - # 2.1 calculate transducer score - td_score = self._cal_transducer_score( - encoder_out, - encoder_mask, - hyps_lens, - hyps_pad, - ) - # 2.2 calculate attention score - decoder_out, r_decoder_out = self._cal_attn_score( - encoder_out, - encoder_mask, - hyps_pad, - hyps_lens, - ) - - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp)][self.eos] - td_s = td_score[i] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp): - r_score += r_decoder_out[i][len(hyp) - j - 1][w] - r_score += r_decoder_out[i][len(hyp)][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score = score * attn_weight + \ - beam_score[i] * ctc_weight + \ - td_s * transducer_weight - if score > best_score: - best_score = score - best_index = i - - return hyps[best_index], best_score - - def greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, 
- num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - n_steps: int = 64, - ) -> List[List[int]]: - """ greedy search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - # TODO(Mddct): batch decode - assert speech.size(0) == 1 - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - # TODO(Mddct): forward chunk by chunk - _ = simulate_streaming - # Let's assume B = batch_size - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size, - num_decoding_left_chunks, - ) - encoder_out_lens = encoder_mask.squeeze(1).sum() - hyps = basic_greedy_search(self, - encoder_out, - encoder_out_lens, - n_steps=n_steps) - - return hyps - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def forward_predictor_step( - self, xs: torch.Tensor, cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - assert len(cache) == 2 - # fake padding - padding = torch.zeros(1, 1) - return self.predictor.forward_step(xs, padding, cache) - - @torch.jit.export - def forward_joint_step(self, enc_out: torch.Tensor, - pred_out: torch.Tensor) -> torch.Tensor: - return self.joint(enc_out, pred_out) - - @torch.jit.export - def forward_predictor_init_state(self) -> List[torch.Tensor]: - return self.predictor.init_state(1, device=torch.device("cpu")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/asr_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/asr_model.py deleted file mode 100644 index 4288f68472d63ce4bf270c5f377d62fa7408713e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/asr_model.py +++ /dev/null @@ -1,904 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -from collections import defaultdict -from typing import Dict, List, Optional, Tuple - -import torch - -from torch.nn.utils.rnn import pad_sequence - -try: - import k2 - from icefall.utils import get_texts - from icefall.decode import get_lattice, Nbest, one_best_decoding -except ImportError: - print('Failed to import k2 and icefall. \ - Notice that they are necessary for hlg_onebest and hlg_rescore') - -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import TransformerEncoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, - remove_duplicates_and_blank, th_accuracy, - reverse_pad_list) -from wenet.utils.mask import (make_pad_mask, mask_finished_preds, - mask_finished_scores, subsequent_mask) - - -class ASRModel(torch.nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - def __init__( - self, - vocab_size: int, - encoder: TransformerEncoder, - decoder: TransformerDecoder, - ctc: CTC, - ctc_weight: float = 0.5, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - ): - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - - self.encoder = encoder - self.decoder = decoder - self.ctc = ctc - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - # 1. Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # 2a. Attention-decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths) - else: - loss_att = None - - # 2b. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is None: - loss = loss_att - elif loss_att is None: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + (1 - - self.ctc_weight) * loss_att - return {"loss": loss, "loss_att": loss_att, "loss_ctc": loss_ctc} - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, float]: - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # reverse the seq, used for right to left decoder - r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) - r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, - self.ignore_id) - # 1. Forward decoder - decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, - ys_in_pad, ys_in_lens, - r_ys_in_pad, - self.reverse_weight) - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - r_loss_att = torch.tensor(0.0) - if self.reverse_weight > 0.0: - r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * ( - 1 - self.reverse_weight) + r_loss_att * self.reverse_weight - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - return loss_att, acc_att - - def _forward_encoder( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Let's assume B = batch_size - # 1. Encoder - if simulate_streaming and decoding_chunk_size > 0: - encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( - speech, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - else: - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - return encoder_out, encoder_mask - - def recognize( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int = 10, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> torch.Tensor: - """ Apply beam search on attention decoder - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - torch.Tensor: decoding result, (batch, max_result_len) - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = torch.ones([running_size, 1], dtype=torch.long, - device=device).fill_(self.sos) # (B*N, 1) - scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), - dtype=torch.float) - scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) - cache: Optional[List[torch.Tensor]] = None - # 2. Decoder forward step by step - for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: - break - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) - logp, cache = self.decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - # 2.3 Second beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - # Update cache to be consistent with new topk scores / hyps - cache_index = (offset_k_index // beam_size).view(-1) # (B*N) - base_cache_index = (torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) * beam_size).view(-1) # (B*N) - cache_index = base_cache_index + cache_index - cache = [torch.index_select(c, dim=0, index=cache_index) for c in cache] - scores = scores.view(-1, 1) # (B*N, 1) - # 2.4. Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = torch.index_select(top_k_index.view(-1), - dim=-1, - index=best_k_index) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = torch.index_select( - hyps, dim=0, index=best_hyps_index) # (B*N, i) - hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) - - # 3. 
Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_scores, best_index = scores.max(dim=-1) - best_hyps_index = best_index + torch.arange( - batch_size, dtype=torch.long, device=device) * beam_size - best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) - best_hyps = best_hyps[:, 1:] - return best_hyps, best_scores - - def ctc_greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[List[int]]: - """ Apply CTC greedy search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # Let's assume B = batch_size - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - return hyps, scores - - def _ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[List[List[int]], torch.Tensor]: - """ CTC prefix beam search inner implementation - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[List[int]]: nbest results - torch.Tensor: encoder output, (1, max_len, encoder_dim), - it will be used for rescoring in attention rescoring mode - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # For CTC prefix beam search, we only support batch_size=1 - assert batch_size == 1 - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder forward and get CTC score - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - ctc_probs = ctc_probs.squeeze(0) - # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) - cur_hyps = [(tuple(), (0.0, -float('inf')))] - # 2. CTC beam search step by step - for t in range(0, maxlen): - logp = ctc_probs[t] # (vocab_size,) - # key: prefix, value (pb, pnb), default value(-inf, -inf) - next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) - # 2.1 First beam prune: select topk best - top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) - for s in top_k_index: - s = s.item() - ps = logp[s].item() - for prefix, (pb, pnb) in cur_hyps: - last = prefix[-1] if len(prefix) > 0 else None - if s == 0: # blank - n_pb, n_pnb = next_hyps[prefix] - n_pb = log_add([n_pb, pb + ps, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - elif s == last: - # Update *ss -> *s; - n_pb, n_pnb = next_hyps[prefix] - n_pnb = log_add([n_pnb, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - # Update *s-s -> *ss, - is for blank - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - else: - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - - # 2.2 Second beam prune - next_hyps = sorted(next_hyps.items(), - key=lambda x: log_add(list(x[1])), - reverse=True) - cur_hyps = next_hyps[:beam_size] - hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] - return hyps, encoder_out - - def ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[int]: - """ Apply CTC prefix beam search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[int]: CTC prefix beam search nbest results - """ - hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, - beam_size, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) - return hyps[0] - - def attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - ctc_weight: float = 0.0, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ) -> List[int]: - """ Apply attention rescoring decoding, CTC prefix beam search - is applied first to get nbest, then we resoring the nbest on - attention decoder with corresponding encoder out - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - reverse_weight (float): right to left decoder weight - ctc_weight (float): ctc score weight - - Returns: - List[int]: Attention rescoring result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, speech_lengths, beam_size, decoding_chunk_size, - num_decoding_left_chunks, simulate_streaming) - - assert len(hyps) == beam_size - hyps_pad = pad_sequence([ - torch.tensor(hyp[0], device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp[0]): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp[0])][self.eos] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp[0]): - r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] - r_score += r_decoder_out[i][len(hyp[0])][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score += hyp[1] * ctc_weight - if score > best_score: - best_score = score - best_index = i - return hyps[best_index][0], best_score - - def load_hlg_resource_if_necessary(self, hlg, word): - if not hasattr(self, 'hlg'): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.hlg = k2.Fsa.from_dict(torch.load(hlg, map_location=device)) - if not hasattr(self.hlg, "lm_scores"): - self.hlg.lm_scores = self.hlg.scores.clone() - if not hasattr(self, 'word_table'): - self.word_table = {} - with open(word, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - self.word_table[int(arr[1])] = arr[0] - - @torch.no_grad() - def hlg_onebest( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - best_path = one_best_decoding(lattice=lattice, use_double_scores=True) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.no_grad() - def hlg_rescore( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - lm_scale: float = 0, - decoder_scale: float = 0, - r_decoder_scale: float = 0, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - device = speech.device - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - 
search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=100, - use_double_scores=True, - nbest_scale=0.5,) - nbest = nbest.intersect(lattice) - assert hasattr(nbest.fsa, "lm_scores") - assert hasattr(nbest.fsa, "tokens") - assert isinstance(nbest.fsa.tokens, torch.Tensor) - - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.tokens) - tokens = tokens.remove_values_leq(0) - hyps = tokens.tolist() - - # cal attention_score - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out_repeat = [] - tot_scores = nbest.tot_scores() - repeats = [tot_scores[i].shape[0] for i in range(tot_scores.dim0)] - for i in range(len(encoder_out)): - encoder_out_repeat.append(encoder_out[i: i + 1].repeat(repeats[i], 1, 1)) - encoder_out = torch.concat(encoder_out_repeat, dim=0) - encoder_mask = torch.ones(encoder_out.size(0), - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - reverse_weight = 0.5 - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out - - decoder_scores = torch.tensor([sum([decoder_out[i, j, hyps[i][j]] - for j in range(len(hyps[i]))]) - for i in range(len(hyps))], device=device) - r_decoder_scores = [] - for i in range(len(hyps)): - score = 0 - for j in range(len(hyps[i])): - score += r_decoder_out[i, len(hyps[i]) - j - 1, hyps[i][j]] - score += r_decoder_out[i, len(hyps[i]), self.eos] - r_decoder_scores.append(score) - r_decoder_scores = torch.tensor(r_decoder_scores, device=device) - - am_scores = nbest.compute_am_scores() - ngram_lm_scores = nbest.compute_lm_scores() - tot_scores = am_scores.values + lm_scale * ngram_lm_scores.values + \ - decoder_scale * decoder_scores + r_decoder_scale * r_decoder_scores - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.jit.export - def subsampling_rate(self) -> int: - """ Export interface for c++ call, return subsampling_rate of the - model - """ - return self.encoder.embed.subsampling_rate - - @torch.jit.export - def right_context(self) -> int: - """ Export interface for c++ call, return right_context of the model - """ - return self.encoder.embed.right_context - - @torch.jit.export - def sos_symbol(self) -> int: - """ Export interface for c++ call, return sos symbol id of the model - """ - return self.sos - - @torch.jit.export - def eos_symbol(self) -> int: - """ Export interface for c++ call, return eos symbol id of the model - """ - return self.eos - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, give input chunk xs, and return - output from time 0 to current chunk. - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor: - """ Export interface for c++ call, apply linear transform and log - softmax before ctc - Args: - xs (torch.Tensor): encoder output - - Returns: - torch.Tensor: activation before ctc - - """ - return self.ctc.log_softmax(xs) - - @torch.jit.export - def is_bidirectional_decoder(self) -> bool: - """ - Returns: - torch.Tensor: decoder output - """ - if hasattr(self.decoder, 'right_decoder'): - return True - else: - return False - - @torch.jit.export - def forward_attention_decoder( - self, - hyps: torch.Tensor, - hyps_lens: torch.Tensor, - encoder_out: torch.Tensor, - reverse_weight: float = 0, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, forward decoder with multiple - hypothesis from ctc prefix beam search and one encoder output - Args: - hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad sos at the begining - hyps_lens (torch.Tensor): length of each hyp in hyps - encoder_out (torch.Tensor): corresponding encoder output - r_hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad eos at the begining which is used fo right to left decoder - reverse_weight: used for verfing whether used right to left decoder, - > 0 will use. - - Returns: - torch.Tensor: decoder output - """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps - encoder_out = encoder_out.repeat(num_hyps, 1, 1) - encoder_mask = torch.ones(num_hyps, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=encoder_out.device) - - # input for right to left decoder - # this hyps_lens has count token, we need minus it. - r_hyps_lens = hyps_lens - 1 - # this hyps has included token, so it should be - # convert the original hyps. - r_hyps = hyps[:, 1:] - # >>> r_hyps - # >>> tensor([[ 1, 2, 3], - # >>> [ 9, 8, 4], - # >>> [ 2, -1, -1]]) - # >>> r_hyps_lens - # >>> tensor([3, 3, 1]) - - # NOTE(Mddct): `pad_sequence` is not supported by ONNX, it is used - # in `reverse_pad_list` thus we have to refine the below code. 
- # Issue: https://github.com/wenet-e2e/wenet/issues/1113 - # Equal to: - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = torch.max(r_hyps_lens) - index_range = torch.arange(0, max_len, 1).to(encoder_out.device) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - # >>> seq_mask - # >>> tensor([[ True, True, True], - # >>> [ True, True, True], - # >>> [ True, False, False]]) - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - r_hyps = torch.gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = torch.where(seq_mask, r_hyps, self.eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - r_hyps = torch.cat([hyps[:, 0:1], r_hyps], dim=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps, hyps_lens, r_hyps, - reverse_weight) # (num_hyps, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - - # right to left decoder may be not used during decoding process, - # which depends on reverse_weight param. - # r_dccoder_out will be 0.0, if reverse_weight is 0.0 - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - return decoder_out, r_decoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/attention.py deleted file mode 100644 index 6ee5e313edf2e88a844ce004c0f819b0bd3260f6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/attention.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple - -import torch -from torch import nn - - -class MultiHeadedAttention(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, n_head: int, n_feat: int, dropout_rate: float): - """Construct an MultiHeadedAttention object.""" - super().__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward_qkv( - self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor, size - (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor, size - (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor, size - (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. 
- - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - 1.When applying cross attention between decoder and encoder, - the batch padding mask for input is in (#batch, 1, T) shape. - 2.When applying self attention of encoder, - the mask is in (#batch, T, T) shape. - 3.When applying self attention of decoder, - the mask is in (#batch, L, L) shape. - 4.If the different position in decoder see different block - of the encoder, such as Mocha, the passed in mask could be - in (#batch, L, T) shape. But there is no such case in current - Wenet. - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - """ - q, k, v = self.forward_qkv(query, key, value) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. - new_cache = torch.cat((k, v), dim=-1) - - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - return self.forward_attention(v, scores, mask), new_cache - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). 
- zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/cmvn.py deleted file mode 100644 index 3a1e7457fd3788d9a7e031e96517505a65925102..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/cmvn.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -class GlobalCMVN(torch.nn.Module): - def __init__(self, - mean: torch.Tensor, - istd: torch.Tensor, - norm_var: bool = True): - """ - Args: - mean (torch.Tensor): mean stats - istd (torch.Tensor): inverse std, std which is 1.0 / std - """ - super().__init__() - assert mean.shape == istd.shape - self.norm_var = norm_var - # The buffer can be accessed from this module using self.mean - self.register_buffer("mean", mean) - self.register_buffer("istd", istd) - - def forward(self, x: torch.Tensor): - """ - Args: - x (torch.Tensor): (batch, max_len, feat_dim) - - Returns: - (torch.Tensor): normalized feature - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/convolution.py deleted file mode 100644 index 2cf9794e14ea7441ccd30ab52202ac02fb25c2b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/convolution.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). 
- """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. - new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/ctc.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/ctc.py deleted file mode 100644 index 3dfcbaa324ffc26afa9ceaeb75007eb312546326..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/ctc.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -import torch -import torch.nn.functional as F -from typeguard import check_argument_types - - -class CTC(torch.nn.Module): - """CTC module""" - def __init__( - self, - odim: int, - encoder_output_size: int, - dropout_rate: float = 0.0, - reduce: bool = True, - ): - """ Construct CTC module - Args: - odim: dimension of outputs - encoder_output_size: number of encoder projection units - dropout_rate: dropout rate (0.0 ~ 1.0) - reduce: reduce the CTC loss into a scalar - """ - assert check_argument_types() - super().__init__() - eprojs = encoder_output_size - self.dropout_rate = dropout_rate - self.ctc_lo = torch.nn.Linear(eprojs, odim) - - reduction_type = "sum" if reduce else "none" - self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) - - def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, - ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: - """Calculate CTC loss. 
- - Args: - hs_pad: batch of padded hidden state sequences (B, Tmax, D) - hlens: batch of lengths of hidden state sequences (B) - ys_pad: batch of padded character id sequence tensor (B, Lmax) - ys_lens: batch of lengths of character sequence (B) - """ - # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) - ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) - # ys_hat: (B, L, D) -> (L, B, D) - ys_hat = ys_hat.transpose(0, 1) - ys_hat = ys_hat.log_softmax(2) - loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) - # Batch-size average - loss = loss / ys_hat.size(1) - return loss - - def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """log_softmax of frame activations - - Args: - Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) - """ - return F.log_softmax(self.ctc_lo(hs_pad), dim=2) - - def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """argmax of frame activations - - Args: - torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: argmax applied 2d tensor (B, Tmax) - """ - return torch.argmax(self.ctc_lo(hs_pad), dim=2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/decoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/decoder.py deleted file mode 100644 index c31853d9e868c99290b8d597f53d9a680202c82c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/decoder.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Decoder definition.""" -from typing import Tuple, List, Optional - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.decoder_layer import DecoderLayer -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.utils.mask import (subsequent_mask, make_pad_mask) - - -class TransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. 
- concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__() - attention_dim = encoder_output_size - - if input_layer == "embed": - self.embed = torch.nn.Sequential( - torch.nn.Embedding(vocab_size, attention_dim), - PositionalEncoding(attention_dim, positional_dropout_rate), - ) - else: - raise ValueError(f"only 'embed' is supported: {input_layer}") - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) - self.use_output_layer = use_output_layer - self.output_layer = torch.nn.Linear(attention_dim, vocab_size) - self.num_blocks = num_blocks - self.decoders = torch.nn.ModuleList([ - DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(self.num_blocks) - ]) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor = torch.empty(0), - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: not used in transformer decoder, in order to unify api - with bidirectional decoder - reverse_weight: not used in transformer decoder, in order to unify - api with bidirectional decode - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - torch.tensor(0.0), in order to unify api with bidirectional decoder - olens: (batch, ) - """ - tgt = ys_in_pad - maxlen = tgt.size(1) - # tgt_mask: (B, 1, L) - tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) - tgt_mask = tgt_mask.to(tgt.device) - # m: (1, L, L) - m = subsequent_mask(tgt_mask.size(-1), - device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m - x, _ = self.embed(tgt) - for layer in self.decoders: - x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, - memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.use_output_layer: - x = self.output_layer(x) - olens = tgt_mask.sum(1) - return x, torch.tensor(0.0), olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - x, _ = self.embed(tgt) - new_cache = [] - for i, decoder in enumerate(self.decoders): - if cache is None: - c = None - else: - c = cache[i] - x, tgt_mask, memory, memory_mask = decoder(x, - tgt_mask, - memory, - memory_mask, - cache=c) - new_cache.append(x) - if self.normalize_before: - y = self.after_norm(x[:, -1]) - else: - y = x[:, -1] - if self.use_output_layer: - y = torch.log_softmax(self.output_layer(y), dim=-1) - return y, new_cache - - -class BiTransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - r_num_blocks: the number of right to left decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - r_num_blocks: int = 0, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - - assert check_argument_types() - super().__init__() - self.left_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - self.right_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - r_num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor, - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), - used for right to left decoder - reverse_weight: used for right to left decoder - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - r_x: x: decoded token score (right to left decoder) - before softmax (batch, maxlen_out, vocab_size) - if use_output_layer is True, - olens: (batch, ) - """ - l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, - ys_in_lens) - r_x = torch.tensor(0.0) - if reverse_weight > 0.0: - r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, - ys_in_lens) - return l_x, r_x, olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - return self.left_decoder.forward_one_step(memory, memory_mask, tgt, - tgt_mask, cache) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/decoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/decoder_layer.py deleted file mode 100644 index 6b52aa6ab730dc51b18f0787e8236ab10c1e9cad..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/decoder_layer.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoder self-attention layer definition.""" -from typing import Optional, Tuple - -import torch -from torch import nn - - -class DecoderLayer(nn.Module): - """Single decoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn (torch.nn.Module): Inter-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. 
- dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's inpu - and output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: nn.Module, - src_attn: nn.Module, - feed_forward: nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an DecoderLayer object.""" - super().__init__() - self.size = size - self.self_attn = self_attn - self.src_attn = src_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.norm3 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear1 = nn.Linear(size + size, size) - self.concat_linear2 = nn.Linear(size + size, size) - else: - self.concat_linear1 = nn.Identity() - self.concat_linear2 = nn.Identity() - - def forward( - self, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - memory: torch.Tensor, - memory_mask: torch.Tensor, - cache: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute decoded features. - - Args: - tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). - tgt_mask (torch.Tensor): Mask for input tensor - (#batch, maxlen_out). - memory (torch.Tensor): Encoded memory - (#batch, maxlen_in, size). - memory_mask (torch.Tensor): Encoded memory mask - (#batch, maxlen_in). - cache (torch.Tensor): cached tensors. - (#batch, maxlen_out - 1, size). - - Returns: - torch.Tensor: Output tensor (#batch, maxlen_out, size). - torch.Tensor: Mask for output tensor (#batch, maxlen_out). - torch.Tensor: Encoded memory (#batch, maxlen_in, size). - torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
- - """ - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - - if cache is None: - tgt_q = tgt - tgt_q_mask = tgt_mask - else: - # compute only the last frame query keeping dim: max_time_out -> 1 - assert cache.shape == ( - tgt.shape[0], - tgt.shape[1] - 1, - self.size, - ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" - tgt_q = tgt[:, -1:, :] - residual = residual[:, -1:, :] - tgt_q_mask = tgt_mask[:, -1:, :] - - if self.concat_after: - tgt_concat = torch.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) - x = residual + self.concat_linear1(tgt_concat) - else: - x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - if self.concat_after: - x_concat = torch.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) - x = residual + self.concat_linear2(x_concat) - else: - x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) - if not self.normalize_before: - x = self.norm2(x) - - residual = x - if self.normalize_before: - x = self.norm3(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm3(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, tgt_mask, memory, memory_mask diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/embedding.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/embedding.py deleted file mode 100644 index 611a927864d93c3ad8357f66c780bf537b2a4d67..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/embedding.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Positonal Encoding Module.""" - -import math -from typing import Tuple, Union - -import torch -import torch.nn.functional as F - -class PositionalEncoding(torch.nn.Module): - """Positional encoding. 
- - :param int d_model: embedding dim - :param float dropout_rate: dropout rate - :param int max_len: maximum input length - - PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) - PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) - """ - def __init__(self, - d_model: int, - dropout_rate: float, - max_len: int = 5000, - reverse: bool = False): - """Construct an PositionalEncoding object.""" - super().__init__() - self.d_model = d_model - self.xscale = math.sqrt(self.d_model) - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - - self.pe = torch.zeros(self.max_len, self.d_model) - position = torch.arange(0, self.max_len, - dtype=torch.float32).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = torch.sin(position * div_term) - self.pe[:, 1::2] = torch.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input. Its shape is (batch, time, ...) - offset (int, torch.tensor): position offset - - Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) - torch.Tensor: for compatibility to RelPositionalEncoding - """ - - self.pe = self.pe.to(x.device) - pos_emb = self.position_encoding(offset, x.size(1), False) - x = x * self.xscale + pos_emb - return self.dropout(x), self.dropout(pos_emb) - - def position_encoding(self, offset: Union[int, torch.Tensor], size: int, - apply_dropout: bool = True) -> torch.Tensor: - """ For getting encoding in a streaming fashion - - Attention!!!!! - we apply dropout only once at the whole utterance level in a none - streaming way, but will call this function several times with - increasing input size in a streaming scenario, so the dropout will - be applied several times. - - Args: - offset (int or torch.tensor): start offset - size (int): required size of position encoding - - Returns: - torch.Tensor: Corresponding encoding - """ - # How to subscript a Union type: - # https://github.com/pytorch/pytorch/issues/69434 - if isinstance(offset, int): - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - else: # for batched streaming decoding on GPU - assert torch.max(offset) + size < self.max_len - index = offset.unsqueeze(1) + \ - torch.arange(0, size).to(offset.device) # B X T - flag = index > 0 - # remove negative offset - index = index * flag - pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model - - if apply_dropout: - pos_emb = self.dropout(pos_emb) - return pos_emb - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding module. - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - """ - def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): - """Initialize class.""" - super().__init__(d_model, dropout_rate, max_len, reverse=True) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Compute positional encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). 
- Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Positional embedding tensor (1, time, `*`). - """ - self.pe = self.pe.to(x.device) - x = x * self.xscale - pos_emb = self.position_encoding(offset, x.size(1), False) - return self.dropout(x), self.dropout(pos_emb) - - -class NoPositionalEncoding(torch.nn.Module): - """ No position encoding - """ - def __init__(self, d_model: int, dropout_rate: float): - super().__init__() - self.d_model = d_model - self.dropout = torch.nn.Dropout(p=dropout_rate) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """ Just return zero vector for interface compatibility - """ - pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) - return self.dropout(x), pos_emb - - def position_encoding( - self, offset: Union[int, torch.Tensor], size: int) -> torch.Tensor: - return torch.zeros(1, size, self.d_model) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/encoder.py deleted file mode 100644 index bb2ec65827548bd1242cb3b367cb3983c2de6119..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/encoder.py +++ /dev/null @@ -1,462 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder definition.""" -from typing import Tuple - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.convolution import ConvolutionModule -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.encoder_layer import TransformerEncoderLayer -from wenet.transformer.encoder_layer import ConformerEncoderLayer -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class BaseEncoder(torch.nn.Module): - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ - Args: - input_size (int): input dim - output_size (int): dimension of attention - attention_heads (int): the number of heads of multi head attention - linear_units (int): the hidden units number of position-wise feed - forward - num_blocks (int): the number of decoder blocks - dropout_rate (float): dropout rate - attention_dropout_rate (float): dropout rate in attention - positional_dropout_rate (float): dropout rate after adding - positional encoding - input_layer (str): input layer type. - optional [linear, conv2d, conv2d6, conv2d8] - pos_enc_layer_type (str): Encoder positional encoding layer type. - opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] - normalize_before (bool): - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after (bool): whether to concat attention layer's input - and output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - static_chunk_size (int): chunk size for static chunk training and - decoding - use_dynamic_chunk (bool): whether use dynamic chunk size for - training or not, You can only use fixed chunk(chunk_size > 0) - or dyanmic chunk size(use_dynamic_chunk = True) - global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module - use_dynamic_left_chunk (bool): whether use dynamic left chunk in - dynamic chunk training - """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks - - -class TransformerEncoder(BaseEncoder): - """Transformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ Construct TransformerEncoder - - See Encoder for the meaning of each parameter. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - self.encoders = torch.nn.ModuleList([ - TransformerEncoderLayer( - output_size, - MultiHeadedAttention(attention_heads, output_size, - attention_dropout_rate), - PositionwiseFeedForward(output_size, linear_units, - dropout_rate), dropout_rate, - normalize_before, concat_after) for _ in range(num_blocks) - ]) - - -class ConformerEncoder(BaseEncoder): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - positionwise_conv_kernel_size: int = 1, - macaron_style: bool = True, - selfattention_layer_type: str = "rel_selfattn", - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - ): - """Construct ConformerEncoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - positionwise_conv_kernel_size (int): Kernel size of positionwise - conv1d layer. - macaron_style (bool): Whether to use macaron style for - positionwise layer. - selfattention_layer_type (str): Encoder attention layer type, - the parameter has no effect now, it's just for configure - compatibility. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, - cnn_module_norm, causal) - - self.encoders = torch.nn.ModuleList([ - ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(num_blocks) - ]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/encoder_layer.py deleted file mode 100644 index 6b4629a6802a90422fa1494f82f46488f2553c16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/encoder_layer.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple - -import torch -from torch import nn - - -class TransformerEncoderLayer(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: torch.nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): just for interface compatibility - to ConformerEncoderLayer - mask_pad (torch.Tensor): does not used in transformer layer, - just for unified api with conformer. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2), not used here, it's for interface - compatibility to ConformerEncoderLayer. - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). - - """ - residual = x - if self.normalize_before: - x = self.norm1(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, cache=att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - return x, mask, new_att_cache, fake_cnn_cache - - -class ConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/label_smoothing_loss.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/label_smoothing_loss.py deleted file mode 100644 index 428fedcb0eb4345cd1361c97008a9afcd94ac171..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/label_smoothing_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Label smoothing module.""" - -import torch -from torch import nn - - -class LabelSmoothingLoss(nn.Module): - """Label-smoothing loss. - - In a standard CE loss, the label's data distribution is: - [0,1,2] -> - [ - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0], - ] - - In the smoothing version CE Loss,some probabilities - are taken from the true label prob (1.0) and are divided - among other labels. - - e.g. 
- smoothing=0.1 - [0,1,2] -> - [ - [0.9, 0.05, 0.05], - [0.05, 0.9, 0.05], - [0.05, 0.05, 0.9], - ] - - Args: - size (int): the number of class - padding_idx (int): padding class id which will be ignored for loss - smoothing (float): smoothing rate (0.0 means the conventional CE) - normalize_length (bool): - normalize loss by sequence length if True - normalize loss by batch size if False - """ - def __init__(self, - size: int, - padding_idx: int, - smoothing: float, - normalize_length: bool = False): - """Construct an LabelSmoothingLoss object.""" - super(LabelSmoothingLoss, self).__init__() - self.criterion = nn.KLDivLoss(reduction="none") - self.padding_idx = padding_idx - self.confidence = 1.0 - smoothing - self.smoothing = smoothing - self.size = size - self.normalize_length = normalize_length - - def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """Compute loss between x and target. - - The model outputs and data labels tensors are flatten to - (batch*seqlen, class) shape and a mask is applied to the - padding part which should not be calculated for loss. - - Args: - x (torch.Tensor): prediction (batch, seqlen, class) - target (torch.Tensor): - target signal masked with self.padding_id (batch, seqlen) - Returns: - loss (torch.Tensor) : The KL loss, scalar float value - """ - assert x.size(2) == self.size - batch_size = x.size(0) - x = x.view(-1, self.size) - target = target.view(-1) - # use zeros_like instead of torch.no_grad() for true_dist, - # since no_grad() can not be exported by JIT - true_dist = torch.zeros_like(x) - true_dist.fill_(self.smoothing / (self.size - 1)) - ignore = target == self.padding_idx # (B,) - total = len(target) - ignore.sum().item() - target = target.masked_fill(ignore, 0) # avoid -1 index - true_dist.scatter_(1, target.unsqueeze(1), self.confidence) - kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) - denom = total if self.normalize_length else batch_size - return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/positionwise_feed_forward.py deleted file mode 100644 index 73ba239e3f1e68f65650961f2c4ee6758729a06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/positionwise_feed_forward.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. 
- dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU()): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/subsampling.py deleted file mode 100644 index 5f2823eedf0e623188d6af6680fa50ca44b47877..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/subsampling.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch - - -class BaseSubsampling(torch.nn.Module): - def __init__(self): - super().__init__() - self.right_context = 0 - self.subsampling_rate = 1 - - def position_encoding(self, offset: Union[int, torch.Tensor], - size: int) -> torch.Tensor: - return self.pos_enc.position_encoding(offset, size) - - -class LinearNoSubsampling(BaseSubsampling): - """Linear transform the input without subsampling - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an linear object.""" - super().__init__() - self.out = torch.nn.Sequential( - torch.nn.Linear(idim, odim), - torch.nn.LayerNorm(odim, eps=1e-5), - torch.nn.Dropout(dropout_rate), - ) - self.pos_enc = pos_enc_class - self.right_context = 0 - self.subsampling_rate = 1 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Input x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: linear input tensor (#batch, time', odim), - where time' = time . - torch.Tensor: linear input mask (#batch, 1, time'), - where time' = time . - - """ - x = self.out(x) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask - - -class Conv2dSubsampling4(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). 
- - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 4. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 4. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] - - -class Conv2dSubsampling6(BaseSubsampling): - """Convolutional 2D subsampling (to 1/6 length). - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (torch.nn.Module): Custom position encoding layer. - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling6 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 5, 3), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), - odim) - self.pos_enc = pos_enc_class - # 10 = (3 - 1) * 1 + (5 - 1) * 2 - self.subsampling_rate = 6 - self.right_context = 10 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 6. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 6. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] - - -class Conv2dSubsampling8(BaseSubsampling): - """Convolutional 2D subsampling (to 1/8 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling8 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear( - odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) - self.pos_enc = pos_enc_class - self.subsampling_rate = 8 - # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 - self.right_context = 14 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 8. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 8. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/swish.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/swish.py deleted file mode 100644 index b4250f5c93104f38958d145572e363256e03fcb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/transformer/swish.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) -# 2020 Northwestern Polytechnical University (Pengcheng Guo) -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Swish() activation function for Conformer.""" - -import torch - - -class Swish(torch.nn.Module): - """Construct an Swish object.""" - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Return Swish activation function.""" - return x * torch.sigmoid(x) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/checkpoint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/checkpoint.py deleted file mode 100644 index 8e0c413c79c34cd667240357d7ef9eab816a885c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/checkpoint.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import re - -import yaml -import torch -from collections import OrderedDict - -import datetime - - -def load_checkpoint(model: torch.nn.Module, path: str) -> dict: - if torch.cuda.is_available(): - logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) - checkpoint = torch.load(path) - else: - logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) - checkpoint = torch.load(path, map_location='cpu') - model.load_state_dict(checkpoint, strict=False) - info_path = re.sub('.pt$', '.yaml', path) - configs = {} - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - return configs - - -def save_checkpoint(model: torch.nn.Module, path: str, infos=None): - ''' - Args: - infos (dict or None): any info you want to save. - ''' - logging.info('Checkpoint: save to checkpoint %s' % path) - if isinstance(model, torch.nn.DataParallel): - state_dict = model.module.state_dict() - elif isinstance(model, torch.nn.parallel.DistributedDataParallel): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save(state_dict, path) - info_path = re.sub('.pt$', '.yaml', path) - if infos is None: - infos = {} - infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') - with open(info_path, 'w') as fout: - data = yaml.dump(infos) - fout.write(data) - - -def filter_modules(model_state_dict, modules): - new_mods = [] - incorrect_mods = [] - mods_model = model_state_dict.keys() - for mod in modules: - if any(key.startswith(mod) for key in mods_model): - new_mods += [mod] - else: - incorrect_mods += [mod] - if incorrect_mods: - logging.warning( - "module(s) %s don't match or (partially match) " - "available modules in model.", - incorrect_mods, - ) - logging.warning("for information, the existing modules in model are:") - logging.warning("%s", mods_model) - - return new_mods - - -def load_trained_modules(model: torch.nn.Module, args: None): - # Load encoder modules with pre-trained model(s). 
- enc_model_path = args.enc_init - enc_modules = args.enc_init_mods - main_state_dict = model.state_dict() - logging.warning("model(s) found for pre-initialization") - if os.path.isfile(enc_model_path): - logging.info('Checkpoint: loading from checkpoint %s for CPU' % - enc_model_path) - model_state_dict = torch.load(enc_model_path, map_location='cpu') - modules = filter_modules(model_state_dict, enc_modules) - partial_state_dict = OrderedDict() - for key, value in model_state_dict.items(): - if any(key.startswith(m) for m in modules): - partial_state_dict[key] = value - main_state_dict.update(partial_state_dict) - else: - logging.warning("model was not found : %s", enc_model_path) - - model.load_state_dict(main_state_dict) - configs = {} - return configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/cmvn.py deleted file mode 100644 index 3101c619f54991c947124f393f3459c317356a2f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/cmvn.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import math - -import numpy as np - - -def _load_json_cmvn(json_cmvn_file): - """ Load the json format cmvn stats file and calculate cmvn - - Args: - json_cmvn_file: cmvn stats file in json format - - Returns: - a numpy array of [means, vars] - """ - with open(json_cmvn_file) as f: - cmvn_stats = json.load(f) - - means = cmvn_stats['mean_stat'] - variance = cmvn_stats['var_stat'] - count = cmvn_stats['frame_num'] - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def _load_kaldi_cmvn(kaldi_cmvn_file): - """ Load the kaldi format cmvn stats file and calculate cmvn - - Args: - kaldi_cmvn_file: kaldi text style global cmvn file, which - is generated by: - compute-cmvn-stats --binary=false scp:feats.scp global_cmvn - - Returns: - a numpy array of [means, vars] - """ - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def load_cmvn(cmvn_file, is_json): - if is_json: - cmvn = _load_json_cmvn(cmvn_file) - else: - cmvn = _load_kaldi_cmvn(cmvn_file) - return cmvn[0], cmvn[1] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/common.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/common.py deleted file mode 100644 index 74238d59aefbf227fe6b811703af17550bc7f8f0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/common.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -"""Unility functions for Transformer.""" - -import math -from typing import List, Tuple - -import torch -from torch.nn.utils.rnn import pad_sequence - -IGNORE_ID = -1 - - -def pad_list(xs: List[torch.Tensor], pad_value: int): - """Perform padding for the list of tensors. - - Args: - xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 
- pad_value (float): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tmax, `*`). - - Examples: - >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) - - """ - n_batch = len(xs) - max_len = max([x.size(0) for x in xs]) - pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) - pad = pad.fill_(pad_value) - for i in range(n_batch): - pad[i, :xs[i].size(0)] = xs[i] - - return pad - - -def add_blank(ys_pad: torch.Tensor, blank: int, - ignore_id: int) -> torch.Tensor: - """ Prepad blank for transducer predictor - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - blank (int): index of - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> blank = 0 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in = add_blank(ys_pad, 0, -1) - >>> ys_in - tensor([[0, 1, 2, 3, 4, 5], - [0, 4, 5, 6, 0, 0], - [0, 7, 8, 9, 0, 0]]) - """ - bs = ys_pad.size(0) - _blank = torch.tensor([blank], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _blank = _blank.repeat(bs).unsqueeze(1) # [bs,1] - out = torch.cat([_blank, ys_pad], dim=1) # [bs, Lmax+1] - return torch.where(out == ignore_id, blank, out) - - -def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, - ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Add and labels. - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - sos (int): index of - eos (int): index of - ignore_id (int): index of padding - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - ys_out (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> sos_id = 10 - >>> eos_id = 11 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) - >>> ys_in - tensor([[10, 1, 2, 3, 4, 5], - [10, 4, 5, 6, 11, 11], - [10, 7, 8, 9, 11, 11]]) - >>> ys_out - tensor([[ 1, 2, 3, 4, 5, 11], - [ 4, 5, 6, 11, -1, -1], - [ 7, 8, 9, 11, -1, -1]]) - """ - _sos = torch.tensor([sos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _eos = torch.tensor([eos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys - ys_in = [torch.cat([_sos, y], dim=0) for y in ys] - ys_out = [torch.cat([y, _eos], dim=0) for y in ys] - return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) - - -def reverse_pad_list(ys_pad: torch.Tensor, - ys_lens: torch.Tensor, - pad_value: float = -1.0) -> torch.Tensor: - """Reverse padding for the list of tensors. - - Args: - ys_pad (tensor): The padded tensor (B, Tokenmax). - ys_lens (tensor): The lens of token seqs (B) - pad_value (int): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tokenmax). - - Examples: - >>> x - tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) - >>> pad_list(x, 0) - tensor([[4, 3, 2, 1], - [7, 6, 5, 0], - [9, 8, 0, 0]]) - - """ - r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) - for y, i in zip(ys_pad, ys_lens)], True, - pad_value) - return r_ys_pad - - -def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, - ignore_label: int) -> float: - """Calculate accuracy. - - Args: - pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 
- pad_targets (LongTensor): Target label tensors (B, Lmax). - ignore_label (int): Ignore label id. - - Returns: - float: Accuracy value (0.0 - 1.0). - - """ - pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), - pad_outputs.size(1)).argmax(2) - mask = pad_targets != ignore_label - numerator = torch.sum( - pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - denominator = torch.sum(mask) - return float(numerator) / float(denominator) - - -def get_rnn(rnn_type: str) -> torch.nn.Module: - assert rnn_type in ["rnn", "lstm", "gru"] - if rnn_type == "rnn": - return torch.nn.RNN - elif rnn_type == "lstm": - return torch.nn.LSTM - else: - return torch.nn.GRU - - -def get_activation(act): - """Return activation function.""" - # Lazy load to avoid unused import - from wenet.transformer.swish import Swish - - activation_funcs = { - "hardtanh": torch.nn.Hardtanh, - "tanh": torch.nn.Tanh, - "relu": torch.nn.ReLU, - "selu": torch.nn.SELU, - "swish": getattr(torch.nn, "SiLU", Swish), - "gelu": torch.nn.GELU - } - - return activation_funcs[act]() - - -def get_subsample(config): - input_layer = config["encoder_conf"]["input_layer"] - assert input_layer in ["conv2d", "conv2d6", "conv2d8"] - if input_layer == "conv2d": - return 4 - elif input_layer == "conv2d6": - return 6 - elif input_layer == "conv2d8": - return 8 - - -def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - if hyp[cur] != 0: - new_hyp.append(hyp[cur]) - prev = cur - while cur < len(hyp) and hyp[cur] == hyp[prev]: - cur += 1 - return new_hyp - - -def replace_duplicates_with_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - new_hyp.append(hyp[cur]) - prev = cur - cur += 1 - while cur < len(hyp) and hyp[cur] == hyp[prev] and hyp[cur] != 0: - new_hyp.append(0) - cur += 1 - return new_hyp - - -def log_add(args: List[int]) -> float: - """ - Stable log add - """ - if all(a == -float('inf') for a in args): - return -float('inf') - a_max = max(args) - lsp = math.log(sum(math.exp(a - a_max) for a in args)) - return a_max + lsp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/config.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/config.py deleted file mode 100644 index 50170ced44534d3ee6532a2f87fcd78c5148f7e7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 Shaoshang Qi -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import copy - -def override_config(configs, override_list): - new_configs = copy.deepcopy(configs) - for item in override_list: - arr = item.split() - if len(arr) != 2: - print(f"the overrive {item} format not correct, skip it") - continue - keys = arr[0].split('.') - s_configs = new_configs - for i, key in enumerate(keys): - if key not in s_configs: - print(f"the overrive {item} format not correct, skip it") - if i == len(keys) - 1: - param_type = type(s_configs[key]) - if param_type != bool: - s_configs[key] = param_type(arr[1]) - else: - s_configs[key] = arr[1] in ['true', 'True'] - print(f"override {arr[0]} with {arr[1]}") - else: - s_configs = s_configs[key] - return new_configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/ctc_util.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/ctc_util.py deleted file mode 100644 index 73b8fb272ac153dd6d05207f352ebcf1ad14890d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/ctc_util.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch - -def insert_blank(label, blank_id=0): - """Insert blank token between every two label token.""" - label = np.expand_dims(label, 1) - blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id - label = np.concatenate([blanks, label], axis=1) - label = label.reshape(-1) - label = np.append(label, label[0]) - return label - -def forced_align(ctc_probs: torch.Tensor, - y: torch.Tensor, - blank_id=0) -> list: - """ctc forced alignment. 
- - Args: - torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) - torch.Tensor y: id sequence tensor 1d tensor (L) - int blank_id: blank symbol index - Returns: - torch.Tensor: alignment result - """ - y_insert_blank = insert_blank(y, blank_id) - - log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) - log_alpha = log_alpha - float('inf') # log of zero - state_path = (torch.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 - ) # state path - - # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] - - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): - if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ - s] == y_insert_blank[s - 2]: - candidates = torch.tensor( - [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) - prev_state = [s, s - 1] - else: - candidates = torch.tensor([ - log_alpha[t - 1, s], - log_alpha[t - 1, s - 1], - log_alpha[t - 1, s - 2], - ]) - prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] - state_path[t, s] = prev_state[torch.argmax(candidates)] - - state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) - - candidates = torch.tensor([ - log_alpha[-1, len(y_insert_blank) - 1], - log_alpha[-1, len(y_insert_blank) - 2] - ]) - prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] - state_seq[-1] = prev_state[torch.argmax(candidates)] - for t in range(ctc_probs.size(0) - 2, -1, -1): - state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] - - output_alignment = [] - for t in range(0, ctc_probs.size(0)): - output_alignment.append(y_insert_blank[state_seq[t, 0]]) - - return output_alignment diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/executor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/executor.py deleted file mode 100644 index dc0b69e6e32055566a0e8c41945f6979276e5672..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/executor.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -from contextlib import nullcontext - -# if your python version < 3.7 use the below one -# from contextlib import suppress as nullcontext -import torch -from torch.nn.utils import clip_grad_norm_ - - -class Executor: - - def __init__(self): - self.step = 0 - - def train(self, model, optimizer, scheduler, data_loader, device, writer, - args, scaler): - ''' Train one epoch - ''' - model.train() - clip = args.get('grad_clip', 50.0) - log_interval = args.get('log_interval', 10) - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - accum_grad = args.get('accum_grad', 1) - is_distributed = args.get('is_distributed', True) - use_amp = args.get('use_amp', False) - logging.info('using accumulate grad, new batch size is {} times' - ' larger than before'.format(accum_grad)) - if use_amp: - assert scaler is not None - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - model_context = model.join - else: - model_context = nullcontext - num_seen_utts = 0 - with model_context(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - context = None - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if is_distributed and batch_idx % accum_grad != 0: - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - with context(): - # autocast context - # The more details about amp can be found in - # https://pytorch.org/docs/stable/notes/amp_examples.html - with torch.cuda.amp.autocast(scaler is not None): - loss_dict = model(feats, feats_lengths, target, - target_lengths) - loss = loss_dict['loss'] / accum_grad - if use_amp: - scaler.scale(loss).backward() - else: - loss.backward() - - num_seen_utts += num_utts - if batch_idx % accum_grad == 0: - if rank == 0 and writer is not None: - writer.add_scalar('train_loss', loss, self.step) - # Use mixed precision training - if use_amp: - scaler.unscale_(optimizer) - grad_norm = clip_grad_norm_(model.parameters(), clip) - # Must invoke scaler.update() if unscale_() is used in - # the iteration to avoid the following error: - # RuntimeError: unscale_() has already been called - # on this optimizer since the last update(). - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). 
- scaler.step(optimizer) - scaler.update() - else: - grad_norm = clip_grad_norm_(model.parameters(), clip) - if torch.isfinite(grad_norm): - optimizer.step() - optimizer.zero_grad() - scheduler.step() - self.step += 1 - if batch_idx % log_interval == 0: - lr = optimizer.param_groups[0]['lr'] - log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, - loss.item() * accum_grad) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'lr {:.8f} rank {}'.format(lr, rank) - logging.debug(log_str) - - def cv(self, model, data_loader, device, args): - ''' Cross validation on - ''' - model.eval() - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - log_interval = args.get('log_interval', 10) - # in order to avoid division by 0 - num_seen_utts = 1 - total_loss = 0.0 - with torch.no_grad(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - loss_dict = model(feats, feats_lengths, target, target_lengths) - loss = loss_dict['loss'] - if torch.isfinite(loss): - num_seen_utts += num_utts - total_loss += loss.item() * num_utts - if batch_idx % log_interval == 0: - log_str = 'CV Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, loss.item()) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'history loss {:.6f}'.format(total_loss / - num_seen_utts) - log_str += ' rank {}'.format(rank) - logging.debug(log_str) - return total_loss, num_seen_utts diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/file_utils.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/file_utils.py deleted file mode 100644 index 7b7e516cc61f759267f4ef09309ff0b45110a0c1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/file_utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re - - -def read_lists(list_file): - lists = [] - with open(list_file, 'r', encoding='utf8') as fin: - for line in fin: - lists.append(line.strip()) - return lists - - -def read_non_lang_symbols(non_lang_sym_path): - """read non-linguistic symbol from file. - - The file format is like below: - - {NOISE}\n - {BRK}\n - ... - - - Args: - non_lang_sym_path: non-linguistic symbol file path, None means no any - syms. 
- - """ - if non_lang_sym_path is None: - return None - else: - syms = read_lists(non_lang_sym_path) - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - for sym in syms: - if non_lang_syms_pattern.fullmatch(sym) is None: - class BadSymbolFormat(Exception): - pass - raise BadSymbolFormat( - "Non-linguistic symbols should be " - "formatted in {xxx}//[xxx], consider" - " modify '%s' to meet the requirment. " - "More details can be found in discussions here : " - "https://github.com/wenet-e2e/wenet/pull/819" % (sym)) - return syms - - -def read_symbol_table(symbol_table_file): - symbol_table = {} - with open(symbol_table_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - symbol_table[arr[0]] = int(arr[1]) - return symbol_table diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/init_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/init_model.py deleted file mode 100644 index 4a008183ee25cd88b2fa25d93bdc3f9e3a55d31a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/init_model.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -from wenet.transducer.joint import TransducerJoint -from wenet.transducer.predictor import (ConvPredictor, EmbeddingPredictor, - RNNPredictor) -from wenet.transducer.transducer import Transducer -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.cmvn import GlobalCMVN -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.encoder import ConformerEncoder, TransformerEncoder -from wenet.squeezeformer.encoder import SqueezeformerEncoder -from wenet.efficient_conformer.encoder import EfficientConformerEncoder -from wenet.utils.cmvn import load_cmvn - - -def init_model(configs): - if configs['cmvn_file'] is not None: - mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) - global_cmvn = GlobalCMVN( - torch.from_numpy(mean).float(), - torch.from_numpy(istd).float()) - else: - global_cmvn = None - - input_dim = configs['input_dim'] - vocab_size = configs['output_dim'] - - encoder_type = configs.get('encoder', 'conformer') - decoder_type = configs.get('decoder', 'bitransformer') - - if encoder_type == 'conformer': - encoder = ConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'squeezeformer': - encoder = SqueezeformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'efficientConformer': - encoder = EfficientConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf'], - **configs['encoder_conf']['efficient_conf'] - if 'efficient_conf' in - configs['encoder_conf'] else {}) - else: - encoder = TransformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - if decoder_type == 'transformer': - decoder = TransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - else: - assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 - assert configs['decoder_conf']['r_num_blocks'] > 0 - decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - ctc = CTC(vocab_size, encoder.output_size()) - - # Init joint CTC/Attention or Transducer model - if 'predictor' in configs: - predictor_type = configs.get('predictor', 'rnn') - if predictor_type == 'rnn': - predictor = RNNPredictor(vocab_size, **configs['predictor_conf']) - elif predictor_type == 'embedding': - predictor = EmbeddingPredictor(vocab_size, - **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - elif predictor_type == 'conv': - predictor = ConvPredictor(vocab_size, **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - else: - raise NotImplementedError( - "only rnn, embedding and conv type support now") - configs['joint_conf']['enc_output_size'] = configs['encoder_conf'][ - 'output_size'] - configs['joint_conf']['pred_output_size'] = configs['predictor_conf'][ - 'output_size'] - joint = TransducerJoint(vocab_size, **configs['joint_conf']) - model = Transducer(vocab_size=vocab_size, - blank=0, - predictor=predictor, - encoder=encoder, - attention_decoder=decoder, - joint=joint, - ctc=ctc, - **configs['model_conf']) - else: - model = ASRModel(vocab_size=vocab_size, - encoder=encoder, - decoder=decoder, - ctc=ctc, - **configs['model_conf']) - return model diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/mask.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/mask.py deleted file mode 100644 index 2985006ab2bc2d27a9b8adaeb863cc44ca6a0d24..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/mask.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -''' -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. - - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - ret = torch.ones(size, size, device=device, dtype=torch.bool) - return torch.tril(ret) -''' - -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. 
- - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - arange = torch.arange(size, device=device) - mask = arange.expand(size, size) - arange = arange.unsqueeze(-1) - mask = mask <= arange - return mask - - -def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size) with chunk size, - this is for streaming encoder - - Args: - size (int): size of mask - chunk_size (int): size of chunk - num_left_chunks (int): number of left chunks - <0: use full chunk - >=0: use num_left_chunks - device (torch.device): "cpu" or "cuda" or torch.Tensor.device - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_chunk_mask(4, 2) - [[1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]] - """ - ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) - ret[i, start:ending] = True - return ret - - -def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, - use_dynamic_chunk: bool, - use_dynamic_left_chunk: bool, - decoding_chunk_size: int, static_chunk_size: int, - num_decoding_left_chunks: int): - """ Apply optional mask for encoder. - - Args: - xs (torch.Tensor): padded input, (B, L, D), L for max length - mask (torch.Tensor): mask for xs, (B, 1, L) - use_dynamic_chunk (bool): whether to use dynamic chunk or not - use_dynamic_left_chunk (bool): whether to use dynamic left chunk for - training. - decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - static_chunk_size (int): chunk size for static chunk training/decoding - if it's greater than 0, if use_dynamic_chunk is true, - this parameter will be ignored - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. - >=0: use num_decoding_left_chunks - <0: use all left chunks - - Returns: - torch.Tensor: chunk mask of the input xs. - """ - # Whether to use chunk mask or not - if use_dynamic_chunk: - max_len = xs.size(1) - if decoding_chunk_size < 0: - chunk_size = max_len - num_left_chunks = -1 - elif decoding_chunk_size > 0: - chunk_size = decoding_chunk_size - num_left_chunks = num_decoding_left_chunks - else: - # chunk size is either [1, 25] or full context(max_len). - # Since we use 4 times subsampling and allow up to 1s(100 frames) - # delay, the maximum frame is 100 / 4 = 25. 
- chunk_size = torch.randint(1, max_len, (1, )).item() - num_left_chunks = -1 - if chunk_size > max_len // 2: - chunk_size = max_len - else: - chunk_size = chunk_size % 25 + 1 - if use_dynamic_left_chunk: - max_left_chunks = (max_len - 1) // chunk_size - num_left_chunks = torch.randint(0, max_left_chunks, - (1, )).item() - chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - elif static_chunk_size > 0: - num_left_chunks = num_decoding_left_chunks - chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - else: - chunk_masks = masks - return chunk_masks - - -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: Mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - batch_size = lengths.size(0) - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = torch.arange(0, - max_len, - dtype=torch.int64, - device=lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand - return mask - - -def make_non_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """Make mask tensor containing indices of non-padded part. - - The sequences in a batch may have different lengths. To enable - batch computing, padding is need to make all sequence in same - size. To avoid the padding part pass value to context dependent - block such as attention or convolution , this padding part is - masked. - - This pad_mask is used in both encoder and decoder. - - 1 for non-padded part and 0 for padded part. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] - """ - return ~make_pad_mask(lengths) - - -def mask_finished_scores(score: torch.Tensor, - flag: torch.Tensor) -> torch.Tensor: - """ - If a sequence is finished, we only allow one alive branch. This function - aims to give one branch a zero score and the rest -inf score. - - Args: - score (torch.Tensor): A real value array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size, beam_size). 
- """ - beam_size = score.size(-1) - zero_mask = torch.zeros_like(flag, dtype=torch.bool) - if beam_size > 1: - unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])), - dim=1) - finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])), - dim=1) - else: - unfinished = zero_mask - finished = flag - score.masked_fill_(unfinished, -float('inf')) - score.masked_fill_(finished, 0) - return score - - -def mask_finished_preds(pred: torch.Tensor, flag: torch.Tensor, - eos: int) -> torch.Tensor: - """ - If a sequence is finished, all of its branch should be - - Args: - pred (torch.Tensor): A int array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size). - """ - beam_size = pred.size(-1) - finished = flag.repeat([1, beam_size]) - return pred.masked_fill_(finished, eos) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/scheduler.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/scheduler.py deleted file mode 100644 index c418a731dec0041a238787bbba23102dba8db5e5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell4/s0/wenet/utils/scheduler.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -# NeMo(https://github.com/NVIDIA/NeMo) - -from typing import Union - -import math -import warnings -import torch -from torch.optim.lr_scheduler import _LRScheduler - -from typeguard import check_argument_types - - -class WarmupLR(_LRScheduler): - """The WarmupLR scheduler - - This scheduler is almost same as NoamLR Scheduler except for following - difference: - - NoamLR: - lr = optimizer.lr * model_size ** -0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - WarmupLR: - lr = optimizer.lr * warmup_step ** 0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - - Note that the maximum lr equals to optimizer.lr in this scheduler. 
- - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_steps: Union[int, float] = 25000, - last_epoch: int = -1, - ): - assert check_argument_types() - self.warmup_steps = warmup_steps - - # __init__() must be invoked before setting field - # because step() is also invoked in __init__() - super().__init__(optimizer, last_epoch) - - def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" - - def get_lr(self): - step_num = self.last_epoch + 1 - if self.warmup_steps == 0: - return [ - lr * step_num ** -0.5 - for lr in self.base_lrs - ] - else: - return [ - lr - * self.warmup_steps ** 0.5 - * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) - for lr in self.base_lrs - ] - - def set_step(self, step: int): - self.last_epoch = step - - -class WarmupPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1): - assert not (warmup_steps is not None and warmup_ratio is not None),\ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class SquareRootConstantPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use particular number of step or ratio" - assert constant_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. 
- self.max_steps = max_steps - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.constant_lr = 1 / (constant_steps ** 0.5) - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.constant_steps: - return [self.constant_lr for _ in self.base_lrs] - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class WarmupHoldPolicy(WarmupPolicy): - """Variant of WarmupPolicy which maintains high - learning rate for a defined number of steps. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - hold_steps=None, - hold_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (hold_steps is not None and hold_ratio is not None), \ - "Either use particular number of step or ratio" - assert hold_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - self.min_lr = min_lr - self._last_warmup_lr = 0.0 - - # Necessary to duplicate as class attributes are hidden in inner class - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if hold_steps is not None: - self.hold_steps = hold_steps + self.warmup_steps - elif hold_ratio is not None: - self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps - else: - self.hold_steps = 0 - - super().__init__( - optimizer, - warmup_steps=warmup_steps, - warmup_ratio=warmup_ratio, - max_steps=max_steps, - last_epoch=last_epoch, - min_lr=min_lr, - ) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed by the scheduler," - " " "please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup phase - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - # Hold phase - if (step >= self.warmup_steps) and (step < self.hold_steps): - return self.base_lrs - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - -class WarmupAnnealHoldPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - min_lr: Minimum lr to hold the learning rate after decay at. - constant_steps: Number of steps to keep lr constant at. 
- constant_ratio: Ratio of steps to keep lr constant. - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - constant_steps=None, - constant_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use constant_steps or constant_ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup steps - if self.warmup_steps > 0 and step <= self.warmup_steps: - return self._get_warmup_lr(step) - - # Constant steps after warmup and decay - if self.constant_steps > 0 and ( - self.warmup_steps + self.decay_steps) < step <= self.max_steps: - return self._get_constant_lr(step) - - # Min lr after max steps of updates - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_constant_lr(self, step): - return [self.min_lr for _ in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -def _squareroot_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 0.5 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _square_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 2 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _cosine_annealing(initial_lr, step, max_steps, min_lr): - mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) - out_lr = (initial_lr - min_lr) * mult + min_lr - return out_lr - - -def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, - decay_steps, min_lr): - assert max_lr > min_lr - # Use linear warmup for the initial part. - if warmup_steps > 0 and step <= warmup_steps: - return max_lr * float(step) / float(warmup_steps) - - # For any steps larger than `decay_steps`, use `min_lr`. - if step > warmup_steps + decay_steps: - return min_lr - - # If we are done with the warmup period, use the decay style. 
- num_steps_ = step - warmup_steps - decay_steps_ = decay_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = max_lr - min_lr - - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - - return min_lr + coeff * delta_lr - - -def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): - if cycle: - multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) - decay_steps *= multiplier - else: - step = min(step, decay_steps) - p = step / decay_steps - lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) - lr += min_lr - return lr - - -def _noam_hold_annealing(initial_lr, step, warmup_steps, - hold_steps, decay_rate, min_lr): - # hold_steps = total number of steps - # to hold the LR, not the warmup + hold steps. - T_warmup_decay = max(1, warmup_steps ** decay_rate) - T_hold_decay = max(1, (step - hold_steps) ** decay_rate) - lr = (initial_lr * T_warmup_decay) / T_hold_decay - lr = max(lr, min_lr) - return lr - - -class SquareAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _square_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class SquareRootAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _squareroot_annealing(initial_lr=initial_lr, step=step, - max_steps=self.max_steps, min_lr=self.min_lr) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - if self.constant_steps is None or self.constant_steps == 0: - new_lrs = [ - _cosine_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - else: - new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) - return new_lrs - - def _get_warmup_lr(self, step): - if self.constant_steps is None or self.constant_steps == 0: - return super()._get_warmup_lr(step) - else: - # Use linear warmup for the initial part. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_constant_lr(self, step): - # Only called when `constant_steps` > 0. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_linear_warmup_with_cosine_annealing_lr(self, step): - # Cosine Schedule for Megatron LM, - # slightly different warmup schedule + constant LR at the end. 
- new_lrs = [ - _linear_warmup_with_cosine_annealing( - max_lr=self.base_lrs[0], - warmup_steps=self.warmup_steps, - step=step, - decay_steps=self.decay_steps, - min_lr=self.min_lr, - ) - for _ in self.base_lrs - ] - return new_lrs - - -class NoamAnnealing(_LRScheduler): - def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - self._normalize = d_model ** (-0.5) - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = max(1, self.last_epoch) - - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - new_lrs = [self._noam_annealing(initial_lr=initial_lr, step=step) for - initial_lr in self.base_lrs] - return new_lrs - - def _noam_annealing(self, initial_lr, step): - if self.warmup_steps > 0: - mult = self._normalize * min(step ** (-0.5), - step * (self.warmup_steps ** (-1.5))) - else: - mult = self._normalize * step ** (-0.5) - - out_lr = initial_lr * mult - if step > self.warmup_steps: - out_lr = max(out_lr, self.min_lr) - return out_lr - - -class NoamHoldAnnealing(WarmupHoldPolicy): - def __init__(self, optimizer, *, max_steps, decay_rate=0.5, min_lr=0.0, - last_epoch=-1, **kwargs): - """ - From Nemo: - Implementation of the Noam Hold Annealing policy - from the SqueezeFormer paper. - - Unlike NoamAnnealing, the peak learning rate - can be explicitly set for this scheduler. - The schedule first performs linear warmup, - then holds the peak LR, then decays with some schedule for - the remainder of the steps. - Therefore the min-lr is still dependent - on the hyper parameters selected. - - It's schedule is determined by three factors- - - Warmup Steps: Initial stage, where linear warmup - occurs uptil the peak LR is reached. Unlike NoamAnnealing, - the peak LR is explicitly stated here instead of a scaling factor. - - Hold Steps: Intermediate stage, where the peak LR - is maintained for some number of steps. In this region, - the high peak LR allows the model to converge faster - if training is stable. However the high LR - may also cause instability during training. - Should usually be a significant fraction of training - steps (around 30-40% of the entire training steps). - - Decay Steps: Final stage, where the LR rapidly decays - with some scaling rate (set by decay rate). - To attain Noam decay, use 0.5, - for Squeezeformer recommended decay, use 1.0. - The fast decay after prolonged high LR during - hold phase allows for rapid convergence. 
- - References: - - [Squeezeformer: - An Efficient Transformer for Automatic Speech Recognition] - (https://arxiv.org/abs/2206.00888) - - Args: - optimizer: Pytorch compatible Optimizer object. - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - decay_rate: Float value describing the polynomial decay - after the hold period. Default value - of 0.5 corresponds to Noam decay. - min_lr: Minimum learning rate. - """ - self.decay_rate = decay_rate - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - if self.warmup_steps is None or self.warmup_steps == 0: - raise ValueError( - "Noam scheduler cannot be used without warmup steps") - - if self.hold_steps > 0: - hold_steps = self.hold_steps - self.warmup_steps - else: - hold_steps = 0 - - new_lrs = [ - _noam_hold_annealing( - initial_lr, - step=step, - warmup_steps=self.warmup_steps, - hold_steps=hold_steps, - decay_rate=self.decay_rate, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - def set_step(self, step: int): - self.last_epoch = step diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/README.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/README.md deleted file mode 100644 index a65bfa961eb95eea182e28bc424724d290f26df3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/README.md +++ /dev/null @@ -1,12 +0,0 @@ -# Performance Record - -## Conformer Result - -* Feature info: dither + specaug + speed perturb -* Training info: lr 0.0005, batch size 8, 1 gpu, acc_grad 4, 80 epochs -* Decoding info: average_num 10 - -| decoding mode | dt05_real_1ch | dt05_simu_1ch | et05_real_1ch | et05_simu_1ch | -|:----------------------:|:-------------:|:-------------:|:-------------:|:-------------:| -| ctc_prefix_beam_search | 19.06% | 21.17% | 28.39% | 29.16% | -| attention_rescoring | 17.92% | 20.22% | 27.40% | 28.25% | diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/conf/train_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/conf/train_conformer.yaml deleted file mode 100644 index 49aa0bcab0e37dc22aae19c276a75d6c4b157625..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/conf/train_conformer.yaml +++ /dev/null @@ -1,78 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 512 # dimension of attention - attention_heads: 8 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 8 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - 
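A minimal usage sketch for the `NoamHoldAnnealing` scheduler removed earlier in this patch, assuming PyTorch is available and the class is importable; the import path and every hyperparameter below are illustrative, not values taken from the recipe.

```python
import torch
# Hypothetical import path; the class lives in this repo's scheduler module.
from wenet.utils.scheduler import NoamHoldAnnealing

model = torch.nn.Linear(80, 256)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)  # lr is the peak LR

scheduler = NoamHoldAnnealing(
    optimizer,
    max_steps=10_000,
    warmup_steps=1_000,  # linear ramp up to the peak LR
    hold_steps=3_000,    # ~30% of training at the peak, as the docstring suggests
    decay_rate=0.5,      # 0.5 -> Noam-style decay, 1.0 -> Squeezeformer setting
    min_lr=1e-5,
)

for _ in range(10_000):
    optimizer.step()     # a real loop would compute a loss and backprop first
    scheduler.step()
```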
self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -dataset_conf: - split_with_space: true - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 40 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 8 - -grad_clip: 10 -accum_grad: 4 -max_epoch: 80 -log_interval: 200 - -optim: adam -optim_conf: - lr: 0.0005 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 20000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/chime4_format_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/chime4_format_dir.sh deleted file mode 100644 index 118b950e6e34a7cc262f5586ed153e94174df927..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/chime4_format_dir.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# wujian@2020 - -set -eu - -echo "$0: Formating chime4 data dir..." - -track=isolated_1ch_track -data_dir=data/chime4 - -mkdir -p $data_dir/{train,dev} - -cat $data_dir/tr05_{simu,real}_noisy/wav.scp $data_dir/tr05_orig_clean/wav.scp \ - $data_dir/train_si200_wsj1_clean/wav.scp | sort -k1 > $data_dir/train/wav.scp -cat $data_dir/tr05_{simu,real}_noisy/text $data_dir/tr05_orig_clean/text \ - $data_dir/train_si200_wsj1_clean/text | sort -k1 > $data_dir/train/text - -cat $data_dir/dt05_{real,simu}_${track}/wav.scp | sort -k1 > $data_dir/dev/wav.scp -cat $data_dir/dt05_{real,simu}_${track}/text | sort -k1 > $data_dir/dev/text - -echo "$0: Format $data_dir done" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/chime4_gen_wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/chime4_gen_wav.sh deleted file mode 100644 index 7beca665efbba2ead04c88a2f5ac6a1c3b8b2a11..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/chime4_gen_wav.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash - -# wujian@2020 - -set -eu - -[ $# -ne 2 ] && echo "Script format error: $0 " && exit 0 - -data_dir=$1 -dump_dir=$2 - -mkdir -p $dump_dir - -num_utts=$(cat $data_dir/wav.scp | wc -l) -echo "Orginal utterances (.wav + .wv1): $num_utts" - -# cat $data_dir/wav.scp | grep "sph2pipe" | \ -# awk -v dir=$dump_dir '{printf("%s -f wav %s %s/%s.wav\n", $2, $5, dir, $1)}' | bash - -cat $data_dir/wav.scp | grep -v "sph2pipe" > $data_dir/raw_wav.scp -find $dump_dir -name "*.wav" | awk -F '/' '{printf("%s %s\n", $NF, $0)}' | \ - sed 's:\.wav::' > $data_dir/sph_wav.scp - -cat $data_dir/{raw_wav,sph_wav}.scp | sort -k1 > $data_dir/wav.scp -num_utts=$(cat $data_dir/wav.scp | wc -l) -echo "Wave utterances (.wav): $num_utts" - -echo "$0: Generate wav => $dump_dir done" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/clean_wsj0_data_prep.sh 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/clean_wsj0_data_prep.sh deleted file mode 100644 index 45798dd244690742ead80e3278b738323014f850..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/clean_wsj0_data_prep.sh +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0. - -# Modified from Kaldi's chime4 recipe - -set -eu - -dataset=chime4 - -. ./tools/parse_options.sh || exit 1; - -if [ $# -ne 1 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` - echo "The argument should be a the top-level WSJ corpus directory." - echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory" - echo "within the top-level corpus directory." - exit 1; -fi - -wsj0=$1 - -srcdir=$PWD/data/chime4/local -dstdir=$PWD/data/$dataset -local=$PWD/local -utils=$PWD/utils -sph2pipe=sph2pipe - -if [ ! `which sph2pipe` ]; then - echo "Could not find sph2pipe, install it first..." - mkdir -p exp && cd exp && wget https://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz - tar -zxf sph2pipe_v2.5.tar.gz && cd sph2pipe_v2.5 - gcc -o sph2pipe *.c -lm && cd .. && rm -rf sph2pipe_v2.5.tar.gz - sph2pipe=$PWD/sph2pipe_v2.5/sph2pipe - cd .. -fi - -mkdir -p $srcdir && cd $srcdir - -# This version for SI-84 -cat $wsj0/wsj0/doc/indices/train/tr_s_wv1.ndx \ - | $local/cstr_ndx2flist.pl $wsj0 | sort -u > tr05.flist - -# Now for the test sets. -# $wsj0/wsj1/doc/indices/readme.doc -# describes all the different test sets. -# Note: each test-set seems to come in multiple versions depending -# on different vocabulary sizes, verbalized vs. non-verbalized -# pronunciations, etc. We use the largest vocab and non-verbalized -# pronunciations. -# The most normal one seems to be the "baseline 60k test set", which -# is h1_p0. - -# Nov'92 (330 utts, 5k vocab) -cat $wsj0/wsj0/doc/indices/test/nvp/si_et_05.ndx | \ - $local/cstr_ndx2flist.pl $wsj0 | sort > et05.flist - -# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt. -# Sometimes this gets copied from the CD's with upcasing, don't know -# why (could be older versions of the disks). -find $wsj0/wsj0/si_dt_05 -print | grep -i ".wv1" | sort > dt05.flist - -# Finding the transcript files: -find -L $wsj0 -iname '*.dot' > dot_files.flist - -# Convert the transcripts into our format (no normalization yet) -# adding suffix to utt_id -# 0 for clean condition -for x in tr05 et05 dt05; do - $local/flist2scp.pl $x.flist | sort > ${x}_sph_tmp.scp - cat ${x}_sph_tmp.scp | awk '{print $1}' \ - | $local/find_transcripts.pl dot_files.flist > ${x}_tmp.trans1 - cat ${x}_sph_tmp.scp | awk '{printf("%s %s\n", $1, $2);}' > ${x}_sph.scp - cat ${x}_tmp.trans1 | awk '{printf("%s ", $1); for(i=2;i<=NF;i++) printf("%s ", $i); printf("\n");}' > ${x}.trans1 -done - -# Do some basic normalization steps. At this point we don't remove OOVs-- -# that will be done inside the training scripts, as we'd like to make the -# data-preparation stage independent of the specific lexicon used. -noiseword=""; -for x in tr05 et05 dt05; do - cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ - | sort > $x.txt || exit 1; -done - -# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.) -for x in tr05 et05 dt05; do - awk -v cmd=$sph2pipe '{printf("%s %s -f wav %s |\n", $1, cmd, $2);}' ${x}_sph.scp > ${x}_wav.scp -done - -if [ ! 
-f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then - rm -f wsj0-train-spkrinfo.txt - wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt \ - || ( echo "Getting wsj0-train-spkrinfo.txt from backup location" && \ - wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt ); -fi - -if [ ! -f wsj0-train-spkrinfo.txt ]; then - echo "Could not get the spkrinfo.txt file from LDC website (moved)?" - echo "This is possibly omitted from the training disks; couldn't find it." - echo "Everything else may have worked; we just may be missing gender info" - echo "which is only needed for VTLN-related diagnostics anyway." - exit 1 -fi -# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the -# LDC put it on the web. Perhaps it was accidentally omitted from the -# disks. - -cat $wsj0/wsj0/doc/spkrinfo.txt \ - ./wsj0-train-spkrinfo.txt | \ - perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \ - awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender - -# return back -cd - - -for x in et05 dt05 tr05; do - mkdir -p $dstdir/${x}_orig_clean - cp $srcdir/$x.txt $dstdir/${x}_orig_clean/text || exit 1 - cp $srcdir/${x}_wav.scp $dstdir/${x}_orig_clean/wav.scp || exit 1 -done - -echo "Data preparation succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/clean_wsj1_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/clean_wsj1_data_prep.sh deleted file mode 100644 index 9043879da801bc08cd5d7294f8e1c5e8ed51aa93..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/clean_wsj1_data_prep.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0. - -set -eu - - -if [ $# -ne 1 ]; then - echo "Arguments should be WSJ1 directory" - exit 1; -fi - -wsj1=$1 -dir=$PWD/data/chime4/local -odir=$PWD/data/chime4 -mkdir -p $dir -local=$PWD/local -sph2pipe=sph2pipe - -if [ ! `which sph2pipe` ]; then - echo "Could not find sph2pipe, install it first..." - mkdir -p exp && cd exp && wget https://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz - tar -zxf sph2pipe_v2.5.tar.gz && cd sph2pipe_v2.5 - gcc -o sph2pipe *.c -lm && cd .. && rm -rf sph2pipe_v2.5.tar.gz - sph2pipe=$PWD/sph2pipe_v2.5/sph2pipe - cd .. -fi - -cd $dir -# This version for SI-200 -cat $wsj1/13-34.1/wsj1/doc/indices/si_tr_s.ndx | \ - $local/ndx2flist.pl $wsj1/??-{?,??}.? | sort > train_si200.flist - -nl=`cat train_si200.flist | wc -l` -[ "$nl" -eq 30278 ] || echo "Warning: expected 30278 lines in train_si200.flist, got $nl" - -# Dev-set for Nov'93 (503 utts) -cat $wsj1/13-34.1/wsj1/doc/indices/h1_p0.ndx | \ - $local/ndx2flist.pl $wsj1/??-{?,??}.? | sort > test_dev93.flist - -# Finding the transcript files: -for x in $wsj1/??-{?,??}.?; do find -L $x -iname '*.dot'; done > dot_files.flist - -# Convert the transcripts into our format (no normalization yet) -for x in train_si200 test_dev93; do - $local/flist2scp.pl $x.flist | sort > ${x}_sph.scp - cat ${x}_sph.scp | awk '{print $1}' | $local/find_transcripts.pl dot_files.flist > $x.trans1 -done - -# Do some basic normalization steps. At this point we don't remove OOVs-- -# that will be done inside the training scripts, as we'd like to make the -# data-preparation stage independent of the specific lexicon used. 
-noiseword=""; -for x in train_si200 test_dev93; do - cat $x.trans1 | $local/normalize_transcript.pl $noiseword | sort > $x.txt || exit 1; -done - -# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.) -for x in train_si200 test_dev93; do - awk -v cmd=$sph2pipe '{printf("%s %s -f wav %s |\n", $1, cmd, $2);}' ${x}_sph.scp > ${x}_wav.scp -done - -# return back -cd - - -for x in train_si200 test_dev93; do - mkdir -p $odir/${x}_wsj1_clean - cp $dir/$x.txt $odir/${x}_wsj1_clean/text || exit 1 - cp $dir/${x}_wav.scp $odir/${x}_wsj1_clean/wav.scp || exit 1 -done - -echo "Data preparation WSJ1 succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/cstr_ndx2flist.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/cstr_ndx2flist.pl deleted file mode 100644 index 79daa1a99db992c5893a9d762fa4ef757b16dc76..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/cstr_ndx2flist.pl +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env perl - -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This is modified from the script in standard Kaldi recipe to account -# for the way the WSJ data is structured on the Edinburgh systems. -# - Arnab Ghoshal, 12/1/12 - -# This program takes as its standard input an .ndx file from the WSJ corpus that looks -# like this: -#;; File: tr_s_wv1.ndx, updated 04/26/94 -#;; -#;; Index for WSJ0 SI-short Sennheiser training data -#;; Data is read WSJ sentences, Sennheiser mic. -#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts -#;; per speaker TI) = 7236 utts -#;; -#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1 -#11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1 -#11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1 - -# and as command-line argument it takes the names of the WSJ disk locations, e.g.: -# /group/corpora/public/wsjcam0/data on DICE machines. -# It outputs a list of absolute pathnames. - -$wsj_dir = $ARGV[0]; - -while(){ - if(m/^;/){ next; } # Comment. Ignore it. - else { - m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_"; - $filename = $2; # as a subdirectory of the distributed disk. 
- if ($filename !~ m/\.wv1$/) { $filename .= ".wv1"; } - $filename = "$wsj_dir/$filename"; - if (-e $filename) { - print "$filename\n"; - } else { - print STDERR "File $filename found in the index but not on disk\n"; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/find_noisy_transcripts.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/find_noisy_transcripts.pl deleted file mode 100644 index d24ae32668840dafddef768622aa234dc3d396f6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/find_noisy_transcripts.pl +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - - -# This program takes on its standard input a list of utterance -# id's, one for each line. (e.g. 4k0c030a is a an utterance id). -# It takes as -# Extracts from the dot files the transcripts for a given -# dataset (represented by a file list). -# - -@ARGV == 1 || die "find_transcripts.pl dot_files_flist < utterance_ids > transcripts"; -$dot_flist = shift @ARGV; - -open(L, "<$dot_flist") || die "Opening file list of dot files: $dot_flist\n"; -while(){ - chop; - m:\S+/(\w{6})00.dot: || die "Bad line in dot file list: $_"; - $spk = $1; - $spk2dot{$spk} = $_; -} - - - -while(){ - chop; - $uttid_orig = $_; - $uttid = substr $uttid_orig, 0, 8; - $uttid =~ m:(\w{6})\w\w: || die "Bad utterance id $_"; - $spk = $1; - if($spk ne $curspk) { - %utt2trans = { }; # Don't keep all the transcripts in memory... - $curspk = $spk; - $dotfile = $spk2dot{$spk}; - defined $dotfile || die "No dot file for speaker $spk\n"; - open(F, "<$dotfile") || die "Error opening dot file $dotfile\n"; - while() { - $_ =~ m:(.+)\((\w{8})\)\s*$: || die "Bad line $_ in dot file $dotfile (line $.)\n"; - $trans = $1; - $utt = $2; - $utt2trans{$utt} = $trans; - } - } - if(!defined $utt2trans{$uttid}) { - print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n"; - } else { - print "$uttid_orig $utt2trans{$uttid}\n"; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/find_transcripts.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/find_transcripts.pl deleted file mode 100644 index 8884e4f811e2cb76b3d07511368f3ceb4ac17a43..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/find_transcripts.pl +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - - -# This program takes on its standard input a list of utterance -# id's, one for each line. (e.g. 4k0c030a is a an utterance id). -# It takes as -# Extracts from the dot files the transcripts for a given -# dataset (represented by a file list). -# - -@ARGV == 1 || die "find_transcripts.pl dot_files_flist < utterance_ids > transcripts"; -$dot_flist = shift @ARGV; - -open(L, "<$dot_flist") || die "Opening file list of dot files: $dot_flist\n"; -while(){ - chop; - m:\S+/(\w{6})00.dot: || die "Bad line in dot file list: $_"; - $spk = $1; - $spk2dot{$spk} = $_; -} - - - -while(){ - chop; - $uttid = $_; - $uttid =~ m:(\w{6})\w\w: || die "Bad utterance id $_"; - $spk = $1; - if($spk ne $curspk) { - %utt2trans = { }; # Don't keep all the transcripts in memory... - $curspk = $spk; - $dotfile = $spk2dot{$spk}; - defined $dotfile || die "No dot file for speaker $spk\n"; - open(F, "<$dotfile") || die "Error opening dot file $dotfile\n"; - while() { - $_ =~ m:(.+)\((\w{8})\)\s*$: || die "Bad line $_ in dot file $dotfile (line $.)\n"; - $trans = $1; - $utt = $2; - $utt2trans{$utt} = $trans; - } - } - if(!defined $utt2trans{$uttid}) { - print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n"; - } else { - print "$uttid $utt2trans{$uttid}\n"; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/flist2scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/flist2scp.pl deleted file mode 100644 index 7edf1e3f1f44e4ac3b97b39361a46ba8c453c88d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/flist2scp.pl +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# takes in a file list with lines like -# /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1 -# and outputs an scp in kaldi format with lines like -# 4k0c030a /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1 -# (the first thing is the utterance-id, which is the same as the basename of the file. 
- - -while(<>){ - m:^\S+/(\w+)\.[wW][vV]1$: || die "Bad line $_"; - $id = $1; - $id =~ tr/A-Z/a-z/; # Necessary because of weirdness on disk 13-16.1 (uppercase filenames) - print "$id $_"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/ndx2flist.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/ndx2flist.pl deleted file mode 100644 index c5f676affcd11ba1c6411c013c76841d65d776bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/ndx2flist.pl +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This program takes as its standard input an .ndx file from the WSJ corpus that looks -# like this: -#;; File: tr_s_wv1.ndx, updated 04/26/94 -#;; -#;; Index for WSJ0 SI-short Sennheiser training data -#;; Data is read WSJ sentences, Sennheiser mic. -#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts -#;; per speaker TI) = 7236 utts -#;; -#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1 -#11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1 -#11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1 - -#and as command-line arguments it takes the names of the WSJ disk locations, e.g.: -#/mnt/matylda2/data/WSJ0/11-1.1 /mnt/matylda2/data/WSJ0/11-10.1 ... etc. -# It outputs a list of absolute pathnames (it does this by replacing e.g. 11_1_1 with -# /mnt/matylda2/data/WSJ0/11-1.1. -# It also does a slight fix because one of the WSJ disks (WSJ1/13-16.1) was distributed with -# uppercase rather than lower case filenames. - -foreach $fn (@ARGV) { - $fn =~ m:.+/([0-9\.\-]+)/?$: || die "Bad command-line argument $fn\n"; - $disk_id=$1; - $disk_id =~ tr/-\./__/; # replace - and . with - so 11-10.1 becomes 11_10_1 - $fn =~ s:/$::; # Remove final slash, just in case it is present. - $disk2fn{$disk_id} = $fn; -} - -while(){ - if(m/^;/){ next; } # Comment. Ignore it. - else { - m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_"; - $disk=$1; - if(!defined $disk2fn{$disk}) { - die "Disk id $disk not found"; - } - $filename = $2; # as a subdirectory of the distributed disk. - if($disk eq "13_16_1" && `hostname` =~ m/fit.vutbr.cz/) { - # The disk 13-16.1 has been uppercased for some reason, on the - # BUT system. This is a fix specifically for that case. - $filename =~ tr/a-z/A-Z/; # This disk contains all uppercase filenames. Why? 
- } - print "$disk2fn{$disk}/$filename\n"; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/normalize_transcript.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/normalize_transcript.pl deleted file mode 100644 index 6b18d43d26ff42e550b4b05eb77c4b4301c249c0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/normalize_transcript.pl +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This takes data from the standard input that's unnormalized transcripts in the format -# 4k2c0308 Of course there isn\'t any guarantee the company will keep its hot hand [misc_noise] -# 4k2c030a [loud_breath] And new hardware such as the set of personal computers I\. B\. M\. introduced last week can lead to unexpected changes in the software business [door_slam] -# and outputs normalized transcripts. -# c.f. /mnt/matylda2/data/WSJ0/11-10.1/wsj0/transcrp/doc/dot_spec.doc - -@ARGV == 1 || die "usage: normalize_transcript.pl noise_word < transcript > transcript2"; -$noise_word = shift @ARGV; - -while() { - $_ =~ m:^(\S+) (.+): || die "bad line $_"; - $utt = $1; - $trans = $2; - print "$utt"; - foreach $w (split (" ",$trans)) { - $w =~ tr:a-z:A-Z:; # Upcase everything to match the CMU dictionary. . - $w =~ s:\\::g; # Remove backslashes. We don't need the quoting. - $w =~ s:^\%PERCENT$:PERCENT:; # Normalization for Nov'93 test transcripts. - $w =~ s:^\.POINT$:POINT:; # Normalization for Nov'93 test transcripts. - if($w =~ m:^\[\<\w+\]$: || # E.g. [\]$: || # E.g. [door_slam>], this means a door slammed in the next word. Delete. - $w =~ m:\[\w+/\]$: || # E.g. [phone_ring/], which indicates the start of this phenomenon. - $w =~ m:\[\/\w+]$: || # E.g. [/phone_ring], which indicates the end of this phenomenon. - $w eq "~" || # This is used to indicate truncation of an utterance. Not a word. - $w eq ".") { # "." is used to indicate a pause. Silence is optional anyway so not much - # point including this in the transcript. - next; # we won't print this word. - } elsif($w =~ m:\[\w+\]:) { # Other noises, e.g. [loud_breath]. - print " $noise_word"; - } elsif($w =~ m:^\<([\w\']+)\>$:) { - # e.g. replace with and. (the <> means verbal deletion of a word).. but it's pronounced. - print " $1"; - } elsif($w eq "--DASH") { - print " -DASH"; # This is a common issue; the CMU dictionary has it as -DASH. -# } elsif($w =~ m:(.+)\-DASH$:) { # E.g. INCORPORATED-DASH... 
seems the DASH gets combined with previous word -# print " $1 -DASH"; - } else { - print " $w"; - } - } - print "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/real_enhan_chime4_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/real_enhan_chime4_data_prep.sh deleted file mode 100644 index ea93343adf0bf6d68a6f54476841c3a8b628cdcc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/real_enhan_chime4_data_prep.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env bash -set -e - -# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0. - -# This is modified from the script in standard Kaldi recipe to account -# for the way the WSJ data is structured on the Edinburgh systems. -# - Arnab Ghoshal, 29/05/12 - -# Modified from the script for CHiME2 baseline -# Shinji Watanabe 02/13/2015 - -# Config: -eval_flag=true # make it true when the evaluation data are released - -. tools/parse_options.sh || exit 1; - -if [ $# -ne 2 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` - echo "The argument should be a the directory that only contains enhanced speech data." - exit 1; -fi - -echo "$0 $@" # Print the command line for logging - -enhan=$1 -audio_dir=$2 - -dir=$PWD/data/chime4/local -mkdir -p $dir -local=$PWD/local -utils=$PWD/utils -odir=$PWD/data/chime4 - -if $eval_flag; then -list_set="tr05_real_$enhan dt05_real_$enhan et05_real_$enhan" -else -list_set="tr05_real_$enhan dt05_real_$enhan" -fi - -cd $dir - -find $audio_dir/ -name '*.wav' | grep 'tr05_bus_real\|tr05_caf_real\|tr05_ped_real\|tr05_str_real' | sort -u > tr05_real_$enhan.flist -find $audio_dir/ -name '*.wav' | grep 'dt05_bus_real\|dt05_caf_real\|dt05_ped_real\|dt05_str_real' | sort -u > dt05_real_$enhan.flist -if $eval_flag; then -find $audio_dir/ -name '*.wav' | grep 'et05_bus_real\|et05_caf_real\|et05_ped_real\|et05_str_real' | sort -u > et05_real_$enhan.flist -fi - -# make a scp file from file list -for x in $list_set; do - cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_REAL/' > ${x}_wav.ids - paste -d" " ${x}_wav.ids $x.flist | sort -k 1 > ${x}_wav.scp -done - -#make a transcription from dot -cat tr05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_REAL"}'> tr05_real_$enhan.ids -cat tr05_real.dot | sed -e 's/(.*)//' > tr05_real_$enhan.txt -paste -d" " tr05_real_$enhan.ids tr05_real_$enhan.txt | sort -k 1 > tr05_real_$enhan.trans1 -cat dt05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_REAL"}'> dt05_real_$enhan.ids -cat dt05_real.dot | sed -e 's/(.*)//' > dt05_real_$enhan.txt -paste -d" " dt05_real_$enhan.ids dt05_real_$enhan.txt | sort -k 1 > dt05_real_$enhan.trans1 -if $eval_flag; then -cat et05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_REAL"}'> et05_real_$enhan.ids -cat et05_real.dot | sed -e 's/(.*)//' > et05_real_$enhan.txt -paste -d" " et05_real_$enhan.ids et05_real_$enhan.txt | sort -k 1 > et05_real_$enhan.trans1 -fi - -# Do some basic normalization steps. At this point we don't remove OOVs-- -# that will be done inside the training scripts, as we'd like to make the -# data-preparation stage independent of the specific lexicon used. -noiseword=""; -for x in $list_set;do - cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ - | sort > $x.txt || exit 1; -done - -# copying data to data/... 
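The awk/sed/paste pipeline earlier in this script pairs each enhanced wav file with an utterance id derived from its basename plus a condition suffix. A rough Python equivalent of that pairing, sketched for illustration only (the helper and the file name are made up):

```python
import os
from pathlib import Path


def make_wav_scp(wav_paths, suffix="_REAL"):
    """Build sorted 'utt_id /abs/path.wav' lines, Kaldi wav.scp style."""
    entries = []
    for p in wav_paths:
        utt_id = Path(p).name.replace(".wav", suffix)
        entries.append(f"{utt_id} {os.path.abspath(p)}")
    return sorted(entries)  # mirrors `sort -k 1` on the pasted ids


if __name__ == "__main__":
    print("\n".join(make_wav_scp(["dt05_bus_real/F01_22GC010X_BUS.wav"])))
```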
-for x in $list_set; do - mkdir -p $odir/$x - cp ${x}_wav.scp $odir/$x/wav.scp || exit 1; - cp ${x}.txt $odir/$x/text || exit 1; -done - -echo "Data preparation succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/real_noisy_chime4_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/real_noisy_chime4_data_prep.sh deleted file mode 100644 index aeb3b0314bbaa021577bdf1e0ba519468d8b666f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/real_noisy_chime4_data_prep.sh +++ /dev/null @@ -1,115 +0,0 @@ -#!/usr/bin/env bash - -set -eu - -# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0. - -# This is modified from the script in standard Kaldi recipe to account -# for the way the WSJ data is structured on the Edinburgh systems. -# - Arnab Ghoshal, 29/05/12 - -# Modified from the script for CHiME2 baseline -# Shinji Watanabe 02/13/2015 -# Modified to use data of six channels -# Szu-Jui Chen 09/29/2017 - -# Config: -eval_flag=true # make it true when the evaluation data are released - -. tools/parse_options.sh || exit 1 - -if [ $# -ne 1 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` - echo "The argument should be a the top-level Chime4 directory." - echo "It is assumed that there will be a 'data' subdirectory" - echo "within the top-level corpus directory." - exit 1; -fi - -echo "$0 $@" # Print the command line for logging - -audio_dir=$1/data/audio/16kHz/isolated/ -trans_dir=$1/data/transcriptions - -echo "extract all channels (CH[1-6].wav) for noisy data" - -dir=$PWD/data/chime4/local -mkdir -p $dir -local=$PWD/local - -if $eval_flag; then -list_set="tr05_real_noisy dt05_real_noisy et05_real_noisy" -else -list_set="tr05_real_noisy dt05_real_noisy" -fi - -cd $dir - -find $audio_dir -name '*CH[1-6].wav' | grep 'tr05_bus_real\|tr05_caf_real\|tr05_ped_real\|tr05_str_real' | sort -u > tr05_real_noisy.flist -find $audio_dir -name '*CH[1-6].wav' | grep 'dt05_bus_real\|dt05_caf_real\|dt05_ped_real\|dt05_str_real' | sort -u > dt05_real_noisy.flist -if $eval_flag; then -find $audio_dir -name '*CH[1-6].wav' | grep 'et05_bus_real\|et05_caf_real\|et05_ped_real\|et05_str_real' | sort -u > et05_real_noisy.flist -fi - -# make a dot format from json annotation files -cp $trans_dir/tr05_real.dot_all tr05_real.dot -cp $trans_dir/dt05_real.dot_all dt05_real.dot -if $eval_flag; then -cp $trans_dir/et05_real.dot_all et05_real.dot -fi - -# make a scp temporary file from file list -for x in $list_set; do - cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_REAL/' > ${x}_wav.id.temp - cat ${x}_wav.id.temp | awk -F'_' '{print $3}' | awk -F'.' 
'{print $2}' > $x.ch - cat ${x}_wav.id.temp | awk -F'_' '{print $1}' > $x.part1 - cat ${x}_wav.id.temp | sed -e 's/^..._//' > $x.part2 - paste -d"_" $x.part1 $x.ch $x.part2 > ${x}_wav.ids - paste -d" " ${x}_wav.ids $x.flist | sort -t_ -k1,1 -k3 > ${x}_wav.scp.temp -done - -#make a transcription from dot -cat tr05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH1_REAL"}'> tr05_real_noisy.ids -cat tr05_real.dot | sed -e 's/(.*)//' > tr05_real_noisy.txt -paste -d" " tr05_real_noisy.ids tr05_real_noisy.txt | \ -awk '{print}{sub(/CH1/, "CH2",$0);print}{sub(/CH2/, "CH3",$0);print}{sub(/CH3/, "CH4",$0);print}{sub(/CH4/, "CH5",$0);print}{sub(/CH5/, "CH6",$0);print}' | \ -sort -k 1 > tr05_real_noisy.trans1 -cat dt05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH1_REAL"}'> dt05_real_noisy.ids -cat dt05_real.dot | sed -e 's/(.*)//' > dt05_real_noisy.txt -paste -d" " dt05_real_noisy.ids dt05_real_noisy.txt | \ -awk '{print}{sub(/CH1/, "CH2",$0);print}{sub(/CH2/, "CH3",$0);print}{sub(/CH3/, "CH4",$0);print}{sub(/CH4/, "CH5",$0);print}{sub(/CH5/, "CH6",$0);print}' | \ -sort -k 1 > dt05_real_noisy.trans1 -if $eval_flag; then -cat et05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH1_REAL"}'> et05_real_noisy.ids -cat et05_real.dot | sed -e 's/(.*)//' > et05_real_noisy.txt -paste -d" " et05_real_noisy.ids et05_real_noisy.txt | \ -awk '{print}{sub(/CH1/, "CH2",$0);print}{sub(/CH2/, "CH3",$0);print}{sub(/CH3/, "CH4",$0);print}{sub(/CH4/, "CH5",$0);print}{sub(/CH5/, "CH6",$0);print}' | \ -sort -k 1 > et05_real_noisy.trans1 -fi - -# Do some basic normalization steps. At this point we don't remove OOVs-- -# that will be done inside the training scripts, as we'd like to make the -# data-preparation stage independent of the specific lexicon used. -noiseword=""; -for x in $list_set;do - cat ${x}_wav.scp.temp | awk '{print $1}' > $x.txt.part1 - cat $x.trans1 | awk '{$1=""; print $0}' | sed 's/^[ \t]*//g' > $x.txt.part2 - paste -d" " $x.txt.part1 $x.txt.part2 > $x.trans1 - cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ - | sort > $x.txt || exit 1; -done - -# copying data to data/... -for x in $list_set; do - sort ${x}_wav.scp.temp > ${x}_wav.scp - mkdir -p ../../chime4/$x - cp ${x}_wav.scp ../../chime4/$x/wav.scp || exit 1; - cp ${x}.txt ../../chime4/$x/text || exit 1; -done - -# clean up temp files -rm *.temp -rm *.part{1,2} - -echo "Data preparation succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/simu_enhan_chime4_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/simu_enhan_chime4_data_prep.sh deleted file mode 100644 index f5d28366dd0c7dec74b8441237ae8fbe3789363f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/simu_enhan_chime4_data_prep.sh +++ /dev/null @@ -1,94 +0,0 @@ -#!/usr/bin/env bash -set -eu - -# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0. - -# This is modified from the script in standard Kaldi recipe to account -# for the way the WSJ data is structured on the Edinburgh systems. -# - Arnab Ghoshal, 29/05/12 - -# Modified from the script for CHiME2 baseline -# Shinji Watanabe 02/13/2015 - -# Config: -eval_flag=true # make it true when the evaluation data are released - -. 
tools/parse_options.sh || exit 1; - -if [ $# -ne 2 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` - echo "The argument should be a the directory that only contains enhanced speech data." - exit 1; -fi - -echo "$0 $@" # Print the command line for logging - -enhan=$1 -audio_dir=$2 - -dir=$PWD/data/chime4/local -mkdir -p $dir -local=$PWD/local -utils=$PWD/utils -odir=$PWD/data/chime4 - -if $eval_flag; then -list_set="tr05_simu_$enhan dt05_simu_$enhan et05_simu_$enhan" -else -list_set="tr05_simu_$enhan dt05_simu_$enhan" -fi - -cd $dir - -find $audio_dir/ -name '*.wav' | grep 'tr05_bus_simu\|tr05_caf_simu\|tr05_ped_simu\|tr05_str_simu' | sort -u > tr05_simu_$enhan.flist -find $audio_dir/ -name '*.wav' | grep 'dt05_bus_simu\|dt05_caf_simu\|dt05_ped_simu\|dt05_str_simu' | sort -u > dt05_simu_$enhan.flist -if $eval_flag; then -find $audio_dir/ -name '*.wav' | grep 'et05_bus_simu\|et05_caf_simu\|et05_ped_simu\|et05_str_simu' | sort -u > et05_simu_$enhan.flist -fi - -# make a scp file from file list -for x in $list_set; do - cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_SIMU/' > ${x}_wav.ids - paste -d" " ${x}_wav.ids $x.flist | sort -k 1 > ${x}_wav.scp -done - -# make a transcription from dot -# simulation training data extract dot file from original WSJ0 data -# since it is generated from these data -if [ ! -e dot_files.flist ]; then - echo "Could not find $dir/dot_files.flist files, first run local/clean_wsj0_data_prep.sh"; - exit 1; -fi -cat tr05_simu_${enhan}_wav.scp | awk -F'[_]' '{print $2}' | tr '[A-Z]' '[a-z]' \ - | $local/find_noisy_transcripts.pl dot_files.flist | cut -f 2- -d" " > tr05_simu_$enhan.txt -cat tr05_simu_${enhan}_wav.scp | cut -f 1 -d" " > tr05_simu_$enhan.ids -paste -d" " tr05_simu_$enhan.ids tr05_simu_$enhan.txt | sort -k 1 > tr05_simu_$enhan.trans1 -# dt05 and et05 simulation data are generated from the CHiME4 booth recording -# and we use CHiME4 dot files -cat dt05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_SIMU"}'> dt05_simu_$enhan.ids -cat dt05_simu.dot | sed -e 's/(.*)//' > dt05_simu_$enhan.txt -paste -d" " dt05_simu_$enhan.ids dt05_simu_$enhan.txt | sort -k 1 > dt05_simu_$enhan.trans1 -if $eval_flag; then -cat et05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_SIMU"}'> et05_simu_$enhan.ids -cat et05_simu.dot | sed -e 's/(.*)//' > et05_simu_$enhan.txt -paste -d" " et05_simu_$enhan.ids et05_simu_$enhan.txt | sort -k 1 > et05_simu_$enhan.trans1 -fi - -# Do some basic normalization steps. At this point we don't remove OOVs-- -# that will be done inside the training scripts, as we'd like to make the -# data-preparation stage independent of the specific lexicon used. -noiseword=""; -for x in $list_set;do - cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ - | sort > $x.txt || exit 1; -done - -# copying data to data/... 
-for x in $list_set; do - mkdir -p $odir/$x - cp ${x}_wav.scp $odir/$x/wav.scp || exit 1; - cp ${x}.txt $odir/$x/text || exit 1; -done - -echo "Data preparation succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/simu_noisy_chime4_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/simu_noisy_chime4_data_prep.sh deleted file mode 100644 index 52bd7c6c3aa13e596847b721194a703e42030c75..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/local/simu_noisy_chime4_data_prep.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env bash -set -eu - -# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0. - -# This is modified from the script in standard Kaldi recipe to account -# for the way the WSJ data is structured on the Edinburgh systems. -# - Arnab Ghoshal, 29/05/12 - -# Modified from the script for CHiME2 baseline -# Shinji Watanabe 02/13/2015 -# Modified to use data of six channels -# Szu-Jui Chen 09/29/2017 - -# Config: -eval_flag=true # make it true when the evaluation data are released - -. tools/parse_options.sh || exit 1; - -if [ $# -ne 1 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` - echo "The argument should be a the top-level Chime4 directory." - echo "It is assumed that there will be a 'data' subdirectory" - echo "within the top-level corpus directory." - exit 1; -fi - -echo "$0 $@" # Print the command line for logging - -audio_dir=$1/data/audio/16kHz/isolated/ -trans_dir=$1/data/transcriptions - -echo "extract all channels (CH[1-6].wav) for noisy data" - -dir=$PWD/data/chime4/local -mkdir -p $dir -local=$PWD/local -utils=$PWD/utils - -if $eval_flag; then -list_set="tr05_simu_noisy dt05_simu_noisy et05_simu_noisy" -else -list_set="tr05_simu_noisy dt05_simu_noisy" -fi - -cd $dir - -find $audio_dir -name '*CH[1-6].wav' | grep 'tr05_bus_simu\|tr05_caf_simu\|tr05_ped_simu\|tr05_str_simu' | sort -u > tr05_simu_noisy.flist -find $audio_dir -name '*CH[1-6].wav' | grep 'dt05_bus_simu\|dt05_caf_simu\|dt05_ped_simu\|dt05_str_simu' | sort -u > dt05_simu_noisy.flist -if $eval_flag; then -find $audio_dir -name '*CH[1-6].wav' | grep 'et05_bus_simu\|et05_caf_simu\|et05_ped_simu\|et05_str_simu' | sort -u > et05_simu_noisy.flist -fi - -# make a dot format from json annotation files -cp $trans_dir/dt05_simu.dot_all dt05_simu.dot -if $eval_flag; then -cp $trans_dir/et05_simu.dot_all et05_simu.dot -fi - -# make a scp file from file list -for x in $list_set; do - cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_SIMU/' > ${x}_wav.id.temp - cat ${x}_wav.id.temp | awk -F'_' '{print $3}' | awk -F'.' '{print $2}' > $x.ch - cat ${x}_wav.id.temp | awk -F'_' '{print $1}' > $x.part1 - cat ${x}_wav.id.temp | sed -e 's/^..._//' > $x.part2 - paste -d"_" $x.part1 $x.ch $x.part2 > ${x}_wav.ids - paste -d" " ${x}_wav.ids $x.flist | sort -t_ -k1,1 -k3 > ${x}_wav.scp.temp -done - -# make a transcription from dot -# simulation training data extract dot file from original WSJ0 data -# since it is generated from these data -if [ ! 
-e dot_files.flist ]; then - echo "Could not find $dir/dot_files.flist files, first run local/clean_wsj0_data_prep.sh"; - exit 1; -fi -cat tr05_simu_noisy_wav.scp.temp | awk -F'[_]' '{print $3}' | tr '[A-Z]' '[a-z]' \ - | $local/find_noisy_transcripts.pl dot_files.flist | cut -f 2- -d" " > tr05_simu_noisy.txt -cat tr05_simu_noisy_wav.scp.temp | cut -f 1 -d" " > tr05_simu_noisy.ids -paste -d" " tr05_simu_noisy.ids tr05_simu_noisy.txt | sort -t_ -k1,1 -k3 > tr05_simu_noisy.trans1 -# dt05 and et05 simulation data are generated from the CHiME4 booth recording -# and we use CHiME4 dot files -cat dt05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH1_SIMU"}'> dt05_simu_noisy.ids -cat dt05_simu.dot | sed -e 's/(.*)//' > dt05_simu_noisy.txt -paste -d" " dt05_simu_noisy.ids dt05_simu_noisy.txt | \ -awk '{print}{sub(/CH1/, "CH2",$0);print}{sub(/CH2/, "CH3",$0);print}{sub(/CH3/, "CH4",$0);print}{sub(/CH4/, "CH5",$0);print}{sub(/CH5/, "CH6",$0);print}' | \ -sort -k 1 > dt05_simu_noisy.trans1 -if $eval_flag; then -cat et05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH1_SIMU"}'> et05_simu_noisy.ids -cat et05_simu.dot | sed -e 's/(.*)//' > et05_simu_noisy.txt -paste -d" " et05_simu_noisy.ids et05_simu_noisy.txt | \ -awk '{print}{sub(/CH1/, "CH2",$0);print}{sub(/CH2/, "CH3",$0);print}{sub(/CH3/, "CH4",$0);print}{sub(/CH4/, "CH5",$0);print}{sub(/CH5/, "CH6",$0);print}' | \ -sort -k 1 > et05_simu_noisy.trans1 -fi - -# Do some basic normalization steps. At this point we don't remove OOVs-- -# that will be done inside the training scripts, as we'd like to make the -# data-preparation stage independent of the specific lexicon used. -noiseword=""; -for x in $list_set;do - cat ${x}_wav.scp.temp | awk '{print $1}' > $x.txt.part1 - cat $x.trans1 | awk '{$1=""; print $0}' | sed 's/^[ \t]*//g' > $x.txt.part2 - paste -d" " $x.txt.part1 $x.txt.part2 > $x.trans1 - cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ - | sort > $x.txt || exit 1; -done - -# copying data to data/... -for x in $list_set; do - sort ${x}_wav.scp.temp > ${x}_wav.scp - mkdir -p ../../chime4/$x - cp ${x}_wav.scp ../../chime4/$x/wav.scp || exit 1; - cp ${x}.txt ../../chime4/$x/text || exit 1; -done - -# clean up temp files -rm *.temp -rm *.part{1,2} - -echo "Data preparation succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/path.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/path.sh deleted file mode 100644 index 73fc1c56602086182f66201870e28d46a0cada55..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export WENET_DIR=$PWD/../../.. 
-export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build -export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix -export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/run.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/run.sh deleted file mode 100644 index f010265fa3348af1b04753bcf3dc77f5e71bffb8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/run.sh +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2020 Jian Wu -# License: Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -set -eu - -stage="1-4" -space="" -track="isolated_1ch_track" -wsj1_data_dir=//scratch/jwu/wsj1 -chime4_data_dir=/scratch/jwu/CHiME4 -dump_wav_dir=/scratch/jwu/chime4_wav - -data_dir=data/chime4 -dict=$data_dir/dict_char.txt -train_config=conf/train_conformer.yaml -exp_dir=exp/1a -decode_modes="ctc_prefix_beam_search attention_rescoring" -average_checkpoint=true -average_num=10 - -. ./path.sh -. ./tools/parse_options.sh || exit 1 - -beg=$(echo $stage | awk -F '-' '{print $1}') -end=$(echo $stage | awk -F '-' '{print $2}') -[ -z $end ] && end=$beg - -if [ $end -ge 1 ] && [ $beg -le 1 ]; then - echo "Stage 1: preparing data ..." - ./local/clean_wsj0_data_prep.sh $chime4_data_dir/CHiME3/data/WSJ0 - ./local/simu_noisy_chime4_data_prep.sh $chime4_data_dir - ./local/real_noisy_chime4_data_prep.sh $chime4_data_dir - ./local/simu_enhan_chime4_data_prep.sh $track $chime4_data_dir/data/audio/16kHz/$track - ./local/real_enhan_chime4_data_prep.sh $track $chime4_data_dir/data/audio/16kHz/$track - ./local/clean_wsj1_data_prep.sh $wsj1_data_dir - ./local/chime4_format_dir.sh -fi - - -if [ $end -ge 2 ] && [ $beg -le 2 ]; then - echo -e "\n<*IN*>\n<*MR.*>" > $data_dir/train/non_lang.txt - for name in dev train; do - python tools/text2token.py $data_dir/$name/text -n 1 -s 1 \ - -l $data_dir/train/non_lang.txt > $data_dir/$name/char - done - mkdir -p $(dirname $dict) && echo -e " 0\n 1" > ${dict} - cat $data_dir/train/char | cut -f 2- -d" " | tr " " "\n" | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict} - num_token=$(cat $dict | wc -l) - echo " $num_token" >> $dict - echo "Make dictionary done" -fi - - -if [ $end -ge 3 ] && [ $beg -le 3 ]; then - ./local/chime4_gen_wav.sh $data_dir/train $dump_wav_dir - tools/compute_cmvn_stats.py --num_workers 16 \ - --train_config $train_config \ - --in_scp $data_dir/train/wav.scp \ - --out_cmvn $data_dir/train/global_cmvn - echo "Prepare data, prepare required format" - for x in train dev; do - tools/make_raw_list.py $data_dir/$x/wav.scp $data_dir/$x/char \ - $data_dir/$x/data.list - done -fi - -if [ $end -ge 4 ] && [ $beg -le 4 ]; then - mkdir -p $exp_dir && cp $data_dir/train/global_cmvn $exp_dir - python wenet/bin/train.py \ - --gpu 0 \ - --config $train_config \ - --train_data $data_dir/train/data.list \ - --cv_data $data_dir/dev/data.list \ - --model_dir $exp_dir \ - --num_workers 4 \ - --symbol_table $dict \ - --cmvn $exp_dir/global_cmvn \ - --pin_memory > $exp_dir/train.log 2>&1 -fi - -suffix="isolated_1ch_track" -if [ $end -ge 5 ] && [ $beg -le 5 ]; then - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$exp_dir/avg_${average_num}.pt - echo "do model 
average and final checkpoint is $decode_checkpoint" - python wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $exp_dir \ - --num ${average_num} \ - --val_best - fi - nj=4 - ctc_weight=0.5 - for x in dt05_{simu,real} et05_{simu,real}; do - subdir=${x}_${suffix} - tools/make_raw_list.py $data_dir/$subdir/wav.scp $data_dir/$subdir/text \ - $data_dir/$subdir/data.list - done - for mode in ${decode_modes}; do - for x in dt05_{simu,real} et05_{simu,real}; do - subdir=${x}_${suffix} - dec_dir=$exp_dir/${subdir}_${mode} && mkdir -p $dec_dir - python wenet/bin/recognize.py \ - --gpu 0 \ - --mode $mode \ - --config $exp_dir/train.yaml \ - --test_data $data_dir/$subdir/data.list \ - --checkpoint $exp_dir/avg_${average_num}.pt \ - --beam_size 8 \ - --batch_size 1 \ - --dict $dict \ - --ctc_weight $ctc_weight \ - --result_file $dec_dir/text & - done - wait - done - for mode in ${decode_modes}; do - for x in dt05_{simu,real} et05_{simu,real}; do - subdir=${x}_${suffix} - dec_dir=$exp_dir/${subdir}_${mode} - sed 's:: :g' $dec_dir/text > $dec_dir/text.norm - python tools/compute-wer.py --char=1 --v=1 \ - $data_dir/$subdir/text $dec_dir/text.norm > $dec_dir/wer - done - done -fi - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/alignment.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/alignment.sh deleted file mode 100644 index 64d860bb61761cadca750c9baf91eddb49e56728..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/alignment.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -stage=0 # start from 0 if you need to start from data preparation -stop_stage=0 - -nj=16 -feat_dir=raw_wav -dict=data/dict/lang_char.txt - -dir=exp/ -config=$dir/train.yaml -checkpoint= -checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt -config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml -set= -ali_format=$feat_dir/$set/format.data -ali_format=format.data -ali_result=$dir/ali - -. tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - nj=32 - # Prepare required data for ctc alignment - echo "Prepare data, prepare required format" - for x in $set; do - tools/format_data.sh --nj ${nj} \ - --feat-type wav --feat $feat_dir/$x/wav.scp \ - $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp - - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Test model, please specify the model you want to use by --checkpoint - python wenet/bin/alignment_deprecated.py --gpu -1 \ - --config $config \ - --input_file $ali_format \ - --checkpoint $checkpoint \ - --batch_size 1 \ - --dict $dict \ - --result_file $ali_result \ - -fi - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/analyze_dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/analyze_dataset.py deleted file mode 100644 index d4373b065c301972fe0164b6df3591166000acfc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/analyze_dataset.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Horizon Inc. 
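Stage 2 of run.sh above seeds a character dictionary before appending every character seen in the training text. In standard WeNet recipes the seeded entries are the special tokens `<blank>` (id 0) and `<unk>` (id 1), with `<sos/eos>` appended last; treating that as an assumption, a rough Python sketch of the same construction:

```python
def build_char_dict(char_lines):
    """WeNet-style symbol table: <blank>, <unk>, sorted corpus chars, <sos/eos>.

    Ids follow the shell pipeline above (`awk '{print $0 " " NR+1}'`), so the
    first corpus character gets id 2. The special tokens are assumed, not
    copied from the recipe text.
    """
    symbols = ["<blank>", "<unk>"]
    symbols += sorted({c for line in char_lines for c in line.split()[1:]})
    symbols.append("<sos/eos>")
    return {sym: idx for idx, sym in enumerate(symbols)}


if __name__ == "__main__":
    demo = ["utt1 你 好", "utt2 好 的"]  # illustrative lines from train/char
    for sym, idx in build_char_dict(demo).items():
        print(sym, idx)
```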
(authors: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Analyze Dataset, Duration/TextLength/Speed etc. - -Usage: -. ./path.sh && python3 tools/analyze_dataset.py \ - --data_type "shard" \ - --data_list data/test/data.list \ - --output_dir exp/analyze_test \ - --num_thread 32 -""" - -import os -import json -import math -import time -import numpy -import logging -import librosa -import tarfile -import argparse -import torchaudio -import multiprocessing - -from wenet.utils.file_utils import read_lists -from wenet.dataset.processor import AUDIO_FORMAT_SETS - - -def get_args(): - parser = argparse.ArgumentParser(description='Analyze dataset') - parser.add_argument('--data_type', - default='wav_scp', - choices=['wav_scp', 'raw', 'shard'], - help='dataset type') - parser.add_argument('--output_dir', type=str, - default="exp", help='write info to output dir') - parser.add_argument('--data_list', default=None, - help='used in raw/shard mode') - parser.add_argument('--wav_scp', default=None, - help='used in wav_scp mode') - parser.add_argument('--text', default=None, - help='used in wav_scp mode') - parser.add_argument('--num_thread', type=int, - default=4, help='number of threads') - args = parser.parse_args() - print(args) - return args - - -def analyze(datas, output_file, thread_id): - with open(output_file, "w", encoding='utf8') as f: - for i, data in enumerate(datas): - if type(data['wav']) is numpy.ndarray: - y, sample_rate = data['wav'], data['sample_rate'] - data['wav'] = "None" # NOTE(xcsong): Do not save wav. 
- elif type(data['wav'] is str): - y, sample_rate = librosa.load(data['wav'], sr=16000) - data['dur'] = len(y) / sample_rate - data['txt_length'] = len(data['txt']) - data['speed'] = data['txt_length'] / data['dur'] - # Trim the beginning and ending silence - _, index = librosa.effects.trim(y, top_db=30) - data['leading_sil'] = librosa.get_duration( - y=y[:index[0]], sr=16000) * 1000 if index[0] > 0 else 0 - data['trailing_sil'] = librosa.get_duration( - y=y[index[1]:], sr=16000) * 1000 if index[1] < len(y) else 0 - data_str = json.dumps(data, ensure_ascii=False) - f.write("{}\n".format(data_str)) - if thread_id == 0 and i % 100 == 0: - logging.info("\tThread-{}: processed {}/{}".format( - thread_id, i, len(datas))) - - -def read_tar(file): - try: - with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream: - prev_prefix = None - data = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - data['key'] = prev_prefix - if valid: - yield data - data = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - data['txt'] = file_obj.read().decode( - 'utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load( - file_obj) - # single channel - data['wav'] = waveform.numpy()[0, :] - data['sample_rate'] = sample_rate - else: - data[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning( - 'error: {} when parse {}'.format(ex, name)) - prev_prefix = prefix - # The last data in tar - if prev_prefix is not None: - data['key'] = prev_prefix - yield data - except Exception as ex: - logging.warning( - 'tar_file error: {} when processing {}'.format(ex, file)) - - -def main(): - start_time = time.time() - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.makedirs(args.output_dir, exist_ok=True) - os.makedirs(args.output_dir + "/partition", exist_ok=True) - datas = [[] for i in range(args.num_thread)] - - logging.info("Stage-1: Loading data.list OR wav.scp...") - if args.data_type == "shard": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - total = 0 - for line in lists: - for data in read_tar(line): - datas[total % args.num_thread].append(data) - total = total + 1 - elif args.data_type == "raw": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - for i, line in enumerate(lists): - data = json.loads(line) - datas[i % args.num_thread].append(data) - elif args.data_type == "wav_scp": - assert args.wav_scp is not None - assert args.text is not None - wavs, texts = {}, {} - # wavs - for line in read_lists(args.wav_scp): - line = line.strip().split() - wavs[line[0]] = line[1] - # texts - for line in read_lists(args.text): - line = line.strip().split(maxsplit=1) - texts[line[0]] = line[1] - sorted(wavs) - sorted(texts) - # partition - for i, (key1, key2) in enumerate(zip(wavs, texts)): - assert key1 == key2 - datas[i % args.num_thread].append( - {'key': key1, "wav": wavs[key1], "txt": texts[key1]} - ) - - logging.info("Stage-2: Start Analyze") - # threads - pool = multiprocessing.Pool(processes=args.num_thread) - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - pool.apply_async(analyze, (datas[i], output_file, i)) - pool.close() - 
pool.join() - - logging.info("Stage-3: Sort and Write Result") - datas = [] - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - with open(output_file, "r", encoding='utf8') as f: - for line in f.readlines(): - data = json.loads(line) - datas.append(data) - total_dur = sum([x['dur'] for x in datas]) - total_len = sum([x['txt_length'] for x in datas]) - total_leading_sil = sum([x['leading_sil'] for x in datas]) - total_trailing_sil = sum([x['trailing_sil'] for x in datas]) - num_datas = len(datas) - names = ['key', 'dur', 'txt_length', 'speed', - 'leading_sil', 'trailing_sil'] - units = ['', 's', '', 'char/s', 'ms', 'ms'] - avgs = [0, total_dur / num_datas, total_len / num_datas, - total_len / total_dur, total_leading_sil / num_datas, - total_trailing_sil / num_datas] - stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]), - sum([(x['txt_length'] - avgs[2])**2 for x in datas]), - sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]), - sum([(x['leading_sil'] - avgs[4])**2 for x in datas]), - sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])] - stds = [math.sqrt(x / num_datas) for x in stds] - parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min'] - index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - - with open(args.output_dir + "/analyze_result_brief", - "w", encoding='utf8') as f: - for i, (name, unit, avg, std) in enumerate( - zip(names, units, avgs, stds)): - if name == 'key': - continue - f.write("==================\n") - - datas.sort(key=lambda x: x[name]) - for p, j in zip(parts, index): - f.write("{} {}: {:.3f} {} (wav_id: {})\n".format( - p, name, datas[j][name], unit, datas[j]['key'])) - f.write("avg {}: {:.3f} {}\n".format( - name, avg, unit)) - f.write("std {}: {:.3f}\n".format( - name, std)) - os.system("cat {}".format(args.output_dir + "/analyze_result_brief")) - - datas.sort(key=lambda x: x['dur']) - with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f: - for data in datas: - f.write("{}\n".format(json.dumps(data, ensure_ascii=False))) - - end_time = time.time() - logging.info("Time Cost: {:.3f}s".format(end_time - start_time)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/cmvn_kaldi2json.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/cmvn_kaldi2json.py deleted file mode 100644 index 9966046c95a9d50438c4857b785cb7985182e376..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/cmvn_kaldi2json.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import logging -import sys -import json - -def kaldi2json(kaldi_cmvn_file): - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - cmvn_info = {'mean_stat:' 
: means, - 'var_stat' : variance, - 'frame_num' : count} - return cmvn_info - -if __name__ == '__main__': - with open(sys.argv[2], 'w') as fout: - cmvn = kaldi2json(sys.argv[1]) - fout.write(json.dumps(cmvn)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/combine_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/combine_data.sh deleted file mode 100644 index 8a56c43f1a2a238d78270f94f3d22f1af540e912..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/combine_data.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2014 David Snyder - -# This script combines the data from multiple source directories into -# a single destination directory. - -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information -# about what these directories contain. - -# Begin configuration section. -extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." -skip_fix=false # skip the fix_data_dir.sh in the end -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi - -if [ $# -lt 2 ]; then - echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." - echo "Note, files that don't appear in all source dirs will not be combined," - echo "with the exception of utt2uniq and segments, which are created where necessary." - exit 1 -fi - -dest=$1; -shift; - -first_src=$1; - -rm -r $dest 2>/dev/null -mkdir -p $dest; - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/utt2spk ]; then - echo "$0: no such file $dir/utt2spk" - exit 1; - fi -done - -# Check that frame_shift are compatible, where present together with features. -dir_with_frame_shift= -for dir in $*; do - if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then - if [[ $dir_with_frame_shift ]] && - ! cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then - echo "$0:error: different frame_shift in directories $dir and " \ - "$dir_with_frame_shift. Cannot combine features." - exit 1; - fi - dir_with_frame_shift=$dir - fi -done - -# W.r.t. utt2uniq file the script has different behavior compared to other files -# it is not compulsary for it to exist in src directories, but if it exists in -# even one it should exist in all. We will create the files where necessary -has_utt2uniq=false -for in_dir in $*; do - if [ -f $in_dir/utt2uniq ]; then - has_utt2uniq=true - break - fi -done - -if $has_utt2uniq; then - # we are going to create an utt2uniq file in the destdir - for in_dir in $*; do - if [ ! -f $in_dir/utt2uniq ]; then - # we assume that utt2uniq is a one to one mapping - cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' - else - cat $in_dir/utt2uniq - fi - done | sort -k1 > $dest/utt2uniq - echo "$0: combined utt2uniq" -else - echo "$0 [info]: not combining utt2uniq as it does not exist" -fi -# some of the old scripts might provide utt2uniq as an extrafile, so just remove it -extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") - -# segments are treated similarly to utt2uniq. If it exists in some, but not all -# src directories, then we generate segments where necessary. 
-has_segments=false -for in_dir in $*; do - if [ -f $in_dir/segments ]; then - has_segments=true - break - fi -done - -if $has_segments; then - for in_dir in $*; do - if [ ! -f $in_dir/segments ]; then - echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 - utils/data/get_segments_for_data.sh $in_dir - else - cat $in_dir/segments - fi - done | sort -k1 > $dest/segments - echo "$0: combined segments" -else - echo "$0 [info]: not combining segments as it does not exist" -fi - -for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do - exists_somewhere=false - absent_somewhere=false - for d in $*; do - if [ -f $d/$file ]; then - exists_somewhere=true - else - absent_somewhere=true - fi - done - - if ! $absent_somewhere; then - set -o pipefail - ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; - set +o pipefail - echo "$0: combined $file" - else - if ! $exists_somewhere; then - echo "$0 [info]: not combining $file as it does not exist" - else - echo "$0 [info]: **not combining $file as it does not exist everywhere**" - fi - fi -done - -tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt - -if [[ $dir_with_frame_shift ]]; then - cp $dir_with_frame_shift/frame_shift $dest -fi - -if ! $skip_fix ; then - tools/fix_data_dir.sh $dest || exit 1; -fi - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/compute-cer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/compute-cer.py deleted file mode 100644 index a0a8f8fe1f59251c5d8fefeb62ef469276fc6063..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/compute-cer.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import sys -import unicodedata -import codecs - -remove_tag = True -spacelist = [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - # https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': - sep = '>' - j = i + 1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c == sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: - return '' - chars = [] - i = 0 - T = len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - if x.isalnum(): - for k in x: - new_sentence.append(k) - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i - 1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j - 1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, - 'ins': 0, 'del': 0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = 
result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i={i} , j={j} , \ - error={error}'. - format(i=i, j=j, error=self.space[i][j]['error'])) - return result - - def overall(self) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def cluster(self, data) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [unicodedata.name(char) for char in word] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names) - 1) : - if unicode_names[i] != unicode_names[i + 1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) \ - and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] \ - [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \ - [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose = 1 - padding_symbol = ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose = 0 - try: - verbose = int(b) - except Exception: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol = ' ' - elif b == 'underline': - padding_symbol = '_' - continue - if True or sys.argv[1].startswith('-'): - # ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig = set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, - case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : 
- if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array) == 0: - continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length - len_lab) - space['rec'].append(length - len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end=' ') - else: - print('lab:', end=' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['lab'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end=' ') - else: - print('rec:', end=' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['rec'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===================================================' - '========================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster(k for k in default_clusters[cluster_id]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 
100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif (token[0] == '<' and token[len(token) - 1] == '>' and - cluster_id == ''): - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('=======================================' - '====================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/compute-wer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/compute-wer.py deleted file mode 100644 index a3eefc0dc7b67f252e685da71a5189312e74ef85..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/compute-wer.py +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import re, sys, unicodedata -import codecs - -remove_tag = True -spacelist= [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - #https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - 
j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array)==0: continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) 
- else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===========================================================================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster([ k for k in default_clusters[cluster_id] ]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - 
print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif token[0] == '<' and token[len(token)-1] == '>' and \ - cluster_id == '' : - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('===========================================================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/compute_cmvn_stats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/compute_cmvn_stats.py deleted file mode 100644 index 9c89789c47be0c855939469e86040f10398e9d89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/compute_cmvn_stats.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys -import argparse -import json -import codecs -import yaml - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.utils.data import Dataset, DataLoader - -torchaudio.set_audio_backend("sox_io") - - -class CollateFunc(object): - ''' Collate function for AudioDataset - ''' - - def __init__(self, feat_dim, resample_rate): - self.feat_dim = feat_dim - self.resample_rate = resample_rate - pass - - def __call__(self, batch): - mean_stat = torch.zeros(self.feat_dim) - var_stat = torch.zeros(self.feat_dim) - number = 0 - for item in batch: - value = item[1].strip().split(",") - assert len(value) == 3 or len(value) == 1 - wav_path = value[0] - sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate - resample_rate = sample_rate - # len(value) == 3 means segmented wav.scp, - # len(value) == 1 means original wav.scp - if len(value) == 3: - start_frame = int(float(value[1]) * sample_rate) - end_frame = int(float(value[2]) * sample_rate) - waveform, sample_rate = torchaudio.backend.sox_io_backend.load( - filepath=wav_path, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(item[1]) - - waveform = waveform * (1 << 15) - if self.resample_rate != 0 and self.resample_rate != sample_rate: - resample_rate = self.resample_rate - waveform = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - - mat = kaldi.fbank(waveform, - num_mel_bins=self.feat_dim, - dither=0.0, - energy_floor=0.0, - sample_frequency=resample_rate) - mean_stat += torch.sum(mat, axis=0) - var_stat += torch.sum(torch.square(mat), axis=0) - number += mat.shape[0] - return number, mean_stat, var_stat - - -class AudioDataset(Dataset): - def __init__(self, data_file): - self.items = [] - with codecs.open(data_file, 'r', encoding='utf-8') as f: - for line in f: - arr = line.strip().split() - self.items.append((arr[0], arr[1])) - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - return self.items[idx] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='extract CMVN stats') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for processing') - parser.add_argument('--train_config', - default='', - help='training yaml conf') - parser.add_argument('--in_scp', default=None, help='wav scp file') - parser.add_argument('--out_cmvn', - 
default='global_cmvn', - help='global cmvn file') - - doc = "Print log after every log_interval audios are processed." - parser.add_argument("--log_interval", type=int, default=1000, help=doc) - args = parser.parse_args() - - with open(args.train_config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - resample_rate = 0 - if 'resample_conf' in configs['dataset_conf']: - resample_rate = configs['dataset_conf']['resample_conf']['resample_rate'] - print('using resample and new sample rate is {}'.format(resample_rate)) - - collate_func = CollateFunc(feat_dim, resample_rate) - dataset = AudioDataset(args.in_scp) - batch_size = 20 - data_loader = DataLoader(dataset, - batch_size=batch_size, - shuffle=True, - sampler=None, - num_workers=args.num_workers, - collate_fn=collate_func) - - with torch.no_grad(): - all_number = 0 - all_mean_stat = torch.zeros(feat_dim) - all_var_stat = torch.zeros(feat_dim) - wav_number = 0 - for i, batch in enumerate(data_loader): - number, mean_stat, var_stat = batch - all_mean_stat += mean_stat - all_var_stat += var_stat - all_number += number - wav_number += batch_size - - if wav_number % args.log_interval == 0: - print(f'processed {wav_number} wavs, {all_number} frames', - file=sys.stderr, - flush=True) - - cmvn_info = { - 'mean_stat': list(all_mean_stat.tolist()), - 'var_stat': list(all_var_stat.tolist()), - 'frame_num': all_number - } - - with open(args.out_cmvn, 'w') as fout: - fout.write(json.dumps(cmvn_info)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/compute_fbank_feats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/compute_fbank_feats.py deleted file mode 100644 index 4cc7dae54de6e8b24b14148bd3930d19b4d7b28c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/compute_fbank_feats.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging - -import torchaudio -import torchaudio.compliance.kaldi as kaldi - -import wenet.dataset.kaldi_io as kaldi_io - -# The "sox" backends are deprecated and will be removed in 0.9.0 release. 
-# So here we use sox_io backend -torchaudio.set_audio_backend("sox_io") - - -def parse_opts(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--num_mel_bins', - default=80, - type=int, - help='Number of triangular mel-frequency bins') - parser.add_argument('--frame_length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument('--frame_shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument('--dither', - type=int, - default=0.0, - help='Dithering constant (0.0 means no dither)') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_scp', help='wav scp file') - parser.add_argument('out_ark', help='output ark file') - parser.add_argument('out_scp', help='output scp file') - args = parser.parse_args() - return args - - -# wav format: -def load_wav_scp(wav_scp_file): - wav_list = [] - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_list.append((arr[0], arr[1])) - return wav_list - - -# wav format: -def load_wav_scp_dict(wav_scp_file): - wav_dict = {} - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_dict[arr[0]] = arr[1] - return wav_dict - - -# Segments format: -def load_wav_segments(wav_scp_file, segments_file): - wav_dict = load_wav_scp_dict(wav_scp_file) - audio_list = [] - with open(segments_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - key = arr[0] - wav_file = wav_dict[arr[1]] - start = float(arr[2]) - end = float(arr[3]) - audio_list.append((key, wav_file, start, end)) - return audio_list - - -if __name__ == '__main__': - args = parse_opts() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - if args.segments is None: - audio_list = load_wav_scp(args.wav_scp) - else: - audio_list = load_wav_segments(args.wav_scp, args.segments) - - count = 0 - with open(args.out_ark, 'wb') as ark_fout, \ - open(args.out_scp, 'w', encoding='utf8') as scp_fout: - for item in audio_list: - if len(item) == 2: - key, wav_path = item - waveform, sample_rate = torchaudio.load_wav(wav_path) - else: - assert len(item) == 4 - key, wav_path, start, end = item - sample_rate = torchaudio.info(wav_path).sample_rate - frame_offset = int(start * sample_rate) - num_frames = int((end - start) * sample_rate) - waveform, sample_rate = torchaudio.load_wav( - wav_path, frame_offset, num_frames) - - mat = kaldi.fbank(waveform, - num_mel_bins=args.num_mel_bins, - frame_length=args.frame_length, - frame_shift=args.frame_shift, - dither=args.dither, - energy_floor=0.0, - sample_frequency=sample_rate) - mat = mat.detach().numpy() - kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout) - count += 1 - if count % 10000 == 0: - logging.info('Progress {}/{}'.format(count, len(audio_list))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/copy_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/copy_data_dir.sh deleted file mode 100644 index ee880c4c3ca398a58a4e306467c639b0a76310bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/copy_data_dir.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins University (author: Daniel 
Povey) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# feats.scp -# wav.scp -# vad.scp -# spk2utt -# utt2spk -# text -# -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance and/or speaker names. Note, the recording-ids stay the same. -# - - -# begin configuration section -spk_prefix= -utt_prefix= -spk_suffix= -utt_suffix= -validate_opts= # should rarely be needed. -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" - echo "Options" - echo " --spk-prefix= # Prefix for speaker ids, default empty" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --spk-suffix= # Suffix for speaker ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -srcdir=$1 -destdir=$2 - -if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" - exit 1; -fi - -if [ "$destdir" == "$srcdir" ]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -set -e; - -mkdir -p $destdir - -cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map -cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map - -if [ ! -f $srcdir/utt2uniq ]; then - if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then - cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq - fi -else - cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq -fi - -cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ - utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk - -utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt - -if [ -f $srcdir/feats.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp -fi - -if [ -f $srcdir/vad.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp -fi - -if [ -f $srcdir/segments ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments - cp $srcdir/wav.scp $destdir -else # no segments->wav indexed by utt. 
- if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/utt2num_frames ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in frame_shift stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -echo $validate_opts -echo $destdir -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/decode.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/decode.sh deleted file mode 100644 index 1d49b0e48631f4818fb9c464df66904170275a33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/decode.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: binbinzhang@mobvoi.com (Binbin Zhang) -export GLOG_logtostderr=1 -export GLOG_v=2 - -set -e - -nj=1 -chunk_size=-1 -ctc_weight=0.0 -reverse_weight=0.0 -rescoring_weight=1.0 -# For CTC WFST based decoding -fst_path= -dict_path= -acoustic_scale=1.0 -beam=15.0 -lattice_beam=12.0 -min_active=200 -max_active=7000 -blank_skip_thresh=1.0 -length_penalty=0.0 - -. tools/parse_options.sh || exit 1; -if [ $# != 5 ]; then - echo "Usage: $0 [options] " - exit 1; -fi - -if ! which decoder_main > /dev/null; then - echo "decoder_main is not built, please go to runtime/libtorch to build it." - exit 1; -fi - -scp=$1 -label_file=$2 -model_file=$3 -unit_file=$4 -dir=$5 - -mkdir -p $dir/split${nj} - -# Step 1. Split wav.scp -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp" -done -tools/data/split_scp.pl ${scp} ${split_scps} - -# Step 2. Parallel decoding -wfst_decode_opts= -if [ ! 
-z $fst_path ]; then - wfst_decode_opts="--fst_path $fst_path" - wfst_decode_opts="$wfst_decode_opts --beam $beam" - wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path" - wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam" - wfst_decode_opts="$wfst_decode_opts --max_active $max_active" - wfst_decode_opts="$wfst_decode_opts --min_active $min_active" - wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale" - wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh" - wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty" - echo $wfst_decode_opts > $dir/config -fi -for n in $(seq ${nj}); do -{ - decoder_main \ - --rescoring_weight $rescoring_weight \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --chunk_size $chunk_size \ - --wav_scp ${dir}/split${nj}/wav.${n}.scp \ - --model_path $model_file \ - --unit_path $unit_file \ - $wfst_decode_opts \ - --result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log -} & -done -wait - -# Step 3. Merge files -for n in $(seq ${nj}); do - cat ${dir}/split${nj}/${n}.text -done > ${dir}/text -tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf - -# Step 4. Compute WER -python3 tools/compute-wer.py --char=1 --v=1 \ - $label_file $dir/text > $dir/wer diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/feat_to_shape.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/feat_to_shape.sh deleted file mode 100644 index ab6d45c60709dd05a38f8da269d617233d0d39f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/feat_to_shape.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Begin configuration section. -nj=4 -cmd=run.pl -verbose=0 -filetype="" -preprocess_conf="" -# End configuration section. - -help_message=$(cat << EOF -Usage: $0 [options] [] -e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) - -echo "$0 $*" 1>&2 # Print the command line for logging - -. parse_options.sh || exit 1; - -if [ $# -lt 2 ] || [ $# -gt 3 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -scp=$1 -outscp=$2 -data=$(dirname ${scp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${logdir}/feats.${n}.scp" -done - -utils/split_scp.pl ${scp} ${split_scps} - -if [ -n "${preprocess_conf}" ]; then - preprocess_opt="--preprocess-conf ${preprocess_conf}" -else - preprocess_opt="" -fi -if [ -n "${filetype}" ]; then - filetype_opt="--filetype ${filetype}" -else - filetype_opt="" -fi - -${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ - feat-to-len --verbose=${verbose} \ - scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp - -feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -) - -# concatenate the .scp files together. 
-for n in $(seq ${nj}); do - sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp -done > ${outscp} - -rm -f ${logdir}/feats.*.scp 2>/dev/null diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/filter_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/filter_scp.pl deleted file mode 100644 index b76d37f41be0886470281978bfacf97f6b8ae976..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/filter_scp.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose "n-th" field is an utterance id), printing -# out only those lines whose "n-th" field is in id_list. The index of -# the "n-th" field is 1, by default, but can be changed by using -# the -f switch - -$exclude = 0; -$field = 1; -$shifted = 0; - -do { - $shifted=0; - if ($ARGV[0] eq "--exclude") { - $exclude = 1; - shift @ARGV; - $shifted=1; - } - if ($ARGV[0] eq "-f") { - $field = $ARGV[1]; - shift @ARGV; shift @ARGV; - $shifted=1 - } -} while ($shifted); - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . - "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . - "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . - "only the lines that were *not* in id_list.\n" . - "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . - "If your older scripts (written before Oct 2014) stopped working and you used the\n" . - "-f option, add 1 to the argument.\n" . - "See also: utils/filter_scp.pl .\n"; -} - - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -if ($field == 1) { # Treat this as special case, since it is common. - while(<>) { - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { - print $_; - } - } -} else { - while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { - print $_; - } - } -} - -# tests: -# the following should print "foo 1" -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) -# the following should print "bar 2". 
-# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fix_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fix_data_dir.sh deleted file mode 100644 index d1644c1cac4264c78eae7d91b03c4126baf7ec4c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fix_data_dir.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# This script makes sure that only the segments present in -# all of "feats.scp", "wav.scp" [if present], segments [if present] -# text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into -# data-dir/.backup - -cmd="$@" - -utt_extra_files= -spk_extra_files= - -. tools/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: utils/data/fix_data_dir.sh " - echo "e.g.: utils/data/fix_data_dir.sh data/train" - echo "This script helps ensure that the various files in a data directory" - echo "are correctly sorted and filtered, for example removing utterances" - echo "that have no features (if feats.scp is present)" - exit 1 -fi - -data=$1 - -if [ -f $data/images.scp ]; then - image/fix_data_dir.sh $cmd - exit $? -fi - -mkdir -p $data/.backup - -[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; - -[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; - -set -e -o pipefail -u - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted { - file=$1 - sort -k1,1 -u <$file >$file.tmp - if ! cmp -s $file $file.tmp; then - echo "$0: file $1 is not in sorted order or not unique, sorting it" - mv $file.tmp $file - else - rm $file.tmp - fi -} - -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - check_sorted $data/$x - fi -done - - -function filter_file { - filter=$1 - file_to_filter=$2 - cp $file_to_filter ${file_to_filter}.tmp - tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter - if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=$(cat ${file_to_filter}.tmp | wc -l) - length2=$(cat ${file_to_filter} | wc -l) - if [ $length1 -ne $length2 ]; then - echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." - fi - fi - rm $file_to_filter.tmp -} - -function filter_recordings { - # We call this once before the stage when we filter on utterance-id, and once - # after. - - if [ -f $data/segments ]; then - # We have a segments file -> we need to filter this and the file wav.scp, and - # reco2file_and_utt, if it exists, to make sure they have the same list of - # recording-ids. - - if [ ! -f $data/wav.scp ]; then - echo "$0: $data/segments exists but not $data/wav.scp" - exit 1; - fi - awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=$(cat $tmpdir/recordings | wc -l) - [ ! -s $tmpdir/recordings ] && \ - echo "Empty list of recordings (bad file $data/segments)?" 
&& exit 1; - tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp - mv $tmpdir/recordings.tmp $tmpdir/recordings - - - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - filter_file $tmpdir/recordings $data/segments - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - rm $data/segments.tmp - - filter_file $tmpdir/recordings $data/wav.scp - [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur - true - fi -} - -function filter_speakers { - # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... - tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do - f=$data/$s - if [ -f $f ]; then - filter_file $f $tmpdir/speakers - fi - done - - filter_file $tmpdir/speakers $data/spk2utt - tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - - for s in cmvn.scp spk2gender $spk_extra_files; do - f=$data/$s - if [ -f $f ]; then - filter_file $tmpdir/speakers $f - fi - done -} - -function filter_utts { - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - echo "$(cat $tmpdir/utts | wc -l)" - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; - - ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; - - ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ - echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - - if [ -f $data/utt2uniq ]; then - ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ - echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; - fi - - maybe_wav= - maybe_reco2dur= - [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. - [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - - maybe_utt2dur= - if [ -f $data/utt2dur ]; then - cat $data/utt2dur | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1 - maybe_utt2dur=utt2dur.ok - fi - - maybe_utt2num_frames= - if [ -f $data/utt2num_frames ]; then - cat $data/utt2num_frames | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1 - maybe_utt2num_frames=utt2num_frames.ok - fi - - for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do - if [ -f $data/$x ]; then - tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp - echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)" - mv $tmpdir/utts.tmp $tmpdir/utts - # echo "$tmpdir/utts" - fi - done - rm $data/utt2dur.ok 2>/dev/null || true - rm $data/utt2num_frames.ok 2>/dev/null || true - - [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ - rm $tmpdir/utts && exit 1; - - - if [ -f $data/utt2spk ]; then - new_nutts=$(cat $tmpdir/utts | wc -l) - old_nutts=$(cat $data/utt2spk | wc -l) - if [ $new_nutts -ne $old_nutts ]; then - echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" - else - echo "fix_data_dir.sh: kept all $old_nutts utterances." 
- fi - fi - - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then - tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x - fi - fi - done - -} - -filter_recordings -filter_speakers -filter_utts -filter_speakers -filter_recordings - -tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - -echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/flake8_hook.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/flake8_hook.py deleted file mode 100644 index bbe21bf4aa8ab460aca0eba5a24785e4d6b2c39d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -import sys - -from flake8.main import git - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/format_data.sh deleted file mode 100644 index 51f4602dfa0bac7873541c7f621ef4bb9eb29c94..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/format_data.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Mobvoi Corporation (Author: Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -echo "$0 $*" >&2 # Print the command line for logging -. ./path.sh - -nj=1 -cmd=run.pl -nlsyms="" -lang="" -feat="" -feat_type="kaldi" -oov="" -bpecode="" -allow_one_column=false -raw="" -verbose=0 -trans_type=char -filetype="" -preprocess_conf="" -category="" -out="" # If omitted, write in stdout -help_message=$(cat << EOF -Usage: $0 -e.g. $0 data/train data/lang_1char/train_units.txt -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --feat # feat.scp or feat1.scp,feat2.scp,... - --feat-type # kaldi or wav - --oov # Default: - --out # If omitted, write in stdout - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) -. tools/parse_options.sh - -if [ $# != 2 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -dir=$1 -dic=$2 -tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) -#trap 'rm -rf ${tmpdir}' EXIT - -# 1. 
Create scp files for inputs -# These are not necessary for decoding mode, and make it as an option -input= -if [ -n "${feat}" ]; then - _feat_scps=$(echo "${feat}" | tr ',' ' ' ) - read -r -a feat_scps <<< $_feat_scps - num_feats=${#feat_scps[@]} - - for (( i=1; i<=num_feats; i++ )); do - feat=${feat_scps[$((i-1))]} - mkdir -p ${tmpdir}/input_${i} - input+="input_${i} " - cat ${feat} > ${tmpdir}/input_${i}/feat.scp - - # Dump in the "legacy" style JSON format - if [ -n "${filetype}" ]; then - awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ - > ${tmpdir}/input_${i}/filetype.scp - fi - - if [ ${feat_type} == "kaldi" ]; then - tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ - --filetype "${filetype}" \ - --preprocess-conf "${preprocess_conf}" \ - --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp - elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then - if [ -f $dir/segments ]; then - # used for segmented wav.scp - awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur - fi - if [ ! -f $dir/utt2dur ]; then - tools/wav_to_duration.sh --nj ${nj} \ - ${feat} ${tmpdir}/input_${i}/shape.scp - # use the existed utt2dur as shape.scp directly - else - cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp - fi - fi - done -fi - -# 2. Create scp files for outputs -mkdir -p ${tmpdir}/output -if [ -n "${bpecode}" ]; then - if [ "${trans_type}" == "cn_char_en_bpe" ]; then - tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp - else - paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \ - | tools/spm_encode --model=${bpecode} --output_format=piece) \ - > ${tmpdir}/output/token.scp - fi -elif [ -n "${nlsyms}" ]; then - tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -elif [ -n "${raw}" ]; then - cat $dir/text > ${tmpdir}/output/token.scp -else - tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -fi -< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp -odim=$(cat ${dic} | wc -l) -< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp - -cat ${dir}/text > ${tmpdir}/output/text.scp - -# 3. Create scp files for the others -mkdir -p ${tmpdir}/other -if [ -n "${lang}" ]; then - awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp -fi - -if [ -n "${category}" ]; then - awk -v category=${category} '{print $1 " " category}' ${dir}/text \ - > ${tmpdir}/other/category.scp -fi -#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp - -# 4. 
Merge scp files into a one file -opts="" -for intype in ${input} output other; do - if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then - continue - fi - - if [ ${intype} != other ]; then - opts+="--${intype%_*}-scps " - else - opts+="--scps " - fi - - for x in "${tmpdir}/${intype}"/*.scp; do - k=$(basename ${x} .scp) - if [ ${k} = shape ]; then - opts+="shape:${x}:shape " - else - opts+="${k}:${x} " - fi - done -done - -if ${allow_one_column}; then - opts+="--allow-one-column true " -else - opts+="--allow-one-column false " -fi - -if [ -n "${out}" ]; then - opts+="-O ${out}" -fi - -tools/merge_scp2txt.py --verbose ${verbose} ${opts} - -#rm -fr ${tmpdir} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/add_lex_disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/add_lex_disambig.pl deleted file mode 100644 index dd8a25de6e1140a6d19b1e876f2e76f528532edf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/add_lex_disambig.pl +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2013-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. -# With the --pron-probs option, expects the second field -# of each lexicon line to be a pron-prob. -# With the --sil-probs option, expects three additional -# fields after the pron-prob, representing various components -# of the silence probability model. - -$pron_probs = 0; -$sil_probs = 0; -$first_allowed_disambig = 1; - -for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { - if ($ARGV[0] eq "--pron-probs") { - $pron_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--sil-probs") { - $sil_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--first-allowed-disambig") { - $first_allowed_disambig = 0 + $ARGV[1]; - if ($first_allowed_disambig < 1) { - die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; - } - shift @ARGV; - shift @ARGV; - } -} - -if (@ARGV != 2) { - die "Usage: add_lex_disambig.pl [opts] \n" . - "This script adds disambiguation symbols to a lexicon in order to\n" . - "make decoding graphs determinizable; it adds pseudo-phone\n" . - "disambiguation symbols #1, #2 and so on at the ends of phones\n" . - "to ensure that all pronunciations are different, and that none\n" . - "is a prefix of another.\n" . - "It prints to the standard output the number of the largest-numbered" . - "disambiguation symbol that was used.\n" . - "\n" . - "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . 
- " --sil-probs [should be with --pron-probs option]\n" . - " Expect 3 extra fields after the pron-probs, for aspects of\n" . - " the silence probability model\n" . - " --first-allowed-disambig The number of the first disambiguation symbol\n" . - " that this script is allowed to add. By default this is\n" . - " #1, but you can set this to a larger value using this option.\n" . - "e.g.:\n" . - " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { - $p = shift @A; - if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } - } - if ($sil_probs) { - $silp = shift @A; - if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - } - if (!(@A)) { - die "Bad lexicon line $1, no phone in phone list"; - } - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that it exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { shift @A; } # remove pron-prob. - if ($sil_probs) { - shift @A; # Remove silprob - shift @A; # Remove silprob - } - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -# max_disambig will always be the highest-numbered disambiguation symbol that -# has been used so far. -$max_disambig = $first_allowed_disambig - 1; - -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - if ($pron_probs) { - $pron_prob = shift @A; - } - if ($sil_probs) { - $sil_word_prob = shift @A; - $word_sil_correction = shift @A; - $prev_nonsil_correction = shift @A - } - $phnseq = join(" ", @A); - if (!defined $issubseq{$phnseq} - && $count{$phnseq} == 1) { - ; # Do nothing. - } else { - if ($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved_for_the_empty_string{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; - if (!defined $cur_disambig) { - $cur_disambig = $first_allowed_disambig; - } else { - $cur_disambig++; # Get a number that has not been used yet for - # this phone sequence. - } - while (defined $reserved_for_the_empty_string{$cur_disambig}) { - $cur_disambig++; - } - if ($cur_disambig > $max_disambig) { - $max_disambig = $cur_disambig; - } - $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; - $phnseq = $phnseq . " #" . 
$cur_disambig; - } - } - if ($pron_probs) { - if ($sil_probs) { - print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; - } else { - print O "$word\t$pron_prob\t$phnseq\n"; - } - } else { - print O "$word\t$phnseq\n"; - } -} - -print $max_disambig . "\n"; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/compile_lexicon_token_fst.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/compile_lexicon_token_fst.sh deleted file mode 100644 index b67814fe3f3244b14b8e494bfe46c4829c4f8bd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/compile_lexicon_token_fst.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -# Copyright 2015 Yajie Miao (Carnegie Mellon University) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the -# phoneme and character-based lexicons. -set -eo pipefail -. tools/parse_options.sh - -if [ $# -ne 3 ]; then - echo "usage: tools/fst/compile_lexicon_token_fst.sh " - echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" - echo " should contain the following files:" - echo "lexicon.txt units.txt" - echo "options: " - exit 1; -fi - -srcdir=$1 -tmpdir=$2 -dir=$3 -mkdir -p $dir $tmpdir - -[ -f path.sh ] && . ./path.sh - -export LC_ALL=C - -cp $srcdir/units.txt $dir - -# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. -# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. -perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; - -# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. -# Without these symbols, determinization will fail. -ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` -ndisambig=$[$ndisambig+1]; - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list - -# Get the full list of CTC tokens used in FST. These tokens include , the blank , -# the actual model unit, and the disambiguation symbols. -cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list -(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt - -# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, -# so here use ctc_token_fst_compact -tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; - -# Encode the words with indices. 
Will be used in lexicon and language model FST compiling.
-cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk '
-  BEGIN {
-    print "<eps> 0";
-  }
-  {
-    printf("%s %d\n", $1, NR);
-  }
-  END {
-    printf("#0 %d\n", NR+1);
-    printf("<s> %d\n", NR+2);
-    printf("</s> %d\n", NR+3);
-  }' > $dir/words.txt || exit 1;
-
-# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time.
-token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'`
-word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'`
-
-tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \
-  fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \
-  --keep_isymbols=false --keep_osymbols=false | \
-  fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \
-  fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
-
-echo "Lexicon and token FSTs compiling succeeded"
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/ctc_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/ctc_token_fst.py
deleted file mode 100644
index d81644b9cd216177a10a17772781d3293abe084f..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/ctc_token_fst.py
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-print('0 1 <eps> <eps>')
-print('1 1 <blank> <eps>')
-print('2 2 <blank> <eps>')
-print('2 0 <eps> <eps>')
-
-with open(sys.argv[1], 'r', encoding='utf8') as fin:
-    node = 3
-    for entry in fin:
-        fields = entry.strip().split(' ')
-        phone = fields[0]
-        if phone == '<eps>' or phone == '<blank>':
-            continue
-        elif '#' in phone:  # disambiguous phone
-            print('{} {} {} {}'.format(0, 0, '<eps>', phone))
-        else:
-            print('{} {} {} {}'.format(1, node, phone, phone))
-            print('{} {} {} {}'.format(node, node, phone, '<eps>'))
-            print('{} {} {} {}'.format(node, 2, '<eps>', '<eps>'))
-            node += 1
-print('0')
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/ctc_token_fst_compact.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/ctc_token_fst_compact.py
deleted file mode 100644
index d3018d8b14ce25108cb1acc637cecded5d41be13..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/ctc_token_fst_compact.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-print('0 0 <blank> <eps>')
-
-with open(sys.argv[1], 'r', encoding='utf8') as fin:
-    node = 1
-    for entry in fin:
-        fields = entry.strip().split(' ')
-        phone = fields[0]
-        if phone == '<eps>' or phone == '<blank>':
-            continue
-        elif '#' in phone:  # disambiguous phone
-            print('{} {} {} {}'.format(0, 0, '<eps>', phone))
-        else:
-            print('{} {} {} {}'.format(0, node, phone, phone))
-            print('{} {} {} {}'.format(node, node, phone, '<eps>'))
-            print('{} {} {} {}'.format(node, 0, '<eps>', '<eps>'))
-            node += 1
-print('0')
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/ctc_token_fst_corrected.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/ctc_token_fst_corrected.py
deleted file mode 100644
index 81f7079eccb9e6447c46cdfdf6378aca7efe4a09..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/ctc_token_fst_corrected.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-
-def il(n):
-    return n + 1
-
-
-def ol(n):
-    return n + 1
-
-
-def s(n):
-    return n
-
-
-if __name__ == "__main__":
-    with open(sys.argv[1]) as f:
-        lines = f.readlines()
-    phone_count = 0
-    disambig_count = 0
-    for line in lines:
-        sp = line.split()
-        phone = sp[0]
-        if phone == '<eps>' or phone == '<blank>':
-            continue
-        if phone.startswith('#'):
-            disambig_count += 1
-        else:
-            phone_count += 1
-
-    # 1. add start state
-    print('0 0 {} 0'.format(il(0)))
-
-    # 2. 0 -> i, i -> i, i -> 0
-    for i in range(1, phone_count + 1):
-        print('0 {} {} {}'.format(s(i), il(i), ol(i)))
-        print('{} {} {} 0'.format(s(i), s(i), il(i)))
-        print('{} 0 {} 0'.format(s(i), il(0)))
-
-    # 3. i -> other phone
-    for i in range(1, phone_count + 1):
-        for j in range(1, phone_count + 1):
-            if i != j:
-                print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j)))
-
-    # 4. add disambiguous arcs on every final state
-    for i in range(0, phone_count + 1):
-        for j in range(phone_count + 2, phone_count + disambig_count + 2):
-            print('{} {} {} {}'.format(s(i), s(i), 0, j))
-
-    # 5. every i is final state
-    for i in range(0, phone_count + 1):
-        print(s(i))
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/eps2disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/eps2disambig.pl
deleted file mode 100644
index e1d84a6bf56703596a0e4552d184f7168f724bcb..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/eps2disambig.pl
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/env perl
-# Copyright 2010-2011 Microsoft Corporation
-#           2015 Guoguo Chen
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-# This script replaces epsilon with #0 on the input side only, of the G.fst
-# acceptor.
-
-while(<>){
-  if (/\s+#0\s+/) {
-    print STDERR "$0: ERROR: LM has word #0, " .
-      "which is reserved as disambiguation symbol\n";
-    exit 1;
-  }
-  s:^(\d+\s+\d+\s+)\<eps\>(\s+):$1#0$2:;
-  print;
-}
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/make_lexicon_fst.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/make_lexicon_fst.pl
deleted file mode 100644
index f97129c05cb3ba6460be401e92001261acfaf746..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/make_lexicon_fst.pl
+++ /dev/null
@@ -1,155 +0,0 @@
-#!/usr/bin/env perl
-use warnings; #sed replacement for -w perl parameter
-# Copyright 2010-2011 Microsoft Corporation
-#                2013 Johns Hopkins University (author: Daniel Povey)
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). - -$pron_probs = 0; - -if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { - $pron_probs = 1; - shift @ARGV; -} - -if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; - print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; - print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; - print STDERR " word phone1 phone2 ... phoneN;\n"; - print STDERR "if the --pron-probs option is used, each line is:\n"; - print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; - print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; - print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; - print STDERR "this is your responsibility.\n\n"; - print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; - print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; - print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; - exit(1); -} - -$lexfn = shift @ARGV; -if (@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2) { - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if ($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - -if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. - while () { - @A = split(" ", $_); - @A == 0 && die "Empty lexicon line."; - foreach $a (@A) { - if ($a eq "") { - die "Bad lexicon line $_ ( is forbidden)"; - } - } - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; # so we only print it on the first arc of the word. - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. 
- if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while () { - @A = split(" ", $_); - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. - $s = $ns; - } elsif (!defined($silphone) || $p ne $silphone) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/make_tlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/make_tlg.sh deleted file mode 100644 index 98694e5540968760f0c27eaf30a6668f4c46c50d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/make_tlg.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# - -if [ -f path.sh ]; then . path.sh; fi - -lm_dir=$1 -src_lang=$2 -tgt_lang=$3 - -arpa_lm=${lm_dir}/lm.arpa -[ ! 
-f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -rm -rf $tgt_lang -cp -r $src_lang $tgt_lang - -# Compose the language model to FST -cat $arpa_lm | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v -i '' | \ - grep -v -i '' | \ - arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \ - tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \ - --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst - - -echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic $tgt_lang/G.fst - -# Compose the token, lexicon and language-model FST into the final decoding graph -fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1; -fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1; - -echo "Composing decoding graph TLG.fst succeeded" -#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/prepare_dict.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/prepare_dict.py deleted file mode 100644 index 8a6a3cfe7cfded0c863637deef0bae2f2ede5557..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/prepare_dict.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -# sys.argv[1]: e2e model unit file(lang_char.txt) -# sys.argv[2]: raw lexicon file -# sys.argv[3]: output lexicon file -# sys.argv[4]: bpemodel - -unit_table = set() -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for line in fin: - unit = line.split()[0] - unit_table.add(unit) - - -def contain_oov(units): - for unit in units: - if unit not in unit_table: - return True - return False - - -bpemode = len(sys.argv) > 4 -if bpemode: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.Load(sys.argv[4]) -lexicon_table = set() -with open(sys.argv[2], 'r', encoding='utf8') as fin, \ - open(sys.argv[3], 'w', encoding='utf8') as fout: - for line in fin: - word = line.split()[0] - if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel - continue - elif word == '': - continue - else: - # each word only has one pronunciation for e2e system - if word in lexicon_table: - continue - if bpemode: - # We assume that the lexicon does not contain code-switch, - # i.e. the word contains both English and Chinese. - # see PR https://github.com/wenet-e2e/wenet/pull/1693 - # and Issue https://github.com/wenet-e2e/wenet/issues/1653 - if word.encode('utf8').isalpha(): - pieces = sp.EncodeAsPieces(word) - else: - pieces = word - if contain_oov(pieces): - print( - 'Ignoring words {}, which contains oov unit'.format( - ''.join(word).strip('▁')) - ) - continue - chars = ' '.join( - [p if p in unit_table else '' for p in pieces]) - else: - # ignore words with OOV - if contain_oov(word): - print('Ignoring words {}, which contains oov unit'.format(word)) - continue - # Optional, append ▁ in front of english word - # we assume the model unit of our e2e system is char now. 
-                if word.encode('utf8').isalpha() and '▁' in unit_table:
-                    word = '▁' + word
-                chars = ' '.join(word)  # word is a char list
-            fout.write('{} {}\n'.format(word, chars))
-            lexicon_table.add(word)
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/remove_oovs.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/remove_oovs.pl
deleted file mode 100644
index ac914c3bd9363eded791cdeb309fd05e980c4f2e..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/remove_oovs.pl
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/usr/bin/env perl
-# Copyright 2010-2011 Microsoft Corporation
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-# This script removes lines that contain these OOVs on either the
-# third or fourth fields of the line. It is intended to remove arcs
-# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in).
-
-if ( @ARGV < 1 && @ARGV > 2) {
-  die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n";
-}
-
-$unklist = shift @ARGV;
-open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n";
-while(<S>){
-  @A = split(" ", $_);
-  @A == 1 || die "Bad line in unknown-symbol list: $_";
-  $unk{$A[0]} = 1;
-}
-
-$num_removed = 0;
-while(<>){
-  @A = split(" ", $_);
-  if(defined $unk{$A[2]} || defined $unk{$A[3]}) {
-    $num_removed++;
-  } else {
-    print;
-  }
-}
-print STDERR "remove_oovs.pl: removed $num_removed lines.\n";
-
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/rnnt_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/rnnt_token_fst.py
deleted file mode 100644
index cc6def1703311ab700a4a01f22c1adda32db9b0d..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/rnnt_token_fst.py
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-print('0 0 <blank> <eps>')
-
-with open(sys.argv[1], 'r', encoding='utf8') as fin:
-    for entry in fin:
-        fields = entry.strip().split(' ')
-        phone = fields[0]
-        if phone == '<eps>' or phone == '<blank>':
-            continue
-        elif '#' in phone:  # disambiguous phone
-            print('{} {} {} {}'.format(0, 0, '<eps>', phone))
-        else:
-            print('{} {} {} {}'.format(0, 0, phone, phone))
-print('0')
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/s2eps.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/s2eps.pl
deleted file mode 100644
index ffeeb8eb6af3c4f319f31ebff80be388d8f59e1a..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/fst/s2eps.pl
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env perl
-# Copyright 2010-2011 Microsoft Corporation
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance
with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces and with (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } - if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } - } - print join("\t", @A) . "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/git-pre-commit b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/git-pre-commit deleted file mode 100644 index b6e448ed375a0ddf502ce332685de8a99e88dc08..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/git-pre-commit +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -e - -echo "Running pre-commit flake8" -python tools/flake8_hook.py diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/install_srilm.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/install_srilm.sh deleted file mode 100644 index 4aa113c14722a73fd3d3f84430025d44173c207b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/install_srilm.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2022 Binbin Zhang(binbzha@qq.com) - -current_path=`pwd` -current_dir=`basename "$current_path"` - -if [ "tools" != "$current_dir" ]; then - echo "You should run this script in tools/ directory!!" - exit 1 -fi - -! command -v gawk > /dev/null && \ - echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; - -srilm_url="https://github.com/BitSpeech/SRILM/archive/refs/tags/1.7.3.tar.gz" - -if [ ! -f ./srilm.tar.gz ]; then - if ! wget -O ./srilm.tar.gz "$srilm_url"; then - echo 'There was a problem downloading the file.' - echo 'Check you internet connection and try again.' - exit 1 - fi -fi - -tar -zxvf srilm.tar.gz -mv SRILM-1.7.3 srilm - -# set the SRILM variable in the top-level Makefile to this directory. -cd srilm -cp Makefile tmpf - -cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \ - > Makefile || exit 1 -rm tmpf - -make || exit -cd .. - -( - [ ! -z "${SRILM}" ] && \ - echo >&2 "SRILM variable is aleady defined. Undefining..." && \ - unset SRILM - - [ -f ./env.sh ] && . ./env.sh - - [ ! 
-z "${SRILM}" ] && \ - echo >&2 "SRILM config is already in env.sh" && exit - - wd=`pwd` - wd=`readlink -f $wd || pwd` - - echo "export SRILM=$wd/srilm" - dirs="\${PATH}" - for directory in $(cd srilm && find bin -type d ) ; do - dirs="$dirs:\${SRILM}/$directory" - done - echo "export PATH=$dirs" -) >> env.sh - -echo >&2 "Installation of SRILM finished successfully" -echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/k2/make_hlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/k2/make_hlg.sh deleted file mode 100644 index 18c2268487410824ae11b199cf06f37acd717c88..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/k2/make_hlg.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) - -lexion_dir=$1 -lm_dir=$2 -tgt_dir=$3 - -# k2 and icefall updates very fast. Below commits are veryfied in this script. -# k2 3dc222f981b9fdbc8061b3782c3b385514a2d444, icefall 499ac24ecba64f687ff244c7d66baa5c222ecf0f - -# For k2 installation, please refer to https://github.com/k2-fsa/k2/ -python -c "import k2; print(k2.__file__)" -python -c "import torch; import _k2; print(_k2.__file__)" - -# Prepare necessary icefall scripts -if [ ! -d tools/k2/icefall ]; then - git clone --depth 1 https://github.com/k2-fsa/icefall.git tools/k2/icefall -fi -pip3 install -r tools/k2/icefall/requirements.txt -export PYTHONPATH=`pwd`/tools/k2/icefall:`pwd`/tools/k2/icefall/egs/aishell/ASR/local:$PYTHONPATH - -# 8.1 Prepare char based lang -mkdir -p $tgt_dir -python tools/k2/prepare_char.py $lexion_dir/units.txt $lm_dir/wordlist $tgt_dir -echo "Compile lexicon L.pt L_disambig.pt succeeded" - -# 8.2 Prepare G -mkdir -p data/lm -python -m kaldilm \ - --read-symbol-table="$tgt_dir/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $lm_dir/lm.arpa > data/lm/G_3_gram.fst.txt - -# 8.3 Compile HLG -python tools/k2/icefall/egs/aishell/ASR/local/compile_hlg.py --lang-dir $tgt_dir -echo "Compile decoding graph HLG.pt succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/k2/prepare_char.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/k2/prepare_char.py deleted file mode 100644 index 6e05042c42eb280135f6be7cdb3566b185258b90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/k2/prepare_char.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -""" - -This script generates the following files in the directory sys.argv[3]: - - - lexicon.txt - - lexicon_disambig.txt - - L.pt - - L_disambig.pt - - tokens.txt - - words.txt -""" - -import sys -from pathlib import Path -from typing import Dict, List - -import k2 -import torch -from prepare_lang import ( - Lexicon, - add_disambig_symbols, - add_self_loops, - write_lexicon, - write_mapping, -) - - -def lexicon_to_fst_no_sil( - lexicon: Lexicon, - token2id: Dict[str, int], - word2id: Dict[str, int], - need_self_loops: bool = False, -) -> k2.Fsa: - """Convert a lexicon to an FST (in k2 format). - - Args: - lexicon: - The input lexicon. See also :func:`read_lexicon` - token2id: - A dict mapping tokens to IDs. - word2id: - A dict mapping words to IDs. - need_self_loops: - If True, add self-loop to states with non-epsilon output symbols - on at least one arc out of the state. The input label for this - self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. - Returns: - Return an instance of `k2.Fsa` representing the given lexicon. - """ - loop_state = 0 # words enter and leave from here - next_state = 1 # the next un-allocated state, will be incremented as we go - - arcs = [] - - # The blank symbol is defined in local/train_bpe_model.py - assert token2id[""] == 0 - assert word2id[""] == 0 - - eps = 0 - - for word, pieces in lexicon: - assert len(pieces) > 0, f"{word} has no pronunciations" - cur_state = loop_state - - word = word2id[word] - pieces = [ - token2id[i] if i in token2id else token2id[""] for i in pieces - ] - - for i in range(len(pieces) - 1): - w = word if i == 0 else eps - arcs.append([cur_state, next_state, pieces[i], w, 0]) - - cur_state = next_state - next_state += 1 - - # now for the last piece of this word - i = len(pieces) - 1 - w = word if i == 0 else eps - arcs.append([cur_state, loop_state, pieces[i], w, 0]) - - if need_self_loops: - disambig_token = token2id["#0"] - disambig_word = word2id["#0"] - arcs = add_self_loops( - arcs, - disambig_token=disambig_token, - disambig_word=disambig_word, - ) - - final_state = next_state - arcs.append([loop_state, final_state, -1, -1, 0]) - arcs.append([final_state]) - - arcs = sorted(arcs, key=lambda arc: arc[0]) - arcs = [[str(i) for i in arc] for arc in arcs] - arcs = [" ".join(arc) for arc in arcs] - arcs = "\n".join(arcs) - - fsa = k2.Fsa.from_str(arcs, acceptor=False) - return fsa - - -def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool: - """Check if all the given tokens are in token symbol table. - - Args: - token_sym_table: - Token symbol table that contains all the valid tokens. - tokens: - A list of tokens. - Returns: - Return True if there is any token not in the token_sym_table, - otherwise False. - """ - for tok in tokens: - if tok not in token_sym_table: - return True - return False - - -def generate_lexicon( - token_sym_table: Dict[str, int], words: List[str] -) -> Lexicon: - """Generate a lexicon from a word list and token_sym_table. - - Args: - token_sym_table: - Token symbol table that mapping token to token ids. - words: - A list of strings representing words. - Returns: - Return a dict whose keys are words and values are the corresponding - tokens. 
- """ - lexicon = [] - for word in words: - chars = list(word.strip(" \t")) - if contain_oov(token_sym_table, chars): - continue - lexicon.append((word, chars)) - - # The OOV word is - lexicon.append(("", [""])) - return lexicon - - -def generate_tokens(text_file: str) -> Dict[str, int]: - """Generate tokens from the given text file. - - Args: - text_file: - A file that contains text lines to generate tokens. - Returns: - Return a dict whose keys are tokens and values are token ids ranged - from 0 to len(keys) - 1. - """ - token2id: Dict[str, int] = dict() - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - char, index = line.replace('\n', '').split() - assert char not in token2id - token2id[char] = int(index) - assert token2id[''] == 0 - return token2id - - -def generate_words(text_file: str) -> Dict[str, int]: - """Generate words from the given text file. - - Args: - text_file: - A file that contains text lines to generate words. - Returns: - Return a dict whose keys are words and values are words ids ranged - from 0 to len(keys) - 1. - """ - words = [] - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - word = line.replace('\n', '') - assert word not in words - words.append(word) - words.sort() - - # We put '' '' at begining of word2id - # '#0', '', '' at end of word2id - words = [word for word in words - if word not in ['', '', '#0', '', '']] - words.insert(0, '') - words.insert(1, '') - words.append('#0') - words.append('') - words.append('') - word2id = {j: i for i, j in enumerate(words)} - return word2id - - -def main(): - token2id = generate_tokens(sys.argv[1]) - word2id = generate_words(sys.argv[2]) - tgt_dir = Path(sys.argv[3]) - - words = [word for word in word2id.keys() - if word not in - ["", "!SIL", "", "", "#0", "", ""]] - lexicon = generate_lexicon(token2id, words) - - lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) - next_token_id = max(token2id.values()) + 1 - for i in range(max_disambig + 1): - disambig = f"#{i}" - assert disambig not in token2id - token2id[disambig] = next_token_id - next_token_id += 1 - - write_mapping(tgt_dir / "tokens.txt", token2id) - write_mapping(tgt_dir / "words.txt", word2id) - write_lexicon(tgt_dir / "lexicon.txt", lexicon) - write_lexicon(tgt_dir / "lexicon_disambig.txt", lexicon_disambig) - - L = lexicon_to_fst_no_sil( - lexicon, - token2id=token2id, - word2id=word2id, - ) - L_disambig = lexicon_to_fst_no_sil( - lexicon_disambig, - token2id=token2id, - word2id=word2id, - need_self_loops=True, - ) - torch.save(L.as_dict(), tgt_dir / "L.pt") - torch.save(L_disambig.as_dict(), tgt_dir / "L_disambig.pt") - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/latency_metrics.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/latency_metrics.py deleted file mode 100644 index df2d8eee45f8e2d7c8536f208d44fafaeac3341f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/latency_metrics.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2022 Horizon Inc. (author: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import argparse -import logging -import librosa -import torch -import torchaudio -import yaml - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.font_manager as fm -import torchaudio.compliance.kaldi as kaldi - -from wenet.utils.init_model import init_model -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.mask import make_pad_mask -from wenet.utils.common import replace_duplicates_with_blank - - -def get_args(): - parser = argparse.ArgumentParser( - description='Analyze latency and plot CTC-Spike.') - parser.add_argument('--config', required=True, - type=str, help='configration') - parser.add_argument('--gpu', - type=int, - default=0, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--ckpt', required=True, - type=str, help='model checkpoint') - parser.add_argument('--tag', required=True, - type=str, help='image subtitle') - parser.add_argument('--wavscp', required=True, - type=str, help='wav.scp') - parser.add_argument('--alignment', required=True, - type=str, help='force alignment, generated by Kaldi.') - parser.add_argument('--chunk_size', required=True, - type=int, help='chunk size') - parser.add_argument('--left_chunks', default=-1, - type=int, help='left chunks') - parser.add_argument('--font', required=True, - type=str, help='font file') - parser.add_argument('--dict', required=True, - type=str, help='dict file') - parser.add_argument('--result_dir', required=True, - type=str, help='saving pdf') - parser.add_argument('--model_type', default='ctc', - choices=['ctc', 'transducer'], - help='show latency metrics from ctc models or rnn-t models') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - torch.manual_seed(777) - - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - symbol_table = read_symbol_table(args.dict) - char_dict = {v: k for k, v in symbol_table.items()} - - # 1. Load model - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - - model = init_model(conf) - load_checkpoint(model, args.ckpt) - model = model.eval().to(device) - - subsampling = model.encoder.embed.subsampling_rate - eos = model.eos_symbol() - - with open(args.wavscp, 'r') as fin: - wavs = fin.readlines() - - # 2. 
Forward model (get streaming_timestamps) - timestamps = {} - for idx, wav in enumerate(wavs): - if idx % 100 == 0: - logging.info("processed {}.".format(idx)) - key, wav = wav.strip().split(' ', 1) - waveform, sr = torchaudio.load(wav) - resample_rate = conf['dataset_conf']['resample_conf']['resample_rate'] - waveform = torchaudio.transforms.Resample( - orig_freq=sr, new_freq=resample_rate)(waveform) - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank( - waveform, - num_mel_bins=conf['dataset_conf']['fbank_conf']['num_mel_bins'], - frame_length=conf['dataset_conf']['fbank_conf']['frame_length'], - frame_shift=conf['dataset_conf']['fbank_conf']['frame_shift'], - dither=0.0, energy_floor=0.0, - sample_frequency=resample_rate, - ) - - speech = mat.unsqueeze(0).to(device) - speech_lengths = torch.tensor([mat.size(0)]).to(device) - - # Let's assume batch_size = 1 - encoder_out, encoder_mask = model.encoder( - speech, speech_lengths, args.chunk_size, args.left_chunks) - - maxlen = encoder_out.size(1) # (B, maxlen, encoder_dim) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # CTC greedy search - if args.model_type == 'ctc': - ctc_probs = model.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - topk_prob = topk_prob.view(1, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen) - topk_prob = topk_prob.masked_fill_(mask, 0.0) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - hyps = [replace_duplicates_with_blank(hyp) for hyp in hyps] - scores = [prob.tolist() for prob in topk_prob] - timestamps[key] = [hyps[0], scores[0], wav] - - if args.model_type == 'transducer': - hyps = [] - scores = [] - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = 1 - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, - padding, cache) - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - scores.append(torch.max(joint_out_probs).item()) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or \ - per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - hyps.append(model.blank) - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - timestamps[key] = [hyps, scores, wav] - - # 3. 
Analyze latency - with open(args.alignment, 'r') as fin: - aligns = fin.readlines() - not_found, len_unequal, ignored = 0, 0, 0 - datas = [] - for align in aligns: - key, align = align.strip().split(' ', 1) - if key not in timestamps: - not_found += 1 - continue - fa, st = [], [] # force_alignment, streaming_timestamps - text_fa, text_st = "", "" - for i, token in enumerate(align.split()): - if token != '': - text_fa += token - # NOTE(xcsong): W/O subsample - fa.append(i * 10) - # ignore alignment_errors >= 70ms - frames_fa = len(align.split()) - frames_st = len(timestamps[key][0]) * subsampling - if abs(frames_st - frames_fa) >= 7: - ignored += 1 - continue - for i, token_id in enumerate(timestamps[key][0]): - if token_id != 0: - text_st += char_dict[token_id] - # NOTE(xcsong): W subsample - st.append(i * subsampling * 10) - if len(fa) != len(st): - len_unequal += 1 - continue - # datas[i] = [key, text_fa, text_st, list_of_diff, - # FirstTokenDelay, LastTokenDelay, AvgTokenDelay, - # streaming_timestamps, force_alignment] - datas.append([key, text_fa, text_st, - [a - b for a, b in zip(st, fa)], - st[0] - fa[0], st[-1] - fa[-1], - (sum(st) - sum(fa)) / len(st), - timestamps[key], align.split()]) - - logging.info("not found: {}, length unequal: {}, ignored: {}, \ - valid samples: {}".format(not_found, len_unequal, ignored, len(datas))) - - # 4. Plot and print - num_datas = len(datas) - names = ['FirstTokenDelay', 'LastTokenDelay', 'AvgTokenDelay'] - names_index = [4, 5, 6] - parts = ['max', 'P90', 'P75', 'P50', 'P25', 'min'] - parts_index = [num_datas - 1, int(num_datas * 0.90), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - for name, name_idx in zip(names, names_index): - def f(name_idx=name_idx): - return name_idx - datas.sort(key=lambda x: x[f()]) - logging.info("==========================") - for p, i in zip(parts, parts_index): - data = datas[i] - # i.e., LastTokenDelay P90: 270.000 ms (wav_id: BAC009S0902W0144) - logging.info("{} {}: {:.3f} ms (wav_id: {})".format( - name, p, data[f()], datas[i][0])) - - font = fm.FontProperties(fname=args.font) - plt.rcParams['axes.unicode_minus'] = False - # we will have 2 sub-plots (force-align + streaming timestamps) - # plus one wav-plot - fig, axes = plt.subplots(figsize=(60, 60), nrows=3, ncols=1) - for j in range(2): - if j == 0: - # subplot-0: streaming_timestamps - plt_prefix = args.tag + "_" + name + "_" + p - x = np.arange(len(data[7][0])) * subsampling - hyps, scores = data[7][0], data[7][1] - else: - # subplot-1: force_alignments - plt_prefix = "force_alignment" - x = np.arange(len(data[8])) - hyps = [symbol_table[d] for d in data[8]] - scores = [0.0] * len(data[8]) - axes[j].set_title(plt_prefix, fontsize=30) - for frame, token, prob in zip(x, hyps, scores): - if char_dict[token] != '': - axes[j].bar( - frame, np.exp(prob), - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].text( - frame, np.exp(prob), - '{} {:.3f} {}'.format( - char_dict[token], np.exp(prob), frame), - fontdict=dict(fontsize=24), - fontproperties=font, - ) - else: - axes[j].bar( - frame, 0.01, - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].tick_params(labelsize=25) - - # subplot-2: wav - # wav, hardcode sample_rate to 16000 - samples, sr = librosa.load(data[7][2], sr=16000) - time = np.arange(0, len(samples)) * (1.0 / sr) - axes[-1].plot(time, samples) - - # i.e., RESULT_DIR/LTD_P90_120ms_BAC009S0768W0342.pdf - plt.savefig(args.result_dir + "/" + name + "_" + - p + "_" + str(data[f()]) 
+ "ms" + "_" + data[0] + ".pdf") - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/make_raw_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/make_raw_list.py deleted file mode 100644 index 2f84f015542bb38da027b8ea61e8638f873cec33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/make_raw_list.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('output_file', help='output list file') - args = parser.parse_args() - - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - if args.segments is not None: - segments_table = {} - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - with open(args.text_file, 'r', encoding='utf8') as fin, \ - open(args.output_file, 'w', encoding='utf8') as fout: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if args.segments is None: - assert key in wav_table - wav = wav_table[key] - line = dict(key=key, wav=wav, txt=txt) - else: - assert key in segments_table - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - line = dict(key=key, wav=wav, txt=txt, start=start, end=end) - json_line = json.dumps(line, ensure_ascii=False) - fout.write(json_line + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/make_shard_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/make_shard_list.py deleted file mode 100644 index 1f7d82829808c9cc181bbc5e0f60cccef8795bae..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/make_shard_list.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import io -import logging -import os -import tarfile -import time -import multiprocessing - -import torch -import torchaudio -import torchaudio.backend.sox_io_backend as sox - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def write_tar_file(data_list, - no_segments, - tar_file, - resample=16000, - index=0, - total=1): - logging.info('Processing {} {}/{}'.format(tar_file, index, total)) - read_time = 0.0 - save_time = 0.0 - write_time = 0.0 - with tarfile.open(tar_file, "w") as tar: - prev_wav = None - for item in data_list: - if no_segments: - key, txt, wav = item - else: - key, txt, wav, start, end = item - - suffix = wav.split('.')[-1] - assert suffix in AUDIO_FORMAT_SETS - if no_segments: - ts = time.time() - with open(wav, 'rb') as fin: - data = fin.read() - read_time += (time.time() - ts) - else: - if wav != prev_wav: - ts = time.time() - waveforms, sample_rate = sox.load(wav, normalize=False) - read_time += (time.time() - ts) - prev_wav = wav - start = int(start * sample_rate) - end = int(end * sample_rate) - audio = waveforms[:1, start:end] - - # resample - if sample_rate != resample: - if not audio.is_floating_point(): - # normalize the audio before resample - # because resample can't process int audio - audio = audio / (1 << 15) - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - audio = (audio * (1 << 15)).short() - else: - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - - ts = time.time() - f = io.BytesIO() - sox.save(f, audio, resample, format="wav", bits_per_sample=16) - # Save to wav for segments file - suffix = "wav" - f.seek(0) - data = f.read() - save_time += (time.time() - ts) - - assert isinstance(txt, str) - ts = time.time() - txt_file = key + '.txt' - txt = txt.encode('utf8') - txt_data = io.BytesIO(txt) - txt_info = tarfile.TarInfo(txt_file) - txt_info.size = len(txt) - tar.addfile(txt_info, txt_data) - - wav_file = key + '.' 
+ suffix - wav_data = io.BytesIO(data) - wav_info = tarfile.TarInfo(wav_file) - wav_info.size = len(data) - tar.addfile(wav_info, wav_data) - write_time += (time.time() - ts) - logging.info('read {} save {} write {}'.format(read_time, save_time, - write_time)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--num_utts_per_shard', - type=int, - default=1000, - help='num utts per shard') - parser.add_argument('--num_threads', - type=int, - default=1, - help='num threads for make shards') - parser.add_argument('--prefix', - default='shards', - help='prefix of shards tar file') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('--resample', - type=int, - default=16000, - help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('shards_dir', help='output shards dir') - parser.add_argument('shards_list', help='output shards list file') - args = parser.parse_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - torch.set_num_threads(1) - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - no_segments = True - segments_table = {} - if args.segments is not None: - no_segments = False - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - data = [] - with open(args.text_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if no_segments: - assert key in wav_table - wav = wav_table[key] - data.append((key, txt, wav)) - else: - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - data.append((key, txt, wav, start, end)) - - num = args.num_utts_per_shard - chunks = [data[i:i + num] for i in range(0, len(data), num)] - os.makedirs(args.shards_dir, exist_ok=True) - - # Using thread pool to speedup - pool = multiprocessing.Pool(processes=args.num_threads) - shards_list = [] - tasks_list = [] - num_chunks = len(chunks) - for i, chunk in enumerate(chunks): - tar_file = os.path.join(args.shards_dir, - '{}_{:09d}.tar'.format(args.prefix, i)) - shards_list.append(tar_file) - pool.apply_async( - write_tar_file, - (chunk, no_segments, tar_file, args.resample, i, num_chunks)) - - pool.close() - pool.join() - - with open(args.shards_list, 'w', encoding='utf8') as fout: - for name in shards_list: - fout.write(name + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/merge_scp2txt.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/merge_scp2txt.py deleted file mode 100644 index 51f1c42f272f0fd9fec0a7d69ee860d2f1eb6158..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/merge_scp2txt.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -from distutils.util import strtobool -from io import open -import logging -import sys - -PY2 = sys.version_info[0] == 2 -sys.stdin = codecs.getreader('utf-8')(sys.stdin if PY2 else 
sys.stdin.buffer) -sys.stdout = codecs.getwriter('utf-8')( - sys.stdout if PY2 else sys.stdout.buffer) - - -# Special types: -def shape(x): - """Change str to List[int] - - >>> shape('3,5') - [3, 5] - >>> shape(' [3, 5] ') - [3, 5] - - """ - - # x: ' [3, 5] ' -> '3, 5' - x = x.strip() - if x[0] == '[': - x = x[1:] - if x[-1] == ']': - x = x[:-1] - - return list(map(int, x.split(','))) - - -def get_parser(): - parser = argparse.ArgumentParser( - description='Given each file paths with such format as ' - '::. type> can be omitted and the default ' - 'is "str". e.g. {} ' - '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' - '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' - '--output-scps text:data/text shape:data/utt2text_shape:shape ' - '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--input-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the inputs') - parser.add_argument('--output-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the outputs') - parser.add_argument('--scps', - type=str, - nargs='+', - default=[], - help='The files except for the input and outputs') - parser.add_argument('--verbose', - '-V', - default=1, - type=int, - help='Verbose option') - parser.add_argument('--allow-one-column', - type=strtobool, - default=False, - help='Allow one column in input scp files. ' - 'In this case, the value will be empty string.') - parser.add_argument('--out', - '-O', - type=str, - help='The output filename. ' - 'If omitted, then output to sys.stdout') - return parser - - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - args.scps = [args.scps] - - # logging info - logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - if args.verbose > 0: - logging.basicConfig(level=logging.INFO, format=logfmt) - else: - logging.basicConfig(level=logging.WARN, format=logfmt) - - inputs = {} - assert (len(args.input_scps) == 1) - for f in args.input_scps[0]: - arr = f.strip().split(':') - inputs[arr[0]] = arr[1] - assert ('feat' in inputs) - assert ('shape' in inputs) - - outputs = {} - assert (len(args.output_scps) == 1) - for f in args.output_scps[0]: - arr = f.strip().split(':') - outputs[arr[0]] = arr[1] - assert ('shape' in outputs) - assert ('text' in outputs) - assert ('token' in outputs) - assert ('tokenid' in outputs) - - files = [ - inputs['feat'], inputs['shape'], outputs['text'], outputs['token'], - outputs['tokenid'], outputs['shape'] - ] - fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape'] - fids = [open(f, 'r', encoding='utf-8') for f in files] - - if args.out is None: - out = sys.stdout - else: - out = open(args.out, 'w', encoding='utf-8') - done = False - while not done: - for i, fid in enumerate(fids): - line = fid.readline() - if line == '': - done = True - break - arr = line.strip().split() - content = ' '.join(arr[1:]) - if i == 0: - out.write('utt:{}'.format(arr[0])) - out.write('\t') - out.write('{}:{}'.format(fields[i], content)) - out.write('\n') - - for f in fids: - f.close() - if args.out is not None: - out.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/onnx2horizonbin.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/onnx2horizonbin.py deleted file mode 100644 index 
a94b647fb19d1446d4bc506c399c85677dddde9f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/onnx2horizonbin.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. specific decoding method: ctc_greedy_search -""" - -import argparse -import copy -import logging -import os -import sys -import random -import torch -import yaml -import numpy as np - -from torch.utils.data import DataLoader - -from wenet.utils.common import remove_duplicates_and_blank -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import to_numpy -from wenet.bin.export_onnx_bpu import export_encoder, export_ctc - - -try: - import hbdk # noqa: F401 - import horizon_nn # noqa: F401 - from horizon_tc_ui import HB_ONNXRuntime -except ImportError: - print('Please install hbdk,horizon_nn,horizon_tc_ui !') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def save_data(tensor, dirs, prefix): - if tensor.requires_grad: - data = tensor.detach().numpy().astype(np.float32) - else: - data = tensor.numpy().astype(np.float32) - os.makedirs(dirs, exist_ok=True) - data.tofile(dirs + "/" + prefix + ".bin") - - -def make_calibration_data(enc, args, conf): - conf['shuffle'] = True - logger.info(conf) - dataset = Dataset( - "shard", args.cali_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - cal_data_dir = os.path.join(args.output_dir, 
'cal_data_dir') - for batch_idx, batch in enumerate(dataloader): - if batch_idx >= args.max_samples: - break - if batch_idx % 100 == 0: - logger.info("processed {} samples.".format(batch_idx)) - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - - # Feed forward overlap input step by step - random_high = (num_frames - context) // stride - num_rand = random.randint(0, random_high) - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - if i == num_rand: - save_data(chunk, "{}/chunk".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_cache, "{}/att_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(cnn_cache, "{}/cnn_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_mask, "{}/att_mask".format(cal_data_dir), - prefix + "." + str(i)) - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - # NOTE(xcsong): It's fast to calibrate ctc.onnx, - # so it's okay to save all chunks - save_data(y, "{}/hidden".format(cal_data_dir), - prefix + "." 
+ str(i)) - - -def check_wer(enc, ctc, args, conf): - conf['shuffle'] = False - dataset = Dataset( - "shard", args.wer_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - char_dict = {v: k for k, v in args.symbol_table.items()} - eos = len(char_dict) - 1 - - enc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_encoder/encoder_quantized_model.onnx") - ctc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_ctc/ctc_quantized_model.onnx") - torch_file = open(args.output_dir + "/torch_text", 'w', encoding="utf-8") - onnx_file = open(args.output_dir + "/onnx_text", 'w', encoding="utf-8") - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - for batch_idx, batch in enumerate(dataloader): - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - - # Feed forward overlap input step by step - torch_out, onnx_out = [], [] - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - # Torch model - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - torch_out.append(ctc.forward(y).transpose(1, 3).squeeze(2)) - # Quantized onnx model - ort_inputs = { - 'chunk': to_numpy(chunk), 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': to_numpy(att_mask)} - ort_outs = enc_session.run_feature( - enc_session.output_names, ort_inputs, input_offset=0) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_y = ctc_session.run_feature( - ctc_session.output_names, {'hidden': ort_outs[0]}, input_offset=0) - onnx_out.append(torch.from_numpy( - np.squeeze(onnx_y[0].transpose(0, 3, 2, 1), axis=2))) - - def post_process(list_out, file_obj, keys): - probs = torch.cat(list_out, dim=1) - maxlen = probs.size(1) - topk_prob, topk_index = probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - hyps = 
[hyp.tolist() for hyp in topk_index]
-            scores = topk_prob.max(1)
-            hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps]
-            for i, key in enumerate(keys):
-                content = ''
-                for w in hyps[i]:
-                    if w == eos:
-                        break
-                    content += char_dict[w]
-                file_obj.write('{} {}\n'.format(key, content))
-            return key, content
-
-        if len(torch_out) > 0 and len(onnx_out) > 0:
-            key, content = post_process(torch_out, torch_file, keys)
-            logger.info('torch: {} {}'.format(key, content))
-            key, content = post_process(onnx_out, onnx_file, keys)
-            logger.info('onnx : {} {}'.format(key, content))
-    torch_file.close()
-    onnx_file.close()
-
-
-def generate_config(enc_session, ctc_session, args):
-    template = """
-# Model parameters group
-model_parameters:
-  # The original ONNX floating-point model file
-  onnx_model: '{}'
-  # Target AI chip architecture of the conversion
-  march: 'bernoulli2'
-  # Name prefix of the converted model files used for on-board execution
-  output_model_file_prefix: '{}'
-  # Directory for storing the model conversion outputs
-  working_dir: '{}'
-  # Whether the converted hybrid model keeps the ability to dump the intermediate results of each layer
-  layer_out_dump: False
-  # Log level during conversion
-  log_level: 'debug'
-# Input information parameters group
-input_parameters:
-  # Input node names of the original floating-point model
-  input_name: '{}'
-  # Input data types of the original floating-point model (count/order consistent with input_name)
-  input_type_train: '{}'
-  # Input data layouts of the original floating-point model (count/order consistent with input_name)
-  input_layout_train: '{}'
-  # Input data shapes of the original floating-point model
-  input_shape: '{}'
-  # batch_size fed to the network at actual execution time, default is 1
-  # input_batch: 1
-  # Input data preprocessing method added into the model
-  norm_type: '{}'
-  # Mean subtracted from the image by the preprocessing method; per-channel means must be separated by spaces
-  # mean_value: ''
-  # Image scale factor of the preprocessing method; per-channel scales must be separated by spaces
-  # scale_value: ''
-  # Input data types the converted hybrid model should adapt to (count/order consistent with input_name)
-  input_type_rt: '{}'
-  # Special standard of the input data format
-  input_space_and_range: ''
-  # Input data layouts the converted hybrid model should adapt to (count/order consistent with input_name)
-  input_layout_rt: '{}'
-# Calibration parameters group
-calibration_parameters:
-  # Directory holding the calibration samples used for model calibration
-  cal_data_dir: '{}'
-  # Enable automatic processing of image calibration samples (skimage read, resize to the input node size)
-  preprocess_on: False
-  # Algorithm type used for calibration
-  calibration_type: '{}'
-  # Parameter of the max calibration method
-  max_percentile: 1.0
-  # Force the specified OPs to run on the CPU
-  run_on_cpu: '{}'
-  # Force the specified OPs to run on the BPU
-  run_on_bpu: '{}'
-# Compiler parameters group
-compiler_parameters:
-  # Compilation strategy selection
-  compile_mode: 'latency'
-  # Whether to enable debug information for compilation
-  debug: False
-  # Number of cores used when running the model
-  core_num: 1
-  # Optimization level for model compilation
-  optimize_level: 'O3'
-"""
-    output_dir = os.path.realpath(args.output_dir)
-    cal_data_dir = os.path.join(output_dir, 'cal_data_dir')
-    os.makedirs(cal_data_dir, exist_ok=True)
-    enc_dic = enc_session.get_modelmeta().custom_metadata_map
-    enc_onnx_path = os.path.join(output_dir, 'encoder.onnx')
-    enc_log_path = os.path.join(output_dir, 'hb_makertbin_output_encoder')
-    enc_cal_data = ";".join(
-        [cal_data_dir + "/" + x for x in enc_dic['input_name'].split(';')])
-    ctc_dic = ctc_session.get_modelmeta().custom_metadata_map
-    ctc_onnx_path = os.path.join(output_dir, 'ctc.onnx')
-    ctc_log_path = os.path.join(output_dir, 'hb_makertbin_output_ctc')
-    ctc_cal_data = ";".join(
-        [cal_data_dir + "/" + x for x in ctc_dic['input_name'].split(';')])
-    enc_config = template.format(
-        enc_onnx_path, "encoder", enc_log_path,
-        enc_dic['input_name'], enc_dic['input_type'],
-        enc_dic['input_layout_train'], enc_dic['input_shape'],
-        enc_dic['norm_type'], enc_dic['input_type'], enc_dic['input_layout_rt'],
-        enc_cal_data, args.calibration_type, args.extra_ops_run_on_cpu, "")
-    ctc_config = template.format(
-        ctc_onnx_path, "ctc", ctc_log_path,
-        ctc_dic['input_name'], ctc_dic['input_type'],
-        ctc_dic['input_layout_train'], ctc_dic['input_shape'],
-        ctc_dic['norm_type'], ctc_dic['input_type'], ctc_dic['input_layout_rt'],
-        ctc_cal_data, "default", "", "")
-    with open(output_dir + "/config_encoder.yaml", "w") as enc_yaml:
-        enc_yaml.write(enc_config)
-    with open(output_dir +
"/config_ctc.yaml", "w") as ctc_yaml: - ctc_yaml.write(ctc_config) - - -def get_args(): - parser = argparse.ArgumentParser(description='convert onnx to horizon .bin') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - parser.add_argument('--dict', type=str, required=True, help='dict file') - parser.add_argument('--max_samples', type=int, required=True, - help='maximum samples') - parser.add_argument('--cali_datalist', type=str, default=None, - help='make calibration data') - parser.add_argument('--wer_datalist', type=str, default=None, - help='check wer') - parser.add_argument('--wer_text', type=str, default=None, - help='check wer') - parser.add_argument('--bpe_model', default=None, type=str, - help='bpe model for english part') - parser.add_argument('--ln_run_on_bpu', action='store_true', - help='layernorm running on bpu') - parser.add_argument('--extra_ops_run_on_cpu', type=str, default=None, - help='extra operations running on cpu.') - parser.add_argument('--calibration_type', type=str, default='default', - help='kl / max / default.') - return parser - - -if __name__ == '__main__': - random.seed(777) - parser = get_args() - args = parser.parse_args() - # NOTE(xcsong): X3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(conf) - load_checkpoint(model, args.checkpoint) - model.eval() - - symbol_table = read_symbol_table(args.dict) - args.symbol_table = symbol_table - args.feature_size = conf['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - logger.info("Stage-1: Export onnx") - enc, enc_session = export_encoder(model, args) - ctc, ctc_session = export_ctc(model, args) - - conf = copy.deepcopy(conf['dataset_conf']) - conf['filter_conf']['max_length'] = 102400 - conf['filter_conf']['min_length'] = 0 - conf['filter_conf']['token_max_length'] = 102400 - conf['filter_conf']['token_min_length'] = 0 - conf['filter_conf']['max_output_input_ratio'] = 102400 - conf['filter_conf']['min_output_input_ratio'] = 0 - conf['speed_perturb'] = False - conf['spec_aug'] = False - conf['spec_sub'] = False - conf['spec_trim'] = False - conf['shuffle'] = False - conf['sort'] = False - if 'fbank_conf' in conf: - conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in conf: - conf['mfcc_conf']['dither'] = 0.0 - conf['batch_conf']['batch_type'] = "static" - conf['batch_conf']['batch_size'] = 1 - - if args.cali_datalist is not None: - logger.info("Stage-2: Generate config") - # FIXME(xcsong): Remove hard code - logger.info("torch version: {}".format(torch.__version__)) - if int(torch.__version__[:4].split('.')[1]) >= 13: - args.extra_ops_run_on_cpu = "/Split;" + \ - "/encoders.0/self_attn/Split;/encoders.1/self_attn/Split;" + \ - 
"/encoders.2/self_attn/Split;/encoders.3/self_attn/Split;" + \ - "/encoders.4/self_attn/Split;/encoders.5/self_attn/Split;" + \ - "/encoders.6/self_attn/Split;/encoders.7/self_attn/Split;" + \ - "/encoders.8/self_attn/Split;/encoders.9/self_attn/Split;" + \ - "/encoders.10/self_attn/Split;/encoders.11/self_attn/Split;" + \ - "/encoders.0/self_attn/Mul;/encoders.1/self_attn/Mul;" + \ - "/encoders.2/self_attn/Mul;/encoders.3/self_attn/Mul;" + \ - "/encoders.4/self_attn/Mul;/encoders.5/self_attn/Mul;" + \ - "/encoders.6/self_attn/Mul;/encoders.7/self_attn/Mul;" + \ - "/encoders.8/self_attn/Mul;/encoders.9/self_attn/Mul;" + \ - "/encoders.10/self_attn/Mul;/encoders.11/self_attn/Mul;" - else: - args.extra_ops_run_on_cpu = "Split_17;Split_67;Split_209;" + \ - "Split_351;Split_493;Split_635;Split_777;Split_919;Split_1061;" + \ - "Split_1203;Split_1345;Split_1487;Split_1629;" + \ - "Mul_72;Mul_214;Mul_356;Mul_498;Mul_640;Mul_782;" + \ - "Mul_924;Mul_1066;Mul_1208;Mul_1350;Mul_1492;Mul_1634;" - generate_config(enc_session, ctc_session, args) - - logger.info("Stage-3: Make calibration data") - make_calibration_data(enc, args, conf) - - output_dir = os.path.realpath(args.output_dir) - logger.info("Stage-4: Make ctc.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_ctc".format(output_dir) + - " && cd hb_makertbin_log_ctc &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_ctc.yaml") - ) - logger.info("Stage-5: Make encoder.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_encoder ".format(output_dir) + - " && cd hb_makertbin_log_encoder &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_encoder.yaml") - ) - - if args.wer_datalist is not None: - logger.info("Stage-6: Check wer between torch model and quantized onnx") - assert args.wer_text is not None - check_wer(enc, ctc, args, conf) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/torch_text", - args.output_dir + "/torch_wer") - ) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/onnx_text", - args.output_dir + "/onnx_wer") - ) - os.system("tail {} {}".format( - args.output_dir + "/torch_wer", args.output_dir + "/onnx_wer")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/parse_options.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/parse_options.sh deleted file mode 100644 index 34476fdb37a4b14d5fe6e0edbebe97e760d2be5a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- - -# Parse command-line options. -# To be sourced by another script (as in ". parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/perturb_data_dir_speed.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/perturb_data_dir_speed.sh deleted file mode 100644 index 901a4882e6481ae269067b0fe7175dba62c4db9e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/perturb_data_dir_speed.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# 2020 @kamo-naoyuki -# This file was copied from Kaldi and -# I deleted parts related to wav duration -# because we shouldn't use kaldi's command here -# and we don't need the files actually. 
- -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 2014 Tom Ko -# 2018 Emotech LTD (author: Pawel Swietojanski) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# wav.scp -# spk2utt -# utt2spk -# text -# -# It generates the files which are used for perturbing the speed of the original data. - -export LC_ALL=C -set -euo pipefail - -if [[ $# != 3 ]]; then - echo "Usage: perturb_data_dir_speed.sh " - echo "e.g.:" - echo " $0 0.9 data/train_si284 data/train_si284p" - exit 1 -fi - -factor=$1 -srcdir=$2 -destdir=$3 -label="sp" -spk_prefix="${label}${factor}-" -utt_prefix="${label}${factor}-" - -#check is sox on the path - -! command -v sox &>/dev/null && echo "sox: command not found" && exit 1; - -if [[ ! -f ${srcdir}/utt2spk ]]; then - echo "$0: no such file ${srcdir}/utt2spk" - exit 1; -fi - -if [[ ${destdir} == "${srcdir}" ]]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -mkdir -p "${destdir}" - -<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map" -<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map" -<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map" -if [[ ! -f ${srcdir}/utt2uniq ]]; then - <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq" -else - <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq" -fi - - -<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \ - utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk - -utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt - -if [[ -f ${srcdir}/segments ]]; then - - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \ - utils/apply_map.pl -f 2 "${destdir}"/reco_map | \ - awk -v factor="${factor}" \ - '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \ - >"${destdir}"/segments - - utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - if [[ -f ${srcdir}/reco2file_and_channel ]]; then - utils/apply_map.pl -f 1 "${destdir}"/reco_map \ - <"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel - fi - -else # no segments->wav indexed by utterance. 
- if [[ -f ${srcdir}/wav.scp ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - fi -fi - -if [[ -f ${srcdir}/text ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text -fi -if [[ -f ${srcdir}/spk2gender ]]; then - utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender -fi -if [[ -f ${srcdir}/utt2lang ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang -fi - -rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null -echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}" - -utils/validate_data_dir.sh --no-feats --no-text "${destdir}" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/reduce_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/reduce_data_dir.sh deleted file mode 100644 index 16194dcc7309a646041181a698c53cd4f46e618b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/reduce_data_dir.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# koried, 10/29/2012 - -# Reduce a data set based on a list of turn-ids - -help_message="usage: $0 srcdir turnlist destdir" - -if [ $1 == "--help" ]; then - echo "${help_message}" - exit 0; -fi - -if [ $# != 3 ]; then - echo "${help_message}" - exit 1; -fi - -srcdir=$1 -reclist=$2 -destdir=$3 - -if [ ! -f ${srcdir}/utt2spk ]; then -echo "$0: no such file $srcdir/utt2spk" -exit 1; -fi - -function do_filtering { -# assumes the utt2spk and spk2utt files already exist. - [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp - [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text - [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames - [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender - [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp - if [ -f ${srcdir}/segments ]; then - utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments - awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. 
- [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/reco2file_and_channel ] && \ - utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel - - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm - rm ${destdir}/reco - fi - srcutts=$(wc -l < ${srcdir}/utt2spk) - destutts=$(wc -l < ${destdir}/utt2spk) - echo "Reduced #utt from $srcutts to $destutts" -} - -mkdir -p ${destdir} - -# filter the utt2spk based on the set of recordings -utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk - -utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt -do_filtering; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/remove_longshortdata.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/remove_longshortdata.py deleted file mode 100644 index 7e92f8a424d2d717acf6fc1db5503f79ba38a898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/remove_longshortdata.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='remove too long or too short data in format.data') - parser.add_argument('--data_file', - type=str, - help='input format data') - parser.add_argument('--output_data_file', - type=str, - help='output format data') - parser.add_argument('--min_input_len', type=float, - default=0, - help='minimum input seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--max_input_len', type=float, - default=20, - help='maximum output seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--min_output_len', type=float, - default=0, help='minimum input seq length, in modeling units') - parser.add_argument('--max_output_len', type=float, - default=500, - help='maximum output seq length, in modeling units') - parser.add_argument('--min_output_input_ratio', type=float, default=0.05, - help='minimum output seq length/output seq length ratio') - parser.add_argument('--max_output_input_ratio', type=float, default=10, - help='maximum output seq length/output seq length ratio') - args = parser.parse_args() - - data_file = args.data_file - output_data_file = args.output_data_file - min_input_len = args.min_input_len - max_input_len = args.max_input_len - min_output_len = args.min_output_len - max_output_len = args.max_output_len - min_output_input_ratio = args.min_output_input_ratio - max_output_input_ratio = args.max_output_input_ratio - - with open(data_file, 'r') as f, open(output_data_file, 'w') as fout: - for l in f: - l = l.strip() - if l: - items = l.strip().split('\t') - token_shape = items[6] - feature_shape = items[2] - feat_len = float(feature_shape.split(':')[1].split(',')[0]) - token_len = float(token_shape.split(':')[1].split(',')[0]) - condition = [feat_len > min_input_len, - feat_len < max_input_len, - token_len > min_output_len, - token_len < max_output_len, - token_len / feat_len > min_output_input_ratio, - token_len / feat_len < max_output_input_ratio, - ] - if all(condition): - fout.write('{}\n'.format(l)) - continue diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/segment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/segment.py deleted file mode 100644 index a1a7f93a05fbaf42ca09c26c0e5be6a7185f0d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/segment.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2021 Mobvoi Inc. (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='generate segmented wav.scp') - parser.add_argument('--segments', required=True, help='segments file') - parser.add_argument('--input', - required=True, - help='origin wav.scp that not segmented') - parser.add_argument('--output', - required=True, - help='output segmented wav.scp') - wav_dic = {} - args = parser.parse_args() - ori_wav = args.input - segment_file = args.segments - wav_scp = args.output - with open(ori_wav, 'r') as ori: - for l in ori: - item = l.strip().split() - wav_dic[item[0]] = item[1] - with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement: - for l in sgement: - item = l.strip().split() - if item[1] in wav_dic: - item[1] = wav_dic[item[1]] - f.write("{} {},{},{}\n".format(item[0], item[1], item[2], item[3])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/setup_anaconda.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/setup_anaconda.sh deleted file mode 100644 index f53ace9cc4c19994fc79d01e85d70f49d40d673f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/setup_anaconda.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env bash -# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet) -set -euo pipefail - -if [ -z "${PS1:-}" ]; then - PS1=__dummy__ -fi -CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - -if [ $# -gt 4 ]; then - echo "Usage: $0 [output] [conda-env-name] [python-version>]" - exit 1; -elif [ $# -eq 3 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="$3" -elif [ $# -eq 2 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="" -elif [ $# -eq 1 ]; then - output_dir="$1" - name="" - PYTHON_VERSION="" -elif [ $# -eq 0 ]; then - output_dir=venv - name="" - PYTHON_VERSION="" -fi - -if [ -e activate_python.sh ]; then - echo "Warning: activate_python.sh already exists. It will be overwritten" -fi - -if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then - if [ ! -e miniconda.sh ]; then - wget --tries=3 "${CONDA_URL}" -O miniconda.sh - fi - - bash miniconda.sh -b -p "${output_dir}" -fi - -# shellcheck disable=SC1090 -source "${output_dir}/etc/profile.d/conda.sh" -conda deactivate - -# If the env already exists, skip recreation -if [ -n "${name}" ] && ! 
conda activate ${name}; then - conda create -yn "${name}" -fi -conda activate ${name} - -if [ -n "${PYTHON_VERSION}" ]; then - conda install -y conda "python=${PYTHON_VERSION}" -else - conda install -y conda -fi - -conda install -y pip setuptools - -cat << EOF > activate_python.sh -#!/usr/bin/env bash -# THIS FILE IS GENERATED BY tools/setup_anaconda.sh -if [ -z "\${PS1:-}" ]; then - PS1=__dummy__ -fi -. $(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name} -EOF diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/sph2wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/sph2wav.sh deleted file mode 100644 index a8f0749e3be2ee69b5831da6699c303510ecbed4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/sph2wav.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -# convert sph scp to segmented wav scp -nj=1 -. tools/parse_options.sh || exit 1; - -inscp=$1 -segments=$2 -outscp=$3 -data=$(dirname ${inscp}) -if [ $# -eq 4 ]; then - logdir=$4 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe -[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; -sox=`which sox` -[ ! 
-x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2); - printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \ - sort > $data/wav_ori.scp || exit 1; - -tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp -sed -i 's/ /,/g' $data/wav_segments.scp -sed -i 's/#/ /g' $data/wav_segments.scp - -rm -f $logdir/wav_*.slice -rm -f $logdir/*.log -split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - mkdir -p ${data}/wavs/${name} - cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \ - -v logdir=$logdir -v name=$name '{ - during=$4-$3 - cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during; - system(cmd) - printf("%s %s/%s.wav\n", $1, data, $1); - }' | \ - sort > ${data}/wavs_${name}.scp || exit 1; -} & -done -wait -cat ${data}/wavs_*.scp > $outscp -rm ${data}/wavs_*.scp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/spk2utt_to_utt2spk.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/spk2utt_to_utt2spk.pl deleted file mode 100644 index 19fb89d501146e360912863d847d6eabb0194511..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/spm_decode b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/spm_decode deleted file mode 100644 index 882b4f966013d7708460f8d41696583ae59f8fa9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/spm_decode +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for decoding") - parser.add_argument("--input", default=None, help="input file to decode") - parser.add_argument("--input_format", choices=["piece", "id"], default="piece") - args = parser.parse_args() - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.input_format == "piece": - def decode(l): - return "".join(sp.DecodePieces(l)) - elif args.input_format == "id": - def decode(l): - return "".join(sp.DecodeIds(l)) - else: - raise NotImplementedError - - def tok2int(tok): - # remap reference-side (represented as <>) to 0 - return int(tok) if tok != "<>" else 0 - - if args.input is None: - h = sys.stdin - else: - h = open(args.input, "r", encoding="utf-8") - for line in h: - print(decode(line.split())) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/spm_encode b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/spm_encode deleted file mode 100644 index 4dd2e1004f9fe393c2d34b43bade881b84a31b1f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/spm_encode +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import contextlib -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for encoding") - parser.add_argument("--inputs", nargs="+", default=['-'], - help="input files to filter/encode") - parser.add_argument("--outputs", nargs="+", default=['-'], - help="path to save encoded outputs") - parser.add_argument("--output_format", choices=["piece", "id"], default="piece") - parser.add_argument("--min-len", type=int, metavar="N", - help="filter sentence pairs with fewer than N tokens") - parser.add_argument("--max-len", type=int, metavar="N", - help="filter sentence pairs with more than N tokens") - args = parser.parse_args() - - assert len(args.inputs) == len(args.outputs), \ - "number of input and output paths should match" - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.output_format == "piece": - def encode(l): - return sp.EncodeAsPieces(l) - elif args.output_format == "id": - def encode(l): - return list(map(str, sp.EncodeAsIds(l))) - else: - raise NotImplementedError - - if args.min_len is not None or args.max_len is not None: - def valid(line): - return ( - (args.min_len is None or len(line) >= args.min_len) and - (args.max_len is None or len(line) <= args.max_len) - ) - else: - def valid(lines): - return True - - with contextlib.ExitStack() as stack: - inputs = [ - stack.enter_context(open(input, "r", encoding="utf-8")) - if input != "-" else sys.stdin - for input in args.inputs - ] - outputs = [ - stack.enter_context(open(output, "w", 
encoding="utf-8")) - if output != "-" else sys.stdout - for output in args.outputs - ] - - stats = { - "num_empty": 0, - "num_filtered": 0, - } - - def encode_line(line): - line = line.strip() - if len(line) > 0: - line = encode(line) - if valid(line): - return line - else: - stats["num_filtered"] += 1 - else: - stats["num_empty"] += 1 - return None - - for i, lines in enumerate(zip(*inputs), start=1): - enc_lines = list(map(encode_line, lines)) - if not any(enc_line is None for enc_line in enc_lines): - for enc_line, output_h in zip(enc_lines, outputs): - print(" ".join(enc_line), file=output_h) - if i % 10000 == 0: - print("processed {} lines".format(i), file=sys.stderr) - - print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) - print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/spm_train b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/spm_train deleted file mode 100644 index 0b247aee0dc5fcaa7b6cf66d89602e896619c9bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/spm_train +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE -import sys - -import sentencepiece as spm - - -if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/subset_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/subset_data_dir.sh deleted file mode 100644 index c35bee62d8710facb8c42a9171ed3caf0171450f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/subset_data_dir.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2010-2011 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - - -# This script operates on a data directory, such as in data/train/. -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data -# for what these directories contain. - -# This script creates a subset of that data, consisting of some specified -# number of utterances. (The selected utterances are distributed evenly -# throughout the file, by the program ./subset_scp.pl). - -# There are six options, none compatible with any other. - -# If you give the --per-spk option, it will attempt to select the supplied -# number of utterances for each speaker (typically you would supply a much -# smaller number in this case). - -# If you give the --speakers option, it selects a subset of n randomly -# selected speakers. - -# If you give the --shortest option, it will give you the n shortest utterances. - -# If you give the --first option, it will just give you the n first utterances. - -# If you give the --last option, it will just give you the n last utterances. - -# If you give the --spk-list or --utt-list option, it reads the -# speakers/utterances to keep from /" (note, -# in this case there is no positional parameter; see usage message.) 
- - -shortest=false -perspk=false -speakers=false -first_opt= -spk_list= -utt_list= - -expect_args=3 -case $1 in - --first|--last) first_opt=$1; shift ;; - --per-spk) perspk=true; shift ;; - --shortest) shortest=true; shift ;; - --speakers) speakers=true; shift ;; - --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; - --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; - --*) echo "$0: invalid option '$1'"; exit 1 -esac - -if [ $# != $expect_args ]; then - echo "Usage:" - echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] <srcdir> <num-utt> <destdir>" - echo " subset_data_dir.sh [--spk-list <speaker-list-file>] <srcdir> <destdir>" - echo " subset_data_dir.sh [--utt-list <utt-list-file>] <srcdir> <destdir>" - echo "By default, randomly selects <num-utt> utterances from the data directory." - echo "With --speakers, randomly selects enough speakers that we have <num-utt> utterances" - echo "With --per-spk, selects <num-utt> utterances per speaker, if available." - echo "With --first, selects the first <num-utt> utterances" - echo "With --last, selects the last <num-utt> utterances" - echo "With --shortest, selects the <num-utt> shortest utterances." - echo "With --spk-list, reads the speakers to keep from <speaker-list-file>" - echo "With --utt-list, reads the utterances to keep from <utt-list-file>" - exit 1; -fi - -srcdir=$1 -if [[ $spk_list || $utt_list ]]; then - numutt= - destdir=$2 -else - numutt=$2 - destdir=$3 -fi - -export LC_ALL=C - -if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" - exit 1 -fi - -if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then - echo "$0: cannot subset to more utterances than you originally had." - exit 1 -fi - -if $shortest && [ ! -f $srcdir/feats.scp ]; then - echo "$0: you selected --shortest but no feats.scp exist." - exit 1 -fi - -mkdir -p $destdir || exit 1 - -if [[ $spk_list ]]; then - tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; -elif [[ $utt_list ]]; then - tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; -elif $speakers; then - tools/shuffle_list.pl < $srcdir/spk2utt | - awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | - sort > $destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -elif $perspk; then - awk '{ n='$numutt'; printf("%s ",$1); - skip=1; while(n*(skip+1) <= NF-1) { skip++; } - for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); } - printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -else - if $shortest; then - # Select $numutt shortest utterances. - . ./path.sh - feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; - sort -n -k2 $destdir/tmp.len | - awk '{print $1}' | - head -$numutt >$destdir/tmp.uttlist - tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk - rm $destdir/tmp.uttlist $destdir/tmp.len - else - # Select $numutt random utterances. - tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; - fi - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt -fi - -# Perform filtering. utt2spk and spk2utt files already exist by this point. -# Filter by utterance.
-[ -f $srcdir/feats.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp -[ -f $srcdir/vad.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp -[ -f $srcdir/utt2lang ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang -[ -f $srcdir/utt2dur ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur -[ -f $srcdir/utt2num_frames ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames -[ -f $srcdir/utt2uniq ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq -[ -f $srcdir/wav.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp -[ -f $srcdir/utt2warp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp -[ -f $srcdir/text ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - -# Filter by speaker. -[ -f $srcdir/spk2warp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp -[ -f $srcdir/spk2gender ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender -[ -f $srcdir/cmvn.scp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - -# Filter by recording-id. -if [ -f $srcdir/segments ]; then - tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - # Recording-ids are in segments. - awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco - # The next line overrides the command above for wav.scp, which would be incorrect. - #[ -f $srcdir/wav.scp ] && - # tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp -else - # No segments; recording-ids are in wav.scp. - awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco -fi - -[ -f $srcdir/reco2file_and_channel ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel -[ -f $srcdir/reco2dur ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur - -# Filter the STM file for proper sclite scoring. -# Copy over the comments from STM file. -[ -f $srcdir/stm ] && - (grep "^;;" $srcdir/stm - tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm - -rm $destdir/reco - -# Copy frame_shift if present. -[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir - -srcutts=$(wc -l <$srcdir/utt2spk) -destutts=$(wc -l <$destdir/utt2spk) -echo "$0: reducing #utt from $srcutts to $destutts" -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/subset_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/subset_scp.pl deleted file mode 100644 index 11fddc09a0f4e5fad8e5d63cf65e7e5e627e4af6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/subset_scp.pl +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This program selects a subset of N elements in the scp. - -# By default, it selects them evenly from throughout the scp, in order to avoid -# selecting too many from the same speaker. It prints them on the standard -# output. -# With the option --first, it just selects the N first utterances. -# With the option --last, it just selects the N last utterances. - -# Last modified by JHU & HKUST @2013 - - -$quiet = 0; -$first = 0; -$last = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--quiet") { - shift; - $quiet = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--first") { - shift; - $first = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--last") { - shift; - $last = 1; -} - -if(@ARGV < 2 ) { - die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . - " --quiet causes it to not die if N < num lines in scp.\n" . - " --first and --last make it equivalent to head or tail.\n" . - "See also: filter_scp.pl\n"; -} - -$N = shift @ARGV; -if($N == 0) { - die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; -} -$inscp = shift @ARGV; -open(I, "<$inscp") || die "Opening input scp file $inscp"; - -@F = (); -while() { - push @F, $_; -} -$numlines = @F; -if($N > $numlines) { - if ($quiet) { - $N = $numlines; - } else { - die "You requested from subset_scp.pl more elements than available: $N > $numlines"; - } -} - -sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if ($num_needed > $diff) { - die "select_n: code error"; - } - if ($diff == 1 ) { - if ($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); - } -} - -if ( ! $first && ! $last) { - if ($N > 0) { - select_n(0, $numlines, $N); - } -} else { - if ($first) { # --first option: same as head. - for ($n = 0; $n < $N; $n++) { - print $F[$n]; - } - } else { # --last option: same as tail. - for ($n = @F - $N; $n < @F; $n++) { - print $F[$n]; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/sym2int.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/sym2int.pl deleted file mode 100644 index cec097b6bdaefb5c3452e31fa334f0a7530b9a72..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/sym2int.pl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; - -for($x = 0; $x < 2; $x++) { - if ($ARGV[0] eq "--map-oov") { - shift @ARGV; - $map_oov = shift @ARGV; - if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { - # disallow '-f', the empty string and anything ending in words.txt as the - # OOV symbol because these are likely command-line errors. - die "the --map-oov option requires an argument"; - } - } - if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } - } -} - -$symtab = shift @ARGV; -if (!defined $symtab) { - print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . - "options: [--map-oov ] [-f ]\n" . - "note: can look like 4-5, or 4-, or 5-, or 1.\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up - if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } - $map_oov = $sym2int{$map_oov}; -} - -$num_warning = 0; -$max_warning = 20; - -while (<>) { - @A = split(" ", $_); - @B = (); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ( (!defined $field_begin || $n >= $field_begin) - && (!defined $field_end || $n <= $field_end)) { - $i = $sym2int{$a}; - if (!defined ($i)) { - if (defined $map_oov) { - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $map_oov; - } else { - $pos = $n+1; - die "sym2int.pl: undefined symbol $a (in position $pos)\n"; - } - } - $a = $i; - } - push @B, $a; - } - print join(" ", @B); - print "\n"; -} -if ($num_warning > 0) { - print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; -} - -exit(0); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/text2token.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/text2token.py deleted file mode 100644 index 4f4dcc901d436650695f0b80e0cf99e1e99269ee..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/text2token.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan) -# Copyright 2021 Mobvoi Inc. All Rights Reserved. 
(Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -import re -import sys - -is_python2 = sys.version_info[0] == 2 - - -def exist_or_not(i, match_pos): - start_pos = None - end_pos = None - for pos in match_pos: - if pos[0] <= i < pos[1]: - start_pos = pos[0] - end_pos = pos[1] - break - - return start_pos, end_pos - -def seg_char(sent): - pattern = re.compile(r'([\u4e00-\u9fa5])') - chars = pattern.split(sent) - chars = [w for w in chars if len(w.strip()) > 0] - return chars - -def get_parser(): - parser = argparse.ArgumentParser( - description='convert raw text to tokenized text', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--nchar', - '-n', - default=1, - type=int, - help='number of characters to split, i.e., \ - aabb -> a a b b with -n 1 and aa bb with -n 2') - parser.add_argument('--skip-ncols', - '-s', - default=0, - type=int, - help='skip first n columns') - parser.add_argument('--space', - default='', - type=str, - help='space symbol') - parser.add_argument('--bpe-model', - '-m', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--non-lang-syms', - '-l', - default=None, - type=str, - help='list of non-linguistic symobles,' - ' e.g., etc.') - parser.add_argument('text', - type=str, - default=False, - nargs='?', - help='input text') - parser.add_argument('--trans_type', - '-t', - type=str, - default="char", - choices=["char", "phn", "cn_char_en_bpe"], - help="""Transcript type. char/phn. e.g., for TIMIT - FADG0_SI1279 - - If trans_type is char, read from - SI1279.WRD file -> "bricks are an alternative" - Else if trans_type is phn, - read from SI1279.PHN file -> - "sil b r ih sil k s aa r er n aa l - sil t er n ih sil t ih v sil" """) - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - rs = [] - if args.non_lang_syms is not None: - with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f: - nls = [x.rstrip() for x in f.readlines()] - rs = [re.compile(re.escape(x)) for x in nls] - - if args.bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(args.bpe_model) - - if args.text: - f = codecs.open(args.text, encoding="utf-8") - else: - f = codecs.getreader("utf-8")( - sys.stdin if is_python2 else sys.stdin.buffer) - - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer) - line = f.readline() - n = args.nchar - while line: - x = line.split() - print(' '.join(x[:args.skip_ncols]), end=" ") - a = ' '.join(x[args.skip_ncols:]) - - # get all matched positions - match_pos = [] - for r in rs: - i = 0 - while i >= 0: - m = r.search(a, i) - if m: - match_pos.append([m.start(), m.end()]) - i = m.end() - else: - break - - if len(match_pos) > 0: - chars = [] - i = 0 - while i < len(a): - start_pos, end_pos = exist_or_not(i, match_pos) - if start_pos is not None: - chars.append(a[start_pos:end_pos]) - i = end_pos - else: - chars.append(a[i]) - i += 1 - a = chars - - if (args.trans_type == "phn"): - a = a.split(" ") - elif args.trans_type == "cn_char_en_bpe": - b = seg_char(a) - a = [] - for j in b: - # we use "▁" to instead of blanks among english words - # warning: here is "▁", not "_" - for l in j.strip().split("▁"): - if not l.encode('UTF-8').isalpha(): - a.append(l) - else: - for k in sp.encode_as_pieces(l): - a.append(k) - else: - a = [a[j:j + n] for j in range(0, 
len(a), n)] - - a_flat = [] - for z in a: - a_flat.append("".join(z)) - - a_chars = [z.replace(' ', args.space) for z in a_flat] - if (args.trans_type == "phn"): - a_chars = [z.replace("sil", args.space) for z in a_chars] - print(' '.join(a_chars)) - line = f.readline() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/utt2spk_to_spk2utt.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/utt2spk_to_spk2utt.pl deleted file mode 100644 index 5086699ff85fdcb8667bb9ab054700c53e35fd0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. - -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/validate_data_dir.sh deleted file mode 100644 index f4b4cbe1410111555d56380078e3d55381e7155a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/validate_data_dir.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/bin/bash - -cmd="$@" - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." - echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! 
-d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -if [ -f $data/images.scp ]; then - cmd=${cmd/--no-wav/} # remove --no-wav if supplied - image/validate_data_dir.sh $cmd - exit $? -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1; - ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - tools/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." 
- exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! 
cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! 
cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/validate_dict_dir.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/validate_dict_dir.pl deleted file mode 100644 index 819fca7f03caff91f3f24f0b69876a0bfc0abbe9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/validate_dict_dir.pl +++ /dev/null @@ -1,531 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. -# Copyright 2012 Guoguo Chen -# 2015 Daniel Povey -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for 'dict' directories (e.g. data/local/dict) - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line($.) 
$raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n"; - if ($has_invalid_whitespaces) { - print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n"; - return 0; - } else { - print "--> text contains only allowed whitespaces\n"; - } - } else { - print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n"; - } - return 1; -} - - -if(@ARGV != 1) { - die "Usage: validate_dict_dir.pl \n" . - "e.g.: validate_dict_dir.pl data/local/dict\n"; -} - -$dict = shift @ARGV; -$dict =~ s:/$::; - -$exit = 0; -$success = 1; # this is re-set each time we read a file. - -sub set_to_fail { $exit = 1; $success = 0; } - -# Checking silence_phones.txt ------------------------------- -print "Checking $dict/silence_phones.txt ...\n"; -if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} -$idx = 1; -%silence = (); -$crlf = 1; - -print "--> reading $dict/silence_phones.txt\n"; -check_allowed_whitespace(\*S) || set_to_fail(); -while() { - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; - } - foreach(0 .. 
@col-1) { - my $p = $col[$_]; - if($silence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; - } else { - $silence{$p} = 1; - } - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. - if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(S); -$success == 0 || print "--> $dict/silence_phones.txt is OK\n"; -print "\n"; - -# Checking optional_silence.txt ------------------------------- -print "Checking $dict/optional_silence.txt ...\n"; -if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;} -if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;} -$idx = 1; -$success = 1; -$crlf = 1; -print "--> reading $dict/optional_silence.txt\n"; -check_allowed_whitespace(\*OS) or exit 1; -while() { - chomp; - my @col = split(" ", $_); - if ($idx > 1 or @col > 1) { - set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; - } elsif (!$silence{$col[0]}) { - set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - $idx ++; -} -close(OS); -$success == 0 || print "--> $dict/optional_silence.txt is OK\n"; -print "\n"; - -# Checking nonsilence_phones.txt ------------------------------- -print "Checking $dict/nonsilence_phones.txt ...\n"; -if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;} -$idx = 1; -%nonsilence = (); -$success = 1; -$crlf = 1; -print "--> reading $dict/nonsilence_phones.txt\n"; -check_allowed_whitespace(\*NS) or set_to_fail(); -while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($nonsilence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; - } else { - $nonsilence{$p} = 1; - } - # phones that start with the pound sign/hash may be mistaken for - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. 
- if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(NS); -$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n"; -print "\n"; - -# Checking disjoint ------------------------------- -sub intersect { - my ($a, $b) = @_; - @itset = (); - %itset = (); - foreach(keys %$a) { - if(exists $b->{$_} and !$itset{$_}) { - push(@itset, $_); - $itset{$_} = 1; - } - } - return @itset; -} - -print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n"; -@itset = intersect(\%silence, \%nonsilence); -if(@itset == 0) {print "--> disjoint property is OK.\n";} -else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";} -print "\n"; - - -sub check_lexicon { - my ($lex, $num_prob_cols, $num_skipped_cols) = @_; - print "Checking $lex\n"; - !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail(); - my %seen_line = {}; - $idx = 1; $success = 1; $crlf = 1; - print "--> reading $lex\n"; - check_allowed_whitespace(\*L) or set_to_fail(); - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $lex contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (defined $seen_line{$_}) { - print "--> ERROR: line '$_' of $lex is repeated\n"; - set_to_fail(); - } - $seen_line{$_} = 1; - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $lex does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - $word = shift @col; - if (!defined $word) { - print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail(); - } - if ($word eq "" || $word eq "" || $word eq "" || $word eq "#0") { - print "--> ERROR: lexicon.txt contains forbidden word $word\n"; - set_to_fail(); - } - for ($n = 0; $n < $num_prob_cols; $n++) { - $prob = shift @col; - if (!($prob > 0.0 && $prob <= 1.0)) { - print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n"; - set_to_fail(); - } - } - for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; } - if (@col == 0) { - print "--> ERROR: lexicon.txt contains word $word with empty "; - print "pronunciation.\n"; - set_to_fail(); - } - foreach (0 .. @col-1) { - if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt "; - print "(line $idx)\n"; - set_to_fail(); - } - } - $idx ++; - } - close(L); - $success == 0 || print "--> $lex is OK\n"; - print "\n"; -} - -if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); } -if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); } -if (-f "$dict/lexiconp_silprob.txt") { - # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also - # exist. 
- check_lexicon("$dict/lexiconp_silprob.txt", 2, 2); - if (-f "$dict/silprob.txt") { - !open(SP, "<$dict/silprob.txt") && - print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail(); - $crlf = 1; - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - chomp; my @col = split; - @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail(); - if ($col[0] eq "" || $col[0] eq "overall") { - if (!($col[1] > 0.0 && $col[1] <= 1.0)) { - set_to_fail(); - print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n"; - } - } elsif ($col[0] eq "_s" || $col[0] eq "_n") { - if ($col[1] <= 0.0) { - set_to_fail(); - print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n"; - } - } else { - print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n"; - set_to_fail(); - } - } - close(SP); - } else { - set_to_fail(); - print "--> ERROR: expecting $dict/silprob.txt to exist\n"; - } -} - -if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) { - print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n"; - set_to_fail(); -} - -sub check_lexicon_pair { - my ($lex1, $num_prob_cols1, $num_skipped_cols1, - $lex2, $num_prob_cols2, $num_skipped_cols2) = @_; - # We have checked individual lexicons already. - open(L1, "<$lex1"); open(L2, "<$lex2"); - print "Checking lexicon pair $lex1 and $lex2\n"; - my $line_num = 0; - while() { - $line_num++; - @A = split; - $line_B = ; - if (!defined $line_B) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); last; - } - @B = split(" ", $line_B); - # Check if the word matches. - if ($A[0] ne $B[0]) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - shift @A; shift @B; - for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; } - for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; } - # Check if the pronunciation matches - if (join(" ", @A) ne join(" ", @B)) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - } - $line_B = ; - if (defined $line_B && $exit == 0) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); - } - $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n"; -} - -# If more than one lexicon exist, we have to check if they correspond to each -# other. It could be that the user overwrote one and we need to regenerate the -# other, but we do not know which is which. -if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") { - check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0); -} -if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") { - check_lexicon_pair("$dict/lexiconp.txt", 1, 0, - "$dict/lexiconp_silprob.txt", 2, 2); -} - -# Checking extra_questions.txt ------------------------------- -%distinguished = (); # Keep track of all phone-pairs including nonsilence that - # are distinguished (split apart) by extra_questions.txt, - # as $distinguished{$p1,$p2} = 1. This will be used to - # make sure that we don't have pairs of phones on the same - # line in nonsilence_phones.txt that can never be - # distinguished from each other by questions. 
(If any two - # phones appear on the same line in nonsilence_phones.txt, - # they share a tree root, and since the automatic - # question-building treats all phones that appear on the - # same line of nonsilence_phones.txt as being in the same - # group, we can never distinguish them without resorting to - # questions in extra_questions.txt. -print "Checking $dict/extra_questions.txt ...\n"; -if (-s "$dict/extra_questions.txt") { - if (!open(EX, "<$dict/extra_questions.txt")) { - set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n"; - } - $idx = 1; - $success = 1; - $crlf = 1; - print "--> reading $dict/extra_questions.txt\n"; - check_allowed_whitespace(\*EX) or set_to_fail(); - while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n"; - } - foreach (0 .. @col-1) { - if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n"; - } - $idx ++; - } - %col_hash = (); - foreach $p (@col) { $col_hash{$p} = 1; } - foreach $p1 (@col) { - # Update %distinguished hash. - foreach $p2 (keys %nonsilence) { - if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not - # in this question (and in nonsilence - # phones)... mark p1,p2 as being split apart - $distinguished{$p1,$p2} = 1; - $distinguished{$p2,$p1} = 1; - } - } - } - } - close(EX); - $success == 0 || print "--> $dict/extra_questions.txt is OK\n"; -} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";} - -if (-f "$dict/nonterminals.txt") { - open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt"; - my %nonterminals = (); - my $line_number = 1; - while () { - chop; - my @line = split(" ", $_); - if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) { - print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1; - } - $nonterminals{$line[0]} = 1; - $line_number++; - } - print "--> $dict/nonterminals.txt is OK\n"; -} - - -# check nonsilence_phones.txt again for phone-pairs that are never -# distnguishable. (note: this situation is normal and expected for silence -# phones, so we don't check it.) -if(!open(NS, "<$dict/nonsilence_phones.txt")) { - print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1; -} - -$num_warn_nosplit = 0; -$num_warn_nosplit_limit = 10; -while() { - my @col = split(" ", $_); - foreach $p1 (@col) { - foreach $p2 (@col) { - if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) { - set_to_fail(); - if ($num_warn_nosplit <= $num_warn_nosplit_limit) { - print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n"; - } - if ($num_warn_nosplit == $num_warn_nosplit_limit) { - print "... Not warning any more times about this issue.\n"; - } - if ($num_warn_nosplit == 0) { - print " (note: we started checking for this only recently. 
You can still build a system but\n"; - print " phones $p1 and $p2 will be acoustically indistinguishable).\n"; - } - $num_warn_nosplit++; - } - } - } -} - - -if ($exit == 1) { - print "--> ERROR validating dictionary directory $dict (see detailed error "; - print "messages above)\n\n"; - exit 1; -} else { - print "--> SUCCESS [validating dictionary directory $dict]\n\n"; -} - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/validate_text.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/validate_text.pl deleted file mode 100644 index 7f75cf12f20f6e22948682e8e726e628a72dac69..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/validate_text.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env perl -# -#=============================================================================== -# Copyright 2017 Johns Hopkins University (author: Yenda Trmal ) -# Johns Hopkins University (author: Daniel Povey) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# validation script for data//text -# to be called (preferably) from utils/validate_data_dir.sh -use strict; -use warnings; -use utf8; -use Fcntl qw< SEEK_SET >; - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). 
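# A minimal Python rendering of the UTF-8-or-bytestream detection described in
# the comment above, for illustration only (the deleted script below implements
# the same idea in Perl): decode every line strictly as UTF-8 and, on the first
# failure, fall back to treating the input as raw single-byte text.
def utf8_or_bytestream(lines_bytes):
    decoded = []
    for raw in lines_bytes:
        try:
            decoded.append(raw.decode('utf-8', errors='strict'))
        except UnicodeDecodeError:
            return False, lines_bytes   # not UTF-8: keep the raw byte lines
    return True, decoded                # UTF-8/ASCII: use the decoded lines

# e.g. is_utf, lines = utf8_or_bytestream(open('data/train/text', 'rb').readlines())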
-sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl \n" . 
- "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/wav2dur.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/wav2dur.py deleted file mode 100644 index 1bcc1b693458b66c0e341e5d6b375cc81e6db8b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/wav2dur.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -import torchaudio -torchaudio.set_audio_backend("sox_io") - -scp = sys.argv[1] -dur_scp = sys.argv[2] - -with open(scp, 'r') as f, open(dur_scp, 'w') as fout: - cnt = 0 - total_duration = 0 - for l in f: - items = l.strip().split() - wav_id = items[0] - fname = items[1] - cnt += 1 - waveform, rate = torchaudio.load(fname) - frames = len(waveform[0]) - duration = frames / float(rate) - total_duration += duration - fout.write('{} {}\n'.format(wav_id, duration)) - print('process {} utts'.format(cnt)) - print('total {} s'.format(total_duration)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/wav_to_duration.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/wav_to_duration.sh deleted file mode 100644 index 51b055c633ac809b6b8d702925dc47875973403d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/wav_to_duration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# split the wav scp, calculate duration and merge -nj=4 -. tools/parse_options.sh || exit 1; - -inscp=$1 -outscp=$2 -data=$(dirname ${inscp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -rm -f $logdir/wav_*.slice -rm -f $logdir/wav_*.shape -split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log -} & -done -wait -cat $logdir/wav_*.shape > $outscp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/websocket/performance-ws.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/websocket/performance-ws.py deleted file mode 100644 index af77dea06bb41297b674b5b6dbfd0266bcff5d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/tools/websocket/performance-ws.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -# coding:utf-8 - -# Copyright (c) 2022 SDCI Co. Ltd (author: veelion) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import json -import time -import asyncio -import argparse -import websockets -import soundfile as sf -import statistics - - -WS_START = json.dumps({ - 'signal': 'start', - 'nbest': 1, - 'continuous_decoding': False, -}) -WS_END = json.dumps({ - 'signal': 'end' -}) - - -async def ws_rec(data, ws_uri): - begin = time.time() - conn = await websockets.connect(ws_uri, ping_timeout=200) - # step 1: send start - await conn.send(WS_START) - ret = await conn.recv() - # step 2: send audio data - await conn.send(data) - # step 3: send end - await conn.send(WS_END) - # step 4: receive result - texts = [] - while 1: - ret = await conn.recv() - ret = json.loads(ret) - if ret['type'] == 'final_result': - nbest = json.loads(ret['nbest']) - text = nbest[0]['sentence'] - texts.append(text) - elif ret['type'] == 'speech_end': - break - # step 5: close - try: - await conn.close() - except Exception as e: - # this except has no effect, just log as debug - # it seems the server does not send close info, maybe - print(e) - time_cost = time.time() - begin - return { - 'text': ''.join(texts), - 'time': time_cost, - } - - -def get_args(): - parser = argparse.ArgumentParser(description='') - parser.add_argument( - '-u', '--ws_uri', required=True, - help="websocket_server_main's uri, e.g. ws://127.0.0.1:10086") - parser.add_argument( - '-w', '--wav_scp', required=True, - help='path to wav_scp_file') - parser.add_argument( - '-t', '--trans', required=True, - help='path to trans_text_file of wavs') - parser.add_argument( - '-s', '--save_to', required=True, - help='path to save transcription') - parser.add_argument( - '-n', '--num_concurrence', type=int, required=True, - help='num of concurrence for query') - args = parser.parse_args() - return args - - -def print_result(info): - length = max([len(k) for k in info]) - for k, v in info.items(): - print(f'\t{k: >{length}} : {v}') - - -async def main(args): - wav_scp = [] - total_duration = 0 - with open(args.wav_scp) as f: - for line in f: - zz = line.strip().split() - assert len(zz) == 2 - data, sr = sf.read(zz[1], dtype='int16') - assert sr == 16000 - duration = (len(data)) / 16000 - total_duration += duration - wav_scp.append((zz[0], data.tobytes())) - print(f'{len(wav_scp) = }, {total_duration = }') - - tasks = [] - failed = 0 - texts = [] - request_times = [] - begin = time.time() - for i, (_uttid, data) in enumerate(wav_scp): - task = asyncio.create_task(ws_rec(data, args.ws_uri)) - tasks.append((_uttid, task)) - if len(tasks) < args.num_concurrence: - continue - print((f'{i=}, start {args.num_concurrence} ' - f'queries @ {time.strftime("%m-%d %H:%M:%S")}')) - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - tasks = [] - print(f'\tdone @ {time.strftime("%m-%d %H:%M:%S")}') - if tasks: - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - request_time = time.time() - begin - rtf = request_time / total_duration - print('For all concurrence:') - print_result({ - 'failed': failed, - 'total_duration': total_duration, - 'request_time': request_time, - 'RTF': rtf, - }) - print('For one request:') - print_result({ - 'mean': statistics.mean(request_times), - 'median': statistics.median(request_times), - 'max_time': max(request_times), - 'min_time': min(request_times), - }) - with open(args.save_to, 'w', encoding='utf8') as fsave: - fsave.write(''.join(texts)) - # caculate CER - cmd = (f'python 
../compute-wer.py --char=1 --v=1 ' - f'{args.trans} {args.save_to} > ' - f'{args.save_to}-test-{args.num_concurrence}.cer.txt') - print(cmd) - os.system(cmd) - print('done') - - -if __name__ == '__main__': - args = get_args() - asyncio.run(main(args)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/alignment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/alignment.py deleted file mode 100644 index 071691183e5af227e60fe06e4f8d4bf0f33b7f71..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/alignment.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Di Wu) -# 2022 Tinnove Inc (authors: Wei Ren) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader -from textgrid import TextGrid, IntervalTier - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.ctc_util import forced_align -from wenet.utils.common import get_subsample -from wenet.utils.init_model import init_model - - -def generator_textgrid(maxtime, lines, output): - # Download Praat: https://www.fon.hum.uva.nl/praat/ - interval = maxtime / (len(lines) + 1) - margin = 0.0001 - - tg = TextGrid(maxTime=maxtime) - linetier = IntervalTier(name="line", maxTime=maxtime) - - i = 0 - for l in lines: - s, e, w = l.split() - linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w) - - tg.append(linetier) - print("successfully generator {}".format(output)) - tg.write(output) - - -def get_frames_timestamp(alignment): - # convert alignment to a praat format, which is a doing phonetics - # by computer and helps analyzing alignment - timestamp = [] - # get frames level duration for each token - start = 0 - end = 0 - while end < len(alignment): - while end < len(alignment) and alignment[end] == 0: - end += 1 - if end == len(alignment): - timestamp[-1] += alignment[start:] - break - end += 1 - while end < len(alignment) and alignment[end - 1] == alignment[end]: - end += 1 - timestamp.append(alignment[start:end]) - start = end - return timestamp - - -def get_labformat(timestamp, subsample): - begin = 0 - duration = 0 - labformat = [] - for idx, t in enumerate(timestamp): - # 25ms frame_length,10ms hop_length, 1/subsample - subsample = get_subsample(configs) - # time duration - duration = len(t) * 0.01 * subsample - if idx < len(timestamp) - 1: - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[t[-1]])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[t[-1]])) - else: - non_blank = 0 - for i in t: - if i != 0: - token = i - break - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - 
char_dict[token])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[token])) - begin = begin + duration - return labformat - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='use ctc to generate alignment') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--input_file', required=True, help='format data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--non_lang_syms', - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--result_file', - required=True, - help='alignment result file') - parser.add_argument('--batch_size', type=int, default=1, help='batch size') - parser.add_argument('--gen_praat', - action='store_true', - help='convert alignment to a praat format') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - - args = parser.parse_args() - print(args) - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.batch_size > 1: - logging.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - # Load dict - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 - - symbol_table = read_symbol_table(args.dict) - - # Init dataset and data loader - ali_conf = copy.deepcopy(configs['dataset_conf']) - - ali_conf['filter_conf']['max_length'] = 102400 - ali_conf['filter_conf']['min_length'] = 0 - ali_conf['filter_conf']['token_max_length'] = 102400 - ali_conf['filter_conf']['token_min_length'] = 0 - ali_conf['filter_conf']['max_output_input_ratio'] = 102400 - ali_conf['filter_conf']['min_output_input_ratio'] = 0 - ali_conf['speed_perturb'] = False - ali_conf['spec_aug'] = False - ali_conf['shuffle'] = False - ali_conf['sort'] = False - ali_conf['fbank_conf']['dither'] = 0.0 - ali_conf['batch_conf']['batch_type'] = "static" - ali_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - ali_dataset = Dataset(args.data_type, - args.input_file, - symbol_table, - ali_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w', - encoding='utf-8') as fout: - for batch_idx, batch in enumerate(ali_data_loader): - print("#" * 80) - key, feat, target, feats_length, target_length = batch - print(key) - - feat = feat.to(device) - target = target.to(device) - feats_length = feats_length.to(device) - target_length = target_length.to(device) - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - # print(ctc_probs.size(1)) - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = forced_align(ctc_probs, target) - print(alignment) - fout.write('{} {}\n'.format(key[0], alignment)) - - if args.gen_praat: - timestamp = get_frames_timestamp(alignment) - print(timestamp) - subsample = get_subsample(configs) - labformat = get_labformat(timestamp, subsample) - - lab_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".lab") - with open(lab_path, 'w', encoding='utf-8') as f: - f.writelines(labformat) - - textgrid_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".TextGrid") - generator_textgrid(maxtime=(len(alignment) + 1) * 0.01 * - subsample, - lines=labformat, - output=textgrid_path) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/average_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/average_model.py deleted file mode 100644 index 01efa64b4b458bc931a86a9a304b9f330ce4aaa2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/average_model.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
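# A minimal sketch of the checkpoint averaging performed by the script below,
# assuming each checkpoint file is a plain state_dict of tensors; the paths in
# the usage line are illustrative.
import torch

def average_state_dicts(paths):
    avg = None
    for p in paths:
        states = torch.load(p, map_location='cpu')
        if avg is None:
            avg = states
        else:
            for k in avg:
                avg[k] += states[k]
    for k in avg:
        # use true_divide (as the script below does) so integer buffers such as
        # BatchNorm counters are divided without in-place integer-division errors
        avg[k] = torch.true_divide(avg[k], len(paths))
    return avg

# e.g. torch.save(average_state_dicts(['10.pt', '11.pt', '12.pt']), 'avg_3.pt')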
- - -import os -import argparse -import glob - -import yaml -import numpy as np -import torch - - -def get_args(): - parser = argparse.ArgumentParser(description='average model') - parser.add_argument('--dst_model', required=True, help='averaged model') - parser.add_argument('--src_path', - required=True, - help='src model path for average') - parser.add_argument('--val_best', - action="store_true", - help='averaged model') - parser.add_argument('--num', - default=5, - type=int, - help='nums for averaged model') - parser.add_argument('--min_epoch', - default=0, - type=int, - help='min epoch used for averaging model') - parser.add_argument('--max_epoch', - default=65536, - type=int, - help='max epoch used for averaging model') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - checkpoints = [] - val_scores = [] - if args.val_best: - yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) - for y in yamls: - with open(y, 'r') as f: - dic_yaml = yaml.load(f, Loader=yaml.FullLoader) - loss = dic_yaml['cv_loss'] - epoch = dic_yaml['epoch'] - if epoch >= args.min_epoch and epoch <= args.max_epoch: - val_scores += [[epoch, loss]] - val_scores = np.array(val_scores) - sort_idx = np.argsort(val_scores[:, -1]) - sorted_val_scores = val_scores[sort_idx][::1] - print("best val scores = " + str(sorted_val_scores[:args.num, 1])) - print("selected epochs = " + - str(sorted_val_scores[:args.num, 0].astype(np.int64))) - path_list = [ - args.src_path + '/{}.pt'.format(int(epoch)) - for epoch in sorted_val_scores[:args.num, 0] - ] - else: - path_list = glob.glob('{}/[0-9]*.pt'.format(args.src_path)) - path_list = sorted(path_list, key=os.path.getmtime) - path_list = path_list[-args.num:] - print(path_list) - avg = None - num = args.num - assert num == len(path_list) - for path in path_list: - print('Processing {}'.format(path)) - states = torch.load(path, map_location=torch.device('cpu')) - if avg is None: - avg = states - else: - for k in avg.keys(): - avg[k] += states[k] - # average - for k in avg.keys(): - if avg[k] is not None: - # pytorch 1.6 use true_divide instead of /= - avg[k] = torch.true_divide(avg[k], num) - print('Saving to {}'.format(args.dst_model)) - torch.save(avg, args.dst_model) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/export_jit.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/export_jit.py deleted file mode 100644 index b2e5864e8382235c1cc800484ba5031ae22f3bd9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/export_jit.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
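# A minimal sketch of the TorchScript export performed by the script below,
# assuming a scriptable torch.nn.Module; the function and file names are
# illustrative placeholders.
import torch

def export_jit(model: torch.nn.Module, output_file: str, quant_file: str = None):
    # script (rather than trace) so data-dependent control flow in the model is kept
    script_model = torch.jit.script(model)
    script_model.save(output_file)
    if quant_file is not None:
        # dynamic int8 quantization of Linear layers, then script and save
        quantized = torch.quantization.quantize_dynamic(
            model, {torch.nn.Linear}, dtype=torch.qint8)
        torch.jit.script(quantized).save(quant_file)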
- -from __future__ import print_function - -import argparse -import os - -import torch -import yaml - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_file', default=None, help='output file') - parser.add_argument('--output_quant_file', - default=None, - help='output quantized model file') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - # No need gpu for model export - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - model = init_model(configs) - print(model) - - load_checkpoint(model, args.checkpoint) - # Export jit torch script model - - if args.output_file: - script_model = torch.jit.script(model) - script_model.save(args.output_file) - print('Export model successfully, see {}'.format(args.output_file)) - - # Export quantized jit torch script model - if args.output_quant_file: - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - script_quant_model = torch.jit.script(quantized_model) - script_quant_model.save(args.output_quant_file) - print('Export quantized model successfully, ' - 'see {}'.format(args.output_quant_file)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/export_onnx_bpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/export_onnx_bpu.py deleted file mode 100644 index 6462a69506f10778d08faae5fcf3067ad43d38bd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/export_onnx_bpu.py +++ /dev/null @@ -1,1019 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. 
specific decoding method: ctc_greedy_search -""" - - -from __future__ import print_function - -import os -import sys -import copy -import math -import yaml -import logging -from typing import Tuple - -import torch -import numpy as np - -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import (get_args, to_numpy, - print_input_output_info) - - -try: - import onnx - import onnxruntime -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class BPULayerNorm(torch.nn.Module): - """Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" - def __init__(self, module, chunk_size=8, run_on_bpu=False): - super().__init__() - original = copy.deepcopy(module) - self.hidden = module.weight.size(0) - self.chunk_size = chunk_size - self.run_on_bpu = run_on_bpu - - if self.run_on_bpu: - self.weight = torch.nn.Parameter( - module.weight.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.bias = torch.nn.Parameter( - module.bias.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.negtive = torch.nn.Parameter( - torch.ones((1, self.hidden, 1, chunk_size)) * -1.0) - self.eps = torch.nn.Parameter( - torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps) - self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_1.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_2.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - else: - self.norm = module - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.hidden) - orig_out = module(random_data) - new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) - np.testing.assert_allclose( - to_numpy(orig_out), to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.run_on_bpu: - u = self.mean_conv_1(x) # (1, h, 1, c) - numerator = x + u * self.negtive # (1, h, 1, c) - s = torch.pow(numerator, 2) # (1, h, 1, c) - s = self.mean_conv_2(s) # (1, h, 1, c) - denominator = torch.sqrt(s + self.eps) # (1, h, 1, c) - x = torch.div(numerator, denominator) # (1, h, 1, c) - x = x * self.weight + self.bias - else: - x = x.squeeze(2).transpose(1, 2).contiguous() - x = self.norm(x) - x = x.transpose(1, 2).contiguous().unsqueeze(2) - return x - - -class BPUIdentity(torch.nn.Module): - """Refactor torch.nn.Identity(). - For inserting BPU node whose input == output. - """ - def __init__(self, channels): - super().__init__() - self.channels = channels - self.identity_conv = torch.nn.Conv2d( - channels, channels, 1, groups=channels, bias=False) - torch.nn.init.dirac_( - self.identity_conv.weight.data, groups=channels) - - self.check_equal() - - def check_equal(self): - random_data = torch.randn(1, self.channels, 1, 10) - result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(random_data), to_numpy(result), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Identity with 4-D dataflow, input == output. 
- Args: - x (torch.Tensor): (batch, in_channel, 1, time) - - Returns: - (torch.Tensor): (batch, in_channel, 1, time). - """ - return self.identity_conv(x) - - -class BPULinear(torch.nn.Module): - """Refactor torch.nn.Linear or pointwise_conv""" - def __init__(self, module, is_pointwise_conv=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.weight.size(1) - self.odim = module.weight.size(0) - self.is_pointwise_conv = is_pointwise_conv - - # Modify weight & bias - self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) - if is_pointwise_conv: - # (odim, idim, kernel=1) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(-1)) - else: - # (odim, idim) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(2).unsqueeze(3)) - self.linear.bias = module.bias - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.idim) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = module(random_data) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = original_result.transpose(1, 2) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Linear with 4-D dataflow. - Args: - x (torch.Tensor): (batch, in_channel, 1, time) - Returns: - (torch.Tensor): (batch, out_channel, 1, time). - """ - return self.linear(x) - - -class BPUGlobalCMVN(torch.nn.Module): - """Refactor wenet/transformer/cmvn.py::GlobalCMVN""" - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - self.norm_var = module.norm_var - - # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1) - self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """CMVN with 4-D dataflow. - Args: - x (torch.Tensor): (batch, 1, mel_dim, time) - Returns: - (torch.Tensor): normalized feature with same shape. - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x - - -class BPUConv2dSubsampling8(torch.nn.Module): - """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 - - NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.right_context = module.right_context - self.subsampling_rate = module.subsampling_rate - assert isinstance(module.pos_enc, NoPositionalEncoding) - - # 1. Modify self.conv - # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim) - # to (1, 1, mel_dim, frames) for more efficient computation. - self.conv = module.conv - for idx in [0, 2, 4]: - self.conv[idx].weight = torch.nn.Parameter( - module.conv[idx].weight.transpose(2, 3) - ) - - # 2. 
Modify self.linear - # NOTE(xcsong): Split final projection to meet the requirment of - # maximum kernel_size (7 for XJ3) - self.linear = torch.nn.ModuleList() - odim = module.linear.weight.size(0) # 512, in this case - freq = module.linear.weight.size(1) // odim # 4608 // 512 == 9 - self.odim, self.freq = odim, freq - weight = module.linear.weight.reshape( - odim, odim, freq, 1) # (odim, odim * freq) -> (odim, odim, freq, 1) - self.split_size = [] - num_split = (freq - 1) // 7 + 1 # XJ3 requires kernel_size <= 7 - slice_begin = 0 - for idx in range(num_split): - kernel_size = min(freq, (idx + 1) * 7) - idx * 7 - conv_ele = torch.nn.Conv2d( - odim, odim, (kernel_size, 1), (kernel_size, 1)) - conv_ele.weight = torch.nn.Parameter( - weight[:, :, slice_begin:slice_begin + kernel_size, :] - ) - conv_ele.bias = torch.nn.Parameter( - torch.zeros_like(conv_ele.bias) - ) - self.linear.append(conv_ele) - self.split_size.append(kernel_size) - slice_begin += kernel_size - self.linear[0].bias = torch.nn.Parameter(module.linear.bias) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 67, 80) - mask = torch.zeros(1, 1, 67) - original_result, _, _ = module(random_data, mask) # (1, 8, 512) - random_data = random_data.transpose(1, 2).unsqueeze(0) # (1, 1, 80, 67) - new_result = self.forward(random_data) # (1, 512, 1, 8) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Subsample x with 4-D dataflow. - Args: - x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), - where time' = time // 8. - """ - x = self.conv(x) # (1, odim, freq, time') - x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) - x = torch.split(x, self.split_size, dim=2) - for idx, (x_part, layer) in enumerate(zip(x, self.linear)): - x_out += layer(x_part) - return x_out - - -class BPUMultiHeadedAttention(torch.nn.Module): - """Refactor wenet/transformer/attention.py::MultiHeadedAttention - - NOTE(xcsong): Only support attention_class == MultiHeadedAttention, - we do not consider RelPositionMultiHeadedAttention currently. - """ - def __init__(self, module, chunk_size, left_chunks): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.d_k = module.d_k - self.h = module.h - n_feat = self.d_k * self.h - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.time = chunk_size * (left_chunks + 1) - self.activation = torch.nn.Softmax(dim=-1) - - # 1. Modify self.linear_x - self.linear_q = BPULinear(module.linear_q) - self.linear_k = BPULinear(module.linear_k) - self.linear_v = BPULinear(module.linear_v) - self.linear_out = BPULinear(module.linear_out) - # 2. 
denom - self.register_buffer( - "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k))) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) - mask = torch.ones((1, self.h, self.chunk_size, self.time), - dtype=torch.bool) - cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, - self.d_k * 2) - original_out, original_cache = module( - random_data, random_data, random_data, - mask[:, 0, :, :], torch.empty(0), cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.reshape(1, self.h, self.d_k * 2, - self.chunk_size * self.left_chunks) - new_out, new_cache = self.forward( - random_data, random_data, random_data, mask, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - - def forward( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - mask: torch.Tensor, cache: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. - - Args: - q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). - k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). - v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). - mask (torch.Tensor): Mask tensor, - (#batch, head, chunk_size, cache_t + chunk_size). - cache (torch.Tensor): Cache tensor - (1, head, d_k * 2, cache_t), - where `cache_t == chunk_size * left_chunks`. - - - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: Cache tensor - (1, head, d_k * 2, cache_t + chunk_size) - where `cache_t == chunk_size * left_chunks` - """ - # 1. Forward QKV - q = self.linear_q(q) # (1, d, 1, c) d == size, c == chunk_size - k = self.linear_k(k) # (1, d, 1, c) - v = self.linear_v(v) # (1, d, 1, c) - q = q.view(1, self.h, self.d_k, self.chunk_size) - k = k.view(1, self.h, self.d_k, self.chunk_size) - v = v.view(1, self.h, self.d_k, self.chunk_size) - q = q.transpose(2, 3) # (batch, head, time1, d_k) - k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) - k = torch.cat((k_cache, k), dim=3) - v = torch.cat((v_cache, v), dim=3) - new_cache = torch.cat((k, v), dim=2) - # 2. (Q^T)K - scores = torch.matmul(q, k) * self.denom # (#b, n_head, time1, time2) - # 3. Forward attention - mask = mask.eq(0) - scores = scores.masked_fill(mask, -float('inf')) - attn = self.activation(scores).masked_fill(mask, 0.0) - attn = attn.transpose(2, 3) - x = torch.matmul(v, attn) - x = x.view(1, self.d_k * self.h, 1, self.chunk_size) - x_out = self.linear_out(x) - return x_out, new_cache - - -class BPUConvolution(torch.nn.Module): - """Refactor wenet/transformer/convolution.py::ConvolutionModule - - NOTE(xcsong): Only suport use_layer_norm == False - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.lorder = module.lorder - self.use_layer_norm = False - self.activation = module.activation - channels = module.pointwise_conv1.weight.size(1) - self.channels = channels - kernel_size = module.depthwise_conv.weight.size(2) - assert module.use_layer_norm is False - - # 1. Modify self.pointwise_conv1 - self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) - - # 2. 
Modify self.depthwise_conv - self.depthwise_conv = torch.nn.Conv2d( - channels, channels, (1, kernel_size), - stride=1, groups=channels) - self.depthwise_conv.weight = torch.nn.Parameter( - module.depthwise_conv.weight.unsqueeze(-2)) - self.depthwise_conv.bias = torch.nn.Parameter( - module.depthwise_conv.bias) - - # 3. Modify self.norm, Only support batchnorm2d - self.norm = torch.nn.BatchNorm2d(channels) - self.norm.training = False - self.norm.num_features = module.norm.num_features - self.norm.eps = module.norm.eps - self.norm.momentum = module.norm.momentum - self.norm.weight = torch.nn.Parameter(module.norm.weight) - self.norm.bias = torch.nn.Parameter(module.norm.bias) - self.norm.running_mean = module.norm.running_mean - self.norm.running_var = module.norm.running_var - - # 4. Modify self.pointwise_conv2 - self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) - - # 5. Identity conv, for running `concat` on BPU - self.identity = BPUIdentity(channels) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.channels) - cache = torch.zeros((1, self.channels, self.lorder)) - original_out, original_cache = module(random_data, cache=cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.unsqueeze(2) - new_out, new_cache = self.forward(random_data, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, 1, cache_t). - Returns: - torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). - torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). - """ - # Concat cache - x = torch.cat((self.identity(cache), self.identity(x)), dim=3) - new_cache = x[:, :, :, -self.lorder:] - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, 1, dim) - x = torch.nn.functional.glu(x, dim=1) # (b, channel, 1, dim) - - # Depthwise Conv - x = self.depthwise_conv(x) - x = self.activation(self.norm(x)) - x = self.pointwise_conv2(x) - return x, new_cache - - -class BPUFFN(torch.nn.Module): - """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.activation = module.activation - - # 1. Modify self.w_x - self.w_1 = BPULinear(module.w_1) - self.w_2 = BPULinear(module.w_2) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.w_1.idim) - original_out = module(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_out = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward function. 
- - Args: - xs: input tensor (B, D, 1, L) - Returns: - output tensor, (B, D, 1, L) - """ - return self.w_2(self.activation(self.w_1(x))) - - -class BPUConformerEncoderLayer(torch.nn.Module): - """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.size = module.size - assert module.normalize_before is True - assert module.concat_after is False - - # 1. Modify submodules - self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) - self.self_attn = BPUMultiHeadedAttention( - module.self_attn, chunk_size, left_chunks) - self.conv_module = BPUConvolution(module.conv_module) - self.feed_forward = BPUFFN(module.feed_forward) - - # 2. Modify norms - self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) - self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, ln_run_on_bpu) - self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, - chunk_size, ln_run_on_bpu) - self.norm_conv = BPULayerNorm(module.norm_conv, - chunk_size, ln_run_on_bpu) - self.norm_final = BPULayerNorm(module.norm_final, - chunk_size, ln_run_on_bpu) - - # 3. 4-D ff_scale - self.register_buffer( - "ff_scale", torch.full((1, self.size, 1, 1), module.ff_scale)) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.self_attn.chunk_size - time2 = self.self_attn.time - h, d_k = self.self_attn.h, self.self_attn.d_k - random_x = torch.randn(1, time1, self.size) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) - original_x, _, original_att_cache, original_cnn_cache = module( - random_x, att_mask[:, 0, :, :], torch.empty(0), - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.transpose(1, 2).unsqueeze(2) - att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.unsqueeze(2) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_mask, att_cache, cnn_cache - ) - np.testing.assert_allclose( - to_numpy(original_att_cache), - to_numpy(new_att_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cnn_cache), - to_numpy(new_cnn_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, att_mask: torch.Tensor, - att_cache: torch.Tensor, cnn_cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, size, 1, chunk_size) - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, d_k * 2, cache_t1), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, 1, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: att_cache tensor, - (1, head, d_k * 2, cache_t1 + chunk_size). - torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2). - """ - # 1. ffn_macaron - residual = x - x = self.norm_ff_macron(x) - x = residual + self.ff_scale * self.feed_forward_macaron(x) - - # 2. 
attention - residual = x - x = self.norm_mha(x) - x_att, new_att_cache = self.self_attn( - x, x, x, att_mask, att_cache) - x = residual + x_att - - # 3. convolution - residual = x - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, cnn_cache) - x = residual + x - - # 4. ffn - residual = x - x = self.norm_ff(x) - x = residual + self.ff_scale * self.feed_forward(x) - - # 5. final post-norm - x = self.norm_final(x) - - return x, new_att_cache, new_cnn_cache - - -class BPUConformerEncoder(torch.nn.Module): - """Refactor wenet/transformer/encoder.py::ConformerEncoder - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - output_size = module.output_size() - self._output_size = module.output_size() - self.after_norm = module.after_norm - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.head = module.encoders[0].self_attn.h - self.layers = len(module.encoders) - - # 1. Modify submodules - self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) - self.embed = BPUConv2dSubsampling8(module.embed) - self.encoders = torch.nn.ModuleList() - for layer in module.encoders: - self.encoders.append(BPUConformerEncoderLayer( - layer, chunk_size, left_chunks, ln_run_on_bpu)) - - # 2. Auxiliary conv - self.identity_cnncache = BPUIdentity(output_size) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.encoders[0].self_attn.chunk_size - time2 = self.encoders[0].self_attn.time - layers = self.layers - h, d_k = self.head, self.encoders[0].self_attn.d_k - decoding_window = (self.chunk_size - 1) * \ - module.embed.subsampling_rate + \ - module.embed.right_context + 1 - lorder = self.encoders[0].conv_module.lorder - random_x = torch.randn(1, decoding_window, 80) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) - orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( - random_x, 0, time2 - time1, att_mask=att_mask[:, 0, :, :], - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.unsqueeze(0) - att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_cache, cnn_cache, att_mask - ) - caches = torch.split(new_att_cache, h, dim=1) - caches = [c.transpose(2, 3) for c in caches] - np.testing.assert_allclose( - to_numpy(orig_att_cache), - to_numpy(torch.cat(caches, dim=0)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_cnn_cache), - to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, xs: torch.Tensor, att_cache: torch.Tensor, - cnn_cache: torch.Tensor, att_mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (1, head * elayers, d_k * 2, cache_t1), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (1, hidden-dim, elayers, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, hidden-dim, 1, chunk_size). - torch.Tensor: new attention cache required for next chunk, with - same shape as the original att_cache. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - """ - # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time) - xs = xs.transpose(2, 3) - xs = self.global_cmvn(xs) - # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size) - xs = self.embed(xs) - - att_cache = torch.split(att_cache, self.head, dim=1) - cnn_cache = self.identity_cnncache(cnn_cache) - cnn_cache = torch.split(cnn_cache, 1, dim=2) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - xs, new_att_cache, new_cnn_cache = layer( - xs, att_mask, att_cache=att_cache[i], cnn_cache=cnn_cache[i]) - r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:]) - r_cnn_cache.append(new_cnn_cache) - r_att_cache = torch.cat(r_att_cache, dim=1) - r_cnn_cache = self.identity_cnncache( - torch.cat(r_cnn_cache, dim=2)) - - xs = xs.squeeze(2).transpose(1, 2).contiguous() - xs = self.after_norm(xs) - # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input. - xs = xs.transpose(1, 2).contiguous().unsqueeze(2) # (B, C, 1, T) - - return (xs, r_att_cache, r_cnn_cache) - - -class BPUCTC(torch.nn.Module): - """Refactor wenet/transformer/ctc.py::CTC - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.ctc_lo.weight.size(1) - num_class = module.ctc_lo.weight.size(0) - - # 1. Modify self.ctc_lo, Split final projection to meet the - # requirment of maximum in/out channels (2048 for XJ3) - self.ctc_lo = torch.nn.ModuleList() - self.split_size = [] - num_split = (num_class - 1) // 2048 + 1 - for idx in range(num_split): - out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 - conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) - self.ctc_lo.append(conv_ele) - self.split_size.append(out_channel) - orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) - orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) - for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): - w = w.unsqueeze(2).unsqueeze(3) - self.ctc_lo[i].weight = torch.nn.Parameter(w) - self.ctc_lo[i].bias = torch.nn.Parameter(b) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 100, self.idim) - original_result = module.ctc_lo(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """frame activations, without softmax. 
- - Args: - Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) - Returns: - torch.Tensor: (B, num_class, 1, chunk_size) - """ - out = [] - for i, layer in enumerate(self.ctc_lo): - out.append(layer(x)) - out = torch.cat(out, dim=1) - return out - - -def export_encoder(asr_model, args): - logger.info("Stage-1: export encoder") - decode_window, mel_dim = args.decoding_window, args.feature_size - encoder = BPUConformerEncoder( - asr_model.encoder, args.chunk_size, args.num_decoding_left_chunks, - args.ln_run_on_bpu) - encoder.eval() - encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx') - - logger.info("Stage-1.1: prepare inputs for encoder") - chunk = torch.randn((1, 1, decode_window, mel_dim)) - required_cache_size = encoder.chunk_size * encoder.left_chunks - kv_time = required_cache_size + encoder.chunk_size - hidden, layers = encoder._output_size, len(encoder.encoders) - head = encoder.encoders[0].self_attn.h - d_k = hidden // head - lorder = encoder.encoders[0].conv_module.lorder - att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) - att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros((1, hidden, layers, lorder)) - inputs = (chunk, att_cache, cnn_cache, att_mask) - logger.info("chunk.size(): {} att_cache.size(): {} " - "cnn_cache.size(): {} att_mask.size(): {}".format( - list(chunk.size()), list(att_cache.size()), - list(cnn_cache.size()), list(att_mask.size()))) - - logger.info("Stage-1.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask" - attributes['output_name'] = "output;r_att_cache;r_cnn_cache" - attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap" - attributes['norm_type'] = \ - "no_preprocess;no_preprocess;no_preprocess;no_preprocess" - attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_shape'] = \ - "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( - chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3), - att_cache.size(0), att_cache.size(1), att_cache.size(2), - att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1), - cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0), - att_mask.size(1), att_mask.size(2), att_mask.size(3) - ) - torch.onnx.export( # NOTE(xcsong): only support opset==11 - encoder, inputs, encoder_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=attributes['input_name'].split(';'), - output_names=attributes['output_name'].split(';'), - dynamic_axes=None, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for k in vars(args): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - logger.info('Export onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - logger.info("Stage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - for i in range(10): - logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" - ", att_mask: {}".format( - i, list(torch_chunk.size()), - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), - list(torch_att_mask.size()))) - torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_output = torch.cat(torch_output, dim=-1) - - onnx_output = [] - onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - logger.info("onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," - " att_mask: {}".format( - i, onnx_chunk.shape, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': onnx_att_mask, - } - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_output = np.concatenate(onnx_output, axis=-1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_encoder, pass!") - return encoder, ort_session - - -def export_ctc(asr_model, args): - logger.info("Stage-2: export ctc") - ctc = BPUCTC(asr_model.ctc).eval() - ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx') - - logger.info("Stage-2.1: prepare inputs for ctc") - hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) - - logger.info("Stage-2.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'], attributes['input_type'] = "hidden", "featuremap" - attributes['norm_type'] = "no_preprocess" - attributes['input_layout_train'] = "NCHW" - attributes['input_layout_rt'] = "NCHW" - attributes['input_shape'] = "{}x{}x{}x{}".format( - hidden.size(0), hidden.size(1), hidden.size(2), hidden.size(3), - ) - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=None, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for k in vars(args): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - logger.info('Export onnx_ctc, done! 
see {}'.format(ctc_outpath)) - - logger.info("Stage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_ctc, pass!") - return ctc, ort_session - - -def export_decoder(asr_model, args): - logger.info("Currently, Decoder is not supported.") - - -if __name__ == '__main__': - torch.manual_seed(777) - args = get_args() - args.ln_run_on_bpu = False - # NOTE(xcsong): XJ3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - args.feature_size = configs['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - export_encoder(model, args) - export_ctc(model, args) - export_decoder(model, args) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/export_onnx_cpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/export_onnx_cpu.py deleted file mode 100644 index a8009d2f606f753a5870eb754235d8d55e756b5d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/export_onnx_cpu.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright (c) 2022, Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os -import copy -import sys - -import torch -import yaml -import numpy as np - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - -try: - import onnx - import onnxruntime - from onnxruntime.quantization import quantize_dynamic, QuantType -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - args = parser.parse_args() - return args - - -def to_numpy(tensor): - if tensor.requires_grad: - return tensor.detach().cpu().numpy() - else: - return tensor.cpu().numpy() - - -def print_input_output_info(onnx_model, name, prefix="\t\t"): - input_names = [node.name for node in onnx_model.graph.input] - input_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.input] - output_names = [node.name for node in onnx_model.graph.output] - output_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.output] - print("{}{} inputs : {}".format(prefix, name, input_names)) - print("{}{} input shapes : {}".format(prefix, name, input_shapes)) - print("{}{} outputs: {}".format(prefix, name, output_names)) - print("{}{} output shapes : {}".format(prefix, name, output_shapes)) - - -def export_encoder(asr_model, args): - print("Stage-1: export encoder") - encoder = asr_model.encoder - encoder.forward = encoder.forward_chunk - encoder_outpath = os.path.join(args['output_dir'], 'encoder.onnx') - - print("\tStage-1.1: prepare inputs for encoder") - chunk = torch.randn( - (args['batch'], args['decoding_window'], args['feature_size'])) - offset = 0 - # NOTE(xcsong): The uncertainty of `next_cache_start` only appears - # in the first few chunks, this is caused by dynamic att_cache shape, i,e - # (0, 0, 0, 0) for 1st chunk and (elayers, head, ?, d_k*2) for subsequent - # chunks. One way to ease the ONNX export is to keep `next_cache_start` - # as a fixed value. To do this, for the **first** chunk, if - # left_chunks > 0, we feed real cache & real mask to the model, otherwise - # fake cache & fake mask. In this way, we get: - # 1. 16/-1 mode: next_cache_start == 0 for all chunks - # 2. 16/4 mode: next_cache_start == chunk_size for all chunks - # 3. 16/0 mode: next_cache_start == chunk_size for all chunks - # 4. -1/-1 mode: next_cache_start == 0 for all chunks - # NO MORE DYNAMIC CHANGES!! - # - # NOTE(Mddct): We retain the current design for the convenience of supporting some - # inference frameworks without dynamic shapes. 
If you're interested in all-in-one - # model that supports different chunks please see: - # https://github.com/wenet-e2e/wenet/pull/1174 - - if args['left_chunks'] > 0: # 16/4 - required_cache_size = args['chunk_size'] * args['left_chunks'] - offset = required_cache_size - # Real cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], required_cache_size, - args['output_size'] // args['head'] * 2)) - # Real mask - att_mask = torch.ones( - (args['batch'], 1, required_cache_size + args['chunk_size']), - dtype=torch.bool) - att_mask[:, :, :required_cache_size] = 0 - elif args['left_chunks'] <= 0: # 16/-1, -1/-1, 16/0 - required_cache_size = -1 if args['left_chunks'] < 0 else 0 - # Fake cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], 0, - args['output_size'] // args['head'] * 2)) - # Fake mask - att_mask = torch.ones((0, 0, 0), dtype=torch.bool) - cnn_cache = torch.zeros( - (args['num_blocks'], args['batch'], - args['output_size'], args['cnn_module_kernel'] - 1)) - inputs = (chunk, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - print("\t\tchunk.size(): {}\n".format(chunk.size()), - "\t\toffset: {}\n".format(offset), - "\t\trequired_cache: {}\n".format(required_cache_size), - "\t\tatt_cache.size(): {}\n".format(att_cache.size()), - "\t\tcnn_cache.size(): {}\n".format(cnn_cache.size()), - "\t\tatt_mask.size(): {}\n".format(att_mask.size())) - - print("\tStage-1.2: torch.onnx.export") - dynamic_axes = { - 'chunk': {1: 'T'}, - 'att_cache': {2: 'T_CACHE'}, - 'att_mask': {2: 'T_ADD_T_CACHE'}, - 'output': {1: 'T'}, - 'r_att_cache': {2: 'T_CACHE'}, - } - # NOTE(xcsong): We keep dynamic axes even if in 16/4 mode, this is - # to avoid padding the last chunk (which usually contains less - # frames than required). For users who want static axes, just pop - # out specific axis. - # if args['chunk_size'] > 0: # 16/4, 16/-1, 16/0 - # dynamic_axes.pop('chunk') - # dynamic_axes.pop('output') - # if args['left_chunks'] >= 0: # 16/4, 16/0 - # # NOTE(xsong): since we feed real cache & real mask into the - # # model when left_chunks > 0, the shape of cache will never - # # be changed. - # dynamic_axes.pop('att_cache') - # dynamic_axes.pop('r_att_cache') - torch.onnx.export( - encoder, inputs, encoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=[ - 'chunk', 'offset', 'required_cache_size', - 'att_cache', 'cnn_cache', 'att_mask' - ], - output_names=['output', 'r_att_cache', 'r_cnn_cache'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for (k, v) in args.items(): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - # NOTE(xcsong): to add those metadatas we need to reopen - # the file and resave it. - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - # Dynamic quantization - model_fp32 = encoder_outpath - model_quant = os.path.join(args['output_dir'], 'encoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - print("\tStage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk = copy.deepcopy(chunk) - torch_offset = copy.deepcopy(offset) - torch_required_cache_size = copy.deepcopy(required_cache_size) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - torch_att_mask = copy.deepcopy(att_mask) - for i in range(10): - print("\t\ttorch chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, list(torch_chunk.size()), torch_offset, - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), list(torch_att_mask.size()))) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - torch_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_offset, torch_required_cache_size, - torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_offset += out.size(1) - torch_output = torch.cat(torch_output, dim=1) - - onnx_output = [] - onnx_chunk = to_numpy(chunk) - onnx_offset = np.array((offset)).astype(np.int64) - onnx_required_cache_size = np.array((required_cache_size)).astype(np.int64) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - onnx_att_mask = to_numpy(att_mask) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - print("\t\tonnx chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, onnx_chunk.shape, onnx_offset, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - onnx_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'offset': onnx_offset, - 'required_cache_size': onnx_required_cache_size, - 'att_cache': onnx_att_cache, 'cnn_cache': onnx_cnn_cache, - 'att_mask': onnx_att_mask - } - # NOTE(xcsong): If we use 16/-1, -1/-1 or 16/0 mode, `next_cache_start` - # will be hardcoded to 0 or chunk_size by ONNX, thus - # required_cache_size and att_mask are no more needed and they will - # be removed by ONNX automatically. 
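For consumers of the exported file, the pruning and metadata behaviour described in the NOTE above can be checked outside this script with a short standalone sketch (the model path is hypothetical; `chunk_size` and `left_chunks` are among the keys this exporter writes into `metadata_props`):

```python
# Standalone sketch: list which inputs survived export and read back the
# export-time configuration. Assumes an "encoder.onnx" produced by the exporter.
import onnxruntime

session = onnxruntime.InferenceSession("encoder.onnx")

# Inputs such as `required_cache_size` or `att_mask` may have been folded away,
# so feeds should be restricted to the names reported here.
surviving_inputs = [node.name for node in session.get_inputs()]
print("runtime inputs:", surviving_inputs)

# The exporter stores its arguments as string metadata; recover them here.
meta = session.get_modelmeta().custom_metadata_map
print("chunk_size:", meta.get("chunk_size"), "left_chunks:", meta.get("left_chunks"))
```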
- for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_offset += ort_outs[0].shape[1] - onnx_output = np.concatenate(onnx_output, axis=1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-05) - meta = ort_session.get_modelmeta() - print("\t\tcustom_metadata_map={}".format(meta.custom_metadata_map)) - print("\t\tCheck onnx_encoder, pass!") - - -def export_ctc(asr_model, args): - print("Stage-2: export ctc") - ctc = asr_model.ctc - ctc.forward = ctc.log_softmax - ctc_outpath = os.path.join(args['output_dir'], 'ctc.onnx') - - print("\tStage-2.1: prepare inputs for ctc") - hidden = torch.randn( - (args['batch'], args['chunk_size'] if args['chunk_size'] > 0 else 16, - args['output_size'])) - - print("\tStage-2.2: torch.onnx.export") - dynamic_axes = {'hidden': {1: 'T'}, 'probs': {1: 'T'}} - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for (k, v) in args.items(): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - # Dynamic quantization - model_fp32 = ctc_outpath - model_quant = os.path.join(args['output_dir'], 'ctc.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_ctc, done! see {}'.format(ctc_outpath)) - - print("\tStage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_ctc, pass!") - - -def export_decoder(asr_model, args): - print("Stage-3: export decoder") - decoder = asr_model - # NOTE(lzhin): parameters of encoder will be automatically removed - # since they are not used during rescoring. - decoder.forward = decoder.forward_attention_decoder - decoder_outpath = os.path.join(args['output_dir'], 'decoder.onnx') - - print("\tStage-3.1: prepare inputs for decoder") - # hardcode time->200 nbest->10 len->20, they are dynamic axes. 
- encoder_out = torch.randn((1, 200, args['output_size'])) - hyps = torch.randint(low=0, high=args['vocab_size'], - size=[10, 20]) - hyps[:, 0] = args['vocab_size'] - 1 # - hyps_lens = torch.randint(low=15, high=21, size=[10]) - - print("\tStage-3.2: torch.onnx.export") - dynamic_axes = { - 'hyps': {0: 'NBEST', 1: 'L'}, 'hyps_lens': {0: 'NBEST'}, - 'encoder_out': {1: 'T'}, - 'score': {0: 'NBEST', 1: 'L'}, 'r_score': {0: 'NBEST', 1: 'L'} - } - inputs = (hyps, hyps_lens, encoder_out, args['reverse_weight']) - torch.onnx.export( - decoder, inputs, decoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hyps', 'hyps_lens', 'encoder_out', 'reverse_weight'], - output_names=['score', 'r_score'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_decoder = onnx.load(decoder_outpath) - for (k, v) in args.items(): - meta = onnx_decoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_decoder) - onnx.helper.printable_graph(onnx_decoder.graph) - onnx.save(onnx_decoder, decoder_outpath) - print_input_output_info(onnx_decoder, "onnx_decoder") - model_fp32 = decoder_outpath - model_quant = os.path.join(args['output_dir'], 'decoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_decoder, done! see {}'.format( - decoder_outpath)) - - print("\tStage-3.3: check onnx_decoder and torch_decoder") - torch_score, torch_r_score = decoder( - hyps, hyps_lens, encoder_out, args['reverse_weight']) - ort_session = onnxruntime.InferenceSession(decoder_outpath) - input_names = [node.name for node in onnx_decoder.graph.input] - ort_inputs = { - 'hyps': to_numpy(hyps), - 'hyps_lens': to_numpy(hyps_lens), - 'encoder_out': to_numpy(encoder_out), - 'reverse_weight': np.array((args['reverse_weight'])), - } - for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - onnx_output = ort_session.run(None, ort_inputs) - - np.testing.assert_allclose(to_numpy(torch_score), onnx_output[0], - rtol=1e-03, atol=1e-05) - if args['is_bidirectional_decoder'] and args['reverse_weight'] > 0.0: - np.testing.assert_allclose(to_numpy(torch_r_score), onnx_output[1], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_decoder, pass!") - - -def main(): - torch.manual_seed(777) - args = get_args() - output_dir = args.output_dir - os.system("mkdir -p " + output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - arguments = {} - arguments['output_dir'] = output_dir - arguments['batch'] = 1 - arguments['chunk_size'] = args.chunk_size - arguments['left_chunks'] = args.num_decoding_left_chunks - arguments['reverse_weight'] = args.reverse_weight - arguments['output_size'] = configs['encoder_conf']['output_size'] - arguments['num_blocks'] = configs['encoder_conf']['num_blocks'] - arguments['cnn_module_kernel'] = configs['encoder_conf'].get('cnn_module_kernel', 1) - arguments['head'] = configs['encoder_conf']['attention_heads'] - arguments['feature_size'] = configs['input_dim'] - arguments['vocab_size'] = configs['output_dim'] - # NOTE(xcsong): if chunk_size == -1, hardcode to 67 - arguments['decoding_window'] = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 if args.chunk_size > 0 else 67 - arguments['encoder'] = configs['encoder'] - 
arguments['decoder'] = configs['decoder'] - arguments['subsampling_rate'] = model.subsampling_rate() - arguments['right_context'] = model.right_context() - arguments['sos_symbol'] = model.sos_symbol() - arguments['eos_symbol'] = model.eos_symbol() - arguments['is_bidirectional_decoder'] = 1 \ - if model.is_bidirectional_decoder() else 0 - - # NOTE(xcsong): Please note that -1/-1 means non-streaming model! It is - # not a [16/4 16/-1 16/0] all-in-one model and it should not be used in - # streaming mode (i.e., setting chunk_size=16 in `decoder_main`). If you - # want to use 16/-1 or any other streaming mode in `decoder_main`, - # please export onnx in the same config. - if arguments['left_chunks'] > 0: - assert arguments['chunk_size'] > 0 # -1/4 not supported - - export_encoder(model, arguments) - export_ctc(model, arguments) - export_decoder(model, arguments) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/export_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/export_onnx_gpu.py deleted file mode 100644 index 19f810c2804efdf74ff369f780fa3102e2e389fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/export_onnx_gpu.py +++ /dev/null @@ -1,1056 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import os -import sys - -import torch -import yaml -import logging - -import torch.nn.functional as F -from wenet.utils.checkpoint import load_checkpoint -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import BaseEncoder -from wenet.utils.init_model import init_model -from wenet.utils.mask import make_pad_mask - -try: - import onnxruntime -except ImportError: - print('Please install onnxruntime-gpu!') - sys.exit(1) - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class Encoder(torch.nn.Module): - def __init__(self, - encoder: BaseEncoder, - ctc: CTC, - beam_size: int = 10): - super().__init__() - self.encoder = encoder - self.ctc = ctc - self.beam_size = beam_size - - def forward(self, speech: torch.Tensor, - speech_lengths: torch.Tensor,): - """Encoder - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - Returns: - encoder_out: B x T x F - encoder_out_lens: B - ctc_log_probs: B x T x V - beam_log_probs: B x T x beam_size - beam_log_probs_idx: B x T x beam_size - """ - encoder_out, encoder_mask = self.encoder(speech, - speech_lengths, - -1, -1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_log_probs = self.ctc.log_softmax(encoder_out) - encoder_out_lens = encoder_out_lens.int() - beam_log_probs, beam_log_probs_idx = torch.topk( - ctc_log_probs, self.beam_size, dim=2) - return encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx - - -class StreamingEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size, transformer=False): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.transformer = transformer - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoder.encoders): - xs, _, new_att_cache, new_cnn_cache = layer( - xs, masks, pos_emb, - att_cache=att_cache[i], - cnn_cache=cnn_cache[i]) - # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (B, hidden-dim, cache_t2) - r_att_cache.append( - new_att_cache[:, :, next_cache_start:, :].unsqueeze(1)) - if not self.transformer: - r_cnn_cache.append(new_cnn_cache.unsqueeze(1)) - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - if not self.transformer: - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingSqueezeformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.reduce_idx = model.encoder.reduce_idx - self.recover_idx = model.encoder.recover_idx - if self.reduce_idx is None: - self.time_reduce = None - else: - if self.recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - 
recover_exp)) - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - elayers, cache_size = att_cache.size(0), att_cache.size(3) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = att_mask[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.encoder.preln(xs) - for i, layer in enumerate(self.encoder.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append( - (xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.encoder.time_reduction_layer( - xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - if self.encoder.pos_enc_layer_type == "rel_pos_repaired": - pos_emb = 
pos_emb[:, :xs.size(1) * 2 - 1, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.encoder.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(1) - cached_att = cached_att.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1)) - r_cnn_cache.append(cached_cnn) - - chunk_out = xs - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingEfficientConformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - - # Efficient Conformer - self.stride_layer_idx = model.encoder.stride_layer_idx - self.stride = model.encoder.stride - self.num_blocks = model.encoder.num_blocks - self.cnn_module_kernel = model.encoder.cnn_module_kernel - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - chunk_lens (torch.Tensor): - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * 
num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) # (b, ) - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - # Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2) - # Shape(cnn_cache): (elayers, b, outsize, cnn_kernel) - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = chunk_mask.to(torch.bool) - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoder.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - # We propose to double the chunk_size. 
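The downsampling bookkeeping referred to above follows `calculate_downsampling_factor`: every stride layer that has already been passed multiplies the factor. A toy illustration with hypothetical stride layers at indices 3 and 7, stride 2 each:

```python
# Toy version of calculate_downsampling_factor (hypothetical configuration).
stride_layer_idx = [3, 7]   # layers after which time is downsampled
stride = [2, 2]             # downsampling factor applied at each of them

def downsampling_factor(i):
    factor = 1
    for idx, stride_idx in enumerate(stride_layer_idx):
        if i > stride_idx:
            factor *= stride[idx]
    return factor

print([downsampling_factor(i) for i in range(10)])
# -> [1, 1, 1, 1, 2, 2, 2, 2, 4, 4]
```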
- att_cache_trunc = xs.size(1) + \ - att_cache.size(3) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i][:, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [batch, 1, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID - - # use repeat_interleave to new_att_cache - # new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - new_att_cache = new_att_cache.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :].unsqueeze(1)) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - # shape of r_att_cache: (b, elayers, head, time2, outdim) - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - # shape of r_cnn_cache: (b, elayers, outdim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - chunk_out_lens += 1 - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class Decoder(torch.nn.Module): - def __init__(self, - decoder: TransformerDecoder, - ctc_weight: float = 0.5, - reverse_weight: float = 0.0, - beam_size: int = 10, - decoder_fastertransformer: bool = False): - super().__init__() - self.decoder = decoder - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - self.beam_size = beam_size - self.decoder_fastertransformer = decoder_fastertransformer - - def forward(self, - encoder_out: torch.Tensor, - encoder_lens: torch.Tensor, - hyps_pad_sos_eos: torch.Tensor, - hyps_lens_sos: torch.Tensor, - r_hyps_pad_sos_eos: torch.Tensor, - ctc_score: torch.Tensor): - """Encoder - Args: - encoder_out: B x T x F - encoder_lens: B - hyps_pad_sos_eos: B x beam x (T2+1), - hyps with sos & eos and padded by ignore id - hyps_lens_sos: B x beam, length for each hyp with sos - r_hyps_pad_sos_eos: B 
x beam x (T2+1), - reversed hyps with sos & eos and padded by ignore id - ctc_score: B x beam, ctc score for each hyp - Returns: - decoder_out: B x beam x T2 x V - r_decoder_out: B x beam x T2 x V - best_index: B - """ - B, T, F = encoder_out.shape - bz = self.beam_size - B2 = B * bz - encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F) - encoder_mask = ~make_pad_mask(encoder_lens, T).unsqueeze(1) - encoder_mask = encoder_mask.repeat(1, bz, 1).view(B2, 1, T) - T2 = hyps_pad_sos_eos.shape[2] - 1 - hyps_pad = hyps_pad_sos_eos.view(B2, T2 + 1) - hyps_lens = hyps_lens_sos.view(B2,) - hyps_pad_sos = hyps_pad[:, :-1].contiguous() - hyps_pad_eos = hyps_pad[:, 1:].contiguous() - - r_hyps_pad = r_hyps_pad_sos_eos.view(B2, T2 + 1) - r_hyps_pad_sos = r_hyps_pad[:, :-1].contiguous() - r_hyps_pad_eos = r_hyps_pad[:, 1:].contiguous() - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad_sos, hyps_lens, r_hyps_pad_sos, - self.reverse_weight) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - V = decoder_out.shape[-1] - decoder_out = decoder_out.view(B2, T2, V) - mask = ~make_pad_mask(hyps_lens, T2) # B2 x T2 - # mask index, remove ignore id - index = torch.unsqueeze(hyps_pad_eos * mask, 2) - score = decoder_out.gather(2, index).squeeze(2) # B2 X T2 - # mask padded part - score = score * mask - decoder_out = decoder_out.view(B, bz, T2, V) - if self.reverse_weight > 0: - r_decoder_out = torch.nn.functional.log_softmax( - r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.view(B2, T2, V) - index = torch.unsqueeze(r_hyps_pad_eos * mask, 2) - r_score = r_decoder_out.gather(2, index).squeeze(2) - r_score = r_score * mask - score = score * (1 - self.reverse_weight) + \ - self.reverse_weight * r_score - r_decoder_out = r_decoder_out.view(B, bz, T2, V) - score = torch.sum(score, axis=1) # B2 - score = torch.reshape(score, (B, bz)) + self.ctc_weight * ctc_score - best_index = torch.argmax(score, dim=1) - if self.decoder_fastertransformer: - return decoder_out, best_index - else: - return best_index - - -def to_numpy(tensors): - out = [] - if type(tensors) == torch.tensor: - tensors = [tensors] - for tensor in tensors: - if tensor.requires_grad: - tensor = tensor.detach().cpu().numpy() - else: - tensor = tensor.cpu().numpy() - out.append(tensor) - return out - - -def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True): - for a, b in zip(xlist, blist): - try: - torch.testing.assert_allclose(a, b, rtol=rtol, atol=atol) - except AssertionError as error: - if tolerate_small_mismatch: - print(error) - else: - raise - - -def export_offline_encoder(model, configs, args, logger, encoder_onnx_path): - bz = 32 - seq_len = 100 - beam_size = args.beam_size - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint( - low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - dynamic_axes={ - 'speech': {0: 'B', 1: 'T'}, - 'speech_lengths': {0: 'B'}, - 'encoder_out': {0: 'B', 1: 'T_OUT'}, - 'encoder_out_lens': {0: 'B'}, - 'ctc_log_probs': {0: 'B', 1: 'T_OUT'}, - 'beam_log_probs': {0: 'B', 1: 
'T_OUT'}, - 'beam_log_probs_idx': {0: 'B', 1: 'T_OUT'}, - }, - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - -def export_offline_encoder_static(model, configs, args, logger, encoder_onnx_path): - bz = args.batch_size - seq_len = args.seq_len - beam_size = args.beam_size - - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint(low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - import os - file_name, file_ext = os.path.splitext(encoder_onnx_path) - encoder_onnx_path = file_name + "_bs" + str(bz) + "_seq" + str(seq_len) + "_static.onnx" - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CPUExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - - -def export_online_encoder(model, configs, args, logger, encoder_onnx_path): - decoding_chunk_size = args.decoding_chunk_size - subsampling = model.encoder.embed.subsampling_rate - context = model.encoder.embed.right_context + 1 - decoding_window = (decoding_chunk_size - 1) * subsampling + context - batch_size = 32 - audio_len = decoding_window - feature_size = configs["input_dim"] - output_size = configs["encoder_conf"]["output_size"] - num_layers = configs["encoder_conf"]["num_blocks"] - # in transformer the cnn module will not be available - transformer = False - cnn_module_kernel = configs["encoder_conf"].get("cnn_module_kernel", 1) - 1 - if not cnn_module_kernel: - transformer = True - num_decoding_left_chunks = args.num_decoding_left_chunks - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if configs['encoder'] == 'squeezeformer': - encoder = StreamingSqueezeformerEncoder( - model, required_cache_size, args.beam_size) - elif configs['encoder'] == 'efficientConformer': - encoder = StreamingEfficientConformerEncoder( - model, required_cache_size, args.beam_size) - else: - encoder = StreamingEncoder( - model, required_cache_size, args.beam_size, transformer) - encoder.eval() - - # begin to export encoder - chunk_xs = 
torch.randn(batch_size, audio_len, - feature_size, dtype=torch.float32) - chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len - - offset = torch.arange(0, batch_size).unsqueeze(1) - # (elayers, b, head, cache_t1, d_k * 2) - head = configs["encoder_conf"]["attention_heads"] - d_k = configs["encoder_conf"]["output_size"] // head - att_cache = torch.randn(batch_size, num_layers, head, - required_cache_size, d_k * 2, - dtype=torch.float32) - cnn_cache = torch.randn(batch_size, num_layers, output_size, - cnn_module_kernel, dtype=torch.float32) - - cache_mask = torch.ones( - batch_size, 1, required_cache_size, dtype=torch.float32) - input_names = ['chunk_xs', 'chunk_lens', 'offset', - 'att_cache', 'cnn_cache', 'cache_mask'] - output_names = ['log_probs', 'log_probs_idx', 'chunk_out', - 'chunk_out_lens', 'r_offset', 'r_att_cache', - 'r_cnn_cache', 'r_cache_mask'] - input_tensors = (chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - output_names.pop(6) - - all_names = input_names + output_names - dynamic_axes = {} - for name in all_names: - # only the first dimension is dynamic - # all other dimension is fixed - dynamic_axes[name] = {0: 'B'} - - torch.onnx.export(encoder, - input_tensors, - encoder_onnx_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - verbose=False) - - with torch.no_grad(): - torch_outs = encoder(chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - torch_outs = list(torch_outs).pop(6) - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=["CUDAExecutionProvider"]) - ort_inputs = {} - - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - if transformer: - del ort_inputs['cnn_cache'] - ort_outs = ort_session.run(None, ort_inputs) - test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx streaming encoder succeed!") - onnx_config = { - "subsampling_rate": subsampling, - "context": context, - "decoding_chunk_size": decoding_chunk_size, - "num_decoding_left_chunks": num_decoding_left_chunks, - "beam_size": args.beam_size, - "fp16": args.fp16, - "feat_size": feature_size, - "decoding_window": decoding_window, - "cnn_module_kernel_cache": cnn_module_kernel - } - return onnx_config - - -def export_rescoring_decoder(model, configs, args, - logger, decoder_onnx_path, decoder_fastertransformer): - bz, seq_len = 32, 100 - beam_size = args.beam_size - decoder = Decoder(model.decoder, - model.ctc_weight, - model.reverse_weight, - beam_size, - decoder_fastertransformer) - decoder.eval() - - hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - hyps_lens_sos = torch.randint(low=3, high=seq_len, size=(bz, beam_size), - dtype=torch.int32) - r_hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - - output_size = configs["encoder_conf"]["output_size"] - encoder_out = torch.randn(bz, seq_len, output_size, dtype=torch.float32) - encoder_out_lens = torch.randint( - low=3, high=seq_len, size=(bz,), dtype=torch.int32) - ctc_score = torch.randn(bz, beam_size, dtype=torch.float32) - - input_names = ['encoder_out', 'encoder_out_lens', - 'hyps_pad_sos_eos', 'hyps_lens_sos', - 'r_hyps_pad_sos_eos', 'ctc_score'] - output_names = ['best_index'] - if decoder_fastertransformer: - output_names.insert(0, 'decoder_out') - - 
torch.onnx.export(decoder, - (encoder_out, encoder_out_lens, - hyps_pad_sos_eos, hyps_lens_sos, - r_hyps_pad_sos_eos, ctc_score), - decoder_onnx_path, - export_params=True, - opset_version=13, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes={'encoder_out': {0: 'B', 1: 'T'}, - 'encoder_out_lens': {0: 'B'}, - 'hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'hyps_lens_sos': {0: 'B'}, - 'r_hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'ctc_score': {0: 'B'}, - 'best_index': {0: 'B'}, - }, - verbose=False - ) - with torch.no_grad(): - o0 = decoder(encoder_out, - encoder_out_lens, - hyps_pad_sos_eos, - hyps_lens_sos, - r_hyps_pad_sos_eos, - ctc_score) - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(decoder_onnx_path, - providers=providers) - - input_tensors = [encoder_out, encoder_out_lens, hyps_pad_sos_eos, - hyps_lens_sos, r_hyps_pad_sos_eos, ctc_score] - ort_inputs = {} - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - - # if model.reverse weight == 0, - # the r_hyps_pad will be removed - # from the onnx decoder since it doen't play any role - if model.reverse_weight == 0: - del ort_inputs['r_hyps_pad_sos_eos'] - ort_outs = ort_session.run(None, ort_inputs) - - # check decoder output - if decoder_fastertransformer: - test(to_numpy(o0), ort_outs, rtol=1e-03, atol=1e-05) - else: - test(to_numpy([o0]), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx decoder succeed!") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='export x86_gpu model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--cmvn_file', required=False, default='', type=str, - help='global_cmvn file, default path is in config file') - parser.add_argument('--reverse_weight', default=-1.0, type=float, - required=False, - help='reverse weight for bitransformer,' + - 'default value is in config file') - parser.add_argument('--ctc_weight', default=-1.0, type=float, - required=False, - help='ctc weight, default value is in config file') - parser.add_argument('--batch_size', type=int, default=24, help='encoder batch size') - parser.add_argument('--seq_len', default=512, type=int, required=False, - help="Encoder seq_len") - parser.add_argument('--beam_size', default=10, type=int, required=False, - help="beam size would be ctc output size") - parser.add_argument('--output_onnx_dir', - default="onnx_model", - help='output onnx encoder and decoder directory') - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - # arguments for streaming encoder - parser.add_argument('--streaming', - action='store_true', - help="whether to export streaming encoder, default false") - parser.add_argument('--decoding_chunk_size', - default=16, - type=int, - required=False, - help='the decoding chunk size, <=0 is not supported') - parser.add_argument('--num_decoding_left_chunks', - default=5, - type=int, - required=False, - help="number of left chunks, <= 0 is not supported") - parser.add_argument('--decoder_fastertransformer', - action='store_true', - help='return decoder_out and best_index for ft') - args = parser.parse_args() - - torch.manual_seed(0) - torch.set_printoptions(precision=10) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if 
args.cmvn_file and os.path.exists(args.cmvn_file): - configs['cmvn_file'] = args.cmvn_file - if args.reverse_weight != -1.0 and 'reverse_weight' in configs['model_conf']: - configs['model_conf']['reverse_weight'] = args.reverse_weight - print("Update reverse weight to", args.reverse_weight) - if args.ctc_weight != -1: - print("Update ctc weight to ", args.ctc_weight) - configs['model_conf']['ctc_weight'] = args.ctc_weight - configs["encoder_conf"]["use_dynamic_chunk"] = False - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - - if not os.path.exists(args.output_onnx_dir): - os.mkdir(args.output_onnx_dir) - encoder_onnx_path = os.path.join(args.output_onnx_dir, 'encoder.onnx') - export_enc_func = None - if args.streaming: - assert args.decoding_chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - export_enc_func = export_online_encoder - else: - export_enc_func = export_offline_encoder_static - - onnx_config = export_enc_func( - model, configs, args, logger, encoder_onnx_path) - - decoder_onnx_path = os.path.join(args.output_onnx_dir, 'decoder.onnx') - export_rescoring_decoder(model, configs, args, logger, - decoder_onnx_path, args.decoder_fastertransformer) - - if args.fp16: - try: - import onnxmltools - from onnxmltools.utils.float16_converter import convert_float_to_float16 - except ImportError: - print('Please install onnxmltools!') - sys.exit(1) - encoder_onnx_model = onnxmltools.utils.load_model(encoder_onnx_path) - encoder_onnx_model = convert_float_to_float16(encoder_onnx_model) - encoder_onnx_path = os.path.join( - args.output_onnx_dir, 'encoder_fp16.onnx') - onnxmltools.utils.save_model(encoder_onnx_model, encoder_onnx_path) - decoder_onnx_model = onnxmltools.utils.load_model(decoder_onnx_path) - decoder_onnx_model = convert_float_to_float16(decoder_onnx_model) - decoder_onnx_path = os.path.join( - args.output_onnx_dir, 'decoder_fp16.onnx') - onnxmltools.utils.save_model(decoder_onnx_model, decoder_onnx_path) - # dump configurations - - config_dir = os.path.join(args.output_onnx_dir, "config.yaml") - with open(config_dir, "w") as out: - yaml.dump(onnx_config, out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/recognize.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/recognize.py deleted file mode 100644 index 03b5dfd42cc098efacd20e08756a5300f6477cc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/recognize.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--beam_size', - type=int, - default=10, - help='beam size for search') - parser.add_argument('--penalty', - type=float, - default=0.0, - help='length penalty') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=16, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'attention', 'ctc_greedy_search', - 'ctc_prefix_beam_search', 'attention_rescoring', - 'rnnt_greedy_search', 'rnnt_beam_search', - 'rnnt_beam_attn_rescoring', 'ctc_beam_td_attn_rescoring', - 'hlg_onebest', 'hlg_rescore' - ], - default='attention', - help='decoding mode') - - parser.add_argument('--search_ctc_weight', - type=float, - default=1.0, - help='ctc weight for nbest generation') - parser.add_argument('--search_transducer_weight', - type=float, - default=0.0, - help='transducer weight for nbest generation') - parser.add_argument('--ctc_weight', - type=float, - default=0.0, - help='ctc weight for rescoring weight in \ - attention rescoring decode mode \ - ctc weight for rescoring weight in \ - transducer attention rescore decode mode') - - parser.add_argument('--transducer_weight', - type=float, - default=0.0, - help='transducer weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--attn_weight', - type=float, - default=0.0, - help='attention weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--decoding_chunk_size', - type=int, - default=-1, - help='''decoding chunk size, - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here''') - parser.add_argument('--num_decoding_left_chunks', - type=int, - default=-1, - help='number of left chunks for decoding') - parser.add_argument('--simulate_streaming', - action='store_true', - help='simulate streaming inference') - parser.add_argument('--reverse_weight', - type=float, - default=0.0, - help='''right to left weight for attention rescoring - decode mode''') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--connect_symbol', - default='', - type=str, - help='used to connect the output characters') - - parser.add_argument('--word', - default='', - type=str, - help='word file, only used for hlg decode') - parser.add_argument('--hlg', - default='', - type=str, - help='hlg file, only used for hlg decode') - parser.add_argument('--lm_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--r_decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' - ] and args.batch_size > 1: - logging.fatal( - 'decoding mode {} must be running with batch_size == 1'.format( - args.mode)) - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_sub'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - if 'fbank_conf' in test_conf: - test_conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in test_conf: - test_conf['mfcc_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - # Load dict - char_dict = {v: k for k, v in symbol_table.items()} - eos = len(char_dict) - 1 - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w') as fout: - for batch_idx, 
batch in enumerate(test_data_loader): - keys, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - if args.mode == 'attention': - hyps, _ = model.recognize( - feats, - feats_lengths, - beam_size=args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp.tolist() for hyp in hyps] - elif args.mode == 'ctc_greedy_search': - hyps, _ = model.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_greedy_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_beam_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.beam_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.search_ctc_weight, - transducer_weight=args.search_transducer_weight) - elif args.mode == 'rnnt_beam_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight) - elif args.mode == 'ctc_beam_td_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight, - beam_search_type='ctc') - # ctc_prefix_beam_search and attention_rescoring only return one - # result in List[int], change it to List[List[int]] for compatible - # with other batch decoding mode - elif args.mode == 'ctc_prefix_beam_search': - assert (feats.size(0) == 1) - hyp, _ = model.ctc_prefix_beam_search( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp] - elif args.mode == 'attention_rescoring': - assert (feats.size(0) == 1) - hyp, _ = model.attention_rescoring( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - 
ctc_weight=args.ctc_weight, - simulate_streaming=args.simulate_streaming, - reverse_weight=args.reverse_weight) - hyps = [hyp] - elif args.mode == 'hlg_onebest': - hyps = model.hlg_onebest( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - elif args.mode == 'hlg_rescore': - hyps = model.hlg_rescore( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - lm_scale=args.lm_scale, - decoder_scale=args.decoder_scale, - r_decoder_scale=args.r_decoder_scale, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - for i, key in enumerate(keys): - content = [] - for w in hyps[i]: - if w == eos: - break - content.append(char_dict[w]) - logging.info('{} {}'.format(key, args.connect_symbol.join(content))) - fout.write('{} {}\n'.format(key, args.connect_symbol.join(content))) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/recognize_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/recognize_onnx_gpu.py deleted file mode 100644 index 42f403bf55ac0bc51d9c754574d3479345948122..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/recognize_onnx_gpu.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is for testing exported onnx encoder and decoder from -export_onnx_gpu.py. The exported onnx models only support batch offline ASR inference. -It requires a python wrapped c++ ctc decoder. 
-Please install it by following: -https://github.com/Slyne/ctc_decoder.git -""" -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.common import IGNORE_ID -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.config import override_config - -import onnxruntime as rt -import multiprocessing -import numpy as np - -try: - from swig_decoders import map_batch, \ - ctc_beam_search_decoder_batch, \ - TrieVector, PathTrie -except ImportError: - print('Please install ctc decoders first by refering to\n' + - 'https://github.com/Slyne/ctc_decoder.git') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--encoder_onnx', required=True, help='encoder onnx file') - parser.add_argument('--decoder_onnx', required=True, help='decoder onnx file') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=32, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'ctc_greedy_search', 'ctc_prefix_beam_search', - 'attention_rescoring'], - default='attention_rescoring', - help='decoding mode') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - reverse_weight = configs["model_conf"].get("reverse_weight", 0.0) - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - test_conf['fbank_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - 
- # Init asr model from configs - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - if use_cuda: - EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - else: - EP_list = ['CPUExecutionProvider'] - - encoder_ort_session = rt.InferenceSession(args.encoder_onnx, providers=EP_list) - decoder_ort_session = None - if args.mode == "attention_rescoring": - decoder_ort_session = rt.InferenceSession(args.decoder_onnx, providers=EP_list) - - # Load dict - vocabulary = [] - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - vocabulary.append(arr[0]) - eos = sos = len(char_dict) - 1 - with torch.no_grad(), open(args.result_file, 'w') as fout: - for _, batch in enumerate(test_data_loader): - keys, feats, _, feats_lengths, _ = batch - feats, feats_lengths = feats.numpy(), feats_lengths.numpy() - if args.fp16: - feats = feats.astype(np.float16) - ort_inputs = { - encoder_ort_session.get_inputs()[0].name: feats, - encoder_ort_session.get_inputs()[1].name: feats_lengths} - ort_outs = encoder_ort_session.run(None, ort_inputs) - encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx = ort_outs - beam_size = beam_log_probs.shape[-1] - batch_size = beam_log_probs.shape[0] - num_processes = min(multiprocessing.cpu_count(), batch_size) - if args.mode == 'ctc_greedy_search': - if beam_size != 1: - log_probs_idx = beam_log_probs_idx[:, :, 0] - batch_sents = [] - for idx, seq in enumerate(log_probs_idx): - batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) - hyps = map_batch(batch_sents, vocabulary, num_processes, - True, 0) - elif args.mode in ('ctc_prefix_beam_search', "attention_rescoring"): - batch_log_probs_seq_list = beam_log_probs.tolist() - batch_log_probs_idx_list = beam_log_probs_idx.tolist() - batch_len_list = encoder_out_lens.tolist() - batch_log_probs_seq = [] - batch_log_probs_ids = [] - batch_start = [] # only effective in streaming deployment - batch_root = TrieVector() - root_dict = {} - for i in range(len(batch_len_list)): - num_sent = batch_len_list[i] - batch_log_probs_seq.append( - batch_log_probs_seq_list[i][0:num_sent]) - batch_log_probs_ids.append( - batch_log_probs_idx_list[i][0:num_sent]) - root_dict[i] = PathTrie() - batch_root.append(root_dict[i]) - batch_start.append(True) - score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq, - batch_log_probs_ids, - batch_root, - batch_start, - beam_size, - num_processes, - 0, -2, 0.99999) - if args.mode == 'ctc_prefix_beam_search': - hyps = [] - for cand_hyps in score_hyps: - hyps.append(cand_hyps[0][1]) - hyps = map_batch(hyps, vocabulary, num_processes, False, 0) - if args.mode == 'attention_rescoring': - ctc_score, all_hyps = [], [] - max_len = 0 - for hyps in score_hyps: - cur_len = len(hyps) - if len(hyps) < beam_size: - hyps += (beam_size - cur_len) * [(-float("INF"), (0,))] - cur_ctc_score = [] - for hyp in hyps: - cur_ctc_score.append(hyp[0]) - all_hyps.append(list(hyp[1])) - if len(hyp[1]) > max_len: - max_len = len(hyp[1]) - ctc_score.append(cur_ctc_score) - if args.fp16: - ctc_score = np.array(ctc_score, dtype=np.float16) - else: - ctc_score = np.array(ctc_score, dtype=np.float32) - hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - r_hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32) - k = 0 - for i in 
range(batch_size): - for j in range(beam_size): - cand = all_hyps[k] - l = len(cand) + 2 - hyps_pad_sos_eos[i][j][0:l] = [sos] + cand + [eos] - r_hyps_pad_sos_eos[i][j][0:l] = [sos] + cand[::-1] + [eos] - hyps_lens_sos[i][j] = len(cand) + 1 - k += 1 - decoder_ort_inputs = { - decoder_ort_session.get_inputs()[0].name: encoder_out, - decoder_ort_session.get_inputs()[1].name: encoder_out_lens, - decoder_ort_session.get_inputs()[2].name: hyps_pad_sos_eos, - decoder_ort_session.get_inputs()[3].name: hyps_lens_sos, - decoder_ort_session.get_inputs()[-1].name: ctc_score} - if reverse_weight > 0: - r_hyps_pad_sos_eos_name = decoder_ort_session.get_inputs()[4].name - decoder_ort_inputs[r_hyps_pad_sos_eos_name] = r_hyps_pad_sos_eos - best_index = decoder_ort_session.run(None, decoder_ort_inputs)[0] - best_sents = [] - k = 0 - for idx in best_index: - cur_best_sent = all_hyps[k: k + beam_size][idx] - best_sents.append(cur_best_sent) - k += beam_size - hyps = map_batch(best_sents, vocabulary, num_processes) - - for i, key in enumerate(keys): - content = hyps[i] - logging.info('{} {}'.format(key, content)) - fout.write('{} {}\n'.format(key, content)) - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/train.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/train.py deleted file mode 100644 index 70799b60790b31d73911770891f519f5473e2f4b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/bin/train.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os - -import torch -import torch.distributed as dist -import torch.optim as optim -import yaml -from tensorboardX import SummaryWriter -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import (load_checkpoint, save_checkpoint, - load_trained_modules) -from wenet.utils.executor import Executor -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.scheduler import WarmupLR, NoamHoldAnnealing -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--train_data', required=True, help='train data file') - parser.add_argument('--cv_data', required=True, help='cv data file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--model_dir', required=True, help='save model dir') - parser.add_argument('--checkpoint', help='checkpoint model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='tensorboard log dir') - parser.add_argument('--ddp.rank', - dest='rank', - default=0, - type=int, - help='global rank for distributed training') - parser.add_argument('--ddp.world_size', - dest='world_size', - default=-1, - type=int, - help='''number of total processes/gpus for - distributed training''') - parser.add_argument('--ddp.dist_backend', - dest='dist_backend', - default='nccl', - choices=['nccl', 'gloo'], - help='distributed backend') - parser.add_argument('--ddp.init_method', - dest='init_method', - default=None, - help='ddp init method') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for reading') - parser.add_argument('--pin_memory', - action='store_true', - default=False, - help='Use pinned memory buffers used for reading') - parser.add_argument('--use_amp', - action='store_true', - default=False, - help='Use automatic mixed precision training') - parser.add_argument('--fp16_grad_sync', - action='store_true', - default=False, - help='Use fp16 gradient sync for ddp') - parser.add_argument('--cmvn', default=None, help='global cmvn file') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. 
One symbol per line.") - parser.add_argument('--prefetch', - default=100, - type=int, - help='prefetch number') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument("--enc_init", - default=None, - type=str, - help="Pre-trained model to initialize encoder") - parser.add_argument("--enc_init_mods", - default="encoder.", - type=lambda s: [str(mod) for mod in s.split(",") if s != ""], - help="List of encoder modules \ - to initialize ,separated by a comma") - - - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Set random seed - torch.manual_seed(777) - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - distributed = args.world_size > 1 - if distributed: - logging.info('training on multiple gpus, this gpu {}'.format(args.gpu)) - dist.init_process_group(args.dist_backend, - init_method=args.init_method, - world_size=args.world_size, - rank=args.rank) - - symbol_table = read_symbol_table(args.symbol_table) - - train_conf = configs['dataset_conf'] - cv_conf = copy.deepcopy(train_conf) - cv_conf['speed_perturb'] = False - cv_conf['spec_aug'] = False - cv_conf['spec_sub'] = False - cv_conf['spec_trim'] = False - cv_conf['shuffle'] = False - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - train_dataset = Dataset(args.data_type, args.train_data, symbol_table, - train_conf, args.bpe_model, non_lang_syms, True) - cv_dataset = Dataset(args.data_type, - args.cv_data, - symbol_table, - cv_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - train_data_loader = DataLoader(train_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - cv_data_loader = DataLoader(cv_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - - if 'fbank_conf' in configs['dataset_conf']: - input_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - else: - input_dim = configs['dataset_conf']['mfcc_conf']['num_mel_bins'] - vocab_size = len(symbol_table) - - # Save configs to model_dir/train.yaml for inference and export - configs['input_dim'] = input_dim - configs['output_dim'] = vocab_size - configs['cmvn_file'] = args.cmvn - configs['is_json_cmvn'] = True - if args.rank == 0: - saved_config_path = os.path.join(args.model_dir, 'train.yaml') - with open(saved_config_path, 'w') as fout: - data = yaml.dump(configs) - fout.write(data) - - # Init asr model from configs - model = init_model(configs) - print(model) - num_params = sum(p.numel() for p in model.parameters()) - print('the number of model params: {:,d}'.format(num_params)) - - # !!!IMPORTANT!!! 
- # Try to export the model by script, if fails, we should refine - # the code to satisfy the script export requirements - if args.rank == 0: - script_model = torch.jit.script(model) - script_model.save(os.path.join(args.model_dir, 'init.zip')) - executor = Executor() - # If specify checkpoint, load some info from checkpoint - if args.checkpoint is not None: - infos = load_checkpoint(model, args.checkpoint) - elif args.enc_init is not None: - logging.info('load pretrained encoders: {}'.format(args.enc_init)) - infos = load_trained_modules(model, args) - else: - infos = {} - start_epoch = infos.get('epoch', -1) + 1 - cv_loss = infos.get('cv_loss', 0.0) - step = infos.get('step', -1) - - num_epochs = configs.get('max_epoch', 100) - model_dir = args.model_dir - writer = None - if args.rank == 0: - os.makedirs(model_dir, exist_ok=True) - exp_id = os.path.basename(model_dir) - writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id)) - - if distributed: - assert (torch.cuda.is_available()) - # cuda model is required for nn.parallel.DistributedDataParallel - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, find_unused_parameters=True) - device = torch.device("cuda") - if args.fp16_grad_sync: - from torch.distributed.algorithms.ddp_comm_hooks import ( - default as comm_hooks, - ) - model.register_comm_hook( - state=None, hook=comm_hooks.fp16_compress_hook - ) - else: - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - if configs['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['optim_conf']) - elif configs['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['optim']) - if configs['scheduler'] == 'warmuplr': - scheduler = WarmupLR(optimizer, **configs['scheduler_conf']) - elif configs['scheduler'] == 'NoamHoldAnnealing': - scheduler = NoamHoldAnnealing(optimizer, **configs['scheduler_conf']) - else: - raise ValueError("unknown scheduler: " + configs['scheduler']) - - final_epoch = None - configs['rank'] = args.rank - configs['is_distributed'] = distributed - configs['use_amp'] = args.use_amp - if start_epoch == 0 and args.rank == 0: - save_model_path = os.path.join(model_dir, 'init.pt') - save_checkpoint(model, save_model_path) - - # Start training loop - executor.step = step - scheduler.set_step(step) - # used for pytorch amp mixed precision training - scaler = None - if args.use_amp: - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(start_epoch, num_epochs): - train_dataset.set_epoch(epoch) - configs['epoch'] = epoch - lr = optimizer.param_groups[0]['lr'] - logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr)) - executor.train(model, optimizer, scheduler, train_data_loader, device, - writer, configs, scaler) - total_loss, num_seen_utts = executor.cv(model, cv_data_loader, device, - configs) - cv_loss = total_loss / num_seen_utts - - logging.info('Epoch {} CV info cv_loss {}'.format(epoch, cv_loss)) - if args.rank == 0: - save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch)) - save_checkpoint( - model, save_model_path, { - 'epoch': epoch, - 'lr': lr, - 'cv_loss': cv_loss, - 'step': executor.step - }) - writer.add_scalar('epoch/cv_loss', cv_loss, epoch) - writer.add_scalar('epoch/lr', lr, epoch) - final_epoch = epoch - - if final_epoch is not None and args.rank == 0: - final_model_path = os.path.join(model_dir, 'final.pt') 
- os.remove(final_model_path) if os.path.exists(final_model_path) else None - os.symlink('{}.pt'.format(final_epoch), final_model_path) - writer.close() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/dataset/dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/dataset/dataset.py deleted file mode 100644 index 6d799b5b5aea2d34546484b3fed5d45e2d5b6aa6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/dataset/dataset.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import torch -import torch.distributed as dist -from torch.utils.data import IterableDataset - -import wenet.dataset.processor as processor -from wenet.utils.file_utils import read_lists - - -class Processor(IterableDataset): - def __init__(self, source, f, *args, **kw): - assert callable(f) - self.source = source - self.f = f - self.args = args - self.kw = kw - - def set_epoch(self, epoch): - self.source.set_epoch(epoch) - - def __iter__(self): - """ Return an iterator over the source dataset processed by the - given processor. 
- """ - assert self.source is not None - assert callable(self.f) - return self.f(iter(self.source), *self.args, **self.kw) - - def apply(self, f): - assert callable(f) - return Processor(self, f, *self.args, **self.kw) - - -class DistributedSampler: - def __init__(self, shuffle=True, partition=True): - self.epoch = -1 - self.update() - self.shuffle = shuffle - self.partition = partition - - def update(self): - assert dist.is_available() - if dist.is_initialized(): - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() - else: - self.rank = 0 - self.world_size = 1 - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: - self.worker_id = 0 - self.num_workers = 1 - else: - self.worker_id = worker_info.id - self.num_workers = worker_info.num_workers - return dict(rank=self.rank, - world_size=self.world_size, - worker_id=self.worker_id, - num_workers=self.num_workers) - - def set_epoch(self, epoch): - self.epoch = epoch - - def sample(self, data): - """ Sample data according to rank/world_size/num_workers - - Args: - data(List): input data list - - Returns: - List: data list after sample - """ - data = list(range(len(data))) - # TODO(Binbin Zhang): fix this - # We can not handle uneven data for CV on DDP, so we don't - # sample data by rank, that means every GPU gets the same - # and all the CV data - if self.partition: - if self.shuffle: - random.Random(self.epoch).shuffle(data) - data = data[self.rank::self.world_size] - data = data[self.worker_id::self.num_workers] - return data - - -class DataList(IterableDataset): - def __init__(self, lists, shuffle=True, partition=True): - self.lists = lists - self.sampler = DistributedSampler(shuffle, partition) - - def set_epoch(self, epoch): - self.sampler.set_epoch(epoch) - - def __iter__(self): - sampler_info = self.sampler.update() - indexes = self.sampler.sample(self.lists) - for index in indexes: - # yield dict(src=src) - data = dict(src=self.lists[index]) - data.update(sampler_info) - yield data - - -def Dataset(data_type, - data_list_file, - symbol_table, - conf, - bpe_model=None, - non_lang_syms=None, - partition=True): - """ Construct dataset from arguments - - We have two shuffle stage in the Dataset. The first is global - shuffle at shards tar/raw file level. The second is global shuffle - at training samples level. 
- - Args: - data_type(str): raw/shard - bpe_model(str): model for english bpe part - partition(bool): whether to do data partition in terms of rank - """ - assert data_type in ['raw', 'shard'] - lists = read_lists(data_list_file) - shuffle = conf.get('shuffle', True) - dataset = DataList(lists, shuffle=shuffle, partition=partition) - if data_type == 'shard': - dataset = Processor(dataset, processor.url_opener) - dataset = Processor(dataset, processor.tar_file_and_group) - else: - dataset = Processor(dataset, processor.parse_raw) - - dataset = Processor(dataset, processor.tokenize, symbol_table, bpe_model, - non_lang_syms, conf.get('split_with_space', False)) - filter_conf = conf.get('filter_conf', {}) - dataset = Processor(dataset, processor.filter, **filter_conf) - - resample_conf = conf.get('resample_conf', {}) - dataset = Processor(dataset, processor.resample, **resample_conf) - - speed_perturb = conf.get('speed_perturb', False) - if speed_perturb: - dataset = Processor(dataset, processor.speed_perturb) - - feats_type = conf.get('feats_type', 'fbank') - assert feats_type in ['fbank', 'mfcc'] - if feats_type == 'fbank': - fbank_conf = conf.get('fbank_conf', {}) - dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) - elif feats_type == 'mfcc': - mfcc_conf = conf.get('mfcc_conf', {}) - dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf) - - spec_aug = conf.get('spec_aug', True) - spec_sub = conf.get('spec_sub', False) - spec_trim = conf.get('spec_trim', False) - if spec_aug: - spec_aug_conf = conf.get('spec_aug_conf', {}) - dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) - if spec_sub: - spec_sub_conf = conf.get('spec_sub_conf', {}) - dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf) - if spec_trim: - spec_trim_conf = conf.get('spec_trim_conf', {}) - dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf) - - if shuffle: - shuffle_conf = conf.get('shuffle_conf', {}) - dataset = Processor(dataset, processor.shuffle, **shuffle_conf) - - sort = conf.get('sort', True) - if sort: - sort_conf = conf.get('sort_conf', {}) - dataset = Processor(dataset, processor.sort, **sort_conf) - - batch_conf = conf.get('batch_conf', {}) - dataset = Processor(dataset, processor.batch, **batch_conf) - dataset = Processor(dataset, processor.padding) - return dataset diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/dataset/kaldi_io.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/dataset/kaldi_io.py deleted file mode 100644 index c9bef293c93d882147bb5b738e1fc49a7a19a484..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/dataset/kaldi_io.py +++ /dev/null @@ -1,666 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! 
To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. - """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. 
- """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int_scp(file_or_fd): - """ generator(key,vec) = read_vec_int_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_int_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:vec for key,mat in kaldi_io.read_vec_int_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_int(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! 
- if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Mapping for percentiles in col-headers, - def uint16_to_float(value, min, range): - return np.float32(min + range * 1.52590218966964e-05 * value) - - # Mapping for matrix elements, - def uint8_to_float_v2(vec, p0, p25, p75, p100): - # Split the vector by masks, - mask_0_64 = (vec <= 64); - mask_193_255 = (vec > 192); - mask_65_192 = (~(mask_0_64 | mask_193_255)); - # Sanity check (useful but slow...), - # assert(len(vec) == np.sum(np.hstack([mask_0_64,mask_65_192,mask_193_255]))) - # assert(len(vec) == np.sum(np.any([mask_0_64,mask_65_192,mask_193_255], axis=0))) - # Build the float vector, - ans = np.empty(len(vec), dtype='float32') - ans[mask_0_64] = p0 + (p25 - p0) / 64. * vec[mask_0_64] - ans[mask_65_192] = p25 + (p75 - p25) / 128. * (vec[mask_65_192] - 64) - ans[mask_193_255] = p75 + (p100 - p75) / 63. * (vec[mask_193_255] - 192) - return ans - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.empty((cols,rows), dtype='float32') - for i, col_header in enumerate(col_headers): - col_header_flt = [ uint16_to_float(percentile, globmin, globrange) for percentile in col_header ] - mat[i] = uint8_to_float_v2(data[i], *col_header_flt) - - return mat.T # transpose! col-major -> row-major, - -def write_ark_scp(key, mat, ark_fout, scp_out): - mat_offset = write_mat(ark_fout, mat, key) - scp_line = '{}\t{}:{}'.format(key, ark_fout.name, mat_offset) - scp_out.write(scp_line) - scp_out.write('\n') - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. 
-
-  Example of writing single matrix:
-   kaldi_io.write_mat(filename, mat)
-
-  Example of writing arkfile:
-   with open(ark_file,'w') as f:
-     for key,mat in dict.iteritems():
-       kaldi_io.write_mat(f, mat, key=key)
-  """
-  mat_offset = 0
-  fd = open_or_fd(file_or_fd, mode='wb')
-  if sys.version_info[0] == 3: assert(fd.mode == 'wb')
-  try:
-    if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id),
-    mat_offset = fd.tell()
-    fd.write('\0B'.encode()) # we write binary!
-    # Data-type,
-    if m.dtype == 'float32': fd.write('FM '.encode())
-    elif m.dtype == 'float64': fd.write('DM '.encode())
-    else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype)
-    # Dims,
-    fd.write('\04'.encode())
-    fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows
-    fd.write('\04'.encode())
-    fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols
-    # Data,
-    fd.write(m.tobytes())
-  finally:
-    if fd is not file_or_fd : fd.close()
-  return mat_offset
-
-#################################################
-# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...)
-# Corresponds to: vector<vector<tuple<int,float>>>
-# - outer vector: time axis
-# - inner vector: records at the time
-# - tuple: int = index, float = value
-#
-
-def read_cnet_ark(file_or_fd):
-  """ Alias of function 'read_post_ark()', 'cnet' = confusion network """
-  return read_post_ark(file_or_fd)
-
-def read_post_ark(file_or_fd):
-  """ generator(key,vec<vec<(int,float)>>) = read_post_ark(file)
-   Returns generator of (key,posterior) tuples, read from ark file.
-   file_or_fd : ark, gzipped ark, pipe or opened file descriptor.
-
-   Iterate the ark:
-   for key,post in kaldi_io.read_post_ark(file):
-     ...
-
-   Read ark to a 'dictionary':
-   d = { key:post for key,post in kaldi_io.read_post_ark(file) }
-  """
-  fd = open_or_fd(file_or_fd)
-  try:
-    key = read_key(fd)
-    while key:
-      post = read_post(fd)
-      yield key, post
-      key = read_key(fd)
-  finally:
-    if fd is not file_or_fd: fd.close()
-
-def read_post(file_or_fd):
-  """ [post] = read_post(file_or_fd)
-   Reads single kaldi 'Posterior' in binary format.
-
-   The 'Posterior' is C++ type 'vector<vector<tuple<int,float>>>',
-   the outer-vector is usually time axis, inner-vector are the records
-   at given time, and the tuple is composed of an 'index' (integer)
-   and a 'float-value'. The 'float-value' can represent a probability
-   or any other numeric value.
-
-   Returns vector of vectors of tuples.
-  """
-  fd = open_or_fd(file_or_fd)
-  ans=[]
-  binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag
-  assert(fd.read(1).decode() == '\4'); # int-size
-  outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins)
-
-  # Loop over 'outer-vector',
-  for i in range(outer_vec_size):
-    assert(fd.read(1).decode() == '\4'); # int-size
-    inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin)
-    data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size)
-    assert(data[0]['size_idx'] == 4)
-    assert(data[0]['size_post'] == 4)
-    ans.append(data[['idx','post']].tolist())
-
-  if fd is not file_or_fd: fd.close()
-  return ans
-
-
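As a quick orientation, here is a minimal usage sketch of the ark readers defined above; `feats.ark` and `post.ark` are hypothetical file names, and `kaldi_io` stands for this module imported under that name.

```python
# Minimal sketch, assuming hypothetical ark files and that this module is
# importable as `kaldi_io`; the paths below are placeholders.
import kaldi_io

# Float-matrix ark (e.g. acoustic features): key -> numpy matrix
feats = {key: mat for key, mat in kaldi_io.read_mat_ark('feats.ark')}

# Posterior ark: each value is a list over frames of (index, value) pairs
for key, post in kaldi_io.read_post_ark('post.ark'):
    print(key, len(post), post[0][:3])
```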
-#################################################
-# Kaldi Confusion Network bin begin/end times,
-# (kaldi stores CNs time info separately from the Posterior).
-#
-
-def read_cntime_ark(file_or_fd):
-  """ generator(key,vec<(float,float)>) = read_cntime_ark(file_or_fd)
-   Returns generator of (key,cntime) tuples, read from ark file.
-   file_or_fd : file, gzipped file, pipe or opened file descriptor.
-
-   Iterate the ark:
-   for key,time in kaldi_io.read_cntime_ark(file):
-     ...
-
-   Read ark to a 'dictionary':
-   d = { key:time for key,time in kaldi_io.read_cntime_ark(file) }
-  """
-  fd = open_or_fd(file_or_fd)
-  try:
-    key = read_key(fd)
-    while key:
-      cntime = read_cntime(fd)
-      yield key, cntime
-      key = read_key(fd)
-  finally:
-    if fd is not file_or_fd : fd.close()
-
-def read_cntime(file_or_fd):
-  """ [cntime] = read_cntime(file_or_fd)
-   Reads single kaldi 'Confusion Network time info', in binary format:
-   C++ type: vector<tuple<float,float> >.
-   (begin/end times of bins at the confusion network).
-
-   Binary layout is '<num-bins> <t_beg_1> <t_end_1> <t_beg_2> <t_end_2> ...'
-
-   file_or_fd : file, gzipped file, pipe or opened file descriptor.
-
-   Returns vector of tuples.
-  """
-  fd = open_or_fd(file_or_fd)
-  binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary
-
-  assert(fd.read(1).decode() == '\4'); # int-size
-  vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins)
-
-  data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size)
-  assert(data[0]['size_beg'] == 4)
-  assert(data[0]['size_end'] == 4)
-  ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end),
-
-  if fd is not file_or_fd : fd.close()
-  return ans
-
-
-#################################################
-# Segments related,
-#
-
-# Segments as 'Bool vectors' can be handy,
-# - for 'superposing' the segmentations,
-# - for frame-selection in Speaker-ID experiments,
-def read_segments_as_bool_vec(segments_file):
-  """ [ bool_vec ] = read_segments_as_bool_vec(segments_file)
-   using kaldi 'segments' file for 1 wav, format : '<segment-name> <recording-name> <t-beg> <t-end>'
-   - t-beg, t-end is in seconds,
-   - assumed 100 frames/second,
-  """
-  segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1)
-  # Sanity checks,
-  assert(len(segs) > 0) # empty segmentation is an error,
-  assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file,
-  # Convert time to frame-indexes,
-  start = np.rint([100 * rec[2] for rec in segs]).astype(int)
-  end = np.rint([100 * rec[3] for rec in segs]).astype(int)
-  # Taken from 'read_lab_to_bool_vec', htk.py,
-  frms = np.repeat(np.r_[np.tile([False,True], len(end)), False],
-                   np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0])
-  assert np.sum(end-start) == np.sum(frms)
-  return frms
-
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/dataset/processor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/dataset/processor.py
deleted file mode 100644
index b4bd07ce674eb3288cd1b13a09085eec48d40845..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/dataset/processor.py
+++ /dev/null
@@ -1,660 +0,0 @@
-# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import json -import random -import re -import tarfile -from subprocess import PIPE, Popen -from urllib.parse import urlparse - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.nn.utils.rnn import pad_sequence - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def url_opener(data): - """ Give url or local file, return file descriptor - Inplace operation. - - Args: - data(Iterable[str]): url or local file list - - Returns: - Iterable[{src, stream}] - """ - for sample in data: - assert 'src' in sample - # TODO(Binbin Zhang): support HTTP - url = sample['src'] - try: - pr = urlparse(url) - # local file - if pr.scheme == '' or pr.scheme == 'file': - stream = open(url, 'rb') - # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP - else: - cmd = f'wget -q -O - {url}' - process = Popen(cmd, shell=True, stdout=PIPE) - sample.update(process=process) - stream = process.stdout - sample.update(stream=stream) - yield sample - except Exception as ex: - logging.warning('Failed to open {}'.format(url)) - - -def tar_file_and_group(data): - """ Expand a stream of open tar files into a stream of tar file contents. - And groups the file with same prefix - - Args: - data: Iterable[{src, stream}] - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'stream' in sample - stream = tarfile.open(fileobj=sample['stream'], mode="r|*") - prev_prefix = None - example = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - example['key'] = prev_prefix - if valid: - yield example - example = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - example['txt'] = file_obj.read().decode('utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load(file_obj) - example['wav'] = waveform - example['sample_rate'] = sample_rate - else: - example[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning('error to parse {}'.format(name)) - prev_prefix = prefix - if prev_prefix is not None: - example['key'] = prev_prefix - yield example - stream.close() - if 'process' in sample: - sample['process'].communicate() - sample['stream'].close() - - -def parse_raw(data): - """ Parse key/wav/txt from json line - - Args: - data: Iterable[str], str is a json line has key/wav/txt - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'src' in sample - json_line = sample['src'] - obj = json.loads(json_line) - assert 'key' in obj - assert 'wav' in obj - assert 'txt' in obj - key = obj['key'] - wav_file = obj['wav'] - txt = obj['txt'] - try: - if 'start' in obj: - assert 'end' in obj - sample_rate = torchaudio.backend.sox_io_backend.info( - wav_file).sample_rate - start_frame = int(obj['start'] * sample_rate) - end_frame = int(obj['end'] * sample_rate) - waveform, _ = torchaudio.backend.sox_io_backend.load( - 
filepath=wav_file, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(wav_file) - example = dict(key=key, - txt=txt, - wav=waveform, - sample_rate=sample_rate) - yield example - except Exception as ex: - logging.warning('Failed to read {}'.format(wav_file)) - - -def filter(data, - max_length=10240, - min_length=10, - token_max_length=200, - token_min_length=1, - min_output_input_ratio=0.0005, - max_output_input_ratio=1): - """ Filter sample according to feature and label length - Inplace operation. - - Args:: - data: Iterable[{key, wav, label, sample_rate}] - max_length: drop utterance which is greater than max_length(10ms) - min_length: drop utterance which is less than min_length(10ms) - token_max_length: drop utterance which is greater than - token_max_length, especially when use char unit for - english modeling - token_min_length: drop utterance which is - less than token_max_length - min_output_input_ratio: minimal ration of - token_length / feats_length(10ms) - max_output_input_ratio: maximum ration of - token_length / feats_length(10ms) - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'label' in sample - # sample['wav'] is torch.Tensor, we have 100 frames every second - num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100 - if num_frames < min_length: - continue - if num_frames > max_length: - continue - if len(sample['label']) < token_min_length: - continue - if len(sample['label']) > token_max_length: - continue - if num_frames != 0: - if len(sample['label']) / num_frames < min_output_input_ratio: - continue - if len(sample['label']) / num_frames > max_output_input_ratio: - continue - yield sample - - -def resample(data, resample_rate=16000): - """ Resample data. - Inplace operation. - - Args: - data: Iterable[{key, wav, label, sample_rate}] - resample_rate: target resample rate - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - if sample_rate != resample_rate: - sample['sample_rate'] = resample_rate - sample['wav'] = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - yield sample - - -def speed_perturb(data, speeds=None): - """ Apply speed perturb to the data. - Inplace operation. 
- - Args: - data: Iterable[{key, wav, label, sample_rate}] - speeds(List[float]): optional speed - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - if speeds is None: - speeds = [0.9, 1.0, 1.1] - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - speed = random.choice(speeds) - if speed != 1.0: - wav, _ = torchaudio.sox_effects.apply_effects_tensor( - waveform, sample_rate, - [['speed', str(speed)], ['rate', str(sample_rate)]]) - sample['wav'] = wav - - yield sample - - -def compute_fbank(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0): - """ Extract fbank - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - energy_floor=0.0, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def compute_mfcc(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0, - num_ceps=40, - high_freq=0.0, - low_freq=20.0): - """ Extract mfcc - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.mfcc(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - num_ceps=num_ceps, - high_freq=high_freq, - low_freq=low_freq, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def __tokenize_by_bpe_model(sp, txt): - tokens = [] - # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - pattern = re.compile(r'([\u4e00-\u9fff])') - # Example: - # txt = "你好 ITS'S OKAY 的" - # chars = ["你", "好", " ITS'S OKAY ", "的"] - chars = pattern.split(txt.upper()) - mix_chars = [w for w in chars if len(w.strip()) > 0] - for ch_or_w in mix_chars: - # ch_or_w is a single CJK charater(i.e., "你"), do nothing. - if pattern.fullmatch(ch_or_w) is not None: - tokens.append(ch_or_w) - # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), - # encode ch_or_w using bpe_model. 
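        # (for the example above, "你" and "好" stay single tokens, while
        #  " IT'S OKAY " is split into BPE pieces by sp.encode_as_pieces below)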
- else: - for p in sp.encode_as_pieces(ch_or_w): - tokens.append(p) - - return tokens - - -def tokenize(data, - symbol_table, - bpe_model=None, - non_lang_syms=None, - split_with_space=False): - """ Decode text to chars or BPE - Inplace operation - - Args: - data: Iterable[{key, wav, txt, sample_rate}] - - Returns: - Iterable[{key, wav, txt, tokens, label, sample_rate}] - """ - if non_lang_syms is not None: - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - else: - non_lang_syms = {} - non_lang_syms_pattern = None - - if bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(bpe_model) - else: - sp = None - - for sample in data: - assert 'txt' in sample - txt = sample['txt'].strip() - if non_lang_syms_pattern is not None: - parts = non_lang_syms_pattern.split(txt.upper()) - parts = [w for w in parts if len(w.strip()) > 0] - else: - parts = [txt] - - label = [] - tokens = [] - for part in parts: - if part in non_lang_syms: - tokens.append(part) - else: - if bpe_model is not None: - tokens.extend(__tokenize_by_bpe_model(sp, part)) - else: - if split_with_space: - part = part.split(" ") - for ch in part: - if ch == ' ': - ch = "▁" - tokens.append(ch) - - for ch in tokens: - if ch in symbol_table: - label.append(symbol_table[ch]) - elif '' in symbol_table: - label.append(symbol_table['']) - - sample['tokens'] = tokens - sample['label'] = label - yield sample - - -def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80): - """ Do spec augmentation - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - num_t_mask: number of time mask to apply - num_f_mask: number of freq mask to apply - max_t: max width of time mask - max_f: max width of freq mask - max_w: max width of time warp - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - max_freq = y.size(1) - # time mask - for i in range(num_t_mask): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - y[start:end, :] = 0 - # freq mask - for i in range(num_f_mask): - start = random.randint(0, max_freq - 1) - length = random.randint(1, max_f) - end = min(max_freq, start + length) - y[:, start:end] = 0 - sample['feat'] = y - yield sample - - -def spec_sub(data, max_t=20, num_t_sub=3): - """ Do spec substitute - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of time substitute - num_t_sub: number of time substitute to apply - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - for i in range(num_t_sub): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - # only substitute the earlier time chosen randomly for current time - pos = random.randint(0, start) - y[start:end, :] = x[start - pos:end - pos, :] - sample['feat'] = y - yield sample - - -def spec_trim(data, max_t=20): - """ Trim tailing frames. Inplace operation. 
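        (drops a random 1..max_t trailing frames, but only when that is less
         than half of the utterance length)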
- ref: TrimTail [https://arxiv.org/abs/2211.00522] - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of length trimming - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - max_frames = x.size(0) - length = random.randint(1, max_t) - if length < max_frames / 2: - y = x.clone().detach()[:max_frames - length] - sample['feat'] = y - yield sample - - -def shuffle(data, shuffle_size=10000): - """ Local shuffle the data - - Args: - data: Iterable[{key, feat, label}] - shuffle_size: buffer size for shuffle - - Returns: - Iterable[{key, feat, label}] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= shuffle_size: - random.shuffle(buf) - for x in buf: - yield x - buf = [] - # The sample left over - random.shuffle(buf) - for x in buf: - yield x - - -def sort(data, sort_size=500): - """ Sort the data by feature length. - Sort is used after shuffle and before batch, so we can group - utts with similar lengths into a batch, and `sort_size` should - be less than `shuffle_size` - - Args: - data: Iterable[{key, feat, label}] - sort_size: buffer size for sort - - Returns: - Iterable[{key, feat, label}] - """ - - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= sort_size: - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - buf = [] - # The sample left over - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - - -def static_batch(data, batch_size=16): - """ Static batch the data by `batch_size` - - Args: - data: Iterable[{key, feat, label}] - batch_size: batch size - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= batch_size: - yield buf - buf = [] - if len(buf) > 0: - yield buf - - -def dynamic_batch(data, max_frames_in_batch=12000): - """ Dynamic batch the data until the total frames in batch - reach `max_frames_in_batch` - - Args: - data: Iterable[{key, feat, label}] - max_frames_in_batch: max_frames in one batch - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - longest_frames = 0 - for sample in data: - assert 'feat' in sample - assert isinstance(sample['feat'], torch.Tensor) - new_sample_frames = sample['feat'].size(0) - longest_frames = max(longest_frames, new_sample_frames) - frames_after_padding = longest_frames * (len(buf) + 1) - if frames_after_padding > max_frames_in_batch: - yield buf - buf = [sample] - longest_frames = new_sample_frames - else: - buf.append(sample) - if len(buf) > 0: - yield buf - - -def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000): - """ Wrapper for static/dynamic batch - """ - if batch_type == 'static': - return static_batch(data, batch_size) - elif batch_type == 'dynamic': - return dynamic_batch(data, max_frames_in_batch) - else: - logging.fatal('Unsupported batch type {}'.format(batch_type)) - - -def padding(data): - """ Padding the data into training data - - Args: - data: Iterable[List[{key, feat, label}]] - - Returns: - Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] - """ - for sample in data: - assert isinstance(sample, list) - feats_length = torch.tensor([x['feat'].size(0) for x in sample], - dtype=torch.int32) - order = torch.argsort(feats_length, descending=True) - feats_lengths = torch.tensor( - [sample[i]['feat'].size(0) for i in order], dtype=torch.int32) - sorted_feats = [sample[i]['feat'] for i in order] - sorted_keys 
= [sample[i]['key'] for i in order] - sorted_labels = [ - torch.tensor(sample[i]['label'], dtype=torch.int64) for i in order - ] - label_lengths = torch.tensor([x.size(0) for x in sorted_labels], - dtype=torch.int32) - - padded_feats = pad_sequence(sorted_feats, - batch_first=True, - padding_value=0) - - pad = (0, 0, 0, 0) - seq_len= padded_feats.shape[1] - if seq_len < 384: - pad = (0, 0, 0, 384-seq_len) - elif seq_len < 512: - pad = (0, 0, 0, 512-seq_len) - elif seq_len < 640: - pad = (0, 0, 0, 640-seq_len) - elif seq_len < 768: - pad = (0, 0, 0, 768-seq_len) - elif seq_len < 896: - pad = (0, 0, 0, 896-seq_len) - elif seq_len < 1024: - pad = (0, 0, 0, 1024-seq_len) - elif seq_len < 1280: - pad = (0, 0, 0, 1280-seq_len) - padded_feats = torch.nn.functional.pad(padded_feats, pad, mode='constant', value=0) - padding_labels = pad_sequence(sorted_labels, - batch_first=True, - padding_value=-1) - - yield (sorted_keys, padded_feats, padding_labels, feats_lengths, - label_lengths) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/dataset/wav_distortion.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/dataset/wav_distortion.py deleted file mode 100644 index 2917d3cc6cfb801935cb0885d0c42cd88f1833b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/dataset/wav_distortion.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import random -import math - -import torchaudio -import torch -torchaudio.set_audio_backend("sox_io") - - -def db2amp(db): - return pow(10, db / 20) - -def amp2db(amp): - return 20 * math.log10(amp) - -def make_poly_distortion(conf): - """Generate a db-domain ploynomial distortion function - - f(x) = a * x^m * (1-x)^n + x - - Args: - conf: a dict {'a': #int, 'm': #int, 'n': #int} - - Returns: - The ploynomial function, which could be applied on - a float amplitude value - """ - a = conf['a'] - m = conf['m'] - n = conf['n'] - - def poly_distortion(x): - abs_x = abs(x) - if abs_x < 0.000001: - x = x - else: - db_norm = amp2db(abs_x) / 100 + 1 - if db_norm < 0: - db_norm = 0 - db_norm = a * pow(db_norm, m) * pow((1 - db_norm), n) + db_norm - if db_norm > 1: - db_norm = 1 - db = (db_norm - 1) * 100 - amp = db2amp(db) - if amp >= 0.9997: - amp = 0.9997 - if x > 0: - x = amp - else: - x = -amp - return x - return poly_distortion - -def make_quad_distortion(): - return make_poly_distortion({'a' : 1, 'm' : 1, 'n' : 1}) - -# the amplitude are set to max for all non-zero point -def make_max_distortion(conf): - """Generate a max distortion function - - Args: - conf: a dict {'max_db': float } - 'max_db': the maxium value. 
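                   (e.g. max_db = -30 clips every non-zero sample to
                    +/- db2amp(-30) ~= 0.0316; a falsy max_db falls back to 0.997)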
- - Returns: - The max function, which could be applied on - a float amplitude value - """ - max_db = conf['max_db'] - if max_db: - max_amp = db2amp(max_db) # < 0.997 - else: - max_amp = 0.997 - - def max_distortion(x): - if x > 0: - x = max_amp - elif x < 0: - x = -max_amp - else: - x = 0.0 - return x - return max_distortion - - - -def make_amp_mask(db_mask=None): - """Get a amplitude domain mask from db domain mask - - Args: - db_mask: Optional. A list of tuple. if None, using default value. - - Returns: - A list of tuple. The amplitude domain mask - """ - if db_mask is None: - db_mask = [(-110, -95), (-90, -80), (-65, -60), (-50, -30), (-15, 0)] - amp_mask = [(db2amp(db[0]), db2amp(db[1])) for db in db_mask] - return amp_mask - -default_mask = make_amp_mask() - - -def generate_amp_mask(mask_num): - """Generate amplitude domain mask randomly in [-100db, 0db] - - Args: - mask_num: the slot number of the mask - - Returns: - A list of tuple. each tuple defines a slot. - e.g. [(-100, -80), (-65, -60), (-50, -30), (-15, 0)] - for #mask_num = 4 - """ - a = [0] * 2 * mask_num - a[0] = 0 - m = [] - for i in range(1, 2 * mask_num): - a[i] = a[i - 1] + random.uniform(0.5, 1) - max_val = a[2 * mask_num - 1] - for i in range(0, mask_num): - l = ((a[2 * i] - max_val) / max_val) * 100 - r = ((a[2 * i + 1] - max_val) / max_val) * 100 - m.append((l, r)) - return make_amp_mask(m) - - -def make_fence_distortion(conf): - """Generate a fence distortion function - - In this fence-like shape function, the values in mask slots are - set to maxium, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': int,'max_db': float } - 'mask_number': the slot number in mask. - 'max_db': the maxium value. - - Returns: - The fence function, which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - max_db = conf['max_db'] - max_amp = db2amp(max_db) # 0.997 - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def fence_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - return x - - return fence_distortion - -# -def make_jag_distortion(conf): - """Generate a jag distortion function - - In this jag-like shape function, the values in mask slots are - not changed, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': #int} - 'mask_number': the slot number in mask. 
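                        (mask_number <= 0 falls back to the default mask; otherwise
                         mask_number random slots in [-100 dB, 0 dB] are drawn via
                         generate_amp_mask, separately for positive and negative samples)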
- - Returns: - The jag function,which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def jag_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - return x - - return jag_distortion - -# gaining 20db means amp = amp * 10 -# gaining -20db means amp = amp / 10 -def make_gain_db(conf): - """Generate a db domain gain function - - Args: - conf: a dict {'db': #float} - 'db': the gaining value - - Returns: - The db gain function, which could be applied on - a float amplitude value - """ - db = conf['db'] - - def gain_db(x): - return min(0.997, x * pow(10, db / 20)) - - return gain_db - - -def distort(x, func, rate=0.8): - """Distort a waveform in sample point level - - Args: - x: the origin wavefrom - func: the distort function - rate: sample point-level distort probability - - Returns: - the distorted waveform - """ - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - x[0][i] = func(float(x[0][i])) - return x - -def distort_chain(x, funcs, rate=0.8): - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - for func in funcs: - x[0][i] = func(float(x[0][i])) - return x - -# x is numpy -def distort_wav_conf(x, distort_type, distort_conf, rate=0.1): - if distort_type == 'gain_db': - gain_db = make_gain_db(distort_conf) - x = distort(x, gain_db) - elif distort_type == 'max_distortion': - max_distortion = make_max_distortion(distort_conf) - x = distort(x, max_distortion, rate=rate) - elif distort_type == 'fence_distortion': - fence_distortion = make_fence_distortion(distort_conf) - x = distort(x, fence_distortion, rate=rate) - elif distort_type == 'jag_distortion': - jag_distortion = make_jag_distortion(distort_conf) - x = distort(x, jag_distortion, rate=rate) - elif distort_type == 'poly_distortion': - poly_distortion = make_poly_distortion(distort_conf) - x = distort(x, poly_distortion, rate=rate) - elif distort_type == 'quad_distortion': - quad_distortion = make_quad_distortion() - x = distort(x, quad_distortion, rate=rate) - elif distort_type == 'none_distortion': - pass - else: - print('unsupport type') - return x - -def distort_wav_conf_and_save(distort_type, distort_conf, rate, wav_in, wav_out): - x, sr = torchaudio.load(wav_in) - x = x.detach().numpy() - out = distort_wav_conf(x, distort_type, distort_conf, rate) - torchaudio.save(wav_out, torch.from_numpy(out), sr) - -if __name__ == "__main__": - distort_type = sys.argv[1] - wav_in = sys.argv[2] - wav_out = sys.argv[3] - conf = None - rate = 0.1 - if distort_type == 'new_jag_distortion': - conf = {'mask_number' : 4} - elif distort_type == 'new_fence_distortion': - conf = {'mask_number' : 1, 'max_db' : -30} - elif distort_type == 'poly_distortion': - conf = {'a' : 4, 'm' : 2, "n" : 2} - distort_wav_conf_and_save(distort_type, conf, rate, wav_in, wav_out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/efficient_conformer/attention.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/efficient_conformer/attention.py deleted file mode 100644 index 475131b15af92ffcaf91ad5e2e30d114d4d1a2a3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/efficient_conformer/attention.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple, Optional - -import torch -from torch import nn -import torch.nn.functional as F -from wenet.transformer.attention import MultiHeadedAttention - - -class GroupedRelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: - https://arxiv.org/abs/1901.02860 - https://arxiv.org/abs/2109.01163 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate, group_size=3): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - self.group_size = group_size - self.d_k = n_feat // n_head # for GroupedAttention - self.n_feat = n_feat - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. 
- """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def pad4group(self, Q, K, V, P, mask, group_size: int = 3): - """ - q: (#batch, time1, size) -> (#batch, head, time1, size/head) - k,v: (#batch, time2, size) -> (#batch, head, time2, size/head) - p: (#batch, time2, size) - """ - # Compute Overflows - overflow_Q = Q.size(2) % group_size - overflow_KV = K.size(2) % group_size - - # if-else for ONNX export - # 0 // 0.00000000000000001 = 0 - # 1 // 1.00000000000000001 = 1 - padding_Q = (group_size - overflow_Q) * int( - overflow_Q // (overflow_Q + 0.00000000000000001)) - padding_KV = (group_size - overflow_KV) * int( - overflow_KV // (overflow_KV + 0.00000000000000001)) - - batch_size, _, seq_len_KV, _ = K.size() - - # Input Padding (B, T, D) -> (B, T + P, D) - Q = F.pad(Q, (0, 0, 0, padding_Q), value=0.0) - K = F.pad(K, (0, 0, 0, padding_KV), value=0.0) - V = F.pad(V, (0, 0, 0, padding_KV), value=0.0) - - if mask is not None and mask.size(2) > 0 : # time2 > 0: - mask = mask[:, ::group_size, ::group_size] - - Q = Q.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - K = K.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - V = V.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - # process pos_emb - P_batch_size = P.size(0) - overflow_P = P.size(1) % group_size - padding_P = group_size - overflow_P if overflow_P else 0 - P = F.pad(P, (0, 0, 0, padding_P), value=0.0) - P = P.view(P_batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - return Q, K, V, P, mask, padding_Q - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - padding_q: Optional[int] = None - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - padding_q : for GroupedAttention in efficent conformer - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. 
jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - - # n_feat!=h*d_k may be happened in GroupAttention - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.n_feat) - ) # (batch, time1, d_model) - if padding_q is not None: - # for GroupedAttention in efficent conformer - x = x[:, :x.size(1) - padding_q] - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q = self.linear_q(query) - k = self.linear_k(key) # (#batch, time2, size) - v = self.linear_v(value) - p = self.linear_pos(pos_emb) # (#batch, time2, size) - - batch_size, seq_len_KV, _ = k.size() # seq_len_KV = time2 - - # (#batch, time2, size) -> (#batch, head, time2, size/head) - q = q.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - k = k.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - v = v.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - if cache.size(0) > 0: - # use attention cache - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - new_cache = torch.cat((k, v), dim=-1) - - # May be k and p does not match. eg. time2=18+18/2=27 > mask=36/2=18 - if mask is not None and mask.size(2) > 0: - time2 = mask.size(2) - k = k[:, :, -time2:, :] - v = v[:, :, -time2:, :] - - # q k v p: (batch, head, time1, d_k) - q, k, v, p, mask, padding_q = self.pad4group(q, k, v, p, mask, self.group_size) - - # q_with_bias_u & q_with_bias_v = (batch, head, time1, d_k) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. 
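        # The score below is (matrix_ac + matrix_bd) / sqrt(d_k * group_size):
        # the Transformer-XL terms (a)+(c) and (b)+(d), with d_k scaled by
        # group_size because each grouped head attends over concatenations of
        # group_size frames (see pad4group above).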
- # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k * self.group_size) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask, padding_q), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/efficient_conformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/efficient_conformer/convolution.py deleted file mode 100644 index 52d6c1c14c0812ab3957a60a135f644833c2ad95..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/efficient_conformer/convolution.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - stride: int = 1): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - stride (int): Stride Convolution, for efficient Conformer - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. 
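        # (e.g. kernel_size=15: causal -> lorder=14 frames of left padding added
        #  in forward; non-causal -> lorder=0 and symmetric padding=(15-1)//2=7)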
- # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=stride, # for depthwise_conv in StrideConv - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - self.stride = stride - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - # When export ONNX,the first cache is not None but all-zero, - # cause shape error in residual block, - # eg. cache14 + x9 = 23, 23-7+1=17 != 9 - cache = cache[:, :, -self.lorder:] - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is requried, - # However, for JIT export, here we just fake one tensor instead of - # None. 
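            # (a zero-sized (0, 0, 0) tensor keeps the signature Tensor-only for
            #  JIT export; callers treat cache.size(2) == 0 as "no cache")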
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - if mask_pad.size(2) != x.size(2): - mask_pad = mask_pad[:, :, ::self.stride] - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/efficient_conformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/efficient_conformer/encoder.py deleted file mode 100644 index dbd37f53cac86be851e2bb194354fd07eb271f11..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/efficient_conformer/encoder.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from EfficientConformer(https://github.com/burchim/EfficientConformer) -# Paper(https://arxiv.org/abs/2109.01163) - -"""Encoder definition.""" -from typing import Tuple, Optional, List, Union - -import torch -import logging -from typeguard import check_argument_types -import torch.nn.functional as F - -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.encoder_layer import ConformerEncoderLayer - -from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 -from wenet.efficient_conformer.convolution import ConvolutionModule -from wenet.efficient_conformer.attention import GroupedRelPositionMultiHeadedAttention -from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer - -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class EfficientConformerEncoder(torch.nn.Module): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - macaron_style: bool = True, - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - stride_layer_idx: Optional[Union[int, List[int]]] = 3, - stride: Optional[Union[int, List[int]]] = 2, - group_layer_idx: Optional[Union[int, List[int], tuple]] = (0, 1, 2, 3), - group_size: int = 3, - stride_kernel: bool = True, - **kwargs - ): - """Construct Efficient Conformer Encoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - macaron_style (bool): Whether to use macaron style for - positionwise layer. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. - stride_layer_idx (list): layer id with StrideConv, start from 0 - stride (list): stride size of each StrideConv in efficient conformer - group_layer_idx (list): layer id with GroupedAttention, start from 0 - group_size (int): group size of every GroupedAttention layer - stride_kernel (bool): default True. True: recompute cnn kernels with stride. 
- """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d2": - subsampling_class = Conv2dSubsampling2 - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - logging.info(f"input_layer = {input_layer}, " - f"subsampling_class = {subsampling_class}") - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - self.input_layer = input_layer - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - activation = get_activation(activation_type) - self.num_blocks = num_blocks - self.attention_heads = attention_heads - self.cnn_module_kernel = cnn_module_kernel - self.global_chunk_size = 0 - self.chunk_feature_map = 0 - - # efficient conformer configs - self.stride_layer_idx = [stride_layer_idx] \ - if type(stride_layer_idx) == int else stride_layer_idx - self.stride = [stride] \ - if type(stride) == int else stride - self.group_layer_idx = [group_layer_idx] \ - if type(group_layer_idx) == int else group_layer_idx - self.grouped_size = group_size # group size of every GroupedAttention layer - - assert len(self.stride) == len(self.stride_layer_idx) - self.cnn_module_kernels = [cnn_module_kernel] # kernel size of each StridedConv - for i in self.stride: - if stride_kernel: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1] // i) - else: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1]) - - logging.info(f"stride_layer_idx= {self.stride_layer_idx}, " - f"stride = {self.stride}, " - f"cnn_module_kernel = {self.cnn_module_kernels}, " - f"group_layer_idx = {self.group_layer_idx}, " - f"grouped_size = {self.grouped_size}") - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - - # encoder definition - index = 0 - layers = [] - for i in range(num_blocks): - # self-attention module definition - if i in self.group_layer_idx: - encoder_selfattn_layer = GroupedRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - self.grouped_size) - else: - if pos_enc_layer_type == "no_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate) - - # conformer module definition - if i in self.stride_layer_idx: - # conformer block with downsampling - convolution_layer_args_stride = ( - output_size, 
self.cnn_module_kernels[index], activation, - cnn_module_norm, causal, True, self.stride[index]) - layers.append(StrideConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_stride) if use_cnn_module else None, - torch.nn.AvgPool1d( - kernel_size=self.stride[index], stride=self.stride[index], - padding=0, ceil_mode=True, - count_include_pad=False), # pointwise_conv_layer - dropout_rate, - normalize_before, - concat_after, - )) - index = index + 1 - else: - # conformer block - convolution_layer_args_normal = ( - output_size, self.cnn_module_kernels[index], activation, - cnn_module_norm, causal) - layers.append(ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_normal) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - )) - - self.encoders = torch.nn.ModuleList(layers) - - def set_global_chunk_size(self, chunk_size): - """Used in ONNX export. - """ - logging.info(f"set global chunk size: {chunk_size}, default is 0.") - self.global_chunk_size = chunk_size - if self.embed.subsampling_rate == 2: - self.chunk_feature_map = 2 * self.global_chunk_size + 1 - elif self.embed.subsampling_rate == 6: - self.chunk_feature_map = 6 * self.global_chunk_size + 5 - elif self.embed.subsampling_rate == 8: - self.chunk_feature_map = 8 * self.global_chunk_size + 7 - else: - self.chunk_feature_map = 4 * self.global_chunk_size + 3 - - def output_size(self) -> int: - return self._output_size - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - index = 0 # traverse stride - for i, layer in enumerate(self.encoders): - # layer return : x, mask, new_att_cache, new_cnn_cache - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if i in self.stride_layer_idx: - masks = masks[:, :, ::self.stride[index]] - chunk_masks = chunk_masks[:, ::self.stride[index], - ::self.stride[index]] - mask_pad = masks - pos_emb = pos_emb[:, ::self.stride[index], :] - index = index + 1 - - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask : mask matrix of self attention - - Returns: - torch.Tensor: output of current input xs - torch.Tensor: subsampling cache required for next chunk computation - List[torch.Tensor]: encoder layers output cache required for next - chunk computation - List[torch.Tensor]: conformer cnn cache - - """ - assert xs.size(0) == 1 - - # using downsampling factor to recover offset - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - chunk_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - chunk_masks = chunk_masks.unsqueeze(1) # (1, 1, xs-time) - - real_len = 0 - if self.global_chunk_size > 0: - # for ONNX decode simulation, padding xs to chunk_size - real_len = xs.size(1) - pad_len = self.chunk_feature_map - real_len - xs = F.pad(xs, (0, 0, 0, pad_len), value=0.0) - chunk_masks = F.pad(chunk_masks, (0, pad_len), value=0.0) - - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, chunk_masks = self.embed(xs, chunk_masks, offset) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - # shape(pos_emb) = (b=1, chunk_size, emb_size=output_size=hidden-dim) - - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) # batchPad (b=1, 1, time=chunk_size) - - if self.global_chunk_size > 0: - # for ONNX decode simulation - pos_emb = self.embed.position_encoding( - offset=max(offset - cache_t1, 0), - size=cache_t1 + self.global_chunk_size) - att_mask[:, :, -self.global_chunk_size:] = chunk_masks - mask_pad = chunk_masks.to(torch.bool) - else: - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(2) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - att_cache_trunc = xs.size(1) + \ - att_cache.size(2) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i:i + 1, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # 
shape(new_att_cache) = [batch, head, time2, outdim]
-            new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :]
-            # shape(new_cnn_cache) = [1, batch, outdim, cache_t2]
-            new_cnn_cache = new_cnn_cache.unsqueeze(0)
-
-            # use repeat_interleave to new_att_cache
-            new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2)
-            # padding new_cnn_cache to cnn.lorder for causal convolution
-            new_cnn_cache = F.pad(
-                new_cnn_cache,
-                (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0))
-
-            if i == 0:
-                # record length for the first block as max length
-                max_att_len = new_att_cache.size(2)
-                max_cnn_len = new_cnn_cache.size(3)
-
-            # update real shape of att_cache and cnn_cache
-            r_att_cache.append(new_att_cache[:, :, -max_att_len:, :])
-            r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:])
-
-        if self.normalize_before:
-            xs = self.after_norm(xs)
-
-        # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2),
-        # ? may be larger than cache_t1, it depends on required_cache_size
-        r_att_cache = torch.cat(r_att_cache, dim=0)
-        # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2)
-        r_cnn_cache = torch.cat(r_cnn_cache, dim=0)
-
-        if self.global_chunk_size > 0 and real_len:
-            chunk_real_len = real_len // self.embed.subsampling_rate // \
-                self.calculate_downsampling_factor(self.num_blocks + 1)
-            # Keeping 1 more timestep can mitigate information leakage
-            # from the encoder caused by the padding
-            xs = xs[:, :chunk_real_len + 1, :]
-
-        return xs, r_att_cache, r_cnn_cache
-
-    def forward_chunk_by_chunk(
-        self,
-        xs: torch.Tensor,
-        decoding_chunk_size: int,
-        num_decoding_left_chunks: int = -1,
-        use_onnx=False
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """ Forward input chunk by chunk with chunk_size like a streaming
-            fashion
-
-        Here we should pay special attention to computation cache in the
-        streaming style forward chunk by chunk. Three things should be taken
-        into account for computation in the current network:
-            1. transformer/conformer encoder layers output cache
-            2. convolution in conformer
-            3. convolution in subsampling
-
-        However, we don't implement subsampling cache for:
-            1. We can control subsampling module to output the right result by
-               overlapping input instead of cache left context, even though it
-               wastes some computation, but subsampling only takes a very
-               small fraction of computation in the whole model.
-            2. Typically, there are several convolution layers with subsampling
-               in subsampling module, it is tricky and complicated to do cache
-               with different convolution layers with different subsampling
-               rate.
-            3. Currently, nn.Sequential is used to stack all the convolution
-               layers in subsampling, we need to rewrite it to make it work
-               with cache, which is not preferred.
-        Args:
-            xs (torch.Tensor): (1, max_len, dim)
-            decoding_chunk_size (int): decoding chunk size
-            num_decoding_left_chunks (int):
-            use_onnx (bool): True for simulating ONNX model inference.
- """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if use_onnx: - logging.info("Simulating for ONNX runtime ...") - att_cache: torch.Tensor = torch.zeros( - (self.num_blocks, self.attention_heads, required_cache_size, - self.output_size() // self.attention_heads * 2), - device=xs.device) - cnn_cache: torch.Tensor = torch.zeros( - (self.num_blocks, 1, self.output_size(), self.cnn_module_kernel - 1), - device=xs.device) - self.set_global_chunk_size(chunk_size=decoding_chunk_size) - else: - logging.info("Simulating for JIT runtime ...") - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - logging.info(f"-->> frame chunk msg: cur={cur}, " - f"end={end}, num_frames={end-cur}, " - f"decoding_window={decoding_window}") - if use_onnx: - att_mask: torch.Tensor = torch.ones( - (1, 1, required_cache_size + decoding_chunk_size), - dtype=torch.bool, device=xs.device) - if cur == 0: - att_mask[:, :, :required_cache_size] = 0 - else: - att_mask: torch.Tensor = torch.ones( - (0, 0, 0), dtype=torch.bool, device=xs.device) - - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - outputs.append(y) - offset += y.size(1) - - ys = torch.cat(outputs, 1) - masks = torch.ones(1, 1, ys.size(1), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/efficient_conformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/efficient_conformer/encoder_layer.py deleted file mode 100644 index 3a88ec9fca9797664ce89566e6c1d28a8f0ad5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/efficient_conformer/encoder_layer.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple -import torch -from torch import nn - - -class StrideConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. 
- self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - pointwise_conv_layer: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.pointwise_conv_layer = pointwise_conv_layer - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - - # add pointwise_conv for efficient conformer - # pointwise_conv_layer does not change shape - if self.pointwise_conv_layer is not None: - residual = residual.transpose(1, 2) - residual = self.pointwise_conv_layer(residual) - residual = residual.transpose(1, 2) - assert residual.size(0) == x.size(0) - assert residual.size(1) == x.size(1) - assert residual.size(2) == x.size(2) - - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/efficient_conformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/efficient_conformer/subsampling.py deleted file mode 100644 index 98b2c2228eac8e77586110686c48a7b0141458c9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/efficient_conformer/subsampling.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch -from wenet.transformer.subsampling import BaseSubsampling - - -class Conv2dSubsampling2(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU() - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * ((idim - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 2 - # 2 = (3 - 1) * 1 - self.right_context = 2 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 2. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 2. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, :-2:2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/squeezeformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/squeezeformer/attention.py deleted file mode 100644 index 97412badbe8e2c5caec81c0636d15be3f80d6b84..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/squeezeformer/attention.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 Ximalaya Inc. (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -import torch -import torch.nn as nn -from wenet.transformer.attention import MultiHeadedAttention -from typing import Tuple - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- """ - - def __init__(self, n_head, n_feat, dropout_rate, - do_rel_shift=False, adaptive_scale=False, init_weights=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.do_rel_shift = do_rel_shift - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - self.adaptive_scale = adaptive_scale - self.ada_scale = nn.Parameter( - torch.ones([1, 1, n_feat]), requires_grad=adaptive_scale) - self.ada_bias = nn.Parameter( - torch.zeros([1, 1, n_feat]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - input_max = (self.h * self.d_k) ** -0.5 - torch.nn.init.uniform_(self.linear_q.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_q.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_pos.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.bias, -input_max, input_max) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. 
pytorch training - if mask.size(2) > 0: # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - # (batch, head, time1, time2) - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - if self.adaptive_scale: - query = self.ada_scale * query + self.ada_bias - key = self.ada_scale * key + self.ada_bias - value = self.ada_scale * value + self.ada_bias - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - if self.do_rel_shift: - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/squeezeformer/conv2d.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/squeezeformer/conv2d.py deleted file mode 100644 index c230263396392d72f36c56d645338f2d576db898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/squeezeformer/conv2d.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Conv2d Module with Valid Padding""" - -import torch.nn.functional as F -from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional - - -class Conv2dValid(_ConvNd): - """ - Conv2d operator for VALID mode padding. 
- """ - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', # TODO: refine this type - device=None, - dtype=None, - valid_trigx: bool = False, - valid_trigy: bool = False - ) -> None: - factory_kwargs = {'device': device, 'dtype': dtype} - kernel_size_ = _pair(kernel_size) - stride_ = _pair(stride) - padding_ = padding if isinstance(padding, str) else _pair(padding) - dilation_ = _pair(dilation) - super(Conv2dValid, self).__init__( - in_channels, out_channels, kernel_size_, - stride_, padding_, dilation_, False, _pair(0), - groups, bias, padding_mode, **factory_kwargs) - self.valid_trigx = valid_trigx - self.valid_trigy = valid_trigy - - def _conv_forward( - self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): - validx, validy = 0, 0 - if self.valid_trigx: - validx = (input.size(-2) * (self.stride[-2] - 1) - 1 - + self.kernel_size[-2]) // 2 - if self.valid_trigy: - validy = (input.size(-1) * (self.stride[-1] - 1) - 1 - + self.kernel_size[-1]) // 2 - return F.conv2d(input, weight, bias, self.stride, - (validx, validy), self.dilation, self.groups) - - def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight, self.bias) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/squeezeformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/squeezeformer/convolution.py deleted file mode 100644 index 6da2ee8c98ed58fae66d66c892041037f0d6bc3a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/squeezeformer/convolution.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. 
- causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - self.bias = bias - self.channels = channels - self.kernel_size = kernel_size - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, channels]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, channels]), requires_grad=adaptive_scale) - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - if init_weights: - self.init_weights() - - def init_weights(self): - pw_max = self.channels ** -0.5 - dw_max = self.kernel_size ** -0.5 - torch.nn.init.uniform_(self.pointwise_conv1.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv1.bias.data, -pw_max, pw_max) - torch.nn.init.uniform_(self.depthwise_conv.weight.data, -dw_max, dw_max) - if self.bias: - torch.nn.init.uniform_(self.depthwise_conv.bias.data, -dw_max, dw_max) - torch.nn.init.uniform_(self.pointwise_conv2.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv2.bias.data, -pw_max, pw_max) - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - if self.adaptive_scale: - x = self.ada_scale * x + self.ada_bias - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/squeezeformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/squeezeformer/encoder.py deleted file mode 100644 index f13038321ae6c07d484a617aee7d83ed07742510..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/squeezeformer/encoder.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -import torch -import torch.nn as nn -from typing import Tuple, Union, Optional, List -from wenet.squeezeformer.subsampling \ - import DepthwiseConv2dSubsampling4, TimeReductionLayer1D, \ - TimeReductionLayer2D, TimeReductionLayerStream -from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.attention import MultiHeadedAttention -from wenet.squeezeformer.attention import RelPositionMultiHeadedAttention -from wenet.squeezeformer.positionwise_feed_forward \ - import PositionwiseFeedForward -from wenet.squeezeformer.convolution import ConvolutionModule -from wenet.utils.mask import make_pad_mask, add_optional_chunk_mask -from wenet.utils.common import get_activation - - -class SqueezeformerEncoder(nn.Module): - def __init__( - self, - input_size: int = 80, - encoder_dim: int = 256, - output_size: int = 256, - attention_heads: int = 4, - num_blocks: int = 12, - reduce_idx: Optional[Union[int, List[int]]] = 5, - recover_idx: Optional[Union[int, List[int]]] = 11, - feed_forward_expansion_factor: int = 4, - dw_stride: bool = False, - input_dropout_rate: float = 0.1, - pos_enc_layer_type: str = "rel_pos", - time_reduction_layer_type: str = "conv1d", - do_rel_shift: bool = True, - feed_forward_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.1, - cnn_module_kernel: int = 31, - cnn_norm_type: str = "batch_norm", - dropout: float = 0.1, - causal: bool = False, - adaptive_scale: bool = True, - activation_type: str = "swish", - init_weights: bool = True, - global_cmvn: torch.nn.Module = None, - normalize_before: bool = False, - use_dynamic_chunk: bool = False, - concat_after: bool = False, - 
static_chunk_size: int = 0, - use_dynamic_left_chunk: bool = False - ): - """Construct SqueezeformerEncoder - - Args: - input_size to use_dynamic_chunk, see in Transformer BaseEncoder. - encoder_dim (int): The hidden dimension of encoder layer. - output_size (int): The output dimension of final projection layer. - attention_heads (int): Num of attention head in attention module. - num_blocks (int): Num of encoder layers. - reduce_idx Optional[Union[int, List[int]]]: - reduce layer index, from 40ms to 80ms per frame. - recover_idx Optional[Union[int, List[int]]]: - recover layer index, from 80ms to 40ms per frame. - feed_forward_expansion_factor (int): Enlarge coefficient of FFN. - dw_stride (bool): Whether do depthwise convolution - on subsampling module. - input_dropout_rate (float): Dropout rate of input projection layer. - pos_enc_layer_type (str): Self attention type. - time_reduction_layer_type (str): Conv1d or Conv2d reduction layer. - do_rel_shift (bool): Whether to do relative shift - operation on rel-attention module. - cnn_module_kernel (int): Kernel size of CNN module. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - adaptive_scale (bool): Whether to use adaptive scale. - init_weights (bool): Whether to initialize weights. - causal (bool): whether to use causal convolution or not. - """ - super(SqueezeformerEncoder, self).__init__() - self.global_cmvn = global_cmvn - self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ - if type(reduce_idx) == int else reduce_idx - self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \ - if type(recover_idx) == int else recover_idx - self.check_ascending_list() - if reduce_idx is None: - self.time_reduce = None - else: - if recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - self.reduce_stride = 2 - self._output_size = output_size - self.normalize_before = normalize_before - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - self.pos_enc_layer_type = pos_enc_layer_type - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - encoder_dim, - attention_dropout_rate, - do_rel_shift, - adaptive_scale, - init_weights - ) - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - encoder_dim, - encoder_dim * feed_forward_expansion_factor, - feed_forward_dropout_rate, - activation, - adaptive_scale, - init_weights - ) - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = ( - encoder_dim, cnn_module_kernel, activation, - cnn_norm_type, causal, True, adaptive_scale, init_weights) - - self.embed = DepthwiseConv2dSubsampling4( - 1, encoder_dim, - RelPositionalEncoding(encoder_dim, dropout_rate=0.1), - dw_stride, - input_size, - input_dropout_rate, - init_weights - ) - - self.preln = nn.LayerNorm(encoder_dim) - self.encoders = 
torch.nn.ModuleList([SqueezeformerEncoderLayer( - encoder_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - convolution_layer(*convolution_layer_args), - positionwise_layer(*positionwise_layer_args), - normalize_before, - dropout, - concat_after) for _ in range(num_blocks) - ]) - if time_reduction_layer_type == 'conv1d': - time_reduction_layer = TimeReductionLayer1D - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - elif time_reduction_layer_type == 'stream': - time_reduction_layer = TimeReductionLayerStream - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - else: - time_reduction_layer = TimeReductionLayer2D - time_reduction_layer_args = {'encoder_dim': encoder_dim} - - self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args) - self.time_recover_layer = nn.Linear(encoder_dim, encoder_dim) - self.final_proj = None - if output_size != encoder_dim: - self.final_proj = nn.Linear(encoder_dim, output_size) - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - xs_lens = mask_pad.squeeze(1).sum(1) - xs = self.preln(xs) - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - for i, layer in enumerate(self.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, chunk_masks, pos_emb, mask_pad)) - xs, xs_lens, chunk_masks, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_chunk_masks, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - chunk_masks = recover_chunk_masks - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - xs = xs.masked_fill(~mask_pad[:, 0, :].unsqueeze(-1), 0.0) - - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return xs, masks - - def check_ascending_list(self): - if self.reduce_idx is not None: - assert self.reduce_idx == sorted(self.reduce_idx), \ - "reduce_idx should be int or ascending list" - if self.recover_idx is not None: - assert self.recover_idx == sorted(self.recover_idx), \ - "recover_idx should be int or ascending list" - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp 
= exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - recover_exp)) - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.preln(xs) - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, 
recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - if att_mask.size(1) != 0: - xs = xs.masked_fill(~att_mask[:, 0, :].unsqueeze(-1), 0.0) - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(0) - cached_att = cached_att.unsqueeze(3).\ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :]) - r_cnn_cache.append(cached_cnn) - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
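# A minimal, self-contained sketch of the windowing arithmetic described in
# the note above: streaming decode feeds overlapping input windows to the
# subsampling module instead of caching its state.  The helper name
# `chunk_windows` and the example numbers are illustrative assumptions, not
# code from this repository; the stride / decoding_window formulas follow the
# forward_chunk_by_chunk implementation below.
from typing import List, Tuple


def chunk_windows(num_frames: int, subsampling: int, right_context: int,
                  decoding_chunk_size: int) -> List[Tuple[int, int]]:
    context = right_context + 1                    # current frame included
    stride = subsampling * decoding_chunk_size     # hop between windows
    decoding_window = (decoding_chunk_size - 1) * subsampling + context
    return [(cur, min(cur + decoding_window, num_frames))
            for cur in range(0, num_frames - context + 1, stride)]


# Example: 4x subsampling, right_context=6, chunk_size=16, 200 input frames.
# Consecutive windows overlap by decoding_window - stride = 3 frames, which is
# the extra input that replaces a subsampling cache.
print(chunk_windows(200, 4, 6, 16))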
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/squeezeformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/squeezeformer/encoder_layer.py deleted file mode 100644 index 3c6bdd44a20447cea91c0f965c666b844f4264be..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/squeezeformer/encoder_layer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""SqueezeformerEncoderLayer definition.""" - -import torch -import torch.nn as nn -from typing import Optional, Tuple - - -class SqueezeformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward1 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - feed_forward2 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. 
- """ - - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward1: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - feed_forward2: Optional[nn.Module] = None, - normalize_before: bool = False, - dropout_rate: float = 0.1, - concat_after: bool = False, - ): - super(SqueezeformerEncoderLayer, self).__init__() - self.size = size - self.self_attn = self_attn - self.layer_norm1 = nn.LayerNorm(size) - self.ffn1 = feed_forward1 - self.layer_norm2 = nn.LayerNorm(size) - self.conv_module = conv_module - self.layer_norm3 = nn.LayerNorm(size) - self.ffn2 = feed_forward2 - self.layer_norm4 = nn.LayerNorm(size) - self.normalize_before = normalize_before - self.dropout = nn.Dropout(dropout_rate) - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - # self attention module - residual = x - if self.normalize_before: - x = self.layer_norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.layer_norm1(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm2(x) - x = self.ffn1(x) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm2(x) - - # conv module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - residual = x - if self.normalize_before: - x = self.layer_norm3(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm3(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm4(x) - x = self.ffn2(x) - # we do not use dropout here since it is inside feed forward function - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm4(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/squeezeformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/squeezeformer/positionwise_feed_forward.py deleted file mode 100644 index 289062dcf3189f79a5ebb206990160d8665c613c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/squeezeformer/positionwise_feed_forward.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
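# A minimal sketch of the sub-block ordering implemented by
# SqueezeformerEncoderLayer.forward above, on its post-LayerNorm path
# (normalize_before=False): self-attention -> FFN -> conv -> FFN, each wrapped
# as x = LayerNorm(x + Dropout(sublayer(x))).  The nn.Identity stand-ins and
# the tensor sizes are illustrative assumptions only.
import torch
import torch.nn as nn

size = 8
x = torch.randn(2, 5, size)                        # (B, T, D)
norms = nn.ModuleList(nn.LayerNorm(size) for _ in range(4))
sublayers = [nn.Identity()] * 4                    # stand-ins for attn / ffn1 / conv / ffn2
dropout = nn.Dropout(0.1)

for norm, sublayer in zip(norms, sublayers):
    x = norm(x + dropout(sublayer(x)))             # residual first, then post-LN
print(x.shape)                                     # torch.Size([2, 5, 8])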
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU(), - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.idim = idim - self.hidden_units = hidden_units - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - self.ada_scale = None - self.ada_bias = None - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, idim]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, idim]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - ffn1_max = self.idim ** -0.5 - ffn2_max = self.hidden_units ** -0.5 - torch.nn.init.uniform_(self.w_1.weight.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_1.bias.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_2.weight.data, -ffn2_max, ffn2_max) - torch.nn.init.uniform_(self.w_2.bias.data, -ffn2_max, ffn2_max) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - if self.adaptive_scale: - xs = self.ada_scale * xs + self.ada_bias - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/squeezeformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/squeezeformer/subsampling.py deleted file mode 100644 index fdb0101d6ebb54c42e710bbb0f35a6f7615ca567..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/squeezeformer/subsampling.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -"""DepthwiseConv2dSubsampling4 and TimeReductionLayer definition.""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -from wenet.transformer.subsampling import BaseSubsampling -from typing import Tuple -from wenet.squeezeformer.conv2d import Conv2dValid - - -class DepthwiseConv2dSubsampling4(BaseSubsampling): - """Depthwise Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - pos_enc_class (nn.Module): position encoding class. - dw_stride (int): Whether do depthwise convolution. - input_size (int): filter bank dimension. - - """ - - def __init__( - self, idim: int, odim: int, - pos_enc_class: torch.nn.Module, - dw_stride: bool = False, - input_size: int = 80, - input_dropout_rate: float = 0.1, - init_weights: bool = True - ): - super(DepthwiseConv2dSubsampling4, self).__init__() - self.idim = idim - self.odim = odim - self.pw_conv = nn.Conv2d( - in_channels=idim, out_channels=odim, kernel_size=3, stride=2) - self.act1 = nn.ReLU() - self.dw_conv = nn.Conv2d( - in_channels=odim, out_channels=odim, kernel_size=3, stride=2, - groups=odim if dw_stride else 1 - ) - self.act2 = nn.ReLU() - self.pos_enc = pos_enc_class - self.input_proj = nn.Sequential( - nn.Linear( - odim * (((input_size - 1) // 2 - 1) // 2), odim), - nn.Dropout(p=input_dropout_rate), - ) - if init_weights: - linear_max = (odim * input_size / 4) ** -0.5 - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.weight'], -linear_max, linear_max) - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.bias'], -linear_max, linear_max) - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: int = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.pw_conv(x) - x = self.act1(x) - x = self.dw_conv(x) - x = self.act2(x) - b, c, t, f = x.size() - x = x.permute(0, 2, 1, 3) - x = x.contiguous().view(b, t, c * f) - x, pos_emb = self.pos_enc(x, offset) - x = self.input_proj(x) - return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] - - -class TimeReductionLayer1D(nn.Module): - """ - Modified NeMo, - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. 
- """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 5, stride: int = 2): - super(TimeReductionLayer1D, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - self.padding = max(0, self.kernel_size - self.stride) - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. - if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayer2D(nn.Module): - def __init__( - self, kernel_size: int = 5, stride: int = 2, encoder_dim: int = 256): - super(TimeReductionLayer2D, self).__init__() - self.encoder_dim = encoder_dim - self.kernel_size = kernel_size - self.dw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=(kernel_size, 1), - stride=stride, - valid_trigy=True - ) - self.pw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=1, - stride=1, - valid_trigx=False, - valid_trigy=False, - ) - - self.kernel_size = kernel_size - self.stride = stride - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.encoder_dim ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward( - self, xs: torch.Tensor, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - xs = xs.masked_fill(mask_pad.transpose(1, 2).eq(0), 0.0) - xs = xs.unsqueeze(2) - padding1 = self.kernel_size - self.stride - xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), - mode='constant', value=0.) 
- xs = self.dw_conv(xs.permute(0, 3, 1, 2)) - xs = self.pw_conv(xs).permute(0, 3, 2, 1).squeeze(1).contiguous() - tmp_length = xs.size(1) - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - padding2 = max(0, (xs_lens.max() - tmp_length).data.item()) - batch_size, hidden = xs.size(0), xs.size(-1) - dummy_pad = torch.zeros(batch_size, padding2, hidden, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - mask = mask[:, ::2, ::2] - mask_pad = mask_pad[:, :, ::2] - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayerStream(nn.Module): - """ - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. - """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 1, stride: int = 2): - super(TimeReductionLayerStream, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=0, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. 
- if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transducer/joint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transducer/joint.py deleted file mode 100644 index f7cbaf62ee0bf4ffa127e5bbf4a49a64c2378495..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transducer/joint.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Optional - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation - - -class TransducerJoint(torch.nn.Module): - - def __init__(self, - voca_size: int, - enc_output_size: int, - pred_output_size: int, - join_dim: int, - prejoin_linear: bool = True, - postjoin_linear: bool = False, - joint_mode: str = 'add', - activation: str = "tanh"): - assert check_argument_types() - # TODO(Mddct): concat in future - assert joint_mode in ['add'] - super().__init__() - - self.activatoin = get_activation(activation) - self.prejoin_linear = prejoin_linear - self.postjoin_linear = postjoin_linear - self.joint_mode = joint_mode - - if not self.prejoin_linear and not self.postjoin_linear: - assert enc_output_size == pred_output_size == join_dim - # torchscript compatibility - self.enc_ffn: Optional[nn.Linear] = None - self.pred_ffn: Optional[nn.Linear] = None - if self.prejoin_linear: - self.enc_ffn = nn.Linear(enc_output_size, join_dim) - self.pred_ffn = nn.Linear(pred_output_size, join_dim) - # torchscript compatibility - self.post_ffn: Optional[nn.Linear] = None - if self.postjoin_linear: - self.post_ffn = nn.Linear(join_dim, join_dim) - - self.ffn_out = nn.Linear(join_dim, voca_size) - - def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor): - """ - Args: - enc_out (torch.Tensor): [B, T, E] - pred_out (torch.Tensor): [B, T, P] - Return: - [B,T,U,V] - """ - if (self.prejoin_linear and self.enc_ffn is not None - and self.pred_ffn is not None): - enc_out = self.enc_ffn(enc_out) # [B,T,E] -> [B,T,V] - pred_out = self.pred_ffn(pred_out) - - enc_out = enc_out.unsqueeze(2) # [B,T,V] -> [B,T,1,V] - pred_out = pred_out.unsqueeze(1) # [B,U,V] -> [B,1 U, V] - - # TODO(Mddct): concat joint - _ = self.joint_mode - out = enc_out + pred_out # [B,T,U,V] - - if self.postjoin_linear and self.post_ffn is not None: - out = self.post_ffn(out) - - out = self.activatoin(out) - out = self.ffn_out(out) - return out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transducer/predictor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transducer/predictor.py deleted file mode 100644 index 600e97a9d83646047ec3fc14f3087bd4df761c68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transducer/predictor.py +++ /dev/null @@ -1,482 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation, get_rnn - - -def ApplyPadding(input, padding, pad_value) -> torch.Tensor: - """ - Args: - input: [bs, max_time_step, dim] - padding: [bs, max_time_step] - """ - return padding * 
pad_value + input * (1 - padding) - - -class PredictorBase(torch.nn.Module): - - # NOTE(Mddct): We can use ABC abstract here, but - # keep this class simple enough for now - def __init__(self) -> None: - super().__init__() - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - _, _, _ = batch_size, method, device - raise NotImplementedError("this is a base precictor") - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ): - _, _, = input, cache - raise NotImplementedError("this is a base precictor") - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - _, _, _, = input, padding, cache - raise NotImplementedError("this is a base precictor") - - -class RNNPredictor(PredictorBase): - - def __init__(self, - voca_size: int, - embed_size: int, - output_size: int, - embed_dropout: float, - hidden_size: int, - num_layers: int, - bias: bool = True, - rnn_type: str = "lstm", - dropout: float = 0.1) -> None: - assert check_argument_types() - super().__init__() - self.n_layers = num_layers - self.hidden_size = hidden_size - # disable rnn base out projection - self.embed = nn.Embedding(voca_size, embed_size) - self.dropout = nn.Dropout(embed_dropout) - # NOTE(Mddct): rnn base from torch not support layer norm - # will add layer norm and prune value in cell and layer - # ref: https://github.com/Mddct/neural-lm/blob/main/models/gru_cell.py - self.rnn = get_rnn(rnn_type=rnn_type)(input_size=embed_size, - hidden_size=hidden_size, - num_layers=num_layers, - bias=bias, - batch_first=True, - dropout=dropout) - self.projection = nn.Linear(hidden_size, output_size) - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> torch.Tensor: - """ - Args: - input (torch.Tensor): [batch, max_time). - padding (torch.Tensor): [batch, max_time] - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - Returns: - output: [batch, max_time, output_size] - """ - - # NOTE(Mddct): we don't use pack input format - embed = self.embed(input) # [batch, max_time, emb_size] - embed = self.dropout(embed) - states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None - if cache is None: - state = self.init_state(batch_size=input.size(0), - device=input.device) - states = (state[0], state[1]) - else: - assert len(cache) == 2 - states = (cache[0], cache[1]) - out, (m, c) = self.rnn(embed, states) - out = self.projection(out) - - # NOTE(Mddct): Although we don't use staate in transducer - # training forward, we need make it right for padding value - # so we create forward_step for infering, forward for training - _, _ = m, c - return out - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache: [state_m, state_c] - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - Returns: - new_cache: [[state_m_1, state_c_1], [state_m_2, state_c_2]...] 
- """ - assert len(cache) == 2 - state_ms = cache[0] - state_cs = cache[1] - - assert state_ms.size(1) == state_cs.size(1) - - new_cache: List[List[torch.Tensor]] = [] - for state_m, state_c in zip(torch.split(state_ms, 1, dim=1), - torch.split(state_cs, 1, dim=1)): - new_cache.append([state_m, state_c]) - return new_cache - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[state_m_1, state_c_1], [state_m_1, state_c_1]...] - - Returns: - new_caceh: [state_ms, state_cs], - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - """ - state_ms = torch.cat([states[0] for states in cache], dim=1) - state_cs = torch.cat([states[1] for states in cache], dim=1) - return [state_ms, state_cs] - - def init_state( - self, - batch_size: int, - device: torch.device, - method: str = "zero", - ) -> List[torch.Tensor]: - assert batch_size > 0 - # TODO(Mddct): xavier init method - _ = method - return [ - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device), - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device) - ] - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - """ - assert len(cache) == 2 - state_m, state_c = cache[0], cache[1] - embed = self.embed(input) # [batch, 1, emb_size] - embed = self.dropout(embed) - out, (m, c) = self.rnn(embed, (state_m, state_c)) - - out = self.projection(out) - m = ApplyPadding(m, padding.unsqueeze(0), state_m) - c = ApplyPadding(c, padding.unsqueeze(0), state_c) - - return (out, [m, c]) - - -class EmbeddingPredictor(PredictorBase): - """Embedding predictor - - Described in: - https://arxiv.org/pdf/2109.07513.pdf - - embed-> proj -> layer norm -> swish - """ - - def __init__(self, - voca_size: int, - embed_size: int, - embed_dropout: float, - n_head: int, - history_size: int = 2, - activation: str = "swish", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - - assert check_argument_types() - super().__init__() - # multi head - self.num_heads = n_head - self.embed_size = embed_size - self.context_size = history_size + 1 - self.pos_embed = torch.nn.Linear(embed_size * self.context_size, - self.num_heads, - bias=bias) - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.ffn = nn.Linear(self.embed_size, self.embed_size) - self.norm = nn.LayerNorm(self.embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - _ = method - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device), - ] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] 
- """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - - input = input.unfold(1, self.context_size, 1).permute( - 0, 1, 3, 2) # [bs, seq_len, context_size, embed] - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - # broadcast dot attenton - input_expand = input.unsqueeze( - 2) # [bs, seq_len, 1, context_size, embed] - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - - # [bs, seq_len, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, seq_len, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, seq_len, num_heads, embed] - output = output.sum(dim=2) # [bs, seq_len, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - return output - - def forward_step( - self, - input: torch.Tensor, - padding: torch.Tensor, - cache: List[torch.Tensor], - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input_expand = context_input.unsqueeze(1).unsqueeze( - 2) # [bs, 1, 1, context_size, embed] - - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - # [bs, 1, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, 1, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, 1, num_heads, embed] - output = output.sum(dim=2) # [bs, 1, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - new_cache = context_input[:, 1:, :] - # TODO(Mddct): we need padding new_cache in future - # new_cache = ApplyPadding(history, padding, new_cache) - return (output, [new_cache]) - - -class ConvPredictor(PredictorBase): - - def __init__(self, - voca_size: 
int, - embed_size: int, - embed_dropout: float, - history_size: int = 2, - activation: str = "relu", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - assert check_argument_types() - super().__init__() - - assert history_size >= 0 - self.embed_size = embed_size - self.context_size = history_size + 1 - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.conv = nn.Conv1d(in_channels=embed_size, - out_channels=embed_size, - kernel_size=self.context_size, - padding=0, - groups=embed_size, - bias=bias) - self.norm = nn.LayerNorm(embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - assert method == "zero" - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device) - ] - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] - """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - input = input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - return out - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input = context_input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - - new_cache = context_input[:, 1:, :] - # TODO(Mddct): apply padding in future - return (out, [new_cache]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transducer/search/greedy_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transducer/search/greedy_search.py deleted file mode 100644 index ef7354562b6617b7be33bf32d673117eb1d3d547..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transducer/search/greedy_search.py +++ /dev/null @@ -1,54 +0,0 @@ 
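# A minimal, self-contained sketch of how the stateless predictors above
# (EmbeddingPredictor / ConvPredictor) roll their fixed-size token history
# during step-by-step decoding: init_state() gives a zero history of
# history_size embeddings, each forward_step appends the new token embedding
# and keeps only the most recent history_size entries as the new cache.
# The batch / embedding sizes and the fake embeddings are illustrative
# assumptions only.
import torch

batch, history_size, embed = 1, 2, 4
cache = torch.zeros(batch, history_size, embed)                   # init_state(): all zeros
for step in range(3):
    new_embed = torch.full((batch, 1, embed), float(step + 1))    # fake token embedding
    context = torch.cat((cache, new_embed), dim=1)                # [B, history_size + 1, E]
    cache = context[:, 1:, :]                                     # drop the oldest entry
    print(step, cache[0, :, 0].tolist())                          # [0.0, 1.0] -> [1.0, 2.0] -> [2.0, 3.0]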
-from typing import List - -import torch - - -def basic_greedy_search( - model: torch.nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - n_steps: int = 64, -) -> List[List[int]]: - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, - method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = n_steps - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, padding, - cache) # [1, 1, P] - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, - pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - - return [hyps] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transducer/search/prefix_beam_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transducer/search/prefix_beam_search.py deleted file mode 100644 index f00917717c16a73916586708ebfede54fa02a21f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transducer/search/prefix_beam_search.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import List, Tuple - -import torch -from wenet.utils.common import log_add - - -class Sequence(): - - __slots__ = {'hyp', 'score', 'cache'} - - def __init__( - self, - hyp: List[torch.Tensor], - score, - cache: List[torch.Tensor], - ): - self.hyp = hyp - self.score = score - self.cache = cache - - -class PrefixBeamSearch(): - - def __init__(self, encoder, predictor, joint, ctc, blank): - self.encoder = encoder - self.predictor = predictor - self.joint = joint - self.ctc = ctc - self.blank = blank - - def forward_decoder_one_step( - self, encoder_x: torch.Tensor, pre_t: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - padding = torch.zeros(pre_t.size(0), 1, device=encoder_x.device) - pre_t, new_cache = self.predictor.forward_step(pre_t.unsqueeze(-1), - padding, cache) - x = self.joint(encoder_x, pre_t) # [beam, 1, 1, vocab] - x = x.log_softmax(dim=-1) - return x, new_cache - - def prefix_beam_search(self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7): - """prefix beam search - also see wenet.transducer.transducer.beam_search - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - 
assert batch_size == 1 - - # 1. Encoder - encoder_out, _ = self.encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - - ctc_probs = self.ctc.log_softmax(encoder_out).squeeze(0) - beam_init: List[Sequence] = [] - - # 2. init beam using Sequence to save beam unit - cache = self.predictor.init_state(1, method="zero", device=device) - beam_init.append(Sequence(hyp=[self.blank], score=0.0, cache=cache)) - # 3. start decoding (notice: we use breathwise first searching) - # !!!! In this decoding method: one frame do not output multi units. !!!! - # !!!! Experiments show that this strategy has little impact !!!! - for i in range(maxlen): - # 3.1 building input - # decoder taking the last token to predict the next token - input_hyp = [s.hyp[-1] for s in beam_init] - input_hyp_tensor = torch.tensor(input_hyp, - dtype=torch.int, - device=device) - # building statement from beam - cache_batch = self.predictor.cache_to_batch( - [s.cache for s in beam_init]) - # build score tensor to do torch.add() function - scores = torch.tensor([s.score for s in beam_init]).to(device) - - # 3.2 forward decoder - logp, new_cache = self.forward_decoder_one_step( - encoder_out[:, i, :].unsqueeze(1), - input_hyp_tensor, - cache_batch, - ) # logp: (N, 1, 1, vocab_size) - logp = logp.squeeze(1).squeeze(1) # logp: (N, vocab_size) - new_cache = self.predictor.batch_to_cache(new_cache) - - # 3.3 shallow fusion for transducer score - # and ctc score where we can also add the LM score - logp = torch.log( - torch.add(transducer_weight * torch.exp(logp), - ctc_weight * torch.exp(ctc_probs[i].unsqueeze(0)))) - - # 3.4 first beam prune - top_k_logp, top_k_index = logp.topk(beam_size) # (N, N) - scores = torch.add(scores.unsqueeze(1), top_k_logp) - - # 3.5 generate new beam (N*N) - beam_A = [] - for j in range(len(beam_init)): - # update seq - base_seq = beam_init[j] - for t in range(beam_size): - # blank: only update the score - if top_k_index[j, t] == self.blank: - new_seq = Sequence(hyp=base_seq.hyp.copy(), - score=scores[j, t].item(), - cache=base_seq.cache) - - beam_A.append(new_seq) - # other unit: update hyp score statement and last - else: - hyp_new = base_seq.hyp.copy() - hyp_new.append(top_k_index[j, t].item()) - new_seq = Sequence(hyp=hyp_new, - score=scores[j, t].item(), - cache=new_cache[j]) - beam_A.append(new_seq) - - # 3.6 prefix fusion - fusion_A = [beam_A[0]] - for j in range(1, len(beam_A)): - s1 = beam_A[j] - if_do_append = True - for t in range(len(fusion_A)): - # notice: A_ can not fusion with A - if s1.hyp == fusion_A[t].hyp: - fusion_A[t].score = log_add( - [fusion_A[t].score, s1.score]) - if_do_append = False - break - if if_do_append: - fusion_A.append(s1) - - # 4. 
second pruned - fusion_A.sort(key=lambda x: x.score, reverse=True) - beam_init = fusion_A[:beam_size] - - return beam_init, encoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transducer/transducer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transducer/transducer.py deleted file mode 100644 index 821a0946e621353a18bededbd93a658e83b0e0e2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transducer/transducer.py +++ /dev/null @@ -1,453 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchaudio -from torch import nn -from torch.nn.utils.rnn import pad_sequence -from typeguard import check_argument_types -from wenet.transducer.predictor import PredictorBase -from wenet.transducer.search.greedy_search import basic_greedy_search -from wenet.transducer.search.prefix_beam_search import PrefixBeamSearch -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_blank, add_sos_eos, - reverse_pad_list) - - -class Transducer(ASRModel): - """Transducer-ctc-attention hybrid Encoder-Predictor-Decoder model""" - - def __init__( - self, - vocab_size: int, - blank: int, - encoder: nn.Module, - predictor: PredictorBase, - joint: nn.Module, - attention_decoder: Optional[Union[TransformerDecoder, - BiTransformerDecoder]] = None, - ctc: Optional[CTC] = None, - ctc_weight: float = 0, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - transducer_weight: float = 1.0, - attention_weight: float = 0.0, - ) -> None: - assert check_argument_types() - assert attention_weight + ctc_weight + transducer_weight == 1.0 - super().__init__(vocab_size, encoder, attention_decoder, ctc, - ctc_weight, ignore_id, reverse_weight, lsm_weight, - length_normalized_loss) - - self.blank = blank - self.transducer_weight = transducer_weight - self.attention_decoder_weight = 1 - self.transducer_weight - self.ctc_weight - - self.predictor = predictor - self.joint = joint - self.bs = None - - # Note(Mddct): decoder also means predictor in transducer, - # but here decoder is attention decoder - del self.criterion_att - if attention_decoder is not None: - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + predictor + joint + loss - - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - - # Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - # predictor - ys_in_pad = add_blank(text, self.blank, self.ignore_id) - predictor_out = self.predictor(ys_in_pad) - # joint - joint_out = self.joint(encoder_out, predictor_out) - # NOTE(Mddct): some loss implementation require pad valid is zero - # torch.int32 rnnt_loss required - rnnt_text = text.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - rnnt_text_lengths = text_lengths.to(torch.int32) - encoder_out_lens = encoder_out_lens.to(torch.int32) - loss = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - encoder_out_lens, - rnnt_text_lengths, - blank=self.blank, - reduction="mean") - loss_rnnt = loss - - loss = self.transducer_weight * loss - # optional attention decoder - loss_att: Optional[torch.Tensor] = None - if self.attention_decoder_weight != 0.0 and self.decoder is not None: - loss_att, _ = self._calc_att_loss(encoder_out, encoder_mask, text, - text_lengths) - - # optional ctc - loss_ctc: Optional[torch.Tensor] = None - if self.ctc_weight != 0.0 and self.ctc is not None: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is not None: - loss = loss + self.ctc_weight * loss_ctc.sum() - if loss_att is not None: - loss = loss + self.attention_decoder_weight * loss_att.sum() - # NOTE: 'loss' must be in dict - return { - 'loss': loss, - 'loss_att': loss_att, - 'loss_ctc': loss_ctc, - 'loss_rnnt': loss_rnnt, - } - - def init_bs(self): - if self.bs is None: - self.bs = PrefixBeamSearch(self.encoder, self.predictor, - self.joint, self.ctc, self.blank) - - def _cal_transducer_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_lens: torch.Tensor, - hyps_pad: torch.Tensor, - ): - # ignore id -> blank, add blank at head - hyps_pad_blank = add_blank(hyps_pad, self.blank, self.ignore_id) - xs_in_lens = encoder_mask.squeeze(1).sum(1).int() - - # 1. Forward predictor - predictor_out = self.predictor(hyps_pad_blank) - # 2. Forward joint - joint_out = self.joint(encoder_out, predictor_out) - rnnt_text = hyps_pad.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - # 3. 
Compute transducer loss - loss_td = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - xs_in_lens, - hyps_lens.int(), - blank=self.blank, - reduction='none') - return loss_td * -1 - - def _cal_attn_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_pad: torch.Tensor, - hyps_lens: torch.Tensor, - ): - # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - - # td_score = loss_td * -1 - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - self.reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - return decoder_out, r_decoder_out - - def beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7, - ): - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight in transducer - prefix beam search. - final_prob = ctc_weight * ctc_prob + transducer_weight * transducer_prob - transducer_weight (float): transducer probability weight in - prefix beam search - Returns: - List[List[int]]: best path result - - """ - self.init_bs() - beam, _ = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size, - beam_size, - num_decoding_left_chunks, - simulate_streaming, - ctc_weight, - transducer_weight, - ) - return beam[0].hyp[1:], beam[0].score - - def transducer_attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ctc_weight: float = 0.0, - attn_weight: float = 0.0, - transducer_weight: float = 0.0, - search_ctc_weight: float = 1.0, - search_transducer_weight: float = 0.0, - beam_search_type: str = 'transducer') -> List[List[int]]: - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight using in rescoring. - rescore_prob = ctc_weight * ctc_prob + - transducer_weight * (transducer_loss * -1) + - attn_weight * attn_prob - attn_weight (float): attn probability weight using in rescoring. - transducer_weight (float): transducer probability weight using in - rescoring - search_ctc_weight (float): ctc weight using - in rnnt beam search (seeing in self.beam_search) - search_transducer_weight (float): transducer weight using - in rnnt beam search (seeing in self.beam_search) - Returns: - List[List[int]]: best path result - - """ - - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - self.init_bs() - if beam_search_type == 'transducer': - beam, encoder_out = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - beam_size=beam_size, - num_decoding_left_chunks=num_decoding_left_chunks, - ctc_weight=search_ctc_weight, - transducer_weight=search_transducer_weight, - ) - beam_score = [s.score for s in beam] - hyps = [s.hyp[1:] for s in beam] - - elif beam_search_type == 'ctc': - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, - speech_lengths, - beam_size=beam_size, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) - beam_score = [hyp[1] for hyp in hyps] - hyps = [hyp[0] for hyp in hyps] - assert len(hyps) == beam_size - - # build hyps and encoder output - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - - # 2.1 calculate transducer score - td_score = self._cal_transducer_score( - encoder_out, - encoder_mask, - hyps_lens, - hyps_pad, - ) - # 2.2 calculate attention score - decoder_out, r_decoder_out = self._cal_attn_score( - encoder_out, - encoder_mask, - hyps_pad, - hyps_lens, - ) - - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp)][self.eos] - td_s = td_score[i] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp): - r_score += r_decoder_out[i][len(hyp) - j - 1][w] - r_score += r_decoder_out[i][len(hyp)][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score = score * attn_weight + \ - beam_score[i] * ctc_weight + \ - td_s * transducer_weight - if score > best_score: - best_score = score - best_index = i - - return hyps[best_index], best_score - - def greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, 
- num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - n_steps: int = 64, - ) -> List[List[int]]: - """ greedy search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - # TODO(Mddct): batch decode - assert speech.size(0) == 1 - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - # TODO(Mddct): forward chunk by chunk - _ = simulate_streaming - # Let's assume B = batch_size - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size, - num_decoding_left_chunks, - ) - encoder_out_lens = encoder_mask.squeeze(1).sum() - hyps = basic_greedy_search(self, - encoder_out, - encoder_out_lens, - n_steps=n_steps) - - return hyps - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def forward_predictor_step( - self, xs: torch.Tensor, cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - assert len(cache) == 2 - # fake padding - padding = torch.zeros(1, 1) - return self.predictor.forward_step(xs, padding, cache) - - @torch.jit.export - def forward_joint_step(self, enc_out: torch.Tensor, - pred_out: torch.Tensor) -> torch.Tensor: - return self.joint(enc_out, pred_out) - - @torch.jit.export - def forward_predictor_init_state(self) -> List[torch.Tensor]: - return self.predictor.init_state(1, device=torch.device("cpu")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/asr_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/asr_model.py deleted file mode 100644 index 4288f68472d63ce4bf270c5f377d62fa7408713e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/asr_model.py +++ /dev/null @@ -1,904 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
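For reference, the `@torch.jit.export` hooks removed just above (`forward_encoder_chunk`, `forward_predictor_step`, `forward_joint_step`) are the pieces a runtime chains together for simple blank-skip transducer greedy decoding. A minimal standalone sketch of that loop follows; the tiny `predictor_step`/`joint` stand-ins are hypothetical toys for illustration, not the WeNet modules.

```python
import torch

# Hypothetical toy stand-ins for the exported predictor/joint steps.
VOCAB, BLANK, HIDDEN = 8, 0, 16
torch.manual_seed(0)
pred_emb = torch.nn.Embedding(VOCAB, HIDDEN)
joint_proj = torch.nn.Linear(2 * HIDDEN, VOCAB)

def predictor_step(token: int) -> torch.Tensor:
    # Stand-in for predictor.forward_step(xs, padding, cache)
    return pred_emb(torch.tensor([token]))                 # (1, HIDDEN)

def joint(enc_t: torch.Tensor, pred: torch.Tensor) -> torch.Tensor:
    # Stand-in for joint(enc_out, pred_out)
    return joint_proj(torch.cat([enc_t, pred], dim=-1))    # (1, VOCAB)

def greedy_rnnt_decode(encoder_out: torch.Tensor, max_symbols_per_step: int = 3):
    """Blank-skip greedy search over a (T, HIDDEN) encoder output."""
    hyp, last_token = [], BLANK
    for t in range(encoder_out.size(0)):
        for _ in range(max_symbols_per_step):              # a few symbols per frame at most
            logits = joint(encoder_out[t:t + 1], predictor_step(last_token))
            token = int(logits.argmax(dim=-1))
            if token == BLANK:                             # blank: advance to the next frame
                break
            hyp.append(token)
            last_token = token                             # non-blank: stay on this frame
    return hyp

print(greedy_rnnt_decode(torch.randn(5, HIDDEN)))
```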
-# Modified from ESPnet(https://github.com/espnet/espnet) - -from collections import defaultdict -from typing import Dict, List, Optional, Tuple - -import torch - -from torch.nn.utils.rnn import pad_sequence - -try: - import k2 - from icefall.utils import get_texts - from icefall.decode import get_lattice, Nbest, one_best_decoding -except ImportError: - print('Failed to import k2 and icefall. \ - Notice that they are necessary for hlg_onebest and hlg_rescore') - -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import TransformerEncoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, - remove_duplicates_and_blank, th_accuracy, - reverse_pad_list) -from wenet.utils.mask import (make_pad_mask, mask_finished_preds, - mask_finished_scores, subsequent_mask) - - -class ASRModel(torch.nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - def __init__( - self, - vocab_size: int, - encoder: TransformerEncoder, - decoder: TransformerDecoder, - ctc: CTC, - ctc_weight: float = 0.5, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - ): - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - - self.encoder = encoder - self.decoder = decoder - self.ctc = ctc - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - # 1. Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # 2a. Attention-decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths) - else: - loss_att = None - - # 2b. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is None: - loss = loss_att - elif loss_att is None: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + (1 - - self.ctc_weight) * loss_att - return {"loss": loss, "loss_att": loss_att, "loss_ctc": loss_ctc} - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, float]: - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # reverse the seq, used for right to left decoder - r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) - r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, - self.ignore_id) - # 1. Forward decoder - decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, - ys_in_pad, ys_in_lens, - r_ys_in_pad, - self.reverse_weight) - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - r_loss_att = torch.tensor(0.0) - if self.reverse_weight > 0.0: - r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * ( - 1 - self.reverse_weight) + r_loss_att * self.reverse_weight - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - return loss_att, acc_att - - def _forward_encoder( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Let's assume B = batch_size - # 1. Encoder - if simulate_streaming and decoding_chunk_size > 0: - encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( - speech, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - else: - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - return encoder_out, encoder_mask - - def recognize( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int = 10, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> torch.Tensor: - """ Apply beam search on attention decoder - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - torch.Tensor: decoding result, (batch, max_result_len) - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = torch.ones([running_size, 1], dtype=torch.long, - device=device).fill_(self.sos) # (B*N, 1) - scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), - dtype=torch.float) - scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) - cache: Optional[List[torch.Tensor]] = None - # 2. Decoder forward step by step - for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: - break - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) - logp, cache = self.decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - # 2.3 Second beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - # Update cache to be consistent with new topk scores / hyps - cache_index = (offset_k_index // beam_size).view(-1) # (B*N) - base_cache_index = (torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) * beam_size).view(-1) # (B*N) - cache_index = base_cache_index + cache_index - cache = [torch.index_select(c, dim=0, index=cache_index) for c in cache] - scores = scores.view(-1, 1) # (B*N, 1) - # 2.4. Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = torch.index_select(top_k_index.view(-1), - dim=-1, - index=best_k_index) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = torch.index_select( - hyps, dim=0, index=best_hyps_index) # (B*N, i) - hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) - - # 3. 
Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_scores, best_index = scores.max(dim=-1) - best_hyps_index = best_index + torch.arange( - batch_size, dtype=torch.long, device=device) * beam_size - best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) - best_hyps = best_hyps[:, 1:] - return best_hyps, best_scores - - def ctc_greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[List[int]]: - """ Apply CTC greedy search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # Let's assume B = batch_size - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - return hyps, scores - - def _ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[List[List[int]], torch.Tensor]: - """ CTC prefix beam search inner implementation - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[List[int]]: nbest results - torch.Tensor: encoder output, (1, max_len, encoder_dim), - it will be used for rescoring in attention rescoring mode - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # For CTC prefix beam search, we only support batch_size=1 - assert batch_size == 1 - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder forward and get CTC score - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - ctc_probs = ctc_probs.squeeze(0) - # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) - cur_hyps = [(tuple(), (0.0, -float('inf')))] - # 2. CTC beam search step by step - for t in range(0, maxlen): - logp = ctc_probs[t] # (vocab_size,) - # key: prefix, value (pb, pnb), default value(-inf, -inf) - next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) - # 2.1 First beam prune: select topk best - top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) - for s in top_k_index: - s = s.item() - ps = logp[s].item() - for prefix, (pb, pnb) in cur_hyps: - last = prefix[-1] if len(prefix) > 0 else None - if s == 0: # blank - n_pb, n_pnb = next_hyps[prefix] - n_pb = log_add([n_pb, pb + ps, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - elif s == last: - # Update *ss -> *s; - n_pb, n_pnb = next_hyps[prefix] - n_pnb = log_add([n_pnb, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - # Update *s-s -> *ss, - is for blank - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - else: - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - - # 2.2 Second beam prune - next_hyps = sorted(next_hyps.items(), - key=lambda x: log_add(list(x[1])), - reverse=True) - cur_hyps = next_hyps[:beam_size] - hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] - return hyps, encoder_out - - def ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[int]: - """ Apply CTC prefix beam search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[int]: CTC prefix beam search nbest results - """ - hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, - beam_size, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) - return hyps[0] - - def attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - ctc_weight: float = 0.0, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ) -> List[int]: - """ Apply attention rescoring decoding, CTC prefix beam search - is applied first to get nbest, then we resoring the nbest on - attention decoder with corresponding encoder out - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - reverse_weight (float): right to left decoder weight - ctc_weight (float): ctc score weight - - Returns: - List[int]: Attention rescoring result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, speech_lengths, beam_size, decoding_chunk_size, - num_decoding_left_chunks, simulate_streaming) - - assert len(hyps) == beam_size - hyps_pad = pad_sequence([ - torch.tensor(hyp[0], device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
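For reference, the `_ctc_prefix_beam_search` removed above tracks two log-probabilities per prefix, blank-ending (`pb`) and non-blank-ending (`pnb`), and merges them with `log_add`. A compact standalone sketch of that update rule, on toy per-frame log-probabilities and without the per-frame top-k prune the real code applies, looks roughly like this:

```python
import math
from collections import defaultdict

def log_add(args):
    # Same idea as wenet.utils.common.log_add
    if all(a == -float('inf') for a in args):
        return -float('inf')
    a_max = max(args)
    return a_max + math.log(sum(math.exp(a - a_max) for a in args))

def ctc_prefix_beam_search(log_probs, beam_size=3, blank=0):
    """log_probs: T x V list of per-frame log-probabilities."""
    cur_hyps = [((), (0.0, -float('inf')))]                 # prefix -> (pb, pnb)
    for frame in log_probs:
        next_hyps = defaultdict(lambda: (-float('inf'), -float('inf')))
        for s, ps in enumerate(frame):
            for prefix, (pb, pnb) in cur_hyps:
                last = prefix[-1] if prefix else None
                if s == blank:                              # blank keeps the prefix
                    n_pb, n_pnb = next_hyps[prefix]
                    next_hyps[prefix] = (log_add([n_pb, pb + ps, pnb + ps]), n_pnb)
                elif s == last:                             # repeated symbol: *ss -> *s and *s-s -> *ss
                    n_pb, n_pnb = next_hyps[prefix]
                    next_hyps[prefix] = (n_pb, log_add([n_pnb, pnb + ps]))
                    n_prefix = prefix + (s,)
                    n_pb, n_pnb = next_hyps[n_prefix]
                    next_hyps[n_prefix] = (n_pb, log_add([n_pnb, pb + ps]))
                else:                                       # new symbol extends the prefix
                    n_prefix = prefix + (s,)
                    n_pb, n_pnb = next_hyps[n_prefix]
                    next_hyps[n_prefix] = (n_pb, log_add([n_pnb, pb + ps, pnb + ps]))
        cur_hyps = sorted(next_hyps.items(),
                          key=lambda x: log_add(list(x[1])), reverse=True)[:beam_size]
    return [(p, log_add(list(s))) for p, s in cur_hyps]

# Toy 2-frame, 3-symbol example
print(ctc_prefix_beam_search([[-0.1, -2.3, -2.3], [-2.3, -0.2, -2.0]]))
```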
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp[0]): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp[0])][self.eos] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp[0]): - r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] - r_score += r_decoder_out[i][len(hyp[0])][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score += hyp[1] * ctc_weight - if score > best_score: - best_score = score - best_index = i - return hyps[best_index][0], best_score - - def load_hlg_resource_if_necessary(self, hlg, word): - if not hasattr(self, 'hlg'): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.hlg = k2.Fsa.from_dict(torch.load(hlg, map_location=device)) - if not hasattr(self.hlg, "lm_scores"): - self.hlg.lm_scores = self.hlg.scores.clone() - if not hasattr(self, 'word_table'): - self.word_table = {} - with open(word, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - self.word_table[int(arr[1])] = arr[0] - - @torch.no_grad() - def hlg_onebest( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - best_path = one_best_decoding(lattice=lattice, use_double_scores=True) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.no_grad() - def hlg_rescore( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - lm_scale: float = 0, - decoder_scale: float = 0, - r_decoder_scale: float = 0, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - device = speech.device - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - 
search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=100, - use_double_scores=True, - nbest_scale=0.5,) - nbest = nbest.intersect(lattice) - assert hasattr(nbest.fsa, "lm_scores") - assert hasattr(nbest.fsa, "tokens") - assert isinstance(nbest.fsa.tokens, torch.Tensor) - - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.tokens) - tokens = tokens.remove_values_leq(0) - hyps = tokens.tolist() - - # cal attention_score - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out_repeat = [] - tot_scores = nbest.tot_scores() - repeats = [tot_scores[i].shape[0] for i in range(tot_scores.dim0)] - for i in range(len(encoder_out)): - encoder_out_repeat.append(encoder_out[i: i + 1].repeat(repeats[i], 1, 1)) - encoder_out = torch.concat(encoder_out_repeat, dim=0) - encoder_mask = torch.ones(encoder_out.size(0), - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - reverse_weight = 0.5 - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out - - decoder_scores = torch.tensor([sum([decoder_out[i, j, hyps[i][j]] - for j in range(len(hyps[i]))]) - for i in range(len(hyps))], device=device) - r_decoder_scores = [] - for i in range(len(hyps)): - score = 0 - for j in range(len(hyps[i])): - score += r_decoder_out[i, len(hyps[i]) - j - 1, hyps[i][j]] - score += r_decoder_out[i, len(hyps[i]), self.eos] - r_decoder_scores.append(score) - r_decoder_scores = torch.tensor(r_decoder_scores, device=device) - - am_scores = nbest.compute_am_scores() - ngram_lm_scores = nbest.compute_lm_scores() - tot_scores = am_scores.values + lm_scale * ngram_lm_scores.values + \ - decoder_scale * decoder_scores + r_decoder_scale * r_decoder_scores - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.jit.export - def subsampling_rate(self) -> int: - """ Export interface for c++ call, return subsampling_rate of the - model - """ - return self.encoder.embed.subsampling_rate - - @torch.jit.export - def right_context(self) -> int: - """ Export interface for c++ call, return right_context of the model - """ - return self.encoder.embed.right_context - - @torch.jit.export - def sos_symbol(self) -> int: - """ Export interface for c++ call, return sos symbol id of the model - """ - return self.sos - - @torch.jit.export - def eos_symbol(self) -> int: - """ Export interface for c++ call, return eos symbol id of the model - """ - return self.eos - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, give input chunk xs, and return - output from time 0 to current chunk. - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor: - """ Export interface for c++ call, apply linear transform and log - softmax before ctc - Args: - xs (torch.Tensor): encoder output - - Returns: - torch.Tensor: activation before ctc - - """ - return self.ctc.log_softmax(xs) - - @torch.jit.export - def is_bidirectional_decoder(self) -> bool: - """ - Returns: - torch.Tensor: decoder output - """ - if hasattr(self.decoder, 'right_decoder'): - return True - else: - return False - - @torch.jit.export - def forward_attention_decoder( - self, - hyps: torch.Tensor, - hyps_lens: torch.Tensor, - encoder_out: torch.Tensor, - reverse_weight: float = 0, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, forward decoder with multiple - hypothesis from ctc prefix beam search and one encoder output - Args: - hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad sos at the begining - hyps_lens (torch.Tensor): length of each hyp in hyps - encoder_out (torch.Tensor): corresponding encoder output - r_hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad eos at the begining which is used fo right to left decoder - reverse_weight: used for verfing whether used right to left decoder, - > 0 will use. - - Returns: - torch.Tensor: decoder output - """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps - encoder_out = encoder_out.repeat(num_hyps, 1, 1) - encoder_mask = torch.ones(num_hyps, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=encoder_out.device) - - # input for right to left decoder - # this hyps_lens has count token, we need minus it. - r_hyps_lens = hyps_lens - 1 - # this hyps has included token, so it should be - # convert the original hyps. - r_hyps = hyps[:, 1:] - # >>> r_hyps - # >>> tensor([[ 1, 2, 3], - # >>> [ 9, 8, 4], - # >>> [ 2, -1, -1]]) - # >>> r_hyps_lens - # >>> tensor([3, 3, 1]) - - # NOTE(Mddct): `pad_sequence` is not supported by ONNX, it is used - # in `reverse_pad_list` thus we have to refine the below code. 
- # Issue: https://github.com/wenet-e2e/wenet/issues/1113 - # Equal to: - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = torch.max(r_hyps_lens) - index_range = torch.arange(0, max_len, 1).to(encoder_out.device) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - # >>> seq_mask - # >>> tensor([[ True, True, True], - # >>> [ True, True, True], - # >>> [ True, False, False]]) - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - r_hyps = torch.gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = torch.where(seq_mask, r_hyps, self.eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - r_hyps = torch.cat([hyps[:, 0:1], r_hyps], dim=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps, hyps_lens, r_hyps, - reverse_weight) # (num_hyps, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - - # right to left decoder may be not used during decoding process, - # which depends on reverse_weight param. - # r_dccoder_out will be 0.0, if reverse_weight is 0.0 - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - return decoder_out, r_decoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/attention.py deleted file mode 100644 index 6ee5e313edf2e88a844ce004c0f819b0bd3260f6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/attention.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple - -import torch -from torch import nn - - -class MultiHeadedAttention(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, n_head: int, n_feat: int, dropout_rate: float): - """Construct an MultiHeadedAttention object.""" - super().__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward_qkv( - self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor, size - (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor, size - (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor, size - (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. 
- - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - 1.When applying cross attention between decoder and encoder, - the batch padding mask for input is in (#batch, 1, T) shape. - 2.When applying self attention of encoder, - the mask is in (#batch, T, T) shape. - 3.When applying self attention of decoder, - the mask is in (#batch, L, L) shape. - 4.If the different position in decoder see different block - of the encoder, such as Mocha, the passed in mask could be - in (#batch, L, T) shape. But there is no such case in current - Wenet. - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - """ - q, k, v = self.forward_qkv(query, key, value) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. - new_cache = torch.cat((k, v), dim=-1) - - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - return self.forward_attention(v, scores, mask), new_cache - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). 
- zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
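The cache handling in the removed attention code depends on `torch.cat`/`torch.split` behaving sensibly on zero-sized tensors, so the same exported graph serves both the first chunk (empty cache) and later chunks. A quick sanity check of that property with toy shapes:

```python
import torch

# First chunk: an empty KV cache with 0 frames along the time axis.
head, d_k = 4, 16
empty_cache = torch.zeros(1, head, 0, 2 * d_k)
k_new = torch.randn(1, head, 8, d_k)
v_new = torch.randn(1, head, 8, d_k)

# Split the cache into its key/value halves (both zero-length in time).
key_cache, value_cache = torch.split(empty_cache, empty_cache.size(-1) // 2, dim=-1)
k = torch.cat([key_cache, k_new], dim=2)   # concatenating a 0-frame cache is a no-op
v = torch.cat([value_cache, v_new], dim=2)
assert torch.equal(k, k_new) and torch.equal(v, v_new)

# The updated cache handed to the next chunk packs K and V back together.
new_cache = torch.cat((k, v), dim=-1)
print(new_cache.shape)  # torch.Size([1, 4, 8, 32])
```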
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/cmvn.py deleted file mode 100644 index 3a1e7457fd3788d9a7e031e96517505a65925102..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/cmvn.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -class GlobalCMVN(torch.nn.Module): - def __init__(self, - mean: torch.Tensor, - istd: torch.Tensor, - norm_var: bool = True): - """ - Args: - mean (torch.Tensor): mean stats - istd (torch.Tensor): inverse std, std which is 1.0 / std - """ - super().__init__() - assert mean.shape == istd.shape - self.norm_var = norm_var - # The buffer can be accessed from this module using self.mean - self.register_buffer("mean", mean) - self.register_buffer("istd", istd) - - def forward(self, x: torch.Tensor): - """ - Args: - x (torch.Tensor): (batch, max_len, feat_dim) - - Returns: - (torch.Tensor): normalized feature - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/convolution.py deleted file mode 100644 index 2cf9794e14ea7441ccd30ab52202ac02fb25c2b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/convolution.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). 
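The removed `ConvolutionModule` follows the layout pointwise conv → GLU → depthwise conv, and in the causal case pads `kernel_size - 1` frames on the left so the time dimension is preserved. A minimal sketch of that path with toy dimensions (omitting the norm, activation, and second pointwise conv):

```python
import torch
from torch import nn

channels, kernel_size, time = 8, 15, 20
x = torch.randn(2, time, channels).transpose(1, 2)           # (batch, channels, time)

pointwise1 = nn.Conv1d(channels, 2 * channels, kernel_size=1)
depthwise = nn.Conv1d(channels, channels, kernel_size,
                      padding=0, groups=channels)             # causal: no built-in padding
lorder = kernel_size - 1                                      # left context a causal conv needs

h = nn.functional.glu(pointwise1(x), dim=1)                   # (batch, channels, time)
h = nn.functional.pad(h, (lorder, 0))                         # pad only on the left
h = depthwise(h)
assert h.size(2) == time                                      # time dimension preserved
print(h.shape)
```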
- """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. - new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/ctc.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/ctc.py deleted file mode 100644 index 3dfcbaa324ffc26afa9ceaeb75007eb312546326..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/ctc.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -import torch -import torch.nn.functional as F -from typeguard import check_argument_types - - -class CTC(torch.nn.Module): - """CTC module""" - def __init__( - self, - odim: int, - encoder_output_size: int, - dropout_rate: float = 0.0, - reduce: bool = True, - ): - """ Construct CTC module - Args: - odim: dimension of outputs - encoder_output_size: number of encoder projection units - dropout_rate: dropout rate (0.0 ~ 1.0) - reduce: reduce the CTC loss into a scalar - """ - assert check_argument_types() - super().__init__() - eprojs = encoder_output_size - self.dropout_rate = dropout_rate - self.ctc_lo = torch.nn.Linear(eprojs, odim) - - reduction_type = "sum" if reduce else "none" - self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) - - def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, - ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: - """Calculate CTC loss. 
- - Args: - hs_pad: batch of padded hidden state sequences (B, Tmax, D) - hlens: batch of lengths of hidden state sequences (B) - ys_pad: batch of padded character id sequence tensor (B, Lmax) - ys_lens: batch of lengths of character sequence (B) - """ - # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) - ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) - # ys_hat: (B, L, D) -> (L, B, D) - ys_hat = ys_hat.transpose(0, 1) - ys_hat = ys_hat.log_softmax(2) - loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) - # Batch-size average - loss = loss / ys_hat.size(1) - return loss - - def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """log_softmax of frame activations - - Args: - Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) - """ - return F.log_softmax(self.ctc_lo(hs_pad), dim=2) - - def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """argmax of frame activations - - Args: - torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: argmax applied 2d tensor (B, Tmax) - """ - return torch.argmax(self.ctc_lo(hs_pad), dim=2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/decoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/decoder.py deleted file mode 100644 index c31853d9e868c99290b8d597f53d9a680202c82c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/decoder.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Decoder definition.""" -from typing import Tuple, List, Optional - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.decoder_layer import DecoderLayer -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.utils.mask import (subsequent_mask, make_pad_mask) - - -class TransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. 
- concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__() - attention_dim = encoder_output_size - - if input_layer == "embed": - self.embed = torch.nn.Sequential( - torch.nn.Embedding(vocab_size, attention_dim), - PositionalEncoding(attention_dim, positional_dropout_rate), - ) - else: - raise ValueError(f"only 'embed' is supported: {input_layer}") - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) - self.use_output_layer = use_output_layer - self.output_layer = torch.nn.Linear(attention_dim, vocab_size) - self.num_blocks = num_blocks - self.decoders = torch.nn.ModuleList([ - DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(self.num_blocks) - ]) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor = torch.empty(0), - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: not used in transformer decoder, in order to unify api - with bidirectional decoder - reverse_weight: not used in transformer decoder, in order to unify - api with bidirectional decode - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - torch.tensor(0.0), in order to unify api with bidirectional decoder - olens: (batch, ) - """ - tgt = ys_in_pad - maxlen = tgt.size(1) - # tgt_mask: (B, 1, L) - tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) - tgt_mask = tgt_mask.to(tgt.device) - # m: (1, L, L) - m = subsequent_mask(tgt_mask.size(-1), - device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m - x, _ = self.embed(tgt) - for layer in self.decoders: - x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, - memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.use_output_layer: - x = self.output_layer(x) - olens = tgt_mask.sum(1) - return x, torch.tensor(0.0), olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - x, _ = self.embed(tgt) - new_cache = [] - for i, decoder in enumerate(self.decoders): - if cache is None: - c = None - else: - c = cache[i] - x, tgt_mask, memory, memory_mask = decoder(x, - tgt_mask, - memory, - memory_mask, - cache=c) - new_cache.append(x) - if self.normalize_before: - y = self.after_norm(x[:, -1]) - else: - y = x[:, -1] - if self.use_output_layer: - y = torch.log_softmax(self.output_layer(y), dim=-1) - return y, new_cache - - -class BiTransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - r_num_blocks: the number of right to left decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - r_num_blocks: int = 0, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - - assert check_argument_types() - super().__init__() - self.left_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - self.right_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - r_num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor, - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), - used for right to left decoder - reverse_weight: used for right to left decoder - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - r_x: x: decoded token score (right to left decoder) - before softmax (batch, maxlen_out, vocab_size) - if use_output_layer is True, - olens: (batch, ) - """ - l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, - ys_in_lens) - r_x = torch.tensor(0.0) - if reverse_weight > 0.0: - r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, - ys_in_lens) - return l_x, r_x, olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - return self.left_decoder.forward_one_step(memory, memory_mask, tgt, - tgt_mask, cache) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/decoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/decoder_layer.py deleted file mode 100644 index 6b52aa6ab730dc51b18f0787e8236ab10c1e9cad..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/decoder_layer.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoder self-attention layer definition.""" -from typing import Optional, Tuple - -import torch -from torch import nn - - -class DecoderLayer(nn.Module): - """Single decoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn (torch.nn.Module): Inter-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. 
- dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's inpu - and output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: nn.Module, - src_attn: nn.Module, - feed_forward: nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an DecoderLayer object.""" - super().__init__() - self.size = size - self.self_attn = self_attn - self.src_attn = src_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.norm3 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear1 = nn.Linear(size + size, size) - self.concat_linear2 = nn.Linear(size + size, size) - else: - self.concat_linear1 = nn.Identity() - self.concat_linear2 = nn.Identity() - - def forward( - self, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - memory: torch.Tensor, - memory_mask: torch.Tensor, - cache: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute decoded features. - - Args: - tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). - tgt_mask (torch.Tensor): Mask for input tensor - (#batch, maxlen_out). - memory (torch.Tensor): Encoded memory - (#batch, maxlen_in, size). - memory_mask (torch.Tensor): Encoded memory mask - (#batch, maxlen_in). - cache (torch.Tensor): cached tensors. - (#batch, maxlen_out - 1, size). - - Returns: - torch.Tensor: Output tensor (#batch, maxlen_out, size). - torch.Tensor: Mask for output tensor (#batch, maxlen_out). - torch.Tensor: Encoded memory (#batch, maxlen_in, size). - torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
- - """ - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - - if cache is None: - tgt_q = tgt - tgt_q_mask = tgt_mask - else: - # compute only the last frame query keeping dim: max_time_out -> 1 - assert cache.shape == ( - tgt.shape[0], - tgt.shape[1] - 1, - self.size, - ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" - tgt_q = tgt[:, -1:, :] - residual = residual[:, -1:, :] - tgt_q_mask = tgt_mask[:, -1:, :] - - if self.concat_after: - tgt_concat = torch.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) - x = residual + self.concat_linear1(tgt_concat) - else: - x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - if self.concat_after: - x_concat = torch.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) - x = residual + self.concat_linear2(x_concat) - else: - x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) - if not self.normalize_before: - x = self.norm2(x) - - residual = x - if self.normalize_before: - x = self.norm3(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm3(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, tgt_mask, memory, memory_mask diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/embedding.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/embedding.py deleted file mode 100644 index 611a927864d93c3ad8357f66c780bf537b2a4d67..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/embedding.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Positonal Encoding Module.""" - -import math -from typing import Tuple, Union - -import torch -import torch.nn.functional as F - -class PositionalEncoding(torch.nn.Module): - """Positional encoding. 
- - :param int d_model: embedding dim - :param float dropout_rate: dropout rate - :param int max_len: maximum input length - - PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) - PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) - """ - def __init__(self, - d_model: int, - dropout_rate: float, - max_len: int = 5000, - reverse: bool = False): - """Construct an PositionalEncoding object.""" - super().__init__() - self.d_model = d_model - self.xscale = math.sqrt(self.d_model) - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - - self.pe = torch.zeros(self.max_len, self.d_model) - position = torch.arange(0, self.max_len, - dtype=torch.float32).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = torch.sin(position * div_term) - self.pe[:, 1::2] = torch.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input. Its shape is (batch, time, ...) - offset (int, torch.tensor): position offset - - Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) - torch.Tensor: for compatibility to RelPositionalEncoding - """ - - self.pe = self.pe.to(x.device) - pos_emb = self.position_encoding(offset, x.size(1), False) - x = x * self.xscale + pos_emb - return self.dropout(x), self.dropout(pos_emb) - - def position_encoding(self, offset: Union[int, torch.Tensor], size: int, - apply_dropout: bool = True) -> torch.Tensor: - """ For getting encoding in a streaming fashion - - Attention!!!!! - we apply dropout only once at the whole utterance level in a none - streaming way, but will call this function several times with - increasing input size in a streaming scenario, so the dropout will - be applied several times. - - Args: - offset (int or torch.tensor): start offset - size (int): required size of position encoding - - Returns: - torch.Tensor: Corresponding encoding - """ - # How to subscript a Union type: - # https://github.com/pytorch/pytorch/issues/69434 - if isinstance(offset, int): - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - else: # for batched streaming decoding on GPU - assert torch.max(offset) + size < self.max_len - index = offset.unsqueeze(1) + \ - torch.arange(0, size).to(offset.device) # B X T - flag = index > 0 - # remove negative offset - index = index * flag - pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model - - if apply_dropout: - pos_emb = self.dropout(pos_emb) - return pos_emb - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding module. - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - """ - def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): - """Initialize class.""" - super().__init__(d_model, dropout_rate, max_len, reverse=True) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Compute positional encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). 
- Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Positional embedding tensor (1, time, `*`). - """ - self.pe = self.pe.to(x.device) - x = x * self.xscale - pos_emb = self.position_encoding(offset, x.size(1), False) - return self.dropout(x), self.dropout(pos_emb) - - -class NoPositionalEncoding(torch.nn.Module): - """ No position encoding - """ - def __init__(self, d_model: int, dropout_rate: float): - super().__init__() - self.d_model = d_model - self.dropout = torch.nn.Dropout(p=dropout_rate) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """ Just return zero vector for interface compatibility - """ - pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) - return self.dropout(x), pos_emb - - def position_encoding( - self, offset: Union[int, torch.Tensor], size: int) -> torch.Tensor: - return torch.zeros(1, size, self.d_model) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/encoder.py deleted file mode 100644 index bb2ec65827548bd1242cb3b367cb3983c2de6119..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/encoder.py +++ /dev/null @@ -1,462 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder definition.""" -from typing import Tuple - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.convolution import ConvolutionModule -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.encoder_layer import TransformerEncoderLayer -from wenet.transformer.encoder_layer import ConformerEncoderLayer -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class BaseEncoder(torch.nn.Module): - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ - Args: - input_size (int): input dim - output_size (int): dimension of attention - attention_heads (int): the number of heads of multi head attention - linear_units (int): the hidden units number of position-wise feed - forward - num_blocks (int): the number of decoder blocks - dropout_rate (float): dropout rate - attention_dropout_rate (float): dropout rate in attention - positional_dropout_rate (float): dropout rate after adding - positional encoding - input_layer (str): input layer type. - optional [linear, conv2d, conv2d6, conv2d8] - pos_enc_layer_type (str): Encoder positional encoding layer type. - opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] - normalize_before (bool): - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after (bool): whether to concat attention layer's input - and output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - static_chunk_size (int): chunk size for static chunk training and - decoding - use_dynamic_chunk (bool): whether use dynamic chunk size for - training or not, You can only use fixed chunk(chunk_size > 0) - or dyanmic chunk size(use_dynamic_chunk = True) - global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module - use_dynamic_left_chunk (bool): whether use dynamic left chunk in - dynamic chunk training - """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks - - -class TransformerEncoder(BaseEncoder): - """Transformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ Construct TransformerEncoder - - See Encoder for the meaning of each parameter. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - self.encoders = torch.nn.ModuleList([ - TransformerEncoderLayer( - output_size, - MultiHeadedAttention(attention_heads, output_size, - attention_dropout_rate), - PositionwiseFeedForward(output_size, linear_units, - dropout_rate), dropout_rate, - normalize_before, concat_after) for _ in range(num_blocks) - ]) - - -class ConformerEncoder(BaseEncoder): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - positionwise_conv_kernel_size: int = 1, - macaron_style: bool = True, - selfattention_layer_type: str = "rel_selfattn", - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - ): - """Construct ConformerEncoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - positionwise_conv_kernel_size (int): Kernel size of positionwise - conv1d layer. - macaron_style (bool): Whether to use macaron style for - positionwise layer. - selfattention_layer_type (str): Encoder attention layer type, - the parameter has no effect now, it's just for configure - compatibility. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, - cnn_module_norm, causal) - - self.encoders = torch.nn.ModuleList([ - ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(num_blocks) - ]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/encoder_layer.py deleted file mode 100644 index 6b4629a6802a90422fa1494f82f46488f2553c16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/encoder_layer.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple - -import torch -from torch import nn - - -class TransformerEncoderLayer(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: torch.nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): just for interface compatibility - to ConformerEncoderLayer - mask_pad (torch.Tensor): does not used in transformer layer, - just for unified api with conformer. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2), not used here, it's for interface - compatibility to ConformerEncoderLayer. - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). - - """ - residual = x - if self.normalize_before: - x = self.norm1(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, cache=att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - return x, mask, new_att_cache, fake_cnn_cache - - -class ConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/label_smoothing_loss.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/label_smoothing_loss.py deleted file mode 100644 index 428fedcb0eb4345cd1361c97008a9afcd94ac171..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/label_smoothing_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Label smoothing module.""" - -import torch -from torch import nn - - -class LabelSmoothingLoss(nn.Module): - """Label-smoothing loss. - - In a standard CE loss, the label's data distribution is: - [0,1,2] -> - [ - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0], - ] - - In the smoothing version CE Loss,some probabilities - are taken from the true label prob (1.0) and are divided - among other labels. - - e.g. 
- smoothing=0.1 - [0,1,2] -> - [ - [0.9, 0.05, 0.05], - [0.05, 0.9, 0.05], - [0.05, 0.05, 0.9], - ] - - Args: - size (int): the number of class - padding_idx (int): padding class id which will be ignored for loss - smoothing (float): smoothing rate (0.0 means the conventional CE) - normalize_length (bool): - normalize loss by sequence length if True - normalize loss by batch size if False - """ - def __init__(self, - size: int, - padding_idx: int, - smoothing: float, - normalize_length: bool = False): - """Construct an LabelSmoothingLoss object.""" - super(LabelSmoothingLoss, self).__init__() - self.criterion = nn.KLDivLoss(reduction="none") - self.padding_idx = padding_idx - self.confidence = 1.0 - smoothing - self.smoothing = smoothing - self.size = size - self.normalize_length = normalize_length - - def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """Compute loss between x and target. - - The model outputs and data labels tensors are flatten to - (batch*seqlen, class) shape and a mask is applied to the - padding part which should not be calculated for loss. - - Args: - x (torch.Tensor): prediction (batch, seqlen, class) - target (torch.Tensor): - target signal masked with self.padding_id (batch, seqlen) - Returns: - loss (torch.Tensor) : The KL loss, scalar float value - """ - assert x.size(2) == self.size - batch_size = x.size(0) - x = x.view(-1, self.size) - target = target.view(-1) - # use zeros_like instead of torch.no_grad() for true_dist, - # since no_grad() can not be exported by JIT - true_dist = torch.zeros_like(x) - true_dist.fill_(self.smoothing / (self.size - 1)) - ignore = target == self.padding_idx # (B,) - total = len(target) - ignore.sum().item() - target = target.masked_fill(ignore, 0) # avoid -1 index - true_dist.scatter_(1, target.unsqueeze(1), self.confidence) - kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) - denom = total if self.normalize_length else batch_size - return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/positionwise_feed_forward.py deleted file mode 100644 index 73ba239e3f1e68f65650961f2c4ee6758729a06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/positionwise_feed_forward.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. 
- dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU()): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/subsampling.py deleted file mode 100644 index 5f2823eedf0e623188d6af6680fa50ca44b47877..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/subsampling.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch - - -class BaseSubsampling(torch.nn.Module): - def __init__(self): - super().__init__() - self.right_context = 0 - self.subsampling_rate = 1 - - def position_encoding(self, offset: Union[int, torch.Tensor], - size: int) -> torch.Tensor: - return self.pos_enc.position_encoding(offset, size) - - -class LinearNoSubsampling(BaseSubsampling): - """Linear transform the input without subsampling - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an linear object.""" - super().__init__() - self.out = torch.nn.Sequential( - torch.nn.Linear(idim, odim), - torch.nn.LayerNorm(odim, eps=1e-5), - torch.nn.Dropout(dropout_rate), - ) - self.pos_enc = pos_enc_class - self.right_context = 0 - self.subsampling_rate = 1 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Input x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: linear input tensor (#batch, time', odim), - where time' = time . - torch.Tensor: linear input mask (#batch, 1, time'), - where time' = time . - - """ - x = self.out(x) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask - - -class Conv2dSubsampling4(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). 
- - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 4. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 4. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] - - -class Conv2dSubsampling6(BaseSubsampling): - """Convolutional 2D subsampling (to 1/6 length). - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (torch.nn.Module): Custom position encoding layer. - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling6 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 5, 3), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), - odim) - self.pos_enc = pos_enc_class - # 10 = (3 - 1) * 1 + (5 - 1) * 2 - self.subsampling_rate = 6 - self.right_context = 10 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 6. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 6. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] - - -class Conv2dSubsampling8(BaseSubsampling): - """Convolutional 2D subsampling (to 1/8 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling8 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear( - odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) - self.pos_enc = pos_enc_class - self.subsampling_rate = 8 - # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 - self.right_context = 14 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 8. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 8. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/swish.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/swish.py deleted file mode 100644 index b4250f5c93104f38958d145572e363256e03fcb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/transformer/swish.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) -# 2020 Northwestern Polytechnical University (Pengcheng Guo) -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Swish() activation function for Conformer.""" - -import torch - - -class Swish(torch.nn.Module): - """Construct an Swish object.""" - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Return Swish activation function.""" - return x * torch.sigmoid(x) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/checkpoint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/checkpoint.py deleted file mode 100644 index 8e0c413c79c34cd667240357d7ef9eab816a885c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/checkpoint.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import re - -import yaml -import torch -from collections import OrderedDict - -import datetime - - -def load_checkpoint(model: torch.nn.Module, path: str) -> dict: - if torch.cuda.is_available(): - logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) - checkpoint = torch.load(path) - else: - logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) - checkpoint = torch.load(path, map_location='cpu') - model.load_state_dict(checkpoint, strict=False) - info_path = re.sub('.pt$', '.yaml', path) - configs = {} - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - return configs - - -def save_checkpoint(model: torch.nn.Module, path: str, infos=None): - ''' - Args: - infos (dict or None): any info you want to save. - ''' - logging.info('Checkpoint: save to checkpoint %s' % path) - if isinstance(model, torch.nn.DataParallel): - state_dict = model.module.state_dict() - elif isinstance(model, torch.nn.parallel.DistributedDataParallel): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save(state_dict, path) - info_path = re.sub('.pt$', '.yaml', path) - if infos is None: - infos = {} - infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') - with open(info_path, 'w') as fout: - data = yaml.dump(infos) - fout.write(data) - - -def filter_modules(model_state_dict, modules): - new_mods = [] - incorrect_mods = [] - mods_model = model_state_dict.keys() - for mod in modules: - if any(key.startswith(mod) for key in mods_model): - new_mods += [mod] - else: - incorrect_mods += [mod] - if incorrect_mods: - logging.warning( - "module(s) %s don't match or (partially match) " - "available modules in model.", - incorrect_mods, - ) - logging.warning("for information, the existing modules in model are:") - logging.warning("%s", mods_model) - - return new_mods - - -def load_trained_modules(model: torch.nn.Module, args: None): - # Load encoder modules with pre-trained model(s). 
- enc_model_path = args.enc_init - enc_modules = args.enc_init_mods - main_state_dict = model.state_dict() - logging.warning("model(s) found for pre-initialization") - if os.path.isfile(enc_model_path): - logging.info('Checkpoint: loading from checkpoint %s for CPU' % - enc_model_path) - model_state_dict = torch.load(enc_model_path, map_location='cpu') - modules = filter_modules(model_state_dict, enc_modules) - partial_state_dict = OrderedDict() - for key, value in model_state_dict.items(): - if any(key.startswith(m) for m in modules): - partial_state_dict[key] = value - main_state_dict.update(partial_state_dict) - else: - logging.warning("model was not found : %s", enc_model_path) - - model.load_state_dict(main_state_dict) - configs = {} - return configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/cmvn.py deleted file mode 100644 index 3101c619f54991c947124f393f3459c317356a2f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/cmvn.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import math - -import numpy as np - - -def _load_json_cmvn(json_cmvn_file): - """ Load the json format cmvn stats file and calculate cmvn - - Args: - json_cmvn_file: cmvn stats file in json format - - Returns: - a numpy array of [means, vars] - """ - with open(json_cmvn_file) as f: - cmvn_stats = json.load(f) - - means = cmvn_stats['mean_stat'] - variance = cmvn_stats['var_stat'] - count = cmvn_stats['frame_num'] - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def _load_kaldi_cmvn(kaldi_cmvn_file): - """ Load the kaldi format cmvn stats file and calculate cmvn - - Args: - kaldi_cmvn_file: kaldi text style global cmvn file, which - is generated by: - compute-cmvn-stats --binary=false scp:feats.scp global_cmvn - - Returns: - a numpy array of [means, vars] - """ - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def load_cmvn(cmvn_file, is_json): - if is_json: - cmvn = _load_json_cmvn(cmvn_file) - else: - cmvn = _load_kaldi_cmvn(cmvn_file) - return cmvn[0], cmvn[1] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/common.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/common.py deleted file mode 100644 index 74238d59aefbf227fe6b811703af17550bc7f8f0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/common.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -"""Unility functions for Transformer.""" - -import math -from typing import List, Tuple - -import torch -from torch.nn.utils.rnn import pad_sequence - -IGNORE_ID = -1 - - -def pad_list(xs: List[torch.Tensor], pad_value: int): - """Perform padding for the list of tensors. - - Args: - xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 
- pad_value (float): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tmax, `*`). - - Examples: - >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) - - """ - n_batch = len(xs) - max_len = max([x.size(0) for x in xs]) - pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) - pad = pad.fill_(pad_value) - for i in range(n_batch): - pad[i, :xs[i].size(0)] = xs[i] - - return pad - - -def add_blank(ys_pad: torch.Tensor, blank: int, - ignore_id: int) -> torch.Tensor: - """ Prepad blank for transducer predictor - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - blank (int): index of - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> blank = 0 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in = add_blank(ys_pad, 0, -1) - >>> ys_in - tensor([[0, 1, 2, 3, 4, 5], - [0, 4, 5, 6, 0, 0], - [0, 7, 8, 9, 0, 0]]) - """ - bs = ys_pad.size(0) - _blank = torch.tensor([blank], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _blank = _blank.repeat(bs).unsqueeze(1) # [bs,1] - out = torch.cat([_blank, ys_pad], dim=1) # [bs, Lmax+1] - return torch.where(out == ignore_id, blank, out) - - -def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, - ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Add and labels. - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - sos (int): index of - eos (int): index of - ignore_id (int): index of padding - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - ys_out (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> sos_id = 10 - >>> eos_id = 11 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) - >>> ys_in - tensor([[10, 1, 2, 3, 4, 5], - [10, 4, 5, 6, 11, 11], - [10, 7, 8, 9, 11, 11]]) - >>> ys_out - tensor([[ 1, 2, 3, 4, 5, 11], - [ 4, 5, 6, 11, -1, -1], - [ 7, 8, 9, 11, -1, -1]]) - """ - _sos = torch.tensor([sos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _eos = torch.tensor([eos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys - ys_in = [torch.cat([_sos, y], dim=0) for y in ys] - ys_out = [torch.cat([y, _eos], dim=0) for y in ys] - return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) - - -def reverse_pad_list(ys_pad: torch.Tensor, - ys_lens: torch.Tensor, - pad_value: float = -1.0) -> torch.Tensor: - """Reverse padding for the list of tensors. - - Args: - ys_pad (tensor): The padded tensor (B, Tokenmax). - ys_lens (tensor): The lens of token seqs (B) - pad_value (int): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tokenmax). - - Examples: - >>> x - tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) - >>> pad_list(x, 0) - tensor([[4, 3, 2, 1], - [7, 6, 5, 0], - [9, 8, 0, 0]]) - - """ - r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) - for y, i in zip(ys_pad, ys_lens)], True, - pad_value) - return r_ys_pad - - -def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, - ignore_label: int) -> float: - """Calculate accuracy. - - Args: - pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 
- pad_targets (LongTensor): Target label tensors (B, Lmax). - ignore_label (int): Ignore label id. - - Returns: - float: Accuracy value (0.0 - 1.0). - - """ - pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), - pad_outputs.size(1)).argmax(2) - mask = pad_targets != ignore_label - numerator = torch.sum( - pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - denominator = torch.sum(mask) - return float(numerator) / float(denominator) - - -def get_rnn(rnn_type: str) -> torch.nn.Module: - assert rnn_type in ["rnn", "lstm", "gru"] - if rnn_type == "rnn": - return torch.nn.RNN - elif rnn_type == "lstm": - return torch.nn.LSTM - else: - return torch.nn.GRU - - -def get_activation(act): - """Return activation function.""" - # Lazy load to avoid unused import - from wenet.transformer.swish import Swish - - activation_funcs = { - "hardtanh": torch.nn.Hardtanh, - "tanh": torch.nn.Tanh, - "relu": torch.nn.ReLU, - "selu": torch.nn.SELU, - "swish": getattr(torch.nn, "SiLU", Swish), - "gelu": torch.nn.GELU - } - - return activation_funcs[act]() - - -def get_subsample(config): - input_layer = config["encoder_conf"]["input_layer"] - assert input_layer in ["conv2d", "conv2d6", "conv2d8"] - if input_layer == "conv2d": - return 4 - elif input_layer == "conv2d6": - return 6 - elif input_layer == "conv2d8": - return 8 - - -def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - if hyp[cur] != 0: - new_hyp.append(hyp[cur]) - prev = cur - while cur < len(hyp) and hyp[cur] == hyp[prev]: - cur += 1 - return new_hyp - - -def replace_duplicates_with_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - new_hyp.append(hyp[cur]) - prev = cur - cur += 1 - while cur < len(hyp) and hyp[cur] == hyp[prev] and hyp[cur] != 0: - new_hyp.append(0) - cur += 1 - return new_hyp - - -def log_add(args: List[int]) -> float: - """ - Stable log add - """ - if all(a == -float('inf') for a in args): - return -float('inf') - a_max = max(args) - lsp = math.log(sum(math.exp(a - a_max) for a in args)) - return a_max + lsp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/config.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/config.py deleted file mode 100644 index 50170ced44534d3ee6532a2f87fcd78c5148f7e7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 Shaoshang Qi -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import copy - -def override_config(configs, override_list): - new_configs = copy.deepcopy(configs) - for item in override_list: - arr = item.split() - if len(arr) != 2: - print(f"the overrive {item} format not correct, skip it") - continue - keys = arr[0].split('.') - s_configs = new_configs - for i, key in enumerate(keys): - if key not in s_configs: - print(f"the overrive {item} format not correct, skip it") - if i == len(keys) - 1: - param_type = type(s_configs[key]) - if param_type != bool: - s_configs[key] = param_type(arr[1]) - else: - s_configs[key] = arr[1] in ['true', 'True'] - print(f"override {arr[0]} with {arr[1]}") - else: - s_configs = s_configs[key] - return new_configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/ctc_util.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/ctc_util.py deleted file mode 100644 index 73b8fb272ac153dd6d05207f352ebcf1ad14890d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/ctc_util.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch - -def insert_blank(label, blank_id=0): - """Insert blank token between every two label token.""" - label = np.expand_dims(label, 1) - blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id - label = np.concatenate([blanks, label], axis=1) - label = label.reshape(-1) - label = np.append(label, label[0]) - return label - -def forced_align(ctc_probs: torch.Tensor, - y: torch.Tensor, - blank_id=0) -> list: - """ctc forced alignment. 
- - Args: - torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) - torch.Tensor y: id sequence tensor 1d tensor (L) - int blank_id: blank symbol index - Returns: - torch.Tensor: alignment result - """ - y_insert_blank = insert_blank(y, blank_id) - - log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) - log_alpha = log_alpha - float('inf') # log of zero - state_path = (torch.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 - ) # state path - - # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] - - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): - if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ - s] == y_insert_blank[s - 2]: - candidates = torch.tensor( - [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) - prev_state = [s, s - 1] - else: - candidates = torch.tensor([ - log_alpha[t - 1, s], - log_alpha[t - 1, s - 1], - log_alpha[t - 1, s - 2], - ]) - prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] - state_path[t, s] = prev_state[torch.argmax(candidates)] - - state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) - - candidates = torch.tensor([ - log_alpha[-1, len(y_insert_blank) - 1], - log_alpha[-1, len(y_insert_blank) - 2] - ]) - prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] - state_seq[-1] = prev_state[torch.argmax(candidates)] - for t in range(ctc_probs.size(0) - 2, -1, -1): - state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] - - output_alignment = [] - for t in range(0, ctc_probs.size(0)): - output_alignment.append(y_insert_blank[state_seq[t, 0]]) - - return output_alignment diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/executor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/executor.py deleted file mode 100644 index dc0b69e6e32055566a0e8c41945f6979276e5672..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/executor.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -from contextlib import nullcontext - -# if your python version < 3.7 use the below one -# from contextlib import suppress as nullcontext -import torch -from torch.nn.utils import clip_grad_norm_ - - -class Executor: - - def __init__(self): - self.step = 0 - - def train(self, model, optimizer, scheduler, data_loader, device, writer, - args, scaler): - ''' Train one epoch - ''' - model.train() - clip = args.get('grad_clip', 50.0) - log_interval = args.get('log_interval', 10) - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - accum_grad = args.get('accum_grad', 1) - is_distributed = args.get('is_distributed', True) - use_amp = args.get('use_amp', False) - logging.info('using accumulate grad, new batch size is {} times' - ' larger than before'.format(accum_grad)) - if use_amp: - assert scaler is not None - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - model_context = model.join - else: - model_context = nullcontext - num_seen_utts = 0 - with model_context(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - context = None - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if is_distributed and batch_idx % accum_grad != 0: - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - with context(): - # autocast context - # The more details about amp can be found in - # https://pytorch.org/docs/stable/notes/amp_examples.html - with torch.cuda.amp.autocast(scaler is not None): - loss_dict = model(feats, feats_lengths, target, - target_lengths) - loss = loss_dict['loss'] / accum_grad - if use_amp: - scaler.scale(loss).backward() - else: - loss.backward() - - num_seen_utts += num_utts - if batch_idx % accum_grad == 0: - if rank == 0 and writer is not None: - writer.add_scalar('train_loss', loss, self.step) - # Use mixed precision training - if use_amp: - scaler.unscale_(optimizer) - grad_norm = clip_grad_norm_(model.parameters(), clip) - # Must invoke scaler.update() if unscale_() is used in - # the iteration to avoid the following error: - # RuntimeError: unscale_() has already been called - # on this optimizer since the last update(). - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). 
- scaler.step(optimizer) - scaler.update() - else: - grad_norm = clip_grad_norm_(model.parameters(), clip) - if torch.isfinite(grad_norm): - optimizer.step() - optimizer.zero_grad() - scheduler.step() - self.step += 1 - if batch_idx % log_interval == 0: - lr = optimizer.param_groups[0]['lr'] - log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, - loss.item() * accum_grad) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'lr {:.8f} rank {}'.format(lr, rank) - logging.debug(log_str) - - def cv(self, model, data_loader, device, args): - ''' Cross validation on - ''' - model.eval() - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - log_interval = args.get('log_interval', 10) - # in order to avoid division by 0 - num_seen_utts = 1 - total_loss = 0.0 - with torch.no_grad(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - loss_dict = model(feats, feats_lengths, target, target_lengths) - loss = loss_dict['loss'] - if torch.isfinite(loss): - num_seen_utts += num_utts - total_loss += loss.item() * num_utts - if batch_idx % log_interval == 0: - log_str = 'CV Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, loss.item()) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'history loss {:.6f}'.format(total_loss / - num_seen_utts) - log_str += ' rank {}'.format(rank) - logging.debug(log_str) - return total_loss, num_seen_utts diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/file_utils.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/file_utils.py deleted file mode 100644 index 7b7e516cc61f759267f4ef09309ff0b45110a0c1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/file_utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re - - -def read_lists(list_file): - lists = [] - with open(list_file, 'r', encoding='utf8') as fin: - for line in fin: - lists.append(line.strip()) - return lists - - -def read_non_lang_symbols(non_lang_sym_path): - """read non-linguistic symbol from file. - - The file format is like below: - - {NOISE}\n - {BRK}\n - ... - - - Args: - non_lang_sym_path: non-linguistic symbol file path, None means no any - syms. 
- - """ - if non_lang_sym_path is None: - return None - else: - syms = read_lists(non_lang_sym_path) - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - for sym in syms: - if non_lang_syms_pattern.fullmatch(sym) is None: - class BadSymbolFormat(Exception): - pass - raise BadSymbolFormat( - "Non-linguistic symbols should be " - "formatted in {xxx}//[xxx], consider" - " modify '%s' to meet the requirment. " - "More details can be found in discussions here : " - "https://github.com/wenet-e2e/wenet/pull/819" % (sym)) - return syms - - -def read_symbol_table(symbol_table_file): - symbol_table = {} - with open(symbol_table_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - symbol_table[arr[0]] = int(arr[1]) - return symbol_table diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/init_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/init_model.py deleted file mode 100644 index 4a008183ee25cd88b2fa25d93bdc3f9e3a55d31a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/init_model.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -from wenet.transducer.joint import TransducerJoint -from wenet.transducer.predictor import (ConvPredictor, EmbeddingPredictor, - RNNPredictor) -from wenet.transducer.transducer import Transducer -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.cmvn import GlobalCMVN -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.encoder import ConformerEncoder, TransformerEncoder -from wenet.squeezeformer.encoder import SqueezeformerEncoder -from wenet.efficient_conformer.encoder import EfficientConformerEncoder -from wenet.utils.cmvn import load_cmvn - - -def init_model(configs): - if configs['cmvn_file'] is not None: - mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) - global_cmvn = GlobalCMVN( - torch.from_numpy(mean).float(), - torch.from_numpy(istd).float()) - else: - global_cmvn = None - - input_dim = configs['input_dim'] - vocab_size = configs['output_dim'] - - encoder_type = configs.get('encoder', 'conformer') - decoder_type = configs.get('decoder', 'bitransformer') - - if encoder_type == 'conformer': - encoder = ConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'squeezeformer': - encoder = SqueezeformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'efficientConformer': - encoder = EfficientConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf'], - **configs['encoder_conf']['efficient_conf'] - if 'efficient_conf' in - configs['encoder_conf'] else {}) - else: - encoder = TransformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - if decoder_type == 'transformer': - decoder = TransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - else: - assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 - assert configs['decoder_conf']['r_num_blocks'] > 0 - decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - ctc = CTC(vocab_size, encoder.output_size()) - - # Init joint CTC/Attention or Transducer model - if 'predictor' in configs: - predictor_type = configs.get('predictor', 'rnn') - if predictor_type == 'rnn': - predictor = RNNPredictor(vocab_size, **configs['predictor_conf']) - elif predictor_type == 'embedding': - predictor = EmbeddingPredictor(vocab_size, - **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - elif predictor_type == 'conv': - predictor = ConvPredictor(vocab_size, **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - else: - raise NotImplementedError( - "only rnn, embedding and conv type support now") - configs['joint_conf']['enc_output_size'] = configs['encoder_conf'][ - 'output_size'] - configs['joint_conf']['pred_output_size'] = configs['predictor_conf'][ - 'output_size'] - joint = TransducerJoint(vocab_size, **configs['joint_conf']) - model = Transducer(vocab_size=vocab_size, - blank=0, - predictor=predictor, - encoder=encoder, - attention_decoder=decoder, - joint=joint, - ctc=ctc, - **configs['model_conf']) - else: - model = ASRModel(vocab_size=vocab_size, - encoder=encoder, - decoder=decoder, - ctc=ctc, - **configs['model_conf']) - return model diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/mask.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/mask.py deleted file mode 100644 index 2985006ab2bc2d27a9b8adaeb863cc44ca6a0d24..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/mask.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -''' -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. - - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - ret = torch.ones(size, size, device=device, dtype=torch.bool) - return torch.tril(ret) -''' - -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. 
- - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - arange = torch.arange(size, device=device) - mask = arange.expand(size, size) - arange = arange.unsqueeze(-1) - mask = mask <= arange - return mask - - -def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size) with chunk size, - this is for streaming encoder - - Args: - size (int): size of mask - chunk_size (int): size of chunk - num_left_chunks (int): number of left chunks - <0: use full chunk - >=0: use num_left_chunks - device (torch.device): "cpu" or "cuda" or torch.Tensor.device - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_chunk_mask(4, 2) - [[1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]] - """ - ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) - ret[i, start:ending] = True - return ret - - -def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, - use_dynamic_chunk: bool, - use_dynamic_left_chunk: bool, - decoding_chunk_size: int, static_chunk_size: int, - num_decoding_left_chunks: int): - """ Apply optional mask for encoder. - - Args: - xs (torch.Tensor): padded input, (B, L, D), L for max length - mask (torch.Tensor): mask for xs, (B, 1, L) - use_dynamic_chunk (bool): whether to use dynamic chunk or not - use_dynamic_left_chunk (bool): whether to use dynamic left chunk for - training. - decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - static_chunk_size (int): chunk size for static chunk training/decoding - if it's greater than 0, if use_dynamic_chunk is true, - this parameter will be ignored - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. - >=0: use num_decoding_left_chunks - <0: use all left chunks - - Returns: - torch.Tensor: chunk mask of the input xs. - """ - # Whether to use chunk mask or not - if use_dynamic_chunk: - max_len = xs.size(1) - if decoding_chunk_size < 0: - chunk_size = max_len - num_left_chunks = -1 - elif decoding_chunk_size > 0: - chunk_size = decoding_chunk_size - num_left_chunks = num_decoding_left_chunks - else: - # chunk size is either [1, 25] or full context(max_len). - # Since we use 4 times subsampling and allow up to 1s(100 frames) - # delay, the maximum frame is 100 / 4 = 25. 
- chunk_size = torch.randint(1, max_len, (1, )).item() - num_left_chunks = -1 - if chunk_size > max_len // 2: - chunk_size = max_len - else: - chunk_size = chunk_size % 25 + 1 - if use_dynamic_left_chunk: - max_left_chunks = (max_len - 1) // chunk_size - num_left_chunks = torch.randint(0, max_left_chunks, - (1, )).item() - chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - elif static_chunk_size > 0: - num_left_chunks = num_decoding_left_chunks - chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - else: - chunk_masks = masks - return chunk_masks - - -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: Mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - batch_size = lengths.size(0) - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = torch.arange(0, - max_len, - dtype=torch.int64, - device=lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand - return mask - - -def make_non_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """Make mask tensor containing indices of non-padded part. - - The sequences in a batch may have different lengths. To enable - batch computing, padding is need to make all sequence in same - size. To avoid the padding part pass value to context dependent - block such as attention or convolution , this padding part is - masked. - - This pad_mask is used in both encoder and decoder. - - 1 for non-padded part and 0 for padded part. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] - """ - return ~make_pad_mask(lengths) - - -def mask_finished_scores(score: torch.Tensor, - flag: torch.Tensor) -> torch.Tensor: - """ - If a sequence is finished, we only allow one alive branch. This function - aims to give one branch a zero score and the rest -inf score. - - Args: - score (torch.Tensor): A real value array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size, beam_size). 
- """ - beam_size = score.size(-1) - zero_mask = torch.zeros_like(flag, dtype=torch.bool) - if beam_size > 1: - unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])), - dim=1) - finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])), - dim=1) - else: - unfinished = zero_mask - finished = flag - score.masked_fill_(unfinished, -float('inf')) - score.masked_fill_(finished, 0) - return score - - -def mask_finished_preds(pred: torch.Tensor, flag: torch.Tensor, - eos: int) -> torch.Tensor: - """ - If a sequence is finished, all of its branch should be - - Args: - pred (torch.Tensor): A int array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size). - """ - beam_size = pred.size(-1) - finished = flag.repeat([1, beam_size]) - return pred.masked_fill_(finished, eos) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/scheduler.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/scheduler.py deleted file mode 100644 index c418a731dec0041a238787bbba23102dba8db5e5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/chime4/s0/wenet/utils/scheduler.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -# NeMo(https://github.com/NVIDIA/NeMo) - -from typing import Union - -import math -import warnings -import torch -from torch.optim.lr_scheduler import _LRScheduler - -from typeguard import check_argument_types - - -class WarmupLR(_LRScheduler): - """The WarmupLR scheduler - - This scheduler is almost same as NoamLR Scheduler except for following - difference: - - NoamLR: - lr = optimizer.lr * model_size ** -0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - WarmupLR: - lr = optimizer.lr * warmup_step ** 0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - - Note that the maximum lr equals to optimizer.lr in this scheduler. 
- - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_steps: Union[int, float] = 25000, - last_epoch: int = -1, - ): - assert check_argument_types() - self.warmup_steps = warmup_steps - - # __init__() must be invoked before setting field - # because step() is also invoked in __init__() - super().__init__(optimizer, last_epoch) - - def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" - - def get_lr(self): - step_num = self.last_epoch + 1 - if self.warmup_steps == 0: - return [ - lr * step_num ** -0.5 - for lr in self.base_lrs - ] - else: - return [ - lr - * self.warmup_steps ** 0.5 - * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) - for lr in self.base_lrs - ] - - def set_step(self, step: int): - self.last_epoch = step - - -class WarmupPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1): - assert not (warmup_steps is not None and warmup_ratio is not None),\ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class SquareRootConstantPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use particular number of step or ratio" - assert constant_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. 
- self.max_steps = max_steps - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.constant_lr = 1 / (constant_steps ** 0.5) - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.constant_steps: - return [self.constant_lr for _ in self.base_lrs] - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class WarmupHoldPolicy(WarmupPolicy): - """Variant of WarmupPolicy which maintains high - learning rate for a defined number of steps. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - hold_steps=None, - hold_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (hold_steps is not None and hold_ratio is not None), \ - "Either use particular number of step or ratio" - assert hold_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - self.min_lr = min_lr - self._last_warmup_lr = 0.0 - - # Necessary to duplicate as class attributes are hidden in inner class - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if hold_steps is not None: - self.hold_steps = hold_steps + self.warmup_steps - elif hold_ratio is not None: - self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps - else: - self.hold_steps = 0 - - super().__init__( - optimizer, - warmup_steps=warmup_steps, - warmup_ratio=warmup_ratio, - max_steps=max_steps, - last_epoch=last_epoch, - min_lr=min_lr, - ) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed by the scheduler," - " " "please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup phase - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - # Hold phase - if (step >= self.warmup_steps) and (step < self.hold_steps): - return self.base_lrs - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - -class WarmupAnnealHoldPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - min_lr: Minimum lr to hold the learning rate after decay at. - constant_steps: Number of steps to keep lr constant at. 
- constant_ratio: Ratio of steps to keep lr constant. - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - constant_steps=None, - constant_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use constant_steps or constant_ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup steps - if self.warmup_steps > 0 and step <= self.warmup_steps: - return self._get_warmup_lr(step) - - # Constant steps after warmup and decay - if self.constant_steps > 0 and ( - self.warmup_steps + self.decay_steps) < step <= self.max_steps: - return self._get_constant_lr(step) - - # Min lr after max steps of updates - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_constant_lr(self, step): - return [self.min_lr for _ in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -def _squareroot_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 0.5 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _square_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 2 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _cosine_annealing(initial_lr, step, max_steps, min_lr): - mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) - out_lr = (initial_lr - min_lr) * mult + min_lr - return out_lr - - -def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, - decay_steps, min_lr): - assert max_lr > min_lr - # Use linear warmup for the initial part. - if warmup_steps > 0 and step <= warmup_steps: - return max_lr * float(step) / float(warmup_steps) - - # For any steps larger than `decay_steps`, use `min_lr`. - if step > warmup_steps + decay_steps: - return min_lr - - # If we are done with the warmup period, use the decay style. 
- num_steps_ = step - warmup_steps - decay_steps_ = decay_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = max_lr - min_lr - - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - - return min_lr + coeff * delta_lr - - -def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): - if cycle: - multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) - decay_steps *= multiplier - else: - step = min(step, decay_steps) - p = step / decay_steps - lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) - lr += min_lr - return lr - - -def _noam_hold_annealing(initial_lr, step, warmup_steps, - hold_steps, decay_rate, min_lr): - # hold_steps = total number of steps - # to hold the LR, not the warmup + hold steps. - T_warmup_decay = max(1, warmup_steps ** decay_rate) - T_hold_decay = max(1, (step - hold_steps) ** decay_rate) - lr = (initial_lr * T_warmup_decay) / T_hold_decay - lr = max(lr, min_lr) - return lr - - -class SquareAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _square_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class SquareRootAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _squareroot_annealing(initial_lr=initial_lr, step=step, - max_steps=self.max_steps, min_lr=self.min_lr) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - if self.constant_steps is None or self.constant_steps == 0: - new_lrs = [ - _cosine_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - else: - new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) - return new_lrs - - def _get_warmup_lr(self, step): - if self.constant_steps is None or self.constant_steps == 0: - return super()._get_warmup_lr(step) - else: - # Use linear warmup for the initial part. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_constant_lr(self, step): - # Only called when `constant_steps` > 0. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_linear_warmup_with_cosine_annealing_lr(self, step): - # Cosine Schedule for Megatron LM, - # slightly different warmup schedule + constant LR at the end. 
- new_lrs = [ - _linear_warmup_with_cosine_annealing( - max_lr=self.base_lrs[0], - warmup_steps=self.warmup_steps, - step=step, - decay_steps=self.decay_steps, - min_lr=self.min_lr, - ) - for _ in self.base_lrs - ] - return new_lrs - - -class NoamAnnealing(_LRScheduler): - def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - self._normalize = d_model ** (-0.5) - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = max(1, self.last_epoch) - - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - new_lrs = [self._noam_annealing(initial_lr=initial_lr, step=step) for - initial_lr in self.base_lrs] - return new_lrs - - def _noam_annealing(self, initial_lr, step): - if self.warmup_steps > 0: - mult = self._normalize * min(step ** (-0.5), - step * (self.warmup_steps ** (-1.5))) - else: - mult = self._normalize * step ** (-0.5) - - out_lr = initial_lr * mult - if step > self.warmup_steps: - out_lr = max(out_lr, self.min_lr) - return out_lr - - -class NoamHoldAnnealing(WarmupHoldPolicy): - def __init__(self, optimizer, *, max_steps, decay_rate=0.5, min_lr=0.0, - last_epoch=-1, **kwargs): - """ - From Nemo: - Implementation of the Noam Hold Annealing policy - from the SqueezeFormer paper. - - Unlike NoamAnnealing, the peak learning rate - can be explicitly set for this scheduler. - The schedule first performs linear warmup, - then holds the peak LR, then decays with some schedule for - the remainder of the steps. - Therefore the min-lr is still dependent - on the hyper parameters selected. - - It's schedule is determined by three factors- - - Warmup Steps: Initial stage, where linear warmup - occurs uptil the peak LR is reached. Unlike NoamAnnealing, - the peak LR is explicitly stated here instead of a scaling factor. - - Hold Steps: Intermediate stage, where the peak LR - is maintained for some number of steps. In this region, - the high peak LR allows the model to converge faster - if training is stable. However the high LR - may also cause instability during training. - Should usually be a significant fraction of training - steps (around 30-40% of the entire training steps). - - Decay Steps: Final stage, where the LR rapidly decays - with some scaling rate (set by decay rate). - To attain Noam decay, use 0.5, - for Squeezeformer recommended decay, use 1.0. - The fast decay after prolonged high LR during - hold phase allows for rapid convergence. 
- - References: - - [Squeezeformer: - An Efficient Transformer for Automatic Speech Recognition] - (https://arxiv.org/abs/2206.00888) - - Args: - optimizer: Pytorch compatible Optimizer object. - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - decay_rate: Float value describing the polynomial decay - after the hold period. Default value - of 0.5 corresponds to Noam decay. - min_lr: Minimum learning rate. - """ - self.decay_rate = decay_rate - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - if self.warmup_steps is None or self.warmup_steps == 0: - raise ValueError( - "Noam scheduler cannot be used without warmup steps") - - if self.hold_steps > 0: - hold_steps = self.hold_steps - self.warmup_steps - else: - hold_steps = 0 - - new_lrs = [ - _noam_hold_annealing( - initial_lr, - step=step, - warmup_steps=self.warmup_steps, - hold_steps=hold_steps, - decay_rate=self.decay_rate, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - def set_step(self, step: int): - self.last_epoch = step diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/README.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/README.md deleted file mode 100644 index 853415bf35645f6effe6bc2ee7b3fdd7854eeabe..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# Performance Record -# Should be installed ffmpeg , pandas !!! 
-## Conformer Result - -* Feature info: dither + specaug + speed perturb -* Training info: lr 0.0005, warmup_steps 20000 batch size 8, 3 gpu, 30 epochs -* Decoding info: average_num 20 - - - -| decoding mode | test (wer) | -| :--------------------: | :---------: | -| ctc_greedy_search | 16.12% | -| ctc_prefix_beam_search | 16.07% | -| attention | 13.56% | -| attention_rescoring | 14.01% | \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/conf/train_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/conf/train_conformer.yaml deleted file mode 100644 index 1e20f58224e4b4307d4dc2c24ef96adf2c30c4a8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/conf/train_conformer.yaml +++ /dev/null @@ -1,78 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 512 # dimension of attention - attention_heads: 8 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 8 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -dataset_conf: - split_with_space: true - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 40 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'dynamic' # static or dynamic - batch_size: 8 - -grad_clip: 10 -accum_grad: 4 -max_epoch: 30 -log_interval: 200 - -optim: adam -optim_conf: - lr: 0.0005 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 20000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/local/create_scp_text.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/local/create_scp_text.py deleted file mode 100644 index b3d94276e4ca58e3e0f5bf676671cf81b51fec15..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/local/create_scp_text.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import sys -import os -import re -def process(src_str): - punc = '~`!#$%^&*()_+-=|\';":/.,?><~·!@#¥%……&*()——+-=“:’;、。,?》《{}' - return re.sub(r"[{0}]+".format(punc), "", src_str).upper() - -if __name__ == '__main__': - src_dir = sys.argv[1] - tsv_file = src_dir + "/" + sys.argv[2] + ".tsv" - output_dir = sys.argv[3] - for file_path in os.listdir(src_dir + "/clips"): - 
if(os.path.exists(src_dir + "/wavs/" + file_path.split('.')[0] + ".wav")): - continue - t_str = src_dir + "/clips/" + file_path - tt_str = src_dir + "/wavs/" + file_path.split('.')[0] + ".wav" - os.system("ffmpeg -i {0} -ac 1 -ar 16000 -f wav {1}".format(t_str, tt_str)) - import pandas - tsv_content = pandas.read_csv(tsv_file, sep="\t") - path_list = tsv_content["path"] - sentence = tsv_content["sentence"] - client_list = tsv_content["client_id"] - scp_file = open(output_dir + "/wav.scp", "w") - text_file = open(output_dir + "/text", "w") - utt2spk = open(output_dir + "/utt2spk", "w") - for i in range(len(path_list)): - temple_str = path_list[i].split(".")[0] - now_sentence = process(sentence[i]) - wav_file = src_dir + "/wavs/" + temple_str + ".wav" - scp_file.writelines(temple_str + " " + wav_file + "\n") - text_file.writelines(temple_str + " " + now_sentence + "\n") - utt2spk.writelines(temple_str + " " + client_list[i] + "\n") - scp_file.close() - text_file.close() - utt2spk.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/local/download_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/local/download_data.sh deleted file mode 100644 index 1dc1914a59311c426280a4308e5a4d5a476fb6ec..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/local/download_data.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env bash -if [ $# -le 1 ]; then - echo "Args_Error:Two parameters are required." - exit 1; -fi -download_path=$1 -data_France=$2 -wget -O ${download_path}/tmp.zip https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/cv-corpus-8.0-2022-01-19/cv-corpus-8.0-2022-01-19-fr.tar.gz -tar -xvf ${download_path}/tmp.zip -C ${data_France} -rm -rf ${download_path}/tmp.zip \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/local/prepare_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/local/prepare_data.sh deleted file mode 100644 index 5e561a556b32b7482d87a3c387caa03a8b8e6878..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/local/prepare_data.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash -if [ $# -le 0 ]; then - echo "Argument should be France src directory, see ../run.sh for example." - exit 1; -fi -dir=`pwd`/data -local=`pwd`/local -src_path=$1 -if [ ! -d ${dir} ]; then - mkdir ${dir} - else - rm -rf ${dir} - mkdir ${dir} -fi - -for x in train dev test; do - if [ ! ${dir}/${x} ]; then - mkdir ${dir}/${x} - else - rm -rf ${dir}/${x} - mkdir ${dir}/${x} - fi -done - -if [ ! -d ${src_path}/wavs ]; then - mkdir ${src_path}/wavs -fi -for x in train dev test; do - python3 ${local}/create_scp_text.py ${src_path} ${x} ${dir}/${x} -done diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/path.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/path.sh deleted file mode 100644 index 73fc1c56602086182f66201870e28d46a0cada55..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export WENET_DIR=$PWD/../../.. 
-export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build -export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix -export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/run.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/run.sh deleted file mode 100644 index 5ca76d43060d36693cbe156b6d0f80ff4298002b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/run.sh +++ /dev/null @@ -1,244 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -# Use this to control how many gpu you use, It's 1-gpu training if you specify -# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0,1,2" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO -stage=0 # start from 0 if you need to start from data download -stop_stage=2 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training -num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 -# data -download_path=/root/autodl-tmp -french_data=/root/autodl-tmp/cv-corpus-8.0-2022-01-19 -# path to save preproecssed data -# export data=data -. ./path.sh -. ./tools/parse_options.sh || exit 1 - -nj=16 - -# data_type can be `raw` or `shard`. Typically, raw is used for small dataset, -# `shard` is used for large dataset which is over 1k hours, and `shard` is -# faster on reading data and training. -data_type=raw -num_utts_per_shard=1000 - -train_set=train -# Optional train_config -# 1. conf/train_transformer.yaml: Standard transformer -# 2. conf/train_conformer.yaml: Standard conformer -# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer -# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer -# 5. conf/train_conformer_no_pos.yaml: Conformer without relative positional encoding -# 6. conf/train_u2++_conformer.yaml: U2++ conformer -# 7. conf/train_u2++_transformer.yaml: U2++ transformer -train_config=conf/train_conformer.yaml -cmvn=true -dir=exp/conformer -checkpoint= -nbpe=5000 - -# use average_checkpoint will get better result -average_checkpoint=true -decode_checkpoint=$dir/final.pt -average_num=20 -#decode_modes="ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring" -decode_modes="attention attention_rescoring" - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - - echo "stage -1: Data download" - echo "download Dataset!" 
- local/download_data.sh ${download_path} ${french_data} - echo "Finish stage 0" -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - - echo "stage 0: Data preparation" - local/prepare_data.sh ${french_data}/fr - echo "Finish stage 0" -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - echo "stage 1: compute global cmvn" - # compute cmvn - python tools/compute_cmvn_stats.py --num_workers 1 --train_config $train_config \ - --in_scp data/${train_set}/wav.scp \ - --out_cmvn data/${train_set}/global_cmvn - echo "Finish stage 1" -fi - - -bpemode=unigram -dict=data/lang_char_/${train_set}_${bpemode}${nbpe}_units.txt -bpemodel=data/lang_char_/${train_set}_${bpemode}${nbpe} -echo "dictionary: ${dict}" -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - ### Task dependent. You have to check non-linguistic symbols used in the corpus. - echo "stage 2: Dictionary and Json Data Preparation" - mkdir -p data/lang_char_/ - echo " 0" > ${dict} # 0 will be used for "blank" in CTC - echo " 1" >> ${dict} # must be 1 - - # we borrowed these code and scripts which are related bpe from ESPnet. - cut -f 2- -d" " data/${train_set}/text > data/lang_char_/input.txt - tools/spm_train --input=data/lang_char_/input.txt --vocab_size=${nbpe} \ - --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 - tools/spm_encode --model=${bpemodel}.model --output_format=piece \ - < data/lang_char_/input.txt | \ - tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict} - num_token=$(cat $dict | wc -l) - echo " $num_token" >> $dict # - wc -l ${dict} -fi - - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - echo "stage 3: Prepare data, prepare required format" - for x in dev test ${train_set}; do - if [ $data_type == "shard" ]; then - python tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \ - --num_threads 16 data/$x/wav.scp data/$x/text \ - $(realpath data/$x/shards) data/$x/data.list - else - python tools/make_raw_list.py data/$x/wav.scp data/$x/text \ - data/$x/data.list - fi - done - echo "Finish stage 3" -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - mkdir -p $dir - # You have to rm `INIT_FILE` manually when you resume or restart a - # multi-machine training. - INIT_FILE=$dir/ddp_init - init_method=file://$(readlink -f $INIT_FILE) - echo "$0: init method is $init_method" - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - # Use "nccl" if it works, otherwise use "gloo" - dist_backend="gloo" - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" - cmvn_opts= - $cmvn && cp data/${train_set}/global_cmvn $dir - $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" - - # train.py rewrite $train_config to $dir/train.yaml with model input - # and output dimension, and $dir/train.yaml will be used for inference - # and export. - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type $data_type \ - --symbol_table $dict \ - --bpe_model $bpemodel.model \ - --train_data data/$train_set/data.list \ - --cv_data data/dev/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ - --ddp.dist_backend $dist_backend \ - --num_workers 1 \ - $cmvn_opts \ - --pin_memory - } & - done - wait -fi - -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # Test model, please specify the model you want to test by --checkpoint - cmvn_opts= - $cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn" - # TODO, Add model average here - mkdir -p $dir/test - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$dir/avg_${average_num}.pt - echo "do model average and final checkpoint is $decode_checkpoint" - python wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $dir \ - --num ${average_num} \ - --val_best - fi - # Specify decoding_chunk_size if it's a unified dynamic chunk trained model - # -1 for full chunk - decoding_chunk_size= - ctc_weight=0.5 - # Polling GPU id begin with index 0 - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - idx=0 - for mode in ${decode_modes}; do - { - { - test_dir=$dir/test_${mode} - mkdir -p $test_dir - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1]) - python wenet/bin/recognize.py --gpu 0 \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type "raw" \ - --bpe_model $bpemodel.model \ - --test_data data/test/data.list \ - --checkpoint $decode_checkpoint \ - --beam_size 20 \ - --batch_size 1 \ - --penalty 0.0 \ - --dict $dict \ - --result_file $test_dir/text_bpe \ - --ctc_weight $ctc_weight \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - - cut -f2- -d " " $test_dir/text_bpe > $test_dir/text_bpe_value_tmp - cut -f1 -d " " $test_dir/text_bpe > $test_dir/text_bpe_key_tmp - - tools/spm_decode --model=${bpemodel}.model --input_format=piece \ - < $test_dir/text_bpe_value_tmp | sed -e "s/▁/ /g" > $test_dir/text_value - #sed -e "s/▁/ /g" $test_dir/text_bpe_value_tmp > $test_dir/text_value - paste -d " " $test_dir/text_bpe_key_tmp $test_dir/text_value > $test_dir/text - # a raw version wer without refining processs - python tools/compute-wer.py --char=1 --v=1 \ - data/test/text $test_dir/text > $test_dir/wer - } & - - ((idx+=1)) - if [ $idx -eq $num_gpus ]; then - idx=0 - fi - } - done - - wait -fi - -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # Export the best model you want - python wenet/bin/export_jit.py \ - --config $dir/train.yaml \ - --checkpoint $dir/avg_${average_num}.pt \ - --output_file $dir/final.zip -fi - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/alignment.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/alignment.sh deleted file mode 100644 index 64d860bb61761cadca750c9baf91eddb49e56728..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/alignment.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. 
./path.sh || exit 1; - -stage=0 # start from 0 if you need to start from data preparation -stop_stage=0 - -nj=16 -feat_dir=raw_wav -dict=data/dict/lang_char.txt - -dir=exp/ -config=$dir/train.yaml -checkpoint= -checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt -config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml -set= -ali_format=$feat_dir/$set/format.data -ali_format=format.data -ali_result=$dir/ali - -. tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - nj=32 - # Prepare required data for ctc alignment - echo "Prepare data, prepare required format" - for x in $set; do - tools/format_data.sh --nj ${nj} \ - --feat-type wav --feat $feat_dir/$x/wav.scp \ - $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp - - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Test model, please specify the model you want to use by --checkpoint - python wenet/bin/alignment_deprecated.py --gpu -1 \ - --config $config \ - --input_file $ali_format \ - --checkpoint $checkpoint \ - --batch_size 1 \ - --dict $dict \ - --result_file $ali_result \ - -fi - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/analyze_dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/analyze_dataset.py deleted file mode 100644 index d4373b065c301972fe0164b6df3591166000acfc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/analyze_dataset.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Analyze Dataset, Duration/TextLength/Speed etc. - -Usage: -. 
./path.sh && python3 tools/analyze_dataset.py \ - --data_type "shard" \ - --data_list data/test/data.list \ - --output_dir exp/analyze_test \ - --num_thread 32 -""" - -import os -import json -import math -import time -import numpy -import logging -import librosa -import tarfile -import argparse -import torchaudio -import multiprocessing - -from wenet.utils.file_utils import read_lists -from wenet.dataset.processor import AUDIO_FORMAT_SETS - - -def get_args(): - parser = argparse.ArgumentParser(description='Analyze dataset') - parser.add_argument('--data_type', - default='wav_scp', - choices=['wav_scp', 'raw', 'shard'], - help='dataset type') - parser.add_argument('--output_dir', type=str, - default="exp", help='write info to output dir') - parser.add_argument('--data_list', default=None, - help='used in raw/shard mode') - parser.add_argument('--wav_scp', default=None, - help='used in wav_scp mode') - parser.add_argument('--text', default=None, - help='used in wav_scp mode') - parser.add_argument('--num_thread', type=int, - default=4, help='number of threads') - args = parser.parse_args() - print(args) - return args - - -def analyze(datas, output_file, thread_id): - with open(output_file, "w", encoding='utf8') as f: - for i, data in enumerate(datas): - if type(data['wav']) is numpy.ndarray: - y, sample_rate = data['wav'], data['sample_rate'] - data['wav'] = "None" # NOTE(xcsong): Do not save wav. - elif type(data['wav'] is str): - y, sample_rate = librosa.load(data['wav'], sr=16000) - data['dur'] = len(y) / sample_rate - data['txt_length'] = len(data['txt']) - data['speed'] = data['txt_length'] / data['dur'] - # Trim the beginning and ending silence - _, index = librosa.effects.trim(y, top_db=30) - data['leading_sil'] = librosa.get_duration( - y=y[:index[0]], sr=16000) * 1000 if index[0] > 0 else 0 - data['trailing_sil'] = librosa.get_duration( - y=y[index[1]:], sr=16000) * 1000 if index[1] < len(y) else 0 - data_str = json.dumps(data, ensure_ascii=False) - f.write("{}\n".format(data_str)) - if thread_id == 0 and i % 100 == 0: - logging.info("\tThread-{}: processed {}/{}".format( - thread_id, i, len(datas))) - - -def read_tar(file): - try: - with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream: - prev_prefix = None - data = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - data['key'] = prev_prefix - if valid: - yield data - data = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - data['txt'] = file_obj.read().decode( - 'utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load( - file_obj) - # single channel - data['wav'] = waveform.numpy()[0, :] - data['sample_rate'] = sample_rate - else: - data[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning( - 'error: {} when parse {}'.format(ex, name)) - prev_prefix = prefix - # The last data in tar - if prev_prefix is not None: - data['key'] = prev_prefix - yield data - except Exception as ex: - logging.warning( - 'tar_file error: {} when processing {}'.format(ex, file)) - - -def main(): - start_time = time.time() - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.makedirs(args.output_dir, exist_ok=True) - os.makedirs(args.output_dir + "/partition", exist_ok=True) - datas = [[] for i in 
range(args.num_thread)] - - logging.info("Stage-1: Loading data.list OR wav.scp...") - if args.data_type == "shard": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - total = 0 - for line in lists: - for data in read_tar(line): - datas[total % args.num_thread].append(data) - total = total + 1 - elif args.data_type == "raw": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - for i, line in enumerate(lists): - data = json.loads(line) - datas[i % args.num_thread].append(data) - elif args.data_type == "wav_scp": - assert args.wav_scp is not None - assert args.text is not None - wavs, texts = {}, {} - # wavs - for line in read_lists(args.wav_scp): - line = line.strip().split() - wavs[line[0]] = line[1] - # texts - for line in read_lists(args.text): - line = line.strip().split(maxsplit=1) - texts[line[0]] = line[1] - sorted(wavs) - sorted(texts) - # partition - for i, (key1, key2) in enumerate(zip(wavs, texts)): - assert key1 == key2 - datas[i % args.num_thread].append( - {'key': key1, "wav": wavs[key1], "txt": texts[key1]} - ) - - logging.info("Stage-2: Start Analyze") - # threads - pool = multiprocessing.Pool(processes=args.num_thread) - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - pool.apply_async(analyze, (datas[i], output_file, i)) - pool.close() - pool.join() - - logging.info("Stage-3: Sort and Write Result") - datas = [] - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - with open(output_file, "r", encoding='utf8') as f: - for line in f.readlines(): - data = json.loads(line) - datas.append(data) - total_dur = sum([x['dur'] for x in datas]) - total_len = sum([x['txt_length'] for x in datas]) - total_leading_sil = sum([x['leading_sil'] for x in datas]) - total_trailing_sil = sum([x['trailing_sil'] for x in datas]) - num_datas = len(datas) - names = ['key', 'dur', 'txt_length', 'speed', - 'leading_sil', 'trailing_sil'] - units = ['', 's', '', 'char/s', 'ms', 'ms'] - avgs = [0, total_dur / num_datas, total_len / num_datas, - total_len / total_dur, total_leading_sil / num_datas, - total_trailing_sil / num_datas] - stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]), - sum([(x['txt_length'] - avgs[2])**2 for x in datas]), - sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]), - sum([(x['leading_sil'] - avgs[4])**2 for x in datas]), - sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])] - stds = [math.sqrt(x / num_datas) for x in stds] - parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min'] - index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - - with open(args.output_dir + "/analyze_result_brief", - "w", encoding='utf8') as f: - for i, (name, unit, avg, std) in enumerate( - zip(names, units, avgs, stds)): - if name == 'key': - continue - f.write("==================\n") - - datas.sort(key=lambda x: x[name]) - for p, j in zip(parts, index): - f.write("{} {}: {:.3f} {} (wav_id: {})\n".format( - p, name, datas[j][name], unit, datas[j]['key'])) - f.write("avg {}: {:.3f} {}\n".format( - name, avg, unit)) - f.write("std {}: {:.3f}\n".format( - name, std)) - os.system("cat {}".format(args.output_dir + "/analyze_result_brief")) - - datas.sort(key=lambda x: x['dur']) - with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f: - for data in datas: - f.write("{}\n".format(json.dumps(data, 
ensure_ascii=False))) - - end_time = time.time() - logging.info("Time Cost: {:.3f}s".format(end_time - start_time)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/cmvn_kaldi2json.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/cmvn_kaldi2json.py deleted file mode 100644 index 9966046c95a9d50438c4857b785cb7985182e376..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/cmvn_kaldi2json.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import logging -import sys -import json - -def kaldi2json(kaldi_cmvn_file): - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - cmvn_info = {'mean_stat:' : means, - 'var_stat' : variance, - 'frame_num' : count} - return cmvn_info - -if __name__ == '__main__': - with open(sys.argv[2], 'w') as fout: - cmvn = kaldi2json(sys.argv[1]) - fout.write(json.dumps(cmvn)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/combine_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/combine_data.sh deleted file mode 100644 index 8a56c43f1a2a238d78270f94f3d22f1af540e912..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/combine_data.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2014 David Snyder - -# This script combines the data from multiple source directories into -# a single destination directory. - -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information -# about what these directories contain. - -# Begin configuration section. -extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." -skip_fix=false # skip the fix_data_dir.sh in the end -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi - -if [ $# -lt 2 ]; then - echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." - echo "Note, files that don't appear in all source dirs will not be combined," - echo "with the exception of utt2uniq and segments, which are created where necessary." - exit 1 -fi - -dest=$1; -shift; - -first_src=$1; - -rm -r $dest 2>/dev/null -mkdir -p $dest; - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/utt2spk ]; then - echo "$0: no such file $dir/utt2spk" - exit 1; - fi -done - -# Check that frame_shift are compatible, where present together with features. -dir_with_frame_shift= -for dir in $*; do - if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then - if [[ $dir_with_frame_shift ]] && - ! 
cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then - echo "$0:error: different frame_shift in directories $dir and " \ - "$dir_with_frame_shift. Cannot combine features." - exit 1; - fi - dir_with_frame_shift=$dir - fi -done - -# W.r.t. utt2uniq file the script has different behavior compared to other files -# it is not compulsary for it to exist in src directories, but if it exists in -# even one it should exist in all. We will create the files where necessary -has_utt2uniq=false -for in_dir in $*; do - if [ -f $in_dir/utt2uniq ]; then - has_utt2uniq=true - break - fi -done - -if $has_utt2uniq; then - # we are going to create an utt2uniq file in the destdir - for in_dir in $*; do - if [ ! -f $in_dir/utt2uniq ]; then - # we assume that utt2uniq is a one to one mapping - cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' - else - cat $in_dir/utt2uniq - fi - done | sort -k1 > $dest/utt2uniq - echo "$0: combined utt2uniq" -else - echo "$0 [info]: not combining utt2uniq as it does not exist" -fi -# some of the old scripts might provide utt2uniq as an extrafile, so just remove it -extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") - -# segments are treated similarly to utt2uniq. If it exists in some, but not all -# src directories, then we generate segments where necessary. -has_segments=false -for in_dir in $*; do - if [ -f $in_dir/segments ]; then - has_segments=true - break - fi -done - -if $has_segments; then - for in_dir in $*; do - if [ ! -f $in_dir/segments ]; then - echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 - utils/data/get_segments_for_data.sh $in_dir - else - cat $in_dir/segments - fi - done | sort -k1 > $dest/segments - echo "$0: combined segments" -else - echo "$0 [info]: not combining segments as it does not exist" -fi - -for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do - exists_somewhere=false - absent_somewhere=false - for d in $*; do - if [ -f $d/$file ]; then - exists_somewhere=true - else - absent_somewhere=true - fi - done - - if ! $absent_somewhere; then - set -o pipefail - ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; - set +o pipefail - echo "$0: combined $file" - else - if ! $exists_somewhere; then - echo "$0 [info]: not combining $file as it does not exist" - else - echo "$0 [info]: **not combining $file as it does not exist everywhere**" - fi - fi -done - -tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt - -if [[ $dir_with_frame_shift ]]; then - cp $dir_with_frame_shift/frame_shift $dest -fi - -if ! 
$skip_fix ; then - tools/fix_data_dir.sh $dest || exit 1; -fi - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/compute-cer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/compute-cer.py deleted file mode 100644 index a0a8f8fe1f59251c5d8fefeb62ef469276fc6063..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/compute-cer.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import sys -import unicodedata -import codecs - -remove_tag = True -spacelist = [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - # https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. - sep = ' ' - if char == '<': - sep = '>' - j = i + 1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c == sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: - return '' - chars = [] - i = 0 - T = len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - if x.isalnum(): - for k in x: - new_sentence.append(k) - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i - 1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - 
min_dist = dist - min_error = error - dist = self.space[i][j - 1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, - 'ins': 0, 'del': 0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i={i} , j={j} , \ - error={error}'. 
- format(i=i, j=j, error=self.space[i][j]['error'])) - return result - - def overall(self) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def cluster(self, data) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [unicodedata.name(char) for char in word] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names) - 1) : - if unicode_names[i] != unicode_names[i + 1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) \ - and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] \ - [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \ - [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose = 1 - padding_symbol = ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose = 0 - try: - verbose = int(b) - except Exception: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol = ' ' - elif b == 'underline': - padding_symbol = '_' - continue - if True or sys.argv[1].startswith('-'): - # ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig = set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, - case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : 
- if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array) == 0: - continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length - len_lab) - space['rec'].append(length - len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end=' ') - else: - print('lab:', end=' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['lab'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end=' ') - else: - print('rec:', end=' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['rec'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===================================================' - '========================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster(k for k in default_clusters[cluster_id]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 
100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif (token[0] == '<' and token[len(token) - 1] == '>' and - cluster_id == ''): - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('=======================================' - '====================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/compute-wer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/compute-wer.py deleted file mode 100644 index a3eefc0dc7b67f252e685da71a5189312e74ef85..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/compute-wer.py +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import re, sys, unicodedata -import codecs - -remove_tag = True -spacelist= [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - #https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - 
j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array)==0: continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) 
- else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===========================================================================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster([ k for k in default_clusters[cluster_id] ]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - 
print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif token[0] == '<' and token[len(token)-1] == '>' and \ - cluster_id == '' : - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('===========================================================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/compute_cmvn_stats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/compute_cmvn_stats.py deleted file mode 100644 index 9c89789c47be0c855939469e86040f10398e9d89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/compute_cmvn_stats.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys -import argparse -import json -import codecs -import yaml - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.utils.data import Dataset, DataLoader - -torchaudio.set_audio_backend("sox_io") - - -class CollateFunc(object): - ''' Collate function for AudioDataset - ''' - - def __init__(self, feat_dim, resample_rate): - self.feat_dim = feat_dim - self.resample_rate = resample_rate - pass - - def __call__(self, batch): - mean_stat = torch.zeros(self.feat_dim) - var_stat = torch.zeros(self.feat_dim) - number = 0 - for item in batch: - value = item[1].strip().split(",") - assert len(value) == 3 or len(value) == 1 - wav_path = value[0] - sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate - resample_rate = sample_rate - # len(value) == 3 means segmented wav.scp, - # len(value) == 1 means original wav.scp - if len(value) == 3: - start_frame = int(float(value[1]) * sample_rate) - end_frame = int(float(value[2]) * sample_rate) - waveform, sample_rate = torchaudio.backend.sox_io_backend.load( - filepath=wav_path, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(item[1]) - - waveform = waveform * (1 << 15) - if self.resample_rate != 0 and self.resample_rate != sample_rate: - resample_rate = self.resample_rate - waveform = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - - mat = kaldi.fbank(waveform, - num_mel_bins=self.feat_dim, - dither=0.0, - energy_floor=0.0, - sample_frequency=resample_rate) - mean_stat += torch.sum(mat, axis=0) - var_stat += torch.sum(torch.square(mat), axis=0) - number += mat.shape[0] - return number, mean_stat, var_stat - - -class AudioDataset(Dataset): - def __init__(self, data_file): - self.items = [] - with codecs.open(data_file, 'r', encoding='utf-8') as f: - for line in f: - arr = line.strip().split() - self.items.append((arr[0], arr[1])) - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - return self.items[idx] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='extract CMVN stats') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for processing') - parser.add_argument('--train_config', - default='', - help='training yaml conf') - parser.add_argument('--in_scp', default=None, help='wav scp file') - 
parser.add_argument('--out_cmvn', - default='global_cmvn', - help='global cmvn file') - - doc = "Print log after every log_interval audios are processed." - parser.add_argument("--log_interval", type=int, default=1000, help=doc) - args = parser.parse_args() - - with open(args.train_config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - resample_rate = 0 - if 'resample_conf' in configs['dataset_conf']: - resample_rate = configs['dataset_conf']['resample_conf']['resample_rate'] - print('using resample and new sample rate is {}'.format(resample_rate)) - - collate_func = CollateFunc(feat_dim, resample_rate) - dataset = AudioDataset(args.in_scp) - batch_size = 20 - data_loader = DataLoader(dataset, - batch_size=batch_size, - shuffle=True, - sampler=None, - num_workers=args.num_workers, - collate_fn=collate_func) - - with torch.no_grad(): - all_number = 0 - all_mean_stat = torch.zeros(feat_dim) - all_var_stat = torch.zeros(feat_dim) - wav_number = 0 - for i, batch in enumerate(data_loader): - number, mean_stat, var_stat = batch - all_mean_stat += mean_stat - all_var_stat += var_stat - all_number += number - wav_number += batch_size - - if wav_number % args.log_interval == 0: - print(f'processed {wav_number} wavs, {all_number} frames', - file=sys.stderr, - flush=True) - - cmvn_info = { - 'mean_stat': list(all_mean_stat.tolist()), - 'var_stat': list(all_var_stat.tolist()), - 'frame_num': all_number - } - - with open(args.out_cmvn, 'w') as fout: - fout.write(json.dumps(cmvn_info)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/compute_fbank_feats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/compute_fbank_feats.py deleted file mode 100644 index 4cc7dae54de6e8b24b14148bd3930d19b4d7b28c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/compute_fbank_feats.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging - -import torchaudio -import torchaudio.compliance.kaldi as kaldi - -import wenet.dataset.kaldi_io as kaldi_io - -# The "sox" backends are deprecated and will be removed in 0.9.0 release. 
-# So here we use sox_io backend -torchaudio.set_audio_backend("sox_io") - - -def parse_opts(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--num_mel_bins', - default=80, - type=int, - help='Number of triangular mel-frequency bins') - parser.add_argument('--frame_length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument('--frame_shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument('--dither', - type=int, - default=0.0, - help='Dithering constant (0.0 means no dither)') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_scp', help='wav scp file') - parser.add_argument('out_ark', help='output ark file') - parser.add_argument('out_scp', help='output scp file') - args = parser.parse_args() - return args - - -# wav format: -def load_wav_scp(wav_scp_file): - wav_list = [] - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_list.append((arr[0], arr[1])) - return wav_list - - -# wav format: -def load_wav_scp_dict(wav_scp_file): - wav_dict = {} - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_dict[arr[0]] = arr[1] - return wav_dict - - -# Segments format: -def load_wav_segments(wav_scp_file, segments_file): - wav_dict = load_wav_scp_dict(wav_scp_file) - audio_list = [] - with open(segments_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - key = arr[0] - wav_file = wav_dict[arr[1]] - start = float(arr[2]) - end = float(arr[3]) - audio_list.append((key, wav_file, start, end)) - return audio_list - - -if __name__ == '__main__': - args = parse_opts() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - if args.segments is None: - audio_list = load_wav_scp(args.wav_scp) - else: - audio_list = load_wav_segments(args.wav_scp, args.segments) - - count = 0 - with open(args.out_ark, 'wb') as ark_fout, \ - open(args.out_scp, 'w', encoding='utf8') as scp_fout: - for item in audio_list: - if len(item) == 2: - key, wav_path = item - waveform, sample_rate = torchaudio.load_wav(wav_path) - else: - assert len(item) == 4 - key, wav_path, start, end = item - sample_rate = torchaudio.info(wav_path).sample_rate - frame_offset = int(start * sample_rate) - num_frames = int((end - start) * sample_rate) - waveform, sample_rate = torchaudio.load_wav( - wav_path, frame_offset, num_frames) - - mat = kaldi.fbank(waveform, - num_mel_bins=args.num_mel_bins, - frame_length=args.frame_length, - frame_shift=args.frame_shift, - dither=args.dither, - energy_floor=0.0, - sample_frequency=sample_rate) - mat = mat.detach().numpy() - kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout) - count += 1 - if count % 10000 == 0: - logging.info('Progress {}/{}'.format(count, len(audio_list))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/copy_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/copy_data_dir.sh deleted file mode 100644 index ee880c4c3ca398a58a4e306467c639b0a76310bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/copy_data_dir.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins University 
(author: Daniel Povey)
-# Apache 2.0
-
-# This script operates on a directory, such as in data/train/,
-# that contains some subset of the following files:
-#  feats.scp
-#  wav.scp
-#  vad.scp
-#  spk2utt
-#  utt2spk
-#  text
-#
-# It copies to another directory, possibly adding a specified prefix or a suffix
-# to the utterance and/or speaker names. Note, the recording-ids stay the same.
-#
-
-
-# begin configuration section
-spk_prefix=
-utt_prefix=
-spk_suffix=
-utt_suffix=
-validate_opts=   # should rarely be needed.
-# end configuration section
-
-. utils/parse_options.sh
-
-if [ $# != 2 ]; then
-  echo "Usage: "
-  echo "  $0 [options] <srcdir> <destdir>"
-  echo "e.g.:"
-  echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1"
-  echo "Options"
-  echo "   --spk-prefix=<prefix>     # Prefix for speaker ids, default empty"
-  echo "   --utt-prefix=<prefix>     # Prefix for utterance ids, default empty"
-  echo "   --spk-suffix=<suffix>     # Suffix for speaker ids, default empty"
-  echo "   --utt-suffix=<suffix>     # Suffix for utterance ids, default empty"
-  exit 1;
-fi
-
-
-export LC_ALL=C
-
-srcdir=$1
-destdir=$2
-
-if [ ! -f $srcdir/utt2spk ]; then
-  echo "copy_data_dir.sh: no such file $srcdir/utt2spk"
-  exit 1;
-fi
-
-if [ "$destdir" == "$srcdir" ]; then
-  echo "$0: this script requires <srcdir> and <destdir> to be different."
-  exit 1
-fi
-
-set -e;
-
-mkdir -p $destdir
-
-cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map
-cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map
-
-if [ ! -f $srcdir/utt2uniq ]; then
-  if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then
-    cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq
-  fi
-else
-  cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq
-fi
-
-cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \
-  utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk
-
-utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt
-
-if [ -f $srcdir/feats.scp ]; then
-  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp
-fi
-
-if [ -f $srcdir/vad.scp ]; then
-  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp
-fi
-
-if [ -f $srcdir/segments ]; then
-  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments
-  cp $srcdir/wav.scp $destdir
-else # no segments->wav indexed by utt.
- if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/utt2num_frames ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in frame_shift stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -echo $validate_opts -echo $destdir -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/decode.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/decode.sh deleted file mode 100644 index 1d49b0e48631f4818fb9c464df66904170275a33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/decode.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: binbinzhang@mobvoi.com (Binbin Zhang) -export GLOG_logtostderr=1 -export GLOG_v=2 - -set -e - -nj=1 -chunk_size=-1 -ctc_weight=0.0 -reverse_weight=0.0 -rescoring_weight=1.0 -# For CTC WFST based decoding -fst_path= -dict_path= -acoustic_scale=1.0 -beam=15.0 -lattice_beam=12.0 -min_active=200 -max_active=7000 -blank_skip_thresh=1.0 -length_penalty=0.0 - -. tools/parse_options.sh || exit 1; -if [ $# != 5 ]; then - echo "Usage: $0 [options] " - exit 1; -fi - -if ! which decoder_main > /dev/null; then - echo "decoder_main is not built, please go to runtime/libtorch to build it." - exit 1; -fi - -scp=$1 -label_file=$2 -model_file=$3 -unit_file=$4 -dir=$5 - -mkdir -p $dir/split${nj} - -# Step 1. Split wav.scp -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp" -done -tools/data/split_scp.pl ${scp} ${split_scps} - -# Step 2. Parallel decoding -wfst_decode_opts= -if [ ! 
-z $fst_path ]; then - wfst_decode_opts="--fst_path $fst_path" - wfst_decode_opts="$wfst_decode_opts --beam $beam" - wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path" - wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam" - wfst_decode_opts="$wfst_decode_opts --max_active $max_active" - wfst_decode_opts="$wfst_decode_opts --min_active $min_active" - wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale" - wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh" - wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty" - echo $wfst_decode_opts > $dir/config -fi -for n in $(seq ${nj}); do -{ - decoder_main \ - --rescoring_weight $rescoring_weight \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --chunk_size $chunk_size \ - --wav_scp ${dir}/split${nj}/wav.${n}.scp \ - --model_path $model_file \ - --unit_path $unit_file \ - $wfst_decode_opts \ - --result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log -} & -done -wait - -# Step 3. Merge files -for n in $(seq ${nj}); do - cat ${dir}/split${nj}/${n}.text -done > ${dir}/text -tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf - -# Step 4. Compute WER -python3 tools/compute-wer.py --char=1 --v=1 \ - $label_file $dir/text > $dir/wer diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/feat_to_shape.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/feat_to_shape.sh deleted file mode 100644 index ab6d45c60709dd05a38f8da269d617233d0d39f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/feat_to_shape.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Begin configuration section. -nj=4 -cmd=run.pl -verbose=0 -filetype="" -preprocess_conf="" -# End configuration section. - -help_message=$(cat << EOF -Usage: $0 [options] [] -e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) - -echo "$0 $*" 1>&2 # Print the command line for logging - -. parse_options.sh || exit 1; - -if [ $# -lt 2 ] || [ $# -gt 3 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -scp=$1 -outscp=$2 -data=$(dirname ${scp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${logdir}/feats.${n}.scp" -done - -utils/split_scp.pl ${scp} ${split_scps} - -if [ -n "${preprocess_conf}" ]; then - preprocess_opt="--preprocess-conf ${preprocess_conf}" -else - preprocess_opt="" -fi -if [ -n "${filetype}" ]; then - filetype_opt="--filetype ${filetype}" -else - filetype_opt="" -fi - -${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ - feat-to-len --verbose=${verbose} \ - scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp - -feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -) - -# concatenate the .scp files together. 
-for n in $(seq ${nj}); do - sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp -done > ${outscp} - -rm -f ${logdir}/feats.*.scp 2>/dev/null diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/filter_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/filter_scp.pl deleted file mode 100644 index b76d37f41be0886470281978bfacf97f6b8ae976..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/filter_scp.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose "n-th" field is an utterance id), printing -# out only those lines whose "n-th" field is in id_list. The index of -# the "n-th" field is 1, by default, but can be changed by using -# the -f switch - -$exclude = 0; -$field = 1; -$shifted = 0; - -do { - $shifted=0; - if ($ARGV[0] eq "--exclude") { - $exclude = 1; - shift @ARGV; - $shifted=1; - } - if ($ARGV[0] eq "-f") { - $field = $ARGV[1]; - shift @ARGV; shift @ARGV; - $shifted=1 - } -} while ($shifted); - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . - "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . - "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . - "only the lines that were *not* in id_list.\n" . - "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . - "If your older scripts (written before Oct 2014) stopped working and you used the\n" . - "-f option, add 1 to the argument.\n" . - "See also: utils/filter_scp.pl .\n"; -} - - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -if ($field == 1) { # Treat this as special case, since it is common. - while(<>) { - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { - print $_; - } - } -} else { - while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { - print $_; - } - } -} - -# tests: -# the following should print "foo 1" -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) -# the following should print "bar 2". 
-# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fix_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fix_data_dir.sh deleted file mode 100644 index d1644c1cac4264c78eae7d91b03c4126baf7ec4c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fix_data_dir.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# This script makes sure that only the segments present in -# all of "feats.scp", "wav.scp" [if present], segments [if present] -# text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into -# data-dir/.backup - -cmd="$@" - -utt_extra_files= -spk_extra_files= - -. tools/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: utils/data/fix_data_dir.sh " - echo "e.g.: utils/data/fix_data_dir.sh data/train" - echo "This script helps ensure that the various files in a data directory" - echo "are correctly sorted and filtered, for example removing utterances" - echo "that have no features (if feats.scp is present)" - exit 1 -fi - -data=$1 - -if [ -f $data/images.scp ]; then - image/fix_data_dir.sh $cmd - exit $? -fi - -mkdir -p $data/.backup - -[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; - -[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; - -set -e -o pipefail -u - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted { - file=$1 - sort -k1,1 -u <$file >$file.tmp - if ! cmp -s $file $file.tmp; then - echo "$0: file $1 is not in sorted order or not unique, sorting it" - mv $file.tmp $file - else - rm $file.tmp - fi -} - -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - check_sorted $data/$x - fi -done - - -function filter_file { - filter=$1 - file_to_filter=$2 - cp $file_to_filter ${file_to_filter}.tmp - tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter - if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=$(cat ${file_to_filter}.tmp | wc -l) - length2=$(cat ${file_to_filter} | wc -l) - if [ $length1 -ne $length2 ]; then - echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." - fi - fi - rm $file_to_filter.tmp -} - -function filter_recordings { - # We call this once before the stage when we filter on utterance-id, and once - # after. - - if [ -f $data/segments ]; then - # We have a segments file -> we need to filter this and the file wav.scp, and - # reco2file_and_utt, if it exists, to make sure they have the same list of - # recording-ids. - - if [ ! -f $data/wav.scp ]; then - echo "$0: $data/segments exists but not $data/wav.scp" - exit 1; - fi - awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=$(cat $tmpdir/recordings | wc -l) - [ ! -s $tmpdir/recordings ] && \ - echo "Empty list of recordings (bad file $data/segments)?" 
&& exit 1; - tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp - mv $tmpdir/recordings.tmp $tmpdir/recordings - - - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - filter_file $tmpdir/recordings $data/segments - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - rm $data/segments.tmp - - filter_file $tmpdir/recordings $data/wav.scp - [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur - true - fi -} - -function filter_speakers { - # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... - tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do - f=$data/$s - if [ -f $f ]; then - filter_file $f $tmpdir/speakers - fi - done - - filter_file $tmpdir/speakers $data/spk2utt - tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - - for s in cmvn.scp spk2gender $spk_extra_files; do - f=$data/$s - if [ -f $f ]; then - filter_file $tmpdir/speakers $f - fi - done -} - -function filter_utts { - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - echo "$(cat $tmpdir/utts | wc -l)" - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; - - ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; - - ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ - echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - - if [ -f $data/utt2uniq ]; then - ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ - echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; - fi - - maybe_wav= - maybe_reco2dur= - [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. - [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - - maybe_utt2dur= - if [ -f $data/utt2dur ]; then - cat $data/utt2dur | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1 - maybe_utt2dur=utt2dur.ok - fi - - maybe_utt2num_frames= - if [ -f $data/utt2num_frames ]; then - cat $data/utt2num_frames | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1 - maybe_utt2num_frames=utt2num_frames.ok - fi - - for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do - if [ -f $data/$x ]; then - tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp - echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)" - mv $tmpdir/utts.tmp $tmpdir/utts - # echo "$tmpdir/utts" - fi - done - rm $data/utt2dur.ok 2>/dev/null || true - rm $data/utt2num_frames.ok 2>/dev/null || true - - [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ - rm $tmpdir/utts && exit 1; - - - if [ -f $data/utt2spk ]; then - new_nutts=$(cat $tmpdir/utts | wc -l) - old_nutts=$(cat $data/utt2spk | wc -l) - if [ $new_nutts -ne $old_nutts ]; then - echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" - else - echo "fix_data_dir.sh: kept all $old_nutts utterances." 
- fi - fi - - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then - tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x - fi - fi - done - -} - -filter_recordings -filter_speakers -filter_utts -filter_speakers -filter_recordings - -tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - -echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/flake8_hook.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/flake8_hook.py deleted file mode 100644 index bbe21bf4aa8ab460aca0eba5a24785e4d6b2c39d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -import sys - -from flake8.main import git - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/format_data.sh deleted file mode 100644 index 51f4602dfa0bac7873541c7f621ef4bb9eb29c94..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/format_data.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Mobvoi Corporation (Author: Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -echo "$0 $*" >&2 # Print the command line for logging -. ./path.sh - -nj=1 -cmd=run.pl -nlsyms="" -lang="" -feat="" -feat_type="kaldi" -oov="" -bpecode="" -allow_one_column=false -raw="" -verbose=0 -trans_type=char -filetype="" -preprocess_conf="" -category="" -out="" # If omitted, write in stdout -help_message=$(cat << EOF -Usage: $0 -e.g. $0 data/train data/lang_1char/train_units.txt -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --feat # feat.scp or feat1.scp,feat2.scp,... - --feat-type # kaldi or wav - --oov # Default: - --out # If omitted, write in stdout - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) -. tools/parse_options.sh - -if [ $# != 2 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -dir=$1 -dic=$2 -tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) -#trap 'rm -rf ${tmpdir}' EXIT - -# 1. 
Create scp files for inputs -# These are not necessary for decoding mode, and make it as an option -input= -if [ -n "${feat}" ]; then - _feat_scps=$(echo "${feat}" | tr ',' ' ' ) - read -r -a feat_scps <<< $_feat_scps - num_feats=${#feat_scps[@]} - - for (( i=1; i<=num_feats; i++ )); do - feat=${feat_scps[$((i-1))]} - mkdir -p ${tmpdir}/input_${i} - input+="input_${i} " - cat ${feat} > ${tmpdir}/input_${i}/feat.scp - - # Dump in the "legacy" style JSON format - if [ -n "${filetype}" ]; then - awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ - > ${tmpdir}/input_${i}/filetype.scp - fi - - if [ ${feat_type} == "kaldi" ]; then - tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ - --filetype "${filetype}" \ - --preprocess-conf "${preprocess_conf}" \ - --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp - elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then - if [ -f $dir/segments ]; then - # used for segmented wav.scp - awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur - fi - if [ ! -f $dir/utt2dur ]; then - tools/wav_to_duration.sh --nj ${nj} \ - ${feat} ${tmpdir}/input_${i}/shape.scp - # use the existed utt2dur as shape.scp directly - else - cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp - fi - fi - done -fi - -# 2. Create scp files for outputs -mkdir -p ${tmpdir}/output -if [ -n "${bpecode}" ]; then - if [ "${trans_type}" == "cn_char_en_bpe" ]; then - tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp - else - paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \ - | tools/spm_encode --model=${bpecode} --output_format=piece) \ - > ${tmpdir}/output/token.scp - fi -elif [ -n "${nlsyms}" ]; then - tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -elif [ -n "${raw}" ]; then - cat $dir/text > ${tmpdir}/output/token.scp -else - tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -fi -< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp -odim=$(cat ${dic} | wc -l) -< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp - -cat ${dir}/text > ${tmpdir}/output/text.scp - -# 3. Create scp files for the others -mkdir -p ${tmpdir}/other -if [ -n "${lang}" ]; then - awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp -fi - -if [ -n "${category}" ]; then - awk -v category=${category} '{print $1 " " category}' ${dir}/text \ - > ${tmpdir}/other/category.scp -fi -#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp - -# 4. 
Merge scp files into a one file -opts="" -for intype in ${input} output other; do - if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then - continue - fi - - if [ ${intype} != other ]; then - opts+="--${intype%_*}-scps " - else - opts+="--scps " - fi - - for x in "${tmpdir}/${intype}"/*.scp; do - k=$(basename ${x} .scp) - if [ ${k} = shape ]; then - opts+="shape:${x}:shape " - else - opts+="${k}:${x} " - fi - done -done - -if ${allow_one_column}; then - opts+="--allow-one-column true " -else - opts+="--allow-one-column false " -fi - -if [ -n "${out}" ]; then - opts+="-O ${out}" -fi - -tools/merge_scp2txt.py --verbose ${verbose} ${opts} - -#rm -fr ${tmpdir} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/add_lex_disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/add_lex_disambig.pl deleted file mode 100644 index dd8a25de6e1140a6d19b1e876f2e76f528532edf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/add_lex_disambig.pl +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2013-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. -# With the --pron-probs option, expects the second field -# of each lexicon line to be a pron-prob. -# With the --sil-probs option, expects three additional -# fields after the pron-prob, representing various components -# of the silence probability model. - -$pron_probs = 0; -$sil_probs = 0; -$first_allowed_disambig = 1; - -for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { - if ($ARGV[0] eq "--pron-probs") { - $pron_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--sil-probs") { - $sil_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--first-allowed-disambig") { - $first_allowed_disambig = 0 + $ARGV[1]; - if ($first_allowed_disambig < 1) { - die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; - } - shift @ARGV; - shift @ARGV; - } -} - -if (@ARGV != 2) { - die "Usage: add_lex_disambig.pl [opts] \n" . - "This script adds disambiguation symbols to a lexicon in order to\n" . - "make decoding graphs determinizable; it adds pseudo-phone\n" . - "disambiguation symbols #1, #2 and so on at the ends of phones\n" . - "to ensure that all pronunciations are different, and that none\n" . - "is a prefix of another.\n" . - "It prints to the standard output the number of the largest-numbered" . - "disambiguation symbol that was used.\n" . - "\n" . 
- "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . - " --sil-probs [should be with --pron-probs option]\n" . - " Expect 3 extra fields after the pron-probs, for aspects of\n" . - " the silence probability model\n" . - " --first-allowed-disambig The number of the first disambiguation symbol\n" . - " that this script is allowed to add. By default this is\n" . - " #1, but you can set this to a larger value using this option.\n" . - "e.g.:\n" . - " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { - $p = shift @A; - if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } - } - if ($sil_probs) { - $silp = shift @A; - if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - } - if (!(@A)) { - die "Bad lexicon line $1, no phone in phone list"; - } - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that it exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { shift @A; } # remove pron-prob. - if ($sil_probs) { - shift @A; # Remove silprob - shift @A; # Remove silprob - } - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -# max_disambig will always be the highest-numbered disambiguation symbol that -# has been used so far. -$max_disambig = $first_allowed_disambig - 1; - -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - if ($pron_probs) { - $pron_prob = shift @A; - } - if ($sil_probs) { - $sil_word_prob = shift @A; - $word_sil_correction = shift @A; - $prev_nonsil_correction = shift @A - } - $phnseq = join(" ", @A); - if (!defined $issubseq{$phnseq} - && $count{$phnseq} == 1) { - ; # Do nothing. - } else { - if ($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved_for_the_empty_string{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; - if (!defined $cur_disambig) { - $cur_disambig = $first_allowed_disambig; - } else { - $cur_disambig++; # Get a number that has not been used yet for - # this phone sequence. 
- } - while (defined $reserved_for_the_empty_string{$cur_disambig}) { - $cur_disambig++; - } - if ($cur_disambig > $max_disambig) { - $max_disambig = $cur_disambig; - } - $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; - $phnseq = $phnseq . " #" . $cur_disambig; - } - } - if ($pron_probs) { - if ($sil_probs) { - print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; - } else { - print O "$word\t$pron_prob\t$phnseq\n"; - } - } else { - print O "$word\t$phnseq\n"; - } -} - -print $max_disambig . "\n"; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/compile_lexicon_token_fst.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/compile_lexicon_token_fst.sh deleted file mode 100644 index b67814fe3f3244b14b8e494bfe46c4829c4f8bd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/compile_lexicon_token_fst.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -# Copyright 2015 Yajie Miao (Carnegie Mellon University) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the -# phoneme and character-based lexicons. -set -eo pipefail -. tools/parse_options.sh - -if [ $# -ne 3 ]; then - echo "usage: tools/fst/compile_lexicon_token_fst.sh " - echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" - echo " should contain the following files:" - echo "lexicon.txt units.txt" - echo "options: " - exit 1; -fi - -srcdir=$1 -tmpdir=$2 -dir=$3 -mkdir -p $dir $tmpdir - -[ -f path.sh ] && . ./path.sh - -export LC_ALL=C - -cp $srcdir/units.txt $dir - -# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. -# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. -perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; - -# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. -# Without these symbols, determinization will fail. -ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` -ndisambig=$[$ndisambig+1]; - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list - -# Get the full list of CTC tokens used in FST. These tokens include , the blank , -# the actual model unit, and the disambiguation symbols. 
-cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list -(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt - -# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, -# so here use ctc_token_fst_compact -tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; - -# Encode the words with indices. Will be used in lexicon and language model FST compiling. -cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' - BEGIN { - print " 0"; - } - { - printf("%s %d\n", $1, NR); - } - END { - printf("#0 %d\n", NR+1); - printf(" %d\n", NR+2); - printf(" %d\n", NR+3); - }' > $dir/words.txt || exit 1; - -# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time. -token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` - -tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ - fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; - -echo "Lexicon and token FSTs compiling succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/ctc_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/ctc_token_fst.py deleted file mode 100644 index d81644b9cd216177a10a17772781d3293abe084f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/ctc_token_fst.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 1 ') -print('1 1 ') -print('2 2 ') -print('2 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 3 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(1, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 2, '', '')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/ctc_token_fst_compact.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/ctc_token_fst_compact.py deleted file mode 100644 index d3018d8b14ce25108cb1acc637cecded5d41be13..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/ctc_token_fst_compact.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 1 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 0, '', '')) - node 
+= 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/ctc_token_fst_corrected.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/ctc_token_fst_corrected.py deleted file mode 100644 index 81f7079eccb9e6447c46cdfdf6378aca7efe4a09..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/ctc_token_fst_corrected.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python - -import sys - - -def il(n): - return n + 1 - - -def ol(n): - return n + 1 - - -def s(n): - return n - - -if __name__ == "__main__": - with open(sys.argv[1]) as f: - lines = f.readlines() - phone_count = 0 - disambig_count = 0 - for line in lines: - sp = line.split() - phone = sp[0] - if phone == '' or phone == '': - continue - if phone.startswith('#'): - disambig_count += 1 - else: - phone_count += 1 - - # 1. add start state - print('0 0 {} 0'.format(il(0))) - - # 2. 0 -> i, i -> i, i -> 0 - for i in range(1, phone_count + 1): - print('0 {} {} {}'.format(s(i), il(i), ol(i))) - print('{} {} {} 0'.format(s(i), s(i), il(i))) - print('{} 0 {} 0'.format(s(i), il(0))) - - # 3. i -> other phone - for i in range(1, phone_count + 1): - for j in range(1, phone_count + 1): - if i != j: - print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) - - # 4. add disambiguous arcs on every final state - for i in range(0, phone_count + 1): - for j in range(phone_count + 2, phone_count + disambig_count + 2): - print('{} {} {} {}'.format(s(i), s(i), 0, j)) - - # 5. every i is final state - for i in range(0, phone_count + 1): - print(s(i)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/eps2disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/eps2disambig.pl deleted file mode 100644 index e1d84a6bf56703596a0e4552d184f7168f724bcb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/eps2disambig.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - if (/\s+#0\s+/) { - print STDERR "$0: ERROR: LM has word #0, " . 
- "which is reserved as disambiguation symbol\n"; - exit 1; - } - s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; - print; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/make_lexicon_fst.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/make_lexicon_fst.pl deleted file mode 100644 index f97129c05cb3ba6460be401e92001261acfaf746..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/make_lexicon_fst.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). - -$pron_probs = 0; - -if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { - $pron_probs = 1; - shift @ARGV; -} - -if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; - print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; - print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; - print STDERR " word phone1 phone2 ... phoneN;\n"; - print STDERR "if the --pron-probs option is used, each line is:\n"; - print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; - print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; - print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; - print STDERR "this is your responsibility.\n\n"; - print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; - print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; - print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; - exit(1); -} - -$lexfn = shift @ARGV; -if (@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2) { - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if ($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - -if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. - while () { - @A = split(" ", $_); - @A == 0 && die "Empty lexicon line."; - foreach $a (@A) { - if ($a eq "") { - die "Bad lexicon line $_ ( is forbidden)"; - } - } - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! 
defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; # so we only print it on the first arc of the word. - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. - if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while () { - @A = split(" ", $_); - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. - $s = $ns; - } elsif (!defined($silphone) || $p ne $silphone) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/make_tlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/make_tlg.sh deleted file mode 100644 index 98694e5540968760f0c27eaf30a6668f4c46c50d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/make_tlg.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# - -if [ -f path.sh ]; then . path.sh; fi - -lm_dir=$1 -src_lang=$2 -tgt_lang=$3 - -arpa_lm=${lm_dir}/lm.arpa -[ ! 
-f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -rm -rf $tgt_lang -cp -r $src_lang $tgt_lang - -# Compose the language model to FST -cat $arpa_lm | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v -i '' | \ - grep -v -i '' | \ - arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \ - tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \ - --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst - - -echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic $tgt_lang/G.fst - -# Compose the token, lexicon and language-model FST into the final decoding graph -fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1; -fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1; - -echo "Composing decoding graph TLG.fst succeeded" -#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/prepare_dict.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/prepare_dict.py deleted file mode 100644 index 8a6a3cfe7cfded0c863637deef0bae2f2ede5557..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/prepare_dict.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -# sys.argv[1]: e2e model unit file(lang_char.txt) -# sys.argv[2]: raw lexicon file -# sys.argv[3]: output lexicon file -# sys.argv[4]: bpemodel - -unit_table = set() -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for line in fin: - unit = line.split()[0] - unit_table.add(unit) - - -def contain_oov(units): - for unit in units: - if unit not in unit_table: - return True - return False - - -bpemode = len(sys.argv) > 4 -if bpemode: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.Load(sys.argv[4]) -lexicon_table = set() -with open(sys.argv[2], 'r', encoding='utf8') as fin, \ - open(sys.argv[3], 'w', encoding='utf8') as fout: - for line in fin: - word = line.split()[0] - if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel - continue - elif word == '': - continue - else: - # each word only has one pronunciation for e2e system - if word in lexicon_table: - continue - if bpemode: - # We assume that the lexicon does not contain code-switch, - # i.e. the word contains both English and Chinese. - # see PR https://github.com/wenet-e2e/wenet/pull/1693 - # and Issue https://github.com/wenet-e2e/wenet/issues/1653 - if word.encode('utf8').isalpha(): - pieces = sp.EncodeAsPieces(word) - else: - pieces = word - if contain_oov(pieces): - print( - 'Ignoring words {}, which contains oov unit'.format( - ''.join(word).strip('▁')) - ) - continue - chars = ' '.join( - [p if p in unit_table else '' for p in pieces]) - else: - # ignore words with OOV - if contain_oov(word): - print('Ignoring words {}, which contains oov unit'.format(word)) - continue - # Optional, append ▁ in front of english word - # we assume the model unit of our e2e system is char now. 
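For the character-based (non-BPE) branch described in the comments above, each kept word is mapped to its space-separated characters, and a purely alphabetic (English) word gets a leading ▁ when ▁ is part of the unit inventory. The sketch below illustrates only that branch; the unit inventory and word list are toy values, not taken from the recipe.

```python
# Sketch of the char-based branch: alphabetic (English) words get a leading ▁
# when the unit inventory contains it, then every word is split into characters.
# Toy unit inventory and words, for illustration only.
unit_table = {"▁", "你", "好", "o", "k"}

def to_units(word):
    if word.encode("utf8").isalpha() and "▁" in unit_table:
        word = "▁" + word
    return word, " ".join(word)

for w in ["你好", "ok"]:
    word, chars = to_units(w)
    print(word, chars)   # -> "你好 你 好" and "▁ok ▁ o k"
```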
- if word.encode('utf8').isalpha() and '▁' in unit_table: - word = '▁' + word - chars = ' '.join(word) # word is a char list - fout.write('{} {}\n'.format(word, chars)) - lexicon_table.add(word) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/remove_oovs.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/remove_oovs.pl deleted file mode 100644 index ac914c3bd9363eded791cdeb309fd05e980c4f2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/remove_oovs.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script removes lines that contain these OOVs on either the -# third or fourth fields of the line. It is intended to remove arcs -# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). - -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; -} - -$unklist = shift @ARGV; -open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; -while(){ - @A = split(" ", $_); - @A == 1 || die "Bad line in unknown-symbol list: $_"; - $unk{$A[0]} = 1; -} - -$num_removed = 0; -while(<>){ - @A = split(" ", $_); - if(defined $unk{$A[2]} || defined $unk{$A[3]}) { - $num_removed++; - } else { - print; - } -} -print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/rnnt_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/rnnt_token_fst.py deleted file mode 100644 index cc6def1703311ab700a4a01f22c1adda32db9b0d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/rnnt_token_fst.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, 0, phone, phone)) -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/s2eps.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/s2eps.pl deleted file mode 100644 index ffeeb8eb6af3c4f319f31ebff80be388d8f59e1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/fst/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you 
may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces and with (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } - if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } - } - print join("\t", @A) . "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/git-pre-commit b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/git-pre-commit deleted file mode 100644 index b6e448ed375a0ddf502ce332685de8a99e88dc08..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/git-pre-commit +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -e - -echo "Running pre-commit flake8" -python tools/flake8_hook.py diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/install_srilm.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/install_srilm.sh deleted file mode 100644 index 4aa113c14722a73fd3d3f84430025d44173c207b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/install_srilm.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2022 Binbin Zhang(binbzha@qq.com) - -current_path=`pwd` -current_dir=`basename "$current_path"` - -if [ "tools" != "$current_dir" ]; then - echo "You should run this script in tools/ directory!!" - exit 1 -fi - -! command -v gawk > /dev/null && \ - echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; - -srilm_url="https://github.com/BitSpeech/SRILM/archive/refs/tags/1.7.3.tar.gz" - -if [ ! -f ./srilm.tar.gz ]; then - if ! wget -O ./srilm.tar.gz "$srilm_url"; then - echo 'There was a problem downloading the file.' - echo 'Check you internet connection and try again.' - exit 1 - fi -fi - -tar -zxvf srilm.tar.gz -mv SRILM-1.7.3 srilm - -# set the SRILM variable in the top-level Makefile to this directory. -cd srilm -cp Makefile tmpf - -cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \ - > Makefile || exit 1 -rm tmpf - -make || exit -cd .. - -( - [ ! -z "${SRILM}" ] && \ - echo >&2 "SRILM variable is aleady defined. Undefining..." && \ - unset SRILM - - [ -f ./env.sh ] && . ./env.sh - - [ ! 
-z "${SRILM}" ] && \ - echo >&2 "SRILM config is already in env.sh" && exit - - wd=`pwd` - wd=`readlink -f $wd || pwd` - - echo "export SRILM=$wd/srilm" - dirs="\${PATH}" - for directory in $(cd srilm && find bin -type d ) ; do - dirs="$dirs:\${SRILM}/$directory" - done - echo "export PATH=$dirs" -) >> env.sh - -echo >&2 "Installation of SRILM finished successfully" -echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/k2/make_hlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/k2/make_hlg.sh deleted file mode 100644 index 18c2268487410824ae11b199cf06f37acd717c88..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/k2/make_hlg.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) - -lexion_dir=$1 -lm_dir=$2 -tgt_dir=$3 - -# k2 and icefall updates very fast. Below commits are veryfied in this script. -# k2 3dc222f981b9fdbc8061b3782c3b385514a2d444, icefall 499ac24ecba64f687ff244c7d66baa5c222ecf0f - -# For k2 installation, please refer to https://github.com/k2-fsa/k2/ -python -c "import k2; print(k2.__file__)" -python -c "import torch; import _k2; print(_k2.__file__)" - -# Prepare necessary icefall scripts -if [ ! -d tools/k2/icefall ]; then - git clone --depth 1 https://github.com/k2-fsa/icefall.git tools/k2/icefall -fi -pip3 install -r tools/k2/icefall/requirements.txt -export PYTHONPATH=`pwd`/tools/k2/icefall:`pwd`/tools/k2/icefall/egs/aishell/ASR/local:$PYTHONPATH - -# 8.1 Prepare char based lang -mkdir -p $tgt_dir -python tools/k2/prepare_char.py $lexion_dir/units.txt $lm_dir/wordlist $tgt_dir -echo "Compile lexicon L.pt L_disambig.pt succeeded" - -# 8.2 Prepare G -mkdir -p data/lm -python -m kaldilm \ - --read-symbol-table="$tgt_dir/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $lm_dir/lm.arpa > data/lm/G_3_gram.fst.txt - -# 8.3 Compile HLG -python tools/k2/icefall/egs/aishell/ASR/local/compile_hlg.py --lang-dir $tgt_dir -echo "Compile decoding graph HLG.pt succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/k2/prepare_char.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/k2/prepare_char.py deleted file mode 100644 index 6e05042c42eb280135f6be7cdb3566b185258b90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/k2/prepare_char.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -""" - -This script generates the following files in the directory sys.argv[3]: - - - lexicon.txt - - lexicon_disambig.txt - - L.pt - - L_disambig.pt - - tokens.txt - - words.txt -""" - -import sys -from pathlib import Path -from typing import Dict, List - -import k2 -import torch -from prepare_lang import ( - Lexicon, - add_disambig_symbols, - add_self_loops, - write_lexicon, - write_mapping, -) - - -def lexicon_to_fst_no_sil( - lexicon: Lexicon, - token2id: Dict[str, int], - word2id: Dict[str, int], - need_self_loops: bool = False, -) -> k2.Fsa: - """Convert a lexicon to an FST (in k2 format). - - Args: - lexicon: - The input lexicon. See also :func:`read_lexicon` - token2id: - A dict mapping tokens to IDs. - word2id: - A dict mapping words to IDs. - need_self_loops: - If True, add self-loop to states with non-epsilon output symbols - on at least one arc out of the state. The input label for this - self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. - Returns: - Return an instance of `k2.Fsa` representing the given lexicon. - """ - loop_state = 0 # words enter and leave from here - next_state = 1 # the next un-allocated state, will be incremented as we go - - arcs = [] - - # The blank symbol is defined in local/train_bpe_model.py - assert token2id[""] == 0 - assert word2id[""] == 0 - - eps = 0 - - for word, pieces in lexicon: - assert len(pieces) > 0, f"{word} has no pronunciations" - cur_state = loop_state - - word = word2id[word] - pieces = [ - token2id[i] if i in token2id else token2id[""] for i in pieces - ] - - for i in range(len(pieces) - 1): - w = word if i == 0 else eps - arcs.append([cur_state, next_state, pieces[i], w, 0]) - - cur_state = next_state - next_state += 1 - - # now for the last piece of this word - i = len(pieces) - 1 - w = word if i == 0 else eps - arcs.append([cur_state, loop_state, pieces[i], w, 0]) - - if need_self_loops: - disambig_token = token2id["#0"] - disambig_word = word2id["#0"] - arcs = add_self_loops( - arcs, - disambig_token=disambig_token, - disambig_word=disambig_word, - ) - - final_state = next_state - arcs.append([loop_state, final_state, -1, -1, 0]) - arcs.append([final_state]) - - arcs = sorted(arcs, key=lambda arc: arc[0]) - arcs = [[str(i) for i in arc] for arc in arcs] - arcs = [" ".join(arc) for arc in arcs] - arcs = "\n".join(arcs) - - fsa = k2.Fsa.from_str(arcs, acceptor=False) - return fsa - - -def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool: - """Check if all the given tokens are in token symbol table. - - Args: - token_sym_table: - Token symbol table that contains all the valid tokens. - tokens: - A list of tokens. - Returns: - Return True if there is any token not in the token_sym_table, - otherwise False. - """ - for tok in tokens: - if tok not in token_sym_table: - return True - return False - - -def generate_lexicon( - token_sym_table: Dict[str, int], words: List[str] -) -> Lexicon: - """Generate a lexicon from a word list and token_sym_table. - - Args: - token_sym_table: - Token symbol table that mapping token to token ids. - words: - A list of strings representing words. - Returns: - Return a dict whose keys are words and values are the corresponding - tokens. 
- """ - lexicon = [] - for word in words: - chars = list(word.strip(" \t")) - if contain_oov(token_sym_table, chars): - continue - lexicon.append((word, chars)) - - # The OOV word is - lexicon.append(("", [""])) - return lexicon - - -def generate_tokens(text_file: str) -> Dict[str, int]: - """Generate tokens from the given text file. - - Args: - text_file: - A file that contains text lines to generate tokens. - Returns: - Return a dict whose keys are tokens and values are token ids ranged - from 0 to len(keys) - 1. - """ - token2id: Dict[str, int] = dict() - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - char, index = line.replace('\n', '').split() - assert char not in token2id - token2id[char] = int(index) - assert token2id[''] == 0 - return token2id - - -def generate_words(text_file: str) -> Dict[str, int]: - """Generate words from the given text file. - - Args: - text_file: - A file that contains text lines to generate words. - Returns: - Return a dict whose keys are words and values are words ids ranged - from 0 to len(keys) - 1. - """ - words = [] - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - word = line.replace('\n', '') - assert word not in words - words.append(word) - words.sort() - - # We put '' '' at begining of word2id - # '#0', '', '' at end of word2id - words = [word for word in words - if word not in ['', '', '#0', '', '']] - words.insert(0, '') - words.insert(1, '') - words.append('#0') - words.append('') - words.append('') - word2id = {j: i for i, j in enumerate(words)} - return word2id - - -def main(): - token2id = generate_tokens(sys.argv[1]) - word2id = generate_words(sys.argv[2]) - tgt_dir = Path(sys.argv[3]) - - words = [word for word in word2id.keys() - if word not in - ["", "!SIL", "", "", "#0", "", ""]] - lexicon = generate_lexicon(token2id, words) - - lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) - next_token_id = max(token2id.values()) + 1 - for i in range(max_disambig + 1): - disambig = f"#{i}" - assert disambig not in token2id - token2id[disambig] = next_token_id - next_token_id += 1 - - write_mapping(tgt_dir / "tokens.txt", token2id) - write_mapping(tgt_dir / "words.txt", word2id) - write_lexicon(tgt_dir / "lexicon.txt", lexicon) - write_lexicon(tgt_dir / "lexicon_disambig.txt", lexicon_disambig) - - L = lexicon_to_fst_no_sil( - lexicon, - token2id=token2id, - word2id=word2id, - ) - L_disambig = lexicon_to_fst_no_sil( - lexicon_disambig, - token2id=token2id, - word2id=word2id, - need_self_loops=True, - ) - torch.save(L.as_dict(), tgt_dir / "L.pt") - torch.save(L_disambig.as_dict(), tgt_dir / "L_disambig.pt") - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/latency_metrics.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/latency_metrics.py deleted file mode 100644 index df2d8eee45f8e2d7c8536f208d44fafaeac3341f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/latency_metrics.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2022 Horizon Inc. (author: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import argparse -import logging -import librosa -import torch -import torchaudio -import yaml - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.font_manager as fm -import torchaudio.compliance.kaldi as kaldi - -from wenet.utils.init_model import init_model -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.mask import make_pad_mask -from wenet.utils.common import replace_duplicates_with_blank - - -def get_args(): - parser = argparse.ArgumentParser( - description='Analyze latency and plot CTC-Spike.') - parser.add_argument('--config', required=True, - type=str, help='configration') - parser.add_argument('--gpu', - type=int, - default=0, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--ckpt', required=True, - type=str, help='model checkpoint') - parser.add_argument('--tag', required=True, - type=str, help='image subtitle') - parser.add_argument('--wavscp', required=True, - type=str, help='wav.scp') - parser.add_argument('--alignment', required=True, - type=str, help='force alignment, generated by Kaldi.') - parser.add_argument('--chunk_size', required=True, - type=int, help='chunk size') - parser.add_argument('--left_chunks', default=-1, - type=int, help='left chunks') - parser.add_argument('--font', required=True, - type=str, help='font file') - parser.add_argument('--dict', required=True, - type=str, help='dict file') - parser.add_argument('--result_dir', required=True, - type=str, help='saving pdf') - parser.add_argument('--model_type', default='ctc', - choices=['ctc', 'transducer'], - help='show latency metrics from ctc models or rnn-t models') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - torch.manual_seed(777) - - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - symbol_table = read_symbol_table(args.dict) - char_dict = {v: k for k, v in symbol_table.items()} - - # 1. Load model - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - - model = init_model(conf) - load_checkpoint(model, args.ckpt) - model = model.eval().to(device) - - subsampling = model.encoder.embed.subsampling_rate - eos = model.eos_symbol() - - with open(args.wavscp, 'r') as fin: - wavs = fin.readlines() - - # 2. 
Forward model (get streaming_timestamps) - timestamps = {} - for idx, wav in enumerate(wavs): - if idx % 100 == 0: - logging.info("processed {}.".format(idx)) - key, wav = wav.strip().split(' ', 1) - waveform, sr = torchaudio.load(wav) - resample_rate = conf['dataset_conf']['resample_conf']['resample_rate'] - waveform = torchaudio.transforms.Resample( - orig_freq=sr, new_freq=resample_rate)(waveform) - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank( - waveform, - num_mel_bins=conf['dataset_conf']['fbank_conf']['num_mel_bins'], - frame_length=conf['dataset_conf']['fbank_conf']['frame_length'], - frame_shift=conf['dataset_conf']['fbank_conf']['frame_shift'], - dither=0.0, energy_floor=0.0, - sample_frequency=resample_rate, - ) - - speech = mat.unsqueeze(0).to(device) - speech_lengths = torch.tensor([mat.size(0)]).to(device) - - # Let's assume batch_size = 1 - encoder_out, encoder_mask = model.encoder( - speech, speech_lengths, args.chunk_size, args.left_chunks) - - maxlen = encoder_out.size(1) # (B, maxlen, encoder_dim) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # CTC greedy search - if args.model_type == 'ctc': - ctc_probs = model.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - topk_prob = topk_prob.view(1, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen) - topk_prob = topk_prob.masked_fill_(mask, 0.0) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - hyps = [replace_duplicates_with_blank(hyp) for hyp in hyps] - scores = [prob.tolist() for prob in topk_prob] - timestamps[key] = [hyps[0], scores[0], wav] - - if args.model_type == 'transducer': - hyps = [] - scores = [] - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = 1 - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, - padding, cache) - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - scores.append(torch.max(joint_out_probs).item()) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or \ - per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - hyps.append(model.blank) - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - timestamps[key] = [hyps, scores, wav] - - # 3. 
Analyze latency - with open(args.alignment, 'r') as fin: - aligns = fin.readlines() - not_found, len_unequal, ignored = 0, 0, 0 - datas = [] - for align in aligns: - key, align = align.strip().split(' ', 1) - if key not in timestamps: - not_found += 1 - continue - fa, st = [], [] # force_alignment, streaming_timestamps - text_fa, text_st = "", "" - for i, token in enumerate(align.split()): - if token != '': - text_fa += token - # NOTE(xcsong): W/O subsample - fa.append(i * 10) - # ignore alignment_errors >= 70ms - frames_fa = len(align.split()) - frames_st = len(timestamps[key][0]) * subsampling - if abs(frames_st - frames_fa) >= 7: - ignored += 1 - continue - for i, token_id in enumerate(timestamps[key][0]): - if token_id != 0: - text_st += char_dict[token_id] - # NOTE(xcsong): W subsample - st.append(i * subsampling * 10) - if len(fa) != len(st): - len_unequal += 1 - continue - # datas[i] = [key, text_fa, text_st, list_of_diff, - # FirstTokenDelay, LastTokenDelay, AvgTokenDelay, - # streaming_timestamps, force_alignment] - datas.append([key, text_fa, text_st, - [a - b for a, b in zip(st, fa)], - st[0] - fa[0], st[-1] - fa[-1], - (sum(st) - sum(fa)) / len(st), - timestamps[key], align.split()]) - - logging.info("not found: {}, length unequal: {}, ignored: {}, \ - valid samples: {}".format(not_found, len_unequal, ignored, len(datas))) - - # 4. Plot and print - num_datas = len(datas) - names = ['FirstTokenDelay', 'LastTokenDelay', 'AvgTokenDelay'] - names_index = [4, 5, 6] - parts = ['max', 'P90', 'P75', 'P50', 'P25', 'min'] - parts_index = [num_datas - 1, int(num_datas * 0.90), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - for name, name_idx in zip(names, names_index): - def f(name_idx=name_idx): - return name_idx - datas.sort(key=lambda x: x[f()]) - logging.info("==========================") - for p, i in zip(parts, parts_index): - data = datas[i] - # i.e., LastTokenDelay P90: 270.000 ms (wav_id: BAC009S0902W0144) - logging.info("{} {}: {:.3f} ms (wav_id: {})".format( - name, p, data[f()], datas[i][0])) - - font = fm.FontProperties(fname=args.font) - plt.rcParams['axes.unicode_minus'] = False - # we will have 2 sub-plots (force-align + streaming timestamps) - # plus one wav-plot - fig, axes = plt.subplots(figsize=(60, 60), nrows=3, ncols=1) - for j in range(2): - if j == 0: - # subplot-0: streaming_timestamps - plt_prefix = args.tag + "_" + name + "_" + p - x = np.arange(len(data[7][0])) * subsampling - hyps, scores = data[7][0], data[7][1] - else: - # subplot-1: force_alignments - plt_prefix = "force_alignment" - x = np.arange(len(data[8])) - hyps = [symbol_table[d] for d in data[8]] - scores = [0.0] * len(data[8]) - axes[j].set_title(plt_prefix, fontsize=30) - for frame, token, prob in zip(x, hyps, scores): - if char_dict[token] != '': - axes[j].bar( - frame, np.exp(prob), - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].text( - frame, np.exp(prob), - '{} {:.3f} {}'.format( - char_dict[token], np.exp(prob), frame), - fontdict=dict(fontsize=24), - fontproperties=font, - ) - else: - axes[j].bar( - frame, 0.01, - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].tick_params(labelsize=25) - - # subplot-2: wav - # wav, hardcode sample_rate to 16000 - samples, sr = librosa.load(data[7][2], sr=16000) - time = np.arange(0, len(samples)) * (1.0 / sr) - axes[-1].plot(time, samples) - - # i.e., RESULT_DIR/LTD_P90_120ms_BAC009S0768W0342.pdf - plt.savefig(args.result_dir + "/" + name + "_" + - p + "_" + str(data[f()]) 
+ "ms" + "_" + data[0] + ".pdf") - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/make_raw_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/make_raw_list.py deleted file mode 100644 index 2f84f015542bb38da027b8ea61e8638f873cec33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/make_raw_list.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('output_file', help='output list file') - args = parser.parse_args() - - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - if args.segments is not None: - segments_table = {} - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - with open(args.text_file, 'r', encoding='utf8') as fin, \ - open(args.output_file, 'w', encoding='utf8') as fout: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if args.segments is None: - assert key in wav_table - wav = wav_table[key] - line = dict(key=key, wav=wav, txt=txt) - else: - assert key in segments_table - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - line = dict(key=key, wav=wav, txt=txt, start=start, end=end) - json_line = json.dumps(line, ensure_ascii=False) - fout.write(json_line + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/make_shard_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/make_shard_list.py deleted file mode 100644 index 1f7d82829808c9cc181bbc5e0f60cccef8795bae..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/make_shard_list.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import io -import logging -import os -import tarfile -import time -import multiprocessing - -import torch -import torchaudio -import torchaudio.backend.sox_io_backend as sox - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def write_tar_file(data_list, - no_segments, - tar_file, - resample=16000, - index=0, - total=1): - logging.info('Processing {} {}/{}'.format(tar_file, index, total)) - read_time = 0.0 - save_time = 0.0 - write_time = 0.0 - with tarfile.open(tar_file, "w") as tar: - prev_wav = None - for item in data_list: - if no_segments: - key, txt, wav = item - else: - key, txt, wav, start, end = item - - suffix = wav.split('.')[-1] - assert suffix in AUDIO_FORMAT_SETS - if no_segments: - ts = time.time() - with open(wav, 'rb') as fin: - data = fin.read() - read_time += (time.time() - ts) - else: - if wav != prev_wav: - ts = time.time() - waveforms, sample_rate = sox.load(wav, normalize=False) - read_time += (time.time() - ts) - prev_wav = wav - start = int(start * sample_rate) - end = int(end * sample_rate) - audio = waveforms[:1, start:end] - - # resample - if sample_rate != resample: - if not audio.is_floating_point(): - # normalize the audio before resample - # because resample can't process int audio - audio = audio / (1 << 15) - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - audio = (audio * (1 << 15)).short() - else: - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - - ts = time.time() - f = io.BytesIO() - sox.save(f, audio, resample, format="wav", bits_per_sample=16) - # Save to wav for segments file - suffix = "wav" - f.seek(0) - data = f.read() - save_time += (time.time() - ts) - - assert isinstance(txt, str) - ts = time.time() - txt_file = key + '.txt' - txt = txt.encode('utf8') - txt_data = io.BytesIO(txt) - txt_info = tarfile.TarInfo(txt_file) - txt_info.size = len(txt) - tar.addfile(txt_info, txt_data) - - wav_file = key + '.' 
+ suffix - wav_data = io.BytesIO(data) - wav_info = tarfile.TarInfo(wav_file) - wav_info.size = len(data) - tar.addfile(wav_info, wav_data) - write_time += (time.time() - ts) - logging.info('read {} save {} write {}'.format(read_time, save_time, - write_time)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--num_utts_per_shard', - type=int, - default=1000, - help='num utts per shard') - parser.add_argument('--num_threads', - type=int, - default=1, - help='num threads for make shards') - parser.add_argument('--prefix', - default='shards', - help='prefix of shards tar file') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('--resample', - type=int, - default=16000, - help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('shards_dir', help='output shards dir') - parser.add_argument('shards_list', help='output shards list file') - args = parser.parse_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - torch.set_num_threads(1) - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - no_segments = True - segments_table = {} - if args.segments is not None: - no_segments = False - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - data = [] - with open(args.text_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if no_segments: - assert key in wav_table - wav = wav_table[key] - data.append((key, txt, wav)) - else: - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - data.append((key, txt, wav, start, end)) - - num = args.num_utts_per_shard - chunks = [data[i:i + num] for i in range(0, len(data), num)] - os.makedirs(args.shards_dir, exist_ok=True) - - # Using thread pool to speedup - pool = multiprocessing.Pool(processes=args.num_threads) - shards_list = [] - tasks_list = [] - num_chunks = len(chunks) - for i, chunk in enumerate(chunks): - tar_file = os.path.join(args.shards_dir, - '{}_{:09d}.tar'.format(args.prefix, i)) - shards_list.append(tar_file) - pool.apply_async( - write_tar_file, - (chunk, no_segments, tar_file, args.resample, i, num_chunks)) - - pool.close() - pool.join() - - with open(args.shards_list, 'w', encoding='utf8') as fout: - for name in shards_list: - fout.write(name + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/merge_scp2txt.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/merge_scp2txt.py deleted file mode 100644 index 51f1c42f272f0fd9fec0a7d69ee860d2f1eb6158..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/merge_scp2txt.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -from distutils.util import strtobool -from io import open -import logging -import sys - -PY2 = sys.version_info[0] == 2 -sys.stdin = codecs.getreader('utf-8')(sys.stdin 
if PY2 else sys.stdin.buffer) -sys.stdout = codecs.getwriter('utf-8')( - sys.stdout if PY2 else sys.stdout.buffer) - - -# Special types: -def shape(x): - """Change str to List[int] - - >>> shape('3,5') - [3, 5] - >>> shape(' [3, 5] ') - [3, 5] - - """ - - # x: ' [3, 5] ' -> '3, 5' - x = x.strip() - if x[0] == '[': - x = x[1:] - if x[-1] == ']': - x = x[:-1] - - return list(map(int, x.split(','))) - - -def get_parser(): - parser = argparse.ArgumentParser( - description='Given each file paths with such format as ' - '::. type> can be omitted and the default ' - 'is "str". e.g. {} ' - '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' - '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' - '--output-scps text:data/text shape:data/utt2text_shape:shape ' - '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--input-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the inputs') - parser.add_argument('--output-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the outputs') - parser.add_argument('--scps', - type=str, - nargs='+', - default=[], - help='The files except for the input and outputs') - parser.add_argument('--verbose', - '-V', - default=1, - type=int, - help='Verbose option') - parser.add_argument('--allow-one-column', - type=strtobool, - default=False, - help='Allow one column in input scp files. ' - 'In this case, the value will be empty string.') - parser.add_argument('--out', - '-O', - type=str, - help='The output filename. ' - 'If omitted, then output to sys.stdout') - return parser - - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - args.scps = [args.scps] - - # logging info - logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - if args.verbose > 0: - logging.basicConfig(level=logging.INFO, format=logfmt) - else: - logging.basicConfig(level=logging.WARN, format=logfmt) - - inputs = {} - assert (len(args.input_scps) == 1) - for f in args.input_scps[0]: - arr = f.strip().split(':') - inputs[arr[0]] = arr[1] - assert ('feat' in inputs) - assert ('shape' in inputs) - - outputs = {} - assert (len(args.output_scps) == 1) - for f in args.output_scps[0]: - arr = f.strip().split(':') - outputs[arr[0]] = arr[1] - assert ('shape' in outputs) - assert ('text' in outputs) - assert ('token' in outputs) - assert ('tokenid' in outputs) - - files = [ - inputs['feat'], inputs['shape'], outputs['text'], outputs['token'], - outputs['tokenid'], outputs['shape'] - ] - fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape'] - fids = [open(f, 'r', encoding='utf-8') for f in files] - - if args.out is None: - out = sys.stdout - else: - out = open(args.out, 'w', encoding='utf-8') - done = False - while not done: - for i, fid in enumerate(fids): - line = fid.readline() - if line == '': - done = True - break - arr = line.strip().split() - content = ' '.join(arr[1:]) - if i == 0: - out.write('utt:{}'.format(arr[0])) - out.write('\t') - out.write('{}:{}'.format(fields[i], content)) - out.write('\n') - - for f in fids: - f.close() - if args.out is not None: - out.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/onnx2horizonbin.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/onnx2horizonbin.py deleted file mode 100644 index 
a94b647fb19d1446d4bc506c399c85677dddde9f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/onnx2horizonbin.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. specific decoding method: ctc_greedy_search -""" - -import argparse -import copy -import logging -import os -import sys -import random -import torch -import yaml -import numpy as np - -from torch.utils.data import DataLoader - -from wenet.utils.common import remove_duplicates_and_blank -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import to_numpy -from wenet.bin.export_onnx_bpu import export_encoder, export_ctc - - -try: - import hbdk # noqa: F401 - import horizon_nn # noqa: F401 - from horizon_tc_ui import HB_ONNXRuntime -except ImportError: - print('Please install hbdk,horizon_nn,horizon_tc_ui !') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def save_data(tensor, dirs, prefix): - if tensor.requires_grad: - data = tensor.detach().numpy().astype(np.float32) - else: - data = tensor.numpy().astype(np.float32) - os.makedirs(dirs, exist_ok=True) - data.tofile(dirs + "/" + prefix + ".bin") - - -def make_calibration_data(enc, args, conf): - conf['shuffle'] = True - logger.info(conf) - dataset = Dataset( - "shard", args.cali_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - cal_data_dir = os.path.join(args.output_dir, 
'cal_data_dir') - for batch_idx, batch in enumerate(dataloader): - if batch_idx >= args.max_samples: - break - if batch_idx % 100 == 0: - logger.info("processed {} samples.".format(batch_idx)) - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - - # Feed forward overlap input step by step - random_high = (num_frames - context) // stride - num_rand = random.randint(0, random_high) - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - if i == num_rand: - save_data(chunk, "{}/chunk".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_cache, "{}/att_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(cnn_cache, "{}/cnn_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_mask, "{}/att_mask".format(cal_data_dir), - prefix + "." + str(i)) - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - # NOTE(xcsong): It's fast to calibrate ctc.onnx, - # so it's okay to save all chunks - save_data(y, "{}/hidden".format(cal_data_dir), - prefix + "." 
+ str(i)) - - -def check_wer(enc, ctc, args, conf): - conf['shuffle'] = False - dataset = Dataset( - "shard", args.wer_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - char_dict = {v: k for k, v in args.symbol_table.items()} - eos = len(char_dict) - 1 - - enc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_encoder/encoder_quantized_model.onnx") - ctc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_ctc/ctc_quantized_model.onnx") - torch_file = open(args.output_dir + "/torch_text", 'w', encoding="utf-8") - onnx_file = open(args.output_dir + "/onnx_text", 'w', encoding="utf-8") - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - for batch_idx, batch in enumerate(dataloader): - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - - # Feed forward overlap input step by step - torch_out, onnx_out = [], [] - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - # Torch model - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - torch_out.append(ctc.forward(y).transpose(1, 3).squeeze(2)) - # Quantized onnx model - ort_inputs = { - 'chunk': to_numpy(chunk), 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': to_numpy(att_mask)} - ort_outs = enc_session.run_feature( - enc_session.output_names, ort_inputs, input_offset=0) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_y = ctc_session.run_feature( - ctc_session.output_names, {'hidden': ort_outs[0]}, input_offset=0) - onnx_out.append(torch.from_numpy( - np.squeeze(onnx_y[0].transpose(0, 3, 2, 1), axis=2))) - - def post_process(list_out, file_obj, keys): - probs = torch.cat(list_out, dim=1) - maxlen = probs.size(1) - topk_prob, topk_index = probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - hyps = 
[hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - for i, key in enumerate(keys): - content = '' - for w in hyps[i]: - if w == eos: - break - content += char_dict[w] - file_obj.write('{} {}\n'.format(key, content)) - return key, content - - if len(torch_out) > 0 and len(onnx_out) > 0: - key, content = post_process(torch_out, torch_file, keys) - logger.info('torch: {} {}'.format(key, content)) - key, content = post_process(onnx_out, onnx_file, keys) - logger.info('onnx : {} {}'.format(key, content)) - torch_file.close() - onnx_file.close() - - -def generate_config(enc_session, ctc_session, args): - template = """ -# 模型参数组 -model_parameters: - # 原始Onnx浮点模型文件 - onnx_model: '{}' - # 转换的目标AI芯片架构 - march: 'bernoulli2' - # 模型转换输出的用于上板执行的模型文件的名称前缀 - output_model_file_prefix: '{}' - # 模型转换输出的结果的存放目录 - working_dir: '{}' - # 指定转换后混合异构模型是否保留输出各层的中间结果的能力 - layer_out_dump: False - # 转换过程中日志生成级别 - log_level: 'debug' -# 输入信息参数组 -input_parameters: - # 原始浮点模型的输入节点名称 - input_name: '{}' - # 原始浮点模型的输入数据格式(数量/顺序与input_name一致) - input_type_train: '{}' - # 原始浮点模型的输入数据排布(数量/顺序与input_name一致) - input_layout_train: '{}' - # 原始浮点模型的输入数据尺寸 - input_shape: '{}' - # 网络实际执行时,输入给网络的batch_size 默认值为1 - # input_batch: 1 - # 在模型中添加的输入数据预处理方法 - norm_type: '{}' - # 预处理方法的图像减去的均值; 如果是通道均值,value之间必须用空格分隔 - # mean_value: '' - # 预处理方法的图像缩放比例,如果是通道缩放比例,value之间必须用空格分隔 - # scale_value: '' - # 转换后混合异构模型需要适配的输入数据格式(数量/顺序与input_name一致) - input_type_rt: '{}' - # 输入数据格式的特殊制式 - input_space_and_range: '' - # 转换后混合异构模型需要适配的输入数据排布(数量/顺序与input_name一致) - input_layout_rt: '{}' -# 校准参数组 -calibration_parameters: - # 模型校准使用的标定样本的存放目录 - cal_data_dir: '{}' - # 开启图片校准样本自动处理(skimage read resize到输入节点尺寸) - preprocess_on: False - # 校准使用的算法类型 - calibration_type: '{}' - # max 校准方式的参数 - max_percentile: 1.0 - # 强制指定OP在CPU上运行 - run_on_cpu: '{}' - # 强制指定OP在BPU上运行 - run_on_bpu: '{}' -# 编译参数组 -compiler_parameters: - # 编译策略选择 - compile_mode: 'latency' - # 是否打开编译的debug信息 - debug: False - # 模型运行核心数 - core_num: 1 - # 模型编译的优化等级选择 - optimize_level: 'O3' -""" - output_dir = os.path.realpath(args.output_dir) - cal_data_dir = os.path.join(output_dir, 'cal_data_dir') - os.makedirs(cal_data_dir, exist_ok=True) - enc_dic = enc_session.get_modelmeta().custom_metadata_map - enc_onnx_path = os.path.join(output_dir, 'encoder.onnx') - enc_log_path = os.path.join(output_dir, 'hb_makertbin_output_encoder') - enc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in enc_dic['input_name'].split(';')]) - ctc_dic = ctc_session.get_modelmeta().custom_metadata_map - ctc_onnx_path = os.path.join(output_dir, 'ctc.onnx') - ctc_log_path = os.path.join(output_dir, 'hb_makertbin_output_ctc') - ctc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in ctc_dic['input_name'].split(';')]) - enc_config = template.format( - enc_onnx_path, "encoder", enc_log_path, - enc_dic['input_name'], enc_dic['input_type'], - enc_dic['input_layout_train'], enc_dic['input_shape'], - enc_dic['norm_type'], enc_dic['input_type'], enc_dic['input_layout_rt'], - enc_cal_data, args.calibration_type, args.extra_ops_run_on_cpu, "") - ctc_config = template.format( - ctc_onnx_path, "ctc", ctc_log_path, - ctc_dic['input_name'], ctc_dic['input_type'], - ctc_dic['input_layout_train'], ctc_dic['input_shape'], - ctc_dic['norm_type'], ctc_dic['input_type'], ctc_dic['input_layout_rt'], - ctc_cal_data, "default", "", "") - with open(output_dir + "/config_encoder.yaml", "w") as enc_yaml: - enc_yaml.write(enc_config) - with open(output_dir + 
"/config_ctc.yaml", "w") as ctc_yaml: - ctc_yaml.write(ctc_config) - - -def get_args(): - parser = argparse.ArgumentParser(description='convert onnx to horizon .bin') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - parser.add_argument('--dict', type=str, required=True, help='dict file') - parser.add_argument('--max_samples', type=int, required=True, - help='maximum samples') - parser.add_argument('--cali_datalist', type=str, default=None, - help='make calibration data') - parser.add_argument('--wer_datalist', type=str, default=None, - help='check wer') - parser.add_argument('--wer_text', type=str, default=None, - help='check wer') - parser.add_argument('--bpe_model', default=None, type=str, - help='bpe model for english part') - parser.add_argument('--ln_run_on_bpu', action='store_true', - help='layernorm running on bpu') - parser.add_argument('--extra_ops_run_on_cpu', type=str, default=None, - help='extra operations running on cpu.') - parser.add_argument('--calibration_type', type=str, default='default', - help='kl / max / default.') - return parser - - -if __name__ == '__main__': - random.seed(777) - parser = get_args() - args = parser.parse_args() - # NOTE(xcsong): X3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(conf) - load_checkpoint(model, args.checkpoint) - model.eval() - - symbol_table = read_symbol_table(args.dict) - args.symbol_table = symbol_table - args.feature_size = conf['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - logger.info("Stage-1: Export onnx") - enc, enc_session = export_encoder(model, args) - ctc, ctc_session = export_ctc(model, args) - - conf = copy.deepcopy(conf['dataset_conf']) - conf['filter_conf']['max_length'] = 102400 - conf['filter_conf']['min_length'] = 0 - conf['filter_conf']['token_max_length'] = 102400 - conf['filter_conf']['token_min_length'] = 0 - conf['filter_conf']['max_output_input_ratio'] = 102400 - conf['filter_conf']['min_output_input_ratio'] = 0 - conf['speed_perturb'] = False - conf['spec_aug'] = False - conf['spec_sub'] = False - conf['spec_trim'] = False - conf['shuffle'] = False - conf['sort'] = False - if 'fbank_conf' in conf: - conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in conf: - conf['mfcc_conf']['dither'] = 0.0 - conf['batch_conf']['batch_type'] = "static" - conf['batch_conf']['batch_size'] = 1 - - if args.cali_datalist is not None: - logger.info("Stage-2: Generate config") - # FIXME(xcsong): Remove hard code - logger.info("torch version: {}".format(torch.__version__)) - if int(torch.__version__[:4].split('.')[1]) >= 13: - args.extra_ops_run_on_cpu = "/Split;" + \ - "/encoders.0/self_attn/Split;/encoders.1/self_attn/Split;" + \ - 
"/encoders.2/self_attn/Split;/encoders.3/self_attn/Split;" + \ - "/encoders.4/self_attn/Split;/encoders.5/self_attn/Split;" + \ - "/encoders.6/self_attn/Split;/encoders.7/self_attn/Split;" + \ - "/encoders.8/self_attn/Split;/encoders.9/self_attn/Split;" + \ - "/encoders.10/self_attn/Split;/encoders.11/self_attn/Split;" + \ - "/encoders.0/self_attn/Mul;/encoders.1/self_attn/Mul;" + \ - "/encoders.2/self_attn/Mul;/encoders.3/self_attn/Mul;" + \ - "/encoders.4/self_attn/Mul;/encoders.5/self_attn/Mul;" + \ - "/encoders.6/self_attn/Mul;/encoders.7/self_attn/Mul;" + \ - "/encoders.8/self_attn/Mul;/encoders.9/self_attn/Mul;" + \ - "/encoders.10/self_attn/Mul;/encoders.11/self_attn/Mul;" - else: - args.extra_ops_run_on_cpu = "Split_17;Split_67;Split_209;" + \ - "Split_351;Split_493;Split_635;Split_777;Split_919;Split_1061;" + \ - "Split_1203;Split_1345;Split_1487;Split_1629;" + \ - "Mul_72;Mul_214;Mul_356;Mul_498;Mul_640;Mul_782;" + \ - "Mul_924;Mul_1066;Mul_1208;Mul_1350;Mul_1492;Mul_1634;" - generate_config(enc_session, ctc_session, args) - - logger.info("Stage-3: Make calibration data") - make_calibration_data(enc, args, conf) - - output_dir = os.path.realpath(args.output_dir) - logger.info("Stage-4: Make ctc.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_ctc".format(output_dir) + - " && cd hb_makertbin_log_ctc &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_ctc.yaml") - ) - logger.info("Stage-5: Make encoder.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_encoder ".format(output_dir) + - " && cd hb_makertbin_log_encoder &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_encoder.yaml") - ) - - if args.wer_datalist is not None: - logger.info("Stage-6: Check wer between torch model and quantized onnx") - assert args.wer_text is not None - check_wer(enc, ctc, args, conf) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/torch_text", - args.output_dir + "/torch_wer") - ) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/onnx_text", - args.output_dir + "/onnx_wer") - ) - os.system("tail {} {}".format( - args.output_dir + "/torch_wer", args.output_dir + "/onnx_wer")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/parse_options.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/parse_options.sh deleted file mode 100644 index 34476fdb37a4b14d5fe6e0edbebe97e760d2be5a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- - -# Parse command-line options. -# To be sourced by another script (as in ". parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/perturb_data_dir_speed.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/perturb_data_dir_speed.sh deleted file mode 100644 index 901a4882e6481ae269067b0fe7175dba62c4db9e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/perturb_data_dir_speed.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# 2020 @kamo-naoyuki -# This file was copied from Kaldi and -# I deleted parts related to wav duration -# because we shouldn't use kaldi's command here -# and we don't need the files actually. 
- -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 2014 Tom Ko -# 2018 Emotech LTD (author: Pawel Swietojanski) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# wav.scp -# spk2utt -# utt2spk -# text -# -# It generates the files which are used for perturbing the speed of the original data. - -export LC_ALL=C -set -euo pipefail - -if [[ $# != 3 ]]; then - echo "Usage: perturb_data_dir_speed.sh " - echo "e.g.:" - echo " $0 0.9 data/train_si284 data/train_si284p" - exit 1 -fi - -factor=$1 -srcdir=$2 -destdir=$3 -label="sp" -spk_prefix="${label}${factor}-" -utt_prefix="${label}${factor}-" - -#check is sox on the path - -! command -v sox &>/dev/null && echo "sox: command not found" && exit 1; - -if [[ ! -f ${srcdir}/utt2spk ]]; then - echo "$0: no such file ${srcdir}/utt2spk" - exit 1; -fi - -if [[ ${destdir} == "${srcdir}" ]]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -mkdir -p "${destdir}" - -<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map" -<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map" -<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map" -if [[ ! -f ${srcdir}/utt2uniq ]]; then - <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq" -else - <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq" -fi - - -<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \ - utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk - -utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt - -if [[ -f ${srcdir}/segments ]]; then - - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \ - utils/apply_map.pl -f 2 "${destdir}"/reco_map | \ - awk -v factor="${factor}" \ - '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \ - >"${destdir}"/segments - - utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - if [[ -f ${srcdir}/reco2file_and_channel ]]; then - utils/apply_map.pl -f 1 "${destdir}"/reco_map \ - <"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel - fi - -else # no segments->wav indexed by utterance. 
- if [[ -f ${srcdir}/wav.scp ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - fi -fi - -if [[ -f ${srcdir}/text ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text -fi -if [[ -f ${srcdir}/spk2gender ]]; then - utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender -fi -if [[ -f ${srcdir}/utt2lang ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang -fi - -rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null -echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}" - -utils/validate_data_dir.sh --no-feats --no-text "${destdir}" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/reduce_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/reduce_data_dir.sh deleted file mode 100644 index 16194dcc7309a646041181a698c53cd4f46e618b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/reduce_data_dir.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# koried, 10/29/2012 - -# Reduce a data set based on a list of turn-ids - -help_message="usage: $0 srcdir turnlist destdir" - -if [ $1 == "--help" ]; then - echo "${help_message}" - exit 0; -fi - -if [ $# != 3 ]; then - echo "${help_message}" - exit 1; -fi - -srcdir=$1 -reclist=$2 -destdir=$3 - -if [ ! -f ${srcdir}/utt2spk ]; then -echo "$0: no such file $srcdir/utt2spk" -exit 1; -fi - -function do_filtering { -# assumes the utt2spk and spk2utt files already exist. - [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp - [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text - [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames - [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender - [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp - if [ -f ${srcdir}/segments ]; then - utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments - awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. 
- [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/reco2file_and_channel ] && \ - utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel - - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm - rm ${destdir}/reco - fi - srcutts=$(wc -l < ${srcdir}/utt2spk) - destutts=$(wc -l < ${destdir}/utt2spk) - echo "Reduced #utt from $srcutts to $destutts" -} - -mkdir -p ${destdir} - -# filter the utt2spk based on the set of recordings -utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk - -utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt -do_filtering; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/remove_longshortdata.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/remove_longshortdata.py deleted file mode 100644 index 7e92f8a424d2d717acf6fc1db5503f79ba38a898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/remove_longshortdata.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='remove too long or too short data in format.data') - parser.add_argument('--data_file', - type=str, - help='input format data') - parser.add_argument('--output_data_file', - type=str, - help='output format data') - parser.add_argument('--min_input_len', type=float, - default=0, - help='minimum input seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--max_input_len', type=float, - default=20, - help='maximum output seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--min_output_len', type=float, - default=0, help='minimum input seq length, in modeling units') - parser.add_argument('--max_output_len', type=float, - default=500, - help='maximum output seq length, in modeling units') - parser.add_argument('--min_output_input_ratio', type=float, default=0.05, - help='minimum output seq length/output seq length ratio') - parser.add_argument('--max_output_input_ratio', type=float, default=10, - help='maximum output seq length/output seq length ratio') - args = parser.parse_args() - - data_file = args.data_file - output_data_file = args.output_data_file - min_input_len = args.min_input_len - max_input_len = args.max_input_len - min_output_len = args.min_output_len - max_output_len = args.max_output_len - min_output_input_ratio = args.min_output_input_ratio - max_output_input_ratio = args.max_output_input_ratio - - with open(data_file, 'r') as f, open(output_data_file, 'w') as fout: - for l in f: - l = l.strip() - if l: - items = l.strip().split('\t') - token_shape = items[6] - feature_shape = items[2] - feat_len = float(feature_shape.split(':')[1].split(',')[0]) - token_len = float(token_shape.split(':')[1].split(',')[0]) - condition = [feat_len > min_input_len, - feat_len < max_input_len, - token_len > min_output_len, - token_len < max_output_len, - token_len / feat_len > min_output_input_ratio, - token_len / feat_len < max_output_input_ratio, - ] - if all(condition): - fout.write('{}\n'.format(l)) - continue diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/segment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/segment.py deleted file mode 100644 index a1a7f93a05fbaf42ca09c26c0e5be6a7185f0d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/segment.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2021 Mobvoi Inc. (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='generate segmented wav.scp') - parser.add_argument('--segments', required=True, help='segments file') - parser.add_argument('--input', - required=True, - help='origin wav.scp that not segmented') - parser.add_argument('--output', - required=True, - help='output segmented wav.scp') - wav_dic = {} - args = parser.parse_args() - ori_wav = args.input - segment_file = args.segments - wav_scp = args.output - with open(ori_wav, 'r') as ori: - for l in ori: - item = l.strip().split() - wav_dic[item[0]] = item[1] - with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement: - for l in sgement: - item = l.strip().split() - if item[1] in wav_dic: - item[1] = wav_dic[item[1]] - f.write("{} {},{},{}\n".format(item[0], item[1], item[2], item[3])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/setup_anaconda.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/setup_anaconda.sh deleted file mode 100644 index f53ace9cc4c19994fc79d01e85d70f49d40d673f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/setup_anaconda.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env bash -# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet) -set -euo pipefail - -if [ -z "${PS1:-}" ]; then - PS1=__dummy__ -fi -CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - -if [ $# -gt 4 ]; then - echo "Usage: $0 [output] [conda-env-name] [python-version>]" - exit 1; -elif [ $# -eq 3 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="$3" -elif [ $# -eq 2 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="" -elif [ $# -eq 1 ]; then - output_dir="$1" - name="" - PYTHON_VERSION="" -elif [ $# -eq 0 ]; then - output_dir=venv - name="" - PYTHON_VERSION="" -fi - -if [ -e activate_python.sh ]; then - echo "Warning: activate_python.sh already exists. It will be overwritten" -fi - -if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then - if [ ! -e miniconda.sh ]; then - wget --tries=3 "${CONDA_URL}" -O miniconda.sh - fi - - bash miniconda.sh -b -p "${output_dir}" -fi - -# shellcheck disable=SC1090 -source "${output_dir}/etc/profile.d/conda.sh" -conda deactivate - -# If the env already exists, skip recreation -if [ -n "${name}" ] && ! 
conda activate ${name}; then - conda create -yn "${name}" -fi -conda activate ${name} - -if [ -n "${PYTHON_VERSION}" ]; then - conda install -y conda "python=${PYTHON_VERSION}" -else - conda install -y conda -fi - -conda install -y pip setuptools - -cat << EOF > activate_python.sh -#!/usr/bin/env bash -# THIS FILE IS GENERATED BY tools/setup_anaconda.sh -if [ -z "\${PS1:-}" ]; then - PS1=__dummy__ -fi -. $(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name} -EOF diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/sph2wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/sph2wav.sh deleted file mode 100644 index a8f0749e3be2ee69b5831da6699c303510ecbed4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/sph2wav.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -# convert sph scp to segmented wav scp -nj=1 -. tools/parse_options.sh || exit 1; - -inscp=$1 -segments=$2 -outscp=$3 -data=$(dirname ${inscp}) -if [ $# -eq 4 ]; then - logdir=$4 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe -[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; -sox=`which sox` -[ ! 
-x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2); - printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \ - sort > $data/wav_ori.scp || exit 1; - -tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp -sed -i 's/ /,/g' $data/wav_segments.scp -sed -i 's/#/ /g' $data/wav_segments.scp - -rm -f $logdir/wav_*.slice -rm -f $logdir/*.log -split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - mkdir -p ${data}/wavs/${name} - cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \ - -v logdir=$logdir -v name=$name '{ - during=$4-$3 - cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during; - system(cmd) - printf("%s %s/%s.wav\n", $1, data, $1); - }' | \ - sort > ${data}/wavs_${name}.scp || exit 1; -} & -done -wait -cat ${data}/wavs_*.scp > $outscp -rm ${data}/wavs_*.scp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/spk2utt_to_utt2spk.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/spk2utt_to_utt2spk.pl deleted file mode 100644 index 19fb89d501146e360912863d847d6eabb0194511..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/spm_decode b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/spm_decode deleted file mode 100644 index 882b4f966013d7708460f8d41696583ae59f8fa9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/spm_decode +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for decoding") - parser.add_argument("--input", default=None, help="input file to decode") - parser.add_argument("--input_format", choices=["piece", "id"], default="piece") - args = parser.parse_args() - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.input_format == "piece": - def decode(l): - return "".join(sp.DecodePieces(l)) - elif args.input_format == "id": - def decode(l): - return "".join(sp.DecodeIds(l)) - else: - raise NotImplementedError - - def tok2int(tok): - # remap reference-side (represented as <>) to 0 - return int(tok) if tok != "<>" else 0 - - if args.input is None: - h = sys.stdin - else: - h = open(args.input, "r", encoding="utf-8") - for line in h: - print(decode(line.split())) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/spm_encode b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/spm_encode deleted file mode 100644 index 4dd2e1004f9fe393c2d34b43bade881b84a31b1f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/spm_encode +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import contextlib -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for encoding") - parser.add_argument("--inputs", nargs="+", default=['-'], - help="input files to filter/encode") - parser.add_argument("--outputs", nargs="+", default=['-'], - help="path to save encoded outputs") - parser.add_argument("--output_format", choices=["piece", "id"], default="piece") - parser.add_argument("--min-len", type=int, metavar="N", - help="filter sentence pairs with fewer than N tokens") - parser.add_argument("--max-len", type=int, metavar="N", - help="filter sentence pairs with more than N tokens") - args = parser.parse_args() - - assert len(args.inputs) == len(args.outputs), \ - "number of input and output paths should match" - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.output_format == "piece": - def encode(l): - return sp.EncodeAsPieces(l) - elif args.output_format == "id": - def encode(l): - return list(map(str, sp.EncodeAsIds(l))) - else: - raise NotImplementedError - - if args.min_len is not None or args.max_len is not None: - def valid(line): - return ( - (args.min_len is None or len(line) >= args.min_len) and - (args.max_len is None or len(line) <= args.max_len) - ) - else: - def valid(lines): - return True - - with contextlib.ExitStack() as stack: - inputs = [ - stack.enter_context(open(input, "r", encoding="utf-8")) - if input != "-" else sys.stdin - for input in args.inputs - ] - outputs = [ - 
stack.enter_context(open(output, "w", encoding="utf-8")) - if output != "-" else sys.stdout - for output in args.outputs - ] - - stats = { - "num_empty": 0, - "num_filtered": 0, - } - - def encode_line(line): - line = line.strip() - if len(line) > 0: - line = encode(line) - if valid(line): - return line - else: - stats["num_filtered"] += 1 - else: - stats["num_empty"] += 1 - return None - - for i, lines in enumerate(zip(*inputs), start=1): - enc_lines = list(map(encode_line, lines)) - if not any(enc_line is None for enc_line in enc_lines): - for enc_line, output_h in zip(enc_lines, outputs): - print(" ".join(enc_line), file=output_h) - if i % 10000 == 0: - print("processed {} lines".format(i), file=sys.stderr) - - print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) - print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/spm_train b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/spm_train deleted file mode 100644 index 0b247aee0dc5fcaa7b6cf66d89602e896619c9bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/spm_train +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE -import sys - -import sentencepiece as spm - - -if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/subset_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/subset_data_dir.sh deleted file mode 100644 index c35bee62d8710facb8c42a9171ed3caf0171450f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/subset_data_dir.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2010-2011 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - - -# This script operates on a data directory, such as in data/train/. -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data -# for what these directories contain. - -# This script creates a subset of that data, consisting of some specified -# number of utterances. (The selected utterances are distributed evenly -# throughout the file, by the program ./subset_scp.pl). - -# There are six options, none compatible with any other. - -# If you give the --per-spk option, it will attempt to select the supplied -# number of utterances for each speaker (typically you would supply a much -# smaller number in this case). - -# If you give the --speakers option, it selects a subset of n randomly -# selected speakers. - -# If you give the --shortest option, it will give you the n shortest utterances. - -# If you give the --first option, it will just give you the n first utterances. - -# If you give the --last option, it will just give you the n last utterances. - -# If you give the --spk-list or --utt-list option, it reads the -# speakers/utterances to keep from /" (note, -# in this case there is no positional parameter; see usage message.) 
- - -shortest=false -perspk=false -speakers=false -first_opt= -spk_list= -utt_list= - -expect_args=3 -case $1 in - --first|--last) first_opt=$1; shift ;; - --per-spk) perspk=true; shift ;; - --shortest) shortest=true; shift ;; - --speakers) speakers=true; shift ;; - --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; - --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; - --*) echo "$0: invalid option '$1'"; exit 1 -esac - -if [ $# != $expect_args ]; then - echo "Usage:" - echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] " - echo " subset_data_dir.sh [--spk-list ] " - echo " subset_data_dir.sh [--utt-list ] " - echo "By default, randomly selects utterances from the data directory." - echo "With --speakers, randomly selects enough speakers that we have utterances" - echo "With --per-spk, selects utterances per speaker, if available." - echo "With --first, selects the first utterances" - echo "With --last, selects the last utterances" - echo "With --shortest, selects the shortest utterances." - echo "With --spk-list, reads the speakers to keep from " - echo "With --utt-list, reads the utterances to keep from " - exit 1; -fi - -srcdir=$1 -if [[ $spk_list || $utt_list ]]; then - numutt= - destdir=$2 -else - numutt=$2 - destdir=$3 -fi - -export LC_ALL=C - -if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" - exit 1 -fi - -if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then - echo "$0: cannot subset to more utterances than you originally had." - exit 1 -fi - -if $shortest && [ ! -f $srcdir/feats.scp ]; then - echo "$0: you selected --shortest but no feats.scp exist." - exit 1 -fi - -mkdir -p $destdir || exit 1 - -if [[ $spk_list ]]; then - tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; -elif [[ $utt_list ]]; then - tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; -elif $speakers; then - tools/shuffle_list.pl < $srcdir/spk2utt | - awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | - sort > $destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -elif $perspk; then - awk '{ n='$numutt'; printf("%s ",$1); - skip=1; while(n*(skip+1) <= NF-1) { skip++; } - for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); } - printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -else - if $shortest; then - # Select $numutt shortest utterances. - . ./path.sh - feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; - sort -n -k2 $destdir/tmp.len | - awk '{print $1}' | - head -$numutt >$destdir/tmp.uttlist - tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk - rm $destdir/tmp.uttlist $destdir/tmp.len - else - # Select $numutt random utterances. - tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; - fi - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt -fi - -# Perform filtering. utt2spk and spk2utt files already exist by this point. -# Filter by utterance. 
-[ -f $srcdir/feats.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp -[ -f $srcdir/vad.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp -[ -f $srcdir/utt2lang ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang -[ -f $srcdir/utt2dur ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur -[ -f $srcdir/utt2num_frames ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames -[ -f $srcdir/utt2uniq ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq -[ -f $srcdir/wav.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp -[ -f $srcdir/utt2warp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp -[ -f $srcdir/text ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - -# Filter by speaker. -[ -f $srcdir/spk2warp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp -[ -f $srcdir/spk2gender ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender -[ -f $srcdir/cmvn.scp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - -# Filter by recording-id. -if [ -f $srcdir/segments ]; then - tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - # Recording-ids are in segments. - awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco - # The next line overrides the command above for wav.scp, which would be incorrect. - #[ -f $srcdir/wav.scp ] && - # tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp -else - # No segments; recording-ids are in wav.scp. - awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco -fi - -[ -f $srcdir/reco2file_and_channel ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel -[ -f $srcdir/reco2dur ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur - -# Filter the STM file for proper sclite scoring. -# Copy over the comments from STM file. -[ -f $srcdir/stm ] && - (grep "^;;" $srcdir/stm - tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm - -rm $destdir/reco - -# Copy frame_shift if present. -[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir - -srcutts=$(wc -l <$srcdir/utt2spk) -destutts=$(wc -l <$destdir/utt2spk) -echo "$0: reducing #utt from $srcutts to $destutts" -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/subset_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/subset_scp.pl deleted file mode 100644 index 11fddc09a0f4e5fad8e5d63cf65e7e5e627e4af6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/subset_scp.pl +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This program selects a subset of N elements in the scp. - -# By default, it selects them evenly from throughout the scp, in order to avoid -# selecting too many from the same speaker. It prints them on the standard -# output. -# With the option --first, it just selects the N first utterances. -# With the option --last, it just selects the N last utterances. - -# Last modified by JHU & HKUST @2013 - - -$quiet = 0; -$first = 0; -$last = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--quiet") { - shift; - $quiet = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--first") { - shift; - $first = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--last") { - shift; - $last = 1; -} - -if(@ARGV < 2 ) { - die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . - " --quiet causes it to not die if N < num lines in scp.\n" . - " --first and --last make it equivalent to head or tail.\n" . - "See also: filter_scp.pl\n"; -} - -$N = shift @ARGV; -if($N == 0) { - die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; -} -$inscp = shift @ARGV; -open(I, "<$inscp") || die "Opening input scp file $inscp"; - -@F = (); -while() { - push @F, $_; -} -$numlines = @F; -if($N > $numlines) { - if ($quiet) { - $N = $numlines; - } else { - die "You requested from subset_scp.pl more elements than available: $N > $numlines"; - } -} - -sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if ($num_needed > $diff) { - die "select_n: code error"; - } - if ($diff == 1 ) { - if ($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); - } -} - -if ( ! $first && ! $last) { - if ($N > 0) { - select_n(0, $numlines, $N); - } -} else { - if ($first) { # --first option: same as head. - for ($n = 0; $n < $N; $n++) { - print $F[$n]; - } - } else { # --last option: same as tail. - for ($n = @F - $N; $n < @F; $n++) { - print $F[$n]; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/sym2int.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/sym2int.pl deleted file mode 100644 index cec097b6bdaefb5c3452e31fa334f0a7530b9a72..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/sym2int.pl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; - -for($x = 0; $x < 2; $x++) { - if ($ARGV[0] eq "--map-oov") { - shift @ARGV; - $map_oov = shift @ARGV; - if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { - # disallow '-f', the empty string and anything ending in words.txt as the - # OOV symbol because these are likely command-line errors. - die "the --map-oov option requires an argument"; - } - } - if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } - } -} - -$symtab = shift @ARGV; -if (!defined $symtab) { - print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . - "options: [--map-oov ] [-f ]\n" . - "note: can look like 4-5, or 4-, or 5-, or 1.\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up - if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } - $map_oov = $sym2int{$map_oov}; -} - -$num_warning = 0; -$max_warning = 20; - -while (<>) { - @A = split(" ", $_); - @B = (); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ( (!defined $field_begin || $n >= $field_begin) - && (!defined $field_end || $n <= $field_end)) { - $i = $sym2int{$a}; - if (!defined ($i)) { - if (defined $map_oov) { - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $map_oov; - } else { - $pos = $n+1; - die "sym2int.pl: undefined symbol $a (in position $pos)\n"; - } - } - $a = $i; - } - push @B, $a; - } - print join(" ", @B); - print "\n"; -} -if ($num_warning > 0) { - print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; -} - -exit(0); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/text2token.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/text2token.py deleted file mode 100644 index 4f4dcc901d436650695f0b80e0cf99e1e99269ee..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/text2token.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan) -# Copyright 2021 Mobvoi Inc. All Rights Reserved. 
(Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -import re -import sys - -is_python2 = sys.version_info[0] == 2 - - -def exist_or_not(i, match_pos): - start_pos = None - end_pos = None - for pos in match_pos: - if pos[0] <= i < pos[1]: - start_pos = pos[0] - end_pos = pos[1] - break - - return start_pos, end_pos - -def seg_char(sent): - pattern = re.compile(r'([\u4e00-\u9fa5])') - chars = pattern.split(sent) - chars = [w for w in chars if len(w.strip()) > 0] - return chars - -def get_parser(): - parser = argparse.ArgumentParser( - description='convert raw text to tokenized text', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--nchar', - '-n', - default=1, - type=int, - help='number of characters to split, i.e., \ - aabb -> a a b b with -n 1 and aa bb with -n 2') - parser.add_argument('--skip-ncols', - '-s', - default=0, - type=int, - help='skip first n columns') - parser.add_argument('--space', - default='', - type=str, - help='space symbol') - parser.add_argument('--bpe-model', - '-m', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--non-lang-syms', - '-l', - default=None, - type=str, - help='list of non-linguistic symobles,' - ' e.g., etc.') - parser.add_argument('text', - type=str, - default=False, - nargs='?', - help='input text') - parser.add_argument('--trans_type', - '-t', - type=str, - default="char", - choices=["char", "phn", "cn_char_en_bpe"], - help="""Transcript type. char/phn. e.g., for TIMIT - FADG0_SI1279 - - If trans_type is char, read from - SI1279.WRD file -> "bricks are an alternative" - Else if trans_type is phn, - read from SI1279.PHN file -> - "sil b r ih sil k s aa r er n aa l - sil t er n ih sil t ih v sil" """) - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - rs = [] - if args.non_lang_syms is not None: - with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f: - nls = [x.rstrip() for x in f.readlines()] - rs = [re.compile(re.escape(x)) for x in nls] - - if args.bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(args.bpe_model) - - if args.text: - f = codecs.open(args.text, encoding="utf-8") - else: - f = codecs.getreader("utf-8")( - sys.stdin if is_python2 else sys.stdin.buffer) - - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer) - line = f.readline() - n = args.nchar - while line: - x = line.split() - print(' '.join(x[:args.skip_ncols]), end=" ") - a = ' '.join(x[args.skip_ncols:]) - - # get all matched positions - match_pos = [] - for r in rs: - i = 0 - while i >= 0: - m = r.search(a, i) - if m: - match_pos.append([m.start(), m.end()]) - i = m.end() - else: - break - - if len(match_pos) > 0: - chars = [] - i = 0 - while i < len(a): - start_pos, end_pos = exist_or_not(i, match_pos) - if start_pos is not None: - chars.append(a[start_pos:end_pos]) - i = end_pos - else: - chars.append(a[i]) - i += 1 - a = chars - - if (args.trans_type == "phn"): - a = a.split(" ") - elif args.trans_type == "cn_char_en_bpe": - b = seg_char(a) - a = [] - for j in b: - # we use "▁" to instead of blanks among english words - # warning: here is "▁", not "_" - for l in j.strip().split("▁"): - if not l.encode('UTF-8').isalpha(): - a.append(l) - else: - for k in sp.encode_as_pieces(l): - a.append(k) - else: - a = [a[j:j + n] for j in range(0, 
len(a), n)] - - a_flat = [] - for z in a: - a_flat.append("".join(z)) - - a_chars = [z.replace(' ', args.space) for z in a_flat] - if (args.trans_type == "phn"): - a_chars = [z.replace("sil", args.space) for z in a_chars] - print(' '.join(a_chars)) - line = f.readline() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/utt2spk_to_spk2utt.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/utt2spk_to_spk2utt.pl deleted file mode 100644 index 5086699ff85fdcb8667bb9ab054700c53e35fd0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. - -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/validate_data_dir.sh deleted file mode 100644 index f4b4cbe1410111555d56380078e3d55381e7155a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/validate_data_dir.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/bin/bash - -cmd="$@" - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." - echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! 
-d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -if [ -f $data/images.scp ]; then - cmd=${cmd/--no-wav/} # remove --no-wav if supplied - image/validate_data_dir.sh $cmd - exit $? -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1; - ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - tools/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." 
- exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! 
cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! 
cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/validate_dict_dir.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/validate_dict_dir.pl deleted file mode 100644 index 819fca7f03caff91f3f24f0b69876a0bfc0abbe9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/validate_dict_dir.pl +++ /dev/null @@ -1,531 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. -# Copyright 2012 Guoguo Chen -# 2015 Daniel Povey -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for 'dict' directories (e.g. data/local/dict) - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line($.) 
$raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n"; - if ($has_invalid_whitespaces) { - print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n"; - return 0; - } else { - print "--> text contains only allowed whitespaces\n"; - } - } else { - print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n"; - } - return 1; -} - - -if(@ARGV != 1) { - die "Usage: validate_dict_dir.pl \n" . - "e.g.: validate_dict_dir.pl data/local/dict\n"; -} - -$dict = shift @ARGV; -$dict =~ s:/$::; - -$exit = 0; -$success = 1; # this is re-set each time we read a file. - -sub set_to_fail { $exit = 1; $success = 0; } - -# Checking silence_phones.txt ------------------------------- -print "Checking $dict/silence_phones.txt ...\n"; -if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} -$idx = 1; -%silence = (); -$crlf = 1; - -print "--> reading $dict/silence_phones.txt\n"; -check_allowed_whitespace(\*S) || set_to_fail(); -while() { - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; - } - foreach(0 .. 
@col-1) { - my $p = $col[$_]; - if($silence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; - } else { - $silence{$p} = 1; - } - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. - if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(S); -$success == 0 || print "--> $dict/silence_phones.txt is OK\n"; -print "\n"; - -# Checking optional_silence.txt ------------------------------- -print "Checking $dict/optional_silence.txt ...\n"; -if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;} -if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;} -$idx = 1; -$success = 1; -$crlf = 1; -print "--> reading $dict/optional_silence.txt\n"; -check_allowed_whitespace(\*OS) or exit 1; -while() { - chomp; - my @col = split(" ", $_); - if ($idx > 1 or @col > 1) { - set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; - } elsif (!$silence{$col[0]}) { - set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - $idx ++; -} -close(OS); -$success == 0 || print "--> $dict/optional_silence.txt is OK\n"; -print "\n"; - -# Checking nonsilence_phones.txt ------------------------------- -print "Checking $dict/nonsilence_phones.txt ...\n"; -if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;} -$idx = 1; -%nonsilence = (); -$success = 1; -$crlf = 1; -print "--> reading $dict/nonsilence_phones.txt\n"; -check_allowed_whitespace(\*NS) or set_to_fail(); -while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($nonsilence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; - } else { - $nonsilence{$p} = 1; - } - # phones that start with the pound sign/hash may be mistaken for - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. 
- if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(NS); -$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n"; -print "\n"; - -# Checking disjoint ------------------------------- -sub intersect { - my ($a, $b) = @_; - @itset = (); - %itset = (); - foreach(keys %$a) { - if(exists $b->{$_} and !$itset{$_}) { - push(@itset, $_); - $itset{$_} = 1; - } - } - return @itset; -} - -print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n"; -@itset = intersect(\%silence, \%nonsilence); -if(@itset == 0) {print "--> disjoint property is OK.\n";} -else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";} -print "\n"; - - -sub check_lexicon { - my ($lex, $num_prob_cols, $num_skipped_cols) = @_; - print "Checking $lex\n"; - !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail(); - my %seen_line = {}; - $idx = 1; $success = 1; $crlf = 1; - print "--> reading $lex\n"; - check_allowed_whitespace(\*L) or set_to_fail(); - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $lex contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (defined $seen_line{$_}) { - print "--> ERROR: line '$_' of $lex is repeated\n"; - set_to_fail(); - } - $seen_line{$_} = 1; - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $lex does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - $word = shift @col; - if (!defined $word) { - print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail(); - } - if ($word eq "" || $word eq "" || $word eq "" || $word eq "#0") { - print "--> ERROR: lexicon.txt contains forbidden word $word\n"; - set_to_fail(); - } - for ($n = 0; $n < $num_prob_cols; $n++) { - $prob = shift @col; - if (!($prob > 0.0 && $prob <= 1.0)) { - print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n"; - set_to_fail(); - } - } - for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; } - if (@col == 0) { - print "--> ERROR: lexicon.txt contains word $word with empty "; - print "pronunciation.\n"; - set_to_fail(); - } - foreach (0 .. @col-1) { - if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt "; - print "(line $idx)\n"; - set_to_fail(); - } - } - $idx ++; - } - close(L); - $success == 0 || print "--> $lex is OK\n"; - print "\n"; -} - -if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); } -if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); } -if (-f "$dict/lexiconp_silprob.txt") { - # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also - # exist. 
- check_lexicon("$dict/lexiconp_silprob.txt", 2, 2); - if (-f "$dict/silprob.txt") { - !open(SP, "<$dict/silprob.txt") && - print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail(); - $crlf = 1; - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - chomp; my @col = split; - @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail(); - if ($col[0] eq "" || $col[0] eq "overall") { - if (!($col[1] > 0.0 && $col[1] <= 1.0)) { - set_to_fail(); - print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n"; - } - } elsif ($col[0] eq "_s" || $col[0] eq "_n") { - if ($col[1] <= 0.0) { - set_to_fail(); - print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n"; - } - } else { - print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n"; - set_to_fail(); - } - } - close(SP); - } else { - set_to_fail(); - print "--> ERROR: expecting $dict/silprob.txt to exist\n"; - } -} - -if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) { - print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n"; - set_to_fail(); -} - -sub check_lexicon_pair { - my ($lex1, $num_prob_cols1, $num_skipped_cols1, - $lex2, $num_prob_cols2, $num_skipped_cols2) = @_; - # We have checked individual lexicons already. - open(L1, "<$lex1"); open(L2, "<$lex2"); - print "Checking lexicon pair $lex1 and $lex2\n"; - my $line_num = 0; - while() { - $line_num++; - @A = split; - $line_B = ; - if (!defined $line_B) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); last; - } - @B = split(" ", $line_B); - # Check if the word matches. - if ($A[0] ne $B[0]) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - shift @A; shift @B; - for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; } - for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; } - # Check if the pronunciation matches - if (join(" ", @A) ne join(" ", @B)) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - } - $line_B = ; - if (defined $line_B && $exit == 0) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); - } - $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n"; -} - -# If more than one lexicon exist, we have to check if they correspond to each -# other. It could be that the user overwrote one and we need to regenerate the -# other, but we do not know which is which. -if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") { - check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0); -} -if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") { - check_lexicon_pair("$dict/lexiconp.txt", 1, 0, - "$dict/lexiconp_silprob.txt", 2, 2); -} - -# Checking extra_questions.txt ------------------------------- -%distinguished = (); # Keep track of all phone-pairs including nonsilence that - # are distinguished (split apart) by extra_questions.txt, - # as $distinguished{$p1,$p2} = 1. This will be used to - # make sure that we don't have pairs of phones on the same - # line in nonsilence_phones.txt that can never be - # distinguished from each other by questions. 
(If any two - # phones appear on the same line in nonsilence_phones.txt, - # they share a tree root, and since the automatic - # question-building treats all phones that appear on the - # same line of nonsilence_phones.txt as being in the same - # group, we can never distinguish them without resorting to - # questions in extra_questions.txt. -print "Checking $dict/extra_questions.txt ...\n"; -if (-s "$dict/extra_questions.txt") { - if (!open(EX, "<$dict/extra_questions.txt")) { - set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n"; - } - $idx = 1; - $success = 1; - $crlf = 1; - print "--> reading $dict/extra_questions.txt\n"; - check_allowed_whitespace(\*EX) or set_to_fail(); - while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n"; - } - foreach (0 .. @col-1) { - if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n"; - } - $idx ++; - } - %col_hash = (); - foreach $p (@col) { $col_hash{$p} = 1; } - foreach $p1 (@col) { - # Update %distinguished hash. - foreach $p2 (keys %nonsilence) { - if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not - # in this question (and in nonsilence - # phones)... mark p1,p2 as being split apart - $distinguished{$p1,$p2} = 1; - $distinguished{$p2,$p1} = 1; - } - } - } - } - close(EX); - $success == 0 || print "--> $dict/extra_questions.txt is OK\n"; -} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";} - -if (-f "$dict/nonterminals.txt") { - open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt"; - my %nonterminals = (); - my $line_number = 1; - while () { - chop; - my @line = split(" ", $_); - if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) { - print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1; - } - $nonterminals{$line[0]} = 1; - $line_number++; - } - print "--> $dict/nonterminals.txt is OK\n"; -} - - -# check nonsilence_phones.txt again for phone-pairs that are never -# distnguishable. (note: this situation is normal and expected for silence -# phones, so we don't check it.) -if(!open(NS, "<$dict/nonsilence_phones.txt")) { - print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1; -} - -$num_warn_nosplit = 0; -$num_warn_nosplit_limit = 10; -while() { - my @col = split(" ", $_); - foreach $p1 (@col) { - foreach $p2 (@col) { - if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) { - set_to_fail(); - if ($num_warn_nosplit <= $num_warn_nosplit_limit) { - print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n"; - } - if ($num_warn_nosplit == $num_warn_nosplit_limit) { - print "... Not warning any more times about this issue.\n"; - } - if ($num_warn_nosplit == 0) { - print " (note: we started checking for this only recently. 
You can still build a system but\n"; - print " phones $p1 and $p2 will be acoustically indistinguishable).\n"; - } - $num_warn_nosplit++; - } - } - } -} - - -if ($exit == 1) { - print "--> ERROR validating dictionary directory $dict (see detailed error "; - print "messages above)\n\n"; - exit 1; -} else { - print "--> SUCCESS [validating dictionary directory $dict]\n\n"; -} - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/validate_text.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/validate_text.pl deleted file mode 100644 index 7f75cf12f20f6e22948682e8e726e628a72dac69..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/validate_text.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env perl -# -#=============================================================================== -# Copyright 2017 Johns Hopkins University (author: Yenda Trmal ) -# Johns Hopkins University (author: Daniel Povey) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# validation script for data//text -# to be called (preferably) from utils/validate_data_dir.sh -use strict; -use warnings; -use utf8; -use Fcntl qw< SEEK_SET >; - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). 
-sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl \n" . 
- "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/wav2dur.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/wav2dur.py deleted file mode 100644 index 1bcc1b693458b66c0e341e5d6b375cc81e6db8b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/wav2dur.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -import torchaudio -torchaudio.set_audio_backend("sox_io") - -scp = sys.argv[1] -dur_scp = sys.argv[2] - -with open(scp, 'r') as f, open(dur_scp, 'w') as fout: - cnt = 0 - total_duration = 0 - for l in f: - items = l.strip().split() - wav_id = items[0] - fname = items[1] - cnt += 1 - waveform, rate = torchaudio.load(fname) - frames = len(waveform[0]) - duration = frames / float(rate) - total_duration += duration - fout.write('{} {}\n'.format(wav_id, duration)) - print('process {} utts'.format(cnt)) - print('total {} s'.format(total_duration)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/wav_to_duration.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/wav_to_duration.sh deleted file mode 100644 index 51b055c633ac809b6b8d702925dc47875973403d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/wav_to_duration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# split the wav scp, calculate duration and merge -nj=4 -. tools/parse_options.sh || exit 1; - -inscp=$1 -outscp=$2 -data=$(dirname ${inscp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -rm -f $logdir/wav_*.slice -rm -f $logdir/wav_*.shape -split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log -} & -done -wait -cat $logdir/wav_*.shape > $outscp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/websocket/performance-ws.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/websocket/performance-ws.py deleted file mode 100644 index af77dea06bb41297b674b5b6dbfd0266bcff5d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/tools/websocket/performance-ws.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -# coding:utf-8 - -# Copyright (c) 2022 SDCI Co. Ltd (author: veelion) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import json -import time -import asyncio -import argparse -import websockets -import soundfile as sf -import statistics - - -WS_START = json.dumps({ - 'signal': 'start', - 'nbest': 1, - 'continuous_decoding': False, -}) -WS_END = json.dumps({ - 'signal': 'end' -}) - - -async def ws_rec(data, ws_uri): - begin = time.time() - conn = await websockets.connect(ws_uri, ping_timeout=200) - # step 1: send start - await conn.send(WS_START) - ret = await conn.recv() - # step 2: send audio data - await conn.send(data) - # step 3: send end - await conn.send(WS_END) - # step 4: receive result - texts = [] - while 1: - ret = await conn.recv() - ret = json.loads(ret) - if ret['type'] == 'final_result': - nbest = json.loads(ret['nbest']) - text = nbest[0]['sentence'] - texts.append(text) - elif ret['type'] == 'speech_end': - break - # step 5: close - try: - await conn.close() - except Exception as e: - # this except has no effect, just log as debug - # it seems the server does not send close info, maybe - print(e) - time_cost = time.time() - begin - return { - 'text': ''.join(texts), - 'time': time_cost, - } - - -def get_args(): - parser = argparse.ArgumentParser(description='') - parser.add_argument( - '-u', '--ws_uri', required=True, - help="websocket_server_main's uri, e.g. ws://127.0.0.1:10086") - parser.add_argument( - '-w', '--wav_scp', required=True, - help='path to wav_scp_file') - parser.add_argument( - '-t', '--trans', required=True, - help='path to trans_text_file of wavs') - parser.add_argument( - '-s', '--save_to', required=True, - help='path to save transcription') - parser.add_argument( - '-n', '--num_concurrence', type=int, required=True, - help='num of concurrence for query') - args = parser.parse_args() - return args - - -def print_result(info): - length = max([len(k) for k in info]) - for k, v in info.items(): - print(f'\t{k: >{length}} : {v}') - - -async def main(args): - wav_scp = [] - total_duration = 0 - with open(args.wav_scp) as f: - for line in f: - zz = line.strip().split() - assert len(zz) == 2 - data, sr = sf.read(zz[1], dtype='int16') - assert sr == 16000 - duration = (len(data)) / 16000 - total_duration += duration - wav_scp.append((zz[0], data.tobytes())) - print(f'{len(wav_scp) = }, {total_duration = }') - - tasks = [] - failed = 0 - texts = [] - request_times = [] - begin = time.time() - for i, (_uttid, data) in enumerate(wav_scp): - task = asyncio.create_task(ws_rec(data, args.ws_uri)) - tasks.append((_uttid, task)) - if len(tasks) < args.num_concurrence: - continue - print((f'{i=}, start {args.num_concurrence} ' - f'queries @ {time.strftime("%m-%d %H:%M:%S")}')) - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - tasks = [] - print(f'\tdone @ {time.strftime("%m-%d %H:%M:%S")}') - if tasks: - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - request_time = time.time() - begin - rtf = request_time / total_duration - print('For all concurrence:') - print_result({ - 'failed': failed, - 'total_duration': total_duration, - 'request_time': request_time, - 'RTF': rtf, - }) - print('For one request:') - print_result({ - 'mean': statistics.mean(request_times), - 'median': statistics.median(request_times), - 'max_time': max(request_times), - 'min_time': min(request_times), - }) - with 
open(args.save_to, 'w', encoding='utf8') as fsave: - fsave.write(''.join(texts)) - # caculate CER - cmd = (f'python ../compute-wer.py --char=1 --v=1 ' - f'{args.trans} {args.save_to} > ' - f'{args.save_to}-test-{args.num_concurrence}.cer.txt') - print(cmd) - os.system(cmd) - print('done') - - -if __name__ == '__main__': - args = get_args() - asyncio.run(main(args)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/alignment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/alignment.py deleted file mode 100644 index 071691183e5af227e60fe06e4f8d4bf0f33b7f71..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/alignment.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Di Wu) -# 2022 Tinnove Inc (authors: Wei Ren) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader -from textgrid import TextGrid, IntervalTier - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.ctc_util import forced_align -from wenet.utils.common import get_subsample -from wenet.utils.init_model import init_model - - -def generator_textgrid(maxtime, lines, output): - # Download Praat: https://www.fon.hum.uva.nl/praat/ - interval = maxtime / (len(lines) + 1) - margin = 0.0001 - - tg = TextGrid(maxTime=maxtime) - linetier = IntervalTier(name="line", maxTime=maxtime) - - i = 0 - for l in lines: - s, e, w = l.split() - linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w) - - tg.append(linetier) - print("successfully generator {}".format(output)) - tg.write(output) - - -def get_frames_timestamp(alignment): - # convert alignment to a praat format, which is a doing phonetics - # by computer and helps analyzing alignment - timestamp = [] - # get frames level duration for each token - start = 0 - end = 0 - while end < len(alignment): - while end < len(alignment) and alignment[end] == 0: - end += 1 - if end == len(alignment): - timestamp[-1] += alignment[start:] - break - end += 1 - while end < len(alignment) and alignment[end - 1] == alignment[end]: - end += 1 - timestamp.append(alignment[start:end]) - start = end - return timestamp - - -def get_labformat(timestamp, subsample): - begin = 0 - duration = 0 - labformat = [] - for idx, t in enumerate(timestamp): - # 25ms frame_length,10ms hop_length, 1/subsample - subsample = get_subsample(configs) - # time duration - duration = len(t) * 0.01 * subsample - if idx < len(timestamp) - 1: - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[t[-1]])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[t[-1]])) - else: - 
non_blank = 0 - for i in t: - if i != 0: - token = i - break - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[token])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[token])) - begin = begin + duration - return labformat - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='use ctc to generate alignment') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--input_file', required=True, help='format data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--non_lang_syms', - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--result_file', - required=True, - help='alignment result file') - parser.add_argument('--batch_size', type=int, default=1, help='batch size') - parser.add_argument('--gen_praat', - action='store_true', - help='convert alignment to a praat format') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - - args = parser.parse_args() - print(args) - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.batch_size > 1: - logging.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - # Load dict - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 - - symbol_table = read_symbol_table(args.dict) - - # Init dataset and data loader - ali_conf = copy.deepcopy(configs['dataset_conf']) - - ali_conf['filter_conf']['max_length'] = 102400 - ali_conf['filter_conf']['min_length'] = 0 - ali_conf['filter_conf']['token_max_length'] = 102400 - ali_conf['filter_conf']['token_min_length'] = 0 - ali_conf['filter_conf']['max_output_input_ratio'] = 102400 - ali_conf['filter_conf']['min_output_input_ratio'] = 0 - ali_conf['speed_perturb'] = False - ali_conf['spec_aug'] = False - ali_conf['shuffle'] = False - ali_conf['sort'] = False - ali_conf['fbank_conf']['dither'] = 0.0 - ali_conf['batch_conf']['batch_type'] = "static" - ali_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - ali_dataset = Dataset(args.data_type, - args.input_file, - symbol_table, - ali_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w', - encoding='utf-8') as fout: - for batch_idx, batch in enumerate(ali_data_loader): - print("#" * 80) - key, feat, target, feats_length, target_length = batch - print(key) - - feat = feat.to(device) - target = target.to(device) - feats_length = 
feats_length.to(device) - target_length = target_length.to(device) - # Let's assume B = batch_size and N = beam_size - # 1. Encoder - encoder_out, encoder_mask = model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - # print(ctc_probs.size(1)) - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = forced_align(ctc_probs, target) - print(alignment) - fout.write('{} {}\n'.format(key[0], alignment)) - - if args.gen_praat: - timestamp = get_frames_timestamp(alignment) - print(timestamp) - subsample = get_subsample(configs) - labformat = get_labformat(timestamp, subsample) - - lab_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".lab") - with open(lab_path, 'w', encoding='utf-8') as f: - f.writelines(labformat) - - textgrid_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".TextGrid") - generator_textgrid(maxtime=(len(alignment) + 1) * 0.01 * - subsample, - lines=labformat, - output=textgrid_path) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/average_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/average_model.py deleted file mode 100644 index 01efa64b4b458bc931a86a9a304b9f330ce4aaa2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/average_model.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-
-import os
-import argparse
-import glob
-
-import yaml
-import numpy as np
-import torch
-
-
-def get_args():
-    parser = argparse.ArgumentParser(description='average model')
-    parser.add_argument('--dst_model', required=True, help='averaged model')
-    parser.add_argument('--src_path',
-                        required=True,
-                        help='src model path for average')
-    parser.add_argument('--val_best',
-                        action="store_true",
-                        help='averaged model')
-    parser.add_argument('--num',
-                        default=5,
-                        type=int,
-                        help='nums for averaged model')
-    parser.add_argument('--min_epoch',
-                        default=0,
-                        type=int,
-                        help='min epoch used for averaging model')
-    parser.add_argument('--max_epoch',
-                        default=65536,
-                        type=int,
-                        help='max epoch used for averaging model')
-
-    args = parser.parse_args()
-    print(args)
-    return args
-
-
-def main():
-    args = get_args()
-    checkpoints = []
-    val_scores = []
-    if args.val_best:
-        yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path))
-        for y in yamls:
-            with open(y, 'r') as f:
-                dic_yaml = yaml.load(f, Loader=yaml.FullLoader)
-                loss = dic_yaml['cv_loss']
-                epoch = dic_yaml['epoch']
-                if epoch >= args.min_epoch and epoch <= args.max_epoch:
-                    val_scores += [[epoch, loss]]
-        val_scores = np.array(val_scores)
-        sort_idx = np.argsort(val_scores[:, -1])
-        sorted_val_scores = val_scores[sort_idx][::1]
-        print("best val scores = " + str(sorted_val_scores[:args.num, 1]))
-        print("selected epochs = " +
-              str(sorted_val_scores[:args.num, 0].astype(np.int64)))
-        path_list = [
-            args.src_path + '/{}.pt'.format(int(epoch))
-            for epoch in sorted_val_scores[:args.num, 0]
-        ]
-    else:
-        path_list = glob.glob('{}/[0-9]*.pt'.format(args.src_path))
-        path_list = sorted(path_list, key=os.path.getmtime)
-        path_list = path_list[-args.num:]
-    print(path_list)
-    avg = None
-    num = args.num
-    assert num == len(path_list)
-    for path in path_list:
-        print('Processing {}'.format(path))
-        states = torch.load(path, map_location=torch.device('cpu'))
-        if avg is None:
-            avg = states
-        else:
-            for k in avg.keys():
-                avg[k] += states[k]
-    # average
-    for k in avg.keys():
-        if avg[k] is not None:
-            # pytorch 1.6 use true_divide instead of /=
-            avg[k] = torch.true_divide(avg[k], num)
-    print('Saving to {}'.format(args.dst_model))
-    torch.save(avg, args.dst_model)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/export_jit.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/export_jit.py
deleted file mode 100644
index b2e5864e8382235c1cc800484ba5031ae22f3bd9..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/export_jit.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import argparse
-import os
-
-import torch
-import yaml
-
-from wenet.utils.checkpoint import load_checkpoint
-from wenet.utils.init_model import init_model
-
-
-def get_args():
-    parser = argparse.ArgumentParser(description='export your script model')
-    parser.add_argument('--config', required=True, help='config file')
-    parser.add_argument('--checkpoint', required=True, help='checkpoint model')
-    parser.add_argument('--output_file', default=None, help='output file')
-    parser.add_argument('--output_quant_file',
-                        default=None,
-                        help='output quantized model file')
-    args = parser.parse_args()
-    return args
-
-
-def main():
-    args = get_args()
-    # No need gpu for model export
-    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
-
-    with open(args.config, 'r') as fin:
-        configs = yaml.load(fin, Loader=yaml.FullLoader)
-    model = init_model(configs)
-    print(model)
-
-    load_checkpoint(model, args.checkpoint)
-    # Export jit torch script model
-
-    if args.output_file:
-        script_model = torch.jit.script(model)
-        script_model.save(args.output_file)
-        print('Export model successfully, see {}'.format(args.output_file))
-
-    # Export quantized jit torch script model
-    if args.output_quant_file:
-        quantized_model = torch.quantization.quantize_dynamic(
-            model, {torch.nn.Linear}, dtype=torch.qint8
-        )
-        print(quantized_model)
-        script_quant_model = torch.jit.script(quantized_model)
-        script_quant_model.save(args.output_quant_file)
-        print('Export quantized model successfully, '
-              'see {}'.format(args.output_quant_file))
-
-
-if __name__ == '__main__':
-    main()
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/export_onnx_bpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/export_onnx_bpu.py
deleted file mode 100644
index 6462a69506f10778d08faae5fcf3067ad43d38bd..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/export_onnx_bpu.py
+++ /dev/null
@@ -1,1019 +0,0 @@
-# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""NOTE(xcsong): Currently, we only support
-1. specific conformer encoder architecture, see:
-    encoder: conformer
-    encoder_conf:
-        activation_type: **must be** relu
-        attention_heads: 2 or 4 or 8 or any number divisible by output_size
-        causal: **must be** true
-        cnn_module_kernel: 1 ~ 7
-        cnn_module_norm: **must be** batch_norm
-        input_layer: **must be** conv2d8
-        linear_units: 1 ~ 2048
-        normalize_before: **must be** true
-        num_blocks: 1 ~ 12
-        output_size: 1 ~ 512
-        pos_enc_layer_type: **must be** no_pos
-        selfattention_layer_type: **must be** selfattn
-        use_cnn_module: **must be** true
-        use_dynamic_chunk: **must be** true
-        use_dynamic_left_chunk: **must be** true
-
-2. 
specific decoding method: ctc_greedy_search -""" - - -from __future__ import print_function - -import os -import sys -import copy -import math -import yaml -import logging -from typing import Tuple - -import torch -import numpy as np - -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import (get_args, to_numpy, - print_input_output_info) - - -try: - import onnx - import onnxruntime -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class BPULayerNorm(torch.nn.Module): - """Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" - def __init__(self, module, chunk_size=8, run_on_bpu=False): - super().__init__() - original = copy.deepcopy(module) - self.hidden = module.weight.size(0) - self.chunk_size = chunk_size - self.run_on_bpu = run_on_bpu - - if self.run_on_bpu: - self.weight = torch.nn.Parameter( - module.weight.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.bias = torch.nn.Parameter( - module.bias.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.negtive = torch.nn.Parameter( - torch.ones((1, self.hidden, 1, chunk_size)) * -1.0) - self.eps = torch.nn.Parameter( - torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps) - self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_1.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_2.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - else: - self.norm = module - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.hidden) - orig_out = module(random_data) - new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) - np.testing.assert_allclose( - to_numpy(orig_out), to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.run_on_bpu: - u = self.mean_conv_1(x) # (1, h, 1, c) - numerator = x + u * self.negtive # (1, h, 1, c) - s = torch.pow(numerator, 2) # (1, h, 1, c) - s = self.mean_conv_2(s) # (1, h, 1, c) - denominator = torch.sqrt(s + self.eps) # (1, h, 1, c) - x = torch.div(numerator, denominator) # (1, h, 1, c) - x = x * self.weight + self.bias - else: - x = x.squeeze(2).transpose(1, 2).contiguous() - x = self.norm(x) - x = x.transpose(1, 2).contiguous().unsqueeze(2) - return x - - -class BPUIdentity(torch.nn.Module): - """Refactor torch.nn.Identity(). - For inserting BPU node whose input == output. - """ - def __init__(self, channels): - super().__init__() - self.channels = channels - self.identity_conv = torch.nn.Conv2d( - channels, channels, 1, groups=channels, bias=False) - torch.nn.init.dirac_( - self.identity_conv.weight.data, groups=channels) - - self.check_equal() - - def check_equal(self): - random_data = torch.randn(1, self.channels, 1, 10) - result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(random_data), to_numpy(result), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Identity with 4-D dataflow, input == output. 
- Args: - x (torch.Tensor): (batch, in_channel, 1, time) - - Returns: - (torch.Tensor): (batch, in_channel, 1, time). - """ - return self.identity_conv(x) - - -class BPULinear(torch.nn.Module): - """Refactor torch.nn.Linear or pointwise_conv""" - def __init__(self, module, is_pointwise_conv=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.weight.size(1) - self.odim = module.weight.size(0) - self.is_pointwise_conv = is_pointwise_conv - - # Modify weight & bias - self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) - if is_pointwise_conv: - # (odim, idim, kernel=1) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(-1)) - else: - # (odim, idim) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(2).unsqueeze(3)) - self.linear.bias = module.bias - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.idim) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = module(random_data) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = original_result.transpose(1, 2) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Linear with 4-D dataflow. - Args: - x (torch.Tensor): (batch, in_channel, 1, time) - Returns: - (torch.Tensor): (batch, out_channel, 1, time). - """ - return self.linear(x) - - -class BPUGlobalCMVN(torch.nn.Module): - """Refactor wenet/transformer/cmvn.py::GlobalCMVN""" - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - self.norm_var = module.norm_var - - # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1) - self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """CMVN with 4-D dataflow. - Args: - x (torch.Tensor): (batch, 1, mel_dim, time) - Returns: - (torch.Tensor): normalized feature with same shape. - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x - - -class BPUConv2dSubsampling8(torch.nn.Module): - """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 - - NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.right_context = module.right_context - self.subsampling_rate = module.subsampling_rate - assert isinstance(module.pos_enc, NoPositionalEncoding) - - # 1. Modify self.conv - # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim) - # to (1, 1, mel_dim, frames) for more efficient computation. - self.conv = module.conv - for idx in [0, 2, 4]: - self.conv[idx].weight = torch.nn.Parameter( - module.conv[idx].weight.transpose(2, 3) - ) - - # 2. 
Modify self.linear - # NOTE(xcsong): Split final projection to meet the requirment of - # maximum kernel_size (7 for XJ3) - self.linear = torch.nn.ModuleList() - odim = module.linear.weight.size(0) # 512, in this case - freq = module.linear.weight.size(1) // odim # 4608 // 512 == 9 - self.odim, self.freq = odim, freq - weight = module.linear.weight.reshape( - odim, odim, freq, 1) # (odim, odim * freq) -> (odim, odim, freq, 1) - self.split_size = [] - num_split = (freq - 1) // 7 + 1 # XJ3 requires kernel_size <= 7 - slice_begin = 0 - for idx in range(num_split): - kernel_size = min(freq, (idx + 1) * 7) - idx * 7 - conv_ele = torch.nn.Conv2d( - odim, odim, (kernel_size, 1), (kernel_size, 1)) - conv_ele.weight = torch.nn.Parameter( - weight[:, :, slice_begin:slice_begin + kernel_size, :] - ) - conv_ele.bias = torch.nn.Parameter( - torch.zeros_like(conv_ele.bias) - ) - self.linear.append(conv_ele) - self.split_size.append(kernel_size) - slice_begin += kernel_size - self.linear[0].bias = torch.nn.Parameter(module.linear.bias) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 67, 80) - mask = torch.zeros(1, 1, 67) - original_result, _, _ = module(random_data, mask) # (1, 8, 512) - random_data = random_data.transpose(1, 2).unsqueeze(0) # (1, 1, 80, 67) - new_result = self.forward(random_data) # (1, 512, 1, 8) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Subsample x with 4-D dataflow. - Args: - x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), - where time' = time // 8. - """ - x = self.conv(x) # (1, odim, freq, time') - x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) - x = torch.split(x, self.split_size, dim=2) - for idx, (x_part, layer) in enumerate(zip(x, self.linear)): - x_out += layer(x_part) - return x_out - - -class BPUMultiHeadedAttention(torch.nn.Module): - """Refactor wenet/transformer/attention.py::MultiHeadedAttention - - NOTE(xcsong): Only support attention_class == MultiHeadedAttention, - we do not consider RelPositionMultiHeadedAttention currently. - """ - def __init__(self, module, chunk_size, left_chunks): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.d_k = module.d_k - self.h = module.h - n_feat = self.d_k * self.h - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.time = chunk_size * (left_chunks + 1) - self.activation = torch.nn.Softmax(dim=-1) - - # 1. Modify self.linear_x - self.linear_q = BPULinear(module.linear_q) - self.linear_k = BPULinear(module.linear_k) - self.linear_v = BPULinear(module.linear_v) - self.linear_out = BPULinear(module.linear_out) - # 2. 
denom - self.register_buffer( - "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k))) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) - mask = torch.ones((1, self.h, self.chunk_size, self.time), - dtype=torch.bool) - cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, - self.d_k * 2) - original_out, original_cache = module( - random_data, random_data, random_data, - mask[:, 0, :, :], torch.empty(0), cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.reshape(1, self.h, self.d_k * 2, - self.chunk_size * self.left_chunks) - new_out, new_cache = self.forward( - random_data, random_data, random_data, mask, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - - def forward( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - mask: torch.Tensor, cache: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. - - Args: - q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). - k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). - v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). - mask (torch.Tensor): Mask tensor, - (#batch, head, chunk_size, cache_t + chunk_size). - cache (torch.Tensor): Cache tensor - (1, head, d_k * 2, cache_t), - where `cache_t == chunk_size * left_chunks`. - - - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: Cache tensor - (1, head, d_k * 2, cache_t + chunk_size) - where `cache_t == chunk_size * left_chunks` - """ - # 1. Forward QKV - q = self.linear_q(q) # (1, d, 1, c) d == size, c == chunk_size - k = self.linear_k(k) # (1, d, 1, c) - v = self.linear_v(v) # (1, d, 1, c) - q = q.view(1, self.h, self.d_k, self.chunk_size) - k = k.view(1, self.h, self.d_k, self.chunk_size) - v = v.view(1, self.h, self.d_k, self.chunk_size) - q = q.transpose(2, 3) # (batch, head, time1, d_k) - k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) - k = torch.cat((k_cache, k), dim=3) - v = torch.cat((v_cache, v), dim=3) - new_cache = torch.cat((k, v), dim=2) - # 2. (Q^T)K - scores = torch.matmul(q, k) * self.denom # (#b, n_head, time1, time2) - # 3. Forward attention - mask = mask.eq(0) - scores = scores.masked_fill(mask, -float('inf')) - attn = self.activation(scores).masked_fill(mask, 0.0) - attn = attn.transpose(2, 3) - x = torch.matmul(v, attn) - x = x.view(1, self.d_k * self.h, 1, self.chunk_size) - x_out = self.linear_out(x) - return x_out, new_cache - - -class BPUConvolution(torch.nn.Module): - """Refactor wenet/transformer/convolution.py::ConvolutionModule - - NOTE(xcsong): Only suport use_layer_norm == False - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.lorder = module.lorder - self.use_layer_norm = False - self.activation = module.activation - channels = module.pointwise_conv1.weight.size(1) - self.channels = channels - kernel_size = module.depthwise_conv.weight.size(2) - assert module.use_layer_norm is False - - # 1. Modify self.pointwise_conv1 - self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) - - # 2. 
Modify self.depthwise_conv - self.depthwise_conv = torch.nn.Conv2d( - channels, channels, (1, kernel_size), - stride=1, groups=channels) - self.depthwise_conv.weight = torch.nn.Parameter( - module.depthwise_conv.weight.unsqueeze(-2)) - self.depthwise_conv.bias = torch.nn.Parameter( - module.depthwise_conv.bias) - - # 3. Modify self.norm, Only support batchnorm2d - self.norm = torch.nn.BatchNorm2d(channels) - self.norm.training = False - self.norm.num_features = module.norm.num_features - self.norm.eps = module.norm.eps - self.norm.momentum = module.norm.momentum - self.norm.weight = torch.nn.Parameter(module.norm.weight) - self.norm.bias = torch.nn.Parameter(module.norm.bias) - self.norm.running_mean = module.norm.running_mean - self.norm.running_var = module.norm.running_var - - # 4. Modify self.pointwise_conv2 - self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) - - # 5. Identity conv, for running `concat` on BPU - self.identity = BPUIdentity(channels) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.channels) - cache = torch.zeros((1, self.channels, self.lorder)) - original_out, original_cache = module(random_data, cache=cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.unsqueeze(2) - new_out, new_cache = self.forward(random_data, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, 1, cache_t). - Returns: - torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). - torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). - """ - # Concat cache - x = torch.cat((self.identity(cache), self.identity(x)), dim=3) - new_cache = x[:, :, :, -self.lorder:] - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, 1, dim) - x = torch.nn.functional.glu(x, dim=1) # (b, channel, 1, dim) - - # Depthwise Conv - x = self.depthwise_conv(x) - x = self.activation(self.norm(x)) - x = self.pointwise_conv2(x) - return x, new_cache - - -class BPUFFN(torch.nn.Module): - """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.activation = module.activation - - # 1. Modify self.w_x - self.w_1 = BPULinear(module.w_1) - self.w_2 = BPULinear(module.w_2) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.w_1.idim) - original_out = module(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_out = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward function. 
- - Args: - xs: input tensor (B, D, 1, L) - Returns: - output tensor, (B, D, 1, L) - """ - return self.w_2(self.activation(self.w_1(x))) - - -class BPUConformerEncoderLayer(torch.nn.Module): - """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.size = module.size - assert module.normalize_before is True - assert module.concat_after is False - - # 1. Modify submodules - self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) - self.self_attn = BPUMultiHeadedAttention( - module.self_attn, chunk_size, left_chunks) - self.conv_module = BPUConvolution(module.conv_module) - self.feed_forward = BPUFFN(module.feed_forward) - - # 2. Modify norms - self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) - self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, ln_run_on_bpu) - self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, - chunk_size, ln_run_on_bpu) - self.norm_conv = BPULayerNorm(module.norm_conv, - chunk_size, ln_run_on_bpu) - self.norm_final = BPULayerNorm(module.norm_final, - chunk_size, ln_run_on_bpu) - - # 3. 4-D ff_scale - self.register_buffer( - "ff_scale", torch.full((1, self.size, 1, 1), module.ff_scale)) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.self_attn.chunk_size - time2 = self.self_attn.time - h, d_k = self.self_attn.h, self.self_attn.d_k - random_x = torch.randn(1, time1, self.size) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) - original_x, _, original_att_cache, original_cnn_cache = module( - random_x, att_mask[:, 0, :, :], torch.empty(0), - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.transpose(1, 2).unsqueeze(2) - att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.unsqueeze(2) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_mask, att_cache, cnn_cache - ) - np.testing.assert_allclose( - to_numpy(original_att_cache), - to_numpy(new_att_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cnn_cache), - to_numpy(new_cnn_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, att_mask: torch.Tensor, - att_cache: torch.Tensor, cnn_cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, size, 1, chunk_size) - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, d_k * 2, cache_t1), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, 1, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: att_cache tensor, - (1, head, d_k * 2, cache_t1 + chunk_size). - torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2). - """ - # 1. ffn_macaron - residual = x - x = self.norm_ff_macron(x) - x = residual + self.ff_scale * self.feed_forward_macaron(x) - - # 2. 
attention - residual = x - x = self.norm_mha(x) - x_att, new_att_cache = self.self_attn( - x, x, x, att_mask, att_cache) - x = residual + x_att - - # 3. convolution - residual = x - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, cnn_cache) - x = residual + x - - # 4. ffn - residual = x - x = self.norm_ff(x) - x = residual + self.ff_scale * self.feed_forward(x) - - # 5. final post-norm - x = self.norm_final(x) - - return x, new_att_cache, new_cnn_cache - - -class BPUConformerEncoder(torch.nn.Module): - """Refactor wenet/transformer/encoder.py::ConformerEncoder - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - output_size = module.output_size() - self._output_size = module.output_size() - self.after_norm = module.after_norm - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.head = module.encoders[0].self_attn.h - self.layers = len(module.encoders) - - # 1. Modify submodules - self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) - self.embed = BPUConv2dSubsampling8(module.embed) - self.encoders = torch.nn.ModuleList() - for layer in module.encoders: - self.encoders.append(BPUConformerEncoderLayer( - layer, chunk_size, left_chunks, ln_run_on_bpu)) - - # 2. Auxiliary conv - self.identity_cnncache = BPUIdentity(output_size) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.encoders[0].self_attn.chunk_size - time2 = self.encoders[0].self_attn.time - layers = self.layers - h, d_k = self.head, self.encoders[0].self_attn.d_k - decoding_window = (self.chunk_size - 1) * \ - module.embed.subsampling_rate + \ - module.embed.right_context + 1 - lorder = self.encoders[0].conv_module.lorder - random_x = torch.randn(1, decoding_window, 80) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) - orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( - random_x, 0, time2 - time1, att_mask=att_mask[:, 0, :, :], - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.unsqueeze(0) - att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_cache, cnn_cache, att_mask - ) - caches = torch.split(new_att_cache, h, dim=1) - caches = [c.transpose(2, 3) for c in caches] - np.testing.assert_allclose( - to_numpy(orig_att_cache), - to_numpy(torch.cat(caches, dim=0)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_cnn_cache), - to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, xs: torch.Tensor, att_cache: torch.Tensor, - cnn_cache: torch.Tensor, att_mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (1, head * elayers, d_k * 2, cache_t1), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (1, hidden-dim, elayers, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, hidden-dim, 1, chunk_size). - torch.Tensor: new attention cache required for next chunk, with - same shape as the original att_cache. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - """ - # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time) - xs = xs.transpose(2, 3) - xs = self.global_cmvn(xs) - # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size) - xs = self.embed(xs) - - att_cache = torch.split(att_cache, self.head, dim=1) - cnn_cache = self.identity_cnncache(cnn_cache) - cnn_cache = torch.split(cnn_cache, 1, dim=2) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - xs, new_att_cache, new_cnn_cache = layer( - xs, att_mask, att_cache=att_cache[i], cnn_cache=cnn_cache[i]) - r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:]) - r_cnn_cache.append(new_cnn_cache) - r_att_cache = torch.cat(r_att_cache, dim=1) - r_cnn_cache = self.identity_cnncache( - torch.cat(r_cnn_cache, dim=2)) - - xs = xs.squeeze(2).transpose(1, 2).contiguous() - xs = self.after_norm(xs) - # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input. - xs = xs.transpose(1, 2).contiguous().unsqueeze(2) # (B, C, 1, T) - - return (xs, r_att_cache, r_cnn_cache) - - -class BPUCTC(torch.nn.Module): - """Refactor wenet/transformer/ctc.py::CTC - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.ctc_lo.weight.size(1) - num_class = module.ctc_lo.weight.size(0) - - # 1. Modify self.ctc_lo, Split final projection to meet the - # requirment of maximum in/out channels (2048 for XJ3) - self.ctc_lo = torch.nn.ModuleList() - self.split_size = [] - num_split = (num_class - 1) // 2048 + 1 - for idx in range(num_split): - out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 - conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) - self.ctc_lo.append(conv_ele) - self.split_size.append(out_channel) - orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) - orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) - for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): - w = w.unsqueeze(2).unsqueeze(3) - self.ctc_lo[i].weight = torch.nn.Parameter(w) - self.ctc_lo[i].bias = torch.nn.Parameter(b) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 100, self.idim) - original_result = module.ctc_lo(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """frame activations, without softmax. 
- - Args: - Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) - Returns: - torch.Tensor: (B, num_class, 1, chunk_size) - """ - out = [] - for i, layer in enumerate(self.ctc_lo): - out.append(layer(x)) - out = torch.cat(out, dim=1) - return out - - -def export_encoder(asr_model, args): - logger.info("Stage-1: export encoder") - decode_window, mel_dim = args.decoding_window, args.feature_size - encoder = BPUConformerEncoder( - asr_model.encoder, args.chunk_size, args.num_decoding_left_chunks, - args.ln_run_on_bpu) - encoder.eval() - encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx') - - logger.info("Stage-1.1: prepare inputs for encoder") - chunk = torch.randn((1, 1, decode_window, mel_dim)) - required_cache_size = encoder.chunk_size * encoder.left_chunks - kv_time = required_cache_size + encoder.chunk_size - hidden, layers = encoder._output_size, len(encoder.encoders) - head = encoder.encoders[0].self_attn.h - d_k = hidden // head - lorder = encoder.encoders[0].conv_module.lorder - att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) - att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros((1, hidden, layers, lorder)) - inputs = (chunk, att_cache, cnn_cache, att_mask) - logger.info("chunk.size(): {} att_cache.size(): {} " - "cnn_cache.size(): {} att_mask.size(): {}".format( - list(chunk.size()), list(att_cache.size()), - list(cnn_cache.size()), list(att_mask.size()))) - - logger.info("Stage-1.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask" - attributes['output_name'] = "output;r_att_cache;r_cnn_cache" - attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap" - attributes['norm_type'] = \ - "no_preprocess;no_preprocess;no_preprocess;no_preprocess" - attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_shape'] = \ - "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( - chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3), - att_cache.size(0), att_cache.size(1), att_cache.size(2), - att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1), - cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0), - att_mask.size(1), att_mask.size(2), att_mask.size(3) - ) - torch.onnx.export( # NOTE(xcsong): only support opset==11 - encoder, inputs, encoder_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=attributes['input_name'].split(';'), - output_names=attributes['output_name'].split(';'), - dynamic_axes=None, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for k in vars(args): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - logger.info('Export onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - logger.info("Stage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - for i in range(10): - logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" - ", att_mask: {}".format( - i, list(torch_chunk.size()), - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), - list(torch_att_mask.size()))) - torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_output = torch.cat(torch_output, dim=-1) - - onnx_output = [] - onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - logger.info("onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," - " att_mask: {}".format( - i, onnx_chunk.shape, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': onnx_att_mask, - } - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_output = np.concatenate(onnx_output, axis=-1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_encoder, pass!") - return encoder, ort_session - - -def export_ctc(asr_model, args): - logger.info("Stage-2: export ctc") - ctc = BPUCTC(asr_model.ctc).eval() - ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx') - - logger.info("Stage-2.1: prepare inputs for ctc") - hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) - - logger.info("Stage-2.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'], attributes['input_type'] = "hidden", "featuremap" - attributes['norm_type'] = "no_preprocess" - attributes['input_layout_train'] = "NCHW" - attributes['input_layout_rt'] = "NCHW" - attributes['input_shape'] = "{}x{}x{}x{}".format( - hidden.size(0), hidden.size(1), hidden.size(2), hidden.size(3), - ) - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=None, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for k in vars(args): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - logger.info('Export onnx_ctc, done! 
see {}'.format(ctc_outpath)) - - logger.info("Stage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_ctc, pass!") - return ctc, ort_session - - -def export_decoder(asr_model, args): - logger.info("Currently, Decoder is not supported.") - - -if __name__ == '__main__': - torch.manual_seed(777) - args = get_args() - args.ln_run_on_bpu = False - # NOTE(xcsong): XJ3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - args.feature_size = configs['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - export_encoder(model, args) - export_ctc(model, args) - export_decoder(model, args) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/export_onnx_cpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/export_onnx_cpu.py deleted file mode 100644 index a8009d2f606f753a5870eb754235d8d55e756b5d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/export_onnx_cpu.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright (c) 2022, Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os -import copy -import sys - -import torch -import yaml -import numpy as np - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - -try: - import onnx - import onnxruntime - from onnxruntime.quantization import quantize_dynamic, QuantType -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - args = parser.parse_args() - return args - - -def to_numpy(tensor): - if tensor.requires_grad: - return tensor.detach().cpu().numpy() - else: - return tensor.cpu().numpy() - - -def print_input_output_info(onnx_model, name, prefix="\t\t"): - input_names = [node.name for node in onnx_model.graph.input] - input_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.input] - output_names = [node.name for node in onnx_model.graph.output] - output_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.output] - print("{}{} inputs : {}".format(prefix, name, input_names)) - print("{}{} input shapes : {}".format(prefix, name, input_shapes)) - print("{}{} outputs: {}".format(prefix, name, output_names)) - print("{}{} output shapes : {}".format(prefix, name, output_shapes)) - - -def export_encoder(asr_model, args): - print("Stage-1: export encoder") - encoder = asr_model.encoder - encoder.forward = encoder.forward_chunk - encoder_outpath = os.path.join(args['output_dir'], 'encoder.onnx') - - print("\tStage-1.1: prepare inputs for encoder") - chunk = torch.randn( - (args['batch'], args['decoding_window'], args['feature_size'])) - offset = 0 - # NOTE(xcsong): The uncertainty of `next_cache_start` only appears - # in the first few chunks, this is caused by dynamic att_cache shape, i,e - # (0, 0, 0, 0) for 1st chunk and (elayers, head, ?, d_k*2) for subsequent - # chunks. One way to ease the ONNX export is to keep `next_cache_start` - # as a fixed value. To do this, for the **first** chunk, if - # left_chunks > 0, we feed real cache & real mask to the model, otherwise - # fake cache & fake mask. In this way, we get: - # 1. 16/-1 mode: next_cache_start == 0 for all chunks - # 2. 16/4 mode: next_cache_start == chunk_size for all chunks - # 3. 16/0 mode: next_cache_start == chunk_size for all chunks - # 4. -1/-1 mode: next_cache_start == 0 for all chunks - # NO MORE DYNAMIC CHANGES!! - # - # NOTE(Mddct): We retain the current design for the convenience of supporting some - # inference frameworks without dynamic shapes. 
If you're interested in all-in-one - # model that supports different chunks please see: - # https://github.com/wenet-e2e/wenet/pull/1174 - - if args['left_chunks'] > 0: # 16/4 - required_cache_size = args['chunk_size'] * args['left_chunks'] - offset = required_cache_size - # Real cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], required_cache_size, - args['output_size'] // args['head'] * 2)) - # Real mask - att_mask = torch.ones( - (args['batch'], 1, required_cache_size + args['chunk_size']), - dtype=torch.bool) - att_mask[:, :, :required_cache_size] = 0 - elif args['left_chunks'] <= 0: # 16/-1, -1/-1, 16/0 - required_cache_size = -1 if args['left_chunks'] < 0 else 0 - # Fake cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], 0, - args['output_size'] // args['head'] * 2)) - # Fake mask - att_mask = torch.ones((0, 0, 0), dtype=torch.bool) - cnn_cache = torch.zeros( - (args['num_blocks'], args['batch'], - args['output_size'], args['cnn_module_kernel'] - 1)) - inputs = (chunk, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - print("\t\tchunk.size(): {}\n".format(chunk.size()), - "\t\toffset: {}\n".format(offset), - "\t\trequired_cache: {}\n".format(required_cache_size), - "\t\tatt_cache.size(): {}\n".format(att_cache.size()), - "\t\tcnn_cache.size(): {}\n".format(cnn_cache.size()), - "\t\tatt_mask.size(): {}\n".format(att_mask.size())) - - print("\tStage-1.2: torch.onnx.export") - dynamic_axes = { - 'chunk': {1: 'T'}, - 'att_cache': {2: 'T_CACHE'}, - 'att_mask': {2: 'T_ADD_T_CACHE'}, - 'output': {1: 'T'}, - 'r_att_cache': {2: 'T_CACHE'}, - } - # NOTE(xcsong): We keep dynamic axes even if in 16/4 mode, this is - # to avoid padding the last chunk (which usually contains less - # frames than required). For users who want static axes, just pop - # out specific axis. - # if args['chunk_size'] > 0: # 16/4, 16/-1, 16/0 - # dynamic_axes.pop('chunk') - # dynamic_axes.pop('output') - # if args['left_chunks'] >= 0: # 16/4, 16/0 - # # NOTE(xsong): since we feed real cache & real mask into the - # # model when left_chunks > 0, the shape of cache will never - # # be changed. - # dynamic_axes.pop('att_cache') - # dynamic_axes.pop('r_att_cache') - torch.onnx.export( - encoder, inputs, encoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=[ - 'chunk', 'offset', 'required_cache_size', - 'att_cache', 'cnn_cache', 'att_mask' - ], - output_names=['output', 'r_att_cache', 'r_cnn_cache'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for (k, v) in args.items(): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - # NOTE(xcsong): to add those metadatas we need to reopen - # the file and resave it. - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - # Dynamic quantization - model_fp32 = encoder_outpath - model_quant = os.path.join(args['output_dir'], 'encoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - print("\tStage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk = copy.deepcopy(chunk) - torch_offset = copy.deepcopy(offset) - torch_required_cache_size = copy.deepcopy(required_cache_size) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - torch_att_mask = copy.deepcopy(att_mask) - for i in range(10): - print("\t\ttorch chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, list(torch_chunk.size()), torch_offset, - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), list(torch_att_mask.size()))) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - torch_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_offset, torch_required_cache_size, - torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_offset += out.size(1) - torch_output = torch.cat(torch_output, dim=1) - - onnx_output = [] - onnx_chunk = to_numpy(chunk) - onnx_offset = np.array((offset)).astype(np.int64) - onnx_required_cache_size = np.array((required_cache_size)).astype(np.int64) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - onnx_att_mask = to_numpy(att_mask) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - print("\t\tonnx chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, onnx_chunk.shape, onnx_offset, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - onnx_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'offset': onnx_offset, - 'required_cache_size': onnx_required_cache_size, - 'att_cache': onnx_att_cache, 'cnn_cache': onnx_cnn_cache, - 'att_mask': onnx_att_mask - } - # NOTE(xcsong): If we use 16/-1, -1/-1 or 16/0 mode, `next_cache_start` - # will be hardcoded to 0 or chunk_size by ONNX, thus - # required_cache_size and att_mask are no more needed and they will - # be removed by ONNX automatically. 
- for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_offset += ort_outs[0].shape[1] - onnx_output = np.concatenate(onnx_output, axis=1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-05) - meta = ort_session.get_modelmeta() - print("\t\tcustom_metadata_map={}".format(meta.custom_metadata_map)) - print("\t\tCheck onnx_encoder, pass!") - - -def export_ctc(asr_model, args): - print("Stage-2: export ctc") - ctc = asr_model.ctc - ctc.forward = ctc.log_softmax - ctc_outpath = os.path.join(args['output_dir'], 'ctc.onnx') - - print("\tStage-2.1: prepare inputs for ctc") - hidden = torch.randn( - (args['batch'], args['chunk_size'] if args['chunk_size'] > 0 else 16, - args['output_size'])) - - print("\tStage-2.2: torch.onnx.export") - dynamic_axes = {'hidden': {1: 'T'}, 'probs': {1: 'T'}} - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for (k, v) in args.items(): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - # Dynamic quantization - model_fp32 = ctc_outpath - model_quant = os.path.join(args['output_dir'], 'ctc.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_ctc, done! see {}'.format(ctc_outpath)) - - print("\tStage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_ctc, pass!") - - -def export_decoder(asr_model, args): - print("Stage-3: export decoder") - decoder = asr_model - # NOTE(lzhin): parameters of encoder will be automatically removed - # since they are not used during rescoring. - decoder.forward = decoder.forward_attention_decoder - decoder_outpath = os.path.join(args['output_dir'], 'decoder.onnx') - - print("\tStage-3.1: prepare inputs for decoder") - # hardcode time->200 nbest->10 len->20, they are dynamic axes. 
- encoder_out = torch.randn((1, 200, args['output_size'])) - hyps = torch.randint(low=0, high=args['vocab_size'], - size=[10, 20]) - hyps[:, 0] = args['vocab_size'] - 1 # - hyps_lens = torch.randint(low=15, high=21, size=[10]) - - print("\tStage-3.2: torch.onnx.export") - dynamic_axes = { - 'hyps': {0: 'NBEST', 1: 'L'}, 'hyps_lens': {0: 'NBEST'}, - 'encoder_out': {1: 'T'}, - 'score': {0: 'NBEST', 1: 'L'}, 'r_score': {0: 'NBEST', 1: 'L'} - } - inputs = (hyps, hyps_lens, encoder_out, args['reverse_weight']) - torch.onnx.export( - decoder, inputs, decoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hyps', 'hyps_lens', 'encoder_out', 'reverse_weight'], - output_names=['score', 'r_score'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_decoder = onnx.load(decoder_outpath) - for (k, v) in args.items(): - meta = onnx_decoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_decoder) - onnx.helper.printable_graph(onnx_decoder.graph) - onnx.save(onnx_decoder, decoder_outpath) - print_input_output_info(onnx_decoder, "onnx_decoder") - model_fp32 = decoder_outpath - model_quant = os.path.join(args['output_dir'], 'decoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_decoder, done! see {}'.format( - decoder_outpath)) - - print("\tStage-3.3: check onnx_decoder and torch_decoder") - torch_score, torch_r_score = decoder( - hyps, hyps_lens, encoder_out, args['reverse_weight']) - ort_session = onnxruntime.InferenceSession(decoder_outpath) - input_names = [node.name for node in onnx_decoder.graph.input] - ort_inputs = { - 'hyps': to_numpy(hyps), - 'hyps_lens': to_numpy(hyps_lens), - 'encoder_out': to_numpy(encoder_out), - 'reverse_weight': np.array((args['reverse_weight'])), - } - for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - onnx_output = ort_session.run(None, ort_inputs) - - np.testing.assert_allclose(to_numpy(torch_score), onnx_output[0], - rtol=1e-03, atol=1e-05) - if args['is_bidirectional_decoder'] and args['reverse_weight'] > 0.0: - np.testing.assert_allclose(to_numpy(torch_r_score), onnx_output[1], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_decoder, pass!") - - -def main(): - torch.manual_seed(777) - args = get_args() - output_dir = args.output_dir - os.system("mkdir -p " + output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - arguments = {} - arguments['output_dir'] = output_dir - arguments['batch'] = 1 - arguments['chunk_size'] = args.chunk_size - arguments['left_chunks'] = args.num_decoding_left_chunks - arguments['reverse_weight'] = args.reverse_weight - arguments['output_size'] = configs['encoder_conf']['output_size'] - arguments['num_blocks'] = configs['encoder_conf']['num_blocks'] - arguments['cnn_module_kernel'] = configs['encoder_conf'].get('cnn_module_kernel', 1) - arguments['head'] = configs['encoder_conf']['attention_heads'] - arguments['feature_size'] = configs['input_dim'] - arguments['vocab_size'] = configs['output_dim'] - # NOTE(xcsong): if chunk_size == -1, hardcode to 67 - arguments['decoding_window'] = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 if args.chunk_size > 0 else 67 - arguments['encoder'] = configs['encoder'] - 
arguments['decoder'] = configs['decoder'] - arguments['subsampling_rate'] = model.subsampling_rate() - arguments['right_context'] = model.right_context() - arguments['sos_symbol'] = model.sos_symbol() - arguments['eos_symbol'] = model.eos_symbol() - arguments['is_bidirectional_decoder'] = 1 \ - if model.is_bidirectional_decoder() else 0 - - # NOTE(xcsong): Please note that -1/-1 means non-streaming model! It is - # not a [16/4 16/-1 16/0] all-in-one model and it should not be used in - # streaming mode (i.e., setting chunk_size=16 in `decoder_main`). If you - # want to use 16/-1 or any other streaming mode in `decoder_main`, - # please export onnx in the same config. - if arguments['left_chunks'] > 0: - assert arguments['chunk_size'] > 0 # -1/4 not supported - - export_encoder(model, arguments) - export_ctc(model, arguments) - export_decoder(model, arguments) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/export_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/export_onnx_gpu.py deleted file mode 100644 index 19f810c2804efdf74ff369f780fa3102e2e389fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/export_onnx_gpu.py +++ /dev/null @@ -1,1056 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import os -import sys - -import torch -import yaml -import logging - -import torch.nn.functional as F -from wenet.utils.checkpoint import load_checkpoint -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import BaseEncoder -from wenet.utils.init_model import init_model -from wenet.utils.mask import make_pad_mask - -try: - import onnxruntime -except ImportError: - print('Please install onnxruntime-gpu!') - sys.exit(1) - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class Encoder(torch.nn.Module): - def __init__(self, - encoder: BaseEncoder, - ctc: CTC, - beam_size: int = 10): - super().__init__() - self.encoder = encoder - self.ctc = ctc - self.beam_size = beam_size - - def forward(self, speech: torch.Tensor, - speech_lengths: torch.Tensor,): - """Encoder - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - Returns: - encoder_out: B x T x F - encoder_out_lens: B - ctc_log_probs: B x T x V - beam_log_probs: B x T x beam_size - beam_log_probs_idx: B x T x beam_size - """ - encoder_out, encoder_mask = self.encoder(speech, - speech_lengths, - -1, -1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_log_probs = self.ctc.log_softmax(encoder_out) - encoder_out_lens = encoder_out_lens.int() - beam_log_probs, beam_log_probs_idx = torch.topk( - ctc_log_probs, self.beam_size, dim=2) - return encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx - - -class StreamingEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size, transformer=False): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.transformer = transformer - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoder.encoders): - xs, _, new_att_cache, new_cnn_cache = layer( - xs, masks, pos_emb, - att_cache=att_cache[i], - cnn_cache=cnn_cache[i]) - # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (B, hidden-dim, cache_t2) - r_att_cache.append( - new_att_cache[:, :, next_cache_start:, :].unsqueeze(1)) - if not self.transformer: - r_cnn_cache.append(new_cnn_cache.unsqueeze(1)) - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - if not self.transformer: - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingSqueezeformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.reduce_idx = model.encoder.reduce_idx - self.recover_idx = model.encoder.recover_idx - if self.reduce_idx is None: - self.time_reduce = None - else: - if self.recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - 
recover_exp)) - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - elayers, cache_size = att_cache.size(0), att_cache.size(3) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = att_mask[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.encoder.preln(xs) - for i, layer in enumerate(self.encoder.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append( - (xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.encoder.time_reduction_layer( - xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - if self.encoder.pos_enc_layer_type == "rel_pos_repaired": - pos_emb = 
pos_emb[:, :xs.size(1) * 2 - 1, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.encoder.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(1) - cached_att = cached_att.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1)) - r_cnn_cache.append(cached_cnn) - - chunk_out = xs - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingEfficientConformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - - # Efficient Conformer - self.stride_layer_idx = model.encoder.stride_layer_idx - self.stride = model.encoder.stride - self.num_blocks = model.encoder.num_blocks - self.cnn_module_kernel = model.encoder.cnn_module_kernel - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - chunk_lens (torch.Tensor): - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * 
num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) # (b, ) - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - # Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2) - # Shape(cnn_cache): (elayers, b, outsize, cnn_kernel) - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = chunk_mask.to(torch.bool) - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoder.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - # We propose to double the chunk_size. 
- att_cache_trunc = xs.size(1) + \ - att_cache.size(3) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i][:, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [batch, 1, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID - - # use repeat_interleave to new_att_cache - # new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - new_att_cache = new_att_cache.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :].unsqueeze(1)) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - # shape of r_att_cache: (b, elayers, head, time2, outdim) - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - # shape of r_cnn_cache: (b, elayers, outdim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - chunk_out_lens += 1 - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class Decoder(torch.nn.Module): - def __init__(self, - decoder: TransformerDecoder, - ctc_weight: float = 0.5, - reverse_weight: float = 0.0, - beam_size: int = 10, - decoder_fastertransformer: bool = False): - super().__init__() - self.decoder = decoder - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - self.beam_size = beam_size - self.decoder_fastertransformer = decoder_fastertransformer - - def forward(self, - encoder_out: torch.Tensor, - encoder_lens: torch.Tensor, - hyps_pad_sos_eos: torch.Tensor, - hyps_lens_sos: torch.Tensor, - r_hyps_pad_sos_eos: torch.Tensor, - ctc_score: torch.Tensor): - """Encoder - Args: - encoder_out: B x T x F - encoder_lens: B - hyps_pad_sos_eos: B x beam x (T2+1), - hyps with sos & eos and padded by ignore id - hyps_lens_sos: B x beam, length for each hyp with sos - r_hyps_pad_sos_eos: B 
x beam x (T2+1), - reversed hyps with sos & eos and padded by ignore id - ctc_score: B x beam, ctc score for each hyp - Returns: - decoder_out: B x beam x T2 x V - r_decoder_out: B x beam x T2 x V - best_index: B - """ - B, T, F = encoder_out.shape - bz = self.beam_size - B2 = B * bz - encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F) - encoder_mask = ~make_pad_mask(encoder_lens, T).unsqueeze(1) - encoder_mask = encoder_mask.repeat(1, bz, 1).view(B2, 1, T) - T2 = hyps_pad_sos_eos.shape[2] - 1 - hyps_pad = hyps_pad_sos_eos.view(B2, T2 + 1) - hyps_lens = hyps_lens_sos.view(B2,) - hyps_pad_sos = hyps_pad[:, :-1].contiguous() - hyps_pad_eos = hyps_pad[:, 1:].contiguous() - - r_hyps_pad = r_hyps_pad_sos_eos.view(B2, T2 + 1) - r_hyps_pad_sos = r_hyps_pad[:, :-1].contiguous() - r_hyps_pad_eos = r_hyps_pad[:, 1:].contiguous() - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad_sos, hyps_lens, r_hyps_pad_sos, - self.reverse_weight) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - V = decoder_out.shape[-1] - decoder_out = decoder_out.view(B2, T2, V) - mask = ~make_pad_mask(hyps_lens, T2) # B2 x T2 - # mask index, remove ignore id - index = torch.unsqueeze(hyps_pad_eos * mask, 2) - score = decoder_out.gather(2, index).squeeze(2) # B2 X T2 - # mask padded part - score = score * mask - decoder_out = decoder_out.view(B, bz, T2, V) - if self.reverse_weight > 0: - r_decoder_out = torch.nn.functional.log_softmax( - r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.view(B2, T2, V) - index = torch.unsqueeze(r_hyps_pad_eos * mask, 2) - r_score = r_decoder_out.gather(2, index).squeeze(2) - r_score = r_score * mask - score = score * (1 - self.reverse_weight) + \ - self.reverse_weight * r_score - r_decoder_out = r_decoder_out.view(B, bz, T2, V) - score = torch.sum(score, axis=1) # B2 - score = torch.reshape(score, (B, bz)) + self.ctc_weight * ctc_score - best_index = torch.argmax(score, dim=1) - if self.decoder_fastertransformer: - return decoder_out, best_index - else: - return best_index - - -def to_numpy(tensors): - out = [] - if type(tensors) == torch.tensor: - tensors = [tensors] - for tensor in tensors: - if tensor.requires_grad: - tensor = tensor.detach().cpu().numpy() - else: - tensor = tensor.cpu().numpy() - out.append(tensor) - return out - - -def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True): - for a, b in zip(xlist, blist): - try: - torch.testing.assert_allclose(a, b, rtol=rtol, atol=atol) - except AssertionError as error: - if tolerate_small_mismatch: - print(error) - else: - raise - - -def export_offline_encoder(model, configs, args, logger, encoder_onnx_path): - bz = 32 - seq_len = 100 - beam_size = args.beam_size - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint( - low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - dynamic_axes={ - 'speech': {0: 'B', 1: 'T'}, - 'speech_lengths': {0: 'B'}, - 'encoder_out': {0: 'B', 1: 'T_OUT'}, - 'encoder_out_lens': {0: 'B'}, - 'ctc_log_probs': {0: 'B', 1: 'T_OUT'}, - 'beam_log_probs': {0: 'B', 1: 
'T_OUT'}, - 'beam_log_probs_idx': {0: 'B', 1: 'T_OUT'}, - }, - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - -def export_offline_encoder_static(model, configs, args, logger, encoder_onnx_path): - bz = args.batch_size - seq_len = args.seq_len - beam_size = args.beam_size - - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint(low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - import os - file_name, file_ext = os.path.splitext(encoder_onnx_path) - encoder_onnx_path = file_name + "_bs" + str(bz) + "_seq" + str(seq_len) + "_static.onnx" - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CPUExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - - -def export_online_encoder(model, configs, args, logger, encoder_onnx_path): - decoding_chunk_size = args.decoding_chunk_size - subsampling = model.encoder.embed.subsampling_rate - context = model.encoder.embed.right_context + 1 - decoding_window = (decoding_chunk_size - 1) * subsampling + context - batch_size = 32 - audio_len = decoding_window - feature_size = configs["input_dim"] - output_size = configs["encoder_conf"]["output_size"] - num_layers = configs["encoder_conf"]["num_blocks"] - # in transformer the cnn module will not be available - transformer = False - cnn_module_kernel = configs["encoder_conf"].get("cnn_module_kernel", 1) - 1 - if not cnn_module_kernel: - transformer = True - num_decoding_left_chunks = args.num_decoding_left_chunks - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if configs['encoder'] == 'squeezeformer': - encoder = StreamingSqueezeformerEncoder( - model, required_cache_size, args.beam_size) - elif configs['encoder'] == 'efficientConformer': - encoder = StreamingEfficientConformerEncoder( - model, required_cache_size, args.beam_size) - else: - encoder = StreamingEncoder( - model, required_cache_size, args.beam_size, transformer) - encoder.eval() - - # begin to export encoder - chunk_xs = 
torch.randn(batch_size, audio_len, - feature_size, dtype=torch.float32) - chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len - - offset = torch.arange(0, batch_size).unsqueeze(1) - # (elayers, b, head, cache_t1, d_k * 2) - head = configs["encoder_conf"]["attention_heads"] - d_k = configs["encoder_conf"]["output_size"] // head - att_cache = torch.randn(batch_size, num_layers, head, - required_cache_size, d_k * 2, - dtype=torch.float32) - cnn_cache = torch.randn(batch_size, num_layers, output_size, - cnn_module_kernel, dtype=torch.float32) - - cache_mask = torch.ones( - batch_size, 1, required_cache_size, dtype=torch.float32) - input_names = ['chunk_xs', 'chunk_lens', 'offset', - 'att_cache', 'cnn_cache', 'cache_mask'] - output_names = ['log_probs', 'log_probs_idx', 'chunk_out', - 'chunk_out_lens', 'r_offset', 'r_att_cache', - 'r_cnn_cache', 'r_cache_mask'] - input_tensors = (chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - output_names.pop(6) - - all_names = input_names + output_names - dynamic_axes = {} - for name in all_names: - # only the first dimension is dynamic - # all other dimension is fixed - dynamic_axes[name] = {0: 'B'} - - torch.onnx.export(encoder, - input_tensors, - encoder_onnx_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - verbose=False) - - with torch.no_grad(): - torch_outs = encoder(chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - torch_outs = list(torch_outs).pop(6) - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=["CUDAExecutionProvider"]) - ort_inputs = {} - - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - if transformer: - del ort_inputs['cnn_cache'] - ort_outs = ort_session.run(None, ort_inputs) - test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx streaming encoder succeed!") - onnx_config = { - "subsampling_rate": subsampling, - "context": context, - "decoding_chunk_size": decoding_chunk_size, - "num_decoding_left_chunks": num_decoding_left_chunks, - "beam_size": args.beam_size, - "fp16": args.fp16, - "feat_size": feature_size, - "decoding_window": decoding_window, - "cnn_module_kernel_cache": cnn_module_kernel - } - return onnx_config - - -def export_rescoring_decoder(model, configs, args, - logger, decoder_onnx_path, decoder_fastertransformer): - bz, seq_len = 32, 100 - beam_size = args.beam_size - decoder = Decoder(model.decoder, - model.ctc_weight, - model.reverse_weight, - beam_size, - decoder_fastertransformer) - decoder.eval() - - hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - hyps_lens_sos = torch.randint(low=3, high=seq_len, size=(bz, beam_size), - dtype=torch.int32) - r_hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - - output_size = configs["encoder_conf"]["output_size"] - encoder_out = torch.randn(bz, seq_len, output_size, dtype=torch.float32) - encoder_out_lens = torch.randint( - low=3, high=seq_len, size=(bz,), dtype=torch.int32) - ctc_score = torch.randn(bz, beam_size, dtype=torch.float32) - - input_names = ['encoder_out', 'encoder_out_lens', - 'hyps_pad_sos_eos', 'hyps_lens_sos', - 'r_hyps_pad_sos_eos', 'ctc_score'] - output_names = ['best_index'] - if decoder_fastertransformer: - output_names.insert(0, 'decoder_out') - - 
torch.onnx.export(decoder, - (encoder_out, encoder_out_lens, - hyps_pad_sos_eos, hyps_lens_sos, - r_hyps_pad_sos_eos, ctc_score), - decoder_onnx_path, - export_params=True, - opset_version=13, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes={'encoder_out': {0: 'B', 1: 'T'}, - 'encoder_out_lens': {0: 'B'}, - 'hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'hyps_lens_sos': {0: 'B'}, - 'r_hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'ctc_score': {0: 'B'}, - 'best_index': {0: 'B'}, - }, - verbose=False - ) - with torch.no_grad(): - o0 = decoder(encoder_out, - encoder_out_lens, - hyps_pad_sos_eos, - hyps_lens_sos, - r_hyps_pad_sos_eos, - ctc_score) - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(decoder_onnx_path, - providers=providers) - - input_tensors = [encoder_out, encoder_out_lens, hyps_pad_sos_eos, - hyps_lens_sos, r_hyps_pad_sos_eos, ctc_score] - ort_inputs = {} - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - - # if model.reverse weight == 0, - # the r_hyps_pad will be removed - # from the onnx decoder since it doen't play any role - if model.reverse_weight == 0: - del ort_inputs['r_hyps_pad_sos_eos'] - ort_outs = ort_session.run(None, ort_inputs) - - # check decoder output - if decoder_fastertransformer: - test(to_numpy(o0), ort_outs, rtol=1e-03, atol=1e-05) - else: - test(to_numpy([o0]), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx decoder succeed!") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='export x86_gpu model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--cmvn_file', required=False, default='', type=str, - help='global_cmvn file, default path is in config file') - parser.add_argument('--reverse_weight', default=-1.0, type=float, - required=False, - help='reverse weight for bitransformer,' + - 'default value is in config file') - parser.add_argument('--ctc_weight', default=-1.0, type=float, - required=False, - help='ctc weight, default value is in config file') - parser.add_argument('--batch_size', type=int, default=24, help='encoder batch size') - parser.add_argument('--seq_len', default=512, type=int, required=False, - help="Encoder seq_len") - parser.add_argument('--beam_size', default=10, type=int, required=False, - help="beam size would be ctc output size") - parser.add_argument('--output_onnx_dir', - default="onnx_model", - help='output onnx encoder and decoder directory') - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - # arguments for streaming encoder - parser.add_argument('--streaming', - action='store_true', - help="whether to export streaming encoder, default false") - parser.add_argument('--decoding_chunk_size', - default=16, - type=int, - required=False, - help='the decoding chunk size, <=0 is not supported') - parser.add_argument('--num_decoding_left_chunks', - default=5, - type=int, - required=False, - help="number of left chunks, <= 0 is not supported") - parser.add_argument('--decoder_fastertransformer', - action='store_true', - help='return decoder_out and best_index for ft') - args = parser.parse_args() - - torch.manual_seed(0) - torch.set_printoptions(precision=10) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if 
args.cmvn_file and os.path.exists(args.cmvn_file): - configs['cmvn_file'] = args.cmvn_file - if args.reverse_weight != -1.0 and 'reverse_weight' in configs['model_conf']: - configs['model_conf']['reverse_weight'] = args.reverse_weight - print("Update reverse weight to", args.reverse_weight) - if args.ctc_weight != -1: - print("Update ctc weight to ", args.ctc_weight) - configs['model_conf']['ctc_weight'] = args.ctc_weight - configs["encoder_conf"]["use_dynamic_chunk"] = False - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - - if not os.path.exists(args.output_onnx_dir): - os.mkdir(args.output_onnx_dir) - encoder_onnx_path = os.path.join(args.output_onnx_dir, 'encoder.onnx') - export_enc_func = None - if args.streaming: - assert args.decoding_chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - export_enc_func = export_online_encoder - else: - export_enc_func = export_offline_encoder_static - - onnx_config = export_enc_func( - model, configs, args, logger, encoder_onnx_path) - - decoder_onnx_path = os.path.join(args.output_onnx_dir, 'decoder.onnx') - export_rescoring_decoder(model, configs, args, logger, - decoder_onnx_path, args.decoder_fastertransformer) - - if args.fp16: - try: - import onnxmltools - from onnxmltools.utils.float16_converter import convert_float_to_float16 - except ImportError: - print('Please install onnxmltools!') - sys.exit(1) - encoder_onnx_model = onnxmltools.utils.load_model(encoder_onnx_path) - encoder_onnx_model = convert_float_to_float16(encoder_onnx_model) - encoder_onnx_path = os.path.join( - args.output_onnx_dir, 'encoder_fp16.onnx') - onnxmltools.utils.save_model(encoder_onnx_model, encoder_onnx_path) - decoder_onnx_model = onnxmltools.utils.load_model(decoder_onnx_path) - decoder_onnx_model = convert_float_to_float16(decoder_onnx_model) - decoder_onnx_path = os.path.join( - args.output_onnx_dir, 'decoder_fp16.onnx') - onnxmltools.utils.save_model(decoder_onnx_model, decoder_onnx_path) - # dump configurations - - config_dir = os.path.join(args.output_onnx_dir, "config.yaml") - with open(config_dir, "w") as out: - yaml.dump(onnx_config, out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/recognize.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/recognize.py deleted file mode 100644 index 03b5dfd42cc098efacd20e08756a5300f6477cc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/recognize.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
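For reference, the `--fp16` branch at the end of `export_onnx_gpu.py` above can be reproduced on its own with onnxmltools. A minimal sketch, using `encoder.onnx` and `encoder_fp16.onnx` as placeholder paths:

```python
# Minimal sketch of the FP16 conversion performed by the deleted export script;
# paths are placeholders.
import onnxmltools
from onnxmltools.utils.float16_converter import convert_float_to_float16

model = onnxmltools.utils.load_model("encoder.onnx")
model_fp16 = convert_float_to_float16(model)   # cast float32 initializers/ops to float16
onnxmltools.utils.save_model(model_fp16, "encoder_fp16.onnx")
```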
- -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--beam_size', - type=int, - default=10, - help='beam size for search') - parser.add_argument('--penalty', - type=float, - default=0.0, - help='length penalty') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=16, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'attention', 'ctc_greedy_search', - 'ctc_prefix_beam_search', 'attention_rescoring', - 'rnnt_greedy_search', 'rnnt_beam_search', - 'rnnt_beam_attn_rescoring', 'ctc_beam_td_attn_rescoring', - 'hlg_onebest', 'hlg_rescore' - ], - default='attention', - help='decoding mode') - - parser.add_argument('--search_ctc_weight', - type=float, - default=1.0, - help='ctc weight for nbest generation') - parser.add_argument('--search_transducer_weight', - type=float, - default=0.0, - help='transducer weight for nbest generation') - parser.add_argument('--ctc_weight', - type=float, - default=0.0, - help='ctc weight for rescoring weight in \ - attention rescoring decode mode \ - ctc weight for rescoring weight in \ - transducer attention rescore decode mode') - - parser.add_argument('--transducer_weight', - type=float, - default=0.0, - help='transducer weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--attn_weight', - type=float, - default=0.0, - help='attention weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--decoding_chunk_size', - type=int, - default=-1, - help='''decoding chunk size, - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here''') - parser.add_argument('--num_decoding_left_chunks', - type=int, - default=-1, - help='number of left chunks for decoding') - parser.add_argument('--simulate_streaming', - action='store_true', - help='simulate streaming inference') - parser.add_argument('--reverse_weight', - type=float, - default=0.0, - help='''right to left weight for attention rescoring - decode mode''') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--connect_symbol', - default='', - type=str, - help='used to connect the output characters') - - parser.add_argument('--word', - default='', - type=str, - help='word file, only used for hlg decode') - parser.add_argument('--hlg', - default='', - type=str, - help='hlg file, only used for hlg decode') - parser.add_argument('--lm_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--r_decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' - ] and args.batch_size > 1: - logging.fatal( - 'decoding mode {} must be running with batch_size == 1'.format( - args.mode)) - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_sub'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - if 'fbank_conf' in test_conf: - test_conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in test_conf: - test_conf['mfcc_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - # Load dict - char_dict = {v: k for k, v in symbol_table.items()} - eos = len(char_dict) - 1 - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w') as fout: - for batch_idx, 
batch in enumerate(test_data_loader): - keys, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - if args.mode == 'attention': - hyps, _ = model.recognize( - feats, - feats_lengths, - beam_size=args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp.tolist() for hyp in hyps] - elif args.mode == 'ctc_greedy_search': - hyps, _ = model.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_greedy_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_beam_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.beam_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.search_ctc_weight, - transducer_weight=args.search_transducer_weight) - elif args.mode == 'rnnt_beam_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight) - elif args.mode == 'ctc_beam_td_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight, - beam_search_type='ctc') - # ctc_prefix_beam_search and attention_rescoring only return one - # result in List[int], change it to List[List[int]] for compatible - # with other batch decoding mode - elif args.mode == 'ctc_prefix_beam_search': - assert (feats.size(0) == 1) - hyp, _ = model.ctc_prefix_beam_search( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp] - elif args.mode == 'attention_rescoring': - assert (feats.size(0) == 1) - hyp, _ = model.attention_rescoring( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - 
ctc_weight=args.ctc_weight, - simulate_streaming=args.simulate_streaming, - reverse_weight=args.reverse_weight) - hyps = [hyp] - elif args.mode == 'hlg_onebest': - hyps = model.hlg_onebest( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - elif args.mode == 'hlg_rescore': - hyps = model.hlg_rescore( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - lm_scale=args.lm_scale, - decoder_scale=args.decoder_scale, - r_decoder_scale=args.r_decoder_scale, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - for i, key in enumerate(keys): - content = [] - for w in hyps[i]: - if w == eos: - break - content.append(char_dict[w]) - logging.info('{} {}'.format(key, args.connect_symbol.join(content))) - fout.write('{} {}\n'.format(key, args.connect_symbol.join(content))) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/recognize_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/recognize_onnx_gpu.py deleted file mode 100644 index 42f403bf55ac0bc51d9c754574d3479345948122..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/recognize_onnx_gpu.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is for testing exported onnx encoder and decoder from -export_onnx_gpu.py. The exported onnx models only support batch offline ASR inference. -It requires a python wrapped c++ ctc decoder. 
-Please install it by following: -https://github.com/Slyne/ctc_decoder.git -""" -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.common import IGNORE_ID -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.config import override_config - -import onnxruntime as rt -import multiprocessing -import numpy as np - -try: - from swig_decoders import map_batch, \ - ctc_beam_search_decoder_batch, \ - TrieVector, PathTrie -except ImportError: - print('Please install ctc decoders first by refering to\n' + - 'https://github.com/Slyne/ctc_decoder.git') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--encoder_onnx', required=True, help='encoder onnx file') - parser.add_argument('--decoder_onnx', required=True, help='decoder onnx file') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=32, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'ctc_greedy_search', 'ctc_prefix_beam_search', - 'attention_rescoring'], - default='attention_rescoring', - help='decoding mode') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - reverse_weight = configs["model_conf"].get("reverse_weight", 0.0) - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - test_conf['fbank_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - 
- # Init asr model from configs - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - if use_cuda: - EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - else: - EP_list = ['CPUExecutionProvider'] - - encoder_ort_session = rt.InferenceSession(args.encoder_onnx, providers=EP_list) - decoder_ort_session = None - if args.mode == "attention_rescoring": - decoder_ort_session = rt.InferenceSession(args.decoder_onnx, providers=EP_list) - - # Load dict - vocabulary = [] - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - vocabulary.append(arr[0]) - eos = sos = len(char_dict) - 1 - with torch.no_grad(), open(args.result_file, 'w') as fout: - for _, batch in enumerate(test_data_loader): - keys, feats, _, feats_lengths, _ = batch - feats, feats_lengths = feats.numpy(), feats_lengths.numpy() - if args.fp16: - feats = feats.astype(np.float16) - ort_inputs = { - encoder_ort_session.get_inputs()[0].name: feats, - encoder_ort_session.get_inputs()[1].name: feats_lengths} - ort_outs = encoder_ort_session.run(None, ort_inputs) - encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx = ort_outs - beam_size = beam_log_probs.shape[-1] - batch_size = beam_log_probs.shape[0] - num_processes = min(multiprocessing.cpu_count(), batch_size) - if args.mode == 'ctc_greedy_search': - if beam_size != 1: - log_probs_idx = beam_log_probs_idx[:, :, 0] - batch_sents = [] - for idx, seq in enumerate(log_probs_idx): - batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) - hyps = map_batch(batch_sents, vocabulary, num_processes, - True, 0) - elif args.mode in ('ctc_prefix_beam_search', "attention_rescoring"): - batch_log_probs_seq_list = beam_log_probs.tolist() - batch_log_probs_idx_list = beam_log_probs_idx.tolist() - batch_len_list = encoder_out_lens.tolist() - batch_log_probs_seq = [] - batch_log_probs_ids = [] - batch_start = [] # only effective in streaming deployment - batch_root = TrieVector() - root_dict = {} - for i in range(len(batch_len_list)): - num_sent = batch_len_list[i] - batch_log_probs_seq.append( - batch_log_probs_seq_list[i][0:num_sent]) - batch_log_probs_ids.append( - batch_log_probs_idx_list[i][0:num_sent]) - root_dict[i] = PathTrie() - batch_root.append(root_dict[i]) - batch_start.append(True) - score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq, - batch_log_probs_ids, - batch_root, - batch_start, - beam_size, - num_processes, - 0, -2, 0.99999) - if args.mode == 'ctc_prefix_beam_search': - hyps = [] - for cand_hyps in score_hyps: - hyps.append(cand_hyps[0][1]) - hyps = map_batch(hyps, vocabulary, num_processes, False, 0) - if args.mode == 'attention_rescoring': - ctc_score, all_hyps = [], [] - max_len = 0 - for hyps in score_hyps: - cur_len = len(hyps) - if len(hyps) < beam_size: - hyps += (beam_size - cur_len) * [(-float("INF"), (0,))] - cur_ctc_score = [] - for hyp in hyps: - cur_ctc_score.append(hyp[0]) - all_hyps.append(list(hyp[1])) - if len(hyp[1]) > max_len: - max_len = len(hyp[1]) - ctc_score.append(cur_ctc_score) - if args.fp16: - ctc_score = np.array(ctc_score, dtype=np.float16) - else: - ctc_score = np.array(ctc_score, dtype=np.float32) - hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - r_hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32) - k = 0 - for i in 
range(batch_size): - for j in range(beam_size): - cand = all_hyps[k] - l = len(cand) + 2 - hyps_pad_sos_eos[i][j][0:l] = [sos] + cand + [eos] - r_hyps_pad_sos_eos[i][j][0:l] = [sos] + cand[::-1] + [eos] - hyps_lens_sos[i][j] = len(cand) + 1 - k += 1 - decoder_ort_inputs = { - decoder_ort_session.get_inputs()[0].name: encoder_out, - decoder_ort_session.get_inputs()[1].name: encoder_out_lens, - decoder_ort_session.get_inputs()[2].name: hyps_pad_sos_eos, - decoder_ort_session.get_inputs()[3].name: hyps_lens_sos, - decoder_ort_session.get_inputs()[-1].name: ctc_score} - if reverse_weight > 0: - r_hyps_pad_sos_eos_name = decoder_ort_session.get_inputs()[4].name - decoder_ort_inputs[r_hyps_pad_sos_eos_name] = r_hyps_pad_sos_eos - best_index = decoder_ort_session.run(None, decoder_ort_inputs)[0] - best_sents = [] - k = 0 - for idx in best_index: - cur_best_sent = all_hyps[k: k + beam_size][idx] - best_sents.append(cur_best_sent) - k += beam_size - hyps = map_batch(best_sents, vocabulary, num_processes) - - for i, key in enumerate(keys): - content = hyps[i] - logging.info('{} {}'.format(key, content)) - fout.write('{} {}\n'.format(key, content)) - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/train.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/train.py deleted file mode 100644 index 70799b60790b31d73911770891f519f5473e2f4b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/bin/train.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os - -import torch -import torch.distributed as dist -import torch.optim as optim -import yaml -from tensorboardX import SummaryWriter -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import (load_checkpoint, save_checkpoint, - load_trained_modules) -from wenet.utils.executor import Executor -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.scheduler import WarmupLR, NoamHoldAnnealing -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--train_data', required=True, help='train data file') - parser.add_argument('--cv_data', required=True, help='cv data file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--model_dir', required=True, help='save model dir') - parser.add_argument('--checkpoint', help='checkpoint model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='tensorboard log dir') - parser.add_argument('--ddp.rank', - dest='rank', - default=0, - type=int, - help='global rank for distributed training') - parser.add_argument('--ddp.world_size', - dest='world_size', - default=-1, - type=int, - help='''number of total processes/gpus for - distributed training''') - parser.add_argument('--ddp.dist_backend', - dest='dist_backend', - default='nccl', - choices=['nccl', 'gloo'], - help='distributed backend') - parser.add_argument('--ddp.init_method', - dest='init_method', - default=None, - help='ddp init method') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for reading') - parser.add_argument('--pin_memory', - action='store_true', - default=False, - help='Use pinned memory buffers used for reading') - parser.add_argument('--use_amp', - action='store_true', - default=False, - help='Use automatic mixed precision training') - parser.add_argument('--fp16_grad_sync', - action='store_true', - default=False, - help='Use fp16 gradient sync for ddp') - parser.add_argument('--cmvn', default=None, help='global cmvn file') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. 
One symbol per line.") - parser.add_argument('--prefetch', - default=100, - type=int, - help='prefetch number') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument("--enc_init", - default=None, - type=str, - help="Pre-trained model to initialize encoder") - parser.add_argument("--enc_init_mods", - default="encoder.", - type=lambda s: [str(mod) for mod in s.split(",") if s != ""], - help="List of encoder modules \ - to initialize ,separated by a comma") - - - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Set random seed - torch.manual_seed(777) - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - distributed = args.world_size > 1 - if distributed: - logging.info('training on multiple gpus, this gpu {}'.format(args.gpu)) - dist.init_process_group(args.dist_backend, - init_method=args.init_method, - world_size=args.world_size, - rank=args.rank) - - symbol_table = read_symbol_table(args.symbol_table) - - train_conf = configs['dataset_conf'] - cv_conf = copy.deepcopy(train_conf) - cv_conf['speed_perturb'] = False - cv_conf['spec_aug'] = False - cv_conf['spec_sub'] = False - cv_conf['spec_trim'] = False - cv_conf['shuffle'] = False - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - train_dataset = Dataset(args.data_type, args.train_data, symbol_table, - train_conf, args.bpe_model, non_lang_syms, True) - cv_dataset = Dataset(args.data_type, - args.cv_data, - symbol_table, - cv_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - train_data_loader = DataLoader(train_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - cv_data_loader = DataLoader(cv_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - - if 'fbank_conf' in configs['dataset_conf']: - input_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - else: - input_dim = configs['dataset_conf']['mfcc_conf']['num_mel_bins'] - vocab_size = len(symbol_table) - - # Save configs to model_dir/train.yaml for inference and export - configs['input_dim'] = input_dim - configs['output_dim'] = vocab_size - configs['cmvn_file'] = args.cmvn - configs['is_json_cmvn'] = True - if args.rank == 0: - saved_config_path = os.path.join(args.model_dir, 'train.yaml') - with open(saved_config_path, 'w') as fout: - data = yaml.dump(configs) - fout.write(data) - - # Init asr model from configs - model = init_model(configs) - print(model) - num_params = sum(p.numel() for p in model.parameters()) - print('the number of model params: {:,d}'.format(num_params)) - - # !!!IMPORTANT!!! 
- # Try to export the model by script, if fails, we should refine - # the code to satisfy the script export requirements - if args.rank == 0: - script_model = torch.jit.script(model) - script_model.save(os.path.join(args.model_dir, 'init.zip')) - executor = Executor() - # If specify checkpoint, load some info from checkpoint - if args.checkpoint is not None: - infos = load_checkpoint(model, args.checkpoint) - elif args.enc_init is not None: - logging.info('load pretrained encoders: {}'.format(args.enc_init)) - infos = load_trained_modules(model, args) - else: - infos = {} - start_epoch = infos.get('epoch', -1) + 1 - cv_loss = infos.get('cv_loss', 0.0) - step = infos.get('step', -1) - - num_epochs = configs.get('max_epoch', 100) - model_dir = args.model_dir - writer = None - if args.rank == 0: - os.makedirs(model_dir, exist_ok=True) - exp_id = os.path.basename(model_dir) - writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id)) - - if distributed: - assert (torch.cuda.is_available()) - # cuda model is required for nn.parallel.DistributedDataParallel - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, find_unused_parameters=True) - device = torch.device("cuda") - if args.fp16_grad_sync: - from torch.distributed.algorithms.ddp_comm_hooks import ( - default as comm_hooks, - ) - model.register_comm_hook( - state=None, hook=comm_hooks.fp16_compress_hook - ) - else: - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - if configs['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['optim_conf']) - elif configs['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['optim']) - if configs['scheduler'] == 'warmuplr': - scheduler = WarmupLR(optimizer, **configs['scheduler_conf']) - elif configs['scheduler'] == 'NoamHoldAnnealing': - scheduler = NoamHoldAnnealing(optimizer, **configs['scheduler_conf']) - else: - raise ValueError("unknown scheduler: " + configs['scheduler']) - - final_epoch = None - configs['rank'] = args.rank - configs['is_distributed'] = distributed - configs['use_amp'] = args.use_amp - if start_epoch == 0 and args.rank == 0: - save_model_path = os.path.join(model_dir, 'init.pt') - save_checkpoint(model, save_model_path) - - # Start training loop - executor.step = step - scheduler.set_step(step) - # used for pytorch amp mixed precision training - scaler = None - if args.use_amp: - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(start_epoch, num_epochs): - train_dataset.set_epoch(epoch) - configs['epoch'] = epoch - lr = optimizer.param_groups[0]['lr'] - logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr)) - executor.train(model, optimizer, scheduler, train_data_loader, device, - writer, configs, scaler) - total_loss, num_seen_utts = executor.cv(model, cv_data_loader, device, - configs) - cv_loss = total_loss / num_seen_utts - - logging.info('Epoch {} CV info cv_loss {}'.format(epoch, cv_loss)) - if args.rank == 0: - save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch)) - save_checkpoint( - model, save_model_path, { - 'epoch': epoch, - 'lr': lr, - 'cv_loss': cv_loss, - 'step': executor.step - }) - writer.add_scalar('epoch/cv_loss', cv_loss, epoch) - writer.add_scalar('epoch/lr', lr, epoch) - final_epoch = epoch - - if final_epoch is not None and args.rank == 0: - final_model_path = os.path.join(model_dir, 'final.pt') 
- os.remove(final_model_path) if os.path.exists(final_model_path) else None - os.symlink('{}.pt'.format(final_epoch), final_model_path) - writer.close() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/dataset/dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/dataset/dataset.py deleted file mode 100644 index 6d799b5b5aea2d34546484b3fed5d45e2d5b6aa6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/dataset/dataset.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import torch -import torch.distributed as dist -from torch.utils.data import IterableDataset - -import wenet.dataset.processor as processor -from wenet.utils.file_utils import read_lists - - -class Processor(IterableDataset): - def __init__(self, source, f, *args, **kw): - assert callable(f) - self.source = source - self.f = f - self.args = args - self.kw = kw - - def set_epoch(self, epoch): - self.source.set_epoch(epoch) - - def __iter__(self): - """ Return an iterator over the source dataset processed by the - given processor. 
- """ - assert self.source is not None - assert callable(self.f) - return self.f(iter(self.source), *self.args, **self.kw) - - def apply(self, f): - assert callable(f) - return Processor(self, f, *self.args, **self.kw) - - -class DistributedSampler: - def __init__(self, shuffle=True, partition=True): - self.epoch = -1 - self.update() - self.shuffle = shuffle - self.partition = partition - - def update(self): - assert dist.is_available() - if dist.is_initialized(): - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() - else: - self.rank = 0 - self.world_size = 1 - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: - self.worker_id = 0 - self.num_workers = 1 - else: - self.worker_id = worker_info.id - self.num_workers = worker_info.num_workers - return dict(rank=self.rank, - world_size=self.world_size, - worker_id=self.worker_id, - num_workers=self.num_workers) - - def set_epoch(self, epoch): - self.epoch = epoch - - def sample(self, data): - """ Sample data according to rank/world_size/num_workers - - Args: - data(List): input data list - - Returns: - List: data list after sample - """ - data = list(range(len(data))) - # TODO(Binbin Zhang): fix this - # We can not handle uneven data for CV on DDP, so we don't - # sample data by rank, that means every GPU gets the same - # and all the CV data - if self.partition: - if self.shuffle: - random.Random(self.epoch).shuffle(data) - data = data[self.rank::self.world_size] - data = data[self.worker_id::self.num_workers] - return data - - -class DataList(IterableDataset): - def __init__(self, lists, shuffle=True, partition=True): - self.lists = lists - self.sampler = DistributedSampler(shuffle, partition) - - def set_epoch(self, epoch): - self.sampler.set_epoch(epoch) - - def __iter__(self): - sampler_info = self.sampler.update() - indexes = self.sampler.sample(self.lists) - for index in indexes: - # yield dict(src=src) - data = dict(src=self.lists[index]) - data.update(sampler_info) - yield data - - -def Dataset(data_type, - data_list_file, - symbol_table, - conf, - bpe_model=None, - non_lang_syms=None, - partition=True): - """ Construct dataset from arguments - - We have two shuffle stage in the Dataset. The first is global - shuffle at shards tar/raw file level. The second is global shuffle - at training samples level. 
- - Args: - data_type(str): raw/shard - bpe_model(str): model for english bpe part - partition(bool): whether to do data partition in terms of rank - """ - assert data_type in ['raw', 'shard'] - lists = read_lists(data_list_file) - shuffle = conf.get('shuffle', True) - dataset = DataList(lists, shuffle=shuffle, partition=partition) - if data_type == 'shard': - dataset = Processor(dataset, processor.url_opener) - dataset = Processor(dataset, processor.tar_file_and_group) - else: - dataset = Processor(dataset, processor.parse_raw) - - dataset = Processor(dataset, processor.tokenize, symbol_table, bpe_model, - non_lang_syms, conf.get('split_with_space', False)) - filter_conf = conf.get('filter_conf', {}) - dataset = Processor(dataset, processor.filter, **filter_conf) - - resample_conf = conf.get('resample_conf', {}) - dataset = Processor(dataset, processor.resample, **resample_conf) - - speed_perturb = conf.get('speed_perturb', False) - if speed_perturb: - dataset = Processor(dataset, processor.speed_perturb) - - feats_type = conf.get('feats_type', 'fbank') - assert feats_type in ['fbank', 'mfcc'] - if feats_type == 'fbank': - fbank_conf = conf.get('fbank_conf', {}) - dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) - elif feats_type == 'mfcc': - mfcc_conf = conf.get('mfcc_conf', {}) - dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf) - - spec_aug = conf.get('spec_aug', True) - spec_sub = conf.get('spec_sub', False) - spec_trim = conf.get('spec_trim', False) - if spec_aug: - spec_aug_conf = conf.get('spec_aug_conf', {}) - dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) - if spec_sub: - spec_sub_conf = conf.get('spec_sub_conf', {}) - dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf) - if spec_trim: - spec_trim_conf = conf.get('spec_trim_conf', {}) - dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf) - - if shuffle: - shuffle_conf = conf.get('shuffle_conf', {}) - dataset = Processor(dataset, processor.shuffle, **shuffle_conf) - - sort = conf.get('sort', True) - if sort: - sort_conf = conf.get('sort_conf', {}) - dataset = Processor(dataset, processor.sort, **sort_conf) - - batch_conf = conf.get('batch_conf', {}) - dataset = Processor(dataset, processor.batch, **batch_conf) - dataset = Processor(dataset, processor.padding) - return dataset diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/dataset/kaldi_io.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/dataset/kaldi_io.py deleted file mode 100644 index c9bef293c93d882147bb5b738e1fc49a7a19a484..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/dataset/kaldi_io.py +++ /dev/null @@ -1,666 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! 
To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. - """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. 
- """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int_scp(file_or_fd): - """ generator(key,vec) = read_vec_int_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_int_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:vec for key,mat in kaldi_io.read_vec_int_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_int(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! 
- if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Mapping for percentiles in col-headers, - def uint16_to_float(value, min, range): - return np.float32(min + range * 1.52590218966964e-05 * value) - - # Mapping for matrix elements, - def uint8_to_float_v2(vec, p0, p25, p75, p100): - # Split the vector by masks, - mask_0_64 = (vec <= 64); - mask_193_255 = (vec > 192); - mask_65_192 = (~(mask_0_64 | mask_193_255)); - # Sanity check (useful but slow...), - # assert(len(vec) == np.sum(np.hstack([mask_0_64,mask_65_192,mask_193_255]))) - # assert(len(vec) == np.sum(np.any([mask_0_64,mask_65_192,mask_193_255], axis=0))) - # Build the float vector, - ans = np.empty(len(vec), dtype='float32') - ans[mask_0_64] = p0 + (p25 - p0) / 64. * vec[mask_0_64] - ans[mask_65_192] = p25 + (p75 - p25) / 128. * (vec[mask_65_192] - 64) - ans[mask_193_255] = p75 + (p100 - p75) / 63. * (vec[mask_193_255] - 192) - return ans - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.empty((cols,rows), dtype='float32') - for i, col_header in enumerate(col_headers): - col_header_flt = [ uint16_to_float(percentile, globmin, globrange) for percentile in col_header ] - mat[i] = uint8_to_float_v2(data[i], *col_header_flt) - - return mat.T # transpose! col-major -> row-major, - -def write_ark_scp(key, mat, ark_fout, scp_out): - mat_offset = write_mat(ark_fout, mat, key) - scp_line = '{}\t{}:{}'.format(key, ark_fout.name, mat_offset) - scp_out.write(scp_line) - scp_out.write('\n') - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. 
- - Example of writing single matrix: - kaldi_io.write_mat(filename, mat) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,mat in dict.iteritems(): - kaldi_io.write_mat(f, mat, key=key) - """ - mat_offset = 0 - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - mat_offset = fd.tell() - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if m.dtype == 'float32': fd.write('FM '.encode()) - elif m.dtype == 'float64': fd.write('DM '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) - # Dims, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols - # Data, - fd.write(m.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - return mat_offset - -################################################# -# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) -# Corresponds to: vector > > -# - outer vector: time axis -# - inner vector: records at the time -# - tuple: int = index, float = value -# - -def read_cnet_ark(file_or_fd): - """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ - return read_post_ark(file_or_fd) - -def read_post_ark(file_or_fd): - """ generator(key,vec>) = read_post_ark(file) - Returns generator of (key,posterior) tuples, read from ark file. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Iterate the ark: - for key,post in kaldi_io.read_post_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - post = read_post(fd) - yield key, post - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_post(file_or_fd): - """ [post] = read_post(file_or_fd) - Reads single kaldi 'Posterior' in binary format. - - The 'Posterior' is C++ type 'vector > >', - the outer-vector is usually time axis, inner-vector are the records - at given time, and the tuple is composed of an 'index' (integer) - and a 'float-value'. The 'float-value' can represent a probability - or any other numeric value. - - Returns vector of vectors of tuples. - """ - fd = open_or_fd(file_or_fd) - ans=[] - binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - # Loop over 'outer-vector', - for i in range(outer_vec_size): - assert(fd.read(1).decode() == '\4'); # int-size - inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) - data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) - assert(data[0]['size_idx'] == 4) - assert(data[0]['size_post'] == 4) - ans.append(data[['idx','post']].tolist()) - - if fd is not file_or_fd: fd.close() - return ans - - -################################################# -# Kaldi Confusion Network bin begin/end times, -# (kaldi stores CNs time info separately from the Posterior). 
-# - -def read_cntime_ark(file_or_fd): - """ generator(key,vec>) = read_cntime_ark(file_or_fd) - Returns generator of (key,cntime) tuples, read from ark file. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Iterate the ark: - for key,time in kaldi_io.read_cntime_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:time for key,time in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - cntime = read_cntime(fd) - yield key, cntime - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_cntime(file_or_fd): - """ [cntime] = read_cntime(file_or_fd) - Reads single kaldi 'Confusion Network time info', in binary format: - C++ type: vector >. - (begin/end times of bins at the confusion network). - - Binary layout is ' ...' - - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Returns vector of tuples. - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary - - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) - assert(data[0]['size_beg'] == 4) - assert(data[0]['size_end'] == 4) - ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), - - if fd is not file_or_fd : fd.close() - return ans - - -################################################# -# Segments related, -# - -# Segments as 'Bool vectors' can be handy, -# - for 'superposing' the segmentations, -# - for frame-selection in Speaker-ID experiments, -def read_segments_as_bool_vec(segments_file): - """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) - using kaldi 'segments' file for 1 wav, format : ' ' - - t-beg, t-end is in seconds, - - assumed 100 frames/second, - """ - segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) - # Sanity checks, - assert(len(segs) > 0) # empty segmentation is an error, - assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, - # Convert time to frame-indexes, - start = np.rint([100 * rec[2] for rec in segs]).astype(int) - end = np.rint([100 * rec[3] for rec in segs]).astype(int) - # Taken from 'read_lab_to_bool_vec', htk.py, - frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], - np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) - assert np.sum(end-start) == np.sum(frms) - return frms - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/dataset/processor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/dataset/processor.py deleted file mode 100644 index b4bd07ce674eb3288cd1b13a09085eec48d40845..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/dataset/processor.py +++ /dev/null @@ -1,660 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import json -import random -import re -import tarfile -from subprocess import PIPE, Popen -from urllib.parse import urlparse - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.nn.utils.rnn import pad_sequence - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def url_opener(data): - """ Give url or local file, return file descriptor - Inplace operation. - - Args: - data(Iterable[str]): url or local file list - - Returns: - Iterable[{src, stream}] - """ - for sample in data: - assert 'src' in sample - # TODO(Binbin Zhang): support HTTP - url = sample['src'] - try: - pr = urlparse(url) - # local file - if pr.scheme == '' or pr.scheme == 'file': - stream = open(url, 'rb') - # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP - else: - cmd = f'wget -q -O - {url}' - process = Popen(cmd, shell=True, stdout=PIPE) - sample.update(process=process) - stream = process.stdout - sample.update(stream=stream) - yield sample - except Exception as ex: - logging.warning('Failed to open {}'.format(url)) - - -def tar_file_and_group(data): - """ Expand a stream of open tar files into a stream of tar file contents. - And groups the file with same prefix - - Args: - data: Iterable[{src, stream}] - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'stream' in sample - stream = tarfile.open(fileobj=sample['stream'], mode="r|*") - prev_prefix = None - example = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - example['key'] = prev_prefix - if valid: - yield example - example = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - example['txt'] = file_obj.read().decode('utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load(file_obj) - example['wav'] = waveform - example['sample_rate'] = sample_rate - else: - example[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning('error to parse {}'.format(name)) - prev_prefix = prefix - if prev_prefix is not None: - example['key'] = prev_prefix - yield example - stream.close() - if 'process' in sample: - sample['process'].communicate() - sample['stream'].close() - - -def parse_raw(data): - """ Parse key/wav/txt from json line - - Args: - data: Iterable[str], str is a json line has key/wav/txt - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'src' in sample - json_line = sample['src'] - obj = json.loads(json_line) - assert 'key' in obj - assert 'wav' in obj - assert 'txt' in obj - key = obj['key'] - wav_file = obj['wav'] - txt = obj['txt'] - try: - if 'start' in obj: - assert 'end' in obj - sample_rate = torchaudio.backend.sox_io_backend.info( - wav_file).sample_rate - start_frame = int(obj['start'] * sample_rate) - end_frame = int(obj['end'] * sample_rate) - waveform, _ = torchaudio.backend.sox_io_backend.load( - 
filepath=wav_file, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(wav_file) - example = dict(key=key, - txt=txt, - wav=waveform, - sample_rate=sample_rate) - yield example - except Exception as ex: - logging.warning('Failed to read {}'.format(wav_file)) - - -def filter(data, - max_length=10240, - min_length=10, - token_max_length=200, - token_min_length=1, - min_output_input_ratio=0.0005, - max_output_input_ratio=1): - """ Filter sample according to feature and label length - Inplace operation. - - Args:: - data: Iterable[{key, wav, label, sample_rate}] - max_length: drop utterance which is greater than max_length(10ms) - min_length: drop utterance which is less than min_length(10ms) - token_max_length: drop utterance which is greater than - token_max_length, especially when use char unit for - english modeling - token_min_length: drop utterance which is - less than token_max_length - min_output_input_ratio: minimal ration of - token_length / feats_length(10ms) - max_output_input_ratio: maximum ration of - token_length / feats_length(10ms) - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'label' in sample - # sample['wav'] is torch.Tensor, we have 100 frames every second - num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100 - if num_frames < min_length: - continue - if num_frames > max_length: - continue - if len(sample['label']) < token_min_length: - continue - if len(sample['label']) > token_max_length: - continue - if num_frames != 0: - if len(sample['label']) / num_frames < min_output_input_ratio: - continue - if len(sample['label']) / num_frames > max_output_input_ratio: - continue - yield sample - - -def resample(data, resample_rate=16000): - """ Resample data. - Inplace operation. - - Args: - data: Iterable[{key, wav, label, sample_rate}] - resample_rate: target resample rate - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - if sample_rate != resample_rate: - sample['sample_rate'] = resample_rate - sample['wav'] = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - yield sample - - -def speed_perturb(data, speeds=None): - """ Apply speed perturb to the data. - Inplace operation. 
- - Args: - data: Iterable[{key, wav, label, sample_rate}] - speeds(List[float]): optional speed - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - if speeds is None: - speeds = [0.9, 1.0, 1.1] - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - speed = random.choice(speeds) - if speed != 1.0: - wav, _ = torchaudio.sox_effects.apply_effects_tensor( - waveform, sample_rate, - [['speed', str(speed)], ['rate', str(sample_rate)]]) - sample['wav'] = wav - - yield sample - - -def compute_fbank(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0): - """ Extract fbank - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - energy_floor=0.0, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def compute_mfcc(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0, - num_ceps=40, - high_freq=0.0, - low_freq=20.0): - """ Extract mfcc - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.mfcc(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - num_ceps=num_ceps, - high_freq=high_freq, - low_freq=low_freq, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def __tokenize_by_bpe_model(sp, txt): - tokens = [] - # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - pattern = re.compile(r'([\u4e00-\u9fff])') - # Example: - # txt = "你好 ITS'S OKAY 的" - # chars = ["你", "好", " ITS'S OKAY ", "的"] - chars = pattern.split(txt.upper()) - mix_chars = [w for w in chars if len(w.strip()) > 0] - for ch_or_w in mix_chars: - # ch_or_w is a single CJK charater(i.e., "你"), do nothing. - if pattern.fullmatch(ch_or_w) is not None: - tokens.append(ch_or_w) - # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), - # encode ch_or_w using bpe_model. 
- else: - for p in sp.encode_as_pieces(ch_or_w): - tokens.append(p) - - return tokens - - -def tokenize(data, - symbol_table, - bpe_model=None, - non_lang_syms=None, - split_with_space=False): - """ Decode text to chars or BPE - Inplace operation - - Args: - data: Iterable[{key, wav, txt, sample_rate}] - - Returns: - Iterable[{key, wav, txt, tokens, label, sample_rate}] - """ - if non_lang_syms is not None: - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - else: - non_lang_syms = {} - non_lang_syms_pattern = None - - if bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(bpe_model) - else: - sp = None - - for sample in data: - assert 'txt' in sample - txt = sample['txt'].strip() - if non_lang_syms_pattern is not None: - parts = non_lang_syms_pattern.split(txt.upper()) - parts = [w for w in parts if len(w.strip()) > 0] - else: - parts = [txt] - - label = [] - tokens = [] - for part in parts: - if part in non_lang_syms: - tokens.append(part) - else: - if bpe_model is not None: - tokens.extend(__tokenize_by_bpe_model(sp, part)) - else: - if split_with_space: - part = part.split(" ") - for ch in part: - if ch == ' ': - ch = "▁" - tokens.append(ch) - - for ch in tokens: - if ch in symbol_table: - label.append(symbol_table[ch]) - elif '' in symbol_table: - label.append(symbol_table['']) - - sample['tokens'] = tokens - sample['label'] = label - yield sample - - -def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80): - """ Do spec augmentation - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - num_t_mask: number of time mask to apply - num_f_mask: number of freq mask to apply - max_t: max width of time mask - max_f: max width of freq mask - max_w: max width of time warp - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - max_freq = y.size(1) - # time mask - for i in range(num_t_mask): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - y[start:end, :] = 0 - # freq mask - for i in range(num_f_mask): - start = random.randint(0, max_freq - 1) - length = random.randint(1, max_f) - end = min(max_freq, start + length) - y[:, start:end] = 0 - sample['feat'] = y - yield sample - - -def spec_sub(data, max_t=20, num_t_sub=3): - """ Do spec substitute - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of time substitute - num_t_sub: number of time substitute to apply - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - for i in range(num_t_sub): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - # only substitute the earlier time chosen randomly for current time - pos = random.randint(0, start) - y[start:end, :] = x[start - pos:end - pos, :] - sample['feat'] = y - yield sample - - -def spec_trim(data, max_t=20): - """ Trim tailing frames. Inplace operation. 
- ref: TrimTail [https://arxiv.org/abs/2211.00522] - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of length trimming - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - max_frames = x.size(0) - length = random.randint(1, max_t) - if length < max_frames / 2: - y = x.clone().detach()[:max_frames - length] - sample['feat'] = y - yield sample - - -def shuffle(data, shuffle_size=10000): - """ Local shuffle the data - - Args: - data: Iterable[{key, feat, label}] - shuffle_size: buffer size for shuffle - - Returns: - Iterable[{key, feat, label}] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= shuffle_size: - random.shuffle(buf) - for x in buf: - yield x - buf = [] - # The sample left over - random.shuffle(buf) - for x in buf: - yield x - - -def sort(data, sort_size=500): - """ Sort the data by feature length. - Sort is used after shuffle and before batch, so we can group - utts with similar lengths into a batch, and `sort_size` should - be less than `shuffle_size` - - Args: - data: Iterable[{key, feat, label}] - sort_size: buffer size for sort - - Returns: - Iterable[{key, feat, label}] - """ - - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= sort_size: - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - buf = [] - # The sample left over - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - - -def static_batch(data, batch_size=16): - """ Static batch the data by `batch_size` - - Args: - data: Iterable[{key, feat, label}] - batch_size: batch size - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= batch_size: - yield buf - buf = [] - if len(buf) > 0: - yield buf - - -def dynamic_batch(data, max_frames_in_batch=12000): - """ Dynamic batch the data until the total frames in batch - reach `max_frames_in_batch` - - Args: - data: Iterable[{key, feat, label}] - max_frames_in_batch: max_frames in one batch - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - longest_frames = 0 - for sample in data: - assert 'feat' in sample - assert isinstance(sample['feat'], torch.Tensor) - new_sample_frames = sample['feat'].size(0) - longest_frames = max(longest_frames, new_sample_frames) - frames_after_padding = longest_frames * (len(buf) + 1) - if frames_after_padding > max_frames_in_batch: - yield buf - buf = [sample] - longest_frames = new_sample_frames - else: - buf.append(sample) - if len(buf) > 0: - yield buf - - -def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000): - """ Wrapper for static/dynamic batch - """ - if batch_type == 'static': - return static_batch(data, batch_size) - elif batch_type == 'dynamic': - return dynamic_batch(data, max_frames_in_batch) - else: - logging.fatal('Unsupported batch type {}'.format(batch_type)) - - -def padding(data): - """ Padding the data into training data - - Args: - data: Iterable[List[{key, feat, label}]] - - Returns: - Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] - """ - for sample in data: - assert isinstance(sample, list) - feats_length = torch.tensor([x['feat'].size(0) for x in sample], - dtype=torch.int32) - order = torch.argsort(feats_length, descending=True) - feats_lengths = torch.tensor( - [sample[i]['feat'].size(0) for i in order], dtype=torch.int32) - sorted_feats = [sample[i]['feat'] for i in order] - sorted_keys 
= [sample[i]['key'] for i in order] - sorted_labels = [ - torch.tensor(sample[i]['label'], dtype=torch.int64) for i in order - ] - label_lengths = torch.tensor([x.size(0) for x in sorted_labels], - dtype=torch.int32) - - padded_feats = pad_sequence(sorted_feats, - batch_first=True, - padding_value=0) - - pad = (0, 0, 0, 0) - seq_len= padded_feats.shape[1] - if seq_len < 384: - pad = (0, 0, 0, 384-seq_len) - elif seq_len < 512: - pad = (0, 0, 0, 512-seq_len) - elif seq_len < 640: - pad = (0, 0, 0, 640-seq_len) - elif seq_len < 768: - pad = (0, 0, 0, 768-seq_len) - elif seq_len < 896: - pad = (0, 0, 0, 896-seq_len) - elif seq_len < 1024: - pad = (0, 0, 0, 1024-seq_len) - elif seq_len < 1280: - pad = (0, 0, 0, 1280-seq_len) - padded_feats = torch.nn.functional.pad(padded_feats, pad, mode='constant', value=0) - padding_labels = pad_sequence(sorted_labels, - batch_first=True, - padding_value=-1) - - yield (sorted_keys, padded_feats, padding_labels, feats_lengths, - label_lengths) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/dataset/wav_distortion.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/dataset/wav_distortion.py deleted file mode 100644 index 2917d3cc6cfb801935cb0885d0c42cd88f1833b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/dataset/wav_distortion.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import random -import math - -import torchaudio -import torch -torchaudio.set_audio_backend("sox_io") - - -def db2amp(db): - return pow(10, db / 20) - -def amp2db(amp): - return 20 * math.log10(amp) - -def make_poly_distortion(conf): - """Generate a db-domain ploynomial distortion function - - f(x) = a * x^m * (1-x)^n + x - - Args: - conf: a dict {'a': #int, 'm': #int, 'n': #int} - - Returns: - The ploynomial function, which could be applied on - a float amplitude value - """ - a = conf['a'] - m = conf['m'] - n = conf['n'] - - def poly_distortion(x): - abs_x = abs(x) - if abs_x < 0.000001: - x = x - else: - db_norm = amp2db(abs_x) / 100 + 1 - if db_norm < 0: - db_norm = 0 - db_norm = a * pow(db_norm, m) * pow((1 - db_norm), n) + db_norm - if db_norm > 1: - db_norm = 1 - db = (db_norm - 1) * 100 - amp = db2amp(db) - if amp >= 0.9997: - amp = 0.9997 - if x > 0: - x = amp - else: - x = -amp - return x - return poly_distortion - -def make_quad_distortion(): - return make_poly_distortion({'a' : 1, 'm' : 1, 'n' : 1}) - -# the amplitude are set to max for all non-zero point -def make_max_distortion(conf): - """Generate a max distortion function - - Args: - conf: a dict {'max_db': float } - 'max_db': the maxium value. 
- - Returns: - The max function, which could be applied on - a float amplitude value - """ - max_db = conf['max_db'] - if max_db: - max_amp = db2amp(max_db) # < 0.997 - else: - max_amp = 0.997 - - def max_distortion(x): - if x > 0: - x = max_amp - elif x < 0: - x = -max_amp - else: - x = 0.0 - return x - return max_distortion - - - -def make_amp_mask(db_mask=None): - """Get a amplitude domain mask from db domain mask - - Args: - db_mask: Optional. A list of tuple. if None, using default value. - - Returns: - A list of tuple. The amplitude domain mask - """ - if db_mask is None: - db_mask = [(-110, -95), (-90, -80), (-65, -60), (-50, -30), (-15, 0)] - amp_mask = [(db2amp(db[0]), db2amp(db[1])) for db in db_mask] - return amp_mask - -default_mask = make_amp_mask() - - -def generate_amp_mask(mask_num): - """Generate amplitude domain mask randomly in [-100db, 0db] - - Args: - mask_num: the slot number of the mask - - Returns: - A list of tuple. each tuple defines a slot. - e.g. [(-100, -80), (-65, -60), (-50, -30), (-15, 0)] - for #mask_num = 4 - """ - a = [0] * 2 * mask_num - a[0] = 0 - m = [] - for i in range(1, 2 * mask_num): - a[i] = a[i - 1] + random.uniform(0.5, 1) - max_val = a[2 * mask_num - 1] - for i in range(0, mask_num): - l = ((a[2 * i] - max_val) / max_val) * 100 - r = ((a[2 * i + 1] - max_val) / max_val) * 100 - m.append((l, r)) - return make_amp_mask(m) - - -def make_fence_distortion(conf): - """Generate a fence distortion function - - In this fence-like shape function, the values in mask slots are - set to maxium, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': int,'max_db': float } - 'mask_number': the slot number in mask. - 'max_db': the maxium value. - - Returns: - The fence function, which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - max_db = conf['max_db'] - max_amp = db2amp(max_db) # 0.997 - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def fence_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - return x - - return fence_distortion - -# -def make_jag_distortion(conf): - """Generate a jag distortion function - - In this jag-like shape function, the values in mask slots are - not changed, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': #int} - 'mask_number': the slot number in mask. 
- - Returns: - The jag function,which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def jag_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - return x - - return jag_distortion - -# gaining 20db means amp = amp * 10 -# gaining -20db means amp = amp / 10 -def make_gain_db(conf): - """Generate a db domain gain function - - Args: - conf: a dict {'db': #float} - 'db': the gaining value - - Returns: - The db gain function, which could be applied on - a float amplitude value - """ - db = conf['db'] - - def gain_db(x): - return min(0.997, x * pow(10, db / 20)) - - return gain_db - - -def distort(x, func, rate=0.8): - """Distort a waveform in sample point level - - Args: - x: the origin wavefrom - func: the distort function - rate: sample point-level distort probability - - Returns: - the distorted waveform - """ - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - x[0][i] = func(float(x[0][i])) - return x - -def distort_chain(x, funcs, rate=0.8): - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - for func in funcs: - x[0][i] = func(float(x[0][i])) - return x - -# x is numpy -def distort_wav_conf(x, distort_type, distort_conf, rate=0.1): - if distort_type == 'gain_db': - gain_db = make_gain_db(distort_conf) - x = distort(x, gain_db) - elif distort_type == 'max_distortion': - max_distortion = make_max_distortion(distort_conf) - x = distort(x, max_distortion, rate=rate) - elif distort_type == 'fence_distortion': - fence_distortion = make_fence_distortion(distort_conf) - x = distort(x, fence_distortion, rate=rate) - elif distort_type == 'jag_distortion': - jag_distortion = make_jag_distortion(distort_conf) - x = distort(x, jag_distortion, rate=rate) - elif distort_type == 'poly_distortion': - poly_distortion = make_poly_distortion(distort_conf) - x = distort(x, poly_distortion, rate=rate) - elif distort_type == 'quad_distortion': - quad_distortion = make_quad_distortion() - x = distort(x, quad_distortion, rate=rate) - elif distort_type == 'none_distortion': - pass - else: - print('unsupport type') - return x - -def distort_wav_conf_and_save(distort_type, distort_conf, rate, wav_in, wav_out): - x, sr = torchaudio.load(wav_in) - x = x.detach().numpy() - out = distort_wav_conf(x, distort_type, distort_conf, rate) - torchaudio.save(wav_out, torch.from_numpy(out), sr) - -if __name__ == "__main__": - distort_type = sys.argv[1] - wav_in = sys.argv[2] - wav_out = sys.argv[3] - conf = None - rate = 0.1 - if distort_type == 'new_jag_distortion': - conf = {'mask_number' : 4} - elif distort_type == 'new_fence_distortion': - conf = {'mask_number' : 1, 'max_db' : -30} - elif distort_type == 'poly_distortion': - conf = {'a' : 4, 'm' : 2, "n" : 2} - distort_wav_conf_and_save(distort_type, conf, rate, wav_in, wav_out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/efficient_conformer/attention.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/efficient_conformer/attention.py deleted file mode 100644 index 475131b15af92ffcaf91ad5e2e30d114d4d1a2a3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/efficient_conformer/attention.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple, Optional - -import torch -from torch import nn -import torch.nn.functional as F -from wenet.transformer.attention import MultiHeadedAttention - - -class GroupedRelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: - https://arxiv.org/abs/1901.02860 - https://arxiv.org/abs/2109.01163 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate, group_size=3): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - self.group_size = group_size - self.d_k = n_feat // n_head # for GroupedAttention - self.n_feat = n_feat - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. 
- """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def pad4group(self, Q, K, V, P, mask, group_size: int = 3): - """ - q: (#batch, time1, size) -> (#batch, head, time1, size/head) - k,v: (#batch, time2, size) -> (#batch, head, time2, size/head) - p: (#batch, time2, size) - """ - # Compute Overflows - overflow_Q = Q.size(2) % group_size - overflow_KV = K.size(2) % group_size - - # if-else for ONNX export - # 0 // 0.00000000000000001 = 0 - # 1 // 1.00000000000000001 = 1 - padding_Q = (group_size - overflow_Q) * int( - overflow_Q // (overflow_Q + 0.00000000000000001)) - padding_KV = (group_size - overflow_KV) * int( - overflow_KV // (overflow_KV + 0.00000000000000001)) - - batch_size, _, seq_len_KV, _ = K.size() - - # Input Padding (B, T, D) -> (B, T + P, D) - Q = F.pad(Q, (0, 0, 0, padding_Q), value=0.0) - K = F.pad(K, (0, 0, 0, padding_KV), value=0.0) - V = F.pad(V, (0, 0, 0, padding_KV), value=0.0) - - if mask is not None and mask.size(2) > 0 : # time2 > 0: - mask = mask[:, ::group_size, ::group_size] - - Q = Q.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - K = K.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - V = V.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - # process pos_emb - P_batch_size = P.size(0) - overflow_P = P.size(1) % group_size - padding_P = group_size - overflow_P if overflow_P else 0 - P = F.pad(P, (0, 0, 0, padding_P), value=0.0) - P = P.view(P_batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - return Q, K, V, P, mask, padding_Q - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - padding_q: Optional[int] = None - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - padding_q : for GroupedAttention in efficent conformer - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. 
jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - - # n_feat!=h*d_k may be happened in GroupAttention - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.n_feat) - ) # (batch, time1, d_model) - if padding_q is not None: - # for GroupedAttention in efficent conformer - x = x[:, :x.size(1) - padding_q] - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q = self.linear_q(query) - k = self.linear_k(key) # (#batch, time2, size) - v = self.linear_v(value) - p = self.linear_pos(pos_emb) # (#batch, time2, size) - - batch_size, seq_len_KV, _ = k.size() # seq_len_KV = time2 - - # (#batch, time2, size) -> (#batch, head, time2, size/head) - q = q.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - k = k.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - v = v.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - if cache.size(0) > 0: - # use attention cache - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - new_cache = torch.cat((k, v), dim=-1) - - # May be k and p does not match. eg. time2=18+18/2=27 > mask=36/2=18 - if mask is not None and mask.size(2) > 0: - time2 = mask.size(2) - k = k[:, :, -time2:, :] - v = v[:, :, -time2:, :] - - # q k v p: (batch, head, time1, d_k) - q, k, v, p, mask, padding_q = self.pad4group(q, k, v, p, mask, self.group_size) - - # q_with_bias_u & q_with_bias_v = (batch, head, time1, d_k) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. 
- # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k * self.group_size) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask, padding_q), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/efficient_conformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/efficient_conformer/convolution.py deleted file mode 100644 index 52d6c1c14c0812ab3957a60a135f644833c2ad95..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/efficient_conformer/convolution.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - stride: int = 1): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - stride (int): Stride Convolution, for efficient Conformer - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. 
- # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=stride, # for depthwise_conv in StrideConv - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - self.stride = stride - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - # When export ONNX,the first cache is not None but all-zero, - # cause shape error in residual block, - # eg. cache14 + x9 = 23, 23-7+1=17 != 9 - cache = cache[:, :, -self.lorder:] - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is requried, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - if mask_pad.size(2) != x.size(2): - mask_pad = mask_pad[:, :, ::self.stride] - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/efficient_conformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/efficient_conformer/encoder.py deleted file mode 100644 index dbd37f53cac86be851e2bb194354fd07eb271f11..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/efficient_conformer/encoder.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from EfficientConformer(https://github.com/burchim/EfficientConformer) -# Paper(https://arxiv.org/abs/2109.01163) - -"""Encoder definition.""" -from typing import Tuple, Optional, List, Union - -import torch -import logging -from typeguard import check_argument_types -import torch.nn.functional as F - -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.encoder_layer import ConformerEncoderLayer - -from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 -from wenet.efficient_conformer.convolution import ConvolutionModule -from wenet.efficient_conformer.attention import GroupedRelPositionMultiHeadedAttention -from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer - -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class EfficientConformerEncoder(torch.nn.Module): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - macaron_style: bool = True, - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - stride_layer_idx: Optional[Union[int, List[int]]] = 3, - stride: Optional[Union[int, List[int]]] = 2, - group_layer_idx: Optional[Union[int, List[int], tuple]] = (0, 1, 2, 3), - group_size: int = 3, - stride_kernel: bool = True, - **kwargs - ): - """Construct Efficient Conformer Encoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - macaron_style (bool): Whether to use macaron style for - positionwise layer. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. - stride_layer_idx (list): layer id with StrideConv, start from 0 - stride (list): stride size of each StrideConv in efficient conformer - group_layer_idx (list): layer id with GroupedAttention, start from 0 - group_size (int): group size of every GroupedAttention layer - stride_kernel (bool): default True. True: recompute cnn kernels with stride. 
- """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d2": - subsampling_class = Conv2dSubsampling2 - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - logging.info(f"input_layer = {input_layer}, " - f"subsampling_class = {subsampling_class}") - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - self.input_layer = input_layer - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - activation = get_activation(activation_type) - self.num_blocks = num_blocks - self.attention_heads = attention_heads - self.cnn_module_kernel = cnn_module_kernel - self.global_chunk_size = 0 - self.chunk_feature_map = 0 - - # efficient conformer configs - self.stride_layer_idx = [stride_layer_idx] \ - if type(stride_layer_idx) == int else stride_layer_idx - self.stride = [stride] \ - if type(stride) == int else stride - self.group_layer_idx = [group_layer_idx] \ - if type(group_layer_idx) == int else group_layer_idx - self.grouped_size = group_size # group size of every GroupedAttention layer - - assert len(self.stride) == len(self.stride_layer_idx) - self.cnn_module_kernels = [cnn_module_kernel] # kernel size of each StridedConv - for i in self.stride: - if stride_kernel: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1] // i) - else: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1]) - - logging.info(f"stride_layer_idx= {self.stride_layer_idx}, " - f"stride = {self.stride}, " - f"cnn_module_kernel = {self.cnn_module_kernels}, " - f"group_layer_idx = {self.group_layer_idx}, " - f"grouped_size = {self.grouped_size}") - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - - # encoder definition - index = 0 - layers = [] - for i in range(num_blocks): - # self-attention module definition - if i in self.group_layer_idx: - encoder_selfattn_layer = GroupedRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - self.grouped_size) - else: - if pos_enc_layer_type == "no_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate) - - # conformer module definition - if i in self.stride_layer_idx: - # conformer block with downsampling - convolution_layer_args_stride = ( - output_size, 
self.cnn_module_kernels[index], activation, - cnn_module_norm, causal, True, self.stride[index]) - layers.append(StrideConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_stride) if use_cnn_module else None, - torch.nn.AvgPool1d( - kernel_size=self.stride[index], stride=self.stride[index], - padding=0, ceil_mode=True, - count_include_pad=False), # pointwise_conv_layer - dropout_rate, - normalize_before, - concat_after, - )) - index = index + 1 - else: - # conformer block - convolution_layer_args_normal = ( - output_size, self.cnn_module_kernels[index], activation, - cnn_module_norm, causal) - layers.append(ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_normal) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - )) - - self.encoders = torch.nn.ModuleList(layers) - - def set_global_chunk_size(self, chunk_size): - """Used in ONNX export. - """ - logging.info(f"set global chunk size: {chunk_size}, default is 0.") - self.global_chunk_size = chunk_size - if self.embed.subsampling_rate == 2: - self.chunk_feature_map = 2 * self.global_chunk_size + 1 - elif self.embed.subsampling_rate == 6: - self.chunk_feature_map = 6 * self.global_chunk_size + 5 - elif self.embed.subsampling_rate == 8: - self.chunk_feature_map = 8 * self.global_chunk_size + 7 - else: - self.chunk_feature_map = 4 * self.global_chunk_size + 3 - - def output_size(self) -> int: - return self._output_size - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - index = 0 # traverse stride - for i, layer in enumerate(self.encoders): - # layer return : x, mask, new_att_cache, new_cnn_cache - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if i in self.stride_layer_idx: - masks = masks[:, :, ::self.stride[index]] - chunk_masks = chunk_masks[:, ::self.stride[index], - ::self.stride[index]] - mask_pad = masks - pos_emb = pos_emb[:, ::self.stride[index], :] - index = index + 1 - - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask : mask matrix of self attention - - Returns: - torch.Tensor: output of current input xs - torch.Tensor: subsampling cache required for next chunk computation - List[torch.Tensor]: encoder layers output cache required for next - chunk computation - List[torch.Tensor]: conformer cnn cache - - """ - assert xs.size(0) == 1 - - # using downsampling factor to recover offset - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - chunk_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - chunk_masks = chunk_masks.unsqueeze(1) # (1, 1, xs-time) - - real_len = 0 - if self.global_chunk_size > 0: - # for ONNX decode simulation, padding xs to chunk_size - real_len = xs.size(1) - pad_len = self.chunk_feature_map - real_len - xs = F.pad(xs, (0, 0, 0, pad_len), value=0.0) - chunk_masks = F.pad(chunk_masks, (0, pad_len), value=0.0) - - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, chunk_masks = self.embed(xs, chunk_masks, offset) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - # shape(pos_emb) = (b=1, chunk_size, emb_size=output_size=hidden-dim) - - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) # batchPad (b=1, 1, time=chunk_size) - - if self.global_chunk_size > 0: - # for ONNX decode simulation - pos_emb = self.embed.position_encoding( - offset=max(offset - cache_t1, 0), - size=cache_t1 + self.global_chunk_size) - att_mask[:, :, -self.global_chunk_size:] = chunk_masks - mask_pad = chunk_masks.to(torch.bool) - else: - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(2) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - att_cache_trunc = xs.size(1) + \ - att_cache.size(2) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i:i + 1, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # 
shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [1, batch, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(0) - - # use repeat_interleave to new_att_cache - new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :]) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.global_chunk_size > 0 and real_len: - chunk_real_len = real_len // self.embed.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - # Keeping 1 more timestep can mitigate information leakage - # from the encoder caused by the padding - xs = xs[:, :chunk_real_len + 1, :] - - return xs, r_att_cache, r_cnn_cache - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - use_onnx=False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. - Args: - xs (torch.Tensor): (1, max_len, dim) - decoding_chunk_size (int): decoding chunk size - num_decoding_left_chunks (int): - use_onnx (bool): True for simulating ONNX model inference. 
- """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if use_onnx: - logging.info("Simulating for ONNX runtime ...") - att_cache: torch.Tensor = torch.zeros( - (self.num_blocks, self.attention_heads, required_cache_size, - self.output_size() // self.attention_heads * 2), - device=xs.device) - cnn_cache: torch.Tensor = torch.zeros( - (self.num_blocks, 1, self.output_size(), self.cnn_module_kernel - 1), - device=xs.device) - self.set_global_chunk_size(chunk_size=decoding_chunk_size) - else: - logging.info("Simulating for JIT runtime ...") - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - logging.info(f"-->> frame chunk msg: cur={cur}, " - f"end={end}, num_frames={end-cur}, " - f"decoding_window={decoding_window}") - if use_onnx: - att_mask: torch.Tensor = torch.ones( - (1, 1, required_cache_size + decoding_chunk_size), - dtype=torch.bool, device=xs.device) - if cur == 0: - att_mask[:, :, :required_cache_size] = 0 - else: - att_mask: torch.Tensor = torch.ones( - (0, 0, 0), dtype=torch.bool, device=xs.device) - - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - outputs.append(y) - offset += y.size(1) - - ys = torch.cat(outputs, 1) - masks = torch.ones(1, 1, ys.size(1), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/efficient_conformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/efficient_conformer/encoder_layer.py deleted file mode 100644 index 3a88ec9fca9797664ce89566e6c1d28a8f0ad5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/efficient_conformer/encoder_layer.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple -import torch -from torch import nn - - -class StrideConformerEncoderLayer(nn.Module): - """Encoder layer module. 
- Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - pointwise_conv_layer: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.pointwise_conv_layer = pointwise_conv_layer - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - - # add pointwise_conv for efficient conformer - # pointwise_conv_layer does not change shape - if self.pointwise_conv_layer is not None: - residual = residual.transpose(1, 2) - residual = self.pointwise_conv_layer(residual) - residual = residual.transpose(1, 2) - assert residual.size(0) == x.size(0) - assert residual.size(1) == x.size(1) - assert residual.size(2) == x.size(2) - - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/efficient_conformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/efficient_conformer/subsampling.py deleted file mode 100644 index 98b2c2228eac8e77586110686c48a7b0141458c9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/efficient_conformer/subsampling.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch -from wenet.transformer.subsampling import BaseSubsampling - - -class Conv2dSubsampling2(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU() - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * ((idim - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 2 - # 2 = (3 - 1) * 1 - self.right_context = 2 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 2. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 2. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, :-2:2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/squeezeformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/squeezeformer/attention.py deleted file mode 100644 index 97412badbe8e2c5caec81c0636d15be3f80d6b84..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/squeezeformer/attention.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 Ximalaya Inc. (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -import torch -import torch.nn as nn -from wenet.transformer.attention import MultiHeadedAttention -from typing import Tuple - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- """ - - def __init__(self, n_head, n_feat, dropout_rate, - do_rel_shift=False, adaptive_scale=False, init_weights=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.do_rel_shift = do_rel_shift - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - self.adaptive_scale = adaptive_scale - self.ada_scale = nn.Parameter( - torch.ones([1, 1, n_feat]), requires_grad=adaptive_scale) - self.ada_bias = nn.Parameter( - torch.zeros([1, 1, n_feat]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - input_max = (self.h * self.d_k) ** -0.5 - torch.nn.init.uniform_(self.linear_q.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_q.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_pos.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.bias, -input_max, input_max) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. 
pytorch training - if mask.size(2) > 0: # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - # (batch, head, time1, time2) - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - if self.adaptive_scale: - query = self.ada_scale * query + self.ada_bias - key = self.ada_scale * key + self.ada_bias - value = self.ada_scale * value + self.ada_bias - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - if self.do_rel_shift: - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/squeezeformer/conv2d.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/squeezeformer/conv2d.py deleted file mode 100644 index c230263396392d72f36c56d645338f2d576db898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/squeezeformer/conv2d.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Conv2d Module with Valid Padding""" - -import torch.nn.functional as F -from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional - - -class Conv2dValid(_ConvNd): - """ - Conv2d operator for VALID mode padding. 
- """ - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', # TODO: refine this type - device=None, - dtype=None, - valid_trigx: bool = False, - valid_trigy: bool = False - ) -> None: - factory_kwargs = {'device': device, 'dtype': dtype} - kernel_size_ = _pair(kernel_size) - stride_ = _pair(stride) - padding_ = padding if isinstance(padding, str) else _pair(padding) - dilation_ = _pair(dilation) - super(Conv2dValid, self).__init__( - in_channels, out_channels, kernel_size_, - stride_, padding_, dilation_, False, _pair(0), - groups, bias, padding_mode, **factory_kwargs) - self.valid_trigx = valid_trigx - self.valid_trigy = valid_trigy - - def _conv_forward( - self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): - validx, validy = 0, 0 - if self.valid_trigx: - validx = (input.size(-2) * (self.stride[-2] - 1) - 1 - + self.kernel_size[-2]) // 2 - if self.valid_trigy: - validy = (input.size(-1) * (self.stride[-1] - 1) - 1 - + self.kernel_size[-1]) // 2 - return F.conv2d(input, weight, bias, self.stride, - (validx, validy), self.dilation, self.groups) - - def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight, self.bias) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/squeezeformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/squeezeformer/convolution.py deleted file mode 100644 index 6da2ee8c98ed58fae66d66c892041037f0d6bc3a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/squeezeformer/convolution.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. 
- causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - self.bias = bias - self.channels = channels - self.kernel_size = kernel_size - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, channels]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, channels]), requires_grad=adaptive_scale) - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - if init_weights: - self.init_weights() - - def init_weights(self): - pw_max = self.channels ** -0.5 - dw_max = self.kernel_size ** -0.5 - torch.nn.init.uniform_(self.pointwise_conv1.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv1.bias.data, -pw_max, pw_max) - torch.nn.init.uniform_(self.depthwise_conv.weight.data, -dw_max, dw_max) - if self.bias: - torch.nn.init.uniform_(self.depthwise_conv.bias.data, -dw_max, dw_max) - torch.nn.init.uniform_(self.pointwise_conv2.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv2.bias.data, -pw_max, pw_max) - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - if self.adaptive_scale: - x = self.ada_scale * x + self.ada_bias - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/squeezeformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/squeezeformer/encoder.py deleted file mode 100644 index f13038321ae6c07d484a617aee7d83ed07742510..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/squeezeformer/encoder.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -import torch -import torch.nn as nn -from typing import Tuple, Union, Optional, List -from wenet.squeezeformer.subsampling \ - import DepthwiseConv2dSubsampling4, TimeReductionLayer1D, \ - TimeReductionLayer2D, TimeReductionLayerStream -from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.attention import MultiHeadedAttention -from wenet.squeezeformer.attention import RelPositionMultiHeadedAttention -from wenet.squeezeformer.positionwise_feed_forward \ - import PositionwiseFeedForward -from wenet.squeezeformer.convolution import ConvolutionModule -from wenet.utils.mask import make_pad_mask, add_optional_chunk_mask -from wenet.utils.common import get_activation - - -class SqueezeformerEncoder(nn.Module): - def __init__( - self, - input_size: int = 80, - encoder_dim: int = 256, - output_size: int = 256, - attention_heads: int = 4, - num_blocks: int = 12, - reduce_idx: Optional[Union[int, List[int]]] = 5, - recover_idx: Optional[Union[int, List[int]]] = 11, - feed_forward_expansion_factor: int = 4, - dw_stride: bool = False, - input_dropout_rate: float = 0.1, - pos_enc_layer_type: str = "rel_pos", - time_reduction_layer_type: str = "conv1d", - do_rel_shift: bool = True, - feed_forward_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.1, - cnn_module_kernel: int = 31, - cnn_norm_type: str = "batch_norm", - dropout: float = 0.1, - causal: bool = False, - adaptive_scale: bool = True, - activation_type: str = "swish", - init_weights: bool = True, - global_cmvn: torch.nn.Module = None, - normalize_before: bool = False, - use_dynamic_chunk: bool = False, - concat_after: bool = 
False, - static_chunk_size: int = 0, - use_dynamic_left_chunk: bool = False - ): - """Construct SqueezeformerEncoder - - Args: - input_size to use_dynamic_chunk, see in Transformer BaseEncoder. - encoder_dim (int): The hidden dimension of encoder layer. - output_size (int): The output dimension of final projection layer. - attention_heads (int): Num of attention head in attention module. - num_blocks (int): Num of encoder layers. - reduce_idx Optional[Union[int, List[int]]]: - reduce layer index, from 40ms to 80ms per frame. - recover_idx Optional[Union[int, List[int]]]: - recover layer index, from 80ms to 40ms per frame. - feed_forward_expansion_factor (int): Enlarge coefficient of FFN. - dw_stride (bool): Whether do depthwise convolution - on subsampling module. - input_dropout_rate (float): Dropout rate of input projection layer. - pos_enc_layer_type (str): Self attention type. - time_reduction_layer_type (str): Conv1d or Conv2d reduction layer. - do_rel_shift (bool): Whether to do relative shift - operation on rel-attention module. - cnn_module_kernel (int): Kernel size of CNN module. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - adaptive_scale (bool): Whether to use adaptive scale. - init_weights (bool): Whether to initialize weights. - causal (bool): whether to use causal convolution or not. - """ - super(SqueezeformerEncoder, self).__init__() - self.global_cmvn = global_cmvn - self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ - if type(reduce_idx) == int else reduce_idx - self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \ - if type(recover_idx) == int else recover_idx - self.check_ascending_list() - if reduce_idx is None: - self.time_reduce = None - else: - if recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - self.reduce_stride = 2 - self._output_size = output_size - self.normalize_before = normalize_before - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - self.pos_enc_layer_type = pos_enc_layer_type - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - encoder_dim, - attention_dropout_rate, - do_rel_shift, - adaptive_scale, - init_weights - ) - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - encoder_dim, - encoder_dim * feed_forward_expansion_factor, - feed_forward_dropout_rate, - activation, - adaptive_scale, - init_weights - ) - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = ( - encoder_dim, cnn_module_kernel, activation, - cnn_norm_type, causal, True, adaptive_scale, init_weights) - - self.embed = DepthwiseConv2dSubsampling4( - 1, encoder_dim, - RelPositionalEncoding(encoder_dim, dropout_rate=0.1), - dw_stride, - input_size, - input_dropout_rate, - init_weights - ) - - self.preln = nn.LayerNorm(encoder_dim) - self.encoders = 
torch.nn.ModuleList([SqueezeformerEncoderLayer( - encoder_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - convolution_layer(*convolution_layer_args), - positionwise_layer(*positionwise_layer_args), - normalize_before, - dropout, - concat_after) for _ in range(num_blocks) - ]) - if time_reduction_layer_type == 'conv1d': - time_reduction_layer = TimeReductionLayer1D - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - elif time_reduction_layer_type == 'stream': - time_reduction_layer = TimeReductionLayerStream - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - else: - time_reduction_layer = TimeReductionLayer2D - time_reduction_layer_args = {'encoder_dim': encoder_dim} - - self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args) - self.time_recover_layer = nn.Linear(encoder_dim, encoder_dim) - self.final_proj = None - if output_size != encoder_dim: - self.final_proj = nn.Linear(encoder_dim, output_size) - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - xs_lens = mask_pad.squeeze(1).sum(1) - xs = self.preln(xs) - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - for i, layer in enumerate(self.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, chunk_masks, pos_emb, mask_pad)) - xs, xs_lens, chunk_masks, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_chunk_masks, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - chunk_masks = recover_chunk_masks - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - xs = xs.masked_fill(~mask_pad[:, 0, :].unsqueeze(-1), 0.0) - - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return xs, masks - - def check_ascending_list(self): - if self.reduce_idx is not None: - assert self.reduce_idx == sorted(self.reduce_idx), \ - "reduce_idx should be int or ascending list" - if self.recover_idx is not None: - assert self.recover_idx == sorted(self.recover_idx), \ - "recover_idx should be int or ascending list" - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp 
= exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - recover_exp)) - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.preln(xs) - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, 
recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - if att_mask.size(1) != 0: - xs = xs.masked_fill(~att_mask[:, 0, :].unsqueeze(-1), 0.0) - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(0) - cached_att = cached_att.unsqueeze(3).\ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :]) - r_cnn_cache.append(cached_cnn) - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/squeezeformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/squeezeformer/encoder_layer.py deleted file mode 100644 index 3c6bdd44a20447cea91c0f965c666b844f4264be..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/squeezeformer/encoder_layer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""SqueezeformerEncoderLayer definition.""" - -import torch -import torch.nn as nn -from typing import Optional, Tuple - - -class SqueezeformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward1 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - feed_forward2 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. 
- """ - - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward1: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - feed_forward2: Optional[nn.Module] = None, - normalize_before: bool = False, - dropout_rate: float = 0.1, - concat_after: bool = False, - ): - super(SqueezeformerEncoderLayer, self).__init__() - self.size = size - self.self_attn = self_attn - self.layer_norm1 = nn.LayerNorm(size) - self.ffn1 = feed_forward1 - self.layer_norm2 = nn.LayerNorm(size) - self.conv_module = conv_module - self.layer_norm3 = nn.LayerNorm(size) - self.ffn2 = feed_forward2 - self.layer_norm4 = nn.LayerNorm(size) - self.normalize_before = normalize_before - self.dropout = nn.Dropout(dropout_rate) - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - # self attention module - residual = x - if self.normalize_before: - x = self.layer_norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.layer_norm1(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm2(x) - x = self.ffn1(x) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm2(x) - - # conv module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - residual = x - if self.normalize_before: - x = self.layer_norm3(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm3(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm4(x) - x = self.ffn2(x) - # we do not use dropout here since it is inside feed forward function - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm4(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/squeezeformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/squeezeformer/positionwise_feed_forward.py deleted file mode 100644 index 289062dcf3189f79a5ebb206990160d8665c613c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/squeezeformer/positionwise_feed_forward.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU(), - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.idim = idim - self.hidden_units = hidden_units - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - self.ada_scale = None - self.ada_bias = None - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, idim]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, idim]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - ffn1_max = self.idim ** -0.5 - ffn2_max = self.hidden_units ** -0.5 - torch.nn.init.uniform_(self.w_1.weight.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_1.bias.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_2.weight.data, -ffn2_max, ffn2_max) - torch.nn.init.uniform_(self.w_2.bias.data, -ffn2_max, ffn2_max) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - if self.adaptive_scale: - xs = self.ada_scale * xs + self.ada_bias - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/squeezeformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/squeezeformer/subsampling.py deleted file mode 100644 index fdb0101d6ebb54c42e710bbb0f35a6f7615ca567..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/squeezeformer/subsampling.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -"""DepthwiseConv2dSubsampling4 and TimeReductionLayer definition.""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -from wenet.transformer.subsampling import BaseSubsampling -from typing import Tuple -from wenet.squeezeformer.conv2d import Conv2dValid - - -class DepthwiseConv2dSubsampling4(BaseSubsampling): - """Depthwise Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - pos_enc_class (nn.Module): position encoding class. - dw_stride (int): Whether do depthwise convolution. - input_size (int): filter bank dimension. - - """ - - def __init__( - self, idim: int, odim: int, - pos_enc_class: torch.nn.Module, - dw_stride: bool = False, - input_size: int = 80, - input_dropout_rate: float = 0.1, - init_weights: bool = True - ): - super(DepthwiseConv2dSubsampling4, self).__init__() - self.idim = idim - self.odim = odim - self.pw_conv = nn.Conv2d( - in_channels=idim, out_channels=odim, kernel_size=3, stride=2) - self.act1 = nn.ReLU() - self.dw_conv = nn.Conv2d( - in_channels=odim, out_channels=odim, kernel_size=3, stride=2, - groups=odim if dw_stride else 1 - ) - self.act2 = nn.ReLU() - self.pos_enc = pos_enc_class - self.input_proj = nn.Sequential( - nn.Linear( - odim * (((input_size - 1) // 2 - 1) // 2), odim), - nn.Dropout(p=input_dropout_rate), - ) - if init_weights: - linear_max = (odim * input_size / 4) ** -0.5 - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.weight'], -linear_max, linear_max) - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.bias'], -linear_max, linear_max) - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: int = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.pw_conv(x) - x = self.act1(x) - x = self.dw_conv(x) - x = self.act2(x) - b, c, t, f = x.size() - x = x.permute(0, 2, 1, 3) - x = x.contiguous().view(b, t, c * f) - x, pos_emb = self.pos_enc(x, offset) - x = self.input_proj(x) - return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] - - -class TimeReductionLayer1D(nn.Module): - """ - Modified NeMo, - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. 
- """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 5, stride: int = 2): - super(TimeReductionLayer1D, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - self.padding = max(0, self.kernel_size - self.stride) - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. - if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayer2D(nn.Module): - def __init__( - self, kernel_size: int = 5, stride: int = 2, encoder_dim: int = 256): - super(TimeReductionLayer2D, self).__init__() - self.encoder_dim = encoder_dim - self.kernel_size = kernel_size - self.dw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=(kernel_size, 1), - stride=stride, - valid_trigy=True - ) - self.pw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=1, - stride=1, - valid_trigx=False, - valid_trigy=False, - ) - - self.kernel_size = kernel_size - self.stride = stride - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.encoder_dim ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward( - self, xs: torch.Tensor, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - xs = xs.masked_fill(mask_pad.transpose(1, 2).eq(0), 0.0) - xs = xs.unsqueeze(2) - padding1 = self.kernel_size - self.stride - xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), - mode='constant', value=0.) 
- xs = self.dw_conv(xs.permute(0, 3, 1, 2)) - xs = self.pw_conv(xs).permute(0, 3, 2, 1).squeeze(1).contiguous() - tmp_length = xs.size(1) - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - padding2 = max(0, (xs_lens.max() - tmp_length).data.item()) - batch_size, hidden = xs.size(0), xs.size(-1) - dummy_pad = torch.zeros(batch_size, padding2, hidden, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - mask = mask[:, ::2, ::2] - mask_pad = mask_pad[:, :, ::2] - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayerStream(nn.Module): - """ - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. - """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 1, stride: int = 2): - super(TimeReductionLayerStream, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=0, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. 
- if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transducer/joint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transducer/joint.py deleted file mode 100644 index f7cbaf62ee0bf4ffa127e5bbf4a49a64c2378495..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transducer/joint.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Optional - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation - - -class TransducerJoint(torch.nn.Module): - - def __init__(self, - voca_size: int, - enc_output_size: int, - pred_output_size: int, - join_dim: int, - prejoin_linear: bool = True, - postjoin_linear: bool = False, - joint_mode: str = 'add', - activation: str = "tanh"): - assert check_argument_types() - # TODO(Mddct): concat in future - assert joint_mode in ['add'] - super().__init__() - - self.activatoin = get_activation(activation) - self.prejoin_linear = prejoin_linear - self.postjoin_linear = postjoin_linear - self.joint_mode = joint_mode - - if not self.prejoin_linear and not self.postjoin_linear: - assert enc_output_size == pred_output_size == join_dim - # torchscript compatibility - self.enc_ffn: Optional[nn.Linear] = None - self.pred_ffn: Optional[nn.Linear] = None - if self.prejoin_linear: - self.enc_ffn = nn.Linear(enc_output_size, join_dim) - self.pred_ffn = nn.Linear(pred_output_size, join_dim) - # torchscript compatibility - self.post_ffn: Optional[nn.Linear] = None - if self.postjoin_linear: - self.post_ffn = nn.Linear(join_dim, join_dim) - - self.ffn_out = nn.Linear(join_dim, voca_size) - - def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor): - """ - Args: - enc_out (torch.Tensor): [B, T, E] - pred_out (torch.Tensor): [B, T, P] - Return: - [B,T,U,V] - """ - if (self.prejoin_linear and self.enc_ffn is not None - and self.pred_ffn is not None): - enc_out = self.enc_ffn(enc_out) # [B,T,E] -> [B,T,V] - pred_out = self.pred_ffn(pred_out) - - enc_out = enc_out.unsqueeze(2) # [B,T,V] -> [B,T,1,V] - pred_out = pred_out.unsqueeze(1) # [B,U,V] -> [B,1 U, V] - - # TODO(Mddct): concat joint - _ = self.joint_mode - out = enc_out + pred_out # [B,T,U,V] - - if self.postjoin_linear and self.post_ffn is not None: - out = self.post_ffn(out) - - out = self.activatoin(out) - out = self.ffn_out(out) - return out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transducer/predictor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transducer/predictor.py deleted file mode 100644 index 600e97a9d83646047ec3fc14f3087bd4df761c68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transducer/predictor.py +++ /dev/null @@ -1,482 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation, get_rnn - - -def ApplyPadding(input, padding, pad_value) -> torch.Tensor: - """ - Args: - input: [bs, max_time_step, dim] - padding: [bs, 
max_time_step] - """ - return padding * pad_value + input * (1 - padding) - - -class PredictorBase(torch.nn.Module): - - # NOTE(Mddct): We can use ABC abstract here, but - # keep this class simple enough for now - def __init__(self) -> None: - super().__init__() - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - _, _, _ = batch_size, method, device - raise NotImplementedError("this is a base precictor") - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ): - _, _, = input, cache - raise NotImplementedError("this is a base precictor") - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - _, _, _, = input, padding, cache - raise NotImplementedError("this is a base precictor") - - -class RNNPredictor(PredictorBase): - - def __init__(self, - voca_size: int, - embed_size: int, - output_size: int, - embed_dropout: float, - hidden_size: int, - num_layers: int, - bias: bool = True, - rnn_type: str = "lstm", - dropout: float = 0.1) -> None: - assert check_argument_types() - super().__init__() - self.n_layers = num_layers - self.hidden_size = hidden_size - # disable rnn base out projection - self.embed = nn.Embedding(voca_size, embed_size) - self.dropout = nn.Dropout(embed_dropout) - # NOTE(Mddct): rnn base from torch not support layer norm - # will add layer norm and prune value in cell and layer - # ref: https://github.com/Mddct/neural-lm/blob/main/models/gru_cell.py - self.rnn = get_rnn(rnn_type=rnn_type)(input_size=embed_size, - hidden_size=hidden_size, - num_layers=num_layers, - bias=bias, - batch_first=True, - dropout=dropout) - self.projection = nn.Linear(hidden_size, output_size) - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> torch.Tensor: - """ - Args: - input (torch.Tensor): [batch, max_time). - padding (torch.Tensor): [batch, max_time] - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - Returns: - output: [batch, max_time, output_size] - """ - - # NOTE(Mddct): we don't use pack input format - embed = self.embed(input) # [batch, max_time, emb_size] - embed = self.dropout(embed) - states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None - if cache is None: - state = self.init_state(batch_size=input.size(0), - device=input.device) - states = (state[0], state[1]) - else: - assert len(cache) == 2 - states = (cache[0], cache[1]) - out, (m, c) = self.rnn(embed, states) - out = self.projection(out) - - # NOTE(Mddct): Although we don't use staate in transducer - # training forward, we need make it right for padding value - # so we create forward_step for infering, forward for training - _, _ = m, c - return out - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache: [state_m, state_c] - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - Returns: - new_cache: [[state_m_1, state_c_1], [state_m_2, state_c_2]...] 
- """ - assert len(cache) == 2 - state_ms = cache[0] - state_cs = cache[1] - - assert state_ms.size(1) == state_cs.size(1) - - new_cache: List[List[torch.Tensor]] = [] - for state_m, state_c in zip(torch.split(state_ms, 1, dim=1), - torch.split(state_cs, 1, dim=1)): - new_cache.append([state_m, state_c]) - return new_cache - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[state_m_1, state_c_1], [state_m_1, state_c_1]...] - - Returns: - new_caceh: [state_ms, state_cs], - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - """ - state_ms = torch.cat([states[0] for states in cache], dim=1) - state_cs = torch.cat([states[1] for states in cache], dim=1) - return [state_ms, state_cs] - - def init_state( - self, - batch_size: int, - device: torch.device, - method: str = "zero", - ) -> List[torch.Tensor]: - assert batch_size > 0 - # TODO(Mddct): xavier init method - _ = method - return [ - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device), - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device) - ] - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - """ - assert len(cache) == 2 - state_m, state_c = cache[0], cache[1] - embed = self.embed(input) # [batch, 1, emb_size] - embed = self.dropout(embed) - out, (m, c) = self.rnn(embed, (state_m, state_c)) - - out = self.projection(out) - m = ApplyPadding(m, padding.unsqueeze(0), state_m) - c = ApplyPadding(c, padding.unsqueeze(0), state_c) - - return (out, [m, c]) - - -class EmbeddingPredictor(PredictorBase): - """Embedding predictor - - Described in: - https://arxiv.org/pdf/2109.07513.pdf - - embed-> proj -> layer norm -> swish - """ - - def __init__(self, - voca_size: int, - embed_size: int, - embed_dropout: float, - n_head: int, - history_size: int = 2, - activation: str = "swish", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - - assert check_argument_types() - super().__init__() - # multi head - self.num_heads = n_head - self.embed_size = embed_size - self.context_size = history_size + 1 - self.pos_embed = torch.nn.Linear(embed_size * self.context_size, - self.num_heads, - bias=bias) - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.ffn = nn.Linear(self.embed_size, self.embed_size) - self.norm = nn.LayerNorm(self.embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - _ = method - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device), - ] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] 
- """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - - input = input.unfold(1, self.context_size, 1).permute( - 0, 1, 3, 2) # [bs, seq_len, context_size, embed] - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - # broadcast dot attenton - input_expand = input.unsqueeze( - 2) # [bs, seq_len, 1, context_size, embed] - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - - # [bs, seq_len, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, seq_len, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, seq_len, num_heads, embed] - output = output.sum(dim=2) # [bs, seq_len, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - return output - - def forward_step( - self, - input: torch.Tensor, - padding: torch.Tensor, - cache: List[torch.Tensor], - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input_expand = context_input.unsqueeze(1).unsqueeze( - 2) # [bs, 1, 1, context_size, embed] - - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - # [bs, 1, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, 1, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, 1, num_heads, embed] - output = output.sum(dim=2) # [bs, 1, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - new_cache = context_input[:, 1:, :] - # TODO(Mddct): we need padding new_cache in future - # new_cache = ApplyPadding(history, padding, new_cache) - return (output, [new_cache]) - - -class ConvPredictor(PredictorBase): - - def __init__(self, - voca_size: 
int, - embed_size: int, - embed_dropout: float, - history_size: int = 2, - activation: str = "relu", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - assert check_argument_types() - super().__init__() - - assert history_size >= 0 - self.embed_size = embed_size - self.context_size = history_size + 1 - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.conv = nn.Conv1d(in_channels=embed_size, - out_channels=embed_size, - kernel_size=self.context_size, - padding=0, - groups=embed_size, - bias=bias) - self.norm = nn.LayerNorm(embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - assert method == "zero" - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device) - ] - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] - """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - input = input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - return out - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input = context_input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - - new_cache = context_input[:, 1:, :] - # TODO(Mddct): apply padding in future - return (out, [new_cache]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transducer/search/greedy_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transducer/search/greedy_search.py deleted file mode 100644 index ef7354562b6617b7be33bf32d673117eb1d3d547..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transducer/search/greedy_search.py +++ /dev/null @@ 
-1,54 +0,0 @@ -from typing import List - -import torch - - -def basic_greedy_search( - model: torch.nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - n_steps: int = 64, -) -> List[List[int]]: - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, - method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = n_steps - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, padding, - cache) # [1, 1, P] - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, - pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - - return [hyps] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transducer/search/prefix_beam_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transducer/search/prefix_beam_search.py deleted file mode 100644 index f00917717c16a73916586708ebfede54fa02a21f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transducer/search/prefix_beam_search.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import List, Tuple - -import torch -from wenet.utils.common import log_add - - -class Sequence(): - - __slots__ = {'hyp', 'score', 'cache'} - - def __init__( - self, - hyp: List[torch.Tensor], - score, - cache: List[torch.Tensor], - ): - self.hyp = hyp - self.score = score - self.cache = cache - - -class PrefixBeamSearch(): - - def __init__(self, encoder, predictor, joint, ctc, blank): - self.encoder = encoder - self.predictor = predictor - self.joint = joint - self.ctc = ctc - self.blank = blank - - def forward_decoder_one_step( - self, encoder_x: torch.Tensor, pre_t: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - padding = torch.zeros(pre_t.size(0), 1, device=encoder_x.device) - pre_t, new_cache = self.predictor.forward_step(pre_t.unsqueeze(-1), - padding, cache) - x = self.joint(encoder_x, pre_t) # [beam, 1, 1, vocab] - x = x.log_softmax(dim=-1) - return x, new_cache - - def prefix_beam_search(self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7): - """prefix beam search - also see wenet.transducer.transducer.beam_search - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - 
batch_size = speech.shape[0] - assert batch_size == 1 - - # 1. Encoder - encoder_out, _ = self.encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - - ctc_probs = self.ctc.log_softmax(encoder_out).squeeze(0) - beam_init: List[Sequence] = [] - - # 2. init beam using Sequence to save beam unit - cache = self.predictor.init_state(1, method="zero", device=device) - beam_init.append(Sequence(hyp=[self.blank], score=0.0, cache=cache)) - # 3. start decoding (notice: we use breathwise first searching) - # !!!! In this decoding method: one frame do not output multi units. !!!! - # !!!! Experiments show that this strategy has little impact !!!! - for i in range(maxlen): - # 3.1 building input - # decoder taking the last token to predict the next token - input_hyp = [s.hyp[-1] for s in beam_init] - input_hyp_tensor = torch.tensor(input_hyp, - dtype=torch.int, - device=device) - # building statement from beam - cache_batch = self.predictor.cache_to_batch( - [s.cache for s in beam_init]) - # build score tensor to do torch.add() function - scores = torch.tensor([s.score for s in beam_init]).to(device) - - # 3.2 forward decoder - logp, new_cache = self.forward_decoder_one_step( - encoder_out[:, i, :].unsqueeze(1), - input_hyp_tensor, - cache_batch, - ) # logp: (N, 1, 1, vocab_size) - logp = logp.squeeze(1).squeeze(1) # logp: (N, vocab_size) - new_cache = self.predictor.batch_to_cache(new_cache) - - # 3.3 shallow fusion for transducer score - # and ctc score where we can also add the LM score - logp = torch.log( - torch.add(transducer_weight * torch.exp(logp), - ctc_weight * torch.exp(ctc_probs[i].unsqueeze(0)))) - - # 3.4 first beam prune - top_k_logp, top_k_index = logp.topk(beam_size) # (N, N) - scores = torch.add(scores.unsqueeze(1), top_k_logp) - - # 3.5 generate new beam (N*N) - beam_A = [] - for j in range(len(beam_init)): - # update seq - base_seq = beam_init[j] - for t in range(beam_size): - # blank: only update the score - if top_k_index[j, t] == self.blank: - new_seq = Sequence(hyp=base_seq.hyp.copy(), - score=scores[j, t].item(), - cache=base_seq.cache) - - beam_A.append(new_seq) - # other unit: update hyp score statement and last - else: - hyp_new = base_seq.hyp.copy() - hyp_new.append(top_k_index[j, t].item()) - new_seq = Sequence(hyp=hyp_new, - score=scores[j, t].item(), - cache=new_cache[j]) - beam_A.append(new_seq) - - # 3.6 prefix fusion - fusion_A = [beam_A[0]] - for j in range(1, len(beam_A)): - s1 = beam_A[j] - if_do_append = True - for t in range(len(fusion_A)): - # notice: A_ can not fusion with A - if s1.hyp == fusion_A[t].hyp: - fusion_A[t].score = log_add( - [fusion_A[t].score, s1.score]) - if_do_append = False - break - if if_do_append: - fusion_A.append(s1) - - # 4. 
second pruned - fusion_A.sort(key=lambda x: x.score, reverse=True) - beam_init = fusion_A[:beam_size] - - return beam_init, encoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transducer/transducer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transducer/transducer.py deleted file mode 100644 index 821a0946e621353a18bededbd93a658e83b0e0e2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transducer/transducer.py +++ /dev/null @@ -1,453 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchaudio -from torch import nn -from torch.nn.utils.rnn import pad_sequence -from typeguard import check_argument_types -from wenet.transducer.predictor import PredictorBase -from wenet.transducer.search.greedy_search import basic_greedy_search -from wenet.transducer.search.prefix_beam_search import PrefixBeamSearch -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_blank, add_sos_eos, - reverse_pad_list) - - -class Transducer(ASRModel): - """Transducer-ctc-attention hybrid Encoder-Predictor-Decoder model""" - - def __init__( - self, - vocab_size: int, - blank: int, - encoder: nn.Module, - predictor: PredictorBase, - joint: nn.Module, - attention_decoder: Optional[Union[TransformerDecoder, - BiTransformerDecoder]] = None, - ctc: Optional[CTC] = None, - ctc_weight: float = 0, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - transducer_weight: float = 1.0, - attention_weight: float = 0.0, - ) -> None: - assert check_argument_types() - assert attention_weight + ctc_weight + transducer_weight == 1.0 - super().__init__(vocab_size, encoder, attention_decoder, ctc, - ctc_weight, ignore_id, reverse_weight, lsm_weight, - length_normalized_loss) - - self.blank = blank - self.transducer_weight = transducer_weight - self.attention_decoder_weight = 1 - self.transducer_weight - self.ctc_weight - - self.predictor = predictor - self.joint = joint - self.bs = None - - # Note(Mddct): decoder also means predictor in transducer, - # but here decoder is attention decoder - del self.criterion_att - if attention_decoder is not None: - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + predictor + joint + loss - - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - - # Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - # predictor - ys_in_pad = add_blank(text, self.blank, self.ignore_id) - predictor_out = self.predictor(ys_in_pad) - # joint - joint_out = self.joint(encoder_out, predictor_out) - # NOTE(Mddct): some loss implementation require pad valid is zero - # torch.int32 rnnt_loss required - rnnt_text = text.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - rnnt_text_lengths = text_lengths.to(torch.int32) - encoder_out_lens = encoder_out_lens.to(torch.int32) - loss = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - encoder_out_lens, - rnnt_text_lengths, - blank=self.blank, - reduction="mean") - loss_rnnt = loss - - loss = self.transducer_weight * loss - # optional attention decoder - loss_att: Optional[torch.Tensor] = None - if self.attention_decoder_weight != 0.0 and self.decoder is not None: - loss_att, _ = self._calc_att_loss(encoder_out, encoder_mask, text, - text_lengths) - - # optional ctc - loss_ctc: Optional[torch.Tensor] = None - if self.ctc_weight != 0.0 and self.ctc is not None: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is not None: - loss = loss + self.ctc_weight * loss_ctc.sum() - if loss_att is not None: - loss = loss + self.attention_decoder_weight * loss_att.sum() - # NOTE: 'loss' must be in dict - return { - 'loss': loss, - 'loss_att': loss_att, - 'loss_ctc': loss_ctc, - 'loss_rnnt': loss_rnnt, - } - - def init_bs(self): - if self.bs is None: - self.bs = PrefixBeamSearch(self.encoder, self.predictor, - self.joint, self.ctc, self.blank) - - def _cal_transducer_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_lens: torch.Tensor, - hyps_pad: torch.Tensor, - ): - # ignore id -> blank, add blank at head - hyps_pad_blank = add_blank(hyps_pad, self.blank, self.ignore_id) - xs_in_lens = encoder_mask.squeeze(1).sum(1).int() - - # 1. Forward predictor - predictor_out = self.predictor(hyps_pad_blank) - # 2. Forward joint - joint_out = self.joint(encoder_out, predictor_out) - rnnt_text = hyps_pad.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - # 3. 
Compute transducer loss - loss_td = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - xs_in_lens, - hyps_lens.int(), - blank=self.blank, - reduction='none') - return loss_td * -1 - - def _cal_attn_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_pad: torch.Tensor, - hyps_lens: torch.Tensor, - ): - # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - - # td_score = loss_td * -1 - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - self.reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - return decoder_out, r_decoder_out - - def beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7, - ): - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight in transducer - prefix beam search. - final_prob = ctc_weight * ctc_prob + transducer_weight * transducer_prob - transducer_weight (float): transducer probability weight in - prefix beam search - Returns: - List[List[int]]: best path result - - """ - self.init_bs() - beam, _ = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size, - beam_size, - num_decoding_left_chunks, - simulate_streaming, - ctc_weight, - transducer_weight, - ) - return beam[0].hyp[1:], beam[0].score - - def transducer_attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ctc_weight: float = 0.0, - attn_weight: float = 0.0, - transducer_weight: float = 0.0, - search_ctc_weight: float = 1.0, - search_transducer_weight: float = 0.0, - beam_search_type: str = 'transducer') -> List[List[int]]: - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight using in rescoring. - rescore_prob = ctc_weight * ctc_prob + - transducer_weight * (transducer_loss * -1) + - attn_weight * attn_prob - attn_weight (float): attn probability weight using in rescoring. - transducer_weight (float): transducer probability weight using in - rescoring - search_ctc_weight (float): ctc weight using - in rnnt beam search (seeing in self.beam_search) - search_transducer_weight (float): transducer weight using - in rnnt beam search (seeing in self.beam_search) - Returns: - List[List[int]]: best path result - - """ - - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - self.init_bs() - if beam_search_type == 'transducer': - beam, encoder_out = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - beam_size=beam_size, - num_decoding_left_chunks=num_decoding_left_chunks, - ctc_weight=search_ctc_weight, - transducer_weight=search_transducer_weight, - ) - beam_score = [s.score for s in beam] - hyps = [s.hyp[1:] for s in beam] - - elif beam_search_type == 'ctc': - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, - speech_lengths, - beam_size=beam_size, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) - beam_score = [hyp[1] for hyp in hyps] - hyps = [hyp[0] for hyp in hyps] - assert len(hyps) == beam_size - - # build hyps and encoder output - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - - # 2.1 calculate transducer score - td_score = self._cal_transducer_score( - encoder_out, - encoder_mask, - hyps_lens, - hyps_pad, - ) - # 2.2 calculate attention score - decoder_out, r_decoder_out = self._cal_attn_score( - encoder_out, - encoder_mask, - hyps_pad, - hyps_lens, - ) - - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp)][self.eos] - td_s = td_score[i] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp): - r_score += r_decoder_out[i][len(hyp) - j - 1][w] - r_score += r_decoder_out[i][len(hyp)][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score = score * attn_weight + \ - beam_score[i] * ctc_weight + \ - td_s * transducer_weight - if score > best_score: - best_score = score - best_index = i - - return hyps[best_index], best_score - - def greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, 
- num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - n_steps: int = 64, - ) -> List[List[int]]: - """ greedy search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - # TODO(Mddct): batch decode - assert speech.size(0) == 1 - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - # TODO(Mddct): forward chunk by chunk - _ = simulate_streaming - # Let's assume B = batch_size - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size, - num_decoding_left_chunks, - ) - encoder_out_lens = encoder_mask.squeeze(1).sum() - hyps = basic_greedy_search(self, - encoder_out, - encoder_out_lens, - n_steps=n_steps) - - return hyps - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def forward_predictor_step( - self, xs: torch.Tensor, cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - assert len(cache) == 2 - # fake padding - padding = torch.zeros(1, 1) - return self.predictor.forward_step(xs, padding, cache) - - @torch.jit.export - def forward_joint_step(self, enc_out: torch.Tensor, - pred_out: torch.Tensor) -> torch.Tensor: - return self.joint(enc_out, pred_out) - - @torch.jit.export - def forward_predictor_init_state(self) -> List[torch.Tensor]: - return self.predictor.init_state(1, device=torch.device("cpu")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/asr_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/asr_model.py deleted file mode 100644 index 4288f68472d63ce4bf270c5f377d62fa7408713e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/asr_model.py +++ /dev/null @@ -1,904 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -from collections import defaultdict -from typing import Dict, List, Optional, Tuple - -import torch - -from torch.nn.utils.rnn import pad_sequence - -try: - import k2 - from icefall.utils import get_texts - from icefall.decode import get_lattice, Nbest, one_best_decoding -except ImportError: - print('Failed to import k2 and icefall. \ - Notice that they are necessary for hlg_onebest and hlg_rescore') - -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import TransformerEncoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, - remove_duplicates_and_blank, th_accuracy, - reverse_pad_list) -from wenet.utils.mask import (make_pad_mask, mask_finished_preds, - mask_finished_scores, subsequent_mask) - - -class ASRModel(torch.nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - def __init__( - self, - vocab_size: int, - encoder: TransformerEncoder, - decoder: TransformerDecoder, - ctc: CTC, - ctc_weight: float = 0.5, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - ): - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - - self.encoder = encoder - self.decoder = decoder - self.ctc = ctc - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - # 1. Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # 2a. Attention-decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths) - else: - loss_att = None - - # 2b. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is None: - loss = loss_att - elif loss_att is None: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + (1 - - self.ctc_weight) * loss_att - return {"loss": loss, "loss_att": loss_att, "loss_ctc": loss_ctc} - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, float]: - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # reverse the seq, used for right to left decoder - r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) - r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, - self.ignore_id) - # 1. Forward decoder - decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, - ys_in_pad, ys_in_lens, - r_ys_in_pad, - self.reverse_weight) - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - r_loss_att = torch.tensor(0.0) - if self.reverse_weight > 0.0: - r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * ( - 1 - self.reverse_weight) + r_loss_att * self.reverse_weight - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - return loss_att, acc_att - - def _forward_encoder( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Let's assume B = batch_size - # 1. Encoder - if simulate_streaming and decoding_chunk_size > 0: - encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( - speech, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - else: - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - return encoder_out, encoder_mask - - def recognize( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int = 10, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> torch.Tensor: - """ Apply beam search on attention decoder - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - torch.Tensor: decoding result, (batch, max_result_len) - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = torch.ones([running_size, 1], dtype=torch.long, - device=device).fill_(self.sos) # (B*N, 1) - scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), - dtype=torch.float) - scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) - cache: Optional[List[torch.Tensor]] = None - # 2. Decoder forward step by step - for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: - break - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) - logp, cache = self.decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - # 2.3 Second beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - # Update cache to be consistent with new topk scores / hyps - cache_index = (offset_k_index // beam_size).view(-1) # (B*N) - base_cache_index = (torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) * beam_size).view(-1) # (B*N) - cache_index = base_cache_index + cache_index - cache = [torch.index_select(c, dim=0, index=cache_index) for c in cache] - scores = scores.view(-1, 1) # (B*N, 1) - # 2.4. Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = torch.index_select(top_k_index.view(-1), - dim=-1, - index=best_k_index) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = torch.index_select( - hyps, dim=0, index=best_hyps_index) # (B*N, i) - hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) - - # 3. 
Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_scores, best_index = scores.max(dim=-1) - best_hyps_index = best_index + torch.arange( - batch_size, dtype=torch.long, device=device) * beam_size - best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) - best_hyps = best_hyps[:, 1:] - return best_hyps, best_scores - - def ctc_greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[List[int]]: - """ Apply CTC greedy search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # Let's assume B = batch_size - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - return hyps, scores - - def _ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[List[List[int]], torch.Tensor]: - """ CTC prefix beam search inner implementation - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[List[int]]: nbest results - torch.Tensor: encoder output, (1, max_len, encoder_dim), - it will be used for rescoring in attention rescoring mode - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # For CTC prefix beam search, we only support batch_size=1 - assert batch_size == 1 - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder forward and get CTC score - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - ctc_probs = ctc_probs.squeeze(0) - # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) - cur_hyps = [(tuple(), (0.0, -float('inf')))] - # 2. CTC beam search step by step - for t in range(0, maxlen): - logp = ctc_probs[t] # (vocab_size,) - # key: prefix, value (pb, pnb), default value(-inf, -inf) - next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) - # 2.1 First beam prune: select topk best - top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) - for s in top_k_index: - s = s.item() - ps = logp[s].item() - for prefix, (pb, pnb) in cur_hyps: - last = prefix[-1] if len(prefix) > 0 else None - if s == 0: # blank - n_pb, n_pnb = next_hyps[prefix] - n_pb = log_add([n_pb, pb + ps, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - elif s == last: - # Update *ss -> *s; - n_pb, n_pnb = next_hyps[prefix] - n_pnb = log_add([n_pnb, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - # Update *s-s -> *ss, - is for blank - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - else: - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - - # 2.2 Second beam prune - next_hyps = sorted(next_hyps.items(), - key=lambda x: log_add(list(x[1])), - reverse=True) - cur_hyps = next_hyps[:beam_size] - hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] - return hyps, encoder_out - - def ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[int]: - """ Apply CTC prefix beam search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[int]: CTC prefix beam search nbest results - """ - hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, - beam_size, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) - return hyps[0] - - def attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - ctc_weight: float = 0.0, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ) -> List[int]: - """ Apply attention rescoring decoding, CTC prefix beam search - is applied first to get nbest, then we resoring the nbest on - attention decoder with corresponding encoder out - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - reverse_weight (float): right to left decoder weight - ctc_weight (float): ctc score weight - - Returns: - List[int]: Attention rescoring result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, speech_lengths, beam_size, decoding_chunk_size, - num_decoding_left_chunks, simulate_streaming) - - assert len(hyps) == beam_size - hyps_pad = pad_sequence([ - torch.tensor(hyp[0], device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp[0]): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp[0])][self.eos] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp[0]): - r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] - r_score += r_decoder_out[i][len(hyp[0])][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score += hyp[1] * ctc_weight - if score > best_score: - best_score = score - best_index = i - return hyps[best_index][0], best_score - - def load_hlg_resource_if_necessary(self, hlg, word): - if not hasattr(self, 'hlg'): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.hlg = k2.Fsa.from_dict(torch.load(hlg, map_location=device)) - if not hasattr(self.hlg, "lm_scores"): - self.hlg.lm_scores = self.hlg.scores.clone() - if not hasattr(self, 'word_table'): - self.word_table = {} - with open(word, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - self.word_table[int(arr[1])] = arr[0] - - @torch.no_grad() - def hlg_onebest( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - best_path = one_best_decoding(lattice=lattice, use_double_scores=True) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.no_grad() - def hlg_rescore( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - lm_scale: float = 0, - decoder_scale: float = 0, - r_decoder_scale: float = 0, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - device = speech.device - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - 
search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=100, - use_double_scores=True, - nbest_scale=0.5,) - nbest = nbest.intersect(lattice) - assert hasattr(nbest.fsa, "lm_scores") - assert hasattr(nbest.fsa, "tokens") - assert isinstance(nbest.fsa.tokens, torch.Tensor) - - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.tokens) - tokens = tokens.remove_values_leq(0) - hyps = tokens.tolist() - - # cal attention_score - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out_repeat = [] - tot_scores = nbest.tot_scores() - repeats = [tot_scores[i].shape[0] for i in range(tot_scores.dim0)] - for i in range(len(encoder_out)): - encoder_out_repeat.append(encoder_out[i: i + 1].repeat(repeats[i], 1, 1)) - encoder_out = torch.concat(encoder_out_repeat, dim=0) - encoder_mask = torch.ones(encoder_out.size(0), - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - reverse_weight = 0.5 - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out - - decoder_scores = torch.tensor([sum([decoder_out[i, j, hyps[i][j]] - for j in range(len(hyps[i]))]) - for i in range(len(hyps))], device=device) - r_decoder_scores = [] - for i in range(len(hyps)): - score = 0 - for j in range(len(hyps[i])): - score += r_decoder_out[i, len(hyps[i]) - j - 1, hyps[i][j]] - score += r_decoder_out[i, len(hyps[i]), self.eos] - r_decoder_scores.append(score) - r_decoder_scores = torch.tensor(r_decoder_scores, device=device) - - am_scores = nbest.compute_am_scores() - ngram_lm_scores = nbest.compute_lm_scores() - tot_scores = am_scores.values + lm_scale * ngram_lm_scores.values + \ - decoder_scale * decoder_scores + r_decoder_scale * r_decoder_scores - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.jit.export - def subsampling_rate(self) -> int: - """ Export interface for c++ call, return subsampling_rate of the - model - """ - return self.encoder.embed.subsampling_rate - - @torch.jit.export - def right_context(self) -> int: - """ Export interface for c++ call, return right_context of the model - """ - return self.encoder.embed.right_context - - @torch.jit.export - def sos_symbol(self) -> int: - """ Export interface for c++ call, return sos symbol id of the model - """ - return self.sos - - @torch.jit.export - def eos_symbol(self) -> int: - """ Export interface for c++ call, return eos symbol id of the model - """ - return self.eos - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, give input chunk xs, and return - output from time 0 to current chunk. - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor: - """ Export interface for c++ call, apply linear transform and log - softmax before ctc - Args: - xs (torch.Tensor): encoder output - - Returns: - torch.Tensor: activation before ctc - - """ - return self.ctc.log_softmax(xs) - - @torch.jit.export - def is_bidirectional_decoder(self) -> bool: - """ - Returns: - torch.Tensor: decoder output - """ - if hasattr(self.decoder, 'right_decoder'): - return True - else: - return False - - @torch.jit.export - def forward_attention_decoder( - self, - hyps: torch.Tensor, - hyps_lens: torch.Tensor, - encoder_out: torch.Tensor, - reverse_weight: float = 0, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, forward decoder with multiple - hypothesis from ctc prefix beam search and one encoder output - Args: - hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad sos at the begining - hyps_lens (torch.Tensor): length of each hyp in hyps - encoder_out (torch.Tensor): corresponding encoder output - r_hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad eos at the begining which is used fo right to left decoder - reverse_weight: used for verfing whether used right to left decoder, - > 0 will use. - - Returns: - torch.Tensor: decoder output - """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps - encoder_out = encoder_out.repeat(num_hyps, 1, 1) - encoder_mask = torch.ones(num_hyps, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=encoder_out.device) - - # input for right to left decoder - # this hyps_lens has count token, we need minus it. - r_hyps_lens = hyps_lens - 1 - # this hyps has included token, so it should be - # convert the original hyps. - r_hyps = hyps[:, 1:] - # >>> r_hyps - # >>> tensor([[ 1, 2, 3], - # >>> [ 9, 8, 4], - # >>> [ 2, -1, -1]]) - # >>> r_hyps_lens - # >>> tensor([3, 3, 1]) - - # NOTE(Mddct): `pad_sequence` is not supported by ONNX, it is used - # in `reverse_pad_list` thus we have to refine the below code. 
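
The note above explains why `reverse_pad_list` (which relies on `pad_sequence`) cannot be exported to ONNX, so the code that follows rebuilds the reversed, eos-padded hypotheses using only `arange`, `gather` and `where`. A self-contained sketch of that same index-reversal trick; the token values and `eos=2` below are purely illustrative:

```python
import torch

def reverse_padded(r_hyps: torch.Tensor, r_hyps_lens: torch.Tensor, eos: int) -> torch.Tensor:
    """Reverse each padded row using only ONNX-exportable ops."""
    max_len = torch.max(r_hyps_lens)
    index_range = torch.arange(0, max_len, 1, device=r_hyps.device)
    seq_len_expand = r_hyps_lens.unsqueeze(1)
    seq_mask = seq_len_expand > index_range            # True on real tokens
    index = (seq_len_expand - 1) - index_range         # reversed positions per row
    index = index * seq_mask                           # clamp padded positions to column 0
    reversed_hyps = torch.gather(r_hyps, 1, index)
    return torch.where(seq_mask, reversed_hyps, eos)   # overwrite padded slots with eos

hyps = torch.tensor([[1, 2, 3], [9, 8, 4], [2, 0, 0]])
lens = torch.tensor([3, 3, 1])
print(reverse_padded(hyps, lens, eos=2))
# tensor([[3, 2, 1], [4, 8, 9], [2, 2, 2]])
```

Padded positions are first clamped to column 0 (so `gather` stays in range) and then overwritten with `eos`, which is exactly the behaviour the commented examples above describe.
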
- # Issue: https://github.com/wenet-e2e/wenet/issues/1113 - # Equal to: - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = torch.max(r_hyps_lens) - index_range = torch.arange(0, max_len, 1).to(encoder_out.device) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - # >>> seq_mask - # >>> tensor([[ True, True, True], - # >>> [ True, True, True], - # >>> [ True, False, False]]) - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - r_hyps = torch.gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = torch.where(seq_mask, r_hyps, self.eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - r_hyps = torch.cat([hyps[:, 0:1], r_hyps], dim=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps, hyps_lens, r_hyps, - reverse_weight) # (num_hyps, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - - # right to left decoder may be not used during decoding process, - # which depends on reverse_weight param. - # r_dccoder_out will be 0.0, if reverse_weight is 0.0 - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - return decoder_out, r_decoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/attention.py deleted file mode 100644 index 6ee5e313edf2e88a844ce004c0f819b0bd3260f6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/attention.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple - -import torch -from torch import nn - - -class MultiHeadedAttention(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, n_head: int, n_feat: int, dropout_rate: float): - """Construct an MultiHeadedAttention object.""" - super().__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward_qkv( - self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor, size - (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor, size - (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor, size - (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. 
- - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - 1.When applying cross attention between decoder and encoder, - the batch padding mask for input is in (#batch, 1, T) shape. - 2.When applying self attention of encoder, - the mask is in (#batch, T, T) shape. - 3.When applying self attention of decoder, - the mask is in (#batch, L, L) shape. - 4.If the different position in decoder see different block - of the encoder, such as Mocha, the passed in mask could be - in (#batch, L, T) shape. But there is no such case in current - Wenet. - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - """ - q, k, v = self.forward_qkv(query, key, value) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. - new_cache = torch.cat((k, v), dim=-1) - - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - return self.forward_attention(v, scores, mask), new_cache - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). 
- zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
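
The NOTE above leans on the fact that concatenating and splitting zero-sized tensors is well defined, which lets the ONNX export path always take the cache branch even for the first chunk. A quick standalone check of the two properties it quotes:

```python
import torch

a = torch.ones((1, 2, 0, 4))            # empty cache (cache_t == 0)
b = torch.ones((1, 2, 3, 4))            # current chunk's keys/values
c = torch.cat((a, b), dim=2)            # concatenating an empty cache is a no-op
assert torch.equal(b, c)

kv = torch.cat((b, b), dim=-1)          # packed cache: last dim holds K and V side by side
k_cache, v_cache = torch.split(kv, kv.size(-1) // 2, dim=-1)
assert torch.equal(k_cache, b) and torch.equal(v_cache, b)
```
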
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/cmvn.py deleted file mode 100644 index 3a1e7457fd3788d9a7e031e96517505a65925102..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/cmvn.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -class GlobalCMVN(torch.nn.Module): - def __init__(self, - mean: torch.Tensor, - istd: torch.Tensor, - norm_var: bool = True): - """ - Args: - mean (torch.Tensor): mean stats - istd (torch.Tensor): inverse std, std which is 1.0 / std - """ - super().__init__() - assert mean.shape == istd.shape - self.norm_var = norm_var - # The buffer can be accessed from this module using self.mean - self.register_buffer("mean", mean) - self.register_buffer("istd", istd) - - def forward(self, x: torch.Tensor): - """ - Args: - x (torch.Tensor): (batch, max_len, feat_dim) - - Returns: - (torch.Tensor): normalized feature - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/convolution.py deleted file mode 100644 index 2cf9794e14ea7441ccd30ab52202ac02fb25c2b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/convolution.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
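
For reference, the relative-position score removed just above follows the Transformer-XL decomposition: a content term `q_with_bias_u @ k^T` (terms a + c) plus a position term `q_with_bias_v @ p^T` (terms b + d), scaled by `1/sqrt(d_k)`. A shape-only sketch with random tensors; the dimensions below are illustrative, not the model's:

```python
import math
import torch

batch, head, time, d_k = 1, 2, 5, 8
q_with_bias_u = torch.randn(batch, head, time, d_k)   # q + pos_bias_u
q_with_bias_v = torch.randn(batch, head, time, d_k)   # q + pos_bias_v
k = torch.randn(batch, head, time, d_k)               # projected keys
p = torch.randn(batch, head, time, d_k)               # projected positional embeddings

matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))  # content term (a + c)
matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))  # position term (b + d)
scores = (matrix_ac + matrix_bd) / math.sqrt(d_k)             # (batch, head, time, time)
attn = torch.softmax(scores, dim=-1)
print(attn.shape)
```
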
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). 
- """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. - new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/ctc.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/ctc.py deleted file mode 100644 index 3dfcbaa324ffc26afa9ceaeb75007eb312546326..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/ctc.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -import torch -import torch.nn.functional as F -from typeguard import check_argument_types - - -class CTC(torch.nn.Module): - """CTC module""" - def __init__( - self, - odim: int, - encoder_output_size: int, - dropout_rate: float = 0.0, - reduce: bool = True, - ): - """ Construct CTC module - Args: - odim: dimension of outputs - encoder_output_size: number of encoder projection units - dropout_rate: dropout rate (0.0 ~ 1.0) - reduce: reduce the CTC loss into a scalar - """ - assert check_argument_types() - super().__init__() - eprojs = encoder_output_size - self.dropout_rate = dropout_rate - self.ctc_lo = torch.nn.Linear(eprojs, odim) - - reduction_type = "sum" if reduce else "none" - self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) - - def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, - ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: - """Calculate CTC loss. 
- - Args: - hs_pad: batch of padded hidden state sequences (B, Tmax, D) - hlens: batch of lengths of hidden state sequences (B) - ys_pad: batch of padded character id sequence tensor (B, Lmax) - ys_lens: batch of lengths of character sequence (B) - """ - # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) - ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) - # ys_hat: (B, L, D) -> (L, B, D) - ys_hat = ys_hat.transpose(0, 1) - ys_hat = ys_hat.log_softmax(2) - loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) - # Batch-size average - loss = loss / ys_hat.size(1) - return loss - - def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """log_softmax of frame activations - - Args: - Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) - """ - return F.log_softmax(self.ctc_lo(hs_pad), dim=2) - - def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """argmax of frame activations - - Args: - torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: argmax applied 2d tensor (B, Tmax) - """ - return torch.argmax(self.ctc_lo(hs_pad), dim=2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/decoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/decoder.py deleted file mode 100644 index c31853d9e868c99290b8d597f53d9a680202c82c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/decoder.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Decoder definition.""" -from typing import Tuple, List, Optional - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.decoder_layer import DecoderLayer -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.utils.mask import (subsequent_mask, make_pad_mask) - - -class TransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. 
- concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__() - attention_dim = encoder_output_size - - if input_layer == "embed": - self.embed = torch.nn.Sequential( - torch.nn.Embedding(vocab_size, attention_dim), - PositionalEncoding(attention_dim, positional_dropout_rate), - ) - else: - raise ValueError(f"only 'embed' is supported: {input_layer}") - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) - self.use_output_layer = use_output_layer - self.output_layer = torch.nn.Linear(attention_dim, vocab_size) - self.num_blocks = num_blocks - self.decoders = torch.nn.ModuleList([ - DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(self.num_blocks) - ]) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor = torch.empty(0), - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: not used in transformer decoder, in order to unify api - with bidirectional decoder - reverse_weight: not used in transformer decoder, in order to unify - api with bidirectional decode - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - torch.tensor(0.0), in order to unify api with bidirectional decoder - olens: (batch, ) - """ - tgt = ys_in_pad - maxlen = tgt.size(1) - # tgt_mask: (B, 1, L) - tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) - tgt_mask = tgt_mask.to(tgt.device) - # m: (1, L, L) - m = subsequent_mask(tgt_mask.size(-1), - device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m - x, _ = self.embed(tgt) - for layer in self.decoders: - x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, - memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.use_output_layer: - x = self.output_layer(x) - olens = tgt_mask.sum(1) - return x, torch.tensor(0.0), olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - x, _ = self.embed(tgt) - new_cache = [] - for i, decoder in enumerate(self.decoders): - if cache is None: - c = None - else: - c = cache[i] - x, tgt_mask, memory, memory_mask = decoder(x, - tgt_mask, - memory, - memory_mask, - cache=c) - new_cache.append(x) - if self.normalize_before: - y = self.after_norm(x[:, -1]) - else: - y = x[:, -1] - if self.use_output_layer: - y = torch.log_softmax(self.output_layer(y), dim=-1) - return y, new_cache - - -class BiTransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - r_num_blocks: the number of right to left decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - r_num_blocks: int = 0, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - - assert check_argument_types() - super().__init__() - self.left_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - self.right_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - r_num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor, - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), - used for right to left decoder - reverse_weight: used for right to left decoder - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - r_x: x: decoded token score (right to left decoder) - before softmax (batch, maxlen_out, vocab_size) - if use_output_layer is True, - olens: (batch, ) - """ - l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, - ys_in_lens) - r_x = torch.tensor(0.0) - if reverse_weight > 0.0: - r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, - ys_in_lens) - return l_x, r_x, olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - return self.left_decoder.forward_one_step(memory, memory_mask, tgt, - tgt_mask, cache) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/decoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/decoder_layer.py deleted file mode 100644 index 6b52aa6ab730dc51b18f0787e8236ab10c1e9cad..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/decoder_layer.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoder self-attention layer definition.""" -from typing import Optional, Tuple - -import torch -from torch import nn - - -class DecoderLayer(nn.Module): - """Single decoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn (torch.nn.Module): Inter-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. 
- dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's inpu - and output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: nn.Module, - src_attn: nn.Module, - feed_forward: nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an DecoderLayer object.""" - super().__init__() - self.size = size - self.self_attn = self_attn - self.src_attn = src_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.norm3 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear1 = nn.Linear(size + size, size) - self.concat_linear2 = nn.Linear(size + size, size) - else: - self.concat_linear1 = nn.Identity() - self.concat_linear2 = nn.Identity() - - def forward( - self, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - memory: torch.Tensor, - memory_mask: torch.Tensor, - cache: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute decoded features. - - Args: - tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). - tgt_mask (torch.Tensor): Mask for input tensor - (#batch, maxlen_out). - memory (torch.Tensor): Encoded memory - (#batch, maxlen_in, size). - memory_mask (torch.Tensor): Encoded memory mask - (#batch, maxlen_in). - cache (torch.Tensor): cached tensors. - (#batch, maxlen_out - 1, size). - - Returns: - torch.Tensor: Output tensor (#batch, maxlen_out, size). - torch.Tensor: Mask for output tensor (#batch, maxlen_out). - torch.Tensor: Encoded memory (#batch, maxlen_in, size). - torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
- - """ - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - - if cache is None: - tgt_q = tgt - tgt_q_mask = tgt_mask - else: - # compute only the last frame query keeping dim: max_time_out -> 1 - assert cache.shape == ( - tgt.shape[0], - tgt.shape[1] - 1, - self.size, - ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" - tgt_q = tgt[:, -1:, :] - residual = residual[:, -1:, :] - tgt_q_mask = tgt_mask[:, -1:, :] - - if self.concat_after: - tgt_concat = torch.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) - x = residual + self.concat_linear1(tgt_concat) - else: - x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - if self.concat_after: - x_concat = torch.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) - x = residual + self.concat_linear2(x_concat) - else: - x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) - if not self.normalize_before: - x = self.norm2(x) - - residual = x - if self.normalize_before: - x = self.norm3(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm3(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, tgt_mask, memory, memory_mask diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/embedding.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/embedding.py deleted file mode 100644 index 611a927864d93c3ad8357f66c780bf537b2a4d67..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/embedding.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Positonal Encoding Module.""" - -import math -from typing import Tuple, Union - -import torch -import torch.nn.functional as F - -class PositionalEncoding(torch.nn.Module): - """Positional encoding. 
- - :param int d_model: embedding dim - :param float dropout_rate: dropout rate - :param int max_len: maximum input length - - PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) - PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) - """ - def __init__(self, - d_model: int, - dropout_rate: float, - max_len: int = 5000, - reverse: bool = False): - """Construct an PositionalEncoding object.""" - super().__init__() - self.d_model = d_model - self.xscale = math.sqrt(self.d_model) - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - - self.pe = torch.zeros(self.max_len, self.d_model) - position = torch.arange(0, self.max_len, - dtype=torch.float32).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = torch.sin(position * div_term) - self.pe[:, 1::2] = torch.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input. Its shape is (batch, time, ...) - offset (int, torch.tensor): position offset - - Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) - torch.Tensor: for compatibility to RelPositionalEncoding - """ - - self.pe = self.pe.to(x.device) - pos_emb = self.position_encoding(offset, x.size(1), False) - x = x * self.xscale + pos_emb - return self.dropout(x), self.dropout(pos_emb) - - def position_encoding(self, offset: Union[int, torch.Tensor], size: int, - apply_dropout: bool = True) -> torch.Tensor: - """ For getting encoding in a streaming fashion - - Attention!!!!! - we apply dropout only once at the whole utterance level in a none - streaming way, but will call this function several times with - increasing input size in a streaming scenario, so the dropout will - be applied several times. - - Args: - offset (int or torch.tensor): start offset - size (int): required size of position encoding - - Returns: - torch.Tensor: Corresponding encoding - """ - # How to subscript a Union type: - # https://github.com/pytorch/pytorch/issues/69434 - if isinstance(offset, int): - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - else: # for batched streaming decoding on GPU - assert torch.max(offset) + size < self.max_len - index = offset.unsqueeze(1) + \ - torch.arange(0, size).to(offset.device) # B X T - flag = index > 0 - # remove negative offset - index = index * flag - pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model - - if apply_dropout: - pos_emb = self.dropout(pos_emb) - return pos_emb - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding module. - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - """ - def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): - """Initialize class.""" - super().__init__(d_model, dropout_rate, max_len, reverse=True) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Compute positional encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). 
- Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Positional embedding tensor (1, time, `*`). - """ - self.pe = self.pe.to(x.device) - x = x * self.xscale - pos_emb = self.position_encoding(offset, x.size(1), False) - return self.dropout(x), self.dropout(pos_emb) - - -class NoPositionalEncoding(torch.nn.Module): - """ No position encoding - """ - def __init__(self, d_model: int, dropout_rate: float): - super().__init__() - self.d_model = d_model - self.dropout = torch.nn.Dropout(p=dropout_rate) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """ Just return zero vector for interface compatibility - """ - pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) - return self.dropout(x), pos_emb - - def position_encoding( - self, offset: Union[int, torch.Tensor], size: int) -> torch.Tensor: - return torch.zeros(1, size, self.d_model) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/encoder.py deleted file mode 100644 index bb2ec65827548bd1242cb3b367cb3983c2de6119..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/encoder.py +++ /dev/null @@ -1,462 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
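
The embedding module removed above builds a fixed sinusoidal table once and then slices it at `offset` for streaming decoding (`position_encoding`). A compact sketch of that table construction and offset slicing; the sizes below are illustrative:

```python
import math
import torch

d_model, max_len = 8, 16
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32)
                     * -(math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)   # even dims: sine
pe[:, 1::2] = torch.cos(position * div_term)   # odd dims: cosine
pe = pe.unsqueeze(0)                           # (1, max_len, d_model)

offset, size = 4, 3                            # e.g. a later chunk of a stream
pos_emb = pe[:, offset:offset + size]          # (1, size, d_model)
print(pos_emb.shape)
```
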
-# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder definition.""" -from typing import Tuple - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.convolution import ConvolutionModule -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.encoder_layer import TransformerEncoderLayer -from wenet.transformer.encoder_layer import ConformerEncoderLayer -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class BaseEncoder(torch.nn.Module): - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ - Args: - input_size (int): input dim - output_size (int): dimension of attention - attention_heads (int): the number of heads of multi head attention - linear_units (int): the hidden units number of position-wise feed - forward - num_blocks (int): the number of decoder blocks - dropout_rate (float): dropout rate - attention_dropout_rate (float): dropout rate in attention - positional_dropout_rate (float): dropout rate after adding - positional encoding - input_layer (str): input layer type. - optional [linear, conv2d, conv2d6, conv2d8] - pos_enc_layer_type (str): Encoder positional encoding layer type. - opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] - normalize_before (bool): - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after (bool): whether to concat attention layer's input - and output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - static_chunk_size (int): chunk size for static chunk training and - decoding - use_dynamic_chunk (bool): whether use dynamic chunk size for - training or not, You can only use fixed chunk(chunk_size > 0) - or dyanmic chunk size(use_dynamic_chunk = True) - global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module - use_dynamic_left_chunk (bool): whether use dynamic left chunk in - dynamic chunk training - """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks - - -class TransformerEncoder(BaseEncoder): - """Transformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ Construct TransformerEncoder - - See Encoder for the meaning of each parameter. 
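As a worked example of the window arithmetic in `forward_chunk_by_chunk` above, assuming the usual `Conv2dSubsampling4` values (`subsampling_rate = 4`, `right_context = 6`) and a `decoding_chunk_size` of 16 (these concrete numbers are illustrative, not taken from the diff):

```python
# Streaming window arithmetic (illustrative sketch).
subsampling = 4
right_context = 6
decoding_chunk_size = 16

context = right_context + 1                                          # include current frame -> 7
stride = subsampling * decoding_chunk_size                           # 64 input frames per step
decoding_window = (decoding_chunk_size - 1) * subsampling + context  # 67 input frames per chunk

num_frames = 200
for cur in range(0, num_frames - context + 1, stride):
    end = min(cur + decoding_window, num_frames)
    print(f"feed frames [{cur}, {end}) -> about {decoding_chunk_size} encoder frames")
```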
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - self.encoders = torch.nn.ModuleList([ - TransformerEncoderLayer( - output_size, - MultiHeadedAttention(attention_heads, output_size, - attention_dropout_rate), - PositionwiseFeedForward(output_size, linear_units, - dropout_rate), dropout_rate, - normalize_before, concat_after) for _ in range(num_blocks) - ]) - - -class ConformerEncoder(BaseEncoder): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - positionwise_conv_kernel_size: int = 1, - macaron_style: bool = True, - selfattention_layer_type: str = "rel_selfattn", - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - ): - """Construct ConformerEncoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - positionwise_conv_kernel_size (int): Kernel size of positionwise - conv1d layer. - macaron_style (bool): Whether to use macaron style for - positionwise layer. - selfattention_layer_type (str): Encoder attention layer type, - the parameter has no effect now, it's just for configure - compatibility. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, - cnn_module_norm, causal) - - self.encoders = torch.nn.ModuleList([ - ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(num_blocks) - ]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/encoder_layer.py deleted file mode 100644 index 6b4629a6802a90422fa1494f82f46488f2553c16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/encoder_layer.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple - -import torch -from torch import nn - - -class TransformerEncoderLayer(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: torch.nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): just for interface compatibility - to ConformerEncoderLayer - mask_pad (torch.Tensor): does not used in transformer layer, - just for unified api with conformer. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2), not used here, it's for interface - compatibility to ConformerEncoderLayer. - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). - - """ - residual = x - if self.normalize_before: - x = self.norm1(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, cache=att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - return x, mask, new_att_cache, fake_cnn_cache - - -class ConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/label_smoothing_loss.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/label_smoothing_loss.py deleted file mode 100644 index 428fedcb0eb4345cd1361c97008a9afcd94ac171..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/label_smoothing_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Label smoothing module.""" - -import torch -from torch import nn - - -class LabelSmoothingLoss(nn.Module): - """Label-smoothing loss. - - In a standard CE loss, the label's data distribution is: - [0,1,2] -> - [ - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0], - ] - - In the smoothing version CE Loss,some probabilities - are taken from the true label prob (1.0) and are divided - among other labels. - - e.g. 
- smoothing=0.1 - [0,1,2] -> - [ - [0.9, 0.05, 0.05], - [0.05, 0.9, 0.05], - [0.05, 0.05, 0.9], - ] - - Args: - size (int): the number of class - padding_idx (int): padding class id which will be ignored for loss - smoothing (float): smoothing rate (0.0 means the conventional CE) - normalize_length (bool): - normalize loss by sequence length if True - normalize loss by batch size if False - """ - def __init__(self, - size: int, - padding_idx: int, - smoothing: float, - normalize_length: bool = False): - """Construct an LabelSmoothingLoss object.""" - super(LabelSmoothingLoss, self).__init__() - self.criterion = nn.KLDivLoss(reduction="none") - self.padding_idx = padding_idx - self.confidence = 1.0 - smoothing - self.smoothing = smoothing - self.size = size - self.normalize_length = normalize_length - - def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """Compute loss between x and target. - - The model outputs and data labels tensors are flatten to - (batch*seqlen, class) shape and a mask is applied to the - padding part which should not be calculated for loss. - - Args: - x (torch.Tensor): prediction (batch, seqlen, class) - target (torch.Tensor): - target signal masked with self.padding_id (batch, seqlen) - Returns: - loss (torch.Tensor) : The KL loss, scalar float value - """ - assert x.size(2) == self.size - batch_size = x.size(0) - x = x.view(-1, self.size) - target = target.view(-1) - # use zeros_like instead of torch.no_grad() for true_dist, - # since no_grad() can not be exported by JIT - true_dist = torch.zeros_like(x) - true_dist.fill_(self.smoothing / (self.size - 1)) - ignore = target == self.padding_idx # (B,) - total = len(target) - ignore.sum().item() - target = target.masked_fill(ignore, 0) # avoid -1 index - true_dist.scatter_(1, target.unsqueeze(1), self.confidence) - kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) - denom = total if self.normalize_length else batch_size - return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/positionwise_feed_forward.py deleted file mode 100644 index 73ba239e3f1e68f65650961f2c4ee6758729a06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/positionwise_feed_forward.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. 
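A small standalone sketch (illustrative only, not part of the diff) of how `LabelSmoothingLoss.forward` above builds the smoothed target distribution with `scatter_`; the real loss masks the padded positions when summing the KL term rather than zeroing the distribution:

```python
import torch

size, smoothing, padding_idx = 3, 0.1, -1
confidence = 1.0 - smoothing
target = torch.tensor([0, 1, 2, padding_idx])          # last position is padding

# off-target mass is spread evenly over the remaining classes
true_dist = torch.full((target.size(0), size), smoothing / (size - 1))
ignore = target == padding_idx
safe_target = target.masked_fill(ignore, 0)             # avoid indexing with -1
true_dist.scatter_(1, safe_target.unsqueeze(1), confidence)
print(true_dist)
# tensor([[0.9000, 0.0500, 0.0500],
#         [0.0500, 0.9000, 0.0500],
#         [0.0500, 0.0500, 0.9000],
#         [0.9000, 0.0500, 0.0500]])  <- padded row; the loss masks it out afterwards
```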
- dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU()): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/subsampling.py deleted file mode 100644 index 5f2823eedf0e623188d6af6680fa50ca44b47877..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/subsampling.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch - - -class BaseSubsampling(torch.nn.Module): - def __init__(self): - super().__init__() - self.right_context = 0 - self.subsampling_rate = 1 - - def position_encoding(self, offset: Union[int, torch.Tensor], - size: int) -> torch.Tensor: - return self.pos_enc.position_encoding(offset, size) - - -class LinearNoSubsampling(BaseSubsampling): - """Linear transform the input without subsampling - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an linear object.""" - super().__init__() - self.out = torch.nn.Sequential( - torch.nn.Linear(idim, odim), - torch.nn.LayerNorm(odim, eps=1e-5), - torch.nn.Dropout(dropout_rate), - ) - self.pos_enc = pos_enc_class - self.right_context = 0 - self.subsampling_rate = 1 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Input x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: linear input tensor (#batch, time', odim), - where time' = time . - torch.Tensor: linear input mask (#batch, 1, time'), - where time' = time . - - """ - x = self.out(x) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask - - -class Conv2dSubsampling4(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). 
- - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 4. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 4. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] - - -class Conv2dSubsampling6(BaseSubsampling): - """Convolutional 2D subsampling (to 1/6 length). - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (torch.nn.Module): Custom position encoding layer. - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling6 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 5, 3), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), - odim) - self.pos_enc = pos_enc_class - # 10 = (3 - 1) * 1 + (5 - 1) * 2 - self.subsampling_rate = 6 - self.right_context = 10 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 6. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 6. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] - - -class Conv2dSubsampling8(BaseSubsampling): - """Convolutional 2D subsampling (to 1/8 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
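For reference, the output-length arithmetic behind the `Conv2dSubsampling{4,6,8}` classes above can be checked with a short sketch (illustrative only): each unpadded `Conv2d` maps a length `L` to `(L - kernel) // stride + 1`, which is where the `((idim - 1) // 2 - 1) // 2`-style factors in the `Linear` layers come from.

```python
# Illustrative sketch of the subsampled lengths (not part of the diff).
def conv_out(length: int, kernel: int, stride: int) -> int:
    return (length - kernel) // stride + 1

def subsampled_len_4(t: int) -> int:      # two (3, 2) convs -> ~1/4
    return conv_out(conv_out(t, 3, 2), 3, 2)

def subsampled_len_6(t: int) -> int:      # (3, 2) then (5, 3) -> ~1/6
    return conv_out(conv_out(t, 3, 2), 5, 3)

def subsampled_len_8(t: int) -> int:      # three (3, 2) convs -> ~1/8
    return conv_out(conv_out(conv_out(t, 3, 2), 3, 2), 3, 2)

for t in (100, 384):
    print(t, subsampled_len_4(t), subsampled_len_6(t), subsampled_len_8(t))
# 100 24 15 11
# 384 95 63 47
```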
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling8 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear( - odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) - self.pos_enc = pos_enc_class - self.subsampling_rate = 8 - # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 - self.right_context = 14 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 8. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 8. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/swish.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/swish.py deleted file mode 100644 index b4250f5c93104f38958d145572e363256e03fcb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/transformer/swish.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) -# 2020 Northwestern Polytechnical University (Pengcheng Guo) -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Swish() activation function for Conformer.""" - -import torch - - -class Swish(torch.nn.Module): - """Construct an Swish object.""" - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Return Swish activation function.""" - return x * torch.sigmoid(x) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/checkpoint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/checkpoint.py deleted file mode 100644 index 8e0c413c79c34cd667240357d7ef9eab816a885c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/checkpoint.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import re - -import yaml -import torch -from collections import OrderedDict - -import datetime - - -def load_checkpoint(model: torch.nn.Module, path: str) -> dict: - if torch.cuda.is_available(): - logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) - checkpoint = torch.load(path) - else: - logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) - checkpoint = torch.load(path, map_location='cpu') - model.load_state_dict(checkpoint, strict=False) - info_path = re.sub('.pt$', '.yaml', path) - configs = {} - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - return configs - - -def save_checkpoint(model: torch.nn.Module, path: str, infos=None): - ''' - Args: - infos (dict or None): any info you want to save. - ''' - logging.info('Checkpoint: save to checkpoint %s' % path) - if isinstance(model, torch.nn.DataParallel): - state_dict = model.module.state_dict() - elif isinstance(model, torch.nn.parallel.DistributedDataParallel): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save(state_dict, path) - info_path = re.sub('.pt$', '.yaml', path) - if infos is None: - infos = {} - infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') - with open(info_path, 'w') as fout: - data = yaml.dump(infos) - fout.write(data) - - -def filter_modules(model_state_dict, modules): - new_mods = [] - incorrect_mods = [] - mods_model = model_state_dict.keys() - for mod in modules: - if any(key.startswith(mod) for key in mods_model): - new_mods += [mod] - else: - incorrect_mods += [mod] - if incorrect_mods: - logging.warning( - "module(s) %s don't match or (partially match) " - "available modules in model.", - incorrect_mods, - ) - logging.warning("for information, the existing modules in model are:") - logging.warning("%s", mods_model) - - return new_mods - - -def load_trained_modules(model: torch.nn.Module, args: None): - # Load encoder modules with pre-trained model(s). 
- enc_model_path = args.enc_init - enc_modules = args.enc_init_mods - main_state_dict = model.state_dict() - logging.warning("model(s) found for pre-initialization") - if os.path.isfile(enc_model_path): - logging.info('Checkpoint: loading from checkpoint %s for CPU' % - enc_model_path) - model_state_dict = torch.load(enc_model_path, map_location='cpu') - modules = filter_modules(model_state_dict, enc_modules) - partial_state_dict = OrderedDict() - for key, value in model_state_dict.items(): - if any(key.startswith(m) for m in modules): - partial_state_dict[key] = value - main_state_dict.update(partial_state_dict) - else: - logging.warning("model was not found : %s", enc_model_path) - - model.load_state_dict(main_state_dict) - configs = {} - return configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/cmvn.py deleted file mode 100644 index 3101c619f54991c947124f393f3459c317356a2f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/cmvn.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import math - -import numpy as np - - -def _load_json_cmvn(json_cmvn_file): - """ Load the json format cmvn stats file and calculate cmvn - - Args: - json_cmvn_file: cmvn stats file in json format - - Returns: - a numpy array of [means, vars] - """ - with open(json_cmvn_file) as f: - cmvn_stats = json.load(f) - - means = cmvn_stats['mean_stat'] - variance = cmvn_stats['var_stat'] - count = cmvn_stats['frame_num'] - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def _load_kaldi_cmvn(kaldi_cmvn_file): - """ Load the kaldi format cmvn stats file and calculate cmvn - - Args: - kaldi_cmvn_file: kaldi text style global cmvn file, which - is generated by: - compute-cmvn-stats --binary=false scp:feats.scp global_cmvn - - Returns: - a numpy array of [means, vars] - """ - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def load_cmvn(cmvn_file, is_json): - if is_json: - cmvn = _load_json_cmvn(cmvn_file) - else: - cmvn = _load_kaldi_cmvn(cmvn_file) - return cmvn[0], cmvn[1] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/common.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/common.py deleted file mode 100644 index 74238d59aefbf227fe6b811703af17550bc7f8f0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/common.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -"""Unility functions for Transformer.""" - -import math -from typing import List, Tuple - -import torch -from torch.nn.utils.rnn import pad_sequence - -IGNORE_ID = -1 - - -def pad_list(xs: List[torch.Tensor], pad_value: int): - """Perform padding for the list of tensors. - - Args: - xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 
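A minimal sketch (not part of the diff) of the statistics-to-CMVN conversion that `_load_json_cmvn` above performs, using toy numbers:

```python
import math

def cmvn_from_stats(mean_stat, var_stat, frame_num, floor=1.0e-20):
    """Turn accumulated sums into per-dimension mean and inverse std-dev."""
    means, istd = [], []
    for m, v in zip(mean_stat, var_stat):
        mean = m / frame_num
        var = max(v / frame_num - mean * mean, floor)   # floor tiny variances
        means.append(mean)
        istd.append(1.0 / math.sqrt(var))
    return means, istd

# Toy stats for a 2-dim feature accumulated over 4 frames.
means, istd = cmvn_from_stats([8.0, 4.0], [20.0, 6.0], 4)
print(means, istd)   # normalization is then (x - mean) * istd
```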
- pad_value (float): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tmax, `*`). - - Examples: - >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) - - """ - n_batch = len(xs) - max_len = max([x.size(0) for x in xs]) - pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) - pad = pad.fill_(pad_value) - for i in range(n_batch): - pad[i, :xs[i].size(0)] = xs[i] - - return pad - - -def add_blank(ys_pad: torch.Tensor, blank: int, - ignore_id: int) -> torch.Tensor: - """ Prepad blank for transducer predictor - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - blank (int): index of - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> blank = 0 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in = add_blank(ys_pad, 0, -1) - >>> ys_in - tensor([[0, 1, 2, 3, 4, 5], - [0, 4, 5, 6, 0, 0], - [0, 7, 8, 9, 0, 0]]) - """ - bs = ys_pad.size(0) - _blank = torch.tensor([blank], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _blank = _blank.repeat(bs).unsqueeze(1) # [bs,1] - out = torch.cat([_blank, ys_pad], dim=1) # [bs, Lmax+1] - return torch.where(out == ignore_id, blank, out) - - -def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, - ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Add and labels. - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - sos (int): index of - eos (int): index of - ignore_id (int): index of padding - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - ys_out (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> sos_id = 10 - >>> eos_id = 11 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) - >>> ys_in - tensor([[10, 1, 2, 3, 4, 5], - [10, 4, 5, 6, 11, 11], - [10, 7, 8, 9, 11, 11]]) - >>> ys_out - tensor([[ 1, 2, 3, 4, 5, 11], - [ 4, 5, 6, 11, -1, -1], - [ 7, 8, 9, 11, -1, -1]]) - """ - _sos = torch.tensor([sos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _eos = torch.tensor([eos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys - ys_in = [torch.cat([_sos, y], dim=0) for y in ys] - ys_out = [torch.cat([y, _eos], dim=0) for y in ys] - return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) - - -def reverse_pad_list(ys_pad: torch.Tensor, - ys_lens: torch.Tensor, - pad_value: float = -1.0) -> torch.Tensor: - """Reverse padding for the list of tensors. - - Args: - ys_pad (tensor): The padded tensor (B, Tokenmax). - ys_lens (tensor): The lens of token seqs (B) - pad_value (int): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tokenmax). - - Examples: - >>> x - tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) - >>> pad_list(x, 0) - tensor([[4, 3, 2, 1], - [7, 6, 5, 0], - [9, 8, 0, 0]]) - - """ - r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) - for y, i in zip(ys_pad, ys_lens)], True, - pad_value) - return r_ys_pad - - -def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, - ignore_label: int) -> float: - """Calculate accuracy. - - Args: - pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 
- pad_targets (LongTensor): Target label tensors (B, Lmax). - ignore_label (int): Ignore label id. - - Returns: - float: Accuracy value (0.0 - 1.0). - - """ - pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), - pad_outputs.size(1)).argmax(2) - mask = pad_targets != ignore_label - numerator = torch.sum( - pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - denominator = torch.sum(mask) - return float(numerator) / float(denominator) - - -def get_rnn(rnn_type: str) -> torch.nn.Module: - assert rnn_type in ["rnn", "lstm", "gru"] - if rnn_type == "rnn": - return torch.nn.RNN - elif rnn_type == "lstm": - return torch.nn.LSTM - else: - return torch.nn.GRU - - -def get_activation(act): - """Return activation function.""" - # Lazy load to avoid unused import - from wenet.transformer.swish import Swish - - activation_funcs = { - "hardtanh": torch.nn.Hardtanh, - "tanh": torch.nn.Tanh, - "relu": torch.nn.ReLU, - "selu": torch.nn.SELU, - "swish": getattr(torch.nn, "SiLU", Swish), - "gelu": torch.nn.GELU - } - - return activation_funcs[act]() - - -def get_subsample(config): - input_layer = config["encoder_conf"]["input_layer"] - assert input_layer in ["conv2d", "conv2d6", "conv2d8"] - if input_layer == "conv2d": - return 4 - elif input_layer == "conv2d6": - return 6 - elif input_layer == "conv2d8": - return 8 - - -def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - if hyp[cur] != 0: - new_hyp.append(hyp[cur]) - prev = cur - while cur < len(hyp) and hyp[cur] == hyp[prev]: - cur += 1 - return new_hyp - - -def replace_duplicates_with_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - new_hyp.append(hyp[cur]) - prev = cur - cur += 1 - while cur < len(hyp) and hyp[cur] == hyp[prev] and hyp[cur] != 0: - new_hyp.append(0) - cur += 1 - return new_hyp - - -def log_add(args: List[int]) -> float: - """ - Stable log add - """ - if all(a == -float('inf') for a in args): - return -float('inf') - a_max = max(args) - lsp = math.log(sum(math.exp(a - a_max) for a in args)) - return a_max + lsp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/config.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/config.py deleted file mode 100644 index 50170ced44534d3ee6532a2f87fcd78c5148f7e7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 Shaoshang Qi -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
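Illustrative usage of two of the CTC helpers above, `remove_duplicates_and_blank` and `log_add` (the function bodies are copied from the file so the sketch is self-contained):

```python
import math

def remove_duplicates_and_blank(hyp):
    new_hyp, cur = [], 0
    while cur < len(hyp):
        if hyp[cur] != 0:                 # drop blanks (id 0)
            new_hyp.append(hyp[cur])
        prev = cur
        while cur < len(hyp) and hyp[cur] == hyp[prev]:
            cur += 1                      # collapse repeats
    return new_hyp

def log_add(args):
    if all(a == -float('inf') for a in args):
        return -float('inf')
    a_max = max(args)                     # subtract the max for stability
    return a_max + math.log(sum(math.exp(a - a_max) for a in args))

print(remove_duplicates_and_blank([0, 3, 3, 0, 0, 5, 5, 5, 0]))  # [3, 5]
print(log_add([math.log(0.1), math.log(0.3)]))                    # ~= log(0.4)
```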
- - -import copy - -def override_config(configs, override_list): - new_configs = copy.deepcopy(configs) - for item in override_list: - arr = item.split() - if len(arr) != 2: - print(f"the overrive {item} format not correct, skip it") - continue - keys = arr[0].split('.') - s_configs = new_configs - for i, key in enumerate(keys): - if key not in s_configs: - print(f"the overrive {item} format not correct, skip it") - if i == len(keys) - 1: - param_type = type(s_configs[key]) - if param_type != bool: - s_configs[key] = param_type(arr[1]) - else: - s_configs[key] = arr[1] in ['true', 'True'] - print(f"override {arr[0]} with {arr[1]}") - else: - s_configs = s_configs[key] - return new_configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/ctc_util.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/ctc_util.py deleted file mode 100644 index 73b8fb272ac153dd6d05207f352ebcf1ad14890d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/ctc_util.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch - -def insert_blank(label, blank_id=0): - """Insert blank token between every two label token.""" - label = np.expand_dims(label, 1) - blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id - label = np.concatenate([blanks, label], axis=1) - label = label.reshape(-1) - label = np.append(label, label[0]) - return label - -def forced_align(ctc_probs: torch.Tensor, - y: torch.Tensor, - blank_id=0) -> list: - """ctc forced alignment. 
- - Args: - torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) - torch.Tensor y: id sequence tensor 1d tensor (L) - int blank_id: blank symbol index - Returns: - torch.Tensor: alignment result - """ - y_insert_blank = insert_blank(y, blank_id) - - log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) - log_alpha = log_alpha - float('inf') # log of zero - state_path = (torch.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 - ) # state path - - # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] - - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): - if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ - s] == y_insert_blank[s - 2]: - candidates = torch.tensor( - [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) - prev_state = [s, s - 1] - else: - candidates = torch.tensor([ - log_alpha[t - 1, s], - log_alpha[t - 1, s - 1], - log_alpha[t - 1, s - 2], - ]) - prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] - state_path[t, s] = prev_state[torch.argmax(candidates)] - - state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) - - candidates = torch.tensor([ - log_alpha[-1, len(y_insert_blank) - 1], - log_alpha[-1, len(y_insert_blank) - 2] - ]) - prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] - state_seq[-1] = prev_state[torch.argmax(candidates)] - for t in range(ctc_probs.size(0) - 2, -1, -1): - state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] - - output_alignment = [] - for t in range(0, ctc_probs.size(0)): - output_alignment.append(y_insert_blank[state_seq[t, 0]]) - - return output_alignment diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/executor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/executor.py deleted file mode 100644 index dc0b69e6e32055566a0e8c41945f6979276e5672..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/executor.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -from contextlib import nullcontext - -# if your python version < 3.7 use the below one -# from contextlib import suppress as nullcontext -import torch -from torch.nn.utils import clip_grad_norm_ - - -class Executor: - - def __init__(self): - self.step = 0 - - def train(self, model, optimizer, scheduler, data_loader, device, writer, - args, scaler): - ''' Train one epoch - ''' - model.train() - clip = args.get('grad_clip', 50.0) - log_interval = args.get('log_interval', 10) - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - accum_grad = args.get('accum_grad', 1) - is_distributed = args.get('is_distributed', True) - use_amp = args.get('use_amp', False) - logging.info('using accumulate grad, new batch size is {} times' - ' larger than before'.format(accum_grad)) - if use_amp: - assert scaler is not None - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - model_context = model.join - else: - model_context = nullcontext - num_seen_utts = 0 - with model_context(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - context = None - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if is_distributed and batch_idx % accum_grad != 0: - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - with context(): - # autocast context - # The more details about amp can be found in - # https://pytorch.org/docs/stable/notes/amp_examples.html - with torch.cuda.amp.autocast(scaler is not None): - loss_dict = model(feats, feats_lengths, target, - target_lengths) - loss = loss_dict['loss'] / accum_grad - if use_amp: - scaler.scale(loss).backward() - else: - loss.backward() - - num_seen_utts += num_utts - if batch_idx % accum_grad == 0: - if rank == 0 and writer is not None: - writer.add_scalar('train_loss', loss, self.step) - # Use mixed precision training - if use_amp: - scaler.unscale_(optimizer) - grad_norm = clip_grad_norm_(model.parameters(), clip) - # Must invoke scaler.update() if unscale_() is used in - # the iteration to avoid the following error: - # RuntimeError: unscale_() has already been called - # on this optimizer since the last update(). - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). 
- scaler.step(optimizer) - scaler.update() - else: - grad_norm = clip_grad_norm_(model.parameters(), clip) - if torch.isfinite(grad_norm): - optimizer.step() - optimizer.zero_grad() - scheduler.step() - self.step += 1 - if batch_idx % log_interval == 0: - lr = optimizer.param_groups[0]['lr'] - log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, - loss.item() * accum_grad) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'lr {:.8f} rank {}'.format(lr, rank) - logging.debug(log_str) - - def cv(self, model, data_loader, device, args): - ''' Cross validation on - ''' - model.eval() - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - log_interval = args.get('log_interval', 10) - # in order to avoid division by 0 - num_seen_utts = 1 - total_loss = 0.0 - with torch.no_grad(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - loss_dict = model(feats, feats_lengths, target, target_lengths) - loss = loss_dict['loss'] - if torch.isfinite(loss): - num_seen_utts += num_utts - total_loss += loss.item() * num_utts - if batch_idx % log_interval == 0: - log_str = 'CV Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, loss.item()) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'history loss {:.6f}'.format(total_loss / - num_seen_utts) - log_str += ' rank {}'.format(rank) - logging.debug(log_str) - return total_loss, num_seen_utts diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/file_utils.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/file_utils.py deleted file mode 100644 index 7b7e516cc61f759267f4ef09309ff0b45110a0c1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/file_utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re - - -def read_lists(list_file): - lists = [] - with open(list_file, 'r', encoding='utf8') as fin: - for line in fin: - lists.append(line.strip()) - return lists - - -def read_non_lang_symbols(non_lang_sym_path): - """read non-linguistic symbol from file. - - The file format is like below: - - {NOISE}\n - {BRK}\n - ... - - - Args: - non_lang_sym_path: non-linguistic symbol file path, None means no any - syms. 
- - """ - if non_lang_sym_path is None: - return None - else: - syms = read_lists(non_lang_sym_path) - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - for sym in syms: - if non_lang_syms_pattern.fullmatch(sym) is None: - class BadSymbolFormat(Exception): - pass - raise BadSymbolFormat( - "Non-linguistic symbols should be " - "formatted in {xxx}//[xxx], consider" - " modify '%s' to meet the requirment. " - "More details can be found in discussions here : " - "https://github.com/wenet-e2e/wenet/pull/819" % (sym)) - return syms - - -def read_symbol_table(symbol_table_file): - symbol_table = {} - with open(symbol_table_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - symbol_table[arr[0]] = int(arr[1]) - return symbol_table diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/init_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/init_model.py deleted file mode 100644 index 4a008183ee25cd88b2fa25d93bdc3f9e3a55d31a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/init_model.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -from wenet.transducer.joint import TransducerJoint -from wenet.transducer.predictor import (ConvPredictor, EmbeddingPredictor, - RNNPredictor) -from wenet.transducer.transducer import Transducer -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.cmvn import GlobalCMVN -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.encoder import ConformerEncoder, TransformerEncoder -from wenet.squeezeformer.encoder import SqueezeformerEncoder -from wenet.efficient_conformer.encoder import EfficientConformerEncoder -from wenet.utils.cmvn import load_cmvn - - -def init_model(configs): - if configs['cmvn_file'] is not None: - mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) - global_cmvn = GlobalCMVN( - torch.from_numpy(mean).float(), - torch.from_numpy(istd).float()) - else: - global_cmvn = None - - input_dim = configs['input_dim'] - vocab_size = configs['output_dim'] - - encoder_type = configs.get('encoder', 'conformer') - decoder_type = configs.get('decoder', 'bitransformer') - - if encoder_type == 'conformer': - encoder = ConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'squeezeformer': - encoder = SqueezeformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'efficientConformer': - encoder = EfficientConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf'], - **configs['encoder_conf']['efficient_conf'] - if 'efficient_conf' in - configs['encoder_conf'] else {}) - else: - encoder = TransformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - if decoder_type == 'transformer': - decoder = TransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - else: - assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 - assert configs['decoder_conf']['r_num_blocks'] > 0 - decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - ctc = CTC(vocab_size, encoder.output_size()) - - # Init joint CTC/Attention or Transducer model - if 'predictor' in configs: - predictor_type = configs.get('predictor', 'rnn') - if predictor_type == 'rnn': - predictor = RNNPredictor(vocab_size, **configs['predictor_conf']) - elif predictor_type == 'embedding': - predictor = EmbeddingPredictor(vocab_size, - **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - elif predictor_type == 'conv': - predictor = ConvPredictor(vocab_size, **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - else: - raise NotImplementedError( - "only rnn, embedding and conv type support now") - configs['joint_conf']['enc_output_size'] = configs['encoder_conf'][ - 'output_size'] - configs['joint_conf']['pred_output_size'] = configs['predictor_conf'][ - 'output_size'] - joint = TransducerJoint(vocab_size, **configs['joint_conf']) - model = Transducer(vocab_size=vocab_size, - blank=0, - predictor=predictor, - encoder=encoder, - attention_decoder=decoder, - joint=joint, - ctc=ctc, - **configs['model_conf']) - else: - model = ASRModel(vocab_size=vocab_size, - encoder=encoder, - decoder=decoder, - ctc=ctc, - **configs['model_conf']) - return model diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/mask.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/mask.py deleted file mode 100644 index 2985006ab2bc2d27a9b8adaeb863cc44ca6a0d24..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/mask.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -''' -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. - - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - ret = torch.ones(size, size, device=device, dtype=torch.bool) - return torch.tril(ret) -''' - -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. 
- - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - arange = torch.arange(size, device=device) - mask = arange.expand(size, size) - arange = arange.unsqueeze(-1) - mask = mask <= arange - return mask - - -def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size) with chunk size, - this is for streaming encoder - - Args: - size (int): size of mask - chunk_size (int): size of chunk - num_left_chunks (int): number of left chunks - <0: use full chunk - >=0: use num_left_chunks - device (torch.device): "cpu" or "cuda" or torch.Tensor.device - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_chunk_mask(4, 2) - [[1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]] - """ - ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) - ret[i, start:ending] = True - return ret - - -def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, - use_dynamic_chunk: bool, - use_dynamic_left_chunk: bool, - decoding_chunk_size: int, static_chunk_size: int, - num_decoding_left_chunks: int): - """ Apply optional mask for encoder. - - Args: - xs (torch.Tensor): padded input, (B, L, D), L for max length - mask (torch.Tensor): mask for xs, (B, 1, L) - use_dynamic_chunk (bool): whether to use dynamic chunk or not - use_dynamic_left_chunk (bool): whether to use dynamic left chunk for - training. - decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - static_chunk_size (int): chunk size for static chunk training/decoding - if it's greater than 0, if use_dynamic_chunk is true, - this parameter will be ignored - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. - >=0: use num_decoding_left_chunks - <0: use all left chunks - - Returns: - torch.Tensor: chunk mask of the input xs. - """ - # Whether to use chunk mask or not - if use_dynamic_chunk: - max_len = xs.size(1) - if decoding_chunk_size < 0: - chunk_size = max_len - num_left_chunks = -1 - elif decoding_chunk_size > 0: - chunk_size = decoding_chunk_size - num_left_chunks = num_decoding_left_chunks - else: - # chunk size is either [1, 25] or full context(max_len). - # Since we use 4 times subsampling and allow up to 1s(100 frames) - # delay, the maximum frame is 100 / 4 = 25. 
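# Note on the sampling below: chunk_size is first drawn uniformly from
# [1, max_len); draws above max_len // 2 are promoted to full context, and the
# remaining draws are folded into [1, 25] via `% 25 + 1`, so dynamic-chunk
# training mixes full-attention batches with short streaming chunks.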
- chunk_size = torch.randint(1, max_len, (1, )).item() - num_left_chunks = -1 - if chunk_size > max_len // 2: - chunk_size = max_len - else: - chunk_size = chunk_size % 25 + 1 - if use_dynamic_left_chunk: - max_left_chunks = (max_len - 1) // chunk_size - num_left_chunks = torch.randint(0, max_left_chunks, - (1, )).item() - chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - elif static_chunk_size > 0: - num_left_chunks = num_decoding_left_chunks - chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - else: - chunk_masks = masks - return chunk_masks - - -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: Mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - batch_size = lengths.size(0) - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = torch.arange(0, - max_len, - dtype=torch.int64, - device=lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand - return mask - - -def make_non_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """Make mask tensor containing indices of non-padded part. - - The sequences in a batch may have different lengths. To enable - batch computing, padding is need to make all sequence in same - size. To avoid the padding part pass value to context dependent - block such as attention or convolution , this padding part is - masked. - - This pad_mask is used in both encoder and decoder. - - 1 for non-padded part and 0 for padded part. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] - """ - return ~make_pad_mask(lengths) - - -def mask_finished_scores(score: torch.Tensor, - flag: torch.Tensor) -> torch.Tensor: - """ - If a sequence is finished, we only allow one alive branch. This function - aims to give one branch a zero score and the rest -inf score. - - Args: - score (torch.Tensor): A real value array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size, beam_size). 
- """ - beam_size = score.size(-1) - zero_mask = torch.zeros_like(flag, dtype=torch.bool) - if beam_size > 1: - unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])), - dim=1) - finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])), - dim=1) - else: - unfinished = zero_mask - finished = flag - score.masked_fill_(unfinished, -float('inf')) - score.masked_fill_(finished, 0) - return score - - -def mask_finished_preds(pred: torch.Tensor, flag: torch.Tensor, - eos: int) -> torch.Tensor: - """ - If a sequence is finished, all of its branch should be - - Args: - pred (torch.Tensor): A int array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size). - """ - beam_size = pred.size(-1) - finished = flag.repeat([1, beam_size]) - return pred.masked_fill_(finished, eos) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/scheduler.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/scheduler.py deleted file mode 100644 index c418a731dec0041a238787bbba23102dba8db5e5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/commonvoice/fr/wenet/utils/scheduler.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -# NeMo(https://github.com/NVIDIA/NeMo) - -from typing import Union - -import math -import warnings -import torch -from torch.optim.lr_scheduler import _LRScheduler - -from typeguard import check_argument_types - - -class WarmupLR(_LRScheduler): - """The WarmupLR scheduler - - This scheduler is almost same as NoamLR Scheduler except for following - difference: - - NoamLR: - lr = optimizer.lr * model_size ** -0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - WarmupLR: - lr = optimizer.lr * warmup_step ** 0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - - Note that the maximum lr equals to optimizer.lr in this scheduler. 
- - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_steps: Union[int, float] = 25000, - last_epoch: int = -1, - ): - assert check_argument_types() - self.warmup_steps = warmup_steps - - # __init__() must be invoked before setting field - # because step() is also invoked in __init__() - super().__init__(optimizer, last_epoch) - - def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" - - def get_lr(self): - step_num = self.last_epoch + 1 - if self.warmup_steps == 0: - return [ - lr * step_num ** -0.5 - for lr in self.base_lrs - ] - else: - return [ - lr - * self.warmup_steps ** 0.5 - * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) - for lr in self.base_lrs - ] - - def set_step(self, step: int): - self.last_epoch = step - - -class WarmupPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1): - assert not (warmup_steps is not None and warmup_ratio is not None),\ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class SquareRootConstantPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use particular number of step or ratio" - assert constant_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. 
- self.max_steps = max_steps - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.constant_lr = 1 / (constant_steps ** 0.5) - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.constant_steps: - return [self.constant_lr for _ in self.base_lrs] - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class WarmupHoldPolicy(WarmupPolicy): - """Variant of WarmupPolicy which maintains high - learning rate for a defined number of steps. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - hold_steps=None, - hold_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (hold_steps is not None and hold_ratio is not None), \ - "Either use particular number of step or ratio" - assert hold_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - self.min_lr = min_lr - self._last_warmup_lr = 0.0 - - # Necessary to duplicate as class attributes are hidden in inner class - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if hold_steps is not None: - self.hold_steps = hold_steps + self.warmup_steps - elif hold_ratio is not None: - self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps - else: - self.hold_steps = 0 - - super().__init__( - optimizer, - warmup_steps=warmup_steps, - warmup_ratio=warmup_ratio, - max_steps=max_steps, - last_epoch=last_epoch, - min_lr=min_lr, - ) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed by the scheduler," - " " "please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup phase - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - # Hold phase - if (step >= self.warmup_steps) and (step < self.hold_steps): - return self.base_lrs - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - -class WarmupAnnealHoldPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - min_lr: Minimum lr to hold the learning rate after decay at. - constant_steps: Number of steps to keep lr constant at. 
- constant_ratio: Ratio of steps to keep lr constant. - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - constant_steps=None, - constant_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use constant_steps or constant_ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup steps - if self.warmup_steps > 0 and step <= self.warmup_steps: - return self._get_warmup_lr(step) - - # Constant steps after warmup and decay - if self.constant_steps > 0 and ( - self.warmup_steps + self.decay_steps) < step <= self.max_steps: - return self._get_constant_lr(step) - - # Min lr after max steps of updates - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_constant_lr(self, step): - return [self.min_lr for _ in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -def _squareroot_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 0.5 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _square_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 2 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _cosine_annealing(initial_lr, step, max_steps, min_lr): - mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) - out_lr = (initial_lr - min_lr) * mult + min_lr - return out_lr - - -def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, - decay_steps, min_lr): - assert max_lr > min_lr - # Use linear warmup for the initial part. - if warmup_steps > 0 and step <= warmup_steps: - return max_lr * float(step) / float(warmup_steps) - - # For any steps larger than `decay_steps`, use `min_lr`. - if step > warmup_steps + decay_steps: - return min_lr - - # If we are done with the warmup period, use the decay style. 
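# Descriptive note: decay_ratio below runs from 0 to 1 over decay_steps, the
# cosine coefficient runs from 1 down to 0, and the returned value therefore
# anneals smoothly from max_lr at the end of warmup down to min_lr at
# warmup_steps + decay_steps.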
- num_steps_ = step - warmup_steps - decay_steps_ = decay_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = max_lr - min_lr - - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - - return min_lr + coeff * delta_lr - - -def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): - if cycle: - multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) - decay_steps *= multiplier - else: - step = min(step, decay_steps) - p = step / decay_steps - lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) - lr += min_lr - return lr - - -def _noam_hold_annealing(initial_lr, step, warmup_steps, - hold_steps, decay_rate, min_lr): - # hold_steps = total number of steps - # to hold the LR, not the warmup + hold steps. - T_warmup_decay = max(1, warmup_steps ** decay_rate) - T_hold_decay = max(1, (step - hold_steps) ** decay_rate) - lr = (initial_lr * T_warmup_decay) / T_hold_decay - lr = max(lr, min_lr) - return lr - - -class SquareAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _square_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class SquareRootAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _squareroot_annealing(initial_lr=initial_lr, step=step, - max_steps=self.max_steps, min_lr=self.min_lr) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - if self.constant_steps is None or self.constant_steps == 0: - new_lrs = [ - _cosine_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - else: - new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) - return new_lrs - - def _get_warmup_lr(self, step): - if self.constant_steps is None or self.constant_steps == 0: - return super()._get_warmup_lr(step) - else: - # Use linear warmup for the initial part. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_constant_lr(self, step): - # Only called when `constant_steps` > 0. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_linear_warmup_with_cosine_annealing_lr(self, step): - # Cosine Schedule for Megatron LM, - # slightly different warmup schedule + constant LR at the end. 
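# Descriptive note: the Megatron-style path below passes base_lrs[0] as the
# peak for every parameter group, so all groups share a single linear-warmup
# plus cosine-decay curve instead of scaling each group's own base LR.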
- new_lrs = [ - _linear_warmup_with_cosine_annealing( - max_lr=self.base_lrs[0], - warmup_steps=self.warmup_steps, - step=step, - decay_steps=self.decay_steps, - min_lr=self.min_lr, - ) - for _ in self.base_lrs - ] - return new_lrs - - -class NoamAnnealing(_LRScheduler): - def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - self._normalize = d_model ** (-0.5) - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = max(1, self.last_epoch) - - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - new_lrs = [self._noam_annealing(initial_lr=initial_lr, step=step) for - initial_lr in self.base_lrs] - return new_lrs - - def _noam_annealing(self, initial_lr, step): - if self.warmup_steps > 0: - mult = self._normalize * min(step ** (-0.5), - step * (self.warmup_steps ** (-1.5))) - else: - mult = self._normalize * step ** (-0.5) - - out_lr = initial_lr * mult - if step > self.warmup_steps: - out_lr = max(out_lr, self.min_lr) - return out_lr - - -class NoamHoldAnnealing(WarmupHoldPolicy): - def __init__(self, optimizer, *, max_steps, decay_rate=0.5, min_lr=0.0, - last_epoch=-1, **kwargs): - """ - From Nemo: - Implementation of the Noam Hold Annealing policy - from the SqueezeFormer paper. - - Unlike NoamAnnealing, the peak learning rate - can be explicitly set for this scheduler. - The schedule first performs linear warmup, - then holds the peak LR, then decays with some schedule for - the remainder of the steps. - Therefore the min-lr is still dependent - on the hyper parameters selected. - - It's schedule is determined by three factors- - - Warmup Steps: Initial stage, where linear warmup - occurs uptil the peak LR is reached. Unlike NoamAnnealing, - the peak LR is explicitly stated here instead of a scaling factor. - - Hold Steps: Intermediate stage, where the peak LR - is maintained for some number of steps. In this region, - the high peak LR allows the model to converge faster - if training is stable. However the high LR - may also cause instability during training. - Should usually be a significant fraction of training - steps (around 30-40% of the entire training steps). - - Decay Steps: Final stage, where the LR rapidly decays - with some scaling rate (set by decay rate). - To attain Noam decay, use 0.5, - for Squeezeformer recommended decay, use 1.0. - The fast decay after prolonged high LR during - hold phase allows for rapid convergence. 
- - References: - - [Squeezeformer: - An Efficient Transformer for Automatic Speech Recognition] - (https://arxiv.org/abs/2206.00888) - - Args: - optimizer: Pytorch compatible Optimizer object. - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - decay_rate: Float value describing the polynomial decay - after the hold period. Default value - of 0.5 corresponds to Noam decay. - min_lr: Minimum learning rate. - """ - self.decay_rate = decay_rate - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - if self.warmup_steps is None or self.warmup_steps == 0: - raise ValueError( - "Noam scheduler cannot be used without warmup steps") - - if self.hold_steps > 0: - hold_steps = self.hold_steps - self.warmup_steps - else: - hold_steps = 0 - - new_lrs = [ - _noam_hold_annealing( - initial_lr, - step=step, - warmup_steps=self.warmup_steps, - hold_steps=hold_steps, - decay_rate=self.decay_rate, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - def set_step(self, step: int): - self.last_epoch = step diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/README.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/README.md deleted file mode 100644 index 82fe2662e934d9d9f498321b1406911217d23d13..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/README.md +++ /dev/null @@ -1,29 +0,0 @@ -# Performance Record - -## Conformer Result Bidecoder (large) - - -## Conformer Result - -* Feature info: using fbank feature, cmvn, dither, online speed perturb -* Training info: train_conformer.yaml, kernel size 15, lr 0.004, batch size 12, 8 gpu, acc_grad 1, 50 epochs, dither 0.0 -* Decoding info: ctc_weight 0.5, average_num 10 - - -| decoding mode | test1 | test2 | test3 | -|----------------------------------|------------|------------|------------| -| ctc greedy search | 7.94 | 5.29 | 6.10 | -| ctc prefix beam search | 7.83+ | 5.28 | 6.08 | -| attention decoder | 7.83 | 5.63 | 6.37 | -| attention rescoring | 7.28+ | 4.81 | 5.44 | - -note that "+" means we removed two <0.1s wav files in test1 before decoding. 
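The table above reports error rates in percent on the three test sets; for this Japanese corpus the metric is normally the character error rate. As a reference for how such a figure is derived, here is a minimal, self-contained sketch of CER for one reference/hypothesis pair. It is illustrative only (the strings are invented) and is not the scoring tool used to produce the table.

```python
def cer(ref: str, hyp: str) -> float:
    """Character error rate = edit distance / reference length (illustrative)."""
    r, h = list(ref), list(hyp)
    # Levenshtein distance via dynamic programming.
    d = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
    for i in range(len(r) + 1):
        d[i][0] = i
    for j in range(len(h) + 1):
        d[0][j] = j
    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            cost = 0 if r[i - 1] == h[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution
    return d[len(r)][len(h)] / max(len(r), 1)

# One substitution over a 7-character reference, about 14.3 in percent.
print(100 * cer("今日は良い天気", "今日は良い天机"))
```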
- - - - -## Conformer U2++ Result - - -## Conformer U2 Result - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/conf/train_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/conf/train_conformer.yaml deleted file mode 100644 index 461673ed7eea0889243b63df54b5fafb43c1c6f9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/conf/train_conformer.yaml +++ /dev/null @@ -1,80 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -# dataset related -dataset_conf: - filter_conf: - max_length: 2000 - min_length: 50 - token_max_length: 400 - token_min_length: 1 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.0 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 12 - -grad_clip: 5 -accum_grad: 1 -max_epoch: 50 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.004 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/csj_tools/wn.0.parse.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/csj_tools/wn.0.parse.py deleted file mode 100644 index d916a2cf030b9338eb8dc698d7f16572a638b2c8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/csj_tools/wn.0.parse.py +++ /dev/null @@ -1,119 +0,0 @@ - -# parse xml files and output simplified version - -import xml.dom.minidom -import os -import sys -import multiprocessing - -def parsexml(afile, outpath): - outfile = os.path.join(outpath, afile.split('/')[-1] + '.simp') - - with open(outfile, 'w') as bw: - domtree = xml.dom.minidom.parse(afile) - collection = domtree.documentElement - ipus = collection.getElementsByTagName('IPU') - - for ipu in ipus: - starttime = 0 - endtime = 0 - if ipu.hasAttribute('IPUStartTime'): - starttime = ipu.getAttribute('IPUStartTime') - if ipu.hasAttribute('IPUEndTime'): - endtime = ipu.getAttribute('IPUEndTime') - - # print('{}\t{}'.format(starttime, endtime)) - # ## original format ### - wlist = list() - plainwlist = list() - pronlist = list() - - # ## pronunciation ### - lemmalist = 
list() # lemma list - dictlemmalist = list() # dict lemma list - for suw in ipu.getElementsByTagName('SUW'): # short unit word - txt = '' - plaintxt = '' - # PhoneticTranscription - prontxt = '' - - if suw.hasAttribute('OrthographicTranscription'): - txt = suw.getAttribute('OrthographicTranscription') - if suw.hasAttribute('PlainOrthographicTranscription'): - plaintxt = suw.getAttribute('PlainOrthographicTranscription') - if suw.hasAttribute('PhoneticTranscription'): - prontxt = suw.getAttribute('PhoneticTranscription') - wlist.append(txt) - plainwlist.append(plaintxt) - pronlist.append(prontxt) - - lemma = '' - dictlemma = '' - - if suw.hasAttribute('SUWLemma'): - lemma = suw.getAttribute('SUWLemma') - if suw.hasAttribute('SUWDictionaryForm'): - dictlemma = suw.getAttribute('SUWDictionaryForm') - lemmalist.append(lemma) - dictlemmalist.append(dictlemma) - txtsent = ' '.join(wlist) - plaintxtsent = ' '.join(plainwlist) - prontxtsent = ' '.join(pronlist) - - lemmasent = ' '.join(lemmalist) - dictlemmasent = ' '.join(dictlemmalist) - outrow = '{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format( - starttime, endtime, txtsent, plaintxtsent, - prontxtsent, lemmasent, dictlemmasent) - bw.write(outrow) - -def procfolder_orig(apath, outpath): - count = 0 - for afile in os.listdir(apath): - if not afile.endswith('.xml'): - continue - afile = os.path.join(apath, afile) - parsexml(afile, outpath) - count += 1 - print('done: {} [{}]'.format(afile, count)) - -def procfolder(apath, outpath): - # count = 0 - fnlist = list() - for afile in os.listdir(apath): - if not afile.endswith('.xml'): - continue - fnlist.append(afile) - # now parallel processing: - nthreads = 16 - for i in range(0, len(fnlist), nthreads): - # fnlist[i, i+16] - pool = multiprocessing.Pool(processes=nthreads) - for j in range(nthreads): - if i + j < len(fnlist): - afile = os.path.join(apath, fnlist[i + j]) - pool.apply_async(parsexml, (afile, outpath)) - pool.close() - pool.join() - print('parallel {} threads done for {} files in total.'.format( - nthreads, len(fnlist))) - -if __name__ == '__main__': - if len(sys.argv) < 3: - print("Usage: {} ".format(sys.argv[0])) - exit(1) - # e.g., csjpath='/workspace/asr/csj/' - csjpath = sys.argv[1] - outcsjpath = sys.argv[2] - - apath = os.path.join(csjpath, 'XML/BaseXML/core') - apath2 = os.path.join(csjpath, 'XML/BaseXML/noncore') - - outapath = os.path.join(outcsjpath, 'xml') - # create the "outapath" dir: - if not os.path.exists(outapath): - os.mkdir(outapath) - - # range over the following two folders: - procfolder(apath, outapath) - procfolder(apath2, outapath) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/csj_tools/wn.1.split_wav.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/csj_tools/wn.1.split_wav.py deleted file mode 100644 index ccdf04e9b5168337fd06509e2999afad57de2904..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/csj_tools/wn.1.split_wav.py +++ /dev/null @@ -1,123 +0,0 @@ -# based on xml.simp -> start_time and end_time -> split using sox - -import os -import sys -import multiprocessing - -import librosa -import soundfile as sf - -# use .simp as the source for .wav file splitting -def wavfn(apath): - wavdict = dict() # key=id, value=full.path of .wav - for awavfn in os.listdir(apath): - fullwavpath = os.path.join(apath, awavfn) - aid = awavfn.replace('.wav', '') - wavdict[aid] = fullwavpath - return wavdict - -def xmlfn(apath): - xmldict = dict() # key=id, 
value=full.path of .xml.simp - for axmlfn in os.listdir(apath): - if not axmlfn.endswith('.xml.simp'): - continue - axmlfn2 = os.path.join(apath, axmlfn) - aid = axmlfn.replace('.xml.simp', '') - # print('obtain id: {}\t{}'.format(axmlfn, aid)) - xmldict[aid] = axmlfn2 - return xmldict - -def ch2to1(f1, outf1): - wav1, _ = librosa.load(f1, sr=16000, mono=False) - if wav1.ndim == 1: - return - wav1mono = librosa.to_mono(wav1) - sf.write(outf1, wav1mono, 16000) - # print('2ch to 1ch, {} -> {}'.format(f1, outf1)) - acmd = 'mv {} {}'.format(outf1, f1) - res = os.system(acmd) - # rename the .1ch file back to the .wav file and - # overwrite the old .wav file which is 2ch - # print(res, acmd) - -def proc1file(fullxmlfn, fullwavfn, outwavpath): - with open(fullxmlfn) as xmlbr: - for axmlline in xmlbr.readlines(): - # start.time end.time ortho plainortho phonetic - axmlline = axmlline.strip() - cols = axmlline.split('\t') - stime = cols[0] - etime = cols[1] - - if len(cols) == 2: - continue # skip - - basename = fullwavfn.split('/')[-1] - - name2 = '{}_{}_{}.wav'.format(basename, stime, etime) - partwavfn = os.path.join(outwavpath, name2) - - dur = float(etime) - float(stime) - acmd = 'sox {} {} trim {} {}'.format(fullwavfn, partwavfn, stime, dur) - res = os.system(acmd) - # print(res, acmd) - - # perform 2ch to 1ch if necessary! - partwavfn1ch = partwavfn + ".1ch.wav" # NOTE must ends with '.wav'! - # otherwise, soundfile.write will give us error report! - ch2to1(partwavfn, partwavfn1ch) - -def procpath(atag, csjpath, xmlsimppath, outwavpath, idset): - # atag = 'core' and 'noncore' - axmlpath = xmlsimppath - awavpath = os.path.join(csjpath, atag) - - xmldict = xmlfn(axmlpath) - wavdict = wavfn(awavpath) - - wavidlist = list(wavdict.keys()) - - # parallel processing - nthreads = 16 - for i in range(0, len(wavidlist), nthreads): - pool = multiprocessing.Pool(processes=nthreads) - for j in range(nthreads): - if i + j < len(wavidlist): - wavid = wavidlist[i + j] - if len(idset) > 0 and wavid not in idset: - # when idset is not empty, then only process the ids - # that are included in idset: - continue - - fullwavfn = wavdict[wavid] - if wavid in xmldict: - fullxmlfn = xmldict[wavid] - pool.apply_async(proc1file, (fullxmlfn, fullwavfn, outwavpath)) - pool.close() - pool.join() - - print('parallel {} threads done for {} files.'.format( - nthreads, - len(wavidlist))) - -if __name__ == '__main__': - if len(sys.argv) < 4: - print( - "Usage: {}".format(sys.argv[0]) + - " [id.list.fn]") - exit(1) - - csjpath = sys.argv[1] - xmlsimppath = sys.argv[2] - outwavpath = sys.argv[3] - idlistfn = sys.argv[4] if len(sys.argv) == 5 else "" - idset = set() - if len(idlistfn) > 0: - with open(idlistfn) as br: - for aline in br.readlines(): - aline = aline.strip() - idset.add(aline) - print(idset) - - for atag in ['core', 'noncore']: - procpath(atag, csjpath, xmlsimppath, outwavpath, idset) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/csj_tools/wn.2.prep.text.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/csj_tools/wn.2.prep.text.py deleted file mode 100644 index 2b132ad9d6155eb48804e918d2fa77996e129c42..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/csj_tools/wn.2.prep.text.py +++ /dev/null @@ -1,154 +0,0 @@ -import os -import sys - -# train test1 test2 test3 - -def readtst(tstfn): - outlist = list() - with open(tstfn) as br: - for aline in br.readlines(): - aline = aline.strip() - 
outlist.append(aline) - return outlist - -def split_train_tests_xml(xmlpath, test1fn, test2fn, test3fn): - test1list = readtst(test1fn) - test2list = readtst(test2fn) - test3list = readtst(test3fn) - - outtrainlist = list() # full path ".xml.simp" files - outt1list = list() # test 1, full path ".xml.simp" files - outt2list = list() - outt3list = list() - - for afile in os.listdir(xmlpath): - if not afile.endswith('.xml.simp'): - continue - afile2 = xmlpath + '/' + afile - aid = afile.split('.')[0] - if aid in test1list: - outt1list.append(afile2) - elif aid in test2list: - outt2list.append(afile2) - elif aid in test3list: - outt3list.append(afile2) - else: - outtrainlist.append(afile2) - - return outtrainlist, outt1list, outt2list, outt3list - -def all_wavs(wavpath): - wavlist = list() - for afile in os.listdir(wavpath): - if not afile.endswith('.wav'): - continue - afile2 = wavpath + '/' + afile - wavlist.append(afile2) - return wavlist - -def gen_text(xmllist, outpath): - # id \t text - # e.g., /workspace/asr/wenet/examples/csj/s0/data/xml/S11M1689.xml.simp - # ID = S11M1689_stime_etime - outtxtfn = os.path.join(outpath, 'text') - with open(outtxtfn, 'w') as bw: - for xmlfn in xmllist: - aid = xmlfn.split('/')[-1] - aid2 = aid.split('.')[0] - - with open(xmlfn) as br: - for aline in br.readlines(): - aline = aline.strip() - # stime \t etime \t text1 \t text2 \t text3 \t text4 \t text5 - cols = aline.split('\t') - # TODO different between "< 7" and "< 4"? strange - # -> use "< 4", DO NOT use "< 7" ! - if len(cols) < 4: - continue - - stime = cols[0] - etime = cols[1] - atxt = cols[3].replace(' ', '') - - afullid = '{}_{}_{}'.format(aid2, stime, etime) - aoutline = '{}\t{}\n'.format(afullid, atxt) - bw.write(aoutline) - -def parse_xml_set(xmllist): - outset = set() - for xml in xmllist: - aid = xml.split('/')[-1] - aid2 = aid.split('.')[0] - outset.add(aid2) - return outset - -def gen_wav_scp(xmllist, wavlist, outpath): - # xmlset = pure id set, alike 'S04F1228' - # can be from train, test1, test2, or test3 - xmlset = parse_xml_set(xmllist) - - outwavscpfn = os.path.join(outpath, 'wav.scp') - with open(outwavscpfn, 'w') as bw: - for wav in wavlist: - # wav is alike "/workspace/asr/wenet/examples/csj/s0/data - # /wav/S04F1228.wav_00458.875_00459.209.wav" - aid = wav.split('/')[-1] - cols = aid.split('_') - - aid2 = cols[0].split('.')[0] - if aid2 not in xmlset: - continue - - stime = cols[1] - etime = cols[2].replace('.wav', '') - - afullid = '{}_{}_{}'.format(aid2, stime, etime) - - wavabspath = os.path.abspath(wav) - aoutline = '{}\t{}\n'.format(afullid, wavabspath) - bw.write(aoutline) - - -def prep_text_wavscp( - xmlpath, wavpath, test1fn, test2fn, test3fn, - outtrainpath, out1path, out2path, out3path): - - trainlist, t1list, t2list, t3list = split_train_tests_xml( - xmlpath, - test1fn, - test2fn, - test3fn) - wavlist = all_wavs(wavpath) - - gen_text(trainlist, outtrainpath) - gen_text(t1list, out1path) - gen_text(t2list, out2path) - gen_text(t3list, out3path) - - gen_wav_scp(trainlist, wavlist, outtrainpath) - gen_wav_scp(t1list, wavlist, out1path) - gen_wav_scp(t2list, wavlist, out2path) - gen_wav_scp(t3list, wavlist, out3path) - -if __name__ == '__main__': - if len(sys.argv) < 10: - print( - "Usage: {}".format(sys.argv[0]) + " " + - " " + - " ") - exit(1) - - xmlpath = sys.argv[1] - wavpath = sys.argv[2] - test1fn = sys.argv[3] - test2fn = sys.argv[4] - test3fn = sys.argv[5] - - outtrainpath = sys.argv[6] - out1path = sys.argv[7] - out2path = sys.argv[8] - out3path = sys.argv[9] - - 
prep_text_wavscp(xmlpath, wavpath, test1fn, - test2fn, test3fn, outtrainpath, - out1path, out2path, out3path) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/csj_tools/wn.3.mincut.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/csj_tools/wn.3.mincut.py deleted file mode 100644 index 39e8b8659f722ac430e74c297be67ccf9dd4e818..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/csj_tools/wn.3.mincut.py +++ /dev/null @@ -1,27 +0,0 @@ -import librosa -# import os -import sys - -def mincut(wavscpfn, minsec): - outfn = wavscpfn + "_" + str(minsec) - - with open(outfn, 'w') as bw: - with open(wavscpfn) as br: - for aline in br.readlines(): - aline = aline.strip() - afn = aline.split('\t')[1] - # print(afn) - dur = librosa.get_duration(filename=afn) - if dur >= minsec: - bw.write(aline + '\n') - -# wn.3.mincut.py -if __name__ == '__main__': - if len(sys.argv) < 3: - print('{} '.format(sys.argv[0])) - exit() - - wavscpfn = sys.argv[1] - minsec = float(sys.argv[2]) - - mincut(wavscpfn, minsec) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/csj_tools/wn.4.make_raw_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/csj_tools/wn.4.make_raw_list.py deleted file mode 100644 index eb5aac28b0985e60b3597fba8b8bef11a0ec9614..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/csj_tools/wn.4.make_raw_list.py +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
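The make_raw_list script removed here turns a Kaldi-style wav.scp plus text file (and, optionally, a segments file) into the JSON-lines data.list that WeNet reads. A hedged sketch of the record it writes for a single utterance follows; the key, path, and transcript are invented for illustration and only mirror the id_start_end naming used by the preparation scripts above.

```python
import json

# Hypothetical entries from wav.scp ("key path") and text ("key transcript").
key = "S11M1689_0001.230_0003.450"
wav = "/data/csj/wav/S11M1689.wav_0001.230_0003.450.wav"
txt = "良い天気ですね"

record = dict(key=key, wav=wav, txt=txt)
print(json.dumps(record, ensure_ascii=False))
# {"key": "S11M1689_0001.230_0003.450", "wav": "/data/csj/...", "txt": "良い天気ですね"}
```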
- -import argparse -import json - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('output_file', help='output list file') - args = parser.parse_args() - - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - if args.segments is not None: - segments_table = {} - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - with open(args.text_file, 'r', encoding='utf8') as fin, \ - open(args.output_file, 'w', encoding='utf8') as fout: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if args.segments is None: - # assert key in wav_table - if key in wav_table: - wav = wav_table[key] - line = dict(key=key, wav=wav, txt=txt) - else: - line = None - else: - # assert key in segments_table - if key in segments_table: - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - line = dict(key=key, wav=wav, txt=txt, start=start, end=end) - else: - line = None - if line: - json_line = json.dumps(line, ensure_ascii=False) - fout.write(json_line + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/list_files/2ch.id.list b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/list_files/2ch.id.list deleted file mode 100644 index a516ada0503ffcc1203fc6359dbb131854d32e27..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/list_files/2ch.id.list +++ /dev/null @@ -1,58 +0,0 @@ -D01F0002 -D01F0003 -D01F0023 -D01F0030 -D01F0046 -D01F0049 -D01F0055 -D01F0057 -D01M0005 -D01M0009 -D01M0012 -D01M0019 -D01M0020 -D01M0042 -D01M0043 -D01M0047 -D02F0015 -D02F0018 -D02F0025 -D02F0027 -D02F0031 -D02F0032 -D02F0033 -D02F0054 -D02M0014 -D02M0016 -D02M0024 -D02M0026 -D02M0028 -D02M0035 -D02M0039 -D02M0051 -D03F0001 -D03F0006 -D03F0008 -D03F0034 -D03F0036 -D03F0040 -D03F0045 -D03F0058 -D03M0004 -D03M0007 -D03M0013 -D03M0017 -D03M0037 -D03M0038 -D03M0048 -D03M0053 -D04F0011 -D04F0022 -D04F0029 -D04F0044 -D04F0050 -D04M0010 -D04M0021 -D04M0041 -D04M0052 -D04M0056 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/list_files/test.set.1.list b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/list_files/test.set.1.list deleted file mode 100644 index b3293661df4df5b25580fcf01c53dc52b239c369..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/list_files/test.set.1.list +++ /dev/null @@ -1,11 +0,0 @@ -A01M0097 -A04M0051 -A04M0121 -A03M0156 -A03M0112 -A01M0110 -A05M0011 -A03M0106 -A01M0137 -A04M0123 - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/list_files/test.set.123.list b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/list_files/test.set.123.list deleted file mode 100644 index cedf88f84006a083327a7b0d755c96770f1a11b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/list_files/test.set.123.list +++ /dev/null @@ -1,33 +0,0 
@@ -A01M0097 -A04M0051 -A04M0121 -A03M0156 -A03M0112 -A01M0110 -A05M0011 -A03M0106 -A01M0137 -A04M0123 - -A01F0063 -A01M0056 -A06F0135 -A02M0012 -A06M0064 -A01M0141 -A01F0034 -A03M0016 -A03F0072 -A01F0001 - -S00F0066 -S00M0213 -S00M0070 -S00M0008 -S01F0105 -S00F0148 -S00F0019 -S00M0112 -S00F0152 -S00M0079 - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/list_files/test.set.2.list b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/list_files/test.set.2.list deleted file mode 100644 index 7fd7de849da81debd842da04093584adab69d491..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/list_files/test.set.2.list +++ /dev/null @@ -1,11 +0,0 @@ -A01F0063 -A01M0056 -A06F0135 -A02M0012 -A06M0064 -A01M0141 -A01F0034 -A03M0016 -A03F0072 -A01F0001 - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/list_files/test.set.3.list b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/list_files/test.set.3.list deleted file mode 100644 index c1fec392e41f167eda9d63fb2bbe4a16af17389d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/list_files/test.set.3.list +++ /dev/null @@ -1,11 +0,0 @@ -S00F0066 -S00M0213 -S00M0070 -S00M0008 -S01F0105 -S00F0148 -S00F0019 -S00M0112 -S00F0152 -S00M0079 - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/path.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/path.sh deleted file mode 100644 index 73fc1c56602086182f66201870e28d46a0cada55..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export WENET_DIR=$PWD/../../.. -export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build -export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix -export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/run.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/run.sh deleted file mode 100644 index 39c91e41066348da0d89b6392ea57596327f29a4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/run.sh +++ /dev/null @@ -1,278 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. - -. ./path.sh || exit 1; - -# Use this to control how many gpu you use, It's 1-gpu training if you specify -# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" - -# 1. xml split by sentences -# 2. wav split by xml.simp's guidance -# 3. generate "text" and "wav.scp" files as required by wenet -# 4. compute cmvn, better wav.len >= 0.1s, otherwise bug happens... -# 5. sentence piece's bpe vocabulary -# 6. make "data.list" files -# 7. 
train -> 50 epochs - -stage=1 # train -> 50 epochs -stop_stage=8 # - -# data -#data_url=www.openslr.org/resources/12 -# TODO use your own data path -datadir=/workspace/asr/csj - -# output wav data dir -wave_data=data # wave file path -# Optional train_config -train_config=conf/train_conformer.yaml -checkpoint= -cmvn=true # cmvn is for mean, variance, frame_number statistics -do_delta=false # not used... - -dir=exp/sp_spec_aug # model's dir (output dir) - -# use average_checkpoint will get better result -average_checkpoint=true -decode_checkpoint=$dir/final.pt -# maybe you can try to adjust it if you can not get close results as README.md -average_num=10 -decode_modes="attention_rescoring ctc_greedy_search ctc_prefix_beam_search attention" - -. tools/parse_options.sh || exit 1; - -# bpemode (unigram or bpe) -nbpe=4096 # TODO -> you can change this value to 5000, 100000 and so on -bpemode=bpe #unigram # TODO -> you can use unigram and other methods - -set -e # if any line's exex result is not true, bash stops -set -u # show the error line when stops (failed) -set -o pipefail # return value of the whole bash = final line executed's result - -train_set=train -dev_set=dev -recog_set="test1 test2 test3" - -### CSJ data is not free! -# buying URL: https://ccd.ninjal.ac.jp/csj/en/ - -### data preparing - split xml by sentences ### -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - ### I did not check espnet nor kaldi for the pre-processing, - ### I developed my own ways. so, use at your own risks. - echo "stage 1: Data preparation -> xml preprocessing " - echo " -> extract [start.time, end.time, text] from raw xml files" - python ./csj_tools/wn.0.parse.py $datadir ${wave_data} -fi - -in_wav_path=$datadir/WAV -xml_simp_path=${wave_data}/xml -#wav_split_path=${wave_data}/wav.2 -wav_split_path=${wave_data}/wav -mkdir -p ${wav_split_path} - -### data preparing - split wav by xml.simp's guidance ### -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - echo "stage 2: Data preparation -> wav preprocessing " - echo " -> split wav file by xml.simp's [start.time, end.time, text] format" - # in addition, 2ch to 1ch! - - python ./csj_tools/wn.1.split_wav.py ${in_wav_path} ${xml_simp_path} ${wav_split_path} -fi - -### data preparing - generate "text" and "wav.scp" files ### -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - echo "stage 3: prepare text and wav.scp for train/test1/test2/test3 from wav and xml folders" - - t1fn='list_files/test.set.1.list' - t2fn='list_files/test.set.2.list' - t3fn='list_files/test.set.3.list' - - outtrain=${wave_data}/train - outt1=${wave_data}/test1 - outt2=${wave_data}/test2 - outt3=${wave_data}/test3 - - mkdir -p $outtrain - mkdir -p $outt1 - mkdir -p $outt2 - mkdir -p $outt3 - - python ./csj_tools/wn.2.prep.text.py \ - ${xml_simp_path} ${wav_split_path} \ - $t1fn $t2fn $t3fn \ - $outtrain $outt1 $outt2 $outt3 -fi - -minsec=0.1 - -### compute static info: mean, variance, frame_num ### -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - echo "stage 4: Feature Generation" - # TODO if failed, then please make sure your wav files are all >= 0.1s ... 
- - mkdir -p $wave_data/dev - # merge total dev data - for set in test1 test2 test3; do - for f in `ls $wave_data/$set`; do - cat $wave_data/$set/$f >> $wave_data/$dev_set/$f - done - done - - python ./csj_tools/wn.3.mincut.py $wave_data/$train_set/wav.scp $minsec - - tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \ - --in_scp $wave_data/$train_set/wav.scp_$minsec \ - --out_cmvn $wave_data/$train_set/global_cmvn -fi - -### use sentence piece to construct subword vocabulary ### -dict=$wave_data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt -bpemodel=$wave_data/lang_char/${train_set}_${bpemode}${nbpe} -echo "dictionary: ${dict}" -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - ### Task dependent. You have to check non-linguistic symbols used in the corpus. - echo "stage 5: Dictionary and Json Data Preparation" - mkdir -p data/lang_char/ - - echo " 0" > ${dict} # 0 will be used for "blank" in CTC - echo " 1" >> ${dict} # must be 1 - - # we borrowed these code and scripts which are related bpe from ESPnet. - cut -f 2- -d" " $wave_data/${train_set}/text > $wave_data/lang_char/input.txt - tools/spm_train \ - --input=$wave_data/lang_char/input.txt \ - --vocab_size=${nbpe} \ - --model_type=${bpemode} \ - --model_prefix=${bpemodel} \ - --input_sentence_size=100000000 - - tools/spm_encode \ - --model=${bpemodel}.model \ - --output_format=piece < $wave_data/lang_char/input.txt | \ - tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict} - num_token=$(cat $dict | wc -l) - echo " $num_token" >> $dict # - wc -l ${dict} -fi - - -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # Prepare wenet required data - echo "Prepare data, prepare required format" - for x in $train_set ; do - python csj_tools/wn.4.make_raw_list.py $wave_data/$x/wav.scp_$minsec $wave_data/$x/text \ - $wave_data/$x/data.list - done - for x in $dev_set ${recog_set} ; do - python csj_tools/wn.4.make_raw_list.py $wave_data/$x/wav.scp $wave_data/$x/text \ - $wave_data/$x/data.list - done -fi - -### Training! 
### - -if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then - # Training - mkdir -p $dir - INIT_FILE=$dir/ddp_init - rm -f $INIT_FILE # delete old one before starting - init_method=file://$(readlink -f $INIT_FILE) - echo "$0: init method is $init_method" - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - # Use "nccl" if it works, otherwise use "gloo" - dist_backend="gloo" - cmvn_opts= - $cmvn && cmvn_opts="--cmvn $wave_data/${train_set}/global_cmvn" - # train.py will write $train_config to $dir/train.yaml with model input - # and output dimension, train.yaml will be used for inference or model - # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type raw \ - --symbol_table $dict \ - --train_data $wave_data/$train_set/data.list \ - --cv_data $wave_data/$dev_set/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $num_gpus \ - --ddp.rank $i \ - --ddp.dist_backend $dist_backend \ - --num_workers 1 \ - $cmvn_opts \ - --pin_memory - } & - done - wait -fi - -### test model ### - -if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then - # Test model, please specify the model you want to test by --checkpoint - cmvn_opts= - $cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn" - mkdir -p $dir/test - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$dir/avg_${average_num}.pt - echo "do model average and final checkpoint is $decode_checkpoint" - python wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $dir \ - --num ${average_num} \ - --val_best - fi - # Specify decoding_chunk_size if it's a unified dynamic chunk trained model - # -1 for full chunk - decoding_chunk_size=-1 - ctc_weight=0.5 - # Polling GPU id begin with index 0 - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - idx=0 - for test in $recog_set; do - for mode in ${decode_modes}; do - { - { - test_dir=$dir/${test}_${mode} - mkdir -p $test_dir - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1]) - python wenet/bin/recognize.py --gpu $gpu_id \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type raw \ - --test_data $wave_data/$test/data.list \ - --checkpoint $decode_checkpoint \ - --beam_size 10 \ - --batch_size 1 \ - --penalty 0.0 \ - --dict $dict \ - --result_file $test_dir/text_bpe \ - --ctc_weight $ctc_weight \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - - cut -f2- -d " " $test_dir/text_bpe > $test_dir/text_bpe_value_tmp - cut -f1 -d " " $test_dir/text_bpe > $test_dir/text_bpe_key_tmp - tools/spm_decode --model=${bpemodel}.model --input_format=piece \ - < $test_dir/text_bpe_value_tmp | sed -e "s/▁/ /g" > $test_dir/text_value_tmp - paste -d " " $test_dir/text_bpe_key_tmp $test_dir/text_value_tmp > $test_dir/text - - python tools/compute-wer.py --char=1 --v=1 \ - $wave_data/$test/text $test_dir/text > $test_dir/wer - } & - - ((idx+=1)) - if [ $idx -eq $num_gpus ]; then - idx=0 - fi - } - done - done - wait -fi - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/alignment.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/alignment.sh deleted file mode 100644 index 64d860bb61761cadca750c9baf91eddb49e56728..0000000000000000000000000000000000000000 --- 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/alignment.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -stage=0 # start from 0 if you need to start from data preparation -stop_stage=0 - -nj=16 -feat_dir=raw_wav -dict=data/dict/lang_char.txt - -dir=exp/ -config=$dir/train.yaml -checkpoint= -checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt -config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml -set= -ali_format=$feat_dir/$set/format.data -ali_format=format.data -ali_result=$dir/ali - -. tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - nj=32 - # Prepare required data for ctc alignment - echo "Prepare data, prepare required format" - for x in $set; do - tools/format_data.sh --nj ${nj} \ - --feat-type wav --feat $feat_dir/$x/wav.scp \ - $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp - - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Test model, please specify the model you want to use by --checkpoint - python wenet/bin/alignment_deprecated.py --gpu -1 \ - --config $config \ - --input_file $ali_format \ - --checkpoint $checkpoint \ - --batch_size 1 \ - --dict $dict \ - --result_file $ali_result \ - -fi - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/analyze_dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/analyze_dataset.py deleted file mode 100644 index d4373b065c301972fe0164b6df3591166000acfc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/analyze_dataset.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Analyze Dataset, Duration/TextLength/Speed etc. - -Usage: -. 
./path.sh && python3 tools/analyze_dataset.py \ - --data_type "shard" \ - --data_list data/test/data.list \ - --output_dir exp/analyze_test \ - --num_thread 32 -""" - -import os -import json -import math -import time -import numpy -import logging -import librosa -import tarfile -import argparse -import torchaudio -import multiprocessing - -from wenet.utils.file_utils import read_lists -from wenet.dataset.processor import AUDIO_FORMAT_SETS - - -def get_args(): - parser = argparse.ArgumentParser(description='Analyze dataset') - parser.add_argument('--data_type', - default='wav_scp', - choices=['wav_scp', 'raw', 'shard'], - help='dataset type') - parser.add_argument('--output_dir', type=str, - default="exp", help='write info to output dir') - parser.add_argument('--data_list', default=None, - help='used in raw/shard mode') - parser.add_argument('--wav_scp', default=None, - help='used in wav_scp mode') - parser.add_argument('--text', default=None, - help='used in wav_scp mode') - parser.add_argument('--num_thread', type=int, - default=4, help='number of threads') - args = parser.parse_args() - print(args) - return args - - -def analyze(datas, output_file, thread_id): - with open(output_file, "w", encoding='utf8') as f: - for i, data in enumerate(datas): - if type(data['wav']) is numpy.ndarray: - y, sample_rate = data['wav'], data['sample_rate'] - data['wav'] = "None" # NOTE(xcsong): Do not save wav. - elif type(data['wav'] is str): - y, sample_rate = librosa.load(data['wav'], sr=16000) - data['dur'] = len(y) / sample_rate - data['txt_length'] = len(data['txt']) - data['speed'] = data['txt_length'] / data['dur'] - # Trim the beginning and ending silence - _, index = librosa.effects.trim(y, top_db=30) - data['leading_sil'] = librosa.get_duration( - y=y[:index[0]], sr=16000) * 1000 if index[0] > 0 else 0 - data['trailing_sil'] = librosa.get_duration( - y=y[index[1]:], sr=16000) * 1000 if index[1] < len(y) else 0 - data_str = json.dumps(data, ensure_ascii=False) - f.write("{}\n".format(data_str)) - if thread_id == 0 and i % 100 == 0: - logging.info("\tThread-{}: processed {}/{}".format( - thread_id, i, len(datas))) - - -def read_tar(file): - try: - with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream: - prev_prefix = None - data = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - data['key'] = prev_prefix - if valid: - yield data - data = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - data['txt'] = file_obj.read().decode( - 'utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load( - file_obj) - # single channel - data['wav'] = waveform.numpy()[0, :] - data['sample_rate'] = sample_rate - else: - data[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning( - 'error: {} when parse {}'.format(ex, name)) - prev_prefix = prefix - # The last data in tar - if prev_prefix is not None: - data['key'] = prev_prefix - yield data - except Exception as ex: - logging.warning( - 'tar_file error: {} when processing {}'.format(ex, file)) - - -def main(): - start_time = time.time() - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.makedirs(args.output_dir, exist_ok=True) - os.makedirs(args.output_dir + "/partition", exist_ok=True) - datas = [[] for i in 
range(args.num_thread)] - - logging.info("Stage-1: Loading data.list OR wav.scp...") - if args.data_type == "shard": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - total = 0 - for line in lists: - for data in read_tar(line): - datas[total % args.num_thread].append(data) - total = total + 1 - elif args.data_type == "raw": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - for i, line in enumerate(lists): - data = json.loads(line) - datas[i % args.num_thread].append(data) - elif args.data_type == "wav_scp": - assert args.wav_scp is not None - assert args.text is not None - wavs, texts = {}, {} - # wavs - for line in read_lists(args.wav_scp): - line = line.strip().split() - wavs[line[0]] = line[1] - # texts - for line in read_lists(args.text): - line = line.strip().split(maxsplit=1) - texts[line[0]] = line[1] - sorted(wavs) - sorted(texts) - # partition - for i, (key1, key2) in enumerate(zip(wavs, texts)): - assert key1 == key2 - datas[i % args.num_thread].append( - {'key': key1, "wav": wavs[key1], "txt": texts[key1]} - ) - - logging.info("Stage-2: Start Analyze") - # threads - pool = multiprocessing.Pool(processes=args.num_thread) - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - pool.apply_async(analyze, (datas[i], output_file, i)) - pool.close() - pool.join() - - logging.info("Stage-3: Sort and Write Result") - datas = [] - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - with open(output_file, "r", encoding='utf8') as f: - for line in f.readlines(): - data = json.loads(line) - datas.append(data) - total_dur = sum([x['dur'] for x in datas]) - total_len = sum([x['txt_length'] for x in datas]) - total_leading_sil = sum([x['leading_sil'] for x in datas]) - total_trailing_sil = sum([x['trailing_sil'] for x in datas]) - num_datas = len(datas) - names = ['key', 'dur', 'txt_length', 'speed', - 'leading_sil', 'trailing_sil'] - units = ['', 's', '', 'char/s', 'ms', 'ms'] - avgs = [0, total_dur / num_datas, total_len / num_datas, - total_len / total_dur, total_leading_sil / num_datas, - total_trailing_sil / num_datas] - stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]), - sum([(x['txt_length'] - avgs[2])**2 for x in datas]), - sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]), - sum([(x['leading_sil'] - avgs[4])**2 for x in datas]), - sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])] - stds = [math.sqrt(x / num_datas) for x in stds] - parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min'] - index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - - with open(args.output_dir + "/analyze_result_brief", - "w", encoding='utf8') as f: - for i, (name, unit, avg, std) in enumerate( - zip(names, units, avgs, stds)): - if name == 'key': - continue - f.write("==================\n") - - datas.sort(key=lambda x: x[name]) - for p, j in zip(parts, index): - f.write("{} {}: {:.3f} {} (wav_id: {})\n".format( - p, name, datas[j][name], unit, datas[j]['key'])) - f.write("avg {}: {:.3f} {}\n".format( - name, avg, unit)) - f.write("std {}: {:.3f}\n".format( - name, std)) - os.system("cat {}".format(args.output_dir + "/analyze_result_brief")) - - datas.sort(key=lambda x: x['dur']) - with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f: - for data in datas: - f.write("{}\n".format(json.dumps(data, 
ensure_ascii=False))) - - end_time = time.time() - logging.info("Time Cost: {:.3f}s".format(end_time - start_time)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/cmvn_kaldi2json.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/cmvn_kaldi2json.py deleted file mode 100644 index 9966046c95a9d50438c4857b785cb7985182e376..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/cmvn_kaldi2json.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import logging -import sys -import json - -def kaldi2json(kaldi_cmvn_file): - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - cmvn_info = {'mean_stat:' : means, - 'var_stat' : variance, - 'frame_num' : count} - return cmvn_info - -if __name__ == '__main__': - with open(sys.argv[2], 'w') as fout: - cmvn = kaldi2json(sys.argv[1]) - fout.write(json.dumps(cmvn)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/combine_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/combine_data.sh deleted file mode 100644 index 8a56c43f1a2a238d78270f94f3d22f1af540e912..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/combine_data.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2014 David Snyder - -# This script combines the data from multiple source directories into -# a single destination directory. - -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information -# about what these directories contain. - -# Begin configuration section. -extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." -skip_fix=false # skip the fix_data_dir.sh in the end -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi - -if [ $# -lt 2 ]; then - echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." - echo "Note, files that don't appear in all source dirs will not be combined," - echo "with the exception of utt2uniq and segments, which are created where necessary." - exit 1 -fi - -dest=$1; -shift; - -first_src=$1; - -rm -r $dest 2>/dev/null -mkdir -p $dest; - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/utt2spk ]; then - echo "$0: no such file $dir/utt2spk" - exit 1; - fi -done - -# Check that frame_shift are compatible, where present together with features. -dir_with_frame_shift= -for dir in $*; do - if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then - if [[ $dir_with_frame_shift ]] && - ! 
cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then - echo "$0:error: different frame_shift in directories $dir and " \ - "$dir_with_frame_shift. Cannot combine features." - exit 1; - fi - dir_with_frame_shift=$dir - fi -done - -# W.r.t. utt2uniq file the script has different behavior compared to other files -# it is not compulsary for it to exist in src directories, but if it exists in -# even one it should exist in all. We will create the files where necessary -has_utt2uniq=false -for in_dir in $*; do - if [ -f $in_dir/utt2uniq ]; then - has_utt2uniq=true - break - fi -done - -if $has_utt2uniq; then - # we are going to create an utt2uniq file in the destdir - for in_dir in $*; do - if [ ! -f $in_dir/utt2uniq ]; then - # we assume that utt2uniq is a one to one mapping - cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' - else - cat $in_dir/utt2uniq - fi - done | sort -k1 > $dest/utt2uniq - echo "$0: combined utt2uniq" -else - echo "$0 [info]: not combining utt2uniq as it does not exist" -fi -# some of the old scripts might provide utt2uniq as an extrafile, so just remove it -extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") - -# segments are treated similarly to utt2uniq. If it exists in some, but not all -# src directories, then we generate segments where necessary. -has_segments=false -for in_dir in $*; do - if [ -f $in_dir/segments ]; then - has_segments=true - break - fi -done - -if $has_segments; then - for in_dir in $*; do - if [ ! -f $in_dir/segments ]; then - echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 - utils/data/get_segments_for_data.sh $in_dir - else - cat $in_dir/segments - fi - done | sort -k1 > $dest/segments - echo "$0: combined segments" -else - echo "$0 [info]: not combining segments as it does not exist" -fi - -for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do - exists_somewhere=false - absent_somewhere=false - for d in $*; do - if [ -f $d/$file ]; then - exists_somewhere=true - else - absent_somewhere=true - fi - done - - if ! $absent_somewhere; then - set -o pipefail - ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; - set +o pipefail - echo "$0: combined $file" - else - if ! $exists_somewhere; then - echo "$0 [info]: not combining $file as it does not exist" - else - echo "$0 [info]: **not combining $file as it does not exist everywhere**" - fi - fi -done - -tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt - -if [[ $dir_with_frame_shift ]]; then - cp $dir_with_frame_shift/frame_shift $dest -fi - -if ! 
$skip_fix ; then - tools/fix_data_dir.sh $dest || exit 1; -fi - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/compute-cer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/compute-cer.py deleted file mode 100644 index a0a8f8fe1f59251c5d8fefeb62ef469276fc6063..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/compute-cer.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import sys -import unicodedata -import codecs - -remove_tag = True -spacelist = [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - # https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. - sep = ' ' - if char == '<': - sep = '>' - j = i + 1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c == sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: - return '' - chars = [] - i = 0 - T = len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - if x.isalnum(): - for k in x: - new_sentence.append(k) - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i - 1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - 
min_error = error - dist = self.space[i][j - 1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, - 'ins': 0, 'del': 0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i={i} , j={j} , \ - error={error}'. 
- format(i=i, j=j, error=self.space[i][j]['error'])) - return result - - def overall(self) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def cluster(self, data) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [unicodedata.name(char) for char in word] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names) - 1) : - if unicode_names[i] != unicode_names[i + 1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) \ - and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] \ - [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \ - [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose = 1 - padding_symbol = ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose = 0 - try: - verbose = int(b) - except Exception: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol = ' ' - elif b == 'underline': - padding_symbol = '_' - continue - if True or sys.argv[1].startswith('-'): - # ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig = set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, - case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : 
- if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array) == 0: - continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length - len_lab) - space['rec'].append(length - len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end=' ') - else: - print('lab:', end=' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['lab'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end=' ') - else: - print('rec:', end=' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['rec'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===================================================' - '========================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster(k for k in default_clusters[cluster_id]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 
100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif (token[0] == '<' and token[len(token) - 1] == '>' and - cluster_id == ''): - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('=======================================' - '====================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/compute-wer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/compute-wer.py deleted file mode 100644 index a3eefc0dc7b67f252e685da71a5189312e74ef85..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/compute-wer.py +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import re, sys, unicodedata -import codecs - -remove_tag = True -spacelist= [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - #https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. - sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - 
for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) - -def width(string): - return 
sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . / - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if 
sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array)==0: continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - 
print('===========================================================================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster([ k for k in default_clusters[cluster_id] ]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif token[0] == '<' and token[len(token)-1] == '>' and \ - cluster_id == '' : - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... 
- else : - cluster.append(token) - print() - print('===========================================================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/compute_cmvn_stats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/compute_cmvn_stats.py deleted file mode 100644 index 9c89789c47be0c855939469e86040f10398e9d89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/compute_cmvn_stats.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys -import argparse -import json -import codecs -import yaml - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.utils.data import Dataset, DataLoader - -torchaudio.set_audio_backend("sox_io") - - -class CollateFunc(object): - ''' Collate function for AudioDataset - ''' - - def __init__(self, feat_dim, resample_rate): - self.feat_dim = feat_dim - self.resample_rate = resample_rate - pass - - def __call__(self, batch): - mean_stat = torch.zeros(self.feat_dim) - var_stat = torch.zeros(self.feat_dim) - number = 0 - for item in batch: - value = item[1].strip().split(",") - assert len(value) == 3 or len(value) == 1 - wav_path = value[0] - sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate - resample_rate = sample_rate - # len(value) == 3 means segmented wav.scp, - # len(value) == 1 means original wav.scp - if len(value) == 3: - start_frame = int(float(value[1]) * sample_rate) - end_frame = int(float(value[2]) * sample_rate) - waveform, sample_rate = torchaudio.backend.sox_io_backend.load( - filepath=wav_path, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(item[1]) - - waveform = waveform * (1 << 15) - if self.resample_rate != 0 and self.resample_rate != sample_rate: - resample_rate = self.resample_rate - waveform = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - - mat = kaldi.fbank(waveform, - num_mel_bins=self.feat_dim, - dither=0.0, - energy_floor=0.0, - sample_frequency=resample_rate) - mean_stat += torch.sum(mat, axis=0) - var_stat += torch.sum(torch.square(mat), axis=0) - number += mat.shape[0] - return number, mean_stat, var_stat - - -class AudioDataset(Dataset): - def __init__(self, data_file): - self.items = [] - with codecs.open(data_file, 'r', encoding='utf-8') as f: - for line in f: - arr = line.strip().split() - self.items.append((arr[0], arr[1])) - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - return self.items[idx] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='extract CMVN stats') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for processing') - parser.add_argument('--train_config', - default='', - help='training yaml conf') - parser.add_argument('--in_scp', default=None, help='wav scp file') - parser.add_argument('--out_cmvn', - default='global_cmvn', - help='global cmvn file') - - doc = "Print log after every log_interval audios are processed." 
- parser.add_argument("--log_interval", type=int, default=1000, help=doc) - args = parser.parse_args() - - with open(args.train_config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - resample_rate = 0 - if 'resample_conf' in configs['dataset_conf']: - resample_rate = configs['dataset_conf']['resample_conf']['resample_rate'] - print('using resample and new sample rate is {}'.format(resample_rate)) - - collate_func = CollateFunc(feat_dim, resample_rate) - dataset = AudioDataset(args.in_scp) - batch_size = 20 - data_loader = DataLoader(dataset, - batch_size=batch_size, - shuffle=True, - sampler=None, - num_workers=args.num_workers, - collate_fn=collate_func) - - with torch.no_grad(): - all_number = 0 - all_mean_stat = torch.zeros(feat_dim) - all_var_stat = torch.zeros(feat_dim) - wav_number = 0 - for i, batch in enumerate(data_loader): - number, mean_stat, var_stat = batch - all_mean_stat += mean_stat - all_var_stat += var_stat - all_number += number - wav_number += batch_size - - if wav_number % args.log_interval == 0: - print(f'processed {wav_number} wavs, {all_number} frames', - file=sys.stderr, - flush=True) - - cmvn_info = { - 'mean_stat': list(all_mean_stat.tolist()), - 'var_stat': list(all_var_stat.tolist()), - 'frame_num': all_number - } - - with open(args.out_cmvn, 'w') as fout: - fout.write(json.dumps(cmvn_info)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/compute_fbank_feats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/compute_fbank_feats.py deleted file mode 100644 index 4cc7dae54de6e8b24b14148bd3930d19b4d7b28c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/compute_fbank_feats.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging - -import torchaudio -import torchaudio.compliance.kaldi as kaldi - -import wenet.dataset.kaldi_io as kaldi_io - -# The "sox" backends are deprecated and will be removed in 0.9.0 release. 
-# So here we use sox_io backend -torchaudio.set_audio_backend("sox_io") - - -def parse_opts(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--num_mel_bins', - default=80, - type=int, - help='Number of triangular mel-frequency bins') - parser.add_argument('--frame_length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument('--frame_shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument('--dither', - type=int, - default=0.0, - help='Dithering constant (0.0 means no dither)') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_scp', help='wav scp file') - parser.add_argument('out_ark', help='output ark file') - parser.add_argument('out_scp', help='output scp file') - args = parser.parse_args() - return args - - -# wav format: -def load_wav_scp(wav_scp_file): - wav_list = [] - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_list.append((arr[0], arr[1])) - return wav_list - - -# wav format: -def load_wav_scp_dict(wav_scp_file): - wav_dict = {} - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_dict[arr[0]] = arr[1] - return wav_dict - - -# Segments format: -def load_wav_segments(wav_scp_file, segments_file): - wav_dict = load_wav_scp_dict(wav_scp_file) - audio_list = [] - with open(segments_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - key = arr[0] - wav_file = wav_dict[arr[1]] - start = float(arr[2]) - end = float(arr[3]) - audio_list.append((key, wav_file, start, end)) - return audio_list - - -if __name__ == '__main__': - args = parse_opts() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - if args.segments is None: - audio_list = load_wav_scp(args.wav_scp) - else: - audio_list = load_wav_segments(args.wav_scp, args.segments) - - count = 0 - with open(args.out_ark, 'wb') as ark_fout, \ - open(args.out_scp, 'w', encoding='utf8') as scp_fout: - for item in audio_list: - if len(item) == 2: - key, wav_path = item - waveform, sample_rate = torchaudio.load_wav(wav_path) - else: - assert len(item) == 4 - key, wav_path, start, end = item - sample_rate = torchaudio.info(wav_path).sample_rate - frame_offset = int(start * sample_rate) - num_frames = int((end - start) * sample_rate) - waveform, sample_rate = torchaudio.load_wav( - wav_path, frame_offset, num_frames) - - mat = kaldi.fbank(waveform, - num_mel_bins=args.num_mel_bins, - frame_length=args.frame_length, - frame_shift=args.frame_shift, - dither=args.dither, - energy_floor=0.0, - sample_frequency=sample_rate) - mat = mat.detach().numpy() - kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout) - count += 1 - if count % 10000 == 0: - logging.info('Progress {}/{}'.format(count, len(audio_list))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/copy_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/copy_data_dir.sh deleted file mode 100644 index ee880c4c3ca398a58a4e306467c639b0a76310bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/copy_data_dir.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 
Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# feats.scp -# wav.scp -# vad.scp -# spk2utt -# utt2spk -# text -# -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance and/or speaker names. Note, the recording-ids stay the same. -# - - -# begin configuration section -spk_prefix= -utt_prefix= -spk_suffix= -utt_suffix= -validate_opts= # should rarely be needed. -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" - echo "Options" - echo " --spk-prefix= # Prefix for speaker ids, default empty" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --spk-suffix= # Suffix for speaker ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -srcdir=$1 -destdir=$2 - -if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" - exit 1; -fi - -if [ "$destdir" == "$srcdir" ]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -set -e; - -mkdir -p $destdir - -cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map -cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map - -if [ ! -f $srcdir/utt2uniq ]; then - if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then - cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq - fi -else - cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq -fi - -cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ - utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk - -utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt - -if [ -f $srcdir/feats.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp -fi - -if [ -f $srcdir/vad.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp -fi - -if [ -f $srcdir/segments ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments - cp $srcdir/wav.scp $destdir -else # no segments->wav indexed by utt. 
- if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/utt2num_frames ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in frame_shift stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -echo $validate_opts -echo $destdir -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/decode.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/decode.sh deleted file mode 100644 index 1d49b0e48631f4818fb9c464df66904170275a33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/decode.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: binbinzhang@mobvoi.com (Binbin Zhang) -export GLOG_logtostderr=1 -export GLOG_v=2 - -set -e - -nj=1 -chunk_size=-1 -ctc_weight=0.0 -reverse_weight=0.0 -rescoring_weight=1.0 -# For CTC WFST based decoding -fst_path= -dict_path= -acoustic_scale=1.0 -beam=15.0 -lattice_beam=12.0 -min_active=200 -max_active=7000 -blank_skip_thresh=1.0 -length_penalty=0.0 - -. tools/parse_options.sh || exit 1; -if [ $# != 5 ]; then - echo "Usage: $0 [options] " - exit 1; -fi - -if ! which decoder_main > /dev/null; then - echo "decoder_main is not built, please go to runtime/libtorch to build it." - exit 1; -fi - -scp=$1 -label_file=$2 -model_file=$3 -unit_file=$4 -dir=$5 - -mkdir -p $dir/split${nj} - -# Step 1. Split wav.scp -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp" -done -tools/data/split_scp.pl ${scp} ${split_scps} - -# Step 2. Parallel decoding -wfst_decode_opts= -if [ ! 
-z $fst_path ]; then - wfst_decode_opts="--fst_path $fst_path" - wfst_decode_opts="$wfst_decode_opts --beam $beam" - wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path" - wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam" - wfst_decode_opts="$wfst_decode_opts --max_active $max_active" - wfst_decode_opts="$wfst_decode_opts --min_active $min_active" - wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale" - wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh" - wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty" - echo $wfst_decode_opts > $dir/config -fi -for n in $(seq ${nj}); do -{ - decoder_main \ - --rescoring_weight $rescoring_weight \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --chunk_size $chunk_size \ - --wav_scp ${dir}/split${nj}/wav.${n}.scp \ - --model_path $model_file \ - --unit_path $unit_file \ - $wfst_decode_opts \ - --result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log -} & -done -wait - -# Step 3. Merge files -for n in $(seq ${nj}); do - cat ${dir}/split${nj}/${n}.text -done > ${dir}/text -tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf - -# Step 4. Compute WER -python3 tools/compute-wer.py --char=1 --v=1 \ - $label_file $dir/text > $dir/wer diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/feat_to_shape.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/feat_to_shape.sh deleted file mode 100644 index ab6d45c60709dd05a38f8da269d617233d0d39f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/feat_to_shape.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Begin configuration section. -nj=4 -cmd=run.pl -verbose=0 -filetype="" -preprocess_conf="" -# End configuration section. - -help_message=$(cat << EOF -Usage: $0 [options] [] -e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) - -echo "$0 $*" 1>&2 # Print the command line for logging - -. parse_options.sh || exit 1; - -if [ $# -lt 2 ] || [ $# -gt 3 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -scp=$1 -outscp=$2 -data=$(dirname ${scp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${logdir}/feats.${n}.scp" -done - -utils/split_scp.pl ${scp} ${split_scps} - -if [ -n "${preprocess_conf}" ]; then - preprocess_opt="--preprocess-conf ${preprocess_conf}" -else - preprocess_opt="" -fi -if [ -n "${filetype}" ]; then - filetype_opt="--filetype ${filetype}" -else - filetype_opt="" -fi - -${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ - feat-to-len --verbose=${verbose} \ - scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp - -feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -) - -# concatenate the .scp files together. 
-for n in $(seq ${nj}); do - sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp -done > ${outscp} - -rm -f ${logdir}/feats.*.scp 2>/dev/null diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/filter_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/filter_scp.pl deleted file mode 100644 index b76d37f41be0886470281978bfacf97f6b8ae976..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/filter_scp.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose "n-th" field is an utterance id), printing -# out only those lines whose "n-th" field is in id_list. The index of -# the "n-th" field is 1, by default, but can be changed by using -# the -f switch - -$exclude = 0; -$field = 1; -$shifted = 0; - -do { - $shifted=0; - if ($ARGV[0] eq "--exclude") { - $exclude = 1; - shift @ARGV; - $shifted=1; - } - if ($ARGV[0] eq "-f") { - $field = $ARGV[1]; - shift @ARGV; shift @ARGV; - $shifted=1 - } -} while ($shifted); - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . - "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . - "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . - "only the lines that were *not* in id_list.\n" . - "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . - "If your older scripts (written before Oct 2014) stopped working and you used the\n" . - "-f option, add 1 to the argument.\n" . - "See also: utils/filter_scp.pl .\n"; -} - - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -if ($field == 1) { # Treat this as special case, since it is common. - while(<>) { - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { - print $_; - } - } -} else { - while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { - print $_; - } - } -} - -# tests: -# the following should print "foo 1" -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) -# the following should print "bar 2". 
-# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fix_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fix_data_dir.sh deleted file mode 100644 index d1644c1cac4264c78eae7d91b03c4126baf7ec4c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fix_data_dir.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# This script makes sure that only the segments present in -# all of "feats.scp", "wav.scp" [if present], segments [if present] -# text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into -# data-dir/.backup - -cmd="$@" - -utt_extra_files= -spk_extra_files= - -. tools/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: utils/data/fix_data_dir.sh " - echo "e.g.: utils/data/fix_data_dir.sh data/train" - echo "This script helps ensure that the various files in a data directory" - echo "are correctly sorted and filtered, for example removing utterances" - echo "that have no features (if feats.scp is present)" - exit 1 -fi - -data=$1 - -if [ -f $data/images.scp ]; then - image/fix_data_dir.sh $cmd - exit $? -fi - -mkdir -p $data/.backup - -[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; - -[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; - -set -e -o pipefail -u - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted { - file=$1 - sort -k1,1 -u <$file >$file.tmp - if ! cmp -s $file $file.tmp; then - echo "$0: file $1 is not in sorted order or not unique, sorting it" - mv $file.tmp $file - else - rm $file.tmp - fi -} - -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - check_sorted $data/$x - fi -done - - -function filter_file { - filter=$1 - file_to_filter=$2 - cp $file_to_filter ${file_to_filter}.tmp - tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter - if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=$(cat ${file_to_filter}.tmp | wc -l) - length2=$(cat ${file_to_filter} | wc -l) - if [ $length1 -ne $length2 ]; then - echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." - fi - fi - rm $file_to_filter.tmp -} - -function filter_recordings { - # We call this once before the stage when we filter on utterance-id, and once - # after. - - if [ -f $data/segments ]; then - # We have a segments file -> we need to filter this and the file wav.scp, and - # reco2file_and_utt, if it exists, to make sure they have the same list of - # recording-ids. - - if [ ! -f $data/wav.scp ]; then - echo "$0: $data/segments exists but not $data/wav.scp" - exit 1; - fi - awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=$(cat $tmpdir/recordings | wc -l) - [ ! -s $tmpdir/recordings ] && \ - echo "Empty list of recordings (bad file $data/segments)?" 
&& exit 1; - tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp - mv $tmpdir/recordings.tmp $tmpdir/recordings - - - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - filter_file $tmpdir/recordings $data/segments - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - rm $data/segments.tmp - - filter_file $tmpdir/recordings $data/wav.scp - [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur - true - fi -} - -function filter_speakers { - # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... - tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do - f=$data/$s - if [ -f $f ]; then - filter_file $f $tmpdir/speakers - fi - done - - filter_file $tmpdir/speakers $data/spk2utt - tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - - for s in cmvn.scp spk2gender $spk_extra_files; do - f=$data/$s - if [ -f $f ]; then - filter_file $tmpdir/speakers $f - fi - done -} - -function filter_utts { - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - echo "$(cat $tmpdir/utts | wc -l)" - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; - - ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; - - ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ - echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - - if [ -f $data/utt2uniq ]; then - ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ - echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; - fi - - maybe_wav= - maybe_reco2dur= - [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. - [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - - maybe_utt2dur= - if [ -f $data/utt2dur ]; then - cat $data/utt2dur | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1 - maybe_utt2dur=utt2dur.ok - fi - - maybe_utt2num_frames= - if [ -f $data/utt2num_frames ]; then - cat $data/utt2num_frames | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1 - maybe_utt2num_frames=utt2num_frames.ok - fi - - for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do - if [ -f $data/$x ]; then - tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp - echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)" - mv $tmpdir/utts.tmp $tmpdir/utts - # echo "$tmpdir/utts" - fi - done - rm $data/utt2dur.ok 2>/dev/null || true - rm $data/utt2num_frames.ok 2>/dev/null || true - - [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ - rm $tmpdir/utts && exit 1; - - - if [ -f $data/utt2spk ]; then - new_nutts=$(cat $tmpdir/utts | wc -l) - old_nutts=$(cat $data/utt2spk | wc -l) - if [ $new_nutts -ne $old_nutts ]; then - echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" - else - echo "fix_data_dir.sh: kept all $old_nutts utterances." 
- fi - fi - - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then - tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x - fi - fi - done - -} - -filter_recordings -filter_speakers -filter_utts -filter_speakers -filter_recordings - -tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - -echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/flake8_hook.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/flake8_hook.py deleted file mode 100644 index bbe21bf4aa8ab460aca0eba5a24785e4d6b2c39d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -import sys - -from flake8.main import git - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/format_data.sh deleted file mode 100644 index 51f4602dfa0bac7873541c7f621ef4bb9eb29c94..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/format_data.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Mobvoi Corporation (Author: Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -echo "$0 $*" >&2 # Print the command line for logging -. ./path.sh - -nj=1 -cmd=run.pl -nlsyms="" -lang="" -feat="" -feat_type="kaldi" -oov="" -bpecode="" -allow_one_column=false -raw="" -verbose=0 -trans_type=char -filetype="" -preprocess_conf="" -category="" -out="" # If omitted, write in stdout -help_message=$(cat << EOF -Usage: $0 -e.g. $0 data/train data/lang_1char/train_units.txt -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --feat # feat.scp or feat1.scp,feat2.scp,... - --feat-type # kaldi or wav - --oov # Default: - --out # If omitted, write in stdout - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) -. tools/parse_options.sh - -if [ $# != 2 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -dir=$1 -dic=$2 -tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) -#trap 'rm -rf ${tmpdir}' EXIT - -# 1. 
Create scp files for inputs -# These are not necessary for decoding mode, and make it as an option -input= -if [ -n "${feat}" ]; then - _feat_scps=$(echo "${feat}" | tr ',' ' ' ) - read -r -a feat_scps <<< $_feat_scps - num_feats=${#feat_scps[@]} - - for (( i=1; i<=num_feats; i++ )); do - feat=${feat_scps[$((i-1))]} - mkdir -p ${tmpdir}/input_${i} - input+="input_${i} " - cat ${feat} > ${tmpdir}/input_${i}/feat.scp - - # Dump in the "legacy" style JSON format - if [ -n "${filetype}" ]; then - awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ - > ${tmpdir}/input_${i}/filetype.scp - fi - - if [ ${feat_type} == "kaldi" ]; then - tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ - --filetype "${filetype}" \ - --preprocess-conf "${preprocess_conf}" \ - --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp - elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then - if [ -f $dir/segments ]; then - # used for segmented wav.scp - awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur - fi - if [ ! -f $dir/utt2dur ]; then - tools/wav_to_duration.sh --nj ${nj} \ - ${feat} ${tmpdir}/input_${i}/shape.scp - # use the existed utt2dur as shape.scp directly - else - cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp - fi - fi - done -fi - -# 2. Create scp files for outputs -mkdir -p ${tmpdir}/output -if [ -n "${bpecode}" ]; then - if [ "${trans_type}" == "cn_char_en_bpe" ]; then - tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp - else - paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \ - | tools/spm_encode --model=${bpecode} --output_format=piece) \ - > ${tmpdir}/output/token.scp - fi -elif [ -n "${nlsyms}" ]; then - tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -elif [ -n "${raw}" ]; then - cat $dir/text > ${tmpdir}/output/token.scp -else - tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -fi -< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp -odim=$(cat ${dic} | wc -l) -< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp - -cat ${dir}/text > ${tmpdir}/output/text.scp - -# 3. Create scp files for the others -mkdir -p ${tmpdir}/other -if [ -n "${lang}" ]; then - awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp -fi - -if [ -n "${category}" ]; then - awk -v category=${category} '{print $1 " " category}' ${dir}/text \ - > ${tmpdir}/other/category.scp -fi -#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp - -# 4. 
Merge scp files into a one file -opts="" -for intype in ${input} output other; do - if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then - continue - fi - - if [ ${intype} != other ]; then - opts+="--${intype%_*}-scps " - else - opts+="--scps " - fi - - for x in "${tmpdir}/${intype}"/*.scp; do - k=$(basename ${x} .scp) - if [ ${k} = shape ]; then - opts+="shape:${x}:shape " - else - opts+="${k}:${x} " - fi - done -done - -if ${allow_one_column}; then - opts+="--allow-one-column true " -else - opts+="--allow-one-column false " -fi - -if [ -n "${out}" ]; then - opts+="-O ${out}" -fi - -tools/merge_scp2txt.py --verbose ${verbose} ${opts} - -#rm -fr ${tmpdir} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/add_lex_disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/add_lex_disambig.pl deleted file mode 100644 index dd8a25de6e1140a6d19b1e876f2e76f528532edf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/add_lex_disambig.pl +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2013-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. -# With the --pron-probs option, expects the second field -# of each lexicon line to be a pron-prob. -# With the --sil-probs option, expects three additional -# fields after the pron-prob, representing various components -# of the silence probability model. - -$pron_probs = 0; -$sil_probs = 0; -$first_allowed_disambig = 1; - -for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { - if ($ARGV[0] eq "--pron-probs") { - $pron_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--sil-probs") { - $sil_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--first-allowed-disambig") { - $first_allowed_disambig = 0 + $ARGV[1]; - if ($first_allowed_disambig < 1) { - die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; - } - shift @ARGV; - shift @ARGV; - } -} - -if (@ARGV != 2) { - die "Usage: add_lex_disambig.pl [opts] \n" . - "This script adds disambiguation symbols to a lexicon in order to\n" . - "make decoding graphs determinizable; it adds pseudo-phone\n" . - "disambiguation symbols #1, #2 and so on at the ends of phones\n" . - "to ensure that all pronunciations are different, and that none\n" . - "is a prefix of another.\n" . - "It prints to the standard output the number of the largest-numbered" . - "disambiguation symbol that was used.\n" . - "\n" . - "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . 
- " --sil-probs [should be with --pron-probs option]\n" . - " Expect 3 extra fields after the pron-probs, for aspects of\n" . - " the silence probability model\n" . - " --first-allowed-disambig The number of the first disambiguation symbol\n" . - " that this script is allowed to add. By default this is\n" . - " #1, but you can set this to a larger value using this option.\n" . - "e.g.:\n" . - " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { - $p = shift @A; - if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } - } - if ($sil_probs) { - $silp = shift @A; - if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - } - if (!(@A)) { - die "Bad lexicon line $1, no phone in phone list"; - } - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that it exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { shift @A; } # remove pron-prob. - if ($sil_probs) { - shift @A; # Remove silprob - shift @A; # Remove silprob - } - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -# max_disambig will always be the highest-numbered disambiguation symbol that -# has been used so far. -$max_disambig = $first_allowed_disambig - 1; - -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - if ($pron_probs) { - $pron_prob = shift @A; - } - if ($sil_probs) { - $sil_word_prob = shift @A; - $word_sil_correction = shift @A; - $prev_nonsil_correction = shift @A - } - $phnseq = join(" ", @A); - if (!defined $issubseq{$phnseq} - && $count{$phnseq} == 1) { - ; # Do nothing. - } else { - if ($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved_for_the_empty_string{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; - if (!defined $cur_disambig) { - $cur_disambig = $first_allowed_disambig; - } else { - $cur_disambig++; # Get a number that has not been used yet for - # this phone sequence. - } - while (defined $reserved_for_the_empty_string{$cur_disambig}) { - $cur_disambig++; - } - if ($cur_disambig > $max_disambig) { - $max_disambig = $cur_disambig; - } - $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; - $phnseq = $phnseq . " #" . 
$cur_disambig; - } - } - if ($pron_probs) { - if ($sil_probs) { - print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; - } else { - print O "$word\t$pron_prob\t$phnseq\n"; - } - } else { - print O "$word\t$phnseq\n"; - } -} - -print $max_disambig . "\n"; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/compile_lexicon_token_fst.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/compile_lexicon_token_fst.sh deleted file mode 100644 index b67814fe3f3244b14b8e494bfe46c4829c4f8bd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/compile_lexicon_token_fst.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -# Copyright 2015 Yajie Miao (Carnegie Mellon University) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the -# phoneme and character-based lexicons. -set -eo pipefail -. tools/parse_options.sh - -if [ $# -ne 3 ]; then - echo "usage: tools/fst/compile_lexicon_token_fst.sh " - echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" - echo " should contain the following files:" - echo "lexicon.txt units.txt" - echo "options: " - exit 1; -fi - -srcdir=$1 -tmpdir=$2 -dir=$3 -mkdir -p $dir $tmpdir - -[ -f path.sh ] && . ./path.sh - -export LC_ALL=C - -cp $srcdir/units.txt $dir - -# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. -# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. -perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; - -# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. -# Without these symbols, determinization will fail. -ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` -ndisambig=$[$ndisambig+1]; - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list - -# Get the full list of CTC tokens used in FST. These tokens include , the blank , -# the actual model unit, and the disambiguation symbols. -cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list -(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt - -# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, -# so here use ctc_token_fst_compact -tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; - -# Encode the words with indices. Will be used in lexicon and language model FST compiling. 
-cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' - BEGIN { - print " 0"; - } - { - printf("%s %d\n", $1, NR); - } - END { - printf("#0 %d\n", NR+1); - printf(" %d\n", NR+2); - printf(" %d\n", NR+3); - }' > $dir/words.txt || exit 1; - -# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time. -token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` - -tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ - fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; - -echo "Lexicon and token FSTs compiling succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/ctc_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/ctc_token_fst.py deleted file mode 100644 index d81644b9cd216177a10a17772781d3293abe084f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/ctc_token_fst.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 1 ') -print('1 1 ') -print('2 2 ') -print('2 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 3 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(1, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 2, '', '')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/ctc_token_fst_compact.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/ctc_token_fst_compact.py deleted file mode 100644 index d3018d8b14ce25108cb1acc637cecded5d41be13..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/ctc_token_fst_compact.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 1 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 0, '', '')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/ctc_token_fst_corrected.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/ctc_token_fst_corrected.py deleted file mode 100644 index 81f7079eccb9e6447c46cdfdf6378aca7efe4a09..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/ctc_token_fst_corrected.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python - -import sys - - -def il(n): - return n + 1 - - -def ol(n): - return n + 1 - - -def s(n): - return n - - -if __name__ == "__main__": - with 
open(sys.argv[1]) as f: - lines = f.readlines() - phone_count = 0 - disambig_count = 0 - for line in lines: - sp = line.split() - phone = sp[0] - if phone == '' or phone == '': - continue - if phone.startswith('#'): - disambig_count += 1 - else: - phone_count += 1 - - # 1. add start state - print('0 0 {} 0'.format(il(0))) - - # 2. 0 -> i, i -> i, i -> 0 - for i in range(1, phone_count + 1): - print('0 {} {} {}'.format(s(i), il(i), ol(i))) - print('{} {} {} 0'.format(s(i), s(i), il(i))) - print('{} 0 {} 0'.format(s(i), il(0))) - - # 3. i -> other phone - for i in range(1, phone_count + 1): - for j in range(1, phone_count + 1): - if i != j: - print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) - - # 4. add disambiguous arcs on every final state - for i in range(0, phone_count + 1): - for j in range(phone_count + 2, phone_count + disambig_count + 2): - print('{} {} {} {}'.format(s(i), s(i), 0, j)) - - # 5. every i is final state - for i in range(0, phone_count + 1): - print(s(i)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/eps2disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/eps2disambig.pl deleted file mode 100644 index e1d84a6bf56703596a0e4552d184f7168f724bcb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/eps2disambig.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - if (/\s+#0\s+/) { - print STDERR "$0: ERROR: LM has word #0, " . - "which is reserved as disambiguation symbol\n"; - exit 1; - } - s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; - print; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/make_lexicon_fst.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/make_lexicon_fst.pl deleted file mode 100644 index f97129c05cb3ba6460be401e92001261acfaf746..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/make_lexicon_fst.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). - -$pron_probs = 0; - -if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { - $pron_probs = 1; - shift @ARGV; -} - -if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; - print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; - print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; - print STDERR " word phone1 phone2 ... phoneN;\n"; - print STDERR "if the --pron-probs option is used, each line is:\n"; - print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; - print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; - print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; - print STDERR "this is your responsibility.\n\n"; - print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; - print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; - print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; - exit(1); -} - -$lexfn = shift @ARGV; -if (@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2) { - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if ($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - -if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. - while () { - @A = split(" ", $_); - @A == 0 && die "Empty lexicon line."; - foreach $a (@A) { - if ($a eq "") { - die "Bad lexicon line $_ ( is forbidden)"; - } - } - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; # so we only print it on the first arc of the word. - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. 
- if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while () { - @A = split(" ", $_); - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. - $s = $ns; - } elsif (!defined($silphone) || $p ne $silphone) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/make_tlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/make_tlg.sh deleted file mode 100644 index 98694e5540968760f0c27eaf30a6668f4c46c50d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/make_tlg.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# - -if [ -f path.sh ]; then . path.sh; fi - -lm_dir=$1 -src_lang=$2 -tgt_lang=$3 - -arpa_lm=${lm_dir}/lm.arpa -[ ! 
-f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -rm -rf $tgt_lang -cp -r $src_lang $tgt_lang - -# Compose the language model to FST -cat $arpa_lm | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v -i '' | \ - grep -v -i '' | \ - arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \ - tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \ - --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst - - -echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic $tgt_lang/G.fst - -# Compose the token, lexicon and language-model FST into the final decoding graph -fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1; -fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1; - -echo "Composing decoding graph TLG.fst succeeded" -#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/prepare_dict.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/prepare_dict.py deleted file mode 100644 index 8a6a3cfe7cfded0c863637deef0bae2f2ede5557..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/prepare_dict.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -# sys.argv[1]: e2e model unit file(lang_char.txt) -# sys.argv[2]: raw lexicon file -# sys.argv[3]: output lexicon file -# sys.argv[4]: bpemodel - -unit_table = set() -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for line in fin: - unit = line.split()[0] - unit_table.add(unit) - - -def contain_oov(units): - for unit in units: - if unit not in unit_table: - return True - return False - - -bpemode = len(sys.argv) > 4 -if bpemode: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.Load(sys.argv[4]) -lexicon_table = set() -with open(sys.argv[2], 'r', encoding='utf8') as fin, \ - open(sys.argv[3], 'w', encoding='utf8') as fout: - for line in fin: - word = line.split()[0] - if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel - continue - elif word == '': - continue - else: - # each word only has one pronunciation for e2e system - if word in lexicon_table: - continue - if bpemode: - # We assume that the lexicon does not contain code-switch, - # i.e. the word contains both English and Chinese. - # see PR https://github.com/wenet-e2e/wenet/pull/1693 - # and Issue https://github.com/wenet-e2e/wenet/issues/1653 - if word.encode('utf8').isalpha(): - pieces = sp.EncodeAsPieces(word) - else: - pieces = word - if contain_oov(pieces): - print( - 'Ignoring words {}, which contains oov unit'.format( - ''.join(word).strip('▁')) - ) - continue - chars = ' '.join( - [p if p in unit_table else '' for p in pieces]) - else: - # ignore words with OOV - if contain_oov(word): - print('Ignoring words {}, which contains oov unit'.format(word)) - continue - # Optional, append ▁ in front of english word - # we assume the model unit of our e2e system is char now. 
- if word.encode('utf8').isalpha() and '▁' in unit_table: - word = '▁' + word - chars = ' '.join(word) # word is a char list - fout.write('{} {}\n'.format(word, chars)) - lexicon_table.add(word) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/remove_oovs.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/remove_oovs.pl deleted file mode 100644 index ac914c3bd9363eded791cdeb309fd05e980c4f2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/remove_oovs.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script removes lines that contain these OOVs on either the -# third or fourth fields of the line. It is intended to remove arcs -# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). - -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; -} - -$unklist = shift @ARGV; -open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; -while(){ - @A = split(" ", $_); - @A == 1 || die "Bad line in unknown-symbol list: $_"; - $unk{$A[0]} = 1; -} - -$num_removed = 0; -while(<>){ - @A = split(" ", $_); - if(defined $unk{$A[2]} || defined $unk{$A[3]}) { - $num_removed++; - } else { - print; - } -} -print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/rnnt_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/rnnt_token_fst.py deleted file mode 100644 index cc6def1703311ab700a4a01f22c1adda32db9b0d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/rnnt_token_fst.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, 0, phone, phone)) -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/s2eps.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/s2eps.pl deleted file mode 100644 index ffeeb8eb6af3c4f319f31ebff80be388d8f59e1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/fst/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces and with (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } - if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } - } - print join("\t", @A) . "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/git-pre-commit b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/git-pre-commit deleted file mode 100644 index b6e448ed375a0ddf502ce332685de8a99e88dc08..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/git-pre-commit +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -e - -echo "Running pre-commit flake8" -python tools/flake8_hook.py diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/install_srilm.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/install_srilm.sh deleted file mode 100644 index 4aa113c14722a73fd3d3f84430025d44173c207b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/install_srilm.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2022 Binbin Zhang(binbzha@qq.com) - -current_path=`pwd` -current_dir=`basename "$current_path"` - -if [ "tools" != "$current_dir" ]; then - echo "You should run this script in tools/ directory!!" - exit 1 -fi - -! command -v gawk > /dev/null && \ - echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; - -srilm_url="https://github.com/BitSpeech/SRILM/archive/refs/tags/1.7.3.tar.gz" - -if [ ! -f ./srilm.tar.gz ]; then - if ! wget -O ./srilm.tar.gz "$srilm_url"; then - echo 'There was a problem downloading the file.' - echo 'Check you internet connection and try again.' - exit 1 - fi -fi - -tar -zxvf srilm.tar.gz -mv SRILM-1.7.3 srilm - -# set the SRILM variable in the top-level Makefile to this directory. -cd srilm -cp Makefile tmpf - -cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \ - > Makefile || exit 1 -rm tmpf - -make || exit -cd .. - -( - [ ! -z "${SRILM}" ] && \ - echo >&2 "SRILM variable is aleady defined. Undefining..." && \ - unset SRILM - - [ -f ./env.sh ] && . ./env.sh - - [ ! 
-z "${SRILM}" ] && \ - echo >&2 "SRILM config is already in env.sh" && exit - - wd=`pwd` - wd=`readlink -f $wd || pwd` - - echo "export SRILM=$wd/srilm" - dirs="\${PATH}" - for directory in $(cd srilm && find bin -type d ) ; do - dirs="$dirs:\${SRILM}/$directory" - done - echo "export PATH=$dirs" -) >> env.sh - -echo >&2 "Installation of SRILM finished successfully" -echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/k2/make_hlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/k2/make_hlg.sh deleted file mode 100644 index 18c2268487410824ae11b199cf06f37acd717c88..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/k2/make_hlg.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) - -lexion_dir=$1 -lm_dir=$2 -tgt_dir=$3 - -# k2 and icefall updates very fast. Below commits are veryfied in this script. -# k2 3dc222f981b9fdbc8061b3782c3b385514a2d444, icefall 499ac24ecba64f687ff244c7d66baa5c222ecf0f - -# For k2 installation, please refer to https://github.com/k2-fsa/k2/ -python -c "import k2; print(k2.__file__)" -python -c "import torch; import _k2; print(_k2.__file__)" - -# Prepare necessary icefall scripts -if [ ! -d tools/k2/icefall ]; then - git clone --depth 1 https://github.com/k2-fsa/icefall.git tools/k2/icefall -fi -pip3 install -r tools/k2/icefall/requirements.txt -export PYTHONPATH=`pwd`/tools/k2/icefall:`pwd`/tools/k2/icefall/egs/aishell/ASR/local:$PYTHONPATH - -# 8.1 Prepare char based lang -mkdir -p $tgt_dir -python tools/k2/prepare_char.py $lexion_dir/units.txt $lm_dir/wordlist $tgt_dir -echo "Compile lexicon L.pt L_disambig.pt succeeded" - -# 8.2 Prepare G -mkdir -p data/lm -python -m kaldilm \ - --read-symbol-table="$tgt_dir/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $lm_dir/lm.arpa > data/lm/G_3_gram.fst.txt - -# 8.3 Compile HLG -python tools/k2/icefall/egs/aishell/ASR/local/compile_hlg.py --lang-dir $tgt_dir -echo "Compile decoding graph HLG.pt succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/k2/prepare_char.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/k2/prepare_char.py deleted file mode 100644 index 6e05042c42eb280135f6be7cdb3566b185258b90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/k2/prepare_char.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -""" - -This script generates the following files in the directory sys.argv[3]: - - - lexicon.txt - - lexicon_disambig.txt - - L.pt - - L_disambig.pt - - tokens.txt - - words.txt -""" - -import sys -from pathlib import Path -from typing import Dict, List - -import k2 -import torch -from prepare_lang import ( - Lexicon, - add_disambig_symbols, - add_self_loops, - write_lexicon, - write_mapping, -) - - -def lexicon_to_fst_no_sil( - lexicon: Lexicon, - token2id: Dict[str, int], - word2id: Dict[str, int], - need_self_loops: bool = False, -) -> k2.Fsa: - """Convert a lexicon to an FST (in k2 format). - - Args: - lexicon: - The input lexicon. See also :func:`read_lexicon` - token2id: - A dict mapping tokens to IDs. - word2id: - A dict mapping words to IDs. - need_self_loops: - If True, add self-loop to states with non-epsilon output symbols - on at least one arc out of the state. The input label for this - self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. - Returns: - Return an instance of `k2.Fsa` representing the given lexicon. - """ - loop_state = 0 # words enter and leave from here - next_state = 1 # the next un-allocated state, will be incremented as we go - - arcs = [] - - # The blank symbol is defined in local/train_bpe_model.py - assert token2id[""] == 0 - assert word2id[""] == 0 - - eps = 0 - - for word, pieces in lexicon: - assert len(pieces) > 0, f"{word} has no pronunciations" - cur_state = loop_state - - word = word2id[word] - pieces = [ - token2id[i] if i in token2id else token2id[""] for i in pieces - ] - - for i in range(len(pieces) - 1): - w = word if i == 0 else eps - arcs.append([cur_state, next_state, pieces[i], w, 0]) - - cur_state = next_state - next_state += 1 - - # now for the last piece of this word - i = len(pieces) - 1 - w = word if i == 0 else eps - arcs.append([cur_state, loop_state, pieces[i], w, 0]) - - if need_self_loops: - disambig_token = token2id["#0"] - disambig_word = word2id["#0"] - arcs = add_self_loops( - arcs, - disambig_token=disambig_token, - disambig_word=disambig_word, - ) - - final_state = next_state - arcs.append([loop_state, final_state, -1, -1, 0]) - arcs.append([final_state]) - - arcs = sorted(arcs, key=lambda arc: arc[0]) - arcs = [[str(i) for i in arc] for arc in arcs] - arcs = [" ".join(arc) for arc in arcs] - arcs = "\n".join(arcs) - - fsa = k2.Fsa.from_str(arcs, acceptor=False) - return fsa - - -def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool: - """Check if all the given tokens are in token symbol table. - - Args: - token_sym_table: - Token symbol table that contains all the valid tokens. - tokens: - A list of tokens. - Returns: - Return True if there is any token not in the token_sym_table, - otherwise False. - """ - for tok in tokens: - if tok not in token_sym_table: - return True - return False - - -def generate_lexicon( - token_sym_table: Dict[str, int], words: List[str] -) -> Lexicon: - """Generate a lexicon from a word list and token_sym_table. - - Args: - token_sym_table: - Token symbol table that mapping token to token ids. - words: - A list of strings representing words. - Returns: - Return a dict whose keys are words and values are the corresponding - tokens. 
- """ - lexicon = [] - for word in words: - chars = list(word.strip(" \t")) - if contain_oov(token_sym_table, chars): - continue - lexicon.append((word, chars)) - - # The OOV word is - lexicon.append(("", [""])) - return lexicon - - -def generate_tokens(text_file: str) -> Dict[str, int]: - """Generate tokens from the given text file. - - Args: - text_file: - A file that contains text lines to generate tokens. - Returns: - Return a dict whose keys are tokens and values are token ids ranged - from 0 to len(keys) - 1. - """ - token2id: Dict[str, int] = dict() - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - char, index = line.replace('\n', '').split() - assert char not in token2id - token2id[char] = int(index) - assert token2id[''] == 0 - return token2id - - -def generate_words(text_file: str) -> Dict[str, int]: - """Generate words from the given text file. - - Args: - text_file: - A file that contains text lines to generate words. - Returns: - Return a dict whose keys are words and values are words ids ranged - from 0 to len(keys) - 1. - """ - words = [] - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - word = line.replace('\n', '') - assert word not in words - words.append(word) - words.sort() - - # We put '' '' at begining of word2id - # '#0', '', '' at end of word2id - words = [word for word in words - if word not in ['', '', '#0', '', '']] - words.insert(0, '') - words.insert(1, '') - words.append('#0') - words.append('') - words.append('') - word2id = {j: i for i, j in enumerate(words)} - return word2id - - -def main(): - token2id = generate_tokens(sys.argv[1]) - word2id = generate_words(sys.argv[2]) - tgt_dir = Path(sys.argv[3]) - - words = [word for word in word2id.keys() - if word not in - ["", "!SIL", "", "", "#0", "", ""]] - lexicon = generate_lexicon(token2id, words) - - lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) - next_token_id = max(token2id.values()) + 1 - for i in range(max_disambig + 1): - disambig = f"#{i}" - assert disambig not in token2id - token2id[disambig] = next_token_id - next_token_id += 1 - - write_mapping(tgt_dir / "tokens.txt", token2id) - write_mapping(tgt_dir / "words.txt", word2id) - write_lexicon(tgt_dir / "lexicon.txt", lexicon) - write_lexicon(tgt_dir / "lexicon_disambig.txt", lexicon_disambig) - - L = lexicon_to_fst_no_sil( - lexicon, - token2id=token2id, - word2id=word2id, - ) - L_disambig = lexicon_to_fst_no_sil( - lexicon_disambig, - token2id=token2id, - word2id=word2id, - need_self_loops=True, - ) - torch.save(L.as_dict(), tgt_dir / "L.pt") - torch.save(L_disambig.as_dict(), tgt_dir / "L_disambig.pt") - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/latency_metrics.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/latency_metrics.py deleted file mode 100644 index df2d8eee45f8e2d7c8536f208d44fafaeac3341f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/latency_metrics.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2022 Horizon Inc. (author: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import argparse -import logging -import librosa -import torch -import torchaudio -import yaml - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.font_manager as fm -import torchaudio.compliance.kaldi as kaldi - -from wenet.utils.init_model import init_model -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.mask import make_pad_mask -from wenet.utils.common import replace_duplicates_with_blank - - -def get_args(): - parser = argparse.ArgumentParser( - description='Analyze latency and plot CTC-Spike.') - parser.add_argument('--config', required=True, - type=str, help='configration') - parser.add_argument('--gpu', - type=int, - default=0, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--ckpt', required=True, - type=str, help='model checkpoint') - parser.add_argument('--tag', required=True, - type=str, help='image subtitle') - parser.add_argument('--wavscp', required=True, - type=str, help='wav.scp') - parser.add_argument('--alignment', required=True, - type=str, help='force alignment, generated by Kaldi.') - parser.add_argument('--chunk_size', required=True, - type=int, help='chunk size') - parser.add_argument('--left_chunks', default=-1, - type=int, help='left chunks') - parser.add_argument('--font', required=True, - type=str, help='font file') - parser.add_argument('--dict', required=True, - type=str, help='dict file') - parser.add_argument('--result_dir', required=True, - type=str, help='saving pdf') - parser.add_argument('--model_type', default='ctc', - choices=['ctc', 'transducer'], - help='show latency metrics from ctc models or rnn-t models') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - torch.manual_seed(777) - - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - symbol_table = read_symbol_table(args.dict) - char_dict = {v: k for k, v in symbol_table.items()} - - # 1. Load model - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - - model = init_model(conf) - load_checkpoint(model, args.ckpt) - model = model.eval().to(device) - - subsampling = model.encoder.embed.subsampling_rate - eos = model.eos_symbol() - - with open(args.wavscp, 'r') as fin: - wavs = fin.readlines() - - # 2. 
Forward model (get streaming_timestamps) - timestamps = {} - for idx, wav in enumerate(wavs): - if idx % 100 == 0: - logging.info("processed {}.".format(idx)) - key, wav = wav.strip().split(' ', 1) - waveform, sr = torchaudio.load(wav) - resample_rate = conf['dataset_conf']['resample_conf']['resample_rate'] - waveform = torchaudio.transforms.Resample( - orig_freq=sr, new_freq=resample_rate)(waveform) - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank( - waveform, - num_mel_bins=conf['dataset_conf']['fbank_conf']['num_mel_bins'], - frame_length=conf['dataset_conf']['fbank_conf']['frame_length'], - frame_shift=conf['dataset_conf']['fbank_conf']['frame_shift'], - dither=0.0, energy_floor=0.0, - sample_frequency=resample_rate, - ) - - speech = mat.unsqueeze(0).to(device) - speech_lengths = torch.tensor([mat.size(0)]).to(device) - - # Let's assume batch_size = 1 - encoder_out, encoder_mask = model.encoder( - speech, speech_lengths, args.chunk_size, args.left_chunks) - - maxlen = encoder_out.size(1) # (B, maxlen, encoder_dim) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # CTC greedy search - if args.model_type == 'ctc': - ctc_probs = model.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - topk_prob = topk_prob.view(1, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen) - topk_prob = topk_prob.masked_fill_(mask, 0.0) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - hyps = [replace_duplicates_with_blank(hyp) for hyp in hyps] - scores = [prob.tolist() for prob in topk_prob] - timestamps[key] = [hyps[0], scores[0], wav] - - if args.model_type == 'transducer': - hyps = [] - scores = [] - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = 1 - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, - padding, cache) - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - scores.append(torch.max(joint_out_probs).item()) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or \ - per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - hyps.append(model.blank) - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - timestamps[key] = [hyps, scores, wav] - - # 3. 
Analyze latency - with open(args.alignment, 'r') as fin: - aligns = fin.readlines() - not_found, len_unequal, ignored = 0, 0, 0 - datas = [] - for align in aligns: - key, align = align.strip().split(' ', 1) - if key not in timestamps: - not_found += 1 - continue - fa, st = [], [] # force_alignment, streaming_timestamps - text_fa, text_st = "", "" - for i, token in enumerate(align.split()): - if token != '': - text_fa += token - # NOTE(xcsong): W/O subsample - fa.append(i * 10) - # ignore alignment_errors >= 70ms - frames_fa = len(align.split()) - frames_st = len(timestamps[key][0]) * subsampling - if abs(frames_st - frames_fa) >= 7: - ignored += 1 - continue - for i, token_id in enumerate(timestamps[key][0]): - if token_id != 0: - text_st += char_dict[token_id] - # NOTE(xcsong): W subsample - st.append(i * subsampling * 10) - if len(fa) != len(st): - len_unequal += 1 - continue - # datas[i] = [key, text_fa, text_st, list_of_diff, - # FirstTokenDelay, LastTokenDelay, AvgTokenDelay, - # streaming_timestamps, force_alignment] - datas.append([key, text_fa, text_st, - [a - b for a, b in zip(st, fa)], - st[0] - fa[0], st[-1] - fa[-1], - (sum(st) - sum(fa)) / len(st), - timestamps[key], align.split()]) - - logging.info("not found: {}, length unequal: {}, ignored: {}, \ - valid samples: {}".format(not_found, len_unequal, ignored, len(datas))) - - # 4. Plot and print - num_datas = len(datas) - names = ['FirstTokenDelay', 'LastTokenDelay', 'AvgTokenDelay'] - names_index = [4, 5, 6] - parts = ['max', 'P90', 'P75', 'P50', 'P25', 'min'] - parts_index = [num_datas - 1, int(num_datas * 0.90), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - for name, name_idx in zip(names, names_index): - def f(name_idx=name_idx): - return name_idx - datas.sort(key=lambda x: x[f()]) - logging.info("==========================") - for p, i in zip(parts, parts_index): - data = datas[i] - # i.e., LastTokenDelay P90: 270.000 ms (wav_id: BAC009S0902W0144) - logging.info("{} {}: {:.3f} ms (wav_id: {})".format( - name, p, data[f()], datas[i][0])) - - font = fm.FontProperties(fname=args.font) - plt.rcParams['axes.unicode_minus'] = False - # we will have 2 sub-plots (force-align + streaming timestamps) - # plus one wav-plot - fig, axes = plt.subplots(figsize=(60, 60), nrows=3, ncols=1) - for j in range(2): - if j == 0: - # subplot-0: streaming_timestamps - plt_prefix = args.tag + "_" + name + "_" + p - x = np.arange(len(data[7][0])) * subsampling - hyps, scores = data[7][0], data[7][1] - else: - # subplot-1: force_alignments - plt_prefix = "force_alignment" - x = np.arange(len(data[8])) - hyps = [symbol_table[d] for d in data[8]] - scores = [0.0] * len(data[8]) - axes[j].set_title(plt_prefix, fontsize=30) - for frame, token, prob in zip(x, hyps, scores): - if char_dict[token] != '': - axes[j].bar( - frame, np.exp(prob), - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].text( - frame, np.exp(prob), - '{} {:.3f} {}'.format( - char_dict[token], np.exp(prob), frame), - fontdict=dict(fontsize=24), - fontproperties=font, - ) - else: - axes[j].bar( - frame, 0.01, - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].tick_params(labelsize=25) - - # subplot-2: wav - # wav, hardcode sample_rate to 16000 - samples, sr = librosa.load(data[7][2], sr=16000) - time = np.arange(0, len(samples)) * (1.0 / sr) - axes[-1].plot(time, samples) - - # i.e., RESULT_DIR/LTD_P90_120ms_BAC009S0768W0342.pdf - plt.savefig(args.result_dir + "/" + name + "_" + - p + "_" + str(data[f()]) 
+ "ms" + "_" + data[0] + ".pdf") - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/make_raw_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/make_raw_list.py deleted file mode 100644 index 2f84f015542bb38da027b8ea61e8638f873cec33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/make_raw_list.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('output_file', help='output list file') - args = parser.parse_args() - - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - if args.segments is not None: - segments_table = {} - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - with open(args.text_file, 'r', encoding='utf8') as fin, \ - open(args.output_file, 'w', encoding='utf8') as fout: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if args.segments is None: - assert key in wav_table - wav = wav_table[key] - line = dict(key=key, wav=wav, txt=txt) - else: - assert key in segments_table - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - line = dict(key=key, wav=wav, txt=txt, start=start, end=end) - json_line = json.dumps(line, ensure_ascii=False) - fout.write(json_line + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/make_shard_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/make_shard_list.py deleted file mode 100644 index 1f7d82829808c9cc181bbc5e0f60cccef8795bae..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/make_shard_list.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import io -import logging -import os -import tarfile -import time -import multiprocessing - -import torch -import torchaudio -import torchaudio.backend.sox_io_backend as sox - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def write_tar_file(data_list, - no_segments, - tar_file, - resample=16000, - index=0, - total=1): - logging.info('Processing {} {}/{}'.format(tar_file, index, total)) - read_time = 0.0 - save_time = 0.0 - write_time = 0.0 - with tarfile.open(tar_file, "w") as tar: - prev_wav = None - for item in data_list: - if no_segments: - key, txt, wav = item - else: - key, txt, wav, start, end = item - - suffix = wav.split('.')[-1] - assert suffix in AUDIO_FORMAT_SETS - if no_segments: - ts = time.time() - with open(wav, 'rb') as fin: - data = fin.read() - read_time += (time.time() - ts) - else: - if wav != prev_wav: - ts = time.time() - waveforms, sample_rate = sox.load(wav, normalize=False) - read_time += (time.time() - ts) - prev_wav = wav - start = int(start * sample_rate) - end = int(end * sample_rate) - audio = waveforms[:1, start:end] - - # resample - if sample_rate != resample: - if not audio.is_floating_point(): - # normalize the audio before resample - # because resample can't process int audio - audio = audio / (1 << 15) - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - audio = (audio * (1 << 15)).short() - else: - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - - ts = time.time() - f = io.BytesIO() - sox.save(f, audio, resample, format="wav", bits_per_sample=16) - # Save to wav for segments file - suffix = "wav" - f.seek(0) - data = f.read() - save_time += (time.time() - ts) - - assert isinstance(txt, str) - ts = time.time() - txt_file = key + '.txt' - txt = txt.encode('utf8') - txt_data = io.BytesIO(txt) - txt_info = tarfile.TarInfo(txt_file) - txt_info.size = len(txt) - tar.addfile(txt_info, txt_data) - - wav_file = key + '.' 
+ suffix - wav_data = io.BytesIO(data) - wav_info = tarfile.TarInfo(wav_file) - wav_info.size = len(data) - tar.addfile(wav_info, wav_data) - write_time += (time.time() - ts) - logging.info('read {} save {} write {}'.format(read_time, save_time, - write_time)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--num_utts_per_shard', - type=int, - default=1000, - help='num utts per shard') - parser.add_argument('--num_threads', - type=int, - default=1, - help='num threads for make shards') - parser.add_argument('--prefix', - default='shards', - help='prefix of shards tar file') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('--resample', - type=int, - default=16000, - help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('shards_dir', help='output shards dir') - parser.add_argument('shards_list', help='output shards list file') - args = parser.parse_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - torch.set_num_threads(1) - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - no_segments = True - segments_table = {} - if args.segments is not None: - no_segments = False - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - data = [] - with open(args.text_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if no_segments: - assert key in wav_table - wav = wav_table[key] - data.append((key, txt, wav)) - else: - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - data.append((key, txt, wav, start, end)) - - num = args.num_utts_per_shard - chunks = [data[i:i + num] for i in range(0, len(data), num)] - os.makedirs(args.shards_dir, exist_ok=True) - - # Using thread pool to speedup - pool = multiprocessing.Pool(processes=args.num_threads) - shards_list = [] - tasks_list = [] - num_chunks = len(chunks) - for i, chunk in enumerate(chunks): - tar_file = os.path.join(args.shards_dir, - '{}_{:09d}.tar'.format(args.prefix, i)) - shards_list.append(tar_file) - pool.apply_async( - write_tar_file, - (chunk, no_segments, tar_file, args.resample, i, num_chunks)) - - pool.close() - pool.join() - - with open(args.shards_list, 'w', encoding='utf8') as fout: - for name in shards_list: - fout.write(name + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/merge_scp2txt.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/merge_scp2txt.py deleted file mode 100644 index 51f1c42f272f0fd9fec0a7d69ee860d2f1eb6158..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/merge_scp2txt.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -from distutils.util import strtobool -from io import open -import logging -import sys - -PY2 = sys.version_info[0] == 2 -sys.stdin = codecs.getreader('utf-8')(sys.stdin if PY2 else 
sys.stdin.buffer) -sys.stdout = codecs.getwriter('utf-8')( - sys.stdout if PY2 else sys.stdout.buffer) - - -# Special types: -def shape(x): - """Change str to List[int] - - >>> shape('3,5') - [3, 5] - >>> shape(' [3, 5] ') - [3, 5] - - """ - - # x: ' [3, 5] ' -> '3, 5' - x = x.strip() - if x[0] == '[': - x = x[1:] - if x[-1] == ']': - x = x[:-1] - - return list(map(int, x.split(','))) - - -def get_parser(): - parser = argparse.ArgumentParser( - description='Given each file paths with such format as ' - '::. type> can be omitted and the default ' - 'is "str". e.g. {} ' - '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' - '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' - '--output-scps text:data/text shape:data/utt2text_shape:shape ' - '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--input-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the inputs') - parser.add_argument('--output-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the outputs') - parser.add_argument('--scps', - type=str, - nargs='+', - default=[], - help='The files except for the input and outputs') - parser.add_argument('--verbose', - '-V', - default=1, - type=int, - help='Verbose option') - parser.add_argument('--allow-one-column', - type=strtobool, - default=False, - help='Allow one column in input scp files. ' - 'In this case, the value will be empty string.') - parser.add_argument('--out', - '-O', - type=str, - help='The output filename. ' - 'If omitted, then output to sys.stdout') - return parser - - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - args.scps = [args.scps] - - # logging info - logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - if args.verbose > 0: - logging.basicConfig(level=logging.INFO, format=logfmt) - else: - logging.basicConfig(level=logging.WARN, format=logfmt) - - inputs = {} - assert (len(args.input_scps) == 1) - for f in args.input_scps[0]: - arr = f.strip().split(':') - inputs[arr[0]] = arr[1] - assert ('feat' in inputs) - assert ('shape' in inputs) - - outputs = {} - assert (len(args.output_scps) == 1) - for f in args.output_scps[0]: - arr = f.strip().split(':') - outputs[arr[0]] = arr[1] - assert ('shape' in outputs) - assert ('text' in outputs) - assert ('token' in outputs) - assert ('tokenid' in outputs) - - files = [ - inputs['feat'], inputs['shape'], outputs['text'], outputs['token'], - outputs['tokenid'], outputs['shape'] - ] - fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape'] - fids = [open(f, 'r', encoding='utf-8') for f in files] - - if args.out is None: - out = sys.stdout - else: - out = open(args.out, 'w', encoding='utf-8') - done = False - while not done: - for i, fid in enumerate(fids): - line = fid.readline() - if line == '': - done = True - break - arr = line.strip().split() - content = ' '.join(arr[1:]) - if i == 0: - out.write('utt:{}'.format(arr[0])) - out.write('\t') - out.write('{}:{}'.format(fields[i], content)) - out.write('\n') - - for f in fids: - f.close() - if args.out is not None: - out.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/onnx2horizonbin.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/onnx2horizonbin.py deleted file mode 100644 index 
a94b647fb19d1446d4bc506c399c85677dddde9f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/onnx2horizonbin.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. specific decoding method: ctc_greedy_search -""" - -import argparse -import copy -import logging -import os -import sys -import random -import torch -import yaml -import numpy as np - -from torch.utils.data import DataLoader - -from wenet.utils.common import remove_duplicates_and_blank -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import to_numpy -from wenet.bin.export_onnx_bpu import export_encoder, export_ctc - - -try: - import hbdk # noqa: F401 - import horizon_nn # noqa: F401 - from horizon_tc_ui import HB_ONNXRuntime -except ImportError: - print('Please install hbdk,horizon_nn,horizon_tc_ui !') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def save_data(tensor, dirs, prefix): - if tensor.requires_grad: - data = tensor.detach().numpy().astype(np.float32) - else: - data = tensor.numpy().astype(np.float32) - os.makedirs(dirs, exist_ok=True) - data.tofile(dirs + "/" + prefix + ".bin") - - -def make_calibration_data(enc, args, conf): - conf['shuffle'] = True - logger.info(conf) - dataset = Dataset( - "shard", args.cali_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - cal_data_dir = os.path.join(args.output_dir, 
'cal_data_dir') - for batch_idx, batch in enumerate(dataloader): - if batch_idx >= args.max_samples: - break - if batch_idx % 100 == 0: - logger.info("processed {} samples.".format(batch_idx)) - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - - # Feed forward overlap input step by step - random_high = (num_frames - context) // stride - num_rand = random.randint(0, random_high) - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - if i == num_rand: - save_data(chunk, "{}/chunk".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_cache, "{}/att_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(cnn_cache, "{}/cnn_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_mask, "{}/att_mask".format(cal_data_dir), - prefix + "." + str(i)) - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - # NOTE(xcsong): It's fast to calibrate ctc.onnx, - # so it's okay to save all chunks - save_data(y, "{}/hidden".format(cal_data_dir), - prefix + "." 
+ str(i)) - - -def check_wer(enc, ctc, args, conf): - conf['shuffle'] = False - dataset = Dataset( - "shard", args.wer_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - char_dict = {v: k for k, v in args.symbol_table.items()} - eos = len(char_dict) - 1 - - enc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_encoder/encoder_quantized_model.onnx") - ctc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_ctc/ctc_quantized_model.onnx") - torch_file = open(args.output_dir + "/torch_text", 'w', encoding="utf-8") - onnx_file = open(args.output_dir + "/onnx_text", 'w', encoding="utf-8") - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - for batch_idx, batch in enumerate(dataloader): - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - - # Feed forward overlap input step by step - torch_out, onnx_out = [], [] - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - # Torch model - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - torch_out.append(ctc.forward(y).transpose(1, 3).squeeze(2)) - # Quantized onnx model - ort_inputs = { - 'chunk': to_numpy(chunk), 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': to_numpy(att_mask)} - ort_outs = enc_session.run_feature( - enc_session.output_names, ort_inputs, input_offset=0) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_y = ctc_session.run_feature( - ctc_session.output_names, {'hidden': ort_outs[0]}, input_offset=0) - onnx_out.append(torch.from_numpy( - np.squeeze(onnx_y[0].transpose(0, 3, 2, 1), axis=2))) - - def post_process(list_out, file_obj, keys): - probs = torch.cat(list_out, dim=1) - maxlen = probs.size(1) - topk_prob, topk_index = probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - hyps = 
[hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - for i, key in enumerate(keys): - content = '' - for w in hyps[i]: - if w == eos: - break - content += char_dict[w] - file_obj.write('{} {}\n'.format(key, content)) - return key, content - - if len(torch_out) > 0 and len(onnx_out) > 0: - key, content = post_process(torch_out, torch_file, keys) - logger.info('torch: {} {}'.format(key, content)) - key, content = post_process(onnx_out, onnx_file, keys) - logger.info('onnx : {} {}'.format(key, content)) - torch_file.close() - onnx_file.close() - - -def generate_config(enc_session, ctc_session, args): - template = """ -# 模型参数组 -model_parameters: - # 原始Onnx浮点模型文件 - onnx_model: '{}' - # 转换的目标AI芯片架构 - march: 'bernoulli2' - # 模型转换输出的用于上板执行的模型文件的名称前缀 - output_model_file_prefix: '{}' - # 模型转换输出的结果的存放目录 - working_dir: '{}' - # 指定转换后混合异构模型是否保留输出各层的中间结果的能力 - layer_out_dump: False - # 转换过程中日志生成级别 - log_level: 'debug' -# 输入信息参数组 -input_parameters: - # 原始浮点模型的输入节点名称 - input_name: '{}' - # 原始浮点模型的输入数据格式(数量/顺序与input_name一致) - input_type_train: '{}' - # 原始浮点模型的输入数据排布(数量/顺序与input_name一致) - input_layout_train: '{}' - # 原始浮点模型的输入数据尺寸 - input_shape: '{}' - # 网络实际执行时,输入给网络的batch_size 默认值为1 - # input_batch: 1 - # 在模型中添加的输入数据预处理方法 - norm_type: '{}' - # 预处理方法的图像减去的均值; 如果是通道均值,value之间必须用空格分隔 - # mean_value: '' - # 预处理方法的图像缩放比例,如果是通道缩放比例,value之间必须用空格分隔 - # scale_value: '' - # 转换后混合异构模型需要适配的输入数据格式(数量/顺序与input_name一致) - input_type_rt: '{}' - # 输入数据格式的特殊制式 - input_space_and_range: '' - # 转换后混合异构模型需要适配的输入数据排布(数量/顺序与input_name一致) - input_layout_rt: '{}' -# 校准参数组 -calibration_parameters: - # 模型校准使用的标定样本的存放目录 - cal_data_dir: '{}' - # 开启图片校准样本自动处理(skimage read resize到输入节点尺寸) - preprocess_on: False - # 校准使用的算法类型 - calibration_type: '{}' - # max 校准方式的参数 - max_percentile: 1.0 - # 强制指定OP在CPU上运行 - run_on_cpu: '{}' - # 强制指定OP在BPU上运行 - run_on_bpu: '{}' -# 编译参数组 -compiler_parameters: - # 编译策略选择 - compile_mode: 'latency' - # 是否打开编译的debug信息 - debug: False - # 模型运行核心数 - core_num: 1 - # 模型编译的优化等级选择 - optimize_level: 'O3' -""" - output_dir = os.path.realpath(args.output_dir) - cal_data_dir = os.path.join(output_dir, 'cal_data_dir') - os.makedirs(cal_data_dir, exist_ok=True) - enc_dic = enc_session.get_modelmeta().custom_metadata_map - enc_onnx_path = os.path.join(output_dir, 'encoder.onnx') - enc_log_path = os.path.join(output_dir, 'hb_makertbin_output_encoder') - enc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in enc_dic['input_name'].split(';')]) - ctc_dic = ctc_session.get_modelmeta().custom_metadata_map - ctc_onnx_path = os.path.join(output_dir, 'ctc.onnx') - ctc_log_path = os.path.join(output_dir, 'hb_makertbin_output_ctc') - ctc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in ctc_dic['input_name'].split(';')]) - enc_config = template.format( - enc_onnx_path, "encoder", enc_log_path, - enc_dic['input_name'], enc_dic['input_type'], - enc_dic['input_layout_train'], enc_dic['input_shape'], - enc_dic['norm_type'], enc_dic['input_type'], enc_dic['input_layout_rt'], - enc_cal_data, args.calibration_type, args.extra_ops_run_on_cpu, "") - ctc_config = template.format( - ctc_onnx_path, "ctc", ctc_log_path, - ctc_dic['input_name'], ctc_dic['input_type'], - ctc_dic['input_layout_train'], ctc_dic['input_shape'], - ctc_dic['norm_type'], ctc_dic['input_type'], ctc_dic['input_layout_rt'], - ctc_cal_data, "default", "", "") - with open(output_dir + "/config_encoder.yaml", "w") as enc_yaml: - enc_yaml.write(enc_config) - with open(output_dir + 
"/config_ctc.yaml", "w") as ctc_yaml: - ctc_yaml.write(ctc_config) - - -def get_args(): - parser = argparse.ArgumentParser(description='convert onnx to horizon .bin') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - parser.add_argument('--dict', type=str, required=True, help='dict file') - parser.add_argument('--max_samples', type=int, required=True, - help='maximum samples') - parser.add_argument('--cali_datalist', type=str, default=None, - help='make calibration data') - parser.add_argument('--wer_datalist', type=str, default=None, - help='check wer') - parser.add_argument('--wer_text', type=str, default=None, - help='check wer') - parser.add_argument('--bpe_model', default=None, type=str, - help='bpe model for english part') - parser.add_argument('--ln_run_on_bpu', action='store_true', - help='layernorm running on bpu') - parser.add_argument('--extra_ops_run_on_cpu', type=str, default=None, - help='extra operations running on cpu.') - parser.add_argument('--calibration_type', type=str, default='default', - help='kl / max / default.') - return parser - - -if __name__ == '__main__': - random.seed(777) - parser = get_args() - args = parser.parse_args() - # NOTE(xcsong): X3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(conf) - load_checkpoint(model, args.checkpoint) - model.eval() - - symbol_table = read_symbol_table(args.dict) - args.symbol_table = symbol_table - args.feature_size = conf['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - logger.info("Stage-1: Export onnx") - enc, enc_session = export_encoder(model, args) - ctc, ctc_session = export_ctc(model, args) - - conf = copy.deepcopy(conf['dataset_conf']) - conf['filter_conf']['max_length'] = 102400 - conf['filter_conf']['min_length'] = 0 - conf['filter_conf']['token_max_length'] = 102400 - conf['filter_conf']['token_min_length'] = 0 - conf['filter_conf']['max_output_input_ratio'] = 102400 - conf['filter_conf']['min_output_input_ratio'] = 0 - conf['speed_perturb'] = False - conf['spec_aug'] = False - conf['spec_sub'] = False - conf['spec_trim'] = False - conf['shuffle'] = False - conf['sort'] = False - if 'fbank_conf' in conf: - conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in conf: - conf['mfcc_conf']['dither'] = 0.0 - conf['batch_conf']['batch_type'] = "static" - conf['batch_conf']['batch_size'] = 1 - - if args.cali_datalist is not None: - logger.info("Stage-2: Generate config") - # FIXME(xcsong): Remove hard code - logger.info("torch version: {}".format(torch.__version__)) - if int(torch.__version__[:4].split('.')[1]) >= 13: - args.extra_ops_run_on_cpu = "/Split;" + \ - "/encoders.0/self_attn/Split;/encoders.1/self_attn/Split;" + \ - 
"/encoders.2/self_attn/Split;/encoders.3/self_attn/Split;" + \ - "/encoders.4/self_attn/Split;/encoders.5/self_attn/Split;" + \ - "/encoders.6/self_attn/Split;/encoders.7/self_attn/Split;" + \ - "/encoders.8/self_attn/Split;/encoders.9/self_attn/Split;" + \ - "/encoders.10/self_attn/Split;/encoders.11/self_attn/Split;" + \ - "/encoders.0/self_attn/Mul;/encoders.1/self_attn/Mul;" + \ - "/encoders.2/self_attn/Mul;/encoders.3/self_attn/Mul;" + \ - "/encoders.4/self_attn/Mul;/encoders.5/self_attn/Mul;" + \ - "/encoders.6/self_attn/Mul;/encoders.7/self_attn/Mul;" + \ - "/encoders.8/self_attn/Mul;/encoders.9/self_attn/Mul;" + \ - "/encoders.10/self_attn/Mul;/encoders.11/self_attn/Mul;" - else: - args.extra_ops_run_on_cpu = "Split_17;Split_67;Split_209;" + \ - "Split_351;Split_493;Split_635;Split_777;Split_919;Split_1061;" + \ - "Split_1203;Split_1345;Split_1487;Split_1629;" + \ - "Mul_72;Mul_214;Mul_356;Mul_498;Mul_640;Mul_782;" + \ - "Mul_924;Mul_1066;Mul_1208;Mul_1350;Mul_1492;Mul_1634;" - generate_config(enc_session, ctc_session, args) - - logger.info("Stage-3: Make calibration data") - make_calibration_data(enc, args, conf) - - output_dir = os.path.realpath(args.output_dir) - logger.info("Stage-4: Make ctc.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_ctc".format(output_dir) + - " && cd hb_makertbin_log_ctc &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_ctc.yaml") - ) - logger.info("Stage-5: Make encoder.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_encoder ".format(output_dir) + - " && cd hb_makertbin_log_encoder &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_encoder.yaml") - ) - - if args.wer_datalist is not None: - logger.info("Stage-6: Check wer between torch model and quantized onnx") - assert args.wer_text is not None - check_wer(enc, ctc, args, conf) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/torch_text", - args.output_dir + "/torch_wer") - ) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/onnx_text", - args.output_dir + "/onnx_wer") - ) - os.system("tail {} {}".format( - args.output_dir + "/torch_wer", args.output_dir + "/onnx_wer")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/parse_options.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/parse_options.sh deleted file mode 100644 index 34476fdb37a4b14d5fe6e0edbebe97e760d2be5a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Parse command-line options. 
-# To be sourced by another script (as in ". parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/perturb_data_dir_speed.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/perturb_data_dir_speed.sh deleted file mode 100644 index 901a4882e6481ae269067b0fe7175dba62c4db9e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/perturb_data_dir_speed.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# 2020 @kamo-naoyuki -# This file was copied from Kaldi and -# I deleted parts related to wav duration -# because we shouldn't use kaldi's command here -# and we don't need the files actually. 
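perturb_data_dir_speed.sh (below) never resamples audio on disk: it prefixes every utterance and speaker ID with `sp<factor>-` and rewrites `wav.scp` so that `sox ... speed <factor>` is applied on the fly when the entry is read. A minimal Python sketch of that rewrite for the plain-filename case; the IDs, path, and 0.9 factor here are illustrative, not taken from this repo:

```python
# Sketch of the wav.scp / utt2spk rewrite performed by perturb_data_dir_speed.sh
# for the simple "utt-id wav-path" case (no segments file). Values are made up.

def perturb_entries(wav_scp_lines, utt2spk_lines, factor=0.9):
    prefix = f"sp{factor}-"
    new_wav_scp, new_utt2spk = [], []
    for line in wav_scp_lines:
        utt, path = line.split(maxsplit=1)
        # sox changes playback speed on the fly; nothing is written to disk
        new_wav_scp.append(f"{prefix}{utt} sox {path} -t wav - speed {factor} |")
    for line in utt2spk_lines:
        utt, spk = line.split()
        new_utt2spk.append(f"{prefix}{utt} {prefix}{spk}")
    return new_wav_scp, new_utt2spk

wav, u2s = perturb_entries(["utt1 /data/wav/utt1.wav"], ["utt1 spkA"])
print(wav[0])  # sp0.9-utt1 sox /data/wav/utt1.wav -t wav - speed 0.9 |
print(u2s[0])  # sp0.9-utt1 sp0.9-spkA
```

When a `segments` file exists, the script additionally divides each start/end time by the same factor, which is why it rewrites `segments` as well.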
- -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 2014 Tom Ko -# 2018 Emotech LTD (author: Pawel Swietojanski) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# wav.scp -# spk2utt -# utt2spk -# text -# -# It generates the files which are used for perturbing the speed of the original data. - -export LC_ALL=C -set -euo pipefail - -if [[ $# != 3 ]]; then - echo "Usage: perturb_data_dir_speed.sh " - echo "e.g.:" - echo " $0 0.9 data/train_si284 data/train_si284p" - exit 1 -fi - -factor=$1 -srcdir=$2 -destdir=$3 -label="sp" -spk_prefix="${label}${factor}-" -utt_prefix="${label}${factor}-" - -#check is sox on the path - -! command -v sox &>/dev/null && echo "sox: command not found" && exit 1; - -if [[ ! -f ${srcdir}/utt2spk ]]; then - echo "$0: no such file ${srcdir}/utt2spk" - exit 1; -fi - -if [[ ${destdir} == "${srcdir}" ]]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -mkdir -p "${destdir}" - -<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map" -<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map" -<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map" -if [[ ! -f ${srcdir}/utt2uniq ]]; then - <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq" -else - <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq" -fi - - -<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \ - utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk - -utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt - -if [[ -f ${srcdir}/segments ]]; then - - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \ - utils/apply_map.pl -f 2 "${destdir}"/reco_map | \ - awk -v factor="${factor}" \ - '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \ - >"${destdir}"/segments - - utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - if [[ -f ${srcdir}/reco2file_and_channel ]]; then - utils/apply_map.pl -f 1 "${destdir}"/reco_map \ - <"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel - fi - -else # no segments->wav indexed by utterance. 
- if [[ -f ${srcdir}/wav.scp ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - fi -fi - -if [[ -f ${srcdir}/text ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text -fi -if [[ -f ${srcdir}/spk2gender ]]; then - utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender -fi -if [[ -f ${srcdir}/utt2lang ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang -fi - -rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null -echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}" - -utils/validate_data_dir.sh --no-feats --no-text "${destdir}" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/reduce_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/reduce_data_dir.sh deleted file mode 100644 index 16194dcc7309a646041181a698c53cd4f46e618b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/reduce_data_dir.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# koried, 10/29/2012 - -# Reduce a data set based on a list of turn-ids - -help_message="usage: $0 srcdir turnlist destdir" - -if [ $1 == "--help" ]; then - echo "${help_message}" - exit 0; -fi - -if [ $# != 3 ]; then - echo "${help_message}" - exit 1; -fi - -srcdir=$1 -reclist=$2 -destdir=$3 - -if [ ! -f ${srcdir}/utt2spk ]; then -echo "$0: no such file $srcdir/utt2spk" -exit 1; -fi - -function do_filtering { -# assumes the utt2spk and spk2utt files already exist. - [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp - [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text - [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames - [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender - [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp - if [ -f ${srcdir}/segments ]; then - utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments - awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. 
- [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/reco2file_and_channel ] && \ - utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel - - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm - rm ${destdir}/reco - fi - srcutts=$(wc -l < ${srcdir}/utt2spk) - destutts=$(wc -l < ${destdir}/utt2spk) - echo "Reduced #utt from $srcutts to $destutts" -} - -mkdir -p ${destdir} - -# filter the utt2spk based on the set of recordings -utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk - -utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt -do_filtering; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/remove_longshortdata.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/remove_longshortdata.py deleted file mode 100644 index 7e92f8a424d2d717acf6fc1db5503f79ba38a898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/remove_longshortdata.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='remove too long or too short data in format.data') - parser.add_argument('--data_file', - type=str, - help='input format data') - parser.add_argument('--output_data_file', - type=str, - help='output format data') - parser.add_argument('--min_input_len', type=float, - default=0, - help='minimum input seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--max_input_len', type=float, - default=20, - help='maximum output seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--min_output_len', type=float, - default=0, help='minimum input seq length, in modeling units') - parser.add_argument('--max_output_len', type=float, - default=500, - help='maximum output seq length, in modeling units') - parser.add_argument('--min_output_input_ratio', type=float, default=0.05, - help='minimum output seq length/output seq length ratio') - parser.add_argument('--max_output_input_ratio', type=float, default=10, - help='maximum output seq length/output seq length ratio') - args = parser.parse_args() - - data_file = args.data_file - output_data_file = args.output_data_file - min_input_len = args.min_input_len - max_input_len = args.max_input_len - min_output_len = args.min_output_len - max_output_len = args.max_output_len - min_output_input_ratio = args.min_output_input_ratio - max_output_input_ratio = args.max_output_input_ratio - - with open(data_file, 'r') as f, open(output_data_file, 'w') as fout: - for l in f: - l = l.strip() - if l: - items = l.strip().split('\t') - token_shape = items[6] - feature_shape = items[2] - feat_len = float(feature_shape.split(':')[1].split(',')[0]) - token_len = float(token_shape.split(':')[1].split(',')[0]) - condition = [feat_len > min_input_len, - feat_len < max_input_len, - token_len > min_output_len, - token_len < max_output_len, - token_len / feat_len > min_output_input_ratio, - token_len / feat_len < max_output_input_ratio, - ] - if all(condition): - fout.write('{}\n'.format(l)) - continue diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/segment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/segment.py deleted file mode 100644 index a1a7f93a05fbaf42ca09c26c0e5be6a7185f0d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/segment.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2021 Mobvoi Inc. (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='generate segmented wav.scp') - parser.add_argument('--segments', required=True, help='segments file') - parser.add_argument('--input', - required=True, - help='origin wav.scp that not segmented') - parser.add_argument('--output', - required=True, - help='output segmented wav.scp') - wav_dic = {} - args = parser.parse_args() - ori_wav = args.input - segment_file = args.segments - wav_scp = args.output - with open(ori_wav, 'r') as ori: - for l in ori: - item = l.strip().split() - wav_dic[item[0]] = item[1] - with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement: - for l in sgement: - item = l.strip().split() - if item[1] in wav_dic: - item[1] = wav_dic[item[1]] - f.write("{} {},{},{}\n".format(item[0], item[1], item[2], item[3])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/setup_anaconda.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/setup_anaconda.sh deleted file mode 100644 index f53ace9cc4c19994fc79d01e85d70f49d40d673f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/setup_anaconda.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env bash -# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet) -set -euo pipefail - -if [ -z "${PS1:-}" ]; then - PS1=__dummy__ -fi -CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - -if [ $# -gt 4 ]; then - echo "Usage: $0 [output] [conda-env-name] [python-version>]" - exit 1; -elif [ $# -eq 3 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="$3" -elif [ $# -eq 2 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="" -elif [ $# -eq 1 ]; then - output_dir="$1" - name="" - PYTHON_VERSION="" -elif [ $# -eq 0 ]; then - output_dir=venv - name="" - PYTHON_VERSION="" -fi - -if [ -e activate_python.sh ]; then - echo "Warning: activate_python.sh already exists. It will be overwritten" -fi - -if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then - if [ ! -e miniconda.sh ]; then - wget --tries=3 "${CONDA_URL}" -O miniconda.sh - fi - - bash miniconda.sh -b -p "${output_dir}" -fi - -# shellcheck disable=SC1090 -source "${output_dir}/etc/profile.d/conda.sh" -conda deactivate - -# If the env already exists, skip recreation -if [ -n "${name}" ] && ! conda activate ${name}; then - conda create -yn "${name}" -fi -conda activate ${name} - -if [ -n "${PYTHON_VERSION}" ]; then - conda install -y conda "python=${PYTHON_VERSION}" -else - conda install -y conda -fi - -conda install -y pip setuptools - -cat << EOF > activate_python.sh -#!/usr/bin/env bash -# THIS FILE IS GENERATED BY tools/setup_anaconda.sh -if [ -z "\${PS1:-}" ]; then - PS1=__dummy__ -fi -. 
$(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name} -EOF diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/sph2wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/sph2wav.sh deleted file mode 100644 index a8f0749e3be2ee69b5831da6699c303510ecbed4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/sph2wav.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -# convert sph scp to segmented wav scp -nj=1 -. tools/parse_options.sh || exit 1; - -inscp=$1 -segments=$2 -outscp=$3 -data=$(dirname ${inscp}) -if [ $# -eq 4 ]; then - logdir=$4 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe -[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; -sox=`which sox` -[ ! -x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2); - printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \ - sort > $data/wav_ori.scp || exit 1; - -tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp -sed -i 's/ /,/g' $data/wav_segments.scp -sed -i 's/#/ /g' $data/wav_segments.scp - -rm -f $logdir/wav_*.slice -rm -f $logdir/*.log -split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - mkdir -p ${data}/wavs/${name} - cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \ - -v logdir=$logdir -v name=$name '{ - during=$4-$3 - cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during; - system(cmd) - printf("%s %s/%s.wav\n", $1, data, $1); - }' | \ - sort > ${data}/wavs_${name}.scp || exit 1; -} & -done -wait -cat ${data}/wavs_*.scp > $outscp -rm ${data}/wavs_*.scp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/spk2utt_to_utt2spk.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/spk2utt_to_utt2spk.pl deleted file mode 100644 index 19fb89d501146e360912863d847d6eabb0194511..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/spm_decode b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/spm_decode deleted file mode 100644 index 882b4f966013d7708460f8d41696583ae59f8fa9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/spm_decode +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for decoding") - parser.add_argument("--input", default=None, help="input file to decode") - parser.add_argument("--input_format", choices=["piece", "id"], default="piece") - args = parser.parse_args() - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.input_format == "piece": - def decode(l): - return "".join(sp.DecodePieces(l)) - elif args.input_format == "id": - def decode(l): - return "".join(sp.DecodeIds(l)) - else: - raise NotImplementedError - - def tok2int(tok): - # remap reference-side (represented as <>) to 0 - return int(tok) if tok != "<>" else 0 - - if args.input is None: - h = sys.stdin - else: - h = open(args.input, "r", encoding="utf-8") - for line in h: - print(decode(line.split())) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/spm_encode b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/spm_encode deleted file mode 100644 index 4dd2e1004f9fe393c2d34b43bade881b84a31b1f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/spm_encode +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import contextlib -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for encoding") - parser.add_argument("--inputs", nargs="+", default=['-'], - help="input files to filter/encode") - parser.add_argument("--outputs", nargs="+", default=['-'], - help="path to save encoded outputs") - parser.add_argument("--output_format", choices=["piece", "id"], default="piece") - parser.add_argument("--min-len", type=int, metavar="N", - help="filter sentence pairs with fewer than N tokens") - parser.add_argument("--max-len", type=int, metavar="N", - help="filter sentence pairs with more than N tokens") - args = parser.parse_args() - - assert len(args.inputs) == len(args.outputs), \ - "number of input and output paths should match" - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.output_format == "piece": - def encode(l): - return sp.EncodeAsPieces(l) - elif args.output_format == "id": - def encode(l): - return list(map(str, sp.EncodeAsIds(l))) - else: - raise NotImplementedError - - if args.min_len is not None or args.max_len is not None: - def valid(line): - return ( - (args.min_len is None or len(line) >= args.min_len) and - (args.max_len is None or len(line) <= args.max_len) - ) - else: - def valid(lines): - return True - - with contextlib.ExitStack() as stack: - inputs = [ - stack.enter_context(open(input, "r", encoding="utf-8")) - if input != "-" else sys.stdin - for input in args.inputs - ] - outputs = [ - stack.enter_context(open(output, "w", encoding="utf-8")) - if output != "-" else sys.stdout - for output in args.outputs - ] - - stats = { - "num_empty": 0, - "num_filtered": 0, - } - - def encode_line(line): - line = line.strip() - if len(line) > 0: - line = encode(line) - if valid(line): - return line - else: - stats["num_filtered"] += 1 - else: - stats["num_empty"] += 1 - return None - - for i, lines in enumerate(zip(*inputs), start=1): - enc_lines = list(map(encode_line, lines)) - if not any(enc_line is None for enc_line in enc_lines): - for enc_line, output_h in zip(enc_lines, outputs): - print(" ".join(enc_line), file=output_h) - if i % 10000 == 0: - print("processed {} lines".format(i), file=sys.stderr) - - print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) - print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/spm_train b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/spm_train deleted file mode 100644 index 0b247aee0dc5fcaa7b6cf66d89602e896619c9bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/spm_train +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE -import sys - -import sentencepiece as spm - - -if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/subset_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/subset_data_dir.sh deleted file mode 100644 index c35bee62d8710facb8c42a9171ed3caf0171450f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/subset_data_dir.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2010-2011 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - - -# This script operates on a data directory, such as in data/train/. -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data -# for what these directories contain. - -# This script creates a subset of that data, consisting of some specified -# number of utterances. (The selected utterances are distributed evenly -# throughout the file, by the program ./subset_scp.pl). - -# There are six options, none compatible with any other. - -# If you give the --per-spk option, it will attempt to select the supplied -# number of utterances for each speaker (typically you would supply a much -# smaller number in this case). - -# If you give the --speakers option, it selects a subset of n randomly -# selected speakers. - -# If you give the --shortest option, it will give you the n shortest utterances. - -# If you give the --first option, it will just give you the n first utterances. - -# If you give the --last option, it will just give you the n last utterances. - -# If you give the --spk-list or --utt-list option, it reads the -# speakers/utterances to keep from /" (note, -# in this case there is no positional parameter; see usage message.) - - -shortest=false -perspk=false -speakers=false -first_opt= -spk_list= -utt_list= - -expect_args=3 -case $1 in - --first|--last) first_opt=$1; shift ;; - --per-spk) perspk=true; shift ;; - --shortest) shortest=true; shift ;; - --speakers) speakers=true; shift ;; - --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; - --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; - --*) echo "$0: invalid option '$1'"; exit 1 -esac - -if [ $# != $expect_args ]; then - echo "Usage:" - echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] " - echo " subset_data_dir.sh [--spk-list ] " - echo " subset_data_dir.sh [--utt-list ] " - echo "By default, randomly selects utterances from the data directory." - echo "With --speakers, randomly selects enough speakers that we have utterances" - echo "With --per-spk, selects utterances per speaker, if available." - echo "With --first, selects the first utterances" - echo "With --last, selects the last utterances" - echo "With --shortest, selects the shortest utterances." - echo "With --spk-list, reads the speakers to keep from " - echo "With --utt-list, reads the utterances to keep from " - exit 1; -fi - -srcdir=$1 -if [[ $spk_list || $utt_list ]]; then - numutt= - destdir=$2 -else - numutt=$2 - destdir=$3 -fi - -export LC_ALL=C - -if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" - exit 1 -fi - -if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then - echo "$0: cannot subset to more utterances than you originally had." 
- exit 1 -fi - -if $shortest && [ ! -f $srcdir/feats.scp ]; then - echo "$0: you selected --shortest but no feats.scp exist." - exit 1 -fi - -mkdir -p $destdir || exit 1 - -if [[ $spk_list ]]; then - tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; -elif [[ $utt_list ]]; then - tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; -elif $speakers; then - tools/shuffle_list.pl < $srcdir/spk2utt | - awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | - sort > $destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -elif $perspk; then - awk '{ n='$numutt'; printf("%s ",$1); - skip=1; while(n*(skip+1) <= NF-1) { skip++; } - for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); } - printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -else - if $shortest; then - # Select $numutt shortest utterances. - . ./path.sh - feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; - sort -n -k2 $destdir/tmp.len | - awk '{print $1}' | - head -$numutt >$destdir/tmp.uttlist - tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk - rm $destdir/tmp.uttlist $destdir/tmp.len - else - # Select $numutt random utterances. - tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; - fi - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt -fi - -# Perform filtering. utt2spk and spk2utt files already exist by this point. -# Filter by utterance. -[ -f $srcdir/feats.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp -[ -f $srcdir/vad.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp -[ -f $srcdir/utt2lang ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang -[ -f $srcdir/utt2dur ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur -[ -f $srcdir/utt2num_frames ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames -[ -f $srcdir/utt2uniq ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq -[ -f $srcdir/wav.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp -[ -f $srcdir/utt2warp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp -[ -f $srcdir/text ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - -# Filter by speaker. -[ -f $srcdir/spk2warp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp -[ -f $srcdir/spk2gender ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender -[ -f $srcdir/cmvn.scp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - -# Filter by recording-id. -if [ -f $srcdir/segments ]; then - tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - # Recording-ids are in segments. - awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco - # The next line overrides the command above for wav.scp, which would be incorrect. - #[ -f $srcdir/wav.scp ] && - # tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp -else - # No segments; recording-ids are in wav.scp. 
- awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco -fi - -[ -f $srcdir/reco2file_and_channel ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel -[ -f $srcdir/reco2dur ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur - -# Filter the STM file for proper sclite scoring. -# Copy over the comments from STM file. -[ -f $srcdir/stm ] && - (grep "^;;" $srcdir/stm - tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm - -rm $destdir/reco - -# Copy frame_shift if present. -[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir - -srcutts=$(wc -l <$srcdir/utt2spk) -destutts=$(wc -l <$destdir/utt2spk) -echo "$0: reducing #utt from $srcutts to $destutts" -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/subset_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/subset_scp.pl deleted file mode 100644 index 11fddc09a0f4e5fad8e5d63cf65e7e5e627e4af6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/subset_scp.pl +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This program selects a subset of N elements in the scp. - -# By default, it selects them evenly from throughout the scp, in order to avoid -# selecting too many from the same speaker. It prints them on the standard -# output. -# With the option --first, it just selects the N first utterances. -# With the option --last, it just selects the N last utterances. - -# Last modified by JHU & HKUST @2013 - - -$quiet = 0; -$first = 0; -$last = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--quiet") { - shift; - $quiet = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--first") { - shift; - $first = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--last") { - shift; - $last = 1; -} - -if(@ARGV < 2 ) { - die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . - " --quiet causes it to not die if N < num lines in scp.\n" . - " --first and --last make it equivalent to head or tail.\n" . 
- "See also: filter_scp.pl\n"; -} - -$N = shift @ARGV; -if($N == 0) { - die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; -} -$inscp = shift @ARGV; -open(I, "<$inscp") || die "Opening input scp file $inscp"; - -@F = (); -while() { - push @F, $_; -} -$numlines = @F; -if($N > $numlines) { - if ($quiet) { - $N = $numlines; - } else { - die "You requested from subset_scp.pl more elements than available: $N > $numlines"; - } -} - -sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if ($num_needed > $diff) { - die "select_n: code error"; - } - if ($diff == 1 ) { - if ($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); - } -} - -if ( ! $first && ! $last) { - if ($N > 0) { - select_n(0, $numlines, $N); - } -} else { - if ($first) { # --first option: same as head. - for ($n = 0; $n < $N; $n++) { - print $F[$n]; - } - } else { # --last option: same as tail. - for ($n = @F - $N; $n < @F; $n++) { - print $F[$n]; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/sym2int.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/sym2int.pl deleted file mode 100644 index cec097b6bdaefb5c3452e31fa334f0a7530b9a72..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/sym2int.pl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; - -for($x = 0; $x < 2; $x++) { - if ($ARGV[0] eq "--map-oov") { - shift @ARGV; - $map_oov = shift @ARGV; - if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { - # disallow '-f', the empty string and anything ending in words.txt as the - # OOV symbol because these are likely command-line errors. - die "the --map-oov option requires an argument"; - } - } - if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } - } -} - -$symtab = shift @ARGV; -if (!defined $symtab) { - print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . - "options: [--map-oov ] [-f ]\n" . 
- "note: can look like 4-5, or 4-, or 5-, or 1.\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up - if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } - $map_oov = $sym2int{$map_oov}; -} - -$num_warning = 0; -$max_warning = 20; - -while (<>) { - @A = split(" ", $_); - @B = (); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ( (!defined $field_begin || $n >= $field_begin) - && (!defined $field_end || $n <= $field_end)) { - $i = $sym2int{$a}; - if (!defined ($i)) { - if (defined $map_oov) { - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $map_oov; - } else { - $pos = $n+1; - die "sym2int.pl: undefined symbol $a (in position $pos)\n"; - } - } - $a = $i; - } - push @B, $a; - } - print join(" ", @B); - print "\n"; -} -if ($num_warning > 0) { - print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; -} - -exit(0); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/text2token.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/text2token.py deleted file mode 100644 index 4f4dcc901d436650695f0b80e0cf99e1e99269ee..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/text2token.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan) -# Copyright 2021 Mobvoi Inc. All Rights Reserved. (Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -import re -import sys - -is_python2 = sys.version_info[0] == 2 - - -def exist_or_not(i, match_pos): - start_pos = None - end_pos = None - for pos in match_pos: - if pos[0] <= i < pos[1]: - start_pos = pos[0] - end_pos = pos[1] - break - - return start_pos, end_pos - -def seg_char(sent): - pattern = re.compile(r'([\u4e00-\u9fa5])') - chars = pattern.split(sent) - chars = [w for w in chars if len(w.strip()) > 0] - return chars - -def get_parser(): - parser = argparse.ArgumentParser( - description='convert raw text to tokenized text', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--nchar', - '-n', - default=1, - type=int, - help='number of characters to split, i.e., \ - aabb -> a a b b with -n 1 and aa bb with -n 2') - parser.add_argument('--skip-ncols', - '-s', - default=0, - type=int, - help='skip first n columns') - parser.add_argument('--space', - default='', - type=str, - help='space symbol') - parser.add_argument('--bpe-model', - '-m', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--non-lang-syms', - '-l', - default=None, - type=str, - help='list of non-linguistic symobles,' - ' e.g., etc.') - parser.add_argument('text', - type=str, - default=False, - nargs='?', - help='input text') - parser.add_argument('--trans_type', - '-t', - type=str, - default="char", - choices=["char", "phn", "cn_char_en_bpe"], - help="""Transcript type. char/phn. 
e.g., for TIMIT - FADG0_SI1279 - - If trans_type is char, read from - SI1279.WRD file -> "bricks are an alternative" - Else if trans_type is phn, - read from SI1279.PHN file -> - "sil b r ih sil k s aa r er n aa l - sil t er n ih sil t ih v sil" """) - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - rs = [] - if args.non_lang_syms is not None: - with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f: - nls = [x.rstrip() for x in f.readlines()] - rs = [re.compile(re.escape(x)) for x in nls] - - if args.bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(args.bpe_model) - - if args.text: - f = codecs.open(args.text, encoding="utf-8") - else: - f = codecs.getreader("utf-8")( - sys.stdin if is_python2 else sys.stdin.buffer) - - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer) - line = f.readline() - n = args.nchar - while line: - x = line.split() - print(' '.join(x[:args.skip_ncols]), end=" ") - a = ' '.join(x[args.skip_ncols:]) - - # get all matched positions - match_pos = [] - for r in rs: - i = 0 - while i >= 0: - m = r.search(a, i) - if m: - match_pos.append([m.start(), m.end()]) - i = m.end() - else: - break - - if len(match_pos) > 0: - chars = [] - i = 0 - while i < len(a): - start_pos, end_pos = exist_or_not(i, match_pos) - if start_pos is not None: - chars.append(a[start_pos:end_pos]) - i = end_pos - else: - chars.append(a[i]) - i += 1 - a = chars - - if (args.trans_type == "phn"): - a = a.split(" ") - elif args.trans_type == "cn_char_en_bpe": - b = seg_char(a) - a = [] - for j in b: - # we use "▁" to instead of blanks among english words - # warning: here is "▁", not "_" - for l in j.strip().split("▁"): - if not l.encode('UTF-8').isalpha(): - a.append(l) - else: - for k in sp.encode_as_pieces(l): - a.append(k) - else: - a = [a[j:j + n] for j in range(0, len(a), n)] - - a_flat = [] - for z in a: - a_flat.append("".join(z)) - - a_chars = [z.replace(' ', args.space) for z in a_flat] - if (args.trans_type == "phn"): - a_chars = [z.replace("sil", args.space) for z in a_chars] - print(' '.join(a_chars)) - line = f.readline() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/utt2spk_to_spk2utt.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/utt2spk_to_spk2utt.pl deleted file mode 100644 index 5086699ff85fdcb8667bb9ab054700c53e35fd0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. 
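The Perl that follows builds the speaker-to-utterance map while keeping speakers in their order of first appearance. The same grouping, sketched in Python purely to illustrate the expected `spk2utt` line format (the IDs are made up):

```python
# Sketch of the utt2spk -> spk2utt inversion done by utt2spk_to_spk2utt.pl.
# Speaker order of first appearance is preserved (dicts keep insertion order).

def utt2spk_to_spk2utt(utt2spk_lines):
    spk2utt = {}
    for line in utt2spk_lines:
        utt, spk = line.split()
        spk2utt.setdefault(spk, []).append(utt)
    return [f"{spk} {' '.join(utts)}" for spk, utts in spk2utt.items()]

print(utt2spk_to_spk2utt(["utt1 spkA", "utt2 spkA", "utt3 spkB"]))
# ['spkA utt1 utt2', 'spkB utt3']
```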
- -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/validate_data_dir.sh deleted file mode 100644 index f4b4cbe1410111555d56380078e3d55381e7155a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/validate_data_dir.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/bin/bash - -cmd="$@" - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." - echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! -d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -if [ -f $data/images.scp ]; then - cmd=${cmd/--no-wav/} # remove --no-wav if supplied - image/validate_data_dir.sh $cmd - exit $? -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1; - ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! 
cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - tools/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." - exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! 
cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! 
cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/validate_dict_dir.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/validate_dict_dir.pl deleted file mode 100644 index 819fca7f03caff91f3f24f0b69876a0bfc0abbe9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/validate_dict_dir.pl +++ /dev/null @@ -1,531 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. 
-# Copyright 2012 Guoguo Chen -# 2015 Daniel Povey -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for 'dict' directories (e.g. data/local/dict) - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line($.) $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n"; - if ($has_invalid_whitespaces) { - print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n"; - return 0; - } else { - print "--> text contains only allowed whitespaces\n"; - } - } else { - print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n"; - } - return 1; -} - - -if(@ARGV != 1) { - die "Usage: validate_dict_dir.pl \n" . - "e.g.: validate_dict_dir.pl data/local/dict\n"; -} - -$dict = shift @ARGV; -$dict =~ s:/$::; - -$exit = 0; -$success = 1; # this is re-set each time we read a file. 
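The Perl helpers above accept only a short list of whitespace characters and treat anything else (including a bare CR) as an error. A standalone Python sketch of the same rule, assuming a UTF-8 text file whose first field is an utterance id — this is an illustration, not a replacement shipped with the recipe:

```python
# Sketch only: flag lines containing Unicode whitespace other than TAB, LF, SPACE.
import sys

ALLOWED = {"\t", "\n", " "}  # CR is rejected, mirroring the explicit 0x0D check above

def has_disallowed_whitespace(line: str) -> bool:
    """True if the line contains a whitespace character outside ALLOWED."""
    return any(ch.isspace() and ch not in ALLOWED for ch in line)

def validate(path: str) -> int:
    bad = 0
    with open(path, encoding="utf-8", errors="strict") as f:
        for lineno, line in enumerate(f, 1):
            if has_disallowed_whitespace(line):
                fields = line.split()
                utt_id = fields[0] if fields else "<empty>"
                print(f"{path}:{lineno}: disallowed whitespace in line for {utt_id}",
                      file=sys.stderr)
                bad += 1
    return bad

if __name__ == "__main__":
    sys.exit(1 if validate(sys.argv[1]) else 0)
```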
- -sub set_to_fail { $exit = 1; $success = 0; } - -# Checking silence_phones.txt ------------------------------- -print "Checking $dict/silence_phones.txt ...\n"; -if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} -$idx = 1; -%silence = (); -$crlf = 1; - -print "--> reading $dict/silence_phones.txt\n"; -check_allowed_whitespace(\*S) || set_to_fail(); -while() { - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($silence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; - } else { - $silence{$p} = 1; - } - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. - if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(S); -$success == 0 || print "--> $dict/silence_phones.txt is OK\n"; -print "\n"; - -# Checking optional_silence.txt ------------------------------- -print "Checking $dict/optional_silence.txt ...\n"; -if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;} -if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;} -$idx = 1; -$success = 1; -$crlf = 1; -print "--> reading $dict/optional_silence.txt\n"; -check_allowed_whitespace(\*OS) or exit 1; -while() { - chomp; - my @col = split(" ", $_); - if ($idx > 1 or @col > 1) { - set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; - } elsif (!$silence{$col[0]}) { - set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - $idx ++; -} -close(OS); -$success == 0 || print "--> $dict/optional_silence.txt is OK\n"; -print "\n"; - -# Checking nonsilence_phones.txt ------------------------------- -print "Checking $dict/nonsilence_phones.txt ...\n"; -if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;} -$idx = 1; -%nonsilence = (); -$success = 1; -$crlf = 1; -print "--> reading $dict/nonsilence_phones.txt\n"; -check_allowed_whitespace(\*NS) or set_to_fail(); -while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! 
s/\n$//) { - print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($nonsilence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; - } else { - $nonsilence{$p} = 1; - } - # phones that start with the pound sign/hash may be mistaken for - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. - if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(NS); -$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n"; -print "\n"; - -# Checking disjoint ------------------------------- -sub intersect { - my ($a, $b) = @_; - @itset = (); - %itset = (); - foreach(keys %$a) { - if(exists $b->{$_} and !$itset{$_}) { - push(@itset, $_); - $itset{$_} = 1; - } - } - return @itset; -} - -print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n"; -@itset = intersect(\%silence, \%nonsilence); -if(@itset == 0) {print "--> disjoint property is OK.\n";} -else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";} -print "\n"; - - -sub check_lexicon { - my ($lex, $num_prob_cols, $num_skipped_cols) = @_; - print "Checking $lex\n"; - !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail(); - my %seen_line = {}; - $idx = 1; $success = 1; $crlf = 1; - print "--> reading $lex\n"; - check_allowed_whitespace(\*L) or set_to_fail(); - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $lex contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (defined $seen_line{$_}) { - print "--> ERROR: line '$_' of $lex is repeated\n"; - set_to_fail(); - } - $seen_line{$_} = 1; - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $lex does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - $word = shift @col; - if (!defined $word) { - print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail(); - } - if ($word eq "" || $word eq "" || $word eq "" || $word eq "#0") { - print "--> ERROR: lexicon.txt contains forbidden word $word\n"; - set_to_fail(); - } - for ($n = 0; $n < $num_prob_cols; $n++) { - $prob = shift @col; - if (!($prob > 0.0 && $prob <= 1.0)) { - print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n"; - set_to_fail(); - } - } - for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; } - if (@col == 0) { - print "--> ERROR: lexicon.txt contains word $word with empty "; - print "pronunciation.\n"; - set_to_fail(); - } - foreach (0 .. @col-1) { - if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt "; - print "(line $idx)\n"; - set_to_fail(); - } - } - $idx ++; - } - close(L); - $success == 0 || print "--> $lex is OK\n"; - print "\n"; -} - -if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); } -if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); } -if (-f "$dict/lexiconp_silprob.txt") { - # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also - # exist. 
- check_lexicon("$dict/lexiconp_silprob.txt", 2, 2); - if (-f "$dict/silprob.txt") { - !open(SP, "<$dict/silprob.txt") && - print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail(); - $crlf = 1; - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - chomp; my @col = split; - @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail(); - if ($col[0] eq "" || $col[0] eq "overall") { - if (!($col[1] > 0.0 && $col[1] <= 1.0)) { - set_to_fail(); - print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n"; - } - } elsif ($col[0] eq "_s" || $col[0] eq "_n") { - if ($col[1] <= 0.0) { - set_to_fail(); - print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n"; - } - } else { - print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n"; - set_to_fail(); - } - } - close(SP); - } else { - set_to_fail(); - print "--> ERROR: expecting $dict/silprob.txt to exist\n"; - } -} - -if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) { - print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n"; - set_to_fail(); -} - -sub check_lexicon_pair { - my ($lex1, $num_prob_cols1, $num_skipped_cols1, - $lex2, $num_prob_cols2, $num_skipped_cols2) = @_; - # We have checked individual lexicons already. - open(L1, "<$lex1"); open(L2, "<$lex2"); - print "Checking lexicon pair $lex1 and $lex2\n"; - my $line_num = 0; - while() { - $line_num++; - @A = split; - $line_B = ; - if (!defined $line_B) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); last; - } - @B = split(" ", $line_B); - # Check if the word matches. - if ($A[0] ne $B[0]) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - shift @A; shift @B; - for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; } - for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; } - # Check if the pronunciation matches - if (join(" ", @A) ne join(" ", @B)) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - } - $line_B = ; - if (defined $line_B && $exit == 0) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); - } - $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n"; -} - -# If more than one lexicon exist, we have to check if they correspond to each -# other. It could be that the user overwrote one and we need to regenerate the -# other, but we do not know which is which. -if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") { - check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0); -} -if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") { - check_lexicon_pair("$dict/lexiconp.txt", 1, 0, - "$dict/lexiconp_silprob.txt", 2, 2); -} - -# Checking extra_questions.txt ------------------------------- -%distinguished = (); # Keep track of all phone-pairs including nonsilence that - # are distinguished (split apart) by extra_questions.txt, - # as $distinguished{$p1,$p2} = 1. This will be used to - # make sure that we don't have pairs of phones on the same - # line in nonsilence_phones.txt that can never be - # distinguished from each other by questions. 
(If any two - # phones appear on the same line in nonsilence_phones.txt, - # they share a tree root, and since the automatic - # question-building treats all phones that appear on the - # same line of nonsilence_phones.txt as being in the same - # group, we can never distinguish them without resorting to - # questions in extra_questions.txt. -print "Checking $dict/extra_questions.txt ...\n"; -if (-s "$dict/extra_questions.txt") { - if (!open(EX, "<$dict/extra_questions.txt")) { - set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n"; - } - $idx = 1; - $success = 1; - $crlf = 1; - print "--> reading $dict/extra_questions.txt\n"; - check_allowed_whitespace(\*EX) or set_to_fail(); - while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n"; - } - foreach (0 .. @col-1) { - if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n"; - } - $idx ++; - } - %col_hash = (); - foreach $p (@col) { $col_hash{$p} = 1; } - foreach $p1 (@col) { - # Update %distinguished hash. - foreach $p2 (keys %nonsilence) { - if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not - # in this question (and in nonsilence - # phones)... mark p1,p2 as being split apart - $distinguished{$p1,$p2} = 1; - $distinguished{$p2,$p1} = 1; - } - } - } - } - close(EX); - $success == 0 || print "--> $dict/extra_questions.txt is OK\n"; -} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";} - -if (-f "$dict/nonterminals.txt") { - open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt"; - my %nonterminals = (); - my $line_number = 1; - while () { - chop; - my @line = split(" ", $_); - if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) { - print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1; - } - $nonterminals{$line[0]} = 1; - $line_number++; - } - print "--> $dict/nonterminals.txt is OK\n"; -} - - -# check nonsilence_phones.txt again for phone-pairs that are never -# distnguishable. (note: this situation is normal and expected for silence -# phones, so we don't check it.) -if(!open(NS, "<$dict/nonsilence_phones.txt")) { - print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1; -} - -$num_warn_nosplit = 0; -$num_warn_nosplit_limit = 10; -while() { - my @col = split(" ", $_); - foreach $p1 (@col) { - foreach $p2 (@col) { - if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) { - set_to_fail(); - if ($num_warn_nosplit <= $num_warn_nosplit_limit) { - print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n"; - } - if ($num_warn_nosplit == $num_warn_nosplit_limit) { - print "... Not warning any more times about this issue.\n"; - } - if ($num_warn_nosplit == 0) { - print " (note: we started checking for this only recently. 
You can still build a system but\n"; - print " phones $p1 and $p2 will be acoustically indistinguishable).\n"; - } - $num_warn_nosplit++; - } - } - } -} - - -if ($exit == 1) { - print "--> ERROR validating dictionary directory $dict (see detailed error "; - print "messages above)\n\n"; - exit 1; -} else { - print "--> SUCCESS [validating dictionary directory $dict]\n\n"; -} - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/validate_text.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/validate_text.pl deleted file mode 100644 index 7f75cf12f20f6e22948682e8e726e628a72dac69..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/validate_text.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env perl -# -#=============================================================================== -# Copyright 2017 Johns Hopkins University (author: Yenda Trmal ) -# Johns Hopkins University (author: Daniel Povey) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# validation script for data//text -# to be called (preferably) from utils/validate_data_dir.sh -use strict; -use warnings; -use utf8; -use Fcntl qw< SEEK_SET >; - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). 
-sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl \n" . 
- "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/wav2dur.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/wav2dur.py deleted file mode 100644 index 1bcc1b693458b66c0e341e5d6b375cc81e6db8b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/wav2dur.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -import torchaudio -torchaudio.set_audio_backend("sox_io") - -scp = sys.argv[1] -dur_scp = sys.argv[2] - -with open(scp, 'r') as f, open(dur_scp, 'w') as fout: - cnt = 0 - total_duration = 0 - for l in f: - items = l.strip().split() - wav_id = items[0] - fname = items[1] - cnt += 1 - waveform, rate = torchaudio.load(fname) - frames = len(waveform[0]) - duration = frames / float(rate) - total_duration += duration - fout.write('{} {}\n'.format(wav_id, duration)) - print('process {} utts'.format(cnt)) - print('total {} s'.format(total_duration)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/wav_to_duration.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/wav_to_duration.sh deleted file mode 100644 index 51b055c633ac809b6b8d702925dc47875973403d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/wav_to_duration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# split the wav scp, calculate duration and merge -nj=4 -. tools/parse_options.sh || exit 1; - -inscp=$1 -outscp=$2 -data=$(dirname ${inscp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -rm -f $logdir/wav_*.slice -rm -f $logdir/wav_*.shape -split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log -} & -done -wait -cat $logdir/wav_*.shape > $outscp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/websocket/performance-ws.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/websocket/performance-ws.py deleted file mode 100644 index af77dea06bb41297b674b5b6dbfd0266bcff5d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/tools/websocket/performance-ws.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -# coding:utf-8 - -# Copyright (c) 2022 SDCI Co. Ltd (author: veelion) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import json -import time -import asyncio -import argparse -import websockets -import soundfile as sf -import statistics - - -WS_START = json.dumps({ - 'signal': 'start', - 'nbest': 1, - 'continuous_decoding': False, -}) -WS_END = json.dumps({ - 'signal': 'end' -}) - - -async def ws_rec(data, ws_uri): - begin = time.time() - conn = await websockets.connect(ws_uri, ping_timeout=200) - # step 1: send start - await conn.send(WS_START) - ret = await conn.recv() - # step 2: send audio data - await conn.send(data) - # step 3: send end - await conn.send(WS_END) - # step 4: receive result - texts = [] - while 1: - ret = await conn.recv() - ret = json.loads(ret) - if ret['type'] == 'final_result': - nbest = json.loads(ret['nbest']) - text = nbest[0]['sentence'] - texts.append(text) - elif ret['type'] == 'speech_end': - break - # step 5: close - try: - await conn.close() - except Exception as e: - # this except has no effect, just log as debug - # it seems the server does not send close info, maybe - print(e) - time_cost = time.time() - begin - return { - 'text': ''.join(texts), - 'time': time_cost, - } - - -def get_args(): - parser = argparse.ArgumentParser(description='') - parser.add_argument( - '-u', '--ws_uri', required=True, - help="websocket_server_main's uri, e.g. ws://127.0.0.1:10086") - parser.add_argument( - '-w', '--wav_scp', required=True, - help='path to wav_scp_file') - parser.add_argument( - '-t', '--trans', required=True, - help='path to trans_text_file of wavs') - parser.add_argument( - '-s', '--save_to', required=True, - help='path to save transcription') - parser.add_argument( - '-n', '--num_concurrence', type=int, required=True, - help='num of concurrence for query') - args = parser.parse_args() - return args - - -def print_result(info): - length = max([len(k) for k in info]) - for k, v in info.items(): - print(f'\t{k: >{length}} : {v}') - - -async def main(args): - wav_scp = [] - total_duration = 0 - with open(args.wav_scp) as f: - for line in f: - zz = line.strip().split() - assert len(zz) == 2 - data, sr = sf.read(zz[1], dtype='int16') - assert sr == 16000 - duration = (len(data)) / 16000 - total_duration += duration - wav_scp.append((zz[0], data.tobytes())) - print(f'{len(wav_scp) = }, {total_duration = }') - - tasks = [] - failed = 0 - texts = [] - request_times = [] - begin = time.time() - for i, (_uttid, data) in enumerate(wav_scp): - task = asyncio.create_task(ws_rec(data, args.ws_uri)) - tasks.append((_uttid, task)) - if len(tasks) < args.num_concurrence: - continue - print((f'{i=}, start {args.num_concurrence} ' - f'queries @ {time.strftime("%m-%d %H:%M:%S")}')) - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - tasks = [] - print(f'\tdone @ {time.strftime("%m-%d %H:%M:%S")}') - if tasks: - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - request_time = time.time() - begin - rtf = request_time / total_duration - print('For all concurrence:') - print_result({ - 'failed': failed, - 'total_duration': total_duration, - 'request_time': request_time, - 'RTF': rtf, - }) - print('For one request:') - print_result({ - 'mean': statistics.mean(request_times), - 'median': statistics.median(request_times), - 'max_time': max(request_times), - 'min_time': min(request_times), - }) - with open(args.save_to, 'w', encoding='utf8') as fsave: - fsave.write(''.join(texts)) - # caculate CER - cmd = (f'python 
../compute-wer.py --char=1 --v=1 ' - f'{args.trans} {args.save_to} > ' - f'{args.save_to}-test-{args.num_concurrence}.cer.txt') - print(cmd) - os.system(cmd) - print('done') - - -if __name__ == '__main__': - args = get_args() - asyncio.run(main(args)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/alignment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/alignment.py deleted file mode 100644 index 071691183e5af227e60fe06e4f8d4bf0f33b7f71..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/alignment.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Di Wu) -# 2022 Tinnove Inc (authors: Wei Ren) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader -from textgrid import TextGrid, IntervalTier - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.ctc_util import forced_align -from wenet.utils.common import get_subsample -from wenet.utils.init_model import init_model - - -def generator_textgrid(maxtime, lines, output): - # Download Praat: https://www.fon.hum.uva.nl/praat/ - interval = maxtime / (len(lines) + 1) - margin = 0.0001 - - tg = TextGrid(maxTime=maxtime) - linetier = IntervalTier(name="line", maxTime=maxtime) - - i = 0 - for l in lines: - s, e, w = l.split() - linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w) - - tg.append(linetier) - print("successfully generator {}".format(output)) - tg.write(output) - - -def get_frames_timestamp(alignment): - # convert alignment to a praat format, which is a doing phonetics - # by computer and helps analyzing alignment - timestamp = [] - # get frames level duration for each token - start = 0 - end = 0 - while end < len(alignment): - while end < len(alignment) and alignment[end] == 0: - end += 1 - if end == len(alignment): - timestamp[-1] += alignment[start:] - break - end += 1 - while end < len(alignment) and alignment[end - 1] == alignment[end]: - end += 1 - timestamp.append(alignment[start:end]) - start = end - return timestamp - - -def get_labformat(timestamp, subsample): - begin = 0 - duration = 0 - labformat = [] - for idx, t in enumerate(timestamp): - # 25ms frame_length,10ms hop_length, 1/subsample - subsample = get_subsample(configs) - # time duration - duration = len(t) * 0.01 * subsample - if idx < len(timestamp) - 1: - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[t[-1]])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[t[-1]])) - else: - non_blank = 0 - for i in t: - if i != 0: - token = i - break - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[token])) - 
labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[token])) - begin = begin + duration - return labformat - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='use ctc to generate alignment') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--input_file', required=True, help='format data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--non_lang_syms', - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--result_file', - required=True, - help='alignment result file') - parser.add_argument('--batch_size', type=int, default=1, help='batch size') - parser.add_argument('--gen_praat', - action='store_true', - help='convert alignment to a praat format') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - - args = parser.parse_args() - print(args) - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.batch_size > 1: - logging.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - # Load dict - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 - - symbol_table = read_symbol_table(args.dict) - - # Init dataset and data loader - ali_conf = copy.deepcopy(configs['dataset_conf']) - - ali_conf['filter_conf']['max_length'] = 102400 - ali_conf['filter_conf']['min_length'] = 0 - ali_conf['filter_conf']['token_max_length'] = 102400 - ali_conf['filter_conf']['token_min_length'] = 0 - ali_conf['filter_conf']['max_output_input_ratio'] = 102400 - ali_conf['filter_conf']['min_output_input_ratio'] = 0 - ali_conf['speed_perturb'] = False - ali_conf['spec_aug'] = False - ali_conf['shuffle'] = False - ali_conf['sort'] = False - ali_conf['fbank_conf']['dither'] = 0.0 - ali_conf['batch_conf']['batch_type'] = "static" - ali_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - ali_dataset = Dataset(args.data_type, - args.input_file, - symbol_table, - ali_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w', - encoding='utf-8') as fout: - for batch_idx, batch in enumerate(ali_data_loader): - print("#" * 80) - key, feat, target, feats_length, target_length = batch - print(key) - - feat = feat.to(device) - target = target.to(device) - feats_length = feats_length.to(device) - target_length = target_length.to(device) - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - # print(ctc_probs.size(1)) - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = forced_align(ctc_probs, target) - print(alignment) - fout.write('{} {}\n'.format(key[0], alignment)) - - if args.gen_praat: - timestamp = get_frames_timestamp(alignment) - print(timestamp) - subsample = get_subsample(configs) - labformat = get_labformat(timestamp, subsample) - - lab_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".lab") - with open(lab_path, 'w', encoding='utf-8') as f: - f.writelines(labformat) - - textgrid_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".TextGrid") - generator_textgrid(maxtime=(len(alignment) + 1) * 0.01 * - subsample, - lines=labformat, - output=textgrid_path) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/average_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/average_model.py deleted file mode 100644 index 01efa64b4b458bc931a86a9a304b9f330ce4aaa2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/average_model.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
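The alignment tool above walks a frame-level CTC alignment and emits `<begin> <end> <token>` rows with a frame shift of 10 ms times the subsampling factor. A simplified standalone sketch of that conversion — unlike the original it spans only the non-blank frames of each token run, rather than folding the preceding blank frames into the token's duration:

```python
# Sketch only: frame-level CTC alignment (0 = blank) -> timed token segments.
from typing import Dict, List, Tuple

def ctc_alignment_to_segments(alignment: List[int],
                              char_dict: Dict[int, str],
                              subsample: int = 4,
                              frame_shift: float = 0.01) -> List[Tuple[float, float, str]]:
    segments = []
    step = frame_shift * subsample   # seconds per encoder frame
    start = 0
    for i, tok in enumerate(alignment):
        if tok != 0 and (i == 0 or alignment[i - 1] != tok):
            start = i                # first frame of a new token run
        run_ends = tok != 0 and (i + 1 == len(alignment) or alignment[i + 1] != tok)
        if run_ends:
            segments.append((start * step, (i + 1) * step, char_dict[tok]))
    return segments

if __name__ == "__main__":
    demo = [0, 0, 5, 5, 0, 7, 0]     # toy alignment with two tokens
    for begin, end, token in ctc_alignment_to_segments(demo, {5: "ni", 7: "hao"}):
        print(f"{begin:.2f} {end:.2f} {token}")
```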
- - -import os -import argparse -import glob - -import yaml -import numpy as np -import torch - - -def get_args(): - parser = argparse.ArgumentParser(description='average model') - parser.add_argument('--dst_model', required=True, help='averaged model') - parser.add_argument('--src_path', - required=True, - help='src model path for average') - parser.add_argument('--val_best', - action="store_true", - help='averaged model') - parser.add_argument('--num', - default=5, - type=int, - help='nums for averaged model') - parser.add_argument('--min_epoch', - default=0, - type=int, - help='min epoch used for averaging model') - parser.add_argument('--max_epoch', - default=65536, - type=int, - help='max epoch used for averaging model') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - checkpoints = [] - val_scores = [] - if args.val_best: - yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) - for y in yamls: - with open(y, 'r') as f: - dic_yaml = yaml.load(f, Loader=yaml.FullLoader) - loss = dic_yaml['cv_loss'] - epoch = dic_yaml['epoch'] - if epoch >= args.min_epoch and epoch <= args.max_epoch: - val_scores += [[epoch, loss]] - val_scores = np.array(val_scores) - sort_idx = np.argsort(val_scores[:, -1]) - sorted_val_scores = val_scores[sort_idx][::1] - print("best val scores = " + str(sorted_val_scores[:args.num, 1])) - print("selected epochs = " + - str(sorted_val_scores[:args.num, 0].astype(np.int64))) - path_list = [ - args.src_path + '/{}.pt'.format(int(epoch)) - for epoch in sorted_val_scores[:args.num, 0] - ] - else: - path_list = glob.glob('{}/[0-9]*.pt'.format(args.src_path)) - path_list = sorted(path_list, key=os.path.getmtime) - path_list = path_list[-args.num:] - print(path_list) - avg = None - num = args.num - assert num == len(path_list) - for path in path_list: - print('Processing {}'.format(path)) - states = torch.load(path, map_location=torch.device('cpu')) - if avg is None: - avg = states - else: - for k in avg.keys(): - avg[k] += states[k] - # average - for k in avg.keys(): - if avg[k] is not None: - # pytorch 1.6 use true_divide instead of /= - avg[k] = torch.true_divide(avg[k], num) - print('Saving to {}'.format(args.dst_model)) - torch.save(avg, args.dst_model) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/export_jit.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/export_jit.py deleted file mode 100644 index b2e5864e8382235c1cc800484ba5031ae22f3bd9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/export_jit.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
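The removed average_model.py boils down to an element-wise mean over N checkpoint state dicts. A minimal standalone sketch of that core step, assuming plain `.pt` files that hold a state_dict (the example paths are hypothetical):

```python
# Sketch only: element-wise average of several PyTorch checkpoints.
from typing import List
import torch

def average_checkpoints(paths: List[str], dst_path: str) -> None:
    avg = None
    for path in paths:
        states = torch.load(path, map_location="cpu")
        if avg is None:
            avg = states
        else:
            for k in avg:
                avg[k] = avg[k] + states[k]
    for k in avg:
        # true_divide returns a float result even for integer buffers,
        # matching the behaviour of the original averaging script
        avg[k] = torch.true_divide(avg[k], len(paths))
    torch.save(avg, dst_path)

if __name__ == "__main__":
    # hypothetical checkpoint paths for illustration
    average_checkpoints(["exp/10.pt", "exp/11.pt", "exp/12.pt"], "exp/avg_3.pt")
```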
- -from __future__ import print_function - -import argparse -import os - -import torch -import yaml - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_file', default=None, help='output file') - parser.add_argument('--output_quant_file', - default=None, - help='output quantized model file') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - # No need gpu for model export - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - model = init_model(configs) - print(model) - - load_checkpoint(model, args.checkpoint) - # Export jit torch script model - - if args.output_file: - script_model = torch.jit.script(model) - script_model.save(args.output_file) - print('Export model successfully, see {}'.format(args.output_file)) - - # Export quantized jit torch script model - if args.output_quant_file: - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - script_quant_model = torch.jit.script(quantized_model) - script_quant_model.save(args.output_quant_file) - print('Export quantized model successfully, ' - 'see {}'.format(args.output_quant_file)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/export_onnx_bpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/export_onnx_bpu.py deleted file mode 100644 index 6462a69506f10778d08faae5fcf3067ad43d38bd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/export_onnx_bpu.py +++ /dev/null @@ -1,1019 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. 
specific decoding method: ctc_greedy_search -""" - - -from __future__ import print_function - -import os -import sys -import copy -import math -import yaml -import logging -from typing import Tuple - -import torch -import numpy as np - -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import (get_args, to_numpy, - print_input_output_info) - - -try: - import onnx - import onnxruntime -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class BPULayerNorm(torch.nn.Module): - """Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" - def __init__(self, module, chunk_size=8, run_on_bpu=False): - super().__init__() - original = copy.deepcopy(module) - self.hidden = module.weight.size(0) - self.chunk_size = chunk_size - self.run_on_bpu = run_on_bpu - - if self.run_on_bpu: - self.weight = torch.nn.Parameter( - module.weight.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.bias = torch.nn.Parameter( - module.bias.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.negtive = torch.nn.Parameter( - torch.ones((1, self.hidden, 1, chunk_size)) * -1.0) - self.eps = torch.nn.Parameter( - torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps) - self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_1.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_2.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - else: - self.norm = module - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.hidden) - orig_out = module(random_data) - new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) - np.testing.assert_allclose( - to_numpy(orig_out), to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.run_on_bpu: - u = self.mean_conv_1(x) # (1, h, 1, c) - numerator = x + u * self.negtive # (1, h, 1, c) - s = torch.pow(numerator, 2) # (1, h, 1, c) - s = self.mean_conv_2(s) # (1, h, 1, c) - denominator = torch.sqrt(s + self.eps) # (1, h, 1, c) - x = torch.div(numerator, denominator) # (1, h, 1, c) - x = x * self.weight + self.bias - else: - x = x.squeeze(2).transpose(1, 2).contiguous() - x = self.norm(x) - x = x.transpose(1, 2).contiguous().unsqueeze(2) - return x - - -class BPUIdentity(torch.nn.Module): - """Refactor torch.nn.Identity(). - For inserting BPU node whose input == output. - """ - def __init__(self, channels): - super().__init__() - self.channels = channels - self.identity_conv = torch.nn.Conv2d( - channels, channels, 1, groups=channels, bias=False) - torch.nn.init.dirac_( - self.identity_conv.weight.data, groups=channels) - - self.check_equal() - - def check_equal(self): - random_data = torch.randn(1, self.channels, 1, 10) - result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(random_data), to_numpy(result), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Identity with 4-D dataflow, input == output. 
- Args: - x (torch.Tensor): (batch, in_channel, 1, time) - - Returns: - (torch.Tensor): (batch, in_channel, 1, time). - """ - return self.identity_conv(x) - - -class BPULinear(torch.nn.Module): - """Refactor torch.nn.Linear or pointwise_conv""" - def __init__(self, module, is_pointwise_conv=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.weight.size(1) - self.odim = module.weight.size(0) - self.is_pointwise_conv = is_pointwise_conv - - # Modify weight & bias - self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) - if is_pointwise_conv: - # (odim, idim, kernel=1) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(-1)) - else: - # (odim, idim) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(2).unsqueeze(3)) - self.linear.bias = module.bias - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.idim) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = module(random_data) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = original_result.transpose(1, 2) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Linear with 4-D dataflow. - Args: - x (torch.Tensor): (batch, in_channel, 1, time) - Returns: - (torch.Tensor): (batch, out_channel, 1, time). - """ - return self.linear(x) - - -class BPUGlobalCMVN(torch.nn.Module): - """Refactor wenet/transformer/cmvn.py::GlobalCMVN""" - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - self.norm_var = module.norm_var - - # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1) - self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """CMVN with 4-D dataflow. - Args: - x (torch.Tensor): (batch, 1, mel_dim, time) - Returns: - (torch.Tensor): normalized feature with same shape. - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x - - -class BPUConv2dSubsampling8(torch.nn.Module): - """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 - - NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.right_context = module.right_context - self.subsampling_rate = module.subsampling_rate - assert isinstance(module.pos_enc, NoPositionalEncoding) - - # 1. Modify self.conv - # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim) - # to (1, 1, mel_dim, frames) for more efficient computation. - self.conv = module.conv - for idx in [0, 2, 4]: - self.conv[idx].weight = torch.nn.Parameter( - module.conv[idx].weight.transpose(2, 3) - ) - - # 2. 
Modify self.linear - # NOTE(xcsong): Split final projection to meet the requirment of - # maximum kernel_size (7 for XJ3) - self.linear = torch.nn.ModuleList() - odim = module.linear.weight.size(0) # 512, in this case - freq = module.linear.weight.size(1) // odim # 4608 // 512 == 9 - self.odim, self.freq = odim, freq - weight = module.linear.weight.reshape( - odim, odim, freq, 1) # (odim, odim * freq) -> (odim, odim, freq, 1) - self.split_size = [] - num_split = (freq - 1) // 7 + 1 # XJ3 requires kernel_size <= 7 - slice_begin = 0 - for idx in range(num_split): - kernel_size = min(freq, (idx + 1) * 7) - idx * 7 - conv_ele = torch.nn.Conv2d( - odim, odim, (kernel_size, 1), (kernel_size, 1)) - conv_ele.weight = torch.nn.Parameter( - weight[:, :, slice_begin:slice_begin + kernel_size, :] - ) - conv_ele.bias = torch.nn.Parameter( - torch.zeros_like(conv_ele.bias) - ) - self.linear.append(conv_ele) - self.split_size.append(kernel_size) - slice_begin += kernel_size - self.linear[0].bias = torch.nn.Parameter(module.linear.bias) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 67, 80) - mask = torch.zeros(1, 1, 67) - original_result, _, _ = module(random_data, mask) # (1, 8, 512) - random_data = random_data.transpose(1, 2).unsqueeze(0) # (1, 1, 80, 67) - new_result = self.forward(random_data) # (1, 512, 1, 8) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Subsample x with 4-D dataflow. - Args: - x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), - where time' = time // 8. - """ - x = self.conv(x) # (1, odim, freq, time') - x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) - x = torch.split(x, self.split_size, dim=2) - for idx, (x_part, layer) in enumerate(zip(x, self.linear)): - x_out += layer(x_part) - return x_out - - -class BPUMultiHeadedAttention(torch.nn.Module): - """Refactor wenet/transformer/attention.py::MultiHeadedAttention - - NOTE(xcsong): Only support attention_class == MultiHeadedAttention, - we do not consider RelPositionMultiHeadedAttention currently. - """ - def __init__(self, module, chunk_size, left_chunks): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.d_k = module.d_k - self.h = module.h - n_feat = self.d_k * self.h - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.time = chunk_size * (left_chunks + 1) - self.activation = torch.nn.Softmax(dim=-1) - - # 1. Modify self.linear_x - self.linear_q = BPULinear(module.linear_q) - self.linear_k = BPULinear(module.linear_k) - self.linear_v = BPULinear(module.linear_v) - self.linear_out = BPULinear(module.linear_out) - # 2. 
denom - self.register_buffer( - "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k))) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) - mask = torch.ones((1, self.h, self.chunk_size, self.time), - dtype=torch.bool) - cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, - self.d_k * 2) - original_out, original_cache = module( - random_data, random_data, random_data, - mask[:, 0, :, :], torch.empty(0), cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.reshape(1, self.h, self.d_k * 2, - self.chunk_size * self.left_chunks) - new_out, new_cache = self.forward( - random_data, random_data, random_data, mask, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - - def forward( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - mask: torch.Tensor, cache: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. - - Args: - q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). - k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). - v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). - mask (torch.Tensor): Mask tensor, - (#batch, head, chunk_size, cache_t + chunk_size). - cache (torch.Tensor): Cache tensor - (1, head, d_k * 2, cache_t), - where `cache_t == chunk_size * left_chunks`. - - - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: Cache tensor - (1, head, d_k * 2, cache_t + chunk_size) - where `cache_t == chunk_size * left_chunks` - """ - # 1. Forward QKV - q = self.linear_q(q) # (1, d, 1, c) d == size, c == chunk_size - k = self.linear_k(k) # (1, d, 1, c) - v = self.linear_v(v) # (1, d, 1, c) - q = q.view(1, self.h, self.d_k, self.chunk_size) - k = k.view(1, self.h, self.d_k, self.chunk_size) - v = v.view(1, self.h, self.d_k, self.chunk_size) - q = q.transpose(2, 3) # (batch, head, time1, d_k) - k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) - k = torch.cat((k_cache, k), dim=3) - v = torch.cat((v_cache, v), dim=3) - new_cache = torch.cat((k, v), dim=2) - # 2. (Q^T)K - scores = torch.matmul(q, k) * self.denom # (#b, n_head, time1, time2) - # 3. Forward attention - mask = mask.eq(0) - scores = scores.masked_fill(mask, -float('inf')) - attn = self.activation(scores).masked_fill(mask, 0.0) - attn = attn.transpose(2, 3) - x = torch.matmul(v, attn) - x = x.view(1, self.d_k * self.h, 1, self.chunk_size) - x_out = self.linear_out(x) - return x_out, new_cache - - -class BPUConvolution(torch.nn.Module): - """Refactor wenet/transformer/convolution.py::ConvolutionModule - - NOTE(xcsong): Only suport use_layer_norm == False - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.lorder = module.lorder - self.use_layer_norm = False - self.activation = module.activation - channels = module.pointwise_conv1.weight.size(1) - self.channels = channels - kernel_size = module.depthwise_conv.weight.size(2) - assert module.use_layer_norm is False - - # 1. Modify self.pointwise_conv1 - self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) - - # 2. 
Modify self.depthwise_conv - self.depthwise_conv = torch.nn.Conv2d( - channels, channels, (1, kernel_size), - stride=1, groups=channels) - self.depthwise_conv.weight = torch.nn.Parameter( - module.depthwise_conv.weight.unsqueeze(-2)) - self.depthwise_conv.bias = torch.nn.Parameter( - module.depthwise_conv.bias) - - # 3. Modify self.norm, Only support batchnorm2d - self.norm = torch.nn.BatchNorm2d(channels) - self.norm.training = False - self.norm.num_features = module.norm.num_features - self.norm.eps = module.norm.eps - self.norm.momentum = module.norm.momentum - self.norm.weight = torch.nn.Parameter(module.norm.weight) - self.norm.bias = torch.nn.Parameter(module.norm.bias) - self.norm.running_mean = module.norm.running_mean - self.norm.running_var = module.norm.running_var - - # 4. Modify self.pointwise_conv2 - self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) - - # 5. Identity conv, for running `concat` on BPU - self.identity = BPUIdentity(channels) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.channels) - cache = torch.zeros((1, self.channels, self.lorder)) - original_out, original_cache = module(random_data, cache=cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.unsqueeze(2) - new_out, new_cache = self.forward(random_data, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, 1, cache_t). - Returns: - torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). - torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). - """ - # Concat cache - x = torch.cat((self.identity(cache), self.identity(x)), dim=3) - new_cache = x[:, :, :, -self.lorder:] - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, 1, dim) - x = torch.nn.functional.glu(x, dim=1) # (b, channel, 1, dim) - - # Depthwise Conv - x = self.depthwise_conv(x) - x = self.activation(self.norm(x)) - x = self.pointwise_conv2(x) - return x, new_cache - - -class BPUFFN(torch.nn.Module): - """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.activation = module.activation - - # 1. Modify self.w_x - self.w_1 = BPULinear(module.w_1) - self.w_2 = BPULinear(module.w_2) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.w_1.idim) - original_out = module(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_out = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward function. 
- - Args: - xs: input tensor (B, D, 1, L) - Returns: - output tensor, (B, D, 1, L) - """ - return self.w_2(self.activation(self.w_1(x))) - - -class BPUConformerEncoderLayer(torch.nn.Module): - """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.size = module.size - assert module.normalize_before is True - assert module.concat_after is False - - # 1. Modify submodules - self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) - self.self_attn = BPUMultiHeadedAttention( - module.self_attn, chunk_size, left_chunks) - self.conv_module = BPUConvolution(module.conv_module) - self.feed_forward = BPUFFN(module.feed_forward) - - # 2. Modify norms - self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) - self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, ln_run_on_bpu) - self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, - chunk_size, ln_run_on_bpu) - self.norm_conv = BPULayerNorm(module.norm_conv, - chunk_size, ln_run_on_bpu) - self.norm_final = BPULayerNorm(module.norm_final, - chunk_size, ln_run_on_bpu) - - # 3. 4-D ff_scale - self.register_buffer( - "ff_scale", torch.full((1, self.size, 1, 1), module.ff_scale)) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.self_attn.chunk_size - time2 = self.self_attn.time - h, d_k = self.self_attn.h, self.self_attn.d_k - random_x = torch.randn(1, time1, self.size) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) - original_x, _, original_att_cache, original_cnn_cache = module( - random_x, att_mask[:, 0, :, :], torch.empty(0), - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.transpose(1, 2).unsqueeze(2) - att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.unsqueeze(2) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_mask, att_cache, cnn_cache - ) - np.testing.assert_allclose( - to_numpy(original_att_cache), - to_numpy(new_att_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cnn_cache), - to_numpy(new_cnn_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, att_mask: torch.Tensor, - att_cache: torch.Tensor, cnn_cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, size, 1, chunk_size) - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, d_k * 2, cache_t1), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, 1, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: att_cache tensor, - (1, head, d_k * 2, cache_t1 + chunk_size). - torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2). - """ - # 1. ffn_macaron - residual = x - x = self.norm_ff_macron(x) - x = residual + self.ff_scale * self.feed_forward_macaron(x) - - # 2. 
attention - residual = x - x = self.norm_mha(x) - x_att, new_att_cache = self.self_attn( - x, x, x, att_mask, att_cache) - x = residual + x_att - - # 3. convolution - residual = x - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, cnn_cache) - x = residual + x - - # 4. ffn - residual = x - x = self.norm_ff(x) - x = residual + self.ff_scale * self.feed_forward(x) - - # 5. final post-norm - x = self.norm_final(x) - - return x, new_att_cache, new_cnn_cache - - -class BPUConformerEncoder(torch.nn.Module): - """Refactor wenet/transformer/encoder.py::ConformerEncoder - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - output_size = module.output_size() - self._output_size = module.output_size() - self.after_norm = module.after_norm - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.head = module.encoders[0].self_attn.h - self.layers = len(module.encoders) - - # 1. Modify submodules - self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) - self.embed = BPUConv2dSubsampling8(module.embed) - self.encoders = torch.nn.ModuleList() - for layer in module.encoders: - self.encoders.append(BPUConformerEncoderLayer( - layer, chunk_size, left_chunks, ln_run_on_bpu)) - - # 2. Auxiliary conv - self.identity_cnncache = BPUIdentity(output_size) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.encoders[0].self_attn.chunk_size - time2 = self.encoders[0].self_attn.time - layers = self.layers - h, d_k = self.head, self.encoders[0].self_attn.d_k - decoding_window = (self.chunk_size - 1) * \ - module.embed.subsampling_rate + \ - module.embed.right_context + 1 - lorder = self.encoders[0].conv_module.lorder - random_x = torch.randn(1, decoding_window, 80) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) - orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( - random_x, 0, time2 - time1, att_mask=att_mask[:, 0, :, :], - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.unsqueeze(0) - att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_cache, cnn_cache, att_mask - ) - caches = torch.split(new_att_cache, h, dim=1) - caches = [c.transpose(2, 3) for c in caches] - np.testing.assert_allclose( - to_numpy(orig_att_cache), - to_numpy(torch.cat(caches, dim=0)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_cnn_cache), - to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, xs: torch.Tensor, att_cache: torch.Tensor, - cnn_cache: torch.Tensor, att_mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (1, head * elayers, d_k * 2, cache_t1), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (1, hidden-dim, elayers, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, hidden-dim, 1, chunk_size). - torch.Tensor: new attention cache required for next chunk, with - same shape as the original att_cache. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - """ - # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time) - xs = xs.transpose(2, 3) - xs = self.global_cmvn(xs) - # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size) - xs = self.embed(xs) - - att_cache = torch.split(att_cache, self.head, dim=1) - cnn_cache = self.identity_cnncache(cnn_cache) - cnn_cache = torch.split(cnn_cache, 1, dim=2) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - xs, new_att_cache, new_cnn_cache = layer( - xs, att_mask, att_cache=att_cache[i], cnn_cache=cnn_cache[i]) - r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:]) - r_cnn_cache.append(new_cnn_cache) - r_att_cache = torch.cat(r_att_cache, dim=1) - r_cnn_cache = self.identity_cnncache( - torch.cat(r_cnn_cache, dim=2)) - - xs = xs.squeeze(2).transpose(1, 2).contiguous() - xs = self.after_norm(xs) - # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input. - xs = xs.transpose(1, 2).contiguous().unsqueeze(2) # (B, C, 1, T) - - return (xs, r_att_cache, r_cnn_cache) - - -class BPUCTC(torch.nn.Module): - """Refactor wenet/transformer/ctc.py::CTC - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.ctc_lo.weight.size(1) - num_class = module.ctc_lo.weight.size(0) - - # 1. Modify self.ctc_lo, Split final projection to meet the - # requirment of maximum in/out channels (2048 for XJ3) - self.ctc_lo = torch.nn.ModuleList() - self.split_size = [] - num_split = (num_class - 1) // 2048 + 1 - for idx in range(num_split): - out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 - conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) - self.ctc_lo.append(conv_ele) - self.split_size.append(out_channel) - orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) - orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) - for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): - w = w.unsqueeze(2).unsqueeze(3) - self.ctc_lo[i].weight = torch.nn.Parameter(w) - self.ctc_lo[i].bias = torch.nn.Parameter(b) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 100, self.idim) - original_result = module.ctc_lo(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """frame activations, without softmax. 
- - Args: - Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) - Returns: - torch.Tensor: (B, num_class, 1, chunk_size) - """ - out = [] - for i, layer in enumerate(self.ctc_lo): - out.append(layer(x)) - out = torch.cat(out, dim=1) - return out - - -def export_encoder(asr_model, args): - logger.info("Stage-1: export encoder") - decode_window, mel_dim = args.decoding_window, args.feature_size - encoder = BPUConformerEncoder( - asr_model.encoder, args.chunk_size, args.num_decoding_left_chunks, - args.ln_run_on_bpu) - encoder.eval() - encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx') - - logger.info("Stage-1.1: prepare inputs for encoder") - chunk = torch.randn((1, 1, decode_window, mel_dim)) - required_cache_size = encoder.chunk_size * encoder.left_chunks - kv_time = required_cache_size + encoder.chunk_size - hidden, layers = encoder._output_size, len(encoder.encoders) - head = encoder.encoders[0].self_attn.h - d_k = hidden // head - lorder = encoder.encoders[0].conv_module.lorder - att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) - att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros((1, hidden, layers, lorder)) - inputs = (chunk, att_cache, cnn_cache, att_mask) - logger.info("chunk.size(): {} att_cache.size(): {} " - "cnn_cache.size(): {} att_mask.size(): {}".format( - list(chunk.size()), list(att_cache.size()), - list(cnn_cache.size()), list(att_mask.size()))) - - logger.info("Stage-1.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask" - attributes['output_name'] = "output;r_att_cache;r_cnn_cache" - attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap" - attributes['norm_type'] = \ - "no_preprocess;no_preprocess;no_preprocess;no_preprocess" - attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_shape'] = \ - "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( - chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3), - att_cache.size(0), att_cache.size(1), att_cache.size(2), - att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1), - cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0), - att_mask.size(1), att_mask.size(2), att_mask.size(3) - ) - torch.onnx.export( # NOTE(xcsong): only support opset==11 - encoder, inputs, encoder_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=attributes['input_name'].split(';'), - output_names=attributes['output_name'].split(';'), - dynamic_axes=None, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for k in vars(args): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - logger.info('Export onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - logger.info("Stage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - for i in range(10): - logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" - ", att_mask: {}".format( - i, list(torch_chunk.size()), - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), - list(torch_att_mask.size()))) - torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_output = torch.cat(torch_output, dim=-1) - - onnx_output = [] - onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - logger.info("onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," - " att_mask: {}".format( - i, onnx_chunk.shape, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': onnx_att_mask, - } - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_output = np.concatenate(onnx_output, axis=-1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_encoder, pass!") - return encoder, ort_session - - -def export_ctc(asr_model, args): - logger.info("Stage-2: export ctc") - ctc = BPUCTC(asr_model.ctc).eval() - ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx') - - logger.info("Stage-2.1: prepare inputs for ctc") - hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) - - logger.info("Stage-2.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'], attributes['input_type'] = "hidden", "featuremap" - attributes['norm_type'] = "no_preprocess" - attributes['input_layout_train'] = "NCHW" - attributes['input_layout_rt'] = "NCHW" - attributes['input_shape'] = "{}x{}x{}x{}".format( - hidden.size(0), hidden.size(1), hidden.size(2), hidden.size(3), - ) - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=None, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for k in vars(args): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - logger.info('Export onnx_ctc, done! 
see {}'.format(ctc_outpath)) - - logger.info("Stage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_ctc, pass!") - return ctc, ort_session - - -def export_decoder(asr_model, args): - logger.info("Currently, Decoder is not supported.") - - -if __name__ == '__main__': - torch.manual_seed(777) - args = get_args() - args.ln_run_on_bpu = False - # NOTE(xcsong): XJ3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - args.feature_size = configs['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - export_encoder(model, args) - export_ctc(model, args) - export_decoder(model, args) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/export_onnx_cpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/export_onnx_cpu.py deleted file mode 100644 index a8009d2f606f753a5870eb754235d8d55e756b5d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/export_onnx_cpu.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright (c) 2022, Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
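# (Editorial sketch, not part of the original scripts: a quick sanity check of the
#  decoding_window formula used by the export entry points. The subsampling rate
#  and right context below are illustrative values for a 4x conv front-end; the
#  real scripts read them from model.encoder.embed.)
chunk_size, subsampling_rate, right_context = 16, 4, 6
decoding_window = (chunk_size - 1) * subsampling_rate + right_context + 1
assert decoding_window == 67  # matches the "hardcode to 67" fallback further down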
- -from __future__ import print_function - -import argparse -import os -import copy -import sys - -import torch -import yaml -import numpy as np - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - -try: - import onnx - import onnxruntime - from onnxruntime.quantization import quantize_dynamic, QuantType -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - args = parser.parse_args() - return args - - -def to_numpy(tensor): - if tensor.requires_grad: - return tensor.detach().cpu().numpy() - else: - return tensor.cpu().numpy() - - -def print_input_output_info(onnx_model, name, prefix="\t\t"): - input_names = [node.name for node in onnx_model.graph.input] - input_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.input] - output_names = [node.name for node in onnx_model.graph.output] - output_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.output] - print("{}{} inputs : {}".format(prefix, name, input_names)) - print("{}{} input shapes : {}".format(prefix, name, input_shapes)) - print("{}{} outputs: {}".format(prefix, name, output_names)) - print("{}{} output shapes : {}".format(prefix, name, output_shapes)) - - -def export_encoder(asr_model, args): - print("Stage-1: export encoder") - encoder = asr_model.encoder - encoder.forward = encoder.forward_chunk - encoder_outpath = os.path.join(args['output_dir'], 'encoder.onnx') - - print("\tStage-1.1: prepare inputs for encoder") - chunk = torch.randn( - (args['batch'], args['decoding_window'], args['feature_size'])) - offset = 0 - # NOTE(xcsong): The uncertainty of `next_cache_start` only appears - # in the first few chunks, this is caused by dynamic att_cache shape, i,e - # (0, 0, 0, 0) for 1st chunk and (elayers, head, ?, d_k*2) for subsequent - # chunks. One way to ease the ONNX export is to keep `next_cache_start` - # as a fixed value. To do this, for the **first** chunk, if - # left_chunks > 0, we feed real cache & real mask to the model, otherwise - # fake cache & fake mask. In this way, we get: - # 1. 16/-1 mode: next_cache_start == 0 for all chunks - # 2. 16/4 mode: next_cache_start == chunk_size for all chunks - # 3. 16/0 mode: next_cache_start == chunk_size for all chunks - # 4. -1/-1 mode: next_cache_start == 0 for all chunks - # NO MORE DYNAMIC CHANGES!! - # - # NOTE(Mddct): We retain the current design for the convenience of supporting some - # inference frameworks without dynamic shapes. 
If you're interested in all-in-one - # model that supports different chunks please see: - # https://github.com/wenet-e2e/wenet/pull/1174 - - if args['left_chunks'] > 0: # 16/4 - required_cache_size = args['chunk_size'] * args['left_chunks'] - offset = required_cache_size - # Real cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], required_cache_size, - args['output_size'] // args['head'] * 2)) - # Real mask - att_mask = torch.ones( - (args['batch'], 1, required_cache_size + args['chunk_size']), - dtype=torch.bool) - att_mask[:, :, :required_cache_size] = 0 - elif args['left_chunks'] <= 0: # 16/-1, -1/-1, 16/0 - required_cache_size = -1 if args['left_chunks'] < 0 else 0 - # Fake cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], 0, - args['output_size'] // args['head'] * 2)) - # Fake mask - att_mask = torch.ones((0, 0, 0), dtype=torch.bool) - cnn_cache = torch.zeros( - (args['num_blocks'], args['batch'], - args['output_size'], args['cnn_module_kernel'] - 1)) - inputs = (chunk, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - print("\t\tchunk.size(): {}\n".format(chunk.size()), - "\t\toffset: {}\n".format(offset), - "\t\trequired_cache: {}\n".format(required_cache_size), - "\t\tatt_cache.size(): {}\n".format(att_cache.size()), - "\t\tcnn_cache.size(): {}\n".format(cnn_cache.size()), - "\t\tatt_mask.size(): {}\n".format(att_mask.size())) - - print("\tStage-1.2: torch.onnx.export") - dynamic_axes = { - 'chunk': {1: 'T'}, - 'att_cache': {2: 'T_CACHE'}, - 'att_mask': {2: 'T_ADD_T_CACHE'}, - 'output': {1: 'T'}, - 'r_att_cache': {2: 'T_CACHE'}, - } - # NOTE(xcsong): We keep dynamic axes even if in 16/4 mode, this is - # to avoid padding the last chunk (which usually contains less - # frames than required). For users who want static axes, just pop - # out specific axis. - # if args['chunk_size'] > 0: # 16/4, 16/-1, 16/0 - # dynamic_axes.pop('chunk') - # dynamic_axes.pop('output') - # if args['left_chunks'] >= 0: # 16/4, 16/0 - # # NOTE(xsong): since we feed real cache & real mask into the - # # model when left_chunks > 0, the shape of cache will never - # # be changed. - # dynamic_axes.pop('att_cache') - # dynamic_axes.pop('r_att_cache') - torch.onnx.export( - encoder, inputs, encoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=[ - 'chunk', 'offset', 'required_cache_size', - 'att_cache', 'cnn_cache', 'att_mask' - ], - output_names=['output', 'r_att_cache', 'r_cnn_cache'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for (k, v) in args.items(): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - # NOTE(xcsong): to add those metadatas we need to reopen - # the file and resave it. - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - # Dynamic quantization - model_fp32 = encoder_outpath - model_quant = os.path.join(args['output_dir'], 'encoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - print("\tStage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk = copy.deepcopy(chunk) - torch_offset = copy.deepcopy(offset) - torch_required_cache_size = copy.deepcopy(required_cache_size) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - torch_att_mask = copy.deepcopy(att_mask) - for i in range(10): - print("\t\ttorch chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, list(torch_chunk.size()), torch_offset, - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), list(torch_att_mask.size()))) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - torch_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_offset, torch_required_cache_size, - torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_offset += out.size(1) - torch_output = torch.cat(torch_output, dim=1) - - onnx_output = [] - onnx_chunk = to_numpy(chunk) - onnx_offset = np.array((offset)).astype(np.int64) - onnx_required_cache_size = np.array((required_cache_size)).astype(np.int64) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - onnx_att_mask = to_numpy(att_mask) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - print("\t\tonnx chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, onnx_chunk.shape, onnx_offset, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - onnx_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'offset': onnx_offset, - 'required_cache_size': onnx_required_cache_size, - 'att_cache': onnx_att_cache, 'cnn_cache': onnx_cnn_cache, - 'att_mask': onnx_att_mask - } - # NOTE(xcsong): If we use 16/-1, -1/-1 or 16/0 mode, `next_cache_start` - # will be hardcoded to 0 or chunk_size by ONNX, thus - # required_cache_size and att_mask are no more needed and they will - # be removed by ONNX automatically. 
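# (Editorial sketch, not part of the original script: the same pruning of unused
#  feeds can be done straight from the runtime session instead of the parsed graph:
#      declared = {inp.name for inp in ort_session.get_inputs()}
#      ort_inputs = {k: v for k, v in ort_inputs.items() if k in declared}
#  InferenceSession.get_inputs() is standard onnxruntime API; `ort_session` and
#  `ort_inputs` refer to the variables defined just above.)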
- for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_offset += ort_outs[0].shape[1] - onnx_output = np.concatenate(onnx_output, axis=1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-05) - meta = ort_session.get_modelmeta() - print("\t\tcustom_metadata_map={}".format(meta.custom_metadata_map)) - print("\t\tCheck onnx_encoder, pass!") - - -def export_ctc(asr_model, args): - print("Stage-2: export ctc") - ctc = asr_model.ctc - ctc.forward = ctc.log_softmax - ctc_outpath = os.path.join(args['output_dir'], 'ctc.onnx') - - print("\tStage-2.1: prepare inputs for ctc") - hidden = torch.randn( - (args['batch'], args['chunk_size'] if args['chunk_size'] > 0 else 16, - args['output_size'])) - - print("\tStage-2.2: torch.onnx.export") - dynamic_axes = {'hidden': {1: 'T'}, 'probs': {1: 'T'}} - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for (k, v) in args.items(): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - # Dynamic quantization - model_fp32 = ctc_outpath - model_quant = os.path.join(args['output_dir'], 'ctc.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_ctc, done! see {}'.format(ctc_outpath)) - - print("\tStage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_ctc, pass!") - - -def export_decoder(asr_model, args): - print("Stage-3: export decoder") - decoder = asr_model - # NOTE(lzhin): parameters of encoder will be automatically removed - # since they are not used during rescoring. - decoder.forward = decoder.forward_attention_decoder - decoder_outpath = os.path.join(args['output_dir'], 'decoder.onnx') - - print("\tStage-3.1: prepare inputs for decoder") - # hardcode time->200 nbest->10 len->20, they are dynamic axes. 
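# (Editorial note: the 200/10/20 sizes below are only tracing placeholders; since
#  Stage-3.2 declares dynamic_axes for 'hyps' (NBEST, L), 'hyps_lens' (NBEST) and
#  'encoder_out' (T), the exported decoder.onnx also accepts other sizes at runtime,
#  e.g. hyps of shape (5, 12) together with encoder_out of shape (1, 150, output_size).)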
- encoder_out = torch.randn((1, 200, args['output_size'])) - hyps = torch.randint(low=0, high=args['vocab_size'], - size=[10, 20]) - hyps[:, 0] = args['vocab_size'] - 1 # - hyps_lens = torch.randint(low=15, high=21, size=[10]) - - print("\tStage-3.2: torch.onnx.export") - dynamic_axes = { - 'hyps': {0: 'NBEST', 1: 'L'}, 'hyps_lens': {0: 'NBEST'}, - 'encoder_out': {1: 'T'}, - 'score': {0: 'NBEST', 1: 'L'}, 'r_score': {0: 'NBEST', 1: 'L'} - } - inputs = (hyps, hyps_lens, encoder_out, args['reverse_weight']) - torch.onnx.export( - decoder, inputs, decoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hyps', 'hyps_lens', 'encoder_out', 'reverse_weight'], - output_names=['score', 'r_score'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_decoder = onnx.load(decoder_outpath) - for (k, v) in args.items(): - meta = onnx_decoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_decoder) - onnx.helper.printable_graph(onnx_decoder.graph) - onnx.save(onnx_decoder, decoder_outpath) - print_input_output_info(onnx_decoder, "onnx_decoder") - model_fp32 = decoder_outpath - model_quant = os.path.join(args['output_dir'], 'decoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_decoder, done! see {}'.format( - decoder_outpath)) - - print("\tStage-3.3: check onnx_decoder and torch_decoder") - torch_score, torch_r_score = decoder( - hyps, hyps_lens, encoder_out, args['reverse_weight']) - ort_session = onnxruntime.InferenceSession(decoder_outpath) - input_names = [node.name for node in onnx_decoder.graph.input] - ort_inputs = { - 'hyps': to_numpy(hyps), - 'hyps_lens': to_numpy(hyps_lens), - 'encoder_out': to_numpy(encoder_out), - 'reverse_weight': np.array((args['reverse_weight'])), - } - for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - onnx_output = ort_session.run(None, ort_inputs) - - np.testing.assert_allclose(to_numpy(torch_score), onnx_output[0], - rtol=1e-03, atol=1e-05) - if args['is_bidirectional_decoder'] and args['reverse_weight'] > 0.0: - np.testing.assert_allclose(to_numpy(torch_r_score), onnx_output[1], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_decoder, pass!") - - -def main(): - torch.manual_seed(777) - args = get_args() - output_dir = args.output_dir - os.system("mkdir -p " + output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - arguments = {} - arguments['output_dir'] = output_dir - arguments['batch'] = 1 - arguments['chunk_size'] = args.chunk_size - arguments['left_chunks'] = args.num_decoding_left_chunks - arguments['reverse_weight'] = args.reverse_weight - arguments['output_size'] = configs['encoder_conf']['output_size'] - arguments['num_blocks'] = configs['encoder_conf']['num_blocks'] - arguments['cnn_module_kernel'] = configs['encoder_conf'].get('cnn_module_kernel', 1) - arguments['head'] = configs['encoder_conf']['attention_heads'] - arguments['feature_size'] = configs['input_dim'] - arguments['vocab_size'] = configs['output_dim'] - # NOTE(xcsong): if chunk_size == -1, hardcode to 67 - arguments['decoding_window'] = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 if args.chunk_size > 0 else 67 - arguments['encoder'] = configs['encoder'] - 
arguments['decoder'] = configs['decoder'] - arguments['subsampling_rate'] = model.subsampling_rate() - arguments['right_context'] = model.right_context() - arguments['sos_symbol'] = model.sos_symbol() - arguments['eos_symbol'] = model.eos_symbol() - arguments['is_bidirectional_decoder'] = 1 \ - if model.is_bidirectional_decoder() else 0 - - # NOTE(xcsong): Please note that -1/-1 means non-streaming model! It is - # not a [16/4 16/-1 16/0] all-in-one model and it should not be used in - # streaming mode (i.e., setting chunk_size=16 in `decoder_main`). If you - # want to use 16/-1 or any other streaming mode in `decoder_main`, - # please export onnx in the same config. - if arguments['left_chunks'] > 0: - assert arguments['chunk_size'] > 0 # -1/4 not supported - - export_encoder(model, arguments) - export_ctc(model, arguments) - export_decoder(model, arguments) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/export_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/export_onnx_gpu.py deleted file mode 100644 index 19f810c2804efdf74ff369f780fa3102e2e389fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/export_onnx_gpu.py +++ /dev/null @@ -1,1056 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import os -import sys - -import torch -import yaml -import logging - -import torch.nn.functional as F -from wenet.utils.checkpoint import load_checkpoint -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import BaseEncoder -from wenet.utils.init_model import init_model -from wenet.utils.mask import make_pad_mask - -try: - import onnxruntime -except ImportError: - print('Please install onnxruntime-gpu!') - sys.exit(1) - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class Encoder(torch.nn.Module): - def __init__(self, - encoder: BaseEncoder, - ctc: CTC, - beam_size: int = 10): - super().__init__() - self.encoder = encoder - self.ctc = ctc - self.beam_size = beam_size - - def forward(self, speech: torch.Tensor, - speech_lengths: torch.Tensor,): - """Encoder - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - Returns: - encoder_out: B x T x F - encoder_out_lens: B - ctc_log_probs: B x T x V - beam_log_probs: B x T x beam_size - beam_log_probs_idx: B x T x beam_size - """ - encoder_out, encoder_mask = self.encoder(speech, - speech_lengths, - -1, -1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_log_probs = self.ctc.log_softmax(encoder_out) - encoder_out_lens = encoder_out_lens.int() - beam_log_probs, beam_log_probs_idx = torch.topk( - ctc_log_probs, self.beam_size, dim=2) - return encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx - - -class StreamingEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size, transformer=False): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.transformer = transformer - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoder.encoders): - xs, _, new_att_cache, new_cnn_cache = layer( - xs, masks, pos_emb, - att_cache=att_cache[i], - cnn_cache=cnn_cache[i]) - # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (B, hidden-dim, cache_t2) - r_att_cache.append( - new_att_cache[:, :, next_cache_start:, :].unsqueeze(1)) - if not self.transformer: - r_cnn_cache.append(new_cnn_cache.unsqueeze(1)) - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - if not self.transformer: - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingSqueezeformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.reduce_idx = model.encoder.reduce_idx - self.recover_idx = model.encoder.recover_idx - if self.reduce_idx is None: - self.time_reduce = None - else: - if self.recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - 
recover_exp)) - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - elayers, cache_size = att_cache.size(0), att_cache.size(3) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = att_mask[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.encoder.preln(xs) - for i, layer in enumerate(self.encoder.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append( - (xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.encoder.time_reduction_layer( - xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - if self.encoder.pos_enc_layer_type == "rel_pos_repaired": - pos_emb = 
pos_emb[:, :xs.size(1) * 2 - 1, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.encoder.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(1) - cached_att = cached_att.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1)) - r_cnn_cache.append(cached_cnn) - - chunk_out = xs - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingEfficientConformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - - # Efficient Conformer - self.stride_layer_idx = model.encoder.stride_layer_idx - self.stride = model.encoder.stride - self.num_blocks = model.encoder.num_blocks - self.cnn_module_kernel = model.encoder.cnn_module_kernel - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - chunk_lens (torch.Tensor): - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * 
num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) # (b, ) - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - # Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2) - # Shape(cnn_cache): (elayers, b, outsize, cnn_kernel) - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = chunk_mask.to(torch.bool) - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoder.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - # We propose to double the chunk_size. 
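# (Illustrative arithmetic, not from the original file: with hypothetical sizes
#  xs.size(1) = 16, att_cache.size(3) = 64, factor = 2 and pos_emb.size(1) = 47,
#  the guard 16 + 64 / 2 > 47 holds and att_cache_trunc = 16 + 32 - 47 + 1 = 2,
#  i.e. the two oldest strided cache frames are sliced off before attention.)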
- att_cache_trunc = xs.size(1) + \ - att_cache.size(3) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i][:, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [batch, 1, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID - - # use repeat_interleave to new_att_cache - # new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - new_att_cache = new_att_cache.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :].unsqueeze(1)) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - # shape of r_att_cache: (b, elayers, head, time2, outdim) - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - # shape of r_cnn_cache: (b, elayers, outdim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - chunk_out_lens += 1 - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class Decoder(torch.nn.Module): - def __init__(self, - decoder: TransformerDecoder, - ctc_weight: float = 0.5, - reverse_weight: float = 0.0, - beam_size: int = 10, - decoder_fastertransformer: bool = False): - super().__init__() - self.decoder = decoder - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - self.beam_size = beam_size - self.decoder_fastertransformer = decoder_fastertransformer - - def forward(self, - encoder_out: torch.Tensor, - encoder_lens: torch.Tensor, - hyps_pad_sos_eos: torch.Tensor, - hyps_lens_sos: torch.Tensor, - r_hyps_pad_sos_eos: torch.Tensor, - ctc_score: torch.Tensor): - """Encoder - Args: - encoder_out: B x T x F - encoder_lens: B - hyps_pad_sos_eos: B x beam x (T2+1), - hyps with sos & eos and padded by ignore id - hyps_lens_sos: B x beam, length for each hyp with sos - r_hyps_pad_sos_eos: B 
x beam x (T2+1), - reversed hyps with sos & eos and padded by ignore id - ctc_score: B x beam, ctc score for each hyp - Returns: - decoder_out: B x beam x T2 x V - r_decoder_out: B x beam x T2 x V - best_index: B - """ - B, T, F = encoder_out.shape - bz = self.beam_size - B2 = B * bz - encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F) - encoder_mask = ~make_pad_mask(encoder_lens, T).unsqueeze(1) - encoder_mask = encoder_mask.repeat(1, bz, 1).view(B2, 1, T) - T2 = hyps_pad_sos_eos.shape[2] - 1 - hyps_pad = hyps_pad_sos_eos.view(B2, T2 + 1) - hyps_lens = hyps_lens_sos.view(B2,) - hyps_pad_sos = hyps_pad[:, :-1].contiguous() - hyps_pad_eos = hyps_pad[:, 1:].contiguous() - - r_hyps_pad = r_hyps_pad_sos_eos.view(B2, T2 + 1) - r_hyps_pad_sos = r_hyps_pad[:, :-1].contiguous() - r_hyps_pad_eos = r_hyps_pad[:, 1:].contiguous() - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad_sos, hyps_lens, r_hyps_pad_sos, - self.reverse_weight) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - V = decoder_out.shape[-1] - decoder_out = decoder_out.view(B2, T2, V) - mask = ~make_pad_mask(hyps_lens, T2) # B2 x T2 - # mask index, remove ignore id - index = torch.unsqueeze(hyps_pad_eos * mask, 2) - score = decoder_out.gather(2, index).squeeze(2) # B2 X T2 - # mask padded part - score = score * mask - decoder_out = decoder_out.view(B, bz, T2, V) - if self.reverse_weight > 0: - r_decoder_out = torch.nn.functional.log_softmax( - r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.view(B2, T2, V) - index = torch.unsqueeze(r_hyps_pad_eos * mask, 2) - r_score = r_decoder_out.gather(2, index).squeeze(2) - r_score = r_score * mask - score = score * (1 - self.reverse_weight) + \ - self.reverse_weight * r_score - r_decoder_out = r_decoder_out.view(B, bz, T2, V) - score = torch.sum(score, axis=1) # B2 - score = torch.reshape(score, (B, bz)) + self.ctc_weight * ctc_score - best_index = torch.argmax(score, dim=1) - if self.decoder_fastertransformer: - return decoder_out, best_index - else: - return best_index - - -def to_numpy(tensors): - out = [] - if type(tensors) == torch.tensor: - tensors = [tensors] - for tensor in tensors: - if tensor.requires_grad: - tensor = tensor.detach().cpu().numpy() - else: - tensor = tensor.cpu().numpy() - out.append(tensor) - return out - - -def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True): - for a, b in zip(xlist, blist): - try: - torch.testing.assert_allclose(a, b, rtol=rtol, atol=atol) - except AssertionError as error: - if tolerate_small_mismatch: - print(error) - else: - raise - - -def export_offline_encoder(model, configs, args, logger, encoder_onnx_path): - bz = 32 - seq_len = 100 - beam_size = args.beam_size - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint( - low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - dynamic_axes={ - 'speech': {0: 'B', 1: 'T'}, - 'speech_lengths': {0: 'B'}, - 'encoder_out': {0: 'B', 1: 'T_OUT'}, - 'encoder_out_lens': {0: 'B'}, - 'ctc_log_probs': {0: 'B', 1: 'T_OUT'}, - 'beam_log_probs': {0: 'B', 1: 
'T_OUT'}, - 'beam_log_probs_idx': {0: 'B', 1: 'T_OUT'}, - }, - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - -def export_offline_encoder_static(model, configs, args, logger, encoder_onnx_path): - bz = args.batch_size - seq_len = args.seq_len - beam_size = args.beam_size - - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint(low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - import os - file_name, file_ext = os.path.splitext(encoder_onnx_path) - encoder_onnx_path = file_name + "_bs" + str(bz) + "_seq" + str(seq_len) + "_static.onnx" - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CPUExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - - -def export_online_encoder(model, configs, args, logger, encoder_onnx_path): - decoding_chunk_size = args.decoding_chunk_size - subsampling = model.encoder.embed.subsampling_rate - context = model.encoder.embed.right_context + 1 - decoding_window = (decoding_chunk_size - 1) * subsampling + context - batch_size = 32 - audio_len = decoding_window - feature_size = configs["input_dim"] - output_size = configs["encoder_conf"]["output_size"] - num_layers = configs["encoder_conf"]["num_blocks"] - # in transformer the cnn module will not be available - transformer = False - cnn_module_kernel = configs["encoder_conf"].get("cnn_module_kernel", 1) - 1 - if not cnn_module_kernel: - transformer = True - num_decoding_left_chunks = args.num_decoding_left_chunks - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if configs['encoder'] == 'squeezeformer': - encoder = StreamingSqueezeformerEncoder( - model, required_cache_size, args.beam_size) - elif configs['encoder'] == 'efficientConformer': - encoder = StreamingEfficientConformerEncoder( - model, required_cache_size, args.beam_size) - else: - encoder = StreamingEncoder( - model, required_cache_size, args.beam_size, transformer) - encoder.eval() - - # begin to export encoder - chunk_xs = 
torch.randn(batch_size, audio_len, - feature_size, dtype=torch.float32) - chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len - - offset = torch.arange(0, batch_size).unsqueeze(1) - # (elayers, b, head, cache_t1, d_k * 2) - head = configs["encoder_conf"]["attention_heads"] - d_k = configs["encoder_conf"]["output_size"] // head - att_cache = torch.randn(batch_size, num_layers, head, - required_cache_size, d_k * 2, - dtype=torch.float32) - cnn_cache = torch.randn(batch_size, num_layers, output_size, - cnn_module_kernel, dtype=torch.float32) - - cache_mask = torch.ones( - batch_size, 1, required_cache_size, dtype=torch.float32) - input_names = ['chunk_xs', 'chunk_lens', 'offset', - 'att_cache', 'cnn_cache', 'cache_mask'] - output_names = ['log_probs', 'log_probs_idx', 'chunk_out', - 'chunk_out_lens', 'r_offset', 'r_att_cache', - 'r_cnn_cache', 'r_cache_mask'] - input_tensors = (chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - output_names.pop(6) - - all_names = input_names + output_names - dynamic_axes = {} - for name in all_names: - # only the first dimension is dynamic - # all other dimension is fixed - dynamic_axes[name] = {0: 'B'} - - torch.onnx.export(encoder, - input_tensors, - encoder_onnx_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - verbose=False) - - with torch.no_grad(): - torch_outs = encoder(chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - torch_outs = list(torch_outs).pop(6) - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=["CUDAExecutionProvider"]) - ort_inputs = {} - - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - if transformer: - del ort_inputs['cnn_cache'] - ort_outs = ort_session.run(None, ort_inputs) - test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx streaming encoder succeed!") - onnx_config = { - "subsampling_rate": subsampling, - "context": context, - "decoding_chunk_size": decoding_chunk_size, - "num_decoding_left_chunks": num_decoding_left_chunks, - "beam_size": args.beam_size, - "fp16": args.fp16, - "feat_size": feature_size, - "decoding_window": decoding_window, - "cnn_module_kernel_cache": cnn_module_kernel - } - return onnx_config - - -def export_rescoring_decoder(model, configs, args, - logger, decoder_onnx_path, decoder_fastertransformer): - bz, seq_len = 32, 100 - beam_size = args.beam_size - decoder = Decoder(model.decoder, - model.ctc_weight, - model.reverse_weight, - beam_size, - decoder_fastertransformer) - decoder.eval() - - hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - hyps_lens_sos = torch.randint(low=3, high=seq_len, size=(bz, beam_size), - dtype=torch.int32) - r_hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - - output_size = configs["encoder_conf"]["output_size"] - encoder_out = torch.randn(bz, seq_len, output_size, dtype=torch.float32) - encoder_out_lens = torch.randint( - low=3, high=seq_len, size=(bz,), dtype=torch.int32) - ctc_score = torch.randn(bz, beam_size, dtype=torch.float32) - - input_names = ['encoder_out', 'encoder_out_lens', - 'hyps_pad_sos_eos', 'hyps_lens_sos', - 'r_hyps_pad_sos_eos', 'ctc_score'] - output_names = ['best_index'] - if decoder_fastertransformer: - output_names.insert(0, 'decoder_out') - - 
torch.onnx.export(decoder, - (encoder_out, encoder_out_lens, - hyps_pad_sos_eos, hyps_lens_sos, - r_hyps_pad_sos_eos, ctc_score), - decoder_onnx_path, - export_params=True, - opset_version=13, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes={'encoder_out': {0: 'B', 1: 'T'}, - 'encoder_out_lens': {0: 'B'}, - 'hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'hyps_lens_sos': {0: 'B'}, - 'r_hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'ctc_score': {0: 'B'}, - 'best_index': {0: 'B'}, - }, - verbose=False - ) - with torch.no_grad(): - o0 = decoder(encoder_out, - encoder_out_lens, - hyps_pad_sos_eos, - hyps_lens_sos, - r_hyps_pad_sos_eos, - ctc_score) - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(decoder_onnx_path, - providers=providers) - - input_tensors = [encoder_out, encoder_out_lens, hyps_pad_sos_eos, - hyps_lens_sos, r_hyps_pad_sos_eos, ctc_score] - ort_inputs = {} - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - - # if model.reverse weight == 0, - # the r_hyps_pad will be removed - # from the onnx decoder since it doen't play any role - if model.reverse_weight == 0: - del ort_inputs['r_hyps_pad_sos_eos'] - ort_outs = ort_session.run(None, ort_inputs) - - # check decoder output - if decoder_fastertransformer: - test(to_numpy(o0), ort_outs, rtol=1e-03, atol=1e-05) - else: - test(to_numpy([o0]), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx decoder succeed!") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='export x86_gpu model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--cmvn_file', required=False, default='', type=str, - help='global_cmvn file, default path is in config file') - parser.add_argument('--reverse_weight', default=-1.0, type=float, - required=False, - help='reverse weight for bitransformer,' + - 'default value is in config file') - parser.add_argument('--ctc_weight', default=-1.0, type=float, - required=False, - help='ctc weight, default value is in config file') - parser.add_argument('--batch_size', type=int, default=24, help='encoder batch size') - parser.add_argument('--seq_len', default=512, type=int, required=False, - help="Encoder seq_len") - parser.add_argument('--beam_size', default=10, type=int, required=False, - help="beam size would be ctc output size") - parser.add_argument('--output_onnx_dir', - default="onnx_model", - help='output onnx encoder and decoder directory') - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - # arguments for streaming encoder - parser.add_argument('--streaming', - action='store_true', - help="whether to export streaming encoder, default false") - parser.add_argument('--decoding_chunk_size', - default=16, - type=int, - required=False, - help='the decoding chunk size, <=0 is not supported') - parser.add_argument('--num_decoding_left_chunks', - default=5, - type=int, - required=False, - help="number of left chunks, <= 0 is not supported") - parser.add_argument('--decoder_fastertransformer', - action='store_true', - help='return decoder_out and best_index for ft') - args = parser.parse_args() - - torch.manual_seed(0) - torch.set_printoptions(precision=10) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if 
args.cmvn_file and os.path.exists(args.cmvn_file): - configs['cmvn_file'] = args.cmvn_file - if args.reverse_weight != -1.0 and 'reverse_weight' in configs['model_conf']: - configs['model_conf']['reverse_weight'] = args.reverse_weight - print("Update reverse weight to", args.reverse_weight) - if args.ctc_weight != -1: - print("Update ctc weight to ", args.ctc_weight) - configs['model_conf']['ctc_weight'] = args.ctc_weight - configs["encoder_conf"]["use_dynamic_chunk"] = False - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - - if not os.path.exists(args.output_onnx_dir): - os.mkdir(args.output_onnx_dir) - encoder_onnx_path = os.path.join(args.output_onnx_dir, 'encoder.onnx') - export_enc_func = None - if args.streaming: - assert args.decoding_chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - export_enc_func = export_online_encoder - else: - export_enc_func = export_offline_encoder_static - - onnx_config = export_enc_func( - model, configs, args, logger, encoder_onnx_path) - - decoder_onnx_path = os.path.join(args.output_onnx_dir, 'decoder.onnx') - export_rescoring_decoder(model, configs, args, logger, - decoder_onnx_path, args.decoder_fastertransformer) - - if args.fp16: - try: - import onnxmltools - from onnxmltools.utils.float16_converter import convert_float_to_float16 - except ImportError: - print('Please install onnxmltools!') - sys.exit(1) - encoder_onnx_model = onnxmltools.utils.load_model(encoder_onnx_path) - encoder_onnx_model = convert_float_to_float16(encoder_onnx_model) - encoder_onnx_path = os.path.join( - args.output_onnx_dir, 'encoder_fp16.onnx') - onnxmltools.utils.save_model(encoder_onnx_model, encoder_onnx_path) - decoder_onnx_model = onnxmltools.utils.load_model(decoder_onnx_path) - decoder_onnx_model = convert_float_to_float16(decoder_onnx_model) - decoder_onnx_path = os.path.join( - args.output_onnx_dir, 'decoder_fp16.onnx') - onnxmltools.utils.save_model(decoder_onnx_model, decoder_onnx_path) - # dump configurations - - config_dir = os.path.join(args.output_onnx_dir, "config.yaml") - with open(config_dir, "w") as out: - yaml.dump(onnx_config, out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/recognize.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/recognize.py deleted file mode 100644 index 03b5dfd42cc098efacd20e08756a5300f6477cc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/recognize.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--beam_size', - type=int, - default=10, - help='beam size for search') - parser.add_argument('--penalty', - type=float, - default=0.0, - help='length penalty') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=16, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'attention', 'ctc_greedy_search', - 'ctc_prefix_beam_search', 'attention_rescoring', - 'rnnt_greedy_search', 'rnnt_beam_search', - 'rnnt_beam_attn_rescoring', 'ctc_beam_td_attn_rescoring', - 'hlg_onebest', 'hlg_rescore' - ], - default='attention', - help='decoding mode') - - parser.add_argument('--search_ctc_weight', - type=float, - default=1.0, - help='ctc weight for nbest generation') - parser.add_argument('--search_transducer_weight', - type=float, - default=0.0, - help='transducer weight for nbest generation') - parser.add_argument('--ctc_weight', - type=float, - default=0.0, - help='ctc weight for rescoring weight in \ - attention rescoring decode mode \ - ctc weight for rescoring weight in \ - transducer attention rescore decode mode') - - parser.add_argument('--transducer_weight', - type=float, - default=0.0, - help='transducer weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--attn_weight', - type=float, - default=0.0, - help='attention weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--decoding_chunk_size', - type=int, - default=-1, - help='''decoding chunk size, - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here''') - parser.add_argument('--num_decoding_left_chunks', - type=int, - default=-1, - help='number of left chunks for decoding') - parser.add_argument('--simulate_streaming', - action='store_true', - help='simulate streaming inference') - parser.add_argument('--reverse_weight', - type=float, - default=0.0, - help='''right to left weight for attention rescoring - decode mode''') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--connect_symbol', - default='', - type=str, - help='used to connect the output characters') - - parser.add_argument('--word', - default='', - type=str, - help='word file, only used for hlg decode') - parser.add_argument('--hlg', - default='', - type=str, - help='hlg file, only used for hlg decode') - parser.add_argument('--lm_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--r_decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' - ] and args.batch_size > 1: - logging.fatal( - 'decoding mode {} must be running with batch_size == 1'.format( - args.mode)) - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_sub'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - if 'fbank_conf' in test_conf: - test_conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in test_conf: - test_conf['mfcc_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - # Load dict - char_dict = {v: k for k, v in symbol_table.items()} - eos = len(char_dict) - 1 - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w') as fout: - for batch_idx, 
batch in enumerate(test_data_loader): - keys, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - if args.mode == 'attention': - hyps, _ = model.recognize( - feats, - feats_lengths, - beam_size=args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp.tolist() for hyp in hyps] - elif args.mode == 'ctc_greedy_search': - hyps, _ = model.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_greedy_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_beam_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.beam_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.search_ctc_weight, - transducer_weight=args.search_transducer_weight) - elif args.mode == 'rnnt_beam_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight) - elif args.mode == 'ctc_beam_td_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight, - beam_search_type='ctc') - # ctc_prefix_beam_search and attention_rescoring only return one - # result in List[int], change it to List[List[int]] for compatible - # with other batch decoding mode - elif args.mode == 'ctc_prefix_beam_search': - assert (feats.size(0) == 1) - hyp, _ = model.ctc_prefix_beam_search( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp] - elif args.mode == 'attention_rescoring': - assert (feats.size(0) == 1) - hyp, _ = model.attention_rescoring( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - 
ctc_weight=args.ctc_weight, - simulate_streaming=args.simulate_streaming, - reverse_weight=args.reverse_weight) - hyps = [hyp] - elif args.mode == 'hlg_onebest': - hyps = model.hlg_onebest( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - elif args.mode == 'hlg_rescore': - hyps = model.hlg_rescore( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - lm_scale=args.lm_scale, - decoder_scale=args.decoder_scale, - r_decoder_scale=args.r_decoder_scale, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - for i, key in enumerate(keys): - content = [] - for w in hyps[i]: - if w == eos: - break - content.append(char_dict[w]) - logging.info('{} {}'.format(key, args.connect_symbol.join(content))) - fout.write('{} {}\n'.format(key, args.connect_symbol.join(content))) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/recognize_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/recognize_onnx_gpu.py deleted file mode 100644 index 42f403bf55ac0bc51d9c754574d3479345948122..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/recognize_onnx_gpu.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is for testing exported onnx encoder and decoder from -export_onnx_gpu.py. The exported onnx models only support batch offline ASR inference. -It requires a python wrapped c++ ctc decoder. 
-Please install it by following: -https://github.com/Slyne/ctc_decoder.git -""" -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.common import IGNORE_ID -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.config import override_config - -import onnxruntime as rt -import multiprocessing -import numpy as np - -try: - from swig_decoders import map_batch, \ - ctc_beam_search_decoder_batch, \ - TrieVector, PathTrie -except ImportError: - print('Please install ctc decoders first by refering to\n' + - 'https://github.com/Slyne/ctc_decoder.git') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--encoder_onnx', required=True, help='encoder onnx file') - parser.add_argument('--decoder_onnx', required=True, help='decoder onnx file') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=32, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'ctc_greedy_search', 'ctc_prefix_beam_search', - 'attention_rescoring'], - default='attention_rescoring', - help='decoding mode') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - reverse_weight = configs["model_conf"].get("reverse_weight", 0.0) - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - test_conf['fbank_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - 
- # Init asr model from configs - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - if use_cuda: - EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - else: - EP_list = ['CPUExecutionProvider'] - - encoder_ort_session = rt.InferenceSession(args.encoder_onnx, providers=EP_list) - decoder_ort_session = None - if args.mode == "attention_rescoring": - decoder_ort_session = rt.InferenceSession(args.decoder_onnx, providers=EP_list) - - # Load dict - vocabulary = [] - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - vocabulary.append(arr[0]) - eos = sos = len(char_dict) - 1 - with torch.no_grad(), open(args.result_file, 'w') as fout: - for _, batch in enumerate(test_data_loader): - keys, feats, _, feats_lengths, _ = batch - feats, feats_lengths = feats.numpy(), feats_lengths.numpy() - if args.fp16: - feats = feats.astype(np.float16) - ort_inputs = { - encoder_ort_session.get_inputs()[0].name: feats, - encoder_ort_session.get_inputs()[1].name: feats_lengths} - ort_outs = encoder_ort_session.run(None, ort_inputs) - encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx = ort_outs - beam_size = beam_log_probs.shape[-1] - batch_size = beam_log_probs.shape[0] - num_processes = min(multiprocessing.cpu_count(), batch_size) - if args.mode == 'ctc_greedy_search': - if beam_size != 1: - log_probs_idx = beam_log_probs_idx[:, :, 0] - batch_sents = [] - for idx, seq in enumerate(log_probs_idx): - batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) - hyps = map_batch(batch_sents, vocabulary, num_processes, - True, 0) - elif args.mode in ('ctc_prefix_beam_search', "attention_rescoring"): - batch_log_probs_seq_list = beam_log_probs.tolist() - batch_log_probs_idx_list = beam_log_probs_idx.tolist() - batch_len_list = encoder_out_lens.tolist() - batch_log_probs_seq = [] - batch_log_probs_ids = [] - batch_start = [] # only effective in streaming deployment - batch_root = TrieVector() - root_dict = {} - for i in range(len(batch_len_list)): - num_sent = batch_len_list[i] - batch_log_probs_seq.append( - batch_log_probs_seq_list[i][0:num_sent]) - batch_log_probs_ids.append( - batch_log_probs_idx_list[i][0:num_sent]) - root_dict[i] = PathTrie() - batch_root.append(root_dict[i]) - batch_start.append(True) - score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq, - batch_log_probs_ids, - batch_root, - batch_start, - beam_size, - num_processes, - 0, -2, 0.99999) - if args.mode == 'ctc_prefix_beam_search': - hyps = [] - for cand_hyps in score_hyps: - hyps.append(cand_hyps[0][1]) - hyps = map_batch(hyps, vocabulary, num_processes, False, 0) - if args.mode == 'attention_rescoring': - ctc_score, all_hyps = [], [] - max_len = 0 - for hyps in score_hyps: - cur_len = len(hyps) - if len(hyps) < beam_size: - hyps += (beam_size - cur_len) * [(-float("INF"), (0,))] - cur_ctc_score = [] - for hyp in hyps: - cur_ctc_score.append(hyp[0]) - all_hyps.append(list(hyp[1])) - if len(hyp[1]) > max_len: - max_len = len(hyp[1]) - ctc_score.append(cur_ctc_score) - if args.fp16: - ctc_score = np.array(ctc_score, dtype=np.float16) - else: - ctc_score = np.array(ctc_score, dtype=np.float32) - hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - r_hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32) - k = 0 - for i in 
range(batch_size): - for j in range(beam_size): - cand = all_hyps[k] - l = len(cand) + 2 - hyps_pad_sos_eos[i][j][0:l] = [sos] + cand + [eos] - r_hyps_pad_sos_eos[i][j][0:l] = [sos] + cand[::-1] + [eos] - hyps_lens_sos[i][j] = len(cand) + 1 - k += 1 - decoder_ort_inputs = { - decoder_ort_session.get_inputs()[0].name: encoder_out, - decoder_ort_session.get_inputs()[1].name: encoder_out_lens, - decoder_ort_session.get_inputs()[2].name: hyps_pad_sos_eos, - decoder_ort_session.get_inputs()[3].name: hyps_lens_sos, - decoder_ort_session.get_inputs()[-1].name: ctc_score} - if reverse_weight > 0: - r_hyps_pad_sos_eos_name = decoder_ort_session.get_inputs()[4].name - decoder_ort_inputs[r_hyps_pad_sos_eos_name] = r_hyps_pad_sos_eos - best_index = decoder_ort_session.run(None, decoder_ort_inputs)[0] - best_sents = [] - k = 0 - for idx in best_index: - cur_best_sent = all_hyps[k: k + beam_size][idx] - best_sents.append(cur_best_sent) - k += beam_size - hyps = map_batch(best_sents, vocabulary, num_processes) - - for i, key in enumerate(keys): - content = hyps[i] - logging.info('{} {}'.format(key, content)) - fout.write('{} {}\n'.format(key, content)) - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/train.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/train.py deleted file mode 100644 index 70799b60790b31d73911770891f519f5473e2f4b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/bin/train.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os - -import torch -import torch.distributed as dist -import torch.optim as optim -import yaml -from tensorboardX import SummaryWriter -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import (load_checkpoint, save_checkpoint, - load_trained_modules) -from wenet.utils.executor import Executor -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.scheduler import WarmupLR, NoamHoldAnnealing -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--train_data', required=True, help='train data file') - parser.add_argument('--cv_data', required=True, help='cv data file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--model_dir', required=True, help='save model dir') - parser.add_argument('--checkpoint', help='checkpoint model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='tensorboard log dir') - parser.add_argument('--ddp.rank', - dest='rank', - default=0, - type=int, - help='global rank for distributed training') - parser.add_argument('--ddp.world_size', - dest='world_size', - default=-1, - type=int, - help='''number of total processes/gpus for - distributed training''') - parser.add_argument('--ddp.dist_backend', - dest='dist_backend', - default='nccl', - choices=['nccl', 'gloo'], - help='distributed backend') - parser.add_argument('--ddp.init_method', - dest='init_method', - default=None, - help='ddp init method') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for reading') - parser.add_argument('--pin_memory', - action='store_true', - default=False, - help='Use pinned memory buffers used for reading') - parser.add_argument('--use_amp', - action='store_true', - default=False, - help='Use automatic mixed precision training') - parser.add_argument('--fp16_grad_sync', - action='store_true', - default=False, - help='Use fp16 gradient sync for ddp') - parser.add_argument('--cmvn', default=None, help='global cmvn file') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. 
One symbol per line.") - parser.add_argument('--prefetch', - default=100, - type=int, - help='prefetch number') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument("--enc_init", - default=None, - type=str, - help="Pre-trained model to initialize encoder") - parser.add_argument("--enc_init_mods", - default="encoder.", - type=lambda s: [str(mod) for mod in s.split(",") if s != ""], - help="List of encoder modules \ - to initialize ,separated by a comma") - - - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Set random seed - torch.manual_seed(777) - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - distributed = args.world_size > 1 - if distributed: - logging.info('training on multiple gpus, this gpu {}'.format(args.gpu)) - dist.init_process_group(args.dist_backend, - init_method=args.init_method, - world_size=args.world_size, - rank=args.rank) - - symbol_table = read_symbol_table(args.symbol_table) - - train_conf = configs['dataset_conf'] - cv_conf = copy.deepcopy(train_conf) - cv_conf['speed_perturb'] = False - cv_conf['spec_aug'] = False - cv_conf['spec_sub'] = False - cv_conf['spec_trim'] = False - cv_conf['shuffle'] = False - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - train_dataset = Dataset(args.data_type, args.train_data, symbol_table, - train_conf, args.bpe_model, non_lang_syms, True) - cv_dataset = Dataset(args.data_type, - args.cv_data, - symbol_table, - cv_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - train_data_loader = DataLoader(train_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - cv_data_loader = DataLoader(cv_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - - if 'fbank_conf' in configs['dataset_conf']: - input_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - else: - input_dim = configs['dataset_conf']['mfcc_conf']['num_mel_bins'] - vocab_size = len(symbol_table) - - # Save configs to model_dir/train.yaml for inference and export - configs['input_dim'] = input_dim - configs['output_dim'] = vocab_size - configs['cmvn_file'] = args.cmvn - configs['is_json_cmvn'] = True - if args.rank == 0: - saved_config_path = os.path.join(args.model_dir, 'train.yaml') - with open(saved_config_path, 'w') as fout: - data = yaml.dump(configs) - fout.write(data) - - # Init asr model from configs - model = init_model(configs) - print(model) - num_params = sum(p.numel() for p in model.parameters()) - print('the number of model params: {:,d}'.format(num_params)) - - # !!!IMPORTANT!!! 
- # Try to export the model by script, if fails, we should refine - # the code to satisfy the script export requirements - if args.rank == 0: - script_model = torch.jit.script(model) - script_model.save(os.path.join(args.model_dir, 'init.zip')) - executor = Executor() - # If specify checkpoint, load some info from checkpoint - if args.checkpoint is not None: - infos = load_checkpoint(model, args.checkpoint) - elif args.enc_init is not None: - logging.info('load pretrained encoders: {}'.format(args.enc_init)) - infos = load_trained_modules(model, args) - else: - infos = {} - start_epoch = infos.get('epoch', -1) + 1 - cv_loss = infos.get('cv_loss', 0.0) - step = infos.get('step', -1) - - num_epochs = configs.get('max_epoch', 100) - model_dir = args.model_dir - writer = None - if args.rank == 0: - os.makedirs(model_dir, exist_ok=True) - exp_id = os.path.basename(model_dir) - writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id)) - - if distributed: - assert (torch.cuda.is_available()) - # cuda model is required for nn.parallel.DistributedDataParallel - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, find_unused_parameters=True) - device = torch.device("cuda") - if args.fp16_grad_sync: - from torch.distributed.algorithms.ddp_comm_hooks import ( - default as comm_hooks, - ) - model.register_comm_hook( - state=None, hook=comm_hooks.fp16_compress_hook - ) - else: - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - if configs['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['optim_conf']) - elif configs['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['optim']) - if configs['scheduler'] == 'warmuplr': - scheduler = WarmupLR(optimizer, **configs['scheduler_conf']) - elif configs['scheduler'] == 'NoamHoldAnnealing': - scheduler = NoamHoldAnnealing(optimizer, **configs['scheduler_conf']) - else: - raise ValueError("unknown scheduler: " + configs['scheduler']) - - final_epoch = None - configs['rank'] = args.rank - configs['is_distributed'] = distributed - configs['use_amp'] = args.use_amp - if start_epoch == 0 and args.rank == 0: - save_model_path = os.path.join(model_dir, 'init.pt') - save_checkpoint(model, save_model_path) - - # Start training loop - executor.step = step - scheduler.set_step(step) - # used for pytorch amp mixed precision training - scaler = None - if args.use_amp: - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(start_epoch, num_epochs): - train_dataset.set_epoch(epoch) - configs['epoch'] = epoch - lr = optimizer.param_groups[0]['lr'] - logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr)) - executor.train(model, optimizer, scheduler, train_data_loader, device, - writer, configs, scaler) - total_loss, num_seen_utts = executor.cv(model, cv_data_loader, device, - configs) - cv_loss = total_loss / num_seen_utts - - logging.info('Epoch {} CV info cv_loss {}'.format(epoch, cv_loss)) - if args.rank == 0: - save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch)) - save_checkpoint( - model, save_model_path, { - 'epoch': epoch, - 'lr': lr, - 'cv_loss': cv_loss, - 'step': executor.step - }) - writer.add_scalar('epoch/cv_loss', cv_loss, epoch) - writer.add_scalar('epoch/lr', lr, epoch) - final_epoch = epoch - - if final_epoch is not None and args.rank == 0: - final_model_path = os.path.join(model_dir, 'final.pt') 
- os.remove(final_model_path) if os.path.exists(final_model_path) else None - os.symlink('{}.pt'.format(final_epoch), final_model_path) - writer.close() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/dataset/dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/dataset/dataset.py deleted file mode 100644 index 6d799b5b5aea2d34546484b3fed5d45e2d5b6aa6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/dataset/dataset.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import torch -import torch.distributed as dist -from torch.utils.data import IterableDataset - -import wenet.dataset.processor as processor -from wenet.utils.file_utils import read_lists - - -class Processor(IterableDataset): - def __init__(self, source, f, *args, **kw): - assert callable(f) - self.source = source - self.f = f - self.args = args - self.kw = kw - - def set_epoch(self, epoch): - self.source.set_epoch(epoch) - - def __iter__(self): - """ Return an iterator over the source dataset processed by the - given processor. 
- """ - assert self.source is not None - assert callable(self.f) - return self.f(iter(self.source), *self.args, **self.kw) - - def apply(self, f): - assert callable(f) - return Processor(self, f, *self.args, **self.kw) - - -class DistributedSampler: - def __init__(self, shuffle=True, partition=True): - self.epoch = -1 - self.update() - self.shuffle = shuffle - self.partition = partition - - def update(self): - assert dist.is_available() - if dist.is_initialized(): - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() - else: - self.rank = 0 - self.world_size = 1 - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: - self.worker_id = 0 - self.num_workers = 1 - else: - self.worker_id = worker_info.id - self.num_workers = worker_info.num_workers - return dict(rank=self.rank, - world_size=self.world_size, - worker_id=self.worker_id, - num_workers=self.num_workers) - - def set_epoch(self, epoch): - self.epoch = epoch - - def sample(self, data): - """ Sample data according to rank/world_size/num_workers - - Args: - data(List): input data list - - Returns: - List: data list after sample - """ - data = list(range(len(data))) - # TODO(Binbin Zhang): fix this - # We can not handle uneven data for CV on DDP, so we don't - # sample data by rank, that means every GPU gets the same - # and all the CV data - if self.partition: - if self.shuffle: - random.Random(self.epoch).shuffle(data) - data = data[self.rank::self.world_size] - data = data[self.worker_id::self.num_workers] - return data - - -class DataList(IterableDataset): - def __init__(self, lists, shuffle=True, partition=True): - self.lists = lists - self.sampler = DistributedSampler(shuffle, partition) - - def set_epoch(self, epoch): - self.sampler.set_epoch(epoch) - - def __iter__(self): - sampler_info = self.sampler.update() - indexes = self.sampler.sample(self.lists) - for index in indexes: - # yield dict(src=src) - data = dict(src=self.lists[index]) - data.update(sampler_info) - yield data - - -def Dataset(data_type, - data_list_file, - symbol_table, - conf, - bpe_model=None, - non_lang_syms=None, - partition=True): - """ Construct dataset from arguments - - We have two shuffle stage in the Dataset. The first is global - shuffle at shards tar/raw file level. The second is global shuffle - at training samples level. 
- - Args: - data_type(str): raw/shard - bpe_model(str): model for english bpe part - partition(bool): whether to do data partition in terms of rank - """ - assert data_type in ['raw', 'shard'] - lists = read_lists(data_list_file) - shuffle = conf.get('shuffle', True) - dataset = DataList(lists, shuffle=shuffle, partition=partition) - if data_type == 'shard': - dataset = Processor(dataset, processor.url_opener) - dataset = Processor(dataset, processor.tar_file_and_group) - else: - dataset = Processor(dataset, processor.parse_raw) - - dataset = Processor(dataset, processor.tokenize, symbol_table, bpe_model, - non_lang_syms, conf.get('split_with_space', False)) - filter_conf = conf.get('filter_conf', {}) - dataset = Processor(dataset, processor.filter, **filter_conf) - - resample_conf = conf.get('resample_conf', {}) - dataset = Processor(dataset, processor.resample, **resample_conf) - - speed_perturb = conf.get('speed_perturb', False) - if speed_perturb: - dataset = Processor(dataset, processor.speed_perturb) - - feats_type = conf.get('feats_type', 'fbank') - assert feats_type in ['fbank', 'mfcc'] - if feats_type == 'fbank': - fbank_conf = conf.get('fbank_conf', {}) - dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) - elif feats_type == 'mfcc': - mfcc_conf = conf.get('mfcc_conf', {}) - dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf) - - spec_aug = conf.get('spec_aug', True) - spec_sub = conf.get('spec_sub', False) - spec_trim = conf.get('spec_trim', False) - if spec_aug: - spec_aug_conf = conf.get('spec_aug_conf', {}) - dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) - if spec_sub: - spec_sub_conf = conf.get('spec_sub_conf', {}) - dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf) - if spec_trim: - spec_trim_conf = conf.get('spec_trim_conf', {}) - dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf) - - if shuffle: - shuffle_conf = conf.get('shuffle_conf', {}) - dataset = Processor(dataset, processor.shuffle, **shuffle_conf) - - sort = conf.get('sort', True) - if sort: - sort_conf = conf.get('sort_conf', {}) - dataset = Processor(dataset, processor.sort, **sort_conf) - - batch_conf = conf.get('batch_conf', {}) - dataset = Processor(dataset, processor.batch, **batch_conf) - dataset = Processor(dataset, processor.padding) - return dataset diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/dataset/kaldi_io.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/dataset/kaldi_io.py deleted file mode 100644 index c9bef293c93d882147bb5b738e1fc49a7a19a484..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/dataset/kaldi_io.py +++ /dev/null @@ -1,666 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! 
To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. - """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. 
- """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int_scp(file_or_fd): - """ generator(key,vec) = read_vec_int_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_int_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:vec for key,mat in kaldi_io.read_vec_int_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_int(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! 
- if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Mapping for percentiles in col-headers, - def uint16_to_float(value, min, range): - return np.float32(min + range * 1.52590218966964e-05 * value) - - # Mapping for matrix elements, - def uint8_to_float_v2(vec, p0, p25, p75, p100): - # Split the vector by masks, - mask_0_64 = (vec <= 64); - mask_193_255 = (vec > 192); - mask_65_192 = (~(mask_0_64 | mask_193_255)); - # Sanity check (useful but slow...), - # assert(len(vec) == np.sum(np.hstack([mask_0_64,mask_65_192,mask_193_255]))) - # assert(len(vec) == np.sum(np.any([mask_0_64,mask_65_192,mask_193_255], axis=0))) - # Build the float vector, - ans = np.empty(len(vec), dtype='float32') - ans[mask_0_64] = p0 + (p25 - p0) / 64. * vec[mask_0_64] - ans[mask_65_192] = p25 + (p75 - p25) / 128. * (vec[mask_65_192] - 64) - ans[mask_193_255] = p75 + (p100 - p75) / 63. * (vec[mask_193_255] - 192) - return ans - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.empty((cols,rows), dtype='float32') - for i, col_header in enumerate(col_headers): - col_header_flt = [ uint16_to_float(percentile, globmin, globrange) for percentile in col_header ] - mat[i] = uint8_to_float_v2(data[i], *col_header_flt) - - return mat.T # transpose! col-major -> row-major, - -def write_ark_scp(key, mat, ark_fout, scp_out): - mat_offset = write_mat(ark_fout, mat, key) - scp_line = '{}\t{}:{}'.format(key, ark_fout.name, mat_offset) - scp_out.write(scp_line) - scp_out.write('\n') - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. 
- - Example of writing single matrix: - kaldi_io.write_mat(filename, mat) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,mat in dict.iteritems(): - kaldi_io.write_mat(f, mat, key=key) - """ - mat_offset = 0 - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - mat_offset = fd.tell() - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if m.dtype == 'float32': fd.write('FM '.encode()) - elif m.dtype == 'float64': fd.write('DM '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) - # Dims, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols - # Data, - fd.write(m.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - return mat_offset - -################################################# -# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) -# Corresponds to: vector > > -# - outer vector: time axis -# - inner vector: records at the time -# - tuple: int = index, float = value -# - -def read_cnet_ark(file_or_fd): - """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ - return read_post_ark(file_or_fd) - -def read_post_ark(file_or_fd): - """ generator(key,vec>) = read_post_ark(file) - Returns generator of (key,posterior) tuples, read from ark file. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Iterate the ark: - for key,post in kaldi_io.read_post_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - post = read_post(fd) - yield key, post - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_post(file_or_fd): - """ [post] = read_post(file_or_fd) - Reads single kaldi 'Posterior' in binary format. - - The 'Posterior' is C++ type 'vector > >', - the outer-vector is usually time axis, inner-vector are the records - at given time, and the tuple is composed of an 'index' (integer) - and a 'float-value'. The 'float-value' can represent a probability - or any other numeric value. - - Returns vector of vectors of tuples. - """ - fd = open_or_fd(file_or_fd) - ans=[] - binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - # Loop over 'outer-vector', - for i in range(outer_vec_size): - assert(fd.read(1).decode() == '\4'); # int-size - inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) - data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) - assert(data[0]['size_idx'] == 4) - assert(data[0]['size_post'] == 4) - ans.append(data[['idx','post']].tolist()) - - if fd is not file_or_fd: fd.close() - return ans - - -################################################# -# Kaldi Confusion Network bin begin/end times, -# (kaldi stores CNs time info separately from the Posterior). 
-# - -def read_cntime_ark(file_or_fd): - """ generator(key,vec>) = read_cntime_ark(file_or_fd) - Returns generator of (key,cntime) tuples, read from ark file. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Iterate the ark: - for key,time in kaldi_io.read_cntime_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:time for key,time in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - cntime = read_cntime(fd) - yield key, cntime - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_cntime(file_or_fd): - """ [cntime] = read_cntime(file_or_fd) - Reads single kaldi 'Confusion Network time info', in binary format: - C++ type: vector >. - (begin/end times of bins at the confusion network). - - Binary layout is ' ...' - - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Returns vector of tuples. - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary - - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) - assert(data[0]['size_beg'] == 4) - assert(data[0]['size_end'] == 4) - ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), - - if fd is not file_or_fd : fd.close() - return ans - - -################################################# -# Segments related, -# - -# Segments as 'Bool vectors' can be handy, -# - for 'superposing' the segmentations, -# - for frame-selection in Speaker-ID experiments, -def read_segments_as_bool_vec(segments_file): - """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) - using kaldi 'segments' file for 1 wav, format : ' ' - - t-beg, t-end is in seconds, - - assumed 100 frames/second, - """ - segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) - # Sanity checks, - assert(len(segs) > 0) # empty segmentation is an error, - assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, - # Convert time to frame-indexes, - start = np.rint([100 * rec[2] for rec in segs]).astype(int) - end = np.rint([100 * rec[3] for rec in segs]).astype(int) - # Taken from 'read_lab_to_bool_vec', htk.py, - frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], - np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) - assert np.sum(end-start) == np.sum(frms) - return frms - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/dataset/processor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/dataset/processor.py deleted file mode 100644 index b4bd07ce674eb3288cd1b13a09085eec48d40845..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/dataset/processor.py +++ /dev/null @@ -1,660 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import json -import random -import re -import tarfile -from subprocess import PIPE, Popen -from urllib.parse import urlparse - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.nn.utils.rnn import pad_sequence - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def url_opener(data): - """ Give url or local file, return file descriptor - Inplace operation. - - Args: - data(Iterable[str]): url or local file list - - Returns: - Iterable[{src, stream}] - """ - for sample in data: - assert 'src' in sample - # TODO(Binbin Zhang): support HTTP - url = sample['src'] - try: - pr = urlparse(url) - # local file - if pr.scheme == '' or pr.scheme == 'file': - stream = open(url, 'rb') - # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP - else: - cmd = f'wget -q -O - {url}' - process = Popen(cmd, shell=True, stdout=PIPE) - sample.update(process=process) - stream = process.stdout - sample.update(stream=stream) - yield sample - except Exception as ex: - logging.warning('Failed to open {}'.format(url)) - - -def tar_file_and_group(data): - """ Expand a stream of open tar files into a stream of tar file contents. - And groups the file with same prefix - - Args: - data: Iterable[{src, stream}] - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'stream' in sample - stream = tarfile.open(fileobj=sample['stream'], mode="r|*") - prev_prefix = None - example = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - example['key'] = prev_prefix - if valid: - yield example - example = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - example['txt'] = file_obj.read().decode('utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load(file_obj) - example['wav'] = waveform - example['sample_rate'] = sample_rate - else: - example[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning('error to parse {}'.format(name)) - prev_prefix = prefix - if prev_prefix is not None: - example['key'] = prev_prefix - yield example - stream.close() - if 'process' in sample: - sample['process'].communicate() - sample['stream'].close() - - -def parse_raw(data): - """ Parse key/wav/txt from json line - - Args: - data: Iterable[str], str is a json line has key/wav/txt - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'src' in sample - json_line = sample['src'] - obj = json.loads(json_line) - assert 'key' in obj - assert 'wav' in obj - assert 'txt' in obj - key = obj['key'] - wav_file = obj['wav'] - txt = obj['txt'] - try: - if 'start' in obj: - assert 'end' in obj - sample_rate = torchaudio.backend.sox_io_backend.info( - wav_file).sample_rate - start_frame = int(obj['start'] * sample_rate) - end_frame = int(obj['end'] * sample_rate) - waveform, _ = torchaudio.backend.sox_io_backend.load( - 
filepath=wav_file, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(wav_file) - example = dict(key=key, - txt=txt, - wav=waveform, - sample_rate=sample_rate) - yield example - except Exception as ex: - logging.warning('Failed to read {}'.format(wav_file)) - - -def filter(data, - max_length=10240, - min_length=10, - token_max_length=200, - token_min_length=1, - min_output_input_ratio=0.0005, - max_output_input_ratio=1): - """ Filter sample according to feature and label length - Inplace operation. - - Args:: - data: Iterable[{key, wav, label, sample_rate}] - max_length: drop utterance which is greater than max_length(10ms) - min_length: drop utterance which is less than min_length(10ms) - token_max_length: drop utterance which is greater than - token_max_length, especially when use char unit for - english modeling - token_min_length: drop utterance which is - less than token_max_length - min_output_input_ratio: minimal ration of - token_length / feats_length(10ms) - max_output_input_ratio: maximum ration of - token_length / feats_length(10ms) - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'label' in sample - # sample['wav'] is torch.Tensor, we have 100 frames every second - num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100 - if num_frames < min_length: - continue - if num_frames > max_length: - continue - if len(sample['label']) < token_min_length: - continue - if len(sample['label']) > token_max_length: - continue - if num_frames != 0: - if len(sample['label']) / num_frames < min_output_input_ratio: - continue - if len(sample['label']) / num_frames > max_output_input_ratio: - continue - yield sample - - -def resample(data, resample_rate=16000): - """ Resample data. - Inplace operation. - - Args: - data: Iterable[{key, wav, label, sample_rate}] - resample_rate: target resample rate - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - if sample_rate != resample_rate: - sample['sample_rate'] = resample_rate - sample['wav'] = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - yield sample - - -def speed_perturb(data, speeds=None): - """ Apply speed perturb to the data. - Inplace operation. 
- - Args: - data: Iterable[{key, wav, label, sample_rate}] - speeds(List[float]): optional speed - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - if speeds is None: - speeds = [0.9, 1.0, 1.1] - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - speed = random.choice(speeds) - if speed != 1.0: - wav, _ = torchaudio.sox_effects.apply_effects_tensor( - waveform, sample_rate, - [['speed', str(speed)], ['rate', str(sample_rate)]]) - sample['wav'] = wav - - yield sample - - -def compute_fbank(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0): - """ Extract fbank - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - energy_floor=0.0, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def compute_mfcc(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0, - num_ceps=40, - high_freq=0.0, - low_freq=20.0): - """ Extract mfcc - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.mfcc(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - num_ceps=num_ceps, - high_freq=high_freq, - low_freq=low_freq, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def __tokenize_by_bpe_model(sp, txt): - tokens = [] - # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - pattern = re.compile(r'([\u4e00-\u9fff])') - # Example: - # txt = "你好 ITS'S OKAY 的" - # chars = ["你", "好", " ITS'S OKAY ", "的"] - chars = pattern.split(txt.upper()) - mix_chars = [w for w in chars if len(w.strip()) > 0] - for ch_or_w in mix_chars: - # ch_or_w is a single CJK charater(i.e., "你"), do nothing. - if pattern.fullmatch(ch_or_w) is not None: - tokens.append(ch_or_w) - # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), - # encode ch_or_w using bpe_model. 
- else: - for p in sp.encode_as_pieces(ch_or_w): - tokens.append(p) - - return tokens - - -def tokenize(data, - symbol_table, - bpe_model=None, - non_lang_syms=None, - split_with_space=False): - """ Decode text to chars or BPE - Inplace operation - - Args: - data: Iterable[{key, wav, txt, sample_rate}] - - Returns: - Iterable[{key, wav, txt, tokens, label, sample_rate}] - """ - if non_lang_syms is not None: - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - else: - non_lang_syms = {} - non_lang_syms_pattern = None - - if bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(bpe_model) - else: - sp = None - - for sample in data: - assert 'txt' in sample - txt = sample['txt'].strip() - if non_lang_syms_pattern is not None: - parts = non_lang_syms_pattern.split(txt.upper()) - parts = [w for w in parts if len(w.strip()) > 0] - else: - parts = [txt] - - label = [] - tokens = [] - for part in parts: - if part in non_lang_syms: - tokens.append(part) - else: - if bpe_model is not None: - tokens.extend(__tokenize_by_bpe_model(sp, part)) - else: - if split_with_space: - part = part.split(" ") - for ch in part: - if ch == ' ': - ch = "▁" - tokens.append(ch) - - for ch in tokens: - if ch in symbol_table: - label.append(symbol_table[ch]) - elif '' in symbol_table: - label.append(symbol_table['']) - - sample['tokens'] = tokens - sample['label'] = label - yield sample - - -def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80): - """ Do spec augmentation - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - num_t_mask: number of time mask to apply - num_f_mask: number of freq mask to apply - max_t: max width of time mask - max_f: max width of freq mask - max_w: max width of time warp - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - max_freq = y.size(1) - # time mask - for i in range(num_t_mask): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - y[start:end, :] = 0 - # freq mask - for i in range(num_f_mask): - start = random.randint(0, max_freq - 1) - length = random.randint(1, max_f) - end = min(max_freq, start + length) - y[:, start:end] = 0 - sample['feat'] = y - yield sample - - -def spec_sub(data, max_t=20, num_t_sub=3): - """ Do spec substitute - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of time substitute - num_t_sub: number of time substitute to apply - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - for i in range(num_t_sub): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - # only substitute the earlier time chosen randomly for current time - pos = random.randint(0, start) - y[start:end, :] = x[start - pos:end - pos, :] - sample['feat'] = y - yield sample - - -def spec_trim(data, max_t=20): - """ Trim tailing frames. Inplace operation. 
- ref: TrimTail [https://arxiv.org/abs/2211.00522] - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of length trimming - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - max_frames = x.size(0) - length = random.randint(1, max_t) - if length < max_frames / 2: - y = x.clone().detach()[:max_frames - length] - sample['feat'] = y - yield sample - - -def shuffle(data, shuffle_size=10000): - """ Local shuffle the data - - Args: - data: Iterable[{key, feat, label}] - shuffle_size: buffer size for shuffle - - Returns: - Iterable[{key, feat, label}] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= shuffle_size: - random.shuffle(buf) - for x in buf: - yield x - buf = [] - # The sample left over - random.shuffle(buf) - for x in buf: - yield x - - -def sort(data, sort_size=500): - """ Sort the data by feature length. - Sort is used after shuffle and before batch, so we can group - utts with similar lengths into a batch, and `sort_size` should - be less than `shuffle_size` - - Args: - data: Iterable[{key, feat, label}] - sort_size: buffer size for sort - - Returns: - Iterable[{key, feat, label}] - """ - - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= sort_size: - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - buf = [] - # The sample left over - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - - -def static_batch(data, batch_size=16): - """ Static batch the data by `batch_size` - - Args: - data: Iterable[{key, feat, label}] - batch_size: batch size - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= batch_size: - yield buf - buf = [] - if len(buf) > 0: - yield buf - - -def dynamic_batch(data, max_frames_in_batch=12000): - """ Dynamic batch the data until the total frames in batch - reach `max_frames_in_batch` - - Args: - data: Iterable[{key, feat, label}] - max_frames_in_batch: max_frames in one batch - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - longest_frames = 0 - for sample in data: - assert 'feat' in sample - assert isinstance(sample['feat'], torch.Tensor) - new_sample_frames = sample['feat'].size(0) - longest_frames = max(longest_frames, new_sample_frames) - frames_after_padding = longest_frames * (len(buf) + 1) - if frames_after_padding > max_frames_in_batch: - yield buf - buf = [sample] - longest_frames = new_sample_frames - else: - buf.append(sample) - if len(buf) > 0: - yield buf - - -def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000): - """ Wrapper for static/dynamic batch - """ - if batch_type == 'static': - return static_batch(data, batch_size) - elif batch_type == 'dynamic': - return dynamic_batch(data, max_frames_in_batch) - else: - logging.fatal('Unsupported batch type {}'.format(batch_type)) - - -def padding(data): - """ Padding the data into training data - - Args: - data: Iterable[List[{key, feat, label}]] - - Returns: - Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] - """ - for sample in data: - assert isinstance(sample, list) - feats_length = torch.tensor([x['feat'].size(0) for x in sample], - dtype=torch.int32) - order = torch.argsort(feats_length, descending=True) - feats_lengths = torch.tensor( - [sample[i]['feat'].size(0) for i in order], dtype=torch.int32) - sorted_feats = [sample[i]['feat'] for i in order] - sorted_keys 
= [sample[i]['key'] for i in order] - sorted_labels = [ - torch.tensor(sample[i]['label'], dtype=torch.int64) for i in order - ] - label_lengths = torch.tensor([x.size(0) for x in sorted_labels], - dtype=torch.int32) - - padded_feats = pad_sequence(sorted_feats, - batch_first=True, - padding_value=0) - - pad = (0, 0, 0, 0) - seq_len= padded_feats.shape[1] - if seq_len < 384: - pad = (0, 0, 0, 384-seq_len) - elif seq_len < 512: - pad = (0, 0, 0, 512-seq_len) - elif seq_len < 640: - pad = (0, 0, 0, 640-seq_len) - elif seq_len < 768: - pad = (0, 0, 0, 768-seq_len) - elif seq_len < 896: - pad = (0, 0, 0, 896-seq_len) - elif seq_len < 1024: - pad = (0, 0, 0, 1024-seq_len) - elif seq_len < 1280: - pad = (0, 0, 0, 1280-seq_len) - padded_feats = torch.nn.functional.pad(padded_feats, pad, mode='constant', value=0) - padding_labels = pad_sequence(sorted_labels, - batch_first=True, - padding_value=-1) - - yield (sorted_keys, padded_feats, padding_labels, feats_lengths, - label_lengths) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/dataset/wav_distortion.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/dataset/wav_distortion.py deleted file mode 100644 index 2917d3cc6cfb801935cb0885d0c42cd88f1833b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/dataset/wav_distortion.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import random -import math - -import torchaudio -import torch -torchaudio.set_audio_backend("sox_io") - - -def db2amp(db): - return pow(10, db / 20) - -def amp2db(amp): - return 20 * math.log10(amp) - -def make_poly_distortion(conf): - """Generate a db-domain ploynomial distortion function - - f(x) = a * x^m * (1-x)^n + x - - Args: - conf: a dict {'a': #int, 'm': #int, 'n': #int} - - Returns: - The ploynomial function, which could be applied on - a float amplitude value - """ - a = conf['a'] - m = conf['m'] - n = conf['n'] - - def poly_distortion(x): - abs_x = abs(x) - if abs_x < 0.000001: - x = x - else: - db_norm = amp2db(abs_x) / 100 + 1 - if db_norm < 0: - db_norm = 0 - db_norm = a * pow(db_norm, m) * pow((1 - db_norm), n) + db_norm - if db_norm > 1: - db_norm = 1 - db = (db_norm - 1) * 100 - amp = db2amp(db) - if amp >= 0.9997: - amp = 0.9997 - if x > 0: - x = amp - else: - x = -amp - return x - return poly_distortion - -def make_quad_distortion(): - return make_poly_distortion({'a' : 1, 'm' : 1, 'n' : 1}) - -# the amplitude are set to max for all non-zero point -def make_max_distortion(conf): - """Generate a max distortion function - - Args: - conf: a dict {'max_db': float } - 'max_db': the maxium value. 
- - Returns: - The max function, which could be applied on - a float amplitude value - """ - max_db = conf['max_db'] - if max_db: - max_amp = db2amp(max_db) # < 0.997 - else: - max_amp = 0.997 - - def max_distortion(x): - if x > 0: - x = max_amp - elif x < 0: - x = -max_amp - else: - x = 0.0 - return x - return max_distortion - - - -def make_amp_mask(db_mask=None): - """Get a amplitude domain mask from db domain mask - - Args: - db_mask: Optional. A list of tuple. if None, using default value. - - Returns: - A list of tuple. The amplitude domain mask - """ - if db_mask is None: - db_mask = [(-110, -95), (-90, -80), (-65, -60), (-50, -30), (-15, 0)] - amp_mask = [(db2amp(db[0]), db2amp(db[1])) for db in db_mask] - return amp_mask - -default_mask = make_amp_mask() - - -def generate_amp_mask(mask_num): - """Generate amplitude domain mask randomly in [-100db, 0db] - - Args: - mask_num: the slot number of the mask - - Returns: - A list of tuple. each tuple defines a slot. - e.g. [(-100, -80), (-65, -60), (-50, -30), (-15, 0)] - for #mask_num = 4 - """ - a = [0] * 2 * mask_num - a[0] = 0 - m = [] - for i in range(1, 2 * mask_num): - a[i] = a[i - 1] + random.uniform(0.5, 1) - max_val = a[2 * mask_num - 1] - for i in range(0, mask_num): - l = ((a[2 * i] - max_val) / max_val) * 100 - r = ((a[2 * i + 1] - max_val) / max_val) * 100 - m.append((l, r)) - return make_amp_mask(m) - - -def make_fence_distortion(conf): - """Generate a fence distortion function - - In this fence-like shape function, the values in mask slots are - set to maxium, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': int,'max_db': float } - 'mask_number': the slot number in mask. - 'max_db': the maxium value. - - Returns: - The fence function, which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - max_db = conf['max_db'] - max_amp = db2amp(max_db) # 0.997 - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def fence_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - return x - - return fence_distortion - -# -def make_jag_distortion(conf): - """Generate a jag distortion function - - In this jag-like shape function, the values in mask slots are - not changed, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': #int} - 'mask_number': the slot number in mask. 
- - Returns: - The jag function,which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def jag_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - return x - - return jag_distortion - -# gaining 20db means amp = amp * 10 -# gaining -20db means amp = amp / 10 -def make_gain_db(conf): - """Generate a db domain gain function - - Args: - conf: a dict {'db': #float} - 'db': the gaining value - - Returns: - The db gain function, which could be applied on - a float amplitude value - """ - db = conf['db'] - - def gain_db(x): - return min(0.997, x * pow(10, db / 20)) - - return gain_db - - -def distort(x, func, rate=0.8): - """Distort a waveform in sample point level - - Args: - x: the origin wavefrom - func: the distort function - rate: sample point-level distort probability - - Returns: - the distorted waveform - """ - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - x[0][i] = func(float(x[0][i])) - return x - -def distort_chain(x, funcs, rate=0.8): - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - for func in funcs: - x[0][i] = func(float(x[0][i])) - return x - -# x is numpy -def distort_wav_conf(x, distort_type, distort_conf, rate=0.1): - if distort_type == 'gain_db': - gain_db = make_gain_db(distort_conf) - x = distort(x, gain_db) - elif distort_type == 'max_distortion': - max_distortion = make_max_distortion(distort_conf) - x = distort(x, max_distortion, rate=rate) - elif distort_type == 'fence_distortion': - fence_distortion = make_fence_distortion(distort_conf) - x = distort(x, fence_distortion, rate=rate) - elif distort_type == 'jag_distortion': - jag_distortion = make_jag_distortion(distort_conf) - x = distort(x, jag_distortion, rate=rate) - elif distort_type == 'poly_distortion': - poly_distortion = make_poly_distortion(distort_conf) - x = distort(x, poly_distortion, rate=rate) - elif distort_type == 'quad_distortion': - quad_distortion = make_quad_distortion() - x = distort(x, quad_distortion, rate=rate) - elif distort_type == 'none_distortion': - pass - else: - print('unsupport type') - return x - -def distort_wav_conf_and_save(distort_type, distort_conf, rate, wav_in, wav_out): - x, sr = torchaudio.load(wav_in) - x = x.detach().numpy() - out = distort_wav_conf(x, distort_type, distort_conf, rate) - torchaudio.save(wav_out, torch.from_numpy(out), sr) - -if __name__ == "__main__": - distort_type = sys.argv[1] - wav_in = sys.argv[2] - wav_out = sys.argv[3] - conf = None - rate = 0.1 - if distort_type == 'new_jag_distortion': - conf = {'mask_number' : 4} - elif distort_type == 'new_fence_distortion': - conf = {'mask_number' : 1, 'max_db' : -30} - elif distort_type == 'poly_distortion': - conf = {'a' : 4, 'm' : 2, "n" : 2} - distort_wav_conf_and_save(distort_type, conf, rate, wav_in, wav_out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/efficient_conformer/attention.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/efficient_conformer/attention.py deleted file mode 100644 index 475131b15af92ffcaf91ad5e2e30d114d4d1a2a3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/efficient_conformer/attention.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple, Optional - -import torch -from torch import nn -import torch.nn.functional as F -from wenet.transformer.attention import MultiHeadedAttention - - -class GroupedRelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: - https://arxiv.org/abs/1901.02860 - https://arxiv.org/abs/2109.01163 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate, group_size=3): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - self.group_size = group_size - self.d_k = n_feat // n_head # for GroupedAttention - self.n_feat = n_feat - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. 
- """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def pad4group(self, Q, K, V, P, mask, group_size: int = 3): - """ - q: (#batch, time1, size) -> (#batch, head, time1, size/head) - k,v: (#batch, time2, size) -> (#batch, head, time2, size/head) - p: (#batch, time2, size) - """ - # Compute Overflows - overflow_Q = Q.size(2) % group_size - overflow_KV = K.size(2) % group_size - - # if-else for ONNX export - # 0 // 0.00000000000000001 = 0 - # 1 // 1.00000000000000001 = 1 - padding_Q = (group_size - overflow_Q) * int( - overflow_Q // (overflow_Q + 0.00000000000000001)) - padding_KV = (group_size - overflow_KV) * int( - overflow_KV // (overflow_KV + 0.00000000000000001)) - - batch_size, _, seq_len_KV, _ = K.size() - - # Input Padding (B, T, D) -> (B, T + P, D) - Q = F.pad(Q, (0, 0, 0, padding_Q), value=0.0) - K = F.pad(K, (0, 0, 0, padding_KV), value=0.0) - V = F.pad(V, (0, 0, 0, padding_KV), value=0.0) - - if mask is not None and mask.size(2) > 0 : # time2 > 0: - mask = mask[:, ::group_size, ::group_size] - - Q = Q.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - K = K.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - V = V.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - # process pos_emb - P_batch_size = P.size(0) - overflow_P = P.size(1) % group_size - padding_P = group_size - overflow_P if overflow_P else 0 - P = F.pad(P, (0, 0, 0, padding_P), value=0.0) - P = P.view(P_batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - return Q, K, V, P, mask, padding_Q - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - padding_q: Optional[int] = None - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - padding_q : for GroupedAttention in efficent conformer - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. 
jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - - # n_feat!=h*d_k may be happened in GroupAttention - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.n_feat) - ) # (batch, time1, d_model) - if padding_q is not None: - # for GroupedAttention in efficent conformer - x = x[:, :x.size(1) - padding_q] - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q = self.linear_q(query) - k = self.linear_k(key) # (#batch, time2, size) - v = self.linear_v(value) - p = self.linear_pos(pos_emb) # (#batch, time2, size) - - batch_size, seq_len_KV, _ = k.size() # seq_len_KV = time2 - - # (#batch, time2, size) -> (#batch, head, time2, size/head) - q = q.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - k = k.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - v = v.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - if cache.size(0) > 0: - # use attention cache - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - new_cache = torch.cat((k, v), dim=-1) - - # May be k and p does not match. eg. time2=18+18/2=27 > mask=36/2=18 - if mask is not None and mask.size(2) > 0: - time2 = mask.size(2) - k = k[:, :, -time2:, :] - v = v[:, :, -time2:, :] - - # q k v p: (batch, head, time1, d_k) - q, k, v, p, mask, padding_q = self.pad4group(q, k, v, p, mask, self.group_size) - - # q_with_bias_u & q_with_bias_v = (batch, head, time1, d_k) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. 
- # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k * self.group_size) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask, padding_q), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/efficient_conformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/efficient_conformer/convolution.py deleted file mode 100644 index 52d6c1c14c0812ab3957a60a135f644833c2ad95..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/efficient_conformer/convolution.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - stride: int = 1): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - stride (int): Stride Convolution, for efficient Conformer - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. 
- # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=stride, # for depthwise_conv in StrideConv - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - self.stride = stride - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - # When export ONNX,the first cache is not None but all-zero, - # cause shape error in residual block, - # eg. cache14 + x9 = 23, 23-7+1=17 != 9 - cache = cache[:, :, -self.lorder:] - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is requried, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - if mask_pad.size(2) != x.size(2): - mask_pad = mask_pad[:, :, ::self.stride] - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/efficient_conformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/efficient_conformer/encoder.py deleted file mode 100644 index dbd37f53cac86be851e2bb194354fd07eb271f11..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/efficient_conformer/encoder.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from EfficientConformer(https://github.com/burchim/EfficientConformer) -# Paper(https://arxiv.org/abs/2109.01163) - -"""Encoder definition.""" -from typing import Tuple, Optional, List, Union - -import torch -import logging -from typeguard import check_argument_types -import torch.nn.functional as F - -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.encoder_layer import ConformerEncoderLayer - -from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 -from wenet.efficient_conformer.convolution import ConvolutionModule -from wenet.efficient_conformer.attention import GroupedRelPositionMultiHeadedAttention -from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer - -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class EfficientConformerEncoder(torch.nn.Module): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - macaron_style: bool = True, - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - stride_layer_idx: Optional[Union[int, List[int]]] = 3, - stride: Optional[Union[int, List[int]]] = 2, - group_layer_idx: Optional[Union[int, List[int], tuple]] = (0, 1, 2, 3), - group_size: int = 3, - stride_kernel: bool = True, - **kwargs - ): - """Construct Efficient Conformer Encoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - macaron_style (bool): Whether to use macaron style for - positionwise layer. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. - stride_layer_idx (list): layer id with StrideConv, start from 0 - stride (list): stride size of each StrideConv in efficient conformer - group_layer_idx (list): layer id with GroupedAttention, start from 0 - group_size (int): group size of every GroupedAttention layer - stride_kernel (bool): default True. True: recompute cnn kernels with stride. 
- """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d2": - subsampling_class = Conv2dSubsampling2 - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - logging.info(f"input_layer = {input_layer}, " - f"subsampling_class = {subsampling_class}") - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - self.input_layer = input_layer - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - activation = get_activation(activation_type) - self.num_blocks = num_blocks - self.attention_heads = attention_heads - self.cnn_module_kernel = cnn_module_kernel - self.global_chunk_size = 0 - self.chunk_feature_map = 0 - - # efficient conformer configs - self.stride_layer_idx = [stride_layer_idx] \ - if type(stride_layer_idx) == int else stride_layer_idx - self.stride = [stride] \ - if type(stride) == int else stride - self.group_layer_idx = [group_layer_idx] \ - if type(group_layer_idx) == int else group_layer_idx - self.grouped_size = group_size # group size of every GroupedAttention layer - - assert len(self.stride) == len(self.stride_layer_idx) - self.cnn_module_kernels = [cnn_module_kernel] # kernel size of each StridedConv - for i in self.stride: - if stride_kernel: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1] // i) - else: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1]) - - logging.info(f"stride_layer_idx= {self.stride_layer_idx}, " - f"stride = {self.stride}, " - f"cnn_module_kernel = {self.cnn_module_kernels}, " - f"group_layer_idx = {self.group_layer_idx}, " - f"grouped_size = {self.grouped_size}") - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - - # encoder definition - index = 0 - layers = [] - for i in range(num_blocks): - # self-attention module definition - if i in self.group_layer_idx: - encoder_selfattn_layer = GroupedRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - self.grouped_size) - else: - if pos_enc_layer_type == "no_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate) - - # conformer module definition - if i in self.stride_layer_idx: - # conformer block with downsampling - convolution_layer_args_stride = ( - output_size, 
self.cnn_module_kernels[index], activation, - cnn_module_norm, causal, True, self.stride[index]) - layers.append(StrideConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_stride) if use_cnn_module else None, - torch.nn.AvgPool1d( - kernel_size=self.stride[index], stride=self.stride[index], - padding=0, ceil_mode=True, - count_include_pad=False), # pointwise_conv_layer - dropout_rate, - normalize_before, - concat_after, - )) - index = index + 1 - else: - # conformer block - convolution_layer_args_normal = ( - output_size, self.cnn_module_kernels[index], activation, - cnn_module_norm, causal) - layers.append(ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_normal) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - )) - - self.encoders = torch.nn.ModuleList(layers) - - def set_global_chunk_size(self, chunk_size): - """Used in ONNX export. - """ - logging.info(f"set global chunk size: {chunk_size}, default is 0.") - self.global_chunk_size = chunk_size - if self.embed.subsampling_rate == 2: - self.chunk_feature_map = 2 * self.global_chunk_size + 1 - elif self.embed.subsampling_rate == 6: - self.chunk_feature_map = 6 * self.global_chunk_size + 5 - elif self.embed.subsampling_rate == 8: - self.chunk_feature_map = 8 * self.global_chunk_size + 7 - else: - self.chunk_feature_map = 4 * self.global_chunk_size + 3 - - def output_size(self) -> int: - return self._output_size - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - index = 0 # traverse stride - for i, layer in enumerate(self.encoders): - # layer return : x, mask, new_att_cache, new_cnn_cache - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if i in self.stride_layer_idx: - masks = masks[:, :, ::self.stride[index]] - chunk_masks = chunk_masks[:, ::self.stride[index], - ::self.stride[index]] - mask_pad = masks - pos_emb = pos_emb[:, ::self.stride[index], :] - index = index + 1 - - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask : mask matrix of self attention - - Returns: - torch.Tensor: output of current input xs - torch.Tensor: subsampling cache required for next chunk computation - List[torch.Tensor]: encoder layers output cache required for next - chunk computation - List[torch.Tensor]: conformer cnn cache - - """ - assert xs.size(0) == 1 - - # using downsampling factor to recover offset - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - chunk_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - chunk_masks = chunk_masks.unsqueeze(1) # (1, 1, xs-time) - - real_len = 0 - if self.global_chunk_size > 0: - # for ONNX decode simulation, padding xs to chunk_size - real_len = xs.size(1) - pad_len = self.chunk_feature_map - real_len - xs = F.pad(xs, (0, 0, 0, pad_len), value=0.0) - chunk_masks = F.pad(chunk_masks, (0, pad_len), value=0.0) - - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, chunk_masks = self.embed(xs, chunk_masks, offset) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - # shape(pos_emb) = (b=1, chunk_size, emb_size=output_size=hidden-dim) - - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) # batchPad (b=1, 1, time=chunk_size) - - if self.global_chunk_size > 0: - # for ONNX decode simulation - pos_emb = self.embed.position_encoding( - offset=max(offset - cache_t1, 0), - size=cache_t1 + self.global_chunk_size) - att_mask[:, :, -self.global_chunk_size:] = chunk_masks - mask_pad = chunk_masks.to(torch.bool) - else: - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(2) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - att_cache_trunc = xs.size(1) + \ - att_cache.size(2) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i:i + 1, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # 
shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [1, batch, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(0) - - # use repeat_interleave to new_att_cache - new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :]) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.global_chunk_size > 0 and real_len: - chunk_real_len = real_len // self.embed.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - # Keeping 1 more timestep can mitigate information leakage - # from the encoder caused by the padding - xs = xs[:, :chunk_real_len + 1, :] - - return xs, r_att_cache, r_cnn_cache - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - use_onnx=False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. - Args: - xs (torch.Tensor): (1, max_len, dim) - decoding_chunk_size (int): decoding chunk size - num_decoding_left_chunks (int): - use_onnx (bool): True for simulating ONNX model inference. 
- """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if use_onnx: - logging.info("Simulating for ONNX runtime ...") - att_cache: torch.Tensor = torch.zeros( - (self.num_blocks, self.attention_heads, required_cache_size, - self.output_size() // self.attention_heads * 2), - device=xs.device) - cnn_cache: torch.Tensor = torch.zeros( - (self.num_blocks, 1, self.output_size(), self.cnn_module_kernel - 1), - device=xs.device) - self.set_global_chunk_size(chunk_size=decoding_chunk_size) - else: - logging.info("Simulating for JIT runtime ...") - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - logging.info(f"-->> frame chunk msg: cur={cur}, " - f"end={end}, num_frames={end-cur}, " - f"decoding_window={decoding_window}") - if use_onnx: - att_mask: torch.Tensor = torch.ones( - (1, 1, required_cache_size + decoding_chunk_size), - dtype=torch.bool, device=xs.device) - if cur == 0: - att_mask[:, :, :required_cache_size] = 0 - else: - att_mask: torch.Tensor = torch.ones( - (0, 0, 0), dtype=torch.bool, device=xs.device) - - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - outputs.append(y) - offset += y.size(1) - - ys = torch.cat(outputs, 1) - masks = torch.ones(1, 1, ys.size(1), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/efficient_conformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/efficient_conformer/encoder_layer.py deleted file mode 100644 index 3a88ec9fca9797664ce89566e6c1d28a8f0ad5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/efficient_conformer/encoder_layer.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple -import torch -from torch import nn - - -class StrideConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. 
- self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - pointwise_conv_layer: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.pointwise_conv_layer = pointwise_conv_layer - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - - # add pointwise_conv for efficient conformer - # pointwise_conv_layer does not change shape - if self.pointwise_conv_layer is not None: - residual = residual.transpose(1, 2) - residual = self.pointwise_conv_layer(residual) - residual = residual.transpose(1, 2) - assert residual.size(0) == x.size(0) - assert residual.size(1) == x.size(1) - assert residual.size(2) == x.size(2) - - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/efficient_conformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/efficient_conformer/subsampling.py deleted file mode 100644 index 98b2c2228eac8e77586110686c48a7b0141458c9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/efficient_conformer/subsampling.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch -from wenet.transformer.subsampling import BaseSubsampling - - -class Conv2dSubsampling2(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU() - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * ((idim - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 2 - # 2 = (3 - 1) * 1 - self.right_context = 2 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 2. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 2. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, :-2:2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/squeezeformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/squeezeformer/attention.py deleted file mode 100644 index 97412badbe8e2c5caec81c0636d15be3f80d6b84..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/squeezeformer/attention.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 Ximalaya Inc. (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -import torch -import torch.nn as nn -from wenet.transformer.attention import MultiHeadedAttention -from typing import Tuple - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- """ - - def __init__(self, n_head, n_feat, dropout_rate, - do_rel_shift=False, adaptive_scale=False, init_weights=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.do_rel_shift = do_rel_shift - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - self.adaptive_scale = adaptive_scale - self.ada_scale = nn.Parameter( - torch.ones([1, 1, n_feat]), requires_grad=adaptive_scale) - self.ada_bias = nn.Parameter( - torch.zeros([1, 1, n_feat]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - input_max = (self.h * self.d_k) ** -0.5 - torch.nn.init.uniform_(self.linear_q.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_q.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_pos.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.bias, -input_max, input_max) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. 
pytorch training - if mask.size(2) > 0: # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - # (batch, head, time1, time2) - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - if self.adaptive_scale: - query = self.ada_scale * query + self.ada_bias - key = self.ada_scale * key + self.ada_bias - value = self.ada_scale * value + self.ada_bias - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - if self.do_rel_shift: - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/squeezeformer/conv2d.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/squeezeformer/conv2d.py deleted file mode 100644 index c230263396392d72f36c56d645338f2d576db898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/squeezeformer/conv2d.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Conv2d Module with Valid Padding""" - -import torch.nn.functional as F -from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional - - -class Conv2dValid(_ConvNd): - """ - Conv2d operator for VALID mode padding. 
- """ - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', # TODO: refine this type - device=None, - dtype=None, - valid_trigx: bool = False, - valid_trigy: bool = False - ) -> None: - factory_kwargs = {'device': device, 'dtype': dtype} - kernel_size_ = _pair(kernel_size) - stride_ = _pair(stride) - padding_ = padding if isinstance(padding, str) else _pair(padding) - dilation_ = _pair(dilation) - super(Conv2dValid, self).__init__( - in_channels, out_channels, kernel_size_, - stride_, padding_, dilation_, False, _pair(0), - groups, bias, padding_mode, **factory_kwargs) - self.valid_trigx = valid_trigx - self.valid_trigy = valid_trigy - - def _conv_forward( - self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): - validx, validy = 0, 0 - if self.valid_trigx: - validx = (input.size(-2) * (self.stride[-2] - 1) - 1 - + self.kernel_size[-2]) // 2 - if self.valid_trigy: - validy = (input.size(-1) * (self.stride[-1] - 1) - 1 - + self.kernel_size[-1]) // 2 - return F.conv2d(input, weight, bias, self.stride, - (validx, validy), self.dilation, self.groups) - - def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight, self.bias) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/squeezeformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/squeezeformer/convolution.py deleted file mode 100644 index 6da2ee8c98ed58fae66d66c892041037f0d6bc3a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/squeezeformer/convolution.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. 
- causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - self.bias = bias - self.channels = channels - self.kernel_size = kernel_size - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, channels]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, channels]), requires_grad=adaptive_scale) - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - if init_weights: - self.init_weights() - - def init_weights(self): - pw_max = self.channels ** -0.5 - dw_max = self.kernel_size ** -0.5 - torch.nn.init.uniform_(self.pointwise_conv1.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv1.bias.data, -pw_max, pw_max) - torch.nn.init.uniform_(self.depthwise_conv.weight.data, -dw_max, dw_max) - if self.bias: - torch.nn.init.uniform_(self.depthwise_conv.bias.data, -dw_max, dw_max) - torch.nn.init.uniform_(self.pointwise_conv2.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv2.bias.data, -pw_max, pw_max) - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - if self.adaptive_scale: - x = self.ada_scale * x + self.ada_bias - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/squeezeformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/squeezeformer/encoder.py deleted file mode 100644 index f13038321ae6c07d484a617aee7d83ed07742510..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/squeezeformer/encoder.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -import torch -import torch.nn as nn -from typing import Tuple, Union, Optional, List -from wenet.squeezeformer.subsampling \ - import DepthwiseConv2dSubsampling4, TimeReductionLayer1D, \ - TimeReductionLayer2D, TimeReductionLayerStream -from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.attention import MultiHeadedAttention -from wenet.squeezeformer.attention import RelPositionMultiHeadedAttention -from wenet.squeezeformer.positionwise_feed_forward \ - import PositionwiseFeedForward -from wenet.squeezeformer.convolution import ConvolutionModule -from wenet.utils.mask import make_pad_mask, add_optional_chunk_mask -from wenet.utils.common import get_activation - - -class SqueezeformerEncoder(nn.Module): - def __init__( - self, - input_size: int = 80, - encoder_dim: int = 256, - output_size: int = 256, - attention_heads: int = 4, - num_blocks: int = 12, - reduce_idx: Optional[Union[int, List[int]]] = 5, - recover_idx: Optional[Union[int, List[int]]] = 11, - feed_forward_expansion_factor: int = 4, - dw_stride: bool = False, - input_dropout_rate: float = 0.1, - pos_enc_layer_type: str = "rel_pos", - time_reduction_layer_type: str = "conv1d", - do_rel_shift: bool = True, - feed_forward_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.1, - cnn_module_kernel: int = 31, - cnn_norm_type: str = "batch_norm", - dropout: float = 0.1, - causal: bool = False, - adaptive_scale: bool = True, - activation_type: str = "swish", - init_weights: bool = True, - global_cmvn: torch.nn.Module = None, - normalize_before: bool = False, - use_dynamic_chunk: bool = False, - concat_after: bool = False, - 
static_chunk_size: int = 0, - use_dynamic_left_chunk: bool = False - ): - """Construct SqueezeformerEncoder - - Args: - input_size to use_dynamic_chunk, see in Transformer BaseEncoder. - encoder_dim (int): The hidden dimension of encoder layer. - output_size (int): The output dimension of final projection layer. - attention_heads (int): Num of attention head in attention module. - num_blocks (int): Num of encoder layers. - reduce_idx Optional[Union[int, List[int]]]: - reduce layer index, from 40ms to 80ms per frame. - recover_idx Optional[Union[int, List[int]]]: - recover layer index, from 80ms to 40ms per frame. - feed_forward_expansion_factor (int): Enlarge coefficient of FFN. - dw_stride (bool): Whether do depthwise convolution - on subsampling module. - input_dropout_rate (float): Dropout rate of input projection layer. - pos_enc_layer_type (str): Self attention type. - time_reduction_layer_type (str): Conv1d or Conv2d reduction layer. - do_rel_shift (bool): Whether to do relative shift - operation on rel-attention module. - cnn_module_kernel (int): Kernel size of CNN module. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - adaptive_scale (bool): Whether to use adaptive scale. - init_weights (bool): Whether to initialize weights. - causal (bool): whether to use causal convolution or not. - """ - super(SqueezeformerEncoder, self).__init__() - self.global_cmvn = global_cmvn - self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ - if type(reduce_idx) == int else reduce_idx - self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \ - if type(recover_idx) == int else recover_idx - self.check_ascending_list() - if reduce_idx is None: - self.time_reduce = None - else: - if recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - self.reduce_stride = 2 - self._output_size = output_size - self.normalize_before = normalize_before - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - self.pos_enc_layer_type = pos_enc_layer_type - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - encoder_dim, - attention_dropout_rate, - do_rel_shift, - adaptive_scale, - init_weights - ) - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - encoder_dim, - encoder_dim * feed_forward_expansion_factor, - feed_forward_dropout_rate, - activation, - adaptive_scale, - init_weights - ) - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = ( - encoder_dim, cnn_module_kernel, activation, - cnn_norm_type, causal, True, adaptive_scale, init_weights) - - self.embed = DepthwiseConv2dSubsampling4( - 1, encoder_dim, - RelPositionalEncoding(encoder_dim, dropout_rate=0.1), - dw_stride, - input_size, - input_dropout_rate, - init_weights - ) - - self.preln = nn.LayerNorm(encoder_dim) - self.encoders = 
torch.nn.ModuleList([SqueezeformerEncoderLayer( - encoder_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - convolution_layer(*convolution_layer_args), - positionwise_layer(*positionwise_layer_args), - normalize_before, - dropout, - concat_after) for _ in range(num_blocks) - ]) - if time_reduction_layer_type == 'conv1d': - time_reduction_layer = TimeReductionLayer1D - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - elif time_reduction_layer_type == 'stream': - time_reduction_layer = TimeReductionLayerStream - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - else: - time_reduction_layer = TimeReductionLayer2D - time_reduction_layer_args = {'encoder_dim': encoder_dim} - - self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args) - self.time_recover_layer = nn.Linear(encoder_dim, encoder_dim) - self.final_proj = None - if output_size != encoder_dim: - self.final_proj = nn.Linear(encoder_dim, output_size) - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - xs_lens = mask_pad.squeeze(1).sum(1) - xs = self.preln(xs) - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - for i, layer in enumerate(self.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, chunk_masks, pos_emb, mask_pad)) - xs, xs_lens, chunk_masks, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_chunk_masks, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - chunk_masks = recover_chunk_masks - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - xs = xs.masked_fill(~mask_pad[:, 0, :].unsqueeze(-1), 0.0) - - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return xs, masks - - def check_ascending_list(self): - if self.reduce_idx is not None: - assert self.reduce_idx == sorted(self.reduce_idx), \ - "reduce_idx should be int or ascending list" - if self.recover_idx is not None: - assert self.recover_idx == sorted(self.recover_idx), \ - "recover_idx should be int or ascending list" - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp 
= exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - recover_exp)) - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.preln(xs) - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, 
recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - if att_mask.size(1) != 0: - xs = xs.masked_fill(~att_mask[:, 0, :].unsqueeze(-1), 0.0) - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(0) - cached_att = cached_att.unsqueeze(3).\ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :]) - r_cnn_cache.append(cached_cnn) - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/squeezeformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/squeezeformer/encoder_layer.py deleted file mode 100644 index 3c6bdd44a20447cea91c0f965c666b844f4264be..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/squeezeformer/encoder_layer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""SqueezeformerEncoderLayer definition.""" - -import torch -import torch.nn as nn -from typing import Optional, Tuple - - -class SqueezeformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward1 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - feed_forward2 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. 
- """ - - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward1: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - feed_forward2: Optional[nn.Module] = None, - normalize_before: bool = False, - dropout_rate: float = 0.1, - concat_after: bool = False, - ): - super(SqueezeformerEncoderLayer, self).__init__() - self.size = size - self.self_attn = self_attn - self.layer_norm1 = nn.LayerNorm(size) - self.ffn1 = feed_forward1 - self.layer_norm2 = nn.LayerNorm(size) - self.conv_module = conv_module - self.layer_norm3 = nn.LayerNorm(size) - self.ffn2 = feed_forward2 - self.layer_norm4 = nn.LayerNorm(size) - self.normalize_before = normalize_before - self.dropout = nn.Dropout(dropout_rate) - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - # self attention module - residual = x - if self.normalize_before: - x = self.layer_norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.layer_norm1(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm2(x) - x = self.ffn1(x) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm2(x) - - # conv module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - residual = x - if self.normalize_before: - x = self.layer_norm3(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm3(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm4(x) - x = self.ffn2(x) - # we do not use dropout here since it is inside feed forward function - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm4(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/squeezeformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/squeezeformer/positionwise_feed_forward.py deleted file mode 100644 index 289062dcf3189f79a5ebb206990160d8665c613c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/squeezeformer/positionwise_feed_forward.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU(), - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.idim = idim - self.hidden_units = hidden_units - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - self.ada_scale = None - self.ada_bias = None - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, idim]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, idim]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - ffn1_max = self.idim ** -0.5 - ffn2_max = self.hidden_units ** -0.5 - torch.nn.init.uniform_(self.w_1.weight.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_1.bias.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_2.weight.data, -ffn2_max, ffn2_max) - torch.nn.init.uniform_(self.w_2.bias.data, -ffn2_max, ffn2_max) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - if self.adaptive_scale: - xs = self.ada_scale * xs + self.ada_bias - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/squeezeformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/squeezeformer/subsampling.py deleted file mode 100644 index fdb0101d6ebb54c42e710bbb0f35a6f7615ca567..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/squeezeformer/subsampling.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -"""DepthwiseConv2dSubsampling4 and TimeReductionLayer definition.""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -from wenet.transformer.subsampling import BaseSubsampling -from typing import Tuple -from wenet.squeezeformer.conv2d import Conv2dValid - - -class DepthwiseConv2dSubsampling4(BaseSubsampling): - """Depthwise Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - pos_enc_class (nn.Module): position encoding class. - dw_stride (int): Whether do depthwise convolution. - input_size (int): filter bank dimension. - - """ - - def __init__( - self, idim: int, odim: int, - pos_enc_class: torch.nn.Module, - dw_stride: bool = False, - input_size: int = 80, - input_dropout_rate: float = 0.1, - init_weights: bool = True - ): - super(DepthwiseConv2dSubsampling4, self).__init__() - self.idim = idim - self.odim = odim - self.pw_conv = nn.Conv2d( - in_channels=idim, out_channels=odim, kernel_size=3, stride=2) - self.act1 = nn.ReLU() - self.dw_conv = nn.Conv2d( - in_channels=odim, out_channels=odim, kernel_size=3, stride=2, - groups=odim if dw_stride else 1 - ) - self.act2 = nn.ReLU() - self.pos_enc = pos_enc_class - self.input_proj = nn.Sequential( - nn.Linear( - odim * (((input_size - 1) // 2 - 1) // 2), odim), - nn.Dropout(p=input_dropout_rate), - ) - if init_weights: - linear_max = (odim * input_size / 4) ** -0.5 - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.weight'], -linear_max, linear_max) - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.bias'], -linear_max, linear_max) - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: int = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.pw_conv(x) - x = self.act1(x) - x = self.dw_conv(x) - x = self.act2(x) - b, c, t, f = x.size() - x = x.permute(0, 2, 1, 3) - x = x.contiguous().view(b, t, c * f) - x, pos_emb = self.pos_enc(x, offset) - x = self.input_proj(x) - return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] - - -class TimeReductionLayer1D(nn.Module): - """ - Modified NeMo, - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. 
- """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 5, stride: int = 2): - super(TimeReductionLayer1D, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - self.padding = max(0, self.kernel_size - self.stride) - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. - if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayer2D(nn.Module): - def __init__( - self, kernel_size: int = 5, stride: int = 2, encoder_dim: int = 256): - super(TimeReductionLayer2D, self).__init__() - self.encoder_dim = encoder_dim - self.kernel_size = kernel_size - self.dw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=(kernel_size, 1), - stride=stride, - valid_trigy=True - ) - self.pw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=1, - stride=1, - valid_trigx=False, - valid_trigy=False, - ) - - self.kernel_size = kernel_size - self.stride = stride - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.encoder_dim ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward( - self, xs: torch.Tensor, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - xs = xs.masked_fill(mask_pad.transpose(1, 2).eq(0), 0.0) - xs = xs.unsqueeze(2) - padding1 = self.kernel_size - self.stride - xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), - mode='constant', value=0.) 
- xs = self.dw_conv(xs.permute(0, 3, 1, 2)) - xs = self.pw_conv(xs).permute(0, 3, 2, 1).squeeze(1).contiguous() - tmp_length = xs.size(1) - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - padding2 = max(0, (xs_lens.max() - tmp_length).data.item()) - batch_size, hidden = xs.size(0), xs.size(-1) - dummy_pad = torch.zeros(batch_size, padding2, hidden, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - mask = mask[:, ::2, ::2] - mask_pad = mask_pad[:, :, ::2] - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayerStream(nn.Module): - """ - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. - """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 1, stride: int = 2): - super(TimeReductionLayerStream, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=0, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. 
- if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transducer/joint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transducer/joint.py deleted file mode 100644 index f7cbaf62ee0bf4ffa127e5bbf4a49a64c2378495..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transducer/joint.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Optional - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation - - -class TransducerJoint(torch.nn.Module): - - def __init__(self, - voca_size: int, - enc_output_size: int, - pred_output_size: int, - join_dim: int, - prejoin_linear: bool = True, - postjoin_linear: bool = False, - joint_mode: str = 'add', - activation: str = "tanh"): - assert check_argument_types() - # TODO(Mddct): concat in future - assert joint_mode in ['add'] - super().__init__() - - self.activatoin = get_activation(activation) - self.prejoin_linear = prejoin_linear - self.postjoin_linear = postjoin_linear - self.joint_mode = joint_mode - - if not self.prejoin_linear and not self.postjoin_linear: - assert enc_output_size == pred_output_size == join_dim - # torchscript compatibility - self.enc_ffn: Optional[nn.Linear] = None - self.pred_ffn: Optional[nn.Linear] = None - if self.prejoin_linear: - self.enc_ffn = nn.Linear(enc_output_size, join_dim) - self.pred_ffn = nn.Linear(pred_output_size, join_dim) - # torchscript compatibility - self.post_ffn: Optional[nn.Linear] = None - if self.postjoin_linear: - self.post_ffn = nn.Linear(join_dim, join_dim) - - self.ffn_out = nn.Linear(join_dim, voca_size) - - def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor): - """ - Args: - enc_out (torch.Tensor): [B, T, E] - pred_out (torch.Tensor): [B, T, P] - Return: - [B,T,U,V] - """ - if (self.prejoin_linear and self.enc_ffn is not None - and self.pred_ffn is not None): - enc_out = self.enc_ffn(enc_out) # [B,T,E] -> [B,T,V] - pred_out = self.pred_ffn(pred_out) - - enc_out = enc_out.unsqueeze(2) # [B,T,V] -> [B,T,1,V] - pred_out = pred_out.unsqueeze(1) # [B,U,V] -> [B,1 U, V] - - # TODO(Mddct): concat joint - _ = self.joint_mode - out = enc_out + pred_out # [B,T,U,V] - - if self.postjoin_linear and self.post_ffn is not None: - out = self.post_ffn(out) - - out = self.activatoin(out) - out = self.ffn_out(out) - return out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transducer/predictor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transducer/predictor.py deleted file mode 100644 index 600e97a9d83646047ec3fc14f3087bd4df761c68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transducer/predictor.py +++ /dev/null @@ -1,482 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation, get_rnn - - -def ApplyPadding(input, padding, pad_value) -> torch.Tensor: - """ - Args: - input: [bs, max_time_step, dim] - padding: [bs, max_time_step] - """ - return padding * pad_value + input * 
(1 - padding) - - -class PredictorBase(torch.nn.Module): - - # NOTE(Mddct): We can use ABC abstract here, but - # keep this class simple enough for now - def __init__(self) -> None: - super().__init__() - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - _, _, _ = batch_size, method, device - raise NotImplementedError("this is a base precictor") - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ): - _, _, = input, cache - raise NotImplementedError("this is a base precictor") - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - _, _, _, = input, padding, cache - raise NotImplementedError("this is a base precictor") - - -class RNNPredictor(PredictorBase): - - def __init__(self, - voca_size: int, - embed_size: int, - output_size: int, - embed_dropout: float, - hidden_size: int, - num_layers: int, - bias: bool = True, - rnn_type: str = "lstm", - dropout: float = 0.1) -> None: - assert check_argument_types() - super().__init__() - self.n_layers = num_layers - self.hidden_size = hidden_size - # disable rnn base out projection - self.embed = nn.Embedding(voca_size, embed_size) - self.dropout = nn.Dropout(embed_dropout) - # NOTE(Mddct): rnn base from torch not support layer norm - # will add layer norm and prune value in cell and layer - # ref: https://github.com/Mddct/neural-lm/blob/main/models/gru_cell.py - self.rnn = get_rnn(rnn_type=rnn_type)(input_size=embed_size, - hidden_size=hidden_size, - num_layers=num_layers, - bias=bias, - batch_first=True, - dropout=dropout) - self.projection = nn.Linear(hidden_size, output_size) - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> torch.Tensor: - """ - Args: - input (torch.Tensor): [batch, max_time). - padding (torch.Tensor): [batch, max_time] - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - Returns: - output: [batch, max_time, output_size] - """ - - # NOTE(Mddct): we don't use pack input format - embed = self.embed(input) # [batch, max_time, emb_size] - embed = self.dropout(embed) - states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None - if cache is None: - state = self.init_state(batch_size=input.size(0), - device=input.device) - states = (state[0], state[1]) - else: - assert len(cache) == 2 - states = (cache[0], cache[1]) - out, (m, c) = self.rnn(embed, states) - out = self.projection(out) - - # NOTE(Mddct): Although we don't use staate in transducer - # training forward, we need make it right for padding value - # so we create forward_step for infering, forward for training - _, _ = m, c - return out - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache: [state_m, state_c] - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - Returns: - new_cache: [[state_m_1, state_c_1], [state_m_2, state_c_2]...] 
- """ - assert len(cache) == 2 - state_ms = cache[0] - state_cs = cache[1] - - assert state_ms.size(1) == state_cs.size(1) - - new_cache: List[List[torch.Tensor]] = [] - for state_m, state_c in zip(torch.split(state_ms, 1, dim=1), - torch.split(state_cs, 1, dim=1)): - new_cache.append([state_m, state_c]) - return new_cache - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[state_m_1, state_c_1], [state_m_1, state_c_1]...] - - Returns: - new_caceh: [state_ms, state_cs], - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - """ - state_ms = torch.cat([states[0] for states in cache], dim=1) - state_cs = torch.cat([states[1] for states in cache], dim=1) - return [state_ms, state_cs] - - def init_state( - self, - batch_size: int, - device: torch.device, - method: str = "zero", - ) -> List[torch.Tensor]: - assert batch_size > 0 - # TODO(Mddct): xavier init method - _ = method - return [ - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device), - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device) - ] - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - """ - assert len(cache) == 2 - state_m, state_c = cache[0], cache[1] - embed = self.embed(input) # [batch, 1, emb_size] - embed = self.dropout(embed) - out, (m, c) = self.rnn(embed, (state_m, state_c)) - - out = self.projection(out) - m = ApplyPadding(m, padding.unsqueeze(0), state_m) - c = ApplyPadding(c, padding.unsqueeze(0), state_c) - - return (out, [m, c]) - - -class EmbeddingPredictor(PredictorBase): - """Embedding predictor - - Described in: - https://arxiv.org/pdf/2109.07513.pdf - - embed-> proj -> layer norm -> swish - """ - - def __init__(self, - voca_size: int, - embed_size: int, - embed_dropout: float, - n_head: int, - history_size: int = 2, - activation: str = "swish", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - - assert check_argument_types() - super().__init__() - # multi head - self.num_heads = n_head - self.embed_size = embed_size - self.context_size = history_size + 1 - self.pos_embed = torch.nn.Linear(embed_size * self.context_size, - self.num_heads, - bias=bias) - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.ffn = nn.Linear(self.embed_size, self.embed_size) - self.norm = nn.LayerNorm(self.embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - _ = method - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device), - ] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] 
- """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - - input = input.unfold(1, self.context_size, 1).permute( - 0, 1, 3, 2) # [bs, seq_len, context_size, embed] - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - # broadcast dot attenton - input_expand = input.unsqueeze( - 2) # [bs, seq_len, 1, context_size, embed] - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - - # [bs, seq_len, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, seq_len, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, seq_len, num_heads, embed] - output = output.sum(dim=2) # [bs, seq_len, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - return output - - def forward_step( - self, - input: torch.Tensor, - padding: torch.Tensor, - cache: List[torch.Tensor], - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input_expand = context_input.unsqueeze(1).unsqueeze( - 2) # [bs, 1, 1, context_size, embed] - - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - # [bs, 1, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, 1, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, 1, num_heads, embed] - output = output.sum(dim=2) # [bs, 1, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - new_cache = context_input[:, 1:, :] - # TODO(Mddct): we need padding new_cache in future - # new_cache = ApplyPadding(history, padding, new_cache) - return (output, [new_cache]) - - -class ConvPredictor(PredictorBase): - - def __init__(self, - voca_size: 
int, - embed_size: int, - embed_dropout: float, - history_size: int = 2, - activation: str = "relu", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - assert check_argument_types() - super().__init__() - - assert history_size >= 0 - self.embed_size = embed_size - self.context_size = history_size + 1 - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.conv = nn.Conv1d(in_channels=embed_size, - out_channels=embed_size, - kernel_size=self.context_size, - padding=0, - groups=embed_size, - bias=bias) - self.norm = nn.LayerNorm(embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - assert method == "zero" - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device) - ] - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] - """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - input = input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - return out - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input = context_input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - - new_cache = context_input[:, 1:, :] - # TODO(Mddct): apply padding in future - return (out, [new_cache]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transducer/search/greedy_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transducer/search/greedy_search.py deleted file mode 100644 index ef7354562b6617b7be33bf32d673117eb1d3d547..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transducer/search/greedy_search.py +++ /dev/null @@ -1,54 +0,0 @@ -from 
typing import List - -import torch - - -def basic_greedy_search( - model: torch.nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - n_steps: int = 64, -) -> List[List[int]]: - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, - method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = n_steps - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, padding, - cache) # [1, 1, P] - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, - pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - - return [hyps] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transducer/search/prefix_beam_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transducer/search/prefix_beam_search.py deleted file mode 100644 index f00917717c16a73916586708ebfede54fa02a21f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transducer/search/prefix_beam_search.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import List, Tuple - -import torch -from wenet.utils.common import log_add - - -class Sequence(): - - __slots__ = {'hyp', 'score', 'cache'} - - def __init__( - self, - hyp: List[torch.Tensor], - score, - cache: List[torch.Tensor], - ): - self.hyp = hyp - self.score = score - self.cache = cache - - -class PrefixBeamSearch(): - - def __init__(self, encoder, predictor, joint, ctc, blank): - self.encoder = encoder - self.predictor = predictor - self.joint = joint - self.ctc = ctc - self.blank = blank - - def forward_decoder_one_step( - self, encoder_x: torch.Tensor, pre_t: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - padding = torch.zeros(pre_t.size(0), 1, device=encoder_x.device) - pre_t, new_cache = self.predictor.forward_step(pre_t.unsqueeze(-1), - padding, cache) - x = self.joint(encoder_x, pre_t) # [beam, 1, 1, vocab] - x = x.log_softmax(dim=-1) - return x, new_cache - - def prefix_beam_search(self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7): - """prefix beam search - also see wenet.transducer.transducer.beam_search - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - assert batch_size 
== 1 - - # 1. Encoder - encoder_out, _ = self.encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - - ctc_probs = self.ctc.log_softmax(encoder_out).squeeze(0) - beam_init: List[Sequence] = [] - - # 2. init beam using Sequence to save beam unit - cache = self.predictor.init_state(1, method="zero", device=device) - beam_init.append(Sequence(hyp=[self.blank], score=0.0, cache=cache)) - # 3. start decoding (notice: we use breathwise first searching) - # !!!! In this decoding method: one frame do not output multi units. !!!! - # !!!! Experiments show that this strategy has little impact !!!! - for i in range(maxlen): - # 3.1 building input - # decoder taking the last token to predict the next token - input_hyp = [s.hyp[-1] for s in beam_init] - input_hyp_tensor = torch.tensor(input_hyp, - dtype=torch.int, - device=device) - # building statement from beam - cache_batch = self.predictor.cache_to_batch( - [s.cache for s in beam_init]) - # build score tensor to do torch.add() function - scores = torch.tensor([s.score for s in beam_init]).to(device) - - # 3.2 forward decoder - logp, new_cache = self.forward_decoder_one_step( - encoder_out[:, i, :].unsqueeze(1), - input_hyp_tensor, - cache_batch, - ) # logp: (N, 1, 1, vocab_size) - logp = logp.squeeze(1).squeeze(1) # logp: (N, vocab_size) - new_cache = self.predictor.batch_to_cache(new_cache) - - # 3.3 shallow fusion for transducer score - # and ctc score where we can also add the LM score - logp = torch.log( - torch.add(transducer_weight * torch.exp(logp), - ctc_weight * torch.exp(ctc_probs[i].unsqueeze(0)))) - - # 3.4 first beam prune - top_k_logp, top_k_index = logp.topk(beam_size) # (N, N) - scores = torch.add(scores.unsqueeze(1), top_k_logp) - - # 3.5 generate new beam (N*N) - beam_A = [] - for j in range(len(beam_init)): - # update seq - base_seq = beam_init[j] - for t in range(beam_size): - # blank: only update the score - if top_k_index[j, t] == self.blank: - new_seq = Sequence(hyp=base_seq.hyp.copy(), - score=scores[j, t].item(), - cache=base_seq.cache) - - beam_A.append(new_seq) - # other unit: update hyp score statement and last - else: - hyp_new = base_seq.hyp.copy() - hyp_new.append(top_k_index[j, t].item()) - new_seq = Sequence(hyp=hyp_new, - score=scores[j, t].item(), - cache=new_cache[j]) - beam_A.append(new_seq) - - # 3.6 prefix fusion - fusion_A = [beam_A[0]] - for j in range(1, len(beam_A)): - s1 = beam_A[j] - if_do_append = True - for t in range(len(fusion_A)): - # notice: A_ can not fusion with A - if s1.hyp == fusion_A[t].hyp: - fusion_A[t].score = log_add( - [fusion_A[t].score, s1.score]) - if_do_append = False - break - if if_do_append: - fusion_A.append(s1) - - # 4. 
second pruned - fusion_A.sort(key=lambda x: x.score, reverse=True) - beam_init = fusion_A[:beam_size] - - return beam_init, encoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transducer/transducer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transducer/transducer.py deleted file mode 100644 index 821a0946e621353a18bededbd93a658e83b0e0e2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transducer/transducer.py +++ /dev/null @@ -1,453 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchaudio -from torch import nn -from torch.nn.utils.rnn import pad_sequence -from typeguard import check_argument_types -from wenet.transducer.predictor import PredictorBase -from wenet.transducer.search.greedy_search import basic_greedy_search -from wenet.transducer.search.prefix_beam_search import PrefixBeamSearch -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_blank, add_sos_eos, - reverse_pad_list) - - -class Transducer(ASRModel): - """Transducer-ctc-attention hybrid Encoder-Predictor-Decoder model""" - - def __init__( - self, - vocab_size: int, - blank: int, - encoder: nn.Module, - predictor: PredictorBase, - joint: nn.Module, - attention_decoder: Optional[Union[TransformerDecoder, - BiTransformerDecoder]] = None, - ctc: Optional[CTC] = None, - ctc_weight: float = 0, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - transducer_weight: float = 1.0, - attention_weight: float = 0.0, - ) -> None: - assert check_argument_types() - assert attention_weight + ctc_weight + transducer_weight == 1.0 - super().__init__(vocab_size, encoder, attention_decoder, ctc, - ctc_weight, ignore_id, reverse_weight, lsm_weight, - length_normalized_loss) - - self.blank = blank - self.transducer_weight = transducer_weight - self.attention_decoder_weight = 1 - self.transducer_weight - self.ctc_weight - - self.predictor = predictor - self.joint = joint - self.bs = None - - # Note(Mddct): decoder also means predictor in transducer, - # but here decoder is attention decoder - del self.criterion_att - if attention_decoder is not None: - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + predictor + joint + loss - - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - - # Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - # predictor - ys_in_pad = add_blank(text, self.blank, self.ignore_id) - predictor_out = self.predictor(ys_in_pad) - # joint - joint_out = self.joint(encoder_out, predictor_out) - # NOTE(Mddct): some loss implementation require pad valid is zero - # torch.int32 rnnt_loss required - rnnt_text = text.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - rnnt_text_lengths = text_lengths.to(torch.int32) - encoder_out_lens = encoder_out_lens.to(torch.int32) - loss = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - encoder_out_lens, - rnnt_text_lengths, - blank=self.blank, - reduction="mean") - loss_rnnt = loss - - loss = self.transducer_weight * loss - # optional attention decoder - loss_att: Optional[torch.Tensor] = None - if self.attention_decoder_weight != 0.0 and self.decoder is not None: - loss_att, _ = self._calc_att_loss(encoder_out, encoder_mask, text, - text_lengths) - - # optional ctc - loss_ctc: Optional[torch.Tensor] = None - if self.ctc_weight != 0.0 and self.ctc is not None: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is not None: - loss = loss + self.ctc_weight * loss_ctc.sum() - if loss_att is not None: - loss = loss + self.attention_decoder_weight * loss_att.sum() - # NOTE: 'loss' must be in dict - return { - 'loss': loss, - 'loss_att': loss_att, - 'loss_ctc': loss_ctc, - 'loss_rnnt': loss_rnnt, - } - - def init_bs(self): - if self.bs is None: - self.bs = PrefixBeamSearch(self.encoder, self.predictor, - self.joint, self.ctc, self.blank) - - def _cal_transducer_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_lens: torch.Tensor, - hyps_pad: torch.Tensor, - ): - # ignore id -> blank, add blank at head - hyps_pad_blank = add_blank(hyps_pad, self.blank, self.ignore_id) - xs_in_lens = encoder_mask.squeeze(1).sum(1).int() - - # 1. Forward predictor - predictor_out = self.predictor(hyps_pad_blank) - # 2. Forward joint - joint_out = self.joint(encoder_out, predictor_out) - rnnt_text = hyps_pad.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - # 3. 
Compute transducer loss - loss_td = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - xs_in_lens, - hyps_lens.int(), - blank=self.blank, - reduction='none') - return loss_td * -1 - - def _cal_attn_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_pad: torch.Tensor, - hyps_lens: torch.Tensor, - ): - # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - - # td_score = loss_td * -1 - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - self.reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - return decoder_out, r_decoder_out - - def beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7, - ): - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight in transducer - prefix beam search. - final_prob = ctc_weight * ctc_prob + transducer_weight * transducer_prob - transducer_weight (float): transducer probability weight in - prefix beam search - Returns: - List[List[int]]: best path result - - """ - self.init_bs() - beam, _ = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size, - beam_size, - num_decoding_left_chunks, - simulate_streaming, - ctc_weight, - transducer_weight, - ) - return beam[0].hyp[1:], beam[0].score - - def transducer_attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ctc_weight: float = 0.0, - attn_weight: float = 0.0, - transducer_weight: float = 0.0, - search_ctc_weight: float = 1.0, - search_transducer_weight: float = 0.0, - beam_search_type: str = 'transducer') -> List[List[int]]: - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight using in rescoring. - rescore_prob = ctc_weight * ctc_prob + - transducer_weight * (transducer_loss * -1) + - attn_weight * attn_prob - attn_weight (float): attn probability weight using in rescoring. - transducer_weight (float): transducer probability weight using in - rescoring - search_ctc_weight (float): ctc weight using - in rnnt beam search (seeing in self.beam_search) - search_transducer_weight (float): transducer weight using - in rnnt beam search (seeing in self.beam_search) - Returns: - List[List[int]]: best path result - - """ - - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - self.init_bs() - if beam_search_type == 'transducer': - beam, encoder_out = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - beam_size=beam_size, - num_decoding_left_chunks=num_decoding_left_chunks, - ctc_weight=search_ctc_weight, - transducer_weight=search_transducer_weight, - ) - beam_score = [s.score for s in beam] - hyps = [s.hyp[1:] for s in beam] - - elif beam_search_type == 'ctc': - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, - speech_lengths, - beam_size=beam_size, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) - beam_score = [hyp[1] for hyp in hyps] - hyps = [hyp[0] for hyp in hyps] - assert len(hyps) == beam_size - - # build hyps and encoder output - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - - # 2.1 calculate transducer score - td_score = self._cal_transducer_score( - encoder_out, - encoder_mask, - hyps_lens, - hyps_pad, - ) - # 2.2 calculate attention score - decoder_out, r_decoder_out = self._cal_attn_score( - encoder_out, - encoder_mask, - hyps_pad, - hyps_lens, - ) - - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp)][self.eos] - td_s = td_score[i] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp): - r_score += r_decoder_out[i][len(hyp) - j - 1][w] - r_score += r_decoder_out[i][len(hyp)][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score = score * attn_weight + \ - beam_score[i] * ctc_weight + \ - td_s * transducer_weight - if score > best_score: - best_score = score - best_index = i - - return hyps[best_index], best_score - - def greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, 
- num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - n_steps: int = 64, - ) -> List[List[int]]: - """ greedy search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - # TODO(Mddct): batch decode - assert speech.size(0) == 1 - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - # TODO(Mddct): forward chunk by chunk - _ = simulate_streaming - # Let's assume B = batch_size - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size, - num_decoding_left_chunks, - ) - encoder_out_lens = encoder_mask.squeeze(1).sum() - hyps = basic_greedy_search(self, - encoder_out, - encoder_out_lens, - n_steps=n_steps) - - return hyps - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def forward_predictor_step( - self, xs: torch.Tensor, cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - assert len(cache) == 2 - # fake padding - padding = torch.zeros(1, 1) - return self.predictor.forward_step(xs, padding, cache) - - @torch.jit.export - def forward_joint_step(self, enc_out: torch.Tensor, - pred_out: torch.Tensor) -> torch.Tensor: - return self.joint(enc_out, pred_out) - - @torch.jit.export - def forward_predictor_init_state(self) -> List[torch.Tensor]: - return self.predictor.init_state(1, device=torch.device("cpu")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/asr_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/asr_model.py deleted file mode 100644 index 4288f68472d63ce4bf270c5f377d62fa7408713e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/asr_model.py +++ /dev/null @@ -1,904 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -from collections import defaultdict -from typing import Dict, List, Optional, Tuple - -import torch - -from torch.nn.utils.rnn import pad_sequence - -try: - import k2 - from icefall.utils import get_texts - from icefall.decode import get_lattice, Nbest, one_best_decoding -except ImportError: - print('Failed to import k2 and icefall. \ - Notice that they are necessary for hlg_onebest and hlg_rescore') - -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import TransformerEncoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, - remove_duplicates_and_blank, th_accuracy, - reverse_pad_list) -from wenet.utils.mask import (make_pad_mask, mask_finished_preds, - mask_finished_scores, subsequent_mask) - - -class ASRModel(torch.nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - def __init__( - self, - vocab_size: int, - encoder: TransformerEncoder, - decoder: TransformerDecoder, - ctc: CTC, - ctc_weight: float = 0.5, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - ): - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - - self.encoder = encoder - self.decoder = decoder - self.ctc = ctc - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - # 1. Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # 2a. Attention-decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths) - else: - loss_att = None - - # 2b. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is None: - loss = loss_att - elif loss_att is None: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + (1 - - self.ctc_weight) * loss_att - return {"loss": loss, "loss_att": loss_att, "loss_ctc": loss_ctc} - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, float]: - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # reverse the seq, used for right to left decoder - r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) - r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, - self.ignore_id) - # 1. Forward decoder - decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, - ys_in_pad, ys_in_lens, - r_ys_in_pad, - self.reverse_weight) - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - r_loss_att = torch.tensor(0.0) - if self.reverse_weight > 0.0: - r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * ( - 1 - self.reverse_weight) + r_loss_att * self.reverse_weight - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - return loss_att, acc_att - - def _forward_encoder( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Let's assume B = batch_size - # 1. Encoder - if simulate_streaming and decoding_chunk_size > 0: - encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( - speech, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - else: - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - return encoder_out, encoder_mask - - def recognize( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int = 10, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> torch.Tensor: - """ Apply beam search on attention decoder - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - torch.Tensor: decoding result, (batch, max_result_len) - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = torch.ones([running_size, 1], dtype=torch.long, - device=device).fill_(self.sos) # (B*N, 1) - scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), - dtype=torch.float) - scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) - cache: Optional[List[torch.Tensor]] = None - # 2. Decoder forward step by step - for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: - break - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) - logp, cache = self.decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - # 2.3 Second beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - # Update cache to be consistent with new topk scores / hyps - cache_index = (offset_k_index // beam_size).view(-1) # (B*N) - base_cache_index = (torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) * beam_size).view(-1) # (B*N) - cache_index = base_cache_index + cache_index - cache = [torch.index_select(c, dim=0, index=cache_index) for c in cache] - scores = scores.view(-1, 1) # (B*N, 1) - # 2.4. Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = torch.index_select(top_k_index.view(-1), - dim=-1, - index=best_k_index) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = torch.index_select( - hyps, dim=0, index=best_hyps_index) # (B*N, i) - hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) - - # 3. 
Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_scores, best_index = scores.max(dim=-1) - best_hyps_index = best_index + torch.arange( - batch_size, dtype=torch.long, device=device) * beam_size - best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) - best_hyps = best_hyps[:, 1:] - return best_hyps, best_scores - - def ctc_greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[List[int]]: - """ Apply CTC greedy search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # Let's assume B = batch_size - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - return hyps, scores - - def _ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[List[List[int]], torch.Tensor]: - """ CTC prefix beam search inner implementation - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[List[int]]: nbest results - torch.Tensor: encoder output, (1, max_len, encoder_dim), - it will be used for rescoring in attention rescoring mode - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # For CTC prefix beam search, we only support batch_size=1 - assert batch_size == 1 - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder forward and get CTC score - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - ctc_probs = ctc_probs.squeeze(0) - # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) - cur_hyps = [(tuple(), (0.0, -float('inf')))] - # 2. CTC beam search step by step - for t in range(0, maxlen): - logp = ctc_probs[t] # (vocab_size,) - # key: prefix, value (pb, pnb), default value(-inf, -inf) - next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) - # 2.1 First beam prune: select topk best - top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) - for s in top_k_index: - s = s.item() - ps = logp[s].item() - for prefix, (pb, pnb) in cur_hyps: - last = prefix[-1] if len(prefix) > 0 else None - if s == 0: # blank - n_pb, n_pnb = next_hyps[prefix] - n_pb = log_add([n_pb, pb + ps, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - elif s == last: - # Update *ss -> *s; - n_pb, n_pnb = next_hyps[prefix] - n_pnb = log_add([n_pnb, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - # Update *s-s -> *ss, - is for blank - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - else: - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - - # 2.2 Second beam prune - next_hyps = sorted(next_hyps.items(), - key=lambda x: log_add(list(x[1])), - reverse=True) - cur_hyps = next_hyps[:beam_size] - hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] - return hyps, encoder_out - - def ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[int]: - """ Apply CTC prefix beam search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[int]: CTC prefix beam search nbest results - """ - hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, - beam_size, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) - return hyps[0] - - def attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - ctc_weight: float = 0.0, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ) -> List[int]: - """ Apply attention rescoring decoding, CTC prefix beam search - is applied first to get nbest, then we resoring the nbest on - attention decoder with corresponding encoder out - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - reverse_weight (float): right to left decoder weight - ctc_weight (float): ctc score weight - - Returns: - List[int]: Attention rescoring result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, speech_lengths, beam_size, decoding_chunk_size, - num_decoding_left_chunks, simulate_streaming) - - assert len(hyps) == beam_size - hyps_pad = pad_sequence([ - torch.tensor(hyp[0], device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp[0]): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp[0])][self.eos] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp[0]): - r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] - r_score += r_decoder_out[i][len(hyp[0])][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score += hyp[1] * ctc_weight - if score > best_score: - best_score = score - best_index = i - return hyps[best_index][0], best_score - - def load_hlg_resource_if_necessary(self, hlg, word): - if not hasattr(self, 'hlg'): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.hlg = k2.Fsa.from_dict(torch.load(hlg, map_location=device)) - if not hasattr(self.hlg, "lm_scores"): - self.hlg.lm_scores = self.hlg.scores.clone() - if not hasattr(self, 'word_table'): - self.word_table = {} - with open(word, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - self.word_table[int(arr[1])] = arr[0] - - @torch.no_grad() - def hlg_onebest( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - best_path = one_best_decoding(lattice=lattice, use_double_scores=True) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.no_grad() - def hlg_rescore( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - lm_scale: float = 0, - decoder_scale: float = 0, - r_decoder_scale: float = 0, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - device = speech.device - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - 
search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=100, - use_double_scores=True, - nbest_scale=0.5,) - nbest = nbest.intersect(lattice) - assert hasattr(nbest.fsa, "lm_scores") - assert hasattr(nbest.fsa, "tokens") - assert isinstance(nbest.fsa.tokens, torch.Tensor) - - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.tokens) - tokens = tokens.remove_values_leq(0) - hyps = tokens.tolist() - - # cal attention_score - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out_repeat = [] - tot_scores = nbest.tot_scores() - repeats = [tot_scores[i].shape[0] for i in range(tot_scores.dim0)] - for i in range(len(encoder_out)): - encoder_out_repeat.append(encoder_out[i: i + 1].repeat(repeats[i], 1, 1)) - encoder_out = torch.concat(encoder_out_repeat, dim=0) - encoder_mask = torch.ones(encoder_out.size(0), - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - reverse_weight = 0.5 - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out - - decoder_scores = torch.tensor([sum([decoder_out[i, j, hyps[i][j]] - for j in range(len(hyps[i]))]) - for i in range(len(hyps))], device=device) - r_decoder_scores = [] - for i in range(len(hyps)): - score = 0 - for j in range(len(hyps[i])): - score += r_decoder_out[i, len(hyps[i]) - j - 1, hyps[i][j]] - score += r_decoder_out[i, len(hyps[i]), self.eos] - r_decoder_scores.append(score) - r_decoder_scores = torch.tensor(r_decoder_scores, device=device) - - am_scores = nbest.compute_am_scores() - ngram_lm_scores = nbest.compute_lm_scores() - tot_scores = am_scores.values + lm_scale * ngram_lm_scores.values + \ - decoder_scale * decoder_scores + r_decoder_scale * r_decoder_scores - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.jit.export - def subsampling_rate(self) -> int: - """ Export interface for c++ call, return subsampling_rate of the - model - """ - return self.encoder.embed.subsampling_rate - - @torch.jit.export - def right_context(self) -> int: - """ Export interface for c++ call, return right_context of the model - """ - return self.encoder.embed.right_context - - @torch.jit.export - def sos_symbol(self) -> int: - """ Export interface for c++ call, return sos symbol id of the model - """ - return self.sos - - @torch.jit.export - def eos_symbol(self) -> int: - """ Export interface for c++ call, return eos symbol id of the model - """ - return self.eos - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, give input chunk xs, and return - output from time 0 to current chunk. - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor: - """ Export interface for c++ call, apply linear transform and log - softmax before ctc - Args: - xs (torch.Tensor): encoder output - - Returns: - torch.Tensor: activation before ctc - - """ - return self.ctc.log_softmax(xs) - - @torch.jit.export - def is_bidirectional_decoder(self) -> bool: - """ - Returns: - torch.Tensor: decoder output - """ - if hasattr(self.decoder, 'right_decoder'): - return True - else: - return False - - @torch.jit.export - def forward_attention_decoder( - self, - hyps: torch.Tensor, - hyps_lens: torch.Tensor, - encoder_out: torch.Tensor, - reverse_weight: float = 0, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, forward decoder with multiple - hypothesis from ctc prefix beam search and one encoder output - Args: - hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad sos at the begining - hyps_lens (torch.Tensor): length of each hyp in hyps - encoder_out (torch.Tensor): corresponding encoder output - r_hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad eos at the begining which is used fo right to left decoder - reverse_weight: used for verfing whether used right to left decoder, - > 0 will use. - - Returns: - torch.Tensor: decoder output - """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps - encoder_out = encoder_out.repeat(num_hyps, 1, 1) - encoder_mask = torch.ones(num_hyps, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=encoder_out.device) - - # input for right to left decoder - # this hyps_lens has count token, we need minus it. - r_hyps_lens = hyps_lens - 1 - # this hyps has included token, so it should be - # convert the original hyps. - r_hyps = hyps[:, 1:] - # >>> r_hyps - # >>> tensor([[ 1, 2, 3], - # >>> [ 9, 8, 4], - # >>> [ 2, -1, -1]]) - # >>> r_hyps_lens - # >>> tensor([3, 3, 1]) - - # NOTE(Mddct): `pad_sequence` is not supported by ONNX, it is used - # in `reverse_pad_list` thus we have to refine the below code. 
- # Issue: https://github.com/wenet-e2e/wenet/issues/1113 - # Equal to: - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = torch.max(r_hyps_lens) - index_range = torch.arange(0, max_len, 1).to(encoder_out.device) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - # >>> seq_mask - # >>> tensor([[ True, True, True], - # >>> [ True, True, True], - # >>> [ True, False, False]]) - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - r_hyps = torch.gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = torch.where(seq_mask, r_hyps, self.eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - r_hyps = torch.cat([hyps[:, 0:1], r_hyps], dim=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps, hyps_lens, r_hyps, - reverse_weight) # (num_hyps, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - - # right to left decoder may be not used during decoding process, - # which depends on reverse_weight param. - # r_dccoder_out will be 0.0, if reverse_weight is 0.0 - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - return decoder_out, r_decoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/attention.py deleted file mode 100644 index 6ee5e313edf2e88a844ce004c0f819b0bd3260f6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/attention.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple - -import torch -from torch import nn - - -class MultiHeadedAttention(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, n_head: int, n_feat: int, dropout_rate: float): - """Construct an MultiHeadedAttention object.""" - super().__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward_qkv( - self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor, size - (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor, size - (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor, size - (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. 
- - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - 1.When applying cross attention between decoder and encoder, - the batch padding mask for input is in (#batch, 1, T) shape. - 2.When applying self attention of encoder, - the mask is in (#batch, T, T) shape. - 3.When applying self attention of decoder, - the mask is in (#batch, L, L) shape. - 4.If the different position in decoder see different block - of the encoder, such as Mocha, the passed in mask could be - in (#batch, L, T) shape. But there is no such case in current - Wenet. - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - """ - q, k, v = self.forward_qkv(query, key, value) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. - new_cache = torch.cat((k, v), dim=-1) - - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - return self.forward_attention(v, scores, mask), new_cache - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). 
- zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/cmvn.py deleted file mode 100644 index 3a1e7457fd3788d9a7e031e96517505a65925102..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/cmvn.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -class GlobalCMVN(torch.nn.Module): - def __init__(self, - mean: torch.Tensor, - istd: torch.Tensor, - norm_var: bool = True): - """ - Args: - mean (torch.Tensor): mean stats - istd (torch.Tensor): inverse std, std which is 1.0 / std - """ - super().__init__() - assert mean.shape == istd.shape - self.norm_var = norm_var - # The buffer can be accessed from this module using self.mean - self.register_buffer("mean", mean) - self.register_buffer("istd", istd) - - def forward(self, x: torch.Tensor): - """ - Args: - x (torch.Tensor): (batch, max_len, feat_dim) - - Returns: - (torch.Tensor): normalized feature - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/convolution.py deleted file mode 100644 index 2cf9794e14ea7441ccd30ab52202ac02fb25c2b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/convolution.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). 
- """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. - new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/ctc.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/ctc.py deleted file mode 100644 index 3dfcbaa324ffc26afa9ceaeb75007eb312546326..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/ctc.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -import torch -import torch.nn.functional as F -from typeguard import check_argument_types - - -class CTC(torch.nn.Module): - """CTC module""" - def __init__( - self, - odim: int, - encoder_output_size: int, - dropout_rate: float = 0.0, - reduce: bool = True, - ): - """ Construct CTC module - Args: - odim: dimension of outputs - encoder_output_size: number of encoder projection units - dropout_rate: dropout rate (0.0 ~ 1.0) - reduce: reduce the CTC loss into a scalar - """ - assert check_argument_types() - super().__init__() - eprojs = encoder_output_size - self.dropout_rate = dropout_rate - self.ctc_lo = torch.nn.Linear(eprojs, odim) - - reduction_type = "sum" if reduce else "none" - self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) - - def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, - ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: - """Calculate CTC loss. 
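Reviewer note: stepping back to the `ConvolutionModule.forward` just above, the data flow is pointwise conv (doubling channels), GLU (halving them again), a depthwise conv with either symmetric or causal left-only padding, norm plus activation, and a final pointwise conv. A self-contained sketch of that flow with illustrative sizes (not the wenet module itself):

```python
import torch
from torch import nn

channels, kernel_size, causal = 16, 15, True
batch, time = 2, 50

x = torch.randn(batch, channels, time)            # (batch, channels, time)

pointwise_conv1 = nn.Conv1d(channels, 2 * channels, kernel_size=1)
depthwise_conv = nn.Conv1d(channels, channels, kernel_size,
                           padding=0 if causal else (kernel_size - 1) // 2,
                           groups=channels)
norm = nn.BatchNorm1d(channels)
pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1)

lorder = kernel_size - 1 if causal else 0
if lorder > 0:
    # Causal case: pad only on the left so no future frames are consumed.
    x = nn.functional.pad(x, (lorder, 0), 'constant', 0.0)

y = pointwise_conv1(x)                            # (batch, 2*channels, ...)
y = nn.functional.glu(y, dim=1)                   # GLU halves the channels
y = depthwise_conv(y)                             # back to (batch, channels, time)
y = torch.relu(norm(y))
y = pointwise_conv2(y)
print(y.shape)                                    # torch.Size([2, 16, 50])
```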
- - Args: - hs_pad: batch of padded hidden state sequences (B, Tmax, D) - hlens: batch of lengths of hidden state sequences (B) - ys_pad: batch of padded character id sequence tensor (B, Lmax) - ys_lens: batch of lengths of character sequence (B) - """ - # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) - ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) - # ys_hat: (B, L, D) -> (L, B, D) - ys_hat = ys_hat.transpose(0, 1) - ys_hat = ys_hat.log_softmax(2) - loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) - # Batch-size average - loss = loss / ys_hat.size(1) - return loss - - def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """log_softmax of frame activations - - Args: - Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) - """ - return F.log_softmax(self.ctc_lo(hs_pad), dim=2) - - def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """argmax of frame activations - - Args: - torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: argmax applied 2d tensor (B, Tmax) - """ - return torch.argmax(self.ctc_lo(hs_pad), dim=2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/decoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/decoder.py deleted file mode 100644 index c31853d9e868c99290b8d597f53d9a680202c82c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/decoder.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Decoder definition.""" -from typing import Tuple, List, Optional - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.decoder_layer import DecoderLayer -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.utils.mask import (subsequent_mask, make_pad_mask) - - -class TransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. 
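Reviewer note: as a sanity check of the `(L, B, D)` layout the deleted CTC module above feeds to `torch.nn.CTCLoss`, here is a standalone sketch with random activations (batch size, frame count and vocabulary size are made up):

```python
import torch

batch, max_frames, vocab = 4, 100, 30            # illustrative sizes
hs = torch.randn(batch, max_frames, vocab)       # pretend encoder projections (B, T, V)

# CTCLoss expects log-probs shaped (T, B, V) plus per-utterance lengths.
log_probs = hs.transpose(0, 1).log_softmax(2)    # (T, B, V)
hlens = torch.full((batch,), max_frames, dtype=torch.long)
ys_lens = torch.tensor([12, 9, 15, 7])
ys_pad = torch.randint(1, vocab, (batch, int(ys_lens.max())))  # 0 is reserved for blank

ctc_loss = torch.nn.CTCLoss(reduction="sum")
loss = ctc_loss(log_probs, ys_pad, hlens, ys_lens) / batch     # batch-size average
print(loss.item())
```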
- concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__() - attention_dim = encoder_output_size - - if input_layer == "embed": - self.embed = torch.nn.Sequential( - torch.nn.Embedding(vocab_size, attention_dim), - PositionalEncoding(attention_dim, positional_dropout_rate), - ) - else: - raise ValueError(f"only 'embed' is supported: {input_layer}") - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) - self.use_output_layer = use_output_layer - self.output_layer = torch.nn.Linear(attention_dim, vocab_size) - self.num_blocks = num_blocks - self.decoders = torch.nn.ModuleList([ - DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(self.num_blocks) - ]) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor = torch.empty(0), - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: not used in transformer decoder, in order to unify api - with bidirectional decoder - reverse_weight: not used in transformer decoder, in order to unify - api with bidirectional decode - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - torch.tensor(0.0), in order to unify api with bidirectional decoder - olens: (batch, ) - """ - tgt = ys_in_pad - maxlen = tgt.size(1) - # tgt_mask: (B, 1, L) - tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) - tgt_mask = tgt_mask.to(tgt.device) - # m: (1, L, L) - m = subsequent_mask(tgt_mask.size(-1), - device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m - x, _ = self.embed(tgt) - for layer in self.decoders: - x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, - memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.use_output_layer: - x = self.output_layer(x) - olens = tgt_mask.sum(1) - return x, torch.tensor(0.0), olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. 
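Reviewer note: the deleted `TransformerDecoder.forward` above ANDs a padding mask with a lower-triangular `subsequent_mask`, so each target position attends only to itself and earlier, non-padded tokens. A small sketch of that mask construction using simplified stand-ins for wenet's `make_pad_mask`/`subsequent_mask` helpers (assumed semantics, not the originals):

```python
import torch

ys_in_lens = torch.tensor([5, 3])        # two target sequences of length 5 and 3
maxlen = int(ys_in_lens.max())

# Padding mask: True where a real token exists, shape (B, 1, L).
positions = torch.arange(maxlen).unsqueeze(0)                 # (1, L)
tgt_mask = (positions < ys_in_lens.unsqueeze(1)).unsqueeze(1)

# Subsequent (causal) mask: True at or below the diagonal, shape (1, L, L).
m = torch.tril(torch.ones(maxlen, maxlen, dtype=torch.bool)).unsqueeze(0)

# Combined mask, shape (B, L, L): position i may attend to j only if
# j <= i and position j is not padding.
tgt_mask = tgt_mask & m
print(tgt_mask.shape)     # torch.Size([2, 5, 5])
print(tgt_mask[1].int())  # columns 3 and 4 are masked for the shorter sample
```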
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - x, _ = self.embed(tgt) - new_cache = [] - for i, decoder in enumerate(self.decoders): - if cache is None: - c = None - else: - c = cache[i] - x, tgt_mask, memory, memory_mask = decoder(x, - tgt_mask, - memory, - memory_mask, - cache=c) - new_cache.append(x) - if self.normalize_before: - y = self.after_norm(x[:, -1]) - else: - y = x[:, -1] - if self.use_output_layer: - y = torch.log_softmax(self.output_layer(y), dim=-1) - return y, new_cache - - -class BiTransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - r_num_blocks: the number of right to left decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - r_num_blocks: int = 0, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - - assert check_argument_types() - super().__init__() - self.left_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - self.right_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - r_num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor, - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), - used for right to left decoder - reverse_weight: used for right to left decoder - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - r_x: x: decoded token score (right to left decoder) - before softmax (batch, maxlen_out, vocab_size) - if use_output_layer is True, - olens: (batch, ) - """ - l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, - ys_in_lens) - r_x = torch.tensor(0.0) - if reverse_weight > 0.0: - r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, - ys_in_lens) - return l_x, r_x, olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - return self.left_decoder.forward_one_step(memory, memory_mask, tgt, - tgt_mask, cache) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/decoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/decoder_layer.py deleted file mode 100644 index 6b52aa6ab730dc51b18f0787e8236ab10c1e9cad..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/decoder_layer.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoder self-attention layer definition.""" -from typing import Optional, Tuple - -import torch -from torch import nn - - -class DecoderLayer(nn.Module): - """Single decoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn (torch.nn.Module): Inter-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. 
- normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's inpu - and output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: nn.Module, - src_attn: nn.Module, - feed_forward: nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an DecoderLayer object.""" - super().__init__() - self.size = size - self.self_attn = self_attn - self.src_attn = src_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.norm3 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear1 = nn.Linear(size + size, size) - self.concat_linear2 = nn.Linear(size + size, size) - else: - self.concat_linear1 = nn.Identity() - self.concat_linear2 = nn.Identity() - - def forward( - self, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - memory: torch.Tensor, - memory_mask: torch.Tensor, - cache: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute decoded features. - - Args: - tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). - tgt_mask (torch.Tensor): Mask for input tensor - (#batch, maxlen_out). - memory (torch.Tensor): Encoded memory - (#batch, maxlen_in, size). - memory_mask (torch.Tensor): Encoded memory mask - (#batch, maxlen_in). - cache (torch.Tensor): cached tensors. - (#batch, maxlen_out - 1, size). - - Returns: - torch.Tensor: Output tensor (#batch, maxlen_out, size). - torch.Tensor: Mask for output tensor (#batch, maxlen_out). - torch.Tensor: Encoded memory (#batch, maxlen_in, size). - torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
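Reviewer note: the `normalize_before` flag documented above selects between pre-norm (LayerNorm inside the residual branch) and post-norm (LayerNorm after the residual add). A compact sketch of the two orderings for a single sub-block, with a plain linear layer standing in for the attention or feed-forward module (illustrative only):

```python
import torch
from torch import nn

size = 8
x = torch.randn(2, 10, size)
norm = nn.LayerNorm(size, eps=1e-5)
sublayer = nn.Linear(size, size)     # stand-in for self-attention / feed-forward
dropout = nn.Dropout(0.1)

# Pre-norm (normalize_before=True): x -> x + Dropout(Sublayer(LayerNorm(x)))
pre = x + dropout(sublayer(norm(x)))

# Post-norm (normalize_before=False): x -> LayerNorm(x + Dropout(Sublayer(x)))
post = norm(x + dropout(sublayer(x)))

print(pre.shape, post.shape)         # both torch.Size([2, 10, 8])
```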
- - """ - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - - if cache is None: - tgt_q = tgt - tgt_q_mask = tgt_mask - else: - # compute only the last frame query keeping dim: max_time_out -> 1 - assert cache.shape == ( - tgt.shape[0], - tgt.shape[1] - 1, - self.size, - ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" - tgt_q = tgt[:, -1:, :] - residual = residual[:, -1:, :] - tgt_q_mask = tgt_mask[:, -1:, :] - - if self.concat_after: - tgt_concat = torch.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) - x = residual + self.concat_linear1(tgt_concat) - else: - x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - if self.concat_after: - x_concat = torch.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) - x = residual + self.concat_linear2(x_concat) - else: - x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) - if not self.normalize_before: - x = self.norm2(x) - - residual = x - if self.normalize_before: - x = self.norm3(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm3(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, tgt_mask, memory, memory_mask diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/embedding.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/embedding.py deleted file mode 100644 index 611a927864d93c3ad8357f66c780bf537b2a4d67..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/embedding.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Positonal Encoding Module.""" - -import math -from typing import Tuple, Union - -import torch -import torch.nn.functional as F - -class PositionalEncoding(torch.nn.Module): - """Positional encoding. 
- - :param int d_model: embedding dim - :param float dropout_rate: dropout rate - :param int max_len: maximum input length - - PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) - PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) - """ - def __init__(self, - d_model: int, - dropout_rate: float, - max_len: int = 5000, - reverse: bool = False): - """Construct an PositionalEncoding object.""" - super().__init__() - self.d_model = d_model - self.xscale = math.sqrt(self.d_model) - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - - self.pe = torch.zeros(self.max_len, self.d_model) - position = torch.arange(0, self.max_len, - dtype=torch.float32).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = torch.sin(position * div_term) - self.pe[:, 1::2] = torch.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input. Its shape is (batch, time, ...) - offset (int, torch.tensor): position offset - - Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) - torch.Tensor: for compatibility to RelPositionalEncoding - """ - - self.pe = self.pe.to(x.device) - pos_emb = self.position_encoding(offset, x.size(1), False) - x = x * self.xscale + pos_emb - return self.dropout(x), self.dropout(pos_emb) - - def position_encoding(self, offset: Union[int, torch.Tensor], size: int, - apply_dropout: bool = True) -> torch.Tensor: - """ For getting encoding in a streaming fashion - - Attention!!!!! - we apply dropout only once at the whole utterance level in a none - streaming way, but will call this function several times with - increasing input size in a streaming scenario, so the dropout will - be applied several times. - - Args: - offset (int or torch.tensor): start offset - size (int): required size of position encoding - - Returns: - torch.Tensor: Corresponding encoding - """ - # How to subscript a Union type: - # https://github.com/pytorch/pytorch/issues/69434 - if isinstance(offset, int): - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - else: # for batched streaming decoding on GPU - assert torch.max(offset) + size < self.max_len - index = offset.unsqueeze(1) + \ - torch.arange(0, size).to(offset.device) # B X T - flag = index > 0 - # remove negative offset - index = index * flag - pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model - - if apply_dropout: - pos_emb = self.dropout(pos_emb) - return pos_emb - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding module. - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - """ - def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): - """Initialize class.""" - super().__init__(d_model, dropout_rate, max_len, reverse=True) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Compute positional encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). 
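Reviewer note: a standalone sketch of the sinusoidal table the deleted `PositionalEncoding` above precomputes, PE(pos, 2i) = sin(pos / 10000^(2i/d_model)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model)), plus the streaming-style slice done by `position_encoding`. Sizes are toy values, not the model's:

```python
import math
import torch

d_model, max_len = 8, 16                      # toy sizes
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
div_term = torch.exp(
    torch.arange(0, d_model, 2, dtype=torch.float32)
    * -(math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)  # even dimensions
pe[:, 1::2] = torch.cos(position * div_term)  # odd dimensions
pe = pe.unsqueeze(0)                          # (1, max_len, d_model)

# Streaming lookup: take the block starting at `offset` for the next `size` frames.
offset, size = 4, 6
pos_emb = pe[:, offset:offset + size]
print(pos_emb.shape)                          # torch.Size([1, 6, 8])
```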
- Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Positional embedding tensor (1, time, `*`). - """ - self.pe = self.pe.to(x.device) - x = x * self.xscale - pos_emb = self.position_encoding(offset, x.size(1), False) - return self.dropout(x), self.dropout(pos_emb) - - -class NoPositionalEncoding(torch.nn.Module): - """ No position encoding - """ - def __init__(self, d_model: int, dropout_rate: float): - super().__init__() - self.d_model = d_model - self.dropout = torch.nn.Dropout(p=dropout_rate) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """ Just return zero vector for interface compatibility - """ - pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) - return self.dropout(x), pos_emb - - def position_encoding( - self, offset: Union[int, torch.Tensor], size: int) -> torch.Tensor: - return torch.zeros(1, size, self.d_model) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/encoder.py deleted file mode 100644 index bb2ec65827548bd1242cb3b367cb3983c2de6119..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/encoder.py +++ /dev/null @@ -1,462 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder definition.""" -from typing import Tuple - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.convolution import ConvolutionModule -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.encoder_layer import TransformerEncoderLayer -from wenet.transformer.encoder_layer import ConformerEncoderLayer -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class BaseEncoder(torch.nn.Module): - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ - Args: - input_size (int): input dim - output_size (int): dimension of attention - attention_heads (int): the number of heads of multi head attention - linear_units (int): the hidden units number of position-wise feed - forward - num_blocks (int): the number of decoder blocks - dropout_rate (float): dropout rate - attention_dropout_rate (float): dropout rate in attention - positional_dropout_rate (float): dropout rate after adding - positional encoding - input_layer (str): input layer type. - optional [linear, conv2d, conv2d6, conv2d8] - pos_enc_layer_type (str): Encoder positional encoding layer type. - opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] - normalize_before (bool): - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after (bool): whether to concat attention layer's input - and output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - static_chunk_size (int): chunk size for static chunk training and - decoding - use_dynamic_chunk (bool): whether use dynamic chunk size for - training or not, You can only use fixed chunk(chunk_size > 0) - or dyanmic chunk size(use_dynamic_chunk = True) - global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module - use_dynamic_left_chunk (bool): whether use dynamic left chunk in - dynamic chunk training - """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
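Reviewer note: the encoder `forward` above derives its `(B, 1, T)` batch padding mask from the utterance lengths via `make_pad_mask` and a `~` inversion. A minimal stand-in showing the assumed semantics (True marks real frames after inversion); this is not wenet's implementation:

```python
import torch

def make_pad_mask(lengths: torch.Tensor, max_len: int) -> torch.Tensor:
    """Simplified stand-in: True where a frame is padding, shape (B, T)."""
    positions = torch.arange(max_len, device=lengths.device).unsqueeze(0)
    return positions >= lengths.unsqueeze(1)

xs_lens = torch.tensor([7, 4, 6])                 # three utterances in the batch
T = 8                                             # padded feature length
masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)   # (B, 1, T), True = real frame
print(masks.int())                                # rows show 7, 4 and 6 leading ones
```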
- - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks - - -class TransformerEncoder(BaseEncoder): - """Transformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ Construct TransformerEncoder - - See Encoder for the meaning of each parameter. 
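Reviewer note: the streaming loop in `forward_chunk_by_chunk` above slides an input window of `(chunk_size - 1) * subsampling + context` frames with a stride of `subsampling * chunk_size`. A sketch of only that windowing arithmetic; `subsampling=4` mirrors the 1/4 conv frontend and `right_context=6` is an assumption for illustration:

```python
# Windowing arithmetic only; all concrete numbers are illustrative.
subsampling = 4
right_context = 6                    # assumed value for a conv2d (1/4) frontend
context = right_context + 1          # add the current frame
decoding_chunk_size = 16             # encoder output frames per chunk
num_frames = 200                     # total input feature frames

stride = subsampling * decoding_chunk_size                            # 64 input frames
decoding_window = (decoding_chunk_size - 1) * subsampling + context   # 67 input frames

for cur in range(0, num_frames - context + 1, stride):
    end = min(cur + decoding_window, num_frames)
    print(f"feed input frames [{cur}, {end}) -> one chunk of encoder output")
# feed input frames [0, 67)    -> one chunk of encoder output
# feed input frames [64, 131)  -> one chunk of encoder output
# feed input frames [128, 195) -> one chunk of encoder output
# feed input frames [192, 200) -> one chunk of encoder output
```

Overlapping the windows this way trades a little recomputation for not having to cache the subsampling module's state, which is the design choice explained in the deleted docstring above.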
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - self.encoders = torch.nn.ModuleList([ - TransformerEncoderLayer( - output_size, - MultiHeadedAttention(attention_heads, output_size, - attention_dropout_rate), - PositionwiseFeedForward(output_size, linear_units, - dropout_rate), dropout_rate, - normalize_before, concat_after) for _ in range(num_blocks) - ]) - - -class ConformerEncoder(BaseEncoder): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - positionwise_conv_kernel_size: int = 1, - macaron_style: bool = True, - selfattention_layer_type: str = "rel_selfattn", - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - ): - """Construct ConformerEncoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - positionwise_conv_kernel_size (int): Kernel size of positionwise - conv1d layer. - macaron_style (bool): Whether to use macaron style for - positionwise layer. - selfattention_layer_type (str): Encoder attention layer type, - the parameter has no effect now, it's just for configure - compatibility. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, - cnn_module_norm, causal) - - self.encoders = torch.nn.ModuleList([ - ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(num_blocks) - ]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/encoder_layer.py deleted file mode 100644 index 6b4629a6802a90422fa1494f82f46488f2553c16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/encoder_layer.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple - -import torch -from torch import nn - - -class TransformerEncoderLayer(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: torch.nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): just for interface compatibility - to ConformerEncoderLayer - mask_pad (torch.Tensor): does not used in transformer layer, - just for unified api with conformer. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2), not used here, it's for interface - compatibility to ConformerEncoderLayer. - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). - - """ - residual = x - if self.normalize_before: - x = self.norm1(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, cache=att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - return x, mask, new_att_cache, fake_cnn_cache - - -class ConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
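Reviewer note: the forward body that follows applies the macaron ordering with pre-norm: a half-weighted feed-forward, self-attention, the convolution module, a second half-weighted feed-forward, then a final LayerNorm. A condensed sketch of that ordering with linear layers standing in for the real sub-modules (shapes and sizes are illustrative, not the model's):

```python
import torch
from torch import nn

size = 16
x = torch.randn(2, 20, size)

ff_macaron = nn.Linear(size, size)   # stand-ins for the real sub-modules
self_attn = nn.Linear(size, size)
conv_module = nn.Linear(size, size)
feed_forward = nn.Linear(size, size)
norms = [nn.LayerNorm(size, eps=1e-5) for _ in range(5)]
norm_ff_macaron, norm_mha, norm_conv, norm_ff, norm_final = norms
dropout, ff_scale = nn.Dropout(0.1), 0.5

x = x + ff_scale * dropout(ff_macaron(norm_ff_macaron(x)))   # half-step FFN
x = x + dropout(self_attn(norm_mha(x)))                      # self-attention
x = x + dropout(conv_module(norm_conv(x)))                   # convolution module
x = x + ff_scale * dropout(feed_forward(norm_ff(x)))         # second half-step FFN
x = norm_final(x)                                            # final LayerNorm
print(x.shape)                                               # torch.Size([2, 20, 16])
```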
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/label_smoothing_loss.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/label_smoothing_loss.py deleted file mode 100644 index 428fedcb0eb4345cd1361c97008a9afcd94ac171..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/label_smoothing_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Label smoothing module.""" - -import torch -from torch import nn - - -class LabelSmoothingLoss(nn.Module): - """Label-smoothing loss. - - In a standard CE loss, the label's data distribution is: - [0,1,2] -> - [ - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0], - ] - - In the smoothing version CE Loss,some probabilities - are taken from the true label prob (1.0) and are divided - among other labels. - - e.g. 
- smoothing=0.1 - [0,1,2] -> - [ - [0.9, 0.05, 0.05], - [0.05, 0.9, 0.05], - [0.05, 0.05, 0.9], - ] - - Args: - size (int): the number of class - padding_idx (int): padding class id which will be ignored for loss - smoothing (float): smoothing rate (0.0 means the conventional CE) - normalize_length (bool): - normalize loss by sequence length if True - normalize loss by batch size if False - """ - def __init__(self, - size: int, - padding_idx: int, - smoothing: float, - normalize_length: bool = False): - """Construct an LabelSmoothingLoss object.""" - super(LabelSmoothingLoss, self).__init__() - self.criterion = nn.KLDivLoss(reduction="none") - self.padding_idx = padding_idx - self.confidence = 1.0 - smoothing - self.smoothing = smoothing - self.size = size - self.normalize_length = normalize_length - - def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """Compute loss between x and target. - - The model outputs and data labels tensors are flatten to - (batch*seqlen, class) shape and a mask is applied to the - padding part which should not be calculated for loss. - - Args: - x (torch.Tensor): prediction (batch, seqlen, class) - target (torch.Tensor): - target signal masked with self.padding_id (batch, seqlen) - Returns: - loss (torch.Tensor) : The KL loss, scalar float value - """ - assert x.size(2) == self.size - batch_size = x.size(0) - x = x.view(-1, self.size) - target = target.view(-1) - # use zeros_like instead of torch.no_grad() for true_dist, - # since no_grad() can not be exported by JIT - true_dist = torch.zeros_like(x) - true_dist.fill_(self.smoothing / (self.size - 1)) - ignore = target == self.padding_idx # (B,) - total = len(target) - ignore.sum().item() - target = target.masked_fill(ignore, 0) # avoid -1 index - true_dist.scatter_(1, target.unsqueeze(1), self.confidence) - kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) - denom = total if self.normalize_length else batch_size - return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/positionwise_feed_forward.py deleted file mode 100644 index 73ba239e3f1e68f65650961f2c4ee6758729a06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/positionwise_feed_forward.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. 
- dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU()): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/subsampling.py deleted file mode 100644 index 5f2823eedf0e623188d6af6680fa50ca44b47877..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/subsampling.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch - - -class BaseSubsampling(torch.nn.Module): - def __init__(self): - super().__init__() - self.right_context = 0 - self.subsampling_rate = 1 - - def position_encoding(self, offset: Union[int, torch.Tensor], - size: int) -> torch.Tensor: - return self.pos_enc.position_encoding(offset, size) - - -class LinearNoSubsampling(BaseSubsampling): - """Linear transform the input without subsampling - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an linear object.""" - super().__init__() - self.out = torch.nn.Sequential( - torch.nn.Linear(idim, odim), - torch.nn.LayerNorm(odim, eps=1e-5), - torch.nn.Dropout(dropout_rate), - ) - self.pos_enc = pos_enc_class - self.right_context = 0 - self.subsampling_rate = 1 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Input x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: linear input tensor (#batch, time', odim), - where time' = time . - torch.Tensor: linear input mask (#batch, 1, time'), - where time' = time . - - """ - x = self.out(x) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask - - -class Conv2dSubsampling4(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. 
- odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 4. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 4. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] - - -class Conv2dSubsampling6(BaseSubsampling): - """Convolutional 2D subsampling (to 1/6 length). - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (torch.nn.Module): Custom position encoding layer. - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling6 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 5, 3), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), - odim) - self.pos_enc = pos_enc_class - # 10 = (3 - 1) * 1 + (5 - 1) * 2 - self.subsampling_rate = 6 - self.right_context = 10 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 6. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 6. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] - - -class Conv2dSubsampling8(BaseSubsampling): - """Convolutional 2D subsampling (to 1/8 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling8 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear( - odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) - self.pos_enc = pos_enc_class - self.subsampling_rate = 8 - # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 - self.right_context = 14 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 8. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 8. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/swish.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/swish.py deleted file mode 100644 index b4250f5c93104f38958d145572e363256e03fcb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/transformer/swish.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) -# 2020 Northwestern Polytechnical University (Pengcheng Guo) -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Swish() activation function for Conformer.""" - -import torch - - -class Swish(torch.nn.Module): - """Construct an Swish object.""" - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Return Swish activation function.""" - return x * torch.sigmoid(x) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/checkpoint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/checkpoint.py deleted file mode 100644 index 8e0c413c79c34cd667240357d7ef9eab816a885c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/checkpoint.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import re - -import yaml -import torch -from collections import OrderedDict - -import datetime - - -def load_checkpoint(model: torch.nn.Module, path: str) -> dict: - if torch.cuda.is_available(): - logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) - checkpoint = torch.load(path) - else: - logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) - checkpoint = torch.load(path, map_location='cpu') - model.load_state_dict(checkpoint, strict=False) - info_path = re.sub('.pt$', '.yaml', path) - configs = {} - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - return configs - - -def save_checkpoint(model: torch.nn.Module, path: str, infos=None): - ''' - Args: - infos (dict or None): any info you want to save. - ''' - logging.info('Checkpoint: save to checkpoint %s' % path) - if isinstance(model, torch.nn.DataParallel): - state_dict = model.module.state_dict() - elif isinstance(model, torch.nn.parallel.DistributedDataParallel): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save(state_dict, path) - info_path = re.sub('.pt$', '.yaml', path) - if infos is None: - infos = {} - infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') - with open(info_path, 'w') as fout: - data = yaml.dump(infos) - fout.write(data) - - -def filter_modules(model_state_dict, modules): - new_mods = [] - incorrect_mods = [] - mods_model = model_state_dict.keys() - for mod in modules: - if any(key.startswith(mod) for key in mods_model): - new_mods += [mod] - else: - incorrect_mods += [mod] - if incorrect_mods: - logging.warning( - "module(s) %s don't match or (partially match) " - "available modules in model.", - incorrect_mods, - ) - logging.warning("for information, the existing modules in model are:") - logging.warning("%s", mods_model) - - return new_mods - - -def load_trained_modules(model: torch.nn.Module, args: None): - # Load encoder modules with pre-trained model(s). 
- enc_model_path = args.enc_init - enc_modules = args.enc_init_mods - main_state_dict = model.state_dict() - logging.warning("model(s) found for pre-initialization") - if os.path.isfile(enc_model_path): - logging.info('Checkpoint: loading from checkpoint %s for CPU' % - enc_model_path) - model_state_dict = torch.load(enc_model_path, map_location='cpu') - modules = filter_modules(model_state_dict, enc_modules) - partial_state_dict = OrderedDict() - for key, value in model_state_dict.items(): - if any(key.startswith(m) for m in modules): - partial_state_dict[key] = value - main_state_dict.update(partial_state_dict) - else: - logging.warning("model was not found : %s", enc_model_path) - - model.load_state_dict(main_state_dict) - configs = {} - return configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/cmvn.py deleted file mode 100644 index 3101c619f54991c947124f393f3459c317356a2f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/cmvn.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import math - -import numpy as np - - -def _load_json_cmvn(json_cmvn_file): - """ Load the json format cmvn stats file and calculate cmvn - - Args: - json_cmvn_file: cmvn stats file in json format - - Returns: - a numpy array of [means, vars] - """ - with open(json_cmvn_file) as f: - cmvn_stats = json.load(f) - - means = cmvn_stats['mean_stat'] - variance = cmvn_stats['var_stat'] - count = cmvn_stats['frame_num'] - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def _load_kaldi_cmvn(kaldi_cmvn_file): - """ Load the kaldi format cmvn stats file and calculate cmvn - - Args: - kaldi_cmvn_file: kaldi text style global cmvn file, which - is generated by: - compute-cmvn-stats --binary=false scp:feats.scp global_cmvn - - Returns: - a numpy array of [means, vars] - """ - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def load_cmvn(cmvn_file, is_json): - if is_json: - cmvn = _load_json_cmvn(cmvn_file) - else: - cmvn = _load_kaldi_cmvn(cmvn_file) - return cmvn[0], cmvn[1] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/common.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/common.py deleted file mode 100644 index 74238d59aefbf227fe6b811703af17550bc7f8f0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/common.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -"""Unility functions for Transformer.""" - -import math -from typing import List, Tuple - -import torch -from torch.nn.utils.rnn import pad_sequence - -IGNORE_ID = -1 - - -def pad_list(xs: List[torch.Tensor], pad_value: int): - """Perform padding for the list of tensors. - - Args: - xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 
- pad_value (float): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tmax, `*`). - - Examples: - >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) - - """ - n_batch = len(xs) - max_len = max([x.size(0) for x in xs]) - pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) - pad = pad.fill_(pad_value) - for i in range(n_batch): - pad[i, :xs[i].size(0)] = xs[i] - - return pad - - -def add_blank(ys_pad: torch.Tensor, blank: int, - ignore_id: int) -> torch.Tensor: - """ Prepad blank for transducer predictor - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - blank (int): index of - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> blank = 0 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in = add_blank(ys_pad, 0, -1) - >>> ys_in - tensor([[0, 1, 2, 3, 4, 5], - [0, 4, 5, 6, 0, 0], - [0, 7, 8, 9, 0, 0]]) - """ - bs = ys_pad.size(0) - _blank = torch.tensor([blank], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _blank = _blank.repeat(bs).unsqueeze(1) # [bs,1] - out = torch.cat([_blank, ys_pad], dim=1) # [bs, Lmax+1] - return torch.where(out == ignore_id, blank, out) - - -def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, - ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Add and labels. - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - sos (int): index of - eos (int): index of - ignore_id (int): index of padding - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - ys_out (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> sos_id = 10 - >>> eos_id = 11 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) - >>> ys_in - tensor([[10, 1, 2, 3, 4, 5], - [10, 4, 5, 6, 11, 11], - [10, 7, 8, 9, 11, 11]]) - >>> ys_out - tensor([[ 1, 2, 3, 4, 5, 11], - [ 4, 5, 6, 11, -1, -1], - [ 7, 8, 9, 11, -1, -1]]) - """ - _sos = torch.tensor([sos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _eos = torch.tensor([eos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys - ys_in = [torch.cat([_sos, y], dim=0) for y in ys] - ys_out = [torch.cat([y, _eos], dim=0) for y in ys] - return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) - - -def reverse_pad_list(ys_pad: torch.Tensor, - ys_lens: torch.Tensor, - pad_value: float = -1.0) -> torch.Tensor: - """Reverse padding for the list of tensors. - - Args: - ys_pad (tensor): The padded tensor (B, Tokenmax). - ys_lens (tensor): The lens of token seqs (B) - pad_value (int): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tokenmax). - - Examples: - >>> x - tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) - >>> pad_list(x, 0) - tensor([[4, 3, 2, 1], - [7, 6, 5, 0], - [9, 8, 0, 0]]) - - """ - r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) - for y, i in zip(ys_pad, ys_lens)], True, - pad_value) - return r_ys_pad - - -def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, - ignore_label: int) -> float: - """Calculate accuracy. - - Args: - pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 
- pad_targets (LongTensor): Target label tensors (B, Lmax). - ignore_label (int): Ignore label id. - - Returns: - float: Accuracy value (0.0 - 1.0). - - """ - pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), - pad_outputs.size(1)).argmax(2) - mask = pad_targets != ignore_label - numerator = torch.sum( - pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - denominator = torch.sum(mask) - return float(numerator) / float(denominator) - - -def get_rnn(rnn_type: str) -> torch.nn.Module: - assert rnn_type in ["rnn", "lstm", "gru"] - if rnn_type == "rnn": - return torch.nn.RNN - elif rnn_type == "lstm": - return torch.nn.LSTM - else: - return torch.nn.GRU - - -def get_activation(act): - """Return activation function.""" - # Lazy load to avoid unused import - from wenet.transformer.swish import Swish - - activation_funcs = { - "hardtanh": torch.nn.Hardtanh, - "tanh": torch.nn.Tanh, - "relu": torch.nn.ReLU, - "selu": torch.nn.SELU, - "swish": getattr(torch.nn, "SiLU", Swish), - "gelu": torch.nn.GELU - } - - return activation_funcs[act]() - - -def get_subsample(config): - input_layer = config["encoder_conf"]["input_layer"] - assert input_layer in ["conv2d", "conv2d6", "conv2d8"] - if input_layer == "conv2d": - return 4 - elif input_layer == "conv2d6": - return 6 - elif input_layer == "conv2d8": - return 8 - - -def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - if hyp[cur] != 0: - new_hyp.append(hyp[cur]) - prev = cur - while cur < len(hyp) and hyp[cur] == hyp[prev]: - cur += 1 - return new_hyp - - -def replace_duplicates_with_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - new_hyp.append(hyp[cur]) - prev = cur - cur += 1 - while cur < len(hyp) and hyp[cur] == hyp[prev] and hyp[cur] != 0: - new_hyp.append(0) - cur += 1 - return new_hyp - - -def log_add(args: List[int]) -> float: - """ - Stable log add - """ - if all(a == -float('inf') for a in args): - return -float('inf') - a_max = max(args) - lsp = math.log(sum(math.exp(a - a_max) for a in args)) - return a_max + lsp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/config.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/config.py deleted file mode 100644 index 50170ced44534d3ee6532a2f87fcd78c5148f7e7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 Shaoshang Qi -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import copy - -def override_config(configs, override_list): - new_configs = copy.deepcopy(configs) - for item in override_list: - arr = item.split() - if len(arr) != 2: - print(f"the overrive {item} format not correct, skip it") - continue - keys = arr[0].split('.') - s_configs = new_configs - for i, key in enumerate(keys): - if key not in s_configs: - print(f"the overrive {item} format not correct, skip it") - if i == len(keys) - 1: - param_type = type(s_configs[key]) - if param_type != bool: - s_configs[key] = param_type(arr[1]) - else: - s_configs[key] = arr[1] in ['true', 'True'] - print(f"override {arr[0]} with {arr[1]}") - else: - s_configs = s_configs[key] - return new_configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/ctc_util.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/ctc_util.py deleted file mode 100644 index 73b8fb272ac153dd6d05207f352ebcf1ad14890d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/ctc_util.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch - -def insert_blank(label, blank_id=0): - """Insert blank token between every two label token.""" - label = np.expand_dims(label, 1) - blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id - label = np.concatenate([blanks, label], axis=1) - label = label.reshape(-1) - label = np.append(label, label[0]) - return label - -def forced_align(ctc_probs: torch.Tensor, - y: torch.Tensor, - blank_id=0) -> list: - """ctc forced alignment. 
- - Args: - torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) - torch.Tensor y: id sequence tensor 1d tensor (L) - int blank_id: blank symbol index - Returns: - torch.Tensor: alignment result - """ - y_insert_blank = insert_blank(y, blank_id) - - log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) - log_alpha = log_alpha - float('inf') # log of zero - state_path = (torch.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 - ) # state path - - # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] - - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): - if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ - s] == y_insert_blank[s - 2]: - candidates = torch.tensor( - [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) - prev_state = [s, s - 1] - else: - candidates = torch.tensor([ - log_alpha[t - 1, s], - log_alpha[t - 1, s - 1], - log_alpha[t - 1, s - 2], - ]) - prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] - state_path[t, s] = prev_state[torch.argmax(candidates)] - - state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) - - candidates = torch.tensor([ - log_alpha[-1, len(y_insert_blank) - 1], - log_alpha[-1, len(y_insert_blank) - 2] - ]) - prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] - state_seq[-1] = prev_state[torch.argmax(candidates)] - for t in range(ctc_probs.size(0) - 2, -1, -1): - state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] - - output_alignment = [] - for t in range(0, ctc_probs.size(0)): - output_alignment.append(y_insert_blank[state_seq[t, 0]]) - - return output_alignment diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/executor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/executor.py deleted file mode 100644 index dc0b69e6e32055566a0e8c41945f6979276e5672..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/executor.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -from contextlib import nullcontext - -# if your python version < 3.7 use the below one -# from contextlib import suppress as nullcontext -import torch -from torch.nn.utils import clip_grad_norm_ - - -class Executor: - - def __init__(self): - self.step = 0 - - def train(self, model, optimizer, scheduler, data_loader, device, writer, - args, scaler): - ''' Train one epoch - ''' - model.train() - clip = args.get('grad_clip', 50.0) - log_interval = args.get('log_interval', 10) - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - accum_grad = args.get('accum_grad', 1) - is_distributed = args.get('is_distributed', True) - use_amp = args.get('use_amp', False) - logging.info('using accumulate grad, new batch size is {} times' - ' larger than before'.format(accum_grad)) - if use_amp: - assert scaler is not None - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - model_context = model.join - else: - model_context = nullcontext - num_seen_utts = 0 - with model_context(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - context = None - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if is_distributed and batch_idx % accum_grad != 0: - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - with context(): - # autocast context - # The more details about amp can be found in - # https://pytorch.org/docs/stable/notes/amp_examples.html - with torch.cuda.amp.autocast(scaler is not None): - loss_dict = model(feats, feats_lengths, target, - target_lengths) - loss = loss_dict['loss'] / accum_grad - if use_amp: - scaler.scale(loss).backward() - else: - loss.backward() - - num_seen_utts += num_utts - if batch_idx % accum_grad == 0: - if rank == 0 and writer is not None: - writer.add_scalar('train_loss', loss, self.step) - # Use mixed precision training - if use_amp: - scaler.unscale_(optimizer) - grad_norm = clip_grad_norm_(model.parameters(), clip) - # Must invoke scaler.update() if unscale_() is used in - # the iteration to avoid the following error: - # RuntimeError: unscale_() has already been called - # on this optimizer since the last update(). - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). 
- scaler.step(optimizer) - scaler.update() - else: - grad_norm = clip_grad_norm_(model.parameters(), clip) - if torch.isfinite(grad_norm): - optimizer.step() - optimizer.zero_grad() - scheduler.step() - self.step += 1 - if batch_idx % log_interval == 0: - lr = optimizer.param_groups[0]['lr'] - log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, - loss.item() * accum_grad) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'lr {:.8f} rank {}'.format(lr, rank) - logging.debug(log_str) - - def cv(self, model, data_loader, device, args): - ''' Cross validation on - ''' - model.eval() - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - log_interval = args.get('log_interval', 10) - # in order to avoid division by 0 - num_seen_utts = 1 - total_loss = 0.0 - with torch.no_grad(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - loss_dict = model(feats, feats_lengths, target, target_lengths) - loss = loss_dict['loss'] - if torch.isfinite(loss): - num_seen_utts += num_utts - total_loss += loss.item() * num_utts - if batch_idx % log_interval == 0: - log_str = 'CV Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, loss.item()) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'history loss {:.6f}'.format(total_loss / - num_seen_utts) - log_str += ' rank {}'.format(rank) - logging.debug(log_str) - return total_loss, num_seen_utts diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/file_utils.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/file_utils.py deleted file mode 100644 index 7b7e516cc61f759267f4ef09309ff0b45110a0c1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/file_utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re - - -def read_lists(list_file): - lists = [] - with open(list_file, 'r', encoding='utf8') as fin: - for line in fin: - lists.append(line.strip()) - return lists - - -def read_non_lang_symbols(non_lang_sym_path): - """read non-linguistic symbol from file. - - The file format is like below: - - {NOISE}\n - {BRK}\n - ... - - - Args: - non_lang_sym_path: non-linguistic symbol file path, None means no any - syms. 
- - """ - if non_lang_sym_path is None: - return None - else: - syms = read_lists(non_lang_sym_path) - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - for sym in syms: - if non_lang_syms_pattern.fullmatch(sym) is None: - class BadSymbolFormat(Exception): - pass - raise BadSymbolFormat( - "Non-linguistic symbols should be " - "formatted in {xxx}//[xxx], consider" - " modify '%s' to meet the requirment. " - "More details can be found in discussions here : " - "https://github.com/wenet-e2e/wenet/pull/819" % (sym)) - return syms - - -def read_symbol_table(symbol_table_file): - symbol_table = {} - with open(symbol_table_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - symbol_table[arr[0]] = int(arr[1]) - return symbol_table diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/init_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/init_model.py deleted file mode 100644 index 4a008183ee25cd88b2fa25d93bdc3f9e3a55d31a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/init_model.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -from wenet.transducer.joint import TransducerJoint -from wenet.transducer.predictor import (ConvPredictor, EmbeddingPredictor, - RNNPredictor) -from wenet.transducer.transducer import Transducer -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.cmvn import GlobalCMVN -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.encoder import ConformerEncoder, TransformerEncoder -from wenet.squeezeformer.encoder import SqueezeformerEncoder -from wenet.efficient_conformer.encoder import EfficientConformerEncoder -from wenet.utils.cmvn import load_cmvn - - -def init_model(configs): - if configs['cmvn_file'] is not None: - mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) - global_cmvn = GlobalCMVN( - torch.from_numpy(mean).float(), - torch.from_numpy(istd).float()) - else: - global_cmvn = None - - input_dim = configs['input_dim'] - vocab_size = configs['output_dim'] - - encoder_type = configs.get('encoder', 'conformer') - decoder_type = configs.get('decoder', 'bitransformer') - - if encoder_type == 'conformer': - encoder = ConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'squeezeformer': - encoder = SqueezeformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'efficientConformer': - encoder = EfficientConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf'], - **configs['encoder_conf']['efficient_conf'] - if 'efficient_conf' in - configs['encoder_conf'] else {}) - else: - encoder = TransformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - if decoder_type == 'transformer': - decoder = TransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - else: - assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 - assert configs['decoder_conf']['r_num_blocks'] > 0 - decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - ctc = CTC(vocab_size, encoder.output_size()) - - # Init joint CTC/Attention or Transducer model - if 'predictor' in configs: - predictor_type = configs.get('predictor', 'rnn') - if predictor_type == 'rnn': - predictor = RNNPredictor(vocab_size, **configs['predictor_conf']) - elif predictor_type == 'embedding': - predictor = EmbeddingPredictor(vocab_size, - **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - elif predictor_type == 'conv': - predictor = ConvPredictor(vocab_size, **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - else: - raise NotImplementedError( - "only rnn, embedding and conv type support now") - configs['joint_conf']['enc_output_size'] = configs['encoder_conf'][ - 'output_size'] - configs['joint_conf']['pred_output_size'] = configs['predictor_conf'][ - 'output_size'] - joint = TransducerJoint(vocab_size, **configs['joint_conf']) - model = Transducer(vocab_size=vocab_size, - blank=0, - predictor=predictor, - encoder=encoder, - attention_decoder=decoder, - joint=joint, - ctc=ctc, - **configs['model_conf']) - else: - model = ASRModel(vocab_size=vocab_size, - encoder=encoder, - decoder=decoder, - ctc=ctc, - **configs['model_conf']) - return model diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/mask.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/mask.py deleted file mode 100644 index 2985006ab2bc2d27a9b8adaeb863cc44ca6a0d24..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/mask.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -''' -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. - - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - ret = torch.ones(size, size, device=device, dtype=torch.bool) - return torch.tril(ret) -''' - -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. 
- - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - arange = torch.arange(size, device=device) - mask = arange.expand(size, size) - arange = arange.unsqueeze(-1) - mask = mask <= arange - return mask - - -def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size) with chunk size, - this is for streaming encoder - - Args: - size (int): size of mask - chunk_size (int): size of chunk - num_left_chunks (int): number of left chunks - <0: use full chunk - >=0: use num_left_chunks - device (torch.device): "cpu" or "cuda" or torch.Tensor.device - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_chunk_mask(4, 2) - [[1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]] - """ - ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) - ret[i, start:ending] = True - return ret - - -def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, - use_dynamic_chunk: bool, - use_dynamic_left_chunk: bool, - decoding_chunk_size: int, static_chunk_size: int, - num_decoding_left_chunks: int): - """ Apply optional mask for encoder. - - Args: - xs (torch.Tensor): padded input, (B, L, D), L for max length - mask (torch.Tensor): mask for xs, (B, 1, L) - use_dynamic_chunk (bool): whether to use dynamic chunk or not - use_dynamic_left_chunk (bool): whether to use dynamic left chunk for - training. - decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - static_chunk_size (int): chunk size for static chunk training/decoding - if it's greater than 0, if use_dynamic_chunk is true, - this parameter will be ignored - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. - >=0: use num_decoding_left_chunks - <0: use all left chunks - - Returns: - torch.Tensor: chunk mask of the input xs. - """ - # Whether to use chunk mask or not - if use_dynamic_chunk: - max_len = xs.size(1) - if decoding_chunk_size < 0: - chunk_size = max_len - num_left_chunks = -1 - elif decoding_chunk_size > 0: - chunk_size = decoding_chunk_size - num_left_chunks = num_decoding_left_chunks - else: - # chunk size is either [1, 25] or full context(max_len). - # Since we use 4 times subsampling and allow up to 1s(100 frames) - # delay, the maximum frame is 100 / 4 = 25. 
- chunk_size = torch.randint(1, max_len, (1, )).item() - num_left_chunks = -1 - if chunk_size > max_len // 2: - chunk_size = max_len - else: - chunk_size = chunk_size % 25 + 1 - if use_dynamic_left_chunk: - max_left_chunks = (max_len - 1) // chunk_size - num_left_chunks = torch.randint(0, max_left_chunks, - (1, )).item() - chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - elif static_chunk_size > 0: - num_left_chunks = num_decoding_left_chunks - chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - else: - chunk_masks = masks - return chunk_masks - - -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: Mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - batch_size = lengths.size(0) - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = torch.arange(0, - max_len, - dtype=torch.int64, - device=lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand - return mask - - -def make_non_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """Make mask tensor containing indices of non-padded part. - - The sequences in a batch may have different lengths. To enable - batch computing, padding is need to make all sequence in same - size. To avoid the padding part pass value to context dependent - block such as attention or convolution , this padding part is - masked. - - This pad_mask is used in both encoder and decoder. - - 1 for non-padded part and 0 for padded part. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] - """ - return ~make_pad_mask(lengths) - - -def mask_finished_scores(score: torch.Tensor, - flag: torch.Tensor) -> torch.Tensor: - """ - If a sequence is finished, we only allow one alive branch. This function - aims to give one branch a zero score and the rest -inf score. - - Args: - score (torch.Tensor): A real value array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size, beam_size). 
- """ - beam_size = score.size(-1) - zero_mask = torch.zeros_like(flag, dtype=torch.bool) - if beam_size > 1: - unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])), - dim=1) - finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])), - dim=1) - else: - unfinished = zero_mask - finished = flag - score.masked_fill_(unfinished, -float('inf')) - score.masked_fill_(finished, 0) - return score - - -def mask_finished_preds(pred: torch.Tensor, flag: torch.Tensor, - eos: int) -> torch.Tensor: - """ - If a sequence is finished, all of its branch should be - - Args: - pred (torch.Tensor): A int array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size). - """ - beam_size = pred.size(-1) - finished = flag.repeat([1, beam_size]) - return pred.masked_fill_(finished, eos) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/scheduler.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/scheduler.py deleted file mode 100644 index c418a731dec0041a238787bbba23102dba8db5e5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/csj/s0/wenet/utils/scheduler.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -# NeMo(https://github.com/NVIDIA/NeMo) - -from typing import Union - -import math -import warnings -import torch -from torch.optim.lr_scheduler import _LRScheduler - -from typeguard import check_argument_types - - -class WarmupLR(_LRScheduler): - """The WarmupLR scheduler - - This scheduler is almost same as NoamLR Scheduler except for following - difference: - - NoamLR: - lr = optimizer.lr * model_size ** -0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - WarmupLR: - lr = optimizer.lr * warmup_step ** 0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - - Note that the maximum lr equals to optimizer.lr in this scheduler. 
- - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_steps: Union[int, float] = 25000, - last_epoch: int = -1, - ): - assert check_argument_types() - self.warmup_steps = warmup_steps - - # __init__() must be invoked before setting field - # because step() is also invoked in __init__() - super().__init__(optimizer, last_epoch) - - def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" - - def get_lr(self): - step_num = self.last_epoch + 1 - if self.warmup_steps == 0: - return [ - lr * step_num ** -0.5 - for lr in self.base_lrs - ] - else: - return [ - lr - * self.warmup_steps ** 0.5 - * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) - for lr in self.base_lrs - ] - - def set_step(self, step: int): - self.last_epoch = step - - -class WarmupPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1): - assert not (warmup_steps is not None and warmup_ratio is not None),\ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class SquareRootConstantPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use particular number of step or ratio" - assert constant_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. 
- self.max_steps = max_steps - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.constant_lr = 1 / (constant_steps ** 0.5) - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.constant_steps: - return [self.constant_lr for _ in self.base_lrs] - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class WarmupHoldPolicy(WarmupPolicy): - """Variant of WarmupPolicy which maintains high - learning rate for a defined number of steps. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - hold_steps=None, - hold_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (hold_steps is not None and hold_ratio is not None), \ - "Either use particular number of step or ratio" - assert hold_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - self.min_lr = min_lr - self._last_warmup_lr = 0.0 - - # Necessary to duplicate as class attributes are hidden in inner class - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if hold_steps is not None: - self.hold_steps = hold_steps + self.warmup_steps - elif hold_ratio is not None: - self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps - else: - self.hold_steps = 0 - - super().__init__( - optimizer, - warmup_steps=warmup_steps, - warmup_ratio=warmup_ratio, - max_steps=max_steps, - last_epoch=last_epoch, - min_lr=min_lr, - ) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed by the scheduler," - " " "please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup phase - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - # Hold phase - if (step >= self.warmup_steps) and (step < self.hold_steps): - return self.base_lrs - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - -class WarmupAnnealHoldPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - min_lr: Minimum lr to hold the learning rate after decay at. - constant_steps: Number of steps to keep lr constant at. 
- constant_ratio: Ratio of steps to keep lr constant. - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - constant_steps=None, - constant_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use constant_steps or constant_ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup steps - if self.warmup_steps > 0 and step <= self.warmup_steps: - return self._get_warmup_lr(step) - - # Constant steps after warmup and decay - if self.constant_steps > 0 and ( - self.warmup_steps + self.decay_steps) < step <= self.max_steps: - return self._get_constant_lr(step) - - # Min lr after max steps of updates - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_constant_lr(self, step): - return [self.min_lr for _ in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -def _squareroot_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 0.5 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _square_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 2 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _cosine_annealing(initial_lr, step, max_steps, min_lr): - mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) - out_lr = (initial_lr - min_lr) * mult + min_lr - return out_lr - - -def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, - decay_steps, min_lr): - assert max_lr > min_lr - # Use linear warmup for the initial part. - if warmup_steps > 0 and step <= warmup_steps: - return max_lr * float(step) / float(warmup_steps) - - # For any steps larger than `decay_steps`, use `min_lr`. - if step > warmup_steps + decay_steps: - return min_lr - - # If we are done with the warmup period, use the decay style. 
- num_steps_ = step - warmup_steps - decay_steps_ = decay_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = max_lr - min_lr - - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - - return min_lr + coeff * delta_lr - - -def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): - if cycle: - multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) - decay_steps *= multiplier - else: - step = min(step, decay_steps) - p = step / decay_steps - lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) - lr += min_lr - return lr - - -def _noam_hold_annealing(initial_lr, step, warmup_steps, - hold_steps, decay_rate, min_lr): - # hold_steps = total number of steps - # to hold the LR, not the warmup + hold steps. - T_warmup_decay = max(1, warmup_steps ** decay_rate) - T_hold_decay = max(1, (step - hold_steps) ** decay_rate) - lr = (initial_lr * T_warmup_decay) / T_hold_decay - lr = max(lr, min_lr) - return lr - - -class SquareAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _square_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class SquareRootAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _squareroot_annealing(initial_lr=initial_lr, step=step, - max_steps=self.max_steps, min_lr=self.min_lr) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - if self.constant_steps is None or self.constant_steps == 0: - new_lrs = [ - _cosine_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - else: - new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) - return new_lrs - - def _get_warmup_lr(self, step): - if self.constant_steps is None or self.constant_steps == 0: - return super()._get_warmup_lr(step) - else: - # Use linear warmup for the initial part. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_constant_lr(self, step): - # Only called when `constant_steps` > 0. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_linear_warmup_with_cosine_annealing_lr(self, step): - # Cosine Schedule for Megatron LM, - # slightly different warmup schedule + constant LR at the end. 
- new_lrs = [ - _linear_warmup_with_cosine_annealing( - max_lr=self.base_lrs[0], - warmup_steps=self.warmup_steps, - step=step, - decay_steps=self.decay_steps, - min_lr=self.min_lr, - ) - for _ in self.base_lrs - ] - return new_lrs - - -class NoamAnnealing(_LRScheduler): - def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - self._normalize = d_model ** (-0.5) - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = max(1, self.last_epoch) - - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - new_lrs = [self._noam_annealing(initial_lr=initial_lr, step=step) for - initial_lr in self.base_lrs] - return new_lrs - - def _noam_annealing(self, initial_lr, step): - if self.warmup_steps > 0: - mult = self._normalize * min(step ** (-0.5), - step * (self.warmup_steps ** (-1.5))) - else: - mult = self._normalize * step ** (-0.5) - - out_lr = initial_lr * mult - if step > self.warmup_steps: - out_lr = max(out_lr, self.min_lr) - return out_lr - - -class NoamHoldAnnealing(WarmupHoldPolicy): - def __init__(self, optimizer, *, max_steps, decay_rate=0.5, min_lr=0.0, - last_epoch=-1, **kwargs): - """ - From Nemo: - Implementation of the Noam Hold Annealing policy - from the SqueezeFormer paper. - - Unlike NoamAnnealing, the peak learning rate - can be explicitly set for this scheduler. - The schedule first performs linear warmup, - then holds the peak LR, then decays with some schedule for - the remainder of the steps. - Therefore the min-lr is still dependent - on the hyper parameters selected. - - It's schedule is determined by three factors- - - Warmup Steps: Initial stage, where linear warmup - occurs uptil the peak LR is reached. Unlike NoamAnnealing, - the peak LR is explicitly stated here instead of a scaling factor. - - Hold Steps: Intermediate stage, where the peak LR - is maintained for some number of steps. In this region, - the high peak LR allows the model to converge faster - if training is stable. However the high LR - may also cause instability during training. - Should usually be a significant fraction of training - steps (around 30-40% of the entire training steps). - - Decay Steps: Final stage, where the LR rapidly decays - with some scaling rate (set by decay rate). - To attain Noam decay, use 0.5, - for Squeezeformer recommended decay, use 1.0. - The fast decay after prolonged high LR during - hold phase allows for rapid convergence. 
- - References: - - [Squeezeformer: - An Efficient Transformer for Automatic Speech Recognition] - (https://arxiv.org/abs/2206.00888) - - Args: - optimizer: Pytorch compatible Optimizer object. - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - decay_rate: Float value describing the polynomial decay - after the hold period. Default value - of 0.5 corresponds to Noam decay. - min_lr: Minimum learning rate. - """ - self.decay_rate = decay_rate - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - if self.warmup_steps is None or self.warmup_steps == 0: - raise ValueError( - "Noam scheduler cannot be used without warmup steps") - - if self.hold_steps > 0: - hold_steps = self.hold_steps - self.warmup_steps - else: - hold_steps = 0 - - new_lrs = [ - _noam_hold_annealing( - initial_lr, - step=step, - warmup_steps=self.warmup_steps, - hold_steps=hold_steps, - decay_rate=self.decay_rate, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - def set_step(self, step: int): - self.last_epoch = step diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/README.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/README.md deleted file mode 100644 index 6d4c175e362abb0ca45afa177f564a44dec5a60f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/README.md +++ /dev/null @@ -1,70 +0,0 @@ -# GigaSpeech -A Large, modern and evolving dataset for automatic speech recognition. More details about GigaSpeech can be found: https://github.com/SpeechColab/GigaSpeech - -# Performance Record - -## Conformer bidecoder Result - -* Feature info: using fbank feature, dither 1.0, cmvn, 16k -* Training info: conf/train_conformer_bidecoder.yaml, subsample 4, kernel size 31, lr 0.001, batch size 24, 8 gpu, acc_grad 4, 40 epochs -* Decoding info: ctc_weight 0.3, reverse_weight 0.5, average_num 10 -* Git hash: 9a0c270f9f976d7e887f777690e6c358a45a1c27 - -### test set gigaspeech scoring - -| SPKR | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err | -|-----------|-------|--------|------|-----|-----|-----|------|-------| -| Sum/Avg | 19928 | 390656 | 91.4 | 6.4 | 2.2 | 2.0 | 10.6 | 63.1 | -| Mean | 152.1 | 2982.1 | 91.4 | 6.3 | 2.3 | 1.7 | 10.3 | 63.7 | -| S.D. | 142.2 | 2838.1 | 5.5 | 4.1 | 1.6 | 1.3 | 6.4 | 16.9 | -| Median | 108.0 | 2000.0 | 93.0 | 5.1 | 2.0 | 1.3 | 8.4 | 64.6 | - -### dev set gigaspeech scoring - -| SPKR | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err | -|-----------|-------|--------|------|-----|-----|-----|------|-------| -| Sum/Avg | 5715 | 127790 | 92.1 | 5.8 | 2.1 | 2.8 | 10.7 | 69.9 | -| Mean | 204.1 | 4563.9 | 92.9 | 5.2 | 1.9 | 2.0 | 9.1 | 69.4 | -| S.D. 
| 269.7 | 4551.6 | 3.4 | 2.7 | 0.9 | 1.7 | 4.6 | 15.9 | -| Median | 151.5 | 3314.0 | 93.8 | 4.4 | 1.6 | 1.7 | 7.9 | 71.6 | - -## Conformer U2++ Result - -* Feature info: using fbank feature, dither 1.0, cmvn, 16k -* Training info: conf/train_u2++_conformer.yaml, subsample 6, kernel size 31, lr 0.001, batch size 28, 8 gpu, acc_grad 1, 50 epochs -* Decoding info: ctc_weight 0.3, reverse_weight 0.5, average_num 10 -* Git hash: 9a0c270f9f976d7e887f777690e6c358a45a1c27 - -### test set gigaspeech scoring, full chunk (non-streaming) - -| SPKR | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err | -|-----------|-------|--------|------|-----|-----|-----|------|-------| -| Sum/Avg | 19928 | 390656 | 90.7 | 6.8 | 2.6 | 2.0 | 11.3 | 66.9 | -| Mean | 152.1 | 2982.1 | 90.6 | 6.8 | 2.7 | 1.6 | 11.1 | 67.1 | -| S.D. | 142.2 | 2838.1 | 5.8 | 4.3 | 1.9 | 1.2 | 6.7 | 16.5 | -| Median | 108.0 | 2000.0 | 92.1 | 5.7 | 2.2 | 1.3 | 9.0 | 68.9 | - -### test set gigaspeech scoring, chunk 8 (latency range from 0 to 480ms) - -| SPKR | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err | -|-----------|-------|--------|------|-----|-----|-----|------|-------| -| Sum/Avg | 19928 | 390656 | 89.6 | 7.5 | 2.9 | 2.0 | 12.5 | 70.1 | -| Mean | 152.1 | 2982.1 | 89.3 | 7.6 | 3.1 | 1.7 | 12.4 | 70.6 | -| S.D. | 142.2 | 2838.1 | 6.5 | 4.9 | 2.1 | 1.2 | 7.3 | 15.8 | -| Median | 108.0 | 2000.0 | 91.1 | 6.3 | 2.5 | 1.4 | 10.2 | 72.2 | - -## Conformer Result - -* Feature info: using fbank feature, dither 1.0, no cmvn, 48k -* Training info: conf/train_conformer.yaml, kernel size 31, lr 0.001, batch size 24, 8 gpu, acc_grad 4, 30 epochs -* Decoding info: ctc_weight 0.5, average_num 5 -* Git hash: 9a0c270f9f976d7e887f777690e6c358a45a1c27 - -### test set gigaspeech scoring - -| SPKR | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err | -|---------------|-------|--------|------|-----|-----|-----|------|-------| -| Sum/Avg | 19930 | 390744 | 90.8 | 6.9 | 2.3 | 2.0 | 11.2 | 65.1 | -| Mean | 152.1 | 2982.8 | 90.6 | 6.9 | 2.5 | 1.7 | 11.1 | 65.7 | -| S.D. 
| 142.3 | 2839.0 | 5.8 | 4.3 | 1.7 | 1.2 | 6.7 | 16.6 | -| Median | 108.0 | 2000.0 | 92.5 | 5.6 | 2.1 | 1.3 | 9.1 | 65.9 | diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/conf/train_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/conf/train_conformer.yaml deleted file mode 100644 index ca3eaa5cc27c6cd8bab06162bcdaf44f189ac2b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/conf/train_conformer.yaml +++ /dev/null @@ -1,78 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 512 # dimension of attention - attention_heads: 8 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - input_layer: conv2d6 # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 31 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 8 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -dataset_conf: - filter_conf: - max_length: 2000 - min_length: 100 - token_max_length: 160 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: false - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 1.0 - spec_aug: true - spec_aug_conf: - num_t_mask: 3 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 28 - -grad_clip: 5 -accum_grad: 4 -max_epoch: 30 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 100000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/conf/train_conformer_bidecoder.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/conf/train_conformer_bidecoder.yaml deleted file mode 100644 index 56dc40f70a086e51466e5a3f314197efe7377a2f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/conf/train_conformer_bidecoder.yaml +++ /dev/null @@ -1,80 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 512 # dimension of attention - attention_heads: 8 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 31 - use_cnn_module: True - cnn_module_norm: 'layer_norm' - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - -# 
decoder related -decoder: bitransformer -decoder_conf: - attention_heads: 8 - linear_units: 2048 - num_blocks: 3 - r_num_blocks: 3 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - reverse_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -dataset_conf: - filter_conf: - max_length: 2000 - min_length: 100 - token_max_length: 160 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: false - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 1.0 - spec_aug: true - spec_aug_conf: - num_t_mask: 3 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 20 - -grad_clip: 5 -accum_grad: 4 -max_epoch: 50 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 100000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/conf/train_u2++_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/conf/train_u2++_conformer.yaml deleted file mode 100644 index 4effa4a61687f086707e8ca6466335a3e2879788..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/conf/train_u2++_conformer.yaml +++ /dev/null @@ -1,83 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 512 # dimension of attention - attention_heads: 8 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d6 # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 8 - use_cnn_module: True - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - -# decoder related -decoder: bitransformer -decoder_conf: - attention_heads: 8 - linear_units: 2048 - num_blocks: 3 - r_num_blocks: 3 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - reverse_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -dataset_conf: - filter_conf: - max_length: 2000 - min_length: 100 - token_max_length: 160 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: false - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 1.0 - spec_aug: true - spec_aug_conf: - num_t_mask: 3 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 28 - - -grad_clip: 5 -accum_grad: 1 -max_epoch: 50 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 80000 diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/local/extract_meta.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/local/extract_meta.py deleted file mode 100644 index 27803537958703a09dbfcb4bca3088d768eb3216..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/local/extract_meta.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python -# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang) -# Mobvoi Corporation (Author: Di Wu) - -import sys -import os -import argparse -import json - - -def get_args(): - parser = argparse.ArgumentParser(description=""" - This script is used to process raw json dataset of GigaSpeech, - where the long wav is splitinto segments and - data of wenet format is generated. - """) - parser.add_argument('input_json', help="""Input json file of Gigaspeech""") - parser.add_argument('output_dir', help="""Output dir for prepared data""") - - args = parser.parse_args() - return args - - -def meta_analysis(input_json, output_dir): - input_dir = os.path.dirname(input_json) - - if not os.path.exists(output_dir): - os.makedirs(output_dir) - try: - with open(input_json, 'r') as injson: - json_data = json.load(injson) - except Exception: - sys.exit(f'Failed to load input json file: {input_json}') - else: - if json_data['audios'] is not None: - with open(f'{output_dir}/text', 'w') as utt2text, \ - open(f'{output_dir}/segments', 'w') as segments, \ - open(f'{output_dir}/utt2dur', 'w') as utt2dur, \ - open(f'{output_dir}/wav.scp', 'w') as wavscp, \ - open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \ - open(f'{output_dir}/reco2dur', 'w') as reco2dur: - for long_audio in json_data['audios']: - try: - long_audio_path = os.path.realpath( - os.path.join(input_dir, long_audio['path'])) - aid = long_audio['aid'] - segments_lists = long_audio['segments'] - duration = long_audio['duration'] - assert (os.path.exists(long_audio_path)) - assert ('opus' == long_audio['format']) - assert (16000 == long_audio['sample_rate']) - except AssertionError: - print(f'Warning: {aid} something is wrong, maybe' - 'AssertionError, skipped') - continue - except Warning: - print(f'Warning: {aid} something is wrong, maybe the' - 'error path: {long_audio_path}, skipped') - continue - else: - wavscp.write(f'{aid}\t{long_audio_path}\n') - reco2dur.write(f'{aid}\t{duration}\n') - for segment_file in segments_lists: - try: - sid = segment_file['sid'] - start_time = segment_file['begin_time'] - end_time = segment_file['end_time'] - dur = end_time - start_time - text = segment_file['text_tn'] - segment_subsets = segment_file["subsets"] - except Warning: - print(f'Warning: {segment_file} something is' - 'wrong, skipped') - continue - else: - utt2text.write(f'{sid}\t{text}\n') - segments.write( - f'{sid}\t{aid}\t{start_time}\t{end_time}\n' - ) - utt2dur.write(f'{sid}\t{dur}\n') - segment_sub_names = " ".join(segment_subsets) - utt2subsets.write( - f'{sid}\t{segment_sub_names}\n') - - -def main(): - args = get_args() - - meta_analysis(args.input_json, args.output_dir) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/local/gigaspeech_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/local/gigaspeech_data_prep.sh deleted file mode 100644 index b639457d7d485874912911945c2dd8e5e9bdc5de..0000000000000000000000000000000000000000 --- 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/local/gigaspeech_data_prep.sh +++ /dev/null @@ -1,142 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang) -# Seasalt AI, Inc (Author: Guoguo Chen) -# Mobvoi Corporation (Author: Di Wu) - -set -e -set -o pipefail - -stage=1 -prefix= -garbage_utterance_tags=" " -punctuation_tags=" " -train_subset=XL - -. ./tools/parse_options.sh || exit 1; - -filter_by_id () { - idlist=$1 - input=$2 - output=$3 - field=1 - if [ $# -eq 4 ]; then - field=$4 - fi - cat $input | perl -se ' - open(F, "<$idlist") || die "Could not open id-list file $idlist"; - while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; - } - while(<>) { - @A = split; - @A > 0 || die "Invalid file line $_"; - @A >= $field || die "Invalid file line $_"; - if ($seen{$A[$field-1]}) { - print $_; - } - }' -- -idlist="$idlist" -field="$field" > $output ||\ - (echo "$0: filter_by_id() error: $input" && exit 1) || exit 1; -} - -subset_data_dir () { - utt_list=$1 - src_dir=$2 - dest_dir=$3 - mkdir -p $dest_dir || exit 1; - # wav.scp text segments utt2dur - filter_by_id $utt_list $src_dir/utt2dur $dest_dir/utt2dur ||\ - (echo "$0: subset_data_dir() error: $src_dir/utt2dur" && exit 1) || exit 1; - filter_by_id $utt_list $src_dir/text $dest_dir/text ||\ - (echo "$0: subset_data_dir() error: $src_dir/text" && exit 1) || exit 1; - filter_by_id $utt_list $src_dir/segments $dest_dir/segments ||\ - (echo "$0: subset_data_dir() error: $src_dir/segments" && exit 1) || exit 1; - awk '{print $2}' $dest_dir/segments | sort | uniq > $dest_dir/reco - filter_by_id $dest_dir/reco $src_dir/wav.scp $dest_dir/wav.scp ||\ - (echo "$0: subset_data_dir() error: $src_dir/wav.scp" && exit 1) || exit 1; - rm -f $dest_dir/reco -} - -if [ $# -ne 2 ]; then - echo "Usage: $0 [options] " - echo " e.g.: $0 --train-subset XL /disk1/audio_data/gigaspeech/ data/" - echo "" - echo "This script takes the GigaSpeech source directory, and prepares the" - echo "WeNet format data directory." - echo " --garbage-utterance-tags # Tags for non-speech." - echo " --prefix # Prefix for output data directory." - echo " --punctuation-tags # Tags for punctuations." - echo " --stage # Processing stage." - echo " --train-subset # Train subset to be created." - exit 1 -fi - -gigaspeech_dir=$1 -data_dir=$2 - -declare -A subsets -subsets=( - [XL]="train_xl" - [L]="train_l" - [M]="train_m" - [S]="train_s" - [XS]="train_xs" - [DEV]="dev" - [TEST]="test") -prefix=${prefix:+${prefix}_} - -corpus_dir=$data_dir/${prefix}corpus/ -if [ $stage -le 1 ]; then - echo "$0: Extract meta into $corpus_dir" - # Sanity check. - [ ! -f $gigaspeech_dir/GigaSpeech.json ] &&\ - echo "$0: Please download $gigaspeech_dir/GigaSpeech.json!" && exit 1; - [ ! -d $gigaspeech_dir/audio ] &&\ - echo "$0: Please download $gigaspeech_dir/audio!" && exit 1; - - [ ! 
-d $corpus_dir ] && mkdir -p $corpus_dir - - # Files to be created: - # wav.scp text segments utt2dur - python3 local/extract_meta.py \ - $gigaspeech_dir/GigaSpeech.json $corpus_dir || exit 1; -fi - -if [ $stage -le 2 ]; then - echo "$0: Filter $corpus_dir/text" - # Delete utterances with garbage meta tags - for tag in $garbage_utterance_tags; do - sed -i "/${tag}/d" $corpus_dir/text - done - - # Delete punctuations in utterances - for tag in $punctuation_tags; do - sed -i "s/${tag}//g" $corpus_dir/text - done - - # Ensure space only appears once and utt is seprated with others by '\t' - sed -i 's/\t/ /g' $corpus_dir/text - sed -i 's/[ ][ ]*/ /g' $corpus_dir/text - sed -i 's/ /\t/' $corpus_dir/text -fi - -if [ $stage -le 3 ]; then - echo "$0: Split data to train, dev and test" - # Split data to train, dev and test. - [ ! -f $corpus_dir/utt2subsets ] &&\ - echo "$0: No such file $corpus_dir/utt2subsets!" && exit 1; - for label in $train_subset DEV TEST; do - if [ ! ${subsets[$label]+set} ]; then - echo "$0: Subset $label is not defined in GigaSpeech.json." && exit 1; - fi - subset=${subsets[$label]} - [ ! -d $data_dir/${prefix}$subset ] && mkdir -p $data_dir/${prefix}$subset - grep "{$label}" $corpus_dir/utt2subsets \ - > $corpus_dir/${prefix}${subset}_utt_list|| exit 1; - subset_data_dir $corpus_dir/${prefix}${subset}_utt_list \ - $corpus_dir $data_dir/${prefix}$subset || exit 1; - done -fi - -echo "$0: Done" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/local/gigaspeech_scoring.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/local/gigaspeech_scoring.py deleted file mode 100644 index e7679f4ab450bf26e472e613ebcaa14b39544187..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/local/gigaspeech_scoring.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -import os -import argparse - -conversational_filler = [ - 'UH', 'UHH', 'UM', 'EH', 'MM', 'HM', 'AH', 'HUH', 'HA', 'ER', 'OOF', 'HEE', - 'ACH', 'EEE', 'EW' -] -unk_tags = ['', ''] -gigaspeech_punctuations = [ - '', '', '', '' -] -gigaspeech_garbage_utterance_tags = ['', '', '', ''] -non_scoring_words = conversational_filler + unk_tags + \ - gigaspeech_punctuations + gigaspeech_garbage_utterance_tags - -def asr_text_post_processing(text): - # 1. convert to uppercase - text = text.upper() - - # 2. remove hyphen - # "E-COMMERCE" -> "E COMMERCE", "STATE-OF-THE-ART" -> "STATE OF THE ART" - text = text.replace('-', ' ') - - # 3. 
remove non-scoring words from evaluation - remaining_words = [] - for word in text.split(): - if word in non_scoring_words: - continue - remaining_words.append(word) - - return ' '.join(remaining_words) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='''This script evaluates GigaSpeech ASR - result via SCTK's tool sclite''') - parser.add_argument( - 'ref', - type=str, - help="sclite's standard transcription(trn) reference file") - parser.add_argument( - 'hyp', - type=str, - help="sclite's standard transcription(trn) hypothesis file") - parser.add_argument('work_dir', type=str, help='working dir') - args = parser.parse_args() - - if not os.path.isdir(args.work_dir): - os.mkdir(args.work_dir) - - REF = os.path.join(args.work_dir, 'REF') - HYP = os.path.join(args.work_dir, 'HYP') - RESULT = os.path.join(args.work_dir, 'RESULT') - - for io in [(args.ref, REF), (args.hyp, HYP)]: - with open(io[0], - 'r', encoding='utf8') as fi, open(io[1], - 'w+', - encoding='utf8') as fo: - for line in fi: - line = line.strip() - if line: - cols = line.split() - text = asr_text_post_processing(' '.join(cols[0:-1])) - uttid_field = cols[-1] - print(F'{text} {uttid_field}', file=fo) - - os.system(F'sclite -r {REF} trn -h {HYP} trn -i swb | tee {RESULT}' - ) # GigaSpeech's uttid comforms to swb diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/path.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/path.sh deleted file mode 100644 index 73fc1c56602086182f66201870e28d46a0cada55..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export WENET_DIR=$PWD/../../.. -export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build -export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix -export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/run.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/run.sh deleted file mode 100644 index dc891bf20350b310580e792009c340db82205abc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/run.sh +++ /dev/null @@ -1,264 +0,0 @@ -#!/bin/bash - -# Copyright 2021 Mobvoi Inc. All Rights Reserved. - -. ./path.sh || exit 1; - -# Use this to control how many gpu you use, It's 1-gpu training if you specify -# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -stage=0 # start from 0 if you need to start from data preparation -stop_stage=5 - -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training -num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. 
Default 0 -node_rank=0 - -# data -# use your own data path, you can contact gigaspeech@speechcolab.orgfor getting data for data information about gigaspeech -# the preparation of gigaspeech dataset for wenet can be found https://github.com/SpeechColab/GigaSpeech -giga_data_dir=/export/expts6/corpus/data/en-asr-data/16k/GigaSpeech -shards_dir=/ssd/nfs06/unified_data/giga_shards -# gigaspeech training set -set=XL -train_set=train_`echo $set |tr 'A-Z' 'a-z'` -train_dev=dev -recog_set=test -# wav data dir -data=data -nj=16 -# Optional train_config -# 1. conf/train_transformer.yaml: Standard Conformer -# 2. conf/train_transformer_bidecoder.yaml: Bidecoder Conformer -train_config=conf/train_conformer_bidecoder.yaml -checkpoint= -cmvn=false -do_delta=false -dir=exp/sp_spec_aug - -# use average_checkpoint will get better result -average_checkpoint=true -decode_checkpoint=$dir/final.pt -# maybe you can try to adjust it if you can not get close results as README.md -average_num=3 -decode_modes="attention_rescoring ctc_greedy_search" - -. tools/parse_options.sh || exit 1; - -# bpemode (unigram or bpe) -nbpe=5000 -bpemode=unigram - -set -e -set -u -set -o pipefail - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - ### Task dependent. You have to make data the following preparation part by yourself. - ### But you can utilize Kaldi recipes in most cases - echo "stage 0: Data preparation" - local/gigaspeech_data_prep.sh --train-subset $set --stage 1 $giga_data_dir $data - sed -i "s/\t/ /g" $data/${train_set}/text - sed -i "s/\t/ /g" $data/${train_dev}/text - sed -i "s/\t/ /g" $data/${recog_set}/text - for x in $train_dev $train_set $recog_set; do - paste -d " " <(cut -f1 -d " " $data/$x/text) <(cut -f1 -d " " $data/$x/text) > $data/$x/spk2utt - cp $data/$x/spk2utt $data/$x/utt2spk - tools/fix_data_dir.sh $data/$x - done -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - ### Task dependent. You have to design training and dev sets by yourself. - echo "stage 1: generate segmented wav.scp and compute cmvn" - # the format of wav.segment.scp is: - # POD1000000004_S0000000 /GigaSpeech/audio/podcast/P0000/POD1000000004.opus,0.0,10.197 - # 0.0 is start time, 10.197 is end time (second) - for x in $train_dev $train_set $recog_set; do - python tools/segment.py --segments $data/$x/segments \ - --input $data/$x/wav.scp \ - --output $data/$x/wav.segment.scp - done - - # optional - # compute cmvn, perhaps you can sample some segmented examples fron wav.scp for cmvn computation - python tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \ - --in_scp $data/$train_set/wav.segment.scp \ - --out_cmvn $data/$train_set/global_cmvn -fi - - -dict=$data/lang_char_$set/${train_set}_${bpemode}${nbpe}_units.txt -bpemodel=$data/lang_char_$set/${train_set}_${bpemode}${nbpe} -echo "dictionary: ${dict}" -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - ### Task dependent. You have to check non-linguistic symbols used in the corpus. - echo "stage 2: Dictionary and Json Data Preparation" - mkdir -p $data/lang_char_$set/ - echo " 0" > ${dict} # 0 will be used for "blank" in CTC - echo " 1" >> ${dict} # must be 1 - - # we borrowed these code and scripts which are related bpe from ESPnet. 
- cut -f 2- -d" " $data/${train_set}/text > $data/lang_char_$set/input.txt - tools/spm_train --input=$data/lang_char_$set/input.txt --vocab_size=${nbpe} \ - --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 - tools/spm_encode --model=${bpemodel}.model --output_format=piece \ - < $data/lang_char_$set/input.txt | \ - tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict} - num_token=$(cat $dict | wc -l) - echo " $num_token" >> $dict # - wc -l ${dict} -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - echo "Making shards, please wait..." - RED='\033[0;31m' - NOCOLOR='\033[0m' - echo -e "It requires ${RED}1.2T ${NOCOLOR}space for $shards_dir, please make sure you have enough space" - echo -e "It takes about ${RED}12 ${NOCOLOR}hours with 32 threads" - - for x in $train_dev $train_set $recog_set; do - dst=$shards_dir/$x - mkdir -p $dst - tools/make_shard_list.py --resample 16000 --num_utts_per_shard 1000 \ - --num_threads 32 --segments data/$x/segments \ - data/$x/wav.scp data/$x/text \ - $(realpath $dst) data/$x/data.list - done -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - # Training - mkdir -p $dir - INIT_FILE=$dir/ddp_init - rm -f $INIT_FILE # delete old one before starting - init_method=file://$(readlink -f $INIT_FILE) - echo "$0: init method is $init_method" - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - # Use "nccl" if it works, otherwise use "gloo" - dist_backend="nccl" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" - cmvn_opts= - $cmvn && cp ${feat_dir}/${train_set}/global_cmvn $dir - $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" - # train.py will write $train_config to $dir/train.yaml with model input - # and output dimension, train.yaml will be used for inference or model - # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type "shard" \ - --symbol_table $dict \ - --bpe_model $bpemodel.model \ - --train_data $data/$train_set/data.list \ - --cv_data $data/$train_dev/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ - --ddp.dist_backend $dist_backend \ - --num_workers 16 \ - $cmvn_opts - } & - done - wait -fi - -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # Test model, please specify the model you want to test by --checkpoint - cmvn_opts= - $cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn" - # TODO, Add model average here - mkdir -p $dir/test - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$dir/avg_${average_num}.pt - echo "do model average and final checkpoint is $decode_checkpoint" - python wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $dir \ - --num ${average_num} \ - --val_best - fi - # Specify decoding_chunk_size if it's a unified dynamic chunk trained model - # -1 for full chunk - decoding_chunk_size= - ctc_weight=0.5 - # Polling GPU id begin with index 0 - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - idx=0 - for test in $recog_set; do - for mode in ${decode_modes}; do - { - { - test_dir=$dir/${test}_${mode} - mkdir -p $test_dir - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1]) - python wenet/bin/recognize.py --gpu $gpu_id \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type "shard" \ - --symbol_table $dict \ - --bpe_model $bpemodel.model \ - --test_data $data/$test/format.data \ - --checkpoint $decode_checkpoint \ - --beam_size 20 \ - --batch_size 1 \ - --penalty 0.0 \ - --dict $dict \ - --result_file $test_dir/text_bpe \ - --ctc_weight $ctc_weight \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - - cut -f2- -d " " $test_dir/text_bpe > $test_dir/text_bpe_value_tmp - cut -f1 -d " " $test_dir/text_bpe > $test_dir/text_bpe_key_tmp - - tools/spm_decode --model=${bpemodel}.model --input_format=piece \ - < $test_dir/text_bpe_value_tmp | sed -e "s/▁/ /g" > $test_dir/text_value - paste -d " " $test_dir/text_bpe_key_tmp $test_dir/text_value > $test_dir/text - # a raw version wer without refining processs - python tools/compute-wer.py --char=1 --v=1 \ - $data/$test/text $test_dir/text > $test_dir/wer - - # for gigaspeech scoring - cat $test_dir/text_bpe_key_tmp | sed -e "s/^/(/g" | sed -e "s/$/)/g" > $test_dir/hyp_key - paste -d " " $test_dir/text_value $test_dir/hyp_key > $test_dir/hyp - paste -d " " <(cut -f2- -d " " $data/$test/text) \ - <(cut -f1 -d " " $data/$test/text | \ - sed -e "s/^/(/g" | sed -e "s/$/)/g") > $data/$test/ref - local/gigaspeech_scoring.py $data/$test/ref $test_dir/hyp $test_dir - } & - - ((idx+=1)) - if [ $idx -eq $num_gpus ]; then - idx=0 - fi - } - done - done - wait -fi - -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # Export the best model you want - python wenet/bin/export_jit.py \ - --config $dir/train.yaml \ - --checkpoint $dir/avg_${average_num}.pt \ - --output_file $dir/final.zip -fi - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/alignment.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/alignment.sh deleted file mode 100644 index 64d860bb61761cadca750c9baf91eddb49e56728..0000000000000000000000000000000000000000 
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/alignment.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -stage=0 # start from 0 if you need to start from data preparation -stop_stage=0 - -nj=16 -feat_dir=raw_wav -dict=data/dict/lang_char.txt - -dir=exp/ -config=$dir/train.yaml -checkpoint= -checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt -config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml -set= -ali_format=$feat_dir/$set/format.data -ali_format=format.data -ali_result=$dir/ali - -. tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - nj=32 - # Prepare required data for ctc alignment - echo "Prepare data, prepare required format" - for x in $set; do - tools/format_data.sh --nj ${nj} \ - --feat-type wav --feat $feat_dir/$x/wav.scp \ - $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp - - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Test model, please specify the model you want to use by --checkpoint - python wenet/bin/alignment_deprecated.py --gpu -1 \ - --config $config \ - --input_file $ali_format \ - --checkpoint $checkpoint \ - --batch_size 1 \ - --dict $dict \ - --result_file $ali_result \ - -fi - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/analyze_dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/analyze_dataset.py deleted file mode 100644 index d4373b065c301972fe0164b6df3591166000acfc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/analyze_dataset.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Analyze Dataset, Duration/TextLength/Speed etc. - -Usage: -. 
./path.sh && python3 tools/analyze_dataset.py \ - --data_type "shard" \ - --data_list data/test/data.list \ - --output_dir exp/analyze_test \ - --num_thread 32 -""" - -import os -import json -import math -import time -import numpy -import logging -import librosa -import tarfile -import argparse -import torchaudio -import multiprocessing - -from wenet.utils.file_utils import read_lists -from wenet.dataset.processor import AUDIO_FORMAT_SETS - - -def get_args(): - parser = argparse.ArgumentParser(description='Analyze dataset') - parser.add_argument('--data_type', - default='wav_scp', - choices=['wav_scp', 'raw', 'shard'], - help='dataset type') - parser.add_argument('--output_dir', type=str, - default="exp", help='write info to output dir') - parser.add_argument('--data_list', default=None, - help='used in raw/shard mode') - parser.add_argument('--wav_scp', default=None, - help='used in wav_scp mode') - parser.add_argument('--text', default=None, - help='used in wav_scp mode') - parser.add_argument('--num_thread', type=int, - default=4, help='number of threads') - args = parser.parse_args() - print(args) - return args - - -def analyze(datas, output_file, thread_id): - with open(output_file, "w", encoding='utf8') as f: - for i, data in enumerate(datas): - if type(data['wav']) is numpy.ndarray: - y, sample_rate = data['wav'], data['sample_rate'] - data['wav'] = "None" # NOTE(xcsong): Do not save wav. - elif type(data['wav'] is str): - y, sample_rate = librosa.load(data['wav'], sr=16000) - data['dur'] = len(y) / sample_rate - data['txt_length'] = len(data['txt']) - data['speed'] = data['txt_length'] / data['dur'] - # Trim the beginning and ending silence - _, index = librosa.effects.trim(y, top_db=30) - data['leading_sil'] = librosa.get_duration( - y=y[:index[0]], sr=16000) * 1000 if index[0] > 0 else 0 - data['trailing_sil'] = librosa.get_duration( - y=y[index[1]:], sr=16000) * 1000 if index[1] < len(y) else 0 - data_str = json.dumps(data, ensure_ascii=False) - f.write("{}\n".format(data_str)) - if thread_id == 0 and i % 100 == 0: - logging.info("\tThread-{}: processed {}/{}".format( - thread_id, i, len(datas))) - - -def read_tar(file): - try: - with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream: - prev_prefix = None - data = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - data['key'] = prev_prefix - if valid: - yield data - data = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - data['txt'] = file_obj.read().decode( - 'utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load( - file_obj) - # single channel - data['wav'] = waveform.numpy()[0, :] - data['sample_rate'] = sample_rate - else: - data[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning( - 'error: {} when parse {}'.format(ex, name)) - prev_prefix = prefix - # The last data in tar - if prev_prefix is not None: - data['key'] = prev_prefix - yield data - except Exception as ex: - logging.warning( - 'tar_file error: {} when processing {}'.format(ex, file)) - - -def main(): - start_time = time.time() - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.makedirs(args.output_dir, exist_ok=True) - os.makedirs(args.output_dir + "/partition", exist_ok=True) - datas = [[] for i in 
range(args.num_thread)] - - logging.info("Stage-1: Loading data.list OR wav.scp...") - if args.data_type == "shard": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - total = 0 - for line in lists: - for data in read_tar(line): - datas[total % args.num_thread].append(data) - total = total + 1 - elif args.data_type == "raw": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - for i, line in enumerate(lists): - data = json.loads(line) - datas[i % args.num_thread].append(data) - elif args.data_type == "wav_scp": - assert args.wav_scp is not None - assert args.text is not None - wavs, texts = {}, {} - # wavs - for line in read_lists(args.wav_scp): - line = line.strip().split() - wavs[line[0]] = line[1] - # texts - for line in read_lists(args.text): - line = line.strip().split(maxsplit=1) - texts[line[0]] = line[1] - sorted(wavs) - sorted(texts) - # partition - for i, (key1, key2) in enumerate(zip(wavs, texts)): - assert key1 == key2 - datas[i % args.num_thread].append( - {'key': key1, "wav": wavs[key1], "txt": texts[key1]} - ) - - logging.info("Stage-2: Start Analyze") - # threads - pool = multiprocessing.Pool(processes=args.num_thread) - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - pool.apply_async(analyze, (datas[i], output_file, i)) - pool.close() - pool.join() - - logging.info("Stage-3: Sort and Write Result") - datas = [] - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - with open(output_file, "r", encoding='utf8') as f: - for line in f.readlines(): - data = json.loads(line) - datas.append(data) - total_dur = sum([x['dur'] for x in datas]) - total_len = sum([x['txt_length'] for x in datas]) - total_leading_sil = sum([x['leading_sil'] for x in datas]) - total_trailing_sil = sum([x['trailing_sil'] for x in datas]) - num_datas = len(datas) - names = ['key', 'dur', 'txt_length', 'speed', - 'leading_sil', 'trailing_sil'] - units = ['', 's', '', 'char/s', 'ms', 'ms'] - avgs = [0, total_dur / num_datas, total_len / num_datas, - total_len / total_dur, total_leading_sil / num_datas, - total_trailing_sil / num_datas] - stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]), - sum([(x['txt_length'] - avgs[2])**2 for x in datas]), - sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]), - sum([(x['leading_sil'] - avgs[4])**2 for x in datas]), - sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])] - stds = [math.sqrt(x / num_datas) for x in stds] - parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min'] - index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - - with open(args.output_dir + "/analyze_result_brief", - "w", encoding='utf8') as f: - for i, (name, unit, avg, std) in enumerate( - zip(names, units, avgs, stds)): - if name == 'key': - continue - f.write("==================\n") - - datas.sort(key=lambda x: x[name]) - for p, j in zip(parts, index): - f.write("{} {}: {:.3f} {} (wav_id: {})\n".format( - p, name, datas[j][name], unit, datas[j]['key'])) - f.write("avg {}: {:.3f} {}\n".format( - name, avg, unit)) - f.write("std {}: {:.3f}\n".format( - name, std)) - os.system("cat {}".format(args.output_dir + "/analyze_result_brief")) - - datas.sort(key=lambda x: x['dur']) - with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f: - for data in datas: - f.write("{}\n".format(json.dumps(data, 
ensure_ascii=False))) - - end_time = time.time() - logging.info("Time Cost: {:.3f}s".format(end_time - start_time)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/cmvn_kaldi2json.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/cmvn_kaldi2json.py deleted file mode 100644 index 9966046c95a9d50438c4857b785cb7985182e376..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/cmvn_kaldi2json.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import logging -import sys -import json - -def kaldi2json(kaldi_cmvn_file): - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - cmvn_info = {'mean_stat:' : means, - 'var_stat' : variance, - 'frame_num' : count} - return cmvn_info - -if __name__ == '__main__': - with open(sys.argv[2], 'w') as fout: - cmvn = kaldi2json(sys.argv[1]) - fout.write(json.dumps(cmvn)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/combine_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/combine_data.sh deleted file mode 100644 index 8a56c43f1a2a238d78270f94f3d22f1af540e912..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/combine_data.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2014 David Snyder - -# This script combines the data from multiple source directories into -# a single destination directory. - -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information -# about what these directories contain. - -# Begin configuration section. -extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." -skip_fix=false # skip the fix_data_dir.sh in the end -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi - -if [ $# -lt 2 ]; then - echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." - echo "Note, files that don't appear in all source dirs will not be combined," - echo "with the exception of utt2uniq and segments, which are created where necessary." - exit 1 -fi - -dest=$1; -shift; - -first_src=$1; - -rm -r $dest 2>/dev/null -mkdir -p $dest; - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/utt2spk ]; then - echo "$0: no such file $dir/utt2spk" - exit 1; - fi -done - -# Check that frame_shift are compatible, where present together with features. -dir_with_frame_shift= -for dir in $*; do - if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then - if [[ $dir_with_frame_shift ]] && - ! 
cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then - echo "$0:error: different frame_shift in directories $dir and " \ - "$dir_with_frame_shift. Cannot combine features." - exit 1; - fi - dir_with_frame_shift=$dir - fi -done - -# W.r.t. utt2uniq file the script has different behavior compared to other files -# it is not compulsary for it to exist in src directories, but if it exists in -# even one it should exist in all. We will create the files where necessary -has_utt2uniq=false -for in_dir in $*; do - if [ -f $in_dir/utt2uniq ]; then - has_utt2uniq=true - break - fi -done - -if $has_utt2uniq; then - # we are going to create an utt2uniq file in the destdir - for in_dir in $*; do - if [ ! -f $in_dir/utt2uniq ]; then - # we assume that utt2uniq is a one to one mapping - cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' - else - cat $in_dir/utt2uniq - fi - done | sort -k1 > $dest/utt2uniq - echo "$0: combined utt2uniq" -else - echo "$0 [info]: not combining utt2uniq as it does not exist" -fi -# some of the old scripts might provide utt2uniq as an extrafile, so just remove it -extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") - -# segments are treated similarly to utt2uniq. If it exists in some, but not all -# src directories, then we generate segments where necessary. -has_segments=false -for in_dir in $*; do - if [ -f $in_dir/segments ]; then - has_segments=true - break - fi -done - -if $has_segments; then - for in_dir in $*; do - if [ ! -f $in_dir/segments ]; then - echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 - utils/data/get_segments_for_data.sh $in_dir - else - cat $in_dir/segments - fi - done | sort -k1 > $dest/segments - echo "$0: combined segments" -else - echo "$0 [info]: not combining segments as it does not exist" -fi - -for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do - exists_somewhere=false - absent_somewhere=false - for d in $*; do - if [ -f $d/$file ]; then - exists_somewhere=true - else - absent_somewhere=true - fi - done - - if ! $absent_somewhere; then - set -o pipefail - ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; - set +o pipefail - echo "$0: combined $file" - else - if ! $exists_somewhere; then - echo "$0 [info]: not combining $file as it does not exist" - else - echo "$0 [info]: **not combining $file as it does not exist everywhere**" - fi - fi -done - -tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt - -if [[ $dir_with_frame_shift ]]; then - cp $dir_with_frame_shift/frame_shift $dest -fi - -if ! 
$skip_fix ; then - tools/fix_data_dir.sh $dest || exit 1; -fi - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/compute-cer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/compute-cer.py deleted file mode 100644 index a0a8f8fe1f59251c5d8fefeb62ef469276fc6063..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/compute-cer.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import sys -import unicodedata -import codecs - -remove_tag = True -spacelist = [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - # https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. - sep = ' ' - if char == '<': - sep = '>' - j = i + 1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c == sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: - return '' - chars = [] - i = 0 - T = len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - if x.isalnum(): - for k in x: - new_sentence.append(k) - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i - 1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - 
min_dist = dist - min_error = error - dist = self.space[i][j - 1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, - 'ins': 0, 'del': 0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i={i} , j={j} , \ - error={error}'. 
- format(i=i, j=j, error=self.space[i][j]['error'])) - return result - - def overall(self) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def cluster(self, data) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [unicodedata.name(char) for char in word] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names) - 1) : - if unicode_names[i] != unicode_names[i + 1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) \ - and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] \ - [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \ - [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose = 1 - padding_symbol = ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose = 0 - try: - verbose = int(b) - except Exception: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol = ' ' - elif b == 'underline': - padding_symbol = '_' - continue - if True or sys.argv[1].startswith('-'): - # ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig = set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, - case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : 
- if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array) == 0: - continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length - len_lab) - space['rec'].append(length - len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end=' ') - else: - print('lab:', end=' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['lab'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end=' ') - else: - print('rec:', end=' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['rec'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===================================================' - '========================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster(k for k in default_clusters[cluster_id]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 
100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif (token[0] == '<' and token[len(token) - 1] == '>' and - cluster_id == ''): - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('=======================================' - '====================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/compute-wer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/compute-wer.py deleted file mode 100644 index a3eefc0dc7b67f252e685da71a5189312e74ef85..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/compute-wer.py +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import re, sys, unicodedata -import codecs - -remove_tag = True -spacelist= [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - #https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - 
j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array)==0: continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) 
- else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===========================================================================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster([ k for k in default_clusters[cluster_id] ]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - 
print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif token[0] == '<' and token[len(token)-1] == '>' and \ - cluster_id == '' : - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('===========================================================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/compute_cmvn_stats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/compute_cmvn_stats.py deleted file mode 100644 index 9c89789c47be0c855939469e86040f10398e9d89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/compute_cmvn_stats.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys -import argparse -import json -import codecs -import yaml - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.utils.data import Dataset, DataLoader - -torchaudio.set_audio_backend("sox_io") - - -class CollateFunc(object): - ''' Collate function for AudioDataset - ''' - - def __init__(self, feat_dim, resample_rate): - self.feat_dim = feat_dim - self.resample_rate = resample_rate - pass - - def __call__(self, batch): - mean_stat = torch.zeros(self.feat_dim) - var_stat = torch.zeros(self.feat_dim) - number = 0 - for item in batch: - value = item[1].strip().split(",") - assert len(value) == 3 or len(value) == 1 - wav_path = value[0] - sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate - resample_rate = sample_rate - # len(value) == 3 means segmented wav.scp, - # len(value) == 1 means original wav.scp - if len(value) == 3: - start_frame = int(float(value[1]) * sample_rate) - end_frame = int(float(value[2]) * sample_rate) - waveform, sample_rate = torchaudio.backend.sox_io_backend.load( - filepath=wav_path, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(item[1]) - - waveform = waveform * (1 << 15) - if self.resample_rate != 0 and self.resample_rate != sample_rate: - resample_rate = self.resample_rate - waveform = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - - mat = kaldi.fbank(waveform, - num_mel_bins=self.feat_dim, - dither=0.0, - energy_floor=0.0, - sample_frequency=resample_rate) - mean_stat += torch.sum(mat, axis=0) - var_stat += torch.sum(torch.square(mat), axis=0) - number += mat.shape[0] - return number, mean_stat, var_stat - - -class AudioDataset(Dataset): - def __init__(self, data_file): - self.items = [] - with codecs.open(data_file, 'r', encoding='utf-8') as f: - for line in f: - arr = line.strip().split() - self.items.append((arr[0], arr[1])) - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - return self.items[idx] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='extract CMVN stats') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for processing') - parser.add_argument('--train_config', - default='', - help='training yaml conf') - parser.add_argument('--in_scp', default=None, help='wav scp file') - 
parser.add_argument('--out_cmvn', - default='global_cmvn', - help='global cmvn file') - - doc = "Print log after every log_interval audios are processed." - parser.add_argument("--log_interval", type=int, default=1000, help=doc) - args = parser.parse_args() - - with open(args.train_config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - resample_rate = 0 - if 'resample_conf' in configs['dataset_conf']: - resample_rate = configs['dataset_conf']['resample_conf']['resample_rate'] - print('using resample and new sample rate is {}'.format(resample_rate)) - - collate_func = CollateFunc(feat_dim, resample_rate) - dataset = AudioDataset(args.in_scp) - batch_size = 20 - data_loader = DataLoader(dataset, - batch_size=batch_size, - shuffle=True, - sampler=None, - num_workers=args.num_workers, - collate_fn=collate_func) - - with torch.no_grad(): - all_number = 0 - all_mean_stat = torch.zeros(feat_dim) - all_var_stat = torch.zeros(feat_dim) - wav_number = 0 - for i, batch in enumerate(data_loader): - number, mean_stat, var_stat = batch - all_mean_stat += mean_stat - all_var_stat += var_stat - all_number += number - wav_number += batch_size - - if wav_number % args.log_interval == 0: - print(f'processed {wav_number} wavs, {all_number} frames', - file=sys.stderr, - flush=True) - - cmvn_info = { - 'mean_stat': list(all_mean_stat.tolist()), - 'var_stat': list(all_var_stat.tolist()), - 'frame_num': all_number - } - - with open(args.out_cmvn, 'w') as fout: - fout.write(json.dumps(cmvn_info)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/compute_fbank_feats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/compute_fbank_feats.py deleted file mode 100644 index 4cc7dae54de6e8b24b14148bd3930d19b4d7b28c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/compute_fbank_feats.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging - -import torchaudio -import torchaudio.compliance.kaldi as kaldi - -import wenet.dataset.kaldi_io as kaldi_io - -# The "sox" backends are deprecated and will be removed in 0.9.0 release. 
-# So here we use sox_io backend -torchaudio.set_audio_backend("sox_io") - - -def parse_opts(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--num_mel_bins', - default=80, - type=int, - help='Number of triangular mel-frequency bins') - parser.add_argument('--frame_length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument('--frame_shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument('--dither', - type=int, - default=0.0, - help='Dithering constant (0.0 means no dither)') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_scp', help='wav scp file') - parser.add_argument('out_ark', help='output ark file') - parser.add_argument('out_scp', help='output scp file') - args = parser.parse_args() - return args - - -# wav format: -def load_wav_scp(wav_scp_file): - wav_list = [] - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_list.append((arr[0], arr[1])) - return wav_list - - -# wav format: -def load_wav_scp_dict(wav_scp_file): - wav_dict = {} - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_dict[arr[0]] = arr[1] - return wav_dict - - -# Segments format: -def load_wav_segments(wav_scp_file, segments_file): - wav_dict = load_wav_scp_dict(wav_scp_file) - audio_list = [] - with open(segments_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - key = arr[0] - wav_file = wav_dict[arr[1]] - start = float(arr[2]) - end = float(arr[3]) - audio_list.append((key, wav_file, start, end)) - return audio_list - - -if __name__ == '__main__': - args = parse_opts() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - if args.segments is None: - audio_list = load_wav_scp(args.wav_scp) - else: - audio_list = load_wav_segments(args.wav_scp, args.segments) - - count = 0 - with open(args.out_ark, 'wb') as ark_fout, \ - open(args.out_scp, 'w', encoding='utf8') as scp_fout: - for item in audio_list: - if len(item) == 2: - key, wav_path = item - waveform, sample_rate = torchaudio.load_wav(wav_path) - else: - assert len(item) == 4 - key, wav_path, start, end = item - sample_rate = torchaudio.info(wav_path).sample_rate - frame_offset = int(start * sample_rate) - num_frames = int((end - start) * sample_rate) - waveform, sample_rate = torchaudio.load_wav( - wav_path, frame_offset, num_frames) - - mat = kaldi.fbank(waveform, - num_mel_bins=args.num_mel_bins, - frame_length=args.frame_length, - frame_shift=args.frame_shift, - dither=args.dither, - energy_floor=0.0, - sample_frequency=sample_rate) - mat = mat.detach().numpy() - kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout) - count += 1 - if count % 10000 == 0: - logging.info('Progress {}/{}'.format(count, len(audio_list))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/copy_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/copy_data_dir.sh deleted file mode 100644 index ee880c4c3ca398a58a4e306467c639b0a76310bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/copy_data_dir.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins University 
(author: Daniel Povey) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# feats.scp -# wav.scp -# vad.scp -# spk2utt -# utt2spk -# text -# -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance and/or speaker names. Note, the recording-ids stay the same. -# - - -# begin configuration section -spk_prefix= -utt_prefix= -spk_suffix= -utt_suffix= -validate_opts= # should rarely be needed. -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" - echo "Options" - echo " --spk-prefix= # Prefix for speaker ids, default empty" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --spk-suffix= # Suffix for speaker ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -srcdir=$1 -destdir=$2 - -if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" - exit 1; -fi - -if [ "$destdir" == "$srcdir" ]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -set -e; - -mkdir -p $destdir - -cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map -cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map - -if [ ! -f $srcdir/utt2uniq ]; then - if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then - cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq - fi -else - cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq -fi - -cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ - utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk - -utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt - -if [ -f $srcdir/feats.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp -fi - -if [ -f $srcdir/vad.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp -fi - -if [ -f $srcdir/segments ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments - cp $srcdir/wav.scp $destdir -else # no segments->wav indexed by utt. 
- if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/utt2num_frames ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in frame_shift stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -echo $validate_opts -echo $destdir -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/decode.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/decode.sh deleted file mode 100644 index 1d49b0e48631f4818fb9c464df66904170275a33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/decode.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: binbinzhang@mobvoi.com (Binbin Zhang) -export GLOG_logtostderr=1 -export GLOG_v=2 - -set -e - -nj=1 -chunk_size=-1 -ctc_weight=0.0 -reverse_weight=0.0 -rescoring_weight=1.0 -# For CTC WFST based decoding -fst_path= -dict_path= -acoustic_scale=1.0 -beam=15.0 -lattice_beam=12.0 -min_active=200 -max_active=7000 -blank_skip_thresh=1.0 -length_penalty=0.0 - -. tools/parse_options.sh || exit 1; -if [ $# != 5 ]; then - echo "Usage: $0 [options] " - exit 1; -fi - -if ! which decoder_main > /dev/null; then - echo "decoder_main is not built, please go to runtime/libtorch to build it." - exit 1; -fi - -scp=$1 -label_file=$2 -model_file=$3 -unit_file=$4 -dir=$5 - -mkdir -p $dir/split${nj} - -# Step 1. Split wav.scp -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp" -done -tools/data/split_scp.pl ${scp} ${split_scps} - -# Step 2. Parallel decoding -wfst_decode_opts= -if [ ! 
-z $fst_path ]; then - wfst_decode_opts="--fst_path $fst_path" - wfst_decode_opts="$wfst_decode_opts --beam $beam" - wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path" - wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam" - wfst_decode_opts="$wfst_decode_opts --max_active $max_active" - wfst_decode_opts="$wfst_decode_opts --min_active $min_active" - wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale" - wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh" - wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty" - echo $wfst_decode_opts > $dir/config -fi -for n in $(seq ${nj}); do -{ - decoder_main \ - --rescoring_weight $rescoring_weight \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --chunk_size $chunk_size \ - --wav_scp ${dir}/split${nj}/wav.${n}.scp \ - --model_path $model_file \ - --unit_path $unit_file \ - $wfst_decode_opts \ - --result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log -} & -done -wait - -# Step 3. Merge files -for n in $(seq ${nj}); do - cat ${dir}/split${nj}/${n}.text -done > ${dir}/text -tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf - -# Step 4. Compute WER -python3 tools/compute-wer.py --char=1 --v=1 \ - $label_file $dir/text > $dir/wer diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/feat_to_shape.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/feat_to_shape.sh deleted file mode 100644 index ab6d45c60709dd05a38f8da269d617233d0d39f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/feat_to_shape.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Begin configuration section. -nj=4 -cmd=run.pl -verbose=0 -filetype="" -preprocess_conf="" -# End configuration section. - -help_message=$(cat << EOF -Usage: $0 [options] [] -e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) - -echo "$0 $*" 1>&2 # Print the command line for logging - -. parse_options.sh || exit 1; - -if [ $# -lt 2 ] || [ $# -gt 3 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -scp=$1 -outscp=$2 -data=$(dirname ${scp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${logdir}/feats.${n}.scp" -done - -utils/split_scp.pl ${scp} ${split_scps} - -if [ -n "${preprocess_conf}" ]; then - preprocess_opt="--preprocess-conf ${preprocess_conf}" -else - preprocess_opt="" -fi -if [ -n "${filetype}" ]; then - filetype_opt="--filetype ${filetype}" -else - filetype_opt="" -fi - -${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ - feat-to-len --verbose=${verbose} \ - scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp - -feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -) - -# concatenate the .scp files together. 
-for n in $(seq ${nj}); do - sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp -done > ${outscp} - -rm -f ${logdir}/feats.*.scp 2>/dev/null diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/filter_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/filter_scp.pl deleted file mode 100644 index b76d37f41be0886470281978bfacf97f6b8ae976..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/filter_scp.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose "n-th" field is an utterance id), printing -# out only those lines whose "n-th" field is in id_list. The index of -# the "n-th" field is 1, by default, but can be changed by using -# the -f switch - -$exclude = 0; -$field = 1; -$shifted = 0; - -do { - $shifted=0; - if ($ARGV[0] eq "--exclude") { - $exclude = 1; - shift @ARGV; - $shifted=1; - } - if ($ARGV[0] eq "-f") { - $field = $ARGV[1]; - shift @ARGV; shift @ARGV; - $shifted=1 - } -} while ($shifted); - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . - "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . - "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . - "only the lines that were *not* in id_list.\n" . - "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . - "If your older scripts (written before Oct 2014) stopped working and you used the\n" . - "-f option, add 1 to the argument.\n" . - "See also: utils/filter_scp.pl .\n"; -} - - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -if ($field == 1) { # Treat this as special case, since it is common. - while(<>) { - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { - print $_; - } - } -} else { - while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { - print $_; - } - } -} - -# tests: -# the following should print "foo 1" -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) -# the following should print "bar 2". 
-# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fix_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fix_data_dir.sh deleted file mode 100644 index d1644c1cac4264c78eae7d91b03c4126baf7ec4c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fix_data_dir.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# This script makes sure that only the segments present in -# all of "feats.scp", "wav.scp" [if present], segments [if present] -# text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into -# data-dir/.backup - -cmd="$@" - -utt_extra_files= -spk_extra_files= - -. tools/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: utils/data/fix_data_dir.sh " - echo "e.g.: utils/data/fix_data_dir.sh data/train" - echo "This script helps ensure that the various files in a data directory" - echo "are correctly sorted and filtered, for example removing utterances" - echo "that have no features (if feats.scp is present)" - exit 1 -fi - -data=$1 - -if [ -f $data/images.scp ]; then - image/fix_data_dir.sh $cmd - exit $? -fi - -mkdir -p $data/.backup - -[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; - -[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; - -set -e -o pipefail -u - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted { - file=$1 - sort -k1,1 -u <$file >$file.tmp - if ! cmp -s $file $file.tmp; then - echo "$0: file $1 is not in sorted order or not unique, sorting it" - mv $file.tmp $file - else - rm $file.tmp - fi -} - -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - check_sorted $data/$x - fi -done - - -function filter_file { - filter=$1 - file_to_filter=$2 - cp $file_to_filter ${file_to_filter}.tmp - tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter - if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=$(cat ${file_to_filter}.tmp | wc -l) - length2=$(cat ${file_to_filter} | wc -l) - if [ $length1 -ne $length2 ]; then - echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." - fi - fi - rm $file_to_filter.tmp -} - -function filter_recordings { - # We call this once before the stage when we filter on utterance-id, and once - # after. - - if [ -f $data/segments ]; then - # We have a segments file -> we need to filter this and the file wav.scp, and - # reco2file_and_utt, if it exists, to make sure they have the same list of - # recording-ids. - - if [ ! -f $data/wav.scp ]; then - echo "$0: $data/segments exists but not $data/wav.scp" - exit 1; - fi - awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=$(cat $tmpdir/recordings | wc -l) - [ ! -s $tmpdir/recordings ] && \ - echo "Empty list of recordings (bad file $data/segments)?" 
&& exit 1; - tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp - mv $tmpdir/recordings.tmp $tmpdir/recordings - - - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - filter_file $tmpdir/recordings $data/segments - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - rm $data/segments.tmp - - filter_file $tmpdir/recordings $data/wav.scp - [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur - true - fi -} - -function filter_speakers { - # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... - tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do - f=$data/$s - if [ -f $f ]; then - filter_file $f $tmpdir/speakers - fi - done - - filter_file $tmpdir/speakers $data/spk2utt - tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - - for s in cmvn.scp spk2gender $spk_extra_files; do - f=$data/$s - if [ -f $f ]; then - filter_file $tmpdir/speakers $f - fi - done -} - -function filter_utts { - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - echo "$(cat $tmpdir/utts | wc -l)" - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; - - ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; - - ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ - echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - - if [ -f $data/utt2uniq ]; then - ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ - echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; - fi - - maybe_wav= - maybe_reco2dur= - [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. - [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - - maybe_utt2dur= - if [ -f $data/utt2dur ]; then - cat $data/utt2dur | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1 - maybe_utt2dur=utt2dur.ok - fi - - maybe_utt2num_frames= - if [ -f $data/utt2num_frames ]; then - cat $data/utt2num_frames | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1 - maybe_utt2num_frames=utt2num_frames.ok - fi - - for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do - if [ -f $data/$x ]; then - tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp - echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)" - mv $tmpdir/utts.tmp $tmpdir/utts - # echo "$tmpdir/utts" - fi - done - rm $data/utt2dur.ok 2>/dev/null || true - rm $data/utt2num_frames.ok 2>/dev/null || true - - [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ - rm $tmpdir/utts && exit 1; - - - if [ -f $data/utt2spk ]; then - new_nutts=$(cat $tmpdir/utts | wc -l) - old_nutts=$(cat $data/utt2spk | wc -l) - if [ $new_nutts -ne $old_nutts ]; then - echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" - else - echo "fix_data_dir.sh: kept all $old_nutts utterances." 
- fi - fi - - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then - tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x - fi - fi - done - -} - -filter_recordings -filter_speakers -filter_utts -filter_speakers -filter_recordings - -tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - -echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/flake8_hook.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/flake8_hook.py deleted file mode 100644 index bbe21bf4aa8ab460aca0eba5a24785e4d6b2c39d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -import sys - -from flake8.main import git - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/format_data.sh deleted file mode 100644 index 51f4602dfa0bac7873541c7f621ef4bb9eb29c94..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/format_data.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Mobvoi Corporation (Author: Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -echo "$0 $*" >&2 # Print the command line for logging -. ./path.sh - -nj=1 -cmd=run.pl -nlsyms="" -lang="" -feat="" -feat_type="kaldi" -oov="" -bpecode="" -allow_one_column=false -raw="" -verbose=0 -trans_type=char -filetype="" -preprocess_conf="" -category="" -out="" # If omitted, write in stdout -help_message=$(cat << EOF -Usage: $0 -e.g. $0 data/train data/lang_1char/train_units.txt -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --feat # feat.scp or feat1.scp,feat2.scp,... - --feat-type # kaldi or wav - --oov # Default: - --out # If omitted, write in stdout - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) -. tools/parse_options.sh - -if [ $# != 2 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -dir=$1 -dic=$2 -tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) -#trap 'rm -rf ${tmpdir}' EXIT - -# 1. 
Create scp files for inputs -# These are not necessary for decoding mode, and make it as an option -input= -if [ -n "${feat}" ]; then - _feat_scps=$(echo "${feat}" | tr ',' ' ' ) - read -r -a feat_scps <<< $_feat_scps - num_feats=${#feat_scps[@]} - - for (( i=1; i<=num_feats; i++ )); do - feat=${feat_scps[$((i-1))]} - mkdir -p ${tmpdir}/input_${i} - input+="input_${i} " - cat ${feat} > ${tmpdir}/input_${i}/feat.scp - - # Dump in the "legacy" style JSON format - if [ -n "${filetype}" ]; then - awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ - > ${tmpdir}/input_${i}/filetype.scp - fi - - if [ ${feat_type} == "kaldi" ]; then - tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ - --filetype "${filetype}" \ - --preprocess-conf "${preprocess_conf}" \ - --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp - elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then - if [ -f $dir/segments ]; then - # used for segmented wav.scp - awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur - fi - if [ ! -f $dir/utt2dur ]; then - tools/wav_to_duration.sh --nj ${nj} \ - ${feat} ${tmpdir}/input_${i}/shape.scp - # use the existed utt2dur as shape.scp directly - else - cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp - fi - fi - done -fi - -# 2. Create scp files for outputs -mkdir -p ${tmpdir}/output -if [ -n "${bpecode}" ]; then - if [ "${trans_type}" == "cn_char_en_bpe" ]; then - tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp - else - paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \ - | tools/spm_encode --model=${bpecode} --output_format=piece) \ - > ${tmpdir}/output/token.scp - fi -elif [ -n "${nlsyms}" ]; then - tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -elif [ -n "${raw}" ]; then - cat $dir/text > ${tmpdir}/output/token.scp -else - tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -fi -< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp -odim=$(cat ${dic} | wc -l) -< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp - -cat ${dir}/text > ${tmpdir}/output/text.scp - -# 3. Create scp files for the others -mkdir -p ${tmpdir}/other -if [ -n "${lang}" ]; then - awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp -fi - -if [ -n "${category}" ]; then - awk -v category=${category} '{print $1 " " category}' ${dir}/text \ - > ${tmpdir}/other/category.scp -fi -#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp - -# 4. 
Merge scp files into a one file -opts="" -for intype in ${input} output other; do - if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then - continue - fi - - if [ ${intype} != other ]; then - opts+="--${intype%_*}-scps " - else - opts+="--scps " - fi - - for x in "${tmpdir}/${intype}"/*.scp; do - k=$(basename ${x} .scp) - if [ ${k} = shape ]; then - opts+="shape:${x}:shape " - else - opts+="${k}:${x} " - fi - done -done - -if ${allow_one_column}; then - opts+="--allow-one-column true " -else - opts+="--allow-one-column false " -fi - -if [ -n "${out}" ]; then - opts+="-O ${out}" -fi - -tools/merge_scp2txt.py --verbose ${verbose} ${opts} - -#rm -fr ${tmpdir} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/add_lex_disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/add_lex_disambig.pl deleted file mode 100644 index dd8a25de6e1140a6d19b1e876f2e76f528532edf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/add_lex_disambig.pl +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2013-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. -# With the --pron-probs option, expects the second field -# of each lexicon line to be a pron-prob. -# With the --sil-probs option, expects three additional -# fields after the pron-prob, representing various components -# of the silence probability model. - -$pron_probs = 0; -$sil_probs = 0; -$first_allowed_disambig = 1; - -for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { - if ($ARGV[0] eq "--pron-probs") { - $pron_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--sil-probs") { - $sil_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--first-allowed-disambig") { - $first_allowed_disambig = 0 + $ARGV[1]; - if ($first_allowed_disambig < 1) { - die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; - } - shift @ARGV; - shift @ARGV; - } -} - -if (@ARGV != 2) { - die "Usage: add_lex_disambig.pl [opts] \n" . - "This script adds disambiguation symbols to a lexicon in order to\n" . - "make decoding graphs determinizable; it adds pseudo-phone\n" . - "disambiguation symbols #1, #2 and so on at the ends of phones\n" . - "to ensure that all pronunciations are different, and that none\n" . - "is a prefix of another.\n" . - "It prints to the standard output the number of the largest-numbered" . - "disambiguation symbol that was used.\n" . - "\n" . - "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . 
- " --sil-probs [should be with --pron-probs option]\n" . - " Expect 3 extra fields after the pron-probs, for aspects of\n" . - " the silence probability model\n" . - " --first-allowed-disambig The number of the first disambiguation symbol\n" . - " that this script is allowed to add. By default this is\n" . - " #1, but you can set this to a larger value using this option.\n" . - "e.g.:\n" . - " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { - $p = shift @A; - if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } - } - if ($sil_probs) { - $silp = shift @A; - if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - } - if (!(@A)) { - die "Bad lexicon line $1, no phone in phone list"; - } - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that it exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { shift @A; } # remove pron-prob. - if ($sil_probs) { - shift @A; # Remove silprob - shift @A; # Remove silprob - } - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -# max_disambig will always be the highest-numbered disambiguation symbol that -# has been used so far. -$max_disambig = $first_allowed_disambig - 1; - -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - if ($pron_probs) { - $pron_prob = shift @A; - } - if ($sil_probs) { - $sil_word_prob = shift @A; - $word_sil_correction = shift @A; - $prev_nonsil_correction = shift @A - } - $phnseq = join(" ", @A); - if (!defined $issubseq{$phnseq} - && $count{$phnseq} == 1) { - ; # Do nothing. - } else { - if ($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved_for_the_empty_string{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; - if (!defined $cur_disambig) { - $cur_disambig = $first_allowed_disambig; - } else { - $cur_disambig++; # Get a number that has not been used yet for - # this phone sequence. - } - while (defined $reserved_for_the_empty_string{$cur_disambig}) { - $cur_disambig++; - } - if ($cur_disambig > $max_disambig) { - $max_disambig = $cur_disambig; - } - $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; - $phnseq = $phnseq . " #" . 
$cur_disambig; - } - } - if ($pron_probs) { - if ($sil_probs) { - print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; - } else { - print O "$word\t$pron_prob\t$phnseq\n"; - } - } else { - print O "$word\t$phnseq\n"; - } -} - -print $max_disambig . "\n"; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/compile_lexicon_token_fst.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/compile_lexicon_token_fst.sh deleted file mode 100644 index b67814fe3f3244b14b8e494bfe46c4829c4f8bd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/compile_lexicon_token_fst.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -# Copyright 2015 Yajie Miao (Carnegie Mellon University) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the -# phoneme and character-based lexicons. -set -eo pipefail -. tools/parse_options.sh - -if [ $# -ne 3 ]; then - echo "usage: tools/fst/compile_lexicon_token_fst.sh " - echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" - echo " should contain the following files:" - echo "lexicon.txt units.txt" - echo "options: " - exit 1; -fi - -srcdir=$1 -tmpdir=$2 -dir=$3 -mkdir -p $dir $tmpdir - -[ -f path.sh ] && . ./path.sh - -export LC_ALL=C - -cp $srcdir/units.txt $dir - -# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. -# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. -perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; - -# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. -# Without these symbols, determinization will fail. -ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` -ndisambig=$[$ndisambig+1]; - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list - -# Get the full list of CTC tokens used in FST. These tokens include , the blank , -# the actual model unit, and the disambiguation symbols. -cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list -(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt - -# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, -# so here use ctc_token_fst_compact -tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; - -# Encode the words with indices. 
Will be used in lexicon and language model FST compiling. -cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' - BEGIN { - print " 0"; - } - { - printf("%s %d\n", $1, NR); - } - END { - printf("#0 %d\n", NR+1); - printf(" %d\n", NR+2); - printf(" %d\n", NR+3); - }' > $dir/words.txt || exit 1; - -# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time. -token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` - -tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ - fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; - -echo "Lexicon and token FSTs compiling succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/ctc_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/ctc_token_fst.py deleted file mode 100644 index d81644b9cd216177a10a17772781d3293abe084f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/ctc_token_fst.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 1 ') -print('1 1 ') -print('2 2 ') -print('2 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 3 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(1, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 2, '', '')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/ctc_token_fst_compact.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/ctc_token_fst_compact.py deleted file mode 100644 index d3018d8b14ce25108cb1acc637cecded5d41be13..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/ctc_token_fst_compact.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 1 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 0, '', '')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/ctc_token_fst_corrected.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/ctc_token_fst_corrected.py deleted file mode 100644 index 81f7079eccb9e6447c46cdfdf6378aca7efe4a09..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/ctc_token_fst_corrected.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python - -import sys - - -def 
il(n): - return n + 1 - - -def ol(n): - return n + 1 - - -def s(n): - return n - - -if __name__ == "__main__": - with open(sys.argv[1]) as f: - lines = f.readlines() - phone_count = 0 - disambig_count = 0 - for line in lines: - sp = line.split() - phone = sp[0] - if phone == '' or phone == '': - continue - if phone.startswith('#'): - disambig_count += 1 - else: - phone_count += 1 - - # 1. add start state - print('0 0 {} 0'.format(il(0))) - - # 2. 0 -> i, i -> i, i -> 0 - for i in range(1, phone_count + 1): - print('0 {} {} {}'.format(s(i), il(i), ol(i))) - print('{} {} {} 0'.format(s(i), s(i), il(i))) - print('{} 0 {} 0'.format(s(i), il(0))) - - # 3. i -> other phone - for i in range(1, phone_count + 1): - for j in range(1, phone_count + 1): - if i != j: - print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) - - # 4. add disambiguous arcs on every final state - for i in range(0, phone_count + 1): - for j in range(phone_count + 2, phone_count + disambig_count + 2): - print('{} {} {} {}'.format(s(i), s(i), 0, j)) - - # 5. every i is final state - for i in range(0, phone_count + 1): - print(s(i)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/eps2disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/eps2disambig.pl deleted file mode 100644 index e1d84a6bf56703596a0e4552d184f7168f724bcb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/eps2disambig.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - if (/\s+#0\s+/) { - print STDERR "$0: ERROR: LM has word #0, " . - "which is reserved as disambiguation symbol\n"; - exit 1; - } - s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; - print; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/make_lexicon_fst.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/make_lexicon_fst.pl deleted file mode 100644 index f97129c05cb3ba6460be401e92001261acfaf746..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/make_lexicon_fst.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). - -$pron_probs = 0; - -if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { - $pron_probs = 1; - shift @ARGV; -} - -if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; - print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; - print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; - print STDERR " word phone1 phone2 ... phoneN;\n"; - print STDERR "if the --pron-probs option is used, each line is:\n"; - print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; - print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; - print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; - print STDERR "this is your responsibility.\n\n"; - print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; - print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; - print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; - exit(1); -} - -$lexfn = shift @ARGV; -if (@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2) { - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if ($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - -if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. - while () { - @A = split(" ", $_); - @A == 0 && die "Empty lexicon line."; - foreach $a (@A) { - if ($a eq "") { - die "Bad lexicon line $_ ( is forbidden)"; - } - } - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; # so we only print it on the first arc of the word. - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. 
- if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while () { - @A = split(" ", $_); - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. - $s = $ns; - } elsif (!defined($silphone) || $p ne $silphone) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/make_tlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/make_tlg.sh deleted file mode 100644 index 98694e5540968760f0c27eaf30a6668f4c46c50d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/make_tlg.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# - -if [ -f path.sh ]; then . path.sh; fi - -lm_dir=$1 -src_lang=$2 -tgt_lang=$3 - -arpa_lm=${lm_dir}/lm.arpa -[ ! 
-f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -rm -rf $tgt_lang -cp -r $src_lang $tgt_lang - -# Compose the language model to FST -cat $arpa_lm | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v -i '' | \ - grep -v -i '' | \ - arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \ - tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \ - --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst - - -echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic $tgt_lang/G.fst - -# Compose the token, lexicon and language-model FST into the final decoding graph -fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1; -fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1; - -echo "Composing decoding graph TLG.fst succeeded" -#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/prepare_dict.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/prepare_dict.py deleted file mode 100644 index 8a6a3cfe7cfded0c863637deef0bae2f2ede5557..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/prepare_dict.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -# sys.argv[1]: e2e model unit file(lang_char.txt) -# sys.argv[2]: raw lexicon file -# sys.argv[3]: output lexicon file -# sys.argv[4]: bpemodel - -unit_table = set() -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for line in fin: - unit = line.split()[0] - unit_table.add(unit) - - -def contain_oov(units): - for unit in units: - if unit not in unit_table: - return True - return False - - -bpemode = len(sys.argv) > 4 -if bpemode: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.Load(sys.argv[4]) -lexicon_table = set() -with open(sys.argv[2], 'r', encoding='utf8') as fin, \ - open(sys.argv[3], 'w', encoding='utf8') as fout: - for line in fin: - word = line.split()[0] - if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel - continue - elif word == '': - continue - else: - # each word only has one pronunciation for e2e system - if word in lexicon_table: - continue - if bpemode: - # We assume that the lexicon does not contain code-switch, - # i.e. the word contains both English and Chinese. - # see PR https://github.com/wenet-e2e/wenet/pull/1693 - # and Issue https://github.com/wenet-e2e/wenet/issues/1653 - if word.encode('utf8').isalpha(): - pieces = sp.EncodeAsPieces(word) - else: - pieces = word - if contain_oov(pieces): - print( - 'Ignoring words {}, which contains oov unit'.format( - ''.join(word).strip('▁')) - ) - continue - chars = ' '.join( - [p if p in unit_table else '' for p in pieces]) - else: - # ignore words with OOV - if contain_oov(word): - print('Ignoring words {}, which contains oov unit'.format(word)) - continue - # Optional, append ▁ in front of english word - # we assume the model unit of our e2e system is char now. 
- if word.encode('utf8').isalpha() and '▁' in unit_table: - word = '▁' + word - chars = ' '.join(word) # word is a char list - fout.write('{} {}\n'.format(word, chars)) - lexicon_table.add(word) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/remove_oovs.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/remove_oovs.pl deleted file mode 100644 index ac914c3bd9363eded791cdeb309fd05e980c4f2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/remove_oovs.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script removes lines that contain these OOVs on either the -# third or fourth fields of the line. It is intended to remove arcs -# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). - -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; -} - -$unklist = shift @ARGV; -open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; -while(){ - @A = split(" ", $_); - @A == 1 || die "Bad line in unknown-symbol list: $_"; - $unk{$A[0]} = 1; -} - -$num_removed = 0; -while(<>){ - @A = split(" ", $_); - if(defined $unk{$A[2]} || defined $unk{$A[3]}) { - $num_removed++; - } else { - print; - } -} -print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/rnnt_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/rnnt_token_fst.py deleted file mode 100644 index cc6def1703311ab700a4a01f22c1adda32db9b0d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/rnnt_token_fst.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, 0, phone, phone)) -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/s2eps.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/s2eps.pl deleted file mode 100644 index ffeeb8eb6af3c4f319f31ebff80be388d8f59e1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/fst/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not 
use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces and with (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } - if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } - } - print join("\t", @A) . "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/git-pre-commit b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/git-pre-commit deleted file mode 100644 index b6e448ed375a0ddf502ce332685de8a99e88dc08..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/git-pre-commit +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -e - -echo "Running pre-commit flake8" -python tools/flake8_hook.py diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/install_srilm.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/install_srilm.sh deleted file mode 100644 index 4aa113c14722a73fd3d3f84430025d44173c207b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/install_srilm.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2022 Binbin Zhang(binbzha@qq.com) - -current_path=`pwd` -current_dir=`basename "$current_path"` - -if [ "tools" != "$current_dir" ]; then - echo "You should run this script in tools/ directory!!" - exit 1 -fi - -! command -v gawk > /dev/null && \ - echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; - -srilm_url="https://github.com/BitSpeech/SRILM/archive/refs/tags/1.7.3.tar.gz" - -if [ ! -f ./srilm.tar.gz ]; then - if ! wget -O ./srilm.tar.gz "$srilm_url"; then - echo 'There was a problem downloading the file.' - echo 'Check you internet connection and try again.' - exit 1 - fi -fi - -tar -zxvf srilm.tar.gz -mv SRILM-1.7.3 srilm - -# set the SRILM variable in the top-level Makefile to this directory. -cd srilm -cp Makefile tmpf - -cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \ - > Makefile || exit 1 -rm tmpf - -make || exit -cd .. - -( - [ ! -z "${SRILM}" ] && \ - echo >&2 "SRILM variable is aleady defined. Undefining..." && \ - unset SRILM - - [ -f ./env.sh ] && . ./env.sh - - [ ! 
-z "${SRILM}" ] && \ - echo >&2 "SRILM config is already in env.sh" && exit - - wd=`pwd` - wd=`readlink -f $wd || pwd` - - echo "export SRILM=$wd/srilm" - dirs="\${PATH}" - for directory in $(cd srilm && find bin -type d ) ; do - dirs="$dirs:\${SRILM}/$directory" - done - echo "export PATH=$dirs" -) >> env.sh - -echo >&2 "Installation of SRILM finished successfully" -echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/k2/make_hlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/k2/make_hlg.sh deleted file mode 100644 index 18c2268487410824ae11b199cf06f37acd717c88..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/k2/make_hlg.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) - -lexion_dir=$1 -lm_dir=$2 -tgt_dir=$3 - -# k2 and icefall updates very fast. Below commits are veryfied in this script. -# k2 3dc222f981b9fdbc8061b3782c3b385514a2d444, icefall 499ac24ecba64f687ff244c7d66baa5c222ecf0f - -# For k2 installation, please refer to https://github.com/k2-fsa/k2/ -python -c "import k2; print(k2.__file__)" -python -c "import torch; import _k2; print(_k2.__file__)" - -# Prepare necessary icefall scripts -if [ ! -d tools/k2/icefall ]; then - git clone --depth 1 https://github.com/k2-fsa/icefall.git tools/k2/icefall -fi -pip3 install -r tools/k2/icefall/requirements.txt -export PYTHONPATH=`pwd`/tools/k2/icefall:`pwd`/tools/k2/icefall/egs/aishell/ASR/local:$PYTHONPATH - -# 8.1 Prepare char based lang -mkdir -p $tgt_dir -python tools/k2/prepare_char.py $lexion_dir/units.txt $lm_dir/wordlist $tgt_dir -echo "Compile lexicon L.pt L_disambig.pt succeeded" - -# 8.2 Prepare G -mkdir -p data/lm -python -m kaldilm \ - --read-symbol-table="$tgt_dir/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $lm_dir/lm.arpa > data/lm/G_3_gram.fst.txt - -# 8.3 Compile HLG -python tools/k2/icefall/egs/aishell/ASR/local/compile_hlg.py --lang-dir $tgt_dir -echo "Compile decoding graph HLG.pt succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/k2/prepare_char.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/k2/prepare_char.py deleted file mode 100644 index 6e05042c42eb280135f6be7cdb3566b185258b90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/k2/prepare_char.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -""" - -This script generates the following files in the directory sys.argv[3]: - - - lexicon.txt - - lexicon_disambig.txt - - L.pt - - L_disambig.pt - - tokens.txt - - words.txt -""" - -import sys -from pathlib import Path -from typing import Dict, List - -import k2 -import torch -from prepare_lang import ( - Lexicon, - add_disambig_symbols, - add_self_loops, - write_lexicon, - write_mapping, -) - - -def lexicon_to_fst_no_sil( - lexicon: Lexicon, - token2id: Dict[str, int], - word2id: Dict[str, int], - need_self_loops: bool = False, -) -> k2.Fsa: - """Convert a lexicon to an FST (in k2 format). - - Args: - lexicon: - The input lexicon. See also :func:`read_lexicon` - token2id: - A dict mapping tokens to IDs. - word2id: - A dict mapping words to IDs. - need_self_loops: - If True, add self-loop to states with non-epsilon output symbols - on at least one arc out of the state. The input label for this - self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. - Returns: - Return an instance of `k2.Fsa` representing the given lexicon. - """ - loop_state = 0 # words enter and leave from here - next_state = 1 # the next un-allocated state, will be incremented as we go - - arcs = [] - - # The blank symbol is defined in local/train_bpe_model.py - assert token2id[""] == 0 - assert word2id[""] == 0 - - eps = 0 - - for word, pieces in lexicon: - assert len(pieces) > 0, f"{word} has no pronunciations" - cur_state = loop_state - - word = word2id[word] - pieces = [ - token2id[i] if i in token2id else token2id[""] for i in pieces - ] - - for i in range(len(pieces) - 1): - w = word if i == 0 else eps - arcs.append([cur_state, next_state, pieces[i], w, 0]) - - cur_state = next_state - next_state += 1 - - # now for the last piece of this word - i = len(pieces) - 1 - w = word if i == 0 else eps - arcs.append([cur_state, loop_state, pieces[i], w, 0]) - - if need_self_loops: - disambig_token = token2id["#0"] - disambig_word = word2id["#0"] - arcs = add_self_loops( - arcs, - disambig_token=disambig_token, - disambig_word=disambig_word, - ) - - final_state = next_state - arcs.append([loop_state, final_state, -1, -1, 0]) - arcs.append([final_state]) - - arcs = sorted(arcs, key=lambda arc: arc[0]) - arcs = [[str(i) for i in arc] for arc in arcs] - arcs = [" ".join(arc) for arc in arcs] - arcs = "\n".join(arcs) - - fsa = k2.Fsa.from_str(arcs, acceptor=False) - return fsa - - -def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool: - """Check if all the given tokens are in token symbol table. - - Args: - token_sym_table: - Token symbol table that contains all the valid tokens. - tokens: - A list of tokens. - Returns: - Return True if there is any token not in the token_sym_table, - otherwise False. - """ - for tok in tokens: - if tok not in token_sym_table: - return True - return False - - -def generate_lexicon( - token_sym_table: Dict[str, int], words: List[str] -) -> Lexicon: - """Generate a lexicon from a word list and token_sym_table. - - Args: - token_sym_table: - Token symbol table that mapping token to token ids. - words: - A list of strings representing words. - Returns: - Return a dict whose keys are words and values are the corresponding - tokens. 
- """ - lexicon = [] - for word in words: - chars = list(word.strip(" \t")) - if contain_oov(token_sym_table, chars): - continue - lexicon.append((word, chars)) - - # The OOV word is - lexicon.append(("", [""])) - return lexicon - - -def generate_tokens(text_file: str) -> Dict[str, int]: - """Generate tokens from the given text file. - - Args: - text_file: - A file that contains text lines to generate tokens. - Returns: - Return a dict whose keys are tokens and values are token ids ranged - from 0 to len(keys) - 1. - """ - token2id: Dict[str, int] = dict() - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - char, index = line.replace('\n', '').split() - assert char not in token2id - token2id[char] = int(index) - assert token2id[''] == 0 - return token2id - - -def generate_words(text_file: str) -> Dict[str, int]: - """Generate words from the given text file. - - Args: - text_file: - A file that contains text lines to generate words. - Returns: - Return a dict whose keys are words and values are words ids ranged - from 0 to len(keys) - 1. - """ - words = [] - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - word = line.replace('\n', '') - assert word not in words - words.append(word) - words.sort() - - # We put '' '' at begining of word2id - # '#0', '', '' at end of word2id - words = [word for word in words - if word not in ['', '', '#0', '', '']] - words.insert(0, '') - words.insert(1, '') - words.append('#0') - words.append('') - words.append('') - word2id = {j: i for i, j in enumerate(words)} - return word2id - - -def main(): - token2id = generate_tokens(sys.argv[1]) - word2id = generate_words(sys.argv[2]) - tgt_dir = Path(sys.argv[3]) - - words = [word for word in word2id.keys() - if word not in - ["", "!SIL", "", "", "#0", "", ""]] - lexicon = generate_lexicon(token2id, words) - - lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) - next_token_id = max(token2id.values()) + 1 - for i in range(max_disambig + 1): - disambig = f"#{i}" - assert disambig not in token2id - token2id[disambig] = next_token_id - next_token_id += 1 - - write_mapping(tgt_dir / "tokens.txt", token2id) - write_mapping(tgt_dir / "words.txt", word2id) - write_lexicon(tgt_dir / "lexicon.txt", lexicon) - write_lexicon(tgt_dir / "lexicon_disambig.txt", lexicon_disambig) - - L = lexicon_to_fst_no_sil( - lexicon, - token2id=token2id, - word2id=word2id, - ) - L_disambig = lexicon_to_fst_no_sil( - lexicon_disambig, - token2id=token2id, - word2id=word2id, - need_self_loops=True, - ) - torch.save(L.as_dict(), tgt_dir / "L.pt") - torch.save(L_disambig.as_dict(), tgt_dir / "L_disambig.pt") - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/latency_metrics.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/latency_metrics.py deleted file mode 100644 index df2d8eee45f8e2d7c8536f208d44fafaeac3341f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/latency_metrics.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2022 Horizon Inc. (author: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import argparse -import logging -import librosa -import torch -import torchaudio -import yaml - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.font_manager as fm -import torchaudio.compliance.kaldi as kaldi - -from wenet.utils.init_model import init_model -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.mask import make_pad_mask -from wenet.utils.common import replace_duplicates_with_blank - - -def get_args(): - parser = argparse.ArgumentParser( - description='Analyze latency and plot CTC-Spike.') - parser.add_argument('--config', required=True, - type=str, help='configration') - parser.add_argument('--gpu', - type=int, - default=0, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--ckpt', required=True, - type=str, help='model checkpoint') - parser.add_argument('--tag', required=True, - type=str, help='image subtitle') - parser.add_argument('--wavscp', required=True, - type=str, help='wav.scp') - parser.add_argument('--alignment', required=True, - type=str, help='force alignment, generated by Kaldi.') - parser.add_argument('--chunk_size', required=True, - type=int, help='chunk size') - parser.add_argument('--left_chunks', default=-1, - type=int, help='left chunks') - parser.add_argument('--font', required=True, - type=str, help='font file') - parser.add_argument('--dict', required=True, - type=str, help='dict file') - parser.add_argument('--result_dir', required=True, - type=str, help='saving pdf') - parser.add_argument('--model_type', default='ctc', - choices=['ctc', 'transducer'], - help='show latency metrics from ctc models or rnn-t models') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - torch.manual_seed(777) - - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - symbol_table = read_symbol_table(args.dict) - char_dict = {v: k for k, v in symbol_table.items()} - - # 1. Load model - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - - model = init_model(conf) - load_checkpoint(model, args.ckpt) - model = model.eval().to(device) - - subsampling = model.encoder.embed.subsampling_rate - eos = model.eos_symbol() - - with open(args.wavscp, 'r') as fin: - wavs = fin.readlines() - - # 2. 
Forward model (get streaming_timestamps) - timestamps = {} - for idx, wav in enumerate(wavs): - if idx % 100 == 0: - logging.info("processed {}.".format(idx)) - key, wav = wav.strip().split(' ', 1) - waveform, sr = torchaudio.load(wav) - resample_rate = conf['dataset_conf']['resample_conf']['resample_rate'] - waveform = torchaudio.transforms.Resample( - orig_freq=sr, new_freq=resample_rate)(waveform) - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank( - waveform, - num_mel_bins=conf['dataset_conf']['fbank_conf']['num_mel_bins'], - frame_length=conf['dataset_conf']['fbank_conf']['frame_length'], - frame_shift=conf['dataset_conf']['fbank_conf']['frame_shift'], - dither=0.0, energy_floor=0.0, - sample_frequency=resample_rate, - ) - - speech = mat.unsqueeze(0).to(device) - speech_lengths = torch.tensor([mat.size(0)]).to(device) - - # Let's assume batch_size = 1 - encoder_out, encoder_mask = model.encoder( - speech, speech_lengths, args.chunk_size, args.left_chunks) - - maxlen = encoder_out.size(1) # (B, maxlen, encoder_dim) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # CTC greedy search - if args.model_type == 'ctc': - ctc_probs = model.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - topk_prob = topk_prob.view(1, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen) - topk_prob = topk_prob.masked_fill_(mask, 0.0) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - hyps = [replace_duplicates_with_blank(hyp) for hyp in hyps] - scores = [prob.tolist() for prob in topk_prob] - timestamps[key] = [hyps[0], scores[0], wav] - - if args.model_type == 'transducer': - hyps = [] - scores = [] - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = 1 - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, - padding, cache) - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - scores.append(torch.max(joint_out_probs).item()) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or \ - per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - hyps.append(model.blank) - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - timestamps[key] = [hyps, scores, wav] - - # 3. 
Analyze latency - with open(args.alignment, 'r') as fin: - aligns = fin.readlines() - not_found, len_unequal, ignored = 0, 0, 0 - datas = [] - for align in aligns: - key, align = align.strip().split(' ', 1) - if key not in timestamps: - not_found += 1 - continue - fa, st = [], [] # force_alignment, streaming_timestamps - text_fa, text_st = "", "" - for i, token in enumerate(align.split()): - if token != '': - text_fa += token - # NOTE(xcsong): W/O subsample - fa.append(i * 10) - # ignore alignment_errors >= 70ms - frames_fa = len(align.split()) - frames_st = len(timestamps[key][0]) * subsampling - if abs(frames_st - frames_fa) >= 7: - ignored += 1 - continue - for i, token_id in enumerate(timestamps[key][0]): - if token_id != 0: - text_st += char_dict[token_id] - # NOTE(xcsong): W subsample - st.append(i * subsampling * 10) - if len(fa) != len(st): - len_unequal += 1 - continue - # datas[i] = [key, text_fa, text_st, list_of_diff, - # FirstTokenDelay, LastTokenDelay, AvgTokenDelay, - # streaming_timestamps, force_alignment] - datas.append([key, text_fa, text_st, - [a - b for a, b in zip(st, fa)], - st[0] - fa[0], st[-1] - fa[-1], - (sum(st) - sum(fa)) / len(st), - timestamps[key], align.split()]) - - logging.info("not found: {}, length unequal: {}, ignored: {}, \ - valid samples: {}".format(not_found, len_unequal, ignored, len(datas))) - - # 4. Plot and print - num_datas = len(datas) - names = ['FirstTokenDelay', 'LastTokenDelay', 'AvgTokenDelay'] - names_index = [4, 5, 6] - parts = ['max', 'P90', 'P75', 'P50', 'P25', 'min'] - parts_index = [num_datas - 1, int(num_datas * 0.90), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - for name, name_idx in zip(names, names_index): - def f(name_idx=name_idx): - return name_idx - datas.sort(key=lambda x: x[f()]) - logging.info("==========================") - for p, i in zip(parts, parts_index): - data = datas[i] - # i.e., LastTokenDelay P90: 270.000 ms (wav_id: BAC009S0902W0144) - logging.info("{} {}: {:.3f} ms (wav_id: {})".format( - name, p, data[f()], datas[i][0])) - - font = fm.FontProperties(fname=args.font) - plt.rcParams['axes.unicode_minus'] = False - # we will have 2 sub-plots (force-align + streaming timestamps) - # plus one wav-plot - fig, axes = plt.subplots(figsize=(60, 60), nrows=3, ncols=1) - for j in range(2): - if j == 0: - # subplot-0: streaming_timestamps - plt_prefix = args.tag + "_" + name + "_" + p - x = np.arange(len(data[7][0])) * subsampling - hyps, scores = data[7][0], data[7][1] - else: - # subplot-1: force_alignments - plt_prefix = "force_alignment" - x = np.arange(len(data[8])) - hyps = [symbol_table[d] for d in data[8]] - scores = [0.0] * len(data[8]) - axes[j].set_title(plt_prefix, fontsize=30) - for frame, token, prob in zip(x, hyps, scores): - if char_dict[token] != '': - axes[j].bar( - frame, np.exp(prob), - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].text( - frame, np.exp(prob), - '{} {:.3f} {}'.format( - char_dict[token], np.exp(prob), frame), - fontdict=dict(fontsize=24), - fontproperties=font, - ) - else: - axes[j].bar( - frame, 0.01, - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].tick_params(labelsize=25) - - # subplot-2: wav - # wav, hardcode sample_rate to 16000 - samples, sr = librosa.load(data[7][2], sr=16000) - time = np.arange(0, len(samples)) * (1.0 / sr) - axes[-1].plot(time, samples) - - # i.e., RESULT_DIR/LTD_P90_120ms_BAC009S0768W0342.pdf - plt.savefig(args.result_dir + "/" + name + "_" + - p + "_" + str(data[f()]) 
+ "ms" + "_" + data[0] + ".pdf") - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/make_raw_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/make_raw_list.py deleted file mode 100644 index 2f84f015542bb38da027b8ea61e8638f873cec33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/make_raw_list.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('output_file', help='output list file') - args = parser.parse_args() - - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - if args.segments is not None: - segments_table = {} - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - with open(args.text_file, 'r', encoding='utf8') as fin, \ - open(args.output_file, 'w', encoding='utf8') as fout: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if args.segments is None: - assert key in wav_table - wav = wav_table[key] - line = dict(key=key, wav=wav, txt=txt) - else: - assert key in segments_table - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - line = dict(key=key, wav=wav, txt=txt, start=start, end=end) - json_line = json.dumps(line, ensure_ascii=False) - fout.write(json_line + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/make_shard_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/make_shard_list.py deleted file mode 100644 index 1f7d82829808c9cc181bbc5e0f60cccef8795bae..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/make_shard_list.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import io -import logging -import os -import tarfile -import time -import multiprocessing - -import torch -import torchaudio -import torchaudio.backend.sox_io_backend as sox - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def write_tar_file(data_list, - no_segments, - tar_file, - resample=16000, - index=0, - total=1): - logging.info('Processing {} {}/{}'.format(tar_file, index, total)) - read_time = 0.0 - save_time = 0.0 - write_time = 0.0 - with tarfile.open(tar_file, "w") as tar: - prev_wav = None - for item in data_list: - if no_segments: - key, txt, wav = item - else: - key, txt, wav, start, end = item - - suffix = wav.split('.')[-1] - assert suffix in AUDIO_FORMAT_SETS - if no_segments: - ts = time.time() - with open(wav, 'rb') as fin: - data = fin.read() - read_time += (time.time() - ts) - else: - if wav != prev_wav: - ts = time.time() - waveforms, sample_rate = sox.load(wav, normalize=False) - read_time += (time.time() - ts) - prev_wav = wav - start = int(start * sample_rate) - end = int(end * sample_rate) - audio = waveforms[:1, start:end] - - # resample - if sample_rate != resample: - if not audio.is_floating_point(): - # normalize the audio before resample - # because resample can't process int audio - audio = audio / (1 << 15) - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - audio = (audio * (1 << 15)).short() - else: - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - - ts = time.time() - f = io.BytesIO() - sox.save(f, audio, resample, format="wav", bits_per_sample=16) - # Save to wav for segments file - suffix = "wav" - f.seek(0) - data = f.read() - save_time += (time.time() - ts) - - assert isinstance(txt, str) - ts = time.time() - txt_file = key + '.txt' - txt = txt.encode('utf8') - txt_data = io.BytesIO(txt) - txt_info = tarfile.TarInfo(txt_file) - txt_info.size = len(txt) - tar.addfile(txt_info, txt_data) - - wav_file = key + '.' 
+ suffix - wav_data = io.BytesIO(data) - wav_info = tarfile.TarInfo(wav_file) - wav_info.size = len(data) - tar.addfile(wav_info, wav_data) - write_time += (time.time() - ts) - logging.info('read {} save {} write {}'.format(read_time, save_time, - write_time)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--num_utts_per_shard', - type=int, - default=1000, - help='num utts per shard') - parser.add_argument('--num_threads', - type=int, - default=1, - help='num threads for make shards') - parser.add_argument('--prefix', - default='shards', - help='prefix of shards tar file') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('--resample', - type=int, - default=16000, - help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('shards_dir', help='output shards dir') - parser.add_argument('shards_list', help='output shards list file') - args = parser.parse_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - torch.set_num_threads(1) - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - no_segments = True - segments_table = {} - if args.segments is not None: - no_segments = False - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - data = [] - with open(args.text_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if no_segments: - assert key in wav_table - wav = wav_table[key] - data.append((key, txt, wav)) - else: - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - data.append((key, txt, wav, start, end)) - - num = args.num_utts_per_shard - chunks = [data[i:i + num] for i in range(0, len(data), num)] - os.makedirs(args.shards_dir, exist_ok=True) - - # Using thread pool to speedup - pool = multiprocessing.Pool(processes=args.num_threads) - shards_list = [] - tasks_list = [] - num_chunks = len(chunks) - for i, chunk in enumerate(chunks): - tar_file = os.path.join(args.shards_dir, - '{}_{:09d}.tar'.format(args.prefix, i)) - shards_list.append(tar_file) - pool.apply_async( - write_tar_file, - (chunk, no_segments, tar_file, args.resample, i, num_chunks)) - - pool.close() - pool.join() - - with open(args.shards_list, 'w', encoding='utf8') as fout: - for name in shards_list: - fout.write(name + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/merge_scp2txt.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/merge_scp2txt.py deleted file mode 100644 index 51f1c42f272f0fd9fec0a7d69ee860d2f1eb6158..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/merge_scp2txt.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -from distutils.util import strtobool -from io import open -import logging -import sys - -PY2 = sys.version_info[0] == 2 -sys.stdin = codecs.getreader('utf-8')(sys.stdin if 
PY2 else sys.stdin.buffer) -sys.stdout = codecs.getwriter('utf-8')( - sys.stdout if PY2 else sys.stdout.buffer) - - -# Special types: -def shape(x): - """Change str to List[int] - - >>> shape('3,5') - [3, 5] - >>> shape(' [3, 5] ') - [3, 5] - - """ - - # x: ' [3, 5] ' -> '3, 5' - x = x.strip() - if x[0] == '[': - x = x[1:] - if x[-1] == ']': - x = x[:-1] - - return list(map(int, x.split(','))) - - -def get_parser(): - parser = argparse.ArgumentParser( - description='Given each file paths with such format as ' - '::. type> can be omitted and the default ' - 'is "str". e.g. {} ' - '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' - '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' - '--output-scps text:data/text shape:data/utt2text_shape:shape ' - '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--input-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the inputs') - parser.add_argument('--output-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the outputs') - parser.add_argument('--scps', - type=str, - nargs='+', - default=[], - help='The files except for the input and outputs') - parser.add_argument('--verbose', - '-V', - default=1, - type=int, - help='Verbose option') - parser.add_argument('--allow-one-column', - type=strtobool, - default=False, - help='Allow one column in input scp files. ' - 'In this case, the value will be empty string.') - parser.add_argument('--out', - '-O', - type=str, - help='The output filename. ' - 'If omitted, then output to sys.stdout') - return parser - - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - args.scps = [args.scps] - - # logging info - logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - if args.verbose > 0: - logging.basicConfig(level=logging.INFO, format=logfmt) - else: - logging.basicConfig(level=logging.WARN, format=logfmt) - - inputs = {} - assert (len(args.input_scps) == 1) - for f in args.input_scps[0]: - arr = f.strip().split(':') - inputs[arr[0]] = arr[1] - assert ('feat' in inputs) - assert ('shape' in inputs) - - outputs = {} - assert (len(args.output_scps) == 1) - for f in args.output_scps[0]: - arr = f.strip().split(':') - outputs[arr[0]] = arr[1] - assert ('shape' in outputs) - assert ('text' in outputs) - assert ('token' in outputs) - assert ('tokenid' in outputs) - - files = [ - inputs['feat'], inputs['shape'], outputs['text'], outputs['token'], - outputs['tokenid'], outputs['shape'] - ] - fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape'] - fids = [open(f, 'r', encoding='utf-8') for f in files] - - if args.out is None: - out = sys.stdout - else: - out = open(args.out, 'w', encoding='utf-8') - done = False - while not done: - for i, fid in enumerate(fids): - line = fid.readline() - if line == '': - done = True - break - arr = line.strip().split() - content = ' '.join(arr[1:]) - if i == 0: - out.write('utt:{}'.format(arr[0])) - out.write('\t') - out.write('{}:{}'.format(fields[i], content)) - out.write('\n') - - for f in fids: - f.close() - if args.out is not None: - out.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/onnx2horizonbin.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/onnx2horizonbin.py deleted file mode 100644 index 
a94b647fb19d1446d4bc506c399c85677dddde9f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/onnx2horizonbin.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. specific decoding method: ctc_greedy_search -""" - -import argparse -import copy -import logging -import os -import sys -import random -import torch -import yaml -import numpy as np - -from torch.utils.data import DataLoader - -from wenet.utils.common import remove_duplicates_and_blank -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import to_numpy -from wenet.bin.export_onnx_bpu import export_encoder, export_ctc - - -try: - import hbdk # noqa: F401 - import horizon_nn # noqa: F401 - from horizon_tc_ui import HB_ONNXRuntime -except ImportError: - print('Please install hbdk,horizon_nn,horizon_tc_ui !') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def save_data(tensor, dirs, prefix): - if tensor.requires_grad: - data = tensor.detach().numpy().astype(np.float32) - else: - data = tensor.numpy().astype(np.float32) - os.makedirs(dirs, exist_ok=True) - data.tofile(dirs + "/" + prefix + ".bin") - - -def make_calibration_data(enc, args, conf): - conf['shuffle'] = True - logger.info(conf) - dataset = Dataset( - "shard", args.cali_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - cal_data_dir = os.path.join(args.output_dir, 
'cal_data_dir') - for batch_idx, batch in enumerate(dataloader): - if batch_idx >= args.max_samples: - break - if batch_idx % 100 == 0: - logger.info("processed {} samples.".format(batch_idx)) - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - - # Feed forward overlap input step by step - random_high = (num_frames - context) // stride - num_rand = random.randint(0, random_high) - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - if i == num_rand: - save_data(chunk, "{}/chunk".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_cache, "{}/att_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(cnn_cache, "{}/cnn_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_mask, "{}/att_mask".format(cal_data_dir), - prefix + "." + str(i)) - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - # NOTE(xcsong): It's fast to calibrate ctc.onnx, - # so it's okay to save all chunks - save_data(y, "{}/hidden".format(cal_data_dir), - prefix + "." 
+ str(i)) - - -def check_wer(enc, ctc, args, conf): - conf['shuffle'] = False - dataset = Dataset( - "shard", args.wer_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - char_dict = {v: k for k, v in args.symbol_table.items()} - eos = len(char_dict) - 1 - - enc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_encoder/encoder_quantized_model.onnx") - ctc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_ctc/ctc_quantized_model.onnx") - torch_file = open(args.output_dir + "/torch_text", 'w', encoding="utf-8") - onnx_file = open(args.output_dir + "/onnx_text", 'w', encoding="utf-8") - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - for batch_idx, batch in enumerate(dataloader): - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - - # Feed forward overlap input step by step - torch_out, onnx_out = [], [] - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - # Torch model - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - torch_out.append(ctc.forward(y).transpose(1, 3).squeeze(2)) - # Quantized onnx model - ort_inputs = { - 'chunk': to_numpy(chunk), 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': to_numpy(att_mask)} - ort_outs = enc_session.run_feature( - enc_session.output_names, ort_inputs, input_offset=0) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_y = ctc_session.run_feature( - ctc_session.output_names, {'hidden': ort_outs[0]}, input_offset=0) - onnx_out.append(torch.from_numpy( - np.squeeze(onnx_y[0].transpose(0, 3, 2, 1), axis=2))) - - def post_process(list_out, file_obj, keys): - probs = torch.cat(list_out, dim=1) - maxlen = probs.size(1) - topk_prob, topk_index = probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - hyps = 
[hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - for i, key in enumerate(keys): - content = '' - for w in hyps[i]: - if w == eos: - break - content += char_dict[w] - file_obj.write('{} {}\n'.format(key, content)) - return key, content - - if len(torch_out) > 0 and len(onnx_out) > 0: - key, content = post_process(torch_out, torch_file, keys) - logger.info('torch: {} {}'.format(key, content)) - key, content = post_process(onnx_out, onnx_file, keys) - logger.info('onnx : {} {}'.format(key, content)) - torch_file.close() - onnx_file.close() - - -def generate_config(enc_session, ctc_session, args): - template = """ -# 模型参数组 -model_parameters: - # 原始Onnx浮点模型文件 - onnx_model: '{}' - # 转换的目标AI芯片架构 - march: 'bernoulli2' - # 模型转换输出的用于上板执行的模型文件的名称前缀 - output_model_file_prefix: '{}' - # 模型转换输出的结果的存放目录 - working_dir: '{}' - # 指定转换后混合异构模型是否保留输出各层的中间结果的能力 - layer_out_dump: False - # 转换过程中日志生成级别 - log_level: 'debug' -# 输入信息参数组 -input_parameters: - # 原始浮点模型的输入节点名称 - input_name: '{}' - # 原始浮点模型的输入数据格式(数量/顺序与input_name一致) - input_type_train: '{}' - # 原始浮点模型的输入数据排布(数量/顺序与input_name一致) - input_layout_train: '{}' - # 原始浮点模型的输入数据尺寸 - input_shape: '{}' - # 网络实际执行时,输入给网络的batch_size 默认值为1 - # input_batch: 1 - # 在模型中添加的输入数据预处理方法 - norm_type: '{}' - # 预处理方法的图像减去的均值; 如果是通道均值,value之间必须用空格分隔 - # mean_value: '' - # 预处理方法的图像缩放比例,如果是通道缩放比例,value之间必须用空格分隔 - # scale_value: '' - # 转换后混合异构模型需要适配的输入数据格式(数量/顺序与input_name一致) - input_type_rt: '{}' - # 输入数据格式的特殊制式 - input_space_and_range: '' - # 转换后混合异构模型需要适配的输入数据排布(数量/顺序与input_name一致) - input_layout_rt: '{}' -# 校准参数组 -calibration_parameters: - # 模型校准使用的标定样本的存放目录 - cal_data_dir: '{}' - # 开启图片校准样本自动处理(skimage read resize到输入节点尺寸) - preprocess_on: False - # 校准使用的算法类型 - calibration_type: '{}' - # max 校准方式的参数 - max_percentile: 1.0 - # 强制指定OP在CPU上运行 - run_on_cpu: '{}' - # 强制指定OP在BPU上运行 - run_on_bpu: '{}' -# 编译参数组 -compiler_parameters: - # 编译策略选择 - compile_mode: 'latency' - # 是否打开编译的debug信息 - debug: False - # 模型运行核心数 - core_num: 1 - # 模型编译的优化等级选择 - optimize_level: 'O3' -""" - output_dir = os.path.realpath(args.output_dir) - cal_data_dir = os.path.join(output_dir, 'cal_data_dir') - os.makedirs(cal_data_dir, exist_ok=True) - enc_dic = enc_session.get_modelmeta().custom_metadata_map - enc_onnx_path = os.path.join(output_dir, 'encoder.onnx') - enc_log_path = os.path.join(output_dir, 'hb_makertbin_output_encoder') - enc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in enc_dic['input_name'].split(';')]) - ctc_dic = ctc_session.get_modelmeta().custom_metadata_map - ctc_onnx_path = os.path.join(output_dir, 'ctc.onnx') - ctc_log_path = os.path.join(output_dir, 'hb_makertbin_output_ctc') - ctc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in ctc_dic['input_name'].split(';')]) - enc_config = template.format( - enc_onnx_path, "encoder", enc_log_path, - enc_dic['input_name'], enc_dic['input_type'], - enc_dic['input_layout_train'], enc_dic['input_shape'], - enc_dic['norm_type'], enc_dic['input_type'], enc_dic['input_layout_rt'], - enc_cal_data, args.calibration_type, args.extra_ops_run_on_cpu, "") - ctc_config = template.format( - ctc_onnx_path, "ctc", ctc_log_path, - ctc_dic['input_name'], ctc_dic['input_type'], - ctc_dic['input_layout_train'], ctc_dic['input_shape'], - ctc_dic['norm_type'], ctc_dic['input_type'], ctc_dic['input_layout_rt'], - ctc_cal_data, "default", "", "") - with open(output_dir + "/config_encoder.yaml", "w") as enc_yaml: - enc_yaml.write(enc_config) - with open(output_dir + 
"/config_ctc.yaml", "w") as ctc_yaml: - ctc_yaml.write(ctc_config) - - -def get_args(): - parser = argparse.ArgumentParser(description='convert onnx to horizon .bin') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - parser.add_argument('--dict', type=str, required=True, help='dict file') - parser.add_argument('--max_samples', type=int, required=True, - help='maximum samples') - parser.add_argument('--cali_datalist', type=str, default=None, - help='make calibration data') - parser.add_argument('--wer_datalist', type=str, default=None, - help='check wer') - parser.add_argument('--wer_text', type=str, default=None, - help='check wer') - parser.add_argument('--bpe_model', default=None, type=str, - help='bpe model for english part') - parser.add_argument('--ln_run_on_bpu', action='store_true', - help='layernorm running on bpu') - parser.add_argument('--extra_ops_run_on_cpu', type=str, default=None, - help='extra operations running on cpu.') - parser.add_argument('--calibration_type', type=str, default='default', - help='kl / max / default.') - return parser - - -if __name__ == '__main__': - random.seed(777) - parser = get_args() - args = parser.parse_args() - # NOTE(xcsong): X3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(conf) - load_checkpoint(model, args.checkpoint) - model.eval() - - symbol_table = read_symbol_table(args.dict) - args.symbol_table = symbol_table - args.feature_size = conf['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - logger.info("Stage-1: Export onnx") - enc, enc_session = export_encoder(model, args) - ctc, ctc_session = export_ctc(model, args) - - conf = copy.deepcopy(conf['dataset_conf']) - conf['filter_conf']['max_length'] = 102400 - conf['filter_conf']['min_length'] = 0 - conf['filter_conf']['token_max_length'] = 102400 - conf['filter_conf']['token_min_length'] = 0 - conf['filter_conf']['max_output_input_ratio'] = 102400 - conf['filter_conf']['min_output_input_ratio'] = 0 - conf['speed_perturb'] = False - conf['spec_aug'] = False - conf['spec_sub'] = False - conf['spec_trim'] = False - conf['shuffle'] = False - conf['sort'] = False - if 'fbank_conf' in conf: - conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in conf: - conf['mfcc_conf']['dither'] = 0.0 - conf['batch_conf']['batch_type'] = "static" - conf['batch_conf']['batch_size'] = 1 - - if args.cali_datalist is not None: - logger.info("Stage-2: Generate config") - # FIXME(xcsong): Remove hard code - logger.info("torch version: {}".format(torch.__version__)) - if int(torch.__version__[:4].split('.')[1]) >= 13: - args.extra_ops_run_on_cpu = "/Split;" + \ - "/encoders.0/self_attn/Split;/encoders.1/self_attn/Split;" + \ - 
"/encoders.2/self_attn/Split;/encoders.3/self_attn/Split;" + \ - "/encoders.4/self_attn/Split;/encoders.5/self_attn/Split;" + \ - "/encoders.6/self_attn/Split;/encoders.7/self_attn/Split;" + \ - "/encoders.8/self_attn/Split;/encoders.9/self_attn/Split;" + \ - "/encoders.10/self_attn/Split;/encoders.11/self_attn/Split;" + \ - "/encoders.0/self_attn/Mul;/encoders.1/self_attn/Mul;" + \ - "/encoders.2/self_attn/Mul;/encoders.3/self_attn/Mul;" + \ - "/encoders.4/self_attn/Mul;/encoders.5/self_attn/Mul;" + \ - "/encoders.6/self_attn/Mul;/encoders.7/self_attn/Mul;" + \ - "/encoders.8/self_attn/Mul;/encoders.9/self_attn/Mul;" + \ - "/encoders.10/self_attn/Mul;/encoders.11/self_attn/Mul;" - else: - args.extra_ops_run_on_cpu = "Split_17;Split_67;Split_209;" + \ - "Split_351;Split_493;Split_635;Split_777;Split_919;Split_1061;" + \ - "Split_1203;Split_1345;Split_1487;Split_1629;" + \ - "Mul_72;Mul_214;Mul_356;Mul_498;Mul_640;Mul_782;" + \ - "Mul_924;Mul_1066;Mul_1208;Mul_1350;Mul_1492;Mul_1634;" - generate_config(enc_session, ctc_session, args) - - logger.info("Stage-3: Make calibration data") - make_calibration_data(enc, args, conf) - - output_dir = os.path.realpath(args.output_dir) - logger.info("Stage-4: Make ctc.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_ctc".format(output_dir) + - " && cd hb_makertbin_log_ctc &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_ctc.yaml") - ) - logger.info("Stage-5: Make encoder.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_encoder ".format(output_dir) + - " && cd hb_makertbin_log_encoder &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_encoder.yaml") - ) - - if args.wer_datalist is not None: - logger.info("Stage-6: Check wer between torch model and quantized onnx") - assert args.wer_text is not None - check_wer(enc, ctc, args, conf) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/torch_text", - args.output_dir + "/torch_wer") - ) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/onnx_text", - args.output_dir + "/onnx_wer") - ) - os.system("tail {} {}".format( - args.output_dir + "/torch_wer", args.output_dir + "/onnx_wer")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/parse_options.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/parse_options.sh deleted file mode 100644 index 34476fdb37a4b14d5fe6e0edbebe97e760d2be5a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- - -# Parse command-line options. -# To be sourced by another script (as in ". parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/perturb_data_dir_speed.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/perturb_data_dir_speed.sh deleted file mode 100644 index 901a4882e6481ae269067b0fe7175dba62c4db9e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/perturb_data_dir_speed.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# 2020 @kamo-naoyuki -# This file was copied from Kaldi and -# I deleted parts related to wav duration -# because we shouldn't use kaldi's command here -# and we don't need the files actually. 
- -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 2014 Tom Ko -# 2018 Emotech LTD (author: Pawel Swietojanski) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# wav.scp -# spk2utt -# utt2spk -# text -# -# It generates the files which are used for perturbing the speed of the original data. - -export LC_ALL=C -set -euo pipefail - -if [[ $# != 3 ]]; then - echo "Usage: perturb_data_dir_speed.sh " - echo "e.g.:" - echo " $0 0.9 data/train_si284 data/train_si284p" - exit 1 -fi - -factor=$1 -srcdir=$2 -destdir=$3 -label="sp" -spk_prefix="${label}${factor}-" -utt_prefix="${label}${factor}-" - -#check is sox on the path - -! command -v sox &>/dev/null && echo "sox: command not found" && exit 1; - -if [[ ! -f ${srcdir}/utt2spk ]]; then - echo "$0: no such file ${srcdir}/utt2spk" - exit 1; -fi - -if [[ ${destdir} == "${srcdir}" ]]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -mkdir -p "${destdir}" - -<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map" -<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map" -<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map" -if [[ ! -f ${srcdir}/utt2uniq ]]; then - <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq" -else - <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq" -fi - - -<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \ - utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk - -utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt - -if [[ -f ${srcdir}/segments ]]; then - - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \ - utils/apply_map.pl -f 2 "${destdir}"/reco_map | \ - awk -v factor="${factor}" \ - '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \ - >"${destdir}"/segments - - utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - if [[ -f ${srcdir}/reco2file_and_channel ]]; then - utils/apply_map.pl -f 1 "${destdir}"/reco_map \ - <"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel - fi - -else # no segments->wav indexed by utterance. 
- if [[ -f ${srcdir}/wav.scp ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - fi -fi - -if [[ -f ${srcdir}/text ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text -fi -if [[ -f ${srcdir}/spk2gender ]]; then - utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender -fi -if [[ -f ${srcdir}/utt2lang ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang -fi - -rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null -echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}" - -utils/validate_data_dir.sh --no-feats --no-text "${destdir}" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/reduce_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/reduce_data_dir.sh deleted file mode 100644 index 16194dcc7309a646041181a698c53cd4f46e618b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/reduce_data_dir.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# koried, 10/29/2012 - -# Reduce a data set based on a list of turn-ids - -help_message="usage: $0 srcdir turnlist destdir" - -if [ $1 == "--help" ]; then - echo "${help_message}" - exit 0; -fi - -if [ $# != 3 ]; then - echo "${help_message}" - exit 1; -fi - -srcdir=$1 -reclist=$2 -destdir=$3 - -if [ ! -f ${srcdir}/utt2spk ]; then -echo "$0: no such file $srcdir/utt2spk" -exit 1; -fi - -function do_filtering { -# assumes the utt2spk and spk2utt files already exist. - [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp - [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text - [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames - [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender - [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp - if [ -f ${srcdir}/segments ]; then - utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments - awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. 
- [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/reco2file_and_channel ] && \ - utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel - - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm - rm ${destdir}/reco - fi - srcutts=$(wc -l < ${srcdir}/utt2spk) - destutts=$(wc -l < ${destdir}/utt2spk) - echo "Reduced #utt from $srcutts to $destutts" -} - -mkdir -p ${destdir} - -# filter the utt2spk based on the set of recordings -utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk - -utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt -do_filtering; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/remove_longshortdata.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/remove_longshortdata.py deleted file mode 100644 index 7e92f8a424d2d717acf6fc1db5503f79ba38a898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/remove_longshortdata.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='remove too long or too short data in format.data') - parser.add_argument('--data_file', - type=str, - help='input format data') - parser.add_argument('--output_data_file', - type=str, - help='output format data') - parser.add_argument('--min_input_len', type=float, - default=0, - help='minimum input seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--max_input_len', type=float, - default=20, - help='maximum output seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--min_output_len', type=float, - default=0, help='minimum input seq length, in modeling units') - parser.add_argument('--max_output_len', type=float, - default=500, - help='maximum output seq length, in modeling units') - parser.add_argument('--min_output_input_ratio', type=float, default=0.05, - help='minimum output seq length/output seq length ratio') - parser.add_argument('--max_output_input_ratio', type=float, default=10, - help='maximum output seq length/output seq length ratio') - args = parser.parse_args() - - data_file = args.data_file - output_data_file = args.output_data_file - min_input_len = args.min_input_len - max_input_len = args.max_input_len - min_output_len = args.min_output_len - max_output_len = args.max_output_len - min_output_input_ratio = args.min_output_input_ratio - max_output_input_ratio = args.max_output_input_ratio - - with open(data_file, 'r') as f, open(output_data_file, 'w') as fout: - for l in f: - l = l.strip() - if l: - items = l.strip().split('\t') - token_shape = items[6] - feature_shape = items[2] - feat_len = float(feature_shape.split(':')[1].split(',')[0]) - token_len = float(token_shape.split(':')[1].split(',')[0]) - condition = [feat_len > min_input_len, - feat_len < max_input_len, - token_len > min_output_len, - token_len < max_output_len, - token_len / feat_len > min_output_input_ratio, - token_len / feat_len < max_output_input_ratio, - ] - if all(condition): - fout.write('{}\n'.format(l)) - continue diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/segment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/segment.py deleted file mode 100644 index a1a7f93a05fbaf42ca09c26c0e5be6a7185f0d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/segment.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2021 Mobvoi Inc. (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='generate segmented wav.scp') - parser.add_argument('--segments', required=True, help='segments file') - parser.add_argument('--input', - required=True, - help='origin wav.scp that not segmented') - parser.add_argument('--output', - required=True, - help='output segmented wav.scp') - wav_dic = {} - args = parser.parse_args() - ori_wav = args.input - segment_file = args.segments - wav_scp = args.output - with open(ori_wav, 'r') as ori: - for l in ori: - item = l.strip().split() - wav_dic[item[0]] = item[1] - with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement: - for l in sgement: - item = l.strip().split() - if item[1] in wav_dic: - item[1] = wav_dic[item[1]] - f.write("{} {},{},{}\n".format(item[0], item[1], item[2], item[3])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/setup_anaconda.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/setup_anaconda.sh deleted file mode 100644 index f53ace9cc4c19994fc79d01e85d70f49d40d673f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/setup_anaconda.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env bash -# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet) -set -euo pipefail - -if [ -z "${PS1:-}" ]; then - PS1=__dummy__ -fi -CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - -if [ $# -gt 4 ]; then - echo "Usage: $0 [output] [conda-env-name] [python-version>]" - exit 1; -elif [ $# -eq 3 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="$3" -elif [ $# -eq 2 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="" -elif [ $# -eq 1 ]; then - output_dir="$1" - name="" - PYTHON_VERSION="" -elif [ $# -eq 0 ]; then - output_dir=venv - name="" - PYTHON_VERSION="" -fi - -if [ -e activate_python.sh ]; then - echo "Warning: activate_python.sh already exists. It will be overwritten" -fi - -if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then - if [ ! -e miniconda.sh ]; then - wget --tries=3 "${CONDA_URL}" -O miniconda.sh - fi - - bash miniconda.sh -b -p "${output_dir}" -fi - -# shellcheck disable=SC1090 -source "${output_dir}/etc/profile.d/conda.sh" -conda deactivate - -# If the env already exists, skip recreation -if [ -n "${name}" ] && ! 
conda activate ${name}; then - conda create -yn "${name}" -fi -conda activate ${name} - -if [ -n "${PYTHON_VERSION}" ]; then - conda install -y conda "python=${PYTHON_VERSION}" -else - conda install -y conda -fi - -conda install -y pip setuptools - -cat << EOF > activate_python.sh -#!/usr/bin/env bash -# THIS FILE IS GENERATED BY tools/setup_anaconda.sh -if [ -z "\${PS1:-}" ]; then - PS1=__dummy__ -fi -. $(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name} -EOF diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/sph2wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/sph2wav.sh deleted file mode 100644 index a8f0749e3be2ee69b5831da6699c303510ecbed4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/sph2wav.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -# convert sph scp to segmented wav scp -nj=1 -. tools/parse_options.sh || exit 1; - -inscp=$1 -segments=$2 -outscp=$3 -data=$(dirname ${inscp}) -if [ $# -eq 4 ]; then - logdir=$4 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe -[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; -sox=`which sox` -[ ! 
-x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2); - printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \ - sort > $data/wav_ori.scp || exit 1; - -tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp -sed -i 's/ /,/g' $data/wav_segments.scp -sed -i 's/#/ /g' $data/wav_segments.scp - -rm -f $logdir/wav_*.slice -rm -f $logdir/*.log -split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - mkdir -p ${data}/wavs/${name} - cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \ - -v logdir=$logdir -v name=$name '{ - during=$4-$3 - cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during; - system(cmd) - printf("%s %s/%s.wav\n", $1, data, $1); - }' | \ - sort > ${data}/wavs_${name}.scp || exit 1; -} & -done -wait -cat ${data}/wavs_*.scp > $outscp -rm ${data}/wavs_*.scp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/spk2utt_to_utt2spk.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/spk2utt_to_utt2spk.pl deleted file mode 100644 index 19fb89d501146e360912863d847d6eabb0194511..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/spm_decode b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/spm_decode deleted file mode 100644 index 882b4f966013d7708460f8d41696583ae59f8fa9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/spm_decode +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for decoding") - parser.add_argument("--input", default=None, help="input file to decode") - parser.add_argument("--input_format", choices=["piece", "id"], default="piece") - args = parser.parse_args() - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.input_format == "piece": - def decode(l): - return "".join(sp.DecodePieces(l)) - elif args.input_format == "id": - def decode(l): - return "".join(sp.DecodeIds(l)) - else: - raise NotImplementedError - - def tok2int(tok): - # remap reference-side (represented as <>) to 0 - return int(tok) if tok != "<>" else 0 - - if args.input is None: - h = sys.stdin - else: - h = open(args.input, "r", encoding="utf-8") - for line in h: - print(decode(line.split())) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/spm_encode b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/spm_encode deleted file mode 100644 index 4dd2e1004f9fe393c2d34b43bade881b84a31b1f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/spm_encode +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import contextlib -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for encoding") - parser.add_argument("--inputs", nargs="+", default=['-'], - help="input files to filter/encode") - parser.add_argument("--outputs", nargs="+", default=['-'], - help="path to save encoded outputs") - parser.add_argument("--output_format", choices=["piece", "id"], default="piece") - parser.add_argument("--min-len", type=int, metavar="N", - help="filter sentence pairs with fewer than N tokens") - parser.add_argument("--max-len", type=int, metavar="N", - help="filter sentence pairs with more than N tokens") - args = parser.parse_args() - - assert len(args.inputs) == len(args.outputs), \ - "number of input and output paths should match" - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.output_format == "piece": - def encode(l): - return sp.EncodeAsPieces(l) - elif args.output_format == "id": - def encode(l): - return list(map(str, sp.EncodeAsIds(l))) - else: - raise NotImplementedError - - if args.min_len is not None or args.max_len is not None: - def valid(line): - return ( - (args.min_len is None or len(line) >= args.min_len) and - (args.max_len is None or len(line) <= args.max_len) - ) - else: - def valid(lines): - return True - - with contextlib.ExitStack() as stack: - inputs = [ - stack.enter_context(open(input, "r", encoding="utf-8")) - if input != "-" else sys.stdin - for input in args.inputs - ] - outputs = [ - 
stack.enter_context(open(output, "w", encoding="utf-8")) - if output != "-" else sys.stdout - for output in args.outputs - ] - - stats = { - "num_empty": 0, - "num_filtered": 0, - } - - def encode_line(line): - line = line.strip() - if len(line) > 0: - line = encode(line) - if valid(line): - return line - else: - stats["num_filtered"] += 1 - else: - stats["num_empty"] += 1 - return None - - for i, lines in enumerate(zip(*inputs), start=1): - enc_lines = list(map(encode_line, lines)) - if not any(enc_line is None for enc_line in enc_lines): - for enc_line, output_h in zip(enc_lines, outputs): - print(" ".join(enc_line), file=output_h) - if i % 10000 == 0: - print("processed {} lines".format(i), file=sys.stderr) - - print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) - print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/spm_train b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/spm_train deleted file mode 100644 index 0b247aee0dc5fcaa7b6cf66d89602e896619c9bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/spm_train +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE -import sys - -import sentencepiece as spm - - -if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/subset_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/subset_data_dir.sh deleted file mode 100644 index c35bee62d8710facb8c42a9171ed3caf0171450f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/subset_data_dir.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2010-2011 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - - -# This script operates on a data directory, such as in data/train/. -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data -# for what these directories contain. - -# This script creates a subset of that data, consisting of some specified -# number of utterances. (The selected utterances are distributed evenly -# throughout the file, by the program ./subset_scp.pl). - -# There are six options, none compatible with any other. - -# If you give the --per-spk option, it will attempt to select the supplied -# number of utterances for each speaker (typically you would supply a much -# smaller number in this case). - -# If you give the --speakers option, it selects a subset of n randomly -# selected speakers. - -# If you give the --shortest option, it will give you the n shortest utterances. - -# If you give the --first option, it will just give you the n first utterances. - -# If you give the --last option, it will just give you the n last utterances. - -# If you give the --spk-list or --utt-list option, it reads the -# speakers/utterances to keep from /" (note, -# in this case there is no positional parameter; see usage message.) 
- - -shortest=false -perspk=false -speakers=false -first_opt= -spk_list= -utt_list= - -expect_args=3 -case $1 in - --first|--last) first_opt=$1; shift ;; - --per-spk) perspk=true; shift ;; - --shortest) shortest=true; shift ;; - --speakers) speakers=true; shift ;; - --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; - --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; - --*) echo "$0: invalid option '$1'"; exit 1 -esac - -if [ $# != $expect_args ]; then - echo "Usage:" - echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] " - echo " subset_data_dir.sh [--spk-list ] " - echo " subset_data_dir.sh [--utt-list ] " - echo "By default, randomly selects utterances from the data directory." - echo "With --speakers, randomly selects enough speakers that we have utterances" - echo "With --per-spk, selects utterances per speaker, if available." - echo "With --first, selects the first utterances" - echo "With --last, selects the last utterances" - echo "With --shortest, selects the shortest utterances." - echo "With --spk-list, reads the speakers to keep from " - echo "With --utt-list, reads the utterances to keep from " - exit 1; -fi - -srcdir=$1 -if [[ $spk_list || $utt_list ]]; then - numutt= - destdir=$2 -else - numutt=$2 - destdir=$3 -fi - -export LC_ALL=C - -if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" - exit 1 -fi - -if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then - echo "$0: cannot subset to more utterances than you originally had." - exit 1 -fi - -if $shortest && [ ! -f $srcdir/feats.scp ]; then - echo "$0: you selected --shortest but no feats.scp exist." - exit 1 -fi - -mkdir -p $destdir || exit 1 - -if [[ $spk_list ]]; then - tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; -elif [[ $utt_list ]]; then - tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; -elif $speakers; then - tools/shuffle_list.pl < $srcdir/spk2utt | - awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | - sort > $destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -elif $perspk; then - awk '{ n='$numutt'; printf("%s ",$1); - skip=1; while(n*(skip+1) <= NF-1) { skip++; } - for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); } - printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -else - if $shortest; then - # Select $numutt shortest utterances. - . ./path.sh - feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; - sort -n -k2 $destdir/tmp.len | - awk '{print $1}' | - head -$numutt >$destdir/tmp.uttlist - tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk - rm $destdir/tmp.uttlist $destdir/tmp.len - else - # Select $numutt random utterances. - tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; - fi - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt -fi - -# Perform filtering. utt2spk and spk2utt files already exist by this point. -# Filter by utterance. 
-[ -f $srcdir/feats.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp -[ -f $srcdir/vad.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp -[ -f $srcdir/utt2lang ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang -[ -f $srcdir/utt2dur ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur -[ -f $srcdir/utt2num_frames ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames -[ -f $srcdir/utt2uniq ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq -[ -f $srcdir/wav.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp -[ -f $srcdir/utt2warp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp -[ -f $srcdir/text ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - -# Filter by speaker. -[ -f $srcdir/spk2warp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp -[ -f $srcdir/spk2gender ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender -[ -f $srcdir/cmvn.scp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - -# Filter by recording-id. -if [ -f $srcdir/segments ]; then - tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - # Recording-ids are in segments. - awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco - # The next line overrides the command above for wav.scp, which would be incorrect. - #[ -f $srcdir/wav.scp ] && - # tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp -else - # No segments; recording-ids are in wav.scp. - awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco -fi - -[ -f $srcdir/reco2file_and_channel ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel -[ -f $srcdir/reco2dur ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur - -# Filter the STM file for proper sclite scoring. -# Copy over the comments from STM file. -[ -f $srcdir/stm ] && - (grep "^;;" $srcdir/stm - tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm - -rm $destdir/reco - -# Copy frame_shift if present. -[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir - -srcutts=$(wc -l <$srcdir/utt2spk) -destutts=$(wc -l <$destdir/utt2spk) -echo "$0: reducing #utt from $srcutts to $destutts" -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/subset_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/subset_scp.pl deleted file mode 100644 index 11fddc09a0f4e5fad8e5d63cf65e7e5e627e4af6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/subset_scp.pl +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This program selects a subset of N elements in the scp. - -# By default, it selects them evenly from throughout the scp, in order to avoid -# selecting too many from the same speaker. It prints them on the standard -# output. -# With the option --first, it just selects the N first utterances. -# With the option --last, it just selects the N last utterances. - -# Last modified by JHU & HKUST @2013 - - -$quiet = 0; -$first = 0; -$last = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--quiet") { - shift; - $quiet = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--first") { - shift; - $first = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--last") { - shift; - $last = 1; -} - -if(@ARGV < 2 ) { - die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . - " --quiet causes it to not die if N < num lines in scp.\n" . - " --first and --last make it equivalent to head or tail.\n" . - "See also: filter_scp.pl\n"; -} - -$N = shift @ARGV; -if($N == 0) { - die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; -} -$inscp = shift @ARGV; -open(I, "<$inscp") || die "Opening input scp file $inscp"; - -@F = (); -while() { - push @F, $_; -} -$numlines = @F; -if($N > $numlines) { - if ($quiet) { - $N = $numlines; - } else { - die "You requested from subset_scp.pl more elements than available: $N > $numlines"; - } -} - -sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if ($num_needed > $diff) { - die "select_n: code error"; - } - if ($diff == 1 ) { - if ($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); - } -} - -if ( ! $first && ! $last) { - if ($N > 0) { - select_n(0, $numlines, $N); - } -} else { - if ($first) { # --first option: same as head. - for ($n = 0; $n < $N; $n++) { - print $F[$n]; - } - } else { # --last option: same as tail. - for ($n = @F - $N; $n < @F; $n++) { - print $F[$n]; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/sym2int.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/sym2int.pl deleted file mode 100644 index cec097b6bdaefb5c3452e31fa334f0a7530b9a72..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/sym2int.pl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; - -for($x = 0; $x < 2; $x++) { - if ($ARGV[0] eq "--map-oov") { - shift @ARGV; - $map_oov = shift @ARGV; - if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { - # disallow '-f', the empty string and anything ending in words.txt as the - # OOV symbol because these are likely command-line errors. - die "the --map-oov option requires an argument"; - } - } - if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } - } -} - -$symtab = shift @ARGV; -if (!defined $symtab) { - print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . - "options: [--map-oov ] [-f ]\n" . - "note: can look like 4-5, or 4-, or 5-, or 1.\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up - if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } - $map_oov = $sym2int{$map_oov}; -} - -$num_warning = 0; -$max_warning = 20; - -while (<>) { - @A = split(" ", $_); - @B = (); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ( (!defined $field_begin || $n >= $field_begin) - && (!defined $field_end || $n <= $field_end)) { - $i = $sym2int{$a}; - if (!defined ($i)) { - if (defined $map_oov) { - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $map_oov; - } else { - $pos = $n+1; - die "sym2int.pl: undefined symbol $a (in position $pos)\n"; - } - } - $a = $i; - } - push @B, $a; - } - print join(" ", @B); - print "\n"; -} -if ($num_warning > 0) { - print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; -} - -exit(0); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/text2token.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/text2token.py deleted file mode 100644 index 4f4dcc901d436650695f0b80e0cf99e1e99269ee..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/text2token.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan) -# Copyright 2021 Mobvoi Inc. All Rights Reserved. 
(Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -import re -import sys - -is_python2 = sys.version_info[0] == 2 - - -def exist_or_not(i, match_pos): - start_pos = None - end_pos = None - for pos in match_pos: - if pos[0] <= i < pos[1]: - start_pos = pos[0] - end_pos = pos[1] - break - - return start_pos, end_pos - -def seg_char(sent): - pattern = re.compile(r'([\u4e00-\u9fa5])') - chars = pattern.split(sent) - chars = [w for w in chars if len(w.strip()) > 0] - return chars - -def get_parser(): - parser = argparse.ArgumentParser( - description='convert raw text to tokenized text', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--nchar', - '-n', - default=1, - type=int, - help='number of characters to split, i.e., \ - aabb -> a a b b with -n 1 and aa bb with -n 2') - parser.add_argument('--skip-ncols', - '-s', - default=0, - type=int, - help='skip first n columns') - parser.add_argument('--space', - default='', - type=str, - help='space symbol') - parser.add_argument('--bpe-model', - '-m', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--non-lang-syms', - '-l', - default=None, - type=str, - help='list of non-linguistic symobles,' - ' e.g., etc.') - parser.add_argument('text', - type=str, - default=False, - nargs='?', - help='input text') - parser.add_argument('--trans_type', - '-t', - type=str, - default="char", - choices=["char", "phn", "cn_char_en_bpe"], - help="""Transcript type. char/phn. e.g., for TIMIT - FADG0_SI1279 - - If trans_type is char, read from - SI1279.WRD file -> "bricks are an alternative" - Else if trans_type is phn, - read from SI1279.PHN file -> - "sil b r ih sil k s aa r er n aa l - sil t er n ih sil t ih v sil" """) - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - rs = [] - if args.non_lang_syms is not None: - with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f: - nls = [x.rstrip() for x in f.readlines()] - rs = [re.compile(re.escape(x)) for x in nls] - - if args.bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(args.bpe_model) - - if args.text: - f = codecs.open(args.text, encoding="utf-8") - else: - f = codecs.getreader("utf-8")( - sys.stdin if is_python2 else sys.stdin.buffer) - - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer) - line = f.readline() - n = args.nchar - while line: - x = line.split() - print(' '.join(x[:args.skip_ncols]), end=" ") - a = ' '.join(x[args.skip_ncols:]) - - # get all matched positions - match_pos = [] - for r in rs: - i = 0 - while i >= 0: - m = r.search(a, i) - if m: - match_pos.append([m.start(), m.end()]) - i = m.end() - else: - break - - if len(match_pos) > 0: - chars = [] - i = 0 - while i < len(a): - start_pos, end_pos = exist_or_not(i, match_pos) - if start_pos is not None: - chars.append(a[start_pos:end_pos]) - i = end_pos - else: - chars.append(a[i]) - i += 1 - a = chars - - if (args.trans_type == "phn"): - a = a.split(" ") - elif args.trans_type == "cn_char_en_bpe": - b = seg_char(a) - a = [] - for j in b: - # we use "▁" to instead of blanks among english words - # warning: here is "▁", not "_" - for l in j.strip().split("▁"): - if not l.encode('UTF-8').isalpha(): - a.append(l) - else: - for k in sp.encode_as_pieces(l): - a.append(k) - else: - a = [a[j:j + n] for j in range(0, 
len(a), n)] - - a_flat = [] - for z in a: - a_flat.append("".join(z)) - - a_chars = [z.replace(' ', args.space) for z in a_flat] - if (args.trans_type == "phn"): - a_chars = [z.replace("sil", args.space) for z in a_chars] - print(' '.join(a_chars)) - line = f.readline() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/utt2spk_to_spk2utt.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/utt2spk_to_spk2utt.pl deleted file mode 100644 index 5086699ff85fdcb8667bb9ab054700c53e35fd0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. - -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/validate_data_dir.sh deleted file mode 100644 index f4b4cbe1410111555d56380078e3d55381e7155a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/validate_data_dir.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/bin/bash - -cmd="$@" - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." - echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! 
-d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -if [ -f $data/images.scp ]; then - cmd=${cmd/--no-wav/} # remove --no-wav if supplied - image/validate_data_dir.sh $cmd - exit $? -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1; - ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - tools/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." 
- exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! 
cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! 
cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/validate_dict_dir.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/validate_dict_dir.pl deleted file mode 100644 index 819fca7f03caff91f3f24f0b69876a0bfc0abbe9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/validate_dict_dir.pl +++ /dev/null @@ -1,531 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. -# Copyright 2012 Guoguo Chen -# 2015 Daniel Povey -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for 'dict' directories (e.g. data/local/dict) - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line($.) 
$raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n"; - if ($has_invalid_whitespaces) { - print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n"; - return 0; - } else { - print "--> text contains only allowed whitespaces\n"; - } - } else { - print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n"; - } - return 1; -} - - -if(@ARGV != 1) { - die "Usage: validate_dict_dir.pl \n" . - "e.g.: validate_dict_dir.pl data/local/dict\n"; -} - -$dict = shift @ARGV; -$dict =~ s:/$::; - -$exit = 0; -$success = 1; # this is re-set each time we read a file. - -sub set_to_fail { $exit = 1; $success = 0; } - -# Checking silence_phones.txt ------------------------------- -print "Checking $dict/silence_phones.txt ...\n"; -if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} -$idx = 1; -%silence = (); -$crlf = 1; - -print "--> reading $dict/silence_phones.txt\n"; -check_allowed_whitespace(\*S) || set_to_fail(); -while() { - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; - } - foreach(0 .. 
@col-1) { - my $p = $col[$_]; - if($silence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; - } else { - $silence{$p} = 1; - } - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. - if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(S); -$success == 0 || print "--> $dict/silence_phones.txt is OK\n"; -print "\n"; - -# Checking optional_silence.txt ------------------------------- -print "Checking $dict/optional_silence.txt ...\n"; -if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;} -if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;} -$idx = 1; -$success = 1; -$crlf = 1; -print "--> reading $dict/optional_silence.txt\n"; -check_allowed_whitespace(\*OS) or exit 1; -while() { - chomp; - my @col = split(" ", $_); - if ($idx > 1 or @col > 1) { - set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; - } elsif (!$silence{$col[0]}) { - set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - $idx ++; -} -close(OS); -$success == 0 || print "--> $dict/optional_silence.txt is OK\n"; -print "\n"; - -# Checking nonsilence_phones.txt ------------------------------- -print "Checking $dict/nonsilence_phones.txt ...\n"; -if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;} -$idx = 1; -%nonsilence = (); -$success = 1; -$crlf = 1; -print "--> reading $dict/nonsilence_phones.txt\n"; -check_allowed_whitespace(\*NS) or set_to_fail(); -while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($nonsilence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; - } else { - $nonsilence{$p} = 1; - } - # phones that start with the pound sign/hash may be mistaken for - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. 
- if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(NS); -$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n"; -print "\n"; - -# Checking disjoint ------------------------------- -sub intersect { - my ($a, $b) = @_; - @itset = (); - %itset = (); - foreach(keys %$a) { - if(exists $b->{$_} and !$itset{$_}) { - push(@itset, $_); - $itset{$_} = 1; - } - } - return @itset; -} - -print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n"; -@itset = intersect(\%silence, \%nonsilence); -if(@itset == 0) {print "--> disjoint property is OK.\n";} -else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";} -print "\n"; - - -sub check_lexicon { - my ($lex, $num_prob_cols, $num_skipped_cols) = @_; - print "Checking $lex\n"; - !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail(); - my %seen_line = {}; - $idx = 1; $success = 1; $crlf = 1; - print "--> reading $lex\n"; - check_allowed_whitespace(\*L) or set_to_fail(); - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $lex contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (defined $seen_line{$_}) { - print "--> ERROR: line '$_' of $lex is repeated\n"; - set_to_fail(); - } - $seen_line{$_} = 1; - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $lex does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - $word = shift @col; - if (!defined $word) { - print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail(); - } - if ($word eq "" || $word eq "" || $word eq "" || $word eq "#0") { - print "--> ERROR: lexicon.txt contains forbidden word $word\n"; - set_to_fail(); - } - for ($n = 0; $n < $num_prob_cols; $n++) { - $prob = shift @col; - if (!($prob > 0.0 && $prob <= 1.0)) { - print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n"; - set_to_fail(); - } - } - for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; } - if (@col == 0) { - print "--> ERROR: lexicon.txt contains word $word with empty "; - print "pronunciation.\n"; - set_to_fail(); - } - foreach (0 .. @col-1) { - if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt "; - print "(line $idx)\n"; - set_to_fail(); - } - } - $idx ++; - } - close(L); - $success == 0 || print "--> $lex is OK\n"; - print "\n"; -} - -if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); } -if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); } -if (-f "$dict/lexiconp_silprob.txt") { - # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also - # exist. 
- check_lexicon("$dict/lexiconp_silprob.txt", 2, 2); - if (-f "$dict/silprob.txt") { - !open(SP, "<$dict/silprob.txt") && - print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail(); - $crlf = 1; - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - chomp; my @col = split; - @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail(); - if ($col[0] eq "" || $col[0] eq "overall") { - if (!($col[1] > 0.0 && $col[1] <= 1.0)) { - set_to_fail(); - print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n"; - } - } elsif ($col[0] eq "_s" || $col[0] eq "_n") { - if ($col[1] <= 0.0) { - set_to_fail(); - print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n"; - } - } else { - print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n"; - set_to_fail(); - } - } - close(SP); - } else { - set_to_fail(); - print "--> ERROR: expecting $dict/silprob.txt to exist\n"; - } -} - -if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) { - print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n"; - set_to_fail(); -} - -sub check_lexicon_pair { - my ($lex1, $num_prob_cols1, $num_skipped_cols1, - $lex2, $num_prob_cols2, $num_skipped_cols2) = @_; - # We have checked individual lexicons already. - open(L1, "<$lex1"); open(L2, "<$lex2"); - print "Checking lexicon pair $lex1 and $lex2\n"; - my $line_num = 0; - while() { - $line_num++; - @A = split; - $line_B = ; - if (!defined $line_B) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); last; - } - @B = split(" ", $line_B); - # Check if the word matches. - if ($A[0] ne $B[0]) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - shift @A; shift @B; - for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; } - for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; } - # Check if the pronunciation matches - if (join(" ", @A) ne join(" ", @B)) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - } - $line_B = ; - if (defined $line_B && $exit == 0) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); - } - $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n"; -} - -# If more than one lexicon exist, we have to check if they correspond to each -# other. It could be that the user overwrote one and we need to regenerate the -# other, but we do not know which is which. -if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") { - check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0); -} -if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") { - check_lexicon_pair("$dict/lexiconp.txt", 1, 0, - "$dict/lexiconp_silprob.txt", 2, 2); -} - -# Checking extra_questions.txt ------------------------------- -%distinguished = (); # Keep track of all phone-pairs including nonsilence that - # are distinguished (split apart) by extra_questions.txt, - # as $distinguished{$p1,$p2} = 1. This will be used to - # make sure that we don't have pairs of phones on the same - # line in nonsilence_phones.txt that can never be - # distinguished from each other by questions. 
(If any two - # phones appear on the same line in nonsilence_phones.txt, - # they share a tree root, and since the automatic - # question-building treats all phones that appear on the - # same line of nonsilence_phones.txt as being in the same - # group, we can never distinguish them without resorting to - # questions in extra_questions.txt. -print "Checking $dict/extra_questions.txt ...\n"; -if (-s "$dict/extra_questions.txt") { - if (!open(EX, "<$dict/extra_questions.txt")) { - set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n"; - } - $idx = 1; - $success = 1; - $crlf = 1; - print "--> reading $dict/extra_questions.txt\n"; - check_allowed_whitespace(\*EX) or set_to_fail(); - while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n"; - } - foreach (0 .. @col-1) { - if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n"; - } - $idx ++; - } - %col_hash = (); - foreach $p (@col) { $col_hash{$p} = 1; } - foreach $p1 (@col) { - # Update %distinguished hash. - foreach $p2 (keys %nonsilence) { - if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not - # in this question (and in nonsilence - # phones)... mark p1,p2 as being split apart - $distinguished{$p1,$p2} = 1; - $distinguished{$p2,$p1} = 1; - } - } - } - } - close(EX); - $success == 0 || print "--> $dict/extra_questions.txt is OK\n"; -} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";} - -if (-f "$dict/nonterminals.txt") { - open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt"; - my %nonterminals = (); - my $line_number = 1; - while () { - chop; - my @line = split(" ", $_); - if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) { - print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1; - } - $nonterminals{$line[0]} = 1; - $line_number++; - } - print "--> $dict/nonterminals.txt is OK\n"; -} - - -# check nonsilence_phones.txt again for phone-pairs that are never -# distnguishable. (note: this situation is normal and expected for silence -# phones, so we don't check it.) -if(!open(NS, "<$dict/nonsilence_phones.txt")) { - print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1; -} - -$num_warn_nosplit = 0; -$num_warn_nosplit_limit = 10; -while() { - my @col = split(" ", $_); - foreach $p1 (@col) { - foreach $p2 (@col) { - if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) { - set_to_fail(); - if ($num_warn_nosplit <= $num_warn_nosplit_limit) { - print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n"; - } - if ($num_warn_nosplit == $num_warn_nosplit_limit) { - print "... Not warning any more times about this issue.\n"; - } - if ($num_warn_nosplit == 0) { - print " (note: we started checking for this only recently. 
You can still build a system but\n"; - print " phones $p1 and $p2 will be acoustically indistinguishable).\n"; - } - $num_warn_nosplit++; - } - } - } -} - - -if ($exit == 1) { - print "--> ERROR validating dictionary directory $dict (see detailed error "; - print "messages above)\n\n"; - exit 1; -} else { - print "--> SUCCESS [validating dictionary directory $dict]\n\n"; -} - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/validate_text.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/validate_text.pl deleted file mode 100644 index 7f75cf12f20f6e22948682e8e726e628a72dac69..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/validate_text.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env perl -# -#=============================================================================== -# Copyright 2017 Johns Hopkins University (author: Yenda Trmal ) -# Johns Hopkins University (author: Daniel Povey) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# validation script for data//text -# to be called (preferably) from utils/validate_data_dir.sh -use strict; -use warnings; -use utf8; -use Fcntl qw< SEEK_SET >; - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). 
-sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl \n" . 
- "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/wav2dur.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/wav2dur.py deleted file mode 100644 index 1bcc1b693458b66c0e341e5d6b375cc81e6db8b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/wav2dur.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -import torchaudio -torchaudio.set_audio_backend("sox_io") - -scp = sys.argv[1] -dur_scp = sys.argv[2] - -with open(scp, 'r') as f, open(dur_scp, 'w') as fout: - cnt = 0 - total_duration = 0 - for l in f: - items = l.strip().split() - wav_id = items[0] - fname = items[1] - cnt += 1 - waveform, rate = torchaudio.load(fname) - frames = len(waveform[0]) - duration = frames / float(rate) - total_duration += duration - fout.write('{} {}\n'.format(wav_id, duration)) - print('process {} utts'.format(cnt)) - print('total {} s'.format(total_duration)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/wav_to_duration.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/wav_to_duration.sh deleted file mode 100644 index 51b055c633ac809b6b8d702925dc47875973403d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/wav_to_duration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# split the wav scp, calculate duration and merge -nj=4 -. tools/parse_options.sh || exit 1; - -inscp=$1 -outscp=$2 -data=$(dirname ${inscp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -rm -f $logdir/wav_*.slice -rm -f $logdir/wav_*.shape -split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log -} & -done -wait -cat $logdir/wav_*.shape > $outscp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/websocket/performance-ws.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/websocket/performance-ws.py deleted file mode 100644 index af77dea06bb41297b674b5b6dbfd0266bcff5d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/tools/websocket/performance-ws.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -# coding:utf-8 - -# Copyright (c) 2022 SDCI Co. Ltd (author: veelion) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import json -import time -import asyncio -import argparse -import websockets -import soundfile as sf -import statistics - - -WS_START = json.dumps({ - 'signal': 'start', - 'nbest': 1, - 'continuous_decoding': False, -}) -WS_END = json.dumps({ - 'signal': 'end' -}) - - -async def ws_rec(data, ws_uri): - begin = time.time() - conn = await websockets.connect(ws_uri, ping_timeout=200) - # step 1: send start - await conn.send(WS_START) - ret = await conn.recv() - # step 2: send audio data - await conn.send(data) - # step 3: send end - await conn.send(WS_END) - # step 4: receive result - texts = [] - while 1: - ret = await conn.recv() - ret = json.loads(ret) - if ret['type'] == 'final_result': - nbest = json.loads(ret['nbest']) - text = nbest[0]['sentence'] - texts.append(text) - elif ret['type'] == 'speech_end': - break - # step 5: close - try: - await conn.close() - except Exception as e: - # this except has no effect, just log as debug - # it seems the server does not send close info, maybe - print(e) - time_cost = time.time() - begin - return { - 'text': ''.join(texts), - 'time': time_cost, - } - - -def get_args(): - parser = argparse.ArgumentParser(description='') - parser.add_argument( - '-u', '--ws_uri', required=True, - help="websocket_server_main's uri, e.g. ws://127.0.0.1:10086") - parser.add_argument( - '-w', '--wav_scp', required=True, - help='path to wav_scp_file') - parser.add_argument( - '-t', '--trans', required=True, - help='path to trans_text_file of wavs') - parser.add_argument( - '-s', '--save_to', required=True, - help='path to save transcription') - parser.add_argument( - '-n', '--num_concurrence', type=int, required=True, - help='num of concurrence for query') - args = parser.parse_args() - return args - - -def print_result(info): - length = max([len(k) for k in info]) - for k, v in info.items(): - print(f'\t{k: >{length}} : {v}') - - -async def main(args): - wav_scp = [] - total_duration = 0 - with open(args.wav_scp) as f: - for line in f: - zz = line.strip().split() - assert len(zz) == 2 - data, sr = sf.read(zz[1], dtype='int16') - assert sr == 16000 - duration = (len(data)) / 16000 - total_duration += duration - wav_scp.append((zz[0], data.tobytes())) - print(f'{len(wav_scp) = }, {total_duration = }') - - tasks = [] - failed = 0 - texts = [] - request_times = [] - begin = time.time() - for i, (_uttid, data) in enumerate(wav_scp): - task = asyncio.create_task(ws_rec(data, args.ws_uri)) - tasks.append((_uttid, task)) - if len(tasks) < args.num_concurrence: - continue - print((f'{i=}, start {args.num_concurrence} ' - f'queries @ {time.strftime("%m-%d %H:%M:%S")}')) - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - tasks = [] - print(f'\tdone @ {time.strftime("%m-%d %H:%M:%S")}') - if tasks: - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - request_time = time.time() - begin - rtf = request_time / total_duration - print('For all concurrence:') - print_result({ - 'failed': failed, - 'total_duration': total_duration, - 'request_time': request_time, - 'RTF': rtf, - }) - print('For one request:') - print_result({ - 'mean': statistics.mean(request_times), - 'median': statistics.median(request_times), - 'max_time': max(request_times), - 'min_time': min(request_times), - }) - with 
open(args.save_to, 'w', encoding='utf8') as fsave: - fsave.write(''.join(texts)) - # caculate CER - cmd = (f'python ../compute-wer.py --char=1 --v=1 ' - f'{args.trans} {args.save_to} > ' - f'{args.save_to}-test-{args.num_concurrence}.cer.txt') - print(cmd) - os.system(cmd) - print('done') - - -if __name__ == '__main__': - args = get_args() - asyncio.run(main(args)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/alignment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/alignment.py deleted file mode 100644 index 071691183e5af227e60fe06e4f8d4bf0f33b7f71..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/alignment.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Di Wu) -# 2022 Tinnove Inc (authors: Wei Ren) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader -from textgrid import TextGrid, IntervalTier - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.ctc_util import forced_align -from wenet.utils.common import get_subsample -from wenet.utils.init_model import init_model - - -def generator_textgrid(maxtime, lines, output): - # Download Praat: https://www.fon.hum.uva.nl/praat/ - interval = maxtime / (len(lines) + 1) - margin = 0.0001 - - tg = TextGrid(maxTime=maxtime) - linetier = IntervalTier(name="line", maxTime=maxtime) - - i = 0 - for l in lines: - s, e, w = l.split() - linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w) - - tg.append(linetier) - print("successfully generator {}".format(output)) - tg.write(output) - - -def get_frames_timestamp(alignment): - # convert alignment to a praat format, which is a doing phonetics - # by computer and helps analyzing alignment - timestamp = [] - # get frames level duration for each token - start = 0 - end = 0 - while end < len(alignment): - while end < len(alignment) and alignment[end] == 0: - end += 1 - if end == len(alignment): - timestamp[-1] += alignment[start:] - break - end += 1 - while end < len(alignment) and alignment[end - 1] == alignment[end]: - end += 1 - timestamp.append(alignment[start:end]) - start = end - return timestamp - - -def get_labformat(timestamp, subsample): - begin = 0 - duration = 0 - labformat = [] - for idx, t in enumerate(timestamp): - # 25ms frame_length,10ms hop_length, 1/subsample - subsample = get_subsample(configs) - # time duration - duration = len(t) * 0.01 * subsample - if idx < len(timestamp) - 1: - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[t[-1]])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[t[-1]])) - else: - 
non_blank = 0 - for i in t: - if i != 0: - token = i - break - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[token])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[token])) - begin = begin + duration - return labformat - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='use ctc to generate alignment') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--input_file', required=True, help='format data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--non_lang_syms', - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--result_file', - required=True, - help='alignment result file') - parser.add_argument('--batch_size', type=int, default=1, help='batch size') - parser.add_argument('--gen_praat', - action='store_true', - help='convert alignment to a praat format') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - - args = parser.parse_args() - print(args) - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.batch_size > 1: - logging.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - # Load dict - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 - - symbol_table = read_symbol_table(args.dict) - - # Init dataset and data loader - ali_conf = copy.deepcopy(configs['dataset_conf']) - - ali_conf['filter_conf']['max_length'] = 102400 - ali_conf['filter_conf']['min_length'] = 0 - ali_conf['filter_conf']['token_max_length'] = 102400 - ali_conf['filter_conf']['token_min_length'] = 0 - ali_conf['filter_conf']['max_output_input_ratio'] = 102400 - ali_conf['filter_conf']['min_output_input_ratio'] = 0 - ali_conf['speed_perturb'] = False - ali_conf['spec_aug'] = False - ali_conf['shuffle'] = False - ali_conf['sort'] = False - ali_conf['fbank_conf']['dither'] = 0.0 - ali_conf['batch_conf']['batch_type'] = "static" - ali_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - ali_dataset = Dataset(args.data_type, - args.input_file, - symbol_table, - ali_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w', - encoding='utf-8') as fout: - for batch_idx, batch in enumerate(ali_data_loader): - print("#" * 80) - key, feat, target, feats_length, target_length = batch - print(key) - - feat = feat.to(device) - target = target.to(device) - feats_length = 
feats_length.to(device) - target_length = target_length.to(device) - # Let's assume B = batch_size and N = beam_size - # 1. Encoder - encoder_out, encoder_mask = model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - # print(ctc_probs.size(1)) - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = forced_align(ctc_probs, target) - print(alignment) - fout.write('{} {}\n'.format(key[0], alignment)) - - if args.gen_praat: - timestamp = get_frames_timestamp(alignment) - print(timestamp) - subsample = get_subsample(configs) - labformat = get_labformat(timestamp, subsample) - - lab_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".lab") - with open(lab_path, 'w', encoding='utf-8') as f: - f.writelines(labformat) - - textgrid_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".TextGrid") - generator_textgrid(maxtime=(len(alignment) + 1) * 0.01 * - subsample, - lines=labformat, - output=textgrid_path) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/average_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/average_model.py deleted file mode 100644 index 01efa64b4b458bc931a86a9a304b9f330ce4aaa2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/average_model.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import os -import argparse -import glob - -import yaml -import numpy as np -import torch - - -def get_args(): - parser = argparse.ArgumentParser(description='average model') - parser.add_argument('--dst_model', required=True, help='averaged model') - parser.add_argument('--src_path', - required=True, - help='src model path for average') - parser.add_argument('--val_best', - action="store_true", - help='averaged model') - parser.add_argument('--num', - default=5, - type=int, - help='nums for averaged model') - parser.add_argument('--min_epoch', - default=0, - type=int, - help='min epoch used for averaging model') - parser.add_argument('--max_epoch', - default=65536, - type=int, - help='max epoch used for averaging model') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - checkpoints = [] - val_scores = [] - if args.val_best: - yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) - for y in yamls: - with open(y, 'r') as f: - dic_yaml = yaml.load(f, Loader=yaml.FullLoader) - loss = dic_yaml['cv_loss'] - epoch = dic_yaml['epoch'] - if epoch >= args.min_epoch and epoch <= args.max_epoch: - val_scores += [[epoch, loss]] - val_scores = np.array(val_scores) - sort_idx = np.argsort(val_scores[:, -1]) - sorted_val_scores = val_scores[sort_idx][::1] - print("best val scores = " + str(sorted_val_scores[:args.num, 1])) - print("selected epochs = " + - str(sorted_val_scores[:args.num, 0].astype(np.int64))) - path_list = [ - args.src_path + '/{}.pt'.format(int(epoch)) - for epoch in sorted_val_scores[:args.num, 0] - ] - else: - path_list = glob.glob('{}/[0-9]*.pt'.format(args.src_path)) - path_list = sorted(path_list, key=os.path.getmtime) - path_list = path_list[-args.num:] - print(path_list) - avg = None - num = args.num - assert num == len(path_list) - for path in path_list: - print('Processing {}'.format(path)) - states = torch.load(path, map_location=torch.device('cpu')) - if avg is None: - avg = states - else: - for k in avg.keys(): - avg[k] += states[k] - # average - for k in avg.keys(): - if avg[k] is not None: - # pytorch 1.6 use true_divide instead of /= - avg[k] = torch.true_divide(avg[k], num) - print('Saving to {}'.format(args.dst_model)) - torch.save(avg, args.dst_model) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/export_jit.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/export_jit.py deleted file mode 100644 index b2e5864e8382235c1cc800484ba5031ae22f3bd9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/export_jit.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os - -import torch -import yaml - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_file', default=None, help='output file') - parser.add_argument('--output_quant_file', - default=None, - help='output quantized model file') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - # No need gpu for model export - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - model = init_model(configs) - print(model) - - load_checkpoint(model, args.checkpoint) - # Export jit torch script model - - if args.output_file: - script_model = torch.jit.script(model) - script_model.save(args.output_file) - print('Export model successfully, see {}'.format(args.output_file)) - - # Export quantized jit torch script model - if args.output_quant_file: - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - script_quant_model = torch.jit.script(quantized_model) - script_quant_model.save(args.output_quant_file) - print('Export quantized model successfully, ' - 'see {}'.format(args.output_quant_file)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/export_onnx_bpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/export_onnx_bpu.py deleted file mode 100644 index 6462a69506f10778d08faae5fcf3067ad43d38bd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/export_onnx_bpu.py +++ /dev/null @@ -1,1019 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. 
specific decoding method: ctc_greedy_search -""" - - -from __future__ import print_function - -import os -import sys -import copy -import math -import yaml -import logging -from typing import Tuple - -import torch -import numpy as np - -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import (get_args, to_numpy, - print_input_output_info) - - -try: - import onnx - import onnxruntime -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class BPULayerNorm(torch.nn.Module): - """Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" - def __init__(self, module, chunk_size=8, run_on_bpu=False): - super().__init__() - original = copy.deepcopy(module) - self.hidden = module.weight.size(0) - self.chunk_size = chunk_size - self.run_on_bpu = run_on_bpu - - if self.run_on_bpu: - self.weight = torch.nn.Parameter( - module.weight.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.bias = torch.nn.Parameter( - module.bias.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.negtive = torch.nn.Parameter( - torch.ones((1, self.hidden, 1, chunk_size)) * -1.0) - self.eps = torch.nn.Parameter( - torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps) - self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_1.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_2.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - else: - self.norm = module - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.hidden) - orig_out = module(random_data) - new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) - np.testing.assert_allclose( - to_numpy(orig_out), to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.run_on_bpu: - u = self.mean_conv_1(x) # (1, h, 1, c) - numerator = x + u * self.negtive # (1, h, 1, c) - s = torch.pow(numerator, 2) # (1, h, 1, c) - s = self.mean_conv_2(s) # (1, h, 1, c) - denominator = torch.sqrt(s + self.eps) # (1, h, 1, c) - x = torch.div(numerator, denominator) # (1, h, 1, c) - x = x * self.weight + self.bias - else: - x = x.squeeze(2).transpose(1, 2).contiguous() - x = self.norm(x) - x = x.transpose(1, 2).contiguous().unsqueeze(2) - return x - - -class BPUIdentity(torch.nn.Module): - """Refactor torch.nn.Identity(). - For inserting BPU node whose input == output. - """ - def __init__(self, channels): - super().__init__() - self.channels = channels - self.identity_conv = torch.nn.Conv2d( - channels, channels, 1, groups=channels, bias=False) - torch.nn.init.dirac_( - self.identity_conv.weight.data, groups=channels) - - self.check_equal() - - def check_equal(self): - random_data = torch.randn(1, self.channels, 1, 10) - result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(random_data), to_numpy(result), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Identity with 4-D dataflow, input == output. 
- Args: - x (torch.Tensor): (batch, in_channel, 1, time) - - Returns: - (torch.Tensor): (batch, in_channel, 1, time). - """ - return self.identity_conv(x) - - -class BPULinear(torch.nn.Module): - """Refactor torch.nn.Linear or pointwise_conv""" - def __init__(self, module, is_pointwise_conv=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.weight.size(1) - self.odim = module.weight.size(0) - self.is_pointwise_conv = is_pointwise_conv - - # Modify weight & bias - self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) - if is_pointwise_conv: - # (odim, idim, kernel=1) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(-1)) - else: - # (odim, idim) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(2).unsqueeze(3)) - self.linear.bias = module.bias - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.idim) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = module(random_data) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = original_result.transpose(1, 2) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Linear with 4-D dataflow. - Args: - x (torch.Tensor): (batch, in_channel, 1, time) - Returns: - (torch.Tensor): (batch, out_channel, 1, time). - """ - return self.linear(x) - - -class BPUGlobalCMVN(torch.nn.Module): - """Refactor wenet/transformer/cmvn.py::GlobalCMVN""" - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - self.norm_var = module.norm_var - - # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1) - self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """CMVN with 4-D dataflow. - Args: - x (torch.Tensor): (batch, 1, mel_dim, time) - Returns: - (torch.Tensor): normalized feature with same shape. - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x - - -class BPUConv2dSubsampling8(torch.nn.Module): - """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 - - NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.right_context = module.right_context - self.subsampling_rate = module.subsampling_rate - assert isinstance(module.pos_enc, NoPositionalEncoding) - - # 1. Modify self.conv - # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim) - # to (1, 1, mel_dim, frames) for more efficient computation. - self.conv = module.conv - for idx in [0, 2, 4]: - self.conv[idx].weight = torch.nn.Parameter( - module.conv[idx].weight.transpose(2, 3) - ) - - # 2. 
Modify self.linear - # NOTE(xcsong): Split final projection to meet the requirment of - # maximum kernel_size (7 for XJ3) - self.linear = torch.nn.ModuleList() - odim = module.linear.weight.size(0) # 512, in this case - freq = module.linear.weight.size(1) // odim # 4608 // 512 == 9 - self.odim, self.freq = odim, freq - weight = module.linear.weight.reshape( - odim, odim, freq, 1) # (odim, odim * freq) -> (odim, odim, freq, 1) - self.split_size = [] - num_split = (freq - 1) // 7 + 1 # XJ3 requires kernel_size <= 7 - slice_begin = 0 - for idx in range(num_split): - kernel_size = min(freq, (idx + 1) * 7) - idx * 7 - conv_ele = torch.nn.Conv2d( - odim, odim, (kernel_size, 1), (kernel_size, 1)) - conv_ele.weight = torch.nn.Parameter( - weight[:, :, slice_begin:slice_begin + kernel_size, :] - ) - conv_ele.bias = torch.nn.Parameter( - torch.zeros_like(conv_ele.bias) - ) - self.linear.append(conv_ele) - self.split_size.append(kernel_size) - slice_begin += kernel_size - self.linear[0].bias = torch.nn.Parameter(module.linear.bias) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 67, 80) - mask = torch.zeros(1, 1, 67) - original_result, _, _ = module(random_data, mask) # (1, 8, 512) - random_data = random_data.transpose(1, 2).unsqueeze(0) # (1, 1, 80, 67) - new_result = self.forward(random_data) # (1, 512, 1, 8) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Subsample x with 4-D dataflow. - Args: - x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), - where time' = time // 8. - """ - x = self.conv(x) # (1, odim, freq, time') - x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) - x = torch.split(x, self.split_size, dim=2) - for idx, (x_part, layer) in enumerate(zip(x, self.linear)): - x_out += layer(x_part) - return x_out - - -class BPUMultiHeadedAttention(torch.nn.Module): - """Refactor wenet/transformer/attention.py::MultiHeadedAttention - - NOTE(xcsong): Only support attention_class == MultiHeadedAttention, - we do not consider RelPositionMultiHeadedAttention currently. - """ - def __init__(self, module, chunk_size, left_chunks): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.d_k = module.d_k - self.h = module.h - n_feat = self.d_k * self.h - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.time = chunk_size * (left_chunks + 1) - self.activation = torch.nn.Softmax(dim=-1) - - # 1. Modify self.linear_x - self.linear_q = BPULinear(module.linear_q) - self.linear_k = BPULinear(module.linear_k) - self.linear_v = BPULinear(module.linear_v) - self.linear_out = BPULinear(module.linear_out) - # 2. 
denom - self.register_buffer( - "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k))) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) - mask = torch.ones((1, self.h, self.chunk_size, self.time), - dtype=torch.bool) - cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, - self.d_k * 2) - original_out, original_cache = module( - random_data, random_data, random_data, - mask[:, 0, :, :], torch.empty(0), cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.reshape(1, self.h, self.d_k * 2, - self.chunk_size * self.left_chunks) - new_out, new_cache = self.forward( - random_data, random_data, random_data, mask, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - - def forward( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - mask: torch.Tensor, cache: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. - - Args: - q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). - k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). - v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). - mask (torch.Tensor): Mask tensor, - (#batch, head, chunk_size, cache_t + chunk_size). - cache (torch.Tensor): Cache tensor - (1, head, d_k * 2, cache_t), - where `cache_t == chunk_size * left_chunks`. - - - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: Cache tensor - (1, head, d_k * 2, cache_t + chunk_size) - where `cache_t == chunk_size * left_chunks` - """ - # 1. Forward QKV - q = self.linear_q(q) # (1, d, 1, c) d == size, c == chunk_size - k = self.linear_k(k) # (1, d, 1, c) - v = self.linear_v(v) # (1, d, 1, c) - q = q.view(1, self.h, self.d_k, self.chunk_size) - k = k.view(1, self.h, self.d_k, self.chunk_size) - v = v.view(1, self.h, self.d_k, self.chunk_size) - q = q.transpose(2, 3) # (batch, head, time1, d_k) - k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) - k = torch.cat((k_cache, k), dim=3) - v = torch.cat((v_cache, v), dim=3) - new_cache = torch.cat((k, v), dim=2) - # 2. (Q^T)K - scores = torch.matmul(q, k) * self.denom # (#b, n_head, time1, time2) - # 3. Forward attention - mask = mask.eq(0) - scores = scores.masked_fill(mask, -float('inf')) - attn = self.activation(scores).masked_fill(mask, 0.0) - attn = attn.transpose(2, 3) - x = torch.matmul(v, attn) - x = x.view(1, self.d_k * self.h, 1, self.chunk_size) - x_out = self.linear_out(x) - return x_out, new_cache - - -class BPUConvolution(torch.nn.Module): - """Refactor wenet/transformer/convolution.py::ConvolutionModule - - NOTE(xcsong): Only suport use_layer_norm == False - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.lorder = module.lorder - self.use_layer_norm = False - self.activation = module.activation - channels = module.pointwise_conv1.weight.size(1) - self.channels = channels - kernel_size = module.depthwise_conv.weight.size(2) - assert module.use_layer_norm is False - - # 1. Modify self.pointwise_conv1 - self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) - - # 2. 
Modify self.depthwise_conv - self.depthwise_conv = torch.nn.Conv2d( - channels, channels, (1, kernel_size), - stride=1, groups=channels) - self.depthwise_conv.weight = torch.nn.Parameter( - module.depthwise_conv.weight.unsqueeze(-2)) - self.depthwise_conv.bias = torch.nn.Parameter( - module.depthwise_conv.bias) - - # 3. Modify self.norm, Only support batchnorm2d - self.norm = torch.nn.BatchNorm2d(channels) - self.norm.training = False - self.norm.num_features = module.norm.num_features - self.norm.eps = module.norm.eps - self.norm.momentum = module.norm.momentum - self.norm.weight = torch.nn.Parameter(module.norm.weight) - self.norm.bias = torch.nn.Parameter(module.norm.bias) - self.norm.running_mean = module.norm.running_mean - self.norm.running_var = module.norm.running_var - - # 4. Modify self.pointwise_conv2 - self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) - - # 5. Identity conv, for running `concat` on BPU - self.identity = BPUIdentity(channels) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.channels) - cache = torch.zeros((1, self.channels, self.lorder)) - original_out, original_cache = module(random_data, cache=cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.unsqueeze(2) - new_out, new_cache = self.forward(random_data, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, 1, cache_t). - Returns: - torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). - torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). - """ - # Concat cache - x = torch.cat((self.identity(cache), self.identity(x)), dim=3) - new_cache = x[:, :, :, -self.lorder:] - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, 1, dim) - x = torch.nn.functional.glu(x, dim=1) # (b, channel, 1, dim) - - # Depthwise Conv - x = self.depthwise_conv(x) - x = self.activation(self.norm(x)) - x = self.pointwise_conv2(x) - return x, new_cache - - -class BPUFFN(torch.nn.Module): - """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.activation = module.activation - - # 1. Modify self.w_x - self.w_1 = BPULinear(module.w_1) - self.w_2 = BPULinear(module.w_2) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.w_1.idim) - original_out = module(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_out = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward function. 
- - Args: - xs: input tensor (B, D, 1, L) - Returns: - output tensor, (B, D, 1, L) - """ - return self.w_2(self.activation(self.w_1(x))) - - -class BPUConformerEncoderLayer(torch.nn.Module): - """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.size = module.size - assert module.normalize_before is True - assert module.concat_after is False - - # 1. Modify submodules - self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) - self.self_attn = BPUMultiHeadedAttention( - module.self_attn, chunk_size, left_chunks) - self.conv_module = BPUConvolution(module.conv_module) - self.feed_forward = BPUFFN(module.feed_forward) - - # 2. Modify norms - self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) - self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, ln_run_on_bpu) - self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, - chunk_size, ln_run_on_bpu) - self.norm_conv = BPULayerNorm(module.norm_conv, - chunk_size, ln_run_on_bpu) - self.norm_final = BPULayerNorm(module.norm_final, - chunk_size, ln_run_on_bpu) - - # 3. 4-D ff_scale - self.register_buffer( - "ff_scale", torch.full((1, self.size, 1, 1), module.ff_scale)) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.self_attn.chunk_size - time2 = self.self_attn.time - h, d_k = self.self_attn.h, self.self_attn.d_k - random_x = torch.randn(1, time1, self.size) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) - original_x, _, original_att_cache, original_cnn_cache = module( - random_x, att_mask[:, 0, :, :], torch.empty(0), - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.transpose(1, 2).unsqueeze(2) - att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.unsqueeze(2) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_mask, att_cache, cnn_cache - ) - np.testing.assert_allclose( - to_numpy(original_att_cache), - to_numpy(new_att_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cnn_cache), - to_numpy(new_cnn_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, att_mask: torch.Tensor, - att_cache: torch.Tensor, cnn_cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, size, 1, chunk_size) - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, d_k * 2, cache_t1), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, 1, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: att_cache tensor, - (1, head, d_k * 2, cache_t1 + chunk_size). - torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2). - """ - # 1. ffn_macaron - residual = x - x = self.norm_ff_macron(x) - x = residual + self.ff_scale * self.feed_forward_macaron(x) - - # 2. 
attention - residual = x - x = self.norm_mha(x) - x_att, new_att_cache = self.self_attn( - x, x, x, att_mask, att_cache) - x = residual + x_att - - # 3. convolution - residual = x - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, cnn_cache) - x = residual + x - - # 4. ffn - residual = x - x = self.norm_ff(x) - x = residual + self.ff_scale * self.feed_forward(x) - - # 5. final post-norm - x = self.norm_final(x) - - return x, new_att_cache, new_cnn_cache - - -class BPUConformerEncoder(torch.nn.Module): - """Refactor wenet/transformer/encoder.py::ConformerEncoder - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - output_size = module.output_size() - self._output_size = module.output_size() - self.after_norm = module.after_norm - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.head = module.encoders[0].self_attn.h - self.layers = len(module.encoders) - - # 1. Modify submodules - self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) - self.embed = BPUConv2dSubsampling8(module.embed) - self.encoders = torch.nn.ModuleList() - for layer in module.encoders: - self.encoders.append(BPUConformerEncoderLayer( - layer, chunk_size, left_chunks, ln_run_on_bpu)) - - # 2. Auxiliary conv - self.identity_cnncache = BPUIdentity(output_size) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.encoders[0].self_attn.chunk_size - time2 = self.encoders[0].self_attn.time - layers = self.layers - h, d_k = self.head, self.encoders[0].self_attn.d_k - decoding_window = (self.chunk_size - 1) * \ - module.embed.subsampling_rate + \ - module.embed.right_context + 1 - lorder = self.encoders[0].conv_module.lorder - random_x = torch.randn(1, decoding_window, 80) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) - orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( - random_x, 0, time2 - time1, att_mask=att_mask[:, 0, :, :], - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.unsqueeze(0) - att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_cache, cnn_cache, att_mask - ) - caches = torch.split(new_att_cache, h, dim=1) - caches = [c.transpose(2, 3) for c in caches] - np.testing.assert_allclose( - to_numpy(orig_att_cache), - to_numpy(torch.cat(caches, dim=0)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_cnn_cache), - to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, xs: torch.Tensor, att_cache: torch.Tensor, - cnn_cache: torch.Tensor, att_mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (1, head * elayers, d_k * 2, cache_t1), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (1, hidden-dim, elayers, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, hidden-dim, 1, chunk_size). - torch.Tensor: new attention cache required for next chunk, with - same shape as the original att_cache. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - """ - # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time) - xs = xs.transpose(2, 3) - xs = self.global_cmvn(xs) - # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size) - xs = self.embed(xs) - - att_cache = torch.split(att_cache, self.head, dim=1) - cnn_cache = self.identity_cnncache(cnn_cache) - cnn_cache = torch.split(cnn_cache, 1, dim=2) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - xs, new_att_cache, new_cnn_cache = layer( - xs, att_mask, att_cache=att_cache[i], cnn_cache=cnn_cache[i]) - r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:]) - r_cnn_cache.append(new_cnn_cache) - r_att_cache = torch.cat(r_att_cache, dim=1) - r_cnn_cache = self.identity_cnncache( - torch.cat(r_cnn_cache, dim=2)) - - xs = xs.squeeze(2).transpose(1, 2).contiguous() - xs = self.after_norm(xs) - # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input. - xs = xs.transpose(1, 2).contiguous().unsqueeze(2) # (B, C, 1, T) - - return (xs, r_att_cache, r_cnn_cache) - - -class BPUCTC(torch.nn.Module): - """Refactor wenet/transformer/ctc.py::CTC - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.ctc_lo.weight.size(1) - num_class = module.ctc_lo.weight.size(0) - - # 1. Modify self.ctc_lo, Split final projection to meet the - # requirment of maximum in/out channels (2048 for XJ3) - self.ctc_lo = torch.nn.ModuleList() - self.split_size = [] - num_split = (num_class - 1) // 2048 + 1 - for idx in range(num_split): - out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 - conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) - self.ctc_lo.append(conv_ele) - self.split_size.append(out_channel) - orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) - orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) - for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): - w = w.unsqueeze(2).unsqueeze(3) - self.ctc_lo[i].weight = torch.nn.Parameter(w) - self.ctc_lo[i].bias = torch.nn.Parameter(b) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 100, self.idim) - original_result = module.ctc_lo(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """frame activations, without softmax. 
- - Args: - Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) - Returns: - torch.Tensor: (B, num_class, 1, chunk_size) - """ - out = [] - for i, layer in enumerate(self.ctc_lo): - out.append(layer(x)) - out = torch.cat(out, dim=1) - return out - - -def export_encoder(asr_model, args): - logger.info("Stage-1: export encoder") - decode_window, mel_dim = args.decoding_window, args.feature_size - encoder = BPUConformerEncoder( - asr_model.encoder, args.chunk_size, args.num_decoding_left_chunks, - args.ln_run_on_bpu) - encoder.eval() - encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx') - - logger.info("Stage-1.1: prepare inputs for encoder") - chunk = torch.randn((1, 1, decode_window, mel_dim)) - required_cache_size = encoder.chunk_size * encoder.left_chunks - kv_time = required_cache_size + encoder.chunk_size - hidden, layers = encoder._output_size, len(encoder.encoders) - head = encoder.encoders[0].self_attn.h - d_k = hidden // head - lorder = encoder.encoders[0].conv_module.lorder - att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) - att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros((1, hidden, layers, lorder)) - inputs = (chunk, att_cache, cnn_cache, att_mask) - logger.info("chunk.size(): {} att_cache.size(): {} " - "cnn_cache.size(): {} att_mask.size(): {}".format( - list(chunk.size()), list(att_cache.size()), - list(cnn_cache.size()), list(att_mask.size()))) - - logger.info("Stage-1.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask" - attributes['output_name'] = "output;r_att_cache;r_cnn_cache" - attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap" - attributes['norm_type'] = \ - "no_preprocess;no_preprocess;no_preprocess;no_preprocess" - attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_shape'] = \ - "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( - chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3), - att_cache.size(0), att_cache.size(1), att_cache.size(2), - att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1), - cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0), - att_mask.size(1), att_mask.size(2), att_mask.size(3) - ) - torch.onnx.export( # NOTE(xcsong): only support opset==11 - encoder, inputs, encoder_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=attributes['input_name'].split(';'), - output_names=attributes['output_name'].split(';'), - dynamic_axes=None, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for k in vars(args): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - logger.info('Export onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - logger.info("Stage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - for i in range(10): - logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" - ", att_mask: {}".format( - i, list(torch_chunk.size()), - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), - list(torch_att_mask.size()))) - torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_output = torch.cat(torch_output, dim=-1) - - onnx_output = [] - onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - logger.info("onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," - " att_mask: {}".format( - i, onnx_chunk.shape, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': onnx_att_mask, - } - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_output = np.concatenate(onnx_output, axis=-1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_encoder, pass!") - return encoder, ort_session - - -def export_ctc(asr_model, args): - logger.info("Stage-2: export ctc") - ctc = BPUCTC(asr_model.ctc).eval() - ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx') - - logger.info("Stage-2.1: prepare inputs for ctc") - hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) - - logger.info("Stage-2.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'], attributes['input_type'] = "hidden", "featuremap" - attributes['norm_type'] = "no_preprocess" - attributes['input_layout_train'] = "NCHW" - attributes['input_layout_rt'] = "NCHW" - attributes['input_shape'] = "{}x{}x{}x{}".format( - hidden.size(0), hidden.size(1), hidden.size(2), hidden.size(3), - ) - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=None, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for k in vars(args): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - logger.info('Export onnx_ctc, done! 
see {}'.format(ctc_outpath)) - - logger.info("Stage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_ctc, pass!") - return ctc, ort_session - - -def export_decoder(asr_model, args): - logger.info("Currently, Decoder is not supported.") - - -if __name__ == '__main__': - torch.manual_seed(777) - args = get_args() - args.ln_run_on_bpu = False - # NOTE(xcsong): XJ3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - args.feature_size = configs['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - export_encoder(model, args) - export_ctc(model, args) - export_decoder(model, args) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/export_onnx_cpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/export_onnx_cpu.py deleted file mode 100644 index a8009d2f606f753a5870eb754235d8d55e756b5d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/export_onnx_cpu.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright (c) 2022, Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
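# NOTE(editor): both the BPU exporter above and the CPU exporter below derive the
# number of raw feature frames fed per chunk from the post-subsampling chunk size as
#   decoding_window = (chunk_size - 1) * subsampling_rate + right_context + 1
# A small worked sketch of that arithmetic (not part of the original scripts),
# assuming the usual Conv2dSubsampling4 front-end with subsampling_rate=4 and
# right_context=6; it also explains the hardcoded 67 used further down when
# chunk_size == -1:

def decoding_window(chunk_size, subsampling_rate=4, right_context=6):
    """Input fbank frames needed to produce `chunk_size` encoder frames."""
    return (chunk_size - 1) * subsampling_rate + right_context + 1

print(decoding_window(16))  # 67 -> 67 fbank frames per 16-frame encoder chunk
print(decoding_window(8))   # 35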
- -from __future__ import print_function - -import argparse -import os -import copy -import sys - -import torch -import yaml -import numpy as np - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - -try: - import onnx - import onnxruntime - from onnxruntime.quantization import quantize_dynamic, QuantType -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - args = parser.parse_args() - return args - - -def to_numpy(tensor): - if tensor.requires_grad: - return tensor.detach().cpu().numpy() - else: - return tensor.cpu().numpy() - - -def print_input_output_info(onnx_model, name, prefix="\t\t"): - input_names = [node.name for node in onnx_model.graph.input] - input_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.input] - output_names = [node.name for node in onnx_model.graph.output] - output_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.output] - print("{}{} inputs : {}".format(prefix, name, input_names)) - print("{}{} input shapes : {}".format(prefix, name, input_shapes)) - print("{}{} outputs: {}".format(prefix, name, output_names)) - print("{}{} output shapes : {}".format(prefix, name, output_shapes)) - - -def export_encoder(asr_model, args): - print("Stage-1: export encoder") - encoder = asr_model.encoder - encoder.forward = encoder.forward_chunk - encoder_outpath = os.path.join(args['output_dir'], 'encoder.onnx') - - print("\tStage-1.1: prepare inputs for encoder") - chunk = torch.randn( - (args['batch'], args['decoding_window'], args['feature_size'])) - offset = 0 - # NOTE(xcsong): The uncertainty of `next_cache_start` only appears - # in the first few chunks, this is caused by dynamic att_cache shape, i,e - # (0, 0, 0, 0) for 1st chunk and (elayers, head, ?, d_k*2) for subsequent - # chunks. One way to ease the ONNX export is to keep `next_cache_start` - # as a fixed value. To do this, for the **first** chunk, if - # left_chunks > 0, we feed real cache & real mask to the model, otherwise - # fake cache & fake mask. In this way, we get: - # 1. 16/-1 mode: next_cache_start == 0 for all chunks - # 2. 16/4 mode: next_cache_start == chunk_size for all chunks - # 3. 16/0 mode: next_cache_start == chunk_size for all chunks - # 4. -1/-1 mode: next_cache_start == 0 for all chunks - # NO MORE DYNAMIC CHANGES!! - # - # NOTE(Mddct): We retain the current design for the convenience of supporting some - # inference frameworks without dynamic shapes. 
If you're interested in all-in-one - # model that supports different chunks please see: - # https://github.com/wenet-e2e/wenet/pull/1174 - - if args['left_chunks'] > 0: # 16/4 - required_cache_size = args['chunk_size'] * args['left_chunks'] - offset = required_cache_size - # Real cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], required_cache_size, - args['output_size'] // args['head'] * 2)) - # Real mask - att_mask = torch.ones( - (args['batch'], 1, required_cache_size + args['chunk_size']), - dtype=torch.bool) - att_mask[:, :, :required_cache_size] = 0 - elif args['left_chunks'] <= 0: # 16/-1, -1/-1, 16/0 - required_cache_size = -1 if args['left_chunks'] < 0 else 0 - # Fake cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], 0, - args['output_size'] // args['head'] * 2)) - # Fake mask - att_mask = torch.ones((0, 0, 0), dtype=torch.bool) - cnn_cache = torch.zeros( - (args['num_blocks'], args['batch'], - args['output_size'], args['cnn_module_kernel'] - 1)) - inputs = (chunk, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - print("\t\tchunk.size(): {}\n".format(chunk.size()), - "\t\toffset: {}\n".format(offset), - "\t\trequired_cache: {}\n".format(required_cache_size), - "\t\tatt_cache.size(): {}\n".format(att_cache.size()), - "\t\tcnn_cache.size(): {}\n".format(cnn_cache.size()), - "\t\tatt_mask.size(): {}\n".format(att_mask.size())) - - print("\tStage-1.2: torch.onnx.export") - dynamic_axes = { - 'chunk': {1: 'T'}, - 'att_cache': {2: 'T_CACHE'}, - 'att_mask': {2: 'T_ADD_T_CACHE'}, - 'output': {1: 'T'}, - 'r_att_cache': {2: 'T_CACHE'}, - } - # NOTE(xcsong): We keep dynamic axes even if in 16/4 mode, this is - # to avoid padding the last chunk (which usually contains less - # frames than required). For users who want static axes, just pop - # out specific axis. - # if args['chunk_size'] > 0: # 16/4, 16/-1, 16/0 - # dynamic_axes.pop('chunk') - # dynamic_axes.pop('output') - # if args['left_chunks'] >= 0: # 16/4, 16/0 - # # NOTE(xsong): since we feed real cache & real mask into the - # # model when left_chunks > 0, the shape of cache will never - # # be changed. - # dynamic_axes.pop('att_cache') - # dynamic_axes.pop('r_att_cache') - torch.onnx.export( - encoder, inputs, encoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=[ - 'chunk', 'offset', 'required_cache_size', - 'att_cache', 'cnn_cache', 'att_mask' - ], - output_names=['output', 'r_att_cache', 'r_cnn_cache'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for (k, v) in args.items(): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - # NOTE(xcsong): to add those metadatas we need to reopen - # the file and resave it. - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - # Dynamic quantization - model_fp32 = encoder_outpath - model_quant = os.path.join(args['output_dir'], 'encoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - print("\tStage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk = copy.deepcopy(chunk) - torch_offset = copy.deepcopy(offset) - torch_required_cache_size = copy.deepcopy(required_cache_size) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - torch_att_mask = copy.deepcopy(att_mask) - for i in range(10): - print("\t\ttorch chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, list(torch_chunk.size()), torch_offset, - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), list(torch_att_mask.size()))) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - torch_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_offset, torch_required_cache_size, - torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_offset += out.size(1) - torch_output = torch.cat(torch_output, dim=1) - - onnx_output = [] - onnx_chunk = to_numpy(chunk) - onnx_offset = np.array((offset)).astype(np.int64) - onnx_required_cache_size = np.array((required_cache_size)).astype(np.int64) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - onnx_att_mask = to_numpy(att_mask) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - print("\t\tonnx chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, onnx_chunk.shape, onnx_offset, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - onnx_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'offset': onnx_offset, - 'required_cache_size': onnx_required_cache_size, - 'att_cache': onnx_att_cache, 'cnn_cache': onnx_cnn_cache, - 'att_mask': onnx_att_mask - } - # NOTE(xcsong): If we use 16/-1, -1/-1 or 16/0 mode, `next_cache_start` - # will be hardcoded to 0 or chunk_size by ONNX, thus - # required_cache_size and att_mask are no more needed and they will - # be removed by ONNX automatically. 
- for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_offset += ort_outs[0].shape[1] - onnx_output = np.concatenate(onnx_output, axis=1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-05) - meta = ort_session.get_modelmeta() - print("\t\tcustom_metadata_map={}".format(meta.custom_metadata_map)) - print("\t\tCheck onnx_encoder, pass!") - - -def export_ctc(asr_model, args): - print("Stage-2: export ctc") - ctc = asr_model.ctc - ctc.forward = ctc.log_softmax - ctc_outpath = os.path.join(args['output_dir'], 'ctc.onnx') - - print("\tStage-2.1: prepare inputs for ctc") - hidden = torch.randn( - (args['batch'], args['chunk_size'] if args['chunk_size'] > 0 else 16, - args['output_size'])) - - print("\tStage-2.2: torch.onnx.export") - dynamic_axes = {'hidden': {1: 'T'}, 'probs': {1: 'T'}} - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for (k, v) in args.items(): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - # Dynamic quantization - model_fp32 = ctc_outpath - model_quant = os.path.join(args['output_dir'], 'ctc.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_ctc, done! see {}'.format(ctc_outpath)) - - print("\tStage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_ctc, pass!") - - -def export_decoder(asr_model, args): - print("Stage-3: export decoder") - decoder = asr_model - # NOTE(lzhin): parameters of encoder will be automatically removed - # since they are not used during rescoring. - decoder.forward = decoder.forward_attention_decoder - decoder_outpath = os.path.join(args['output_dir'], 'decoder.onnx') - - print("\tStage-3.1: prepare inputs for decoder") - # hardcode time->200 nbest->10 len->20, they are dynamic axes. 
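# NOTE(editor): the CTC and decoder exports in this file rely on `dynamic_axes`:
# any axis named in that dict stays symbolic in the ONNX graph, so one exported
# file accepts different sequence lengths at run time. A minimal self-contained
# sketch of the idea (TinyHead and tiny_head.onnx are illustrative names, not
# part of the original script):

import onnxruntime
import torch


class TinyHead(torch.nn.Module):
    def forward(self, hidden):
        # stand-in for ctc.log_softmax
        return hidden.log_softmax(dim=-1)


torch.onnx.export(TinyHead().eval(), torch.randn(1, 16, 8), "tiny_head.onnx",
                  opset_version=13, input_names=['hidden'],
                  output_names=['probs'],
                  dynamic_axes={'hidden': {1: 'T'}, 'probs': {1: 'T'}})

sess = onnxruntime.InferenceSession("tiny_head.onnx",
                                    providers=["CPUExecutionProvider"])
for t in (16, 67):  # both lengths run against the same exported graph
    (probs,) = sess.run(None, {'hidden': torch.randn(1, t, 8).numpy()})
    print(probs.shape)  # (1, 16, 8) then (1, 67, 8)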
- encoder_out = torch.randn((1, 200, args['output_size'])) - hyps = torch.randint(low=0, high=args['vocab_size'], - size=[10, 20]) - hyps[:, 0] = args['vocab_size'] - 1 # - hyps_lens = torch.randint(low=15, high=21, size=[10]) - - print("\tStage-3.2: torch.onnx.export") - dynamic_axes = { - 'hyps': {0: 'NBEST', 1: 'L'}, 'hyps_lens': {0: 'NBEST'}, - 'encoder_out': {1: 'T'}, - 'score': {0: 'NBEST', 1: 'L'}, 'r_score': {0: 'NBEST', 1: 'L'} - } - inputs = (hyps, hyps_lens, encoder_out, args['reverse_weight']) - torch.onnx.export( - decoder, inputs, decoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hyps', 'hyps_lens', 'encoder_out', 'reverse_weight'], - output_names=['score', 'r_score'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_decoder = onnx.load(decoder_outpath) - for (k, v) in args.items(): - meta = onnx_decoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_decoder) - onnx.helper.printable_graph(onnx_decoder.graph) - onnx.save(onnx_decoder, decoder_outpath) - print_input_output_info(onnx_decoder, "onnx_decoder") - model_fp32 = decoder_outpath - model_quant = os.path.join(args['output_dir'], 'decoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_decoder, done! see {}'.format( - decoder_outpath)) - - print("\tStage-3.3: check onnx_decoder and torch_decoder") - torch_score, torch_r_score = decoder( - hyps, hyps_lens, encoder_out, args['reverse_weight']) - ort_session = onnxruntime.InferenceSession(decoder_outpath) - input_names = [node.name for node in onnx_decoder.graph.input] - ort_inputs = { - 'hyps': to_numpy(hyps), - 'hyps_lens': to_numpy(hyps_lens), - 'encoder_out': to_numpy(encoder_out), - 'reverse_weight': np.array((args['reverse_weight'])), - } - for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - onnx_output = ort_session.run(None, ort_inputs) - - np.testing.assert_allclose(to_numpy(torch_score), onnx_output[0], - rtol=1e-03, atol=1e-05) - if args['is_bidirectional_decoder'] and args['reverse_weight'] > 0.0: - np.testing.assert_allclose(to_numpy(torch_r_score), onnx_output[1], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_decoder, pass!") - - -def main(): - torch.manual_seed(777) - args = get_args() - output_dir = args.output_dir - os.system("mkdir -p " + output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - arguments = {} - arguments['output_dir'] = output_dir - arguments['batch'] = 1 - arguments['chunk_size'] = args.chunk_size - arguments['left_chunks'] = args.num_decoding_left_chunks - arguments['reverse_weight'] = args.reverse_weight - arguments['output_size'] = configs['encoder_conf']['output_size'] - arguments['num_blocks'] = configs['encoder_conf']['num_blocks'] - arguments['cnn_module_kernel'] = configs['encoder_conf'].get('cnn_module_kernel', 1) - arguments['head'] = configs['encoder_conf']['attention_heads'] - arguments['feature_size'] = configs['input_dim'] - arguments['vocab_size'] = configs['output_dim'] - # NOTE(xcsong): if chunk_size == -1, hardcode to 67 - arguments['decoding_window'] = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 if args.chunk_size > 0 else 67 - arguments['encoder'] = configs['encoder'] - 
arguments['decoder'] = configs['decoder'] - arguments['subsampling_rate'] = model.subsampling_rate() - arguments['right_context'] = model.right_context() - arguments['sos_symbol'] = model.sos_symbol() - arguments['eos_symbol'] = model.eos_symbol() - arguments['is_bidirectional_decoder'] = 1 \ - if model.is_bidirectional_decoder() else 0 - - # NOTE(xcsong): Please note that -1/-1 means non-streaming model! It is - # not a [16/4 16/-1 16/0] all-in-one model and it should not be used in - # streaming mode (i.e., setting chunk_size=16 in `decoder_main`). If you - # want to use 16/-1 or any other streaming mode in `decoder_main`, - # please export onnx in the same config. - if arguments['left_chunks'] > 0: - assert arguments['chunk_size'] > 0 # -1/4 not supported - - export_encoder(model, arguments) - export_ctc(model, arguments) - export_decoder(model, arguments) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/export_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/export_onnx_gpu.py deleted file mode 100644 index 19f810c2804efdf74ff369f780fa3102e2e389fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/export_onnx_gpu.py +++ /dev/null @@ -1,1056 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import os -import sys - -import torch -import yaml -import logging - -import torch.nn.functional as F -from wenet.utils.checkpoint import load_checkpoint -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import BaseEncoder -from wenet.utils.init_model import init_model -from wenet.utils.mask import make_pad_mask - -try: - import onnxruntime -except ImportError: - print('Please install onnxruntime-gpu!') - sys.exit(1) - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class Encoder(torch.nn.Module): - def __init__(self, - encoder: BaseEncoder, - ctc: CTC, - beam_size: int = 10): - super().__init__() - self.encoder = encoder - self.ctc = ctc - self.beam_size = beam_size - - def forward(self, speech: torch.Tensor, - speech_lengths: torch.Tensor,): - """Encoder - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - Returns: - encoder_out: B x T x F - encoder_out_lens: B - ctc_log_probs: B x T x V - beam_log_probs: B x T x beam_size - beam_log_probs_idx: B x T x beam_size - """ - encoder_out, encoder_mask = self.encoder(speech, - speech_lengths, - -1, -1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_log_probs = self.ctc.log_softmax(encoder_out) - encoder_out_lens = encoder_out_lens.int() - beam_log_probs, beam_log_probs_idx = torch.topk( - ctc_log_probs, self.beam_size, dim=2) - return encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx - - -class StreamingEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size, transformer=False): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.transformer = transformer - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoder.encoders): - xs, _, new_att_cache, new_cnn_cache = layer( - xs, masks, pos_emb, - att_cache=att_cache[i], - cnn_cache=cnn_cache[i]) - # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (B, hidden-dim, cache_t2) - r_att_cache.append( - new_att_cache[:, :, next_cache_start:, :].unsqueeze(1)) - if not self.transformer: - r_cnn_cache.append(new_cnn_cache.unsqueeze(1)) - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - if not self.transformer: - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingSqueezeformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.reduce_idx = model.encoder.reduce_idx - self.recover_idx = model.encoder.recover_idx - if self.reduce_idx is None: - self.time_reduce = None - else: - if self.recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - 
recover_exp)) - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - elayers, cache_size = att_cache.size(0), att_cache.size(3) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = att_mask[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.encoder.preln(xs) - for i, layer in enumerate(self.encoder.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append( - (xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.encoder.time_reduction_layer( - xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - if self.encoder.pos_enc_layer_type == "rel_pos_repaired": - pos_emb = 
pos_emb[:, :xs.size(1) * 2 - 1, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.encoder.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(1) - cached_att = cached_att.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1)) - r_cnn_cache.append(cached_cnn) - - chunk_out = xs - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingEfficientConformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - - # Efficient Conformer - self.stride_layer_idx = model.encoder.stride_layer_idx - self.stride = model.encoder.stride - self.num_blocks = model.encoder.num_blocks - self.cnn_module_kernel = model.encoder.cnn_module_kernel - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - chunk_lens (torch.Tensor): - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * 
num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) # (b, ) - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - # Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2) - # Shape(cnn_cache): (elayers, b, outsize, cnn_kernel) - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = chunk_mask.to(torch.bool) - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoder.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - # We propose to double the chunk_size. 
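# NOTE(editor): a small worked example of calculate_downsampling_factor() above
# (not from the original file), assuming an illustrative Efficient Conformer
# config with stride layers at indices 3 and 7, each halving the time axis:

def downsampling_factor(i, stride_layer_idx=(3, 7), stride=(2, 2)):
    """Product of the strides of all stride layers placed before layer i."""
    factor = 1
    for idx, stride_idx in enumerate(stride_layer_idx):
        if i > stride_idx:
            factor *= stride[idx]
    return factor

print([downsampling_factor(i) for i in range(10)])
# [1, 1, 1, 1, 2, 2, 2, 2, 4, 4] -> later layers see 2x/4x fewer cache frames,
# which is why att_cache is sliced with ::factor and truncated below.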
- att_cache_trunc = xs.size(1) + \ - att_cache.size(3) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i][:, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [batch, 1, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID - - # use repeat_interleave to new_att_cache - # new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - new_att_cache = new_att_cache.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :].unsqueeze(1)) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - # shape of r_att_cache: (b, elayers, head, time2, outdim) - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - # shape of r_cnn_cache: (b, elayers, outdim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - chunk_out_lens += 1 - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class Decoder(torch.nn.Module): - def __init__(self, - decoder: TransformerDecoder, - ctc_weight: float = 0.5, - reverse_weight: float = 0.0, - beam_size: int = 10, - decoder_fastertransformer: bool = False): - super().__init__() - self.decoder = decoder - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - self.beam_size = beam_size - self.decoder_fastertransformer = decoder_fastertransformer - - def forward(self, - encoder_out: torch.Tensor, - encoder_lens: torch.Tensor, - hyps_pad_sos_eos: torch.Tensor, - hyps_lens_sos: torch.Tensor, - r_hyps_pad_sos_eos: torch.Tensor, - ctc_score: torch.Tensor): - """Encoder - Args: - encoder_out: B x T x F - encoder_lens: B - hyps_pad_sos_eos: B x beam x (T2+1), - hyps with sos & eos and padded by ignore id - hyps_lens_sos: B x beam, length for each hyp with sos - r_hyps_pad_sos_eos: B 
x beam x (T2+1), - reversed hyps with sos & eos and padded by ignore id - ctc_score: B x beam, ctc score for each hyp - Returns: - decoder_out: B x beam x T2 x V - r_decoder_out: B x beam x T2 x V - best_index: B - """ - B, T, F = encoder_out.shape - bz = self.beam_size - B2 = B * bz - encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F) - encoder_mask = ~make_pad_mask(encoder_lens, T).unsqueeze(1) - encoder_mask = encoder_mask.repeat(1, bz, 1).view(B2, 1, T) - T2 = hyps_pad_sos_eos.shape[2] - 1 - hyps_pad = hyps_pad_sos_eos.view(B2, T2 + 1) - hyps_lens = hyps_lens_sos.view(B2,) - hyps_pad_sos = hyps_pad[:, :-1].contiguous() - hyps_pad_eos = hyps_pad[:, 1:].contiguous() - - r_hyps_pad = r_hyps_pad_sos_eos.view(B2, T2 + 1) - r_hyps_pad_sos = r_hyps_pad[:, :-1].contiguous() - r_hyps_pad_eos = r_hyps_pad[:, 1:].contiguous() - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad_sos, hyps_lens, r_hyps_pad_sos, - self.reverse_weight) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - V = decoder_out.shape[-1] - decoder_out = decoder_out.view(B2, T2, V) - mask = ~make_pad_mask(hyps_lens, T2) # B2 x T2 - # mask index, remove ignore id - index = torch.unsqueeze(hyps_pad_eos * mask, 2) - score = decoder_out.gather(2, index).squeeze(2) # B2 X T2 - # mask padded part - score = score * mask - decoder_out = decoder_out.view(B, bz, T2, V) - if self.reverse_weight > 0: - r_decoder_out = torch.nn.functional.log_softmax( - r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.view(B2, T2, V) - index = torch.unsqueeze(r_hyps_pad_eos * mask, 2) - r_score = r_decoder_out.gather(2, index).squeeze(2) - r_score = r_score * mask - score = score * (1 - self.reverse_weight) + \ - self.reverse_weight * r_score - r_decoder_out = r_decoder_out.view(B, bz, T2, V) - score = torch.sum(score, axis=1) # B2 - score = torch.reshape(score, (B, bz)) + self.ctc_weight * ctc_score - best_index = torch.argmax(score, dim=1) - if self.decoder_fastertransformer: - return decoder_out, best_index - else: - return best_index - - -def to_numpy(tensors): - out = [] - if type(tensors) == torch.tensor: - tensors = [tensors] - for tensor in tensors: - if tensor.requires_grad: - tensor = tensor.detach().cpu().numpy() - else: - tensor = tensor.cpu().numpy() - out.append(tensor) - return out - - -def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True): - for a, b in zip(xlist, blist): - try: - torch.testing.assert_allclose(a, b, rtol=rtol, atol=atol) - except AssertionError as error: - if tolerate_small_mismatch: - print(error) - else: - raise - - -def export_offline_encoder(model, configs, args, logger, encoder_onnx_path): - bz = 32 - seq_len = 100 - beam_size = args.beam_size - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint( - low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - dynamic_axes={ - 'speech': {0: 'B', 1: 'T'}, - 'speech_lengths': {0: 'B'}, - 'encoder_out': {0: 'B', 1: 'T_OUT'}, - 'encoder_out_lens': {0: 'B'}, - 'ctc_log_probs': {0: 'B', 1: 'T_OUT'}, - 'beam_log_probs': {0: 'B', 1: 
'T_OUT'}, - 'beam_log_probs_idx': {0: 'B', 1: 'T_OUT'}, - }, - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - -def export_offline_encoder_static(model, configs, args, logger, encoder_onnx_path): - bz = args.batch_size - seq_len = args.seq_len - beam_size = args.beam_size - - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint(low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - import os - file_name, file_ext = os.path.splitext(encoder_onnx_path) - encoder_onnx_path = file_name + "_bs" + str(bz) + "_seq" + str(seq_len) + "_static.onnx" - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CPUExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - - -def export_online_encoder(model, configs, args, logger, encoder_onnx_path): - decoding_chunk_size = args.decoding_chunk_size - subsampling = model.encoder.embed.subsampling_rate - context = model.encoder.embed.right_context + 1 - decoding_window = (decoding_chunk_size - 1) * subsampling + context - batch_size = 32 - audio_len = decoding_window - feature_size = configs["input_dim"] - output_size = configs["encoder_conf"]["output_size"] - num_layers = configs["encoder_conf"]["num_blocks"] - # in transformer the cnn module will not be available - transformer = False - cnn_module_kernel = configs["encoder_conf"].get("cnn_module_kernel", 1) - 1 - if not cnn_module_kernel: - transformer = True - num_decoding_left_chunks = args.num_decoding_left_chunks - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if configs['encoder'] == 'squeezeformer': - encoder = StreamingSqueezeformerEncoder( - model, required_cache_size, args.beam_size) - elif configs['encoder'] == 'efficientConformer': - encoder = StreamingEfficientConformerEncoder( - model, required_cache_size, args.beam_size) - else: - encoder = StreamingEncoder( - model, required_cache_size, args.beam_size, transformer) - encoder.eval() - - # begin to export encoder - chunk_xs = 
torch.randn(batch_size, audio_len, - feature_size, dtype=torch.float32) - chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len - - offset = torch.arange(0, batch_size).unsqueeze(1) - # (elayers, b, head, cache_t1, d_k * 2) - head = configs["encoder_conf"]["attention_heads"] - d_k = configs["encoder_conf"]["output_size"] // head - att_cache = torch.randn(batch_size, num_layers, head, - required_cache_size, d_k * 2, - dtype=torch.float32) - cnn_cache = torch.randn(batch_size, num_layers, output_size, - cnn_module_kernel, dtype=torch.float32) - - cache_mask = torch.ones( - batch_size, 1, required_cache_size, dtype=torch.float32) - input_names = ['chunk_xs', 'chunk_lens', 'offset', - 'att_cache', 'cnn_cache', 'cache_mask'] - output_names = ['log_probs', 'log_probs_idx', 'chunk_out', - 'chunk_out_lens', 'r_offset', 'r_att_cache', - 'r_cnn_cache', 'r_cache_mask'] - input_tensors = (chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - output_names.pop(6) - - all_names = input_names + output_names - dynamic_axes = {} - for name in all_names: - # only the first dimension is dynamic - # all other dimension is fixed - dynamic_axes[name] = {0: 'B'} - - torch.onnx.export(encoder, - input_tensors, - encoder_onnx_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - verbose=False) - - with torch.no_grad(): - torch_outs = encoder(chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - torch_outs = list(torch_outs).pop(6) - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=["CUDAExecutionProvider"]) - ort_inputs = {} - - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - if transformer: - del ort_inputs['cnn_cache'] - ort_outs = ort_session.run(None, ort_inputs) - test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx streaming encoder succeed!") - onnx_config = { - "subsampling_rate": subsampling, - "context": context, - "decoding_chunk_size": decoding_chunk_size, - "num_decoding_left_chunks": num_decoding_left_chunks, - "beam_size": args.beam_size, - "fp16": args.fp16, - "feat_size": feature_size, - "decoding_window": decoding_window, - "cnn_module_kernel_cache": cnn_module_kernel - } - return onnx_config - - -def export_rescoring_decoder(model, configs, args, - logger, decoder_onnx_path, decoder_fastertransformer): - bz, seq_len = 32, 100 - beam_size = args.beam_size - decoder = Decoder(model.decoder, - model.ctc_weight, - model.reverse_weight, - beam_size, - decoder_fastertransformer) - decoder.eval() - - hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - hyps_lens_sos = torch.randint(low=3, high=seq_len, size=(bz, beam_size), - dtype=torch.int32) - r_hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - - output_size = configs["encoder_conf"]["output_size"] - encoder_out = torch.randn(bz, seq_len, output_size, dtype=torch.float32) - encoder_out_lens = torch.randint( - low=3, high=seq_len, size=(bz,), dtype=torch.int32) - ctc_score = torch.randn(bz, beam_size, dtype=torch.float32) - - input_names = ['encoder_out', 'encoder_out_lens', - 'hyps_pad_sos_eos', 'hyps_lens_sos', - 'r_hyps_pad_sos_eos', 'ctc_score'] - output_names = ['best_index'] - if decoder_fastertransformer: - output_names.insert(0, 'decoder_out') - - 
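# NOTE(editor): the Decoder wrapper exported here boils attention rescoring down
# to a per-hypothesis score combination followed by an argmax over the beam. A
# tiny numeric sketch of that final step (all scores are made-up
# log-probabilities; ctc_weight and reverse_weight mirror the wrapper's
# arguments, not values from the shipped config):

import torch

ctc_weight, reverse_weight = 0.5, 0.3
att_score   = torch.tensor([[-4.2, -3.9, -5.1]])  # (B=1, beam=3) left-to-right decoder
r_att_score = torch.tensor([[-4.0, -4.4, -5.0]])  # right-to-left decoder
ctc_score   = torch.tensor([[-6.0, -5.2, -7.3]])  # from CTC prefix beam search

score = att_score * (1 - reverse_weight) + reverse_weight * r_att_score
score = score + ctc_weight * ctc_score
best_index = torch.argmax(score, dim=1)
print(score)       # ~ [[-7.14, -6.65, -8.72]]
print(best_index)  # tensor([1]) -> the second hypothesis wins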
torch.onnx.export(decoder, - (encoder_out, encoder_out_lens, - hyps_pad_sos_eos, hyps_lens_sos, - r_hyps_pad_sos_eos, ctc_score), - decoder_onnx_path, - export_params=True, - opset_version=13, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes={'encoder_out': {0: 'B', 1: 'T'}, - 'encoder_out_lens': {0: 'B'}, - 'hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'hyps_lens_sos': {0: 'B'}, - 'r_hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'ctc_score': {0: 'B'}, - 'best_index': {0: 'B'}, - }, - verbose=False - ) - with torch.no_grad(): - o0 = decoder(encoder_out, - encoder_out_lens, - hyps_pad_sos_eos, - hyps_lens_sos, - r_hyps_pad_sos_eos, - ctc_score) - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(decoder_onnx_path, - providers=providers) - - input_tensors = [encoder_out, encoder_out_lens, hyps_pad_sos_eos, - hyps_lens_sos, r_hyps_pad_sos_eos, ctc_score] - ort_inputs = {} - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - - # if model.reverse weight == 0, - # the r_hyps_pad will be removed - # from the onnx decoder since it doen't play any role - if model.reverse_weight == 0: - del ort_inputs['r_hyps_pad_sos_eos'] - ort_outs = ort_session.run(None, ort_inputs) - - # check decoder output - if decoder_fastertransformer: - test(to_numpy(o0), ort_outs, rtol=1e-03, atol=1e-05) - else: - test(to_numpy([o0]), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx decoder succeed!") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='export x86_gpu model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--cmvn_file', required=False, default='', type=str, - help='global_cmvn file, default path is in config file') - parser.add_argument('--reverse_weight', default=-1.0, type=float, - required=False, - help='reverse weight for bitransformer,' + - 'default value is in config file') - parser.add_argument('--ctc_weight', default=-1.0, type=float, - required=False, - help='ctc weight, default value is in config file') - parser.add_argument('--batch_size', type=int, default=24, help='encoder batch size') - parser.add_argument('--seq_len', default=512, type=int, required=False, - help="Encoder seq_len") - parser.add_argument('--beam_size', default=10, type=int, required=False, - help="beam size would be ctc output size") - parser.add_argument('--output_onnx_dir', - default="onnx_model", - help='output onnx encoder and decoder directory') - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - # arguments for streaming encoder - parser.add_argument('--streaming', - action='store_true', - help="whether to export streaming encoder, default false") - parser.add_argument('--decoding_chunk_size', - default=16, - type=int, - required=False, - help='the decoding chunk size, <=0 is not supported') - parser.add_argument('--num_decoding_left_chunks', - default=5, - type=int, - required=False, - help="number of left chunks, <= 0 is not supported") - parser.add_argument('--decoder_fastertransformer', - action='store_true', - help='return decoder_out and best_index for ft') - args = parser.parse_args() - - torch.manual_seed(0) - torch.set_printoptions(precision=10) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if 
args.cmvn_file and os.path.exists(args.cmvn_file): - configs['cmvn_file'] = args.cmvn_file - if args.reverse_weight != -1.0 and 'reverse_weight' in configs['model_conf']: - configs['model_conf']['reverse_weight'] = args.reverse_weight - print("Update reverse weight to", args.reverse_weight) - if args.ctc_weight != -1: - print("Update ctc weight to ", args.ctc_weight) - configs['model_conf']['ctc_weight'] = args.ctc_weight - configs["encoder_conf"]["use_dynamic_chunk"] = False - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - - if not os.path.exists(args.output_onnx_dir): - os.mkdir(args.output_onnx_dir) - encoder_onnx_path = os.path.join(args.output_onnx_dir, 'encoder.onnx') - export_enc_func = None - if args.streaming: - assert args.decoding_chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - export_enc_func = export_online_encoder - else: - export_enc_func = export_offline_encoder_static - - onnx_config = export_enc_func( - model, configs, args, logger, encoder_onnx_path) - - decoder_onnx_path = os.path.join(args.output_onnx_dir, 'decoder.onnx') - export_rescoring_decoder(model, configs, args, logger, - decoder_onnx_path, args.decoder_fastertransformer) - - if args.fp16: - try: - import onnxmltools - from onnxmltools.utils.float16_converter import convert_float_to_float16 - except ImportError: - print('Please install onnxmltools!') - sys.exit(1) - encoder_onnx_model = onnxmltools.utils.load_model(encoder_onnx_path) - encoder_onnx_model = convert_float_to_float16(encoder_onnx_model) - encoder_onnx_path = os.path.join( - args.output_onnx_dir, 'encoder_fp16.onnx') - onnxmltools.utils.save_model(encoder_onnx_model, encoder_onnx_path) - decoder_onnx_model = onnxmltools.utils.load_model(decoder_onnx_path) - decoder_onnx_model = convert_float_to_float16(decoder_onnx_model) - decoder_onnx_path = os.path.join( - args.output_onnx_dir, 'decoder_fp16.onnx') - onnxmltools.utils.save_model(decoder_onnx_model, decoder_onnx_path) - # dump configurations - - config_dir = os.path.join(args.output_onnx_dir, "config.yaml") - with open(config_dir, "w") as out: - yaml.dump(onnx_config, out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/recognize.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/recognize.py deleted file mode 100644 index 03b5dfd42cc098efacd20e08756a5300f6477cc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/recognize.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
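# NOTE(editor): the GPU export main() above optionally rewrites the exported
# graphs as FP16 with onnxmltools before saving encoder_fp16.onnx and
# decoder_fp16.onnx. A minimal sketch of that step (assumes an already exported
# encoder.onnx in the working directory; the file names are placeholders):

import onnxmltools
from onnxmltools.utils.float16_converter import convert_float_to_float16

fp32_model = onnxmltools.utils.load_model("encoder.onnx")
fp16_model = convert_float_to_float16(fp32_model)  # cast the graph's fp32 tensors to fp16
onnxmltools.utils.save_model(fp16_model, "encoder_fp16.onnx")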
- -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--beam_size', - type=int, - default=10, - help='beam size for search') - parser.add_argument('--penalty', - type=float, - default=0.0, - help='length penalty') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=16, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'attention', 'ctc_greedy_search', - 'ctc_prefix_beam_search', 'attention_rescoring', - 'rnnt_greedy_search', 'rnnt_beam_search', - 'rnnt_beam_attn_rescoring', 'ctc_beam_td_attn_rescoring', - 'hlg_onebest', 'hlg_rescore' - ], - default='attention', - help='decoding mode') - - parser.add_argument('--search_ctc_weight', - type=float, - default=1.0, - help='ctc weight for nbest generation') - parser.add_argument('--search_transducer_weight', - type=float, - default=0.0, - help='transducer weight for nbest generation') - parser.add_argument('--ctc_weight', - type=float, - default=0.0, - help='ctc weight for rescoring weight in \ - attention rescoring decode mode \ - ctc weight for rescoring weight in \ - transducer attention rescore decode mode') - - parser.add_argument('--transducer_weight', - type=float, - default=0.0, - help='transducer weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--attn_weight', - type=float, - default=0.0, - help='attention weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--decoding_chunk_size', - type=int, - default=-1, - help='''decoding chunk size, - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here''') - parser.add_argument('--num_decoding_left_chunks', - type=int, - default=-1, - help='number of left chunks for decoding') - parser.add_argument('--simulate_streaming', - action='store_true', - help='simulate streaming inference') - parser.add_argument('--reverse_weight', - type=float, - default=0.0, - help='''right to left weight for attention rescoring - decode mode''') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--connect_symbol', - default='', - type=str, - help='used to connect the output characters') - - parser.add_argument('--word', - default='', - type=str, - help='word file, only used for hlg decode') - parser.add_argument('--hlg', - default='', - type=str, - help='hlg file, only used for hlg decode') - parser.add_argument('--lm_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--r_decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' - ] and args.batch_size > 1: - logging.fatal( - 'decoding mode {} must be running with batch_size == 1'.format( - args.mode)) - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_sub'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - if 'fbank_conf' in test_conf: - test_conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in test_conf: - test_conf['mfcc_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - # Load dict - char_dict = {v: k for k, v in symbol_table.items()} - eos = len(char_dict) - 1 - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w') as fout: - for batch_idx, 
batch in enumerate(test_data_loader): - keys, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - if args.mode == 'attention': - hyps, _ = model.recognize( - feats, - feats_lengths, - beam_size=args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp.tolist() for hyp in hyps] - elif args.mode == 'ctc_greedy_search': - hyps, _ = model.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_greedy_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_beam_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.beam_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.search_ctc_weight, - transducer_weight=args.search_transducer_weight) - elif args.mode == 'rnnt_beam_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight) - elif args.mode == 'ctc_beam_td_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight, - beam_search_type='ctc') - # ctc_prefix_beam_search and attention_rescoring only return one - # result in List[int], change it to List[List[int]] for compatible - # with other batch decoding mode - elif args.mode == 'ctc_prefix_beam_search': - assert (feats.size(0) == 1) - hyp, _ = model.ctc_prefix_beam_search( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp] - elif args.mode == 'attention_rescoring': - assert (feats.size(0) == 1) - hyp, _ = model.attention_rescoring( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - 
ctc_weight=args.ctc_weight, - simulate_streaming=args.simulate_streaming, - reverse_weight=args.reverse_weight) - hyps = [hyp] - elif args.mode == 'hlg_onebest': - hyps = model.hlg_onebest( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - elif args.mode == 'hlg_rescore': - hyps = model.hlg_rescore( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - lm_scale=args.lm_scale, - decoder_scale=args.decoder_scale, - r_decoder_scale=args.r_decoder_scale, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - for i, key in enumerate(keys): - content = [] - for w in hyps[i]: - if w == eos: - break - content.append(char_dict[w]) - logging.info('{} {}'.format(key, args.connect_symbol.join(content))) - fout.write('{} {}\n'.format(key, args.connect_symbol.join(content))) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/recognize_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/recognize_onnx_gpu.py deleted file mode 100644 index 42f403bf55ac0bc51d9c754574d3479345948122..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/recognize_onnx_gpu.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is for testing exported onnx encoder and decoder from -export_onnx_gpu.py. The exported onnx models only support batch offline ASR inference. -It requires a python wrapped c++ ctc decoder. 
-Please install it by following: -https://github.com/Slyne/ctc_decoder.git -""" -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.common import IGNORE_ID -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.config import override_config - -import onnxruntime as rt -import multiprocessing -import numpy as np - -try: - from swig_decoders import map_batch, \ - ctc_beam_search_decoder_batch, \ - TrieVector, PathTrie -except ImportError: - print('Please install ctc decoders first by refering to\n' + - 'https://github.com/Slyne/ctc_decoder.git') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--encoder_onnx', required=True, help='encoder onnx file') - parser.add_argument('--decoder_onnx', required=True, help='decoder onnx file') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=32, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'ctc_greedy_search', 'ctc_prefix_beam_search', - 'attention_rescoring'], - default='attention_rescoring', - help='decoding mode') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - reverse_weight = configs["model_conf"].get("reverse_weight", 0.0) - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - test_conf['fbank_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - 
- # Init asr model from configs - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - if use_cuda: - EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - else: - EP_list = ['CPUExecutionProvider'] - - encoder_ort_session = rt.InferenceSession(args.encoder_onnx, providers=EP_list) - decoder_ort_session = None - if args.mode == "attention_rescoring": - decoder_ort_session = rt.InferenceSession(args.decoder_onnx, providers=EP_list) - - # Load dict - vocabulary = [] - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - vocabulary.append(arr[0]) - eos = sos = len(char_dict) - 1 - with torch.no_grad(), open(args.result_file, 'w') as fout: - for _, batch in enumerate(test_data_loader): - keys, feats, _, feats_lengths, _ = batch - feats, feats_lengths = feats.numpy(), feats_lengths.numpy() - if args.fp16: - feats = feats.astype(np.float16) - ort_inputs = { - encoder_ort_session.get_inputs()[0].name: feats, - encoder_ort_session.get_inputs()[1].name: feats_lengths} - ort_outs = encoder_ort_session.run(None, ort_inputs) - encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx = ort_outs - beam_size = beam_log_probs.shape[-1] - batch_size = beam_log_probs.shape[0] - num_processes = min(multiprocessing.cpu_count(), batch_size) - if args.mode == 'ctc_greedy_search': - if beam_size != 1: - log_probs_idx = beam_log_probs_idx[:, :, 0] - batch_sents = [] - for idx, seq in enumerate(log_probs_idx): - batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) - hyps = map_batch(batch_sents, vocabulary, num_processes, - True, 0) - elif args.mode in ('ctc_prefix_beam_search', "attention_rescoring"): - batch_log_probs_seq_list = beam_log_probs.tolist() - batch_log_probs_idx_list = beam_log_probs_idx.tolist() - batch_len_list = encoder_out_lens.tolist() - batch_log_probs_seq = [] - batch_log_probs_ids = [] - batch_start = [] # only effective in streaming deployment - batch_root = TrieVector() - root_dict = {} - for i in range(len(batch_len_list)): - num_sent = batch_len_list[i] - batch_log_probs_seq.append( - batch_log_probs_seq_list[i][0:num_sent]) - batch_log_probs_ids.append( - batch_log_probs_idx_list[i][0:num_sent]) - root_dict[i] = PathTrie() - batch_root.append(root_dict[i]) - batch_start.append(True) - score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq, - batch_log_probs_ids, - batch_root, - batch_start, - beam_size, - num_processes, - 0, -2, 0.99999) - if args.mode == 'ctc_prefix_beam_search': - hyps = [] - for cand_hyps in score_hyps: - hyps.append(cand_hyps[0][1]) - hyps = map_batch(hyps, vocabulary, num_processes, False, 0) - if args.mode == 'attention_rescoring': - ctc_score, all_hyps = [], [] - max_len = 0 - for hyps in score_hyps: - cur_len = len(hyps) - if len(hyps) < beam_size: - hyps += (beam_size - cur_len) * [(-float("INF"), (0,))] - cur_ctc_score = [] - for hyp in hyps: - cur_ctc_score.append(hyp[0]) - all_hyps.append(list(hyp[1])) - if len(hyp[1]) > max_len: - max_len = len(hyp[1]) - ctc_score.append(cur_ctc_score) - if args.fp16: - ctc_score = np.array(ctc_score, dtype=np.float16) - else: - ctc_score = np.array(ctc_score, dtype=np.float32) - hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - r_hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32) - k = 0 - for i in 
range(batch_size): - for j in range(beam_size): - cand = all_hyps[k] - l = len(cand) + 2 - hyps_pad_sos_eos[i][j][0:l] = [sos] + cand + [eos] - r_hyps_pad_sos_eos[i][j][0:l] = [sos] + cand[::-1] + [eos] - hyps_lens_sos[i][j] = len(cand) + 1 - k += 1 - decoder_ort_inputs = { - decoder_ort_session.get_inputs()[0].name: encoder_out, - decoder_ort_session.get_inputs()[1].name: encoder_out_lens, - decoder_ort_session.get_inputs()[2].name: hyps_pad_sos_eos, - decoder_ort_session.get_inputs()[3].name: hyps_lens_sos, - decoder_ort_session.get_inputs()[-1].name: ctc_score} - if reverse_weight > 0: - r_hyps_pad_sos_eos_name = decoder_ort_session.get_inputs()[4].name - decoder_ort_inputs[r_hyps_pad_sos_eos_name] = r_hyps_pad_sos_eos - best_index = decoder_ort_session.run(None, decoder_ort_inputs)[0] - best_sents = [] - k = 0 - for idx in best_index: - cur_best_sent = all_hyps[k: k + beam_size][idx] - best_sents.append(cur_best_sent) - k += beam_size - hyps = map_batch(best_sents, vocabulary, num_processes) - - for i, key in enumerate(keys): - content = hyps[i] - logging.info('{} {}'.format(key, content)) - fout.write('{} {}\n'.format(key, content)) - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/train.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/train.py deleted file mode 100644 index 70799b60790b31d73911770891f519f5473e2f4b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/bin/train.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os - -import torch -import torch.distributed as dist -import torch.optim as optim -import yaml -from tensorboardX import SummaryWriter -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import (load_checkpoint, save_checkpoint, - load_trained_modules) -from wenet.utils.executor import Executor -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.scheduler import WarmupLR, NoamHoldAnnealing -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--train_data', required=True, help='train data file') - parser.add_argument('--cv_data', required=True, help='cv data file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--model_dir', required=True, help='save model dir') - parser.add_argument('--checkpoint', help='checkpoint model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='tensorboard log dir') - parser.add_argument('--ddp.rank', - dest='rank', - default=0, - type=int, - help='global rank for distributed training') - parser.add_argument('--ddp.world_size', - dest='world_size', - default=-1, - type=int, - help='''number of total processes/gpus for - distributed training''') - parser.add_argument('--ddp.dist_backend', - dest='dist_backend', - default='nccl', - choices=['nccl', 'gloo'], - help='distributed backend') - parser.add_argument('--ddp.init_method', - dest='init_method', - default=None, - help='ddp init method') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for reading') - parser.add_argument('--pin_memory', - action='store_true', - default=False, - help='Use pinned memory buffers used for reading') - parser.add_argument('--use_amp', - action='store_true', - default=False, - help='Use automatic mixed precision training') - parser.add_argument('--fp16_grad_sync', - action='store_true', - default=False, - help='Use fp16 gradient sync for ddp') - parser.add_argument('--cmvn', default=None, help='global cmvn file') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. 
One symbol per line.") - parser.add_argument('--prefetch', - default=100, - type=int, - help='prefetch number') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument("--enc_init", - default=None, - type=str, - help="Pre-trained model to initialize encoder") - parser.add_argument("--enc_init_mods", - default="encoder.", - type=lambda s: [str(mod) for mod in s.split(",") if s != ""], - help="List of encoder modules \ - to initialize ,separated by a comma") - - - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Set random seed - torch.manual_seed(777) - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - distributed = args.world_size > 1 - if distributed: - logging.info('training on multiple gpus, this gpu {}'.format(args.gpu)) - dist.init_process_group(args.dist_backend, - init_method=args.init_method, - world_size=args.world_size, - rank=args.rank) - - symbol_table = read_symbol_table(args.symbol_table) - - train_conf = configs['dataset_conf'] - cv_conf = copy.deepcopy(train_conf) - cv_conf['speed_perturb'] = False - cv_conf['spec_aug'] = False - cv_conf['spec_sub'] = False - cv_conf['spec_trim'] = False - cv_conf['shuffle'] = False - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - train_dataset = Dataset(args.data_type, args.train_data, symbol_table, - train_conf, args.bpe_model, non_lang_syms, True) - cv_dataset = Dataset(args.data_type, - args.cv_data, - symbol_table, - cv_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - train_data_loader = DataLoader(train_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - cv_data_loader = DataLoader(cv_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - - if 'fbank_conf' in configs['dataset_conf']: - input_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - else: - input_dim = configs['dataset_conf']['mfcc_conf']['num_mel_bins'] - vocab_size = len(symbol_table) - - # Save configs to model_dir/train.yaml for inference and export - configs['input_dim'] = input_dim - configs['output_dim'] = vocab_size - configs['cmvn_file'] = args.cmvn - configs['is_json_cmvn'] = True - if args.rank == 0: - saved_config_path = os.path.join(args.model_dir, 'train.yaml') - with open(saved_config_path, 'w') as fout: - data = yaml.dump(configs) - fout.write(data) - - # Init asr model from configs - model = init_model(configs) - print(model) - num_params = sum(p.numel() for p in model.parameters()) - print('the number of model params: {:,d}'.format(num_params)) - - # !!!IMPORTANT!!! 
- # Try to export the model by script, if fails, we should refine - # the code to satisfy the script export requirements - if args.rank == 0: - script_model = torch.jit.script(model) - script_model.save(os.path.join(args.model_dir, 'init.zip')) - executor = Executor() - # If specify checkpoint, load some info from checkpoint - if args.checkpoint is not None: - infos = load_checkpoint(model, args.checkpoint) - elif args.enc_init is not None: - logging.info('load pretrained encoders: {}'.format(args.enc_init)) - infos = load_trained_modules(model, args) - else: - infos = {} - start_epoch = infos.get('epoch', -1) + 1 - cv_loss = infos.get('cv_loss', 0.0) - step = infos.get('step', -1) - - num_epochs = configs.get('max_epoch', 100) - model_dir = args.model_dir - writer = None - if args.rank == 0: - os.makedirs(model_dir, exist_ok=True) - exp_id = os.path.basename(model_dir) - writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id)) - - if distributed: - assert (torch.cuda.is_available()) - # cuda model is required for nn.parallel.DistributedDataParallel - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, find_unused_parameters=True) - device = torch.device("cuda") - if args.fp16_grad_sync: - from torch.distributed.algorithms.ddp_comm_hooks import ( - default as comm_hooks, - ) - model.register_comm_hook( - state=None, hook=comm_hooks.fp16_compress_hook - ) - else: - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - if configs['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['optim_conf']) - elif configs['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['optim']) - if configs['scheduler'] == 'warmuplr': - scheduler = WarmupLR(optimizer, **configs['scheduler_conf']) - elif configs['scheduler'] == 'NoamHoldAnnealing': - scheduler = NoamHoldAnnealing(optimizer, **configs['scheduler_conf']) - else: - raise ValueError("unknown scheduler: " + configs['scheduler']) - - final_epoch = None - configs['rank'] = args.rank - configs['is_distributed'] = distributed - configs['use_amp'] = args.use_amp - if start_epoch == 0 and args.rank == 0: - save_model_path = os.path.join(model_dir, 'init.pt') - save_checkpoint(model, save_model_path) - - # Start training loop - executor.step = step - scheduler.set_step(step) - # used for pytorch amp mixed precision training - scaler = None - if args.use_amp: - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(start_epoch, num_epochs): - train_dataset.set_epoch(epoch) - configs['epoch'] = epoch - lr = optimizer.param_groups[0]['lr'] - logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr)) - executor.train(model, optimizer, scheduler, train_data_loader, device, - writer, configs, scaler) - total_loss, num_seen_utts = executor.cv(model, cv_data_loader, device, - configs) - cv_loss = total_loss / num_seen_utts - - logging.info('Epoch {} CV info cv_loss {}'.format(epoch, cv_loss)) - if args.rank == 0: - save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch)) - save_checkpoint( - model, save_model_path, { - 'epoch': epoch, - 'lr': lr, - 'cv_loss': cv_loss, - 'step': executor.step - }) - writer.add_scalar('epoch/cv_loss', cv_loss, epoch) - writer.add_scalar('epoch/lr', lr, epoch) - final_epoch = epoch - - if final_epoch is not None and args.rank == 0: - final_model_path = os.path.join(model_dir, 'final.pt') 
- os.remove(final_model_path) if os.path.exists(final_model_path) else None - os.symlink('{}.pt'.format(final_epoch), final_model_path) - writer.close() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/dataset/dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/dataset/dataset.py deleted file mode 100644 index 6d799b5b5aea2d34546484b3fed5d45e2d5b6aa6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/dataset/dataset.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import torch -import torch.distributed as dist -from torch.utils.data import IterableDataset - -import wenet.dataset.processor as processor -from wenet.utils.file_utils import read_lists - - -class Processor(IterableDataset): - def __init__(self, source, f, *args, **kw): - assert callable(f) - self.source = source - self.f = f - self.args = args - self.kw = kw - - def set_epoch(self, epoch): - self.source.set_epoch(epoch) - - def __iter__(self): - """ Return an iterator over the source dataset processed by the - given processor. 
- """ - assert self.source is not None - assert callable(self.f) - return self.f(iter(self.source), *self.args, **self.kw) - - def apply(self, f): - assert callable(f) - return Processor(self, f, *self.args, **self.kw) - - -class DistributedSampler: - def __init__(self, shuffle=True, partition=True): - self.epoch = -1 - self.update() - self.shuffle = shuffle - self.partition = partition - - def update(self): - assert dist.is_available() - if dist.is_initialized(): - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() - else: - self.rank = 0 - self.world_size = 1 - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: - self.worker_id = 0 - self.num_workers = 1 - else: - self.worker_id = worker_info.id - self.num_workers = worker_info.num_workers - return dict(rank=self.rank, - world_size=self.world_size, - worker_id=self.worker_id, - num_workers=self.num_workers) - - def set_epoch(self, epoch): - self.epoch = epoch - - def sample(self, data): - """ Sample data according to rank/world_size/num_workers - - Args: - data(List): input data list - - Returns: - List: data list after sample - """ - data = list(range(len(data))) - # TODO(Binbin Zhang): fix this - # We can not handle uneven data for CV on DDP, so we don't - # sample data by rank, that means every GPU gets the same - # and all the CV data - if self.partition: - if self.shuffle: - random.Random(self.epoch).shuffle(data) - data = data[self.rank::self.world_size] - data = data[self.worker_id::self.num_workers] - return data - - -class DataList(IterableDataset): - def __init__(self, lists, shuffle=True, partition=True): - self.lists = lists - self.sampler = DistributedSampler(shuffle, partition) - - def set_epoch(self, epoch): - self.sampler.set_epoch(epoch) - - def __iter__(self): - sampler_info = self.sampler.update() - indexes = self.sampler.sample(self.lists) - for index in indexes: - # yield dict(src=src) - data = dict(src=self.lists[index]) - data.update(sampler_info) - yield data - - -def Dataset(data_type, - data_list_file, - symbol_table, - conf, - bpe_model=None, - non_lang_syms=None, - partition=True): - """ Construct dataset from arguments - - We have two shuffle stage in the Dataset. The first is global - shuffle at shards tar/raw file level. The second is global shuffle - at training samples level. 
- - Args: - data_type(str): raw/shard - bpe_model(str): model for english bpe part - partition(bool): whether to do data partition in terms of rank - """ - assert data_type in ['raw', 'shard'] - lists = read_lists(data_list_file) - shuffle = conf.get('shuffle', True) - dataset = DataList(lists, shuffle=shuffle, partition=partition) - if data_type == 'shard': - dataset = Processor(dataset, processor.url_opener) - dataset = Processor(dataset, processor.tar_file_and_group) - else: - dataset = Processor(dataset, processor.parse_raw) - - dataset = Processor(dataset, processor.tokenize, symbol_table, bpe_model, - non_lang_syms, conf.get('split_with_space', False)) - filter_conf = conf.get('filter_conf', {}) - dataset = Processor(dataset, processor.filter, **filter_conf) - - resample_conf = conf.get('resample_conf', {}) - dataset = Processor(dataset, processor.resample, **resample_conf) - - speed_perturb = conf.get('speed_perturb', False) - if speed_perturb: - dataset = Processor(dataset, processor.speed_perturb) - - feats_type = conf.get('feats_type', 'fbank') - assert feats_type in ['fbank', 'mfcc'] - if feats_type == 'fbank': - fbank_conf = conf.get('fbank_conf', {}) - dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) - elif feats_type == 'mfcc': - mfcc_conf = conf.get('mfcc_conf', {}) - dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf) - - spec_aug = conf.get('spec_aug', True) - spec_sub = conf.get('spec_sub', False) - spec_trim = conf.get('spec_trim', False) - if spec_aug: - spec_aug_conf = conf.get('spec_aug_conf', {}) - dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) - if spec_sub: - spec_sub_conf = conf.get('spec_sub_conf', {}) - dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf) - if spec_trim: - spec_trim_conf = conf.get('spec_trim_conf', {}) - dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf) - - if shuffle: - shuffle_conf = conf.get('shuffle_conf', {}) - dataset = Processor(dataset, processor.shuffle, **shuffle_conf) - - sort = conf.get('sort', True) - if sort: - sort_conf = conf.get('sort_conf', {}) - dataset = Processor(dataset, processor.sort, **sort_conf) - - batch_conf = conf.get('batch_conf', {}) - dataset = Processor(dataset, processor.batch, **batch_conf) - dataset = Processor(dataset, processor.padding) - return dataset diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/dataset/kaldi_io.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/dataset/kaldi_io.py deleted file mode 100644 index c9bef293c93d882147bb5b738e1fc49a7a19a484..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/dataset/kaldi_io.py +++ /dev/null @@ -1,666 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! 
To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. - """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. 
- """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int_scp(file_or_fd): - """ generator(key,vec) = read_vec_int_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_int_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:vec for key,mat in kaldi_io.read_vec_int_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_int(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! 
- if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Mapping for percentiles in col-headers, - def uint16_to_float(value, min, range): - return np.float32(min + range * 1.52590218966964e-05 * value) - - # Mapping for matrix elements, - def uint8_to_float_v2(vec, p0, p25, p75, p100): - # Split the vector by masks, - mask_0_64 = (vec <= 64); - mask_193_255 = (vec > 192); - mask_65_192 = (~(mask_0_64 | mask_193_255)); - # Sanity check (useful but slow...), - # assert(len(vec) == np.sum(np.hstack([mask_0_64,mask_65_192,mask_193_255]))) - # assert(len(vec) == np.sum(np.any([mask_0_64,mask_65_192,mask_193_255], axis=0))) - # Build the float vector, - ans = np.empty(len(vec), dtype='float32') - ans[mask_0_64] = p0 + (p25 - p0) / 64. * vec[mask_0_64] - ans[mask_65_192] = p25 + (p75 - p25) / 128. * (vec[mask_65_192] - 64) - ans[mask_193_255] = p75 + (p100 - p75) / 63. * (vec[mask_193_255] - 192) - return ans - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.empty((cols,rows), dtype='float32') - for i, col_header in enumerate(col_headers): - col_header_flt = [ uint16_to_float(percentile, globmin, globrange) for percentile in col_header ] - mat[i] = uint8_to_float_v2(data[i], *col_header_flt) - - return mat.T # transpose! col-major -> row-major, - -def write_ark_scp(key, mat, ark_fout, scp_out): - mat_offset = write_mat(ark_fout, mat, key) - scp_line = '{}\t{}:{}'.format(key, ark_fout.name, mat_offset) - scp_out.write(scp_line) - scp_out.write('\n') - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. 
- - Example of writing single matrix: - kaldi_io.write_mat(filename, mat) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,mat in dict.iteritems(): - kaldi_io.write_mat(f, mat, key=key) - """ - mat_offset = 0 - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - mat_offset = fd.tell() - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if m.dtype == 'float32': fd.write('FM '.encode()) - elif m.dtype == 'float64': fd.write('DM '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) - # Dims, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols - # Data, - fd.write(m.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - return mat_offset - -################################################# -# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) -# Corresponds to: vector > > -# - outer vector: time axis -# - inner vector: records at the time -# - tuple: int = index, float = value -# - -def read_cnet_ark(file_or_fd): - """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ - return read_post_ark(file_or_fd) - -def read_post_ark(file_or_fd): - """ generator(key,vec>) = read_post_ark(file) - Returns generator of (key,posterior) tuples, read from ark file. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Iterate the ark: - for key,post in kaldi_io.read_post_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - post = read_post(fd) - yield key, post - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_post(file_or_fd): - """ [post] = read_post(file_or_fd) - Reads single kaldi 'Posterior' in binary format. - - The 'Posterior' is C++ type 'vector > >', - the outer-vector is usually time axis, inner-vector are the records - at given time, and the tuple is composed of an 'index' (integer) - and a 'float-value'. The 'float-value' can represent a probability - or any other numeric value. - - Returns vector of vectors of tuples. - """ - fd = open_or_fd(file_or_fd) - ans=[] - binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - # Loop over 'outer-vector', - for i in range(outer_vec_size): - assert(fd.read(1).decode() == '\4'); # int-size - inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) - data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) - assert(data[0]['size_idx'] == 4) - assert(data[0]['size_post'] == 4) - ans.append(data[['idx','post']].tolist()) - - if fd is not file_or_fd: fd.close() - return ans - - -################################################# -# Kaldi Confusion Network bin begin/end times, -# (kaldi stores CNs time info separately from the Posterior). 
-# - -def read_cntime_ark(file_or_fd): - """ generator(key,vec>) = read_cntime_ark(file_or_fd) - Returns generator of (key,cntime) tuples, read from ark file. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Iterate the ark: - for key,time in kaldi_io.read_cntime_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:time for key,time in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - cntime = read_cntime(fd) - yield key, cntime - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_cntime(file_or_fd): - """ [cntime] = read_cntime(file_or_fd) - Reads single kaldi 'Confusion Network time info', in binary format: - C++ type: vector >. - (begin/end times of bins at the confusion network). - - Binary layout is ' ...' - - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Returns vector of tuples. - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary - - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) - assert(data[0]['size_beg'] == 4) - assert(data[0]['size_end'] == 4) - ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), - - if fd is not file_or_fd : fd.close() - return ans - - -################################################# -# Segments related, -# - -# Segments as 'Bool vectors' can be handy, -# - for 'superposing' the segmentations, -# - for frame-selection in Speaker-ID experiments, -def read_segments_as_bool_vec(segments_file): - """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) - using kaldi 'segments' file for 1 wav, format : ' ' - - t-beg, t-end is in seconds, - - assumed 100 frames/second, - """ - segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) - # Sanity checks, - assert(len(segs) > 0) # empty segmentation is an error, - assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, - # Convert time to frame-indexes, - start = np.rint([100 * rec[2] for rec in segs]).astype(int) - end = np.rint([100 * rec[3] for rec in segs]).astype(int) - # Taken from 'read_lab_to_bool_vec', htk.py, - frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], - np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) - assert np.sum(end-start) == np.sum(frms) - return frms - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/dataset/processor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/dataset/processor.py deleted file mode 100644 index b4bd07ce674eb3288cd1b13a09085eec48d40845..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/dataset/processor.py +++ /dev/null @@ -1,660 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import json -import random -import re -import tarfile -from subprocess import PIPE, Popen -from urllib.parse import urlparse - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.nn.utils.rnn import pad_sequence - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def url_opener(data): - """ Give url or local file, return file descriptor - Inplace operation. - - Args: - data(Iterable[str]): url or local file list - - Returns: - Iterable[{src, stream}] - """ - for sample in data: - assert 'src' in sample - # TODO(Binbin Zhang): support HTTP - url = sample['src'] - try: - pr = urlparse(url) - # local file - if pr.scheme == '' or pr.scheme == 'file': - stream = open(url, 'rb') - # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP - else: - cmd = f'wget -q -O - {url}' - process = Popen(cmd, shell=True, stdout=PIPE) - sample.update(process=process) - stream = process.stdout - sample.update(stream=stream) - yield sample - except Exception as ex: - logging.warning('Failed to open {}'.format(url)) - - -def tar_file_and_group(data): - """ Expand a stream of open tar files into a stream of tar file contents. - And groups the file with same prefix - - Args: - data: Iterable[{src, stream}] - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'stream' in sample - stream = tarfile.open(fileobj=sample['stream'], mode="r|*") - prev_prefix = None - example = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - example['key'] = prev_prefix - if valid: - yield example - example = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - example['txt'] = file_obj.read().decode('utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load(file_obj) - example['wav'] = waveform - example['sample_rate'] = sample_rate - else: - example[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning('error to parse {}'.format(name)) - prev_prefix = prefix - if prev_prefix is not None: - example['key'] = prev_prefix - yield example - stream.close() - if 'process' in sample: - sample['process'].communicate() - sample['stream'].close() - - -def parse_raw(data): - """ Parse key/wav/txt from json line - - Args: - data: Iterable[str], str is a json line has key/wav/txt - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'src' in sample - json_line = sample['src'] - obj = json.loads(json_line) - assert 'key' in obj - assert 'wav' in obj - assert 'txt' in obj - key = obj['key'] - wav_file = obj['wav'] - txt = obj['txt'] - try: - if 'start' in obj: - assert 'end' in obj - sample_rate = torchaudio.backend.sox_io_backend.info( - wav_file).sample_rate - start_frame = int(obj['start'] * sample_rate) - end_frame = int(obj['end'] * sample_rate) - waveform, _ = torchaudio.backend.sox_io_backend.load( - 
filepath=wav_file, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(wav_file) - example = dict(key=key, - txt=txt, - wav=waveform, - sample_rate=sample_rate) - yield example - except Exception as ex: - logging.warning('Failed to read {}'.format(wav_file)) - - -def filter(data, - max_length=10240, - min_length=10, - token_max_length=200, - token_min_length=1, - min_output_input_ratio=0.0005, - max_output_input_ratio=1): - """ Filter sample according to feature and label length - Inplace operation. - - Args:: - data: Iterable[{key, wav, label, sample_rate}] - max_length: drop utterance which is greater than max_length(10ms) - min_length: drop utterance which is less than min_length(10ms) - token_max_length: drop utterance which is greater than - token_max_length, especially when use char unit for - english modeling - token_min_length: drop utterance which is - less than token_max_length - min_output_input_ratio: minimal ration of - token_length / feats_length(10ms) - max_output_input_ratio: maximum ration of - token_length / feats_length(10ms) - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'label' in sample - # sample['wav'] is torch.Tensor, we have 100 frames every second - num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100 - if num_frames < min_length: - continue - if num_frames > max_length: - continue - if len(sample['label']) < token_min_length: - continue - if len(sample['label']) > token_max_length: - continue - if num_frames != 0: - if len(sample['label']) / num_frames < min_output_input_ratio: - continue - if len(sample['label']) / num_frames > max_output_input_ratio: - continue - yield sample - - -def resample(data, resample_rate=16000): - """ Resample data. - Inplace operation. - - Args: - data: Iterable[{key, wav, label, sample_rate}] - resample_rate: target resample rate - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - if sample_rate != resample_rate: - sample['sample_rate'] = resample_rate - sample['wav'] = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - yield sample - - -def speed_perturb(data, speeds=None): - """ Apply speed perturb to the data. - Inplace operation. 
- - Args: - data: Iterable[{key, wav, label, sample_rate}] - speeds(List[float]): optional speed - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - if speeds is None: - speeds = [0.9, 1.0, 1.1] - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - speed = random.choice(speeds) - if speed != 1.0: - wav, _ = torchaudio.sox_effects.apply_effects_tensor( - waveform, sample_rate, - [['speed', str(speed)], ['rate', str(sample_rate)]]) - sample['wav'] = wav - - yield sample - - -def compute_fbank(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0): - """ Extract fbank - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - energy_floor=0.0, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def compute_mfcc(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0, - num_ceps=40, - high_freq=0.0, - low_freq=20.0): - """ Extract mfcc - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.mfcc(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - num_ceps=num_ceps, - high_freq=high_freq, - low_freq=low_freq, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def __tokenize_by_bpe_model(sp, txt): - tokens = [] - # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - pattern = re.compile(r'([\u4e00-\u9fff])') - # Example: - # txt = "你好 ITS'S OKAY 的" - # chars = ["你", "好", " ITS'S OKAY ", "的"] - chars = pattern.split(txt.upper()) - mix_chars = [w for w in chars if len(w.strip()) > 0] - for ch_or_w in mix_chars: - # ch_or_w is a single CJK charater(i.e., "你"), do nothing. - if pattern.fullmatch(ch_or_w) is not None: - tokens.append(ch_or_w) - # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), - # encode ch_or_w using bpe_model. 
- else: - for p in sp.encode_as_pieces(ch_or_w): - tokens.append(p) - - return tokens - - -def tokenize(data, - symbol_table, - bpe_model=None, - non_lang_syms=None, - split_with_space=False): - """ Decode text to chars or BPE - Inplace operation - - Args: - data: Iterable[{key, wav, txt, sample_rate}] - - Returns: - Iterable[{key, wav, txt, tokens, label, sample_rate}] - """ - if non_lang_syms is not None: - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - else: - non_lang_syms = {} - non_lang_syms_pattern = None - - if bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(bpe_model) - else: - sp = None - - for sample in data: - assert 'txt' in sample - txt = sample['txt'].strip() - if non_lang_syms_pattern is not None: - parts = non_lang_syms_pattern.split(txt.upper()) - parts = [w for w in parts if len(w.strip()) > 0] - else: - parts = [txt] - - label = [] - tokens = [] - for part in parts: - if part in non_lang_syms: - tokens.append(part) - else: - if bpe_model is not None: - tokens.extend(__tokenize_by_bpe_model(sp, part)) - else: - if split_with_space: - part = part.split(" ") - for ch in part: - if ch == ' ': - ch = "▁" - tokens.append(ch) - - for ch in tokens: - if ch in symbol_table: - label.append(symbol_table[ch]) - elif '' in symbol_table: - label.append(symbol_table['']) - - sample['tokens'] = tokens - sample['label'] = label - yield sample - - -def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80): - """ Do spec augmentation - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - num_t_mask: number of time mask to apply - num_f_mask: number of freq mask to apply - max_t: max width of time mask - max_f: max width of freq mask - max_w: max width of time warp - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - max_freq = y.size(1) - # time mask - for i in range(num_t_mask): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - y[start:end, :] = 0 - # freq mask - for i in range(num_f_mask): - start = random.randint(0, max_freq - 1) - length = random.randint(1, max_f) - end = min(max_freq, start + length) - y[:, start:end] = 0 - sample['feat'] = y - yield sample - - -def spec_sub(data, max_t=20, num_t_sub=3): - """ Do spec substitute - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of time substitute - num_t_sub: number of time substitute to apply - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - for i in range(num_t_sub): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - # only substitute the earlier time chosen randomly for current time - pos = random.randint(0, start) - y[start:end, :] = x[start - pos:end - pos, :] - sample['feat'] = y - yield sample - - -def spec_trim(data, max_t=20): - """ Trim tailing frames. Inplace operation. 
- ref: TrimTail [https://arxiv.org/abs/2211.00522] - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of length trimming - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - max_frames = x.size(0) - length = random.randint(1, max_t) - if length < max_frames / 2: - y = x.clone().detach()[:max_frames - length] - sample['feat'] = y - yield sample - - -def shuffle(data, shuffle_size=10000): - """ Local shuffle the data - - Args: - data: Iterable[{key, feat, label}] - shuffle_size: buffer size for shuffle - - Returns: - Iterable[{key, feat, label}] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= shuffle_size: - random.shuffle(buf) - for x in buf: - yield x - buf = [] - # The sample left over - random.shuffle(buf) - for x in buf: - yield x - - -def sort(data, sort_size=500): - """ Sort the data by feature length. - Sort is used after shuffle and before batch, so we can group - utts with similar lengths into a batch, and `sort_size` should - be less than `shuffle_size` - - Args: - data: Iterable[{key, feat, label}] - sort_size: buffer size for sort - - Returns: - Iterable[{key, feat, label}] - """ - - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= sort_size: - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - buf = [] - # The sample left over - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - - -def static_batch(data, batch_size=16): - """ Static batch the data by `batch_size` - - Args: - data: Iterable[{key, feat, label}] - batch_size: batch size - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= batch_size: - yield buf - buf = [] - if len(buf) > 0: - yield buf - - -def dynamic_batch(data, max_frames_in_batch=12000): - """ Dynamic batch the data until the total frames in batch - reach `max_frames_in_batch` - - Args: - data: Iterable[{key, feat, label}] - max_frames_in_batch: max_frames in one batch - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - longest_frames = 0 - for sample in data: - assert 'feat' in sample - assert isinstance(sample['feat'], torch.Tensor) - new_sample_frames = sample['feat'].size(0) - longest_frames = max(longest_frames, new_sample_frames) - frames_after_padding = longest_frames * (len(buf) + 1) - if frames_after_padding > max_frames_in_batch: - yield buf - buf = [sample] - longest_frames = new_sample_frames - else: - buf.append(sample) - if len(buf) > 0: - yield buf - - -def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000): - """ Wrapper for static/dynamic batch - """ - if batch_type == 'static': - return static_batch(data, batch_size) - elif batch_type == 'dynamic': - return dynamic_batch(data, max_frames_in_batch) - else: - logging.fatal('Unsupported batch type {}'.format(batch_type)) - - -def padding(data): - """ Padding the data into training data - - Args: - data: Iterable[List[{key, feat, label}]] - - Returns: - Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] - """ - for sample in data: - assert isinstance(sample, list) - feats_length = torch.tensor([x['feat'].size(0) for x in sample], - dtype=torch.int32) - order = torch.argsort(feats_length, descending=True) - feats_lengths = torch.tensor( - [sample[i]['feat'].size(0) for i in order], dtype=torch.int32) - sorted_feats = [sample[i]['feat'] for i in order] - sorted_keys 
= [sample[i]['key'] for i in order] - sorted_labels = [ - torch.tensor(sample[i]['label'], dtype=torch.int64) for i in order - ] - label_lengths = torch.tensor([x.size(0) for x in sorted_labels], - dtype=torch.int32) - - padded_feats = pad_sequence(sorted_feats, - batch_first=True, - padding_value=0) - - pad = (0, 0, 0, 0) - seq_len= padded_feats.shape[1] - if seq_len < 384: - pad = (0, 0, 0, 384-seq_len) - elif seq_len < 512: - pad = (0, 0, 0, 512-seq_len) - elif seq_len < 640: - pad = (0, 0, 0, 640-seq_len) - elif seq_len < 768: - pad = (0, 0, 0, 768-seq_len) - elif seq_len < 896: - pad = (0, 0, 0, 896-seq_len) - elif seq_len < 1024: - pad = (0, 0, 0, 1024-seq_len) - elif seq_len < 1280: - pad = (0, 0, 0, 1280-seq_len) - padded_feats = torch.nn.functional.pad(padded_feats, pad, mode='constant', value=0) - padding_labels = pad_sequence(sorted_labels, - batch_first=True, - padding_value=-1) - - yield (sorted_keys, padded_feats, padding_labels, feats_lengths, - label_lengths) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/dataset/wav_distortion.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/dataset/wav_distortion.py deleted file mode 100644 index 2917d3cc6cfb801935cb0885d0c42cd88f1833b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/dataset/wav_distortion.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import random -import math - -import torchaudio -import torch -torchaudio.set_audio_backend("sox_io") - - -def db2amp(db): - return pow(10, db / 20) - -def amp2db(amp): - return 20 * math.log10(amp) - -def make_poly_distortion(conf): - """Generate a db-domain ploynomial distortion function - - f(x) = a * x^m * (1-x)^n + x - - Args: - conf: a dict {'a': #int, 'm': #int, 'n': #int} - - Returns: - The ploynomial function, which could be applied on - a float amplitude value - """ - a = conf['a'] - m = conf['m'] - n = conf['n'] - - def poly_distortion(x): - abs_x = abs(x) - if abs_x < 0.000001: - x = x - else: - db_norm = amp2db(abs_x) / 100 + 1 - if db_norm < 0: - db_norm = 0 - db_norm = a * pow(db_norm, m) * pow((1 - db_norm), n) + db_norm - if db_norm > 1: - db_norm = 1 - db = (db_norm - 1) * 100 - amp = db2amp(db) - if amp >= 0.9997: - amp = 0.9997 - if x > 0: - x = amp - else: - x = -amp - return x - return poly_distortion - -def make_quad_distortion(): - return make_poly_distortion({'a' : 1, 'm' : 1, 'n' : 1}) - -# the amplitude are set to max for all non-zero point -def make_max_distortion(conf): - """Generate a max distortion function - - Args: - conf: a dict {'max_db': float } - 'max_db': the maxium value. 
- - Returns: - The max function, which could be applied on - a float amplitude value - """ - max_db = conf['max_db'] - if max_db: - max_amp = db2amp(max_db) # < 0.997 - else: - max_amp = 0.997 - - def max_distortion(x): - if x > 0: - x = max_amp - elif x < 0: - x = -max_amp - else: - x = 0.0 - return x - return max_distortion - - - -def make_amp_mask(db_mask=None): - """Get a amplitude domain mask from db domain mask - - Args: - db_mask: Optional. A list of tuple. if None, using default value. - - Returns: - A list of tuple. The amplitude domain mask - """ - if db_mask is None: - db_mask = [(-110, -95), (-90, -80), (-65, -60), (-50, -30), (-15, 0)] - amp_mask = [(db2amp(db[0]), db2amp(db[1])) for db in db_mask] - return amp_mask - -default_mask = make_amp_mask() - - -def generate_amp_mask(mask_num): - """Generate amplitude domain mask randomly in [-100db, 0db] - - Args: - mask_num: the slot number of the mask - - Returns: - A list of tuple. each tuple defines a slot. - e.g. [(-100, -80), (-65, -60), (-50, -30), (-15, 0)] - for #mask_num = 4 - """ - a = [0] * 2 * mask_num - a[0] = 0 - m = [] - for i in range(1, 2 * mask_num): - a[i] = a[i - 1] + random.uniform(0.5, 1) - max_val = a[2 * mask_num - 1] - for i in range(0, mask_num): - l = ((a[2 * i] - max_val) / max_val) * 100 - r = ((a[2 * i + 1] - max_val) / max_val) * 100 - m.append((l, r)) - return make_amp_mask(m) - - -def make_fence_distortion(conf): - """Generate a fence distortion function - - In this fence-like shape function, the values in mask slots are - set to maxium, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': int,'max_db': float } - 'mask_number': the slot number in mask. - 'max_db': the maxium value. - - Returns: - The fence function, which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - max_db = conf['max_db'] - max_amp = db2amp(max_db) # 0.997 - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def fence_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - return x - - return fence_distortion - -# -def make_jag_distortion(conf): - """Generate a jag distortion function - - In this jag-like shape function, the values in mask slots are - not changed, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': #int} - 'mask_number': the slot number in mask. 
- - Returns: - The jag function,which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def jag_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - return x - - return jag_distortion - -# gaining 20db means amp = amp * 10 -# gaining -20db means amp = amp / 10 -def make_gain_db(conf): - """Generate a db domain gain function - - Args: - conf: a dict {'db': #float} - 'db': the gaining value - - Returns: - The db gain function, which could be applied on - a float amplitude value - """ - db = conf['db'] - - def gain_db(x): - return min(0.997, x * pow(10, db / 20)) - - return gain_db - - -def distort(x, func, rate=0.8): - """Distort a waveform in sample point level - - Args: - x: the origin wavefrom - func: the distort function - rate: sample point-level distort probability - - Returns: - the distorted waveform - """ - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - x[0][i] = func(float(x[0][i])) - return x - -def distort_chain(x, funcs, rate=0.8): - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - for func in funcs: - x[0][i] = func(float(x[0][i])) - return x - -# x is numpy -def distort_wav_conf(x, distort_type, distort_conf, rate=0.1): - if distort_type == 'gain_db': - gain_db = make_gain_db(distort_conf) - x = distort(x, gain_db) - elif distort_type == 'max_distortion': - max_distortion = make_max_distortion(distort_conf) - x = distort(x, max_distortion, rate=rate) - elif distort_type == 'fence_distortion': - fence_distortion = make_fence_distortion(distort_conf) - x = distort(x, fence_distortion, rate=rate) - elif distort_type == 'jag_distortion': - jag_distortion = make_jag_distortion(distort_conf) - x = distort(x, jag_distortion, rate=rate) - elif distort_type == 'poly_distortion': - poly_distortion = make_poly_distortion(distort_conf) - x = distort(x, poly_distortion, rate=rate) - elif distort_type == 'quad_distortion': - quad_distortion = make_quad_distortion() - x = distort(x, quad_distortion, rate=rate) - elif distort_type == 'none_distortion': - pass - else: - print('unsupport type') - return x - -def distort_wav_conf_and_save(distort_type, distort_conf, rate, wav_in, wav_out): - x, sr = torchaudio.load(wav_in) - x = x.detach().numpy() - out = distort_wav_conf(x, distort_type, distort_conf, rate) - torchaudio.save(wav_out, torch.from_numpy(out), sr) - -if __name__ == "__main__": - distort_type = sys.argv[1] - wav_in = sys.argv[2] - wav_out = sys.argv[3] - conf = None - rate = 0.1 - if distort_type == 'new_jag_distortion': - conf = {'mask_number' : 4} - elif distort_type == 'new_fence_distortion': - conf = {'mask_number' : 1, 'max_db' : -30} - elif distort_type == 'poly_distortion': - conf = {'a' : 4, 'm' : 2, "n" : 2} - distort_wav_conf_and_save(distort_type, conf, rate, wav_in, wav_out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/efficient_conformer/attention.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/efficient_conformer/attention.py deleted file mode 100644 index 475131b15af92ffcaf91ad5e2e30d114d4d1a2a3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/efficient_conformer/attention.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple, Optional - -import torch -from torch import nn -import torch.nn.functional as F -from wenet.transformer.attention import MultiHeadedAttention - - -class GroupedRelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: - https://arxiv.org/abs/1901.02860 - https://arxiv.org/abs/2109.01163 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate, group_size=3): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - self.group_size = group_size - self.d_k = n_feat // n_head # for GroupedAttention - self.n_feat = n_feat - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. 
- """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def pad4group(self, Q, K, V, P, mask, group_size: int = 3): - """ - q: (#batch, time1, size) -> (#batch, head, time1, size/head) - k,v: (#batch, time2, size) -> (#batch, head, time2, size/head) - p: (#batch, time2, size) - """ - # Compute Overflows - overflow_Q = Q.size(2) % group_size - overflow_KV = K.size(2) % group_size - - # if-else for ONNX export - # 0 // 0.00000000000000001 = 0 - # 1 // 1.00000000000000001 = 1 - padding_Q = (group_size - overflow_Q) * int( - overflow_Q // (overflow_Q + 0.00000000000000001)) - padding_KV = (group_size - overflow_KV) * int( - overflow_KV // (overflow_KV + 0.00000000000000001)) - - batch_size, _, seq_len_KV, _ = K.size() - - # Input Padding (B, T, D) -> (B, T + P, D) - Q = F.pad(Q, (0, 0, 0, padding_Q), value=0.0) - K = F.pad(K, (0, 0, 0, padding_KV), value=0.0) - V = F.pad(V, (0, 0, 0, padding_KV), value=0.0) - - if mask is not None and mask.size(2) > 0 : # time2 > 0: - mask = mask[:, ::group_size, ::group_size] - - Q = Q.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - K = K.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - V = V.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - # process pos_emb - P_batch_size = P.size(0) - overflow_P = P.size(1) % group_size - padding_P = group_size - overflow_P if overflow_P else 0 - P = F.pad(P, (0, 0, 0, padding_P), value=0.0) - P = P.view(P_batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - return Q, K, V, P, mask, padding_Q - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - padding_q: Optional[int] = None - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - padding_q : for GroupedAttention in efficent conformer - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. 
jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - - # n_feat!=h*d_k may be happened in GroupAttention - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.n_feat) - ) # (batch, time1, d_model) - if padding_q is not None: - # for GroupedAttention in efficent conformer - x = x[:, :x.size(1) - padding_q] - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q = self.linear_q(query) - k = self.linear_k(key) # (#batch, time2, size) - v = self.linear_v(value) - p = self.linear_pos(pos_emb) # (#batch, time2, size) - - batch_size, seq_len_KV, _ = k.size() # seq_len_KV = time2 - - # (#batch, time2, size) -> (#batch, head, time2, size/head) - q = q.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - k = k.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - v = v.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - if cache.size(0) > 0: - # use attention cache - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - new_cache = torch.cat((k, v), dim=-1) - - # May be k and p does not match. eg. time2=18+18/2=27 > mask=36/2=18 - if mask is not None and mask.size(2) > 0: - time2 = mask.size(2) - k = k[:, :, -time2:, :] - v = v[:, :, -time2:, :] - - # q k v p: (batch, head, time1, d_k) - q, k, v, p, mask, padding_q = self.pad4group(q, k, v, p, mask, self.group_size) - - # q_with_bias_u & q_with_bias_v = (batch, head, time1, d_k) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. 
- # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k * self.group_size) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask, padding_q), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/efficient_conformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/efficient_conformer/convolution.py deleted file mode 100644 index 52d6c1c14c0812ab3957a60a135f644833c2ad95..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/efficient_conformer/convolution.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - stride: int = 1): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - stride (int): Stride Convolution, for efficient Conformer - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. 
- # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=stride, # for depthwise_conv in StrideConv - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - self.stride = stride - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - # When export ONNX,the first cache is not None but all-zero, - # cause shape error in residual block, - # eg. cache14 + x9 = 23, 23-7+1=17 != 9 - cache = cache[:, :, -self.lorder:] - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is requried, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - if mask_pad.size(2) != x.size(2): - mask_pad = mask_pad[:, :, ::self.stride] - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/efficient_conformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/efficient_conformer/encoder.py deleted file mode 100644 index dbd37f53cac86be851e2bb194354fd07eb271f11..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/efficient_conformer/encoder.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from EfficientConformer(https://github.com/burchim/EfficientConformer) -# Paper(https://arxiv.org/abs/2109.01163) - -"""Encoder definition.""" -from typing import Tuple, Optional, List, Union - -import torch -import logging -from typeguard import check_argument_types -import torch.nn.functional as F - -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.encoder_layer import ConformerEncoderLayer - -from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 -from wenet.efficient_conformer.convolution import ConvolutionModule -from wenet.efficient_conformer.attention import GroupedRelPositionMultiHeadedAttention -from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer - -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class EfficientConformerEncoder(torch.nn.Module): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - macaron_style: bool = True, - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - stride_layer_idx: Optional[Union[int, List[int]]] = 3, - stride: Optional[Union[int, List[int]]] = 2, - group_layer_idx: Optional[Union[int, List[int], tuple]] = (0, 1, 2, 3), - group_size: int = 3, - stride_kernel: bool = True, - **kwargs - ): - """Construct Efficient Conformer Encoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - macaron_style (bool): Whether to use macaron style for - positionwise layer. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. - stride_layer_idx (list): layer id with StrideConv, start from 0 - stride (list): stride size of each StrideConv in efficient conformer - group_layer_idx (list): layer id with GroupedAttention, start from 0 - group_size (int): group size of every GroupedAttention layer - stride_kernel (bool): default True. True: recompute cnn kernels with stride. 
- """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d2": - subsampling_class = Conv2dSubsampling2 - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - logging.info(f"input_layer = {input_layer}, " - f"subsampling_class = {subsampling_class}") - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - self.input_layer = input_layer - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - activation = get_activation(activation_type) - self.num_blocks = num_blocks - self.attention_heads = attention_heads - self.cnn_module_kernel = cnn_module_kernel - self.global_chunk_size = 0 - self.chunk_feature_map = 0 - - # efficient conformer configs - self.stride_layer_idx = [stride_layer_idx] \ - if type(stride_layer_idx) == int else stride_layer_idx - self.stride = [stride] \ - if type(stride) == int else stride - self.group_layer_idx = [group_layer_idx] \ - if type(group_layer_idx) == int else group_layer_idx - self.grouped_size = group_size # group size of every GroupedAttention layer - - assert len(self.stride) == len(self.stride_layer_idx) - self.cnn_module_kernels = [cnn_module_kernel] # kernel size of each StridedConv - for i in self.stride: - if stride_kernel: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1] // i) - else: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1]) - - logging.info(f"stride_layer_idx= {self.stride_layer_idx}, " - f"stride = {self.stride}, " - f"cnn_module_kernel = {self.cnn_module_kernels}, " - f"group_layer_idx = {self.group_layer_idx}, " - f"grouped_size = {self.grouped_size}") - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - - # encoder definition - index = 0 - layers = [] - for i in range(num_blocks): - # self-attention module definition - if i in self.group_layer_idx: - encoder_selfattn_layer = GroupedRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - self.grouped_size) - else: - if pos_enc_layer_type == "no_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate) - - # conformer module definition - if i in self.stride_layer_idx: - # conformer block with downsampling - convolution_layer_args_stride = ( - output_size, 
self.cnn_module_kernels[index], activation, - cnn_module_norm, causal, True, self.stride[index]) - layers.append(StrideConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_stride) if use_cnn_module else None, - torch.nn.AvgPool1d( - kernel_size=self.stride[index], stride=self.stride[index], - padding=0, ceil_mode=True, - count_include_pad=False), # pointwise_conv_layer - dropout_rate, - normalize_before, - concat_after, - )) - index = index + 1 - else: - # conformer block - convolution_layer_args_normal = ( - output_size, self.cnn_module_kernels[index], activation, - cnn_module_norm, causal) - layers.append(ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_normal) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - )) - - self.encoders = torch.nn.ModuleList(layers) - - def set_global_chunk_size(self, chunk_size): - """Used in ONNX export. - """ - logging.info(f"set global chunk size: {chunk_size}, default is 0.") - self.global_chunk_size = chunk_size - if self.embed.subsampling_rate == 2: - self.chunk_feature_map = 2 * self.global_chunk_size + 1 - elif self.embed.subsampling_rate == 6: - self.chunk_feature_map = 6 * self.global_chunk_size + 5 - elif self.embed.subsampling_rate == 8: - self.chunk_feature_map = 8 * self.global_chunk_size + 7 - else: - self.chunk_feature_map = 4 * self.global_chunk_size + 3 - - def output_size(self) -> int: - return self._output_size - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - index = 0 # traverse stride - for i, layer in enumerate(self.encoders): - # layer return : x, mask, new_att_cache, new_cnn_cache - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if i in self.stride_layer_idx: - masks = masks[:, :, ::self.stride[index]] - chunk_masks = chunk_masks[:, ::self.stride[index], - ::self.stride[index]] - mask_pad = masks - pos_emb = pos_emb[:, ::self.stride[index], :] - index = index + 1 - - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask : mask matrix of self attention - - Returns: - torch.Tensor: output of current input xs - torch.Tensor: subsampling cache required for next chunk computation - List[torch.Tensor]: encoder layers output cache required for next - chunk computation - List[torch.Tensor]: conformer cnn cache - - """ - assert xs.size(0) == 1 - - # using downsampling factor to recover offset - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - chunk_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - chunk_masks = chunk_masks.unsqueeze(1) # (1, 1, xs-time) - - real_len = 0 - if self.global_chunk_size > 0: - # for ONNX decode simulation, padding xs to chunk_size - real_len = xs.size(1) - pad_len = self.chunk_feature_map - real_len - xs = F.pad(xs, (0, 0, 0, pad_len), value=0.0) - chunk_masks = F.pad(chunk_masks, (0, pad_len), value=0.0) - - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, chunk_masks = self.embed(xs, chunk_masks, offset) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - # shape(pos_emb) = (b=1, chunk_size, emb_size=output_size=hidden-dim) - - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) # batchPad (b=1, 1, time=chunk_size) - - if self.global_chunk_size > 0: - # for ONNX decode simulation - pos_emb = self.embed.position_encoding( - offset=max(offset - cache_t1, 0), - size=cache_t1 + self.global_chunk_size) - att_mask[:, :, -self.global_chunk_size:] = chunk_masks - mask_pad = chunk_masks.to(torch.bool) - else: - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(2) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - att_cache_trunc = xs.size(1) + \ - att_cache.size(2) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i:i + 1, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # 
shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [1, batch, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(0) - - # use repeat_interleave to new_att_cache - new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :]) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.global_chunk_size > 0 and real_len: - chunk_real_len = real_len // self.embed.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - # Keeping 1 more timestep can mitigate information leakage - # from the encoder caused by the padding - xs = xs[:, :chunk_real_len + 1, :] - - return xs, r_att_cache, r_cnn_cache - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - use_onnx=False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. - Args: - xs (torch.Tensor): (1, max_len, dim) - decoding_chunk_size (int): decoding chunk size - num_decoding_left_chunks (int): - use_onnx (bool): True for simulating ONNX model inference. 
- """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if use_onnx: - logging.info("Simulating for ONNX runtime ...") - att_cache: torch.Tensor = torch.zeros( - (self.num_blocks, self.attention_heads, required_cache_size, - self.output_size() // self.attention_heads * 2), - device=xs.device) - cnn_cache: torch.Tensor = torch.zeros( - (self.num_blocks, 1, self.output_size(), self.cnn_module_kernel - 1), - device=xs.device) - self.set_global_chunk_size(chunk_size=decoding_chunk_size) - else: - logging.info("Simulating for JIT runtime ...") - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - logging.info(f"-->> frame chunk msg: cur={cur}, " - f"end={end}, num_frames={end-cur}, " - f"decoding_window={decoding_window}") - if use_onnx: - att_mask: torch.Tensor = torch.ones( - (1, 1, required_cache_size + decoding_chunk_size), - dtype=torch.bool, device=xs.device) - if cur == 0: - att_mask[:, :, :required_cache_size] = 0 - else: - att_mask: torch.Tensor = torch.ones( - (0, 0, 0), dtype=torch.bool, device=xs.device) - - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - outputs.append(y) - offset += y.size(1) - - ys = torch.cat(outputs, 1) - masks = torch.ones(1, 1, ys.size(1), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/efficient_conformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/efficient_conformer/encoder_layer.py deleted file mode 100644 index 3a88ec9fca9797664ce89566e6c1d28a8f0ad5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/efficient_conformer/encoder_layer.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple -import torch -from torch import nn - - -class StrideConformerEncoderLayer(nn.Module): - """Encoder layer module. 
- Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - pointwise_conv_layer: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.pointwise_conv_layer = pointwise_conv_layer - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - - # add pointwise_conv for efficient conformer - # pointwise_conv_layer does not change shape - if self.pointwise_conv_layer is not None: - residual = residual.transpose(1, 2) - residual = self.pointwise_conv_layer(residual) - residual = residual.transpose(1, 2) - assert residual.size(0) == x.size(0) - assert residual.size(1) == x.size(1) - assert residual.size(2) == x.size(2) - - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/efficient_conformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/efficient_conformer/subsampling.py deleted file mode 100644 index 98b2c2228eac8e77586110686c48a7b0141458c9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/efficient_conformer/subsampling.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch -from wenet.transformer.subsampling import BaseSubsampling - - -class Conv2dSubsampling2(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU() - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * ((idim - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 2 - # 2 = (3 - 1) * 1 - self.right_context = 2 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 2. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 2. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, :-2:2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/squeezeformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/squeezeformer/attention.py deleted file mode 100644 index 97412badbe8e2c5caec81c0636d15be3f80d6b84..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/squeezeformer/attention.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 Ximalaya Inc. (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -import torch -import torch.nn as nn -from wenet.transformer.attention import MultiHeadedAttention -from typing import Tuple - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
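A quick way to see why the `Conv2dSubsampling2` front end removed earlier in this hunk projects from `odim * ((idim - 1) // 2)` features: a single 3x3, stride-2 conv with no padding maps `(time, idim)` inputs to roughly `((time - 1) // 2, (idim - 1) // 2)`. The sketch below is illustrative only; the sizes 80/256/101 are assumptions, not values from this diff.

```python
import torch

# Assumed sizes for illustration; mirrors the conv stack in Conv2dSubsampling2 above.
idim, odim, t = 80, 256, 101
conv = torch.nn.Sequential(torch.nn.Conv2d(1, odim, 3, 2), torch.nn.ReLU())

x = torch.randn(1, 1, t, idim)            # (batch, channel=1, time, feat)
b, c, t_sub, f = conv(x).size()
assert t_sub == (t - 1) // 2              # time is roughly halved
assert f == (idim - 1) // 2               # feature dim feeding Linear(odim * f, odim)
```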
- """ - - def __init__(self, n_head, n_feat, dropout_rate, - do_rel_shift=False, adaptive_scale=False, init_weights=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.do_rel_shift = do_rel_shift - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - self.adaptive_scale = adaptive_scale - self.ada_scale = nn.Parameter( - torch.ones([1, 1, n_feat]), requires_grad=adaptive_scale) - self.ada_bias = nn.Parameter( - torch.zeros([1, 1, n_feat]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - input_max = (self.h * self.d_k) ** -0.5 - torch.nn.init.uniform_(self.linear_q.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_q.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_pos.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.bias, -input_max, input_max) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. 
pytorch training - if mask.size(2) > 0: # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - # (batch, head, time1, time2) - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - if self.adaptive_scale: - query = self.ada_scale * query + self.ada_bias - key = self.ada_scale * key + self.ada_bias - value = self.ada_scale * value + self.ada_bias - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
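The `NOTE(xcsong)` block above leans on the fact that concatenating and splitting zero-shaped tensors is well defined, which lets the ONNX and JIT paths share one cache-handling branch. Below is a standalone sketch of the `(1, head, cache_t, d_k * 2)` cache layout it describes; the sizes are assumptions for illustration, not values from this diff.

```python
import torch

head, d_k, chunk = 4, 64, 16
k = torch.randn(1, head, chunk, d_k)
v = torch.randn(1, head, chunk, d_k)

# First chunk: an empty cache with the right trailing dims behaves as a no-op.
cache = torch.zeros(1, head, 0, d_k * 2)
key_cache, value_cache = torch.split(cache, cache.size(-1) // 2, dim=-1)
k2 = torch.cat([key_cache, k], dim=2)
v2 = torch.cat([value_cache, v], dim=2)
assert torch.equal(k2, k) and torch.equal(v2, v)

# Keys and values are stored concatenated on the last dim for the next chunk.
new_cache = torch.cat((k2, v2), dim=-1)    # (1, head, chunk, d_k * 2)
```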
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - if self.do_rel_shift: - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/squeezeformer/conv2d.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/squeezeformer/conv2d.py deleted file mode 100644 index c230263396392d72f36c56d645338f2d576db898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/squeezeformer/conv2d.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Conv2d Module with Valid Padding""" - -import torch.nn.functional as F -from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional - - -class Conv2dValid(_ConvNd): - """ - Conv2d operator for VALID mode padding. 
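For reference while reading the score computation above: the Transformer-XL style scores are `((q + u) k^T + (q + v) p^T) / sqrt(d_k)`, all shaped `(batch, head, time1, time2)`. A shape-only sketch with assumed sizes (not taken from this diff):

```python
import math
import torch

batch, head, d_k, time1, time2 = 1, 4, 64, 16, 48     # time2 = cache_t + time1
q = torch.randn(batch, time1, head, d_k)              # layout before the bias add, as above
k = torch.randn(batch, head, time2, d_k)
p = torch.randn(batch, head, time2, d_k)              # projected positional embedding
u = torch.randn(head, d_k)                            # pos_bias_u
v = torch.randn(head, d_k)                            # pos_bias_v

q_with_bias_u = (q + u).transpose(1, 2)               # (batch, head, time1, d_k)
q_with_bias_v = (q + v).transpose(1, 2)
matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))   # content terms
matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))   # position terms
scores = (matrix_ac + matrix_bd) / math.sqrt(d_k)
assert scores.shape == (batch, head, time1, time2)
```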
- """ - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', # TODO: refine this type - device=None, - dtype=None, - valid_trigx: bool = False, - valid_trigy: bool = False - ) -> None: - factory_kwargs = {'device': device, 'dtype': dtype} - kernel_size_ = _pair(kernel_size) - stride_ = _pair(stride) - padding_ = padding if isinstance(padding, str) else _pair(padding) - dilation_ = _pair(dilation) - super(Conv2dValid, self).__init__( - in_channels, out_channels, kernel_size_, - stride_, padding_, dilation_, False, _pair(0), - groups, bias, padding_mode, **factory_kwargs) - self.valid_trigx = valid_trigx - self.valid_trigy = valid_trigy - - def _conv_forward( - self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): - validx, validy = 0, 0 - if self.valid_trigx: - validx = (input.size(-2) * (self.stride[-2] - 1) - 1 - + self.kernel_size[-2]) // 2 - if self.valid_trigy: - validy = (input.size(-1) * (self.stride[-1] - 1) - 1 - + self.kernel_size[-1]) // 2 - return F.conv2d(input, weight, bias, self.stride, - (validx, validy), self.dilation, self.groups) - - def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight, self.bias) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/squeezeformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/squeezeformer/convolution.py deleted file mode 100644 index 6da2ee8c98ed58fae66d66c892041037f0d6bc3a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/squeezeformer/convolution.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. 
- causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - self.bias = bias - self.channels = channels - self.kernel_size = kernel_size - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, channels]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, channels]), requires_grad=adaptive_scale) - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - if init_weights: - self.init_weights() - - def init_weights(self): - pw_max = self.channels ** -0.5 - dw_max = self.kernel_size ** -0.5 - torch.nn.init.uniform_(self.pointwise_conv1.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv1.bias.data, -pw_max, pw_max) - torch.nn.init.uniform_(self.depthwise_conv.weight.data, -dw_max, dw_max) - if self.bias: - torch.nn.init.uniform_(self.depthwise_conv.bias.data, -dw_max, dw_max) - torch.nn.init.uniform_(self.pointwise_conv2.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv2.bias.data, -pw_max, pw_max) - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - if self.adaptive_scale: - x = self.ada_scale * x + self.ada_bias - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. 
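The `lorder` logic above is the usual causal-convolution trick: zero-pad the left by `kernel_size - 1` on the first chunk, or prepend the cached left context on later chunks, so a depthwise conv with `padding=0` keeps the time length and never looks at future frames. A minimal sketch with assumed sizes (illustrative, not part of this diff):

```python
import torch
import torch.nn as nn

channels, kernel_size, t = 8, 15, 20
lorder = kernel_size - 1
conv = nn.Conv1d(channels, channels, kernel_size, padding=0, groups=channels)

x = torch.randn(1, channels, t)
y = conv(nn.functional.pad(x, (lorder, 0)))          # first chunk: zero left context
assert y.shape == (1, channels, t)                   # time length preserved

cache = x[:, :, -lorder:]                            # keep lorder frames as left context
x_next = torch.randn(1, channels, t)
y_next = conv(torch.cat((cache, x_next), dim=2))     # later chunk: real left context
assert y_next.shape == (1, channels, t)
```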
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/squeezeformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/squeezeformer/encoder.py deleted file mode 100644 index f13038321ae6c07d484a617aee7d83ed07742510..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/squeezeformer/encoder.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -import torch -import torch.nn as nn -from typing import Tuple, Union, Optional, List -from wenet.squeezeformer.subsampling \ - import DepthwiseConv2dSubsampling4, TimeReductionLayer1D, \ - TimeReductionLayer2D, TimeReductionLayerStream -from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.attention import MultiHeadedAttention -from wenet.squeezeformer.attention import RelPositionMultiHeadedAttention -from wenet.squeezeformer.positionwise_feed_forward \ - import PositionwiseFeedForward -from wenet.squeezeformer.convolution import ConvolutionModule -from wenet.utils.mask import make_pad_mask, add_optional_chunk_mask -from wenet.utils.common import get_activation - - -class SqueezeformerEncoder(nn.Module): - def __init__( - self, - input_size: int = 80, - encoder_dim: int = 256, - output_size: int = 256, - attention_heads: int = 4, - num_blocks: int = 12, - reduce_idx: Optional[Union[int, List[int]]] = 5, - recover_idx: Optional[Union[int, List[int]]] = 11, - feed_forward_expansion_factor: int = 4, - dw_stride: bool = False, - input_dropout_rate: float = 0.1, - pos_enc_layer_type: str = "rel_pos", - time_reduction_layer_type: str = "conv1d", - do_rel_shift: bool = True, - feed_forward_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.1, - cnn_module_kernel: int = 31, - cnn_norm_type: str = "batch_norm", - dropout: float = 0.1, - causal: bool = False, - adaptive_scale: bool = True, - activation_type: str = "swish", - init_weights: bool = True, - global_cmvn: torch.nn.Module = None, - normalize_before: bool = False, - use_dynamic_chunk: bool = False, - concat_after: bool = 
False, - static_chunk_size: int = 0, - use_dynamic_left_chunk: bool = False - ): - """Construct SqueezeformerEncoder - - Args: - input_size to use_dynamic_chunk, see in Transformer BaseEncoder. - encoder_dim (int): The hidden dimension of encoder layer. - output_size (int): The output dimension of final projection layer. - attention_heads (int): Num of attention head in attention module. - num_blocks (int): Num of encoder layers. - reduce_idx Optional[Union[int, List[int]]]: - reduce layer index, from 40ms to 80ms per frame. - recover_idx Optional[Union[int, List[int]]]: - recover layer index, from 80ms to 40ms per frame. - feed_forward_expansion_factor (int): Enlarge coefficient of FFN. - dw_stride (bool): Whether do depthwise convolution - on subsampling module. - input_dropout_rate (float): Dropout rate of input projection layer. - pos_enc_layer_type (str): Self attention type. - time_reduction_layer_type (str): Conv1d or Conv2d reduction layer. - do_rel_shift (bool): Whether to do relative shift - operation on rel-attention module. - cnn_module_kernel (int): Kernel size of CNN module. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - adaptive_scale (bool): Whether to use adaptive scale. - init_weights (bool): Whether to initialize weights. - causal (bool): whether to use causal convolution or not. - """ - super(SqueezeformerEncoder, self).__init__() - self.global_cmvn = global_cmvn - self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ - if type(reduce_idx) == int else reduce_idx - self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \ - if type(recover_idx) == int else recover_idx - self.check_ascending_list() - if reduce_idx is None: - self.time_reduce = None - else: - if recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - self.reduce_stride = 2 - self._output_size = output_size - self.normalize_before = normalize_before - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - self.pos_enc_layer_type = pos_enc_layer_type - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - encoder_dim, - attention_dropout_rate, - do_rel_shift, - adaptive_scale, - init_weights - ) - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - encoder_dim, - encoder_dim * feed_forward_expansion_factor, - feed_forward_dropout_rate, - activation, - adaptive_scale, - init_weights - ) - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = ( - encoder_dim, cnn_module_kernel, activation, - cnn_norm_type, causal, True, adaptive_scale, init_weights) - - self.embed = DepthwiseConv2dSubsampling4( - 1, encoder_dim, - RelPositionalEncoding(encoder_dim, dropout_rate=0.1), - dw_stride, - input_size, - input_dropout_rate, - init_weights - ) - - self.preln = nn.LayerNorm(encoder_dim) - self.encoders = 
torch.nn.ModuleList([SqueezeformerEncoderLayer( - encoder_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - convolution_layer(*convolution_layer_args), - positionwise_layer(*positionwise_layer_args), - normalize_before, - dropout, - concat_after) for _ in range(num_blocks) - ]) - if time_reduction_layer_type == 'conv1d': - time_reduction_layer = TimeReductionLayer1D - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - elif time_reduction_layer_type == 'stream': - time_reduction_layer = TimeReductionLayerStream - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - else: - time_reduction_layer = TimeReductionLayer2D - time_reduction_layer_args = {'encoder_dim': encoder_dim} - - self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args) - self.time_recover_layer = nn.Linear(encoder_dim, encoder_dim) - self.final_proj = None - if output_size != encoder_dim: - self.final_proj = nn.Linear(encoder_dim, output_size) - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - xs_lens = mask_pad.squeeze(1).sum(1) - xs = self.preln(xs) - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - for i, layer in enumerate(self.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, chunk_masks, pos_emb, mask_pad)) - xs, xs_lens, chunk_masks, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_chunk_masks, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - chunk_masks = recover_chunk_masks - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - xs = xs.masked_fill(~mask_pad[:, 0, :].unsqueeze(-1), 0.0) - - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return xs, masks - - def check_ascending_list(self): - if self.reduce_idx is not None: - assert self.reduce_idx == sorted(self.reduce_idx), \ - "reduce_idx should be int or ascending list" - if self.recover_idx is not None: - assert self.recover_idx == sorted(self.recover_idx), \ - "recover_idx should be int or ascending list" - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp 
= exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - recover_exp)) - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.preln(xs) - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, 
recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - if att_mask.size(1) != 0: - xs = xs.masked_fill(~att_mask[:, 0, :].unsqueeze(-1), 0.0) - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(0) - cached_att = cached_att.unsqueeze(3).\ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :]) - r_cnn_cache.append(cached_cnn) - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
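Concretely, the overlapping-window scheme motivated by points 1-3 above trades a little recomputation in the subsampling front end for a much simpler cache. With assumed values (subsampling rate 4 and right context 6, matching the two stacked stride-2 convs elsewhere in this diff, and a chunk size of 16), the window arithmetic works out as follows:

```python
# Illustrative window arithmetic for chunk-by-chunk decoding; the numbers are assumptions.
subsampling, right_context = 4, 6
decoding_chunk_size = 16
context = right_context + 1                                      # add the current frame
stride = subsampling * decoding_chunk_size                       # advance 64 input frames per step
decoding_window = (decoding_chunk_size - 1) * subsampling + context   # feed 67 frames per step

num_frames = 200
chunk_starts = list(range(0, num_frames - context + 1, stride))  # [0, 64, 128, 192]
overlap = decoding_window - stride                               # 3 frames recomputed each step
print(stride, decoding_window, chunk_starts, overlap)
```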
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/squeezeformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/squeezeformer/encoder_layer.py deleted file mode 100644 index 3c6bdd44a20447cea91c0f965c666b844f4264be..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/squeezeformer/encoder_layer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""SqueezeformerEncoderLayer definition.""" - -import torch -import torch.nn as nn -from typing import Optional, Tuple - - -class SqueezeformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward1 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - feed_forward2 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. 
- """ - - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward1: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - feed_forward2: Optional[nn.Module] = None, - normalize_before: bool = False, - dropout_rate: float = 0.1, - concat_after: bool = False, - ): - super(SqueezeformerEncoderLayer, self).__init__() - self.size = size - self.self_attn = self_attn - self.layer_norm1 = nn.LayerNorm(size) - self.ffn1 = feed_forward1 - self.layer_norm2 = nn.LayerNorm(size) - self.conv_module = conv_module - self.layer_norm3 = nn.LayerNorm(size) - self.ffn2 = feed_forward2 - self.layer_norm4 = nn.LayerNorm(size) - self.normalize_before = normalize_before - self.dropout = nn.Dropout(dropout_rate) - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - # self attention module - residual = x - if self.normalize_before: - x = self.layer_norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.layer_norm1(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm2(x) - x = self.ffn1(x) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm2(x) - - # conv module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - residual = x - if self.normalize_before: - x = self.layer_norm3(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm3(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm4(x) - x = self.ffn2(x) - # we do not use dropout here since it is inside feed forward function - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm4(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/squeezeformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/squeezeformer/positionwise_feed_forward.py deleted file mode 100644 index 289062dcf3189f79a5ebb206990160d8665c613c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/squeezeformer/positionwise_feed_forward.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU(), - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.idim = idim - self.hidden_units = hidden_units - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - self.ada_scale = None - self.ada_bias = None - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, idim]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, idim]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - ffn1_max = self.idim ** -0.5 - ffn2_max = self.hidden_units ** -0.5 - torch.nn.init.uniform_(self.w_1.weight.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_1.bias.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_2.weight.data, -ffn2_max, ffn2_max) - torch.nn.init.uniform_(self.w_2.bias.data, -ffn2_max, ffn2_max) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - if self.adaptive_scale: - xs = self.ada_scale * xs + self.ada_bias - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/squeezeformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/squeezeformer/subsampling.py deleted file mode 100644 index fdb0101d6ebb54c42e710bbb0f35a6f7615ca567..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/squeezeformer/subsampling.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -"""DepthwiseConv2dSubsampling4 and TimeReductionLayer definition.""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -from wenet.transformer.subsampling import BaseSubsampling -from typing import Tuple -from wenet.squeezeformer.conv2d import Conv2dValid - - -class DepthwiseConv2dSubsampling4(BaseSubsampling): - """Depthwise Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - pos_enc_class (nn.Module): position encoding class. - dw_stride (int): Whether do depthwise convolution. - input_size (int): filter bank dimension. - - """ - - def __init__( - self, idim: int, odim: int, - pos_enc_class: torch.nn.Module, - dw_stride: bool = False, - input_size: int = 80, - input_dropout_rate: float = 0.1, - init_weights: bool = True - ): - super(DepthwiseConv2dSubsampling4, self).__init__() - self.idim = idim - self.odim = odim - self.pw_conv = nn.Conv2d( - in_channels=idim, out_channels=odim, kernel_size=3, stride=2) - self.act1 = nn.ReLU() - self.dw_conv = nn.Conv2d( - in_channels=odim, out_channels=odim, kernel_size=3, stride=2, - groups=odim if dw_stride else 1 - ) - self.act2 = nn.ReLU() - self.pos_enc = pos_enc_class - self.input_proj = nn.Sequential( - nn.Linear( - odim * (((input_size - 1) // 2 - 1) // 2), odim), - nn.Dropout(p=input_dropout_rate), - ) - if init_weights: - linear_max = (odim * input_size / 4) ** -0.5 - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.weight'], -linear_max, linear_max) - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.bias'], -linear_max, linear_max) - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: int = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.pw_conv(x) - x = self.act1(x) - x = self.dw_conv(x) - x = self.act2(x) - b, c, t, f = x.size() - x = x.permute(0, 2, 1, 3) - x = x.contiguous().view(b, t, c * f) - x, pos_emb = self.pos_enc(x, offset) - x = self.input_proj(x) - return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] - - -class TimeReductionLayer1D(nn.Module): - """ - Modified NeMo, - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. 
- """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 5, stride: int = 2): - super(TimeReductionLayer1D, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - self.padding = max(0, self.kernel_size - self.stride) - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. - if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayer2D(nn.Module): - def __init__( - self, kernel_size: int = 5, stride: int = 2, encoder_dim: int = 256): - super(TimeReductionLayer2D, self).__init__() - self.encoder_dim = encoder_dim - self.kernel_size = kernel_size - self.dw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=(kernel_size, 1), - stride=stride, - valid_trigy=True - ) - self.pw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=1, - stride=1, - valid_trigx=False, - valid_trigy=False, - ) - - self.kernel_size = kernel_size - self.stride = stride - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.encoder_dim ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward( - self, xs: torch.Tensor, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - xs = xs.masked_fill(mask_pad.transpose(1, 2).eq(0), 0.0) - xs = xs.unsqueeze(2) - padding1 = self.kernel_size - self.stride - xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), - mode='constant', value=0.) 
- xs = self.dw_conv(xs.permute(0, 3, 1, 2)) - xs = self.pw_conv(xs).permute(0, 3, 2, 1).squeeze(1).contiguous() - tmp_length = xs.size(1) - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - padding2 = max(0, (xs_lens.max() - tmp_length).data.item()) - batch_size, hidden = xs.size(0), xs.size(-1) - dummy_pad = torch.zeros(batch_size, padding2, hidden, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - mask = mask[:, ::2, ::2] - mask_pad = mask_pad[:, :, ::2] - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayerStream(nn.Module): - """ - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. - """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 1, stride: int = 2): - super(TimeReductionLayerStream, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=0, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. 
- if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transducer/joint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transducer/joint.py deleted file mode 100644 index f7cbaf62ee0bf4ffa127e5bbf4a49a64c2378495..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transducer/joint.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Optional - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation - - -class TransducerJoint(torch.nn.Module): - - def __init__(self, - voca_size: int, - enc_output_size: int, - pred_output_size: int, - join_dim: int, - prejoin_linear: bool = True, - postjoin_linear: bool = False, - joint_mode: str = 'add', - activation: str = "tanh"): - assert check_argument_types() - # TODO(Mddct): concat in future - assert joint_mode in ['add'] - super().__init__() - - self.activatoin = get_activation(activation) - self.prejoin_linear = prejoin_linear - self.postjoin_linear = postjoin_linear - self.joint_mode = joint_mode - - if not self.prejoin_linear and not self.postjoin_linear: - assert enc_output_size == pred_output_size == join_dim - # torchscript compatibility - self.enc_ffn: Optional[nn.Linear] = None - self.pred_ffn: Optional[nn.Linear] = None - if self.prejoin_linear: - self.enc_ffn = nn.Linear(enc_output_size, join_dim) - self.pred_ffn = nn.Linear(pred_output_size, join_dim) - # torchscript compatibility - self.post_ffn: Optional[nn.Linear] = None - if self.postjoin_linear: - self.post_ffn = nn.Linear(join_dim, join_dim) - - self.ffn_out = nn.Linear(join_dim, voca_size) - - def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor): - """ - Args: - enc_out (torch.Tensor): [B, T, E] - pred_out (torch.Tensor): [B, T, P] - Return: - [B,T,U,V] - """ - if (self.prejoin_linear and self.enc_ffn is not None - and self.pred_ffn is not None): - enc_out = self.enc_ffn(enc_out) # [B,T,E] -> [B,T,V] - pred_out = self.pred_ffn(pred_out) - - enc_out = enc_out.unsqueeze(2) # [B,T,V] -> [B,T,1,V] - pred_out = pred_out.unsqueeze(1) # [B,U,V] -> [B,1 U, V] - - # TODO(Mddct): concat joint - _ = self.joint_mode - out = enc_out + pred_out # [B,T,U,V] - - if self.postjoin_linear and self.post_ffn is not None: - out = self.post_ffn(out) - - out = self.activatoin(out) - out = self.ffn_out(out) - return out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transducer/predictor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transducer/predictor.py deleted file mode 100644 index 600e97a9d83646047ec3fc14f3087bd4df761c68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transducer/predictor.py +++ /dev/null @@ -1,482 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation, get_rnn - - -def ApplyPadding(input, padding, pad_value) -> torch.Tensor: - """ - Args: - input: [bs, max_time_step, dim] - padding: [bs, max_time_step] - 
""" - return padding * pad_value + input * (1 - padding) - - -class PredictorBase(torch.nn.Module): - - # NOTE(Mddct): We can use ABC abstract here, but - # keep this class simple enough for now - def __init__(self) -> None: - super().__init__() - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - _, _, _ = batch_size, method, device - raise NotImplementedError("this is a base precictor") - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ): - _, _, = input, cache - raise NotImplementedError("this is a base precictor") - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - _, _, _, = input, padding, cache - raise NotImplementedError("this is a base precictor") - - -class RNNPredictor(PredictorBase): - - def __init__(self, - voca_size: int, - embed_size: int, - output_size: int, - embed_dropout: float, - hidden_size: int, - num_layers: int, - bias: bool = True, - rnn_type: str = "lstm", - dropout: float = 0.1) -> None: - assert check_argument_types() - super().__init__() - self.n_layers = num_layers - self.hidden_size = hidden_size - # disable rnn base out projection - self.embed = nn.Embedding(voca_size, embed_size) - self.dropout = nn.Dropout(embed_dropout) - # NOTE(Mddct): rnn base from torch not support layer norm - # will add layer norm and prune value in cell and layer - # ref: https://github.com/Mddct/neural-lm/blob/main/models/gru_cell.py - self.rnn = get_rnn(rnn_type=rnn_type)(input_size=embed_size, - hidden_size=hidden_size, - num_layers=num_layers, - bias=bias, - batch_first=True, - dropout=dropout) - self.projection = nn.Linear(hidden_size, output_size) - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> torch.Tensor: - """ - Args: - input (torch.Tensor): [batch, max_time). - padding (torch.Tensor): [batch, max_time] - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - Returns: - output: [batch, max_time, output_size] - """ - - # NOTE(Mddct): we don't use pack input format - embed = self.embed(input) # [batch, max_time, emb_size] - embed = self.dropout(embed) - states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None - if cache is None: - state = self.init_state(batch_size=input.size(0), - device=input.device) - states = (state[0], state[1]) - else: - assert len(cache) == 2 - states = (cache[0], cache[1]) - out, (m, c) = self.rnn(embed, states) - out = self.projection(out) - - # NOTE(Mddct): Although we don't use staate in transducer - # training forward, we need make it right for padding value - # so we create forward_step for infering, forward for training - _, _ = m, c - return out - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache: [state_m, state_c] - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - Returns: - new_cache: [[state_m_1, state_c_1], [state_m_2, state_c_2]...] 
- """ - assert len(cache) == 2 - state_ms = cache[0] - state_cs = cache[1] - - assert state_ms.size(1) == state_cs.size(1) - - new_cache: List[List[torch.Tensor]] = [] - for state_m, state_c in zip(torch.split(state_ms, 1, dim=1), - torch.split(state_cs, 1, dim=1)): - new_cache.append([state_m, state_c]) - return new_cache - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[state_m_1, state_c_1], [state_m_1, state_c_1]...] - - Returns: - new_caceh: [state_ms, state_cs], - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - """ - state_ms = torch.cat([states[0] for states in cache], dim=1) - state_cs = torch.cat([states[1] for states in cache], dim=1) - return [state_ms, state_cs] - - def init_state( - self, - batch_size: int, - device: torch.device, - method: str = "zero", - ) -> List[torch.Tensor]: - assert batch_size > 0 - # TODO(Mddct): xavier init method - _ = method - return [ - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device), - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device) - ] - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - """ - assert len(cache) == 2 - state_m, state_c = cache[0], cache[1] - embed = self.embed(input) # [batch, 1, emb_size] - embed = self.dropout(embed) - out, (m, c) = self.rnn(embed, (state_m, state_c)) - - out = self.projection(out) - m = ApplyPadding(m, padding.unsqueeze(0), state_m) - c = ApplyPadding(c, padding.unsqueeze(0), state_c) - - return (out, [m, c]) - - -class EmbeddingPredictor(PredictorBase): - """Embedding predictor - - Described in: - https://arxiv.org/pdf/2109.07513.pdf - - embed-> proj -> layer norm -> swish - """ - - def __init__(self, - voca_size: int, - embed_size: int, - embed_dropout: float, - n_head: int, - history_size: int = 2, - activation: str = "swish", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - - assert check_argument_types() - super().__init__() - # multi head - self.num_heads = n_head - self.embed_size = embed_size - self.context_size = history_size + 1 - self.pos_embed = torch.nn.Linear(embed_size * self.context_size, - self.num_heads, - bias=bias) - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.ffn = nn.Linear(self.embed_size, self.embed_size) - self.norm = nn.LayerNorm(self.embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - _ = method - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device), - ] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] 
- """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - - input = input.unfold(1, self.context_size, 1).permute( - 0, 1, 3, 2) # [bs, seq_len, context_size, embed] - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - # broadcast dot attenton - input_expand = input.unsqueeze( - 2) # [bs, seq_len, 1, context_size, embed] - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - - # [bs, seq_len, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, seq_len, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, seq_len, num_heads, embed] - output = output.sum(dim=2) # [bs, seq_len, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - return output - - def forward_step( - self, - input: torch.Tensor, - padding: torch.Tensor, - cache: List[torch.Tensor], - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input_expand = context_input.unsqueeze(1).unsqueeze( - 2) # [bs, 1, 1, context_size, embed] - - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - # [bs, 1, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, 1, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, 1, num_heads, embed] - output = output.sum(dim=2) # [bs, 1, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - new_cache = context_input[:, 1:, :] - # TODO(Mddct): we need padding new_cache in future - # new_cache = ApplyPadding(history, padding, new_cache) - return (output, [new_cache]) - - -class ConvPredictor(PredictorBase): - - def __init__(self, - voca_size: 
int, - embed_size: int, - embed_dropout: float, - history_size: int = 2, - activation: str = "relu", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - assert check_argument_types() - super().__init__() - - assert history_size >= 0 - self.embed_size = embed_size - self.context_size = history_size + 1 - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.conv = nn.Conv1d(in_channels=embed_size, - out_channels=embed_size, - kernel_size=self.context_size, - padding=0, - groups=embed_size, - bias=bias) - self.norm = nn.LayerNorm(embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - assert method == "zero" - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device) - ] - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] - """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - input = input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - return out - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input = context_input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - - new_cache = context_input[:, 1:, :] - # TODO(Mddct): apply padding in future - return (out, [new_cache]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transducer/search/greedy_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transducer/search/greedy_search.py deleted file mode 100644 index ef7354562b6617b7be33bf32d673117eb1d3d547..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transducer/search/greedy_search.py +++ /dev/null @@ 
-1,54 +0,0 @@ -from typing import List - -import torch - - -def basic_greedy_search( - model: torch.nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - n_steps: int = 64, -) -> List[List[int]]: - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, - method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = n_steps - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, padding, - cache) # [1, 1, P] - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, - pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - - return [hyps] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transducer/search/prefix_beam_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transducer/search/prefix_beam_search.py deleted file mode 100644 index f00917717c16a73916586708ebfede54fa02a21f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transducer/search/prefix_beam_search.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import List, Tuple - -import torch -from wenet.utils.common import log_add - - -class Sequence(): - - __slots__ = {'hyp', 'score', 'cache'} - - def __init__( - self, - hyp: List[torch.Tensor], - score, - cache: List[torch.Tensor], - ): - self.hyp = hyp - self.score = score - self.cache = cache - - -class PrefixBeamSearch(): - - def __init__(self, encoder, predictor, joint, ctc, blank): - self.encoder = encoder - self.predictor = predictor - self.joint = joint - self.ctc = ctc - self.blank = blank - - def forward_decoder_one_step( - self, encoder_x: torch.Tensor, pre_t: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - padding = torch.zeros(pre_t.size(0), 1, device=encoder_x.device) - pre_t, new_cache = self.predictor.forward_step(pre_t.unsqueeze(-1), - padding, cache) - x = self.joint(encoder_x, pre_t) # [beam, 1, 1, vocab] - x = x.log_softmax(dim=-1) - return x, new_cache - - def prefix_beam_search(self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7): - """prefix beam search - also see wenet.transducer.transducer.beam_search - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - 
batch_size = speech.shape[0] - assert batch_size == 1 - - # 1. Encoder - encoder_out, _ = self.encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - - ctc_probs = self.ctc.log_softmax(encoder_out).squeeze(0) - beam_init: List[Sequence] = [] - - # 2. init beam using Sequence to save beam unit - cache = self.predictor.init_state(1, method="zero", device=device) - beam_init.append(Sequence(hyp=[self.blank], score=0.0, cache=cache)) - # 3. start decoding (notice: we use breathwise first searching) - # !!!! In this decoding method: one frame do not output multi units. !!!! - # !!!! Experiments show that this strategy has little impact !!!! - for i in range(maxlen): - # 3.1 building input - # decoder taking the last token to predict the next token - input_hyp = [s.hyp[-1] for s in beam_init] - input_hyp_tensor = torch.tensor(input_hyp, - dtype=torch.int, - device=device) - # building statement from beam - cache_batch = self.predictor.cache_to_batch( - [s.cache for s in beam_init]) - # build score tensor to do torch.add() function - scores = torch.tensor([s.score for s in beam_init]).to(device) - - # 3.2 forward decoder - logp, new_cache = self.forward_decoder_one_step( - encoder_out[:, i, :].unsqueeze(1), - input_hyp_tensor, - cache_batch, - ) # logp: (N, 1, 1, vocab_size) - logp = logp.squeeze(1).squeeze(1) # logp: (N, vocab_size) - new_cache = self.predictor.batch_to_cache(new_cache) - - # 3.3 shallow fusion for transducer score - # and ctc score where we can also add the LM score - logp = torch.log( - torch.add(transducer_weight * torch.exp(logp), - ctc_weight * torch.exp(ctc_probs[i].unsqueeze(0)))) - - # 3.4 first beam prune - top_k_logp, top_k_index = logp.topk(beam_size) # (N, N) - scores = torch.add(scores.unsqueeze(1), top_k_logp) - - # 3.5 generate new beam (N*N) - beam_A = [] - for j in range(len(beam_init)): - # update seq - base_seq = beam_init[j] - for t in range(beam_size): - # blank: only update the score - if top_k_index[j, t] == self.blank: - new_seq = Sequence(hyp=base_seq.hyp.copy(), - score=scores[j, t].item(), - cache=base_seq.cache) - - beam_A.append(new_seq) - # other unit: update hyp score statement and last - else: - hyp_new = base_seq.hyp.copy() - hyp_new.append(top_k_index[j, t].item()) - new_seq = Sequence(hyp=hyp_new, - score=scores[j, t].item(), - cache=new_cache[j]) - beam_A.append(new_seq) - - # 3.6 prefix fusion - fusion_A = [beam_A[0]] - for j in range(1, len(beam_A)): - s1 = beam_A[j] - if_do_append = True - for t in range(len(fusion_A)): - # notice: A_ can not fusion with A - if s1.hyp == fusion_A[t].hyp: - fusion_A[t].score = log_add( - [fusion_A[t].score, s1.score]) - if_do_append = False - break - if if_do_append: - fusion_A.append(s1) - - # 4. 
second pruned - fusion_A.sort(key=lambda x: x.score, reverse=True) - beam_init = fusion_A[:beam_size] - - return beam_init, encoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transducer/transducer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transducer/transducer.py deleted file mode 100644 index 821a0946e621353a18bededbd93a658e83b0e0e2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transducer/transducer.py +++ /dev/null @@ -1,453 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchaudio -from torch import nn -from torch.nn.utils.rnn import pad_sequence -from typeguard import check_argument_types -from wenet.transducer.predictor import PredictorBase -from wenet.transducer.search.greedy_search import basic_greedy_search -from wenet.transducer.search.prefix_beam_search import PrefixBeamSearch -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_blank, add_sos_eos, - reverse_pad_list) - - -class Transducer(ASRModel): - """Transducer-ctc-attention hybrid Encoder-Predictor-Decoder model""" - - def __init__( - self, - vocab_size: int, - blank: int, - encoder: nn.Module, - predictor: PredictorBase, - joint: nn.Module, - attention_decoder: Optional[Union[TransformerDecoder, - BiTransformerDecoder]] = None, - ctc: Optional[CTC] = None, - ctc_weight: float = 0, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - transducer_weight: float = 1.0, - attention_weight: float = 0.0, - ) -> None: - assert check_argument_types() - assert attention_weight + ctc_weight + transducer_weight == 1.0 - super().__init__(vocab_size, encoder, attention_decoder, ctc, - ctc_weight, ignore_id, reverse_weight, lsm_weight, - length_normalized_loss) - - self.blank = blank - self.transducer_weight = transducer_weight - self.attention_decoder_weight = 1 - self.transducer_weight - self.ctc_weight - - self.predictor = predictor - self.joint = joint - self.bs = None - - # Note(Mddct): decoder also means predictor in transducer, - # but here decoder is attention decoder - del self.criterion_att - if attention_decoder is not None: - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + predictor + joint + loss - - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - - # Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - # predictor - ys_in_pad = add_blank(text, self.blank, self.ignore_id) - predictor_out = self.predictor(ys_in_pad) - # joint - joint_out = self.joint(encoder_out, predictor_out) - # NOTE(Mddct): some loss implementation require pad valid is zero - # torch.int32 rnnt_loss required - rnnt_text = text.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - rnnt_text_lengths = text_lengths.to(torch.int32) - encoder_out_lens = encoder_out_lens.to(torch.int32) - loss = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - encoder_out_lens, - rnnt_text_lengths, - blank=self.blank, - reduction="mean") - loss_rnnt = loss - - loss = self.transducer_weight * loss - # optional attention decoder - loss_att: Optional[torch.Tensor] = None - if self.attention_decoder_weight != 0.0 and self.decoder is not None: - loss_att, _ = self._calc_att_loss(encoder_out, encoder_mask, text, - text_lengths) - - # optional ctc - loss_ctc: Optional[torch.Tensor] = None - if self.ctc_weight != 0.0 and self.ctc is not None: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is not None: - loss = loss + self.ctc_weight * loss_ctc.sum() - if loss_att is not None: - loss = loss + self.attention_decoder_weight * loss_att.sum() - # NOTE: 'loss' must be in dict - return { - 'loss': loss, - 'loss_att': loss_att, - 'loss_ctc': loss_ctc, - 'loss_rnnt': loss_rnnt, - } - - def init_bs(self): - if self.bs is None: - self.bs = PrefixBeamSearch(self.encoder, self.predictor, - self.joint, self.ctc, self.blank) - - def _cal_transducer_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_lens: torch.Tensor, - hyps_pad: torch.Tensor, - ): - # ignore id -> blank, add blank at head - hyps_pad_blank = add_blank(hyps_pad, self.blank, self.ignore_id) - xs_in_lens = encoder_mask.squeeze(1).sum(1).int() - - # 1. Forward predictor - predictor_out = self.predictor(hyps_pad_blank) - # 2. Forward joint - joint_out = self.joint(encoder_out, predictor_out) - rnnt_text = hyps_pad.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - # 3. 
Compute transducer loss - loss_td = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - xs_in_lens, - hyps_lens.int(), - blank=self.blank, - reduction='none') - return loss_td * -1 - - def _cal_attn_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_pad: torch.Tensor, - hyps_lens: torch.Tensor, - ): - # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - - # td_score = loss_td * -1 - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - self.reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - return decoder_out, r_decoder_out - - def beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7, - ): - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight in transducer - prefix beam search. - final_prob = ctc_weight * ctc_prob + transducer_weight * transducer_prob - transducer_weight (float): transducer probability weight in - prefix beam search - Returns: - List[List[int]]: best path result - - """ - self.init_bs() - beam, _ = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size, - beam_size, - num_decoding_left_chunks, - simulate_streaming, - ctc_weight, - transducer_weight, - ) - return beam[0].hyp[1:], beam[0].score - - def transducer_attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ctc_weight: float = 0.0, - attn_weight: float = 0.0, - transducer_weight: float = 0.0, - search_ctc_weight: float = 1.0, - search_transducer_weight: float = 0.0, - beam_search_type: str = 'transducer') -> List[List[int]]: - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight using in rescoring. - rescore_prob = ctc_weight * ctc_prob + - transducer_weight * (transducer_loss * -1) + - attn_weight * attn_prob - attn_weight (float): attn probability weight using in rescoring. - transducer_weight (float): transducer probability weight using in - rescoring - search_ctc_weight (float): ctc weight using - in rnnt beam search (seeing in self.beam_search) - search_transducer_weight (float): transducer weight using - in rnnt beam search (seeing in self.beam_search) - Returns: - List[List[int]]: best path result - - """ - - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - self.init_bs() - if beam_search_type == 'transducer': - beam, encoder_out = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - beam_size=beam_size, - num_decoding_left_chunks=num_decoding_left_chunks, - ctc_weight=search_ctc_weight, - transducer_weight=search_transducer_weight, - ) - beam_score = [s.score for s in beam] - hyps = [s.hyp[1:] for s in beam] - - elif beam_search_type == 'ctc': - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, - speech_lengths, - beam_size=beam_size, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) - beam_score = [hyp[1] for hyp in hyps] - hyps = [hyp[0] for hyp in hyps] - assert len(hyps) == beam_size - - # build hyps and encoder output - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - - # 2.1 calculate transducer score - td_score = self._cal_transducer_score( - encoder_out, - encoder_mask, - hyps_lens, - hyps_pad, - ) - # 2.2 calculate attention score - decoder_out, r_decoder_out = self._cal_attn_score( - encoder_out, - encoder_mask, - hyps_pad, - hyps_lens, - ) - - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp)][self.eos] - td_s = td_score[i] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp): - r_score += r_decoder_out[i][len(hyp) - j - 1][w] - r_score += r_decoder_out[i][len(hyp)][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score = score * attn_weight + \ - beam_score[i] * ctc_weight + \ - td_s * transducer_weight - if score > best_score: - best_score = score - best_index = i - - return hyps[best_index], best_score - - def greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, 
- num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - n_steps: int = 64, - ) -> List[List[int]]: - """ greedy search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - # TODO(Mddct): batch decode - assert speech.size(0) == 1 - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - # TODO(Mddct): forward chunk by chunk - _ = simulate_streaming - # Let's assume B = batch_size - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size, - num_decoding_left_chunks, - ) - encoder_out_lens = encoder_mask.squeeze(1).sum() - hyps = basic_greedy_search(self, - encoder_out, - encoder_out_lens, - n_steps=n_steps) - - return hyps - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def forward_predictor_step( - self, xs: torch.Tensor, cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - assert len(cache) == 2 - # fake padding - padding = torch.zeros(1, 1) - return self.predictor.forward_step(xs, padding, cache) - - @torch.jit.export - def forward_joint_step(self, enc_out: torch.Tensor, - pred_out: torch.Tensor) -> torch.Tensor: - return self.joint(enc_out, pred_out) - - @torch.jit.export - def forward_predictor_init_state(self) -> List[torch.Tensor]: - return self.predictor.init_state(1, device=torch.device("cpu")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/asr_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/asr_model.py deleted file mode 100644 index 4288f68472d63ce4bf270c5f377d62fa7408713e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/asr_model.py +++ /dev/null @@ -1,904 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
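Before the next removed file, a small numeric sketch of how `transducer_attention_rescoring` above combines scores per hypothesis: the attention decoder log-probabilities (including the EOS step) are scaled by `attn_weight`, the search score by `ctc_weight`, and the negated RNN-T loss by `transducer_weight`, and the hypothesis with the highest combined score wins. All tensors and numbers below are invented for illustration, and the right-to-left `reverse_weight` blend is left out for brevity.

```python
import torch

torch.manual_seed(0)
beam_size, max_len, vocab, eos = 3, 4, 6, 5      # made-up dimensions
hyps = [[1, 2], [1, 3, 2], [4]]                  # fake beam of token sequences
decoder_out = torch.log_softmax(torch.randn(beam_size, max_len + 1, vocab), dim=-1)
beam_score = [-1.2, -0.9, -2.5]                  # scores from the search pass
td_score = [-3.0, -2.4, -4.1]                    # negated RNN-T losses
attn_weight, ctc_weight, transducer_weight = 0.5, 0.3, 0.2

best_score, best_index = -float("inf"), 0
for i, hyp in enumerate(hyps):
    # Sum the decoder log-probs along the hypothesis, then add the EOS step.
    score = sum(decoder_out[i][j][w].item() for j, w in enumerate(hyp))
    score += decoder_out[i][len(hyp)][eos].item()
    score = (score * attn_weight
             + beam_score[i] * ctc_weight
             + td_score[i] * transducer_weight)
    if score > best_score:
        best_score, best_index = score, i
print(best_index, best_score)
```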
-# Modified from ESPnet(https://github.com/espnet/espnet) - -from collections import defaultdict -from typing import Dict, List, Optional, Tuple - -import torch - -from torch.nn.utils.rnn import pad_sequence - -try: - import k2 - from icefall.utils import get_texts - from icefall.decode import get_lattice, Nbest, one_best_decoding -except ImportError: - print('Failed to import k2 and icefall. \ - Notice that they are necessary for hlg_onebest and hlg_rescore') - -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import TransformerEncoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, - remove_duplicates_and_blank, th_accuracy, - reverse_pad_list) -from wenet.utils.mask import (make_pad_mask, mask_finished_preds, - mask_finished_scores, subsequent_mask) - - -class ASRModel(torch.nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - def __init__( - self, - vocab_size: int, - encoder: TransformerEncoder, - decoder: TransformerDecoder, - ctc: CTC, - ctc_weight: float = 0.5, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - ): - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - - self.encoder = encoder - self.decoder = decoder - self.ctc = ctc - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - # 1. Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # 2a. Attention-decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths) - else: - loss_att = None - - # 2b. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is None: - loss = loss_att - elif loss_att is None: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + (1 - - self.ctc_weight) * loss_att - return {"loss": loss, "loss_att": loss_att, "loss_ctc": loss_ctc} - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, float]: - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # reverse the seq, used for right to left decoder - r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) - r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, - self.ignore_id) - # 1. Forward decoder - decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, - ys_in_pad, ys_in_lens, - r_ys_in_pad, - self.reverse_weight) - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - r_loss_att = torch.tensor(0.0) - if self.reverse_weight > 0.0: - r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * ( - 1 - self.reverse_weight) + r_loss_att * self.reverse_weight - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - return loss_att, acc_att - - def _forward_encoder( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Let's assume B = batch_size - # 1. Encoder - if simulate_streaming and decoding_chunk_size > 0: - encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( - speech, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - else: - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - return encoder_out, encoder_mask - - def recognize( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int = 10, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> torch.Tensor: - """ Apply beam search on attention decoder - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - torch.Tensor: decoding result, (batch, max_result_len) - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = torch.ones([running_size, 1], dtype=torch.long, - device=device).fill_(self.sos) # (B*N, 1) - scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), - dtype=torch.float) - scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) - cache: Optional[List[torch.Tensor]] = None - # 2. Decoder forward step by step - for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: - break - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) - logp, cache = self.decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - # 2.3 Second beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - # Update cache to be consistent with new topk scores / hyps - cache_index = (offset_k_index // beam_size).view(-1) # (B*N) - base_cache_index = (torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) * beam_size).view(-1) # (B*N) - cache_index = base_cache_index + cache_index - cache = [torch.index_select(c, dim=0, index=cache_index) for c in cache] - scores = scores.view(-1, 1) # (B*N, 1) - # 2.4. Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = torch.index_select(top_k_index.view(-1), - dim=-1, - index=best_k_index) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = torch.index_select( - hyps, dim=0, index=best_hyps_index) # (B*N, i) - hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) - - # 3. 
Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_scores, best_index = scores.max(dim=-1) - best_hyps_index = best_index + torch.arange( - batch_size, dtype=torch.long, device=device) * beam_size - best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) - best_hyps = best_hyps[:, 1:] - return best_hyps, best_scores - - def ctc_greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[List[int]]: - """ Apply CTC greedy search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # Let's assume B = batch_size - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - return hyps, scores - - def _ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[List[List[int]], torch.Tensor]: - """ CTC prefix beam search inner implementation - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[List[int]]: nbest results - torch.Tensor: encoder output, (1, max_len, encoder_dim), - it will be used for rescoring in attention rescoring mode - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # For CTC prefix beam search, we only support batch_size=1 - assert batch_size == 1 - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder forward and get CTC score - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - ctc_probs = ctc_probs.squeeze(0) - # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) - cur_hyps = [(tuple(), (0.0, -float('inf')))] - # 2. CTC beam search step by step - for t in range(0, maxlen): - logp = ctc_probs[t] # (vocab_size,) - # key: prefix, value (pb, pnb), default value(-inf, -inf) - next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) - # 2.1 First beam prune: select topk best - top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) - for s in top_k_index: - s = s.item() - ps = logp[s].item() - for prefix, (pb, pnb) in cur_hyps: - last = prefix[-1] if len(prefix) > 0 else None - if s == 0: # blank - n_pb, n_pnb = next_hyps[prefix] - n_pb = log_add([n_pb, pb + ps, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - elif s == last: - # Update *ss -> *s; - n_pb, n_pnb = next_hyps[prefix] - n_pnb = log_add([n_pnb, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - # Update *s-s -> *ss, - is for blank - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - else: - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - - # 2.2 Second beam prune - next_hyps = sorted(next_hyps.items(), - key=lambda x: log_add(list(x[1])), - reverse=True) - cur_hyps = next_hyps[:beam_size] - hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] - return hyps, encoder_out - - def ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[int]: - """ Apply CTC prefix beam search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[int]: CTC prefix beam search nbest results - """ - hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, - beam_size, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) - return hyps[0] - - def attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - ctc_weight: float = 0.0, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ) -> List[int]: - """ Apply attention rescoring decoding, CTC prefix beam search - is applied first to get nbest, then we resoring the nbest on - attention decoder with corresponding encoder out - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - reverse_weight (float): right to left decoder weight - ctc_weight (float): ctc score weight - - Returns: - List[int]: Attention rescoring result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, speech_lengths, beam_size, decoding_chunk_size, - num_decoding_left_chunks, simulate_streaming) - - assert len(hyps) == beam_size - hyps_pad = pad_sequence([ - torch.tensor(hyp[0], device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
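For reference, the `_ctc_prefix_beam_search` removed a little further up keeps two log-probabilities per prefix (prefix ending in blank vs. ending in a non-blank) and merges competing extensions with `log_add`. A minimal standalone sketch of that recursion, assuming `log_probs` is a plain (T, vocab) table of per-frame log-probabilities and blank has id 0 as in the removed code; the function and argument names here are illustrative only:

```python
import math
from collections import defaultdict

def log_add(args):
    """log(sum(exp(a) for a in args)), guarded against all -inf inputs."""
    if all(a == -float('inf') for a in args):
        return -float('inf')
    m = max(args)
    return m + math.log(sum(math.exp(a - m) for a in args))

def ctc_prefix_beam_search(log_probs, beam_size=10, blank=0):
    # prefix -> (log P(prefix, ends in blank), log P(prefix, ends in non-blank))
    cur_hyps = [(tuple(), (0.0, -float('inf')))]
    for frame in log_probs:
        next_hyps = defaultdict(lambda: (-float('inf'), -float('inf')))
        # first prune: only consider the top-k symbols of this frame
        top_k = sorted(range(len(frame)), key=lambda i: frame[i], reverse=True)[:beam_size]
        for s in top_k:
            ps = frame[s]
            for prefix, (pb, pnb) in cur_hyps:
                last = prefix[-1] if prefix else None
                if s == blank:
                    n_pb, n_pnb = next_hyps[prefix]
                    next_hyps[prefix] = (log_add([n_pb, pb + ps, pnb + ps]), n_pnb)
                elif s == last:
                    # repeated symbol: either merge into the same prefix (*a + a -> *a) ...
                    n_pb, n_pnb = next_hyps[prefix]
                    next_hyps[prefix] = (n_pb, log_add([n_pnb, pnb + ps]))
                    # ... or start a new occurrence across a blank (*a- + a -> *aa)
                    n_prefix = prefix + (s,)
                    n_pb, n_pnb = next_hyps[n_prefix]
                    next_hyps[n_prefix] = (n_pb, log_add([n_pnb, pb + ps]))
                else:
                    n_prefix = prefix + (s,)
                    n_pb, n_pnb = next_hyps[n_prefix]
                    next_hyps[n_prefix] = (n_pb, log_add([n_pnb, pb + ps, pnb + ps]))
        # second prune: keep the best beam_size prefixes by total score
        cur_hyps = sorted(next_hyps.items(),
                          key=lambda x: log_add(list(x[1])), reverse=True)[:beam_size]
    return [(prefix, log_add(list(pair))) for prefix, pair in cur_hyps]
```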
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp[0]): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp[0])][self.eos] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp[0]): - r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] - r_score += r_decoder_out[i][len(hyp[0])][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score += hyp[1] * ctc_weight - if score > best_score: - best_score = score - best_index = i - return hyps[best_index][0], best_score - - def load_hlg_resource_if_necessary(self, hlg, word): - if not hasattr(self, 'hlg'): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.hlg = k2.Fsa.from_dict(torch.load(hlg, map_location=device)) - if not hasattr(self.hlg, "lm_scores"): - self.hlg.lm_scores = self.hlg.scores.clone() - if not hasattr(self, 'word_table'): - self.word_table = {} - with open(word, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - self.word_table[int(arr[1])] = arr[0] - - @torch.no_grad() - def hlg_onebest( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - best_path = one_best_decoding(lattice=lattice, use_double_scores=True) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.no_grad() - def hlg_rescore( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - lm_scale: float = 0, - decoder_scale: float = 0, - r_decoder_scale: float = 0, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - device = speech.device - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - 
search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=100, - use_double_scores=True, - nbest_scale=0.5,) - nbest = nbest.intersect(lattice) - assert hasattr(nbest.fsa, "lm_scores") - assert hasattr(nbest.fsa, "tokens") - assert isinstance(nbest.fsa.tokens, torch.Tensor) - - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.tokens) - tokens = tokens.remove_values_leq(0) - hyps = tokens.tolist() - - # cal attention_score - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out_repeat = [] - tot_scores = nbest.tot_scores() - repeats = [tot_scores[i].shape[0] for i in range(tot_scores.dim0)] - for i in range(len(encoder_out)): - encoder_out_repeat.append(encoder_out[i: i + 1].repeat(repeats[i], 1, 1)) - encoder_out = torch.concat(encoder_out_repeat, dim=0) - encoder_mask = torch.ones(encoder_out.size(0), - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - reverse_weight = 0.5 - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
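The rescoring arithmetic used by `attention_rescoring` above (and, with extra lattice terms, by `hlg_rescore`) reduces to a small per-hypothesis formula. A hedged sketch, assuming `decoder_out` and `r_decoder_out` are the (max_len + 1, vocab) log-softmax outputs for one hypothesis and are indexable like tensors or arrays; the helper name and argument layout are illustrative, not part of the removed code:

```python
def rescore_hypothesis(decoder_out, r_decoder_out, hyp, ctc_score, eos,
                       reverse_weight=0.0, ctc_weight=0.5):
    """Combine attention-decoder and CTC scores for one n-best hypothesis.

    hyp is the token id list without sos/eos; ctc_score is its CTC prefix score.
    """
    # left-to-right decoder: log-prob of every token, plus the final eos
    score = sum(decoder_out[j, w] for j, w in enumerate(hyp))
    score = score + decoder_out[len(hyp), eos]
    if reverse_weight > 0:
        # right-to-left decoder reads the hypothesis backwards
        r_score = sum(r_decoder_out[len(hyp) - j - 1, w] for j, w in enumerate(hyp))
        r_score = r_score + r_decoder_out[len(hyp), eos]
        score = score * (1 - reverse_weight) + r_score * reverse_weight
    # finally interpolate with the CTC prefix score
    return score + ctc_weight * ctc_score
```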
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out - - decoder_scores = torch.tensor([sum([decoder_out[i, j, hyps[i][j]] - for j in range(len(hyps[i]))]) - for i in range(len(hyps))], device=device) - r_decoder_scores = [] - for i in range(len(hyps)): - score = 0 - for j in range(len(hyps[i])): - score += r_decoder_out[i, len(hyps[i]) - j - 1, hyps[i][j]] - score += r_decoder_out[i, len(hyps[i]), self.eos] - r_decoder_scores.append(score) - r_decoder_scores = torch.tensor(r_decoder_scores, device=device) - - am_scores = nbest.compute_am_scores() - ngram_lm_scores = nbest.compute_lm_scores() - tot_scores = am_scores.values + lm_scale * ngram_lm_scores.values + \ - decoder_scale * decoder_scores + r_decoder_scale * r_decoder_scores - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.jit.export - def subsampling_rate(self) -> int: - """ Export interface for c++ call, return subsampling_rate of the - model - """ - return self.encoder.embed.subsampling_rate - - @torch.jit.export - def right_context(self) -> int: - """ Export interface for c++ call, return right_context of the model - """ - return self.encoder.embed.right_context - - @torch.jit.export - def sos_symbol(self) -> int: - """ Export interface for c++ call, return sos symbol id of the model - """ - return self.sos - - @torch.jit.export - def eos_symbol(self) -> int: - """ Export interface for c++ call, return eos symbol id of the model - """ - return self.eos - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, give input chunk xs, and return - output from time 0 to current chunk. - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
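A caller-side sketch of how the exported `forward_encoder_chunk` interface described above is typically driven, chunk by chunk. The framing constants assume the usual Conv2d subsampling (factor 4, right context 6, hence `context = 7`); those constants and the driver function itself are assumptions for illustration, not part of the removed file:

```python
import torch

@torch.no_grad()
def stream_encode(model, feats, chunk_size=16, num_left_chunks=4,
                  subsampling=4, context=7):
    """Feed (1, T, feat_dim) features through forward_encoder_chunk in a streaming loop."""
    stride = subsampling * chunk_size                   # raw frames consumed per step
    window = (chunk_size - 1) * subsampling + context   # raw frames fed per step
    required_cache_size = chunk_size * num_left_chunks
    att_cache = torch.zeros(0, 0, 0, 0)                 # empty caches for the first chunk
    cnn_cache = torch.zeros(0, 0, 0, 0)
    offset, outputs = 0, []
    for cur in range(0, feats.size(1) - context + 1, stride):
        chunk = feats[:, cur:cur + window, :]
        y, att_cache, cnn_cache = model.forward_encoder_chunk(
            chunk, offset, required_cache_size, att_cache, cnn_cache)
        outputs.append(y)
        offset += y.size(1)
    return torch.cat(outputs, dim=1)                    # (1, total encoder frames, dim)
```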
- - """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor: - """ Export interface for c++ call, apply linear transform and log - softmax before ctc - Args: - xs (torch.Tensor): encoder output - - Returns: - torch.Tensor: activation before ctc - - """ - return self.ctc.log_softmax(xs) - - @torch.jit.export - def is_bidirectional_decoder(self) -> bool: - """ - Returns: - torch.Tensor: decoder output - """ - if hasattr(self.decoder, 'right_decoder'): - return True - else: - return False - - @torch.jit.export - def forward_attention_decoder( - self, - hyps: torch.Tensor, - hyps_lens: torch.Tensor, - encoder_out: torch.Tensor, - reverse_weight: float = 0, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, forward decoder with multiple - hypothesis from ctc prefix beam search and one encoder output - Args: - hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad sos at the begining - hyps_lens (torch.Tensor): length of each hyp in hyps - encoder_out (torch.Tensor): corresponding encoder output - r_hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad eos at the begining which is used fo right to left decoder - reverse_weight: used for verfing whether used right to left decoder, - > 0 will use. - - Returns: - torch.Tensor: decoder output - """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps - encoder_out = encoder_out.repeat(num_hyps, 1, 1) - encoder_mask = torch.ones(num_hyps, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=encoder_out.device) - - # input for right to left decoder - # this hyps_lens has count token, we need minus it. - r_hyps_lens = hyps_lens - 1 - # this hyps has included token, so it should be - # convert the original hyps. - r_hyps = hyps[:, 1:] - # >>> r_hyps - # >>> tensor([[ 1, 2, 3], - # >>> [ 9, 8, 4], - # >>> [ 2, -1, -1]]) - # >>> r_hyps_lens - # >>> tensor([3, 3, 1]) - - # NOTE(Mddct): `pad_sequence` is not supported by ONNX, it is used - # in `reverse_pad_list` thus we have to refine the below code. 
- # Issue: https://github.com/wenet-e2e/wenet/issues/1113 - # Equal to: - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = torch.max(r_hyps_lens) - index_range = torch.arange(0, max_len, 1).to(encoder_out.device) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - # >>> seq_mask - # >>> tensor([[ True, True, True], - # >>> [ True, True, True], - # >>> [ True, False, False]]) - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - r_hyps = torch.gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = torch.where(seq_mask, r_hyps, self.eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - r_hyps = torch.cat([hyps[:, 0:1], r_hyps], dim=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps, hyps_lens, r_hyps, - reverse_weight) # (num_hyps, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - - # right to left decoder may be not used during decoding process, - # which depends on reverse_weight param. - # r_dccoder_out will be 0.0, if reverse_weight is 0.0 - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - return decoder_out, r_decoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/attention.py deleted file mode 100644 index 6ee5e313edf2e88a844ce004c0f819b0bd3260f6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/attention.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple - -import torch -from torch import nn - - -class MultiHeadedAttention(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, n_head: int, n_feat: int, dropout_rate: float): - """Construct an MultiHeadedAttention object.""" - super().__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward_qkv( - self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor, size - (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor, size - (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor, size - (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. 
- - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - 1.When applying cross attention between decoder and encoder, - the batch padding mask for input is in (#batch, 1, T) shape. - 2.When applying self attention of encoder, - the mask is in (#batch, T, T) shape. - 3.When applying self attention of decoder, - the mask is in (#batch, L, L) shape. - 4.If the different position in decoder see different block - of the encoder, such as Mocha, the passed in mask could be - in (#batch, L, T) shape. But there is no such case in current - Wenet. - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - """ - q, k, v = self.forward_qkv(query, key, value) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. - new_cache = torch.cat((k, v), dim=-1) - - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - return self.forward_attention(v, scores, mask), new_cache - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). 
- zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
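Stripped of heads, masks and the relative-position terms, the key/value cache handling that both removed attention classes share is just a concat-before / split-after around the attention product. A minimal sketch; the helper name is illustrative, and masking and dropout are omitted:

```python
import torch

def attend_with_cache(q, k, v, cache):
    """q, k, v: (batch, head, time, d_k); cache: (batch, head, cache_t, 2 * d_k),
    or an empty (0, 0, 0, 0) tensor for the first chunk."""
    if cache.size(0) > 0:
        key_cache, value_cache = torch.split(cache, cache.size(-1) // 2, dim=-1)
        k = torch.cat([key_cache, k], dim=2)     # prepend cached keys along time
        v = torch.cat([value_cache, v], dim=2)
    new_cache = torch.cat((k, v), dim=-1)        # stored by the caller for the next chunk
    scores = torch.matmul(q, k.transpose(-2, -1)) / (q.size(-1) ** 0.5)
    attn = torch.softmax(scores, dim=-1)
    return torch.matmul(attn, v), new_cache
```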
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/cmvn.py deleted file mode 100644 index 3a1e7457fd3788d9a7e031e96517505a65925102..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/cmvn.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -class GlobalCMVN(torch.nn.Module): - def __init__(self, - mean: torch.Tensor, - istd: torch.Tensor, - norm_var: bool = True): - """ - Args: - mean (torch.Tensor): mean stats - istd (torch.Tensor): inverse std, std which is 1.0 / std - """ - super().__init__() - assert mean.shape == istd.shape - self.norm_var = norm_var - # The buffer can be accessed from this module using self.mean - self.register_buffer("mean", mean) - self.register_buffer("istd", istd) - - def forward(self, x: torch.Tensor): - """ - Args: - x (torch.Tensor): (batch, max_len, feat_dim) - - Returns: - (torch.Tensor): normalized feature - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/convolution.py deleted file mode 100644 index 2cf9794e14ea7441ccd30ab52202ac02fb25c2b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/convolution.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). 
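The causal variant described above (padding = 0, `lorder = kernel_size - 1`) gets its left context either from zero padding on the first chunk or from a cache holding the previous chunk's last `lorder` frames. A condensed sketch of just that step, with an illustrative helper name; the real module also applies the GLU, the norm and the second pointwise conv:

```python
import torch
from torch import nn

def causal_depthwise_step(x, depthwise_conv, lorder, cache):
    """x: (batch, channels, time); cache: (batch, channels, cache_t) or a (0, 0, 0) tensor."""
    if cache.size(2) == 0:
        # first chunk: fake the missing left context with zeros
        x = nn.functional.pad(x, (lorder, 0), 'constant', 0.0)
    else:
        x = torch.cat((cache, x), dim=2)   # real left context from the previous chunk
    new_cache = x[:, :, -lorder:]          # keep the rightmost lorder frames for next time
    return depthwise_conv(x), new_cache
```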
- """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. - new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/ctc.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/ctc.py deleted file mode 100644 index 3dfcbaa324ffc26afa9ceaeb75007eb312546326..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/ctc.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -import torch -import torch.nn.functional as F -from typeguard import check_argument_types - - -class CTC(torch.nn.Module): - """CTC module""" - def __init__( - self, - odim: int, - encoder_output_size: int, - dropout_rate: float = 0.0, - reduce: bool = True, - ): - """ Construct CTC module - Args: - odim: dimension of outputs - encoder_output_size: number of encoder projection units - dropout_rate: dropout rate (0.0 ~ 1.0) - reduce: reduce the CTC loss into a scalar - """ - assert check_argument_types() - super().__init__() - eprojs = encoder_output_size - self.dropout_rate = dropout_rate - self.ctc_lo = torch.nn.Linear(eprojs, odim) - - reduction_type = "sum" if reduce else "none" - self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) - - def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, - ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: - """Calculate CTC loss. 
- - Args: - hs_pad: batch of padded hidden state sequences (B, Tmax, D) - hlens: batch of lengths of hidden state sequences (B) - ys_pad: batch of padded character id sequence tensor (B, Lmax) - ys_lens: batch of lengths of character sequence (B) - """ - # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) - ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) - # ys_hat: (B, L, D) -> (L, B, D) - ys_hat = ys_hat.transpose(0, 1) - ys_hat = ys_hat.log_softmax(2) - loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) - # Batch-size average - loss = loss / ys_hat.size(1) - return loss - - def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """log_softmax of frame activations - - Args: - Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) - """ - return F.log_softmax(self.ctc_lo(hs_pad), dim=2) - - def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """argmax of frame activations - - Args: - torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: argmax applied 2d tensor (B, Tmax) - """ - return torch.argmax(self.ctc_lo(hs_pad), dim=2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/decoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/decoder.py deleted file mode 100644 index c31853d9e868c99290b8d597f53d9a680202c82c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/decoder.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Decoder definition.""" -from typing import Tuple, List, Optional - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.decoder_layer import DecoderLayer -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.utils.mask import (subsequent_mask, make_pad_mask) - - -class TransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. 
- concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__() - attention_dim = encoder_output_size - - if input_layer == "embed": - self.embed = torch.nn.Sequential( - torch.nn.Embedding(vocab_size, attention_dim), - PositionalEncoding(attention_dim, positional_dropout_rate), - ) - else: - raise ValueError(f"only 'embed' is supported: {input_layer}") - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) - self.use_output_layer = use_output_layer - self.output_layer = torch.nn.Linear(attention_dim, vocab_size) - self.num_blocks = num_blocks - self.decoders = torch.nn.ModuleList([ - DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(self.num_blocks) - ]) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor = torch.empty(0), - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: not used in transformer decoder, in order to unify api - with bidirectional decoder - reverse_weight: not used in transformer decoder, in order to unify - api with bidirectional decode - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - torch.tensor(0.0), in order to unify api with bidirectional decoder - olens: (batch, ) - """ - tgt = ys_in_pad - maxlen = tgt.size(1) - # tgt_mask: (B, 1, L) - tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) - tgt_mask = tgt_mask.to(tgt.device) - # m: (1, L, L) - m = subsequent_mask(tgt_mask.size(-1), - device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m - x, _ = self.embed(tgt) - for layer in self.decoders: - x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, - memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.use_output_layer: - x = self.output_layer(x) - olens = tgt_mask.sum(1) - return x, torch.tensor(0.0), olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - x, _ = self.embed(tgt) - new_cache = [] - for i, decoder in enumerate(self.decoders): - if cache is None: - c = None - else: - c = cache[i] - x, tgt_mask, memory, memory_mask = decoder(x, - tgt_mask, - memory, - memory_mask, - cache=c) - new_cache.append(x) - if self.normalize_before: - y = self.after_norm(x[:, -1]) - else: - y = x[:, -1] - if self.use_output_layer: - y = torch.log_softmax(self.output_layer(y), dim=-1) - return y, new_cache - - -class BiTransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - r_num_blocks: the number of right to left decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - r_num_blocks: int = 0, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - - assert check_argument_types() - super().__init__() - self.left_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - self.right_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - r_num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor, - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), - used for right to left decoder - reverse_weight: used for right to left decoder - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - r_x: x: decoded token score (right to left decoder) - before softmax (batch, maxlen_out, vocab_size) - if use_output_layer is True, - olens: (batch, ) - """ - l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, - ys_in_lens) - r_x = torch.tensor(0.0) - if reverse_weight > 0.0: - r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, - ys_in_lens) - return l_x, r_x, olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - return self.left_decoder.forward_one_step(memory, memory_mask, tgt, - tgt_mask, cache) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/decoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/decoder_layer.py deleted file mode 100644 index 6b52aa6ab730dc51b18f0787e8236ab10c1e9cad..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/decoder_layer.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoder self-attention layer definition.""" -from typing import Optional, Tuple - -import torch -from torch import nn - - -class DecoderLayer(nn.Module): - """Single decoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn (torch.nn.Module): Inter-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. 
- dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's inpu - and output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: nn.Module, - src_attn: nn.Module, - feed_forward: nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an DecoderLayer object.""" - super().__init__() - self.size = size - self.self_attn = self_attn - self.src_attn = src_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.norm3 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear1 = nn.Linear(size + size, size) - self.concat_linear2 = nn.Linear(size + size, size) - else: - self.concat_linear1 = nn.Identity() - self.concat_linear2 = nn.Identity() - - def forward( - self, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - memory: torch.Tensor, - memory_mask: torch.Tensor, - cache: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute decoded features. - - Args: - tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). - tgt_mask (torch.Tensor): Mask for input tensor - (#batch, maxlen_out). - memory (torch.Tensor): Encoded memory - (#batch, maxlen_in, size). - memory_mask (torch.Tensor): Encoded memory mask - (#batch, maxlen_in). - cache (torch.Tensor): cached tensors. - (#batch, maxlen_out - 1, size). - - Returns: - torch.Tensor: Output tensor (#batch, maxlen_out, size). - torch.Tensor: Mask for output tensor (#batch, maxlen_out). - torch.Tensor: Encoded memory (#batch, maxlen_in, size). - torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
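The `normalize_before` / `concat_after` wiring documented above repeats three times inside the removed layer (self-attention, source attention, feed-forward). Reduced to one residual sub-block, its shape looks roughly like the wrapper below, which is a condensed illustration and not a class from the repo:

```python
from torch import nn

class PreNormSublayer(nn.Module):
    """One residual sub-block in the pre-norm (normalize_before=True) convention."""

    def __init__(self, size, sublayer, dropout_rate=0.1, normalize_before=True):
        super().__init__()
        self.norm = nn.LayerNorm(size, eps=1e-5)
        self.sublayer = sublayer                 # e.g. an attention or feed-forward module
        self.dropout = nn.Dropout(dropout_rate)
        self.normalize_before = normalize_before

    def forward(self, x):
        residual = x
        if self.normalize_before:                # pre-norm: norm -> sublayer -> dropout -> add
            x = self.norm(x)
        x = residual + self.dropout(self.sublayer(x))
        if not self.normalize_before:            # post-norm: sublayer -> add -> norm
            x = self.norm(x)
        return x
```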
- - """ - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - - if cache is None: - tgt_q = tgt - tgt_q_mask = tgt_mask - else: - # compute only the last frame query keeping dim: max_time_out -> 1 - assert cache.shape == ( - tgt.shape[0], - tgt.shape[1] - 1, - self.size, - ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" - tgt_q = tgt[:, -1:, :] - residual = residual[:, -1:, :] - tgt_q_mask = tgt_mask[:, -1:, :] - - if self.concat_after: - tgt_concat = torch.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) - x = residual + self.concat_linear1(tgt_concat) - else: - x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - if self.concat_after: - x_concat = torch.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) - x = residual + self.concat_linear2(x_concat) - else: - x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) - if not self.normalize_before: - x = self.norm2(x) - - residual = x - if self.normalize_before: - x = self.norm3(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm3(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, tgt_mask, memory, memory_mask diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/embedding.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/embedding.py deleted file mode 100644 index 611a927864d93c3ad8357f66c780bf537b2a4d67..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/embedding.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Positonal Encoding Module.""" - -import math -from typing import Tuple, Union - -import torch -import torch.nn.functional as F - -class PositionalEncoding(torch.nn.Module): - """Positional encoding. 
- - :param int d_model: embedding dim - :param float dropout_rate: dropout rate - :param int max_len: maximum input length - - PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) - PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) - """ - def __init__(self, - d_model: int, - dropout_rate: float, - max_len: int = 5000, - reverse: bool = False): - """Construct an PositionalEncoding object.""" - super().__init__() - self.d_model = d_model - self.xscale = math.sqrt(self.d_model) - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - - self.pe = torch.zeros(self.max_len, self.d_model) - position = torch.arange(0, self.max_len, - dtype=torch.float32).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = torch.sin(position * div_term) - self.pe[:, 1::2] = torch.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input. Its shape is (batch, time, ...) - offset (int, torch.tensor): position offset - - Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) - torch.Tensor: for compatibility to RelPositionalEncoding - """ - - self.pe = self.pe.to(x.device) - pos_emb = self.position_encoding(offset, x.size(1), False) - x = x * self.xscale + pos_emb - return self.dropout(x), self.dropout(pos_emb) - - def position_encoding(self, offset: Union[int, torch.Tensor], size: int, - apply_dropout: bool = True) -> torch.Tensor: - """ For getting encoding in a streaming fashion - - Attention!!!!! - we apply dropout only once at the whole utterance level in a none - streaming way, but will call this function several times with - increasing input size in a streaming scenario, so the dropout will - be applied several times. - - Args: - offset (int or torch.tensor): start offset - size (int): required size of position encoding - - Returns: - torch.Tensor: Corresponding encoding - """ - # How to subscript a Union type: - # https://github.com/pytorch/pytorch/issues/69434 - if isinstance(offset, int): - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - else: # for batched streaming decoding on GPU - assert torch.max(offset) + size < self.max_len - index = offset.unsqueeze(1) + \ - torch.arange(0, size).to(offset.device) # B X T - flag = index > 0 - # remove negative offset - index = index * flag - pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model - - if apply_dropout: - pos_emb = self.dropout(pos_emb) - return pos_emb - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding module. - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - """ - def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): - """Initialize class.""" - super().__init__(d_model, dropout_rate, max_len, reverse=True) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Compute positional encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). 
- Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Positional embedding tensor (1, time, `*`). - """ - self.pe = self.pe.to(x.device) - x = x * self.xscale - pos_emb = self.position_encoding(offset, x.size(1), False) - return self.dropout(x), self.dropout(pos_emb) - - -class NoPositionalEncoding(torch.nn.Module): - """ No position encoding - """ - def __init__(self, d_model: int, dropout_rate: float): - super().__init__() - self.d_model = d_model - self.dropout = torch.nn.Dropout(p=dropout_rate) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """ Just return zero vector for interface compatibility - """ - pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) - return self.dropout(x), pos_emb - - def position_encoding( - self, offset: Union[int, torch.Tensor], size: int) -> torch.Tensor: - return torch.zeros(1, size, self.d_model) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/encoder.py deleted file mode 100644 index bb2ec65827548bd1242cb3b367cb3983c2de6119..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/encoder.py +++ /dev/null @@ -1,462 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
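The embedding.py deleted above precomputes the standard sinusoidal table its docstring describes, PE(pos, 2i) = sin(pos / 10000^(2i/d_model)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model)), and then slices it per utterance. As a reference alongside the deletion, here is a minimal standalone sketch of that table, assuming only torch; `sinusoidal_table` is an illustrative name, not an API of this repository:

```python
# Minimal sketch (not part of this diff): the sinusoidal table that the removed
# PositionalEncoding class precomputes once and then slices per utterance.
import math
import torch

def sinusoidal_table(max_len: int, d_model: int) -> torch.Tensor:
    """Return a (1, max_len, d_model) tensor with sin on even dims, cos on odd dims."""
    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)   # (max_len, 1)
    div_term = torch.exp(
        torch.arange(0, d_model, 2, dtype=torch.float32)
        * -(math.log(10000.0) / d_model))                                   # (d_model // 2,)
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe.unsqueeze(0)

# e.g. pos_emb = sinusoidal_table(5000, 256)[:, offset:offset + seq_len]
```

The removed RelPositionalEncoding reuses the same table but returns the positional embedding separately instead of adding it to the scaled input.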
-# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder definition.""" -from typing import Tuple - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.convolution import ConvolutionModule -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.encoder_layer import TransformerEncoderLayer -from wenet.transformer.encoder_layer import ConformerEncoderLayer -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class BaseEncoder(torch.nn.Module): - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ - Args: - input_size (int): input dim - output_size (int): dimension of attention - attention_heads (int): the number of heads of multi head attention - linear_units (int): the hidden units number of position-wise feed - forward - num_blocks (int): the number of decoder blocks - dropout_rate (float): dropout rate - attention_dropout_rate (float): dropout rate in attention - positional_dropout_rate (float): dropout rate after adding - positional encoding - input_layer (str): input layer type. - optional [linear, conv2d, conv2d6, conv2d8] - pos_enc_layer_type (str): Encoder positional encoding layer type. - opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] - normalize_before (bool): - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after (bool): whether to concat attention layer's input - and output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - static_chunk_size (int): chunk size for static chunk training and - decoding - use_dynamic_chunk (bool): whether use dynamic chunk size for - training or not, You can only use fixed chunk(chunk_size > 0) - or dyanmic chunk size(use_dynamic_chunk = True) - global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module - use_dynamic_left_chunk (bool): whether use dynamic left chunk in - dynamic chunk training - """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks - - -class TransformerEncoder(BaseEncoder): - """Transformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ Construct TransformerEncoder - - See Encoder for the meaning of each parameter. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - self.encoders = torch.nn.ModuleList([ - TransformerEncoderLayer( - output_size, - MultiHeadedAttention(attention_heads, output_size, - attention_dropout_rate), - PositionwiseFeedForward(output_size, linear_units, - dropout_rate), dropout_rate, - normalize_before, concat_after) for _ in range(num_blocks) - ]) - - -class ConformerEncoder(BaseEncoder): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - positionwise_conv_kernel_size: int = 1, - macaron_style: bool = True, - selfattention_layer_type: str = "rel_selfattn", - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - ): - """Construct ConformerEncoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - positionwise_conv_kernel_size (int): Kernel size of positionwise - conv1d layer. - macaron_style (bool): Whether to use macaron style for - positionwise layer. - selfattention_layer_type (str): Encoder attention layer type, - the parameter has no effect now, it's just for configure - compatibility. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, - cnn_module_norm, causal) - - self.encoders = torch.nn.ModuleList([ - ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(num_blocks) - ]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/encoder_layer.py deleted file mode 100644 index 6b4629a6802a90422fa1494f82f46488f2553c16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/encoder_layer.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple - -import torch -from torch import nn - - -class TransformerEncoderLayer(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: torch.nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): just for interface compatibility - to ConformerEncoderLayer - mask_pad (torch.Tensor): does not used in transformer layer, - just for unified api with conformer. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2), not used here, it's for interface - compatibility to ConformerEncoderLayer. - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). - - """ - residual = x - if self.normalize_before: - x = self.norm1(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, cache=att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - return x, mask, new_att_cache, fake_cnn_cache - - -class ConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/label_smoothing_loss.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/label_smoothing_loss.py deleted file mode 100644 index 428fedcb0eb4345cd1361c97008a9afcd94ac171..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/label_smoothing_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Label smoothing module.""" - -import torch -from torch import nn - - -class LabelSmoothingLoss(nn.Module): - """Label-smoothing loss. - - In a standard CE loss, the label's data distribution is: - [0,1,2] -> - [ - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0], - ] - - In the smoothing version CE Loss,some probabilities - are taken from the true label prob (1.0) and are divided - among other labels. - - e.g. 
- smoothing=0.1 - [0,1,2] -> - [ - [0.9, 0.05, 0.05], - [0.05, 0.9, 0.05], - [0.05, 0.05, 0.9], - ] - - Args: - size (int): the number of class - padding_idx (int): padding class id which will be ignored for loss - smoothing (float): smoothing rate (0.0 means the conventional CE) - normalize_length (bool): - normalize loss by sequence length if True - normalize loss by batch size if False - """ - def __init__(self, - size: int, - padding_idx: int, - smoothing: float, - normalize_length: bool = False): - """Construct an LabelSmoothingLoss object.""" - super(LabelSmoothingLoss, self).__init__() - self.criterion = nn.KLDivLoss(reduction="none") - self.padding_idx = padding_idx - self.confidence = 1.0 - smoothing - self.smoothing = smoothing - self.size = size - self.normalize_length = normalize_length - - def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """Compute loss between x and target. - - The model outputs and data labels tensors are flatten to - (batch*seqlen, class) shape and a mask is applied to the - padding part which should not be calculated for loss. - - Args: - x (torch.Tensor): prediction (batch, seqlen, class) - target (torch.Tensor): - target signal masked with self.padding_id (batch, seqlen) - Returns: - loss (torch.Tensor) : The KL loss, scalar float value - """ - assert x.size(2) == self.size - batch_size = x.size(0) - x = x.view(-1, self.size) - target = target.view(-1) - # use zeros_like instead of torch.no_grad() for true_dist, - # since no_grad() can not be exported by JIT - true_dist = torch.zeros_like(x) - true_dist.fill_(self.smoothing / (self.size - 1)) - ignore = target == self.padding_idx # (B,) - total = len(target) - ignore.sum().item() - target = target.masked_fill(ignore, 0) # avoid -1 index - true_dist.scatter_(1, target.unsqueeze(1), self.confidence) - kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) - denom = total if self.normalize_length else batch_size - return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/positionwise_feed_forward.py deleted file mode 100644 index 73ba239e3f1e68f65650961f2c4ee6758729a06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/positionwise_feed_forward.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. 
- dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU()): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/subsampling.py deleted file mode 100644 index 5f2823eedf0e623188d6af6680fa50ca44b47877..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/subsampling.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch - - -class BaseSubsampling(torch.nn.Module): - def __init__(self): - super().__init__() - self.right_context = 0 - self.subsampling_rate = 1 - - def position_encoding(self, offset: Union[int, torch.Tensor], - size: int) -> torch.Tensor: - return self.pos_enc.position_encoding(offset, size) - - -class LinearNoSubsampling(BaseSubsampling): - """Linear transform the input without subsampling - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an linear object.""" - super().__init__() - self.out = torch.nn.Sequential( - torch.nn.Linear(idim, odim), - torch.nn.LayerNorm(odim, eps=1e-5), - torch.nn.Dropout(dropout_rate), - ) - self.pos_enc = pos_enc_class - self.right_context = 0 - self.subsampling_rate = 1 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Input x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: linear input tensor (#batch, time', odim), - where time' = time . - torch.Tensor: linear input mask (#batch, 1, time'), - where time' = time . - - """ - x = self.out(x) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask - - -class Conv2dSubsampling4(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). 
- - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 4. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 4. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] - - -class Conv2dSubsampling6(BaseSubsampling): - """Convolutional 2D subsampling (to 1/6 length). - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (torch.nn.Module): Custom position encoding layer. - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling6 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 5, 3), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), - odim) - self.pos_enc = pos_enc_class - # 10 = (3 - 1) * 1 + (5 - 1) * 2 - self.subsampling_rate = 6 - self.right_context = 10 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 6. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 6. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] - - -class Conv2dSubsampling8(BaseSubsampling): - """Convolutional 2D subsampling (to 1/8 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling8 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear( - odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) - self.pos_enc = pos_enc_class - self.subsampling_rate = 8 - # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 - self.right_context = 14 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 8. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 8. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/swish.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/swish.py deleted file mode 100644 index b4250f5c93104f38958d145572e363256e03fcb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/transformer/swish.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) -# 2020 Northwestern Polytechnical University (Pengcheng Guo) -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Swish() activation function for Conformer.""" - -import torch - - -class Swish(torch.nn.Module): - """Construct an Swish object.""" - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Return Swish activation function.""" - return x * torch.sigmoid(x) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/checkpoint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/checkpoint.py deleted file mode 100644 index 8e0c413c79c34cd667240357d7ef9eab816a885c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/checkpoint.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import re - -import yaml -import torch -from collections import OrderedDict - -import datetime - - -def load_checkpoint(model: torch.nn.Module, path: str) -> dict: - if torch.cuda.is_available(): - logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) - checkpoint = torch.load(path) - else: - logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) - checkpoint = torch.load(path, map_location='cpu') - model.load_state_dict(checkpoint, strict=False) - info_path = re.sub('.pt$', '.yaml', path) - configs = {} - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - return configs - - -def save_checkpoint(model: torch.nn.Module, path: str, infos=None): - ''' - Args: - infos (dict or None): any info you want to save. - ''' - logging.info('Checkpoint: save to checkpoint %s' % path) - if isinstance(model, torch.nn.DataParallel): - state_dict = model.module.state_dict() - elif isinstance(model, torch.nn.parallel.DistributedDataParallel): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save(state_dict, path) - info_path = re.sub('.pt$', '.yaml', path) - if infos is None: - infos = {} - infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') - with open(info_path, 'w') as fout: - data = yaml.dump(infos) - fout.write(data) - - -def filter_modules(model_state_dict, modules): - new_mods = [] - incorrect_mods = [] - mods_model = model_state_dict.keys() - for mod in modules: - if any(key.startswith(mod) for key in mods_model): - new_mods += [mod] - else: - incorrect_mods += [mod] - if incorrect_mods: - logging.warning( - "module(s) %s don't match or (partially match) " - "available modules in model.", - incorrect_mods, - ) - logging.warning("for information, the existing modules in model are:") - logging.warning("%s", mods_model) - - return new_mods - - -def load_trained_modules(model: torch.nn.Module, args: None): - # Load encoder modules with pre-trained model(s). 
- enc_model_path = args.enc_init - enc_modules = args.enc_init_mods - main_state_dict = model.state_dict() - logging.warning("model(s) found for pre-initialization") - if os.path.isfile(enc_model_path): - logging.info('Checkpoint: loading from checkpoint %s for CPU' % - enc_model_path) - model_state_dict = torch.load(enc_model_path, map_location='cpu') - modules = filter_modules(model_state_dict, enc_modules) - partial_state_dict = OrderedDict() - for key, value in model_state_dict.items(): - if any(key.startswith(m) for m in modules): - partial_state_dict[key] = value - main_state_dict.update(partial_state_dict) - else: - logging.warning("model was not found : %s", enc_model_path) - - model.load_state_dict(main_state_dict) - configs = {} - return configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/cmvn.py deleted file mode 100644 index 3101c619f54991c947124f393f3459c317356a2f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/cmvn.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
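The checkpoint.py removed above pairs each saved `.pt` state dict with a sibling `.yaml` carrying metadata such as the save time. A minimal sketch of that save pattern, assuming torch and PyYAML; `save_checkpoint_sketch` is an illustrative name, not part of this codebase:

```python
# Minimal sketch (illustrative only): the save pattern of the removed checkpoint.py.
# Weights go to <name>.pt, human-readable metadata to a sibling <name>.yaml.
import datetime
import re

import torch
import yaml

def save_checkpoint_sketch(model: torch.nn.Module, path: str, infos: dict = None) -> None:
    # Unwrap DataParallel / DistributedDataParallel before saving, as the removed code does.
    state_dict = model.module.state_dict() if hasattr(model, "module") else model.state_dict()
    torch.save(state_dict, path)
    infos = dict(infos or {})
    infos["save_time"] = datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    with open(re.sub(r"\.pt$", ".yaml", path), "w") as fout:
        fout.write(yaml.dump(infos))
```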
- -import json -import math - -import numpy as np - - -def _load_json_cmvn(json_cmvn_file): - """ Load the json format cmvn stats file and calculate cmvn - - Args: - json_cmvn_file: cmvn stats file in json format - - Returns: - a numpy array of [means, vars] - """ - with open(json_cmvn_file) as f: - cmvn_stats = json.load(f) - - means = cmvn_stats['mean_stat'] - variance = cmvn_stats['var_stat'] - count = cmvn_stats['frame_num'] - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def _load_kaldi_cmvn(kaldi_cmvn_file): - """ Load the kaldi format cmvn stats file and calculate cmvn - - Args: - kaldi_cmvn_file: kaldi text style global cmvn file, which - is generated by: - compute-cmvn-stats --binary=false scp:feats.scp global_cmvn - - Returns: - a numpy array of [means, vars] - """ - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def load_cmvn(cmvn_file, is_json): - if is_json: - cmvn = _load_json_cmvn(cmvn_file) - else: - cmvn = _load_kaldi_cmvn(cmvn_file) - return cmvn[0], cmvn[1] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/common.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/common.py deleted file mode 100644 index 74238d59aefbf227fe6b811703af17550bc7f8f0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/common.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -"""Unility functions for Transformer.""" - -import math -from typing import List, Tuple - -import torch -from torch.nn.utils.rnn import pad_sequence - -IGNORE_ID = -1 - - -def pad_list(xs: List[torch.Tensor], pad_value: int): - """Perform padding for the list of tensors. - - Args: - xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 
-        pad_value (float): Value for padding.
-
-    Returns:
-        Tensor: Padded tensor (B, Tmax, `*`).
-
-    Examples:
-        >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)]
-        >>> x
-        [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
-        >>> pad_list(x, 0)
-        tensor([[1., 1., 1., 1.],
-                [1., 1., 0., 0.],
-                [1., 0., 0., 0.]])
-
-    """
-    n_batch = len(xs)
-    max_len = max([x.size(0) for x in xs])
-    pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device)
-    pad = pad.fill_(pad_value)
-    for i in range(n_batch):
-        pad[i, :xs[i].size(0)] = xs[i]
-
-    return pad
-
-
-def add_blank(ys_pad: torch.Tensor, blank: int,
-              ignore_id: int) -> torch.Tensor:
-    """ Prepad blank for transducer predictor
-
-    Args:
-        ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax)
-        blank (int): index of <blank>
-
-    Returns:
-        ys_in (torch.Tensor) : (B, Lmax + 1)
-
-    Examples:
-        >>> blank = 0
-        >>> ignore_id = -1
-        >>> ys_pad
-        tensor([[ 1, 2, 3, 4, 5],
-                [ 4, 5, 6, -1, -1],
-                [ 7, 8, 9, -1, -1]], dtype=torch.int32)
-        >>> ys_in = add_blank(ys_pad, 0, -1)
-        >>> ys_in
-        tensor([[0, 1, 2, 3, 4, 5],
-                [0, 4, 5, 6, 0, 0],
-                [0, 7, 8, 9, 0, 0]])
-    """
-    bs = ys_pad.size(0)
-    _blank = torch.tensor([blank],
-                          dtype=torch.long,
-                          requires_grad=False,
-                          device=ys_pad.device)
-    _blank = _blank.repeat(bs).unsqueeze(1)  # [bs,1]
-    out = torch.cat([_blank, ys_pad], dim=1)  # [bs, Lmax+1]
-    return torch.where(out == ignore_id, blank, out)
-
-
-def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int,
-                ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]:
-    """Add <sos> and <eos> labels.
-
-    Args:
-        ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax)
-        sos (int): index of <sos>
-        eos (int): index of <eos>
-        ignore_id (int): index of padding
-
-    Returns:
-        ys_in (torch.Tensor) : (B, Lmax + 1)
-        ys_out (torch.Tensor) : (B, Lmax + 1)
-
-    Examples:
-        >>> sos_id = 10
-        >>> eos_id = 11
-        >>> ignore_id = -1
-        >>> ys_pad
-        tensor([[ 1, 2, 3, 4, 5],
-                [ 4, 5, 6, -1, -1],
-                [ 7, 8, 9, -1, -1]], dtype=torch.int32)
-        >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id)
-        >>> ys_in
-        tensor([[10, 1, 2, 3, 4, 5],
-                [10, 4, 5, 6, 11, 11],
-                [10, 7, 8, 9, 11, 11]])
-        >>> ys_out
-        tensor([[ 1, 2, 3, 4, 5, 11],
-                [ 4, 5, 6, 11, -1, -1],
-                [ 7, 8, 9, 11, -1, -1]])
-    """
-    _sos = torch.tensor([sos],
-                        dtype=torch.long,
-                        requires_grad=False,
-                        device=ys_pad.device)
-    _eos = torch.tensor([eos],
-                        dtype=torch.long,
-                        requires_grad=False,
-                        device=ys_pad.device)
-    ys = [y[y != ignore_id] for y in ys_pad]  # parse padded ys
-    ys_in = [torch.cat([_sos, y], dim=0) for y in ys]
-    ys_out = [torch.cat([y, _eos], dim=0) for y in ys]
-    return pad_list(ys_in, eos), pad_list(ys_out, ignore_id)
-
-
-def reverse_pad_list(ys_pad: torch.Tensor,
-                     ys_lens: torch.Tensor,
-                     pad_value: float = -1.0) -> torch.Tensor:
-    """Reverse padding for the list of tensors.
-
-    Args:
-        ys_pad (tensor): The padded tensor (B, Tokenmax).
-        ys_lens (tensor): The lens of token seqs (B)
-        pad_value (int): Value for padding.
-
-    Returns:
-        Tensor: Padded tensor (B, Tokenmax).
-
-    Examples:
-        >>> x
-        tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]])
-        >>> pad_list(x, 0)
-        tensor([[4, 3, 2, 1],
-                [7, 6, 5, 0],
-                [9, 8, 0, 0]])
-
-    """
-    r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0]))
-                             for y, i in zip(ys_pad, ys_lens)], True,
-                            pad_value)
-    return r_ys_pad
-
-
-def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor,
-                ignore_label: int) -> float:
-    """Calculate accuracy.
-
-    Args:
-        pad_outputs (Tensor): Prediction tensors (B * Lmax, D).
- pad_targets (LongTensor): Target label tensors (B, Lmax). - ignore_label (int): Ignore label id. - - Returns: - float: Accuracy value (0.0 - 1.0). - - """ - pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), - pad_outputs.size(1)).argmax(2) - mask = pad_targets != ignore_label - numerator = torch.sum( - pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - denominator = torch.sum(mask) - return float(numerator) / float(denominator) - - -def get_rnn(rnn_type: str) -> torch.nn.Module: - assert rnn_type in ["rnn", "lstm", "gru"] - if rnn_type == "rnn": - return torch.nn.RNN - elif rnn_type == "lstm": - return torch.nn.LSTM - else: - return torch.nn.GRU - - -def get_activation(act): - """Return activation function.""" - # Lazy load to avoid unused import - from wenet.transformer.swish import Swish - - activation_funcs = { - "hardtanh": torch.nn.Hardtanh, - "tanh": torch.nn.Tanh, - "relu": torch.nn.ReLU, - "selu": torch.nn.SELU, - "swish": getattr(torch.nn, "SiLU", Swish), - "gelu": torch.nn.GELU - } - - return activation_funcs[act]() - - -def get_subsample(config): - input_layer = config["encoder_conf"]["input_layer"] - assert input_layer in ["conv2d", "conv2d6", "conv2d8"] - if input_layer == "conv2d": - return 4 - elif input_layer == "conv2d6": - return 6 - elif input_layer == "conv2d8": - return 8 - - -def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - if hyp[cur] != 0: - new_hyp.append(hyp[cur]) - prev = cur - while cur < len(hyp) and hyp[cur] == hyp[prev]: - cur += 1 - return new_hyp - - -def replace_duplicates_with_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - new_hyp.append(hyp[cur]) - prev = cur - cur += 1 - while cur < len(hyp) and hyp[cur] == hyp[prev] and hyp[cur] != 0: - new_hyp.append(0) - cur += 1 - return new_hyp - - -def log_add(args: List[int]) -> float: - """ - Stable log add - """ - if all(a == -float('inf') for a in args): - return -float('inf') - a_max = max(args) - lsp = math.log(sum(math.exp(a - a_max) for a in args)) - return a_max + lsp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/config.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/config.py deleted file mode 100644 index 50170ced44534d3ee6532a2f87fcd78c5148f7e7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 Shaoshang Qi -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import copy - -def override_config(configs, override_list): - new_configs = copy.deepcopy(configs) - for item in override_list: - arr = item.split() - if len(arr) != 2: - print(f"the overrive {item} format not correct, skip it") - continue - keys = arr[0].split('.') - s_configs = new_configs - for i, key in enumerate(keys): - if key not in s_configs: - print(f"the overrive {item} format not correct, skip it") - if i == len(keys) - 1: - param_type = type(s_configs[key]) - if param_type != bool: - s_configs[key] = param_type(arr[1]) - else: - s_configs[key] = arr[1] in ['true', 'True'] - print(f"override {arr[0]} with {arr[1]}") - else: - s_configs = s_configs[key] - return new_configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/ctc_util.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/ctc_util.py deleted file mode 100644 index 73b8fb272ac153dd6d05207f352ebcf1ad14890d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/ctc_util.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch - -def insert_blank(label, blank_id=0): - """Insert blank token between every two label token.""" - label = np.expand_dims(label, 1) - blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id - label = np.concatenate([blanks, label], axis=1) - label = label.reshape(-1) - label = np.append(label, label[0]) - return label - -def forced_align(ctc_probs: torch.Tensor, - y: torch.Tensor, - blank_id=0) -> list: - """ctc forced alignment. 
- - Args: - torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) - torch.Tensor y: id sequence tensor 1d tensor (L) - int blank_id: blank symbol index - Returns: - torch.Tensor: alignment result - """ - y_insert_blank = insert_blank(y, blank_id) - - log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) - log_alpha = log_alpha - float('inf') # log of zero - state_path = (torch.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 - ) # state path - - # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] - - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): - if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ - s] == y_insert_blank[s - 2]: - candidates = torch.tensor( - [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) - prev_state = [s, s - 1] - else: - candidates = torch.tensor([ - log_alpha[t - 1, s], - log_alpha[t - 1, s - 1], - log_alpha[t - 1, s - 2], - ]) - prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] - state_path[t, s] = prev_state[torch.argmax(candidates)] - - state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) - - candidates = torch.tensor([ - log_alpha[-1, len(y_insert_blank) - 1], - log_alpha[-1, len(y_insert_blank) - 2] - ]) - prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] - state_seq[-1] = prev_state[torch.argmax(candidates)] - for t in range(ctc_probs.size(0) - 2, -1, -1): - state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] - - output_alignment = [] - for t in range(0, ctc_probs.size(0)): - output_alignment.append(y_insert_blank[state_seq[t, 0]]) - - return output_alignment diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/executor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/executor.py deleted file mode 100644 index dc0b69e6e32055566a0e8c41945f6979276e5672..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/executor.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -from contextlib import nullcontext - -# if your python version < 3.7 use the below one -# from contextlib import suppress as nullcontext -import torch -from torch.nn.utils import clip_grad_norm_ - - -class Executor: - - def __init__(self): - self.step = 0 - - def train(self, model, optimizer, scheduler, data_loader, device, writer, - args, scaler): - ''' Train one epoch - ''' - model.train() - clip = args.get('grad_clip', 50.0) - log_interval = args.get('log_interval', 10) - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - accum_grad = args.get('accum_grad', 1) - is_distributed = args.get('is_distributed', True) - use_amp = args.get('use_amp', False) - logging.info('using accumulate grad, new batch size is {} times' - ' larger than before'.format(accum_grad)) - if use_amp: - assert scaler is not None - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - model_context = model.join - else: - model_context = nullcontext - num_seen_utts = 0 - with model_context(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - context = None - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if is_distributed and batch_idx % accum_grad != 0: - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - with context(): - # autocast context - # The more details about amp can be found in - # https://pytorch.org/docs/stable/notes/amp_examples.html - with torch.cuda.amp.autocast(scaler is not None): - loss_dict = model(feats, feats_lengths, target, - target_lengths) - loss = loss_dict['loss'] / accum_grad - if use_amp: - scaler.scale(loss).backward() - else: - loss.backward() - - num_seen_utts += num_utts - if batch_idx % accum_grad == 0: - if rank == 0 and writer is not None: - writer.add_scalar('train_loss', loss, self.step) - # Use mixed precision training - if use_amp: - scaler.unscale_(optimizer) - grad_norm = clip_grad_norm_(model.parameters(), clip) - # Must invoke scaler.update() if unscale_() is used in - # the iteration to avoid the following error: - # RuntimeError: unscale_() has already been called - # on this optimizer since the last update(). - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). 
- scaler.step(optimizer) - scaler.update() - else: - grad_norm = clip_grad_norm_(model.parameters(), clip) - if torch.isfinite(grad_norm): - optimizer.step() - optimizer.zero_grad() - scheduler.step() - self.step += 1 - if batch_idx % log_interval == 0: - lr = optimizer.param_groups[0]['lr'] - log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, - loss.item() * accum_grad) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'lr {:.8f} rank {}'.format(lr, rank) - logging.debug(log_str) - - def cv(self, model, data_loader, device, args): - ''' Cross validation on - ''' - model.eval() - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - log_interval = args.get('log_interval', 10) - # in order to avoid division by 0 - num_seen_utts = 1 - total_loss = 0.0 - with torch.no_grad(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - loss_dict = model(feats, feats_lengths, target, target_lengths) - loss = loss_dict['loss'] - if torch.isfinite(loss): - num_seen_utts += num_utts - total_loss += loss.item() * num_utts - if batch_idx % log_interval == 0: - log_str = 'CV Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, loss.item()) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'history loss {:.6f}'.format(total_loss / - num_seen_utts) - log_str += ' rank {}'.format(rank) - logging.debug(log_str) - return total_loss, num_seen_utts diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/file_utils.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/file_utils.py deleted file mode 100644 index 7b7e516cc61f759267f4ef09309ff0b45110a0c1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/file_utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re - - -def read_lists(list_file): - lists = [] - with open(list_file, 'r', encoding='utf8') as fin: - for line in fin: - lists.append(line.strip()) - return lists - - -def read_non_lang_symbols(non_lang_sym_path): - """read non-linguistic symbol from file. - - The file format is like below: - - {NOISE}\n - {BRK}\n - ... - - - Args: - non_lang_sym_path: non-linguistic symbol file path, None means no any - syms. 
-
-    """
-    if non_lang_sym_path is None:
-        return None
-    else:
-        syms = read_lists(non_lang_sym_path)
-        non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})")
-        for sym in syms:
-            if non_lang_syms_pattern.fullmatch(sym) is None:
-                class BadSymbolFormat(Exception):
-                    pass
-                raise BadSymbolFormat(
-                    "Non-linguistic symbols should be "
-                    "formatted in {xxx}/<xxx>/[xxx], consider"
-                    " modify '%s' to meet the requirment. "
-                    "More details can be found in discussions here : "
-                    "https://github.com/wenet-e2e/wenet/pull/819" % (sym))
-        return syms
-
-
-def read_symbol_table(symbol_table_file):
-    symbol_table = {}
-    with open(symbol_table_file, 'r', encoding='utf8') as fin:
-        for line in fin:
-            arr = line.strip().split()
-            assert len(arr) == 2
-            symbol_table[arr[0]] = int(arr[1])
-    return symbol_table
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/init_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/init_model.py
deleted file mode 100644
index 4a008183ee25cd88b2fa25d93bdc3f9e3a55d31a..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/init_model.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Copyright (c) 2022 Binbin Zhang (binbzha@qq.com)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -import torch -from wenet.transducer.joint import TransducerJoint -from wenet.transducer.predictor import (ConvPredictor, EmbeddingPredictor, - RNNPredictor) -from wenet.transducer.transducer import Transducer -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.cmvn import GlobalCMVN -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.encoder import ConformerEncoder, TransformerEncoder -from wenet.squeezeformer.encoder import SqueezeformerEncoder -from wenet.efficient_conformer.encoder import EfficientConformerEncoder -from wenet.utils.cmvn import load_cmvn - - -def init_model(configs): - if configs['cmvn_file'] is not None: - mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) - global_cmvn = GlobalCMVN( - torch.from_numpy(mean).float(), - torch.from_numpy(istd).float()) - else: - global_cmvn = None - - input_dim = configs['input_dim'] - vocab_size = configs['output_dim'] - - encoder_type = configs.get('encoder', 'conformer') - decoder_type = configs.get('decoder', 'bitransformer') - - if encoder_type == 'conformer': - encoder = ConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'squeezeformer': - encoder = SqueezeformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'efficientConformer': - encoder = EfficientConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf'], - **configs['encoder_conf']['efficient_conf'] - if 'efficient_conf' in - configs['encoder_conf'] else {}) - else: - encoder = TransformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - if decoder_type == 'transformer': - decoder = TransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - else: - assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 - assert configs['decoder_conf']['r_num_blocks'] > 0 - decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - ctc = CTC(vocab_size, encoder.output_size()) - - # Init joint CTC/Attention or Transducer model - if 'predictor' in configs: - predictor_type = configs.get('predictor', 'rnn') - if predictor_type == 'rnn': - predictor = RNNPredictor(vocab_size, **configs['predictor_conf']) - elif predictor_type == 'embedding': - predictor = EmbeddingPredictor(vocab_size, - **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - elif predictor_type == 'conv': - predictor = ConvPredictor(vocab_size, **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - else: - raise NotImplementedError( - "only rnn, embedding and conv type support now") - configs['joint_conf']['enc_output_size'] = configs['encoder_conf'][ - 'output_size'] - configs['joint_conf']['pred_output_size'] = configs['predictor_conf'][ - 'output_size'] - joint = TransducerJoint(vocab_size, **configs['joint_conf']) - model = Transducer(vocab_size=vocab_size, - blank=0, - predictor=predictor, - encoder=encoder, - attention_decoder=decoder, - joint=joint, - ctc=ctc, - **configs['model_conf']) - else: - model = ASRModel(vocab_size=vocab_size, - encoder=encoder, - decoder=decoder, - ctc=ctc, - **configs['model_conf']) - return model diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/mask.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/mask.py deleted file mode 100644 index 2985006ab2bc2d27a9b8adaeb863cc44ca6a0d24..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/mask.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -''' -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. - - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - ret = torch.ones(size, size, device=device, dtype=torch.bool) - return torch.tril(ret) -''' - -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. 
- - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - arange = torch.arange(size, device=device) - mask = arange.expand(size, size) - arange = arange.unsqueeze(-1) - mask = mask <= arange - return mask - - -def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size) with chunk size, - this is for streaming encoder - - Args: - size (int): size of mask - chunk_size (int): size of chunk - num_left_chunks (int): number of left chunks - <0: use full chunk - >=0: use num_left_chunks - device (torch.device): "cpu" or "cuda" or torch.Tensor.device - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_chunk_mask(4, 2) - [[1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]] - """ - ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) - ret[i, start:ending] = True - return ret - - -def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, - use_dynamic_chunk: bool, - use_dynamic_left_chunk: bool, - decoding_chunk_size: int, static_chunk_size: int, - num_decoding_left_chunks: int): - """ Apply optional mask for encoder. - - Args: - xs (torch.Tensor): padded input, (B, L, D), L for max length - mask (torch.Tensor): mask for xs, (B, 1, L) - use_dynamic_chunk (bool): whether to use dynamic chunk or not - use_dynamic_left_chunk (bool): whether to use dynamic left chunk for - training. - decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - static_chunk_size (int): chunk size for static chunk training/decoding - if it's greater than 0, if use_dynamic_chunk is true, - this parameter will be ignored - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. - >=0: use num_decoding_left_chunks - <0: use all left chunks - - Returns: - torch.Tensor: chunk mask of the input xs. - """ - # Whether to use chunk mask or not - if use_dynamic_chunk: - max_len = xs.size(1) - if decoding_chunk_size < 0: - chunk_size = max_len - num_left_chunks = -1 - elif decoding_chunk_size > 0: - chunk_size = decoding_chunk_size - num_left_chunks = num_decoding_left_chunks - else: - # chunk size is either [1, 25] or full context(max_len). - # Since we use 4 times subsampling and allow up to 1s(100 frames) - # delay, the maximum frame is 100 / 4 = 25. 
- chunk_size = torch.randint(1, max_len, (1, )).item() - num_left_chunks = -1 - if chunk_size > max_len // 2: - chunk_size = max_len - else: - chunk_size = chunk_size % 25 + 1 - if use_dynamic_left_chunk: - max_left_chunks = (max_len - 1) // chunk_size - num_left_chunks = torch.randint(0, max_left_chunks, - (1, )).item() - chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - elif static_chunk_size > 0: - num_left_chunks = num_decoding_left_chunks - chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - else: - chunk_masks = masks - return chunk_masks - - -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: Mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - batch_size = lengths.size(0) - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = torch.arange(0, - max_len, - dtype=torch.int64, - device=lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand - return mask - - -def make_non_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """Make mask tensor containing indices of non-padded part. - - The sequences in a batch may have different lengths. To enable - batch computing, padding is need to make all sequence in same - size. To avoid the padding part pass value to context dependent - block such as attention or convolution , this padding part is - masked. - - This pad_mask is used in both encoder and decoder. - - 1 for non-padded part and 0 for padded part. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] - """ - return ~make_pad_mask(lengths) - - -def mask_finished_scores(score: torch.Tensor, - flag: torch.Tensor) -> torch.Tensor: - """ - If a sequence is finished, we only allow one alive branch. This function - aims to give one branch a zero score and the rest -inf score. - - Args: - score (torch.Tensor): A real value array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size, beam_size). 
-    """
-    beam_size = score.size(-1)
-    zero_mask = torch.zeros_like(flag, dtype=torch.bool)
-    if beam_size > 1:
-        unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])),
-                               dim=1)
-        finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])),
-                             dim=1)
-    else:
-        unfinished = zero_mask
-        finished = flag
-    score.masked_fill_(unfinished, -float('inf'))
-    score.masked_fill_(finished, 0)
-    return score
-
-
-def mask_finished_preds(pred: torch.Tensor, flag: torch.Tensor,
-                        eos: int) -> torch.Tensor:
-    """
-    If a sequence is finished, all of its branch should be <eos>
-
-    Args:
-        pred (torch.Tensor): A int array with shape
-            (batch_size * beam_size, beam_size).
-        flag (torch.Tensor): A bool array with shape
-            (batch_size * beam_size, 1).
-
-    Returns:
-        torch.Tensor: (batch_size * beam_size).
-    """
-    beam_size = pred.size(-1)
-    finished = flag.repeat([1, beam_size])
-    return pred.masked_fill_(finished, eos)
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/scheduler.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/scheduler.py
deleted file mode 100644
index c418a731dec0041a238787bbba23102dba8db5e5..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/gigaspeech/s0/wenet/utils/scheduler.py
+++ /dev/null
@@ -1,670 +0,0 @@
-# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
-# 2022 Ximalaya Inc (Yuguang Yang)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Modified from ESPnet(https://github.com/espnet/espnet)
-# NeMo(https://github.com/NVIDIA/NeMo)
-
-from typing import Union
-
-import math
-import warnings
-import torch
-from torch.optim.lr_scheduler import _LRScheduler
-
-from typeguard import check_argument_types
-
-
-class WarmupLR(_LRScheduler):
-    """The WarmupLR scheduler
-
-    This scheduler is almost same as NoamLR Scheduler except for following
-    difference:
-
-    NoamLR:
-        lr = optimizer.lr * model_size ** -0.5
-             * min(step ** -0.5, step * warmup_step ** -1.5)
-    WarmupLR:
-        lr = optimizer.lr * warmup_step ** 0.5
-             * min(step ** -0.5, step * warmup_step ** -1.5)
-
-    Note that the maximum lr equals to optimizer.lr in this scheduler.
- - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_steps: Union[int, float] = 25000, - last_epoch: int = -1, - ): - assert check_argument_types() - self.warmup_steps = warmup_steps - - # __init__() must be invoked before setting field - # because step() is also invoked in __init__() - super().__init__(optimizer, last_epoch) - - def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" - - def get_lr(self): - step_num = self.last_epoch + 1 - if self.warmup_steps == 0: - return [ - lr * step_num ** -0.5 - for lr in self.base_lrs - ] - else: - return [ - lr - * self.warmup_steps ** 0.5 - * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) - for lr in self.base_lrs - ] - - def set_step(self, step: int): - self.last_epoch = step - - -class WarmupPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1): - assert not (warmup_steps is not None and warmup_ratio is not None),\ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class SquareRootConstantPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use particular number of step or ratio" - assert constant_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. 
- self.max_steps = max_steps - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.constant_lr = 1 / (constant_steps ** 0.5) - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.constant_steps: - return [self.constant_lr for _ in self.base_lrs] - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class WarmupHoldPolicy(WarmupPolicy): - """Variant of WarmupPolicy which maintains high - learning rate for a defined number of steps. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - hold_steps=None, - hold_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (hold_steps is not None and hold_ratio is not None), \ - "Either use particular number of step or ratio" - assert hold_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - self.min_lr = min_lr - self._last_warmup_lr = 0.0 - - # Necessary to duplicate as class attributes are hidden in inner class - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if hold_steps is not None: - self.hold_steps = hold_steps + self.warmup_steps - elif hold_ratio is not None: - self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps - else: - self.hold_steps = 0 - - super().__init__( - optimizer, - warmup_steps=warmup_steps, - warmup_ratio=warmup_ratio, - max_steps=max_steps, - last_epoch=last_epoch, - min_lr=min_lr, - ) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed by the scheduler," - " " "please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup phase - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - # Hold phase - if (step >= self.warmup_steps) and (step < self.hold_steps): - return self.base_lrs - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - -class WarmupAnnealHoldPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - min_lr: Minimum lr to hold the learning rate after decay at. - constant_steps: Number of steps to keep lr constant at. 
- constant_ratio: Ratio of steps to keep lr constant. - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - constant_steps=None, - constant_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use constant_steps or constant_ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup steps - if self.warmup_steps > 0 and step <= self.warmup_steps: - return self._get_warmup_lr(step) - - # Constant steps after warmup and decay - if self.constant_steps > 0 and ( - self.warmup_steps + self.decay_steps) < step <= self.max_steps: - return self._get_constant_lr(step) - - # Min lr after max steps of updates - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_constant_lr(self, step): - return [self.min_lr for _ in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -def _squareroot_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 0.5 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _square_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 2 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _cosine_annealing(initial_lr, step, max_steps, min_lr): - mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) - out_lr = (initial_lr - min_lr) * mult + min_lr - return out_lr - - -def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, - decay_steps, min_lr): - assert max_lr > min_lr - # Use linear warmup for the initial part. - if warmup_steps > 0 and step <= warmup_steps: - return max_lr * float(step) / float(warmup_steps) - - # For any steps larger than `decay_steps`, use `min_lr`. - if step > warmup_steps + decay_steps: - return min_lr - - # If we are done with the warmup period, use the decay style. 
- num_steps_ = step - warmup_steps - decay_steps_ = decay_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = max_lr - min_lr - - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - - return min_lr + coeff * delta_lr - - -def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): - if cycle: - multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) - decay_steps *= multiplier - else: - step = min(step, decay_steps) - p = step / decay_steps - lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) - lr += min_lr - return lr - - -def _noam_hold_annealing(initial_lr, step, warmup_steps, - hold_steps, decay_rate, min_lr): - # hold_steps = total number of steps - # to hold the LR, not the warmup + hold steps. - T_warmup_decay = max(1, warmup_steps ** decay_rate) - T_hold_decay = max(1, (step - hold_steps) ** decay_rate) - lr = (initial_lr * T_warmup_decay) / T_hold_decay - lr = max(lr, min_lr) - return lr - - -class SquareAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _square_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class SquareRootAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _squareroot_annealing(initial_lr=initial_lr, step=step, - max_steps=self.max_steps, min_lr=self.min_lr) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - if self.constant_steps is None or self.constant_steps == 0: - new_lrs = [ - _cosine_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - else: - new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) - return new_lrs - - def _get_warmup_lr(self, step): - if self.constant_steps is None or self.constant_steps == 0: - return super()._get_warmup_lr(step) - else: - # Use linear warmup for the initial part. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_constant_lr(self, step): - # Only called when `constant_steps` > 0. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_linear_warmup_with_cosine_annealing_lr(self, step): - # Cosine Schedule for Megatron LM, - # slightly different warmup schedule + constant LR at the end. 
- new_lrs = [ - _linear_warmup_with_cosine_annealing( - max_lr=self.base_lrs[0], - warmup_steps=self.warmup_steps, - step=step, - decay_steps=self.decay_steps, - min_lr=self.min_lr, - ) - for _ in self.base_lrs - ] - return new_lrs - - -class NoamAnnealing(_LRScheduler): - def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - self._normalize = d_model ** (-0.5) - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = max(1, self.last_epoch) - - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - new_lrs = [self._noam_annealing(initial_lr=initial_lr, step=step) for - initial_lr in self.base_lrs] - return new_lrs - - def _noam_annealing(self, initial_lr, step): - if self.warmup_steps > 0: - mult = self._normalize * min(step ** (-0.5), - step * (self.warmup_steps ** (-1.5))) - else: - mult = self._normalize * step ** (-0.5) - - out_lr = initial_lr * mult - if step > self.warmup_steps: - out_lr = max(out_lr, self.min_lr) - return out_lr - - -class NoamHoldAnnealing(WarmupHoldPolicy): - def __init__(self, optimizer, *, max_steps, decay_rate=0.5, min_lr=0.0, - last_epoch=-1, **kwargs): - """ - From Nemo: - Implementation of the Noam Hold Annealing policy - from the SqueezeFormer paper. - - Unlike NoamAnnealing, the peak learning rate - can be explicitly set for this scheduler. - The schedule first performs linear warmup, - then holds the peak LR, then decays with some schedule for - the remainder of the steps. - Therefore the min-lr is still dependent - on the hyper parameters selected. - - It's schedule is determined by three factors- - - Warmup Steps: Initial stage, where linear warmup - occurs uptil the peak LR is reached. Unlike NoamAnnealing, - the peak LR is explicitly stated here instead of a scaling factor. - - Hold Steps: Intermediate stage, where the peak LR - is maintained for some number of steps. In this region, - the high peak LR allows the model to converge faster - if training is stable. However the high LR - may also cause instability during training. - Should usually be a significant fraction of training - steps (around 30-40% of the entire training steps). - - Decay Steps: Final stage, where the LR rapidly decays - with some scaling rate (set by decay rate). - To attain Noam decay, use 0.5, - for Squeezeformer recommended decay, use 1.0. - The fast decay after prolonged high LR during - hold phase allows for rapid convergence. 
- - References: - - [Squeezeformer: - An Efficient Transformer for Automatic Speech Recognition] - (https://arxiv.org/abs/2206.00888) - - Args: - optimizer: Pytorch compatible Optimizer object. - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - decay_rate: Float value describing the polynomial decay - after the hold period. Default value - of 0.5 corresponds to Noam decay. - min_lr: Minimum learning rate. - """ - self.decay_rate = decay_rate - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - if self.warmup_steps is None or self.warmup_steps == 0: - raise ValueError( - "Noam scheduler cannot be used without warmup steps") - - if self.hold_steps > 0: - hold_steps = self.hold_steps - self.warmup_steps - else: - hold_steps = 0 - - new_lrs = [ - _noam_hold_annealing( - initial_lr, - step=step, - warmup_steps=self.warmup_steps, - hold_steps=hold_steps, - decay_rate=self.decay_rate, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - def set_step(self, step: int): - self.last_epoch = step diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/README.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/README.md deleted file mode 100644 index 2d83cec06ec0778509eeeae85cb2a8b0c819b698..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# Performance Record - -## Conformer Result (Old IO) - -* Feature info: using fbank feature, with cmvn, with speed perturb. -* Training info: lr 0.002, batch size 16, 1 machines, 1*4 = 4 gpu, acc_grad 4, 240 epochs, dither 0.1 -* Decoding info: ctc_weight 0.5, average_num 30 - -| decoding mode | | -|--------------------------|-------| -| attention decoder | 21.9 | -| ctc greedy search | 21.15 | -| ctc prefix beam search | 21.13 | -| attention rescoring | 20.47 | - -## Conformer Result (New IO) - -* Feature info: using fbank feature, with cmvn, with speed perturb. 
-* Training info: lr 0.002, batch size 16, 1 machines, 1*4 = 4 gpu, acc_grad 4, 133 epochs, dither 0.1 -* Decoding info: ctc_weight 0.5, average_num 30 - -| decoding mode | | -|--------------------------|-------| -| attention decoder | 21.42 | -| ctc greedy search | 21.16 | -| ctc prefix beam search | 21.18 | -| attention rescoring | 20.42 | diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/conf/train_960_unigram5000.model b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/conf/train_960_unigram5000.model deleted file mode 100644 index 8419aa7bac81d9b02f9644e9cf8929b73765a3af..0000000000000000000000000000000000000000 Binary files a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/conf/train_960_unigram5000.model and /dev/null differ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/conf/train_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/conf/train_conformer.yaml deleted file mode 100644 index 81c8571e2798f80d564f0650ce94266193cd8a72..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/conf/train_conformer.yaml +++ /dev/null @@ -1,79 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -# feature extraction -dataset_conf: - filter_conf: - max_length: 2000 - min_length: 50 - token_max_length: 400 - token_min_length: 1 - max_output_input_ratio: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 16 - -grad_clip: 5 -accum_grad: 4 -max_epoch: 240 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.002 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/local/hkust_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/local/hkust_data_prep.sh deleted file mode 100644 index cebdef1daf476c6e602b5cbe9a11fdb00521aced..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/local/hkust_data_prep.sh +++ /dev/null @@ -1,149 +0,0 @@ -#!/usr/bin/env bash - -. 
./path.sh || exit 1; - -if [ $# != 2 ]; then - echo "Usage: $0 " - echo " $0 /export/corpora/LDC03S04 /export/corpora/LDC03T19" - exit 1; -fi - -hkust_audio_dir=$1 -hkust_text_dir=$2 - -train_dir=data/local/train -dev_dir=data/local/dev -train_dev=train_dev -train_nodev=train_nodev - -nj=16 - -mkdir -p $train_dir -mkdir -p $dev_dir - -#data directory check -if [ ! -d $hkust_audio_dir ] || [ ! -d $hkust_text_dir ]; then - echo "Error: $0 requires two directory arguments" - exit 1; -fi - -#find sph audio file for train dev resp. -find $hkust_audio_dir -iname "*.sph" | grep -i "audio/train" > $train_dir/sph.flist || exit 1; -find $hkust_audio_dir -iname "*.sph" | grep -i "audio/dev" > $dev_dir/sph.flist || exit 1; - -n=`cat $train_dir/sph.flist $dev_dir/sph.flist | wc -l` -[ $n -ne 897 ] && \ - echo Warning: expected 897 data data files, found $n - -#Transcriptions preparation - -#collect all trans, convert encodings to utf-8, -find $hkust_text_dir -iname "*.txt" | grep -i "trans/train" | xargs cat |\ - iconv -f GBK -t UTF-8 | perl -e ' - while () { - @A = split(" ", $_); - if (@A <= 1) { next; } - if ($A[0] eq "#") { $utt_id = $A[1]; } - if (@A >= 3) { - $A[2] =~ s:^([AB])\:$:$1:; - printf "%s-%s-%06.0f-%06.0f", $utt_id, $A[2], 100*$A[0] + 0.5, 100*$A[1] + 0.5; - for($n = 3; $n < @A; $n++) { print " $A[$n]" }; - print "\n"; - } - } - ' | sort -k1 > $train_dir/transcripts.txt || exit 1; - -find $hkust_text_dir -iname "*.txt" | grep -i "trans/dev" | xargs cat |\ - iconv -f GBK -t UTF-8 | perl -e ' - while () { - @A = split(" ", $_); - if (@A <= 1) { next; } - if ($A[0] eq "#") { $utt_id = $A[1]; } - if (@A >= 3) { - $A[2] =~ s:^([AB])\:$:$1:; - printf "%s-%s-%06.0f-%06.0f", $utt_id, $A[2], 100*$A[0] + 0.5, 100*$A[1] + 0.5; - for($n = 3; $n < @A; $n++) { print " $A[$n]" }; - print "\n"; - } - } - ' | sort -k1 > $dev_dir/transcripts.txt || exit 1; - -#transcripts normalization and segmentation -cat $train_dir/transcripts.txt |\ - sed -e 's// /g' |\ - sed -e 's/<\/foreign>/ /g' |\ - sed -e 's/\(.\+\)<\/noise>/\1/g' |\ - sed -e 's/<\/noise>//g' |\ - sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\ - sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\ - awk '{if (NF > 1) print $0;}' |\ - local/hkust_normalize.pl |\ - awk '{if (NF > 0) print $0;}' > $train_dir/text || exit 1; - -cat $dev_dir/transcripts.txt |\ - sed -e 's// /g' |\ - sed -e 's/<\/foreign>/ /g' |\ - sed -e 's/\(.\+\)<\/noise>/\1/g' |\ - sed -e 's/<\/noise>//g' |\ - sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\ - sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\ - awk '{if (NF > 1) print $0;}' |\ - local/hkust_normalize.pl |\ - awk '{if (NF > 0) print $0;}' > $dev_dir/text || exit 1; - -# some data is corrupted. 
Delete them -cat $train_dir/text | grep -v 20040527_210939_A901153_B901154-A-035691-035691 | egrep -v "A:|B:" > tmp -mv tmp $train_dir/text || exit 1; - -#Make segment files from transcript -#segments file format is: utt-id side-id start-time end-time, e.g.: -#sw02001-A_000098-001156 sw02001-A 0.98 11.56 - -awk '{ segment=$1; split(segment,S,"-"); side=S[2]; audioname=S[1];startf=S[3];endf=S[4]; - print segment " " audioname "-" side " " startf/100 " " endf/100}' <$train_dir/text > $train_dir/segments -awk '{name = $0; gsub(".sph$","",name); gsub(".*/","",name); print(name " " $0)}' $train_dir/sph.flist > $train_dir/sph.scp - -awk '{ segment=$1; split(segment,S,"-"); side=S[2]; audioname=S[1];startf=S[3];endf=S[4]; - print segment " " audioname "-" side " " startf/100 " " endf/100}' <$dev_dir/text > $dev_dir/segments -awk '{name = $0; gsub(".sph$","",name); gsub(".*/","",name); print(name " " $0)}' $dev_dir/sph.flist > $dev_dir/sph.scp - -bash tools/sph2wav.sh --nj ${nj} $train_dir/sph.scp $train_dir/segments $train_dir/wav.scp -bash tools/sph2wav.sh --nj ${nj} $dev_dir/sph.scp $dev_dir/segments $dev_dir/wav.scp - -#side A - channel 1, side B - channel 2 - -# this file reco2file_and_channel maps recording-id (e.g. sw02001-A) -# to the file name sw02001 and the A, e.g. -# sw02001-A sw02001 A -# In this case it's trivial, but in other corpora the information might -# be less obvious. Later it will be needed for ctm scoring. -cat $train_dir/wav_ori.scp | awk '{print $1}' | \ - perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; print "$1-$2 $1 $2\n"; ' \ - > $train_dir/reco2file_and_channel || exit 1; -cat $dev_dir/wav_ori.scp | awk '{print $1}' | \ - perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; print "$1-$2 $1 $2\n"; ' \ - > $dev_dir/reco2file_and_channel || exit 1; - - -cat $train_dir/segments | awk '{spk=substr($1,1,33); print $1 " " spk}' > $train_dir/utt2spk || exit 1; -cat $train_dir/utt2spk | sort -k 2 | tools/utt2spk_to_spk2utt.pl > $train_dir/spk2utt || exit 1; - -cat $dev_dir/segments | awk '{spk=substr($1,1,33); print $1 " " spk}' > $dev_dir/utt2spk || exit 1; -cat $dev_dir/utt2spk | sort -k 2 | tools/utt2spk_to_spk2utt.pl > $dev_dir/spk2utt || exit 1; - -mkdir -p data/train data/dev - -for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do - cp data/local/train/$f data/train/$f || exit 1; -done - -for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do - cp data/local/dev/$f data/dev/$f || exit 1; -done - -tools/subset_data_dir.sh --first data/train 4001 data/${train_dev} -n=$(($(wc -l < data/train/segments) - 4001)) -tools/subset_data_dir.sh --last data/train ${n} data/${train_nodev} - -echo "$0: HKUST data preparation succeeded" -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/local/hkust_normalize.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/local/hkust_normalize.pl deleted file mode 100644 index ff2d3eaad6b1d5ca72c0e43ebf251dfcb4c953d8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/local/hkust_normalize.pl +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright Chao Weng - -# normalizations for hkust trascript -# see the docs/trans-guidelines.pdf for details - -while () { - @A = split(" ", $_); - print "$A[0] "; - for ($n = 1; $n < @A; $n++) { - $a = $A[$n]; - if (($a eq "{breath}")||($a eq "{cough}")||($a eq 
"{sneeze}") - || ($a eq "{lipsmack}")) {next;} - if (($a eq "{laugh}")) {next;} - if (($a eq "")) {next;} - $tmp = $a; - if ($tmp =~ /[^.,?+-]{0,}[.,?+-]+/) { $tmp =~ s:([^.,?+-]{0,})[.,?+-]+:$1:g; } - if ($tmp =~ /\~[A-Z]/) { $tmp =~ s:\~([A-Z]):$1:; } - if ($tmp =~ /%\S/) { $tmp =~ s:%(\S):$1:; } - if ($tmp =~ /[a-zA-Z]/) {$tmp=uc($tmp);} - print "$tmp "; - } - print "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/path.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/path.sh deleted file mode 100644 index 73fc1c56602086182f66201870e28d46a0cada55..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export WENET_DIR=$PWD/../../.. -export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build -export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix -export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/run.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/run.sh deleted file mode 100644 index 612bddeaf693ba4fc7a9897d4835d3844e711404..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/run.sh +++ /dev/null @@ -1,247 +0,0 @@ -#!/bin/bash - -. ./path.sh || exit 1; - -# Use this to control how many gpu you use, It's 1-gpu training if you specify -# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0,1,2,3" -stage=4 # start from 0 if you need to start from data preparation -stop_stage=4 - -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training -num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 - -nj=16 -feat_dir=raw_wav - -data_type=raw -num_utts_per_shard=1000 -prefetch=100 - -train_set=train_nodev -dev_set=train_dev - -# Optional train_config -# 1. conf/train_transformer.yaml: Standard transformer -# 2. conf/train_conformer.yaml: Standard conformer -# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer -# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer -train_config=conf/train_conformer.yaml -# English modeling unit -# Optional 1. bpe 2. char -en_modeling_unit=bpe -dict=data/dict_$en_modeling_unit/lang_char.txt -cmvn=true -debug=false -num_workers=2 -dir=exp/conformer -checkpoint= - -# use average_checkpoint will get better result -average_checkpoint=true -decode_checkpoint=$dir/final.pt -average_num=30 -decode_modes="ctc_greedy_search ctc_prefix_beam_search - attention attention_rescoring" - -. 
tools/parse_options.sh || exit 1; - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Data preparation - local/hkust_data_prep.sh /mnt/cfs/database/hkust/LDC2005S15/ \ - /mnt/cfs/database/hkust/LDC2005T32/ || exit 1; -fi - - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # For wav feature, just copy the data. Fbank extraction is done in training - mkdir -p ${feat_dir}_${en_modeling_unit} - for x in ${train_set} ${dev_set}; do - cp -r data/$x ${feat_dir}_${en_modeling_unit} - done - - cp -r data/dev ${feat_dir}_${en_modeling_unit}/test - - tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \ - --in_scp data/${train_set}/wav.scp \ - --out_cmvn ${feat_dir}_${en_modeling_unit}/$train_set/global_cmvn - -fi - -# This bpe model is trained on librispeech training data set. -bpecode=conf/train_960_unigram5000.model -trans_type_ops= -bpe_ops= -if [ $en_modeling_unit = "bpe" ]; then - trans_type_ops="--trans_type cn_char_en_bpe" - bpe_ops="--bpecode ${bpecode}" -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # Make train dict - echo "Make a dictionary" - mkdir -p $(dirname $dict) - echo " 0" > ${dict} # 0 will be used for "blank" in CTC - echo " 1" >> ${dict} # must be 1 - - paste -d " " \ - <(cut -f 1 -d" " ${feat_dir}_${en_modeling_unit}/${train_set}/text) \ - <(cut -f 2- -d" " ${feat_dir}_${en_modeling_unit}/${train_set}/text \ - | tr 'a-z' 'A-Z' | sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' \ - | sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' | tr -d " " ) \ - > ${feat_dir}_${en_modeling_unit}/${train_set}/text4dict - sed -i 's/\xEF\xBB\xBF//' \ - ${feat_dir}_${en_modeling_unit}/${train_set}/text4dict - - tools/text2token.py -s 1 -n 1 -m ${bpecode} \ - ${feat_dir}_${en_modeling_unit}/${train_set}/text4dict ${trans_type_ops} \ - | cut -f 2- -d" " | tr " " "\n" \ - | sort | uniq | grep -a -v -e '^\s*$' \ - | grep -v '·' | grep -v '“' | grep -v "”" | grep -v "\[" | grep -v "\]" \ - | grep -v "…" \ - | awk '{print $0 " " NR+1}' >> ${dict} - - num_token=$(cat $dict | wc -l) - echo " $num_token" >> $dict # -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # Prepare wenet required data - echo "Prepare data, prepare required format" - for x in ${dev_set} ${train_set} test; do - if [ $data_type == "shard" ]; then - tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \ - --num_threads 16 ${feat_dir}_${en_modeling_unit}/$x/wav.scp \ - ${feat_dir}_${en_modeling_unit}/$x/text \ - $(realpath ${feat_dir}_${en_modeling_unit}/$x/shards) \ - ${feat_dir}_${en_modeling_unit}/$x/data.list - else - tools/make_raw_list.py ${feat_dir}_${en_modeling_unit}/$x/wav.scp \ - ${feat_dir}_${en_modeling_unit}/$x/text \ - ${feat_dir}_${en_modeling_unit}/$x/data.list - fi - done -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - # Training - mkdir -p $dir - INIT_FILE=$dir/ddp_init - # You had better rm it manually before you start run.sh on first node. - # rm -f $INIT_FILE # delete old one before starting - init_method=file://$(readlink -f $INIT_FILE) - echo "$0: init method is $init_method" - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - # Use "nccl" if it works, otherwise use "gloo" - dist_backend="gloo" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. 
- # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" - cmvn_opts= - $cmvn && cp ${feat_dir}_${en_modeling_unit}/$train_set/global_cmvn $dir - $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" - # train.py will write $train_config to $dir/train.yaml with model input - # and output dimension, train.yaml will be used for inference or model - # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. - rank=`expr $node_rank \* $num_gpus + $i` - - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type $data_type \ - --symbol_table $dict \ - --prefetch $prefetch \ - --train_data ${feat_dir}_${en_modeling_unit}/$train_set/data.list \ - --cv_data ${feat_dir}_${en_modeling_unit}/$dev_set/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ - --ddp.dist_backend $dist_backend \ - --num_workers 1 \ - $cmvn_opts \ - --pin_memory \ - --bpe_model ${bpecode} - } & - done - wait -fi - -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # Test model, please specify the model you want to test by --checkpoint - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$dir/avg_${average_num}.pt - echo "do model average and final checkpoint is $decode_checkpoint" - python wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $dir \ - --num ${average_num} \ - --val_best - fi - # Specify decoding_chunk_size if it's a unified dynamic chunk trained model - # -1 for full chunk - decoding_chunk_size=-1 - ctc_weight=0.5 - idx=0 - for mode in ${decode_modes}; do - { - test_dir="$dir/"` - `"test_${mode}${decoding_chunk_size:+_chunk$decoding_chunk_size}/test" - mkdir -p $test_dir - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1]) - python wenet/bin/recognize.py --gpu $gpu_id \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type $data_type \ - --test_data ${feat_dir}_${en_modeling_unit}/test/data.list \ - --checkpoint $decode_checkpoint \ - --beam_size 10 \ - --batch_size 1 \ - --penalty 0.0 \ - --dict $dict \ - --ctc_weight $ctc_weight \ - --result_file $test_dir/text_${en_modeling_unit} \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - if [ $en_modeling_unit == "bpe" ]; then - tools/spm_decode --model=${bpecode} --input_format=piece \ - < $test_dir/text_${en_modeling_unit} | sed -e "s/▁/ /g" > $test_dir/text - else - cat $test_dir/text_${en_modeling_unit} \ - | sed -e "s/▁/ /g" > $test_dir/text - fi - # Cer used to be consistent with kaldi & espnet - python tools/compute-cer.py --char=1 --v=1 \ - ${feat_dir}_${en_modeling_unit}/test/text $test_dir/text > $test_dir/wer - } & - ((idx+=1)) - done - wait -fi - -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # Export the best model you want - python wenet/bin/export_jit.py \ - --config $dir/train.yaml \ - --checkpoint $dir/avg_${average_num}.pt \ - --output_file $dir/final.zip \ - --output_quant_file $dir/final_quant.zip -fi - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/alignment.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/alignment.sh deleted file mode 100644 index 
64d860bb61761cadca750c9baf91eddb49e56728..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/alignment.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -stage=0 # start from 0 if you need to start from data preparation -stop_stage=0 - -nj=16 -feat_dir=raw_wav -dict=data/dict/lang_char.txt - -dir=exp/ -config=$dir/train.yaml -checkpoint= -checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt -config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml -set= -ali_format=$feat_dir/$set/format.data -ali_format=format.data -ali_result=$dir/ali - -. tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - nj=32 - # Prepare required data for ctc alignment - echo "Prepare data, prepare required format" - for x in $set; do - tools/format_data.sh --nj ${nj} \ - --feat-type wav --feat $feat_dir/$x/wav.scp \ - $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp - - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Test model, please specify the model you want to use by --checkpoint - python wenet/bin/alignment_deprecated.py --gpu -1 \ - --config $config \ - --input_file $ali_format \ - --checkpoint $checkpoint \ - --batch_size 1 \ - --dict $dict \ - --result_file $ali_result \ - -fi - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/analyze_dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/analyze_dataset.py deleted file mode 100644 index d4373b065c301972fe0164b6df3591166000acfc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/analyze_dataset.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Analyze Dataset, Duration/TextLength/Speed etc. - -Usage: -. 
./path.sh && python3 tools/analyze_dataset.py \ - --data_type "shard" \ - --data_list data/test/data.list \ - --output_dir exp/analyze_test \ - --num_thread 32 -""" - -import os -import json -import math -import time -import numpy -import logging -import librosa -import tarfile -import argparse -import torchaudio -import multiprocessing - -from wenet.utils.file_utils import read_lists -from wenet.dataset.processor import AUDIO_FORMAT_SETS - - -def get_args(): - parser = argparse.ArgumentParser(description='Analyze dataset') - parser.add_argument('--data_type', - default='wav_scp', - choices=['wav_scp', 'raw', 'shard'], - help='dataset type') - parser.add_argument('--output_dir', type=str, - default="exp", help='write info to output dir') - parser.add_argument('--data_list', default=None, - help='used in raw/shard mode') - parser.add_argument('--wav_scp', default=None, - help='used in wav_scp mode') - parser.add_argument('--text', default=None, - help='used in wav_scp mode') - parser.add_argument('--num_thread', type=int, - default=4, help='number of threads') - args = parser.parse_args() - print(args) - return args - - -def analyze(datas, output_file, thread_id): - with open(output_file, "w", encoding='utf8') as f: - for i, data in enumerate(datas): - if type(data['wav']) is numpy.ndarray: - y, sample_rate = data['wav'], data['sample_rate'] - data['wav'] = "None" # NOTE(xcsong): Do not save wav. - elif type(data['wav'] is str): - y, sample_rate = librosa.load(data['wav'], sr=16000) - data['dur'] = len(y) / sample_rate - data['txt_length'] = len(data['txt']) - data['speed'] = data['txt_length'] / data['dur'] - # Trim the beginning and ending silence - _, index = librosa.effects.trim(y, top_db=30) - data['leading_sil'] = librosa.get_duration( - y=y[:index[0]], sr=16000) * 1000 if index[0] > 0 else 0 - data['trailing_sil'] = librosa.get_duration( - y=y[index[1]:], sr=16000) * 1000 if index[1] < len(y) else 0 - data_str = json.dumps(data, ensure_ascii=False) - f.write("{}\n".format(data_str)) - if thread_id == 0 and i % 100 == 0: - logging.info("\tThread-{}: processed {}/{}".format( - thread_id, i, len(datas))) - - -def read_tar(file): - try: - with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream: - prev_prefix = None - data = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - data['key'] = prev_prefix - if valid: - yield data - data = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - data['txt'] = file_obj.read().decode( - 'utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load( - file_obj) - # single channel - data['wav'] = waveform.numpy()[0, :] - data['sample_rate'] = sample_rate - else: - data[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning( - 'error: {} when parse {}'.format(ex, name)) - prev_prefix = prefix - # The last data in tar - if prev_prefix is not None: - data['key'] = prev_prefix - yield data - except Exception as ex: - logging.warning( - 'tar_file error: {} when processing {}'.format(ex, file)) - - -def main(): - start_time = time.time() - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.makedirs(args.output_dir, exist_ok=True) - os.makedirs(args.output_dir + "/partition", exist_ok=True) - datas = [[] for i in 
range(args.num_thread)] - - logging.info("Stage-1: Loading data.list OR wav.scp...") - if args.data_type == "shard": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - total = 0 - for line in lists: - for data in read_tar(line): - datas[total % args.num_thread].append(data) - total = total + 1 - elif args.data_type == "raw": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - for i, line in enumerate(lists): - data = json.loads(line) - datas[i % args.num_thread].append(data) - elif args.data_type == "wav_scp": - assert args.wav_scp is not None - assert args.text is not None - wavs, texts = {}, {} - # wavs - for line in read_lists(args.wav_scp): - line = line.strip().split() - wavs[line[0]] = line[1] - # texts - for line in read_lists(args.text): - line = line.strip().split(maxsplit=1) - texts[line[0]] = line[1] - sorted(wavs) - sorted(texts) - # partition - for i, (key1, key2) in enumerate(zip(wavs, texts)): - assert key1 == key2 - datas[i % args.num_thread].append( - {'key': key1, "wav": wavs[key1], "txt": texts[key1]} - ) - - logging.info("Stage-2: Start Analyze") - # threads - pool = multiprocessing.Pool(processes=args.num_thread) - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - pool.apply_async(analyze, (datas[i], output_file, i)) - pool.close() - pool.join() - - logging.info("Stage-3: Sort and Write Result") - datas = [] - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - with open(output_file, "r", encoding='utf8') as f: - for line in f.readlines(): - data = json.loads(line) - datas.append(data) - total_dur = sum([x['dur'] for x in datas]) - total_len = sum([x['txt_length'] for x in datas]) - total_leading_sil = sum([x['leading_sil'] for x in datas]) - total_trailing_sil = sum([x['trailing_sil'] for x in datas]) - num_datas = len(datas) - names = ['key', 'dur', 'txt_length', 'speed', - 'leading_sil', 'trailing_sil'] - units = ['', 's', '', 'char/s', 'ms', 'ms'] - avgs = [0, total_dur / num_datas, total_len / num_datas, - total_len / total_dur, total_leading_sil / num_datas, - total_trailing_sil / num_datas] - stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]), - sum([(x['txt_length'] - avgs[2])**2 for x in datas]), - sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]), - sum([(x['leading_sil'] - avgs[4])**2 for x in datas]), - sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])] - stds = [math.sqrt(x / num_datas) for x in stds] - parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min'] - index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - - with open(args.output_dir + "/analyze_result_brief", - "w", encoding='utf8') as f: - for i, (name, unit, avg, std) in enumerate( - zip(names, units, avgs, stds)): - if name == 'key': - continue - f.write("==================\n") - - datas.sort(key=lambda x: x[name]) - for p, j in zip(parts, index): - f.write("{} {}: {:.3f} {} (wav_id: {})\n".format( - p, name, datas[j][name], unit, datas[j]['key'])) - f.write("avg {}: {:.3f} {}\n".format( - name, avg, unit)) - f.write("std {}: {:.3f}\n".format( - name, std)) - os.system("cat {}".format(args.output_dir + "/analyze_result_brief")) - - datas.sort(key=lambda x: x['dur']) - with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f: - for data in datas: - f.write("{}\n".format(json.dumps(data, 
ensure_ascii=False))) - - end_time = time.time() - logging.info("Time Cost: {:.3f}s".format(end_time - start_time)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/cmvn_kaldi2json.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/cmvn_kaldi2json.py deleted file mode 100644 index 9966046c95a9d50438c4857b785cb7985182e376..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/cmvn_kaldi2json.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import logging -import sys -import json - -def kaldi2json(kaldi_cmvn_file): - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - cmvn_info = {'mean_stat:' : means, - 'var_stat' : variance, - 'frame_num' : count} - return cmvn_info - -if __name__ == '__main__': - with open(sys.argv[2], 'w') as fout: - cmvn = kaldi2json(sys.argv[1]) - fout.write(json.dumps(cmvn)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/combine_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/combine_data.sh deleted file mode 100644 index 8a56c43f1a2a238d78270f94f3d22f1af540e912..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/combine_data.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2014 David Snyder - -# This script combines the data from multiple source directories into -# a single destination directory. - -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information -# about what these directories contain. - -# Begin configuration section. -extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." -skip_fix=false # skip the fix_data_dir.sh in the end -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi - -if [ $# -lt 2 ]; then - echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." - echo "Note, files that don't appear in all source dirs will not be combined," - echo "with the exception of utt2uniq and segments, which are created where necessary." - exit 1 -fi - -dest=$1; -shift; - -first_src=$1; - -rm -r $dest 2>/dev/null -mkdir -p $dest; - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/utt2spk ]; then - echo "$0: no such file $dir/utt2spk" - exit 1; - fi -done - -# Check that frame_shift are compatible, where present together with features. -dir_with_frame_shift= -for dir in $*; do - if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then - if [[ $dir_with_frame_shift ]] && - ! 
cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then - echo "$0:error: different frame_shift in directories $dir and " \ - "$dir_with_frame_shift. Cannot combine features." - exit 1; - fi - dir_with_frame_shift=$dir - fi -done - -# W.r.t. utt2uniq file the script has different behavior compared to other files -# it is not compulsary for it to exist in src directories, but if it exists in -# even one it should exist in all. We will create the files where necessary -has_utt2uniq=false -for in_dir in $*; do - if [ -f $in_dir/utt2uniq ]; then - has_utt2uniq=true - break - fi -done - -if $has_utt2uniq; then - # we are going to create an utt2uniq file in the destdir - for in_dir in $*; do - if [ ! -f $in_dir/utt2uniq ]; then - # we assume that utt2uniq is a one to one mapping - cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' - else - cat $in_dir/utt2uniq - fi - done | sort -k1 > $dest/utt2uniq - echo "$0: combined utt2uniq" -else - echo "$0 [info]: not combining utt2uniq as it does not exist" -fi -# some of the old scripts might provide utt2uniq as an extrafile, so just remove it -extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") - -# segments are treated similarly to utt2uniq. If it exists in some, but not all -# src directories, then we generate segments where necessary. -has_segments=false -for in_dir in $*; do - if [ -f $in_dir/segments ]; then - has_segments=true - break - fi -done - -if $has_segments; then - for in_dir in $*; do - if [ ! -f $in_dir/segments ]; then - echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 - utils/data/get_segments_for_data.sh $in_dir - else - cat $in_dir/segments - fi - done | sort -k1 > $dest/segments - echo "$0: combined segments" -else - echo "$0 [info]: not combining segments as it does not exist" -fi - -for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do - exists_somewhere=false - absent_somewhere=false - for d in $*; do - if [ -f $d/$file ]; then - exists_somewhere=true - else - absent_somewhere=true - fi - done - - if ! $absent_somewhere; then - set -o pipefail - ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; - set +o pipefail - echo "$0: combined $file" - else - if ! $exists_somewhere; then - echo "$0 [info]: not combining $file as it does not exist" - else - echo "$0 [info]: **not combining $file as it does not exist everywhere**" - fi - fi -done - -tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt - -if [[ $dir_with_frame_shift ]]; then - cp $dir_with_frame_shift/frame_shift $dest -fi - -if ! 
$skip_fix ; then - tools/fix_data_dir.sh $dest || exit 1; -fi - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/compute-cer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/compute-cer.py deleted file mode 100644 index a0a8f8fe1f59251c5d8fefeb62ef469276fc6063..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/compute-cer.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import sys -import unicodedata -import codecs - -remove_tag = True -spacelist = [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - # https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. - sep = ' ' - if char == '<': - sep = '>' - j = i + 1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c == sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: - return '' - chars = [] - i = 0 - T = len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - if x.isalnum(): - for k in x: - new_sentence.append(k) - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i - 1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - 
min_error = error - dist = self.space[i][j - 1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, - 'ins': 0, 'del': 0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i={i} , j={j} , \ - error={error}'. 
- format(i=i, j=j, error=self.space[i][j]['error'])) - return result - - def overall(self) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def cluster(self, data) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [unicodedata.name(char) for char in word] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names) - 1) : - if unicode_names[i] != unicode_names[i + 1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) \ - and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] \ - [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \ - [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose = 1 - padding_symbol = ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose = 0 - try: - verbose = int(b) - except Exception: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol = ' ' - elif b == 'underline': - padding_symbol = '_' - continue - if True or sys.argv[1].startswith('-'): - # ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig = set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, - case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : 
- if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array) == 0: - continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length - len_lab) - space['rec'].append(length - len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end=' ') - else: - print('lab:', end=' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['lab'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end=' ') - else: - print('rec:', end=' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['rec'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===================================================' - '========================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster(k for k in default_clusters[cluster_id]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 
100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif (token[0] == '<' and token[len(token) - 1] == '>' and - cluster_id == ''): - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('=======================================' - '====================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/compute-wer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/compute-wer.py deleted file mode 100644 index a3eefc0dc7b67f252e685da71a5189312e74ef85..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/compute-wer.py +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import re, sys, unicodedata -import codecs - -remove_tag = True -spacelist= [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - #https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. - sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 
0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) - -def width(string): - 
return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . / - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if 
sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array)==0: continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - 
print('===========================================================================')
-            print()
-
-    result = calculator.overall()
-    if result['all'] != 0 :
-        wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
-    else :
-        wer = 0.0
-    print('Overall -> %4.2f %%' % wer, end = ' ')
-    print('N=%d C=%d S=%d D=%d I=%d' %
-          (result['all'], result['cor'], result['sub'], result['del'], result['ins']))
-    if not verbose:
-        print()
-
-    if verbose:
-        for cluster_id in default_clusters :
-            result = calculator.cluster([ k for k in default_clusters[cluster_id] ])
-            if result['all'] != 0 :
-                wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
-            else :
-                wer = 0.0
-            print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ')
-            print('N=%d C=%d S=%d D=%d I=%d' %
-                  (result['all'], result['cor'], result['sub'], result['del'], result['ins']))
-        if len(cluster_file) > 0 : # compute separated WERs for word clusters
-            cluster_id = ''
-            cluster = []
-            for line in open(cluster_file, 'r', encoding='utf-8') :
-                for token in line.decode('utf-8').rstrip('\n').split() :
-                    # end of cluster reached, like </Keyword>
-                    if token[0:2] == '</' and token[len(token)-1] == '>' and \
-                       token.lstrip('</').rstrip('>') == cluster_id :
-                        result = calculator.cluster(cluster)
-                        if result['all'] != 0 :
-                            wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
-                        else :
-                            wer = 0.0
-                        print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ')
-                        print('N=%d C=%d S=%d D=%d I=%d' %
-                              (result['all'], result['cor'], result['sub'], result['del'], result['ins']))
-                        cluster_id = ''
-                        cluster = []
-                    # begin of cluster reached, like <Keyword>
-                    elif token[0] == '<' and token[len(token)-1] == '>' and \
-                         cluster_id == '' :
-                        cluster_id = token.lstrip('<').rstrip('>')
-                        cluster = []
-                    # general terms, like WEATHER / CAR / ...
- else : - cluster.append(token) - print() - print('===========================================================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/compute_cmvn_stats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/compute_cmvn_stats.py deleted file mode 100644 index 9c89789c47be0c855939469e86040f10398e9d89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/compute_cmvn_stats.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys -import argparse -import json -import codecs -import yaml - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.utils.data import Dataset, DataLoader - -torchaudio.set_audio_backend("sox_io") - - -class CollateFunc(object): - ''' Collate function for AudioDataset - ''' - - def __init__(self, feat_dim, resample_rate): - self.feat_dim = feat_dim - self.resample_rate = resample_rate - pass - - def __call__(self, batch): - mean_stat = torch.zeros(self.feat_dim) - var_stat = torch.zeros(self.feat_dim) - number = 0 - for item in batch: - value = item[1].strip().split(",") - assert len(value) == 3 or len(value) == 1 - wav_path = value[0] - sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate - resample_rate = sample_rate - # len(value) == 3 means segmented wav.scp, - # len(value) == 1 means original wav.scp - if len(value) == 3: - start_frame = int(float(value[1]) * sample_rate) - end_frame = int(float(value[2]) * sample_rate) - waveform, sample_rate = torchaudio.backend.sox_io_backend.load( - filepath=wav_path, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(item[1]) - - waveform = waveform * (1 << 15) - if self.resample_rate != 0 and self.resample_rate != sample_rate: - resample_rate = self.resample_rate - waveform = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - - mat = kaldi.fbank(waveform, - num_mel_bins=self.feat_dim, - dither=0.0, - energy_floor=0.0, - sample_frequency=resample_rate) - mean_stat += torch.sum(mat, axis=0) - var_stat += torch.sum(torch.square(mat), axis=0) - number += mat.shape[0] - return number, mean_stat, var_stat - - -class AudioDataset(Dataset): - def __init__(self, data_file): - self.items = [] - with codecs.open(data_file, 'r', encoding='utf-8') as f: - for line in f: - arr = line.strip().split() - self.items.append((arr[0], arr[1])) - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - return self.items[idx] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='extract CMVN stats') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for processing') - parser.add_argument('--train_config', - default='', - help='training yaml conf') - parser.add_argument('--in_scp', default=None, help='wav scp file') - parser.add_argument('--out_cmvn', - default='global_cmvn', - help='global cmvn file') - - doc = "Print log after every log_interval audios are processed." 
- parser.add_argument("--log_interval", type=int, default=1000, help=doc) - args = parser.parse_args() - - with open(args.train_config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - resample_rate = 0 - if 'resample_conf' in configs['dataset_conf']: - resample_rate = configs['dataset_conf']['resample_conf']['resample_rate'] - print('using resample and new sample rate is {}'.format(resample_rate)) - - collate_func = CollateFunc(feat_dim, resample_rate) - dataset = AudioDataset(args.in_scp) - batch_size = 20 - data_loader = DataLoader(dataset, - batch_size=batch_size, - shuffle=True, - sampler=None, - num_workers=args.num_workers, - collate_fn=collate_func) - - with torch.no_grad(): - all_number = 0 - all_mean_stat = torch.zeros(feat_dim) - all_var_stat = torch.zeros(feat_dim) - wav_number = 0 - for i, batch in enumerate(data_loader): - number, mean_stat, var_stat = batch - all_mean_stat += mean_stat - all_var_stat += var_stat - all_number += number - wav_number += batch_size - - if wav_number % args.log_interval == 0: - print(f'processed {wav_number} wavs, {all_number} frames', - file=sys.stderr, - flush=True) - - cmvn_info = { - 'mean_stat': list(all_mean_stat.tolist()), - 'var_stat': list(all_var_stat.tolist()), - 'frame_num': all_number - } - - with open(args.out_cmvn, 'w') as fout: - fout.write(json.dumps(cmvn_info)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/compute_fbank_feats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/compute_fbank_feats.py deleted file mode 100644 index 4cc7dae54de6e8b24b14148bd3930d19b4d7b28c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/compute_fbank_feats.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging - -import torchaudio -import torchaudio.compliance.kaldi as kaldi - -import wenet.dataset.kaldi_io as kaldi_io - -# The "sox" backends are deprecated and will be removed in 0.9.0 release. 
-# So here we use sox_io backend -torchaudio.set_audio_backend("sox_io") - - -def parse_opts(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--num_mel_bins', - default=80, - type=int, - help='Number of triangular mel-frequency bins') - parser.add_argument('--frame_length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument('--frame_shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument('--dither', - type=int, - default=0.0, - help='Dithering constant (0.0 means no dither)') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_scp', help='wav scp file') - parser.add_argument('out_ark', help='output ark file') - parser.add_argument('out_scp', help='output scp file') - args = parser.parse_args() - return args - - -# wav format: -def load_wav_scp(wav_scp_file): - wav_list = [] - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_list.append((arr[0], arr[1])) - return wav_list - - -# wav format: -def load_wav_scp_dict(wav_scp_file): - wav_dict = {} - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_dict[arr[0]] = arr[1] - return wav_dict - - -# Segments format: -def load_wav_segments(wav_scp_file, segments_file): - wav_dict = load_wav_scp_dict(wav_scp_file) - audio_list = [] - with open(segments_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - key = arr[0] - wav_file = wav_dict[arr[1]] - start = float(arr[2]) - end = float(arr[3]) - audio_list.append((key, wav_file, start, end)) - return audio_list - - -if __name__ == '__main__': - args = parse_opts() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - if args.segments is None: - audio_list = load_wav_scp(args.wav_scp) - else: - audio_list = load_wav_segments(args.wav_scp, args.segments) - - count = 0 - with open(args.out_ark, 'wb') as ark_fout, \ - open(args.out_scp, 'w', encoding='utf8') as scp_fout: - for item in audio_list: - if len(item) == 2: - key, wav_path = item - waveform, sample_rate = torchaudio.load_wav(wav_path) - else: - assert len(item) == 4 - key, wav_path, start, end = item - sample_rate = torchaudio.info(wav_path).sample_rate - frame_offset = int(start * sample_rate) - num_frames = int((end - start) * sample_rate) - waveform, sample_rate = torchaudio.load_wav( - wav_path, frame_offset, num_frames) - - mat = kaldi.fbank(waveform, - num_mel_bins=args.num_mel_bins, - frame_length=args.frame_length, - frame_shift=args.frame_shift, - dither=args.dither, - energy_floor=0.0, - sample_frequency=sample_rate) - mat = mat.detach().numpy() - kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout) - count += 1 - if count % 10000 == 0: - logging.info('Progress {}/{}'.format(count, len(audio_list))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/copy_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/copy_data_dir.sh deleted file mode 100644 index ee880c4c3ca398a58a4e306467c639b0a76310bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/copy_data_dir.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins University (author: Daniel 
Povey) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# feats.scp -# wav.scp -# vad.scp -# spk2utt -# utt2spk -# text -# -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance and/or speaker names. Note, the recording-ids stay the same. -# - - -# begin configuration section -spk_prefix= -utt_prefix= -spk_suffix= -utt_suffix= -validate_opts= # should rarely be needed. -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" - echo "Options" - echo " --spk-prefix= # Prefix for speaker ids, default empty" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --spk-suffix= # Suffix for speaker ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -srcdir=$1 -destdir=$2 - -if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" - exit 1; -fi - -if [ "$destdir" == "$srcdir" ]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -set -e; - -mkdir -p $destdir - -cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map -cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map - -if [ ! -f $srcdir/utt2uniq ]; then - if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then - cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq - fi -else - cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq -fi - -cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ - utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk - -utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt - -if [ -f $srcdir/feats.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp -fi - -if [ -f $srcdir/vad.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp -fi - -if [ -f $srcdir/segments ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments - cp $srcdir/wav.scp $destdir -else # no segments->wav indexed by utt. 
- if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/utt2num_frames ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in frame_shift stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -echo $validate_opts -echo $destdir -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/decode.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/decode.sh deleted file mode 100644 index 1d49b0e48631f4818fb9c464df66904170275a33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/decode.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: binbinzhang@mobvoi.com (Binbin Zhang) -export GLOG_logtostderr=1 -export GLOG_v=2 - -set -e - -nj=1 -chunk_size=-1 -ctc_weight=0.0 -reverse_weight=0.0 -rescoring_weight=1.0 -# For CTC WFST based decoding -fst_path= -dict_path= -acoustic_scale=1.0 -beam=15.0 -lattice_beam=12.0 -min_active=200 -max_active=7000 -blank_skip_thresh=1.0 -length_penalty=0.0 - -. tools/parse_options.sh || exit 1; -if [ $# != 5 ]; then - echo "Usage: $0 [options] " - exit 1; -fi - -if ! which decoder_main > /dev/null; then - echo "decoder_main is not built, please go to runtime/libtorch to build it." - exit 1; -fi - -scp=$1 -label_file=$2 -model_file=$3 -unit_file=$4 -dir=$5 - -mkdir -p $dir/split${nj} - -# Step 1. Split wav.scp -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp" -done -tools/data/split_scp.pl ${scp} ${split_scps} - -# Step 2. Parallel decoding -wfst_decode_opts= -if [ ! 
-z $fst_path ]; then - wfst_decode_opts="--fst_path $fst_path" - wfst_decode_opts="$wfst_decode_opts --beam $beam" - wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path" - wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam" - wfst_decode_opts="$wfst_decode_opts --max_active $max_active" - wfst_decode_opts="$wfst_decode_opts --min_active $min_active" - wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale" - wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh" - wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty" - echo $wfst_decode_opts > $dir/config -fi -for n in $(seq ${nj}); do -{ - decoder_main \ - --rescoring_weight $rescoring_weight \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --chunk_size $chunk_size \ - --wav_scp ${dir}/split${nj}/wav.${n}.scp \ - --model_path $model_file \ - --unit_path $unit_file \ - $wfst_decode_opts \ - --result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log -} & -done -wait - -# Step 3. Merge files -for n in $(seq ${nj}); do - cat ${dir}/split${nj}/${n}.text -done > ${dir}/text -tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf - -# Step 4. Compute WER -python3 tools/compute-wer.py --char=1 --v=1 \ - $label_file $dir/text > $dir/wer diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/feat_to_shape.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/feat_to_shape.sh deleted file mode 100644 index ab6d45c60709dd05a38f8da269d617233d0d39f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/feat_to_shape.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Begin configuration section. -nj=4 -cmd=run.pl -verbose=0 -filetype="" -preprocess_conf="" -# End configuration section. - -help_message=$(cat << EOF -Usage: $0 [options] [] -e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) - -echo "$0 $*" 1>&2 # Print the command line for logging - -. parse_options.sh || exit 1; - -if [ $# -lt 2 ] || [ $# -gt 3 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -scp=$1 -outscp=$2 -data=$(dirname ${scp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${logdir}/feats.${n}.scp" -done - -utils/split_scp.pl ${scp} ${split_scps} - -if [ -n "${preprocess_conf}" ]; then - preprocess_opt="--preprocess-conf ${preprocess_conf}" -else - preprocess_opt="" -fi -if [ -n "${filetype}" ]; then - filetype_opt="--filetype ${filetype}" -else - filetype_opt="" -fi - -${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ - feat-to-len --verbose=${verbose} \ - scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp - -feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -) - -# concatenate the .scp files together. 
-for n in $(seq ${nj}); do - sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp -done > ${outscp} - -rm -f ${logdir}/feats.*.scp 2>/dev/null diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/filter_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/filter_scp.pl deleted file mode 100644 index b76d37f41be0886470281978bfacf97f6b8ae976..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/filter_scp.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose "n-th" field is an utterance id), printing -# out only those lines whose "n-th" field is in id_list. The index of -# the "n-th" field is 1, by default, but can be changed by using -# the -f switch - -$exclude = 0; -$field = 1; -$shifted = 0; - -do { - $shifted=0; - if ($ARGV[0] eq "--exclude") { - $exclude = 1; - shift @ARGV; - $shifted=1; - } - if ($ARGV[0] eq "-f") { - $field = $ARGV[1]; - shift @ARGV; shift @ARGV; - $shifted=1 - } -} while ($shifted); - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . - "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . - "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . - "only the lines that were *not* in id_list.\n" . - "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . - "If your older scripts (written before Oct 2014) stopped working and you used the\n" . - "-f option, add 1 to the argument.\n" . - "See also: utils/filter_scp.pl .\n"; -} - - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -if ($field == 1) { # Treat this as special case, since it is common. - while(<>) { - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { - print $_; - } - } -} else { - while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { - print $_; - } - } -} - -# tests: -# the following should print "foo 1" -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) -# the following should print "bar 2". 
-# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fix_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fix_data_dir.sh deleted file mode 100644 index d1644c1cac4264c78eae7d91b03c4126baf7ec4c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fix_data_dir.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# This script makes sure that only the segments present in -# all of "feats.scp", "wav.scp" [if present], segments [if present] -# text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into -# data-dir/.backup - -cmd="$@" - -utt_extra_files= -spk_extra_files= - -. tools/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: utils/data/fix_data_dir.sh " - echo "e.g.: utils/data/fix_data_dir.sh data/train" - echo "This script helps ensure that the various files in a data directory" - echo "are correctly sorted and filtered, for example removing utterances" - echo "that have no features (if feats.scp is present)" - exit 1 -fi - -data=$1 - -if [ -f $data/images.scp ]; then - image/fix_data_dir.sh $cmd - exit $? -fi - -mkdir -p $data/.backup - -[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; - -[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; - -set -e -o pipefail -u - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted { - file=$1 - sort -k1,1 -u <$file >$file.tmp - if ! cmp -s $file $file.tmp; then - echo "$0: file $1 is not in sorted order or not unique, sorting it" - mv $file.tmp $file - else - rm $file.tmp - fi -} - -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - check_sorted $data/$x - fi -done - - -function filter_file { - filter=$1 - file_to_filter=$2 - cp $file_to_filter ${file_to_filter}.tmp - tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter - if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=$(cat ${file_to_filter}.tmp | wc -l) - length2=$(cat ${file_to_filter} | wc -l) - if [ $length1 -ne $length2 ]; then - echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." - fi - fi - rm $file_to_filter.tmp -} - -function filter_recordings { - # We call this once before the stage when we filter on utterance-id, and once - # after. - - if [ -f $data/segments ]; then - # We have a segments file -> we need to filter this and the file wav.scp, and - # reco2file_and_utt, if it exists, to make sure they have the same list of - # recording-ids. - - if [ ! -f $data/wav.scp ]; then - echo "$0: $data/segments exists but not $data/wav.scp" - exit 1; - fi - awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=$(cat $tmpdir/recordings | wc -l) - [ ! -s $tmpdir/recordings ] && \ - echo "Empty list of recordings (bad file $data/segments)?" 
&& exit 1; - tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp - mv $tmpdir/recordings.tmp $tmpdir/recordings - - - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - filter_file $tmpdir/recordings $data/segments - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - rm $data/segments.tmp - - filter_file $tmpdir/recordings $data/wav.scp - [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur - true - fi -} - -function filter_speakers { - # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... - tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do - f=$data/$s - if [ -f $f ]; then - filter_file $f $tmpdir/speakers - fi - done - - filter_file $tmpdir/speakers $data/spk2utt - tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - - for s in cmvn.scp spk2gender $spk_extra_files; do - f=$data/$s - if [ -f $f ]; then - filter_file $tmpdir/speakers $f - fi - done -} - -function filter_utts { - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - echo "$(cat $tmpdir/utts | wc -l)" - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; - - ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; - - ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ - echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - - if [ -f $data/utt2uniq ]; then - ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ - echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; - fi - - maybe_wav= - maybe_reco2dur= - [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. - [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - - maybe_utt2dur= - if [ -f $data/utt2dur ]; then - cat $data/utt2dur | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1 - maybe_utt2dur=utt2dur.ok - fi - - maybe_utt2num_frames= - if [ -f $data/utt2num_frames ]; then - cat $data/utt2num_frames | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1 - maybe_utt2num_frames=utt2num_frames.ok - fi - - for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do - if [ -f $data/$x ]; then - tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp - echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)" - mv $tmpdir/utts.tmp $tmpdir/utts - # echo "$tmpdir/utts" - fi - done - rm $data/utt2dur.ok 2>/dev/null || true - rm $data/utt2num_frames.ok 2>/dev/null || true - - [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ - rm $tmpdir/utts && exit 1; - - - if [ -f $data/utt2spk ]; then - new_nutts=$(cat $tmpdir/utts | wc -l) - old_nutts=$(cat $data/utt2spk | wc -l) - if [ $new_nutts -ne $old_nutts ]; then - echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" - else - echo "fix_data_dir.sh: kept all $old_nutts utterances." 
- fi - fi - - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then - tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x - fi - fi - done - -} - -filter_recordings -filter_speakers -filter_utts -filter_speakers -filter_recordings - -tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - -echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/flake8_hook.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/flake8_hook.py deleted file mode 100644 index bbe21bf4aa8ab460aca0eba5a24785e4d6b2c39d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -import sys - -from flake8.main import git - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/format_data.sh deleted file mode 100644 index 51f4602dfa0bac7873541c7f621ef4bb9eb29c94..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/format_data.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Mobvoi Corporation (Author: Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -echo "$0 $*" >&2 # Print the command line for logging -. ./path.sh - -nj=1 -cmd=run.pl -nlsyms="" -lang="" -feat="" -feat_type="kaldi" -oov="" -bpecode="" -allow_one_column=false -raw="" -verbose=0 -trans_type=char -filetype="" -preprocess_conf="" -category="" -out="" # If omitted, write in stdout -help_message=$(cat << EOF -Usage: $0 -e.g. $0 data/train data/lang_1char/train_units.txt -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --feat # feat.scp or feat1.scp,feat2.scp,... - --feat-type # kaldi or wav - --oov # Default: - --out # If omitted, write in stdout - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) -. tools/parse_options.sh - -if [ $# != 2 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -dir=$1 -dic=$2 -tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) -#trap 'rm -rf ${tmpdir}' EXIT - -# 1. 
Create scp files for inputs -# These are not necessary for decoding mode, and make it as an option -input= -if [ -n "${feat}" ]; then - _feat_scps=$(echo "${feat}" | tr ',' ' ' ) - read -r -a feat_scps <<< $_feat_scps - num_feats=${#feat_scps[@]} - - for (( i=1; i<=num_feats; i++ )); do - feat=${feat_scps[$((i-1))]} - mkdir -p ${tmpdir}/input_${i} - input+="input_${i} " - cat ${feat} > ${tmpdir}/input_${i}/feat.scp - - # Dump in the "legacy" style JSON format - if [ -n "${filetype}" ]; then - awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ - > ${tmpdir}/input_${i}/filetype.scp - fi - - if [ ${feat_type} == "kaldi" ]; then - tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ - --filetype "${filetype}" \ - --preprocess-conf "${preprocess_conf}" \ - --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp - elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then - if [ -f $dir/segments ]; then - # used for segmented wav.scp - awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur - fi - if [ ! -f $dir/utt2dur ]; then - tools/wav_to_duration.sh --nj ${nj} \ - ${feat} ${tmpdir}/input_${i}/shape.scp - # use the existed utt2dur as shape.scp directly - else - cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp - fi - fi - done -fi - -# 2. Create scp files for outputs -mkdir -p ${tmpdir}/output -if [ -n "${bpecode}" ]; then - if [ "${trans_type}" == "cn_char_en_bpe" ]; then - tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp - else - paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \ - | tools/spm_encode --model=${bpecode} --output_format=piece) \ - > ${tmpdir}/output/token.scp - fi -elif [ -n "${nlsyms}" ]; then - tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -elif [ -n "${raw}" ]; then - cat $dir/text > ${tmpdir}/output/token.scp -else - tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -fi -< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp -odim=$(cat ${dic} | wc -l) -< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp - -cat ${dir}/text > ${tmpdir}/output/text.scp - -# 3. Create scp files for the others -mkdir -p ${tmpdir}/other -if [ -n "${lang}" ]; then - awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp -fi - -if [ -n "${category}" ]; then - awk -v category=${category} '{print $1 " " category}' ${dir}/text \ - > ${tmpdir}/other/category.scp -fi -#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp - -# 4. 
Merge scp files into a one file -opts="" -for intype in ${input} output other; do - if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then - continue - fi - - if [ ${intype} != other ]; then - opts+="--${intype%_*}-scps " - else - opts+="--scps " - fi - - for x in "${tmpdir}/${intype}"/*.scp; do - k=$(basename ${x} .scp) - if [ ${k} = shape ]; then - opts+="shape:${x}:shape " - else - opts+="${k}:${x} " - fi - done -done - -if ${allow_one_column}; then - opts+="--allow-one-column true " -else - opts+="--allow-one-column false " -fi - -if [ -n "${out}" ]; then - opts+="-O ${out}" -fi - -tools/merge_scp2txt.py --verbose ${verbose} ${opts} - -#rm -fr ${tmpdir} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/add_lex_disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/add_lex_disambig.pl deleted file mode 100644 index dd8a25de6e1140a6d19b1e876f2e76f528532edf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/add_lex_disambig.pl +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2013-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. -# With the --pron-probs option, expects the second field -# of each lexicon line to be a pron-prob. -# With the --sil-probs option, expects three additional -# fields after the pron-prob, representing various components -# of the silence probability model. - -$pron_probs = 0; -$sil_probs = 0; -$first_allowed_disambig = 1; - -for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { - if ($ARGV[0] eq "--pron-probs") { - $pron_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--sil-probs") { - $sil_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--first-allowed-disambig") { - $first_allowed_disambig = 0 + $ARGV[1]; - if ($first_allowed_disambig < 1) { - die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; - } - shift @ARGV; - shift @ARGV; - } -} - -if (@ARGV != 2) { - die "Usage: add_lex_disambig.pl [opts] \n" . - "This script adds disambiguation symbols to a lexicon in order to\n" . - "make decoding graphs determinizable; it adds pseudo-phone\n" . - "disambiguation symbols #1, #2 and so on at the ends of phones\n" . - "to ensure that all pronunciations are different, and that none\n" . - "is a prefix of another.\n" . - "It prints to the standard output the number of the largest-numbered" . - "disambiguation symbol that was used.\n" . - "\n" . - "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . 
- " --sil-probs [should be with --pron-probs option]\n" . - " Expect 3 extra fields after the pron-probs, for aspects of\n" . - " the silence probability model\n" . - " --first-allowed-disambig The number of the first disambiguation symbol\n" . - " that this script is allowed to add. By default this is\n" . - " #1, but you can set this to a larger value using this option.\n" . - "e.g.:\n" . - " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { - $p = shift @A; - if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } - } - if ($sil_probs) { - $silp = shift @A; - if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - } - if (!(@A)) { - die "Bad lexicon line $1, no phone in phone list"; - } - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that it exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { shift @A; } # remove pron-prob. - if ($sil_probs) { - shift @A; # Remove silprob - shift @A; # Remove silprob - } - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -# max_disambig will always be the highest-numbered disambiguation symbol that -# has been used so far. -$max_disambig = $first_allowed_disambig - 1; - -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - if ($pron_probs) { - $pron_prob = shift @A; - } - if ($sil_probs) { - $sil_word_prob = shift @A; - $word_sil_correction = shift @A; - $prev_nonsil_correction = shift @A - } - $phnseq = join(" ", @A); - if (!defined $issubseq{$phnseq} - && $count{$phnseq} == 1) { - ; # Do nothing. - } else { - if ($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved_for_the_empty_string{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; - if (!defined $cur_disambig) { - $cur_disambig = $first_allowed_disambig; - } else { - $cur_disambig++; # Get a number that has not been used yet for - # this phone sequence. - } - while (defined $reserved_for_the_empty_string{$cur_disambig}) { - $cur_disambig++; - } - if ($cur_disambig > $max_disambig) { - $max_disambig = $cur_disambig; - } - $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; - $phnseq = $phnseq . " #" . 
$cur_disambig; - } - } - if ($pron_probs) { - if ($sil_probs) { - print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; - } else { - print O "$word\t$pron_prob\t$phnseq\n"; - } - } else { - print O "$word\t$phnseq\n"; - } -} - -print $max_disambig . "\n"; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/compile_lexicon_token_fst.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/compile_lexicon_token_fst.sh deleted file mode 100644 index b67814fe3f3244b14b8e494bfe46c4829c4f8bd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/compile_lexicon_token_fst.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -# Copyright 2015 Yajie Miao (Carnegie Mellon University) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the -# phoneme and character-based lexicons. -set -eo pipefail -. tools/parse_options.sh - -if [ $# -ne 3 ]; then - echo "usage: tools/fst/compile_lexicon_token_fst.sh " - echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" - echo " should contain the following files:" - echo "lexicon.txt units.txt" - echo "options: " - exit 1; -fi - -srcdir=$1 -tmpdir=$2 -dir=$3 -mkdir -p $dir $tmpdir - -[ -f path.sh ] && . ./path.sh - -export LC_ALL=C - -cp $srcdir/units.txt $dir - -# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. -# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. -perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; - -# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. -# Without these symbols, determinization will fail. -ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` -ndisambig=$[$ndisambig+1]; - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list - -# Get the full list of CTC tokens used in FST. These tokens include , the blank , -# the actual model unit, and the disambiguation symbols. -cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list -(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt - -# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, -# so here use ctc_token_fst_compact -tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; - -# Encode the words with indices. 
Will be used in lexicon and language model FST compiling.
-cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk '
-  BEGIN {
-    print "<eps> 0";
-  }
-  {
-    printf("%s %d\n", $1, NR);
-  }
-  END {
-    printf("#0 %d\n", NR+1);
-    printf("<s> %d\n", NR+2);
-    printf("</s> %d\n", NR+3);
-  }' > $dir/words.txt || exit 1;
-
-# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time.
-token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'`
-word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'`
-
-tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \
-  fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \
-  --keep_isymbols=false --keep_osymbols=false | \
-  fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \
-  fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
-
-echo "Lexicon and token FSTs compiling succeeded"
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/ctc_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/ctc_token_fst.py
deleted file mode 100644
index d81644b9cd216177a10a17772781d3293abe084f..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/ctc_token_fst.py
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-print('0 1 <eps> <eps>')
-print('1 1 <blank> <eps>')
-print('2 2 <blank> <eps>')
-print('2 0 <eps> <eps>')
-
-with open(sys.argv[1], 'r', encoding='utf8') as fin:
-    node = 3
-    for entry in fin:
-        fields = entry.strip().split(' ')
-        phone = fields[0]
-        if phone == '<eps>' or phone == '<blank>':
-            continue
-        elif '#' in phone:  # disambiguous phone
-            print('{} {} {} {}'.format(0, 0, '<eps>', phone))
-        else:
-            print('{} {} {} {}'.format(1, node, phone, phone))
-            print('{} {} {} {}'.format(node, node, phone, '<eps>'))
-            print('{} {} {} {}'.format(node, 2, '<eps>', '<eps>'))
-            node += 1
-print('0')
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/ctc_token_fst_compact.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/ctc_token_fst_compact.py
deleted file mode 100644
index d3018d8b14ce25108cb1acc637cecded5d41be13..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/ctc_token_fst_compact.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-print('0 0 <blank> <eps>')
-
-with open(sys.argv[1], 'r', encoding='utf8') as fin:
-    node = 1
-    for entry in fin:
-        fields = entry.strip().split(' ')
-        phone = fields[0]
-        if phone == '<eps>' or phone == '<blank>':
-            continue
-        elif '#' in phone:  # disambiguous phone
-            print('{} {} {} {}'.format(0, 0, '<eps>', phone))
-        else:
-            print('{} {} {} {}'.format(0, node, phone, phone))
-            print('{} {} {} {}'.format(node, node, phone, '<eps>'))
-            print('{} {} {} {}'.format(node, 0, '<eps>', '<eps>'))
-            node += 1
-print('0')
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/ctc_token_fst_corrected.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/ctc_token_fst_corrected.py
deleted file mode 100644
index 81f7079eccb9e6447c46cdfdf6378aca7efe4a09..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/ctc_token_fst_corrected.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-
-def il(n):
-    return n + 1
-
-
-def ol(n):
-
return n + 1 - - -def s(n): - return n - - -if __name__ == "__main__": - with open(sys.argv[1]) as f: - lines = f.readlines() - phone_count = 0 - disambig_count = 0 - for line in lines: - sp = line.split() - phone = sp[0] - if phone == '' or phone == '': - continue - if phone.startswith('#'): - disambig_count += 1 - else: - phone_count += 1 - - # 1. add start state - print('0 0 {} 0'.format(il(0))) - - # 2. 0 -> i, i -> i, i -> 0 - for i in range(1, phone_count + 1): - print('0 {} {} {}'.format(s(i), il(i), ol(i))) - print('{} {} {} 0'.format(s(i), s(i), il(i))) - print('{} 0 {} 0'.format(s(i), il(0))) - - # 3. i -> other phone - for i in range(1, phone_count + 1): - for j in range(1, phone_count + 1): - if i != j: - print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) - - # 4. add disambiguous arcs on every final state - for i in range(0, phone_count + 1): - for j in range(phone_count + 2, phone_count + disambig_count + 2): - print('{} {} {} {}'.format(s(i), s(i), 0, j)) - - # 5. every i is final state - for i in range(0, phone_count + 1): - print(s(i)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/eps2disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/eps2disambig.pl deleted file mode 100644 index e1d84a6bf56703596a0e4552d184f7168f724bcb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/eps2disambig.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - if (/\s+#0\s+/) { - print STDERR "$0: ERROR: LM has word #0, " . - "which is reserved as disambiguation symbol\n"; - exit 1; - } - s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; - print; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/make_lexicon_fst.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/make_lexicon_fst.pl deleted file mode 100644 index f97129c05cb3ba6460be401e92001261acfaf746..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/make_lexicon_fst.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional).
-
-$pron_probs = 0;
-
-if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) {
-  $pron_probs = 1;
-  shift @ARGV;
-}
-
-if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) {
-  print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n";
-  print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n";
-  print STDERR "Note: ordinarily, each line of lexicon.txt is:\n";
-  print STDERR " word phone1 phone2 ... phoneN;\n";
-  print STDERR "if the --pron-probs option is used, each line is:\n";
-  print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n";
-  print STDERR "The probability 'prob' will typically be between zero and one, and note that\n";
-  print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n";
-  print STDERR "this is your responsibility.\n\n";
-  print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n";
-  print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n";
-  print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n";
-  exit(1);
-}
-
-$lexfn = shift @ARGV;
-if (@ARGV == 0) {
-  $silprob = 0.0;
-} elsif (@ARGV == 2) {
-  ($silprob,$silphone) = @ARGV;
-} else {
-  ($silprob,$silphone,$sildisambig) = @ARGV;
-}
-if ($silprob != 0.0) {
-  $silprob < 1.0 || die "Sil prob cannot be >= 1.0";
-  $silcost = -log($silprob);
-  $nosilcost = -log(1.0 - $silprob);
-}
-
-
-open(L, "<$lexfn") || die "Error opening lexicon $lexfn";
-
-
-if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero.
-  $loopstate = 0;
-  $nextstate = 1; # next unallocated state.
-  while (<L>) {
-    @A = split(" ", $_);
-    @A == 0 && die "Empty lexicon line.";
-    foreach $a (@A) {
-      if ($a eq "<eps>") {
-        die "Bad lexicon line $_ (<eps> is forbidden)";
-      }
-    }
-    $w = shift @A;
-    if (! $pron_probs) {
-      $pron_cost = 0.0;
-    } else {
-      $pron_prob = shift @A;
-      if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) {
-        die "Bad pronunciation probability in line $_";
-      }
-      $pron_cost = -log($pron_prob);
-    }
-    if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; }
-
-    $s = $loopstate;
-    $word_or_eps = $w;
-    while (@A > 0) {
-      $p = shift @A;
-      if (@A > 0) {
-        $ns = $nextstate++;
-      } else {
-        $ns = $loopstate;
-      }
-      print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n";
-      $word_or_eps = "<eps>";
-      $pron_cost_string = ""; # so we only print it on the first arc of the word.
-      $s = $ns;
-    }
-  }
-  print "$loopstate\t0\n"; # final-cost.
-} else { # have silence probs.
-  $startstate = 0;
-  $loopstate = 1;
-  $silstate = 2; # state from where we go to loopstate after emitting silence.
-  print "$startstate\t$loopstate\t<eps>\t<eps>\t$nosilcost\n"; # no silence.
-  if (!defined $sildisambig) {
-    print "$startstate\t$loopstate\t$silphone\t<eps>\t$silcost\n"; # silence.
-    print "$silstate\t$loopstate\t$silphone\t<eps>\n"; # no cost.
-    $nextstate = 3;
-  } else {
-    $disambigstate = 3;
-    $nextstate = 4;
-    print "$startstate\t$disambigstate\t$silphone\t<eps>\t$silcost\n"; # silence.
-    print "$silstate\t$disambigstate\t$silphone\t<eps>\n"; # no cost.
-    print "$disambigstate\t$loopstate\t$sildisambig\t<eps>\n"; # silence disambiguation symbol.
-  }
-  while (<L>) {
-    @A = split(" ", $_);
-    $w = shift @A;
-    if (! $pron_probs) {
-      $pron_cost = 0.0;
-    } else {
-      $pron_prob = shift @A;
-      if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) {
-        die "Bad pronunciation probability in line $_";
-      }
-      $pron_cost = -log($pron_prob);
-    }
-    if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; }
-    $s = $loopstate;
-    $word_or_eps = $w;
-    while (@A > 0) {
-      $p = shift @A;
-      if (@A > 0) {
-        $ns = $nextstate++;
-        print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n";
-        $word_or_eps = "<eps>";
-        $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time.
-        $s = $ns;
-      } elsif (!defined($silphone) || $p ne $silphone) {
-        # This is non-deterministic but relatively compact,
-        # and avoids epsilons.
-        $local_nosilcost = $nosilcost + $pron_cost;
-        $local_silcost = $silcost + $pron_cost;
-        print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n";
-        print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n";
-      } else {
-        # no point putting opt-sil after silence word.
-        print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n";
-      }
-    }
-  }
-  print "$loopstate\t0\n"; # final-cost.
-}
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/make_tlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/make_tlg.sh
deleted file mode 100644
index 98694e5540968760f0c27eaf30a6668f4c46c50d..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/make_tlg.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/bash
-#
-
-if [ -f path.sh ]; then . path.sh; fi
-
-lm_dir=$1
-src_lang=$2
-tgt_lang=$3
-
-arpa_lm=${lm_dir}/lm.arpa
-[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1;
-
-rm -rf $tgt_lang
-cp -r $src_lang $tgt_lang
-
-# Compose the language model to FST
-cat $arpa_lm | \
-  grep -v '<s> <s>' | \
-  grep -v '</s> <s>' | \
-  grep -v '</s> </s>' | \
-  grep -v -i '<unk>' | \
-  grep -v -i '<spoken_noise>' | \
-  arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \
-  tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \
-    --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
-  fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst
-
-
-echo "Checking how stochastic G is (the first of these numbers should be small):"
-fstisstochastic $tgt_lang/G.fst
-
-# Compose the token, lexicon and language-model FST into the final decoding graph
-fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \
-  fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1;
-fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1;
-
-echo "Composing decoding graph TLG.fst succeeded"
-#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/prepare_dict.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/prepare_dict.py
deleted file mode 100644
index 8a6a3cfe7cfded0c863637deef0bae2f2ede5557..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/prepare_dict.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env python3
-# encoding: utf-8
-
-import sys
-
-# sys.argv[1]: e2e model unit file(lang_char.txt)
-# sys.argv[2]: raw lexicon file
-# sys.argv[3]: output lexicon file
-# sys.argv[4]: bpemodel
-
-unit_table = set()
-with open(sys.argv[1], 'r', encoding='utf8') as fin:
-    for line in fin:
-        unit = line.split()[0]
-        unit_table.add(unit)
-
-
-def contain_oov(units):
-    for unit in units:
-        if unit not in unit_table:
-            return True
-    return False
-
-
-bpemode = len(sys.argv) > 4
-if bpemode:
-    import sentencepiece as spm
-    sp = spm.SentencePieceProcessor()
-    sp.Load(sys.argv[4])
-lexicon_table = set()
-with open(sys.argv[2], 'r', encoding='utf8') as fin, \
-        open(sys.argv[3], 'w', encoding='utf8') as fout:
-    for line in fin:
-        word = line.split()[0]
-        if word == 'SIL' and not bpemode:  # `sil` might be a valid piece in bpemodel
-            continue
-        elif word == '<SPOKEN_NOISE>':
-            continue
-        else:
-            # each word only has one pronunciation for e2e system
-            if word in lexicon_table:
-                continue
-            if bpemode:
-                # We assume that the lexicon does not contain code-switch,
-                # i.e. the word contains both English and Chinese.
-                # see PR https://github.com/wenet-e2e/wenet/pull/1693
-                # and Issue https://github.com/wenet-e2e/wenet/issues/1653
-                if word.encode('utf8').isalpha():
-                    pieces = sp.EncodeAsPieces(word)
-                else:
-                    pieces = word
-                if contain_oov(pieces):
-                    print(
-                        'Ignoring words {}, which contains oov unit'.format(
-                            ''.join(word).strip('▁'))
-                    )
-                    continue
-                chars = ' '.join(
-                    [p if p in unit_table else '<unk>' for p in pieces])
-            else:
-                # ignore words with OOV
-                if contain_oov(word):
-                    print('Ignoring words {}, which contains oov unit'.format(word))
-                    continue
-            # Optional, append ▁ in front of english word
-            # we assume the model unit of our e2e system is char now.
-            if word.encode('utf8').isalpha() and '▁' in unit_table:
-                word = '▁' + word
-            chars = ' '.join(word)  # word is a char list
-            fout.write('{} {}\n'.format(word, chars))
-            lexicon_table.add(word)
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/remove_oovs.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/remove_oovs.pl
deleted file mode 100644
index ac914c3bd9363eded791cdeb309fd05e980c4f2e..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/remove_oovs.pl
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/usr/bin/env perl
-# Copyright 2010-2011 Microsoft Corporation
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-# This script removes lines that contain these OOVs on either the
-# third or fourth fields of the line. It is intended to remove arcs
-# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in).
-
-if ( @ARGV < 1 && @ARGV > 2) {
-  die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n";
-}
-
-$unklist = shift @ARGV;
-open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n";
-while(<S>){
-  @A = split(" ", $_);
-  @A == 1 || die "Bad line in unknown-symbol list: $_";
-  $unk{$A[0]} = 1;
-}
-
-$num_removed = 0;
-while(<>){
-  @A = split(" ", $_);
-  if(defined $unk{$A[2]} || defined $unk{$A[3]}) {
-    $num_removed++;
-  } else {
-    print;
-  }
-}
-print STDERR "remove_oovs.pl: removed $num_removed lines.\n";
-
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/rnnt_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/rnnt_token_fst.py
deleted file mode 100644
index cc6def1703311ab700a4a01f22c1adda32db9b0d..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/rnnt_token_fst.py
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-print('0 0 <blank> <eps>')
-
-with open(sys.argv[1], 'r', encoding='utf8') as fin:
-    for entry in fin:
-        fields = entry.strip().split(' ')
-        phone = fields[0]
-        if phone == '<eps>' or phone == '<blank>':
-            continue
-        elif '#' in phone:  # disambiguous phone
-            print('{} {} {} {}'.format(0, 0, '<eps>', phone))
-        else:
-            print('{} {} {} {}'.format(0, 0, phone, phone))
-print('0')
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/s2eps.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/s2eps.pl
deleted file mode 100644
index ffeeb8eb6af3c4f319f31ebff80be388d8f59e1a..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/fst/s2eps.pl
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env perl
-# Copyright 2010-2011 Microsoft Corporation
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-# This script replaces <s> and </s> with <eps> (on both input and output sides),
-# for the G.fst acceptor.
-
-while(<>){
-  @A = split(" ", $_);
-  if ( @A >= 4 ) {
-    if ($A[2] eq "<s>" || $A[2] eq "</s>") { $A[2] = "<eps>"; }
-    if ($A[3] eq "<s>" || $A[3] eq "</s>") { $A[3] = "<eps>"; }
-  }
-  print join("\t", @A) . "\n";
-}
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/git-pre-commit b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/git-pre-commit
deleted file mode 100644
index b6e448ed375a0ddf502ce332685de8a99e88dc08..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/git-pre-commit
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-set -e
-
-echo "Running pre-commit flake8"
-python tools/flake8_hook.py
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/install_srilm.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/install_srilm.sh
deleted file mode 100644
index 4aa113c14722a73fd3d3f84430025d44173c207b..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/install_srilm.sh
+++ /dev/null
@@ -1,62 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
-# 2022 Binbin Zhang(binbzha@qq.com)
-
-current_path=`pwd`
-current_dir=`basename "$current_path"`
-
-if [ "tools" != "$current_dir" ]; then
-  echo "You should run this script in tools/ directory!!"
-  exit 1
-fi
-
-! command -v gawk > /dev/null && \
-  echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1;
-
-srilm_url="https://github.com/BitSpeech/SRILM/archive/refs/tags/1.7.3.tar.gz"
-
-if [ ! -f ./srilm.tar.gz ]; then
-  if ! wget -O ./srilm.tar.gz "$srilm_url"; then
-    echo 'There was a problem downloading the file.'
-    echo 'Check you internet connection and try again.'
-    exit 1
-  fi
-fi
-
-tar -zxvf srilm.tar.gz
-mv SRILM-1.7.3 srilm
-
-# set the SRILM variable in the top-level Makefile to this directory.
-cd srilm
-cp Makefile tmpf
-
-cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \
-  > Makefile || exit 1
-rm tmpf
-
-make || exit
-cd ..
-
-(
-  [ ! -z "${SRILM}" ] && \
-    echo >&2 "SRILM variable is aleady defined. Undefining..." && \
-    unset SRILM
-
-  [ -f ./env.sh ] && . ./env.sh
-
-  [ !
-z "${SRILM}" ] && \ - echo >&2 "SRILM config is already in env.sh" && exit - - wd=`pwd` - wd=`readlink -f $wd || pwd` - - echo "export SRILM=$wd/srilm" - dirs="\${PATH}" - for directory in $(cd srilm && find bin -type d ) ; do - dirs="$dirs:\${SRILM}/$directory" - done - echo "export PATH=$dirs" -) >> env.sh - -echo >&2 "Installation of SRILM finished successfully" -echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/k2/make_hlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/k2/make_hlg.sh deleted file mode 100644 index 18c2268487410824ae11b199cf06f37acd717c88..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/k2/make_hlg.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) - -lexion_dir=$1 -lm_dir=$2 -tgt_dir=$3 - -# k2 and icefall updates very fast. Below commits are veryfied in this script. -# k2 3dc222f981b9fdbc8061b3782c3b385514a2d444, icefall 499ac24ecba64f687ff244c7d66baa5c222ecf0f - -# For k2 installation, please refer to https://github.com/k2-fsa/k2/ -python -c "import k2; print(k2.__file__)" -python -c "import torch; import _k2; print(_k2.__file__)" - -# Prepare necessary icefall scripts -if [ ! -d tools/k2/icefall ]; then - git clone --depth 1 https://github.com/k2-fsa/icefall.git tools/k2/icefall -fi -pip3 install -r tools/k2/icefall/requirements.txt -export PYTHONPATH=`pwd`/tools/k2/icefall:`pwd`/tools/k2/icefall/egs/aishell/ASR/local:$PYTHONPATH - -# 8.1 Prepare char based lang -mkdir -p $tgt_dir -python tools/k2/prepare_char.py $lexion_dir/units.txt $lm_dir/wordlist $tgt_dir -echo "Compile lexicon L.pt L_disambig.pt succeeded" - -# 8.2 Prepare G -mkdir -p data/lm -python -m kaldilm \ - --read-symbol-table="$tgt_dir/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $lm_dir/lm.arpa > data/lm/G_3_gram.fst.txt - -# 8.3 Compile HLG -python tools/k2/icefall/egs/aishell/ASR/local/compile_hlg.py --lang-dir $tgt_dir -echo "Compile decoding graph HLG.pt succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/k2/prepare_char.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/k2/prepare_char.py deleted file mode 100644 index 6e05042c42eb280135f6be7cdb3566b185258b90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/k2/prepare_char.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -""" - -This script generates the following files in the directory sys.argv[3]: - - - lexicon.txt - - lexicon_disambig.txt - - L.pt - - L_disambig.pt - - tokens.txt - - words.txt -""" - -import sys -from pathlib import Path -from typing import Dict, List - -import k2 -import torch -from prepare_lang import ( - Lexicon, - add_disambig_symbols, - add_self_loops, - write_lexicon, - write_mapping, -) - - -def lexicon_to_fst_no_sil( - lexicon: Lexicon, - token2id: Dict[str, int], - word2id: Dict[str, int], - need_self_loops: bool = False, -) -> k2.Fsa: - """Convert a lexicon to an FST (in k2 format). - - Args: - lexicon: - The input lexicon. See also :func:`read_lexicon` - token2id: - A dict mapping tokens to IDs. - word2id: - A dict mapping words to IDs. - need_self_loops: - If True, add self-loop to states with non-epsilon output symbols - on at least one arc out of the state. The input label for this - self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. - Returns: - Return an instance of `k2.Fsa` representing the given lexicon. - """ - loop_state = 0 # words enter and leave from here - next_state = 1 # the next un-allocated state, will be incremented as we go - - arcs = [] - - # The blank symbol is defined in local/train_bpe_model.py - assert token2id[""] == 0 - assert word2id[""] == 0 - - eps = 0 - - for word, pieces in lexicon: - assert len(pieces) > 0, f"{word} has no pronunciations" - cur_state = loop_state - - word = word2id[word] - pieces = [ - token2id[i] if i in token2id else token2id[""] for i in pieces - ] - - for i in range(len(pieces) - 1): - w = word if i == 0 else eps - arcs.append([cur_state, next_state, pieces[i], w, 0]) - - cur_state = next_state - next_state += 1 - - # now for the last piece of this word - i = len(pieces) - 1 - w = word if i == 0 else eps - arcs.append([cur_state, loop_state, pieces[i], w, 0]) - - if need_self_loops: - disambig_token = token2id["#0"] - disambig_word = word2id["#0"] - arcs = add_self_loops( - arcs, - disambig_token=disambig_token, - disambig_word=disambig_word, - ) - - final_state = next_state - arcs.append([loop_state, final_state, -1, -1, 0]) - arcs.append([final_state]) - - arcs = sorted(arcs, key=lambda arc: arc[0]) - arcs = [[str(i) for i in arc] for arc in arcs] - arcs = [" ".join(arc) for arc in arcs] - arcs = "\n".join(arcs) - - fsa = k2.Fsa.from_str(arcs, acceptor=False) - return fsa - - -def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool: - """Check if all the given tokens are in token symbol table. - - Args: - token_sym_table: - Token symbol table that contains all the valid tokens. - tokens: - A list of tokens. - Returns: - Return True if there is any token not in the token_sym_table, - otherwise False. - """ - for tok in tokens: - if tok not in token_sym_table: - return True - return False - - -def generate_lexicon( - token_sym_table: Dict[str, int], words: List[str] -) -> Lexicon: - """Generate a lexicon from a word list and token_sym_table. - - Args: - token_sym_table: - Token symbol table that mapping token to token ids. - words: - A list of strings representing words. - Returns: - Return a dict whose keys are words and values are the corresponding - tokens. 
- """ - lexicon = [] - for word in words: - chars = list(word.strip(" \t")) - if contain_oov(token_sym_table, chars): - continue - lexicon.append((word, chars)) - - # The OOV word is - lexicon.append(("", [""])) - return lexicon - - -def generate_tokens(text_file: str) -> Dict[str, int]: - """Generate tokens from the given text file. - - Args: - text_file: - A file that contains text lines to generate tokens. - Returns: - Return a dict whose keys are tokens and values are token ids ranged - from 0 to len(keys) - 1. - """ - token2id: Dict[str, int] = dict() - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - char, index = line.replace('\n', '').split() - assert char not in token2id - token2id[char] = int(index) - assert token2id[''] == 0 - return token2id - - -def generate_words(text_file: str) -> Dict[str, int]: - """Generate words from the given text file. - - Args: - text_file: - A file that contains text lines to generate words. - Returns: - Return a dict whose keys are words and values are words ids ranged - from 0 to len(keys) - 1. - """ - words = [] - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - word = line.replace('\n', '') - assert word not in words - words.append(word) - words.sort() - - # We put '' '' at begining of word2id - # '#0', '', '' at end of word2id - words = [word for word in words - if word not in ['', '', '#0', '', '']] - words.insert(0, '') - words.insert(1, '') - words.append('#0') - words.append('') - words.append('') - word2id = {j: i for i, j in enumerate(words)} - return word2id - - -def main(): - token2id = generate_tokens(sys.argv[1]) - word2id = generate_words(sys.argv[2]) - tgt_dir = Path(sys.argv[3]) - - words = [word for word in word2id.keys() - if word not in - ["", "!SIL", "", "", "#0", "", ""]] - lexicon = generate_lexicon(token2id, words) - - lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) - next_token_id = max(token2id.values()) + 1 - for i in range(max_disambig + 1): - disambig = f"#{i}" - assert disambig not in token2id - token2id[disambig] = next_token_id - next_token_id += 1 - - write_mapping(tgt_dir / "tokens.txt", token2id) - write_mapping(tgt_dir / "words.txt", word2id) - write_lexicon(tgt_dir / "lexicon.txt", lexicon) - write_lexicon(tgt_dir / "lexicon_disambig.txt", lexicon_disambig) - - L = lexicon_to_fst_no_sil( - lexicon, - token2id=token2id, - word2id=word2id, - ) - L_disambig = lexicon_to_fst_no_sil( - lexicon_disambig, - token2id=token2id, - word2id=word2id, - need_self_loops=True, - ) - torch.save(L.as_dict(), tgt_dir / "L.pt") - torch.save(L_disambig.as_dict(), tgt_dir / "L_disambig.pt") - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/latency_metrics.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/latency_metrics.py deleted file mode 100644 index df2d8eee45f8e2d7c8536f208d44fafaeac3341f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/latency_metrics.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2022 Horizon Inc. (author: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import argparse -import logging -import librosa -import torch -import torchaudio -import yaml - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.font_manager as fm -import torchaudio.compliance.kaldi as kaldi - -from wenet.utils.init_model import init_model -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.mask import make_pad_mask -from wenet.utils.common import replace_duplicates_with_blank - - -def get_args(): - parser = argparse.ArgumentParser( - description='Analyze latency and plot CTC-Spike.') - parser.add_argument('--config', required=True, - type=str, help='configration') - parser.add_argument('--gpu', - type=int, - default=0, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--ckpt', required=True, - type=str, help='model checkpoint') - parser.add_argument('--tag', required=True, - type=str, help='image subtitle') - parser.add_argument('--wavscp', required=True, - type=str, help='wav.scp') - parser.add_argument('--alignment', required=True, - type=str, help='force alignment, generated by Kaldi.') - parser.add_argument('--chunk_size', required=True, - type=int, help='chunk size') - parser.add_argument('--left_chunks', default=-1, - type=int, help='left chunks') - parser.add_argument('--font', required=True, - type=str, help='font file') - parser.add_argument('--dict', required=True, - type=str, help='dict file') - parser.add_argument('--result_dir', required=True, - type=str, help='saving pdf') - parser.add_argument('--model_type', default='ctc', - choices=['ctc', 'transducer'], - help='show latency metrics from ctc models or rnn-t models') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - torch.manual_seed(777) - - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - symbol_table = read_symbol_table(args.dict) - char_dict = {v: k for k, v in symbol_table.items()} - - # 1. Load model - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - - model = init_model(conf) - load_checkpoint(model, args.ckpt) - model = model.eval().to(device) - - subsampling = model.encoder.embed.subsampling_rate - eos = model.eos_symbol() - - with open(args.wavscp, 'r') as fin: - wavs = fin.readlines() - - # 2. 
Forward model (get streaming_timestamps) - timestamps = {} - for idx, wav in enumerate(wavs): - if idx % 100 == 0: - logging.info("processed {}.".format(idx)) - key, wav = wav.strip().split(' ', 1) - waveform, sr = torchaudio.load(wav) - resample_rate = conf['dataset_conf']['resample_conf']['resample_rate'] - waveform = torchaudio.transforms.Resample( - orig_freq=sr, new_freq=resample_rate)(waveform) - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank( - waveform, - num_mel_bins=conf['dataset_conf']['fbank_conf']['num_mel_bins'], - frame_length=conf['dataset_conf']['fbank_conf']['frame_length'], - frame_shift=conf['dataset_conf']['fbank_conf']['frame_shift'], - dither=0.0, energy_floor=0.0, - sample_frequency=resample_rate, - ) - - speech = mat.unsqueeze(0).to(device) - speech_lengths = torch.tensor([mat.size(0)]).to(device) - - # Let's assume batch_size = 1 - encoder_out, encoder_mask = model.encoder( - speech, speech_lengths, args.chunk_size, args.left_chunks) - - maxlen = encoder_out.size(1) # (B, maxlen, encoder_dim) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # CTC greedy search - if args.model_type == 'ctc': - ctc_probs = model.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - topk_prob = topk_prob.view(1, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen) - topk_prob = topk_prob.masked_fill_(mask, 0.0) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - hyps = [replace_duplicates_with_blank(hyp) for hyp in hyps] - scores = [prob.tolist() for prob in topk_prob] - timestamps[key] = [hyps[0], scores[0], wav] - - if args.model_type == 'transducer': - hyps = [] - scores = [] - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = 1 - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, - padding, cache) - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - scores.append(torch.max(joint_out_probs).item()) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or \ - per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - hyps.append(model.blank) - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - timestamps[key] = [hyps, scores, wav] - - # 3. 
Analyze latency - with open(args.alignment, 'r') as fin: - aligns = fin.readlines() - not_found, len_unequal, ignored = 0, 0, 0 - datas = [] - for align in aligns: - key, align = align.strip().split(' ', 1) - if key not in timestamps: - not_found += 1 - continue - fa, st = [], [] # force_alignment, streaming_timestamps - text_fa, text_st = "", "" - for i, token in enumerate(align.split()): - if token != '': - text_fa += token - # NOTE(xcsong): W/O subsample - fa.append(i * 10) - # ignore alignment_errors >= 70ms - frames_fa = len(align.split()) - frames_st = len(timestamps[key][0]) * subsampling - if abs(frames_st - frames_fa) >= 7: - ignored += 1 - continue - for i, token_id in enumerate(timestamps[key][0]): - if token_id != 0: - text_st += char_dict[token_id] - # NOTE(xcsong): W subsample - st.append(i * subsampling * 10) - if len(fa) != len(st): - len_unequal += 1 - continue - # datas[i] = [key, text_fa, text_st, list_of_diff, - # FirstTokenDelay, LastTokenDelay, AvgTokenDelay, - # streaming_timestamps, force_alignment] - datas.append([key, text_fa, text_st, - [a - b for a, b in zip(st, fa)], - st[0] - fa[0], st[-1] - fa[-1], - (sum(st) - sum(fa)) / len(st), - timestamps[key], align.split()]) - - logging.info("not found: {}, length unequal: {}, ignored: {}, \ - valid samples: {}".format(not_found, len_unequal, ignored, len(datas))) - - # 4. Plot and print - num_datas = len(datas) - names = ['FirstTokenDelay', 'LastTokenDelay', 'AvgTokenDelay'] - names_index = [4, 5, 6] - parts = ['max', 'P90', 'P75', 'P50', 'P25', 'min'] - parts_index = [num_datas - 1, int(num_datas * 0.90), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - for name, name_idx in zip(names, names_index): - def f(name_idx=name_idx): - return name_idx - datas.sort(key=lambda x: x[f()]) - logging.info("==========================") - for p, i in zip(parts, parts_index): - data = datas[i] - # i.e., LastTokenDelay P90: 270.000 ms (wav_id: BAC009S0902W0144) - logging.info("{} {}: {:.3f} ms (wav_id: {})".format( - name, p, data[f()], datas[i][0])) - - font = fm.FontProperties(fname=args.font) - plt.rcParams['axes.unicode_minus'] = False - # we will have 2 sub-plots (force-align + streaming timestamps) - # plus one wav-plot - fig, axes = plt.subplots(figsize=(60, 60), nrows=3, ncols=1) - for j in range(2): - if j == 0: - # subplot-0: streaming_timestamps - plt_prefix = args.tag + "_" + name + "_" + p - x = np.arange(len(data[7][0])) * subsampling - hyps, scores = data[7][0], data[7][1] - else: - # subplot-1: force_alignments - plt_prefix = "force_alignment" - x = np.arange(len(data[8])) - hyps = [symbol_table[d] for d in data[8]] - scores = [0.0] * len(data[8]) - axes[j].set_title(plt_prefix, fontsize=30) - for frame, token, prob in zip(x, hyps, scores): - if char_dict[token] != '': - axes[j].bar( - frame, np.exp(prob), - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].text( - frame, np.exp(prob), - '{} {:.3f} {}'.format( - char_dict[token], np.exp(prob), frame), - fontdict=dict(fontsize=24), - fontproperties=font, - ) - else: - axes[j].bar( - frame, 0.01, - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].tick_params(labelsize=25) - - # subplot-2: wav - # wav, hardcode sample_rate to 16000 - samples, sr = librosa.load(data[7][2], sr=16000) - time = np.arange(0, len(samples)) * (1.0 / sr) - axes[-1].plot(time, samples) - - # i.e., RESULT_DIR/LTD_P90_120ms_BAC009S0768W0342.pdf - plt.savefig(args.result_dir + "/" + name + "_" + - p + "_" + str(data[f()]) 
+ "ms" + "_" + data[0] + ".pdf") - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/make_raw_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/make_raw_list.py deleted file mode 100644 index 2f84f015542bb38da027b8ea61e8638f873cec33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/make_raw_list.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('output_file', help='output list file') - args = parser.parse_args() - - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - if args.segments is not None: - segments_table = {} - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - with open(args.text_file, 'r', encoding='utf8') as fin, \ - open(args.output_file, 'w', encoding='utf8') as fout: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if args.segments is None: - assert key in wav_table - wav = wav_table[key] - line = dict(key=key, wav=wav, txt=txt) - else: - assert key in segments_table - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - line = dict(key=key, wav=wav, txt=txt, start=start, end=end) - json_line = json.dumps(line, ensure_ascii=False) - fout.write(json_line + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/make_shard_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/make_shard_list.py deleted file mode 100644 index 1f7d82829808c9cc181bbc5e0f60cccef8795bae..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/make_shard_list.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import io -import logging -import os -import tarfile -import time -import multiprocessing - -import torch -import torchaudio -import torchaudio.backend.sox_io_backend as sox - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def write_tar_file(data_list, - no_segments, - tar_file, - resample=16000, - index=0, - total=1): - logging.info('Processing {} {}/{}'.format(tar_file, index, total)) - read_time = 0.0 - save_time = 0.0 - write_time = 0.0 - with tarfile.open(tar_file, "w") as tar: - prev_wav = None - for item in data_list: - if no_segments: - key, txt, wav = item - else: - key, txt, wav, start, end = item - - suffix = wav.split('.')[-1] - assert suffix in AUDIO_FORMAT_SETS - if no_segments: - ts = time.time() - with open(wav, 'rb') as fin: - data = fin.read() - read_time += (time.time() - ts) - else: - if wav != prev_wav: - ts = time.time() - waveforms, sample_rate = sox.load(wav, normalize=False) - read_time += (time.time() - ts) - prev_wav = wav - start = int(start * sample_rate) - end = int(end * sample_rate) - audio = waveforms[:1, start:end] - - # resample - if sample_rate != resample: - if not audio.is_floating_point(): - # normalize the audio before resample - # because resample can't process int audio - audio = audio / (1 << 15) - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - audio = (audio * (1 << 15)).short() - else: - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - - ts = time.time() - f = io.BytesIO() - sox.save(f, audio, resample, format="wav", bits_per_sample=16) - # Save to wav for segments file - suffix = "wav" - f.seek(0) - data = f.read() - save_time += (time.time() - ts) - - assert isinstance(txt, str) - ts = time.time() - txt_file = key + '.txt' - txt = txt.encode('utf8') - txt_data = io.BytesIO(txt) - txt_info = tarfile.TarInfo(txt_file) - txt_info.size = len(txt) - tar.addfile(txt_info, txt_data) - - wav_file = key + '.' 
+ suffix - wav_data = io.BytesIO(data) - wav_info = tarfile.TarInfo(wav_file) - wav_info.size = len(data) - tar.addfile(wav_info, wav_data) - write_time += (time.time() - ts) - logging.info('read {} save {} write {}'.format(read_time, save_time, - write_time)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--num_utts_per_shard', - type=int, - default=1000, - help='num utts per shard') - parser.add_argument('--num_threads', - type=int, - default=1, - help='num threads for make shards') - parser.add_argument('--prefix', - default='shards', - help='prefix of shards tar file') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('--resample', - type=int, - default=16000, - help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('shards_dir', help='output shards dir') - parser.add_argument('shards_list', help='output shards list file') - args = parser.parse_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - torch.set_num_threads(1) - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - no_segments = True - segments_table = {} - if args.segments is not None: - no_segments = False - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - data = [] - with open(args.text_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if no_segments: - assert key in wav_table - wav = wav_table[key] - data.append((key, txt, wav)) - else: - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - data.append((key, txt, wav, start, end)) - - num = args.num_utts_per_shard - chunks = [data[i:i + num] for i in range(0, len(data), num)] - os.makedirs(args.shards_dir, exist_ok=True) - - # Using thread pool to speedup - pool = multiprocessing.Pool(processes=args.num_threads) - shards_list = [] - tasks_list = [] - num_chunks = len(chunks) - for i, chunk in enumerate(chunks): - tar_file = os.path.join(args.shards_dir, - '{}_{:09d}.tar'.format(args.prefix, i)) - shards_list.append(tar_file) - pool.apply_async( - write_tar_file, - (chunk, no_segments, tar_file, args.resample, i, num_chunks)) - - pool.close() - pool.join() - - with open(args.shards_list, 'w', encoding='utf8') as fout: - for name in shards_list: - fout.write(name + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/merge_scp2txt.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/merge_scp2txt.py deleted file mode 100644 index 51f1c42f272f0fd9fec0a7d69ee860d2f1eb6158..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/merge_scp2txt.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -from distutils.util import strtobool -from io import open -import logging -import sys - -PY2 = sys.version_info[0] == 2 -sys.stdin = codecs.getreader('utf-8')(sys.stdin if PY2 else 
sys.stdin.buffer) -sys.stdout = codecs.getwriter('utf-8')( - sys.stdout if PY2 else sys.stdout.buffer) - - -# Special types: -def shape(x): - """Change str to List[int] - - >>> shape('3,5') - [3, 5] - >>> shape(' [3, 5] ') - [3, 5] - - """ - - # x: ' [3, 5] ' -> '3, 5' - x = x.strip() - if x[0] == '[': - x = x[1:] - if x[-1] == ']': - x = x[:-1] - - return list(map(int, x.split(','))) - - -def get_parser(): - parser = argparse.ArgumentParser( - description='Given each file paths with such format as ' - '::. type> can be omitted and the default ' - 'is "str". e.g. {} ' - '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' - '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' - '--output-scps text:data/text shape:data/utt2text_shape:shape ' - '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--input-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the inputs') - parser.add_argument('--output-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the outputs') - parser.add_argument('--scps', - type=str, - nargs='+', - default=[], - help='The files except for the input and outputs') - parser.add_argument('--verbose', - '-V', - default=1, - type=int, - help='Verbose option') - parser.add_argument('--allow-one-column', - type=strtobool, - default=False, - help='Allow one column in input scp files. ' - 'In this case, the value will be empty string.') - parser.add_argument('--out', - '-O', - type=str, - help='The output filename. ' - 'If omitted, then output to sys.stdout') - return parser - - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - args.scps = [args.scps] - - # logging info - logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - if args.verbose > 0: - logging.basicConfig(level=logging.INFO, format=logfmt) - else: - logging.basicConfig(level=logging.WARN, format=logfmt) - - inputs = {} - assert (len(args.input_scps) == 1) - for f in args.input_scps[0]: - arr = f.strip().split(':') - inputs[arr[0]] = arr[1] - assert ('feat' in inputs) - assert ('shape' in inputs) - - outputs = {} - assert (len(args.output_scps) == 1) - for f in args.output_scps[0]: - arr = f.strip().split(':') - outputs[arr[0]] = arr[1] - assert ('shape' in outputs) - assert ('text' in outputs) - assert ('token' in outputs) - assert ('tokenid' in outputs) - - files = [ - inputs['feat'], inputs['shape'], outputs['text'], outputs['token'], - outputs['tokenid'], outputs['shape'] - ] - fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape'] - fids = [open(f, 'r', encoding='utf-8') for f in files] - - if args.out is None: - out = sys.stdout - else: - out = open(args.out, 'w', encoding='utf-8') - done = False - while not done: - for i, fid in enumerate(fids): - line = fid.readline() - if line == '': - done = True - break - arr = line.strip().split() - content = ' '.join(arr[1:]) - if i == 0: - out.write('utt:{}'.format(arr[0])) - out.write('\t') - out.write('{}:{}'.format(fields[i], content)) - out.write('\n') - - for f in fids: - f.close() - if args.out is not None: - out.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/onnx2horizonbin.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/onnx2horizonbin.py deleted file mode 100644 index 
a94b647fb19d1446d4bc506c399c85677dddde9f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/onnx2horizonbin.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. specific decoding method: ctc_greedy_search -""" - -import argparse -import copy -import logging -import os -import sys -import random -import torch -import yaml -import numpy as np - -from torch.utils.data import DataLoader - -from wenet.utils.common import remove_duplicates_and_blank -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import to_numpy -from wenet.bin.export_onnx_bpu import export_encoder, export_ctc - - -try: - import hbdk # noqa: F401 - import horizon_nn # noqa: F401 - from horizon_tc_ui import HB_ONNXRuntime -except ImportError: - print('Please install hbdk,horizon_nn,horizon_tc_ui !') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def save_data(tensor, dirs, prefix): - if tensor.requires_grad: - data = tensor.detach().numpy().astype(np.float32) - else: - data = tensor.numpy().astype(np.float32) - os.makedirs(dirs, exist_ok=True) - data.tofile(dirs + "/" + prefix + ".bin") - - -def make_calibration_data(enc, args, conf): - conf['shuffle'] = True - logger.info(conf) - dataset = Dataset( - "shard", args.cali_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - cal_data_dir = os.path.join(args.output_dir, 
'cal_data_dir') - for batch_idx, batch in enumerate(dataloader): - if batch_idx >= args.max_samples: - break - if batch_idx % 100 == 0: - logger.info("processed {} samples.".format(batch_idx)) - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - - # Feed forward overlap input step by step - random_high = (num_frames - context) // stride - num_rand = random.randint(0, random_high) - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - if i == num_rand: - save_data(chunk, "{}/chunk".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_cache, "{}/att_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(cnn_cache, "{}/cnn_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_mask, "{}/att_mask".format(cal_data_dir), - prefix + "." + str(i)) - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - # NOTE(xcsong): It's fast to calibrate ctc.onnx, - # so it's okay to save all chunks - save_data(y, "{}/hidden".format(cal_data_dir), - prefix + "." 
+ str(i)) - - -def check_wer(enc, ctc, args, conf): - conf['shuffle'] = False - dataset = Dataset( - "shard", args.wer_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - char_dict = {v: k for k, v in args.symbol_table.items()} - eos = len(char_dict) - 1 - - enc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_encoder/encoder_quantized_model.onnx") - ctc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_ctc/ctc_quantized_model.onnx") - torch_file = open(args.output_dir + "/torch_text", 'w', encoding="utf-8") - onnx_file = open(args.output_dir + "/onnx_text", 'w', encoding="utf-8") - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - for batch_idx, batch in enumerate(dataloader): - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - - # Feed forward overlap input step by step - torch_out, onnx_out = [], [] - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - # Torch model - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - torch_out.append(ctc.forward(y).transpose(1, 3).squeeze(2)) - # Quantized onnx model - ort_inputs = { - 'chunk': to_numpy(chunk), 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': to_numpy(att_mask)} - ort_outs = enc_session.run_feature( - enc_session.output_names, ort_inputs, input_offset=0) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_y = ctc_session.run_feature( - ctc_session.output_names, {'hidden': ort_outs[0]}, input_offset=0) - onnx_out.append(torch.from_numpy( - np.squeeze(onnx_y[0].transpose(0, 3, 2, 1), axis=2))) - - def post_process(list_out, file_obj, keys): - probs = torch.cat(list_out, dim=1) - maxlen = probs.size(1) - topk_prob, topk_index = probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - hyps = 
[hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - for i, key in enumerate(keys): - content = '' - for w in hyps[i]: - if w == eos: - break - content += char_dict[w] - file_obj.write('{} {}\n'.format(key, content)) - return key, content - - if len(torch_out) > 0 and len(onnx_out) > 0: - key, content = post_process(torch_out, torch_file, keys) - logger.info('torch: {} {}'.format(key, content)) - key, content = post_process(onnx_out, onnx_file, keys) - logger.info('onnx : {} {}'.format(key, content)) - torch_file.close() - onnx_file.close() - - -def generate_config(enc_session, ctc_session, args): - template = """ -# 模型参数组 -model_parameters: - # 原始Onnx浮点模型文件 - onnx_model: '{}' - # 转换的目标AI芯片架构 - march: 'bernoulli2' - # 模型转换输出的用于上板执行的模型文件的名称前缀 - output_model_file_prefix: '{}' - # 模型转换输出的结果的存放目录 - working_dir: '{}' - # 指定转换后混合异构模型是否保留输出各层的中间结果的能力 - layer_out_dump: False - # 转换过程中日志生成级别 - log_level: 'debug' -# 输入信息参数组 -input_parameters: - # 原始浮点模型的输入节点名称 - input_name: '{}' - # 原始浮点模型的输入数据格式(数量/顺序与input_name一致) - input_type_train: '{}' - # 原始浮点模型的输入数据排布(数量/顺序与input_name一致) - input_layout_train: '{}' - # 原始浮点模型的输入数据尺寸 - input_shape: '{}' - # 网络实际执行时,输入给网络的batch_size 默认值为1 - # input_batch: 1 - # 在模型中添加的输入数据预处理方法 - norm_type: '{}' - # 预处理方法的图像减去的均值; 如果是通道均值,value之间必须用空格分隔 - # mean_value: '' - # 预处理方法的图像缩放比例,如果是通道缩放比例,value之间必须用空格分隔 - # scale_value: '' - # 转换后混合异构模型需要适配的输入数据格式(数量/顺序与input_name一致) - input_type_rt: '{}' - # 输入数据格式的特殊制式 - input_space_and_range: '' - # 转换后混合异构模型需要适配的输入数据排布(数量/顺序与input_name一致) - input_layout_rt: '{}' -# 校准参数组 -calibration_parameters: - # 模型校准使用的标定样本的存放目录 - cal_data_dir: '{}' - # 开启图片校准样本自动处理(skimage read resize到输入节点尺寸) - preprocess_on: False - # 校准使用的算法类型 - calibration_type: '{}' - # max 校准方式的参数 - max_percentile: 1.0 - # 强制指定OP在CPU上运行 - run_on_cpu: '{}' - # 强制指定OP在BPU上运行 - run_on_bpu: '{}' -# 编译参数组 -compiler_parameters: - # 编译策略选择 - compile_mode: 'latency' - # 是否打开编译的debug信息 - debug: False - # 模型运行核心数 - core_num: 1 - # 模型编译的优化等级选择 - optimize_level: 'O3' -""" - output_dir = os.path.realpath(args.output_dir) - cal_data_dir = os.path.join(output_dir, 'cal_data_dir') - os.makedirs(cal_data_dir, exist_ok=True) - enc_dic = enc_session.get_modelmeta().custom_metadata_map - enc_onnx_path = os.path.join(output_dir, 'encoder.onnx') - enc_log_path = os.path.join(output_dir, 'hb_makertbin_output_encoder') - enc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in enc_dic['input_name'].split(';')]) - ctc_dic = ctc_session.get_modelmeta().custom_metadata_map - ctc_onnx_path = os.path.join(output_dir, 'ctc.onnx') - ctc_log_path = os.path.join(output_dir, 'hb_makertbin_output_ctc') - ctc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in ctc_dic['input_name'].split(';')]) - enc_config = template.format( - enc_onnx_path, "encoder", enc_log_path, - enc_dic['input_name'], enc_dic['input_type'], - enc_dic['input_layout_train'], enc_dic['input_shape'], - enc_dic['norm_type'], enc_dic['input_type'], enc_dic['input_layout_rt'], - enc_cal_data, args.calibration_type, args.extra_ops_run_on_cpu, "") - ctc_config = template.format( - ctc_onnx_path, "ctc", ctc_log_path, - ctc_dic['input_name'], ctc_dic['input_type'], - ctc_dic['input_layout_train'], ctc_dic['input_shape'], - ctc_dic['norm_type'], ctc_dic['input_type'], ctc_dic['input_layout_rt'], - ctc_cal_data, "default", "", "") - with open(output_dir + "/config_encoder.yaml", "w") as enc_yaml: - enc_yaml.write(enc_config) - with open(output_dir + 
"/config_ctc.yaml", "w") as ctc_yaml: - ctc_yaml.write(ctc_config) - - -def get_args(): - parser = argparse.ArgumentParser(description='convert onnx to horizon .bin') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - parser.add_argument('--dict', type=str, required=True, help='dict file') - parser.add_argument('--max_samples', type=int, required=True, - help='maximum samples') - parser.add_argument('--cali_datalist', type=str, default=None, - help='make calibration data') - parser.add_argument('--wer_datalist', type=str, default=None, - help='check wer') - parser.add_argument('--wer_text', type=str, default=None, - help='check wer') - parser.add_argument('--bpe_model', default=None, type=str, - help='bpe model for english part') - parser.add_argument('--ln_run_on_bpu', action='store_true', - help='layernorm running on bpu') - parser.add_argument('--extra_ops_run_on_cpu', type=str, default=None, - help='extra operations running on cpu.') - parser.add_argument('--calibration_type', type=str, default='default', - help='kl / max / default.') - return parser - - -if __name__ == '__main__': - random.seed(777) - parser = get_args() - args = parser.parse_args() - # NOTE(xcsong): X3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(conf) - load_checkpoint(model, args.checkpoint) - model.eval() - - symbol_table = read_symbol_table(args.dict) - args.symbol_table = symbol_table - args.feature_size = conf['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - logger.info("Stage-1: Export onnx") - enc, enc_session = export_encoder(model, args) - ctc, ctc_session = export_ctc(model, args) - - conf = copy.deepcopy(conf['dataset_conf']) - conf['filter_conf']['max_length'] = 102400 - conf['filter_conf']['min_length'] = 0 - conf['filter_conf']['token_max_length'] = 102400 - conf['filter_conf']['token_min_length'] = 0 - conf['filter_conf']['max_output_input_ratio'] = 102400 - conf['filter_conf']['min_output_input_ratio'] = 0 - conf['speed_perturb'] = False - conf['spec_aug'] = False - conf['spec_sub'] = False - conf['spec_trim'] = False - conf['shuffle'] = False - conf['sort'] = False - if 'fbank_conf' in conf: - conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in conf: - conf['mfcc_conf']['dither'] = 0.0 - conf['batch_conf']['batch_type'] = "static" - conf['batch_conf']['batch_size'] = 1 - - if args.cali_datalist is not None: - logger.info("Stage-2: Generate config") - # FIXME(xcsong): Remove hard code - logger.info("torch version: {}".format(torch.__version__)) - if int(torch.__version__[:4].split('.')[1]) >= 13: - args.extra_ops_run_on_cpu = "/Split;" + \ - "/encoders.0/self_attn/Split;/encoders.1/self_attn/Split;" + \ - 
"/encoders.2/self_attn/Split;/encoders.3/self_attn/Split;" + \ - "/encoders.4/self_attn/Split;/encoders.5/self_attn/Split;" + \ - "/encoders.6/self_attn/Split;/encoders.7/self_attn/Split;" + \ - "/encoders.8/self_attn/Split;/encoders.9/self_attn/Split;" + \ - "/encoders.10/self_attn/Split;/encoders.11/self_attn/Split;" + \ - "/encoders.0/self_attn/Mul;/encoders.1/self_attn/Mul;" + \ - "/encoders.2/self_attn/Mul;/encoders.3/self_attn/Mul;" + \ - "/encoders.4/self_attn/Mul;/encoders.5/self_attn/Mul;" + \ - "/encoders.6/self_attn/Mul;/encoders.7/self_attn/Mul;" + \ - "/encoders.8/self_attn/Mul;/encoders.9/self_attn/Mul;" + \ - "/encoders.10/self_attn/Mul;/encoders.11/self_attn/Mul;" - else: - args.extra_ops_run_on_cpu = "Split_17;Split_67;Split_209;" + \ - "Split_351;Split_493;Split_635;Split_777;Split_919;Split_1061;" + \ - "Split_1203;Split_1345;Split_1487;Split_1629;" + \ - "Mul_72;Mul_214;Mul_356;Mul_498;Mul_640;Mul_782;" + \ - "Mul_924;Mul_1066;Mul_1208;Mul_1350;Mul_1492;Mul_1634;" - generate_config(enc_session, ctc_session, args) - - logger.info("Stage-3: Make calibration data") - make_calibration_data(enc, args, conf) - - output_dir = os.path.realpath(args.output_dir) - logger.info("Stage-4: Make ctc.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_ctc".format(output_dir) + - " && cd hb_makertbin_log_ctc &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_ctc.yaml") - ) - logger.info("Stage-5: Make encoder.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_encoder ".format(output_dir) + - " && cd hb_makertbin_log_encoder &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_encoder.yaml") - ) - - if args.wer_datalist is not None: - logger.info("Stage-6: Check wer between torch model and quantized onnx") - assert args.wer_text is not None - check_wer(enc, ctc, args, conf) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/torch_text", - args.output_dir + "/torch_wer") - ) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/onnx_text", - args.output_dir + "/onnx_wer") - ) - os.system("tail {} {}".format( - args.output_dir + "/torch_wer", args.output_dir + "/onnx_wer")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/parse_options.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/parse_options.sh deleted file mode 100644 index 34476fdb37a4b14d5fe6e0edbebe97e760d2be5a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- - -# Parse command-line options. -# To be sourced by another script (as in ". parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/perturb_data_dir_speed.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/perturb_data_dir_speed.sh deleted file mode 100644 index 901a4882e6481ae269067b0fe7175dba62c4db9e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/perturb_data_dir_speed.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# 2020 @kamo-naoyuki -# This file was copied from Kaldi and -# I deleted parts related to wav duration -# because we shouldn't use kaldi's command here -# and we don't need the files actually. 
- -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 2014 Tom Ko -# 2018 Emotech LTD (author: Pawel Swietojanski) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# wav.scp -# spk2utt -# utt2spk -# text -# -# It generates the files which are used for perturbing the speed of the original data. - -export LC_ALL=C -set -euo pipefail - -if [[ $# != 3 ]]; then - echo "Usage: perturb_data_dir_speed.sh " - echo "e.g.:" - echo " $0 0.9 data/train_si284 data/train_si284p" - exit 1 -fi - -factor=$1 -srcdir=$2 -destdir=$3 -label="sp" -spk_prefix="${label}${factor}-" -utt_prefix="${label}${factor}-" - -#check is sox on the path - -! command -v sox &>/dev/null && echo "sox: command not found" && exit 1; - -if [[ ! -f ${srcdir}/utt2spk ]]; then - echo "$0: no such file ${srcdir}/utt2spk" - exit 1; -fi - -if [[ ${destdir} == "${srcdir}" ]]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -mkdir -p "${destdir}" - -<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map" -<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map" -<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map" -if [[ ! -f ${srcdir}/utt2uniq ]]; then - <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq" -else - <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq" -fi - - -<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \ - utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk - -utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt - -if [[ -f ${srcdir}/segments ]]; then - - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \ - utils/apply_map.pl -f 2 "${destdir}"/reco_map | \ - awk -v factor="${factor}" \ - '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \ - >"${destdir}"/segments - - utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - if [[ -f ${srcdir}/reco2file_and_channel ]]; then - utils/apply_map.pl -f 1 "${destdir}"/reco_map \ - <"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel - fi - -else # no segments->wav indexed by utterance. 
- if [[ -f ${srcdir}/wav.scp ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - fi -fi - -if [[ -f ${srcdir}/text ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text -fi -if [[ -f ${srcdir}/spk2gender ]]; then - utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender -fi -if [[ -f ${srcdir}/utt2lang ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang -fi - -rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null -echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}" - -utils/validate_data_dir.sh --no-feats --no-text "${destdir}" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/reduce_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/reduce_data_dir.sh deleted file mode 100644 index 16194dcc7309a646041181a698c53cd4f46e618b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/reduce_data_dir.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# koried, 10/29/2012 - -# Reduce a data set based on a list of turn-ids - -help_message="usage: $0 srcdir turnlist destdir" - -if [ $1 == "--help" ]; then - echo "${help_message}" - exit 0; -fi - -if [ $# != 3 ]; then - echo "${help_message}" - exit 1; -fi - -srcdir=$1 -reclist=$2 -destdir=$3 - -if [ ! -f ${srcdir}/utt2spk ]; then -echo "$0: no such file $srcdir/utt2spk" -exit 1; -fi - -function do_filtering { -# assumes the utt2spk and spk2utt files already exist. - [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp - [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text - [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames - [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender - [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp - if [ -f ${srcdir}/segments ]; then - utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments - awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. 
- [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/reco2file_and_channel ] && \ - utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel - - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm - rm ${destdir}/reco - fi - srcutts=$(wc -l < ${srcdir}/utt2spk) - destutts=$(wc -l < ${destdir}/utt2spk) - echo "Reduced #utt from $srcutts to $destutts" -} - -mkdir -p ${destdir} - -# filter the utt2spk based on the set of recordings -utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk - -utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt -do_filtering; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/remove_longshortdata.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/remove_longshortdata.py deleted file mode 100644 index 7e92f8a424d2d717acf6fc1db5503f79ba38a898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/remove_longshortdata.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='remove too long or too short data in format.data') - parser.add_argument('--data_file', - type=str, - help='input format data') - parser.add_argument('--output_data_file', - type=str, - help='output format data') - parser.add_argument('--min_input_len', type=float, - default=0, - help='minimum input seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--max_input_len', type=float, - default=20, - help='maximum output seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--min_output_len', type=float, - default=0, help='minimum input seq length, in modeling units') - parser.add_argument('--max_output_len', type=float, - default=500, - help='maximum output seq length, in modeling units') - parser.add_argument('--min_output_input_ratio', type=float, default=0.05, - help='minimum output seq length/output seq length ratio') - parser.add_argument('--max_output_input_ratio', type=float, default=10, - help='maximum output seq length/output seq length ratio') - args = parser.parse_args() - - data_file = args.data_file - output_data_file = args.output_data_file - min_input_len = args.min_input_len - max_input_len = args.max_input_len - min_output_len = args.min_output_len - max_output_len = args.max_output_len - min_output_input_ratio = args.min_output_input_ratio - max_output_input_ratio = args.max_output_input_ratio - - with open(data_file, 'r') as f, open(output_data_file, 'w') as fout: - for l in f: - l = l.strip() - if l: - items = l.strip().split('\t') - token_shape = items[6] - feature_shape = items[2] - feat_len = float(feature_shape.split(':')[1].split(',')[0]) - token_len = float(token_shape.split(':')[1].split(',')[0]) - condition = [feat_len > min_input_len, - feat_len < max_input_len, - token_len > min_output_len, - token_len < max_output_len, - token_len / feat_len > min_output_input_ratio, - token_len / feat_len < max_output_input_ratio, - ] - if all(condition): - fout.write('{}\n'.format(l)) - continue diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/segment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/segment.py deleted file mode 100644 index a1a7f93a05fbaf42ca09c26c0e5be6a7185f0d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/segment.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2021 Mobvoi Inc. (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='generate segmented wav.scp') - parser.add_argument('--segments', required=True, help='segments file') - parser.add_argument('--input', - required=True, - help='origin wav.scp that not segmented') - parser.add_argument('--output', - required=True, - help='output segmented wav.scp') - wav_dic = {} - args = parser.parse_args() - ori_wav = args.input - segment_file = args.segments - wav_scp = args.output - with open(ori_wav, 'r') as ori: - for l in ori: - item = l.strip().split() - wav_dic[item[0]] = item[1] - with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement: - for l in sgement: - item = l.strip().split() - if item[1] in wav_dic: - item[1] = wav_dic[item[1]] - f.write("{} {},{},{}\n".format(item[0], item[1], item[2], item[3])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/setup_anaconda.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/setup_anaconda.sh deleted file mode 100644 index f53ace9cc4c19994fc79d01e85d70f49d40d673f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/setup_anaconda.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env bash -# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet) -set -euo pipefail - -if [ -z "${PS1:-}" ]; then - PS1=__dummy__ -fi -CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - -if [ $# -gt 4 ]; then - echo "Usage: $0 [output] [conda-env-name] [python-version>]" - exit 1; -elif [ $# -eq 3 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="$3" -elif [ $# -eq 2 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="" -elif [ $# -eq 1 ]; then - output_dir="$1" - name="" - PYTHON_VERSION="" -elif [ $# -eq 0 ]; then - output_dir=venv - name="" - PYTHON_VERSION="" -fi - -if [ -e activate_python.sh ]; then - echo "Warning: activate_python.sh already exists. It will be overwritten" -fi - -if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then - if [ ! -e miniconda.sh ]; then - wget --tries=3 "${CONDA_URL}" -O miniconda.sh - fi - - bash miniconda.sh -b -p "${output_dir}" -fi - -# shellcheck disable=SC1090 -source "${output_dir}/etc/profile.d/conda.sh" -conda deactivate - -# If the env already exists, skip recreation -if [ -n "${name}" ] && ! 
conda activate ${name}; then - conda create -yn "${name}" -fi -conda activate ${name} - -if [ -n "${PYTHON_VERSION}" ]; then - conda install -y conda "python=${PYTHON_VERSION}" -else - conda install -y conda -fi - -conda install -y pip setuptools - -cat << EOF > activate_python.sh -#!/usr/bin/env bash -# THIS FILE IS GENERATED BY tools/setup_anaconda.sh -if [ -z "\${PS1:-}" ]; then - PS1=__dummy__ -fi -. $(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name} -EOF diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/sph2wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/sph2wav.sh deleted file mode 100644 index a8f0749e3be2ee69b5831da6699c303510ecbed4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/sph2wav.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -# convert sph scp to segmented wav scp -nj=1 -. tools/parse_options.sh || exit 1; - -inscp=$1 -segments=$2 -outscp=$3 -data=$(dirname ${inscp}) -if [ $# -eq 4 ]; then - logdir=$4 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe -[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; -sox=`which sox` -[ ! 
-x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2); - printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \ - sort > $data/wav_ori.scp || exit 1; - -tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp -sed -i 's/ /,/g' $data/wav_segments.scp -sed -i 's/#/ /g' $data/wav_segments.scp - -rm -f $logdir/wav_*.slice -rm -f $logdir/*.log -split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - mkdir -p ${data}/wavs/${name} - cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \ - -v logdir=$logdir -v name=$name '{ - during=$4-$3 - cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during; - system(cmd) - printf("%s %s/%s.wav\n", $1, data, $1); - }' | \ - sort > ${data}/wavs_${name}.scp || exit 1; -} & -done -wait -cat ${data}/wavs_*.scp > $outscp -rm ${data}/wavs_*.scp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/spk2utt_to_utt2spk.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/spk2utt_to_utt2spk.pl deleted file mode 100644 index 19fb89d501146e360912863d847d6eabb0194511..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/spm_decode b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/spm_decode deleted file mode 100644 index 882b4f966013d7708460f8d41696583ae59f8fa9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/spm_decode +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for decoding") - parser.add_argument("--input", default=None, help="input file to decode") - parser.add_argument("--input_format", choices=["piece", "id"], default="piece") - args = parser.parse_args() - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.input_format == "piece": - def decode(l): - return "".join(sp.DecodePieces(l)) - elif args.input_format == "id": - def decode(l): - return "".join(sp.DecodeIds(l)) - else: - raise NotImplementedError - - def tok2int(tok): - # remap reference-side (represented as <>) to 0 - return int(tok) if tok != "<>" else 0 - - if args.input is None: - h = sys.stdin - else: - h = open(args.input, "r", encoding="utf-8") - for line in h: - print(decode(line.split())) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/spm_encode b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/spm_encode deleted file mode 100644 index 4dd2e1004f9fe393c2d34b43bade881b84a31b1f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/spm_encode +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import contextlib -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for encoding") - parser.add_argument("--inputs", nargs="+", default=['-'], - help="input files to filter/encode") - parser.add_argument("--outputs", nargs="+", default=['-'], - help="path to save encoded outputs") - parser.add_argument("--output_format", choices=["piece", "id"], default="piece") - parser.add_argument("--min-len", type=int, metavar="N", - help="filter sentence pairs with fewer than N tokens") - parser.add_argument("--max-len", type=int, metavar="N", - help="filter sentence pairs with more than N tokens") - args = parser.parse_args() - - assert len(args.inputs) == len(args.outputs), \ - "number of input and output paths should match" - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.output_format == "piece": - def encode(l): - return sp.EncodeAsPieces(l) - elif args.output_format == "id": - def encode(l): - return list(map(str, sp.EncodeAsIds(l))) - else: - raise NotImplementedError - - if args.min_len is not None or args.max_len is not None: - def valid(line): - return ( - (args.min_len is None or len(line) >= args.min_len) and - (args.max_len is None or len(line) <= args.max_len) - ) - else: - def valid(lines): - return True - - with contextlib.ExitStack() as stack: - inputs = [ - stack.enter_context(open(input, "r", encoding="utf-8")) - if input != "-" else sys.stdin - for input in args.inputs - ] - outputs = [ - stack.enter_context(open(output, "w", 
encoding="utf-8")) - if output != "-" else sys.stdout - for output in args.outputs - ] - - stats = { - "num_empty": 0, - "num_filtered": 0, - } - - def encode_line(line): - line = line.strip() - if len(line) > 0: - line = encode(line) - if valid(line): - return line - else: - stats["num_filtered"] += 1 - else: - stats["num_empty"] += 1 - return None - - for i, lines in enumerate(zip(*inputs), start=1): - enc_lines = list(map(encode_line, lines)) - if not any(enc_line is None for enc_line in enc_lines): - for enc_line, output_h in zip(enc_lines, outputs): - print(" ".join(enc_line), file=output_h) - if i % 10000 == 0: - print("processed {} lines".format(i), file=sys.stderr) - - print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) - print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/spm_train b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/spm_train deleted file mode 100644 index 0b247aee0dc5fcaa7b6cf66d89602e896619c9bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/spm_train +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE -import sys - -import sentencepiece as spm - - -if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/subset_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/subset_data_dir.sh deleted file mode 100644 index c35bee62d8710facb8c42a9171ed3caf0171450f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/subset_data_dir.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2010-2011 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - - -# This script operates on a data directory, such as in data/train/. -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data -# for what these directories contain. - -# This script creates a subset of that data, consisting of some specified -# number of utterances. (The selected utterances are distributed evenly -# throughout the file, by the program ./subset_scp.pl). - -# There are six options, none compatible with any other. - -# If you give the --per-spk option, it will attempt to select the supplied -# number of utterances for each speaker (typically you would supply a much -# smaller number in this case). - -# If you give the --speakers option, it selects a subset of n randomly -# selected speakers. - -# If you give the --shortest option, it will give you the n shortest utterances. - -# If you give the --first option, it will just give you the n first utterances. - -# If you give the --last option, it will just give you the n last utterances. - -# If you give the --spk-list or --utt-list option, it reads the -# speakers/utterances to keep from /" (note, -# in this case there is no positional parameter; see usage message.) 
- - -shortest=false -perspk=false -speakers=false -first_opt= -spk_list= -utt_list= - -expect_args=3 -case $1 in - --first|--last) first_opt=$1; shift ;; - --per-spk) perspk=true; shift ;; - --shortest) shortest=true; shift ;; - --speakers) speakers=true; shift ;; - --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; - --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; - --*) echo "$0: invalid option '$1'"; exit 1 -esac - -if [ $# != $expect_args ]; then - echo "Usage:" - echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] " - echo " subset_data_dir.sh [--spk-list ] " - echo " subset_data_dir.sh [--utt-list ] " - echo "By default, randomly selects utterances from the data directory." - echo "With --speakers, randomly selects enough speakers that we have utterances" - echo "With --per-spk, selects utterances per speaker, if available." - echo "With --first, selects the first utterances" - echo "With --last, selects the last utterances" - echo "With --shortest, selects the shortest utterances." - echo "With --spk-list, reads the speakers to keep from " - echo "With --utt-list, reads the utterances to keep from " - exit 1; -fi - -srcdir=$1 -if [[ $spk_list || $utt_list ]]; then - numutt= - destdir=$2 -else - numutt=$2 - destdir=$3 -fi - -export LC_ALL=C - -if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" - exit 1 -fi - -if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then - echo "$0: cannot subset to more utterances than you originally had." - exit 1 -fi - -if $shortest && [ ! -f $srcdir/feats.scp ]; then - echo "$0: you selected --shortest but no feats.scp exist." - exit 1 -fi - -mkdir -p $destdir || exit 1 - -if [[ $spk_list ]]; then - tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; -elif [[ $utt_list ]]; then - tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; -elif $speakers; then - tools/shuffle_list.pl < $srcdir/spk2utt | - awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | - sort > $destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -elif $perspk; then - awk '{ n='$numutt'; printf("%s ",$1); - skip=1; while(n*(skip+1) <= NF-1) { skip++; } - for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); } - printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -else - if $shortest; then - # Select $numutt shortest utterances. - . ./path.sh - feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; - sort -n -k2 $destdir/tmp.len | - awk '{print $1}' | - head -$numutt >$destdir/tmp.uttlist - tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk - rm $destdir/tmp.uttlist $destdir/tmp.len - else - # Select $numutt random utterances. - tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; - fi - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt -fi - -# Perform filtering. utt2spk and spk2utt files already exist by this point. -# Filter by utterance. 
-[ -f $srcdir/feats.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp -[ -f $srcdir/vad.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp -[ -f $srcdir/utt2lang ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang -[ -f $srcdir/utt2dur ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur -[ -f $srcdir/utt2num_frames ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames -[ -f $srcdir/utt2uniq ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq -[ -f $srcdir/wav.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp -[ -f $srcdir/utt2warp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp -[ -f $srcdir/text ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - -# Filter by speaker. -[ -f $srcdir/spk2warp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp -[ -f $srcdir/spk2gender ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender -[ -f $srcdir/cmvn.scp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - -# Filter by recording-id. -if [ -f $srcdir/segments ]; then - tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - # Recording-ids are in segments. - awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco - # The next line overrides the command above for wav.scp, which would be incorrect. - #[ -f $srcdir/wav.scp ] && - # tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp -else - # No segments; recording-ids are in wav.scp. - awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco -fi - -[ -f $srcdir/reco2file_and_channel ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel -[ -f $srcdir/reco2dur ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur - -# Filter the STM file for proper sclite scoring. -# Copy over the comments from STM file. -[ -f $srcdir/stm ] && - (grep "^;;" $srcdir/stm - tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm - -rm $destdir/reco - -# Copy frame_shift if present. -[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir - -srcutts=$(wc -l <$srcdir/utt2spk) -destutts=$(wc -l <$destdir/utt2spk) -echo "$0: reducing #utt from $srcutts to $destutts" -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/subset_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/subset_scp.pl deleted file mode 100644 index 11fddc09a0f4e5fad8e5d63cf65e7e5e627e4af6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/subset_scp.pl +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This program selects a subset of N elements in the scp. - -# By default, it selects them evenly from throughout the scp, in order to avoid -# selecting too many from the same speaker. It prints them on the standard -# output. -# With the option --first, it just selects the N first utterances. -# With the option --last, it just selects the N last utterances. - -# Last modified by JHU & HKUST @2013 - - -$quiet = 0; -$first = 0; -$last = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--quiet") { - shift; - $quiet = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--first") { - shift; - $first = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--last") { - shift; - $last = 1; -} - -if(@ARGV < 2 ) { - die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . - " --quiet causes it to not die if N < num lines in scp.\n" . - " --first and --last make it equivalent to head or tail.\n" . - "See also: filter_scp.pl\n"; -} - -$N = shift @ARGV; -if($N == 0) { - die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; -} -$inscp = shift @ARGV; -open(I, "<$inscp") || die "Opening input scp file $inscp"; - -@F = (); -while() { - push @F, $_; -} -$numlines = @F; -if($N > $numlines) { - if ($quiet) { - $N = $numlines; - } else { - die "You requested from subset_scp.pl more elements than available: $N > $numlines"; - } -} - -sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if ($num_needed > $diff) { - die "select_n: code error"; - } - if ($diff == 1 ) { - if ($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); - } -} - -if ( ! $first && ! $last) { - if ($N > 0) { - select_n(0, $numlines, $N); - } -} else { - if ($first) { # --first option: same as head. - for ($n = 0; $n < $N; $n++) { - print $F[$n]; - } - } else { # --last option: same as tail. - for ($n = @F - $N; $n < @F; $n++) { - print $F[$n]; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/sym2int.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/sym2int.pl deleted file mode 100644 index cec097b6bdaefb5c3452e31fa334f0a7530b9a72..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/sym2int.pl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; - -for($x = 0; $x < 2; $x++) { - if ($ARGV[0] eq "--map-oov") { - shift @ARGV; - $map_oov = shift @ARGV; - if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { - # disallow '-f', the empty string and anything ending in words.txt as the - # OOV symbol because these are likely command-line errors. - die "the --map-oov option requires an argument"; - } - } - if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } - } -} - -$symtab = shift @ARGV; -if (!defined $symtab) { - print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . - "options: [--map-oov ] [-f ]\n" . - "note: can look like 4-5, or 4-, or 5-, or 1.\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up - if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } - $map_oov = $sym2int{$map_oov}; -} - -$num_warning = 0; -$max_warning = 20; - -while (<>) { - @A = split(" ", $_); - @B = (); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ( (!defined $field_begin || $n >= $field_begin) - && (!defined $field_end || $n <= $field_end)) { - $i = $sym2int{$a}; - if (!defined ($i)) { - if (defined $map_oov) { - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $map_oov; - } else { - $pos = $n+1; - die "sym2int.pl: undefined symbol $a (in position $pos)\n"; - } - } - $a = $i; - } - push @B, $a; - } - print join(" ", @B); - print "\n"; -} -if ($num_warning > 0) { - print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; -} - -exit(0); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/text2token.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/text2token.py deleted file mode 100644 index 4f4dcc901d436650695f0b80e0cf99e1e99269ee..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/text2token.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan) -# Copyright 2021 Mobvoi Inc. All Rights Reserved. 
(Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -import re -import sys - -is_python2 = sys.version_info[0] == 2 - - -def exist_or_not(i, match_pos): - start_pos = None - end_pos = None - for pos in match_pos: - if pos[0] <= i < pos[1]: - start_pos = pos[0] - end_pos = pos[1] - break - - return start_pos, end_pos - -def seg_char(sent): - pattern = re.compile(r'([\u4e00-\u9fa5])') - chars = pattern.split(sent) - chars = [w for w in chars if len(w.strip()) > 0] - return chars - -def get_parser(): - parser = argparse.ArgumentParser( - description='convert raw text to tokenized text', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--nchar', - '-n', - default=1, - type=int, - help='number of characters to split, i.e., \ - aabb -> a a b b with -n 1 and aa bb with -n 2') - parser.add_argument('--skip-ncols', - '-s', - default=0, - type=int, - help='skip first n columns') - parser.add_argument('--space', - default='', - type=str, - help='space symbol') - parser.add_argument('--bpe-model', - '-m', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--non-lang-syms', - '-l', - default=None, - type=str, - help='list of non-linguistic symobles,' - ' e.g., etc.') - parser.add_argument('text', - type=str, - default=False, - nargs='?', - help='input text') - parser.add_argument('--trans_type', - '-t', - type=str, - default="char", - choices=["char", "phn", "cn_char_en_bpe"], - help="""Transcript type. char/phn. e.g., for TIMIT - FADG0_SI1279 - - If trans_type is char, read from - SI1279.WRD file -> "bricks are an alternative" - Else if trans_type is phn, - read from SI1279.PHN file -> - "sil b r ih sil k s aa r er n aa l - sil t er n ih sil t ih v sil" """) - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - rs = [] - if args.non_lang_syms is not None: - with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f: - nls = [x.rstrip() for x in f.readlines()] - rs = [re.compile(re.escape(x)) for x in nls] - - if args.bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(args.bpe_model) - - if args.text: - f = codecs.open(args.text, encoding="utf-8") - else: - f = codecs.getreader("utf-8")( - sys.stdin if is_python2 else sys.stdin.buffer) - - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer) - line = f.readline() - n = args.nchar - while line: - x = line.split() - print(' '.join(x[:args.skip_ncols]), end=" ") - a = ' '.join(x[args.skip_ncols:]) - - # get all matched positions - match_pos = [] - for r in rs: - i = 0 - while i >= 0: - m = r.search(a, i) - if m: - match_pos.append([m.start(), m.end()]) - i = m.end() - else: - break - - if len(match_pos) > 0: - chars = [] - i = 0 - while i < len(a): - start_pos, end_pos = exist_or_not(i, match_pos) - if start_pos is not None: - chars.append(a[start_pos:end_pos]) - i = end_pos - else: - chars.append(a[i]) - i += 1 - a = chars - - if (args.trans_type == "phn"): - a = a.split(" ") - elif args.trans_type == "cn_char_en_bpe": - b = seg_char(a) - a = [] - for j in b: - # we use "▁" to instead of blanks among english words - # warning: here is "▁", not "_" - for l in j.strip().split("▁"): - if not l.encode('UTF-8').isalpha(): - a.append(l) - else: - for k in sp.encode_as_pieces(l): - a.append(k) - else: - a = [a[j:j + n] for j in range(0, 
len(a), n)] - - a_flat = [] - for z in a: - a_flat.append("".join(z)) - - a_chars = [z.replace(' ', args.space) for z in a_flat] - if (args.trans_type == "phn"): - a_chars = [z.replace("sil", args.space) for z in a_chars] - print(' '.join(a_chars)) - line = f.readline() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/utt2spk_to_spk2utt.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/utt2spk_to_spk2utt.pl deleted file mode 100644 index 5086699ff85fdcb8667bb9ab054700c53e35fd0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. - -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/validate_data_dir.sh deleted file mode 100644 index f4b4cbe1410111555d56380078e3d55381e7155a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/validate_data_dir.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/bin/bash - -cmd="$@" - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." - echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! 
-d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -if [ -f $data/images.scp ]; then - cmd=${cmd/--no-wav/} # remove --no-wav if supplied - image/validate_data_dir.sh $cmd - exit $? -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1; - ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - tools/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." 
- exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! 
cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! 
cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/validate_dict_dir.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/validate_dict_dir.pl deleted file mode 100644 index 819fca7f03caff91f3f24f0b69876a0bfc0abbe9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/validate_dict_dir.pl +++ /dev/null @@ -1,531 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. -# Copyright 2012 Guoguo Chen -# 2015 Daniel Povey -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for 'dict' directories (e.g. data/local/dict) - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line($.) 
$raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n"; - if ($has_invalid_whitespaces) { - print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n"; - return 0; - } else { - print "--> text contains only allowed whitespaces\n"; - } - } else { - print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n"; - } - return 1; -} - - -if(@ARGV != 1) { - die "Usage: validate_dict_dir.pl \n" . - "e.g.: validate_dict_dir.pl data/local/dict\n"; -} - -$dict = shift @ARGV; -$dict =~ s:/$::; - -$exit = 0; -$success = 1; # this is re-set each time we read a file. - -sub set_to_fail { $exit = 1; $success = 0; } - -# Checking silence_phones.txt ------------------------------- -print "Checking $dict/silence_phones.txt ...\n"; -if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} -$idx = 1; -%silence = (); -$crlf = 1; - -print "--> reading $dict/silence_phones.txt\n"; -check_allowed_whitespace(\*S) || set_to_fail(); -while() { - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; - } - foreach(0 .. 
@col-1) { - my $p = $col[$_]; - if($silence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; - } else { - $silence{$p} = 1; - } - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. - if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(S); -$success == 0 || print "--> $dict/silence_phones.txt is OK\n"; -print "\n"; - -# Checking optional_silence.txt ------------------------------- -print "Checking $dict/optional_silence.txt ...\n"; -if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;} -if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;} -$idx = 1; -$success = 1; -$crlf = 1; -print "--> reading $dict/optional_silence.txt\n"; -check_allowed_whitespace(\*OS) or exit 1; -while() { - chomp; - my @col = split(" ", $_); - if ($idx > 1 or @col > 1) { - set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; - } elsif (!$silence{$col[0]}) { - set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - $idx ++; -} -close(OS); -$success == 0 || print "--> $dict/optional_silence.txt is OK\n"; -print "\n"; - -# Checking nonsilence_phones.txt ------------------------------- -print "Checking $dict/nonsilence_phones.txt ...\n"; -if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;} -$idx = 1; -%nonsilence = (); -$success = 1; -$crlf = 1; -print "--> reading $dict/nonsilence_phones.txt\n"; -check_allowed_whitespace(\*NS) or set_to_fail(); -while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($nonsilence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; - } else { - $nonsilence{$p} = 1; - } - # phones that start with the pound sign/hash may be mistaken for - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. 
- if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(NS); -$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n"; -print "\n"; - -# Checking disjoint ------------------------------- -sub intersect { - my ($a, $b) = @_; - @itset = (); - %itset = (); - foreach(keys %$a) { - if(exists $b->{$_} and !$itset{$_}) { - push(@itset, $_); - $itset{$_} = 1; - } - } - return @itset; -} - -print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n"; -@itset = intersect(\%silence, \%nonsilence); -if(@itset == 0) {print "--> disjoint property is OK.\n";} -else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";} -print "\n"; - - -sub check_lexicon { - my ($lex, $num_prob_cols, $num_skipped_cols) = @_; - print "Checking $lex\n"; - !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail(); - my %seen_line = {}; - $idx = 1; $success = 1; $crlf = 1; - print "--> reading $lex\n"; - check_allowed_whitespace(\*L) or set_to_fail(); - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $lex contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (defined $seen_line{$_}) { - print "--> ERROR: line '$_' of $lex is repeated\n"; - set_to_fail(); - } - $seen_line{$_} = 1; - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $lex does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - $word = shift @col; - if (!defined $word) { - print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail(); - } - if ($word eq "" || $word eq "" || $word eq "" || $word eq "#0") { - print "--> ERROR: lexicon.txt contains forbidden word $word\n"; - set_to_fail(); - } - for ($n = 0; $n < $num_prob_cols; $n++) { - $prob = shift @col; - if (!($prob > 0.0 && $prob <= 1.0)) { - print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n"; - set_to_fail(); - } - } - for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; } - if (@col == 0) { - print "--> ERROR: lexicon.txt contains word $word with empty "; - print "pronunciation.\n"; - set_to_fail(); - } - foreach (0 .. @col-1) { - if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt "; - print "(line $idx)\n"; - set_to_fail(); - } - } - $idx ++; - } - close(L); - $success == 0 || print "--> $lex is OK\n"; - print "\n"; -} - -if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); } -if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); } -if (-f "$dict/lexiconp_silprob.txt") { - # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also - # exist. 
- check_lexicon("$dict/lexiconp_silprob.txt", 2, 2); - if (-f "$dict/silprob.txt") { - !open(SP, "<$dict/silprob.txt") && - print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail(); - $crlf = 1; - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - chomp; my @col = split; - @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail(); - if ($col[0] eq "" || $col[0] eq "overall") { - if (!($col[1] > 0.0 && $col[1] <= 1.0)) { - set_to_fail(); - print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n"; - } - } elsif ($col[0] eq "_s" || $col[0] eq "_n") { - if ($col[1] <= 0.0) { - set_to_fail(); - print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n"; - } - } else { - print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n"; - set_to_fail(); - } - } - close(SP); - } else { - set_to_fail(); - print "--> ERROR: expecting $dict/silprob.txt to exist\n"; - } -} - -if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) { - print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n"; - set_to_fail(); -} - -sub check_lexicon_pair { - my ($lex1, $num_prob_cols1, $num_skipped_cols1, - $lex2, $num_prob_cols2, $num_skipped_cols2) = @_; - # We have checked individual lexicons already. - open(L1, "<$lex1"); open(L2, "<$lex2"); - print "Checking lexicon pair $lex1 and $lex2\n"; - my $line_num = 0; - while() { - $line_num++; - @A = split; - $line_B = ; - if (!defined $line_B) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); last; - } - @B = split(" ", $line_B); - # Check if the word matches. - if ($A[0] ne $B[0]) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - shift @A; shift @B; - for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; } - for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; } - # Check if the pronunciation matches - if (join(" ", @A) ne join(" ", @B)) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - } - $line_B = ; - if (defined $line_B && $exit == 0) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); - } - $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n"; -} - -# If more than one lexicon exist, we have to check if they correspond to each -# other. It could be that the user overwrote one and we need to regenerate the -# other, but we do not know which is which. -if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") { - check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0); -} -if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") { - check_lexicon_pair("$dict/lexiconp.txt", 1, 0, - "$dict/lexiconp_silprob.txt", 2, 2); -} - -# Checking extra_questions.txt ------------------------------- -%distinguished = (); # Keep track of all phone-pairs including nonsilence that - # are distinguished (split apart) by extra_questions.txt, - # as $distinguished{$p1,$p2} = 1. This will be used to - # make sure that we don't have pairs of phones on the same - # line in nonsilence_phones.txt that can never be - # distinguished from each other by questions. 
(If any two - # phones appear on the same line in nonsilence_phones.txt, - # they share a tree root, and since the automatic - # question-building treats all phones that appear on the - # same line of nonsilence_phones.txt as being in the same - # group, we can never distinguish them without resorting to - # questions in extra_questions.txt. -print "Checking $dict/extra_questions.txt ...\n"; -if (-s "$dict/extra_questions.txt") { - if (!open(EX, "<$dict/extra_questions.txt")) { - set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n"; - } - $idx = 1; - $success = 1; - $crlf = 1; - print "--> reading $dict/extra_questions.txt\n"; - check_allowed_whitespace(\*EX) or set_to_fail(); - while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n"; - } - foreach (0 .. @col-1) { - if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n"; - } - $idx ++; - } - %col_hash = (); - foreach $p (@col) { $col_hash{$p} = 1; } - foreach $p1 (@col) { - # Update %distinguished hash. - foreach $p2 (keys %nonsilence) { - if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not - # in this question (and in nonsilence - # phones)... mark p1,p2 as being split apart - $distinguished{$p1,$p2} = 1; - $distinguished{$p2,$p1} = 1; - } - } - } - } - close(EX); - $success == 0 || print "--> $dict/extra_questions.txt is OK\n"; -} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";} - -if (-f "$dict/nonterminals.txt") { - open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt"; - my %nonterminals = (); - my $line_number = 1; - while () { - chop; - my @line = split(" ", $_); - if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) { - print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1; - } - $nonterminals{$line[0]} = 1; - $line_number++; - } - print "--> $dict/nonterminals.txt is OK\n"; -} - - -# check nonsilence_phones.txt again for phone-pairs that are never -# distnguishable. (note: this situation is normal and expected for silence -# phones, so we don't check it.) -if(!open(NS, "<$dict/nonsilence_phones.txt")) { - print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1; -} - -$num_warn_nosplit = 0; -$num_warn_nosplit_limit = 10; -while() { - my @col = split(" ", $_); - foreach $p1 (@col) { - foreach $p2 (@col) { - if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) { - set_to_fail(); - if ($num_warn_nosplit <= $num_warn_nosplit_limit) { - print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n"; - } - if ($num_warn_nosplit == $num_warn_nosplit_limit) { - print "... Not warning any more times about this issue.\n"; - } - if ($num_warn_nosplit == 0) { - print " (note: we started checking for this only recently. 
You can still build a system but\n"; - print " phones $p1 and $p2 will be acoustically indistinguishable).\n"; - } - $num_warn_nosplit++; - } - } - } -} - - -if ($exit == 1) { - print "--> ERROR validating dictionary directory $dict (see detailed error "; - print "messages above)\n\n"; - exit 1; -} else { - print "--> SUCCESS [validating dictionary directory $dict]\n\n"; -} - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/validate_text.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/validate_text.pl deleted file mode 100644 index 7f75cf12f20f6e22948682e8e726e628a72dac69..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/validate_text.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env perl -# -#=============================================================================== -# Copyright 2017 Johns Hopkins University (author: Yenda Trmal ) -# Johns Hopkins University (author: Daniel Povey) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# validation script for data//text -# to be called (preferably) from utils/validate_data_dir.sh -use strict; -use warnings; -use utf8; -use Fcntl qw< SEEK_SET >; - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). 
-sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl \n" . 
- "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/wav2dur.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/wav2dur.py deleted file mode 100644 index 1bcc1b693458b66c0e341e5d6b375cc81e6db8b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/wav2dur.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -import torchaudio -torchaudio.set_audio_backend("sox_io") - -scp = sys.argv[1] -dur_scp = sys.argv[2] - -with open(scp, 'r') as f, open(dur_scp, 'w') as fout: - cnt = 0 - total_duration = 0 - for l in f: - items = l.strip().split() - wav_id = items[0] - fname = items[1] - cnt += 1 - waveform, rate = torchaudio.load(fname) - frames = len(waveform[0]) - duration = frames / float(rate) - total_duration += duration - fout.write('{} {}\n'.format(wav_id, duration)) - print('process {} utts'.format(cnt)) - print('total {} s'.format(total_duration)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/wav_to_duration.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/wav_to_duration.sh deleted file mode 100644 index 51b055c633ac809b6b8d702925dc47875973403d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/wav_to_duration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# split the wav scp, calculate duration and merge -nj=4 -. tools/parse_options.sh || exit 1; - -inscp=$1 -outscp=$2 -data=$(dirname ${inscp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -rm -f $logdir/wav_*.slice -rm -f $logdir/wav_*.shape -split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log -} & -done -wait -cat $logdir/wav_*.shape > $outscp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/websocket/performance-ws.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/websocket/performance-ws.py deleted file mode 100644 index af77dea06bb41297b674b5b6dbfd0266bcff5d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/tools/websocket/performance-ws.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -# coding:utf-8 - -# Copyright (c) 2022 SDCI Co. Ltd (author: veelion) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import json -import time -import asyncio -import argparse -import websockets -import soundfile as sf -import statistics - - -WS_START = json.dumps({ - 'signal': 'start', - 'nbest': 1, - 'continuous_decoding': False, -}) -WS_END = json.dumps({ - 'signal': 'end' -}) - - -async def ws_rec(data, ws_uri): - begin = time.time() - conn = await websockets.connect(ws_uri, ping_timeout=200) - # step 1: send start - await conn.send(WS_START) - ret = await conn.recv() - # step 2: send audio data - await conn.send(data) - # step 3: send end - await conn.send(WS_END) - # step 4: receive result - texts = [] - while 1: - ret = await conn.recv() - ret = json.loads(ret) - if ret['type'] == 'final_result': - nbest = json.loads(ret['nbest']) - text = nbest[0]['sentence'] - texts.append(text) - elif ret['type'] == 'speech_end': - break - # step 5: close - try: - await conn.close() - except Exception as e: - # this except has no effect, just log as debug - # it seems the server does not send close info, maybe - print(e) - time_cost = time.time() - begin - return { - 'text': ''.join(texts), - 'time': time_cost, - } - - -def get_args(): - parser = argparse.ArgumentParser(description='') - parser.add_argument( - '-u', '--ws_uri', required=True, - help="websocket_server_main's uri, e.g. ws://127.0.0.1:10086") - parser.add_argument( - '-w', '--wav_scp', required=True, - help='path to wav_scp_file') - parser.add_argument( - '-t', '--trans', required=True, - help='path to trans_text_file of wavs') - parser.add_argument( - '-s', '--save_to', required=True, - help='path to save transcription') - parser.add_argument( - '-n', '--num_concurrence', type=int, required=True, - help='num of concurrence for query') - args = parser.parse_args() - return args - - -def print_result(info): - length = max([len(k) for k in info]) - for k, v in info.items(): - print(f'\t{k: >{length}} : {v}') - - -async def main(args): - wav_scp = [] - total_duration = 0 - with open(args.wav_scp) as f: - for line in f: - zz = line.strip().split() - assert len(zz) == 2 - data, sr = sf.read(zz[1], dtype='int16') - assert sr == 16000 - duration = (len(data)) / 16000 - total_duration += duration - wav_scp.append((zz[0], data.tobytes())) - print(f'{len(wav_scp) = }, {total_duration = }') - - tasks = [] - failed = 0 - texts = [] - request_times = [] - begin = time.time() - for i, (_uttid, data) in enumerate(wav_scp): - task = asyncio.create_task(ws_rec(data, args.ws_uri)) - tasks.append((_uttid, task)) - if len(tasks) < args.num_concurrence: - continue - print((f'{i=}, start {args.num_concurrence} ' - f'queries @ {time.strftime("%m-%d %H:%M:%S")}')) - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - tasks = [] - print(f'\tdone @ {time.strftime("%m-%d %H:%M:%S")}') - if tasks: - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - request_time = time.time() - begin - rtf = request_time / total_duration - print('For all concurrence:') - print_result({ - 'failed': failed, - 'total_duration': total_duration, - 'request_time': request_time, - 'RTF': rtf, - }) - print('For one request:') - print_result({ - 'mean': statistics.mean(request_times), - 'median': statistics.median(request_times), - 'max_time': max(request_times), - 'min_time': min(request_times), - }) - with open(args.save_to, 'w', encoding='utf8') as fsave: - fsave.write(''.join(texts)) - # caculate CER - cmd = (f'python 
../compute-wer.py --char=1 --v=1 ' - f'{args.trans} {args.save_to} > ' - f'{args.save_to}-test-{args.num_concurrence}.cer.txt') - print(cmd) - os.system(cmd) - print('done') - - -if __name__ == '__main__': - args = get_args() - asyncio.run(main(args)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/alignment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/alignment.py deleted file mode 100644 index 071691183e5af227e60fe06e4f8d4bf0f33b7f71..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/alignment.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Di Wu) -# 2022 Tinnove Inc (authors: Wei Ren) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader -from textgrid import TextGrid, IntervalTier - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.ctc_util import forced_align -from wenet.utils.common import get_subsample -from wenet.utils.init_model import init_model - - -def generator_textgrid(maxtime, lines, output): - # Download Praat: https://www.fon.hum.uva.nl/praat/ - interval = maxtime / (len(lines) + 1) - margin = 0.0001 - - tg = TextGrid(maxTime=maxtime) - linetier = IntervalTier(name="line", maxTime=maxtime) - - i = 0 - for l in lines: - s, e, w = l.split() - linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w) - - tg.append(linetier) - print("successfully generator {}".format(output)) - tg.write(output) - - -def get_frames_timestamp(alignment): - # convert alignment to a praat format, which is a doing phonetics - # by computer and helps analyzing alignment - timestamp = [] - # get frames level duration for each token - start = 0 - end = 0 - while end < len(alignment): - while end < len(alignment) and alignment[end] == 0: - end += 1 - if end == len(alignment): - timestamp[-1] += alignment[start:] - break - end += 1 - while end < len(alignment) and alignment[end - 1] == alignment[end]: - end += 1 - timestamp.append(alignment[start:end]) - start = end - return timestamp - - -def get_labformat(timestamp, subsample): - begin = 0 - duration = 0 - labformat = [] - for idx, t in enumerate(timestamp): - # 25ms frame_length,10ms hop_length, 1/subsample - subsample = get_subsample(configs) - # time duration - duration = len(t) * 0.01 * subsample - if idx < len(timestamp) - 1: - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[t[-1]])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[t[-1]])) - else: - non_blank = 0 - for i in t: - if i != 0: - token = i - break - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - 
char_dict[token])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[token])) - begin = begin + duration - return labformat - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='use ctc to generate alignment') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--input_file', required=True, help='format data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--non_lang_syms', - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--result_file', - required=True, - help='alignment result file') - parser.add_argument('--batch_size', type=int, default=1, help='batch size') - parser.add_argument('--gen_praat', - action='store_true', - help='convert alignment to a praat format') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - - args = parser.parse_args() - print(args) - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.batch_size > 1: - logging.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - # Load dict - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 - - symbol_table = read_symbol_table(args.dict) - - # Init dataset and data loader - ali_conf = copy.deepcopy(configs['dataset_conf']) - - ali_conf['filter_conf']['max_length'] = 102400 - ali_conf['filter_conf']['min_length'] = 0 - ali_conf['filter_conf']['token_max_length'] = 102400 - ali_conf['filter_conf']['token_min_length'] = 0 - ali_conf['filter_conf']['max_output_input_ratio'] = 102400 - ali_conf['filter_conf']['min_output_input_ratio'] = 0 - ali_conf['speed_perturb'] = False - ali_conf['spec_aug'] = False - ali_conf['shuffle'] = False - ali_conf['sort'] = False - ali_conf['fbank_conf']['dither'] = 0.0 - ali_conf['batch_conf']['batch_type'] = "static" - ali_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - ali_dataset = Dataset(args.data_type, - args.input_file, - symbol_table, - ali_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w', - encoding='utf-8') as fout: - for batch_idx, batch in enumerate(ali_data_loader): - print("#" * 80) - key, feat, target, feats_length, target_length = batch - print(key) - - feat = feat.to(device) - target = target.to(device) - feats_length = feats_length.to(device) - target_length = target_length.to(device) - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - # print(ctc_probs.size(1)) - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = forced_align(ctc_probs, target) - print(alignment) - fout.write('{} {}\n'.format(key[0], alignment)) - - if args.gen_praat: - timestamp = get_frames_timestamp(alignment) - print(timestamp) - subsample = get_subsample(configs) - labformat = get_labformat(timestamp, subsample) - - lab_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".lab") - with open(lab_path, 'w', encoding='utf-8') as f: - f.writelines(labformat) - - textgrid_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".TextGrid") - generator_textgrid(maxtime=(len(alignment) + 1) * 0.01 * - subsample, - lines=labformat, - output=textgrid_path) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/average_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/average_model.py deleted file mode 100644 index 01efa64b4b458bc931a86a9a304b9f330ce4aaa2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/average_model.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import os -import argparse -import glob - -import yaml -import numpy as np -import torch - - -def get_args(): - parser = argparse.ArgumentParser(description='average model') - parser.add_argument('--dst_model', required=True, help='averaged model') - parser.add_argument('--src_path', - required=True, - help='src model path for average') - parser.add_argument('--val_best', - action="store_true", - help='averaged model') - parser.add_argument('--num', - default=5, - type=int, - help='nums for averaged model') - parser.add_argument('--min_epoch', - default=0, - type=int, - help='min epoch used for averaging model') - parser.add_argument('--max_epoch', - default=65536, - type=int, - help='max epoch used for averaging model') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - checkpoints = [] - val_scores = [] - if args.val_best: - yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) - for y in yamls: - with open(y, 'r') as f: - dic_yaml = yaml.load(f, Loader=yaml.FullLoader) - loss = dic_yaml['cv_loss'] - epoch = dic_yaml['epoch'] - if epoch >= args.min_epoch and epoch <= args.max_epoch: - val_scores += [[epoch, loss]] - val_scores = np.array(val_scores) - sort_idx = np.argsort(val_scores[:, -1]) - sorted_val_scores = val_scores[sort_idx][::1] - print("best val scores = " + str(sorted_val_scores[:args.num, 1])) - print("selected epochs = " + - str(sorted_val_scores[:args.num, 0].astype(np.int64))) - path_list = [ - args.src_path + '/{}.pt'.format(int(epoch)) - for epoch in sorted_val_scores[:args.num, 0] - ] - else: - path_list = glob.glob('{}/[0-9]*.pt'.format(args.src_path)) - path_list = sorted(path_list, key=os.path.getmtime) - path_list = path_list[-args.num:] - print(path_list) - avg = None - num = args.num - assert num == len(path_list) - for path in path_list: - print('Processing {}'.format(path)) - states = torch.load(path, map_location=torch.device('cpu')) - if avg is None: - avg = states - else: - for k in avg.keys(): - avg[k] += states[k] - # average - for k in avg.keys(): - if avg[k] is not None: - # pytorch 1.6 use true_divide instead of /= - avg[k] = torch.true_divide(avg[k], num) - print('Saving to {}'.format(args.dst_model)) - torch.save(avg, args.dst_model) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/export_jit.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/export_jit.py deleted file mode 100644 index b2e5864e8382235c1cc800484ba5031ae22f3bd9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/export_jit.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os - -import torch -import yaml - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_file', default=None, help='output file') - parser.add_argument('--output_quant_file', - default=None, - help='output quantized model file') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - # No need gpu for model export - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - model = init_model(configs) - print(model) - - load_checkpoint(model, args.checkpoint) - # Export jit torch script model - - if args.output_file: - script_model = torch.jit.script(model) - script_model.save(args.output_file) - print('Export model successfully, see {}'.format(args.output_file)) - - # Export quantized jit torch script model - if args.output_quant_file: - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - script_quant_model = torch.jit.script(quantized_model) - script_quant_model.save(args.output_quant_file) - print('Export quantized model successfully, ' - 'see {}'.format(args.output_quant_file)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/export_onnx_bpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/export_onnx_bpu.py deleted file mode 100644 index 6462a69506f10778d08faae5fcf3067ad43d38bd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/export_onnx_bpu.py +++ /dev/null @@ -1,1019 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. 
specific decoding method: ctc_greedy_search -""" - - -from __future__ import print_function - -import os -import sys -import copy -import math -import yaml -import logging -from typing import Tuple - -import torch -import numpy as np - -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import (get_args, to_numpy, - print_input_output_info) - - -try: - import onnx - import onnxruntime -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class BPULayerNorm(torch.nn.Module): - """Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" - def __init__(self, module, chunk_size=8, run_on_bpu=False): - super().__init__() - original = copy.deepcopy(module) - self.hidden = module.weight.size(0) - self.chunk_size = chunk_size - self.run_on_bpu = run_on_bpu - - if self.run_on_bpu: - self.weight = torch.nn.Parameter( - module.weight.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.bias = torch.nn.Parameter( - module.bias.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.negtive = torch.nn.Parameter( - torch.ones((1, self.hidden, 1, chunk_size)) * -1.0) - self.eps = torch.nn.Parameter( - torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps) - self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_1.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_2.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - else: - self.norm = module - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.hidden) - orig_out = module(random_data) - new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) - np.testing.assert_allclose( - to_numpy(orig_out), to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.run_on_bpu: - u = self.mean_conv_1(x) # (1, h, 1, c) - numerator = x + u * self.negtive # (1, h, 1, c) - s = torch.pow(numerator, 2) # (1, h, 1, c) - s = self.mean_conv_2(s) # (1, h, 1, c) - denominator = torch.sqrt(s + self.eps) # (1, h, 1, c) - x = torch.div(numerator, denominator) # (1, h, 1, c) - x = x * self.weight + self.bias - else: - x = x.squeeze(2).transpose(1, 2).contiguous() - x = self.norm(x) - x = x.transpose(1, 2).contiguous().unsqueeze(2) - return x - - -class BPUIdentity(torch.nn.Module): - """Refactor torch.nn.Identity(). - For inserting BPU node whose input == output. - """ - def __init__(self, channels): - super().__init__() - self.channels = channels - self.identity_conv = torch.nn.Conv2d( - channels, channels, 1, groups=channels, bias=False) - torch.nn.init.dirac_( - self.identity_conv.weight.data, groups=channels) - - self.check_equal() - - def check_equal(self): - random_data = torch.randn(1, self.channels, 1, 10) - result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(random_data), to_numpy(result), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Identity with 4-D dataflow, input == output. 
- Args: - x (torch.Tensor): (batch, in_channel, 1, time) - - Returns: - (torch.Tensor): (batch, in_channel, 1, time). - """ - return self.identity_conv(x) - - -class BPULinear(torch.nn.Module): - """Refactor torch.nn.Linear or pointwise_conv""" - def __init__(self, module, is_pointwise_conv=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.weight.size(1) - self.odim = module.weight.size(0) - self.is_pointwise_conv = is_pointwise_conv - - # Modify weight & bias - self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) - if is_pointwise_conv: - # (odim, idim, kernel=1) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(-1)) - else: - # (odim, idim) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(2).unsqueeze(3)) - self.linear.bias = module.bias - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.idim) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = module(random_data) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = original_result.transpose(1, 2) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Linear with 4-D dataflow. - Args: - x (torch.Tensor): (batch, in_channel, 1, time) - Returns: - (torch.Tensor): (batch, out_channel, 1, time). - """ - return self.linear(x) - - -class BPUGlobalCMVN(torch.nn.Module): - """Refactor wenet/transformer/cmvn.py::GlobalCMVN""" - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - self.norm_var = module.norm_var - - # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1) - self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """CMVN with 4-D dataflow. - Args: - x (torch.Tensor): (batch, 1, mel_dim, time) - Returns: - (torch.Tensor): normalized feature with same shape. - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x - - -class BPUConv2dSubsampling8(torch.nn.Module): - """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 - - NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.right_context = module.right_context - self.subsampling_rate = module.subsampling_rate - assert isinstance(module.pos_enc, NoPositionalEncoding) - - # 1. Modify self.conv - # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim) - # to (1, 1, mel_dim, frames) for more efficient computation. - self.conv = module.conv - for idx in [0, 2, 4]: - self.conv[idx].weight = torch.nn.Parameter( - module.conv[idx].weight.transpose(2, 3) - ) - - # 2. 
Modify self.linear - # NOTE(xcsong): Split final projection to meet the requirment of - # maximum kernel_size (7 for XJ3) - self.linear = torch.nn.ModuleList() - odim = module.linear.weight.size(0) # 512, in this case - freq = module.linear.weight.size(1) // odim # 4608 // 512 == 9 - self.odim, self.freq = odim, freq - weight = module.linear.weight.reshape( - odim, odim, freq, 1) # (odim, odim * freq) -> (odim, odim, freq, 1) - self.split_size = [] - num_split = (freq - 1) // 7 + 1 # XJ3 requires kernel_size <= 7 - slice_begin = 0 - for idx in range(num_split): - kernel_size = min(freq, (idx + 1) * 7) - idx * 7 - conv_ele = torch.nn.Conv2d( - odim, odim, (kernel_size, 1), (kernel_size, 1)) - conv_ele.weight = torch.nn.Parameter( - weight[:, :, slice_begin:slice_begin + kernel_size, :] - ) - conv_ele.bias = torch.nn.Parameter( - torch.zeros_like(conv_ele.bias) - ) - self.linear.append(conv_ele) - self.split_size.append(kernel_size) - slice_begin += kernel_size - self.linear[0].bias = torch.nn.Parameter(module.linear.bias) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 67, 80) - mask = torch.zeros(1, 1, 67) - original_result, _, _ = module(random_data, mask) # (1, 8, 512) - random_data = random_data.transpose(1, 2).unsqueeze(0) # (1, 1, 80, 67) - new_result = self.forward(random_data) # (1, 512, 1, 8) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Subsample x with 4-D dataflow. - Args: - x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), - where time' = time // 8. - """ - x = self.conv(x) # (1, odim, freq, time') - x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) - x = torch.split(x, self.split_size, dim=2) - for idx, (x_part, layer) in enumerate(zip(x, self.linear)): - x_out += layer(x_part) - return x_out - - -class BPUMultiHeadedAttention(torch.nn.Module): - """Refactor wenet/transformer/attention.py::MultiHeadedAttention - - NOTE(xcsong): Only support attention_class == MultiHeadedAttention, - we do not consider RelPositionMultiHeadedAttention currently. - """ - def __init__(self, module, chunk_size, left_chunks): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.d_k = module.d_k - self.h = module.h - n_feat = self.d_k * self.h - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.time = chunk_size * (left_chunks + 1) - self.activation = torch.nn.Softmax(dim=-1) - - # 1. Modify self.linear_x - self.linear_q = BPULinear(module.linear_q) - self.linear_k = BPULinear(module.linear_k) - self.linear_v = BPULinear(module.linear_v) - self.linear_out = BPULinear(module.linear_out) - # 2. 
denom - self.register_buffer( - "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k))) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) - mask = torch.ones((1, self.h, self.chunk_size, self.time), - dtype=torch.bool) - cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, - self.d_k * 2) - original_out, original_cache = module( - random_data, random_data, random_data, - mask[:, 0, :, :], torch.empty(0), cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.reshape(1, self.h, self.d_k * 2, - self.chunk_size * self.left_chunks) - new_out, new_cache = self.forward( - random_data, random_data, random_data, mask, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - - def forward( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - mask: torch.Tensor, cache: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. - - Args: - q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). - k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). - v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). - mask (torch.Tensor): Mask tensor, - (#batch, head, chunk_size, cache_t + chunk_size). - cache (torch.Tensor): Cache tensor - (1, head, d_k * 2, cache_t), - where `cache_t == chunk_size * left_chunks`. - - - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: Cache tensor - (1, head, d_k * 2, cache_t + chunk_size) - where `cache_t == chunk_size * left_chunks` - """ - # 1. Forward QKV - q = self.linear_q(q) # (1, d, 1, c) d == size, c == chunk_size - k = self.linear_k(k) # (1, d, 1, c) - v = self.linear_v(v) # (1, d, 1, c) - q = q.view(1, self.h, self.d_k, self.chunk_size) - k = k.view(1, self.h, self.d_k, self.chunk_size) - v = v.view(1, self.h, self.d_k, self.chunk_size) - q = q.transpose(2, 3) # (batch, head, time1, d_k) - k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) - k = torch.cat((k_cache, k), dim=3) - v = torch.cat((v_cache, v), dim=3) - new_cache = torch.cat((k, v), dim=2) - # 2. (Q^T)K - scores = torch.matmul(q, k) * self.denom # (#b, n_head, time1, time2) - # 3. Forward attention - mask = mask.eq(0) - scores = scores.masked_fill(mask, -float('inf')) - attn = self.activation(scores).masked_fill(mask, 0.0) - attn = attn.transpose(2, 3) - x = torch.matmul(v, attn) - x = x.view(1, self.d_k * self.h, 1, self.chunk_size) - x_out = self.linear_out(x) - return x_out, new_cache - - -class BPUConvolution(torch.nn.Module): - """Refactor wenet/transformer/convolution.py::ConvolutionModule - - NOTE(xcsong): Only suport use_layer_norm == False - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.lorder = module.lorder - self.use_layer_norm = False - self.activation = module.activation - channels = module.pointwise_conv1.weight.size(1) - self.channels = channels - kernel_size = module.depthwise_conv.weight.size(2) - assert module.use_layer_norm is False - - # 1. Modify self.pointwise_conv1 - self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) - - # 2. 
Modify self.depthwise_conv - self.depthwise_conv = torch.nn.Conv2d( - channels, channels, (1, kernel_size), - stride=1, groups=channels) - self.depthwise_conv.weight = torch.nn.Parameter( - module.depthwise_conv.weight.unsqueeze(-2)) - self.depthwise_conv.bias = torch.nn.Parameter( - module.depthwise_conv.bias) - - # 3. Modify self.norm, Only support batchnorm2d - self.norm = torch.nn.BatchNorm2d(channels) - self.norm.training = False - self.norm.num_features = module.norm.num_features - self.norm.eps = module.norm.eps - self.norm.momentum = module.norm.momentum - self.norm.weight = torch.nn.Parameter(module.norm.weight) - self.norm.bias = torch.nn.Parameter(module.norm.bias) - self.norm.running_mean = module.norm.running_mean - self.norm.running_var = module.norm.running_var - - # 4. Modify self.pointwise_conv2 - self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) - - # 5. Identity conv, for running `concat` on BPU - self.identity = BPUIdentity(channels) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.channels) - cache = torch.zeros((1, self.channels, self.lorder)) - original_out, original_cache = module(random_data, cache=cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.unsqueeze(2) - new_out, new_cache = self.forward(random_data, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, 1, cache_t). - Returns: - torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). - torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). - """ - # Concat cache - x = torch.cat((self.identity(cache), self.identity(x)), dim=3) - new_cache = x[:, :, :, -self.lorder:] - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, 1, dim) - x = torch.nn.functional.glu(x, dim=1) # (b, channel, 1, dim) - - # Depthwise Conv - x = self.depthwise_conv(x) - x = self.activation(self.norm(x)) - x = self.pointwise_conv2(x) - return x, new_cache - - -class BPUFFN(torch.nn.Module): - """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.activation = module.activation - - # 1. Modify self.w_x - self.w_1 = BPULinear(module.w_1) - self.w_2 = BPULinear(module.w_2) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.w_1.idim) - original_out = module(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_out = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward function. 
- - Args: - xs: input tensor (B, D, 1, L) - Returns: - output tensor, (B, D, 1, L) - """ - return self.w_2(self.activation(self.w_1(x))) - - -class BPUConformerEncoderLayer(torch.nn.Module): - """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.size = module.size - assert module.normalize_before is True - assert module.concat_after is False - - # 1. Modify submodules - self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) - self.self_attn = BPUMultiHeadedAttention( - module.self_attn, chunk_size, left_chunks) - self.conv_module = BPUConvolution(module.conv_module) - self.feed_forward = BPUFFN(module.feed_forward) - - # 2. Modify norms - self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) - self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, ln_run_on_bpu) - self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, - chunk_size, ln_run_on_bpu) - self.norm_conv = BPULayerNorm(module.norm_conv, - chunk_size, ln_run_on_bpu) - self.norm_final = BPULayerNorm(module.norm_final, - chunk_size, ln_run_on_bpu) - - # 3. 4-D ff_scale - self.register_buffer( - "ff_scale", torch.full((1, self.size, 1, 1), module.ff_scale)) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.self_attn.chunk_size - time2 = self.self_attn.time - h, d_k = self.self_attn.h, self.self_attn.d_k - random_x = torch.randn(1, time1, self.size) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) - original_x, _, original_att_cache, original_cnn_cache = module( - random_x, att_mask[:, 0, :, :], torch.empty(0), - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.transpose(1, 2).unsqueeze(2) - att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.unsqueeze(2) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_mask, att_cache, cnn_cache - ) - np.testing.assert_allclose( - to_numpy(original_att_cache), - to_numpy(new_att_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cnn_cache), - to_numpy(new_cnn_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, att_mask: torch.Tensor, - att_cache: torch.Tensor, cnn_cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, size, 1, chunk_size) - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, d_k * 2, cache_t1), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, 1, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: att_cache tensor, - (1, head, d_k * 2, cache_t1 + chunk_size). - torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2). - """ - # 1. ffn_macaron - residual = x - x = self.norm_ff_macron(x) - x = residual + self.ff_scale * self.feed_forward_macaron(x) - - # 2. 
attention - residual = x - x = self.norm_mha(x) - x_att, new_att_cache = self.self_attn( - x, x, x, att_mask, att_cache) - x = residual + x_att - - # 3. convolution - residual = x - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, cnn_cache) - x = residual + x - - # 4. ffn - residual = x - x = self.norm_ff(x) - x = residual + self.ff_scale * self.feed_forward(x) - - # 5. final post-norm - x = self.norm_final(x) - - return x, new_att_cache, new_cnn_cache - - -class BPUConformerEncoder(torch.nn.Module): - """Refactor wenet/transformer/encoder.py::ConformerEncoder - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - output_size = module.output_size() - self._output_size = module.output_size() - self.after_norm = module.after_norm - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.head = module.encoders[0].self_attn.h - self.layers = len(module.encoders) - - # 1. Modify submodules - self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) - self.embed = BPUConv2dSubsampling8(module.embed) - self.encoders = torch.nn.ModuleList() - for layer in module.encoders: - self.encoders.append(BPUConformerEncoderLayer( - layer, chunk_size, left_chunks, ln_run_on_bpu)) - - # 2. Auxiliary conv - self.identity_cnncache = BPUIdentity(output_size) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.encoders[0].self_attn.chunk_size - time2 = self.encoders[0].self_attn.time - layers = self.layers - h, d_k = self.head, self.encoders[0].self_attn.d_k - decoding_window = (self.chunk_size - 1) * \ - module.embed.subsampling_rate + \ - module.embed.right_context + 1 - lorder = self.encoders[0].conv_module.lorder - random_x = torch.randn(1, decoding_window, 80) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) - orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( - random_x, 0, time2 - time1, att_mask=att_mask[:, 0, :, :], - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.unsqueeze(0) - att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_cache, cnn_cache, att_mask - ) - caches = torch.split(new_att_cache, h, dim=1) - caches = [c.transpose(2, 3) for c in caches] - np.testing.assert_allclose( - to_numpy(orig_att_cache), - to_numpy(torch.cat(caches, dim=0)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_cnn_cache), - to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, xs: torch.Tensor, att_cache: torch.Tensor, - cnn_cache: torch.Tensor, att_mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (1, head * elayers, d_k * 2, cache_t1), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (1, hidden-dim, elayers, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, hidden-dim, 1, chunk_size). - torch.Tensor: new attention cache required for next chunk, with - same shape as the original att_cache. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - """ - # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time) - xs = xs.transpose(2, 3) - xs = self.global_cmvn(xs) - # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size) - xs = self.embed(xs) - - att_cache = torch.split(att_cache, self.head, dim=1) - cnn_cache = self.identity_cnncache(cnn_cache) - cnn_cache = torch.split(cnn_cache, 1, dim=2) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - xs, new_att_cache, new_cnn_cache = layer( - xs, att_mask, att_cache=att_cache[i], cnn_cache=cnn_cache[i]) - r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:]) - r_cnn_cache.append(new_cnn_cache) - r_att_cache = torch.cat(r_att_cache, dim=1) - r_cnn_cache = self.identity_cnncache( - torch.cat(r_cnn_cache, dim=2)) - - xs = xs.squeeze(2).transpose(1, 2).contiguous() - xs = self.after_norm(xs) - # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input. - xs = xs.transpose(1, 2).contiguous().unsqueeze(2) # (B, C, 1, T) - - return (xs, r_att_cache, r_cnn_cache) - - -class BPUCTC(torch.nn.Module): - """Refactor wenet/transformer/ctc.py::CTC - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.ctc_lo.weight.size(1) - num_class = module.ctc_lo.weight.size(0) - - # 1. Modify self.ctc_lo, Split final projection to meet the - # requirment of maximum in/out channels (2048 for XJ3) - self.ctc_lo = torch.nn.ModuleList() - self.split_size = [] - num_split = (num_class - 1) // 2048 + 1 - for idx in range(num_split): - out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 - conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) - self.ctc_lo.append(conv_ele) - self.split_size.append(out_channel) - orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) - orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) - for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): - w = w.unsqueeze(2).unsqueeze(3) - self.ctc_lo[i].weight = torch.nn.Parameter(w) - self.ctc_lo[i].bias = torch.nn.Parameter(b) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 100, self.idim) - original_result = module.ctc_lo(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """frame activations, without softmax. 
- - Args: - Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) - Returns: - torch.Tensor: (B, num_class, 1, chunk_size) - """ - out = [] - for i, layer in enumerate(self.ctc_lo): - out.append(layer(x)) - out = torch.cat(out, dim=1) - return out - - -def export_encoder(asr_model, args): - logger.info("Stage-1: export encoder") - decode_window, mel_dim = args.decoding_window, args.feature_size - encoder = BPUConformerEncoder( - asr_model.encoder, args.chunk_size, args.num_decoding_left_chunks, - args.ln_run_on_bpu) - encoder.eval() - encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx') - - logger.info("Stage-1.1: prepare inputs for encoder") - chunk = torch.randn((1, 1, decode_window, mel_dim)) - required_cache_size = encoder.chunk_size * encoder.left_chunks - kv_time = required_cache_size + encoder.chunk_size - hidden, layers = encoder._output_size, len(encoder.encoders) - head = encoder.encoders[0].self_attn.h - d_k = hidden // head - lorder = encoder.encoders[0].conv_module.lorder - att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) - att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros((1, hidden, layers, lorder)) - inputs = (chunk, att_cache, cnn_cache, att_mask) - logger.info("chunk.size(): {} att_cache.size(): {} " - "cnn_cache.size(): {} att_mask.size(): {}".format( - list(chunk.size()), list(att_cache.size()), - list(cnn_cache.size()), list(att_mask.size()))) - - logger.info("Stage-1.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask" - attributes['output_name'] = "output;r_att_cache;r_cnn_cache" - attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap" - attributes['norm_type'] = \ - "no_preprocess;no_preprocess;no_preprocess;no_preprocess" - attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_shape'] = \ - "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( - chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3), - att_cache.size(0), att_cache.size(1), att_cache.size(2), - att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1), - cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0), - att_mask.size(1), att_mask.size(2), att_mask.size(3) - ) - torch.onnx.export( # NOTE(xcsong): only support opset==11 - encoder, inputs, encoder_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=attributes['input_name'].split(';'), - output_names=attributes['output_name'].split(';'), - dynamic_axes=None, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for k in vars(args): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - logger.info('Export onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - logger.info("Stage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - for i in range(10): - logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" - ", att_mask: {}".format( - i, list(torch_chunk.size()), - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), - list(torch_att_mask.size()))) - torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_output = torch.cat(torch_output, dim=-1) - - onnx_output = [] - onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - logger.info("onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," - " att_mask: {}".format( - i, onnx_chunk.shape, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': onnx_att_mask, - } - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_output = np.concatenate(onnx_output, axis=-1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_encoder, pass!") - return encoder, ort_session - - -def export_ctc(asr_model, args): - logger.info("Stage-2: export ctc") - ctc = BPUCTC(asr_model.ctc).eval() - ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx') - - logger.info("Stage-2.1: prepare inputs for ctc") - hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) - - logger.info("Stage-2.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'], attributes['input_type'] = "hidden", "featuremap" - attributes['norm_type'] = "no_preprocess" - attributes['input_layout_train'] = "NCHW" - attributes['input_layout_rt'] = "NCHW" - attributes['input_shape'] = "{}x{}x{}x{}".format( - hidden.size(0), hidden.size(1), hidden.size(2), hidden.size(3), - ) - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=None, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for k in vars(args): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - logger.info('Export onnx_ctc, done! 
see {}'.format(ctc_outpath)) - - logger.info("Stage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_ctc, pass!") - return ctc, ort_session - - -def export_decoder(asr_model, args): - logger.info("Currently, Decoder is not supported.") - - -if __name__ == '__main__': - torch.manual_seed(777) - args = get_args() - args.ln_run_on_bpu = False - # NOTE(xcsong): XJ3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - args.feature_size = configs['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - export_encoder(model, args) - export_ctc(model, args) - export_decoder(model, args) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/export_onnx_cpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/export_onnx_cpu.py deleted file mode 100644 index a8009d2f606f753a5870eb754235d8d55e756b5d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/export_onnx_cpu.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright (c) 2022, Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os -import copy -import sys - -import torch -import yaml -import numpy as np - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - -try: - import onnx - import onnxruntime - from onnxruntime.quantization import quantize_dynamic, QuantType -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - args = parser.parse_args() - return args - - -def to_numpy(tensor): - if tensor.requires_grad: - return tensor.detach().cpu().numpy() - else: - return tensor.cpu().numpy() - - -def print_input_output_info(onnx_model, name, prefix="\t\t"): - input_names = [node.name for node in onnx_model.graph.input] - input_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.input] - output_names = [node.name for node in onnx_model.graph.output] - output_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.output] - print("{}{} inputs : {}".format(prefix, name, input_names)) - print("{}{} input shapes : {}".format(prefix, name, input_shapes)) - print("{}{} outputs: {}".format(prefix, name, output_names)) - print("{}{} output shapes : {}".format(prefix, name, output_shapes)) - - -def export_encoder(asr_model, args): - print("Stage-1: export encoder") - encoder = asr_model.encoder - encoder.forward = encoder.forward_chunk - encoder_outpath = os.path.join(args['output_dir'], 'encoder.onnx') - - print("\tStage-1.1: prepare inputs for encoder") - chunk = torch.randn( - (args['batch'], args['decoding_window'], args['feature_size'])) - offset = 0 - # NOTE(xcsong): The uncertainty of `next_cache_start` only appears - # in the first few chunks, this is caused by dynamic att_cache shape, i,e - # (0, 0, 0, 0) for 1st chunk and (elayers, head, ?, d_k*2) for subsequent - # chunks. One way to ease the ONNX export is to keep `next_cache_start` - # as a fixed value. To do this, for the **first** chunk, if - # left_chunks > 0, we feed real cache & real mask to the model, otherwise - # fake cache & fake mask. In this way, we get: - # 1. 16/-1 mode: next_cache_start == 0 for all chunks - # 2. 16/4 mode: next_cache_start == chunk_size for all chunks - # 3. 16/0 mode: next_cache_start == chunk_size for all chunks - # 4. -1/-1 mode: next_cache_start == 0 for all chunks - # NO MORE DYNAMIC CHANGES!! - # - # NOTE(Mddct): We retain the current design for the convenience of supporting some - # inference frameworks without dynamic shapes. 
If you're interested in all-in-one - # model that supports different chunks please see: - # https://github.com/wenet-e2e/wenet/pull/1174 - - if args['left_chunks'] > 0: # 16/4 - required_cache_size = args['chunk_size'] * args['left_chunks'] - offset = required_cache_size - # Real cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], required_cache_size, - args['output_size'] // args['head'] * 2)) - # Real mask - att_mask = torch.ones( - (args['batch'], 1, required_cache_size + args['chunk_size']), - dtype=torch.bool) - att_mask[:, :, :required_cache_size] = 0 - elif args['left_chunks'] <= 0: # 16/-1, -1/-1, 16/0 - required_cache_size = -1 if args['left_chunks'] < 0 else 0 - # Fake cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], 0, - args['output_size'] // args['head'] * 2)) - # Fake mask - att_mask = torch.ones((0, 0, 0), dtype=torch.bool) - cnn_cache = torch.zeros( - (args['num_blocks'], args['batch'], - args['output_size'], args['cnn_module_kernel'] - 1)) - inputs = (chunk, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - print("\t\tchunk.size(): {}\n".format(chunk.size()), - "\t\toffset: {}\n".format(offset), - "\t\trequired_cache: {}\n".format(required_cache_size), - "\t\tatt_cache.size(): {}\n".format(att_cache.size()), - "\t\tcnn_cache.size(): {}\n".format(cnn_cache.size()), - "\t\tatt_mask.size(): {}\n".format(att_mask.size())) - - print("\tStage-1.2: torch.onnx.export") - dynamic_axes = { - 'chunk': {1: 'T'}, - 'att_cache': {2: 'T_CACHE'}, - 'att_mask': {2: 'T_ADD_T_CACHE'}, - 'output': {1: 'T'}, - 'r_att_cache': {2: 'T_CACHE'}, - } - # NOTE(xcsong): We keep dynamic axes even if in 16/4 mode, this is - # to avoid padding the last chunk (which usually contains less - # frames than required). For users who want static axes, just pop - # out specific axis. - # if args['chunk_size'] > 0: # 16/4, 16/-1, 16/0 - # dynamic_axes.pop('chunk') - # dynamic_axes.pop('output') - # if args['left_chunks'] >= 0: # 16/4, 16/0 - # # NOTE(xsong): since we feed real cache & real mask into the - # # model when left_chunks > 0, the shape of cache will never - # # be changed. - # dynamic_axes.pop('att_cache') - # dynamic_axes.pop('r_att_cache') - torch.onnx.export( - encoder, inputs, encoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=[ - 'chunk', 'offset', 'required_cache_size', - 'att_cache', 'cnn_cache', 'att_mask' - ], - output_names=['output', 'r_att_cache', 'r_cnn_cache'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for (k, v) in args.items(): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - # NOTE(xcsong): to add those metadatas we need to reopen - # the file and resave it. - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - # Dynamic quantization - model_fp32 = encoder_outpath - model_quant = os.path.join(args['output_dir'], 'encoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - print("\tStage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk = copy.deepcopy(chunk) - torch_offset = copy.deepcopy(offset) - torch_required_cache_size = copy.deepcopy(required_cache_size) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - torch_att_mask = copy.deepcopy(att_mask) - for i in range(10): - print("\t\ttorch chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, list(torch_chunk.size()), torch_offset, - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), list(torch_att_mask.size()))) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - torch_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_offset, torch_required_cache_size, - torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_offset += out.size(1) - torch_output = torch.cat(torch_output, dim=1) - - onnx_output = [] - onnx_chunk = to_numpy(chunk) - onnx_offset = np.array((offset)).astype(np.int64) - onnx_required_cache_size = np.array((required_cache_size)).astype(np.int64) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - onnx_att_mask = to_numpy(att_mask) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - print("\t\tonnx chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, onnx_chunk.shape, onnx_offset, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - onnx_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'offset': onnx_offset, - 'required_cache_size': onnx_required_cache_size, - 'att_cache': onnx_att_cache, 'cnn_cache': onnx_cnn_cache, - 'att_mask': onnx_att_mask - } - # NOTE(xcsong): If we use 16/-1, -1/-1 or 16/0 mode, `next_cache_start` - # will be hardcoded to 0 or chunk_size by ONNX, thus - # required_cache_size and att_mask are no more needed and they will - # be removed by ONNX automatically. 
- for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_offset += ort_outs[0].shape[1] - onnx_output = np.concatenate(onnx_output, axis=1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-05) - meta = ort_session.get_modelmeta() - print("\t\tcustom_metadata_map={}".format(meta.custom_metadata_map)) - print("\t\tCheck onnx_encoder, pass!") - - -def export_ctc(asr_model, args): - print("Stage-2: export ctc") - ctc = asr_model.ctc - ctc.forward = ctc.log_softmax - ctc_outpath = os.path.join(args['output_dir'], 'ctc.onnx') - - print("\tStage-2.1: prepare inputs for ctc") - hidden = torch.randn( - (args['batch'], args['chunk_size'] if args['chunk_size'] > 0 else 16, - args['output_size'])) - - print("\tStage-2.2: torch.onnx.export") - dynamic_axes = {'hidden': {1: 'T'}, 'probs': {1: 'T'}} - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for (k, v) in args.items(): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - # Dynamic quantization - model_fp32 = ctc_outpath - model_quant = os.path.join(args['output_dir'], 'ctc.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_ctc, done! see {}'.format(ctc_outpath)) - - print("\tStage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_ctc, pass!") - - -def export_decoder(asr_model, args): - print("Stage-3: export decoder") - decoder = asr_model - # NOTE(lzhin): parameters of encoder will be automatically removed - # since they are not used during rescoring. - decoder.forward = decoder.forward_attention_decoder - decoder_outpath = os.path.join(args['output_dir'], 'decoder.onnx') - - print("\tStage-3.1: prepare inputs for decoder") - # hardcode time->200 nbest->10 len->20, they are dynamic axes. 
- encoder_out = torch.randn((1, 200, args['output_size'])) - hyps = torch.randint(low=0, high=args['vocab_size'], - size=[10, 20]) - hyps[:, 0] = args['vocab_size'] - 1 # - hyps_lens = torch.randint(low=15, high=21, size=[10]) - - print("\tStage-3.2: torch.onnx.export") - dynamic_axes = { - 'hyps': {0: 'NBEST', 1: 'L'}, 'hyps_lens': {0: 'NBEST'}, - 'encoder_out': {1: 'T'}, - 'score': {0: 'NBEST', 1: 'L'}, 'r_score': {0: 'NBEST', 1: 'L'} - } - inputs = (hyps, hyps_lens, encoder_out, args['reverse_weight']) - torch.onnx.export( - decoder, inputs, decoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hyps', 'hyps_lens', 'encoder_out', 'reverse_weight'], - output_names=['score', 'r_score'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_decoder = onnx.load(decoder_outpath) - for (k, v) in args.items(): - meta = onnx_decoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_decoder) - onnx.helper.printable_graph(onnx_decoder.graph) - onnx.save(onnx_decoder, decoder_outpath) - print_input_output_info(onnx_decoder, "onnx_decoder") - model_fp32 = decoder_outpath - model_quant = os.path.join(args['output_dir'], 'decoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_decoder, done! see {}'.format( - decoder_outpath)) - - print("\tStage-3.3: check onnx_decoder and torch_decoder") - torch_score, torch_r_score = decoder( - hyps, hyps_lens, encoder_out, args['reverse_weight']) - ort_session = onnxruntime.InferenceSession(decoder_outpath) - input_names = [node.name for node in onnx_decoder.graph.input] - ort_inputs = { - 'hyps': to_numpy(hyps), - 'hyps_lens': to_numpy(hyps_lens), - 'encoder_out': to_numpy(encoder_out), - 'reverse_weight': np.array((args['reverse_weight'])), - } - for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - onnx_output = ort_session.run(None, ort_inputs) - - np.testing.assert_allclose(to_numpy(torch_score), onnx_output[0], - rtol=1e-03, atol=1e-05) - if args['is_bidirectional_decoder'] and args['reverse_weight'] > 0.0: - np.testing.assert_allclose(to_numpy(torch_r_score), onnx_output[1], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_decoder, pass!") - - -def main(): - torch.manual_seed(777) - args = get_args() - output_dir = args.output_dir - os.system("mkdir -p " + output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - arguments = {} - arguments['output_dir'] = output_dir - arguments['batch'] = 1 - arguments['chunk_size'] = args.chunk_size - arguments['left_chunks'] = args.num_decoding_left_chunks - arguments['reverse_weight'] = args.reverse_weight - arguments['output_size'] = configs['encoder_conf']['output_size'] - arguments['num_blocks'] = configs['encoder_conf']['num_blocks'] - arguments['cnn_module_kernel'] = configs['encoder_conf'].get('cnn_module_kernel', 1) - arguments['head'] = configs['encoder_conf']['attention_heads'] - arguments['feature_size'] = configs['input_dim'] - arguments['vocab_size'] = configs['output_dim'] - # NOTE(xcsong): if chunk_size == -1, hardcode to 67 - arguments['decoding_window'] = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 if args.chunk_size > 0 else 67 - arguments['encoder'] = configs['encoder'] - 
arguments['decoder'] = configs['decoder'] - arguments['subsampling_rate'] = model.subsampling_rate() - arguments['right_context'] = model.right_context() - arguments['sos_symbol'] = model.sos_symbol() - arguments['eos_symbol'] = model.eos_symbol() - arguments['is_bidirectional_decoder'] = 1 \ - if model.is_bidirectional_decoder() else 0 - - # NOTE(xcsong): Please note that -1/-1 means non-streaming model! It is - # not a [16/4 16/-1 16/0] all-in-one model and it should not be used in - # streaming mode (i.e., setting chunk_size=16 in `decoder_main`). If you - # want to use 16/-1 or any other streaming mode in `decoder_main`, - # please export onnx in the same config. - if arguments['left_chunks'] > 0: - assert arguments['chunk_size'] > 0 # -1/4 not supported - - export_encoder(model, arguments) - export_ctc(model, arguments) - export_decoder(model, arguments) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/export_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/export_onnx_gpu.py deleted file mode 100644 index 19f810c2804efdf74ff369f780fa3102e2e389fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/export_onnx_gpu.py +++ /dev/null @@ -1,1056 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import os -import sys - -import torch -import yaml -import logging - -import torch.nn.functional as F -from wenet.utils.checkpoint import load_checkpoint -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import BaseEncoder -from wenet.utils.init_model import init_model -from wenet.utils.mask import make_pad_mask - -try: - import onnxruntime -except ImportError: - print('Please install onnxruntime-gpu!') - sys.exit(1) - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class Encoder(torch.nn.Module): - def __init__(self, - encoder: BaseEncoder, - ctc: CTC, - beam_size: int = 10): - super().__init__() - self.encoder = encoder - self.ctc = ctc - self.beam_size = beam_size - - def forward(self, speech: torch.Tensor, - speech_lengths: torch.Tensor,): - """Encoder - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - Returns: - encoder_out: B x T x F - encoder_out_lens: B - ctc_log_probs: B x T x V - beam_log_probs: B x T x beam_size - beam_log_probs_idx: B x T x beam_size - """ - encoder_out, encoder_mask = self.encoder(speech, - speech_lengths, - -1, -1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_log_probs = self.ctc.log_softmax(encoder_out) - encoder_out_lens = encoder_out_lens.int() - beam_log_probs, beam_log_probs_idx = torch.topk( - ctc_log_probs, self.beam_size, dim=2) - return encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx - - -class StreamingEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size, transformer=False): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.transformer = transformer - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoder.encoders): - xs, _, new_att_cache, new_cnn_cache = layer( - xs, masks, pos_emb, - att_cache=att_cache[i], - cnn_cache=cnn_cache[i]) - # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (B, hidden-dim, cache_t2) - r_att_cache.append( - new_att_cache[:, :, next_cache_start:, :].unsqueeze(1)) - if not self.transformer: - r_cnn_cache.append(new_cnn_cache.unsqueeze(1)) - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - if not self.transformer: - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingSqueezeformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.reduce_idx = model.encoder.reduce_idx - self.recover_idx = model.encoder.recover_idx - if self.reduce_idx is None: - self.time_reduce = None - else: - if self.recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - 
recover_exp))
-
- def forward(self, chunk_xs, chunk_lens, offset,
- att_cache, cnn_cache, cache_mask):
- """Streaming Encoder
- Args:
- xs (torch.Tensor): chunk input, with shape (b, time, mel-dim),
- where `time == (chunk_size - 1) * subsample_rate + \
- subsample.right_context + 1`
- offset (torch.Tensor): offset with shape (b, 1)
- 1 is retained for triton deployment
- required_cache_size (int): cache size required for next chunk
- computation
- > 0: actual cache size
- <= 0: not allowed in streaming gpu encoder
- att_cache (torch.Tensor): cache tensor for KEY & VALUE in
- transformer/conformer attention, with shape
- (b, elayers, head, cache_t1, d_k * 2), where
- `head * d_k == hidden-dim` and
- `cache_t1 == chunk_size * num_decoding_left_chunks`.
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer,
- (b, elayers, hidden-dim, cache_t2), where
- `cache_t2 == cnn.lorder - 1`
- cache_mask (torch.Tensor): cache mask with shape (b, required_cache_size)
- in a batch of requests, each request may have different
- history cache. Cache mask is used to indicate the effective
- cache for each request
- Returns:
- torch.Tensor: log probabilities of ctc output and cutoff by beam size
- with shape (b, chunk_size, beam)
- torch.Tensor: index of top beam size probabilities for each timestep
- with shape (b, chunk_size, beam)
- torch.Tensor: output of current input xs,
- with shape (b, chunk_size, hidden-dim).
- torch.Tensor: new attention cache required for next chunk, with
- same shape (b, elayers, head, cache_t1, d_k * 2)
- as the original att_cache
- torch.Tensor: new conformer cnn cache required for next chunk, with
- same shape as the original cnn_cache.
- torch.Tensor: new cache mask, with same shape as the original
- cache mask
- """
- offset = offset.squeeze(1)
- T = chunk_xs.size(1)
- chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1)
- # B X 1 X T
- chunk_mask = chunk_mask.to(chunk_xs.dtype)
- # transpose batch & num_layers dim
- att_cache = torch.transpose(att_cache, 0, 1)
- cnn_cache = torch.transpose(cnn_cache, 0, 1)
-
- # rewrite encoder.forward_chunk
- # <---------forward_chunk START--------->
- xs = self.global_cmvn(chunk_xs)
- # chunk mask is important for batch inferencing since
- # different sequence in a batch has different length
- xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset)
- elayers, cache_size = att_cache.size(0), att_cache.size(3)
- att_mask = torch.cat((cache_mask, chunk_mask), dim=2)
- index = offset - cache_size
-
- pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1))
- pos_emb = pos_emb.to(dtype=xs.dtype)
-
- next_cache_start = -self.required_cache_size
- r_cache_mask = att_mask[:, :, next_cache_start:]
-
- r_att_cache = []
- r_cnn_cache = []
- mask_pad = torch.ones(1,
- xs.size(1),
- device=xs.device,
- dtype=torch.bool)
- mask_pad = mask_pad.unsqueeze(1)
- max_att_len: int = 0
- recover_activations: \
- List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = []
- index = 0
- xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int)
- xs = self.encoder.preln(xs)
- for i, layer in enumerate(self.encoder.encoders):
- if self.reduce_idx is not None:
- if self.time_reduce is not None and i in self.reduce_idx:
- recover_activations.append(
- (xs, att_mask, pos_emb, mask_pad))
- xs, xs_lens, att_mask, mask_pad = \
- self.encoder.time_reduction_layer(
- xs, xs_lens, att_mask, mask_pad)
- pos_emb = pos_emb[:, ::2, :]
- if self.encoder.pos_enc_layer_type == "rel_pos_repaired":
- pos_emb =
pos_emb[:, :xs.size(1) * 2 - 1, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.encoder.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(1) - cached_att = cached_att.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1)) - r_cnn_cache.append(cached_cnn) - - chunk_out = xs - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingEfficientConformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - - # Efficient Conformer - self.stride_layer_idx = model.encoder.stride_layer_idx - self.stride = model.encoder.stride - self.num_blocks = model.encoder.num_blocks - self.cnn_module_kernel = model.encoder.cnn_module_kernel - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - chunk_lens (torch.Tensor): - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * 
num_decoding_left_chunks`.
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer,
- (b, elayers, hidden-dim, cache_t2), where
- `cache_t2 == cnn.lorder - 1`
- cache_mask (torch.Tensor): cache mask with shape (b, required_cache_size)
- in a batch of requests, each request may have different
- history cache. Cache mask is used to indicate the effective
- cache for each request
- Returns:
- torch.Tensor: log probabilities of ctc output and cutoff by beam size
- with shape (b, chunk_size, beam)
- torch.Tensor: index of top beam size probabilities for each timestep
- with shape (b, chunk_size, beam)
- torch.Tensor: output of current input xs,
- with shape (b, chunk_size, hidden-dim).
- torch.Tensor: new attention cache required for next chunk, with
- same shape (b, elayers, head, cache_t1, d_k * 2)
- as the original att_cache
- torch.Tensor: new conformer cnn cache required for next chunk, with
- same shape as the original cnn_cache.
- torch.Tensor: new cache mask, with same shape as the original
- cache mask
- """
- offset = offset.squeeze(1) # (b, )
- offset *= self.calculate_downsampling_factor(self.num_blocks + 1)
-
- T = chunk_xs.size(1)
- chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T)
- # B X 1 X T
- chunk_mask = chunk_mask.to(chunk_xs.dtype)
- # transpose batch & num_layers dim
- # Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2)
- # Shape(cnn_cache): (elayers, b, outsize, cnn_kernel)
- att_cache = torch.transpose(att_cache, 0, 1)
- cnn_cache = torch.transpose(cnn_cache, 0, 1)
-
- # rewrite encoder.forward_chunk
- # <---------forward_chunk START--------->
- xs = self.global_cmvn(chunk_xs)
- # chunk mask is important for batch inferencing since
- # different sequence in a batch has different length
- xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset)
- cache_size = att_cache.size(3) # required cache size
- masks = torch.cat((cache_mask, chunk_mask), dim=2)
- att_mask = torch.cat((cache_mask, chunk_mask), dim=2)
- index = offset - cache_size
-
- pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1))
- pos_emb = pos_emb.to(dtype=xs.dtype)
-
- next_cache_start = -self.required_cache_size
- r_cache_mask = masks[:, :, next_cache_start:]
-
- r_att_cache = []
- r_cnn_cache = []
- mask_pad = chunk_mask.to(torch.bool)
- max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache
- for i, layer in enumerate(self.encoder.encoders):
- factor = self.calculate_downsampling_factor(i)
- # NOTE(xcsong): Before layer.forward
- # shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2),
- # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2)
- # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ]
- att_cache_trunc = 0
- if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1):
- # The time step is not divisible by the downsampling multiple
- # We propose to double the chunk_size.
- att_cache_trunc = xs.size(1) + \
- att_cache.size(3) // factor - pos_emb.size(1) + 1
- xs, _, new_att_cache, new_cnn_cache = layer(
- xs, att_mask, pos_emb,
- mask_pad=mask_pad,
- att_cache=att_cache[i][:, :, ::factor, :][:, :, att_cache_trunc:, :],
- cnn_cache=cnn_cache[i, :, :, :]
- if cnn_cache.size(0) > 0 else cnn_cache
- )
-
- if i in self.stride_layer_idx:
- # compute time dimension for next block
- efficient_index = self.stride_layer_idx.index(i)
- att_mask = att_mask[:, ::self.stride[efficient_index],
- ::self.stride[efficient_index]]
- mask_pad = mask_pad[:, ::self.stride[efficient_index],
- ::self.stride[efficient_index]]
- pos_emb = pos_emb[:, ::self.stride[efficient_index], :]
-
- # shape(new_att_cache) = [batch, head, time2, outdim]
- new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :]
- # shape(new_cnn_cache) = [batch, 1, outdim, cache_t2]
- new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID
-
- # use repeat_interleave to new_att_cache
- # new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2)
- new_att_cache = new_att_cache.unsqueeze(3). \
- repeat(1, 1, 1, factor, 1).flatten(2, 3)
- # padding new_cnn_cache to cnn.lorder for causal convolution
- new_cnn_cache = F.pad(
- new_cnn_cache,
- (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0))
-
- if i == 0:
- # record length for the first block as max length
- max_att_len = new_att_cache.size(2)
- max_cnn_len = new_cnn_cache.size(3)
-
- # update real shape of att_cache and cnn_cache
- r_att_cache.append(new_att_cache[:, :, -max_att_len:, :].unsqueeze(1))
- r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:])
-
- if self.encoder.normalize_before:
- chunk_out = self.encoder.after_norm(xs)
- else:
- chunk_out = xs
-
- # shape of r_att_cache: (b, elayers, head, time2, outdim)
- r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx
- # shape of r_cnn_cache: (b, elayers, outdim, cache_t2)
- r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers
-
- # <---------forward_chunk END--------->
-
- log_ctc_probs = self.ctc.log_softmax(chunk_out)
- log_probs, log_probs_idx = torch.topk(log_ctc_probs,
- self.beam_size,
- dim=2)
- log_probs = log_probs.to(chunk_xs.dtype)
-
- r_offset = offset + chunk_out.shape[1]
- # the below ops not supported in Tensorrt
- # chunk_out_lens = torch.div(chunk_lens, subsampling_rate,
- # rounding_mode='floor')
- chunk_out_lens = chunk_lens // self.subsampling_rate // \
- self.calculate_downsampling_factor(self.num_blocks + 1)
- chunk_out_lens += 1
- r_offset = r_offset.unsqueeze(1)
-
- return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \
- r_offset, r_att_cache, r_cnn_cache, r_cache_mask
-
-
-class Decoder(torch.nn.Module):
- def __init__(self,
- decoder: TransformerDecoder,
- ctc_weight: float = 0.5,
- reverse_weight: float = 0.0,
- beam_size: int = 10,
- decoder_fastertransformer: bool = False):
- super().__init__()
- self.decoder = decoder
- self.ctc_weight = ctc_weight
- self.reverse_weight = reverse_weight
- self.beam_size = beam_size
- self.decoder_fastertransformer = decoder_fastertransformer
-
- def forward(self,
- encoder_out: torch.Tensor,
- encoder_lens: torch.Tensor,
- hyps_pad_sos_eos: torch.Tensor,
- hyps_lens_sos: torch.Tensor,
- r_hyps_pad_sos_eos: torch.Tensor,
- ctc_score: torch.Tensor):
- """Decoder
- Args:
- encoder_out: B x T x F
- encoder_lens: B
- hyps_pad_sos_eos: B x beam x (T2+1),
- hyps with sos & eos and padded by ignore id
- hyps_lens_sos: B x beam, length for each hyp with sos
- r_hyps_pad_sos_eos: B
x beam x (T2+1), - reversed hyps with sos & eos and padded by ignore id - ctc_score: B x beam, ctc score for each hyp - Returns: - decoder_out: B x beam x T2 x V - r_decoder_out: B x beam x T2 x V - best_index: B - """ - B, T, F = encoder_out.shape - bz = self.beam_size - B2 = B * bz - encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F) - encoder_mask = ~make_pad_mask(encoder_lens, T).unsqueeze(1) - encoder_mask = encoder_mask.repeat(1, bz, 1).view(B2, 1, T) - T2 = hyps_pad_sos_eos.shape[2] - 1 - hyps_pad = hyps_pad_sos_eos.view(B2, T2 + 1) - hyps_lens = hyps_lens_sos.view(B2,) - hyps_pad_sos = hyps_pad[:, :-1].contiguous() - hyps_pad_eos = hyps_pad[:, 1:].contiguous() - - r_hyps_pad = r_hyps_pad_sos_eos.view(B2, T2 + 1) - r_hyps_pad_sos = r_hyps_pad[:, :-1].contiguous() - r_hyps_pad_eos = r_hyps_pad[:, 1:].contiguous() - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad_sos, hyps_lens, r_hyps_pad_sos, - self.reverse_weight) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - V = decoder_out.shape[-1] - decoder_out = decoder_out.view(B2, T2, V) - mask = ~make_pad_mask(hyps_lens, T2) # B2 x T2 - # mask index, remove ignore id - index = torch.unsqueeze(hyps_pad_eos * mask, 2) - score = decoder_out.gather(2, index).squeeze(2) # B2 X T2 - # mask padded part - score = score * mask - decoder_out = decoder_out.view(B, bz, T2, V) - if self.reverse_weight > 0: - r_decoder_out = torch.nn.functional.log_softmax( - r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.view(B2, T2, V) - index = torch.unsqueeze(r_hyps_pad_eos * mask, 2) - r_score = r_decoder_out.gather(2, index).squeeze(2) - r_score = r_score * mask - score = score * (1 - self.reverse_weight) + \ - self.reverse_weight * r_score - r_decoder_out = r_decoder_out.view(B, bz, T2, V) - score = torch.sum(score, axis=1) # B2 - score = torch.reshape(score, (B, bz)) + self.ctc_weight * ctc_score - best_index = torch.argmax(score, dim=1) - if self.decoder_fastertransformer: - return decoder_out, best_index - else: - return best_index - - -def to_numpy(tensors): - out = [] - if type(tensors) == torch.tensor: - tensors = [tensors] - for tensor in tensors: - if tensor.requires_grad: - tensor = tensor.detach().cpu().numpy() - else: - tensor = tensor.cpu().numpy() - out.append(tensor) - return out - - -def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True): - for a, b in zip(xlist, blist): - try: - torch.testing.assert_allclose(a, b, rtol=rtol, atol=atol) - except AssertionError as error: - if tolerate_small_mismatch: - print(error) - else: - raise - - -def export_offline_encoder(model, configs, args, logger, encoder_onnx_path): - bz = 32 - seq_len = 100 - beam_size = args.beam_size - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint( - low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - dynamic_axes={ - 'speech': {0: 'B', 1: 'T'}, - 'speech_lengths': {0: 'B'}, - 'encoder_out': {0: 'B', 1: 'T_OUT'}, - 'encoder_out_lens': {0: 'B'}, - 'ctc_log_probs': {0: 'B', 1: 'T_OUT'}, - 'beam_log_probs': {0: 'B', 1: 
'T_OUT'}, - 'beam_log_probs_idx': {0: 'B', 1: 'T_OUT'}, - }, - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - -def export_offline_encoder_static(model, configs, args, logger, encoder_onnx_path): - bz = args.batch_size - seq_len = args.seq_len - beam_size = args.beam_size - - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint(low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - import os - file_name, file_ext = os.path.splitext(encoder_onnx_path) - encoder_onnx_path = file_name + "_bs" + str(bz) + "_seq" + str(seq_len) + "_static.onnx" - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CPUExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - - -def export_online_encoder(model, configs, args, logger, encoder_onnx_path): - decoding_chunk_size = args.decoding_chunk_size - subsampling = model.encoder.embed.subsampling_rate - context = model.encoder.embed.right_context + 1 - decoding_window = (decoding_chunk_size - 1) * subsampling + context - batch_size = 32 - audio_len = decoding_window - feature_size = configs["input_dim"] - output_size = configs["encoder_conf"]["output_size"] - num_layers = configs["encoder_conf"]["num_blocks"] - # in transformer the cnn module will not be available - transformer = False - cnn_module_kernel = configs["encoder_conf"].get("cnn_module_kernel", 1) - 1 - if not cnn_module_kernel: - transformer = True - num_decoding_left_chunks = args.num_decoding_left_chunks - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if configs['encoder'] == 'squeezeformer': - encoder = StreamingSqueezeformerEncoder( - model, required_cache_size, args.beam_size) - elif configs['encoder'] == 'efficientConformer': - encoder = StreamingEfficientConformerEncoder( - model, required_cache_size, args.beam_size) - else: - encoder = StreamingEncoder( - model, required_cache_size, args.beam_size, transformer) - encoder.eval() - - # begin to export encoder - chunk_xs = 
torch.randn(batch_size, audio_len, - feature_size, dtype=torch.float32) - chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len - - offset = torch.arange(0, batch_size).unsqueeze(1) - # (elayers, b, head, cache_t1, d_k * 2) - head = configs["encoder_conf"]["attention_heads"] - d_k = configs["encoder_conf"]["output_size"] // head - att_cache = torch.randn(batch_size, num_layers, head, - required_cache_size, d_k * 2, - dtype=torch.float32) - cnn_cache = torch.randn(batch_size, num_layers, output_size, - cnn_module_kernel, dtype=torch.float32) - - cache_mask = torch.ones( - batch_size, 1, required_cache_size, dtype=torch.float32) - input_names = ['chunk_xs', 'chunk_lens', 'offset', - 'att_cache', 'cnn_cache', 'cache_mask'] - output_names = ['log_probs', 'log_probs_idx', 'chunk_out', - 'chunk_out_lens', 'r_offset', 'r_att_cache', - 'r_cnn_cache', 'r_cache_mask'] - input_tensors = (chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - output_names.pop(6) - - all_names = input_names + output_names - dynamic_axes = {} - for name in all_names: - # only the first dimension is dynamic - # all other dimension is fixed - dynamic_axes[name] = {0: 'B'} - - torch.onnx.export(encoder, - input_tensors, - encoder_onnx_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - verbose=False) - - with torch.no_grad(): - torch_outs = encoder(chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - torch_outs = list(torch_outs).pop(6) - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=["CUDAExecutionProvider"]) - ort_inputs = {} - - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - if transformer: - del ort_inputs['cnn_cache'] - ort_outs = ort_session.run(None, ort_inputs) - test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx streaming encoder succeed!") - onnx_config = { - "subsampling_rate": subsampling, - "context": context, - "decoding_chunk_size": decoding_chunk_size, - "num_decoding_left_chunks": num_decoding_left_chunks, - "beam_size": args.beam_size, - "fp16": args.fp16, - "feat_size": feature_size, - "decoding_window": decoding_window, - "cnn_module_kernel_cache": cnn_module_kernel - } - return onnx_config - - -def export_rescoring_decoder(model, configs, args, - logger, decoder_onnx_path, decoder_fastertransformer): - bz, seq_len = 32, 100 - beam_size = args.beam_size - decoder = Decoder(model.decoder, - model.ctc_weight, - model.reverse_weight, - beam_size, - decoder_fastertransformer) - decoder.eval() - - hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - hyps_lens_sos = torch.randint(low=3, high=seq_len, size=(bz, beam_size), - dtype=torch.int32) - r_hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - - output_size = configs["encoder_conf"]["output_size"] - encoder_out = torch.randn(bz, seq_len, output_size, dtype=torch.float32) - encoder_out_lens = torch.randint( - low=3, high=seq_len, size=(bz,), dtype=torch.int32) - ctc_score = torch.randn(bz, beam_size, dtype=torch.float32) - - input_names = ['encoder_out', 'encoder_out_lens', - 'hyps_pad_sos_eos', 'hyps_lens_sos', - 'r_hyps_pad_sos_eos', 'ctc_score'] - output_names = ['best_index'] - if decoder_fastertransformer: - output_names.insert(0, 'decoder_out') - - 
torch.onnx.export(decoder,
- (encoder_out, encoder_out_lens,
- hyps_pad_sos_eos, hyps_lens_sos,
- r_hyps_pad_sos_eos, ctc_score),
- decoder_onnx_path,
- export_params=True,
- opset_version=13,
- do_constant_folding=True,
- input_names=input_names,
- output_names=output_names,
- dynamic_axes={'encoder_out': {0: 'B', 1: 'T'},
- 'encoder_out_lens': {0: 'B'},
- 'hyps_pad_sos_eos': {0: 'B', 2: 'T2'},
- 'hyps_lens_sos': {0: 'B'},
- 'r_hyps_pad_sos_eos': {0: 'B', 2: 'T2'},
- 'ctc_score': {0: 'B'},
- 'best_index': {0: 'B'},
- },
- verbose=False
- )
- with torch.no_grad():
- o0 = decoder(encoder_out,
- encoder_out_lens,
- hyps_pad_sos_eos,
- hyps_lens_sos,
- r_hyps_pad_sos_eos,
- ctc_score)
- providers = ["CUDAExecutionProvider"]
- ort_session = onnxruntime.InferenceSession(decoder_onnx_path,
- providers=providers)
-
- input_tensors = [encoder_out, encoder_out_lens, hyps_pad_sos_eos,
- hyps_lens_sos, r_hyps_pad_sos_eos, ctc_score]
- ort_inputs = {}
- input_tensors = to_numpy(input_tensors)
- for idx, name in enumerate(input_names):
- ort_inputs[name] = input_tensors[idx]
-
- # if model.reverse_weight == 0,
- # the r_hyps_pad will be removed
- # from the onnx decoder since it doesn't play any role
- if model.reverse_weight == 0:
- del ort_inputs['r_hyps_pad_sos_eos']
- ort_outs = ort_session.run(None, ort_inputs)
-
- # check decoder output
- if decoder_fastertransformer:
- test(to_numpy(o0), ort_outs, rtol=1e-03, atol=1e-05)
- else:
- test(to_numpy([o0]), ort_outs, rtol=1e-03, atol=1e-05)
- logger.info("export to onnx decoder succeed!")
-
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser(description='export x86_gpu model')
- parser.add_argument('--config', required=True, help='config file')
- parser.add_argument('--checkpoint', required=True, help='checkpoint model')
- parser.add_argument('--cmvn_file', required=False, default='', type=str,
- help='global_cmvn file, default path is in config file')
- parser.add_argument('--reverse_weight', default=-1.0, type=float,
- required=False,
- help='reverse weight for bitransformer,' +
- 'default value is in config file')
- parser.add_argument('--ctc_weight', default=-1.0, type=float,
- required=False,
- help='ctc weight, default value is in config file')
- parser.add_argument('--batch_size', type=int, default=24, help='encoder batch size')
- parser.add_argument('--seq_len', default=512, type=int, required=False,
- help="Encoder seq_len")
- parser.add_argument('--beam_size', default=10, type=int, required=False,
- help="beam size would be ctc output size")
- parser.add_argument('--output_onnx_dir',
- default="onnx_model",
- help='output onnx encoder and decoder directory')
- parser.add_argument('--fp16',
- action='store_true',
- help='whether to export fp16 model, default false')
- # arguments for streaming encoder
- parser.add_argument('--streaming',
- action='store_true',
- help="whether to export streaming encoder, default false")
- parser.add_argument('--decoding_chunk_size',
- default=16,
- type=int,
- required=False,
- help='the decoding chunk size, <=0 is not supported')
- parser.add_argument('--num_decoding_left_chunks',
- default=5,
- type=int,
- required=False,
- help="number of left chunks, <= 0 is not supported")
- parser.add_argument('--decoder_fastertransformer',
- action='store_true',
- help='return decoder_out and best_index for ft')
- args = parser.parse_args()
-
- torch.manual_seed(0)
- torch.set_printoptions(precision=10)
-
- with open(args.config, 'r') as fin:
- configs = yaml.load(fin, Loader=yaml.FullLoader)
- if
args.cmvn_file and os.path.exists(args.cmvn_file): - configs['cmvn_file'] = args.cmvn_file - if args.reverse_weight != -1.0 and 'reverse_weight' in configs['model_conf']: - configs['model_conf']['reverse_weight'] = args.reverse_weight - print("Update reverse weight to", args.reverse_weight) - if args.ctc_weight != -1: - print("Update ctc weight to ", args.ctc_weight) - configs['model_conf']['ctc_weight'] = args.ctc_weight - configs["encoder_conf"]["use_dynamic_chunk"] = False - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - - if not os.path.exists(args.output_onnx_dir): - os.mkdir(args.output_onnx_dir) - encoder_onnx_path = os.path.join(args.output_onnx_dir, 'encoder.onnx') - export_enc_func = None - if args.streaming: - assert args.decoding_chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - export_enc_func = export_online_encoder - else: - export_enc_func = export_offline_encoder_static - - onnx_config = export_enc_func( - model, configs, args, logger, encoder_onnx_path) - - decoder_onnx_path = os.path.join(args.output_onnx_dir, 'decoder.onnx') - export_rescoring_decoder(model, configs, args, logger, - decoder_onnx_path, args.decoder_fastertransformer) - - if args.fp16: - try: - import onnxmltools - from onnxmltools.utils.float16_converter import convert_float_to_float16 - except ImportError: - print('Please install onnxmltools!') - sys.exit(1) - encoder_onnx_model = onnxmltools.utils.load_model(encoder_onnx_path) - encoder_onnx_model = convert_float_to_float16(encoder_onnx_model) - encoder_onnx_path = os.path.join( - args.output_onnx_dir, 'encoder_fp16.onnx') - onnxmltools.utils.save_model(encoder_onnx_model, encoder_onnx_path) - decoder_onnx_model = onnxmltools.utils.load_model(decoder_onnx_path) - decoder_onnx_model = convert_float_to_float16(decoder_onnx_model) - decoder_onnx_path = os.path.join( - args.output_onnx_dir, 'decoder_fp16.onnx') - onnxmltools.utils.save_model(decoder_onnx_model, decoder_onnx_path) - # dump configurations - - config_dir = os.path.join(args.output_onnx_dir, "config.yaml") - with open(config_dir, "w") as out: - yaml.dump(onnx_config, out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/recognize.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/recognize.py deleted file mode 100644 index 03b5dfd42cc098efacd20e08756a5300f6477cc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/recognize.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--beam_size', - type=int, - default=10, - help='beam size for search') - parser.add_argument('--penalty', - type=float, - default=0.0, - help='length penalty') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=16, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'attention', 'ctc_greedy_search', - 'ctc_prefix_beam_search', 'attention_rescoring', - 'rnnt_greedy_search', 'rnnt_beam_search', - 'rnnt_beam_attn_rescoring', 'ctc_beam_td_attn_rescoring', - 'hlg_onebest', 'hlg_rescore' - ], - default='attention', - help='decoding mode') - - parser.add_argument('--search_ctc_weight', - type=float, - default=1.0, - help='ctc weight for nbest generation') - parser.add_argument('--search_transducer_weight', - type=float, - default=0.0, - help='transducer weight for nbest generation') - parser.add_argument('--ctc_weight', - type=float, - default=0.0, - help='ctc weight for rescoring weight in \ - attention rescoring decode mode \ - ctc weight for rescoring weight in \ - transducer attention rescore decode mode') - - parser.add_argument('--transducer_weight', - type=float, - default=0.0, - help='transducer weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--attn_weight', - type=float, - default=0.0, - help='attention weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--decoding_chunk_size', - type=int, - default=-1, - help='''decoding chunk size, - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here''') - parser.add_argument('--num_decoding_left_chunks', - type=int, - default=-1, - help='number of left chunks for decoding') - parser.add_argument('--simulate_streaming', - action='store_true', - help='simulate streaming inference') - parser.add_argument('--reverse_weight', - type=float, - default=0.0, - help='''right to left weight for attention rescoring - decode mode''') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--connect_symbol', - default='', - type=str, - help='used to connect the output characters') - - parser.add_argument('--word', - default='', - type=str, - help='word file, only used for hlg decode') - parser.add_argument('--hlg', - default='', - type=str, - help='hlg file, only used for hlg decode') - parser.add_argument('--lm_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--r_decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' - ] and args.batch_size > 1: - logging.fatal( - 'decoding mode {} must be running with batch_size == 1'.format( - args.mode)) - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_sub'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - if 'fbank_conf' in test_conf: - test_conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in test_conf: - test_conf['mfcc_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - # Load dict - char_dict = {v: k for k, v in symbol_table.items()} - eos = len(char_dict) - 1 - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w') as fout: - for batch_idx, 
batch in enumerate(test_data_loader): - keys, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - if args.mode == 'attention': - hyps, _ = model.recognize( - feats, - feats_lengths, - beam_size=args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp.tolist() for hyp in hyps] - elif args.mode == 'ctc_greedy_search': - hyps, _ = model.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_greedy_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_beam_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.beam_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.search_ctc_weight, - transducer_weight=args.search_transducer_weight) - elif args.mode == 'rnnt_beam_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight) - elif args.mode == 'ctc_beam_td_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight, - beam_search_type='ctc') - # ctc_prefix_beam_search and attention_rescoring only return one - # result in List[int], change it to List[List[int]] for compatible - # with other batch decoding mode - elif args.mode == 'ctc_prefix_beam_search': - assert (feats.size(0) == 1) - hyp, _ = model.ctc_prefix_beam_search( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp] - elif args.mode == 'attention_rescoring': - assert (feats.size(0) == 1) - hyp, _ = model.attention_rescoring( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - 
ctc_weight=args.ctc_weight, - simulate_streaming=args.simulate_streaming, - reverse_weight=args.reverse_weight) - hyps = [hyp] - elif args.mode == 'hlg_onebest': - hyps = model.hlg_onebest( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - elif args.mode == 'hlg_rescore': - hyps = model.hlg_rescore( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - lm_scale=args.lm_scale, - decoder_scale=args.decoder_scale, - r_decoder_scale=args.r_decoder_scale, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - for i, key in enumerate(keys): - content = [] - for w in hyps[i]: - if w == eos: - break - content.append(char_dict[w]) - logging.info('{} {}'.format(key, args.connect_symbol.join(content))) - fout.write('{} {}\n'.format(key, args.connect_symbol.join(content))) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/recognize_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/recognize_onnx_gpu.py deleted file mode 100644 index 42f403bf55ac0bc51d9c754574d3479345948122..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/recognize_onnx_gpu.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is for testing exported onnx encoder and decoder from -export_onnx_gpu.py. The exported onnx models only support batch offline ASR inference. -It requires a python wrapped c++ ctc decoder. 
-Please install it by following: -https://github.com/Slyne/ctc_decoder.git -""" -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.common import IGNORE_ID -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.config import override_config - -import onnxruntime as rt -import multiprocessing -import numpy as np - -try: - from swig_decoders import map_batch, \ - ctc_beam_search_decoder_batch, \ - TrieVector, PathTrie -except ImportError: - print('Please install ctc decoders first by refering to\n' + - 'https://github.com/Slyne/ctc_decoder.git') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--encoder_onnx', required=True, help='encoder onnx file') - parser.add_argument('--decoder_onnx', required=True, help='decoder onnx file') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=32, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'ctc_greedy_search', 'ctc_prefix_beam_search', - 'attention_rescoring'], - default='attention_rescoring', - help='decoding mode') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - reverse_weight = configs["model_conf"].get("reverse_weight", 0.0) - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - test_conf['fbank_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - 
- # Init asr model from configs - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - if use_cuda: - EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - else: - EP_list = ['CPUExecutionProvider'] - - encoder_ort_session = rt.InferenceSession(args.encoder_onnx, providers=EP_list) - decoder_ort_session = None - if args.mode == "attention_rescoring": - decoder_ort_session = rt.InferenceSession(args.decoder_onnx, providers=EP_list) - - # Load dict - vocabulary = [] - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - vocabulary.append(arr[0]) - eos = sos = len(char_dict) - 1 - with torch.no_grad(), open(args.result_file, 'w') as fout: - for _, batch in enumerate(test_data_loader): - keys, feats, _, feats_lengths, _ = batch - feats, feats_lengths = feats.numpy(), feats_lengths.numpy() - if args.fp16: - feats = feats.astype(np.float16) - ort_inputs = { - encoder_ort_session.get_inputs()[0].name: feats, - encoder_ort_session.get_inputs()[1].name: feats_lengths} - ort_outs = encoder_ort_session.run(None, ort_inputs) - encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx = ort_outs - beam_size = beam_log_probs.shape[-1] - batch_size = beam_log_probs.shape[0] - num_processes = min(multiprocessing.cpu_count(), batch_size) - if args.mode == 'ctc_greedy_search': - if beam_size != 1: - log_probs_idx = beam_log_probs_idx[:, :, 0] - batch_sents = [] - for idx, seq in enumerate(log_probs_idx): - batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) - hyps = map_batch(batch_sents, vocabulary, num_processes, - True, 0) - elif args.mode in ('ctc_prefix_beam_search', "attention_rescoring"): - batch_log_probs_seq_list = beam_log_probs.tolist() - batch_log_probs_idx_list = beam_log_probs_idx.tolist() - batch_len_list = encoder_out_lens.tolist() - batch_log_probs_seq = [] - batch_log_probs_ids = [] - batch_start = [] # only effective in streaming deployment - batch_root = TrieVector() - root_dict = {} - for i in range(len(batch_len_list)): - num_sent = batch_len_list[i] - batch_log_probs_seq.append( - batch_log_probs_seq_list[i][0:num_sent]) - batch_log_probs_ids.append( - batch_log_probs_idx_list[i][0:num_sent]) - root_dict[i] = PathTrie() - batch_root.append(root_dict[i]) - batch_start.append(True) - score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq, - batch_log_probs_ids, - batch_root, - batch_start, - beam_size, - num_processes, - 0, -2, 0.99999) - if args.mode == 'ctc_prefix_beam_search': - hyps = [] - for cand_hyps in score_hyps: - hyps.append(cand_hyps[0][1]) - hyps = map_batch(hyps, vocabulary, num_processes, False, 0) - if args.mode == 'attention_rescoring': - ctc_score, all_hyps = [], [] - max_len = 0 - for hyps in score_hyps: - cur_len = len(hyps) - if len(hyps) < beam_size: - hyps += (beam_size - cur_len) * [(-float("INF"), (0,))] - cur_ctc_score = [] - for hyp in hyps: - cur_ctc_score.append(hyp[0]) - all_hyps.append(list(hyp[1])) - if len(hyp[1]) > max_len: - max_len = len(hyp[1]) - ctc_score.append(cur_ctc_score) - if args.fp16: - ctc_score = np.array(ctc_score, dtype=np.float16) - else: - ctc_score = np.array(ctc_score, dtype=np.float32) - hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - r_hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32) - k = 0 - for i in 
range(batch_size): - for j in range(beam_size): - cand = all_hyps[k] - l = len(cand) + 2 - hyps_pad_sos_eos[i][j][0:l] = [sos] + cand + [eos] - r_hyps_pad_sos_eos[i][j][0:l] = [sos] + cand[::-1] + [eos] - hyps_lens_sos[i][j] = len(cand) + 1 - k += 1 - decoder_ort_inputs = { - decoder_ort_session.get_inputs()[0].name: encoder_out, - decoder_ort_session.get_inputs()[1].name: encoder_out_lens, - decoder_ort_session.get_inputs()[2].name: hyps_pad_sos_eos, - decoder_ort_session.get_inputs()[3].name: hyps_lens_sos, - decoder_ort_session.get_inputs()[-1].name: ctc_score} - if reverse_weight > 0: - r_hyps_pad_sos_eos_name = decoder_ort_session.get_inputs()[4].name - decoder_ort_inputs[r_hyps_pad_sos_eos_name] = r_hyps_pad_sos_eos - best_index = decoder_ort_session.run(None, decoder_ort_inputs)[0] - best_sents = [] - k = 0 - for idx in best_index: - cur_best_sent = all_hyps[k: k + beam_size][idx] - best_sents.append(cur_best_sent) - k += beam_size - hyps = map_batch(best_sents, vocabulary, num_processes) - - for i, key in enumerate(keys): - content = hyps[i] - logging.info('{} {}'.format(key, content)) - fout.write('{} {}\n'.format(key, content)) - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/train.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/train.py deleted file mode 100644 index 70799b60790b31d73911770891f519f5473e2f4b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/bin/train.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os - -import torch -import torch.distributed as dist -import torch.optim as optim -import yaml -from tensorboardX import SummaryWriter -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import (load_checkpoint, save_checkpoint, - load_trained_modules) -from wenet.utils.executor import Executor -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.scheduler import WarmupLR, NoamHoldAnnealing -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--train_data', required=True, help='train data file') - parser.add_argument('--cv_data', required=True, help='cv data file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--model_dir', required=True, help='save model dir') - parser.add_argument('--checkpoint', help='checkpoint model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='tensorboard log dir') - parser.add_argument('--ddp.rank', - dest='rank', - default=0, - type=int, - help='global rank for distributed training') - parser.add_argument('--ddp.world_size', - dest='world_size', - default=-1, - type=int, - help='''number of total processes/gpus for - distributed training''') - parser.add_argument('--ddp.dist_backend', - dest='dist_backend', - default='nccl', - choices=['nccl', 'gloo'], - help='distributed backend') - parser.add_argument('--ddp.init_method', - dest='init_method', - default=None, - help='ddp init method') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for reading') - parser.add_argument('--pin_memory', - action='store_true', - default=False, - help='Use pinned memory buffers used for reading') - parser.add_argument('--use_amp', - action='store_true', - default=False, - help='Use automatic mixed precision training') - parser.add_argument('--fp16_grad_sync', - action='store_true', - default=False, - help='Use fp16 gradient sync for ddp') - parser.add_argument('--cmvn', default=None, help='global cmvn file') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. 
One symbol per line.") - parser.add_argument('--prefetch', - default=100, - type=int, - help='prefetch number') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument("--enc_init", - default=None, - type=str, - help="Pre-trained model to initialize encoder") - parser.add_argument("--enc_init_mods", - default="encoder.", - type=lambda s: [str(mod) for mod in s.split(",") if s != ""], - help="List of encoder modules \ - to initialize ,separated by a comma") - - - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Set random seed - torch.manual_seed(777) - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - distributed = args.world_size > 1 - if distributed: - logging.info('training on multiple gpus, this gpu {}'.format(args.gpu)) - dist.init_process_group(args.dist_backend, - init_method=args.init_method, - world_size=args.world_size, - rank=args.rank) - - symbol_table = read_symbol_table(args.symbol_table) - - train_conf = configs['dataset_conf'] - cv_conf = copy.deepcopy(train_conf) - cv_conf['speed_perturb'] = False - cv_conf['spec_aug'] = False - cv_conf['spec_sub'] = False - cv_conf['spec_trim'] = False - cv_conf['shuffle'] = False - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - train_dataset = Dataset(args.data_type, args.train_data, symbol_table, - train_conf, args.bpe_model, non_lang_syms, True) - cv_dataset = Dataset(args.data_type, - args.cv_data, - symbol_table, - cv_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - train_data_loader = DataLoader(train_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - cv_data_loader = DataLoader(cv_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - - if 'fbank_conf' in configs['dataset_conf']: - input_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - else: - input_dim = configs['dataset_conf']['mfcc_conf']['num_mel_bins'] - vocab_size = len(symbol_table) - - # Save configs to model_dir/train.yaml for inference and export - configs['input_dim'] = input_dim - configs['output_dim'] = vocab_size - configs['cmvn_file'] = args.cmvn - configs['is_json_cmvn'] = True - if args.rank == 0: - saved_config_path = os.path.join(args.model_dir, 'train.yaml') - with open(saved_config_path, 'w') as fout: - data = yaml.dump(configs) - fout.write(data) - - # Init asr model from configs - model = init_model(configs) - print(model) - num_params = sum(p.numel() for p in model.parameters()) - print('the number of model params: {:,d}'.format(num_params)) - - # !!!IMPORTANT!!! 
- # Try to export the model by script, if fails, we should refine - # the code to satisfy the script export requirements - if args.rank == 0: - script_model = torch.jit.script(model) - script_model.save(os.path.join(args.model_dir, 'init.zip')) - executor = Executor() - # If specify checkpoint, load some info from checkpoint - if args.checkpoint is not None: - infos = load_checkpoint(model, args.checkpoint) - elif args.enc_init is not None: - logging.info('load pretrained encoders: {}'.format(args.enc_init)) - infos = load_trained_modules(model, args) - else: - infos = {} - start_epoch = infos.get('epoch', -1) + 1 - cv_loss = infos.get('cv_loss', 0.0) - step = infos.get('step', -1) - - num_epochs = configs.get('max_epoch', 100) - model_dir = args.model_dir - writer = None - if args.rank == 0: - os.makedirs(model_dir, exist_ok=True) - exp_id = os.path.basename(model_dir) - writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id)) - - if distributed: - assert (torch.cuda.is_available()) - # cuda model is required for nn.parallel.DistributedDataParallel - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, find_unused_parameters=True) - device = torch.device("cuda") - if args.fp16_grad_sync: - from torch.distributed.algorithms.ddp_comm_hooks import ( - default as comm_hooks, - ) - model.register_comm_hook( - state=None, hook=comm_hooks.fp16_compress_hook - ) - else: - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - if configs['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['optim_conf']) - elif configs['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['optim']) - if configs['scheduler'] == 'warmuplr': - scheduler = WarmupLR(optimizer, **configs['scheduler_conf']) - elif configs['scheduler'] == 'NoamHoldAnnealing': - scheduler = NoamHoldAnnealing(optimizer, **configs['scheduler_conf']) - else: - raise ValueError("unknown scheduler: " + configs['scheduler']) - - final_epoch = None - configs['rank'] = args.rank - configs['is_distributed'] = distributed - configs['use_amp'] = args.use_amp - if start_epoch == 0 and args.rank == 0: - save_model_path = os.path.join(model_dir, 'init.pt') - save_checkpoint(model, save_model_path) - - # Start training loop - executor.step = step - scheduler.set_step(step) - # used for pytorch amp mixed precision training - scaler = None - if args.use_amp: - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(start_epoch, num_epochs): - train_dataset.set_epoch(epoch) - configs['epoch'] = epoch - lr = optimizer.param_groups[0]['lr'] - logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr)) - executor.train(model, optimizer, scheduler, train_data_loader, device, - writer, configs, scaler) - total_loss, num_seen_utts = executor.cv(model, cv_data_loader, device, - configs) - cv_loss = total_loss / num_seen_utts - - logging.info('Epoch {} CV info cv_loss {}'.format(epoch, cv_loss)) - if args.rank == 0: - save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch)) - save_checkpoint( - model, save_model_path, { - 'epoch': epoch, - 'lr': lr, - 'cv_loss': cv_loss, - 'step': executor.step - }) - writer.add_scalar('epoch/cv_loss', cv_loss, epoch) - writer.add_scalar('epoch/lr', lr, epoch) - final_epoch = epoch - - if final_epoch is not None and args.rank == 0: - final_model_path = os.path.join(model_dir, 'final.pt') 
- os.remove(final_model_path) if os.path.exists(final_model_path) else None - os.symlink('{}.pt'.format(final_epoch), final_model_path) - writer.close() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/dataset/dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/dataset/dataset.py deleted file mode 100644 index 6d799b5b5aea2d34546484b3fed5d45e2d5b6aa6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/dataset/dataset.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import torch -import torch.distributed as dist -from torch.utils.data import IterableDataset - -import wenet.dataset.processor as processor -from wenet.utils.file_utils import read_lists - - -class Processor(IterableDataset): - def __init__(self, source, f, *args, **kw): - assert callable(f) - self.source = source - self.f = f - self.args = args - self.kw = kw - - def set_epoch(self, epoch): - self.source.set_epoch(epoch) - - def __iter__(self): - """ Return an iterator over the source dataset processed by the - given processor. 
- """ - assert self.source is not None - assert callable(self.f) - return self.f(iter(self.source), *self.args, **self.kw) - - def apply(self, f): - assert callable(f) - return Processor(self, f, *self.args, **self.kw) - - -class DistributedSampler: - def __init__(self, shuffle=True, partition=True): - self.epoch = -1 - self.update() - self.shuffle = shuffle - self.partition = partition - - def update(self): - assert dist.is_available() - if dist.is_initialized(): - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() - else: - self.rank = 0 - self.world_size = 1 - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: - self.worker_id = 0 - self.num_workers = 1 - else: - self.worker_id = worker_info.id - self.num_workers = worker_info.num_workers - return dict(rank=self.rank, - world_size=self.world_size, - worker_id=self.worker_id, - num_workers=self.num_workers) - - def set_epoch(self, epoch): - self.epoch = epoch - - def sample(self, data): - """ Sample data according to rank/world_size/num_workers - - Args: - data(List): input data list - - Returns: - List: data list after sample - """ - data = list(range(len(data))) - # TODO(Binbin Zhang): fix this - # We can not handle uneven data for CV on DDP, so we don't - # sample data by rank, that means every GPU gets the same - # and all the CV data - if self.partition: - if self.shuffle: - random.Random(self.epoch).shuffle(data) - data = data[self.rank::self.world_size] - data = data[self.worker_id::self.num_workers] - return data - - -class DataList(IterableDataset): - def __init__(self, lists, shuffle=True, partition=True): - self.lists = lists - self.sampler = DistributedSampler(shuffle, partition) - - def set_epoch(self, epoch): - self.sampler.set_epoch(epoch) - - def __iter__(self): - sampler_info = self.sampler.update() - indexes = self.sampler.sample(self.lists) - for index in indexes: - # yield dict(src=src) - data = dict(src=self.lists[index]) - data.update(sampler_info) - yield data - - -def Dataset(data_type, - data_list_file, - symbol_table, - conf, - bpe_model=None, - non_lang_syms=None, - partition=True): - """ Construct dataset from arguments - - We have two shuffle stage in the Dataset. The first is global - shuffle at shards tar/raw file level. The second is global shuffle - at training samples level. 
- - Args: - data_type(str): raw/shard - bpe_model(str): model for english bpe part - partition(bool): whether to do data partition in terms of rank - """ - assert data_type in ['raw', 'shard'] - lists = read_lists(data_list_file) - shuffle = conf.get('shuffle', True) - dataset = DataList(lists, shuffle=shuffle, partition=partition) - if data_type == 'shard': - dataset = Processor(dataset, processor.url_opener) - dataset = Processor(dataset, processor.tar_file_and_group) - else: - dataset = Processor(dataset, processor.parse_raw) - - dataset = Processor(dataset, processor.tokenize, symbol_table, bpe_model, - non_lang_syms, conf.get('split_with_space', False)) - filter_conf = conf.get('filter_conf', {}) - dataset = Processor(dataset, processor.filter, **filter_conf) - - resample_conf = conf.get('resample_conf', {}) - dataset = Processor(dataset, processor.resample, **resample_conf) - - speed_perturb = conf.get('speed_perturb', False) - if speed_perturb: - dataset = Processor(dataset, processor.speed_perturb) - - feats_type = conf.get('feats_type', 'fbank') - assert feats_type in ['fbank', 'mfcc'] - if feats_type == 'fbank': - fbank_conf = conf.get('fbank_conf', {}) - dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) - elif feats_type == 'mfcc': - mfcc_conf = conf.get('mfcc_conf', {}) - dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf) - - spec_aug = conf.get('spec_aug', True) - spec_sub = conf.get('spec_sub', False) - spec_trim = conf.get('spec_trim', False) - if spec_aug: - spec_aug_conf = conf.get('spec_aug_conf', {}) - dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) - if spec_sub: - spec_sub_conf = conf.get('spec_sub_conf', {}) - dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf) - if spec_trim: - spec_trim_conf = conf.get('spec_trim_conf', {}) - dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf) - - if shuffle: - shuffle_conf = conf.get('shuffle_conf', {}) - dataset = Processor(dataset, processor.shuffle, **shuffle_conf) - - sort = conf.get('sort', True) - if sort: - sort_conf = conf.get('sort_conf', {}) - dataset = Processor(dataset, processor.sort, **sort_conf) - - batch_conf = conf.get('batch_conf', {}) - dataset = Processor(dataset, processor.batch, **batch_conf) - dataset = Processor(dataset, processor.padding) - return dataset diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/dataset/kaldi_io.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/dataset/kaldi_io.py deleted file mode 100644 index c9bef293c93d882147bb5b738e1fc49a7a19a484..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/dataset/kaldi_io.py +++ /dev/null @@ -1,666 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! 
To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. - """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. 
- """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int_scp(file_or_fd): - """ generator(key,vec) = read_vec_int_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_int_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:vec for key,mat in kaldi_io.read_vec_int_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_int(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! 
- if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Mapping for percentiles in col-headers, - def uint16_to_float(value, min, range): - return np.float32(min + range * 1.52590218966964e-05 * value) - - # Mapping for matrix elements, - def uint8_to_float_v2(vec, p0, p25, p75, p100): - # Split the vector by masks, - mask_0_64 = (vec <= 64); - mask_193_255 = (vec > 192); - mask_65_192 = (~(mask_0_64 | mask_193_255)); - # Sanity check (useful but slow...), - # assert(len(vec) == np.sum(np.hstack([mask_0_64,mask_65_192,mask_193_255]))) - # assert(len(vec) == np.sum(np.any([mask_0_64,mask_65_192,mask_193_255], axis=0))) - # Build the float vector, - ans = np.empty(len(vec), dtype='float32') - ans[mask_0_64] = p0 + (p25 - p0) / 64. * vec[mask_0_64] - ans[mask_65_192] = p25 + (p75 - p25) / 128. * (vec[mask_65_192] - 64) - ans[mask_193_255] = p75 + (p100 - p75) / 63. * (vec[mask_193_255] - 192) - return ans - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.empty((cols,rows), dtype='float32') - for i, col_header in enumerate(col_headers): - col_header_flt = [ uint16_to_float(percentile, globmin, globrange) for percentile in col_header ] - mat[i] = uint8_to_float_v2(data[i], *col_header_flt) - - return mat.T # transpose! col-major -> row-major, - -def write_ark_scp(key, mat, ark_fout, scp_out): - mat_offset = write_mat(ark_fout, mat, key) - scp_line = '{}\t{}:{}'.format(key, ark_fout.name, mat_offset) - scp_out.write(scp_line) - scp_out.write('\n') - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. 
- - Example of writing single matrix: - kaldi_io.write_mat(filename, mat) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,mat in dict.iteritems(): - kaldi_io.write_mat(f, mat, key=key) - """ - mat_offset = 0 - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - mat_offset = fd.tell() - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if m.dtype == 'float32': fd.write('FM '.encode()) - elif m.dtype == 'float64': fd.write('DM '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) - # Dims, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols - # Data, - fd.write(m.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - return mat_offset - -################################################# -# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) -# Corresponds to: vector > > -# - outer vector: time axis -# - inner vector: records at the time -# - tuple: int = index, float = value -# - -def read_cnet_ark(file_or_fd): - """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ - return read_post_ark(file_or_fd) - -def read_post_ark(file_or_fd): - """ generator(key,vec>) = read_post_ark(file) - Returns generator of (key,posterior) tuples, read from ark file. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Iterate the ark: - for key,post in kaldi_io.read_post_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - post = read_post(fd) - yield key, post - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_post(file_or_fd): - """ [post] = read_post(file_or_fd) - Reads single kaldi 'Posterior' in binary format. - - The 'Posterior' is C++ type 'vector > >', - the outer-vector is usually time axis, inner-vector are the records - at given time, and the tuple is composed of an 'index' (integer) - and a 'float-value'. The 'float-value' can represent a probability - or any other numeric value. - - Returns vector of vectors of tuples. - """ - fd = open_or_fd(file_or_fd) - ans=[] - binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - # Loop over 'outer-vector', - for i in range(outer_vec_size): - assert(fd.read(1).decode() == '\4'); # int-size - inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) - data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) - assert(data[0]['size_idx'] == 4) - assert(data[0]['size_post'] == 4) - ans.append(data[['idx','post']].tolist()) - - if fd is not file_or_fd: fd.close() - return ans - - -################################################# -# Kaldi Confusion Network bin begin/end times, -# (kaldi stores CNs time info separately from the Posterior). 
-# - -def read_cntime_ark(file_or_fd): - """ generator(key,vec>) = read_cntime_ark(file_or_fd) - Returns generator of (key,cntime) tuples, read from ark file. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Iterate the ark: - for key,time in kaldi_io.read_cntime_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:time for key,time in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - cntime = read_cntime(fd) - yield key, cntime - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_cntime(file_or_fd): - """ [cntime] = read_cntime(file_or_fd) - Reads single kaldi 'Confusion Network time info', in binary format: - C++ type: vector >. - (begin/end times of bins at the confusion network). - - Binary layout is ' ...' - - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Returns vector of tuples. - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary - - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) - assert(data[0]['size_beg'] == 4) - assert(data[0]['size_end'] == 4) - ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), - - if fd is not file_or_fd : fd.close() - return ans - - -################################################# -# Segments related, -# - -# Segments as 'Bool vectors' can be handy, -# - for 'superposing' the segmentations, -# - for frame-selection in Speaker-ID experiments, -def read_segments_as_bool_vec(segments_file): - """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) - using kaldi 'segments' file for 1 wav, format : ' ' - - t-beg, t-end is in seconds, - - assumed 100 frames/second, - """ - segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) - # Sanity checks, - assert(len(segs) > 0) # empty segmentation is an error, - assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, - # Convert time to frame-indexes, - start = np.rint([100 * rec[2] for rec in segs]).astype(int) - end = np.rint([100 * rec[3] for rec in segs]).astype(int) - # Taken from 'read_lab_to_bool_vec', htk.py, - frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], - np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) - assert np.sum(end-start) == np.sum(frms) - return frms - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/dataset/processor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/dataset/processor.py deleted file mode 100644 index b4bd07ce674eb3288cd1b13a09085eec48d40845..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/dataset/processor.py +++ /dev/null @@ -1,660 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import json -import random -import re -import tarfile -from subprocess import PIPE, Popen -from urllib.parse import urlparse - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.nn.utils.rnn import pad_sequence - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def url_opener(data): - """ Give url or local file, return file descriptor - Inplace operation. - - Args: - data(Iterable[str]): url or local file list - - Returns: - Iterable[{src, stream}] - """ - for sample in data: - assert 'src' in sample - # TODO(Binbin Zhang): support HTTP - url = sample['src'] - try: - pr = urlparse(url) - # local file - if pr.scheme == '' or pr.scheme == 'file': - stream = open(url, 'rb') - # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP - else: - cmd = f'wget -q -O - {url}' - process = Popen(cmd, shell=True, stdout=PIPE) - sample.update(process=process) - stream = process.stdout - sample.update(stream=stream) - yield sample - except Exception as ex: - logging.warning('Failed to open {}'.format(url)) - - -def tar_file_and_group(data): - """ Expand a stream of open tar files into a stream of tar file contents. - And groups the file with same prefix - - Args: - data: Iterable[{src, stream}] - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'stream' in sample - stream = tarfile.open(fileobj=sample['stream'], mode="r|*") - prev_prefix = None - example = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - example['key'] = prev_prefix - if valid: - yield example - example = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - example['txt'] = file_obj.read().decode('utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load(file_obj) - example['wav'] = waveform - example['sample_rate'] = sample_rate - else: - example[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning('error to parse {}'.format(name)) - prev_prefix = prefix - if prev_prefix is not None: - example['key'] = prev_prefix - yield example - stream.close() - if 'process' in sample: - sample['process'].communicate() - sample['stream'].close() - - -def parse_raw(data): - """ Parse key/wav/txt from json line - - Args: - data: Iterable[str], str is a json line has key/wav/txt - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'src' in sample - json_line = sample['src'] - obj = json.loads(json_line) - assert 'key' in obj - assert 'wav' in obj - assert 'txt' in obj - key = obj['key'] - wav_file = obj['wav'] - txt = obj['txt'] - try: - if 'start' in obj: - assert 'end' in obj - sample_rate = torchaudio.backend.sox_io_backend.info( - wav_file).sample_rate - start_frame = int(obj['start'] * sample_rate) - end_frame = int(obj['end'] * sample_rate) - waveform, _ = torchaudio.backend.sox_io_backend.load( - 
filepath=wav_file, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(wav_file) - example = dict(key=key, - txt=txt, - wav=waveform, - sample_rate=sample_rate) - yield example - except Exception as ex: - logging.warning('Failed to read {}'.format(wav_file)) - - -def filter(data, - max_length=10240, - min_length=10, - token_max_length=200, - token_min_length=1, - min_output_input_ratio=0.0005, - max_output_input_ratio=1): - """ Filter sample according to feature and label length - Inplace operation. - - Args:: - data: Iterable[{key, wav, label, sample_rate}] - max_length: drop utterance which is greater than max_length(10ms) - min_length: drop utterance which is less than min_length(10ms) - token_max_length: drop utterance which is greater than - token_max_length, especially when use char unit for - english modeling - token_min_length: drop utterance which is - less than token_max_length - min_output_input_ratio: minimal ration of - token_length / feats_length(10ms) - max_output_input_ratio: maximum ration of - token_length / feats_length(10ms) - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'label' in sample - # sample['wav'] is torch.Tensor, we have 100 frames every second - num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100 - if num_frames < min_length: - continue - if num_frames > max_length: - continue - if len(sample['label']) < token_min_length: - continue - if len(sample['label']) > token_max_length: - continue - if num_frames != 0: - if len(sample['label']) / num_frames < min_output_input_ratio: - continue - if len(sample['label']) / num_frames > max_output_input_ratio: - continue - yield sample - - -def resample(data, resample_rate=16000): - """ Resample data. - Inplace operation. - - Args: - data: Iterable[{key, wav, label, sample_rate}] - resample_rate: target resample rate - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - if sample_rate != resample_rate: - sample['sample_rate'] = resample_rate - sample['wav'] = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - yield sample - - -def speed_perturb(data, speeds=None): - """ Apply speed perturb to the data. - Inplace operation. 
- - Args: - data: Iterable[{key, wav, label, sample_rate}] - speeds(List[float]): optional speed - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - if speeds is None: - speeds = [0.9, 1.0, 1.1] - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - speed = random.choice(speeds) - if speed != 1.0: - wav, _ = torchaudio.sox_effects.apply_effects_tensor( - waveform, sample_rate, - [['speed', str(speed)], ['rate', str(sample_rate)]]) - sample['wav'] = wav - - yield sample - - -def compute_fbank(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0): - """ Extract fbank - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - energy_floor=0.0, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def compute_mfcc(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0, - num_ceps=40, - high_freq=0.0, - low_freq=20.0): - """ Extract mfcc - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.mfcc(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - num_ceps=num_ceps, - high_freq=high_freq, - low_freq=low_freq, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def __tokenize_by_bpe_model(sp, txt): - tokens = [] - # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - pattern = re.compile(r'([\u4e00-\u9fff])') - # Example: - # txt = "你好 ITS'S OKAY 的" - # chars = ["你", "好", " ITS'S OKAY ", "的"] - chars = pattern.split(txt.upper()) - mix_chars = [w for w in chars if len(w.strip()) > 0] - for ch_or_w in mix_chars: - # ch_or_w is a single CJK charater(i.e., "你"), do nothing. - if pattern.fullmatch(ch_or_w) is not None: - tokens.append(ch_or_w) - # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), - # encode ch_or_w using bpe_model. 
- else: - for p in sp.encode_as_pieces(ch_or_w): - tokens.append(p) - - return tokens - - -def tokenize(data, - symbol_table, - bpe_model=None, - non_lang_syms=None, - split_with_space=False): - """ Decode text to chars or BPE - Inplace operation - - Args: - data: Iterable[{key, wav, txt, sample_rate}] - - Returns: - Iterable[{key, wav, txt, tokens, label, sample_rate}] - """ - if non_lang_syms is not None: - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - else: - non_lang_syms = {} - non_lang_syms_pattern = None - - if bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(bpe_model) - else: - sp = None - - for sample in data: - assert 'txt' in sample - txt = sample['txt'].strip() - if non_lang_syms_pattern is not None: - parts = non_lang_syms_pattern.split(txt.upper()) - parts = [w for w in parts if len(w.strip()) > 0] - else: - parts = [txt] - - label = [] - tokens = [] - for part in parts: - if part in non_lang_syms: - tokens.append(part) - else: - if bpe_model is not None: - tokens.extend(__tokenize_by_bpe_model(sp, part)) - else: - if split_with_space: - part = part.split(" ") - for ch in part: - if ch == ' ': - ch = "▁" - tokens.append(ch) - - for ch in tokens: - if ch in symbol_table: - label.append(symbol_table[ch]) - elif '' in symbol_table: - label.append(symbol_table['']) - - sample['tokens'] = tokens - sample['label'] = label - yield sample - - -def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80): - """ Do spec augmentation - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - num_t_mask: number of time mask to apply - num_f_mask: number of freq mask to apply - max_t: max width of time mask - max_f: max width of freq mask - max_w: max width of time warp - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - max_freq = y.size(1) - # time mask - for i in range(num_t_mask): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - y[start:end, :] = 0 - # freq mask - for i in range(num_f_mask): - start = random.randint(0, max_freq - 1) - length = random.randint(1, max_f) - end = min(max_freq, start + length) - y[:, start:end] = 0 - sample['feat'] = y - yield sample - - -def spec_sub(data, max_t=20, num_t_sub=3): - """ Do spec substitute - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of time substitute - num_t_sub: number of time substitute to apply - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - for i in range(num_t_sub): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - # only substitute the earlier time chosen randomly for current time - pos = random.randint(0, start) - y[start:end, :] = x[start - pos:end - pos, :] - sample['feat'] = y - yield sample - - -def spec_trim(data, max_t=20): - """ Trim tailing frames. Inplace operation. 
- ref: TrimTail [https://arxiv.org/abs/2211.00522] - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of length trimming - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - max_frames = x.size(0) - length = random.randint(1, max_t) - if length < max_frames / 2: - y = x.clone().detach()[:max_frames - length] - sample['feat'] = y - yield sample - - -def shuffle(data, shuffle_size=10000): - """ Local shuffle the data - - Args: - data: Iterable[{key, feat, label}] - shuffle_size: buffer size for shuffle - - Returns: - Iterable[{key, feat, label}] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= shuffle_size: - random.shuffle(buf) - for x in buf: - yield x - buf = [] - # The sample left over - random.shuffle(buf) - for x in buf: - yield x - - -def sort(data, sort_size=500): - """ Sort the data by feature length. - Sort is used after shuffle and before batch, so we can group - utts with similar lengths into a batch, and `sort_size` should - be less than `shuffle_size` - - Args: - data: Iterable[{key, feat, label}] - sort_size: buffer size for sort - - Returns: - Iterable[{key, feat, label}] - """ - - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= sort_size: - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - buf = [] - # The sample left over - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - - -def static_batch(data, batch_size=16): - """ Static batch the data by `batch_size` - - Args: - data: Iterable[{key, feat, label}] - batch_size: batch size - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= batch_size: - yield buf - buf = [] - if len(buf) > 0: - yield buf - - -def dynamic_batch(data, max_frames_in_batch=12000): - """ Dynamic batch the data until the total frames in batch - reach `max_frames_in_batch` - - Args: - data: Iterable[{key, feat, label}] - max_frames_in_batch: max_frames in one batch - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - longest_frames = 0 - for sample in data: - assert 'feat' in sample - assert isinstance(sample['feat'], torch.Tensor) - new_sample_frames = sample['feat'].size(0) - longest_frames = max(longest_frames, new_sample_frames) - frames_after_padding = longest_frames * (len(buf) + 1) - if frames_after_padding > max_frames_in_batch: - yield buf - buf = [sample] - longest_frames = new_sample_frames - else: - buf.append(sample) - if len(buf) > 0: - yield buf - - -def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000): - """ Wrapper for static/dynamic batch - """ - if batch_type == 'static': - return static_batch(data, batch_size) - elif batch_type == 'dynamic': - return dynamic_batch(data, max_frames_in_batch) - else: - logging.fatal('Unsupported batch type {}'.format(batch_type)) - - -def padding(data): - """ Padding the data into training data - - Args: - data: Iterable[List[{key, feat, label}]] - - Returns: - Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] - """ - for sample in data: - assert isinstance(sample, list) - feats_length = torch.tensor([x['feat'].size(0) for x in sample], - dtype=torch.int32) - order = torch.argsort(feats_length, descending=True) - feats_lengths = torch.tensor( - [sample[i]['feat'].size(0) for i in order], dtype=torch.int32) - sorted_feats = [sample[i]['feat'] for i in order] - sorted_keys 
= [sample[i]['key'] for i in order] - sorted_labels = [ - torch.tensor(sample[i]['label'], dtype=torch.int64) for i in order - ] - label_lengths = torch.tensor([x.size(0) for x in sorted_labels], - dtype=torch.int32) - - padded_feats = pad_sequence(sorted_feats, - batch_first=True, - padding_value=0) - - pad = (0, 0, 0, 0) - seq_len= padded_feats.shape[1] - if seq_len < 384: - pad = (0, 0, 0, 384-seq_len) - elif seq_len < 512: - pad = (0, 0, 0, 512-seq_len) - elif seq_len < 640: - pad = (0, 0, 0, 640-seq_len) - elif seq_len < 768: - pad = (0, 0, 0, 768-seq_len) - elif seq_len < 896: - pad = (0, 0, 0, 896-seq_len) - elif seq_len < 1024: - pad = (0, 0, 0, 1024-seq_len) - elif seq_len < 1280: - pad = (0, 0, 0, 1280-seq_len) - padded_feats = torch.nn.functional.pad(padded_feats, pad, mode='constant', value=0) - padding_labels = pad_sequence(sorted_labels, - batch_first=True, - padding_value=-1) - - yield (sorted_keys, padded_feats, padding_labels, feats_lengths, - label_lengths) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/dataset/wav_distortion.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/dataset/wav_distortion.py deleted file mode 100644 index 2917d3cc6cfb801935cb0885d0c42cd88f1833b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/dataset/wav_distortion.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import random -import math - -import torchaudio -import torch -torchaudio.set_audio_backend("sox_io") - - -def db2amp(db): - return pow(10, db / 20) - -def amp2db(amp): - return 20 * math.log10(amp) - -def make_poly_distortion(conf): - """Generate a db-domain ploynomial distortion function - - f(x) = a * x^m * (1-x)^n + x - - Args: - conf: a dict {'a': #int, 'm': #int, 'n': #int} - - Returns: - The ploynomial function, which could be applied on - a float amplitude value - """ - a = conf['a'] - m = conf['m'] - n = conf['n'] - - def poly_distortion(x): - abs_x = abs(x) - if abs_x < 0.000001: - x = x - else: - db_norm = amp2db(abs_x) / 100 + 1 - if db_norm < 0: - db_norm = 0 - db_norm = a * pow(db_norm, m) * pow((1 - db_norm), n) + db_norm - if db_norm > 1: - db_norm = 1 - db = (db_norm - 1) * 100 - amp = db2amp(db) - if amp >= 0.9997: - amp = 0.9997 - if x > 0: - x = amp - else: - x = -amp - return x - return poly_distortion - -def make_quad_distortion(): - return make_poly_distortion({'a' : 1, 'm' : 1, 'n' : 1}) - -# the amplitude are set to max for all non-zero point -def make_max_distortion(conf): - """Generate a max distortion function - - Args: - conf: a dict {'max_db': float } - 'max_db': the maxium value. 
- - Returns: - The max function, which could be applied on - a float amplitude value - """ - max_db = conf['max_db'] - if max_db: - max_amp = db2amp(max_db) # < 0.997 - else: - max_amp = 0.997 - - def max_distortion(x): - if x > 0: - x = max_amp - elif x < 0: - x = -max_amp - else: - x = 0.0 - return x - return max_distortion - - - -def make_amp_mask(db_mask=None): - """Get a amplitude domain mask from db domain mask - - Args: - db_mask: Optional. A list of tuple. if None, using default value. - - Returns: - A list of tuple. The amplitude domain mask - """ - if db_mask is None: - db_mask = [(-110, -95), (-90, -80), (-65, -60), (-50, -30), (-15, 0)] - amp_mask = [(db2amp(db[0]), db2amp(db[1])) for db in db_mask] - return amp_mask - -default_mask = make_amp_mask() - - -def generate_amp_mask(mask_num): - """Generate amplitude domain mask randomly in [-100db, 0db] - - Args: - mask_num: the slot number of the mask - - Returns: - A list of tuple. each tuple defines a slot. - e.g. [(-100, -80), (-65, -60), (-50, -30), (-15, 0)] - for #mask_num = 4 - """ - a = [0] * 2 * mask_num - a[0] = 0 - m = [] - for i in range(1, 2 * mask_num): - a[i] = a[i - 1] + random.uniform(0.5, 1) - max_val = a[2 * mask_num - 1] - for i in range(0, mask_num): - l = ((a[2 * i] - max_val) / max_val) * 100 - r = ((a[2 * i + 1] - max_val) / max_val) * 100 - m.append((l, r)) - return make_amp_mask(m) - - -def make_fence_distortion(conf): - """Generate a fence distortion function - - In this fence-like shape function, the values in mask slots are - set to maxium, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': int,'max_db': float } - 'mask_number': the slot number in mask. - 'max_db': the maxium value. - - Returns: - The fence function, which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - max_db = conf['max_db'] - max_amp = db2amp(max_db) # 0.997 - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def fence_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - return x - - return fence_distortion - -# -def make_jag_distortion(conf): - """Generate a jag distortion function - - In this jag-like shape function, the values in mask slots are - not changed, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': #int} - 'mask_number': the slot number in mask. 
- - Returns: - The jag function,which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def jag_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - return x - - return jag_distortion - -# gaining 20db means amp = amp * 10 -# gaining -20db means amp = amp / 10 -def make_gain_db(conf): - """Generate a db domain gain function - - Args: - conf: a dict {'db': #float} - 'db': the gaining value - - Returns: - The db gain function, which could be applied on - a float amplitude value - """ - db = conf['db'] - - def gain_db(x): - return min(0.997, x * pow(10, db / 20)) - - return gain_db - - -def distort(x, func, rate=0.8): - """Distort a waveform in sample point level - - Args: - x: the origin wavefrom - func: the distort function - rate: sample point-level distort probability - - Returns: - the distorted waveform - """ - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - x[0][i] = func(float(x[0][i])) - return x - -def distort_chain(x, funcs, rate=0.8): - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - for func in funcs: - x[0][i] = func(float(x[0][i])) - return x - -# x is numpy -def distort_wav_conf(x, distort_type, distort_conf, rate=0.1): - if distort_type == 'gain_db': - gain_db = make_gain_db(distort_conf) - x = distort(x, gain_db) - elif distort_type == 'max_distortion': - max_distortion = make_max_distortion(distort_conf) - x = distort(x, max_distortion, rate=rate) - elif distort_type == 'fence_distortion': - fence_distortion = make_fence_distortion(distort_conf) - x = distort(x, fence_distortion, rate=rate) - elif distort_type == 'jag_distortion': - jag_distortion = make_jag_distortion(distort_conf) - x = distort(x, jag_distortion, rate=rate) - elif distort_type == 'poly_distortion': - poly_distortion = make_poly_distortion(distort_conf) - x = distort(x, poly_distortion, rate=rate) - elif distort_type == 'quad_distortion': - quad_distortion = make_quad_distortion() - x = distort(x, quad_distortion, rate=rate) - elif distort_type == 'none_distortion': - pass - else: - print('unsupport type') - return x - -def distort_wav_conf_and_save(distort_type, distort_conf, rate, wav_in, wav_out): - x, sr = torchaudio.load(wav_in) - x = x.detach().numpy() - out = distort_wav_conf(x, distort_type, distort_conf, rate) - torchaudio.save(wav_out, torch.from_numpy(out), sr) - -if __name__ == "__main__": - distort_type = sys.argv[1] - wav_in = sys.argv[2] - wav_out = sys.argv[3] - conf = None - rate = 0.1 - if distort_type == 'new_jag_distortion': - conf = {'mask_number' : 4} - elif distort_type == 'new_fence_distortion': - conf = {'mask_number' : 1, 'max_db' : -30} - elif distort_type == 'poly_distortion': - conf = {'a' : 4, 'm' : 2, "n" : 2} - distort_wav_conf_and_save(distort_type, conf, rate, wav_in, wav_out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/efficient_conformer/attention.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/efficient_conformer/attention.py deleted file mode 100644 index 475131b15af92ffcaf91ad5e2e30d114d4d1a2a3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/efficient_conformer/attention.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple, Optional - -import torch -from torch import nn -import torch.nn.functional as F -from wenet.transformer.attention import MultiHeadedAttention - - -class GroupedRelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: - https://arxiv.org/abs/1901.02860 - https://arxiv.org/abs/2109.01163 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate, group_size=3): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - self.group_size = group_size - self.d_k = n_feat // n_head # for GroupedAttention - self.n_feat = n_feat - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. 
- """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def pad4group(self, Q, K, V, P, mask, group_size: int = 3): - """ - q: (#batch, time1, size) -> (#batch, head, time1, size/head) - k,v: (#batch, time2, size) -> (#batch, head, time2, size/head) - p: (#batch, time2, size) - """ - # Compute Overflows - overflow_Q = Q.size(2) % group_size - overflow_KV = K.size(2) % group_size - - # if-else for ONNX export - # 0 // 0.00000000000000001 = 0 - # 1 // 1.00000000000000001 = 1 - padding_Q = (group_size - overflow_Q) * int( - overflow_Q // (overflow_Q + 0.00000000000000001)) - padding_KV = (group_size - overflow_KV) * int( - overflow_KV // (overflow_KV + 0.00000000000000001)) - - batch_size, _, seq_len_KV, _ = K.size() - - # Input Padding (B, T, D) -> (B, T + P, D) - Q = F.pad(Q, (0, 0, 0, padding_Q), value=0.0) - K = F.pad(K, (0, 0, 0, padding_KV), value=0.0) - V = F.pad(V, (0, 0, 0, padding_KV), value=0.0) - - if mask is not None and mask.size(2) > 0 : # time2 > 0: - mask = mask[:, ::group_size, ::group_size] - - Q = Q.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - K = K.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - V = V.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - # process pos_emb - P_batch_size = P.size(0) - overflow_P = P.size(1) % group_size - padding_P = group_size - overflow_P if overflow_P else 0 - P = F.pad(P, (0, 0, 0, padding_P), value=0.0) - P = P.view(P_batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - return Q, K, V, P, mask, padding_Q - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - padding_q: Optional[int] = None - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - padding_q : for GroupedAttention in efficent conformer - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. 
jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - - # n_feat!=h*d_k may be happened in GroupAttention - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.n_feat) - ) # (batch, time1, d_model) - if padding_q is not None: - # for GroupedAttention in efficent conformer - x = x[:, :x.size(1) - padding_q] - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q = self.linear_q(query) - k = self.linear_k(key) # (#batch, time2, size) - v = self.linear_v(value) - p = self.linear_pos(pos_emb) # (#batch, time2, size) - - batch_size, seq_len_KV, _ = k.size() # seq_len_KV = time2 - - # (#batch, time2, size) -> (#batch, head, time2, size/head) - q = q.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - k = k.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - v = v.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - if cache.size(0) > 0: - # use attention cache - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - new_cache = torch.cat((k, v), dim=-1) - - # May be k and p does not match. eg. time2=18+18/2=27 > mask=36/2=18 - if mask is not None and mask.size(2) > 0: - time2 = mask.size(2) - k = k[:, :, -time2:, :] - v = v[:, :, -time2:, :] - - # q k v p: (batch, head, time1, d_k) - q, k, v, p, mask, padding_q = self.pad4group(q, k, v, p, mask, self.group_size) - - # q_with_bias_u & q_with_bias_v = (batch, head, time1, d_k) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. 
- # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k * self.group_size) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask, padding_q), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/efficient_conformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/efficient_conformer/convolution.py deleted file mode 100644 index 52d6c1c14c0812ab3957a60a135f644833c2ad95..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/efficient_conformer/convolution.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - stride: int = 1): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - stride (int): Stride Convolution, for efficient Conformer - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. 
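The grouped attention removed above shortens the attended sequence by padding the time axis to a multiple of `group_size` and folding `group_size` neighbouring frames into the per-head feature dimension. A simplified sketch of that reshape (the deleted `pad4group` interleaves it with the head split and also subsamples the mask):

```python
# Minimal sketch of the grouped-attention reshape (not the wenet code):
# pad the time axis to a multiple of group_size, then fold group_size
# neighbouring frames into the feature dimension of each head.
import torch
import torch.nn.functional as F

def fold_groups(x: torch.Tensor, group_size: int) -> torch.Tensor:
    """(batch, head, time, d_k) -> (batch, head, time/group, d_k*group)."""
    b, h, t, d_k = x.size()
    overflow = t % group_size
    # Branch-free padding amount, same effect as the 1e-17 trick above:
    padding = (group_size - overflow) % group_size
    x = F.pad(x, (0, 0, 0, padding), value=0.0)
    return x.reshape(b, h, -1, d_k * group_size)

q = torch.randn(2, 4, 50, 64)      # 50 frames, group_size = 3
print(fold_groups(q, 3).shape)     # torch.Size([2, 4, 17, 192])
```

With `group_size = 3` the attention matrices shrink from roughly `time x time` to `(time/3) x (time/3)`, which is where the efficiency gain comes from.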
- # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=stride, # for depthwise_conv in StrideConv - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - self.stride = stride - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - # When export ONNX,the first cache is not None but all-zero, - # cause shape error in residual block, - # eg. cache14 + x9 = 23, 23-7+1=17 != 9 - cache = cache[:, :, -self.lorder:] - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is requried, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - if mask_pad.size(2) != x.size(2): - mask_pad = mask_pad[:, :, ::self.stride] - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/efficient_conformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/efficient_conformer/encoder.py deleted file mode 100644 index dbd37f53cac86be851e2bb194354fd07eb271f11..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/efficient_conformer/encoder.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
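For reference, the convolution module just removed is the standard conformer block: a pointwise expansion to `2 * channels`, a GLU gate back to `channels`, a depthwise convolution, normalization plus activation, and a final pointwise projection. A shape-only sketch, ignoring masks, caches and the causal and stride options:

```python
# Shape-only sketch of the conformer convolution module above
# (illustrative, not the wenet implementation).
import torch
import torch.nn as nn

channels, kernel_size = 256, 15
pointwise1 = nn.Conv1d(channels, 2 * channels, 1)
depthwise = nn.Conv1d(channels, channels, kernel_size,
                      padding=(kernel_size - 1) // 2, groups=channels)
norm = nn.BatchNorm1d(channels)
pointwise2 = nn.Conv1d(channels, channels, 1)
act = nn.SiLU()                          # "swish", as commonly configured here

x = torch.randn(8, 100, channels)        # (batch, time, channels)
y = x.transpose(1, 2)                    # (batch, channels, time)
y = nn.functional.glu(pointwise1(y), dim=1)   # 2C -> C via GLU
y = pointwise2(act(norm(depthwise(y))))  # depthwise + norm + activation
y = y.transpose(1, 2)                    # back to (batch, time, channels)
print(y.shape)                           # torch.Size([8, 100, 256])
```

The causal variant instead pads `kernel_size - 1` frames on the left and keeps those frames as `new_cache` for the next chunk, which is exactly the cache handling shown above.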
-# Modified from EfficientConformer(https://github.com/burchim/EfficientConformer) -# Paper(https://arxiv.org/abs/2109.01163) - -"""Encoder definition.""" -from typing import Tuple, Optional, List, Union - -import torch -import logging -from typeguard import check_argument_types -import torch.nn.functional as F - -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.encoder_layer import ConformerEncoderLayer - -from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 -from wenet.efficient_conformer.convolution import ConvolutionModule -from wenet.efficient_conformer.attention import GroupedRelPositionMultiHeadedAttention -from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer - -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class EfficientConformerEncoder(torch.nn.Module): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - macaron_style: bool = True, - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - stride_layer_idx: Optional[Union[int, List[int]]] = 3, - stride: Optional[Union[int, List[int]]] = 2, - group_layer_idx: Optional[Union[int, List[int], tuple]] = (0, 1, 2, 3), - group_size: int = 3, - stride_kernel: bool = True, - **kwargs - ): - """Construct Efficient Conformer Encoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - macaron_style (bool): Whether to use macaron style for - positionwise layer. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. - stride_layer_idx (list): layer id with StrideConv, start from 0 - stride (list): stride size of each StrideConv in efficient conformer - group_layer_idx (list): layer id with GroupedAttention, start from 0 - group_size (int): group size of every GroupedAttention layer - stride_kernel (bool): default True. True: recompute cnn kernels with stride. 
- """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d2": - subsampling_class = Conv2dSubsampling2 - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - logging.info(f"input_layer = {input_layer}, " - f"subsampling_class = {subsampling_class}") - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - self.input_layer = input_layer - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - activation = get_activation(activation_type) - self.num_blocks = num_blocks - self.attention_heads = attention_heads - self.cnn_module_kernel = cnn_module_kernel - self.global_chunk_size = 0 - self.chunk_feature_map = 0 - - # efficient conformer configs - self.stride_layer_idx = [stride_layer_idx] \ - if type(stride_layer_idx) == int else stride_layer_idx - self.stride = [stride] \ - if type(stride) == int else stride - self.group_layer_idx = [group_layer_idx] \ - if type(group_layer_idx) == int else group_layer_idx - self.grouped_size = group_size # group size of every GroupedAttention layer - - assert len(self.stride) == len(self.stride_layer_idx) - self.cnn_module_kernels = [cnn_module_kernel] # kernel size of each StridedConv - for i in self.stride: - if stride_kernel: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1] // i) - else: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1]) - - logging.info(f"stride_layer_idx= {self.stride_layer_idx}, " - f"stride = {self.stride}, " - f"cnn_module_kernel = {self.cnn_module_kernels}, " - f"group_layer_idx = {self.group_layer_idx}, " - f"grouped_size = {self.grouped_size}") - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - - # encoder definition - index = 0 - layers = [] - for i in range(num_blocks): - # self-attention module definition - if i in self.group_layer_idx: - encoder_selfattn_layer = GroupedRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - self.grouped_size) - else: - if pos_enc_layer_type == "no_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate) - - # conformer module definition - if i in self.stride_layer_idx: - # conformer block with downsampling - convolution_layer_args_stride = ( - output_size, 
self.cnn_module_kernels[index], activation, - cnn_module_norm, causal, True, self.stride[index]) - layers.append(StrideConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_stride) if use_cnn_module else None, - torch.nn.AvgPool1d( - kernel_size=self.stride[index], stride=self.stride[index], - padding=0, ceil_mode=True, - count_include_pad=False), # pointwise_conv_layer - dropout_rate, - normalize_before, - concat_after, - )) - index = index + 1 - else: - # conformer block - convolution_layer_args_normal = ( - output_size, self.cnn_module_kernels[index], activation, - cnn_module_norm, causal) - layers.append(ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_normal) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - )) - - self.encoders = torch.nn.ModuleList(layers) - - def set_global_chunk_size(self, chunk_size): - """Used in ONNX export. - """ - logging.info(f"set global chunk size: {chunk_size}, default is 0.") - self.global_chunk_size = chunk_size - if self.embed.subsampling_rate == 2: - self.chunk_feature_map = 2 * self.global_chunk_size + 1 - elif self.embed.subsampling_rate == 6: - self.chunk_feature_map = 6 * self.global_chunk_size + 5 - elif self.embed.subsampling_rate == 8: - self.chunk_feature_map = 8 * self.global_chunk_size + 7 - else: - self.chunk_feature_map = 4 * self.global_chunk_size + 3 - - def output_size(self) -> int: - return self._output_size - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - index = 0 # traverse stride - for i, layer in enumerate(self.encoders): - # layer return : x, mask, new_att_cache, new_cnn_cache - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if i in self.stride_layer_idx: - masks = masks[:, :, ::self.stride[index]] - chunk_masks = chunk_masks[:, ::self.stride[index], - ::self.stride[index]] - mask_pad = masks - pos_emb = pos_emb[:, ::self.stride[index], :] - index = index + 1 - - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask : mask matrix of self attention - - Returns: - torch.Tensor: output of current input xs - torch.Tensor: subsampling cache required for next chunk computation - List[torch.Tensor]: encoder layers output cache required for next - chunk computation - List[torch.Tensor]: conformer cnn cache - - """ - assert xs.size(0) == 1 - - # using downsampling factor to recover offset - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - chunk_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - chunk_masks = chunk_masks.unsqueeze(1) # (1, 1, xs-time) - - real_len = 0 - if self.global_chunk_size > 0: - # for ONNX decode simulation, padding xs to chunk_size - real_len = xs.size(1) - pad_len = self.chunk_feature_map - real_len - xs = F.pad(xs, (0, 0, 0, pad_len), value=0.0) - chunk_masks = F.pad(chunk_masks, (0, pad_len), value=0.0) - - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, chunk_masks = self.embed(xs, chunk_masks, offset) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - # shape(pos_emb) = (b=1, chunk_size, emb_size=output_size=hidden-dim) - - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) # batchPad (b=1, 1, time=chunk_size) - - if self.global_chunk_size > 0: - # for ONNX decode simulation - pos_emb = self.embed.position_encoding( - offset=max(offset - cache_t1, 0), - size=cache_t1 + self.global_chunk_size) - att_mask[:, :, -self.global_chunk_size:] = chunk_masks - mask_pad = chunk_masks.to(torch.bool) - else: - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(2) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - att_cache_trunc = xs.size(1) + \ - att_cache.size(2) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i:i + 1, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # 
shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [1, batch, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(0) - - # use repeat_interleave to new_att_cache - new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :]) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.global_chunk_size > 0 and real_len: - chunk_real_len = real_len // self.embed.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - # Keeping 1 more timestep can mitigate information leakage - # from the encoder caused by the padding - xs = xs[:, :chunk_real_len + 1, :] - - return xs, r_att_cache, r_cnn_cache - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - use_onnx=False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. - Args: - xs (torch.Tensor): (1, max_len, dim) - decoding_chunk_size (int): decoding chunk size - num_decoding_left_chunks (int): - use_onnx (bool): True for simulating ONNX model inference. 
- """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if use_onnx: - logging.info("Simulating for ONNX runtime ...") - att_cache: torch.Tensor = torch.zeros( - (self.num_blocks, self.attention_heads, required_cache_size, - self.output_size() // self.attention_heads * 2), - device=xs.device) - cnn_cache: torch.Tensor = torch.zeros( - (self.num_blocks, 1, self.output_size(), self.cnn_module_kernel - 1), - device=xs.device) - self.set_global_chunk_size(chunk_size=decoding_chunk_size) - else: - logging.info("Simulating for JIT runtime ...") - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - logging.info(f"-->> frame chunk msg: cur={cur}, " - f"end={end}, num_frames={end-cur}, " - f"decoding_window={decoding_window}") - if use_onnx: - att_mask: torch.Tensor = torch.ones( - (1, 1, required_cache_size + decoding_chunk_size), - dtype=torch.bool, device=xs.device) - if cur == 0: - att_mask[:, :, :required_cache_size] = 0 - else: - att_mask: torch.Tensor = torch.ones( - (0, 0, 0), dtype=torch.bool, device=xs.device) - - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - outputs.append(y) - offset += y.size(1) - - ys = torch.cat(outputs, 1) - masks = torch.ones(1, 1, ys.size(1), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/efficient_conformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/efficient_conformer/encoder_layer.py deleted file mode 100644 index 3a88ec9fca9797664ce89566e6c1d28a8f0ad5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/efficient_conformer/encoder_layer.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple -import torch -from torch import nn - - -class StrideConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. 
- self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - pointwise_conv_layer: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.pointwise_conv_layer = pointwise_conv_layer - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - - # add pointwise_conv for efficient conformer - # pointwise_conv_layer does not change shape - if self.pointwise_conv_layer is not None: - residual = residual.transpose(1, 2) - residual = self.pointwise_conv_layer(residual) - residual = residual.transpose(1, 2) - assert residual.size(0) == x.size(0) - assert residual.size(1) == x.size(1) - assert residual.size(2) == x.size(2) - - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/efficient_conformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/efficient_conformer/subsampling.py deleted file mode 100644 index 98b2c2228eac8e77586110686c48a7b0141458c9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/efficient_conformer/subsampling.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch -from wenet.transformer.subsampling import BaseSubsampling - - -class Conv2dSubsampling2(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU() - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * ((idim - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 2 - # 2 = (3 - 1) * 1 - self.right_context = 2 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 2. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 2. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, :-2:2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/squeezeformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/squeezeformer/attention.py deleted file mode 100644 index 97412badbe8e2c5caec81c0636d15be3f80d6b84..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/squeezeformer/attention.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 Ximalaya Inc. (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -import torch -import torch.nn as nn -from wenet.transformer.attention import MultiHeadedAttention -from typing import Tuple - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- """ - - def __init__(self, n_head, n_feat, dropout_rate, - do_rel_shift=False, adaptive_scale=False, init_weights=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.do_rel_shift = do_rel_shift - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - self.adaptive_scale = adaptive_scale - self.ada_scale = nn.Parameter( - torch.ones([1, 1, n_feat]), requires_grad=adaptive_scale) - self.ada_bias = nn.Parameter( - torch.zeros([1, 1, n_feat]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - input_max = (self.h * self.d_k) ** -0.5 - torch.nn.init.uniform_(self.linear_q.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_q.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_pos.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.bias, -input_max, input_max) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. 
pytorch training - if mask.size(2) > 0: # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - # (batch, head, time1, time2) - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - if self.adaptive_scale: - query = self.ada_scale * query + self.ada_bias - key = self.ada_scale * key + self.ada_bias - value = self.ada_scale * value + self.ada_bias - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - if self.do_rel_shift: - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/squeezeformer/conv2d.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/squeezeformer/conv2d.py deleted file mode 100644 index c230263396392d72f36c56d645338f2d576db898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/squeezeformer/conv2d.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Conv2d Module with Valid Padding""" - -import torch.nn.functional as F -from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional - - -class Conv2dValid(_ConvNd): - """ - Conv2d operator for VALID mode padding. 
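The attention score in the module above is the Transformer-XL decomposition: a content term `(q + u) k^T` plus a position term `(q + v) p^T`, scaled by `sqrt(d_k)`, with the optional `rel_shift` applied only to the position term. A minimal tensor-shape sketch, with `u` and `v` standing for `pos_bias_u` and `pos_bias_v`:

```python
# Shape sketch of the relative-position score above (illustrative only;
# u/v stand for pos_bias_u/pos_bias_v, p for the projected pos_emb).
import math
import torch

b, h, t, d_k = 2, 4, 50, 64
q = torch.randn(b, t, h, d_k)          # (batch, time1, head, d_k)
k = torch.randn(b, h, t, d_k)
p = torch.randn(b, h, t, d_k)
u = torch.randn(h, d_k)                # pos_bias_u
v = torch.randn(h, d_k)                # pos_bias_v

matrix_ac = torch.matmul((q + u).transpose(1, 2), k.transpose(-2, -1))
matrix_bd = torch.matmul((q + v).transpose(1, 2), p.transpose(-2, -1))
scores = (matrix_ac + matrix_bd) / math.sqrt(d_k)
print(scores.shape)                    # torch.Size([2, 4, 50, 50])
```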
- """ - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', # TODO: refine this type - device=None, - dtype=None, - valid_trigx: bool = False, - valid_trigy: bool = False - ) -> None: - factory_kwargs = {'device': device, 'dtype': dtype} - kernel_size_ = _pair(kernel_size) - stride_ = _pair(stride) - padding_ = padding if isinstance(padding, str) else _pair(padding) - dilation_ = _pair(dilation) - super(Conv2dValid, self).__init__( - in_channels, out_channels, kernel_size_, - stride_, padding_, dilation_, False, _pair(0), - groups, bias, padding_mode, **factory_kwargs) - self.valid_trigx = valid_trigx - self.valid_trigy = valid_trigy - - def _conv_forward( - self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): - validx, validy = 0, 0 - if self.valid_trigx: - validx = (input.size(-2) * (self.stride[-2] - 1) - 1 - + self.kernel_size[-2]) // 2 - if self.valid_trigy: - validy = (input.size(-1) * (self.stride[-1] - 1) - 1 - + self.kernel_size[-1]) // 2 - return F.conv2d(input, weight, bias, self.stride, - (validx, validy), self.dilation, self.groups) - - def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight, self.bias) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/squeezeformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/squeezeformer/convolution.py deleted file mode 100644 index 6da2ee8c98ed58fae66d66c892041037f0d6bc3a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/squeezeformer/convolution.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. 
- causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - self.bias = bias - self.channels = channels - self.kernel_size = kernel_size - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, channels]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, channels]), requires_grad=adaptive_scale) - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - if init_weights: - self.init_weights() - - def init_weights(self): - pw_max = self.channels ** -0.5 - dw_max = self.kernel_size ** -0.5 - torch.nn.init.uniform_(self.pointwise_conv1.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv1.bias.data, -pw_max, pw_max) - torch.nn.init.uniform_(self.depthwise_conv.weight.data, -dw_max, dw_max) - if self.bias: - torch.nn.init.uniform_(self.depthwise_conv.bias.data, -dw_max, dw_max) - torch.nn.init.uniform_(self.pointwise_conv2.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv2.bias.data, -pw_max, pw_max) - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - if self.adaptive_scale: - x = self.ada_scale * x + self.ada_bias - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/squeezeformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/squeezeformer/encoder.py deleted file mode 100644 index f13038321ae6c07d484a617aee7d83ed07742510..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/squeezeformer/encoder.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -import torch -import torch.nn as nn -from typing import Tuple, Union, Optional, List -from wenet.squeezeformer.subsampling \ - import DepthwiseConv2dSubsampling4, TimeReductionLayer1D, \ - TimeReductionLayer2D, TimeReductionLayerStream -from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.attention import MultiHeadedAttention -from wenet.squeezeformer.attention import RelPositionMultiHeadedAttention -from wenet.squeezeformer.positionwise_feed_forward \ - import PositionwiseFeedForward -from wenet.squeezeformer.convolution import ConvolutionModule -from wenet.utils.mask import make_pad_mask, add_optional_chunk_mask -from wenet.utils.common import get_activation - - -class SqueezeformerEncoder(nn.Module): - def __init__( - self, - input_size: int = 80, - encoder_dim: int = 256, - output_size: int = 256, - attention_heads: int = 4, - num_blocks: int = 12, - reduce_idx: Optional[Union[int, List[int]]] = 5, - recover_idx: Optional[Union[int, List[int]]] = 11, - feed_forward_expansion_factor: int = 4, - dw_stride: bool = False, - input_dropout_rate: float = 0.1, - pos_enc_layer_type: str = "rel_pos", - time_reduction_layer_type: str = "conv1d", - do_rel_shift: bool = True, - feed_forward_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.1, - cnn_module_kernel: int = 31, - cnn_norm_type: str = "batch_norm", - dropout: float = 0.1, - causal: bool = False, - adaptive_scale: bool = True, - activation_type: str = "swish", - init_weights: bool = True, - global_cmvn: torch.nn.Module = None, - normalize_before: bool = False, - use_dynamic_chunk: bool = False, - concat_after: bool = False, - 
static_chunk_size: int = 0, - use_dynamic_left_chunk: bool = False - ): - """Construct SqueezeformerEncoder - - Args: - input_size to use_dynamic_chunk, see in Transformer BaseEncoder. - encoder_dim (int): The hidden dimension of encoder layer. - output_size (int): The output dimension of final projection layer. - attention_heads (int): Num of attention head in attention module. - num_blocks (int): Num of encoder layers. - reduce_idx Optional[Union[int, List[int]]]: - reduce layer index, from 40ms to 80ms per frame. - recover_idx Optional[Union[int, List[int]]]: - recover layer index, from 80ms to 40ms per frame. - feed_forward_expansion_factor (int): Enlarge coefficient of FFN. - dw_stride (bool): Whether do depthwise convolution - on subsampling module. - input_dropout_rate (float): Dropout rate of input projection layer. - pos_enc_layer_type (str): Self attention type. - time_reduction_layer_type (str): Conv1d or Conv2d reduction layer. - do_rel_shift (bool): Whether to do relative shift - operation on rel-attention module. - cnn_module_kernel (int): Kernel size of CNN module. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - adaptive_scale (bool): Whether to use adaptive scale. - init_weights (bool): Whether to initialize weights. - causal (bool): whether to use causal convolution or not. - """ - super(SqueezeformerEncoder, self).__init__() - self.global_cmvn = global_cmvn - self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ - if type(reduce_idx) == int else reduce_idx - self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \ - if type(recover_idx) == int else recover_idx - self.check_ascending_list() - if reduce_idx is None: - self.time_reduce = None - else: - if recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - self.reduce_stride = 2 - self._output_size = output_size - self.normalize_before = normalize_before - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - self.pos_enc_layer_type = pos_enc_layer_type - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - encoder_dim, - attention_dropout_rate, - do_rel_shift, - adaptive_scale, - init_weights - ) - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - encoder_dim, - encoder_dim * feed_forward_expansion_factor, - feed_forward_dropout_rate, - activation, - adaptive_scale, - init_weights - ) - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = ( - encoder_dim, cnn_module_kernel, activation, - cnn_norm_type, causal, True, adaptive_scale, init_weights) - - self.embed = DepthwiseConv2dSubsampling4( - 1, encoder_dim, - RelPositionalEncoding(encoder_dim, dropout_rate=0.1), - dw_stride, - input_size, - input_dropout_rate, - init_weights - ) - - self.preln = nn.LayerNorm(encoder_dim) - self.encoders = 
torch.nn.ModuleList([SqueezeformerEncoderLayer( - encoder_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - convolution_layer(*convolution_layer_args), - positionwise_layer(*positionwise_layer_args), - normalize_before, - dropout, - concat_after) for _ in range(num_blocks) - ]) - if time_reduction_layer_type == 'conv1d': - time_reduction_layer = TimeReductionLayer1D - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - elif time_reduction_layer_type == 'stream': - time_reduction_layer = TimeReductionLayerStream - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - else: - time_reduction_layer = TimeReductionLayer2D - time_reduction_layer_args = {'encoder_dim': encoder_dim} - - self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args) - self.time_recover_layer = nn.Linear(encoder_dim, encoder_dim) - self.final_proj = None - if output_size != encoder_dim: - self.final_proj = nn.Linear(encoder_dim, output_size) - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - xs_lens = mask_pad.squeeze(1).sum(1) - xs = self.preln(xs) - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - for i, layer in enumerate(self.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, chunk_masks, pos_emb, mask_pad)) - xs, xs_lens, chunk_masks, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_chunk_masks, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - chunk_masks = recover_chunk_masks - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - xs = xs.masked_fill(~mask_pad[:, 0, :].unsqueeze(-1), 0.0) - - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return xs, masks - - def check_ascending_list(self): - if self.reduce_idx is not None: - assert self.reduce_idx == sorted(self.reduce_idx), \ - "reduce_idx should be int or ascending list" - if self.recover_idx is not None: - assert self.recover_idx == sorted(self.recover_idx), \ - "recover_idx should be int or ascending list" - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp 
= exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - recover_exp)) - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.preln(xs) - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, 
recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - if att_mask.size(1) != 0: - xs = xs.masked_fill(~att_mask[:, 0, :].unsqueeze(-1), 0.0) - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(0) - cached_att = cached_att.unsqueeze(3).\ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :]) - r_cnn_cache.append(cached_cnn) - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/squeezeformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/squeezeformer/encoder_layer.py deleted file mode 100644 index 3c6bdd44a20447cea91c0f965c666b844f4264be..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/squeezeformer/encoder_layer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""SqueezeformerEncoderLayer definition.""" - -import torch -import torch.nn as nn -from typing import Optional, Tuple - - -class SqueezeformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward1 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - feed_forward2 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. 
- """ - - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward1: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - feed_forward2: Optional[nn.Module] = None, - normalize_before: bool = False, - dropout_rate: float = 0.1, - concat_after: bool = False, - ): - super(SqueezeformerEncoderLayer, self).__init__() - self.size = size - self.self_attn = self_attn - self.layer_norm1 = nn.LayerNorm(size) - self.ffn1 = feed_forward1 - self.layer_norm2 = nn.LayerNorm(size) - self.conv_module = conv_module - self.layer_norm3 = nn.LayerNorm(size) - self.ffn2 = feed_forward2 - self.layer_norm4 = nn.LayerNorm(size) - self.normalize_before = normalize_before - self.dropout = nn.Dropout(dropout_rate) - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - # self attention module - residual = x - if self.normalize_before: - x = self.layer_norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.layer_norm1(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm2(x) - x = self.ffn1(x) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm2(x) - - # conv module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - residual = x - if self.normalize_before: - x = self.layer_norm3(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm3(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm4(x) - x = self.ffn2(x) - # we do not use dropout here since it is inside feed forward function - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm4(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/squeezeformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/squeezeformer/positionwise_feed_forward.py deleted file mode 100644 index 289062dcf3189f79a5ebb206990160d8665c613c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/squeezeformer/positionwise_feed_forward.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU(), - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.idim = idim - self.hidden_units = hidden_units - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - self.ada_scale = None - self.ada_bias = None - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, idim]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, idim]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - ffn1_max = self.idim ** -0.5 - ffn2_max = self.hidden_units ** -0.5 - torch.nn.init.uniform_(self.w_1.weight.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_1.bias.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_2.weight.data, -ffn2_max, ffn2_max) - torch.nn.init.uniform_(self.w_2.bias.data, -ffn2_max, ffn2_max) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - if self.adaptive_scale: - xs = self.ada_scale * xs + self.ada_bias - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/squeezeformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/squeezeformer/subsampling.py deleted file mode 100644 index fdb0101d6ebb54c42e710bbb0f35a6f7615ca567..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/squeezeformer/subsampling.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -"""DepthwiseConv2dSubsampling4 and TimeReductionLayer definition.""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -from wenet.transformer.subsampling import BaseSubsampling -from typing import Tuple -from wenet.squeezeformer.conv2d import Conv2dValid - - -class DepthwiseConv2dSubsampling4(BaseSubsampling): - """Depthwise Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - pos_enc_class (nn.Module): position encoding class. - dw_stride (int): Whether do depthwise convolution. - input_size (int): filter bank dimension. - - """ - - def __init__( - self, idim: int, odim: int, - pos_enc_class: torch.nn.Module, - dw_stride: bool = False, - input_size: int = 80, - input_dropout_rate: float = 0.1, - init_weights: bool = True - ): - super(DepthwiseConv2dSubsampling4, self).__init__() - self.idim = idim - self.odim = odim - self.pw_conv = nn.Conv2d( - in_channels=idim, out_channels=odim, kernel_size=3, stride=2) - self.act1 = nn.ReLU() - self.dw_conv = nn.Conv2d( - in_channels=odim, out_channels=odim, kernel_size=3, stride=2, - groups=odim if dw_stride else 1 - ) - self.act2 = nn.ReLU() - self.pos_enc = pos_enc_class - self.input_proj = nn.Sequential( - nn.Linear( - odim * (((input_size - 1) // 2 - 1) // 2), odim), - nn.Dropout(p=input_dropout_rate), - ) - if init_weights: - linear_max = (odim * input_size / 4) ** -0.5 - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.weight'], -linear_max, linear_max) - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.bias'], -linear_max, linear_max) - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: int = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.pw_conv(x) - x = self.act1(x) - x = self.dw_conv(x) - x = self.act2(x) - b, c, t, f = x.size() - x = x.permute(0, 2, 1, 3) - x = x.contiguous().view(b, t, c * f) - x, pos_emb = self.pos_enc(x, offset) - x = self.input_proj(x) - return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] - - -class TimeReductionLayer1D(nn.Module): - """ - Modified NeMo, - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. 
- """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 5, stride: int = 2): - super(TimeReductionLayer1D, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - self.padding = max(0, self.kernel_size - self.stride) - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. - if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayer2D(nn.Module): - def __init__( - self, kernel_size: int = 5, stride: int = 2, encoder_dim: int = 256): - super(TimeReductionLayer2D, self).__init__() - self.encoder_dim = encoder_dim - self.kernel_size = kernel_size - self.dw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=(kernel_size, 1), - stride=stride, - valid_trigy=True - ) - self.pw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=1, - stride=1, - valid_trigx=False, - valid_trigy=False, - ) - - self.kernel_size = kernel_size - self.stride = stride - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.encoder_dim ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward( - self, xs: torch.Tensor, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - xs = xs.masked_fill(mask_pad.transpose(1, 2).eq(0), 0.0) - xs = xs.unsqueeze(2) - padding1 = self.kernel_size - self.stride - xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), - mode='constant', value=0.) 
- xs = self.dw_conv(xs.permute(0, 3, 1, 2)) - xs = self.pw_conv(xs).permute(0, 3, 2, 1).squeeze(1).contiguous() - tmp_length = xs.size(1) - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - padding2 = max(0, (xs_lens.max() - tmp_length).data.item()) - batch_size, hidden = xs.size(0), xs.size(-1) - dummy_pad = torch.zeros(batch_size, padding2, hidden, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - mask = mask[:, ::2, ::2] - mask_pad = mask_pad[:, :, ::2] - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayerStream(nn.Module): - """ - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. - """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 1, stride: int = 2): - super(TimeReductionLayerStream, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=0, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. 
- if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transducer/joint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transducer/joint.py deleted file mode 100644 index f7cbaf62ee0bf4ffa127e5bbf4a49a64c2378495..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transducer/joint.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Optional - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation - - -class TransducerJoint(torch.nn.Module): - - def __init__(self, - voca_size: int, - enc_output_size: int, - pred_output_size: int, - join_dim: int, - prejoin_linear: bool = True, - postjoin_linear: bool = False, - joint_mode: str = 'add', - activation: str = "tanh"): - assert check_argument_types() - # TODO(Mddct): concat in future - assert joint_mode in ['add'] - super().__init__() - - self.activatoin = get_activation(activation) - self.prejoin_linear = prejoin_linear - self.postjoin_linear = postjoin_linear - self.joint_mode = joint_mode - - if not self.prejoin_linear and not self.postjoin_linear: - assert enc_output_size == pred_output_size == join_dim - # torchscript compatibility - self.enc_ffn: Optional[nn.Linear] = None - self.pred_ffn: Optional[nn.Linear] = None - if self.prejoin_linear: - self.enc_ffn = nn.Linear(enc_output_size, join_dim) - self.pred_ffn = nn.Linear(pred_output_size, join_dim) - # torchscript compatibility - self.post_ffn: Optional[nn.Linear] = None - if self.postjoin_linear: - self.post_ffn = nn.Linear(join_dim, join_dim) - - self.ffn_out = nn.Linear(join_dim, voca_size) - - def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor): - """ - Args: - enc_out (torch.Tensor): [B, T, E] - pred_out (torch.Tensor): [B, T, P] - Return: - [B,T,U,V] - """ - if (self.prejoin_linear and self.enc_ffn is not None - and self.pred_ffn is not None): - enc_out = self.enc_ffn(enc_out) # [B,T,E] -> [B,T,V] - pred_out = self.pred_ffn(pred_out) - - enc_out = enc_out.unsqueeze(2) # [B,T,V] -> [B,T,1,V] - pred_out = pred_out.unsqueeze(1) # [B,U,V] -> [B,1 U, V] - - # TODO(Mddct): concat joint - _ = self.joint_mode - out = enc_out + pred_out # [B,T,U,V] - - if self.postjoin_linear and self.post_ffn is not None: - out = self.post_ffn(out) - - out = self.activatoin(out) - out = self.ffn_out(out) - return out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transducer/predictor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transducer/predictor.py deleted file mode 100644 index 600e97a9d83646047ec3fc14f3087bd4df761c68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transducer/predictor.py +++ /dev/null @@ -1,482 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation, get_rnn - - -def ApplyPadding(input, padding, pad_value) -> torch.Tensor: - """ - Args: - input: [bs, max_time_step, dim] - padding: [bs, max_time_step] - """ - return padding * 
pad_value + input * (1 - padding) - - -class PredictorBase(torch.nn.Module): - - # NOTE(Mddct): We can use ABC abstract here, but - # keep this class simple enough for now - def __init__(self) -> None: - super().__init__() - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - _, _, _ = batch_size, method, device - raise NotImplementedError("this is a base precictor") - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ): - _, _, = input, cache - raise NotImplementedError("this is a base precictor") - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - _, _, _, = input, padding, cache - raise NotImplementedError("this is a base precictor") - - -class RNNPredictor(PredictorBase): - - def __init__(self, - voca_size: int, - embed_size: int, - output_size: int, - embed_dropout: float, - hidden_size: int, - num_layers: int, - bias: bool = True, - rnn_type: str = "lstm", - dropout: float = 0.1) -> None: - assert check_argument_types() - super().__init__() - self.n_layers = num_layers - self.hidden_size = hidden_size - # disable rnn base out projection - self.embed = nn.Embedding(voca_size, embed_size) - self.dropout = nn.Dropout(embed_dropout) - # NOTE(Mddct): rnn base from torch not support layer norm - # will add layer norm and prune value in cell and layer - # ref: https://github.com/Mddct/neural-lm/blob/main/models/gru_cell.py - self.rnn = get_rnn(rnn_type=rnn_type)(input_size=embed_size, - hidden_size=hidden_size, - num_layers=num_layers, - bias=bias, - batch_first=True, - dropout=dropout) - self.projection = nn.Linear(hidden_size, output_size) - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> torch.Tensor: - """ - Args: - input (torch.Tensor): [batch, max_time). - padding (torch.Tensor): [batch, max_time] - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - Returns: - output: [batch, max_time, output_size] - """ - - # NOTE(Mddct): we don't use pack input format - embed = self.embed(input) # [batch, max_time, emb_size] - embed = self.dropout(embed) - states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None - if cache is None: - state = self.init_state(batch_size=input.size(0), - device=input.device) - states = (state[0], state[1]) - else: - assert len(cache) == 2 - states = (cache[0], cache[1]) - out, (m, c) = self.rnn(embed, states) - out = self.projection(out) - - # NOTE(Mddct): Although we don't use staate in transducer - # training forward, we need make it right for padding value - # so we create forward_step for infering, forward for training - _, _ = m, c - return out - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache: [state_m, state_c] - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - Returns: - new_cache: [[state_m_1, state_c_1], [state_m_2, state_c_2]...] 
- """ - assert len(cache) == 2 - state_ms = cache[0] - state_cs = cache[1] - - assert state_ms.size(1) == state_cs.size(1) - - new_cache: List[List[torch.Tensor]] = [] - for state_m, state_c in zip(torch.split(state_ms, 1, dim=1), - torch.split(state_cs, 1, dim=1)): - new_cache.append([state_m, state_c]) - return new_cache - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[state_m_1, state_c_1], [state_m_1, state_c_1]...] - - Returns: - new_caceh: [state_ms, state_cs], - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - """ - state_ms = torch.cat([states[0] for states in cache], dim=1) - state_cs = torch.cat([states[1] for states in cache], dim=1) - return [state_ms, state_cs] - - def init_state( - self, - batch_size: int, - device: torch.device, - method: str = "zero", - ) -> List[torch.Tensor]: - assert batch_size > 0 - # TODO(Mddct): xavier init method - _ = method - return [ - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device), - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device) - ] - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - """ - assert len(cache) == 2 - state_m, state_c = cache[0], cache[1] - embed = self.embed(input) # [batch, 1, emb_size] - embed = self.dropout(embed) - out, (m, c) = self.rnn(embed, (state_m, state_c)) - - out = self.projection(out) - m = ApplyPadding(m, padding.unsqueeze(0), state_m) - c = ApplyPadding(c, padding.unsqueeze(0), state_c) - - return (out, [m, c]) - - -class EmbeddingPredictor(PredictorBase): - """Embedding predictor - - Described in: - https://arxiv.org/pdf/2109.07513.pdf - - embed-> proj -> layer norm -> swish - """ - - def __init__(self, - voca_size: int, - embed_size: int, - embed_dropout: float, - n_head: int, - history_size: int = 2, - activation: str = "swish", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - - assert check_argument_types() - super().__init__() - # multi head - self.num_heads = n_head - self.embed_size = embed_size - self.context_size = history_size + 1 - self.pos_embed = torch.nn.Linear(embed_size * self.context_size, - self.num_heads, - bias=bias) - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.ffn = nn.Linear(self.embed_size, self.embed_size) - self.norm = nn.LayerNorm(self.embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - _ = method - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device), - ] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] 
- """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - - input = input.unfold(1, self.context_size, 1).permute( - 0, 1, 3, 2) # [bs, seq_len, context_size, embed] - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - # broadcast dot attenton - input_expand = input.unsqueeze( - 2) # [bs, seq_len, 1, context_size, embed] - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - - # [bs, seq_len, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, seq_len, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, seq_len, num_heads, embed] - output = output.sum(dim=2) # [bs, seq_len, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - return output - - def forward_step( - self, - input: torch.Tensor, - padding: torch.Tensor, - cache: List[torch.Tensor], - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input_expand = context_input.unsqueeze(1).unsqueeze( - 2) # [bs, 1, 1, context_size, embed] - - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - # [bs, 1, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, 1, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, 1, num_heads, embed] - output = output.sum(dim=2) # [bs, 1, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - new_cache = context_input[:, 1:, :] - # TODO(Mddct): we need padding new_cache in future - # new_cache = ApplyPadding(history, padding, new_cache) - return (output, [new_cache]) - - -class ConvPredictor(PredictorBase): - - def __init__(self, - voca_size: 
int, - embed_size: int, - embed_dropout: float, - history_size: int = 2, - activation: str = "relu", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - assert check_argument_types() - super().__init__() - - assert history_size >= 0 - self.embed_size = embed_size - self.context_size = history_size + 1 - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.conv = nn.Conv1d(in_channels=embed_size, - out_channels=embed_size, - kernel_size=self.context_size, - padding=0, - groups=embed_size, - bias=bias) - self.norm = nn.LayerNorm(embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - assert method == "zero" - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device) - ] - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] - """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - input = input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - return out - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input = context_input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - - new_cache = context_input[:, 1:, :] - # TODO(Mddct): apply padding in future - return (out, [new_cache]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transducer/search/greedy_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transducer/search/greedy_search.py deleted file mode 100644 index ef7354562b6617b7be33bf32d673117eb1d3d547..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transducer/search/greedy_search.py +++ /dev/null @@ -1,54 +0,0 @@ -from 
typing import List - -import torch - - -def basic_greedy_search( - model: torch.nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - n_steps: int = 64, -) -> List[List[int]]: - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, - method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = n_steps - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, padding, - cache) # [1, 1, P] - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, - pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - - return [hyps] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transducer/search/prefix_beam_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transducer/search/prefix_beam_search.py deleted file mode 100644 index f00917717c16a73916586708ebfede54fa02a21f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transducer/search/prefix_beam_search.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import List, Tuple - -import torch -from wenet.utils.common import log_add - - -class Sequence(): - - __slots__ = {'hyp', 'score', 'cache'} - - def __init__( - self, - hyp: List[torch.Tensor], - score, - cache: List[torch.Tensor], - ): - self.hyp = hyp - self.score = score - self.cache = cache - - -class PrefixBeamSearch(): - - def __init__(self, encoder, predictor, joint, ctc, blank): - self.encoder = encoder - self.predictor = predictor - self.joint = joint - self.ctc = ctc - self.blank = blank - - def forward_decoder_one_step( - self, encoder_x: torch.Tensor, pre_t: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - padding = torch.zeros(pre_t.size(0), 1, device=encoder_x.device) - pre_t, new_cache = self.predictor.forward_step(pre_t.unsqueeze(-1), - padding, cache) - x = self.joint(encoder_x, pre_t) # [beam, 1, 1, vocab] - x = x.log_softmax(dim=-1) - return x, new_cache - - def prefix_beam_search(self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7): - """prefix beam search - also see wenet.transducer.transducer.beam_search - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - assert 
batch_size == 1 - - # 1. Encoder - encoder_out, _ = self.encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - - ctc_probs = self.ctc.log_softmax(encoder_out).squeeze(0) - beam_init: List[Sequence] = [] - - # 2. init beam using Sequence to save beam unit - cache = self.predictor.init_state(1, method="zero", device=device) - beam_init.append(Sequence(hyp=[self.blank], score=0.0, cache=cache)) - # 3. start decoding (notice: we use breathwise first searching) - # !!!! In this decoding method: one frame do not output multi units. !!!! - # !!!! Experiments show that this strategy has little impact !!!! - for i in range(maxlen): - # 3.1 building input - # decoder taking the last token to predict the next token - input_hyp = [s.hyp[-1] for s in beam_init] - input_hyp_tensor = torch.tensor(input_hyp, - dtype=torch.int, - device=device) - # building statement from beam - cache_batch = self.predictor.cache_to_batch( - [s.cache for s in beam_init]) - # build score tensor to do torch.add() function - scores = torch.tensor([s.score for s in beam_init]).to(device) - - # 3.2 forward decoder - logp, new_cache = self.forward_decoder_one_step( - encoder_out[:, i, :].unsqueeze(1), - input_hyp_tensor, - cache_batch, - ) # logp: (N, 1, 1, vocab_size) - logp = logp.squeeze(1).squeeze(1) # logp: (N, vocab_size) - new_cache = self.predictor.batch_to_cache(new_cache) - - # 3.3 shallow fusion for transducer score - # and ctc score where we can also add the LM score - logp = torch.log( - torch.add(transducer_weight * torch.exp(logp), - ctc_weight * torch.exp(ctc_probs[i].unsqueeze(0)))) - - # 3.4 first beam prune - top_k_logp, top_k_index = logp.topk(beam_size) # (N, N) - scores = torch.add(scores.unsqueeze(1), top_k_logp) - - # 3.5 generate new beam (N*N) - beam_A = [] - for j in range(len(beam_init)): - # update seq - base_seq = beam_init[j] - for t in range(beam_size): - # blank: only update the score - if top_k_index[j, t] == self.blank: - new_seq = Sequence(hyp=base_seq.hyp.copy(), - score=scores[j, t].item(), - cache=base_seq.cache) - - beam_A.append(new_seq) - # other unit: update hyp score statement and last - else: - hyp_new = base_seq.hyp.copy() - hyp_new.append(top_k_index[j, t].item()) - new_seq = Sequence(hyp=hyp_new, - score=scores[j, t].item(), - cache=new_cache[j]) - beam_A.append(new_seq) - - # 3.6 prefix fusion - fusion_A = [beam_A[0]] - for j in range(1, len(beam_A)): - s1 = beam_A[j] - if_do_append = True - for t in range(len(fusion_A)): - # notice: A_ can not fusion with A - if s1.hyp == fusion_A[t].hyp: - fusion_A[t].score = log_add( - [fusion_A[t].score, s1.score]) - if_do_append = False - break - if if_do_append: - fusion_A.append(s1) - - # 4. 
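The prefix beam search above fuses the transducer and CTC distributions by weighting them in probability space and then re-taking the log. The fusion step in isolation (the weights are illustrative, not tuned values):

```python
import torch


def fuse_log_probs(transducer_logp: torch.Tensor,
                   ctc_logp: torch.Tensor,
                   transducer_weight: float = 0.7,
                   ctc_weight: float = 0.3) -> torch.Tensor:
    """Weighted fusion of two log-prob distributions, done in probability space.

    Both inputs are log-probabilities over the vocabulary, shape [N, vocab].
    """
    fused = transducer_weight * transducer_logp.exp() + ctc_weight * ctc_logp.exp()
    return fused.log()


# Sanity check: fusing a distribution with itself (weights summing to 1)
# returns the same distribution.
logp = torch.log_softmax(torch.randn(2, 5), dim=-1)
print(torch.allclose(fuse_log_probs(logp, logp), logp, atol=1e-6))  # True
```

A numerically safer variant adds the log-weights and combines with `torch.logsumexp`; the exp/log form mirrors the deleted code and is acceptable while the fused probabilities stay well above float underflow.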
second pruned - fusion_A.sort(key=lambda x: x.score, reverse=True) - beam_init = fusion_A[:beam_size] - - return beam_init, encoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transducer/transducer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transducer/transducer.py deleted file mode 100644 index 821a0946e621353a18bededbd93a658e83b0e0e2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transducer/transducer.py +++ /dev/null @@ -1,453 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchaudio -from torch import nn -from torch.nn.utils.rnn import pad_sequence -from typeguard import check_argument_types -from wenet.transducer.predictor import PredictorBase -from wenet.transducer.search.greedy_search import basic_greedy_search -from wenet.transducer.search.prefix_beam_search import PrefixBeamSearch -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_blank, add_sos_eos, - reverse_pad_list) - - -class Transducer(ASRModel): - """Transducer-ctc-attention hybrid Encoder-Predictor-Decoder model""" - - def __init__( - self, - vocab_size: int, - blank: int, - encoder: nn.Module, - predictor: PredictorBase, - joint: nn.Module, - attention_decoder: Optional[Union[TransformerDecoder, - BiTransformerDecoder]] = None, - ctc: Optional[CTC] = None, - ctc_weight: float = 0, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - transducer_weight: float = 1.0, - attention_weight: float = 0.0, - ) -> None: - assert check_argument_types() - assert attention_weight + ctc_weight + transducer_weight == 1.0 - super().__init__(vocab_size, encoder, attention_decoder, ctc, - ctc_weight, ignore_id, reverse_weight, lsm_weight, - length_normalized_loss) - - self.blank = blank - self.transducer_weight = transducer_weight - self.attention_decoder_weight = 1 - self.transducer_weight - self.ctc_weight - - self.predictor = predictor - self.joint = joint - self.bs = None - - # Note(Mddct): decoder also means predictor in transducer, - # but here decoder is attention decoder - del self.criterion_att - if attention_decoder is not None: - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + predictor + joint + loss - - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - - # Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - # predictor - ys_in_pad = add_blank(text, self.blank, self.ignore_id) - predictor_out = self.predictor(ys_in_pad) - # joint - joint_out = self.joint(encoder_out, predictor_out) - # NOTE(Mddct): some loss implementation require pad valid is zero - # torch.int32 rnnt_loss required - rnnt_text = text.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - rnnt_text_lengths = text_lengths.to(torch.int32) - encoder_out_lens = encoder_out_lens.to(torch.int32) - loss = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - encoder_out_lens, - rnnt_text_lengths, - blank=self.blank, - reduction="mean") - loss_rnnt = loss - - loss = self.transducer_weight * loss - # optional attention decoder - loss_att: Optional[torch.Tensor] = None - if self.attention_decoder_weight != 0.0 and self.decoder is not None: - loss_att, _ = self._calc_att_loss(encoder_out, encoder_mask, text, - text_lengths) - - # optional ctc - loss_ctc: Optional[torch.Tensor] = None - if self.ctc_weight != 0.0 and self.ctc is not None: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is not None: - loss = loss + self.ctc_weight * loss_ctc.sum() - if loss_att is not None: - loss = loss + self.attention_decoder_weight * loss_att.sum() - # NOTE: 'loss' must be in dict - return { - 'loss': loss, - 'loss_att': loss_att, - 'loss_ctc': loss_ctc, - 'loss_rnnt': loss_rnnt, - } - - def init_bs(self): - if self.bs is None: - self.bs = PrefixBeamSearch(self.encoder, self.predictor, - self.joint, self.ctc, self.blank) - - def _cal_transducer_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_lens: torch.Tensor, - hyps_pad: torch.Tensor, - ): - # ignore id -> blank, add blank at head - hyps_pad_blank = add_blank(hyps_pad, self.blank, self.ignore_id) - xs_in_lens = encoder_mask.squeeze(1).sum(1).int() - - # 1. Forward predictor - predictor_out = self.predictor(hyps_pad_blank) - # 2. Forward joint - joint_out = self.joint(encoder_out, predictor_out) - rnnt_text = hyps_pad.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - # 3. 
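The training forward pass above reduces to a weighted sum of RNN-T, CTC and attention losses. A toy, runnable sketch of the RNN-T term and the weighting, assuming `torchaudio.functional.rnnt_loss` (which the deleted code also calls) and placeholder scalars standing in for the other two branches:

```python
import torch
import torchaudio

B, T, U, V = 2, 8, 4, 6          # batch, encoder frames, target length, vocab size
blank = 0

# Joint network output: one score per (frame, label position, vocab entry).
joint_out = torch.randn(B, T, U + 1, V, requires_grad=True)
targets = torch.randint(1, V, (B, U), dtype=torch.int32)
logit_lengths = torch.full((B,), T, dtype=torch.int32)
target_lengths = torch.full((B,), U, dtype=torch.int32)

loss_rnnt = torchaudio.functional.rnnt_loss(
    joint_out, targets, logit_lengths, target_lengths,
    blank=blank, reduction="mean")

# Placeholders standing in for the CTC and attention-decoder losses.
loss_ctc = torch.tensor(1.5)
loss_att = torch.tensor(2.0)

transducer_weight, ctc_weight = 0.75, 0.1
attention_weight = 1.0 - transducer_weight - ctc_weight   # weights sum to 1
loss = (transducer_weight * loss_rnnt
        + ctc_weight * loss_ctc
        + attention_weight * loss_att)
loss.backward()
print(loss.item())
```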
Compute transducer loss - loss_td = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - xs_in_lens, - hyps_lens.int(), - blank=self.blank, - reduction='none') - return loss_td * -1 - - def _cal_attn_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_pad: torch.Tensor, - hyps_lens: torch.Tensor, - ): - # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - - # td_score = loss_td * -1 - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - self.reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - return decoder_out, r_decoder_out - - def beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7, - ): - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight in transducer - prefix beam search. - final_prob = ctc_weight * ctc_prob + transducer_weight * transducer_prob - transducer_weight (float): transducer probability weight in - prefix beam search - Returns: - List[List[int]]: best path result - - """ - self.init_bs() - beam, _ = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size, - beam_size, - num_decoding_left_chunks, - simulate_streaming, - ctc_weight, - transducer_weight, - ) - return beam[0].hyp[1:], beam[0].score - - def transducer_attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ctc_weight: float = 0.0, - attn_weight: float = 0.0, - transducer_weight: float = 0.0, - search_ctc_weight: float = 1.0, - search_transducer_weight: float = 0.0, - beam_search_type: str = 'transducer') -> List[List[int]]: - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight using in rescoring. - rescore_prob = ctc_weight * ctc_prob + - transducer_weight * (transducer_loss * -1) + - attn_weight * attn_prob - attn_weight (float): attn probability weight using in rescoring. - transducer_weight (float): transducer probability weight using in - rescoring - search_ctc_weight (float): ctc weight using - in rnnt beam search (seeing in self.beam_search) - search_transducer_weight (float): transducer weight using - in rnnt beam search (seeing in self.beam_search) - Returns: - List[List[int]]: best path result - - """ - - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - self.init_bs() - if beam_search_type == 'transducer': - beam, encoder_out = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - beam_size=beam_size, - num_decoding_left_chunks=num_decoding_left_chunks, - ctc_weight=search_ctc_weight, - transducer_weight=search_transducer_weight, - ) - beam_score = [s.score for s in beam] - hyps = [s.hyp[1:] for s in beam] - - elif beam_search_type == 'ctc': - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, - speech_lengths, - beam_size=beam_size, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) - beam_score = [hyp[1] for hyp in hyps] - hyps = [hyp[0] for hyp in hyps] - assert len(hyps) == beam_size - - # build hyps and encoder output - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - - # 2.1 calculate transducer score - td_score = self._cal_transducer_score( - encoder_out, - encoder_mask, - hyps_lens, - hyps_pad, - ) - # 2.2 calculate attention score - decoder_out, r_decoder_out = self._cal_attn_score( - encoder_out, - encoder_mask, - hyps_pad, - hyps_lens, - ) - - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp)][self.eos] - td_s = td_score[i] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp): - r_score += r_decoder_out[i][len(hyp) - j - 1][w] - r_score += r_decoder_out[i][len(hyp)][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score = score * attn_weight + \ - beam_score[i] * ctc_weight + \ - td_s * transducer_weight - if score > best_score: - best_score = score - best_index = i - - return hyps[best_index], best_score - - def greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, 
- num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - n_steps: int = 64, - ) -> List[List[int]]: - """ greedy search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - # TODO(Mddct): batch decode - assert speech.size(0) == 1 - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - # TODO(Mddct): forward chunk by chunk - _ = simulate_streaming - # Let's assume B = batch_size - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size, - num_decoding_left_chunks, - ) - encoder_out_lens = encoder_mask.squeeze(1).sum() - hyps = basic_greedy_search(self, - encoder_out, - encoder_out_lens, - n_steps=n_steps) - - return hyps - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def forward_predictor_step( - self, xs: torch.Tensor, cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - assert len(cache) == 2 - # fake padding - padding = torch.zeros(1, 1) - return self.predictor.forward_step(xs, padding, cache) - - @torch.jit.export - def forward_joint_step(self, enc_out: torch.Tensor, - pred_out: torch.Tensor) -> torch.Tensor: - return self.joint(enc_out, pred_out) - - @torch.jit.export - def forward_predictor_init_state(self) -> List[torch.Tensor]: - return self.predictor.init_state(1, device=torch.device("cpu")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/asr_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/asr_model.py deleted file mode 100644 index 4288f68472d63ce4bf270c5f377d62fa7408713e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/asr_model.py +++ /dev/null @@ -1,904 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -from collections import defaultdict -from typing import Dict, List, Optional, Tuple - -import torch - -from torch.nn.utils.rnn import pad_sequence - -try: - import k2 - from icefall.utils import get_texts - from icefall.decode import get_lattice, Nbest, one_best_decoding -except ImportError: - print('Failed to import k2 and icefall. \ - Notice that they are necessary for hlg_onebest and hlg_rescore') - -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import TransformerEncoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, - remove_duplicates_and_blank, th_accuracy, - reverse_pad_list) -from wenet.utils.mask import (make_pad_mask, mask_finished_preds, - mask_finished_scores, subsequent_mask) - - -class ASRModel(torch.nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - def __init__( - self, - vocab_size: int, - encoder: TransformerEncoder, - decoder: TransformerDecoder, - ctc: CTC, - ctc_weight: float = 0.5, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - ): - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - - self.encoder = encoder - self.decoder = decoder - self.ctc = ctc - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - # 1. Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # 2a. Attention-decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths) - else: - loss_att = None - - # 2b. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is None: - loss = loss_att - elif loss_att is None: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + (1 - - self.ctc_weight) * loss_att - return {"loss": loss, "loss_att": loss_att, "loss_ctc": loss_ctc} - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, float]: - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # reverse the seq, used for right to left decoder - r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) - r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, - self.ignore_id) - # 1. Forward decoder - decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, - ys_in_pad, ys_in_lens, - r_ys_in_pad, - self.reverse_weight) - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - r_loss_att = torch.tensor(0.0) - if self.reverse_weight > 0.0: - r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * ( - 1 - self.reverse_weight) + r_loss_att * self.reverse_weight - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - return loss_att, acc_att - - def _forward_encoder( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Let's assume B = batch_size - # 1. Encoder - if simulate_streaming and decoding_chunk_size > 0: - encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( - speech, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - else: - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - return encoder_out, encoder_mask - - def recognize( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int = 10, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> torch.Tensor: - """ Apply beam search on attention decoder - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - torch.Tensor: decoding result, (batch, max_result_len) - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = torch.ones([running_size, 1], dtype=torch.long, - device=device).fill_(self.sos) # (B*N, 1) - scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), - dtype=torch.float) - scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) - cache: Optional[List[torch.Tensor]] = None - # 2. Decoder forward step by step - for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: - break - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) - logp, cache = self.decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - # 2.3 Second beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - # Update cache to be consistent with new topk scores / hyps - cache_index = (offset_k_index // beam_size).view(-1) # (B*N) - base_cache_index = (torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) * beam_size).view(-1) # (B*N) - cache_index = base_cache_index + cache_index - cache = [torch.index_select(c, dim=0, index=cache_index) for c in cache] - scores = scores.view(-1, 1) # (B*N, 1) - # 2.4. Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = torch.index_select(top_k_index.view(-1), - dim=-1, - index=best_k_index) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = torch.index_select( - hyps, dim=0, index=best_hyps_index) # (B*N, i) - hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) - - # 3. 
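The batched attention beam search above flattens batch and beam into `B*N` rows, so after every top-k step it has to recover which parent beam (and therefore which cache row) each surviving hypothesis extends. The index arithmetic in isolation, as a sketch:

```python
import torch

B, N = 2, 3                      # batch size, beam size
scores = torch.randn(B * N, N)   # each of the B*N live beams proposes N tokens

flat = scores.view(B, N * N)                 # regroup proposals per utterance
top_scores, offset = flat.topk(N, dim=-1)    # keep the N best continuations each

parent_beam = offset // N                    # which live beam each winner extends
token_slot = offset % N                      # which of that beam's N proposals won

# Global row index into the original (B*N, ...) tensors, e.g. to reorder caches.
base = torch.arange(B).unsqueeze(1) * N      # (B, 1)
parent_row = (base + parent_beam).view(-1)   # (B*N,)
print(parent_row)
```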
Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_scores, best_index = scores.max(dim=-1) - best_hyps_index = best_index + torch.arange( - batch_size, dtype=torch.long, device=device) * beam_size - best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) - best_hyps = best_hyps[:, 1:] - return best_hyps, best_scores - - def ctc_greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[List[int]]: - """ Apply CTC greedy search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # Let's assume B = batch_size - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - return hyps, scores - - def _ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[List[List[int]], torch.Tensor]: - """ CTC prefix beam search inner implementation - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[List[int]]: nbest results - torch.Tensor: encoder output, (1, max_len, encoder_dim), - it will be used for rescoring in attention rescoring mode - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # For CTC prefix beam search, we only support batch_size=1 - assert batch_size == 1 - # Let's assume B = batch_size and N = beam_size - # 1. 
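`ctc_greedy_search` above is a per-frame argmax followed by the standard CTC collapse: merge consecutive repeats, then drop blanks. A tiny reference version of the collapse:

```python
from itertools import groupby
from typing import List


def collapse_ctc_path(frame_ids: List[int], blank: int = 0) -> List[int]:
    """Collapse a per-frame argmax path: merge repeats, then remove blanks."""
    deduped = [k for k, _ in groupby(frame_ids)]      # merge consecutive repeats
    return [k for k in deduped if k != blank]         # drop the blank symbol


print(collapse_ctc_path([5, 5, 0, 5, 0, 7, 7]))  # [5, 5, 7]
```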
Encoder forward and get CTC score - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - ctc_probs = ctc_probs.squeeze(0) - # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) - cur_hyps = [(tuple(), (0.0, -float('inf')))] - # 2. CTC beam search step by step - for t in range(0, maxlen): - logp = ctc_probs[t] # (vocab_size,) - # key: prefix, value (pb, pnb), default value(-inf, -inf) - next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) - # 2.1 First beam prune: select topk best - top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) - for s in top_k_index: - s = s.item() - ps = logp[s].item() - for prefix, (pb, pnb) in cur_hyps: - last = prefix[-1] if len(prefix) > 0 else None - if s == 0: # blank - n_pb, n_pnb = next_hyps[prefix] - n_pb = log_add([n_pb, pb + ps, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - elif s == last: - # Update *ss -> *s; - n_pb, n_pnb = next_hyps[prefix] - n_pnb = log_add([n_pnb, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - # Update *s-s -> *ss, - is for blank - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - else: - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - - # 2.2 Second beam prune - next_hyps = sorted(next_hyps.items(), - key=lambda x: log_add(list(x[1])), - reverse=True) - cur_hyps = next_hyps[:beam_size] - hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] - return hyps, encoder_out - - def ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[int]: - """ Apply CTC prefix beam search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[int]: CTC prefix beam search nbest results - """ - hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, - beam_size, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) - return hyps[0] - - def attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - ctc_weight: float = 0.0, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ) -> List[int]: - """ Apply attention rescoring decoding, CTC prefix beam search - is applied first to get nbest, then we resoring the nbest on - attention decoder with corresponding encoder out - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
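The `(pb, pnb)` bookkeeping in `_ctc_prefix_beam_search` above is easiest to follow on a toy input. The sketch below keeps the same three update cases (blank, repeated symbol, new symbol) but, unlike the deleted code, skips the per-frame top-k prune for brevity:

```python
import math
from collections import defaultdict
from typing import List, Tuple


def log_add(args: List[float]) -> float:
    """Stable log(sum(exp(args)))."""
    if all(a == -float('inf') for a in args):
        return -float('inf')
    a_max = max(args)
    return a_max + math.log(sum(math.exp(a - a_max) for a in args))


def ctc_prefix_beam_search(log_probs: List[List[float]],
                           beam_size: int = 3,
                           blank: int = 0) -> List[Tuple[tuple, float]]:
    # prefix -> (log prob of paths ending in blank, ending in non-blank)
    cur = [(tuple(), (0.0, -float('inf')))]
    for frame in log_probs:
        nxt = defaultdict(lambda: (-float('inf'), -float('inf')))
        for s, ps in enumerate(frame):
            for prefix, (pb, pnb) in cur:
                last = prefix[-1] if prefix else None
                if s == blank:
                    n_pb, n_pnb = nxt[prefix]
                    nxt[prefix] = (log_add([n_pb, pb + ps, pnb + ps]), n_pnb)
                elif s == last:
                    # *s + s -> *ss only through a blank-ending path ...
                    n_pb, n_pnb = nxt[prefix + (s,)]
                    nxt[prefix + (s,)] = (n_pb, log_add([n_pnb, pb + ps]))
                    # ... otherwise the repeat collapses onto the same prefix.
                    n_pb, n_pnb = nxt[prefix]
                    nxt[prefix] = (n_pb, log_add([n_pnb, pnb + ps]))
                else:
                    n_pb, n_pnb = nxt[prefix + (s,)]
                    nxt[prefix + (s,)] = (n_pb, log_add([n_pnb, pb + ps, pnb + ps]))
        cur = sorted(nxt.items(), key=lambda kv: log_add(list(kv[1])),
                     reverse=True)[:beam_size]
    return [(hyp, log_add(list(p))) for hyp, p in cur]


logp = [[math.log(p) for p in row] for row in
        [[0.6, 0.3, 0.1], [0.2, 0.7, 0.1], [0.5, 0.1, 0.4]]]
print(ctc_prefix_beam_search(logp, beam_size=2))
```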
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - reverse_weight (float): right to left decoder weight - ctc_weight (float): ctc score weight - - Returns: - List[int]: Attention rescoring result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, speech_lengths, beam_size, decoding_chunk_size, - num_decoding_left_chunks, simulate_streaming) - - assert len(hyps) == beam_size - hyps_pad = pad_sequence([ - torch.tensor(hyp[0], device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp[0]): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp[0])][self.eos] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp[0]): - r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] - r_score += r_decoder_out[i][len(hyp[0])][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score += hyp[1] * ctc_weight - if score > best_score: - best_score = score - best_index = i - return hyps[best_index][0], best_score - - def load_hlg_resource_if_necessary(self, hlg, word): - if not hasattr(self, 'hlg'): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.hlg = k2.Fsa.from_dict(torch.load(hlg, map_location=device)) - if not hasattr(self.hlg, "lm_scores"): - self.hlg.lm_scores = self.hlg.scores.clone() - if not hasattr(self, 'word_table'): - self.word_table = {} - with open(word, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - self.word_table[int(arr[1])] = arr[0] - - @torch.no_grad() - def hlg_onebest( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - best_path = one_best_decoding(lattice=lattice, use_double_scores=True) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.no_grad() - def hlg_rescore( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - lm_scale: float = 0, - decoder_scale: float = 0, - r_decoder_scale: float = 0, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - device = speech.device - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - 
search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=100, - use_double_scores=True, - nbest_scale=0.5,) - nbest = nbest.intersect(lattice) - assert hasattr(nbest.fsa, "lm_scores") - assert hasattr(nbest.fsa, "tokens") - assert isinstance(nbest.fsa.tokens, torch.Tensor) - - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.tokens) - tokens = tokens.remove_values_leq(0) - hyps = tokens.tolist() - - # cal attention_score - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out_repeat = [] - tot_scores = nbest.tot_scores() - repeats = [tot_scores[i].shape[0] for i in range(tot_scores.dim0)] - for i in range(len(encoder_out)): - encoder_out_repeat.append(encoder_out[i: i + 1].repeat(repeats[i], 1, 1)) - encoder_out = torch.concat(encoder_out_repeat, dim=0) - encoder_mask = torch.ones(encoder_out.size(0), - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - reverse_weight = 0.5 - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out - - decoder_scores = torch.tensor([sum([decoder_out[i, j, hyps[i][j]] - for j in range(len(hyps[i]))]) - for i in range(len(hyps))], device=device) - r_decoder_scores = [] - for i in range(len(hyps)): - score = 0 - for j in range(len(hyps[i])): - score += r_decoder_out[i, len(hyps[i]) - j - 1, hyps[i][j]] - score += r_decoder_out[i, len(hyps[i]), self.eos] - r_decoder_scores.append(score) - r_decoder_scores = torch.tensor(r_decoder_scores, device=device) - - am_scores = nbest.compute_am_scores() - ngram_lm_scores = nbest.compute_lm_scores() - tot_scores = am_scores.values + lm_scale * ngram_lm_scores.values + \ - decoder_scale * decoder_scores + r_decoder_scale * r_decoder_scores - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.jit.export - def subsampling_rate(self) -> int: - """ Export interface for c++ call, return subsampling_rate of the - model - """ - return self.encoder.embed.subsampling_rate - - @torch.jit.export - def right_context(self) -> int: - """ Export interface for c++ call, return right_context of the model - """ - return self.encoder.embed.right_context - - @torch.jit.export - def sos_symbol(self) -> int: - """ Export interface for c++ call, return sos symbol id of the model - """ - return self.sos - - @torch.jit.export - def eos_symbol(self) -> int: - """ Export interface for c++ call, return eos symbol id of the model - """ - return self.eos - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, give input chunk xs, and return - output from time 0 to current chunk. - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor: - """ Export interface for c++ call, apply linear transform and log - softmax before ctc - Args: - xs (torch.Tensor): encoder output - - Returns: - torch.Tensor: activation before ctc - - """ - return self.ctc.log_softmax(xs) - - @torch.jit.export - def is_bidirectional_decoder(self) -> bool: - """ - Returns: - torch.Tensor: decoder output - """ - if hasattr(self.decoder, 'right_decoder'): - return True - else: - return False - - @torch.jit.export - def forward_attention_decoder( - self, - hyps: torch.Tensor, - hyps_lens: torch.Tensor, - encoder_out: torch.Tensor, - reverse_weight: float = 0, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, forward decoder with multiple - hypothesis from ctc prefix beam search and one encoder output - Args: - hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad sos at the begining - hyps_lens (torch.Tensor): length of each hyp in hyps - encoder_out (torch.Tensor): corresponding encoder output - r_hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad eos at the begining which is used fo right to left decoder - reverse_weight: used for verfing whether used right to left decoder, - > 0 will use. - - Returns: - torch.Tensor: decoder output - """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps - encoder_out = encoder_out.repeat(num_hyps, 1, 1) - encoder_mask = torch.ones(num_hyps, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=encoder_out.device) - - # input for right to left decoder - # this hyps_lens has count token, we need minus it. - r_hyps_lens = hyps_lens - 1 - # this hyps has included token, so it should be - # convert the original hyps. - r_hyps = hyps[:, 1:] - # >>> r_hyps - # >>> tensor([[ 1, 2, 3], - # >>> [ 9, 8, 4], - # >>> [ 2, -1, -1]]) - # >>> r_hyps_lens - # >>> tensor([3, 3, 1]) - - # NOTE(Mddct): `pad_sequence` is not supported by ONNX, it is used - # in `reverse_pad_list` thus we have to refine the below code. 
- # Issue: https://github.com/wenet-e2e/wenet/issues/1113 - # Equal to: - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = torch.max(r_hyps_lens) - index_range = torch.arange(0, max_len, 1).to(encoder_out.device) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - # >>> seq_mask - # >>> tensor([[ True, True, True], - # >>> [ True, True, True], - # >>> [ True, False, False]]) - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - r_hyps = torch.gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = torch.where(seq_mask, r_hyps, self.eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - r_hyps = torch.cat([hyps[:, 0:1], r_hyps], dim=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps, hyps_lens, r_hyps, - reverse_weight) # (num_hyps, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - - # right to left decoder may be not used during decoding process, - # which depends on reverse_weight param. - # r_dccoder_out will be 0.0, if reverse_weight is 0.0 - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - return decoder_out, r_decoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/attention.py deleted file mode 100644 index 6ee5e313edf2e88a844ce004c0f819b0bd3260f6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/attention.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple - -import torch -from torch import nn - - -class MultiHeadedAttention(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
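The gather/where construction above replaces `reverse_pad_list` + `add_sos_eos` with ops that ONNX can export. Stripped of the sos handling, the core trick is reversing each padded row up to its own length; a sketch (it uses `clamp` where the deleted code multiplies by the mask, with the same result):

```python
import torch


def reverse_padded(hyps: torch.Tensor, lens: torch.Tensor,
                   pad_value: int) -> torch.Tensor:
    """Reverse each row of a padded [beam, max_len] tensor up to its length,
    writing pad_value into the padded tail, using only gather/where."""
    index_range = torch.arange(hyps.size(1), device=hyps.device)
    seq_mask = lens.unsqueeze(1) > index_range              # True on real tokens
    index = (lens.unsqueeze(1) - 1 - index_range).clamp(min=0)
    reversed_hyps = torch.gather(hyps, 1, index)
    return torch.where(seq_mask, reversed_hyps, torch.full_like(hyps, pad_value))


hyps = torch.tensor([[1, 2, 3], [9, 8, 4], [2, 0, 0]])
lens = torch.tensor([3, 3, 1])
print(reverse_padded(hyps, lens, pad_value=-1))
# tensor([[ 3,  2,  1],
#         [ 4,  8,  9],
#         [ 2, -1, -1]])
```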
- - """ - def __init__(self, n_head: int, n_feat: int, dropout_rate: float): - """Construct an MultiHeadedAttention object.""" - super().__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward_qkv( - self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor, size - (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor, size - (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor, size - (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. 
- - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - 1.When applying cross attention between decoder and encoder, - the batch padding mask for input is in (#batch, 1, T) shape. - 2.When applying self attention of encoder, - the mask is in (#batch, T, T) shape. - 3.When applying self attention of decoder, - the mask is in (#batch, L, L) shape. - 4.If the different position in decoder see different block - of the encoder, such as Mocha, the passed in mask could be - in (#batch, L, T) shape. But there is no such case in current - Wenet. - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - """ - q, k, v = self.forward_qkv(query, key, value) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. - new_cache = torch.cat((k, v), dim=-1) - - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - return self.forward_attention(v, scores, mask), new_cache - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). 
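The cache handling above always splits and re-concatenates, even for a zero-length cache, so that ONNX export sees one static graph with no data-dependent branch. The pattern in isolation, as a sketch:

```python
from typing import Tuple

import torch


def update_kv_cache(k: torch.Tensor, v: torch.Tensor,
                    cache: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Prepend cached key/value frames to the current chunk's k/v.

    k, v:  [batch, head, time_chunk, d_k]
    cache: [batch, head, time_cache, d_k * 2], keys and values packed on the
           last dim; a zero-length time_cache flows through the same concat.
    """
    key_cache, value_cache = torch.split(cache, cache.size(-1) // 2, dim=-1)
    k = torch.cat([key_cache, k], dim=2)
    v = torch.cat([value_cache, v], dim=2)
    new_cache = torch.cat((k, v), dim=-1)     # pack back for the next chunk
    return k, v, new_cache


k = torch.randn(1, 4, 16, 64)
v = torch.randn(1, 4, 16, 64)
empty = torch.zeros(1, 4, 0, 128)             # first chunk: empty cache
k2, v2, cache = update_kv_cache(k, v, empty)
print(k2.shape, cache.shape)   # (1, 4, 16, 64) (1, 4, 16, 128)
```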
- zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/cmvn.py deleted file mode 100644 index 3a1e7457fd3788d9a7e031e96517505a65925102..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/cmvn.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -class GlobalCMVN(torch.nn.Module): - def __init__(self, - mean: torch.Tensor, - istd: torch.Tensor, - norm_var: bool = True): - """ - Args: - mean (torch.Tensor): mean stats - istd (torch.Tensor): inverse std, std which is 1.0 / std - """ - super().__init__() - assert mean.shape == istd.shape - self.norm_var = norm_var - # The buffer can be accessed from this module using self.mean - self.register_buffer("mean", mean) - self.register_buffer("istd", istd) - - def forward(self, x: torch.Tensor): - """ - Args: - x (torch.Tensor): (batch, max_len, feat_dim) - - Returns: - (torch.Tensor): normalized feature - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/convolution.py deleted file mode 100644 index 2cf9794e14ea7441ccd30ab52202ac02fb25c2b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/convolution.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). 
- """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. - new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/ctc.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/ctc.py deleted file mode 100644 index 3dfcbaa324ffc26afa9ceaeb75007eb312546326..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/ctc.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -import torch -import torch.nn.functional as F -from typeguard import check_argument_types - - -class CTC(torch.nn.Module): - """CTC module""" - def __init__( - self, - odim: int, - encoder_output_size: int, - dropout_rate: float = 0.0, - reduce: bool = True, - ): - """ Construct CTC module - Args: - odim: dimension of outputs - encoder_output_size: number of encoder projection units - dropout_rate: dropout rate (0.0 ~ 1.0) - reduce: reduce the CTC loss into a scalar - """ - assert check_argument_types() - super().__init__() - eprojs = encoder_output_size - self.dropout_rate = dropout_rate - self.ctc_lo = torch.nn.Linear(eprojs, odim) - - reduction_type = "sum" if reduce else "none" - self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) - - def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, - ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: - """Calculate CTC loss. 
- - Args: - hs_pad: batch of padded hidden state sequences (B, Tmax, D) - hlens: batch of lengths of hidden state sequences (B) - ys_pad: batch of padded character id sequence tensor (B, Lmax) - ys_lens: batch of lengths of character sequence (B) - """ - # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) - ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) - # ys_hat: (B, L, D) -> (L, B, D) - ys_hat = ys_hat.transpose(0, 1) - ys_hat = ys_hat.log_softmax(2) - loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) - # Batch-size average - loss = loss / ys_hat.size(1) - return loss - - def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """log_softmax of frame activations - - Args: - Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) - """ - return F.log_softmax(self.ctc_lo(hs_pad), dim=2) - - def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """argmax of frame activations - - Args: - torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: argmax applied 2d tensor (B, Tmax) - """ - return torch.argmax(self.ctc_lo(hs_pad), dim=2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/decoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/decoder.py deleted file mode 100644 index c31853d9e868c99290b8d597f53d9a680202c82c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/decoder.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Decoder definition.""" -from typing import Tuple, List, Optional - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.decoder_layer import DecoderLayer -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.utils.mask import (subsequent_mask, make_pad_mask) - - -class TransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. 
- concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__() - attention_dim = encoder_output_size - - if input_layer == "embed": - self.embed = torch.nn.Sequential( - torch.nn.Embedding(vocab_size, attention_dim), - PositionalEncoding(attention_dim, positional_dropout_rate), - ) - else: - raise ValueError(f"only 'embed' is supported: {input_layer}") - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) - self.use_output_layer = use_output_layer - self.output_layer = torch.nn.Linear(attention_dim, vocab_size) - self.num_blocks = num_blocks - self.decoders = torch.nn.ModuleList([ - DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(self.num_blocks) - ]) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor = torch.empty(0), - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: not used in transformer decoder, in order to unify api - with bidirectional decoder - reverse_weight: not used in transformer decoder, in order to unify - api with bidirectional decode - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - torch.tensor(0.0), in order to unify api with bidirectional decoder - olens: (batch, ) - """ - tgt = ys_in_pad - maxlen = tgt.size(1) - # tgt_mask: (B, 1, L) - tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) - tgt_mask = tgt_mask.to(tgt.device) - # m: (1, L, L) - m = subsequent_mask(tgt_mask.size(-1), - device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m - x, _ = self.embed(tgt) - for layer in self.decoders: - x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, - memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.use_output_layer: - x = self.output_layer(x) - olens = tgt_mask.sum(1) - return x, torch.tensor(0.0), olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - x, _ = self.embed(tgt) - new_cache = [] - for i, decoder in enumerate(self.decoders): - if cache is None: - c = None - else: - c = cache[i] - x, tgt_mask, memory, memory_mask = decoder(x, - tgt_mask, - memory, - memory_mask, - cache=c) - new_cache.append(x) - if self.normalize_before: - y = self.after_norm(x[:, -1]) - else: - y = x[:, -1] - if self.use_output_layer: - y = torch.log_softmax(self.output_layer(y), dim=-1) - return y, new_cache - - -class BiTransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - r_num_blocks: the number of right to left decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - r_num_blocks: int = 0, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - - assert check_argument_types() - super().__init__() - self.left_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - self.right_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - r_num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor, - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), - used for right to left decoder - reverse_weight: used for right to left decoder - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - r_x: x: decoded token score (right to left decoder) - before softmax (batch, maxlen_out, vocab_size) - if use_output_layer is True, - olens: (batch, ) - """ - l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, - ys_in_lens) - r_x = torch.tensor(0.0) - if reverse_weight > 0.0: - r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, - ys_in_lens) - return l_x, r_x, olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - return self.left_decoder.forward_one_step(memory, memory_mask, tgt, - tgt_mask, cache) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/decoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/decoder_layer.py deleted file mode 100644 index 6b52aa6ab730dc51b18f0787e8236ab10c1e9cad..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/decoder_layer.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoder self-attention layer definition.""" -from typing import Optional, Tuple - -import torch -from torch import nn - - -class DecoderLayer(nn.Module): - """Single decoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn (torch.nn.Module): Inter-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. 
- dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's inpu - and output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: nn.Module, - src_attn: nn.Module, - feed_forward: nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an DecoderLayer object.""" - super().__init__() - self.size = size - self.self_attn = self_attn - self.src_attn = src_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.norm3 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear1 = nn.Linear(size + size, size) - self.concat_linear2 = nn.Linear(size + size, size) - else: - self.concat_linear1 = nn.Identity() - self.concat_linear2 = nn.Identity() - - def forward( - self, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - memory: torch.Tensor, - memory_mask: torch.Tensor, - cache: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute decoded features. - - Args: - tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). - tgt_mask (torch.Tensor): Mask for input tensor - (#batch, maxlen_out). - memory (torch.Tensor): Encoded memory - (#batch, maxlen_in, size). - memory_mask (torch.Tensor): Encoded memory mask - (#batch, maxlen_in). - cache (torch.Tensor): cached tensors. - (#batch, maxlen_out - 1, size). - - Returns: - torch.Tensor: Output tensor (#batch, maxlen_out, size). - torch.Tensor: Mask for output tensor (#batch, maxlen_out). - torch.Tensor: Encoded memory (#batch, maxlen_in, size). - torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
- - """ - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - - if cache is None: - tgt_q = tgt - tgt_q_mask = tgt_mask - else: - # compute only the last frame query keeping dim: max_time_out -> 1 - assert cache.shape == ( - tgt.shape[0], - tgt.shape[1] - 1, - self.size, - ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" - tgt_q = tgt[:, -1:, :] - residual = residual[:, -1:, :] - tgt_q_mask = tgt_mask[:, -1:, :] - - if self.concat_after: - tgt_concat = torch.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) - x = residual + self.concat_linear1(tgt_concat) - else: - x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - if self.concat_after: - x_concat = torch.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) - x = residual + self.concat_linear2(x_concat) - else: - x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) - if not self.normalize_before: - x = self.norm2(x) - - residual = x - if self.normalize_before: - x = self.norm3(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm3(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, tgt_mask, memory, memory_mask diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/embedding.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/embedding.py deleted file mode 100644 index 611a927864d93c3ad8357f66c780bf537b2a4d67..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/embedding.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Positonal Encoding Module.""" - -import math -from typing import Tuple, Union - -import torch -import torch.nn.functional as F - -class PositionalEncoding(torch.nn.Module): - """Positional encoding. 
- - :param int d_model: embedding dim - :param float dropout_rate: dropout rate - :param int max_len: maximum input length - - PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) - PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) - """ - def __init__(self, - d_model: int, - dropout_rate: float, - max_len: int = 5000, - reverse: bool = False): - """Construct an PositionalEncoding object.""" - super().__init__() - self.d_model = d_model - self.xscale = math.sqrt(self.d_model) - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - - self.pe = torch.zeros(self.max_len, self.d_model) - position = torch.arange(0, self.max_len, - dtype=torch.float32).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = torch.sin(position * div_term) - self.pe[:, 1::2] = torch.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input. Its shape is (batch, time, ...) - offset (int, torch.tensor): position offset - - Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) - torch.Tensor: for compatibility to RelPositionalEncoding - """ - - self.pe = self.pe.to(x.device) - pos_emb = self.position_encoding(offset, x.size(1), False) - x = x * self.xscale + pos_emb - return self.dropout(x), self.dropout(pos_emb) - - def position_encoding(self, offset: Union[int, torch.Tensor], size: int, - apply_dropout: bool = True) -> torch.Tensor: - """ For getting encoding in a streaming fashion - - Attention!!!!! - we apply dropout only once at the whole utterance level in a none - streaming way, but will call this function several times with - increasing input size in a streaming scenario, so the dropout will - be applied several times. - - Args: - offset (int or torch.tensor): start offset - size (int): required size of position encoding - - Returns: - torch.Tensor: Corresponding encoding - """ - # How to subscript a Union type: - # https://github.com/pytorch/pytorch/issues/69434 - if isinstance(offset, int): - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - else: # for batched streaming decoding on GPU - assert torch.max(offset) + size < self.max_len - index = offset.unsqueeze(1) + \ - torch.arange(0, size).to(offset.device) # B X T - flag = index > 0 - # remove negative offset - index = index * flag - pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model - - if apply_dropout: - pos_emb = self.dropout(pos_emb) - return pos_emb - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding module. - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - """ - def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): - """Initialize class.""" - super().__init__(d_model, dropout_rate, max_len, reverse=True) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Compute positional encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). 
- Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Positional embedding tensor (1, time, `*`). - """ - self.pe = self.pe.to(x.device) - x = x * self.xscale - pos_emb = self.position_encoding(offset, x.size(1), False) - return self.dropout(x), self.dropout(pos_emb) - - -class NoPositionalEncoding(torch.nn.Module): - """ No position encoding - """ - def __init__(self, d_model: int, dropout_rate: float): - super().__init__() - self.d_model = d_model - self.dropout = torch.nn.Dropout(p=dropout_rate) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """ Just return zero vector for interface compatibility - """ - pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) - return self.dropout(x), pos_emb - - def position_encoding( - self, offset: Union[int, torch.Tensor], size: int) -> torch.Tensor: - return torch.zeros(1, size, self.d_model) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/encoder.py deleted file mode 100644 index bb2ec65827548bd1242cb3b367cb3983c2de6119..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/encoder.py +++ /dev/null @@ -1,462 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder definition.""" -from typing import Tuple - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.convolution import ConvolutionModule -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.encoder_layer import TransformerEncoderLayer -from wenet.transformer.encoder_layer import ConformerEncoderLayer -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class BaseEncoder(torch.nn.Module): - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ - Args: - input_size (int): input dim - output_size (int): dimension of attention - attention_heads (int): the number of heads of multi head attention - linear_units (int): the hidden units number of position-wise feed - forward - num_blocks (int): the number of decoder blocks - dropout_rate (float): dropout rate - attention_dropout_rate (float): dropout rate in attention - positional_dropout_rate (float): dropout rate after adding - positional encoding - input_layer (str): input layer type. - optional [linear, conv2d, conv2d6, conv2d8] - pos_enc_layer_type (str): Encoder positional encoding layer type. - opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] - normalize_before (bool): - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after (bool): whether to concat attention layer's input - and output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - static_chunk_size (int): chunk size for static chunk training and - decoding - use_dynamic_chunk (bool): whether use dynamic chunk size for - training or not, You can only use fixed chunk(chunk_size > 0) - or dyanmic chunk size(use_dynamic_chunk = True) - global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module - use_dynamic_left_chunk (bool): whether use dynamic left chunk in - dynamic chunk training - """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks - - -class TransformerEncoder(BaseEncoder): - """Transformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ Construct TransformerEncoder - - See Encoder for the meaning of each parameter. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - self.encoders = torch.nn.ModuleList([ - TransformerEncoderLayer( - output_size, - MultiHeadedAttention(attention_heads, output_size, - attention_dropout_rate), - PositionwiseFeedForward(output_size, linear_units, - dropout_rate), dropout_rate, - normalize_before, concat_after) for _ in range(num_blocks) - ]) - - -class ConformerEncoder(BaseEncoder): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - positionwise_conv_kernel_size: int = 1, - macaron_style: bool = True, - selfattention_layer_type: str = "rel_selfattn", - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - ): - """Construct ConformerEncoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - positionwise_conv_kernel_size (int): Kernel size of positionwise - conv1d layer. - macaron_style (bool): Whether to use macaron style for - positionwise layer. - selfattention_layer_type (str): Encoder attention layer type, - the parameter has no effect now, it's just for configure - compatibility. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, - cnn_module_norm, causal) - - self.encoders = torch.nn.ModuleList([ - ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(num_blocks) - ]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/encoder_layer.py deleted file mode 100644 index 6b4629a6802a90422fa1494f82f46488f2553c16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/encoder_layer.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple - -import torch -from torch import nn - - -class TransformerEncoderLayer(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: torch.nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): just for interface compatibility - to ConformerEncoderLayer - mask_pad (torch.Tensor): does not used in transformer layer, - just for unified api with conformer. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2), not used here, it's for interface - compatibility to ConformerEncoderLayer. - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). - - """ - residual = x - if self.normalize_before: - x = self.norm1(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, cache=att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - return x, mask, new_att_cache, fake_cnn_cache - - -class ConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/label_smoothing_loss.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/label_smoothing_loss.py deleted file mode 100644 index 428fedcb0eb4345cd1361c97008a9afcd94ac171..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/label_smoothing_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Label smoothing module.""" - -import torch -from torch import nn - - -class LabelSmoothingLoss(nn.Module): - """Label-smoothing loss. - - In a standard CE loss, the label's data distribution is: - [0,1,2] -> - [ - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0], - ] - - In the smoothing version CE Loss,some probabilities - are taken from the true label prob (1.0) and are divided - among other labels. - - e.g. 
- smoothing=0.1 - [0,1,2] -> - [ - [0.9, 0.05, 0.05], - [0.05, 0.9, 0.05], - [0.05, 0.05, 0.9], - ] - - Args: - size (int): the number of class - padding_idx (int): padding class id which will be ignored for loss - smoothing (float): smoothing rate (0.0 means the conventional CE) - normalize_length (bool): - normalize loss by sequence length if True - normalize loss by batch size if False - """ - def __init__(self, - size: int, - padding_idx: int, - smoothing: float, - normalize_length: bool = False): - """Construct an LabelSmoothingLoss object.""" - super(LabelSmoothingLoss, self).__init__() - self.criterion = nn.KLDivLoss(reduction="none") - self.padding_idx = padding_idx - self.confidence = 1.0 - smoothing - self.smoothing = smoothing - self.size = size - self.normalize_length = normalize_length - - def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """Compute loss between x and target. - - The model outputs and data labels tensors are flatten to - (batch*seqlen, class) shape and a mask is applied to the - padding part which should not be calculated for loss. - - Args: - x (torch.Tensor): prediction (batch, seqlen, class) - target (torch.Tensor): - target signal masked with self.padding_id (batch, seqlen) - Returns: - loss (torch.Tensor) : The KL loss, scalar float value - """ - assert x.size(2) == self.size - batch_size = x.size(0) - x = x.view(-1, self.size) - target = target.view(-1) - # use zeros_like instead of torch.no_grad() for true_dist, - # since no_grad() can not be exported by JIT - true_dist = torch.zeros_like(x) - true_dist.fill_(self.smoothing / (self.size - 1)) - ignore = target == self.padding_idx # (B,) - total = len(target) - ignore.sum().item() - target = target.masked_fill(ignore, 0) # avoid -1 index - true_dist.scatter_(1, target.unsqueeze(1), self.confidence) - kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) - denom = total if self.normalize_length else batch_size - return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/positionwise_feed_forward.py deleted file mode 100644 index 73ba239e3f1e68f65650961f2c4ee6758729a06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/positionwise_feed_forward.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. 
- dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU()): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/subsampling.py deleted file mode 100644 index 5f2823eedf0e623188d6af6680fa50ca44b47877..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/subsampling.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch - - -class BaseSubsampling(torch.nn.Module): - def __init__(self): - super().__init__() - self.right_context = 0 - self.subsampling_rate = 1 - - def position_encoding(self, offset: Union[int, torch.Tensor], - size: int) -> torch.Tensor: - return self.pos_enc.position_encoding(offset, size) - - -class LinearNoSubsampling(BaseSubsampling): - """Linear transform the input without subsampling - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an linear object.""" - super().__init__() - self.out = torch.nn.Sequential( - torch.nn.Linear(idim, odim), - torch.nn.LayerNorm(odim, eps=1e-5), - torch.nn.Dropout(dropout_rate), - ) - self.pos_enc = pos_enc_class - self.right_context = 0 - self.subsampling_rate = 1 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Input x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: linear input tensor (#batch, time', odim), - where time' = time . - torch.Tensor: linear input mask (#batch, 1, time'), - where time' = time . - - """ - x = self.out(x) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask - - -class Conv2dSubsampling4(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). 
- - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 4. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 4. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] - - -class Conv2dSubsampling6(BaseSubsampling): - """Convolutional 2D subsampling (to 1/6 length). - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (torch.nn.Module): Custom position encoding layer. - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling6 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 5, 3), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), - odim) - self.pos_enc = pos_enc_class - # 10 = (3 - 1) * 1 + (5 - 1) * 2 - self.subsampling_rate = 6 - self.right_context = 10 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 6. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 6. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] - - -class Conv2dSubsampling8(BaseSubsampling): - """Convolutional 2D subsampling (to 1/8 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling8 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear( - odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) - self.pos_enc = pos_enc_class - self.subsampling_rate = 8 - # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 - self.right_context = 14 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 8. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 8. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/swish.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/swish.py deleted file mode 100644 index b4250f5c93104f38958d145572e363256e03fcb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/transformer/swish.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) -# 2020 Northwestern Polytechnical University (Pengcheng Guo) -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Swish() activation function for Conformer.""" - -import torch - - -class Swish(torch.nn.Module): - """Construct an Swish object.""" - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Return Swish activation function.""" - return x * torch.sigmoid(x) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/checkpoint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/checkpoint.py deleted file mode 100644 index 8e0c413c79c34cd667240357d7ef9eab816a885c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/checkpoint.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import re - -import yaml -import torch -from collections import OrderedDict - -import datetime - - -def load_checkpoint(model: torch.nn.Module, path: str) -> dict: - if torch.cuda.is_available(): - logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) - checkpoint = torch.load(path) - else: - logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) - checkpoint = torch.load(path, map_location='cpu') - model.load_state_dict(checkpoint, strict=False) - info_path = re.sub('.pt$', '.yaml', path) - configs = {} - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - return configs - - -def save_checkpoint(model: torch.nn.Module, path: str, infos=None): - ''' - Args: - infos (dict or None): any info you want to save. - ''' - logging.info('Checkpoint: save to checkpoint %s' % path) - if isinstance(model, torch.nn.DataParallel): - state_dict = model.module.state_dict() - elif isinstance(model, torch.nn.parallel.DistributedDataParallel): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save(state_dict, path) - info_path = re.sub('.pt$', '.yaml', path) - if infos is None: - infos = {} - infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') - with open(info_path, 'w') as fout: - data = yaml.dump(infos) - fout.write(data) - - -def filter_modules(model_state_dict, modules): - new_mods = [] - incorrect_mods = [] - mods_model = model_state_dict.keys() - for mod in modules: - if any(key.startswith(mod) for key in mods_model): - new_mods += [mod] - else: - incorrect_mods += [mod] - if incorrect_mods: - logging.warning( - "module(s) %s don't match or (partially match) " - "available modules in model.", - incorrect_mods, - ) - logging.warning("for information, the existing modules in model are:") - logging.warning("%s", mods_model) - - return new_mods - - -def load_trained_modules(model: torch.nn.Module, args: None): - # Load encoder modules with pre-trained model(s). 
- enc_model_path = args.enc_init - enc_modules = args.enc_init_mods - main_state_dict = model.state_dict() - logging.warning("model(s) found for pre-initialization") - if os.path.isfile(enc_model_path): - logging.info('Checkpoint: loading from checkpoint %s for CPU' % - enc_model_path) - model_state_dict = torch.load(enc_model_path, map_location='cpu') - modules = filter_modules(model_state_dict, enc_modules) - partial_state_dict = OrderedDict() - for key, value in model_state_dict.items(): - if any(key.startswith(m) for m in modules): - partial_state_dict[key] = value - main_state_dict.update(partial_state_dict) - else: - logging.warning("model was not found : %s", enc_model_path) - - model.load_state_dict(main_state_dict) - configs = {} - return configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/cmvn.py deleted file mode 100644 index 3101c619f54991c947124f393f3459c317356a2f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/cmvn.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
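# --- Editorial sketch (not part of the deleted sources) ---------------------
# Illustrates how the checkpoint helpers removed above are typically used:
# save_checkpoint() writes the state dict plus a sibling .yaml info file, and
# load_checkpoint() restores it and returns that info dict. The toy model and
# the path "demo.pt" are hypothetical names chosen only for this example.
import torch

model = torch.nn.Linear(4, 2)
save_checkpoint(model, "demo.pt", infos={"epoch": 0})
configs = load_checkpoint(model, "demo.pt")  # also reads demo.yaml if present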
- -import json -import math - -import numpy as np - - -def _load_json_cmvn(json_cmvn_file): - """ Load the json format cmvn stats file and calculate cmvn - - Args: - json_cmvn_file: cmvn stats file in json format - - Returns: - a numpy array of [means, vars] - """ - with open(json_cmvn_file) as f: - cmvn_stats = json.load(f) - - means = cmvn_stats['mean_stat'] - variance = cmvn_stats['var_stat'] - count = cmvn_stats['frame_num'] - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def _load_kaldi_cmvn(kaldi_cmvn_file): - """ Load the kaldi format cmvn stats file and calculate cmvn - - Args: - kaldi_cmvn_file: kaldi text style global cmvn file, which - is generated by: - compute-cmvn-stats --binary=false scp:feats.scp global_cmvn - - Returns: - a numpy array of [means, vars] - """ - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def load_cmvn(cmvn_file, is_json): - if is_json: - cmvn = _load_json_cmvn(cmvn_file) - else: - cmvn = _load_kaldi_cmvn(cmvn_file) - return cmvn[0], cmvn[1] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/common.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/common.py deleted file mode 100644 index 74238d59aefbf227fe6b811703af17550bc7f8f0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/common.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -"""Unility functions for Transformer.""" - -import math -from typing import List, Tuple - -import torch -from torch.nn.utils.rnn import pad_sequence - -IGNORE_ID = -1 - - -def pad_list(xs: List[torch.Tensor], pad_value: int): - """Perform padding for the list of tensors. - - Args: - xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 
- pad_value (float): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tmax, `*`). - - Examples: - >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) - - """ - n_batch = len(xs) - max_len = max([x.size(0) for x in xs]) - pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) - pad = pad.fill_(pad_value) - for i in range(n_batch): - pad[i, :xs[i].size(0)] = xs[i] - - return pad - - -def add_blank(ys_pad: torch.Tensor, blank: int, - ignore_id: int) -> torch.Tensor: - """ Prepad blank for transducer predictor - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - blank (int): index of - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> blank = 0 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in = add_blank(ys_pad, 0, -1) - >>> ys_in - tensor([[0, 1, 2, 3, 4, 5], - [0, 4, 5, 6, 0, 0], - [0, 7, 8, 9, 0, 0]]) - """ - bs = ys_pad.size(0) - _blank = torch.tensor([blank], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _blank = _blank.repeat(bs).unsqueeze(1) # [bs,1] - out = torch.cat([_blank, ys_pad], dim=1) # [bs, Lmax+1] - return torch.where(out == ignore_id, blank, out) - - -def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, - ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Add and labels. - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - sos (int): index of - eos (int): index of - ignore_id (int): index of padding - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - ys_out (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> sos_id = 10 - >>> eos_id = 11 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) - >>> ys_in - tensor([[10, 1, 2, 3, 4, 5], - [10, 4, 5, 6, 11, 11], - [10, 7, 8, 9, 11, 11]]) - >>> ys_out - tensor([[ 1, 2, 3, 4, 5, 11], - [ 4, 5, 6, 11, -1, -1], - [ 7, 8, 9, 11, -1, -1]]) - """ - _sos = torch.tensor([sos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _eos = torch.tensor([eos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys - ys_in = [torch.cat([_sos, y], dim=0) for y in ys] - ys_out = [torch.cat([y, _eos], dim=0) for y in ys] - return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) - - -def reverse_pad_list(ys_pad: torch.Tensor, - ys_lens: torch.Tensor, - pad_value: float = -1.0) -> torch.Tensor: - """Reverse padding for the list of tensors. - - Args: - ys_pad (tensor): The padded tensor (B, Tokenmax). - ys_lens (tensor): The lens of token seqs (B) - pad_value (int): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tokenmax). - - Examples: - >>> x - tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) - >>> pad_list(x, 0) - tensor([[4, 3, 2, 1], - [7, 6, 5, 0], - [9, 8, 0, 0]]) - - """ - r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) - for y, i in zip(ys_pad, ys_lens)], True, - pad_value) - return r_ys_pad - - -def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, - ignore_label: int) -> float: - """Calculate accuracy. - - Args: - pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 
- pad_targets (LongTensor): Target label tensors (B, Lmax). - ignore_label (int): Ignore label id. - - Returns: - float: Accuracy value (0.0 - 1.0). - - """ - pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), - pad_outputs.size(1)).argmax(2) - mask = pad_targets != ignore_label - numerator = torch.sum( - pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - denominator = torch.sum(mask) - return float(numerator) / float(denominator) - - -def get_rnn(rnn_type: str) -> torch.nn.Module: - assert rnn_type in ["rnn", "lstm", "gru"] - if rnn_type == "rnn": - return torch.nn.RNN - elif rnn_type == "lstm": - return torch.nn.LSTM - else: - return torch.nn.GRU - - -def get_activation(act): - """Return activation function.""" - # Lazy load to avoid unused import - from wenet.transformer.swish import Swish - - activation_funcs = { - "hardtanh": torch.nn.Hardtanh, - "tanh": torch.nn.Tanh, - "relu": torch.nn.ReLU, - "selu": torch.nn.SELU, - "swish": getattr(torch.nn, "SiLU", Swish), - "gelu": torch.nn.GELU - } - - return activation_funcs[act]() - - -def get_subsample(config): - input_layer = config["encoder_conf"]["input_layer"] - assert input_layer in ["conv2d", "conv2d6", "conv2d8"] - if input_layer == "conv2d": - return 4 - elif input_layer == "conv2d6": - return 6 - elif input_layer == "conv2d8": - return 8 - - -def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - if hyp[cur] != 0: - new_hyp.append(hyp[cur]) - prev = cur - while cur < len(hyp) and hyp[cur] == hyp[prev]: - cur += 1 - return new_hyp - - -def replace_duplicates_with_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - new_hyp.append(hyp[cur]) - prev = cur - cur += 1 - while cur < len(hyp) and hyp[cur] == hyp[prev] and hyp[cur] != 0: - new_hyp.append(0) - cur += 1 - return new_hyp - - -def log_add(args: List[int]) -> float: - """ - Stable log add - """ - if all(a == -float('inf') for a in args): - return -float('inf') - a_max = max(args) - lsp = math.log(sum(math.exp(a - a_max) for a in args)) - return a_max + lsp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/config.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/config.py deleted file mode 100644 index 50170ced44534d3ee6532a2f87fcd78c5148f7e7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 Shaoshang Qi -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
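# --- Editorial sketch (not part of the deleted sources) ---------------------
# Shows how the helpers removed above fit into greedy CTC decoding: take the
# per-frame argmax and collapse repeats and blanks with
# remove_duplicates_and_blank() (blank id is hard-coded to 0 there).
# Tensor values below are toy examples.
import torch

log_probs = torch.log_softmax(torch.randn(6, 5), dim=-1)  # (T=6 frames, V=5 symbols)
frame_ids = log_probs.argmax(dim=-1).tolist()             # e.g. [0, 3, 3, 0, 2, 2]
hyp = remove_duplicates_and_blank(frame_ids)              # e.g. [3, 2]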
- - -import copy - -def override_config(configs, override_list): - new_configs = copy.deepcopy(configs) - for item in override_list: - arr = item.split() - if len(arr) != 2: - print(f"the overrive {item} format not correct, skip it") - continue - keys = arr[0].split('.') - s_configs = new_configs - for i, key in enumerate(keys): - if key not in s_configs: - print(f"the overrive {item} format not correct, skip it") - if i == len(keys) - 1: - param_type = type(s_configs[key]) - if param_type != bool: - s_configs[key] = param_type(arr[1]) - else: - s_configs[key] = arr[1] in ['true', 'True'] - print(f"override {arr[0]} with {arr[1]}") - else: - s_configs = s_configs[key] - return new_configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/ctc_util.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/ctc_util.py deleted file mode 100644 index 73b8fb272ac153dd6d05207f352ebcf1ad14890d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/ctc_util.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch - -def insert_blank(label, blank_id=0): - """Insert blank token between every two label token.""" - label = np.expand_dims(label, 1) - blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id - label = np.concatenate([blanks, label], axis=1) - label = label.reshape(-1) - label = np.append(label, label[0]) - return label - -def forced_align(ctc_probs: torch.Tensor, - y: torch.Tensor, - blank_id=0) -> list: - """ctc forced alignment. 
- - Args: - torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) - torch.Tensor y: id sequence tensor 1d tensor (L) - int blank_id: blank symbol index - Returns: - torch.Tensor: alignment result - """ - y_insert_blank = insert_blank(y, blank_id) - - log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) - log_alpha = log_alpha - float('inf') # log of zero - state_path = (torch.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 - ) # state path - - # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] - - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): - if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ - s] == y_insert_blank[s - 2]: - candidates = torch.tensor( - [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) - prev_state = [s, s - 1] - else: - candidates = torch.tensor([ - log_alpha[t - 1, s], - log_alpha[t - 1, s - 1], - log_alpha[t - 1, s - 2], - ]) - prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] - state_path[t, s] = prev_state[torch.argmax(candidates)] - - state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) - - candidates = torch.tensor([ - log_alpha[-1, len(y_insert_blank) - 1], - log_alpha[-1, len(y_insert_blank) - 2] - ]) - prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] - state_seq[-1] = prev_state[torch.argmax(candidates)] - for t in range(ctc_probs.size(0) - 2, -1, -1): - state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] - - output_alignment = [] - for t in range(0, ctc_probs.size(0)): - output_alignment.append(y_insert_blank[state_seq[t, 0]]) - - return output_alignment diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/executor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/executor.py deleted file mode 100644 index dc0b69e6e32055566a0e8c41945f6979276e5672..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/executor.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
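# --- Editorial sketch (not part of the deleted sources) ---------------------
# Usage of the forced_align() helper removed above: it expects per-frame
# log-probabilities of shape (T, D) and a 1-D tensor of label ids, and returns
# one symbol id (label or blank) per frame. Shapes and ids below are toy values.
import torch

ctc_probs = torch.log_softmax(torch.randn(8, 10), dim=-1)  # 8 frames, 10 symbols
labels = torch.tensor([3, 5, 5])                           # hypothetical transcript ids
alignment = forced_align(ctc_probs, labels, blank_id=0)    # list of 8 ids, blanks interleaved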
- -import logging -from contextlib import nullcontext - -# if your python version < 3.7 use the below one -# from contextlib import suppress as nullcontext -import torch -from torch.nn.utils import clip_grad_norm_ - - -class Executor: - - def __init__(self): - self.step = 0 - - def train(self, model, optimizer, scheduler, data_loader, device, writer, - args, scaler): - ''' Train one epoch - ''' - model.train() - clip = args.get('grad_clip', 50.0) - log_interval = args.get('log_interval', 10) - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - accum_grad = args.get('accum_grad', 1) - is_distributed = args.get('is_distributed', True) - use_amp = args.get('use_amp', False) - logging.info('using accumulate grad, new batch size is {} times' - ' larger than before'.format(accum_grad)) - if use_amp: - assert scaler is not None - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - model_context = model.join - else: - model_context = nullcontext - num_seen_utts = 0 - with model_context(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - context = None - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if is_distributed and batch_idx % accum_grad != 0: - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - with context(): - # autocast context - # The more details about amp can be found in - # https://pytorch.org/docs/stable/notes/amp_examples.html - with torch.cuda.amp.autocast(scaler is not None): - loss_dict = model(feats, feats_lengths, target, - target_lengths) - loss = loss_dict['loss'] / accum_grad - if use_amp: - scaler.scale(loss).backward() - else: - loss.backward() - - num_seen_utts += num_utts - if batch_idx % accum_grad == 0: - if rank == 0 and writer is not None: - writer.add_scalar('train_loss', loss, self.step) - # Use mixed precision training - if use_amp: - scaler.unscale_(optimizer) - grad_norm = clip_grad_norm_(model.parameters(), clip) - # Must invoke scaler.update() if unscale_() is used in - # the iteration to avoid the following error: - # RuntimeError: unscale_() has already been called - # on this optimizer since the last update(). - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). 
- scaler.step(optimizer) - scaler.update() - else: - grad_norm = clip_grad_norm_(model.parameters(), clip) - if torch.isfinite(grad_norm): - optimizer.step() - optimizer.zero_grad() - scheduler.step() - self.step += 1 - if batch_idx % log_interval == 0: - lr = optimizer.param_groups[0]['lr'] - log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, - loss.item() * accum_grad) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'lr {:.8f} rank {}'.format(lr, rank) - logging.debug(log_str) - - def cv(self, model, data_loader, device, args): - ''' Cross validation on - ''' - model.eval() - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - log_interval = args.get('log_interval', 10) - # in order to avoid division by 0 - num_seen_utts = 1 - total_loss = 0.0 - with torch.no_grad(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - loss_dict = model(feats, feats_lengths, target, target_lengths) - loss = loss_dict['loss'] - if torch.isfinite(loss): - num_seen_utts += num_utts - total_loss += loss.item() * num_utts - if batch_idx % log_interval == 0: - log_str = 'CV Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, loss.item()) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'history loss {:.6f}'.format(total_loss / - num_seen_utts) - log_str += ' rank {}'.format(rank) - logging.debug(log_str) - return total_loss, num_seen_utts diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/file_utils.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/file_utils.py deleted file mode 100644 index 7b7e516cc61f759267f4ef09309ff0b45110a0c1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/file_utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re - - -def read_lists(list_file): - lists = [] - with open(list_file, 'r', encoding='utf8') as fin: - for line in fin: - lists.append(line.strip()) - return lists - - -def read_non_lang_symbols(non_lang_sym_path): - """read non-linguistic symbol from file. - - The file format is like below: - - {NOISE}\n - {BRK}\n - ... - - - Args: - non_lang_sym_path: non-linguistic symbol file path, None means no any - syms. 
- - """ - if non_lang_sym_path is None: - return None - else: - syms = read_lists(non_lang_sym_path) - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - for sym in syms: - if non_lang_syms_pattern.fullmatch(sym) is None: - class BadSymbolFormat(Exception): - pass - raise BadSymbolFormat( - "Non-linguistic symbols should be " - "formatted in {xxx}//[xxx], consider" - " modify '%s' to meet the requirment. " - "More details can be found in discussions here : " - "https://github.com/wenet-e2e/wenet/pull/819" % (sym)) - return syms - - -def read_symbol_table(symbol_table_file): - symbol_table = {} - with open(symbol_table_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - symbol_table[arr[0]] = int(arr[1]) - return symbol_table diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/init_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/init_model.py deleted file mode 100644 index 4a008183ee25cd88b2fa25d93bdc3f9e3a55d31a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/init_model.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
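# --- Editorial sketch (not part of the deleted sources) ---------------------
# Input format expected by read_symbol_table() removed above: one
# "<symbol> <integer id>" pair per line. The file name "units.txt" and the
# entries are hypothetical.
with open("units.txt", "w", encoding="utf8") as f:
    f.write("<blank> 0\n<unk> 1\n你 2\n好 3\n")

symbol_table = read_symbol_table("units.txt")
# -> {'<blank>': 0, '<unk>': 1, '你': 2, '好': 3}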
- -import torch -from wenet.transducer.joint import TransducerJoint -from wenet.transducer.predictor import (ConvPredictor, EmbeddingPredictor, - RNNPredictor) -from wenet.transducer.transducer import Transducer -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.cmvn import GlobalCMVN -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.encoder import ConformerEncoder, TransformerEncoder -from wenet.squeezeformer.encoder import SqueezeformerEncoder -from wenet.efficient_conformer.encoder import EfficientConformerEncoder -from wenet.utils.cmvn import load_cmvn - - -def init_model(configs): - if configs['cmvn_file'] is not None: - mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) - global_cmvn = GlobalCMVN( - torch.from_numpy(mean).float(), - torch.from_numpy(istd).float()) - else: - global_cmvn = None - - input_dim = configs['input_dim'] - vocab_size = configs['output_dim'] - - encoder_type = configs.get('encoder', 'conformer') - decoder_type = configs.get('decoder', 'bitransformer') - - if encoder_type == 'conformer': - encoder = ConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'squeezeformer': - encoder = SqueezeformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'efficientConformer': - encoder = EfficientConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf'], - **configs['encoder_conf']['efficient_conf'] - if 'efficient_conf' in - configs['encoder_conf'] else {}) - else: - encoder = TransformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - if decoder_type == 'transformer': - decoder = TransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - else: - assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 - assert configs['decoder_conf']['r_num_blocks'] > 0 - decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - ctc = CTC(vocab_size, encoder.output_size()) - - # Init joint CTC/Attention or Transducer model - if 'predictor' in configs: - predictor_type = configs.get('predictor', 'rnn') - if predictor_type == 'rnn': - predictor = RNNPredictor(vocab_size, **configs['predictor_conf']) - elif predictor_type == 'embedding': - predictor = EmbeddingPredictor(vocab_size, - **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - elif predictor_type == 'conv': - predictor = ConvPredictor(vocab_size, **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - else: - raise NotImplementedError( - "only rnn, embedding and conv type support now") - configs['joint_conf']['enc_output_size'] = configs['encoder_conf'][ - 'output_size'] - configs['joint_conf']['pred_output_size'] = configs['predictor_conf'][ - 'output_size'] - joint = TransducerJoint(vocab_size, **configs['joint_conf']) - model = Transducer(vocab_size=vocab_size, - blank=0, - predictor=predictor, - encoder=encoder, - attention_decoder=decoder, - joint=joint, - ctc=ctc, - **configs['model_conf']) - else: - model = ASRModel(vocab_size=vocab_size, - encoder=encoder, - decoder=decoder, - ctc=ctc, - **configs['model_conf']) - return model diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/mask.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/mask.py deleted file mode 100644 index 2985006ab2bc2d27a9b8adaeb863cc44ca6a0d24..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/mask.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -''' -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. - - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - ret = torch.ones(size, size, device=device, dtype=torch.bool) - return torch.tril(ret) -''' - -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. 
- - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - arange = torch.arange(size, device=device) - mask = arange.expand(size, size) - arange = arange.unsqueeze(-1) - mask = mask <= arange - return mask - - -def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size) with chunk size, - this is for streaming encoder - - Args: - size (int): size of mask - chunk_size (int): size of chunk - num_left_chunks (int): number of left chunks - <0: use full chunk - >=0: use num_left_chunks - device (torch.device): "cpu" or "cuda" or torch.Tensor.device - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_chunk_mask(4, 2) - [[1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]] - """ - ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) - ret[i, start:ending] = True - return ret - - -def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, - use_dynamic_chunk: bool, - use_dynamic_left_chunk: bool, - decoding_chunk_size: int, static_chunk_size: int, - num_decoding_left_chunks: int): - """ Apply optional mask for encoder. - - Args: - xs (torch.Tensor): padded input, (B, L, D), L for max length - mask (torch.Tensor): mask for xs, (B, 1, L) - use_dynamic_chunk (bool): whether to use dynamic chunk or not - use_dynamic_left_chunk (bool): whether to use dynamic left chunk for - training. - decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - static_chunk_size (int): chunk size for static chunk training/decoding - if it's greater than 0, if use_dynamic_chunk is true, - this parameter will be ignored - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. - >=0: use num_decoding_left_chunks - <0: use all left chunks - - Returns: - torch.Tensor: chunk mask of the input xs. - """ - # Whether to use chunk mask or not - if use_dynamic_chunk: - max_len = xs.size(1) - if decoding_chunk_size < 0: - chunk_size = max_len - num_left_chunks = -1 - elif decoding_chunk_size > 0: - chunk_size = decoding_chunk_size - num_left_chunks = num_decoding_left_chunks - else: - # chunk size is either [1, 25] or full context(max_len). - # Since we use 4 times subsampling and allow up to 1s(100 frames) - # delay, the maximum frame is 100 / 4 = 25. 
- chunk_size = torch.randint(1, max_len, (1, )).item() - num_left_chunks = -1 - if chunk_size > max_len // 2: - chunk_size = max_len - else: - chunk_size = chunk_size % 25 + 1 - if use_dynamic_left_chunk: - max_left_chunks = (max_len - 1) // chunk_size - num_left_chunks = torch.randint(0, max_left_chunks, - (1, )).item() - chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - elif static_chunk_size > 0: - num_left_chunks = num_decoding_left_chunks - chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - else: - chunk_masks = masks - return chunk_masks - - -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: Mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - batch_size = lengths.size(0) - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = torch.arange(0, - max_len, - dtype=torch.int64, - device=lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand - return mask - - -def make_non_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """Make mask tensor containing indices of non-padded part. - - The sequences in a batch may have different lengths. To enable - batch computing, padding is need to make all sequence in same - size. To avoid the padding part pass value to context dependent - block such as attention or convolution , this padding part is - masked. - - This pad_mask is used in both encoder and decoder. - - 1 for non-padded part and 0 for padded part. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] - """ - return ~make_pad_mask(lengths) - - -def mask_finished_scores(score: torch.Tensor, - flag: torch.Tensor) -> torch.Tensor: - """ - If a sequence is finished, we only allow one alive branch. This function - aims to give one branch a zero score and the rest -inf score. - - Args: - score (torch.Tensor): A real value array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size, beam_size). 
- """ - beam_size = score.size(-1) - zero_mask = torch.zeros_like(flag, dtype=torch.bool) - if beam_size > 1: - unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])), - dim=1) - finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])), - dim=1) - else: - unfinished = zero_mask - finished = flag - score.masked_fill_(unfinished, -float('inf')) - score.masked_fill_(finished, 0) - return score - - -def mask_finished_preds(pred: torch.Tensor, flag: torch.Tensor, - eos: int) -> torch.Tensor: - """ - If a sequence is finished, all of its branch should be - - Args: - pred (torch.Tensor): A int array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size). - """ - beam_size = pred.size(-1) - finished = flag.repeat([1, beam_size]) - return pred.masked_fill_(finished, eos) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/scheduler.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/scheduler.py deleted file mode 100644 index c418a731dec0041a238787bbba23102dba8db5e5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/hkust/s0/wenet/utils/scheduler.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -# NeMo(https://github.com/NVIDIA/NeMo) - -from typing import Union - -import math -import warnings -import torch -from torch.optim.lr_scheduler import _LRScheduler - -from typeguard import check_argument_types - - -class WarmupLR(_LRScheduler): - """The WarmupLR scheduler - - This scheduler is almost same as NoamLR Scheduler except for following - difference: - - NoamLR: - lr = optimizer.lr * model_size ** -0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - WarmupLR: - lr = optimizer.lr * warmup_step ** 0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - - Note that the maximum lr equals to optimizer.lr in this scheduler. 
- - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_steps: Union[int, float] = 25000, - last_epoch: int = -1, - ): - assert check_argument_types() - self.warmup_steps = warmup_steps - - # __init__() must be invoked before setting field - # because step() is also invoked in __init__() - super().__init__(optimizer, last_epoch) - - def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" - - def get_lr(self): - step_num = self.last_epoch + 1 - if self.warmup_steps == 0: - return [ - lr * step_num ** -0.5 - for lr in self.base_lrs - ] - else: - return [ - lr - * self.warmup_steps ** 0.5 - * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) - for lr in self.base_lrs - ] - - def set_step(self, step: int): - self.last_epoch = step - - -class WarmupPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1): - assert not (warmup_steps is not None and warmup_ratio is not None),\ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class SquareRootConstantPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use particular number of step or ratio" - assert constant_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. 
- self.max_steps = max_steps - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.constant_lr = 1 / (constant_steps ** 0.5) - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.constant_steps: - return [self.constant_lr for _ in self.base_lrs] - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class WarmupHoldPolicy(WarmupPolicy): - """Variant of WarmupPolicy which maintains high - learning rate for a defined number of steps. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - hold_steps=None, - hold_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (hold_steps is not None and hold_ratio is not None), \ - "Either use particular number of step or ratio" - assert hold_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - self.min_lr = min_lr - self._last_warmup_lr = 0.0 - - # Necessary to duplicate as class attributes are hidden in inner class - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if hold_steps is not None: - self.hold_steps = hold_steps + self.warmup_steps - elif hold_ratio is not None: - self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps - else: - self.hold_steps = 0 - - super().__init__( - optimizer, - warmup_steps=warmup_steps, - warmup_ratio=warmup_ratio, - max_steps=max_steps, - last_epoch=last_epoch, - min_lr=min_lr, - ) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed by the scheduler," - " " "please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup phase - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - # Hold phase - if (step >= self.warmup_steps) and (step < self.hold_steps): - return self.base_lrs - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - -class WarmupAnnealHoldPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - min_lr: Minimum lr to hold the learning rate after decay at. - constant_steps: Number of steps to keep lr constant at. 
- constant_ratio: Ratio of steps to keep lr constant. - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - constant_steps=None, - constant_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use constant_steps or constant_ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup steps - if self.warmup_steps > 0 and step <= self.warmup_steps: - return self._get_warmup_lr(step) - - # Constant steps after warmup and decay - if self.constant_steps > 0 and ( - self.warmup_steps + self.decay_steps) < step <= self.max_steps: - return self._get_constant_lr(step) - - # Min lr after max steps of updates - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_constant_lr(self, step): - return [self.min_lr for _ in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -def _squareroot_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 0.5 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _square_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 2 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _cosine_annealing(initial_lr, step, max_steps, min_lr): - mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) - out_lr = (initial_lr - min_lr) * mult + min_lr - return out_lr - - -def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, - decay_steps, min_lr): - assert max_lr > min_lr - # Use linear warmup for the initial part. - if warmup_steps > 0 and step <= warmup_steps: - return max_lr * float(step) / float(warmup_steps) - - # For any steps larger than `decay_steps`, use `min_lr`. - if step > warmup_steps + decay_steps: - return min_lr - - # If we are done with the warmup period, use the decay style. 
- num_steps_ = step - warmup_steps - decay_steps_ = decay_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = max_lr - min_lr - - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - - return min_lr + coeff * delta_lr - - -def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): - if cycle: - multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) - decay_steps *= multiplier - else: - step = min(step, decay_steps) - p = step / decay_steps - lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) - lr += min_lr - return lr - - -def _noam_hold_annealing(initial_lr, step, warmup_steps, - hold_steps, decay_rate, min_lr): - # hold_steps = total number of steps - # to hold the LR, not the warmup + hold steps. - T_warmup_decay = max(1, warmup_steps ** decay_rate) - T_hold_decay = max(1, (step - hold_steps) ** decay_rate) - lr = (initial_lr * T_warmup_decay) / T_hold_decay - lr = max(lr, min_lr) - return lr - - -class SquareAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _square_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class SquareRootAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _squareroot_annealing(initial_lr=initial_lr, step=step, - max_steps=self.max_steps, min_lr=self.min_lr) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - if self.constant_steps is None or self.constant_steps == 0: - new_lrs = [ - _cosine_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - else: - new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) - return new_lrs - - def _get_warmup_lr(self, step): - if self.constant_steps is None or self.constant_steps == 0: - return super()._get_warmup_lr(step) - else: - # Use linear warmup for the initial part. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_constant_lr(self, step): - # Only called when `constant_steps` > 0. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_linear_warmup_with_cosine_annealing_lr(self, step): - # Cosine Schedule for Megatron LM, - # slightly different warmup schedule + constant LR at the end. 
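The annealing helpers above are plain functions of the step index, so their shape is easy to inspect. The following standalone sketch copies the `_cosine_annealing` expression and evaluates it on an assumed toy schedule (peak lr 1e-3, floor 1e-5, 100 decay steps); the numbers are for illustration only.

```python
import math

def cosine_annealing(initial_lr: float, step: int, max_steps: int, min_lr: float) -> float:
    # Same expression as _cosine_annealing above.
    mult = 0.5 * (1 + math.cos(math.pi * step / max_steps))
    return (initial_lr - min_lr) * mult + min_lr

for step in (0, 25, 50, 75, 100):
    print(step, f"{cosine_annealing(1e-3, step, 100, 1e-5):.2e}")
# step 0 returns the full 1e-3, step 100 returns the 1e-5 floor, and the values
# in between follow the half-cosine that CosineAnnealing applies after warmup.
```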
- new_lrs = [ - _linear_warmup_with_cosine_annealing( - max_lr=self.base_lrs[0], - warmup_steps=self.warmup_steps, - step=step, - decay_steps=self.decay_steps, - min_lr=self.min_lr, - ) - for _ in self.base_lrs - ] - return new_lrs - - -class NoamAnnealing(_LRScheduler): - def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - self._normalize = d_model ** (-0.5) - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = max(1, self.last_epoch) - - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - new_lrs = [self._noam_annealing(initial_lr=initial_lr, step=step) for - initial_lr in self.base_lrs] - return new_lrs - - def _noam_annealing(self, initial_lr, step): - if self.warmup_steps > 0: - mult = self._normalize * min(step ** (-0.5), - step * (self.warmup_steps ** (-1.5))) - else: - mult = self._normalize * step ** (-0.5) - - out_lr = initial_lr * mult - if step > self.warmup_steps: - out_lr = max(out_lr, self.min_lr) - return out_lr - - -class NoamHoldAnnealing(WarmupHoldPolicy): - def __init__(self, optimizer, *, max_steps, decay_rate=0.5, min_lr=0.0, - last_epoch=-1, **kwargs): - """ - From Nemo: - Implementation of the Noam Hold Annealing policy - from the SqueezeFormer paper. - - Unlike NoamAnnealing, the peak learning rate - can be explicitly set for this scheduler. - The schedule first performs linear warmup, - then holds the peak LR, then decays with some schedule for - the remainder of the steps. - Therefore the min-lr is still dependent - on the hyper parameters selected. - - It's schedule is determined by three factors- - - Warmup Steps: Initial stage, where linear warmup - occurs uptil the peak LR is reached. Unlike NoamAnnealing, - the peak LR is explicitly stated here instead of a scaling factor. - - Hold Steps: Intermediate stage, where the peak LR - is maintained for some number of steps. In this region, - the high peak LR allows the model to converge faster - if training is stable. However the high LR - may also cause instability during training. - Should usually be a significant fraction of training - steps (around 30-40% of the entire training steps). - - Decay Steps: Final stage, where the LR rapidly decays - with some scaling rate (set by decay rate). - To attain Noam decay, use 0.5, - for Squeezeformer recommended decay, use 1.0. - The fast decay after prolonged high LR during - hold phase allows for rapid convergence. 
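To make the warmup / hold / decay description above concrete, here is a minimal sketch of the resulting curve. The peak lr, warmup length, and hold length are assumptions for illustration; the decay branch reuses the `_noam_hold_annealing` formula defined earlier with decay_rate 0.5, i.e. Noam-style decay.

```python
def noam_hold_annealing(initial_lr, step, warmup_steps, hold_steps, decay_rate, min_lr):
    # Same expression as _noam_hold_annealing above.
    T_warmup_decay = max(1, warmup_steps ** decay_rate)
    T_hold_decay = max(1, (step - hold_steps) ** decay_rate)
    return max(initial_lr * T_warmup_decay / T_hold_decay, min_lr)

peak_lr, warmup, hold_end = 1e-3, 1000, 4000   # assumed values, not from any recipe
for step in (500, 1000, 2500, 4000, 8000, 40000):
    if step <= warmup:
        lr = peak_lr * step / warmup           # linear ramp up to the explicit peak
    elif step <= hold_end:
        lr = peak_lr                           # hold phase: stay at the peak
    else:
        lr = noam_hold_annealing(peak_lr, step, warmup, hold_end - warmup, 0.5, 1e-5)
    print(step, f"{lr:.2e}")
```

The decay resumes exactly at the peak value when the hold phase ends and then falls off as the square root of the elapsed steps, which is the "fast decay after prolonged high LR" behaviour described above.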
- - References: - - [Squeezeformer: - An Efficient Transformer for Automatic Speech Recognition] - (https://arxiv.org/abs/2206.00888) - - Args: - optimizer: Pytorch compatible Optimizer object. - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - decay_rate: Float value describing the polynomial decay - after the hold period. Default value - of 0.5 corresponds to Noam decay. - min_lr: Minimum learning rate. - """ - self.decay_rate = decay_rate - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - if self.warmup_steps is None or self.warmup_steps == 0: - raise ValueError( - "Noam scheduler cannot be used without warmup steps") - - if self.hold_steps > 0: - hold_steps = self.hold_steps - self.warmup_steps - else: - hold_steps = 0 - - new_lrs = [ - _noam_hold_annealing( - initial_lr, - step=step, - warmup_steps=self.warmup_steps, - hold_steps=hold_steps, - decay_rate=self.decay_rate, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - def set_step(self, step: int): - self.last_epoch = step diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/README.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/README.md deleted file mode 100644 index 0a491eeb453db8e87c710fb1c075a76422433eb9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# Performance Record - -## Conformer Bidecoder Transducer Result - -* Feature info: using fbank feature, dither, cmvn, online speed perturb -* Training info: lr 0.001, dynamic batch with max_frames_in_batch 4000, 8 gpu, acc_grad 1, 60 epochs -* Training weight info: transducer_weight 0.75, ctc_weight 0.1, reverse_weight 0.30, average_num 10 -* Predictor type: lstm - -| decoding mode | dev_clean | dev_other | test_clean | test_other | -|-----------------------|------------|-----------|------------|------------| -| rnnt_greedy_search | 3.42% | 8.99% | 3.56% | 9.15% | -| rnnt_beam_search | 3.35% | 8.77% | 3.45% | 8.78% | -| rnnt_beam_att_rescore | 3.25% | 8.66% | 3.41% | 8.68% | - -Pretrained model: https://huggingface.co/yuekai/wenet-asr-librispeech-conformer-transducer-mtl/blob/main/exp/conformer_transducer/avg_10.pt - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/conf/conformer_rnnt.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/conf/conformer_rnnt.yaml deleted file mode 100644 index 8a517cccae85d5d26b5fafefc8bd97f4118060d2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/conf/conformer_rnnt.yaml +++ /dev/null @@ -1,100 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - input_layer: conv2d # encoder input type, you can chose conv2d, 
conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: true - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - - -joint_conf: - join_dim: 512 - prejoin_linear: True - postjoin_linear: false - joint_mode: 'add' - activation: 'tanh' - -predictor: rnn -predictor_conf: - embed_size: 256 - output_size: 256 - embed_dropout: 0.1 - hidden_size: 256 - num_layers: 2 - bias: true - rnn_type: 'lstm' - dropout: 0.1 - -decoder: bitransformer -decoder_conf: - attention_heads: 4 - dropout_rate: 0.1 - linear_units: 2048 - num_blocks: 3 - positional_dropout_rate: 0.1 - r_num_blocks: 3 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -# hybrid transducer+ctc+attention -model_conf: - transducer_weight: 0.75 - ctc_weight: 0.1 - attention_weight: 0.15 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -dataset_conf: - filter_conf: - max_length: 1650 - min_length: 10 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'dynamic' # static or dynamic - max_frames_in_batch: 4000 - -grad_clip: 4 -accum_grad: 1 -max_epoch: 140 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/local/data_prep_torchaudio.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/local/data_prep_torchaudio.sh deleted file mode 100644 index c7dc1deb7dec59f571c8f6935fe36c6aea2e8e99..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/local/data_prep_torchaudio.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -# Copyright 2014 Vassil Panayotov -# 2014 Johns Hopkins University (author: Daniel Povey) -# Apache 2.0 - -if [ "$#" -ne 2 ]; then - echo "Usage: $0 " - echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean" - exit 1 -fi - -src=$1 -dst=$2 - -# all utterances are FLAC compressed -if ! which flac >&/dev/null; then - echo "Please install 'flac' on ALL worker nodes!" - exit 1 -fi - -mkdir -p $dst || exit 1 - -[ ! -d $src ] && echo "$0: no such directory $src" && exit 1 - -wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp -trans=$dst/text; [[ -f "$trans" ]] && rm $trans - -for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do - reader=$(basename $reader_dir) - if ! [ $reader -eq $reader ]; then # not integer. - echo "$0: unexpected subdirectory name $reader" - exit 1 - fi - - for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do - chapter=$(basename $chapter_dir) - if ! [ "$chapter" -eq "$chapter" ]; then - echo "$0: unexpected chapter-subdirectory name $chapter" - exit 1 - fi - - find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \ - awk -v "dir=$chapter_dir" '{printf "%s %s/%s.flac\n", $0, dir, $0}' >>$wav_scp|| exit 1 - - chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt - [ ! 
-f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1 - cat $chapter_trans >>$trans - done -done - -echo "$0: successfully prepared data in $dst" - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/local/download_and_untar.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/local/download_and_untar.sh deleted file mode 100644 index cd32fb6b989d7229272f1066a75a1688df2bf06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/local/download_and_untar.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2014 Johns Hopkins University (author: Daniel Povey) -# Apache 2.0 - -remove_archive=false - -if [ "$1" == --remove-archive ]; then - remove_archive=true - shift -fi - -if [ $# -ne 3 ]; then - echo "Usage: $0 [--remove-archive] " - echo "e.g.: $0 /export/a15/vpanayotov/data www.openslr.org/resources/11 dev-clean" - echo "With --remove-archive it will remove the archive after successfully un-tarring it." - echo " can be one of: dev-clean, test-clean, dev-other, test-other," - echo " train-clean-100, train-clean-360, train-other-500." - exit 1 -fi - -data=$1 -url=$2 -part=$3 - -if [ ! -d "$data" ]; then - echo "$0: no such directory $data" - exit 1 -fi - -part_ok=false -list="dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500" -for x in $list; do - if [ "$part" == $x ]; then part_ok=true; fi -done -if ! $part_ok; then - echo "$0: expected to be one of $list, but got '$part'" - exit 1 -fi - -if [ -z "$url" ]; then - echo "$0: empty URL base." - exit 1 -fi - -if [ -f $data/LibriSpeech/$part/.complete ]; then - echo "$0: data part $part was already successfully extracted, nothing to do." - exit 0 -fi - - -# sizes of the archive files in bytes. This is some older versions. -sizes_old="371012589 347390293 379743611 361838298 6420417880 23082659865 30626749128" -# sizes_new is the archive file sizes of the final release. Some of these sizes are of -# things we probably won't download. -sizes_new="337926286 314305928 695964615 297279345 87960560420 33373768 346663984 328757843 6387309499 23049477885 30593501606" - -if [ -f $data/$part.tar.gz ]; then - size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}') - size_ok=false - for s in $sizes_old $sizes_new; do if [ $s == $size ]; then size_ok=true; fi; done - if ! $size_ok; then - echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size" - echo "does not equal the size of one of the archives." - rm $data/$part.tar.gz - else - echo "$data/$part.tar.gz exists and appears to be complete." - fi -fi - -if [ ! -f $data/$part.tar.gz ]; then - if ! which wget >/dev/null; then - echo "$0: wget is not installed." - exit 1 - fi - full_url=$url/$part.tar.gz - echo "$0: downloading data from $full_url. This may take some time, please be patient." - - if ! wget -P $data --no-check-certificate $full_url; then - echo "$0: error executing wget $full_url" - exit 1 - fi -fi - -if ! tar -C $data -xvzf $data/$part.tar.gz; then - echo "$0: error un-tarring archive $data/$part.tar.gz" - exit 1 -fi - -touch $data/LibriSpeech/$part/.complete - -echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz" - -if $remove_archive; then - echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied." 
- rm $data/$part.tar.gz -fi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/path.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/path.sh deleted file mode 100644 index ac1ca08baf5d4540b92ed239b8aa7cd613064a8c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export WENET_DIR=$PWD/../../.. -export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build -export OPENFST_BIN=${BUILD_DIR}/../fc_base/openfst-build/src -export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_BIN}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/run.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/run.sh deleted file mode 100644 index a7a30cc29aff691e52b79c0cd26384a114c30a04..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/run.sh +++ /dev/null @@ -1,286 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. - -. ./path.sh || exit 1; - -# Use this to control how many gpu you use, It's 1-gpu training if you specify -# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -stage=-1 # start from 0 if you need to start from data preparation -stop_stage=7 -# data -data_url=www.openslr.org/resources/12 -# data_url=https://us.openslr.org/resources/12 -data_url=https://openslr.elda.org/resources/12 -# use your own data path -datadir= - -# wav data dir -wave_data=data -# Optional train_config -# 1. conf/train_transformer_large.yaml: Standard transformer -train_config=conf/conformer_rnnt.yaml -checkpoint= -cmvn=true -do_delta=false - -dir=exp/conformer_transducer - -# use average_checkpoint will get better result -average_checkpoint=true -decode_checkpoint=$dir/final.pt -# maybe you can try to adjust it if you can not get close results as README.md -average_num=10 -decode_modes="attention_rescoring ctc_greedy_search ctc_prefix_beam_search attention" - -. tools/parse_options.sh || exit 1; - -# bpemode (unigram or bpe) -nbpe=5000 -bpemode=unigram - -set -e -set -u -set -o pipefail - -train_set=train_960 -dev_set=dev -recog_set="test_clean test_other dev_clean dev_other" - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - echo "stage -1: Data Download" - for part in train-clean-100 train-clean-360 train-other-500; do - local/download_and_untar.sh ${datadir} ${data_url} ${part} - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - ### Task dependent. You have to make data the following preparation part by yourself. - ### But you can utilize Kaldi recipes in most cases - echo "stage 0: Data preparation" - for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do - # use underscore-separated names in data directories. - local/data_prep_torchaudio.sh ${datadir}/LibriSpeech/${part} $wave_data/${part//-/_} - done -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - ### Task dependent. You have to design training and dev sets by yourself. 
- ### But you can utilize Kaldi recipes in most cases - echo "stage 1: Feature Generation" - mkdir -p $wave_data/train_960 - # merge total training data - for set in train_clean_100 train_clean_360 train_other_500; do - for f in `ls $wave_data/$set`; do - cat $wave_data/$set/$f >> $wave_data/train_960/$f - done - done - mkdir -p $wave_data/dev - # merge total dev data - for set in dev_clean dev_other; do - for f in `ls $wave_data/$set`; do - cat $wave_data/$set/$f >> $wave_data/$dev_set/$f - done - done - - tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \ - --in_scp $wave_data/$train_set/wav.scp \ - --out_cmvn $wave_data/$train_set/global_cmvn - -fi - - -dict=$wave_data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt -bpemodel=$wave_data/lang_char/${train_set}_${bpemode}${nbpe} -echo "dictionary: ${dict}" -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - ### Task dependent. You have to check non-linguistic symbols used in the corpus. - echo "stage 2: Dictionary and Json Data Preparation" - mkdir -p data/lang_char/ - - echo " 0" > ${dict} # 0 will be used for "blank" in CTC - echo " 1" >> ${dict} # must be 1 - - # we borrowed these code and scripts which are related bpe from ESPnet. - cut -f 2- -d" " $wave_data/${train_set}/text > $wave_data/lang_char/input.txt - tools/spm_train --input=$wave_data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 - tools/spm_encode --model=${bpemodel}.model --output_format=piece < $wave_data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict} - num_token=$(cat $dict | wc -l) - echo " $num_token" >> $dict # - wc -l ${dict} -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # Prepare wenet required data - echo "Prepare data, prepare required format" - for x in $dev_set ${recog_set} $train_set ; do - tools/make_raw_list.py $wave_data/$x/wav.scp $wave_data/$x/text \ - $wave_data/$x/data.list - done - -fi - - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - # Training - mkdir -p $dir - INIT_FILE=$dir/ddp_init - rm -f $INIT_FILE # delete old one before starting - init_method=file://$(readlink -f $INIT_FILE) - echo "$0: init method is $init_method" - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - # Use "nccl" if it works, otherwise use "gloo" - dist_backend="gloo" - dist_backend="nccl" - cmvn_opts= - $cmvn && cmvn_opts="--cmvn $wave_data/${train_set}/global_cmvn" - # train.py will write $train_config to $dir/train.yaml with model input - # and output dimension, train.yaml will be used for inference or model - # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - python3 wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type raw \ - --symbol_table $dict \ - --bpe_model ${bpemodel}.model \ - --train_data $wave_data/$train_set/data.list \ - --cv_data $wave_data/$dev_set/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $num_gpus \ - --ddp.rank $i \ - --ddp.dist_backend $dist_backend \ - --num_workers 4 \ - $cmvn_opts \ - --pin_memory - } & - done - wait -fi - -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # Test model, please specify the model you want to test by --checkpoint - cmvn_opts= - $cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn" - # TODO, Add model average here - mkdir -p $dir/test - 
if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$dir/avg_${average_num}.pt - echo "do model average and final checkpoint is $decode_checkpoint" - python wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $dir \ - --num ${average_num} \ - --val_best - fi - # Specify decoding_chunk_size if it's a unified dynamic chunk trained model - # -1 for full chunk - decoding_chunk_size= - ctc_weight=0.5 - # Polling GPU id begin with index 0 - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - idx=0 - for test in $recog_set; do - for mode in ${decode_modes}; do - { - { - test_dir=$dir/${test}_${mode} - mkdir -p $test_dir - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1]) - python wenet/bin/recognize.py --gpu $gpu_id \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type raw \ - --dict $dict \ - --bpe_model ${bpemodel}.model \ - --test_data $wave_data/$test/data.list \ - --checkpoint $decode_checkpoint \ - --beam_size 10 \ - --batch_size 1 \ - --penalty 0.0 \ - --result_file $test_dir/text_bpe \ - --ctc_weight $ctc_weight \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - - cut -f2- -d " " $test_dir/text_bpe > $test_dir/text_bpe_value_tmp - cut -f1 -d " " $test_dir/text_bpe > $test_dir/text_bpe_key_tmp - tools/spm_decode --model=${bpemodel}.model --input_format=piece \ - < $test_dir/text_bpe_value_tmp | sed -e "s/▁/ /g" > $test_dir/text_value_tmp - paste -d " " $test_dir/text_bpe_key_tmp $test_dir/text_value_tmp > $test_dir/text - - python tools/compute-wer.py --char=1 --v=1 \ - $wave_data/$test/text $test_dir/text > $test_dir/wer - } & - - ((idx+=1)) - if [ $idx -eq $num_gpus ]; then - idx=0 - fi - } - done - done - wait - -fi - -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # Export the best model you want - python wenet/bin/export_jit.py \ - --config $dir/train.yaml \ - --checkpoint $dir/avg_${average_num}.pt \ - --output_file $dir/final.zip -fi - -# Optionally, you can add LM and test it with runtime. -if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then - lm=data/local/lm - lexicon=data/local/dict/lexicon.txt - mkdir -p $lm - mkdir -p data/local/dict - - # 7.1 Download & format LM - which_lm=3-gram.pruned.1e-7.arpa.gz - if [ ! -e ${lm}/${which_lm} ]; then - wget http://www.openslr.org/resources/11/${which_lm} -P ${lm} - fi - echo "unzip lm($which_lm)..." - gunzip -k ${lm}/${which_lm} -c > ${lm}/lm.arpa - echo "Lm saved as ${lm}/lm.arpa" - - # 7.2 Prepare dict - unit_file=$dict - bpemodel=$bpemodel - # use $dir/words.txt (unit_file) and $dir/train_960_unigram5000 (bpemodel) - # if you download pretrained librispeech conformer model - cp $unit_file data/local/dict/units.txt - if [ ! -e ${lm}/librispeech-lexicon.txt ]; then - wget http://www.openslr.org/resources/11/librispeech-lexicon.txt -P ${lm} - fi - echo "build lexicon..." 
- tools/fst/prepare_dict.py $unit_file ${lm}/librispeech-lexicon.txt \ - $lexicon $bpemodel.model - echo "lexicon saved as '$lexicon'" - - # 7.3 Build decoding TLG - tools/fst/compile_lexicon_token_fst.sh \ - data/local/dict data/local/tmp data/local/lang - tools/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1; - - # 7.4 Decoding with runtime - fst_dir=data/lang_test - for test in ${recog_set}; do - ./tools/decode.sh --nj 6 \ - --beam 10.0 --lattice_beam 5 --max_active 7000 --blank_skip_thresh 0.98 \ - --ctc_weight 0.5 --rescoring_weight 1.0 --acoustic_scale 1.2 \ - --fst_path $fst_dir/TLG.fst \ - --dict_path $fst_dir/words.txt \ - data/$test/wav.scp data/$test/text $dir/final.zip $fst_dir/units.txt \ - $dir/lm_with_runtime_${test} - tail $dir/lm_with_runtime_${test}/wer - done -fi - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/alignment.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/alignment.sh deleted file mode 100644 index 64d860bb61761cadca750c9baf91eddb49e56728..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/alignment.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -stage=0 # start from 0 if you need to start from data preparation -stop_stage=0 - -nj=16 -feat_dir=raw_wav -dict=data/dict/lang_char.txt - -dir=exp/ -config=$dir/train.yaml -checkpoint= -checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt -config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml -set= -ali_format=$feat_dir/$set/format.data -ali_format=format.data -ali_result=$dir/ali - -. tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - nj=32 - # Prepare required data for ctc alignment - echo "Prepare data, prepare required format" - for x in $set; do - tools/format_data.sh --nj ${nj} \ - --feat-type wav --feat $feat_dir/$x/wav.scp \ - $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp - - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Test model, please specify the model you want to use by --checkpoint - python wenet/bin/alignment_deprecated.py --gpu -1 \ - --config $config \ - --input_file $ali_format \ - --checkpoint $checkpoint \ - --batch_size 1 \ - --dict $dict \ - --result_file $ali_result \ - -fi - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/analyze_dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/analyze_dataset.py deleted file mode 100644 index d4373b065c301972fe0164b6df3591166000acfc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/analyze_dataset.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Analyze Dataset, Duration/TextLength/Speed etc. - -Usage: -. ./path.sh && python3 tools/analyze_dataset.py \ - --data_type "shard" \ - --data_list data/test/data.list \ - --output_dir exp/analyze_test \ - --num_thread 32 -""" - -import os -import json -import math -import time -import numpy -import logging -import librosa -import tarfile -import argparse -import torchaudio -import multiprocessing - -from wenet.utils.file_utils import read_lists -from wenet.dataset.processor import AUDIO_FORMAT_SETS - - -def get_args(): - parser = argparse.ArgumentParser(description='Analyze dataset') - parser.add_argument('--data_type', - default='wav_scp', - choices=['wav_scp', 'raw', 'shard'], - help='dataset type') - parser.add_argument('--output_dir', type=str, - default="exp", help='write info to output dir') - parser.add_argument('--data_list', default=None, - help='used in raw/shard mode') - parser.add_argument('--wav_scp', default=None, - help='used in wav_scp mode') - parser.add_argument('--text', default=None, - help='used in wav_scp mode') - parser.add_argument('--num_thread', type=int, - default=4, help='number of threads') - args = parser.parse_args() - print(args) - return args - - -def analyze(datas, output_file, thread_id): - with open(output_file, "w", encoding='utf8') as f: - for i, data in enumerate(datas): - if type(data['wav']) is numpy.ndarray: - y, sample_rate = data['wav'], data['sample_rate'] - data['wav'] = "None" # NOTE(xcsong): Do not save wav. - elif type(data['wav'] is str): - y, sample_rate = librosa.load(data['wav'], sr=16000) - data['dur'] = len(y) / sample_rate - data['txt_length'] = len(data['txt']) - data['speed'] = data['txt_length'] / data['dur'] - # Trim the beginning and ending silence - _, index = librosa.effects.trim(y, top_db=30) - data['leading_sil'] = librosa.get_duration( - y=y[:index[0]], sr=16000) * 1000 if index[0] > 0 else 0 - data['trailing_sil'] = librosa.get_duration( - y=y[index[1]:], sr=16000) * 1000 if index[1] < len(y) else 0 - data_str = json.dumps(data, ensure_ascii=False) - f.write("{}\n".format(data_str)) - if thread_id == 0 and i % 100 == 0: - logging.info("\tThread-{}: processed {}/{}".format( - thread_id, i, len(datas))) - - -def read_tar(file): - try: - with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream: - prev_prefix = None - data = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - data['key'] = prev_prefix - if valid: - yield data - data = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - data['txt'] = file_obj.read().decode( - 'utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load( - file_obj) - # single channel - data['wav'] = waveform.numpy()[0, :] - data['sample_rate'] = sample_rate - else: - data[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning( - 'error: {} when parse {}'.format(ex, name)) - prev_prefix = prefix - # The last data in tar - if prev_prefix is not None: - data['key'] = prev_prefix - yield data - except Exception as ex: - logging.warning( - 'tar_file error: {} when processing {}'.format(ex, file)) - - -def main(): - start_time = time.time() - args = get_args() - logging.basicConfig(level=logging.DEBUG, - 
format='%(asctime)s %(levelname)s %(message)s') - os.makedirs(args.output_dir, exist_ok=True) - os.makedirs(args.output_dir + "/partition", exist_ok=True) - datas = [[] for i in range(args.num_thread)] - - logging.info("Stage-1: Loading data.list OR wav.scp...") - if args.data_type == "shard": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - total = 0 - for line in lists: - for data in read_tar(line): - datas[total % args.num_thread].append(data) - total = total + 1 - elif args.data_type == "raw": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - for i, line in enumerate(lists): - data = json.loads(line) - datas[i % args.num_thread].append(data) - elif args.data_type == "wav_scp": - assert args.wav_scp is not None - assert args.text is not None - wavs, texts = {}, {} - # wavs - for line in read_lists(args.wav_scp): - line = line.strip().split() - wavs[line[0]] = line[1] - # texts - for line in read_lists(args.text): - line = line.strip().split(maxsplit=1) - texts[line[0]] = line[1] - sorted(wavs) - sorted(texts) - # partition - for i, (key1, key2) in enumerate(zip(wavs, texts)): - assert key1 == key2 - datas[i % args.num_thread].append( - {'key': key1, "wav": wavs[key1], "txt": texts[key1]} - ) - - logging.info("Stage-2: Start Analyze") - # threads - pool = multiprocessing.Pool(processes=args.num_thread) - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - pool.apply_async(analyze, (datas[i], output_file, i)) - pool.close() - pool.join() - - logging.info("Stage-3: Sort and Write Result") - datas = [] - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - with open(output_file, "r", encoding='utf8') as f: - for line in f.readlines(): - data = json.loads(line) - datas.append(data) - total_dur = sum([x['dur'] for x in datas]) - total_len = sum([x['txt_length'] for x in datas]) - total_leading_sil = sum([x['leading_sil'] for x in datas]) - total_trailing_sil = sum([x['trailing_sil'] for x in datas]) - num_datas = len(datas) - names = ['key', 'dur', 'txt_length', 'speed', - 'leading_sil', 'trailing_sil'] - units = ['', 's', '', 'char/s', 'ms', 'ms'] - avgs = [0, total_dur / num_datas, total_len / num_datas, - total_len / total_dur, total_leading_sil / num_datas, - total_trailing_sil / num_datas] - stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]), - sum([(x['txt_length'] - avgs[2])**2 for x in datas]), - sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]), - sum([(x['leading_sil'] - avgs[4])**2 for x in datas]), - sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])] - stds = [math.sqrt(x / num_datas) for x in stds] - parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min'] - index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - - with open(args.output_dir + "/analyze_result_brief", - "w", encoding='utf8') as f: - for i, (name, unit, avg, std) in enumerate( - zip(names, units, avgs, stds)): - if name == 'key': - continue - f.write("==================\n") - - datas.sort(key=lambda x: x[name]) - for p, j in zip(parts, index): - f.write("{} {}: {:.3f} {} (wav_id: {})\n".format( - p, name, datas[j][name], unit, datas[j]['key'])) - f.write("avg {}: {:.3f} {}\n".format( - name, avg, unit)) - f.write("std {}: {:.3f}\n".format( - name, std)) - os.system("cat {}".format(args.output_dir + "/analyze_result_brief")) 
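The brief report assembled above is just order statistics over the per-utterance records: sort by a field, read max/P99/P75/P50/P25/min at fixed indices, and report the mean and a population standard deviation. A toy sketch of the same index arithmetic, with made-up duration values, is shown below for illustration.

```python
import math

durations = sorted([1.2, 2.4, 3.1, 4.8, 5.0, 6.7, 7.3, 9.9, 12.0, 15.5])  # toy data
n = len(durations)
avg = sum(durations) / n
std = math.sqrt(sum((d - avg) ** 2 for d in durations) / n)
points = [('max', n - 1), ('P99', int(n * 0.99)), ('P75', int(n * 0.75)),
          ('P50', int(n * 0.50)), ('P25', int(n * 0.25)), ('min', 0)]
for name, idx in points:
    print(f"{name} dur: {durations[idx]:.3f} s")
print(f"avg dur: {avg:.3f} s, std: {std:.3f}")
```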
- - datas.sort(key=lambda x: x['dur']) - with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f: - for data in datas: - f.write("{}\n".format(json.dumps(data, ensure_ascii=False))) - - end_time = time.time() - logging.info("Time Cost: {:.3f}s".format(end_time - start_time)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/cmvn_kaldi2json.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/cmvn_kaldi2json.py deleted file mode 100644 index 9966046c95a9d50438c4857b785cb7985182e376..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/cmvn_kaldi2json.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import logging -import sys -import json - -def kaldi2json(kaldi_cmvn_file): - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - cmvn_info = {'mean_stat:' : means, - 'var_stat' : variance, - 'frame_num' : count} - return cmvn_info - -if __name__ == '__main__': - with open(sys.argv[2], 'w') as fout: - cmvn = kaldi2json(sys.argv[1]) - fout.write(json.dumps(cmvn)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/combine_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/combine_data.sh deleted file mode 100644 index 8a56c43f1a2a238d78270f94f3d22f1af540e912..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/combine_data.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2014 David Snyder - -# This script combines the data from multiple source directories into -# a single destination directory. - -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information -# about what these directories contain. - -# Begin configuration section. -extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." -skip_fix=false # skip the fix_data_dir.sh in the end -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi - -if [ $# -lt 2 ]; then - echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." - echo "Note, files that don't appear in all source dirs will not be combined," - echo "with the exception of utt2uniq and segments, which are created where necessary." - exit 1 -fi - -dest=$1; -shift; - -first_src=$1; - -rm -r $dest 2>/dev/null -mkdir -p $dest; - -export LC_ALL=C - -for dir in $*; do - if [ ! 
-f $dir/utt2spk ]; then - echo "$0: no such file $dir/utt2spk" - exit 1; - fi -done - -# Check that frame_shift are compatible, where present together with features. -dir_with_frame_shift= -for dir in $*; do - if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then - if [[ $dir_with_frame_shift ]] && - ! cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then - echo "$0:error: different frame_shift in directories $dir and " \ - "$dir_with_frame_shift. Cannot combine features." - exit 1; - fi - dir_with_frame_shift=$dir - fi -done - -# W.r.t. utt2uniq file the script has different behavior compared to other files -# it is not compulsary for it to exist in src directories, but if it exists in -# even one it should exist in all. We will create the files where necessary -has_utt2uniq=false -for in_dir in $*; do - if [ -f $in_dir/utt2uniq ]; then - has_utt2uniq=true - break - fi -done - -if $has_utt2uniq; then - # we are going to create an utt2uniq file in the destdir - for in_dir in $*; do - if [ ! -f $in_dir/utt2uniq ]; then - # we assume that utt2uniq is a one to one mapping - cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' - else - cat $in_dir/utt2uniq - fi - done | sort -k1 > $dest/utt2uniq - echo "$0: combined utt2uniq" -else - echo "$0 [info]: not combining utt2uniq as it does not exist" -fi -# some of the old scripts might provide utt2uniq as an extrafile, so just remove it -extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") - -# segments are treated similarly to utt2uniq. If it exists in some, but not all -# src directories, then we generate segments where necessary. -has_segments=false -for in_dir in $*; do - if [ -f $in_dir/segments ]; then - has_segments=true - break - fi -done - -if $has_segments; then - for in_dir in $*; do - if [ ! -f $in_dir/segments ]; then - echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 - utils/data/get_segments_for_data.sh $in_dir - else - cat $in_dir/segments - fi - done | sort -k1 > $dest/segments - echo "$0: combined segments" -else - echo "$0 [info]: not combining segments as it does not exist" -fi - -for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do - exists_somewhere=false - absent_somewhere=false - for d in $*; do - if [ -f $d/$file ]; then - exists_somewhere=true - else - absent_somewhere=true - fi - done - - if ! $absent_somewhere; then - set -o pipefail - ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; - set +o pipefail - echo "$0: combined $file" - else - if ! $exists_somewhere; then - echo "$0 [info]: not combining $file as it does not exist" - else - echo "$0 [info]: **not combining $file as it does not exist everywhere**" - fi - fi -done - -tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt - -if [[ $dir_with_frame_shift ]]; then - cp $dir_with_frame_shift/frame_shift $dest -fi - -if ! 
$skip_fix ; then - tools/fix_data_dir.sh $dest || exit 1; -fi - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/compute-cer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/compute-cer.py deleted file mode 100644 index a0a8f8fe1f59251c5d8fefeb62ef469276fc6063..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/compute-cer.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import sys -import unicodedata -import codecs - -remove_tag = True -spacelist = [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - # https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. - sep = ' ' - if char == '<': - sep = '>' - j = i + 1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c == sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: - return '' - chars = [] - i = 0 - T = len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - if x.isalnum(): - for k in x: - new_sentence.append(k) - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i - 1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist 
: - min_dist = dist - min_error = error - dist = self.space[i][j - 1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, - 'ins': 0, 'del': 0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i={i} , j={j} , \ - error={error}'. 
- format(i=i, j=j, error=self.space[i][j]['error'])) - return result - - def overall(self) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def cluster(self, data) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [unicodedata.name(char) for char in word] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names) - 1) : - if unicode_names[i] != unicode_names[i + 1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) \ - and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] \ - [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \ - [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose = 1 - padding_symbol = ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose = 0 - try: - verbose = int(b) - except Exception: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol = ' ' - elif b == 'underline': - padding_symbol = '_' - continue - if True or sys.argv[1].startswith('-'): - # ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig = set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, - case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : 
- if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array) == 0: - continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length - len_lab) - space['rec'].append(length - len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end=' ') - else: - print('lab:', end=' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['lab'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end=' ') - else: - print('rec:', end=' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['rec'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===================================================' - '========================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster(k for k in default_clusters[cluster_id]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 
100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif (token[0] == '<' and token[len(token) - 1] == '>' and - cluster_id == ''): - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('=======================================' - '====================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/compute-wer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/compute-wer.py deleted file mode 100644 index a3eefc0dc7b67f252e685da71a5189312e74ef85..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/compute-wer.py +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import re, sys, unicodedata -import codecs - -remove_tag = True -spacelist= [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - #https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
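
The compute-cer.py scorer deleted above (and the near-identical compute-wer.py that follows) rates each hypothesis against its reference with a Levenshtein alignment and reports N/C/S/D/I counts, from which the CER reported in the results table is derived as (S+D+I)/N. The snippet below is only a minimal sketch of that idea in plain Python, assuming pre-tokenized character lists; it deliberately omits the deleted script's per-token bookkeeping, tag stripping, cluster statistics and alignment printing.

```python
def edit_distance_counts(ref, hyp):
    """Return (substitutions, deletions, insertions) for one optimal
    alignment of hyp against ref -- a minimal sketch of the scoring idea
    in the removed compute-cer.py / compute-wer.py, not a reimplementation."""
    # dp[i][j] = minimal edit cost between ref[:i] and hyp[:j]
    dp = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(1, len(ref) + 1):
        dp[i][0] = i                      # only deletions
    for j in range(1, len(hyp) + 1):
        dp[0][j] = j                      # only insertions
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            sub = dp[i - 1][j - 1] + (ref[i - 1] != hyp[j - 1])
            dp[i][j] = min(sub, dp[i - 1][j] + 1, dp[i][j - 1] + 1)

    # Trace back to split the total cost into S / D / I counts.
    i, j, s, d, ins = len(ref), len(hyp), 0, 0, 0
    while i > 0 or j > 0:
        if i > 0 and j > 0 and dp[i][j] == dp[i - 1][j - 1] + (ref[i - 1] != hyp[j - 1]):
            s += ref[i - 1] != hyp[j - 1]
            i, j = i - 1, j - 1
        elif i > 0 and dp[i][j] == dp[i - 1][j] + 1:
            d, i = d + 1, i - 1
        else:
            ins, j = ins + 1, j - 1
    return s, d, ins


if __name__ == "__main__":
    # Character-level scoring, as used for AISHELL (toy strings, not real data).
    ref = list("甚至出现交易几乎停滞的情况")
    hyp = list("甚至出现交易几乎停止的情况")
    s, d, ins = edit_distance_counts(ref, hyp)
    cer = (s + d + ins) / max(len(ref), 1)
    print(f"N={len(ref)} S={s} D={d} I={ins} CER={cer:.4f}")
```
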
- sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - 
j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array)==0: continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) 
- else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===========================================================================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster([ k for k in default_clusters[cluster_id] ]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - 
print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif token[0] == '<' and token[len(token)-1] == '>' and \ - cluster_id == '' : - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('===========================================================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/compute_cmvn_stats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/compute_cmvn_stats.py deleted file mode 100644 index 9c89789c47be0c855939469e86040f10398e9d89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/compute_cmvn_stats.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys -import argparse -import json -import codecs -import yaml - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.utils.data import Dataset, DataLoader - -torchaudio.set_audio_backend("sox_io") - - -class CollateFunc(object): - ''' Collate function for AudioDataset - ''' - - def __init__(self, feat_dim, resample_rate): - self.feat_dim = feat_dim - self.resample_rate = resample_rate - pass - - def __call__(self, batch): - mean_stat = torch.zeros(self.feat_dim) - var_stat = torch.zeros(self.feat_dim) - number = 0 - for item in batch: - value = item[1].strip().split(",") - assert len(value) == 3 or len(value) == 1 - wav_path = value[0] - sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate - resample_rate = sample_rate - # len(value) == 3 means segmented wav.scp, - # len(value) == 1 means original wav.scp - if len(value) == 3: - start_frame = int(float(value[1]) * sample_rate) - end_frame = int(float(value[2]) * sample_rate) - waveform, sample_rate = torchaudio.backend.sox_io_backend.load( - filepath=wav_path, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(item[1]) - - waveform = waveform * (1 << 15) - if self.resample_rate != 0 and self.resample_rate != sample_rate: - resample_rate = self.resample_rate - waveform = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - - mat = kaldi.fbank(waveform, - num_mel_bins=self.feat_dim, - dither=0.0, - energy_floor=0.0, - sample_frequency=resample_rate) - mean_stat += torch.sum(mat, axis=0) - var_stat += torch.sum(torch.square(mat), axis=0) - number += mat.shape[0] - return number, mean_stat, var_stat - - -class AudioDataset(Dataset): - def __init__(self, data_file): - self.items = [] - with codecs.open(data_file, 'r', encoding='utf-8') as f: - for line in f: - arr = line.strip().split() - self.items.append((arr[0], arr[1])) - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - return self.items[idx] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='extract CMVN stats') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for processing') - parser.add_argument('--train_config', - default='', - help='training yaml conf') - parser.add_argument('--in_scp', default=None, help='wav scp file') - 
parser.add_argument('--out_cmvn', - default='global_cmvn', - help='global cmvn file') - - doc = "Print log after every log_interval audios are processed." - parser.add_argument("--log_interval", type=int, default=1000, help=doc) - args = parser.parse_args() - - with open(args.train_config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - resample_rate = 0 - if 'resample_conf' in configs['dataset_conf']: - resample_rate = configs['dataset_conf']['resample_conf']['resample_rate'] - print('using resample and new sample rate is {}'.format(resample_rate)) - - collate_func = CollateFunc(feat_dim, resample_rate) - dataset = AudioDataset(args.in_scp) - batch_size = 20 - data_loader = DataLoader(dataset, - batch_size=batch_size, - shuffle=True, - sampler=None, - num_workers=args.num_workers, - collate_fn=collate_func) - - with torch.no_grad(): - all_number = 0 - all_mean_stat = torch.zeros(feat_dim) - all_var_stat = torch.zeros(feat_dim) - wav_number = 0 - for i, batch in enumerate(data_loader): - number, mean_stat, var_stat = batch - all_mean_stat += mean_stat - all_var_stat += var_stat - all_number += number - wav_number += batch_size - - if wav_number % args.log_interval == 0: - print(f'processed {wav_number} wavs, {all_number} frames', - file=sys.stderr, - flush=True) - - cmvn_info = { - 'mean_stat': list(all_mean_stat.tolist()), - 'var_stat': list(all_var_stat.tolist()), - 'frame_num': all_number - } - - with open(args.out_cmvn, 'w') as fout: - fout.write(json.dumps(cmvn_info)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/compute_fbank_feats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/compute_fbank_feats.py deleted file mode 100644 index 4cc7dae54de6e8b24b14148bd3930d19b4d7b28c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/compute_fbank_feats.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging - -import torchaudio -import torchaudio.compliance.kaldi as kaldi - -import wenet.dataset.kaldi_io as kaldi_io - -# The "sox" backends are deprecated and will be removed in 0.9.0 release. 
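
The compute_cmvn_stats.py tool deleted above accumulates per-dimension sums, sums of squares and a total frame count over the training fbank features and writes them out as `mean_stat` / `var_stat` / `frame_num` JSON, i.e. the contents of the `global_cmvn` file used elsewhere for this model. Below is a hedged NumPy sketch of that accumulation, together with the usual way such statistics are applied (subtract the mean, scale by the inverse standard deviation). The random toy matrices stand in for real fbank features and are not part of the original script.

```python
import json
import numpy as np


def accumulate_cmvn(feature_batches, feat_dim=80):
    """Accumulate global CMVN statistics in the same spirit as the removed
    compute_cmvn_stats.py: per-dimension sum, sum of squares, frame count."""
    mean_stat = np.zeros(feat_dim, dtype=np.float64)
    var_stat = np.zeros(feat_dim, dtype=np.float64)
    frame_num = 0
    for feats in feature_batches:            # feats: (num_frames, feat_dim)
        mean_stat += feats.sum(axis=0)
        var_stat += np.square(feats).sum(axis=0)
        frame_num += feats.shape[0]
    return {"mean_stat": mean_stat.tolist(),
            "var_stat": var_stat.tolist(),
            "frame_num": frame_num}


def apply_cmvn(feats, cmvn):
    """Normalize features with the stored statistics (typical usage)."""
    mean = np.array(cmvn["mean_stat"]) / cmvn["frame_num"]
    var = np.array(cmvn["var_stat"]) / cmvn["frame_num"] - mean ** 2
    istd = 1.0 / np.sqrt(np.maximum(var, 1e-20))
    return (feats - mean) * istd


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    utts = [rng.normal(size=(100, 80)), rng.normal(size=(250, 80))]  # fake fbank
    cmvn = accumulate_cmvn(utts, feat_dim=80)
    print(json.dumps({"frame_num": cmvn["frame_num"]}))
    normalized = apply_cmvn(utts[0], cmvn)
    print(normalized.mean(), normalized.std())
```
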
-# So here we use sox_io backend -torchaudio.set_audio_backend("sox_io") - - -def parse_opts(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--num_mel_bins', - default=80, - type=int, - help='Number of triangular mel-frequency bins') - parser.add_argument('--frame_length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument('--frame_shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument('--dither', - type=int, - default=0.0, - help='Dithering constant (0.0 means no dither)') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_scp', help='wav scp file') - parser.add_argument('out_ark', help='output ark file') - parser.add_argument('out_scp', help='output scp file') - args = parser.parse_args() - return args - - -# wav format: -def load_wav_scp(wav_scp_file): - wav_list = [] - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_list.append((arr[0], arr[1])) - return wav_list - - -# wav format: -def load_wav_scp_dict(wav_scp_file): - wav_dict = {} - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_dict[arr[0]] = arr[1] - return wav_dict - - -# Segments format: -def load_wav_segments(wav_scp_file, segments_file): - wav_dict = load_wav_scp_dict(wav_scp_file) - audio_list = [] - with open(segments_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - key = arr[0] - wav_file = wav_dict[arr[1]] - start = float(arr[2]) - end = float(arr[3]) - audio_list.append((key, wav_file, start, end)) - return audio_list - - -if __name__ == '__main__': - args = parse_opts() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - if args.segments is None: - audio_list = load_wav_scp(args.wav_scp) - else: - audio_list = load_wav_segments(args.wav_scp, args.segments) - - count = 0 - with open(args.out_ark, 'wb') as ark_fout, \ - open(args.out_scp, 'w', encoding='utf8') as scp_fout: - for item in audio_list: - if len(item) == 2: - key, wav_path = item - waveform, sample_rate = torchaudio.load_wav(wav_path) - else: - assert len(item) == 4 - key, wav_path, start, end = item - sample_rate = torchaudio.info(wav_path).sample_rate - frame_offset = int(start * sample_rate) - num_frames = int((end - start) * sample_rate) - waveform, sample_rate = torchaudio.load_wav( - wav_path, frame_offset, num_frames) - - mat = kaldi.fbank(waveform, - num_mel_bins=args.num_mel_bins, - frame_length=args.frame_length, - frame_shift=args.frame_shift, - dither=args.dither, - energy_floor=0.0, - sample_frequency=sample_rate) - mat = mat.detach().numpy() - kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout) - count += 1 - if count % 10000 == 0: - logging.info('Progress {}/{}'.format(count, len(audio_list))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/copy_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/copy_data_dir.sh deleted file mode 100644 index ee880c4c3ca398a58a4e306467c639b0a76310bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/copy_data_dir.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins 
University (author: Daniel Povey) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# feats.scp -# wav.scp -# vad.scp -# spk2utt -# utt2spk -# text -# -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance and/or speaker names. Note, the recording-ids stay the same. -# - - -# begin configuration section -spk_prefix= -utt_prefix= -spk_suffix= -utt_suffix= -validate_opts= # should rarely be needed. -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" - echo "Options" - echo " --spk-prefix= # Prefix for speaker ids, default empty" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --spk-suffix= # Suffix for speaker ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -srcdir=$1 -destdir=$2 - -if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" - exit 1; -fi - -if [ "$destdir" == "$srcdir" ]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -set -e; - -mkdir -p $destdir - -cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map -cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map - -if [ ! -f $srcdir/utt2uniq ]; then - if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then - cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq - fi -else - cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq -fi - -cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ - utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk - -utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt - -if [ -f $srcdir/feats.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp -fi - -if [ -f $srcdir/vad.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp -fi - -if [ -f $srcdir/segments ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments - cp $srcdir/wav.scp $destdir -else # no segments->wav indexed by utt. 
- if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/utt2num_frames ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in frame_shift stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -echo $validate_opts -echo $destdir -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/decode.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/decode.sh deleted file mode 100644 index 1d49b0e48631f4818fb9c464df66904170275a33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/decode.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: binbinzhang@mobvoi.com (Binbin Zhang) -export GLOG_logtostderr=1 -export GLOG_v=2 - -set -e - -nj=1 -chunk_size=-1 -ctc_weight=0.0 -reverse_weight=0.0 -rescoring_weight=1.0 -# For CTC WFST based decoding -fst_path= -dict_path= -acoustic_scale=1.0 -beam=15.0 -lattice_beam=12.0 -min_active=200 -max_active=7000 -blank_skip_thresh=1.0 -length_penalty=0.0 - -. tools/parse_options.sh || exit 1; -if [ $# != 5 ]; then - echo "Usage: $0 [options] " - exit 1; -fi - -if ! which decoder_main > /dev/null; then - echo "decoder_main is not built, please go to runtime/libtorch to build it." - exit 1; -fi - -scp=$1 -label_file=$2 -model_file=$3 -unit_file=$4 -dir=$5 - -mkdir -p $dir/split${nj} - -# Step 1. Split wav.scp -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp" -done -tools/data/split_scp.pl ${scp} ${split_scps} - -# Step 2. Parallel decoding -wfst_decode_opts= -if [ ! 
-z $fst_path ]; then - wfst_decode_opts="--fst_path $fst_path" - wfst_decode_opts="$wfst_decode_opts --beam $beam" - wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path" - wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam" - wfst_decode_opts="$wfst_decode_opts --max_active $max_active" - wfst_decode_opts="$wfst_decode_opts --min_active $min_active" - wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale" - wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh" - wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty" - echo $wfst_decode_opts > $dir/config -fi -for n in $(seq ${nj}); do -{ - decoder_main \ - --rescoring_weight $rescoring_weight \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --chunk_size $chunk_size \ - --wav_scp ${dir}/split${nj}/wav.${n}.scp \ - --model_path $model_file \ - --unit_path $unit_file \ - $wfst_decode_opts \ - --result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log -} & -done -wait - -# Step 3. Merge files -for n in $(seq ${nj}); do - cat ${dir}/split${nj}/${n}.text -done > ${dir}/text -tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf - -# Step 4. Compute WER -python3 tools/compute-wer.py --char=1 --v=1 \ - $label_file $dir/text > $dir/wer diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/feat_to_shape.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/feat_to_shape.sh deleted file mode 100644 index ab6d45c60709dd05a38f8da269d617233d0d39f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/feat_to_shape.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Begin configuration section. -nj=4 -cmd=run.pl -verbose=0 -filetype="" -preprocess_conf="" -# End configuration section. - -help_message=$(cat << EOF -Usage: $0 [options] [] -e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) - -echo "$0 $*" 1>&2 # Print the command line for logging - -. parse_options.sh || exit 1; - -if [ $# -lt 2 ] || [ $# -gt 3 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -scp=$1 -outscp=$2 -data=$(dirname ${scp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${logdir}/feats.${n}.scp" -done - -utils/split_scp.pl ${scp} ${split_scps} - -if [ -n "${preprocess_conf}" ]; then - preprocess_opt="--preprocess-conf ${preprocess_conf}" -else - preprocess_opt="" -fi -if [ -n "${filetype}" ]; then - filetype_opt="--filetype ${filetype}" -else - filetype_opt="" -fi - -${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ - feat-to-len --verbose=${verbose} \ - scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp - -feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -) - -# concatenate the .scp files together. 
-for n in $(seq ${nj}); do - sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp -done > ${outscp} - -rm -f ${logdir}/feats.*.scp 2>/dev/null diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/filter_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/filter_scp.pl deleted file mode 100644 index b76d37f41be0886470281978bfacf97f6b8ae976..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/filter_scp.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose "n-th" field is an utterance id), printing -# out only those lines whose "n-th" field is in id_list. The index of -# the "n-th" field is 1, by default, but can be changed by using -# the -f switch - -$exclude = 0; -$field = 1; -$shifted = 0; - -do { - $shifted=0; - if ($ARGV[0] eq "--exclude") { - $exclude = 1; - shift @ARGV; - $shifted=1; - } - if ($ARGV[0] eq "-f") { - $field = $ARGV[1]; - shift @ARGV; shift @ARGV; - $shifted=1 - } -} while ($shifted); - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . - "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . - "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . - "only the lines that were *not* in id_list.\n" . - "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . - "If your older scripts (written before Oct 2014) stopped working and you used the\n" . - "-f option, add 1 to the argument.\n" . - "See also: utils/filter_scp.pl .\n"; -} - - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -if ($field == 1) { # Treat this as special case, since it is common. - while(<>) { - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { - print $_; - } - } -} else { - while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { - print $_; - } - } -} - -# tests: -# the following should print "foo 1" -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) -# the following should print "bar 2". 
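
filter_scp.pl, whose deleted body appears just above, keeps only the lines of an scp-style file whose n-th field (the first by default) occurs in a given id list, optionally inverted with `--exclude`; it is used throughout the removed data-dir fixing scripts. A rough Python equivalent is sketched below; argument handling and error behaviour are deliberately simplified compared with the Perl original.

```python
import argparse
import sys


def filter_scp(id_list_path, scp_lines, field=1, exclude=False):
    """Yield scp lines whose `field`-th token (1-based) appears in the id
    list -- a simplified Python rendering of the removed filter_scp.pl."""
    with open(id_list_path, encoding="utf-8") as f:
        wanted = {line.split()[0] for line in f if line.strip()}
    for line in scp_lines:
        parts = line.split()
        if len(parts) < field:
            continue                      # the Perl original dies here instead
        hit = parts[field - 1] in wanted
        if hit != exclude:
            yield line


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="filter scp lines by id list")
    parser.add_argument("--exclude", action="store_true")
    parser.add_argument("-f", "--field", type=int, default=1)
    parser.add_argument("id_list")
    parser.add_argument("scp", nargs="?")
    args = parser.parse_args()
    stream = open(args.scp, encoding="utf-8") if args.scp else sys.stdin
    for kept in filter_scp(args.id_list, stream, args.field, args.exclude):
        sys.stdout.write(kept)
```
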
-# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fix_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fix_data_dir.sh deleted file mode 100644 index d1644c1cac4264c78eae7d91b03c4126baf7ec4c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fix_data_dir.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# This script makes sure that only the segments present in -# all of "feats.scp", "wav.scp" [if present], segments [if present] -# text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into -# data-dir/.backup - -cmd="$@" - -utt_extra_files= -spk_extra_files= - -. tools/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: utils/data/fix_data_dir.sh " - echo "e.g.: utils/data/fix_data_dir.sh data/train" - echo "This script helps ensure that the various files in a data directory" - echo "are correctly sorted and filtered, for example removing utterances" - echo "that have no features (if feats.scp is present)" - exit 1 -fi - -data=$1 - -if [ -f $data/images.scp ]; then - image/fix_data_dir.sh $cmd - exit $? -fi - -mkdir -p $data/.backup - -[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; - -[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; - -set -e -o pipefail -u - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted { - file=$1 - sort -k1,1 -u <$file >$file.tmp - if ! cmp -s $file $file.tmp; then - echo "$0: file $1 is not in sorted order or not unique, sorting it" - mv $file.tmp $file - else - rm $file.tmp - fi -} - -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - check_sorted $data/$x - fi -done - - -function filter_file { - filter=$1 - file_to_filter=$2 - cp $file_to_filter ${file_to_filter}.tmp - tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter - if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=$(cat ${file_to_filter}.tmp | wc -l) - length2=$(cat ${file_to_filter} | wc -l) - if [ $length1 -ne $length2 ]; then - echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." - fi - fi - rm $file_to_filter.tmp -} - -function filter_recordings { - # We call this once before the stage when we filter on utterance-id, and once - # after. - - if [ -f $data/segments ]; then - # We have a segments file -> we need to filter this and the file wav.scp, and - # reco2file_and_utt, if it exists, to make sure they have the same list of - # recording-ids. - - if [ ! -f $data/wav.scp ]; then - echo "$0: $data/segments exists but not $data/wav.scp" - exit 1; - fi - awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=$(cat $tmpdir/recordings | wc -l) - [ ! -s $tmpdir/recordings ] && \ - echo "Empty list of recordings (bad file $data/segments)?" 
&& exit 1; - tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp - mv $tmpdir/recordings.tmp $tmpdir/recordings - - - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - filter_file $tmpdir/recordings $data/segments - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - rm $data/segments.tmp - - filter_file $tmpdir/recordings $data/wav.scp - [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur - true - fi -} - -function filter_speakers { - # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... - tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do - f=$data/$s - if [ -f $f ]; then - filter_file $f $tmpdir/speakers - fi - done - - filter_file $tmpdir/speakers $data/spk2utt - tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - - for s in cmvn.scp spk2gender $spk_extra_files; do - f=$data/$s - if [ -f $f ]; then - filter_file $tmpdir/speakers $f - fi - done -} - -function filter_utts { - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - echo "$(cat $tmpdir/utts | wc -l)" - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; - - ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; - - ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ - echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - - if [ -f $data/utt2uniq ]; then - ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ - echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; - fi - - maybe_wav= - maybe_reco2dur= - [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. - [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - - maybe_utt2dur= - if [ -f $data/utt2dur ]; then - cat $data/utt2dur | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1 - maybe_utt2dur=utt2dur.ok - fi - - maybe_utt2num_frames= - if [ -f $data/utt2num_frames ]; then - cat $data/utt2num_frames | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1 - maybe_utt2num_frames=utt2num_frames.ok - fi - - for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do - if [ -f $data/$x ]; then - tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp - echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)" - mv $tmpdir/utts.tmp $tmpdir/utts - # echo "$tmpdir/utts" - fi - done - rm $data/utt2dur.ok 2>/dev/null || true - rm $data/utt2num_frames.ok 2>/dev/null || true - - [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ - rm $tmpdir/utts && exit 1; - - - if [ -f $data/utt2spk ]; then - new_nutts=$(cat $tmpdir/utts | wc -l) - old_nutts=$(cat $data/utt2spk | wc -l) - if [ $new_nutts -ne $old_nutts ]; then - echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" - else - echo "fix_data_dir.sh: kept all $old_nutts utterances." 
- fi - fi - - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then - tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x - fi - fi - done - -} - -filter_recordings -filter_speakers -filter_utts -filter_speakers -filter_recordings - -tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - -echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/flake8_hook.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/flake8_hook.py deleted file mode 100644 index bbe21bf4aa8ab460aca0eba5a24785e4d6b2c39d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -import sys - -from flake8.main import git - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/format_data.sh deleted file mode 100644 index 51f4602dfa0bac7873541c7f621ef4bb9eb29c94..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/format_data.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Mobvoi Corporation (Author: Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -echo "$0 $*" >&2 # Print the command line for logging -. ./path.sh - -nj=1 -cmd=run.pl -nlsyms="" -lang="" -feat="" -feat_type="kaldi" -oov="" -bpecode="" -allow_one_column=false -raw="" -verbose=0 -trans_type=char -filetype="" -preprocess_conf="" -category="" -out="" # If omitted, write in stdout -help_message=$(cat << EOF -Usage: $0 -e.g. $0 data/train data/lang_1char/train_units.txt -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --feat # feat.scp or feat1.scp,feat2.scp,... - --feat-type # kaldi or wav - --oov # Default: - --out # If omitted, write in stdout - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) -. tools/parse_options.sh - -if [ $# != 2 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -dir=$1 -dic=$2 -tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) -#trap 'rm -rf ${tmpdir}' EXIT - -# 1. 
Create scp files for inputs -# These are not necessary for decoding mode, and make it as an option -input= -if [ -n "${feat}" ]; then - _feat_scps=$(echo "${feat}" | tr ',' ' ' ) - read -r -a feat_scps <<< $_feat_scps - num_feats=${#feat_scps[@]} - - for (( i=1; i<=num_feats; i++ )); do - feat=${feat_scps[$((i-1))]} - mkdir -p ${tmpdir}/input_${i} - input+="input_${i} " - cat ${feat} > ${tmpdir}/input_${i}/feat.scp - - # Dump in the "legacy" style JSON format - if [ -n "${filetype}" ]; then - awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ - > ${tmpdir}/input_${i}/filetype.scp - fi - - if [ ${feat_type} == "kaldi" ]; then - tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ - --filetype "${filetype}" \ - --preprocess-conf "${preprocess_conf}" \ - --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp - elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then - if [ -f $dir/segments ]; then - # used for segmented wav.scp - awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur - fi - if [ ! -f $dir/utt2dur ]; then - tools/wav_to_duration.sh --nj ${nj} \ - ${feat} ${tmpdir}/input_${i}/shape.scp - # use the existed utt2dur as shape.scp directly - else - cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp - fi - fi - done -fi - -# 2. Create scp files for outputs -mkdir -p ${tmpdir}/output -if [ -n "${bpecode}" ]; then - if [ "${trans_type}" == "cn_char_en_bpe" ]; then - tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp - else - paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \ - | tools/spm_encode --model=${bpecode} --output_format=piece) \ - > ${tmpdir}/output/token.scp - fi -elif [ -n "${nlsyms}" ]; then - tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -elif [ -n "${raw}" ]; then - cat $dir/text > ${tmpdir}/output/token.scp -else - tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -fi -< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp -odim=$(cat ${dic} | wc -l) -< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp - -cat ${dir}/text > ${tmpdir}/output/text.scp - -# 3. Create scp files for the others -mkdir -p ${tmpdir}/other -if [ -n "${lang}" ]; then - awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp -fi - -if [ -n "${category}" ]; then - awk -v category=${category} '{print $1 " " category}' ${dir}/text \ - > ${tmpdir}/other/category.scp -fi -#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp - -# 4. 
Merge scp files into a one file -opts="" -for intype in ${input} output other; do - if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then - continue - fi - - if [ ${intype} != other ]; then - opts+="--${intype%_*}-scps " - else - opts+="--scps " - fi - - for x in "${tmpdir}/${intype}"/*.scp; do - k=$(basename ${x} .scp) - if [ ${k} = shape ]; then - opts+="shape:${x}:shape " - else - opts+="${k}:${x} " - fi - done -done - -if ${allow_one_column}; then - opts+="--allow-one-column true " -else - opts+="--allow-one-column false " -fi - -if [ -n "${out}" ]; then - opts+="-O ${out}" -fi - -tools/merge_scp2txt.py --verbose ${verbose} ${opts} - -#rm -fr ${tmpdir} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/add_lex_disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/add_lex_disambig.pl deleted file mode 100644 index dd8a25de6e1140a6d19b1e876f2e76f528532edf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/add_lex_disambig.pl +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2013-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. -# With the --pron-probs option, expects the second field -# of each lexicon line to be a pron-prob. -# With the --sil-probs option, expects three additional -# fields after the pron-prob, representing various components -# of the silence probability model. - -$pron_probs = 0; -$sil_probs = 0; -$first_allowed_disambig = 1; - -for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { - if ($ARGV[0] eq "--pron-probs") { - $pron_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--sil-probs") { - $sil_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--first-allowed-disambig") { - $first_allowed_disambig = 0 + $ARGV[1]; - if ($first_allowed_disambig < 1) { - die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; - } - shift @ARGV; - shift @ARGV; - } -} - -if (@ARGV != 2) { - die "Usage: add_lex_disambig.pl [opts] \n" . - "This script adds disambiguation symbols to a lexicon in order to\n" . - "make decoding graphs determinizable; it adds pseudo-phone\n" . - "disambiguation symbols #1, #2 and so on at the ends of phones\n" . - "to ensure that all pronunciations are different, and that none\n" . - "is a prefix of another.\n" . - "It prints to the standard output the number of the largest-numbered" . - "disambiguation symbol that was used.\n" . - "\n" . 
- "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . - " --sil-probs [should be with --pron-probs option]\n" . - " Expect 3 extra fields after the pron-probs, for aspects of\n" . - " the silence probability model\n" . - " --first-allowed-disambig The number of the first disambiguation symbol\n" . - " that this script is allowed to add. By default this is\n" . - " #1, but you can set this to a larger value using this option.\n" . - "e.g.:\n" . - " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { - $p = shift @A; - if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } - } - if ($sil_probs) { - $silp = shift @A; - if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - } - if (!(@A)) { - die "Bad lexicon line $1, no phone in phone list"; - } - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that it exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { shift @A; } # remove pron-prob. - if ($sil_probs) { - shift @A; # Remove silprob - shift @A; # Remove silprob - } - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -# max_disambig will always be the highest-numbered disambiguation symbol that -# has been used so far. -$max_disambig = $first_allowed_disambig - 1; - -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - if ($pron_probs) { - $pron_prob = shift @A; - } - if ($sil_probs) { - $sil_word_prob = shift @A; - $word_sil_correction = shift @A; - $prev_nonsil_correction = shift @A - } - $phnseq = join(" ", @A); - if (!defined $issubseq{$phnseq} - && $count{$phnseq} == 1) { - ; # Do nothing. - } else { - if ($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved_for_the_empty_string{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; - if (!defined $cur_disambig) { - $cur_disambig = $first_allowed_disambig; - } else { - $cur_disambig++; # Get a number that has not been used yet for - # this phone sequence. 
- } - while (defined $reserved_for_the_empty_string{$cur_disambig}) { - $cur_disambig++; - } - if ($cur_disambig > $max_disambig) { - $max_disambig = $cur_disambig; - } - $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; - $phnseq = $phnseq . " #" . $cur_disambig; - } - } - if ($pron_probs) { - if ($sil_probs) { - print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; - } else { - print O "$word\t$pron_prob\t$phnseq\n"; - } - } else { - print O "$word\t$phnseq\n"; - } -} - -print $max_disambig . "\n"; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/compile_lexicon_token_fst.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/compile_lexicon_token_fst.sh deleted file mode 100644 index b67814fe3f3244b14b8e494bfe46c4829c4f8bd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/compile_lexicon_token_fst.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -# Copyright 2015 Yajie Miao (Carnegie Mellon University) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the -# phoneme and character-based lexicons. -set -eo pipefail -. tools/parse_options.sh - -if [ $# -ne 3 ]; then - echo "usage: tools/fst/compile_lexicon_token_fst.sh " - echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" - echo " should contain the following files:" - echo "lexicon.txt units.txt" - echo "options: " - exit 1; -fi - -srcdir=$1 -tmpdir=$2 -dir=$3 -mkdir -p $dir $tmpdir - -[ -f path.sh ] && . ./path.sh - -export LC_ALL=C - -cp $srcdir/units.txt $dir - -# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. -# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. -perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; - -# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. -# Without these symbols, determinization will fail. -ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` -ndisambig=$[$ndisambig+1]; - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list - -# Get the full list of CTC tokens used in FST. These tokens include , the blank , -# the actual model unit, and the disambiguation symbols. 
-cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list -(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt - -# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, -# so here use ctc_token_fst_compact -tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; - -# Encode the words with indices. Will be used in lexicon and language model FST compiling. -cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' - BEGIN { - print " 0"; - } - { - printf("%s %d\n", $1, NR); - } - END { - printf("#0 %d\n", NR+1); - printf(" %d\n", NR+2); - printf(" %d\n", NR+3); - }' > $dir/words.txt || exit 1; - -# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time. -token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` - -tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ - fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; - -echo "Lexicon and token FSTs compiling succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/ctc_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/ctc_token_fst.py deleted file mode 100644 index d81644b9cd216177a10a17772781d3293abe084f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/ctc_token_fst.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 1 ') -print('1 1 ') -print('2 2 ') -print('2 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 3 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(1, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 2, '', '')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/ctc_token_fst_compact.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/ctc_token_fst_compact.py deleted file mode 100644 index d3018d8b14ce25108cb1acc637cecded5d41be13..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/ctc_token_fst_compact.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 1 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 0, '', 
'')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/ctc_token_fst_corrected.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/ctc_token_fst_corrected.py deleted file mode 100644 index 81f7079eccb9e6447c46cdfdf6378aca7efe4a09..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/ctc_token_fst_corrected.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python - -import sys - - -def il(n): - return n + 1 - - -def ol(n): - return n + 1 - - -def s(n): - return n - - -if __name__ == "__main__": - with open(sys.argv[1]) as f: - lines = f.readlines() - phone_count = 0 - disambig_count = 0 - for line in lines: - sp = line.split() - phone = sp[0] - if phone == '' or phone == '': - continue - if phone.startswith('#'): - disambig_count += 1 - else: - phone_count += 1 - - # 1. add start state - print('0 0 {} 0'.format(il(0))) - - # 2. 0 -> i, i -> i, i -> 0 - for i in range(1, phone_count + 1): - print('0 {} {} {}'.format(s(i), il(i), ol(i))) - print('{} {} {} 0'.format(s(i), s(i), il(i))) - print('{} 0 {} 0'.format(s(i), il(0))) - - # 3. i -> other phone - for i in range(1, phone_count + 1): - for j in range(1, phone_count + 1): - if i != j: - print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) - - # 4. add disambiguous arcs on every final state - for i in range(0, phone_count + 1): - for j in range(phone_count + 2, phone_count + disambig_count + 2): - print('{} {} {} {}'.format(s(i), s(i), 0, j)) - - # 5. every i is final state - for i in range(0, phone_count + 1): - print(s(i)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/eps2disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/eps2disambig.pl deleted file mode 100644 index e1d84a6bf56703596a0e4552d184f7168f724bcb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/eps2disambig.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - if (/\s+#0\s+/) { - print STDERR "$0: ERROR: LM has word #0, " . 
- "which is reserved as disambiguation symbol\n"; - exit 1; - } - s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; - print; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/make_lexicon_fst.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/make_lexicon_fst.pl deleted file mode 100644 index f97129c05cb3ba6460be401e92001261acfaf746..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/make_lexicon_fst.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). - -$pron_probs = 0; - -if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { - $pron_probs = 1; - shift @ARGV; -} - -if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; - print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; - print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; - print STDERR " word phone1 phone2 ... phoneN;\n"; - print STDERR "if the --pron-probs option is used, each line is:\n"; - print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; - print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; - print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; - print STDERR "this is your responsibility.\n\n"; - print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; - print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; - print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; - exit(1); -} - -$lexfn = shift @ARGV; -if (@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2) { - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if ($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - -if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. - while () { - @A = split(" ", $_); - @A == 0 && die "Empty lexicon line."; - foreach $a (@A) { - if ($a eq "") { - die "Bad lexicon line $_ ( is forbidden)"; - } - } - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! 
defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; # so we only print it on the first arc of the word. - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. - if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while () { - @A = split(" ", $_); - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. - $s = $ns; - } elsif (!defined($silphone) || $p ne $silphone) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/make_tlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/make_tlg.sh deleted file mode 100644 index 98694e5540968760f0c27eaf30a6668f4c46c50d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/make_tlg.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# - -if [ -f path.sh ]; then . path.sh; fi - -lm_dir=$1 -src_lang=$2 -tgt_lang=$3 - -arpa_lm=${lm_dir}/lm.arpa -[ ! 
-f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -rm -rf $tgt_lang -cp -r $src_lang $tgt_lang - -# Compose the language model to FST -cat $arpa_lm | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v -i '' | \ - grep -v -i '' | \ - arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \ - tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \ - --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst - - -echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic $tgt_lang/G.fst - -# Compose the token, lexicon and language-model FST into the final decoding graph -fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1; -fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1; - -echo "Composing decoding graph TLG.fst succeeded" -#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/prepare_dict.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/prepare_dict.py deleted file mode 100644 index 8a6a3cfe7cfded0c863637deef0bae2f2ede5557..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/prepare_dict.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -# sys.argv[1]: e2e model unit file(lang_char.txt) -# sys.argv[2]: raw lexicon file -# sys.argv[3]: output lexicon file -# sys.argv[4]: bpemodel - -unit_table = set() -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for line in fin: - unit = line.split()[0] - unit_table.add(unit) - - -def contain_oov(units): - for unit in units: - if unit not in unit_table: - return True - return False - - -bpemode = len(sys.argv) > 4 -if bpemode: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.Load(sys.argv[4]) -lexicon_table = set() -with open(sys.argv[2], 'r', encoding='utf8') as fin, \ - open(sys.argv[3], 'w', encoding='utf8') as fout: - for line in fin: - word = line.split()[0] - if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel - continue - elif word == '': - continue - else: - # each word only has one pronunciation for e2e system - if word in lexicon_table: - continue - if bpemode: - # We assume that the lexicon does not contain code-switch, - # i.e. the word contains both English and Chinese. - # see PR https://github.com/wenet-e2e/wenet/pull/1693 - # and Issue https://github.com/wenet-e2e/wenet/issues/1653 - if word.encode('utf8').isalpha(): - pieces = sp.EncodeAsPieces(word) - else: - pieces = word - if contain_oov(pieces): - print( - 'Ignoring words {}, which contains oov unit'.format( - ''.join(word).strip('▁')) - ) - continue - chars = ' '.join( - [p if p in unit_table else '' for p in pieces]) - else: - # ignore words with OOV - if contain_oov(word): - print('Ignoring words {}, which contains oov unit'.format(word)) - continue - # Optional, append ▁ in front of english word - # we assume the model unit of our e2e system is char now. 
- if word.encode('utf8').isalpha() and '▁' in unit_table: - word = '▁' + word - chars = ' '.join(word) # word is a char list - fout.write('{} {}\n'.format(word, chars)) - lexicon_table.add(word) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/remove_oovs.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/remove_oovs.pl deleted file mode 100644 index ac914c3bd9363eded791cdeb309fd05e980c4f2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/remove_oovs.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script removes lines that contain these OOVs on either the -# third or fourth fields of the line. It is intended to remove arcs -# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). - -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; -} - -$unklist = shift @ARGV; -open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; -while(){ - @A = split(" ", $_); - @A == 1 || die "Bad line in unknown-symbol list: $_"; - $unk{$A[0]} = 1; -} - -$num_removed = 0; -while(<>){ - @A = split(" ", $_); - if(defined $unk{$A[2]} || defined $unk{$A[3]}) { - $num_removed++; - } else { - print; - } -} -print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/rnnt_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/rnnt_token_fst.py deleted file mode 100644 index cc6def1703311ab700a4a01f22c1adda32db9b0d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/rnnt_token_fst.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, 0, phone, phone)) -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/s2eps.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/s2eps.pl deleted file mode 100644 index ffeeb8eb6af3c4f319f31ebff80be388d8f59e1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/fst/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the 
"License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces and with (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } - if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } - } - print join("\t", @A) . "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/git-pre-commit b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/git-pre-commit deleted file mode 100644 index b6e448ed375a0ddf502ce332685de8a99e88dc08..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/git-pre-commit +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -e - -echo "Running pre-commit flake8" -python tools/flake8_hook.py diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/install_srilm.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/install_srilm.sh deleted file mode 100644 index 4aa113c14722a73fd3d3f84430025d44173c207b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/install_srilm.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2022 Binbin Zhang(binbzha@qq.com) - -current_path=`pwd` -current_dir=`basename "$current_path"` - -if [ "tools" != "$current_dir" ]; then - echo "You should run this script in tools/ directory!!" - exit 1 -fi - -! command -v gawk > /dev/null && \ - echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; - -srilm_url="https://github.com/BitSpeech/SRILM/archive/refs/tags/1.7.3.tar.gz" - -if [ ! -f ./srilm.tar.gz ]; then - if ! wget -O ./srilm.tar.gz "$srilm_url"; then - echo 'There was a problem downloading the file.' - echo 'Check you internet connection and try again.' - exit 1 - fi -fi - -tar -zxvf srilm.tar.gz -mv SRILM-1.7.3 srilm - -# set the SRILM variable in the top-level Makefile to this directory. -cd srilm -cp Makefile tmpf - -cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \ - > Makefile || exit 1 -rm tmpf - -make || exit -cd .. - -( - [ ! -z "${SRILM}" ] && \ - echo >&2 "SRILM variable is aleady defined. Undefining..." && \ - unset SRILM - - [ -f ./env.sh ] && . ./env.sh - - [ ! 
-z "${SRILM}" ] && \ - echo >&2 "SRILM config is already in env.sh" && exit - - wd=`pwd` - wd=`readlink -f $wd || pwd` - - echo "export SRILM=$wd/srilm" - dirs="\${PATH}" - for directory in $(cd srilm && find bin -type d ) ; do - dirs="$dirs:\${SRILM}/$directory" - done - echo "export PATH=$dirs" -) >> env.sh - -echo >&2 "Installation of SRILM finished successfully" -echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/k2/make_hlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/k2/make_hlg.sh deleted file mode 100644 index 18c2268487410824ae11b199cf06f37acd717c88..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/k2/make_hlg.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) - -lexion_dir=$1 -lm_dir=$2 -tgt_dir=$3 - -# k2 and icefall updates very fast. Below commits are veryfied in this script. -# k2 3dc222f981b9fdbc8061b3782c3b385514a2d444, icefall 499ac24ecba64f687ff244c7d66baa5c222ecf0f - -# For k2 installation, please refer to https://github.com/k2-fsa/k2/ -python -c "import k2; print(k2.__file__)" -python -c "import torch; import _k2; print(_k2.__file__)" - -# Prepare necessary icefall scripts -if [ ! -d tools/k2/icefall ]; then - git clone --depth 1 https://github.com/k2-fsa/icefall.git tools/k2/icefall -fi -pip3 install -r tools/k2/icefall/requirements.txt -export PYTHONPATH=`pwd`/tools/k2/icefall:`pwd`/tools/k2/icefall/egs/aishell/ASR/local:$PYTHONPATH - -# 8.1 Prepare char based lang -mkdir -p $tgt_dir -python tools/k2/prepare_char.py $lexion_dir/units.txt $lm_dir/wordlist $tgt_dir -echo "Compile lexicon L.pt L_disambig.pt succeeded" - -# 8.2 Prepare G -mkdir -p data/lm -python -m kaldilm \ - --read-symbol-table="$tgt_dir/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $lm_dir/lm.arpa > data/lm/G_3_gram.fst.txt - -# 8.3 Compile HLG -python tools/k2/icefall/egs/aishell/ASR/local/compile_hlg.py --lang-dir $tgt_dir -echo "Compile decoding graph HLG.pt succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/k2/prepare_char.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/k2/prepare_char.py deleted file mode 100644 index 6e05042c42eb280135f6be7cdb3566b185258b90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/k2/prepare_char.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - - -""" - -This script generates the following files in the directory sys.argv[3]: - - - lexicon.txt - - lexicon_disambig.txt - - L.pt - - L_disambig.pt - - tokens.txt - - words.txt -""" - -import sys -from pathlib import Path -from typing import Dict, List - -import k2 -import torch -from prepare_lang import ( - Lexicon, - add_disambig_symbols, - add_self_loops, - write_lexicon, - write_mapping, -) - - -def lexicon_to_fst_no_sil( - lexicon: Lexicon, - token2id: Dict[str, int], - word2id: Dict[str, int], - need_self_loops: bool = False, -) -> k2.Fsa: - """Convert a lexicon to an FST (in k2 format). - - Args: - lexicon: - The input lexicon. See also :func:`read_lexicon` - token2id: - A dict mapping tokens to IDs. - word2id: - A dict mapping words to IDs. - need_self_loops: - If True, add self-loop to states with non-epsilon output symbols - on at least one arc out of the state. The input label for this - self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. - Returns: - Return an instance of `k2.Fsa` representing the given lexicon. - """ - loop_state = 0 # words enter and leave from here - next_state = 1 # the next un-allocated state, will be incremented as we go - - arcs = [] - - # The blank symbol is defined in local/train_bpe_model.py - assert token2id[""] == 0 - assert word2id[""] == 0 - - eps = 0 - - for word, pieces in lexicon: - assert len(pieces) > 0, f"{word} has no pronunciations" - cur_state = loop_state - - word = word2id[word] - pieces = [ - token2id[i] if i in token2id else token2id[""] for i in pieces - ] - - for i in range(len(pieces) - 1): - w = word if i == 0 else eps - arcs.append([cur_state, next_state, pieces[i], w, 0]) - - cur_state = next_state - next_state += 1 - - # now for the last piece of this word - i = len(pieces) - 1 - w = word if i == 0 else eps - arcs.append([cur_state, loop_state, pieces[i], w, 0]) - - if need_self_loops: - disambig_token = token2id["#0"] - disambig_word = word2id["#0"] - arcs = add_self_loops( - arcs, - disambig_token=disambig_token, - disambig_word=disambig_word, - ) - - final_state = next_state - arcs.append([loop_state, final_state, -1, -1, 0]) - arcs.append([final_state]) - - arcs = sorted(arcs, key=lambda arc: arc[0]) - arcs = [[str(i) for i in arc] for arc in arcs] - arcs = [" ".join(arc) for arc in arcs] - arcs = "\n".join(arcs) - - fsa = k2.Fsa.from_str(arcs, acceptor=False) - return fsa - - -def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool: - """Check if all the given tokens are in token symbol table. - - Args: - token_sym_table: - Token symbol table that contains all the valid tokens. - tokens: - A list of tokens. - Returns: - Return True if there is any token not in the token_sym_table, - otherwise False. - """ - for tok in tokens: - if tok not in token_sym_table: - return True - return False - - -def generate_lexicon( - token_sym_table: Dict[str, int], words: List[str] -) -> Lexicon: - """Generate a lexicon from a word list and token_sym_table. - - Args: - token_sym_table: - Token symbol table that mapping token to token ids. - words: - A list of strings representing words. - Returns: - Return a dict whose keys are words and values are the corresponding - tokens. 
- """ - lexicon = [] - for word in words: - chars = list(word.strip(" \t")) - if contain_oov(token_sym_table, chars): - continue - lexicon.append((word, chars)) - - # The OOV word is - lexicon.append(("", [""])) - return lexicon - - -def generate_tokens(text_file: str) -> Dict[str, int]: - """Generate tokens from the given text file. - - Args: - text_file: - A file that contains text lines to generate tokens. - Returns: - Return a dict whose keys are tokens and values are token ids ranged - from 0 to len(keys) - 1. - """ - token2id: Dict[str, int] = dict() - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - char, index = line.replace('\n', '').split() - assert char not in token2id - token2id[char] = int(index) - assert token2id[''] == 0 - return token2id - - -def generate_words(text_file: str) -> Dict[str, int]: - """Generate words from the given text file. - - Args: - text_file: - A file that contains text lines to generate words. - Returns: - Return a dict whose keys are words and values are words ids ranged - from 0 to len(keys) - 1. - """ - words = [] - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - word = line.replace('\n', '') - assert word not in words - words.append(word) - words.sort() - - # We put '' '' at begining of word2id - # '#0', '', '' at end of word2id - words = [word for word in words - if word not in ['', '', '#0', '', '']] - words.insert(0, '') - words.insert(1, '') - words.append('#0') - words.append('') - words.append('') - word2id = {j: i for i, j in enumerate(words)} - return word2id - - -def main(): - token2id = generate_tokens(sys.argv[1]) - word2id = generate_words(sys.argv[2]) - tgt_dir = Path(sys.argv[3]) - - words = [word for word in word2id.keys() - if word not in - ["", "!SIL", "", "", "#0", "", ""]] - lexicon = generate_lexicon(token2id, words) - - lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) - next_token_id = max(token2id.values()) + 1 - for i in range(max_disambig + 1): - disambig = f"#{i}" - assert disambig not in token2id - token2id[disambig] = next_token_id - next_token_id += 1 - - write_mapping(tgt_dir / "tokens.txt", token2id) - write_mapping(tgt_dir / "words.txt", word2id) - write_lexicon(tgt_dir / "lexicon.txt", lexicon) - write_lexicon(tgt_dir / "lexicon_disambig.txt", lexicon_disambig) - - L = lexicon_to_fst_no_sil( - lexicon, - token2id=token2id, - word2id=word2id, - ) - L_disambig = lexicon_to_fst_no_sil( - lexicon_disambig, - token2id=token2id, - word2id=word2id, - need_self_loops=True, - ) - torch.save(L.as_dict(), tgt_dir / "L.pt") - torch.save(L_disambig.as_dict(), tgt_dir / "L_disambig.pt") - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/latency_metrics.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/latency_metrics.py deleted file mode 100644 index df2d8eee45f8e2d7c8536f208d44fafaeac3341f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/latency_metrics.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2022 Horizon Inc. (author: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import argparse -import logging -import librosa -import torch -import torchaudio -import yaml - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.font_manager as fm -import torchaudio.compliance.kaldi as kaldi - -from wenet.utils.init_model import init_model -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.mask import make_pad_mask -from wenet.utils.common import replace_duplicates_with_blank - - -def get_args(): - parser = argparse.ArgumentParser( - description='Analyze latency and plot CTC-Spike.') - parser.add_argument('--config', required=True, - type=str, help='configration') - parser.add_argument('--gpu', - type=int, - default=0, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--ckpt', required=True, - type=str, help='model checkpoint') - parser.add_argument('--tag', required=True, - type=str, help='image subtitle') - parser.add_argument('--wavscp', required=True, - type=str, help='wav.scp') - parser.add_argument('--alignment', required=True, - type=str, help='force alignment, generated by Kaldi.') - parser.add_argument('--chunk_size', required=True, - type=int, help='chunk size') - parser.add_argument('--left_chunks', default=-1, - type=int, help='left chunks') - parser.add_argument('--font', required=True, - type=str, help='font file') - parser.add_argument('--dict', required=True, - type=str, help='dict file') - parser.add_argument('--result_dir', required=True, - type=str, help='saving pdf') - parser.add_argument('--model_type', default='ctc', - choices=['ctc', 'transducer'], - help='show latency metrics from ctc models or rnn-t models') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - torch.manual_seed(777) - - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - symbol_table = read_symbol_table(args.dict) - char_dict = {v: k for k, v in symbol_table.items()} - - # 1. Load model - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - - model = init_model(conf) - load_checkpoint(model, args.ckpt) - model = model.eval().to(device) - - subsampling = model.encoder.embed.subsampling_rate - eos = model.eos_symbol() - - with open(args.wavscp, 'r') as fin: - wavs = fin.readlines() - - # 2. 
Forward model (get streaming_timestamps) - timestamps = {} - for idx, wav in enumerate(wavs): - if idx % 100 == 0: - logging.info("processed {}.".format(idx)) - key, wav = wav.strip().split(' ', 1) - waveform, sr = torchaudio.load(wav) - resample_rate = conf['dataset_conf']['resample_conf']['resample_rate'] - waveform = torchaudio.transforms.Resample( - orig_freq=sr, new_freq=resample_rate)(waveform) - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank( - waveform, - num_mel_bins=conf['dataset_conf']['fbank_conf']['num_mel_bins'], - frame_length=conf['dataset_conf']['fbank_conf']['frame_length'], - frame_shift=conf['dataset_conf']['fbank_conf']['frame_shift'], - dither=0.0, energy_floor=0.0, - sample_frequency=resample_rate, - ) - - speech = mat.unsqueeze(0).to(device) - speech_lengths = torch.tensor([mat.size(0)]).to(device) - - # Let's assume batch_size = 1 - encoder_out, encoder_mask = model.encoder( - speech, speech_lengths, args.chunk_size, args.left_chunks) - - maxlen = encoder_out.size(1) # (B, maxlen, encoder_dim) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # CTC greedy search - if args.model_type == 'ctc': - ctc_probs = model.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - topk_prob = topk_prob.view(1, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen) - topk_prob = topk_prob.masked_fill_(mask, 0.0) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - hyps = [replace_duplicates_with_blank(hyp) for hyp in hyps] - scores = [prob.tolist() for prob in topk_prob] - timestamps[key] = [hyps[0], scores[0], wav] - - if args.model_type == 'transducer': - hyps = [] - scores = [] - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = 1 - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, - padding, cache) - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - scores.append(torch.max(joint_out_probs).item()) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or \ - per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - hyps.append(model.blank) - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - timestamps[key] = [hyps, scores, wav] - - # 3. 
Analyze latency - with open(args.alignment, 'r') as fin: - aligns = fin.readlines() - not_found, len_unequal, ignored = 0, 0, 0 - datas = [] - for align in aligns: - key, align = align.strip().split(' ', 1) - if key not in timestamps: - not_found += 1 - continue - fa, st = [], [] # force_alignment, streaming_timestamps - text_fa, text_st = "", "" - for i, token in enumerate(align.split()): - if token != '': - text_fa += token - # NOTE(xcsong): W/O subsample - fa.append(i * 10) - # ignore alignment_errors >= 70ms - frames_fa = len(align.split()) - frames_st = len(timestamps[key][0]) * subsampling - if abs(frames_st - frames_fa) >= 7: - ignored += 1 - continue - for i, token_id in enumerate(timestamps[key][0]): - if token_id != 0: - text_st += char_dict[token_id] - # NOTE(xcsong): W subsample - st.append(i * subsampling * 10) - if len(fa) != len(st): - len_unequal += 1 - continue - # datas[i] = [key, text_fa, text_st, list_of_diff, - # FirstTokenDelay, LastTokenDelay, AvgTokenDelay, - # streaming_timestamps, force_alignment] - datas.append([key, text_fa, text_st, - [a - b for a, b in zip(st, fa)], - st[0] - fa[0], st[-1] - fa[-1], - (sum(st) - sum(fa)) / len(st), - timestamps[key], align.split()]) - - logging.info("not found: {}, length unequal: {}, ignored: {}, \ - valid samples: {}".format(not_found, len_unequal, ignored, len(datas))) - - # 4. Plot and print - num_datas = len(datas) - names = ['FirstTokenDelay', 'LastTokenDelay', 'AvgTokenDelay'] - names_index = [4, 5, 6] - parts = ['max', 'P90', 'P75', 'P50', 'P25', 'min'] - parts_index = [num_datas - 1, int(num_datas * 0.90), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - for name, name_idx in zip(names, names_index): - def f(name_idx=name_idx): - return name_idx - datas.sort(key=lambda x: x[f()]) - logging.info("==========================") - for p, i in zip(parts, parts_index): - data = datas[i] - # i.e., LastTokenDelay P90: 270.000 ms (wav_id: BAC009S0902W0144) - logging.info("{} {}: {:.3f} ms (wav_id: {})".format( - name, p, data[f()], datas[i][0])) - - font = fm.FontProperties(fname=args.font) - plt.rcParams['axes.unicode_minus'] = False - # we will have 2 sub-plots (force-align + streaming timestamps) - # plus one wav-plot - fig, axes = plt.subplots(figsize=(60, 60), nrows=3, ncols=1) - for j in range(2): - if j == 0: - # subplot-0: streaming_timestamps - plt_prefix = args.tag + "_" + name + "_" + p - x = np.arange(len(data[7][0])) * subsampling - hyps, scores = data[7][0], data[7][1] - else: - # subplot-1: force_alignments - plt_prefix = "force_alignment" - x = np.arange(len(data[8])) - hyps = [symbol_table[d] for d in data[8]] - scores = [0.0] * len(data[8]) - axes[j].set_title(plt_prefix, fontsize=30) - for frame, token, prob in zip(x, hyps, scores): - if char_dict[token] != '': - axes[j].bar( - frame, np.exp(prob), - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].text( - frame, np.exp(prob), - '{} {:.3f} {}'.format( - char_dict[token], np.exp(prob), frame), - fontdict=dict(fontsize=24), - fontproperties=font, - ) - else: - axes[j].bar( - frame, 0.01, - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].tick_params(labelsize=25) - - # subplot-2: wav - # wav, hardcode sample_rate to 16000 - samples, sr = librosa.load(data[7][2], sr=16000) - time = np.arange(0, len(samples)) * (1.0 / sr) - axes[-1].plot(time, samples) - - # i.e., RESULT_DIR/LTD_P90_120ms_BAC009S0768W0342.pdf - plt.savefig(args.result_dir + "/" + name + "_" + - p + "_" + str(data[f()]) 
+ "ms" + "_" + data[0] + ".pdf") - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/make_raw_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/make_raw_list.py deleted file mode 100644 index 2f84f015542bb38da027b8ea61e8638f873cec33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/make_raw_list.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('output_file', help='output list file') - args = parser.parse_args() - - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - if args.segments is not None: - segments_table = {} - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - with open(args.text_file, 'r', encoding='utf8') as fin, \ - open(args.output_file, 'w', encoding='utf8') as fout: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if args.segments is None: - assert key in wav_table - wav = wav_table[key] - line = dict(key=key, wav=wav, txt=txt) - else: - assert key in segments_table - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - line = dict(key=key, wav=wav, txt=txt, start=start, end=end) - json_line = json.dumps(line, ensure_ascii=False) - fout.write(json_line + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/make_shard_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/make_shard_list.py deleted file mode 100644 index 1f7d82829808c9cc181bbc5e0f60cccef8795bae..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/make_shard_list.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import io -import logging -import os -import tarfile -import time -import multiprocessing - -import torch -import torchaudio -import torchaudio.backend.sox_io_backend as sox - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def write_tar_file(data_list, - no_segments, - tar_file, - resample=16000, - index=0, - total=1): - logging.info('Processing {} {}/{}'.format(tar_file, index, total)) - read_time = 0.0 - save_time = 0.0 - write_time = 0.0 - with tarfile.open(tar_file, "w") as tar: - prev_wav = None - for item in data_list: - if no_segments: - key, txt, wav = item - else: - key, txt, wav, start, end = item - - suffix = wav.split('.')[-1] - assert suffix in AUDIO_FORMAT_SETS - if no_segments: - ts = time.time() - with open(wav, 'rb') as fin: - data = fin.read() - read_time += (time.time() - ts) - else: - if wav != prev_wav: - ts = time.time() - waveforms, sample_rate = sox.load(wav, normalize=False) - read_time += (time.time() - ts) - prev_wav = wav - start = int(start * sample_rate) - end = int(end * sample_rate) - audio = waveforms[:1, start:end] - - # resample - if sample_rate != resample: - if not audio.is_floating_point(): - # normalize the audio before resample - # because resample can't process int audio - audio = audio / (1 << 15) - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - audio = (audio * (1 << 15)).short() - else: - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - - ts = time.time() - f = io.BytesIO() - sox.save(f, audio, resample, format="wav", bits_per_sample=16) - # Save to wav for segments file - suffix = "wav" - f.seek(0) - data = f.read() - save_time += (time.time() - ts) - - assert isinstance(txt, str) - ts = time.time() - txt_file = key + '.txt' - txt = txt.encode('utf8') - txt_data = io.BytesIO(txt) - txt_info = tarfile.TarInfo(txt_file) - txt_info.size = len(txt) - tar.addfile(txt_info, txt_data) - - wav_file = key + '.' 
+ suffix - wav_data = io.BytesIO(data) - wav_info = tarfile.TarInfo(wav_file) - wav_info.size = len(data) - tar.addfile(wav_info, wav_data) - write_time += (time.time() - ts) - logging.info('read {} save {} write {}'.format(read_time, save_time, - write_time)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--num_utts_per_shard', - type=int, - default=1000, - help='num utts per shard') - parser.add_argument('--num_threads', - type=int, - default=1, - help='num threads for make shards') - parser.add_argument('--prefix', - default='shards', - help='prefix of shards tar file') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('--resample', - type=int, - default=16000, - help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('shards_dir', help='output shards dir') - parser.add_argument('shards_list', help='output shards list file') - args = parser.parse_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - torch.set_num_threads(1) - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - no_segments = True - segments_table = {} - if args.segments is not None: - no_segments = False - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - data = [] - with open(args.text_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if no_segments: - assert key in wav_table - wav = wav_table[key] - data.append((key, txt, wav)) - else: - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - data.append((key, txt, wav, start, end)) - - num = args.num_utts_per_shard - chunks = [data[i:i + num] for i in range(0, len(data), num)] - os.makedirs(args.shards_dir, exist_ok=True) - - # Using thread pool to speedup - pool = multiprocessing.Pool(processes=args.num_threads) - shards_list = [] - tasks_list = [] - num_chunks = len(chunks) - for i, chunk in enumerate(chunks): - tar_file = os.path.join(args.shards_dir, - '{}_{:09d}.tar'.format(args.prefix, i)) - shards_list.append(tar_file) - pool.apply_async( - write_tar_file, - (chunk, no_segments, tar_file, args.resample, i, num_chunks)) - - pool.close() - pool.join() - - with open(args.shards_list, 'w', encoding='utf8') as fout: - for name in shards_list: - fout.write(name + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/merge_scp2txt.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/merge_scp2txt.py deleted file mode 100644 index 51f1c42f272f0fd9fec0a7d69ee860d2f1eb6158..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/merge_scp2txt.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -from distutils.util import strtobool -from io import open -import logging -import sys - -PY2 = sys.version_info[0] == 2 -sys.stdin = 
codecs.getreader('utf-8')(sys.stdin if PY2 else sys.stdin.buffer) -sys.stdout = codecs.getwriter('utf-8')( - sys.stdout if PY2 else sys.stdout.buffer) - - -# Special types: -def shape(x): - """Change str to List[int] - - >>> shape('3,5') - [3, 5] - >>> shape(' [3, 5] ') - [3, 5] - - """ - - # x: ' [3, 5] ' -> '3, 5' - x = x.strip() - if x[0] == '[': - x = x[1:] - if x[-1] == ']': - x = x[:-1] - - return list(map(int, x.split(','))) - - -def get_parser(): - parser = argparse.ArgumentParser( - description='Given each file paths with such format as ' - '::. type> can be omitted and the default ' - 'is "str". e.g. {} ' - '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' - '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' - '--output-scps text:data/text shape:data/utt2text_shape:shape ' - '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--input-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the inputs') - parser.add_argument('--output-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the outputs') - parser.add_argument('--scps', - type=str, - nargs='+', - default=[], - help='The files except for the input and outputs') - parser.add_argument('--verbose', - '-V', - default=1, - type=int, - help='Verbose option') - parser.add_argument('--allow-one-column', - type=strtobool, - default=False, - help='Allow one column in input scp files. ' - 'In this case, the value will be empty string.') - parser.add_argument('--out', - '-O', - type=str, - help='The output filename. ' - 'If omitted, then output to sys.stdout') - return parser - - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - args.scps = [args.scps] - - # logging info - logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - if args.verbose > 0: - logging.basicConfig(level=logging.INFO, format=logfmt) - else: - logging.basicConfig(level=logging.WARN, format=logfmt) - - inputs = {} - assert (len(args.input_scps) == 1) - for f in args.input_scps[0]: - arr = f.strip().split(':') - inputs[arr[0]] = arr[1] - assert ('feat' in inputs) - assert ('shape' in inputs) - - outputs = {} - assert (len(args.output_scps) == 1) - for f in args.output_scps[0]: - arr = f.strip().split(':') - outputs[arr[0]] = arr[1] - assert ('shape' in outputs) - assert ('text' in outputs) - assert ('token' in outputs) - assert ('tokenid' in outputs) - - files = [ - inputs['feat'], inputs['shape'], outputs['text'], outputs['token'], - outputs['tokenid'], outputs['shape'] - ] - fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape'] - fids = [open(f, 'r', encoding='utf-8') for f in files] - - if args.out is None: - out = sys.stdout - else: - out = open(args.out, 'w', encoding='utf-8') - done = False - while not done: - for i, fid in enumerate(fids): - line = fid.readline() - if line == '': - done = True - break - arr = line.strip().split() - content = ' '.join(arr[1:]) - if i == 0: - out.write('utt:{}'.format(arr[0])) - out.write('\t') - out.write('{}:{}'.format(fields[i], content)) - out.write('\n') - - for f in fids: - f.close() - if args.out is not None: - out.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/onnx2horizonbin.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/onnx2horizonbin.py deleted file mode 100644 index 
a94b647fb19d1446d4bc506c399c85677dddde9f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/onnx2horizonbin.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. specific decoding method: ctc_greedy_search -""" - -import argparse -import copy -import logging -import os -import sys -import random -import torch -import yaml -import numpy as np - -from torch.utils.data import DataLoader - -from wenet.utils.common import remove_duplicates_and_blank -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import to_numpy -from wenet.bin.export_onnx_bpu import export_encoder, export_ctc - - -try: - import hbdk # noqa: F401 - import horizon_nn # noqa: F401 - from horizon_tc_ui import HB_ONNXRuntime -except ImportError: - print('Please install hbdk,horizon_nn,horizon_tc_ui !') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def save_data(tensor, dirs, prefix): - if tensor.requires_grad: - data = tensor.detach().numpy().astype(np.float32) - else: - data = tensor.numpy().astype(np.float32) - os.makedirs(dirs, exist_ok=True) - data.tofile(dirs + "/" + prefix + ".bin") - - -def make_calibration_data(enc, args, conf): - conf['shuffle'] = True - logger.info(conf) - dataset = Dataset( - "shard", args.cali_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - cal_data_dir = os.path.join(args.output_dir, 
'cal_data_dir') - for batch_idx, batch in enumerate(dataloader): - if batch_idx >= args.max_samples: - break - if batch_idx % 100 == 0: - logger.info("processed {} samples.".format(batch_idx)) - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - - # Feed forward overlap input step by step - random_high = (num_frames - context) // stride - num_rand = random.randint(0, random_high) - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - if i == num_rand: - save_data(chunk, "{}/chunk".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_cache, "{}/att_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(cnn_cache, "{}/cnn_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_mask, "{}/att_mask".format(cal_data_dir), - prefix + "." + str(i)) - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - # NOTE(xcsong): It's fast to calibrate ctc.onnx, - # so it's okay to save all chunks - save_data(y, "{}/hidden".format(cal_data_dir), - prefix + "." 
+ str(i)) - - -def check_wer(enc, ctc, args, conf): - conf['shuffle'] = False - dataset = Dataset( - "shard", args.wer_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - char_dict = {v: k for k, v in args.symbol_table.items()} - eos = len(char_dict) - 1 - - enc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_encoder/encoder_quantized_model.onnx") - ctc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_ctc/ctc_quantized_model.onnx") - torch_file = open(args.output_dir + "/torch_text", 'w', encoding="utf-8") - onnx_file = open(args.output_dir + "/onnx_text", 'w', encoding="utf-8") - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - for batch_idx, batch in enumerate(dataloader): - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - - # Feed forward overlap input step by step - torch_out, onnx_out = [], [] - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - # Torch model - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - torch_out.append(ctc.forward(y).transpose(1, 3).squeeze(2)) - # Quantized onnx model - ort_inputs = { - 'chunk': to_numpy(chunk), 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': to_numpy(att_mask)} - ort_outs = enc_session.run_feature( - enc_session.output_names, ort_inputs, input_offset=0) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_y = ctc_session.run_feature( - ctc_session.output_names, {'hidden': ort_outs[0]}, input_offset=0) - onnx_out.append(torch.from_numpy( - np.squeeze(onnx_y[0].transpose(0, 3, 2, 1), axis=2))) - - def post_process(list_out, file_obj, keys): - probs = torch.cat(list_out, dim=1) - maxlen = probs.size(1) - topk_prob, topk_index = probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - hyps = 
[hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - for i, key in enumerate(keys): - content = '' - for w in hyps[i]: - if w == eos: - break - content += char_dict[w] - file_obj.write('{} {}\n'.format(key, content)) - return key, content - - if len(torch_out) > 0 and len(onnx_out) > 0: - key, content = post_process(torch_out, torch_file, keys) - logger.info('torch: {} {}'.format(key, content)) - key, content = post_process(onnx_out, onnx_file, keys) - logger.info('onnx : {} {}'.format(key, content)) - torch_file.close() - onnx_file.close() - - -def generate_config(enc_session, ctc_session, args): - template = """ -# 模型参数组 -model_parameters: - # 原始Onnx浮点模型文件 - onnx_model: '{}' - # 转换的目标AI芯片架构 - march: 'bernoulli2' - # 模型转换输出的用于上板执行的模型文件的名称前缀 - output_model_file_prefix: '{}' - # 模型转换输出的结果的存放目录 - working_dir: '{}' - # 指定转换后混合异构模型是否保留输出各层的中间结果的能力 - layer_out_dump: False - # 转换过程中日志生成级别 - log_level: 'debug' -# 输入信息参数组 -input_parameters: - # 原始浮点模型的输入节点名称 - input_name: '{}' - # 原始浮点模型的输入数据格式(数量/顺序与input_name一致) - input_type_train: '{}' - # 原始浮点模型的输入数据排布(数量/顺序与input_name一致) - input_layout_train: '{}' - # 原始浮点模型的输入数据尺寸 - input_shape: '{}' - # 网络实际执行时,输入给网络的batch_size 默认值为1 - # input_batch: 1 - # 在模型中添加的输入数据预处理方法 - norm_type: '{}' - # 预处理方法的图像减去的均值; 如果是通道均值,value之间必须用空格分隔 - # mean_value: '' - # 预处理方法的图像缩放比例,如果是通道缩放比例,value之间必须用空格分隔 - # scale_value: '' - # 转换后混合异构模型需要适配的输入数据格式(数量/顺序与input_name一致) - input_type_rt: '{}' - # 输入数据格式的特殊制式 - input_space_and_range: '' - # 转换后混合异构模型需要适配的输入数据排布(数量/顺序与input_name一致) - input_layout_rt: '{}' -# 校准参数组 -calibration_parameters: - # 模型校准使用的标定样本的存放目录 - cal_data_dir: '{}' - # 开启图片校准样本自动处理(skimage read resize到输入节点尺寸) - preprocess_on: False - # 校准使用的算法类型 - calibration_type: '{}' - # max 校准方式的参数 - max_percentile: 1.0 - # 强制指定OP在CPU上运行 - run_on_cpu: '{}' - # 强制指定OP在BPU上运行 - run_on_bpu: '{}' -# 编译参数组 -compiler_parameters: - # 编译策略选择 - compile_mode: 'latency' - # 是否打开编译的debug信息 - debug: False - # 模型运行核心数 - core_num: 1 - # 模型编译的优化等级选择 - optimize_level: 'O3' -""" - output_dir = os.path.realpath(args.output_dir) - cal_data_dir = os.path.join(output_dir, 'cal_data_dir') - os.makedirs(cal_data_dir, exist_ok=True) - enc_dic = enc_session.get_modelmeta().custom_metadata_map - enc_onnx_path = os.path.join(output_dir, 'encoder.onnx') - enc_log_path = os.path.join(output_dir, 'hb_makertbin_output_encoder') - enc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in enc_dic['input_name'].split(';')]) - ctc_dic = ctc_session.get_modelmeta().custom_metadata_map - ctc_onnx_path = os.path.join(output_dir, 'ctc.onnx') - ctc_log_path = os.path.join(output_dir, 'hb_makertbin_output_ctc') - ctc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in ctc_dic['input_name'].split(';')]) - enc_config = template.format( - enc_onnx_path, "encoder", enc_log_path, - enc_dic['input_name'], enc_dic['input_type'], - enc_dic['input_layout_train'], enc_dic['input_shape'], - enc_dic['norm_type'], enc_dic['input_type'], enc_dic['input_layout_rt'], - enc_cal_data, args.calibration_type, args.extra_ops_run_on_cpu, "") - ctc_config = template.format( - ctc_onnx_path, "ctc", ctc_log_path, - ctc_dic['input_name'], ctc_dic['input_type'], - ctc_dic['input_layout_train'], ctc_dic['input_shape'], - ctc_dic['norm_type'], ctc_dic['input_type'], ctc_dic['input_layout_rt'], - ctc_cal_data, "default", "", "") - with open(output_dir + "/config_encoder.yaml", "w") as enc_yaml: - enc_yaml.write(enc_config) - with open(output_dir + 
"/config_ctc.yaml", "w") as ctc_yaml: - ctc_yaml.write(ctc_config) - - -def get_args(): - parser = argparse.ArgumentParser(description='convert onnx to horizon .bin') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - parser.add_argument('--dict', type=str, required=True, help='dict file') - parser.add_argument('--max_samples', type=int, required=True, - help='maximum samples') - parser.add_argument('--cali_datalist', type=str, default=None, - help='make calibration data') - parser.add_argument('--wer_datalist', type=str, default=None, - help='check wer') - parser.add_argument('--wer_text', type=str, default=None, - help='check wer') - parser.add_argument('--bpe_model', default=None, type=str, - help='bpe model for english part') - parser.add_argument('--ln_run_on_bpu', action='store_true', - help='layernorm running on bpu') - parser.add_argument('--extra_ops_run_on_cpu', type=str, default=None, - help='extra operations running on cpu.') - parser.add_argument('--calibration_type', type=str, default='default', - help='kl / max / default.') - return parser - - -if __name__ == '__main__': - random.seed(777) - parser = get_args() - args = parser.parse_args() - # NOTE(xcsong): X3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(conf) - load_checkpoint(model, args.checkpoint) - model.eval() - - symbol_table = read_symbol_table(args.dict) - args.symbol_table = symbol_table - args.feature_size = conf['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - logger.info("Stage-1: Export onnx") - enc, enc_session = export_encoder(model, args) - ctc, ctc_session = export_ctc(model, args) - - conf = copy.deepcopy(conf['dataset_conf']) - conf['filter_conf']['max_length'] = 102400 - conf['filter_conf']['min_length'] = 0 - conf['filter_conf']['token_max_length'] = 102400 - conf['filter_conf']['token_min_length'] = 0 - conf['filter_conf']['max_output_input_ratio'] = 102400 - conf['filter_conf']['min_output_input_ratio'] = 0 - conf['speed_perturb'] = False - conf['spec_aug'] = False - conf['spec_sub'] = False - conf['spec_trim'] = False - conf['shuffle'] = False - conf['sort'] = False - if 'fbank_conf' in conf: - conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in conf: - conf['mfcc_conf']['dither'] = 0.0 - conf['batch_conf']['batch_type'] = "static" - conf['batch_conf']['batch_size'] = 1 - - if args.cali_datalist is not None: - logger.info("Stage-2: Generate config") - # FIXME(xcsong): Remove hard code - logger.info("torch version: {}".format(torch.__version__)) - if int(torch.__version__[:4].split('.')[1]) >= 13: - args.extra_ops_run_on_cpu = "/Split;" + \ - "/encoders.0/self_attn/Split;/encoders.1/self_attn/Split;" + \ - 
"/encoders.2/self_attn/Split;/encoders.3/self_attn/Split;" + \ - "/encoders.4/self_attn/Split;/encoders.5/self_attn/Split;" + \ - "/encoders.6/self_attn/Split;/encoders.7/self_attn/Split;" + \ - "/encoders.8/self_attn/Split;/encoders.9/self_attn/Split;" + \ - "/encoders.10/self_attn/Split;/encoders.11/self_attn/Split;" + \ - "/encoders.0/self_attn/Mul;/encoders.1/self_attn/Mul;" + \ - "/encoders.2/self_attn/Mul;/encoders.3/self_attn/Mul;" + \ - "/encoders.4/self_attn/Mul;/encoders.5/self_attn/Mul;" + \ - "/encoders.6/self_attn/Mul;/encoders.7/self_attn/Mul;" + \ - "/encoders.8/self_attn/Mul;/encoders.9/self_attn/Mul;" + \ - "/encoders.10/self_attn/Mul;/encoders.11/self_attn/Mul;" - else: - args.extra_ops_run_on_cpu = "Split_17;Split_67;Split_209;" + \ - "Split_351;Split_493;Split_635;Split_777;Split_919;Split_1061;" + \ - "Split_1203;Split_1345;Split_1487;Split_1629;" + \ - "Mul_72;Mul_214;Mul_356;Mul_498;Mul_640;Mul_782;" + \ - "Mul_924;Mul_1066;Mul_1208;Mul_1350;Mul_1492;Mul_1634;" - generate_config(enc_session, ctc_session, args) - - logger.info("Stage-3: Make calibration data") - make_calibration_data(enc, args, conf) - - output_dir = os.path.realpath(args.output_dir) - logger.info("Stage-4: Make ctc.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_ctc".format(output_dir) + - " && cd hb_makertbin_log_ctc &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_ctc.yaml") - ) - logger.info("Stage-5: Make encoder.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_encoder ".format(output_dir) + - " && cd hb_makertbin_log_encoder &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_encoder.yaml") - ) - - if args.wer_datalist is not None: - logger.info("Stage-6: Check wer between torch model and quantized onnx") - assert args.wer_text is not None - check_wer(enc, ctc, args, conf) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/torch_text", - args.output_dir + "/torch_wer") - ) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/onnx_text", - args.output_dir + "/onnx_wer") - ) - os.system("tail {} {}".format( - args.output_dir + "/torch_wer", args.output_dir + "/onnx_wer")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/parse_options.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/parse_options.sh deleted file mode 100644 index 34476fdb37a4b14d5fe6e0edbebe97e760d2be5a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- - -# Parse command-line options. -# To be sourced by another script (as in ". parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/perturb_data_dir_speed.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/perturb_data_dir_speed.sh deleted file mode 100644 index 901a4882e6481ae269067b0fe7175dba62c4db9e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/perturb_data_dir_speed.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# 2020 @kamo-naoyuki -# This file was copied from Kaldi and -# I deleted parts related to wav duration -# because we shouldn't use kaldi's command here -# and we don't need the files actually. 
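The parse_options.sh helper removed above implements the Kaldi convention that a flag `--option-name arg` sets the shell variable `option_name` to `arg`, rejecting names that were not pre-declared and stopping at the first positional argument. A rough Python equivalent of that convention, purely as an illustration (the function name and the defaults shown are hypothetical, not part of this repo, and the sketch omits `--help` and `--name=value` handling):

```python
# Illustrative only: mirrors the Kaldi-style "--option-name value" convention
# implemented by the removed parse_options.sh (simplified sketch).
import sys


def parse_kaldi_options(argv, defaults):
    """Apply "--option-name value" pairs to a dict of pre-declared defaults.

    Like parse_options.sh, unknown option names are rejected, and parsing
    stops at the first positional argument.
    """
    opts = dict(defaults)
    args = list(argv)
    while args and args[0].startswith("--"):
        flag = args.pop(0)
        name = flag[2:].replace("-", "_")
        if name not in opts:
            raise SystemExit(f"invalid option {flag}")
        if not args:
            raise SystemExit(f"missing value for {flag}")
        opts[name] = args.pop(0)
    return opts, args  # args now holds the positional arguments


if __name__ == "__main__":
    # Hypothetical defaults; the real shell scripts declare these as variables.
    options, positional = parse_kaldi_options(sys.argv[1:], {"nj": "1", "stage": "0"})
    print(options, positional)
```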
- -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 2014 Tom Ko -# 2018 Emotech LTD (author: Pawel Swietojanski) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# wav.scp -# spk2utt -# utt2spk -# text -# -# It generates the files which are used for perturbing the speed of the original data. - -export LC_ALL=C -set -euo pipefail - -if [[ $# != 3 ]]; then - echo "Usage: perturb_data_dir_speed.sh " - echo "e.g.:" - echo " $0 0.9 data/train_si284 data/train_si284p" - exit 1 -fi - -factor=$1 -srcdir=$2 -destdir=$3 -label="sp" -spk_prefix="${label}${factor}-" -utt_prefix="${label}${factor}-" - -#check is sox on the path - -! command -v sox &>/dev/null && echo "sox: command not found" && exit 1; - -if [[ ! -f ${srcdir}/utt2spk ]]; then - echo "$0: no such file ${srcdir}/utt2spk" - exit 1; -fi - -if [[ ${destdir} == "${srcdir}" ]]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -mkdir -p "${destdir}" - -<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map" -<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map" -<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map" -if [[ ! -f ${srcdir}/utt2uniq ]]; then - <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq" -else - <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq" -fi - - -<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \ - utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk - -utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt - -if [[ -f ${srcdir}/segments ]]; then - - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \ - utils/apply_map.pl -f 2 "${destdir}"/reco_map | \ - awk -v factor="${factor}" \ - '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \ - >"${destdir}"/segments - - utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - if [[ -f ${srcdir}/reco2file_and_channel ]]; then - utils/apply_map.pl -f 1 "${destdir}"/reco_map \ - <"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel - fi - -else # no segments->wav indexed by utterance. 
- if [[ -f ${srcdir}/wav.scp ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - fi -fi - -if [[ -f ${srcdir}/text ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text -fi -if [[ -f ${srcdir}/spk2gender ]]; then - utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender -fi -if [[ -f ${srcdir}/utt2lang ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang -fi - -rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null -echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}" - -utils/validate_data_dir.sh --no-feats --no-text "${destdir}" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/reduce_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/reduce_data_dir.sh deleted file mode 100644 index 16194dcc7309a646041181a698c53cd4f46e618b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/reduce_data_dir.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# koried, 10/29/2012 - -# Reduce a data set based on a list of turn-ids - -help_message="usage: $0 srcdir turnlist destdir" - -if [ $1 == "--help" ]; then - echo "${help_message}" - exit 0; -fi - -if [ $# != 3 ]; then - echo "${help_message}" - exit 1; -fi - -srcdir=$1 -reclist=$2 -destdir=$3 - -if [ ! -f ${srcdir}/utt2spk ]; then -echo "$0: no such file $srcdir/utt2spk" -exit 1; -fi - -function do_filtering { -# assumes the utt2spk and spk2utt files already exist. - [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp - [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text - [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames - [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender - [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp - if [ -f ${srcdir}/segments ]; then - utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments - awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. 
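Most of the per-file work in the removed reduce_data_dir.sh boils down to filter_scp.pl: keep only the lines of a Kaldi-style scp file whose first column appears in a reference list such as utt2spk. A minimal Python sketch of that operation, assuming plain `key value` lines (the helper name and example paths are illustrative, not repo code):

```python
# Illustrative sketch, not repo code: the key-based filtering that the removed
# reduce_data_dir.sh delegates to filter_scp.pl.
def filter_scp(id_list_path, scp_path, out_path):
    """Keep only scp lines whose first column appears in the id-list file."""
    with open(id_list_path, encoding="utf-8") as f:
        keep = {line.split(maxsplit=1)[0] for line in f if line.strip()}
    with open(scp_path, encoding="utf-8") as fin, \
            open(out_path, "w", encoding="utf-8") as fout:
        for line in fin:
            if line.strip() and line.split(maxsplit=1)[0] in keep:
                fout.write(line)


# Example usage (paths are hypothetical):
# filter_scp("dest/utt2spk", "src/wav.scp", "dest/wav.scp")
```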
- [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/reco2file_and_channel ] && \ - utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel - - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm - rm ${destdir}/reco - fi - srcutts=$(wc -l < ${srcdir}/utt2spk) - destutts=$(wc -l < ${destdir}/utt2spk) - echo "Reduced #utt from $srcutts to $destutts" -} - -mkdir -p ${destdir} - -# filter the utt2spk based on the set of recordings -utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk - -utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt -do_filtering; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/remove_longshortdata.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/remove_longshortdata.py deleted file mode 100644 index 7e92f8a424d2d717acf6fc1db5503f79ba38a898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/remove_longshortdata.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='remove too long or too short data in format.data') - parser.add_argument('--data_file', - type=str, - help='input format data') - parser.add_argument('--output_data_file', - type=str, - help='output format data') - parser.add_argument('--min_input_len', type=float, - default=0, - help='minimum input seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--max_input_len', type=float, - default=20, - help='maximum output seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--min_output_len', type=float, - default=0, help='minimum input seq length, in modeling units') - parser.add_argument('--max_output_len', type=float, - default=500, - help='maximum output seq length, in modeling units') - parser.add_argument('--min_output_input_ratio', type=float, default=0.05, - help='minimum output seq length/output seq length ratio') - parser.add_argument('--max_output_input_ratio', type=float, default=10, - help='maximum output seq length/output seq length ratio') - args = parser.parse_args() - - data_file = args.data_file - output_data_file = args.output_data_file - min_input_len = args.min_input_len - max_input_len = args.max_input_len - min_output_len = args.min_output_len - max_output_len = args.max_output_len - min_output_input_ratio = args.min_output_input_ratio - max_output_input_ratio = args.max_output_input_ratio - - with open(data_file, 'r') as f, open(output_data_file, 'w') as fout: - for l in f: - l = l.strip() - if l: - items = l.strip().split('\t') - token_shape = items[6] - feature_shape = items[2] - feat_len = float(feature_shape.split(':')[1].split(',')[0]) - token_len = float(token_shape.split(':')[1].split(',')[0]) - condition = [feat_len > min_input_len, - feat_len < max_input_len, - token_len > min_output_len, - token_len < max_output_len, - token_len / feat_len > min_output_input_ratio, - token_len / feat_len < max_output_input_ratio, - ] - if all(condition): - fout.write('{}\n'.format(l)) - continue diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/segment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/segment.py deleted file mode 100644 index a1a7f93a05fbaf42ca09c26c0e5be6a7185f0d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/segment.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2021 Mobvoi Inc. (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='generate segmented wav.scp') - parser.add_argument('--segments', required=True, help='segments file') - parser.add_argument('--input', - required=True, - help='origin wav.scp that not segmented') - parser.add_argument('--output', - required=True, - help='output segmented wav.scp') - wav_dic = {} - args = parser.parse_args() - ori_wav = args.input - segment_file = args.segments - wav_scp = args.output - with open(ori_wav, 'r') as ori: - for l in ori: - item = l.strip().split() - wav_dic[item[0]] = item[1] - with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement: - for l in sgement: - item = l.strip().split() - if item[1] in wav_dic: - item[1] = wav_dic[item[1]] - f.write("{} {},{},{}\n".format(item[0], item[1], item[2], item[3])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/setup_anaconda.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/setup_anaconda.sh deleted file mode 100644 index f53ace9cc4c19994fc79d01e85d70f49d40d673f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/setup_anaconda.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env bash -# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet) -set -euo pipefail - -if [ -z "${PS1:-}" ]; then - PS1=__dummy__ -fi -CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - -if [ $# -gt 4 ]; then - echo "Usage: $0 [output] [conda-env-name] [python-version>]" - exit 1; -elif [ $# -eq 3 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="$3" -elif [ $# -eq 2 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="" -elif [ $# -eq 1 ]; then - output_dir="$1" - name="" - PYTHON_VERSION="" -elif [ $# -eq 0 ]; then - output_dir=venv - name="" - PYTHON_VERSION="" -fi - -if [ -e activate_python.sh ]; then - echo "Warning: activate_python.sh already exists. It will be overwritten" -fi - -if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then - if [ ! -e miniconda.sh ]; then - wget --tries=3 "${CONDA_URL}" -O miniconda.sh - fi - - bash miniconda.sh -b -p "${output_dir}" -fi - -# shellcheck disable=SC1090 -source "${output_dir}/etc/profile.d/conda.sh" -conda deactivate - -# If the env already exists, skip recreation -if [ -n "${name}" ] && ! 
conda activate ${name}; then - conda create -yn "${name}" -fi -conda activate ${name} - -if [ -n "${PYTHON_VERSION}" ]; then - conda install -y conda "python=${PYTHON_VERSION}" -else - conda install -y conda -fi - -conda install -y pip setuptools - -cat << EOF > activate_python.sh -#!/usr/bin/env bash -# THIS FILE IS GENERATED BY tools/setup_anaconda.sh -if [ -z "\${PS1:-}" ]; then - PS1=__dummy__ -fi -. $(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name} -EOF diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/sph2wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/sph2wav.sh deleted file mode 100644 index a8f0749e3be2ee69b5831da6699c303510ecbed4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/sph2wav.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -# convert sph scp to segmented wav scp -nj=1 -. tools/parse_options.sh || exit 1; - -inscp=$1 -segments=$2 -outscp=$3 -data=$(dirname ${inscp}) -if [ $# -eq 4 ]; then - logdir=$4 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe -[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; -sox=`which sox` -[ ! 
-x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2); - printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \ - sort > $data/wav_ori.scp || exit 1; - -tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp -sed -i 's/ /,/g' $data/wav_segments.scp -sed -i 's/#/ /g' $data/wav_segments.scp - -rm -f $logdir/wav_*.slice -rm -f $logdir/*.log -split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - mkdir -p ${data}/wavs/${name} - cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \ - -v logdir=$logdir -v name=$name '{ - during=$4-$3 - cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during; - system(cmd) - printf("%s %s/%s.wav\n", $1, data, $1); - }' | \ - sort > ${data}/wavs_${name}.scp || exit 1; -} & -done -wait -cat ${data}/wavs_*.scp > $outscp -rm ${data}/wavs_*.scp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/spk2utt_to_utt2spk.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/spk2utt_to_utt2spk.pl deleted file mode 100644 index 19fb89d501146e360912863d847d6eabb0194511..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/spm_decode b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/spm_decode deleted file mode 100644 index 882b4f966013d7708460f8d41696583ae59f8fa9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/spm_decode +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for decoding") - parser.add_argument("--input", default=None, help="input file to decode") - parser.add_argument("--input_format", choices=["piece", "id"], default="piece") - args = parser.parse_args() - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.input_format == "piece": - def decode(l): - return "".join(sp.DecodePieces(l)) - elif args.input_format == "id": - def decode(l): - return "".join(sp.DecodeIds(l)) - else: - raise NotImplementedError - - def tok2int(tok): - # remap reference-side (represented as <>) to 0 - return int(tok) if tok != "<>" else 0 - - if args.input is None: - h = sys.stdin - else: - h = open(args.input, "r", encoding="utf-8") - for line in h: - print(decode(line.split())) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/spm_encode b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/spm_encode deleted file mode 100644 index 4dd2e1004f9fe393c2d34b43bade881b84a31b1f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/spm_encode +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import contextlib -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for encoding") - parser.add_argument("--inputs", nargs="+", default=['-'], - help="input files to filter/encode") - parser.add_argument("--outputs", nargs="+", default=['-'], - help="path to save encoded outputs") - parser.add_argument("--output_format", choices=["piece", "id"], default="piece") - parser.add_argument("--min-len", type=int, metavar="N", - help="filter sentence pairs with fewer than N tokens") - parser.add_argument("--max-len", type=int, metavar="N", - help="filter sentence pairs with more than N tokens") - args = parser.parse_args() - - assert len(args.inputs) == len(args.outputs), \ - "number of input and output paths should match" - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.output_format == "piece": - def encode(l): - return sp.EncodeAsPieces(l) - elif args.output_format == "id": - def encode(l): - return list(map(str, sp.EncodeAsIds(l))) - else: - raise NotImplementedError - - if args.min_len is not None or args.max_len is not None: - def valid(line): - return ( - (args.min_len is None or len(line) >= args.min_len) and - (args.max_len is None or len(line) <= args.max_len) - ) - else: - def valid(lines): - return True - - with contextlib.ExitStack() as stack: - inputs = [ - stack.enter_context(open(input, "r", encoding="utf-8")) - if input != "-" else sys.stdin - for input in args.inputs - ] - outputs = [ - 
stack.enter_context(open(output, "w", encoding="utf-8")) - if output != "-" else sys.stdout - for output in args.outputs - ] - - stats = { - "num_empty": 0, - "num_filtered": 0, - } - - def encode_line(line): - line = line.strip() - if len(line) > 0: - line = encode(line) - if valid(line): - return line - else: - stats["num_filtered"] += 1 - else: - stats["num_empty"] += 1 - return None - - for i, lines in enumerate(zip(*inputs), start=1): - enc_lines = list(map(encode_line, lines)) - if not any(enc_line is None for enc_line in enc_lines): - for enc_line, output_h in zip(enc_lines, outputs): - print(" ".join(enc_line), file=output_h) - if i % 10000 == 0: - print("processed {} lines".format(i), file=sys.stderr) - - print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) - print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/spm_train b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/spm_train deleted file mode 100644 index 0b247aee0dc5fcaa7b6cf66d89602e896619c9bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/spm_train +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE -import sys - -import sentencepiece as spm - - -if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/subset_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/subset_data_dir.sh deleted file mode 100644 index c35bee62d8710facb8c42a9171ed3caf0171450f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/subset_data_dir.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2010-2011 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - - -# This script operates on a data directory, such as in data/train/. -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data -# for what these directories contain. - -# This script creates a subset of that data, consisting of some specified -# number of utterances. (The selected utterances are distributed evenly -# throughout the file, by the program ./subset_scp.pl). - -# There are six options, none compatible with any other. - -# If you give the --per-spk option, it will attempt to select the supplied -# number of utterances for each speaker (typically you would supply a much -# smaller number in this case). - -# If you give the --speakers option, it selects a subset of n randomly -# selected speakers. - -# If you give the --shortest option, it will give you the n shortest utterances. - -# If you give the --first option, it will just give you the n first utterances. - -# If you give the --last option, it will just give you the n last utterances. - -# If you give the --spk-list or --utt-list option, it reads the -# speakers/utterances to keep from /" (note, -# in this case there is no positional parameter; see usage message.) 
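The deleted `subset_data_dir.sh` header above documents its mutually exclusive selection modes (`--per-spk`, `--speakers`, `--shortest`, `--first`, `--last`, `--spk-list`, `--utt-list`). A minimal usage sketch of the tool being removed (the data-directory paths and counts here are hypothetical; the flags and argument order follow the comments and usage text above):

```bash
# Default mode: pick 1000 utterances, spread evenly through the utterance list
# (the actual selection is delegated to tools/subset_scp.pl).
tools/subset_data_dir.sh data/train 1000 data/train_1k

# Keep at most 10 utterances per speaker.
tools/subset_data_dir.sh --per-spk data/train 10 data/train_10perspk

# Keep only the utterances named in a list file (one utterance-id per line).
tools/subset_data_dir.sh --utt-list my_utts.txt data/train data/train_subset
```

In each case the destination directory receives filtered copies of `utt2spk`, `spk2utt`, `wav.scp`, `text`, and the other per-utterance and per-speaker files, which is what the filtering section of the script below performs.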
- - -shortest=false -perspk=false -speakers=false -first_opt= -spk_list= -utt_list= - -expect_args=3 -case $1 in - --first|--last) first_opt=$1; shift ;; - --per-spk) perspk=true; shift ;; - --shortest) shortest=true; shift ;; - --speakers) speakers=true; shift ;; - --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; - --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; - --*) echo "$0: invalid option '$1'"; exit 1 -esac - -if [ $# != $expect_args ]; then - echo "Usage:" - echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] " - echo " subset_data_dir.sh [--spk-list ] " - echo " subset_data_dir.sh [--utt-list ] " - echo "By default, randomly selects utterances from the data directory." - echo "With --speakers, randomly selects enough speakers that we have utterances" - echo "With --per-spk, selects utterances per speaker, if available." - echo "With --first, selects the first utterances" - echo "With --last, selects the last utterances" - echo "With --shortest, selects the shortest utterances." - echo "With --spk-list, reads the speakers to keep from " - echo "With --utt-list, reads the utterances to keep from " - exit 1; -fi - -srcdir=$1 -if [[ $spk_list || $utt_list ]]; then - numutt= - destdir=$2 -else - numutt=$2 - destdir=$3 -fi - -export LC_ALL=C - -if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" - exit 1 -fi - -if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then - echo "$0: cannot subset to more utterances than you originally had." - exit 1 -fi - -if $shortest && [ ! -f $srcdir/feats.scp ]; then - echo "$0: you selected --shortest but no feats.scp exist." - exit 1 -fi - -mkdir -p $destdir || exit 1 - -if [[ $spk_list ]]; then - tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; -elif [[ $utt_list ]]; then - tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; -elif $speakers; then - tools/shuffle_list.pl < $srcdir/spk2utt | - awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | - sort > $destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -elif $perspk; then - awk '{ n='$numutt'; printf("%s ",$1); - skip=1; while(n*(skip+1) <= NF-1) { skip++; } - for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); } - printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -else - if $shortest; then - # Select $numutt shortest utterances. - . ./path.sh - feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; - sort -n -k2 $destdir/tmp.len | - awk '{print $1}' | - head -$numutt >$destdir/tmp.uttlist - tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk - rm $destdir/tmp.uttlist $destdir/tmp.len - else - # Select $numutt random utterances. - tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; - fi - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt -fi - -# Perform filtering. utt2spk and spk2utt files already exist by this point. -# Filter by utterance. 
-[ -f $srcdir/feats.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp -[ -f $srcdir/vad.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp -[ -f $srcdir/utt2lang ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang -[ -f $srcdir/utt2dur ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur -[ -f $srcdir/utt2num_frames ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames -[ -f $srcdir/utt2uniq ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq -[ -f $srcdir/wav.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp -[ -f $srcdir/utt2warp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp -[ -f $srcdir/text ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - -# Filter by speaker. -[ -f $srcdir/spk2warp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp -[ -f $srcdir/spk2gender ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender -[ -f $srcdir/cmvn.scp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - -# Filter by recording-id. -if [ -f $srcdir/segments ]; then - tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - # Recording-ids are in segments. - awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco - # The next line overrides the command above for wav.scp, which would be incorrect. - #[ -f $srcdir/wav.scp ] && - # tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp -else - # No segments; recording-ids are in wav.scp. - awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco -fi - -[ -f $srcdir/reco2file_and_channel ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel -[ -f $srcdir/reco2dur ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur - -# Filter the STM file for proper sclite scoring. -# Copy over the comments from STM file. -[ -f $srcdir/stm ] && - (grep "^;;" $srcdir/stm - tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm - -rm $destdir/reco - -# Copy frame_shift if present. -[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir - -srcutts=$(wc -l <$srcdir/utt2spk) -destutts=$(wc -l <$destdir/utt2spk) -echo "$0: reducing #utt from $srcutts to $destutts" -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/subset_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/subset_scp.pl deleted file mode 100644 index 11fddc09a0f4e5fad8e5d63cf65e7e5e627e4af6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/subset_scp.pl +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This program selects a subset of N elements in the scp. - -# By default, it selects them evenly from throughout the scp, in order to avoid -# selecting too many from the same speaker. It prints them on the standard -# output. -# With the option --first, it just selects the N first utterances. -# With the option --last, it just selects the N last utterances. - -# Last modified by JHU & HKUST @2013 - - -$quiet = 0; -$first = 0; -$last = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--quiet") { - shift; - $quiet = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--first") { - shift; - $first = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--last") { - shift; - $last = 1; -} - -if(@ARGV < 2 ) { - die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . - " --quiet causes it to not die if N < num lines in scp.\n" . - " --first and --last make it equivalent to head or tail.\n" . - "See also: filter_scp.pl\n"; -} - -$N = shift @ARGV; -if($N == 0) { - die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; -} -$inscp = shift @ARGV; -open(I, "<$inscp") || die "Opening input scp file $inscp"; - -@F = (); -while() { - push @F, $_; -} -$numlines = @F; -if($N > $numlines) { - if ($quiet) { - $N = $numlines; - } else { - die "You requested from subset_scp.pl more elements than available: $N > $numlines"; - } -} - -sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if ($num_needed > $diff) { - die "select_n: code error"; - } - if ($diff == 1 ) { - if ($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); - } -} - -if ( ! $first && ! $last) { - if ($N > 0) { - select_n(0, $numlines, $N); - } -} else { - if ($first) { # --first option: same as head. - for ($n = 0; $n < $N; $n++) { - print $F[$n]; - } - } else { # --last option: same as tail. - for ($n = @F - $N; $n < @F; $n++) { - print $F[$n]; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/sym2int.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/sym2int.pl deleted file mode 100644 index cec097b6bdaefb5c3452e31fa334f0a7530b9a72..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/sym2int.pl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; - -for($x = 0; $x < 2; $x++) { - if ($ARGV[0] eq "--map-oov") { - shift @ARGV; - $map_oov = shift @ARGV; - if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { - # disallow '-f', the empty string and anything ending in words.txt as the - # OOV symbol because these are likely command-line errors. - die "the --map-oov option requires an argument"; - } - } - if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } - } -} - -$symtab = shift @ARGV; -if (!defined $symtab) { - print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . - "options: [--map-oov ] [-f ]\n" . - "note: can look like 4-5, or 4-, or 5-, or 1.\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up - if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } - $map_oov = $sym2int{$map_oov}; -} - -$num_warning = 0; -$max_warning = 20; - -while (<>) { - @A = split(" ", $_); - @B = (); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ( (!defined $field_begin || $n >= $field_begin) - && (!defined $field_end || $n <= $field_end)) { - $i = $sym2int{$a}; - if (!defined ($i)) { - if (defined $map_oov) { - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $map_oov; - } else { - $pos = $n+1; - die "sym2int.pl: undefined symbol $a (in position $pos)\n"; - } - } - $a = $i; - } - push @B, $a; - } - print join(" ", @B); - print "\n"; -} -if ($num_warning > 0) { - print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; -} - -exit(0); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/text2token.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/text2token.py deleted file mode 100644 index 4f4dcc901d436650695f0b80e0cf99e1e99269ee..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/text2token.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan) -# Copyright 2021 Mobvoi Inc. All Rights Reserved. 
(Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -import re -import sys - -is_python2 = sys.version_info[0] == 2 - - -def exist_or_not(i, match_pos): - start_pos = None - end_pos = None - for pos in match_pos: - if pos[0] <= i < pos[1]: - start_pos = pos[0] - end_pos = pos[1] - break - - return start_pos, end_pos - -def seg_char(sent): - pattern = re.compile(r'([\u4e00-\u9fa5])') - chars = pattern.split(sent) - chars = [w for w in chars if len(w.strip()) > 0] - return chars - -def get_parser(): - parser = argparse.ArgumentParser( - description='convert raw text to tokenized text', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--nchar', - '-n', - default=1, - type=int, - help='number of characters to split, i.e., \ - aabb -> a a b b with -n 1 and aa bb with -n 2') - parser.add_argument('--skip-ncols', - '-s', - default=0, - type=int, - help='skip first n columns') - parser.add_argument('--space', - default='', - type=str, - help='space symbol') - parser.add_argument('--bpe-model', - '-m', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--non-lang-syms', - '-l', - default=None, - type=str, - help='list of non-linguistic symobles,' - ' e.g., etc.') - parser.add_argument('text', - type=str, - default=False, - nargs='?', - help='input text') - parser.add_argument('--trans_type', - '-t', - type=str, - default="char", - choices=["char", "phn", "cn_char_en_bpe"], - help="""Transcript type. char/phn. e.g., for TIMIT - FADG0_SI1279 - - If trans_type is char, read from - SI1279.WRD file -> "bricks are an alternative" - Else if trans_type is phn, - read from SI1279.PHN file -> - "sil b r ih sil k s aa r er n aa l - sil t er n ih sil t ih v sil" """) - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - rs = [] - if args.non_lang_syms is not None: - with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f: - nls = [x.rstrip() for x in f.readlines()] - rs = [re.compile(re.escape(x)) for x in nls] - - if args.bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(args.bpe_model) - - if args.text: - f = codecs.open(args.text, encoding="utf-8") - else: - f = codecs.getreader("utf-8")( - sys.stdin if is_python2 else sys.stdin.buffer) - - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer) - line = f.readline() - n = args.nchar - while line: - x = line.split() - print(' '.join(x[:args.skip_ncols]), end=" ") - a = ' '.join(x[args.skip_ncols:]) - - # get all matched positions - match_pos = [] - for r in rs: - i = 0 - while i >= 0: - m = r.search(a, i) - if m: - match_pos.append([m.start(), m.end()]) - i = m.end() - else: - break - - if len(match_pos) > 0: - chars = [] - i = 0 - while i < len(a): - start_pos, end_pos = exist_or_not(i, match_pos) - if start_pos is not None: - chars.append(a[start_pos:end_pos]) - i = end_pos - else: - chars.append(a[i]) - i += 1 - a = chars - - if (args.trans_type == "phn"): - a = a.split(" ") - elif args.trans_type == "cn_char_en_bpe": - b = seg_char(a) - a = [] - for j in b: - # we use "▁" to instead of blanks among english words - # warning: here is "▁", not "_" - for l in j.strip().split("▁"): - if not l.encode('UTF-8').isalpha(): - a.append(l) - else: - for k in sp.encode_as_pieces(l): - a.append(k) - else: - a = [a[j:j + n] for j in range(0, 
len(a), n)] - - a_flat = [] - for z in a: - a_flat.append("".join(z)) - - a_chars = [z.replace(' ', args.space) for z in a_flat] - if (args.trans_type == "phn"): - a_chars = [z.replace("sil", args.space) for z in a_chars] - print(' '.join(a_chars)) - line = f.readline() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/utt2spk_to_spk2utt.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/utt2spk_to_spk2utt.pl deleted file mode 100644 index 5086699ff85fdcb8667bb9ab054700c53e35fd0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. - -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/validate_data_dir.sh deleted file mode 100644 index f4b4cbe1410111555d56380078e3d55381e7155a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/validate_data_dir.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/bin/bash - -cmd="$@" - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." - echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! 
-d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -if [ -f $data/images.scp ]; then - cmd=${cmd/--no-wav/} # remove --no-wav if supplied - image/validate_data_dir.sh $cmd - exit $? -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1; - ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - tools/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." 
- exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! 
cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! 
cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/validate_dict_dir.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/validate_dict_dir.pl deleted file mode 100644 index 819fca7f03caff91f3f24f0b69876a0bfc0abbe9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/validate_dict_dir.pl +++ /dev/null @@ -1,531 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. -# Copyright 2012 Guoguo Chen -# 2015 Daniel Povey -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for 'dict' directories (e.g. data/local/dict) - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line($.) 
$raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n"; - if ($has_invalid_whitespaces) { - print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n"; - return 0; - } else { - print "--> text contains only allowed whitespaces\n"; - } - } else { - print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n"; - } - return 1; -} - - -if(@ARGV != 1) { - die "Usage: validate_dict_dir.pl \n" . - "e.g.: validate_dict_dir.pl data/local/dict\n"; -} - -$dict = shift @ARGV; -$dict =~ s:/$::; - -$exit = 0; -$success = 1; # this is re-set each time we read a file. - -sub set_to_fail { $exit = 1; $success = 0; } - -# Checking silence_phones.txt ------------------------------- -print "Checking $dict/silence_phones.txt ...\n"; -if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} -$idx = 1; -%silence = (); -$crlf = 1; - -print "--> reading $dict/silence_phones.txt\n"; -check_allowed_whitespace(\*S) || set_to_fail(); -while() { - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; - } - foreach(0 .. 
@col-1) { - my $p = $col[$_]; - if($silence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; - } else { - $silence{$p} = 1; - } - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. - if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(S); -$success == 0 || print "--> $dict/silence_phones.txt is OK\n"; -print "\n"; - -# Checking optional_silence.txt ------------------------------- -print "Checking $dict/optional_silence.txt ...\n"; -if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;} -if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;} -$idx = 1; -$success = 1; -$crlf = 1; -print "--> reading $dict/optional_silence.txt\n"; -check_allowed_whitespace(\*OS) or exit 1; -while() { - chomp; - my @col = split(" ", $_); - if ($idx > 1 or @col > 1) { - set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; - } elsif (!$silence{$col[0]}) { - set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - $idx ++; -} -close(OS); -$success == 0 || print "--> $dict/optional_silence.txt is OK\n"; -print "\n"; - -# Checking nonsilence_phones.txt ------------------------------- -print "Checking $dict/nonsilence_phones.txt ...\n"; -if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;} -$idx = 1; -%nonsilence = (); -$success = 1; -$crlf = 1; -print "--> reading $dict/nonsilence_phones.txt\n"; -check_allowed_whitespace(\*NS) or set_to_fail(); -while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($nonsilence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; - } else { - $nonsilence{$p} = 1; - } - # phones that start with the pound sign/hash may be mistaken for - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. 
- if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(NS); -$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n"; -print "\n"; - -# Checking disjoint ------------------------------- -sub intersect { - my ($a, $b) = @_; - @itset = (); - %itset = (); - foreach(keys %$a) { - if(exists $b->{$_} and !$itset{$_}) { - push(@itset, $_); - $itset{$_} = 1; - } - } - return @itset; -} - -print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n"; -@itset = intersect(\%silence, \%nonsilence); -if(@itset == 0) {print "--> disjoint property is OK.\n";} -else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";} -print "\n"; - - -sub check_lexicon { - my ($lex, $num_prob_cols, $num_skipped_cols) = @_; - print "Checking $lex\n"; - !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail(); - my %seen_line = {}; - $idx = 1; $success = 1; $crlf = 1; - print "--> reading $lex\n"; - check_allowed_whitespace(\*L) or set_to_fail(); - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $lex contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (defined $seen_line{$_}) { - print "--> ERROR: line '$_' of $lex is repeated\n"; - set_to_fail(); - } - $seen_line{$_} = 1; - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $lex does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - $word = shift @col; - if (!defined $word) { - print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail(); - } - if ($word eq "" || $word eq "" || $word eq "" || $word eq "#0") { - print "--> ERROR: lexicon.txt contains forbidden word $word\n"; - set_to_fail(); - } - for ($n = 0; $n < $num_prob_cols; $n++) { - $prob = shift @col; - if (!($prob > 0.0 && $prob <= 1.0)) { - print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n"; - set_to_fail(); - } - } - for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; } - if (@col == 0) { - print "--> ERROR: lexicon.txt contains word $word with empty "; - print "pronunciation.\n"; - set_to_fail(); - } - foreach (0 .. @col-1) { - if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt "; - print "(line $idx)\n"; - set_to_fail(); - } - } - $idx ++; - } - close(L); - $success == 0 || print "--> $lex is OK\n"; - print "\n"; -} - -if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); } -if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); } -if (-f "$dict/lexiconp_silprob.txt") { - # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also - # exist. 
- check_lexicon("$dict/lexiconp_silprob.txt", 2, 2); - if (-f "$dict/silprob.txt") { - !open(SP, "<$dict/silprob.txt") && - print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail(); - $crlf = 1; - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - chomp; my @col = split; - @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail(); - if ($col[0] eq "" || $col[0] eq "overall") { - if (!($col[1] > 0.0 && $col[1] <= 1.0)) { - set_to_fail(); - print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n"; - } - } elsif ($col[0] eq "_s" || $col[0] eq "_n") { - if ($col[1] <= 0.0) { - set_to_fail(); - print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n"; - } - } else { - print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n"; - set_to_fail(); - } - } - close(SP); - } else { - set_to_fail(); - print "--> ERROR: expecting $dict/silprob.txt to exist\n"; - } -} - -if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) { - print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n"; - set_to_fail(); -} - -sub check_lexicon_pair { - my ($lex1, $num_prob_cols1, $num_skipped_cols1, - $lex2, $num_prob_cols2, $num_skipped_cols2) = @_; - # We have checked individual lexicons already. - open(L1, "<$lex1"); open(L2, "<$lex2"); - print "Checking lexicon pair $lex1 and $lex2\n"; - my $line_num = 0; - while() { - $line_num++; - @A = split; - $line_B = ; - if (!defined $line_B) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); last; - } - @B = split(" ", $line_B); - # Check if the word matches. - if ($A[0] ne $B[0]) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - shift @A; shift @B; - for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; } - for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; } - # Check if the pronunciation matches - if (join(" ", @A) ne join(" ", @B)) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - } - $line_B = ; - if (defined $line_B && $exit == 0) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); - } - $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n"; -} - -# If more than one lexicon exist, we have to check if they correspond to each -# other. It could be that the user overwrote one and we need to regenerate the -# other, but we do not know which is which. -if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") { - check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0); -} -if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") { - check_lexicon_pair("$dict/lexiconp.txt", 1, 0, - "$dict/lexiconp_silprob.txt", 2, 2); -} - -# Checking extra_questions.txt ------------------------------- -%distinguished = (); # Keep track of all phone-pairs including nonsilence that - # are distinguished (split apart) by extra_questions.txt, - # as $distinguished{$p1,$p2} = 1. This will be used to - # make sure that we don't have pairs of phones on the same - # line in nonsilence_phones.txt that can never be - # distinguished from each other by questions. 
(If any two - # phones appear on the same line in nonsilence_phones.txt, - # they share a tree root, and since the automatic - # question-building treats all phones that appear on the - # same line of nonsilence_phones.txt as being in the same - # group, we can never distinguish them without resorting to - # questions in extra_questions.txt. -print "Checking $dict/extra_questions.txt ...\n"; -if (-s "$dict/extra_questions.txt") { - if (!open(EX, "<$dict/extra_questions.txt")) { - set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n"; - } - $idx = 1; - $success = 1; - $crlf = 1; - print "--> reading $dict/extra_questions.txt\n"; - check_allowed_whitespace(\*EX) or set_to_fail(); - while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n"; - } - foreach (0 .. @col-1) { - if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n"; - } - $idx ++; - } - %col_hash = (); - foreach $p (@col) { $col_hash{$p} = 1; } - foreach $p1 (@col) { - # Update %distinguished hash. - foreach $p2 (keys %nonsilence) { - if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not - # in this question (and in nonsilence - # phones)... mark p1,p2 as being split apart - $distinguished{$p1,$p2} = 1; - $distinguished{$p2,$p1} = 1; - } - } - } - } - close(EX); - $success == 0 || print "--> $dict/extra_questions.txt is OK\n"; -} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";} - -if (-f "$dict/nonterminals.txt") { - open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt"; - my %nonterminals = (); - my $line_number = 1; - while () { - chop; - my @line = split(" ", $_); - if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) { - print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1; - } - $nonterminals{$line[0]} = 1; - $line_number++; - } - print "--> $dict/nonterminals.txt is OK\n"; -} - - -# check nonsilence_phones.txt again for phone-pairs that are never -# distnguishable. (note: this situation is normal and expected for silence -# phones, so we don't check it.) -if(!open(NS, "<$dict/nonsilence_phones.txt")) { - print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1; -} - -$num_warn_nosplit = 0; -$num_warn_nosplit_limit = 10; -while() { - my @col = split(" ", $_); - foreach $p1 (@col) { - foreach $p2 (@col) { - if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) { - set_to_fail(); - if ($num_warn_nosplit <= $num_warn_nosplit_limit) { - print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n"; - } - if ($num_warn_nosplit == $num_warn_nosplit_limit) { - print "... Not warning any more times about this issue.\n"; - } - if ($num_warn_nosplit == 0) { - print " (note: we started checking for this only recently. 
You can still build a system but\n"; - print " phones $p1 and $p2 will be acoustically indistinguishable).\n"; - } - $num_warn_nosplit++; - } - } - } -} - - -if ($exit == 1) { - print "--> ERROR validating dictionary directory $dict (see detailed error "; - print "messages above)\n\n"; - exit 1; -} else { - print "--> SUCCESS [validating dictionary directory $dict]\n\n"; -} - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/validate_text.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/validate_text.pl deleted file mode 100644 index 7f75cf12f20f6e22948682e8e726e628a72dac69..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/validate_text.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env perl -# -#=============================================================================== -# Copyright 2017 Johns Hopkins University (author: Yenda Trmal ) -# Johns Hopkins University (author: Daniel Povey) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# validation script for data//text -# to be called (preferably) from utils/validate_data_dir.sh -use strict; -use warnings; -use utf8; -use Fcntl qw< SEEK_SET >; - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). 
-sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl \n" . 
- "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/wav2dur.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/wav2dur.py deleted file mode 100644 index 1bcc1b693458b66c0e341e5d6b375cc81e6db8b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/wav2dur.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -import torchaudio -torchaudio.set_audio_backend("sox_io") - -scp = sys.argv[1] -dur_scp = sys.argv[2] - -with open(scp, 'r') as f, open(dur_scp, 'w') as fout: - cnt = 0 - total_duration = 0 - for l in f: - items = l.strip().split() - wav_id = items[0] - fname = items[1] - cnt += 1 - waveform, rate = torchaudio.load(fname) - frames = len(waveform[0]) - duration = frames / float(rate) - total_duration += duration - fout.write('{} {}\n'.format(wav_id, duration)) - print('process {} utts'.format(cnt)) - print('total {} s'.format(total_duration)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/wav_to_duration.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/wav_to_duration.sh deleted file mode 100644 index 51b055c633ac809b6b8d702925dc47875973403d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/wav_to_duration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# split the wav scp, calculate duration and merge -nj=4 -. tools/parse_options.sh || exit 1; - -inscp=$1 -outscp=$2 -data=$(dirname ${inscp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -rm -f $logdir/wav_*.slice -rm -f $logdir/wav_*.shape -split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log -} & -done -wait -cat $logdir/wav_*.shape > $outscp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/websocket/performance-ws.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/websocket/performance-ws.py deleted file mode 100644 index af77dea06bb41297b674b5b6dbfd0266bcff5d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/tools/websocket/performance-ws.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -# coding:utf-8 - -# Copyright (c) 2022 SDCI Co. Ltd (author: veelion) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import json -import time -import asyncio -import argparse -import websockets -import soundfile as sf -import statistics - - -WS_START = json.dumps({ - 'signal': 'start', - 'nbest': 1, - 'continuous_decoding': False, -}) -WS_END = json.dumps({ - 'signal': 'end' -}) - - -async def ws_rec(data, ws_uri): - begin = time.time() - conn = await websockets.connect(ws_uri, ping_timeout=200) - # step 1: send start - await conn.send(WS_START) - ret = await conn.recv() - # step 2: send audio data - await conn.send(data) - # step 3: send end - await conn.send(WS_END) - # step 4: receive result - texts = [] - while 1: - ret = await conn.recv() - ret = json.loads(ret) - if ret['type'] == 'final_result': - nbest = json.loads(ret['nbest']) - text = nbest[0]['sentence'] - texts.append(text) - elif ret['type'] == 'speech_end': - break - # step 5: close - try: - await conn.close() - except Exception as e: - # this except has no effect, just log as debug - # it seems the server does not send close info, maybe - print(e) - time_cost = time.time() - begin - return { - 'text': ''.join(texts), - 'time': time_cost, - } - - -def get_args(): - parser = argparse.ArgumentParser(description='') - parser.add_argument( - '-u', '--ws_uri', required=True, - help="websocket_server_main's uri, e.g. ws://127.0.0.1:10086") - parser.add_argument( - '-w', '--wav_scp', required=True, - help='path to wav_scp_file') - parser.add_argument( - '-t', '--trans', required=True, - help='path to trans_text_file of wavs') - parser.add_argument( - '-s', '--save_to', required=True, - help='path to save transcription') - parser.add_argument( - '-n', '--num_concurrence', type=int, required=True, - help='num of concurrence for query') - args = parser.parse_args() - return args - - -def print_result(info): - length = max([len(k) for k in info]) - for k, v in info.items(): - print(f'\t{k: >{length}} : {v}') - - -async def main(args): - wav_scp = [] - total_duration = 0 - with open(args.wav_scp) as f: - for line in f: - zz = line.strip().split() - assert len(zz) == 2 - data, sr = sf.read(zz[1], dtype='int16') - assert sr == 16000 - duration = (len(data)) / 16000 - total_duration += duration - wav_scp.append((zz[0], data.tobytes())) - print(f'{len(wav_scp) = }, {total_duration = }') - - tasks = [] - failed = 0 - texts = [] - request_times = [] - begin = time.time() - for i, (_uttid, data) in enumerate(wav_scp): - task = asyncio.create_task(ws_rec(data, args.ws_uri)) - tasks.append((_uttid, task)) - if len(tasks) < args.num_concurrence: - continue - print((f'{i=}, start {args.num_concurrence} ' - f'queries @ {time.strftime("%m-%d %H:%M:%S")}')) - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - tasks = [] - print(f'\tdone @ {time.strftime("%m-%d %H:%M:%S")}') - if tasks: - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - request_time = time.time() - begin - rtf = request_time / total_duration - print('For all concurrence:') - print_result({ - 'failed': failed, - 'total_duration': total_duration, - 'request_time': request_time, - 'RTF': rtf, - }) - print('For one request:') - print_result({ - 'mean': statistics.mean(request_times), - 'median': statistics.median(request_times), - 'max_time': max(request_times), - 'min_time': min(request_times), - }) - with 
open(args.save_to, 'w', encoding='utf8') as fsave: - fsave.write(''.join(texts)) - # caculate CER - cmd = (f'python ../compute-wer.py --char=1 --v=1 ' - f'{args.trans} {args.save_to} > ' - f'{args.save_to}-test-{args.num_concurrence}.cer.txt') - print(cmd) - os.system(cmd) - print('done') - - -if __name__ == '__main__': - args = get_args() - asyncio.run(main(args)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/alignment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/alignment.py deleted file mode 100644 index 071691183e5af227e60fe06e4f8d4bf0f33b7f71..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/alignment.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Di Wu) -# 2022 Tinnove Inc (authors: Wei Ren) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader -from textgrid import TextGrid, IntervalTier - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.ctc_util import forced_align -from wenet.utils.common import get_subsample -from wenet.utils.init_model import init_model - - -def generator_textgrid(maxtime, lines, output): - # Download Praat: https://www.fon.hum.uva.nl/praat/ - interval = maxtime / (len(lines) + 1) - margin = 0.0001 - - tg = TextGrid(maxTime=maxtime) - linetier = IntervalTier(name="line", maxTime=maxtime) - - i = 0 - for l in lines: - s, e, w = l.split() - linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w) - - tg.append(linetier) - print("successfully generator {}".format(output)) - tg.write(output) - - -def get_frames_timestamp(alignment): - # convert alignment to a praat format, which is a doing phonetics - # by computer and helps analyzing alignment - timestamp = [] - # get frames level duration for each token - start = 0 - end = 0 - while end < len(alignment): - while end < len(alignment) and alignment[end] == 0: - end += 1 - if end == len(alignment): - timestamp[-1] += alignment[start:] - break - end += 1 - while end < len(alignment) and alignment[end - 1] == alignment[end]: - end += 1 - timestamp.append(alignment[start:end]) - start = end - return timestamp - - -def get_labformat(timestamp, subsample): - begin = 0 - duration = 0 - labformat = [] - for idx, t in enumerate(timestamp): - # 25ms frame_length,10ms hop_length, 1/subsample - subsample = get_subsample(configs) - # time duration - duration = len(t) * 0.01 * subsample - if idx < len(timestamp) - 1: - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[t[-1]])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[t[-1]])) - 
else: - non_blank = 0 - for i in t: - if i != 0: - token = i - break - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[token])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[token])) - begin = begin + duration - return labformat - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='use ctc to generate alignment') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--input_file', required=True, help='format data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--non_lang_syms', - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--result_file', - required=True, - help='alignment result file') - parser.add_argument('--batch_size', type=int, default=1, help='batch size') - parser.add_argument('--gen_praat', - action='store_true', - help='convert alignment to a praat format') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - - args = parser.parse_args() - print(args) - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.batch_size > 1: - logging.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - # Load dict - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 - - symbol_table = read_symbol_table(args.dict) - - # Init dataset and data loader - ali_conf = copy.deepcopy(configs['dataset_conf']) - - ali_conf['filter_conf']['max_length'] = 102400 - ali_conf['filter_conf']['min_length'] = 0 - ali_conf['filter_conf']['token_max_length'] = 102400 - ali_conf['filter_conf']['token_min_length'] = 0 - ali_conf['filter_conf']['max_output_input_ratio'] = 102400 - ali_conf['filter_conf']['min_output_input_ratio'] = 0 - ali_conf['speed_perturb'] = False - ali_conf['spec_aug'] = False - ali_conf['shuffle'] = False - ali_conf['sort'] = False - ali_conf['fbank_conf']['dither'] = 0.0 - ali_conf['batch_conf']['batch_type'] = "static" - ali_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - ali_dataset = Dataset(args.data_type, - args.input_file, - symbol_table, - ali_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w', - encoding='utf-8') as fout: - for batch_idx, batch in enumerate(ali_data_loader): - print("#" * 80) - key, feat, target, feats_length, target_length = batch - print(key) - - feat = feat.to(device) - target = target.to(device) - feats_length = 
feats_length.to(device) - target_length = target_length.to(device) - # Let's assume B = batch_size and N = beam_size - # 1. Encoder - encoder_out, encoder_mask = model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - # print(ctc_probs.size(1)) - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = forced_align(ctc_probs, target) - print(alignment) - fout.write('{} {}\n'.format(key[0], alignment)) - - if args.gen_praat: - timestamp = get_frames_timestamp(alignment) - print(timestamp) - subsample = get_subsample(configs) - labformat = get_labformat(timestamp, subsample) - - lab_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".lab") - with open(lab_path, 'w', encoding='utf-8') as f: - f.writelines(labformat) - - textgrid_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".TextGrid") - generator_textgrid(maxtime=(len(alignment) + 1) * 0.01 * - subsample, - lines=labformat, - output=textgrid_path) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/average_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/average_model.py deleted file mode 100644 index 01efa64b4b458bc931a86a9a304b9f330ce4aaa2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/average_model.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import os -import argparse -import glob - -import yaml -import numpy as np -import torch - - -def get_args(): - parser = argparse.ArgumentParser(description='average model') - parser.add_argument('--dst_model', required=True, help='averaged model') - parser.add_argument('--src_path', - required=True, - help='src model path for average') - parser.add_argument('--val_best', - action="store_true", - help='averaged model') - parser.add_argument('--num', - default=5, - type=int, - help='nums for averaged model') - parser.add_argument('--min_epoch', - default=0, - type=int, - help='min epoch used for averaging model') - parser.add_argument('--max_epoch', - default=65536, - type=int, - help='max epoch used for averaging model') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - checkpoints = [] - val_scores = [] - if args.val_best: - yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) - for y in yamls: - with open(y, 'r') as f: - dic_yaml = yaml.load(f, Loader=yaml.FullLoader) - loss = dic_yaml['cv_loss'] - epoch = dic_yaml['epoch'] - if epoch >= args.min_epoch and epoch <= args.max_epoch: - val_scores += [[epoch, loss]] - val_scores = np.array(val_scores) - sort_idx = np.argsort(val_scores[:, -1]) - sorted_val_scores = val_scores[sort_idx][::1] - print("best val scores = " + str(sorted_val_scores[:args.num, 1])) - print("selected epochs = " + - str(sorted_val_scores[:args.num, 0].astype(np.int64))) - path_list = [ - args.src_path + '/{}.pt'.format(int(epoch)) - for epoch in sorted_val_scores[:args.num, 0] - ] - else: - path_list = glob.glob('{}/[0-9]*.pt'.format(args.src_path)) - path_list = sorted(path_list, key=os.path.getmtime) - path_list = path_list[-args.num:] - print(path_list) - avg = None - num = args.num - assert num == len(path_list) - for path in path_list: - print('Processing {}'.format(path)) - states = torch.load(path, map_location=torch.device('cpu')) - if avg is None: - avg = states - else: - for k in avg.keys(): - avg[k] += states[k] - # average - for k in avg.keys(): - if avg[k] is not None: - # pytorch 1.6 use true_divide instead of /= - avg[k] = torch.true_divide(avg[k], num) - print('Saving to {}'.format(args.dst_model)) - torch.save(avg, args.dst_model) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/export_jit.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/export_jit.py deleted file mode 100644 index b2e5864e8382235c1cc800484ba5031ae22f3bd9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/export_jit.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os - -import torch -import yaml - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_file', default=None, help='output file') - parser.add_argument('--output_quant_file', - default=None, - help='output quantized model file') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - # No need gpu for model export - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - model = init_model(configs) - print(model) - - load_checkpoint(model, args.checkpoint) - # Export jit torch script model - - if args.output_file: - script_model = torch.jit.script(model) - script_model.save(args.output_file) - print('Export model successfully, see {}'.format(args.output_file)) - - # Export quantized jit torch script model - if args.output_quant_file: - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - script_quant_model = torch.jit.script(quantized_model) - script_quant_model.save(args.output_quant_file) - print('Export quantized model successfully, ' - 'see {}'.format(args.output_quant_file)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/export_onnx_bpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/export_onnx_bpu.py deleted file mode 100644 index 6462a69506f10778d08faae5fcf3067ad43d38bd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/export_onnx_bpu.py +++ /dev/null @@ -1,1019 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. 
specific decoding method: ctc_greedy_search -""" - - -from __future__ import print_function - -import os -import sys -import copy -import math -import yaml -import logging -from typing import Tuple - -import torch -import numpy as np - -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import (get_args, to_numpy, - print_input_output_info) - - -try: - import onnx - import onnxruntime -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class BPULayerNorm(torch.nn.Module): - """Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" - def __init__(self, module, chunk_size=8, run_on_bpu=False): - super().__init__() - original = copy.deepcopy(module) - self.hidden = module.weight.size(0) - self.chunk_size = chunk_size - self.run_on_bpu = run_on_bpu - - if self.run_on_bpu: - self.weight = torch.nn.Parameter( - module.weight.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.bias = torch.nn.Parameter( - module.bias.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.negtive = torch.nn.Parameter( - torch.ones((1, self.hidden, 1, chunk_size)) * -1.0) - self.eps = torch.nn.Parameter( - torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps) - self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_1.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_2.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - else: - self.norm = module - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.hidden) - orig_out = module(random_data) - new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) - np.testing.assert_allclose( - to_numpy(orig_out), to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.run_on_bpu: - u = self.mean_conv_1(x) # (1, h, 1, c) - numerator = x + u * self.negtive # (1, h, 1, c) - s = torch.pow(numerator, 2) # (1, h, 1, c) - s = self.mean_conv_2(s) # (1, h, 1, c) - denominator = torch.sqrt(s + self.eps) # (1, h, 1, c) - x = torch.div(numerator, denominator) # (1, h, 1, c) - x = x * self.weight + self.bias - else: - x = x.squeeze(2).transpose(1, 2).contiguous() - x = self.norm(x) - x = x.transpose(1, 2).contiguous().unsqueeze(2) - return x - - -class BPUIdentity(torch.nn.Module): - """Refactor torch.nn.Identity(). - For inserting BPU node whose input == output. - """ - def __init__(self, channels): - super().__init__() - self.channels = channels - self.identity_conv = torch.nn.Conv2d( - channels, channels, 1, groups=channels, bias=False) - torch.nn.init.dirac_( - self.identity_conv.weight.data, groups=channels) - - self.check_equal() - - def check_equal(self): - random_data = torch.randn(1, self.channels, 1, 10) - result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(random_data), to_numpy(result), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Identity with 4-D dataflow, input == output. 
- Args: - x (torch.Tensor): (batch, in_channel, 1, time) - - Returns: - (torch.Tensor): (batch, in_channel, 1, time). - """ - return self.identity_conv(x) - - -class BPULinear(torch.nn.Module): - """Refactor torch.nn.Linear or pointwise_conv""" - def __init__(self, module, is_pointwise_conv=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.weight.size(1) - self.odim = module.weight.size(0) - self.is_pointwise_conv = is_pointwise_conv - - # Modify weight & bias - self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) - if is_pointwise_conv: - # (odim, idim, kernel=1) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(-1)) - else: - # (odim, idim) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(2).unsqueeze(3)) - self.linear.bias = module.bias - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.idim) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = module(random_data) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = original_result.transpose(1, 2) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Linear with 4-D dataflow. - Args: - x (torch.Tensor): (batch, in_channel, 1, time) - Returns: - (torch.Tensor): (batch, out_channel, 1, time). - """ - return self.linear(x) - - -class BPUGlobalCMVN(torch.nn.Module): - """Refactor wenet/transformer/cmvn.py::GlobalCMVN""" - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - self.norm_var = module.norm_var - - # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1) - self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """CMVN with 4-D dataflow. - Args: - x (torch.Tensor): (batch, 1, mel_dim, time) - Returns: - (torch.Tensor): normalized feature with same shape. - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x - - -class BPUConv2dSubsampling8(torch.nn.Module): - """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 - - NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.right_context = module.right_context - self.subsampling_rate = module.subsampling_rate - assert isinstance(module.pos_enc, NoPositionalEncoding) - - # 1. Modify self.conv - # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim) - # to (1, 1, mel_dim, frames) for more efficient computation. - self.conv = module.conv - for idx in [0, 2, 4]: - self.conv[idx].weight = torch.nn.Parameter( - module.conv[idx].weight.transpose(2, 3) - ) - - # 2. 
Modify self.linear - # NOTE(xcsong): Split final projection to meet the requirment of - # maximum kernel_size (7 for XJ3) - self.linear = torch.nn.ModuleList() - odim = module.linear.weight.size(0) # 512, in this case - freq = module.linear.weight.size(1) // odim # 4608 // 512 == 9 - self.odim, self.freq = odim, freq - weight = module.linear.weight.reshape( - odim, odim, freq, 1) # (odim, odim * freq) -> (odim, odim, freq, 1) - self.split_size = [] - num_split = (freq - 1) // 7 + 1 # XJ3 requires kernel_size <= 7 - slice_begin = 0 - for idx in range(num_split): - kernel_size = min(freq, (idx + 1) * 7) - idx * 7 - conv_ele = torch.nn.Conv2d( - odim, odim, (kernel_size, 1), (kernel_size, 1)) - conv_ele.weight = torch.nn.Parameter( - weight[:, :, slice_begin:slice_begin + kernel_size, :] - ) - conv_ele.bias = torch.nn.Parameter( - torch.zeros_like(conv_ele.bias) - ) - self.linear.append(conv_ele) - self.split_size.append(kernel_size) - slice_begin += kernel_size - self.linear[0].bias = torch.nn.Parameter(module.linear.bias) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 67, 80) - mask = torch.zeros(1, 1, 67) - original_result, _, _ = module(random_data, mask) # (1, 8, 512) - random_data = random_data.transpose(1, 2).unsqueeze(0) # (1, 1, 80, 67) - new_result = self.forward(random_data) # (1, 512, 1, 8) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Subsample x with 4-D dataflow. - Args: - x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), - where time' = time // 8. - """ - x = self.conv(x) # (1, odim, freq, time') - x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) - x = torch.split(x, self.split_size, dim=2) - for idx, (x_part, layer) in enumerate(zip(x, self.linear)): - x_out += layer(x_part) - return x_out - - -class BPUMultiHeadedAttention(torch.nn.Module): - """Refactor wenet/transformer/attention.py::MultiHeadedAttention - - NOTE(xcsong): Only support attention_class == MultiHeadedAttention, - we do not consider RelPositionMultiHeadedAttention currently. - """ - def __init__(self, module, chunk_size, left_chunks): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.d_k = module.d_k - self.h = module.h - n_feat = self.d_k * self.h - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.time = chunk_size * (left_chunks + 1) - self.activation = torch.nn.Softmax(dim=-1) - - # 1. Modify self.linear_x - self.linear_q = BPULinear(module.linear_q) - self.linear_k = BPULinear(module.linear_k) - self.linear_v = BPULinear(module.linear_v) - self.linear_out = BPULinear(module.linear_out) - # 2. 
denom - self.register_buffer( - "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k))) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) - mask = torch.ones((1, self.h, self.chunk_size, self.time), - dtype=torch.bool) - cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, - self.d_k * 2) - original_out, original_cache = module( - random_data, random_data, random_data, - mask[:, 0, :, :], torch.empty(0), cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.reshape(1, self.h, self.d_k * 2, - self.chunk_size * self.left_chunks) - new_out, new_cache = self.forward( - random_data, random_data, random_data, mask, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - - def forward( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - mask: torch.Tensor, cache: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. - - Args: - q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). - k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). - v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). - mask (torch.Tensor): Mask tensor, - (#batch, head, chunk_size, cache_t + chunk_size). - cache (torch.Tensor): Cache tensor - (1, head, d_k * 2, cache_t), - where `cache_t == chunk_size * left_chunks`. - - - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: Cache tensor - (1, head, d_k * 2, cache_t + chunk_size) - where `cache_t == chunk_size * left_chunks` - """ - # 1. Forward QKV - q = self.linear_q(q) # (1, d, 1, c) d == size, c == chunk_size - k = self.linear_k(k) # (1, d, 1, c) - v = self.linear_v(v) # (1, d, 1, c) - q = q.view(1, self.h, self.d_k, self.chunk_size) - k = k.view(1, self.h, self.d_k, self.chunk_size) - v = v.view(1, self.h, self.d_k, self.chunk_size) - q = q.transpose(2, 3) # (batch, head, time1, d_k) - k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) - k = torch.cat((k_cache, k), dim=3) - v = torch.cat((v_cache, v), dim=3) - new_cache = torch.cat((k, v), dim=2) - # 2. (Q^T)K - scores = torch.matmul(q, k) * self.denom # (#b, n_head, time1, time2) - # 3. Forward attention - mask = mask.eq(0) - scores = scores.masked_fill(mask, -float('inf')) - attn = self.activation(scores).masked_fill(mask, 0.0) - attn = attn.transpose(2, 3) - x = torch.matmul(v, attn) - x = x.view(1, self.d_k * self.h, 1, self.chunk_size) - x_out = self.linear_out(x) - return x_out, new_cache - - -class BPUConvolution(torch.nn.Module): - """Refactor wenet/transformer/convolution.py::ConvolutionModule - - NOTE(xcsong): Only suport use_layer_norm == False - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.lorder = module.lorder - self.use_layer_norm = False - self.activation = module.activation - channels = module.pointwise_conv1.weight.size(1) - self.channels = channels - kernel_size = module.depthwise_conv.weight.size(2) - assert module.use_layer_norm is False - - # 1. Modify self.pointwise_conv1 - self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) - - # 2. 
Modify self.depthwise_conv - self.depthwise_conv = torch.nn.Conv2d( - channels, channels, (1, kernel_size), - stride=1, groups=channels) - self.depthwise_conv.weight = torch.nn.Parameter( - module.depthwise_conv.weight.unsqueeze(-2)) - self.depthwise_conv.bias = torch.nn.Parameter( - module.depthwise_conv.bias) - - # 3. Modify self.norm, Only support batchnorm2d - self.norm = torch.nn.BatchNorm2d(channels) - self.norm.training = False - self.norm.num_features = module.norm.num_features - self.norm.eps = module.norm.eps - self.norm.momentum = module.norm.momentum - self.norm.weight = torch.nn.Parameter(module.norm.weight) - self.norm.bias = torch.nn.Parameter(module.norm.bias) - self.norm.running_mean = module.norm.running_mean - self.norm.running_var = module.norm.running_var - - # 4. Modify self.pointwise_conv2 - self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) - - # 5. Identity conv, for running `concat` on BPU - self.identity = BPUIdentity(channels) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.channels) - cache = torch.zeros((1, self.channels, self.lorder)) - original_out, original_cache = module(random_data, cache=cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.unsqueeze(2) - new_out, new_cache = self.forward(random_data, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, 1, cache_t). - Returns: - torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). - torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). - """ - # Concat cache - x = torch.cat((self.identity(cache), self.identity(x)), dim=3) - new_cache = x[:, :, :, -self.lorder:] - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, 1, dim) - x = torch.nn.functional.glu(x, dim=1) # (b, channel, 1, dim) - - # Depthwise Conv - x = self.depthwise_conv(x) - x = self.activation(self.norm(x)) - x = self.pointwise_conv2(x) - return x, new_cache - - -class BPUFFN(torch.nn.Module): - """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.activation = module.activation - - # 1. Modify self.w_x - self.w_1 = BPULinear(module.w_1) - self.w_2 = BPULinear(module.w_2) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.w_1.idim) - original_out = module(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_out = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward function. 
- - Args: - xs: input tensor (B, D, 1, L) - Returns: - output tensor, (B, D, 1, L) - """ - return self.w_2(self.activation(self.w_1(x))) - - -class BPUConformerEncoderLayer(torch.nn.Module): - """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.size = module.size - assert module.normalize_before is True - assert module.concat_after is False - - # 1. Modify submodules - self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) - self.self_attn = BPUMultiHeadedAttention( - module.self_attn, chunk_size, left_chunks) - self.conv_module = BPUConvolution(module.conv_module) - self.feed_forward = BPUFFN(module.feed_forward) - - # 2. Modify norms - self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) - self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, ln_run_on_bpu) - self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, - chunk_size, ln_run_on_bpu) - self.norm_conv = BPULayerNorm(module.norm_conv, - chunk_size, ln_run_on_bpu) - self.norm_final = BPULayerNorm(module.norm_final, - chunk_size, ln_run_on_bpu) - - # 3. 4-D ff_scale - self.register_buffer( - "ff_scale", torch.full((1, self.size, 1, 1), module.ff_scale)) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.self_attn.chunk_size - time2 = self.self_attn.time - h, d_k = self.self_attn.h, self.self_attn.d_k - random_x = torch.randn(1, time1, self.size) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) - original_x, _, original_att_cache, original_cnn_cache = module( - random_x, att_mask[:, 0, :, :], torch.empty(0), - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.transpose(1, 2).unsqueeze(2) - att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.unsqueeze(2) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_mask, att_cache, cnn_cache - ) - np.testing.assert_allclose( - to_numpy(original_att_cache), - to_numpy(new_att_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cnn_cache), - to_numpy(new_cnn_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, att_mask: torch.Tensor, - att_cache: torch.Tensor, cnn_cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, size, 1, chunk_size) - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, d_k * 2, cache_t1), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, 1, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: att_cache tensor, - (1, head, d_k * 2, cache_t1 + chunk_size). - torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2). - """ - # 1. ffn_macaron - residual = x - x = self.norm_ff_macron(x) - x = residual + self.ff_scale * self.feed_forward_macaron(x) - - # 2. 
attention - residual = x - x = self.norm_mha(x) - x_att, new_att_cache = self.self_attn( - x, x, x, att_mask, att_cache) - x = residual + x_att - - # 3. convolution - residual = x - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, cnn_cache) - x = residual + x - - # 4. ffn - residual = x - x = self.norm_ff(x) - x = residual + self.ff_scale * self.feed_forward(x) - - # 5. final post-norm - x = self.norm_final(x) - - return x, new_att_cache, new_cnn_cache - - -class BPUConformerEncoder(torch.nn.Module): - """Refactor wenet/transformer/encoder.py::ConformerEncoder - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - output_size = module.output_size() - self._output_size = module.output_size() - self.after_norm = module.after_norm - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.head = module.encoders[0].self_attn.h - self.layers = len(module.encoders) - - # 1. Modify submodules - self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) - self.embed = BPUConv2dSubsampling8(module.embed) - self.encoders = torch.nn.ModuleList() - for layer in module.encoders: - self.encoders.append(BPUConformerEncoderLayer( - layer, chunk_size, left_chunks, ln_run_on_bpu)) - - # 2. Auxiliary conv - self.identity_cnncache = BPUIdentity(output_size) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.encoders[0].self_attn.chunk_size - time2 = self.encoders[0].self_attn.time - layers = self.layers - h, d_k = self.head, self.encoders[0].self_attn.d_k - decoding_window = (self.chunk_size - 1) * \ - module.embed.subsampling_rate + \ - module.embed.right_context + 1 - lorder = self.encoders[0].conv_module.lorder - random_x = torch.randn(1, decoding_window, 80) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) - orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( - random_x, 0, time2 - time1, att_mask=att_mask[:, 0, :, :], - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.unsqueeze(0) - att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_cache, cnn_cache, att_mask - ) - caches = torch.split(new_att_cache, h, dim=1) - caches = [c.transpose(2, 3) for c in caches] - np.testing.assert_allclose( - to_numpy(orig_att_cache), - to_numpy(torch.cat(caches, dim=0)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_cnn_cache), - to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, xs: torch.Tensor, att_cache: torch.Tensor, - cnn_cache: torch.Tensor, att_mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (1, head * elayers, d_k * 2, cache_t1), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (1, hidden-dim, elayers, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, hidden-dim, 1, chunk_size). - torch.Tensor: new attention cache required for next chunk, with - same shape as the original att_cache. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - """ - # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time) - xs = xs.transpose(2, 3) - xs = self.global_cmvn(xs) - # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size) - xs = self.embed(xs) - - att_cache = torch.split(att_cache, self.head, dim=1) - cnn_cache = self.identity_cnncache(cnn_cache) - cnn_cache = torch.split(cnn_cache, 1, dim=2) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - xs, new_att_cache, new_cnn_cache = layer( - xs, att_mask, att_cache=att_cache[i], cnn_cache=cnn_cache[i]) - r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:]) - r_cnn_cache.append(new_cnn_cache) - r_att_cache = torch.cat(r_att_cache, dim=1) - r_cnn_cache = self.identity_cnncache( - torch.cat(r_cnn_cache, dim=2)) - - xs = xs.squeeze(2).transpose(1, 2).contiguous() - xs = self.after_norm(xs) - # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input. - xs = xs.transpose(1, 2).contiguous().unsqueeze(2) # (B, C, 1, T) - - return (xs, r_att_cache, r_cnn_cache) - - -class BPUCTC(torch.nn.Module): - """Refactor wenet/transformer/ctc.py::CTC - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.ctc_lo.weight.size(1) - num_class = module.ctc_lo.weight.size(0) - - # 1. Modify self.ctc_lo, Split final projection to meet the - # requirment of maximum in/out channels (2048 for XJ3) - self.ctc_lo = torch.nn.ModuleList() - self.split_size = [] - num_split = (num_class - 1) // 2048 + 1 - for idx in range(num_split): - out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 - conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) - self.ctc_lo.append(conv_ele) - self.split_size.append(out_channel) - orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) - orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) - for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): - w = w.unsqueeze(2).unsqueeze(3) - self.ctc_lo[i].weight = torch.nn.Parameter(w) - self.ctc_lo[i].bias = torch.nn.Parameter(b) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 100, self.idim) - original_result = module.ctc_lo(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """frame activations, without softmax. 
- - Args: - Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) - Returns: - torch.Tensor: (B, num_class, 1, chunk_size) - """ - out = [] - for i, layer in enumerate(self.ctc_lo): - out.append(layer(x)) - out = torch.cat(out, dim=1) - return out - - -def export_encoder(asr_model, args): - logger.info("Stage-1: export encoder") - decode_window, mel_dim = args.decoding_window, args.feature_size - encoder = BPUConformerEncoder( - asr_model.encoder, args.chunk_size, args.num_decoding_left_chunks, - args.ln_run_on_bpu) - encoder.eval() - encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx') - - logger.info("Stage-1.1: prepare inputs for encoder") - chunk = torch.randn((1, 1, decode_window, mel_dim)) - required_cache_size = encoder.chunk_size * encoder.left_chunks - kv_time = required_cache_size + encoder.chunk_size - hidden, layers = encoder._output_size, len(encoder.encoders) - head = encoder.encoders[0].self_attn.h - d_k = hidden // head - lorder = encoder.encoders[0].conv_module.lorder - att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) - att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros((1, hidden, layers, lorder)) - inputs = (chunk, att_cache, cnn_cache, att_mask) - logger.info("chunk.size(): {} att_cache.size(): {} " - "cnn_cache.size(): {} att_mask.size(): {}".format( - list(chunk.size()), list(att_cache.size()), - list(cnn_cache.size()), list(att_mask.size()))) - - logger.info("Stage-1.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask" - attributes['output_name'] = "output;r_att_cache;r_cnn_cache" - attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap" - attributes['norm_type'] = \ - "no_preprocess;no_preprocess;no_preprocess;no_preprocess" - attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_shape'] = \ - "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( - chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3), - att_cache.size(0), att_cache.size(1), att_cache.size(2), - att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1), - cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0), - att_mask.size(1), att_mask.size(2), att_mask.size(3) - ) - torch.onnx.export( # NOTE(xcsong): only support opset==11 - encoder, inputs, encoder_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=attributes['input_name'].split(';'), - output_names=attributes['output_name'].split(';'), - dynamic_axes=None, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for k in vars(args): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - logger.info('Export onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - logger.info("Stage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - for i in range(10): - logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" - ", att_mask: {}".format( - i, list(torch_chunk.size()), - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), - list(torch_att_mask.size()))) - torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_output = torch.cat(torch_output, dim=-1) - - onnx_output = [] - onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - logger.info("onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," - " att_mask: {}".format( - i, onnx_chunk.shape, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': onnx_att_mask, - } - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_output = np.concatenate(onnx_output, axis=-1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_encoder, pass!") - return encoder, ort_session - - -def export_ctc(asr_model, args): - logger.info("Stage-2: export ctc") - ctc = BPUCTC(asr_model.ctc).eval() - ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx') - - logger.info("Stage-2.1: prepare inputs for ctc") - hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) - - logger.info("Stage-2.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'], attributes['input_type'] = "hidden", "featuremap" - attributes['norm_type'] = "no_preprocess" - attributes['input_layout_train'] = "NCHW" - attributes['input_layout_rt'] = "NCHW" - attributes['input_shape'] = "{}x{}x{}x{}".format( - hidden.size(0), hidden.size(1), hidden.size(2), hidden.size(3), - ) - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=None, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for k in vars(args): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - logger.info('Export onnx_ctc, done! 
see {}'.format(ctc_outpath)) - - logger.info("Stage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_ctc, pass!") - return ctc, ort_session - - -def export_decoder(asr_model, args): - logger.info("Currently, Decoder is not supported.") - - -if __name__ == '__main__': - torch.manual_seed(777) - args = get_args() - args.ln_run_on_bpu = False - # NOTE(xcsong): XJ3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - args.feature_size = configs['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - export_encoder(model, args) - export_ctc(model, args) - export_decoder(model, args) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/export_onnx_cpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/export_onnx_cpu.py deleted file mode 100644 index a8009d2f606f753a5870eb754235d8d55e756b5d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/export_onnx_cpu.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright (c) 2022, Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os -import copy -import sys - -import torch -import yaml -import numpy as np - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - -try: - import onnx - import onnxruntime - from onnxruntime.quantization import quantize_dynamic, QuantType -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - args = parser.parse_args() - return args - - -def to_numpy(tensor): - if tensor.requires_grad: - return tensor.detach().cpu().numpy() - else: - return tensor.cpu().numpy() - - -def print_input_output_info(onnx_model, name, prefix="\t\t"): - input_names = [node.name for node in onnx_model.graph.input] - input_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.input] - output_names = [node.name for node in onnx_model.graph.output] - output_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.output] - print("{}{} inputs : {}".format(prefix, name, input_names)) - print("{}{} input shapes : {}".format(prefix, name, input_shapes)) - print("{}{} outputs: {}".format(prefix, name, output_names)) - print("{}{} output shapes : {}".format(prefix, name, output_shapes)) - - -def export_encoder(asr_model, args): - print("Stage-1: export encoder") - encoder = asr_model.encoder - encoder.forward = encoder.forward_chunk - encoder_outpath = os.path.join(args['output_dir'], 'encoder.onnx') - - print("\tStage-1.1: prepare inputs for encoder") - chunk = torch.randn( - (args['batch'], args['decoding_window'], args['feature_size'])) - offset = 0 - # NOTE(xcsong): The uncertainty of `next_cache_start` only appears - # in the first few chunks, this is caused by dynamic att_cache shape, i,e - # (0, 0, 0, 0) for 1st chunk and (elayers, head, ?, d_k*2) for subsequent - # chunks. One way to ease the ONNX export is to keep `next_cache_start` - # as a fixed value. To do this, for the **first** chunk, if - # left_chunks > 0, we feed real cache & real mask to the model, otherwise - # fake cache & fake mask. In this way, we get: - # 1. 16/-1 mode: next_cache_start == 0 for all chunks - # 2. 16/4 mode: next_cache_start == chunk_size for all chunks - # 3. 16/0 mode: next_cache_start == chunk_size for all chunks - # 4. -1/-1 mode: next_cache_start == 0 for all chunks - # NO MORE DYNAMIC CHANGES!! - # - # NOTE(Mddct): We retain the current design for the convenience of supporting some - # inference frameworks without dynamic shapes. 
If you're interested in all-in-one - # model that supports different chunks please see: - # https://github.com/wenet-e2e/wenet/pull/1174 - - if args['left_chunks'] > 0: # 16/4 - required_cache_size = args['chunk_size'] * args['left_chunks'] - offset = required_cache_size - # Real cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], required_cache_size, - args['output_size'] // args['head'] * 2)) - # Real mask - att_mask = torch.ones( - (args['batch'], 1, required_cache_size + args['chunk_size']), - dtype=torch.bool) - att_mask[:, :, :required_cache_size] = 0 - elif args['left_chunks'] <= 0: # 16/-1, -1/-1, 16/0 - required_cache_size = -1 if args['left_chunks'] < 0 else 0 - # Fake cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], 0, - args['output_size'] // args['head'] * 2)) - # Fake mask - att_mask = torch.ones((0, 0, 0), dtype=torch.bool) - cnn_cache = torch.zeros( - (args['num_blocks'], args['batch'], - args['output_size'], args['cnn_module_kernel'] - 1)) - inputs = (chunk, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - print("\t\tchunk.size(): {}\n".format(chunk.size()), - "\t\toffset: {}\n".format(offset), - "\t\trequired_cache: {}\n".format(required_cache_size), - "\t\tatt_cache.size(): {}\n".format(att_cache.size()), - "\t\tcnn_cache.size(): {}\n".format(cnn_cache.size()), - "\t\tatt_mask.size(): {}\n".format(att_mask.size())) - - print("\tStage-1.2: torch.onnx.export") - dynamic_axes = { - 'chunk': {1: 'T'}, - 'att_cache': {2: 'T_CACHE'}, - 'att_mask': {2: 'T_ADD_T_CACHE'}, - 'output': {1: 'T'}, - 'r_att_cache': {2: 'T_CACHE'}, - } - # NOTE(xcsong): We keep dynamic axes even if in 16/4 mode, this is - # to avoid padding the last chunk (which usually contains less - # frames than required). For users who want static axes, just pop - # out specific axis. - # if args['chunk_size'] > 0: # 16/4, 16/-1, 16/0 - # dynamic_axes.pop('chunk') - # dynamic_axes.pop('output') - # if args['left_chunks'] >= 0: # 16/4, 16/0 - # # NOTE(xsong): since we feed real cache & real mask into the - # # model when left_chunks > 0, the shape of cache will never - # # be changed. - # dynamic_axes.pop('att_cache') - # dynamic_axes.pop('r_att_cache') - torch.onnx.export( - encoder, inputs, encoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=[ - 'chunk', 'offset', 'required_cache_size', - 'att_cache', 'cnn_cache', 'att_mask' - ], - output_names=['output', 'r_att_cache', 'r_cnn_cache'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for (k, v) in args.items(): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - # NOTE(xcsong): to add those metadatas we need to reopen - # the file and resave it. - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - # Dynamic quantization - model_fp32 = encoder_outpath - model_quant = os.path.join(args['output_dir'], 'encoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - print("\tStage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk = copy.deepcopy(chunk) - torch_offset = copy.deepcopy(offset) - torch_required_cache_size = copy.deepcopy(required_cache_size) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - torch_att_mask = copy.deepcopy(att_mask) - for i in range(10): - print("\t\ttorch chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, list(torch_chunk.size()), torch_offset, - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), list(torch_att_mask.size()))) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - torch_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_offset, torch_required_cache_size, - torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_offset += out.size(1) - torch_output = torch.cat(torch_output, dim=1) - - onnx_output = [] - onnx_chunk = to_numpy(chunk) - onnx_offset = np.array((offset)).astype(np.int64) - onnx_required_cache_size = np.array((required_cache_size)).astype(np.int64) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - onnx_att_mask = to_numpy(att_mask) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - print("\t\tonnx chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, onnx_chunk.shape, onnx_offset, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - onnx_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'offset': onnx_offset, - 'required_cache_size': onnx_required_cache_size, - 'att_cache': onnx_att_cache, 'cnn_cache': onnx_cnn_cache, - 'att_mask': onnx_att_mask - } - # NOTE(xcsong): If we use 16/-1, -1/-1 or 16/0 mode, `next_cache_start` - # will be hardcoded to 0 or chunk_size by ONNX, thus - # required_cache_size and att_mask are no more needed and they will - # be removed by ONNX automatically. 
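As a minimal standalone sketch of the pruning described in the note above (hypothetical `encoder.onnx` path, placeholder feed values), the inputs that survive export can be listed and used to filter an ONNX Runtime feed dict:

```python
import onnx

# Hypothetical path; inputs folded into constants during export (e.g.
# required_cache_size and att_mask in 16/-1, -1/-1 or 16/0 mode) no longer
# appear in graph.input.
onnx_model = onnx.load("encoder.onnx")
kept_inputs = {node.name for node in onnx_model.graph.input}
print("graph inputs:", sorted(kept_inputs))

# Placeholder feed dict (real values are numpy arrays); keep only the
# names the exported graph still expects.
all_feeds = {"chunk": ..., "offset": ..., "required_cache_size": ...,
             "att_cache": ..., "cnn_cache": ..., "att_mask": ...}
feeds = {name: value for name, value in all_feeds.items() if name in kept_inputs}
```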
- for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_offset += ort_outs[0].shape[1] - onnx_output = np.concatenate(onnx_output, axis=1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-05) - meta = ort_session.get_modelmeta() - print("\t\tcustom_metadata_map={}".format(meta.custom_metadata_map)) - print("\t\tCheck onnx_encoder, pass!") - - -def export_ctc(asr_model, args): - print("Stage-2: export ctc") - ctc = asr_model.ctc - ctc.forward = ctc.log_softmax - ctc_outpath = os.path.join(args['output_dir'], 'ctc.onnx') - - print("\tStage-2.1: prepare inputs for ctc") - hidden = torch.randn( - (args['batch'], args['chunk_size'] if args['chunk_size'] > 0 else 16, - args['output_size'])) - - print("\tStage-2.2: torch.onnx.export") - dynamic_axes = {'hidden': {1: 'T'}, 'probs': {1: 'T'}} - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for (k, v) in args.items(): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - # Dynamic quantization - model_fp32 = ctc_outpath - model_quant = os.path.join(args['output_dir'], 'ctc.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_ctc, done! see {}'.format(ctc_outpath)) - - print("\tStage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_ctc, pass!") - - -def export_decoder(asr_model, args): - print("Stage-3: export decoder") - decoder = asr_model - # NOTE(lzhin): parameters of encoder will be automatically removed - # since they are not used during rescoring. - decoder.forward = decoder.forward_attention_decoder - decoder_outpath = os.path.join(args['output_dir'], 'decoder.onnx') - - print("\tStage-3.1: prepare inputs for decoder") - # hardcode time->200 nbest->10 len->20, they are dynamic axes. 
- encoder_out = torch.randn((1, 200, args['output_size'])) - hyps = torch.randint(low=0, high=args['vocab_size'], - size=[10, 20]) - hyps[:, 0] = args['vocab_size'] - 1 # - hyps_lens = torch.randint(low=15, high=21, size=[10]) - - print("\tStage-3.2: torch.onnx.export") - dynamic_axes = { - 'hyps': {0: 'NBEST', 1: 'L'}, 'hyps_lens': {0: 'NBEST'}, - 'encoder_out': {1: 'T'}, - 'score': {0: 'NBEST', 1: 'L'}, 'r_score': {0: 'NBEST', 1: 'L'} - } - inputs = (hyps, hyps_lens, encoder_out, args['reverse_weight']) - torch.onnx.export( - decoder, inputs, decoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hyps', 'hyps_lens', 'encoder_out', 'reverse_weight'], - output_names=['score', 'r_score'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_decoder = onnx.load(decoder_outpath) - for (k, v) in args.items(): - meta = onnx_decoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_decoder) - onnx.helper.printable_graph(onnx_decoder.graph) - onnx.save(onnx_decoder, decoder_outpath) - print_input_output_info(onnx_decoder, "onnx_decoder") - model_fp32 = decoder_outpath - model_quant = os.path.join(args['output_dir'], 'decoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_decoder, done! see {}'.format( - decoder_outpath)) - - print("\tStage-3.3: check onnx_decoder and torch_decoder") - torch_score, torch_r_score = decoder( - hyps, hyps_lens, encoder_out, args['reverse_weight']) - ort_session = onnxruntime.InferenceSession(decoder_outpath) - input_names = [node.name for node in onnx_decoder.graph.input] - ort_inputs = { - 'hyps': to_numpy(hyps), - 'hyps_lens': to_numpy(hyps_lens), - 'encoder_out': to_numpy(encoder_out), - 'reverse_weight': np.array((args['reverse_weight'])), - } - for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - onnx_output = ort_session.run(None, ort_inputs) - - np.testing.assert_allclose(to_numpy(torch_score), onnx_output[0], - rtol=1e-03, atol=1e-05) - if args['is_bidirectional_decoder'] and args['reverse_weight'] > 0.0: - np.testing.assert_allclose(to_numpy(torch_r_score), onnx_output[1], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_decoder, pass!") - - -def main(): - torch.manual_seed(777) - args = get_args() - output_dir = args.output_dir - os.system("mkdir -p " + output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - arguments = {} - arguments['output_dir'] = output_dir - arguments['batch'] = 1 - arguments['chunk_size'] = args.chunk_size - arguments['left_chunks'] = args.num_decoding_left_chunks - arguments['reverse_weight'] = args.reverse_weight - arguments['output_size'] = configs['encoder_conf']['output_size'] - arguments['num_blocks'] = configs['encoder_conf']['num_blocks'] - arguments['cnn_module_kernel'] = configs['encoder_conf'].get('cnn_module_kernel', 1) - arguments['head'] = configs['encoder_conf']['attention_heads'] - arguments['feature_size'] = configs['input_dim'] - arguments['vocab_size'] = configs['output_dim'] - # NOTE(xcsong): if chunk_size == -1, hardcode to 67 - arguments['decoding_window'] = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 if args.chunk_size > 0 else 67 - arguments['encoder'] = configs['encoder'] - 
arguments['decoder'] = configs['decoder'] - arguments['subsampling_rate'] = model.subsampling_rate() - arguments['right_context'] = model.right_context() - arguments['sos_symbol'] = model.sos_symbol() - arguments['eos_symbol'] = model.eos_symbol() - arguments['is_bidirectional_decoder'] = 1 \ - if model.is_bidirectional_decoder() else 0 - - # NOTE(xcsong): Please note that -1/-1 means non-streaming model! It is - # not a [16/4 16/-1 16/0] all-in-one model and it should not be used in - # streaming mode (i.e., setting chunk_size=16 in `decoder_main`). If you - # want to use 16/-1 or any other streaming mode in `decoder_main`, - # please export onnx in the same config. - if arguments['left_chunks'] > 0: - assert arguments['chunk_size'] > 0 # -1/4 not supported - - export_encoder(model, arguments) - export_ctc(model, arguments) - export_decoder(model, arguments) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/export_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/export_onnx_gpu.py deleted file mode 100644 index 19f810c2804efdf74ff369f780fa3102e2e389fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/export_onnx_gpu.py +++ /dev/null @@ -1,1056 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import os -import sys - -import torch -import yaml -import logging - -import torch.nn.functional as F -from wenet.utils.checkpoint import load_checkpoint -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import BaseEncoder -from wenet.utils.init_model import init_model -from wenet.utils.mask import make_pad_mask - -try: - import onnxruntime -except ImportError: - print('Please install onnxruntime-gpu!') - sys.exit(1) - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class Encoder(torch.nn.Module): - def __init__(self, - encoder: BaseEncoder, - ctc: CTC, - beam_size: int = 10): - super().__init__() - self.encoder = encoder - self.ctc = ctc - self.beam_size = beam_size - - def forward(self, speech: torch.Tensor, - speech_lengths: torch.Tensor,): - """Encoder - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - Returns: - encoder_out: B x T x F - encoder_out_lens: B - ctc_log_probs: B x T x V - beam_log_probs: B x T x beam_size - beam_log_probs_idx: B x T x beam_size - """ - encoder_out, encoder_mask = self.encoder(speech, - speech_lengths, - -1, -1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_log_probs = self.ctc.log_softmax(encoder_out) - encoder_out_lens = encoder_out_lens.int() - beam_log_probs, beam_log_probs_idx = torch.topk( - ctc_log_probs, self.beam_size, dim=2) - return encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx - - -class StreamingEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size, transformer=False): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.transformer = transformer - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoder.encoders): - xs, _, new_att_cache, new_cnn_cache = layer( - xs, masks, pos_emb, - att_cache=att_cache[i], - cnn_cache=cnn_cache[i]) - # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (B, hidden-dim, cache_t2) - r_att_cache.append( - new_att_cache[:, :, next_cache_start:, :].unsqueeze(1)) - if not self.transformer: - r_cnn_cache.append(new_cnn_cache.unsqueeze(1)) - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - if not self.transformer: - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingSqueezeformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.reduce_idx = model.encoder.reduce_idx - self.recover_idx = model.encoder.recover_idx - if self.reduce_idx is None: - self.time_reduce = None - else: - if self.recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - 
recover_exp)) - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - elayers, cache_size = att_cache.size(0), att_cache.size(3) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = att_mask[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.encoder.preln(xs) - for i, layer in enumerate(self.encoder.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append( - (xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.encoder.time_reduction_layer( - xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - if self.encoder.pos_enc_layer_type == "rel_pos_repaired": - pos_emb = 
pos_emb[:, :xs.size(1) * 2 - 1, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.encoder.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(1) - cached_att = cached_att.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1)) - r_cnn_cache.append(cached_cnn) - - chunk_out = xs - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingEfficientConformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - - # Efficient Conformer - self.stride_layer_idx = model.encoder.stride_layer_idx - self.stride = model.encoder.stride - self.num_blocks = model.encoder.num_blocks - self.cnn_module_kernel = model.encoder.cnn_module_kernel - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - chunk_lens (torch.Tensor): - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * 
num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) # (b, ) - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - # Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2) - # Shape(cnn_cache): (elayers, b, outsize, cnn_kernel) - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = chunk_mask.to(torch.bool) - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoder.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - # We propose to double the chunk_size. 
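As a rough worked example of this truncation (all sizes hypothetical, not taken from any real config): with a 16-frame chunk, a 64-frame attention cache, a per-layer downsampling factor of 4, and a 30-position positional embedding, the oldest cache frames are dropped as follows:

```python
# Hypothetical sizes for illustration only.
chunk_len = 16   # xs.size(1)
cache_len = 64   # att_cache.size(3)
factor = 4       # downsampling factor of the current layer
pos_len = 30     # pos_emb.size(1)

att_cache_trunc = 0
if chunk_len + cache_len / factor > pos_len:
    # Same formula as below: trim the cache so chunk + cache // factor
    # fits within the available positional embeddings.
    att_cache_trunc = chunk_len + cache_len // factor - pos_len + 1
print(att_cache_trunc)  # 16 + 16 - 30 + 1 = 3
```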
- att_cache_trunc = xs.size(1) + \ - att_cache.size(3) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i][:, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [batch, 1, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID - - # use repeat_interleave to new_att_cache - # new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - new_att_cache = new_att_cache.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :].unsqueeze(1)) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - # shape of r_att_cache: (b, elayers, head, time2, outdim) - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - # shape of r_cnn_cache: (b, elayers, outdim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - chunk_out_lens += 1 - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class Decoder(torch.nn.Module): - def __init__(self, - decoder: TransformerDecoder, - ctc_weight: float = 0.5, - reverse_weight: float = 0.0, - beam_size: int = 10, - decoder_fastertransformer: bool = False): - super().__init__() - self.decoder = decoder - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - self.beam_size = beam_size - self.decoder_fastertransformer = decoder_fastertransformer - - def forward(self, - encoder_out: torch.Tensor, - encoder_lens: torch.Tensor, - hyps_pad_sos_eos: torch.Tensor, - hyps_lens_sos: torch.Tensor, - r_hyps_pad_sos_eos: torch.Tensor, - ctc_score: torch.Tensor): - """Encoder - Args: - encoder_out: B x T x F - encoder_lens: B - hyps_pad_sos_eos: B x beam x (T2+1), - hyps with sos & eos and padded by ignore id - hyps_lens_sos: B x beam, length for each hyp with sos - r_hyps_pad_sos_eos: B 
x beam x (T2+1), - reversed hyps with sos & eos and padded by ignore id - ctc_score: B x beam, ctc score for each hyp - Returns: - decoder_out: B x beam x T2 x V - r_decoder_out: B x beam x T2 x V - best_index: B - """ - B, T, F = encoder_out.shape - bz = self.beam_size - B2 = B * bz - encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F) - encoder_mask = ~make_pad_mask(encoder_lens, T).unsqueeze(1) - encoder_mask = encoder_mask.repeat(1, bz, 1).view(B2, 1, T) - T2 = hyps_pad_sos_eos.shape[2] - 1 - hyps_pad = hyps_pad_sos_eos.view(B2, T2 + 1) - hyps_lens = hyps_lens_sos.view(B2,) - hyps_pad_sos = hyps_pad[:, :-1].contiguous() - hyps_pad_eos = hyps_pad[:, 1:].contiguous() - - r_hyps_pad = r_hyps_pad_sos_eos.view(B2, T2 + 1) - r_hyps_pad_sos = r_hyps_pad[:, :-1].contiguous() - r_hyps_pad_eos = r_hyps_pad[:, 1:].contiguous() - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad_sos, hyps_lens, r_hyps_pad_sos, - self.reverse_weight) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - V = decoder_out.shape[-1] - decoder_out = decoder_out.view(B2, T2, V) - mask = ~make_pad_mask(hyps_lens, T2) # B2 x T2 - # mask index, remove ignore id - index = torch.unsqueeze(hyps_pad_eos * mask, 2) - score = decoder_out.gather(2, index).squeeze(2) # B2 X T2 - # mask padded part - score = score * mask - decoder_out = decoder_out.view(B, bz, T2, V) - if self.reverse_weight > 0: - r_decoder_out = torch.nn.functional.log_softmax( - r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.view(B2, T2, V) - index = torch.unsqueeze(r_hyps_pad_eos * mask, 2) - r_score = r_decoder_out.gather(2, index).squeeze(2) - r_score = r_score * mask - score = score * (1 - self.reverse_weight) + \ - self.reverse_weight * r_score - r_decoder_out = r_decoder_out.view(B, bz, T2, V) - score = torch.sum(score, axis=1) # B2 - score = torch.reshape(score, (B, bz)) + self.ctc_weight * ctc_score - best_index = torch.argmax(score, dim=1) - if self.decoder_fastertransformer: - return decoder_out, best_index - else: - return best_index - - -def to_numpy(tensors): - out = [] - if type(tensors) == torch.tensor: - tensors = [tensors] - for tensor in tensors: - if tensor.requires_grad: - tensor = tensor.detach().cpu().numpy() - else: - tensor = tensor.cpu().numpy() - out.append(tensor) - return out - - -def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True): - for a, b in zip(xlist, blist): - try: - torch.testing.assert_allclose(a, b, rtol=rtol, atol=atol) - except AssertionError as error: - if tolerate_small_mismatch: - print(error) - else: - raise - - -def export_offline_encoder(model, configs, args, logger, encoder_onnx_path): - bz = 32 - seq_len = 100 - beam_size = args.beam_size - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint( - low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - dynamic_axes={ - 'speech': {0: 'B', 1: 'T'}, - 'speech_lengths': {0: 'B'}, - 'encoder_out': {0: 'B', 1: 'T_OUT'}, - 'encoder_out_lens': {0: 'B'}, - 'ctc_log_probs': {0: 'B', 1: 'T_OUT'}, - 'beam_log_probs': {0: 'B', 1: 
'T_OUT'}, - 'beam_log_probs_idx': {0: 'B', 1: 'T_OUT'}, - }, - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - -def export_offline_encoder_static(model, configs, args, logger, encoder_onnx_path): - bz = args.batch_size - seq_len = args.seq_len - beam_size = args.beam_size - - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint(low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - import os - file_name, file_ext = os.path.splitext(encoder_onnx_path) - encoder_onnx_path = file_name + "_bs" + str(bz) + "_seq" + str(seq_len) + "_static.onnx" - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CPUExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - - -def export_online_encoder(model, configs, args, logger, encoder_onnx_path): - decoding_chunk_size = args.decoding_chunk_size - subsampling = model.encoder.embed.subsampling_rate - context = model.encoder.embed.right_context + 1 - decoding_window = (decoding_chunk_size - 1) * subsampling + context - batch_size = 32 - audio_len = decoding_window - feature_size = configs["input_dim"] - output_size = configs["encoder_conf"]["output_size"] - num_layers = configs["encoder_conf"]["num_blocks"] - # in transformer the cnn module will not be available - transformer = False - cnn_module_kernel = configs["encoder_conf"].get("cnn_module_kernel", 1) - 1 - if not cnn_module_kernel: - transformer = True - num_decoding_left_chunks = args.num_decoding_left_chunks - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if configs['encoder'] == 'squeezeformer': - encoder = StreamingSqueezeformerEncoder( - model, required_cache_size, args.beam_size) - elif configs['encoder'] == 'efficientConformer': - encoder = StreamingEfficientConformerEncoder( - model, required_cache_size, args.beam_size) - else: - encoder = StreamingEncoder( - model, required_cache_size, args.beam_size, transformer) - encoder.eval() - - # begin to export encoder - chunk_xs = 
torch.randn(batch_size, audio_len, - feature_size, dtype=torch.float32) - chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len - - offset = torch.arange(0, batch_size).unsqueeze(1) - # (elayers, b, head, cache_t1, d_k * 2) - head = configs["encoder_conf"]["attention_heads"] - d_k = configs["encoder_conf"]["output_size"] // head - att_cache = torch.randn(batch_size, num_layers, head, - required_cache_size, d_k * 2, - dtype=torch.float32) - cnn_cache = torch.randn(batch_size, num_layers, output_size, - cnn_module_kernel, dtype=torch.float32) - - cache_mask = torch.ones( - batch_size, 1, required_cache_size, dtype=torch.float32) - input_names = ['chunk_xs', 'chunk_lens', 'offset', - 'att_cache', 'cnn_cache', 'cache_mask'] - output_names = ['log_probs', 'log_probs_idx', 'chunk_out', - 'chunk_out_lens', 'r_offset', 'r_att_cache', - 'r_cnn_cache', 'r_cache_mask'] - input_tensors = (chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - output_names.pop(6) - - all_names = input_names + output_names - dynamic_axes = {} - for name in all_names: - # only the first dimension is dynamic - # all other dimension is fixed - dynamic_axes[name] = {0: 'B'} - - torch.onnx.export(encoder, - input_tensors, - encoder_onnx_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - verbose=False) - - with torch.no_grad(): - torch_outs = encoder(chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - torch_outs = list(torch_outs).pop(6) - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=["CUDAExecutionProvider"]) - ort_inputs = {} - - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - if transformer: - del ort_inputs['cnn_cache'] - ort_outs = ort_session.run(None, ort_inputs) - test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx streaming encoder succeed!") - onnx_config = { - "subsampling_rate": subsampling, - "context": context, - "decoding_chunk_size": decoding_chunk_size, - "num_decoding_left_chunks": num_decoding_left_chunks, - "beam_size": args.beam_size, - "fp16": args.fp16, - "feat_size": feature_size, - "decoding_window": decoding_window, - "cnn_module_kernel_cache": cnn_module_kernel - } - return onnx_config - - -def export_rescoring_decoder(model, configs, args, - logger, decoder_onnx_path, decoder_fastertransformer): - bz, seq_len = 32, 100 - beam_size = args.beam_size - decoder = Decoder(model.decoder, - model.ctc_weight, - model.reverse_weight, - beam_size, - decoder_fastertransformer) - decoder.eval() - - hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - hyps_lens_sos = torch.randint(low=3, high=seq_len, size=(bz, beam_size), - dtype=torch.int32) - r_hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - - output_size = configs["encoder_conf"]["output_size"] - encoder_out = torch.randn(bz, seq_len, output_size, dtype=torch.float32) - encoder_out_lens = torch.randint( - low=3, high=seq_len, size=(bz,), dtype=torch.int32) - ctc_score = torch.randn(bz, beam_size, dtype=torch.float32) - - input_names = ['encoder_out', 'encoder_out_lens', - 'hyps_pad_sos_eos', 'hyps_lens_sos', - 'r_hyps_pad_sos_eos', 'ctc_score'] - output_names = ['best_index'] - if decoder_fastertransformer: - output_names.insert(0, 'decoder_out') - - 
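For intuition on the score fusion that the `Decoder` module above performs before this export, a toy numeric sketch (all values made up) of combining left-to-right, right-to-left, and CTC scores and picking the best hypothesis:

```python
import numpy as np

# Made-up scores for one utterance with beam size 3.
score = np.array([[-4.0, -3.5, -5.0]])      # left-to-right attention scores
r_score = np.array([[-4.2, -3.8, -4.9]])    # right-to-left attention scores
ctc_score = np.array([[-6.0, -5.0, -7.0]])  # CTC prefix scores
reverse_weight, ctc_weight = 0.3, 0.5

fused = score * (1 - reverse_weight) + reverse_weight * r_score
fused = fused + ctc_weight * ctc_score
best_index = fused.argmax(axis=1)
print(best_index)  # [1]
```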
torch.onnx.export(decoder, - (encoder_out, encoder_out_lens, - hyps_pad_sos_eos, hyps_lens_sos, - r_hyps_pad_sos_eos, ctc_score), - decoder_onnx_path, - export_params=True, - opset_version=13, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes={'encoder_out': {0: 'B', 1: 'T'}, - 'encoder_out_lens': {0: 'B'}, - 'hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'hyps_lens_sos': {0: 'B'}, - 'r_hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'ctc_score': {0: 'B'}, - 'best_index': {0: 'B'}, - }, - verbose=False - ) - with torch.no_grad(): - o0 = decoder(encoder_out, - encoder_out_lens, - hyps_pad_sos_eos, - hyps_lens_sos, - r_hyps_pad_sos_eos, - ctc_score) - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(decoder_onnx_path, - providers=providers) - - input_tensors = [encoder_out, encoder_out_lens, hyps_pad_sos_eos, - hyps_lens_sos, r_hyps_pad_sos_eos, ctc_score] - ort_inputs = {} - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - - # if model.reverse weight == 0, - # the r_hyps_pad will be removed - # from the onnx decoder since it doen't play any role - if model.reverse_weight == 0: - del ort_inputs['r_hyps_pad_sos_eos'] - ort_outs = ort_session.run(None, ort_inputs) - - # check decoder output - if decoder_fastertransformer: - test(to_numpy(o0), ort_outs, rtol=1e-03, atol=1e-05) - else: - test(to_numpy([o0]), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx decoder succeed!") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='export x86_gpu model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--cmvn_file', required=False, default='', type=str, - help='global_cmvn file, default path is in config file') - parser.add_argument('--reverse_weight', default=-1.0, type=float, - required=False, - help='reverse weight for bitransformer,' + - 'default value is in config file') - parser.add_argument('--ctc_weight', default=-1.0, type=float, - required=False, - help='ctc weight, default value is in config file') - parser.add_argument('--batch_size', type=int, default=24, help='encoder batch size') - parser.add_argument('--seq_len', default=512, type=int, required=False, - help="Encoder seq_len") - parser.add_argument('--beam_size', default=10, type=int, required=False, - help="beam size would be ctc output size") - parser.add_argument('--output_onnx_dir', - default="onnx_model", - help='output onnx encoder and decoder directory') - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - # arguments for streaming encoder - parser.add_argument('--streaming', - action='store_true', - help="whether to export streaming encoder, default false") - parser.add_argument('--decoding_chunk_size', - default=16, - type=int, - required=False, - help='the decoding chunk size, <=0 is not supported') - parser.add_argument('--num_decoding_left_chunks', - default=5, - type=int, - required=False, - help="number of left chunks, <= 0 is not supported") - parser.add_argument('--decoder_fastertransformer', - action='store_true', - help='return decoder_out and best_index for ft') - args = parser.parse_args() - - torch.manual_seed(0) - torch.set_printoptions(precision=10) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if 
args.cmvn_file and os.path.exists(args.cmvn_file): - configs['cmvn_file'] = args.cmvn_file - if args.reverse_weight != -1.0 and 'reverse_weight' in configs['model_conf']: - configs['model_conf']['reverse_weight'] = args.reverse_weight - print("Update reverse weight to", args.reverse_weight) - if args.ctc_weight != -1: - print("Update ctc weight to ", args.ctc_weight) - configs['model_conf']['ctc_weight'] = args.ctc_weight - configs["encoder_conf"]["use_dynamic_chunk"] = False - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - - if not os.path.exists(args.output_onnx_dir): - os.mkdir(args.output_onnx_dir) - encoder_onnx_path = os.path.join(args.output_onnx_dir, 'encoder.onnx') - export_enc_func = None - if args.streaming: - assert args.decoding_chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - export_enc_func = export_online_encoder - else: - export_enc_func = export_offline_encoder_static - - onnx_config = export_enc_func( - model, configs, args, logger, encoder_onnx_path) - - decoder_onnx_path = os.path.join(args.output_onnx_dir, 'decoder.onnx') - export_rescoring_decoder(model, configs, args, logger, - decoder_onnx_path, args.decoder_fastertransformer) - - if args.fp16: - try: - import onnxmltools - from onnxmltools.utils.float16_converter import convert_float_to_float16 - except ImportError: - print('Please install onnxmltools!') - sys.exit(1) - encoder_onnx_model = onnxmltools.utils.load_model(encoder_onnx_path) - encoder_onnx_model = convert_float_to_float16(encoder_onnx_model) - encoder_onnx_path = os.path.join( - args.output_onnx_dir, 'encoder_fp16.onnx') - onnxmltools.utils.save_model(encoder_onnx_model, encoder_onnx_path) - decoder_onnx_model = onnxmltools.utils.load_model(decoder_onnx_path) - decoder_onnx_model = convert_float_to_float16(decoder_onnx_model) - decoder_onnx_path = os.path.join( - args.output_onnx_dir, 'decoder_fp16.onnx') - onnxmltools.utils.save_model(decoder_onnx_model, decoder_onnx_path) - # dump configurations - - config_dir = os.path.join(args.output_onnx_dir, "config.yaml") - with open(config_dir, "w") as out: - yaml.dump(onnx_config, out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/recognize.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/recognize.py deleted file mode 100644 index 03b5dfd42cc098efacd20e08756a5300f6477cc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/recognize.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--beam_size', - type=int, - default=10, - help='beam size for search') - parser.add_argument('--penalty', - type=float, - default=0.0, - help='length penalty') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=16, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'attention', 'ctc_greedy_search', - 'ctc_prefix_beam_search', 'attention_rescoring', - 'rnnt_greedy_search', 'rnnt_beam_search', - 'rnnt_beam_attn_rescoring', 'ctc_beam_td_attn_rescoring', - 'hlg_onebest', 'hlg_rescore' - ], - default='attention', - help='decoding mode') - - parser.add_argument('--search_ctc_weight', - type=float, - default=1.0, - help='ctc weight for nbest generation') - parser.add_argument('--search_transducer_weight', - type=float, - default=0.0, - help='transducer weight for nbest generation') - parser.add_argument('--ctc_weight', - type=float, - default=0.0, - help='ctc weight for rescoring weight in \ - attention rescoring decode mode \ - ctc weight for rescoring weight in \ - transducer attention rescore decode mode') - - parser.add_argument('--transducer_weight', - type=float, - default=0.0, - help='transducer weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--attn_weight', - type=float, - default=0.0, - help='attention weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--decoding_chunk_size', - type=int, - default=-1, - help='''decoding chunk size, - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here''') - parser.add_argument('--num_decoding_left_chunks', - type=int, - default=-1, - help='number of left chunks for decoding') - parser.add_argument('--simulate_streaming', - action='store_true', - help='simulate streaming inference') - parser.add_argument('--reverse_weight', - type=float, - default=0.0, - help='''right to left weight for attention rescoring - decode mode''') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--connect_symbol', - default='', - type=str, - help='used to connect the output characters') - - parser.add_argument('--word', - default='', - type=str, - help='word file, only used for hlg decode') - parser.add_argument('--hlg', - default='', - type=str, - help='hlg file, only used for hlg decode') - parser.add_argument('--lm_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--r_decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' - ] and args.batch_size > 1: - logging.fatal( - 'decoding mode {} must be running with batch_size == 1'.format( - args.mode)) - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_sub'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - if 'fbank_conf' in test_conf: - test_conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in test_conf: - test_conf['mfcc_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - # Load dict - char_dict = {v: k for k, v in symbol_table.items()} - eos = len(char_dict) - 1 - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w') as fout: - for batch_idx, 
batch in enumerate(test_data_loader): - keys, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - if args.mode == 'attention': - hyps, _ = model.recognize( - feats, - feats_lengths, - beam_size=args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp.tolist() for hyp in hyps] - elif args.mode == 'ctc_greedy_search': - hyps, _ = model.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_greedy_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_beam_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.beam_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.search_ctc_weight, - transducer_weight=args.search_transducer_weight) - elif args.mode == 'rnnt_beam_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight) - elif args.mode == 'ctc_beam_td_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight, - beam_search_type='ctc') - # ctc_prefix_beam_search and attention_rescoring only return one - # result in List[int], change it to List[List[int]] for compatible - # with other batch decoding mode - elif args.mode == 'ctc_prefix_beam_search': - assert (feats.size(0) == 1) - hyp, _ = model.ctc_prefix_beam_search( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp] - elif args.mode == 'attention_rescoring': - assert (feats.size(0) == 1) - hyp, _ = model.attention_rescoring( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - 
ctc_weight=args.ctc_weight, - simulate_streaming=args.simulate_streaming, - reverse_weight=args.reverse_weight) - hyps = [hyp] - elif args.mode == 'hlg_onebest': - hyps = model.hlg_onebest( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - elif args.mode == 'hlg_rescore': - hyps = model.hlg_rescore( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - lm_scale=args.lm_scale, - decoder_scale=args.decoder_scale, - r_decoder_scale=args.r_decoder_scale, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - for i, key in enumerate(keys): - content = [] - for w in hyps[i]: - if w == eos: - break - content.append(char_dict[w]) - logging.info('{} {}'.format(key, args.connect_symbol.join(content))) - fout.write('{} {}\n'.format(key, args.connect_symbol.join(content))) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/recognize_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/recognize_onnx_gpu.py deleted file mode 100644 index 42f403bf55ac0bc51d9c754574d3479345948122..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/recognize_onnx_gpu.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is for testing exported onnx encoder and decoder from -export_onnx_gpu.py. The exported onnx models only support batch offline ASR inference. -It requires a python wrapped c++ ctc decoder. 
-Please install it by following: -https://github.com/Slyne/ctc_decoder.git -""" -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.common import IGNORE_ID -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.config import override_config - -import onnxruntime as rt -import multiprocessing -import numpy as np - -try: - from swig_decoders import map_batch, \ - ctc_beam_search_decoder_batch, \ - TrieVector, PathTrie -except ImportError: - print('Please install ctc decoders first by refering to\n' + - 'https://github.com/Slyne/ctc_decoder.git') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--encoder_onnx', required=True, help='encoder onnx file') - parser.add_argument('--decoder_onnx', required=True, help='decoder onnx file') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=32, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'ctc_greedy_search', 'ctc_prefix_beam_search', - 'attention_rescoring'], - default='attention_rescoring', - help='decoding mode') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - reverse_weight = configs["model_conf"].get("reverse_weight", 0.0) - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - test_conf['fbank_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - 
- # Init asr model from configs - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - if use_cuda: - EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - else: - EP_list = ['CPUExecutionProvider'] - - encoder_ort_session = rt.InferenceSession(args.encoder_onnx, providers=EP_list) - decoder_ort_session = None - if args.mode == "attention_rescoring": - decoder_ort_session = rt.InferenceSession(args.decoder_onnx, providers=EP_list) - - # Load dict - vocabulary = [] - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - vocabulary.append(arr[0]) - eos = sos = len(char_dict) - 1 - with torch.no_grad(), open(args.result_file, 'w') as fout: - for _, batch in enumerate(test_data_loader): - keys, feats, _, feats_lengths, _ = batch - feats, feats_lengths = feats.numpy(), feats_lengths.numpy() - if args.fp16: - feats = feats.astype(np.float16) - ort_inputs = { - encoder_ort_session.get_inputs()[0].name: feats, - encoder_ort_session.get_inputs()[1].name: feats_lengths} - ort_outs = encoder_ort_session.run(None, ort_inputs) - encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx = ort_outs - beam_size = beam_log_probs.shape[-1] - batch_size = beam_log_probs.shape[0] - num_processes = min(multiprocessing.cpu_count(), batch_size) - if args.mode == 'ctc_greedy_search': - if beam_size != 1: - log_probs_idx = beam_log_probs_idx[:, :, 0] - batch_sents = [] - for idx, seq in enumerate(log_probs_idx): - batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) - hyps = map_batch(batch_sents, vocabulary, num_processes, - True, 0) - elif args.mode in ('ctc_prefix_beam_search', "attention_rescoring"): - batch_log_probs_seq_list = beam_log_probs.tolist() - batch_log_probs_idx_list = beam_log_probs_idx.tolist() - batch_len_list = encoder_out_lens.tolist() - batch_log_probs_seq = [] - batch_log_probs_ids = [] - batch_start = [] # only effective in streaming deployment - batch_root = TrieVector() - root_dict = {} - for i in range(len(batch_len_list)): - num_sent = batch_len_list[i] - batch_log_probs_seq.append( - batch_log_probs_seq_list[i][0:num_sent]) - batch_log_probs_ids.append( - batch_log_probs_idx_list[i][0:num_sent]) - root_dict[i] = PathTrie() - batch_root.append(root_dict[i]) - batch_start.append(True) - score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq, - batch_log_probs_ids, - batch_root, - batch_start, - beam_size, - num_processes, - 0, -2, 0.99999) - if args.mode == 'ctc_prefix_beam_search': - hyps = [] - for cand_hyps in score_hyps: - hyps.append(cand_hyps[0][1]) - hyps = map_batch(hyps, vocabulary, num_processes, False, 0) - if args.mode == 'attention_rescoring': - ctc_score, all_hyps = [], [] - max_len = 0 - for hyps in score_hyps: - cur_len = len(hyps) - if len(hyps) < beam_size: - hyps += (beam_size - cur_len) * [(-float("INF"), (0,))] - cur_ctc_score = [] - for hyp in hyps: - cur_ctc_score.append(hyp[0]) - all_hyps.append(list(hyp[1])) - if len(hyp[1]) > max_len: - max_len = len(hyp[1]) - ctc_score.append(cur_ctc_score) - if args.fp16: - ctc_score = np.array(ctc_score, dtype=np.float16) - else: - ctc_score = np.array(ctc_score, dtype=np.float32) - hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - r_hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32) - k = 0 - for i in 
range(batch_size): - for j in range(beam_size): - cand = all_hyps[k] - l = len(cand) + 2 - hyps_pad_sos_eos[i][j][0:l] = [sos] + cand + [eos] - r_hyps_pad_sos_eos[i][j][0:l] = [sos] + cand[::-1] + [eos] - hyps_lens_sos[i][j] = len(cand) + 1 - k += 1 - decoder_ort_inputs = { - decoder_ort_session.get_inputs()[0].name: encoder_out, - decoder_ort_session.get_inputs()[1].name: encoder_out_lens, - decoder_ort_session.get_inputs()[2].name: hyps_pad_sos_eos, - decoder_ort_session.get_inputs()[3].name: hyps_lens_sos, - decoder_ort_session.get_inputs()[-1].name: ctc_score} - if reverse_weight > 0: - r_hyps_pad_sos_eos_name = decoder_ort_session.get_inputs()[4].name - decoder_ort_inputs[r_hyps_pad_sos_eos_name] = r_hyps_pad_sos_eos - best_index = decoder_ort_session.run(None, decoder_ort_inputs)[0] - best_sents = [] - k = 0 - for idx in best_index: - cur_best_sent = all_hyps[k: k + beam_size][idx] - best_sents.append(cur_best_sent) - k += beam_size - hyps = map_batch(best_sents, vocabulary, num_processes) - - for i, key in enumerate(keys): - content = hyps[i] - logging.info('{} {}'.format(key, content)) - fout.write('{} {}\n'.format(key, content)) - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/train.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/train.py deleted file mode 100644 index 70799b60790b31d73911770891f519f5473e2f4b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/bin/train.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os - -import torch -import torch.distributed as dist -import torch.optim as optim -import yaml -from tensorboardX import SummaryWriter -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import (load_checkpoint, save_checkpoint, - load_trained_modules) -from wenet.utils.executor import Executor -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.scheduler import WarmupLR, NoamHoldAnnealing -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--train_data', required=True, help='train data file') - parser.add_argument('--cv_data', required=True, help='cv data file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--model_dir', required=True, help='save model dir') - parser.add_argument('--checkpoint', help='checkpoint model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='tensorboard log dir') - parser.add_argument('--ddp.rank', - dest='rank', - default=0, - type=int, - help='global rank for distributed training') - parser.add_argument('--ddp.world_size', - dest='world_size', - default=-1, - type=int, - help='''number of total processes/gpus for - distributed training''') - parser.add_argument('--ddp.dist_backend', - dest='dist_backend', - default='nccl', - choices=['nccl', 'gloo'], - help='distributed backend') - parser.add_argument('--ddp.init_method', - dest='init_method', - default=None, - help='ddp init method') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for reading') - parser.add_argument('--pin_memory', - action='store_true', - default=False, - help='Use pinned memory buffers used for reading') - parser.add_argument('--use_amp', - action='store_true', - default=False, - help='Use automatic mixed precision training') - parser.add_argument('--fp16_grad_sync', - action='store_true', - default=False, - help='Use fp16 gradient sync for ddp') - parser.add_argument('--cmvn', default=None, help='global cmvn file') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. 
One symbol per line.") - parser.add_argument('--prefetch', - default=100, - type=int, - help='prefetch number') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument("--enc_init", - default=None, - type=str, - help="Pre-trained model to initialize encoder") - parser.add_argument("--enc_init_mods", - default="encoder.", - type=lambda s: [str(mod) for mod in s.split(",") if s != ""], - help="List of encoder modules \ - to initialize ,separated by a comma") - - - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Set random seed - torch.manual_seed(777) - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - distributed = args.world_size > 1 - if distributed: - logging.info('training on multiple gpus, this gpu {}'.format(args.gpu)) - dist.init_process_group(args.dist_backend, - init_method=args.init_method, - world_size=args.world_size, - rank=args.rank) - - symbol_table = read_symbol_table(args.symbol_table) - - train_conf = configs['dataset_conf'] - cv_conf = copy.deepcopy(train_conf) - cv_conf['speed_perturb'] = False - cv_conf['spec_aug'] = False - cv_conf['spec_sub'] = False - cv_conf['spec_trim'] = False - cv_conf['shuffle'] = False - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - train_dataset = Dataset(args.data_type, args.train_data, symbol_table, - train_conf, args.bpe_model, non_lang_syms, True) - cv_dataset = Dataset(args.data_type, - args.cv_data, - symbol_table, - cv_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - train_data_loader = DataLoader(train_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - cv_data_loader = DataLoader(cv_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - - if 'fbank_conf' in configs['dataset_conf']: - input_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - else: - input_dim = configs['dataset_conf']['mfcc_conf']['num_mel_bins'] - vocab_size = len(symbol_table) - - # Save configs to model_dir/train.yaml for inference and export - configs['input_dim'] = input_dim - configs['output_dim'] = vocab_size - configs['cmvn_file'] = args.cmvn - configs['is_json_cmvn'] = True - if args.rank == 0: - saved_config_path = os.path.join(args.model_dir, 'train.yaml') - with open(saved_config_path, 'w') as fout: - data = yaml.dump(configs) - fout.write(data) - - # Init asr model from configs - model = init_model(configs) - print(model) - num_params = sum(p.numel() for p in model.parameters()) - print('the number of model params: {:,d}'.format(num_params)) - - # !!!IMPORTANT!!! 
- # Try to export the model by script, if fails, we should refine - # the code to satisfy the script export requirements - if args.rank == 0: - script_model = torch.jit.script(model) - script_model.save(os.path.join(args.model_dir, 'init.zip')) - executor = Executor() - # If specify checkpoint, load some info from checkpoint - if args.checkpoint is not None: - infos = load_checkpoint(model, args.checkpoint) - elif args.enc_init is not None: - logging.info('load pretrained encoders: {}'.format(args.enc_init)) - infos = load_trained_modules(model, args) - else: - infos = {} - start_epoch = infos.get('epoch', -1) + 1 - cv_loss = infos.get('cv_loss', 0.0) - step = infos.get('step', -1) - - num_epochs = configs.get('max_epoch', 100) - model_dir = args.model_dir - writer = None - if args.rank == 0: - os.makedirs(model_dir, exist_ok=True) - exp_id = os.path.basename(model_dir) - writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id)) - - if distributed: - assert (torch.cuda.is_available()) - # cuda model is required for nn.parallel.DistributedDataParallel - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, find_unused_parameters=True) - device = torch.device("cuda") - if args.fp16_grad_sync: - from torch.distributed.algorithms.ddp_comm_hooks import ( - default as comm_hooks, - ) - model.register_comm_hook( - state=None, hook=comm_hooks.fp16_compress_hook - ) - else: - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - if configs['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['optim_conf']) - elif configs['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['optim']) - if configs['scheduler'] == 'warmuplr': - scheduler = WarmupLR(optimizer, **configs['scheduler_conf']) - elif configs['scheduler'] == 'NoamHoldAnnealing': - scheduler = NoamHoldAnnealing(optimizer, **configs['scheduler_conf']) - else: - raise ValueError("unknown scheduler: " + configs['scheduler']) - - final_epoch = None - configs['rank'] = args.rank - configs['is_distributed'] = distributed - configs['use_amp'] = args.use_amp - if start_epoch == 0 and args.rank == 0: - save_model_path = os.path.join(model_dir, 'init.pt') - save_checkpoint(model, save_model_path) - - # Start training loop - executor.step = step - scheduler.set_step(step) - # used for pytorch amp mixed precision training - scaler = None - if args.use_amp: - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(start_epoch, num_epochs): - train_dataset.set_epoch(epoch) - configs['epoch'] = epoch - lr = optimizer.param_groups[0]['lr'] - logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr)) - executor.train(model, optimizer, scheduler, train_data_loader, device, - writer, configs, scaler) - total_loss, num_seen_utts = executor.cv(model, cv_data_loader, device, - configs) - cv_loss = total_loss / num_seen_utts - - logging.info('Epoch {} CV info cv_loss {}'.format(epoch, cv_loss)) - if args.rank == 0: - save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch)) - save_checkpoint( - model, save_model_path, { - 'epoch': epoch, - 'lr': lr, - 'cv_loss': cv_loss, - 'step': executor.step - }) - writer.add_scalar('epoch/cv_loss', cv_loss, epoch) - writer.add_scalar('epoch/lr', lr, epoch) - final_epoch = epoch - - if final_epoch is not None and args.rank == 0: - final_model_path = os.path.join(model_dir, 'final.pt') 
- os.remove(final_model_path) if os.path.exists(final_model_path) else None - os.symlink('{}.pt'.format(final_epoch), final_model_path) - writer.close() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/dataset/dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/dataset/dataset.py deleted file mode 100644 index 6d799b5b5aea2d34546484b3fed5d45e2d5b6aa6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/dataset/dataset.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import torch -import torch.distributed as dist -from torch.utils.data import IterableDataset - -import wenet.dataset.processor as processor -from wenet.utils.file_utils import read_lists - - -class Processor(IterableDataset): - def __init__(self, source, f, *args, **kw): - assert callable(f) - self.source = source - self.f = f - self.args = args - self.kw = kw - - def set_epoch(self, epoch): - self.source.set_epoch(epoch) - - def __iter__(self): - """ Return an iterator over the source dataset processed by the - given processor. 
- """ - assert self.source is not None - assert callable(self.f) - return self.f(iter(self.source), *self.args, **self.kw) - - def apply(self, f): - assert callable(f) - return Processor(self, f, *self.args, **self.kw) - - -class DistributedSampler: - def __init__(self, shuffle=True, partition=True): - self.epoch = -1 - self.update() - self.shuffle = shuffle - self.partition = partition - - def update(self): - assert dist.is_available() - if dist.is_initialized(): - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() - else: - self.rank = 0 - self.world_size = 1 - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: - self.worker_id = 0 - self.num_workers = 1 - else: - self.worker_id = worker_info.id - self.num_workers = worker_info.num_workers - return dict(rank=self.rank, - world_size=self.world_size, - worker_id=self.worker_id, - num_workers=self.num_workers) - - def set_epoch(self, epoch): - self.epoch = epoch - - def sample(self, data): - """ Sample data according to rank/world_size/num_workers - - Args: - data(List): input data list - - Returns: - List: data list after sample - """ - data = list(range(len(data))) - # TODO(Binbin Zhang): fix this - # We can not handle uneven data for CV on DDP, so we don't - # sample data by rank, that means every GPU gets the same - # and all the CV data - if self.partition: - if self.shuffle: - random.Random(self.epoch).shuffle(data) - data = data[self.rank::self.world_size] - data = data[self.worker_id::self.num_workers] - return data - - -class DataList(IterableDataset): - def __init__(self, lists, shuffle=True, partition=True): - self.lists = lists - self.sampler = DistributedSampler(shuffle, partition) - - def set_epoch(self, epoch): - self.sampler.set_epoch(epoch) - - def __iter__(self): - sampler_info = self.sampler.update() - indexes = self.sampler.sample(self.lists) - for index in indexes: - # yield dict(src=src) - data = dict(src=self.lists[index]) - data.update(sampler_info) - yield data - - -def Dataset(data_type, - data_list_file, - symbol_table, - conf, - bpe_model=None, - non_lang_syms=None, - partition=True): - """ Construct dataset from arguments - - We have two shuffle stage in the Dataset. The first is global - shuffle at shards tar/raw file level. The second is global shuffle - at training samples level. 
- - Args: - data_type(str): raw/shard - bpe_model(str): model for english bpe part - partition(bool): whether to do data partition in terms of rank - """ - assert data_type in ['raw', 'shard'] - lists = read_lists(data_list_file) - shuffle = conf.get('shuffle', True) - dataset = DataList(lists, shuffle=shuffle, partition=partition) - if data_type == 'shard': - dataset = Processor(dataset, processor.url_opener) - dataset = Processor(dataset, processor.tar_file_and_group) - else: - dataset = Processor(dataset, processor.parse_raw) - - dataset = Processor(dataset, processor.tokenize, symbol_table, bpe_model, - non_lang_syms, conf.get('split_with_space', False)) - filter_conf = conf.get('filter_conf', {}) - dataset = Processor(dataset, processor.filter, **filter_conf) - - resample_conf = conf.get('resample_conf', {}) - dataset = Processor(dataset, processor.resample, **resample_conf) - - speed_perturb = conf.get('speed_perturb', False) - if speed_perturb: - dataset = Processor(dataset, processor.speed_perturb) - - feats_type = conf.get('feats_type', 'fbank') - assert feats_type in ['fbank', 'mfcc'] - if feats_type == 'fbank': - fbank_conf = conf.get('fbank_conf', {}) - dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) - elif feats_type == 'mfcc': - mfcc_conf = conf.get('mfcc_conf', {}) - dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf) - - spec_aug = conf.get('spec_aug', True) - spec_sub = conf.get('spec_sub', False) - spec_trim = conf.get('spec_trim', False) - if spec_aug: - spec_aug_conf = conf.get('spec_aug_conf', {}) - dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) - if spec_sub: - spec_sub_conf = conf.get('spec_sub_conf', {}) - dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf) - if spec_trim: - spec_trim_conf = conf.get('spec_trim_conf', {}) - dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf) - - if shuffle: - shuffle_conf = conf.get('shuffle_conf', {}) - dataset = Processor(dataset, processor.shuffle, **shuffle_conf) - - sort = conf.get('sort', True) - if sort: - sort_conf = conf.get('sort_conf', {}) - dataset = Processor(dataset, processor.sort, **sort_conf) - - batch_conf = conf.get('batch_conf', {}) - dataset = Processor(dataset, processor.batch, **batch_conf) - dataset = Processor(dataset, processor.padding) - return dataset diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/dataset/kaldi_io.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/dataset/kaldi_io.py deleted file mode 100644 index c9bef293c93d882147bb5b738e1fc49a7a19a484..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/dataset/kaldi_io.py +++ /dev/null @@ -1,666 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! 
To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. - """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. 
- """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int_scp(file_or_fd): - """ generator(key,vec) = read_vec_int_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_int_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:vec for key,mat in kaldi_io.read_vec_int_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_int(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! 
- if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Mapping for percentiles in col-headers, - def uint16_to_float(value, min, range): - return np.float32(min + range * 1.52590218966964e-05 * value) - - # Mapping for matrix elements, - def uint8_to_float_v2(vec, p0, p25, p75, p100): - # Split the vector by masks, - mask_0_64 = (vec <= 64); - mask_193_255 = (vec > 192); - mask_65_192 = (~(mask_0_64 | mask_193_255)); - # Sanity check (useful but slow...), - # assert(len(vec) == np.sum(np.hstack([mask_0_64,mask_65_192,mask_193_255]))) - # assert(len(vec) == np.sum(np.any([mask_0_64,mask_65_192,mask_193_255], axis=0))) - # Build the float vector, - ans = np.empty(len(vec), dtype='float32') - ans[mask_0_64] = p0 + (p25 - p0) / 64. * vec[mask_0_64] - ans[mask_65_192] = p25 + (p75 - p25) / 128. * (vec[mask_65_192] - 64) - ans[mask_193_255] = p75 + (p100 - p75) / 63. * (vec[mask_193_255] - 192) - return ans - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.empty((cols,rows), dtype='float32') - for i, col_header in enumerate(col_headers): - col_header_flt = [ uint16_to_float(percentile, globmin, globrange) for percentile in col_header ] - mat[i] = uint8_to_float_v2(data[i], *col_header_flt) - - return mat.T # transpose! col-major -> row-major, - -def write_ark_scp(key, mat, ark_fout, scp_out): - mat_offset = write_mat(ark_fout, mat, key) - scp_line = '{}\t{}:{}'.format(key, ark_fout.name, mat_offset) - scp_out.write(scp_line) - scp_out.write('\n') - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. 
- - Example of writing single matrix: - kaldi_io.write_mat(filename, mat) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,mat in dict.iteritems(): - kaldi_io.write_mat(f, mat, key=key) - """ - mat_offset = 0 - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - mat_offset = fd.tell() - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if m.dtype == 'float32': fd.write('FM '.encode()) - elif m.dtype == 'float64': fd.write('DM '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) - # Dims, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols - # Data, - fd.write(m.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - return mat_offset - -################################################# -# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) -# Corresponds to: vector<vector<tuple<int,float> > > -# - outer vector: time axis -# - inner vector: records at the time -# - tuple: int = index, float = value -# - -def read_cnet_ark(file_or_fd): - """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ - return read_post_ark(file_or_fd) - -def read_post_ark(file_or_fd): - """ generator(key,vec<vec<int,float>>) = read_post_ark(file) - Returns generator of (key,posterior) tuples, read from ark file. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Iterate the ark: - for key,post in kaldi_io.read_post_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - post = read_post(fd) - yield key, post - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_post(file_or_fd): - """ [post] = read_post(file_or_fd) - Reads single kaldi 'Posterior' in binary format. - - The 'Posterior' is C++ type 'vector<vector<tuple<int,float> > >', - the outer-vector is usually time axis, inner-vector are the records - at given time, and the tuple is composed of an 'index' (integer) - and a 'float-value'. The 'float-value' can represent a probability - or any other numeric value. - - Returns vector of vectors of tuples. - """ - fd = open_or_fd(file_or_fd) - ans=[] - binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - # Loop over 'outer-vector', - for i in range(outer_vec_size): - assert(fd.read(1).decode() == '\4'); # int-size - inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) - data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) - assert(data[0]['size_idx'] == 4) - assert(data[0]['size_post'] == 4) - ans.append(data[['idx','post']].tolist()) - - if fd is not file_or_fd: fd.close() - return ans - - -################################################# -# Kaldi Confusion Network bin begin/end times, -# (kaldi stores CNs time info separately from the Posterior). 
-# - -def read_cntime_ark(file_or_fd): - """ generator(key,vec<tuple<float,float>>) = read_cntime_ark(file_or_fd) - Returns generator of (key,cntime) tuples, read from ark file. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Iterate the ark: - for key,time in kaldi_io.read_cntime_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:time for key,time in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - cntime = read_cntime(fd) - yield key, cntime - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_cntime(file_or_fd): - """ [cntime] = read_cntime(file_or_fd) - Reads single kaldi 'Confusion Network time info', in binary format: - C++ type: vector<tuple<float,float> >. - (begin/end times of bins at the confusion network). - - Binary layout is '<num-bins> <beg1> <end1> <beg2> <end2> ...' - - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Returns vector of tuples. - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary - - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) - assert(data[0]['size_beg'] == 4) - assert(data[0]['size_end'] == 4) - ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), - - if fd is not file_or_fd : fd.close() - return ans - - -################################################# -# Segments related, -# - -# Segments as 'Bool vectors' can be handy, -# - for 'superposing' the segmentations, -# - for frame-selection in Speaker-ID experiments, -def read_segments_as_bool_vec(segments_file): - """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) - using kaldi 'segments' file for 1 wav, format : '<utt> <rec> <t-beg> <t-end>' - - t-beg, t-end is in seconds, - - assumed 100 frames/second, - """ - segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) - # Sanity checks, - assert(len(segs) > 0) # empty segmentation is an error, - assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, - # Convert time to frame-indexes, - start = np.rint([100 * rec[2] for rec in segs]).astype(int) - end = np.rint([100 * rec[3] for rec in segs]).astype(int) - # Taken from 'read_lab_to_bool_vec', htk.py, - frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], - np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) - assert np.sum(end-start) == np.sum(frms) - return frms - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/dataset/processor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/dataset/processor.py deleted file mode 100644 index b4bd07ce674eb3288cd1b13a09085eec48d40845..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/dataset/processor.py +++ /dev/null @@ -1,660 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import json -import random -import re -import tarfile -from subprocess import PIPE, Popen -from urllib.parse import urlparse - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.nn.utils.rnn import pad_sequence - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def url_opener(data): - """ Give url or local file, return file descriptor - Inplace operation. - - Args: - data(Iterable[str]): url or local file list - - Returns: - Iterable[{src, stream}] - """ - for sample in data: - assert 'src' in sample - # TODO(Binbin Zhang): support HTTP - url = sample['src'] - try: - pr = urlparse(url) - # local file - if pr.scheme == '' or pr.scheme == 'file': - stream = open(url, 'rb') - # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP - else: - cmd = f'wget -q -O - {url}' - process = Popen(cmd, shell=True, stdout=PIPE) - sample.update(process=process) - stream = process.stdout - sample.update(stream=stream) - yield sample - except Exception as ex: - logging.warning('Failed to open {}'.format(url)) - - -def tar_file_and_group(data): - """ Expand a stream of open tar files into a stream of tar file contents. - And groups the file with same prefix - - Args: - data: Iterable[{src, stream}] - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'stream' in sample - stream = tarfile.open(fileobj=sample['stream'], mode="r|*") - prev_prefix = None - example = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - example['key'] = prev_prefix - if valid: - yield example - example = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - example['txt'] = file_obj.read().decode('utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load(file_obj) - example['wav'] = waveform - example['sample_rate'] = sample_rate - else: - example[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning('error to parse {}'.format(name)) - prev_prefix = prefix - if prev_prefix is not None: - example['key'] = prev_prefix - yield example - stream.close() - if 'process' in sample: - sample['process'].communicate() - sample['stream'].close() - - -def parse_raw(data): - """ Parse key/wav/txt from json line - - Args: - data: Iterable[str], str is a json line has key/wav/txt - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'src' in sample - json_line = sample['src'] - obj = json.loads(json_line) - assert 'key' in obj - assert 'wav' in obj - assert 'txt' in obj - key = obj['key'] - wav_file = obj['wav'] - txt = obj['txt'] - try: - if 'start' in obj: - assert 'end' in obj - sample_rate = torchaudio.backend.sox_io_backend.info( - wav_file).sample_rate - start_frame = int(obj['start'] * sample_rate) - end_frame = int(obj['end'] * sample_rate) - waveform, _ = torchaudio.backend.sox_io_backend.load( - 
filepath=wav_file, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(wav_file) - example = dict(key=key, - txt=txt, - wav=waveform, - sample_rate=sample_rate) - yield example - except Exception as ex: - logging.warning('Failed to read {}'.format(wav_file)) - - -def filter(data, - max_length=10240, - min_length=10, - token_max_length=200, - token_min_length=1, - min_output_input_ratio=0.0005, - max_output_input_ratio=1): - """ Filter sample according to feature and label length - Inplace operation. - - Args:: - data: Iterable[{key, wav, label, sample_rate}] - max_length: drop utterance which is greater than max_length(10ms) - min_length: drop utterance which is less than min_length(10ms) - token_max_length: drop utterance which is greater than - token_max_length, especially when use char unit for - english modeling - token_min_length: drop utterance which is - less than token_max_length - min_output_input_ratio: minimal ration of - token_length / feats_length(10ms) - max_output_input_ratio: maximum ration of - token_length / feats_length(10ms) - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'label' in sample - # sample['wav'] is torch.Tensor, we have 100 frames every second - num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100 - if num_frames < min_length: - continue - if num_frames > max_length: - continue - if len(sample['label']) < token_min_length: - continue - if len(sample['label']) > token_max_length: - continue - if num_frames != 0: - if len(sample['label']) / num_frames < min_output_input_ratio: - continue - if len(sample['label']) / num_frames > max_output_input_ratio: - continue - yield sample - - -def resample(data, resample_rate=16000): - """ Resample data. - Inplace operation. - - Args: - data: Iterable[{key, wav, label, sample_rate}] - resample_rate: target resample rate - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - if sample_rate != resample_rate: - sample['sample_rate'] = resample_rate - sample['wav'] = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - yield sample - - -def speed_perturb(data, speeds=None): - """ Apply speed perturb to the data. - Inplace operation. 
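To make the thresholds in `filter()` above concrete: durations are counted in 10 ms frames (`samples / sample_rate * 100`), and an utterance survives only if both its frame count and its token count fall inside the configured bounds, including the token/frame ratio. The snippet below is a hypothetical stand-alone restatement, not repository code; `keep_utterance` is an invented name and the inclusive bounds are a simplification of the strict comparisons above.

```python
# Hedged sketch of the pruning rule applied by the filter() stage.
def keep_utterance(num_samples, sample_rate, num_tokens,
                   max_length=10240, min_length=10,
                   token_max_length=200, token_min_length=1,
                   min_ratio=0.0005, max_ratio=1.0):
    num_frames = num_samples / sample_rate * 100        # duration in 10 ms frames
    if not (min_length <= num_frames <= max_length):
        return False                                    # too short or too long
    if not (token_min_length <= num_tokens <= token_max_length):
        return False                                    # transcript length out of range
    ratio = num_tokens / num_frames                     # labels per feature frame
    return min_ratio <= ratio <= max_ratio

# 5 s of 16 kHz audio with a 12-character transcript passes the defaults
print(keep_utterance(num_samples=80000, sample_rate=16000, num_tokens=12))  # True
```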
- - Args: - data: Iterable[{key, wav, label, sample_rate}] - speeds(List[float]): optional speed - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - if speeds is None: - speeds = [0.9, 1.0, 1.1] - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - speed = random.choice(speeds) - if speed != 1.0: - wav, _ = torchaudio.sox_effects.apply_effects_tensor( - waveform, sample_rate, - [['speed', str(speed)], ['rate', str(sample_rate)]]) - sample['wav'] = wav - - yield sample - - -def compute_fbank(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0): - """ Extract fbank - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - energy_floor=0.0, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def compute_mfcc(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0, - num_ceps=40, - high_freq=0.0, - low_freq=20.0): - """ Extract mfcc - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.mfcc(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - num_ceps=num_ceps, - high_freq=high_freq, - low_freq=low_freq, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def __tokenize_by_bpe_model(sp, txt): - tokens = [] - # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - pattern = re.compile(r'([\u4e00-\u9fff])') - # Example: - # txt = "你好 ITS'S OKAY 的" - # chars = ["你", "好", " ITS'S OKAY ", "的"] - chars = pattern.split(txt.upper()) - mix_chars = [w for w in chars if len(w.strip()) > 0] - for ch_or_w in mix_chars: - # ch_or_w is a single CJK charater(i.e., "你"), do nothing. - if pattern.fullmatch(ch_or_w) is not None: - tokens.append(ch_or_w) - # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), - # encode ch_or_w using bpe_model. 
- else: - for p in sp.encode_as_pieces(ch_or_w): - tokens.append(p) - - return tokens - - -def tokenize(data, - symbol_table, - bpe_model=None, - non_lang_syms=None, - split_with_space=False): - """ Decode text to chars or BPE - Inplace operation - - Args: - data: Iterable[{key, wav, txt, sample_rate}] - - Returns: - Iterable[{key, wav, txt, tokens, label, sample_rate}] - """ - if non_lang_syms is not None: - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - else: - non_lang_syms = {} - non_lang_syms_pattern = None - - if bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(bpe_model) - else: - sp = None - - for sample in data: - assert 'txt' in sample - txt = sample['txt'].strip() - if non_lang_syms_pattern is not None: - parts = non_lang_syms_pattern.split(txt.upper()) - parts = [w for w in parts if len(w.strip()) > 0] - else: - parts = [txt] - - label = [] - tokens = [] - for part in parts: - if part in non_lang_syms: - tokens.append(part) - else: - if bpe_model is not None: - tokens.extend(__tokenize_by_bpe_model(sp, part)) - else: - if split_with_space: - part = part.split(" ") - for ch in part: - if ch == ' ': - ch = "▁" - tokens.append(ch) - - for ch in tokens: - if ch in symbol_table: - label.append(symbol_table[ch]) - elif '' in symbol_table: - label.append(symbol_table['']) - - sample['tokens'] = tokens - sample['label'] = label - yield sample - - -def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80): - """ Do spec augmentation - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - num_t_mask: number of time mask to apply - num_f_mask: number of freq mask to apply - max_t: max width of time mask - max_f: max width of freq mask - max_w: max width of time warp - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - max_freq = y.size(1) - # time mask - for i in range(num_t_mask): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - y[start:end, :] = 0 - # freq mask - for i in range(num_f_mask): - start = random.randint(0, max_freq - 1) - length = random.randint(1, max_f) - end = min(max_freq, start + length) - y[:, start:end] = 0 - sample['feat'] = y - yield sample - - -def spec_sub(data, max_t=20, num_t_sub=3): - """ Do spec substitute - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of time substitute - num_t_sub: number of time substitute to apply - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - for i in range(num_t_sub): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - # only substitute the earlier time chosen randomly for current time - pos = random.randint(0, start) - y[start:end, :] = x[start - pos:end - pos, :] - sample['feat'] = y - yield sample - - -def spec_trim(data, max_t=20): - """ Trim tailing frames. Inplace operation. 
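The CJK-splitting regex in `__tokenize_by_bpe_model()` above does most of the work: each CJK character becomes its own token, while Latin spans are left intact for the BPE model. This tiny stand-alone snippet (not repository code, example text is my own) shows what the split produces for mixed Chinese/English input.

```python
# Demonstrates the capture-group split used before BPE encoding.
import re

pattern = re.compile(r'([\u4e00-\u9fff])')          # CJK Unified Ideographs block
txt = "你好 IT'S OKAY 的"
chars = pattern.split(txt.upper())                   # capture groups keep the CJK chars
mix_chars = [w for w in chars if len(w.strip()) > 0]
print(mix_chars)                                     # ['你', '好', " IT'S OKAY ", '的']
```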
- ref: TrimTail [https://arxiv.org/abs/2211.00522] - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of length trimming - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - max_frames = x.size(0) - length = random.randint(1, max_t) - if length < max_frames / 2: - y = x.clone().detach()[:max_frames - length] - sample['feat'] = y - yield sample - - -def shuffle(data, shuffle_size=10000): - """ Local shuffle the data - - Args: - data: Iterable[{key, feat, label}] - shuffle_size: buffer size for shuffle - - Returns: - Iterable[{key, feat, label}] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= shuffle_size: - random.shuffle(buf) - for x in buf: - yield x - buf = [] - # The sample left over - random.shuffle(buf) - for x in buf: - yield x - - -def sort(data, sort_size=500): - """ Sort the data by feature length. - Sort is used after shuffle and before batch, so we can group - utts with similar lengths into a batch, and `sort_size` should - be less than `shuffle_size` - - Args: - data: Iterable[{key, feat, label}] - sort_size: buffer size for sort - - Returns: - Iterable[{key, feat, label}] - """ - - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= sort_size: - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - buf = [] - # The sample left over - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - - -def static_batch(data, batch_size=16): - """ Static batch the data by `batch_size` - - Args: - data: Iterable[{key, feat, label}] - batch_size: batch size - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= batch_size: - yield buf - buf = [] - if len(buf) > 0: - yield buf - - -def dynamic_batch(data, max_frames_in_batch=12000): - """ Dynamic batch the data until the total frames in batch - reach `max_frames_in_batch` - - Args: - data: Iterable[{key, feat, label}] - max_frames_in_batch: max_frames in one batch - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - longest_frames = 0 - for sample in data: - assert 'feat' in sample - assert isinstance(sample['feat'], torch.Tensor) - new_sample_frames = sample['feat'].size(0) - longest_frames = max(longest_frames, new_sample_frames) - frames_after_padding = longest_frames * (len(buf) + 1) - if frames_after_padding > max_frames_in_batch: - yield buf - buf = [sample] - longest_frames = new_sample_frames - else: - buf.append(sample) - if len(buf) > 0: - yield buf - - -def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000): - """ Wrapper for static/dynamic batch - """ - if batch_type == 'static': - return static_batch(data, batch_size) - elif batch_type == 'dynamic': - return dynamic_batch(data, max_frames_in_batch) - else: - logging.fatal('Unsupported batch type {}'.format(batch_type)) - - -def padding(data): - """ Padding the data into training data - - Args: - data: Iterable[List[{key, feat, label}]] - - Returns: - Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] - """ - for sample in data: - assert isinstance(sample, list) - feats_length = torch.tensor([x['feat'].size(0) for x in sample], - dtype=torch.int32) - order = torch.argsort(feats_length, descending=True) - feats_lengths = torch.tensor( - [sample[i]['feat'].size(0) for i in order], dtype=torch.int32) - sorted_feats = [sample[i]['feat'] for i in order] - sorted_keys 
= [sample[i]['key'] for i in order] - sorted_labels = [ - torch.tensor(sample[i]['label'], dtype=torch.int64) for i in order - ] - label_lengths = torch.tensor([x.size(0) for x in sorted_labels], - dtype=torch.int32) - - padded_feats = pad_sequence(sorted_feats, - batch_first=True, - padding_value=0) - - pad = (0, 0, 0, 0) - seq_len= padded_feats.shape[1] - if seq_len < 384: - pad = (0, 0, 0, 384-seq_len) - elif seq_len < 512: - pad = (0, 0, 0, 512-seq_len) - elif seq_len < 640: - pad = (0, 0, 0, 640-seq_len) - elif seq_len < 768: - pad = (0, 0, 0, 768-seq_len) - elif seq_len < 896: - pad = (0, 0, 0, 896-seq_len) - elif seq_len < 1024: - pad = (0, 0, 0, 1024-seq_len) - elif seq_len < 1280: - pad = (0, 0, 0, 1280-seq_len) - padded_feats = torch.nn.functional.pad(padded_feats, pad, mode='constant', value=0) - padding_labels = pad_sequence(sorted_labels, - batch_first=True, - padding_value=-1) - - yield (sorted_keys, padded_feats, padding_labels, feats_lengths, - label_lengths) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/dataset/wav_distortion.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/dataset/wav_distortion.py deleted file mode 100644 index 2917d3cc6cfb801935cb0885d0c42cd88f1833b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/dataset/wav_distortion.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import random -import math - -import torchaudio -import torch -torchaudio.set_audio_backend("sox_io") - - -def db2amp(db): - return pow(10, db / 20) - -def amp2db(amp): - return 20 * math.log10(amp) - -def make_poly_distortion(conf): - """Generate a db-domain ploynomial distortion function - - f(x) = a * x^m * (1-x)^n + x - - Args: - conf: a dict {'a': #int, 'm': #int, 'n': #int} - - Returns: - The ploynomial function, which could be applied on - a float amplitude value - """ - a = conf['a'] - m = conf['m'] - n = conf['n'] - - def poly_distortion(x): - abs_x = abs(x) - if abs_x < 0.000001: - x = x - else: - db_norm = amp2db(abs_x) / 100 + 1 - if db_norm < 0: - db_norm = 0 - db_norm = a * pow(db_norm, m) * pow((1 - db_norm), n) + db_norm - if db_norm > 1: - db_norm = 1 - db = (db_norm - 1) * 100 - amp = db2amp(db) - if amp >= 0.9997: - amp = 0.9997 - if x > 0: - x = amp - else: - x = -amp - return x - return poly_distortion - -def make_quad_distortion(): - return make_poly_distortion({'a' : 1, 'm' : 1, 'n' : 1}) - -# the amplitude are set to max for all non-zero point -def make_max_distortion(conf): - """Generate a max distortion function - - Args: - conf: a dict {'max_db': float } - 'max_db': the maxium value. 
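Stepping back to the `padding()` stage shown just before this file: it rounds each batch's time dimension up to one of a few fixed lengths (384, 512, ..., 1280), presumably so the statically shaped encoder only ever sees a handful of input shapes. The sketch below is an illustrative reimplementation, not repository code; the bucket list is read off the if/elif chain above and `pad_to_bucket` is an invented name.

```python
# Hedged sketch of the sequence-length bucketing done at the end of padding().
import torch
from torch.nn.utils.rnn import pad_sequence

BUCKETS = [384, 512, 640, 768, 896, 1024, 1280]

def pad_to_bucket(feats):
    """feats: list of (T_i, feat_dim) tensors belonging to one batch."""
    padded = pad_sequence(feats, batch_first=True, padding_value=0)   # (B, T_max, D)
    t_max = padded.shape[1]
    target = next((b for b in BUCKETS if t_max <= b), t_max)          # no padding past the last bucket
    return torch.nn.functional.pad(padded, (0, 0, 0, target - t_max)) # pad only the time axis

batch = [torch.randn(t, 80) for t in (312, 371, 298)]
out = pad_to_bucket(batch)
assert out.shape == (3, 384, 80)
```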
- - Returns: - The max function, which could be applied on - a float amplitude value - """ - max_db = conf['max_db'] - if max_db: - max_amp = db2amp(max_db) # < 0.997 - else: - max_amp = 0.997 - - def max_distortion(x): - if x > 0: - x = max_amp - elif x < 0: - x = -max_amp - else: - x = 0.0 - return x - return max_distortion - - - -def make_amp_mask(db_mask=None): - """Get a amplitude domain mask from db domain mask - - Args: - db_mask: Optional. A list of tuple. if None, using default value. - - Returns: - A list of tuple. The amplitude domain mask - """ - if db_mask is None: - db_mask = [(-110, -95), (-90, -80), (-65, -60), (-50, -30), (-15, 0)] - amp_mask = [(db2amp(db[0]), db2amp(db[1])) for db in db_mask] - return amp_mask - -default_mask = make_amp_mask() - - -def generate_amp_mask(mask_num): - """Generate amplitude domain mask randomly in [-100db, 0db] - - Args: - mask_num: the slot number of the mask - - Returns: - A list of tuple. each tuple defines a slot. - e.g. [(-100, -80), (-65, -60), (-50, -30), (-15, 0)] - for #mask_num = 4 - """ - a = [0] * 2 * mask_num - a[0] = 0 - m = [] - for i in range(1, 2 * mask_num): - a[i] = a[i - 1] + random.uniform(0.5, 1) - max_val = a[2 * mask_num - 1] - for i in range(0, mask_num): - l = ((a[2 * i] - max_val) / max_val) * 100 - r = ((a[2 * i + 1] - max_val) / max_val) * 100 - m.append((l, r)) - return make_amp_mask(m) - - -def make_fence_distortion(conf): - """Generate a fence distortion function - - In this fence-like shape function, the values in mask slots are - set to maxium, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': int,'max_db': float } - 'mask_number': the slot number in mask. - 'max_db': the maxium value. - - Returns: - The fence function, which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - max_db = conf['max_db'] - max_amp = db2amp(max_db) # 0.997 - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def fence_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - return x - - return fence_distortion - -# -def make_jag_distortion(conf): - """Generate a jag distortion function - - In this jag-like shape function, the values in mask slots are - not changed, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': #int} - 'mask_number': the slot number in mask. 
- - Returns: - The jag function,which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def jag_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - return x - - return jag_distortion - -# gaining 20db means amp = amp * 10 -# gaining -20db means amp = amp / 10 -def make_gain_db(conf): - """Generate a db domain gain function - - Args: - conf: a dict {'db': #float} - 'db': the gaining value - - Returns: - The db gain function, which could be applied on - a float amplitude value - """ - db = conf['db'] - - def gain_db(x): - return min(0.997, x * pow(10, db / 20)) - - return gain_db - - -def distort(x, func, rate=0.8): - """Distort a waveform in sample point level - - Args: - x: the origin wavefrom - func: the distort function - rate: sample point-level distort probability - - Returns: - the distorted waveform - """ - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - x[0][i] = func(float(x[0][i])) - return x - -def distort_chain(x, funcs, rate=0.8): - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - for func in funcs: - x[0][i] = func(float(x[0][i])) - return x - -# x is numpy -def distort_wav_conf(x, distort_type, distort_conf, rate=0.1): - if distort_type == 'gain_db': - gain_db = make_gain_db(distort_conf) - x = distort(x, gain_db) - elif distort_type == 'max_distortion': - max_distortion = make_max_distortion(distort_conf) - x = distort(x, max_distortion, rate=rate) - elif distort_type == 'fence_distortion': - fence_distortion = make_fence_distortion(distort_conf) - x = distort(x, fence_distortion, rate=rate) - elif distort_type == 'jag_distortion': - jag_distortion = make_jag_distortion(distort_conf) - x = distort(x, jag_distortion, rate=rate) - elif distort_type == 'poly_distortion': - poly_distortion = make_poly_distortion(distort_conf) - x = distort(x, poly_distortion, rate=rate) - elif distort_type == 'quad_distortion': - quad_distortion = make_quad_distortion() - x = distort(x, quad_distortion, rate=rate) - elif distort_type == 'none_distortion': - pass - else: - print('unsupport type') - return x - -def distort_wav_conf_and_save(distort_type, distort_conf, rate, wav_in, wav_out): - x, sr = torchaudio.load(wav_in) - x = x.detach().numpy() - out = distort_wav_conf(x, distort_type, distort_conf, rate) - torchaudio.save(wav_out, torch.from_numpy(out), sr) - -if __name__ == "__main__": - distort_type = sys.argv[1] - wav_in = sys.argv[2] - wav_out = sys.argv[3] - conf = None - rate = 0.1 - if distort_type == 'new_jag_distortion': - conf = {'mask_number' : 4} - elif distort_type == 'new_fence_distortion': - conf = {'mask_number' : 1, 'max_db' : -30} - elif distort_type == 'poly_distortion': - conf = {'a' : 4, 'm' : 2, "n" : 2} - distort_wav_conf_and_save(distort_type, conf, rate, wav_in, wav_out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/efficient_conformer/attention.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/efficient_conformer/attention.py deleted file mode 100644 index 475131b15af92ffcaf91ad5e2e30d114d4d1a2a3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/efficient_conformer/attention.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple, Optional - -import torch -from torch import nn -import torch.nn.functional as F -from wenet.transformer.attention import MultiHeadedAttention - - -class GroupedRelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: - https://arxiv.org/abs/1901.02860 - https://arxiv.org/abs/2109.01163 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate, group_size=3): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - self.group_size = group_size - self.d_k = n_feat // n_head # for GroupedAttention - self.n_feat = n_feat - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. 
- """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def pad4group(self, Q, K, V, P, mask, group_size: int = 3): - """ - q: (#batch, time1, size) -> (#batch, head, time1, size/head) - k,v: (#batch, time2, size) -> (#batch, head, time2, size/head) - p: (#batch, time2, size) - """ - # Compute Overflows - overflow_Q = Q.size(2) % group_size - overflow_KV = K.size(2) % group_size - - # if-else for ONNX export - # 0 // 0.00000000000000001 = 0 - # 1 // 1.00000000000000001 = 1 - padding_Q = (group_size - overflow_Q) * int( - overflow_Q // (overflow_Q + 0.00000000000000001)) - padding_KV = (group_size - overflow_KV) * int( - overflow_KV // (overflow_KV + 0.00000000000000001)) - - batch_size, _, seq_len_KV, _ = K.size() - - # Input Padding (B, T, D) -> (B, T + P, D) - Q = F.pad(Q, (0, 0, 0, padding_Q), value=0.0) - K = F.pad(K, (0, 0, 0, padding_KV), value=0.0) - V = F.pad(V, (0, 0, 0, padding_KV), value=0.0) - - if mask is not None and mask.size(2) > 0 : # time2 > 0: - mask = mask[:, ::group_size, ::group_size] - - Q = Q.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - K = K.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - V = V.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - # process pos_emb - P_batch_size = P.size(0) - overflow_P = P.size(1) % group_size - padding_P = group_size - overflow_P if overflow_P else 0 - P = F.pad(P, (0, 0, 0, padding_P), value=0.0) - P = P.view(P_batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - return Q, K, V, P, mask, padding_Q - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - padding_q: Optional[int] = None - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - padding_q : for GroupedAttention in efficent conformer - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. 
jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - - # n_feat!=h*d_k may be happened in GroupAttention - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.n_feat) - ) # (batch, time1, d_model) - if padding_q is not None: - # for GroupedAttention in efficent conformer - x = x[:, :x.size(1) - padding_q] - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q = self.linear_q(query) - k = self.linear_k(key) # (#batch, time2, size) - v = self.linear_v(value) - p = self.linear_pos(pos_emb) # (#batch, time2, size) - - batch_size, seq_len_KV, _ = k.size() # seq_len_KV = time2 - - # (#batch, time2, size) -> (#batch, head, time2, size/head) - q = q.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - k = k.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - v = v.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - if cache.size(0) > 0: - # use attention cache - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - new_cache = torch.cat((k, v), dim=-1) - - # May be k and p does not match. eg. time2=18+18/2=27 > mask=36/2=18 - if mask is not None and mask.size(2) > 0: - time2 = mask.size(2) - k = k[:, :, -time2:, :] - v = v[:, :, -time2:, :] - - # q k v p: (batch, head, time1, d_k) - q, k, v, p, mask, padding_q = self.pad4group(q, k, v, p, mask, self.group_size) - - # q_with_bias_u & q_with_bias_v = (batch, head, time1, d_k) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. 
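One detail of `pad4group()` above that is easy to misread is the `overflow // (overflow + 0.00000000000000001)` expression: it turns "pad up to a multiple of group_size, but pad nothing when already aligned" into branch-free arithmetic that survives ONNX export. A stand-alone sketch of just that arithmetic (invented function name, not repository code):

```python
# int(overflow // (overflow + eps)) is 0 when overflow == 0, otherwise 1.
def group_padding(time_steps: int, group_size: int = 3) -> int:
    overflow = time_steps % group_size
    return (group_size - overflow) * int(overflow // (overflow + 0.00000000000000001))

for t in (96, 97, 98, 99):
    print(t, "->", t + group_padding(t))   # 96 -> 96, 97 -> 99, 98 -> 99, 99 -> 99
```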
- # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k * self.group_size) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask, padding_q), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/efficient_conformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/efficient_conformer/convolution.py deleted file mode 100644 index 52d6c1c14c0812ab3957a60a135f644833c2ad95..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/efficient_conformer/convolution.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - stride: int = 1): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - stride (int): Stride Convolution, for efficient Conformer - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. 
- # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=stride, # for depthwise_conv in StrideConv - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - self.stride = stride - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - # When export ONNX,the first cache is not None but all-zero, - # cause shape error in residual block, - # eg. cache14 + x9 = 23, 23-7+1=17 != 9 - cache = cache[:, :, -self.lorder:] - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is requried, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - if mask_pad.size(2) != x.size(2): - mask_pad = mask_pad[:, :, ::self.stride] - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/efficient_conformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/efficient_conformer/encoder.py deleted file mode 100644 index dbd37f53cac86be851e2bb194354fd07eb271f11..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/efficient_conformer/encoder.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
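For the `ConvolutionModule` above, the causal branch's `lorder = kernel_size - 1` left padding is what keeps the depthwise convolution from seeing future frames while still preserving the sequence length. A minimal stand-alone sketch (not repository code; the sizes are arbitrary assumptions):

```python
# Left-only padding keeps the depthwise conv strictly causal and length-preserving.
import torch
import torch.nn as nn

channels, kernel_size, time = 4, 15, 100
lorder = kernel_size - 1
depthwise = nn.Conv1d(channels, channels, kernel_size, padding=0, groups=channels)

x = torch.randn(1, channels, time)            # (batch, channels, time)
x_padded = nn.functional.pad(x, (lorder, 0))  # pad lorder frames on the left only
y = depthwise(x_padded)
assert y.shape[-1] == time                    # output length equals input length
```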
-# Modified from EfficientConformer(https://github.com/burchim/EfficientConformer) -# Paper(https://arxiv.org/abs/2109.01163) - -"""Encoder definition.""" -from typing import Tuple, Optional, List, Union - -import torch -import logging -from typeguard import check_argument_types -import torch.nn.functional as F - -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.encoder_layer import ConformerEncoderLayer - -from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 -from wenet.efficient_conformer.convolution import ConvolutionModule -from wenet.efficient_conformer.attention import GroupedRelPositionMultiHeadedAttention -from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer - -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class EfficientConformerEncoder(torch.nn.Module): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - macaron_style: bool = True, - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - stride_layer_idx: Optional[Union[int, List[int]]] = 3, - stride: Optional[Union[int, List[int]]] = 2, - group_layer_idx: Optional[Union[int, List[int], tuple]] = (0, 1, 2, 3), - group_size: int = 3, - stride_kernel: bool = True, - **kwargs - ): - """Construct Efficient Conformer Encoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - macaron_style (bool): Whether to use macaron style for - positionwise layer. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. - stride_layer_idx (list): layer id with StrideConv, start from 0 - stride (list): stride size of each StrideConv in efficient conformer - group_layer_idx (list): layer id with GroupedAttention, start from 0 - group_size (int): group size of every GroupedAttention layer - stride_kernel (bool): default True. True: recompute cnn kernels with stride. 
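Two pieces of bookkeeping described in the Args above are worth spelling out: how the depthwise kernel size shrinks at each stride layer when `stride_kernel=True`, and how the cumulative downsampling factor follows from `stride_layer_idx` and `stride`. The following is a stand-alone sketch with invented helper names that mirrors the logic in `__init__` and `calculate_downsampling_factor` later in this class; it is not the repository code itself.

```python
# Hedged sketch of the stride/kernel bookkeeping in the efficient Conformer encoder.
def plan_kernels(cnn_module_kernel, stride_layer_idx, stride, stride_kernel=True):
    kernels = [cnn_module_kernel]
    for s in stride:                                   # one extra entry per StrideConv block
        kernels.append(kernels[-1] // s if stride_kernel else kernels[-1])
    return kernels

def downsampling_factor(layer_idx, stride_layer_idx, stride):
    factor = 1
    for idx, s_idx in enumerate(stride_layer_idx):     # strides accumulate for layers after each StrideConv
        if layer_idx > s_idx:
            factor *= stride[idx]
    return factor

# Example config: StrideConv at block 3 with stride 2, cnn_module_kernel 15
print(plan_kernels(15, [3], [2]))                              # [15, 7]
print([downsampling_factor(i, [3], [2]) for i in range(6)])    # [1, 1, 1, 1, 2, 2]
```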
- """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d2": - subsampling_class = Conv2dSubsampling2 - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - logging.info(f"input_layer = {input_layer}, " - f"subsampling_class = {subsampling_class}") - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - self.input_layer = input_layer - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - activation = get_activation(activation_type) - self.num_blocks = num_blocks - self.attention_heads = attention_heads - self.cnn_module_kernel = cnn_module_kernel - self.global_chunk_size = 0 - self.chunk_feature_map = 0 - - # efficient conformer configs - self.stride_layer_idx = [stride_layer_idx] \ - if type(stride_layer_idx) == int else stride_layer_idx - self.stride = [stride] \ - if type(stride) == int else stride - self.group_layer_idx = [group_layer_idx] \ - if type(group_layer_idx) == int else group_layer_idx - self.grouped_size = group_size # group size of every GroupedAttention layer - - assert len(self.stride) == len(self.stride_layer_idx) - self.cnn_module_kernels = [cnn_module_kernel] # kernel size of each StridedConv - for i in self.stride: - if stride_kernel: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1] // i) - else: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1]) - - logging.info(f"stride_layer_idx= {self.stride_layer_idx}, " - f"stride = {self.stride}, " - f"cnn_module_kernel = {self.cnn_module_kernels}, " - f"group_layer_idx = {self.group_layer_idx}, " - f"grouped_size = {self.grouped_size}") - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - - # encoder definition - index = 0 - layers = [] - for i in range(num_blocks): - # self-attention module definition - if i in self.group_layer_idx: - encoder_selfattn_layer = GroupedRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - self.grouped_size) - else: - if pos_enc_layer_type == "no_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate) - - # conformer module definition - if i in self.stride_layer_idx: - # conformer block with downsampling - convolution_layer_args_stride = ( - output_size, 
self.cnn_module_kernels[index], activation, - cnn_module_norm, causal, True, self.stride[index]) - layers.append(StrideConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_stride) if use_cnn_module else None, - torch.nn.AvgPool1d( - kernel_size=self.stride[index], stride=self.stride[index], - padding=0, ceil_mode=True, - count_include_pad=False), # pointwise_conv_layer - dropout_rate, - normalize_before, - concat_after, - )) - index = index + 1 - else: - # conformer block - convolution_layer_args_normal = ( - output_size, self.cnn_module_kernels[index], activation, - cnn_module_norm, causal) - layers.append(ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_normal) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - )) - - self.encoders = torch.nn.ModuleList(layers) - - def set_global_chunk_size(self, chunk_size): - """Used in ONNX export. - """ - logging.info(f"set global chunk size: {chunk_size}, default is 0.") - self.global_chunk_size = chunk_size - if self.embed.subsampling_rate == 2: - self.chunk_feature_map = 2 * self.global_chunk_size + 1 - elif self.embed.subsampling_rate == 6: - self.chunk_feature_map = 6 * self.global_chunk_size + 5 - elif self.embed.subsampling_rate == 8: - self.chunk_feature_map = 8 * self.global_chunk_size + 7 - else: - self.chunk_feature_map = 4 * self.global_chunk_size + 3 - - def output_size(self) -> int: - return self._output_size - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - index = 0 # traverse stride - for i, layer in enumerate(self.encoders): - # layer return : x, mask, new_att_cache, new_cnn_cache - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if i in self.stride_layer_idx: - masks = masks[:, :, ::self.stride[index]] - chunk_masks = chunk_masks[:, ::self.stride[index], - ::self.stride[index]] - mask_pad = masks - pos_emb = pos_emb[:, ::self.stride[index], :] - index = index + 1 - - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask : mask matrix of self attention - - Returns: - torch.Tensor: output of current input xs - torch.Tensor: subsampling cache required for next chunk computation - List[torch.Tensor]: encoder layers output cache required for next - chunk computation - List[torch.Tensor]: conformer cnn cache - - """ - assert xs.size(0) == 1 - - # using downsampling factor to recover offset - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - chunk_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - chunk_masks = chunk_masks.unsqueeze(1) # (1, 1, xs-time) - - real_len = 0 - if self.global_chunk_size > 0: - # for ONNX decode simulation, padding xs to chunk_size - real_len = xs.size(1) - pad_len = self.chunk_feature_map - real_len - xs = F.pad(xs, (0, 0, 0, pad_len), value=0.0) - chunk_masks = F.pad(chunk_masks, (0, pad_len), value=0.0) - - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, chunk_masks = self.embed(xs, chunk_masks, offset) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - # shape(pos_emb) = (b=1, chunk_size, emb_size=output_size=hidden-dim) - - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) # batchPad (b=1, 1, time=chunk_size) - - if self.global_chunk_size > 0: - # for ONNX decode simulation - pos_emb = self.embed.position_encoding( - offset=max(offset - cache_t1, 0), - size=cache_t1 + self.global_chunk_size) - att_mask[:, :, -self.global_chunk_size:] = chunk_masks - mask_pad = chunk_masks.to(torch.bool) - else: - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(2) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - att_cache_trunc = xs.size(1) + \ - att_cache.size(2) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i:i + 1, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # 
shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [1, batch, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(0) - - # use repeat_interleave to new_att_cache - new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :]) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.global_chunk_size > 0 and real_len: - chunk_real_len = real_len // self.embed.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - # Keeping 1 more timestep can mitigate information leakage - # from the encoder caused by the padding - xs = xs[:, :chunk_real_len + 1, :] - - return xs, r_att_cache, r_cnn_cache - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - use_onnx=False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. - Args: - xs (torch.Tensor): (1, max_len, dim) - decoding_chunk_size (int): decoding chunk size - num_decoding_left_chunks (int): - use_onnx (bool): True for simulating ONNX model inference. 
- """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if use_onnx: - logging.info("Simulating for ONNX runtime ...") - att_cache: torch.Tensor = torch.zeros( - (self.num_blocks, self.attention_heads, required_cache_size, - self.output_size() // self.attention_heads * 2), - device=xs.device) - cnn_cache: torch.Tensor = torch.zeros( - (self.num_blocks, 1, self.output_size(), self.cnn_module_kernel - 1), - device=xs.device) - self.set_global_chunk_size(chunk_size=decoding_chunk_size) - else: - logging.info("Simulating for JIT runtime ...") - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - logging.info(f"-->> frame chunk msg: cur={cur}, " - f"end={end}, num_frames={end-cur}, " - f"decoding_window={decoding_window}") - if use_onnx: - att_mask: torch.Tensor = torch.ones( - (1, 1, required_cache_size + decoding_chunk_size), - dtype=torch.bool, device=xs.device) - if cur == 0: - att_mask[:, :, :required_cache_size] = 0 - else: - att_mask: torch.Tensor = torch.ones( - (0, 0, 0), dtype=torch.bool, device=xs.device) - - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - outputs.append(y) - offset += y.size(1) - - ys = torch.cat(outputs, 1) - masks = torch.ones(1, 1, ys.size(1), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/efficient_conformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/efficient_conformer/encoder_layer.py deleted file mode 100644 index 3a88ec9fca9797664ce89566e6c1d28a8f0ad5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/efficient_conformer/encoder_layer.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple -import torch -from torch import nn - - -class StrideConformerEncoderLayer(nn.Module): - """Encoder layer module. 
- Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - pointwise_conv_layer: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.pointwise_conv_layer = pointwise_conv_layer - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - - # add pointwise_conv for efficient conformer - # pointwise_conv_layer does not change shape - if self.pointwise_conv_layer is not None: - residual = residual.transpose(1, 2) - residual = self.pointwise_conv_layer(residual) - residual = residual.transpose(1, 2) - assert residual.size(0) == x.size(0) - assert residual.size(1) == x.size(1) - assert residual.size(2) == x.size(2) - - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/efficient_conformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/efficient_conformer/subsampling.py deleted file mode 100644 index 98b2c2228eac8e77586110686c48a7b0141458c9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/efficient_conformer/subsampling.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch -from wenet.transformer.subsampling import BaseSubsampling - - -class Conv2dSubsampling2(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU() - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * ((idim - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 2 - # 2 = (3 - 1) * 1 - self.right_context = 2 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 2. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 2. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, :-2:2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/squeezeformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/squeezeformer/attention.py deleted file mode 100644 index 97412badbe8e2c5caec81c0636d15be3f80d6b84..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/squeezeformer/attention.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 Ximalaya Inc. (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -import torch -import torch.nn as nn -from wenet.transformer.attention import MultiHeadedAttention -from typing import Tuple - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- """ - - def __init__(self, n_head, n_feat, dropout_rate, - do_rel_shift=False, adaptive_scale=False, init_weights=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.do_rel_shift = do_rel_shift - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - self.adaptive_scale = adaptive_scale - self.ada_scale = nn.Parameter( - torch.ones([1, 1, n_feat]), requires_grad=adaptive_scale) - self.ada_bias = nn.Parameter( - torch.zeros([1, 1, n_feat]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - input_max = (self.h * self.d_k) ** -0.5 - torch.nn.init.uniform_(self.linear_q.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_q.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_pos.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.bias, -input_max, input_max) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. 
pytorch training - if mask.size(2) > 0: # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - # (batch, head, time1, time2) - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - if self.adaptive_scale: - query = self.ada_scale * query + self.ada_bias - key = self.ada_scale * key + self.ada_bias - value = self.ada_scale * value + self.ada_bias - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - if self.do_rel_shift: - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/squeezeformer/conv2d.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/squeezeformer/conv2d.py deleted file mode 100644 index c230263396392d72f36c56d645338f2d576db898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/squeezeformer/conv2d.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Conv2d Module with Valid Padding""" - -import torch.nn.functional as F -from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional - - -class Conv2dValid(_ConvNd): - """ - Conv2d operator for VALID mode padding. 
- """ - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', # TODO: refine this type - device=None, - dtype=None, - valid_trigx: bool = False, - valid_trigy: bool = False - ) -> None: - factory_kwargs = {'device': device, 'dtype': dtype} - kernel_size_ = _pair(kernel_size) - stride_ = _pair(stride) - padding_ = padding if isinstance(padding, str) else _pair(padding) - dilation_ = _pair(dilation) - super(Conv2dValid, self).__init__( - in_channels, out_channels, kernel_size_, - stride_, padding_, dilation_, False, _pair(0), - groups, bias, padding_mode, **factory_kwargs) - self.valid_trigx = valid_trigx - self.valid_trigy = valid_trigy - - def _conv_forward( - self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): - validx, validy = 0, 0 - if self.valid_trigx: - validx = (input.size(-2) * (self.stride[-2] - 1) - 1 - + self.kernel_size[-2]) // 2 - if self.valid_trigy: - validy = (input.size(-1) * (self.stride[-1] - 1) - 1 - + self.kernel_size[-1]) // 2 - return F.conv2d(input, weight, bias, self.stride, - (validx, validy), self.dilation, self.groups) - - def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight, self.bias) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/squeezeformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/squeezeformer/convolution.py deleted file mode 100644 index 6da2ee8c98ed58fae66d66c892041037f0d6bc3a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/squeezeformer/convolution.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. 
- causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - self.bias = bias - self.channels = channels - self.kernel_size = kernel_size - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, channels]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, channels]), requires_grad=adaptive_scale) - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - if init_weights: - self.init_weights() - - def init_weights(self): - pw_max = self.channels ** -0.5 - dw_max = self.kernel_size ** -0.5 - torch.nn.init.uniform_(self.pointwise_conv1.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv1.bias.data, -pw_max, pw_max) - torch.nn.init.uniform_(self.depthwise_conv.weight.data, -dw_max, dw_max) - if self.bias: - torch.nn.init.uniform_(self.depthwise_conv.bias.data, -dw_max, dw_max) - torch.nn.init.uniform_(self.pointwise_conv2.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv2.bias.data, -pw_max, pw_max) - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - if self.adaptive_scale: - x = self.ada_scale * x + self.ada_bias - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/squeezeformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/squeezeformer/encoder.py deleted file mode 100644 index f13038321ae6c07d484a617aee7d83ed07742510..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/squeezeformer/encoder.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -import torch -import torch.nn as nn -from typing import Tuple, Union, Optional, List -from wenet.squeezeformer.subsampling \ - import DepthwiseConv2dSubsampling4, TimeReductionLayer1D, \ - TimeReductionLayer2D, TimeReductionLayerStream -from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.attention import MultiHeadedAttention -from wenet.squeezeformer.attention import RelPositionMultiHeadedAttention -from wenet.squeezeformer.positionwise_feed_forward \ - import PositionwiseFeedForward -from wenet.squeezeformer.convolution import ConvolutionModule -from wenet.utils.mask import make_pad_mask, add_optional_chunk_mask -from wenet.utils.common import get_activation - - -class SqueezeformerEncoder(nn.Module): - def __init__( - self, - input_size: int = 80, - encoder_dim: int = 256, - output_size: int = 256, - attention_heads: int = 4, - num_blocks: int = 12, - reduce_idx: Optional[Union[int, List[int]]] = 5, - recover_idx: Optional[Union[int, List[int]]] = 11, - feed_forward_expansion_factor: int = 4, - dw_stride: bool = False, - input_dropout_rate: float = 0.1, - pos_enc_layer_type: str = "rel_pos", - time_reduction_layer_type: str = "conv1d", - do_rel_shift: bool = True, - feed_forward_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.1, - cnn_module_kernel: int = 31, - cnn_norm_type: str = "batch_norm", - dropout: float = 0.1, - causal: bool = False, - adaptive_scale: bool = True, - activation_type: str = "swish", - init_weights: bool = True, - global_cmvn: torch.nn.Module = None, - normalize_before: bool = False, - use_dynamic_chunk: bool = False, - concat_after: 
bool = False, - static_chunk_size: int = 0, - use_dynamic_left_chunk: bool = False - ): - """Construct SqueezeformerEncoder - - Args: - input_size to use_dynamic_chunk, see in Transformer BaseEncoder. - encoder_dim (int): The hidden dimension of encoder layer. - output_size (int): The output dimension of final projection layer. - attention_heads (int): Num of attention head in attention module. - num_blocks (int): Num of encoder layers. - reduce_idx Optional[Union[int, List[int]]]: - reduce layer index, from 40ms to 80ms per frame. - recover_idx Optional[Union[int, List[int]]]: - recover layer index, from 80ms to 40ms per frame. - feed_forward_expansion_factor (int): Enlarge coefficient of FFN. - dw_stride (bool): Whether do depthwise convolution - on subsampling module. - input_dropout_rate (float): Dropout rate of input projection layer. - pos_enc_layer_type (str): Self attention type. - time_reduction_layer_type (str): Conv1d or Conv2d reduction layer. - do_rel_shift (bool): Whether to do relative shift - operation on rel-attention module. - cnn_module_kernel (int): Kernel size of CNN module. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - adaptive_scale (bool): Whether to use adaptive scale. - init_weights (bool): Whether to initialize weights. - causal (bool): whether to use causal convolution or not. - """ - super(SqueezeformerEncoder, self).__init__() - self.global_cmvn = global_cmvn - self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ - if type(reduce_idx) == int else reduce_idx - self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \ - if type(recover_idx) == int else recover_idx - self.check_ascending_list() - if reduce_idx is None: - self.time_reduce = None - else: - if recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - self.reduce_stride = 2 - self._output_size = output_size - self.normalize_before = normalize_before - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - self.pos_enc_layer_type = pos_enc_layer_type - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - encoder_dim, - attention_dropout_rate, - do_rel_shift, - adaptive_scale, - init_weights - ) - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - encoder_dim, - encoder_dim * feed_forward_expansion_factor, - feed_forward_dropout_rate, - activation, - adaptive_scale, - init_weights - ) - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = ( - encoder_dim, cnn_module_kernel, activation, - cnn_norm_type, causal, True, adaptive_scale, init_weights) - - self.embed = DepthwiseConv2dSubsampling4( - 1, encoder_dim, - RelPositionalEncoding(encoder_dim, dropout_rate=0.1), - dw_stride, - input_size, - input_dropout_rate, - init_weights - ) - - self.preln = nn.LayerNorm(encoder_dim) - 
self.encoders = torch.nn.ModuleList([SqueezeformerEncoderLayer( - encoder_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - convolution_layer(*convolution_layer_args), - positionwise_layer(*positionwise_layer_args), - normalize_before, - dropout, - concat_after) for _ in range(num_blocks) - ]) - if time_reduction_layer_type == 'conv1d': - time_reduction_layer = TimeReductionLayer1D - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - elif time_reduction_layer_type == 'stream': - time_reduction_layer = TimeReductionLayerStream - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - else: - time_reduction_layer = TimeReductionLayer2D - time_reduction_layer_args = {'encoder_dim': encoder_dim} - - self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args) - self.time_recover_layer = nn.Linear(encoder_dim, encoder_dim) - self.final_proj = None - if output_size != encoder_dim: - self.final_proj = nn.Linear(encoder_dim, output_size) - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - xs_lens = mask_pad.squeeze(1).sum(1) - xs = self.preln(xs) - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - for i, layer in enumerate(self.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, chunk_masks, pos_emb, mask_pad)) - xs, xs_lens, chunk_masks, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_chunk_masks, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - chunk_masks = recover_chunk_masks - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - xs = xs.masked_fill(~mask_pad[:, 0, :].unsqueeze(-1), 0.0) - - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return xs, masks - - def check_ascending_list(self): - if self.reduce_idx is not None: - assert self.reduce_idx == sorted(self.reduce_idx), \ - "reduce_idx should be int or ascending list" - if self.recover_idx is not None: - assert self.recover_idx == sorted(self.recover_idx), \ - "recover_idx should be int or ascending list" - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= 
rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - recover_exp)) - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.preln(xs) - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 
1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - if att_mask.size(1) != 0: - xs = xs.masked_fill(~att_mask[:, 0, :].unsqueeze(-1), 0.0) - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(0) - cached_att = cached_att.unsqueeze(3).\ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :]) - r_cnn_cache.append(cached_cnn) - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/squeezeformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/squeezeformer/encoder_layer.py deleted file mode 100644 index 3c6bdd44a20447cea91c0f965c666b844f4264be..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/squeezeformer/encoder_layer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""SqueezeformerEncoderLayer definition.""" - -import torch -import torch.nn as nn -from typing import Optional, Tuple - - -class SqueezeformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward1 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - feed_forward2 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. 
- """ - - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward1: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - feed_forward2: Optional[nn.Module] = None, - normalize_before: bool = False, - dropout_rate: float = 0.1, - concat_after: bool = False, - ): - super(SqueezeformerEncoderLayer, self).__init__() - self.size = size - self.self_attn = self_attn - self.layer_norm1 = nn.LayerNorm(size) - self.ffn1 = feed_forward1 - self.layer_norm2 = nn.LayerNorm(size) - self.conv_module = conv_module - self.layer_norm3 = nn.LayerNorm(size) - self.ffn2 = feed_forward2 - self.layer_norm4 = nn.LayerNorm(size) - self.normalize_before = normalize_before - self.dropout = nn.Dropout(dropout_rate) - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - # self attention module - residual = x - if self.normalize_before: - x = self.layer_norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.layer_norm1(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm2(x) - x = self.ffn1(x) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm2(x) - - # conv module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - residual = x - if self.normalize_before: - x = self.layer_norm3(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm3(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm4(x) - x = self.ffn2(x) - # we do not use dropout here since it is inside feed forward function - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm4(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/squeezeformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/squeezeformer/positionwise_feed_forward.py deleted file mode 100644 index 289062dcf3189f79a5ebb206990160d8665c613c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/squeezeformer/positionwise_feed_forward.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU(), - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.idim = idim - self.hidden_units = hidden_units - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - self.ada_scale = None - self.ada_bias = None - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, idim]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, idim]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - ffn1_max = self.idim ** -0.5 - ffn2_max = self.hidden_units ** -0.5 - torch.nn.init.uniform_(self.w_1.weight.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_1.bias.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_2.weight.data, -ffn2_max, ffn2_max) - torch.nn.init.uniform_(self.w_2.bias.data, -ffn2_max, ffn2_max) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - if self.adaptive_scale: - xs = self.ada_scale * xs + self.ada_bias - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/squeezeformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/squeezeformer/subsampling.py deleted file mode 100644 index fdb0101d6ebb54c42e710bbb0f35a6f7615ca567..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/squeezeformer/subsampling.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -"""DepthwiseConv2dSubsampling4 and TimeReductionLayer definition.""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -from wenet.transformer.subsampling import BaseSubsampling -from typing import Tuple -from wenet.squeezeformer.conv2d import Conv2dValid - - -class DepthwiseConv2dSubsampling4(BaseSubsampling): - """Depthwise Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - pos_enc_class (nn.Module): position encoding class. - dw_stride (int): Whether do depthwise convolution. - input_size (int): filter bank dimension. - - """ - - def __init__( - self, idim: int, odim: int, - pos_enc_class: torch.nn.Module, - dw_stride: bool = False, - input_size: int = 80, - input_dropout_rate: float = 0.1, - init_weights: bool = True - ): - super(DepthwiseConv2dSubsampling4, self).__init__() - self.idim = idim - self.odim = odim - self.pw_conv = nn.Conv2d( - in_channels=idim, out_channels=odim, kernel_size=3, stride=2) - self.act1 = nn.ReLU() - self.dw_conv = nn.Conv2d( - in_channels=odim, out_channels=odim, kernel_size=3, stride=2, - groups=odim if dw_stride else 1 - ) - self.act2 = nn.ReLU() - self.pos_enc = pos_enc_class - self.input_proj = nn.Sequential( - nn.Linear( - odim * (((input_size - 1) // 2 - 1) // 2), odim), - nn.Dropout(p=input_dropout_rate), - ) - if init_weights: - linear_max = (odim * input_size / 4) ** -0.5 - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.weight'], -linear_max, linear_max) - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.bias'], -linear_max, linear_max) - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: int = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.pw_conv(x) - x = self.act1(x) - x = self.dw_conv(x) - x = self.act2(x) - b, c, t, f = x.size() - x = x.permute(0, 2, 1, 3) - x = x.contiguous().view(b, t, c * f) - x, pos_emb = self.pos_enc(x, offset) - x = self.input_proj(x) - return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] - - -class TimeReductionLayer1D(nn.Module): - """ - Modified NeMo, - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. 
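DepthwiseConv2dSubsampling4 above shrinks the time axis to roughly one quarter with two stride-2 valid convolutions, and the mask is sliced with `[:, :, :-2:2]` twice to stay in step. A small check under an assumed frame count, showing that the convolution output length, the closed-form `((T - 1) // 2 - 1) // 2`, and the sliced mask all agree:

```python
import torch

T = 101                                            # assumed frame count
x = torch.randn(1, 1, T, 80)                       # (batch, channel=1, time, feature)
conv1 = torch.nn.Conv2d(1, 4, kernel_size=3, stride=2)   # stand-in channel sizes
conv2 = torch.nn.Conv2d(4, 4, kernel_size=3, stride=2)
t_out = conv2(conv1(x)).shape[2]

mask = torch.ones(1, 1, T, dtype=torch.bool)
mask_out = mask[:, :, :-2:2][:, :, :-2:2]          # same slicing as the forward above

assert t_out == ((T - 1) // 2 - 1) // 2 == mask_out.size(-1) == 24
```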
- """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 5, stride: int = 2): - super(TimeReductionLayer1D, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - self.padding = max(0, self.kernel_size - self.stride) - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. - if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayer2D(nn.Module): - def __init__( - self, kernel_size: int = 5, stride: int = 2, encoder_dim: int = 256): - super(TimeReductionLayer2D, self).__init__() - self.encoder_dim = encoder_dim - self.kernel_size = kernel_size - self.dw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=(kernel_size, 1), - stride=stride, - valid_trigy=True - ) - self.pw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=1, - stride=1, - valid_trigx=False, - valid_trigy=False, - ) - - self.kernel_size = kernel_size - self.stride = stride - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.encoder_dim ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward( - self, xs: torch.Tensor, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - xs = xs.masked_fill(mask_pad.transpose(1, 2).eq(0), 0.0) - xs = xs.unsqueeze(2) - padding1 = self.kernel_size - self.stride - xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), - mode='constant', value=0.) 
- xs = self.dw_conv(xs.permute(0, 3, 1, 2)) - xs = self.pw_conv(xs).permute(0, 3, 2, 1).squeeze(1).contiguous() - tmp_length = xs.size(1) - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - padding2 = max(0, (xs_lens.max() - tmp_length).data.item()) - batch_size, hidden = xs.size(0), xs.size(-1) - dummy_pad = torch.zeros(batch_size, padding2, hidden, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - mask = mask[:, ::2, ::2] - mask_pad = mask_pad[:, :, ::2] - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayerStream(nn.Module): - """ - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. - """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 1, stride: int = 2): - super(TimeReductionLayerStream, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=0, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. 
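Before the truncate-or-pad step just below, it may help to see the length bookkeeping the time-reduction layers above share: lengths are halved with truncating division while padding masks are halved by stride-2 slicing, and the two always land on the same target length, to which the convolution output is then trimmed or zero-padded. A toy check under an assumed length:

```python
import torch

T = 7                                                         # assumed sequence length
xs_lens = torch.tensor([T])
mask_pad = torch.ones(1, 1, T, dtype=torch.bool)

new_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc')   # (T + 1) // 2
new_mask_pad = mask_pad[:, :, ::2]                            # keeps ceil(T / 2) frames

assert new_lens.item() == new_mask_pad.size(-1) == 4
```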
- if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transducer/joint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transducer/joint.py deleted file mode 100644 index f7cbaf62ee0bf4ffa127e5bbf4a49a64c2378495..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transducer/joint.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Optional - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation - - -class TransducerJoint(torch.nn.Module): - - def __init__(self, - voca_size: int, - enc_output_size: int, - pred_output_size: int, - join_dim: int, - prejoin_linear: bool = True, - postjoin_linear: bool = False, - joint_mode: str = 'add', - activation: str = "tanh"): - assert check_argument_types() - # TODO(Mddct): concat in future - assert joint_mode in ['add'] - super().__init__() - - self.activatoin = get_activation(activation) - self.prejoin_linear = prejoin_linear - self.postjoin_linear = postjoin_linear - self.joint_mode = joint_mode - - if not self.prejoin_linear and not self.postjoin_linear: - assert enc_output_size == pred_output_size == join_dim - # torchscript compatibility - self.enc_ffn: Optional[nn.Linear] = None - self.pred_ffn: Optional[nn.Linear] = None - if self.prejoin_linear: - self.enc_ffn = nn.Linear(enc_output_size, join_dim) - self.pred_ffn = nn.Linear(pred_output_size, join_dim) - # torchscript compatibility - self.post_ffn: Optional[nn.Linear] = None - if self.postjoin_linear: - self.post_ffn = nn.Linear(join_dim, join_dim) - - self.ffn_out = nn.Linear(join_dim, voca_size) - - def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor): - """ - Args: - enc_out (torch.Tensor): [B, T, E] - pred_out (torch.Tensor): [B, T, P] - Return: - [B,T,U,V] - """ - if (self.prejoin_linear and self.enc_ffn is not None - and self.pred_ffn is not None): - enc_out = self.enc_ffn(enc_out) # [B,T,E] -> [B,T,V] - pred_out = self.pred_ffn(pred_out) - - enc_out = enc_out.unsqueeze(2) # [B,T,V] -> [B,T,1,V] - pred_out = pred_out.unsqueeze(1) # [B,U,V] -> [B,1 U, V] - - # TODO(Mddct): concat joint - _ = self.joint_mode - out = enc_out + pred_out # [B,T,U,V] - - if self.postjoin_linear and self.post_ffn is not None: - out = self.post_ffn(out) - - out = self.activatoin(out) - out = self.ffn_out(out) - return out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transducer/predictor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transducer/predictor.py deleted file mode 100644 index 600e97a9d83646047ec3fc14f3087bd4df761c68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transducer/predictor.py +++ /dev/null @@ -1,482 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation, get_rnn - - -def ApplyPadding(input, padding, pad_value) -> torch.Tensor: - """ - Args: - input: [bs, max_time_step, dim] - padding: [bs, 
max_time_step] - """ - return padding * pad_value + input * (1 - padding) - - -class PredictorBase(torch.nn.Module): - - # NOTE(Mddct): We can use ABC abstract here, but - # keep this class simple enough for now - def __init__(self) -> None: - super().__init__() - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - _, _, _ = batch_size, method, device - raise NotImplementedError("this is a base precictor") - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ): - _, _, = input, cache - raise NotImplementedError("this is a base precictor") - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - _, _, _, = input, padding, cache - raise NotImplementedError("this is a base precictor") - - -class RNNPredictor(PredictorBase): - - def __init__(self, - voca_size: int, - embed_size: int, - output_size: int, - embed_dropout: float, - hidden_size: int, - num_layers: int, - bias: bool = True, - rnn_type: str = "lstm", - dropout: float = 0.1) -> None: - assert check_argument_types() - super().__init__() - self.n_layers = num_layers - self.hidden_size = hidden_size - # disable rnn base out projection - self.embed = nn.Embedding(voca_size, embed_size) - self.dropout = nn.Dropout(embed_dropout) - # NOTE(Mddct): rnn base from torch not support layer norm - # will add layer norm and prune value in cell and layer - # ref: https://github.com/Mddct/neural-lm/blob/main/models/gru_cell.py - self.rnn = get_rnn(rnn_type=rnn_type)(input_size=embed_size, - hidden_size=hidden_size, - num_layers=num_layers, - bias=bias, - batch_first=True, - dropout=dropout) - self.projection = nn.Linear(hidden_size, output_size) - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> torch.Tensor: - """ - Args: - input (torch.Tensor): [batch, max_time). - padding (torch.Tensor): [batch, max_time] - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - Returns: - output: [batch, max_time, output_size] - """ - - # NOTE(Mddct): we don't use pack input format - embed = self.embed(input) # [batch, max_time, emb_size] - embed = self.dropout(embed) - states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None - if cache is None: - state = self.init_state(batch_size=input.size(0), - device=input.device) - states = (state[0], state[1]) - else: - assert len(cache) == 2 - states = (cache[0], cache[1]) - out, (m, c) = self.rnn(embed, states) - out = self.projection(out) - - # NOTE(Mddct): Although we don't use staate in transducer - # training forward, we need make it right for padding value - # so we create forward_step for infering, forward for training - _, _ = m, c - return out - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache: [state_m, state_c] - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - Returns: - new_cache: [[state_m_1, state_c_1], [state_m_2, state_c_2]...] 
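RNNPredictor keeps its LSTM state as a pair of `[num_layers, batch, hidden]` tensors; `batch_to_cache` splits that pair into one small cache per beam hypothesis and `cache_to_batch` merges them back. A toy round-trip with assumed sizes:

```python
import torch

num_layers, beam, hidden = 2, 3, 8                     # assumed toy sizes
state_m = torch.randn(num_layers, beam, hidden)
state_c = torch.randn(num_layers, beam, hidden)

# batch -> per-hypothesis caches (the batch_to_cache direction)
per_hyp = [[m, c] for m, c in zip(torch.split(state_m, 1, dim=1),
                                  torch.split(state_c, 1, dim=1))]
assert len(per_hyp) == beam and per_hyp[0][0].shape == (num_layers, 1, hidden)

# per-hypothesis caches -> batch (the cache_to_batch direction)
merged_m = torch.cat([s[0] for s in per_hyp], dim=1)
merged_c = torch.cat([s[1] for s in per_hyp], dim=1)
assert torch.equal(merged_m, state_m) and torch.equal(merged_c, state_c)
```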
- """ - assert len(cache) == 2 - state_ms = cache[0] - state_cs = cache[1] - - assert state_ms.size(1) == state_cs.size(1) - - new_cache: List[List[torch.Tensor]] = [] - for state_m, state_c in zip(torch.split(state_ms, 1, dim=1), - torch.split(state_cs, 1, dim=1)): - new_cache.append([state_m, state_c]) - return new_cache - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[state_m_1, state_c_1], [state_m_1, state_c_1]...] - - Returns: - new_caceh: [state_ms, state_cs], - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - """ - state_ms = torch.cat([states[0] for states in cache], dim=1) - state_cs = torch.cat([states[1] for states in cache], dim=1) - return [state_ms, state_cs] - - def init_state( - self, - batch_size: int, - device: torch.device, - method: str = "zero", - ) -> List[torch.Tensor]: - assert batch_size > 0 - # TODO(Mddct): xavier init method - _ = method - return [ - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device), - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device) - ] - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - """ - assert len(cache) == 2 - state_m, state_c = cache[0], cache[1] - embed = self.embed(input) # [batch, 1, emb_size] - embed = self.dropout(embed) - out, (m, c) = self.rnn(embed, (state_m, state_c)) - - out = self.projection(out) - m = ApplyPadding(m, padding.unsqueeze(0), state_m) - c = ApplyPadding(c, padding.unsqueeze(0), state_c) - - return (out, [m, c]) - - -class EmbeddingPredictor(PredictorBase): - """Embedding predictor - - Described in: - https://arxiv.org/pdf/2109.07513.pdf - - embed-> proj -> layer norm -> swish - """ - - def __init__(self, - voca_size: int, - embed_size: int, - embed_dropout: float, - n_head: int, - history_size: int = 2, - activation: str = "swish", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - - assert check_argument_types() - super().__init__() - # multi head - self.num_heads = n_head - self.embed_size = embed_size - self.context_size = history_size + 1 - self.pos_embed = torch.nn.Linear(embed_size * self.context_size, - self.num_heads, - bias=bias) - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.ffn = nn.Linear(self.embed_size, self.embed_size) - self.norm = nn.LayerNorm(self.embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - _ = method - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device), - ] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] 
- """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - - input = input.unfold(1, self.context_size, 1).permute( - 0, 1, 3, 2) # [bs, seq_len, context_size, embed] - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - # broadcast dot attenton - input_expand = input.unsqueeze( - 2) # [bs, seq_len, 1, context_size, embed] - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - - # [bs, seq_len, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, seq_len, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, seq_len, num_heads, embed] - output = output.sum(dim=2) # [bs, seq_len, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - return output - - def forward_step( - self, - input: torch.Tensor, - padding: torch.Tensor, - cache: List[torch.Tensor], - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input_expand = context_input.unsqueeze(1).unsqueeze( - 2) # [bs, 1, 1, context_size, embed] - - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - # [bs, 1, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, 1, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, 1, num_heads, embed] - output = output.sum(dim=2) # [bs, 1, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - new_cache = context_input[:, 1:, :] - # TODO(Mddct): we need padding new_cache in future - # new_cache = ApplyPadding(history, padding, new_cache) - return (output, [new_cache]) - - -class ConvPredictor(PredictorBase): - - def __init__(self, - voca_size: 
int, - embed_size: int, - embed_dropout: float, - history_size: int = 2, - activation: str = "relu", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - assert check_argument_types() - super().__init__() - - assert history_size >= 0 - self.embed_size = embed_size - self.context_size = history_size + 1 - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.conv = nn.Conv1d(in_channels=embed_size, - out_channels=embed_size, - kernel_size=self.context_size, - padding=0, - groups=embed_size, - bias=bias) - self.norm = nn.LayerNorm(embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - assert method == "zero" - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device) - ] - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] - """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - input = input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - return out - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input = context_input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - - new_cache = context_input[:, 1:, :] - # TODO(Mddct): apply padding in future - return (out, [new_cache]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transducer/search/greedy_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transducer/search/greedy_search.py deleted file mode 100644 index ef7354562b6617b7be33bf32d673117eb1d3d547..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transducer/search/greedy_search.py +++ 
/dev/null @@ -1,54 +0,0 @@ -from typing import List - -import torch - - -def basic_greedy_search( - model: torch.nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - n_steps: int = 64, -) -> List[List[int]]: - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, - method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = n_steps - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, padding, - cache) # [1, 1, P] - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, - pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - - return [hyps] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transducer/search/prefix_beam_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transducer/search/prefix_beam_search.py deleted file mode 100644 index f00917717c16a73916586708ebfede54fa02a21f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transducer/search/prefix_beam_search.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import List, Tuple - -import torch -from wenet.utils.common import log_add - - -class Sequence(): - - __slots__ = {'hyp', 'score', 'cache'} - - def __init__( - self, - hyp: List[torch.Tensor], - score, - cache: List[torch.Tensor], - ): - self.hyp = hyp - self.score = score - self.cache = cache - - -class PrefixBeamSearch(): - - def __init__(self, encoder, predictor, joint, ctc, blank): - self.encoder = encoder - self.predictor = predictor - self.joint = joint - self.ctc = ctc - self.blank = blank - - def forward_decoder_one_step( - self, encoder_x: torch.Tensor, pre_t: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - padding = torch.zeros(pre_t.size(0), 1, device=encoder_x.device) - pre_t, new_cache = self.predictor.forward_step(pre_t.unsqueeze(-1), - padding, cache) - x = self.joint(encoder_x, pre_t) # [beam, 1, 1, vocab] - x = x.log_softmax(dim=-1) - return x, new_cache - - def prefix_beam_search(self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7): - """prefix beam search - also see wenet.transducer.transducer.beam_search - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = 
speech.device - batch_size = speech.shape[0] - assert batch_size == 1 - - # 1. Encoder - encoder_out, _ = self.encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - - ctc_probs = self.ctc.log_softmax(encoder_out).squeeze(0) - beam_init: List[Sequence] = [] - - # 2. init beam using Sequence to save beam unit - cache = self.predictor.init_state(1, method="zero", device=device) - beam_init.append(Sequence(hyp=[self.blank], score=0.0, cache=cache)) - # 3. start decoding (notice: we use breathwise first searching) - # !!!! In this decoding method: one frame do not output multi units. !!!! - # !!!! Experiments show that this strategy has little impact !!!! - for i in range(maxlen): - # 3.1 building input - # decoder taking the last token to predict the next token - input_hyp = [s.hyp[-1] for s in beam_init] - input_hyp_tensor = torch.tensor(input_hyp, - dtype=torch.int, - device=device) - # building statement from beam - cache_batch = self.predictor.cache_to_batch( - [s.cache for s in beam_init]) - # build score tensor to do torch.add() function - scores = torch.tensor([s.score for s in beam_init]).to(device) - - # 3.2 forward decoder - logp, new_cache = self.forward_decoder_one_step( - encoder_out[:, i, :].unsqueeze(1), - input_hyp_tensor, - cache_batch, - ) # logp: (N, 1, 1, vocab_size) - logp = logp.squeeze(1).squeeze(1) # logp: (N, vocab_size) - new_cache = self.predictor.batch_to_cache(new_cache) - - # 3.3 shallow fusion for transducer score - # and ctc score where we can also add the LM score - logp = torch.log( - torch.add(transducer_weight * torch.exp(logp), - ctc_weight * torch.exp(ctc_probs[i].unsqueeze(0)))) - - # 3.4 first beam prune - top_k_logp, top_k_index = logp.topk(beam_size) # (N, N) - scores = torch.add(scores.unsqueeze(1), top_k_logp) - - # 3.5 generate new beam (N*N) - beam_A = [] - for j in range(len(beam_init)): - # update seq - base_seq = beam_init[j] - for t in range(beam_size): - # blank: only update the score - if top_k_index[j, t] == self.blank: - new_seq = Sequence(hyp=base_seq.hyp.copy(), - score=scores[j, t].item(), - cache=base_seq.cache) - - beam_A.append(new_seq) - # other unit: update hyp score statement and last - else: - hyp_new = base_seq.hyp.copy() - hyp_new.append(top_k_index[j, t].item()) - new_seq = Sequence(hyp=hyp_new, - score=scores[j, t].item(), - cache=new_cache[j]) - beam_A.append(new_seq) - - # 3.6 prefix fusion - fusion_A = [beam_A[0]] - for j in range(1, len(beam_A)): - s1 = beam_A[j] - if_do_append = True - for t in range(len(fusion_A)): - # notice: A_ can not fusion with A - if s1.hyp == fusion_A[t].hyp: - fusion_A[t].score = log_add( - [fusion_A[t].score, s1.score]) - if_do_append = False - break - if if_do_append: - fusion_A.append(s1) - - # 4. 
second pruned - fusion_A.sort(key=lambda x: x.score, reverse=True) - beam_init = fusion_A[:beam_size] - - return beam_init, encoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transducer/transducer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transducer/transducer.py deleted file mode 100644 index 821a0946e621353a18bededbd93a658e83b0e0e2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transducer/transducer.py +++ /dev/null @@ -1,453 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchaudio -from torch import nn -from torch.nn.utils.rnn import pad_sequence -from typeguard import check_argument_types -from wenet.transducer.predictor import PredictorBase -from wenet.transducer.search.greedy_search import basic_greedy_search -from wenet.transducer.search.prefix_beam_search import PrefixBeamSearch -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_blank, add_sos_eos, - reverse_pad_list) - - -class Transducer(ASRModel): - """Transducer-ctc-attention hybrid Encoder-Predictor-Decoder model""" - - def __init__( - self, - vocab_size: int, - blank: int, - encoder: nn.Module, - predictor: PredictorBase, - joint: nn.Module, - attention_decoder: Optional[Union[TransformerDecoder, - BiTransformerDecoder]] = None, - ctc: Optional[CTC] = None, - ctc_weight: float = 0, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - transducer_weight: float = 1.0, - attention_weight: float = 0.0, - ) -> None: - assert check_argument_types() - assert attention_weight + ctc_weight + transducer_weight == 1.0 - super().__init__(vocab_size, encoder, attention_decoder, ctc, - ctc_weight, ignore_id, reverse_weight, lsm_weight, - length_normalized_loss) - - self.blank = blank - self.transducer_weight = transducer_weight - self.attention_decoder_weight = 1 - self.transducer_weight - self.ctc_weight - - self.predictor = predictor - self.joint = joint - self.bs = None - - # Note(Mddct): decoder also means predictor in transducer, - # but here decoder is attention decoder - del self.criterion_att - if attention_decoder is not None: - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + predictor + joint + loss - - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - - # Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - # predictor - ys_in_pad = add_blank(text, self.blank, self.ignore_id) - predictor_out = self.predictor(ys_in_pad) - # joint - joint_out = self.joint(encoder_out, predictor_out) - # NOTE(Mddct): some loss implementation require pad valid is zero - # torch.int32 rnnt_loss required - rnnt_text = text.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - rnnt_text_lengths = text_lengths.to(torch.int32) - encoder_out_lens = encoder_out_lens.to(torch.int32) - loss = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - encoder_out_lens, - rnnt_text_lengths, - blank=self.blank, - reduction="mean") - loss_rnnt = loss - - loss = self.transducer_weight * loss - # optional attention decoder - loss_att: Optional[torch.Tensor] = None - if self.attention_decoder_weight != 0.0 and self.decoder is not None: - loss_att, _ = self._calc_att_loss(encoder_out, encoder_mask, text, - text_lengths) - - # optional ctc - loss_ctc: Optional[torch.Tensor] = None - if self.ctc_weight != 0.0 and self.ctc is not None: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is not None: - loss = loss + self.ctc_weight * loss_ctc.sum() - if loss_att is not None: - loss = loss + self.attention_decoder_weight * loss_att.sum() - # NOTE: 'loss' must be in dict - return { - 'loss': loss, - 'loss_att': loss_att, - 'loss_ctc': loss_ctc, - 'loss_rnnt': loss_rnnt, - } - - def init_bs(self): - if self.bs is None: - self.bs = PrefixBeamSearch(self.encoder, self.predictor, - self.joint, self.ctc, self.blank) - - def _cal_transducer_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_lens: torch.Tensor, - hyps_pad: torch.Tensor, - ): - # ignore id -> blank, add blank at head - hyps_pad_blank = add_blank(hyps_pad, self.blank, self.ignore_id) - xs_in_lens = encoder_mask.squeeze(1).sum(1).int() - - # 1. Forward predictor - predictor_out = self.predictor(hyps_pad_blank) - # 2. Forward joint - joint_out = self.joint(encoder_out, predictor_out) - rnnt_text = hyps_pad.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - # 3. 
Compute transducer loss - loss_td = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - xs_in_lens, - hyps_lens.int(), - blank=self.blank, - reduction='none') - return loss_td * -1 - - def _cal_attn_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_pad: torch.Tensor, - hyps_lens: torch.Tensor, - ): - # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - - # td_score = loss_td * -1 - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - self.reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - return decoder_out, r_decoder_out - - def beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7, - ): - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight in transducer - prefix beam search. - final_prob = ctc_weight * ctc_prob + transducer_weight * transducer_prob - transducer_weight (float): transducer probability weight in - prefix beam search - Returns: - List[List[int]]: best path result - - """ - self.init_bs() - beam, _ = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size, - beam_size, - num_decoding_left_chunks, - simulate_streaming, - ctc_weight, - transducer_weight, - ) - return beam[0].hyp[1:], beam[0].score - - def transducer_attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ctc_weight: float = 0.0, - attn_weight: float = 0.0, - transducer_weight: float = 0.0, - search_ctc_weight: float = 1.0, - search_transducer_weight: float = 0.0, - beam_search_type: str = 'transducer') -> List[List[int]]: - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight using in rescoring. - rescore_prob = ctc_weight * ctc_prob + - transducer_weight * (transducer_loss * -1) + - attn_weight * attn_prob - attn_weight (float): attn probability weight using in rescoring. - transducer_weight (float): transducer probability weight using in - rescoring - search_ctc_weight (float): ctc weight using - in rnnt beam search (seeing in self.beam_search) - search_transducer_weight (float): transducer weight using - in rnnt beam search (seeing in self.beam_search) - Returns: - List[List[int]]: best path result - - """ - - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - self.init_bs() - if beam_search_type == 'transducer': - beam, encoder_out = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - beam_size=beam_size, - num_decoding_left_chunks=num_decoding_left_chunks, - ctc_weight=search_ctc_weight, - transducer_weight=search_transducer_weight, - ) - beam_score = [s.score for s in beam] - hyps = [s.hyp[1:] for s in beam] - - elif beam_search_type == 'ctc': - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, - speech_lengths, - beam_size=beam_size, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) - beam_score = [hyp[1] for hyp in hyps] - hyps = [hyp[0] for hyp in hyps] - assert len(hyps) == beam_size - - # build hyps and encoder output - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - - # 2.1 calculate transducer score - td_score = self._cal_transducer_score( - encoder_out, - encoder_mask, - hyps_lens, - hyps_pad, - ) - # 2.2 calculate attention score - decoder_out, r_decoder_out = self._cal_attn_score( - encoder_out, - encoder_mask, - hyps_pad, - hyps_lens, - ) - - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp)][self.eos] - td_s = td_score[i] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp): - r_score += r_decoder_out[i][len(hyp) - j - 1][w] - r_score += r_decoder_out[i][len(hyp)][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score = score * attn_weight + \ - beam_score[i] * ctc_weight + \ - td_s * transducer_weight - if score > best_score: - best_score = score - best_index = i - - return hyps[best_index], best_score - - def greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, 
- num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - n_steps: int = 64, - ) -> List[List[int]]: - """ greedy search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - # TODO(Mddct): batch decode - assert speech.size(0) == 1 - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - # TODO(Mddct): forward chunk by chunk - _ = simulate_streaming - # Let's assume B = batch_size - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size, - num_decoding_left_chunks, - ) - encoder_out_lens = encoder_mask.squeeze(1).sum() - hyps = basic_greedy_search(self, - encoder_out, - encoder_out_lens, - n_steps=n_steps) - - return hyps - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def forward_predictor_step( - self, xs: torch.Tensor, cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - assert len(cache) == 2 - # fake padding - padding = torch.zeros(1, 1) - return self.predictor.forward_step(xs, padding, cache) - - @torch.jit.export - def forward_joint_step(self, enc_out: torch.Tensor, - pred_out: torch.Tensor) -> torch.Tensor: - return self.joint(enc_out, pred_out) - - @torch.jit.export - def forward_predictor_init_state(self) -> List[torch.Tensor]: - return self.predictor.init_state(1, device=torch.device("cpu")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/asr_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/asr_model.py deleted file mode 100644 index 4288f68472d63ce4bf270c5f377d62fa7408713e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/asr_model.py +++ /dev/null @@ -1,904 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
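Ahead of the ASRModel decoding methods further down, a toy re-implementation of the CTC greedy-search post-processing (the repo itself imports `remove_duplicates_and_blank` from `wenet.utils.common`; this stand-alone version only illustrates the idea): per-frame argmax ids are collapsed so that consecutive repeats merge and blank ids (0) are dropped.

```python
def collapse_ctc(frame_ids, blank_id=0):
    """Collapse repeated ids and drop blanks -- illustrative only."""
    out, prev = [], None
    for tok in frame_ids:
        if tok != prev and tok != blank_id:
            out.append(tok)
        prev = tok
    return out

# e.g. frame-level argmax [0, 3, 3, 0, 0, 5, 5, 5, 0, 2] decodes to [3, 5, 2]
print(collapse_ctc([0, 3, 3, 0, 0, 5, 5, 5, 0, 2]))
```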
-# Modified from ESPnet(https://github.com/espnet/espnet) - -from collections import defaultdict -from typing import Dict, List, Optional, Tuple - -import torch - -from torch.nn.utils.rnn import pad_sequence - -try: - import k2 - from icefall.utils import get_texts - from icefall.decode import get_lattice, Nbest, one_best_decoding -except ImportError: - print('Failed to import k2 and icefall. \ - Notice that they are necessary for hlg_onebest and hlg_rescore') - -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import TransformerEncoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, - remove_duplicates_and_blank, th_accuracy, - reverse_pad_list) -from wenet.utils.mask import (make_pad_mask, mask_finished_preds, - mask_finished_scores, subsequent_mask) - - -class ASRModel(torch.nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - def __init__( - self, - vocab_size: int, - encoder: TransformerEncoder, - decoder: TransformerDecoder, - ctc: CTC, - ctc_weight: float = 0.5, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - ): - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - - self.encoder = encoder - self.decoder = decoder - self.ctc = ctc - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - # 1. Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # 2a. Attention-decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths) - else: - loss_att = None - - # 2b. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is None: - loss = loss_att - elif loss_att is None: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + (1 - - self.ctc_weight) * loss_att - return {"loss": loss, "loss_att": loss_att, "loss_ctc": loss_ctc} - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, float]: - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # reverse the seq, used for right to left decoder - r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) - r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, - self.ignore_id) - # 1. Forward decoder - decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, - ys_in_pad, ys_in_lens, - r_ys_in_pad, - self.reverse_weight) - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - r_loss_att = torch.tensor(0.0) - if self.reverse_weight > 0.0: - r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * ( - 1 - self.reverse_weight) + r_loss_att * self.reverse_weight - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - return loss_att, acc_att - - def _forward_encoder( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Let's assume B = batch_size - # 1. Encoder - if simulate_streaming and decoding_chunk_size > 0: - encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( - speech, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - else: - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - return encoder_out, encoder_mask - - def recognize( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int = 10, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> torch.Tensor: - """ Apply beam search on attention decoder - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - torch.Tensor: decoding result, (batch, max_result_len) - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = torch.ones([running_size, 1], dtype=torch.long, - device=device).fill_(self.sos) # (B*N, 1) - scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), - dtype=torch.float) - scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) - cache: Optional[List[torch.Tensor]] = None - # 2. Decoder forward step by step - for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: - break - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) - logp, cache = self.decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - # 2.3 Second beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - # Update cache to be consistent with new topk scores / hyps - cache_index = (offset_k_index // beam_size).view(-1) # (B*N) - base_cache_index = (torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) * beam_size).view(-1) # (B*N) - cache_index = base_cache_index + cache_index - cache = [torch.index_select(c, dim=0, index=cache_index) for c in cache] - scores = scores.view(-1, 1) # (B*N, 1) - # 2.4. Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = torch.index_select(top_k_index.view(-1), - dim=-1, - index=best_k_index) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = torch.index_select( - hyps, dim=0, index=best_hyps_index) # (B*N, i) - hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) - - # 3. 
Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_scores, best_index = scores.max(dim=-1) - best_hyps_index = best_index + torch.arange( - batch_size, dtype=torch.long, device=device) * beam_size - best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) - best_hyps = best_hyps[:, 1:] - return best_hyps, best_scores - - def ctc_greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[List[int]]: - """ Apply CTC greedy search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # Let's assume B = batch_size - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - return hyps, scores - - def _ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[List[List[int]], torch.Tensor]: - """ CTC prefix beam search inner implementation - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[List[int]]: nbest results - torch.Tensor: encoder output, (1, max_len, encoder_dim), - it will be used for rescoring in attention rescoring mode - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # For CTC prefix beam search, we only support batch_size=1 - assert batch_size == 1 - # Let's assume B = batch_size and N = beam_size - # 1. 
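`ctc_greedy_search` above reduces to an argmax per frame followed by collapsing repeats and dropping blanks. A self-contained sketch of that post-processing on random log-probabilities (blank id 0 and the toy shapes are assumptions):

```python
import torch

blank_id = 0
ctc_probs = torch.randn(1, 6, 10).log_softmax(dim=2)   # (batch, maxlen, vocab_size)
frame_tokens = ctc_probs.argmax(dim=2)[0].tolist()     # best token per frame

# collapse consecutive repeats, then drop blanks (what remove_duplicates_and_blank does)
hyp, prev = [], None
for tok in frame_tokens:
    if tok != prev and tok != blank_id:
        hyp.append(tok)
    prev = tok
print(hyp)
```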
Encoder forward and get CTC score - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - ctc_probs = ctc_probs.squeeze(0) - # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) - cur_hyps = [(tuple(), (0.0, -float('inf')))] - # 2. CTC beam search step by step - for t in range(0, maxlen): - logp = ctc_probs[t] # (vocab_size,) - # key: prefix, value (pb, pnb), default value(-inf, -inf) - next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) - # 2.1 First beam prune: select topk best - top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) - for s in top_k_index: - s = s.item() - ps = logp[s].item() - for prefix, (pb, pnb) in cur_hyps: - last = prefix[-1] if len(prefix) > 0 else None - if s == 0: # blank - n_pb, n_pnb = next_hyps[prefix] - n_pb = log_add([n_pb, pb + ps, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - elif s == last: - # Update *ss -> *s; - n_pb, n_pnb = next_hyps[prefix] - n_pnb = log_add([n_pnb, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - # Update *s-s -> *ss, - is for blank - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - else: - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - - # 2.2 Second beam prune - next_hyps = sorted(next_hyps.items(), - key=lambda x: log_add(list(x[1])), - reverse=True) - cur_hyps = next_hyps[:beam_size] - hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] - return hyps, encoder_out - - def ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[int]: - """ Apply CTC prefix beam search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[int]: CTC prefix beam search nbest results - """ - hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, - beam_size, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) - return hyps[0] - - def attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - ctc_weight: float = 0.0, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ) -> List[int]: - """ Apply attention rescoring decoding, CTC prefix beam search - is applied first to get nbest, then we resoring the nbest on - attention decoder with corresponding encoder out - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
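The `_ctc_prefix_beam_search` loop above keeps, for every surviving prefix, a blank-ending and a non-blank-ending log-probability and merges hypotheses with `log_add`. A standalone sketch of the same recursion for a single utterance (the random log-probabilities and small sizes are placeholders; blank id 0 follows the code above):

```python
import math
from collections import defaultdict

import torch


def log_add(args):
    """Stable log-sum-exp over a list of log-probabilities."""
    if all(a == -float('inf') for a in args):
        return -float('inf')
    a_max = max(args)
    return a_max + math.log(sum(math.exp(a - a_max) for a in args))


def ctc_prefix_beam_search(ctc_probs, beam_size):
    """ctc_probs: (maxlen, vocab_size) log-probabilities for one utterance."""
    cur_hyps = [(tuple(), (0.0, -float('inf')))]        # prefix -> (p_blank, p_nonblank)
    for t in range(ctc_probs.size(0)):
        logp = ctc_probs[t]
        next_hyps = defaultdict(lambda: (-float('inf'), -float('inf')))
        top_k_logp, top_k_index = logp.topk(beam_size)  # first prune: topk tokens
        for s in top_k_index.tolist():
            ps = logp[s].item()
            for prefix, (pb, pnb) in cur_hyps:
                last = prefix[-1] if prefix else None
                if s == 0:                               # blank: prefix unchanged
                    n_pb, n_pnb = next_hyps[prefix]
                    next_hyps[prefix] = (log_add([n_pb, pb + ps, pnb + ps]), n_pnb)
                elif s == last:                          # repeated token
                    n_pb, n_pnb = next_hyps[prefix]
                    next_hyps[prefix] = (n_pb, log_add([n_pnb, pnb + ps]))
                    n_prefix = prefix + (s,)             # ...or a new token after a blank
                    n_pb, n_pnb = next_hyps[n_prefix]
                    next_hyps[n_prefix] = (n_pb, log_add([n_pnb, pb + ps]))
                else:                                    # new token extends the prefix
                    n_prefix = prefix + (s,)
                    n_pb, n_pnb = next_hyps[n_prefix]
                    next_hyps[n_prefix] = (n_pb, log_add([n_pnb, pb + ps, pnb + ps]))
        # second prune: keep the beam_size best prefixes
        next_hyps = sorted(next_hyps.items(), key=lambda x: log_add(list(x[1])), reverse=True)
        cur_hyps = next_hyps[:beam_size]
    return [(prefix, log_add(list(p))) for prefix, p in cur_hyps]


hyps = ctc_prefix_beam_search(torch.randn(8, 10).log_softmax(dim=-1), beam_size=3)
print(hyps)
```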
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - reverse_weight (float): right to left decoder weight - ctc_weight (float): ctc score weight - - Returns: - List[int]: Attention rescoring result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, speech_lengths, beam_size, decoding_chunk_size, - num_decoding_left_chunks, simulate_streaming) - - assert len(hyps) == beam_size - hyps_pad = pad_sequence([ - torch.tensor(hyp[0], device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp[0]): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp[0])][self.eos] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp[0]): - r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] - r_score += r_decoder_out[i][len(hyp[0])][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score += hyp[1] * ctc_weight - if score > best_score: - best_score = score - best_index = i - return hyps[best_index][0], best_score - - def load_hlg_resource_if_necessary(self, hlg, word): - if not hasattr(self, 'hlg'): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.hlg = k2.Fsa.from_dict(torch.load(hlg, map_location=device)) - if not hasattr(self.hlg, "lm_scores"): - self.hlg.lm_scores = self.hlg.scores.clone() - if not hasattr(self, 'word_table'): - self.word_table = {} - with open(word, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - self.word_table[int(arr[1])] = arr[0] - - @torch.no_grad() - def hlg_onebest( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - best_path = one_best_decoding(lattice=lattice, use_double_scores=True) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.no_grad() - def hlg_rescore( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - lm_scale: float = 0, - decoder_scale: float = 0, - r_decoder_scale: float = 0, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - device = speech.device - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - 
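The rescoring loop above sums the attention decoder's log-probability of each CTC n-best hypothesis (plus the probability of emitting `<eos>` afterwards) and interpolates it with the CTC score. A compact sketch with placeholder hypotheses, vocabulary size and scores:

```python
import torch

# toy n-best from CTC prefix beam search: (token ids, ctc log score) -- made up
hyps = [((12, 7, 903), -3.2), ((12, 7), -4.1)]
vocab_size, eos, ctc_weight = 4233, 4232, 0.5
# stand-in for the attention decoder output over the padded hypotheses
decoder_out = torch.randn(len(hyps), 5, vocab_size).log_softmax(dim=-1)

best_score, best_index = -float('inf'), 0
for i, (tokens, ctc_score) in enumerate(hyps):
    score = sum(decoder_out[i, j, w].item() for j, w in enumerate(tokens))
    score += decoder_out[i, len(tokens), eos].item()   # probability of stopping here
    score += ctc_weight * ctc_score                    # interpolate with the CTC score
    if score > best_score:
        best_score, best_index = score, i
print(hyps[best_index][0], best_score)
```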
search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=100, - use_double_scores=True, - nbest_scale=0.5,) - nbest = nbest.intersect(lattice) - assert hasattr(nbest.fsa, "lm_scores") - assert hasattr(nbest.fsa, "tokens") - assert isinstance(nbest.fsa.tokens, torch.Tensor) - - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.tokens) - tokens = tokens.remove_values_leq(0) - hyps = tokens.tolist() - - # cal attention_score - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out_repeat = [] - tot_scores = nbest.tot_scores() - repeats = [tot_scores[i].shape[0] for i in range(tot_scores.dim0)] - for i in range(len(encoder_out)): - encoder_out_repeat.append(encoder_out[i: i + 1].repeat(repeats[i], 1, 1)) - encoder_out = torch.concat(encoder_out_repeat, dim=0) - encoder_mask = torch.ones(encoder_out.size(0), - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - reverse_weight = 0.5 - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out - - decoder_scores = torch.tensor([sum([decoder_out[i, j, hyps[i][j]] - for j in range(len(hyps[i]))]) - for i in range(len(hyps))], device=device) - r_decoder_scores = [] - for i in range(len(hyps)): - score = 0 - for j in range(len(hyps[i])): - score += r_decoder_out[i, len(hyps[i]) - j - 1, hyps[i][j]] - score += r_decoder_out[i, len(hyps[i]), self.eos] - r_decoder_scores.append(score) - r_decoder_scores = torch.tensor(r_decoder_scores, device=device) - - am_scores = nbest.compute_am_scores() - ngram_lm_scores = nbest.compute_lm_scores() - tot_scores = am_scores.values + lm_scale * ngram_lm_scores.values + \ - decoder_scale * decoder_scores + r_decoder_scale * r_decoder_scores - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.jit.export - def subsampling_rate(self) -> int: - """ Export interface for c++ call, return subsampling_rate of the - model - """ - return self.encoder.embed.subsampling_rate - - @torch.jit.export - def right_context(self) -> int: - """ Export interface for c++ call, return right_context of the model - """ - return self.encoder.embed.right_context - - @torch.jit.export - def sos_symbol(self) -> int: - """ Export interface for c++ call, return sos symbol id of the model - """ - return self.sos - - @torch.jit.export - def eos_symbol(self) -> int: - """ Export interface for c++ call, return eos symbol id of the model - """ - return self.eos - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, give input chunk xs, and return - output from time 0 to current chunk. - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor: - """ Export interface for c++ call, apply linear transform and log - softmax before ctc - Args: - xs (torch.Tensor): encoder output - - Returns: - torch.Tensor: activation before ctc - - """ - return self.ctc.log_softmax(xs) - - @torch.jit.export - def is_bidirectional_decoder(self) -> bool: - """ - Returns: - torch.Tensor: decoder output - """ - if hasattr(self.decoder, 'right_decoder'): - return True - else: - return False - - @torch.jit.export - def forward_attention_decoder( - self, - hyps: torch.Tensor, - hyps_lens: torch.Tensor, - encoder_out: torch.Tensor, - reverse_weight: float = 0, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, forward decoder with multiple - hypothesis from ctc prefix beam search and one encoder output - Args: - hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad sos at the begining - hyps_lens (torch.Tensor): length of each hyp in hyps - encoder_out (torch.Tensor): corresponding encoder output - r_hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad eos at the begining which is used fo right to left decoder - reverse_weight: used for verfing whether used right to left decoder, - > 0 will use. - - Returns: - torch.Tensor: decoder output - """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps - encoder_out = encoder_out.repeat(num_hyps, 1, 1) - encoder_mask = torch.ones(num_hyps, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=encoder_out.device) - - # input for right to left decoder - # this hyps_lens has count token, we need minus it. - r_hyps_lens = hyps_lens - 1 - # this hyps has included token, so it should be - # convert the original hyps. - r_hyps = hyps[:, 1:] - # >>> r_hyps - # >>> tensor([[ 1, 2, 3], - # >>> [ 9, 8, 4], - # >>> [ 2, -1, -1]]) - # >>> r_hyps_lens - # >>> tensor([3, 3, 1]) - - # NOTE(Mddct): `pad_sequence` is not supported by ONNX, it is used - # in `reverse_pad_list` thus we have to refine the below code. 
- # Issue: https://github.com/wenet-e2e/wenet/issues/1113 - # Equal to: - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = torch.max(r_hyps_lens) - index_range = torch.arange(0, max_len, 1).to(encoder_out.device) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - # >>> seq_mask - # >>> tensor([[ True, True, True], - # >>> [ True, True, True], - # >>> [ True, False, False]]) - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - r_hyps = torch.gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = torch.where(seq_mask, r_hyps, self.eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - r_hyps = torch.cat([hyps[:, 0:1], r_hyps], dim=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps, hyps_lens, r_hyps, - reverse_weight) # (num_hyps, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - - # right to left decoder may be not used during decoding process, - # which depends on reverse_weight param. - # r_dccoder_out will be 0.0, if reverse_weight is 0.0 - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - return decoder_out, r_decoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/attention.py deleted file mode 100644 index 6ee5e313edf2e88a844ce004c0f819b0bd3260f6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/attention.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple - -import torch -from torch import nn - - -class MultiHeadedAttention(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, n_head: int, n_feat: int, dropout_rate: float): - """Construct an MultiHeadedAttention object.""" - super().__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward_qkv( - self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor, size - (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor, size - (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor, size - (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. 
- - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - 1.When applying cross attention between decoder and encoder, - the batch padding mask for input is in (#batch, 1, T) shape. - 2.When applying self attention of encoder, - the mask is in (#batch, T, T) shape. - 3.When applying self attention of decoder, - the mask is in (#batch, L, L) shape. - 4.If the different position in decoder see different block - of the encoder, such as Mocha, the passed in mask could be - in (#batch, L, T) shape. But there is no such case in current - Wenet. - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - """ - q, k, v = self.forward_qkv(query, key, value) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. - new_cache = torch.cat((k, v), dim=-1) - - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - return self.forward_attention(v, scores, mask), new_cache - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). 
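The NOTE above points out that concatenating a zero-length cache is a no-op, which is what lets the ONNX export always take the split-and-concat path. The snippet below reproduces the doctest-style examples from that comment, plus the KV-cache packing with `d_k * 2` in the last dimension:

```python
import torch

a = torch.ones((1, 2, 0, 4))        # "empty" cache fed for the first chunk
b = torch.ones((1, 2, 3, 4))        # keys (or values) computed for this chunk
c = torch.cat((a, b), dim=2)
print(torch.equal(b, c))            # True: concatenating the empty cache changes nothing

cache = torch.cat((b, b), dim=-1)   # pack K and V as (1, head, cache_t, d_k * 2)
key_cache, value_cache = torch.split(cache, cache.size(-1) // 2, dim=-1)
print(key_cache.shape, value_cache.shape)   # both torch.Size([1, 2, 3, 4])
```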
- zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/cmvn.py deleted file mode 100644 index 3a1e7457fd3788d9a7e031e96517505a65925102..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/cmvn.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -class GlobalCMVN(torch.nn.Module): - def __init__(self, - mean: torch.Tensor, - istd: torch.Tensor, - norm_var: bool = True): - """ - Args: - mean (torch.Tensor): mean stats - istd (torch.Tensor): inverse std, std which is 1.0 / std - """ - super().__init__() - assert mean.shape == istd.shape - self.norm_var = norm_var - # The buffer can be accessed from this module using self.mean - self.register_buffer("mean", mean) - self.register_buffer("istd", istd) - - def forward(self, x: torch.Tensor): - """ - Args: - x (torch.Tensor): (batch, max_len, feat_dim) - - Returns: - (torch.Tensor): normalized feature - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/convolution.py deleted file mode 100644 index 2cf9794e14ea7441ccd30ab52202ac02fb25c2b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/convolution.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). 
- """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. - new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/ctc.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/ctc.py deleted file mode 100644 index 3dfcbaa324ffc26afa9ceaeb75007eb312546326..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/ctc.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -import torch -import torch.nn.functional as F -from typeguard import check_argument_types - - -class CTC(torch.nn.Module): - """CTC module""" - def __init__( - self, - odim: int, - encoder_output_size: int, - dropout_rate: float = 0.0, - reduce: bool = True, - ): - """ Construct CTC module - Args: - odim: dimension of outputs - encoder_output_size: number of encoder projection units - dropout_rate: dropout rate (0.0 ~ 1.0) - reduce: reduce the CTC loss into a scalar - """ - assert check_argument_types() - super().__init__() - eprojs = encoder_output_size - self.dropout_rate = dropout_rate - self.ctc_lo = torch.nn.Linear(eprojs, odim) - - reduction_type = "sum" if reduce else "none" - self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) - - def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, - ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: - """Calculate CTC loss. 
- - Args: - hs_pad: batch of padded hidden state sequences (B, Tmax, D) - hlens: batch of lengths of hidden state sequences (B) - ys_pad: batch of padded character id sequence tensor (B, Lmax) - ys_lens: batch of lengths of character sequence (B) - """ - # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) - ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) - # ys_hat: (B, L, D) -> (L, B, D) - ys_hat = ys_hat.transpose(0, 1) - ys_hat = ys_hat.log_softmax(2) - loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) - # Batch-size average - loss = loss / ys_hat.size(1) - return loss - - def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """log_softmax of frame activations - - Args: - Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) - """ - return F.log_softmax(self.ctc_lo(hs_pad), dim=2) - - def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """argmax of frame activations - - Args: - torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: argmax applied 2d tensor (B, Tmax) - """ - return torch.argmax(self.ctc_lo(hs_pad), dim=2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/decoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/decoder.py deleted file mode 100644 index c31853d9e868c99290b8d597f53d9a680202c82c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/decoder.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Decoder definition.""" -from typing import Tuple, List, Optional - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.decoder_layer import DecoderLayer -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.utils.mask import (subsequent_mask, make_pad_mask) - - -class TransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. 
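The `CTC` module above projects encoder frames to the vocabulary, log-softmaxes, transposes to (Tmax, B, vocab) and feeds `torch.nn.CTCLoss` with a sum reduction averaged over the batch. A toy call with random tensors (sizes and the vocabulary are placeholders):

```python
import torch

B, T, V, L = 2, 50, 4233, 10                          # toy batch, frames, vocab, label length
log_probs = torch.randn(T, B, V).log_softmax(dim=2)   # (Tmax, B, vocab), as CTCLoss expects
targets = torch.randint(1, V, (B, L))                 # 0 is reserved for <blank>
input_lengths = torch.full((B,), T, dtype=torch.long)
target_lengths = torch.full((B,), L, dtype=torch.long)

ctc_loss = torch.nn.CTCLoss(reduction="sum")
loss = ctc_loss(log_probs, targets, input_lengths, target_lengths) / B   # batch average
print(loss.item())
```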
- concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__() - attention_dim = encoder_output_size - - if input_layer == "embed": - self.embed = torch.nn.Sequential( - torch.nn.Embedding(vocab_size, attention_dim), - PositionalEncoding(attention_dim, positional_dropout_rate), - ) - else: - raise ValueError(f"only 'embed' is supported: {input_layer}") - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) - self.use_output_layer = use_output_layer - self.output_layer = torch.nn.Linear(attention_dim, vocab_size) - self.num_blocks = num_blocks - self.decoders = torch.nn.ModuleList([ - DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(self.num_blocks) - ]) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor = torch.empty(0), - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: not used in transformer decoder, in order to unify api - with bidirectional decoder - reverse_weight: not used in transformer decoder, in order to unify - api with bidirectional decode - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - torch.tensor(0.0), in order to unify api with bidirectional decoder - olens: (batch, ) - """ - tgt = ys_in_pad - maxlen = tgt.size(1) - # tgt_mask: (B, 1, L) - tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) - tgt_mask = tgt_mask.to(tgt.device) - # m: (1, L, L) - m = subsequent_mask(tgt_mask.size(-1), - device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m - x, _ = self.embed(tgt) - for layer in self.decoders: - x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, - memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.use_output_layer: - x = self.output_layer(x) - olens = tgt_mask.sum(1) - return x, torch.tensor(0.0), olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. 
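`TransformerDecoder.forward` above builds the decoder self-attention mask by combining a padding mask with a causal (`subsequent`) mask. A sketch of the same combination using plain `torch` ops in place of `make_pad_mask`/`subsequent_mask` (the lengths are made up, and the padding mask is built directly in "valid = True" form):

```python
import torch

ys_in_lens = torch.tensor([4, 2])                     # toy target lengths
L = int(ys_in_lens.max())

# (B, 1, L): True on real tokens, i.e. ~make_pad_mask(ys_in_lens, L).unsqueeze(1)
pad_mask = (torch.arange(L).unsqueeze(0) < ys_in_lens.unsqueeze(1)).unsqueeze(1)
# (1, L, L): lower-triangular causal mask, i.e. subsequent_mask(L).unsqueeze(0)
causal = torch.tril(torch.ones(L, L, dtype=torch.bool)).unsqueeze(0)

tgt_mask = pad_mask & causal                          # (B, L, L)
print(tgt_mask[1])                                    # second row masks everything past step 2
```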
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - x, _ = self.embed(tgt) - new_cache = [] - for i, decoder in enumerate(self.decoders): - if cache is None: - c = None - else: - c = cache[i] - x, tgt_mask, memory, memory_mask = decoder(x, - tgt_mask, - memory, - memory_mask, - cache=c) - new_cache.append(x) - if self.normalize_before: - y = self.after_norm(x[:, -1]) - else: - y = x[:, -1] - if self.use_output_layer: - y = torch.log_softmax(self.output_layer(y), dim=-1) - return y, new_cache - - -class BiTransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - r_num_blocks: the number of right to left decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - r_num_blocks: int = 0, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - - assert check_argument_types() - super().__init__() - self.left_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - self.right_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - r_num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor, - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), - used for right to left decoder - reverse_weight: used for right to left decoder - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - r_x: x: decoded token score (right to left decoder) - before softmax (batch, maxlen_out, vocab_size) - if use_output_layer is True, - olens: (batch, ) - """ - l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, - ys_in_lens) - r_x = torch.tensor(0.0) - if reverse_weight > 0.0: - r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, - ys_in_lens) - return l_x, r_x, olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - return self.left_decoder.forward_one_step(memory, memory_mask, tgt, - tgt_mask, cache) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/decoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/decoder_layer.py deleted file mode 100644 index 6b52aa6ab730dc51b18f0787e8236ab10c1e9cad..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/decoder_layer.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoder self-attention layer definition.""" -from typing import Optional, Tuple - -import torch -from torch import nn - - -class DecoderLayer(nn.Module): - """Single decoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn (torch.nn.Module): Inter-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. 
- dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's inpu - and output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: nn.Module, - src_attn: nn.Module, - feed_forward: nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an DecoderLayer object.""" - super().__init__() - self.size = size - self.self_attn = self_attn - self.src_attn = src_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.norm3 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear1 = nn.Linear(size + size, size) - self.concat_linear2 = nn.Linear(size + size, size) - else: - self.concat_linear1 = nn.Identity() - self.concat_linear2 = nn.Identity() - - def forward( - self, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - memory: torch.Tensor, - memory_mask: torch.Tensor, - cache: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute decoded features. - - Args: - tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). - tgt_mask (torch.Tensor): Mask for input tensor - (#batch, maxlen_out). - memory (torch.Tensor): Encoded memory - (#batch, maxlen_in, size). - memory_mask (torch.Tensor): Encoded memory mask - (#batch, maxlen_in). - cache (torch.Tensor): cached tensors. - (#batch, maxlen_out - 1, size). - - Returns: - torch.Tensor: Output tensor (#batch, maxlen_out, size). - torch.Tensor: Mask for output tensor (#batch, maxlen_out). - torch.Tensor: Encoded memory (#batch, maxlen_in, size). - torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
- - """ - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - - if cache is None: - tgt_q = tgt - tgt_q_mask = tgt_mask - else: - # compute only the last frame query keeping dim: max_time_out -> 1 - assert cache.shape == ( - tgt.shape[0], - tgt.shape[1] - 1, - self.size, - ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" - tgt_q = tgt[:, -1:, :] - residual = residual[:, -1:, :] - tgt_q_mask = tgt_mask[:, -1:, :] - - if self.concat_after: - tgt_concat = torch.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) - x = residual + self.concat_linear1(tgt_concat) - else: - x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - if self.concat_after: - x_concat = torch.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) - x = residual + self.concat_linear2(x_concat) - else: - x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) - if not self.normalize_before: - x = self.norm2(x) - - residual = x - if self.normalize_before: - x = self.norm3(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm3(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, tgt_mask, memory, memory_mask diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/embedding.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/embedding.py deleted file mode 100644 index 611a927864d93c3ad8357f66c780bf537b2a4d67..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/embedding.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Positonal Encoding Module.""" - -import math -from typing import Tuple, Union - -import torch -import torch.nn.functional as F - -class PositionalEncoding(torch.nn.Module): - """Positional encoding. 
- - :param int d_model: embedding dim - :param float dropout_rate: dropout rate - :param int max_len: maximum input length - - PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) - PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) - """ - def __init__(self, - d_model: int, - dropout_rate: float, - max_len: int = 5000, - reverse: bool = False): - """Construct an PositionalEncoding object.""" - super().__init__() - self.d_model = d_model - self.xscale = math.sqrt(self.d_model) - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - - self.pe = torch.zeros(self.max_len, self.d_model) - position = torch.arange(0, self.max_len, - dtype=torch.float32).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = torch.sin(position * div_term) - self.pe[:, 1::2] = torch.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input. Its shape is (batch, time, ...) - offset (int, torch.tensor): position offset - - Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) - torch.Tensor: for compatibility to RelPositionalEncoding - """ - - self.pe = self.pe.to(x.device) - pos_emb = self.position_encoding(offset, x.size(1), False) - x = x * self.xscale + pos_emb - return self.dropout(x), self.dropout(pos_emb) - - def position_encoding(self, offset: Union[int, torch.Tensor], size: int, - apply_dropout: bool = True) -> torch.Tensor: - """ For getting encoding in a streaming fashion - - Attention!!!!! - we apply dropout only once at the whole utterance level in a none - streaming way, but will call this function several times with - increasing input size in a streaming scenario, so the dropout will - be applied several times. - - Args: - offset (int or torch.tensor): start offset - size (int): required size of position encoding - - Returns: - torch.Tensor: Corresponding encoding - """ - # How to subscript a Union type: - # https://github.com/pytorch/pytorch/issues/69434 - if isinstance(offset, int): - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - else: # for batched streaming decoding on GPU - assert torch.max(offset) + size < self.max_len - index = offset.unsqueeze(1) + \ - torch.arange(0, size).to(offset.device) # B X T - flag = index > 0 - # remove negative offset - index = index * flag - pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model - - if apply_dropout: - pos_emb = self.dropout(pos_emb) - return pos_emb - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding module. - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - """ - def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): - """Initialize class.""" - super().__init__(d_model, dropout_rate, max_len, reverse=True) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Compute positional encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). 
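The buffer built in `__init__` above is the classic sinusoidal table. A small, self-contained check (with assumed `d_model=8`, `max_len=16`) that the even/odd columns really match PE(pos, 2i) = sin(pos / 10000^(2i/d_model)) and its cosine counterpart:

```python
import math
import torch

d_model, max_len = 8, 16  # assumed values for illustration

# Rebuild the table the same way the deleted PositionalEncoding.__init__ does
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32)
                     * -(math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)

# Spot-check one entry against PE(pos, 2i) = sin(pos / 10000^(2i / d_model))
pos, i = 5, 2
expected = math.sin(pos / (10000 ** (2 * i / d_model)))
assert abs(pe[pos, 2 * i].item() - expected) < 1e-6
```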
- Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Positional embedding tensor (1, time, `*`). - """ - self.pe = self.pe.to(x.device) - x = x * self.xscale - pos_emb = self.position_encoding(offset, x.size(1), False) - return self.dropout(x), self.dropout(pos_emb) - - -class NoPositionalEncoding(torch.nn.Module): - """ No position encoding - """ - def __init__(self, d_model: int, dropout_rate: float): - super().__init__() - self.d_model = d_model - self.dropout = torch.nn.Dropout(p=dropout_rate) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """ Just return zero vector for interface compatibility - """ - pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) - return self.dropout(x), pos_emb - - def position_encoding( - self, offset: Union[int, torch.Tensor], size: int) -> torch.Tensor: - return torch.zeros(1, size, self.d_model) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/encoder.py deleted file mode 100644 index bb2ec65827548bd1242cb3b367cb3983c2de6119..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/encoder.py +++ /dev/null @@ -1,462 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder definition.""" -from typing import Tuple - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.convolution import ConvolutionModule -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.encoder_layer import TransformerEncoderLayer -from wenet.transformer.encoder_layer import ConformerEncoderLayer -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class BaseEncoder(torch.nn.Module): - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ - Args: - input_size (int): input dim - output_size (int): dimension of attention - attention_heads (int): the number of heads of multi head attention - linear_units (int): the hidden units number of position-wise feed - forward - num_blocks (int): the number of decoder blocks - dropout_rate (float): dropout rate - attention_dropout_rate (float): dropout rate in attention - positional_dropout_rate (float): dropout rate after adding - positional encoding - input_layer (str): input layer type. - optional [linear, conv2d, conv2d6, conv2d8] - pos_enc_layer_type (str): Encoder positional encoding layer type. - opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] - normalize_before (bool): - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after (bool): whether to concat attention layer's input - and output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - static_chunk_size (int): chunk size for static chunk training and - decoding - use_dynamic_chunk (bool): whether use dynamic chunk size for - training or not, You can only use fixed chunk(chunk_size > 0) - or dyanmic chunk size(use_dynamic_chunk = True) - global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module - use_dynamic_left_chunk (bool): whether use dynamic left chunk in - dynamic chunk training - """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks - - -class TransformerEncoder(BaseEncoder): - """Transformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ Construct TransformerEncoder - - See Encoder for the meaning of each parameter. 
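The window arithmetic driving the loop above is easy to sanity-check in isolation. A minimal sketch, assuming a `Conv2dSubsampling4` front-end (`subsampling_rate=4`, `right_context=6`) and `decoding_chunk_size=16`, mirroring the stride/window formulas in `forward_chunk_by_chunk`:

```python
# Assumed values for illustration: Conv2dSubsampling4 front-end
subsampling = 4            # embed.subsampling_rate
right_context = 6          # embed.right_context
decoding_chunk_size = 16   # encoder frames produced per chunk

context = right_context + 1                          # add current frame
stride = subsampling * decoding_chunk_size           # input frames consumed per step
decoding_window = (decoding_chunk_size - 1) * subsampling + context

num_frames = 640  # e.g. 6.4 s of 10 ms frames, also an assumed value
chunk_starts = list(range(0, num_frames - context + 1, stride))
print(stride, decoding_window, len(chunk_starts))    # 64 67 10
```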
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - self.encoders = torch.nn.ModuleList([ - TransformerEncoderLayer( - output_size, - MultiHeadedAttention(attention_heads, output_size, - attention_dropout_rate), - PositionwiseFeedForward(output_size, linear_units, - dropout_rate), dropout_rate, - normalize_before, concat_after) for _ in range(num_blocks) - ]) - - -class ConformerEncoder(BaseEncoder): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - positionwise_conv_kernel_size: int = 1, - macaron_style: bool = True, - selfattention_layer_type: str = "rel_selfattn", - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - ): - """Construct ConformerEncoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - positionwise_conv_kernel_size (int): Kernel size of positionwise - conv1d layer. - macaron_style (bool): Whether to use macaron style for - positionwise layer. - selfattention_layer_type (str): Encoder attention layer type, - the parameter has no effect now, it's just for configure - compatibility. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, - cnn_module_norm, causal) - - self.encoders = torch.nn.ModuleList([ - ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(num_blocks) - ]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/encoder_layer.py deleted file mode 100644 index 6b4629a6802a90422fa1494f82f46488f2553c16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/encoder_layer.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple - -import torch -from torch import nn - - -class TransformerEncoderLayer(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: torch.nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): just for interface compatibility - to ConformerEncoderLayer - mask_pad (torch.Tensor): does not used in transformer layer, - just for unified api with conformer. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2), not used here, it's for interface - compatibility to ConformerEncoderLayer. - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). - - """ - residual = x - if self.normalize_before: - x = self.norm1(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, cache=att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - return x, mask, new_att_cache, fake_cnn_cache - - -class ConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/label_smoothing_loss.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/label_smoothing_loss.py deleted file mode 100644 index 428fedcb0eb4345cd1361c97008a9afcd94ac171..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/label_smoothing_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Label smoothing module.""" - -import torch -from torch import nn - - -class LabelSmoothingLoss(nn.Module): - """Label-smoothing loss. - - In a standard CE loss, the label's data distribution is: - [0,1,2] -> - [ - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0], - ] - - In the smoothing version CE Loss,some probabilities - are taken from the true label prob (1.0) and are divided - among other labels. - - e.g. 
- smoothing=0.1 - [0,1,2] -> - [ - [0.9, 0.05, 0.05], - [0.05, 0.9, 0.05], - [0.05, 0.05, 0.9], - ] - - Args: - size (int): the number of class - padding_idx (int): padding class id which will be ignored for loss - smoothing (float): smoothing rate (0.0 means the conventional CE) - normalize_length (bool): - normalize loss by sequence length if True - normalize loss by batch size if False - """ - def __init__(self, - size: int, - padding_idx: int, - smoothing: float, - normalize_length: bool = False): - """Construct an LabelSmoothingLoss object.""" - super(LabelSmoothingLoss, self).__init__() - self.criterion = nn.KLDivLoss(reduction="none") - self.padding_idx = padding_idx - self.confidence = 1.0 - smoothing - self.smoothing = smoothing - self.size = size - self.normalize_length = normalize_length - - def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """Compute loss between x and target. - - The model outputs and data labels tensors are flatten to - (batch*seqlen, class) shape and a mask is applied to the - padding part which should not be calculated for loss. - - Args: - x (torch.Tensor): prediction (batch, seqlen, class) - target (torch.Tensor): - target signal masked with self.padding_id (batch, seqlen) - Returns: - loss (torch.Tensor) : The KL loss, scalar float value - """ - assert x.size(2) == self.size - batch_size = x.size(0) - x = x.view(-1, self.size) - target = target.view(-1) - # use zeros_like instead of torch.no_grad() for true_dist, - # since no_grad() can not be exported by JIT - true_dist = torch.zeros_like(x) - true_dist.fill_(self.smoothing / (self.size - 1)) - ignore = target == self.padding_idx # (B,) - total = len(target) - ignore.sum().item() - target = target.masked_fill(ignore, 0) # avoid -1 index - true_dist.scatter_(1, target.unsqueeze(1), self.confidence) - kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) - denom = total if self.normalize_length else batch_size - return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/positionwise_feed_forward.py deleted file mode 100644 index 73ba239e3f1e68f65650961f2c4ee6758729a06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/positionwise_feed_forward.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. 
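To make the 0.9 / 0.05 split described above concrete, here is a short, self-contained reproduction of the smoothed target construction from `forward` (vocab size 3, smoothing 0.1, no padding labels), followed by the same KL-divergence call for shape sanity:

```python
import torch

size, smoothing = 3, 0.1
confidence = 1.0 - smoothing

target = torch.tensor([0, 1, 2])   # (batch * seqlen,)
x = torch.randn(1, 3, size)        # dummy logits (batch, seqlen, class)

true_dist = torch.zeros(3, size)
true_dist.fill_(smoothing / (size - 1))                  # 0.05 everywhere
true_dist.scatter_(1, target.unsqueeze(1), confidence)   # 0.9 on the true label
print(true_dist)
# tensor([[0.9000, 0.0500, 0.0500],
#         [0.0500, 0.9000, 0.0500],
#         [0.0500, 0.0500, 0.9000]])

kl = torch.nn.KLDivLoss(reduction="none")(
    torch.log_softmax(x.view(-1, size), dim=1), true_dist)
print(kl.shape)  # torch.Size([3, 3])
```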
- dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU()): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/subsampling.py deleted file mode 100644 index 5f2823eedf0e623188d6af6680fa50ca44b47877..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/subsampling.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch - - -class BaseSubsampling(torch.nn.Module): - def __init__(self): - super().__init__() - self.right_context = 0 - self.subsampling_rate = 1 - - def position_encoding(self, offset: Union[int, torch.Tensor], - size: int) -> torch.Tensor: - return self.pos_enc.position_encoding(offset, size) - - -class LinearNoSubsampling(BaseSubsampling): - """Linear transform the input without subsampling - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an linear object.""" - super().__init__() - self.out = torch.nn.Sequential( - torch.nn.Linear(idim, odim), - torch.nn.LayerNorm(odim, eps=1e-5), - torch.nn.Dropout(dropout_rate), - ) - self.pos_enc = pos_enc_class - self.right_context = 0 - self.subsampling_rate = 1 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Input x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: linear input tensor (#batch, time', odim), - where time' = time . - torch.Tensor: linear input mask (#batch, 1, time'), - where time' = time . - - """ - x = self.out(x) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask - - -class Conv2dSubsampling4(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). 
- - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 4. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 4. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] - - -class Conv2dSubsampling6(BaseSubsampling): - """Convolutional 2D subsampling (to 1/6 length). - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (torch.nn.Module): Custom position encoding layer. - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling6 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 5, 3), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), - odim) - self.pos_enc = pos_enc_class - # 10 = (3 - 1) * 1 + (5 - 1) * 2 - self.subsampling_rate = 6 - self.right_context = 10 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 6. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 6. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] - - -class Conv2dSubsampling8(BaseSubsampling): - """Convolutional 2D subsampling (to 1/8 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
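A small sketch of the 1/4-length arithmetic in `Conv2dSubsampling4` above: two kernel-3, stride-2 convolutions shrink the time axis to `((T - 1) // 2 - 1) // 2`, and the `[:, :, 2::2][:, :, 2::2]` mask slicing yields the same count. The input length `T=103` is an arbitrary assumed value:

```python
import torch

T = 103  # assumed number of input frames

# Time axis after two Conv2d(kernel=3, stride=2) layers, as in Conv2dSubsampling4
t_out = ((T - 1) // 2 - 1) // 2

# The padding mask is subsampled with the same stride pattern as the features
mask = torch.ones(1, 1, T, dtype=torch.bool)
mask_out = mask[:, :, 2::2][:, :, 2::2]

print(t_out, mask_out.size(2))  # both 25
```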
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling8 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear( - odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) - self.pos_enc = pos_enc_class - self.subsampling_rate = 8 - # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 - self.right_context = 14 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 8. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 8. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/swish.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/swish.py deleted file mode 100644 index b4250f5c93104f38958d145572e363256e03fcb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/transformer/swish.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) -# 2020 Northwestern Polytechnical University (Pengcheng Guo) -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Swish() activation function for Conformer.""" - -import torch - - -class Swish(torch.nn.Module): - """Construct an Swish object.""" - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Return Swish activation function.""" - return x * torch.sigmoid(x) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/checkpoint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/checkpoint.py deleted file mode 100644 index 8e0c413c79c34cd667240357d7ef9eab816a885c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/checkpoint.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import re - -import yaml -import torch -from collections import OrderedDict - -import datetime - - -def load_checkpoint(model: torch.nn.Module, path: str) -> dict: - if torch.cuda.is_available(): - logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) - checkpoint = torch.load(path) - else: - logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) - checkpoint = torch.load(path, map_location='cpu') - model.load_state_dict(checkpoint, strict=False) - info_path = re.sub('.pt$', '.yaml', path) - configs = {} - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - return configs - - -def save_checkpoint(model: torch.nn.Module, path: str, infos=None): - ''' - Args: - infos (dict or None): any info you want to save. - ''' - logging.info('Checkpoint: save to checkpoint %s' % path) - if isinstance(model, torch.nn.DataParallel): - state_dict = model.module.state_dict() - elif isinstance(model, torch.nn.parallel.DistributedDataParallel): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save(state_dict, path) - info_path = re.sub('.pt$', '.yaml', path) - if infos is None: - infos = {} - infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') - with open(info_path, 'w') as fout: - data = yaml.dump(infos) - fout.write(data) - - -def filter_modules(model_state_dict, modules): - new_mods = [] - incorrect_mods = [] - mods_model = model_state_dict.keys() - for mod in modules: - if any(key.startswith(mod) for key in mods_model): - new_mods += [mod] - else: - incorrect_mods += [mod] - if incorrect_mods: - logging.warning( - "module(s) %s don't match or (partially match) " - "available modules in model.", - incorrect_mods, - ) - logging.warning("for information, the existing modules in model are:") - logging.warning("%s", mods_model) - - return new_mods - - -def load_trained_modules(model: torch.nn.Module, args: None): - # Load encoder modules with pre-trained model(s). 
- enc_model_path = args.enc_init - enc_modules = args.enc_init_mods - main_state_dict = model.state_dict() - logging.warning("model(s) found for pre-initialization") - if os.path.isfile(enc_model_path): - logging.info('Checkpoint: loading from checkpoint %s for CPU' % - enc_model_path) - model_state_dict = torch.load(enc_model_path, map_location='cpu') - modules = filter_modules(model_state_dict, enc_modules) - partial_state_dict = OrderedDict() - for key, value in model_state_dict.items(): - if any(key.startswith(m) for m in modules): - partial_state_dict[key] = value - main_state_dict.update(partial_state_dict) - else: - logging.warning("model was not found : %s", enc_model_path) - - model.load_state_dict(main_state_dict) - configs = {} - return configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/cmvn.py deleted file mode 100644 index 3101c619f54991c947124f393f3459c317356a2f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/cmvn.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
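A condensed, runnable sketch of the round trip performed by `save_checkpoint` / `load_checkpoint` above, using a throwaway linear model, a temporary directory, and made-up info fields (the real helpers additionally handle `DataParallel` / `DistributedDataParallel` wrappers and GPU loading):

```python
import datetime
import os
import re
import tempfile

import torch
import yaml

model = torch.nn.Linear(4, 2)                      # stand-in for the ASR model
path = os.path.join(tempfile.mkdtemp(), "demo.pt")

# What save_checkpoint does: dump the state_dict plus a sibling .yaml of infos
torch.save(model.state_dict(), path)
infos = {"epoch": 3,
         "save_time": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")}
with open(re.sub(".pt$", ".yaml", path), "w") as fout:
    fout.write(yaml.dump(infos))

# What load_checkpoint does: restore weights (strict=False) and return the yaml dict
model.load_state_dict(torch.load(path, map_location="cpu"), strict=False)
with open(re.sub(".pt$", ".yaml", path)) as fin:
    print(yaml.load(fin, Loader=yaml.FullLoader))
```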
- -import json -import math - -import numpy as np - - -def _load_json_cmvn(json_cmvn_file): - """ Load the json format cmvn stats file and calculate cmvn - - Args: - json_cmvn_file: cmvn stats file in json format - - Returns: - a numpy array of [means, vars] - """ - with open(json_cmvn_file) as f: - cmvn_stats = json.load(f) - - means = cmvn_stats['mean_stat'] - variance = cmvn_stats['var_stat'] - count = cmvn_stats['frame_num'] - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def _load_kaldi_cmvn(kaldi_cmvn_file): - """ Load the kaldi format cmvn stats file and calculate cmvn - - Args: - kaldi_cmvn_file: kaldi text style global cmvn file, which - is generated by: - compute-cmvn-stats --binary=false scp:feats.scp global_cmvn - - Returns: - a numpy array of [means, vars] - """ - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def load_cmvn(cmvn_file, is_json): - if is_json: - cmvn = _load_json_cmvn(cmvn_file) - else: - cmvn = _load_kaldi_cmvn(cmvn_file) - return cmvn[0], cmvn[1] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/common.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/common.py deleted file mode 100644 index 74238d59aefbf227fe6b811703af17550bc7f8f0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/common.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -"""Unility functions for Transformer.""" - -import math -from typing import List, Tuple - -import torch -from torch.nn.utils.rnn import pad_sequence - -IGNORE_ID = -1 - - -def pad_list(xs: List[torch.Tensor], pad_value: int): - """Perform padding for the list of tensors. - - Args: - xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 
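To make the CMVN arithmetic above concrete: the stats file holds per-dimension sums, squared sums, and a frame count, and the loader turns them into a mean and an inverse standard deviation. A tiny numeric sketch with made-up two-dimensional stats (the `GlobalCMVN` module, not shown in this hunk, then applies `(x - mean) * istd`):

```python
import math
import numpy as np

# Made-up stats in the spirit of a json cmvn file: 2 frames, 2 feature dims
cmvn_stats = {"mean_stat": [10.0, 40.0], "var_stat": [52.0, 1000.0], "frame_num": 2}

means = list(cmvn_stats["mean_stat"])
variance = list(cmvn_stats["var_stat"])
count = cmvn_stats["frame_num"]
for i in range(len(means)):
    means[i] /= count                                         # E[x]
    variance[i] = variance[i] / count - means[i] ** 2         # E[x^2] - E[x]^2
    variance[i] = 1.0 / math.sqrt(max(variance[i], 1.0e-20))  # istd, floored
print(means, variance)   # [5.0, 20.0] [1.0, 0.1]

x = np.array([[4.0, 10.0], [6.0, 30.0]])
print((x - np.array(means)) * np.array(variance))  # [[-1. -1.] [ 1.  1.]]
```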
- pad_value (float): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tmax, `*`). - - Examples: - >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) - - """ - n_batch = len(xs) - max_len = max([x.size(0) for x in xs]) - pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) - pad = pad.fill_(pad_value) - for i in range(n_batch): - pad[i, :xs[i].size(0)] = xs[i] - - return pad - - -def add_blank(ys_pad: torch.Tensor, blank: int, - ignore_id: int) -> torch.Tensor: - """ Prepad blank for transducer predictor - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - blank (int): index of - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> blank = 0 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in = add_blank(ys_pad, 0, -1) - >>> ys_in - tensor([[0, 1, 2, 3, 4, 5], - [0, 4, 5, 6, 0, 0], - [0, 7, 8, 9, 0, 0]]) - """ - bs = ys_pad.size(0) - _blank = torch.tensor([blank], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _blank = _blank.repeat(bs).unsqueeze(1) # [bs,1] - out = torch.cat([_blank, ys_pad], dim=1) # [bs, Lmax+1] - return torch.where(out == ignore_id, blank, out) - - -def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, - ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Add and labels. - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - sos (int): index of - eos (int): index of - ignore_id (int): index of padding - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - ys_out (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> sos_id = 10 - >>> eos_id = 11 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) - >>> ys_in - tensor([[10, 1, 2, 3, 4, 5], - [10, 4, 5, 6, 11, 11], - [10, 7, 8, 9, 11, 11]]) - >>> ys_out - tensor([[ 1, 2, 3, 4, 5, 11], - [ 4, 5, 6, 11, -1, -1], - [ 7, 8, 9, 11, -1, -1]]) - """ - _sos = torch.tensor([sos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _eos = torch.tensor([eos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys - ys_in = [torch.cat([_sos, y], dim=0) for y in ys] - ys_out = [torch.cat([y, _eos], dim=0) for y in ys] - return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) - - -def reverse_pad_list(ys_pad: torch.Tensor, - ys_lens: torch.Tensor, - pad_value: float = -1.0) -> torch.Tensor: - """Reverse padding for the list of tensors. - - Args: - ys_pad (tensor): The padded tensor (B, Tokenmax). - ys_lens (tensor): The lens of token seqs (B) - pad_value (int): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tokenmax). - - Examples: - >>> x - tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) - >>> pad_list(x, 0) - tensor([[4, 3, 2, 1], - [7, 6, 5, 0], - [9, 8, 0, 0]]) - - """ - r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) - for y, i in zip(ys_pad, ys_lens)], True, - pad_value) - return r_ys_pad - - -def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, - ignore_label: int) -> float: - """Calculate accuracy. - - Args: - pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 
- pad_targets (LongTensor): Target label tensors (B, Lmax). - ignore_label (int): Ignore label id. - - Returns: - float: Accuracy value (0.0 - 1.0). - - """ - pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), - pad_outputs.size(1)).argmax(2) - mask = pad_targets != ignore_label - numerator = torch.sum( - pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - denominator = torch.sum(mask) - return float(numerator) / float(denominator) - - -def get_rnn(rnn_type: str) -> torch.nn.Module: - assert rnn_type in ["rnn", "lstm", "gru"] - if rnn_type == "rnn": - return torch.nn.RNN - elif rnn_type == "lstm": - return torch.nn.LSTM - else: - return torch.nn.GRU - - -def get_activation(act): - """Return activation function.""" - # Lazy load to avoid unused import - from wenet.transformer.swish import Swish - - activation_funcs = { - "hardtanh": torch.nn.Hardtanh, - "tanh": torch.nn.Tanh, - "relu": torch.nn.ReLU, - "selu": torch.nn.SELU, - "swish": getattr(torch.nn, "SiLU", Swish), - "gelu": torch.nn.GELU - } - - return activation_funcs[act]() - - -def get_subsample(config): - input_layer = config["encoder_conf"]["input_layer"] - assert input_layer in ["conv2d", "conv2d6", "conv2d8"] - if input_layer == "conv2d": - return 4 - elif input_layer == "conv2d6": - return 6 - elif input_layer == "conv2d8": - return 8 - - -def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - if hyp[cur] != 0: - new_hyp.append(hyp[cur]) - prev = cur - while cur < len(hyp) and hyp[cur] == hyp[prev]: - cur += 1 - return new_hyp - - -def replace_duplicates_with_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - new_hyp.append(hyp[cur]) - prev = cur - cur += 1 - while cur < len(hyp) and hyp[cur] == hyp[prev] and hyp[cur] != 0: - new_hyp.append(0) - cur += 1 - return new_hyp - - -def log_add(args: List[int]) -> float: - """ - Stable log add - """ - if all(a == -float('inf') for a in args): - return -float('inf') - a_max = max(args) - lsp = math.log(sum(math.exp(a - a_max) for a in args)) - return a_max + lsp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/config.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/config.py deleted file mode 100644 index 50170ced44534d3ee6532a2f87fcd78c5148f7e7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 Shaoshang Qi -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import copy - -def override_config(configs, override_list): - new_configs = copy.deepcopy(configs) - for item in override_list: - arr = item.split() - if len(arr) != 2: - print(f"the overrive {item} format not correct, skip it") - continue - keys = arr[0].split('.') - s_configs = new_configs - for i, key in enumerate(keys): - if key not in s_configs: - print(f"the overrive {item} format not correct, skip it") - if i == len(keys) - 1: - param_type = type(s_configs[key]) - if param_type != bool: - s_configs[key] = param_type(arr[1]) - else: - s_configs[key] = arr[1] in ['true', 'True'] - print(f"override {arr[0]} with {arr[1]}") - else: - s_configs = s_configs[key] - return new_configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/ctc_util.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/ctc_util.py deleted file mode 100644 index 73b8fb272ac153dd6d05207f352ebcf1ad14890d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/ctc_util.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch - -def insert_blank(label, blank_id=0): - """Insert blank token between every two label token.""" - label = np.expand_dims(label, 1) - blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id - label = np.concatenate([blanks, label], axis=1) - label = label.reshape(-1) - label = np.append(label, label[0]) - return label - -def forced_align(ctc_probs: torch.Tensor, - y: torch.Tensor, - blank_id=0) -> list: - """ctc forced alignment. 
- - Args: - torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) - torch.Tensor y: id sequence tensor 1d tensor (L) - int blank_id: blank symbol index - Returns: - torch.Tensor: alignment result - """ - y_insert_blank = insert_blank(y, blank_id) - - log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) - log_alpha = log_alpha - float('inf') # log of zero - state_path = (torch.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 - ) # state path - - # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] - - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): - if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ - s] == y_insert_blank[s - 2]: - candidates = torch.tensor( - [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) - prev_state = [s, s - 1] - else: - candidates = torch.tensor([ - log_alpha[t - 1, s], - log_alpha[t - 1, s - 1], - log_alpha[t - 1, s - 2], - ]) - prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] - state_path[t, s] = prev_state[torch.argmax(candidates)] - - state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) - - candidates = torch.tensor([ - log_alpha[-1, len(y_insert_blank) - 1], - log_alpha[-1, len(y_insert_blank) - 2] - ]) - prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] - state_seq[-1] = prev_state[torch.argmax(candidates)] - for t in range(ctc_probs.size(0) - 2, -1, -1): - state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] - - output_alignment = [] - for t in range(0, ctc_probs.size(0)): - output_alignment.append(y_insert_blank[state_seq[t, 0]]) - - return output_alignment diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/executor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/executor.py deleted file mode 100644 index dc0b69e6e32055566a0e8c41945f6979276e5672..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/executor.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -from contextlib import nullcontext - -# if your python version < 3.7 use the below one -# from contextlib import suppress as nullcontext -import torch -from torch.nn.utils import clip_grad_norm_ - - -class Executor: - - def __init__(self): - self.step = 0 - - def train(self, model, optimizer, scheduler, data_loader, device, writer, - args, scaler): - ''' Train one epoch - ''' - model.train() - clip = args.get('grad_clip', 50.0) - log_interval = args.get('log_interval', 10) - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - accum_grad = args.get('accum_grad', 1) - is_distributed = args.get('is_distributed', True) - use_amp = args.get('use_amp', False) - logging.info('using accumulate grad, new batch size is {} times' - ' larger than before'.format(accum_grad)) - if use_amp: - assert scaler is not None - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - model_context = model.join - else: - model_context = nullcontext - num_seen_utts = 0 - with model_context(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - context = None - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if is_distributed and batch_idx % accum_grad != 0: - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - with context(): - # autocast context - # The more details about amp can be found in - # https://pytorch.org/docs/stable/notes/amp_examples.html - with torch.cuda.amp.autocast(scaler is not None): - loss_dict = model(feats, feats_lengths, target, - target_lengths) - loss = loss_dict['loss'] / accum_grad - if use_amp: - scaler.scale(loss).backward() - else: - loss.backward() - - num_seen_utts += num_utts - if batch_idx % accum_grad == 0: - if rank == 0 and writer is not None: - writer.add_scalar('train_loss', loss, self.step) - # Use mixed precision training - if use_amp: - scaler.unscale_(optimizer) - grad_norm = clip_grad_norm_(model.parameters(), clip) - # Must invoke scaler.update() if unscale_() is used in - # the iteration to avoid the following error: - # RuntimeError: unscale_() has already been called - # on this optimizer since the last update(). - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). 
- scaler.step(optimizer) - scaler.update() - else: - grad_norm = clip_grad_norm_(model.parameters(), clip) - if torch.isfinite(grad_norm): - optimizer.step() - optimizer.zero_grad() - scheduler.step() - self.step += 1 - if batch_idx % log_interval == 0: - lr = optimizer.param_groups[0]['lr'] - log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, - loss.item() * accum_grad) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'lr {:.8f} rank {}'.format(lr, rank) - logging.debug(log_str) - - def cv(self, model, data_loader, device, args): - ''' Cross validation on - ''' - model.eval() - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - log_interval = args.get('log_interval', 10) - # in order to avoid division by 0 - num_seen_utts = 1 - total_loss = 0.0 - with torch.no_grad(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - loss_dict = model(feats, feats_lengths, target, target_lengths) - loss = loss_dict['loss'] - if torch.isfinite(loss): - num_seen_utts += num_utts - total_loss += loss.item() * num_utts - if batch_idx % log_interval == 0: - log_str = 'CV Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, loss.item()) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'history loss {:.6f}'.format(total_loss / - num_seen_utts) - log_str += ' rank {}'.format(rank) - logging.debug(log_str) - return total_loss, num_seen_utts diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/file_utils.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/file_utils.py deleted file mode 100644 index 7b7e516cc61f759267f4ef09309ff0b45110a0c1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/file_utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re - - -def read_lists(list_file): - lists = [] - with open(list_file, 'r', encoding='utf8') as fin: - for line in fin: - lists.append(line.strip()) - return lists - - -def read_non_lang_symbols(non_lang_sym_path): - """read non-linguistic symbol from file. - - The file format is like below: - - {NOISE}\n - {BRK}\n - ... - - - Args: - non_lang_sym_path: non-linguistic symbol file path, None means no any - syms. 
- - """ - if non_lang_sym_path is None: - return None - else: - syms = read_lists(non_lang_sym_path) - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - for sym in syms: - if non_lang_syms_pattern.fullmatch(sym) is None: - class BadSymbolFormat(Exception): - pass - raise BadSymbolFormat( - "Non-linguistic symbols should be " - "formatted in {xxx}//[xxx], consider" - " modify '%s' to meet the requirment. " - "More details can be found in discussions here : " - "https://github.com/wenet-e2e/wenet/pull/819" % (sym)) - return syms - - -def read_symbol_table(symbol_table_file): - symbol_table = {} - with open(symbol_table_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - symbol_table[arr[0]] = int(arr[1]) - return symbol_table diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/init_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/init_model.py deleted file mode 100644 index 4a008183ee25cd88b2fa25d93bdc3f9e3a55d31a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/init_model.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -from wenet.transducer.joint import TransducerJoint -from wenet.transducer.predictor import (ConvPredictor, EmbeddingPredictor, - RNNPredictor) -from wenet.transducer.transducer import Transducer -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.cmvn import GlobalCMVN -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.encoder import ConformerEncoder, TransformerEncoder -from wenet.squeezeformer.encoder import SqueezeformerEncoder -from wenet.efficient_conformer.encoder import EfficientConformerEncoder -from wenet.utils.cmvn import load_cmvn - - -def init_model(configs): - if configs['cmvn_file'] is not None: - mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) - global_cmvn = GlobalCMVN( - torch.from_numpy(mean).float(), - torch.from_numpy(istd).float()) - else: - global_cmvn = None - - input_dim = configs['input_dim'] - vocab_size = configs['output_dim'] - - encoder_type = configs.get('encoder', 'conformer') - decoder_type = configs.get('decoder', 'bitransformer') - - if encoder_type == 'conformer': - encoder = ConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'squeezeformer': - encoder = SqueezeformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'efficientConformer': - encoder = EfficientConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf'], - **configs['encoder_conf']['efficient_conf'] - if 'efficient_conf' in - configs['encoder_conf'] else {}) - else: - encoder = TransformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - if decoder_type == 'transformer': - decoder = TransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - else: - assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 - assert configs['decoder_conf']['r_num_blocks'] > 0 - decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - ctc = CTC(vocab_size, encoder.output_size()) - - # Init joint CTC/Attention or Transducer model - if 'predictor' in configs: - predictor_type = configs.get('predictor', 'rnn') - if predictor_type == 'rnn': - predictor = RNNPredictor(vocab_size, **configs['predictor_conf']) - elif predictor_type == 'embedding': - predictor = EmbeddingPredictor(vocab_size, - **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - elif predictor_type == 'conv': - predictor = ConvPredictor(vocab_size, **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - else: - raise NotImplementedError( - "only rnn, embedding and conv type support now") - configs['joint_conf']['enc_output_size'] = configs['encoder_conf'][ - 'output_size'] - configs['joint_conf']['pred_output_size'] = configs['predictor_conf'][ - 'output_size'] - joint = TransducerJoint(vocab_size, **configs['joint_conf']) - model = Transducer(vocab_size=vocab_size, - blank=0, - predictor=predictor, - encoder=encoder, - attention_decoder=decoder, - joint=joint, - ctc=ctc, - **configs['model_conf']) - else: - model = ASRModel(vocab_size=vocab_size, - encoder=encoder, - decoder=decoder, - ctc=ctc, - **configs['model_conf']) - return model diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/mask.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/mask.py deleted file mode 100644 index 2985006ab2bc2d27a9b8adaeb863cc44ca6a0d24..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/mask.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -''' -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. - - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - ret = torch.ones(size, size, device=device, dtype=torch.bool) - return torch.tril(ret) -''' - -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. 
- - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - arange = torch.arange(size, device=device) - mask = arange.expand(size, size) - arange = arange.unsqueeze(-1) - mask = mask <= arange - return mask - - -def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size) with chunk size, - this is for streaming encoder - - Args: - size (int): size of mask - chunk_size (int): size of chunk - num_left_chunks (int): number of left chunks - <0: use full chunk - >=0: use num_left_chunks - device (torch.device): "cpu" or "cuda" or torch.Tensor.device - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_chunk_mask(4, 2) - [[1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]] - """ - ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) - ret[i, start:ending] = True - return ret - - -def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, - use_dynamic_chunk: bool, - use_dynamic_left_chunk: bool, - decoding_chunk_size: int, static_chunk_size: int, - num_decoding_left_chunks: int): - """ Apply optional mask for encoder. - - Args: - xs (torch.Tensor): padded input, (B, L, D), L for max length - mask (torch.Tensor): mask for xs, (B, 1, L) - use_dynamic_chunk (bool): whether to use dynamic chunk or not - use_dynamic_left_chunk (bool): whether to use dynamic left chunk for - training. - decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - static_chunk_size (int): chunk size for static chunk training/decoding - if it's greater than 0, if use_dynamic_chunk is true, - this parameter will be ignored - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. - >=0: use num_decoding_left_chunks - <0: use all left chunks - - Returns: - torch.Tensor: chunk mask of the input xs. - """ - # Whether to use chunk mask or not - if use_dynamic_chunk: - max_len = xs.size(1) - if decoding_chunk_size < 0: - chunk_size = max_len - num_left_chunks = -1 - elif decoding_chunk_size > 0: - chunk_size = decoding_chunk_size - num_left_chunks = num_decoding_left_chunks - else: - # chunk size is either [1, 25] or full context(max_len). - # Since we use 4 times subsampling and allow up to 1s(100 frames) - # delay, the maximum frame is 100 / 4 = 25. 
- chunk_size = torch.randint(1, max_len, (1, )).item() - num_left_chunks = -1 - if chunk_size > max_len // 2: - chunk_size = max_len - else: - chunk_size = chunk_size % 25 + 1 - if use_dynamic_left_chunk: - max_left_chunks = (max_len - 1) // chunk_size - num_left_chunks = torch.randint(0, max_left_chunks, - (1, )).item() - chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - elif static_chunk_size > 0: - num_left_chunks = num_decoding_left_chunks - chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - else: - chunk_masks = masks - return chunk_masks - - -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: Mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - batch_size = lengths.size(0) - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = torch.arange(0, - max_len, - dtype=torch.int64, - device=lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand - return mask - - -def make_non_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """Make mask tensor containing indices of non-padded part. - - The sequences in a batch may have different lengths. To enable - batch computing, padding is need to make all sequence in same - size. To avoid the padding part pass value to context dependent - block such as attention or convolution , this padding part is - masked. - - This pad_mask is used in both encoder and decoder. - - 1 for non-padded part and 0 for padded part. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] - """ - return ~make_pad_mask(lengths) - - -def mask_finished_scores(score: torch.Tensor, - flag: torch.Tensor) -> torch.Tensor: - """ - If a sequence is finished, we only allow one alive branch. This function - aims to give one branch a zero score and the rest -inf score. - - Args: - score (torch.Tensor): A real value array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size, beam_size). 
- """ - beam_size = score.size(-1) - zero_mask = torch.zeros_like(flag, dtype=torch.bool) - if beam_size > 1: - unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])), - dim=1) - finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])), - dim=1) - else: - unfinished = zero_mask - finished = flag - score.masked_fill_(unfinished, -float('inf')) - score.masked_fill_(finished, 0) - return score - - -def mask_finished_preds(pred: torch.Tensor, flag: torch.Tensor, - eos: int) -> torch.Tensor: - """ - If a sequence is finished, all of its branch should be - - Args: - pred (torch.Tensor): A int array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size). - """ - beam_size = pred.size(-1) - finished = flag.repeat([1, beam_size]) - return pred.masked_fill_(finished, eos) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/scheduler.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/scheduler.py deleted file mode 100644 index c418a731dec0041a238787bbba23102dba8db5e5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/rnnt/wenet/utils/scheduler.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -# NeMo(https://github.com/NVIDIA/NeMo) - -from typing import Union - -import math -import warnings -import torch -from torch.optim.lr_scheduler import _LRScheduler - -from typeguard import check_argument_types - - -class WarmupLR(_LRScheduler): - """The WarmupLR scheduler - - This scheduler is almost same as NoamLR Scheduler except for following - difference: - - NoamLR: - lr = optimizer.lr * model_size ** -0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - WarmupLR: - lr = optimizer.lr * warmup_step ** 0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - - Note that the maximum lr equals to optimizer.lr in this scheduler. 
- - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_steps: Union[int, float] = 25000, - last_epoch: int = -1, - ): - assert check_argument_types() - self.warmup_steps = warmup_steps - - # __init__() must be invoked before setting field - # because step() is also invoked in __init__() - super().__init__(optimizer, last_epoch) - - def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" - - def get_lr(self): - step_num = self.last_epoch + 1 - if self.warmup_steps == 0: - return [ - lr * step_num ** -0.5 - for lr in self.base_lrs - ] - else: - return [ - lr - * self.warmup_steps ** 0.5 - * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) - for lr in self.base_lrs - ] - - def set_step(self, step: int): - self.last_epoch = step - - -class WarmupPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1): - assert not (warmup_steps is not None and warmup_ratio is not None),\ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class SquareRootConstantPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use particular number of step or ratio" - assert constant_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. 
- self.max_steps = max_steps - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.constant_lr = 1 / (constant_steps ** 0.5) - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.constant_steps: - return [self.constant_lr for _ in self.base_lrs] - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class WarmupHoldPolicy(WarmupPolicy): - """Variant of WarmupPolicy which maintains high - learning rate for a defined number of steps. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - hold_steps=None, - hold_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (hold_steps is not None and hold_ratio is not None), \ - "Either use particular number of step or ratio" - assert hold_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - self.min_lr = min_lr - self._last_warmup_lr = 0.0 - - # Necessary to duplicate as class attributes are hidden in inner class - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if hold_steps is not None: - self.hold_steps = hold_steps + self.warmup_steps - elif hold_ratio is not None: - self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps - else: - self.hold_steps = 0 - - super().__init__( - optimizer, - warmup_steps=warmup_steps, - warmup_ratio=warmup_ratio, - max_steps=max_steps, - last_epoch=last_epoch, - min_lr=min_lr, - ) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed by the scheduler," - " " "please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup phase - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - # Hold phase - if (step >= self.warmup_steps) and (step < self.hold_steps): - return self.base_lrs - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - -class WarmupAnnealHoldPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - min_lr: Minimum lr to hold the learning rate after decay at. - constant_steps: Number of steps to keep lr constant at. 
- constant_ratio: Ratio of steps to keep lr constant. - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - constant_steps=None, - constant_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use constant_steps or constant_ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup steps - if self.warmup_steps > 0 and step <= self.warmup_steps: - return self._get_warmup_lr(step) - - # Constant steps after warmup and decay - if self.constant_steps > 0 and ( - self.warmup_steps + self.decay_steps) < step <= self.max_steps: - return self._get_constant_lr(step) - - # Min lr after max steps of updates - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_constant_lr(self, step): - return [self.min_lr for _ in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -def _squareroot_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 0.5 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _square_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 2 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _cosine_annealing(initial_lr, step, max_steps, min_lr): - mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) - out_lr = (initial_lr - min_lr) * mult + min_lr - return out_lr - - -def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, - decay_steps, min_lr): - assert max_lr > min_lr - # Use linear warmup for the initial part. - if warmup_steps > 0 and step <= warmup_steps: - return max_lr * float(step) / float(warmup_steps) - - # For any steps larger than `decay_steps`, use `min_lr`. - if step > warmup_steps + decay_steps: - return min_lr - - # If we are done with the warmup period, use the decay style. 
- num_steps_ = step - warmup_steps - decay_steps_ = decay_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = max_lr - min_lr - - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - - return min_lr + coeff * delta_lr - - -def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): - if cycle: - multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) - decay_steps *= multiplier - else: - step = min(step, decay_steps) - p = step / decay_steps - lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) - lr += min_lr - return lr - - -def _noam_hold_annealing(initial_lr, step, warmup_steps, - hold_steps, decay_rate, min_lr): - # hold_steps = total number of steps - # to hold the LR, not the warmup + hold steps. - T_warmup_decay = max(1, warmup_steps ** decay_rate) - T_hold_decay = max(1, (step - hold_steps) ** decay_rate) - lr = (initial_lr * T_warmup_decay) / T_hold_decay - lr = max(lr, min_lr) - return lr - - -class SquareAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _square_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class SquareRootAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _squareroot_annealing(initial_lr=initial_lr, step=step, - max_steps=self.max_steps, min_lr=self.min_lr) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - if self.constant_steps is None or self.constant_steps == 0: - new_lrs = [ - _cosine_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - else: - new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) - return new_lrs - - def _get_warmup_lr(self, step): - if self.constant_steps is None or self.constant_steps == 0: - return super()._get_warmup_lr(step) - else: - # Use linear warmup for the initial part. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_constant_lr(self, step): - # Only called when `constant_steps` > 0. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_linear_warmup_with_cosine_annealing_lr(self, step): - # Cosine Schedule for Megatron LM, - # slightly different warmup schedule + constant LR at the end. 
- new_lrs = [ - _linear_warmup_with_cosine_annealing( - max_lr=self.base_lrs[0], - warmup_steps=self.warmup_steps, - step=step, - decay_steps=self.decay_steps, - min_lr=self.min_lr, - ) - for _ in self.base_lrs - ] - return new_lrs - - -class NoamAnnealing(_LRScheduler): - def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - self._normalize = d_model ** (-0.5) - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = max(1, self.last_epoch) - - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - new_lrs = [self._noam_annealing(initial_lr=initial_lr, step=step) for - initial_lr in self.base_lrs] - return new_lrs - - def _noam_annealing(self, initial_lr, step): - if self.warmup_steps > 0: - mult = self._normalize * min(step ** (-0.5), - step * (self.warmup_steps ** (-1.5))) - else: - mult = self._normalize * step ** (-0.5) - - out_lr = initial_lr * mult - if step > self.warmup_steps: - out_lr = max(out_lr, self.min_lr) - return out_lr - - -class NoamHoldAnnealing(WarmupHoldPolicy): - def __init__(self, optimizer, *, max_steps, decay_rate=0.5, min_lr=0.0, - last_epoch=-1, **kwargs): - """ - From Nemo: - Implementation of the Noam Hold Annealing policy - from the SqueezeFormer paper. - - Unlike NoamAnnealing, the peak learning rate - can be explicitly set for this scheduler. - The schedule first performs linear warmup, - then holds the peak LR, then decays with some schedule for - the remainder of the steps. - Therefore the min-lr is still dependent - on the hyper parameters selected. - - It's schedule is determined by three factors- - - Warmup Steps: Initial stage, where linear warmup - occurs uptil the peak LR is reached. Unlike NoamAnnealing, - the peak LR is explicitly stated here instead of a scaling factor. - - Hold Steps: Intermediate stage, where the peak LR - is maintained for some number of steps. In this region, - the high peak LR allows the model to converge faster - if training is stable. However the high LR - may also cause instability during training. - Should usually be a significant fraction of training - steps (around 30-40% of the entire training steps). - - Decay Steps: Final stage, where the LR rapidly decays - with some scaling rate (set by decay rate). - To attain Noam decay, use 0.5, - for Squeezeformer recommended decay, use 1.0. - The fast decay after prolonged high LR during - hold phase allows for rapid convergence. 
- - References: - - [Squeezeformer: - An Efficient Transformer for Automatic Speech Recognition] - (https://arxiv.org/abs/2206.00888) - - Args: - optimizer: Pytorch compatible Optimizer object. - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - decay_rate: Float value describing the polynomial decay - after the hold period. Default value - of 0.5 corresponds to Noam decay. - min_lr: Minimum learning rate. - """ - self.decay_rate = decay_rate - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - if self.warmup_steps is None or self.warmup_steps == 0: - raise ValueError( - "Noam scheduler cannot be used without warmup steps") - - if self.hold_steps > 0: - hold_steps = self.hold_steps - self.warmup_steps - else: - hold_steps = 0 - - new_lrs = [ - _noam_hold_annealing( - initial_lr, - step=step, - warmup_steps=self.warmup_steps, - hold_steps=hold_steps, - decay_rate=self.decay_rate, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - def set_step(self, step: int): - self.last_epoch = step diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/README.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/README.md deleted file mode 100644 index 484de3773aa5b05a089b5be3f329037fc7a499b2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/README.md +++ /dev/null @@ -1,291 +0,0 @@ -# Performance Record - -## Conformer Result Bidecoder (large) - -* Encoder FLOPs(30s): 96,238,430,720, params: 85,709,704 -* Feature info: using fbank feature, cmvn, dither, online speed perturb -* Training info: train_conformer_bidecoder_large.yaml, kernel size 31, lr 0.002, batch size 12, 8 gpu, acc_grad 4, 120 epochs, dither 1.0 -* Decoding info: ctc_weight 0.3, reverse weight 0.5, average_num 30 -* Git hash: 65270043fc8c2476d1ab95e7c39f730017a670e0 -* LM-tgmed: [3-gram.pruned.1e-7.arpa.gz](http://www.openslr.org/resources/11/3-gram.pruned.1e-7.arpa.gz) -* LM-tglarge: [3-gram.arpa.gz](http://www.openslr.org/resources/11/3-gram.arpa.gz) -* LM-fglarge: [4-gram.arpa.gz](http://www.openslr.org/resources/11/4-gram.arpa.gz) - -| decoding mode | test clean | test other | -|----------------------------------|------------|------------| -| ctc prefix beam search | 2.96 | 7.14 | -| attention rescoring | 2.66 | 6.53 | -| LM-tgmed + attention rescoring | 2.78 | 6.32 | -| LM-tglarge + attention rescoring | 2.68 | 6.10 | -| LM-fglarge + attention rescoring | 2.65 | 5.98 | - -## SqueezeFormer Result (U2++, FFN:2048) - -* Encoder info: - * SM12, reduce_idx 5, recover_idx 11, conv1d, batch_norm, syncbn - * encoder_dim 512, output_size 512, head 8, ffn_dim 512*4=2048 - * Encoder FLOPs(30s): 82,283,704,832, params: 85,984,648 -* Feature info: - * using fbank feature, cmvn, dither, online speed perturb, spec_aug -* Training info: - * train_squeezeformer_bidecoder_large.yaml, kernel size 31 - * batch size 12, 8 gpu, acc_grad 4, 120 epochs, dither 1.0 - * adamw, lr 8e-4, NoamHold, warmup 0.2, hold 0.3, lr_decay 1.0 -* Decoding info: - * ctc_weight 0.3, reverse weight 0.5, average_num 30 - -| decoding mode | dev clean | 
dev other | test clean | test other | -|----------------------------------|-----------|-----------|------------|------------| -| ctc greedy search | 2.55 | 6.62 | 2.73 | 6.59 | -| ctc prefix beam search | 2.53 | 6.60 | 2.72 | 6.52 | -| attention decoder | 2.93 | 6.56 | 3.31 | 6.47 | -| attention rescoring | 2.19 | 6.06 | 2.45 | 5.85 | - -## Conformer Result - -* Encoder FLOPs(30s): 34,085,088,512, params: 34,761,608 -* Feature info: using fbank feature, cmvn, dither, online speed perturb -* Training info: train_conformer.yaml, kernel size 31, lr 0.004, batch size 12, 8 gpu, acc_grad 4, 120 epochs, dither 0.1 -* Decoding info: ctc_weight 0.5, average_num 30 -* Git hash: 90d9a559840e765e82119ab72a11a1f7c1a01b78 -* LM-fglarge: [4-gram.arpa.gz](http://www.openslr.org/resources/11/4-gram.arpa.gz) - -| decoding mode | test clean | test other | -|----------------------------------|------------|------------| -| ctc greedy search | 3.51 | 9.57 | -| ctc prefix beam search | 3.51 | 9.56 | -| attention decoder | 3.05 | 8.36 | -| attention rescoring | 3.18 | 8.72 | -| attention rescoring (beam 50) | 3.12 | 8.55 | -| LM-fglarge + attention rescoring | 3.09 | 7.40 | - -## Conformer Result (12 layers, FFN:2048) -* Encoder FLOPs(30s): 34,085,088,512, params: 34,761,608 -* Feature info: using fbank feature, cmvn, dither, online speed perturb -* Training info: train_squeezeformer.yaml, kernel size 31, -* batch size 12, 8 gpu, acc_grad 4, 120 epochs, dither 0.1 -* AdamW, lr 1e-3, NoamHold, warmup 0.2, hold 0.3, lr_decay 1.0 -* Decoding info: ctc_weight 0.3, reverse weight 0.5, average_num 30 - -| decoding mode | dev clean | dev other | test clean | test other | -|----------------------------------|-----------|-----------|------------|------------| -| ctc greedy search | 3.49 | 9.59 | 3.66 | 9.59 | -| ctc prefix beam search | 3.49 | 9.61 | 3.66 | 9.55 | -| attention decoder | 3.52 | 9.04 | 3.85 | 8.97 | -| attention rescoring | 3.10 | 8.91 | 3.29 | 8.81 | - -## SqueezeFormer Result (SM12, FFN:1024) -* Encoder info: - * SM12, reduce_idx 5, recover_idx 11, conv2d, w/o syncbn - * encoder_dim 256, output_size 256, head 4, ffn_dim 256*4=1024 - * Encoder FLOPs(30s): 21,158,877,440, params: 22,219,912 -* Feature info: - * using fbank feature, cmvn, dither, online speed perturb -* Training info: - * train_squeezeformer.yaml, kernel size 31, - * batch size 12, 8 gpu, acc_grad 4, 120 epochs, dither 0.1 - * adamw, lr=1e-3, noamhold, warmup=0.2, hold=0.3, lr_decay=1.0 -* Decoding info: ctc_weight 0.3, reverse weight 0.5, average_num 30 - -| decoding mode | dev clean | dev other | test clean | test other | -|----------------------------------|-----------|-----------|------------|------------| -| ctc greedy search | 3.49 | 9.24 | 3.51 | 9.28 | -| ctc prefix beam search | 3.44 | 9.23 | 3.51 | 9.25 | -| attention decoder | 3.59 | 8.74 | 3.75 | 8.70 | -| attention rescoring | 2.97 | 8.48 | 3.07 | 8.44 | - -## SqueezeFormer Result (SM12, FFN:2048) -* Encoder info: - * SM12, reduce_idx 5, recover_idx 11, conv2d, w/o syncbn - * encoder_dim 256, output_size 256, head 4, ffn_dim 256*8=2048 - * encoder FLOPs(30s): 28,230,473,984, params: 34,827,400 -* Feature info: using fbank feature, cmvn, dither, online speed perturb -* Training info: - * train_squeezeformer.yaml, kernel size 31 - * batch size 12, 8 gpu, acc_grad 4, 120 epochs, dither 0.1 - * adamw, lr 1e-3, noamhold, warmup 0.2, hold 0.3, lr_decay 1.0 -* Decoding info: - * ctc_weight 0.3, reverse weight 0.5, average_num 30 - -| decoding mode | dev clean | dev other | test clean 
| test other | -|----------------------------------|-----------|-----------|------------|------------| -| ctc greedy search | 3.34 | 9.01 | 3.47 | 8.85 | -| ctc prefix beam search | 3.33 | 9.02 | 3.46 | 8.81 | -| attention decoder | 3.64 | 8.62 | 3.91 | 8.33 | -| attention rescoring | 2.89 | 8.34 | 3.10 | 8.03 | - -## SqueezeFormer Result (SM12, FFN:1312) -* Encoder info: - * SM12, reduce_idx 5, recover_idx 11, conv1d, w/o syncbn - * encoder_dim 328, output_size 256, head 4, ffn_dim 328*4=1312 - * encoder FLOPs(30s): 34,103,960,008, params: 35,678,352 -* Feature info: - * using fbank feature, cmvn, dither, online speed perturb -* Training info: - * train_squeezeformer.yaml, kernel size 31, - * batch size 12, 8 gpu, acc_grad 4, 120 epochs, dither 1.0 - * adamw, lr 1e-3, noamhold, warmup 0.2, hold 0.3, lr_decay 1.0 -* Decoding info: - * ctc_weight 0.3, reverse weight 0.5, average_num 30 - -| decoding mode | dev clean | dev other | test clean | test other | -|----------------------------------|-----------|-----------|------------|------------| -| ctc greedy search | 3.20 | 8.46 | 3.30 | 8.58 | -| ctc prefix beam search | 3.18 | 8.44 | 3.30 | 8.55 | -| attention decoder | 3.38 | 8.31 | 3.89 | 8.32 | -| attention rescoring | 2.81 | 7.86 | 2.96 | 7.91 | - -## Conformer U2++ Result - -* Feature info: using fbank feature, cmvn, no speed perturb, dither -* Training info: train_u2++_conformer.yaml lr 0.001, batch size 24, 8 gpu, acc_grad 1, 120 epochs, dither 1.0 -* Decoding info: ctc_weight 0.3, reverse weight 0.5, average_num 30 -* Git hash: 65270043fc8c2476d1ab95e7c39f730017a670e0 - -test clean - -| decoding mode | full | 16 | -|--------------------------------|------|------| -| ctc prefix beam search | 3.76 | 4.54 | -| attention rescoring | 3.32 | 3.80 | - -test other - -| decoding mode | full | 16 | -|--------------------------------|-------|-------| -| ctc prefix beam search | 9.50 | 11.52 | -| attention rescoring | 8.67 | 10.38 | - -## SqueezeFormer Result (U2++, FFN:2048) - -* Encoder info: - * SM12, reduce_idx 5, recover_idx 11, conv1d, layer_norm - * do_rel_shift false, warp_for_time, syncbn - * encoder_dim 256, output_size 256, head 4, ffn_dim 256*8=2048 - * Encoder FLOPs(30s): 28,255,337,984, params: 34,893,704 -* Feature info: - * using fbank feature, cmvn, dither, online speed perturb -* Training info: - * train_squeezeformer.yaml, kernel size 31 - * batch size 12, 8 gpu, acc_grad 2, 120 epochs, dither 1.0 - * adamw, lr 8e-4, NoamHold, warmup 0.2, hold 0.3, lr_decay 1.0 -* Decoding info: - * ctc_weight 0.3, reverse weight 0.5, average_num 30 - -test clean - -| decoding mode | full | 16 | -|--------------------------------|------|------| -| ctc prefix beam search | 3.45 | 4.34 | -| attention rescoring | 3.07 | 3.71 | - -test other - -| decoding mode | full | 16 | -|--------------------------------|-------|-------| -| ctc prefix beam search | 8.29 | 10.60 | -| attention rescoring | 7.58 | 9.60 | - -## Conformer U2 Result - -* Feature info: using fbank feature, cmvn, speed perturb, dither -* Training info: train_unified_conformer.yaml lr 0.001, batch size 10, 8 gpu, acc_grad 1, 120 epochs, dither 1.0 -* Decoding info: ctc_weight 0.5, average_num 30 -* Git hash: 90d9a559840e765e82119ab72a11a1f7c1a01b78 -* LM-tgmed: [3-gram.pruned.1e-7.arpa.gz](http://www.openslr.org/resources/11/3-gram.pruned.1e-7.arpa.gz) -* LM-tglarge: [3-gram.arpa.gz](http://www.openslr.org/resources/11/3-gram.arpa.gz) -* LM-fglarge: [4-gram.arpa.gz](http://www.openslr.org/resources/11/4-gram.arpa.gz) - -test clean - -| 
decoding mode | full | 16 | -|----------------------------------|------|------| -| ctc prefix beam search | 4.26 | 5.00 | -| attention decoder | 3.05 | 3.44 | -| attention rescoring | 3.72 | 4.10 | -| attention rescoring (beam 50) | 3.57 | 3.95 | -| LM-tgmed + attention rescoring | 3.56 | 4.02 | -| LM-tglarge + attention rescoring | 3.40 | 3.82 | -| LM-fglarge + attention rescoring | 3.38 | 3.74 | - -test other - -| decoding mode | full | 16 | -|----------------------------------|-------|-------| -| ctc prefix beam search | 10.87 | 12.87 | -| attention decoder | 9.07 | 10.44 | -| attention rescoring | 9.74 | 11.61 | -| attention rescoring (beam 50) | 9.34 | 11.13 | -| LM-tgmed + attention rescoring | 8.78 | 10.26 | -| LM-tglarge + attention rescoring | 8.34 | 9.74 | -| LM-fglarge + attention rescoring | 8.17 | 9.44 | - - -## Efficient Conformer V1 Result - -* Feature info: - * using fbank feature, cmvn, speed perturb, dither -* Training info: - * train_u2++_efficonformer_v1.yaml - * 8 gpu, batch size 16, acc_grad 1, 120 epochs - * lr 0.001, warmup_steps 35000 -* Model info: - * Model Params: 49,474,974 - * Downsample rate: 1/4 (conv2d) * 1/2 (efficonformer block) - * encoder_dim 256, output_size 256, head 8, linear_units 2048 - * num_blocks 12, cnn_module_kernel 15, group_size 3 -* Decoding info: - * ctc_weight 0.5, reverse_weight 0.3, average_num 20 - -test clean - -| decoding mode | full | 18 | 16 | -|------------------------|------|------|------| -| attention decoder | 3.65 | 3.88 | 3.87 | -| ctc_greedy_search | 3.46 | 3.79 | 3.77 | -| ctc prefix beam search | 3.44 | 3.75 | 3.74 | -| attention rescoring | 3.17 | 3.44 | 3.41 | - -test other - -| decoding mode | full | 18 | 16 | -|------------------------|------|-------|-------| -| attention decoder | 8.51 | 9.24 | 9.25 | -| ctc_greedy_search | 8.94 | 10.04 | 10.06 | -| ctc prefix beam search | 8.91 | 10 | 10.01 | -| attention rescoring | 8.21 | 9.25 | 9.25 | - - -## Efficient Conformer V2 Result - -* Feature info: - * using fbank feature, cmvn, speed perturb, dither -* Training info: - * train_u2++_efficonformer_v2.yaml - * 8 gpu, batch size 16, acc_grad 1, 120 epochs - * lr 0.001, warmup_steps 35000 -* Model info: - * Model Params: 50,341,278 - * Downsample rate: 1/2 (conv2d2) * 1/4 (efficonformer block) - * encoder_dim 256, output_size 256, head 8, linear_units 2048 - * num_blocks 12, cnn_module_kernel 15, group_size 3 -* Decoding info: - * ctc_weight 0.5, reverse_weight 0.3, average_num 20 - -test clean - -| decoding mode | full | 18 | 16 | -|------------------------|------|------|------| -| attention decoder | 3.49 | 3.71 | 3.72 | -| ctc_greedy_search | 3.49 | 3.74 | 3.77 | -| ctc prefix beam search | 3.47 | 3.72 | 3.74 | -| attention rescoring | 3.12 | 3.38 | 3.36 | - -test other - -| decoding mode | full | 18 | 16 | -|------------------------|------|------|------| -| attention decoder | 8.15 | 9.05 | 9.03 | -| ctc_greedy_search | 8.73 | 9.82 | 9.83 | -| ctc prefix beam search | 8.70 | 9.81 | 9.79 | -| attention rescoring | 8.05 | 9.08 | 9.10 | diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_conformer.yaml deleted file mode 100644 index a1298d8a4c90eed0d704c3839ab5bd71c84d8593..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_conformer.yaml +++ /dev/null @@ -1,80 +0,0 @@ -# network 
architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -# dataset related -dataset_conf: - filter_conf: - max_length: 2000 - min_length: 50 - token_max_length: 400 - token_min_length: 1 - min_output_input_ratio: 0.0005 - max_output_input_ratio: 0.1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.0 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 12 - -grad_clip: 5 -accum_grad: 1 -max_epoch: 70 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.004 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_conformer_bidecoder_large.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_conformer_bidecoder_large.yaml deleted file mode 100644 index 28b218855cf7ac83b21deb027f9c4420f3dbaecb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_conformer_bidecoder_large.yaml +++ /dev/null @@ -1,83 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 512 # dimension of attention - attention_heads: 8 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 31 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - cnn_module_norm: 'layer_norm' - -# decoder related -decoder: bitransformer -decoder_conf: - attention_heads: 8 - linear_units: 2048 - num_blocks: 3 - r_num_blocks: 3 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -# dataset related -dataset_conf: - filter_conf: - max_length: 2000 - min_length: 50 - token_max_length: 400 - token_min_length: 1 - min_output_input_ratio: 0.0005 - 
max_output_input_ratio: 0.1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 1.0 - spec_aug: true - spec_aug_conf: - num_t_mask: 3 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 12 - -grad_clip: 5 -accum_grad: 4 -max_epoch: 120 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.002 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 50000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_squeezeformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_squeezeformer.yaml deleted file mode 100644 index 15dd2d33ba1483747753f33f3afc73bc61c01b6e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_squeezeformer.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# network architecture -# encoder related -encoder: squeezeformer -encoder_conf: - encoder_dim: 256 - output_size: 256 # dimension of attention - attention_heads: 4 - num_blocks: 12 # the number of encoder blocks - reduce_idx: 5 - recover_idx: 11 - pos_enc_layer_type: 'rel_pos' - time_reduction_layer_type: 'conv1d' - feed_forward_expansion_factor: 4 - input_dropout_rate: 0.1 - feed_forward_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - cnn_module_kernel: 31 - cnn_norm_type: layer_norm - adaptive_scale: true - normalize_before: false - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -# dataset related -dataset_conf: - filter_conf: - max_length: 2000 - min_length: 50 - token_max_length: 400 - token_min_length: 1 - min_output_input_ratio: 0.0005 - max_output_input_ratio: 0.1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 12 - -grad_clip: 5 -accum_grad: 4 -max_epoch: 120 -log_interval: 100 - -optim: adamw -optim_conf: - lr: 1.e-3 - weight_decay: 4.e-5 - -scheduler: NoamHoldAnnealing -scheduler_conf: - warmup_ratio: 0.2 - hold_ratio: 0.3 - max_steps: 87960 - decay_rate: 1.0 - min_lr: 1.e-5 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_squeezeformer_bidecoder_large.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_squeezeformer_bidecoder_large.yaml deleted file mode 100644 index 6d81b2a545719b9e90ff0bc04f2e56d9d9d0c3bc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_squeezeformer_bidecoder_large.yaml +++ /dev/null @@ -1,96 +0,0 @@ -# network architecture -# 
encoder related -encoder: squeezeformer -encoder_conf: - encoder_dim: 512 - output_size: 512 # dimension of attention - attention_heads: 8 - num_blocks: 12 # the number of encoder blocks - reduce_idx: 5 - recover_idx: 11 - feed_forward_expansion_factor: 4 - input_dropout_rate: 0.1 - feed_forward_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - cnn_module_kernel: 31 - cnn_norm_type: batch_norm - adaptive_scale: true - normalize_before: false - -# decoder related -decoder: bitransformer -decoder_conf: - attention_heads: 8 - linear_units: 2048 - num_blocks: 3 - r_num_blocks: 3 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -# dataset related -dataset_conf: - syncbn: true - filter_conf: - max_length: 2000 - min_length: 50 - token_max_length: 400 - token_min_length: 1 - min_output_input_ratio: 0.0005 - max_output_input_ratio: 0.1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 1.0 - spec_aug: true - spec_aug_conf: - num_t_mask: 3 - num_f_mask: 2 - max_t: 100 - max_f: 27 - max_w: 80 -# warp_for_time: true - spec_sub: true - spec_sub_conf: - num_t_sub: 3 - max_t: 30 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 12 - -grad_clip: 5 -accum_grad: 4 -max_epoch: 120 -log_interval: 100 - -optim: adamw -optim_conf: - lr: 1.e-3 - weight_decay: 4.e-5 - -scheduler: NoamHoldAnnealing -scheduler_conf: - warmup_ratio: 0.2 - hold_ratio: 0.3 - max_steps: 87960 - decay_rate: 1.0 - min_lr: 1.e-5 - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_u2++_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_u2++_conformer.yaml deleted file mode 100644 index 97928fe7d77c32f169d9a66c3cb78634abd1c4fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_u2++_conformer.yaml +++ /dev/null @@ -1,91 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - -# decoder related -decoder: bitransformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 3 - r_num_blocks: 3 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -# 
dataset related -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 400 - token_min_length: 1 - # min_output_input_ratio: 0.0005 - # max_output_input_ratio: 0.1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 1.0 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - warp_for_time: true - spec_sub: true - spec_sub_conf: - num_t_sub: 3 - max_t: 30 - shuffle: true - shuffle_conf: - shuffle_size: 10000 - sort: true - sort_conf: - sort_size: 2000 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 24 - -grad_clip: 5 -accum_grad: 1 -max_epoch: 120 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_u2++_efficonformer_v1.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_u2++_efficonformer_v1.yaml deleted file mode 100644 index 451409abb0e5d739e792cbd5de820a2790e1fd5f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_u2++_efficonformer_v1.yaml +++ /dev/null @@ -1,96 +0,0 @@ -# network architecture -# encoder related -encoder: efficientConformer -encoder_conf: - activation_type: 'swish' - attention_heads: 8 - causal: false - cnn_module_kernel: 15 - cnn_module_norm: 'layer_norm' - dropout_rate: 0.1 - input_layer: conv2d - linear_units: 2048 - normalize_before: true - num_blocks: 12 - output_size: 256 - pos_enc_layer_type: 'rel_pos' - attention_dropout_rate: 0.1 - positional_dropout_rate: 0.1 - use_cnn_module: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false - efficient_conf: - stride_layer_idx: [3] # layer id with StrideConv - stride: [2] # stride size of each StrideConv - group_layer_idx: [0, 1, 2, 3] # layer id with GroupedAttention - group_size: 3 # group size of every GroupedAttention layer - stride_kernel: true # true: recompute cnn kernels with stride - -# decoder related -decoder: bitransformer -decoder_conf: - attention_heads: 8 - dropout_rate: 0.1 - linear_units: 2048 - num_blocks: 3 - r_num_blocks: 3 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -# dataset related -dataset_conf: - batch_conf: - batch_size: 16 - batch_type: 'static' - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 1.0 - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - spec_sub: true - spec_sub_conf: - num_t_sub: 3 - max_t: 30 - spec_trim: false - spec_trim_conf: - max_t: 50 - speed_perturb: true - -grad_clip: 5 -accum_grad: 1 -max_epoch: 120 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 35000 diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_u2++_efficonformer_v2.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_u2++_efficonformer_v2.yaml deleted file mode 100644 index 1ba165953b671e5d657c3d5ad3261ca5db188c60..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_u2++_efficonformer_v2.yaml +++ /dev/null @@ -1,98 +0,0 @@ -# network architecture -# encoder related -encoder: efficientConformer -encoder_conf: - activation_type: 'swish' - attention_heads: 8 - causal: false - cnn_module_kernel: 15 - cnn_module_norm: 'layer_norm' - dropout_rate: 0.1 - input_layer: conv2d2 - linear_units: 2048 - normalize_before: true - num_blocks: 12 - output_size: 256 - pos_enc_layer_type: 'rel_pos' - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - use_cnn_module: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false - efficient_conf: - stride_layer_idx: [3, 7] # layer id with StrideConv - stride: [2, 2] # stride size of each StrideConv - group_layer_idx: [3, 7] # layer id with GroupedAttention - group_size: 3 # group size of every GroupedAttention layer - stride_kernel: false # true: recompute cnn kernels with stride - -# decoder related -decoder: bitransformer -decoder_conf: - attention_heads: 8 - dropout_rate: 0.1 - linear_units: 2048 - num_blocks: 3 - positional_dropout_rate: 0.1 - r_num_blocks: 3 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -# dataset related -dataset_conf: - batch_conf: - batch_size: 10 - batch_type: 'static' - fbank_conf: - dither: 1.0 - frame_length: 25 - frame_shift: 10 - num_mel_bins: 80 - filter_conf: - max_length: 40960 - min_length: 0 - max_output_input_ratio: 0.1 - min_output_input_ratio: 0.005 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - spec_sub: true - spec_sub_conf: - num_t_sub: 3 - max_t: 30 - spec_trim: false - spec_trim_conf: - max_t: 50 - speed_perturb: true - -grad_clip: 5 -accum_grad: 2 -max_epoch: 120 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 28000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_u2++_squeezeformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_u2++_squeezeformer.yaml deleted file mode 100644 index 2cb6204bd67d51d1e9d796bb4f8ab77bbe55610e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_u2++_squeezeformer.yaml +++ /dev/null @@ -1,98 +0,0 @@ -# network architecture -# encoder related -encoder: squeezeformer -encoder_conf: - encoder_dim: 256 - output_size: 256 # dimension of attention - attention_heads: 4 - num_blocks: 12 # the number of encoder blocks - reduce_idx: 5 - recover_idx: 11 - time_reduction_layer_type: "stream" - feed_forward_expansion_factor: 8 - input_dropout_rate: 0.1 - feed_forward_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - cnn_module_kernel: 31 - 
do_rel_shift: false - cnn_norm_type: layer_norm - adaptive_scale: true - normalize_before: false - causal: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false - -# decoder related -decoder: bitransformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 3 - r_num_blocks: 3 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -# dataset related -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 400 - token_min_length: 1 - # min_output_input_ratio: 0.0005 - # max_output_input_ratio: 0.1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 1.0 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 -# warp_for_time: true - spec_sub: true - spec_sub_conf: - num_t_sub: 3 - max_t: 30 - shuffle: true - shuffle_conf: - shuffle_size: 10000 - sort: true - sort_conf: - sort_size: 2000 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 12 - -grad_clip: 5 -accum_grad: 2 -max_epoch: 120 -log_interval: 100 - -optim: adamw -optim_conf: - lr: 8.e-4 - weight_decay: 4.e-5 - -scheduler: NoamHoldAnnealing -scheduler_conf: - warmup_ratio: 0.2 - hold_ratio: 0.3 - max_steps: 175680 - decay_rate: 1.0 - min_lr: 1.e-5 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_unified_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_unified_conformer.yaml deleted file mode 100644 index 288687e6161eb8a4acb0ca3e6006ad1a5e51df54..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/conf/train_unified_conformer.yaml +++ /dev/null @@ -1,83 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -# dataset related -dataset_conf: - filter_conf: - max_length: 2000 - min_length: 50 - token_max_length: 400 - token_min_length: 1 - min_output_input_ratio: 0.0005 - max_output_input_ratio: 0.1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.0 - spec_aug: true - 
spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 16 - -grad_clip: 5 -accum_grad: 1 -max_epoch: 120 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/local/data_prep_torchaudio.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/local/data_prep_torchaudio.sh deleted file mode 100644 index c7dc1deb7dec59f571c8f6935fe36c6aea2e8e99..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/local/data_prep_torchaudio.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -# Copyright 2014 Vassil Panayotov -# 2014 Johns Hopkins University (author: Daniel Povey) -# Apache 2.0 - -if [ "$#" -ne 2 ]; then - echo "Usage: $0 " - echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean" - exit 1 -fi - -src=$1 -dst=$2 - -# all utterances are FLAC compressed -if ! which flac >&/dev/null; then - echo "Please install 'flac' on ALL worker nodes!" - exit 1 -fi - -mkdir -p $dst || exit 1 - -[ ! -d $src ] && echo "$0: no such directory $src" && exit 1 - -wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp -trans=$dst/text; [[ -f "$trans" ]] && rm $trans - -for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do - reader=$(basename $reader_dir) - if ! [ $reader -eq $reader ]; then # not integer. - echo "$0: unexpected subdirectory name $reader" - exit 1 - fi - - for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do - chapter=$(basename $chapter_dir) - if ! [ "$chapter" -eq "$chapter" ]; then - echo "$0: unexpected chapter-subdirectory name $chapter" - exit 1 - fi - - find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \ - awk -v "dir=$chapter_dir" '{printf "%s %s/%s.flac\n", $0, dir, $0}' >>$wav_scp|| exit 1 - - chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt - [ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1 - cat $chapter_trans >>$trans - done -done - -echo "$0: successfully prepared data in $dst" - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/local/download_and_untar.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/local/download_and_untar.sh deleted file mode 100644 index cd32fb6b989d7229272f1066a75a1688df2bf06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/local/download_and_untar.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2014 Johns Hopkins University (author: Daniel Povey) -# Apache 2.0 - -remove_archive=false - -if [ "$1" == --remove-archive ]; then - remove_archive=true - shift -fi - -if [ $# -ne 3 ]; then - echo "Usage: $0 [--remove-archive] " - echo "e.g.: $0 /export/a15/vpanayotov/data www.openslr.org/resources/11 dev-clean" - echo "With --remove-archive it will remove the archive after successfully un-tarring it." - echo " can be one of: dev-clean, test-clean, dev-other, test-other," - echo " train-clean-100, train-clean-360, train-other-500." 
- exit 1 -fi - -data=$1 -url=$2 -part=$3 - -if [ ! -d "$data" ]; then - echo "$0: no such directory $data" - exit 1 -fi - -part_ok=false -list="dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500" -for x in $list; do - if [ "$part" == $x ]; then part_ok=true; fi -done -if ! $part_ok; then - echo "$0: expected to be one of $list, but got '$part'" - exit 1 -fi - -if [ -z "$url" ]; then - echo "$0: empty URL base." - exit 1 -fi - -if [ -f $data/LibriSpeech/$part/.complete ]; then - echo "$0: data part $part was already successfully extracted, nothing to do." - exit 0 -fi - - -# sizes of the archive files in bytes. This is some older versions. -sizes_old="371012589 347390293 379743611 361838298 6420417880 23082659865 30626749128" -# sizes_new is the archive file sizes of the final release. Some of these sizes are of -# things we probably won't download. -sizes_new="337926286 314305928 695964615 297279345 87960560420 33373768 346663984 328757843 6387309499 23049477885 30593501606" - -if [ -f $data/$part.tar.gz ]; then - size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}') - size_ok=false - for s in $sizes_old $sizes_new; do if [ $s == $size ]; then size_ok=true; fi; done - if ! $size_ok; then - echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size" - echo "does not equal the size of one of the archives." - rm $data/$part.tar.gz - else - echo "$data/$part.tar.gz exists and appears to be complete." - fi -fi - -if [ ! -f $data/$part.tar.gz ]; then - if ! which wget >/dev/null; then - echo "$0: wget is not installed." - exit 1 - fi - full_url=$url/$part.tar.gz - echo "$0: downloading data from $full_url. This may take some time, please be patient." - - if ! wget -P $data --no-check-certificate $full_url; then - echo "$0: error executing wget $full_url" - exit 1 - fi -fi - -if ! tar -C $data -xvzf $data/$part.tar.gz; then - echo "$0: error un-tarring archive $data/$part.tar.gz" - exit 1 -fi - -touch $data/LibriSpeech/$part/.complete - -echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz" - -if $remove_archive; then - echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied." - rm $data/$part.tar.gz -fi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/path.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/path.sh deleted file mode 100644 index ac1ca08baf5d4540b92ed239b8aa7cd613064a8c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export WENET_DIR=$PWD/../../.. -export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build -export OPENFST_BIN=${BUILD_DIR}/../fc_base/openfst-build/src -export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_BIN}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/run.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/run.sh deleted file mode 100644 index ede3922c1c457ea9ef2a9de7070867708617bb34..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/run.sh +++ /dev/null @@ -1,282 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. - -. 
./path.sh || exit 1; - -# Use this to control how many gpu you use, It's 1-gpu training if you specify -# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -stage=0 # start from 0 if you need to start from data preparation -stop_stage=5 -# data -data_url=www.openslr.org/resources/12 -# use your own data path -datadir=/export/data/en-asr-data/OpenSLR -# wav data dir -wave_data=data -# Optional train_config -# 1. conf/train_transformer_large.yaml: Standard transformer -train_config=conf/train_conformer.yaml -checkpoint= -cmvn=true -do_delta=false - -dir=exp/sp_spec_aug - -# use average_checkpoint will get better result -average_checkpoint=true -decode_checkpoint=$dir/final.pt -# maybe you can try to adjust it if you can not get close results as README.md -average_num=10 -decode_modes="attention_rescoring ctc_greedy_search ctc_prefix_beam_search attention" - -. tools/parse_options.sh || exit 1; - -# bpemode (unigram or bpe) -nbpe=5000 -bpemode=unigram - -set -e -set -u -set -o pipefail - -train_set=train_960 -dev_set=dev -recog_set="test_clean test_other dev_clean dev_other" - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - echo "stage -1: Data Download" - for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do - local/download_and_untar.sh ${datadir} ${data_url} ${part} - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - ### Task dependent. You have to make data the following preparation part by yourself. - ### But you can utilize Kaldi recipes in most cases - echo "stage 0: Data preparation" - for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do - # use underscore-separated names in data directories. - local/data_prep_torchaudio.sh ${datadir}/LibriSpeech/${part} $wave_data/${part//-/_} - done -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - ### Task dependent. You have to design training and dev sets by yourself. - ### But you can utilize Kaldi recipes in most cases - echo "stage 1: Feature Generation" - mkdir -p $wave_data/train_960 - # merge total training data - for set in train_clean_100 train_clean_360 train_other_500; do - for f in `ls $wave_data/$set`; do - cat $wave_data/$set/$f >> $wave_data/train_960/$f - done - done - mkdir -p $wave_data/dev - # merge total dev data - for set in dev_clean dev_other; do - for f in `ls $wave_data/$set`; do - cat $wave_data/$set/$f >> $wave_data/$dev_set/$f - done - done - - tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \ - --in_scp $wave_data/$train_set/wav.scp \ - --out_cmvn $wave_data/$train_set/global_cmvn - -fi - - -dict=$wave_data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt -bpemodel=$wave_data/lang_char/${train_set}_${bpemode}${nbpe} -echo "dictionary: ${dict}" -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - ### Task dependent. You have to check non-linguistic symbols used in the corpus. - echo "stage 2: Dictionary and Json Data Preparation" - mkdir -p data/lang_char/ - - echo " 0" > ${dict} # 0 will be used for "blank" in CTC - echo " 1" >> ${dict} # must be 1 - - # we borrowed these code and scripts which are related bpe from ESPnet. 
- cut -f 2- -d" " $wave_data/${train_set}/text > $wave_data/lang_char/input.txt - tools/spm_train --input=$wave_data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 - tools/spm_encode --model=${bpemodel}.model --output_format=piece < $wave_data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict} - num_token=$(cat $dict | wc -l) - echo " $num_token" >> $dict # - wc -l ${dict} -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # Prepare wenet required data - echo "Prepare data, prepare required format" - for x in $dev_set ${recog_set} $train_set ; do - tools/make_raw_list.py $wave_data/$x/wav.scp $wave_data/$x/text \ - $wave_data/$x/data.list - done - -fi - - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - # Training - mkdir -p $dir - INIT_FILE=$dir/ddp_init - rm -f $INIT_FILE # delete old one before starting - init_method=file://$(readlink -f $INIT_FILE) - echo "$0: init method is $init_method" - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - # Use "nccl" if it works, otherwise use "gloo" - dist_backend="gloo" - cmvn_opts= - $cmvn && cmvn_opts="--cmvn $wave_data/${train_set}/global_cmvn" - # train.py will write $train_config to $dir/train.yaml with model input - # and output dimension, train.yaml will be used for inference or model - # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type raw \ - --symbol_table $dict \ - --bpe_model ${bpemodel}.model \ - --train_data $wave_data/$train_set/data.list \ - --cv_data $wave_data/$dev_set/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $num_gpus \ - --ddp.rank $i \ - --ddp.dist_backend $dist_backend \ - --num_workers 1 \ - $cmvn_opts \ - --pin_memory - } & - done - wait -fi - -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # Test model, please specify the model you want to test by --checkpoint - cmvn_opts= - $cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn" - # TODO, Add model average here - mkdir -p $dir/test - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$dir/avg_${average_num}.pt - echo "do model average and final checkpoint is $decode_checkpoint" - python wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $dir \ - --num ${average_num} \ - --val_best - fi - # Specify decoding_chunk_size if it's a unified dynamic chunk trained model - # -1 for full chunk - decoding_chunk_size= - ctc_weight=0.5 - # Polling GPU id begin with index 0 - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - idx=0 - for test in $recog_set; do - for mode in ${decode_modes}; do - { - { - test_dir=$dir/${test}_${mode} - mkdir -p $test_dir - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1]) - python wenet/bin/recognize.py --gpu $gpu_id \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type raw \ - --dict $dict \ - --bpe_model ${bpemodel}.model \ - --test_data $wave_data/$test/data.list \ - --checkpoint $decode_checkpoint \ - --beam_size 10 \ - --batch_size 1 \ - --penalty 0.0 \ - --result_file $test_dir/text_bpe \ - --ctc_weight $ctc_weight \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - - cut -f2- -d " " $test_dir/text_bpe > $test_dir/text_bpe_value_tmp - cut -f1 -d " " 
$test_dir/text_bpe > $test_dir/text_bpe_key_tmp - tools/spm_decode --model=${bpemodel}.model --input_format=piece \ - < $test_dir/text_bpe_value_tmp | sed -e "s/▁/ /g" > $test_dir/text_value_tmp - paste -d " " $test_dir/text_bpe_key_tmp $test_dir/text_value_tmp > $test_dir/text - - python tools/compute-wer.py --char=1 --v=1 \ - $wave_data/$test/text $test_dir/text > $test_dir/wer - } & - - ((idx+=1)) - if [ $idx -eq $num_gpus ]; then - idx=0 - fi - } - done - done - wait - -fi - -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # Export the best model you want - python wenet/bin/export_jit.py \ - --config $dir/train.yaml \ - --checkpoint $dir/avg_${average_num}.pt \ - --output_file $dir/final.zip -fi - -# Optionally, you can add LM and test it with runtime. -if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then - lm=data/local/lm - lexicon=data/local/dict/lexicon.txt - mkdir -p $lm - mkdir -p data/local/dict - - # 7.1 Download & format LM - which_lm=3-gram.pruned.1e-7.arpa.gz - if [ ! -e ${lm}/${which_lm} ]; then - wget http://www.openslr.org/resources/11/${which_lm} -P ${lm} - fi - echo "unzip lm($which_lm)..." - gunzip -k ${lm}/${which_lm} -c > ${lm}/lm.arpa - echo "Lm saved as ${lm}/lm.arpa" - - # 7.2 Prepare dict - unit_file=$dict - bpemodel=$bpemodel - # use $dir/words.txt (unit_file) and $dir/train_960_unigram5000 (bpemodel) - # if you download pretrained librispeech conformer model - cp $unit_file data/local/dict/units.txt - if [ ! -e ${lm}/librispeech-lexicon.txt ]; then - wget http://www.openslr.org/resources/11/librispeech-lexicon.txt -P ${lm} - fi - echo "build lexicon..." - tools/fst/prepare_dict.py $unit_file ${lm}/librispeech-lexicon.txt \ - $lexicon $bpemodel.model - echo "lexicon saved as '$lexicon'" - - # 7.3 Build decoding TLG - tools/fst/compile_lexicon_token_fst.sh \ - data/local/dict data/local/tmp data/local/lang - tools/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1; - - # 7.4 Decoding with runtime - fst_dir=data/lang_test - for test in ${recog_set}; do - ./tools/decode.sh --nj 6 \ - --beam 10.0 --lattice_beam 5 --max_active 7000 --blank_skip_thresh 0.98 \ - --ctc_weight 0.5 --rescoring_weight 1.0 --acoustic_scale 1.2 \ - --fst_path $fst_dir/TLG.fst \ - --dict_path $fst_dir/words.txt \ - data/$test/wav.scp data/$test/text $dir/final.zip $fst_dir/units.txt \ - $dir/lm_with_runtime_${test} - tail $dir/lm_with_runtime_${test}/wer - done -fi - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/alignment.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/alignment.sh deleted file mode 100644 index 64d860bb61761cadca750c9baf91eddb49e56728..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/alignment.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -stage=0 # start from 0 if you need to start from data preparation -stop_stage=0 - -nj=16 -feat_dir=raw_wav -dict=data/dict/lang_char.txt - -dir=exp/ -config=$dir/train.yaml -checkpoint= -checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt -config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml -set= -ali_format=$feat_dir/$set/format.data -ali_format=format.data -ali_result=$dir/ali - -. 
tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - nj=32 - # Prepare required data for ctc alignment - echo "Prepare data, prepare required format" - for x in $set; do - tools/format_data.sh --nj ${nj} \ - --feat-type wav --feat $feat_dir/$x/wav.scp \ - $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp - - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Test model, please specify the model you want to use by --checkpoint - python wenet/bin/alignment_deprecated.py --gpu -1 \ - --config $config \ - --input_file $ali_format \ - --checkpoint $checkpoint \ - --batch_size 1 \ - --dict $dict \ - --result_file $ali_result \ - -fi - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/analyze_dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/analyze_dataset.py deleted file mode 100644 index d4373b065c301972fe0164b6df3591166000acfc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/analyze_dataset.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Analyze Dataset, Duration/TextLength/Speed etc. - -Usage: -. ./path.sh && python3 tools/analyze_dataset.py \ - --data_type "shard" \ - --data_list data/test/data.list \ - --output_dir exp/analyze_test \ - --num_thread 32 -""" - -import os -import json -import math -import time -import numpy -import logging -import librosa -import tarfile -import argparse -import torchaudio -import multiprocessing - -from wenet.utils.file_utils import read_lists -from wenet.dataset.processor import AUDIO_FORMAT_SETS - - -def get_args(): - parser = argparse.ArgumentParser(description='Analyze dataset') - parser.add_argument('--data_type', - default='wav_scp', - choices=['wav_scp', 'raw', 'shard'], - help='dataset type') - parser.add_argument('--output_dir', type=str, - default="exp", help='write info to output dir') - parser.add_argument('--data_list', default=None, - help='used in raw/shard mode') - parser.add_argument('--wav_scp', default=None, - help='used in wav_scp mode') - parser.add_argument('--text', default=None, - help='used in wav_scp mode') - parser.add_argument('--num_thread', type=int, - default=4, help='number of threads') - args = parser.parse_args() - print(args) - return args - - -def analyze(datas, output_file, thread_id): - with open(output_file, "w", encoding='utf8') as f: - for i, data in enumerate(datas): - if type(data['wav']) is numpy.ndarray: - y, sample_rate = data['wav'], data['sample_rate'] - data['wav'] = "None" # NOTE(xcsong): Do not save wav. 
- elif type(data['wav'] is str): - y, sample_rate = librosa.load(data['wav'], sr=16000) - data['dur'] = len(y) / sample_rate - data['txt_length'] = len(data['txt']) - data['speed'] = data['txt_length'] / data['dur'] - # Trim the beginning and ending silence - _, index = librosa.effects.trim(y, top_db=30) - data['leading_sil'] = librosa.get_duration( - y=y[:index[0]], sr=16000) * 1000 if index[0] > 0 else 0 - data['trailing_sil'] = librosa.get_duration( - y=y[index[1]:], sr=16000) * 1000 if index[1] < len(y) else 0 - data_str = json.dumps(data, ensure_ascii=False) - f.write("{}\n".format(data_str)) - if thread_id == 0 and i % 100 == 0: - logging.info("\tThread-{}: processed {}/{}".format( - thread_id, i, len(datas))) - - -def read_tar(file): - try: - with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream: - prev_prefix = None - data = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - data['key'] = prev_prefix - if valid: - yield data - data = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - data['txt'] = file_obj.read().decode( - 'utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load( - file_obj) - # single channel - data['wav'] = waveform.numpy()[0, :] - data['sample_rate'] = sample_rate - else: - data[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning( - 'error: {} when parse {}'.format(ex, name)) - prev_prefix = prefix - # The last data in tar - if prev_prefix is not None: - data['key'] = prev_prefix - yield data - except Exception as ex: - logging.warning( - 'tar_file error: {} when processing {}'.format(ex, file)) - - -def main(): - start_time = time.time() - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.makedirs(args.output_dir, exist_ok=True) - os.makedirs(args.output_dir + "/partition", exist_ok=True) - datas = [[] for i in range(args.num_thread)] - - logging.info("Stage-1: Loading data.list OR wav.scp...") - if args.data_type == "shard": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - total = 0 - for line in lists: - for data in read_tar(line): - datas[total % args.num_thread].append(data) - total = total + 1 - elif args.data_type == "raw": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - for i, line in enumerate(lists): - data = json.loads(line) - datas[i % args.num_thread].append(data) - elif args.data_type == "wav_scp": - assert args.wav_scp is not None - assert args.text is not None - wavs, texts = {}, {} - # wavs - for line in read_lists(args.wav_scp): - line = line.strip().split() - wavs[line[0]] = line[1] - # texts - for line in read_lists(args.text): - line = line.strip().split(maxsplit=1) - texts[line[0]] = line[1] - sorted(wavs) - sorted(texts) - # partition - for i, (key1, key2) in enumerate(zip(wavs, texts)): - assert key1 == key2 - datas[i % args.num_thread].append( - {'key': key1, "wav": wavs[key1], "txt": texts[key1]} - ) - - logging.info("Stage-2: Start Analyze") - # threads - pool = multiprocessing.Pool(processes=args.num_thread) - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - pool.apply_async(analyze, (datas[i], output_file, i)) - pool.close() - 
pool.join() - - logging.info("Stage-3: Sort and Write Result") - datas = [] - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - with open(output_file, "r", encoding='utf8') as f: - for line in f.readlines(): - data = json.loads(line) - datas.append(data) - total_dur = sum([x['dur'] for x in datas]) - total_len = sum([x['txt_length'] for x in datas]) - total_leading_sil = sum([x['leading_sil'] for x in datas]) - total_trailing_sil = sum([x['trailing_sil'] for x in datas]) - num_datas = len(datas) - names = ['key', 'dur', 'txt_length', 'speed', - 'leading_sil', 'trailing_sil'] - units = ['', 's', '', 'char/s', 'ms', 'ms'] - avgs = [0, total_dur / num_datas, total_len / num_datas, - total_len / total_dur, total_leading_sil / num_datas, - total_trailing_sil / num_datas] - stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]), - sum([(x['txt_length'] - avgs[2])**2 for x in datas]), - sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]), - sum([(x['leading_sil'] - avgs[4])**2 for x in datas]), - sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])] - stds = [math.sqrt(x / num_datas) for x in stds] - parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min'] - index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - - with open(args.output_dir + "/analyze_result_brief", - "w", encoding='utf8') as f: - for i, (name, unit, avg, std) in enumerate( - zip(names, units, avgs, stds)): - if name == 'key': - continue - f.write("==================\n") - - datas.sort(key=lambda x: x[name]) - for p, j in zip(parts, index): - f.write("{} {}: {:.3f} {} (wav_id: {})\n".format( - p, name, datas[j][name], unit, datas[j]['key'])) - f.write("avg {}: {:.3f} {}\n".format( - name, avg, unit)) - f.write("std {}: {:.3f}\n".format( - name, std)) - os.system("cat {}".format(args.output_dir + "/analyze_result_brief")) - - datas.sort(key=lambda x: x['dur']) - with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f: - for data in datas: - f.write("{}\n".format(json.dumps(data, ensure_ascii=False))) - - end_time = time.time() - logging.info("Time Cost: {:.3f}s".format(end_time - start_time)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/cmvn_kaldi2json.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/cmvn_kaldi2json.py deleted file mode 100644 index 9966046c95a9d50438c4857b785cb7985182e376..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/cmvn_kaldi2json.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import logging -import sys -import json - -def kaldi2json(kaldi_cmvn_file): - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - cmvn_info = 
{'mean_stat:' : means, - 'var_stat' : variance, - 'frame_num' : count} - return cmvn_info - -if __name__ == '__main__': - with open(sys.argv[2], 'w') as fout: - cmvn = kaldi2json(sys.argv[1]) - fout.write(json.dumps(cmvn)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/combine_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/combine_data.sh deleted file mode 100644 index 8a56c43f1a2a238d78270f94f3d22f1af540e912..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/combine_data.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2014 David Snyder - -# This script combines the data from multiple source directories into -# a single destination directory. - -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information -# about what these directories contain. - -# Begin configuration section. -extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." -skip_fix=false # skip the fix_data_dir.sh in the end -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi - -if [ $# -lt 2 ]; then - echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." - echo "Note, files that don't appear in all source dirs will not be combined," - echo "with the exception of utt2uniq and segments, which are created where necessary." - exit 1 -fi - -dest=$1; -shift; - -first_src=$1; - -rm -r $dest 2>/dev/null -mkdir -p $dest; - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/utt2spk ]; then - echo "$0: no such file $dir/utt2spk" - exit 1; - fi -done - -# Check that frame_shift are compatible, where present together with features. -dir_with_frame_shift= -for dir in $*; do - if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then - if [[ $dir_with_frame_shift ]] && - ! cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then - echo "$0:error: different frame_shift in directories $dir and " \ - "$dir_with_frame_shift. Cannot combine features." - exit 1; - fi - dir_with_frame_shift=$dir - fi -done - -# W.r.t. utt2uniq file the script has different behavior compared to other files -# it is not compulsary for it to exist in src directories, but if it exists in -# even one it should exist in all. We will create the files where necessary -has_utt2uniq=false -for in_dir in $*; do - if [ -f $in_dir/utt2uniq ]; then - has_utt2uniq=true - break - fi -done - -if $has_utt2uniq; then - # we are going to create an utt2uniq file in the destdir - for in_dir in $*; do - if [ ! -f $in_dir/utt2uniq ]; then - # we assume that utt2uniq is a one to one mapping - cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' - else - cat $in_dir/utt2uniq - fi - done | sort -k1 > $dest/utt2uniq - echo "$0: combined utt2uniq" -else - echo "$0 [info]: not combining utt2uniq as it does not exist" -fi -# some of the old scripts might provide utt2uniq as an extrafile, so just remove it -extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") - -# segments are treated similarly to utt2uniq. If it exists in some, but not all -# src directories, then we generate segments where necessary. 
-has_segments=false -for in_dir in $*; do - if [ -f $in_dir/segments ]; then - has_segments=true - break - fi -done - -if $has_segments; then - for in_dir in $*; do - if [ ! -f $in_dir/segments ]; then - echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 - utils/data/get_segments_for_data.sh $in_dir - else - cat $in_dir/segments - fi - done | sort -k1 > $dest/segments - echo "$0: combined segments" -else - echo "$0 [info]: not combining segments as it does not exist" -fi - -for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do - exists_somewhere=false - absent_somewhere=false - for d in $*; do - if [ -f $d/$file ]; then - exists_somewhere=true - else - absent_somewhere=true - fi - done - - if ! $absent_somewhere; then - set -o pipefail - ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; - set +o pipefail - echo "$0: combined $file" - else - if ! $exists_somewhere; then - echo "$0 [info]: not combining $file as it does not exist" - else - echo "$0 [info]: **not combining $file as it does not exist everywhere**" - fi - fi -done - -tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt - -if [[ $dir_with_frame_shift ]]; then - cp $dir_with_frame_shift/frame_shift $dest -fi - -if ! $skip_fix ; then - tools/fix_data_dir.sh $dest || exit 1; -fi - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/compute-cer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/compute-cer.py deleted file mode 100644 index a0a8f8fe1f59251c5d8fefeb62ef469276fc6063..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/compute-cer.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import sys -import unicodedata -import codecs - -remove_tag = True -spacelist = [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - # https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': - sep = '>' - j = i + 1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c == sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: - return '' - chars = [] - i = 0 - T = len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - if x.isalnum(): - for k in x: - new_sentence.append(k) - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i - 1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j - 1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, - 'ins': 0, 'del': 0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = 
result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i={i} , j={j} , \ - error={error}'. - format(i=i, j=j, error=self.space[i][j]['error'])) - return result - - def overall(self) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def cluster(self, data) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [unicodedata.name(char) for char in word] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names) - 1) : - if unicode_names[i] != unicode_names[i + 1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) \ - and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] \ - [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \ - [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose = 1 - padding_symbol = ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose = 0 - try: - verbose = int(b) - except Exception: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol = ' ' - elif b == 'underline': - padding_symbol = '_' - continue - if True or sys.argv[1].startswith('-'): - # ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig = set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, - case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : 
- if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array) == 0: - continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length - len_lab) - space['rec'].append(length - len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end=' ') - else: - print('lab:', end=' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['lab'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end=' ') - else: - print('rec:', end=' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['rec'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===================================================' - '========================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster(k for k in default_clusters[cluster_id]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 
100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif (token[0] == '<' and token[len(token) - 1] == '>' and - cluster_id == ''): - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('=======================================' - '====================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/compute-wer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/compute-wer.py deleted file mode 100644 index a3eefc0dc7b67f252e685da71a5189312e74ef85..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/compute-wer.py +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import re, sys, unicodedata -import codecs - -remove_tag = True -spacelist= [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - #https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - 
j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array)==0: continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) 
- else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===========================================================================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster([ k for k in default_clusters[cluster_id] ]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - 
print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif token[0] == '<' and token[len(token)-1] == '>' and \ - cluster_id == '' : - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('===========================================================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/compute_cmvn_stats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/compute_cmvn_stats.py deleted file mode 100644 index 9c89789c47be0c855939469e86040f10398e9d89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/compute_cmvn_stats.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys -import argparse -import json -import codecs -import yaml - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.utils.data import Dataset, DataLoader - -torchaudio.set_audio_backend("sox_io") - - -class CollateFunc(object): - ''' Collate function for AudioDataset - ''' - - def __init__(self, feat_dim, resample_rate): - self.feat_dim = feat_dim - self.resample_rate = resample_rate - pass - - def __call__(self, batch): - mean_stat = torch.zeros(self.feat_dim) - var_stat = torch.zeros(self.feat_dim) - number = 0 - for item in batch: - value = item[1].strip().split(",") - assert len(value) == 3 or len(value) == 1 - wav_path = value[0] - sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate - resample_rate = sample_rate - # len(value) == 3 means segmented wav.scp, - # len(value) == 1 means original wav.scp - if len(value) == 3: - start_frame = int(float(value[1]) * sample_rate) - end_frame = int(float(value[2]) * sample_rate) - waveform, sample_rate = torchaudio.backend.sox_io_backend.load( - filepath=wav_path, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(item[1]) - - waveform = waveform * (1 << 15) - if self.resample_rate != 0 and self.resample_rate != sample_rate: - resample_rate = self.resample_rate - waveform = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - - mat = kaldi.fbank(waveform, - num_mel_bins=self.feat_dim, - dither=0.0, - energy_floor=0.0, - sample_frequency=resample_rate) - mean_stat += torch.sum(mat, axis=0) - var_stat += torch.sum(torch.square(mat), axis=0) - number += mat.shape[0] - return number, mean_stat, var_stat - - -class AudioDataset(Dataset): - def __init__(self, data_file): - self.items = [] - with codecs.open(data_file, 'r', encoding='utf-8') as f: - for line in f: - arr = line.strip().split() - self.items.append((arr[0], arr[1])) - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - return self.items[idx] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='extract CMVN stats') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for processing') - parser.add_argument('--train_config', - default='', - help='training yaml conf') - parser.add_argument('--in_scp', default=None, help='wav scp file') - 
parser.add_argument('--out_cmvn', - default='global_cmvn', - help='global cmvn file') - - doc = "Print log after every log_interval audios are processed." - parser.add_argument("--log_interval", type=int, default=1000, help=doc) - args = parser.parse_args() - - with open(args.train_config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - resample_rate = 0 - if 'resample_conf' in configs['dataset_conf']: - resample_rate = configs['dataset_conf']['resample_conf']['resample_rate'] - print('using resample and new sample rate is {}'.format(resample_rate)) - - collate_func = CollateFunc(feat_dim, resample_rate) - dataset = AudioDataset(args.in_scp) - batch_size = 20 - data_loader = DataLoader(dataset, - batch_size=batch_size, - shuffle=True, - sampler=None, - num_workers=args.num_workers, - collate_fn=collate_func) - - with torch.no_grad(): - all_number = 0 - all_mean_stat = torch.zeros(feat_dim) - all_var_stat = torch.zeros(feat_dim) - wav_number = 0 - for i, batch in enumerate(data_loader): - number, mean_stat, var_stat = batch - all_mean_stat += mean_stat - all_var_stat += var_stat - all_number += number - wav_number += batch_size - - if wav_number % args.log_interval == 0: - print(f'processed {wav_number} wavs, {all_number} frames', - file=sys.stderr, - flush=True) - - cmvn_info = { - 'mean_stat': list(all_mean_stat.tolist()), - 'var_stat': list(all_var_stat.tolist()), - 'frame_num': all_number - } - - with open(args.out_cmvn, 'w') as fout: - fout.write(json.dumps(cmvn_info)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/compute_fbank_feats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/compute_fbank_feats.py deleted file mode 100644 index 4cc7dae54de6e8b24b14148bd3930d19b4d7b28c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/compute_fbank_feats.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging - -import torchaudio -import torchaudio.compliance.kaldi as kaldi - -import wenet.dataset.kaldi_io as kaldi_io - -# The "sox" backends are deprecated and will be removed in 0.9.0 release. 
-# So here we use sox_io backend -torchaudio.set_audio_backend("sox_io") - - -def parse_opts(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--num_mel_bins', - default=80, - type=int, - help='Number of triangular mel-frequency bins') - parser.add_argument('--frame_length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument('--frame_shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument('--dither', - type=int, - default=0.0, - help='Dithering constant (0.0 means no dither)') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_scp', help='wav scp file') - parser.add_argument('out_ark', help='output ark file') - parser.add_argument('out_scp', help='output scp file') - args = parser.parse_args() - return args - - -# wav format: -def load_wav_scp(wav_scp_file): - wav_list = [] - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_list.append((arr[0], arr[1])) - return wav_list - - -# wav format: -def load_wav_scp_dict(wav_scp_file): - wav_dict = {} - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_dict[arr[0]] = arr[1] - return wav_dict - - -# Segments format: -def load_wav_segments(wav_scp_file, segments_file): - wav_dict = load_wav_scp_dict(wav_scp_file) - audio_list = [] - with open(segments_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - key = arr[0] - wav_file = wav_dict[arr[1]] - start = float(arr[2]) - end = float(arr[3]) - audio_list.append((key, wav_file, start, end)) - return audio_list - - -if __name__ == '__main__': - args = parse_opts() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - if args.segments is None: - audio_list = load_wav_scp(args.wav_scp) - else: - audio_list = load_wav_segments(args.wav_scp, args.segments) - - count = 0 - with open(args.out_ark, 'wb') as ark_fout, \ - open(args.out_scp, 'w', encoding='utf8') as scp_fout: - for item in audio_list: - if len(item) == 2: - key, wav_path = item - waveform, sample_rate = torchaudio.load_wav(wav_path) - else: - assert len(item) == 4 - key, wav_path, start, end = item - sample_rate = torchaudio.info(wav_path).sample_rate - frame_offset = int(start * sample_rate) - num_frames = int((end - start) * sample_rate) - waveform, sample_rate = torchaudio.load_wav( - wav_path, frame_offset, num_frames) - - mat = kaldi.fbank(waveform, - num_mel_bins=args.num_mel_bins, - frame_length=args.frame_length, - frame_shift=args.frame_shift, - dither=args.dither, - energy_floor=0.0, - sample_frequency=sample_rate) - mat = mat.detach().numpy() - kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout) - count += 1 - if count % 10000 == 0: - logging.info('Progress {}/{}'.format(count, len(audio_list))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/copy_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/copy_data_dir.sh deleted file mode 100644 index ee880c4c3ca398a58a4e306467c639b0a76310bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/copy_data_dir.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins University 
(author: Daniel Povey)
-# Apache 2.0
-
-# This script operates on a directory, such as in data/train/,
-# that contains some subset of the following files:
-# feats.scp
-# wav.scp
-# vad.scp
-# spk2utt
-# utt2spk
-# text
-#
-# It copies to another directory, possibly adding a specified prefix or a suffix
-# to the utterance and/or speaker names. Note, the recording-ids stay the same.
-#
-
-
-# begin configuration section
-spk_prefix=
-utt_prefix=
-spk_suffix=
-utt_suffix=
-validate_opts= # should rarely be needed.
-# end configuration section
-
-. utils/parse_options.sh
-
-if [ $# != 2 ]; then
- echo "Usage: "
- echo " $0 [options] <srcdir> <destdir>"
- echo "e.g.:"
- echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1"
- echo "Options"
- echo " --spk-prefix=<prefix> # Prefix for speaker ids, default empty"
- echo " --utt-prefix=<prefix> # Prefix for utterance ids, default empty"
- echo " --spk-suffix=<suffix> # Suffix for speaker ids, default empty"
- echo " --utt-suffix=<suffix> # Suffix for utterance ids, default empty"
- exit 1;
-fi
-
-
-export LC_ALL=C
-
-srcdir=$1
-destdir=$2
-
-if [ ! -f $srcdir/utt2spk ]; then
- echo "copy_data_dir.sh: no such file $srcdir/utt2spk"
- exit 1;
-fi
-
-if [ "$destdir" == "$srcdir" ]; then
- echo "$0: this script requires <srcdir> and <destdir> to be different."
- exit 1
-fi
-
-set -e;
-
-mkdir -p $destdir
-
-cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map
-cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map
-
-if [ ! -f $srcdir/utt2uniq ]; then
- if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then
- cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq
- fi
-else
- cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq
-fi
-
-cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \
- utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk
-
-utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt
-
-if [ -f $srcdir/feats.scp ]; then
- utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp
-fi
-
-if [ -f $srcdir/vad.scp ]; then
- utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp
-fi
-
-if [ -f $srcdir/segments ]; then
- utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments
- cp $srcdir/wav.scp $destdir
-else # no segments->wav indexed by utt.
- if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/utt2num_frames ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in frame_shift stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -echo $validate_opts -echo $destdir -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/decode.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/decode.sh deleted file mode 100644 index 1d49b0e48631f4818fb9c464df66904170275a33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/decode.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: binbinzhang@mobvoi.com (Binbin Zhang) -export GLOG_logtostderr=1 -export GLOG_v=2 - -set -e - -nj=1 -chunk_size=-1 -ctc_weight=0.0 -reverse_weight=0.0 -rescoring_weight=1.0 -# For CTC WFST based decoding -fst_path= -dict_path= -acoustic_scale=1.0 -beam=15.0 -lattice_beam=12.0 -min_active=200 -max_active=7000 -blank_skip_thresh=1.0 -length_penalty=0.0 - -. tools/parse_options.sh || exit 1; -if [ $# != 5 ]; then - echo "Usage: $0 [options] " - exit 1; -fi - -if ! which decoder_main > /dev/null; then - echo "decoder_main is not built, please go to runtime/libtorch to build it." - exit 1; -fi - -scp=$1 -label_file=$2 -model_file=$3 -unit_file=$4 -dir=$5 - -mkdir -p $dir/split${nj} - -# Step 1. Split wav.scp -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp" -done -tools/data/split_scp.pl ${scp} ${split_scps} - -# Step 2. Parallel decoding -wfst_decode_opts= -if [ ! 
-z $fst_path ]; then - wfst_decode_opts="--fst_path $fst_path" - wfst_decode_opts="$wfst_decode_opts --beam $beam" - wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path" - wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam" - wfst_decode_opts="$wfst_decode_opts --max_active $max_active" - wfst_decode_opts="$wfst_decode_opts --min_active $min_active" - wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale" - wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh" - wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty" - echo $wfst_decode_opts > $dir/config -fi -for n in $(seq ${nj}); do -{ - decoder_main \ - --rescoring_weight $rescoring_weight \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --chunk_size $chunk_size \ - --wav_scp ${dir}/split${nj}/wav.${n}.scp \ - --model_path $model_file \ - --unit_path $unit_file \ - $wfst_decode_opts \ - --result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log -} & -done -wait - -# Step 3. Merge files -for n in $(seq ${nj}); do - cat ${dir}/split${nj}/${n}.text -done > ${dir}/text -tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf - -# Step 4. Compute WER -python3 tools/compute-wer.py --char=1 --v=1 \ - $label_file $dir/text > $dir/wer diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/feat_to_shape.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/feat_to_shape.sh deleted file mode 100644 index ab6d45c60709dd05a38f8da269d617233d0d39f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/feat_to_shape.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Begin configuration section. -nj=4 -cmd=run.pl -verbose=0 -filetype="" -preprocess_conf="" -# End configuration section. - -help_message=$(cat << EOF -Usage: $0 [options] [] -e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) - -echo "$0 $*" 1>&2 # Print the command line for logging - -. parse_options.sh || exit 1; - -if [ $# -lt 2 ] || [ $# -gt 3 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -scp=$1 -outscp=$2 -data=$(dirname ${scp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${logdir}/feats.${n}.scp" -done - -utils/split_scp.pl ${scp} ${split_scps} - -if [ -n "${preprocess_conf}" ]; then - preprocess_opt="--preprocess-conf ${preprocess_conf}" -else - preprocess_opt="" -fi -if [ -n "${filetype}" ]; then - filetype_opt="--filetype ${filetype}" -else - filetype_opt="" -fi - -${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ - feat-to-len --verbose=${verbose} \ - scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp - -feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -) - -# concatenate the .scp files together. 
-for n in $(seq ${nj}); do
- sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp
-done > ${outscp}
-
-rm -f ${logdir}/feats.*.scp 2>/dev/null
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/filter_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/filter_scp.pl
deleted file mode 100644
index b76d37f41be0886470281978bfacf97f6b8ae976..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/filter_scp.pl
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/usr/bin/env perl
-# Copyright 2010-2012 Microsoft Corporation
-# Johns Hopkins University (author: Daniel Povey)
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-# This script takes a list of utterance-ids or any file whose first field
-# of each line is an utterance-id, and filters an scp
-# file (or any file whose "n-th" field is an utterance id), printing
-# out only those lines whose "n-th" field is in id_list. The index of
-# the "n-th" field is 1, by default, but can be changed by using
-# the -f switch
-
-$exclude = 0;
-$field = 1;
-$shifted = 0;
-
-do {
- $shifted=0;
- if ($ARGV[0] eq "--exclude") {
- $exclude = 1;
- shift @ARGV;
- $shifted=1;
- }
- if ($ARGV[0] eq "-f") {
- $field = $ARGV[1];
- shift @ARGV; shift @ARGV;
- $shifted=1
- }
-} while ($shifted);
-
-if(@ARGV < 1 || @ARGV > 2) {
- die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" .
- "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
- "Note: only the first field of each line in id_list matters. With --exclude, prints\n" .
- "only the lines that were *not* in id_list.\n" .
- "Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
- "If your older scripts (written before Oct 2014) stopped working and you used the\n" .
- "-f option, add 1 to the argument.\n" .
- "See also: utils/filter_scp.pl .\n";
-}
-
-
-$idlist = shift @ARGV;
-open(F, "<$idlist") || die "Could not open id-list file $idlist";
-while(<F>) {
- @A = split;
- @A>=1 || die "Invalid id-list file line $_";
- $seen{$A[0]} = 1;
-}
-
-if ($field == 1) { # Treat this as special case, since it is common.
- while(<>) {
- $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
- # $1 is what we filter on.
- if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
- print $_;
- }
- }
-} else {
- while(<>) {
- @A = split;
- @A > 0 || die "Invalid scp file line $_";
- @A >= $field || die "Invalid scp file line $_";
- if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
- print $_;
- }
- }
-}
-
-# tests:
-# the following should print "foo 1"
-# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
-# the following should print "bar 2". 
-# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fix_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fix_data_dir.sh deleted file mode 100644 index d1644c1cac4264c78eae7d91b03c4126baf7ec4c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fix_data_dir.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# This script makes sure that only the segments present in -# all of "feats.scp", "wav.scp" [if present], segments [if present] -# text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into -# data-dir/.backup - -cmd="$@" - -utt_extra_files= -spk_extra_files= - -. tools/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: utils/data/fix_data_dir.sh " - echo "e.g.: utils/data/fix_data_dir.sh data/train" - echo "This script helps ensure that the various files in a data directory" - echo "are correctly sorted and filtered, for example removing utterances" - echo "that have no features (if feats.scp is present)" - exit 1 -fi - -data=$1 - -if [ -f $data/images.scp ]; then - image/fix_data_dir.sh $cmd - exit $? -fi - -mkdir -p $data/.backup - -[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; - -[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; - -set -e -o pipefail -u - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted { - file=$1 - sort -k1,1 -u <$file >$file.tmp - if ! cmp -s $file $file.tmp; then - echo "$0: file $1 is not in sorted order or not unique, sorting it" - mv $file.tmp $file - else - rm $file.tmp - fi -} - -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - check_sorted $data/$x - fi -done - - -function filter_file { - filter=$1 - file_to_filter=$2 - cp $file_to_filter ${file_to_filter}.tmp - tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter - if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=$(cat ${file_to_filter}.tmp | wc -l) - length2=$(cat ${file_to_filter} | wc -l) - if [ $length1 -ne $length2 ]; then - echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." - fi - fi - rm $file_to_filter.tmp -} - -function filter_recordings { - # We call this once before the stage when we filter on utterance-id, and once - # after. - - if [ -f $data/segments ]; then - # We have a segments file -> we need to filter this and the file wav.scp, and - # reco2file_and_utt, if it exists, to make sure they have the same list of - # recording-ids. - - if [ ! -f $data/wav.scp ]; then - echo "$0: $data/segments exists but not $data/wav.scp" - exit 1; - fi - awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=$(cat $tmpdir/recordings | wc -l) - [ ! -s $tmpdir/recordings ] && \ - echo "Empty list of recordings (bad file $data/segments)?" 
&& exit 1; - tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp - mv $tmpdir/recordings.tmp $tmpdir/recordings - - - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - filter_file $tmpdir/recordings $data/segments - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - rm $data/segments.tmp - - filter_file $tmpdir/recordings $data/wav.scp - [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur - true - fi -} - -function filter_speakers { - # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... - tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do - f=$data/$s - if [ -f $f ]; then - filter_file $f $tmpdir/speakers - fi - done - - filter_file $tmpdir/speakers $data/spk2utt - tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - - for s in cmvn.scp spk2gender $spk_extra_files; do - f=$data/$s - if [ -f $f ]; then - filter_file $tmpdir/speakers $f - fi - done -} - -function filter_utts { - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - echo "$(cat $tmpdir/utts | wc -l)" - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; - - ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; - - ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ - echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - - if [ -f $data/utt2uniq ]; then - ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ - echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; - fi - - maybe_wav= - maybe_reco2dur= - [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. - [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - - maybe_utt2dur= - if [ -f $data/utt2dur ]; then - cat $data/utt2dur | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1 - maybe_utt2dur=utt2dur.ok - fi - - maybe_utt2num_frames= - if [ -f $data/utt2num_frames ]; then - cat $data/utt2num_frames | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1 - maybe_utt2num_frames=utt2num_frames.ok - fi - - for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do - if [ -f $data/$x ]; then - tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp - echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)" - mv $tmpdir/utts.tmp $tmpdir/utts - # echo "$tmpdir/utts" - fi - done - rm $data/utt2dur.ok 2>/dev/null || true - rm $data/utt2num_frames.ok 2>/dev/null || true - - [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ - rm $tmpdir/utts && exit 1; - - - if [ -f $data/utt2spk ]; then - new_nutts=$(cat $tmpdir/utts | wc -l) - old_nutts=$(cat $data/utt2spk | wc -l) - if [ $new_nutts -ne $old_nutts ]; then - echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" - else - echo "fix_data_dir.sh: kept all $old_nutts utterances." 
- fi - fi - - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then - tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x - fi - fi - done - -} - -filter_recordings -filter_speakers -filter_utts -filter_speakers -filter_recordings - -tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - -echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/flake8_hook.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/flake8_hook.py deleted file mode 100644 index bbe21bf4aa8ab460aca0eba5a24785e4d6b2c39d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -import sys - -from flake8.main import git - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/format_data.sh deleted file mode 100644 index 51f4602dfa0bac7873541c7f621ef4bb9eb29c94..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/format_data.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Mobvoi Corporation (Author: Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -echo "$0 $*" >&2 # Print the command line for logging -. ./path.sh - -nj=1 -cmd=run.pl -nlsyms="" -lang="" -feat="" -feat_type="kaldi" -oov="" -bpecode="" -allow_one_column=false -raw="" -verbose=0 -trans_type=char -filetype="" -preprocess_conf="" -category="" -out="" # If omitted, write in stdout -help_message=$(cat << EOF -Usage: $0 -e.g. $0 data/train data/lang_1char/train_units.txt -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --feat # feat.scp or feat1.scp,feat2.scp,... - --feat-type # kaldi or wav - --oov # Default: - --out # If omitted, write in stdout - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) -. tools/parse_options.sh - -if [ $# != 2 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -dir=$1 -dic=$2 -tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) -#trap 'rm -rf ${tmpdir}' EXIT - -# 1. 
Create scp files for inputs -# These are not necessary for decoding mode, and make it as an option -input= -if [ -n "${feat}" ]; then - _feat_scps=$(echo "${feat}" | tr ',' ' ' ) - read -r -a feat_scps <<< $_feat_scps - num_feats=${#feat_scps[@]} - - for (( i=1; i<=num_feats; i++ )); do - feat=${feat_scps[$((i-1))]} - mkdir -p ${tmpdir}/input_${i} - input+="input_${i} " - cat ${feat} > ${tmpdir}/input_${i}/feat.scp - - # Dump in the "legacy" style JSON format - if [ -n "${filetype}" ]; then - awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ - > ${tmpdir}/input_${i}/filetype.scp - fi - - if [ ${feat_type} == "kaldi" ]; then - tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ - --filetype "${filetype}" \ - --preprocess-conf "${preprocess_conf}" \ - --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp - elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then - if [ -f $dir/segments ]; then - # used for segmented wav.scp - awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur - fi - if [ ! -f $dir/utt2dur ]; then - tools/wav_to_duration.sh --nj ${nj} \ - ${feat} ${tmpdir}/input_${i}/shape.scp - # use the existed utt2dur as shape.scp directly - else - cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp - fi - fi - done -fi - -# 2. Create scp files for outputs -mkdir -p ${tmpdir}/output -if [ -n "${bpecode}" ]; then - if [ "${trans_type}" == "cn_char_en_bpe" ]; then - tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp - else - paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \ - | tools/spm_encode --model=${bpecode} --output_format=piece) \ - > ${tmpdir}/output/token.scp - fi -elif [ -n "${nlsyms}" ]; then - tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -elif [ -n "${raw}" ]; then - cat $dir/text > ${tmpdir}/output/token.scp -else - tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -fi -< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp -odim=$(cat ${dic} | wc -l) -< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp - -cat ${dir}/text > ${tmpdir}/output/text.scp - -# 3. Create scp files for the others -mkdir -p ${tmpdir}/other -if [ -n "${lang}" ]; then - awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp -fi - -if [ -n "${category}" ]; then - awk -v category=${category} '{print $1 " " category}' ${dir}/text \ - > ${tmpdir}/other/category.scp -fi -#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp - -# 4. 
Merge scp files into a one file -opts="" -for intype in ${input} output other; do - if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then - continue - fi - - if [ ${intype} != other ]; then - opts+="--${intype%_*}-scps " - else - opts+="--scps " - fi - - for x in "${tmpdir}/${intype}"/*.scp; do - k=$(basename ${x} .scp) - if [ ${k} = shape ]; then - opts+="shape:${x}:shape " - else - opts+="${k}:${x} " - fi - done -done - -if ${allow_one_column}; then - opts+="--allow-one-column true " -else - opts+="--allow-one-column false " -fi - -if [ -n "${out}" ]; then - opts+="-O ${out}" -fi - -tools/merge_scp2txt.py --verbose ${verbose} ${opts} - -#rm -fr ${tmpdir} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/add_lex_disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/add_lex_disambig.pl deleted file mode 100644 index dd8a25de6e1140a6d19b1e876f2e76f528532edf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/add_lex_disambig.pl +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2013-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. -# With the --pron-probs option, expects the second field -# of each lexicon line to be a pron-prob. -# With the --sil-probs option, expects three additional -# fields after the pron-prob, representing various components -# of the silence probability model. - -$pron_probs = 0; -$sil_probs = 0; -$first_allowed_disambig = 1; - -for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { - if ($ARGV[0] eq "--pron-probs") { - $pron_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--sil-probs") { - $sil_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--first-allowed-disambig") { - $first_allowed_disambig = 0 + $ARGV[1]; - if ($first_allowed_disambig < 1) { - die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; - } - shift @ARGV; - shift @ARGV; - } -} - -if (@ARGV != 2) { - die "Usage: add_lex_disambig.pl [opts] \n" . - "This script adds disambiguation symbols to a lexicon in order to\n" . - "make decoding graphs determinizable; it adds pseudo-phone\n" . - "disambiguation symbols #1, #2 and so on at the ends of phones\n" . - "to ensure that all pronunciations are different, and that none\n" . - "is a prefix of another.\n" . - "It prints to the standard output the number of the largest-numbered" . - "disambiguation symbol that was used.\n" . - "\n" . 
- "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . - " --sil-probs [should be with --pron-probs option]\n" . - " Expect 3 extra fields after the pron-probs, for aspects of\n" . - " the silence probability model\n" . - " --first-allowed-disambig The number of the first disambiguation symbol\n" . - " that this script is allowed to add. By default this is\n" . - " #1, but you can set this to a larger value using this option.\n" . - "e.g.:\n" . - " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { - $p = shift @A; - if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } - } - if ($sil_probs) { - $silp = shift @A; - if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - } - if (!(@A)) { - die "Bad lexicon line $1, no phone in phone list"; - } - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that it exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { shift @A; } # remove pron-prob. - if ($sil_probs) { - shift @A; # Remove silprob - shift @A; # Remove silprob - } - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -# max_disambig will always be the highest-numbered disambiguation symbol that -# has been used so far. -$max_disambig = $first_allowed_disambig - 1; - -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - if ($pron_probs) { - $pron_prob = shift @A; - } - if ($sil_probs) { - $sil_word_prob = shift @A; - $word_sil_correction = shift @A; - $prev_nonsil_correction = shift @A - } - $phnseq = join(" ", @A); - if (!defined $issubseq{$phnseq} - && $count{$phnseq} == 1) { - ; # Do nothing. - } else { - if ($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved_for_the_empty_string{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; - if (!defined $cur_disambig) { - $cur_disambig = $first_allowed_disambig; - } else { - $cur_disambig++; # Get a number that has not been used yet for - # this phone sequence. 
- } - while (defined $reserved_for_the_empty_string{$cur_disambig}) { - $cur_disambig++; - } - if ($cur_disambig > $max_disambig) { - $max_disambig = $cur_disambig; - } - $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; - $phnseq = $phnseq . " #" . $cur_disambig; - } - } - if ($pron_probs) { - if ($sil_probs) { - print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; - } else { - print O "$word\t$pron_prob\t$phnseq\n"; - } - } else { - print O "$word\t$phnseq\n"; - } -} - -print $max_disambig . "\n"; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/compile_lexicon_token_fst.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/compile_lexicon_token_fst.sh deleted file mode 100644 index b67814fe3f3244b14b8e494bfe46c4829c4f8bd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/compile_lexicon_token_fst.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -# Copyright 2015 Yajie Miao (Carnegie Mellon University) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the -# phoneme and character-based lexicons. -set -eo pipefail -. tools/parse_options.sh - -if [ $# -ne 3 ]; then - echo "usage: tools/fst/compile_lexicon_token_fst.sh " - echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" - echo " should contain the following files:" - echo "lexicon.txt units.txt" - echo "options: " - exit 1; -fi - -srcdir=$1 -tmpdir=$2 -dir=$3 -mkdir -p $dir $tmpdir - -[ -f path.sh ] && . ./path.sh - -export LC_ALL=C - -cp $srcdir/units.txt $dir - -# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. -# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. -perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; - -# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. -# Without these symbols, determinization will fail. -ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` -ndisambig=$[$ndisambig+1]; - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list - -# Get the full list of CTC tokens used in FST. These tokens include , the blank , -# the actual model unit, and the disambiguation symbols. 
-cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list -(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt - -# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, -# so here use ctc_token_fst_compact -tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; - -# Encode the words with indices. Will be used in lexicon and language model FST compiling. -cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' - BEGIN { - print " 0"; - } - { - printf("%s %d\n", $1, NR); - } - END { - printf("#0 %d\n", NR+1); - printf(" %d\n", NR+2); - printf(" %d\n", NR+3); - }' > $dir/words.txt || exit 1; - -# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time. -token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` - -tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ - fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; - -echo "Lexicon and token FSTs compiling succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/ctc_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/ctc_token_fst.py deleted file mode 100644 index d81644b9cd216177a10a17772781d3293abe084f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/ctc_token_fst.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 1 ') -print('1 1 ') -print('2 2 ') -print('2 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 3 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(1, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 2, '', '')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/ctc_token_fst_compact.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/ctc_token_fst_compact.py deleted file mode 100644 index d3018d8b14ce25108cb1acc637cecded5d41be13..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/ctc_token_fst_compact.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 1 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 0, '', '')) - node 
+= 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/ctc_token_fst_corrected.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/ctc_token_fst_corrected.py deleted file mode 100644 index 81f7079eccb9e6447c46cdfdf6378aca7efe4a09..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/ctc_token_fst_corrected.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python - -import sys - - -def il(n): - return n + 1 - - -def ol(n): - return n + 1 - - -def s(n): - return n - - -if __name__ == "__main__": - with open(sys.argv[1]) as f: - lines = f.readlines() - phone_count = 0 - disambig_count = 0 - for line in lines: - sp = line.split() - phone = sp[0] - if phone == '' or phone == '': - continue - if phone.startswith('#'): - disambig_count += 1 - else: - phone_count += 1 - - # 1. add start state - print('0 0 {} 0'.format(il(0))) - - # 2. 0 -> i, i -> i, i -> 0 - for i in range(1, phone_count + 1): - print('0 {} {} {}'.format(s(i), il(i), ol(i))) - print('{} {} {} 0'.format(s(i), s(i), il(i))) - print('{} 0 {} 0'.format(s(i), il(0))) - - # 3. i -> other phone - for i in range(1, phone_count + 1): - for j in range(1, phone_count + 1): - if i != j: - print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) - - # 4. add disambiguous arcs on every final state - for i in range(0, phone_count + 1): - for j in range(phone_count + 2, phone_count + disambig_count + 2): - print('{} {} {} {}'.format(s(i), s(i), 0, j)) - - # 5. every i is final state - for i in range(0, phone_count + 1): - print(s(i)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/eps2disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/eps2disambig.pl deleted file mode 100644 index e1d84a6bf56703596a0e4552d184f7168f724bcb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/eps2disambig.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - if (/\s+#0\s+/) { - print STDERR "$0: ERROR: LM has word #0, " . 
- "which is reserved as disambiguation symbol\n"; - exit 1; - } - s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; - print; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/make_lexicon_fst.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/make_lexicon_fst.pl deleted file mode 100644 index f97129c05cb3ba6460be401e92001261acfaf746..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/make_lexicon_fst.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). - -$pron_probs = 0; - -if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { - $pron_probs = 1; - shift @ARGV; -} - -if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; - print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; - print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; - print STDERR " word phone1 phone2 ... phoneN;\n"; - print STDERR "if the --pron-probs option is used, each line is:\n"; - print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; - print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; - print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; - print STDERR "this is your responsibility.\n\n"; - print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; - print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; - print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; - exit(1); -} - -$lexfn = shift @ARGV; -if (@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2) { - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if ($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - -if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. - while () { - @A = split(" ", $_); - @A == 0 && die "Empty lexicon line."; - foreach $a (@A) { - if ($a eq "") { - die "Bad lexicon line $_ ( is forbidden)"; - } - } - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! 
defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; # so we only print it on the first arc of the word. - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. - if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while () { - @A = split(" ", $_); - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. - $s = $ns; - } elsif (!defined($silphone) || $p ne $silphone) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/make_tlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/make_tlg.sh deleted file mode 100644 index 98694e5540968760f0c27eaf30a6668f4c46c50d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/make_tlg.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# - -if [ -f path.sh ]; then . path.sh; fi - -lm_dir=$1 -src_lang=$2 -tgt_lang=$3 - -arpa_lm=${lm_dir}/lm.arpa -[ ! 
-f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -rm -rf $tgt_lang -cp -r $src_lang $tgt_lang - -# Compose the language model to FST -cat $arpa_lm | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v -i '' | \ - grep -v -i '' | \ - arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \ - tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \ - --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst - - -echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic $tgt_lang/G.fst - -# Compose the token, lexicon and language-model FST into the final decoding graph -fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1; -fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1; - -echo "Composing decoding graph TLG.fst succeeded" -#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/prepare_dict.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/prepare_dict.py deleted file mode 100644 index 8a6a3cfe7cfded0c863637deef0bae2f2ede5557..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/prepare_dict.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -# sys.argv[1]: e2e model unit file(lang_char.txt) -# sys.argv[2]: raw lexicon file -# sys.argv[3]: output lexicon file -# sys.argv[4]: bpemodel - -unit_table = set() -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for line in fin: - unit = line.split()[0] - unit_table.add(unit) - - -def contain_oov(units): - for unit in units: - if unit not in unit_table: - return True - return False - - -bpemode = len(sys.argv) > 4 -if bpemode: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.Load(sys.argv[4]) -lexicon_table = set() -with open(sys.argv[2], 'r', encoding='utf8') as fin, \ - open(sys.argv[3], 'w', encoding='utf8') as fout: - for line in fin: - word = line.split()[0] - if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel - continue - elif word == '': - continue - else: - # each word only has one pronunciation for e2e system - if word in lexicon_table: - continue - if bpemode: - # We assume that the lexicon does not contain code-switch, - # i.e. the word contains both English and Chinese. - # see PR https://github.com/wenet-e2e/wenet/pull/1693 - # and Issue https://github.com/wenet-e2e/wenet/issues/1653 - if word.encode('utf8').isalpha(): - pieces = sp.EncodeAsPieces(word) - else: - pieces = word - if contain_oov(pieces): - print( - 'Ignoring words {}, which contains oov unit'.format( - ''.join(word).strip('▁')) - ) - continue - chars = ' '.join( - [p if p in unit_table else '' for p in pieces]) - else: - # ignore words with OOV - if contain_oov(word): - print('Ignoring words {}, which contains oov unit'.format(word)) - continue - # Optional, append ▁ in front of english word - # we assume the model unit of our e2e system is char now. 
- if word.encode('utf8').isalpha() and '▁' in unit_table: - word = '▁' + word - chars = ' '.join(word) # word is a char list - fout.write('{} {}\n'.format(word, chars)) - lexicon_table.add(word) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/remove_oovs.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/remove_oovs.pl deleted file mode 100644 index ac914c3bd9363eded791cdeb309fd05e980c4f2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/remove_oovs.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script removes lines that contain these OOVs on either the -# third or fourth fields of the line. It is intended to remove arcs -# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). - -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; -} - -$unklist = shift @ARGV; -open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; -while(){ - @A = split(" ", $_); - @A == 1 || die "Bad line in unknown-symbol list: $_"; - $unk{$A[0]} = 1; -} - -$num_removed = 0; -while(<>){ - @A = split(" ", $_); - if(defined $unk{$A[2]} || defined $unk{$A[3]}) { - $num_removed++; - } else { - print; - } -} -print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/rnnt_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/rnnt_token_fst.py deleted file mode 100644 index cc6def1703311ab700a4a01f22c1adda32db9b0d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/rnnt_token_fst.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, 0, phone, phone)) -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/s2eps.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/s2eps.pl deleted file mode 100644 index ffeeb8eb6af3c4f319f31ebff80be388d8f59e1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/fst/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you 
may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces and with (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } - if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } - } - print join("\t", @A) . "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/git-pre-commit b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/git-pre-commit deleted file mode 100644 index b6e448ed375a0ddf502ce332685de8a99e88dc08..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/git-pre-commit +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -e - -echo "Running pre-commit flake8" -python tools/flake8_hook.py diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/install_srilm.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/install_srilm.sh deleted file mode 100644 index 4aa113c14722a73fd3d3f84430025d44173c207b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/install_srilm.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2022 Binbin Zhang(binbzha@qq.com) - -current_path=`pwd` -current_dir=`basename "$current_path"` - -if [ "tools" != "$current_dir" ]; then - echo "You should run this script in tools/ directory!!" - exit 1 -fi - -! command -v gawk > /dev/null && \ - echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; - -srilm_url="https://github.com/BitSpeech/SRILM/archive/refs/tags/1.7.3.tar.gz" - -if [ ! -f ./srilm.tar.gz ]; then - if ! wget -O ./srilm.tar.gz "$srilm_url"; then - echo 'There was a problem downloading the file.' - echo 'Check you internet connection and try again.' - exit 1 - fi -fi - -tar -zxvf srilm.tar.gz -mv SRILM-1.7.3 srilm - -# set the SRILM variable in the top-level Makefile to this directory. -cd srilm -cp Makefile tmpf - -cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \ - > Makefile || exit 1 -rm tmpf - -make || exit -cd .. - -( - [ ! -z "${SRILM}" ] && \ - echo >&2 "SRILM variable is aleady defined. Undefining..." && \ - unset SRILM - - [ -f ./env.sh ] && . ./env.sh - - [ ! 
-z "${SRILM}" ] && \ - echo >&2 "SRILM config is already in env.sh" && exit - - wd=`pwd` - wd=`readlink -f $wd || pwd` - - echo "export SRILM=$wd/srilm" - dirs="\${PATH}" - for directory in $(cd srilm && find bin -type d ) ; do - dirs="$dirs:\${SRILM}/$directory" - done - echo "export PATH=$dirs" -) >> env.sh - -echo >&2 "Installation of SRILM finished successfully" -echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/k2/make_hlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/k2/make_hlg.sh deleted file mode 100644 index 18c2268487410824ae11b199cf06f37acd717c88..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/k2/make_hlg.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) - -lexion_dir=$1 -lm_dir=$2 -tgt_dir=$3 - -# k2 and icefall updates very fast. Below commits are veryfied in this script. -# k2 3dc222f981b9fdbc8061b3782c3b385514a2d444, icefall 499ac24ecba64f687ff244c7d66baa5c222ecf0f - -# For k2 installation, please refer to https://github.com/k2-fsa/k2/ -python -c "import k2; print(k2.__file__)" -python -c "import torch; import _k2; print(_k2.__file__)" - -# Prepare necessary icefall scripts -if [ ! -d tools/k2/icefall ]; then - git clone --depth 1 https://github.com/k2-fsa/icefall.git tools/k2/icefall -fi -pip3 install -r tools/k2/icefall/requirements.txt -export PYTHONPATH=`pwd`/tools/k2/icefall:`pwd`/tools/k2/icefall/egs/aishell/ASR/local:$PYTHONPATH - -# 8.1 Prepare char based lang -mkdir -p $tgt_dir -python tools/k2/prepare_char.py $lexion_dir/units.txt $lm_dir/wordlist $tgt_dir -echo "Compile lexicon L.pt L_disambig.pt succeeded" - -# 8.2 Prepare G -mkdir -p data/lm -python -m kaldilm \ - --read-symbol-table="$tgt_dir/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $lm_dir/lm.arpa > data/lm/G_3_gram.fst.txt - -# 8.3 Compile HLG -python tools/k2/icefall/egs/aishell/ASR/local/compile_hlg.py --lang-dir $tgt_dir -echo "Compile decoding graph HLG.pt succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/k2/prepare_char.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/k2/prepare_char.py deleted file mode 100644 index 6e05042c42eb280135f6be7cdb3566b185258b90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/k2/prepare_char.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -""" - -This script generates the following files in the directory sys.argv[3]: - - - lexicon.txt - - lexicon_disambig.txt - - L.pt - - L_disambig.pt - - tokens.txt - - words.txt -""" - -import sys -from pathlib import Path -from typing import Dict, List - -import k2 -import torch -from prepare_lang import ( - Lexicon, - add_disambig_symbols, - add_self_loops, - write_lexicon, - write_mapping, -) - - -def lexicon_to_fst_no_sil( - lexicon: Lexicon, - token2id: Dict[str, int], - word2id: Dict[str, int], - need_self_loops: bool = False, -) -> k2.Fsa: - """Convert a lexicon to an FST (in k2 format). - - Args: - lexicon: - The input lexicon. See also :func:`read_lexicon` - token2id: - A dict mapping tokens to IDs. - word2id: - A dict mapping words to IDs. - need_self_loops: - If True, add self-loop to states with non-epsilon output symbols - on at least one arc out of the state. The input label for this - self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. - Returns: - Return an instance of `k2.Fsa` representing the given lexicon. - """ - loop_state = 0 # words enter and leave from here - next_state = 1 # the next un-allocated state, will be incremented as we go - - arcs = [] - - # The blank symbol is defined in local/train_bpe_model.py - assert token2id[""] == 0 - assert word2id[""] == 0 - - eps = 0 - - for word, pieces in lexicon: - assert len(pieces) > 0, f"{word} has no pronunciations" - cur_state = loop_state - - word = word2id[word] - pieces = [ - token2id[i] if i in token2id else token2id[""] for i in pieces - ] - - for i in range(len(pieces) - 1): - w = word if i == 0 else eps - arcs.append([cur_state, next_state, pieces[i], w, 0]) - - cur_state = next_state - next_state += 1 - - # now for the last piece of this word - i = len(pieces) - 1 - w = word if i == 0 else eps - arcs.append([cur_state, loop_state, pieces[i], w, 0]) - - if need_self_loops: - disambig_token = token2id["#0"] - disambig_word = word2id["#0"] - arcs = add_self_loops( - arcs, - disambig_token=disambig_token, - disambig_word=disambig_word, - ) - - final_state = next_state - arcs.append([loop_state, final_state, -1, -1, 0]) - arcs.append([final_state]) - - arcs = sorted(arcs, key=lambda arc: arc[0]) - arcs = [[str(i) for i in arc] for arc in arcs] - arcs = [" ".join(arc) for arc in arcs] - arcs = "\n".join(arcs) - - fsa = k2.Fsa.from_str(arcs, acceptor=False) - return fsa - - -def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool: - """Check if all the given tokens are in token symbol table. - - Args: - token_sym_table: - Token symbol table that contains all the valid tokens. - tokens: - A list of tokens. - Returns: - Return True if there is any token not in the token_sym_table, - otherwise False. - """ - for tok in tokens: - if tok not in token_sym_table: - return True - return False - - -def generate_lexicon( - token_sym_table: Dict[str, int], words: List[str] -) -> Lexicon: - """Generate a lexicon from a word list and token_sym_table. - - Args: - token_sym_table: - Token symbol table that mapping token to token ids. - words: - A list of strings representing words. - Returns: - Return a dict whose keys are words and values are the corresponding - tokens. 
- """ - lexicon = [] - for word in words: - chars = list(word.strip(" \t")) - if contain_oov(token_sym_table, chars): - continue - lexicon.append((word, chars)) - - # The OOV word is - lexicon.append(("", [""])) - return lexicon - - -def generate_tokens(text_file: str) -> Dict[str, int]: - """Generate tokens from the given text file. - - Args: - text_file: - A file that contains text lines to generate tokens. - Returns: - Return a dict whose keys are tokens and values are token ids ranged - from 0 to len(keys) - 1. - """ - token2id: Dict[str, int] = dict() - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - char, index = line.replace('\n', '').split() - assert char not in token2id - token2id[char] = int(index) - assert token2id[''] == 0 - return token2id - - -def generate_words(text_file: str) -> Dict[str, int]: - """Generate words from the given text file. - - Args: - text_file: - A file that contains text lines to generate words. - Returns: - Return a dict whose keys are words and values are words ids ranged - from 0 to len(keys) - 1. - """ - words = [] - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - word = line.replace('\n', '') - assert word not in words - words.append(word) - words.sort() - - # We put '' '' at begining of word2id - # '#0', '', '' at end of word2id - words = [word for word in words - if word not in ['', '', '#0', '', '']] - words.insert(0, '') - words.insert(1, '') - words.append('#0') - words.append('') - words.append('') - word2id = {j: i for i, j in enumerate(words)} - return word2id - - -def main(): - token2id = generate_tokens(sys.argv[1]) - word2id = generate_words(sys.argv[2]) - tgt_dir = Path(sys.argv[3]) - - words = [word for word in word2id.keys() - if word not in - ["", "!SIL", "", "", "#0", "", ""]] - lexicon = generate_lexicon(token2id, words) - - lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) - next_token_id = max(token2id.values()) + 1 - for i in range(max_disambig + 1): - disambig = f"#{i}" - assert disambig not in token2id - token2id[disambig] = next_token_id - next_token_id += 1 - - write_mapping(tgt_dir / "tokens.txt", token2id) - write_mapping(tgt_dir / "words.txt", word2id) - write_lexicon(tgt_dir / "lexicon.txt", lexicon) - write_lexicon(tgt_dir / "lexicon_disambig.txt", lexicon_disambig) - - L = lexicon_to_fst_no_sil( - lexicon, - token2id=token2id, - word2id=word2id, - ) - L_disambig = lexicon_to_fst_no_sil( - lexicon_disambig, - token2id=token2id, - word2id=word2id, - need_self_loops=True, - ) - torch.save(L.as_dict(), tgt_dir / "L.pt") - torch.save(L_disambig.as_dict(), tgt_dir / "L_disambig.pt") - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/latency_metrics.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/latency_metrics.py deleted file mode 100644 index df2d8eee45f8e2d7c8536f208d44fafaeac3341f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/latency_metrics.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2022 Horizon Inc. (author: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import argparse -import logging -import librosa -import torch -import torchaudio -import yaml - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.font_manager as fm -import torchaudio.compliance.kaldi as kaldi - -from wenet.utils.init_model import init_model -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.mask import make_pad_mask -from wenet.utils.common import replace_duplicates_with_blank - - -def get_args(): - parser = argparse.ArgumentParser( - description='Analyze latency and plot CTC-Spike.') - parser.add_argument('--config', required=True, - type=str, help='configration') - parser.add_argument('--gpu', - type=int, - default=0, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--ckpt', required=True, - type=str, help='model checkpoint') - parser.add_argument('--tag', required=True, - type=str, help='image subtitle') - parser.add_argument('--wavscp', required=True, - type=str, help='wav.scp') - parser.add_argument('--alignment', required=True, - type=str, help='force alignment, generated by Kaldi.') - parser.add_argument('--chunk_size', required=True, - type=int, help='chunk size') - parser.add_argument('--left_chunks', default=-1, - type=int, help='left chunks') - parser.add_argument('--font', required=True, - type=str, help='font file') - parser.add_argument('--dict', required=True, - type=str, help='dict file') - parser.add_argument('--result_dir', required=True, - type=str, help='saving pdf') - parser.add_argument('--model_type', default='ctc', - choices=['ctc', 'transducer'], - help='show latency metrics from ctc models or rnn-t models') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - torch.manual_seed(777) - - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - symbol_table = read_symbol_table(args.dict) - char_dict = {v: k for k, v in symbol_table.items()} - - # 1. Load model - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - - model = init_model(conf) - load_checkpoint(model, args.ckpt) - model = model.eval().to(device) - - subsampling = model.encoder.embed.subsampling_rate - eos = model.eos_symbol() - - with open(args.wavscp, 'r') as fin: - wavs = fin.readlines() - - # 2. 
Forward model (get streaming_timestamps) - timestamps = {} - for idx, wav in enumerate(wavs): - if idx % 100 == 0: - logging.info("processed {}.".format(idx)) - key, wav = wav.strip().split(' ', 1) - waveform, sr = torchaudio.load(wav) - resample_rate = conf['dataset_conf']['resample_conf']['resample_rate'] - waveform = torchaudio.transforms.Resample( - orig_freq=sr, new_freq=resample_rate)(waveform) - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank( - waveform, - num_mel_bins=conf['dataset_conf']['fbank_conf']['num_mel_bins'], - frame_length=conf['dataset_conf']['fbank_conf']['frame_length'], - frame_shift=conf['dataset_conf']['fbank_conf']['frame_shift'], - dither=0.0, energy_floor=0.0, - sample_frequency=resample_rate, - ) - - speech = mat.unsqueeze(0).to(device) - speech_lengths = torch.tensor([mat.size(0)]).to(device) - - # Let's assume batch_size = 1 - encoder_out, encoder_mask = model.encoder( - speech, speech_lengths, args.chunk_size, args.left_chunks) - - maxlen = encoder_out.size(1) # (B, maxlen, encoder_dim) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # CTC greedy search - if args.model_type == 'ctc': - ctc_probs = model.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - topk_prob = topk_prob.view(1, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen) - topk_prob = topk_prob.masked_fill_(mask, 0.0) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - hyps = [replace_duplicates_with_blank(hyp) for hyp in hyps] - scores = [prob.tolist() for prob in topk_prob] - timestamps[key] = [hyps[0], scores[0], wav] - - if args.model_type == 'transducer': - hyps = [] - scores = [] - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = 1 - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, - padding, cache) - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - scores.append(torch.max(joint_out_probs).item()) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or \ - per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - hyps.append(model.blank) - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - timestamps[key] = [hyps, scores, wav] - - # 3. 
Analyze latency - with open(args.alignment, 'r') as fin: - aligns = fin.readlines() - not_found, len_unequal, ignored = 0, 0, 0 - datas = [] - for align in aligns: - key, align = align.strip().split(' ', 1) - if key not in timestamps: - not_found += 1 - continue - fa, st = [], [] # force_alignment, streaming_timestamps - text_fa, text_st = "", "" - for i, token in enumerate(align.split()): - if token != '': - text_fa += token - # NOTE(xcsong): W/O subsample - fa.append(i * 10) - # ignore alignment_errors >= 70ms - frames_fa = len(align.split()) - frames_st = len(timestamps[key][0]) * subsampling - if abs(frames_st - frames_fa) >= 7: - ignored += 1 - continue - for i, token_id in enumerate(timestamps[key][0]): - if token_id != 0: - text_st += char_dict[token_id] - # NOTE(xcsong): W subsample - st.append(i * subsampling * 10) - if len(fa) != len(st): - len_unequal += 1 - continue - # datas[i] = [key, text_fa, text_st, list_of_diff, - # FirstTokenDelay, LastTokenDelay, AvgTokenDelay, - # streaming_timestamps, force_alignment] - datas.append([key, text_fa, text_st, - [a - b for a, b in zip(st, fa)], - st[0] - fa[0], st[-1] - fa[-1], - (sum(st) - sum(fa)) / len(st), - timestamps[key], align.split()]) - - logging.info("not found: {}, length unequal: {}, ignored: {}, \ - valid samples: {}".format(not_found, len_unequal, ignored, len(datas))) - - # 4. Plot and print - num_datas = len(datas) - names = ['FirstTokenDelay', 'LastTokenDelay', 'AvgTokenDelay'] - names_index = [4, 5, 6] - parts = ['max', 'P90', 'P75', 'P50', 'P25', 'min'] - parts_index = [num_datas - 1, int(num_datas * 0.90), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - for name, name_idx in zip(names, names_index): - def f(name_idx=name_idx): - return name_idx - datas.sort(key=lambda x: x[f()]) - logging.info("==========================") - for p, i in zip(parts, parts_index): - data = datas[i] - # i.e., LastTokenDelay P90: 270.000 ms (wav_id: BAC009S0902W0144) - logging.info("{} {}: {:.3f} ms (wav_id: {})".format( - name, p, data[f()], datas[i][0])) - - font = fm.FontProperties(fname=args.font) - plt.rcParams['axes.unicode_minus'] = False - # we will have 2 sub-plots (force-align + streaming timestamps) - # plus one wav-plot - fig, axes = plt.subplots(figsize=(60, 60), nrows=3, ncols=1) - for j in range(2): - if j == 0: - # subplot-0: streaming_timestamps - plt_prefix = args.tag + "_" + name + "_" + p - x = np.arange(len(data[7][0])) * subsampling - hyps, scores = data[7][0], data[7][1] - else: - # subplot-1: force_alignments - plt_prefix = "force_alignment" - x = np.arange(len(data[8])) - hyps = [symbol_table[d] for d in data[8]] - scores = [0.0] * len(data[8]) - axes[j].set_title(plt_prefix, fontsize=30) - for frame, token, prob in zip(x, hyps, scores): - if char_dict[token] != '': - axes[j].bar( - frame, np.exp(prob), - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].text( - frame, np.exp(prob), - '{} {:.3f} {}'.format( - char_dict[token], np.exp(prob), frame), - fontdict=dict(fontsize=24), - fontproperties=font, - ) - else: - axes[j].bar( - frame, 0.01, - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].tick_params(labelsize=25) - - # subplot-2: wav - # wav, hardcode sample_rate to 16000 - samples, sr = librosa.load(data[7][2], sr=16000) - time = np.arange(0, len(samples)) * (1.0 / sr) - axes[-1].plot(time, samples) - - # i.e., RESULT_DIR/LTD_P90_120ms_BAC009S0768W0342.pdf - plt.savefig(args.result_dir + "/" + name + "_" + - p + "_" + str(data[f()]) 
+ "ms" + "_" + data[0] + ".pdf") - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/make_raw_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/make_raw_list.py deleted file mode 100644 index 2f84f015542bb38da027b8ea61e8638f873cec33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/make_raw_list.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('output_file', help='output list file') - args = parser.parse_args() - - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - if args.segments is not None: - segments_table = {} - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - with open(args.text_file, 'r', encoding='utf8') as fin, \ - open(args.output_file, 'w', encoding='utf8') as fout: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if args.segments is None: - assert key in wav_table - wav = wav_table[key] - line = dict(key=key, wav=wav, txt=txt) - else: - assert key in segments_table - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - line = dict(key=key, wav=wav, txt=txt, start=start, end=end) - json_line = json.dumps(line, ensure_ascii=False) - fout.write(json_line + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/make_shard_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/make_shard_list.py deleted file mode 100644 index 1f7d82829808c9cc181bbc5e0f60cccef8795bae..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/make_shard_list.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import io -import logging -import os -import tarfile -import time -import multiprocessing - -import torch -import torchaudio -import torchaudio.backend.sox_io_backend as sox - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def write_tar_file(data_list, - no_segments, - tar_file, - resample=16000, - index=0, - total=1): - logging.info('Processing {} {}/{}'.format(tar_file, index, total)) - read_time = 0.0 - save_time = 0.0 - write_time = 0.0 - with tarfile.open(tar_file, "w") as tar: - prev_wav = None - for item in data_list: - if no_segments: - key, txt, wav = item - else: - key, txt, wav, start, end = item - - suffix = wav.split('.')[-1] - assert suffix in AUDIO_FORMAT_SETS - if no_segments: - ts = time.time() - with open(wav, 'rb') as fin: - data = fin.read() - read_time += (time.time() - ts) - else: - if wav != prev_wav: - ts = time.time() - waveforms, sample_rate = sox.load(wav, normalize=False) - read_time += (time.time() - ts) - prev_wav = wav - start = int(start * sample_rate) - end = int(end * sample_rate) - audio = waveforms[:1, start:end] - - # resample - if sample_rate != resample: - if not audio.is_floating_point(): - # normalize the audio before resample - # because resample can't process int audio - audio = audio / (1 << 15) - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - audio = (audio * (1 << 15)).short() - else: - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - - ts = time.time() - f = io.BytesIO() - sox.save(f, audio, resample, format="wav", bits_per_sample=16) - # Save to wav for segments file - suffix = "wav" - f.seek(0) - data = f.read() - save_time += (time.time() - ts) - - assert isinstance(txt, str) - ts = time.time() - txt_file = key + '.txt' - txt = txt.encode('utf8') - txt_data = io.BytesIO(txt) - txt_info = tarfile.TarInfo(txt_file) - txt_info.size = len(txt) - tar.addfile(txt_info, txt_data) - - wav_file = key + '.' 
+ suffix - wav_data = io.BytesIO(data) - wav_info = tarfile.TarInfo(wav_file) - wav_info.size = len(data) - tar.addfile(wav_info, wav_data) - write_time += (time.time() - ts) - logging.info('read {} save {} write {}'.format(read_time, save_time, - write_time)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--num_utts_per_shard', - type=int, - default=1000, - help='num utts per shard') - parser.add_argument('--num_threads', - type=int, - default=1, - help='num threads for make shards') - parser.add_argument('--prefix', - default='shards', - help='prefix of shards tar file') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('--resample', - type=int, - default=16000, - help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('shards_dir', help='output shards dir') - parser.add_argument('shards_list', help='output shards list file') - args = parser.parse_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - torch.set_num_threads(1) - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - no_segments = True - segments_table = {} - if args.segments is not None: - no_segments = False - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - data = [] - with open(args.text_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if no_segments: - assert key in wav_table - wav = wav_table[key] - data.append((key, txt, wav)) - else: - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - data.append((key, txt, wav, start, end)) - - num = args.num_utts_per_shard - chunks = [data[i:i + num] for i in range(0, len(data), num)] - os.makedirs(args.shards_dir, exist_ok=True) - - # Using thread pool to speedup - pool = multiprocessing.Pool(processes=args.num_threads) - shards_list = [] - tasks_list = [] - num_chunks = len(chunks) - for i, chunk in enumerate(chunks): - tar_file = os.path.join(args.shards_dir, - '{}_{:09d}.tar'.format(args.prefix, i)) - shards_list.append(tar_file) - pool.apply_async( - write_tar_file, - (chunk, no_segments, tar_file, args.resample, i, num_chunks)) - - pool.close() - pool.join() - - with open(args.shards_list, 'w', encoding='utf8') as fout: - for name in shards_list: - fout.write(name + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/merge_scp2txt.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/merge_scp2txt.py deleted file mode 100644 index 51f1c42f272f0fd9fec0a7d69ee860d2f1eb6158..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/merge_scp2txt.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -from distutils.util import strtobool -from io import open -import logging -import sys - -PY2 = sys.version_info[0] == 2 -sys.stdin = codecs.getreader('utf-8')(sys.stdin 
if PY2 else sys.stdin.buffer) -sys.stdout = codecs.getwriter('utf-8')( - sys.stdout if PY2 else sys.stdout.buffer) - - -# Special types: -def shape(x): - """Change str to List[int] - - >>> shape('3,5') - [3, 5] - >>> shape(' [3, 5] ') - [3, 5] - - """ - - # x: ' [3, 5] ' -> '3, 5' - x = x.strip() - if x[0] == '[': - x = x[1:] - if x[-1] == ']': - x = x[:-1] - - return list(map(int, x.split(','))) - - -def get_parser(): - parser = argparse.ArgumentParser( - description='Given each file paths with such format as ' - '::. type> can be omitted and the default ' - 'is "str". e.g. {} ' - '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' - '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' - '--output-scps text:data/text shape:data/utt2text_shape:shape ' - '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--input-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the inputs') - parser.add_argument('--output-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the outputs') - parser.add_argument('--scps', - type=str, - nargs='+', - default=[], - help='The files except for the input and outputs') - parser.add_argument('--verbose', - '-V', - default=1, - type=int, - help='Verbose option') - parser.add_argument('--allow-one-column', - type=strtobool, - default=False, - help='Allow one column in input scp files. ' - 'In this case, the value will be empty string.') - parser.add_argument('--out', - '-O', - type=str, - help='The output filename. ' - 'If omitted, then output to sys.stdout') - return parser - - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - args.scps = [args.scps] - - # logging info - logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - if args.verbose > 0: - logging.basicConfig(level=logging.INFO, format=logfmt) - else: - logging.basicConfig(level=logging.WARN, format=logfmt) - - inputs = {} - assert (len(args.input_scps) == 1) - for f in args.input_scps[0]: - arr = f.strip().split(':') - inputs[arr[0]] = arr[1] - assert ('feat' in inputs) - assert ('shape' in inputs) - - outputs = {} - assert (len(args.output_scps) == 1) - for f in args.output_scps[0]: - arr = f.strip().split(':') - outputs[arr[0]] = arr[1] - assert ('shape' in outputs) - assert ('text' in outputs) - assert ('token' in outputs) - assert ('tokenid' in outputs) - - files = [ - inputs['feat'], inputs['shape'], outputs['text'], outputs['token'], - outputs['tokenid'], outputs['shape'] - ] - fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape'] - fids = [open(f, 'r', encoding='utf-8') for f in files] - - if args.out is None: - out = sys.stdout - else: - out = open(args.out, 'w', encoding='utf-8') - done = False - while not done: - for i, fid in enumerate(fids): - line = fid.readline() - if line == '': - done = True - break - arr = line.strip().split() - content = ' '.join(arr[1:]) - if i == 0: - out.write('utt:{}'.format(arr[0])) - out.write('\t') - out.write('{}:{}'.format(fields[i], content)) - out.write('\n') - - for f in fids: - f.close() - if args.out is not None: - out.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/onnx2horizonbin.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/onnx2horizonbin.py deleted file mode 100644 index 
a94b647fb19d1446d4bc506c399c85677dddde9f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/onnx2horizonbin.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. specific decoding method: ctc_greedy_search -""" - -import argparse -import copy -import logging -import os -import sys -import random -import torch -import yaml -import numpy as np - -from torch.utils.data import DataLoader - -from wenet.utils.common import remove_duplicates_and_blank -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import to_numpy -from wenet.bin.export_onnx_bpu import export_encoder, export_ctc - - -try: - import hbdk # noqa: F401 - import horizon_nn # noqa: F401 - from horizon_tc_ui import HB_ONNXRuntime -except ImportError: - print('Please install hbdk,horizon_nn,horizon_tc_ui !') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def save_data(tensor, dirs, prefix): - if tensor.requires_grad: - data = tensor.detach().numpy().astype(np.float32) - else: - data = tensor.numpy().astype(np.float32) - os.makedirs(dirs, exist_ok=True) - data.tofile(dirs + "/" + prefix + ".bin") - - -def make_calibration_data(enc, args, conf): - conf['shuffle'] = True - logger.info(conf) - dataset = Dataset( - "shard", args.cali_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - cal_data_dir = os.path.join(args.output_dir, 
'cal_data_dir') - for batch_idx, batch in enumerate(dataloader): - if batch_idx >= args.max_samples: - break - if batch_idx % 100 == 0: - logger.info("processed {} samples.".format(batch_idx)) - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - - # Feed forward overlap input step by step - random_high = (num_frames - context) // stride - num_rand = random.randint(0, random_high) - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - if i == num_rand: - save_data(chunk, "{}/chunk".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_cache, "{}/att_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(cnn_cache, "{}/cnn_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_mask, "{}/att_mask".format(cal_data_dir), - prefix + "." + str(i)) - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - # NOTE(xcsong): It's fast to calibrate ctc.onnx, - # so it's okay to save all chunks - save_data(y, "{}/hidden".format(cal_data_dir), - prefix + "." 
+ str(i)) - - -def check_wer(enc, ctc, args, conf): - conf['shuffle'] = False - dataset = Dataset( - "shard", args.wer_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - char_dict = {v: k for k, v in args.symbol_table.items()} - eos = len(char_dict) - 1 - - enc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_encoder/encoder_quantized_model.onnx") - ctc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_ctc/ctc_quantized_model.onnx") - torch_file = open(args.output_dir + "/torch_text", 'w', encoding="utf-8") - onnx_file = open(args.output_dir + "/onnx_text", 'w', encoding="utf-8") - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - for batch_idx, batch in enumerate(dataloader): - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - - # Feed forward overlap input step by step - torch_out, onnx_out = [], [] - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - # Torch model - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - torch_out.append(ctc.forward(y).transpose(1, 3).squeeze(2)) - # Quantized onnx model - ort_inputs = { - 'chunk': to_numpy(chunk), 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': to_numpy(att_mask)} - ort_outs = enc_session.run_feature( - enc_session.output_names, ort_inputs, input_offset=0) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_y = ctc_session.run_feature( - ctc_session.output_names, {'hidden': ort_outs[0]}, input_offset=0) - onnx_out.append(torch.from_numpy( - np.squeeze(onnx_y[0].transpose(0, 3, 2, 1), axis=2))) - - def post_process(list_out, file_obj, keys): - probs = torch.cat(list_out, dim=1) - maxlen = probs.size(1) - topk_prob, topk_index = probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - hyps = 
[hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - for i, key in enumerate(keys): - content = '' - for w in hyps[i]: - if w == eos: - break - content += char_dict[w] - file_obj.write('{} {}\n'.format(key, content)) - return key, content - - if len(torch_out) > 0 and len(onnx_out) > 0: - key, content = post_process(torch_out, torch_file, keys) - logger.info('torch: {} {}'.format(key, content)) - key, content = post_process(onnx_out, onnx_file, keys) - logger.info('onnx : {} {}'.format(key, content)) - torch_file.close() - onnx_file.close() - - -def generate_config(enc_session, ctc_session, args): - template = """ -# 模型参数组 -model_parameters: - # 原始Onnx浮点模型文件 - onnx_model: '{}' - # 转换的目标AI芯片架构 - march: 'bernoulli2' - # 模型转换输出的用于上板执行的模型文件的名称前缀 - output_model_file_prefix: '{}' - # 模型转换输出的结果的存放目录 - working_dir: '{}' - # 指定转换后混合异构模型是否保留输出各层的中间结果的能力 - layer_out_dump: False - # 转换过程中日志生成级别 - log_level: 'debug' -# 输入信息参数组 -input_parameters: - # 原始浮点模型的输入节点名称 - input_name: '{}' - # 原始浮点模型的输入数据格式(数量/顺序与input_name一致) - input_type_train: '{}' - # 原始浮点模型的输入数据排布(数量/顺序与input_name一致) - input_layout_train: '{}' - # 原始浮点模型的输入数据尺寸 - input_shape: '{}' - # 网络实际执行时,输入给网络的batch_size 默认值为1 - # input_batch: 1 - # 在模型中添加的输入数据预处理方法 - norm_type: '{}' - # 预处理方法的图像减去的均值; 如果是通道均值,value之间必须用空格分隔 - # mean_value: '' - # 预处理方法的图像缩放比例,如果是通道缩放比例,value之间必须用空格分隔 - # scale_value: '' - # 转换后混合异构模型需要适配的输入数据格式(数量/顺序与input_name一致) - input_type_rt: '{}' - # 输入数据格式的特殊制式 - input_space_and_range: '' - # 转换后混合异构模型需要适配的输入数据排布(数量/顺序与input_name一致) - input_layout_rt: '{}' -# 校准参数组 -calibration_parameters: - # 模型校准使用的标定样本的存放目录 - cal_data_dir: '{}' - # 开启图片校准样本自动处理(skimage read resize到输入节点尺寸) - preprocess_on: False - # 校准使用的算法类型 - calibration_type: '{}' - # max 校准方式的参数 - max_percentile: 1.0 - # 强制指定OP在CPU上运行 - run_on_cpu: '{}' - # 强制指定OP在BPU上运行 - run_on_bpu: '{}' -# 编译参数组 -compiler_parameters: - # 编译策略选择 - compile_mode: 'latency' - # 是否打开编译的debug信息 - debug: False - # 模型运行核心数 - core_num: 1 - # 模型编译的优化等级选择 - optimize_level: 'O3' -""" - output_dir = os.path.realpath(args.output_dir) - cal_data_dir = os.path.join(output_dir, 'cal_data_dir') - os.makedirs(cal_data_dir, exist_ok=True) - enc_dic = enc_session.get_modelmeta().custom_metadata_map - enc_onnx_path = os.path.join(output_dir, 'encoder.onnx') - enc_log_path = os.path.join(output_dir, 'hb_makertbin_output_encoder') - enc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in enc_dic['input_name'].split(';')]) - ctc_dic = ctc_session.get_modelmeta().custom_metadata_map - ctc_onnx_path = os.path.join(output_dir, 'ctc.onnx') - ctc_log_path = os.path.join(output_dir, 'hb_makertbin_output_ctc') - ctc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in ctc_dic['input_name'].split(';')]) - enc_config = template.format( - enc_onnx_path, "encoder", enc_log_path, - enc_dic['input_name'], enc_dic['input_type'], - enc_dic['input_layout_train'], enc_dic['input_shape'], - enc_dic['norm_type'], enc_dic['input_type'], enc_dic['input_layout_rt'], - enc_cal_data, args.calibration_type, args.extra_ops_run_on_cpu, "") - ctc_config = template.format( - ctc_onnx_path, "ctc", ctc_log_path, - ctc_dic['input_name'], ctc_dic['input_type'], - ctc_dic['input_layout_train'], ctc_dic['input_shape'], - ctc_dic['norm_type'], ctc_dic['input_type'], ctc_dic['input_layout_rt'], - ctc_cal_data, "default", "", "") - with open(output_dir + "/config_encoder.yaml", "w") as enc_yaml: - enc_yaml.write(enc_config) - with open(output_dir + 
"/config_ctc.yaml", "w") as ctc_yaml: - ctc_yaml.write(ctc_config) - - -def get_args(): - parser = argparse.ArgumentParser(description='convert onnx to horizon .bin') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - parser.add_argument('--dict', type=str, required=True, help='dict file') - parser.add_argument('--max_samples', type=int, required=True, - help='maximum samples') - parser.add_argument('--cali_datalist', type=str, default=None, - help='make calibration data') - parser.add_argument('--wer_datalist', type=str, default=None, - help='check wer') - parser.add_argument('--wer_text', type=str, default=None, - help='check wer') - parser.add_argument('--bpe_model', default=None, type=str, - help='bpe model for english part') - parser.add_argument('--ln_run_on_bpu', action='store_true', - help='layernorm running on bpu') - parser.add_argument('--extra_ops_run_on_cpu', type=str, default=None, - help='extra operations running on cpu.') - parser.add_argument('--calibration_type', type=str, default='default', - help='kl / max / default.') - return parser - - -if __name__ == '__main__': - random.seed(777) - parser = get_args() - args = parser.parse_args() - # NOTE(xcsong): X3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(conf) - load_checkpoint(model, args.checkpoint) - model.eval() - - symbol_table = read_symbol_table(args.dict) - args.symbol_table = symbol_table - args.feature_size = conf['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - logger.info("Stage-1: Export onnx") - enc, enc_session = export_encoder(model, args) - ctc, ctc_session = export_ctc(model, args) - - conf = copy.deepcopy(conf['dataset_conf']) - conf['filter_conf']['max_length'] = 102400 - conf['filter_conf']['min_length'] = 0 - conf['filter_conf']['token_max_length'] = 102400 - conf['filter_conf']['token_min_length'] = 0 - conf['filter_conf']['max_output_input_ratio'] = 102400 - conf['filter_conf']['min_output_input_ratio'] = 0 - conf['speed_perturb'] = False - conf['spec_aug'] = False - conf['spec_sub'] = False - conf['spec_trim'] = False - conf['shuffle'] = False - conf['sort'] = False - if 'fbank_conf' in conf: - conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in conf: - conf['mfcc_conf']['dither'] = 0.0 - conf['batch_conf']['batch_type'] = "static" - conf['batch_conf']['batch_size'] = 1 - - if args.cali_datalist is not None: - logger.info("Stage-2: Generate config") - # FIXME(xcsong): Remove hard code - logger.info("torch version: {}".format(torch.__version__)) - if int(torch.__version__[:4].split('.')[1]) >= 13: - args.extra_ops_run_on_cpu = "/Split;" + \ - "/encoders.0/self_attn/Split;/encoders.1/self_attn/Split;" + \ - 
"/encoders.2/self_attn/Split;/encoders.3/self_attn/Split;" + \ - "/encoders.4/self_attn/Split;/encoders.5/self_attn/Split;" + \ - "/encoders.6/self_attn/Split;/encoders.7/self_attn/Split;" + \ - "/encoders.8/self_attn/Split;/encoders.9/self_attn/Split;" + \ - "/encoders.10/self_attn/Split;/encoders.11/self_attn/Split;" + \ - "/encoders.0/self_attn/Mul;/encoders.1/self_attn/Mul;" + \ - "/encoders.2/self_attn/Mul;/encoders.3/self_attn/Mul;" + \ - "/encoders.4/self_attn/Mul;/encoders.5/self_attn/Mul;" + \ - "/encoders.6/self_attn/Mul;/encoders.7/self_attn/Mul;" + \ - "/encoders.8/self_attn/Mul;/encoders.9/self_attn/Mul;" + \ - "/encoders.10/self_attn/Mul;/encoders.11/self_attn/Mul;" - else: - args.extra_ops_run_on_cpu = "Split_17;Split_67;Split_209;" + \ - "Split_351;Split_493;Split_635;Split_777;Split_919;Split_1061;" + \ - "Split_1203;Split_1345;Split_1487;Split_1629;" + \ - "Mul_72;Mul_214;Mul_356;Mul_498;Mul_640;Mul_782;" + \ - "Mul_924;Mul_1066;Mul_1208;Mul_1350;Mul_1492;Mul_1634;" - generate_config(enc_session, ctc_session, args) - - logger.info("Stage-3: Make calibration data") - make_calibration_data(enc, args, conf) - - output_dir = os.path.realpath(args.output_dir) - logger.info("Stage-4: Make ctc.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_ctc".format(output_dir) + - " && cd hb_makertbin_log_ctc &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_ctc.yaml") - ) - logger.info("Stage-5: Make encoder.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_encoder ".format(output_dir) + - " && cd hb_makertbin_log_encoder &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_encoder.yaml") - ) - - if args.wer_datalist is not None: - logger.info("Stage-6: Check wer between torch model and quantized onnx") - assert args.wer_text is not None - check_wer(enc, ctc, args, conf) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/torch_text", - args.output_dir + "/torch_wer") - ) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/onnx_text", - args.output_dir + "/onnx_wer") - ) - os.system("tail {} {}".format( - args.output_dir + "/torch_wer", args.output_dir + "/onnx_wer")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/parse_options.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/parse_options.sh deleted file mode 100644 index 34476fdb37a4b14d5fe6e0edbebe97e760d2be5a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- - -# Parse command-line options. -# To be sourced by another script (as in ". parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/perturb_data_dir_speed.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/perturb_data_dir_speed.sh deleted file mode 100644 index 901a4882e6481ae269067b0fe7175dba62c4db9e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/perturb_data_dir_speed.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# 2020 @kamo-naoyuki -# This file was copied from Kaldi and -# I deleted parts related to wav duration -# because we shouldn't use kaldi's command here -# and we don't need the files actually. 
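The convention implemented by the deleted parse_options.sh above (each `--option-name value` pair overwrites an already-declared variable `option_name`, with boolean values validated) can be summarised with a rough Python equivalent; the defaults and command line in the usage example are illustrative assumptions, not part of the original script.

```python
def parse_options(argv, defaults):
    """Rough analogue of parse_options.sh: --option-name value -> opts['option_name'].

    Only names already present in `defaults` may be set, and options whose
    default is boolean must receive 'true' or 'false', mirroring the shell checks.
    """
    opts, args = dict(defaults), list(argv)
    while args and args[0].startswith("--"):
        if args[0] == "--help":
            raise SystemExit("No help found.")
        name = args[0][2:].replace("-", "_")
        if name not in opts:
            raise SystemExit(f"invalid option {args[0]}")
        if len(args) < 2:
            raise SystemExit(f"option {args[0]} requires a value")
        value = args[1]
        if isinstance(opts[name], bool):
            if value not in ("true", "false"):
                raise SystemExit(f'expected "true" or "false": {args[0]} {value}')
            value = (value == "true")
        opts[name] = value
        args = args[2:]
    return opts, args                                  # remaining positional arguments

# illustrative defaults and command line
opts, rest = parse_options(["--num-threads", "4", "--do-delta", "true", "data/train"],
                           {"num_threads": "1", "do_delta": False})
print(opts, rest)
```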
- -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 2014 Tom Ko -# 2018 Emotech LTD (author: Pawel Swietojanski) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# wav.scp -# spk2utt -# utt2spk -# text -# -# It generates the files which are used for perturbing the speed of the original data. - -export LC_ALL=C -set -euo pipefail - -if [[ $# != 3 ]]; then - echo "Usage: perturb_data_dir_speed.sh " - echo "e.g.:" - echo " $0 0.9 data/train_si284 data/train_si284p" - exit 1 -fi - -factor=$1 -srcdir=$2 -destdir=$3 -label="sp" -spk_prefix="${label}${factor}-" -utt_prefix="${label}${factor}-" - -#check is sox on the path - -! command -v sox &>/dev/null && echo "sox: command not found" && exit 1; - -if [[ ! -f ${srcdir}/utt2spk ]]; then - echo "$0: no such file ${srcdir}/utt2spk" - exit 1; -fi - -if [[ ${destdir} == "${srcdir}" ]]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -mkdir -p "${destdir}" - -<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map" -<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map" -<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map" -if [[ ! -f ${srcdir}/utt2uniq ]]; then - <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq" -else - <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq" -fi - - -<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \ - utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk - -utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt - -if [[ -f ${srcdir}/segments ]]; then - - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \ - utils/apply_map.pl -f 2 "${destdir}"/reco_map | \ - awk -v factor="${factor}" \ - '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \ - >"${destdir}"/segments - - utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - if [[ -f ${srcdir}/reco2file_and_channel ]]; then - utils/apply_map.pl -f 1 "${destdir}"/reco_map \ - <"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel - fi - -else # no segments->wav indexed by utterance. 
- if [[ -f ${srcdir}/wav.scp ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - fi -fi - -if [[ -f ${srcdir}/text ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text -fi -if [[ -f ${srcdir}/spk2gender ]]; then - utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender -fi -if [[ -f ${srcdir}/utt2lang ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang -fi - -rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null -echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}" - -utils/validate_data_dir.sh --no-feats --no-text "${destdir}" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/reduce_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/reduce_data_dir.sh deleted file mode 100644 index 16194dcc7309a646041181a698c53cd4f46e618b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/reduce_data_dir.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# koried, 10/29/2012 - -# Reduce a data set based on a list of turn-ids - -help_message="usage: $0 srcdir turnlist destdir" - -if [ $1 == "--help" ]; then - echo "${help_message}" - exit 0; -fi - -if [ $# != 3 ]; then - echo "${help_message}" - exit 1; -fi - -srcdir=$1 -reclist=$2 -destdir=$3 - -if [ ! -f ${srcdir}/utt2spk ]; then -echo "$0: no such file $srcdir/utt2spk" -exit 1; -fi - -function do_filtering { -# assumes the utt2spk and spk2utt files already exist. - [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp - [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text - [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames - [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender - [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp - if [ -f ${srcdir}/segments ]; then - utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments - awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. 
- [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/reco2file_and_channel ] && \ - utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel - - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm - rm ${destdir}/reco - fi - srcutts=$(wc -l < ${srcdir}/utt2spk) - destutts=$(wc -l < ${destdir}/utt2spk) - echo "Reduced #utt from $srcutts to $destutts" -} - -mkdir -p ${destdir} - -# filter the utt2spk based on the set of recordings -utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk - -utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt -do_filtering; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/remove_longshortdata.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/remove_longshortdata.py deleted file mode 100644 index 7e92f8a424d2d717acf6fc1db5503f79ba38a898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/remove_longshortdata.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='remove too long or too short data in format.data') - parser.add_argument('--data_file', - type=str, - help='input format data') - parser.add_argument('--output_data_file', - type=str, - help='output format data') - parser.add_argument('--min_input_len', type=float, - default=0, - help='minimum input seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--max_input_len', type=float, - default=20, - help='maximum output seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--min_output_len', type=float, - default=0, help='minimum input seq length, in modeling units') - parser.add_argument('--max_output_len', type=float, - default=500, - help='maximum output seq length, in modeling units') - parser.add_argument('--min_output_input_ratio', type=float, default=0.05, - help='minimum output seq length/output seq length ratio') - parser.add_argument('--max_output_input_ratio', type=float, default=10, - help='maximum output seq length/output seq length ratio') - args = parser.parse_args() - - data_file = args.data_file - output_data_file = args.output_data_file - min_input_len = args.min_input_len - max_input_len = args.max_input_len - min_output_len = args.min_output_len - max_output_len = args.max_output_len - min_output_input_ratio = args.min_output_input_ratio - max_output_input_ratio = args.max_output_input_ratio - - with open(data_file, 'r') as f, open(output_data_file, 'w') as fout: - for l in f: - l = l.strip() - if l: - items = l.strip().split('\t') - token_shape = items[6] - feature_shape = items[2] - feat_len = float(feature_shape.split(':')[1].split(',')[0]) - token_len = float(token_shape.split(':')[1].split(',')[0]) - condition = [feat_len > min_input_len, - feat_len < max_input_len, - token_len > min_output_len, - token_len < max_output_len, - token_len / feat_len > min_output_input_ratio, - token_len / feat_len < max_output_input_ratio, - ] - if all(condition): - fout.write('{}\n'.format(l)) - continue diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/segment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/segment.py deleted file mode 100644 index a1a7f93a05fbaf42ca09c26c0e5be6a7185f0d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/segment.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2021 Mobvoi Inc. (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='generate segmented wav.scp') - parser.add_argument('--segments', required=True, help='segments file') - parser.add_argument('--input', - required=True, - help='origin wav.scp that not segmented') - parser.add_argument('--output', - required=True, - help='output segmented wav.scp') - wav_dic = {} - args = parser.parse_args() - ori_wav = args.input - segment_file = args.segments - wav_scp = args.output - with open(ori_wav, 'r') as ori: - for l in ori: - item = l.strip().split() - wav_dic[item[0]] = item[1] - with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement: - for l in sgement: - item = l.strip().split() - if item[1] in wav_dic: - item[1] = wav_dic[item[1]] - f.write("{} {},{},{}\n".format(item[0], item[1], item[2], item[3])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/setup_anaconda.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/setup_anaconda.sh deleted file mode 100644 index f53ace9cc4c19994fc79d01e85d70f49d40d673f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/setup_anaconda.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env bash -# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet) -set -euo pipefail - -if [ -z "${PS1:-}" ]; then - PS1=__dummy__ -fi -CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - -if [ $# -gt 4 ]; then - echo "Usage: $0 [output] [conda-env-name] [python-version>]" - exit 1; -elif [ $# -eq 3 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="$3" -elif [ $# -eq 2 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="" -elif [ $# -eq 1 ]; then - output_dir="$1" - name="" - PYTHON_VERSION="" -elif [ $# -eq 0 ]; then - output_dir=venv - name="" - PYTHON_VERSION="" -fi - -if [ -e activate_python.sh ]; then - echo "Warning: activate_python.sh already exists. It will be overwritten" -fi - -if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then - if [ ! -e miniconda.sh ]; then - wget --tries=3 "${CONDA_URL}" -O miniconda.sh - fi - - bash miniconda.sh -b -p "${output_dir}" -fi - -# shellcheck disable=SC1090 -source "${output_dir}/etc/profile.d/conda.sh" -conda deactivate - -# If the env already exists, skip recreation -if [ -n "${name}" ] && ! 
conda activate ${name}; then - conda create -yn "${name}" -fi -conda activate ${name} - -if [ -n "${PYTHON_VERSION}" ]; then - conda install -y conda "python=${PYTHON_VERSION}" -else - conda install -y conda -fi - -conda install -y pip setuptools - -cat << EOF > activate_python.sh -#!/usr/bin/env bash -# THIS FILE IS GENERATED BY tools/setup_anaconda.sh -if [ -z "\${PS1:-}" ]; then - PS1=__dummy__ -fi -. $(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name} -EOF diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/sph2wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/sph2wav.sh deleted file mode 100644 index a8f0749e3be2ee69b5831da6699c303510ecbed4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/sph2wav.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -# convert sph scp to segmented wav scp -nj=1 -. tools/parse_options.sh || exit 1; - -inscp=$1 -segments=$2 -outscp=$3 -data=$(dirname ${inscp}) -if [ $# -eq 4 ]; then - logdir=$4 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe -[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; -sox=`which sox` -[ ! 
-x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2); - printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \ - sort > $data/wav_ori.scp || exit 1; - -tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp -sed -i 's/ /,/g' $data/wav_segments.scp -sed -i 's/#/ /g' $data/wav_segments.scp - -rm -f $logdir/wav_*.slice -rm -f $logdir/*.log -split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - mkdir -p ${data}/wavs/${name} - cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \ - -v logdir=$logdir -v name=$name '{ - during=$4-$3 - cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during; - system(cmd) - printf("%s %s/%s.wav\n", $1, data, $1); - }' | \ - sort > ${data}/wavs_${name}.scp || exit 1; -} & -done -wait -cat ${data}/wavs_*.scp > $outscp -rm ${data}/wavs_*.scp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/spk2utt_to_utt2spk.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/spk2utt_to_utt2spk.pl deleted file mode 100644 index 19fb89d501146e360912863d847d6eabb0194511..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/spm_decode b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/spm_decode deleted file mode 100644 index 882b4f966013d7708460f8d41696583ae59f8fa9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/spm_decode +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for decoding") - parser.add_argument("--input", default=None, help="input file to decode") - parser.add_argument("--input_format", choices=["piece", "id"], default="piece") - args = parser.parse_args() - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.input_format == "piece": - def decode(l): - return "".join(sp.DecodePieces(l)) - elif args.input_format == "id": - def decode(l): - return "".join(sp.DecodeIds(l)) - else: - raise NotImplementedError - - def tok2int(tok): - # remap reference-side (represented as <>) to 0 - return int(tok) if tok != "<>" else 0 - - if args.input is None: - h = sys.stdin - else: - h = open(args.input, "r", encoding="utf-8") - for line in h: - print(decode(line.split())) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/spm_encode b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/spm_encode deleted file mode 100644 index 4dd2e1004f9fe393c2d34b43bade881b84a31b1f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/spm_encode +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import contextlib -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for encoding") - parser.add_argument("--inputs", nargs="+", default=['-'], - help="input files to filter/encode") - parser.add_argument("--outputs", nargs="+", default=['-'], - help="path to save encoded outputs") - parser.add_argument("--output_format", choices=["piece", "id"], default="piece") - parser.add_argument("--min-len", type=int, metavar="N", - help="filter sentence pairs with fewer than N tokens") - parser.add_argument("--max-len", type=int, metavar="N", - help="filter sentence pairs with more than N tokens") - args = parser.parse_args() - - assert len(args.inputs) == len(args.outputs), \ - "number of input and output paths should match" - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.output_format == "piece": - def encode(l): - return sp.EncodeAsPieces(l) - elif args.output_format == "id": - def encode(l): - return list(map(str, sp.EncodeAsIds(l))) - else: - raise NotImplementedError - - if args.min_len is not None or args.max_len is not None: - def valid(line): - return ( - (args.min_len is None or len(line) >= args.min_len) and - (args.max_len is None or len(line) <= args.max_len) - ) - else: - def valid(lines): - return True - - with contextlib.ExitStack() as stack: - inputs = [ - stack.enter_context(open(input, "r", encoding="utf-8")) - if input != "-" else sys.stdin - for input in args.inputs - ] - outputs = [ - 
stack.enter_context(open(output, "w", encoding="utf-8")) - if output != "-" else sys.stdout - for output in args.outputs - ] - - stats = { - "num_empty": 0, - "num_filtered": 0, - } - - def encode_line(line): - line = line.strip() - if len(line) > 0: - line = encode(line) - if valid(line): - return line - else: - stats["num_filtered"] += 1 - else: - stats["num_empty"] += 1 - return None - - for i, lines in enumerate(zip(*inputs), start=1): - enc_lines = list(map(encode_line, lines)) - if not any(enc_line is None for enc_line in enc_lines): - for enc_line, output_h in zip(enc_lines, outputs): - print(" ".join(enc_line), file=output_h) - if i % 10000 == 0: - print("processed {} lines".format(i), file=sys.stderr) - - print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) - print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/spm_train b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/spm_train deleted file mode 100644 index 0b247aee0dc5fcaa7b6cf66d89602e896619c9bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/spm_train +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE -import sys - -import sentencepiece as spm - - -if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/subset_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/subset_data_dir.sh deleted file mode 100644 index c35bee62d8710facb8c42a9171ed3caf0171450f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/subset_data_dir.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2010-2011 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - - -# This script operates on a data directory, such as in data/train/. -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data -# for what these directories contain. - -# This script creates a subset of that data, consisting of some specified -# number of utterances. (The selected utterances are distributed evenly -# throughout the file, by the program ./subset_scp.pl). - -# There are six options, none compatible with any other. - -# If you give the --per-spk option, it will attempt to select the supplied -# number of utterances for each speaker (typically you would supply a much -# smaller number in this case). - -# If you give the --speakers option, it selects a subset of n randomly -# selected speakers. - -# If you give the --shortest option, it will give you the n shortest utterances. - -# If you give the --first option, it will just give you the n first utterances. - -# If you give the --last option, it will just give you the n last utterances. - -# If you give the --spk-list or --utt-list option, it reads the -# speakers/utterances to keep from /" (note, -# in this case there is no positional parameter; see usage message.) 
- - -shortest=false -perspk=false -speakers=false -first_opt= -spk_list= -utt_list= - -expect_args=3 -case $1 in - --first|--last) first_opt=$1; shift ;; - --per-spk) perspk=true; shift ;; - --shortest) shortest=true; shift ;; - --speakers) speakers=true; shift ;; - --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; - --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; - --*) echo "$0: invalid option '$1'"; exit 1 -esac - -if [ $# != $expect_args ]; then - echo "Usage:" - echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] <num-utt> <srcdir> <destdir>" - echo " subset_data_dir.sh [--spk-list <speaker-list-file>] <srcdir> <destdir>" - echo " subset_data_dir.sh [--utt-list <utt-list-file>] <srcdir> <destdir>" - echo "By default, randomly selects <num-utt> utterances from the data directory." - echo "With --speakers, randomly selects enough speakers that we have <num-utt> utterances" - echo "With --per-spk, selects <num-utt> utterances per speaker, if available." - echo "With --first, selects the first <num-utt> utterances" - echo "With --last, selects the last <num-utt> utterances" - echo "With --shortest, selects the shortest <num-utt> utterances." - echo "With --spk-list, reads the speakers to keep from <speaker-list-file>" - echo "With --utt-list, reads the utterances to keep from <utt-list-file>" - exit 1; -fi - -srcdir=$1 -if [[ $spk_list || $utt_list ]]; then - numutt= - destdir=$2 -else - numutt=$2 - destdir=$3 -fi - -export LC_ALL=C - -if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" - exit 1 -fi - -if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then - echo "$0: cannot subset to more utterances than you originally had." - exit 1 -fi - -if $shortest && [ ! -f $srcdir/feats.scp ]; then - echo "$0: you selected --shortest but no feats.scp exist." - exit 1 -fi - -mkdir -p $destdir || exit 1 - -if [[ $spk_list ]]; then - tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; -elif [[ $utt_list ]]; then - tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; -elif $speakers; then - tools/shuffle_list.pl < $srcdir/spk2utt | - awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | - sort > $destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -elif $perspk; then - awk '{ n='$numutt'; printf("%s ",$1); - skip=1; while(n*(skip+1) <= NF-1) { skip++; } - for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); } - printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -else - if $shortest; then - # Select $numutt shortest utterances. - . ./path.sh - feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; - sort -n -k2 $destdir/tmp.len | - awk '{print $1}' | - head -$numutt >$destdir/tmp.uttlist - tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk - rm $destdir/tmp.uttlist $destdir/tmp.len - else - # Select $numutt random utterances. - tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; - fi - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt -fi - -# Perform filtering. utt2spk and spk2utt files already exist by this point. -# Filter by utterance. 
-[ -f $srcdir/feats.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp -[ -f $srcdir/vad.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp -[ -f $srcdir/utt2lang ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang -[ -f $srcdir/utt2dur ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur -[ -f $srcdir/utt2num_frames ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames -[ -f $srcdir/utt2uniq ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq -[ -f $srcdir/wav.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp -[ -f $srcdir/utt2warp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp -[ -f $srcdir/text ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - -# Filter by speaker. -[ -f $srcdir/spk2warp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp -[ -f $srcdir/spk2gender ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender -[ -f $srcdir/cmvn.scp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - -# Filter by recording-id. -if [ -f $srcdir/segments ]; then - tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - # Recording-ids are in segments. - awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco - # The next line overrides the command above for wav.scp, which would be incorrect. - #[ -f $srcdir/wav.scp ] && - # tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp -else - # No segments; recording-ids are in wav.scp. - awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco -fi - -[ -f $srcdir/reco2file_and_channel ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel -[ -f $srcdir/reco2dur ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur - -# Filter the STM file for proper sclite scoring. -# Copy over the comments from STM file. -[ -f $srcdir/stm ] && - (grep "^;;" $srcdir/stm - tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm - -rm $destdir/reco - -# Copy frame_shift if present. -[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir - -srcutts=$(wc -l <$srcdir/utt2spk) -destutts=$(wc -l <$destdir/utt2spk) -echo "$0: reducing #utt from $srcutts to $destutts" -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/subset_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/subset_scp.pl deleted file mode 100644 index 11fddc09a0f4e5fad8e5d63cf65e7e5e627e4af6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/subset_scp.pl +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This program selects a subset of N elements in the scp. - -# By default, it selects them evenly from throughout the scp, in order to avoid -# selecting too many from the same speaker. It prints them on the standard -# output. -# With the option --first, it just selects the N first utterances. -# With the option --last, it just selects the N last utterances. - -# Last modified by JHU & HKUST @2013 - - -$quiet = 0; -$first = 0; -$last = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--quiet") { - shift; - $quiet = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--first") { - shift; - $first = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--last") { - shift; - $last = 1; -} - -if(@ARGV < 2 ) { - die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . - " --quiet causes it to not die if N < num lines in scp.\n" . - " --first and --last make it equivalent to head or tail.\n" . - "See also: filter_scp.pl\n"; -} - -$N = shift @ARGV; -if($N == 0) { - die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; -} -$inscp = shift @ARGV; -open(I, "<$inscp") || die "Opening input scp file $inscp"; - -@F = (); -while() { - push @F, $_; -} -$numlines = @F; -if($N > $numlines) { - if ($quiet) { - $N = $numlines; - } else { - die "You requested from subset_scp.pl more elements than available: $N > $numlines"; - } -} - -sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if ($num_needed > $diff) { - die "select_n: code error"; - } - if ($diff == 1 ) { - if ($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); - } -} - -if ( ! $first && ! $last) { - if ($N > 0) { - select_n(0, $numlines, $N); - } -} else { - if ($first) { # --first option: same as head. - for ($n = 0; $n < $N; $n++) { - print $F[$n]; - } - } else { # --last option: same as tail. - for ($n = @F - $N; $n < @F; $n++) { - print $F[$n]; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/sym2int.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/sym2int.pl deleted file mode 100644 index cec097b6bdaefb5c3452e31fa334f0a7530b9a72..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/sym2int.pl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; - -for($x = 0; $x < 2; $x++) { - if ($ARGV[0] eq "--map-oov") { - shift @ARGV; - $map_oov = shift @ARGV; - if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { - # disallow '-f', the empty string and anything ending in words.txt as the - # OOV symbol because these are likely command-line errors. - die "the --map-oov option requires an argument"; - } - } - if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } - } -} - -$symtab = shift @ARGV; -if (!defined $symtab) { - print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . - "options: [--map-oov ] [-f ]\n" . - "note: can look like 4-5, or 4-, or 5-, or 1.\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up - if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } - $map_oov = $sym2int{$map_oov}; -} - -$num_warning = 0; -$max_warning = 20; - -while (<>) { - @A = split(" ", $_); - @B = (); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ( (!defined $field_begin || $n >= $field_begin) - && (!defined $field_end || $n <= $field_end)) { - $i = $sym2int{$a}; - if (!defined ($i)) { - if (defined $map_oov) { - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $map_oov; - } else { - $pos = $n+1; - die "sym2int.pl: undefined symbol $a (in position $pos)\n"; - } - } - $a = $i; - } - push @B, $a; - } - print join(" ", @B); - print "\n"; -} -if ($num_warning > 0) { - print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; -} - -exit(0); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/text2token.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/text2token.py deleted file mode 100644 index 4f4dcc901d436650695f0b80e0cf99e1e99269ee..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/text2token.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan) -# Copyright 2021 Mobvoi Inc. All Rights Reserved. 
(Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -import re -import sys - -is_python2 = sys.version_info[0] == 2 - - -def exist_or_not(i, match_pos): - start_pos = None - end_pos = None - for pos in match_pos: - if pos[0] <= i < pos[1]: - start_pos = pos[0] - end_pos = pos[1] - break - - return start_pos, end_pos - -def seg_char(sent): - pattern = re.compile(r'([\u4e00-\u9fa5])') - chars = pattern.split(sent) - chars = [w for w in chars if len(w.strip()) > 0] - return chars - -def get_parser(): - parser = argparse.ArgumentParser( - description='convert raw text to tokenized text', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--nchar', - '-n', - default=1, - type=int, - help='number of characters to split, i.e., \ - aabb -> a a b b with -n 1 and aa bb with -n 2') - parser.add_argument('--skip-ncols', - '-s', - default=0, - type=int, - help='skip first n columns') - parser.add_argument('--space', - default='', - type=str, - help='space symbol') - parser.add_argument('--bpe-model', - '-m', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--non-lang-syms', - '-l', - default=None, - type=str, - help='list of non-linguistic symobles,' - ' e.g., etc.') - parser.add_argument('text', - type=str, - default=False, - nargs='?', - help='input text') - parser.add_argument('--trans_type', - '-t', - type=str, - default="char", - choices=["char", "phn", "cn_char_en_bpe"], - help="""Transcript type. char/phn. e.g., for TIMIT - FADG0_SI1279 - - If trans_type is char, read from - SI1279.WRD file -> "bricks are an alternative" - Else if trans_type is phn, - read from SI1279.PHN file -> - "sil b r ih sil k s aa r er n aa l - sil t er n ih sil t ih v sil" """) - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - rs = [] - if args.non_lang_syms is not None: - with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f: - nls = [x.rstrip() for x in f.readlines()] - rs = [re.compile(re.escape(x)) for x in nls] - - if args.bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(args.bpe_model) - - if args.text: - f = codecs.open(args.text, encoding="utf-8") - else: - f = codecs.getreader("utf-8")( - sys.stdin if is_python2 else sys.stdin.buffer) - - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer) - line = f.readline() - n = args.nchar - while line: - x = line.split() - print(' '.join(x[:args.skip_ncols]), end=" ") - a = ' '.join(x[args.skip_ncols:]) - - # get all matched positions - match_pos = [] - for r in rs: - i = 0 - while i >= 0: - m = r.search(a, i) - if m: - match_pos.append([m.start(), m.end()]) - i = m.end() - else: - break - - if len(match_pos) > 0: - chars = [] - i = 0 - while i < len(a): - start_pos, end_pos = exist_or_not(i, match_pos) - if start_pos is not None: - chars.append(a[start_pos:end_pos]) - i = end_pos - else: - chars.append(a[i]) - i += 1 - a = chars - - if (args.trans_type == "phn"): - a = a.split(" ") - elif args.trans_type == "cn_char_en_bpe": - b = seg_char(a) - a = [] - for j in b: - # we use "▁" to instead of blanks among english words - # warning: here is "▁", not "_" - for l in j.strip().split("▁"): - if not l.encode('UTF-8').isalpha(): - a.append(l) - else: - for k in sp.encode_as_pieces(l): - a.append(k) - else: - a = [a[j:j + n] for j in range(0, 
len(a), n)] - - a_flat = [] - for z in a: - a_flat.append("".join(z)) - - a_chars = [z.replace(' ', args.space) for z in a_flat] - if (args.trans_type == "phn"): - a_chars = [z.replace("sil", args.space) for z in a_chars] - print(' '.join(a_chars)) - line = f.readline() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/utt2spk_to_spk2utt.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/utt2spk_to_spk2utt.pl deleted file mode 100644 index 5086699ff85fdcb8667bb9ab054700c53e35fd0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. - -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/validate_data_dir.sh deleted file mode 100644 index f4b4cbe1410111555d56380078e3d55381e7155a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/validate_data_dir.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/bin/bash - -cmd="$@" - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." - echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! 
-d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -if [ -f $data/images.scp ]; then - cmd=${cmd/--no-wav/} # remove --no-wav if supplied - image/validate_data_dir.sh $cmd - exit $? -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1; - ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - tools/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." 
- exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! 
cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! 
cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/validate_dict_dir.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/validate_dict_dir.pl deleted file mode 100644 index 819fca7f03caff91f3f24f0b69876a0bfc0abbe9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/validate_dict_dir.pl +++ /dev/null @@ -1,531 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. -# Copyright 2012 Guoguo Chen -# 2015 Daniel Povey -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for 'dict' directories (e.g. data/local/dict) - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line($.) 
$raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n"; - if ($has_invalid_whitespaces) { - print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n"; - return 0; - } else { - print "--> text contains only allowed whitespaces\n"; - } - } else { - print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n"; - } - return 1; -} - - -if(@ARGV != 1) { - die "Usage: validate_dict_dir.pl \n" . - "e.g.: validate_dict_dir.pl data/local/dict\n"; -} - -$dict = shift @ARGV; -$dict =~ s:/$::; - -$exit = 0; -$success = 1; # this is re-set each time we read a file. - -sub set_to_fail { $exit = 1; $success = 0; } - -# Checking silence_phones.txt ------------------------------- -print "Checking $dict/silence_phones.txt ...\n"; -if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} -$idx = 1; -%silence = (); -$crlf = 1; - -print "--> reading $dict/silence_phones.txt\n"; -check_allowed_whitespace(\*S) || set_to_fail(); -while() { - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; - } - foreach(0 .. 
@col-1) { - my $p = $col[$_]; - if($silence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; - } else { - $silence{$p} = 1; - } - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. - if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(S); -$success == 0 || print "--> $dict/silence_phones.txt is OK\n"; -print "\n"; - -# Checking optional_silence.txt ------------------------------- -print "Checking $dict/optional_silence.txt ...\n"; -if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;} -if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;} -$idx = 1; -$success = 1; -$crlf = 1; -print "--> reading $dict/optional_silence.txt\n"; -check_allowed_whitespace(\*OS) or exit 1; -while() { - chomp; - my @col = split(" ", $_); - if ($idx > 1 or @col > 1) { - set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; - } elsif (!$silence{$col[0]}) { - set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - $idx ++; -} -close(OS); -$success == 0 || print "--> $dict/optional_silence.txt is OK\n"; -print "\n"; - -# Checking nonsilence_phones.txt ------------------------------- -print "Checking $dict/nonsilence_phones.txt ...\n"; -if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;} -$idx = 1; -%nonsilence = (); -$success = 1; -$crlf = 1; -print "--> reading $dict/nonsilence_phones.txt\n"; -check_allowed_whitespace(\*NS) or set_to_fail(); -while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($nonsilence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; - } else { - $nonsilence{$p} = 1; - } - # phones that start with the pound sign/hash may be mistaken for - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. 
- if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(NS); -$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n"; -print "\n"; - -# Checking disjoint ------------------------------- -sub intersect { - my ($a, $b) = @_; - @itset = (); - %itset = (); - foreach(keys %$a) { - if(exists $b->{$_} and !$itset{$_}) { - push(@itset, $_); - $itset{$_} = 1; - } - } - return @itset; -} - -print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n"; -@itset = intersect(\%silence, \%nonsilence); -if(@itset == 0) {print "--> disjoint property is OK.\n";} -else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";} -print "\n"; - - -sub check_lexicon { - my ($lex, $num_prob_cols, $num_skipped_cols) = @_; - print "Checking $lex\n"; - !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail(); - my %seen_line = {}; - $idx = 1; $success = 1; $crlf = 1; - print "--> reading $lex\n"; - check_allowed_whitespace(\*L) or set_to_fail(); - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $lex contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (defined $seen_line{$_}) { - print "--> ERROR: line '$_' of $lex is repeated\n"; - set_to_fail(); - } - $seen_line{$_} = 1; - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $lex does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - $word = shift @col; - if (!defined $word) { - print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail(); - } - if ($word eq "" || $word eq "" || $word eq "" || $word eq "#0") { - print "--> ERROR: lexicon.txt contains forbidden word $word\n"; - set_to_fail(); - } - for ($n = 0; $n < $num_prob_cols; $n++) { - $prob = shift @col; - if (!($prob > 0.0 && $prob <= 1.0)) { - print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n"; - set_to_fail(); - } - } - for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; } - if (@col == 0) { - print "--> ERROR: lexicon.txt contains word $word with empty "; - print "pronunciation.\n"; - set_to_fail(); - } - foreach (0 .. @col-1) { - if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt "; - print "(line $idx)\n"; - set_to_fail(); - } - } - $idx ++; - } - close(L); - $success == 0 || print "--> $lex is OK\n"; - print "\n"; -} - -if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); } -if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); } -if (-f "$dict/lexiconp_silprob.txt") { - # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also - # exist. 
- check_lexicon("$dict/lexiconp_silprob.txt", 2, 2); - if (-f "$dict/silprob.txt") { - !open(SP, "<$dict/silprob.txt") && - print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail(); - $crlf = 1; - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - chomp; my @col = split; - @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail(); - if ($col[0] eq "" || $col[0] eq "overall") { - if (!($col[1] > 0.0 && $col[1] <= 1.0)) { - set_to_fail(); - print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n"; - } - } elsif ($col[0] eq "_s" || $col[0] eq "_n") { - if ($col[1] <= 0.0) { - set_to_fail(); - print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n"; - } - } else { - print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n"; - set_to_fail(); - } - } - close(SP); - } else { - set_to_fail(); - print "--> ERROR: expecting $dict/silprob.txt to exist\n"; - } -} - -if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) { - print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n"; - set_to_fail(); -} - -sub check_lexicon_pair { - my ($lex1, $num_prob_cols1, $num_skipped_cols1, - $lex2, $num_prob_cols2, $num_skipped_cols2) = @_; - # We have checked individual lexicons already. - open(L1, "<$lex1"); open(L2, "<$lex2"); - print "Checking lexicon pair $lex1 and $lex2\n"; - my $line_num = 0; - while() { - $line_num++; - @A = split; - $line_B = ; - if (!defined $line_B) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); last; - } - @B = split(" ", $line_B); - # Check if the word matches. - if ($A[0] ne $B[0]) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - shift @A; shift @B; - for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; } - for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; } - # Check if the pronunciation matches - if (join(" ", @A) ne join(" ", @B)) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - } - $line_B = ; - if (defined $line_B && $exit == 0) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); - } - $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n"; -} - -# If more than one lexicon exist, we have to check if they correspond to each -# other. It could be that the user overwrote one and we need to regenerate the -# other, but we do not know which is which. -if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") { - check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0); -} -if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") { - check_lexicon_pair("$dict/lexiconp.txt", 1, 0, - "$dict/lexiconp_silprob.txt", 2, 2); -} - -# Checking extra_questions.txt ------------------------------- -%distinguished = (); # Keep track of all phone-pairs including nonsilence that - # are distinguished (split apart) by extra_questions.txt, - # as $distinguished{$p1,$p2} = 1. This will be used to - # make sure that we don't have pairs of phones on the same - # line in nonsilence_phones.txt that can never be - # distinguished from each other by questions. 
(If any two - # phones appear on the same line in nonsilence_phones.txt, - # they share a tree root, and since the automatic - # question-building treats all phones that appear on the - # same line of nonsilence_phones.txt as being in the same - # group, we can never distinguish them without resorting to - # questions in extra_questions.txt. -print "Checking $dict/extra_questions.txt ...\n"; -if (-s "$dict/extra_questions.txt") { - if (!open(EX, "<$dict/extra_questions.txt")) { - set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n"; - } - $idx = 1; - $success = 1; - $crlf = 1; - print "--> reading $dict/extra_questions.txt\n"; - check_allowed_whitespace(\*EX) or set_to_fail(); - while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n"; - } - foreach (0 .. @col-1) { - if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n"; - } - $idx ++; - } - %col_hash = (); - foreach $p (@col) { $col_hash{$p} = 1; } - foreach $p1 (@col) { - # Update %distinguished hash. - foreach $p2 (keys %nonsilence) { - if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not - # in this question (and in nonsilence - # phones)... mark p1,p2 as being split apart - $distinguished{$p1,$p2} = 1; - $distinguished{$p2,$p1} = 1; - } - } - } - } - close(EX); - $success == 0 || print "--> $dict/extra_questions.txt is OK\n"; -} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";} - -if (-f "$dict/nonterminals.txt") { - open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt"; - my %nonterminals = (); - my $line_number = 1; - while () { - chop; - my @line = split(" ", $_); - if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) { - print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1; - } - $nonterminals{$line[0]} = 1; - $line_number++; - } - print "--> $dict/nonterminals.txt is OK\n"; -} - - -# check nonsilence_phones.txt again for phone-pairs that are never -# distnguishable. (note: this situation is normal and expected for silence -# phones, so we don't check it.) -if(!open(NS, "<$dict/nonsilence_phones.txt")) { - print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1; -} - -$num_warn_nosplit = 0; -$num_warn_nosplit_limit = 10; -while() { - my @col = split(" ", $_); - foreach $p1 (@col) { - foreach $p2 (@col) { - if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) { - set_to_fail(); - if ($num_warn_nosplit <= $num_warn_nosplit_limit) { - print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n"; - } - if ($num_warn_nosplit == $num_warn_nosplit_limit) { - print "... Not warning any more times about this issue.\n"; - } - if ($num_warn_nosplit == 0) { - print " (note: we started checking for this only recently. 
You can still build a system but\n"; - print " phones $p1 and $p2 will be acoustically indistinguishable).\n"; - } - $num_warn_nosplit++; - } - } - } -} - - -if ($exit == 1) { - print "--> ERROR validating dictionary directory $dict (see detailed error "; - print "messages above)\n\n"; - exit 1; -} else { - print "--> SUCCESS [validating dictionary directory $dict]\n\n"; -} - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/validate_text.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/validate_text.pl deleted file mode 100644 index 7f75cf12f20f6e22948682e8e726e628a72dac69..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/validate_text.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env perl -# -#=============================================================================== -# Copyright 2017 Johns Hopkins University (author: Yenda Trmal ) -# Johns Hopkins University (author: Daniel Povey) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# validation script for data//text -# to be called (preferably) from utils/validate_data_dir.sh -use strict; -use warnings; -use utf8; -use Fcntl qw< SEEK_SET >; - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). 
-sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl \n" . 
- "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/wav2dur.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/wav2dur.py deleted file mode 100644 index 1bcc1b693458b66c0e341e5d6b375cc81e6db8b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/wav2dur.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -import torchaudio -torchaudio.set_audio_backend("sox_io") - -scp = sys.argv[1] -dur_scp = sys.argv[2] - -with open(scp, 'r') as f, open(dur_scp, 'w') as fout: - cnt = 0 - total_duration = 0 - for l in f: - items = l.strip().split() - wav_id = items[0] - fname = items[1] - cnt += 1 - waveform, rate = torchaudio.load(fname) - frames = len(waveform[0]) - duration = frames / float(rate) - total_duration += duration - fout.write('{} {}\n'.format(wav_id, duration)) - print('process {} utts'.format(cnt)) - print('total {} s'.format(total_duration)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/wav_to_duration.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/wav_to_duration.sh deleted file mode 100644 index 51b055c633ac809b6b8d702925dc47875973403d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/wav_to_duration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# split the wav scp, calculate duration and merge -nj=4 -. tools/parse_options.sh || exit 1; - -inscp=$1 -outscp=$2 -data=$(dirname ${inscp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -rm -f $logdir/wav_*.slice -rm -f $logdir/wav_*.shape -split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log -} & -done -wait -cat $logdir/wav_*.shape > $outscp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/websocket/performance-ws.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/websocket/performance-ws.py deleted file mode 100644 index af77dea06bb41297b674b5b6dbfd0266bcff5d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/tools/websocket/performance-ws.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -# coding:utf-8 - -# Copyright (c) 2022 SDCI Co. Ltd (author: veelion) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import json -import time -import asyncio -import argparse -import websockets -import soundfile as sf -import statistics - - -WS_START = json.dumps({ - 'signal': 'start', - 'nbest': 1, - 'continuous_decoding': False, -}) -WS_END = json.dumps({ - 'signal': 'end' -}) - - -async def ws_rec(data, ws_uri): - begin = time.time() - conn = await websockets.connect(ws_uri, ping_timeout=200) - # step 1: send start - await conn.send(WS_START) - ret = await conn.recv() - # step 2: send audio data - await conn.send(data) - # step 3: send end - await conn.send(WS_END) - # step 4: receive result - texts = [] - while 1: - ret = await conn.recv() - ret = json.loads(ret) - if ret['type'] == 'final_result': - nbest = json.loads(ret['nbest']) - text = nbest[0]['sentence'] - texts.append(text) - elif ret['type'] == 'speech_end': - break - # step 5: close - try: - await conn.close() - except Exception as e: - # this except has no effect, just log as debug - # it seems the server does not send close info, maybe - print(e) - time_cost = time.time() - begin - return { - 'text': ''.join(texts), - 'time': time_cost, - } - - -def get_args(): - parser = argparse.ArgumentParser(description='') - parser.add_argument( - '-u', '--ws_uri', required=True, - help="websocket_server_main's uri, e.g. ws://127.0.0.1:10086") - parser.add_argument( - '-w', '--wav_scp', required=True, - help='path to wav_scp_file') - parser.add_argument( - '-t', '--trans', required=True, - help='path to trans_text_file of wavs') - parser.add_argument( - '-s', '--save_to', required=True, - help='path to save transcription') - parser.add_argument( - '-n', '--num_concurrence', type=int, required=True, - help='num of concurrence for query') - args = parser.parse_args() - return args - - -def print_result(info): - length = max([len(k) for k in info]) - for k, v in info.items(): - print(f'\t{k: >{length}} : {v}') - - -async def main(args): - wav_scp = [] - total_duration = 0 - with open(args.wav_scp) as f: - for line in f: - zz = line.strip().split() - assert len(zz) == 2 - data, sr = sf.read(zz[1], dtype='int16') - assert sr == 16000 - duration = (len(data)) / 16000 - total_duration += duration - wav_scp.append((zz[0], data.tobytes())) - print(f'{len(wav_scp) = }, {total_duration = }') - - tasks = [] - failed = 0 - texts = [] - request_times = [] - begin = time.time() - for i, (_uttid, data) in enumerate(wav_scp): - task = asyncio.create_task(ws_rec(data, args.ws_uri)) - tasks.append((_uttid, task)) - if len(tasks) < args.num_concurrence: - continue - print((f'{i=}, start {args.num_concurrence} ' - f'queries @ {time.strftime("%m-%d %H:%M:%S")}')) - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - tasks = [] - print(f'\tdone @ {time.strftime("%m-%d %H:%M:%S")}') - if tasks: - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - request_time = time.time() - begin - rtf = request_time / total_duration - print('For all concurrence:') - print_result({ - 'failed': failed, - 'total_duration': total_duration, - 'request_time': request_time, - 'RTF': rtf, - }) - print('For one request:') - print_result({ - 'mean': statistics.mean(request_times), - 'median': statistics.median(request_times), - 'max_time': max(request_times), - 'min_time': min(request_times), - }) - with 
open(args.save_to, 'w', encoding='utf8') as fsave: - fsave.write(''.join(texts)) - # caculate CER - cmd = (f'python ../compute-wer.py --char=1 --v=1 ' - f'{args.trans} {args.save_to} > ' - f'{args.save_to}-test-{args.num_concurrence}.cer.txt') - print(cmd) - os.system(cmd) - print('done') - - -if __name__ == '__main__': - args = get_args() - asyncio.run(main(args)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/alignment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/alignment.py deleted file mode 100644 index 071691183e5af227e60fe06e4f8d4bf0f33b7f71..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/alignment.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Di Wu) -# 2022 Tinnove Inc (authors: Wei Ren) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader -from textgrid import TextGrid, IntervalTier - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.ctc_util import forced_align -from wenet.utils.common import get_subsample -from wenet.utils.init_model import init_model - - -def generator_textgrid(maxtime, lines, output): - # Download Praat: https://www.fon.hum.uva.nl/praat/ - interval = maxtime / (len(lines) + 1) - margin = 0.0001 - - tg = TextGrid(maxTime=maxtime) - linetier = IntervalTier(name="line", maxTime=maxtime) - - i = 0 - for l in lines: - s, e, w = l.split() - linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w) - - tg.append(linetier) - print("successfully generator {}".format(output)) - tg.write(output) - - -def get_frames_timestamp(alignment): - # convert alignment to a praat format, which is a doing phonetics - # by computer and helps analyzing alignment - timestamp = [] - # get frames level duration for each token - start = 0 - end = 0 - while end < len(alignment): - while end < len(alignment) and alignment[end] == 0: - end += 1 - if end == len(alignment): - timestamp[-1] += alignment[start:] - break - end += 1 - while end < len(alignment) and alignment[end - 1] == alignment[end]: - end += 1 - timestamp.append(alignment[start:end]) - start = end - return timestamp - - -def get_labformat(timestamp, subsample): - begin = 0 - duration = 0 - labformat = [] - for idx, t in enumerate(timestamp): - # 25ms frame_length,10ms hop_length, 1/subsample - subsample = get_subsample(configs) - # time duration - duration = len(t) * 0.01 * subsample - if idx < len(timestamp) - 1: - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[t[-1]])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[t[-1]])) - else: - 
non_blank = 0 - for i in t: - if i != 0: - token = i - break - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[token])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[token])) - begin = begin + duration - return labformat - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='use ctc to generate alignment') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--input_file', required=True, help='format data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--non_lang_syms', - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--result_file', - required=True, - help='alignment result file') - parser.add_argument('--batch_size', type=int, default=1, help='batch size') - parser.add_argument('--gen_praat', - action='store_true', - help='convert alignment to a praat format') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - - args = parser.parse_args() - print(args) - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.batch_size > 1: - logging.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - # Load dict - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 - - symbol_table = read_symbol_table(args.dict) - - # Init dataset and data loader - ali_conf = copy.deepcopy(configs['dataset_conf']) - - ali_conf['filter_conf']['max_length'] = 102400 - ali_conf['filter_conf']['min_length'] = 0 - ali_conf['filter_conf']['token_max_length'] = 102400 - ali_conf['filter_conf']['token_min_length'] = 0 - ali_conf['filter_conf']['max_output_input_ratio'] = 102400 - ali_conf['filter_conf']['min_output_input_ratio'] = 0 - ali_conf['speed_perturb'] = False - ali_conf['spec_aug'] = False - ali_conf['shuffle'] = False - ali_conf['sort'] = False - ali_conf['fbank_conf']['dither'] = 0.0 - ali_conf['batch_conf']['batch_type'] = "static" - ali_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - ali_dataset = Dataset(args.data_type, - args.input_file, - symbol_table, - ali_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w', - encoding='utf-8') as fout: - for batch_idx, batch in enumerate(ali_data_loader): - print("#" * 80) - key, feat, target, feats_length, target_length = batch - print(key) - - feat = feat.to(device) - target = target.to(device) - feats_length = 
feats_length.to(device) - target_length = target_length.to(device) - # Let's assume B = batch_size and N = beam_size - # 1. Encoder - encoder_out, encoder_mask = model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - # print(ctc_probs.size(1)) - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = forced_align(ctc_probs, target) - print(alignment) - fout.write('{} {}\n'.format(key[0], alignment)) - - if args.gen_praat: - timestamp = get_frames_timestamp(alignment) - print(timestamp) - subsample = get_subsample(configs) - labformat = get_labformat(timestamp, subsample) - - lab_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".lab") - with open(lab_path, 'w', encoding='utf-8') as f: - f.writelines(labformat) - - textgrid_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".TextGrid") - generator_textgrid(maxtime=(len(alignment) + 1) * 0.01 * - subsample, - lines=labformat, - output=textgrid_path) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/average_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/average_model.py deleted file mode 100644 index 01efa64b4b458bc931a86a9a304b9f330ce4aaa2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/average_model.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import os -import argparse -import glob - -import yaml -import numpy as np -import torch - - -def get_args(): - parser = argparse.ArgumentParser(description='average model') - parser.add_argument('--dst_model', required=True, help='averaged model') - parser.add_argument('--src_path', - required=True, - help='src model path for average') - parser.add_argument('--val_best', - action="store_true", - help='averaged model') - parser.add_argument('--num', - default=5, - type=int, - help='nums for averaged model') - parser.add_argument('--min_epoch', - default=0, - type=int, - help='min epoch used for averaging model') - parser.add_argument('--max_epoch', - default=65536, - type=int, - help='max epoch used for averaging model') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - checkpoints = [] - val_scores = [] - if args.val_best: - yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) - for y in yamls: - with open(y, 'r') as f: - dic_yaml = yaml.load(f, Loader=yaml.FullLoader) - loss = dic_yaml['cv_loss'] - epoch = dic_yaml['epoch'] - if epoch >= args.min_epoch and epoch <= args.max_epoch: - val_scores += [[epoch, loss]] - val_scores = np.array(val_scores) - sort_idx = np.argsort(val_scores[:, -1]) - sorted_val_scores = val_scores[sort_idx][::1] - print("best val scores = " + str(sorted_val_scores[:args.num, 1])) - print("selected epochs = " + - str(sorted_val_scores[:args.num, 0].astype(np.int64))) - path_list = [ - args.src_path + '/{}.pt'.format(int(epoch)) - for epoch in sorted_val_scores[:args.num, 0] - ] - else: - path_list = glob.glob('{}/[0-9]*.pt'.format(args.src_path)) - path_list = sorted(path_list, key=os.path.getmtime) - path_list = path_list[-args.num:] - print(path_list) - avg = None - num = args.num - assert num == len(path_list) - for path in path_list: - print('Processing {}'.format(path)) - states = torch.load(path, map_location=torch.device('cpu')) - if avg is None: - avg = states - else: - for k in avg.keys(): - avg[k] += states[k] - # average - for k in avg.keys(): - if avg[k] is not None: - # pytorch 1.6 use true_divide instead of /= - avg[k] = torch.true_divide(avg[k], num) - print('Saving to {}'.format(args.dst_model)) - torch.save(avg, args.dst_model) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/export_jit.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/export_jit.py deleted file mode 100644 index b2e5864e8382235c1cc800484ba5031ae22f3bd9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/export_jit.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
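The checkpoint-averaging logic removed above boils down to an element-wise mean over the selected state dicts. A simplified sketch of just that step (hypothetical helper name; the cv_loss-based selection of which checkpoints to average is omitted):

```python
# Hypothetical sketch: sum the parameters of the chosen checkpoints
# element-wise, then divide by the number of checkpoints.
import torch

def average_checkpoints(paths):
    avg = None
    for path in paths:
        states = torch.load(path, map_location="cpu")
        if avg is None:
            avg = {k: v.clone().float() for k, v in states.items()}
        else:
            for k in avg:
                avg[k] += states[k].float()
    return {k: torch.true_divide(v, len(paths)) for k, v in avg.items()}
```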
- -from __future__ import print_function - -import argparse -import os - -import torch -import yaml - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_file', default=None, help='output file') - parser.add_argument('--output_quant_file', - default=None, - help='output quantized model file') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - # No need gpu for model export - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - model = init_model(configs) - print(model) - - load_checkpoint(model, args.checkpoint) - # Export jit torch script model - - if args.output_file: - script_model = torch.jit.script(model) - script_model.save(args.output_file) - print('Export model successfully, see {}'.format(args.output_file)) - - # Export quantized jit torch script model - if args.output_quant_file: - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - script_quant_model = torch.jit.script(quantized_model) - script_quant_model.save(args.output_quant_file) - print('Export quantized model successfully, ' - 'see {}'.format(args.output_quant_file)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/export_onnx_bpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/export_onnx_bpu.py deleted file mode 100644 index 6462a69506f10778d08faae5fcf3067ad43d38bd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/export_onnx_bpu.py +++ /dev/null @@ -1,1019 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. 
specific decoding method: ctc_greedy_search -""" - - -from __future__ import print_function - -import os -import sys -import copy -import math -import yaml -import logging -from typing import Tuple - -import torch -import numpy as np - -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import (get_args, to_numpy, - print_input_output_info) - - -try: - import onnx - import onnxruntime -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class BPULayerNorm(torch.nn.Module): - """Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" - def __init__(self, module, chunk_size=8, run_on_bpu=False): - super().__init__() - original = copy.deepcopy(module) - self.hidden = module.weight.size(0) - self.chunk_size = chunk_size - self.run_on_bpu = run_on_bpu - - if self.run_on_bpu: - self.weight = torch.nn.Parameter( - module.weight.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.bias = torch.nn.Parameter( - module.bias.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.negtive = torch.nn.Parameter( - torch.ones((1, self.hidden, 1, chunk_size)) * -1.0) - self.eps = torch.nn.Parameter( - torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps) - self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_1.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_2.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - else: - self.norm = module - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.hidden) - orig_out = module(random_data) - new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) - np.testing.assert_allclose( - to_numpy(orig_out), to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.run_on_bpu: - u = self.mean_conv_1(x) # (1, h, 1, c) - numerator = x + u * self.negtive # (1, h, 1, c) - s = torch.pow(numerator, 2) # (1, h, 1, c) - s = self.mean_conv_2(s) # (1, h, 1, c) - denominator = torch.sqrt(s + self.eps) # (1, h, 1, c) - x = torch.div(numerator, denominator) # (1, h, 1, c) - x = x * self.weight + self.bias - else: - x = x.squeeze(2).transpose(1, 2).contiguous() - x = self.norm(x) - x = x.transpose(1, 2).contiguous().unsqueeze(2) - return x - - -class BPUIdentity(torch.nn.Module): - """Refactor torch.nn.Identity(). - For inserting BPU node whose input == output. - """ - def __init__(self, channels): - super().__init__() - self.channels = channels - self.identity_conv = torch.nn.Conv2d( - channels, channels, 1, groups=channels, bias=False) - torch.nn.init.dirac_( - self.identity_conv.weight.data, groups=channels) - - self.check_equal() - - def check_equal(self): - random_data = torch.randn(1, self.channels, 1, 10) - result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(random_data), to_numpy(result), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Identity with 4-D dataflow, input == output. 
- Args: - x (torch.Tensor): (batch, in_channel, 1, time) - - Returns: - (torch.Tensor): (batch, in_channel, 1, time). - """ - return self.identity_conv(x) - - -class BPULinear(torch.nn.Module): - """Refactor torch.nn.Linear or pointwise_conv""" - def __init__(self, module, is_pointwise_conv=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.weight.size(1) - self.odim = module.weight.size(0) - self.is_pointwise_conv = is_pointwise_conv - - # Modify weight & bias - self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) - if is_pointwise_conv: - # (odim, idim, kernel=1) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(-1)) - else: - # (odim, idim) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(2).unsqueeze(3)) - self.linear.bias = module.bias - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.idim) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = module(random_data) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = original_result.transpose(1, 2) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Linear with 4-D dataflow. - Args: - x (torch.Tensor): (batch, in_channel, 1, time) - Returns: - (torch.Tensor): (batch, out_channel, 1, time). - """ - return self.linear(x) - - -class BPUGlobalCMVN(torch.nn.Module): - """Refactor wenet/transformer/cmvn.py::GlobalCMVN""" - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - self.norm_var = module.norm_var - - # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1) - self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """CMVN with 4-D dataflow. - Args: - x (torch.Tensor): (batch, 1, mel_dim, time) - Returns: - (torch.Tensor): normalized feature with same shape. - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x - - -class BPUConv2dSubsampling8(torch.nn.Module): - """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 - - NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.right_context = module.right_context - self.subsampling_rate = module.subsampling_rate - assert isinstance(module.pos_enc, NoPositionalEncoding) - - # 1. Modify self.conv - # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim) - # to (1, 1, mel_dim, frames) for more efficient computation. - self.conv = module.conv - for idx in [0, 2, 4]: - self.conv[idx].weight = torch.nn.Parameter( - module.conv[idx].weight.transpose(2, 3) - ) - - # 2. 
Modify self.linear - # NOTE(xcsong): Split final projection to meet the requirment of - # maximum kernel_size (7 for XJ3) - self.linear = torch.nn.ModuleList() - odim = module.linear.weight.size(0) # 512, in this case - freq = module.linear.weight.size(1) // odim # 4608 // 512 == 9 - self.odim, self.freq = odim, freq - weight = module.linear.weight.reshape( - odim, odim, freq, 1) # (odim, odim * freq) -> (odim, odim, freq, 1) - self.split_size = [] - num_split = (freq - 1) // 7 + 1 # XJ3 requires kernel_size <= 7 - slice_begin = 0 - for idx in range(num_split): - kernel_size = min(freq, (idx + 1) * 7) - idx * 7 - conv_ele = torch.nn.Conv2d( - odim, odim, (kernel_size, 1), (kernel_size, 1)) - conv_ele.weight = torch.nn.Parameter( - weight[:, :, slice_begin:slice_begin + kernel_size, :] - ) - conv_ele.bias = torch.nn.Parameter( - torch.zeros_like(conv_ele.bias) - ) - self.linear.append(conv_ele) - self.split_size.append(kernel_size) - slice_begin += kernel_size - self.linear[0].bias = torch.nn.Parameter(module.linear.bias) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 67, 80) - mask = torch.zeros(1, 1, 67) - original_result, _, _ = module(random_data, mask) # (1, 8, 512) - random_data = random_data.transpose(1, 2).unsqueeze(0) # (1, 1, 80, 67) - new_result = self.forward(random_data) # (1, 512, 1, 8) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Subsample x with 4-D dataflow. - Args: - x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), - where time' = time // 8. - """ - x = self.conv(x) # (1, odim, freq, time') - x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) - x = torch.split(x, self.split_size, dim=2) - for idx, (x_part, layer) in enumerate(zip(x, self.linear)): - x_out += layer(x_part) - return x_out - - -class BPUMultiHeadedAttention(torch.nn.Module): - """Refactor wenet/transformer/attention.py::MultiHeadedAttention - - NOTE(xcsong): Only support attention_class == MultiHeadedAttention, - we do not consider RelPositionMultiHeadedAttention currently. - """ - def __init__(self, module, chunk_size, left_chunks): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.d_k = module.d_k - self.h = module.h - n_feat = self.d_k * self.h - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.time = chunk_size * (left_chunks + 1) - self.activation = torch.nn.Softmax(dim=-1) - - # 1. Modify self.linear_x - self.linear_q = BPULinear(module.linear_q) - self.linear_k = BPULinear(module.linear_k) - self.linear_v = BPULinear(module.linear_v) - self.linear_out = BPULinear(module.linear_out) - # 2. 
denom - self.register_buffer( - "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k))) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) - mask = torch.ones((1, self.h, self.chunk_size, self.time), - dtype=torch.bool) - cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, - self.d_k * 2) - original_out, original_cache = module( - random_data, random_data, random_data, - mask[:, 0, :, :], torch.empty(0), cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.reshape(1, self.h, self.d_k * 2, - self.chunk_size * self.left_chunks) - new_out, new_cache = self.forward( - random_data, random_data, random_data, mask, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - - def forward( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - mask: torch.Tensor, cache: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. - - Args: - q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). - k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). - v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). - mask (torch.Tensor): Mask tensor, - (#batch, head, chunk_size, cache_t + chunk_size). - cache (torch.Tensor): Cache tensor - (1, head, d_k * 2, cache_t), - where `cache_t == chunk_size * left_chunks`. - - - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: Cache tensor - (1, head, d_k * 2, cache_t + chunk_size) - where `cache_t == chunk_size * left_chunks` - """ - # 1. Forward QKV - q = self.linear_q(q) # (1, d, 1, c) d == size, c == chunk_size - k = self.linear_k(k) # (1, d, 1, c) - v = self.linear_v(v) # (1, d, 1, c) - q = q.view(1, self.h, self.d_k, self.chunk_size) - k = k.view(1, self.h, self.d_k, self.chunk_size) - v = v.view(1, self.h, self.d_k, self.chunk_size) - q = q.transpose(2, 3) # (batch, head, time1, d_k) - k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) - k = torch.cat((k_cache, k), dim=3) - v = torch.cat((v_cache, v), dim=3) - new_cache = torch.cat((k, v), dim=2) - # 2. (Q^T)K - scores = torch.matmul(q, k) * self.denom # (#b, n_head, time1, time2) - # 3. Forward attention - mask = mask.eq(0) - scores = scores.masked_fill(mask, -float('inf')) - attn = self.activation(scores).masked_fill(mask, 0.0) - attn = attn.transpose(2, 3) - x = torch.matmul(v, attn) - x = x.view(1, self.d_k * self.h, 1, self.chunk_size) - x_out = self.linear_out(x) - return x_out, new_cache - - -class BPUConvolution(torch.nn.Module): - """Refactor wenet/transformer/convolution.py::ConvolutionModule - - NOTE(xcsong): Only suport use_layer_norm == False - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.lorder = module.lorder - self.use_layer_norm = False - self.activation = module.activation - channels = module.pointwise_conv1.weight.size(1) - self.channels = channels - kernel_size = module.depthwise_conv.weight.size(2) - assert module.use_layer_norm is False - - # 1. Modify self.pointwise_conv1 - self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) - - # 2. 
Modify self.depthwise_conv - self.depthwise_conv = torch.nn.Conv2d( - channels, channels, (1, kernel_size), - stride=1, groups=channels) - self.depthwise_conv.weight = torch.nn.Parameter( - module.depthwise_conv.weight.unsqueeze(-2)) - self.depthwise_conv.bias = torch.nn.Parameter( - module.depthwise_conv.bias) - - # 3. Modify self.norm, Only support batchnorm2d - self.norm = torch.nn.BatchNorm2d(channels) - self.norm.training = False - self.norm.num_features = module.norm.num_features - self.norm.eps = module.norm.eps - self.norm.momentum = module.norm.momentum - self.norm.weight = torch.nn.Parameter(module.norm.weight) - self.norm.bias = torch.nn.Parameter(module.norm.bias) - self.norm.running_mean = module.norm.running_mean - self.norm.running_var = module.norm.running_var - - # 4. Modify self.pointwise_conv2 - self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) - - # 5. Identity conv, for running `concat` on BPU - self.identity = BPUIdentity(channels) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.channels) - cache = torch.zeros((1, self.channels, self.lorder)) - original_out, original_cache = module(random_data, cache=cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.unsqueeze(2) - new_out, new_cache = self.forward(random_data, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, 1, cache_t). - Returns: - torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). - torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). - """ - # Concat cache - x = torch.cat((self.identity(cache), self.identity(x)), dim=3) - new_cache = x[:, :, :, -self.lorder:] - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, 1, dim) - x = torch.nn.functional.glu(x, dim=1) # (b, channel, 1, dim) - - # Depthwise Conv - x = self.depthwise_conv(x) - x = self.activation(self.norm(x)) - x = self.pointwise_conv2(x) - return x, new_cache - - -class BPUFFN(torch.nn.Module): - """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.activation = module.activation - - # 1. Modify self.w_x - self.w_1 = BPULinear(module.w_1) - self.w_2 = BPULinear(module.w_2) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.w_1.idim) - original_out = module(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_out = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward function. 
- - Args: - xs: input tensor (B, D, 1, L) - Returns: - output tensor, (B, D, 1, L) - """ - return self.w_2(self.activation(self.w_1(x))) - - -class BPUConformerEncoderLayer(torch.nn.Module): - """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.size = module.size - assert module.normalize_before is True - assert module.concat_after is False - - # 1. Modify submodules - self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) - self.self_attn = BPUMultiHeadedAttention( - module.self_attn, chunk_size, left_chunks) - self.conv_module = BPUConvolution(module.conv_module) - self.feed_forward = BPUFFN(module.feed_forward) - - # 2. Modify norms - self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) - self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, ln_run_on_bpu) - self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, - chunk_size, ln_run_on_bpu) - self.norm_conv = BPULayerNorm(module.norm_conv, - chunk_size, ln_run_on_bpu) - self.norm_final = BPULayerNorm(module.norm_final, - chunk_size, ln_run_on_bpu) - - # 3. 4-D ff_scale - self.register_buffer( - "ff_scale", torch.full((1, self.size, 1, 1), module.ff_scale)) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.self_attn.chunk_size - time2 = self.self_attn.time - h, d_k = self.self_attn.h, self.self_attn.d_k - random_x = torch.randn(1, time1, self.size) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) - original_x, _, original_att_cache, original_cnn_cache = module( - random_x, att_mask[:, 0, :, :], torch.empty(0), - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.transpose(1, 2).unsqueeze(2) - att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.unsqueeze(2) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_mask, att_cache, cnn_cache - ) - np.testing.assert_allclose( - to_numpy(original_att_cache), - to_numpy(new_att_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cnn_cache), - to_numpy(new_cnn_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, att_mask: torch.Tensor, - att_cache: torch.Tensor, cnn_cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, size, 1, chunk_size) - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, d_k * 2, cache_t1), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, 1, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: att_cache tensor, - (1, head, d_k * 2, cache_t1 + chunk_size). - torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2). - """ - # 1. ffn_macaron - residual = x - x = self.norm_ff_macron(x) - x = residual + self.ff_scale * self.feed_forward_macaron(x) - - # 2. 
attention - residual = x - x = self.norm_mha(x) - x_att, new_att_cache = self.self_attn( - x, x, x, att_mask, att_cache) - x = residual + x_att - - # 3. convolution - residual = x - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, cnn_cache) - x = residual + x - - # 4. ffn - residual = x - x = self.norm_ff(x) - x = residual + self.ff_scale * self.feed_forward(x) - - # 5. final post-norm - x = self.norm_final(x) - - return x, new_att_cache, new_cnn_cache - - -class BPUConformerEncoder(torch.nn.Module): - """Refactor wenet/transformer/encoder.py::ConformerEncoder - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - output_size = module.output_size() - self._output_size = module.output_size() - self.after_norm = module.after_norm - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.head = module.encoders[0].self_attn.h - self.layers = len(module.encoders) - - # 1. Modify submodules - self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) - self.embed = BPUConv2dSubsampling8(module.embed) - self.encoders = torch.nn.ModuleList() - for layer in module.encoders: - self.encoders.append(BPUConformerEncoderLayer( - layer, chunk_size, left_chunks, ln_run_on_bpu)) - - # 2. Auxiliary conv - self.identity_cnncache = BPUIdentity(output_size) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.encoders[0].self_attn.chunk_size - time2 = self.encoders[0].self_attn.time - layers = self.layers - h, d_k = self.head, self.encoders[0].self_attn.d_k - decoding_window = (self.chunk_size - 1) * \ - module.embed.subsampling_rate + \ - module.embed.right_context + 1 - lorder = self.encoders[0].conv_module.lorder - random_x = torch.randn(1, decoding_window, 80) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) - orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( - random_x, 0, time2 - time1, att_mask=att_mask[:, 0, :, :], - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.unsqueeze(0) - att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_cache, cnn_cache, att_mask - ) - caches = torch.split(new_att_cache, h, dim=1) - caches = [c.transpose(2, 3) for c in caches] - np.testing.assert_allclose( - to_numpy(orig_att_cache), - to_numpy(torch.cat(caches, dim=0)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_cnn_cache), - to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, xs: torch.Tensor, att_cache: torch.Tensor, - cnn_cache: torch.Tensor, att_mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (1, head * elayers, d_k * 2, cache_t1), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (1, hidden-dim, elayers, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, hidden-dim, 1, chunk_size). - torch.Tensor: new attention cache required for next chunk, with - same shape as the original att_cache. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - """ - # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time) - xs = xs.transpose(2, 3) - xs = self.global_cmvn(xs) - # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size) - xs = self.embed(xs) - - att_cache = torch.split(att_cache, self.head, dim=1) - cnn_cache = self.identity_cnncache(cnn_cache) - cnn_cache = torch.split(cnn_cache, 1, dim=2) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - xs, new_att_cache, new_cnn_cache = layer( - xs, att_mask, att_cache=att_cache[i], cnn_cache=cnn_cache[i]) - r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:]) - r_cnn_cache.append(new_cnn_cache) - r_att_cache = torch.cat(r_att_cache, dim=1) - r_cnn_cache = self.identity_cnncache( - torch.cat(r_cnn_cache, dim=2)) - - xs = xs.squeeze(2).transpose(1, 2).contiguous() - xs = self.after_norm(xs) - # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input. - xs = xs.transpose(1, 2).contiguous().unsqueeze(2) # (B, C, 1, T) - - return (xs, r_att_cache, r_cnn_cache) - - -class BPUCTC(torch.nn.Module): - """Refactor wenet/transformer/ctc.py::CTC - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.ctc_lo.weight.size(1) - num_class = module.ctc_lo.weight.size(0) - - # 1. Modify self.ctc_lo, Split final projection to meet the - # requirment of maximum in/out channels (2048 for XJ3) - self.ctc_lo = torch.nn.ModuleList() - self.split_size = [] - num_split = (num_class - 1) // 2048 + 1 - for idx in range(num_split): - out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 - conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) - self.ctc_lo.append(conv_ele) - self.split_size.append(out_channel) - orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) - orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) - for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): - w = w.unsqueeze(2).unsqueeze(3) - self.ctc_lo[i].weight = torch.nn.Parameter(w) - self.ctc_lo[i].bias = torch.nn.Parameter(b) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 100, self.idim) - original_result = module.ctc_lo(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """frame activations, without softmax. 
- - Args: - Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) - Returns: - torch.Tensor: (B, num_class, 1, chunk_size) - """ - out = [] - for i, layer in enumerate(self.ctc_lo): - out.append(layer(x)) - out = torch.cat(out, dim=1) - return out - - -def export_encoder(asr_model, args): - logger.info("Stage-1: export encoder") - decode_window, mel_dim = args.decoding_window, args.feature_size - encoder = BPUConformerEncoder( - asr_model.encoder, args.chunk_size, args.num_decoding_left_chunks, - args.ln_run_on_bpu) - encoder.eval() - encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx') - - logger.info("Stage-1.1: prepare inputs for encoder") - chunk = torch.randn((1, 1, decode_window, mel_dim)) - required_cache_size = encoder.chunk_size * encoder.left_chunks - kv_time = required_cache_size + encoder.chunk_size - hidden, layers = encoder._output_size, len(encoder.encoders) - head = encoder.encoders[0].self_attn.h - d_k = hidden // head - lorder = encoder.encoders[0].conv_module.lorder - att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) - att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros((1, hidden, layers, lorder)) - inputs = (chunk, att_cache, cnn_cache, att_mask) - logger.info("chunk.size(): {} att_cache.size(): {} " - "cnn_cache.size(): {} att_mask.size(): {}".format( - list(chunk.size()), list(att_cache.size()), - list(cnn_cache.size()), list(att_mask.size()))) - - logger.info("Stage-1.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask" - attributes['output_name'] = "output;r_att_cache;r_cnn_cache" - attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap" - attributes['norm_type'] = \ - "no_preprocess;no_preprocess;no_preprocess;no_preprocess" - attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_shape'] = \ - "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( - chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3), - att_cache.size(0), att_cache.size(1), att_cache.size(2), - att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1), - cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0), - att_mask.size(1), att_mask.size(2), att_mask.size(3) - ) - torch.onnx.export( # NOTE(xcsong): only support opset==11 - encoder, inputs, encoder_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=attributes['input_name'].split(';'), - output_names=attributes['output_name'].split(';'), - dynamic_axes=None, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for k in vars(args): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - logger.info('Export onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - logger.info("Stage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - for i in range(10): - logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" - ", att_mask: {}".format( - i, list(torch_chunk.size()), - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), - list(torch_att_mask.size()))) - torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_output = torch.cat(torch_output, dim=-1) - - onnx_output = [] - onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - logger.info("onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," - " att_mask: {}".format( - i, onnx_chunk.shape, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': onnx_att_mask, - } - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_output = np.concatenate(onnx_output, axis=-1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_encoder, pass!") - return encoder, ort_session - - -def export_ctc(asr_model, args): - logger.info("Stage-2: export ctc") - ctc = BPUCTC(asr_model.ctc).eval() - ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx') - - logger.info("Stage-2.1: prepare inputs for ctc") - hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) - - logger.info("Stage-2.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'], attributes['input_type'] = "hidden", "featuremap" - attributes['norm_type'] = "no_preprocess" - attributes['input_layout_train'] = "NCHW" - attributes['input_layout_rt'] = "NCHW" - attributes['input_shape'] = "{}x{}x{}x{}".format( - hidden.size(0), hidden.size(1), hidden.size(2), hidden.size(3), - ) - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=None, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for k in vars(args): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - logger.info('Export onnx_ctc, done! 
see {}'.format(ctc_outpath)) - - logger.info("Stage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_ctc, pass!") - return ctc, ort_session - - -def export_decoder(asr_model, args): - logger.info("Currently, Decoder is not supported.") - - -if __name__ == '__main__': - torch.manual_seed(777) - args = get_args() - args.ln_run_on_bpu = False - # NOTE(xcsong): XJ3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - args.feature_size = configs['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - export_encoder(model, args) - export_ctc(model, args) - export_decoder(model, args) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/export_onnx_cpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/export_onnx_cpu.py deleted file mode 100644 index a8009d2f606f753a5870eb754235d8d55e756b5d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/export_onnx_cpu.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright (c) 2022, Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os -import copy -import sys - -import torch -import yaml -import numpy as np - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - -try: - import onnx - import onnxruntime - from onnxruntime.quantization import quantize_dynamic, QuantType -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - args = parser.parse_args() - return args - - -def to_numpy(tensor): - if tensor.requires_grad: - return tensor.detach().cpu().numpy() - else: - return tensor.cpu().numpy() - - -def print_input_output_info(onnx_model, name, prefix="\t\t"): - input_names = [node.name for node in onnx_model.graph.input] - input_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.input] - output_names = [node.name for node in onnx_model.graph.output] - output_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.output] - print("{}{} inputs : {}".format(prefix, name, input_names)) - print("{}{} input shapes : {}".format(prefix, name, input_shapes)) - print("{}{} outputs: {}".format(prefix, name, output_names)) - print("{}{} output shapes : {}".format(prefix, name, output_shapes)) - - -def export_encoder(asr_model, args): - print("Stage-1: export encoder") - encoder = asr_model.encoder - encoder.forward = encoder.forward_chunk - encoder_outpath = os.path.join(args['output_dir'], 'encoder.onnx') - - print("\tStage-1.1: prepare inputs for encoder") - chunk = torch.randn( - (args['batch'], args['decoding_window'], args['feature_size'])) - offset = 0 - # NOTE(xcsong): The uncertainty of `next_cache_start` only appears - # in the first few chunks, this is caused by dynamic att_cache shape, i,e - # (0, 0, 0, 0) for 1st chunk and (elayers, head, ?, d_k*2) for subsequent - # chunks. One way to ease the ONNX export is to keep `next_cache_start` - # as a fixed value. To do this, for the **first** chunk, if - # left_chunks > 0, we feed real cache & real mask to the model, otherwise - # fake cache & fake mask. In this way, we get: - # 1. 16/-1 mode: next_cache_start == 0 for all chunks - # 2. 16/4 mode: next_cache_start == chunk_size for all chunks - # 3. 16/0 mode: next_cache_start == chunk_size for all chunks - # 4. -1/-1 mode: next_cache_start == 0 for all chunks - # NO MORE DYNAMIC CHANGES!! - # - # NOTE(Mddct): We retain the current design for the convenience of supporting some - # inference frameworks without dynamic shapes. 
If you're interested in all-in-one - # model that supports different chunks please see: - # https://github.com/wenet-e2e/wenet/pull/1174 - - if args['left_chunks'] > 0: # 16/4 - required_cache_size = args['chunk_size'] * args['left_chunks'] - offset = required_cache_size - # Real cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], required_cache_size, - args['output_size'] // args['head'] * 2)) - # Real mask - att_mask = torch.ones( - (args['batch'], 1, required_cache_size + args['chunk_size']), - dtype=torch.bool) - att_mask[:, :, :required_cache_size] = 0 - elif args['left_chunks'] <= 0: # 16/-1, -1/-1, 16/0 - required_cache_size = -1 if args['left_chunks'] < 0 else 0 - # Fake cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], 0, - args['output_size'] // args['head'] * 2)) - # Fake mask - att_mask = torch.ones((0, 0, 0), dtype=torch.bool) - cnn_cache = torch.zeros( - (args['num_blocks'], args['batch'], - args['output_size'], args['cnn_module_kernel'] - 1)) - inputs = (chunk, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - print("\t\tchunk.size(): {}\n".format(chunk.size()), - "\t\toffset: {}\n".format(offset), - "\t\trequired_cache: {}\n".format(required_cache_size), - "\t\tatt_cache.size(): {}\n".format(att_cache.size()), - "\t\tcnn_cache.size(): {}\n".format(cnn_cache.size()), - "\t\tatt_mask.size(): {}\n".format(att_mask.size())) - - print("\tStage-1.2: torch.onnx.export") - dynamic_axes = { - 'chunk': {1: 'T'}, - 'att_cache': {2: 'T_CACHE'}, - 'att_mask': {2: 'T_ADD_T_CACHE'}, - 'output': {1: 'T'}, - 'r_att_cache': {2: 'T_CACHE'}, - } - # NOTE(xcsong): We keep dynamic axes even if in 16/4 mode, this is - # to avoid padding the last chunk (which usually contains less - # frames than required). For users who want static axes, just pop - # out specific axis. - # if args['chunk_size'] > 0: # 16/4, 16/-1, 16/0 - # dynamic_axes.pop('chunk') - # dynamic_axes.pop('output') - # if args['left_chunks'] >= 0: # 16/4, 16/0 - # # NOTE(xsong): since we feed real cache & real mask into the - # # model when left_chunks > 0, the shape of cache will never - # # be changed. - # dynamic_axes.pop('att_cache') - # dynamic_axes.pop('r_att_cache') - torch.onnx.export( - encoder, inputs, encoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=[ - 'chunk', 'offset', 'required_cache_size', - 'att_cache', 'cnn_cache', 'att_mask' - ], - output_names=['output', 'r_att_cache', 'r_cnn_cache'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for (k, v) in args.items(): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - # NOTE(xcsong): to add those metadatas we need to reopen - # the file and resave it. - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - # Dynamic quantization - model_fp32 = encoder_outpath - model_quant = os.path.join(args['output_dir'], 'encoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - print("\tStage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk = copy.deepcopy(chunk) - torch_offset = copy.deepcopy(offset) - torch_required_cache_size = copy.deepcopy(required_cache_size) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - torch_att_mask = copy.deepcopy(att_mask) - for i in range(10): - print("\t\ttorch chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, list(torch_chunk.size()), torch_offset, - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), list(torch_att_mask.size()))) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - torch_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_offset, torch_required_cache_size, - torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_offset += out.size(1) - torch_output = torch.cat(torch_output, dim=1) - - onnx_output = [] - onnx_chunk = to_numpy(chunk) - onnx_offset = np.array((offset)).astype(np.int64) - onnx_required_cache_size = np.array((required_cache_size)).astype(np.int64) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - onnx_att_mask = to_numpy(att_mask) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - print("\t\tonnx chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, onnx_chunk.shape, onnx_offset, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - onnx_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'offset': onnx_offset, - 'required_cache_size': onnx_required_cache_size, - 'att_cache': onnx_att_cache, 'cnn_cache': onnx_cnn_cache, - 'att_mask': onnx_att_mask - } - # NOTE(xcsong): If we use 16/-1, -1/-1 or 16/0 mode, `next_cache_start` - # will be hardcoded to 0 or chunk_size by ONNX, thus - # required_cache_size and att_mask are no more needed and they will - # be removed by ONNX automatically. 
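
The note above is the reason the check loop that follows drops feed entries before calling `run`: once an input has been folded away during export, passing it to onnxruntime raises an error. A minimal, self-contained sketch of that same filtering idea, assuming only `onnxruntime`/`numpy` and using a hypothetical `encoder.onnx` path and feed keys:

```python
import numpy as np
import onnxruntime


def filter_feed_to_session_inputs(session, candidate_feed):
    """Keep only the entries the exported graph still declares as inputs.

    Inputs folded away at export time (e.g. required_cache_size or att_mask
    in 16/-1, -1/-1 or 16/0 mode) must not be passed to run().
    """
    declared = {inp.name for inp in session.get_inputs()}
    return {k: v for k, v in candidate_feed.items() if k in declared}


# Hypothetical usage; the model path, feed names and shapes are placeholders.
if __name__ == "__main__":
    sess = onnxruntime.InferenceSession("encoder.onnx")
    feed = {
        "chunk": np.zeros((1, 67, 80), dtype=np.float32),
        "required_cache_size": np.array(-1, dtype=np.int64),
    }
    outputs = sess.run(None, filter_feed_to_session_inputs(sess, feed))
```
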
- for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_offset += ort_outs[0].shape[1] - onnx_output = np.concatenate(onnx_output, axis=1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-05) - meta = ort_session.get_modelmeta() - print("\t\tcustom_metadata_map={}".format(meta.custom_metadata_map)) - print("\t\tCheck onnx_encoder, pass!") - - -def export_ctc(asr_model, args): - print("Stage-2: export ctc") - ctc = asr_model.ctc - ctc.forward = ctc.log_softmax - ctc_outpath = os.path.join(args['output_dir'], 'ctc.onnx') - - print("\tStage-2.1: prepare inputs for ctc") - hidden = torch.randn( - (args['batch'], args['chunk_size'] if args['chunk_size'] > 0 else 16, - args['output_size'])) - - print("\tStage-2.2: torch.onnx.export") - dynamic_axes = {'hidden': {1: 'T'}, 'probs': {1: 'T'}} - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for (k, v) in args.items(): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - # Dynamic quantization - model_fp32 = ctc_outpath - model_quant = os.path.join(args['output_dir'], 'ctc.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_ctc, done! see {}'.format(ctc_outpath)) - - print("\tStage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_ctc, pass!") - - -def export_decoder(asr_model, args): - print("Stage-3: export decoder") - decoder = asr_model - # NOTE(lzhin): parameters of encoder will be automatically removed - # since they are not used during rescoring. - decoder.forward = decoder.forward_attention_decoder - decoder_outpath = os.path.join(args['output_dir'], 'decoder.onnx') - - print("\tStage-3.1: prepare inputs for decoder") - # hardcode time->200 nbest->10 len->20, they are dynamic axes. 
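
Because the sizes above are only placeholders for axes that are exported as dynamic, it can be worth confirming after export that those axes really ended up symbolic. A small sketch using only `onnx`; the model path and the expected axis names are assumptions for illustration:

```python
import onnx


def axis_spec(model_path, input_name):
    """Return each dim of one graph input: a symbolic name (str) if dynamic,
    otherwise the fixed integer size."""
    model = onnx.load(model_path)
    for inp in model.graph.input:
        if inp.name == input_name:
            return [d.dim_param if d.dim_param else d.dim_value
                    for d in inp.type.tensor_type.shape.dim]
    raise KeyError(f"{input_name} is not a graph input")


# Hypothetical usage: for a decoder exported with dynamic 'NBEST'/'L' axes one
# would expect something like ['NBEST', 'L'] for 'hyps', not the dummy [10, 20].
# print(axis_spec("onnx_model/decoder.onnx", "hyps"))
```
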
- encoder_out = torch.randn((1, 200, args['output_size'])) - hyps = torch.randint(low=0, high=args['vocab_size'], - size=[10, 20]) - hyps[:, 0] = args['vocab_size'] - 1 # - hyps_lens = torch.randint(low=15, high=21, size=[10]) - - print("\tStage-3.2: torch.onnx.export") - dynamic_axes = { - 'hyps': {0: 'NBEST', 1: 'L'}, 'hyps_lens': {0: 'NBEST'}, - 'encoder_out': {1: 'T'}, - 'score': {0: 'NBEST', 1: 'L'}, 'r_score': {0: 'NBEST', 1: 'L'} - } - inputs = (hyps, hyps_lens, encoder_out, args['reverse_weight']) - torch.onnx.export( - decoder, inputs, decoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hyps', 'hyps_lens', 'encoder_out', 'reverse_weight'], - output_names=['score', 'r_score'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_decoder = onnx.load(decoder_outpath) - for (k, v) in args.items(): - meta = onnx_decoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_decoder) - onnx.helper.printable_graph(onnx_decoder.graph) - onnx.save(onnx_decoder, decoder_outpath) - print_input_output_info(onnx_decoder, "onnx_decoder") - model_fp32 = decoder_outpath - model_quant = os.path.join(args['output_dir'], 'decoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_decoder, done! see {}'.format( - decoder_outpath)) - - print("\tStage-3.3: check onnx_decoder and torch_decoder") - torch_score, torch_r_score = decoder( - hyps, hyps_lens, encoder_out, args['reverse_weight']) - ort_session = onnxruntime.InferenceSession(decoder_outpath) - input_names = [node.name for node in onnx_decoder.graph.input] - ort_inputs = { - 'hyps': to_numpy(hyps), - 'hyps_lens': to_numpy(hyps_lens), - 'encoder_out': to_numpy(encoder_out), - 'reverse_weight': np.array((args['reverse_weight'])), - } - for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - onnx_output = ort_session.run(None, ort_inputs) - - np.testing.assert_allclose(to_numpy(torch_score), onnx_output[0], - rtol=1e-03, atol=1e-05) - if args['is_bidirectional_decoder'] and args['reverse_weight'] > 0.0: - np.testing.assert_allclose(to_numpy(torch_r_score), onnx_output[1], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_decoder, pass!") - - -def main(): - torch.manual_seed(777) - args = get_args() - output_dir = args.output_dir - os.system("mkdir -p " + output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - arguments = {} - arguments['output_dir'] = output_dir - arguments['batch'] = 1 - arguments['chunk_size'] = args.chunk_size - arguments['left_chunks'] = args.num_decoding_left_chunks - arguments['reverse_weight'] = args.reverse_weight - arguments['output_size'] = configs['encoder_conf']['output_size'] - arguments['num_blocks'] = configs['encoder_conf']['num_blocks'] - arguments['cnn_module_kernel'] = configs['encoder_conf'].get('cnn_module_kernel', 1) - arguments['head'] = configs['encoder_conf']['attention_heads'] - arguments['feature_size'] = configs['input_dim'] - arguments['vocab_size'] = configs['output_dim'] - # NOTE(xcsong): if chunk_size == -1, hardcode to 67 - arguments['decoding_window'] = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 if args.chunk_size > 0 else 67 - arguments['encoder'] = configs['encoder'] - 
arguments['decoder'] = configs['decoder'] - arguments['subsampling_rate'] = model.subsampling_rate() - arguments['right_context'] = model.right_context() - arguments['sos_symbol'] = model.sos_symbol() - arguments['eos_symbol'] = model.eos_symbol() - arguments['is_bidirectional_decoder'] = 1 \ - if model.is_bidirectional_decoder() else 0 - - # NOTE(xcsong): Please note that -1/-1 means non-streaming model! It is - # not a [16/4 16/-1 16/0] all-in-one model and it should not be used in - # streaming mode (i.e., setting chunk_size=16 in `decoder_main`). If you - # want to use 16/-1 or any other streaming mode in `decoder_main`, - # please export onnx in the same config. - if arguments['left_chunks'] > 0: - assert arguments['chunk_size'] > 0 # -1/4 not supported - - export_encoder(model, arguments) - export_ctc(model, arguments) - export_decoder(model, arguments) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/export_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/export_onnx_gpu.py deleted file mode 100644 index 19f810c2804efdf74ff369f780fa3102e2e389fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/export_onnx_gpu.py +++ /dev/null @@ -1,1056 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import os -import sys - -import torch -import yaml -import logging - -import torch.nn.functional as F -from wenet.utils.checkpoint import load_checkpoint -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import BaseEncoder -from wenet.utils.init_model import init_model -from wenet.utils.mask import make_pad_mask - -try: - import onnxruntime -except ImportError: - print('Please install onnxruntime-gpu!') - sys.exit(1) - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class Encoder(torch.nn.Module): - def __init__(self, - encoder: BaseEncoder, - ctc: CTC, - beam_size: int = 10): - super().__init__() - self.encoder = encoder - self.ctc = ctc - self.beam_size = beam_size - - def forward(self, speech: torch.Tensor, - speech_lengths: torch.Tensor,): - """Encoder - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - Returns: - encoder_out: B x T x F - encoder_out_lens: B - ctc_log_probs: B x T x V - beam_log_probs: B x T x beam_size - beam_log_probs_idx: B x T x beam_size - """ - encoder_out, encoder_mask = self.encoder(speech, - speech_lengths, - -1, -1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_log_probs = self.ctc.log_softmax(encoder_out) - encoder_out_lens = encoder_out_lens.int() - beam_log_probs, beam_log_probs_idx = torch.topk( - ctc_log_probs, self.beam_size, dim=2) - return encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx - - -class StreamingEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size, transformer=False): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.transformer = transformer - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoder.encoders): - xs, _, new_att_cache, new_cnn_cache = layer( - xs, masks, pos_emb, - att_cache=att_cache[i], - cnn_cache=cnn_cache[i]) - # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (B, hidden-dim, cache_t2) - r_att_cache.append( - new_att_cache[:, :, next_cache_start:, :].unsqueeze(1)) - if not self.transformer: - r_cnn_cache.append(new_cnn_cache.unsqueeze(1)) - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - if not self.transformer: - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingSqueezeformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.reduce_idx = model.encoder.reduce_idx - self.recover_idx = model.encoder.recover_idx - if self.reduce_idx is None: - self.time_reduce = None - else: - if self.recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - 
recover_exp)) - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - elayers, cache_size = att_cache.size(0), att_cache.size(3) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = att_mask[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.encoder.preln(xs) - for i, layer in enumerate(self.encoder.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append( - (xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.encoder.time_reduction_layer( - xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - if self.encoder.pos_enc_layer_type == "rel_pos_repaired": - pos_emb = 
pos_emb[:, :xs.size(1) * 2 - 1, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.encoder.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(1) - cached_att = cached_att.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1)) - r_cnn_cache.append(cached_cnn) - - chunk_out = xs - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingEfficientConformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - - # Efficient Conformer - self.stride_layer_idx = model.encoder.stride_layer_idx - self.stride = model.encoder.stride - self.num_blocks = model.encoder.num_blocks - self.cnn_module_kernel = model.encoder.cnn_module_kernel - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - chunk_lens (torch.Tensor): - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * 
num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) # (b, ) - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - # Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2) - # Shape(cnn_cache): (elayers, b, outsize, cnn_kernel) - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = chunk_mask.to(torch.bool) - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoder.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - # We propose to double the chunk_size. 
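
As a worked example of the truncation computed just below (all sizes are made up; only the arithmetic mirrors the code):

```python
# Made-up sizes; only the arithmetic mirrors the att_cache truncation below.
xs_len = 16        # xs.size(1): frames in the current chunk
cache_len = 64     # att_cache.size(3): cached key/value frames per layer
factor = 2         # downsampling factor of this layer
pos_len = 45       # pos_emb.size(1): positional encodings actually available

att_cache_trunc = 0
if xs_len + cache_len / factor > pos_len:
    # 16 + 64 // 2 - 45 + 1 = 4: the 4 oldest cache frames are dropped so the
    # attention span still fits the available positional encodings.
    att_cache_trunc = xs_len + cache_len // factor - pos_len + 1

print(att_cache_trunc)  # -> 4
```
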
- att_cache_trunc = xs.size(1) + \ - att_cache.size(3) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i][:, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [batch, 1, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID - - # use repeat_interleave to new_att_cache - # new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - new_att_cache = new_att_cache.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :].unsqueeze(1)) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - # shape of r_att_cache: (b, elayers, head, time2, outdim) - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - # shape of r_cnn_cache: (b, elayers, outdim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - chunk_out_lens += 1 - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class Decoder(torch.nn.Module): - def __init__(self, - decoder: TransformerDecoder, - ctc_weight: float = 0.5, - reverse_weight: float = 0.0, - beam_size: int = 10, - decoder_fastertransformer: bool = False): - super().__init__() - self.decoder = decoder - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - self.beam_size = beam_size - self.decoder_fastertransformer = decoder_fastertransformer - - def forward(self, - encoder_out: torch.Tensor, - encoder_lens: torch.Tensor, - hyps_pad_sos_eos: torch.Tensor, - hyps_lens_sos: torch.Tensor, - r_hyps_pad_sos_eos: torch.Tensor, - ctc_score: torch.Tensor): - """Encoder - Args: - encoder_out: B x T x F - encoder_lens: B - hyps_pad_sos_eos: B x beam x (T2+1), - hyps with sos & eos and padded by ignore id - hyps_lens_sos: B x beam, length for each hyp with sos - r_hyps_pad_sos_eos: B 
x beam x (T2+1), - reversed hyps with sos & eos and padded by ignore id - ctc_score: B x beam, ctc score for each hyp - Returns: - decoder_out: B x beam x T2 x V - r_decoder_out: B x beam x T2 x V - best_index: B - """ - B, T, F = encoder_out.shape - bz = self.beam_size - B2 = B * bz - encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F) - encoder_mask = ~make_pad_mask(encoder_lens, T).unsqueeze(1) - encoder_mask = encoder_mask.repeat(1, bz, 1).view(B2, 1, T) - T2 = hyps_pad_sos_eos.shape[2] - 1 - hyps_pad = hyps_pad_sos_eos.view(B2, T2 + 1) - hyps_lens = hyps_lens_sos.view(B2,) - hyps_pad_sos = hyps_pad[:, :-1].contiguous() - hyps_pad_eos = hyps_pad[:, 1:].contiguous() - - r_hyps_pad = r_hyps_pad_sos_eos.view(B2, T2 + 1) - r_hyps_pad_sos = r_hyps_pad[:, :-1].contiguous() - r_hyps_pad_eos = r_hyps_pad[:, 1:].contiguous() - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad_sos, hyps_lens, r_hyps_pad_sos, - self.reverse_weight) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - V = decoder_out.shape[-1] - decoder_out = decoder_out.view(B2, T2, V) - mask = ~make_pad_mask(hyps_lens, T2) # B2 x T2 - # mask index, remove ignore id - index = torch.unsqueeze(hyps_pad_eos * mask, 2) - score = decoder_out.gather(2, index).squeeze(2) # B2 X T2 - # mask padded part - score = score * mask - decoder_out = decoder_out.view(B, bz, T2, V) - if self.reverse_weight > 0: - r_decoder_out = torch.nn.functional.log_softmax( - r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.view(B2, T2, V) - index = torch.unsqueeze(r_hyps_pad_eos * mask, 2) - r_score = r_decoder_out.gather(2, index).squeeze(2) - r_score = r_score * mask - score = score * (1 - self.reverse_weight) + \ - self.reverse_weight * r_score - r_decoder_out = r_decoder_out.view(B, bz, T2, V) - score = torch.sum(score, axis=1) # B2 - score = torch.reshape(score, (B, bz)) + self.ctc_weight * ctc_score - best_index = torch.argmax(score, dim=1) - if self.decoder_fastertransformer: - return decoder_out, best_index - else: - return best_index - - -def to_numpy(tensors): - out = [] - if type(tensors) == torch.tensor: - tensors = [tensors] - for tensor in tensors: - if tensor.requires_grad: - tensor = tensor.detach().cpu().numpy() - else: - tensor = tensor.cpu().numpy() - out.append(tensor) - return out - - -def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True): - for a, b in zip(xlist, blist): - try: - torch.testing.assert_allclose(a, b, rtol=rtol, atol=atol) - except AssertionError as error: - if tolerate_small_mismatch: - print(error) - else: - raise - - -def export_offline_encoder(model, configs, args, logger, encoder_onnx_path): - bz = 32 - seq_len = 100 - beam_size = args.beam_size - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint( - low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - dynamic_axes={ - 'speech': {0: 'B', 1: 'T'}, - 'speech_lengths': {0: 'B'}, - 'encoder_out': {0: 'B', 1: 'T_OUT'}, - 'encoder_out_lens': {0: 'B'}, - 'ctc_log_probs': {0: 'B', 1: 'T_OUT'}, - 'beam_log_probs': {0: 'B', 1: 
'T_OUT'}, - 'beam_log_probs_idx': {0: 'B', 1: 'T_OUT'}, - }, - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - -def export_offline_encoder_static(model, configs, args, logger, encoder_onnx_path): - bz = args.batch_size - seq_len = args.seq_len - beam_size = args.beam_size - - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint(low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - import os - file_name, file_ext = os.path.splitext(encoder_onnx_path) - encoder_onnx_path = file_name + "_bs" + str(bz) + "_seq" + str(seq_len) + "_static.onnx" - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CPUExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - - -def export_online_encoder(model, configs, args, logger, encoder_onnx_path): - decoding_chunk_size = args.decoding_chunk_size - subsampling = model.encoder.embed.subsampling_rate - context = model.encoder.embed.right_context + 1 - decoding_window = (decoding_chunk_size - 1) * subsampling + context - batch_size = 32 - audio_len = decoding_window - feature_size = configs["input_dim"] - output_size = configs["encoder_conf"]["output_size"] - num_layers = configs["encoder_conf"]["num_blocks"] - # in transformer the cnn module will not be available - transformer = False - cnn_module_kernel = configs["encoder_conf"].get("cnn_module_kernel", 1) - 1 - if not cnn_module_kernel: - transformer = True - num_decoding_left_chunks = args.num_decoding_left_chunks - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if configs['encoder'] == 'squeezeformer': - encoder = StreamingSqueezeformerEncoder( - model, required_cache_size, args.beam_size) - elif configs['encoder'] == 'efficientConformer': - encoder = StreamingEfficientConformerEncoder( - model, required_cache_size, args.beam_size) - else: - encoder = StreamingEncoder( - model, required_cache_size, args.beam_size, transformer) - encoder.eval() - - # begin to export encoder - chunk_xs = 
torch.randn(batch_size, audio_len, - feature_size, dtype=torch.float32) - chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len - - offset = torch.arange(0, batch_size).unsqueeze(1) - # (elayers, b, head, cache_t1, d_k * 2) - head = configs["encoder_conf"]["attention_heads"] - d_k = configs["encoder_conf"]["output_size"] // head - att_cache = torch.randn(batch_size, num_layers, head, - required_cache_size, d_k * 2, - dtype=torch.float32) - cnn_cache = torch.randn(batch_size, num_layers, output_size, - cnn_module_kernel, dtype=torch.float32) - - cache_mask = torch.ones( - batch_size, 1, required_cache_size, dtype=torch.float32) - input_names = ['chunk_xs', 'chunk_lens', 'offset', - 'att_cache', 'cnn_cache', 'cache_mask'] - output_names = ['log_probs', 'log_probs_idx', 'chunk_out', - 'chunk_out_lens', 'r_offset', 'r_att_cache', - 'r_cnn_cache', 'r_cache_mask'] - input_tensors = (chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - output_names.pop(6) - - all_names = input_names + output_names - dynamic_axes = {} - for name in all_names: - # only the first dimension is dynamic - # all other dimension is fixed - dynamic_axes[name] = {0: 'B'} - - torch.onnx.export(encoder, - input_tensors, - encoder_onnx_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - verbose=False) - - with torch.no_grad(): - torch_outs = encoder(chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - torch_outs = list(torch_outs).pop(6) - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=["CUDAExecutionProvider"]) - ort_inputs = {} - - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - if transformer: - del ort_inputs['cnn_cache'] - ort_outs = ort_session.run(None, ort_inputs) - test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx streaming encoder succeed!") - onnx_config = { - "subsampling_rate": subsampling, - "context": context, - "decoding_chunk_size": decoding_chunk_size, - "num_decoding_left_chunks": num_decoding_left_chunks, - "beam_size": args.beam_size, - "fp16": args.fp16, - "feat_size": feature_size, - "decoding_window": decoding_window, - "cnn_module_kernel_cache": cnn_module_kernel - } - return onnx_config - - -def export_rescoring_decoder(model, configs, args, - logger, decoder_onnx_path, decoder_fastertransformer): - bz, seq_len = 32, 100 - beam_size = args.beam_size - decoder = Decoder(model.decoder, - model.ctc_weight, - model.reverse_weight, - beam_size, - decoder_fastertransformer) - decoder.eval() - - hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - hyps_lens_sos = torch.randint(low=3, high=seq_len, size=(bz, beam_size), - dtype=torch.int32) - r_hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - - output_size = configs["encoder_conf"]["output_size"] - encoder_out = torch.randn(bz, seq_len, output_size, dtype=torch.float32) - encoder_out_lens = torch.randint( - low=3, high=seq_len, size=(bz,), dtype=torch.int32) - ctc_score = torch.randn(bz, beam_size, dtype=torch.float32) - - input_names = ['encoder_out', 'encoder_out_lens', - 'hyps_pad_sos_eos', 'hyps_lens_sos', - 'r_hyps_pad_sos_eos', 'ctc_score'] - output_names = ['best_index'] - if decoder_fastertransformer: - output_names.insert(0, 'decoder_out') - - 
torch.onnx.export(decoder, - (encoder_out, encoder_out_lens, - hyps_pad_sos_eos, hyps_lens_sos, - r_hyps_pad_sos_eos, ctc_score), - decoder_onnx_path, - export_params=True, - opset_version=13, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes={'encoder_out': {0: 'B', 1: 'T'}, - 'encoder_out_lens': {0: 'B'}, - 'hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'hyps_lens_sos': {0: 'B'}, - 'r_hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'ctc_score': {0: 'B'}, - 'best_index': {0: 'B'}, - }, - verbose=False - ) - with torch.no_grad(): - o0 = decoder(encoder_out, - encoder_out_lens, - hyps_pad_sos_eos, - hyps_lens_sos, - r_hyps_pad_sos_eos, - ctc_score) - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(decoder_onnx_path, - providers=providers) - - input_tensors = [encoder_out, encoder_out_lens, hyps_pad_sos_eos, - hyps_lens_sos, r_hyps_pad_sos_eos, ctc_score] - ort_inputs = {} - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - - # if model.reverse weight == 0, - # the r_hyps_pad will be removed - # from the onnx decoder since it doen't play any role - if model.reverse_weight == 0: - del ort_inputs['r_hyps_pad_sos_eos'] - ort_outs = ort_session.run(None, ort_inputs) - - # check decoder output - if decoder_fastertransformer: - test(to_numpy(o0), ort_outs, rtol=1e-03, atol=1e-05) - else: - test(to_numpy([o0]), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx decoder succeed!") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='export x86_gpu model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--cmvn_file', required=False, default='', type=str, - help='global_cmvn file, default path is in config file') - parser.add_argument('--reverse_weight', default=-1.0, type=float, - required=False, - help='reverse weight for bitransformer,' + - 'default value is in config file') - parser.add_argument('--ctc_weight', default=-1.0, type=float, - required=False, - help='ctc weight, default value is in config file') - parser.add_argument('--batch_size', type=int, default=24, help='encoder batch size') - parser.add_argument('--seq_len', default=512, type=int, required=False, - help="Encoder seq_len") - parser.add_argument('--beam_size', default=10, type=int, required=False, - help="beam size would be ctc output size") - parser.add_argument('--output_onnx_dir', - default="onnx_model", - help='output onnx encoder and decoder directory') - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - # arguments for streaming encoder - parser.add_argument('--streaming', - action='store_true', - help="whether to export streaming encoder, default false") - parser.add_argument('--decoding_chunk_size', - default=16, - type=int, - required=False, - help='the decoding chunk size, <=0 is not supported') - parser.add_argument('--num_decoding_left_chunks', - default=5, - type=int, - required=False, - help="number of left chunks, <= 0 is not supported") - parser.add_argument('--decoder_fastertransformer', - action='store_true', - help='return decoder_out and best_index for ft') - args = parser.parse_args() - - torch.manual_seed(0) - torch.set_printoptions(precision=10) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if 
args.cmvn_file and os.path.exists(args.cmvn_file): - configs['cmvn_file'] = args.cmvn_file - if args.reverse_weight != -1.0 and 'reverse_weight' in configs['model_conf']: - configs['model_conf']['reverse_weight'] = args.reverse_weight - print("Update reverse weight to", args.reverse_weight) - if args.ctc_weight != -1: - print("Update ctc weight to ", args.ctc_weight) - configs['model_conf']['ctc_weight'] = args.ctc_weight - configs["encoder_conf"]["use_dynamic_chunk"] = False - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - - if not os.path.exists(args.output_onnx_dir): - os.mkdir(args.output_onnx_dir) - encoder_onnx_path = os.path.join(args.output_onnx_dir, 'encoder.onnx') - export_enc_func = None - if args.streaming: - assert args.decoding_chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - export_enc_func = export_online_encoder - else: - export_enc_func = export_offline_encoder_static - - onnx_config = export_enc_func( - model, configs, args, logger, encoder_onnx_path) - - decoder_onnx_path = os.path.join(args.output_onnx_dir, 'decoder.onnx') - export_rescoring_decoder(model, configs, args, logger, - decoder_onnx_path, args.decoder_fastertransformer) - - if args.fp16: - try: - import onnxmltools - from onnxmltools.utils.float16_converter import convert_float_to_float16 - except ImportError: - print('Please install onnxmltools!') - sys.exit(1) - encoder_onnx_model = onnxmltools.utils.load_model(encoder_onnx_path) - encoder_onnx_model = convert_float_to_float16(encoder_onnx_model) - encoder_onnx_path = os.path.join( - args.output_onnx_dir, 'encoder_fp16.onnx') - onnxmltools.utils.save_model(encoder_onnx_model, encoder_onnx_path) - decoder_onnx_model = onnxmltools.utils.load_model(decoder_onnx_path) - decoder_onnx_model = convert_float_to_float16(decoder_onnx_model) - decoder_onnx_path = os.path.join( - args.output_onnx_dir, 'decoder_fp16.onnx') - onnxmltools.utils.save_model(decoder_onnx_model, decoder_onnx_path) - # dump configurations - - config_dir = os.path.join(args.output_onnx_dir, "config.yaml") - with open(config_dir, "w") as out: - yaml.dump(onnx_config, out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/recognize.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/recognize.py deleted file mode 100644 index 03b5dfd42cc098efacd20e08756a5300f6477cc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/recognize.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--beam_size', - type=int, - default=10, - help='beam size for search') - parser.add_argument('--penalty', - type=float, - default=0.0, - help='length penalty') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=16, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'attention', 'ctc_greedy_search', - 'ctc_prefix_beam_search', 'attention_rescoring', - 'rnnt_greedy_search', 'rnnt_beam_search', - 'rnnt_beam_attn_rescoring', 'ctc_beam_td_attn_rescoring', - 'hlg_onebest', 'hlg_rescore' - ], - default='attention', - help='decoding mode') - - parser.add_argument('--search_ctc_weight', - type=float, - default=1.0, - help='ctc weight for nbest generation') - parser.add_argument('--search_transducer_weight', - type=float, - default=0.0, - help='transducer weight for nbest generation') - parser.add_argument('--ctc_weight', - type=float, - default=0.0, - help='ctc weight for rescoring weight in \ - attention rescoring decode mode \ - ctc weight for rescoring weight in \ - transducer attention rescore decode mode') - - parser.add_argument('--transducer_weight', - type=float, - default=0.0, - help='transducer weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--attn_weight', - type=float, - default=0.0, - help='attention weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--decoding_chunk_size', - type=int, - default=-1, - help='''decoding chunk size, - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here''') - parser.add_argument('--num_decoding_left_chunks', - type=int, - default=-1, - help='number of left chunks for decoding') - parser.add_argument('--simulate_streaming', - action='store_true', - help='simulate streaming inference') - parser.add_argument('--reverse_weight', - type=float, - default=0.0, - help='''right to left weight for attention rescoring - decode mode''') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--connect_symbol', - default='', - type=str, - help='used to connect the output characters') - - parser.add_argument('--word', - default='', - type=str, - help='word file, only used for hlg decode') - parser.add_argument('--hlg', - default='', - type=str, - help='hlg file, only used for hlg decode') - parser.add_argument('--lm_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--r_decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' - ] and args.batch_size > 1: - logging.fatal( - 'decoding mode {} must be running with batch_size == 1'.format( - args.mode)) - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_sub'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - if 'fbank_conf' in test_conf: - test_conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in test_conf: - test_conf['mfcc_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - # Load dict - char_dict = {v: k for k, v in symbol_table.items()} - eos = len(char_dict) - 1 - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w') as fout: - for batch_idx, 
batch in enumerate(test_data_loader): - keys, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - if args.mode == 'attention': - hyps, _ = model.recognize( - feats, - feats_lengths, - beam_size=args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp.tolist() for hyp in hyps] - elif args.mode == 'ctc_greedy_search': - hyps, _ = model.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_greedy_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_beam_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.beam_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.search_ctc_weight, - transducer_weight=args.search_transducer_weight) - elif args.mode == 'rnnt_beam_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight) - elif args.mode == 'ctc_beam_td_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight, - beam_search_type='ctc') - # ctc_prefix_beam_search and attention_rescoring only return one - # result in List[int], change it to List[List[int]] for compatible - # with other batch decoding mode - elif args.mode == 'ctc_prefix_beam_search': - assert (feats.size(0) == 1) - hyp, _ = model.ctc_prefix_beam_search( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp] - elif args.mode == 'attention_rescoring': - assert (feats.size(0) == 1) - hyp, _ = model.attention_rescoring( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - 
ctc_weight=args.ctc_weight, - simulate_streaming=args.simulate_streaming, - reverse_weight=args.reverse_weight) - hyps = [hyp] - elif args.mode == 'hlg_onebest': - hyps = model.hlg_onebest( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - elif args.mode == 'hlg_rescore': - hyps = model.hlg_rescore( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - lm_scale=args.lm_scale, - decoder_scale=args.decoder_scale, - r_decoder_scale=args.r_decoder_scale, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - for i, key in enumerate(keys): - content = [] - for w in hyps[i]: - if w == eos: - break - content.append(char_dict[w]) - logging.info('{} {}'.format(key, args.connect_symbol.join(content))) - fout.write('{} {}\n'.format(key, args.connect_symbol.join(content))) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/recognize_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/recognize_onnx_gpu.py deleted file mode 100644 index 42f403bf55ac0bc51d9c754574d3479345948122..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/recognize_onnx_gpu.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is for testing exported onnx encoder and decoder from -export_onnx_gpu.py. The exported onnx models only support batch offline ASR inference. -It requires a python wrapped c++ ctc decoder. 
-Please install it by following: -https://github.com/Slyne/ctc_decoder.git -""" -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.common import IGNORE_ID -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.config import override_config - -import onnxruntime as rt -import multiprocessing -import numpy as np - -try: - from swig_decoders import map_batch, \ - ctc_beam_search_decoder_batch, \ - TrieVector, PathTrie -except ImportError: - print('Please install ctc decoders first by refering to\n' + - 'https://github.com/Slyne/ctc_decoder.git') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--encoder_onnx', required=True, help='encoder onnx file') - parser.add_argument('--decoder_onnx', required=True, help='decoder onnx file') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=32, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'ctc_greedy_search', 'ctc_prefix_beam_search', - 'attention_rescoring'], - default='attention_rescoring', - help='decoding mode') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - reverse_weight = configs["model_conf"].get("reverse_weight", 0.0) - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - test_conf['fbank_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - 
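Before the ONNX sessions are created below, it may help to see what the `ctc_greedy_search` branch effectively computes per utterance once the encoder's top-1 token ids are available. This is a plain-Python sketch of the collapse that the `map_batch(..., True, 0)` call below is assumed to perform before mapping ids to characters (blank id taken as 0):

```python
# Illustration only: standard CTC best-path decoding of one utterance's
# top-1 token-id sequence, without the swig ctc_decoder extension.
from typing import List

def ctc_greedy_collapse(token_ids: List[int], blank_id: int = 0) -> List[int]:
    """Merge consecutive repeated tokens, then drop blanks."""
    out, prev = [], None
    for t in token_ids:
        if t != prev:          # keep only the first of each repeated run
            out.append(t)
        prev = t
    return [t for t in out if t != blank_id]

# e.g. ctc_greedy_collapse([0, 5, 5, 0, 7, 7, 7, 0]) -> [5, 7]
```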
- # Init asr model from configs - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - if use_cuda: - EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - else: - EP_list = ['CPUExecutionProvider'] - - encoder_ort_session = rt.InferenceSession(args.encoder_onnx, providers=EP_list) - decoder_ort_session = None - if args.mode == "attention_rescoring": - decoder_ort_session = rt.InferenceSession(args.decoder_onnx, providers=EP_list) - - # Load dict - vocabulary = [] - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - vocabulary.append(arr[0]) - eos = sos = len(char_dict) - 1 - with torch.no_grad(), open(args.result_file, 'w') as fout: - for _, batch in enumerate(test_data_loader): - keys, feats, _, feats_lengths, _ = batch - feats, feats_lengths = feats.numpy(), feats_lengths.numpy() - if args.fp16: - feats = feats.astype(np.float16) - ort_inputs = { - encoder_ort_session.get_inputs()[0].name: feats, - encoder_ort_session.get_inputs()[1].name: feats_lengths} - ort_outs = encoder_ort_session.run(None, ort_inputs) - encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx = ort_outs - beam_size = beam_log_probs.shape[-1] - batch_size = beam_log_probs.shape[0] - num_processes = min(multiprocessing.cpu_count(), batch_size) - if args.mode == 'ctc_greedy_search': - if beam_size != 1: - log_probs_idx = beam_log_probs_idx[:, :, 0] - batch_sents = [] - for idx, seq in enumerate(log_probs_idx): - batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) - hyps = map_batch(batch_sents, vocabulary, num_processes, - True, 0) - elif args.mode in ('ctc_prefix_beam_search', "attention_rescoring"): - batch_log_probs_seq_list = beam_log_probs.tolist() - batch_log_probs_idx_list = beam_log_probs_idx.tolist() - batch_len_list = encoder_out_lens.tolist() - batch_log_probs_seq = [] - batch_log_probs_ids = [] - batch_start = [] # only effective in streaming deployment - batch_root = TrieVector() - root_dict = {} - for i in range(len(batch_len_list)): - num_sent = batch_len_list[i] - batch_log_probs_seq.append( - batch_log_probs_seq_list[i][0:num_sent]) - batch_log_probs_ids.append( - batch_log_probs_idx_list[i][0:num_sent]) - root_dict[i] = PathTrie() - batch_root.append(root_dict[i]) - batch_start.append(True) - score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq, - batch_log_probs_ids, - batch_root, - batch_start, - beam_size, - num_processes, - 0, -2, 0.99999) - if args.mode == 'ctc_prefix_beam_search': - hyps = [] - for cand_hyps in score_hyps: - hyps.append(cand_hyps[0][1]) - hyps = map_batch(hyps, vocabulary, num_processes, False, 0) - if args.mode == 'attention_rescoring': - ctc_score, all_hyps = [], [] - max_len = 0 - for hyps in score_hyps: - cur_len = len(hyps) - if len(hyps) < beam_size: - hyps += (beam_size - cur_len) * [(-float("INF"), (0,))] - cur_ctc_score = [] - for hyp in hyps: - cur_ctc_score.append(hyp[0]) - all_hyps.append(list(hyp[1])) - if len(hyp[1]) > max_len: - max_len = len(hyp[1]) - ctc_score.append(cur_ctc_score) - if args.fp16: - ctc_score = np.array(ctc_score, dtype=np.float16) - else: - ctc_score = np.array(ctc_score, dtype=np.float32) - hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - r_hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32) - k = 0 - for i in 
range(batch_size): - for j in range(beam_size): - cand = all_hyps[k] - l = len(cand) + 2 - hyps_pad_sos_eos[i][j][0:l] = [sos] + cand + [eos] - r_hyps_pad_sos_eos[i][j][0:l] = [sos] + cand[::-1] + [eos] - hyps_lens_sos[i][j] = len(cand) + 1 - k += 1 - decoder_ort_inputs = { - decoder_ort_session.get_inputs()[0].name: encoder_out, - decoder_ort_session.get_inputs()[1].name: encoder_out_lens, - decoder_ort_session.get_inputs()[2].name: hyps_pad_sos_eos, - decoder_ort_session.get_inputs()[3].name: hyps_lens_sos, - decoder_ort_session.get_inputs()[-1].name: ctc_score} - if reverse_weight > 0: - r_hyps_pad_sos_eos_name = decoder_ort_session.get_inputs()[4].name - decoder_ort_inputs[r_hyps_pad_sos_eos_name] = r_hyps_pad_sos_eos - best_index = decoder_ort_session.run(None, decoder_ort_inputs)[0] - best_sents = [] - k = 0 - for idx in best_index: - cur_best_sent = all_hyps[k: k + beam_size][idx] - best_sents.append(cur_best_sent) - k += beam_size - hyps = map_batch(best_sents, vocabulary, num_processes) - - for i, key in enumerate(keys): - content = hyps[i] - logging.info('{} {}'.format(key, content)) - fout.write('{} {}\n'.format(key, content)) - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/train.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/train.py deleted file mode 100644 index 70799b60790b31d73911770891f519f5473e2f4b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/bin/train.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
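To make the attention-rescoring input layout built above concrete, here is a toy sketch of the padded hypothesis tensors fed to the decoder ONNX graph (the sos/eos id and token ids are illustrative; IGNORE_ID is -1 in WeNet):

```python
# Each candidate is framed as [sos] + tokens + [eos] (reversed for r_hyps_*),
# right-padded with IGNORE_ID; hyps_lens_sos counts sos + tokens, not eos.
import numpy as np

IGNORE_ID = -1          # padding value used by WeNet
sos = eos = 4232        # last symbol-table entry (illustrative value)
cands = [[10, 11, 12], [10, 11]]     # beam_size = 2 hypotheses, batch = 1
max_len = max(len(c) for c in cands)

hyps_pad_sos_eos = np.full((1, len(cands), max_len + 2), IGNORE_ID, dtype=np.int64)
r_hyps_pad_sos_eos = np.full_like(hyps_pad_sos_eos, IGNORE_ID)
hyps_lens_sos = np.zeros((1, len(cands)), dtype=np.int32)

for j, cand in enumerate(cands):
    hyps_pad_sos_eos[0, j, :len(cand) + 2] = [sos] + cand + [eos]
    r_hyps_pad_sos_eos[0, j, :len(cand) + 2] = [sos] + cand[::-1] + [eos]
    hyps_lens_sos[0, j] = len(cand) + 1

# hyps_pad_sos_eos[0] -> [[4232 10 11 12 4232], [4232 10 11 4232 -1]]
# hyps_lens_sos[0]    -> [4, 3]
```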
- -from __future__ import print_function - -import argparse -import copy -import logging -import os - -import torch -import torch.distributed as dist -import torch.optim as optim -import yaml -from tensorboardX import SummaryWriter -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import (load_checkpoint, save_checkpoint, - load_trained_modules) -from wenet.utils.executor import Executor -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.scheduler import WarmupLR, NoamHoldAnnealing -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--train_data', required=True, help='train data file') - parser.add_argument('--cv_data', required=True, help='cv data file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--model_dir', required=True, help='save model dir') - parser.add_argument('--checkpoint', help='checkpoint model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='tensorboard log dir') - parser.add_argument('--ddp.rank', - dest='rank', - default=0, - type=int, - help='global rank for distributed training') - parser.add_argument('--ddp.world_size', - dest='world_size', - default=-1, - type=int, - help='''number of total processes/gpus for - distributed training''') - parser.add_argument('--ddp.dist_backend', - dest='dist_backend', - default='nccl', - choices=['nccl', 'gloo'], - help='distributed backend') - parser.add_argument('--ddp.init_method', - dest='init_method', - default=None, - help='ddp init method') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for reading') - parser.add_argument('--pin_memory', - action='store_true', - default=False, - help='Use pinned memory buffers used for reading') - parser.add_argument('--use_amp', - action='store_true', - default=False, - help='Use automatic mixed precision training') - parser.add_argument('--fp16_grad_sync', - action='store_true', - default=False, - help='Use fp16 gradient sync for ddp') - parser.add_argument('--cmvn', default=None, help='global cmvn file') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. 
One symbol per line.") - parser.add_argument('--prefetch', - default=100, - type=int, - help='prefetch number') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument("--enc_init", - default=None, - type=str, - help="Pre-trained model to initialize encoder") - parser.add_argument("--enc_init_mods", - default="encoder.", - type=lambda s: [str(mod) for mod in s.split(",") if s != ""], - help="List of encoder modules \ - to initialize ,separated by a comma") - - - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Set random seed - torch.manual_seed(777) - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - distributed = args.world_size > 1 - if distributed: - logging.info('training on multiple gpus, this gpu {}'.format(args.gpu)) - dist.init_process_group(args.dist_backend, - init_method=args.init_method, - world_size=args.world_size, - rank=args.rank) - - symbol_table = read_symbol_table(args.symbol_table) - - train_conf = configs['dataset_conf'] - cv_conf = copy.deepcopy(train_conf) - cv_conf['speed_perturb'] = False - cv_conf['spec_aug'] = False - cv_conf['spec_sub'] = False - cv_conf['spec_trim'] = False - cv_conf['shuffle'] = False - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - train_dataset = Dataset(args.data_type, args.train_data, symbol_table, - train_conf, args.bpe_model, non_lang_syms, True) - cv_dataset = Dataset(args.data_type, - args.cv_data, - symbol_table, - cv_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - train_data_loader = DataLoader(train_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - cv_data_loader = DataLoader(cv_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - - if 'fbank_conf' in configs['dataset_conf']: - input_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - else: - input_dim = configs['dataset_conf']['mfcc_conf']['num_mel_bins'] - vocab_size = len(symbol_table) - - # Save configs to model_dir/train.yaml for inference and export - configs['input_dim'] = input_dim - configs['output_dim'] = vocab_size - configs['cmvn_file'] = args.cmvn - configs['is_json_cmvn'] = True - if args.rank == 0: - saved_config_path = os.path.join(args.model_dir, 'train.yaml') - with open(saved_config_path, 'w') as fout: - data = yaml.dump(configs) - fout.write(data) - - # Init asr model from configs - model = init_model(configs) - print(model) - num_params = sum(p.numel() for p in model.parameters()) - print('the number of model params: {:,d}'.format(num_params)) - - # !!!IMPORTANT!!! 
- # Try to export the model by script, if fails, we should refine - # the code to satisfy the script export requirements - if args.rank == 0: - script_model = torch.jit.script(model) - script_model.save(os.path.join(args.model_dir, 'init.zip')) - executor = Executor() - # If specify checkpoint, load some info from checkpoint - if args.checkpoint is not None: - infos = load_checkpoint(model, args.checkpoint) - elif args.enc_init is not None: - logging.info('load pretrained encoders: {}'.format(args.enc_init)) - infos = load_trained_modules(model, args) - else: - infos = {} - start_epoch = infos.get('epoch', -1) + 1 - cv_loss = infos.get('cv_loss', 0.0) - step = infos.get('step', -1) - - num_epochs = configs.get('max_epoch', 100) - model_dir = args.model_dir - writer = None - if args.rank == 0: - os.makedirs(model_dir, exist_ok=True) - exp_id = os.path.basename(model_dir) - writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id)) - - if distributed: - assert (torch.cuda.is_available()) - # cuda model is required for nn.parallel.DistributedDataParallel - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, find_unused_parameters=True) - device = torch.device("cuda") - if args.fp16_grad_sync: - from torch.distributed.algorithms.ddp_comm_hooks import ( - default as comm_hooks, - ) - model.register_comm_hook( - state=None, hook=comm_hooks.fp16_compress_hook - ) - else: - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - if configs['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['optim_conf']) - elif configs['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['optim']) - if configs['scheduler'] == 'warmuplr': - scheduler = WarmupLR(optimizer, **configs['scheduler_conf']) - elif configs['scheduler'] == 'NoamHoldAnnealing': - scheduler = NoamHoldAnnealing(optimizer, **configs['scheduler_conf']) - else: - raise ValueError("unknown scheduler: " + configs['scheduler']) - - final_epoch = None - configs['rank'] = args.rank - configs['is_distributed'] = distributed - configs['use_amp'] = args.use_amp - if start_epoch == 0 and args.rank == 0: - save_model_path = os.path.join(model_dir, 'init.pt') - save_checkpoint(model, save_model_path) - - # Start training loop - executor.step = step - scheduler.set_step(step) - # used for pytorch amp mixed precision training - scaler = None - if args.use_amp: - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(start_epoch, num_epochs): - train_dataset.set_epoch(epoch) - configs['epoch'] = epoch - lr = optimizer.param_groups[0]['lr'] - logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr)) - executor.train(model, optimizer, scheduler, train_data_loader, device, - writer, configs, scaler) - total_loss, num_seen_utts = executor.cv(model, cv_data_loader, device, - configs) - cv_loss = total_loss / num_seen_utts - - logging.info('Epoch {} CV info cv_loss {}'.format(epoch, cv_loss)) - if args.rank == 0: - save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch)) - save_checkpoint( - model, save_model_path, { - 'epoch': epoch, - 'lr': lr, - 'cv_loss': cv_loss, - 'step': executor.step - }) - writer.add_scalar('epoch/cv_loss', cv_loss, epoch) - writer.add_scalar('epoch/lr', lr, epoch) - final_epoch = epoch - - if final_epoch is not None and args.rank == 0: - final_model_path = os.path.join(model_dir, 'final.pt') 
- os.remove(final_model_path) if os.path.exists(final_model_path) else None - os.symlink('{}.pt'.format(final_epoch), final_model_path) - writer.close() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/dataset/dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/dataset/dataset.py deleted file mode 100644 index 6d799b5b5aea2d34546484b3fed5d45e2d5b6aa6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/dataset/dataset.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import torch -import torch.distributed as dist -from torch.utils.data import IterableDataset - -import wenet.dataset.processor as processor -from wenet.utils.file_utils import read_lists - - -class Processor(IterableDataset): - def __init__(self, source, f, *args, **kw): - assert callable(f) - self.source = source - self.f = f - self.args = args - self.kw = kw - - def set_epoch(self, epoch): - self.source.set_epoch(epoch) - - def __iter__(self): - """ Return an iterator over the source dataset processed by the - given processor. 
- """ - assert self.source is not None - assert callable(self.f) - return self.f(iter(self.source), *self.args, **self.kw) - - def apply(self, f): - assert callable(f) - return Processor(self, f, *self.args, **self.kw) - - -class DistributedSampler: - def __init__(self, shuffle=True, partition=True): - self.epoch = -1 - self.update() - self.shuffle = shuffle - self.partition = partition - - def update(self): - assert dist.is_available() - if dist.is_initialized(): - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() - else: - self.rank = 0 - self.world_size = 1 - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: - self.worker_id = 0 - self.num_workers = 1 - else: - self.worker_id = worker_info.id - self.num_workers = worker_info.num_workers - return dict(rank=self.rank, - world_size=self.world_size, - worker_id=self.worker_id, - num_workers=self.num_workers) - - def set_epoch(self, epoch): - self.epoch = epoch - - def sample(self, data): - """ Sample data according to rank/world_size/num_workers - - Args: - data(List): input data list - - Returns: - List: data list after sample - """ - data = list(range(len(data))) - # TODO(Binbin Zhang): fix this - # We can not handle uneven data for CV on DDP, so we don't - # sample data by rank, that means every GPU gets the same - # and all the CV data - if self.partition: - if self.shuffle: - random.Random(self.epoch).shuffle(data) - data = data[self.rank::self.world_size] - data = data[self.worker_id::self.num_workers] - return data - - -class DataList(IterableDataset): - def __init__(self, lists, shuffle=True, partition=True): - self.lists = lists - self.sampler = DistributedSampler(shuffle, partition) - - def set_epoch(self, epoch): - self.sampler.set_epoch(epoch) - - def __iter__(self): - sampler_info = self.sampler.update() - indexes = self.sampler.sample(self.lists) - for index in indexes: - # yield dict(src=src) - data = dict(src=self.lists[index]) - data.update(sampler_info) - yield data - - -def Dataset(data_type, - data_list_file, - symbol_table, - conf, - bpe_model=None, - non_lang_syms=None, - partition=True): - """ Construct dataset from arguments - - We have two shuffle stage in the Dataset. The first is global - shuffle at shards tar/raw file level. The second is global shuffle - at training samples level. 
- - Args: - data_type(str): raw/shard - bpe_model(str): model for english bpe part - partition(bool): whether to do data partition in terms of rank - """ - assert data_type in ['raw', 'shard'] - lists = read_lists(data_list_file) - shuffle = conf.get('shuffle', True) - dataset = DataList(lists, shuffle=shuffle, partition=partition) - if data_type == 'shard': - dataset = Processor(dataset, processor.url_opener) - dataset = Processor(dataset, processor.tar_file_and_group) - else: - dataset = Processor(dataset, processor.parse_raw) - - dataset = Processor(dataset, processor.tokenize, symbol_table, bpe_model, - non_lang_syms, conf.get('split_with_space', False)) - filter_conf = conf.get('filter_conf', {}) - dataset = Processor(dataset, processor.filter, **filter_conf) - - resample_conf = conf.get('resample_conf', {}) - dataset = Processor(dataset, processor.resample, **resample_conf) - - speed_perturb = conf.get('speed_perturb', False) - if speed_perturb: - dataset = Processor(dataset, processor.speed_perturb) - - feats_type = conf.get('feats_type', 'fbank') - assert feats_type in ['fbank', 'mfcc'] - if feats_type == 'fbank': - fbank_conf = conf.get('fbank_conf', {}) - dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) - elif feats_type == 'mfcc': - mfcc_conf = conf.get('mfcc_conf', {}) - dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf) - - spec_aug = conf.get('spec_aug', True) - spec_sub = conf.get('spec_sub', False) - spec_trim = conf.get('spec_trim', False) - if spec_aug: - spec_aug_conf = conf.get('spec_aug_conf', {}) - dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) - if spec_sub: - spec_sub_conf = conf.get('spec_sub_conf', {}) - dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf) - if spec_trim: - spec_trim_conf = conf.get('spec_trim_conf', {}) - dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf) - - if shuffle: - shuffle_conf = conf.get('shuffle_conf', {}) - dataset = Processor(dataset, processor.shuffle, **shuffle_conf) - - sort = conf.get('sort', True) - if sort: - sort_conf = conf.get('sort_conf', {}) - dataset = Processor(dataset, processor.sort, **sort_conf) - - batch_conf = conf.get('batch_conf', {}) - dataset = Processor(dataset, processor.batch, **batch_conf) - dataset = Processor(dataset, processor.padding) - return dataset diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/dataset/kaldi_io.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/dataset/kaldi_io.py deleted file mode 100644 index c9bef293c93d882147bb5b738e1fc49a7a19a484..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/dataset/kaldi_io.py +++ /dev/null @@ -1,666 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! 
To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. - """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. 
- """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int_scp(file_or_fd): - """ generator(key,vec) = read_vec_int_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_int_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:vec for key,mat in kaldi_io.read_vec_int_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_int(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! 
- if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Mapping for percentiles in col-headers, - def uint16_to_float(value, min, range): - return np.float32(min + range * 1.52590218966964e-05 * value) - - # Mapping for matrix elements, - def uint8_to_float_v2(vec, p0, p25, p75, p100): - # Split the vector by masks, - mask_0_64 = (vec <= 64); - mask_193_255 = (vec > 192); - mask_65_192 = (~(mask_0_64 | mask_193_255)); - # Sanity check (useful but slow...), - # assert(len(vec) == np.sum(np.hstack([mask_0_64,mask_65_192,mask_193_255]))) - # assert(len(vec) == np.sum(np.any([mask_0_64,mask_65_192,mask_193_255], axis=0))) - # Build the float vector, - ans = np.empty(len(vec), dtype='float32') - ans[mask_0_64] = p0 + (p25 - p0) / 64. * vec[mask_0_64] - ans[mask_65_192] = p25 + (p75 - p25) / 128. * (vec[mask_65_192] - 64) - ans[mask_193_255] = p75 + (p100 - p75) / 63. * (vec[mask_193_255] - 192) - return ans - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.empty((cols,rows), dtype='float32') - for i, col_header in enumerate(col_headers): - col_header_flt = [ uint16_to_float(percentile, globmin, globrange) for percentile in col_header ] - mat[i] = uint8_to_float_v2(data[i], *col_header_flt) - - return mat.T # transpose! col-major -> row-major, - -def write_ark_scp(key, mat, ark_fout, scp_out): - mat_offset = write_mat(ark_fout, mat, key) - scp_line = '{}\t{}:{}'.format(key, ark_fout.name, mat_offset) - scp_out.write(scp_line) - scp_out.write('\n') - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. 
- - Example of writing single matrix: - kaldi_io.write_mat(filename, mat) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,mat in dict.iteritems(): - kaldi_io.write_mat(f, mat, key=key) - """ - mat_offset = 0 - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - mat_offset = fd.tell() - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if m.dtype == 'float32': fd.write('FM '.encode()) - elif m.dtype == 'float64': fd.write('DM '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) - # Dims, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols - # Data, - fd.write(m.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - return mat_offset - -################################################# -# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) -# Corresponds to: vector > > -# - outer vector: time axis -# - inner vector: records at the time -# - tuple: int = index, float = value -# - -def read_cnet_ark(file_or_fd): - """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ - return read_post_ark(file_or_fd) - -def read_post_ark(file_or_fd): - """ generator(key,vec>) = read_post_ark(file) - Returns generator of (key,posterior) tuples, read from ark file. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Iterate the ark: - for key,post in kaldi_io.read_post_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - post = read_post(fd) - yield key, post - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_post(file_or_fd): - """ [post] = read_post(file_or_fd) - Reads single kaldi 'Posterior' in binary format. - - The 'Posterior' is C++ type 'vector > >', - the outer-vector is usually time axis, inner-vector are the records - at given time, and the tuple is composed of an 'index' (integer) - and a 'float-value'. The 'float-value' can represent a probability - or any other numeric value. - - Returns vector of vectors of tuples. - """ - fd = open_or_fd(file_or_fd) - ans=[] - binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - # Loop over 'outer-vector', - for i in range(outer_vec_size): - assert(fd.read(1).decode() == '\4'); # int-size - inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) - data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) - assert(data[0]['size_idx'] == 4) - assert(data[0]['size_post'] == 4) - ans.append(data[['idx','post']].tolist()) - - if fd is not file_or_fd: fd.close() - return ans - - -################################################# -# Kaldi Confusion Network bin begin/end times, -# (kaldi stores CNs time info separately from the Posterior). 
-# - -def read_cntime_ark(file_or_fd): - """ generator(key,vec>) = read_cntime_ark(file_or_fd) - Returns generator of (key,cntime) tuples, read from ark file. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Iterate the ark: - for key,time in kaldi_io.read_cntime_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:time for key,time in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - cntime = read_cntime(fd) - yield key, cntime - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_cntime(file_or_fd): - """ [cntime] = read_cntime(file_or_fd) - Reads single kaldi 'Confusion Network time info', in binary format: - C++ type: vector >. - (begin/end times of bins at the confusion network). - - Binary layout is ' ...' - - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Returns vector of tuples. - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary - - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) - assert(data[0]['size_beg'] == 4) - assert(data[0]['size_end'] == 4) - ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), - - if fd is not file_or_fd : fd.close() - return ans - - -################################################# -# Segments related, -# - -# Segments as 'Bool vectors' can be handy, -# - for 'superposing' the segmentations, -# - for frame-selection in Speaker-ID experiments, -def read_segments_as_bool_vec(segments_file): - """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) - using kaldi 'segments' file for 1 wav, format : ' ' - - t-beg, t-end is in seconds, - - assumed 100 frames/second, - """ - segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) - # Sanity checks, - assert(len(segs) > 0) # empty segmentation is an error, - assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, - # Convert time to frame-indexes, - start = np.rint([100 * rec[2] for rec in segs]).astype(int) - end = np.rint([100 * rec[3] for rec in segs]).astype(int) - # Taken from 'read_lab_to_bool_vec', htk.py, - frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], - np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) - assert np.sum(end-start) == np.sum(frms) - return frms - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/dataset/processor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/dataset/processor.py deleted file mode 100644 index b4bd07ce674eb3288cd1b13a09085eec48d40845..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/dataset/processor.py +++ /dev/null @@ -1,660 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import json -import random -import re -import tarfile -from subprocess import PIPE, Popen -from urllib.parse import urlparse - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.nn.utils.rnn import pad_sequence - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def url_opener(data): - """ Give url or local file, return file descriptor - Inplace operation. - - Args: - data(Iterable[str]): url or local file list - - Returns: - Iterable[{src, stream}] - """ - for sample in data: - assert 'src' in sample - # TODO(Binbin Zhang): support HTTP - url = sample['src'] - try: - pr = urlparse(url) - # local file - if pr.scheme == '' or pr.scheme == 'file': - stream = open(url, 'rb') - # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP - else: - cmd = f'wget -q -O - {url}' - process = Popen(cmd, shell=True, stdout=PIPE) - sample.update(process=process) - stream = process.stdout - sample.update(stream=stream) - yield sample - except Exception as ex: - logging.warning('Failed to open {}'.format(url)) - - -def tar_file_and_group(data): - """ Expand a stream of open tar files into a stream of tar file contents. - And groups the file with same prefix - - Args: - data: Iterable[{src, stream}] - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'stream' in sample - stream = tarfile.open(fileobj=sample['stream'], mode="r|*") - prev_prefix = None - example = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - example['key'] = prev_prefix - if valid: - yield example - example = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - example['txt'] = file_obj.read().decode('utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load(file_obj) - example['wav'] = waveform - example['sample_rate'] = sample_rate - else: - example[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning('error to parse {}'.format(name)) - prev_prefix = prefix - if prev_prefix is not None: - example['key'] = prev_prefix - yield example - stream.close() - if 'process' in sample: - sample['process'].communicate() - sample['stream'].close() - - -def parse_raw(data): - """ Parse key/wav/txt from json line - - Args: - data: Iterable[str], str is a json line has key/wav/txt - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'src' in sample - json_line = sample['src'] - obj = json.loads(json_line) - assert 'key' in obj - assert 'wav' in obj - assert 'txt' in obj - key = obj['key'] - wav_file = obj['wav'] - txt = obj['txt'] - try: - if 'start' in obj: - assert 'end' in obj - sample_rate = torchaudio.backend.sox_io_backend.info( - wav_file).sample_rate - start_frame = int(obj['start'] * sample_rate) - end_frame = int(obj['end'] * sample_rate) - waveform, _ = torchaudio.backend.sox_io_backend.load( - 
filepath=wav_file, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(wav_file) - example = dict(key=key, - txt=txt, - wav=waveform, - sample_rate=sample_rate) - yield example - except Exception as ex: - logging.warning('Failed to read {}'.format(wav_file)) - - -def filter(data, - max_length=10240, - min_length=10, - token_max_length=200, - token_min_length=1, - min_output_input_ratio=0.0005, - max_output_input_ratio=1): - """ Filter sample according to feature and label length - Inplace operation. - - Args:: - data: Iterable[{key, wav, label, sample_rate}] - max_length: drop utterance which is greater than max_length(10ms) - min_length: drop utterance which is less than min_length(10ms) - token_max_length: drop utterance which is greater than - token_max_length, especially when use char unit for - english modeling - token_min_length: drop utterance which is - less than token_max_length - min_output_input_ratio: minimal ration of - token_length / feats_length(10ms) - max_output_input_ratio: maximum ration of - token_length / feats_length(10ms) - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'label' in sample - # sample['wav'] is torch.Tensor, we have 100 frames every second - num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100 - if num_frames < min_length: - continue - if num_frames > max_length: - continue - if len(sample['label']) < token_min_length: - continue - if len(sample['label']) > token_max_length: - continue - if num_frames != 0: - if len(sample['label']) / num_frames < min_output_input_ratio: - continue - if len(sample['label']) / num_frames > max_output_input_ratio: - continue - yield sample - - -def resample(data, resample_rate=16000): - """ Resample data. - Inplace operation. - - Args: - data: Iterable[{key, wav, label, sample_rate}] - resample_rate: target resample rate - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - if sample_rate != resample_rate: - sample['sample_rate'] = resample_rate - sample['wav'] = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - yield sample - - -def speed_perturb(data, speeds=None): - """ Apply speed perturb to the data. - Inplace operation. 
- - Args: - data: Iterable[{key, wav, label, sample_rate}] - speeds(List[float]): optional speed - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - if speeds is None: - speeds = [0.9, 1.0, 1.1] - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - speed = random.choice(speeds) - if speed != 1.0: - wav, _ = torchaudio.sox_effects.apply_effects_tensor( - waveform, sample_rate, - [['speed', str(speed)], ['rate', str(sample_rate)]]) - sample['wav'] = wav - - yield sample - - -def compute_fbank(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0): - """ Extract fbank - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - energy_floor=0.0, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def compute_mfcc(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0, - num_ceps=40, - high_freq=0.0, - low_freq=20.0): - """ Extract mfcc - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.mfcc(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - num_ceps=num_ceps, - high_freq=high_freq, - low_freq=low_freq, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def __tokenize_by_bpe_model(sp, txt): - tokens = [] - # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - pattern = re.compile(r'([\u4e00-\u9fff])') - # Example: - # txt = "你好 ITS'S OKAY 的" - # chars = ["你", "好", " ITS'S OKAY ", "的"] - chars = pattern.split(txt.upper()) - mix_chars = [w for w in chars if len(w.strip()) > 0] - for ch_or_w in mix_chars: - # ch_or_w is a single CJK charater(i.e., "你"), do nothing. - if pattern.fullmatch(ch_or_w) is not None: - tokens.append(ch_or_w) - # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), - # encode ch_or_w using bpe_model. 
- else: - for p in sp.encode_as_pieces(ch_or_w): - tokens.append(p) - - return tokens - - -def tokenize(data, - symbol_table, - bpe_model=None, - non_lang_syms=None, - split_with_space=False): - """ Decode text to chars or BPE - Inplace operation - - Args: - data: Iterable[{key, wav, txt, sample_rate}] - - Returns: - Iterable[{key, wav, txt, tokens, label, sample_rate}] - """ - if non_lang_syms is not None: - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - else: - non_lang_syms = {} - non_lang_syms_pattern = None - - if bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(bpe_model) - else: - sp = None - - for sample in data: - assert 'txt' in sample - txt = sample['txt'].strip() - if non_lang_syms_pattern is not None: - parts = non_lang_syms_pattern.split(txt.upper()) - parts = [w for w in parts if len(w.strip()) > 0] - else: - parts = [txt] - - label = [] - tokens = [] - for part in parts: - if part in non_lang_syms: - tokens.append(part) - else: - if bpe_model is not None: - tokens.extend(__tokenize_by_bpe_model(sp, part)) - else: - if split_with_space: - part = part.split(" ") - for ch in part: - if ch == ' ': - ch = "▁" - tokens.append(ch) - - for ch in tokens: - if ch in symbol_table: - label.append(symbol_table[ch]) - elif '' in symbol_table: - label.append(symbol_table['']) - - sample['tokens'] = tokens - sample['label'] = label - yield sample - - -def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80): - """ Do spec augmentation - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - num_t_mask: number of time mask to apply - num_f_mask: number of freq mask to apply - max_t: max width of time mask - max_f: max width of freq mask - max_w: max width of time warp - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - max_freq = y.size(1) - # time mask - for i in range(num_t_mask): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - y[start:end, :] = 0 - # freq mask - for i in range(num_f_mask): - start = random.randint(0, max_freq - 1) - length = random.randint(1, max_f) - end = min(max_freq, start + length) - y[:, start:end] = 0 - sample['feat'] = y - yield sample - - -def spec_sub(data, max_t=20, num_t_sub=3): - """ Do spec substitute - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of time substitute - num_t_sub: number of time substitute to apply - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - for i in range(num_t_sub): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - # only substitute the earlier time chosen randomly for current time - pos = random.randint(0, start) - y[start:end, :] = x[start - pos:end - pos, :] - sample['feat'] = y - yield sample - - -def spec_trim(data, max_t=20): - """ Trim tailing frames. Inplace operation. 
- ref: TrimTail [https://arxiv.org/abs/2211.00522] - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of length trimming - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - max_frames = x.size(0) - length = random.randint(1, max_t) - if length < max_frames / 2: - y = x.clone().detach()[:max_frames - length] - sample['feat'] = y - yield sample - - -def shuffle(data, shuffle_size=10000): - """ Local shuffle the data - - Args: - data: Iterable[{key, feat, label}] - shuffle_size: buffer size for shuffle - - Returns: - Iterable[{key, feat, label}] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= shuffle_size: - random.shuffle(buf) - for x in buf: - yield x - buf = [] - # The sample left over - random.shuffle(buf) - for x in buf: - yield x - - -def sort(data, sort_size=500): - """ Sort the data by feature length. - Sort is used after shuffle and before batch, so we can group - utts with similar lengths into a batch, and `sort_size` should - be less than `shuffle_size` - - Args: - data: Iterable[{key, feat, label}] - sort_size: buffer size for sort - - Returns: - Iterable[{key, feat, label}] - """ - - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= sort_size: - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - buf = [] - # The sample left over - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - - -def static_batch(data, batch_size=16): - """ Static batch the data by `batch_size` - - Args: - data: Iterable[{key, feat, label}] - batch_size: batch size - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= batch_size: - yield buf - buf = [] - if len(buf) > 0: - yield buf - - -def dynamic_batch(data, max_frames_in_batch=12000): - """ Dynamic batch the data until the total frames in batch - reach `max_frames_in_batch` - - Args: - data: Iterable[{key, feat, label}] - max_frames_in_batch: max_frames in one batch - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - longest_frames = 0 - for sample in data: - assert 'feat' in sample - assert isinstance(sample['feat'], torch.Tensor) - new_sample_frames = sample['feat'].size(0) - longest_frames = max(longest_frames, new_sample_frames) - frames_after_padding = longest_frames * (len(buf) + 1) - if frames_after_padding > max_frames_in_batch: - yield buf - buf = [sample] - longest_frames = new_sample_frames - else: - buf.append(sample) - if len(buf) > 0: - yield buf - - -def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000): - """ Wrapper for static/dynamic batch - """ - if batch_type == 'static': - return static_batch(data, batch_size) - elif batch_type == 'dynamic': - return dynamic_batch(data, max_frames_in_batch) - else: - logging.fatal('Unsupported batch type {}'.format(batch_type)) - - -def padding(data): - """ Padding the data into training data - - Args: - data: Iterable[List[{key, feat, label}]] - - Returns: - Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] - """ - for sample in data: - assert isinstance(sample, list) - feats_length = torch.tensor([x['feat'].size(0) for x in sample], - dtype=torch.int32) - order = torch.argsort(feats_length, descending=True) - feats_lengths = torch.tensor( - [sample[i]['feat'].size(0) for i in order], dtype=torch.int32) - sorted_feats = [sample[i]['feat'] for i in order] - sorted_keys 
= [sample[i]['key'] for i in order] - sorted_labels = [ - torch.tensor(sample[i]['label'], dtype=torch.int64) for i in order - ] - label_lengths = torch.tensor([x.size(0) for x in sorted_labels], - dtype=torch.int32) - - padded_feats = pad_sequence(sorted_feats, - batch_first=True, - padding_value=0) - - pad = (0, 0, 0, 0) - seq_len= padded_feats.shape[1] - if seq_len < 384: - pad = (0, 0, 0, 384-seq_len) - elif seq_len < 512: - pad = (0, 0, 0, 512-seq_len) - elif seq_len < 640: - pad = (0, 0, 0, 640-seq_len) - elif seq_len < 768: - pad = (0, 0, 0, 768-seq_len) - elif seq_len < 896: - pad = (0, 0, 0, 896-seq_len) - elif seq_len < 1024: - pad = (0, 0, 0, 1024-seq_len) - elif seq_len < 1280: - pad = (0, 0, 0, 1280-seq_len) - padded_feats = torch.nn.functional.pad(padded_feats, pad, mode='constant', value=0) - padding_labels = pad_sequence(sorted_labels, - batch_first=True, - padding_value=-1) - - yield (sorted_keys, padded_feats, padding_labels, feats_lengths, - label_lengths) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/dataset/wav_distortion.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/dataset/wav_distortion.py deleted file mode 100644 index 2917d3cc6cfb801935cb0885d0c42cd88f1833b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/dataset/wav_distortion.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import random -import math - -import torchaudio -import torch -torchaudio.set_audio_backend("sox_io") - - -def db2amp(db): - return pow(10, db / 20) - -def amp2db(amp): - return 20 * math.log10(amp) - -def make_poly_distortion(conf): - """Generate a db-domain ploynomial distortion function - - f(x) = a * x^m * (1-x)^n + x - - Args: - conf: a dict {'a': #int, 'm': #int, 'n': #int} - - Returns: - The ploynomial function, which could be applied on - a float amplitude value - """ - a = conf['a'] - m = conf['m'] - n = conf['n'] - - def poly_distortion(x): - abs_x = abs(x) - if abs_x < 0.000001: - x = x - else: - db_norm = amp2db(abs_x) / 100 + 1 - if db_norm < 0: - db_norm = 0 - db_norm = a * pow(db_norm, m) * pow((1 - db_norm), n) + db_norm - if db_norm > 1: - db_norm = 1 - db = (db_norm - 1) * 100 - amp = db2amp(db) - if amp >= 0.9997: - amp = 0.9997 - if x > 0: - x = amp - else: - x = -amp - return x - return poly_distortion - -def make_quad_distortion(): - return make_poly_distortion({'a' : 1, 'm' : 1, 'n' : 1}) - -# the amplitude are set to max for all non-zero point -def make_max_distortion(conf): - """Generate a max distortion function - - Args: - conf: a dict {'max_db': float } - 'max_db': the maxium value. 
- - Returns: - The max function, which could be applied on - a float amplitude value - """ - max_db = conf['max_db'] - if max_db: - max_amp = db2amp(max_db) # < 0.997 - else: - max_amp = 0.997 - - def max_distortion(x): - if x > 0: - x = max_amp - elif x < 0: - x = -max_amp - else: - x = 0.0 - return x - return max_distortion - - - -def make_amp_mask(db_mask=None): - """Get a amplitude domain mask from db domain mask - - Args: - db_mask: Optional. A list of tuple. if None, using default value. - - Returns: - A list of tuple. The amplitude domain mask - """ - if db_mask is None: - db_mask = [(-110, -95), (-90, -80), (-65, -60), (-50, -30), (-15, 0)] - amp_mask = [(db2amp(db[0]), db2amp(db[1])) for db in db_mask] - return amp_mask - -default_mask = make_amp_mask() - - -def generate_amp_mask(mask_num): - """Generate amplitude domain mask randomly in [-100db, 0db] - - Args: - mask_num: the slot number of the mask - - Returns: - A list of tuple. each tuple defines a slot. - e.g. [(-100, -80), (-65, -60), (-50, -30), (-15, 0)] - for #mask_num = 4 - """ - a = [0] * 2 * mask_num - a[0] = 0 - m = [] - for i in range(1, 2 * mask_num): - a[i] = a[i - 1] + random.uniform(0.5, 1) - max_val = a[2 * mask_num - 1] - for i in range(0, mask_num): - l = ((a[2 * i] - max_val) / max_val) * 100 - r = ((a[2 * i + 1] - max_val) / max_val) * 100 - m.append((l, r)) - return make_amp_mask(m) - - -def make_fence_distortion(conf): - """Generate a fence distortion function - - In this fence-like shape function, the values in mask slots are - set to maxium, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': int,'max_db': float } - 'mask_number': the slot number in mask. - 'max_db': the maxium value. - - Returns: - The fence function, which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - max_db = conf['max_db'] - max_amp = db2amp(max_db) # 0.997 - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def fence_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - return x - - return fence_distortion - -# -def make_jag_distortion(conf): - """Generate a jag distortion function - - In this jag-like shape function, the values in mask slots are - not changed, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': #int} - 'mask_number': the slot number in mask. 
- - Returns: - The jag function,which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def jag_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - return x - - return jag_distortion - -# gaining 20db means amp = amp * 10 -# gaining -20db means amp = amp / 10 -def make_gain_db(conf): - """Generate a db domain gain function - - Args: - conf: a dict {'db': #float} - 'db': the gaining value - - Returns: - The db gain function, which could be applied on - a float amplitude value - """ - db = conf['db'] - - def gain_db(x): - return min(0.997, x * pow(10, db / 20)) - - return gain_db - - -def distort(x, func, rate=0.8): - """Distort a waveform in sample point level - - Args: - x: the origin wavefrom - func: the distort function - rate: sample point-level distort probability - - Returns: - the distorted waveform - """ - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - x[0][i] = func(float(x[0][i])) - return x - -def distort_chain(x, funcs, rate=0.8): - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - for func in funcs: - x[0][i] = func(float(x[0][i])) - return x - -# x is numpy -def distort_wav_conf(x, distort_type, distort_conf, rate=0.1): - if distort_type == 'gain_db': - gain_db = make_gain_db(distort_conf) - x = distort(x, gain_db) - elif distort_type == 'max_distortion': - max_distortion = make_max_distortion(distort_conf) - x = distort(x, max_distortion, rate=rate) - elif distort_type == 'fence_distortion': - fence_distortion = make_fence_distortion(distort_conf) - x = distort(x, fence_distortion, rate=rate) - elif distort_type == 'jag_distortion': - jag_distortion = make_jag_distortion(distort_conf) - x = distort(x, jag_distortion, rate=rate) - elif distort_type == 'poly_distortion': - poly_distortion = make_poly_distortion(distort_conf) - x = distort(x, poly_distortion, rate=rate) - elif distort_type == 'quad_distortion': - quad_distortion = make_quad_distortion() - x = distort(x, quad_distortion, rate=rate) - elif distort_type == 'none_distortion': - pass - else: - print('unsupport type') - return x - -def distort_wav_conf_and_save(distort_type, distort_conf, rate, wav_in, wav_out): - x, sr = torchaudio.load(wav_in) - x = x.detach().numpy() - out = distort_wav_conf(x, distort_type, distort_conf, rate) - torchaudio.save(wav_out, torch.from_numpy(out), sr) - -if __name__ == "__main__": - distort_type = sys.argv[1] - wav_in = sys.argv[2] - wav_out = sys.argv[3] - conf = None - rate = 0.1 - if distort_type == 'new_jag_distortion': - conf = {'mask_number' : 4} - elif distort_type == 'new_fence_distortion': - conf = {'mask_number' : 1, 'max_db' : -30} - elif distort_type == 'poly_distortion': - conf = {'a' : 4, 'm' : 2, "n" : 2} - distort_wav_conf_and_save(distort_type, conf, rate, wav_in, wav_out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/efficient_conformer/attention.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/efficient_conformer/attention.py deleted file mode 100644 index 475131b15af92ffcaf91ad5e2e30d114d4d1a2a3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/efficient_conformer/attention.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple, Optional - -import torch -from torch import nn -import torch.nn.functional as F -from wenet.transformer.attention import MultiHeadedAttention - - -class GroupedRelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: - https://arxiv.org/abs/1901.02860 - https://arxiv.org/abs/2109.01163 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate, group_size=3): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - self.group_size = group_size - self.d_k = n_feat // n_head # for GroupedAttention - self.n_feat = n_feat - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. 
- """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def pad4group(self, Q, K, V, P, mask, group_size: int = 3): - """ - q: (#batch, time1, size) -> (#batch, head, time1, size/head) - k,v: (#batch, time2, size) -> (#batch, head, time2, size/head) - p: (#batch, time2, size) - """ - # Compute Overflows - overflow_Q = Q.size(2) % group_size - overflow_KV = K.size(2) % group_size - - # if-else for ONNX export - # 0 // 0.00000000000000001 = 0 - # 1 // 1.00000000000000001 = 1 - padding_Q = (group_size - overflow_Q) * int( - overflow_Q // (overflow_Q + 0.00000000000000001)) - padding_KV = (group_size - overflow_KV) * int( - overflow_KV // (overflow_KV + 0.00000000000000001)) - - batch_size, _, seq_len_KV, _ = K.size() - - # Input Padding (B, T, D) -> (B, T + P, D) - Q = F.pad(Q, (0, 0, 0, padding_Q), value=0.0) - K = F.pad(K, (0, 0, 0, padding_KV), value=0.0) - V = F.pad(V, (0, 0, 0, padding_KV), value=0.0) - - if mask is not None and mask.size(2) > 0 : # time2 > 0: - mask = mask[:, ::group_size, ::group_size] - - Q = Q.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - K = K.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - V = V.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - # process pos_emb - P_batch_size = P.size(0) - overflow_P = P.size(1) % group_size - padding_P = group_size - overflow_P if overflow_P else 0 - P = F.pad(P, (0, 0, 0, padding_P), value=0.0) - P = P.view(P_batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - return Q, K, V, P, mask, padding_Q - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - padding_q: Optional[int] = None - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - padding_q : for GroupedAttention in efficent conformer - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. 
jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - - # n_feat!=h*d_k may be happened in GroupAttention - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.n_feat) - ) # (batch, time1, d_model) - if padding_q is not None: - # for GroupedAttention in efficent conformer - x = x[:, :x.size(1) - padding_q] - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q = self.linear_q(query) - k = self.linear_k(key) # (#batch, time2, size) - v = self.linear_v(value) - p = self.linear_pos(pos_emb) # (#batch, time2, size) - - batch_size, seq_len_KV, _ = k.size() # seq_len_KV = time2 - - # (#batch, time2, size) -> (#batch, head, time2, size/head) - q = q.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - k = k.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - v = v.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - if cache.size(0) > 0: - # use attention cache - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - new_cache = torch.cat((k, v), dim=-1) - - # May be k and p does not match. eg. time2=18+18/2=27 > mask=36/2=18 - if mask is not None and mask.size(2) > 0: - time2 = mask.size(2) - k = k[:, :, -time2:, :] - v = v[:, :, -time2:, :] - - # q k v p: (batch, head, time1, d_k) - q, k, v, p, mask, padding_q = self.pad4group(q, k, v, p, mask, self.group_size) - - # q_with_bias_u & q_with_bias_v = (batch, head, time1, d_k) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. 
- # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k * self.group_size) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask, padding_q), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/efficient_conformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/efficient_conformer/convolution.py deleted file mode 100644 index 52d6c1c14c0812ab3957a60a135f644833c2ad95..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/efficient_conformer/convolution.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - stride: int = 1): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - stride (int): Stride Convolution, for efficient Conformer - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. 
- # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=stride, # for depthwise_conv in StrideConv - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - self.stride = stride - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - # When export ONNX,the first cache is not None but all-zero, - # cause shape error in residual block, - # eg. cache14 + x9 = 23, 23-7+1=17 != 9 - cache = cache[:, :, -self.lorder:] - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is requried, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - if mask_pad.size(2) != x.size(2): - mask_pad = mask_pad[:, :, ::self.stride] - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/efficient_conformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/efficient_conformer/encoder.py deleted file mode 100644 index dbd37f53cac86be851e2bb194354fd07eb271f11..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/efficient_conformer/encoder.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from EfficientConformer(https://github.com/burchim/EfficientConformer) -# Paper(https://arxiv.org/abs/2109.01163) - -"""Encoder definition.""" -from typing import Tuple, Optional, List, Union - -import torch -import logging -from typeguard import check_argument_types -import torch.nn.functional as F - -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.encoder_layer import ConformerEncoderLayer - -from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 -from wenet.efficient_conformer.convolution import ConvolutionModule -from wenet.efficient_conformer.attention import GroupedRelPositionMultiHeadedAttention -from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer - -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class EfficientConformerEncoder(torch.nn.Module): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - macaron_style: bool = True, - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - stride_layer_idx: Optional[Union[int, List[int]]] = 3, - stride: Optional[Union[int, List[int]]] = 2, - group_layer_idx: Optional[Union[int, List[int], tuple]] = (0, 1, 2, 3), - group_size: int = 3, - stride_kernel: bool = True, - **kwargs - ): - """Construct Efficient Conformer Encoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - macaron_style (bool): Whether to use macaron style for - positionwise layer. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. - stride_layer_idx (list): layer id with StrideConv, start from 0 - stride (list): stride size of each StrideConv in efficient conformer - group_layer_idx (list): layer id with GroupedAttention, start from 0 - group_size (int): group size of every GroupedAttention layer - stride_kernel (bool): default True. True: recompute cnn kernels with stride. 
- """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d2": - subsampling_class = Conv2dSubsampling2 - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - logging.info(f"input_layer = {input_layer}, " - f"subsampling_class = {subsampling_class}") - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - self.input_layer = input_layer - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - activation = get_activation(activation_type) - self.num_blocks = num_blocks - self.attention_heads = attention_heads - self.cnn_module_kernel = cnn_module_kernel - self.global_chunk_size = 0 - self.chunk_feature_map = 0 - - # efficient conformer configs - self.stride_layer_idx = [stride_layer_idx] \ - if type(stride_layer_idx) == int else stride_layer_idx - self.stride = [stride] \ - if type(stride) == int else stride - self.group_layer_idx = [group_layer_idx] \ - if type(group_layer_idx) == int else group_layer_idx - self.grouped_size = group_size # group size of every GroupedAttention layer - - assert len(self.stride) == len(self.stride_layer_idx) - self.cnn_module_kernels = [cnn_module_kernel] # kernel size of each StridedConv - for i in self.stride: - if stride_kernel: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1] // i) - else: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1]) - - logging.info(f"stride_layer_idx= {self.stride_layer_idx}, " - f"stride = {self.stride}, " - f"cnn_module_kernel = {self.cnn_module_kernels}, " - f"group_layer_idx = {self.group_layer_idx}, " - f"grouped_size = {self.grouped_size}") - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - - # encoder definition - index = 0 - layers = [] - for i in range(num_blocks): - # self-attention module definition - if i in self.group_layer_idx: - encoder_selfattn_layer = GroupedRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - self.grouped_size) - else: - if pos_enc_layer_type == "no_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate) - - # conformer module definition - if i in self.stride_layer_idx: - # conformer block with downsampling - convolution_layer_args_stride = ( - output_size, 
self.cnn_module_kernels[index], activation, - cnn_module_norm, causal, True, self.stride[index]) - layers.append(StrideConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_stride) if use_cnn_module else None, - torch.nn.AvgPool1d( - kernel_size=self.stride[index], stride=self.stride[index], - padding=0, ceil_mode=True, - count_include_pad=False), # pointwise_conv_layer - dropout_rate, - normalize_before, - concat_after, - )) - index = index + 1 - else: - # conformer block - convolution_layer_args_normal = ( - output_size, self.cnn_module_kernels[index], activation, - cnn_module_norm, causal) - layers.append(ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_normal) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - )) - - self.encoders = torch.nn.ModuleList(layers) - - def set_global_chunk_size(self, chunk_size): - """Used in ONNX export. - """ - logging.info(f"set global chunk size: {chunk_size}, default is 0.") - self.global_chunk_size = chunk_size - if self.embed.subsampling_rate == 2: - self.chunk_feature_map = 2 * self.global_chunk_size + 1 - elif self.embed.subsampling_rate == 6: - self.chunk_feature_map = 6 * self.global_chunk_size + 5 - elif self.embed.subsampling_rate == 8: - self.chunk_feature_map = 8 * self.global_chunk_size + 7 - else: - self.chunk_feature_map = 4 * self.global_chunk_size + 3 - - def output_size(self) -> int: - return self._output_size - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - index = 0 # traverse stride - for i, layer in enumerate(self.encoders): - # layer return : x, mask, new_att_cache, new_cnn_cache - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if i in self.stride_layer_idx: - masks = masks[:, :, ::self.stride[index]] - chunk_masks = chunk_masks[:, ::self.stride[index], - ::self.stride[index]] - mask_pad = masks - pos_emb = pos_emb[:, ::self.stride[index], :] - index = index + 1 - - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask : mask matrix of self attention - - Returns: - torch.Tensor: output of current input xs - torch.Tensor: subsampling cache required for next chunk computation - List[torch.Tensor]: encoder layers output cache required for next - chunk computation - List[torch.Tensor]: conformer cnn cache - - """ - assert xs.size(0) == 1 - - # using downsampling factor to recover offset - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - chunk_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - chunk_masks = chunk_masks.unsqueeze(1) # (1, 1, xs-time) - - real_len = 0 - if self.global_chunk_size > 0: - # for ONNX decode simulation, padding xs to chunk_size - real_len = xs.size(1) - pad_len = self.chunk_feature_map - real_len - xs = F.pad(xs, (0, 0, 0, pad_len), value=0.0) - chunk_masks = F.pad(chunk_masks, (0, pad_len), value=0.0) - - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, chunk_masks = self.embed(xs, chunk_masks, offset) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - # shape(pos_emb) = (b=1, chunk_size, emb_size=output_size=hidden-dim) - - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) # batchPad (b=1, 1, time=chunk_size) - - if self.global_chunk_size > 0: - # for ONNX decode simulation - pos_emb = self.embed.position_encoding( - offset=max(offset - cache_t1, 0), - size=cache_t1 + self.global_chunk_size) - att_mask[:, :, -self.global_chunk_size:] = chunk_masks - mask_pad = chunk_masks.to(torch.bool) - else: - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(2) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - att_cache_trunc = xs.size(1) + \ - att_cache.size(2) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i:i + 1, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # 
shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [1, batch, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(0) - - # use repeat_interleave to new_att_cache - new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :]) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.global_chunk_size > 0 and real_len: - chunk_real_len = real_len // self.embed.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - # Keeping 1 more timestep can mitigate information leakage - # from the encoder caused by the padding - xs = xs[:, :chunk_real_len + 1, :] - - return xs, r_att_cache, r_cnn_cache - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - use_onnx=False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. - Args: - xs (torch.Tensor): (1, max_len, dim) - decoding_chunk_size (int): decoding chunk size - num_decoding_left_chunks (int): - use_onnx (bool): True for simulating ONNX model inference. 
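For orientation, the two simulation modes handled just below differ only in how the caches are pre-allocated. A minimal, self-contained sketch of the shape arithmetic, using made-up but typical configuration values (12 blocks, 4 heads, 256-dim output, CNN kernel 15):

```python
import torch

# Hypothetical configuration values, chosen only to make the shapes concrete.
num_blocks, heads, d_model, cnn_kernel = 12, 4, 256, 15
decoding_chunk_size, num_left_chunks = 16, 4
required_cache_size = decoding_chunk_size * num_left_chunks  # 64

# ONNX-style simulation: caches pre-allocated to a fixed maximum shape.
att_cache = torch.zeros(num_blocks, heads, required_cache_size, d_model // heads * 2)
cnn_cache = torch.zeros(num_blocks, 1, d_model, cnn_kernel - 1)

# JIT-style simulation: caches start empty and grow chunk by chunk.
att_cache_jit = torch.zeros(0, 0, 0, 0)
cnn_cache_jit = torch.zeros(0, 0, 0, 0)

print(att_cache.shape, cnn_cache.shape, att_cache_jit.shape)
```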
- """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if use_onnx: - logging.info("Simulating for ONNX runtime ...") - att_cache: torch.Tensor = torch.zeros( - (self.num_blocks, self.attention_heads, required_cache_size, - self.output_size() // self.attention_heads * 2), - device=xs.device) - cnn_cache: torch.Tensor = torch.zeros( - (self.num_blocks, 1, self.output_size(), self.cnn_module_kernel - 1), - device=xs.device) - self.set_global_chunk_size(chunk_size=decoding_chunk_size) - else: - logging.info("Simulating for JIT runtime ...") - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - logging.info(f"-->> frame chunk msg: cur={cur}, " - f"end={end}, num_frames={end-cur}, " - f"decoding_window={decoding_window}") - if use_onnx: - att_mask: torch.Tensor = torch.ones( - (1, 1, required_cache_size + decoding_chunk_size), - dtype=torch.bool, device=xs.device) - if cur == 0: - att_mask[:, :, :required_cache_size] = 0 - else: - att_mask: torch.Tensor = torch.ones( - (0, 0, 0), dtype=torch.bool, device=xs.device) - - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - outputs.append(y) - offset += y.size(1) - - ys = torch.cat(outputs, 1) - masks = torch.ones(1, 1, ys.size(1), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/efficient_conformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/efficient_conformer/encoder_layer.py deleted file mode 100644 index 3a88ec9fca9797664ce89566e6c1d28a8f0ad5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/efficient_conformer/encoder_layer.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple -import torch -from torch import nn - - -class StrideConformerEncoderLayer(nn.Module): - """Encoder layer module. 
- Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - pointwise_conv_layer: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.pointwise_conv_layer = pointwise_conv_layer - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
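The forward pass that follows applies the macaron-style ordering: half-step feed-forward, self-attention, convolution, half-step feed-forward. A simplified sketch with linear stand-ins for the sub-modules, ignoring masks, caches, and dropout:

```python
import torch
import torch.nn as nn

# Linear layers stand in for the real attention / conv / feed-forward modules.
size = 8
ffn_macaron, attn, conv, ffn = (nn.Linear(size, size) for _ in range(4))
norm_ff_macaron, norm_mha, norm_conv, norm_ff = (nn.LayerNorm(size) for _ in range(4))

x = torch.randn(2, 5, size)                      # (batch, time, size)
x = x + 0.5 * ffn_macaron(norm_ff_macaron(x))    # half-step macaron feed-forward
x = x + attn(norm_mha(x))                        # multi-headed self-attention (stand-in)
x = x + conv(norm_conv(x))                       # convolution module (stand-in)
x = x + 0.5 * ffn(norm_ff(x))                    # second half-step feed-forward
print(x.shape)                                   # torch.Size([2, 5, 8])
```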
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - - # add pointwise_conv for efficient conformer - # pointwise_conv_layer does not change shape - if self.pointwise_conv_layer is not None: - residual = residual.transpose(1, 2) - residual = self.pointwise_conv_layer(residual) - residual = residual.transpose(1, 2) - assert residual.size(0) == x.size(0) - assert residual.size(1) == x.size(1) - assert residual.size(2) == x.size(2) - - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/efficient_conformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/efficient_conformer/subsampling.py deleted file mode 100644 index 98b2c2228eac8e77586110686c48a7b0141458c9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/efficient_conformer/subsampling.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch -from wenet.transformer.subsampling import BaseSubsampling - - -class Conv2dSubsampling2(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU() - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * ((idim - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 2 - # 2 = (3 - 1) * 1 - self.right_context = 2 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 2. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 2. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, :-2:2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/squeezeformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/squeezeformer/attention.py deleted file mode 100644 index 97412badbe8e2c5caec81c0636d15be3f80d6b84..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/squeezeformer/attention.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 Ximalaya Inc. (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -import torch -import torch.nn as nn -from wenet.transformer.attention import MultiHeadedAttention -from typing import Tuple - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
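The score computation implemented in `forward` below follows Transformer-XL: a content term (`matrix_ac`) plus a position term (`matrix_bd`), scaled by `sqrt(d_k)`. A rough, self-contained sketch with random tensors and zero-initialized biases (shapes are illustrative, not taken from a real config):

```python
import math
import torch

batch, heads, t1, t2, d_k = 2, 4, 6, 6, 16
q = torch.randn(batch, heads, t1, d_k)
k = torch.randn(batch, heads, t2, d_k)
p = torch.randn(batch, heads, t2, d_k)       # projected relative position embeddings
pos_bias_u = torch.zeros(heads, d_k)         # learnable bias for the content term
pos_bias_v = torch.zeros(heads, d_k)         # learnable bias for the position term

matrix_ac = torch.matmul(q + pos_bias_u[None, :, None, :], k.transpose(-2, -1))
matrix_bd = torch.matmul(q + pos_bias_v[None, :, None, :], p.transpose(-2, -1))
scores = (matrix_ac + matrix_bd) / math.sqrt(d_k)   # (batch, heads, t1, t2)
attn = torch.softmax(scores, dim=-1)
print(attn.shape)
```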
- """ - - def __init__(self, n_head, n_feat, dropout_rate, - do_rel_shift=False, adaptive_scale=False, init_weights=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.do_rel_shift = do_rel_shift - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - self.adaptive_scale = adaptive_scale - self.ada_scale = nn.Parameter( - torch.ones([1, 1, n_feat]), requires_grad=adaptive_scale) - self.ada_bias = nn.Parameter( - torch.zeros([1, 1, n_feat]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - input_max = (self.h * self.d_k) ** -0.5 - torch.nn.init.uniform_(self.linear_q.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_q.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_pos.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.bias, -input_max, input_max) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. 
pytorch training - if mask.size(2) > 0: # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - # (batch, head, time1, time2) - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - if self.adaptive_scale: - query = self.ada_scale * query + self.ada_bias - key = self.ada_scale * key + self.ada_bias - value = self.ada_scale * value + self.ada_bias - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
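The note above relies on concatenation along a zero-sized dimension being a no-op, so the first chunk can reuse the same code path as later chunks. A runnable restatement of that cache round trip (the shapes are illustrative only):

```python
import torch

k = torch.randn(1, 4, 16, 32)                      # current chunk keys
v = torch.randn(1, 4, 16, 32)                      # current chunk values
cache = torch.zeros(1, 4, 0, 64)                   # "empty" cache fed for the first chunk

key_cache, value_cache = torch.split(cache, cache.size(-1) // 2, dim=-1)
k = torch.cat([key_cache, k], dim=2)               # unchanged: cache_t == 0
v = torch.cat([value_cache, v], dim=2)
new_cache = torch.cat((k, v), dim=-1)              # (1, 4, 16, 64), carried to the next chunk
print(new_cache.shape)
```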
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - if self.do_rel_shift: - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/squeezeformer/conv2d.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/squeezeformer/conv2d.py deleted file mode 100644 index c230263396392d72f36c56d645338f2d576db898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/squeezeformer/conv2d.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Conv2d Module with Valid Padding""" - -import torch.nn.functional as F -from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional - - -class Conv2dValid(_ConvNd): - """ - Conv2d operator for VALID mode padding. 
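Here "VALID" padding simply means no implicit zero padding, so each spatial dimension shrinks by `kernel_size - 1`. A quick check with a plain `torch.nn.Conv2d`, not the class defined below:

```python
import torch
import torch.nn as nn

x = torch.randn(1, 3, 32, 32)
conv = nn.Conv2d(3, 8, kernel_size=5, padding=0)   # "valid": no zero padding
y = conv(x)
print(y.shape)  # torch.Size([1, 8, 28, 28]); 32 - (5 - 1) = 28 per spatial dim
```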
- """ - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', # TODO: refine this type - device=None, - dtype=None, - valid_trigx: bool = False, - valid_trigy: bool = False - ) -> None: - factory_kwargs = {'device': device, 'dtype': dtype} - kernel_size_ = _pair(kernel_size) - stride_ = _pair(stride) - padding_ = padding if isinstance(padding, str) else _pair(padding) - dilation_ = _pair(dilation) - super(Conv2dValid, self).__init__( - in_channels, out_channels, kernel_size_, - stride_, padding_, dilation_, False, _pair(0), - groups, bias, padding_mode, **factory_kwargs) - self.valid_trigx = valid_trigx - self.valid_trigy = valid_trigy - - def _conv_forward( - self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): - validx, validy = 0, 0 - if self.valid_trigx: - validx = (input.size(-2) * (self.stride[-2] - 1) - 1 - + self.kernel_size[-2]) // 2 - if self.valid_trigy: - validy = (input.size(-1) * (self.stride[-1] - 1) - 1 - + self.kernel_size[-1]) // 2 - return F.conv2d(input, weight, bias, self.stride, - (validx, validy), self.dilation, self.groups) - - def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight, self.bias) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/squeezeformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/squeezeformer/convolution.py deleted file mode 100644 index 6da2ee8c98ed58fae66d66c892041037f0d6bc3a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/squeezeformer/convolution.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. 
- causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - self.bias = bias - self.channels = channels - self.kernel_size = kernel_size - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, channels]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, channels]), requires_grad=adaptive_scale) - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - if init_weights: - self.init_weights() - - def init_weights(self): - pw_max = self.channels ** -0.5 - dw_max = self.kernel_size ** -0.5 - torch.nn.init.uniform_(self.pointwise_conv1.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv1.bias.data, -pw_max, pw_max) - torch.nn.init.uniform_(self.depthwise_conv.weight.data, -dw_max, dw_max) - if self.bias: - torch.nn.init.uniform_(self.depthwise_conv.bias.data, -dw_max, dw_max) - torch.nn.init.uniform_(self.pointwise_conv2.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv2.bias.data, -pw_max, pw_max) - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - if self.adaptive_scale: - x = self.ada_scale * x + self.ada_bias - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. 
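The causal branch above keeps the last `lorder = kernel_size - 1` frames as left context for the next chunk. A standalone sketch of that bookkeeping with an ordinary depthwise `Conv1d` (channel count and chunk length are illustrative):

```python
import torch
import torch.nn as nn

channels, kernel_size = 8, 15
lorder = kernel_size - 1
conv = nn.Conv1d(channels, channels, kernel_size, padding=0, groups=channels)

x = torch.randn(1, channels, 20)                   # (batch, channels, chunk_time)
cache = torch.zeros(1, channels, 0)                # empty cache for the first chunk

if cache.size(2) == 0:
    x = nn.functional.pad(x, (lorder, 0), 'constant', 0.0)
else:
    x = torch.cat((cache, x), dim=2)
new_cache = x[:, :, -lorder:]                      # left context carried to the next chunk
y = conv(x)
print(y.shape, new_cache.shape)                    # (1, 8, 20) and (1, 8, 14)
```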
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/squeezeformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/squeezeformer/encoder.py deleted file mode 100644 index f13038321ae6c07d484a617aee7d83ed07742510..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/squeezeformer/encoder.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -import torch -import torch.nn as nn -from typing import Tuple, Union, Optional, List -from wenet.squeezeformer.subsampling \ - import DepthwiseConv2dSubsampling4, TimeReductionLayer1D, \ - TimeReductionLayer2D, TimeReductionLayerStream -from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.attention import MultiHeadedAttention -from wenet.squeezeformer.attention import RelPositionMultiHeadedAttention -from wenet.squeezeformer.positionwise_feed_forward \ - import PositionwiseFeedForward -from wenet.squeezeformer.convolution import ConvolutionModule -from wenet.utils.mask import make_pad_mask, add_optional_chunk_mask -from wenet.utils.common import get_activation - - -class SqueezeformerEncoder(nn.Module): - def __init__( - self, - input_size: int = 80, - encoder_dim: int = 256, - output_size: int = 256, - attention_heads: int = 4, - num_blocks: int = 12, - reduce_idx: Optional[Union[int, List[int]]] = 5, - recover_idx: Optional[Union[int, List[int]]] = 11, - feed_forward_expansion_factor: int = 4, - dw_stride: bool = False, - input_dropout_rate: float = 0.1, - pos_enc_layer_type: str = "rel_pos", - time_reduction_layer_type: str = "conv1d", - do_rel_shift: bool = True, - feed_forward_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.1, - cnn_module_kernel: int = 31, - cnn_norm_type: str = "batch_norm", - dropout: float = 0.1, - causal: bool = False, - adaptive_scale: bool = True, - activation_type: str = "swish", - init_weights: bool = True, - global_cmvn: torch.nn.Module = None, - normalize_before: bool = False, - use_dynamic_chunk: bool = False, - concat_after: bool = 
False, - static_chunk_size: int = 0, - use_dynamic_left_chunk: bool = False - ): - """Construct SqueezeformerEncoder - - Args: - input_size to use_dynamic_chunk, see in Transformer BaseEncoder. - encoder_dim (int): The hidden dimension of encoder layer. - output_size (int): The output dimension of final projection layer. - attention_heads (int): Num of attention head in attention module. - num_blocks (int): Num of encoder layers. - reduce_idx Optional[Union[int, List[int]]]: - reduce layer index, from 40ms to 80ms per frame. - recover_idx Optional[Union[int, List[int]]]: - recover layer index, from 80ms to 40ms per frame. - feed_forward_expansion_factor (int): Enlarge coefficient of FFN. - dw_stride (bool): Whether do depthwise convolution - on subsampling module. - input_dropout_rate (float): Dropout rate of input projection layer. - pos_enc_layer_type (str): Self attention type. - time_reduction_layer_type (str): Conv1d or Conv2d reduction layer. - do_rel_shift (bool): Whether to do relative shift - operation on rel-attention module. - cnn_module_kernel (int): Kernel size of CNN module. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - adaptive_scale (bool): Whether to use adaptive scale. - init_weights (bool): Whether to initialize weights. - causal (bool): whether to use causal convolution or not. - """ - super(SqueezeformerEncoder, self).__init__() - self.global_cmvn = global_cmvn - self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ - if type(reduce_idx) == int else reduce_idx - self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \ - if type(recover_idx) == int else recover_idx - self.check_ascending_list() - if reduce_idx is None: - self.time_reduce = None - else: - if recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - self.reduce_stride = 2 - self._output_size = output_size - self.normalize_before = normalize_before - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - self.pos_enc_layer_type = pos_enc_layer_type - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - encoder_dim, - attention_dropout_rate, - do_rel_shift, - adaptive_scale, - init_weights - ) - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - encoder_dim, - encoder_dim * feed_forward_expansion_factor, - feed_forward_dropout_rate, - activation, - adaptive_scale, - init_weights - ) - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = ( - encoder_dim, cnn_module_kernel, activation, - cnn_norm_type, causal, True, adaptive_scale, init_weights) - - self.embed = DepthwiseConv2dSubsampling4( - 1, encoder_dim, - RelPositionalEncoding(encoder_dim, dropout_rate=0.1), - dw_stride, - input_size, - input_dropout_rate, - init_weights - ) - - self.preln = nn.LayerNorm(encoder_dim) - self.encoders = 
torch.nn.ModuleList([SqueezeformerEncoderLayer( - encoder_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - convolution_layer(*convolution_layer_args), - positionwise_layer(*positionwise_layer_args), - normalize_before, - dropout, - concat_after) for _ in range(num_blocks) - ]) - if time_reduction_layer_type == 'conv1d': - time_reduction_layer = TimeReductionLayer1D - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - elif time_reduction_layer_type == 'stream': - time_reduction_layer = TimeReductionLayerStream - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - else: - time_reduction_layer = TimeReductionLayer2D - time_reduction_layer_args = {'encoder_dim': encoder_dim} - - self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args) - self.time_recover_layer = nn.Linear(encoder_dim, encoder_dim) - self.final_proj = None - if output_size != encoder_dim: - self.final_proj = nn.Linear(encoder_dim, output_size) - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - xs_lens = mask_pad.squeeze(1).sum(1) - xs = self.preln(xs) - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - for i, layer in enumerate(self.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, chunk_masks, pos_emb, mask_pad)) - xs, xs_lens, chunk_masks, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_chunk_masks, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - chunk_masks = recover_chunk_masks - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - xs = xs.masked_fill(~mask_pad[:, 0, :].unsqueeze(-1), 0.0) - - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return xs, masks - - def check_ascending_list(self): - if self.reduce_idx is not None: - assert self.reduce_idx == sorted(self.reduce_idx), \ - "reduce_idx should be int or ascending list" - if self.recover_idx is not None: - assert self.recover_idx == sorted(self.recover_idx), \ - "recover_idx should be int or ascending list" - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp 
= exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - recover_exp)) - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.preln(xs) - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, 
recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - if att_mask.size(1) != 0: - xs = xs.masked_fill(~att_mask[:, 0, :].unsqueeze(-1), 0.0) - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(0) - cached_att = cached_att.unsqueeze(3).\ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :]) - r_cnn_cache.append(cached_cnn) - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
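The overlap strategy described above can be checked numerically: consecutive windows advance by `stride` input frames but span `decoding_window` frames, so they overlap by a few frames, which gives the subsampling convolution the context it needs without a cache. A small sketch with assumed front-end parameters (subsampling 4, right context 6):

```python
# Assumed values for a Conv2dSubsampling4-style front end.
subsampling, right_context = 4, 6
decoding_chunk_size = 16
context = right_context + 1                          # add the current frame
stride = subsampling * decoding_chunk_size           # 64 input frames advanced per chunk
decoding_window = (decoding_chunk_size - 1) * subsampling + context  # 67 frames fed per chunk

num_frames = 300
windows = [(cur, min(cur + decoding_window, num_frames))
           for cur in range(0, num_frames - context + 1, stride)]
print(stride, decoding_window, windows)
# Consecutive windows overlap by decoding_window - stride = 3 frames here.
```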
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/squeezeformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/squeezeformer/encoder_layer.py deleted file mode 100644 index 3c6bdd44a20447cea91c0f965c666b844f4264be..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/squeezeformer/encoder_layer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""SqueezeformerEncoderLayer definition.""" - -import torch -import torch.nn as nn -from typing import Optional, Tuple - - -class SqueezeformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward1 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - feed_forward2 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. 
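With `normalize_before=False` (the constructor default here), each sub-block below is applied post-norm, i.e. `x = LayerNorm(x + sublayer(x))`. A simplified sketch of that ordering with linear stand-ins for the four sub-modules:

```python
import torch
import torch.nn as nn

# Post-norm ordering: attention, feed-forward, convolution, feed-forward.
size = 8
attn, ffn1, conv, ffn2 = (nn.Linear(size, size) for _ in range(4))
ln1, ln2, ln3, ln4 = (nn.LayerNorm(size) for _ in range(4))

x = torch.randn(2, 5, size)
x = ln1(x + attn(x))    # multi-headed self-attention + residual, then norm
x = ln2(x + ffn1(x))    # first feed-forward
x = ln3(x + conv(x))    # convolution module (stand-in)
x = ln4(x + ffn2(x))    # second feed-forward
print(x.shape)
```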
- """ - - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward1: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - feed_forward2: Optional[nn.Module] = None, - normalize_before: bool = False, - dropout_rate: float = 0.1, - concat_after: bool = False, - ): - super(SqueezeformerEncoderLayer, self).__init__() - self.size = size - self.self_attn = self_attn - self.layer_norm1 = nn.LayerNorm(size) - self.ffn1 = feed_forward1 - self.layer_norm2 = nn.LayerNorm(size) - self.conv_module = conv_module - self.layer_norm3 = nn.LayerNorm(size) - self.ffn2 = feed_forward2 - self.layer_norm4 = nn.LayerNorm(size) - self.normalize_before = normalize_before - self.dropout = nn.Dropout(dropout_rate) - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - # self attention module - residual = x - if self.normalize_before: - x = self.layer_norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.layer_norm1(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm2(x) - x = self.ffn1(x) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm2(x) - - # conv module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - residual = x - if self.normalize_before: - x = self.layer_norm3(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm3(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm4(x) - x = self.ffn2(x) - # we do not use dropout here since it is inside feed forward function - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm4(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/squeezeformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/squeezeformer/positionwise_feed_forward.py deleted file mode 100644 index 289062dcf3189f79a5ebb206990160d8665c613c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/squeezeformer/positionwise_feed_forward.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU(), - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.idim = idim - self.hidden_units = hidden_units - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - self.ada_scale = None - self.ada_bias = None - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, idim]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, idim]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - ffn1_max = self.idim ** -0.5 - ffn2_max = self.hidden_units ** -0.5 - torch.nn.init.uniform_(self.w_1.weight.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_1.bias.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_2.weight.data, -ffn2_max, ffn2_max) - torch.nn.init.uniform_(self.w_2.bias.data, -ffn2_max, ffn2_max) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - if self.adaptive_scale: - xs = self.ada_scale * xs + self.ada_bias - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/squeezeformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/squeezeformer/subsampling.py deleted file mode 100644 index fdb0101d6ebb54c42e710bbb0f35a6f7615ca567..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/squeezeformer/subsampling.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -"""DepthwiseConv2dSubsampling4 and TimeReductionLayer definition.""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -from wenet.transformer.subsampling import BaseSubsampling -from typing import Tuple -from wenet.squeezeformer.conv2d import Conv2dValid - - -class DepthwiseConv2dSubsampling4(BaseSubsampling): - """Depthwise Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - pos_enc_class (nn.Module): position encoding class. - dw_stride (int): Whether do depthwise convolution. - input_size (int): filter bank dimension. - - """ - - def __init__( - self, idim: int, odim: int, - pos_enc_class: torch.nn.Module, - dw_stride: bool = False, - input_size: int = 80, - input_dropout_rate: float = 0.1, - init_weights: bool = True - ): - super(DepthwiseConv2dSubsampling4, self).__init__() - self.idim = idim - self.odim = odim - self.pw_conv = nn.Conv2d( - in_channels=idim, out_channels=odim, kernel_size=3, stride=2) - self.act1 = nn.ReLU() - self.dw_conv = nn.Conv2d( - in_channels=odim, out_channels=odim, kernel_size=3, stride=2, - groups=odim if dw_stride else 1 - ) - self.act2 = nn.ReLU() - self.pos_enc = pos_enc_class - self.input_proj = nn.Sequential( - nn.Linear( - odim * (((input_size - 1) // 2 - 1) // 2), odim), - nn.Dropout(p=input_dropout_rate), - ) - if init_weights: - linear_max = (odim * input_size / 4) ** -0.5 - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.weight'], -linear_max, linear_max) - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.bias'], -linear_max, linear_max) - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: int = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.pw_conv(x) - x = self.act1(x) - x = self.dw_conv(x) - x = self.act2(x) - b, c, t, f = x.size() - x = x.permute(0, 2, 1, 3) - x = x.contiguous().view(b, t, c * f) - x, pos_emb = self.pos_enc(x, offset) - x = self.input_proj(x) - return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] - - -class TimeReductionLayer1D(nn.Module): - """ - Modified NeMo, - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. 
- """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 5, stride: int = 2): - super(TimeReductionLayer1D, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - self.padding = max(0, self.kernel_size - self.stride) - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. - if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayer2D(nn.Module): - def __init__( - self, kernel_size: int = 5, stride: int = 2, encoder_dim: int = 256): - super(TimeReductionLayer2D, self).__init__() - self.encoder_dim = encoder_dim - self.kernel_size = kernel_size - self.dw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=(kernel_size, 1), - stride=stride, - valid_trigy=True - ) - self.pw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=1, - stride=1, - valid_trigx=False, - valid_trigy=False, - ) - - self.kernel_size = kernel_size - self.stride = stride - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.encoder_dim ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward( - self, xs: torch.Tensor, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - xs = xs.masked_fill(mask_pad.transpose(1, 2).eq(0), 0.0) - xs = xs.unsqueeze(2) - padding1 = self.kernel_size - self.stride - xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), - mode='constant', value=0.) 
- xs = self.dw_conv(xs.permute(0, 3, 1, 2)) - xs = self.pw_conv(xs).permute(0, 3, 2, 1).squeeze(1).contiguous() - tmp_length = xs.size(1) - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - padding2 = max(0, (xs_lens.max() - tmp_length).data.item()) - batch_size, hidden = xs.size(0), xs.size(-1) - dummy_pad = torch.zeros(batch_size, padding2, hidden, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - mask = mask[:, ::2, ::2] - mask_pad = mask_pad[:, :, ::2] - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayerStream(nn.Module): - """ - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. - """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 1, stride: int = 2): - super(TimeReductionLayerStream, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=0, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. 
- if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transducer/joint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transducer/joint.py deleted file mode 100644 index f7cbaf62ee0bf4ffa127e5bbf4a49a64c2378495..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transducer/joint.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Optional - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation - - -class TransducerJoint(torch.nn.Module): - - def __init__(self, - voca_size: int, - enc_output_size: int, - pred_output_size: int, - join_dim: int, - prejoin_linear: bool = True, - postjoin_linear: bool = False, - joint_mode: str = 'add', - activation: str = "tanh"): - assert check_argument_types() - # TODO(Mddct): concat in future - assert joint_mode in ['add'] - super().__init__() - - self.activatoin = get_activation(activation) - self.prejoin_linear = prejoin_linear - self.postjoin_linear = postjoin_linear - self.joint_mode = joint_mode - - if not self.prejoin_linear and not self.postjoin_linear: - assert enc_output_size == pred_output_size == join_dim - # torchscript compatibility - self.enc_ffn: Optional[nn.Linear] = None - self.pred_ffn: Optional[nn.Linear] = None - if self.prejoin_linear: - self.enc_ffn = nn.Linear(enc_output_size, join_dim) - self.pred_ffn = nn.Linear(pred_output_size, join_dim) - # torchscript compatibility - self.post_ffn: Optional[nn.Linear] = None - if self.postjoin_linear: - self.post_ffn = nn.Linear(join_dim, join_dim) - - self.ffn_out = nn.Linear(join_dim, voca_size) - - def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor): - """ - Args: - enc_out (torch.Tensor): [B, T, E] - pred_out (torch.Tensor): [B, T, P] - Return: - [B,T,U,V] - """ - if (self.prejoin_linear and self.enc_ffn is not None - and self.pred_ffn is not None): - enc_out = self.enc_ffn(enc_out) # [B,T,E] -> [B,T,V] - pred_out = self.pred_ffn(pred_out) - - enc_out = enc_out.unsqueeze(2) # [B,T,V] -> [B,T,1,V] - pred_out = pred_out.unsqueeze(1) # [B,U,V] -> [B,1 U, V] - - # TODO(Mddct): concat joint - _ = self.joint_mode - out = enc_out + pred_out # [B,T,U,V] - - if self.postjoin_linear and self.post_ffn is not None: - out = self.post_ffn(out) - - out = self.activatoin(out) - out = self.ffn_out(out) - return out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transducer/predictor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transducer/predictor.py deleted file mode 100644 index 600e97a9d83646047ec3fc14f3087bd4df761c68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transducer/predictor.py +++ /dev/null @@ -1,482 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation, get_rnn - - -def ApplyPadding(input, padding, pad_value) -> torch.Tensor: - """ - Args: - input: [bs, max_time_step, dim] - padding: [bs, 
max_time_step] - """ - return padding * pad_value + input * (1 - padding) - - -class PredictorBase(torch.nn.Module): - - # NOTE(Mddct): We can use ABC abstract here, but - # keep this class simple enough for now - def __init__(self) -> None: - super().__init__() - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - _, _, _ = batch_size, method, device - raise NotImplementedError("this is a base precictor") - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ): - _, _, = input, cache - raise NotImplementedError("this is a base precictor") - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - _, _, _, = input, padding, cache - raise NotImplementedError("this is a base precictor") - - -class RNNPredictor(PredictorBase): - - def __init__(self, - voca_size: int, - embed_size: int, - output_size: int, - embed_dropout: float, - hidden_size: int, - num_layers: int, - bias: bool = True, - rnn_type: str = "lstm", - dropout: float = 0.1) -> None: - assert check_argument_types() - super().__init__() - self.n_layers = num_layers - self.hidden_size = hidden_size - # disable rnn base out projection - self.embed = nn.Embedding(voca_size, embed_size) - self.dropout = nn.Dropout(embed_dropout) - # NOTE(Mddct): rnn base from torch not support layer norm - # will add layer norm and prune value in cell and layer - # ref: https://github.com/Mddct/neural-lm/blob/main/models/gru_cell.py - self.rnn = get_rnn(rnn_type=rnn_type)(input_size=embed_size, - hidden_size=hidden_size, - num_layers=num_layers, - bias=bias, - batch_first=True, - dropout=dropout) - self.projection = nn.Linear(hidden_size, output_size) - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> torch.Tensor: - """ - Args: - input (torch.Tensor): [batch, max_time). - padding (torch.Tensor): [batch, max_time] - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - Returns: - output: [batch, max_time, output_size] - """ - - # NOTE(Mddct): we don't use pack input format - embed = self.embed(input) # [batch, max_time, emb_size] - embed = self.dropout(embed) - states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None - if cache is None: - state = self.init_state(batch_size=input.size(0), - device=input.device) - states = (state[0], state[1]) - else: - assert len(cache) == 2 - states = (cache[0], cache[1]) - out, (m, c) = self.rnn(embed, states) - out = self.projection(out) - - # NOTE(Mddct): Although we don't use staate in transducer - # training forward, we need make it right for padding value - # so we create forward_step for infering, forward for training - _, _ = m, c - return out - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache: [state_m, state_c] - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - Returns: - new_cache: [[state_m_1, state_c_1], [state_m_2, state_c_2]...] 
- """ - assert len(cache) == 2 - state_ms = cache[0] - state_cs = cache[1] - - assert state_ms.size(1) == state_cs.size(1) - - new_cache: List[List[torch.Tensor]] = [] - for state_m, state_c in zip(torch.split(state_ms, 1, dim=1), - torch.split(state_cs, 1, dim=1)): - new_cache.append([state_m, state_c]) - return new_cache - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[state_m_1, state_c_1], [state_m_1, state_c_1]...] - - Returns: - new_caceh: [state_ms, state_cs], - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - """ - state_ms = torch.cat([states[0] for states in cache], dim=1) - state_cs = torch.cat([states[1] for states in cache], dim=1) - return [state_ms, state_cs] - - def init_state( - self, - batch_size: int, - device: torch.device, - method: str = "zero", - ) -> List[torch.Tensor]: - assert batch_size > 0 - # TODO(Mddct): xavier init method - _ = method - return [ - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device), - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device) - ] - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - """ - assert len(cache) == 2 - state_m, state_c = cache[0], cache[1] - embed = self.embed(input) # [batch, 1, emb_size] - embed = self.dropout(embed) - out, (m, c) = self.rnn(embed, (state_m, state_c)) - - out = self.projection(out) - m = ApplyPadding(m, padding.unsqueeze(0), state_m) - c = ApplyPadding(c, padding.unsqueeze(0), state_c) - - return (out, [m, c]) - - -class EmbeddingPredictor(PredictorBase): - """Embedding predictor - - Described in: - https://arxiv.org/pdf/2109.07513.pdf - - embed-> proj -> layer norm -> swish - """ - - def __init__(self, - voca_size: int, - embed_size: int, - embed_dropout: float, - n_head: int, - history_size: int = 2, - activation: str = "swish", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - - assert check_argument_types() - super().__init__() - # multi head - self.num_heads = n_head - self.embed_size = embed_size - self.context_size = history_size + 1 - self.pos_embed = torch.nn.Linear(embed_size * self.context_size, - self.num_heads, - bias=bias) - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.ffn = nn.Linear(self.embed_size, self.embed_size) - self.norm = nn.LayerNorm(self.embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - _ = method - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device), - ] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] 
- """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - - input = input.unfold(1, self.context_size, 1).permute( - 0, 1, 3, 2) # [bs, seq_len, context_size, embed] - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - # broadcast dot attenton - input_expand = input.unsqueeze( - 2) # [bs, seq_len, 1, context_size, embed] - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - - # [bs, seq_len, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, seq_len, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, seq_len, num_heads, embed] - output = output.sum(dim=2) # [bs, seq_len, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - return output - - def forward_step( - self, - input: torch.Tensor, - padding: torch.Tensor, - cache: List[torch.Tensor], - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input_expand = context_input.unsqueeze(1).unsqueeze( - 2) # [bs, 1, 1, context_size, embed] - - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - # [bs, 1, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, 1, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, 1, num_heads, embed] - output = output.sum(dim=2) # [bs, 1, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - new_cache = context_input[:, 1:, :] - # TODO(Mddct): we need padding new_cache in future - # new_cache = ApplyPadding(history, padding, new_cache) - return (output, [new_cache]) - - -class ConvPredictor(PredictorBase): - - def __init__(self, - voca_size: 
int, - embed_size: int, - embed_dropout: float, - history_size: int = 2, - activation: str = "relu", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - assert check_argument_types() - super().__init__() - - assert history_size >= 0 - self.embed_size = embed_size - self.context_size = history_size + 1 - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.conv = nn.Conv1d(in_channels=embed_size, - out_channels=embed_size, - kernel_size=self.context_size, - padding=0, - groups=embed_size, - bias=bias) - self.norm = nn.LayerNorm(embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - assert method == "zero" - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device) - ] - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] - """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - input = input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - return out - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input = context_input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - - new_cache = context_input[:, 1:, :] - # TODO(Mddct): apply padding in future - return (out, [new_cache]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transducer/search/greedy_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transducer/search/greedy_search.py deleted file mode 100644 index ef7354562b6617b7be33bf32d673117eb1d3d547..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transducer/search/greedy_search.py +++ /dev/null @@ 
-1,54 +0,0 @@ -from typing import List - -import torch - - -def basic_greedy_search( - model: torch.nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - n_steps: int = 64, -) -> List[List[int]]: - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, - method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = n_steps - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, padding, - cache) # [1, 1, P] - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, - pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - - return [hyps] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transducer/search/prefix_beam_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transducer/search/prefix_beam_search.py deleted file mode 100644 index f00917717c16a73916586708ebfede54fa02a21f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transducer/search/prefix_beam_search.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import List, Tuple - -import torch -from wenet.utils.common import log_add - - -class Sequence(): - - __slots__ = {'hyp', 'score', 'cache'} - - def __init__( - self, - hyp: List[torch.Tensor], - score, - cache: List[torch.Tensor], - ): - self.hyp = hyp - self.score = score - self.cache = cache - - -class PrefixBeamSearch(): - - def __init__(self, encoder, predictor, joint, ctc, blank): - self.encoder = encoder - self.predictor = predictor - self.joint = joint - self.ctc = ctc - self.blank = blank - - def forward_decoder_one_step( - self, encoder_x: torch.Tensor, pre_t: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - padding = torch.zeros(pre_t.size(0), 1, device=encoder_x.device) - pre_t, new_cache = self.predictor.forward_step(pre_t.unsqueeze(-1), - padding, cache) - x = self.joint(encoder_x, pre_t) # [beam, 1, 1, vocab] - x = x.log_softmax(dim=-1) - return x, new_cache - - def prefix_beam_search(self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7): - """prefix beam search - also see wenet.transducer.transducer.beam_search - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - 
batch_size = speech.shape[0] - assert batch_size == 1 - - # 1. Encoder - encoder_out, _ = self.encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - - ctc_probs = self.ctc.log_softmax(encoder_out).squeeze(0) - beam_init: List[Sequence] = [] - - # 2. init beam using Sequence to save beam unit - cache = self.predictor.init_state(1, method="zero", device=device) - beam_init.append(Sequence(hyp=[self.blank], score=0.0, cache=cache)) - # 3. start decoding (notice: we use breathwise first searching) - # !!!! In this decoding method: one frame do not output multi units. !!!! - # !!!! Experiments show that this strategy has little impact !!!! - for i in range(maxlen): - # 3.1 building input - # decoder taking the last token to predict the next token - input_hyp = [s.hyp[-1] for s in beam_init] - input_hyp_tensor = torch.tensor(input_hyp, - dtype=torch.int, - device=device) - # building statement from beam - cache_batch = self.predictor.cache_to_batch( - [s.cache for s in beam_init]) - # build score tensor to do torch.add() function - scores = torch.tensor([s.score for s in beam_init]).to(device) - - # 3.2 forward decoder - logp, new_cache = self.forward_decoder_one_step( - encoder_out[:, i, :].unsqueeze(1), - input_hyp_tensor, - cache_batch, - ) # logp: (N, 1, 1, vocab_size) - logp = logp.squeeze(1).squeeze(1) # logp: (N, vocab_size) - new_cache = self.predictor.batch_to_cache(new_cache) - - # 3.3 shallow fusion for transducer score - # and ctc score where we can also add the LM score - logp = torch.log( - torch.add(transducer_weight * torch.exp(logp), - ctc_weight * torch.exp(ctc_probs[i].unsqueeze(0)))) - - # 3.4 first beam prune - top_k_logp, top_k_index = logp.topk(beam_size) # (N, N) - scores = torch.add(scores.unsqueeze(1), top_k_logp) - - # 3.5 generate new beam (N*N) - beam_A = [] - for j in range(len(beam_init)): - # update seq - base_seq = beam_init[j] - for t in range(beam_size): - # blank: only update the score - if top_k_index[j, t] == self.blank: - new_seq = Sequence(hyp=base_seq.hyp.copy(), - score=scores[j, t].item(), - cache=base_seq.cache) - - beam_A.append(new_seq) - # other unit: update hyp score statement and last - else: - hyp_new = base_seq.hyp.copy() - hyp_new.append(top_k_index[j, t].item()) - new_seq = Sequence(hyp=hyp_new, - score=scores[j, t].item(), - cache=new_cache[j]) - beam_A.append(new_seq) - - # 3.6 prefix fusion - fusion_A = [beam_A[0]] - for j in range(1, len(beam_A)): - s1 = beam_A[j] - if_do_append = True - for t in range(len(fusion_A)): - # notice: A_ can not fusion with A - if s1.hyp == fusion_A[t].hyp: - fusion_A[t].score = log_add( - [fusion_A[t].score, s1.score]) - if_do_append = False - break - if if_do_append: - fusion_A.append(s1) - - # 4. 
second pruned - fusion_A.sort(key=lambda x: x.score, reverse=True) - beam_init = fusion_A[:beam_size] - - return beam_init, encoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transducer/transducer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transducer/transducer.py deleted file mode 100644 index 821a0946e621353a18bededbd93a658e83b0e0e2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transducer/transducer.py +++ /dev/null @@ -1,453 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchaudio -from torch import nn -from torch.nn.utils.rnn import pad_sequence -from typeguard import check_argument_types -from wenet.transducer.predictor import PredictorBase -from wenet.transducer.search.greedy_search import basic_greedy_search -from wenet.transducer.search.prefix_beam_search import PrefixBeamSearch -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_blank, add_sos_eos, - reverse_pad_list) - - -class Transducer(ASRModel): - """Transducer-ctc-attention hybrid Encoder-Predictor-Decoder model""" - - def __init__( - self, - vocab_size: int, - blank: int, - encoder: nn.Module, - predictor: PredictorBase, - joint: nn.Module, - attention_decoder: Optional[Union[TransformerDecoder, - BiTransformerDecoder]] = None, - ctc: Optional[CTC] = None, - ctc_weight: float = 0, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - transducer_weight: float = 1.0, - attention_weight: float = 0.0, - ) -> None: - assert check_argument_types() - assert attention_weight + ctc_weight + transducer_weight == 1.0 - super().__init__(vocab_size, encoder, attention_decoder, ctc, - ctc_weight, ignore_id, reverse_weight, lsm_weight, - length_normalized_loss) - - self.blank = blank - self.transducer_weight = transducer_weight - self.attention_decoder_weight = 1 - self.transducer_weight - self.ctc_weight - - self.predictor = predictor - self.joint = joint - self.bs = None - - # Note(Mddct): decoder also means predictor in transducer, - # but here decoder is attention decoder - del self.criterion_att - if attention_decoder is not None: - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + predictor + joint + loss - - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - - # Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - # predictor - ys_in_pad = add_blank(text, self.blank, self.ignore_id) - predictor_out = self.predictor(ys_in_pad) - # joint - joint_out = self.joint(encoder_out, predictor_out) - # NOTE(Mddct): some loss implementation require pad valid is zero - # torch.int32 rnnt_loss required - rnnt_text = text.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - rnnt_text_lengths = text_lengths.to(torch.int32) - encoder_out_lens = encoder_out_lens.to(torch.int32) - loss = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - encoder_out_lens, - rnnt_text_lengths, - blank=self.blank, - reduction="mean") - loss_rnnt = loss - - loss = self.transducer_weight * loss - # optional attention decoder - loss_att: Optional[torch.Tensor] = None - if self.attention_decoder_weight != 0.0 and self.decoder is not None: - loss_att, _ = self._calc_att_loss(encoder_out, encoder_mask, text, - text_lengths) - - # optional ctc - loss_ctc: Optional[torch.Tensor] = None - if self.ctc_weight != 0.0 and self.ctc is not None: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is not None: - loss = loss + self.ctc_weight * loss_ctc.sum() - if loss_att is not None: - loss = loss + self.attention_decoder_weight * loss_att.sum() - # NOTE: 'loss' must be in dict - return { - 'loss': loss, - 'loss_att': loss_att, - 'loss_ctc': loss_ctc, - 'loss_rnnt': loss_rnnt, - } - - def init_bs(self): - if self.bs is None: - self.bs = PrefixBeamSearch(self.encoder, self.predictor, - self.joint, self.ctc, self.blank) - - def _cal_transducer_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_lens: torch.Tensor, - hyps_pad: torch.Tensor, - ): - # ignore id -> blank, add blank at head - hyps_pad_blank = add_blank(hyps_pad, self.blank, self.ignore_id) - xs_in_lens = encoder_mask.squeeze(1).sum(1).int() - - # 1. Forward predictor - predictor_out = self.predictor(hyps_pad_blank) - # 2. Forward joint - joint_out = self.joint(encoder_out, predictor_out) - rnnt_text = hyps_pad.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - # 3. 
Compute transducer loss - loss_td = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - xs_in_lens, - hyps_lens.int(), - blank=self.blank, - reduction='none') - return loss_td * -1 - - def _cal_attn_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_pad: torch.Tensor, - hyps_lens: torch.Tensor, - ): - # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - - # td_score = loss_td * -1 - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - self.reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - return decoder_out, r_decoder_out - - def beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7, - ): - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight in transducer - prefix beam search. - final_prob = ctc_weight * ctc_prob + transducer_weight * transducer_prob - transducer_weight (float): transducer probability weight in - prefix beam search - Returns: - List[List[int]]: best path result - - """ - self.init_bs() - beam, _ = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size, - beam_size, - num_decoding_left_chunks, - simulate_streaming, - ctc_weight, - transducer_weight, - ) - return beam[0].hyp[1:], beam[0].score - - def transducer_attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ctc_weight: float = 0.0, - attn_weight: float = 0.0, - transducer_weight: float = 0.0, - search_ctc_weight: float = 1.0, - search_transducer_weight: float = 0.0, - beam_search_type: str = 'transducer') -> List[List[int]]: - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight using in rescoring. - rescore_prob = ctc_weight * ctc_prob + - transducer_weight * (transducer_loss * -1) + - attn_weight * attn_prob - attn_weight (float): attn probability weight using in rescoring. - transducer_weight (float): transducer probability weight using in - rescoring - search_ctc_weight (float): ctc weight using - in rnnt beam search (seeing in self.beam_search) - search_transducer_weight (float): transducer weight using - in rnnt beam search (seeing in self.beam_search) - Returns: - List[List[int]]: best path result - - """ - - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - self.init_bs() - if beam_search_type == 'transducer': - beam, encoder_out = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - beam_size=beam_size, - num_decoding_left_chunks=num_decoding_left_chunks, - ctc_weight=search_ctc_weight, - transducer_weight=search_transducer_weight, - ) - beam_score = [s.score for s in beam] - hyps = [s.hyp[1:] for s in beam] - - elif beam_search_type == 'ctc': - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, - speech_lengths, - beam_size=beam_size, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) - beam_score = [hyp[1] for hyp in hyps] - hyps = [hyp[0] for hyp in hyps] - assert len(hyps) == beam_size - - # build hyps and encoder output - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - - # 2.1 calculate transducer score - td_score = self._cal_transducer_score( - encoder_out, - encoder_mask, - hyps_lens, - hyps_pad, - ) - # 2.2 calculate attention score - decoder_out, r_decoder_out = self._cal_attn_score( - encoder_out, - encoder_mask, - hyps_pad, - hyps_lens, - ) - - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp)][self.eos] - td_s = td_score[i] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp): - r_score += r_decoder_out[i][len(hyp) - j - 1][w] - r_score += r_decoder_out[i][len(hyp)][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score = score * attn_weight + \ - beam_score[i] * ctc_weight + \ - td_s * transducer_weight - if score > best_score: - best_score = score - best_index = i - - return hyps[best_index], best_score - - def greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, 
- num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - n_steps: int = 64, - ) -> List[List[int]]: - """ greedy search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - # TODO(Mddct): batch decode - assert speech.size(0) == 1 - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - # TODO(Mddct): forward chunk by chunk - _ = simulate_streaming - # Let's assume B = batch_size - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size, - num_decoding_left_chunks, - ) - encoder_out_lens = encoder_mask.squeeze(1).sum() - hyps = basic_greedy_search(self, - encoder_out, - encoder_out_lens, - n_steps=n_steps) - - return hyps - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def forward_predictor_step( - self, xs: torch.Tensor, cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - assert len(cache) == 2 - # fake padding - padding = torch.zeros(1, 1) - return self.predictor.forward_step(xs, padding, cache) - - @torch.jit.export - def forward_joint_step(self, enc_out: torch.Tensor, - pred_out: torch.Tensor) -> torch.Tensor: - return self.joint(enc_out, pred_out) - - @torch.jit.export - def forward_predictor_init_state(self) -> List[torch.Tensor]: - return self.predictor.init_state(1, device=torch.device("cpu")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/asr_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/asr_model.py deleted file mode 100644 index 4288f68472d63ce4bf270c5f377d62fa7408713e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/asr_model.py +++ /dev/null @@ -1,904 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -from collections import defaultdict -from typing import Dict, List, Optional, Tuple - -import torch - -from torch.nn.utils.rnn import pad_sequence - -try: - import k2 - from icefall.utils import get_texts - from icefall.decode import get_lattice, Nbest, one_best_decoding -except ImportError: - print('Failed to import k2 and icefall. \ - Notice that they are necessary for hlg_onebest and hlg_rescore') - -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import TransformerEncoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, - remove_duplicates_and_blank, th_accuracy, - reverse_pad_list) -from wenet.utils.mask import (make_pad_mask, mask_finished_preds, - mask_finished_scores, subsequent_mask) - - -class ASRModel(torch.nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - def __init__( - self, - vocab_size: int, - encoder: TransformerEncoder, - decoder: TransformerDecoder, - ctc: CTC, - ctc_weight: float = 0.5, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - ): - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - - self.encoder = encoder - self.decoder = decoder - self.ctc = ctc - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - # 1. Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # 2a. Attention-decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths) - else: - loss_att = None - - # 2b. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is None: - loss = loss_att - elif loss_att is None: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + (1 - - self.ctc_weight) * loss_att - return {"loss": loss, "loss_att": loss_att, "loss_ctc": loss_ctc} - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, float]: - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # reverse the seq, used for right to left decoder - r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) - r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, - self.ignore_id) - # 1. Forward decoder - decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, - ys_in_pad, ys_in_lens, - r_ys_in_pad, - self.reverse_weight) - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - r_loss_att = torch.tensor(0.0) - if self.reverse_weight > 0.0: - r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * ( - 1 - self.reverse_weight) + r_loss_att * self.reverse_weight - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - return loss_att, acc_att - - def _forward_encoder( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Let's assume B = batch_size - # 1. Encoder - if simulate_streaming and decoding_chunk_size > 0: - encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( - speech, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - else: - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - return encoder_out, encoder_mask - - def recognize( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int = 10, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> torch.Tensor: - """ Apply beam search on attention decoder - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - torch.Tensor: decoding result, (batch, max_result_len) - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = torch.ones([running_size, 1], dtype=torch.long, - device=device).fill_(self.sos) # (B*N, 1) - scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), - dtype=torch.float) - scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) - cache: Optional[List[torch.Tensor]] = None - # 2. Decoder forward step by step - for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: - break - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) - logp, cache = self.decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - # 2.3 Second beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - # Update cache to be consistent with new topk scores / hyps - cache_index = (offset_k_index // beam_size).view(-1) # (B*N) - base_cache_index = (torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) * beam_size).view(-1) # (B*N) - cache_index = base_cache_index + cache_index - cache = [torch.index_select(c, dim=0, index=cache_index) for c in cache] - scores = scores.view(-1, 1) # (B*N, 1) - # 2.4. Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = torch.index_select(top_k_index.view(-1), - dim=-1, - index=best_k_index) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = torch.index_select( - hyps, dim=0, index=best_hyps_index) # (B*N, i) - hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) - - # 3. 
Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_scores, best_index = scores.max(dim=-1) - best_hyps_index = best_index + torch.arange( - batch_size, dtype=torch.long, device=device) * beam_size - best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) - best_hyps = best_hyps[:, 1:] - return best_hyps, best_scores - - def ctc_greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[List[int]]: - """ Apply CTC greedy search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # Let's assume B = batch_size - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - return hyps, scores - - def _ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[List[List[int]], torch.Tensor]: - """ CTC prefix beam search inner implementation - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[List[int]]: nbest results - torch.Tensor: encoder output, (1, max_len, encoder_dim), - it will be used for rescoring in attention rescoring mode - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # For CTC prefix beam search, we only support batch_size=1 - assert batch_size == 1 - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder forward and get CTC score - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - ctc_probs = ctc_probs.squeeze(0) - # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) - cur_hyps = [(tuple(), (0.0, -float('inf')))] - # 2. CTC beam search step by step - for t in range(0, maxlen): - logp = ctc_probs[t] # (vocab_size,) - # key: prefix, value (pb, pnb), default value(-inf, -inf) - next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) - # 2.1 First beam prune: select topk best - top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) - for s in top_k_index: - s = s.item() - ps = logp[s].item() - for prefix, (pb, pnb) in cur_hyps: - last = prefix[-1] if len(prefix) > 0 else None - if s == 0: # blank - n_pb, n_pnb = next_hyps[prefix] - n_pb = log_add([n_pb, pb + ps, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - elif s == last: - # Update *ss -> *s; - n_pb, n_pnb = next_hyps[prefix] - n_pnb = log_add([n_pnb, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - # Update *s-s -> *ss, - is for blank - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - else: - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - - # 2.2 Second beam prune - next_hyps = sorted(next_hyps.items(), - key=lambda x: log_add(list(x[1])), - reverse=True) - cur_hyps = next_hyps[:beam_size] - hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] - return hyps, encoder_out - - def ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[int]: - """ Apply CTC prefix beam search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[int]: CTC prefix beam search nbest results - """ - hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, - beam_size, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) - return hyps[0] - - def attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - ctc_weight: float = 0.0, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ) -> List[int]: - """ Apply attention rescoring decoding, CTC prefix beam search - is applied first to get nbest, then we resoring the nbest on - attention decoder with corresponding encoder out - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - reverse_weight (float): right to left decoder weight - ctc_weight (float): ctc score weight - - Returns: - List[int]: Attention rescoring result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, speech_lengths, beam_size, decoding_chunk_size, - num_decoding_left_chunks, simulate_streaming) - - assert len(hyps) == beam_size - hyps_pad = pad_sequence([ - torch.tensor(hyp[0], device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp[0]): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp[0])][self.eos] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp[0]): - r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] - r_score += r_decoder_out[i][len(hyp[0])][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score += hyp[1] * ctc_weight - if score > best_score: - best_score = score - best_index = i - return hyps[best_index][0], best_score - - def load_hlg_resource_if_necessary(self, hlg, word): - if not hasattr(self, 'hlg'): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.hlg = k2.Fsa.from_dict(torch.load(hlg, map_location=device)) - if not hasattr(self.hlg, "lm_scores"): - self.hlg.lm_scores = self.hlg.scores.clone() - if not hasattr(self, 'word_table'): - self.word_table = {} - with open(word, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - self.word_table[int(arr[1])] = arr[0] - - @torch.no_grad() - def hlg_onebest( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - best_path = one_best_decoding(lattice=lattice, use_double_scores=True) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.no_grad() - def hlg_rescore( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - lm_scale: float = 0, - decoder_scale: float = 0, - r_decoder_scale: float = 0, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - device = speech.device - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - 
search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=100, - use_double_scores=True, - nbest_scale=0.5,) - nbest = nbest.intersect(lattice) - assert hasattr(nbest.fsa, "lm_scores") - assert hasattr(nbest.fsa, "tokens") - assert isinstance(nbest.fsa.tokens, torch.Tensor) - - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.tokens) - tokens = tokens.remove_values_leq(0) - hyps = tokens.tolist() - - # cal attention_score - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out_repeat = [] - tot_scores = nbest.tot_scores() - repeats = [tot_scores[i].shape[0] for i in range(tot_scores.dim0)] - for i in range(len(encoder_out)): - encoder_out_repeat.append(encoder_out[i: i + 1].repeat(repeats[i], 1, 1)) - encoder_out = torch.concat(encoder_out_repeat, dim=0) - encoder_mask = torch.ones(encoder_out.size(0), - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - reverse_weight = 0.5 - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out - - decoder_scores = torch.tensor([sum([decoder_out[i, j, hyps[i][j]] - for j in range(len(hyps[i]))]) - for i in range(len(hyps))], device=device) - r_decoder_scores = [] - for i in range(len(hyps)): - score = 0 - for j in range(len(hyps[i])): - score += r_decoder_out[i, len(hyps[i]) - j - 1, hyps[i][j]] - score += r_decoder_out[i, len(hyps[i]), self.eos] - r_decoder_scores.append(score) - r_decoder_scores = torch.tensor(r_decoder_scores, device=device) - - am_scores = nbest.compute_am_scores() - ngram_lm_scores = nbest.compute_lm_scores() - tot_scores = am_scores.values + lm_scale * ngram_lm_scores.values + \ - decoder_scale * decoder_scores + r_decoder_scale * r_decoder_scores - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.jit.export - def subsampling_rate(self) -> int: - """ Export interface for c++ call, return subsampling_rate of the - model - """ - return self.encoder.embed.subsampling_rate - - @torch.jit.export - def right_context(self) -> int: - """ Export interface for c++ call, return right_context of the model - """ - return self.encoder.embed.right_context - - @torch.jit.export - def sos_symbol(self) -> int: - """ Export interface for c++ call, return sos symbol id of the model - """ - return self.sos - - @torch.jit.export - def eos_symbol(self) -> int: - """ Export interface for c++ call, return eos symbol id of the model - """ - return self.eos - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, give input chunk xs, and return - output from time 0 to current chunk. - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor: - """ Export interface for c++ call, apply linear transform and log - softmax before ctc - Args: - xs (torch.Tensor): encoder output - - Returns: - torch.Tensor: activation before ctc - - """ - return self.ctc.log_softmax(xs) - - @torch.jit.export - def is_bidirectional_decoder(self) -> bool: - """ - Returns: - torch.Tensor: decoder output - """ - if hasattr(self.decoder, 'right_decoder'): - return True - else: - return False - - @torch.jit.export - def forward_attention_decoder( - self, - hyps: torch.Tensor, - hyps_lens: torch.Tensor, - encoder_out: torch.Tensor, - reverse_weight: float = 0, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, forward decoder with multiple - hypothesis from ctc prefix beam search and one encoder output - Args: - hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad sos at the begining - hyps_lens (torch.Tensor): length of each hyp in hyps - encoder_out (torch.Tensor): corresponding encoder output - r_hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad eos at the begining which is used fo right to left decoder - reverse_weight: used for verfing whether used right to left decoder, - > 0 will use. - - Returns: - torch.Tensor: decoder output - """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps - encoder_out = encoder_out.repeat(num_hyps, 1, 1) - encoder_mask = torch.ones(num_hyps, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=encoder_out.device) - - # input for right to left decoder - # this hyps_lens has count token, we need minus it. - r_hyps_lens = hyps_lens - 1 - # this hyps has included token, so it should be - # convert the original hyps. - r_hyps = hyps[:, 1:] - # >>> r_hyps - # >>> tensor([[ 1, 2, 3], - # >>> [ 9, 8, 4], - # >>> [ 2, -1, -1]]) - # >>> r_hyps_lens - # >>> tensor([3, 3, 1]) - - # NOTE(Mddct): `pad_sequence` is not supported by ONNX, it is used - # in `reverse_pad_list` thus we have to refine the below code. 
- # Issue: https://github.com/wenet-e2e/wenet/issues/1113 - # Equal to: - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = torch.max(r_hyps_lens) - index_range = torch.arange(0, max_len, 1).to(encoder_out.device) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - # >>> seq_mask - # >>> tensor([[ True, True, True], - # >>> [ True, True, True], - # >>> [ True, False, False]]) - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - r_hyps = torch.gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = torch.where(seq_mask, r_hyps, self.eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - r_hyps = torch.cat([hyps[:, 0:1], r_hyps], dim=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps, hyps_lens, r_hyps, - reverse_weight) # (num_hyps, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - - # right to left decoder may be not used during decoding process, - # which depends on reverse_weight param. - # r_dccoder_out will be 0.0, if reverse_weight is 0.0 - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - return decoder_out, r_decoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/attention.py deleted file mode 100644 index 6ee5e313edf2e88a844ce004c0f819b0bd3260f6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/attention.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple - -import torch -from torch import nn - - -class MultiHeadedAttention(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, n_head: int, n_feat: int, dropout_rate: float): - """Construct an MultiHeadedAttention object.""" - super().__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward_qkv( - self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor, size - (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor, size - (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor, size - (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. 
- - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - 1.When applying cross attention between decoder and encoder, - the batch padding mask for input is in (#batch, 1, T) shape. - 2.When applying self attention of encoder, - the mask is in (#batch, T, T) shape. - 3.When applying self attention of decoder, - the mask is in (#batch, L, L) shape. - 4.If the different position in decoder see different block - of the encoder, such as Mocha, the passed in mask could be - in (#batch, L, T) shape. But there is no such case in current - Wenet. - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - """ - q, k, v = self.forward_qkv(query, key, value) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. - new_cache = torch.cat((k, v), dim=-1) - - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - return self.forward_attention(v, scores, mask), new_cache - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). 
- zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/cmvn.py deleted file mode 100644 index 3a1e7457fd3788d9a7e031e96517505a65925102..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/cmvn.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -class GlobalCMVN(torch.nn.Module): - def __init__(self, - mean: torch.Tensor, - istd: torch.Tensor, - norm_var: bool = True): - """ - Args: - mean (torch.Tensor): mean stats - istd (torch.Tensor): inverse std, std which is 1.0 / std - """ - super().__init__() - assert mean.shape == istd.shape - self.norm_var = norm_var - # The buffer can be accessed from this module using self.mean - self.register_buffer("mean", mean) - self.register_buffer("istd", istd) - - def forward(self, x: torch.Tensor): - """ - Args: - x (torch.Tensor): (batch, max_len, feat_dim) - - Returns: - (torch.Tensor): normalized feature - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/convolution.py deleted file mode 100644 index 2cf9794e14ea7441ccd30ab52202ac02fb25c2b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/convolution.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). 
- """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. - new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/ctc.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/ctc.py deleted file mode 100644 index 3dfcbaa324ffc26afa9ceaeb75007eb312546326..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/ctc.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -import torch -import torch.nn.functional as F -from typeguard import check_argument_types - - -class CTC(torch.nn.Module): - """CTC module""" - def __init__( - self, - odim: int, - encoder_output_size: int, - dropout_rate: float = 0.0, - reduce: bool = True, - ): - """ Construct CTC module - Args: - odim: dimension of outputs - encoder_output_size: number of encoder projection units - dropout_rate: dropout rate (0.0 ~ 1.0) - reduce: reduce the CTC loss into a scalar - """ - assert check_argument_types() - super().__init__() - eprojs = encoder_output_size - self.dropout_rate = dropout_rate - self.ctc_lo = torch.nn.Linear(eprojs, odim) - - reduction_type = "sum" if reduce else "none" - self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) - - def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, - ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: - """Calculate CTC loss. 
- - Args: - hs_pad: batch of padded hidden state sequences (B, Tmax, D) - hlens: batch of lengths of hidden state sequences (B) - ys_pad: batch of padded character id sequence tensor (B, Lmax) - ys_lens: batch of lengths of character sequence (B) - """ - # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) - ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) - # ys_hat: (B, L, D) -> (L, B, D) - ys_hat = ys_hat.transpose(0, 1) - ys_hat = ys_hat.log_softmax(2) - loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) - # Batch-size average - loss = loss / ys_hat.size(1) - return loss - - def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """log_softmax of frame activations - - Args: - Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) - """ - return F.log_softmax(self.ctc_lo(hs_pad), dim=2) - - def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """argmax of frame activations - - Args: - torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: argmax applied 2d tensor (B, Tmax) - """ - return torch.argmax(self.ctc_lo(hs_pad), dim=2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/decoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/decoder.py deleted file mode 100644 index c31853d9e868c99290b8d597f53d9a680202c82c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/decoder.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Decoder definition.""" -from typing import Tuple, List, Optional - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.decoder_layer import DecoderLayer -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.utils.mask import (subsequent_mask, make_pad_mask) - - -class TransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. 
- concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__() - attention_dim = encoder_output_size - - if input_layer == "embed": - self.embed = torch.nn.Sequential( - torch.nn.Embedding(vocab_size, attention_dim), - PositionalEncoding(attention_dim, positional_dropout_rate), - ) - else: - raise ValueError(f"only 'embed' is supported: {input_layer}") - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) - self.use_output_layer = use_output_layer - self.output_layer = torch.nn.Linear(attention_dim, vocab_size) - self.num_blocks = num_blocks - self.decoders = torch.nn.ModuleList([ - DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(self.num_blocks) - ]) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor = torch.empty(0), - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: not used in transformer decoder, in order to unify api - with bidirectional decoder - reverse_weight: not used in transformer decoder, in order to unify - api with bidirectional decode - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - torch.tensor(0.0), in order to unify api with bidirectional decoder - olens: (batch, ) - """ - tgt = ys_in_pad - maxlen = tgt.size(1) - # tgt_mask: (B, 1, L) - tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) - tgt_mask = tgt_mask.to(tgt.device) - # m: (1, L, L) - m = subsequent_mask(tgt_mask.size(-1), - device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m - x, _ = self.embed(tgt) - for layer in self.decoders: - x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, - memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.use_output_layer: - x = self.output_layer(x) - olens = tgt_mask.sum(1) - return x, torch.tensor(0.0), olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - x, _ = self.embed(tgt) - new_cache = [] - for i, decoder in enumerate(self.decoders): - if cache is None: - c = None - else: - c = cache[i] - x, tgt_mask, memory, memory_mask = decoder(x, - tgt_mask, - memory, - memory_mask, - cache=c) - new_cache.append(x) - if self.normalize_before: - y = self.after_norm(x[:, -1]) - else: - y = x[:, -1] - if self.use_output_layer: - y = torch.log_softmax(self.output_layer(y), dim=-1) - return y, new_cache - - -class BiTransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - r_num_blocks: the number of right to left decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - r_num_blocks: int = 0, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - - assert check_argument_types() - super().__init__() - self.left_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - self.right_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - r_num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor, - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), - used for right to left decoder - reverse_weight: used for right to left decoder - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - r_x: x: decoded token score (right to left decoder) - before softmax (batch, maxlen_out, vocab_size) - if use_output_layer is True, - olens: (batch, ) - """ - l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, - ys_in_lens) - r_x = torch.tensor(0.0) - if reverse_weight > 0.0: - r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, - ys_in_lens) - return l_x, r_x, olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - return self.left_decoder.forward_one_step(memory, memory_mask, tgt, - tgt_mask, cache) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/decoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/decoder_layer.py deleted file mode 100644 index 6b52aa6ab730dc51b18f0787e8236ab10c1e9cad..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/decoder_layer.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoder self-attention layer definition.""" -from typing import Optional, Tuple - -import torch -from torch import nn - - -class DecoderLayer(nn.Module): - """Single decoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn (torch.nn.Module): Inter-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. 
- dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's inpu - and output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: nn.Module, - src_attn: nn.Module, - feed_forward: nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an DecoderLayer object.""" - super().__init__() - self.size = size - self.self_attn = self_attn - self.src_attn = src_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.norm3 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear1 = nn.Linear(size + size, size) - self.concat_linear2 = nn.Linear(size + size, size) - else: - self.concat_linear1 = nn.Identity() - self.concat_linear2 = nn.Identity() - - def forward( - self, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - memory: torch.Tensor, - memory_mask: torch.Tensor, - cache: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute decoded features. - - Args: - tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). - tgt_mask (torch.Tensor): Mask for input tensor - (#batch, maxlen_out). - memory (torch.Tensor): Encoded memory - (#batch, maxlen_in, size). - memory_mask (torch.Tensor): Encoded memory mask - (#batch, maxlen_in). - cache (torch.Tensor): cached tensors. - (#batch, maxlen_out - 1, size). - - Returns: - torch.Tensor: Output tensor (#batch, maxlen_out, size). - torch.Tensor: Mask for output tensor (#batch, maxlen_out). - torch.Tensor: Encoded memory (#batch, maxlen_in, size). - torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
- - """ - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - - if cache is None: - tgt_q = tgt - tgt_q_mask = tgt_mask - else: - # compute only the last frame query keeping dim: max_time_out -> 1 - assert cache.shape == ( - tgt.shape[0], - tgt.shape[1] - 1, - self.size, - ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" - tgt_q = tgt[:, -1:, :] - residual = residual[:, -1:, :] - tgt_q_mask = tgt_mask[:, -1:, :] - - if self.concat_after: - tgt_concat = torch.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) - x = residual + self.concat_linear1(tgt_concat) - else: - x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - if self.concat_after: - x_concat = torch.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) - x = residual + self.concat_linear2(x_concat) - else: - x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) - if not self.normalize_before: - x = self.norm2(x) - - residual = x - if self.normalize_before: - x = self.norm3(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm3(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, tgt_mask, memory, memory_mask diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/embedding.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/embedding.py deleted file mode 100644 index 611a927864d93c3ad8357f66c780bf537b2a4d67..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/embedding.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Positonal Encoding Module.""" - -import math -from typing import Tuple, Union - -import torch -import torch.nn.functional as F - -class PositionalEncoding(torch.nn.Module): - """Positional encoding. 
- - :param int d_model: embedding dim - :param float dropout_rate: dropout rate - :param int max_len: maximum input length - - PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) - PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) - """ - def __init__(self, - d_model: int, - dropout_rate: float, - max_len: int = 5000, - reverse: bool = False): - """Construct an PositionalEncoding object.""" - super().__init__() - self.d_model = d_model - self.xscale = math.sqrt(self.d_model) - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - - self.pe = torch.zeros(self.max_len, self.d_model) - position = torch.arange(0, self.max_len, - dtype=torch.float32).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = torch.sin(position * div_term) - self.pe[:, 1::2] = torch.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input. Its shape is (batch, time, ...) - offset (int, torch.tensor): position offset - - Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) - torch.Tensor: for compatibility to RelPositionalEncoding - """ - - self.pe = self.pe.to(x.device) - pos_emb = self.position_encoding(offset, x.size(1), False) - x = x * self.xscale + pos_emb - return self.dropout(x), self.dropout(pos_emb) - - def position_encoding(self, offset: Union[int, torch.Tensor], size: int, - apply_dropout: bool = True) -> torch.Tensor: - """ For getting encoding in a streaming fashion - - Attention!!!!! - we apply dropout only once at the whole utterance level in a none - streaming way, but will call this function several times with - increasing input size in a streaming scenario, so the dropout will - be applied several times. - - Args: - offset (int or torch.tensor): start offset - size (int): required size of position encoding - - Returns: - torch.Tensor: Corresponding encoding - """ - # How to subscript a Union type: - # https://github.com/pytorch/pytorch/issues/69434 - if isinstance(offset, int): - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - else: # for batched streaming decoding on GPU - assert torch.max(offset) + size < self.max_len - index = offset.unsqueeze(1) + \ - torch.arange(0, size).to(offset.device) # B X T - flag = index > 0 - # remove negative offset - index = index * flag - pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model - - if apply_dropout: - pos_emb = self.dropout(pos_emb) - return pos_emb - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding module. - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - """ - def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): - """Initialize class.""" - super().__init__(d_model, dropout_rate, max_len, reverse=True) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Compute positional encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). 
- Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Positional embedding tensor (1, time, `*`). - """ - self.pe = self.pe.to(x.device) - x = x * self.xscale - pos_emb = self.position_encoding(offset, x.size(1), False) - return self.dropout(x), self.dropout(pos_emb) - - -class NoPositionalEncoding(torch.nn.Module): - """ No position encoding - """ - def __init__(self, d_model: int, dropout_rate: float): - super().__init__() - self.d_model = d_model - self.dropout = torch.nn.Dropout(p=dropout_rate) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """ Just return zero vector for interface compatibility - """ - pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) - return self.dropout(x), pos_emb - - def position_encoding( - self, offset: Union[int, torch.Tensor], size: int) -> torch.Tensor: - return torch.zeros(1, size, self.d_model) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/encoder.py deleted file mode 100644 index bb2ec65827548bd1242cb3b367cb3983c2de6119..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/encoder.py +++ /dev/null @@ -1,462 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder definition.""" -from typing import Tuple - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.convolution import ConvolutionModule -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.encoder_layer import TransformerEncoderLayer -from wenet.transformer.encoder_layer import ConformerEncoderLayer -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class BaseEncoder(torch.nn.Module): - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ - Args: - input_size (int): input dim - output_size (int): dimension of attention - attention_heads (int): the number of heads of multi head attention - linear_units (int): the hidden units number of position-wise feed - forward - num_blocks (int): the number of decoder blocks - dropout_rate (float): dropout rate - attention_dropout_rate (float): dropout rate in attention - positional_dropout_rate (float): dropout rate after adding - positional encoding - input_layer (str): input layer type. - optional [linear, conv2d, conv2d6, conv2d8] - pos_enc_layer_type (str): Encoder positional encoding layer type. - opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] - normalize_before (bool): - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after (bool): whether to concat attention layer's input - and output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - static_chunk_size (int): chunk size for static chunk training and - decoding - use_dynamic_chunk (bool): whether use dynamic chunk size for - training or not, You can only use fixed chunk(chunk_size > 0) - or dyanmic chunk size(use_dynamic_chunk = True) - global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module - use_dynamic_left_chunk (bool): whether use dynamic left chunk in - dynamic chunk training - """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks - - -class TransformerEncoder(BaseEncoder): - """Transformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ Construct TransformerEncoder - - See Encoder for the meaning of each parameter. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - self.encoders = torch.nn.ModuleList([ - TransformerEncoderLayer( - output_size, - MultiHeadedAttention(attention_heads, output_size, - attention_dropout_rate), - PositionwiseFeedForward(output_size, linear_units, - dropout_rate), dropout_rate, - normalize_before, concat_after) for _ in range(num_blocks) - ]) - - -class ConformerEncoder(BaseEncoder): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - positionwise_conv_kernel_size: int = 1, - macaron_style: bool = True, - selfattention_layer_type: str = "rel_selfattn", - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - ): - """Construct ConformerEncoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - positionwise_conv_kernel_size (int): Kernel size of positionwise - conv1d layer. - macaron_style (bool): Whether to use macaron style for - positionwise layer. - selfattention_layer_type (str): Encoder attention layer type, - the parameter has no effect now, it's just for configure - compatibility. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, - cnn_module_norm, causal) - - self.encoders = torch.nn.ModuleList([ - ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(num_blocks) - ]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/encoder_layer.py deleted file mode 100644 index 6b4629a6802a90422fa1494f82f46488f2553c16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/encoder_layer.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple - -import torch -from torch import nn - - -class TransformerEncoderLayer(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: torch.nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): just for interface compatibility - to ConformerEncoderLayer - mask_pad (torch.Tensor): does not used in transformer layer, - just for unified api with conformer. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2), not used here, it's for interface - compatibility to ConformerEncoderLayer. - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). - - """ - residual = x - if self.normalize_before: - x = self.norm1(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, cache=att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - return x, mask, new_att_cache, fake_cnn_cache - - -class ConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/label_smoothing_loss.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/label_smoothing_loss.py deleted file mode 100644 index 428fedcb0eb4345cd1361c97008a9afcd94ac171..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/label_smoothing_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Label smoothing module.""" - -import torch -from torch import nn - - -class LabelSmoothingLoss(nn.Module): - """Label-smoothing loss. - - In a standard CE loss, the label's data distribution is: - [0,1,2] -> - [ - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0], - ] - - In the smoothing version CE Loss,some probabilities - are taken from the true label prob (1.0) and are divided - among other labels. - - e.g. 
- smoothing=0.1 - [0,1,2] -> - [ - [0.9, 0.05, 0.05], - [0.05, 0.9, 0.05], - [0.05, 0.05, 0.9], - ] - - Args: - size (int): the number of class - padding_idx (int): padding class id which will be ignored for loss - smoothing (float): smoothing rate (0.0 means the conventional CE) - normalize_length (bool): - normalize loss by sequence length if True - normalize loss by batch size if False - """ - def __init__(self, - size: int, - padding_idx: int, - smoothing: float, - normalize_length: bool = False): - """Construct an LabelSmoothingLoss object.""" - super(LabelSmoothingLoss, self).__init__() - self.criterion = nn.KLDivLoss(reduction="none") - self.padding_idx = padding_idx - self.confidence = 1.0 - smoothing - self.smoothing = smoothing - self.size = size - self.normalize_length = normalize_length - - def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """Compute loss between x and target. - - The model outputs and data labels tensors are flatten to - (batch*seqlen, class) shape and a mask is applied to the - padding part which should not be calculated for loss. - - Args: - x (torch.Tensor): prediction (batch, seqlen, class) - target (torch.Tensor): - target signal masked with self.padding_id (batch, seqlen) - Returns: - loss (torch.Tensor) : The KL loss, scalar float value - """ - assert x.size(2) == self.size - batch_size = x.size(0) - x = x.view(-1, self.size) - target = target.view(-1) - # use zeros_like instead of torch.no_grad() for true_dist, - # since no_grad() can not be exported by JIT - true_dist = torch.zeros_like(x) - true_dist.fill_(self.smoothing / (self.size - 1)) - ignore = target == self.padding_idx # (B,) - total = len(target) - ignore.sum().item() - target = target.masked_fill(ignore, 0) # avoid -1 index - true_dist.scatter_(1, target.unsqueeze(1), self.confidence) - kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) - denom = total if self.normalize_length else batch_size - return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/positionwise_feed_forward.py deleted file mode 100644 index 73ba239e3f1e68f65650961f2c4ee6758729a06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/positionwise_feed_forward.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. 
- dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU()): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/subsampling.py deleted file mode 100644 index 5f2823eedf0e623188d6af6680fa50ca44b47877..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/subsampling.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch - - -class BaseSubsampling(torch.nn.Module): - def __init__(self): - super().__init__() - self.right_context = 0 - self.subsampling_rate = 1 - - def position_encoding(self, offset: Union[int, torch.Tensor], - size: int) -> torch.Tensor: - return self.pos_enc.position_encoding(offset, size) - - -class LinearNoSubsampling(BaseSubsampling): - """Linear transform the input without subsampling - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an linear object.""" - super().__init__() - self.out = torch.nn.Sequential( - torch.nn.Linear(idim, odim), - torch.nn.LayerNorm(odim, eps=1e-5), - torch.nn.Dropout(dropout_rate), - ) - self.pos_enc = pos_enc_class - self.right_context = 0 - self.subsampling_rate = 1 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Input x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: linear input tensor (#batch, time', odim), - where time' = time . - torch.Tensor: linear input mask (#batch, 1, time'), - where time' = time . - - """ - x = self.out(x) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask - - -class Conv2dSubsampling4(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). 
- - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 4. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 4. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] - - -class Conv2dSubsampling6(BaseSubsampling): - """Convolutional 2D subsampling (to 1/6 length). - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (torch.nn.Module): Custom position encoding layer. - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling6 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 5, 3), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), - odim) - self.pos_enc = pos_enc_class - # 10 = (3 - 1) * 1 + (5 - 1) * 2 - self.subsampling_rate = 6 - self.right_context = 10 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 6. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 6. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] - - -class Conv2dSubsampling8(BaseSubsampling): - """Convolutional 2D subsampling (to 1/8 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling8 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear( - odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) - self.pos_enc = pos_enc_class - self.subsampling_rate = 8 - # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 - self.right_context = 14 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 8. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 8. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/swish.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/swish.py deleted file mode 100644 index b4250f5c93104f38958d145572e363256e03fcb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/transformer/swish.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) -# 2020 Northwestern Polytechnical University (Pengcheng Guo) -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Swish() activation function for Conformer.""" - -import torch - - -class Swish(torch.nn.Module): - """Construct an Swish object.""" - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Return Swish activation function.""" - return x * torch.sigmoid(x) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/checkpoint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/checkpoint.py deleted file mode 100644 index 8e0c413c79c34cd667240357d7ef9eab816a885c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/checkpoint.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import re - -import yaml -import torch -from collections import OrderedDict - -import datetime - - -def load_checkpoint(model: torch.nn.Module, path: str) -> dict: - if torch.cuda.is_available(): - logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) - checkpoint = torch.load(path) - else: - logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) - checkpoint = torch.load(path, map_location='cpu') - model.load_state_dict(checkpoint, strict=False) - info_path = re.sub('.pt$', '.yaml', path) - configs = {} - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - return configs - - -def save_checkpoint(model: torch.nn.Module, path: str, infos=None): - ''' - Args: - infos (dict or None): any info you want to save. - ''' - logging.info('Checkpoint: save to checkpoint %s' % path) - if isinstance(model, torch.nn.DataParallel): - state_dict = model.module.state_dict() - elif isinstance(model, torch.nn.parallel.DistributedDataParallel): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save(state_dict, path) - info_path = re.sub('.pt$', '.yaml', path) - if infos is None: - infos = {} - infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') - with open(info_path, 'w') as fout: - data = yaml.dump(infos) - fout.write(data) - - -def filter_modules(model_state_dict, modules): - new_mods = [] - incorrect_mods = [] - mods_model = model_state_dict.keys() - for mod in modules: - if any(key.startswith(mod) for key in mods_model): - new_mods += [mod] - else: - incorrect_mods += [mod] - if incorrect_mods: - logging.warning( - "module(s) %s don't match or (partially match) " - "available modules in model.", - incorrect_mods, - ) - logging.warning("for information, the existing modules in model are:") - logging.warning("%s", mods_model) - - return new_mods - - -def load_trained_modules(model: torch.nn.Module, args: None): - # Load encoder modules with pre-trained model(s). 
- enc_model_path = args.enc_init - enc_modules = args.enc_init_mods - main_state_dict = model.state_dict() - logging.warning("model(s) found for pre-initialization") - if os.path.isfile(enc_model_path): - logging.info('Checkpoint: loading from checkpoint %s for CPU' % - enc_model_path) - model_state_dict = torch.load(enc_model_path, map_location='cpu') - modules = filter_modules(model_state_dict, enc_modules) - partial_state_dict = OrderedDict() - for key, value in model_state_dict.items(): - if any(key.startswith(m) for m in modules): - partial_state_dict[key] = value - main_state_dict.update(partial_state_dict) - else: - logging.warning("model was not found : %s", enc_model_path) - - model.load_state_dict(main_state_dict) - configs = {} - return configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/cmvn.py deleted file mode 100644 index 3101c619f54991c947124f393f3459c317356a2f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/cmvn.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import math - -import numpy as np - - -def _load_json_cmvn(json_cmvn_file): - """ Load the json format cmvn stats file and calculate cmvn - - Args: - json_cmvn_file: cmvn stats file in json format - - Returns: - a numpy array of [means, vars] - """ - with open(json_cmvn_file) as f: - cmvn_stats = json.load(f) - - means = cmvn_stats['mean_stat'] - variance = cmvn_stats['var_stat'] - count = cmvn_stats['frame_num'] - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def _load_kaldi_cmvn(kaldi_cmvn_file): - """ Load the kaldi format cmvn stats file and calculate cmvn - - Args: - kaldi_cmvn_file: kaldi text style global cmvn file, which - is generated by: - compute-cmvn-stats --binary=false scp:feats.scp global_cmvn - - Returns: - a numpy array of [means, vars] - """ - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def load_cmvn(cmvn_file, is_json): - if is_json: - cmvn = _load_json_cmvn(cmvn_file) - else: - cmvn = _load_kaldi_cmvn(cmvn_file) - return cmvn[0], cmvn[1] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/common.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/common.py deleted file mode 100644 index 74238d59aefbf227fe6b811703af17550bc7f8f0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/common.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -"""Unility functions for Transformer.""" - -import math -from typing import List, Tuple - -import torch -from torch.nn.utils.rnn import pad_sequence - -IGNORE_ID = -1 - - -def pad_list(xs: List[torch.Tensor], pad_value: int): - """Perform padding for the list of tensors. - - Args: - xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 
- pad_value (float): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tmax, `*`). - - Examples: - >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) - - """ - n_batch = len(xs) - max_len = max([x.size(0) for x in xs]) - pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) - pad = pad.fill_(pad_value) - for i in range(n_batch): - pad[i, :xs[i].size(0)] = xs[i] - - return pad - - -def add_blank(ys_pad: torch.Tensor, blank: int, - ignore_id: int) -> torch.Tensor: - """ Prepad blank for transducer predictor - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - blank (int): index of - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> blank = 0 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in = add_blank(ys_pad, 0, -1) - >>> ys_in - tensor([[0, 1, 2, 3, 4, 5], - [0, 4, 5, 6, 0, 0], - [0, 7, 8, 9, 0, 0]]) - """ - bs = ys_pad.size(0) - _blank = torch.tensor([blank], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _blank = _blank.repeat(bs).unsqueeze(1) # [bs,1] - out = torch.cat([_blank, ys_pad], dim=1) # [bs, Lmax+1] - return torch.where(out == ignore_id, blank, out) - - -def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, - ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Add and labels. - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - sos (int): index of - eos (int): index of - ignore_id (int): index of padding - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - ys_out (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> sos_id = 10 - >>> eos_id = 11 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) - >>> ys_in - tensor([[10, 1, 2, 3, 4, 5], - [10, 4, 5, 6, 11, 11], - [10, 7, 8, 9, 11, 11]]) - >>> ys_out - tensor([[ 1, 2, 3, 4, 5, 11], - [ 4, 5, 6, 11, -1, -1], - [ 7, 8, 9, 11, -1, -1]]) - """ - _sos = torch.tensor([sos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _eos = torch.tensor([eos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys - ys_in = [torch.cat([_sos, y], dim=0) for y in ys] - ys_out = [torch.cat([y, _eos], dim=0) for y in ys] - return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) - - -def reverse_pad_list(ys_pad: torch.Tensor, - ys_lens: torch.Tensor, - pad_value: float = -1.0) -> torch.Tensor: - """Reverse padding for the list of tensors. - - Args: - ys_pad (tensor): The padded tensor (B, Tokenmax). - ys_lens (tensor): The lens of token seqs (B) - pad_value (int): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tokenmax). - - Examples: - >>> x - tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) - >>> pad_list(x, 0) - tensor([[4, 3, 2, 1], - [7, 6, 5, 0], - [9, 8, 0, 0]]) - - """ - r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) - for y, i in zip(ys_pad, ys_lens)], True, - pad_value) - return r_ys_pad - - -def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, - ignore_label: int) -> float: - """Calculate accuracy. - - Args: - pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 
- pad_targets (LongTensor): Target label tensors (B, Lmax). - ignore_label (int): Ignore label id. - - Returns: - float: Accuracy value (0.0 - 1.0). - - """ - pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), - pad_outputs.size(1)).argmax(2) - mask = pad_targets != ignore_label - numerator = torch.sum( - pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - denominator = torch.sum(mask) - return float(numerator) / float(denominator) - - -def get_rnn(rnn_type: str) -> torch.nn.Module: - assert rnn_type in ["rnn", "lstm", "gru"] - if rnn_type == "rnn": - return torch.nn.RNN - elif rnn_type == "lstm": - return torch.nn.LSTM - else: - return torch.nn.GRU - - -def get_activation(act): - """Return activation function.""" - # Lazy load to avoid unused import - from wenet.transformer.swish import Swish - - activation_funcs = { - "hardtanh": torch.nn.Hardtanh, - "tanh": torch.nn.Tanh, - "relu": torch.nn.ReLU, - "selu": torch.nn.SELU, - "swish": getattr(torch.nn, "SiLU", Swish), - "gelu": torch.nn.GELU - } - - return activation_funcs[act]() - - -def get_subsample(config): - input_layer = config["encoder_conf"]["input_layer"] - assert input_layer in ["conv2d", "conv2d6", "conv2d8"] - if input_layer == "conv2d": - return 4 - elif input_layer == "conv2d6": - return 6 - elif input_layer == "conv2d8": - return 8 - - -def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - if hyp[cur] != 0: - new_hyp.append(hyp[cur]) - prev = cur - while cur < len(hyp) and hyp[cur] == hyp[prev]: - cur += 1 - return new_hyp - - -def replace_duplicates_with_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - new_hyp.append(hyp[cur]) - prev = cur - cur += 1 - while cur < len(hyp) and hyp[cur] == hyp[prev] and hyp[cur] != 0: - new_hyp.append(0) - cur += 1 - return new_hyp - - -def log_add(args: List[int]) -> float: - """ - Stable log add - """ - if all(a == -float('inf') for a in args): - return -float('inf') - a_max = max(args) - lsp = math.log(sum(math.exp(a - a_max) for a in args)) - return a_max + lsp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/config.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/config.py deleted file mode 100644 index 50170ced44534d3ee6532a2f87fcd78c5148f7e7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 Shaoshang Qi -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
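The CTC helpers above are easiest to follow on a tiny input: `remove_duplicates_and_blank` collapses repeated tokens and drops blanks (id 0), and `log_add` is a numerically stable log-sum-exp. A small usage sketch, restating the two functions verbatim so the snippet runs standalone:

```python
# Standalone demo of the greedy-CTC collapse and stable log-add helpers above.
import math
from typing import List


def remove_duplicates_and_blank(hyp: List[int]) -> List[int]:
    new_hyp, cur = [], 0
    while cur < len(hyp):
        if hyp[cur] != 0:          # 0 is the blank id
            new_hyp.append(hyp[cur])
        prev = cur
        while cur < len(hyp) and hyp[cur] == hyp[prev]:
            cur += 1               # skip the run of repeated symbols
    return new_hyp


def log_add(args: List[float]) -> float:
    if all(a == -float('inf') for a in args):
        return -float('inf')
    a_max = max(args)
    return a_max + math.log(sum(math.exp(a - a_max) for a in args))


assert remove_duplicates_and_blank([0, 3, 3, 0, 0, 5, 5, 5, 0]) == [3, 5]
assert abs(log_add([math.log(0.2), math.log(0.3)]) - math.log(0.5)) < 1e-12
```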
- - -import copy - -def override_config(configs, override_list): - new_configs = copy.deepcopy(configs) - for item in override_list: - arr = item.split() - if len(arr) != 2: - print(f"the overrive {item} format not correct, skip it") - continue - keys = arr[0].split('.') - s_configs = new_configs - for i, key in enumerate(keys): - if key not in s_configs: - print(f"the overrive {item} format not correct, skip it") - if i == len(keys) - 1: - param_type = type(s_configs[key]) - if param_type != bool: - s_configs[key] = param_type(arr[1]) - else: - s_configs[key] = arr[1] in ['true', 'True'] - print(f"override {arr[0]} with {arr[1]}") - else: - s_configs = s_configs[key] - return new_configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/ctc_util.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/ctc_util.py deleted file mode 100644 index 73b8fb272ac153dd6d05207f352ebcf1ad14890d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/ctc_util.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch - -def insert_blank(label, blank_id=0): - """Insert blank token between every two label token.""" - label = np.expand_dims(label, 1) - blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id - label = np.concatenate([blanks, label], axis=1) - label = label.reshape(-1) - label = np.append(label, label[0]) - return label - -def forced_align(ctc_probs: torch.Tensor, - y: torch.Tensor, - blank_id=0) -> list: - """ctc forced alignment. 
- - Args: - torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) - torch.Tensor y: id sequence tensor 1d tensor (L) - int blank_id: blank symbol index - Returns: - torch.Tensor: alignment result - """ - y_insert_blank = insert_blank(y, blank_id) - - log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) - log_alpha = log_alpha - float('inf') # log of zero - state_path = (torch.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 - ) # state path - - # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] - - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): - if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ - s] == y_insert_blank[s - 2]: - candidates = torch.tensor( - [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) - prev_state = [s, s - 1] - else: - candidates = torch.tensor([ - log_alpha[t - 1, s], - log_alpha[t - 1, s - 1], - log_alpha[t - 1, s - 2], - ]) - prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] - state_path[t, s] = prev_state[torch.argmax(candidates)] - - state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) - - candidates = torch.tensor([ - log_alpha[-1, len(y_insert_blank) - 1], - log_alpha[-1, len(y_insert_blank) - 2] - ]) - prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] - state_seq[-1] = prev_state[torch.argmax(candidates)] - for t in range(ctc_probs.size(0) - 2, -1, -1): - state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] - - output_alignment = [] - for t in range(0, ctc_probs.size(0)): - output_alignment.append(y_insert_blank[state_seq[t, 0]]) - - return output_alignment diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/executor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/executor.py deleted file mode 100644 index dc0b69e6e32055566a0e8c41945f6979276e5672..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/executor.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
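Before the forced-alignment dynamic program above runs, the label sequence is interleaved with blanks via `insert_blank`. A short runnable illustration of that expansion (the helper is restated so the example is self-contained; the label values are arbitrary):

```python
# Illustrative usage of insert_blank() from the removed ctc_util.py above:
# interleave a blank id between labels and append a trailing blank.
import numpy as np


def insert_blank(label, blank_id=0):
    label = np.expand_dims(label, 1)
    blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id
    label = np.concatenate([blanks, label], axis=1)
    label = label.reshape(-1)
    label = np.append(label, label[0])
    return label


print(insert_blank(np.array([7, 7, 9])))   # -> [0 7 0 7 0 9 0]
```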
- -import logging -from contextlib import nullcontext - -# if your python version < 3.7 use the below one -# from contextlib import suppress as nullcontext -import torch -from torch.nn.utils import clip_grad_norm_ - - -class Executor: - - def __init__(self): - self.step = 0 - - def train(self, model, optimizer, scheduler, data_loader, device, writer, - args, scaler): - ''' Train one epoch - ''' - model.train() - clip = args.get('grad_clip', 50.0) - log_interval = args.get('log_interval', 10) - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - accum_grad = args.get('accum_grad', 1) - is_distributed = args.get('is_distributed', True) - use_amp = args.get('use_amp', False) - logging.info('using accumulate grad, new batch size is {} times' - ' larger than before'.format(accum_grad)) - if use_amp: - assert scaler is not None - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - model_context = model.join - else: - model_context = nullcontext - num_seen_utts = 0 - with model_context(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - context = None - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if is_distributed and batch_idx % accum_grad != 0: - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - with context(): - # autocast context - # The more details about amp can be found in - # https://pytorch.org/docs/stable/notes/amp_examples.html - with torch.cuda.amp.autocast(scaler is not None): - loss_dict = model(feats, feats_lengths, target, - target_lengths) - loss = loss_dict['loss'] / accum_grad - if use_amp: - scaler.scale(loss).backward() - else: - loss.backward() - - num_seen_utts += num_utts - if batch_idx % accum_grad == 0: - if rank == 0 and writer is not None: - writer.add_scalar('train_loss', loss, self.step) - # Use mixed precision training - if use_amp: - scaler.unscale_(optimizer) - grad_norm = clip_grad_norm_(model.parameters(), clip) - # Must invoke scaler.update() if unscale_() is used in - # the iteration to avoid the following error: - # RuntimeError: unscale_() has already been called - # on this optimizer since the last update(). - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). 
- scaler.step(optimizer) - scaler.update() - else: - grad_norm = clip_grad_norm_(model.parameters(), clip) - if torch.isfinite(grad_norm): - optimizer.step() - optimizer.zero_grad() - scheduler.step() - self.step += 1 - if batch_idx % log_interval == 0: - lr = optimizer.param_groups[0]['lr'] - log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, - loss.item() * accum_grad) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'lr {:.8f} rank {}'.format(lr, rank) - logging.debug(log_str) - - def cv(self, model, data_loader, device, args): - ''' Cross validation on - ''' - model.eval() - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - log_interval = args.get('log_interval', 10) - # in order to avoid division by 0 - num_seen_utts = 1 - total_loss = 0.0 - with torch.no_grad(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - loss_dict = model(feats, feats_lengths, target, target_lengths) - loss = loss_dict['loss'] - if torch.isfinite(loss): - num_seen_utts += num_utts - total_loss += loss.item() * num_utts - if batch_idx % log_interval == 0: - log_str = 'CV Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, loss.item()) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'history loss {:.6f}'.format(total_loss / - num_seen_utts) - log_str += ' rank {}'.format(rank) - logging.debug(log_str) - return total_loss, num_seen_utts diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/file_utils.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/file_utils.py deleted file mode 100644 index 7b7e516cc61f759267f4ef09309ff0b45110a0c1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/file_utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re - - -def read_lists(list_file): - lists = [] - with open(list_file, 'r', encoding='utf8') as fin: - for line in fin: - lists.append(line.strip()) - return lists - - -def read_non_lang_symbols(non_lang_sym_path): - """read non-linguistic symbol from file. - - The file format is like below: - - {NOISE}\n - {BRK}\n - ... - - - Args: - non_lang_sym_path: non-linguistic symbol file path, None means no any - syms. 
- - """ - if non_lang_sym_path is None: - return None - else: - syms = read_lists(non_lang_sym_path) - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - for sym in syms: - if non_lang_syms_pattern.fullmatch(sym) is None: - class BadSymbolFormat(Exception): - pass - raise BadSymbolFormat( - "Non-linguistic symbols should be " - "formatted in {xxx}//[xxx], consider" - " modify '%s' to meet the requirment. " - "More details can be found in discussions here : " - "https://github.com/wenet-e2e/wenet/pull/819" % (sym)) - return syms - - -def read_symbol_table(symbol_table_file): - symbol_table = {} - with open(symbol_table_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - symbol_table[arr[0]] = int(arr[1]) - return symbol_table diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/init_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/init_model.py deleted file mode 100644 index 4a008183ee25cd88b2fa25d93bdc3f9e3a55d31a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/init_model.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
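`read_non_lang_symbols` above accepts only symbols shaped like `{xxx}`, `[xxx]`, or `<xxx>`. A quick standalone check of that regular expression on a few made-up symbols:

```python
# The pattern is copied from the removed file_utils.py above; the test strings
# are invented examples.
import re

non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})")

for sym in ["{NOISE}", "[LAUGHTER]", "<unk>", "NOISE"]:
    ok = non_lang_syms_pattern.fullmatch(sym) is not None
    print(f"{sym!r}: {'valid' if ok else 'rejected'}")
# {NOISE}, [LAUGHTER] and <unk> match; a bare 'NOISE' would trigger the
# BadSymbolFormat error raised in the helper.
```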
- -import torch -from wenet.transducer.joint import TransducerJoint -from wenet.transducer.predictor import (ConvPredictor, EmbeddingPredictor, - RNNPredictor) -from wenet.transducer.transducer import Transducer -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.cmvn import GlobalCMVN -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.encoder import ConformerEncoder, TransformerEncoder -from wenet.squeezeformer.encoder import SqueezeformerEncoder -from wenet.efficient_conformer.encoder import EfficientConformerEncoder -from wenet.utils.cmvn import load_cmvn - - -def init_model(configs): - if configs['cmvn_file'] is not None: - mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) - global_cmvn = GlobalCMVN( - torch.from_numpy(mean).float(), - torch.from_numpy(istd).float()) - else: - global_cmvn = None - - input_dim = configs['input_dim'] - vocab_size = configs['output_dim'] - - encoder_type = configs.get('encoder', 'conformer') - decoder_type = configs.get('decoder', 'bitransformer') - - if encoder_type == 'conformer': - encoder = ConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'squeezeformer': - encoder = SqueezeformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'efficientConformer': - encoder = EfficientConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf'], - **configs['encoder_conf']['efficient_conf'] - if 'efficient_conf' in - configs['encoder_conf'] else {}) - else: - encoder = TransformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - if decoder_type == 'transformer': - decoder = TransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - else: - assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 - assert configs['decoder_conf']['r_num_blocks'] > 0 - decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - ctc = CTC(vocab_size, encoder.output_size()) - - # Init joint CTC/Attention or Transducer model - if 'predictor' in configs: - predictor_type = configs.get('predictor', 'rnn') - if predictor_type == 'rnn': - predictor = RNNPredictor(vocab_size, **configs['predictor_conf']) - elif predictor_type == 'embedding': - predictor = EmbeddingPredictor(vocab_size, - **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - elif predictor_type == 'conv': - predictor = ConvPredictor(vocab_size, **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - else: - raise NotImplementedError( - "only rnn, embedding and conv type support now") - configs['joint_conf']['enc_output_size'] = configs['encoder_conf'][ - 'output_size'] - configs['joint_conf']['pred_output_size'] = configs['predictor_conf'][ - 'output_size'] - joint = TransducerJoint(vocab_size, **configs['joint_conf']) - model = Transducer(vocab_size=vocab_size, - blank=0, - predictor=predictor, - encoder=encoder, - attention_decoder=decoder, - joint=joint, - ctc=ctc, - **configs['model_conf']) - else: - model = ASRModel(vocab_size=vocab_size, - encoder=encoder, - decoder=decoder, - ctc=ctc, - **configs['model_conf']) - return model diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/mask.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/mask.py deleted file mode 100644 index 2985006ab2bc2d27a9b8adaeb863cc44ca6a0d24..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/mask.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -''' -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. - - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - ret = torch.ones(size, size, device=device, dtype=torch.bool) - return torch.tril(ret) -''' - -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. 
- - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - arange = torch.arange(size, device=device) - mask = arange.expand(size, size) - arange = arange.unsqueeze(-1) - mask = mask <= arange - return mask - - -def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size) with chunk size, - this is for streaming encoder - - Args: - size (int): size of mask - chunk_size (int): size of chunk - num_left_chunks (int): number of left chunks - <0: use full chunk - >=0: use num_left_chunks - device (torch.device): "cpu" or "cuda" or torch.Tensor.device - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_chunk_mask(4, 2) - [[1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]] - """ - ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) - ret[i, start:ending] = True - return ret - - -def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, - use_dynamic_chunk: bool, - use_dynamic_left_chunk: bool, - decoding_chunk_size: int, static_chunk_size: int, - num_decoding_left_chunks: int): - """ Apply optional mask for encoder. - - Args: - xs (torch.Tensor): padded input, (B, L, D), L for max length - mask (torch.Tensor): mask for xs, (B, 1, L) - use_dynamic_chunk (bool): whether to use dynamic chunk or not - use_dynamic_left_chunk (bool): whether to use dynamic left chunk for - training. - decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - static_chunk_size (int): chunk size for static chunk training/decoding - if it's greater than 0, if use_dynamic_chunk is true, - this parameter will be ignored - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. - >=0: use num_decoding_left_chunks - <0: use all left chunks - - Returns: - torch.Tensor: chunk mask of the input xs. - """ - # Whether to use chunk mask or not - if use_dynamic_chunk: - max_len = xs.size(1) - if decoding_chunk_size < 0: - chunk_size = max_len - num_left_chunks = -1 - elif decoding_chunk_size > 0: - chunk_size = decoding_chunk_size - num_left_chunks = num_decoding_left_chunks - else: - # chunk size is either [1, 25] or full context(max_len). - # Since we use 4 times subsampling and allow up to 1s(100 frames) - # delay, the maximum frame is 100 / 4 = 25. 
- chunk_size = torch.randint(1, max_len, (1, )).item() - num_left_chunks = -1 - if chunk_size > max_len // 2: - chunk_size = max_len - else: - chunk_size = chunk_size % 25 + 1 - if use_dynamic_left_chunk: - max_left_chunks = (max_len - 1) // chunk_size - num_left_chunks = torch.randint(0, max_left_chunks, - (1, )).item() - chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - elif static_chunk_size > 0: - num_left_chunks = num_decoding_left_chunks - chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - else: - chunk_masks = masks - return chunk_masks - - -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: Mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - batch_size = lengths.size(0) - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = torch.arange(0, - max_len, - dtype=torch.int64, - device=lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand - return mask - - -def make_non_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """Make mask tensor containing indices of non-padded part. - - The sequences in a batch may have different lengths. To enable - batch computing, padding is need to make all sequence in same - size. To avoid the padding part pass value to context dependent - block such as attention or convolution , this padding part is - masked. - - This pad_mask is used in both encoder and decoder. - - 1 for non-padded part and 0 for padded part. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] - """ - return ~make_pad_mask(lengths) - - -def mask_finished_scores(score: torch.Tensor, - flag: torch.Tensor) -> torch.Tensor: - """ - If a sequence is finished, we only allow one alive branch. This function - aims to give one branch a zero score and the rest -inf score. - - Args: - score (torch.Tensor): A real value array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size, beam_size). 
- """ - beam_size = score.size(-1) - zero_mask = torch.zeros_like(flag, dtype=torch.bool) - if beam_size > 1: - unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])), - dim=1) - finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])), - dim=1) - else: - unfinished = zero_mask - finished = flag - score.masked_fill_(unfinished, -float('inf')) - score.masked_fill_(finished, 0) - return score - - -def mask_finished_preds(pred: torch.Tensor, flag: torch.Tensor, - eos: int) -> torch.Tensor: - """ - If a sequence is finished, all of its branch should be - - Args: - pred (torch.Tensor): A int array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size). - """ - beam_size = pred.size(-1) - finished = flag.repeat([1, beam_size]) - return pred.masked_fill_(finished, eos) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/scheduler.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/scheduler.py deleted file mode 100644 index c418a731dec0041a238787bbba23102dba8db5e5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/librispeech/s0/wenet/utils/scheduler.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -# NeMo(https://github.com/NVIDIA/NeMo) - -from typing import Union - -import math -import warnings -import torch -from torch.optim.lr_scheduler import _LRScheduler - -from typeguard import check_argument_types - - -class WarmupLR(_LRScheduler): - """The WarmupLR scheduler - - This scheduler is almost same as NoamLR Scheduler except for following - difference: - - NoamLR: - lr = optimizer.lr * model_size ** -0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - WarmupLR: - lr = optimizer.lr * warmup_step ** 0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - - Note that the maximum lr equals to optimizer.lr in this scheduler. 
- - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_steps: Union[int, float] = 25000, - last_epoch: int = -1, - ): - assert check_argument_types() - self.warmup_steps = warmup_steps - - # __init__() must be invoked before setting field - # because step() is also invoked in __init__() - super().__init__(optimizer, last_epoch) - - def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" - - def get_lr(self): - step_num = self.last_epoch + 1 - if self.warmup_steps == 0: - return [ - lr * step_num ** -0.5 - for lr in self.base_lrs - ] - else: - return [ - lr - * self.warmup_steps ** 0.5 - * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) - for lr in self.base_lrs - ] - - def set_step(self, step: int): - self.last_epoch = step - - -class WarmupPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1): - assert not (warmup_steps is not None and warmup_ratio is not None),\ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class SquareRootConstantPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use particular number of step or ratio" - assert constant_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. 
- self.max_steps = max_steps - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.constant_lr = 1 / (constant_steps ** 0.5) - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.constant_steps: - return [self.constant_lr for _ in self.base_lrs] - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class WarmupHoldPolicy(WarmupPolicy): - """Variant of WarmupPolicy which maintains high - learning rate for a defined number of steps. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - hold_steps=None, - hold_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (hold_steps is not None and hold_ratio is not None), \ - "Either use particular number of step or ratio" - assert hold_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - self.min_lr = min_lr - self._last_warmup_lr = 0.0 - - # Necessary to duplicate as class attributes are hidden in inner class - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if hold_steps is not None: - self.hold_steps = hold_steps + self.warmup_steps - elif hold_ratio is not None: - self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps - else: - self.hold_steps = 0 - - super().__init__( - optimizer, - warmup_steps=warmup_steps, - warmup_ratio=warmup_ratio, - max_steps=max_steps, - last_epoch=last_epoch, - min_lr=min_lr, - ) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed by the scheduler," - " " "please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup phase - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - # Hold phase - if (step >= self.warmup_steps) and (step < self.hold_steps): - return self.base_lrs - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - -class WarmupAnnealHoldPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - min_lr: Minimum lr to hold the learning rate after decay at. - constant_steps: Number of steps to keep lr constant at. 
- constant_ratio: Ratio of steps to keep lr constant. - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - constant_steps=None, - constant_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use constant_steps or constant_ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup steps - if self.warmup_steps > 0 and step <= self.warmup_steps: - return self._get_warmup_lr(step) - - # Constant steps after warmup and decay - if self.constant_steps > 0 and ( - self.warmup_steps + self.decay_steps) < step <= self.max_steps: - return self._get_constant_lr(step) - - # Min lr after max steps of updates - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_constant_lr(self, step): - return [self.min_lr for _ in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -def _squareroot_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 0.5 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _square_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 2 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _cosine_annealing(initial_lr, step, max_steps, min_lr): - mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) - out_lr = (initial_lr - min_lr) * mult + min_lr - return out_lr - - -def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, - decay_steps, min_lr): - assert max_lr > min_lr - # Use linear warmup for the initial part. - if warmup_steps > 0 and step <= warmup_steps: - return max_lr * float(step) / float(warmup_steps) - - # For any steps larger than `decay_steps`, use `min_lr`. - if step > warmup_steps + decay_steps: - return min_lr - - # If we are done with the warmup period, use the decay style. 
- num_steps_ = step - warmup_steps - decay_steps_ = decay_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = max_lr - min_lr - - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - - return min_lr + coeff * delta_lr - - -def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): - if cycle: - multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) - decay_steps *= multiplier - else: - step = min(step, decay_steps) - p = step / decay_steps - lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) - lr += min_lr - return lr - - -def _noam_hold_annealing(initial_lr, step, warmup_steps, - hold_steps, decay_rate, min_lr): - # hold_steps = total number of steps - # to hold the LR, not the warmup + hold steps. - T_warmup_decay = max(1, warmup_steps ** decay_rate) - T_hold_decay = max(1, (step - hold_steps) ** decay_rate) - lr = (initial_lr * T_warmup_decay) / T_hold_decay - lr = max(lr, min_lr) - return lr - - -class SquareAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _square_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class SquareRootAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _squareroot_annealing(initial_lr=initial_lr, step=step, - max_steps=self.max_steps, min_lr=self.min_lr) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - if self.constant_steps is None or self.constant_steps == 0: - new_lrs = [ - _cosine_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - else: - new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) - return new_lrs - - def _get_warmup_lr(self, step): - if self.constant_steps is None or self.constant_steps == 0: - return super()._get_warmup_lr(step) - else: - # Use linear warmup for the initial part. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_constant_lr(self, step): - # Only called when `constant_steps` > 0. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_linear_warmup_with_cosine_annealing_lr(self, step): - # Cosine Schedule for Megatron LM, - # slightly different warmup schedule + constant LR at the end. 
- new_lrs = [ - _linear_warmup_with_cosine_annealing( - max_lr=self.base_lrs[0], - warmup_steps=self.warmup_steps, - step=step, - decay_steps=self.decay_steps, - min_lr=self.min_lr, - ) - for _ in self.base_lrs - ] - return new_lrs - - -class NoamAnnealing(_LRScheduler): - def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - self._normalize = d_model ** (-0.5) - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = max(1, self.last_epoch) - - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - new_lrs = [self._noam_annealing(initial_lr=initial_lr, step=step) for - initial_lr in self.base_lrs] - return new_lrs - - def _noam_annealing(self, initial_lr, step): - if self.warmup_steps > 0: - mult = self._normalize * min(step ** (-0.5), - step * (self.warmup_steps ** (-1.5))) - else: - mult = self._normalize * step ** (-0.5) - - out_lr = initial_lr * mult - if step > self.warmup_steps: - out_lr = max(out_lr, self.min_lr) - return out_lr - - -class NoamHoldAnnealing(WarmupHoldPolicy): - def __init__(self, optimizer, *, max_steps, decay_rate=0.5, min_lr=0.0, - last_epoch=-1, **kwargs): - """ - From Nemo: - Implementation of the Noam Hold Annealing policy - from the SqueezeFormer paper. - - Unlike NoamAnnealing, the peak learning rate - can be explicitly set for this scheduler. - The schedule first performs linear warmup, - then holds the peak LR, then decays with some schedule for - the remainder of the steps. - Therefore the min-lr is still dependent - on the hyper parameters selected. - - It's schedule is determined by three factors- - - Warmup Steps: Initial stage, where linear warmup - occurs uptil the peak LR is reached. Unlike NoamAnnealing, - the peak LR is explicitly stated here instead of a scaling factor. - - Hold Steps: Intermediate stage, where the peak LR - is maintained for some number of steps. In this region, - the high peak LR allows the model to converge faster - if training is stable. However the high LR - may also cause instability during training. - Should usually be a significant fraction of training - steps (around 30-40% of the entire training steps). - - Decay Steps: Final stage, where the LR rapidly decays - with some scaling rate (set by decay rate). - To attain Noam decay, use 0.5, - for Squeezeformer recommended decay, use 1.0. - The fast decay after prolonged high LR during - hold phase allows for rapid convergence. 
- - References: - - [Squeezeformer: - An Efficient Transformer for Automatic Speech Recognition] - (https://arxiv.org/abs/2206.00888) - - Args: - optimizer: Pytorch compatible Optimizer object. - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - decay_rate: Float value describing the polynomial decay - after the hold period. Default value - of 0.5 corresponds to Noam decay. - min_lr: Minimum learning rate. - """ - self.decay_rate = decay_rate - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - if self.warmup_steps is None or self.warmup_steps == 0: - raise ValueError( - "Noam scheduler cannot be used without warmup steps") - - if self.hold_steps > 0: - hold_steps = self.hold_steps - self.warmup_steps - else: - hold_steps = 0 - - new_lrs = [ - _noam_hold_annealing( - initial_lr, - step=step, - warmup_steps=self.warmup_steps, - hold_steps=hold_steps, - decay_rate=self.decay_rate, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - def set_step(self, step: int): - self.last_epoch = step diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/README.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/README.md deleted file mode 100644 index 344faf23debb56bddbfb097df1db06f3f819e28e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/README.md +++ /dev/null @@ -1,87 +0,0 @@ -# Performance Record - -This is a Chinese speech recognition recipe that trains on all Chinese corpora including: - -| Dataset | Duration (Hours) | -|------------|------------------| -| Aidatatang | 140 | -| Aishell | 151 | -| MagicData | 712 | -| Primewords | 99 | -| ST-CMDS | 110 | -| THCHS-30 | 26 | -| TAL-ASR | 587 | -| AISHELL2 | 1000 | - -## Unified Transformer Result - -### Data info: - -* Dataset: Aidatatang, Aishell, MagicData, Primewords, ST-CMDS, and THCHS-30. -* Feature info: using fbank feature, with cmvn, no speed perturb. -* Training info: lr 0.004, batch size 18, 3 machines, 3*8 = 24 GPUs, acc_grad 1, 220 epochs, dither 0.1 -* Decoding info: ctc_weight 0.5, average_num 30 -* Git hash: 013794572a55c7d0dbea23a66106ccf3e5d3b8d4 - -### WER - -| Dataset | chunk size | attention decoder | ctc greedy search | ctc prefix beam search | attention rescoring | -|------------|------------|-------------------|-------------------|------------------------|---------------------| -| Aidatatang | full | 4.23 | 5.82 | 5.82 | 4.71 | -| | 16 | 4.59 | 6.99 | 6.99 | 5.29 | -| Aishell | full | 4.69 | 5.80 | 5.80 | 4.64 | -| | 16 | 4.97 | 6.75 | 6.75 | 5.37 | -| MagicData | full | 2.86 | 4.01 | 4.00 | 3.07 | -| | 16 | 3.10 | 5.02 | 5.02 | 3.68 | -| THCHS-30 | full | 16.68 | 15.46 | 15.46 | 14.38 | -| | 16 | 17.47 | 16.81 | 16.82 | 15.63 | - -## Unified Conformer Result - -### Data info: - -* Dataset: Aidatatang, Aishell, MagicData, Primewords, ST-CMDS, and THCHS-30. -* Feature info: using fbank feature, with cmvn, speed perturb. 
-* Training info: lr 0.001, batch size 8, 1 machines, 1*8 = 8 GPUs, acc_grad 12, 60 epochs -* Decoding info: ctc_weight 0.5, average_num 10 -* Git hash: 5bdf436e671ef4c696d1b039f29cc33109e072fa - -### WER - -| Dataset | chunk size | attention decoder | ctc greedy search | ctc prefix beam search | attention rescoring | -|------------|------------|-------------------|-------------------|------------------------|---------------------| -| Aidatatang | full | 4.12 | 4.97 | 4.97 | 4.22 | -| | 16 | 4.45 | 5.73 | 5.73 | 4.75 | -| Aishell | full | 4.49 | 5.07 | 5.05 | 4.43 | -| | 16 | 4.77 | 5.77 | 5.77 | 4.85 | -| MagicData | full | 2.55 | 3.07 | 3.05 | 2.59 | -| | 16 | 2.81 | 3.88 | 3.86 | 3.08 | -| THCHS-30 | full | 13.55 | 13.75 | 13.76 | 12.72 | -| | 16 | 13.78 | 15.10 | 15.08 | 13.90 | - -## Unified Conformer Result - -### Data info: - -* Dataset: Aidatatang, Aishell, MagicData, Primewords, ST-CMDS, THCHS-30, TAL-ASR, and AISHELL2. -* Feature info: using fbank feature, dither=0, cmvn, speed perturb -* Training info: lr 0.001, batch size 22, 4 GPUs, acc_grad 4, 120 epochs, dither 0.1 -* Decoding info: ctc_weight 0.5, average_num 10 -* Git hash: 66f30c197d00c59fdeda3bc8ada801f867b73f78 - -### WER - -| Dataset | chunk size | attention decoder | ctc greedy search | ctc prefix beam search | attention rescoring | -|------------|------------|-------------------|-------------------|------------------------|---------------------| -| Aidatatang | full | 3.22 | 4.00 | 4.01 | 3.35 | -| | 16 | 3.50 | 4.63 | 4.63 | 3.79 | -| Aishell | full | 1.23 | 2.12 | 2.13 | 1.42 | -| | 16 | 1.33 | 2.72 | 2.72 | 1.72 | -| MagicData | full | 2.38 | 3.07 | 3.05 | 2.52 | -| | 16 | 2.66 | 3.80 | 3.78 | 2.94 | -| THCHS-30 | full | 9.93 | 11.07 | 11.06 | 10.16 | -| | 16 | 10.28 | 11.85 | 11.85 | 10.81 | -| AISHELL2 | full | 5.25 | 5.81 | 5.79 | 5.22 | -| | 16 | 5.48 | 6.48 | 6.50 | 5.61 | -| TAL-ASR | full | 9.54 | 10.35 | 10.28 | 9.66 | -| | 16 | 10.04 | 11.43 | 11.39 | 10.55 | diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/conf/train_960_unigram5000.model b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/conf/train_960_unigram5000.model deleted file mode 100644 index 8419aa7bac81d9b02f9644e9cf8929b73765a3af..0000000000000000000000000000000000000000 Binary files a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/conf/train_960_unigram5000.model and /dev/null differ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/conf/train_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/conf/train_conformer.yaml deleted file mode 100644 index b8ce511cdaad0f03be4a82708d70290ec9e37c3d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/conf/train_conformer.yaml +++ /dev/null @@ -1,77 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - -# decoder 
related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 16 - -grad_clip: 5 -accum_grad: 4 -max_epoch: 240 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.002 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/conf/train_unified_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/conf/train_unified_conformer.yaml deleted file mode 100644 index 3155d1b760b676476ba1abc60b64001b242988c4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/conf/train_unified_conformer.yaml +++ /dev/null @@ -1,81 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 1.0 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 16 - -grad_clip: 5 -accum_grad: 1 -max_epoch: 180 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/conf/train_unified_transformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/conf/train_unified_transformer.yaml deleted file mode 100644 index aa6645df9fc6df20e4946bdfe401c961c3bed31b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/conf/train_unified_transformer.yaml +++ /dev/null @@ -1,75 +0,0 @@ -# network architecture -# encoder related -encoder: transformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder architecture type - normalize_before: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 16 - - -grad_clip: 5 -accum_grad: 1 -max_epoch: 220 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.004 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/aidatatang_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/aidatatang_data_prep.sh deleted file mode 100644 index cb334a49a3472cde963329f134edc93476f83315..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/aidatatang_data_prep.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Xingyu Na -# Apache 2.0 - -. ./path.sh || exit 1; - -if [ $# != 2 ]; then - echo "Usage: $0 " - echo " $0 /export/a05/xna/data/data_aidatatang_200zh data/aidatatang" - exit 1; -fi - -aidatatang_audio_dir=$1/corpus -aidatatang_text=$1/transcript/aidatatang_200_zh_transcript.txt -data=$2 - -train_dir=$data/local/train -dev_dir=$data/local/dev -test_dir=$data/local/test -tmp_dir=$data/local/tmp - -mkdir -p $train_dir -mkdir -p $dev_dir -mkdir -p $test_dir -mkdir -p $tmp_dir - -# data directory check -if [ ! -d $aidatatang_audio_dir ] || [ ! -f $aidatatang_text ]; then - echo "Error: $0 requires two directory arguments" - exit 1; -fi - -echo "**** Creating aidatatang data folder ****" - -# find wav audio file for train, dev and test resp. 
-find $aidatatang_audio_dir -iname "*.wav" > $tmp_dir/wav.flist -n=`cat $tmp_dir/wav.flist | wc -l` -[ $n -ne 237265 ] && \ - echo Warning: expected 237265 data files, found $n - -grep -i "corpus/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1; -grep -i "corpus/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1; -grep -i "corpus/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1; - -rm -r $tmp_dir - -# Transcriptions preparation -for dir in $train_dir $dev_dir $test_dir; do - echo Preparing $dir transcriptions - sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list - sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all - paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all - tools/filter_scp.pl -f 1 $dir/utt.list $aidatatang_text | sed 's/A/A/g' > $dir/transcripts.txt - awk '{print $1}' $dir/transcripts.txt > $dir/utt.list - tools/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u | awk '{print $1" T0055"$2}' > $dir/utt2spk - tools/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp - sort -u $dir/transcripts.txt > $dir/text - tools/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt -done - -mkdir -p $data/train $data/dev $data/test - -for f in spk2utt utt2spk wav.scp text; do - cp $train_dir/$f $data/train/$f || exit 1; - cp $dev_dir/$f $data/dev/$f || exit 1; - cp $test_dir/$f $data/test/$f || exit 1; -done - -# utils/data/validate_data_dir.sh --no-feats $data/train || exit 1; -# utils/data/validate_data_dir.sh --no-feats $data/dev || exit 1; -# utils/data/validate_data_dir.sh --no-feats $data/test || exit 1; - -echo "$0: aidatatang_200zh data preparation succeeded" -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/aidatatang_download_and_untar.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/aidatatang_download_and_untar.sh deleted file mode 100644 index a2616ba0e2046e8a009e915bb02d1eb509f62228..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/aidatatang_download_and_untar.sh +++ /dev/null @@ -1,110 +0,0 @@ -#!/bin/bash - -# Copyright 2014 Johns Hopkins University (author: Daniel Povey) -# 2017 Xingyu Na -# Apache 2.0 - -remove_archive=false - -if [ "$1" == --remove-archive ]; then - remove_archive=true - shift -fi - -if [ $# -ne 3 ]; then - echo "Usage: $0 [--remove-archive] " - echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/62 aidatatang_200zh" - echo "With --remove-archive it will remove the archive after successfully un-tarring it." - echo " can be one of: aidatatang_200zh." -fi - -data=$1 -url=$2 -part=$3 - -if [ ! -d "$data" ]; then - echo "$0: no such directory $data, make it" - mkdir -p $data -fi - -part_ok=false -list="aidatatang_200zh" -for x in $list; do - if [ "$part" == $x ]; then part_ok=true; fi -done -if ! $part_ok; then - echo "$0: expected to be one of $list, but got '$part'" - exit 1; -fi - -if [ -z "$url" ]; then - echo "$0: empty URL base." - exit 1; -fi - -if [ -f $data/$part/.complete ]; then - echo "$0: data part $part was already successfully extracted, nothing to do." - exit 0; -fi - -# sizes of the archive files in bytes. -sizes="18756983399" - -if [ -f $data/$part.tgz ]; then - size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}') - size_ok=false - for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done - if ! 
$size_ok; then - echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" - echo "does not equal the size of one of the archives." - rm $data/$part.gz - else - echo "$data/$part.tgz exists and appears to be complete." - fi -fi - -if [ ! -f $data/$part.tgz ]; then - if ! which wget >/dev/null; then - echo "$0: wget is not installed." - exit 1; - fi - full_url=$url/$part.tgz - echo "$0: downloading data from $full_url. This may take some time, please be patient." - - cd $data - if ! wget --no-check-certificate $full_url; then - echo "$0: error executing wget $full_url" - exit 1; - fi -fi - -cd $data - -if ! tar -xvzf $part.tgz; then - echo "$0: error un-tarring archive $data/$part.tgz" - exit 1; -fi - -touch $data/$part/.complete - -dev_dir=$data/$part/corpus/dev -test_dir=$data/$part/corpus/test -train_dir=$data/$part/corpus/train -if [ $part == "aidatatang_200zh" ]; then - for set in $dev_dir $test_dir $train_dir;do - cd $set - for wav in ./*.tar.gz; do - echo "Extracting wav from $wav" - tar -zxf $wav && rm $wav - done - done -fi - -echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" - -if $remove_archive; then - echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied." - rm $data/$part.tgz -fi - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/aishell2_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/aishell2_data_prep.sh deleted file mode 100644 index 016b7058c3811fa93e8b10edc80cbc13a7e8e4d3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/aishell2_data_prep.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG) -# 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU) -# Apache 2.0 - -# This script is copied from aishell2/s5/local/prepare_data.sh -# but using difference word segmentation script. - -# transform raw AISHELL-2 data to kaldi format - -. ./path.sh || exit 1; - -tmp= -dir= - -if [ $# != 2 ]; then - echo "Usage: $0 " - echo " $0 /export/AISHELL-2/iOS/train data/train" - exit 1; -fi - -corpus=$1 -dir=$2 -tmp=$dir/tmp - -echo "prepare_data.sh: Preparing data in $corpus" - -mkdir -p $dir -mkdir -p $tmp - - -# corpus check -if [ ! -d $corpus ] || [ ! -f $corpus/wav.scp ] || [ ! -f $corpus/trans.txt ]; then - echo "Error: $0 requires wav.scp and trans.txt under $corpus directory." 
- exit 1; -fi - -# validate utt-key list -awk '{print "AISHELL2_"$1}' $corpus/wav.scp > $tmp/wav_utt.list -awk '{print "AISHELL2_"$1}' $corpus/trans.txt > $tmp/trans_utt.list -tools/filter_scp.pl -f 1 $tmp/wav_utt.list $tmp/trans_utt.list > $tmp/utt.list - -# wav.scp -awk -F'\t' -v path_prefix=$corpus '{printf("AISHELL2_%s %s/%s\n",$1,path_prefix,$2)}' $corpus/wav.scp > $tmp/tmp_wav.scp -tools/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_wav.scp | sort -k 1 | uniq > $tmp/wav.scp - -awk -F'\t' '{printf("AISHELL2_%s %s\n",$1,$2)}' $corpus/trans.txt > $tmp/tmp_trans.txt -tools/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_trans.txt | sort -k 1 | uniq > $tmp/trans.txt - -# text has ' sed "s/'//g" -dos2unix < $tmp/trans.txt | \ - tools/filter_scp.pl -f 1 $tmp/utt.list - | \ - sort -k 1 | uniq | tr '[a-z]' '[A-Z]' | \ - sed 's/A/A/g' | sed 's/T/T/g' | sed 's/M/M/g' | sed 's/𫚉//g' | sed 's/𫖯/頫/g' \ - > $tmp/text - -# utt2spk & spk2utt -awk -F' ' '{print $2}' $tmp/wav.scp > $tmp/wav.list -sed -e 's:\.wav::g' $tmp/wav.list | \ - awk -F'/' '{i=NF-1;printf("AISHELL2_%s AISHELL2_%s\n",$NF,$i)}' > $tmp/tmp_utt2spk -tools/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_utt2spk | sort -k 1 | uniq > $tmp/utt2spk -tools/utt2spk_to_spk2utt.pl $tmp/utt2spk | sort -k 1 | uniq > $tmp/spk2utt - -# copy prepared resources from tmp_dir to target dir -mkdir -p $dir -for f in wav.scp text spk2utt utt2spk; do - cp $tmp/$f $dir/$f || exit 1; -done - -tools/validate_data_dir.sh --no-feats $dir || exit 1; -echo "local/prepare_data.sh succeeded" -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/aishell_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/aishell_data_prep.sh deleted file mode 100644 index 6c3b6a40bf9e7d5392a0cdc2517b14b38714e332..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/aishell_data_prep.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Xingyu Na -# Apache 2.0 - -. ./path.sh || exit 1; - -if [ $# != 2 ]; then - echo "Usage: $0 " - echo " $0 /export/a05/xna/data/data_aishell data/aishell" - exit 1; -fi - -aishell_audio_dir=$1/wav -aishell_text=$1/transcript/aishell_transcript_v0.8.txt -data=data/aishell - -train_dir=$data/local/train -dev_dir=$data/local/dev -test_dir=$data/local/test -tmp_dir=$data/local/tmp - -mkdir -p $train_dir -mkdir -p $dev_dir -mkdir -p $test_dir -mkdir -p $tmp_dir - -# data directory check -if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then - echo "Error: $0 requires two directory arguments" - exit 1; -fi - -echo "**** Creating aishell data folder ****" - -# find wav audio file for train, dev and test resp. 
-find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist -n=`cat $tmp_dir/wav.flist | wc -l` -[ $n -ne 141925 ] && \ - echo Warning: expected 141925 data data files, found $n - -grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1; -grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1; -grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1; - -rm -r $tmp_dir - -# Transcriptions preparation -for dir in $train_dir $dev_dir $test_dir; do - echo Preparing $dir transcriptions - sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list - sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all - paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all - tools/filter_scp.pl -f 1 $dir/utt.list $aishell_text | \ - sed 's/a/a/g' | sed 's/b/b/g' |\ - sed 's/c/c/g' | sed 's/k/k/g' |\ - sed 's/t/t/g' > $dir/transcripts.txt - awk '{print $1}' $dir/transcripts.txt > $dir/utt.list - tools/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u | awk '{print $1" BAC009"$2}' > $dir/utt2spk - tools/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp - sort -u $dir/transcripts.txt > $dir/text - tools/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt -done - -mkdir -p $data/train $data/dev $data/test - -for f in spk2utt utt2spk wav.scp text; do - cp $train_dir/$f $data/train/$f || exit 1; - cp $dev_dir/$f $data/dev/$f || exit 1; - cp $test_dir/$f $data/test/$f || exit 1; -done - -# utils/data/validate_data_dir.sh --no-feats $data/train || exit 1; -# utils/data/validate_data_dir.sh --no-feats $data/dev || exit 1; -# utils/data/validate_data_dir.sh --no-feats $data/test || exit 1; - -echo "$0: AISHELL data preparation succeeded" -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/aishell_download_and_untar.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/aishell_download_and_untar.sh deleted file mode 100644 index e251a9aae2fefd4d52e98530936cae35e74cf0e1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/aishell_download_and_untar.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/bash - -# Copyright 2014 Johns Hopkins University (author: Daniel Povey) -# 2017 Xingyu Na -# Apache 2.0 - -remove_archive=false - -if [ "$1" == --remove-archive ]; then - remove_archive=true - shift -fi - -if [ $# -ne 3 ]; then - echo "Usage: $0 [--remove-archive] " - echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell" - echo "With --remove-archive it will remove the archive after successfully un-tarring it." - echo " can be one of: data_aishell, resource_aishell." -fi - -data=$1 -url=$2 -part=$3 - -if [ ! -d "$data" ]; then - echo "$0: no such directory $data" - mkdir -p $data -fi - -part_ok=false -list="data_aishell resource_aishell" -for x in $list; do - if [ "$part" == $x ]; then part_ok=true; fi -done -if ! $part_ok; then - echo "$0: expected to be one of $list, but got '$part'" - exit 1; -fi - -if [ -z "$url" ]; then - echo "$0: empty URL base." - exit 1; -fi - -if [ -f $data/$part/.complete ]; then - echo "$0: data part $part was already successfully extracted, nothing to do." - exit 0; -fi - -# sizes of the archive files in bytes. 
-sizes="15582913665 1246920" - -if [ -f $data/$part.tgz ]; then - size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}') - size_ok=false - for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done - if ! $size_ok; then - echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" - echo "does not equal the size of one of the archives." - rm $data/$part.tgz - else - echo "$data/$part.tgz exists and appears to be complete." - fi -fi - -if [ ! -f $data/$part.tgz ]; then - if ! which wget >/dev/null; then - echo "$0: wget is not installed." - exit 1; - fi - full_url=$url/$part.tgz - echo "$0: downloading data from $full_url. This may take some time, please be patient." - - cd $data - if ! wget --no-check-certificate $full_url; then - echo "$0: error executing wget $full_url" - exit 1; - fi -fi - -cd $data - -if ! tar -xvzf $part.tgz; then - echo "$0: error un-tarring archive $data/$part.tgz" - exit 1; -fi - -touch $data/$part/.complete - -if [ $part == "data_aishell" ]; then - cd $data/$part/wav - for wav in ./*.tar.gz; do - echo "Extracting wav from $wav" - tar -zxf $wav && rm $wav - done -fi - -echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" - -if $remove_archive; then - echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied." - rm $data/$part.tgz -fi - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/magicdata_badlist b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/magicdata_badlist deleted file mode 100644 index 67636273d53a9708c4f938d619d7622970d88540..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/magicdata_badlist +++ /dev/null @@ -1,2 +0,0 @@ -16_4013_20170819121429.wav -18_1565_20170712000170.wav diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/magicdata_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/magicdata_data_prep.sh deleted file mode 100644 index a2609c5159da0acd9de18d7af7fe01683c38d433..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/magicdata_data_prep.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Xingyu Na -# Apache 2.0 - -. ./path.sh || exit 1; - -if [ $# != 2 ]; then - echo "Usage: $0 " - echo " $0 /export/a05/xna/data/magicdata data/magicdata" - exit 1; -fi - -corpus=$1 -data=$2 - -if [ ! -d $corpus/train ] || [ ! -d $corpus/dev ] || [ ! -d $corpus/test ]; then - echo "Error: $0 requires complete corpus" - exit 1; -fi - -echo "**** Creating magicdata data folder ****" - -mkdir -p $data/{train,dev,test,tmp} - -# find wav audio file for train, dev and test resp. 
-tmp_dir=$data/tmp -find $corpus -iname "*.wav" > $tmp_dir/wav.flist -n=`cat $tmp_dir/wav.flist | wc -l` -[ $n -ne 609552 ] && \ - echo Warning: expected 609552 data data files, found $n - -for x in train dev test; do - grep -i "/$x/" $tmp_dir/wav.flist > $data/$x/wav.flist || exit 1; - echo "Filtering data using found wav list and provided transcript for $x" - awk -F '.wav' '{print $1}' local/magicdata_badlist | tools/filter_scp.pl --exclude -f 1 - \ - <(cat $data/$x/wav.flist|awk -F '/' '{print gensub(".wav", "", "g", $NF), $0}') \ - > $data/$x/wav.scp - sed '1d' $corpus/$x/TRANS.txt | awk -F '\t' '{print gensub(".wav","","g",$1), $2}' > $data/$x/utt2spk - sed '1d' $corpus/$x/TRANS.txt | awk -F '\t' '{print gensub(".wav","","g",$1), $3}' |\ - sed 's/!//g' | sed 's/?//g' |\ - sed 's/,//g' | sed 's/-//g' |\ - sed 's/://g' | sed 's/;//g' |\ - sed 's/ //g' | sed 's/。//g' |\ - sed 's/`//g' | sed 's/,//g' |\ - sed 's/://g' | sed 's/?//g' |\ - sed 's/\///g' | sed 's/·//g' |\ - sed 's/\"//g' | sed 's/“//g' |\ - sed 's/”//g' | sed 's/\\//g' |\ - sed 's/…//g' | sed "s///g" |\ - sed 's/、//g' | sed "s///g" | sed 's/《//g' | sed 's/》//g' |\ - sed 's/\[//g' | sed 's/\]//g' | sed 's/FIL//g' | sed 's/SPK//' |\ - tr '[a-z]' '[A-Z]' |\ - awk '{if (NF > 1) print $0;}' > $data/$x/text - for file in wav.scp utt2spk text; do - sort $data/$x/$file -o $data/$x/$file - done - tools/utt2spk_to_spk2utt.pl $data/$x/utt2spk > $data/$x/spk2utt -done - -# rm -r $tmp_dir - -tools/fix_data_dir.sh $data/train || exit 1; -tools/fix_data_dir.sh $data/dev || exit 1; -tools/fix_data_dir.sh $data/test || exit 1; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/magicdata_download_and_untar.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/magicdata_download_and_untar.sh deleted file mode 100644 index df8ca8d229634b67f5fb21a3dd0f8fe561026cb6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/magicdata_download_and_untar.sh +++ /dev/null @@ -1,98 +0,0 @@ -#!/bin/bash - -# Copyright 2014 Johns Hopkins University (author: Daniel Povey) -# 2019 Xingyu Na -# Apache 2.0 - -remove_archive=false - -if [ "$1" == --remove-archive ]; then - remove_archive=true - shift -fi - -if [ $# -ne 3 ]; then - echo "Usage: $0 [--remove-archive] " - echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/68 train_set" - echo "With --remove-archive it will remove the archive after successfully un-tarring it." - echo " can be one of: train_set, dev_set, test_set." -fi - -data=$1 -url=$2 -part=$3 -part1=`echo $part | sed s/_set//` - -if [ ! -d "$data" ]; then - echo "$0: no such directory $data, make it." - mkdir -p $data -fi - -part_ok=false -list="train_set dev_set test_set" -for x in $list; do - if [ "$part" == $x ]; then part_ok=true; fi -done -if ! $part_ok; then - echo "$0: expected to be one of $list, but got '$part'" - exit 1; -fi - -if [ -z "$url" ]; then - echo "$0: empty URL base." - exit 1; -fi - -if [ -f $data/$part1/.complete ]; then - echo "$0: data part $part was already successfully extracted, nothing to do." - exit 0; -fi - -# sizes of the archive files in bytes. -sizes="52627842921 1035537823 2201936013" - -if [ -f $data/$part.tar.gz ]; then - size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}') - size_ok=false - for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done - if ! 
$size_ok; then - echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size" - echo "does not equal the size of one of the archives." - rm $data/$part.tar.gz - else - echo "$data/$part.tar.gz exists and appears to be complete." - fi -fi - -if [ ! -f $data/$part.tar.gz ]; then - if ! which wget >/dev/null; then - echo "$0: wget is not installed." - exit 1; - fi - full_url=$url/$part.tar.gz - echo "$0: downloading data from $full_url. This may take some time, please be patient." - - cd $data - if ! wget --no-check-certificate $full_url; then - echo "$0: error executing wget $full_url" - exit 1; - fi -fi - -cd $data - -if ! tar -xvzf $part.tar.gz; then - echo "$0: error un-tarring archive $data/$part.tar.gz" - exit 1; -fi - -touch $data/$part1/.complete - -echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz" - -if $remove_archive; then - echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied." - rm $data/$part.tar.gz -fi - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/primewords_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/primewords_data_prep.sh deleted file mode 100644 index 96299a295c3a6f5055fb84d6f1d752868ce623ea..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/primewords_data_prep.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Xingyu Na -# Apache 2.0 - -. ./path.sh || exit 1; - -if [ $# != 2 ]; then - echo "Usage: $0 " - echo " $0 /export/a05/xna/data/primewords data/primewords" - exit 1; -fi - -corpus=$1/primewords_md_2018_set1 -data=$2 - -if [ ! -d $corpus/audio_files ] || [ ! 
-f $corpus/set1_transcript.json ]; then - echo "Error: $0 requires complete corpus" - exit 1; -fi - -echo "**** Creating primewords data folder ****" - -mkdir -p $data/train - -# find wav audio file for train - -find $corpus -iname "*.wav" > $data/wav.flist -n=`cat $data/wav.flist | wc -l` -[ $n -ne 50384 ] && \ - echo Warning: expected 50384 data files, found $n - -echo "Filtering data using found wav list and provided transcript" -local/primewords_parse_transcript.py $data/wav.flist $corpus/set1_transcript.json $data/train -cat $data/train/transcripts.txt |\ - awk '{if (NF > 1) print $0;}' > $data/train/text - -for file in wav.scp utt2spk text; do - sort $data/train/$file -o $data/train/$file -done -tools/utt2spk_to_spk2utt.pl $data/train/utt2spk > $data/train/spk2utt - -# rm -r $data/wav.flist - -tools/validate_data_dir.sh --no-feats $data/train || exit 1; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/primewords_download_and_untar.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/primewords_download_and_untar.sh deleted file mode 100644 index 7e716c7a0a6683459ae2c14bcdf7394c157aa1ba..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/primewords_download_and_untar.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/bin/bash - -# Copyright 2014 Johns Hopkins University (author: Daniel Povey) -# 2017 Xingyu Na -# Apache 2.0 - -remove_archive=false - -if [ "$1" == --remove-archive ]; then - remove_archive=true - shift -fi - -if [ $# -ne 2 ]; then - echo "Usage: $0 [--remove-archive] " - echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/38" - echo "With --remove-archive it will remove the archive after successfully un-tarring it." -fi - -data=$1 -url=$2 -part=primewords_md_2018_set1 - -if [ ! -d "$data" ]; then - echo "$0: no such directory $data, make it" - mkdir -p $data -fi - -if [ -z "$url" ]; then - echo "$0: empty URL base." - exit 1; -fi - -if [ -f $data/.complete ]; then - echo "$0: data part $part was already successfully extracted, nothing to do." - exit 0; -fi - -# sizes of the archive files in bytes. -sizes="9057625192" - -if [ -f $data/$part.tar.gz ]; then - size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}') - size_ok=false - for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done - if ! $size_ok; then - echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size" - echo "does not equal the size of one of the archives." - rm $data/$part.tar.gz - else - echo "$data/$part.tar.gz exists and appears to be complete." - fi -fi - -if [ ! -f $data/$part.tar.gz ]; then - if ! which wget >/dev/null; then - echo "$0: wget is not installed." - exit 1; - fi - full_url=$url/$part.tar.gz - echo "$0: downloading data from $full_url. This may take some time, please be patient." - - cd $data - if ! wget --no-check-certificate $full_url; then - echo "$0: error executing wget $full_url" - exit 1; - fi -fi - -cd $data - -if ! tar -xvzf $part.tar.gz; then - echo "$0: error un-tarring archive $data/$part.tar.gz" - exit 1; -fi - -touch $data/.complete - -echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" - -if $remove_archive; then - echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied." 
- rm $data/$part.tar.gz -fi - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/primewords_parse_transcript.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/primewords_parse_transcript.py deleted file mode 100644 index 772ab7f93810b3094c8f0b7bab22eac528a17817..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/primewords_parse_transcript.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -import os -import sys -import json - - -def main(argv): - fp = open(argv[1], encoding="utf-8") - js = json.load(fp) - fp.close() - metas = {} - for ele in js: - fname = ele['file'] - metas[fname] = ele - - fWavScp = open(os.path.join(argv[2], 'wav.scp'), 'w') - fText = open(os.path.join( - argv[2], 'transcripts.txt'), 'w', encoding="utf-8") - fUtt2Spk = open(os.path.join(argv[2], 'utt2spk'), 'w') - for line in open(argv[0]): - fpath = line.strip('\r\n') - wname = os.path.basename(fpath) - meta = metas[wname] - spkid = 'P' + meta['user_id'] - uttid = spkid + '-' + meta['id'] - fWavScp.write(uttid + ' ' + fpath + '\n') - fText.write(uttid + ' ' + meta['text'] + '\n') - fUtt2Spk.write(uttid + ' ' + spkid + '\n') - fWavScp.close() - fText.close() - fUtt2Spk.close() - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/stcmds_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/stcmds_data_prep.sh deleted file mode 100644 index 36f8e49af08e633c422fad4e771109cdfaa73847..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/stcmds_data_prep.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Xingyu Na -# Apache 2.0 - -. ./path.sh || exit 1; - -if [ $# != 2 ]; then - echo "Usage: $0 " - echo " $0 /export/a05/xna/data/stcmds data/stcmds" - exit 1; -fi - -corpus=$1/ST-CMDS-20170001_1-OS -data=$2 - -if [ ! -d $corpus ]; then - echo "Error: $0 requires complete corpus" - exit 1; -fi - -echo "**** Creating ST-CMDS data folder ****" - -mkdir -p $data/train - -# find wav audio file for train - -find $corpus -iname "*.wav" > $data/wav.list -n=`cat $data/wav.list | wc -l` -[ $n -ne 102600 ] && \ - echo Warning: expected 102600 data files, found $n - -cat $data/wav.list | awk -F'20170001' '{print $NF}' | awk -F'.' 
'{print $1}' > $data/utt.list -cat $data/utt.list | awk '{print substr($1,1,6)}' > $data/spk.list -while read line; do - tn=`dirname $line`/`basename $line .wav`.txt; - cat $tn; echo; -done < $data/wav.list > $data/text.list - -paste -d' ' $data/utt.list $data/wav.list > $data/train/wav.scp -paste -d' ' $data/utt.list $data/spk.list > $data/train/utt2spk -paste -d' ' $data/utt.list $data/text.list |\ - sed 's/,//g' |\ - tr '[a-z]' '[A-Z]' |\ - awk '{if (NF > 1) print $0;}' > $data/train/text - -for file in wav.scp utt2spk text; do - sort $data/train/$file -o $data/train/$file -done - -tools/utt2spk_to_spk2utt.pl $data/train/utt2spk > $data/train/spk2utt - -# rm -r $data/{wav,utt,spk,text}.list - -tools/validate_data_dir.sh --no-feats $data/train || exit 1; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/stcmds_download_and_untar.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/stcmds_download_and_untar.sh deleted file mode 100644 index ca89b5a292ac8246d9d0aabeb5884c75020a0178..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/stcmds_download_and_untar.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/bin/bash - -# Copyright 2014 Johns Hopkins University (author: Daniel Povey) -# 2017 Xingyu Na -# Apache 2.0 - -remove_archive=false - -if [ "$1" == --remove-archive ]; then - remove_archive=true - shift -fi - -if [ $# -ne 2 ]; then - echo "Usage: $0 [--remove-archive] " - echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/38" - echo "With --remove-archive it will remove the archive after successfully un-tarring it." -fi - -data=$1 -url=$2 -part=ST-CMDS-20170001_1-OS - -if [ ! -d "$data" ]; then - echo "$0: no such directory $data, make it" - mkdir -p $data -fi - -if [ -z "$url" ]; then - echo "$0: empty URL base." - exit 1; -fi - -if [ -f $data/.complete ]; then - echo "$0: data part $part was already successfully extracted, nothing to do." - exit 0; -fi - -# sizes of the archive files in bytes. -sizes="8231662593" - -if [ -f $data/$part.tar.gz ]; then - size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}') - size_ok=false - for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done - if ! $size_ok; then - echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size" - echo "does not equal the size of one of the archives." - rm $data/$part.tar.gz - else - echo "$data/$part.tar.gz exists and appears to be complete." - fi -fi - -if [ ! -f $data/$part.tar.gz ]; then - if ! which wget >/dev/null; then - echo "$0: wget is not installed." - exit 1; - fi - full_url=$url/$part.tar.gz - echo "$0: downloading data from $full_url. This may take some time, please be patient." - - cd $data - if ! wget --no-check-certificate $full_url; then - echo "$0: error executing wget $full_url" - exit 1; - fi -fi - -cd $data - -if ! tar -xvzf $part.tar.gz; then - echo "$0: error un-tarring archive $data/$part.tar.gz" - exit 1; -fi - -touch $data/.complete - -echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" - -if $remove_archive; then - echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied." 
- rm $data/$part.tar.gz -fi - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/tal_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/tal_data_prep.sh deleted file mode 100644 index 1a9c48cb1850465f461ecb63d9ebb8d103cc1682..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/tal_data_prep.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/bash - -# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan) -# Copyright 2021 Mobvoi Inc. All Rights Reserved. (Di Wu) -# Apache 2.0 - -. ./path.sh || exit 1; - -if [ $# != 2 ]; then - echo "Usage: $0 " - echo " $0 /export/a05/xna/data/aisolution_data data/tal_asr" - exit 1; -fi - -tal_audio_dir=$1/wav/ -tal_text=$1/transcript/transcript.txt -data=$2 - -train_dir=$data/local/train -dev_dir=$data/local/dev -test_dir=$data/local/test -tmp_dir=$data/local/tmp - -mkdir -p $train_dir -mkdir -p $dev_dir -mkdir -p $test_dir -mkdir -p $tmp_dir - -# data directory check -if [ ! -d $tal_audio_dir ] || [ ! -f $tal_text ]; then - echo "Error: $0 requires two directory arguments" - exit 1; -fi - -echo "**** Creating tal asr data folder ****" - -# find wav audio file for train, dev and test resp. -find $tal_audio_dir -iname "*.wav" > $tmp_dir/wav.flist -n=`cat $tmp_dir/wav.flist | wc -l` -[ $n -ne 31747 ] && \ - echo Warning: expected 31747 data files, found $n - -grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1; -grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1; -grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1; - -rm -r $tmp_dir - -# Transcriptions preparation -for dir in $train_dir $dev_dir $test_dir; do - echo Preparing $dir transcriptions - sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list - sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF, "TALASR"$(NF-1)"-"$NF}' > $dir/utt_uttid - sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print "TALASR"$(NF-1)"-"$NF, "TALASR"$(NF-1)}' > $dir/utt2spk - paste -d ' ' <(awk '{print $2}' $dir/utt_uttid) $dir/wav.flist > $dir/wav.scp - tools/filter_scp.pl -f 1 $dir/utt.list $tal_text | \ - sed 's/A/A/g' | sed 's/#//g' | sed 's/=//g' | sed 's/、//g' | \ - sed 's/,//g' | sed 's/?//g' | sed 's/。//g' | sed 's/[ ][ ]*$//g'\ - > $dir/transcripts.txt - awk '{print $1}' $dir/transcripts.txt > $dir/utt.list - paste -d " " <(sort -u -k 1 $dir/utt_uttid | awk '{print $2}') \ - <(sort -u -k 1 $dir/transcripts.txt | awk '{for(i=2;i $dir/text - tools/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt -done - -mkdir -p $data/train $data/dev $data/test - -for f in spk2utt utt2spk wav.scp text; do - cp $train_dir/$f $data/train/$f || exit 1; - cp $dev_dir/$f $data/dev/$f || exit 1; - cp $test_dir/$f $data/test/$f || exit 1; -done - -tools/fix_data_dir.sh $data/train || exit 1; -tools/fix_data_dir.sh $data/dev || exit 1; -tools/fix_data_dir.sh $data/test || exit 1; - -echo "$0: tal asr data preparation succeeded" -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/tal_mix_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/tal_mix_data_prep.sh deleted file mode 100644 index 1bc808d76a83209c68132886da3bd6a9b79a0f46..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/tal_mix_data_prep.sh +++ /dev/null @@ -1,72 
+0,0 @@ -#!/bin/bash - -# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan) -# Copyright 2021 Mobvoi Inc. All Rights Reserved. (Di Wu) -# Apache 2.0 - -. ./path.sh || exit 1; - -if [ $# != 2 ]; then - echo "Usage: $0 " - echo " $0 /export/a05/xna/data/TAL_CSASR data/tal_mix" - exit 1; -fi - -tal_mix_audio_dir=$1/cs_wav -tal_mix_text=$1/label -data=$2 - -train_dir=$data/local/train -tmp_dir=$data/local/tmp - -mkdir -p $train_dir -mkdir -p $tmp_dir - -# data directory check -if [ ! -d $tal_mix_audio_dir ] || [ ! -f $tal_mix_text ]; then - echo "Error: $0 requires two directory arguments" - exit 1; -fi - -echo "**** Creating tal mix data folder ****" - -# find wav audio file for train, dev and test resp. -find $tal_mix_audio_dir -iname "*.wav" > $tmp_dir/wav.flist -n=`cat $tmp_dir/wav.flist | wc -l` -[ $n -ne 370000 ] && \ - echo Warning: expected 370000 data files, found $n - -# rm -r $tmp_dir - -# Transcriptions preparation -echo Preparing transcriptions -sed -e 's/\.wav//' $tmp_dir/wav.flist | awk -F '/' '{print $NF}' > $train_dir/utt.list -sed -e 's/\.wav//' $tmp_dir/wav.flist | awk -F '/' '{printf("%s %s\n",$NF,$NF)}' > $train_dir/utt2spk -paste -d' ' $train_dir/utt.list $tmp_dir/wav.flist > $train_dir/wav.scp -cat $tal_mix_text | grep -Ev '^\s*$' | awk '{if(NF>1) print $0}' > $train_dir/transcript.txt -#cp $tal_mix_text $train_dir - -wc -l $train_dir/transcript.txt -echo filtering -tools/filter_scp.pl -f 1 $train_dir/utt.list $train_dir/transcript.txt | \ - sed 's/A/A/g' | sed 's/C/C/g' | sed 's/D/D/g' | sed 's/G/G/g' | \ - sed 's/H/H/g' | sed 's/U/U/g' | sed 's/Y/Y/g' | sed 's/a/a/g' | \ - sed 's/I/I/g' | sed 's/#//g' | sed 's/=//g' | sed 's/;//g' | \ - sed 's/,//g' | sed 's/?//g' | sed 's/。//g' | sed 's/\///g' | \ - sed 's/!//g' | sed 's/!//g' | sed 's/\.//g' | sed 's/\?//g' | \ - sed 's/://g' | sed 's/,//g' | sed 's/\"//g' | sed 's/://g' | \ - sed 's/@//g' | sed 's/-/ /g' | sed 's/、/ /g' | sed 's/~/ /g' | \ - sed "s/‘/\'/g" | sed 's/E/E/g' | sed "s/’/\'/g" | sed 's/《//g' | sed 's/》//g' | \ - sed "s/[ ][ ]*$//g" | sed "s/\[//g" | sed 's/、//g' > $train_dir/text -tools/utt2spk_to_spk2utt.pl $train_dir/utt2spk > $train_dir/spk2utt - -mkdir -p $data/train - -for f in spk2utt utt2spk wav.scp text; do - cp $train_dir/$f $data/train/$f || exit 1; -done - -tools/fix_data_dir.sh $data/train || exit 1; - -echo "$0: tal mix data preparation succeeded" -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/thchs-30_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/thchs-30_data_prep.sh deleted file mode 100644 index a72efcddccf84b5706502234428d50440f02b8e8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/thchs-30_data_prep.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -# Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). Apache 2.0. -# 2016 LeSpeech (Author: Xingyu Na) - -#This script pepares the data directory for thchs30 recipe. -#It reads the corpus and get wav.scp and transcriptions. 
- -corpus_dir=$1 -data=$2 - -echo "**** Creating THCHS-30 data folder ****" -mkdir -p $data/{train,dev,test} - -#create wav.scp, utt2spk.scp, spk2utt.scp, text -( -for x in train dev test; do - echo "cleaning $data/$x" - part=$data/$x - rm -rf $part/{wav.scp,utt2spk,spk2utt,text} - echo "preparing scps and text in $part" - # updated new "for loop" figured out the compatibility issue with Mac created by Xi Chen, in 03/06/2018 - for nn in `find $corpus_dir/$x -name "*.wav" | sort -u | xargs -I {} basename {} .wav`; do - spkid=`echo $nn | awk -F"_" '{print "" $1}'` - spk_char=`echo $spkid | sed 's/\([A-Z]\).*/\1/'` - spk_num=`echo $spkid | sed 's/[A-Z]\([0-9]\)/\1/'` - spkid=$(printf '%s%.2d' "$spk_char" "$spk_num") - utt_num=`echo $nn | awk -F"_" '{print $2}'` - uttid=$(printf '%s%.2d_%.3d' "$spk_char" "$spk_num" "$utt_num") - echo $uttid $corpus_dir/$x/$nn.wav >> $part/wav.scp - echo $uttid $spkid >> $part/utt2spk - echo $uttid `sed -n 1p $corpus_dir/data/$nn.wav.trn` | sed 's/ l =//' >> $part/text - done - sort $part/wav.scp -o $part/wav.scp - sort $part/utt2spk -o $part/utt2spk - sort $part/text -o $part/text - tools/utt2spk_to_spk2utt.pl $part/utt2spk > $part/spk2utt -done -) || exit 1 - -tools/validate_data_dir.sh --no-feats $data/train || exit 1; -tools/validate_data_dir.sh --no-feats $data/dev || exit 1; -tools/validate_data_dir.sh --no-feats $data/test || exit 1; - - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/thchs_download_and_untar.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/thchs_download_and_untar.sh deleted file mode 100644 index 5cf6769e0b69c4173075b1c1c631e64bba2bf296..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/local/thchs_download_and_untar.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/bin/bash - -# Copyright 2014 Johns Hopkins University (author: Daniel Povey) -# Copyright 2016 Tsinghua University (author: Dong Wang) -# Apache 2.0 - -# Adapted from librispeech recipe local/download_and_untar.sh - -remove_archive=false - -if [ "$1" == --remove-archive ]; then - remove_archive=true - shift -fi - -if [ $# -ne 3 ]; then - echo "Usage: $0 [--remove-archive] " - echo "e.g.: $0 /nfs/public/materials/data/thchs30-openslr www.openslr.org/resources/18 data_thchs30" - echo "With --remove-archive it will remove the archive after successfully un-tarring it." - echo " can be one of: data_thchs30, test-noise, resource" -fi - -data=$1 -url=$2 -part=$3 - -if [ ! -d "$data" ]; then - echo "$0: no such directory $data, make it" - mkdir -p $data -fi - -part_ok=false -list="data_thchs30 test-noise resource" -for x in $list; do - if [ "$part" == $x ]; then part_ok=true; fi -done -if ! $part_ok; then - echo "$0: expected to be one of $list, but got '$part'" - exit 1; -fi - -if [ -z "$url" ]; then - echo "$0: empty URL base." - exit 1; -fi - -if [ -f $data/$part/.complete ]; then - echo "$0: data part $part was already successfully extracted, nothing to do." - exit 0; -fi - - -sizes="6453425169 1971460210 24813708" - -if [ -f $data/$part.tgz ]; then - size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}') - size_ok=false - for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done - if ! $size_ok; then - echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" - echo "does not equal the size of one of the archives." - rm $data/$part.tgz - else - echo "$data/$part.tgz exists and appears to be complete." 
- fi -fi - -if [ ! -f $data/$part.tgz ]; then - if ! which wget >/dev/null; then - echo "$0: wget is not installed." - exit 1; - fi - full_url=$url/$part.tgz - echo "$0: downloading data from $full_url. This may take some time, please be patient." - - cd $data - pwd - echo " wget --no-check-certificate $full_url" - if ! wget --no-check-certificate $full_url; then - echo "$0: error executing wget $full_url" - exit 1; - fi -fi - -cd $data - -if ! tar -xvzf $part.tgz; then - echo "$0: error un-tarring archive $data/$part.tgz" - exit 1; -fi - -touch $data/$part/.complete - -echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" - -if $remove_archive; then - echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied." - rm $data/$part.tgz -fi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/path.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/path.sh deleted file mode 100644 index 73fc1c56602086182f66201870e28d46a0cada55..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export WENET_DIR=$PWD/../../.. -export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build -export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix -export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/run.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/run.sh deleted file mode 100644 index 9b1813ceceaf75d5c8776c05c1843b17276b09ce..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/run.sh +++ /dev/null @@ -1,357 +0,0 @@ -#!/bin/bash - -# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan) -# Copyright 2021 Mobvoi Inc. All Rights Reserved. (Di Wu) -. ./path.sh || exit 1; - -# Use this to control how many gpu you use, It's 1-gpu training if you specify -# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0,1,2,3" -stage=0 # start from 0 if you need to start from data preparation -stop_stage=6 - -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training -num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. 
Default 0 -node_rank=0 - -# data -dbase=/ssd/nfs06/di.wu/open_source -aidatatang_url=www.openslr.org/resources/62 -aishell_url=www.openslr.org/resources/33 -magicdata_url=www.openslr.org/resources/68 -primewords_url=www.openslr.org/resources/47 -stcmds_url=www.openslr.org/resources/38 -thchs_url=www.openslr.org/resources/18 - -nj=16 - -train_set=train -dev_set=dev - -has_aishell2=false # AISHELL2 train set is not publically downloadable - # With this option true, the script assumes you have it in - # $dbase -has_tal=false # TAL data need download from Baidu SkyDrive - # With this option true, the script assumes you have - # TAL/TAL_ASR and TAL/TAL_ASR_mix in $dbase -data_type=raw # raw or shard -num_utts_per_shard=1000 -shards_dir= # specify if you prefer to store to somewhere else -# Optional train_config -# 1. conf/train_transformer.yaml: Standard transformer -# 2. conf/train_conformer.yaml: Standard conformer -# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer -# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer -train_config=conf/train_conformer.yaml -# English modeling unit -# Optional 1. bpe 2. char -en_modeling_unit=bpe -dict=data/dict_$en_modeling_unit/lang_char.txt -cmvn=true -dir=exp/conformer -checkpoint= - -# use average_checkpoint will get better result -average_checkpoint=true -decode_checkpoint=$dir/final.pt -average_num=30 -decode_modes="ctc_greedy_search ctc_prefix_beam_search" -decode_modes="$decode_modes attention attention_rescoring" - -. tools/parse_options.sh || exit 1; - -test_sets="aishell aidatatang magicdata thchs" -if $has_aishell2; then - test_sets="$test_sets aishell2" -fi -if $has_tal; then - test_sets="$test_sets tal_asr" -fi - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - echo "stage -1: Data Download" - # download all training data - local/aidatatang_download_and_untar.sh $dbase/aidatatang $aidatatang_url \ - aidatatang_200zh || exit 1; - local/aishell_download_and_untar.sh $dbase/aishell $aishell_url \ - data_aishell || exit 1; - local/magicdata_download_and_untar.sh $dbase/magicdata $magicdata_url \ - train_set || exit 1; - local/primewords_download_and_untar.sh $dbase/primewords $primewords_url \ - || exit 1; - local/stcmds_download_and_untar.sh $dbase/stcmds $stcmds_url || exit 1; - local/thchs_download_and_untar.sh $dbase/thchs $thchs_url data_thchs30 || \ - exit 1; - - # download all test data - local/thchs_download_and_untar.sh $dbase/thchs $thchs_url test-noise \ - || exit 1; - local/magicdata_download_and_untar.sh $dbase/magicdata $magicdata_url \ - dev_set || exit 1; - local/magicdata_download_and_untar.sh $dbase/magicdata $magicdata_url \ - test_set || exit 1; - # tal data need download from Baidu SkyDrive - # AISHELL-2 database is free for academic research, not in the commerce, - # if without permission. - # You need to request the data from AISHELL company. 
-fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Data preparation - local/aidatatang_data_prep.sh $dbase/aidatatang/aidatatang_200zh \ - data/aidatatang || exit 1; - local/aishell_data_prep.sh $dbase/aishell/data_aishell data/aishell \ - || exit 1; - local/thchs-30_data_prep.sh $dbase/thchs/data_thchs30 data/thchs || exit 1; - local/magicdata_data_prep.sh $dbase/magicdata/ data/magicdata || exit 1; - local/primewords_data_prep.sh $dbase/primewords data/primewords || exit 1; - local/stcmds_data_prep.sh $dbase/stcmds data/stcmds || exit 1; - if $has_tal; then - local/tal_data_prep.sh $dbase/TAL/TAL_ASR data/tal_asr || exit 1; - local/tal_mix_data_prep.sh $dbase/TAL/TAL_ASR_mix data/tal_mix || exit 1; - fi - if $has_aishell2; then - local/aishell2_data_prep.sh $dbase/aishell2/IOS data/aishell2/train \ - || exit 1; - local/aishell2_data_prep.sh $dbase/aishell2/IOS/dev data/aishell2/dev \ - || exit 1; - local/aishell2_data_prep.sh $dbase/aishell2/IOS/test data/aishell2/test \ - || exit 1; - fi - # Merge all data sets. - train_sets=aidatatang,aishell,magicdata,primewords,stcmds,thchs - dev_sets=aidatatang,aishell,magicdata,thchs - if $has_aishell2; then - train_sets=$train_sets,aishell2 - dev_sets=$dev_sets,aishell2 - fi - if $has_tal; then - train_sets=$train_sets,tal_asr,tal_mix - dev_sets=$dev_sets,tal_asr - fi - unrolled_train_sets=$(eval echo data/{$train_sets}/train) - unrolled_dev_sets=$(eval echo data/{$dev_sets}/dev) - tools/combine_data.sh data/train $unrolled_train_sets || exit 1; - tools/combine_data.sh data/dev $unrolled_dev_sets || exit 1; -fi - - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # For wav feature, just copy the data. Fbank extraction is done in training - mkdir -p data_${en_modeling_unit} - for x in ${train_set} ${dev_set}; do - cp -r data/$x data_${en_modeling_unit} - done - - for x in ${test_sets}; do - cp -r data/$x/test data_${en_modeling_unit}/test_${x} - done - - # Unified data format for char and bpe modelding - # Here we use ▁ for blank among english words - # Warning : it is "▁" symbol, not "_" symbol - for x in train dev; do - cp data_${en_modeling_unit}/${x}/text data_${en_modeling_unit}/${x}/text.org - paste -d " " <(cut -f 1 -d" " data_${en_modeling_unit}/${x}/text.org) \ - <(cut -f 2- -d" " data_${en_modeling_unit}/${x}/text.org \ - | tr 'a-z' 'A-Z' | sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' \ - | sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' | tr -d " ") \ - > data_${en_modeling_unit}/${x}/text - sed -i 's/\xEF\xBB\xBF//' data_${en_modeling_unit}/${x}/text - - done - - for x in ${test_sets}; do - cp data_${en_modeling_unit}/test_${x}/text \ - data_${en_modeling_unit}/test_${x}/text.org - paste -d " " <(cut -f 1 -d" " data_${en_modeling_unit}/test_${x}/text.org) \ - <(cut -f 2- -d" " data_${en_modeling_unit}/test_${x}/text.org \ - | tr 'a-z' 'A-Z' | sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' \ - | sed 's/\([A-Z]\) \([A-Z]\)/\1▁\2/g' | tr -d " ") \ - > data_${en_modeling_unit}/test_${x}/text - sed -i 's/\xEF\xBB\xBF//' data_${en_modeling_unit}/test_${x}/text - done -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - echo "Compute cmvn" - # Here we use all the training data, you can sample some data to save time - if $cmvn; then - tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \ - --in_scp data/${train_set}/wav.scp \ - --out_cmvn data_${en_modeling_unit}/$train_set/global_cmvn - fi -fi - -# This bpe model is trained on librispeech training data set. 
-bpecode=conf/train_960_unigram5000.model -trans_type_ops= -enable_bpe= -if [ $en_modeling_unit = "bpe" ]; then - trans_type_ops="--trans_type cn_char_en_bpe" - enable_bpe=true -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # Make train dict - echo "Make a dictionary" - mkdir -p $(dirname $dict) - echo " 0" > ${dict} # 0 will be used for "blank" in CTC - echo " 1" >> ${dict} # must be 1 - - tools/text2token.py -s 1 -n 1 -m ${bpecode} \ - data_${en_modeling_unit}/${train_set}/text ${trans_type_ops} \ - | cut -f 2- -d" " | tr " " "\n" | sort | uniq | grep -a -v -e '^\s*$' \ - | grep -v '·' | grep -v '“' | grep -v "”" | grep -v "\[" | grep -v "\]" \ - | grep -v "…" | awk '{print $0 " " NR+1}' >> ${dict} - - num_token=$(cat $dict | wc -l) - echo " $num_token" >> $dict # -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - echo "Prepare data, prepare required format" - feat_test_sets="" - for x in ${test_sets}; do - feat_test_sets=${feat_test_sets}" "test_${x} - done - for x in ${dev_set} ${train_set} ${feat_test_sets}; do - if [ $data_type == "shard" ]; then - sdir=${shards_dir:+$shards_dir/}shards_${en_modeling_unit} - mkdir -p $sdir - tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \ - --num_threads 16 data_${en_modeling_unit}/$x/wav.scp \ - data_${en_modeling_unit}/$x/text $(realpath $sdir/$x) \ - data_${en_modeling_unit}/$x/data.list - else - tools/make_raw_list.py data_${en_modeling_unit}/$x/wav.scp \ - data_${en_modeling_unit}/$x/text data_${en_modeling_unit}/$x/data.list - fi - done -fi - -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # Training - mkdir -p $dir - INIT_FILE=$dir/ddp_init - # You had better rm it manually before you start run.sh on first node. - # rm -f $INIT_FILE # delete old one before starting - init_method=file://$(readlink -f $INIT_FILE) - echo "$0: init method is $init_method" - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - # Use "nccl" if it works, otherwise use "gloo" - dist_backend="nccl" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" - cmvn_opts= - $cmvn && cp data_${en_modeling_unit}/$train_set/global_cmvn $dir - $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" - # train.py will write $train_config to $dir/train.yaml with model input - # and output dimension, train.yaml will be used for inference or model - # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type $data_type \ - --symbol_table $dict \ - --train_data data_${en_modeling_unit}/$train_set/data.list \ - --cv_data data_${en_modeling_unit}/$dev_set/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ - --ddp.dist_backend $dist_backend \ - --num_workers 4 \ - ${enable_bpe:+--bpe_model $bpecode} \ - $cmvn_opts \ - --pin_memory - } & - done - wait -fi - -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # Test model, please specify the model you want to test by --checkpoint - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$dir/avg_${average_num}.pt - echo "do model average and final checkpoint is $decode_checkpoint" - python wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $dir \ - --num ${average_num} \ - --val_best - fi - # Specify decoding_chunk_size if it's a unified dynamic chunk trained model - # -1 for full chunk - decoding_chunk_size=16 - ctc_weight=0.5 - idx=0 - for mode in ${decode_modes}; do - { - for x in ${test_sets}; do - { - test_name=test_${mode}${decoding_chunk_size:+_chunk$decoding_chunk_size} - test_dir=$dir/$test_name/${x} - mkdir -p $test_dir - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1]) - python wenet/bin/recognize.py --gpu $gpu_id \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type $data_type \ - --test_data data_${en_modeling_unit}/test_${x}/data.list \ - --checkpoint $decode_checkpoint \ - --beam_size 10 \ - --batch_size 1 \ - --penalty 0.0 \ - --dict $dict \ - --ctc_weight $ctc_weight \ - ${enable_bpe:+--bpe_model $bpecode} \ - --result_file $test_dir/text_${en_modeling_unit} \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - - cat $test_dir/text_${en_modeling_unit} | sed -e "s/▁/ /g" \ - > $test_dir/text - cat data_${en_modeling_unit}/test_${x}/text | sed -e "s/▁/ /g" \ - > data_${en_modeling_unit}/test_${x}/text.tmp - python tools/compute-wer.py --char=1 --v=1 \ - data_${en_modeling_unit}/test_${x}/text.tmp $test_dir/text \ - > $test_dir/wer - rm data_${en_modeling_unit}/test_${x}/text.tmp - } - done - } & - ((idx+=1)) - done - wait - -fi - -if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then - # Export the best model you want - python wenet/bin/export_jit.py \ - --config $dir/train.yaml \ - --checkpoint $dir/avg_${average_num}.pt \ - --output_file $dir/final.zip \ - --output_quant_file $dir/final_quant.zip -fi - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/alignment.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/alignment.sh deleted file mode 100644 index 64d860bb61761cadca750c9baf91eddb49e56728..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/alignment.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. 
./path.sh || exit 1; - -stage=0 # start from 0 if you need to start from data preparation -stop_stage=0 - -nj=16 -feat_dir=raw_wav -dict=data/dict/lang_char.txt - -dir=exp/ -config=$dir/train.yaml -checkpoint= -checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt -config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml -set= -ali_format=$feat_dir/$set/format.data -ali_format=format.data -ali_result=$dir/ali - -. tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - nj=32 - # Prepare required data for ctc alignment - echo "Prepare data, prepare required format" - for x in $set; do - tools/format_data.sh --nj ${nj} \ - --feat-type wav --feat $feat_dir/$x/wav.scp \ - $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp - - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Test model, please specify the model you want to use by --checkpoint - python wenet/bin/alignment_deprecated.py --gpu -1 \ - --config $config \ - --input_file $ali_format \ - --checkpoint $checkpoint \ - --batch_size 1 \ - --dict $dict \ - --result_file $ali_result \ - -fi - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/analyze_dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/analyze_dataset.py deleted file mode 100644 index d4373b065c301972fe0164b6df3591166000acfc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/analyze_dataset.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Analyze Dataset, Duration/TextLength/Speed etc. - -Usage: -. 
./path.sh && python3 tools/analyze_dataset.py \ - --data_type "shard" \ - --data_list data/test/data.list \ - --output_dir exp/analyze_test \ - --num_thread 32 -""" - -import os -import json -import math -import time -import numpy -import logging -import librosa -import tarfile -import argparse -import torchaudio -import multiprocessing - -from wenet.utils.file_utils import read_lists -from wenet.dataset.processor import AUDIO_FORMAT_SETS - - -def get_args(): - parser = argparse.ArgumentParser(description='Analyze dataset') - parser.add_argument('--data_type', - default='wav_scp', - choices=['wav_scp', 'raw', 'shard'], - help='dataset type') - parser.add_argument('--output_dir', type=str, - default="exp", help='write info to output dir') - parser.add_argument('--data_list', default=None, - help='used in raw/shard mode') - parser.add_argument('--wav_scp', default=None, - help='used in wav_scp mode') - parser.add_argument('--text', default=None, - help='used in wav_scp mode') - parser.add_argument('--num_thread', type=int, - default=4, help='number of threads') - args = parser.parse_args() - print(args) - return args - - -def analyze(datas, output_file, thread_id): - with open(output_file, "w", encoding='utf8') as f: - for i, data in enumerate(datas): - if type(data['wav']) is numpy.ndarray: - y, sample_rate = data['wav'], data['sample_rate'] - data['wav'] = "None" # NOTE(xcsong): Do not save wav. - elif type(data['wav'] is str): - y, sample_rate = librosa.load(data['wav'], sr=16000) - data['dur'] = len(y) / sample_rate - data['txt_length'] = len(data['txt']) - data['speed'] = data['txt_length'] / data['dur'] - # Trim the beginning and ending silence - _, index = librosa.effects.trim(y, top_db=30) - data['leading_sil'] = librosa.get_duration( - y=y[:index[0]], sr=16000) * 1000 if index[0] > 0 else 0 - data['trailing_sil'] = librosa.get_duration( - y=y[index[1]:], sr=16000) * 1000 if index[1] < len(y) else 0 - data_str = json.dumps(data, ensure_ascii=False) - f.write("{}\n".format(data_str)) - if thread_id == 0 and i % 100 == 0: - logging.info("\tThread-{}: processed {}/{}".format( - thread_id, i, len(datas))) - - -def read_tar(file): - try: - with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream: - prev_prefix = None - data = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - data['key'] = prev_prefix - if valid: - yield data - data = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - data['txt'] = file_obj.read().decode( - 'utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load( - file_obj) - # single channel - data['wav'] = waveform.numpy()[0, :] - data['sample_rate'] = sample_rate - else: - data[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning( - 'error: {} when parse {}'.format(ex, name)) - prev_prefix = prefix - # The last data in tar - if prev_prefix is not None: - data['key'] = prev_prefix - yield data - except Exception as ex: - logging.warning( - 'tar_file error: {} when processing {}'.format(ex, file)) - - -def main(): - start_time = time.time() - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.makedirs(args.output_dir, exist_ok=True) - os.makedirs(args.output_dir + "/partition", exist_ok=True) - datas = [[] for i in 
range(args.num_thread)] - - logging.info("Stage-1: Loading data.list OR wav.scp...") - if args.data_type == "shard": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - total = 0 - for line in lists: - for data in read_tar(line): - datas[total % args.num_thread].append(data) - total = total + 1 - elif args.data_type == "raw": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - for i, line in enumerate(lists): - data = json.loads(line) - datas[i % args.num_thread].append(data) - elif args.data_type == "wav_scp": - assert args.wav_scp is not None - assert args.text is not None - wavs, texts = {}, {} - # wavs - for line in read_lists(args.wav_scp): - line = line.strip().split() - wavs[line[0]] = line[1] - # texts - for line in read_lists(args.text): - line = line.strip().split(maxsplit=1) - texts[line[0]] = line[1] - sorted(wavs) - sorted(texts) - # partition - for i, (key1, key2) in enumerate(zip(wavs, texts)): - assert key1 == key2 - datas[i % args.num_thread].append( - {'key': key1, "wav": wavs[key1], "txt": texts[key1]} - ) - - logging.info("Stage-2: Start Analyze") - # threads - pool = multiprocessing.Pool(processes=args.num_thread) - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - pool.apply_async(analyze, (datas[i], output_file, i)) - pool.close() - pool.join() - - logging.info("Stage-3: Sort and Write Result") - datas = [] - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - with open(output_file, "r", encoding='utf8') as f: - for line in f.readlines(): - data = json.loads(line) - datas.append(data) - total_dur = sum([x['dur'] for x in datas]) - total_len = sum([x['txt_length'] for x in datas]) - total_leading_sil = sum([x['leading_sil'] for x in datas]) - total_trailing_sil = sum([x['trailing_sil'] for x in datas]) - num_datas = len(datas) - names = ['key', 'dur', 'txt_length', 'speed', - 'leading_sil', 'trailing_sil'] - units = ['', 's', '', 'char/s', 'ms', 'ms'] - avgs = [0, total_dur / num_datas, total_len / num_datas, - total_len / total_dur, total_leading_sil / num_datas, - total_trailing_sil / num_datas] - stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]), - sum([(x['txt_length'] - avgs[2])**2 for x in datas]), - sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]), - sum([(x['leading_sil'] - avgs[4])**2 for x in datas]), - sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])] - stds = [math.sqrt(x / num_datas) for x in stds] - parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min'] - index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - - with open(args.output_dir + "/analyze_result_brief", - "w", encoding='utf8') as f: - for i, (name, unit, avg, std) in enumerate( - zip(names, units, avgs, stds)): - if name == 'key': - continue - f.write("==================\n") - - datas.sort(key=lambda x: x[name]) - for p, j in zip(parts, index): - f.write("{} {}: {:.3f} {} (wav_id: {})\n".format( - p, name, datas[j][name], unit, datas[j]['key'])) - f.write("avg {}: {:.3f} {}\n".format( - name, avg, unit)) - f.write("std {}: {:.3f}\n".format( - name, std)) - os.system("cat {}".format(args.output_dir + "/analyze_result_brief")) - - datas.sort(key=lambda x: x['dur']) - with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f: - for data in datas: - f.write("{}\n".format(json.dumps(data, 
ensure_ascii=False))) - - end_time = time.time() - logging.info("Time Cost: {:.3f}s".format(end_time - start_time)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/cmvn_kaldi2json.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/cmvn_kaldi2json.py deleted file mode 100644 index 9966046c95a9d50438c4857b785cb7985182e376..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/cmvn_kaldi2json.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import logging -import sys -import json - -def kaldi2json(kaldi_cmvn_file): - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - cmvn_info = {'mean_stat:' : means, - 'var_stat' : variance, - 'frame_num' : count} - return cmvn_info - -if __name__ == '__main__': - with open(sys.argv[2], 'w') as fout: - cmvn = kaldi2json(sys.argv[1]) - fout.write(json.dumps(cmvn)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/combine_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/combine_data.sh deleted file mode 100644 index 8a56c43f1a2a238d78270f94f3d22f1af540e912..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/combine_data.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2014 David Snyder - -# This script combines the data from multiple source directories into -# a single destination directory. - -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information -# about what these directories contain. - -# Begin configuration section. -extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." -skip_fix=false # skip the fix_data_dir.sh in the end -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi - -if [ $# -lt 2 ]; then - echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." - echo "Note, files that don't appear in all source dirs will not be combined," - echo "with the exception of utt2uniq and segments, which are created where necessary." - exit 1 -fi - -dest=$1; -shift; - -first_src=$1; - -rm -r $dest 2>/dev/null -mkdir -p $dest; - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/utt2spk ]; then - echo "$0: no such file $dir/utt2spk" - exit 1; - fi -done - -# Check that frame_shift are compatible, where present together with features. -dir_with_frame_shift= -for dir in $*; do - if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then - if [[ $dir_with_frame_shift ]] && - ! 
cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then - echo "$0:error: different frame_shift in directories $dir and " \ - "$dir_with_frame_shift. Cannot combine features." - exit 1; - fi - dir_with_frame_shift=$dir - fi -done - -# W.r.t. utt2uniq file the script has different behavior compared to other files -# it is not compulsary for it to exist in src directories, but if it exists in -# even one it should exist in all. We will create the files where necessary -has_utt2uniq=false -for in_dir in $*; do - if [ -f $in_dir/utt2uniq ]; then - has_utt2uniq=true - break - fi -done - -if $has_utt2uniq; then - # we are going to create an utt2uniq file in the destdir - for in_dir in $*; do - if [ ! -f $in_dir/utt2uniq ]; then - # we assume that utt2uniq is a one to one mapping - cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' - else - cat $in_dir/utt2uniq - fi - done | sort -k1 > $dest/utt2uniq - echo "$0: combined utt2uniq" -else - echo "$0 [info]: not combining utt2uniq as it does not exist" -fi -# some of the old scripts might provide utt2uniq as an extrafile, so just remove it -extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") - -# segments are treated similarly to utt2uniq. If it exists in some, but not all -# src directories, then we generate segments where necessary. -has_segments=false -for in_dir in $*; do - if [ -f $in_dir/segments ]; then - has_segments=true - break - fi -done - -if $has_segments; then - for in_dir in $*; do - if [ ! -f $in_dir/segments ]; then - echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 - utils/data/get_segments_for_data.sh $in_dir - else - cat $in_dir/segments - fi - done | sort -k1 > $dest/segments - echo "$0: combined segments" -else - echo "$0 [info]: not combining segments as it does not exist" -fi - -for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do - exists_somewhere=false - absent_somewhere=false - for d in $*; do - if [ -f $d/$file ]; then - exists_somewhere=true - else - absent_somewhere=true - fi - done - - if ! $absent_somewhere; then - set -o pipefail - ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; - set +o pipefail - echo "$0: combined $file" - else - if ! $exists_somewhere; then - echo "$0 [info]: not combining $file as it does not exist" - else - echo "$0 [info]: **not combining $file as it does not exist everywhere**" - fi - fi -done - -tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt - -if [[ $dir_with_frame_shift ]]; then - cp $dir_with_frame_shift/frame_shift $dest -fi - -if ! 
$skip_fix ; then - tools/fix_data_dir.sh $dest || exit 1; -fi - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/compute-cer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/compute-cer.py deleted file mode 100644 index a0a8f8fe1f59251c5d8fefeb62ef469276fc6063..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/compute-cer.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import sys -import unicodedata -import codecs - -remove_tag = True -spacelist = [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - # https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. - sep = ' ' - if char == '<': - sep = '>' - j = i + 1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c == sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: - return '' - chars = [] - i = 0 - T = len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - if x.isalnum(): - for k in x: - new_sentence.append(k) - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i - 1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = 
dist - min_error = error - dist = self.space[i][j - 1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, - 'ins': 0, 'del': 0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i={i} , j={j} , \ - error={error}'. 
- format(i=i, j=j, error=self.space[i][j]['error'])) - return result - - def overall(self) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def cluster(self, data) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [unicodedata.name(char) for char in word] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names) - 1) : - if unicode_names[i] != unicode_names[i + 1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) \ - and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] \ - [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \ - [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose = 1 - padding_symbol = ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose = 0 - try: - verbose = int(b) - except Exception: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol = ' ' - elif b == 'underline': - padding_symbol = '_' - continue - if True or sys.argv[1].startswith('-'): - # ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig = set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, - case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : 
- if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array) == 0: - continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length - len_lab) - space['rec'].append(length - len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end=' ') - else: - print('lab:', end=' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['lab'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end=' ') - else: - print('rec:', end=' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['rec'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===================================================' - '========================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster(k for k in default_clusters[cluster_id]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 
100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif (token[0] == '<' and token[len(token) - 1] == '>' and - cluster_id == ''): - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('=======================================' - '====================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/compute-wer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/compute-wer.py deleted file mode 100644 index a3eefc0dc7b67f252e685da71a5189312e74ef85..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/compute-wer.py +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import re, sys, unicodedata -import codecs - -remove_tag = True -spacelist= [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - #https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - 
j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array)==0: continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) 
- else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===========================================================================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster([ k for k in default_clusters[cluster_id] ]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - 
print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif token[0] == '<' and token[len(token)-1] == '>' and \ - cluster_id == '' : - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('===========================================================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/compute_cmvn_stats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/compute_cmvn_stats.py deleted file mode 100644 index 9c89789c47be0c855939469e86040f10398e9d89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/compute_cmvn_stats.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys -import argparse -import json -import codecs -import yaml - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.utils.data import Dataset, DataLoader - -torchaudio.set_audio_backend("sox_io") - - -class CollateFunc(object): - ''' Collate function for AudioDataset - ''' - - def __init__(self, feat_dim, resample_rate): - self.feat_dim = feat_dim - self.resample_rate = resample_rate - pass - - def __call__(self, batch): - mean_stat = torch.zeros(self.feat_dim) - var_stat = torch.zeros(self.feat_dim) - number = 0 - for item in batch: - value = item[1].strip().split(",") - assert len(value) == 3 or len(value) == 1 - wav_path = value[0] - sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate - resample_rate = sample_rate - # len(value) == 3 means segmented wav.scp, - # len(value) == 1 means original wav.scp - if len(value) == 3: - start_frame = int(float(value[1]) * sample_rate) - end_frame = int(float(value[2]) * sample_rate) - waveform, sample_rate = torchaudio.backend.sox_io_backend.load( - filepath=wav_path, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(item[1]) - - waveform = waveform * (1 << 15) - if self.resample_rate != 0 and self.resample_rate != sample_rate: - resample_rate = self.resample_rate - waveform = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - - mat = kaldi.fbank(waveform, - num_mel_bins=self.feat_dim, - dither=0.0, - energy_floor=0.0, - sample_frequency=resample_rate) - mean_stat += torch.sum(mat, axis=0) - var_stat += torch.sum(torch.square(mat), axis=0) - number += mat.shape[0] - return number, mean_stat, var_stat - - -class AudioDataset(Dataset): - def __init__(self, data_file): - self.items = [] - with codecs.open(data_file, 'r', encoding='utf-8') as f: - for line in f: - arr = line.strip().split() - self.items.append((arr[0], arr[1])) - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - return self.items[idx] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='extract CMVN stats') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for processing') - parser.add_argument('--train_config', - default='', - help='training yaml conf') - parser.add_argument('--in_scp', default=None, help='wav scp file') - parser.add_argument('--out_cmvn', - 
default='global_cmvn', - help='global cmvn file') - - doc = "Print log after every log_interval audios are processed." - parser.add_argument("--log_interval", type=int, default=1000, help=doc) - args = parser.parse_args() - - with open(args.train_config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - resample_rate = 0 - if 'resample_conf' in configs['dataset_conf']: - resample_rate = configs['dataset_conf']['resample_conf']['resample_rate'] - print('using resample and new sample rate is {}'.format(resample_rate)) - - collate_func = CollateFunc(feat_dim, resample_rate) - dataset = AudioDataset(args.in_scp) - batch_size = 20 - data_loader = DataLoader(dataset, - batch_size=batch_size, - shuffle=True, - sampler=None, - num_workers=args.num_workers, - collate_fn=collate_func) - - with torch.no_grad(): - all_number = 0 - all_mean_stat = torch.zeros(feat_dim) - all_var_stat = torch.zeros(feat_dim) - wav_number = 0 - for i, batch in enumerate(data_loader): - number, mean_stat, var_stat = batch - all_mean_stat += mean_stat - all_var_stat += var_stat - all_number += number - wav_number += batch_size - - if wav_number % args.log_interval == 0: - print(f'processed {wav_number} wavs, {all_number} frames', - file=sys.stderr, - flush=True) - - cmvn_info = { - 'mean_stat': list(all_mean_stat.tolist()), - 'var_stat': list(all_var_stat.tolist()), - 'frame_num': all_number - } - - with open(args.out_cmvn, 'w') as fout: - fout.write(json.dumps(cmvn_info)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/compute_fbank_feats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/compute_fbank_feats.py deleted file mode 100644 index 4cc7dae54de6e8b24b14148bd3930d19b4d7b28c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/compute_fbank_feats.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging - -import torchaudio -import torchaudio.compliance.kaldi as kaldi - -import wenet.dataset.kaldi_io as kaldi_io - -# The "sox" backends are deprecated and will be removed in 0.9.0 release. 
-# So here we use sox_io backend -torchaudio.set_audio_backend("sox_io") - - -def parse_opts(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--num_mel_bins', - default=80, - type=int, - help='Number of triangular mel-frequency bins') - parser.add_argument('--frame_length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument('--frame_shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument('--dither', - type=int, - default=0.0, - help='Dithering constant (0.0 means no dither)') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_scp', help='wav scp file') - parser.add_argument('out_ark', help='output ark file') - parser.add_argument('out_scp', help='output scp file') - args = parser.parse_args() - return args - - -# wav format: -def load_wav_scp(wav_scp_file): - wav_list = [] - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_list.append((arr[0], arr[1])) - return wav_list - - -# wav format: -def load_wav_scp_dict(wav_scp_file): - wav_dict = {} - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_dict[arr[0]] = arr[1] - return wav_dict - - -# Segments format: -def load_wav_segments(wav_scp_file, segments_file): - wav_dict = load_wav_scp_dict(wav_scp_file) - audio_list = [] - with open(segments_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - key = arr[0] - wav_file = wav_dict[arr[1]] - start = float(arr[2]) - end = float(arr[3]) - audio_list.append((key, wav_file, start, end)) - return audio_list - - -if __name__ == '__main__': - args = parse_opts() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - if args.segments is None: - audio_list = load_wav_scp(args.wav_scp) - else: - audio_list = load_wav_segments(args.wav_scp, args.segments) - - count = 0 - with open(args.out_ark, 'wb') as ark_fout, \ - open(args.out_scp, 'w', encoding='utf8') as scp_fout: - for item in audio_list: - if len(item) == 2: - key, wav_path = item - waveform, sample_rate = torchaudio.load_wav(wav_path) - else: - assert len(item) == 4 - key, wav_path, start, end = item - sample_rate = torchaudio.info(wav_path).sample_rate - frame_offset = int(start * sample_rate) - num_frames = int((end - start) * sample_rate) - waveform, sample_rate = torchaudio.load_wav( - wav_path, frame_offset, num_frames) - - mat = kaldi.fbank(waveform, - num_mel_bins=args.num_mel_bins, - frame_length=args.frame_length, - frame_shift=args.frame_shift, - dither=args.dither, - energy_floor=0.0, - sample_frequency=sample_rate) - mat = mat.detach().numpy() - kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout) - count += 1 - if count % 10000 == 0: - logging.info('Progress {}/{}'.format(count, len(audio_list))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/copy_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/copy_data_dir.sh deleted file mode 100644 index ee880c4c3ca398a58a4e306467c639b0a76310bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/copy_data_dir.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins University (author: 
Daniel Povey) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# feats.scp -# wav.scp -# vad.scp -# spk2utt -# utt2spk -# text -# -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance and/or speaker names. Note, the recording-ids stay the same. -# - - -# begin configuration section -spk_prefix= -utt_prefix= -spk_suffix= -utt_suffix= -validate_opts= # should rarely be needed. -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" - echo "Options" - echo " --spk-prefix= # Prefix for speaker ids, default empty" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --spk-suffix= # Suffix for speaker ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -srcdir=$1 -destdir=$2 - -if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" - exit 1; -fi - -if [ "$destdir" == "$srcdir" ]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -set -e; - -mkdir -p $destdir - -cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map -cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map - -if [ ! -f $srcdir/utt2uniq ]; then - if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then - cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq - fi -else - cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq -fi - -cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ - utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk - -utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt - -if [ -f $srcdir/feats.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp -fi - -if [ -f $srcdir/vad.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp -fi - -if [ -f $srcdir/segments ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments - cp $srcdir/wav.scp $destdir -else # no segments->wav indexed by utt. 
- if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/utt2num_frames ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in frame_shift stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -echo $validate_opts -echo $destdir -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/decode.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/decode.sh deleted file mode 100644 index 1d49b0e48631f4818fb9c464df66904170275a33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/decode.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: binbinzhang@mobvoi.com (Binbin Zhang) -export GLOG_logtostderr=1 -export GLOG_v=2 - -set -e - -nj=1 -chunk_size=-1 -ctc_weight=0.0 -reverse_weight=0.0 -rescoring_weight=1.0 -# For CTC WFST based decoding -fst_path= -dict_path= -acoustic_scale=1.0 -beam=15.0 -lattice_beam=12.0 -min_active=200 -max_active=7000 -blank_skip_thresh=1.0 -length_penalty=0.0 - -. tools/parse_options.sh || exit 1; -if [ $# != 5 ]; then - echo "Usage: $0 [options] " - exit 1; -fi - -if ! which decoder_main > /dev/null; then - echo "decoder_main is not built, please go to runtime/libtorch to build it." - exit 1; -fi - -scp=$1 -label_file=$2 -model_file=$3 -unit_file=$4 -dir=$5 - -mkdir -p $dir/split${nj} - -# Step 1. Split wav.scp -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp" -done -tools/data/split_scp.pl ${scp} ${split_scps} - -# Step 2. Parallel decoding -wfst_decode_opts= -if [ ! 
-z $fst_path ]; then - wfst_decode_opts="--fst_path $fst_path" - wfst_decode_opts="$wfst_decode_opts --beam $beam" - wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path" - wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam" - wfst_decode_opts="$wfst_decode_opts --max_active $max_active" - wfst_decode_opts="$wfst_decode_opts --min_active $min_active" - wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale" - wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh" - wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty" - echo $wfst_decode_opts > $dir/config -fi -for n in $(seq ${nj}); do -{ - decoder_main \ - --rescoring_weight $rescoring_weight \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --chunk_size $chunk_size \ - --wav_scp ${dir}/split${nj}/wav.${n}.scp \ - --model_path $model_file \ - --unit_path $unit_file \ - $wfst_decode_opts \ - --result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log -} & -done -wait - -# Step 3. Merge files -for n in $(seq ${nj}); do - cat ${dir}/split${nj}/${n}.text -done > ${dir}/text -tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf - -# Step 4. Compute WER -python3 tools/compute-wer.py --char=1 --v=1 \ - $label_file $dir/text > $dir/wer diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/feat_to_shape.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/feat_to_shape.sh deleted file mode 100644 index ab6d45c60709dd05a38f8da269d617233d0d39f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/feat_to_shape.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Begin configuration section. -nj=4 -cmd=run.pl -verbose=0 -filetype="" -preprocess_conf="" -# End configuration section. - -help_message=$(cat << EOF -Usage: $0 [options] [] -e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) - -echo "$0 $*" 1>&2 # Print the command line for logging - -. parse_options.sh || exit 1; - -if [ $# -lt 2 ] || [ $# -gt 3 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -scp=$1 -outscp=$2 -data=$(dirname ${scp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${logdir}/feats.${n}.scp" -done - -utils/split_scp.pl ${scp} ${split_scps} - -if [ -n "${preprocess_conf}" ]; then - preprocess_opt="--preprocess-conf ${preprocess_conf}" -else - preprocess_opt="" -fi -if [ -n "${filetype}" ]; then - filetype_opt="--filetype ${filetype}" -else - filetype_opt="" -fi - -${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ - feat-to-len --verbose=${verbose} \ - scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp - -feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -) - -# concatenate the .scp files together. 
-for n in $(seq ${nj}); do - sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp -done > ${outscp} - -rm -f ${logdir}/feats.*.scp 2>/dev/null diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/filter_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/filter_scp.pl deleted file mode 100644 index b76d37f41be0886470281978bfacf97f6b8ae976..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/filter_scp.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose "n-th" field is an utterance id), printing -# out only those lines whose "n-th" field is in id_list. The index of -# the "n-th" field is 1, by default, but can be changed by using -# the -f switch - -$exclude = 0; -$field = 1; -$shifted = 0; - -do { - $shifted=0; - if ($ARGV[0] eq "--exclude") { - $exclude = 1; - shift @ARGV; - $shifted=1; - } - if ($ARGV[0] eq "-f") { - $field = $ARGV[1]; - shift @ARGV; shift @ARGV; - $shifted=1 - } -} while ($shifted); - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . - "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . - "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . - "only the lines that were *not* in id_list.\n" . - "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . - "If your older scripts (written before Oct 2014) stopped working and you used the\n" . - "-f option, add 1 to the argument.\n" . - "See also: utils/filter_scp.pl .\n"; -} - - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -if ($field == 1) { # Treat this as special case, since it is common. - while(<>) { - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { - print $_; - } - } -} else { - while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { - print $_; - } - } -} - -# tests: -# the following should print "foo 1" -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) -# the following should print "bar 2". 
-# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fix_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fix_data_dir.sh deleted file mode 100644 index d1644c1cac4264c78eae7d91b03c4126baf7ec4c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fix_data_dir.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# This script makes sure that only the segments present in -# all of "feats.scp", "wav.scp" [if present], segments [if present] -# text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into -# data-dir/.backup - -cmd="$@" - -utt_extra_files= -spk_extra_files= - -. tools/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: utils/data/fix_data_dir.sh " - echo "e.g.: utils/data/fix_data_dir.sh data/train" - echo "This script helps ensure that the various files in a data directory" - echo "are correctly sorted and filtered, for example removing utterances" - echo "that have no features (if feats.scp is present)" - exit 1 -fi - -data=$1 - -if [ -f $data/images.scp ]; then - image/fix_data_dir.sh $cmd - exit $? -fi - -mkdir -p $data/.backup - -[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; - -[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; - -set -e -o pipefail -u - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted { - file=$1 - sort -k1,1 -u <$file >$file.tmp - if ! cmp -s $file $file.tmp; then - echo "$0: file $1 is not in sorted order or not unique, sorting it" - mv $file.tmp $file - else - rm $file.tmp - fi -} - -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - check_sorted $data/$x - fi -done - - -function filter_file { - filter=$1 - file_to_filter=$2 - cp $file_to_filter ${file_to_filter}.tmp - tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter - if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=$(cat ${file_to_filter}.tmp | wc -l) - length2=$(cat ${file_to_filter} | wc -l) - if [ $length1 -ne $length2 ]; then - echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." - fi - fi - rm $file_to_filter.tmp -} - -function filter_recordings { - # We call this once before the stage when we filter on utterance-id, and once - # after. - - if [ -f $data/segments ]; then - # We have a segments file -> we need to filter this and the file wav.scp, and - # reco2file_and_utt, if it exists, to make sure they have the same list of - # recording-ids. - - if [ ! -f $data/wav.scp ]; then - echo "$0: $data/segments exists but not $data/wav.scp" - exit 1; - fi - awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=$(cat $tmpdir/recordings | wc -l) - [ ! -s $tmpdir/recordings ] && \ - echo "Empty list of recordings (bad file $data/segments)?" 
&& exit 1; - tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp - mv $tmpdir/recordings.tmp $tmpdir/recordings - - - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - filter_file $tmpdir/recordings $data/segments - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - rm $data/segments.tmp - - filter_file $tmpdir/recordings $data/wav.scp - [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur - true - fi -} - -function filter_speakers { - # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... - tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do - f=$data/$s - if [ -f $f ]; then - filter_file $f $tmpdir/speakers - fi - done - - filter_file $tmpdir/speakers $data/spk2utt - tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - - for s in cmvn.scp spk2gender $spk_extra_files; do - f=$data/$s - if [ -f $f ]; then - filter_file $tmpdir/speakers $f - fi - done -} - -function filter_utts { - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - echo "$(cat $tmpdir/utts | wc -l)" - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; - - ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; - - ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ - echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - - if [ -f $data/utt2uniq ]; then - ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ - echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; - fi - - maybe_wav= - maybe_reco2dur= - [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. - [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - - maybe_utt2dur= - if [ -f $data/utt2dur ]; then - cat $data/utt2dur | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1 - maybe_utt2dur=utt2dur.ok - fi - - maybe_utt2num_frames= - if [ -f $data/utt2num_frames ]; then - cat $data/utt2num_frames | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1 - maybe_utt2num_frames=utt2num_frames.ok - fi - - for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do - if [ -f $data/$x ]; then - tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp - echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)" - mv $tmpdir/utts.tmp $tmpdir/utts - # echo "$tmpdir/utts" - fi - done - rm $data/utt2dur.ok 2>/dev/null || true - rm $data/utt2num_frames.ok 2>/dev/null || true - - [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ - rm $tmpdir/utts && exit 1; - - - if [ -f $data/utt2spk ]; then - new_nutts=$(cat $tmpdir/utts | wc -l) - old_nutts=$(cat $data/utt2spk | wc -l) - if [ $new_nutts -ne $old_nutts ]; then - echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" - else - echo "fix_data_dir.sh: kept all $old_nutts utterances." 
- fi - fi - - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then - tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x - fi - fi - done - -} - -filter_recordings -filter_speakers -filter_utts -filter_speakers -filter_recordings - -tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - -echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/flake8_hook.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/flake8_hook.py deleted file mode 100644 index bbe21bf4aa8ab460aca0eba5a24785e4d6b2c39d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -import sys - -from flake8.main import git - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/format_data.sh deleted file mode 100644 index 51f4602dfa0bac7873541c7f621ef4bb9eb29c94..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/format_data.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Mobvoi Corporation (Author: Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -echo "$0 $*" >&2 # Print the command line for logging -. ./path.sh - -nj=1 -cmd=run.pl -nlsyms="" -lang="" -feat="" -feat_type="kaldi" -oov="" -bpecode="" -allow_one_column=false -raw="" -verbose=0 -trans_type=char -filetype="" -preprocess_conf="" -category="" -out="" # If omitted, write in stdout -help_message=$(cat << EOF -Usage: $0 -e.g. $0 data/train data/lang_1char/train_units.txt -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --feat # feat.scp or feat1.scp,feat2.scp,... - --feat-type # kaldi or wav - --oov # Default: - --out # If omitted, write in stdout - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) -. tools/parse_options.sh - -if [ $# != 2 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -dir=$1 -dic=$2 -tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) -#trap 'rm -rf ${tmpdir}' EXIT - -# 1. 
Create scp files for inputs -# These are not necessary for decoding mode, and make it as an option -input= -if [ -n "${feat}" ]; then - _feat_scps=$(echo "${feat}" | tr ',' ' ' ) - read -r -a feat_scps <<< $_feat_scps - num_feats=${#feat_scps[@]} - - for (( i=1; i<=num_feats; i++ )); do - feat=${feat_scps[$((i-1))]} - mkdir -p ${tmpdir}/input_${i} - input+="input_${i} " - cat ${feat} > ${tmpdir}/input_${i}/feat.scp - - # Dump in the "legacy" style JSON format - if [ -n "${filetype}" ]; then - awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ - > ${tmpdir}/input_${i}/filetype.scp - fi - - if [ ${feat_type} == "kaldi" ]; then - tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ - --filetype "${filetype}" \ - --preprocess-conf "${preprocess_conf}" \ - --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp - elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then - if [ -f $dir/segments ]; then - # used for segmented wav.scp - awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur - fi - if [ ! -f $dir/utt2dur ]; then - tools/wav_to_duration.sh --nj ${nj} \ - ${feat} ${tmpdir}/input_${i}/shape.scp - # use the existed utt2dur as shape.scp directly - else - cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp - fi - fi - done -fi - -# 2. Create scp files for outputs -mkdir -p ${tmpdir}/output -if [ -n "${bpecode}" ]; then - if [ "${trans_type}" == "cn_char_en_bpe" ]; then - tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp - else - paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \ - | tools/spm_encode --model=${bpecode} --output_format=piece) \ - > ${tmpdir}/output/token.scp - fi -elif [ -n "${nlsyms}" ]; then - tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -elif [ -n "${raw}" ]; then - cat $dir/text > ${tmpdir}/output/token.scp -else - tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -fi -< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp -odim=$(cat ${dic} | wc -l) -< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp - -cat ${dir}/text > ${tmpdir}/output/text.scp - -# 3. Create scp files for the others -mkdir -p ${tmpdir}/other -if [ -n "${lang}" ]; then - awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp -fi - -if [ -n "${category}" ]; then - awk -v category=${category} '{print $1 " " category}' ${dir}/text \ - > ${tmpdir}/other/category.scp -fi -#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp - -# 4. 
Merge scp files into a one file -opts="" -for intype in ${input} output other; do - if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then - continue - fi - - if [ ${intype} != other ]; then - opts+="--${intype%_*}-scps " - else - opts+="--scps " - fi - - for x in "${tmpdir}/${intype}"/*.scp; do - k=$(basename ${x} .scp) - if [ ${k} = shape ]; then - opts+="shape:${x}:shape " - else - opts+="${k}:${x} " - fi - done -done - -if ${allow_one_column}; then - opts+="--allow-one-column true " -else - opts+="--allow-one-column false " -fi - -if [ -n "${out}" ]; then - opts+="-O ${out}" -fi - -tools/merge_scp2txt.py --verbose ${verbose} ${opts} - -#rm -fr ${tmpdir} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/add_lex_disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/add_lex_disambig.pl deleted file mode 100644 index dd8a25de6e1140a6d19b1e876f2e76f528532edf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/add_lex_disambig.pl +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2013-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. -# With the --pron-probs option, expects the second field -# of each lexicon line to be a pron-prob. -# With the --sil-probs option, expects three additional -# fields after the pron-prob, representing various components -# of the silence probability model. - -$pron_probs = 0; -$sil_probs = 0; -$first_allowed_disambig = 1; - -for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { - if ($ARGV[0] eq "--pron-probs") { - $pron_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--sil-probs") { - $sil_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--first-allowed-disambig") { - $first_allowed_disambig = 0 + $ARGV[1]; - if ($first_allowed_disambig < 1) { - die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; - } - shift @ARGV; - shift @ARGV; - } -} - -if (@ARGV != 2) { - die "Usage: add_lex_disambig.pl [opts] \n" . - "This script adds disambiguation symbols to a lexicon in order to\n" . - "make decoding graphs determinizable; it adds pseudo-phone\n" . - "disambiguation symbols #1, #2 and so on at the ends of phones\n" . - "to ensure that all pronunciations are different, and that none\n" . - "is a prefix of another.\n" . - "It prints to the standard output the number of the largest-numbered" . - "disambiguation symbol that was used.\n" . - "\n" . - "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . 
- " --sil-probs [should be with --pron-probs option]\n" . - " Expect 3 extra fields after the pron-probs, for aspects of\n" . - " the silence probability model\n" . - " --first-allowed-disambig The number of the first disambiguation symbol\n" . - " that this script is allowed to add. By default this is\n" . - " #1, but you can set this to a larger value using this option.\n" . - "e.g.:\n" . - " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { - $p = shift @A; - if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } - } - if ($sil_probs) { - $silp = shift @A; - if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - } - if (!(@A)) { - die "Bad lexicon line $1, no phone in phone list"; - } - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that it exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { shift @A; } # remove pron-prob. - if ($sil_probs) { - shift @A; # Remove silprob - shift @A; # Remove silprob - } - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -# max_disambig will always be the highest-numbered disambiguation symbol that -# has been used so far. -$max_disambig = $first_allowed_disambig - 1; - -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - if ($pron_probs) { - $pron_prob = shift @A; - } - if ($sil_probs) { - $sil_word_prob = shift @A; - $word_sil_correction = shift @A; - $prev_nonsil_correction = shift @A - } - $phnseq = join(" ", @A); - if (!defined $issubseq{$phnseq} - && $count{$phnseq} == 1) { - ; # Do nothing. - } else { - if ($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved_for_the_empty_string{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; - if (!defined $cur_disambig) { - $cur_disambig = $first_allowed_disambig; - } else { - $cur_disambig++; # Get a number that has not been used yet for - # this phone sequence. - } - while (defined $reserved_for_the_empty_string{$cur_disambig}) { - $cur_disambig++; - } - if ($cur_disambig > $max_disambig) { - $max_disambig = $cur_disambig; - } - $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; - $phnseq = $phnseq . " #" . 
$cur_disambig; - } - } - if ($pron_probs) { - if ($sil_probs) { - print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; - } else { - print O "$word\t$pron_prob\t$phnseq\n"; - } - } else { - print O "$word\t$phnseq\n"; - } -} - -print $max_disambig . "\n"; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/compile_lexicon_token_fst.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/compile_lexicon_token_fst.sh deleted file mode 100644 index b67814fe3f3244b14b8e494bfe46c4829c4f8bd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/compile_lexicon_token_fst.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -# Copyright 2015 Yajie Miao (Carnegie Mellon University) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the -# phoneme and character-based lexicons. -set -eo pipefail -. tools/parse_options.sh - -if [ $# -ne 3 ]; then - echo "usage: tools/fst/compile_lexicon_token_fst.sh " - echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" - echo " should contain the following files:" - echo "lexicon.txt units.txt" - echo "options: " - exit 1; -fi - -srcdir=$1 -tmpdir=$2 -dir=$3 -mkdir -p $dir $tmpdir - -[ -f path.sh ] && . ./path.sh - -export LC_ALL=C - -cp $srcdir/units.txt $dir - -# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. -# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. -perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; - -# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. -# Without these symbols, determinization will fail. -ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` -ndisambig=$[$ndisambig+1]; - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list - -# Get the full list of CTC tokens used in FST. These tokens include , the blank , -# the actual model unit, and the disambiguation symbols. -cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list -(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt - -# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, -# so here use ctc_token_fst_compact -tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; - -# Encode the words with indices. 
Will be used in lexicon and language model FST compiling.
-cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk '
-  BEGIN {
-    print "<eps> 0";
-  }
-  {
-    printf("%s %d\n", $1, NR);
-  }
-  END {
-    printf("#0 %d\n", NR+1);
-    printf("<s> %d\n", NR+2);
-    printf("</s> %d\n", NR+3);
-  }' > $dir/words.txt || exit 1;
-
-# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time.
-token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'`
-word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'`
-
-tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \
-  fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \
-  --keep_isymbols=false --keep_osymbols=false | \
-  fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \
-  fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
-
-echo "Lexicon and token FSTs compiling succeeded"
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/ctc_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/ctc_token_fst.py
deleted file mode 100644
index d81644b9cd216177a10a17772781d3293abe084f..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/ctc_token_fst.py
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-print('0 1 <eps> <eps>')
-print('1 1 <blank> <eps>')
-print('2 2 <blank> <eps>')
-print('2 0 <eps> <eps>')
-
-with open(sys.argv[1], 'r', encoding='utf8') as fin:
-    node = 3
-    for entry in fin:
-        fields = entry.strip().split(' ')
-        phone = fields[0]
-        if phone == '<eps>' or phone == '<blank>':
-            continue
-        elif '#' in phone:  # disambiguous phone
-            print('{} {} {} {}'.format(0, 0, '<eps>', phone))
-        else:
-            print('{} {} {} {}'.format(1, node, phone, phone))
-            print('{} {} {} {}'.format(node, node, phone, '<eps>'))
-            print('{} {} {} {}'.format(node, 2, '<eps>', '<eps>'))
-            node += 1
-print('0')
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/ctc_token_fst_compact.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/ctc_token_fst_compact.py
deleted file mode 100644
index d3018d8b14ce25108cb1acc637cecded5d41be13..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/ctc_token_fst_compact.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-print('0 0 <blank> <eps>')
-
-with open(sys.argv[1], 'r', encoding='utf8') as fin:
-    node = 1
-    for entry in fin:
-        fields = entry.strip().split(' ')
-        phone = fields[0]
-        if phone == '<eps>' or phone == '<blank>':
-            continue
-        elif '#' in phone:  # disambiguous phone
-            print('{} {} {} {}'.format(0, 0, '<eps>', phone))
-        else:
-            print('{} {} {} {}'.format(0, node, phone, phone))
-            print('{} {} {} {}'.format(node, node, phone, '<eps>'))
-            print('{} {} {} {}'.format(node, 0, '<eps>', '<eps>'))
-            node += 1
-print('0')
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/ctc_token_fst_corrected.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/ctc_token_fst_corrected.py
deleted file mode 100644
index 81f7079eccb9e6447c46cdfdf6378aca7efe4a09..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/ctc_token_fst_corrected.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-
-def il(n):
-    return n
+ 1 - - -def ol(n): - return n + 1 - - -def s(n): - return n - - -if __name__ == "__main__": - with open(sys.argv[1]) as f: - lines = f.readlines() - phone_count = 0 - disambig_count = 0 - for line in lines: - sp = line.split() - phone = sp[0] - if phone == '' or phone == '': - continue - if phone.startswith('#'): - disambig_count += 1 - else: - phone_count += 1 - - # 1. add start state - print('0 0 {} 0'.format(il(0))) - - # 2. 0 -> i, i -> i, i -> 0 - for i in range(1, phone_count + 1): - print('0 {} {} {}'.format(s(i), il(i), ol(i))) - print('{} {} {} 0'.format(s(i), s(i), il(i))) - print('{} 0 {} 0'.format(s(i), il(0))) - - # 3. i -> other phone - for i in range(1, phone_count + 1): - for j in range(1, phone_count + 1): - if i != j: - print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) - - # 4. add disambiguous arcs on every final state - for i in range(0, phone_count + 1): - for j in range(phone_count + 2, phone_count + disambig_count + 2): - print('{} {} {} {}'.format(s(i), s(i), 0, j)) - - # 5. every i is final state - for i in range(0, phone_count + 1): - print(s(i)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/eps2disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/eps2disambig.pl deleted file mode 100644 index e1d84a6bf56703596a0e4552d184f7168f724bcb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/eps2disambig.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - if (/\s+#0\s+/) { - print STDERR "$0: ERROR: LM has word #0, " . - "which is reserved as disambiguation symbol\n"; - exit 1; - } - s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; - print; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/make_lexicon_fst.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/make_lexicon_fst.pl deleted file mode 100644 index f97129c05cb3ba6460be401e92001261acfaf746..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/make_lexicon_fst.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). - -$pron_probs = 0; - -if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { - $pron_probs = 1; - shift @ARGV; -} - -if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; - print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; - print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; - print STDERR " word phone1 phone2 ... phoneN;\n"; - print STDERR "if the --pron-probs option is used, each line is:\n"; - print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; - print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; - print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; - print STDERR "this is your responsibility.\n\n"; - print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; - print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; - print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; - exit(1); -} - -$lexfn = shift @ARGV; -if (@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2) { - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if ($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - -if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. - while () { - @A = split(" ", $_); - @A == 0 && die "Empty lexicon line."; - foreach $a (@A) { - if ($a eq "") { - die "Bad lexicon line $_ ( is forbidden)"; - } - } - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; # so we only print it on the first arc of the word. - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. 
- if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while () { - @A = split(" ", $_); - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. - $s = $ns; - } elsif (!defined($silphone) || $p ne $silphone) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/make_tlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/make_tlg.sh deleted file mode 100644 index 98694e5540968760f0c27eaf30a6668f4c46c50d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/make_tlg.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# - -if [ -f path.sh ]; then . path.sh; fi - -lm_dir=$1 -src_lang=$2 -tgt_lang=$3 - -arpa_lm=${lm_dir}/lm.arpa -[ ! 
-f $arpa_lm ] && echo No such file $arpa_lm && exit 1;
-
-rm -rf $tgt_lang
-cp -r $src_lang $tgt_lang
-
-# Compose the language model to FST
-cat $arpa_lm | \
-  grep -v '<s> <s>' | \
-  grep -v '</s> <s>' | \
-  grep -v '</s> </s>' | \
-  grep -v -i '<unk>' | \
-  grep -v -i '<spoken_noise>' | \
-  arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \
-  tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \
-    --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
-  fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst
-
-
-echo "Checking how stochastic G is (the first of these numbers should be small):"
-fstisstochastic $tgt_lang/G.fst
-
-# Compose the token, lexicon and language-model FST into the final decoding graph
-fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \
-  fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1;
-fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1;
-
-echo "Composing decoding graph TLG.fst succeeded"
-#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/prepare_dict.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/prepare_dict.py
deleted file mode 100644
index 8a6a3cfe7cfded0c863637deef0bae2f2ede5557..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/prepare_dict.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env python3
-# encoding: utf-8
-
-import sys
-
-# sys.argv[1]: e2e model unit file(lang_char.txt)
-# sys.argv[2]: raw lexicon file
-# sys.argv[3]: output lexicon file
-# sys.argv[4]: bpemodel
-
-unit_table = set()
-with open(sys.argv[1], 'r', encoding='utf8') as fin:
-    for line in fin:
-        unit = line.split()[0]
-        unit_table.add(unit)
-
-
-def contain_oov(units):
-    for unit in units:
-        if unit not in unit_table:
-            return True
-    return False
-
-
-bpemode = len(sys.argv) > 4
-if bpemode:
-    import sentencepiece as spm
-    sp = spm.SentencePieceProcessor()
-    sp.Load(sys.argv[4])
-lexicon_table = set()
-with open(sys.argv[2], 'r', encoding='utf8') as fin, \
-        open(sys.argv[3], 'w', encoding='utf8') as fout:
-    for line in fin:
-        word = line.split()[0]
-        if word == 'SIL' and not bpemode:  # `sil` might be a valid piece in bpemodel
-            continue
-        elif word == '<SPOKEN_NOISE>':
-            continue
-        else:
-            # each word only has one pronunciation for e2e system
-            if word in lexicon_table:
-                continue
-            if bpemode:
-                # We assume that the lexicon does not contain code-switch,
-                # i.e. the word contains both English and Chinese.
-                # see PR https://github.com/wenet-e2e/wenet/pull/1693
-                # and Issue https://github.com/wenet-e2e/wenet/issues/1653
-                if word.encode('utf8').isalpha():
-                    pieces = sp.EncodeAsPieces(word)
-                else:
-                    pieces = word
-                if contain_oov(pieces):
-                    print(
-                        'Ignoring words {}, which contains oov unit'.format(
-                            ''.join(word).strip('▁'))
-                    )
-                    continue
-                chars = ' '.join(
-                    [p if p in unit_table else '<unk>' for p in pieces])
-            else:
-                # ignore words with OOV
-                if contain_oov(word):
-                    print('Ignoring words {}, which contains oov unit'.format(word))
-                    continue
-                # Optional, append ▁ in front of english word
-                # we assume the model unit of our e2e system is char now.
- if word.encode('utf8').isalpha() and '▁' in unit_table: - word = '▁' + word - chars = ' '.join(word) # word is a char list - fout.write('{} {}\n'.format(word, chars)) - lexicon_table.add(word) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/remove_oovs.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/remove_oovs.pl deleted file mode 100644 index ac914c3bd9363eded791cdeb309fd05e980c4f2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/remove_oovs.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script removes lines that contain these OOVs on either the -# third or fourth fields of the line. It is intended to remove arcs -# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). - -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; -} - -$unklist = shift @ARGV; -open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; -while(){ - @A = split(" ", $_); - @A == 1 || die "Bad line in unknown-symbol list: $_"; - $unk{$A[0]} = 1; -} - -$num_removed = 0; -while(<>){ - @A = split(" ", $_); - if(defined $unk{$A[2]} || defined $unk{$A[3]}) { - $num_removed++; - } else { - print; - } -} -print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/rnnt_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/rnnt_token_fst.py deleted file mode 100644 index cc6def1703311ab700a4a01f22c1adda32db9b0d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/rnnt_token_fst.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, 0, phone, phone)) -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/s2eps.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/s2eps.pl deleted file mode 100644 index ffeeb8eb6af3c4f319f31ebff80be388d8f59e1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/fst/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file 
except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces and with (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } - if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } - } - print join("\t", @A) . "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/git-pre-commit b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/git-pre-commit deleted file mode 100644 index b6e448ed375a0ddf502ce332685de8a99e88dc08..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/git-pre-commit +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -e - -echo "Running pre-commit flake8" -python tools/flake8_hook.py diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/install_srilm.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/install_srilm.sh deleted file mode 100644 index 4aa113c14722a73fd3d3f84430025d44173c207b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/install_srilm.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2022 Binbin Zhang(binbzha@qq.com) - -current_path=`pwd` -current_dir=`basename "$current_path"` - -if [ "tools" != "$current_dir" ]; then - echo "You should run this script in tools/ directory!!" - exit 1 -fi - -! command -v gawk > /dev/null && \ - echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; - -srilm_url="https://github.com/BitSpeech/SRILM/archive/refs/tags/1.7.3.tar.gz" - -if [ ! -f ./srilm.tar.gz ]; then - if ! wget -O ./srilm.tar.gz "$srilm_url"; then - echo 'There was a problem downloading the file.' - echo 'Check you internet connection and try again.' - exit 1 - fi -fi - -tar -zxvf srilm.tar.gz -mv SRILM-1.7.3 srilm - -# set the SRILM variable in the top-level Makefile to this directory. -cd srilm -cp Makefile tmpf - -cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \ - > Makefile || exit 1 -rm tmpf - -make || exit -cd .. - -( - [ ! -z "${SRILM}" ] && \ - echo >&2 "SRILM variable is aleady defined. Undefining..." && \ - unset SRILM - - [ -f ./env.sh ] && . ./env.sh - - [ ! 
-z "${SRILM}" ] && \ - echo >&2 "SRILM config is already in env.sh" && exit - - wd=`pwd` - wd=`readlink -f $wd || pwd` - - echo "export SRILM=$wd/srilm" - dirs="\${PATH}" - for directory in $(cd srilm && find bin -type d ) ; do - dirs="$dirs:\${SRILM}/$directory" - done - echo "export PATH=$dirs" -) >> env.sh - -echo >&2 "Installation of SRILM finished successfully" -echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/k2/make_hlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/k2/make_hlg.sh deleted file mode 100644 index 18c2268487410824ae11b199cf06f37acd717c88..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/k2/make_hlg.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) - -lexion_dir=$1 -lm_dir=$2 -tgt_dir=$3 - -# k2 and icefall updates very fast. Below commits are veryfied in this script. -# k2 3dc222f981b9fdbc8061b3782c3b385514a2d444, icefall 499ac24ecba64f687ff244c7d66baa5c222ecf0f - -# For k2 installation, please refer to https://github.com/k2-fsa/k2/ -python -c "import k2; print(k2.__file__)" -python -c "import torch; import _k2; print(_k2.__file__)" - -# Prepare necessary icefall scripts -if [ ! -d tools/k2/icefall ]; then - git clone --depth 1 https://github.com/k2-fsa/icefall.git tools/k2/icefall -fi -pip3 install -r tools/k2/icefall/requirements.txt -export PYTHONPATH=`pwd`/tools/k2/icefall:`pwd`/tools/k2/icefall/egs/aishell/ASR/local:$PYTHONPATH - -# 8.1 Prepare char based lang -mkdir -p $tgt_dir -python tools/k2/prepare_char.py $lexion_dir/units.txt $lm_dir/wordlist $tgt_dir -echo "Compile lexicon L.pt L_disambig.pt succeeded" - -# 8.2 Prepare G -mkdir -p data/lm -python -m kaldilm \ - --read-symbol-table="$tgt_dir/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $lm_dir/lm.arpa > data/lm/G_3_gram.fst.txt - -# 8.3 Compile HLG -python tools/k2/icefall/egs/aishell/ASR/local/compile_hlg.py --lang-dir $tgt_dir -echo "Compile decoding graph HLG.pt succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/k2/prepare_char.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/k2/prepare_char.py deleted file mode 100644 index 6e05042c42eb280135f6be7cdb3566b185258b90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/k2/prepare_char.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -""" - -This script generates the following files in the directory sys.argv[3]: - - - lexicon.txt - - lexicon_disambig.txt - - L.pt - - L_disambig.pt - - tokens.txt - - words.txt -""" - -import sys -from pathlib import Path -from typing import Dict, List - -import k2 -import torch -from prepare_lang import ( - Lexicon, - add_disambig_symbols, - add_self_loops, - write_lexicon, - write_mapping, -) - - -def lexicon_to_fst_no_sil( - lexicon: Lexicon, - token2id: Dict[str, int], - word2id: Dict[str, int], - need_self_loops: bool = False, -) -> k2.Fsa: - """Convert a lexicon to an FST (in k2 format). - - Args: - lexicon: - The input lexicon. See also :func:`read_lexicon` - token2id: - A dict mapping tokens to IDs. - word2id: - A dict mapping words to IDs. - need_self_loops: - If True, add self-loop to states with non-epsilon output symbols - on at least one arc out of the state. The input label for this - self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. - Returns: - Return an instance of `k2.Fsa` representing the given lexicon. - """ - loop_state = 0 # words enter and leave from here - next_state = 1 # the next un-allocated state, will be incremented as we go - - arcs = [] - - # The blank symbol is defined in local/train_bpe_model.py - assert token2id[""] == 0 - assert word2id[""] == 0 - - eps = 0 - - for word, pieces in lexicon: - assert len(pieces) > 0, f"{word} has no pronunciations" - cur_state = loop_state - - word = word2id[word] - pieces = [ - token2id[i] if i in token2id else token2id[""] for i in pieces - ] - - for i in range(len(pieces) - 1): - w = word if i == 0 else eps - arcs.append([cur_state, next_state, pieces[i], w, 0]) - - cur_state = next_state - next_state += 1 - - # now for the last piece of this word - i = len(pieces) - 1 - w = word if i == 0 else eps - arcs.append([cur_state, loop_state, pieces[i], w, 0]) - - if need_self_loops: - disambig_token = token2id["#0"] - disambig_word = word2id["#0"] - arcs = add_self_loops( - arcs, - disambig_token=disambig_token, - disambig_word=disambig_word, - ) - - final_state = next_state - arcs.append([loop_state, final_state, -1, -1, 0]) - arcs.append([final_state]) - - arcs = sorted(arcs, key=lambda arc: arc[0]) - arcs = [[str(i) for i in arc] for arc in arcs] - arcs = [" ".join(arc) for arc in arcs] - arcs = "\n".join(arcs) - - fsa = k2.Fsa.from_str(arcs, acceptor=False) - return fsa - - -def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool: - """Check if all the given tokens are in token symbol table. - - Args: - token_sym_table: - Token symbol table that contains all the valid tokens. - tokens: - A list of tokens. - Returns: - Return True if there is any token not in the token_sym_table, - otherwise False. - """ - for tok in tokens: - if tok not in token_sym_table: - return True - return False - - -def generate_lexicon( - token_sym_table: Dict[str, int], words: List[str] -) -> Lexicon: - """Generate a lexicon from a word list and token_sym_table. - - Args: - token_sym_table: - Token symbol table that mapping token to token ids. - words: - A list of strings representing words. - Returns: - Return a dict whose keys are words and values are the corresponding - tokens. 
- """ - lexicon = [] - for word in words: - chars = list(word.strip(" \t")) - if contain_oov(token_sym_table, chars): - continue - lexicon.append((word, chars)) - - # The OOV word is - lexicon.append(("", [""])) - return lexicon - - -def generate_tokens(text_file: str) -> Dict[str, int]: - """Generate tokens from the given text file. - - Args: - text_file: - A file that contains text lines to generate tokens. - Returns: - Return a dict whose keys are tokens and values are token ids ranged - from 0 to len(keys) - 1. - """ - token2id: Dict[str, int] = dict() - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - char, index = line.replace('\n', '').split() - assert char not in token2id - token2id[char] = int(index) - assert token2id[''] == 0 - return token2id - - -def generate_words(text_file: str) -> Dict[str, int]: - """Generate words from the given text file. - - Args: - text_file: - A file that contains text lines to generate words. - Returns: - Return a dict whose keys are words and values are words ids ranged - from 0 to len(keys) - 1. - """ - words = [] - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - word = line.replace('\n', '') - assert word not in words - words.append(word) - words.sort() - - # We put '' '' at begining of word2id - # '#0', '', '' at end of word2id - words = [word for word in words - if word not in ['', '', '#0', '', '']] - words.insert(0, '') - words.insert(1, '') - words.append('#0') - words.append('') - words.append('') - word2id = {j: i for i, j in enumerate(words)} - return word2id - - -def main(): - token2id = generate_tokens(sys.argv[1]) - word2id = generate_words(sys.argv[2]) - tgt_dir = Path(sys.argv[3]) - - words = [word for word in word2id.keys() - if word not in - ["", "!SIL", "", "", "#0", "", ""]] - lexicon = generate_lexicon(token2id, words) - - lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) - next_token_id = max(token2id.values()) + 1 - for i in range(max_disambig + 1): - disambig = f"#{i}" - assert disambig not in token2id - token2id[disambig] = next_token_id - next_token_id += 1 - - write_mapping(tgt_dir / "tokens.txt", token2id) - write_mapping(tgt_dir / "words.txt", word2id) - write_lexicon(tgt_dir / "lexicon.txt", lexicon) - write_lexicon(tgt_dir / "lexicon_disambig.txt", lexicon_disambig) - - L = lexicon_to_fst_no_sil( - lexicon, - token2id=token2id, - word2id=word2id, - ) - L_disambig = lexicon_to_fst_no_sil( - lexicon_disambig, - token2id=token2id, - word2id=word2id, - need_self_loops=True, - ) - torch.save(L.as_dict(), tgt_dir / "L.pt") - torch.save(L_disambig.as_dict(), tgt_dir / "L_disambig.pt") - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/latency_metrics.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/latency_metrics.py deleted file mode 100644 index df2d8eee45f8e2d7c8536f208d44fafaeac3341f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/latency_metrics.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2022 Horizon Inc. (author: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import argparse -import logging -import librosa -import torch -import torchaudio -import yaml - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.font_manager as fm -import torchaudio.compliance.kaldi as kaldi - -from wenet.utils.init_model import init_model -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.mask import make_pad_mask -from wenet.utils.common import replace_duplicates_with_blank - - -def get_args(): - parser = argparse.ArgumentParser( - description='Analyze latency and plot CTC-Spike.') - parser.add_argument('--config', required=True, - type=str, help='configration') - parser.add_argument('--gpu', - type=int, - default=0, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--ckpt', required=True, - type=str, help='model checkpoint') - parser.add_argument('--tag', required=True, - type=str, help='image subtitle') - parser.add_argument('--wavscp', required=True, - type=str, help='wav.scp') - parser.add_argument('--alignment', required=True, - type=str, help='force alignment, generated by Kaldi.') - parser.add_argument('--chunk_size', required=True, - type=int, help='chunk size') - parser.add_argument('--left_chunks', default=-1, - type=int, help='left chunks') - parser.add_argument('--font', required=True, - type=str, help='font file') - parser.add_argument('--dict', required=True, - type=str, help='dict file') - parser.add_argument('--result_dir', required=True, - type=str, help='saving pdf') - parser.add_argument('--model_type', default='ctc', - choices=['ctc', 'transducer'], - help='show latency metrics from ctc models or rnn-t models') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - torch.manual_seed(777) - - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - symbol_table = read_symbol_table(args.dict) - char_dict = {v: k for k, v in symbol_table.items()} - - # 1. Load model - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - - model = init_model(conf) - load_checkpoint(model, args.ckpt) - model = model.eval().to(device) - - subsampling = model.encoder.embed.subsampling_rate - eos = model.eos_symbol() - - with open(args.wavscp, 'r') as fin: - wavs = fin.readlines() - - # 2. 
Forward model (get streaming_timestamps) - timestamps = {} - for idx, wav in enumerate(wavs): - if idx % 100 == 0: - logging.info("processed {}.".format(idx)) - key, wav = wav.strip().split(' ', 1) - waveform, sr = torchaudio.load(wav) - resample_rate = conf['dataset_conf']['resample_conf']['resample_rate'] - waveform = torchaudio.transforms.Resample( - orig_freq=sr, new_freq=resample_rate)(waveform) - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank( - waveform, - num_mel_bins=conf['dataset_conf']['fbank_conf']['num_mel_bins'], - frame_length=conf['dataset_conf']['fbank_conf']['frame_length'], - frame_shift=conf['dataset_conf']['fbank_conf']['frame_shift'], - dither=0.0, energy_floor=0.0, - sample_frequency=resample_rate, - ) - - speech = mat.unsqueeze(0).to(device) - speech_lengths = torch.tensor([mat.size(0)]).to(device) - - # Let's assume batch_size = 1 - encoder_out, encoder_mask = model.encoder( - speech, speech_lengths, args.chunk_size, args.left_chunks) - - maxlen = encoder_out.size(1) # (B, maxlen, encoder_dim) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # CTC greedy search - if args.model_type == 'ctc': - ctc_probs = model.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - topk_prob = topk_prob.view(1, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen) - topk_prob = topk_prob.masked_fill_(mask, 0.0) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - hyps = [replace_duplicates_with_blank(hyp) for hyp in hyps] - scores = [prob.tolist() for prob in topk_prob] - timestamps[key] = [hyps[0], scores[0], wav] - - if args.model_type == 'transducer': - hyps = [] - scores = [] - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = 1 - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, - padding, cache) - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - scores.append(torch.max(joint_out_probs).item()) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or \ - per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - hyps.append(model.blank) - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - timestamps[key] = [hyps, scores, wav] - - # 3. 
Analyze latency - with open(args.alignment, 'r') as fin: - aligns = fin.readlines() - not_found, len_unequal, ignored = 0, 0, 0 - datas = [] - for align in aligns: - key, align = align.strip().split(' ', 1) - if key not in timestamps: - not_found += 1 - continue - fa, st = [], [] # force_alignment, streaming_timestamps - text_fa, text_st = "", "" - for i, token in enumerate(align.split()): - if token != '': - text_fa += token - # NOTE(xcsong): W/O subsample - fa.append(i * 10) - # ignore alignment_errors >= 70ms - frames_fa = len(align.split()) - frames_st = len(timestamps[key][0]) * subsampling - if abs(frames_st - frames_fa) >= 7: - ignored += 1 - continue - for i, token_id in enumerate(timestamps[key][0]): - if token_id != 0: - text_st += char_dict[token_id] - # NOTE(xcsong): W subsample - st.append(i * subsampling * 10) - if len(fa) != len(st): - len_unequal += 1 - continue - # datas[i] = [key, text_fa, text_st, list_of_diff, - # FirstTokenDelay, LastTokenDelay, AvgTokenDelay, - # streaming_timestamps, force_alignment] - datas.append([key, text_fa, text_st, - [a - b for a, b in zip(st, fa)], - st[0] - fa[0], st[-1] - fa[-1], - (sum(st) - sum(fa)) / len(st), - timestamps[key], align.split()]) - - logging.info("not found: {}, length unequal: {}, ignored: {}, \ - valid samples: {}".format(not_found, len_unequal, ignored, len(datas))) - - # 4. Plot and print - num_datas = len(datas) - names = ['FirstTokenDelay', 'LastTokenDelay', 'AvgTokenDelay'] - names_index = [4, 5, 6] - parts = ['max', 'P90', 'P75', 'P50', 'P25', 'min'] - parts_index = [num_datas - 1, int(num_datas * 0.90), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - for name, name_idx in zip(names, names_index): - def f(name_idx=name_idx): - return name_idx - datas.sort(key=lambda x: x[f()]) - logging.info("==========================") - for p, i in zip(parts, parts_index): - data = datas[i] - # i.e., LastTokenDelay P90: 270.000 ms (wav_id: BAC009S0902W0144) - logging.info("{} {}: {:.3f} ms (wav_id: {})".format( - name, p, data[f()], datas[i][0])) - - font = fm.FontProperties(fname=args.font) - plt.rcParams['axes.unicode_minus'] = False - # we will have 2 sub-plots (force-align + streaming timestamps) - # plus one wav-plot - fig, axes = plt.subplots(figsize=(60, 60), nrows=3, ncols=1) - for j in range(2): - if j == 0: - # subplot-0: streaming_timestamps - plt_prefix = args.tag + "_" + name + "_" + p - x = np.arange(len(data[7][0])) * subsampling - hyps, scores = data[7][0], data[7][1] - else: - # subplot-1: force_alignments - plt_prefix = "force_alignment" - x = np.arange(len(data[8])) - hyps = [symbol_table[d] for d in data[8]] - scores = [0.0] * len(data[8]) - axes[j].set_title(plt_prefix, fontsize=30) - for frame, token, prob in zip(x, hyps, scores): - if char_dict[token] != '': - axes[j].bar( - frame, np.exp(prob), - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].text( - frame, np.exp(prob), - '{} {:.3f} {}'.format( - char_dict[token], np.exp(prob), frame), - fontdict=dict(fontsize=24), - fontproperties=font, - ) - else: - axes[j].bar( - frame, 0.01, - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].tick_params(labelsize=25) - - # subplot-2: wav - # wav, hardcode sample_rate to 16000 - samples, sr = librosa.load(data[7][2], sr=16000) - time = np.arange(0, len(samples)) * (1.0 / sr) - axes[-1].plot(time, samples) - - # i.e., RESULT_DIR/LTD_P90_120ms_BAC009S0768W0342.pdf - plt.savefig(args.result_dir + "/" + name + "_" + - p + "_" + str(data[f()]) 
+ "ms" + "_" + data[0] + ".pdf") - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/make_raw_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/make_raw_list.py deleted file mode 100644 index 2f84f015542bb38da027b8ea61e8638f873cec33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/make_raw_list.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('output_file', help='output list file') - args = parser.parse_args() - - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - if args.segments is not None: - segments_table = {} - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - with open(args.text_file, 'r', encoding='utf8') as fin, \ - open(args.output_file, 'w', encoding='utf8') as fout: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if args.segments is None: - assert key in wav_table - wav = wav_table[key] - line = dict(key=key, wav=wav, txt=txt) - else: - assert key in segments_table - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - line = dict(key=key, wav=wav, txt=txt, start=start, end=end) - json_line = json.dumps(line, ensure_ascii=False) - fout.write(json_line + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/make_shard_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/make_shard_list.py deleted file mode 100644 index 1f7d82829808c9cc181bbc5e0f60cccef8795bae..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/make_shard_list.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import io -import logging -import os -import tarfile -import time -import multiprocessing - -import torch -import torchaudio -import torchaudio.backend.sox_io_backend as sox - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def write_tar_file(data_list, - no_segments, - tar_file, - resample=16000, - index=0, - total=1): - logging.info('Processing {} {}/{}'.format(tar_file, index, total)) - read_time = 0.0 - save_time = 0.0 - write_time = 0.0 - with tarfile.open(tar_file, "w") as tar: - prev_wav = None - for item in data_list: - if no_segments: - key, txt, wav = item - else: - key, txt, wav, start, end = item - - suffix = wav.split('.')[-1] - assert suffix in AUDIO_FORMAT_SETS - if no_segments: - ts = time.time() - with open(wav, 'rb') as fin: - data = fin.read() - read_time += (time.time() - ts) - else: - if wav != prev_wav: - ts = time.time() - waveforms, sample_rate = sox.load(wav, normalize=False) - read_time += (time.time() - ts) - prev_wav = wav - start = int(start * sample_rate) - end = int(end * sample_rate) - audio = waveforms[:1, start:end] - - # resample - if sample_rate != resample: - if not audio.is_floating_point(): - # normalize the audio before resample - # because resample can't process int audio - audio = audio / (1 << 15) - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - audio = (audio * (1 << 15)).short() - else: - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - - ts = time.time() - f = io.BytesIO() - sox.save(f, audio, resample, format="wav", bits_per_sample=16) - # Save to wav for segments file - suffix = "wav" - f.seek(0) - data = f.read() - save_time += (time.time() - ts) - - assert isinstance(txt, str) - ts = time.time() - txt_file = key + '.txt' - txt = txt.encode('utf8') - txt_data = io.BytesIO(txt) - txt_info = tarfile.TarInfo(txt_file) - txt_info.size = len(txt) - tar.addfile(txt_info, txt_data) - - wav_file = key + '.' 
+ suffix - wav_data = io.BytesIO(data) - wav_info = tarfile.TarInfo(wav_file) - wav_info.size = len(data) - tar.addfile(wav_info, wav_data) - write_time += (time.time() - ts) - logging.info('read {} save {} write {}'.format(read_time, save_time, - write_time)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--num_utts_per_shard', - type=int, - default=1000, - help='num utts per shard') - parser.add_argument('--num_threads', - type=int, - default=1, - help='num threads for make shards') - parser.add_argument('--prefix', - default='shards', - help='prefix of shards tar file') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('--resample', - type=int, - default=16000, - help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('shards_dir', help='output shards dir') - parser.add_argument('shards_list', help='output shards list file') - args = parser.parse_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - torch.set_num_threads(1) - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - no_segments = True - segments_table = {} - if args.segments is not None: - no_segments = False - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - data = [] - with open(args.text_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if no_segments: - assert key in wav_table - wav = wav_table[key] - data.append((key, txt, wav)) - else: - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - data.append((key, txt, wav, start, end)) - - num = args.num_utts_per_shard - chunks = [data[i:i + num] for i in range(0, len(data), num)] - os.makedirs(args.shards_dir, exist_ok=True) - - # Using thread pool to speedup - pool = multiprocessing.Pool(processes=args.num_threads) - shards_list = [] - tasks_list = [] - num_chunks = len(chunks) - for i, chunk in enumerate(chunks): - tar_file = os.path.join(args.shards_dir, - '{}_{:09d}.tar'.format(args.prefix, i)) - shards_list.append(tar_file) - pool.apply_async( - write_tar_file, - (chunk, no_segments, tar_file, args.resample, i, num_chunks)) - - pool.close() - pool.join() - - with open(args.shards_list, 'w', encoding='utf8') as fout: - for name in shards_list: - fout.write(name + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/merge_scp2txt.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/merge_scp2txt.py deleted file mode 100644 index 51f1c42f272f0fd9fec0a7d69ee860d2f1eb6158..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/merge_scp2txt.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -from distutils.util import strtobool -from io import open -import logging -import sys - -PY2 = sys.version_info[0] == 2 -sys.stdin = codecs.getreader('utf-8')(sys.stdin if PY2 
else sys.stdin.buffer) -sys.stdout = codecs.getwriter('utf-8')( - sys.stdout if PY2 else sys.stdout.buffer) - - -# Special types: -def shape(x): - """Change str to List[int] - - >>> shape('3,5') - [3, 5] - >>> shape(' [3, 5] ') - [3, 5] - - """ - - # x: ' [3, 5] ' -> '3, 5' - x = x.strip() - if x[0] == '[': - x = x[1:] - if x[-1] == ']': - x = x[:-1] - - return list(map(int, x.split(','))) - - -def get_parser(): - parser = argparse.ArgumentParser( - description='Given each file paths with such format as ' - '::. type> can be omitted and the default ' - 'is "str". e.g. {} ' - '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' - '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' - '--output-scps text:data/text shape:data/utt2text_shape:shape ' - '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--input-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the inputs') - parser.add_argument('--output-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the outputs') - parser.add_argument('--scps', - type=str, - nargs='+', - default=[], - help='The files except for the input and outputs') - parser.add_argument('--verbose', - '-V', - default=1, - type=int, - help='Verbose option') - parser.add_argument('--allow-one-column', - type=strtobool, - default=False, - help='Allow one column in input scp files. ' - 'In this case, the value will be empty string.') - parser.add_argument('--out', - '-O', - type=str, - help='The output filename. ' - 'If omitted, then output to sys.stdout') - return parser - - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - args.scps = [args.scps] - - # logging info - logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - if args.verbose > 0: - logging.basicConfig(level=logging.INFO, format=logfmt) - else: - logging.basicConfig(level=logging.WARN, format=logfmt) - - inputs = {} - assert (len(args.input_scps) == 1) - for f in args.input_scps[0]: - arr = f.strip().split(':') - inputs[arr[0]] = arr[1] - assert ('feat' in inputs) - assert ('shape' in inputs) - - outputs = {} - assert (len(args.output_scps) == 1) - for f in args.output_scps[0]: - arr = f.strip().split(':') - outputs[arr[0]] = arr[1] - assert ('shape' in outputs) - assert ('text' in outputs) - assert ('token' in outputs) - assert ('tokenid' in outputs) - - files = [ - inputs['feat'], inputs['shape'], outputs['text'], outputs['token'], - outputs['tokenid'], outputs['shape'] - ] - fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape'] - fids = [open(f, 'r', encoding='utf-8') for f in files] - - if args.out is None: - out = sys.stdout - else: - out = open(args.out, 'w', encoding='utf-8') - done = False - while not done: - for i, fid in enumerate(fids): - line = fid.readline() - if line == '': - done = True - break - arr = line.strip().split() - content = ' '.join(arr[1:]) - if i == 0: - out.write('utt:{}'.format(arr[0])) - out.write('\t') - out.write('{}:{}'.format(fields[i], content)) - out.write('\n') - - for f in fids: - f.close() - if args.out is not None: - out.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/onnx2horizonbin.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/onnx2horizonbin.py deleted file mode 100644 index 
a94b647fb19d1446d4bc506c399c85677dddde9f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/onnx2horizonbin.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. specific decoding method: ctc_greedy_search -""" - -import argparse -import copy -import logging -import os -import sys -import random -import torch -import yaml -import numpy as np - -from torch.utils.data import DataLoader - -from wenet.utils.common import remove_duplicates_and_blank -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import to_numpy -from wenet.bin.export_onnx_bpu import export_encoder, export_ctc - - -try: - import hbdk # noqa: F401 - import horizon_nn # noqa: F401 - from horizon_tc_ui import HB_ONNXRuntime -except ImportError: - print('Please install hbdk,horizon_nn,horizon_tc_ui !') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def save_data(tensor, dirs, prefix): - if tensor.requires_grad: - data = tensor.detach().numpy().astype(np.float32) - else: - data = tensor.numpy().astype(np.float32) - os.makedirs(dirs, exist_ok=True) - data.tofile(dirs + "/" + prefix + ".bin") - - -def make_calibration_data(enc, args, conf): - conf['shuffle'] = True - logger.info(conf) - dataset = Dataset( - "shard", args.cali_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - cal_data_dir = os.path.join(args.output_dir, 
'cal_data_dir') - for batch_idx, batch in enumerate(dataloader): - if batch_idx >= args.max_samples: - break - if batch_idx % 100 == 0: - logger.info("processed {} samples.".format(batch_idx)) - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - - # Feed forward overlap input step by step - random_high = (num_frames - context) // stride - num_rand = random.randint(0, random_high) - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - if i == num_rand: - save_data(chunk, "{}/chunk".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_cache, "{}/att_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(cnn_cache, "{}/cnn_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_mask, "{}/att_mask".format(cal_data_dir), - prefix + "." + str(i)) - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - # NOTE(xcsong): It's fast to calibrate ctc.onnx, - # so it's okay to save all chunks - save_data(y, "{}/hidden".format(cal_data_dir), - prefix + "." 
+ str(i)) - - -def check_wer(enc, ctc, args, conf): - conf['shuffle'] = False - dataset = Dataset( - "shard", args.wer_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - char_dict = {v: k for k, v in args.symbol_table.items()} - eos = len(char_dict) - 1 - - enc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_encoder/encoder_quantized_model.onnx") - ctc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_ctc/ctc_quantized_model.onnx") - torch_file = open(args.output_dir + "/torch_text", 'w', encoding="utf-8") - onnx_file = open(args.output_dir + "/onnx_text", 'w', encoding="utf-8") - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - for batch_idx, batch in enumerate(dataloader): - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - - # Feed forward overlap input step by step - torch_out, onnx_out = [], [] - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - # Torch model - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - torch_out.append(ctc.forward(y).transpose(1, 3).squeeze(2)) - # Quantized onnx model - ort_inputs = { - 'chunk': to_numpy(chunk), 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': to_numpy(att_mask)} - ort_outs = enc_session.run_feature( - enc_session.output_names, ort_inputs, input_offset=0) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_y = ctc_session.run_feature( - ctc_session.output_names, {'hidden': ort_outs[0]}, input_offset=0) - onnx_out.append(torch.from_numpy( - np.squeeze(onnx_y[0].transpose(0, 3, 2, 1), axis=2))) - - def post_process(list_out, file_obj, keys): - probs = torch.cat(list_out, dim=1) - maxlen = probs.size(1) - topk_prob, topk_index = probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - hyps = 
[hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - for i, key in enumerate(keys): - content = '' - for w in hyps[i]: - if w == eos: - break - content += char_dict[w] - file_obj.write('{} {}\n'.format(key, content)) - return key, content - - if len(torch_out) > 0 and len(onnx_out) > 0: - key, content = post_process(torch_out, torch_file, keys) - logger.info('torch: {} {}'.format(key, content)) - key, content = post_process(onnx_out, onnx_file, keys) - logger.info('onnx : {} {}'.format(key, content)) - torch_file.close() - onnx_file.close() - - -def generate_config(enc_session, ctc_session, args): - template = """ -# 模型参数组 -model_parameters: - # 原始Onnx浮点模型文件 - onnx_model: '{}' - # 转换的目标AI芯片架构 - march: 'bernoulli2' - # 模型转换输出的用于上板执行的模型文件的名称前缀 - output_model_file_prefix: '{}' - # 模型转换输出的结果的存放目录 - working_dir: '{}' - # 指定转换后混合异构模型是否保留输出各层的中间结果的能力 - layer_out_dump: False - # 转换过程中日志生成级别 - log_level: 'debug' -# 输入信息参数组 -input_parameters: - # 原始浮点模型的输入节点名称 - input_name: '{}' - # 原始浮点模型的输入数据格式(数量/顺序与input_name一致) - input_type_train: '{}' - # 原始浮点模型的输入数据排布(数量/顺序与input_name一致) - input_layout_train: '{}' - # 原始浮点模型的输入数据尺寸 - input_shape: '{}' - # 网络实际执行时,输入给网络的batch_size 默认值为1 - # input_batch: 1 - # 在模型中添加的输入数据预处理方法 - norm_type: '{}' - # 预处理方法的图像减去的均值; 如果是通道均值,value之间必须用空格分隔 - # mean_value: '' - # 预处理方法的图像缩放比例,如果是通道缩放比例,value之间必须用空格分隔 - # scale_value: '' - # 转换后混合异构模型需要适配的输入数据格式(数量/顺序与input_name一致) - input_type_rt: '{}' - # 输入数据格式的特殊制式 - input_space_and_range: '' - # 转换后混合异构模型需要适配的输入数据排布(数量/顺序与input_name一致) - input_layout_rt: '{}' -# 校准参数组 -calibration_parameters: - # 模型校准使用的标定样本的存放目录 - cal_data_dir: '{}' - # 开启图片校准样本自动处理(skimage read resize到输入节点尺寸) - preprocess_on: False - # 校准使用的算法类型 - calibration_type: '{}' - # max 校准方式的参数 - max_percentile: 1.0 - # 强制指定OP在CPU上运行 - run_on_cpu: '{}' - # 强制指定OP在BPU上运行 - run_on_bpu: '{}' -# 编译参数组 -compiler_parameters: - # 编译策略选择 - compile_mode: 'latency' - # 是否打开编译的debug信息 - debug: False - # 模型运行核心数 - core_num: 1 - # 模型编译的优化等级选择 - optimize_level: 'O3' -""" - output_dir = os.path.realpath(args.output_dir) - cal_data_dir = os.path.join(output_dir, 'cal_data_dir') - os.makedirs(cal_data_dir, exist_ok=True) - enc_dic = enc_session.get_modelmeta().custom_metadata_map - enc_onnx_path = os.path.join(output_dir, 'encoder.onnx') - enc_log_path = os.path.join(output_dir, 'hb_makertbin_output_encoder') - enc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in enc_dic['input_name'].split(';')]) - ctc_dic = ctc_session.get_modelmeta().custom_metadata_map - ctc_onnx_path = os.path.join(output_dir, 'ctc.onnx') - ctc_log_path = os.path.join(output_dir, 'hb_makertbin_output_ctc') - ctc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in ctc_dic['input_name'].split(';')]) - enc_config = template.format( - enc_onnx_path, "encoder", enc_log_path, - enc_dic['input_name'], enc_dic['input_type'], - enc_dic['input_layout_train'], enc_dic['input_shape'], - enc_dic['norm_type'], enc_dic['input_type'], enc_dic['input_layout_rt'], - enc_cal_data, args.calibration_type, args.extra_ops_run_on_cpu, "") - ctc_config = template.format( - ctc_onnx_path, "ctc", ctc_log_path, - ctc_dic['input_name'], ctc_dic['input_type'], - ctc_dic['input_layout_train'], ctc_dic['input_shape'], - ctc_dic['norm_type'], ctc_dic['input_type'], ctc_dic['input_layout_rt'], - ctc_cal_data, "default", "", "") - with open(output_dir + "/config_encoder.yaml", "w") as enc_yaml: - enc_yaml.write(enc_config) - with open(output_dir + 
"/config_ctc.yaml", "w") as ctc_yaml: - ctc_yaml.write(ctc_config) - - -def get_args(): - parser = argparse.ArgumentParser(description='convert onnx to horizon .bin') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - parser.add_argument('--dict', type=str, required=True, help='dict file') - parser.add_argument('--max_samples', type=int, required=True, - help='maximum samples') - parser.add_argument('--cali_datalist', type=str, default=None, - help='make calibration data') - parser.add_argument('--wer_datalist', type=str, default=None, - help='check wer') - parser.add_argument('--wer_text', type=str, default=None, - help='check wer') - parser.add_argument('--bpe_model', default=None, type=str, - help='bpe model for english part') - parser.add_argument('--ln_run_on_bpu', action='store_true', - help='layernorm running on bpu') - parser.add_argument('--extra_ops_run_on_cpu', type=str, default=None, - help='extra operations running on cpu.') - parser.add_argument('--calibration_type', type=str, default='default', - help='kl / max / default.') - return parser - - -if __name__ == '__main__': - random.seed(777) - parser = get_args() - args = parser.parse_args() - # NOTE(xcsong): X3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(conf) - load_checkpoint(model, args.checkpoint) - model.eval() - - symbol_table = read_symbol_table(args.dict) - args.symbol_table = symbol_table - args.feature_size = conf['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - logger.info("Stage-1: Export onnx") - enc, enc_session = export_encoder(model, args) - ctc, ctc_session = export_ctc(model, args) - - conf = copy.deepcopy(conf['dataset_conf']) - conf['filter_conf']['max_length'] = 102400 - conf['filter_conf']['min_length'] = 0 - conf['filter_conf']['token_max_length'] = 102400 - conf['filter_conf']['token_min_length'] = 0 - conf['filter_conf']['max_output_input_ratio'] = 102400 - conf['filter_conf']['min_output_input_ratio'] = 0 - conf['speed_perturb'] = False - conf['spec_aug'] = False - conf['spec_sub'] = False - conf['spec_trim'] = False - conf['shuffle'] = False - conf['sort'] = False - if 'fbank_conf' in conf: - conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in conf: - conf['mfcc_conf']['dither'] = 0.0 - conf['batch_conf']['batch_type'] = "static" - conf['batch_conf']['batch_size'] = 1 - - if args.cali_datalist is not None: - logger.info("Stage-2: Generate config") - # FIXME(xcsong): Remove hard code - logger.info("torch version: {}".format(torch.__version__)) - if int(torch.__version__[:4].split('.')[1]) >= 13: - args.extra_ops_run_on_cpu = "/Split;" + \ - "/encoders.0/self_attn/Split;/encoders.1/self_attn/Split;" + \ - 
"/encoders.2/self_attn/Split;/encoders.3/self_attn/Split;" + \ - "/encoders.4/self_attn/Split;/encoders.5/self_attn/Split;" + \ - "/encoders.6/self_attn/Split;/encoders.7/self_attn/Split;" + \ - "/encoders.8/self_attn/Split;/encoders.9/self_attn/Split;" + \ - "/encoders.10/self_attn/Split;/encoders.11/self_attn/Split;" + \ - "/encoders.0/self_attn/Mul;/encoders.1/self_attn/Mul;" + \ - "/encoders.2/self_attn/Mul;/encoders.3/self_attn/Mul;" + \ - "/encoders.4/self_attn/Mul;/encoders.5/self_attn/Mul;" + \ - "/encoders.6/self_attn/Mul;/encoders.7/self_attn/Mul;" + \ - "/encoders.8/self_attn/Mul;/encoders.9/self_attn/Mul;" + \ - "/encoders.10/self_attn/Mul;/encoders.11/self_attn/Mul;" - else: - args.extra_ops_run_on_cpu = "Split_17;Split_67;Split_209;" + \ - "Split_351;Split_493;Split_635;Split_777;Split_919;Split_1061;" + \ - "Split_1203;Split_1345;Split_1487;Split_1629;" + \ - "Mul_72;Mul_214;Mul_356;Mul_498;Mul_640;Mul_782;" + \ - "Mul_924;Mul_1066;Mul_1208;Mul_1350;Mul_1492;Mul_1634;" - generate_config(enc_session, ctc_session, args) - - logger.info("Stage-3: Make calibration data") - make_calibration_data(enc, args, conf) - - output_dir = os.path.realpath(args.output_dir) - logger.info("Stage-4: Make ctc.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_ctc".format(output_dir) + - " && cd hb_makertbin_log_ctc &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_ctc.yaml") - ) - logger.info("Stage-5: Make encoder.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_encoder ".format(output_dir) + - " && cd hb_makertbin_log_encoder &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_encoder.yaml") - ) - - if args.wer_datalist is not None: - logger.info("Stage-6: Check wer between torch model and quantized onnx") - assert args.wer_text is not None - check_wer(enc, ctc, args, conf) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/torch_text", - args.output_dir + "/torch_wer") - ) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/onnx_text", - args.output_dir + "/onnx_wer") - ) - os.system("tail {} {}".format( - args.output_dir + "/torch_wer", args.output_dir + "/onnx_wer")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/parse_options.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/parse_options.sh deleted file mode 100644 index 34476fdb37a4b14d5fe6e0edbebe97e760d2be5a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- - -# Parse command-line options. -# To be sourced by another script (as in ". parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/perturb_data_dir_speed.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/perturb_data_dir_speed.sh deleted file mode 100644 index 901a4882e6481ae269067b0fe7175dba62c4db9e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/perturb_data_dir_speed.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# 2020 @kamo-naoyuki -# This file was copied from Kaldi and -# I deleted parts related to wav duration -# because we shouldn't use kaldi's command here -# and we don't need the files actually. 
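Speed perturbation in this recipe is done offline by rewriting each `wav.scp` entry to pipe the audio through sox's `speed` effect at a fixed factor (commonly 0.9, 1.0 and 1.1). A minimal in-memory sketch of the same idea using torchaudio's sox bindings (illustrative only, assuming torchaudio is installed; the shell script itself drives sox directly):

```python
import torchaudio

def speed_perturb(wav_path: str, factor: float = 0.9):
    """Apply sox 'speed' (plus 'rate' to restore the sample rate) to one utterance."""
    waveform, sample_rate = torchaudio.load(wav_path)
    effects = [["speed", str(factor)], ["rate", str(sample_rate)]]
    perturbed, sr = torchaudio.sox_effects.apply_effects_tensor(
        waveform, sample_rate, effects)
    return perturbed, sr

# e.g. build 0.9x / 1.0x / 1.1x variants of one utterance (path is hypothetical)
for f in (0.9, 1.0, 1.1):
    wav, sr = speed_perturb("/PATH/to/aishell/wav/BAC009S0764W0121.wav", f)
```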
- -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 2014 Tom Ko -# 2018 Emotech LTD (author: Pawel Swietojanski) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# wav.scp -# spk2utt -# utt2spk -# text -# -# It generates the files which are used for perturbing the speed of the original data. - -export LC_ALL=C -set -euo pipefail - -if [[ $# != 3 ]]; then - echo "Usage: perturb_data_dir_speed.sh " - echo "e.g.:" - echo " $0 0.9 data/train_si284 data/train_si284p" - exit 1 -fi - -factor=$1 -srcdir=$2 -destdir=$3 -label="sp" -spk_prefix="${label}${factor}-" -utt_prefix="${label}${factor}-" - -#check is sox on the path - -! command -v sox &>/dev/null && echo "sox: command not found" && exit 1; - -if [[ ! -f ${srcdir}/utt2spk ]]; then - echo "$0: no such file ${srcdir}/utt2spk" - exit 1; -fi - -if [[ ${destdir} == "${srcdir}" ]]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -mkdir -p "${destdir}" - -<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map" -<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map" -<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map" -if [[ ! -f ${srcdir}/utt2uniq ]]; then - <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq" -else - <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq" -fi - - -<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \ - utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk - -utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt - -if [[ -f ${srcdir}/segments ]]; then - - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \ - utils/apply_map.pl -f 2 "${destdir}"/reco_map | \ - awk -v factor="${factor}" \ - '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \ - >"${destdir}"/segments - - utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - if [[ -f ${srcdir}/reco2file_and_channel ]]; then - utils/apply_map.pl -f 1 "${destdir}"/reco_map \ - <"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel - fi - -else # no segments->wav indexed by utterance. 
- if [[ -f ${srcdir}/wav.scp ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - fi -fi - -if [[ -f ${srcdir}/text ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text -fi -if [[ -f ${srcdir}/spk2gender ]]; then - utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender -fi -if [[ -f ${srcdir}/utt2lang ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang -fi - -rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null -echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}" - -utils/validate_data_dir.sh --no-feats --no-text "${destdir}" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/reduce_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/reduce_data_dir.sh deleted file mode 100644 index 16194dcc7309a646041181a698c53cd4f46e618b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/reduce_data_dir.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# koried, 10/29/2012 - -# Reduce a data set based on a list of turn-ids - -help_message="usage: $0 srcdir turnlist destdir" - -if [ $1 == "--help" ]; then - echo "${help_message}" - exit 0; -fi - -if [ $# != 3 ]; then - echo "${help_message}" - exit 1; -fi - -srcdir=$1 -reclist=$2 -destdir=$3 - -if [ ! -f ${srcdir}/utt2spk ]; then -echo "$0: no such file $srcdir/utt2spk" -exit 1; -fi - -function do_filtering { -# assumes the utt2spk and spk2utt files already exist. - [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp - [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text - [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames - [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender - [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp - if [ -f ${srcdir}/segments ]; then - utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments - awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. 
- [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/reco2file_and_channel ] && \ - utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel - - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm - rm ${destdir}/reco - fi - srcutts=$(wc -l < ${srcdir}/utt2spk) - destutts=$(wc -l < ${destdir}/utt2spk) - echo "Reduced #utt from $srcutts to $destutts" -} - -mkdir -p ${destdir} - -# filter the utt2spk based on the set of recordings -utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk - -utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt -do_filtering; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/remove_longshortdata.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/remove_longshortdata.py deleted file mode 100644 index 7e92f8a424d2d717acf6fc1db5503f79ba38a898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/remove_longshortdata.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='remove too long or too short data in format.data') - parser.add_argument('--data_file', - type=str, - help='input format data') - parser.add_argument('--output_data_file', - type=str, - help='output format data') - parser.add_argument('--min_input_len', type=float, - default=0, - help='minimum input seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--max_input_len', type=float, - default=20, - help='maximum output seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--min_output_len', type=float, - default=0, help='minimum input seq length, in modeling units') - parser.add_argument('--max_output_len', type=float, - default=500, - help='maximum output seq length, in modeling units') - parser.add_argument('--min_output_input_ratio', type=float, default=0.05, - help='minimum output seq length/output seq length ratio') - parser.add_argument('--max_output_input_ratio', type=float, default=10, - help='maximum output seq length/output seq length ratio') - args = parser.parse_args() - - data_file = args.data_file - output_data_file = args.output_data_file - min_input_len = args.min_input_len - max_input_len = args.max_input_len - min_output_len = args.min_output_len - max_output_len = args.max_output_len - min_output_input_ratio = args.min_output_input_ratio - max_output_input_ratio = args.max_output_input_ratio - - with open(data_file, 'r') as f, open(output_data_file, 'w') as fout: - for l in f: - l = l.strip() - if l: - items = l.strip().split('\t') - token_shape = items[6] - feature_shape = items[2] - feat_len = float(feature_shape.split(':')[1].split(',')[0]) - token_len = float(token_shape.split(':')[1].split(',')[0]) - condition = [feat_len > min_input_len, - feat_len < max_input_len, - token_len > min_output_len, - token_len < max_output_len, - token_len / feat_len > min_output_input_ratio, - token_len / feat_len < max_output_input_ratio, - ] - if all(condition): - fout.write('{}\n'.format(l)) - continue diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/segment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/segment.py deleted file mode 100644 index a1a7f93a05fbaf42ca09c26c0e5be6a7185f0d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/segment.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2021 Mobvoi Inc. (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='generate segmented wav.scp') - parser.add_argument('--segments', required=True, help='segments file') - parser.add_argument('--input', - required=True, - help='origin wav.scp that not segmented') - parser.add_argument('--output', - required=True, - help='output segmented wav.scp') - wav_dic = {} - args = parser.parse_args() - ori_wav = args.input - segment_file = args.segments - wav_scp = args.output - with open(ori_wav, 'r') as ori: - for l in ori: - item = l.strip().split() - wav_dic[item[0]] = item[1] - with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement: - for l in sgement: - item = l.strip().split() - if item[1] in wav_dic: - item[1] = wav_dic[item[1]] - f.write("{} {},{},{}\n".format(item[0], item[1], item[2], item[3])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/setup_anaconda.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/setup_anaconda.sh deleted file mode 100644 index f53ace9cc4c19994fc79d01e85d70f49d40d673f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/setup_anaconda.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env bash -# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet) -set -euo pipefail - -if [ -z "${PS1:-}" ]; then - PS1=__dummy__ -fi -CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - -if [ $# -gt 4 ]; then - echo "Usage: $0 [output] [conda-env-name] [python-version>]" - exit 1; -elif [ $# -eq 3 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="$3" -elif [ $# -eq 2 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="" -elif [ $# -eq 1 ]; then - output_dir="$1" - name="" - PYTHON_VERSION="" -elif [ $# -eq 0 ]; then - output_dir=venv - name="" - PYTHON_VERSION="" -fi - -if [ -e activate_python.sh ]; then - echo "Warning: activate_python.sh already exists. It will be overwritten" -fi - -if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then - if [ ! -e miniconda.sh ]; then - wget --tries=3 "${CONDA_URL}" -O miniconda.sh - fi - - bash miniconda.sh -b -p "${output_dir}" -fi - -# shellcheck disable=SC1090 -source "${output_dir}/etc/profile.d/conda.sh" -conda deactivate - -# If the env already exists, skip recreation -if [ -n "${name}" ] && ! 
conda activate ${name}; then - conda create -yn "${name}" -fi -conda activate ${name} - -if [ -n "${PYTHON_VERSION}" ]; then - conda install -y conda "python=${PYTHON_VERSION}" -else - conda install -y conda -fi - -conda install -y pip setuptools - -cat << EOF > activate_python.sh -#!/usr/bin/env bash -# THIS FILE IS GENERATED BY tools/setup_anaconda.sh -if [ -z "\${PS1:-}" ]; then - PS1=__dummy__ -fi -. $(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name} -EOF diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/sph2wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/sph2wav.sh deleted file mode 100644 index a8f0749e3be2ee69b5831da6699c303510ecbed4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/sph2wav.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -# convert sph scp to segmented wav scp -nj=1 -. tools/parse_options.sh || exit 1; - -inscp=$1 -segments=$2 -outscp=$3 -data=$(dirname ${inscp}) -if [ $# -eq 4 ]; then - logdir=$4 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe -[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; -sox=`which sox` -[ ! 
-x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2); - printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \ - sort > $data/wav_ori.scp || exit 1; - -tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp -sed -i 's/ /,/g' $data/wav_segments.scp -sed -i 's/#/ /g' $data/wav_segments.scp - -rm -f $logdir/wav_*.slice -rm -f $logdir/*.log -split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - mkdir -p ${data}/wavs/${name} - cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \ - -v logdir=$logdir -v name=$name '{ - during=$4-$3 - cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during; - system(cmd) - printf("%s %s/%s.wav\n", $1, data, $1); - }' | \ - sort > ${data}/wavs_${name}.scp || exit 1; -} & -done -wait -cat ${data}/wavs_*.scp > $outscp -rm ${data}/wavs_*.scp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/spk2utt_to_utt2spk.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/spk2utt_to_utt2spk.pl deleted file mode 100644 index 19fb89d501146e360912863d847d6eabb0194511..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/spm_decode b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/spm_decode deleted file mode 100644 index 882b4f966013d7708460f8d41696583ae59f8fa9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/spm_decode +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for decoding") - parser.add_argument("--input", default=None, help="input file to decode") - parser.add_argument("--input_format", choices=["piece", "id"], default="piece") - args = parser.parse_args() - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.input_format == "piece": - def decode(l): - return "".join(sp.DecodePieces(l)) - elif args.input_format == "id": - def decode(l): - return "".join(sp.DecodeIds(l)) - else: - raise NotImplementedError - - def tok2int(tok): - # remap reference-side (represented as <>) to 0 - return int(tok) if tok != "<>" else 0 - - if args.input is None: - h = sys.stdin - else: - h = open(args.input, "r", encoding="utf-8") - for line in h: - print(decode(line.split())) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/spm_encode b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/spm_encode deleted file mode 100644 index 4dd2e1004f9fe393c2d34b43bade881b84a31b1f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/spm_encode +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import contextlib -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for encoding") - parser.add_argument("--inputs", nargs="+", default=['-'], - help="input files to filter/encode") - parser.add_argument("--outputs", nargs="+", default=['-'], - help="path to save encoded outputs") - parser.add_argument("--output_format", choices=["piece", "id"], default="piece") - parser.add_argument("--min-len", type=int, metavar="N", - help="filter sentence pairs with fewer than N tokens") - parser.add_argument("--max-len", type=int, metavar="N", - help="filter sentence pairs with more than N tokens") - args = parser.parse_args() - - assert len(args.inputs) == len(args.outputs), \ - "number of input and output paths should match" - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.output_format == "piece": - def encode(l): - return sp.EncodeAsPieces(l) - elif args.output_format == "id": - def encode(l): - return list(map(str, sp.EncodeAsIds(l))) - else: - raise NotImplementedError - - if args.min_len is not None or args.max_len is not None: - def valid(line): - return ( - (args.min_len is None or len(line) >= args.min_len) and - (args.max_len is None or len(line) <= args.max_len) - ) - else: - def valid(lines): - return True - - with contextlib.ExitStack() as stack: - inputs = [ - stack.enter_context(open(input, "r", encoding="utf-8")) - if input != "-" else sys.stdin - for input in args.inputs - ] - outputs = [ - stack.enter_context(open(output, "w", 
encoding="utf-8")) - if output != "-" else sys.stdout - for output in args.outputs - ] - - stats = { - "num_empty": 0, - "num_filtered": 0, - } - - def encode_line(line): - line = line.strip() - if len(line) > 0: - line = encode(line) - if valid(line): - return line - else: - stats["num_filtered"] += 1 - else: - stats["num_empty"] += 1 - return None - - for i, lines in enumerate(zip(*inputs), start=1): - enc_lines = list(map(encode_line, lines)) - if not any(enc_line is None for enc_line in enc_lines): - for enc_line, output_h in zip(enc_lines, outputs): - print(" ".join(enc_line), file=output_h) - if i % 10000 == 0: - print("processed {} lines".format(i), file=sys.stderr) - - print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) - print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/spm_train b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/spm_train deleted file mode 100644 index 0b247aee0dc5fcaa7b6cf66d89602e896619c9bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/spm_train +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE -import sys - -import sentencepiece as spm - - -if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/subset_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/subset_data_dir.sh deleted file mode 100644 index c35bee62d8710facb8c42a9171ed3caf0171450f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/subset_data_dir.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2010-2011 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - - -# This script operates on a data directory, such as in data/train/. -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data -# for what these directories contain. - -# This script creates a subset of that data, consisting of some specified -# number of utterances. (The selected utterances are distributed evenly -# throughout the file, by the program ./subset_scp.pl). - -# There are six options, none compatible with any other. - -# If you give the --per-spk option, it will attempt to select the supplied -# number of utterances for each speaker (typically you would supply a much -# smaller number in this case). - -# If you give the --speakers option, it selects a subset of n randomly -# selected speakers. - -# If you give the --shortest option, it will give you the n shortest utterances. - -# If you give the --first option, it will just give you the n first utterances. - -# If you give the --last option, it will just give you the n last utterances. - -# If you give the --spk-list or --utt-list option, it reads the -# speakers/utterances to keep from /" (note, -# in this case there is no positional parameter; see usage message.) 
- - -shortest=false -perspk=false -speakers=false -first_opt= -spk_list= -utt_list= - -expect_args=3 -case $1 in - --first|--last) first_opt=$1; shift ;; - --per-spk) perspk=true; shift ;; - --shortest) shortest=true; shift ;; - --speakers) speakers=true; shift ;; - --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; - --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; - --*) echo "$0: invalid option '$1'"; exit 1 -esac - -if [ $# != $expect_args ]; then - echo "Usage:" - echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] " - echo " subset_data_dir.sh [--spk-list ] " - echo " subset_data_dir.sh [--utt-list ] " - echo "By default, randomly selects utterances from the data directory." - echo "With --speakers, randomly selects enough speakers that we have utterances" - echo "With --per-spk, selects utterances per speaker, if available." - echo "With --first, selects the first utterances" - echo "With --last, selects the last utterances" - echo "With --shortest, selects the shortest utterances." - echo "With --spk-list, reads the speakers to keep from " - echo "With --utt-list, reads the utterances to keep from " - exit 1; -fi - -srcdir=$1 -if [[ $spk_list || $utt_list ]]; then - numutt= - destdir=$2 -else - numutt=$2 - destdir=$3 -fi - -export LC_ALL=C - -if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" - exit 1 -fi - -if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then - echo "$0: cannot subset to more utterances than you originally had." - exit 1 -fi - -if $shortest && [ ! -f $srcdir/feats.scp ]; then - echo "$0: you selected --shortest but no feats.scp exist." - exit 1 -fi - -mkdir -p $destdir || exit 1 - -if [[ $spk_list ]]; then - tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; -elif [[ $utt_list ]]; then - tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; -elif $speakers; then - tools/shuffle_list.pl < $srcdir/spk2utt | - awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | - sort > $destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -elif $perspk; then - awk '{ n='$numutt'; printf("%s ",$1); - skip=1; while(n*(skip+1) <= NF-1) { skip++; } - for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); } - printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -else - if $shortest; then - # Select $numutt shortest utterances. - . ./path.sh - feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; - sort -n -k2 $destdir/tmp.len | - awk '{print $1}' | - head -$numutt >$destdir/tmp.uttlist - tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk - rm $destdir/tmp.uttlist $destdir/tmp.len - else - # Select $numutt random utterances. - tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; - fi - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt -fi - -# Perform filtering. utt2spk and spk2utt files already exist by this point. -# Filter by utterance. 
-[ -f $srcdir/feats.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp -[ -f $srcdir/vad.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp -[ -f $srcdir/utt2lang ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang -[ -f $srcdir/utt2dur ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur -[ -f $srcdir/utt2num_frames ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames -[ -f $srcdir/utt2uniq ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq -[ -f $srcdir/wav.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp -[ -f $srcdir/utt2warp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp -[ -f $srcdir/text ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - -# Filter by speaker. -[ -f $srcdir/spk2warp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp -[ -f $srcdir/spk2gender ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender -[ -f $srcdir/cmvn.scp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - -# Filter by recording-id. -if [ -f $srcdir/segments ]; then - tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - # Recording-ids are in segments. - awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco - # The next line overrides the command above for wav.scp, which would be incorrect. - #[ -f $srcdir/wav.scp ] && - # tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp -else - # No segments; recording-ids are in wav.scp. - awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco -fi - -[ -f $srcdir/reco2file_and_channel ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel -[ -f $srcdir/reco2dur ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur - -# Filter the STM file for proper sclite scoring. -# Copy over the comments from STM file. -[ -f $srcdir/stm ] && - (grep "^;;" $srcdir/stm - tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm - -rm $destdir/reco - -# Copy frame_shift if present. -[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir - -srcutts=$(wc -l <$srcdir/utt2spk) -destutts=$(wc -l <$destdir/utt2spk) -echo "$0: reducing #utt from $srcutts to $destutts" -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/subset_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/subset_scp.pl deleted file mode 100644 index 11fddc09a0f4e5fad8e5d63cf65e7e5e627e4af6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/subset_scp.pl +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This program selects a subset of N elements in the scp. - -# By default, it selects them evenly from throughout the scp, in order to avoid -# selecting too many from the same speaker. It prints them on the standard -# output. -# With the option --first, it just selects the N first utterances. -# With the option --last, it just selects the N last utterances. - -# Last modified by JHU & HKUST @2013 - - -$quiet = 0; -$first = 0; -$last = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--quiet") { - shift; - $quiet = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--first") { - shift; - $first = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--last") { - shift; - $last = 1; -} - -if(@ARGV < 2 ) { - die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . - " --quiet causes it to not die if N < num lines in scp.\n" . - " --first and --last make it equivalent to head or tail.\n" . - "See also: filter_scp.pl\n"; -} - -$N = shift @ARGV; -if($N == 0) { - die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; -} -$inscp = shift @ARGV; -open(I, "<$inscp") || die "Opening input scp file $inscp"; - -@F = (); -while() { - push @F, $_; -} -$numlines = @F; -if($N > $numlines) { - if ($quiet) { - $N = $numlines; - } else { - die "You requested from subset_scp.pl more elements than available: $N > $numlines"; - } -} - -sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if ($num_needed > $diff) { - die "select_n: code error"; - } - if ($diff == 1 ) { - if ($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); - } -} - -if ( ! $first && ! $last) { - if ($N > 0) { - select_n(0, $numlines, $N); - } -} else { - if ($first) { # --first option: same as head. - for ($n = 0; $n < $N; $n++) { - print $F[$n]; - } - } else { # --last option: same as tail. - for ($n = @F - $N; $n < @F; $n++) { - print $F[$n]; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/sym2int.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/sym2int.pl deleted file mode 100644 index cec097b6bdaefb5c3452e31fa334f0a7530b9a72..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/sym2int.pl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; - -for($x = 0; $x < 2; $x++) { - if ($ARGV[0] eq "--map-oov") { - shift @ARGV; - $map_oov = shift @ARGV; - if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { - # disallow '-f', the empty string and anything ending in words.txt as the - # OOV symbol because these are likely command-line errors. - die "the --map-oov option requires an argument"; - } - } - if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } - } -} - -$symtab = shift @ARGV; -if (!defined $symtab) { - print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . - "options: [--map-oov ] [-f ]\n" . - "note: can look like 4-5, or 4-, or 5-, or 1.\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up - if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } - $map_oov = $sym2int{$map_oov}; -} - -$num_warning = 0; -$max_warning = 20; - -while (<>) { - @A = split(" ", $_); - @B = (); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ( (!defined $field_begin || $n >= $field_begin) - && (!defined $field_end || $n <= $field_end)) { - $i = $sym2int{$a}; - if (!defined ($i)) { - if (defined $map_oov) { - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $map_oov; - } else { - $pos = $n+1; - die "sym2int.pl: undefined symbol $a (in position $pos)\n"; - } - } - $a = $i; - } - push @B, $a; - } - print join(" ", @B); - print "\n"; -} -if ($num_warning > 0) { - print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; -} - -exit(0); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/text2token.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/text2token.py deleted file mode 100644 index 4f4dcc901d436650695f0b80e0cf99e1e99269ee..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/text2token.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan) -# Copyright 2021 Mobvoi Inc. All Rights Reserved. 
(Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -import re -import sys - -is_python2 = sys.version_info[0] == 2 - - -def exist_or_not(i, match_pos): - start_pos = None - end_pos = None - for pos in match_pos: - if pos[0] <= i < pos[1]: - start_pos = pos[0] - end_pos = pos[1] - break - - return start_pos, end_pos - -def seg_char(sent): - pattern = re.compile(r'([\u4e00-\u9fa5])') - chars = pattern.split(sent) - chars = [w for w in chars if len(w.strip()) > 0] - return chars - -def get_parser(): - parser = argparse.ArgumentParser( - description='convert raw text to tokenized text', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--nchar', - '-n', - default=1, - type=int, - help='number of characters to split, i.e., \ - aabb -> a a b b with -n 1 and aa bb with -n 2') - parser.add_argument('--skip-ncols', - '-s', - default=0, - type=int, - help='skip first n columns') - parser.add_argument('--space', - default='', - type=str, - help='space symbol') - parser.add_argument('--bpe-model', - '-m', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--non-lang-syms', - '-l', - default=None, - type=str, - help='list of non-linguistic symobles,' - ' e.g., etc.') - parser.add_argument('text', - type=str, - default=False, - nargs='?', - help='input text') - parser.add_argument('--trans_type', - '-t', - type=str, - default="char", - choices=["char", "phn", "cn_char_en_bpe"], - help="""Transcript type. char/phn. e.g., for TIMIT - FADG0_SI1279 - - If trans_type is char, read from - SI1279.WRD file -> "bricks are an alternative" - Else if trans_type is phn, - read from SI1279.PHN file -> - "sil b r ih sil k s aa r er n aa l - sil t er n ih sil t ih v sil" """) - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - rs = [] - if args.non_lang_syms is not None: - with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f: - nls = [x.rstrip() for x in f.readlines()] - rs = [re.compile(re.escape(x)) for x in nls] - - if args.bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(args.bpe_model) - - if args.text: - f = codecs.open(args.text, encoding="utf-8") - else: - f = codecs.getreader("utf-8")( - sys.stdin if is_python2 else sys.stdin.buffer) - - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer) - line = f.readline() - n = args.nchar - while line: - x = line.split() - print(' '.join(x[:args.skip_ncols]), end=" ") - a = ' '.join(x[args.skip_ncols:]) - - # get all matched positions - match_pos = [] - for r in rs: - i = 0 - while i >= 0: - m = r.search(a, i) - if m: - match_pos.append([m.start(), m.end()]) - i = m.end() - else: - break - - if len(match_pos) > 0: - chars = [] - i = 0 - while i < len(a): - start_pos, end_pos = exist_or_not(i, match_pos) - if start_pos is not None: - chars.append(a[start_pos:end_pos]) - i = end_pos - else: - chars.append(a[i]) - i += 1 - a = chars - - if (args.trans_type == "phn"): - a = a.split(" ") - elif args.trans_type == "cn_char_en_bpe": - b = seg_char(a) - a = [] - for j in b: - # we use "▁" to instead of blanks among english words - # warning: here is "▁", not "_" - for l in j.strip().split("▁"): - if not l.encode('UTF-8').isalpha(): - a.append(l) - else: - for k in sp.encode_as_pieces(l): - a.append(k) - else: - a = [a[j:j + n] for j in range(0, 
len(a), n)] - - a_flat = [] - for z in a: - a_flat.append("".join(z)) - - a_chars = [z.replace(' ', args.space) for z in a_flat] - if (args.trans_type == "phn"): - a_chars = [z.replace("sil", args.space) for z in a_chars] - print(' '.join(a_chars)) - line = f.readline() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/utt2spk_to_spk2utt.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/utt2spk_to_spk2utt.pl deleted file mode 100644 index 5086699ff85fdcb8667bb9ab054700c53e35fd0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. - -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/validate_data_dir.sh deleted file mode 100644 index f4b4cbe1410111555d56380078e3d55381e7155a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/validate_data_dir.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/bin/bash - -cmd="$@" - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." - echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! 
-d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -if [ -f $data/images.scp ]; then - cmd=${cmd/--no-wav/} # remove --no-wav if supplied - image/validate_data_dir.sh $cmd - exit $? -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1; - ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - tools/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." 
- exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! 
cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! 
cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/validate_dict_dir.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/validate_dict_dir.pl deleted file mode 100644 index 819fca7f03caff91f3f24f0b69876a0bfc0abbe9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/validate_dict_dir.pl +++ /dev/null @@ -1,531 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. -# Copyright 2012 Guoguo Chen -# 2015 Daniel Povey -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for 'dict' directories (e.g. data/local/dict) - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line($.) 
$raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n"; - if ($has_invalid_whitespaces) { - print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n"; - return 0; - } else { - print "--> text contains only allowed whitespaces\n"; - } - } else { - print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n"; - } - return 1; -} - - -if(@ARGV != 1) { - die "Usage: validate_dict_dir.pl \n" . - "e.g.: validate_dict_dir.pl data/local/dict\n"; -} - -$dict = shift @ARGV; -$dict =~ s:/$::; - -$exit = 0; -$success = 1; # this is re-set each time we read a file. - -sub set_to_fail { $exit = 1; $success = 0; } - -# Checking silence_phones.txt ------------------------------- -print "Checking $dict/silence_phones.txt ...\n"; -if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} -$idx = 1; -%silence = (); -$crlf = 1; - -print "--> reading $dict/silence_phones.txt\n"; -check_allowed_whitespace(\*S) || set_to_fail(); -while() { - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; - } - foreach(0 .. 
-    my $p = $col[$_];
-    if($silence{$p}) {
-      set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n";
-    } else {
-      $silence{$p} = 1;
-    }
-    # disambiguation symbols; phones ending in _B, _E, _S or _I will cause
-    # problems with word-position-dependent systems, and is obviously
-    # confusable with epsilon.
-    if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>"){
-      set_to_fail();
-      print "--> ERROR: phone \"$p\" has disallowed written form\n";
-    }
-  }
-  $idx ++;
-}
-close(S);
-$success == 0 || print "--> $dict/silence_phones.txt is OK\n";
-print "\n";
-
-# Checking optional_silence.txt -------------------------------
-print "Checking $dict/optional_silence.txt ...\n";
-if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;}
-if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;}
-$idx = 1;
-$success = 1;
-$crlf = 1;
-print "--> reading $dict/optional_silence.txt\n";
-check_allowed_whitespace(\*OS) or exit 1;
-while(<OS>) {
-  chomp;
-  my @col = split(" ", $_);
-  if ($idx > 1 or @col > 1) {
-    set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n";
-  } elsif (!$silence{$col[0]}) {
-    set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n";
-  }
-  if ($crlf == 1 && m/\r/) {
-    print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n";
-    set_to_fail();
-    $crlf = 0;
-  }
-  $idx ++;
-}
-close(OS);
-$success == 0 || print "--> $dict/optional_silence.txt is OK\n";
-print "\n";
-
-# Checking nonsilence_phones.txt -------------------------------
-print "Checking $dict/nonsilence_phones.txt ...\n";
-if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;}
-if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;}
-$idx = 1;
-%nonsilence = ();
-$success = 1;
-$crlf = 1;
-print "--> reading $dict/nonsilence_phones.txt\n";
-check_allowed_whitespace(\*NS) or set_to_fail();
-while(<NS>) {
-  if ($crlf == 1 && m/\r/) {
-    print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n";
-    set_to_fail();
-    $crlf = 0;
-  }
-  if (! s/\n$//) {
-    print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n";
-    set_to_fail();
-  }
-  my @col = split(" ", $_);
-  if (@col == 0) {
-    set_to_fail();
-    print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n";
-  }
-  foreach(0 .. @col-1) {
-    my $p = $col[$_];
-    if($nonsilence{$p}) {
-      set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n";
-    } else {
-      $nonsilence{$p} = 1;
-    }
-    # phones that start with the pound sign/hash may be mistaken for
-    # disambiguation symbols; phones ending in _B, _E, _S or _I will cause
-    # problems with word-position-dependent systems, and is obviously
-    # confusable with epsilon.
-    if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>"){
-      set_to_fail();
-      print "--> ERROR: phone \"$p\" has disallowed written form\n";
-    }
-  }
-  $idx ++;
-}
-close(NS);
-$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n";
-print "\n";
-
-# Checking disjoint -------------------------------
-sub intersect {
-  my ($a, $b) = @_;
-  @itset = ();
-  %itset = ();
-  foreach(keys %$a) {
-    if(exists $b->{$_} and !$itset{$_}) {
-      push(@itset, $_);
-      $itset{$_} = 1;
-    }
-  }
-  return @itset;
-}
-
-print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n";
-@itset = intersect(\%silence, \%nonsilence);
-if(@itset == 0) {print "--> disjoint property is OK.\n";}
-else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";}
-print "\n";
-
-
-sub check_lexicon {
-  my ($lex, $num_prob_cols, $num_skipped_cols) = @_;
-  print "Checking $lex\n";
-  !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail();
-  my %seen_line = {};
-  $idx = 1; $success = 1; $crlf = 1;
-  print "--> reading $lex\n";
-  check_allowed_whitespace(\*L) or set_to_fail();
-  while (<L>) {
-    if ($crlf == 1 && m/\r/) {
-      print "--> ERROR: $lex contains Carriage Return (^M) characters.\n";
-      set_to_fail();
-      $crlf = 0;
-    }
-    if (defined $seen_line{$_}) {
-      print "--> ERROR: line '$_' of $lex is repeated\n";
-      set_to_fail();
-    }
-    $seen_line{$_} = 1;
-    if (! s/\n$//) {
-      print "--> ERROR: last line '$_' of $lex does not end in newline.\n";
-      set_to_fail();
-    }
-    my @col = split(" ", $_);
-    $word = shift @col;
-    if (!defined $word) {
-      print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail();
-    }
-    if ($word eq "<s>" || $word eq "</s>" || $word eq "<eps>" || $word eq "#0") {
-      print "--> ERROR: lexicon.txt contains forbidden word $word\n";
-      set_to_fail();
-    }
-    for ($n = 0; $n < $num_prob_cols; $n++) {
-      $prob = shift @col;
-      if (!($prob > 0.0 && $prob <= 1.0)) {
-        print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n";
-        set_to_fail();
-      }
-    }
-    for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; }
-    if (@col == 0) {
-      print "--> ERROR: lexicon.txt contains word $word with empty ";
-      print "pronunciation.\n";
-      set_to_fail();
-    }
-    foreach (0 .. @col-1) {
-      if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
-        print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt ";
-        print "(line $idx)\n";
-        set_to_fail();
-      }
-    }
-    $idx ++;
-  }
-  close(L);
-  $success == 0 || print "--> $lex is OK\n";
-  print "\n";
-}
-
-if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); }
-if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); }
-if (-f "$dict/lexiconp_silprob.txt") {
-  # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also
-  # exist.
- check_lexicon("$dict/lexiconp_silprob.txt", 2, 2); - if (-f "$dict/silprob.txt") { - !open(SP, "<$dict/silprob.txt") && - print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail(); - $crlf = 1; - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - chomp; my @col = split; - @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail(); - if ($col[0] eq "" || $col[0] eq "overall") { - if (!($col[1] > 0.0 && $col[1] <= 1.0)) { - set_to_fail(); - print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n"; - } - } elsif ($col[0] eq "_s" || $col[0] eq "_n") { - if ($col[1] <= 0.0) { - set_to_fail(); - print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n"; - } - } else { - print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n"; - set_to_fail(); - } - } - close(SP); - } else { - set_to_fail(); - print "--> ERROR: expecting $dict/silprob.txt to exist\n"; - } -} - -if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) { - print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n"; - set_to_fail(); -} - -sub check_lexicon_pair { - my ($lex1, $num_prob_cols1, $num_skipped_cols1, - $lex2, $num_prob_cols2, $num_skipped_cols2) = @_; - # We have checked individual lexicons already. - open(L1, "<$lex1"); open(L2, "<$lex2"); - print "Checking lexicon pair $lex1 and $lex2\n"; - my $line_num = 0; - while() { - $line_num++; - @A = split; - $line_B = ; - if (!defined $line_B) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); last; - } - @B = split(" ", $line_B); - # Check if the word matches. - if ($A[0] ne $B[0]) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - shift @A; shift @B; - for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; } - for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; } - # Check if the pronunciation matches - if (join(" ", @A) ne join(" ", @B)) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - } - $line_B = ; - if (defined $line_B && $exit == 0) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); - } - $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n"; -} - -# If more than one lexicon exist, we have to check if they correspond to each -# other. It could be that the user overwrote one and we need to regenerate the -# other, but we do not know which is which. -if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") { - check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0); -} -if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") { - check_lexicon_pair("$dict/lexiconp.txt", 1, 0, - "$dict/lexiconp_silprob.txt", 2, 2); -} - -# Checking extra_questions.txt ------------------------------- -%distinguished = (); # Keep track of all phone-pairs including nonsilence that - # are distinguished (split apart) by extra_questions.txt, - # as $distinguished{$p1,$p2} = 1. This will be used to - # make sure that we don't have pairs of phones on the same - # line in nonsilence_phones.txt that can never be - # distinguished from each other by questions. 
-                     # phones appear on the same line in nonsilence_phones.txt,
-                     # they share a tree root, and since the automatic
-                     # question-building treats all phones that appear on the
-                     # same line of nonsilence_phones.txt as being in the same
-                     # group, we can never distinguish them without resorting to
-                     # questions in extra_questions.txt.
-print "Checking $dict/extra_questions.txt ...\n";
-if (-s "$dict/extra_questions.txt") {
-  if (!open(EX, "<$dict/extra_questions.txt")) {
-    set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n";
-  }
-  $idx = 1;
-  $success = 1;
-  $crlf = 1;
-  print "--> reading $dict/extra_questions.txt\n";
-  check_allowed_whitespace(\*EX) or set_to_fail();
-  while(<EX>) {
-    if ($crlf == 1 && m/\r/) {
-      print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n";
-      set_to_fail();
-      $crlf = 0;
-    }
-    if (! s/\n$//) {
-      print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n";
-      set_to_fail();
-    }
-    my @col = split(" ", $_);
-    if (@col == 0) {
-      set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n";
-    }
-    foreach (0 .. @col-1) {
-      if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
-        set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n";
-      }
-      $idx ++;
-    }
-    %col_hash = ();
-    foreach $p (@col) { $col_hash{$p} = 1; }
-    foreach $p1 (@col) {
-      # Update %distinguished hash.
-      foreach $p2 (keys %nonsilence) {
-        if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not
-                                       # in this question (and in nonsilence
-                                       # phones)... mark p1,p2 as being split apart
-          $distinguished{$p1,$p2} = 1;
-          $distinguished{$p2,$p1} = 1;
-        }
-      }
-    }
-  }
-  close(EX);
-  $success == 0 || print "--> $dict/extra_questions.txt is OK\n";
-} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";}
-
-if (-f "$dict/nonterminals.txt") {
-  open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt";
-  my %nonterminals = ();
-  my $line_number = 1;
-  while (<NT>) {
-    chop;
-    my @line = split(" ", $_);
-    if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) {
-      print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1;
-    }
-    $nonterminals{$line[0]} = 1;
-    $line_number++;
-  }
-  print "--> $dict/nonterminals.txt is OK\n";
-}
-
-
-# check nonsilence_phones.txt again for phone-pairs that are never
-# distnguishable. (note: this situation is normal and expected for silence
-# phones, so we don't check it.)
-if(!open(NS, "<$dict/nonsilence_phones.txt")) {
-  print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1;
-}
-
-$num_warn_nosplit = 0;
-$num_warn_nosplit_limit = 10;
-while(<NS>) {
-  my @col = split(" ", $_);
-  foreach $p1 (@col) {
-    foreach $p2 (@col) {
-      if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) {
-        set_to_fail();
-        if ($num_warn_nosplit <= $num_warn_nosplit_limit) {
-          print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n";
-        }
-        if ($num_warn_nosplit == $num_warn_nosplit_limit) {
-          print "... Not warning any more times about this issue.\n";
-        }
-        if ($num_warn_nosplit == 0) {
-          print " (note: we started checking for this only recently.
You can still build a system but\n"; - print " phones $p1 and $p2 will be acoustically indistinguishable).\n"; - } - $num_warn_nosplit++; - } - } - } -} - - -if ($exit == 1) { - print "--> ERROR validating dictionary directory $dict (see detailed error "; - print "messages above)\n\n"; - exit 1; -} else { - print "--> SUCCESS [validating dictionary directory $dict]\n\n"; -} - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/validate_text.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/validate_text.pl deleted file mode 100644 index 7f75cf12f20f6e22948682e8e726e628a72dac69..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/validate_text.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env perl -# -#=============================================================================== -# Copyright 2017 Johns Hopkins University (author: Yenda Trmal ) -# Johns Hopkins University (author: Daniel Povey) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# validation script for data//text -# to be called (preferably) from utils/validate_data_dir.sh -use strict; -use warnings; -use utf8; -use Fcntl qw< SEEK_SET >; - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). 
-sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl \n" . 
- "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/wav2dur.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/wav2dur.py deleted file mode 100644 index 1bcc1b693458b66c0e341e5d6b375cc81e6db8b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/wav2dur.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -import torchaudio -torchaudio.set_audio_backend("sox_io") - -scp = sys.argv[1] -dur_scp = sys.argv[2] - -with open(scp, 'r') as f, open(dur_scp, 'w') as fout: - cnt = 0 - total_duration = 0 - for l in f: - items = l.strip().split() - wav_id = items[0] - fname = items[1] - cnt += 1 - waveform, rate = torchaudio.load(fname) - frames = len(waveform[0]) - duration = frames / float(rate) - total_duration += duration - fout.write('{} {}\n'.format(wav_id, duration)) - print('process {} utts'.format(cnt)) - print('total {} s'.format(total_duration)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/wav_to_duration.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/wav_to_duration.sh deleted file mode 100644 index 51b055c633ac809b6b8d702925dc47875973403d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/wav_to_duration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# split the wav scp, calculate duration and merge -nj=4 -. tools/parse_options.sh || exit 1; - -inscp=$1 -outscp=$2 -data=$(dirname ${inscp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -rm -f $logdir/wav_*.slice -rm -f $logdir/wav_*.shape -split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log -} & -done -wait -cat $logdir/wav_*.shape > $outscp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/websocket/performance-ws.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/websocket/performance-ws.py deleted file mode 100644 index af77dea06bb41297b674b5b6dbfd0266bcff5d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/tools/websocket/performance-ws.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -# coding:utf-8 - -# Copyright (c) 2022 SDCI Co. Ltd (author: veelion) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import json -import time -import asyncio -import argparse -import websockets -import soundfile as sf -import statistics - - -WS_START = json.dumps({ - 'signal': 'start', - 'nbest': 1, - 'continuous_decoding': False, -}) -WS_END = json.dumps({ - 'signal': 'end' -}) - - -async def ws_rec(data, ws_uri): - begin = time.time() - conn = await websockets.connect(ws_uri, ping_timeout=200) - # step 1: send start - await conn.send(WS_START) - ret = await conn.recv() - # step 2: send audio data - await conn.send(data) - # step 3: send end - await conn.send(WS_END) - # step 4: receive result - texts = [] - while 1: - ret = await conn.recv() - ret = json.loads(ret) - if ret['type'] == 'final_result': - nbest = json.loads(ret['nbest']) - text = nbest[0]['sentence'] - texts.append(text) - elif ret['type'] == 'speech_end': - break - # step 5: close - try: - await conn.close() - except Exception as e: - # this except has no effect, just log as debug - # it seems the server does not send close info, maybe - print(e) - time_cost = time.time() - begin - return { - 'text': ''.join(texts), - 'time': time_cost, - } - - -def get_args(): - parser = argparse.ArgumentParser(description='') - parser.add_argument( - '-u', '--ws_uri', required=True, - help="websocket_server_main's uri, e.g. ws://127.0.0.1:10086") - parser.add_argument( - '-w', '--wav_scp', required=True, - help='path to wav_scp_file') - parser.add_argument( - '-t', '--trans', required=True, - help='path to trans_text_file of wavs') - parser.add_argument( - '-s', '--save_to', required=True, - help='path to save transcription') - parser.add_argument( - '-n', '--num_concurrence', type=int, required=True, - help='num of concurrence for query') - args = parser.parse_args() - return args - - -def print_result(info): - length = max([len(k) for k in info]) - for k, v in info.items(): - print(f'\t{k: >{length}} : {v}') - - -async def main(args): - wav_scp = [] - total_duration = 0 - with open(args.wav_scp) as f: - for line in f: - zz = line.strip().split() - assert len(zz) == 2 - data, sr = sf.read(zz[1], dtype='int16') - assert sr == 16000 - duration = (len(data)) / 16000 - total_duration += duration - wav_scp.append((zz[0], data.tobytes())) - print(f'{len(wav_scp) = }, {total_duration = }') - - tasks = [] - failed = 0 - texts = [] - request_times = [] - begin = time.time() - for i, (_uttid, data) in enumerate(wav_scp): - task = asyncio.create_task(ws_rec(data, args.ws_uri)) - tasks.append((_uttid, task)) - if len(tasks) < args.num_concurrence: - continue - print((f'{i=}, start {args.num_concurrence} ' - f'queries @ {time.strftime("%m-%d %H:%M:%S")}')) - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - tasks = [] - print(f'\tdone @ {time.strftime("%m-%d %H:%M:%S")}') - if tasks: - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - request_time = time.time() - begin - rtf = request_time / total_duration - print('For all concurrence:') - print_result({ - 'failed': failed, - 'total_duration': total_duration, - 'request_time': request_time, - 'RTF': rtf, - }) - print('For one request:') - print_result({ - 'mean': statistics.mean(request_times), - 'median': statistics.median(request_times), - 'max_time': max(request_times), - 'min_time': min(request_times), - }) - with 
open(args.save_to, 'w', encoding='utf8') as fsave: - fsave.write(''.join(texts)) - # caculate CER - cmd = (f'python ../compute-wer.py --char=1 --v=1 ' - f'{args.trans} {args.save_to} > ' - f'{args.save_to}-test-{args.num_concurrence}.cer.txt') - print(cmd) - os.system(cmd) - print('done') - - -if __name__ == '__main__': - args = get_args() - asyncio.run(main(args)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/alignment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/alignment.py deleted file mode 100644 index 071691183e5af227e60fe06e4f8d4bf0f33b7f71..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/alignment.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Di Wu) -# 2022 Tinnove Inc (authors: Wei Ren) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader -from textgrid import TextGrid, IntervalTier - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.ctc_util import forced_align -from wenet.utils.common import get_subsample -from wenet.utils.init_model import init_model - - -def generator_textgrid(maxtime, lines, output): - # Download Praat: https://www.fon.hum.uva.nl/praat/ - interval = maxtime / (len(lines) + 1) - margin = 0.0001 - - tg = TextGrid(maxTime=maxtime) - linetier = IntervalTier(name="line", maxTime=maxtime) - - i = 0 - for l in lines: - s, e, w = l.split() - linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w) - - tg.append(linetier) - print("successfully generator {}".format(output)) - tg.write(output) - - -def get_frames_timestamp(alignment): - # convert alignment to a praat format, which is a doing phonetics - # by computer and helps analyzing alignment - timestamp = [] - # get frames level duration for each token - start = 0 - end = 0 - while end < len(alignment): - while end < len(alignment) and alignment[end] == 0: - end += 1 - if end == len(alignment): - timestamp[-1] += alignment[start:] - break - end += 1 - while end < len(alignment) and alignment[end - 1] == alignment[end]: - end += 1 - timestamp.append(alignment[start:end]) - start = end - return timestamp - - -def get_labformat(timestamp, subsample): - begin = 0 - duration = 0 - labformat = [] - for idx, t in enumerate(timestamp): - # 25ms frame_length,10ms hop_length, 1/subsample - subsample = get_subsample(configs) - # time duration - duration = len(t) * 0.01 * subsample - if idx < len(timestamp) - 1: - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[t[-1]])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[t[-1]])) - else: - non_blank = 
0 - for i in t: - if i != 0: - token = i - break - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[token])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[token])) - begin = begin + duration - return labformat - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='use ctc to generate alignment') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--input_file', required=True, help='format data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--non_lang_syms', - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--result_file', - required=True, - help='alignment result file') - parser.add_argument('--batch_size', type=int, default=1, help='batch size') - parser.add_argument('--gen_praat', - action='store_true', - help='convert alignment to a praat format') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - - args = parser.parse_args() - print(args) - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.batch_size > 1: - logging.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - # Load dict - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 - - symbol_table = read_symbol_table(args.dict) - - # Init dataset and data loader - ali_conf = copy.deepcopy(configs['dataset_conf']) - - ali_conf['filter_conf']['max_length'] = 102400 - ali_conf['filter_conf']['min_length'] = 0 - ali_conf['filter_conf']['token_max_length'] = 102400 - ali_conf['filter_conf']['token_min_length'] = 0 - ali_conf['filter_conf']['max_output_input_ratio'] = 102400 - ali_conf['filter_conf']['min_output_input_ratio'] = 0 - ali_conf['speed_perturb'] = False - ali_conf['spec_aug'] = False - ali_conf['shuffle'] = False - ali_conf['sort'] = False - ali_conf['fbank_conf']['dither'] = 0.0 - ali_conf['batch_conf']['batch_type'] = "static" - ali_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - ali_dataset = Dataset(args.data_type, - args.input_file, - symbol_table, - ali_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w', - encoding='utf-8') as fout: - for batch_idx, batch in enumerate(ali_data_loader): - print("#" * 80) - key, feat, target, feats_length, target_length = batch - print(key) - - feat = feat.to(device) - target = target.to(device) - feats_length = feats_length.to(device) - 
target_length = target_length.to(device) - # Let's assume B = batch_size and N = beam_size - # 1. Encoder - encoder_out, encoder_mask = model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - # print(ctc_probs.size(1)) - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = forced_align(ctc_probs, target) - print(alignment) - fout.write('{} {}\n'.format(key[0], alignment)) - - if args.gen_praat: - timestamp = get_frames_timestamp(alignment) - print(timestamp) - subsample = get_subsample(configs) - labformat = get_labformat(timestamp, subsample) - - lab_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".lab") - with open(lab_path, 'w', encoding='utf-8') as f: - f.writelines(labformat) - - textgrid_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".TextGrid") - generator_textgrid(maxtime=(len(alignment) + 1) * 0.01 * - subsample, - lines=labformat, - output=textgrid_path) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/average_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/average_model.py deleted file mode 100644 index 01efa64b4b458bc931a86a9a304b9f330ce4aaa2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/average_model.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import os -import argparse -import glob - -import yaml -import numpy as np -import torch - - -def get_args(): - parser = argparse.ArgumentParser(description='average model') - parser.add_argument('--dst_model', required=True, help='averaged model') - parser.add_argument('--src_path', - required=True, - help='src model path for average') - parser.add_argument('--val_best', - action="store_true", - help='averaged model') - parser.add_argument('--num', - default=5, - type=int, - help='nums for averaged model') - parser.add_argument('--min_epoch', - default=0, - type=int, - help='min epoch used for averaging model') - parser.add_argument('--max_epoch', - default=65536, - type=int, - help='max epoch used for averaging model') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - checkpoints = [] - val_scores = [] - if args.val_best: - yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) - for y in yamls: - with open(y, 'r') as f: - dic_yaml = yaml.load(f, Loader=yaml.FullLoader) - loss = dic_yaml['cv_loss'] - epoch = dic_yaml['epoch'] - if epoch >= args.min_epoch and epoch <= args.max_epoch: - val_scores += [[epoch, loss]] - val_scores = np.array(val_scores) - sort_idx = np.argsort(val_scores[:, -1]) - sorted_val_scores = val_scores[sort_idx][::1] - print("best val scores = " + str(sorted_val_scores[:args.num, 1])) - print("selected epochs = " + - str(sorted_val_scores[:args.num, 0].astype(np.int64))) - path_list = [ - args.src_path + '/{}.pt'.format(int(epoch)) - for epoch in sorted_val_scores[:args.num, 0] - ] - else: - path_list = glob.glob('{}/[0-9]*.pt'.format(args.src_path)) - path_list = sorted(path_list, key=os.path.getmtime) - path_list = path_list[-args.num:] - print(path_list) - avg = None - num = args.num - assert num == len(path_list) - for path in path_list: - print('Processing {}'.format(path)) - states = torch.load(path, map_location=torch.device('cpu')) - if avg is None: - avg = states - else: - for k in avg.keys(): - avg[k] += states[k] - # average - for k in avg.keys(): - if avg[k] is not None: - # pytorch 1.6 use true_divide instead of /= - avg[k] = torch.true_divide(avg[k], num) - print('Saving to {}'.format(args.dst_model)) - torch.save(avg, args.dst_model) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/export_jit.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/export_jit.py deleted file mode 100644 index b2e5864e8382235c1cc800484ba5031ae22f3bd9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/export_jit.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os - -import torch -import yaml - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_file', default=None, help='output file') - parser.add_argument('--output_quant_file', - default=None, - help='output quantized model file') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - # No need gpu for model export - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - model = init_model(configs) - print(model) - - load_checkpoint(model, args.checkpoint) - # Export jit torch script model - - if args.output_file: - script_model = torch.jit.script(model) - script_model.save(args.output_file) - print('Export model successfully, see {}'.format(args.output_file)) - - # Export quantized jit torch script model - if args.output_quant_file: - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - script_quant_model = torch.jit.script(quantized_model) - script_quant_model.save(args.output_quant_file) - print('Export quantized model successfully, ' - 'see {}'.format(args.output_quant_file)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/export_onnx_bpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/export_onnx_bpu.py deleted file mode 100644 index 6462a69506f10778d08faae5fcf3067ad43d38bd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/export_onnx_bpu.py +++ /dev/null @@ -1,1019 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. 
specific decoding method: ctc_greedy_search -""" - - -from __future__ import print_function - -import os -import sys -import copy -import math -import yaml -import logging -from typing import Tuple - -import torch -import numpy as np - -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import (get_args, to_numpy, - print_input_output_info) - - -try: - import onnx - import onnxruntime -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class BPULayerNorm(torch.nn.Module): - """Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" - def __init__(self, module, chunk_size=8, run_on_bpu=False): - super().__init__() - original = copy.deepcopy(module) - self.hidden = module.weight.size(0) - self.chunk_size = chunk_size - self.run_on_bpu = run_on_bpu - - if self.run_on_bpu: - self.weight = torch.nn.Parameter( - module.weight.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.bias = torch.nn.Parameter( - module.bias.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.negtive = torch.nn.Parameter( - torch.ones((1, self.hidden, 1, chunk_size)) * -1.0) - self.eps = torch.nn.Parameter( - torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps) - self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_1.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_2.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - else: - self.norm = module - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.hidden) - orig_out = module(random_data) - new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) - np.testing.assert_allclose( - to_numpy(orig_out), to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.run_on_bpu: - u = self.mean_conv_1(x) # (1, h, 1, c) - numerator = x + u * self.negtive # (1, h, 1, c) - s = torch.pow(numerator, 2) # (1, h, 1, c) - s = self.mean_conv_2(s) # (1, h, 1, c) - denominator = torch.sqrt(s + self.eps) # (1, h, 1, c) - x = torch.div(numerator, denominator) # (1, h, 1, c) - x = x * self.weight + self.bias - else: - x = x.squeeze(2).transpose(1, 2).contiguous() - x = self.norm(x) - x = x.transpose(1, 2).contiguous().unsqueeze(2) - return x - - -class BPUIdentity(torch.nn.Module): - """Refactor torch.nn.Identity(). - For inserting BPU node whose input == output. - """ - def __init__(self, channels): - super().__init__() - self.channels = channels - self.identity_conv = torch.nn.Conv2d( - channels, channels, 1, groups=channels, bias=False) - torch.nn.init.dirac_( - self.identity_conv.weight.data, groups=channels) - - self.check_equal() - - def check_equal(self): - random_data = torch.randn(1, self.channels, 1, 10) - result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(random_data), to_numpy(result), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Identity with 4-D dataflow, input == output. 
- Args: - x (torch.Tensor): (batch, in_channel, 1, time) - - Returns: - (torch.Tensor): (batch, in_channel, 1, time). - """ - return self.identity_conv(x) - - -class BPULinear(torch.nn.Module): - """Refactor torch.nn.Linear or pointwise_conv""" - def __init__(self, module, is_pointwise_conv=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.weight.size(1) - self.odim = module.weight.size(0) - self.is_pointwise_conv = is_pointwise_conv - - # Modify weight & bias - self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) - if is_pointwise_conv: - # (odim, idim, kernel=1) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(-1)) - else: - # (odim, idim) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(2).unsqueeze(3)) - self.linear.bias = module.bias - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.idim) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = module(random_data) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = original_result.transpose(1, 2) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Linear with 4-D dataflow. - Args: - x (torch.Tensor): (batch, in_channel, 1, time) - Returns: - (torch.Tensor): (batch, out_channel, 1, time). - """ - return self.linear(x) - - -class BPUGlobalCMVN(torch.nn.Module): - """Refactor wenet/transformer/cmvn.py::GlobalCMVN""" - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - self.norm_var = module.norm_var - - # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1) - self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """CMVN with 4-D dataflow. - Args: - x (torch.Tensor): (batch, 1, mel_dim, time) - Returns: - (torch.Tensor): normalized feature with same shape. - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x - - -class BPUConv2dSubsampling8(torch.nn.Module): - """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 - - NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.right_context = module.right_context - self.subsampling_rate = module.subsampling_rate - assert isinstance(module.pos_enc, NoPositionalEncoding) - - # 1. Modify self.conv - # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim) - # to (1, 1, mel_dim, frames) for more efficient computation. - self.conv = module.conv - for idx in [0, 2, 4]: - self.conv[idx].weight = torch.nn.Parameter( - module.conv[idx].weight.transpose(2, 3) - ) - - # 2. 
Modify self.linear - # NOTE(xcsong): Split final projection to meet the requirment of - # maximum kernel_size (7 for XJ3) - self.linear = torch.nn.ModuleList() - odim = module.linear.weight.size(0) # 512, in this case - freq = module.linear.weight.size(1) // odim # 4608 // 512 == 9 - self.odim, self.freq = odim, freq - weight = module.linear.weight.reshape( - odim, odim, freq, 1) # (odim, odim * freq) -> (odim, odim, freq, 1) - self.split_size = [] - num_split = (freq - 1) // 7 + 1 # XJ3 requires kernel_size <= 7 - slice_begin = 0 - for idx in range(num_split): - kernel_size = min(freq, (idx + 1) * 7) - idx * 7 - conv_ele = torch.nn.Conv2d( - odim, odim, (kernel_size, 1), (kernel_size, 1)) - conv_ele.weight = torch.nn.Parameter( - weight[:, :, slice_begin:slice_begin + kernel_size, :] - ) - conv_ele.bias = torch.nn.Parameter( - torch.zeros_like(conv_ele.bias) - ) - self.linear.append(conv_ele) - self.split_size.append(kernel_size) - slice_begin += kernel_size - self.linear[0].bias = torch.nn.Parameter(module.linear.bias) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 67, 80) - mask = torch.zeros(1, 1, 67) - original_result, _, _ = module(random_data, mask) # (1, 8, 512) - random_data = random_data.transpose(1, 2).unsqueeze(0) # (1, 1, 80, 67) - new_result = self.forward(random_data) # (1, 512, 1, 8) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Subsample x with 4-D dataflow. - Args: - x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), - where time' = time // 8. - """ - x = self.conv(x) # (1, odim, freq, time') - x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) - x = torch.split(x, self.split_size, dim=2) - for idx, (x_part, layer) in enumerate(zip(x, self.linear)): - x_out += layer(x_part) - return x_out - - -class BPUMultiHeadedAttention(torch.nn.Module): - """Refactor wenet/transformer/attention.py::MultiHeadedAttention - - NOTE(xcsong): Only support attention_class == MultiHeadedAttention, - we do not consider RelPositionMultiHeadedAttention currently. - """ - def __init__(self, module, chunk_size, left_chunks): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.d_k = module.d_k - self.h = module.h - n_feat = self.d_k * self.h - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.time = chunk_size * (left_chunks + 1) - self.activation = torch.nn.Softmax(dim=-1) - - # 1. Modify self.linear_x - self.linear_q = BPULinear(module.linear_q) - self.linear_k = BPULinear(module.linear_k) - self.linear_v = BPULinear(module.linear_v) - self.linear_out = BPULinear(module.linear_out) - # 2. 
denom - self.register_buffer( - "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k))) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) - mask = torch.ones((1, self.h, self.chunk_size, self.time), - dtype=torch.bool) - cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, - self.d_k * 2) - original_out, original_cache = module( - random_data, random_data, random_data, - mask[:, 0, :, :], torch.empty(0), cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.reshape(1, self.h, self.d_k * 2, - self.chunk_size * self.left_chunks) - new_out, new_cache = self.forward( - random_data, random_data, random_data, mask, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - - def forward( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - mask: torch.Tensor, cache: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. - - Args: - q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). - k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). - v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). - mask (torch.Tensor): Mask tensor, - (#batch, head, chunk_size, cache_t + chunk_size). - cache (torch.Tensor): Cache tensor - (1, head, d_k * 2, cache_t), - where `cache_t == chunk_size * left_chunks`. - - - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: Cache tensor - (1, head, d_k * 2, cache_t + chunk_size) - where `cache_t == chunk_size * left_chunks` - """ - # 1. Forward QKV - q = self.linear_q(q) # (1, d, 1, c) d == size, c == chunk_size - k = self.linear_k(k) # (1, d, 1, c) - v = self.linear_v(v) # (1, d, 1, c) - q = q.view(1, self.h, self.d_k, self.chunk_size) - k = k.view(1, self.h, self.d_k, self.chunk_size) - v = v.view(1, self.h, self.d_k, self.chunk_size) - q = q.transpose(2, 3) # (batch, head, time1, d_k) - k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) - k = torch.cat((k_cache, k), dim=3) - v = torch.cat((v_cache, v), dim=3) - new_cache = torch.cat((k, v), dim=2) - # 2. (Q^T)K - scores = torch.matmul(q, k) * self.denom # (#b, n_head, time1, time2) - # 3. Forward attention - mask = mask.eq(0) - scores = scores.masked_fill(mask, -float('inf')) - attn = self.activation(scores).masked_fill(mask, 0.0) - attn = attn.transpose(2, 3) - x = torch.matmul(v, attn) - x = x.view(1, self.d_k * self.h, 1, self.chunk_size) - x_out = self.linear_out(x) - return x_out, new_cache - - -class BPUConvolution(torch.nn.Module): - """Refactor wenet/transformer/convolution.py::ConvolutionModule - - NOTE(xcsong): Only suport use_layer_norm == False - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.lorder = module.lorder - self.use_layer_norm = False - self.activation = module.activation - channels = module.pointwise_conv1.weight.size(1) - self.channels = channels - kernel_size = module.depthwise_conv.weight.size(2) - assert module.use_layer_norm is False - - # 1. Modify self.pointwise_conv1 - self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) - - # 2. 
Modify self.depthwise_conv - self.depthwise_conv = torch.nn.Conv2d( - channels, channels, (1, kernel_size), - stride=1, groups=channels) - self.depthwise_conv.weight = torch.nn.Parameter( - module.depthwise_conv.weight.unsqueeze(-2)) - self.depthwise_conv.bias = torch.nn.Parameter( - module.depthwise_conv.bias) - - # 3. Modify self.norm, Only support batchnorm2d - self.norm = torch.nn.BatchNorm2d(channels) - self.norm.training = False - self.norm.num_features = module.norm.num_features - self.norm.eps = module.norm.eps - self.norm.momentum = module.norm.momentum - self.norm.weight = torch.nn.Parameter(module.norm.weight) - self.norm.bias = torch.nn.Parameter(module.norm.bias) - self.norm.running_mean = module.norm.running_mean - self.norm.running_var = module.norm.running_var - - # 4. Modify self.pointwise_conv2 - self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) - - # 5. Identity conv, for running `concat` on BPU - self.identity = BPUIdentity(channels) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.channels) - cache = torch.zeros((1, self.channels, self.lorder)) - original_out, original_cache = module(random_data, cache=cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.unsqueeze(2) - new_out, new_cache = self.forward(random_data, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, 1, cache_t). - Returns: - torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). - torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). - """ - # Concat cache - x = torch.cat((self.identity(cache), self.identity(x)), dim=3) - new_cache = x[:, :, :, -self.lorder:] - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, 1, dim) - x = torch.nn.functional.glu(x, dim=1) # (b, channel, 1, dim) - - # Depthwise Conv - x = self.depthwise_conv(x) - x = self.activation(self.norm(x)) - x = self.pointwise_conv2(x) - return x, new_cache - - -class BPUFFN(torch.nn.Module): - """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.activation = module.activation - - # 1. Modify self.w_x - self.w_1 = BPULinear(module.w_1) - self.w_2 = BPULinear(module.w_2) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.w_1.idim) - original_out = module(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_out = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward function. 
- - Args: - xs: input tensor (B, D, 1, L) - Returns: - output tensor, (B, D, 1, L) - """ - return self.w_2(self.activation(self.w_1(x))) - - -class BPUConformerEncoderLayer(torch.nn.Module): - """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.size = module.size - assert module.normalize_before is True - assert module.concat_after is False - - # 1. Modify submodules - self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) - self.self_attn = BPUMultiHeadedAttention( - module.self_attn, chunk_size, left_chunks) - self.conv_module = BPUConvolution(module.conv_module) - self.feed_forward = BPUFFN(module.feed_forward) - - # 2. Modify norms - self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) - self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, ln_run_on_bpu) - self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, - chunk_size, ln_run_on_bpu) - self.norm_conv = BPULayerNorm(module.norm_conv, - chunk_size, ln_run_on_bpu) - self.norm_final = BPULayerNorm(module.norm_final, - chunk_size, ln_run_on_bpu) - - # 3. 4-D ff_scale - self.register_buffer( - "ff_scale", torch.full((1, self.size, 1, 1), module.ff_scale)) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.self_attn.chunk_size - time2 = self.self_attn.time - h, d_k = self.self_attn.h, self.self_attn.d_k - random_x = torch.randn(1, time1, self.size) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) - original_x, _, original_att_cache, original_cnn_cache = module( - random_x, att_mask[:, 0, :, :], torch.empty(0), - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.transpose(1, 2).unsqueeze(2) - att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.unsqueeze(2) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_mask, att_cache, cnn_cache - ) - np.testing.assert_allclose( - to_numpy(original_att_cache), - to_numpy(new_att_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cnn_cache), - to_numpy(new_cnn_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, att_mask: torch.Tensor, - att_cache: torch.Tensor, cnn_cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, size, 1, chunk_size) - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, d_k * 2, cache_t1), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, 1, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: att_cache tensor, - (1, head, d_k * 2, cache_t1 + chunk_size). - torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2). - """ - # 1. ffn_macaron - residual = x - x = self.norm_ff_macron(x) - x = residual + self.ff_scale * self.feed_forward_macaron(x) - - # 2. 
attention - residual = x - x = self.norm_mha(x) - x_att, new_att_cache = self.self_attn( - x, x, x, att_mask, att_cache) - x = residual + x_att - - # 3. convolution - residual = x - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, cnn_cache) - x = residual + x - - # 4. ffn - residual = x - x = self.norm_ff(x) - x = residual + self.ff_scale * self.feed_forward(x) - - # 5. final post-norm - x = self.norm_final(x) - - return x, new_att_cache, new_cnn_cache - - -class BPUConformerEncoder(torch.nn.Module): - """Refactor wenet/transformer/encoder.py::ConformerEncoder - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - output_size = module.output_size() - self._output_size = module.output_size() - self.after_norm = module.after_norm - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.head = module.encoders[0].self_attn.h - self.layers = len(module.encoders) - - # 1. Modify submodules - self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) - self.embed = BPUConv2dSubsampling8(module.embed) - self.encoders = torch.nn.ModuleList() - for layer in module.encoders: - self.encoders.append(BPUConformerEncoderLayer( - layer, chunk_size, left_chunks, ln_run_on_bpu)) - - # 2. Auxiliary conv - self.identity_cnncache = BPUIdentity(output_size) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.encoders[0].self_attn.chunk_size - time2 = self.encoders[0].self_attn.time - layers = self.layers - h, d_k = self.head, self.encoders[0].self_attn.d_k - decoding_window = (self.chunk_size - 1) * \ - module.embed.subsampling_rate + \ - module.embed.right_context + 1 - lorder = self.encoders[0].conv_module.lorder - random_x = torch.randn(1, decoding_window, 80) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) - orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( - random_x, 0, time2 - time1, att_mask=att_mask[:, 0, :, :], - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.unsqueeze(0) - att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_cache, cnn_cache, att_mask - ) - caches = torch.split(new_att_cache, h, dim=1) - caches = [c.transpose(2, 3) for c in caches] - np.testing.assert_allclose( - to_numpy(orig_att_cache), - to_numpy(torch.cat(caches, dim=0)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_cnn_cache), - to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, xs: torch.Tensor, att_cache: torch.Tensor, - cnn_cache: torch.Tensor, att_mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (1, head * elayers, d_k * 2, cache_t1), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (1, hidden-dim, elayers, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, hidden-dim, 1, chunk_size). - torch.Tensor: new attention cache required for next chunk, with - same shape as the original att_cache. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - """ - # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time) - xs = xs.transpose(2, 3) - xs = self.global_cmvn(xs) - # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size) - xs = self.embed(xs) - - att_cache = torch.split(att_cache, self.head, dim=1) - cnn_cache = self.identity_cnncache(cnn_cache) - cnn_cache = torch.split(cnn_cache, 1, dim=2) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - xs, new_att_cache, new_cnn_cache = layer( - xs, att_mask, att_cache=att_cache[i], cnn_cache=cnn_cache[i]) - r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:]) - r_cnn_cache.append(new_cnn_cache) - r_att_cache = torch.cat(r_att_cache, dim=1) - r_cnn_cache = self.identity_cnncache( - torch.cat(r_cnn_cache, dim=2)) - - xs = xs.squeeze(2).transpose(1, 2).contiguous() - xs = self.after_norm(xs) - # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input. - xs = xs.transpose(1, 2).contiguous().unsqueeze(2) # (B, C, 1, T) - - return (xs, r_att_cache, r_cnn_cache) - - -class BPUCTC(torch.nn.Module): - """Refactor wenet/transformer/ctc.py::CTC - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.ctc_lo.weight.size(1) - num_class = module.ctc_lo.weight.size(0) - - # 1. Modify self.ctc_lo, Split final projection to meet the - # requirment of maximum in/out channels (2048 for XJ3) - self.ctc_lo = torch.nn.ModuleList() - self.split_size = [] - num_split = (num_class - 1) // 2048 + 1 - for idx in range(num_split): - out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 - conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) - self.ctc_lo.append(conv_ele) - self.split_size.append(out_channel) - orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) - orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) - for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): - w = w.unsqueeze(2).unsqueeze(3) - self.ctc_lo[i].weight = torch.nn.Parameter(w) - self.ctc_lo[i].bias = torch.nn.Parameter(b) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 100, self.idim) - original_result = module.ctc_lo(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """frame activations, without softmax. 
- - Args: - Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) - Returns: - torch.Tensor: (B, num_class, 1, chunk_size) - """ - out = [] - for i, layer in enumerate(self.ctc_lo): - out.append(layer(x)) - out = torch.cat(out, dim=1) - return out - - -def export_encoder(asr_model, args): - logger.info("Stage-1: export encoder") - decode_window, mel_dim = args.decoding_window, args.feature_size - encoder = BPUConformerEncoder( - asr_model.encoder, args.chunk_size, args.num_decoding_left_chunks, - args.ln_run_on_bpu) - encoder.eval() - encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx') - - logger.info("Stage-1.1: prepare inputs for encoder") - chunk = torch.randn((1, 1, decode_window, mel_dim)) - required_cache_size = encoder.chunk_size * encoder.left_chunks - kv_time = required_cache_size + encoder.chunk_size - hidden, layers = encoder._output_size, len(encoder.encoders) - head = encoder.encoders[0].self_attn.h - d_k = hidden // head - lorder = encoder.encoders[0].conv_module.lorder - att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) - att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros((1, hidden, layers, lorder)) - inputs = (chunk, att_cache, cnn_cache, att_mask) - logger.info("chunk.size(): {} att_cache.size(): {} " - "cnn_cache.size(): {} att_mask.size(): {}".format( - list(chunk.size()), list(att_cache.size()), - list(cnn_cache.size()), list(att_mask.size()))) - - logger.info("Stage-1.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask" - attributes['output_name'] = "output;r_att_cache;r_cnn_cache" - attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap" - attributes['norm_type'] = \ - "no_preprocess;no_preprocess;no_preprocess;no_preprocess" - attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_shape'] = \ - "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( - chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3), - att_cache.size(0), att_cache.size(1), att_cache.size(2), - att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1), - cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0), - att_mask.size(1), att_mask.size(2), att_mask.size(3) - ) - torch.onnx.export( # NOTE(xcsong): only support opset==11 - encoder, inputs, encoder_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=attributes['input_name'].split(';'), - output_names=attributes['output_name'].split(';'), - dynamic_axes=None, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for k in vars(args): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - logger.info('Export onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - logger.info("Stage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - for i in range(10): - logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" - ", att_mask: {}".format( - i, list(torch_chunk.size()), - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), - list(torch_att_mask.size()))) - torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_output = torch.cat(torch_output, dim=-1) - - onnx_output = [] - onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - logger.info("onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," - " att_mask: {}".format( - i, onnx_chunk.shape, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': onnx_att_mask, - } - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_output = np.concatenate(onnx_output, axis=-1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_encoder, pass!") - return encoder, ort_session - - -def export_ctc(asr_model, args): - logger.info("Stage-2: export ctc") - ctc = BPUCTC(asr_model.ctc).eval() - ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx') - - logger.info("Stage-2.1: prepare inputs for ctc") - hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) - - logger.info("Stage-2.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'], attributes['input_type'] = "hidden", "featuremap" - attributes['norm_type'] = "no_preprocess" - attributes['input_layout_train'] = "NCHW" - attributes['input_layout_rt'] = "NCHW" - attributes['input_shape'] = "{}x{}x{}x{}".format( - hidden.size(0), hidden.size(1), hidden.size(2), hidden.size(3), - ) - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=None, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for k in vars(args): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - logger.info('Export onnx_ctc, done! 
see {}'.format(ctc_outpath)) - - logger.info("Stage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_ctc, pass!") - return ctc, ort_session - - -def export_decoder(asr_model, args): - logger.info("Currently, Decoder is not supported.") - - -if __name__ == '__main__': - torch.manual_seed(777) - args = get_args() - args.ln_run_on_bpu = False - # NOTE(xcsong): XJ3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - args.feature_size = configs['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - export_encoder(model, args) - export_ctc(model, args) - export_decoder(model, args) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/export_onnx_cpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/export_onnx_cpu.py deleted file mode 100644 index a8009d2f606f753a5870eb754235d8d55e756b5d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/export_onnx_cpu.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright (c) 2022, Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os -import copy -import sys - -import torch -import yaml -import numpy as np - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - -try: - import onnx - import onnxruntime - from onnxruntime.quantization import quantize_dynamic, QuantType -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - args = parser.parse_args() - return args - - -def to_numpy(tensor): - if tensor.requires_grad: - return tensor.detach().cpu().numpy() - else: - return tensor.cpu().numpy() - - -def print_input_output_info(onnx_model, name, prefix="\t\t"): - input_names = [node.name for node in onnx_model.graph.input] - input_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.input] - output_names = [node.name for node in onnx_model.graph.output] - output_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.output] - print("{}{} inputs : {}".format(prefix, name, input_names)) - print("{}{} input shapes : {}".format(prefix, name, input_shapes)) - print("{}{} outputs: {}".format(prefix, name, output_names)) - print("{}{} output shapes : {}".format(prefix, name, output_shapes)) - - -def export_encoder(asr_model, args): - print("Stage-1: export encoder") - encoder = asr_model.encoder - encoder.forward = encoder.forward_chunk - encoder_outpath = os.path.join(args['output_dir'], 'encoder.onnx') - - print("\tStage-1.1: prepare inputs for encoder") - chunk = torch.randn( - (args['batch'], args['decoding_window'], args['feature_size'])) - offset = 0 - # NOTE(xcsong): The uncertainty of `next_cache_start` only appears - # in the first few chunks, this is caused by dynamic att_cache shape, i,e - # (0, 0, 0, 0) for 1st chunk and (elayers, head, ?, d_k*2) for subsequent - # chunks. One way to ease the ONNX export is to keep `next_cache_start` - # as a fixed value. To do this, for the **first** chunk, if - # left_chunks > 0, we feed real cache & real mask to the model, otherwise - # fake cache & fake mask. In this way, we get: - # 1. 16/-1 mode: next_cache_start == 0 for all chunks - # 2. 16/4 mode: next_cache_start == chunk_size for all chunks - # 3. 16/0 mode: next_cache_start == chunk_size for all chunks - # 4. -1/-1 mode: next_cache_start == 0 for all chunks - # NO MORE DYNAMIC CHANGES!! - # - # NOTE(Mddct): We retain the current design for the convenience of supporting some - # inference frameworks without dynamic shapes. 
If you're interested in all-in-one - # model that supports different chunks please see: - # https://github.com/wenet-e2e/wenet/pull/1174 - - if args['left_chunks'] > 0: # 16/4 - required_cache_size = args['chunk_size'] * args['left_chunks'] - offset = required_cache_size - # Real cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], required_cache_size, - args['output_size'] // args['head'] * 2)) - # Real mask - att_mask = torch.ones( - (args['batch'], 1, required_cache_size + args['chunk_size']), - dtype=torch.bool) - att_mask[:, :, :required_cache_size] = 0 - elif args['left_chunks'] <= 0: # 16/-1, -1/-1, 16/0 - required_cache_size = -1 if args['left_chunks'] < 0 else 0 - # Fake cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], 0, - args['output_size'] // args['head'] * 2)) - # Fake mask - att_mask = torch.ones((0, 0, 0), dtype=torch.bool) - cnn_cache = torch.zeros( - (args['num_blocks'], args['batch'], - args['output_size'], args['cnn_module_kernel'] - 1)) - inputs = (chunk, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - print("\t\tchunk.size(): {}\n".format(chunk.size()), - "\t\toffset: {}\n".format(offset), - "\t\trequired_cache: {}\n".format(required_cache_size), - "\t\tatt_cache.size(): {}\n".format(att_cache.size()), - "\t\tcnn_cache.size(): {}\n".format(cnn_cache.size()), - "\t\tatt_mask.size(): {}\n".format(att_mask.size())) - - print("\tStage-1.2: torch.onnx.export") - dynamic_axes = { - 'chunk': {1: 'T'}, - 'att_cache': {2: 'T_CACHE'}, - 'att_mask': {2: 'T_ADD_T_CACHE'}, - 'output': {1: 'T'}, - 'r_att_cache': {2: 'T_CACHE'}, - } - # NOTE(xcsong): We keep dynamic axes even if in 16/4 mode, this is - # to avoid padding the last chunk (which usually contains less - # frames than required). For users who want static axes, just pop - # out specific axis. - # if args['chunk_size'] > 0: # 16/4, 16/-1, 16/0 - # dynamic_axes.pop('chunk') - # dynamic_axes.pop('output') - # if args['left_chunks'] >= 0: # 16/4, 16/0 - # # NOTE(xsong): since we feed real cache & real mask into the - # # model when left_chunks > 0, the shape of cache will never - # # be changed. - # dynamic_axes.pop('att_cache') - # dynamic_axes.pop('r_att_cache') - torch.onnx.export( - encoder, inputs, encoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=[ - 'chunk', 'offset', 'required_cache_size', - 'att_cache', 'cnn_cache', 'att_mask' - ], - output_names=['output', 'r_att_cache', 'r_cnn_cache'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for (k, v) in args.items(): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - # NOTE(xcsong): to add those metadatas we need to reopen - # the file and resave it. - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - # Dynamic quantization - model_fp32 = encoder_outpath - model_quant = os.path.join(args['output_dir'], 'encoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - print("\tStage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk = copy.deepcopy(chunk) - torch_offset = copy.deepcopy(offset) - torch_required_cache_size = copy.deepcopy(required_cache_size) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - torch_att_mask = copy.deepcopy(att_mask) - for i in range(10): - print("\t\ttorch chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, list(torch_chunk.size()), torch_offset, - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), list(torch_att_mask.size()))) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - torch_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_offset, torch_required_cache_size, - torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_offset += out.size(1) - torch_output = torch.cat(torch_output, dim=1) - - onnx_output = [] - onnx_chunk = to_numpy(chunk) - onnx_offset = np.array((offset)).astype(np.int64) - onnx_required_cache_size = np.array((required_cache_size)).astype(np.int64) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - onnx_att_mask = to_numpy(att_mask) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - print("\t\tonnx chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, onnx_chunk.shape, onnx_offset, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - onnx_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'offset': onnx_offset, - 'required_cache_size': onnx_required_cache_size, - 'att_cache': onnx_att_cache, 'cnn_cache': onnx_cnn_cache, - 'att_mask': onnx_att_mask - } - # NOTE(xcsong): If we use 16/-1, -1/-1 or 16/0 mode, `next_cache_start` - # will be hardcoded to 0 or chunk_size by ONNX, thus - # required_cache_size and att_mask are no more needed and they will - # be removed by ONNX automatically. 
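
The check above drives the exported encoder chunk by chunk, feeding each returned `att_cache`/`cnn_cache` back into the next ONNX Runtime call before comparing against the PyTorch outputs (the loop just below additionally drops any inputs the exporter folded away). A minimal, self-contained sketch of that feedback pattern, using a toy stateful module rather than the WeNet encoder; every name, path, and shape here is an illustrative placeholder:

```python
# Toy illustration (not this repo's model): export a cached module to ONNX,
# then drive it chunk by chunk from onnxruntime, feeding the returned cache
# back in and comparing against PyTorch each step.
import numpy as np
import torch
import onnxruntime


class ToyStreamingEncoder(torch.nn.Module):
    """Keeps a fixed-size cache of the last `cache_len` frames."""
    def __init__(self, dim=8, cache_len=4):
        super().__init__()
        self.proj = torch.nn.Linear(dim, dim)
        self.cache_len = cache_len

    def forward(self, chunk, cache):
        # chunk: (1, T, dim), cache: (1, cache_len, dim)
        ctx = torch.cat((cache, chunk), dim=1)
        out = self.proj(ctx)[:, -chunk.size(1):, :]
        new_cache = ctx[:, -self.cache_len:, :]
        return out, new_cache


model = ToyStreamingEncoder().eval()
chunk = torch.randn(1, 6, 8)
cache = torch.zeros(1, 4, 8)
torch.onnx.export(model, (chunk, cache), "toy_encoder.onnx", opset_version=13,
                  input_names=["chunk", "cache"], output_names=["out", "new_cache"])

sess = onnxruntime.InferenceSession("toy_encoder.onnx")
onnx_cache = np.zeros((1, 4, 8), dtype=np.float32)
torch_cache = torch.zeros(1, 4, 8)
for i in range(5):
    x = torch.randn(1, 6, 8)
    torch_out, torch_cache = model(x, torch_cache)
    onnx_out, onnx_cache = sess.run(None, {"chunk": x.numpy(), "cache": onnx_cache})
    np.testing.assert_allclose(torch_out.detach().numpy(), onnx_out,
                               rtol=1e-3, atol=1e-5)
print("chunk-by-chunk check passed")
```

A cache-handling mistake typically only shows up after the first chunk, which is why several iterations are compared rather than a single forward pass.
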
- for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_offset += ort_outs[0].shape[1] - onnx_output = np.concatenate(onnx_output, axis=1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-05) - meta = ort_session.get_modelmeta() - print("\t\tcustom_metadata_map={}".format(meta.custom_metadata_map)) - print("\t\tCheck onnx_encoder, pass!") - - -def export_ctc(asr_model, args): - print("Stage-2: export ctc") - ctc = asr_model.ctc - ctc.forward = ctc.log_softmax - ctc_outpath = os.path.join(args['output_dir'], 'ctc.onnx') - - print("\tStage-2.1: prepare inputs for ctc") - hidden = torch.randn( - (args['batch'], args['chunk_size'] if args['chunk_size'] > 0 else 16, - args['output_size'])) - - print("\tStage-2.2: torch.onnx.export") - dynamic_axes = {'hidden': {1: 'T'}, 'probs': {1: 'T'}} - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for (k, v) in args.items(): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - # Dynamic quantization - model_fp32 = ctc_outpath - model_quant = os.path.join(args['output_dir'], 'ctc.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_ctc, done! see {}'.format(ctc_outpath)) - - print("\tStage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_ctc, pass!") - - -def export_decoder(asr_model, args): - print("Stage-3: export decoder") - decoder = asr_model - # NOTE(lzhin): parameters of encoder will be automatically removed - # since they are not used during rescoring. - decoder.forward = decoder.forward_attention_decoder - decoder_outpath = os.path.join(args['output_dir'], 'decoder.onnx') - - print("\tStage-3.1: prepare inputs for decoder") - # hardcode time->200 nbest->10 len->20, they are dynamic axes. 
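
Both the CTC and decoder exports above rely on `dynamic_axes` so that sequence length (and, for the decoder, the n-best size) can change at inference time even though the dummy tensors traced here have fixed shapes. A small stand-alone sketch of that mechanism with a throwaway linear layer; the file name and dimensions are placeholders, not values from this script:

```python
# Sketch: export with a symbolic time axis, then run the same graph at two lengths.
import torch
import onnxruntime

layer = torch.nn.Linear(80, 32).eval()
dummy = torch.randn(1, 10, 80)  # traced at length 10, but axis 1 is declared dynamic
torch.onnx.export(layer, dummy, "toy_ctc.onnx", opset_version=13,
                  input_names=["hidden"], output_names=["probs"],
                  dynamic_axes={"hidden": {1: "T"}, "probs": {1: "T"}})

sess = onnxruntime.InferenceSession("toy_ctc.onnx")
for t in (10, 37):  # both lengths are accepted because axis 1 is symbolic
    out = sess.run(None, {"hidden": torch.randn(1, t, 80).numpy()})[0]
    print(out.shape)  # (1, 10, 32) then (1, 37, 32)
```
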
- encoder_out = torch.randn((1, 200, args['output_size'])) - hyps = torch.randint(low=0, high=args['vocab_size'], - size=[10, 20]) - hyps[:, 0] = args['vocab_size'] - 1 # - hyps_lens = torch.randint(low=15, high=21, size=[10]) - - print("\tStage-3.2: torch.onnx.export") - dynamic_axes = { - 'hyps': {0: 'NBEST', 1: 'L'}, 'hyps_lens': {0: 'NBEST'}, - 'encoder_out': {1: 'T'}, - 'score': {0: 'NBEST', 1: 'L'}, 'r_score': {0: 'NBEST', 1: 'L'} - } - inputs = (hyps, hyps_lens, encoder_out, args['reverse_weight']) - torch.onnx.export( - decoder, inputs, decoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hyps', 'hyps_lens', 'encoder_out', 'reverse_weight'], - output_names=['score', 'r_score'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_decoder = onnx.load(decoder_outpath) - for (k, v) in args.items(): - meta = onnx_decoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_decoder) - onnx.helper.printable_graph(onnx_decoder.graph) - onnx.save(onnx_decoder, decoder_outpath) - print_input_output_info(onnx_decoder, "onnx_decoder") - model_fp32 = decoder_outpath - model_quant = os.path.join(args['output_dir'], 'decoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_decoder, done! see {}'.format( - decoder_outpath)) - - print("\tStage-3.3: check onnx_decoder and torch_decoder") - torch_score, torch_r_score = decoder( - hyps, hyps_lens, encoder_out, args['reverse_weight']) - ort_session = onnxruntime.InferenceSession(decoder_outpath) - input_names = [node.name for node in onnx_decoder.graph.input] - ort_inputs = { - 'hyps': to_numpy(hyps), - 'hyps_lens': to_numpy(hyps_lens), - 'encoder_out': to_numpy(encoder_out), - 'reverse_weight': np.array((args['reverse_weight'])), - } - for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - onnx_output = ort_session.run(None, ort_inputs) - - np.testing.assert_allclose(to_numpy(torch_score), onnx_output[0], - rtol=1e-03, atol=1e-05) - if args['is_bidirectional_decoder'] and args['reverse_weight'] > 0.0: - np.testing.assert_allclose(to_numpy(torch_r_score), onnx_output[1], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_decoder, pass!") - - -def main(): - torch.manual_seed(777) - args = get_args() - output_dir = args.output_dir - os.system("mkdir -p " + output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - arguments = {} - arguments['output_dir'] = output_dir - arguments['batch'] = 1 - arguments['chunk_size'] = args.chunk_size - arguments['left_chunks'] = args.num_decoding_left_chunks - arguments['reverse_weight'] = args.reverse_weight - arguments['output_size'] = configs['encoder_conf']['output_size'] - arguments['num_blocks'] = configs['encoder_conf']['num_blocks'] - arguments['cnn_module_kernel'] = configs['encoder_conf'].get('cnn_module_kernel', 1) - arguments['head'] = configs['encoder_conf']['attention_heads'] - arguments['feature_size'] = configs['input_dim'] - arguments['vocab_size'] = configs['output_dim'] - # NOTE(xcsong): if chunk_size == -1, hardcode to 67 - arguments['decoding_window'] = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 if args.chunk_size > 0 else 67 - arguments['encoder'] = configs['encoder'] - 
arguments['decoder'] = configs['decoder'] - arguments['subsampling_rate'] = model.subsampling_rate() - arguments['right_context'] = model.right_context() - arguments['sos_symbol'] = model.sos_symbol() - arguments['eos_symbol'] = model.eos_symbol() - arguments['is_bidirectional_decoder'] = 1 \ - if model.is_bidirectional_decoder() else 0 - - # NOTE(xcsong): Please note that -1/-1 means non-streaming model! It is - # not a [16/4 16/-1 16/0] all-in-one model and it should not be used in - # streaming mode (i.e., setting chunk_size=16 in `decoder_main`). If you - # want to use 16/-1 or any other streaming mode in `decoder_main`, - # please export onnx in the same config. - if arguments['left_chunks'] > 0: - assert arguments['chunk_size'] > 0 # -1/4 not supported - - export_encoder(model, arguments) - export_ctc(model, arguments) - export_decoder(model, arguments) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/export_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/export_onnx_gpu.py deleted file mode 100644 index 19f810c2804efdf74ff369f780fa3102e2e389fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/export_onnx_gpu.py +++ /dev/null @@ -1,1056 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import os -import sys - -import torch -import yaml -import logging - -import torch.nn.functional as F -from wenet.utils.checkpoint import load_checkpoint -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import BaseEncoder -from wenet.utils.init_model import init_model -from wenet.utils.mask import make_pad_mask - -try: - import onnxruntime -except ImportError: - print('Please install onnxruntime-gpu!') - sys.exit(1) - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class Encoder(torch.nn.Module): - def __init__(self, - encoder: BaseEncoder, - ctc: CTC, - beam_size: int = 10): - super().__init__() - self.encoder = encoder - self.ctc = ctc - self.beam_size = beam_size - - def forward(self, speech: torch.Tensor, - speech_lengths: torch.Tensor,): - """Encoder - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - Returns: - encoder_out: B x T x F - encoder_out_lens: B - ctc_log_probs: B x T x V - beam_log_probs: B x T x beam_size - beam_log_probs_idx: B x T x beam_size - """ - encoder_out, encoder_mask = self.encoder(speech, - speech_lengths, - -1, -1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_log_probs = self.ctc.log_softmax(encoder_out) - encoder_out_lens = encoder_out_lens.int() - beam_log_probs, beam_log_probs_idx = torch.topk( - ctc_log_probs, self.beam_size, dim=2) - return encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx - - -class StreamingEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size, transformer=False): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.transformer = transformer - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoder.encoders): - xs, _, new_att_cache, new_cnn_cache = layer( - xs, masks, pos_emb, - att_cache=att_cache[i], - cnn_cache=cnn_cache[i]) - # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (B, hidden-dim, cache_t2) - r_att_cache.append( - new_att_cache[:, :, next_cache_start:, :].unsqueeze(1)) - if not self.transformer: - r_cnn_cache.append(new_cnn_cache.unsqueeze(1)) - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - if not self.transformer: - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingSqueezeformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.reduce_idx = model.encoder.reduce_idx - self.recover_idx = model.encoder.recover_idx - if self.reduce_idx is None: - self.time_reduce = None - else: - if self.recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - 
recover_exp)) - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - elayers, cache_size = att_cache.size(0), att_cache.size(3) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = att_mask[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.encoder.preln(xs) - for i, layer in enumerate(self.encoder.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append( - (xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.encoder.time_reduction_layer( - xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - if self.encoder.pos_enc_layer_type == "rel_pos_repaired": - pos_emb = 
pos_emb[:, :xs.size(1) * 2 - 1, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.encoder.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(1) - cached_att = cached_att.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1)) - r_cnn_cache.append(cached_cnn) - - chunk_out = xs - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingEfficientConformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - - # Efficient Conformer - self.stride_layer_idx = model.encoder.stride_layer_idx - self.stride = model.encoder.stride - self.num_blocks = model.encoder.num_blocks - self.cnn_module_kernel = model.encoder.cnn_module_kernel - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - chunk_lens (torch.Tensor): - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * 
num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) # (b, ) - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - # Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2) - # Shape(cnn_cache): (elayers, b, outsize, cnn_kernel) - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = chunk_mask.to(torch.bool) - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoder.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - # We propose to double the chunk_size. 
- att_cache_trunc = xs.size(1) + \ - att_cache.size(3) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i][:, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [batch, 1, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID - - # use repeat_interleave to new_att_cache - # new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - new_att_cache = new_att_cache.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :].unsqueeze(1)) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - # shape of r_att_cache: (b, elayers, head, time2, outdim) - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - # shape of r_cnn_cache: (b, elayers, outdim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - chunk_out_lens += 1 - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class Decoder(torch.nn.Module): - def __init__(self, - decoder: TransformerDecoder, - ctc_weight: float = 0.5, - reverse_weight: float = 0.0, - beam_size: int = 10, - decoder_fastertransformer: bool = False): - super().__init__() - self.decoder = decoder - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - self.beam_size = beam_size - self.decoder_fastertransformer = decoder_fastertransformer - - def forward(self, - encoder_out: torch.Tensor, - encoder_lens: torch.Tensor, - hyps_pad_sos_eos: torch.Tensor, - hyps_lens_sos: torch.Tensor, - r_hyps_pad_sos_eos: torch.Tensor, - ctc_score: torch.Tensor): - """Encoder - Args: - encoder_out: B x T x F - encoder_lens: B - hyps_pad_sos_eos: B x beam x (T2+1), - hyps with sos & eos and padded by ignore id - hyps_lens_sos: B x beam, length for each hyp with sos - r_hyps_pad_sos_eos: B 
x beam x (T2+1), - reversed hyps with sos & eos and padded by ignore id - ctc_score: B x beam, ctc score for each hyp - Returns: - decoder_out: B x beam x T2 x V - r_decoder_out: B x beam x T2 x V - best_index: B - """ - B, T, F = encoder_out.shape - bz = self.beam_size - B2 = B * bz - encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F) - encoder_mask = ~make_pad_mask(encoder_lens, T).unsqueeze(1) - encoder_mask = encoder_mask.repeat(1, bz, 1).view(B2, 1, T) - T2 = hyps_pad_sos_eos.shape[2] - 1 - hyps_pad = hyps_pad_sos_eos.view(B2, T2 + 1) - hyps_lens = hyps_lens_sos.view(B2,) - hyps_pad_sos = hyps_pad[:, :-1].contiguous() - hyps_pad_eos = hyps_pad[:, 1:].contiguous() - - r_hyps_pad = r_hyps_pad_sos_eos.view(B2, T2 + 1) - r_hyps_pad_sos = r_hyps_pad[:, :-1].contiguous() - r_hyps_pad_eos = r_hyps_pad[:, 1:].contiguous() - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad_sos, hyps_lens, r_hyps_pad_sos, - self.reverse_weight) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - V = decoder_out.shape[-1] - decoder_out = decoder_out.view(B2, T2, V) - mask = ~make_pad_mask(hyps_lens, T2) # B2 x T2 - # mask index, remove ignore id - index = torch.unsqueeze(hyps_pad_eos * mask, 2) - score = decoder_out.gather(2, index).squeeze(2) # B2 X T2 - # mask padded part - score = score * mask - decoder_out = decoder_out.view(B, bz, T2, V) - if self.reverse_weight > 0: - r_decoder_out = torch.nn.functional.log_softmax( - r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.view(B2, T2, V) - index = torch.unsqueeze(r_hyps_pad_eos * mask, 2) - r_score = r_decoder_out.gather(2, index).squeeze(2) - r_score = r_score * mask - score = score * (1 - self.reverse_weight) + \ - self.reverse_weight * r_score - r_decoder_out = r_decoder_out.view(B, bz, T2, V) - score = torch.sum(score, axis=1) # B2 - score = torch.reshape(score, (B, bz)) + self.ctc_weight * ctc_score - best_index = torch.argmax(score, dim=1) - if self.decoder_fastertransformer: - return decoder_out, best_index - else: - return best_index - - -def to_numpy(tensors): - out = [] - if type(tensors) == torch.tensor: - tensors = [tensors] - for tensor in tensors: - if tensor.requires_grad: - tensor = tensor.detach().cpu().numpy() - else: - tensor = tensor.cpu().numpy() - out.append(tensor) - return out - - -def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True): - for a, b in zip(xlist, blist): - try: - torch.testing.assert_allclose(a, b, rtol=rtol, atol=atol) - except AssertionError as error: - if tolerate_small_mismatch: - print(error) - else: - raise - - -def export_offline_encoder(model, configs, args, logger, encoder_onnx_path): - bz = 32 - seq_len = 100 - beam_size = args.beam_size - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint( - low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - dynamic_axes={ - 'speech': {0: 'B', 1: 'T'}, - 'speech_lengths': {0: 'B'}, - 'encoder_out': {0: 'B', 1: 'T_OUT'}, - 'encoder_out_lens': {0: 'B'}, - 'ctc_log_probs': {0: 'B', 1: 'T_OUT'}, - 'beam_log_probs': {0: 'B', 1: 
'T_OUT'}, - 'beam_log_probs_idx': {0: 'B', 1: 'T_OUT'}, - }, - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - -def export_offline_encoder_static(model, configs, args, logger, encoder_onnx_path): - bz = args.batch_size - seq_len = args.seq_len - beam_size = args.beam_size - - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint(low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - import os - file_name, file_ext = os.path.splitext(encoder_onnx_path) - encoder_onnx_path = file_name + "_bs" + str(bz) + "_seq" + str(seq_len) + "_static.onnx" - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CPUExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - - -def export_online_encoder(model, configs, args, logger, encoder_onnx_path): - decoding_chunk_size = args.decoding_chunk_size - subsampling = model.encoder.embed.subsampling_rate - context = model.encoder.embed.right_context + 1 - decoding_window = (decoding_chunk_size - 1) * subsampling + context - batch_size = 32 - audio_len = decoding_window - feature_size = configs["input_dim"] - output_size = configs["encoder_conf"]["output_size"] - num_layers = configs["encoder_conf"]["num_blocks"] - # in transformer the cnn module will not be available - transformer = False - cnn_module_kernel = configs["encoder_conf"].get("cnn_module_kernel", 1) - 1 - if not cnn_module_kernel: - transformer = True - num_decoding_left_chunks = args.num_decoding_left_chunks - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if configs['encoder'] == 'squeezeformer': - encoder = StreamingSqueezeformerEncoder( - model, required_cache_size, args.beam_size) - elif configs['encoder'] == 'efficientConformer': - encoder = StreamingEfficientConformerEncoder( - model, required_cache_size, args.beam_size) - else: - encoder = StreamingEncoder( - model, required_cache_size, args.beam_size, transformer) - encoder.eval() - - # begin to export encoder - chunk_xs = 
torch.randn(batch_size, audio_len, - feature_size, dtype=torch.float32) - chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len - - offset = torch.arange(0, batch_size).unsqueeze(1) - # (elayers, b, head, cache_t1, d_k * 2) - head = configs["encoder_conf"]["attention_heads"] - d_k = configs["encoder_conf"]["output_size"] // head - att_cache = torch.randn(batch_size, num_layers, head, - required_cache_size, d_k * 2, - dtype=torch.float32) - cnn_cache = torch.randn(batch_size, num_layers, output_size, - cnn_module_kernel, dtype=torch.float32) - - cache_mask = torch.ones( - batch_size, 1, required_cache_size, dtype=torch.float32) - input_names = ['chunk_xs', 'chunk_lens', 'offset', - 'att_cache', 'cnn_cache', 'cache_mask'] - output_names = ['log_probs', 'log_probs_idx', 'chunk_out', - 'chunk_out_lens', 'r_offset', 'r_att_cache', - 'r_cnn_cache', 'r_cache_mask'] - input_tensors = (chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - output_names.pop(6) - - all_names = input_names + output_names - dynamic_axes = {} - for name in all_names: - # only the first dimension is dynamic - # all other dimension is fixed - dynamic_axes[name] = {0: 'B'} - - torch.onnx.export(encoder, - input_tensors, - encoder_onnx_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - verbose=False) - - with torch.no_grad(): - torch_outs = encoder(chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - torch_outs = list(torch_outs).pop(6) - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=["CUDAExecutionProvider"]) - ort_inputs = {} - - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - if transformer: - del ort_inputs['cnn_cache'] - ort_outs = ort_session.run(None, ort_inputs) - test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx streaming encoder succeed!") - onnx_config = { - "subsampling_rate": subsampling, - "context": context, - "decoding_chunk_size": decoding_chunk_size, - "num_decoding_left_chunks": num_decoding_left_chunks, - "beam_size": args.beam_size, - "fp16": args.fp16, - "feat_size": feature_size, - "decoding_window": decoding_window, - "cnn_module_kernel_cache": cnn_module_kernel - } - return onnx_config - - -def export_rescoring_decoder(model, configs, args, - logger, decoder_onnx_path, decoder_fastertransformer): - bz, seq_len = 32, 100 - beam_size = args.beam_size - decoder = Decoder(model.decoder, - model.ctc_weight, - model.reverse_weight, - beam_size, - decoder_fastertransformer) - decoder.eval() - - hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - hyps_lens_sos = torch.randint(low=3, high=seq_len, size=(bz, beam_size), - dtype=torch.int32) - r_hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - - output_size = configs["encoder_conf"]["output_size"] - encoder_out = torch.randn(bz, seq_len, output_size, dtype=torch.float32) - encoder_out_lens = torch.randint( - low=3, high=seq_len, size=(bz,), dtype=torch.int32) - ctc_score = torch.randn(bz, beam_size, dtype=torch.float32) - - input_names = ['encoder_out', 'encoder_out_lens', - 'hyps_pad_sos_eos', 'hyps_lens_sos', - 'r_hyps_pad_sos_eos', 'ctc_score'] - output_names = ['best_index'] - if decoder_fastertransformer: - output_names.insert(0, 'decoder_out') - - 
torch.onnx.export(decoder, - (encoder_out, encoder_out_lens, - hyps_pad_sos_eos, hyps_lens_sos, - r_hyps_pad_sos_eos, ctc_score), - decoder_onnx_path, - export_params=True, - opset_version=13, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes={'encoder_out': {0: 'B', 1: 'T'}, - 'encoder_out_lens': {0: 'B'}, - 'hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'hyps_lens_sos': {0: 'B'}, - 'r_hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'ctc_score': {0: 'B'}, - 'best_index': {0: 'B'}, - }, - verbose=False - ) - with torch.no_grad(): - o0 = decoder(encoder_out, - encoder_out_lens, - hyps_pad_sos_eos, - hyps_lens_sos, - r_hyps_pad_sos_eos, - ctc_score) - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(decoder_onnx_path, - providers=providers) - - input_tensors = [encoder_out, encoder_out_lens, hyps_pad_sos_eos, - hyps_lens_sos, r_hyps_pad_sos_eos, ctc_score] - ort_inputs = {} - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - - # if model.reverse weight == 0, - # the r_hyps_pad will be removed - # from the onnx decoder since it doen't play any role - if model.reverse_weight == 0: - del ort_inputs['r_hyps_pad_sos_eos'] - ort_outs = ort_session.run(None, ort_inputs) - - # check decoder output - if decoder_fastertransformer: - test(to_numpy(o0), ort_outs, rtol=1e-03, atol=1e-05) - else: - test(to_numpy([o0]), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx decoder succeed!") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='export x86_gpu model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--cmvn_file', required=False, default='', type=str, - help='global_cmvn file, default path is in config file') - parser.add_argument('--reverse_weight', default=-1.0, type=float, - required=False, - help='reverse weight for bitransformer,' + - 'default value is in config file') - parser.add_argument('--ctc_weight', default=-1.0, type=float, - required=False, - help='ctc weight, default value is in config file') - parser.add_argument('--batch_size', type=int, default=24, help='encoder batch size') - parser.add_argument('--seq_len', default=512, type=int, required=False, - help="Encoder seq_len") - parser.add_argument('--beam_size', default=10, type=int, required=False, - help="beam size would be ctc output size") - parser.add_argument('--output_onnx_dir', - default="onnx_model", - help='output onnx encoder and decoder directory') - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - # arguments for streaming encoder - parser.add_argument('--streaming', - action='store_true', - help="whether to export streaming encoder, default false") - parser.add_argument('--decoding_chunk_size', - default=16, - type=int, - required=False, - help='the decoding chunk size, <=0 is not supported') - parser.add_argument('--num_decoding_left_chunks', - default=5, - type=int, - required=False, - help="number of left chunks, <= 0 is not supported") - parser.add_argument('--decoder_fastertransformer', - action='store_true', - help='return decoder_out and best_index for ft') - args = parser.parse_args() - - torch.manual_seed(0) - torch.set_printoptions(precision=10) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if 
args.cmvn_file and os.path.exists(args.cmvn_file): - configs['cmvn_file'] = args.cmvn_file - if args.reverse_weight != -1.0 and 'reverse_weight' in configs['model_conf']: - configs['model_conf']['reverse_weight'] = args.reverse_weight - print("Update reverse weight to", args.reverse_weight) - if args.ctc_weight != -1: - print("Update ctc weight to ", args.ctc_weight) - configs['model_conf']['ctc_weight'] = args.ctc_weight - configs["encoder_conf"]["use_dynamic_chunk"] = False - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - - if not os.path.exists(args.output_onnx_dir): - os.mkdir(args.output_onnx_dir) - encoder_onnx_path = os.path.join(args.output_onnx_dir, 'encoder.onnx') - export_enc_func = None - if args.streaming: - assert args.decoding_chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - export_enc_func = export_online_encoder - else: - export_enc_func = export_offline_encoder_static - - onnx_config = export_enc_func( - model, configs, args, logger, encoder_onnx_path) - - decoder_onnx_path = os.path.join(args.output_onnx_dir, 'decoder.onnx') - export_rescoring_decoder(model, configs, args, logger, - decoder_onnx_path, args.decoder_fastertransformer) - - if args.fp16: - try: - import onnxmltools - from onnxmltools.utils.float16_converter import convert_float_to_float16 - except ImportError: - print('Please install onnxmltools!') - sys.exit(1) - encoder_onnx_model = onnxmltools.utils.load_model(encoder_onnx_path) - encoder_onnx_model = convert_float_to_float16(encoder_onnx_model) - encoder_onnx_path = os.path.join( - args.output_onnx_dir, 'encoder_fp16.onnx') - onnxmltools.utils.save_model(encoder_onnx_model, encoder_onnx_path) - decoder_onnx_model = onnxmltools.utils.load_model(decoder_onnx_path) - decoder_onnx_model = convert_float_to_float16(decoder_onnx_model) - decoder_onnx_path = os.path.join( - args.output_onnx_dir, 'decoder_fp16.onnx') - onnxmltools.utils.save_model(decoder_onnx_model, decoder_onnx_path) - # dump configurations - - config_dir = os.path.join(args.output_onnx_dir, "config.yaml") - with open(config_dir, "w") as out: - yaml.dump(onnx_config, out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/recognize.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/recognize.py deleted file mode 100644 index 03b5dfd42cc098efacd20e08756a5300f6477cc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/recognize.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--beam_size', - type=int, - default=10, - help='beam size for search') - parser.add_argument('--penalty', - type=float, - default=0.0, - help='length penalty') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=16, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'attention', 'ctc_greedy_search', - 'ctc_prefix_beam_search', 'attention_rescoring', - 'rnnt_greedy_search', 'rnnt_beam_search', - 'rnnt_beam_attn_rescoring', 'ctc_beam_td_attn_rescoring', - 'hlg_onebest', 'hlg_rescore' - ], - default='attention', - help='decoding mode') - - parser.add_argument('--search_ctc_weight', - type=float, - default=1.0, - help='ctc weight for nbest generation') - parser.add_argument('--search_transducer_weight', - type=float, - default=0.0, - help='transducer weight for nbest generation') - parser.add_argument('--ctc_weight', - type=float, - default=0.0, - help='ctc weight for rescoring weight in \ - attention rescoring decode mode \ - ctc weight for rescoring weight in \ - transducer attention rescore decode mode') - - parser.add_argument('--transducer_weight', - type=float, - default=0.0, - help='transducer weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--attn_weight', - type=float, - default=0.0, - help='attention weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--decoding_chunk_size', - type=int, - default=-1, - help='''decoding chunk size, - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here''') - parser.add_argument('--num_decoding_left_chunks', - type=int, - default=-1, - help='number of left chunks for decoding') - parser.add_argument('--simulate_streaming', - action='store_true', - help='simulate streaming inference') - parser.add_argument('--reverse_weight', - type=float, - default=0.0, - help='''right to left weight for attention rescoring - decode mode''') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--connect_symbol', - default='', - type=str, - help='used to connect the output characters') - - parser.add_argument('--word', - default='', - type=str, - help='word file, only used for hlg decode') - parser.add_argument('--hlg', - default='', - type=str, - help='hlg file, only used for hlg decode') - parser.add_argument('--lm_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--r_decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' - ] and args.batch_size > 1: - logging.fatal( - 'decoding mode {} must be running with batch_size == 1'.format( - args.mode)) - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_sub'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - if 'fbank_conf' in test_conf: - test_conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in test_conf: - test_conf['mfcc_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - # Load dict - char_dict = {v: k for k, v in symbol_table.items()} - eos = len(char_dict) - 1 - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w') as fout: - for batch_idx, 
batch in enumerate(test_data_loader): - keys, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - if args.mode == 'attention': - hyps, _ = model.recognize( - feats, - feats_lengths, - beam_size=args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp.tolist() for hyp in hyps] - elif args.mode == 'ctc_greedy_search': - hyps, _ = model.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_greedy_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_beam_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.beam_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.search_ctc_weight, - transducer_weight=args.search_transducer_weight) - elif args.mode == 'rnnt_beam_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight) - elif args.mode == 'ctc_beam_td_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight, - beam_search_type='ctc') - # ctc_prefix_beam_search and attention_rescoring only return one - # result in List[int], change it to List[List[int]] for compatible - # with other batch decoding mode - elif args.mode == 'ctc_prefix_beam_search': - assert (feats.size(0) == 1) - hyp, _ = model.ctc_prefix_beam_search( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp] - elif args.mode == 'attention_rescoring': - assert (feats.size(0) == 1) - hyp, _ = model.attention_rescoring( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - 
ctc_weight=args.ctc_weight, - simulate_streaming=args.simulate_streaming, - reverse_weight=args.reverse_weight) - hyps = [hyp] - elif args.mode == 'hlg_onebest': - hyps = model.hlg_onebest( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - elif args.mode == 'hlg_rescore': - hyps = model.hlg_rescore( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - lm_scale=args.lm_scale, - decoder_scale=args.decoder_scale, - r_decoder_scale=args.r_decoder_scale, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - for i, key in enumerate(keys): - content = [] - for w in hyps[i]: - if w == eos: - break - content.append(char_dict[w]) - logging.info('{} {}'.format(key, args.connect_symbol.join(content))) - fout.write('{} {}\n'.format(key, args.connect_symbol.join(content))) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/recognize_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/recognize_onnx_gpu.py deleted file mode 100644 index 42f403bf55ac0bc51d9c754574d3479345948122..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/recognize_onnx_gpu.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is for testing exported onnx encoder and decoder from -export_onnx_gpu.py. The exported onnx models only support batch offline ASR inference. -It requires a python wrapped c++ ctc decoder. 
-
-Please install it by following:
-https://github.com/Slyne/ctc_decoder.git
-"""
-from __future__ import print_function
-
-import argparse
-import copy
-import logging
-import os
-import sys
-
-import torch
-import yaml
-from torch.utils.data import DataLoader
-
-from wenet.dataset.dataset import Dataset
-from wenet.utils.common import IGNORE_ID
-from wenet.utils.file_utils import read_symbol_table
-from wenet.utils.config import override_config
-
-import onnxruntime as rt
-import multiprocessing
-import numpy as np
-
-try:
-    from swig_decoders import map_batch, \
-        ctc_beam_search_decoder_batch, \
-        TrieVector, PathTrie
-except ImportError:
-    print('Please install ctc decoders first by refering to\n' +
-          'https://github.com/Slyne/ctc_decoder.git')
-    sys.exit(1)
-
-
-def get_args():
-    parser = argparse.ArgumentParser(description='recognize with your model')
-    parser.add_argument('--config', required=True, help='config file')
-    parser.add_argument('--test_data', required=True, help='test data file')
-    parser.add_argument('--data_type',
-                        default='raw',
-                        choices=['raw', 'shard'],
-                        help='train and cv data type')
-    parser.add_argument('--gpu',
-                        type=int,
-                        default=-1,
-                        help='gpu id for this rank, -1 for cpu')
-    parser.add_argument('--dict', required=True, help='dict file')
-    parser.add_argument('--encoder_onnx', required=True, help='encoder onnx file')
-    parser.add_argument('--decoder_onnx', required=True, help='decoder onnx file')
-    parser.add_argument('--result_file', required=True, help='asr result file')
-    parser.add_argument('--batch_size',
-                        type=int,
-                        default=32,
-                        help='asr result file')
-    parser.add_argument('--mode',
-                        choices=[
-                            'ctc_greedy_search', 'ctc_prefix_beam_search',
-                            'attention_rescoring'],
-                        default='attention_rescoring',
-                        help='decoding mode')
-    parser.add_argument('--bpe_model',
-                        default=None,
-                        type=str,
-                        help='bpe model for english part')
-    parser.add_argument('--override_config',
-                        action='append',
-                        default=[],
-                        help="override yaml config")
-    parser.add_argument('--fp16',
-                        action='store_true',
-                        help='whether to export fp16 model, default false')
-    args = parser.parse_args()
-    print(args)
-    return args
-
-
-def main():
-    args = get_args()
-    logging.basicConfig(level=logging.DEBUG,
-                        format='%(asctime)s %(levelname)s %(message)s')
-    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
-
-    with open(args.config, 'r') as fin:
-        configs = yaml.load(fin, Loader=yaml.FullLoader)
-    if len(args.override_config) > 0:
-        configs = override_config(configs, args.override_config)
-
-    reverse_weight = configs["model_conf"].get("reverse_weight", 0.0)
-    symbol_table = read_symbol_table(args.dict)
-    test_conf = copy.deepcopy(configs['dataset_conf'])
-    test_conf['filter_conf']['max_length'] = 102400
-    test_conf['filter_conf']['min_length'] = 0
-    test_conf['filter_conf']['token_max_length'] = 102400
-    test_conf['filter_conf']['token_min_length'] = 0
-    test_conf['filter_conf']['max_output_input_ratio'] = 102400
-    test_conf['filter_conf']['min_output_input_ratio'] = 0
-    test_conf['speed_perturb'] = False
-    test_conf['spec_aug'] = False
-    test_conf['spec_trim'] = False
-    test_conf['shuffle'] = False
-    test_conf['sort'] = False
-    test_conf['fbank_conf']['dither'] = 0.0
-    test_conf['batch_conf']['batch_type'] = "static"
-    test_conf['batch_conf']['batch_size'] = args.batch_size
-
-    test_dataset = Dataset(args.data_type,
-                           args.test_data,
-                           symbol_table,
-                           test_conf,
-                           args.bpe_model,
-                           partition=False)
-
-    test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
-
- # Init asr model from configs - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - if use_cuda: - EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - else: - EP_list = ['CPUExecutionProvider'] - - encoder_ort_session = rt.InferenceSession(args.encoder_onnx, providers=EP_list) - decoder_ort_session = None - if args.mode == "attention_rescoring": - decoder_ort_session = rt.InferenceSession(args.decoder_onnx, providers=EP_list) - - # Load dict - vocabulary = [] - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - vocabulary.append(arr[0]) - eos = sos = len(char_dict) - 1 - with torch.no_grad(), open(args.result_file, 'w') as fout: - for _, batch in enumerate(test_data_loader): - keys, feats, _, feats_lengths, _ = batch - feats, feats_lengths = feats.numpy(), feats_lengths.numpy() - if args.fp16: - feats = feats.astype(np.float16) - ort_inputs = { - encoder_ort_session.get_inputs()[0].name: feats, - encoder_ort_session.get_inputs()[1].name: feats_lengths} - ort_outs = encoder_ort_session.run(None, ort_inputs) - encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx = ort_outs - beam_size = beam_log_probs.shape[-1] - batch_size = beam_log_probs.shape[0] - num_processes = min(multiprocessing.cpu_count(), batch_size) - if args.mode == 'ctc_greedy_search': - if beam_size != 1: - log_probs_idx = beam_log_probs_idx[:, :, 0] - batch_sents = [] - for idx, seq in enumerate(log_probs_idx): - batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) - hyps = map_batch(batch_sents, vocabulary, num_processes, - True, 0) - elif args.mode in ('ctc_prefix_beam_search', "attention_rescoring"): - batch_log_probs_seq_list = beam_log_probs.tolist() - batch_log_probs_idx_list = beam_log_probs_idx.tolist() - batch_len_list = encoder_out_lens.tolist() - batch_log_probs_seq = [] - batch_log_probs_ids = [] - batch_start = [] # only effective in streaming deployment - batch_root = TrieVector() - root_dict = {} - for i in range(len(batch_len_list)): - num_sent = batch_len_list[i] - batch_log_probs_seq.append( - batch_log_probs_seq_list[i][0:num_sent]) - batch_log_probs_ids.append( - batch_log_probs_idx_list[i][0:num_sent]) - root_dict[i] = PathTrie() - batch_root.append(root_dict[i]) - batch_start.append(True) - score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq, - batch_log_probs_ids, - batch_root, - batch_start, - beam_size, - num_processes, - 0, -2, 0.99999) - if args.mode == 'ctc_prefix_beam_search': - hyps = [] - for cand_hyps in score_hyps: - hyps.append(cand_hyps[0][1]) - hyps = map_batch(hyps, vocabulary, num_processes, False, 0) - if args.mode == 'attention_rescoring': - ctc_score, all_hyps = [], [] - max_len = 0 - for hyps in score_hyps: - cur_len = len(hyps) - if len(hyps) < beam_size: - hyps += (beam_size - cur_len) * [(-float("INF"), (0,))] - cur_ctc_score = [] - for hyp in hyps: - cur_ctc_score.append(hyp[0]) - all_hyps.append(list(hyp[1])) - if len(hyp[1]) > max_len: - max_len = len(hyp[1]) - ctc_score.append(cur_ctc_score) - if args.fp16: - ctc_score = np.array(ctc_score, dtype=np.float16) - else: - ctc_score = np.array(ctc_score, dtype=np.float32) - hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - r_hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32) - k = 0 - for i in 
range(batch_size): - for j in range(beam_size): - cand = all_hyps[k] - l = len(cand) + 2 - hyps_pad_sos_eos[i][j][0:l] = [sos] + cand + [eos] - r_hyps_pad_sos_eos[i][j][0:l] = [sos] + cand[::-1] + [eos] - hyps_lens_sos[i][j] = len(cand) + 1 - k += 1 - decoder_ort_inputs = { - decoder_ort_session.get_inputs()[0].name: encoder_out, - decoder_ort_session.get_inputs()[1].name: encoder_out_lens, - decoder_ort_session.get_inputs()[2].name: hyps_pad_sos_eos, - decoder_ort_session.get_inputs()[3].name: hyps_lens_sos, - decoder_ort_session.get_inputs()[-1].name: ctc_score} - if reverse_weight > 0: - r_hyps_pad_sos_eos_name = decoder_ort_session.get_inputs()[4].name - decoder_ort_inputs[r_hyps_pad_sos_eos_name] = r_hyps_pad_sos_eos - best_index = decoder_ort_session.run(None, decoder_ort_inputs)[0] - best_sents = [] - k = 0 - for idx in best_index: - cur_best_sent = all_hyps[k: k + beam_size][idx] - best_sents.append(cur_best_sent) - k += beam_size - hyps = map_batch(best_sents, vocabulary, num_processes) - - for i, key in enumerate(keys): - content = hyps[i] - logging.info('{} {}'.format(key, content)) - fout.write('{} {}\n'.format(key, content)) - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/train.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/train.py deleted file mode 100644 index 70799b60790b31d73911770891f519f5473e2f4b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/bin/train.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os - -import torch -import torch.distributed as dist -import torch.optim as optim -import yaml -from tensorboardX import SummaryWriter -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import (load_checkpoint, save_checkpoint, - load_trained_modules) -from wenet.utils.executor import Executor -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.scheduler import WarmupLR, NoamHoldAnnealing -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--train_data', required=True, help='train data file') - parser.add_argument('--cv_data', required=True, help='cv data file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--model_dir', required=True, help='save model dir') - parser.add_argument('--checkpoint', help='checkpoint model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='tensorboard log dir') - parser.add_argument('--ddp.rank', - dest='rank', - default=0, - type=int, - help='global rank for distributed training') - parser.add_argument('--ddp.world_size', - dest='world_size', - default=-1, - type=int, - help='''number of total processes/gpus for - distributed training''') - parser.add_argument('--ddp.dist_backend', - dest='dist_backend', - default='nccl', - choices=['nccl', 'gloo'], - help='distributed backend') - parser.add_argument('--ddp.init_method', - dest='init_method', - default=None, - help='ddp init method') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for reading') - parser.add_argument('--pin_memory', - action='store_true', - default=False, - help='Use pinned memory buffers used for reading') - parser.add_argument('--use_amp', - action='store_true', - default=False, - help='Use automatic mixed precision training') - parser.add_argument('--fp16_grad_sync', - action='store_true', - default=False, - help='Use fp16 gradient sync for ddp') - parser.add_argument('--cmvn', default=None, help='global cmvn file') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. 
One symbol per line.") - parser.add_argument('--prefetch', - default=100, - type=int, - help='prefetch number') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument("--enc_init", - default=None, - type=str, - help="Pre-trained model to initialize encoder") - parser.add_argument("--enc_init_mods", - default="encoder.", - type=lambda s: [str(mod) for mod in s.split(",") if s != ""], - help="List of encoder modules \ - to initialize ,separated by a comma") - - - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Set random seed - torch.manual_seed(777) - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - distributed = args.world_size > 1 - if distributed: - logging.info('training on multiple gpus, this gpu {}'.format(args.gpu)) - dist.init_process_group(args.dist_backend, - init_method=args.init_method, - world_size=args.world_size, - rank=args.rank) - - symbol_table = read_symbol_table(args.symbol_table) - - train_conf = configs['dataset_conf'] - cv_conf = copy.deepcopy(train_conf) - cv_conf['speed_perturb'] = False - cv_conf['spec_aug'] = False - cv_conf['spec_sub'] = False - cv_conf['spec_trim'] = False - cv_conf['shuffle'] = False - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - train_dataset = Dataset(args.data_type, args.train_data, symbol_table, - train_conf, args.bpe_model, non_lang_syms, True) - cv_dataset = Dataset(args.data_type, - args.cv_data, - symbol_table, - cv_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - train_data_loader = DataLoader(train_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - cv_data_loader = DataLoader(cv_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - - if 'fbank_conf' in configs['dataset_conf']: - input_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - else: - input_dim = configs['dataset_conf']['mfcc_conf']['num_mel_bins'] - vocab_size = len(symbol_table) - - # Save configs to model_dir/train.yaml for inference and export - configs['input_dim'] = input_dim - configs['output_dim'] = vocab_size - configs['cmvn_file'] = args.cmvn - configs['is_json_cmvn'] = True - if args.rank == 0: - saved_config_path = os.path.join(args.model_dir, 'train.yaml') - with open(saved_config_path, 'w') as fout: - data = yaml.dump(configs) - fout.write(data) - - # Init asr model from configs - model = init_model(configs) - print(model) - num_params = sum(p.numel() for p in model.parameters()) - print('the number of model params: {:,d}'.format(num_params)) - - # !!!IMPORTANT!!! 
- # Try to export the model by script, if fails, we should refine - # the code to satisfy the script export requirements - if args.rank == 0: - script_model = torch.jit.script(model) - script_model.save(os.path.join(args.model_dir, 'init.zip')) - executor = Executor() - # If specify checkpoint, load some info from checkpoint - if args.checkpoint is not None: - infos = load_checkpoint(model, args.checkpoint) - elif args.enc_init is not None: - logging.info('load pretrained encoders: {}'.format(args.enc_init)) - infos = load_trained_modules(model, args) - else: - infos = {} - start_epoch = infos.get('epoch', -1) + 1 - cv_loss = infos.get('cv_loss', 0.0) - step = infos.get('step', -1) - - num_epochs = configs.get('max_epoch', 100) - model_dir = args.model_dir - writer = None - if args.rank == 0: - os.makedirs(model_dir, exist_ok=True) - exp_id = os.path.basename(model_dir) - writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id)) - - if distributed: - assert (torch.cuda.is_available()) - # cuda model is required for nn.parallel.DistributedDataParallel - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, find_unused_parameters=True) - device = torch.device("cuda") - if args.fp16_grad_sync: - from torch.distributed.algorithms.ddp_comm_hooks import ( - default as comm_hooks, - ) - model.register_comm_hook( - state=None, hook=comm_hooks.fp16_compress_hook - ) - else: - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - if configs['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['optim_conf']) - elif configs['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['optim']) - if configs['scheduler'] == 'warmuplr': - scheduler = WarmupLR(optimizer, **configs['scheduler_conf']) - elif configs['scheduler'] == 'NoamHoldAnnealing': - scheduler = NoamHoldAnnealing(optimizer, **configs['scheduler_conf']) - else: - raise ValueError("unknown scheduler: " + configs['scheduler']) - - final_epoch = None - configs['rank'] = args.rank - configs['is_distributed'] = distributed - configs['use_amp'] = args.use_amp - if start_epoch == 0 and args.rank == 0: - save_model_path = os.path.join(model_dir, 'init.pt') - save_checkpoint(model, save_model_path) - - # Start training loop - executor.step = step - scheduler.set_step(step) - # used for pytorch amp mixed precision training - scaler = None - if args.use_amp: - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(start_epoch, num_epochs): - train_dataset.set_epoch(epoch) - configs['epoch'] = epoch - lr = optimizer.param_groups[0]['lr'] - logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr)) - executor.train(model, optimizer, scheduler, train_data_loader, device, - writer, configs, scaler) - total_loss, num_seen_utts = executor.cv(model, cv_data_loader, device, - configs) - cv_loss = total_loss / num_seen_utts - - logging.info('Epoch {} CV info cv_loss {}'.format(epoch, cv_loss)) - if args.rank == 0: - save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch)) - save_checkpoint( - model, save_model_path, { - 'epoch': epoch, - 'lr': lr, - 'cv_loss': cv_loss, - 'step': executor.step - }) - writer.add_scalar('epoch/cv_loss', cv_loss, epoch) - writer.add_scalar('epoch/lr', lr, epoch) - final_epoch = epoch - - if final_epoch is not None and args.rank == 0: - final_model_path = os.path.join(model_dir, 'final.pt') 
-        os.remove(final_model_path) if os.path.exists(final_model_path) else None
-        os.symlink('{}.pt'.format(final_epoch), final_model_path)
-        writer.close()
-
-
-if __name__ == '__main__':
-    main()
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/dataset/dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/dataset/dataset.py
deleted file mode 100644
index 6d799b5b5aea2d34546484b3fed5d45e2d5b6aa6..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/dataset/dataset.py
+++ /dev/null
@@ -1,193 +0,0 @@
-# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import random
-
-import torch
-import torch.distributed as dist
-from torch.utils.data import IterableDataset
-
-import wenet.dataset.processor as processor
-from wenet.utils.file_utils import read_lists
-
-
-class Processor(IterableDataset):
-    def __init__(self, source, f, *args, **kw):
-        assert callable(f)
-        self.source = source
-        self.f = f
-        self.args = args
-        self.kw = kw
-
-    def set_epoch(self, epoch):
-        self.source.set_epoch(epoch)
-
-    def __iter__(self):
-        """ Return an iterator over the source dataset processed by the
-            given processor.
-        """
-        assert self.source is not None
-        assert callable(self.f)
-        return self.f(iter(self.source), *self.args, **self.kw)
-
-    def apply(self, f):
-        assert callable(f)
-        return Processor(self, f, *self.args, **self.kw)
-
-
-class DistributedSampler:
-    def __init__(self, shuffle=True, partition=True):
-        self.epoch = -1
-        self.update()
-        self.shuffle = shuffle
-        self.partition = partition
-
-    def update(self):
-        assert dist.is_available()
-        if dist.is_initialized():
-            self.rank = dist.get_rank()
-            self.world_size = dist.get_world_size()
-        else:
-            self.rank = 0
-            self.world_size = 1
-        worker_info = torch.utils.data.get_worker_info()
-        if worker_info is None:
-            self.worker_id = 0
-            self.num_workers = 1
-        else:
-            self.worker_id = worker_info.id
-            self.num_workers = worker_info.num_workers
-        return dict(rank=self.rank,
-                    world_size=self.world_size,
-                    worker_id=self.worker_id,
-                    num_workers=self.num_workers)
-
-    def set_epoch(self, epoch):
-        self.epoch = epoch
-
-    def sample(self, data):
-        """ Sample data according to rank/world_size/num_workers
-
-            Args:
-                data(List): input data list
-
-            Returns:
-                List: data list after sample
-        """
-        data = list(range(len(data)))
-        # TODO(Binbin Zhang): fix this
-        # We can not handle uneven data for CV on DDP, so we don't
-        # sample data by rank, that means every GPU gets the same
-        # and all the CV data
-        if self.partition:
-            if self.shuffle:
-                random.Random(self.epoch).shuffle(data)
-            data = data[self.rank::self.world_size]
-        data = data[self.worker_id::self.num_workers]
-        return data
-
-
-class DataList(IterableDataset):
-    def __init__(self, lists, shuffle=True, partition=True):
-        self.lists = lists
-        self.sampler = DistributedSampler(shuffle, partition)
-
-    def set_epoch(self, epoch):
-        self.sampler.set_epoch(epoch)
-
-    def __iter__(self):
-        sampler_info = self.sampler.update()
-        indexes = self.sampler.sample(self.lists)
-        for index in indexes:
-            # yield dict(src=src)
-            data = dict(src=self.lists[index])
-            data.update(sampler_info)
-            yield data
-
-
-def Dataset(data_type,
-            data_list_file,
-            symbol_table,
-            conf,
-            bpe_model=None,
-            non_lang_syms=None,
-            partition=True):
-    """ Construct dataset from arguments
-
-        We have two shuffle stage in the Dataset. The first is global
-        shuffle at shards tar/raw file level. The second is global shuffle
-        at training samples level.
- - Args: - data_type(str): raw/shard - bpe_model(str): model for english bpe part - partition(bool): whether to do data partition in terms of rank - """ - assert data_type in ['raw', 'shard'] - lists = read_lists(data_list_file) - shuffle = conf.get('shuffle', True) - dataset = DataList(lists, shuffle=shuffle, partition=partition) - if data_type == 'shard': - dataset = Processor(dataset, processor.url_opener) - dataset = Processor(dataset, processor.tar_file_and_group) - else: - dataset = Processor(dataset, processor.parse_raw) - - dataset = Processor(dataset, processor.tokenize, symbol_table, bpe_model, - non_lang_syms, conf.get('split_with_space', False)) - filter_conf = conf.get('filter_conf', {}) - dataset = Processor(dataset, processor.filter, **filter_conf) - - resample_conf = conf.get('resample_conf', {}) - dataset = Processor(dataset, processor.resample, **resample_conf) - - speed_perturb = conf.get('speed_perturb', False) - if speed_perturb: - dataset = Processor(dataset, processor.speed_perturb) - - feats_type = conf.get('feats_type', 'fbank') - assert feats_type in ['fbank', 'mfcc'] - if feats_type == 'fbank': - fbank_conf = conf.get('fbank_conf', {}) - dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) - elif feats_type == 'mfcc': - mfcc_conf = conf.get('mfcc_conf', {}) - dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf) - - spec_aug = conf.get('spec_aug', True) - spec_sub = conf.get('spec_sub', False) - spec_trim = conf.get('spec_trim', False) - if spec_aug: - spec_aug_conf = conf.get('spec_aug_conf', {}) - dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) - if spec_sub: - spec_sub_conf = conf.get('spec_sub_conf', {}) - dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf) - if spec_trim: - spec_trim_conf = conf.get('spec_trim_conf', {}) - dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf) - - if shuffle: - shuffle_conf = conf.get('shuffle_conf', {}) - dataset = Processor(dataset, processor.shuffle, **shuffle_conf) - - sort = conf.get('sort', True) - if sort: - sort_conf = conf.get('sort_conf', {}) - dataset = Processor(dataset, processor.sort, **sort_conf) - - batch_conf = conf.get('batch_conf', {}) - dataset = Processor(dataset, processor.batch, **batch_conf) - dataset = Processor(dataset, processor.padding) - return dataset diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/dataset/kaldi_io.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/dataset/kaldi_io.py deleted file mode 100644 index c9bef293c93d882147bb5b738e1fc49a7a19a484..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/dataset/kaldi_io.py +++ /dev/null @@ -1,666 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! 
To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. - """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. 
- """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int_scp(file_or_fd): - """ generator(key,vec) = read_vec_int_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_int_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:vec for key,mat in kaldi_io.read_vec_int_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_int(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! 
- if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Mapping for percentiles in col-headers, - def uint16_to_float(value, min, range): - return np.float32(min + range * 1.52590218966964e-05 * value) - - # Mapping for matrix elements, - def uint8_to_float_v2(vec, p0, p25, p75, p100): - # Split the vector by masks, - mask_0_64 = (vec <= 64); - mask_193_255 = (vec > 192); - mask_65_192 = (~(mask_0_64 | mask_193_255)); - # Sanity check (useful but slow...), - # assert(len(vec) == np.sum(np.hstack([mask_0_64,mask_65_192,mask_193_255]))) - # assert(len(vec) == np.sum(np.any([mask_0_64,mask_65_192,mask_193_255], axis=0))) - # Build the float vector, - ans = np.empty(len(vec), dtype='float32') - ans[mask_0_64] = p0 + (p25 - p0) / 64. * vec[mask_0_64] - ans[mask_65_192] = p25 + (p75 - p25) / 128. * (vec[mask_65_192] - 64) - ans[mask_193_255] = p75 + (p100 - p75) / 63. * (vec[mask_193_255] - 192) - return ans - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.empty((cols,rows), dtype='float32') - for i, col_header in enumerate(col_headers): - col_header_flt = [ uint16_to_float(percentile, globmin, globrange) for percentile in col_header ] - mat[i] = uint8_to_float_v2(data[i], *col_header_flt) - - return mat.T # transpose! col-major -> row-major, - -def write_ark_scp(key, mat, ark_fout, scp_out): - mat_offset = write_mat(ark_fout, mat, key) - scp_line = '{}\t{}:{}'.format(key, ark_fout.name, mat_offset) - scp_out.write(scp_line) - scp_out.write('\n') - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. 
- - Example of writing single matrix: - kaldi_io.write_mat(filename, mat) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,mat in dict.iteritems(): - kaldi_io.write_mat(f, mat, key=key) - """ - mat_offset = 0 - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - mat_offset = fd.tell() - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if m.dtype == 'float32': fd.write('FM '.encode()) - elif m.dtype == 'float64': fd.write('DM '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) - # Dims, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols - # Data, - fd.write(m.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - return mat_offset - -################################################# -# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) -# Corresponds to: vector > > -# - outer vector: time axis -# - inner vector: records at the time -# - tuple: int = index, float = value -# - -def read_cnet_ark(file_or_fd): - """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ - return read_post_ark(file_or_fd) - -def read_post_ark(file_or_fd): - """ generator(key,vec>) = read_post_ark(file) - Returns generator of (key,posterior) tuples, read from ark file. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Iterate the ark: - for key,post in kaldi_io.read_post_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - post = read_post(fd) - yield key, post - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_post(file_or_fd): - """ [post] = read_post(file_or_fd) - Reads single kaldi 'Posterior' in binary format. - - The 'Posterior' is C++ type 'vector > >', - the outer-vector is usually time axis, inner-vector are the records - at given time, and the tuple is composed of an 'index' (integer) - and a 'float-value'. The 'float-value' can represent a probability - or any other numeric value. - - Returns vector of vectors of tuples. - """ - fd = open_or_fd(file_or_fd) - ans=[] - binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - # Loop over 'outer-vector', - for i in range(outer_vec_size): - assert(fd.read(1).decode() == '\4'); # int-size - inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) - data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) - assert(data[0]['size_idx'] == 4) - assert(data[0]['size_post'] == 4) - ans.append(data[['idx','post']].tolist()) - - if fd is not file_or_fd: fd.close() - return ans - - -################################################# -# Kaldi Confusion Network bin begin/end times, -# (kaldi stores CNs time info separately from the Posterior). 
-# - -def read_cntime_ark(file_or_fd): - """ generator(key,vec>) = read_cntime_ark(file_or_fd) - Returns generator of (key,cntime) tuples, read from ark file. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Iterate the ark: - for key,time in kaldi_io.read_cntime_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:time for key,time in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - cntime = read_cntime(fd) - yield key, cntime - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_cntime(file_or_fd): - """ [cntime] = read_cntime(file_or_fd) - Reads single kaldi 'Confusion Network time info', in binary format: - C++ type: vector >. - (begin/end times of bins at the confusion network). - - Binary layout is ' ...' - - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Returns vector of tuples. - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary - - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) - assert(data[0]['size_beg'] == 4) - assert(data[0]['size_end'] == 4) - ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), - - if fd is not file_or_fd : fd.close() - return ans - - -################################################# -# Segments related, -# - -# Segments as 'Bool vectors' can be handy, -# - for 'superposing' the segmentations, -# - for frame-selection in Speaker-ID experiments, -def read_segments_as_bool_vec(segments_file): - """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) - using kaldi 'segments' file for 1 wav, format : ' ' - - t-beg, t-end is in seconds, - - assumed 100 frames/second, - """ - segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) - # Sanity checks, - assert(len(segs) > 0) # empty segmentation is an error, - assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, - # Convert time to frame-indexes, - start = np.rint([100 * rec[2] for rec in segs]).astype(int) - end = np.rint([100 * rec[3] for rec in segs]).astype(int) - # Taken from 'read_lab_to_bool_vec', htk.py, - frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], - np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) - assert np.sum(end-start) == np.sum(frms) - return frms - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/dataset/processor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/dataset/processor.py deleted file mode 100644 index b4bd07ce674eb3288cd1b13a09085eec48d40845..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/dataset/processor.py +++ /dev/null @@ -1,660 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import json -import random -import re -import tarfile -from subprocess import PIPE, Popen -from urllib.parse import urlparse - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.nn.utils.rnn import pad_sequence - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def url_opener(data): - """ Give url or local file, return file descriptor - Inplace operation. - - Args: - data(Iterable[str]): url or local file list - - Returns: - Iterable[{src, stream}] - """ - for sample in data: - assert 'src' in sample - # TODO(Binbin Zhang): support HTTP - url = sample['src'] - try: - pr = urlparse(url) - # local file - if pr.scheme == '' or pr.scheme == 'file': - stream = open(url, 'rb') - # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP - else: - cmd = f'wget -q -O - {url}' - process = Popen(cmd, shell=True, stdout=PIPE) - sample.update(process=process) - stream = process.stdout - sample.update(stream=stream) - yield sample - except Exception as ex: - logging.warning('Failed to open {}'.format(url)) - - -def tar_file_and_group(data): - """ Expand a stream of open tar files into a stream of tar file contents. - And groups the file with same prefix - - Args: - data: Iterable[{src, stream}] - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'stream' in sample - stream = tarfile.open(fileobj=sample['stream'], mode="r|*") - prev_prefix = None - example = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - example['key'] = prev_prefix - if valid: - yield example - example = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - example['txt'] = file_obj.read().decode('utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load(file_obj) - example['wav'] = waveform - example['sample_rate'] = sample_rate - else: - example[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning('error to parse {}'.format(name)) - prev_prefix = prefix - if prev_prefix is not None: - example['key'] = prev_prefix - yield example - stream.close() - if 'process' in sample: - sample['process'].communicate() - sample['stream'].close() - - -def parse_raw(data): - """ Parse key/wav/txt from json line - - Args: - data: Iterable[str], str is a json line has key/wav/txt - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'src' in sample - json_line = sample['src'] - obj = json.loads(json_line) - assert 'key' in obj - assert 'wav' in obj - assert 'txt' in obj - key = obj['key'] - wav_file = obj['wav'] - txt = obj['txt'] - try: - if 'start' in obj: - assert 'end' in obj - sample_rate = torchaudio.backend.sox_io_backend.info( - wav_file).sample_rate - start_frame = int(obj['start'] * sample_rate) - end_frame = int(obj['end'] * sample_rate) - waveform, _ = torchaudio.backend.sox_io_backend.load( - 
filepath=wav_file, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(wav_file) - example = dict(key=key, - txt=txt, - wav=waveform, - sample_rate=sample_rate) - yield example - except Exception as ex: - logging.warning('Failed to read {}'.format(wav_file)) - - -def filter(data, - max_length=10240, - min_length=10, - token_max_length=200, - token_min_length=1, - min_output_input_ratio=0.0005, - max_output_input_ratio=1): - """ Filter sample according to feature and label length - Inplace operation. - - Args:: - data: Iterable[{key, wav, label, sample_rate}] - max_length: drop utterance which is greater than max_length(10ms) - min_length: drop utterance which is less than min_length(10ms) - token_max_length: drop utterance which is greater than - token_max_length, especially when use char unit for - english modeling - token_min_length: drop utterance which is - less than token_max_length - min_output_input_ratio: minimal ration of - token_length / feats_length(10ms) - max_output_input_ratio: maximum ration of - token_length / feats_length(10ms) - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'label' in sample - # sample['wav'] is torch.Tensor, we have 100 frames every second - num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100 - if num_frames < min_length: - continue - if num_frames > max_length: - continue - if len(sample['label']) < token_min_length: - continue - if len(sample['label']) > token_max_length: - continue - if num_frames != 0: - if len(sample['label']) / num_frames < min_output_input_ratio: - continue - if len(sample['label']) / num_frames > max_output_input_ratio: - continue - yield sample - - -def resample(data, resample_rate=16000): - """ Resample data. - Inplace operation. - - Args: - data: Iterable[{key, wav, label, sample_rate}] - resample_rate: target resample rate - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - if sample_rate != resample_rate: - sample['sample_rate'] = resample_rate - sample['wav'] = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - yield sample - - -def speed_perturb(data, speeds=None): - """ Apply speed perturb to the data. - Inplace operation. 
- - Args: - data: Iterable[{key, wav, label, sample_rate}] - speeds(List[float]): optional speed - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - if speeds is None: - speeds = [0.9, 1.0, 1.1] - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - speed = random.choice(speeds) - if speed != 1.0: - wav, _ = torchaudio.sox_effects.apply_effects_tensor( - waveform, sample_rate, - [['speed', str(speed)], ['rate', str(sample_rate)]]) - sample['wav'] = wav - - yield sample - - -def compute_fbank(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0): - """ Extract fbank - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - energy_floor=0.0, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def compute_mfcc(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0, - num_ceps=40, - high_freq=0.0, - low_freq=20.0): - """ Extract mfcc - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.mfcc(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - num_ceps=num_ceps, - high_freq=high_freq, - low_freq=low_freq, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def __tokenize_by_bpe_model(sp, txt): - tokens = [] - # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - pattern = re.compile(r'([\u4e00-\u9fff])') - # Example: - # txt = "你好 ITS'S OKAY 的" - # chars = ["你", "好", " ITS'S OKAY ", "的"] - chars = pattern.split(txt.upper()) - mix_chars = [w for w in chars if len(w.strip()) > 0] - for ch_or_w in mix_chars: - # ch_or_w is a single CJK charater(i.e., "你"), do nothing. - if pattern.fullmatch(ch_or_w) is not None: - tokens.append(ch_or_w) - # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), - # encode ch_or_w using bpe_model. 
- else: - for p in sp.encode_as_pieces(ch_or_w): - tokens.append(p) - - return tokens - - -def tokenize(data, - symbol_table, - bpe_model=None, - non_lang_syms=None, - split_with_space=False): - """ Decode text to chars or BPE - Inplace operation - - Args: - data: Iterable[{key, wav, txt, sample_rate}] - - Returns: - Iterable[{key, wav, txt, tokens, label, sample_rate}] - """ - if non_lang_syms is not None: - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - else: - non_lang_syms = {} - non_lang_syms_pattern = None - - if bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(bpe_model) - else: - sp = None - - for sample in data: - assert 'txt' in sample - txt = sample['txt'].strip() - if non_lang_syms_pattern is not None: - parts = non_lang_syms_pattern.split(txt.upper()) - parts = [w for w in parts if len(w.strip()) > 0] - else: - parts = [txt] - - label = [] - tokens = [] - for part in parts: - if part in non_lang_syms: - tokens.append(part) - else: - if bpe_model is not None: - tokens.extend(__tokenize_by_bpe_model(sp, part)) - else: - if split_with_space: - part = part.split(" ") - for ch in part: - if ch == ' ': - ch = "▁" - tokens.append(ch) - - for ch in tokens: - if ch in symbol_table: - label.append(symbol_table[ch]) - elif '' in symbol_table: - label.append(symbol_table['']) - - sample['tokens'] = tokens - sample['label'] = label - yield sample - - -def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80): - """ Do spec augmentation - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - num_t_mask: number of time mask to apply - num_f_mask: number of freq mask to apply - max_t: max width of time mask - max_f: max width of freq mask - max_w: max width of time warp - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - max_freq = y.size(1) - # time mask - for i in range(num_t_mask): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - y[start:end, :] = 0 - # freq mask - for i in range(num_f_mask): - start = random.randint(0, max_freq - 1) - length = random.randint(1, max_f) - end = min(max_freq, start + length) - y[:, start:end] = 0 - sample['feat'] = y - yield sample - - -def spec_sub(data, max_t=20, num_t_sub=3): - """ Do spec substitute - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of time substitute - num_t_sub: number of time substitute to apply - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - for i in range(num_t_sub): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - # only substitute the earlier time chosen randomly for current time - pos = random.randint(0, start) - y[start:end, :] = x[start - pos:end - pos, :] - sample['feat'] = y - yield sample - - -def spec_trim(data, max_t=20): - """ Trim tailing frames. Inplace operation. 
- ref: TrimTail [https://arxiv.org/abs/2211.00522] - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of length trimming - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - max_frames = x.size(0) - length = random.randint(1, max_t) - if length < max_frames / 2: - y = x.clone().detach()[:max_frames - length] - sample['feat'] = y - yield sample - - -def shuffle(data, shuffle_size=10000): - """ Local shuffle the data - - Args: - data: Iterable[{key, feat, label}] - shuffle_size: buffer size for shuffle - - Returns: - Iterable[{key, feat, label}] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= shuffle_size: - random.shuffle(buf) - for x in buf: - yield x - buf = [] - # The sample left over - random.shuffle(buf) - for x in buf: - yield x - - -def sort(data, sort_size=500): - """ Sort the data by feature length. - Sort is used after shuffle and before batch, so we can group - utts with similar lengths into a batch, and `sort_size` should - be less than `shuffle_size` - - Args: - data: Iterable[{key, feat, label}] - sort_size: buffer size for sort - - Returns: - Iterable[{key, feat, label}] - """ - - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= sort_size: - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - buf = [] - # The sample left over - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - - -def static_batch(data, batch_size=16): - """ Static batch the data by `batch_size` - - Args: - data: Iterable[{key, feat, label}] - batch_size: batch size - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= batch_size: - yield buf - buf = [] - if len(buf) > 0: - yield buf - - -def dynamic_batch(data, max_frames_in_batch=12000): - """ Dynamic batch the data until the total frames in batch - reach `max_frames_in_batch` - - Args: - data: Iterable[{key, feat, label}] - max_frames_in_batch: max_frames in one batch - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - longest_frames = 0 - for sample in data: - assert 'feat' in sample - assert isinstance(sample['feat'], torch.Tensor) - new_sample_frames = sample['feat'].size(0) - longest_frames = max(longest_frames, new_sample_frames) - frames_after_padding = longest_frames * (len(buf) + 1) - if frames_after_padding > max_frames_in_batch: - yield buf - buf = [sample] - longest_frames = new_sample_frames - else: - buf.append(sample) - if len(buf) > 0: - yield buf - - -def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000): - """ Wrapper for static/dynamic batch - """ - if batch_type == 'static': - return static_batch(data, batch_size) - elif batch_type == 'dynamic': - return dynamic_batch(data, max_frames_in_batch) - else: - logging.fatal('Unsupported batch type {}'.format(batch_type)) - - -def padding(data): - """ Padding the data into training data - - Args: - data: Iterable[List[{key, feat, label}]] - - Returns: - Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] - """ - for sample in data: - assert isinstance(sample, list) - feats_length = torch.tensor([x['feat'].size(0) for x in sample], - dtype=torch.int32) - order = torch.argsort(feats_length, descending=True) - feats_lengths = torch.tensor( - [sample[i]['feat'].size(0) for i in order], dtype=torch.int32) - sorted_feats = [sample[i]['feat'] for i in order] - sorted_keys 
= [sample[i]['key'] for i in order] - sorted_labels = [ - torch.tensor(sample[i]['label'], dtype=torch.int64) for i in order - ] - label_lengths = torch.tensor([x.size(0) for x in sorted_labels], - dtype=torch.int32) - - padded_feats = pad_sequence(sorted_feats, - batch_first=True, - padding_value=0) - - pad = (0, 0, 0, 0) - seq_len= padded_feats.shape[1] - if seq_len < 384: - pad = (0, 0, 0, 384-seq_len) - elif seq_len < 512: - pad = (0, 0, 0, 512-seq_len) - elif seq_len < 640: - pad = (0, 0, 0, 640-seq_len) - elif seq_len < 768: - pad = (0, 0, 0, 768-seq_len) - elif seq_len < 896: - pad = (0, 0, 0, 896-seq_len) - elif seq_len < 1024: - pad = (0, 0, 0, 1024-seq_len) - elif seq_len < 1280: - pad = (0, 0, 0, 1280-seq_len) - padded_feats = torch.nn.functional.pad(padded_feats, pad, mode='constant', value=0) - padding_labels = pad_sequence(sorted_labels, - batch_first=True, - padding_value=-1) - - yield (sorted_keys, padded_feats, padding_labels, feats_lengths, - label_lengths) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/dataset/wav_distortion.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/dataset/wav_distortion.py deleted file mode 100644 index 2917d3cc6cfb801935cb0885d0c42cd88f1833b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/dataset/wav_distortion.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import random -import math - -import torchaudio -import torch -torchaudio.set_audio_backend("sox_io") - - -def db2amp(db): - return pow(10, db / 20) - -def amp2db(amp): - return 20 * math.log10(amp) - -def make_poly_distortion(conf): - """Generate a db-domain ploynomial distortion function - - f(x) = a * x^m * (1-x)^n + x - - Args: - conf: a dict {'a': #int, 'm': #int, 'n': #int} - - Returns: - The ploynomial function, which could be applied on - a float amplitude value - """ - a = conf['a'] - m = conf['m'] - n = conf['n'] - - def poly_distortion(x): - abs_x = abs(x) - if abs_x < 0.000001: - x = x - else: - db_norm = amp2db(abs_x) / 100 + 1 - if db_norm < 0: - db_norm = 0 - db_norm = a * pow(db_norm, m) * pow((1 - db_norm), n) + db_norm - if db_norm > 1: - db_norm = 1 - db = (db_norm - 1) * 100 - amp = db2amp(db) - if amp >= 0.9997: - amp = 0.9997 - if x > 0: - x = amp - else: - x = -amp - return x - return poly_distortion - -def make_quad_distortion(): - return make_poly_distortion({'a' : 1, 'm' : 1, 'n' : 1}) - -# the amplitude are set to max for all non-zero point -def make_max_distortion(conf): - """Generate a max distortion function - - Args: - conf: a dict {'max_db': float } - 'max_db': the maxium value. 
- - Returns: - The max function, which could be applied on - a float amplitude value - """ - max_db = conf['max_db'] - if max_db: - max_amp = db2amp(max_db) # < 0.997 - else: - max_amp = 0.997 - - def max_distortion(x): - if x > 0: - x = max_amp - elif x < 0: - x = -max_amp - else: - x = 0.0 - return x - return max_distortion - - - -def make_amp_mask(db_mask=None): - """Get a amplitude domain mask from db domain mask - - Args: - db_mask: Optional. A list of tuple. if None, using default value. - - Returns: - A list of tuple. The amplitude domain mask - """ - if db_mask is None: - db_mask = [(-110, -95), (-90, -80), (-65, -60), (-50, -30), (-15, 0)] - amp_mask = [(db2amp(db[0]), db2amp(db[1])) for db in db_mask] - return amp_mask - -default_mask = make_amp_mask() - - -def generate_amp_mask(mask_num): - """Generate amplitude domain mask randomly in [-100db, 0db] - - Args: - mask_num: the slot number of the mask - - Returns: - A list of tuple. each tuple defines a slot. - e.g. [(-100, -80), (-65, -60), (-50, -30), (-15, 0)] - for #mask_num = 4 - """ - a = [0] * 2 * mask_num - a[0] = 0 - m = [] - for i in range(1, 2 * mask_num): - a[i] = a[i - 1] + random.uniform(0.5, 1) - max_val = a[2 * mask_num - 1] - for i in range(0, mask_num): - l = ((a[2 * i] - max_val) / max_val) * 100 - r = ((a[2 * i + 1] - max_val) / max_val) * 100 - m.append((l, r)) - return make_amp_mask(m) - - -def make_fence_distortion(conf): - """Generate a fence distortion function - - In this fence-like shape function, the values in mask slots are - set to maxium, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': int,'max_db': float } - 'mask_number': the slot number in mask. - 'max_db': the maxium value. - - Returns: - The fence function, which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - max_db = conf['max_db'] - max_amp = db2amp(max_db) # 0.997 - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def fence_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - return x - - return fence_distortion - -# -def make_jag_distortion(conf): - """Generate a jag distortion function - - In this jag-like shape function, the values in mask slots are - not changed, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': #int} - 'mask_number': the slot number in mask. 
- - Returns: - The jag function,which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def jag_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - return x - - return jag_distortion - -# gaining 20db means amp = amp * 10 -# gaining -20db means amp = amp / 10 -def make_gain_db(conf): - """Generate a db domain gain function - - Args: - conf: a dict {'db': #float} - 'db': the gaining value - - Returns: - The db gain function, which could be applied on - a float amplitude value - """ - db = conf['db'] - - def gain_db(x): - return min(0.997, x * pow(10, db / 20)) - - return gain_db - - -def distort(x, func, rate=0.8): - """Distort a waveform in sample point level - - Args: - x: the origin wavefrom - func: the distort function - rate: sample point-level distort probability - - Returns: - the distorted waveform - """ - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - x[0][i] = func(float(x[0][i])) - return x - -def distort_chain(x, funcs, rate=0.8): - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - for func in funcs: - x[0][i] = func(float(x[0][i])) - return x - -# x is numpy -def distort_wav_conf(x, distort_type, distort_conf, rate=0.1): - if distort_type == 'gain_db': - gain_db = make_gain_db(distort_conf) - x = distort(x, gain_db) - elif distort_type == 'max_distortion': - max_distortion = make_max_distortion(distort_conf) - x = distort(x, max_distortion, rate=rate) - elif distort_type == 'fence_distortion': - fence_distortion = make_fence_distortion(distort_conf) - x = distort(x, fence_distortion, rate=rate) - elif distort_type == 'jag_distortion': - jag_distortion = make_jag_distortion(distort_conf) - x = distort(x, jag_distortion, rate=rate) - elif distort_type == 'poly_distortion': - poly_distortion = make_poly_distortion(distort_conf) - x = distort(x, poly_distortion, rate=rate) - elif distort_type == 'quad_distortion': - quad_distortion = make_quad_distortion() - x = distort(x, quad_distortion, rate=rate) - elif distort_type == 'none_distortion': - pass - else: - print('unsupport type') - return x - -def distort_wav_conf_and_save(distort_type, distort_conf, rate, wav_in, wav_out): - x, sr = torchaudio.load(wav_in) - x = x.detach().numpy() - out = distort_wav_conf(x, distort_type, distort_conf, rate) - torchaudio.save(wav_out, torch.from_numpy(out), sr) - -if __name__ == "__main__": - distort_type = sys.argv[1] - wav_in = sys.argv[2] - wav_out = sys.argv[3] - conf = None - rate = 0.1 - if distort_type == 'new_jag_distortion': - conf = {'mask_number' : 4} - elif distort_type == 'new_fence_distortion': - conf = {'mask_number' : 1, 'max_db' : -30} - elif distort_type == 'poly_distortion': - conf = {'a' : 4, 'm' : 2, "n" : 2} - distort_wav_conf_and_save(distort_type, conf, rate, wav_in, wav_out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/efficient_conformer/attention.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/efficient_conformer/attention.py deleted file mode 100644 index 475131b15af92ffcaf91ad5e2e30d114d4d1a2a3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/efficient_conformer/attention.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple, Optional - -import torch -from torch import nn -import torch.nn.functional as F -from wenet.transformer.attention import MultiHeadedAttention - - -class GroupedRelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: - https://arxiv.org/abs/1901.02860 - https://arxiv.org/abs/2109.01163 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate, group_size=3): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - self.group_size = group_size - self.d_k = n_feat // n_head # for GroupedAttention - self.n_feat = n_feat - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. 
- """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def pad4group(self, Q, K, V, P, mask, group_size: int = 3): - """ - q: (#batch, time1, size) -> (#batch, head, time1, size/head) - k,v: (#batch, time2, size) -> (#batch, head, time2, size/head) - p: (#batch, time2, size) - """ - # Compute Overflows - overflow_Q = Q.size(2) % group_size - overflow_KV = K.size(2) % group_size - - # if-else for ONNX export - # 0 // 0.00000000000000001 = 0 - # 1 // 1.00000000000000001 = 1 - padding_Q = (group_size - overflow_Q) * int( - overflow_Q // (overflow_Q + 0.00000000000000001)) - padding_KV = (group_size - overflow_KV) * int( - overflow_KV // (overflow_KV + 0.00000000000000001)) - - batch_size, _, seq_len_KV, _ = K.size() - - # Input Padding (B, T, D) -> (B, T + P, D) - Q = F.pad(Q, (0, 0, 0, padding_Q), value=0.0) - K = F.pad(K, (0, 0, 0, padding_KV), value=0.0) - V = F.pad(V, (0, 0, 0, padding_KV), value=0.0) - - if mask is not None and mask.size(2) > 0 : # time2 > 0: - mask = mask[:, ::group_size, ::group_size] - - Q = Q.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - K = K.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - V = V.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - # process pos_emb - P_batch_size = P.size(0) - overflow_P = P.size(1) % group_size - padding_P = group_size - overflow_P if overflow_P else 0 - P = F.pad(P, (0, 0, 0, padding_P), value=0.0) - P = P.view(P_batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - return Q, K, V, P, mask, padding_Q - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - padding_q: Optional[int] = None - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - padding_q : for GroupedAttention in efficent conformer - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. 
jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - - # n_feat!=h*d_k may be happened in GroupAttention - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.n_feat) - ) # (batch, time1, d_model) - if padding_q is not None: - # for GroupedAttention in efficent conformer - x = x[:, :x.size(1) - padding_q] - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q = self.linear_q(query) - k = self.linear_k(key) # (#batch, time2, size) - v = self.linear_v(value) - p = self.linear_pos(pos_emb) # (#batch, time2, size) - - batch_size, seq_len_KV, _ = k.size() # seq_len_KV = time2 - - # (#batch, time2, size) -> (#batch, head, time2, size/head) - q = q.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - k = k.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - v = v.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - if cache.size(0) > 0: - # use attention cache - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - new_cache = torch.cat((k, v), dim=-1) - - # May be k and p does not match. eg. time2=18+18/2=27 > mask=36/2=18 - if mask is not None and mask.size(2) > 0: - time2 = mask.size(2) - k = k[:, :, -time2:, :] - v = v[:, :, -time2:, :] - - # q k v p: (batch, head, time1, d_k) - q, k, v, p, mask, padding_q = self.pad4group(q, k, v, p, mask, self.group_size) - - # q_with_bias_u & q_with_bias_v = (batch, head, time1, d_k) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. 
- # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k * self.group_size) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask, padding_q), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/efficient_conformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/efficient_conformer/convolution.py deleted file mode 100644 index 52d6c1c14c0812ab3957a60a135f644833c2ad95..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/efficient_conformer/convolution.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - stride: int = 1): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - stride (int): Stride Convolution, for efficient Conformer - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. 
- # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=stride, # for depthwise_conv in StrideConv - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - self.stride = stride - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - # When export ONNX,the first cache is not None but all-zero, - # cause shape error in residual block, - # eg. cache14 + x9 = 23, 23-7+1=17 != 9 - cache = cache[:, :, -self.lorder:] - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is requried, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - if mask_pad.size(2) != x.size(2): - mask_pad = mask_pad[:, :, ::self.stride] - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/efficient_conformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/efficient_conformer/encoder.py deleted file mode 100644 index dbd37f53cac86be851e2bb194354fd07eb271f11..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/efficient_conformer/encoder.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
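# ---------------------------------------------------------------------------
# [Editor's illustrative sketch -- not part of the original diff.] How the
# ConvolutionModule from the convolution.py deleted above is typically
# invoked, assuming it is importable via the path this encoder uses
# (`wenet.efficient_conformer.convolution`); batch, time and channel sizes
# below are arbitrary example values.
import torch
from wenet.efficient_conformer.convolution import ConvolutionModule

conv = ConvolutionModule(channels=256, kernel_size=15, causal=False)
x = torch.randn(2, 100, 256)                         # (#batch, time, channels)
mask_pad = torch.ones(2, 1, 100, dtype=torch.bool)   # no padded frames
y, new_cache = conv(x, mask_pad=mask_pad)
assert y.shape == (2, 100, 256)                      # same layout as the input
# ---------------------------------------------------------------------------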
-# Modified from EfficientConformer(https://github.com/burchim/EfficientConformer) -# Paper(https://arxiv.org/abs/2109.01163) - -"""Encoder definition.""" -from typing import Tuple, Optional, List, Union - -import torch -import logging -from typeguard import check_argument_types -import torch.nn.functional as F - -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.encoder_layer import ConformerEncoderLayer - -from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 -from wenet.efficient_conformer.convolution import ConvolutionModule -from wenet.efficient_conformer.attention import GroupedRelPositionMultiHeadedAttention -from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer - -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class EfficientConformerEncoder(torch.nn.Module): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - macaron_style: bool = True, - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - stride_layer_idx: Optional[Union[int, List[int]]] = 3, - stride: Optional[Union[int, List[int]]] = 2, - group_layer_idx: Optional[Union[int, List[int], tuple]] = (0, 1, 2, 3), - group_size: int = 3, - stride_kernel: bool = True, - **kwargs - ): - """Construct Efficient Conformer Encoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - macaron_style (bool): Whether to use macaron style for - positionwise layer. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. - stride_layer_idx (list): layer id with StrideConv, start from 0 - stride (list): stride size of each StrideConv in efficient conformer - group_layer_idx (list): layer id with GroupedAttention, start from 0 - group_size (int): group size of every GroupedAttention layer - stride_kernel (bool): default True. True: recompute cnn kernels with stride. 
- """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d2": - subsampling_class = Conv2dSubsampling2 - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - logging.info(f"input_layer = {input_layer}, " - f"subsampling_class = {subsampling_class}") - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - self.input_layer = input_layer - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - activation = get_activation(activation_type) - self.num_blocks = num_blocks - self.attention_heads = attention_heads - self.cnn_module_kernel = cnn_module_kernel - self.global_chunk_size = 0 - self.chunk_feature_map = 0 - - # efficient conformer configs - self.stride_layer_idx = [stride_layer_idx] \ - if type(stride_layer_idx) == int else stride_layer_idx - self.stride = [stride] \ - if type(stride) == int else stride - self.group_layer_idx = [group_layer_idx] \ - if type(group_layer_idx) == int else group_layer_idx - self.grouped_size = group_size # group size of every GroupedAttention layer - - assert len(self.stride) == len(self.stride_layer_idx) - self.cnn_module_kernels = [cnn_module_kernel] # kernel size of each StridedConv - for i in self.stride: - if stride_kernel: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1] // i) - else: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1]) - - logging.info(f"stride_layer_idx= {self.stride_layer_idx}, " - f"stride = {self.stride}, " - f"cnn_module_kernel = {self.cnn_module_kernels}, " - f"group_layer_idx = {self.group_layer_idx}, " - f"grouped_size = {self.grouped_size}") - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - - # encoder definition - index = 0 - layers = [] - for i in range(num_blocks): - # self-attention module definition - if i in self.group_layer_idx: - encoder_selfattn_layer = GroupedRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - self.grouped_size) - else: - if pos_enc_layer_type == "no_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate) - - # conformer module definition - if i in self.stride_layer_idx: - # conformer block with downsampling - convolution_layer_args_stride = ( - output_size, 
self.cnn_module_kernels[index], activation, - cnn_module_norm, causal, True, self.stride[index]) - layers.append(StrideConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_stride) if use_cnn_module else None, - torch.nn.AvgPool1d( - kernel_size=self.stride[index], stride=self.stride[index], - padding=0, ceil_mode=True, - count_include_pad=False), # pointwise_conv_layer - dropout_rate, - normalize_before, - concat_after, - )) - index = index + 1 - else: - # conformer block - convolution_layer_args_normal = ( - output_size, self.cnn_module_kernels[index], activation, - cnn_module_norm, causal) - layers.append(ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_normal) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - )) - - self.encoders = torch.nn.ModuleList(layers) - - def set_global_chunk_size(self, chunk_size): - """Used in ONNX export. - """ - logging.info(f"set global chunk size: {chunk_size}, default is 0.") - self.global_chunk_size = chunk_size - if self.embed.subsampling_rate == 2: - self.chunk_feature_map = 2 * self.global_chunk_size + 1 - elif self.embed.subsampling_rate == 6: - self.chunk_feature_map = 6 * self.global_chunk_size + 5 - elif self.embed.subsampling_rate == 8: - self.chunk_feature_map = 8 * self.global_chunk_size + 7 - else: - self.chunk_feature_map = 4 * self.global_chunk_size + 3 - - def output_size(self) -> int: - return self._output_size - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - index = 0 # traverse stride - for i, layer in enumerate(self.encoders): - # layer return : x, mask, new_att_cache, new_cnn_cache - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if i in self.stride_layer_idx: - masks = masks[:, :, ::self.stride[index]] - chunk_masks = chunk_masks[:, ::self.stride[index], - ::self.stride[index]] - mask_pad = masks - pos_emb = pos_emb[:, ::self.stride[index], :] - index = index + 1 - - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask : mask matrix of self attention - - Returns: - torch.Tensor: output of current input xs - torch.Tensor: subsampling cache required for next chunk computation - List[torch.Tensor]: encoder layers output cache required for next - chunk computation - List[torch.Tensor]: conformer cnn cache - - """ - assert xs.size(0) == 1 - - # using downsampling factor to recover offset - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - chunk_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - chunk_masks = chunk_masks.unsqueeze(1) # (1, 1, xs-time) - - real_len = 0 - if self.global_chunk_size > 0: - # for ONNX decode simulation, padding xs to chunk_size - real_len = xs.size(1) - pad_len = self.chunk_feature_map - real_len - xs = F.pad(xs, (0, 0, 0, pad_len), value=0.0) - chunk_masks = F.pad(chunk_masks, (0, pad_len), value=0.0) - - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, chunk_masks = self.embed(xs, chunk_masks, offset) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - # shape(pos_emb) = (b=1, chunk_size, emb_size=output_size=hidden-dim) - - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) # batchPad (b=1, 1, time=chunk_size) - - if self.global_chunk_size > 0: - # for ONNX decode simulation - pos_emb = self.embed.position_encoding( - offset=max(offset - cache_t1, 0), - size=cache_t1 + self.global_chunk_size) - att_mask[:, :, -self.global_chunk_size:] = chunk_masks - mask_pad = chunk_masks.to(torch.bool) - else: - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(2) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - att_cache_trunc = xs.size(1) + \ - att_cache.size(2) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i:i + 1, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # 
shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [1, batch, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(0) - - # use repeat_interleave to new_att_cache - new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :]) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.global_chunk_size > 0 and real_len: - chunk_real_len = real_len // self.embed.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - # Keeping 1 more timestep can mitigate information leakage - # from the encoder caused by the padding - xs = xs[:, :chunk_real_len + 1, :] - - return xs, r_att_cache, r_cnn_cache - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - use_onnx=False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. - Args: - xs (torch.Tensor): (1, max_len, dim) - decoding_chunk_size (int): decoding chunk size - num_decoding_left_chunks (int): - use_onnx (bool): True for simulating ONNX model inference. 
- """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if use_onnx: - logging.info("Simulating for ONNX runtime ...") - att_cache: torch.Tensor = torch.zeros( - (self.num_blocks, self.attention_heads, required_cache_size, - self.output_size() // self.attention_heads * 2), - device=xs.device) - cnn_cache: torch.Tensor = torch.zeros( - (self.num_blocks, 1, self.output_size(), self.cnn_module_kernel - 1), - device=xs.device) - self.set_global_chunk_size(chunk_size=decoding_chunk_size) - else: - logging.info("Simulating for JIT runtime ...") - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - logging.info(f"-->> frame chunk msg: cur={cur}, " - f"end={end}, num_frames={end-cur}, " - f"decoding_window={decoding_window}") - if use_onnx: - att_mask: torch.Tensor = torch.ones( - (1, 1, required_cache_size + decoding_chunk_size), - dtype=torch.bool, device=xs.device) - if cur == 0: - att_mask[:, :, :required_cache_size] = 0 - else: - att_mask: torch.Tensor = torch.ones( - (0, 0, 0), dtype=torch.bool, device=xs.device) - - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - outputs.append(y) - offset += y.size(1) - - ys = torch.cat(outputs, 1) - masks = torch.ones(1, 1, ys.size(1), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/efficient_conformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/efficient_conformer/encoder_layer.py deleted file mode 100644 index 3a88ec9fca9797664ce89566e6c1d28a8f0ad5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/efficient_conformer/encoder_layer.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple -import torch -from torch import nn - - -class StrideConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. 
- self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - pointwise_conv_layer: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.pointwise_conv_layer = pointwise_conv_layer - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - - # add pointwise_conv for efficient conformer - # pointwise_conv_layer does not change shape - if self.pointwise_conv_layer is not None: - residual = residual.transpose(1, 2) - residual = self.pointwise_conv_layer(residual) - residual = residual.transpose(1, 2) - assert residual.size(0) == x.size(0) - assert residual.size(1) == x.size(1) - assert residual.size(2) == x.size(2) - - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/efficient_conformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/efficient_conformer/subsampling.py deleted file mode 100644 index 98b2c2228eac8e77586110686c48a7b0141458c9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/efficient_conformer/subsampling.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch -from wenet.transformer.subsampling import BaseSubsampling - - -class Conv2dSubsampling2(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU() - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * ((idim - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 2 - # 2 = (3 - 1) * 1 - self.right_context = 2 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 2. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 2. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, :-2:2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/squeezeformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/squeezeformer/attention.py deleted file mode 100644 index 97412badbe8e2c5caec81c0636d15be3f80d6b84..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/squeezeformer/attention.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 Ximalaya Inc. (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -import torch -import torch.nn as nn -from wenet.transformer.attention import MultiHeadedAttention -from typing import Tuple - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- """ - - def __init__(self, n_head, n_feat, dropout_rate, - do_rel_shift=False, adaptive_scale=False, init_weights=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.do_rel_shift = do_rel_shift - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - self.adaptive_scale = adaptive_scale - self.ada_scale = nn.Parameter( - torch.ones([1, 1, n_feat]), requires_grad=adaptive_scale) - self.ada_bias = nn.Parameter( - torch.zeros([1, 1, n_feat]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - input_max = (self.h * self.d_k) ** -0.5 - torch.nn.init.uniform_(self.linear_q.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_q.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_pos.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.bias, -input_max, input_max) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. 
pytorch training - if mask.size(2) > 0: # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - # (batch, head, time1, time2) - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - if self.adaptive_scale: - query = self.ada_scale * query + self.ada_bias - key = self.ada_scale * key + self.ada_bias - value = self.ada_scale * value + self.ada_bias - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - if self.do_rel_shift: - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/squeezeformer/conv2d.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/squeezeformer/conv2d.py deleted file mode 100644 index c230263396392d72f36c56d645338f2d576db898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/squeezeformer/conv2d.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Conv2d Module with Valid Padding""" - -import torch.nn.functional as F -from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional - - -class Conv2dValid(_ConvNd): - """ - Conv2d operator for VALID mode padding. 
- """ - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', # TODO: refine this type - device=None, - dtype=None, - valid_trigx: bool = False, - valid_trigy: bool = False - ) -> None: - factory_kwargs = {'device': device, 'dtype': dtype} - kernel_size_ = _pair(kernel_size) - stride_ = _pair(stride) - padding_ = padding if isinstance(padding, str) else _pair(padding) - dilation_ = _pair(dilation) - super(Conv2dValid, self).__init__( - in_channels, out_channels, kernel_size_, - stride_, padding_, dilation_, False, _pair(0), - groups, bias, padding_mode, **factory_kwargs) - self.valid_trigx = valid_trigx - self.valid_trigy = valid_trigy - - def _conv_forward( - self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): - validx, validy = 0, 0 - if self.valid_trigx: - validx = (input.size(-2) * (self.stride[-2] - 1) - 1 - + self.kernel_size[-2]) // 2 - if self.valid_trigy: - validy = (input.size(-1) * (self.stride[-1] - 1) - 1 - + self.kernel_size[-1]) // 2 - return F.conv2d(input, weight, bias, self.stride, - (validx, validy), self.dilation, self.groups) - - def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight, self.bias) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/squeezeformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/squeezeformer/convolution.py deleted file mode 100644 index 6da2ee8c98ed58fae66d66c892041037f0d6bc3a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/squeezeformer/convolution.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. 
- causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - self.bias = bias - self.channels = channels - self.kernel_size = kernel_size - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, channels]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, channels]), requires_grad=adaptive_scale) - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - if init_weights: - self.init_weights() - - def init_weights(self): - pw_max = self.channels ** -0.5 - dw_max = self.kernel_size ** -0.5 - torch.nn.init.uniform_(self.pointwise_conv1.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv1.bias.data, -pw_max, pw_max) - torch.nn.init.uniform_(self.depthwise_conv.weight.data, -dw_max, dw_max) - if self.bias: - torch.nn.init.uniform_(self.depthwise_conv.bias.data, -dw_max, dw_max) - torch.nn.init.uniform_(self.pointwise_conv2.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv2.bias.data, -pw_max, pw_max) - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - if self.adaptive_scale: - x = self.ada_scale * x + self.ada_bias - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/squeezeformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/squeezeformer/encoder.py deleted file mode 100644 index f13038321ae6c07d484a617aee7d83ed07742510..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/squeezeformer/encoder.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -import torch -import torch.nn as nn -from typing import Tuple, Union, Optional, List -from wenet.squeezeformer.subsampling \ - import DepthwiseConv2dSubsampling4, TimeReductionLayer1D, \ - TimeReductionLayer2D, TimeReductionLayerStream -from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.attention import MultiHeadedAttention -from wenet.squeezeformer.attention import RelPositionMultiHeadedAttention -from wenet.squeezeformer.positionwise_feed_forward \ - import PositionwiseFeedForward -from wenet.squeezeformer.convolution import ConvolutionModule -from wenet.utils.mask import make_pad_mask, add_optional_chunk_mask -from wenet.utils.common import get_activation - - -class SqueezeformerEncoder(nn.Module): - def __init__( - self, - input_size: int = 80, - encoder_dim: int = 256, - output_size: int = 256, - attention_heads: int = 4, - num_blocks: int = 12, - reduce_idx: Optional[Union[int, List[int]]] = 5, - recover_idx: Optional[Union[int, List[int]]] = 11, - feed_forward_expansion_factor: int = 4, - dw_stride: bool = False, - input_dropout_rate: float = 0.1, - pos_enc_layer_type: str = "rel_pos", - time_reduction_layer_type: str = "conv1d", - do_rel_shift: bool = True, - feed_forward_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.1, - cnn_module_kernel: int = 31, - cnn_norm_type: str = "batch_norm", - dropout: float = 0.1, - causal: bool = False, - adaptive_scale: bool = True, - activation_type: str = "swish", - init_weights: bool = True, - global_cmvn: torch.nn.Module = None, - normalize_before: bool = False, - use_dynamic_chunk: bool = False, - concat_after: bool = False, - 
static_chunk_size: int = 0, - use_dynamic_left_chunk: bool = False - ): - """Construct SqueezeformerEncoder - - Args: - input_size to use_dynamic_chunk, see in Transformer BaseEncoder. - encoder_dim (int): The hidden dimension of encoder layer. - output_size (int): The output dimension of final projection layer. - attention_heads (int): Num of attention head in attention module. - num_blocks (int): Num of encoder layers. - reduce_idx Optional[Union[int, List[int]]]: - reduce layer index, from 40ms to 80ms per frame. - recover_idx Optional[Union[int, List[int]]]: - recover layer index, from 80ms to 40ms per frame. - feed_forward_expansion_factor (int): Enlarge coefficient of FFN. - dw_stride (bool): Whether do depthwise convolution - on subsampling module. - input_dropout_rate (float): Dropout rate of input projection layer. - pos_enc_layer_type (str): Self attention type. - time_reduction_layer_type (str): Conv1d or Conv2d reduction layer. - do_rel_shift (bool): Whether to do relative shift - operation on rel-attention module. - cnn_module_kernel (int): Kernel size of CNN module. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - adaptive_scale (bool): Whether to use adaptive scale. - init_weights (bool): Whether to initialize weights. - causal (bool): whether to use causal convolution or not. - """ - super(SqueezeformerEncoder, self).__init__() - self.global_cmvn = global_cmvn - self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ - if type(reduce_idx) == int else reduce_idx - self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \ - if type(recover_idx) == int else recover_idx - self.check_ascending_list() - if reduce_idx is None: - self.time_reduce = None - else: - if recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - self.reduce_stride = 2 - self._output_size = output_size - self.normalize_before = normalize_before - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - self.pos_enc_layer_type = pos_enc_layer_type - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - encoder_dim, - attention_dropout_rate, - do_rel_shift, - adaptive_scale, - init_weights - ) - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - encoder_dim, - encoder_dim * feed_forward_expansion_factor, - feed_forward_dropout_rate, - activation, - adaptive_scale, - init_weights - ) - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = ( - encoder_dim, cnn_module_kernel, activation, - cnn_norm_type, causal, True, adaptive_scale, init_weights) - - self.embed = DepthwiseConv2dSubsampling4( - 1, encoder_dim, - RelPositionalEncoding(encoder_dim, dropout_rate=0.1), - dw_stride, - input_size, - input_dropout_rate, - init_weights - ) - - self.preln = nn.LayerNorm(encoder_dim) - self.encoders = 
torch.nn.ModuleList([SqueezeformerEncoderLayer( - encoder_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - convolution_layer(*convolution_layer_args), - positionwise_layer(*positionwise_layer_args), - normalize_before, - dropout, - concat_after) for _ in range(num_blocks) - ]) - if time_reduction_layer_type == 'conv1d': - time_reduction_layer = TimeReductionLayer1D - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - elif time_reduction_layer_type == 'stream': - time_reduction_layer = TimeReductionLayerStream - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - else: - time_reduction_layer = TimeReductionLayer2D - time_reduction_layer_args = {'encoder_dim': encoder_dim} - - self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args) - self.time_recover_layer = nn.Linear(encoder_dim, encoder_dim) - self.final_proj = None - if output_size != encoder_dim: - self.final_proj = nn.Linear(encoder_dim, output_size) - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - xs_lens = mask_pad.squeeze(1).sum(1) - xs = self.preln(xs) - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - for i, layer in enumerate(self.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, chunk_masks, pos_emb, mask_pad)) - xs, xs_lens, chunk_masks, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_chunk_masks, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - chunk_masks = recover_chunk_masks - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - xs = xs.masked_fill(~mask_pad[:, 0, :].unsqueeze(-1), 0.0) - - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return xs, masks - - def check_ascending_list(self): - if self.reduce_idx is not None: - assert self.reduce_idx == sorted(self.reduce_idx), \ - "reduce_idx should be int or ascending list" - if self.recover_idx is not None: - assert self.recover_idx == sorted(self.recover_idx), \ - "recover_idx should be int or ascending list" - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp 
= exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - recover_exp)) - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.preln(xs) - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, 
recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - if att_mask.size(1) != 0: - xs = xs.masked_fill(~att_mask[:, 0, :].unsqueeze(-1), 0.0) - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(0) - cached_att = cached_att.unsqueeze(3).\ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :]) - r_cnn_cache.append(cached_cnn) - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/squeezeformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/squeezeformer/encoder_layer.py deleted file mode 100644 index 3c6bdd44a20447cea91c0f965c666b844f4264be..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/squeezeformer/encoder_layer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""SqueezeformerEncoderLayer definition.""" - -import torch -import torch.nn as nn -from typing import Optional, Tuple - - -class SqueezeformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward1 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - feed_forward2 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. 
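A quick worked example of the windowing arithmetic in `forward_chunk_by_chunk` above: the numbers below are illustrative only and assume the 1/4-rate front end used by this encoder (`subsampling_rate = 4`, `right_context = 6`).

```python
# Illustrative sketch only: how forward_chunk_by_chunk slices the input frames.
# The subsampling/right_context values are assumptions matching the depthwise
# 2D subsampling module defined later in this diff.
subsampling = 4
context = 6 + 1                                    # right_context + 1 (current frame)
decoding_chunk_size = 16
stride = subsampling * decoding_chunk_size                            # 64
decoding_window = (decoding_chunk_size - 1) * subsampling + context   # 67

num_frames = 200
windows = [(cur, min(cur + decoding_window, num_frames))
           for cur in range(0, num_frames - context + 1, stride)]
print(windows)  # [(0, 67), (64, 131), (128, 195), (192, 200)] -> overlapping chunks
```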
- """ - - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward1: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - feed_forward2: Optional[nn.Module] = None, - normalize_before: bool = False, - dropout_rate: float = 0.1, - concat_after: bool = False, - ): - super(SqueezeformerEncoderLayer, self).__init__() - self.size = size - self.self_attn = self_attn - self.layer_norm1 = nn.LayerNorm(size) - self.ffn1 = feed_forward1 - self.layer_norm2 = nn.LayerNorm(size) - self.conv_module = conv_module - self.layer_norm3 = nn.LayerNorm(size) - self.ffn2 = feed_forward2 - self.layer_norm4 = nn.LayerNorm(size) - self.normalize_before = normalize_before - self.dropout = nn.Dropout(dropout_rate) - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - # self attention module - residual = x - if self.normalize_before: - x = self.layer_norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.layer_norm1(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm2(x) - x = self.ffn1(x) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm2(x) - - # conv module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - residual = x - if self.normalize_before: - x = self.layer_norm3(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm3(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm4(x) - x = self.ffn2(x) - # we do not use dropout here since it is inside feed forward function - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm4(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/squeezeformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/squeezeformer/positionwise_feed_forward.py deleted file mode 100644 index 289062dcf3189f79a5ebb206990160d8665c613c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/squeezeformer/positionwise_feed_forward.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU(), - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.idim = idim - self.hidden_units = hidden_units - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - self.ada_scale = None - self.ada_bias = None - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, idim]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, idim]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - ffn1_max = self.idim ** -0.5 - ffn2_max = self.hidden_units ** -0.5 - torch.nn.init.uniform_(self.w_1.weight.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_1.bias.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_2.weight.data, -ffn2_max, ffn2_max) - torch.nn.init.uniform_(self.w_2.bias.data, -ffn2_max, ffn2_max) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - if self.adaptive_scale: - xs = self.ada_scale * xs + self.ada_bias - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/squeezeformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/squeezeformer/subsampling.py deleted file mode 100644 index fdb0101d6ebb54c42e710bbb0f35a6f7615ca567..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/squeezeformer/subsampling.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
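The position-wise FFN deleted above preserves the model dimension and, when `adaptive_scale` is enabled, rescales the input with a learned per-feature scale and bias before the two linear layers. A minimal stand-alone sketch of that computation (not the wenet class itself; sizes are arbitrary and dropout is omitted):

```python
import torch

# Minimal sketch of the squeezeformer position-wise FFN computation.
idim, hidden_units = 8, 32
w_1 = torch.nn.Linear(idim, hidden_units)
w_2 = torch.nn.Linear(hidden_units, idim)
ada_scale = torch.nn.Parameter(torch.ones(1, 1, idim))   # adaptive_scale=True path
ada_bias = torch.nn.Parameter(torch.zeros(1, 1, idim))

xs = torch.randn(2, 5, idim)                 # (B, L, D)
xs = ada_scale * xs + ada_bias               # learned input rescaling
out = w_2(torch.relu(w_1(xs)))               # output keeps shape (B, L, D)
print(out.shape)                             # torch.Size([2, 5, 8])
```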
-# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -"""DepthwiseConv2dSubsampling4 and TimeReductionLayer definition.""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -from wenet.transformer.subsampling import BaseSubsampling -from typing import Tuple -from wenet.squeezeformer.conv2d import Conv2dValid - - -class DepthwiseConv2dSubsampling4(BaseSubsampling): - """Depthwise Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - pos_enc_class (nn.Module): position encoding class. - dw_stride (int): Whether do depthwise convolution. - input_size (int): filter bank dimension. - - """ - - def __init__( - self, idim: int, odim: int, - pos_enc_class: torch.nn.Module, - dw_stride: bool = False, - input_size: int = 80, - input_dropout_rate: float = 0.1, - init_weights: bool = True - ): - super(DepthwiseConv2dSubsampling4, self).__init__() - self.idim = idim - self.odim = odim - self.pw_conv = nn.Conv2d( - in_channels=idim, out_channels=odim, kernel_size=3, stride=2) - self.act1 = nn.ReLU() - self.dw_conv = nn.Conv2d( - in_channels=odim, out_channels=odim, kernel_size=3, stride=2, - groups=odim if dw_stride else 1 - ) - self.act2 = nn.ReLU() - self.pos_enc = pos_enc_class - self.input_proj = nn.Sequential( - nn.Linear( - odim * (((input_size - 1) // 2 - 1) // 2), odim), - nn.Dropout(p=input_dropout_rate), - ) - if init_weights: - linear_max = (odim * input_size / 4) ** -0.5 - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.weight'], -linear_max, linear_max) - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.bias'], -linear_max, linear_max) - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: int = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.pw_conv(x) - x = self.act1(x) - x = self.dw_conv(x) - x = self.act2(x) - b, c, t, f = x.size() - x = x.permute(0, 2, 1, 3) - x = x.contiguous().view(b, t, c * f) - x, pos_emb = self.pos_enc(x, offset) - x = self.input_proj(x) - return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] - - -class TimeReductionLayer1D(nn.Module): - """ - Modified NeMo, - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. 
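The shapes implied by `DepthwiseConv2dSubsampling4` above can be checked quickly; the sketch below is illustrative and simply replays the module's index arithmetic (two stride-2 convolutions, so roughly a quarter of the frames and of the 80-dim filterbank axis survive).

```python
import torch

# Illustrative check of the 1/4 subsampling arithmetic (not part of the diff).
input_size = 80
freq_after = ((input_size - 1) // 2 - 1) // 2          # 19 features per channel
x_mask = torch.ones(1, 1, 100, dtype=torch.bool)       # (B, 1, T) with T=100
t_after = x_mask[:, :, :-2:2][:, :, :-2:2].size(-1)    # 24 frames remain
print(freq_after, t_after)   # the input_proj Linear therefore sees odim * 19 features
```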
- """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 5, stride: int = 2): - super(TimeReductionLayer1D, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - self.padding = max(0, self.kernel_size - self.stride) - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. - if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayer2D(nn.Module): - def __init__( - self, kernel_size: int = 5, stride: int = 2, encoder_dim: int = 256): - super(TimeReductionLayer2D, self).__init__() - self.encoder_dim = encoder_dim - self.kernel_size = kernel_size - self.dw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=(kernel_size, 1), - stride=stride, - valid_trigy=True - ) - self.pw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=1, - stride=1, - valid_trigx=False, - valid_trigy=False, - ) - - self.kernel_size = kernel_size - self.stride = stride - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.encoder_dim ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward( - self, xs: torch.Tensor, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - xs = xs.masked_fill(mask_pad.transpose(1, 2).eq(0), 0.0) - xs = xs.unsqueeze(2) - padding1 = self.kernel_size - self.stride - xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), - mode='constant', value=0.) 
- xs = self.dw_conv(xs.permute(0, 3, 1, 2)) - xs = self.pw_conv(xs).permute(0, 3, 2, 1).squeeze(1).contiguous() - tmp_length = xs.size(1) - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - padding2 = max(0, (xs_lens.max() - tmp_length).data.item()) - batch_size, hidden = xs.size(0), xs.size(-1) - dummy_pad = torch.zeros(batch_size, padding2, hidden, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - mask = mask[:, ::2, ::2] - mask_pad = mask_pad[:, :, ::2] - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayerStream(nn.Module): - """ - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. - """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 1, stride: int = 2): - super(TimeReductionLayerStream, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=0, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. 
- if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transducer/joint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transducer/joint.py deleted file mode 100644 index f7cbaf62ee0bf4ffa127e5bbf4a49a64c2378495..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transducer/joint.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Optional - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation - - -class TransducerJoint(torch.nn.Module): - - def __init__(self, - voca_size: int, - enc_output_size: int, - pred_output_size: int, - join_dim: int, - prejoin_linear: bool = True, - postjoin_linear: bool = False, - joint_mode: str = 'add', - activation: str = "tanh"): - assert check_argument_types() - # TODO(Mddct): concat in future - assert joint_mode in ['add'] - super().__init__() - - self.activatoin = get_activation(activation) - self.prejoin_linear = prejoin_linear - self.postjoin_linear = postjoin_linear - self.joint_mode = joint_mode - - if not self.prejoin_linear and not self.postjoin_linear: - assert enc_output_size == pred_output_size == join_dim - # torchscript compatibility - self.enc_ffn: Optional[nn.Linear] = None - self.pred_ffn: Optional[nn.Linear] = None - if self.prejoin_linear: - self.enc_ffn = nn.Linear(enc_output_size, join_dim) - self.pred_ffn = nn.Linear(pred_output_size, join_dim) - # torchscript compatibility - self.post_ffn: Optional[nn.Linear] = None - if self.postjoin_linear: - self.post_ffn = nn.Linear(join_dim, join_dim) - - self.ffn_out = nn.Linear(join_dim, voca_size) - - def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor): - """ - Args: - enc_out (torch.Tensor): [B, T, E] - pred_out (torch.Tensor): [B, T, P] - Return: - [B,T,U,V] - """ - if (self.prejoin_linear and self.enc_ffn is not None - and self.pred_ffn is not None): - enc_out = self.enc_ffn(enc_out) # [B,T,E] -> [B,T,V] - pred_out = self.pred_ffn(pred_out) - - enc_out = enc_out.unsqueeze(2) # [B,T,V] -> [B,T,1,V] - pred_out = pred_out.unsqueeze(1) # [B,U,V] -> [B,1 U, V] - - # TODO(Mddct): concat joint - _ = self.joint_mode - out = enc_out + pred_out # [B,T,U,V] - - if self.postjoin_linear and self.post_ffn is not None: - out = self.post_ffn(out) - - out = self.activatoin(out) - out = self.ffn_out(out) - return out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transducer/predictor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transducer/predictor.py deleted file mode 100644 index 600e97a9d83646047ec3fc14f3087bd4df761c68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transducer/predictor.py +++ /dev/null @@ -1,482 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation, get_rnn - - -def ApplyPadding(input, padding, pad_value) -> torch.Tensor: - """ - Args: - input: [bs, max_time_step, dim] - padding: [bs, max_time_step] - """ - return 
padding * pad_value + input * (1 - padding) - - -class PredictorBase(torch.nn.Module): - - # NOTE(Mddct): We can use ABC abstract here, but - # keep this class simple enough for now - def __init__(self) -> None: - super().__init__() - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - _, _, _ = batch_size, method, device - raise NotImplementedError("this is a base precictor") - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ): - _, _, = input, cache - raise NotImplementedError("this is a base precictor") - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - _, _, _, = input, padding, cache - raise NotImplementedError("this is a base precictor") - - -class RNNPredictor(PredictorBase): - - def __init__(self, - voca_size: int, - embed_size: int, - output_size: int, - embed_dropout: float, - hidden_size: int, - num_layers: int, - bias: bool = True, - rnn_type: str = "lstm", - dropout: float = 0.1) -> None: - assert check_argument_types() - super().__init__() - self.n_layers = num_layers - self.hidden_size = hidden_size - # disable rnn base out projection - self.embed = nn.Embedding(voca_size, embed_size) - self.dropout = nn.Dropout(embed_dropout) - # NOTE(Mddct): rnn base from torch not support layer norm - # will add layer norm and prune value in cell and layer - # ref: https://github.com/Mddct/neural-lm/blob/main/models/gru_cell.py - self.rnn = get_rnn(rnn_type=rnn_type)(input_size=embed_size, - hidden_size=hidden_size, - num_layers=num_layers, - bias=bias, - batch_first=True, - dropout=dropout) - self.projection = nn.Linear(hidden_size, output_size) - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> torch.Tensor: - """ - Args: - input (torch.Tensor): [batch, max_time). - padding (torch.Tensor): [batch, max_time] - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - Returns: - output: [batch, max_time, output_size] - """ - - # NOTE(Mddct): we don't use pack input format - embed = self.embed(input) # [batch, max_time, emb_size] - embed = self.dropout(embed) - states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None - if cache is None: - state = self.init_state(batch_size=input.size(0), - device=input.device) - states = (state[0], state[1]) - else: - assert len(cache) == 2 - states = (cache[0], cache[1]) - out, (m, c) = self.rnn(embed, states) - out = self.projection(out) - - # NOTE(Mddct): Although we don't use staate in transducer - # training forward, we need make it right for padding value - # so we create forward_step for infering, forward for training - _, _ = m, c - return out - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache: [state_m, state_c] - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - Returns: - new_cache: [[state_m_1, state_c_1], [state_m_2, state_c_2]...] 
- """ - assert len(cache) == 2 - state_ms = cache[0] - state_cs = cache[1] - - assert state_ms.size(1) == state_cs.size(1) - - new_cache: List[List[torch.Tensor]] = [] - for state_m, state_c in zip(torch.split(state_ms, 1, dim=1), - torch.split(state_cs, 1, dim=1)): - new_cache.append([state_m, state_c]) - return new_cache - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[state_m_1, state_c_1], [state_m_1, state_c_1]...] - - Returns: - new_caceh: [state_ms, state_cs], - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - """ - state_ms = torch.cat([states[0] for states in cache], dim=1) - state_cs = torch.cat([states[1] for states in cache], dim=1) - return [state_ms, state_cs] - - def init_state( - self, - batch_size: int, - device: torch.device, - method: str = "zero", - ) -> List[torch.Tensor]: - assert batch_size > 0 - # TODO(Mddct): xavier init method - _ = method - return [ - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device), - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device) - ] - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - """ - assert len(cache) == 2 - state_m, state_c = cache[0], cache[1] - embed = self.embed(input) # [batch, 1, emb_size] - embed = self.dropout(embed) - out, (m, c) = self.rnn(embed, (state_m, state_c)) - - out = self.projection(out) - m = ApplyPadding(m, padding.unsqueeze(0), state_m) - c = ApplyPadding(c, padding.unsqueeze(0), state_c) - - return (out, [m, c]) - - -class EmbeddingPredictor(PredictorBase): - """Embedding predictor - - Described in: - https://arxiv.org/pdf/2109.07513.pdf - - embed-> proj -> layer norm -> swish - """ - - def __init__(self, - voca_size: int, - embed_size: int, - embed_dropout: float, - n_head: int, - history_size: int = 2, - activation: str = "swish", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - - assert check_argument_types() - super().__init__() - # multi head - self.num_heads = n_head - self.embed_size = embed_size - self.context_size = history_size + 1 - self.pos_embed = torch.nn.Linear(embed_size * self.context_size, - self.num_heads, - bias=bias) - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.ffn = nn.Linear(self.embed_size, self.embed_size) - self.norm = nn.LayerNorm(self.embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - _ = method - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device), - ] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] 
- """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - - input = input.unfold(1, self.context_size, 1).permute( - 0, 1, 3, 2) # [bs, seq_len, context_size, embed] - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - # broadcast dot attenton - input_expand = input.unsqueeze( - 2) # [bs, seq_len, 1, context_size, embed] - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - - # [bs, seq_len, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, seq_len, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, seq_len, num_heads, embed] - output = output.sum(dim=2) # [bs, seq_len, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - return output - - def forward_step( - self, - input: torch.Tensor, - padding: torch.Tensor, - cache: List[torch.Tensor], - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input_expand = context_input.unsqueeze(1).unsqueeze( - 2) # [bs, 1, 1, context_size, embed] - - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - # [bs, 1, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, 1, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, 1, num_heads, embed] - output = output.sum(dim=2) # [bs, 1, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - new_cache = context_input[:, 1:, :] - # TODO(Mddct): we need padding new_cache in future - # new_cache = ApplyPadding(history, padding, new_cache) - return (output, [new_cache]) - - -class ConvPredictor(PredictorBase): - - def __init__(self, - voca_size: 
int, - embed_size: int, - embed_dropout: float, - history_size: int = 2, - activation: str = "relu", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - assert check_argument_types() - super().__init__() - - assert history_size >= 0 - self.embed_size = embed_size - self.context_size = history_size + 1 - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.conv = nn.Conv1d(in_channels=embed_size, - out_channels=embed_size, - kernel_size=self.context_size, - padding=0, - groups=embed_size, - bias=bias) - self.norm = nn.LayerNorm(embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - assert method == "zero" - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device) - ] - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] - """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - input = input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - return out - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input = context_input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - - new_cache = context_input[:, 1:, :] - # TODO(Mddct): apply padding in future - return (out, [new_cache]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transducer/search/greedy_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transducer/search/greedy_search.py deleted file mode 100644 index ef7354562b6617b7be33bf32d673117eb1d3d547..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transducer/search/greedy_search.py +++ /dev/null @@ -1,54 +0,0 
@@ -from typing import List - -import torch - - -def basic_greedy_search( - model: torch.nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - n_steps: int = 64, -) -> List[List[int]]: - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, - method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = n_steps - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, padding, - cache) # [1, 1, P] - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, - pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - - return [hyps] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transducer/search/prefix_beam_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transducer/search/prefix_beam_search.py deleted file mode 100644 index f00917717c16a73916586708ebfede54fa02a21f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transducer/search/prefix_beam_search.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import List, Tuple - -import torch -from wenet.utils.common import log_add - - -class Sequence(): - - __slots__ = {'hyp', 'score', 'cache'} - - def __init__( - self, - hyp: List[torch.Tensor], - score, - cache: List[torch.Tensor], - ): - self.hyp = hyp - self.score = score - self.cache = cache - - -class PrefixBeamSearch(): - - def __init__(self, encoder, predictor, joint, ctc, blank): - self.encoder = encoder - self.predictor = predictor - self.joint = joint - self.ctc = ctc - self.blank = blank - - def forward_decoder_one_step( - self, encoder_x: torch.Tensor, pre_t: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - padding = torch.zeros(pre_t.size(0), 1, device=encoder_x.device) - pre_t, new_cache = self.predictor.forward_step(pre_t.unsqueeze(-1), - padding, cache) - x = self.joint(encoder_x, pre_t) # [beam, 1, 1, vocab] - x = x.log_softmax(dim=-1) - return x, new_cache - - def prefix_beam_search(self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7): - """prefix beam search - also see wenet.transducer.transducer.beam_search - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = 
speech.shape[0] - assert batch_size == 1 - - # 1. Encoder - encoder_out, _ = self.encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - - ctc_probs = self.ctc.log_softmax(encoder_out).squeeze(0) - beam_init: List[Sequence] = [] - - # 2. init beam using Sequence to save beam unit - cache = self.predictor.init_state(1, method="zero", device=device) - beam_init.append(Sequence(hyp=[self.blank], score=0.0, cache=cache)) - # 3. start decoding (notice: we use breathwise first searching) - # !!!! In this decoding method: one frame do not output multi units. !!!! - # !!!! Experiments show that this strategy has little impact !!!! - for i in range(maxlen): - # 3.1 building input - # decoder taking the last token to predict the next token - input_hyp = [s.hyp[-1] for s in beam_init] - input_hyp_tensor = torch.tensor(input_hyp, - dtype=torch.int, - device=device) - # building statement from beam - cache_batch = self.predictor.cache_to_batch( - [s.cache for s in beam_init]) - # build score tensor to do torch.add() function - scores = torch.tensor([s.score for s in beam_init]).to(device) - - # 3.2 forward decoder - logp, new_cache = self.forward_decoder_one_step( - encoder_out[:, i, :].unsqueeze(1), - input_hyp_tensor, - cache_batch, - ) # logp: (N, 1, 1, vocab_size) - logp = logp.squeeze(1).squeeze(1) # logp: (N, vocab_size) - new_cache = self.predictor.batch_to_cache(new_cache) - - # 3.3 shallow fusion for transducer score - # and ctc score where we can also add the LM score - logp = torch.log( - torch.add(transducer_weight * torch.exp(logp), - ctc_weight * torch.exp(ctc_probs[i].unsqueeze(0)))) - - # 3.4 first beam prune - top_k_logp, top_k_index = logp.topk(beam_size) # (N, N) - scores = torch.add(scores.unsqueeze(1), top_k_logp) - - # 3.5 generate new beam (N*N) - beam_A = [] - for j in range(len(beam_init)): - # update seq - base_seq = beam_init[j] - for t in range(beam_size): - # blank: only update the score - if top_k_index[j, t] == self.blank: - new_seq = Sequence(hyp=base_seq.hyp.copy(), - score=scores[j, t].item(), - cache=base_seq.cache) - - beam_A.append(new_seq) - # other unit: update hyp score statement and last - else: - hyp_new = base_seq.hyp.copy() - hyp_new.append(top_k_index[j, t].item()) - new_seq = Sequence(hyp=hyp_new, - score=scores[j, t].item(), - cache=new_cache[j]) - beam_A.append(new_seq) - - # 3.6 prefix fusion - fusion_A = [beam_A[0]] - for j in range(1, len(beam_A)): - s1 = beam_A[j] - if_do_append = True - for t in range(len(fusion_A)): - # notice: A_ can not fusion with A - if s1.hyp == fusion_A[t].hyp: - fusion_A[t].score = log_add( - [fusion_A[t].score, s1.score]) - if_do_append = False - break - if if_do_append: - fusion_A.append(s1) - - # 4. 
second pruned - fusion_A.sort(key=lambda x: x.score, reverse=True) - beam_init = fusion_A[:beam_size] - - return beam_init, encoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transducer/transducer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transducer/transducer.py deleted file mode 100644 index 821a0946e621353a18bededbd93a658e83b0e0e2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transducer/transducer.py +++ /dev/null @@ -1,453 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchaudio -from torch import nn -from torch.nn.utils.rnn import pad_sequence -from typeguard import check_argument_types -from wenet.transducer.predictor import PredictorBase -from wenet.transducer.search.greedy_search import basic_greedy_search -from wenet.transducer.search.prefix_beam_search import PrefixBeamSearch -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_blank, add_sos_eos, - reverse_pad_list) - - -class Transducer(ASRModel): - """Transducer-ctc-attention hybrid Encoder-Predictor-Decoder model""" - - def __init__( - self, - vocab_size: int, - blank: int, - encoder: nn.Module, - predictor: PredictorBase, - joint: nn.Module, - attention_decoder: Optional[Union[TransformerDecoder, - BiTransformerDecoder]] = None, - ctc: Optional[CTC] = None, - ctc_weight: float = 0, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - transducer_weight: float = 1.0, - attention_weight: float = 0.0, - ) -> None: - assert check_argument_types() - assert attention_weight + ctc_weight + transducer_weight == 1.0 - super().__init__(vocab_size, encoder, attention_decoder, ctc, - ctc_weight, ignore_id, reverse_weight, lsm_weight, - length_normalized_loss) - - self.blank = blank - self.transducer_weight = transducer_weight - self.attention_decoder_weight = 1 - self.transducer_weight - self.ctc_weight - - self.predictor = predictor - self.joint = joint - self.bs = None - - # Note(Mddct): decoder also means predictor in transducer, - # but here decoder is attention decoder - del self.criterion_att - if attention_decoder is not None: - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + predictor + joint + loss - - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - - # Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - # predictor - ys_in_pad = add_blank(text, self.blank, self.ignore_id) - predictor_out = self.predictor(ys_in_pad) - # joint - joint_out = self.joint(encoder_out, predictor_out) - # NOTE(Mddct): some loss implementation require pad valid is zero - # torch.int32 rnnt_loss required - rnnt_text = text.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - rnnt_text_lengths = text_lengths.to(torch.int32) - encoder_out_lens = encoder_out_lens.to(torch.int32) - loss = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - encoder_out_lens, - rnnt_text_lengths, - blank=self.blank, - reduction="mean") - loss_rnnt = loss - - loss = self.transducer_weight * loss - # optional attention decoder - loss_att: Optional[torch.Tensor] = None - if self.attention_decoder_weight != 0.0 and self.decoder is not None: - loss_att, _ = self._calc_att_loss(encoder_out, encoder_mask, text, - text_lengths) - - # optional ctc - loss_ctc: Optional[torch.Tensor] = None - if self.ctc_weight != 0.0 and self.ctc is not None: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is not None: - loss = loss + self.ctc_weight * loss_ctc.sum() - if loss_att is not None: - loss = loss + self.attention_decoder_weight * loss_att.sum() - # NOTE: 'loss' must be in dict - return { - 'loss': loss, - 'loss_att': loss_att, - 'loss_ctc': loss_ctc, - 'loss_rnnt': loss_rnnt, - } - - def init_bs(self): - if self.bs is None: - self.bs = PrefixBeamSearch(self.encoder, self.predictor, - self.joint, self.ctc, self.blank) - - def _cal_transducer_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_lens: torch.Tensor, - hyps_pad: torch.Tensor, - ): - # ignore id -> blank, add blank at head - hyps_pad_blank = add_blank(hyps_pad, self.blank, self.ignore_id) - xs_in_lens = encoder_mask.squeeze(1).sum(1).int() - - # 1. Forward predictor - predictor_out = self.predictor(hyps_pad_blank) - # 2. Forward joint - joint_out = self.joint(encoder_out, predictor_out) - rnnt_text = hyps_pad.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - # 3. 
Compute transducer loss - loss_td = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - xs_in_lens, - hyps_lens.int(), - blank=self.blank, - reduction='none') - return loss_td * -1 - - def _cal_attn_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_pad: torch.Tensor, - hyps_lens: torch.Tensor, - ): - # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - - # td_score = loss_td * -1 - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - self.reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - return decoder_out, r_decoder_out - - def beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7, - ): - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight in transducer - prefix beam search. - final_prob = ctc_weight * ctc_prob + transducer_weight * transducer_prob - transducer_weight (float): transducer probability weight in - prefix beam search - Returns: - List[List[int]]: best path result - - """ - self.init_bs() - beam, _ = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size, - beam_size, - num_decoding_left_chunks, - simulate_streaming, - ctc_weight, - transducer_weight, - ) - return beam[0].hyp[1:], beam[0].score - - def transducer_attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ctc_weight: float = 0.0, - attn_weight: float = 0.0, - transducer_weight: float = 0.0, - search_ctc_weight: float = 1.0, - search_transducer_weight: float = 0.0, - beam_search_type: str = 'transducer') -> List[List[int]]: - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight using in rescoring. - rescore_prob = ctc_weight * ctc_prob + - transducer_weight * (transducer_loss * -1) + - attn_weight * attn_prob - attn_weight (float): attn probability weight using in rescoring. - transducer_weight (float): transducer probability weight using in - rescoring - search_ctc_weight (float): ctc weight using - in rnnt beam search (seeing in self.beam_search) - search_transducer_weight (float): transducer weight using - in rnnt beam search (seeing in self.beam_search) - Returns: - List[List[int]]: best path result - - """ - - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - self.init_bs() - if beam_search_type == 'transducer': - beam, encoder_out = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - beam_size=beam_size, - num_decoding_left_chunks=num_decoding_left_chunks, - ctc_weight=search_ctc_weight, - transducer_weight=search_transducer_weight, - ) - beam_score = [s.score for s in beam] - hyps = [s.hyp[1:] for s in beam] - - elif beam_search_type == 'ctc': - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, - speech_lengths, - beam_size=beam_size, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) - beam_score = [hyp[1] for hyp in hyps] - hyps = [hyp[0] for hyp in hyps] - assert len(hyps) == beam_size - - # build hyps and encoder output - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - - # 2.1 calculate transducer score - td_score = self._cal_transducer_score( - encoder_out, - encoder_mask, - hyps_lens, - hyps_pad, - ) - # 2.2 calculate attention score - decoder_out, r_decoder_out = self._cal_attn_score( - encoder_out, - encoder_mask, - hyps_pad, - hyps_lens, - ) - - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp)][self.eos] - td_s = td_score[i] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp): - r_score += r_decoder_out[i][len(hyp) - j - 1][w] - r_score += r_decoder_out[i][len(hyp)][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score = score * attn_weight + \ - beam_score[i] * ctc_weight + \ - td_s * transducer_weight - if score > best_score: - best_score = score - best_index = i - - return hyps[best_index], best_score - - def greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, 
- num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - n_steps: int = 64, - ) -> List[List[int]]: - """ greedy search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - # TODO(Mddct): batch decode - assert speech.size(0) == 1 - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - # TODO(Mddct): forward chunk by chunk - _ = simulate_streaming - # Let's assume B = batch_size - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size, - num_decoding_left_chunks, - ) - encoder_out_lens = encoder_mask.squeeze(1).sum() - hyps = basic_greedy_search(self, - encoder_out, - encoder_out_lens, - n_steps=n_steps) - - return hyps - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def forward_predictor_step( - self, xs: torch.Tensor, cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - assert len(cache) == 2 - # fake padding - padding = torch.zeros(1, 1) - return self.predictor.forward_step(xs, padding, cache) - - @torch.jit.export - def forward_joint_step(self, enc_out: torch.Tensor, - pred_out: torch.Tensor) -> torch.Tensor: - return self.joint(enc_out, pred_out) - - @torch.jit.export - def forward_predictor_init_state(self) -> List[torch.Tensor]: - return self.predictor.init_state(1, device=torch.device("cpu")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/asr_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/asr_model.py deleted file mode 100644 index 4288f68472d63ce4bf270c5f377d62fa7408713e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/asr_model.py +++ /dev/null @@ -1,904 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -from collections import defaultdict -from typing import Dict, List, Optional, Tuple - -import torch - -from torch.nn.utils.rnn import pad_sequence - -try: - import k2 - from icefall.utils import get_texts - from icefall.decode import get_lattice, Nbest, one_best_decoding -except ImportError: - print('Failed to import k2 and icefall. \ - Notice that they are necessary for hlg_onebest and hlg_rescore') - -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import TransformerEncoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, - remove_duplicates_and_blank, th_accuracy, - reverse_pad_list) -from wenet.utils.mask import (make_pad_mask, mask_finished_preds, - mask_finished_scores, subsequent_mask) - - -class ASRModel(torch.nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - def __init__( - self, - vocab_size: int, - encoder: TransformerEncoder, - decoder: TransformerDecoder, - ctc: CTC, - ctc_weight: float = 0.5, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - ): - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - - self.encoder = encoder - self.decoder = decoder - self.ctc = ctc - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - # 1. Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # 2a. Attention-decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths) - else: - loss_att = None - - # 2b. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is None: - loss = loss_att - elif loss_att is None: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + (1 - - self.ctc_weight) * loss_att - return {"loss": loss, "loss_att": loss_att, "loss_ctc": loss_ctc} - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, float]: - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # reverse the seq, used for right to left decoder - r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) - r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, - self.ignore_id) - # 1. Forward decoder - decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, - ys_in_pad, ys_in_lens, - r_ys_in_pad, - self.reverse_weight) - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - r_loss_att = torch.tensor(0.0) - if self.reverse_weight > 0.0: - r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * ( - 1 - self.reverse_weight) + r_loss_att * self.reverse_weight - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - return loss_att, acc_att - - def _forward_encoder( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Let's assume B = batch_size - # 1. Encoder - if simulate_streaming and decoding_chunk_size > 0: - encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( - speech, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - else: - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - return encoder_out, encoder_mask - - def recognize( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int = 10, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> torch.Tensor: - """ Apply beam search on attention decoder - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - torch.Tensor: decoding result, (batch, max_result_len) - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = torch.ones([running_size, 1], dtype=torch.long, - device=device).fill_(self.sos) # (B*N, 1) - scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), - dtype=torch.float) - scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) - cache: Optional[List[torch.Tensor]] = None - # 2. Decoder forward step by step - for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: - break - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) - logp, cache = self.decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - # 2.3 Second beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - # Update cache to be consistent with new topk scores / hyps - cache_index = (offset_k_index // beam_size).view(-1) # (B*N) - base_cache_index = (torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) * beam_size).view(-1) # (B*N) - cache_index = base_cache_index + cache_index - cache = [torch.index_select(c, dim=0, index=cache_index) for c in cache] - scores = scores.view(-1, 1) # (B*N, 1) - # 2.4. Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = torch.index_select(top_k_index.view(-1), - dim=-1, - index=best_k_index) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = torch.index_select( - hyps, dim=0, index=best_hyps_index) # (B*N, i) - hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) - - # 3. 
Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_scores, best_index = scores.max(dim=-1) - best_hyps_index = best_index + torch.arange( - batch_size, dtype=torch.long, device=device) * beam_size - best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) - best_hyps = best_hyps[:, 1:] - return best_hyps, best_scores - - def ctc_greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[List[int]]: - """ Apply CTC greedy search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # Let's assume B = batch_size - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - return hyps, scores - - def _ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[List[List[int]], torch.Tensor]: - """ CTC prefix beam search inner implementation - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[List[int]]: nbest results - torch.Tensor: encoder output, (1, max_len, encoder_dim), - it will be used for rescoring in attention rescoring mode - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # For CTC prefix beam search, we only support batch_size=1 - assert batch_size == 1 - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder forward and get CTC score - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - ctc_probs = ctc_probs.squeeze(0) - # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) - cur_hyps = [(tuple(), (0.0, -float('inf')))] - # 2. CTC beam search step by step - for t in range(0, maxlen): - logp = ctc_probs[t] # (vocab_size,) - # key: prefix, value (pb, pnb), default value(-inf, -inf) - next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) - # 2.1 First beam prune: select topk best - top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) - for s in top_k_index: - s = s.item() - ps = logp[s].item() - for prefix, (pb, pnb) in cur_hyps: - last = prefix[-1] if len(prefix) > 0 else None - if s == 0: # blank - n_pb, n_pnb = next_hyps[prefix] - n_pb = log_add([n_pb, pb + ps, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - elif s == last: - # Update *ss -> *s; - n_pb, n_pnb = next_hyps[prefix] - n_pnb = log_add([n_pnb, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - # Update *s-s -> *ss, - is for blank - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - else: - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - - # 2.2 Second beam prune - next_hyps = sorted(next_hyps.items(), - key=lambda x: log_add(list(x[1])), - reverse=True) - cur_hyps = next_hyps[:beam_size] - hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] - return hyps, encoder_out - - def ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[int]: - """ Apply CTC prefix beam search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[int]: CTC prefix beam search nbest results - """ - hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, - beam_size, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) - return hyps[0] - - def attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - ctc_weight: float = 0.0, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ) -> List[int]: - """ Apply attention rescoring decoding, CTC prefix beam search - is applied first to get nbest, then we resoring the nbest on - attention decoder with corresponding encoder out - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - reverse_weight (float): right to left decoder weight - ctc_weight (float): ctc score weight - - Returns: - List[int]: Attention rescoring result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, speech_lengths, beam_size, decoding_chunk_size, - num_decoding_left_chunks, simulate_streaming) - - assert len(hyps) == beam_size - hyps_pad = pad_sequence([ - torch.tensor(hyp[0], device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp[0]): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp[0])][self.eos] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp[0]): - r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] - r_score += r_decoder_out[i][len(hyp[0])][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score += hyp[1] * ctc_weight - if score > best_score: - best_score = score - best_index = i - return hyps[best_index][0], best_score - - def load_hlg_resource_if_necessary(self, hlg, word): - if not hasattr(self, 'hlg'): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.hlg = k2.Fsa.from_dict(torch.load(hlg, map_location=device)) - if not hasattr(self.hlg, "lm_scores"): - self.hlg.lm_scores = self.hlg.scores.clone() - if not hasattr(self, 'word_table'): - self.word_table = {} - with open(word, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - self.word_table[int(arr[1])] = arr[0] - - @torch.no_grad() - def hlg_onebest( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - best_path = one_best_decoding(lattice=lattice, use_double_scores=True) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.no_grad() - def hlg_rescore( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - lm_scale: float = 0, - decoder_scale: float = 0, - r_decoder_scale: float = 0, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - device = speech.device - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - 
search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=100, - use_double_scores=True, - nbest_scale=0.5,) - nbest = nbest.intersect(lattice) - assert hasattr(nbest.fsa, "lm_scores") - assert hasattr(nbest.fsa, "tokens") - assert isinstance(nbest.fsa.tokens, torch.Tensor) - - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.tokens) - tokens = tokens.remove_values_leq(0) - hyps = tokens.tolist() - - # cal attention_score - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out_repeat = [] - tot_scores = nbest.tot_scores() - repeats = [tot_scores[i].shape[0] for i in range(tot_scores.dim0)] - for i in range(len(encoder_out)): - encoder_out_repeat.append(encoder_out[i: i + 1].repeat(repeats[i], 1, 1)) - encoder_out = torch.concat(encoder_out_repeat, dim=0) - encoder_mask = torch.ones(encoder_out.size(0), - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - reverse_weight = 0.5 - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out - - decoder_scores = torch.tensor([sum([decoder_out[i, j, hyps[i][j]] - for j in range(len(hyps[i]))]) - for i in range(len(hyps))], device=device) - r_decoder_scores = [] - for i in range(len(hyps)): - score = 0 - for j in range(len(hyps[i])): - score += r_decoder_out[i, len(hyps[i]) - j - 1, hyps[i][j]] - score += r_decoder_out[i, len(hyps[i]), self.eos] - r_decoder_scores.append(score) - r_decoder_scores = torch.tensor(r_decoder_scores, device=device) - - am_scores = nbest.compute_am_scores() - ngram_lm_scores = nbest.compute_lm_scores() - tot_scores = am_scores.values + lm_scale * ngram_lm_scores.values + \ - decoder_scale * decoder_scores + r_decoder_scale * r_decoder_scores - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.jit.export - def subsampling_rate(self) -> int: - """ Export interface for c++ call, return subsampling_rate of the - model - """ - return self.encoder.embed.subsampling_rate - - @torch.jit.export - def right_context(self) -> int: - """ Export interface for c++ call, return right_context of the model - """ - return self.encoder.embed.right_context - - @torch.jit.export - def sos_symbol(self) -> int: - """ Export interface for c++ call, return sos symbol id of the model - """ - return self.sos - - @torch.jit.export - def eos_symbol(self) -> int: - """ Export interface for c++ call, return eos symbol id of the model - """ - return self.eos - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, give input chunk xs, and return - output from time 0 to current chunk. - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor: - """ Export interface for c++ call, apply linear transform and log - softmax before ctc - Args: - xs (torch.Tensor): encoder output - - Returns: - torch.Tensor: activation before ctc - - """ - return self.ctc.log_softmax(xs) - - @torch.jit.export - def is_bidirectional_decoder(self) -> bool: - """ - Returns: - torch.Tensor: decoder output - """ - if hasattr(self.decoder, 'right_decoder'): - return True - else: - return False - - @torch.jit.export - def forward_attention_decoder( - self, - hyps: torch.Tensor, - hyps_lens: torch.Tensor, - encoder_out: torch.Tensor, - reverse_weight: float = 0, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, forward decoder with multiple - hypothesis from ctc prefix beam search and one encoder output - Args: - hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad sos at the begining - hyps_lens (torch.Tensor): length of each hyp in hyps - encoder_out (torch.Tensor): corresponding encoder output - r_hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad eos at the begining which is used fo right to left decoder - reverse_weight: used for verfing whether used right to left decoder, - > 0 will use. - - Returns: - torch.Tensor: decoder output - """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps - encoder_out = encoder_out.repeat(num_hyps, 1, 1) - encoder_mask = torch.ones(num_hyps, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=encoder_out.device) - - # input for right to left decoder - # this hyps_lens has count token, we need minus it. - r_hyps_lens = hyps_lens - 1 - # this hyps has included token, so it should be - # convert the original hyps. - r_hyps = hyps[:, 1:] - # >>> r_hyps - # >>> tensor([[ 1, 2, 3], - # >>> [ 9, 8, 4], - # >>> [ 2, -1, -1]]) - # >>> r_hyps_lens - # >>> tensor([3, 3, 1]) - - # NOTE(Mddct): `pad_sequence` is not supported by ONNX, it is used - # in `reverse_pad_list` thus we have to refine the below code. 
- # Issue: https://github.com/wenet-e2e/wenet/issues/1113 - # Equal to: - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = torch.max(r_hyps_lens) - index_range = torch.arange(0, max_len, 1).to(encoder_out.device) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - # >>> seq_mask - # >>> tensor([[ True, True, True], - # >>> [ True, True, True], - # >>> [ True, False, False]]) - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - r_hyps = torch.gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = torch.where(seq_mask, r_hyps, self.eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - r_hyps = torch.cat([hyps[:, 0:1], r_hyps], dim=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps, hyps_lens, r_hyps, - reverse_weight) # (num_hyps, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - - # right to left decoder may be not used during decoding process, - # which depends on reverse_weight param. - # r_dccoder_out will be 0.0, if reverse_weight is 0.0 - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - return decoder_out, r_decoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/attention.py deleted file mode 100644 index 6ee5e313edf2e88a844ce004c0f819b0bd3260f6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/attention.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple - -import torch -from torch import nn - - -class MultiHeadedAttention(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, n_head: int, n_feat: int, dropout_rate: float): - """Construct an MultiHeadedAttention object.""" - super().__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward_qkv( - self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor, size - (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor, size - (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor, size - (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. 
- - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - 1.When applying cross attention between decoder and encoder, - the batch padding mask for input is in (#batch, 1, T) shape. - 2.When applying self attention of encoder, - the mask is in (#batch, T, T) shape. - 3.When applying self attention of decoder, - the mask is in (#batch, L, L) shape. - 4.If the different position in decoder see different block - of the encoder, such as Mocha, the passed in mask could be - in (#batch, L, T) shape. But there is no such case in current - Wenet. - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - """ - q, k, v = self.forward_qkv(query, key, value) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. - new_cache = torch.cat((k, v), dim=-1) - - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - return self.forward_attention(v, scores, mask), new_cache - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). 
- zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/cmvn.py deleted file mode 100644 index 3a1e7457fd3788d9a7e031e96517505a65925102..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/cmvn.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -class GlobalCMVN(torch.nn.Module): - def __init__(self, - mean: torch.Tensor, - istd: torch.Tensor, - norm_var: bool = True): - """ - Args: - mean (torch.Tensor): mean stats - istd (torch.Tensor): inverse std, std which is 1.0 / std - """ - super().__init__() - assert mean.shape == istd.shape - self.norm_var = norm_var - # The buffer can be accessed from this module using self.mean - self.register_buffer("mean", mean) - self.register_buffer("istd", istd) - - def forward(self, x: torch.Tensor): - """ - Args: - x (torch.Tensor): (batch, max_len, feat_dim) - - Returns: - (torch.Tensor): normalized feature - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/convolution.py deleted file mode 100644 index 2cf9794e14ea7441ccd30ab52202ac02fb25c2b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/convolution.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). 
- """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. - new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/ctc.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/ctc.py deleted file mode 100644 index 3dfcbaa324ffc26afa9ceaeb75007eb312546326..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/ctc.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -import torch -import torch.nn.functional as F -from typeguard import check_argument_types - - -class CTC(torch.nn.Module): - """CTC module""" - def __init__( - self, - odim: int, - encoder_output_size: int, - dropout_rate: float = 0.0, - reduce: bool = True, - ): - """ Construct CTC module - Args: - odim: dimension of outputs - encoder_output_size: number of encoder projection units - dropout_rate: dropout rate (0.0 ~ 1.0) - reduce: reduce the CTC loss into a scalar - """ - assert check_argument_types() - super().__init__() - eprojs = encoder_output_size - self.dropout_rate = dropout_rate - self.ctc_lo = torch.nn.Linear(eprojs, odim) - - reduction_type = "sum" if reduce else "none" - self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) - - def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, - ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: - """Calculate CTC loss. 
- - Args: - hs_pad: batch of padded hidden state sequences (B, Tmax, D) - hlens: batch of lengths of hidden state sequences (B) - ys_pad: batch of padded character id sequence tensor (B, Lmax) - ys_lens: batch of lengths of character sequence (B) - """ - # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) - ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) - # ys_hat: (B, L, D) -> (L, B, D) - ys_hat = ys_hat.transpose(0, 1) - ys_hat = ys_hat.log_softmax(2) - loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) - # Batch-size average - loss = loss / ys_hat.size(1) - return loss - - def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """log_softmax of frame activations - - Args: - Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) - """ - return F.log_softmax(self.ctc_lo(hs_pad), dim=2) - - def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """argmax of frame activations - - Args: - torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: argmax applied 2d tensor (B, Tmax) - """ - return torch.argmax(self.ctc_lo(hs_pad), dim=2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/decoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/decoder.py deleted file mode 100644 index c31853d9e868c99290b8d597f53d9a680202c82c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/decoder.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Decoder definition.""" -from typing import Tuple, List, Optional - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.decoder_layer import DecoderLayer -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.utils.mask import (subsequent_mask, make_pad_mask) - - -class TransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. 
- concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__() - attention_dim = encoder_output_size - - if input_layer == "embed": - self.embed = torch.nn.Sequential( - torch.nn.Embedding(vocab_size, attention_dim), - PositionalEncoding(attention_dim, positional_dropout_rate), - ) - else: - raise ValueError(f"only 'embed' is supported: {input_layer}") - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) - self.use_output_layer = use_output_layer - self.output_layer = torch.nn.Linear(attention_dim, vocab_size) - self.num_blocks = num_blocks - self.decoders = torch.nn.ModuleList([ - DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(self.num_blocks) - ]) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor = torch.empty(0), - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: not used in transformer decoder, in order to unify api - with bidirectional decoder - reverse_weight: not used in transformer decoder, in order to unify - api with bidirectional decode - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - torch.tensor(0.0), in order to unify api with bidirectional decoder - olens: (batch, ) - """ - tgt = ys_in_pad - maxlen = tgt.size(1) - # tgt_mask: (B, 1, L) - tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) - tgt_mask = tgt_mask.to(tgt.device) - # m: (1, L, L) - m = subsequent_mask(tgt_mask.size(-1), - device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m - x, _ = self.embed(tgt) - for layer in self.decoders: - x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, - memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.use_output_layer: - x = self.output_layer(x) - olens = tgt_mask.sum(1) - return x, torch.tensor(0.0), olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - x, _ = self.embed(tgt) - new_cache = [] - for i, decoder in enumerate(self.decoders): - if cache is None: - c = None - else: - c = cache[i] - x, tgt_mask, memory, memory_mask = decoder(x, - tgt_mask, - memory, - memory_mask, - cache=c) - new_cache.append(x) - if self.normalize_before: - y = self.after_norm(x[:, -1]) - else: - y = x[:, -1] - if self.use_output_layer: - y = torch.log_softmax(self.output_layer(y), dim=-1) - return y, new_cache - - -class BiTransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - r_num_blocks: the number of right to left decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - r_num_blocks: int = 0, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - - assert check_argument_types() - super().__init__() - self.left_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - self.right_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - r_num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor, - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), - used for right to left decoder - reverse_weight: used for right to left decoder - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - r_x: x: decoded token score (right to left decoder) - before softmax (batch, maxlen_out, vocab_size) - if use_output_layer is True, - olens: (batch, ) - """ - l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, - ys_in_lens) - r_x = torch.tensor(0.0) - if reverse_weight > 0.0: - r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, - ys_in_lens) - return l_x, r_x, olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - return self.left_decoder.forward_one_step(memory, memory_mask, tgt, - tgt_mask, cache) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/decoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/decoder_layer.py deleted file mode 100644 index 6b52aa6ab730dc51b18f0787e8236ab10c1e9cad..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/decoder_layer.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoder self-attention layer definition.""" -from typing import Optional, Tuple - -import torch -from torch import nn - - -class DecoderLayer(nn.Module): - """Single decoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn (torch.nn.Module): Inter-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. 
- dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's inpu - and output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: nn.Module, - src_attn: nn.Module, - feed_forward: nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an DecoderLayer object.""" - super().__init__() - self.size = size - self.self_attn = self_attn - self.src_attn = src_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.norm3 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear1 = nn.Linear(size + size, size) - self.concat_linear2 = nn.Linear(size + size, size) - else: - self.concat_linear1 = nn.Identity() - self.concat_linear2 = nn.Identity() - - def forward( - self, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - memory: torch.Tensor, - memory_mask: torch.Tensor, - cache: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute decoded features. - - Args: - tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). - tgt_mask (torch.Tensor): Mask for input tensor - (#batch, maxlen_out). - memory (torch.Tensor): Encoded memory - (#batch, maxlen_in, size). - memory_mask (torch.Tensor): Encoded memory mask - (#batch, maxlen_in). - cache (torch.Tensor): cached tensors. - (#batch, maxlen_out - 1, size). - - Returns: - torch.Tensor: Output tensor (#batch, maxlen_out, size). - torch.Tensor: Mask for output tensor (#batch, maxlen_out). - torch.Tensor: Encoded memory (#batch, maxlen_in, size). - torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
- - """ - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - - if cache is None: - tgt_q = tgt - tgt_q_mask = tgt_mask - else: - # compute only the last frame query keeping dim: max_time_out -> 1 - assert cache.shape == ( - tgt.shape[0], - tgt.shape[1] - 1, - self.size, - ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" - tgt_q = tgt[:, -1:, :] - residual = residual[:, -1:, :] - tgt_q_mask = tgt_mask[:, -1:, :] - - if self.concat_after: - tgt_concat = torch.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) - x = residual + self.concat_linear1(tgt_concat) - else: - x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - if self.concat_after: - x_concat = torch.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) - x = residual + self.concat_linear2(x_concat) - else: - x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) - if not self.normalize_before: - x = self.norm2(x) - - residual = x - if self.normalize_before: - x = self.norm3(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm3(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, tgt_mask, memory, memory_mask diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/embedding.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/embedding.py deleted file mode 100644 index 611a927864d93c3ad8357f66c780bf537b2a4d67..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/embedding.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Positonal Encoding Module.""" - -import math -from typing import Tuple, Union - -import torch -import torch.nn.functional as F - -class PositionalEncoding(torch.nn.Module): - """Positional encoding. 
- - :param int d_model: embedding dim - :param float dropout_rate: dropout rate - :param int max_len: maximum input length - - PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) - PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) - """ - def __init__(self, - d_model: int, - dropout_rate: float, - max_len: int = 5000, - reverse: bool = False): - """Construct an PositionalEncoding object.""" - super().__init__() - self.d_model = d_model - self.xscale = math.sqrt(self.d_model) - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - - self.pe = torch.zeros(self.max_len, self.d_model) - position = torch.arange(0, self.max_len, - dtype=torch.float32).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = torch.sin(position * div_term) - self.pe[:, 1::2] = torch.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input. Its shape is (batch, time, ...) - offset (int, torch.tensor): position offset - - Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) - torch.Tensor: for compatibility to RelPositionalEncoding - """ - - self.pe = self.pe.to(x.device) - pos_emb = self.position_encoding(offset, x.size(1), False) - x = x * self.xscale + pos_emb - return self.dropout(x), self.dropout(pos_emb) - - def position_encoding(self, offset: Union[int, torch.Tensor], size: int, - apply_dropout: bool = True) -> torch.Tensor: - """ For getting encoding in a streaming fashion - - Attention!!!!! - we apply dropout only once at the whole utterance level in a none - streaming way, but will call this function several times with - increasing input size in a streaming scenario, so the dropout will - be applied several times. - - Args: - offset (int or torch.tensor): start offset - size (int): required size of position encoding - - Returns: - torch.Tensor: Corresponding encoding - """ - # How to subscript a Union type: - # https://github.com/pytorch/pytorch/issues/69434 - if isinstance(offset, int): - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - else: # for batched streaming decoding on GPU - assert torch.max(offset) + size < self.max_len - index = offset.unsqueeze(1) + \ - torch.arange(0, size).to(offset.device) # B X T - flag = index > 0 - # remove negative offset - index = index * flag - pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model - - if apply_dropout: - pos_emb = self.dropout(pos_emb) - return pos_emb - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding module. - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - """ - def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): - """Initialize class.""" - super().__init__(d_model, dropout_rate, max_len, reverse=True) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Compute positional encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). 
- Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Positional embedding tensor (1, time, `*`). - """ - self.pe = self.pe.to(x.device) - x = x * self.xscale - pos_emb = self.position_encoding(offset, x.size(1), False) - return self.dropout(x), self.dropout(pos_emb) - - -class NoPositionalEncoding(torch.nn.Module): - """ No position encoding - """ - def __init__(self, d_model: int, dropout_rate: float): - super().__init__() - self.d_model = d_model - self.dropout = torch.nn.Dropout(p=dropout_rate) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """ Just return zero vector for interface compatibility - """ - pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) - return self.dropout(x), pos_emb - - def position_encoding( - self, offset: Union[int, torch.Tensor], size: int) -> torch.Tensor: - return torch.zeros(1, size, self.d_model) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/encoder.py deleted file mode 100644 index bb2ec65827548bd1242cb3b367cb3983c2de6119..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/encoder.py +++ /dev/null @@ -1,462 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder definition.""" -from typing import Tuple - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.convolution import ConvolutionModule -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.encoder_layer import TransformerEncoderLayer -from wenet.transformer.encoder_layer import ConformerEncoderLayer -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class BaseEncoder(torch.nn.Module): - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ - Args: - input_size (int): input dim - output_size (int): dimension of attention - attention_heads (int): the number of heads of multi head attention - linear_units (int): the hidden units number of position-wise feed - forward - num_blocks (int): the number of decoder blocks - dropout_rate (float): dropout rate - attention_dropout_rate (float): dropout rate in attention - positional_dropout_rate (float): dropout rate after adding - positional encoding - input_layer (str): input layer type. - optional [linear, conv2d, conv2d6, conv2d8] - pos_enc_layer_type (str): Encoder positional encoding layer type. - opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] - normalize_before (bool): - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after (bool): whether to concat attention layer's input - and output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - static_chunk_size (int): chunk size for static chunk training and - decoding - use_dynamic_chunk (bool): whether use dynamic chunk size for - training or not, You can only use fixed chunk(chunk_size > 0) - or dyanmic chunk size(use_dynamic_chunk = True) - global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module - use_dynamic_left_chunk (bool): whether use dynamic left chunk in - dynamic chunk training - """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks - - -class TransformerEncoder(BaseEncoder): - """Transformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ Construct TransformerEncoder - - See Encoder for the meaning of each parameter. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - self.encoders = torch.nn.ModuleList([ - TransformerEncoderLayer( - output_size, - MultiHeadedAttention(attention_heads, output_size, - attention_dropout_rate), - PositionwiseFeedForward(output_size, linear_units, - dropout_rate), dropout_rate, - normalize_before, concat_after) for _ in range(num_blocks) - ]) - - -class ConformerEncoder(BaseEncoder): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - positionwise_conv_kernel_size: int = 1, - macaron_style: bool = True, - selfattention_layer_type: str = "rel_selfattn", - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - ): - """Construct ConformerEncoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - positionwise_conv_kernel_size (int): Kernel size of positionwise - conv1d layer. - macaron_style (bool): Whether to use macaron style for - positionwise layer. - selfattention_layer_type (str): Encoder attention layer type, - the parameter has no effect now, it's just for configure - compatibility. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, - cnn_module_norm, causal) - - self.encoders = torch.nn.ModuleList([ - ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(num_blocks) - ]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/encoder_layer.py deleted file mode 100644 index 6b4629a6802a90422fa1494f82f46488f2553c16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/encoder_layer.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple - -import torch -from torch import nn - - -class TransformerEncoderLayer(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: torch.nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): just for interface compatibility - to ConformerEncoderLayer - mask_pad (torch.Tensor): does not used in transformer layer, - just for unified api with conformer. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2), not used here, it's for interface - compatibility to ConformerEncoderLayer. - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). - - """ - residual = x - if self.normalize_before: - x = self.norm1(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, cache=att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - return x, mask, new_att_cache, fake_cnn_cache - - -class ConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/label_smoothing_loss.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/label_smoothing_loss.py deleted file mode 100644 index 428fedcb0eb4345cd1361c97008a9afcd94ac171..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/label_smoothing_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Label smoothing module.""" - -import torch -from torch import nn - - -class LabelSmoothingLoss(nn.Module): - """Label-smoothing loss. - - In a standard CE loss, the label's data distribution is: - [0,1,2] -> - [ - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0], - ] - - In the smoothing version CE Loss,some probabilities - are taken from the true label prob (1.0) and are divided - among other labels. - - e.g. 
- smoothing=0.1 - [0,1,2] -> - [ - [0.9, 0.05, 0.05], - [0.05, 0.9, 0.05], - [0.05, 0.05, 0.9], - ] - - Args: - size (int): the number of class - padding_idx (int): padding class id which will be ignored for loss - smoothing (float): smoothing rate (0.0 means the conventional CE) - normalize_length (bool): - normalize loss by sequence length if True - normalize loss by batch size if False - """ - def __init__(self, - size: int, - padding_idx: int, - smoothing: float, - normalize_length: bool = False): - """Construct an LabelSmoothingLoss object.""" - super(LabelSmoothingLoss, self).__init__() - self.criterion = nn.KLDivLoss(reduction="none") - self.padding_idx = padding_idx - self.confidence = 1.0 - smoothing - self.smoothing = smoothing - self.size = size - self.normalize_length = normalize_length - - def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """Compute loss between x and target. - - The model outputs and data labels tensors are flatten to - (batch*seqlen, class) shape and a mask is applied to the - padding part which should not be calculated for loss. - - Args: - x (torch.Tensor): prediction (batch, seqlen, class) - target (torch.Tensor): - target signal masked with self.padding_id (batch, seqlen) - Returns: - loss (torch.Tensor) : The KL loss, scalar float value - """ - assert x.size(2) == self.size - batch_size = x.size(0) - x = x.view(-1, self.size) - target = target.view(-1) - # use zeros_like instead of torch.no_grad() for true_dist, - # since no_grad() can not be exported by JIT - true_dist = torch.zeros_like(x) - true_dist.fill_(self.smoothing / (self.size - 1)) - ignore = target == self.padding_idx # (B,) - total = len(target) - ignore.sum().item() - target = target.masked_fill(ignore, 0) # avoid -1 index - true_dist.scatter_(1, target.unsqueeze(1), self.confidence) - kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) - denom = total if self.normalize_length else batch_size - return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/positionwise_feed_forward.py deleted file mode 100644 index 73ba239e3f1e68f65650961f2c4ee6758729a06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/positionwise_feed_forward.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. 
- dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU()): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/subsampling.py deleted file mode 100644 index 5f2823eedf0e623188d6af6680fa50ca44b47877..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/subsampling.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch - - -class BaseSubsampling(torch.nn.Module): - def __init__(self): - super().__init__() - self.right_context = 0 - self.subsampling_rate = 1 - - def position_encoding(self, offset: Union[int, torch.Tensor], - size: int) -> torch.Tensor: - return self.pos_enc.position_encoding(offset, size) - - -class LinearNoSubsampling(BaseSubsampling): - """Linear transform the input without subsampling - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an linear object.""" - super().__init__() - self.out = torch.nn.Sequential( - torch.nn.Linear(idim, odim), - torch.nn.LayerNorm(odim, eps=1e-5), - torch.nn.Dropout(dropout_rate), - ) - self.pos_enc = pos_enc_class - self.right_context = 0 - self.subsampling_rate = 1 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Input x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: linear input tensor (#batch, time', odim), - where time' = time . - torch.Tensor: linear input mask (#batch, 1, time'), - where time' = time . - - """ - x = self.out(x) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask - - -class Conv2dSubsampling4(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). 
- - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 4. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 4. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] - - -class Conv2dSubsampling6(BaseSubsampling): - """Convolutional 2D subsampling (to 1/6 length). - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (torch.nn.Module): Custom position encoding layer. - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling6 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 5, 3), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), - odim) - self.pos_enc = pos_enc_class - # 10 = (3 - 1) * 1 + (5 - 1) * 2 - self.subsampling_rate = 6 - self.right_context = 10 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 6. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 6. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] - - -class Conv2dSubsampling8(BaseSubsampling): - """Convolutional 2D subsampling (to 1/8 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
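To make the 1/4 time reduction of `Conv2dSubsampling4` above concrete, a small standalone sketch (illustrative shapes, plain PyTorch, not part of this patch) of the two stride-2 convolutions:

```python
import torch

T, idim, odim = 100, 80, 256                     # illustrative sizes
conv = torch.nn.Sequential(
    torch.nn.Conv2d(1, odim, 3, 2), torch.nn.ReLU(),
    torch.nn.Conv2d(odim, odim, 3, 2), torch.nn.ReLU(),
)
x = torch.randn(1, 1, T, idim)                   # (batch, channel=1, time, feat)
y = conv(x)
print(y.shape)                                   # torch.Size([1, 256, 24, 19])
# time' = ((T - 1) // 2 - 1) // 2 = 24, i.e. roughly T // 4, which is why the
# class flattens (channel, feat) into odim * (((idim - 1) // 2 - 1) // 2).
```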
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling8 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear( - odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) - self.pos_enc = pos_enc_class - self.subsampling_rate = 8 - # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 - self.right_context = 14 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 8. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 8. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/swish.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/swish.py deleted file mode 100644 index b4250f5c93104f38958d145572e363256e03fcb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/transformer/swish.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) -# 2020 Northwestern Polytechnical University (Pengcheng Guo) -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Swish() activation function for Conformer.""" - -import torch - - -class Swish(torch.nn.Module): - """Construct an Swish object.""" - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Return Swish activation function.""" - return x * torch.sigmoid(x) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/checkpoint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/checkpoint.py deleted file mode 100644 index 8e0c413c79c34cd667240357d7ef9eab816a885c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/checkpoint.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import re - -import yaml -import torch -from collections import OrderedDict - -import datetime - - -def load_checkpoint(model: torch.nn.Module, path: str) -> dict: - if torch.cuda.is_available(): - logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) - checkpoint = torch.load(path) - else: - logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) - checkpoint = torch.load(path, map_location='cpu') - model.load_state_dict(checkpoint, strict=False) - info_path = re.sub('.pt$', '.yaml', path) - configs = {} - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - return configs - - -def save_checkpoint(model: torch.nn.Module, path: str, infos=None): - ''' - Args: - infos (dict or None): any info you want to save. - ''' - logging.info('Checkpoint: save to checkpoint %s' % path) - if isinstance(model, torch.nn.DataParallel): - state_dict = model.module.state_dict() - elif isinstance(model, torch.nn.parallel.DistributedDataParallel): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save(state_dict, path) - info_path = re.sub('.pt$', '.yaml', path) - if infos is None: - infos = {} - infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') - with open(info_path, 'w') as fout: - data = yaml.dump(infos) - fout.write(data) - - -def filter_modules(model_state_dict, modules): - new_mods = [] - incorrect_mods = [] - mods_model = model_state_dict.keys() - for mod in modules: - if any(key.startswith(mod) for key in mods_model): - new_mods += [mod] - else: - incorrect_mods += [mod] - if incorrect_mods: - logging.warning( - "module(s) %s don't match or (partially match) " - "available modules in model.", - incorrect_mods, - ) - logging.warning("for information, the existing modules in model are:") - logging.warning("%s", mods_model) - - return new_mods - - -def load_trained_modules(model: torch.nn.Module, args: None): - # Load encoder modules with pre-trained model(s). 
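A minimal standalone sketch (hypothetical paths and metadata, plain PyTorch and PyYAML, not part of this patch) of the convention `save_checkpoint`/`load_checkpoint` above rely on: weights in `<name>.pt`, training info in a sibling `<name>.yaml`:

```python
import re
import torch
import yaml

model = torch.nn.Linear(4, 2)                     # stand-in model
path = "demo.pt"                                  # hypothetical checkpoint path

torch.save(model.state_dict(), path)              # weights  -> demo.pt
with open(re.sub(r"\.pt$", ".yaml", path), "w") as fout:
    yaml.dump({"epoch": 1, "step": 100}, fout)    # metadata -> demo.yaml

restored = torch.nn.Linear(4, 2)
restored.load_state_dict(torch.load(path, map_location="cpu"), strict=False)
with open("demo.yaml") as fin:
    print(yaml.load(fin, Loader=yaml.FullLoader))  # {'epoch': 1, 'step': 100}
```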
- enc_model_path = args.enc_init - enc_modules = args.enc_init_mods - main_state_dict = model.state_dict() - logging.warning("model(s) found for pre-initialization") - if os.path.isfile(enc_model_path): - logging.info('Checkpoint: loading from checkpoint %s for CPU' % - enc_model_path) - model_state_dict = torch.load(enc_model_path, map_location='cpu') - modules = filter_modules(model_state_dict, enc_modules) - partial_state_dict = OrderedDict() - for key, value in model_state_dict.items(): - if any(key.startswith(m) for m in modules): - partial_state_dict[key] = value - main_state_dict.update(partial_state_dict) - else: - logging.warning("model was not found : %s", enc_model_path) - - model.load_state_dict(main_state_dict) - configs = {} - return configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/cmvn.py deleted file mode 100644 index 3101c619f54991c947124f393f3459c317356a2f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/cmvn.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import math - -import numpy as np - - -def _load_json_cmvn(json_cmvn_file): - """ Load the json format cmvn stats file and calculate cmvn - - Args: - json_cmvn_file: cmvn stats file in json format - - Returns: - a numpy array of [means, vars] - """ - with open(json_cmvn_file) as f: - cmvn_stats = json.load(f) - - means = cmvn_stats['mean_stat'] - variance = cmvn_stats['var_stat'] - count = cmvn_stats['frame_num'] - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def _load_kaldi_cmvn(kaldi_cmvn_file): - """ Load the kaldi format cmvn stats file and calculate cmvn - - Args: - kaldi_cmvn_file: kaldi text style global cmvn file, which - is generated by: - compute-cmvn-stats --binary=false scp:feats.scp global_cmvn - - Returns: - a numpy array of [means, vars] - """ - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def load_cmvn(cmvn_file, is_json): - if is_json: - cmvn = _load_json_cmvn(cmvn_file) - else: - cmvn = _load_kaldi_cmvn(cmvn_file) - return cmvn[0], cmvn[1] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/common.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/common.py deleted file mode 100644 index 74238d59aefbf227fe6b811703af17550bc7f8f0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/common.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -"""Unility functions for Transformer.""" - -import math -from typing import List, Tuple - -import torch -from torch.nn.utils.rnn import pad_sequence - -IGNORE_ID = -1 - - -def pad_list(xs: List[torch.Tensor], pad_value: int): - """Perform padding for the list of tensors. - - Args: - xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 
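A short standalone sketch (illustrative numbers, NumPy only, not part of this patch) of what the CMVN helpers above compute from accumulated statistics: a per-dimension mean and inverse standard deviation that are later applied as `(feat - mean) * istd`:

```python
import numpy as np

# Accumulated stats for two feature dimensions over frame_num frames
# (illustrative values, not taken from any real cmvn file).
mean_stat = np.array([10.0, 20.0])
var_stat = np.array([60.0, 210.0])
frame_num = 5

mean = mean_stat / frame_num                      # [2.0, 4.0]
var = var_stat / frame_num - mean * mean          # [8.0, 26.0]
istd = 1.0 / np.sqrt(np.maximum(var, 1.0e-20))    # floor tiny variances

feat = np.array([[4.0, 9.0]])                     # one fake frame
print((feat - mean) * istd)                       # normalized frame
```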
- pad_value (float): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tmax, `*`). - - Examples: - >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) - - """ - n_batch = len(xs) - max_len = max([x.size(0) for x in xs]) - pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) - pad = pad.fill_(pad_value) - for i in range(n_batch): - pad[i, :xs[i].size(0)] = xs[i] - - return pad - - -def add_blank(ys_pad: torch.Tensor, blank: int, - ignore_id: int) -> torch.Tensor: - """ Prepad blank for transducer predictor - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - blank (int): index of - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> blank = 0 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in = add_blank(ys_pad, 0, -1) - >>> ys_in - tensor([[0, 1, 2, 3, 4, 5], - [0, 4, 5, 6, 0, 0], - [0, 7, 8, 9, 0, 0]]) - """ - bs = ys_pad.size(0) - _blank = torch.tensor([blank], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _blank = _blank.repeat(bs).unsqueeze(1) # [bs,1] - out = torch.cat([_blank, ys_pad], dim=1) # [bs, Lmax+1] - return torch.where(out == ignore_id, blank, out) - - -def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, - ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Add and labels. - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - sos (int): index of - eos (int): index of - ignore_id (int): index of padding - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - ys_out (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> sos_id = 10 - >>> eos_id = 11 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) - >>> ys_in - tensor([[10, 1, 2, 3, 4, 5], - [10, 4, 5, 6, 11, 11], - [10, 7, 8, 9, 11, 11]]) - >>> ys_out - tensor([[ 1, 2, 3, 4, 5, 11], - [ 4, 5, 6, 11, -1, -1], - [ 7, 8, 9, 11, -1, -1]]) - """ - _sos = torch.tensor([sos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _eos = torch.tensor([eos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys - ys_in = [torch.cat([_sos, y], dim=0) for y in ys] - ys_out = [torch.cat([y, _eos], dim=0) for y in ys] - return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) - - -def reverse_pad_list(ys_pad: torch.Tensor, - ys_lens: torch.Tensor, - pad_value: float = -1.0) -> torch.Tensor: - """Reverse padding for the list of tensors. - - Args: - ys_pad (tensor): The padded tensor (B, Tokenmax). - ys_lens (tensor): The lens of token seqs (B) - pad_value (int): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tokenmax). - - Examples: - >>> x - tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) - >>> pad_list(x, 0) - tensor([[4, 3, 2, 1], - [7, 6, 5, 0], - [9, 8, 0, 0]]) - - """ - r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) - for y, i in zip(ys_pad, ys_lens)], True, - pad_value) - return r_ys_pad - - -def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, - ignore_label: int) -> float: - """Calculate accuracy. - - Args: - pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 
- pad_targets (LongTensor): Target label tensors (B, Lmax). - ignore_label (int): Ignore label id. - - Returns: - float: Accuracy value (0.0 - 1.0). - - """ - pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), - pad_outputs.size(1)).argmax(2) - mask = pad_targets != ignore_label - numerator = torch.sum( - pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - denominator = torch.sum(mask) - return float(numerator) / float(denominator) - - -def get_rnn(rnn_type: str) -> torch.nn.Module: - assert rnn_type in ["rnn", "lstm", "gru"] - if rnn_type == "rnn": - return torch.nn.RNN - elif rnn_type == "lstm": - return torch.nn.LSTM - else: - return torch.nn.GRU - - -def get_activation(act): - """Return activation function.""" - # Lazy load to avoid unused import - from wenet.transformer.swish import Swish - - activation_funcs = { - "hardtanh": torch.nn.Hardtanh, - "tanh": torch.nn.Tanh, - "relu": torch.nn.ReLU, - "selu": torch.nn.SELU, - "swish": getattr(torch.nn, "SiLU", Swish), - "gelu": torch.nn.GELU - } - - return activation_funcs[act]() - - -def get_subsample(config): - input_layer = config["encoder_conf"]["input_layer"] - assert input_layer in ["conv2d", "conv2d6", "conv2d8"] - if input_layer == "conv2d": - return 4 - elif input_layer == "conv2d6": - return 6 - elif input_layer == "conv2d8": - return 8 - - -def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - if hyp[cur] != 0: - new_hyp.append(hyp[cur]) - prev = cur - while cur < len(hyp) and hyp[cur] == hyp[prev]: - cur += 1 - return new_hyp - - -def replace_duplicates_with_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - new_hyp.append(hyp[cur]) - prev = cur - cur += 1 - while cur < len(hyp) and hyp[cur] == hyp[prev] and hyp[cur] != 0: - new_hyp.append(0) - cur += 1 - return new_hyp - - -def log_add(args: List[int]) -> float: - """ - Stable log add - """ - if all(a == -float('inf') for a in args): - return -float('inf') - a_max = max(args) - lsp = math.log(sum(math.exp(a - a_max) for a in args)) - return a_max + lsp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/config.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/config.py deleted file mode 100644 index 50170ced44534d3ee6532a2f87fcd78c5148f7e7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 Shaoshang Qi -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
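The greedy CTC post-processing performed by `remove_duplicates_and_blank` above boils down to the following small standalone sketch (illustrative token ids, blank assumed to be 0, hypothetical helper name):

```python
from typing import List

def collapse_ctc(hyp: List[int], blank: int = 0) -> List[int]:
    """Collapse repeated tokens and drop the blank symbol."""
    out, prev = [], None
    for tok in hyp:
        if tok != prev and tok != blank:
            out.append(tok)
        prev = tok
    return out

print(collapse_ctc([0, 3, 3, 0, 0, 4, 4, 4, 0, 5]))   # [3, 4, 5]
```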
- - -import copy - -def override_config(configs, override_list): - new_configs = copy.deepcopy(configs) - for item in override_list: - arr = item.split() - if len(arr) != 2: - print(f"the overrive {item} format not correct, skip it") - continue - keys = arr[0].split('.') - s_configs = new_configs - for i, key in enumerate(keys): - if key not in s_configs: - print(f"the overrive {item} format not correct, skip it") - if i == len(keys) - 1: - param_type = type(s_configs[key]) - if param_type != bool: - s_configs[key] = param_type(arr[1]) - else: - s_configs[key] = arr[1] in ['true', 'True'] - print(f"override {arr[0]} with {arr[1]}") - else: - s_configs = s_configs[key] - return new_configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/ctc_util.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/ctc_util.py deleted file mode 100644 index 73b8fb272ac153dd6d05207f352ebcf1ad14890d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/ctc_util.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch - -def insert_blank(label, blank_id=0): - """Insert blank token between every two label token.""" - label = np.expand_dims(label, 1) - blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id - label = np.concatenate([blanks, label], axis=1) - label = label.reshape(-1) - label = np.append(label, label[0]) - return label - -def forced_align(ctc_probs: torch.Tensor, - y: torch.Tensor, - blank_id=0) -> list: - """ctc forced alignment. 
- - Args: - torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) - torch.Tensor y: id sequence tensor 1d tensor (L) - int blank_id: blank symbol index - Returns: - torch.Tensor: alignment result - """ - y_insert_blank = insert_blank(y, blank_id) - - log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) - log_alpha = log_alpha - float('inf') # log of zero - state_path = (torch.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 - ) # state path - - # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] - - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): - if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ - s] == y_insert_blank[s - 2]: - candidates = torch.tensor( - [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) - prev_state = [s, s - 1] - else: - candidates = torch.tensor([ - log_alpha[t - 1, s], - log_alpha[t - 1, s - 1], - log_alpha[t - 1, s - 2], - ]) - prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] - state_path[t, s] = prev_state[torch.argmax(candidates)] - - state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) - - candidates = torch.tensor([ - log_alpha[-1, len(y_insert_blank) - 1], - log_alpha[-1, len(y_insert_blank) - 2] - ]) - prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] - state_seq[-1] = prev_state[torch.argmax(candidates)] - for t in range(ctc_probs.size(0) - 2, -1, -1): - state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] - - output_alignment = [] - for t in range(0, ctc_probs.size(0)): - output_alignment.append(y_insert_blank[state_seq[t, 0]]) - - return output_alignment diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/executor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/executor.py deleted file mode 100644 index dc0b69e6e32055566a0e8c41945f6979276e5672..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/executor.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
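For the forced-alignment helpers above, a tiny standalone sketch (illustrative label ids, not part of this patch) of the blank-interleaved sequence that `insert_blank` produces before the dynamic program runs:

```python
import numpy as np

label = np.array([7, 8, 9])                       # illustrative token ids
blank_id = 0

# [7, 8, 9] -> [blank, 7, blank, 8, blank, 9, blank]
pairs = np.stack([np.full_like(label, blank_id), label], axis=1)
interleaved = np.append(pairs.reshape(-1), blank_id)
print(interleaved)                                # [0 7 0 8 0 9 0]
```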
- -import logging -from contextlib import nullcontext - -# if your python version < 3.7 use the below one -# from contextlib import suppress as nullcontext -import torch -from torch.nn.utils import clip_grad_norm_ - - -class Executor: - - def __init__(self): - self.step = 0 - - def train(self, model, optimizer, scheduler, data_loader, device, writer, - args, scaler): - ''' Train one epoch - ''' - model.train() - clip = args.get('grad_clip', 50.0) - log_interval = args.get('log_interval', 10) - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - accum_grad = args.get('accum_grad', 1) - is_distributed = args.get('is_distributed', True) - use_amp = args.get('use_amp', False) - logging.info('using accumulate grad, new batch size is {} times' - ' larger than before'.format(accum_grad)) - if use_amp: - assert scaler is not None - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - model_context = model.join - else: - model_context = nullcontext - num_seen_utts = 0 - with model_context(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - context = None - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if is_distributed and batch_idx % accum_grad != 0: - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - with context(): - # autocast context - # The more details about amp can be found in - # https://pytorch.org/docs/stable/notes/amp_examples.html - with torch.cuda.amp.autocast(scaler is not None): - loss_dict = model(feats, feats_lengths, target, - target_lengths) - loss = loss_dict['loss'] / accum_grad - if use_amp: - scaler.scale(loss).backward() - else: - loss.backward() - - num_seen_utts += num_utts - if batch_idx % accum_grad == 0: - if rank == 0 and writer is not None: - writer.add_scalar('train_loss', loss, self.step) - # Use mixed precision training - if use_amp: - scaler.unscale_(optimizer) - grad_norm = clip_grad_norm_(model.parameters(), clip) - # Must invoke scaler.update() if unscale_() is used in - # the iteration to avoid the following error: - # RuntimeError: unscale_() has already been called - # on this optimizer since the last update(). - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). 
- scaler.step(optimizer) - scaler.update() - else: - grad_norm = clip_grad_norm_(model.parameters(), clip) - if torch.isfinite(grad_norm): - optimizer.step() - optimizer.zero_grad() - scheduler.step() - self.step += 1 - if batch_idx % log_interval == 0: - lr = optimizer.param_groups[0]['lr'] - log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, - loss.item() * accum_grad) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'lr {:.8f} rank {}'.format(lr, rank) - logging.debug(log_str) - - def cv(self, model, data_loader, device, args): - ''' Cross validation on - ''' - model.eval() - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - log_interval = args.get('log_interval', 10) - # in order to avoid division by 0 - num_seen_utts = 1 - total_loss = 0.0 - with torch.no_grad(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - loss_dict = model(feats, feats_lengths, target, target_lengths) - loss = loss_dict['loss'] - if torch.isfinite(loss): - num_seen_utts += num_utts - total_loss += loss.item() * num_utts - if batch_idx % log_interval == 0: - log_str = 'CV Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, loss.item()) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'history loss {:.6f}'.format(total_loss / - num_seen_utts) - log_str += ' rank {}'.format(rank) - logging.debug(log_str) - return total_loss, num_seen_utts diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/file_utils.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/file_utils.py deleted file mode 100644 index 7b7e516cc61f759267f4ef09309ff0b45110a0c1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/file_utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re - - -def read_lists(list_file): - lists = [] - with open(list_file, 'r', encoding='utf8') as fin: - for line in fin: - lists.append(line.strip()) - return lists - - -def read_non_lang_symbols(non_lang_sym_path): - """read non-linguistic symbol from file. - - The file format is like below: - - {NOISE}\n - {BRK}\n - ... - - - Args: - non_lang_sym_path: non-linguistic symbol file path, None means no any - syms. 
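The core loop in `Executor.train` above (loss scaled by `accum_grad`, gradient clipping, optimizer stepped only once per accumulation window) reduces to roughly the following standalone sketch, with the DDP and AMP handling omitted and illustrative shapes:

```python
import torch

model = torch.nn.Linear(8, 2)                     # stand-in model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
accum_grad, clip = 4, 50.0

for batch_idx in range(8):
    feats, target = torch.randn(16, 8), torch.randn(16, 2)   # fake batch
    loss = torch.nn.functional.mse_loss(model(feats), target) / accum_grad
    loss.backward()                               # gradients accumulate
    if (batch_idx + 1) % accum_grad == 0:
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        if torch.isfinite(grad_norm):
            optimizer.step()
        optimizer.zero_grad()
```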
- - """ - if non_lang_sym_path is None: - return None - else: - syms = read_lists(non_lang_sym_path) - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - for sym in syms: - if non_lang_syms_pattern.fullmatch(sym) is None: - class BadSymbolFormat(Exception): - pass - raise BadSymbolFormat( - "Non-linguistic symbols should be " - "formatted in {xxx}//[xxx], consider" - " modify '%s' to meet the requirment. " - "More details can be found in discussions here : " - "https://github.com/wenet-e2e/wenet/pull/819" % (sym)) - return syms - - -def read_symbol_table(symbol_table_file): - symbol_table = {} - with open(symbol_table_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - symbol_table[arr[0]] = int(arr[1]) - return symbol_table diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/init_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/init_model.py deleted file mode 100644 index 4a008183ee25cd88b2fa25d93bdc3f9e3a55d31a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/init_model.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
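A small standalone sketch (made-up entries) of the `"<token> <id>"` dictionary format that `read_symbol_table` above expects, one pair per line:

```python
# Each line of the dict file maps a modelling unit to its integer id
# (entries here are made up for illustration).
lines = ["<blank> 0", "<unk> 1", "你 2", "好 3", "<sos/eos> 4"]

symbol_table = {}
for line in lines:
    token, idx = line.strip().split()
    symbol_table[token] = int(idx)

print(symbol_table["<blank>"], symbol_table["<sos/eos>"])     # 0 4
```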
- -import torch -from wenet.transducer.joint import TransducerJoint -from wenet.transducer.predictor import (ConvPredictor, EmbeddingPredictor, - RNNPredictor) -from wenet.transducer.transducer import Transducer -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.cmvn import GlobalCMVN -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.encoder import ConformerEncoder, TransformerEncoder -from wenet.squeezeformer.encoder import SqueezeformerEncoder -from wenet.efficient_conformer.encoder import EfficientConformerEncoder -from wenet.utils.cmvn import load_cmvn - - -def init_model(configs): - if configs['cmvn_file'] is not None: - mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) - global_cmvn = GlobalCMVN( - torch.from_numpy(mean).float(), - torch.from_numpy(istd).float()) - else: - global_cmvn = None - - input_dim = configs['input_dim'] - vocab_size = configs['output_dim'] - - encoder_type = configs.get('encoder', 'conformer') - decoder_type = configs.get('decoder', 'bitransformer') - - if encoder_type == 'conformer': - encoder = ConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'squeezeformer': - encoder = SqueezeformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'efficientConformer': - encoder = EfficientConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf'], - **configs['encoder_conf']['efficient_conf'] - if 'efficient_conf' in - configs['encoder_conf'] else {}) - else: - encoder = TransformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - if decoder_type == 'transformer': - decoder = TransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - else: - assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 - assert configs['decoder_conf']['r_num_blocks'] > 0 - decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - ctc = CTC(vocab_size, encoder.output_size()) - - # Init joint CTC/Attention or Transducer model - if 'predictor' in configs: - predictor_type = configs.get('predictor', 'rnn') - if predictor_type == 'rnn': - predictor = RNNPredictor(vocab_size, **configs['predictor_conf']) - elif predictor_type == 'embedding': - predictor = EmbeddingPredictor(vocab_size, - **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - elif predictor_type == 'conv': - predictor = ConvPredictor(vocab_size, **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - else: - raise NotImplementedError( - "only rnn, embedding and conv type support now") - configs['joint_conf']['enc_output_size'] = configs['encoder_conf'][ - 'output_size'] - configs['joint_conf']['pred_output_size'] = configs['predictor_conf'][ - 'output_size'] - joint = TransducerJoint(vocab_size, **configs['joint_conf']) - model = Transducer(vocab_size=vocab_size, - blank=0, - predictor=predictor, - encoder=encoder, - attention_decoder=decoder, - joint=joint, - ctc=ctc, - **configs['model_conf']) - else: - model = ASRModel(vocab_size=vocab_size, - encoder=encoder, - decoder=decoder, - ctc=ctc, - **configs['model_conf']) - return model diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/mask.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/mask.py deleted file mode 100644 index 2985006ab2bc2d27a9b8adaeb863cc44ca6a0d24..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/mask.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -''' -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. - - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - ret = torch.ones(size, size, device=device, dtype=torch.bool) - return torch.tril(ret) -''' - -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. 
- - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - arange = torch.arange(size, device=device) - mask = arange.expand(size, size) - arange = arange.unsqueeze(-1) - mask = mask <= arange - return mask - - -def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size) with chunk size, - this is for streaming encoder - - Args: - size (int): size of mask - chunk_size (int): size of chunk - num_left_chunks (int): number of left chunks - <0: use full chunk - >=0: use num_left_chunks - device (torch.device): "cpu" or "cuda" or torch.Tensor.device - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_chunk_mask(4, 2) - [[1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]] - """ - ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) - ret[i, start:ending] = True - return ret - - -def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, - use_dynamic_chunk: bool, - use_dynamic_left_chunk: bool, - decoding_chunk_size: int, static_chunk_size: int, - num_decoding_left_chunks: int): - """ Apply optional mask for encoder. - - Args: - xs (torch.Tensor): padded input, (B, L, D), L for max length - mask (torch.Tensor): mask for xs, (B, 1, L) - use_dynamic_chunk (bool): whether to use dynamic chunk or not - use_dynamic_left_chunk (bool): whether to use dynamic left chunk for - training. - decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - static_chunk_size (int): chunk size for static chunk training/decoding - if it's greater than 0, if use_dynamic_chunk is true, - this parameter will be ignored - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. - >=0: use num_decoding_left_chunks - <0: use all left chunks - - Returns: - torch.Tensor: chunk mask of the input xs. - """ - # Whether to use chunk mask or not - if use_dynamic_chunk: - max_len = xs.size(1) - if decoding_chunk_size < 0: - chunk_size = max_len - num_left_chunks = -1 - elif decoding_chunk_size > 0: - chunk_size = decoding_chunk_size - num_left_chunks = num_decoding_left_chunks - else: - # chunk size is either [1, 25] or full context(max_len). - # Since we use 4 times subsampling and allow up to 1s(100 frames) - # delay, the maximum frame is 100 / 4 = 25. 
- chunk_size = torch.randint(1, max_len, (1, )).item() - num_left_chunks = -1 - if chunk_size > max_len // 2: - chunk_size = max_len - else: - chunk_size = chunk_size % 25 + 1 - if use_dynamic_left_chunk: - max_left_chunks = (max_len - 1) // chunk_size - num_left_chunks = torch.randint(0, max_left_chunks, - (1, )).item() - chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - elif static_chunk_size > 0: - num_left_chunks = num_decoding_left_chunks - chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - else: - chunk_masks = masks - return chunk_masks - - -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: Mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - batch_size = lengths.size(0) - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = torch.arange(0, - max_len, - dtype=torch.int64, - device=lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand - return mask - - -def make_non_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """Make mask tensor containing indices of non-padded part. - - The sequences in a batch may have different lengths. To enable - batch computing, padding is need to make all sequence in same - size. To avoid the padding part pass value to context dependent - block such as attention or convolution , this padding part is - masked. - - This pad_mask is used in both encoder and decoder. - - 1 for non-padded part and 0 for padded part. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] - """ - return ~make_pad_mask(lengths) - - -def mask_finished_scores(score: torch.Tensor, - flag: torch.Tensor) -> torch.Tensor: - """ - If a sequence is finished, we only allow one alive branch. This function - aims to give one branch a zero score and the rest -inf score. - - Args: - score (torch.Tensor): A real value array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size, beam_size). 
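The padding mask built by `make_pad_mask` above can be reproduced with a couple of broadcast comparisons; a standalone sketch using the lengths from its docstring (not part of this patch):

```python
import torch

lengths = torch.tensor([5, 3, 2])
max_len = int(lengths.max())

seq_range = torch.arange(max_len).unsqueeze(0)    # (1, max_len)
pad_mask = seq_range >= lengths.unsqueeze(1)      # (B, max_len), True on padding
print(pad_mask.int())
# tensor([[0, 0, 0, 0, 0],
#         [0, 0, 0, 1, 1],
#         [0, 0, 1, 1, 1]], dtype=torch.int32)
# make_non_pad_mask is simply the logical negation: ~pad_mask
```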
- """ - beam_size = score.size(-1) - zero_mask = torch.zeros_like(flag, dtype=torch.bool) - if beam_size > 1: - unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])), - dim=1) - finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])), - dim=1) - else: - unfinished = zero_mask - finished = flag - score.masked_fill_(unfinished, -float('inf')) - score.masked_fill_(finished, 0) - return score - - -def mask_finished_preds(pred: torch.Tensor, flag: torch.Tensor, - eos: int) -> torch.Tensor: - """ - If a sequence is finished, all of its branch should be - - Args: - pred (torch.Tensor): A int array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size). - """ - beam_size = pred.size(-1) - finished = flag.repeat([1, beam_size]) - return pred.masked_fill_(finished, eos) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/scheduler.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/scheduler.py deleted file mode 100644 index c418a731dec0041a238787bbba23102dba8db5e5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/multi_cn/s0/wenet/utils/scheduler.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -# NeMo(https://github.com/NVIDIA/NeMo) - -from typing import Union - -import math -import warnings -import torch -from torch.optim.lr_scheduler import _LRScheduler - -from typeguard import check_argument_types - - -class WarmupLR(_LRScheduler): - """The WarmupLR scheduler - - This scheduler is almost same as NoamLR Scheduler except for following - difference: - - NoamLR: - lr = optimizer.lr * model_size ** -0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - WarmupLR: - lr = optimizer.lr * warmup_step ** 0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - - Note that the maximum lr equals to optimizer.lr in this scheduler. 
- - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_steps: Union[int, float] = 25000, - last_epoch: int = -1, - ): - assert check_argument_types() - self.warmup_steps = warmup_steps - - # __init__() must be invoked before setting field - # because step() is also invoked in __init__() - super().__init__(optimizer, last_epoch) - - def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" - - def get_lr(self): - step_num = self.last_epoch + 1 - if self.warmup_steps == 0: - return [ - lr * step_num ** -0.5 - for lr in self.base_lrs - ] - else: - return [ - lr - * self.warmup_steps ** 0.5 - * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) - for lr in self.base_lrs - ] - - def set_step(self, step: int): - self.last_epoch = step - - -class WarmupPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1): - assert not (warmup_steps is not None and warmup_ratio is not None),\ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class SquareRootConstantPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use particular number of step or ratio" - assert constant_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. 
- self.max_steps = max_steps - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.constant_lr = 1 / (constant_steps ** 0.5) - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.constant_steps: - return [self.constant_lr for _ in self.base_lrs] - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class WarmupHoldPolicy(WarmupPolicy): - """Variant of WarmupPolicy which maintains high - learning rate for a defined number of steps. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - hold_steps=None, - hold_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (hold_steps is not None and hold_ratio is not None), \ - "Either use particular number of step or ratio" - assert hold_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - self.min_lr = min_lr - self._last_warmup_lr = 0.0 - - # Necessary to duplicate as class attributes are hidden in inner class - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if hold_steps is not None: - self.hold_steps = hold_steps + self.warmup_steps - elif hold_ratio is not None: - self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps - else: - self.hold_steps = 0 - - super().__init__( - optimizer, - warmup_steps=warmup_steps, - warmup_ratio=warmup_ratio, - max_steps=max_steps, - last_epoch=last_epoch, - min_lr=min_lr, - ) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed by the scheduler," - " " "please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup phase - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - # Hold phase - if (step >= self.warmup_steps) and (step < self.hold_steps): - return self.base_lrs - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - -class WarmupAnnealHoldPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - min_lr: Minimum lr to hold the learning rate after decay at. - constant_steps: Number of steps to keep lr constant at. 
- constant_ratio: Ratio of steps to keep lr constant. - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - constant_steps=None, - constant_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use constant_steps or constant_ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup steps - if self.warmup_steps > 0 and step <= self.warmup_steps: - return self._get_warmup_lr(step) - - # Constant steps after warmup and decay - if self.constant_steps > 0 and ( - self.warmup_steps + self.decay_steps) < step <= self.max_steps: - return self._get_constant_lr(step) - - # Min lr after max steps of updates - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_constant_lr(self, step): - return [self.min_lr for _ in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -def _squareroot_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 0.5 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _square_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 2 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _cosine_annealing(initial_lr, step, max_steps, min_lr): - mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) - out_lr = (initial_lr - min_lr) * mult + min_lr - return out_lr - - -def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, - decay_steps, min_lr): - assert max_lr > min_lr - # Use linear warmup for the initial part. - if warmup_steps > 0 and step <= warmup_steps: - return max_lr * float(step) / float(warmup_steps) - - # For any steps larger than `decay_steps`, use `min_lr`. - if step > warmup_steps + decay_steps: - return min_lr - - # If we are done with the warmup period, use the decay style. 
- num_steps_ = step - warmup_steps - decay_steps_ = decay_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = max_lr - min_lr - - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - - return min_lr + coeff * delta_lr - - -def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): - if cycle: - multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) - decay_steps *= multiplier - else: - step = min(step, decay_steps) - p = step / decay_steps - lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) - lr += min_lr - return lr - - -def _noam_hold_annealing(initial_lr, step, warmup_steps, - hold_steps, decay_rate, min_lr): - # hold_steps = total number of steps - # to hold the LR, not the warmup + hold steps. - T_warmup_decay = max(1, warmup_steps ** decay_rate) - T_hold_decay = max(1, (step - hold_steps) ** decay_rate) - lr = (initial_lr * T_warmup_decay) / T_hold_decay - lr = max(lr, min_lr) - return lr - - -class SquareAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _square_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class SquareRootAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _squareroot_annealing(initial_lr=initial_lr, step=step, - max_steps=self.max_steps, min_lr=self.min_lr) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - if self.constant_steps is None or self.constant_steps == 0: - new_lrs = [ - _cosine_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - else: - new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) - return new_lrs - - def _get_warmup_lr(self, step): - if self.constant_steps is None or self.constant_steps == 0: - return super()._get_warmup_lr(step) - else: - # Use linear warmup for the initial part. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_constant_lr(self, step): - # Only called when `constant_steps` > 0. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_linear_warmup_with_cosine_annealing_lr(self, step): - # Cosine Schedule for Megatron LM, - # slightly different warmup schedule + constant LR at the end. 
- new_lrs = [ - _linear_warmup_with_cosine_annealing( - max_lr=self.base_lrs[0], - warmup_steps=self.warmup_steps, - step=step, - decay_steps=self.decay_steps, - min_lr=self.min_lr, - ) - for _ in self.base_lrs - ] - return new_lrs - - -class NoamAnnealing(_LRScheduler): - def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - self._normalize = d_model ** (-0.5) - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = max(1, self.last_epoch) - - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - new_lrs = [self._noam_annealing(initial_lr=initial_lr, step=step) for - initial_lr in self.base_lrs] - return new_lrs - - def _noam_annealing(self, initial_lr, step): - if self.warmup_steps > 0: - mult = self._normalize * min(step ** (-0.5), - step * (self.warmup_steps ** (-1.5))) - else: - mult = self._normalize * step ** (-0.5) - - out_lr = initial_lr * mult - if step > self.warmup_steps: - out_lr = max(out_lr, self.min_lr) - return out_lr - - -class NoamHoldAnnealing(WarmupHoldPolicy): - def __init__(self, optimizer, *, max_steps, decay_rate=0.5, min_lr=0.0, - last_epoch=-1, **kwargs): - """ - From Nemo: - Implementation of the Noam Hold Annealing policy - from the SqueezeFormer paper. - - Unlike NoamAnnealing, the peak learning rate - can be explicitly set for this scheduler. - The schedule first performs linear warmup, - then holds the peak LR, then decays with some schedule for - the remainder of the steps. - Therefore the min-lr is still dependent - on the hyper parameters selected. - - It's schedule is determined by three factors- - - Warmup Steps: Initial stage, where linear warmup - occurs uptil the peak LR is reached. Unlike NoamAnnealing, - the peak LR is explicitly stated here instead of a scaling factor. - - Hold Steps: Intermediate stage, where the peak LR - is maintained for some number of steps. In this region, - the high peak LR allows the model to converge faster - if training is stable. However the high LR - may also cause instability during training. - Should usually be a significant fraction of training - steps (around 30-40% of the entire training steps). - - Decay Steps: Final stage, where the LR rapidly decays - with some scaling rate (set by decay rate). - To attain Noam decay, use 0.5, - for Squeezeformer recommended decay, use 1.0. - The fast decay after prolonged high LR during - hold phase allows for rapid convergence. 
- - References: - - [Squeezeformer: - An Efficient Transformer for Automatic Speech Recognition] - (https://arxiv.org/abs/2206.00888) - - Args: - optimizer: Pytorch compatible Optimizer object. - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - decay_rate: Float value describing the polynomial decay - after the hold period. Default value - of 0.5 corresponds to Noam decay. - min_lr: Minimum learning rate. - """ - self.decay_rate = decay_rate - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - if self.warmup_steps is None or self.warmup_steps == 0: - raise ValueError( - "Noam scheduler cannot be used without warmup steps") - - if self.hold_steps > 0: - hold_steps = self.hold_steps - self.warmup_steps - else: - hold_steps = 0 - - new_lrs = [ - _noam_hold_annealing( - initial_lr, - step=step, - warmup_steps=self.warmup_steps, - hold_steps=hold_steps, - decay_rate=self.decay_rate, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - def set_step(self, step: int): - self.last_epoch = step diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/README.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/README.md deleted file mode 100644 index 7b904dbfb7af31ac08fccc27095631ece8173762..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/README.md +++ /dev/null @@ -1,44 +0,0 @@ -# w2v-conformer based end-to-end model for Openasr2021 challenge - -This is a example to use unsupervised pretrained w2v-conformer model to fintune [OpenASR2021](https://www.nist.gov/itl/iad/mig/openasr-challenge) constrained-plus tasks. - -We pretrain conformer encoders using wav2vec 2.0 pre-training method , which we called ch-w2v-conformer. The original pre-training works take raw waveforms -as input. Unlike these works, we use MFCC features as inputs. - -The ch-w2v-conformer model uses following datasets to pretrain: - -ISML datasets (6 languages,70k hours): internal dataset contains 40k hours Chinese, Cantonese, Tibetan, Inner Mongolian, Inner Kazakh, Uighur. - -Babel datasets (17 languages, 2k hours): Assamese, Bengali, Cantonese, Cebuano, Georgian, Haitian, Kazakh, Kurmanji, Lao, Pashto, Swahili, Tagalog, Tamil, Tok, Turkish, Vietnamese, Zulu - -After pretraining, we build ASR system based on CTC-Attention structure. In very low resource task, we find that if too many initialization network structures are constructed in the upper layer of pre-training conformer encoder, the migration performance of the pre-training model will be destroyed, so we only build a single-layer transformer decoder for joint training. 
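
For reference, the three-phase behaviour of the `NoamHoldAnnealing` scheduler defined above (linear warmup, peak-LR hold, then polynomial decay) can be traced with a small standalone sketch. The function below re-derives the schedule from the deleted class; the hyperparameter values in the demo loop are arbitrary and chosen only for illustration.

```python
def noam_hold_lr(step, base_lr, warmup_steps, hold_steps, max_steps,
                 decay_rate=0.5, min_lr=0.0):
    """Illustrative re-derivation of the warmup -> hold -> decay schedule
    implemented by NoamHoldAnnealing/WarmupHoldPolicy above.
    `hold_steps` counts only the hold phase, as in the class constructor."""
    total_hold = warmup_steps + hold_steps
    if warmup_steps > 0 and step <= warmup_steps:
        # Linear warmup, matching WarmupPolicy._get_warmup_lr
        return base_lr * (step + 1) / (warmup_steps + 1)
    if step < total_hold:
        # Hold the peak learning rate
        return base_lr
    if step > max_steps:
        return min_lr
    # Polynomial decay, matching _noam_hold_annealing
    t_warm = max(1, warmup_steps ** decay_rate)
    t_hold = max(1, (step - hold_steps) ** decay_rate)
    return max(base_lr * t_warm / t_hold, min_lr)

if __name__ == "__main__":
    # Arbitrary demo values: peak LR 1e-3, 1k warmup, 4k hold, 100k total steps
    for s in (0, 500, 1000, 5000, 20000, 80000):
        print(s, round(noam_hold_lr(s, base_lr=1e-3, warmup_steps=1000,
                                    hold_steps=4000, max_steps=100000), 6))
```

At the warmup/hold boundary the two branches meet at the peak LR, which is why the decay term is offset by the hold length alone rather than by warmup plus hold.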
- -pretrained model link: https://huggingface.co/emiyasstar/ch-w2v-conformer - - -## constrained-plus Task Performance - -* Languages: Cantonese,mongolian,kazakh -* config: conf/train_conformer_large_10h.yaml -* Feature info: using mfcc feature, with dither 1.0, without cmvn -* Training info: lr 0.001, batch size 10, 4 gpus on V100, acc_grad 1, 80 epochs -* Decoding info: ctc_weight 0.5, average_num 35 - -dev set results trained only with 10 hours training set - -## w2v-Conformer - -| decoding_method | Cantonese(CER) | mongolian(WER) | -|:-------------------:|:----:|:----:| -| ctc_greedy_search | 31.46 | 53.64 | -| ctc_prefix_search | 31.47 | 53.50 | -| attention_rescoring | 31.45 | 52.96 | - -## Conformer (train from scratch) - - -| decoding_method | Cantonese(CER) | mongolian(WER) | -|:-------------------:|----:|:----:| -| ctc_greedy_search | 61.43 | 89.38 | -| ctc_prefix_search | 61.37 | 89.53| -| attention_rescoring | 60.61 | 89.60| diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/conf/lang.conf b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/conf/lang.conf deleted file mode 100644 index 5177c06e6940b9ba059bd316d65dd73da617f789..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/conf/lang.conf +++ /dev/null @@ -1,38 +0,0 @@ -# A giant configurations file for all the BABEL languages -# as well as some training configurations for training HMM-GMM systems -# for obtaining phoneme level alignments if you really want to do that -# All paths starting with /export/* are set for the JHU/CLSP grid and shoudl -# be changed appropriately for other users - -# Cantonese -train_data_dir_101=/train/asr/feat/mfcchires/openasr/cantonese/openasr21_cantonese/build -train_data_list_101=/train/asr/feat/mfcchires/openasr/cantonese/openasr21_cantonese/build/ct_train_openasr21_uniq -train_data_dir_101_FLP=/export/babel/data/101-cantonese/release-current/conversational/training -train_data_list_101_FLP=./conf/lists/101-cantonese/train.FullLP.list -dev10h_data_dir_101=/train/asr/feat/mfcchires/openasr/cantonese/openasr21_cantonese/dev -dev10h_data_list_101=/train/asr/feat/mfcchires/openasr/cantonese/openasr21_cantonese/dev/ct_dev_openasr21_uniq -lexicon_file_101=/train/asr/feat/mfcchires/openasr/cantonese/openasr21_cantonese/build/reference_materials/lexicon.txt -lexiconFlags_101="--romanized --oov " - - -# Kazakh -train_data_dir_302=/export/babel/data/302-kazakh/IARPA-babel302b-v1.0a-build/BABEL_OP2_302/conversational/training -train_data_list_302=./conf/lists/302-kazakh/sub-train.list -train_data_dir_302_FLP=/export/babel/data/302-kazakh/IARPA-babel302b-v1.0a-build/BABEL_OP2_302/conversational/training -train_data_list_302_FLP=./conf/lists/302-kazakh/training.list -dev10h_data_dir_302=/export/babel/data/302-kazakh/IARPA-babel302b-v1.0a-build/BABEL_OP2_302/conversational/dev -dev10h_data_list_302=./conf/lists/302-kazakh/dev.list -lexicon_file_302=/export/babel/data/302-kazakh/IARPA-babel302b-v1.0a-build/BABEL_OP2_302/conversational/reference_materials/lexicon.sub-train.txt -lexiconFlags_302="--romanized --oov " - -#mongolian -train_data_dir_401=/train/asr/feat/mfcchires/openasr/mongolian/openasr21_mongolian/build -train_data_list_401=/train/asr/feat/mfcchires/openasr/mongolian/openasr21_mongolian/build/mn_train_openasr21 -dev10h_data_dir_401=/train/asr/feat/mfcchires/openasr/mongolian/openasr21_mongolian/dev 
-dev10h_data_list_401=/train/asr/feat/mfcchires/openasr/mongolian/openasr21_mongolian/dev/mn_dev_openasr21 -lexicon_file_401=/train/asr/feat/mfcchires/openasr/mongolian/openasr21_mongolian/build/reference_materials/lexicon.txt -lexiconFlags_401="--romanized --oov " - - -oovSymbol="" -lexiconFlags="--oov " diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/conf/train_conformer_large_10h.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/conf/train_conformer_large_10h.yaml deleted file mode 100644 index cb3312c13cd2bc9d294506f2a1429e74c9f0c93c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/conf/train_conformer_large_10h.yaml +++ /dev/null @@ -1,86 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 512 # dimension of attention - attention_heads: 8 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 24 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.0 - attention_dropout_rate: 0.0 - input_layer: conv2d6 # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - macaron_style: True - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - cnn_module_norm: 'layer_norm' - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 1024 - num_blocks: 1 - dropout_rate: 0.1 - positional_dropout_rate: 0.0 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.7 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -raw_wav: True - -# dataset related -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - feats_type: mfcc - mfcc_conf: - num_mel_bins: 40 - frame_shift: 10 - frame_length: 25 - num_ceps: 40 - low_freq: 20 - high_freq: -400 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 10 - -grad_clip: 5 -accum_grad: 1 -max_epoch: 100 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.0004 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 15000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/local/dump_wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/local/dump_wav.sh deleted file mode 100644 index 0f9df4c7a2b2bd59542dadcf07131fb17f52effc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/local/dump_wav.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash -# dumps such pipe-style-wav to real audio file -nj=1 -. tools/parse_options.sh || exit 1; - -inscp=$1 -segments=$2 -outscp=$3 -data=$(dirname ${inscp}) -if [ $# -eq 4 ]; then - logdir=$4 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -sox=`which sox` -[ ! 
-x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -paste -d " " <(cut -f 1 -d " " $inscp) <(cut -f 2- -d " " $inscp | tr -t " " "#") \ - > $data/wav_ori.scp - -tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp -sed -i 's/ /,/g' $data/wav_segments.scp -sed -i 's/#/ /g' $data/wav_segments.scp - -rm -f $logdir/wav_*.slice -rm -f $logdir/*.log -split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - mkdir -p ${data}/wavs/${name} - cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \ - -v logdir=$logdir -v name=$name '{ - during=$4-$3 - cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during; - system(cmd) - printf("%s %s/%s.wav\n", $1, data, $1); - }' | \ - sort > ${data}/wavs_${name}.scp || exit 1; -} & -done -wait -cat ${data}/wavs_*.scp > $outscp -rm ${data}/wavs_*.scp - -rm -f $data/{segments,wav_segments.scp,reco2file_and_channel,reco2dur} -tools/fix_data_dir.sh $data diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/local/make_absolute.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/local/make_absolute.sh deleted file mode 100644 index 8936bdaea78972dea9261a041c67dfa5ee41eca6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/local/make_absolute.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -# This script replaces the command readlink -f (which is not portable). -# It turns a pathname into an absolute pathname, including following soft links. -target_file=$1 - -cd $(dirname $target_file) -target_file=$(basename $target_file) - -# Iterate down a (possible) chain of symlinks -while [ -L "$target_file" ]; do - target_file=$(readlink $target_file) - cd $(dirname $target_file) - target_file=$(basename $target_file) -done - -# Compute the canonicalized name by finding the physical path -# for the directory we're in and appending the target file. -phys_dir=$(pwd -P) -result=$phys_dir/$target_file -echo $result diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/local/make_corpus_subset.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/local/make_corpus_subset.sh deleted file mode 100644 index d5bfbb30ae071dcb0ecdb02ec7a654d738eeef5f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/local/make_corpus_subset.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Yenda Trmal) -# Apache 2.0. - -#Begin configuration -ignore_missing_txt=false #If the reference transcript txt is missing, \ - #shall we ignore it or treat it as a fatal error? -#End configuration -echo "$0 $@" # Print the command line for logging - -help_message="$0: create subset of the input directory (specified as the first directory). - The subset is specified by the second parameter. - The directory in which the subset should be created is the third parameter - Example: - $0 " - -[ -f ./path.sh ] && . ./path.sh; # source the path. -. parse_options.sh || exit 1; - -if [[ "$#" -ne "3" ]] ; then - echo -e "FATAL: wrong number of script parameters!\n\n" - printf "$help_message\n\n" - exit 1; -fi - -input_data_dir=$1 -input_data_list=$2 -output_data_dir=$3 - -if [[ ! 
-d "$input_data_dir" ]] ; then - echo "FATAL: input data directory does not exist"; - exit 1; -fi -if [[ ! -f "$input_data_list" ]] ; then - echo "FATAL: input data list file does not exist!"; - exit 1; -fi - -mkdir -p $output_data_dir/transcription -mkdir -p $output_data_dir/audio - -abs_src_dir=`local/make_absolute.sh $input_data_dir` -abs_tgt_dir=`local/make_absolute.sh $output_data_dir` - -echo "Making subset..." -for file_basename in `cat $input_data_list`; do - echo $file_basename - if [[ -e $abs_src_dir/audio/$file_basename.sph ]] ; then - ln -sf $abs_src_dir/audio/$file_basename.sph $abs_tgt_dir/audio || exit 1 - else - if [[ -e $abs_src_dir/audio/$file_basename.wav ]] ; then - ln -sf $abs_src_dir/audio/$file_basename.wav $abs_tgt_dir/audio || exit 1 - else - echo "File $abs_src_dir/audio/$file_basename.sph|wav does not exist!" - exit 1 - fi - fi - - if [[ -e $abs_src_dir/transcription/$file_basename.txt ]] ; then - ln -sf $abs_src_dir/transcription/$file_basename.txt $abs_tgt_dir/transcription || exit 1 - else - echo "File $abs_src_dir/transcription/$file_basename.txt does not exist!" - - if ! $ignore_missing_txt ; then - exit 1; - fi - fi -done - - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/local/prepare_acoustic_training_data.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/local/prepare_acoustic_training_data.pl deleted file mode 100644 index aeb57c4901eb23c21b0cec68e4afd0cf9a75b1b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/local/prepare_acoustic_training_data.pl +++ /dev/null @@ -1,477 +0,0 @@ -#!/usr/bin/env perl -use Getopt::Long; - -######################################################################## -# -# Script to prepare the Babel acoustic training data for Kaldi. -# -# - Place transcripts in a file named "text" -# Each line contains: utteranceID word1 word2 ... -# -# - Place the utterance-to-speaker map in a file named "utt2spk" -# Each line contains: utteranceID speakerID -# speakerID MUST BE be a prefix of the utteranceID -# Kaldi code does not require it, but some training scripts do. -# -# - Place the utterance-to-segment map in a file named "segments" -# Each line contains: utteranceID recordingID startTime endTime -# -# - Place the recordingID-to-waveformFile map in "wav.scp" -# Each line contains: recordingIB Input_pipe_for_reading_waveform| -# -# - Place the speaker-utterance map in a file named "spk2utt" -# Each line contains: speakerID utteranceID_1 utteranceID_2 ... -# This is the inverse of the utt2spk mapping -# -# Note 1: the utteranceIDs in the first 3 files must match exactly, and -# the recordingIDSs in the last 2 files must match exactly. -# -# Note 2: Babel data formats and file-naming conventions are assumed. -# -# - The transcriptions and waveforms are in subdirectories named -# audio/.sph -# transcription/.txt -# There is 1 pair of files per recording, with extensions as above -# -# - The audio is in NIST sphere format, so shp2pipe may be used, e.g. -# BABEL_BP_101_11694_20111204_205320_inLine \ -# /export/babel/sanjeev/kaldi-trunk/tools/sph2pipe_v2.5/sph2pipe \ -# -f wav -p -c 1 \ -# BABEL_BP_101_11694_20111204_205320_inLine.sph| -# -# - The filename contains speaker information, e.g. 
-# BABEL_BP_101_37210_20111102_170037_O1_scripted.sph -> 37210_A -# BABEL_BP_101_37210_20111102_172955_inLine.sph -> 37210_A -# BABEL_BP_101_37210_20111102_172955_outLine.sph -> 37210_B -# Specifically, the inLine speaker is the same as scripted -# -# - The transcription file has time marks in square brackets, e.g. -# [0.0] -# -# [7.05] -# 啊 听 听唔听到 啊 你 而家 仲未 上课 系 嘛 -# [14.07] -# -# - If a vocabulary is provided, map all OOV tokens to an OOV symbol, -# and write out an OOV list with counts to a file named "oovCounts" -# -# If one or more word-fragment markers are provided, this script -# checks if an OOV token can be made in-vocabulary by stripping off -# the markers one by one from either end of the token. -# -# The default settings are -# - $vocabFile = ""; # No vocab file; nothing is mapped to OOV - $OOV_symbol = ""; # Default OOV symbol - $fragMarkers = ""; # No characters are word-fragment markers -# -# - Babel transcriptions contain 4 kinds of untranscribed words -# -# (()) designates unintelligible words -# designates a word in another language -# designates a sequence of pre-recorded words -# designates two simultaneous foreground speakers -# -# This script maps them to OOV. They are not included in oovCounts -# -# - Babel transcriptions also contain a few non-linguistics tokens -# -# map to a vocal noise symbol -# map to a vocal noise symbol -# map to a vocal noise symbol -# map to a vocal noise symbol -# -# map to a nonvocal noise symbol -# map to a nonvocal noise symbol -# map to a nonvocal noise symbol -# map to a nonvocal noise symbol -# -# designates silence > 1 sec. -# - $vocalNoise = ""; - $nVoclNoise = ""; - $silence = ""; - $icu_transform=""; -# -######################################################################## - -GetOptions("fragmentMarkers=s" => \$fragMarkers, - "oov=s" => \$OOV_symbol, - "vocab=s" => \$vocabFile, - "icu-transform=s" => \$icu_transform - ); - -if ($#ARGV == 1) { - $inDir = $ARGV[0]; - $outDir = $ARGV[1]; - print STDERR ("$0: $inDir $outDir\n"); - if($vocabFile) { - print STDERR ("\tLimiting transcriptions to words in $vocabFile\n"); - print STDERR ("\tMapping OOV tokens to \"$OOV_symbol\"\n"); - print STDERR ("\tif they remain OOV even after removing [$fragMarkers] from either end\n") if ($fragMarkers); - } - print STDERR ("$0 ADVICE: Use full path for the Input Directory\n") unless ($inDir=~m:^/:); -} else { - print STDERR ("Usage: $0 [--options] InputDir OutputDir\n"); - print STDERR ("\t--vocab File containing the permitted vocabulary\n"); - print STDERR ("\t--oov Use this symbol for OOV words (default )\n"); - print STDERR ("\t--fragmentMarkers Remove these from ends of words to minimize OOVs (default none)\n"); - exit(1); -} - -######################################################################## -# Read and save the vocabulary and map anything not in the vocab -######################################################################## - -if ($vocabFile) { - open (VOCAB, $vocabFile) - || die "Unable to open vocabulary file $vocabFile"; - $numWords = 0; - while () { - next unless (m:^([^\s]+):); - $numWords++ unless (exists $inVocab{$1}); # Don't count word repetitions - $inVocab{$1} = 1; # commonly found in lexicons - } - close(VOCAB); - print STDERR ("Read $numWords unique words from $vocabFile\n"); -} - -######################################################################## -# First read segmentation information from all the transcription files -######################################################################## - -$TranscriptionDir 
= "$inDir/transcription"; -if (-d $TranscriptionDir) { - @TranscriptionFiles = `ls ${TranscriptionDir}/*.txt`; - if ($#TranscriptionFiles >= 0) { - printf STDERR ("$0: Found %d .txt files in $TranscriptionDir\n", ($#TranscriptionFiles +1)); - $numFiles = $numUtterances = $numWords = $numOOV = $numSilence = 0; - while ($filename = shift @TranscriptionFiles) { - $fileID = $filename; # To capture the base file name - $fileID =~ s:.+/::; # remove path prefix - $fileID =~ s:\.txt\s*$::; # remove file extension - # For each transcription file, extract and save segmentation data - $numUtterancesThisFile = 0; - $prevTimeMark = -1.0; - $text = ""; - if ( $icu_transform ) { - $inputspec="uconv -f utf8 -t utf8 -x \"$icu_transform\" $filename |"; - } else { - $inputspec=$filename; - } - open (TRANSCRIPT, $inputspec) || die "Unable to open $filename"; - while ($line=) { - chomp $line; - if ($line =~ m:^\[([0-9]+\.*[0-9]*)\]$:) { - $thisTimeMark = $1; - if ($thisTimeMark < $prevTimeMark) { - print STDERR ("$0 ERROR: Found segment with negative duration in $filename\n"); - print STDERR ("\tStart time = $prevTimeMark, End time = $thisTimeMark\n"); - print STDERR ("\tThis could be a sign of something seriously wrong!\n"); - print STDERR ("\tFix the file by hand or remove it from the directory, and retry.\n"); - exit(1); - } - if ($prevTimeMark<0) { - # Record the first timemark and continue - $prevTimeMark = $thisTimeMark; - next; - } - ################################################## - # Create an utteranceID using fileID & start time - # - Assume Babel file naming conventions - # - Remove prefix: program_phase_language - # - inLine = scripted = spkr A, outLine = B - # - Move A/B so that utteranceIDs sort by spkr - # - Assume utterance start time < 10000 sec. - ################################################## - $utteranceID = $fileID; - $utteranceID =~ s:[^_]+_[^_]+_[^_]+_::; - $utteranceID =~ s:([^_]+)_(.+)_(inLine|scripted):${1}_A_${2}:; - $utteranceID =~ s:([^_]+)_(.+)_outLine:${1}_B_${2}:; - $utteranceID .= sprintf ("_%06i", (100*$prevTimeMark)); - ################################################## - # Then save segmentation, transcription, spkeaerID - ################################################## - if (exists $transcription{$utteranceID}) { - # utteranceIDs should be unique, but this one is not! 
- # Either time marks in the transcription file are bad, - # or something went wrong in generating the utteranceID - print STDERR ("$0 WARNING: Skipping duplicate utterance $utteranceID\n"); - } - elsif ($text eq "") { - # Could be due to text filtering done below - # Output information to STDOUT to enable > /dev/null - print STDOUT ("$0: Skipping empty transcription $utteranceID\n"); - } else { - $transcription{$utteranceID} = $text; - $startTime{$utteranceID} = $prevTimeMark; - $endTime{$utteranceID} = $thisTimeMark; - if ($utteranceID =~ m:([^_]+_[AB]).*:) { - $speakerID{$utteranceID} = $1; - } else { - # default: one speaker per audio file - $speakerID{$utteranceID} = $fileID; - } - $baseFileID{$utteranceID} = $fileID; - $numUtterancesThisFile++; - $numUtterances++; - $text = ""; - } - $prevTimeMark = $thisTimeMark; - } else { - @tokens = split(/\s+/, $line); - $text = ""; - while ($w = shift(@tokens)) { - # First, some Babel-specific transcription filtering - if (($w eq "")||($w eq "")||($w eq "")||($w eq "~")) { - next; - } elsif (($w eq "")||($w eq "")||($w eq "")||($w eq "")) { - $text .= " $vocalNoise"; - $numWords++; - } elsif (($w eq "")||($w eq "")||($w eq "")||($w eq "")){ - $text .= " $nVoclNoise"; - $numWords++; - } elsif (($w eq "(())")||($w eq "")||($w eq "")||($w eq "")) { - $text .= " $OOV_symbol"; - $oovCount{$w}++; - $numOOV++; - $numWords++; - } elsif ($w eq "") { - $text .= " $silence"; - $numSilence++; - } else { - # This is a just regular spoken word - if ($vocabFile && (! $inVocab{$w}) && $fragMarkers) { - # $w is a potential OOV token - # Remove fragMarkers to see if $w becomes in-vocabulary - while ($w =~ m:^(\S+[$fragMarkers]|[$fragMarkers]\S+)$:) { - if ($w =~ m:^(\S+)[$fragMarkers]$:) { - $w = $1; - last if ($inVocab{$w}); - } elsif ($w =~m:^[$fragMarkers](\S+)$:) { - $w = $1; - last if ($inVocab{$w}); - } else { - die "Logically, the program should never reach here!"; - } - } - } - # If still an OOV, replace $w by $OOV_symbol - if ($vocabFile && (! 
$inVocab{$w})) { - # $w is definitely an OOV token - if (exists $oovCount{$w}) { - $oovCount{$w}++; - } else { - $oovCount{$w} = 1; - } - $w = $OOV_symbol; - $numOOV++; - } - $text .= " $w"; - $numWords++; - } - } - $text =~ s:^\s+::; # Remove leading white space, if any - # Transcriptions must contain real words to be useful in training - $text =~ s:^(($OOV_symbol|$vocalNoise|$nVoclNoise|$silence)[ ]{0,1})+$::; - } - } - close(TRANSCRIPTION); - if ($numUtterancesThisFile>0) { - $lastTimeMarkInFile{$fileID} = $prevTimeMark; - $numUtterancesInFile{$fileID} = $numUtterancesThisFile; - $numUtterancesThisFile = 0; - } - $numFiles++; - } - print STDERR ("$0: Recorded $numUtterances non-empty utterances from $numFiles files\n"); - } else { - print STDERR ("$0 ERROR: No .txt files found $TranscriptionDir\n"); - exit(1); - } -} else { - print STDERR ("$0 ERROR: No directory named $TranscriptionDir\n"); - exit(1); -} - -######################################################################## -# Then verify existence of corresponding audio files and their durations -######################################################################## - -$AudioDir = "$inDir/audio"; -if (-d $AudioDir) { - @AudioFiles = `ls ${AudioDir}/*.sph`; - if ($#AudioFiles >= 0) { - printf STDERR ("$0: Found %d .sph files in $AudioDir\n", ($#AudioFiles +1)); - $numFiles = 0; - while ($filename = shift @AudioFiles) { - $fileID = $filename; - $fileID =~ s:.+/::; # remove path prefix - $fileID =~ s:\.sph\s*::; # remove file extension - if (exists $numUtterancesInFile{$fileID}) { - # Some portion of this file has training transcriptions - @Info = `head $filename`; - $SampleCount = -1; - $SampleRate = 8000; #default - while ($#Info>=0) { - $line = shift @Info; - $SampleCount = $1 if ($line =~ m:sample_count -i (\d+):); - $SampleRate = $1 if ($line =~ m:sample_rate -i (\d+):); - } - if ($SampleCount<0) { - # Unable to extract a valid duration from the sphere header - print STDERR ("Unable to extract duration: skipping file $filename"); - } else { - $waveformName{$fileID} = $filename; chomp $waveformName{$fileID}; - $duration{$fileID} = $SampleCount/$SampleRate; - $numFiles++; - } - } else { - # Could be due to text filtering resulting in an empty transcription - # Output information to STDOUT to enable > /dev/null - print STDOUT ("$0: No transcriptions for audio file ${fileID}.sph\n"); - } - } - print STDERR ("$0: Recorded durations from headers of $numFiles .sph files\n"); - } else { - print STDERR ("$0 NOTICE: No .sph files in $AudioDir\n"); - } - - @AudioFiles = `ls ${AudioDir}/*.wav`; - if ($#AudioFiles >= 0) { - $soxi=`which soxi` or die "Could not find soxi binary -- do you have sox installed?\n"; - chomp $soxi; - printf STDERR ("$0: Found %d .wav files in $AudioDir\n", ($#AudioFiles +1)); - print STDERR "Soxi found: $soxi\n"; - $numFiles = 0; - while ($filename = shift @AudioFiles) { - $fileID = $filename; - $fileID =~ s:.+/::; # remove path prefix - $fileID =~ s:\.wav\s*::; # remove file extension - if (exists $numUtterancesInFile{$fileID}) { - # Some portion of this file has training transcriptions - $duration = `$soxi -D $filename`; - if ($duration <=0) { - # Unable to extract a valid duration from the sphere header - print STDERR ("Unable to extract duration: skipping file $filename"); - } else { - if (exists $waveformName{$fileID} ) { - print STDERR ("$0 ERROR: duplicate fileID \"$fileID\" for files \"$filename\" and \"" . 
$waveformName{$fileID} ."\"\n"); - exit(1); - } - $waveformName{$fileID} = $filename; chomp $waveformName{$fileID}; - $duration{$fileID} = $duration; - $numFiles++; - } - } else { - # Could be due to text filtering resulting in an empty transcription - # Output information to STDOUT to enable > /dev/null - print STDOUT ("$0: No transcriptions for audio file ${fileID}.sph\n"); - } - } - print STDERR ("$0: Recorded durations from headers of $numFiles .sph files\n"); - } else { - print STDERR ("$0 NOTICE: No .wav files in $AudioDir\n"); - } - if ( $#waveformName == 0 ) { - print STDERR ("$0 ERROR: No audio files found!"); - } -} else { - print STDERR ("$0 ERROR: No directory named $AudioDir\n"); - exit(1); -} - -######################################################################## -# Now all the needed information is available. Write out the 4 files. -######################################################################## - -unless (-d $outDir) { - print STDERR ("$0: Creating output directory $outDir\n"); - die "Failed to create output directory" if (`mkdir -p $outDir`); # i.e. if the exit status is not zero. -} -print STDERR ("$0: Writing 5 output files to $outDir\n"); - -$textFileName = "$outDir/text"; -open (TEXT, "> $textFileName") || die "$0 ERROR: Unable to write text file $textFileName\n"; - -$utt2spkFileName = "$outDir/utt2spk"; -open (UTT2SPK, "> $utt2spkFileName") || die "$0 ERROR: Unable to write utt2spk file $utt2spkFileName\n"; - -$segmentsFileName = "$outDir/segments"; -open (SEGMENTS, "> $segmentsFileName") || die "$0 ERROR: Unable to write segments file $segmentsFileName\n"; - -$scpFileName = "$outDir/wav.scp"; -open (SCP, "| sort -u > $scpFileName") || die "$0 ERROR: Unable to write wav.scp file $scpFileName\n"; -my $binary=$ENV{SPH2PIPE} -$SPHBINARY ="$binary -f wav -p -c 1"; -my $SOXBINARY =`which sox` or die "Could not find the sph2pipe command"; chomp $SOXBINARY; -$SOXFLAGS ="-r 8000 -c 1 -b 16 -t wav - downsample"; - -$spk2uttFileName = "$outDir/spk2utt"; -open (SPK2UTT, "> $spk2uttFileName") || die "$0 ERROR: Unable to write spk2utt file $spk2uttFileName\n"; - -$oovFileName = "$outDir/oovCounts"; -open (OOV, "| sort -nrk2 > $oovFileName") || die "$0 ERROR: Unable to write oov file $oovFileName\n"; - -$numUtterances = $numSpeakers = $numWaveforms = 0; -$totalSpeech = $totalSpeechSq = 0.0; -foreach $utteranceID (sort keys %transcription) { - $fileID = $baseFileID{$utteranceID}; - if (exists $waveformName{$fileID}) { - # There are matching transcriptions and audio - $numUtterances++; - $totalSpeech += ($endTime{$utteranceID} - $startTime{$utteranceID}); - $totalSpeechSq += (($endTime{$utteranceID} - $startTime{$utteranceID}) - *($endTime{$utteranceID} - $startTime{$utteranceID})); - print TEXT ("$utteranceID $transcription{$utteranceID}\n"); - print UTT2SPK ("$utteranceID $speakerID{$utteranceID}\n"); - print SEGMENTS ("$utteranceID $fileID $startTime{$utteranceID} $endTime{$utteranceID}\n"); - if (exists $uttList{$speakerID{$utteranceID}}) { - $uttList{$speakerID{$utteranceID}} .= " $utteranceID"; - } else { - $numSpeakers++; - $uttList{$speakerID{$utteranceID}} = "$utteranceID"; - } - next if (exists $scpEntry{$fileID}); - $numWaveforms++; - if ($waveformName{$fileID} =~ /.*\.sph/ ) { - $scpEntry{$fileID} = "$SPHBINARY $waveformName{$fileID} |"; - } else { - $scpEntry{$fileID} = "$SOXBINARY $waveformName{$fileID} $SOXFLAGS |"; - } - } else { - print STDERR ("$0 WARNING: No audio file for transcription $utteranceID\n"); - } -} -foreach $fileID (sort keys %scpEntry) { 
- print SCP ("$fileID $scpEntry{$fileID}\n"); -} -foreach $speakerID (sort keys %uttList) { - print SPK2UTT ("$speakerID $uttList{$speakerID}\n"); -} -foreach $w (sort keys %oovCount) { - print OOV ("$w\t$oovCount{$w}\n"); -} -exit(1) unless (close(TEXT) && close(UTT2SPK) && close(SEGMENTS) && close(SCP) && close(SPK2UTT) && close(OOV)); - -print STDERR ("$0: Summary\n"); -print STDERR ("\tWrote $numUtterances lines each to text, utt2spk and segments\n"); -print STDERR ("\tWrote $numWaveforms lines to wav.scp\n"); -print STDERR ("\tWrote $numSpeakers lines to spk2utt\n"); -print STDERR ("\tHmmm ... $numSpeakers distinct speakers in this corpus? Unusual!\n") - if (($numSpeakers<($numUtterances/500.0)) || ($numSpeakers>($numUtterances/2.0))); -print STDERR ("\tTotal # words = $numWords (including $numOOV OOVs) + $numSilence $silence\n") - if ($vocabFile); -printf STDERR ("\tAmount of speech = %.2f hours (including some due to $silence)\n", $totalSpeech/3600.0); -if ($numUtterances>0) { - printf STDERR ("\tAverage utterance length = %.2f sec +/- %.2f sec, and %.2f words\n", - $totalSpeech /= $numUtterances, - sqrt(($totalSpeechSq/$numUtterances)-($totalSpeech*$totalSpeech)), - $numWords/$numUtterances); -} - -exit(0); - -######################################################################## -# Done! -######################################################################## diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/local/prepare_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/local/prepare_data.sh deleted file mode 100644 index b53c805e291851f0e82182f6a5b22f406c8311bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/local/prepare_data.sh +++ /dev/null @@ -1,124 +0,0 @@ -#!/bin/bash - -# Copyright 2018 Johns Hopkins University (Matthew Wiesner) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -# This is not necessarily the top-level run.sh as it is in other directories. see README.txt first. - -. ./conf/lang.conf -. ./path.sh -. ./cmd.sh - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe -[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; -export SPH2PIPE=$sph2pipe -sox=`which sox` -[ ! -x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -FLP=false - -. ./utils/parse_options.sh -if [ $# -ne 1 ]; then - echo >&2 "Usage: ./local/prepare_data.sh [opts] " - echo >&2 " --FLP : Use FLP training data (instead of LLP ~10h)" - exit 1 -fi - -l=$1 - -l_suffix=${l} -if $FLP; then - l_suffix=${l_suffix}_FLP -fi - -#Preparing train directories -if [ ! 
-f data/raw_train_data/.done ]; then - echo --------------------------------------------------------------------- - echo "Subsetting the TRAIN set" - echo --------------------------------------------------------------------- - train_data_dir=train_data_dir_${l_suffix} - train_data_list=train_data_list_${l_suffix} - local/make_corpus_subset.sh "${!train_data_dir}" "${!train_data_list}" ./data/raw_train_data - train_data_dir=`utils/make_absolute.sh ./data/raw_train_data` - touch data/raw_train_data/.done -fi - -#exit 0 - -#Preparing dev10 directories -if [ ! -f data/raw_dev10h_data/.done ]; then - echo --------------------------------------------------------------------- - echo "Subsetting the Dev set" - echo --------------------------------------------------------------------- - dev10h_data_dir=dev10h_data_dir_${l} - dev10h_data_list=dev10h_data_list_${l} - local/make_corpus_subset.sh "${!dev10h_data_dir}" "${!dev10h_data_list}" ./data/raw_dev10h_data - dev10h_data_dir=`utils/make_absolute.sh ./data/raw_dev10h_data` - touch data/raw_dev10h_data/.done -fi - -dev10h_data_dir=`utils/make_absolute.sh ./data/raw_dev10h_data` -train_data_dir=`utils/make_absolute.sh ./data/raw_train_data` -lexicon_file=lexicon_file_${l_suffix} - -if [[ ! -f data/train/wav.scp || data/train/wav.scp -ot "$train_data_dir" ]]; then - echo --------------------------------------------------------------------- - echo "Preparing acoustic training lists in data/train on" `date` - echo --------------------------------------------------------------------- - mkdir -p data/train.tmp - local/prepare_acoustic_training_data.pl \ - --fragmentMarkers \-\*\~ \ - $train_data_dir data/train.tmp > data/train.tmp/skipped_utts.log -fi - -if [[ ! -f data/dev10h.pem/wav.scp || data/dev10h.pem/wav.scp -ot "$dev10h_data_dir" ]]; then - echo --------------------------------------------------------------------- - echo "Preparing acoustic training lists in data/train on" `date` - echo --------------------------------------------------------------------- - mkdir -p data/dev10h.pem - local/prepare_acoustic_training_data.pl \ - --fragmentMarkers \-\*\~ \ - $dev10h_data_dir data/dev10h.pem > data/dev10h.pem/skipped_utts.log -fi - - -########################################################################### -# Prepend language ID to all utterances to disambiguate between speakers -# of different languages sharing the same speaker id. -# -# The individual lang directories can be used for alignments, while a -# combined directory will be used for training. This probably has minimal -# impact on performance as only words repeated across languages will pose -# problems and even amongst these, the main concern is the marker. 
-########################################################################### - -num_utts=$(cat data/train.tmp/segments | wc -l) -dev_utts=$((num_utts / 10)) - -./utils/subset_data_dir.sh data/train.tmp ${dev_utts} data/train_dev - -awk '{print $1}' data/train_dev/utt2spk > data/train_dev.list -awk '{print $1}' data/train.tmp/utt2spk | grep -vf data/train_dev.list > data/train.list - -./utils/subset_data_dir.sh --utt-list data/train.list data/train.tmp data/train - -echo "Prepend ${l} to data dir" -./utils/copy_data_dir.sh --spk-prefix "${l}_" --utt-prefix "${l}_" \ - data/train data/train_${l} - -./utils/copy_data_dir.sh --spk-prefix "${l}_" --utt-prefix "${l}_" \ - data/train_dev data/dev_${l} - -./utils/copy_data_dir.sh --spk-prefix "${l}_" --utt-prefix "${l}_" \ - data/dev10h.pem data/eval_${l} - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/local/setup_languages.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/local/setup_languages.sh deleted file mode 100644 index 8c6eb48a4b83cc7421f11a92def1aa7cf14f4ba6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/local/setup_languages.sh +++ /dev/null @@ -1,101 +0,0 @@ -#!/bin/bash - -# Copyright 2018 Johns Hopkins University (Matthew Wiesner) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -. ./path.sh -. ./cmd.sh -. ./conf/lang.conf - -#langs="101 102 103 104 105 106 202 203 204 205 206 207 301 302 303 304 305 306 401 402 403" -langs="101" -recog="101" -FLP=false -garbage_utterance_tags=" " - -. ./tools/parse_options.sh - -set -e -set -o pipefail - -all_langs="" -for l in `cat <(echo ${langs}) <(echo ${recog}) | tr " " "\n" | sort -u`; do - all_langs="${l} ${all_langs}" -done -all_langs=${all_langs%% } - -# Save top-level directory -cwd=$(local/make_absolute.sh `pwd`) -echo "Stage 0: Setup Language Specific Directories" -echo "cwd" -echo $cwd - -echo " --------------------------------------------" -echo "Languagues: ${all_langs}" - -# Basic directory prep -for l in ${all_langs}; do - [ -d data/${l} ] || mkdir -p data/${l} - cd data/${l} - - ln -sf ${cwd}/local . - for f in ${cwd}/{tools,conf}; do - link=`make_absolute.sh $f` - ln -sf $link . - done - - cp ${cwd}/cmd.sh . - cp ${cwd}/path.sh . 
- sed -i 's/\.\.\/\.\.\/\.\./\.\.\/\.\.\/\.\.\/\.\.\/\.\./g' path.sh - cd ${cwd} -done - -# Prepare language specific data -for l in ${all_langs}; do - ( - cd data/${l} - ./local/prepare_data.sh --FLP ${FLP} ${l} - cd ${cwd} - ) & -done -wait - -# Combine all language specific training directories and generate a single -# lang directory by combining all language specific dictionaries -train_dirs="" -dev_dirs="" -eval_dirs="" -for l in ${langs}; do - train_dirs="data/${l}/data/train_${l} ${train_dirs}" -done - -for l in ${recog}; do - dev_dirs="data/${l}/data/dev_${l} ${dev_dirs}" -done - -./tools/combine_data.sh data/train ${train_dirs} -./tools/combine_data.sh data/dev ${dev_dirs} - -for l in ${recog}; do - ln -s ${cwd}/data/${l}/data/eval_${l} ${cwd}/data/eval_${l} -done - - -# Delete utterances with garbage meta tags -for tag in $garbage_utterance_tags; do - sed -i "s/${tag}//g" data/train/text - sed -i "s/${tag}//g" data/dev/text - sed -i "s/${tag}//g" data/eval_${l}/text -done - -sed -i "/_.*[0-9][ ]*$/d" data/train/text -sed -i "/_.*[0-9][ ]*$/d" data/dev/text -sed -i "/_.*[0-9][ ]*$/d" data/eval_${l}/text -sed -i 's/[ ][ ]*/ /g' data/train/text -sed -i 's/[ ][ ]*/ /g' data/dev/text -sed -i 's/[ ][ ]*/ /g' data/eval_${l}/text - -./tools/fix_data_dir.sh data/train -./tools/fix_data_dir.sh data/dev -./tools/fix_data_dir.sh data/eval_${l} - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/path.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/path.sh deleted file mode 100644 index 73fc1c56602086182f66201870e28d46a0cada55..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export WENET_DIR=$PWD/../../.. -export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build -export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix -export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/run.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/run.sh deleted file mode 100644 index 7c338f13e9c8ca7c9613993e9ac5117ec3ba6dec..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/run.sh +++ /dev/null @@ -1,232 +0,0 @@ -#!/bin/bash -# Copyright 2021 Tencent Inc. (Author: Kai Tang). -# Apach 2.0 - -. ./path.sh || exit 1; - -# Use this to control how many gpu you use, It's 1-gpu training if you specify -# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0,1,2,3" -stage=0 # start from 0 if you need to start from data preparation -stop_stage=5 -# data -data=data -data_url=www.openslr.org/resources/33 -nj=4 - -#langid: 101 Cantonese , 302 Kazakh , 401 mongolian -langs="101" -recog="101" - -token_type=char -# bpemode (unigram or bpe) -nbpe=4500 -bpemode=unigram - -# data_type can be `raw` or `shard`. Typically, raw is used for small dataset, -# `shard` is used for large dataset which is over 1k hours, and `shard` is -# faster on reading data and training. 
-data_type=raw -num_utts_per_shard=1000 - -if [ "${token_type}" = bpe ]; then - dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt - bpemodel=data/lang_char/${train_set}_${bpemode}${nbpe} -elif [ "${token_type}" = char ]; then - dict=data/lang_char/lang_char.txt - bpe_model= -else - echo "Error: not supported token_type" - exit 0 -fi - -train_set=train_sp -train_dev=dev -recog_set=eval_$recog - -# pretrained w2v-conformer encoder -enc_init=pretrain/conformer.pt -#reinit last pretrained encoder layer: https://arxiv.org/pdf/2107.04734.pdf -enc_init_mods='encoder.encoders.0,encoder.encoders.1,encoder.encoders.2,encoder.encoders.3,encoder.encoders.4,encoder.encoders.5,encoder.encoders.6,encoder.encoders.7,encoder.encoders.8,encoder.encoders.9,encoder.encoders.10,encoder.encoders.11,encoder.encoders.12,encoder.encoders.13,encoder.encoders.14,encoder.encoders.15,encoder.encoders.16,encoder.encoders.17,encoder.encoders.18,encoder.encoders.19,encoder.encoders.20,encoder.encoders.21,encoder.encoders.22,encoder.embed' - -train_config=conf/train_conformer_large_10h.yaml -checkpoint= -cmvn=false -dir=exp/${langs}_finetune_10h - -# use average_checkpoint will get better result -average_checkpoint=true -decode_checkpoint=$dir/final.pt -average_num=35 - -. utils/parse_options.sh || exit 1; - -#Babel style data preparation -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - echo "stage 0: Setting up individual languages" - ./local/setup_languages.sh --langs "${langs}" --recog "${recog}" -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # Data preparation - for x in ${train_set} ${train_dev} ${recog_set}; do - # Remove the space in text - if [ "${token_type}" = char ]; then - cp data/${x}/text data/${x}/text.org - paste -d " " <(cut -f 1 -d" " data/${x}/text.org) <(cut -f 2- -d" " data/${x}/text.org | tr -d " ") \ - > data/${x}/text - rm data/${x}/text.org - fi - done -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # Make train dict - echo "Make a dictionary" - mkdir -p $(dirname $dict) - - echo " 0" > ${dict} # 0 will be used for "blank" in CTC - echo " 1" >> ${dict} # must be 1 - - if [ "${token_type}" = bpe ]; then - # we borrowed these code and scripts which are related bpe from ESPnet. - cut -f 2- -d" " data/${train_set}/text | sort > data/lang_char/input.txt - tools/spm_train --input=data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 - tools/spm_encode --model=${bpemodel}.model --output_format=piece < data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict} - elif [ "${token_type}" = char ]; then - tools/text2token.py -s 1 -n 1 data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \ - | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict} - fi - - num_token=$(cat $dict | wc -l) - echo " $num_token" >> $dict # -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - echo "stage 1: format scp " - #dumps such pipe-style-wav to real audio file - for x in ${train_set} ${train_dev} ${recog_set}; do - cp data/${x}/wav.scp data/${x}/wav.scp.org - bash local/dump_wav.sh --nj 26 data/$x/wav.scp.org data/$x/segments data/$x/wav.scp - rm data/$x/wav.scp.org - done -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - echo "Prepare data, prepare required format" - # For wav feature, just copy the data. 
mfcc/fbank extraction is done in training
-  for x in ${train_set} ${train_dev} ${recog_set}; do
-    if [ $data_type == "shard" ]; then
-      tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \
-        --num_threads 16 data/$x/wav.scp data/$x/text \
-        $(realpath data/$x/shards) data/$x/data.list
-    else
-      tools/make_raw_list.py data/$x/wav.scp data/$x/text \
-        data/$x/data.list
-    fi
-  done
-fi
-
-if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-  # Training
-  mkdir -p $dir
-  INIT_FILE=$dir/ddp_init
-  rm -f $INIT_FILE # delete old one before starting
-  init_method=file://$(readlink -f $INIT_FILE)
-  echo "$0: init method is $init_method"
-  num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
-  # Use "nccl" if it works, otherwise use "gloo"
-  dist_backend="nccl"
-  cmvn_opts=
-  $cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn"
-  # train.py will write $train_config to $dir/train.yaml with model input
-  # and output dimension, train.yaml will be used for inference or model
-  # export later
-  for ((i = 0; i < $num_gpus; ++i)); do
-  {
-    gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
-    python wenet/bin/train.py --gpu $gpu_id \
-      --config $train_config \
-      --data_type $data_type \
-      --symbol_table $dict \
-      ${bpemodel:+--bpe_model ${bpemodel}.model} \
-      --train_data data/$train_set/data.list \
-      --cv_data data/$train_dev/data.list \
-      ${checkpoint:+--checkpoint $checkpoint} \
-      ${enc_init:+--enc_init $enc_init} \
-      --enc_init_mods $enc_init_mods \
-      --model_dir $dir \
-      --ddp.init_method $init_method \
-      --ddp.world_size $num_gpus \
-      --ddp.rank $i \
-      --ddp.dist_backend $dist_backend \
-      --num_workers 6 \
-      $cmvn_opts
-  } &
-  done
-  wait
-fi
-
-
-if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
-  # Test model, please specify the model you want to test by --checkpoint
-  cmvn_opts=
-  $cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn"
-  # TODO, Add model average here
-  mkdir -p $dir/test
-  if [ ${average_checkpoint} == true ]; then
-    decode_checkpoint=$dir/avg_${average_num}.pt
-    echo "do model average and final checkpoint is $decode_checkpoint"
-    python wenet/bin/average_model.py \
-      --dst_model $decode_checkpoint \
-      --src_path $dir \
-      --num ${average_num} \
-      --val_best
-  fi
-  # Specify decoding_chunk_size if it's a unified dynamic chunk trained model
-  # -1 for full chunk
-  decoding_chunk_size=
-  ctc_weight=0.5
-  for mode in ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring; do
-  for rtask in ${recog_set}; do
-  {
-    test_dir=$dir/test_${rtask}_${mode}
-    mkdir -p $test_dir
-    python wenet/bin/recognize.py --gpu 0 \
-      --mode $mode \
-      --config $dir/train.yaml \
-      --data_type $data_type \
-      --test_data data/$rtask/data.list \
-      --checkpoint $decode_checkpoint \
-      --beam_size 5 \
-      --batch_size 1 \
-      --penalty 0.0 \
-      --dict $dict \
-      ${bpemodel:+--bpe_model ${bpemodel}.model} \
-      --ctc_weight $ctc_weight \
-      --result_file $test_dir/text_ori \
-      $cmvn_opts \
-      ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
-    if [ "${token_type}" = bpe ]; then
-      tools/spm_decode --model=${bpemodel}.model --input_format=piece < $test_dir/text_ori | sed -e "s/▁/ /g" > $test_dir/text
-      python tools/compute-wer.py --char=0 --v=1 \
-        data/$rtask/text $test_dir/text > $test_dir/wer
-    elif [ "${token_type}" = char ]; then
-      python tools/compute-wer.py --char=1 --v=1 \
-        data/$rtask/text $test_dir/text_ori > $test_dir/wer
-    fi
-  } &
-  done
-  done
-  wait
-
-fi
-
-if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
-  # Export the best model you want
-  python wenet/bin/export_jit.py \
-    --config $dir/train.yaml \
-    --checkpoint $dir/avg_${average_num}.pt \
-    --output_file $dir/final.zip
-fi
-
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/alignment.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/alignment.sh
deleted file mode 100644
index 64d860bb61761cadca750c9baf91eddb49e56728..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/alignment.sh
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 Mobvoi Inc. All Rights Reserved.
-. ./path.sh || exit 1;
-
-stage=0 # start from 0 if you need to start from data preparation
-stop_stage=0
-
-nj=16
-feat_dir=raw_wav
-dict=data/dict/lang_char.txt
-
-dir=exp/
-config=$dir/train.yaml
-checkpoint=
-checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt
-config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml
-set=
-ali_format=$feat_dir/$set/format.data
-ali_format=format.data
-ali_result=$dir/ali
-
-. tools/parse_options.sh || exit 1;
-
-if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
-  nj=32
-  # Prepare required data for ctc alignment
-  echo "Prepare data, prepare required format"
-  for x in $set; do
-    tools/format_data.sh --nj ${nj} \
-      --feat-type wav --feat $feat_dir/$x/wav.scp \
-      $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp
-
-  done
-fi
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-  # Test model, please specify the model you want to use by --checkpoint
-  python wenet/bin/alignment_deprecated.py --gpu -1 \
-    --config $config \
-    --input_file $ali_format \
-    --checkpoint $checkpoint \
-    --batch_size 1 \
-    --dict $dict \
-    --result_file $ali_result \
-
-fi
-
-
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/analyze_dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/analyze_dataset.py
deleted file mode 100644
index d4373b065c301972fe0164b6df3591166000acfc..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/analyze_dataset.py
+++ /dev/null
@@ -1,248 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Analyze Dataset, Duration/TextLength/Speed etc.
-
-Usage:
-. 
./path.sh && python3 tools/analyze_dataset.py \ - --data_type "shard" \ - --data_list data/test/data.list \ - --output_dir exp/analyze_test \ - --num_thread 32 -""" - -import os -import json -import math -import time -import numpy -import logging -import librosa -import tarfile -import argparse -import torchaudio -import multiprocessing - -from wenet.utils.file_utils import read_lists -from wenet.dataset.processor import AUDIO_FORMAT_SETS - - -def get_args(): - parser = argparse.ArgumentParser(description='Analyze dataset') - parser.add_argument('--data_type', - default='wav_scp', - choices=['wav_scp', 'raw', 'shard'], - help='dataset type') - parser.add_argument('--output_dir', type=str, - default="exp", help='write info to output dir') - parser.add_argument('--data_list', default=None, - help='used in raw/shard mode') - parser.add_argument('--wav_scp', default=None, - help='used in wav_scp mode') - parser.add_argument('--text', default=None, - help='used in wav_scp mode') - parser.add_argument('--num_thread', type=int, - default=4, help='number of threads') - args = parser.parse_args() - print(args) - return args - - -def analyze(datas, output_file, thread_id): - with open(output_file, "w", encoding='utf8') as f: - for i, data in enumerate(datas): - if type(data['wav']) is numpy.ndarray: - y, sample_rate = data['wav'], data['sample_rate'] - data['wav'] = "None" # NOTE(xcsong): Do not save wav. - elif type(data['wav'] is str): - y, sample_rate = librosa.load(data['wav'], sr=16000) - data['dur'] = len(y) / sample_rate - data['txt_length'] = len(data['txt']) - data['speed'] = data['txt_length'] / data['dur'] - # Trim the beginning and ending silence - _, index = librosa.effects.trim(y, top_db=30) - data['leading_sil'] = librosa.get_duration( - y=y[:index[0]], sr=16000) * 1000 if index[0] > 0 else 0 - data['trailing_sil'] = librosa.get_duration( - y=y[index[1]:], sr=16000) * 1000 if index[1] < len(y) else 0 - data_str = json.dumps(data, ensure_ascii=False) - f.write("{}\n".format(data_str)) - if thread_id == 0 and i % 100 == 0: - logging.info("\tThread-{}: processed {}/{}".format( - thread_id, i, len(datas))) - - -def read_tar(file): - try: - with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream: - prev_prefix = None - data = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - data['key'] = prev_prefix - if valid: - yield data - data = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - data['txt'] = file_obj.read().decode( - 'utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load( - file_obj) - # single channel - data['wav'] = waveform.numpy()[0, :] - data['sample_rate'] = sample_rate - else: - data[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning( - 'error: {} when parse {}'.format(ex, name)) - prev_prefix = prefix - # The last data in tar - if prev_prefix is not None: - data['key'] = prev_prefix - yield data - except Exception as ex: - logging.warning( - 'tar_file error: {} when processing {}'.format(ex, file)) - - -def main(): - start_time = time.time() - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.makedirs(args.output_dir, exist_ok=True) - os.makedirs(args.output_dir + "/partition", exist_ok=True) - datas = [[] for i in 
range(args.num_thread)] - - logging.info("Stage-1: Loading data.list OR wav.scp...") - if args.data_type == "shard": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - total = 0 - for line in lists: - for data in read_tar(line): - datas[total % args.num_thread].append(data) - total = total + 1 - elif args.data_type == "raw": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - for i, line in enumerate(lists): - data = json.loads(line) - datas[i % args.num_thread].append(data) - elif args.data_type == "wav_scp": - assert args.wav_scp is not None - assert args.text is not None - wavs, texts = {}, {} - # wavs - for line in read_lists(args.wav_scp): - line = line.strip().split() - wavs[line[0]] = line[1] - # texts - for line in read_lists(args.text): - line = line.strip().split(maxsplit=1) - texts[line[0]] = line[1] - sorted(wavs) - sorted(texts) - # partition - for i, (key1, key2) in enumerate(zip(wavs, texts)): - assert key1 == key2 - datas[i % args.num_thread].append( - {'key': key1, "wav": wavs[key1], "txt": texts[key1]} - ) - - logging.info("Stage-2: Start Analyze") - # threads - pool = multiprocessing.Pool(processes=args.num_thread) - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - pool.apply_async(analyze, (datas[i], output_file, i)) - pool.close() - pool.join() - - logging.info("Stage-3: Sort and Write Result") - datas = [] - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - with open(output_file, "r", encoding='utf8') as f: - for line in f.readlines(): - data = json.loads(line) - datas.append(data) - total_dur = sum([x['dur'] for x in datas]) - total_len = sum([x['txt_length'] for x in datas]) - total_leading_sil = sum([x['leading_sil'] for x in datas]) - total_trailing_sil = sum([x['trailing_sil'] for x in datas]) - num_datas = len(datas) - names = ['key', 'dur', 'txt_length', 'speed', - 'leading_sil', 'trailing_sil'] - units = ['', 's', '', 'char/s', 'ms', 'ms'] - avgs = [0, total_dur / num_datas, total_len / num_datas, - total_len / total_dur, total_leading_sil / num_datas, - total_trailing_sil / num_datas] - stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]), - sum([(x['txt_length'] - avgs[2])**2 for x in datas]), - sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]), - sum([(x['leading_sil'] - avgs[4])**2 for x in datas]), - sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])] - stds = [math.sqrt(x / num_datas) for x in stds] - parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min'] - index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - - with open(args.output_dir + "/analyze_result_brief", - "w", encoding='utf8') as f: - for i, (name, unit, avg, std) in enumerate( - zip(names, units, avgs, stds)): - if name == 'key': - continue - f.write("==================\n") - - datas.sort(key=lambda x: x[name]) - for p, j in zip(parts, index): - f.write("{} {}: {:.3f} {} (wav_id: {})\n".format( - p, name, datas[j][name], unit, datas[j]['key'])) - f.write("avg {}: {:.3f} {}\n".format( - name, avg, unit)) - f.write("std {}: {:.3f}\n".format( - name, std)) - os.system("cat {}".format(args.output_dir + "/analyze_result_brief")) - - datas.sort(key=lambda x: x['dur']) - with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f: - for data in datas: - f.write("{}\n".format(json.dumps(data, 
ensure_ascii=False))) - - end_time = time.time() - logging.info("Time Cost: {:.3f}s".format(end_time - start_time)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/cmvn_kaldi2json.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/cmvn_kaldi2json.py deleted file mode 100644 index 9966046c95a9d50438c4857b785cb7985182e376..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/cmvn_kaldi2json.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import logging -import sys -import json - -def kaldi2json(kaldi_cmvn_file): - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - cmvn_info = {'mean_stat:' : means, - 'var_stat' : variance, - 'frame_num' : count} - return cmvn_info - -if __name__ == '__main__': - with open(sys.argv[2], 'w') as fout: - cmvn = kaldi2json(sys.argv[1]) - fout.write(json.dumps(cmvn)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/combine_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/combine_data.sh deleted file mode 100644 index 8a56c43f1a2a238d78270f94f3d22f1af540e912..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/combine_data.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2014 David Snyder - -# This script combines the data from multiple source directories into -# a single destination directory. - -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information -# about what these directories contain. - -# Begin configuration section. -extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." -skip_fix=false # skip the fix_data_dir.sh in the end -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi - -if [ $# -lt 2 ]; then - echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." - echo "Note, files that don't appear in all source dirs will not be combined," - echo "with the exception of utt2uniq and segments, which are created where necessary." - exit 1 -fi - -dest=$1; -shift; - -first_src=$1; - -rm -r $dest 2>/dev/null -mkdir -p $dest; - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/utt2spk ]; then - echo "$0: no such file $dir/utt2spk" - exit 1; - fi -done - -# Check that frame_shift are compatible, where present together with features. -dir_with_frame_shift= -for dir in $*; do - if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then - if [[ $dir_with_frame_shift ]] && - ! 
cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then - echo "$0:error: different frame_shift in directories $dir and " \ - "$dir_with_frame_shift. Cannot combine features." - exit 1; - fi - dir_with_frame_shift=$dir - fi -done - -# W.r.t. utt2uniq file the script has different behavior compared to other files -# it is not compulsary for it to exist in src directories, but if it exists in -# even one it should exist in all. We will create the files where necessary -has_utt2uniq=false -for in_dir in $*; do - if [ -f $in_dir/utt2uniq ]; then - has_utt2uniq=true - break - fi -done - -if $has_utt2uniq; then - # we are going to create an utt2uniq file in the destdir - for in_dir in $*; do - if [ ! -f $in_dir/utt2uniq ]; then - # we assume that utt2uniq is a one to one mapping - cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' - else - cat $in_dir/utt2uniq - fi - done | sort -k1 > $dest/utt2uniq - echo "$0: combined utt2uniq" -else - echo "$0 [info]: not combining utt2uniq as it does not exist" -fi -# some of the old scripts might provide utt2uniq as an extrafile, so just remove it -extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") - -# segments are treated similarly to utt2uniq. If it exists in some, but not all -# src directories, then we generate segments where necessary. -has_segments=false -for in_dir in $*; do - if [ -f $in_dir/segments ]; then - has_segments=true - break - fi -done - -if $has_segments; then - for in_dir in $*; do - if [ ! -f $in_dir/segments ]; then - echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 - utils/data/get_segments_for_data.sh $in_dir - else - cat $in_dir/segments - fi - done | sort -k1 > $dest/segments - echo "$0: combined segments" -else - echo "$0 [info]: not combining segments as it does not exist" -fi - -for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do - exists_somewhere=false - absent_somewhere=false - for d in $*; do - if [ -f $d/$file ]; then - exists_somewhere=true - else - absent_somewhere=true - fi - done - - if ! $absent_somewhere; then - set -o pipefail - ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; - set +o pipefail - echo "$0: combined $file" - else - if ! $exists_somewhere; then - echo "$0 [info]: not combining $file as it does not exist" - else - echo "$0 [info]: **not combining $file as it does not exist everywhere**" - fi - fi -done - -tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt - -if [[ $dir_with_frame_shift ]]; then - cp $dir_with_frame_shift/frame_shift $dest -fi - -if ! 
$skip_fix ; then - tools/fix_data_dir.sh $dest || exit 1; -fi - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/compute-cer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/compute-cer.py deleted file mode 100644 index a0a8f8fe1f59251c5d8fefeb62ef469276fc6063..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/compute-cer.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import sys -import unicodedata -import codecs - -remove_tag = True -spacelist = [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - # https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. - sep = ' ' - if char == '<': - sep = '>' - j = i + 1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c == sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: - return '' - chars = [] - i = 0 - T = len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - if x.isalnum(): - for k in x: - new_sentence.append(k) - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i - 1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - 
min_dist = dist - min_error = error - dist = self.space[i][j - 1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, - 'ins': 0, 'del': 0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i={i} , j={j} , \ - error={error}'. 
- format(i=i, j=j, error=self.space[i][j]['error'])) - return result - - def overall(self) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def cluster(self, data) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [unicodedata.name(char) for char in word] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names) - 1) : - if unicode_names[i] != unicode_names[i + 1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) \ - and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] \ - [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \ - [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose = 1 - padding_symbol = ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose = 0 - try: - verbose = int(b) - except Exception: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol = ' ' - elif b == 'underline': - padding_symbol = '_' - continue - if True or sys.argv[1].startswith('-'): - # ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig = set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, - case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : 
- if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array) == 0: - continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length - len_lab) - space['rec'].append(length - len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end=' ') - else: - print('lab:', end=' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['lab'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end=' ') - else: - print('rec:', end=' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['rec'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===================================================' - '========================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster(k for k in default_clusters[cluster_id]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 
100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif (token[0] == '<' and token[len(token) - 1] == '>' and - cluster_id == ''): - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('=======================================' - '====================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/compute-wer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/compute-wer.py deleted file mode 100644 index a3eefc0dc7b67f252e685da71a5189312e74ef85..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/compute-wer.py +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import re, sys, unicodedata -import codecs - -remove_tag = True -spacelist= [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - #https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - 
j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array)==0: continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) 
- else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===========================================================================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster([ k for k in default_clusters[cluster_id] ]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - 
print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif token[0] == '<' and token[len(token)-1] == '>' and \ - cluster_id == '' : - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('===========================================================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/compute_cmvn_stats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/compute_cmvn_stats.py deleted file mode 100644 index 9c89789c47be0c855939469e86040f10398e9d89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/compute_cmvn_stats.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys -import argparse -import json -import codecs -import yaml - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.utils.data import Dataset, DataLoader - -torchaudio.set_audio_backend("sox_io") - - -class CollateFunc(object): - ''' Collate function for AudioDataset - ''' - - def __init__(self, feat_dim, resample_rate): - self.feat_dim = feat_dim - self.resample_rate = resample_rate - pass - - def __call__(self, batch): - mean_stat = torch.zeros(self.feat_dim) - var_stat = torch.zeros(self.feat_dim) - number = 0 - for item in batch: - value = item[1].strip().split(",") - assert len(value) == 3 or len(value) == 1 - wav_path = value[0] - sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate - resample_rate = sample_rate - # len(value) == 3 means segmented wav.scp, - # len(value) == 1 means original wav.scp - if len(value) == 3: - start_frame = int(float(value[1]) * sample_rate) - end_frame = int(float(value[2]) * sample_rate) - waveform, sample_rate = torchaudio.backend.sox_io_backend.load( - filepath=wav_path, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(item[1]) - - waveform = waveform * (1 << 15) - if self.resample_rate != 0 and self.resample_rate != sample_rate: - resample_rate = self.resample_rate - waveform = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - - mat = kaldi.fbank(waveform, - num_mel_bins=self.feat_dim, - dither=0.0, - energy_floor=0.0, - sample_frequency=resample_rate) - mean_stat += torch.sum(mat, axis=0) - var_stat += torch.sum(torch.square(mat), axis=0) - number += mat.shape[0] - return number, mean_stat, var_stat - - -class AudioDataset(Dataset): - def __init__(self, data_file): - self.items = [] - with codecs.open(data_file, 'r', encoding='utf-8') as f: - for line in f: - arr = line.strip().split() - self.items.append((arr[0], arr[1])) - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - return self.items[idx] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='extract CMVN stats') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for processing') - parser.add_argument('--train_config', - default='', - help='training yaml conf') - parser.add_argument('--in_scp', default=None, help='wav scp file') - 
parser.add_argument('--out_cmvn', - default='global_cmvn', - help='global cmvn file') - - doc = "Print log after every log_interval audios are processed." - parser.add_argument("--log_interval", type=int, default=1000, help=doc) - args = parser.parse_args() - - with open(args.train_config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - resample_rate = 0 - if 'resample_conf' in configs['dataset_conf']: - resample_rate = configs['dataset_conf']['resample_conf']['resample_rate'] - print('using resample and new sample rate is {}'.format(resample_rate)) - - collate_func = CollateFunc(feat_dim, resample_rate) - dataset = AudioDataset(args.in_scp) - batch_size = 20 - data_loader = DataLoader(dataset, - batch_size=batch_size, - shuffle=True, - sampler=None, - num_workers=args.num_workers, - collate_fn=collate_func) - - with torch.no_grad(): - all_number = 0 - all_mean_stat = torch.zeros(feat_dim) - all_var_stat = torch.zeros(feat_dim) - wav_number = 0 - for i, batch in enumerate(data_loader): - number, mean_stat, var_stat = batch - all_mean_stat += mean_stat - all_var_stat += var_stat - all_number += number - wav_number += batch_size - - if wav_number % args.log_interval == 0: - print(f'processed {wav_number} wavs, {all_number} frames', - file=sys.stderr, - flush=True) - - cmvn_info = { - 'mean_stat': list(all_mean_stat.tolist()), - 'var_stat': list(all_var_stat.tolist()), - 'frame_num': all_number - } - - with open(args.out_cmvn, 'w') as fout: - fout.write(json.dumps(cmvn_info)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/compute_fbank_feats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/compute_fbank_feats.py deleted file mode 100644 index 4cc7dae54de6e8b24b14148bd3930d19b4d7b28c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/compute_fbank_feats.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging - -import torchaudio -import torchaudio.compliance.kaldi as kaldi - -import wenet.dataset.kaldi_io as kaldi_io - -# The "sox" backends are deprecated and will be removed in 0.9.0 release. 
-# So here we use sox_io backend -torchaudio.set_audio_backend("sox_io") - - -def parse_opts(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--num_mel_bins', - default=80, - type=int, - help='Number of triangular mel-frequency bins') - parser.add_argument('--frame_length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument('--frame_shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument('--dither', - type=int, - default=0.0, - help='Dithering constant (0.0 means no dither)') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_scp', help='wav scp file') - parser.add_argument('out_ark', help='output ark file') - parser.add_argument('out_scp', help='output scp file') - args = parser.parse_args() - return args - - -# wav format: -def load_wav_scp(wav_scp_file): - wav_list = [] - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_list.append((arr[0], arr[1])) - return wav_list - - -# wav format: -def load_wav_scp_dict(wav_scp_file): - wav_dict = {} - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_dict[arr[0]] = arr[1] - return wav_dict - - -# Segments format: -def load_wav_segments(wav_scp_file, segments_file): - wav_dict = load_wav_scp_dict(wav_scp_file) - audio_list = [] - with open(segments_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - key = arr[0] - wav_file = wav_dict[arr[1]] - start = float(arr[2]) - end = float(arr[3]) - audio_list.append((key, wav_file, start, end)) - return audio_list - - -if __name__ == '__main__': - args = parse_opts() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - if args.segments is None: - audio_list = load_wav_scp(args.wav_scp) - else: - audio_list = load_wav_segments(args.wav_scp, args.segments) - - count = 0 - with open(args.out_ark, 'wb') as ark_fout, \ - open(args.out_scp, 'w', encoding='utf8') as scp_fout: - for item in audio_list: - if len(item) == 2: - key, wav_path = item - waveform, sample_rate = torchaudio.load_wav(wav_path) - else: - assert len(item) == 4 - key, wav_path, start, end = item - sample_rate = torchaudio.info(wav_path).sample_rate - frame_offset = int(start * sample_rate) - num_frames = int((end - start) * sample_rate) - waveform, sample_rate = torchaudio.load_wav( - wav_path, frame_offset, num_frames) - - mat = kaldi.fbank(waveform, - num_mel_bins=args.num_mel_bins, - frame_length=args.frame_length, - frame_shift=args.frame_shift, - dither=args.dither, - energy_floor=0.0, - sample_frequency=sample_rate) - mat = mat.detach().numpy() - kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout) - count += 1 - if count % 10000 == 0: - logging.info('Progress {}/{}'.format(count, len(audio_list))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/copy_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/copy_data_dir.sh deleted file mode 100644 index ee880c4c3ca398a58a4e306467c639b0a76310bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/copy_data_dir.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins University 
(author: Daniel Povey) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# feats.scp -# wav.scp -# vad.scp -# spk2utt -# utt2spk -# text -# -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance and/or speaker names. Note, the recording-ids stay the same. -# - - -# begin configuration section -spk_prefix= -utt_prefix= -spk_suffix= -utt_suffix= -validate_opts= # should rarely be needed. -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" - echo "Options" - echo " --spk-prefix= # Prefix for speaker ids, default empty" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --spk-suffix= # Suffix for speaker ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -srcdir=$1 -destdir=$2 - -if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" - exit 1; -fi - -if [ "$destdir" == "$srcdir" ]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -set -e; - -mkdir -p $destdir - -cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map -cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map - -if [ ! -f $srcdir/utt2uniq ]; then - if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then - cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq - fi -else - cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq -fi - -cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ - utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk - -utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt - -if [ -f $srcdir/feats.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp -fi - -if [ -f $srcdir/vad.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp -fi - -if [ -f $srcdir/segments ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments - cp $srcdir/wav.scp $destdir -else # no segments->wav indexed by utt. 
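The utterance/speaker renaming above is a plain first-field substitution driven by the generated `utt_map`/`spk_map`. A rough Python equivalent of `apply_map.pl -f 1` (file paths are placeholders; it assumes every key appears in the map, as the original expects):

```python
def apply_map_first_field(map_path, in_path, out_path):
    """Mimic `apply_map.pl -f 1`: rewrite the first field using old->new pairs from map_path."""
    with open(map_path, encoding="utf8") as f:
        mapping = dict(line.strip().split(maxsplit=1) for line in f if line.strip())
    with open(in_path, encoding="utf8") as fin, open(out_path, "w", encoding="utf8") as fout:
        for line in fin:
            parts = line.strip().split(maxsplit=1)
            if not parts:
                continue
            rest = f" {parts[1]}" if len(parts) > 1 else ""
            fout.write(f"{mapping[parts[0]]}{rest}\n")
```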
- if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/utt2num_frames ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in frame_shift stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -echo $validate_opts -echo $destdir -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/decode.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/decode.sh deleted file mode 100644 index 1d49b0e48631f4818fb9c464df66904170275a33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/decode.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: binbinzhang@mobvoi.com (Binbin Zhang) -export GLOG_logtostderr=1 -export GLOG_v=2 - -set -e - -nj=1 -chunk_size=-1 -ctc_weight=0.0 -reverse_weight=0.0 -rescoring_weight=1.0 -# For CTC WFST based decoding -fst_path= -dict_path= -acoustic_scale=1.0 -beam=15.0 -lattice_beam=12.0 -min_active=200 -max_active=7000 -blank_skip_thresh=1.0 -length_penalty=0.0 - -. tools/parse_options.sh || exit 1; -if [ $# != 5 ]; then - echo "Usage: $0 [options] " - exit 1; -fi - -if ! which decoder_main > /dev/null; then - echo "decoder_main is not built, please go to runtime/libtorch to build it." - exit 1; -fi - -scp=$1 -label_file=$2 -model_file=$3 -unit_file=$4 -dir=$5 - -mkdir -p $dir/split${nj} - -# Step 1. Split wav.scp -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp" -done -tools/data/split_scp.pl ${scp} ${split_scps} - -# Step 2. Parallel decoding -wfst_decode_opts= -if [ ! 
-z $fst_path ]; then - wfst_decode_opts="--fst_path $fst_path" - wfst_decode_opts="$wfst_decode_opts --beam $beam" - wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path" - wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam" - wfst_decode_opts="$wfst_decode_opts --max_active $max_active" - wfst_decode_opts="$wfst_decode_opts --min_active $min_active" - wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale" - wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh" - wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty" - echo $wfst_decode_opts > $dir/config -fi -for n in $(seq ${nj}); do -{ - decoder_main \ - --rescoring_weight $rescoring_weight \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --chunk_size $chunk_size \ - --wav_scp ${dir}/split${nj}/wav.${n}.scp \ - --model_path $model_file \ - --unit_path $unit_file \ - $wfst_decode_opts \ - --result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log -} & -done -wait - -# Step 3. Merge files -for n in $(seq ${nj}); do - cat ${dir}/split${nj}/${n}.text -done > ${dir}/text -tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf - -# Step 4. Compute WER -python3 tools/compute-wer.py --char=1 --v=1 \ - $label_file $dir/text > $dir/wer diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/feat_to_shape.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/feat_to_shape.sh deleted file mode 100644 index ab6d45c60709dd05a38f8da269d617233d0d39f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/feat_to_shape.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Begin configuration section. -nj=4 -cmd=run.pl -verbose=0 -filetype="" -preprocess_conf="" -# End configuration section. - -help_message=$(cat << EOF -Usage: $0 [options] [] -e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) - -echo "$0 $*" 1>&2 # Print the command line for logging - -. parse_options.sh || exit 1; - -if [ $# -lt 2 ] || [ $# -gt 3 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -scp=$1 -outscp=$2 -data=$(dirname ${scp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${logdir}/feats.${n}.scp" -done - -utils/split_scp.pl ${scp} ${split_scps} - -if [ -n "${preprocess_conf}" ]; then - preprocess_opt="--preprocess-conf ${preprocess_conf}" -else - preprocess_opt="" -fi -if [ -n "${filetype}" ]; then - filetype_opt="--filetype ${filetype}" -else - filetype_opt="" -fi - -${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ - feat-to-len --verbose=${verbose} \ - scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp - -feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -) - -# concatenate the .scp files together. 
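The shards produced by `feat-to-len` hold per-utterance frame counts, and the concatenation step simply appends the feature dimension so each shape.scp line reads like `utt0001 1032,80`. A Python sketch of that merge (shard paths and dimension are placeholders):

```python
def merge_shape_shards(shard_paths, feat_dim, out_path="shape.scp"):
    """Concatenate per-job 'utt num_frames' shards, appending ',feat_dim' to every line."""
    with open(out_path, "w", encoding="utf8") as fout:
        for path in shard_paths:
            with open(path, encoding="utf8") as fin:
                for line in fin:
                    utt, num_frames = line.split()
                    fout.write(f"{utt} {num_frames},{feat_dim}\n")

# merge_shape_shards(["log/shape.1.scp", "log/shape.2.scp"], feat_dim=80)
```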
-for n in $(seq ${nj}); do - sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp -done > ${outscp} - -rm -f ${logdir}/feats.*.scp 2>/dev/null diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/filter_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/filter_scp.pl deleted file mode 100644 index b76d37f41be0886470281978bfacf97f6b8ae976..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/filter_scp.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose "n-th" field is an utterance id), printing -# out only those lines whose "n-th" field is in id_list. The index of -# the "n-th" field is 1, by default, but can be changed by using -# the -f switch - -$exclude = 0; -$field = 1; -$shifted = 0; - -do { - $shifted=0; - if ($ARGV[0] eq "--exclude") { - $exclude = 1; - shift @ARGV; - $shifted=1; - } - if ($ARGV[0] eq "-f") { - $field = $ARGV[1]; - shift @ARGV; shift @ARGV; - $shifted=1 - } -} while ($shifted); - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . - "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . - "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . - "only the lines that were *not* in id_list.\n" . - "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . - "If your older scripts (written before Oct 2014) stopped working and you used the\n" . - "-f option, add 1 to the argument.\n" . - "See also: utils/filter_scp.pl .\n"; -} - - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -if ($field == 1) { # Treat this as special case, since it is common. - while(<>) { - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { - print $_; - } - } -} else { - while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { - print $_; - } - } -} - -# tests: -# the following should print "foo 1" -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) -# the following should print "bar 2". 
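The Perl filter above keeps (or, with `--exclude`, drops) lines whose n-th field appears in an id list. An equivalent Python sketch, using the same 1-based field index (paths and the helper name are illustrative):

```python
def filter_scp(id_list_path, scp_path, field=1, exclude=False):
    """Yield scp lines whose `field`-th (1-based) token is / is not in the id list."""
    with open(id_list_path, encoding="utf8") as f:
        seen = {line.split()[0] for line in f if line.strip()}
    with open(scp_path, encoding="utf8") as f:
        for line in f:
            tokens = line.split()
            if len(tokens) < field:
                continue
            hit = tokens[field - 1] in seen
            if hit != exclude:   # keep matches, or keep non-matches when excluding
                yield line

# Keep only utterances listed in keep.list:
# with open("wav_filtered.scp", "w") as out:
#     out.writelines(filter_scp("keep.list", "wav.scp"))
```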
-# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fix_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fix_data_dir.sh deleted file mode 100644 index d1644c1cac4264c78eae7d91b03c4126baf7ec4c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fix_data_dir.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# This script makes sure that only the segments present in -# all of "feats.scp", "wav.scp" [if present], segments [if present] -# text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into -# data-dir/.backup - -cmd="$@" - -utt_extra_files= -spk_extra_files= - -. tools/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: utils/data/fix_data_dir.sh " - echo "e.g.: utils/data/fix_data_dir.sh data/train" - echo "This script helps ensure that the various files in a data directory" - echo "are correctly sorted and filtered, for example removing utterances" - echo "that have no features (if feats.scp is present)" - exit 1 -fi - -data=$1 - -if [ -f $data/images.scp ]; then - image/fix_data_dir.sh $cmd - exit $? -fi - -mkdir -p $data/.backup - -[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; - -[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; - -set -e -o pipefail -u - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted { - file=$1 - sort -k1,1 -u <$file >$file.tmp - if ! cmp -s $file $file.tmp; then - echo "$0: file $1 is not in sorted order or not unique, sorting it" - mv $file.tmp $file - else - rm $file.tmp - fi -} - -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - check_sorted $data/$x - fi -done - - -function filter_file { - filter=$1 - file_to_filter=$2 - cp $file_to_filter ${file_to_filter}.tmp - tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter - if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=$(cat ${file_to_filter}.tmp | wc -l) - length2=$(cat ${file_to_filter} | wc -l) - if [ $length1 -ne $length2 ]; then - echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." - fi - fi - rm $file_to_filter.tmp -} - -function filter_recordings { - # We call this once before the stage when we filter on utterance-id, and once - # after. - - if [ -f $data/segments ]; then - # We have a segments file -> we need to filter this and the file wav.scp, and - # reco2file_and_utt, if it exists, to make sure they have the same list of - # recording-ids. - - if [ ! -f $data/wav.scp ]; then - echo "$0: $data/segments exists but not $data/wav.scp" - exit 1; - fi - awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=$(cat $tmpdir/recordings | wc -l) - [ ! -s $tmpdir/recordings ] && \ - echo "Empty list of recordings (bad file $data/segments)?" 
&& exit 1; - tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp - mv $tmpdir/recordings.tmp $tmpdir/recordings - - - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - filter_file $tmpdir/recordings $data/segments - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - rm $data/segments.tmp - - filter_file $tmpdir/recordings $data/wav.scp - [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur - true - fi -} - -function filter_speakers { - # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... - tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do - f=$data/$s - if [ -f $f ]; then - filter_file $f $tmpdir/speakers - fi - done - - filter_file $tmpdir/speakers $data/spk2utt - tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - - for s in cmvn.scp spk2gender $spk_extra_files; do - f=$data/$s - if [ -f $f ]; then - filter_file $tmpdir/speakers $f - fi - done -} - -function filter_utts { - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - echo "$(cat $tmpdir/utts | wc -l)" - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; - - ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; - - ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ - echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - - if [ -f $data/utt2uniq ]; then - ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ - echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; - fi - - maybe_wav= - maybe_reco2dur= - [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. - [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - - maybe_utt2dur= - if [ -f $data/utt2dur ]; then - cat $data/utt2dur | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1 - maybe_utt2dur=utt2dur.ok - fi - - maybe_utt2num_frames= - if [ -f $data/utt2num_frames ]; then - cat $data/utt2num_frames | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1 - maybe_utt2num_frames=utt2num_frames.ok - fi - - for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do - if [ -f $data/$x ]; then - tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp - echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)" - mv $tmpdir/utts.tmp $tmpdir/utts - # echo "$tmpdir/utts" - fi - done - rm $data/utt2dur.ok 2>/dev/null || true - rm $data/utt2num_frames.ok 2>/dev/null || true - - [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ - rm $tmpdir/utts && exit 1; - - - if [ -f $data/utt2spk ]; then - new_nutts=$(cat $tmpdir/utts | wc -l) - old_nutts=$(cat $data/utt2spk | wc -l) - if [ $new_nutts -ne $old_nutts ]; then - echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" - else - echo "fix_data_dir.sh: kept all $old_nutts utterances." 
- fi - fi - - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then - tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x - fi - fi - done - -} - -filter_recordings -filter_speakers -filter_utts -filter_speakers -filter_recordings - -tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - -echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/flake8_hook.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/flake8_hook.py deleted file mode 100644 index bbe21bf4aa8ab460aca0eba5a24785e4d6b2c39d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -import sys - -from flake8.main import git - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/format_data.sh deleted file mode 100644 index 51f4602dfa0bac7873541c7f621ef4bb9eb29c94..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/format_data.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Mobvoi Corporation (Author: Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -echo "$0 $*" >&2 # Print the command line for logging -. ./path.sh - -nj=1 -cmd=run.pl -nlsyms="" -lang="" -feat="" -feat_type="kaldi" -oov="" -bpecode="" -allow_one_column=false -raw="" -verbose=0 -trans_type=char -filetype="" -preprocess_conf="" -category="" -out="" # If omitted, write in stdout -help_message=$(cat << EOF -Usage: $0 -e.g. $0 data/train data/lang_1char/train_units.txt -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --feat # feat.scp or feat1.scp,feat2.scp,... - --feat-type # kaldi or wav - --oov # Default: - --out # If omitted, write in stdout - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) -. tools/parse_options.sh - -if [ $# != 2 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -dir=$1 -dic=$2 -tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) -#trap 'rm -rf ${tmpdir}' EXIT - -# 1. 
Create scp files for inputs -# These are not necessary for decoding mode, and make it as an option -input= -if [ -n "${feat}" ]; then - _feat_scps=$(echo "${feat}" | tr ',' ' ' ) - read -r -a feat_scps <<< $_feat_scps - num_feats=${#feat_scps[@]} - - for (( i=1; i<=num_feats; i++ )); do - feat=${feat_scps[$((i-1))]} - mkdir -p ${tmpdir}/input_${i} - input+="input_${i} " - cat ${feat} > ${tmpdir}/input_${i}/feat.scp - - # Dump in the "legacy" style JSON format - if [ -n "${filetype}" ]; then - awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ - > ${tmpdir}/input_${i}/filetype.scp - fi - - if [ ${feat_type} == "kaldi" ]; then - tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ - --filetype "${filetype}" \ - --preprocess-conf "${preprocess_conf}" \ - --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp - elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then - if [ -f $dir/segments ]; then - # used for segmented wav.scp - awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur - fi - if [ ! -f $dir/utt2dur ]; then - tools/wav_to_duration.sh --nj ${nj} \ - ${feat} ${tmpdir}/input_${i}/shape.scp - # use the existed utt2dur as shape.scp directly - else - cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp - fi - fi - done -fi - -# 2. Create scp files for outputs -mkdir -p ${tmpdir}/output -if [ -n "${bpecode}" ]; then - if [ "${trans_type}" == "cn_char_en_bpe" ]; then - tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp - else - paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \ - | tools/spm_encode --model=${bpecode} --output_format=piece) \ - > ${tmpdir}/output/token.scp - fi -elif [ -n "${nlsyms}" ]; then - tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -elif [ -n "${raw}" ]; then - cat $dir/text > ${tmpdir}/output/token.scp -else - tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -fi -< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp -odim=$(cat ${dic} | wc -l) -< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp - -cat ${dir}/text > ${tmpdir}/output/text.scp - -# 3. Create scp files for the others -mkdir -p ${tmpdir}/other -if [ -n "${lang}" ]; then - awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp -fi - -if [ -n "${category}" ]; then - awk -v category=${category} '{print $1 " " category}' ${dir}/text \ - > ${tmpdir}/other/category.scp -fi -#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp - -# 4. 
Merge scp files into a one file -opts="" -for intype in ${input} output other; do - if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then - continue - fi - - if [ ${intype} != other ]; then - opts+="--${intype%_*}-scps " - else - opts+="--scps " - fi - - for x in "${tmpdir}/${intype}"/*.scp; do - k=$(basename ${x} .scp) - if [ ${k} = shape ]; then - opts+="shape:${x}:shape " - else - opts+="${k}:${x} " - fi - done -done - -if ${allow_one_column}; then - opts+="--allow-one-column true " -else - opts+="--allow-one-column false " -fi - -if [ -n "${out}" ]; then - opts+="-O ${out}" -fi - -tools/merge_scp2txt.py --verbose ${verbose} ${opts} - -#rm -fr ${tmpdir} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/add_lex_disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/add_lex_disambig.pl deleted file mode 100644 index dd8a25de6e1140a6d19b1e876f2e76f528532edf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/add_lex_disambig.pl +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2013-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. -# With the --pron-probs option, expects the second field -# of each lexicon line to be a pron-prob. -# With the --sil-probs option, expects three additional -# fields after the pron-prob, representing various components -# of the silence probability model. - -$pron_probs = 0; -$sil_probs = 0; -$first_allowed_disambig = 1; - -for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { - if ($ARGV[0] eq "--pron-probs") { - $pron_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--sil-probs") { - $sil_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--first-allowed-disambig") { - $first_allowed_disambig = 0 + $ARGV[1]; - if ($first_allowed_disambig < 1) { - die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; - } - shift @ARGV; - shift @ARGV; - } -} - -if (@ARGV != 2) { - die "Usage: add_lex_disambig.pl [opts] \n" . - "This script adds disambiguation symbols to a lexicon in order to\n" . - "make decoding graphs determinizable; it adds pseudo-phone\n" . - "disambiguation symbols #1, #2 and so on at the ends of phones\n" . - "to ensure that all pronunciations are different, and that none\n" . - "is a prefix of another.\n" . - "It prints to the standard output the number of the largest-numbered" . - "disambiguation symbol that was used.\n" . - "\n" . 
- "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . - " --sil-probs [should be with --pron-probs option]\n" . - " Expect 3 extra fields after the pron-probs, for aspects of\n" . - " the silence probability model\n" . - " --first-allowed-disambig The number of the first disambiguation symbol\n" . - " that this script is allowed to add. By default this is\n" . - " #1, but you can set this to a larger value using this option.\n" . - "e.g.:\n" . - " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { - $p = shift @A; - if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } - } - if ($sil_probs) { - $silp = shift @A; - if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - } - if (!(@A)) { - die "Bad lexicon line $1, no phone in phone list"; - } - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that it exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { shift @A; } # remove pron-prob. - if ($sil_probs) { - shift @A; # Remove silprob - shift @A; # Remove silprob - } - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -# max_disambig will always be the highest-numbered disambiguation symbol that -# has been used so far. -$max_disambig = $first_allowed_disambig - 1; - -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - if ($pron_probs) { - $pron_prob = shift @A; - } - if ($sil_probs) { - $sil_word_prob = shift @A; - $word_sil_correction = shift @A; - $prev_nonsil_correction = shift @A - } - $phnseq = join(" ", @A); - if (!defined $issubseq{$phnseq} - && $count{$phnseq} == 1) { - ; # Do nothing. - } else { - if ($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved_for_the_empty_string{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; - if (!defined $cur_disambig) { - $cur_disambig = $first_allowed_disambig; - } else { - $cur_disambig++; # Get a number that has not been used yet for - # this phone sequence. 
- } - while (defined $reserved_for_the_empty_string{$cur_disambig}) { - $cur_disambig++; - } - if ($cur_disambig > $max_disambig) { - $max_disambig = $cur_disambig; - } - $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; - $phnseq = $phnseq . " #" . $cur_disambig; - } - } - if ($pron_probs) { - if ($sil_probs) { - print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; - } else { - print O "$word\t$pron_prob\t$phnseq\n"; - } - } else { - print O "$word\t$phnseq\n"; - } -} - -print $max_disambig . "\n"; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/compile_lexicon_token_fst.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/compile_lexicon_token_fst.sh deleted file mode 100644 index b67814fe3f3244b14b8e494bfe46c4829c4f8bd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/compile_lexicon_token_fst.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -# Copyright 2015 Yajie Miao (Carnegie Mellon University) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the -# phoneme and character-based lexicons. -set -eo pipefail -. tools/parse_options.sh - -if [ $# -ne 3 ]; then - echo "usage: tools/fst/compile_lexicon_token_fst.sh " - echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" - echo " should contain the following files:" - echo "lexicon.txt units.txt" - echo "options: " - exit 1; -fi - -srcdir=$1 -tmpdir=$2 -dir=$3 -mkdir -p $dir $tmpdir - -[ -f path.sh ] && . ./path.sh - -export LC_ALL=C - -cp $srcdir/units.txt $dir - -# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. -# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. -perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; - -# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. -# Without these symbols, determinization will fail. -ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` -ndisambig=$[$ndisambig+1]; - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list - -# Get the full list of CTC tokens used in FST. These tokens include , the blank , -# the actual model unit, and the disambiguation symbols. 
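The token list assembled here pairs every symbol with a consecutive integer id: the special symbols (conventionally `<eps>` and the CTC blank `<blank>`, an assumption here since the tag names did not survive extraction), then the acoustic model units, then the `#N` disambiguation symbols. A small sketch of that numbering with toy inputs:

```python
def write_tokens_txt(units, disambig, specials=("<eps>", "<blank>"), out_path="tokens.txt"):
    """Write one 'symbol id' pair per line: special symbols, model units, then disambig symbols."""
    symbols = list(specials) + list(units) + list(disambig)
    with open(out_path, "w", encoding="utf8") as fout:
        for idx, sym in enumerate(symbols):
            fout.write(f"{sym} {idx}\n")

# Toy example; real inputs come from units.txt and add_lex_disambig.pl.
write_tokens_txt(units=["ni", "hao"], disambig=["#0", "#1"])
```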
-cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list -(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt - -# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, -# so here use ctc_token_fst_compact -tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; - -# Encode the words with indices. Will be used in lexicon and language model FST compiling. -cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' - BEGIN { - print " 0"; - } - { - printf("%s %d\n", $1, NR); - } - END { - printf("#0 %d\n", NR+1); - printf(" %d\n", NR+2); - printf(" %d\n", NR+3); - }' > $dir/words.txt || exit 1; - -# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time. -token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` - -tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ - fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; - -echo "Lexicon and token FSTs compiling succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/ctc_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/ctc_token_fst.py deleted file mode 100644 index d81644b9cd216177a10a17772781d3293abe084f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/ctc_token_fst.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 1 ') -print('1 1 ') -print('2 2 ') -print('2 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 3 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(1, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 2, '', '')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/ctc_token_fst_compact.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/ctc_token_fst_compact.py deleted file mode 100644 index d3018d8b14ce25108cb1acc637cecded5d41be13..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/ctc_token_fst_compact.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 1 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 0, '', '')) - node 
+= 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/ctc_token_fst_corrected.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/ctc_token_fst_corrected.py deleted file mode 100644 index 81f7079eccb9e6447c46cdfdf6378aca7efe4a09..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/ctc_token_fst_corrected.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python - -import sys - - -def il(n): - return n + 1 - - -def ol(n): - return n + 1 - - -def s(n): - return n - - -if __name__ == "__main__": - with open(sys.argv[1]) as f: - lines = f.readlines() - phone_count = 0 - disambig_count = 0 - for line in lines: - sp = line.split() - phone = sp[0] - if phone == '' or phone == '': - continue - if phone.startswith('#'): - disambig_count += 1 - else: - phone_count += 1 - - # 1. add start state - print('0 0 {} 0'.format(il(0))) - - # 2. 0 -> i, i -> i, i -> 0 - for i in range(1, phone_count + 1): - print('0 {} {} {}'.format(s(i), il(i), ol(i))) - print('{} {} {} 0'.format(s(i), s(i), il(i))) - print('{} 0 {} 0'.format(s(i), il(0))) - - # 3. i -> other phone - for i in range(1, phone_count + 1): - for j in range(1, phone_count + 1): - if i != j: - print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) - - # 4. add disambiguous arcs on every final state - for i in range(0, phone_count + 1): - for j in range(phone_count + 2, phone_count + disambig_count + 2): - print('{} {} {} {}'.format(s(i), s(i), 0, j)) - - # 5. every i is final state - for i in range(0, phone_count + 1): - print(s(i)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/eps2disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/eps2disambig.pl deleted file mode 100644 index e1d84a6bf56703596a0e4552d184f7168f724bcb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/eps2disambig.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - if (/\s+#0\s+/) { - print STDERR "$0: ERROR: LM has word #0, " . 
- "which is reserved as disambiguation symbol\n"; - exit 1; - } - s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; - print; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/make_lexicon_fst.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/make_lexicon_fst.pl deleted file mode 100644 index f97129c05cb3ba6460be401e92001261acfaf746..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/make_lexicon_fst.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). - -$pron_probs = 0; - -if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { - $pron_probs = 1; - shift @ARGV; -} - -if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; - print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; - print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; - print STDERR " word phone1 phone2 ... phoneN;\n"; - print STDERR "if the --pron-probs option is used, each line is:\n"; - print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; - print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; - print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; - print STDERR "this is your responsibility.\n\n"; - print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; - print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; - print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; - exit(1); -} - -$lexfn = shift @ARGV; -if (@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2) { - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if ($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - -if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. - while () { - @A = split(" ", $_); - @A == 0 && die "Empty lexicon line."; - foreach $a (@A) { - if ($a eq "") { - die "Bad lexicon line $_ ( is forbidden)"; - } - } - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! 
defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; # so we only print it on the first arc of the word. - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. - if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while () { - @A = split(" ", $_); - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. - $s = $ns; - } elsif (!defined($silphone) || $p ne $silphone) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/make_tlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/make_tlg.sh deleted file mode 100644 index 98694e5540968760f0c27eaf30a6668f4c46c50d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/make_tlg.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# - -if [ -f path.sh ]; then . path.sh; fi - -lm_dir=$1 -src_lang=$2 -tgt_lang=$3 - -arpa_lm=${lm_dir}/lm.arpa -[ ! 
-f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -rm -rf $tgt_lang -cp -r $src_lang $tgt_lang - -# Compose the language model to FST -cat $arpa_lm | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v -i '' | \ - grep -v -i '' | \ - arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \ - tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \ - --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst - - -echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic $tgt_lang/G.fst - -# Compose the token, lexicon and language-model FST into the final decoding graph -fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1; -fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1; - -echo "Composing decoding graph TLG.fst succeeded" -#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/prepare_dict.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/prepare_dict.py deleted file mode 100644 index 8a6a3cfe7cfded0c863637deef0bae2f2ede5557..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/prepare_dict.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -# sys.argv[1]: e2e model unit file(lang_char.txt) -# sys.argv[2]: raw lexicon file -# sys.argv[3]: output lexicon file -# sys.argv[4]: bpemodel - -unit_table = set() -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for line in fin: - unit = line.split()[0] - unit_table.add(unit) - - -def contain_oov(units): - for unit in units: - if unit not in unit_table: - return True - return False - - -bpemode = len(sys.argv) > 4 -if bpemode: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.Load(sys.argv[4]) -lexicon_table = set() -with open(sys.argv[2], 'r', encoding='utf8') as fin, \ - open(sys.argv[3], 'w', encoding='utf8') as fout: - for line in fin: - word = line.split()[0] - if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel - continue - elif word == '': - continue - else: - # each word only has one pronunciation for e2e system - if word in lexicon_table: - continue - if bpemode: - # We assume that the lexicon does not contain code-switch, - # i.e. the word contains both English and Chinese. - # see PR https://github.com/wenet-e2e/wenet/pull/1693 - # and Issue https://github.com/wenet-e2e/wenet/issues/1653 - if word.encode('utf8').isalpha(): - pieces = sp.EncodeAsPieces(word) - else: - pieces = word - if contain_oov(pieces): - print( - 'Ignoring words {}, which contains oov unit'.format( - ''.join(word).strip('▁')) - ) - continue - chars = ' '.join( - [p if p in unit_table else '' for p in pieces]) - else: - # ignore words with OOV - if contain_oov(word): - print('Ignoring words {}, which contains oov unit'.format(word)) - continue - # Optional, append ▁ in front of english word - # we assume the model unit of our e2e system is char now. 
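In the char-unit branch, each in-vocabulary word becomes a space-separated character sequence, with English words optionally prefixed by `▁` when that marker is part of the unit set. A compact sketch of that entry construction (the helper name and toy unit tables are illustrative):

```python
def char_lexicon_entry(word, unit_table):
    """Build a 'word char1 char2 ...' lexicon line, or None if the word has OOV characters."""
    if any(ch not in unit_table for ch in word):
        return None                                   # the script skips OOV words entirely
    if word.encode("utf8").isalpha() and "▁" in unit_table:
        word = "▁" + word                             # optional ▁ marker for English words
    return "{} {}".format(word, " ".join(word))

print(char_lexicon_entry("你好", {"你", "好"}))          # '你好 你 好'
print(char_lexicon_entry("abc", {"▁", "a", "b", "c"}))   # '▁abc ▁ a b c'
```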
- if word.encode('utf8').isalpha() and '▁' in unit_table: - word = '▁' + word - chars = ' '.join(word) # word is a char list - fout.write('{} {}\n'.format(word, chars)) - lexicon_table.add(word) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/remove_oovs.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/remove_oovs.pl deleted file mode 100644 index ac914c3bd9363eded791cdeb309fd05e980c4f2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/remove_oovs.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script removes lines that contain these OOVs on either the -# third or fourth fields of the line. It is intended to remove arcs -# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). - -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; -} - -$unklist = shift @ARGV; -open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; -while(){ - @A = split(" ", $_); - @A == 1 || die "Bad line in unknown-symbol list: $_"; - $unk{$A[0]} = 1; -} - -$num_removed = 0; -while(<>){ - @A = split(" ", $_); - if(defined $unk{$A[2]} || defined $unk{$A[3]}) { - $num_removed++; - } else { - print; - } -} -print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/rnnt_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/rnnt_token_fst.py deleted file mode 100644 index cc6def1703311ab700a4a01f22c1adda32db9b0d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/rnnt_token_fst.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, 0, phone, phone)) -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/s2eps.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/s2eps.pl deleted file mode 100644 index ffeeb8eb6af3c4f319f31ebff80be388d8f59e1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/fst/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you 
may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces and with (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } - if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } - } - print join("\t", @A) . "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/git-pre-commit b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/git-pre-commit deleted file mode 100644 index b6e448ed375a0ddf502ce332685de8a99e88dc08..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/git-pre-commit +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -e - -echo "Running pre-commit flake8" -python tools/flake8_hook.py diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/install_srilm.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/install_srilm.sh deleted file mode 100644 index 4aa113c14722a73fd3d3f84430025d44173c207b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/install_srilm.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2022 Binbin Zhang(binbzha@qq.com) - -current_path=`pwd` -current_dir=`basename "$current_path"` - -if [ "tools" != "$current_dir" ]; then - echo "You should run this script in tools/ directory!!" - exit 1 -fi - -! command -v gawk > /dev/null && \ - echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; - -srilm_url="https://github.com/BitSpeech/SRILM/archive/refs/tags/1.7.3.tar.gz" - -if [ ! -f ./srilm.tar.gz ]; then - if ! wget -O ./srilm.tar.gz "$srilm_url"; then - echo 'There was a problem downloading the file.' - echo 'Check you internet connection and try again.' - exit 1 - fi -fi - -tar -zxvf srilm.tar.gz -mv SRILM-1.7.3 srilm - -# set the SRILM variable in the top-level Makefile to this directory. -cd srilm -cp Makefile tmpf - -cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \ - > Makefile || exit 1 -rm tmpf - -make || exit -cd .. - -( - [ ! -z "${SRILM}" ] && \ - echo >&2 "SRILM variable is aleady defined. Undefining..." && \ - unset SRILM - - [ -f ./env.sh ] && . ./env.sh - - [ ! 
-z "${SRILM}" ] && \ - echo >&2 "SRILM config is already in env.sh" && exit - - wd=`pwd` - wd=`readlink -f $wd || pwd` - - echo "export SRILM=$wd/srilm" - dirs="\${PATH}" - for directory in $(cd srilm && find bin -type d ) ; do - dirs="$dirs:\${SRILM}/$directory" - done - echo "export PATH=$dirs" -) >> env.sh - -echo >&2 "Installation of SRILM finished successfully" -echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/k2/make_hlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/k2/make_hlg.sh deleted file mode 100644 index 18c2268487410824ae11b199cf06f37acd717c88..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/k2/make_hlg.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) - -lexion_dir=$1 -lm_dir=$2 -tgt_dir=$3 - -# k2 and icefall updates very fast. Below commits are veryfied in this script. -# k2 3dc222f981b9fdbc8061b3782c3b385514a2d444, icefall 499ac24ecba64f687ff244c7d66baa5c222ecf0f - -# For k2 installation, please refer to https://github.com/k2-fsa/k2/ -python -c "import k2; print(k2.__file__)" -python -c "import torch; import _k2; print(_k2.__file__)" - -# Prepare necessary icefall scripts -if [ ! -d tools/k2/icefall ]; then - git clone --depth 1 https://github.com/k2-fsa/icefall.git tools/k2/icefall -fi -pip3 install -r tools/k2/icefall/requirements.txt -export PYTHONPATH=`pwd`/tools/k2/icefall:`pwd`/tools/k2/icefall/egs/aishell/ASR/local:$PYTHONPATH - -# 8.1 Prepare char based lang -mkdir -p $tgt_dir -python tools/k2/prepare_char.py $lexion_dir/units.txt $lm_dir/wordlist $tgt_dir -echo "Compile lexicon L.pt L_disambig.pt succeeded" - -# 8.2 Prepare G -mkdir -p data/lm -python -m kaldilm \ - --read-symbol-table="$tgt_dir/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $lm_dir/lm.arpa > data/lm/G_3_gram.fst.txt - -# 8.3 Compile HLG -python tools/k2/icefall/egs/aishell/ASR/local/compile_hlg.py --lang-dir $tgt_dir -echo "Compile decoding graph HLG.pt succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/k2/prepare_char.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/k2/prepare_char.py deleted file mode 100644 index 6e05042c42eb280135f6be7cdb3566b185258b90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/k2/prepare_char.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -""" - -This script generates the following files in the directory sys.argv[3]: - - - lexicon.txt - - lexicon_disambig.txt - - L.pt - - L_disambig.pt - - tokens.txt - - words.txt -""" - -import sys -from pathlib import Path -from typing import Dict, List - -import k2 -import torch -from prepare_lang import ( - Lexicon, - add_disambig_symbols, - add_self_loops, - write_lexicon, - write_mapping, -) - - -def lexicon_to_fst_no_sil( - lexicon: Lexicon, - token2id: Dict[str, int], - word2id: Dict[str, int], - need_self_loops: bool = False, -) -> k2.Fsa: - """Convert a lexicon to an FST (in k2 format). - - Args: - lexicon: - The input lexicon. See also :func:`read_lexicon` - token2id: - A dict mapping tokens to IDs. - word2id: - A dict mapping words to IDs. - need_self_loops: - If True, add self-loop to states with non-epsilon output symbols - on at least one arc out of the state. The input label for this - self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. - Returns: - Return an instance of `k2.Fsa` representing the given lexicon. - """ - loop_state = 0 # words enter and leave from here - next_state = 1 # the next un-allocated state, will be incremented as we go - - arcs = [] - - # The blank symbol <blank> is defined in local/train_bpe_model.py - assert token2id["<blank>"] == 0 - assert word2id["<eps>"] == 0 - - eps = 0 - - for word, pieces in lexicon: - assert len(pieces) > 0, f"{word} has no pronunciations" - cur_state = loop_state - - word = word2id[word] - pieces = [ - token2id[i] if i in token2id else token2id["<unk>"] for i in pieces - ] - - for i in range(len(pieces) - 1): - w = word if i == 0 else eps - arcs.append([cur_state, next_state, pieces[i], w, 0]) - - cur_state = next_state - next_state += 1 - - # now for the last piece of this word - i = len(pieces) - 1 - w = word if i == 0 else eps - arcs.append([cur_state, loop_state, pieces[i], w, 0]) - - if need_self_loops: - disambig_token = token2id["#0"] - disambig_word = word2id["#0"] - arcs = add_self_loops( - arcs, - disambig_token=disambig_token, - disambig_word=disambig_word, - ) - - final_state = next_state - arcs.append([loop_state, final_state, -1, -1, 0]) - arcs.append([final_state]) - - arcs = sorted(arcs, key=lambda arc: arc[0]) - arcs = [[str(i) for i in arc] for arc in arcs] - arcs = [" ".join(arc) for arc in arcs] - arcs = "\n".join(arcs) - - fsa = k2.Fsa.from_str(arcs, acceptor=False) - return fsa - - -def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool: - """Check if all the given tokens are in token symbol table. - - Args: - token_sym_table: - Token symbol table that contains all the valid tokens. - tokens: - A list of tokens. - Returns: - Return True if there is any token not in the token_sym_table, - otherwise False. - """ - for tok in tokens: - if tok not in token_sym_table: - return True - return False - - -def generate_lexicon( - token_sym_table: Dict[str, int], words: List[str] -) -> Lexicon: - """Generate a lexicon from a word list and token_sym_table. - - Args: - token_sym_table: - Token symbol table that mapping token to token ids. - words: - A list of strings representing words. - Returns: - Return a dict whose keys are words and values are the corresponding - tokens. 
- """ - lexicon = [] - for word in words: - chars = list(word.strip(" \t")) - if contain_oov(token_sym_table, chars): - continue - lexicon.append((word, chars)) - - # The OOV word is <UNK> - lexicon.append(("<UNK>", ["<unk>"])) - return lexicon - - -def generate_tokens(text_file: str) -> Dict[str, int]: - """Generate tokens from the given text file. - - Args: - text_file: - A file that contains text lines to generate tokens. - Returns: - Return a dict whose keys are tokens and values are token ids ranged - from 0 to len(keys) - 1. - """ - token2id: Dict[str, int] = dict() - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - char, index = line.replace('\n', '').split() - assert char not in token2id - token2id[char] = int(index) - assert token2id['<blank>'] == 0 - return token2id - - -def generate_words(text_file: str) -> Dict[str, int]: - """Generate words from the given text file. - - Args: - text_file: - A file that contains text lines to generate words. - Returns: - Return a dict whose keys are words and values are words ids ranged - from 0 to len(keys) - 1. - """ - words = [] - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - word = line.replace('\n', '') - assert word not in words - words.append(word) - words.sort() - - # We put '<eps>' '<unk>' at begining of word2id - # '#0', '<s>', '</s>' at end of word2id - words = [word for word in words - if word not in ['<eps>', '<unk>', '#0', '<s>', '</s>']] - words.insert(0, '<eps>') - words.insert(1, '<unk>') - words.append('#0') - words.append('<s>') - words.append('</s>') - word2id = {j: i for i, j in enumerate(words)} - return word2id - - -def main(): - token2id = generate_tokens(sys.argv[1]) - word2id = generate_words(sys.argv[2]) - tgt_dir = Path(sys.argv[3]) - - words = [word for word in word2id.keys() - if word not in - ["<eps>", "!SIL", "<SPOKEN_NOISE>", "<UNK>", "#0", "<s>", "</s>"]] - lexicon = generate_lexicon(token2id, words) - - lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) - next_token_id = max(token2id.values()) + 1 - for i in range(max_disambig + 1): - disambig = f"#{i}" - assert disambig not in token2id - token2id[disambig] = next_token_id - next_token_id += 1 - - write_mapping(tgt_dir / "tokens.txt", token2id) - write_mapping(tgt_dir / "words.txt", word2id) - write_lexicon(tgt_dir / "lexicon.txt", lexicon) - write_lexicon(tgt_dir / "lexicon_disambig.txt", lexicon_disambig) - - L = lexicon_to_fst_no_sil( - lexicon, - token2id=token2id, - word2id=word2id, - ) - L_disambig = lexicon_to_fst_no_sil( - lexicon_disambig, - token2id=token2id, - word2id=word2id, - need_self_loops=True, - ) - torch.save(L.as_dict(), tgt_dir / "L.pt") - torch.save(L_disambig.as_dict(), tgt_dir / "L_disambig.pt") - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/latency_metrics.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/latency_metrics.py deleted file mode 100644 index df2d8eee45f8e2d7c8536f208d44fafaeac3341f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/latency_metrics.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2022 Horizon Inc. (author: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import argparse -import logging -import librosa -import torch -import torchaudio -import yaml - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.font_manager as fm -import torchaudio.compliance.kaldi as kaldi - -from wenet.utils.init_model import init_model -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.mask import make_pad_mask -from wenet.utils.common import replace_duplicates_with_blank - - -def get_args(): - parser = argparse.ArgumentParser( - description='Analyze latency and plot CTC-Spike.') - parser.add_argument('--config', required=True, - type=str, help='configration') - parser.add_argument('--gpu', - type=int, - default=0, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--ckpt', required=True, - type=str, help='model checkpoint') - parser.add_argument('--tag', required=True, - type=str, help='image subtitle') - parser.add_argument('--wavscp', required=True, - type=str, help='wav.scp') - parser.add_argument('--alignment', required=True, - type=str, help='force alignment, generated by Kaldi.') - parser.add_argument('--chunk_size', required=True, - type=int, help='chunk size') - parser.add_argument('--left_chunks', default=-1, - type=int, help='left chunks') - parser.add_argument('--font', required=True, - type=str, help='font file') - parser.add_argument('--dict', required=True, - type=str, help='dict file') - parser.add_argument('--result_dir', required=True, - type=str, help='saving pdf') - parser.add_argument('--model_type', default='ctc', - choices=['ctc', 'transducer'], - help='show latency metrics from ctc models or rnn-t models') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - torch.manual_seed(777) - - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - symbol_table = read_symbol_table(args.dict) - char_dict = {v: k for k, v in symbol_table.items()} - - # 1. Load model - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - - model = init_model(conf) - load_checkpoint(model, args.ckpt) - model = model.eval().to(device) - - subsampling = model.encoder.embed.subsampling_rate - eos = model.eos_symbol() - - with open(args.wavscp, 'r') as fin: - wavs = fin.readlines() - - # 2. 
Forward model (get streaming_timestamps) - timestamps = {} - for idx, wav in enumerate(wavs): - if idx % 100 == 0: - logging.info("processed {}.".format(idx)) - key, wav = wav.strip().split(' ', 1) - waveform, sr = torchaudio.load(wav) - resample_rate = conf['dataset_conf']['resample_conf']['resample_rate'] - waveform = torchaudio.transforms.Resample( - orig_freq=sr, new_freq=resample_rate)(waveform) - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank( - waveform, - num_mel_bins=conf['dataset_conf']['fbank_conf']['num_mel_bins'], - frame_length=conf['dataset_conf']['fbank_conf']['frame_length'], - frame_shift=conf['dataset_conf']['fbank_conf']['frame_shift'], - dither=0.0, energy_floor=0.0, - sample_frequency=resample_rate, - ) - - speech = mat.unsqueeze(0).to(device) - speech_lengths = torch.tensor([mat.size(0)]).to(device) - - # Let's assume batch_size = 1 - encoder_out, encoder_mask = model.encoder( - speech, speech_lengths, args.chunk_size, args.left_chunks) - - maxlen = encoder_out.size(1) # (B, maxlen, encoder_dim) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # CTC greedy search - if args.model_type == 'ctc': - ctc_probs = model.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - topk_prob = topk_prob.view(1, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen) - topk_prob = topk_prob.masked_fill_(mask, 0.0) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - hyps = [replace_duplicates_with_blank(hyp) for hyp in hyps] - scores = [prob.tolist() for prob in topk_prob] - timestamps[key] = [hyps[0], scores[0], wav] - - if args.model_type == 'transducer': - hyps = [] - scores = [] - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = 1 - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, - padding, cache) - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - scores.append(torch.max(joint_out_probs).item()) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or \ - per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - hyps.append(model.blank) - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - timestamps[key] = [hyps, scores, wav] - - # 3. 
Analyze latency - with open(args.alignment, 'r') as fin: - aligns = fin.readlines() - not_found, len_unequal, ignored = 0, 0, 0 - datas = [] - for align in aligns: - key, align = align.strip().split(' ', 1) - if key not in timestamps: - not_found += 1 - continue - fa, st = [], [] # force_alignment, streaming_timestamps - text_fa, text_st = "", "" - for i, token in enumerate(align.split()): - if token != '': - text_fa += token - # NOTE(xcsong): W/O subsample - fa.append(i * 10) - # ignore alignment_errors >= 70ms - frames_fa = len(align.split()) - frames_st = len(timestamps[key][0]) * subsampling - if abs(frames_st - frames_fa) >= 7: - ignored += 1 - continue - for i, token_id in enumerate(timestamps[key][0]): - if token_id != 0: - text_st += char_dict[token_id] - # NOTE(xcsong): W subsample - st.append(i * subsampling * 10) - if len(fa) != len(st): - len_unequal += 1 - continue - # datas[i] = [key, text_fa, text_st, list_of_diff, - # FirstTokenDelay, LastTokenDelay, AvgTokenDelay, - # streaming_timestamps, force_alignment] - datas.append([key, text_fa, text_st, - [a - b for a, b in zip(st, fa)], - st[0] - fa[0], st[-1] - fa[-1], - (sum(st) - sum(fa)) / len(st), - timestamps[key], align.split()]) - - logging.info("not found: {}, length unequal: {}, ignored: {}, \ - valid samples: {}".format(not_found, len_unequal, ignored, len(datas))) - - # 4. Plot and print - num_datas = len(datas) - names = ['FirstTokenDelay', 'LastTokenDelay', 'AvgTokenDelay'] - names_index = [4, 5, 6] - parts = ['max', 'P90', 'P75', 'P50', 'P25', 'min'] - parts_index = [num_datas - 1, int(num_datas * 0.90), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - for name, name_idx in zip(names, names_index): - def f(name_idx=name_idx): - return name_idx - datas.sort(key=lambda x: x[f()]) - logging.info("==========================") - for p, i in zip(parts, parts_index): - data = datas[i] - # i.e., LastTokenDelay P90: 270.000 ms (wav_id: BAC009S0902W0144) - logging.info("{} {}: {:.3f} ms (wav_id: {})".format( - name, p, data[f()], datas[i][0])) - - font = fm.FontProperties(fname=args.font) - plt.rcParams['axes.unicode_minus'] = False - # we will have 2 sub-plots (force-align + streaming timestamps) - # plus one wav-plot - fig, axes = plt.subplots(figsize=(60, 60), nrows=3, ncols=1) - for j in range(2): - if j == 0: - # subplot-0: streaming_timestamps - plt_prefix = args.tag + "_" + name + "_" + p - x = np.arange(len(data[7][0])) * subsampling - hyps, scores = data[7][0], data[7][1] - else: - # subplot-1: force_alignments - plt_prefix = "force_alignment" - x = np.arange(len(data[8])) - hyps = [symbol_table[d] for d in data[8]] - scores = [0.0] * len(data[8]) - axes[j].set_title(plt_prefix, fontsize=30) - for frame, token, prob in zip(x, hyps, scores): - if char_dict[token] != '': - axes[j].bar( - frame, np.exp(prob), - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].text( - frame, np.exp(prob), - '{} {:.3f} {}'.format( - char_dict[token], np.exp(prob), frame), - fontdict=dict(fontsize=24), - fontproperties=font, - ) - else: - axes[j].bar( - frame, 0.01, - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].tick_params(labelsize=25) - - # subplot-2: wav - # wav, hardcode sample_rate to 16000 - samples, sr = librosa.load(data[7][2], sr=16000) - time = np.arange(0, len(samples)) * (1.0 / sr) - axes[-1].plot(time, samples) - - # i.e., RESULT_DIR/LTD_P90_120ms_BAC009S0768W0342.pdf - plt.savefig(args.result_dir + "/" + name + "_" + - p + "_" + str(data[f()]) 
+ "ms" + "_" + data[0] + ".pdf") - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/make_raw_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/make_raw_list.py deleted file mode 100644 index 2f84f015542bb38da027b8ea61e8638f873cec33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/make_raw_list.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('output_file', help='output list file') - args = parser.parse_args() - - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - if args.segments is not None: - segments_table = {} - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - with open(args.text_file, 'r', encoding='utf8') as fin, \ - open(args.output_file, 'w', encoding='utf8') as fout: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if args.segments is None: - assert key in wav_table - wav = wav_table[key] - line = dict(key=key, wav=wav, txt=txt) - else: - assert key in segments_table - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - line = dict(key=key, wav=wav, txt=txt, start=start, end=end) - json_line = json.dumps(line, ensure_ascii=False) - fout.write(json_line + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/make_shard_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/make_shard_list.py deleted file mode 100644 index 1f7d82829808c9cc181bbc5e0f60cccef8795bae..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/make_shard_list.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import io -import logging -import os -import tarfile -import time -import multiprocessing - -import torch -import torchaudio -import torchaudio.backend.sox_io_backend as sox - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def write_tar_file(data_list, - no_segments, - tar_file, - resample=16000, - index=0, - total=1): - logging.info('Processing {} {}/{}'.format(tar_file, index, total)) - read_time = 0.0 - save_time = 0.0 - write_time = 0.0 - with tarfile.open(tar_file, "w") as tar: - prev_wav = None - for item in data_list: - if no_segments: - key, txt, wav = item - else: - key, txt, wav, start, end = item - - suffix = wav.split('.')[-1] - assert suffix in AUDIO_FORMAT_SETS - if no_segments: - ts = time.time() - with open(wav, 'rb') as fin: - data = fin.read() - read_time += (time.time() - ts) - else: - if wav != prev_wav: - ts = time.time() - waveforms, sample_rate = sox.load(wav, normalize=False) - read_time += (time.time() - ts) - prev_wav = wav - start = int(start * sample_rate) - end = int(end * sample_rate) - audio = waveforms[:1, start:end] - - # resample - if sample_rate != resample: - if not audio.is_floating_point(): - # normalize the audio before resample - # because resample can't process int audio - audio = audio / (1 << 15) - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - audio = (audio * (1 << 15)).short() - else: - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - - ts = time.time() - f = io.BytesIO() - sox.save(f, audio, resample, format="wav", bits_per_sample=16) - # Save to wav for segments file - suffix = "wav" - f.seek(0) - data = f.read() - save_time += (time.time() - ts) - - assert isinstance(txt, str) - ts = time.time() - txt_file = key + '.txt' - txt = txt.encode('utf8') - txt_data = io.BytesIO(txt) - txt_info = tarfile.TarInfo(txt_file) - txt_info.size = len(txt) - tar.addfile(txt_info, txt_data) - - wav_file = key + '.' 
+ suffix - wav_data = io.BytesIO(data) - wav_info = tarfile.TarInfo(wav_file) - wav_info.size = len(data) - tar.addfile(wav_info, wav_data) - write_time += (time.time() - ts) - logging.info('read {} save {} write {}'.format(read_time, save_time, - write_time)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--num_utts_per_shard', - type=int, - default=1000, - help='num utts per shard') - parser.add_argument('--num_threads', - type=int, - default=1, - help='num threads for make shards') - parser.add_argument('--prefix', - default='shards', - help='prefix of shards tar file') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('--resample', - type=int, - default=16000, - help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('shards_dir', help='output shards dir') - parser.add_argument('shards_list', help='output shards list file') - args = parser.parse_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - torch.set_num_threads(1) - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - no_segments = True - segments_table = {} - if args.segments is not None: - no_segments = False - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - data = [] - with open(args.text_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if no_segments: - assert key in wav_table - wav = wav_table[key] - data.append((key, txt, wav)) - else: - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - data.append((key, txt, wav, start, end)) - - num = args.num_utts_per_shard - chunks = [data[i:i + num] for i in range(0, len(data), num)] - os.makedirs(args.shards_dir, exist_ok=True) - - # Using thread pool to speedup - pool = multiprocessing.Pool(processes=args.num_threads) - shards_list = [] - tasks_list = [] - num_chunks = len(chunks) - for i, chunk in enumerate(chunks): - tar_file = os.path.join(args.shards_dir, - '{}_{:09d}.tar'.format(args.prefix, i)) - shards_list.append(tar_file) - pool.apply_async( - write_tar_file, - (chunk, no_segments, tar_file, args.resample, i, num_chunks)) - - pool.close() - pool.join() - - with open(args.shards_list, 'w', encoding='utf8') as fout: - for name in shards_list: - fout.write(name + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/merge_scp2txt.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/merge_scp2txt.py deleted file mode 100644 index 51f1c42f272f0fd9fec0a7d69ee860d2f1eb6158..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/merge_scp2txt.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -from distutils.util import strtobool -from io import open -import logging -import sys - -PY2 = sys.version_info[0] == 2 -sys.stdin = codecs.getreader('utf-8')(sys.stdin 
if PY2 else sys.stdin.buffer) -sys.stdout = codecs.getwriter('utf-8')( - sys.stdout if PY2 else sys.stdout.buffer) - - -# Special types: -def shape(x): - """Change str to List[int] - - >>> shape('3,5') - [3, 5] - >>> shape(' [3, 5] ') - [3, 5] - - """ - - # x: ' [3, 5] ' -> '3, 5' - x = x.strip() - if x[0] == '[': - x = x[1:] - if x[-1] == ']': - x = x[:-1] - - return list(map(int, x.split(','))) - - -def get_parser(): - parser = argparse.ArgumentParser( - description='Given each file paths with such format as ' - '::. type> can be omitted and the default ' - 'is "str". e.g. {} ' - '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' - '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' - '--output-scps text:data/text shape:data/utt2text_shape:shape ' - '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--input-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the inputs') - parser.add_argument('--output-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the outputs') - parser.add_argument('--scps', - type=str, - nargs='+', - default=[], - help='The files except for the input and outputs') - parser.add_argument('--verbose', - '-V', - default=1, - type=int, - help='Verbose option') - parser.add_argument('--allow-one-column', - type=strtobool, - default=False, - help='Allow one column in input scp files. ' - 'In this case, the value will be empty string.') - parser.add_argument('--out', - '-O', - type=str, - help='The output filename. ' - 'If omitted, then output to sys.stdout') - return parser - - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - args.scps = [args.scps] - - # logging info - logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - if args.verbose > 0: - logging.basicConfig(level=logging.INFO, format=logfmt) - else: - logging.basicConfig(level=logging.WARN, format=logfmt) - - inputs = {} - assert (len(args.input_scps) == 1) - for f in args.input_scps[0]: - arr = f.strip().split(':') - inputs[arr[0]] = arr[1] - assert ('feat' in inputs) - assert ('shape' in inputs) - - outputs = {} - assert (len(args.output_scps) == 1) - for f in args.output_scps[0]: - arr = f.strip().split(':') - outputs[arr[0]] = arr[1] - assert ('shape' in outputs) - assert ('text' in outputs) - assert ('token' in outputs) - assert ('tokenid' in outputs) - - files = [ - inputs['feat'], inputs['shape'], outputs['text'], outputs['token'], - outputs['tokenid'], outputs['shape'] - ] - fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape'] - fids = [open(f, 'r', encoding='utf-8') for f in files] - - if args.out is None: - out = sys.stdout - else: - out = open(args.out, 'w', encoding='utf-8') - done = False - while not done: - for i, fid in enumerate(fids): - line = fid.readline() - if line == '': - done = True - break - arr = line.strip().split() - content = ' '.join(arr[1:]) - if i == 0: - out.write('utt:{}'.format(arr[0])) - out.write('\t') - out.write('{}:{}'.format(fields[i], content)) - out.write('\n') - - for f in fids: - f.close() - if args.out is not None: - out.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/onnx2horizonbin.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/onnx2horizonbin.py deleted file mode 100644 index 
a94b647fb19d1446d4bc506c399c85677dddde9f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/onnx2horizonbin.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. specific decoding method: ctc_greedy_search -""" - -import argparse -import copy -import logging -import os -import sys -import random -import torch -import yaml -import numpy as np - -from torch.utils.data import DataLoader - -from wenet.utils.common import remove_duplicates_and_blank -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import to_numpy -from wenet.bin.export_onnx_bpu import export_encoder, export_ctc - - -try: - import hbdk # noqa: F401 - import horizon_nn # noqa: F401 - from horizon_tc_ui import HB_ONNXRuntime -except ImportError: - print('Please install hbdk,horizon_nn,horizon_tc_ui !') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def save_data(tensor, dirs, prefix): - if tensor.requires_grad: - data = tensor.detach().numpy().astype(np.float32) - else: - data = tensor.numpy().astype(np.float32) - os.makedirs(dirs, exist_ok=True) - data.tofile(dirs + "/" + prefix + ".bin") - - -def make_calibration_data(enc, args, conf): - conf['shuffle'] = True - logger.info(conf) - dataset = Dataset( - "shard", args.cali_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - cal_data_dir = os.path.join(args.output_dir, 
'cal_data_dir') - for batch_idx, batch in enumerate(dataloader): - if batch_idx >= args.max_samples: - break - if batch_idx % 100 == 0: - logger.info("processed {} samples.".format(batch_idx)) - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - - # Feed forward overlap input step by step - random_high = (num_frames - context) // stride - num_rand = random.randint(0, random_high) - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - if i == num_rand: - save_data(chunk, "{}/chunk".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_cache, "{}/att_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(cnn_cache, "{}/cnn_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_mask, "{}/att_mask".format(cal_data_dir), - prefix + "." + str(i)) - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - # NOTE(xcsong): It's fast to calibrate ctc.onnx, - # so it's okay to save all chunks - save_data(y, "{}/hidden".format(cal_data_dir), - prefix + "." 
+ str(i)) - - -def check_wer(enc, ctc, args, conf): - conf['shuffle'] = False - dataset = Dataset( - "shard", args.wer_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - char_dict = {v: k for k, v in args.symbol_table.items()} - eos = len(char_dict) - 1 - - enc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_encoder/encoder_quantized_model.onnx") - ctc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_ctc/ctc_quantized_model.onnx") - torch_file = open(args.output_dir + "/torch_text", 'w', encoding="utf-8") - onnx_file = open(args.output_dir + "/onnx_text", 'w', encoding="utf-8") - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - for batch_idx, batch in enumerate(dataloader): - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - - # Feed forward overlap input step by step - torch_out, onnx_out = [], [] - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - # Torch model - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - torch_out.append(ctc.forward(y).transpose(1, 3).squeeze(2)) - # Quantized onnx model - ort_inputs = { - 'chunk': to_numpy(chunk), 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': to_numpy(att_mask)} - ort_outs = enc_session.run_feature( - enc_session.output_names, ort_inputs, input_offset=0) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_y = ctc_session.run_feature( - ctc_session.output_names, {'hidden': ort_outs[0]}, input_offset=0) - onnx_out.append(torch.from_numpy( - np.squeeze(onnx_y[0].transpose(0, 3, 2, 1), axis=2))) - - def post_process(list_out, file_obj, keys): - probs = torch.cat(list_out, dim=1) - maxlen = probs.size(1) - topk_prob, topk_index = probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - hyps = 
[hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - for i, key in enumerate(keys): - content = '' - for w in hyps[i]: - if w == eos: - break - content += char_dict[w] - file_obj.write('{} {}\n'.format(key, content)) - return key, content - - if len(torch_out) > 0 and len(onnx_out) > 0: - key, content = post_process(torch_out, torch_file, keys) - logger.info('torch: {} {}'.format(key, content)) - key, content = post_process(onnx_out, onnx_file, keys) - logger.info('onnx : {} {}'.format(key, content)) - torch_file.close() - onnx_file.close() - - -def generate_config(enc_session, ctc_session, args): - template = """ -# 模型参数组 -model_parameters: - # 原始Onnx浮点模型文件 - onnx_model: '{}' - # 转换的目标AI芯片架构 - march: 'bernoulli2' - # 模型转换输出的用于上板执行的模型文件的名称前缀 - output_model_file_prefix: '{}' - # 模型转换输出的结果的存放目录 - working_dir: '{}' - # 指定转换后混合异构模型是否保留输出各层的中间结果的能力 - layer_out_dump: False - # 转换过程中日志生成级别 - log_level: 'debug' -# 输入信息参数组 -input_parameters: - # 原始浮点模型的输入节点名称 - input_name: '{}' - # 原始浮点模型的输入数据格式(数量/顺序与input_name一致) - input_type_train: '{}' - # 原始浮点模型的输入数据排布(数量/顺序与input_name一致) - input_layout_train: '{}' - # 原始浮点模型的输入数据尺寸 - input_shape: '{}' - # 网络实际执行时,输入给网络的batch_size 默认值为1 - # input_batch: 1 - # 在模型中添加的输入数据预处理方法 - norm_type: '{}' - # 预处理方法的图像减去的均值; 如果是通道均值,value之间必须用空格分隔 - # mean_value: '' - # 预处理方法的图像缩放比例,如果是通道缩放比例,value之间必须用空格分隔 - # scale_value: '' - # 转换后混合异构模型需要适配的输入数据格式(数量/顺序与input_name一致) - input_type_rt: '{}' - # 输入数据格式的特殊制式 - input_space_and_range: '' - # 转换后混合异构模型需要适配的输入数据排布(数量/顺序与input_name一致) - input_layout_rt: '{}' -# 校准参数组 -calibration_parameters: - # 模型校准使用的标定样本的存放目录 - cal_data_dir: '{}' - # 开启图片校准样本自动处理(skimage read resize到输入节点尺寸) - preprocess_on: False - # 校准使用的算法类型 - calibration_type: '{}' - # max 校准方式的参数 - max_percentile: 1.0 - # 强制指定OP在CPU上运行 - run_on_cpu: '{}' - # 强制指定OP在BPU上运行 - run_on_bpu: '{}' -# 编译参数组 -compiler_parameters: - # 编译策略选择 - compile_mode: 'latency' - # 是否打开编译的debug信息 - debug: False - # 模型运行核心数 - core_num: 1 - # 模型编译的优化等级选择 - optimize_level: 'O3' -""" - output_dir = os.path.realpath(args.output_dir) - cal_data_dir = os.path.join(output_dir, 'cal_data_dir') - os.makedirs(cal_data_dir, exist_ok=True) - enc_dic = enc_session.get_modelmeta().custom_metadata_map - enc_onnx_path = os.path.join(output_dir, 'encoder.onnx') - enc_log_path = os.path.join(output_dir, 'hb_makertbin_output_encoder') - enc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in enc_dic['input_name'].split(';')]) - ctc_dic = ctc_session.get_modelmeta().custom_metadata_map - ctc_onnx_path = os.path.join(output_dir, 'ctc.onnx') - ctc_log_path = os.path.join(output_dir, 'hb_makertbin_output_ctc') - ctc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in ctc_dic['input_name'].split(';')]) - enc_config = template.format( - enc_onnx_path, "encoder", enc_log_path, - enc_dic['input_name'], enc_dic['input_type'], - enc_dic['input_layout_train'], enc_dic['input_shape'], - enc_dic['norm_type'], enc_dic['input_type'], enc_dic['input_layout_rt'], - enc_cal_data, args.calibration_type, args.extra_ops_run_on_cpu, "") - ctc_config = template.format( - ctc_onnx_path, "ctc", ctc_log_path, - ctc_dic['input_name'], ctc_dic['input_type'], - ctc_dic['input_layout_train'], ctc_dic['input_shape'], - ctc_dic['norm_type'], ctc_dic['input_type'], ctc_dic['input_layout_rt'], - ctc_cal_data, "default", "", "") - with open(output_dir + "/config_encoder.yaml", "w") as enc_yaml: - enc_yaml.write(enc_config) - with open(output_dir + 
"/config_ctc.yaml", "w") as ctc_yaml: - ctc_yaml.write(ctc_config) - - -def get_args(): - parser = argparse.ArgumentParser(description='convert onnx to horizon .bin') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - parser.add_argument('--dict', type=str, required=True, help='dict file') - parser.add_argument('--max_samples', type=int, required=True, - help='maximum samples') - parser.add_argument('--cali_datalist', type=str, default=None, - help='make calibration data') - parser.add_argument('--wer_datalist', type=str, default=None, - help='check wer') - parser.add_argument('--wer_text', type=str, default=None, - help='check wer') - parser.add_argument('--bpe_model', default=None, type=str, - help='bpe model for english part') - parser.add_argument('--ln_run_on_bpu', action='store_true', - help='layernorm running on bpu') - parser.add_argument('--extra_ops_run_on_cpu', type=str, default=None, - help='extra operations running on cpu.') - parser.add_argument('--calibration_type', type=str, default='default', - help='kl / max / default.') - return parser - - -if __name__ == '__main__': - random.seed(777) - parser = get_args() - args = parser.parse_args() - # NOTE(xcsong): X3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(conf) - load_checkpoint(model, args.checkpoint) - model.eval() - - symbol_table = read_symbol_table(args.dict) - args.symbol_table = symbol_table - args.feature_size = conf['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - logger.info("Stage-1: Export onnx") - enc, enc_session = export_encoder(model, args) - ctc, ctc_session = export_ctc(model, args) - - conf = copy.deepcopy(conf['dataset_conf']) - conf['filter_conf']['max_length'] = 102400 - conf['filter_conf']['min_length'] = 0 - conf['filter_conf']['token_max_length'] = 102400 - conf['filter_conf']['token_min_length'] = 0 - conf['filter_conf']['max_output_input_ratio'] = 102400 - conf['filter_conf']['min_output_input_ratio'] = 0 - conf['speed_perturb'] = False - conf['spec_aug'] = False - conf['spec_sub'] = False - conf['spec_trim'] = False - conf['shuffle'] = False - conf['sort'] = False - if 'fbank_conf' in conf: - conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in conf: - conf['mfcc_conf']['dither'] = 0.0 - conf['batch_conf']['batch_type'] = "static" - conf['batch_conf']['batch_size'] = 1 - - if args.cali_datalist is not None: - logger.info("Stage-2: Generate config") - # FIXME(xcsong): Remove hard code - logger.info("torch version: {}".format(torch.__version__)) - if int(torch.__version__[:4].split('.')[1]) >= 13: - args.extra_ops_run_on_cpu = "/Split;" + \ - "/encoders.0/self_attn/Split;/encoders.1/self_attn/Split;" + \ - 
"/encoders.2/self_attn/Split;/encoders.3/self_attn/Split;" + \ - "/encoders.4/self_attn/Split;/encoders.5/self_attn/Split;" + \ - "/encoders.6/self_attn/Split;/encoders.7/self_attn/Split;" + \ - "/encoders.8/self_attn/Split;/encoders.9/self_attn/Split;" + \ - "/encoders.10/self_attn/Split;/encoders.11/self_attn/Split;" + \ - "/encoders.0/self_attn/Mul;/encoders.1/self_attn/Mul;" + \ - "/encoders.2/self_attn/Mul;/encoders.3/self_attn/Mul;" + \ - "/encoders.4/self_attn/Mul;/encoders.5/self_attn/Mul;" + \ - "/encoders.6/self_attn/Mul;/encoders.7/self_attn/Mul;" + \ - "/encoders.8/self_attn/Mul;/encoders.9/self_attn/Mul;" + \ - "/encoders.10/self_attn/Mul;/encoders.11/self_attn/Mul;" - else: - args.extra_ops_run_on_cpu = "Split_17;Split_67;Split_209;" + \ - "Split_351;Split_493;Split_635;Split_777;Split_919;Split_1061;" + \ - "Split_1203;Split_1345;Split_1487;Split_1629;" + \ - "Mul_72;Mul_214;Mul_356;Mul_498;Mul_640;Mul_782;" + \ - "Mul_924;Mul_1066;Mul_1208;Mul_1350;Mul_1492;Mul_1634;" - generate_config(enc_session, ctc_session, args) - - logger.info("Stage-3: Make calibration data") - make_calibration_data(enc, args, conf) - - output_dir = os.path.realpath(args.output_dir) - logger.info("Stage-4: Make ctc.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_ctc".format(output_dir) + - " && cd hb_makertbin_log_ctc &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_ctc.yaml") - ) - logger.info("Stage-5: Make encoder.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_encoder ".format(output_dir) + - " && cd hb_makertbin_log_encoder &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_encoder.yaml") - ) - - if args.wer_datalist is not None: - logger.info("Stage-6: Check wer between torch model and quantized onnx") - assert args.wer_text is not None - check_wer(enc, ctc, args, conf) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/torch_text", - args.output_dir + "/torch_wer") - ) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/onnx_text", - args.output_dir + "/onnx_wer") - ) - os.system("tail {} {}".format( - args.output_dir + "/torch_wer", args.output_dir + "/onnx_wer")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/parse_options.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/parse_options.sh deleted file mode 100644 index 34476fdb37a4b14d5fe6e0edbebe97e760d2be5a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- - -# Parse command-line options. -# To be sourced by another script (as in ". parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/perturb_data_dir_speed.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/perturb_data_dir_speed.sh deleted file mode 100644 index 901a4882e6481ae269067b0fe7175dba62c4db9e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/perturb_data_dir_speed.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# 2020 @kamo-naoyuki -# This file was copied from Kaldi and -# I deleted parts related to wav duration -# because we shouldn't use kaldi's command here -# and we don't need the files actually. 
- -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 2014 Tom Ko -# 2018 Emotech LTD (author: Pawel Swietojanski) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# wav.scp -# spk2utt -# utt2spk -# text -# -# It generates the files which are used for perturbing the speed of the original data. - -export LC_ALL=C -set -euo pipefail - -if [[ $# != 3 ]]; then - echo "Usage: perturb_data_dir_speed.sh <warping-factor> <srcdir> <destdir>" - echo "e.g.:" - echo " $0 0.9 data/train_si284 data/train_si284p" - exit 1 -fi - -factor=$1 -srcdir=$2 -destdir=$3 -label="sp" -spk_prefix="${label}${factor}-" -utt_prefix="${label}${factor}-" - -#check is sox on the path - -! command -v sox &>/dev/null && echo "sox: command not found" && exit 1; - -if [[ ! -f ${srcdir}/utt2spk ]]; then - echo "$0: no such file ${srcdir}/utt2spk" - exit 1; -fi - -if [[ ${destdir} == "${srcdir}" ]]; then - echo "$0: this script requires <srcdir> and <destdir> to be different." - exit 1 -fi - -mkdir -p "${destdir}" - -<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map" -<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map" -<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map" -if [[ ! -f ${srcdir}/utt2uniq ]]; then - <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq" -else - <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq" -fi - - -<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \ - utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk - -utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt - -if [[ -f ${srcdir}/segments ]]; then - - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \ - utils/apply_map.pl -f 2 "${destdir}"/reco_map | \ - awk -v factor="${factor}" \ - '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \ - >"${destdir}"/segments - - utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - if [[ -f ${srcdir}/reco2file_and_channel ]]; then - utils/apply_map.pl -f 1 "${destdir}"/reco_map \ - <"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel - fi - -else # no segments->wav indexed by utterance. 
- if [[ -f ${srcdir}/wav.scp ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - fi -fi - -if [[ -f ${srcdir}/text ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text -fi -if [[ -f ${srcdir}/spk2gender ]]; then - utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender -fi -if [[ -f ${srcdir}/utt2lang ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang -fi - -rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null -echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}" - -utils/validate_data_dir.sh --no-feats --no-text "${destdir}" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/reduce_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/reduce_data_dir.sh deleted file mode 100644 index 16194dcc7309a646041181a698c53cd4f46e618b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/reduce_data_dir.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# koried, 10/29/2012 - -# Reduce a data set based on a list of turn-ids - -help_message="usage: $0 srcdir turnlist destdir" - -if [ $1 == "--help" ]; then - echo "${help_message}" - exit 0; -fi - -if [ $# != 3 ]; then - echo "${help_message}" - exit 1; -fi - -srcdir=$1 -reclist=$2 -destdir=$3 - -if [ ! -f ${srcdir}/utt2spk ]; then -echo "$0: no such file $srcdir/utt2spk" -exit 1; -fi - -function do_filtering { -# assumes the utt2spk and spk2utt files already exist. - [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp - [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text - [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames - [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender - [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp - if [ -f ${srcdir}/segments ]; then - utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments - awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. 
- [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/reco2file_and_channel ] && \ - utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel - - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm - rm ${destdir}/reco - fi - srcutts=$(wc -l < ${srcdir}/utt2spk) - destutts=$(wc -l < ${destdir}/utt2spk) - echo "Reduced #utt from $srcutts to $destutts" -} - -mkdir -p ${destdir} - -# filter the utt2spk based on the set of recordings -utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk - -utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt -do_filtering; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/remove_longshortdata.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/remove_longshortdata.py deleted file mode 100644 index 7e92f8a424d2d717acf6fc1db5503f79ba38a898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/remove_longshortdata.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='remove too long or too short data in format.data') - parser.add_argument('--data_file', - type=str, - help='input format data') - parser.add_argument('--output_data_file', - type=str, - help='output format data') - parser.add_argument('--min_input_len', type=float, - default=0, - help='minimum input seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--max_input_len', type=float, - default=20, - help='maximum output seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--min_output_len', type=float, - default=0, help='minimum input seq length, in modeling units') - parser.add_argument('--max_output_len', type=float, - default=500, - help='maximum output seq length, in modeling units') - parser.add_argument('--min_output_input_ratio', type=float, default=0.05, - help='minimum output seq length/output seq length ratio') - parser.add_argument('--max_output_input_ratio', type=float, default=10, - help='maximum output seq length/output seq length ratio') - args = parser.parse_args() - - data_file = args.data_file - output_data_file = args.output_data_file - min_input_len = args.min_input_len - max_input_len = args.max_input_len - min_output_len = args.min_output_len - max_output_len = args.max_output_len - min_output_input_ratio = args.min_output_input_ratio - max_output_input_ratio = args.max_output_input_ratio - - with open(data_file, 'r') as f, open(output_data_file, 'w') as fout: - for l in f: - l = l.strip() - if l: - items = l.strip().split('\t') - token_shape = items[6] - feature_shape = items[2] - feat_len = float(feature_shape.split(':')[1].split(',')[0]) - token_len = float(token_shape.split(':')[1].split(',')[0]) - condition = [feat_len > min_input_len, - feat_len < max_input_len, - token_len > min_output_len, - token_len < max_output_len, - token_len / feat_len > min_output_input_ratio, - token_len / feat_len < max_output_input_ratio, - ] - if all(condition): - fout.write('{}\n'.format(l)) - continue diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/segment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/segment.py deleted file mode 100644 index a1a7f93a05fbaf42ca09c26c0e5be6a7185f0d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/segment.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2021 Mobvoi Inc. (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='generate segmented wav.scp') - parser.add_argument('--segments', required=True, help='segments file') - parser.add_argument('--input', - required=True, - help='origin wav.scp that not segmented') - parser.add_argument('--output', - required=True, - help='output segmented wav.scp') - wav_dic = {} - args = parser.parse_args() - ori_wav = args.input - segment_file = args.segments - wav_scp = args.output - with open(ori_wav, 'r') as ori: - for l in ori: - item = l.strip().split() - wav_dic[item[0]] = item[1] - with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement: - for l in sgement: - item = l.strip().split() - if item[1] in wav_dic: - item[1] = wav_dic[item[1]] - f.write("{} {},{},{}\n".format(item[0], item[1], item[2], item[3])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/setup_anaconda.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/setup_anaconda.sh deleted file mode 100644 index f53ace9cc4c19994fc79d01e85d70f49d40d673f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/setup_anaconda.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env bash -# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet) -set -euo pipefail - -if [ -z "${PS1:-}" ]; then - PS1=__dummy__ -fi -CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - -if [ $# -gt 4 ]; then - echo "Usage: $0 [output] [conda-env-name] [python-version>]" - exit 1; -elif [ $# -eq 3 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="$3" -elif [ $# -eq 2 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="" -elif [ $# -eq 1 ]; then - output_dir="$1" - name="" - PYTHON_VERSION="" -elif [ $# -eq 0 ]; then - output_dir=venv - name="" - PYTHON_VERSION="" -fi - -if [ -e activate_python.sh ]; then - echo "Warning: activate_python.sh already exists. It will be overwritten" -fi - -if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then - if [ ! -e miniconda.sh ]; then - wget --tries=3 "${CONDA_URL}" -O miniconda.sh - fi - - bash miniconda.sh -b -p "${output_dir}" -fi - -# shellcheck disable=SC1090 -source "${output_dir}/etc/profile.d/conda.sh" -conda deactivate - -# If the env already exists, skip recreation -if [ -n "${name}" ] && ! 
conda activate ${name}; then - conda create -yn "${name}" -fi -conda activate ${name} - -if [ -n "${PYTHON_VERSION}" ]; then - conda install -y conda "python=${PYTHON_VERSION}" -else - conda install -y conda -fi - -conda install -y pip setuptools - -cat << EOF > activate_python.sh -#!/usr/bin/env bash -# THIS FILE IS GENERATED BY tools/setup_anaconda.sh -if [ -z "\${PS1:-}" ]; then - PS1=__dummy__ -fi -. $(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name} -EOF diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/sph2wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/sph2wav.sh deleted file mode 100644 index a8f0749e3be2ee69b5831da6699c303510ecbed4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/sph2wav.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -# convert sph scp to segmented wav scp -nj=1 -. tools/parse_options.sh || exit 1; - -inscp=$1 -segments=$2 -outscp=$3 -data=$(dirname ${inscp}) -if [ $# -eq 4 ]; then - logdir=$4 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe -[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; -sox=`which sox` -[ ! 
-x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2); - printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \ - sort > $data/wav_ori.scp || exit 1; - -tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp -sed -i 's/ /,/g' $data/wav_segments.scp -sed -i 's/#/ /g' $data/wav_segments.scp - -rm -f $logdir/wav_*.slice -rm -f $logdir/*.log -split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - mkdir -p ${data}/wavs/${name} - cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \ - -v logdir=$logdir -v name=$name '{ - during=$4-$3 - cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during; - system(cmd) - printf("%s %s/%s.wav\n", $1, data, $1); - }' | \ - sort > ${data}/wavs_${name}.scp || exit 1; -} & -done -wait -cat ${data}/wavs_*.scp > $outscp -rm ${data}/wavs_*.scp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/spk2utt_to_utt2spk.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/spk2utt_to_utt2spk.pl deleted file mode 100644 index 19fb89d501146e360912863d847d6eabb0194511..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/spm_decode b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/spm_decode deleted file mode 100644 index 882b4f966013d7708460f8d41696583ae59f8fa9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/spm_decode +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for decoding") - parser.add_argument("--input", default=None, help="input file to decode") - parser.add_argument("--input_format", choices=["piece", "id"], default="piece") - args = parser.parse_args() - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.input_format == "piece": - def decode(l): - return "".join(sp.DecodePieces(l)) - elif args.input_format == "id": - def decode(l): - return "".join(sp.DecodeIds(l)) - else: - raise NotImplementedError - - def tok2int(tok): - # remap reference-side (represented as <>) to 0 - return int(tok) if tok != "<>" else 0 - - if args.input is None: - h = sys.stdin - else: - h = open(args.input, "r", encoding="utf-8") - for line in h: - print(decode(line.split())) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/spm_encode b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/spm_encode deleted file mode 100644 index 4dd2e1004f9fe393c2d34b43bade881b84a31b1f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/spm_encode +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import contextlib -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for encoding") - parser.add_argument("--inputs", nargs="+", default=['-'], - help="input files to filter/encode") - parser.add_argument("--outputs", nargs="+", default=['-'], - help="path to save encoded outputs") - parser.add_argument("--output_format", choices=["piece", "id"], default="piece") - parser.add_argument("--min-len", type=int, metavar="N", - help="filter sentence pairs with fewer than N tokens") - parser.add_argument("--max-len", type=int, metavar="N", - help="filter sentence pairs with more than N tokens") - args = parser.parse_args() - - assert len(args.inputs) == len(args.outputs), \ - "number of input and output paths should match" - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.output_format == "piece": - def encode(l): - return sp.EncodeAsPieces(l) - elif args.output_format == "id": - def encode(l): - return list(map(str, sp.EncodeAsIds(l))) - else: - raise NotImplementedError - - if args.min_len is not None or args.max_len is not None: - def valid(line): - return ( - (args.min_len is None or len(line) >= args.min_len) and - (args.max_len is None or len(line) <= args.max_len) - ) - else: - def valid(lines): - return True - - with contextlib.ExitStack() as stack: - inputs = [ - stack.enter_context(open(input, "r", encoding="utf-8")) - if input != "-" else sys.stdin - for input in args.inputs - ] - outputs = [ - 
stack.enter_context(open(output, "w", encoding="utf-8")) - if output != "-" else sys.stdout - for output in args.outputs - ] - - stats = { - "num_empty": 0, - "num_filtered": 0, - } - - def encode_line(line): - line = line.strip() - if len(line) > 0: - line = encode(line) - if valid(line): - return line - else: - stats["num_filtered"] += 1 - else: - stats["num_empty"] += 1 - return None - - for i, lines in enumerate(zip(*inputs), start=1): - enc_lines = list(map(encode_line, lines)) - if not any(enc_line is None for enc_line in enc_lines): - for enc_line, output_h in zip(enc_lines, outputs): - print(" ".join(enc_line), file=output_h) - if i % 10000 == 0: - print("processed {} lines".format(i), file=sys.stderr) - - print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) - print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/spm_train b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/spm_train deleted file mode 100644 index 0b247aee0dc5fcaa7b6cf66d89602e896619c9bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/spm_train +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE -import sys - -import sentencepiece as spm - - -if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/subset_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/subset_data_dir.sh deleted file mode 100644 index c35bee62d8710facb8c42a9171ed3caf0171450f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/subset_data_dir.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2010-2011 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - - -# This script operates on a data directory, such as in data/train/. -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data -# for what these directories contain. - -# This script creates a subset of that data, consisting of some specified -# number of utterances. (The selected utterances are distributed evenly -# throughout the file, by the program ./subset_scp.pl). - -# There are six options, none compatible with any other. - -# If you give the --per-spk option, it will attempt to select the supplied -# number of utterances for each speaker (typically you would supply a much -# smaller number in this case). - -# If you give the --speakers option, it selects a subset of n randomly -# selected speakers. - -# If you give the --shortest option, it will give you the n shortest utterances. - -# If you give the --first option, it will just give you the n first utterances. - -# If you give the --last option, it will just give you the n last utterances. - -# If you give the --spk-list or --utt-list option, it reads the -# speakers/utterances to keep from /" (note, -# in this case there is no positional parameter; see usage message.) 
- - -shortest=false -perspk=false -speakers=false -first_opt= -spk_list= -utt_list= - -expect_args=3 -case $1 in - --first|--last) first_opt=$1; shift ;; - --per-spk) perspk=true; shift ;; - --shortest) shortest=true; shift ;; - --speakers) speakers=true; shift ;; - --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; - --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; - --*) echo "$0: invalid option '$1'"; exit 1 -esac - -if [ $# != $expect_args ]; then - echo "Usage:" - echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] " - echo " subset_data_dir.sh [--spk-list ] " - echo " subset_data_dir.sh [--utt-list ] " - echo "By default, randomly selects utterances from the data directory." - echo "With --speakers, randomly selects enough speakers that we have utterances" - echo "With --per-spk, selects utterances per speaker, if available." - echo "With --first, selects the first utterances" - echo "With --last, selects the last utterances" - echo "With --shortest, selects the shortest utterances." - echo "With --spk-list, reads the speakers to keep from " - echo "With --utt-list, reads the utterances to keep from " - exit 1; -fi - -srcdir=$1 -if [[ $spk_list || $utt_list ]]; then - numutt= - destdir=$2 -else - numutt=$2 - destdir=$3 -fi - -export LC_ALL=C - -if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" - exit 1 -fi - -if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then - echo "$0: cannot subset to more utterances than you originally had." - exit 1 -fi - -if $shortest && [ ! -f $srcdir/feats.scp ]; then - echo "$0: you selected --shortest but no feats.scp exist." - exit 1 -fi - -mkdir -p $destdir || exit 1 - -if [[ $spk_list ]]; then - tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; -elif [[ $utt_list ]]; then - tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; -elif $speakers; then - tools/shuffle_list.pl < $srcdir/spk2utt | - awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | - sort > $destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -elif $perspk; then - awk '{ n='$numutt'; printf("%s ",$1); - skip=1; while(n*(skip+1) <= NF-1) { skip++; } - for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); } - printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -else - if $shortest; then - # Select $numutt shortest utterances. - . ./path.sh - feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; - sort -n -k2 $destdir/tmp.len | - awk '{print $1}' | - head -$numutt >$destdir/tmp.uttlist - tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk - rm $destdir/tmp.uttlist $destdir/tmp.len - else - # Select $numutt random utterances. - tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; - fi - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt -fi - -# Perform filtering. utt2spk and spk2utt files already exist by this point. -# Filter by utterance. 
-[ -f $srcdir/feats.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp -[ -f $srcdir/vad.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp -[ -f $srcdir/utt2lang ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang -[ -f $srcdir/utt2dur ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur -[ -f $srcdir/utt2num_frames ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames -[ -f $srcdir/utt2uniq ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq -[ -f $srcdir/wav.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp -[ -f $srcdir/utt2warp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp -[ -f $srcdir/text ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - -# Filter by speaker. -[ -f $srcdir/spk2warp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp -[ -f $srcdir/spk2gender ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender -[ -f $srcdir/cmvn.scp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - -# Filter by recording-id. -if [ -f $srcdir/segments ]; then - tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - # Recording-ids are in segments. - awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco - # The next line overrides the command above for wav.scp, which would be incorrect. - #[ -f $srcdir/wav.scp ] && - # tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp -else - # No segments; recording-ids are in wav.scp. - awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco -fi - -[ -f $srcdir/reco2file_and_channel ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel -[ -f $srcdir/reco2dur ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur - -# Filter the STM file for proper sclite scoring. -# Copy over the comments from STM file. -[ -f $srcdir/stm ] && - (grep "^;;" $srcdir/stm - tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm - -rm $destdir/reco - -# Copy frame_shift if present. -[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir - -srcutts=$(wc -l <$srcdir/utt2spk) -destutts=$(wc -l <$destdir/utt2spk) -echo "$0: reducing #utt from $srcutts to $destutts" -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/subset_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/subset_scp.pl deleted file mode 100644 index 11fddc09a0f4e5fad8e5d63cf65e7e5e627e4af6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/subset_scp.pl +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This program selects a subset of N elements in the scp. - -# By default, it selects them evenly from throughout the scp, in order to avoid -# selecting too many from the same speaker. It prints them on the standard -# output. -# With the option --first, it just selects the N first utterances. -# With the option --last, it just selects the N last utterances. - -# Last modified by JHU & HKUST @2013 - - -$quiet = 0; -$first = 0; -$last = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--quiet") { - shift; - $quiet = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--first") { - shift; - $first = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--last") { - shift; - $last = 1; -} - -if(@ARGV < 2 ) { - die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . - " --quiet causes it to not die if N < num lines in scp.\n" . - " --first and --last make it equivalent to head or tail.\n" . - "See also: filter_scp.pl\n"; -} - -$N = shift @ARGV; -if($N == 0) { - die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; -} -$inscp = shift @ARGV; -open(I, "<$inscp") || die "Opening input scp file $inscp"; - -@F = (); -while() { - push @F, $_; -} -$numlines = @F; -if($N > $numlines) { - if ($quiet) { - $N = $numlines; - } else { - die "You requested from subset_scp.pl more elements than available: $N > $numlines"; - } -} - -sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if ($num_needed > $diff) { - die "select_n: code error"; - } - if ($diff == 1 ) { - if ($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); - } -} - -if ( ! $first && ! $last) { - if ($N > 0) { - select_n(0, $numlines, $N); - } -} else { - if ($first) { # --first option: same as head. - for ($n = 0; $n < $N; $n++) { - print $F[$n]; - } - } else { # --last option: same as tail. - for ($n = @F - $N; $n < @F; $n++) { - print $F[$n]; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/sym2int.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/sym2int.pl deleted file mode 100644 index cec097b6bdaefb5c3452e31fa334f0a7530b9a72..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/sym2int.pl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; - -for($x = 0; $x < 2; $x++) { - if ($ARGV[0] eq "--map-oov") { - shift @ARGV; - $map_oov = shift @ARGV; - if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { - # disallow '-f', the empty string and anything ending in words.txt as the - # OOV symbol because these are likely command-line errors. - die "the --map-oov option requires an argument"; - } - } - if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } - } -} - -$symtab = shift @ARGV; -if (!defined $symtab) { - print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . - "options: [--map-oov ] [-f ]\n" . - "note: can look like 4-5, or 4-, or 5-, or 1.\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up - if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } - $map_oov = $sym2int{$map_oov}; -} - -$num_warning = 0; -$max_warning = 20; - -while (<>) { - @A = split(" ", $_); - @B = (); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ( (!defined $field_begin || $n >= $field_begin) - && (!defined $field_end || $n <= $field_end)) { - $i = $sym2int{$a}; - if (!defined ($i)) { - if (defined $map_oov) { - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $map_oov; - } else { - $pos = $n+1; - die "sym2int.pl: undefined symbol $a (in position $pos)\n"; - } - } - $a = $i; - } - push @B, $a; - } - print join(" ", @B); - print "\n"; -} -if ($num_warning > 0) { - print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; -} - -exit(0); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/text2token.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/text2token.py deleted file mode 100644 index 4f4dcc901d436650695f0b80e0cf99e1e99269ee..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/text2token.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan) -# Copyright 2021 Mobvoi Inc. All Rights Reserved. 
(Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -import re -import sys - -is_python2 = sys.version_info[0] == 2 - - -def exist_or_not(i, match_pos): - start_pos = None - end_pos = None - for pos in match_pos: - if pos[0] <= i < pos[1]: - start_pos = pos[0] - end_pos = pos[1] - break - - return start_pos, end_pos - -def seg_char(sent): - pattern = re.compile(r'([\u4e00-\u9fa5])') - chars = pattern.split(sent) - chars = [w for w in chars if len(w.strip()) > 0] - return chars - -def get_parser(): - parser = argparse.ArgumentParser( - description='convert raw text to tokenized text', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--nchar', - '-n', - default=1, - type=int, - help='number of characters to split, i.e., \ - aabb -> a a b b with -n 1 and aa bb with -n 2') - parser.add_argument('--skip-ncols', - '-s', - default=0, - type=int, - help='skip first n columns') - parser.add_argument('--space', - default='', - type=str, - help='space symbol') - parser.add_argument('--bpe-model', - '-m', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--non-lang-syms', - '-l', - default=None, - type=str, - help='list of non-linguistic symobles,' - ' e.g., etc.') - parser.add_argument('text', - type=str, - default=False, - nargs='?', - help='input text') - parser.add_argument('--trans_type', - '-t', - type=str, - default="char", - choices=["char", "phn", "cn_char_en_bpe"], - help="""Transcript type. char/phn. e.g., for TIMIT - FADG0_SI1279 - - If trans_type is char, read from - SI1279.WRD file -> "bricks are an alternative" - Else if trans_type is phn, - read from SI1279.PHN file -> - "sil b r ih sil k s aa r er n aa l - sil t er n ih sil t ih v sil" """) - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - rs = [] - if args.non_lang_syms is not None: - with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f: - nls = [x.rstrip() for x in f.readlines()] - rs = [re.compile(re.escape(x)) for x in nls] - - if args.bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(args.bpe_model) - - if args.text: - f = codecs.open(args.text, encoding="utf-8") - else: - f = codecs.getreader("utf-8")( - sys.stdin if is_python2 else sys.stdin.buffer) - - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer) - line = f.readline() - n = args.nchar - while line: - x = line.split() - print(' '.join(x[:args.skip_ncols]), end=" ") - a = ' '.join(x[args.skip_ncols:]) - - # get all matched positions - match_pos = [] - for r in rs: - i = 0 - while i >= 0: - m = r.search(a, i) - if m: - match_pos.append([m.start(), m.end()]) - i = m.end() - else: - break - - if len(match_pos) > 0: - chars = [] - i = 0 - while i < len(a): - start_pos, end_pos = exist_or_not(i, match_pos) - if start_pos is not None: - chars.append(a[start_pos:end_pos]) - i = end_pos - else: - chars.append(a[i]) - i += 1 - a = chars - - if (args.trans_type == "phn"): - a = a.split(" ") - elif args.trans_type == "cn_char_en_bpe": - b = seg_char(a) - a = [] - for j in b: - # we use "▁" to instead of blanks among english words - # warning: here is "▁", not "_" - for l in j.strip().split("▁"): - if not l.encode('UTF-8').isalpha(): - a.append(l) - else: - for k in sp.encode_as_pieces(l): - a.append(k) - else: - a = [a[j:j + n] for j in range(0, 
len(a), n)] - - a_flat = [] - for z in a: - a_flat.append("".join(z)) - - a_chars = [z.replace(' ', args.space) for z in a_flat] - if (args.trans_type == "phn"): - a_chars = [z.replace("sil", args.space) for z in a_chars] - print(' '.join(a_chars)) - line = f.readline() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/utt2spk_to_spk2utt.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/utt2spk_to_spk2utt.pl deleted file mode 100644 index 5086699ff85fdcb8667bb9ab054700c53e35fd0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. - -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/validate_data_dir.sh deleted file mode 100644 index f4b4cbe1410111555d56380078e3d55381e7155a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/validate_data_dir.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/bin/bash - -cmd="$@" - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." - echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! 
-d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -if [ -f $data/images.scp ]; then - cmd=${cmd/--no-wav/} # remove --no-wav if supplied - image/validate_data_dir.sh $cmd - exit $? -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1; - ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - tools/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." 
- exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! 
cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! 
cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/validate_dict_dir.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/validate_dict_dir.pl deleted file mode 100644 index 819fca7f03caff91f3f24f0b69876a0bfc0abbe9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/validate_dict_dir.pl +++ /dev/null @@ -1,531 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. -# Copyright 2012 Guoguo Chen -# 2015 Daniel Povey -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for 'dict' directories (e.g. data/local/dict) - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line($.) 
$raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n"; - if ($has_invalid_whitespaces) { - print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n"; - return 0; - } else { - print "--> text contains only allowed whitespaces\n"; - } - } else { - print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n"; - } - return 1; -} - - -if(@ARGV != 1) { - die "Usage: validate_dict_dir.pl \n" . - "e.g.: validate_dict_dir.pl data/local/dict\n"; -} - -$dict = shift @ARGV; -$dict =~ s:/$::; - -$exit = 0; -$success = 1; # this is re-set each time we read a file. - -sub set_to_fail { $exit = 1; $success = 0; } - -# Checking silence_phones.txt ------------------------------- -print "Checking $dict/silence_phones.txt ...\n"; -if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} -$idx = 1; -%silence = (); -$crlf = 1; - -print "--> reading $dict/silence_phones.txt\n"; -check_allowed_whitespace(\*S) || set_to_fail(); -while() { - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; - } - foreach(0 .. 
@col-1) { - my $p = $col[$_]; - if($silence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; - } else { - $silence{$p} = 1; - } - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. - if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(S); -$success == 0 || print "--> $dict/silence_phones.txt is OK\n"; -print "\n"; - -# Checking optional_silence.txt ------------------------------- -print "Checking $dict/optional_silence.txt ...\n"; -if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;} -if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;} -$idx = 1; -$success = 1; -$crlf = 1; -print "--> reading $dict/optional_silence.txt\n"; -check_allowed_whitespace(\*OS) or exit 1; -while() { - chomp; - my @col = split(" ", $_); - if ($idx > 1 or @col > 1) { - set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; - } elsif (!$silence{$col[0]}) { - set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - $idx ++; -} -close(OS); -$success == 0 || print "--> $dict/optional_silence.txt is OK\n"; -print "\n"; - -# Checking nonsilence_phones.txt ------------------------------- -print "Checking $dict/nonsilence_phones.txt ...\n"; -if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;} -$idx = 1; -%nonsilence = (); -$success = 1; -$crlf = 1; -print "--> reading $dict/nonsilence_phones.txt\n"; -check_allowed_whitespace(\*NS) or set_to_fail(); -while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($nonsilence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; - } else { - $nonsilence{$p} = 1; - } - # phones that start with the pound sign/hash may be mistaken for - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. 
- if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(NS); -$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n"; -print "\n"; - -# Checking disjoint ------------------------------- -sub intersect { - my ($a, $b) = @_; - @itset = (); - %itset = (); - foreach(keys %$a) { - if(exists $b->{$_} and !$itset{$_}) { - push(@itset, $_); - $itset{$_} = 1; - } - } - return @itset; -} - -print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n"; -@itset = intersect(\%silence, \%nonsilence); -if(@itset == 0) {print "--> disjoint property is OK.\n";} -else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";} -print "\n"; - - -sub check_lexicon { - my ($lex, $num_prob_cols, $num_skipped_cols) = @_; - print "Checking $lex\n"; - !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail(); - my %seen_line = {}; - $idx = 1; $success = 1; $crlf = 1; - print "--> reading $lex\n"; - check_allowed_whitespace(\*L) or set_to_fail(); - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $lex contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (defined $seen_line{$_}) { - print "--> ERROR: line '$_' of $lex is repeated\n"; - set_to_fail(); - } - $seen_line{$_} = 1; - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $lex does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - $word = shift @col; - if (!defined $word) { - print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail(); - } - if ($word eq "" || $word eq "" || $word eq "" || $word eq "#0") { - print "--> ERROR: lexicon.txt contains forbidden word $word\n"; - set_to_fail(); - } - for ($n = 0; $n < $num_prob_cols; $n++) { - $prob = shift @col; - if (!($prob > 0.0 && $prob <= 1.0)) { - print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n"; - set_to_fail(); - } - } - for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; } - if (@col == 0) { - print "--> ERROR: lexicon.txt contains word $word with empty "; - print "pronunciation.\n"; - set_to_fail(); - } - foreach (0 .. @col-1) { - if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt "; - print "(line $idx)\n"; - set_to_fail(); - } - } - $idx ++; - } - close(L); - $success == 0 || print "--> $lex is OK\n"; - print "\n"; -} - -if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); } -if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); } -if (-f "$dict/lexiconp_silprob.txt") { - # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also - # exist. 
- check_lexicon("$dict/lexiconp_silprob.txt", 2, 2); - if (-f "$dict/silprob.txt") { - !open(SP, "<$dict/silprob.txt") && - print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail(); - $crlf = 1; - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - chomp; my @col = split; - @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail(); - if ($col[0] eq "" || $col[0] eq "overall") { - if (!($col[1] > 0.0 && $col[1] <= 1.0)) { - set_to_fail(); - print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n"; - } - } elsif ($col[0] eq "_s" || $col[0] eq "_n") { - if ($col[1] <= 0.0) { - set_to_fail(); - print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n"; - } - } else { - print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n"; - set_to_fail(); - } - } - close(SP); - } else { - set_to_fail(); - print "--> ERROR: expecting $dict/silprob.txt to exist\n"; - } -} - -if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) { - print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n"; - set_to_fail(); -} - -sub check_lexicon_pair { - my ($lex1, $num_prob_cols1, $num_skipped_cols1, - $lex2, $num_prob_cols2, $num_skipped_cols2) = @_; - # We have checked individual lexicons already. - open(L1, "<$lex1"); open(L2, "<$lex2"); - print "Checking lexicon pair $lex1 and $lex2\n"; - my $line_num = 0; - while() { - $line_num++; - @A = split; - $line_B = ; - if (!defined $line_B) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); last; - } - @B = split(" ", $line_B); - # Check if the word matches. - if ($A[0] ne $B[0]) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - shift @A; shift @B; - for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; } - for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; } - # Check if the pronunciation matches - if (join(" ", @A) ne join(" ", @B)) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - } - $line_B = ; - if (defined $line_B && $exit == 0) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); - } - $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n"; -} - -# If more than one lexicon exist, we have to check if they correspond to each -# other. It could be that the user overwrote one and we need to regenerate the -# other, but we do not know which is which. -if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") { - check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0); -} -if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") { - check_lexicon_pair("$dict/lexiconp.txt", 1, 0, - "$dict/lexiconp_silprob.txt", 2, 2); -} - -# Checking extra_questions.txt ------------------------------- -%distinguished = (); # Keep track of all phone-pairs including nonsilence that - # are distinguished (split apart) by extra_questions.txt, - # as $distinguished{$p1,$p2} = 1. This will be used to - # make sure that we don't have pairs of phones on the same - # line in nonsilence_phones.txt that can never be - # distinguished from each other by questions. 
(If any two - # phones appear on the same line in nonsilence_phones.txt, - # they share a tree root, and since the automatic - # question-building treats all phones that appear on the - # same line of nonsilence_phones.txt as being in the same - # group, we can never distinguish them without resorting to - # questions in extra_questions.txt. -print "Checking $dict/extra_questions.txt ...\n"; -if (-s "$dict/extra_questions.txt") { - if (!open(EX, "<$dict/extra_questions.txt")) { - set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n"; - } - $idx = 1; - $success = 1; - $crlf = 1; - print "--> reading $dict/extra_questions.txt\n"; - check_allowed_whitespace(\*EX) or set_to_fail(); - while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n"; - } - foreach (0 .. @col-1) { - if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n"; - } - $idx ++; - } - %col_hash = (); - foreach $p (@col) { $col_hash{$p} = 1; } - foreach $p1 (@col) { - # Update %distinguished hash. - foreach $p2 (keys %nonsilence) { - if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not - # in this question (and in nonsilence - # phones)... mark p1,p2 as being split apart - $distinguished{$p1,$p2} = 1; - $distinguished{$p2,$p1} = 1; - } - } - } - } - close(EX); - $success == 0 || print "--> $dict/extra_questions.txt is OK\n"; -} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";} - -if (-f "$dict/nonterminals.txt") { - open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt"; - my %nonterminals = (); - my $line_number = 1; - while () { - chop; - my @line = split(" ", $_); - if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) { - print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1; - } - $nonterminals{$line[0]} = 1; - $line_number++; - } - print "--> $dict/nonterminals.txt is OK\n"; -} - - -# check nonsilence_phones.txt again for phone-pairs that are never -# distnguishable. (note: this situation is normal and expected for silence -# phones, so we don't check it.) -if(!open(NS, "<$dict/nonsilence_phones.txt")) { - print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1; -} - -$num_warn_nosplit = 0; -$num_warn_nosplit_limit = 10; -while() { - my @col = split(" ", $_); - foreach $p1 (@col) { - foreach $p2 (@col) { - if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) { - set_to_fail(); - if ($num_warn_nosplit <= $num_warn_nosplit_limit) { - print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n"; - } - if ($num_warn_nosplit == $num_warn_nosplit_limit) { - print "... Not warning any more times about this issue.\n"; - } - if ($num_warn_nosplit == 0) { - print " (note: we started checking for this only recently. 
You can still build a system but\n";
-          print " phones $p1 and $p2 will be acoustically indistinguishable).\n";
-        }
-        $num_warn_nosplit++;
-      }
-    }
-  }
-}
-
-
-if ($exit == 1) {
-  print "--> ERROR validating dictionary directory $dict (see detailed error ";
-  print "messages above)\n\n";
-  exit 1;
-} else {
-  print "--> SUCCESS [validating dictionary directory $dict]\n\n";
-}
-
-exit 0;
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/validate_text.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/validate_text.pl
deleted file mode 100644
index 7f75cf12f20f6e22948682e8e726e628a72dac69..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/validate_text.pl
+++ /dev/null
@@ -1,136 +0,0 @@
-#!/usr/bin/env perl
-#
-#===============================================================================
-# Copyright 2017 Johns Hopkins University (author: Yenda Trmal )
-#                Johns Hopkins University (author: Daniel Povey)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-#===============================================================================
-
-# validation script for data//text
-# to be called (preferably) from utils/validate_data_dir.sh
-use strict;
-use warnings;
-use utf8;
-use Fcntl qw< SEEK_SET >;
-
-# this function reads the opened file (supplied as a first
-# parameter) into an array of lines. For each
-# line, it tests whether it's a valid utf-8 compatible
-# line. If all lines are valid utf-8, it returns the lines
-# decoded as utf-8, otherwise it assumes the file's encoding
-# is one of those 1-byte encodings, such as ISO-8859-x
-# or Windows CP-X.
-# Please recall we do not really care about
-# the actually encoding, we just need to
-# make sure the length of the (decoded) string
-# is correct (to make the output formatting looking right).
-sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl \n" . 
- "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/wav2dur.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/wav2dur.py deleted file mode 100644 index 1bcc1b693458b66c0e341e5d6b375cc81e6db8b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/wav2dur.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -import torchaudio -torchaudio.set_audio_backend("sox_io") - -scp = sys.argv[1] -dur_scp = sys.argv[2] - -with open(scp, 'r') as f, open(dur_scp, 'w') as fout: - cnt = 0 - total_duration = 0 - for l in f: - items = l.strip().split() - wav_id = items[0] - fname = items[1] - cnt += 1 - waveform, rate = torchaudio.load(fname) - frames = len(waveform[0]) - duration = frames / float(rate) - total_duration += duration - fout.write('{} {}\n'.format(wav_id, duration)) - print('process {} utts'.format(cnt)) - print('total {} s'.format(total_duration)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/wav_to_duration.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/wav_to_duration.sh deleted file mode 100644 index 51b055c633ac809b6b8d702925dc47875973403d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/wav_to_duration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# split the wav scp, calculate duration and merge -nj=4 -. tools/parse_options.sh || exit 1; - -inscp=$1 -outscp=$2 -data=$(dirname ${inscp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -rm -f $logdir/wav_*.slice -rm -f $logdir/wav_*.shape -split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log -} & -done -wait -cat $logdir/wav_*.shape > $outscp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/websocket/performance-ws.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/websocket/performance-ws.py deleted file mode 100644 index af77dea06bb41297b674b5b6dbfd0266bcff5d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/tools/websocket/performance-ws.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -# coding:utf-8 - -# Copyright (c) 2022 SDCI Co. Ltd (author: veelion) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import json -import time -import asyncio -import argparse -import websockets -import soundfile as sf -import statistics - - -WS_START = json.dumps({ - 'signal': 'start', - 'nbest': 1, - 'continuous_decoding': False, -}) -WS_END = json.dumps({ - 'signal': 'end' -}) - - -async def ws_rec(data, ws_uri): - begin = time.time() - conn = await websockets.connect(ws_uri, ping_timeout=200) - # step 1: send start - await conn.send(WS_START) - ret = await conn.recv() - # step 2: send audio data - await conn.send(data) - # step 3: send end - await conn.send(WS_END) - # step 4: receive result - texts = [] - while 1: - ret = await conn.recv() - ret = json.loads(ret) - if ret['type'] == 'final_result': - nbest = json.loads(ret['nbest']) - text = nbest[0]['sentence'] - texts.append(text) - elif ret['type'] == 'speech_end': - break - # step 5: close - try: - await conn.close() - except Exception as e: - # this except has no effect, just log as debug - # it seems the server does not send close info, maybe - print(e) - time_cost = time.time() - begin - return { - 'text': ''.join(texts), - 'time': time_cost, - } - - -def get_args(): - parser = argparse.ArgumentParser(description='') - parser.add_argument( - '-u', '--ws_uri', required=True, - help="websocket_server_main's uri, e.g. ws://127.0.0.1:10086") - parser.add_argument( - '-w', '--wav_scp', required=True, - help='path to wav_scp_file') - parser.add_argument( - '-t', '--trans', required=True, - help='path to trans_text_file of wavs') - parser.add_argument( - '-s', '--save_to', required=True, - help='path to save transcription') - parser.add_argument( - '-n', '--num_concurrence', type=int, required=True, - help='num of concurrence for query') - args = parser.parse_args() - return args - - -def print_result(info): - length = max([len(k) for k in info]) - for k, v in info.items(): - print(f'\t{k: >{length}} : {v}') - - -async def main(args): - wav_scp = [] - total_duration = 0 - with open(args.wav_scp) as f: - for line in f: - zz = line.strip().split() - assert len(zz) == 2 - data, sr = sf.read(zz[1], dtype='int16') - assert sr == 16000 - duration = (len(data)) / 16000 - total_duration += duration - wav_scp.append((zz[0], data.tobytes())) - print(f'{len(wav_scp) = }, {total_duration = }') - - tasks = [] - failed = 0 - texts = [] - request_times = [] - begin = time.time() - for i, (_uttid, data) in enumerate(wav_scp): - task = asyncio.create_task(ws_rec(data, args.ws_uri)) - tasks.append((_uttid, task)) - if len(tasks) < args.num_concurrence: - continue - print((f'{i=}, start {args.num_concurrence} ' - f'queries @ {time.strftime("%m-%d %H:%M:%S")}')) - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - tasks = [] - print(f'\tdone @ {time.strftime("%m-%d %H:%M:%S")}') - if tasks: - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - request_time = time.time() - begin - rtf = request_time / total_duration - print('For all concurrence:') - print_result({ - 'failed': failed, - 'total_duration': total_duration, - 'request_time': request_time, - 'RTF': rtf, - }) - print('For one request:') - print_result({ - 'mean': statistics.mean(request_times), - 'median': statistics.median(request_times), - 'max_time': max(request_times), - 'min_time': min(request_times), - }) - with 
open(args.save_to, 'w', encoding='utf8') as fsave: - fsave.write(''.join(texts)) - # caculate CER - cmd = (f'python ../compute-wer.py --char=1 --v=1 ' - f'{args.trans} {args.save_to} > ' - f'{args.save_to}-test-{args.num_concurrence}.cer.txt') - print(cmd) - os.system(cmd) - print('done') - - -if __name__ == '__main__': - args = get_args() - asyncio.run(main(args)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/alignment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/alignment.py deleted file mode 100644 index 071691183e5af227e60fe06e4f8d4bf0f33b7f71..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/alignment.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Di Wu) -# 2022 Tinnove Inc (authors: Wei Ren) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader -from textgrid import TextGrid, IntervalTier - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.ctc_util import forced_align -from wenet.utils.common import get_subsample -from wenet.utils.init_model import init_model - - -def generator_textgrid(maxtime, lines, output): - # Download Praat: https://www.fon.hum.uva.nl/praat/ - interval = maxtime / (len(lines) + 1) - margin = 0.0001 - - tg = TextGrid(maxTime=maxtime) - linetier = IntervalTier(name="line", maxTime=maxtime) - - i = 0 - for l in lines: - s, e, w = l.split() - linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w) - - tg.append(linetier) - print("successfully generator {}".format(output)) - tg.write(output) - - -def get_frames_timestamp(alignment): - # convert alignment to a praat format, which is a doing phonetics - # by computer and helps analyzing alignment - timestamp = [] - # get frames level duration for each token - start = 0 - end = 0 - while end < len(alignment): - while end < len(alignment) and alignment[end] == 0: - end += 1 - if end == len(alignment): - timestamp[-1] += alignment[start:] - break - end += 1 - while end < len(alignment) and alignment[end - 1] == alignment[end]: - end += 1 - timestamp.append(alignment[start:end]) - start = end - return timestamp - - -def get_labformat(timestamp, subsample): - begin = 0 - duration = 0 - labformat = [] - for idx, t in enumerate(timestamp): - # 25ms frame_length,10ms hop_length, 1/subsample - subsample = get_subsample(configs) - # time duration - duration = len(t) * 0.01 * subsample - if idx < len(timestamp) - 1: - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[t[-1]])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[t[-1]])) - else: - 
non_blank = 0 - for i in t: - if i != 0: - token = i - break - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[token])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[token])) - begin = begin + duration - return labformat - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='use ctc to generate alignment') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--input_file', required=True, help='format data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--non_lang_syms', - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--result_file', - required=True, - help='alignment result file') - parser.add_argument('--batch_size', type=int, default=1, help='batch size') - parser.add_argument('--gen_praat', - action='store_true', - help='convert alignment to a praat format') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - - args = parser.parse_args() - print(args) - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.batch_size > 1: - logging.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - # Load dict - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 - - symbol_table = read_symbol_table(args.dict) - - # Init dataset and data loader - ali_conf = copy.deepcopy(configs['dataset_conf']) - - ali_conf['filter_conf']['max_length'] = 102400 - ali_conf['filter_conf']['min_length'] = 0 - ali_conf['filter_conf']['token_max_length'] = 102400 - ali_conf['filter_conf']['token_min_length'] = 0 - ali_conf['filter_conf']['max_output_input_ratio'] = 102400 - ali_conf['filter_conf']['min_output_input_ratio'] = 0 - ali_conf['speed_perturb'] = False - ali_conf['spec_aug'] = False - ali_conf['shuffle'] = False - ali_conf['sort'] = False - ali_conf['fbank_conf']['dither'] = 0.0 - ali_conf['batch_conf']['batch_type'] = "static" - ali_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - ali_dataset = Dataset(args.data_type, - args.input_file, - symbol_table, - ali_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w', - encoding='utf-8') as fout: - for batch_idx, batch in enumerate(ali_data_loader): - print("#" * 80) - key, feat, target, feats_length, target_length = batch - print(key) - - feat = feat.to(device) - target = target.to(device) - feats_length = 
feats_length.to(device)
-            target_length = target_length.to(device)
-            # Let's assume B = batch_size and N = beam_size
-            # 1. Encoder
-            encoder_out, encoder_mask = model._forward_encoder(
-                feat, feats_length)  # (B, maxlen, encoder_dim)
-            maxlen = encoder_out.size(1)
-            ctc_probs = model.ctc.log_softmax(
-                encoder_out)  # (1, maxlen, vocab_size)
-            # print(ctc_probs.size(1))
-            ctc_probs = ctc_probs.squeeze(0)
-            target = target.squeeze(0)
-            alignment = forced_align(ctc_probs, target)
-            print(alignment)
-            fout.write('{} {}\n'.format(key[0], alignment))
-
-            if args.gen_praat:
-                timestamp = get_frames_timestamp(alignment)
-                print(timestamp)
-                subsample = get_subsample(configs)
-                labformat = get_labformat(timestamp, subsample)
-
-                lab_path = os.path.join(os.path.dirname(args.result_file),
-                                        key[0] + ".lab")
-                with open(lab_path, 'w', encoding='utf-8') as f:
-                    f.writelines(labformat)
-
-                textgrid_path = os.path.join(os.path.dirname(args.result_file),
-                                             key[0] + ".TextGrid")
-                generator_textgrid(maxtime=(len(alignment) + 1) * 0.01 *
-                                   subsample,
-                                   lines=labformat,
-                                   output=textgrid_path)
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/average_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/average_model.py
deleted file mode 100644
index 01efa64b4b458bc931a86a9a304b9f330ce4aaa2..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/average_model.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Copyright (c) 2020 Mobvoi Inc (Di Wu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- - -import os -import argparse -import glob - -import yaml -import numpy as np -import torch - - -def get_args(): - parser = argparse.ArgumentParser(description='average model') - parser.add_argument('--dst_model', required=True, help='averaged model') - parser.add_argument('--src_path', - required=True, - help='src model path for average') - parser.add_argument('--val_best', - action="store_true", - help='averaged model') - parser.add_argument('--num', - default=5, - type=int, - help='nums for averaged model') - parser.add_argument('--min_epoch', - default=0, - type=int, - help='min epoch used for averaging model') - parser.add_argument('--max_epoch', - default=65536, - type=int, - help='max epoch used for averaging model') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - checkpoints = [] - val_scores = [] - if args.val_best: - yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) - for y in yamls: - with open(y, 'r') as f: - dic_yaml = yaml.load(f, Loader=yaml.FullLoader) - loss = dic_yaml['cv_loss'] - epoch = dic_yaml['epoch'] - if epoch >= args.min_epoch and epoch <= args.max_epoch: - val_scores += [[epoch, loss]] - val_scores = np.array(val_scores) - sort_idx = np.argsort(val_scores[:, -1]) - sorted_val_scores = val_scores[sort_idx][::1] - print("best val scores = " + str(sorted_val_scores[:args.num, 1])) - print("selected epochs = " + - str(sorted_val_scores[:args.num, 0].astype(np.int64))) - path_list = [ - args.src_path + '/{}.pt'.format(int(epoch)) - for epoch in sorted_val_scores[:args.num, 0] - ] - else: - path_list = glob.glob('{}/[0-9]*.pt'.format(args.src_path)) - path_list = sorted(path_list, key=os.path.getmtime) - path_list = path_list[-args.num:] - print(path_list) - avg = None - num = args.num - assert num == len(path_list) - for path in path_list: - print('Processing {}'.format(path)) - states = torch.load(path, map_location=torch.device('cpu')) - if avg is None: - avg = states - else: - for k in avg.keys(): - avg[k] += states[k] - # average - for k in avg.keys(): - if avg[k] is not None: - # pytorch 1.6 use true_divide instead of /= - avg[k] = torch.true_divide(avg[k], num) - print('Saving to {}'.format(args.dst_model)) - torch.save(avg, args.dst_model) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/export_jit.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/export_jit.py deleted file mode 100644 index b2e5864e8382235c1cc800484ba5031ae22f3bd9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/export_jit.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os - -import torch -import yaml - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_file', default=None, help='output file') - parser.add_argument('--output_quant_file', - default=None, - help='output quantized model file') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - # No need gpu for model export - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - model = init_model(configs) - print(model) - - load_checkpoint(model, args.checkpoint) - # Export jit torch script model - - if args.output_file: - script_model = torch.jit.script(model) - script_model.save(args.output_file) - print('Export model successfully, see {}'.format(args.output_file)) - - # Export quantized jit torch script model - if args.output_quant_file: - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - script_quant_model = torch.jit.script(quantized_model) - script_quant_model.save(args.output_quant_file) - print('Export quantized model successfully, ' - 'see {}'.format(args.output_quant_file)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/export_onnx_bpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/export_onnx_bpu.py deleted file mode 100644 index 6462a69506f10778d08faae5fcf3067ad43d38bd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/export_onnx_bpu.py +++ /dev/null @@ -1,1019 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. 
specific decoding method: ctc_greedy_search -""" - - -from __future__ import print_function - -import os -import sys -import copy -import math -import yaml -import logging -from typing import Tuple - -import torch -import numpy as np - -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import (get_args, to_numpy, - print_input_output_info) - - -try: - import onnx - import onnxruntime -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class BPULayerNorm(torch.nn.Module): - """Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" - def __init__(self, module, chunk_size=8, run_on_bpu=False): - super().__init__() - original = copy.deepcopy(module) - self.hidden = module.weight.size(0) - self.chunk_size = chunk_size - self.run_on_bpu = run_on_bpu - - if self.run_on_bpu: - self.weight = torch.nn.Parameter( - module.weight.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.bias = torch.nn.Parameter( - module.bias.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.negtive = torch.nn.Parameter( - torch.ones((1, self.hidden, 1, chunk_size)) * -1.0) - self.eps = torch.nn.Parameter( - torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps) - self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_1.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_2.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - else: - self.norm = module - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.hidden) - orig_out = module(random_data) - new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) - np.testing.assert_allclose( - to_numpy(orig_out), to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.run_on_bpu: - u = self.mean_conv_1(x) # (1, h, 1, c) - numerator = x + u * self.negtive # (1, h, 1, c) - s = torch.pow(numerator, 2) # (1, h, 1, c) - s = self.mean_conv_2(s) # (1, h, 1, c) - denominator = torch.sqrt(s + self.eps) # (1, h, 1, c) - x = torch.div(numerator, denominator) # (1, h, 1, c) - x = x * self.weight + self.bias - else: - x = x.squeeze(2).transpose(1, 2).contiguous() - x = self.norm(x) - x = x.transpose(1, 2).contiguous().unsqueeze(2) - return x - - -class BPUIdentity(torch.nn.Module): - """Refactor torch.nn.Identity(). - For inserting BPU node whose input == output. - """ - def __init__(self, channels): - super().__init__() - self.channels = channels - self.identity_conv = torch.nn.Conv2d( - channels, channels, 1, groups=channels, bias=False) - torch.nn.init.dirac_( - self.identity_conv.weight.data, groups=channels) - - self.check_equal() - - def check_equal(self): - random_data = torch.randn(1, self.channels, 1, 10) - result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(random_data), to_numpy(result), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Identity with 4-D dataflow, input == output. 
- Args: - x (torch.Tensor): (batch, in_channel, 1, time) - - Returns: - (torch.Tensor): (batch, in_channel, 1, time). - """ - return self.identity_conv(x) - - -class BPULinear(torch.nn.Module): - """Refactor torch.nn.Linear or pointwise_conv""" - def __init__(self, module, is_pointwise_conv=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.weight.size(1) - self.odim = module.weight.size(0) - self.is_pointwise_conv = is_pointwise_conv - - # Modify weight & bias - self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) - if is_pointwise_conv: - # (odim, idim, kernel=1) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(-1)) - else: - # (odim, idim) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(2).unsqueeze(3)) - self.linear.bias = module.bias - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.idim) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = module(random_data) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = original_result.transpose(1, 2) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Linear with 4-D dataflow. - Args: - x (torch.Tensor): (batch, in_channel, 1, time) - Returns: - (torch.Tensor): (batch, out_channel, 1, time). - """ - return self.linear(x) - - -class BPUGlobalCMVN(torch.nn.Module): - """Refactor wenet/transformer/cmvn.py::GlobalCMVN""" - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - self.norm_var = module.norm_var - - # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1) - self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """CMVN with 4-D dataflow. - Args: - x (torch.Tensor): (batch, 1, mel_dim, time) - Returns: - (torch.Tensor): normalized feature with same shape. - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x - - -class BPUConv2dSubsampling8(torch.nn.Module): - """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 - - NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.right_context = module.right_context - self.subsampling_rate = module.subsampling_rate - assert isinstance(module.pos_enc, NoPositionalEncoding) - - # 1. Modify self.conv - # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim) - # to (1, 1, mel_dim, frames) for more efficient computation. - self.conv = module.conv - for idx in [0, 2, 4]: - self.conv[idx].weight = torch.nn.Parameter( - module.conv[idx].weight.transpose(2, 3) - ) - - # 2. 
Modify self.linear - # NOTE(xcsong): Split final projection to meet the requirment of - # maximum kernel_size (7 for XJ3) - self.linear = torch.nn.ModuleList() - odim = module.linear.weight.size(0) # 512, in this case - freq = module.linear.weight.size(1) // odim # 4608 // 512 == 9 - self.odim, self.freq = odim, freq - weight = module.linear.weight.reshape( - odim, odim, freq, 1) # (odim, odim * freq) -> (odim, odim, freq, 1) - self.split_size = [] - num_split = (freq - 1) // 7 + 1 # XJ3 requires kernel_size <= 7 - slice_begin = 0 - for idx in range(num_split): - kernel_size = min(freq, (idx + 1) * 7) - idx * 7 - conv_ele = torch.nn.Conv2d( - odim, odim, (kernel_size, 1), (kernel_size, 1)) - conv_ele.weight = torch.nn.Parameter( - weight[:, :, slice_begin:slice_begin + kernel_size, :] - ) - conv_ele.bias = torch.nn.Parameter( - torch.zeros_like(conv_ele.bias) - ) - self.linear.append(conv_ele) - self.split_size.append(kernel_size) - slice_begin += kernel_size - self.linear[0].bias = torch.nn.Parameter(module.linear.bias) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 67, 80) - mask = torch.zeros(1, 1, 67) - original_result, _, _ = module(random_data, mask) # (1, 8, 512) - random_data = random_data.transpose(1, 2).unsqueeze(0) # (1, 1, 80, 67) - new_result = self.forward(random_data) # (1, 512, 1, 8) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Subsample x with 4-D dataflow. - Args: - x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), - where time' = time // 8. - """ - x = self.conv(x) # (1, odim, freq, time') - x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) - x = torch.split(x, self.split_size, dim=2) - for idx, (x_part, layer) in enumerate(zip(x, self.linear)): - x_out += layer(x_part) - return x_out - - -class BPUMultiHeadedAttention(torch.nn.Module): - """Refactor wenet/transformer/attention.py::MultiHeadedAttention - - NOTE(xcsong): Only support attention_class == MultiHeadedAttention, - we do not consider RelPositionMultiHeadedAttention currently. - """ - def __init__(self, module, chunk_size, left_chunks): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.d_k = module.d_k - self.h = module.h - n_feat = self.d_k * self.h - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.time = chunk_size * (left_chunks + 1) - self.activation = torch.nn.Softmax(dim=-1) - - # 1. Modify self.linear_x - self.linear_q = BPULinear(module.linear_q) - self.linear_k = BPULinear(module.linear_k) - self.linear_v = BPULinear(module.linear_v) - self.linear_out = BPULinear(module.linear_out) - # 2. 
denom - self.register_buffer( - "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k))) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) - mask = torch.ones((1, self.h, self.chunk_size, self.time), - dtype=torch.bool) - cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, - self.d_k * 2) - original_out, original_cache = module( - random_data, random_data, random_data, - mask[:, 0, :, :], torch.empty(0), cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.reshape(1, self.h, self.d_k * 2, - self.chunk_size * self.left_chunks) - new_out, new_cache = self.forward( - random_data, random_data, random_data, mask, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - - def forward( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - mask: torch.Tensor, cache: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. - - Args: - q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). - k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). - v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). - mask (torch.Tensor): Mask tensor, - (#batch, head, chunk_size, cache_t + chunk_size). - cache (torch.Tensor): Cache tensor - (1, head, d_k * 2, cache_t), - where `cache_t == chunk_size * left_chunks`. - - - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: Cache tensor - (1, head, d_k * 2, cache_t + chunk_size) - where `cache_t == chunk_size * left_chunks` - """ - # 1. Forward QKV - q = self.linear_q(q) # (1, d, 1, c) d == size, c == chunk_size - k = self.linear_k(k) # (1, d, 1, c) - v = self.linear_v(v) # (1, d, 1, c) - q = q.view(1, self.h, self.d_k, self.chunk_size) - k = k.view(1, self.h, self.d_k, self.chunk_size) - v = v.view(1, self.h, self.d_k, self.chunk_size) - q = q.transpose(2, 3) # (batch, head, time1, d_k) - k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) - k = torch.cat((k_cache, k), dim=3) - v = torch.cat((v_cache, v), dim=3) - new_cache = torch.cat((k, v), dim=2) - # 2. (Q^T)K - scores = torch.matmul(q, k) * self.denom # (#b, n_head, time1, time2) - # 3. Forward attention - mask = mask.eq(0) - scores = scores.masked_fill(mask, -float('inf')) - attn = self.activation(scores).masked_fill(mask, 0.0) - attn = attn.transpose(2, 3) - x = torch.matmul(v, attn) - x = x.view(1, self.d_k * self.h, 1, self.chunk_size) - x_out = self.linear_out(x) - return x_out, new_cache - - -class BPUConvolution(torch.nn.Module): - """Refactor wenet/transformer/convolution.py::ConvolutionModule - - NOTE(xcsong): Only suport use_layer_norm == False - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.lorder = module.lorder - self.use_layer_norm = False - self.activation = module.activation - channels = module.pointwise_conv1.weight.size(1) - self.channels = channels - kernel_size = module.depthwise_conv.weight.size(2) - assert module.use_layer_norm is False - - # 1. Modify self.pointwise_conv1 - self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) - - # 2. 
Modify self.depthwise_conv - self.depthwise_conv = torch.nn.Conv2d( - channels, channels, (1, kernel_size), - stride=1, groups=channels) - self.depthwise_conv.weight = torch.nn.Parameter( - module.depthwise_conv.weight.unsqueeze(-2)) - self.depthwise_conv.bias = torch.nn.Parameter( - module.depthwise_conv.bias) - - # 3. Modify self.norm, Only support batchnorm2d - self.norm = torch.nn.BatchNorm2d(channels) - self.norm.training = False - self.norm.num_features = module.norm.num_features - self.norm.eps = module.norm.eps - self.norm.momentum = module.norm.momentum - self.norm.weight = torch.nn.Parameter(module.norm.weight) - self.norm.bias = torch.nn.Parameter(module.norm.bias) - self.norm.running_mean = module.norm.running_mean - self.norm.running_var = module.norm.running_var - - # 4. Modify self.pointwise_conv2 - self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) - - # 5. Identity conv, for running `concat` on BPU - self.identity = BPUIdentity(channels) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.channels) - cache = torch.zeros((1, self.channels, self.lorder)) - original_out, original_cache = module(random_data, cache=cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.unsqueeze(2) - new_out, new_cache = self.forward(random_data, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, 1, cache_t). - Returns: - torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). - torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). - """ - # Concat cache - x = torch.cat((self.identity(cache), self.identity(x)), dim=3) - new_cache = x[:, :, :, -self.lorder:] - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, 1, dim) - x = torch.nn.functional.glu(x, dim=1) # (b, channel, 1, dim) - - # Depthwise Conv - x = self.depthwise_conv(x) - x = self.activation(self.norm(x)) - x = self.pointwise_conv2(x) - return x, new_cache - - -class BPUFFN(torch.nn.Module): - """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.activation = module.activation - - # 1. Modify self.w_x - self.w_1 = BPULinear(module.w_1) - self.w_2 = BPULinear(module.w_2) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.w_1.idim) - original_out = module(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_out = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward function. 
- - Args: - xs: input tensor (B, D, 1, L) - Returns: - output tensor, (B, D, 1, L) - """ - return self.w_2(self.activation(self.w_1(x))) - - -class BPUConformerEncoderLayer(torch.nn.Module): - """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.size = module.size - assert module.normalize_before is True - assert module.concat_after is False - - # 1. Modify submodules - self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) - self.self_attn = BPUMultiHeadedAttention( - module.self_attn, chunk_size, left_chunks) - self.conv_module = BPUConvolution(module.conv_module) - self.feed_forward = BPUFFN(module.feed_forward) - - # 2. Modify norms - self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) - self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, ln_run_on_bpu) - self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, - chunk_size, ln_run_on_bpu) - self.norm_conv = BPULayerNorm(module.norm_conv, - chunk_size, ln_run_on_bpu) - self.norm_final = BPULayerNorm(module.norm_final, - chunk_size, ln_run_on_bpu) - - # 3. 4-D ff_scale - self.register_buffer( - "ff_scale", torch.full((1, self.size, 1, 1), module.ff_scale)) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.self_attn.chunk_size - time2 = self.self_attn.time - h, d_k = self.self_attn.h, self.self_attn.d_k - random_x = torch.randn(1, time1, self.size) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) - original_x, _, original_att_cache, original_cnn_cache = module( - random_x, att_mask[:, 0, :, :], torch.empty(0), - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.transpose(1, 2).unsqueeze(2) - att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.unsqueeze(2) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_mask, att_cache, cnn_cache - ) - np.testing.assert_allclose( - to_numpy(original_att_cache), - to_numpy(new_att_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cnn_cache), - to_numpy(new_cnn_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, att_mask: torch.Tensor, - att_cache: torch.Tensor, cnn_cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, size, 1, chunk_size) - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, d_k * 2, cache_t1), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, 1, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: att_cache tensor, - (1, head, d_k * 2, cache_t1 + chunk_size). - torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2). - """ - # 1. ffn_macaron - residual = x - x = self.norm_ff_macron(x) - x = residual + self.ff_scale * self.feed_forward_macaron(x) - - # 2. 
attention - residual = x - x = self.norm_mha(x) - x_att, new_att_cache = self.self_attn( - x, x, x, att_mask, att_cache) - x = residual + x_att - - # 3. convolution - residual = x - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, cnn_cache) - x = residual + x - - # 4. ffn - residual = x - x = self.norm_ff(x) - x = residual + self.ff_scale * self.feed_forward(x) - - # 5. final post-norm - x = self.norm_final(x) - - return x, new_att_cache, new_cnn_cache - - -class BPUConformerEncoder(torch.nn.Module): - """Refactor wenet/transformer/encoder.py::ConformerEncoder - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - output_size = module.output_size() - self._output_size = module.output_size() - self.after_norm = module.after_norm - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.head = module.encoders[0].self_attn.h - self.layers = len(module.encoders) - - # 1. Modify submodules - self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) - self.embed = BPUConv2dSubsampling8(module.embed) - self.encoders = torch.nn.ModuleList() - for layer in module.encoders: - self.encoders.append(BPUConformerEncoderLayer( - layer, chunk_size, left_chunks, ln_run_on_bpu)) - - # 2. Auxiliary conv - self.identity_cnncache = BPUIdentity(output_size) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.encoders[0].self_attn.chunk_size - time2 = self.encoders[0].self_attn.time - layers = self.layers - h, d_k = self.head, self.encoders[0].self_attn.d_k - decoding_window = (self.chunk_size - 1) * \ - module.embed.subsampling_rate + \ - module.embed.right_context + 1 - lorder = self.encoders[0].conv_module.lorder - random_x = torch.randn(1, decoding_window, 80) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) - orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( - random_x, 0, time2 - time1, att_mask=att_mask[:, 0, :, :], - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.unsqueeze(0) - att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_cache, cnn_cache, att_mask - ) - caches = torch.split(new_att_cache, h, dim=1) - caches = [c.transpose(2, 3) for c in caches] - np.testing.assert_allclose( - to_numpy(orig_att_cache), - to_numpy(torch.cat(caches, dim=0)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_cnn_cache), - to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, xs: torch.Tensor, att_cache: torch.Tensor, - cnn_cache: torch.Tensor, att_mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (1, head * elayers, d_k * 2, cache_t1), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (1, hidden-dim, elayers, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, hidden-dim, 1, chunk_size). - torch.Tensor: new attention cache required for next chunk, with - same shape as the original att_cache. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - """ - # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time) - xs = xs.transpose(2, 3) - xs = self.global_cmvn(xs) - # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size) - xs = self.embed(xs) - - att_cache = torch.split(att_cache, self.head, dim=1) - cnn_cache = self.identity_cnncache(cnn_cache) - cnn_cache = torch.split(cnn_cache, 1, dim=2) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - xs, new_att_cache, new_cnn_cache = layer( - xs, att_mask, att_cache=att_cache[i], cnn_cache=cnn_cache[i]) - r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:]) - r_cnn_cache.append(new_cnn_cache) - r_att_cache = torch.cat(r_att_cache, dim=1) - r_cnn_cache = self.identity_cnncache( - torch.cat(r_cnn_cache, dim=2)) - - xs = xs.squeeze(2).transpose(1, 2).contiguous() - xs = self.after_norm(xs) - # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input. - xs = xs.transpose(1, 2).contiguous().unsqueeze(2) # (B, C, 1, T) - - return (xs, r_att_cache, r_cnn_cache) - - -class BPUCTC(torch.nn.Module): - """Refactor wenet/transformer/ctc.py::CTC - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.ctc_lo.weight.size(1) - num_class = module.ctc_lo.weight.size(0) - - # 1. Modify self.ctc_lo, Split final projection to meet the - # requirment of maximum in/out channels (2048 for XJ3) - self.ctc_lo = torch.nn.ModuleList() - self.split_size = [] - num_split = (num_class - 1) // 2048 + 1 - for idx in range(num_split): - out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 - conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) - self.ctc_lo.append(conv_ele) - self.split_size.append(out_channel) - orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) - orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) - for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): - w = w.unsqueeze(2).unsqueeze(3) - self.ctc_lo[i].weight = torch.nn.Parameter(w) - self.ctc_lo[i].bias = torch.nn.Parameter(b) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 100, self.idim) - original_result = module.ctc_lo(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """frame activations, without softmax. 
- - Args: - Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) - Returns: - torch.Tensor: (B, num_class, 1, chunk_size) - """ - out = [] - for i, layer in enumerate(self.ctc_lo): - out.append(layer(x)) - out = torch.cat(out, dim=1) - return out - - -def export_encoder(asr_model, args): - logger.info("Stage-1: export encoder") - decode_window, mel_dim = args.decoding_window, args.feature_size - encoder = BPUConformerEncoder( - asr_model.encoder, args.chunk_size, args.num_decoding_left_chunks, - args.ln_run_on_bpu) - encoder.eval() - encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx') - - logger.info("Stage-1.1: prepare inputs for encoder") - chunk = torch.randn((1, 1, decode_window, mel_dim)) - required_cache_size = encoder.chunk_size * encoder.left_chunks - kv_time = required_cache_size + encoder.chunk_size - hidden, layers = encoder._output_size, len(encoder.encoders) - head = encoder.encoders[0].self_attn.h - d_k = hidden // head - lorder = encoder.encoders[0].conv_module.lorder - att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) - att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros((1, hidden, layers, lorder)) - inputs = (chunk, att_cache, cnn_cache, att_mask) - logger.info("chunk.size(): {} att_cache.size(): {} " - "cnn_cache.size(): {} att_mask.size(): {}".format( - list(chunk.size()), list(att_cache.size()), - list(cnn_cache.size()), list(att_mask.size()))) - - logger.info("Stage-1.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask" - attributes['output_name'] = "output;r_att_cache;r_cnn_cache" - attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap" - attributes['norm_type'] = \ - "no_preprocess;no_preprocess;no_preprocess;no_preprocess" - attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_shape'] = \ - "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( - chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3), - att_cache.size(0), att_cache.size(1), att_cache.size(2), - att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1), - cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0), - att_mask.size(1), att_mask.size(2), att_mask.size(3) - ) - torch.onnx.export( # NOTE(xcsong): only support opset==11 - encoder, inputs, encoder_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=attributes['input_name'].split(';'), - output_names=attributes['output_name'].split(';'), - dynamic_axes=None, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for k in vars(args): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - logger.info('Export onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - logger.info("Stage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - for i in range(10): - logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" - ", att_mask: {}".format( - i, list(torch_chunk.size()), - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), - list(torch_att_mask.size()))) - torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_output = torch.cat(torch_output, dim=-1) - - onnx_output = [] - onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - logger.info("onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," - " att_mask: {}".format( - i, onnx_chunk.shape, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': onnx_att_mask, - } - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_output = np.concatenate(onnx_output, axis=-1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_encoder, pass!") - return encoder, ort_session - - -def export_ctc(asr_model, args): - logger.info("Stage-2: export ctc") - ctc = BPUCTC(asr_model.ctc).eval() - ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx') - - logger.info("Stage-2.1: prepare inputs for ctc") - hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) - - logger.info("Stage-2.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'], attributes['input_type'] = "hidden", "featuremap" - attributes['norm_type'] = "no_preprocess" - attributes['input_layout_train'] = "NCHW" - attributes['input_layout_rt'] = "NCHW" - attributes['input_shape'] = "{}x{}x{}x{}".format( - hidden.size(0), hidden.size(1), hidden.size(2), hidden.size(3), - ) - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=None, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for k in vars(args): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - logger.info('Export onnx_ctc, done! 
see {}'.format(ctc_outpath)) - - logger.info("Stage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_ctc, pass!") - return ctc, ort_session - - -def export_decoder(asr_model, args): - logger.info("Currently, Decoder is not supported.") - - -if __name__ == '__main__': - torch.manual_seed(777) - args = get_args() - args.ln_run_on_bpu = False - # NOTE(xcsong): XJ3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - args.feature_size = configs['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - export_encoder(model, args) - export_ctc(model, args) - export_decoder(model, args) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/export_onnx_cpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/export_onnx_cpu.py deleted file mode 100644 index a8009d2f606f753a5870eb754235d8d55e756b5d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/export_onnx_cpu.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright (c) 2022, Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
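The check stages above all follow the same export-then-verify pattern: export with `torch.onnx.export`, validate with `onnx.checker`, then compare ONNX Runtime output against the eager module via `np.testing.assert_allclose`. Below is a minimal, self-contained sketch of that pattern; the toy projection head, shapes, and file name are illustrative assumptions, not code from these deleted scripts.

```python
# Sketch only: export-then-verify on a toy CTC-style projection head.
import numpy as np
import torch
import onnx
import onnxruntime


class TinyCTC(torch.nn.Module):
    """Stand-in for a CTC projection: (B, C, 1, T) -> (B, V, 1, T)."""
    def __init__(self, idim=256, vocab=128):
        super().__init__()
        self.proj = torch.nn.Conv2d(idim, vocab, 1, 1)

    def forward(self, x):
        return self.proj(x)


model = TinyCTC().eval()
hidden = torch.randn(1, 256, 1, 16)

torch.onnx.export(model, hidden, "tiny_ctc.onnx", opset_version=11,
                  input_names=["hidden"], output_names=["probs"])
onnx.checker.check_model(onnx.load("tiny_ctc.onnx"))

# Verify ONNX Runtime agrees with the eager PyTorch module.
sess = onnxruntime.InferenceSession("tiny_ctc.onnx",
                                    providers=["CPUExecutionProvider"])
onnx_out = sess.run(None, {"hidden": hidden.numpy()})[0]
with torch.no_grad():
    torch_out = model(hidden).numpy()
np.testing.assert_allclose(torch_out, onnx_out, rtol=1e-03, atol=1e-04)
print("torch vs onnxruntime check passed")
```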
- -from __future__ import print_function - -import argparse -import os -import copy -import sys - -import torch -import yaml -import numpy as np - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - -try: - import onnx - import onnxruntime - from onnxruntime.quantization import quantize_dynamic, QuantType -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - args = parser.parse_args() - return args - - -def to_numpy(tensor): - if tensor.requires_grad: - return tensor.detach().cpu().numpy() - else: - return tensor.cpu().numpy() - - -def print_input_output_info(onnx_model, name, prefix="\t\t"): - input_names = [node.name for node in onnx_model.graph.input] - input_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.input] - output_names = [node.name for node in onnx_model.graph.output] - output_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.output] - print("{}{} inputs : {}".format(prefix, name, input_names)) - print("{}{} input shapes : {}".format(prefix, name, input_shapes)) - print("{}{} outputs: {}".format(prefix, name, output_names)) - print("{}{} output shapes : {}".format(prefix, name, output_shapes)) - - -def export_encoder(asr_model, args): - print("Stage-1: export encoder") - encoder = asr_model.encoder - encoder.forward = encoder.forward_chunk - encoder_outpath = os.path.join(args['output_dir'], 'encoder.onnx') - - print("\tStage-1.1: prepare inputs for encoder") - chunk = torch.randn( - (args['batch'], args['decoding_window'], args['feature_size'])) - offset = 0 - # NOTE(xcsong): The uncertainty of `next_cache_start` only appears - # in the first few chunks, this is caused by dynamic att_cache shape, i,e - # (0, 0, 0, 0) for 1st chunk and (elayers, head, ?, d_k*2) for subsequent - # chunks. One way to ease the ONNX export is to keep `next_cache_start` - # as a fixed value. To do this, for the **first** chunk, if - # left_chunks > 0, we feed real cache & real mask to the model, otherwise - # fake cache & fake mask. In this way, we get: - # 1. 16/-1 mode: next_cache_start == 0 for all chunks - # 2. 16/4 mode: next_cache_start == chunk_size for all chunks - # 3. 16/0 mode: next_cache_start == chunk_size for all chunks - # 4. -1/-1 mode: next_cache_start == 0 for all chunks - # NO MORE DYNAMIC CHANGES!! - # - # NOTE(Mddct): We retain the current design for the convenience of supporting some - # inference frameworks without dynamic shapes. 
If you're interested in all-in-one - # model that supports different chunks please see: - # https://github.com/wenet-e2e/wenet/pull/1174 - - if args['left_chunks'] > 0: # 16/4 - required_cache_size = args['chunk_size'] * args['left_chunks'] - offset = required_cache_size - # Real cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], required_cache_size, - args['output_size'] // args['head'] * 2)) - # Real mask - att_mask = torch.ones( - (args['batch'], 1, required_cache_size + args['chunk_size']), - dtype=torch.bool) - att_mask[:, :, :required_cache_size] = 0 - elif args['left_chunks'] <= 0: # 16/-1, -1/-1, 16/0 - required_cache_size = -1 if args['left_chunks'] < 0 else 0 - # Fake cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], 0, - args['output_size'] // args['head'] * 2)) - # Fake mask - att_mask = torch.ones((0, 0, 0), dtype=torch.bool) - cnn_cache = torch.zeros( - (args['num_blocks'], args['batch'], - args['output_size'], args['cnn_module_kernel'] - 1)) - inputs = (chunk, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - print("\t\tchunk.size(): {}\n".format(chunk.size()), - "\t\toffset: {}\n".format(offset), - "\t\trequired_cache: {}\n".format(required_cache_size), - "\t\tatt_cache.size(): {}\n".format(att_cache.size()), - "\t\tcnn_cache.size(): {}\n".format(cnn_cache.size()), - "\t\tatt_mask.size(): {}\n".format(att_mask.size())) - - print("\tStage-1.2: torch.onnx.export") - dynamic_axes = { - 'chunk': {1: 'T'}, - 'att_cache': {2: 'T_CACHE'}, - 'att_mask': {2: 'T_ADD_T_CACHE'}, - 'output': {1: 'T'}, - 'r_att_cache': {2: 'T_CACHE'}, - } - # NOTE(xcsong): We keep dynamic axes even if in 16/4 mode, this is - # to avoid padding the last chunk (which usually contains less - # frames than required). For users who want static axes, just pop - # out specific axis. - # if args['chunk_size'] > 0: # 16/4, 16/-1, 16/0 - # dynamic_axes.pop('chunk') - # dynamic_axes.pop('output') - # if args['left_chunks'] >= 0: # 16/4, 16/0 - # # NOTE(xsong): since we feed real cache & real mask into the - # # model when left_chunks > 0, the shape of cache will never - # # be changed. - # dynamic_axes.pop('att_cache') - # dynamic_axes.pop('r_att_cache') - torch.onnx.export( - encoder, inputs, encoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=[ - 'chunk', 'offset', 'required_cache_size', - 'att_cache', 'cnn_cache', 'att_mask' - ], - output_names=['output', 'r_att_cache', 'r_cnn_cache'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for (k, v) in args.items(): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - # NOTE(xcsong): to add those metadatas we need to reopen - # the file and resave it. - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - # Dynamic quantization - model_fp32 = encoder_outpath - model_quant = os.path.join(args['output_dir'], 'encoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - print("\tStage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk = copy.deepcopy(chunk) - torch_offset = copy.deepcopy(offset) - torch_required_cache_size = copy.deepcopy(required_cache_size) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - torch_att_mask = copy.deepcopy(att_mask) - for i in range(10): - print("\t\ttorch chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, list(torch_chunk.size()), torch_offset, - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), list(torch_att_mask.size()))) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - torch_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_offset, torch_required_cache_size, - torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_offset += out.size(1) - torch_output = torch.cat(torch_output, dim=1) - - onnx_output = [] - onnx_chunk = to_numpy(chunk) - onnx_offset = np.array((offset)).astype(np.int64) - onnx_required_cache_size = np.array((required_cache_size)).astype(np.int64) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - onnx_att_mask = to_numpy(att_mask) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - print("\t\tonnx chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, onnx_chunk.shape, onnx_offset, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - onnx_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'offset': onnx_offset, - 'required_cache_size': onnx_required_cache_size, - 'att_cache': onnx_att_cache, 'cnn_cache': onnx_cnn_cache, - 'att_mask': onnx_att_mask - } - # NOTE(xcsong): If we use 16/-1, -1/-1 or 16/0 mode, `next_cache_start` - # will be hardcoded to 0 or chunk_size by ONNX, thus - # required_cache_size and att_mask are no more needed and they will - # be removed by ONNX automatically. 
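The metadata written through `metadata_props` during export is what the check stages later read back via `get_modelmeta().custom_metadata_map`. A small illustrative sketch of that round trip follows; the toy model, file name, and keys are assumptions, not values used by these scripts.

```python
# Sketch only: metadata_props written at export time round-trips through
# onnxruntime's model metadata.
import onnx
import onnxruntime
import torch

model = torch.nn.Linear(4, 2).eval()
torch.onnx.export(model, torch.randn(1, 4), "meta_demo.onnx",
                  opset_version=13, input_names=["x"], output_names=["y"])

onnx_model = onnx.load("meta_demo.onnx")
for key, value in {"chunk_size": "16", "left_chunks": "4"}.items():
    meta = onnx_model.metadata_props.add()
    meta.key, meta.value = key, value
# Re-save so the metadata ends up in the file onnxruntime loads.
onnx.save(onnx_model, "meta_demo.onnx")

sess = onnxruntime.InferenceSession("meta_demo.onnx",
                                    providers=["CPUExecutionProvider"])
print(sess.get_modelmeta().custom_metadata_map)  # {'chunk_size': '16', ...}
```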
- for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_offset += ort_outs[0].shape[1] - onnx_output = np.concatenate(onnx_output, axis=1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-05) - meta = ort_session.get_modelmeta() - print("\t\tcustom_metadata_map={}".format(meta.custom_metadata_map)) - print("\t\tCheck onnx_encoder, pass!") - - -def export_ctc(asr_model, args): - print("Stage-2: export ctc") - ctc = asr_model.ctc - ctc.forward = ctc.log_softmax - ctc_outpath = os.path.join(args['output_dir'], 'ctc.onnx') - - print("\tStage-2.1: prepare inputs for ctc") - hidden = torch.randn( - (args['batch'], args['chunk_size'] if args['chunk_size'] > 0 else 16, - args['output_size'])) - - print("\tStage-2.2: torch.onnx.export") - dynamic_axes = {'hidden': {1: 'T'}, 'probs': {1: 'T'}} - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for (k, v) in args.items(): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - # Dynamic quantization - model_fp32 = ctc_outpath - model_quant = os.path.join(args['output_dir'], 'ctc.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_ctc, done! see {}'.format(ctc_outpath)) - - print("\tStage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_ctc, pass!") - - -def export_decoder(asr_model, args): - print("Stage-3: export decoder") - decoder = asr_model - # NOTE(lzhin): parameters of encoder will be automatically removed - # since they are not used during rescoring. - decoder.forward = decoder.forward_attention_decoder - decoder_outpath = os.path.join(args['output_dir'], 'decoder.onnx') - - print("\tStage-3.1: prepare inputs for decoder") - # hardcode time->200 nbest->10 len->20, they are dynamic axes. 
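The `dynamic_axes` mappings used in these exports are what allow a model traced at one sequence length to run at another. A brief sketch under assumed toy shapes (the module and file name are not from this repo):

```python
# Sketch only: a dynamic time axis lets the exported graph accept lengths
# other than the one seen at trace time.
import torch
import onnxruntime

proj = torch.nn.Linear(80, 32).eval()   # stands in for a ctc-style head
hidden = torch.randn(1, 16, 80)         # traced with T == 16

torch.onnx.export(
    proj, hidden, "dyn_axes_demo.onnx", opset_version=13,
    input_names=["hidden"], output_names=["probs"],
    dynamic_axes={"hidden": {1: "T"}, "probs": {1: "T"}})

sess = onnxruntime.InferenceSession("dyn_axes_demo.onnx",
                                    providers=["CPUExecutionProvider"])
# T == 50 at run time, even though the export saw T == 16.
out = sess.run(None, {"hidden": torch.randn(1, 50, 80).numpy()})[0]
print(out.shape)  # (1, 50, 32)
```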
- encoder_out = torch.randn((1, 200, args['output_size'])) - hyps = torch.randint(low=0, high=args['vocab_size'], - size=[10, 20]) - hyps[:, 0] = args['vocab_size'] - 1 # - hyps_lens = torch.randint(low=15, high=21, size=[10]) - - print("\tStage-3.2: torch.onnx.export") - dynamic_axes = { - 'hyps': {0: 'NBEST', 1: 'L'}, 'hyps_lens': {0: 'NBEST'}, - 'encoder_out': {1: 'T'}, - 'score': {0: 'NBEST', 1: 'L'}, 'r_score': {0: 'NBEST', 1: 'L'} - } - inputs = (hyps, hyps_lens, encoder_out, args['reverse_weight']) - torch.onnx.export( - decoder, inputs, decoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hyps', 'hyps_lens', 'encoder_out', 'reverse_weight'], - output_names=['score', 'r_score'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_decoder = onnx.load(decoder_outpath) - for (k, v) in args.items(): - meta = onnx_decoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_decoder) - onnx.helper.printable_graph(onnx_decoder.graph) - onnx.save(onnx_decoder, decoder_outpath) - print_input_output_info(onnx_decoder, "onnx_decoder") - model_fp32 = decoder_outpath - model_quant = os.path.join(args['output_dir'], 'decoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_decoder, done! see {}'.format( - decoder_outpath)) - - print("\tStage-3.3: check onnx_decoder and torch_decoder") - torch_score, torch_r_score = decoder( - hyps, hyps_lens, encoder_out, args['reverse_weight']) - ort_session = onnxruntime.InferenceSession(decoder_outpath) - input_names = [node.name for node in onnx_decoder.graph.input] - ort_inputs = { - 'hyps': to_numpy(hyps), - 'hyps_lens': to_numpy(hyps_lens), - 'encoder_out': to_numpy(encoder_out), - 'reverse_weight': np.array((args['reverse_weight'])), - } - for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - onnx_output = ort_session.run(None, ort_inputs) - - np.testing.assert_allclose(to_numpy(torch_score), onnx_output[0], - rtol=1e-03, atol=1e-05) - if args['is_bidirectional_decoder'] and args['reverse_weight'] > 0.0: - np.testing.assert_allclose(to_numpy(torch_r_score), onnx_output[1], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_decoder, pass!") - - -def main(): - torch.manual_seed(777) - args = get_args() - output_dir = args.output_dir - os.system("mkdir -p " + output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - arguments = {} - arguments['output_dir'] = output_dir - arguments['batch'] = 1 - arguments['chunk_size'] = args.chunk_size - arguments['left_chunks'] = args.num_decoding_left_chunks - arguments['reverse_weight'] = args.reverse_weight - arguments['output_size'] = configs['encoder_conf']['output_size'] - arguments['num_blocks'] = configs['encoder_conf']['num_blocks'] - arguments['cnn_module_kernel'] = configs['encoder_conf'].get('cnn_module_kernel', 1) - arguments['head'] = configs['encoder_conf']['attention_heads'] - arguments['feature_size'] = configs['input_dim'] - arguments['vocab_size'] = configs['output_dim'] - # NOTE(xcsong): if chunk_size == -1, hardcode to 67 - arguments['decoding_window'] = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 if args.chunk_size > 0 else 67 - arguments['encoder'] = configs['encoder'] - 
arguments['decoder'] = configs['decoder'] - arguments['subsampling_rate'] = model.subsampling_rate() - arguments['right_context'] = model.right_context() - arguments['sos_symbol'] = model.sos_symbol() - arguments['eos_symbol'] = model.eos_symbol() - arguments['is_bidirectional_decoder'] = 1 \ - if model.is_bidirectional_decoder() else 0 - - # NOTE(xcsong): Please note that -1/-1 means non-streaming model! It is - # not a [16/4 16/-1 16/0] all-in-one model and it should not be used in - # streaming mode (i.e., setting chunk_size=16 in `decoder_main`). If you - # want to use 16/-1 or any other streaming mode in `decoder_main`, - # please export onnx in the same config. - if arguments['left_chunks'] > 0: - assert arguments['chunk_size'] > 0 # -1/4 not supported - - export_encoder(model, arguments) - export_ctc(model, arguments) - export_decoder(model, arguments) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/export_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/export_onnx_gpu.py deleted file mode 100644 index 19f810c2804efdf74ff369f780fa3102e2e389fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/export_onnx_gpu.py +++ /dev/null @@ -1,1056 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import os -import sys - -import torch -import yaml -import logging - -import torch.nn.functional as F -from wenet.utils.checkpoint import load_checkpoint -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import BaseEncoder -from wenet.utils.init_model import init_model -from wenet.utils.mask import make_pad_mask - -try: - import onnxruntime -except ImportError: - print('Please install onnxruntime-gpu!') - sys.exit(1) - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class Encoder(torch.nn.Module): - def __init__(self, - encoder: BaseEncoder, - ctc: CTC, - beam_size: int = 10): - super().__init__() - self.encoder = encoder - self.ctc = ctc - self.beam_size = beam_size - - def forward(self, speech: torch.Tensor, - speech_lengths: torch.Tensor,): - """Encoder - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - Returns: - encoder_out: B x T x F - encoder_out_lens: B - ctc_log_probs: B x T x V - beam_log_probs: B x T x beam_size - beam_log_probs_idx: B x T x beam_size - """ - encoder_out, encoder_mask = self.encoder(speech, - speech_lengths, - -1, -1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_log_probs = self.ctc.log_softmax(encoder_out) - encoder_out_lens = encoder_out_lens.int() - beam_log_probs, beam_log_probs_idx = torch.topk( - ctc_log_probs, self.beam_size, dim=2) - return encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx - - -class StreamingEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size, transformer=False): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.transformer = transformer - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoder.encoders): - xs, _, new_att_cache, new_cnn_cache = layer( - xs, masks, pos_emb, - att_cache=att_cache[i], - cnn_cache=cnn_cache[i]) - # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (B, hidden-dim, cache_t2) - r_att_cache.append( - new_att_cache[:, :, next_cache_start:, :].unsqueeze(1)) - if not self.transformer: - r_cnn_cache.append(new_cnn_cache.unsqueeze(1)) - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - if not self.transformer: - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingSqueezeformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.reduce_idx = model.encoder.reduce_idx - self.recover_idx = model.encoder.recover_idx - if self.reduce_idx is None: - self.time_reduce = None - else: - if self.recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - 
recover_exp)) - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - elayers, cache_size = att_cache.size(0), att_cache.size(3) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = att_mask[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.encoder.preln(xs) - for i, layer in enumerate(self.encoder.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append( - (xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.encoder.time_reduction_layer( - xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - if self.encoder.pos_enc_layer_type == "rel_pos_repaired": - pos_emb = 
pos_emb[:, :xs.size(1) * 2 - 1, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.encoder.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(1) - cached_att = cached_att.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1)) - r_cnn_cache.append(cached_cnn) - - chunk_out = xs - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingEfficientConformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - - # Efficient Conformer - self.stride_layer_idx = model.encoder.stride_layer_idx - self.stride = model.encoder.stride - self.num_blocks = model.encoder.num_blocks - self.cnn_module_kernel = model.encoder.cnn_module_kernel - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - chunk_lens (torch.Tensor): - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * 
num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) # (b, ) - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - # Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2) - # Shape(cnn_cache): (elayers, b, outsize, cnn_kernel) - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = chunk_mask.to(torch.bool) - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoder.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - # We propose to double the chunk_size. 
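The `calculate_downsampling_factor` used above simply accumulates the stride of every reduction layer that sits before the queried layer index. A standalone sketch with made-up stride settings (not taken from any config in this repo):

```python
# Sketch only: cumulative-stride rule behind calculate_downsampling_factor.
from typing import List

def downsampling_factor(layer: int, stride_layer_idx: List[int],
                        stride: List[int]) -> int:
    """Product of all strides applied strictly before `layer`."""
    factor = 1
    for idx, stride_idx in enumerate(stride_layer_idx):
        if layer > stride_idx:
            factor *= stride[idx]
    return factor

# With stride-2 reductions after layers 3 and 7:
print([downsampling_factor(i, [3, 7], [2, 2]) for i in range(10)])
# -> [1, 1, 1, 1, 2, 2, 2, 2, 4, 4]
```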
- att_cache_trunc = xs.size(1) + \ - att_cache.size(3) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i][:, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [batch, 1, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID - - # use repeat_interleave to new_att_cache - # new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - new_att_cache = new_att_cache.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :].unsqueeze(1)) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - # shape of r_att_cache: (b, elayers, head, time2, outdim) - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - # shape of r_cnn_cache: (b, elayers, outdim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - chunk_out_lens += 1 - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class Decoder(torch.nn.Module): - def __init__(self, - decoder: TransformerDecoder, - ctc_weight: float = 0.5, - reverse_weight: float = 0.0, - beam_size: int = 10, - decoder_fastertransformer: bool = False): - super().__init__() - self.decoder = decoder - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - self.beam_size = beam_size - self.decoder_fastertransformer = decoder_fastertransformer - - def forward(self, - encoder_out: torch.Tensor, - encoder_lens: torch.Tensor, - hyps_pad_sos_eos: torch.Tensor, - hyps_lens_sos: torch.Tensor, - r_hyps_pad_sos_eos: torch.Tensor, - ctc_score: torch.Tensor): - """Encoder - Args: - encoder_out: B x T x F - encoder_lens: B - hyps_pad_sos_eos: B x beam x (T2+1), - hyps with sos & eos and padded by ignore id - hyps_lens_sos: B x beam, length for each hyp with sos - r_hyps_pad_sos_eos: B 
x beam x (T2+1), - reversed hyps with sos & eos and padded by ignore id - ctc_score: B x beam, ctc score for each hyp - Returns: - decoder_out: B x beam x T2 x V - r_decoder_out: B x beam x T2 x V - best_index: B - """ - B, T, F = encoder_out.shape - bz = self.beam_size - B2 = B * bz - encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F) - encoder_mask = ~make_pad_mask(encoder_lens, T).unsqueeze(1) - encoder_mask = encoder_mask.repeat(1, bz, 1).view(B2, 1, T) - T2 = hyps_pad_sos_eos.shape[2] - 1 - hyps_pad = hyps_pad_sos_eos.view(B2, T2 + 1) - hyps_lens = hyps_lens_sos.view(B2,) - hyps_pad_sos = hyps_pad[:, :-1].contiguous() - hyps_pad_eos = hyps_pad[:, 1:].contiguous() - - r_hyps_pad = r_hyps_pad_sos_eos.view(B2, T2 + 1) - r_hyps_pad_sos = r_hyps_pad[:, :-1].contiguous() - r_hyps_pad_eos = r_hyps_pad[:, 1:].contiguous() - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad_sos, hyps_lens, r_hyps_pad_sos, - self.reverse_weight) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - V = decoder_out.shape[-1] - decoder_out = decoder_out.view(B2, T2, V) - mask = ~make_pad_mask(hyps_lens, T2) # B2 x T2 - # mask index, remove ignore id - index = torch.unsqueeze(hyps_pad_eos * mask, 2) - score = decoder_out.gather(2, index).squeeze(2) # B2 X T2 - # mask padded part - score = score * mask - decoder_out = decoder_out.view(B, bz, T2, V) - if self.reverse_weight > 0: - r_decoder_out = torch.nn.functional.log_softmax( - r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.view(B2, T2, V) - index = torch.unsqueeze(r_hyps_pad_eos * mask, 2) - r_score = r_decoder_out.gather(2, index).squeeze(2) - r_score = r_score * mask - score = score * (1 - self.reverse_weight) + \ - self.reverse_weight * r_score - r_decoder_out = r_decoder_out.view(B, bz, T2, V) - score = torch.sum(score, axis=1) # B2 - score = torch.reshape(score, (B, bz)) + self.ctc_weight * ctc_score - best_index = torch.argmax(score, dim=1) - if self.decoder_fastertransformer: - return decoder_out, best_index - else: - return best_index - - -def to_numpy(tensors): - out = [] - if type(tensors) == torch.tensor: - tensors = [tensors] - for tensor in tensors: - if tensor.requires_grad: - tensor = tensor.detach().cpu().numpy() - else: - tensor = tensor.cpu().numpy() - out.append(tensor) - return out - - -def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True): - for a, b in zip(xlist, blist): - try: - torch.testing.assert_allclose(a, b, rtol=rtol, atol=atol) - except AssertionError as error: - if tolerate_small_mismatch: - print(error) - else: - raise - - -def export_offline_encoder(model, configs, args, logger, encoder_onnx_path): - bz = 32 - seq_len = 100 - beam_size = args.beam_size - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint( - low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - dynamic_axes={ - 'speech': {0: 'B', 1: 'T'}, - 'speech_lengths': {0: 'B'}, - 'encoder_out': {0: 'B', 1: 'T_OUT'}, - 'encoder_out_lens': {0: 'B'}, - 'ctc_log_probs': {0: 'B', 1: 'T_OUT'}, - 'beam_log_probs': {0: 'B', 1: 
'T_OUT'}, - 'beam_log_probs_idx': {0: 'B', 1: 'T_OUT'}, - }, - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - -def export_offline_encoder_static(model, configs, args, logger, encoder_onnx_path): - bz = args.batch_size - seq_len = args.seq_len - beam_size = args.beam_size - - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint(low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - import os - file_name, file_ext = os.path.splitext(encoder_onnx_path) - encoder_onnx_path = file_name + "_bs" + str(bz) + "_seq" + str(seq_len) + "_static.onnx" - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CPUExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - - -def export_online_encoder(model, configs, args, logger, encoder_onnx_path): - decoding_chunk_size = args.decoding_chunk_size - subsampling = model.encoder.embed.subsampling_rate - context = model.encoder.embed.right_context + 1 - decoding_window = (decoding_chunk_size - 1) * subsampling + context - batch_size = 32 - audio_len = decoding_window - feature_size = configs["input_dim"] - output_size = configs["encoder_conf"]["output_size"] - num_layers = configs["encoder_conf"]["num_blocks"] - # in transformer the cnn module will not be available - transformer = False - cnn_module_kernel = configs["encoder_conf"].get("cnn_module_kernel", 1) - 1 - if not cnn_module_kernel: - transformer = True - num_decoding_left_chunks = args.num_decoding_left_chunks - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if configs['encoder'] == 'squeezeformer': - encoder = StreamingSqueezeformerEncoder( - model, required_cache_size, args.beam_size) - elif configs['encoder'] == 'efficientConformer': - encoder = StreamingEfficientConformerEncoder( - model, required_cache_size, args.beam_size) - else: - encoder = StreamingEncoder( - model, required_cache_size, args.beam_size, transformer) - encoder.eval() - - # begin to export encoder - chunk_xs = 
torch.randn(batch_size, audio_len, - feature_size, dtype=torch.float32) - chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len - - offset = torch.arange(0, batch_size).unsqueeze(1) - # (elayers, b, head, cache_t1, d_k * 2) - head = configs["encoder_conf"]["attention_heads"] - d_k = configs["encoder_conf"]["output_size"] // head - att_cache = torch.randn(batch_size, num_layers, head, - required_cache_size, d_k * 2, - dtype=torch.float32) - cnn_cache = torch.randn(batch_size, num_layers, output_size, - cnn_module_kernel, dtype=torch.float32) - - cache_mask = torch.ones( - batch_size, 1, required_cache_size, dtype=torch.float32) - input_names = ['chunk_xs', 'chunk_lens', 'offset', - 'att_cache', 'cnn_cache', 'cache_mask'] - output_names = ['log_probs', 'log_probs_idx', 'chunk_out', - 'chunk_out_lens', 'r_offset', 'r_att_cache', - 'r_cnn_cache', 'r_cache_mask'] - input_tensors = (chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - output_names.pop(6) - - all_names = input_names + output_names - dynamic_axes = {} - for name in all_names: - # only the first dimension is dynamic - # all other dimension is fixed - dynamic_axes[name] = {0: 'B'} - - torch.onnx.export(encoder, - input_tensors, - encoder_onnx_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - verbose=False) - - with torch.no_grad(): - torch_outs = encoder(chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - torch_outs = list(torch_outs).pop(6) - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=["CUDAExecutionProvider"]) - ort_inputs = {} - - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - if transformer: - del ort_inputs['cnn_cache'] - ort_outs = ort_session.run(None, ort_inputs) - test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx streaming encoder succeed!") - onnx_config = { - "subsampling_rate": subsampling, - "context": context, - "decoding_chunk_size": decoding_chunk_size, - "num_decoding_left_chunks": num_decoding_left_chunks, - "beam_size": args.beam_size, - "fp16": args.fp16, - "feat_size": feature_size, - "decoding_window": decoding_window, - "cnn_module_kernel_cache": cnn_module_kernel - } - return onnx_config - - -def export_rescoring_decoder(model, configs, args, - logger, decoder_onnx_path, decoder_fastertransformer): - bz, seq_len = 32, 100 - beam_size = args.beam_size - decoder = Decoder(model.decoder, - model.ctc_weight, - model.reverse_weight, - beam_size, - decoder_fastertransformer) - decoder.eval() - - hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - hyps_lens_sos = torch.randint(low=3, high=seq_len, size=(bz, beam_size), - dtype=torch.int32) - r_hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - - output_size = configs["encoder_conf"]["output_size"] - encoder_out = torch.randn(bz, seq_len, output_size, dtype=torch.float32) - encoder_out_lens = torch.randint( - low=3, high=seq_len, size=(bz,), dtype=torch.int32) - ctc_score = torch.randn(bz, beam_size, dtype=torch.float32) - - input_names = ['encoder_out', 'encoder_out_lens', - 'hyps_pad_sos_eos', 'hyps_lens_sos', - 'r_hyps_pad_sos_eos', 'ctc_score'] - output_names = ['best_index'] - if decoder_fastertransformer: - output_names.insert(0, 'decoder_out') - - 
torch.onnx.export(decoder, - (encoder_out, encoder_out_lens, - hyps_pad_sos_eos, hyps_lens_sos, - r_hyps_pad_sos_eos, ctc_score), - decoder_onnx_path, - export_params=True, - opset_version=13, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes={'encoder_out': {0: 'B', 1: 'T'}, - 'encoder_out_lens': {0: 'B'}, - 'hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'hyps_lens_sos': {0: 'B'}, - 'r_hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'ctc_score': {0: 'B'}, - 'best_index': {0: 'B'}, - }, - verbose=False - ) - with torch.no_grad(): - o0 = decoder(encoder_out, - encoder_out_lens, - hyps_pad_sos_eos, - hyps_lens_sos, - r_hyps_pad_sos_eos, - ctc_score) - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(decoder_onnx_path, - providers=providers) - - input_tensors = [encoder_out, encoder_out_lens, hyps_pad_sos_eos, - hyps_lens_sos, r_hyps_pad_sos_eos, ctc_score] - ort_inputs = {} - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - - # if model.reverse weight == 0, - # the r_hyps_pad will be removed - # from the onnx decoder since it doen't play any role - if model.reverse_weight == 0: - del ort_inputs['r_hyps_pad_sos_eos'] - ort_outs = ort_session.run(None, ort_inputs) - - # check decoder output - if decoder_fastertransformer: - test(to_numpy(o0), ort_outs, rtol=1e-03, atol=1e-05) - else: - test(to_numpy([o0]), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx decoder succeed!") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='export x86_gpu model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--cmvn_file', required=False, default='', type=str, - help='global_cmvn file, default path is in config file') - parser.add_argument('--reverse_weight', default=-1.0, type=float, - required=False, - help='reverse weight for bitransformer,' + - 'default value is in config file') - parser.add_argument('--ctc_weight', default=-1.0, type=float, - required=False, - help='ctc weight, default value is in config file') - parser.add_argument('--batch_size', type=int, default=24, help='encoder batch size') - parser.add_argument('--seq_len', default=512, type=int, required=False, - help="Encoder seq_len") - parser.add_argument('--beam_size', default=10, type=int, required=False, - help="beam size would be ctc output size") - parser.add_argument('--output_onnx_dir', - default="onnx_model", - help='output onnx encoder and decoder directory') - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - # arguments for streaming encoder - parser.add_argument('--streaming', - action='store_true', - help="whether to export streaming encoder, default false") - parser.add_argument('--decoding_chunk_size', - default=16, - type=int, - required=False, - help='the decoding chunk size, <=0 is not supported') - parser.add_argument('--num_decoding_left_chunks', - default=5, - type=int, - required=False, - help="number of left chunks, <= 0 is not supported") - parser.add_argument('--decoder_fastertransformer', - action='store_true', - help='return decoder_out and best_index for ft') - args = parser.parse_args() - - torch.manual_seed(0) - torch.set_printoptions(precision=10) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if 
args.cmvn_file and os.path.exists(args.cmvn_file): - configs['cmvn_file'] = args.cmvn_file - if args.reverse_weight != -1.0 and 'reverse_weight' in configs['model_conf']: - configs['model_conf']['reverse_weight'] = args.reverse_weight - print("Update reverse weight to", args.reverse_weight) - if args.ctc_weight != -1: - print("Update ctc weight to ", args.ctc_weight) - configs['model_conf']['ctc_weight'] = args.ctc_weight - configs["encoder_conf"]["use_dynamic_chunk"] = False - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - - if not os.path.exists(args.output_onnx_dir): - os.mkdir(args.output_onnx_dir) - encoder_onnx_path = os.path.join(args.output_onnx_dir, 'encoder.onnx') - export_enc_func = None - if args.streaming: - assert args.decoding_chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - export_enc_func = export_online_encoder - else: - export_enc_func = export_offline_encoder_static - - onnx_config = export_enc_func( - model, configs, args, logger, encoder_onnx_path) - - decoder_onnx_path = os.path.join(args.output_onnx_dir, 'decoder.onnx') - export_rescoring_decoder(model, configs, args, logger, - decoder_onnx_path, args.decoder_fastertransformer) - - if args.fp16: - try: - import onnxmltools - from onnxmltools.utils.float16_converter import convert_float_to_float16 - except ImportError: - print('Please install onnxmltools!') - sys.exit(1) - encoder_onnx_model = onnxmltools.utils.load_model(encoder_onnx_path) - encoder_onnx_model = convert_float_to_float16(encoder_onnx_model) - encoder_onnx_path = os.path.join( - args.output_onnx_dir, 'encoder_fp16.onnx') - onnxmltools.utils.save_model(encoder_onnx_model, encoder_onnx_path) - decoder_onnx_model = onnxmltools.utils.load_model(decoder_onnx_path) - decoder_onnx_model = convert_float_to_float16(decoder_onnx_model) - decoder_onnx_path = os.path.join( - args.output_onnx_dir, 'decoder_fp16.onnx') - onnxmltools.utils.save_model(decoder_onnx_model, decoder_onnx_path) - # dump configurations - - config_dir = os.path.join(args.output_onnx_dir, "config.yaml") - with open(config_dir, "w") as out: - yaml.dump(onnx_config, out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/recognize.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/recognize.py deleted file mode 100644 index 03b5dfd42cc098efacd20e08756a5300f6477cc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/recognize.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
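# NOTE: minimal sketch of the FP16 conversion step performed at the end of the export
# script above, shown as a self-contained helper. It only uses the onnxmltools calls
# already present in that script; the file paths here are placeholders, not paths
# mandated by this repo.
import onnxmltools
from onnxmltools.utils.float16_converter import convert_float_to_float16


def convert_onnx_to_fp16(src_path, dst_path):
    """Load an FP32 ONNX graph, cast weights/activations to FP16, and save it."""
    model = onnxmltools.utils.load_model(src_path)
    model_fp16 = convert_float_to_float16(model)
    onnxmltools.utils.save_model(model_fp16, dst_path)


if __name__ == "__main__":
    # Hypothetical file names; substitute the encoder/decoder ONNX files produced by the exporter.
    convert_onnx_to_fp16("encoder.onnx", "encoder_fp16.onnx")
    convert_onnx_to_fp16("decoder.onnx", "decoder_fp16.onnx")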
- -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--beam_size', - type=int, - default=10, - help='beam size for search') - parser.add_argument('--penalty', - type=float, - default=0.0, - help='length penalty') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=16, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'attention', 'ctc_greedy_search', - 'ctc_prefix_beam_search', 'attention_rescoring', - 'rnnt_greedy_search', 'rnnt_beam_search', - 'rnnt_beam_attn_rescoring', 'ctc_beam_td_attn_rescoring', - 'hlg_onebest', 'hlg_rescore' - ], - default='attention', - help='decoding mode') - - parser.add_argument('--search_ctc_weight', - type=float, - default=1.0, - help='ctc weight for nbest generation') - parser.add_argument('--search_transducer_weight', - type=float, - default=0.0, - help='transducer weight for nbest generation') - parser.add_argument('--ctc_weight', - type=float, - default=0.0, - help='ctc weight for rescoring weight in \ - attention rescoring decode mode \ - ctc weight for rescoring weight in \ - transducer attention rescore decode mode') - - parser.add_argument('--transducer_weight', - type=float, - default=0.0, - help='transducer weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--attn_weight', - type=float, - default=0.0, - help='attention weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--decoding_chunk_size', - type=int, - default=-1, - help='''decoding chunk size, - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here''') - parser.add_argument('--num_decoding_left_chunks', - type=int, - default=-1, - help='number of left chunks for decoding') - parser.add_argument('--simulate_streaming', - action='store_true', - help='simulate streaming inference') - parser.add_argument('--reverse_weight', - type=float, - default=0.0, - help='''right to left weight for attention rescoring - decode mode''') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--connect_symbol', - default='', - type=str, - help='used to connect the output characters') - - parser.add_argument('--word', - default='', - type=str, - help='word file, only used for hlg decode') - parser.add_argument('--hlg', - default='', - type=str, - help='hlg file, only used for hlg decode') - parser.add_argument('--lm_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--r_decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' - ] and args.batch_size > 1: - logging.fatal( - 'decoding mode {} must be running with batch_size == 1'.format( - args.mode)) - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_sub'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - if 'fbank_conf' in test_conf: - test_conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in test_conf: - test_conf['mfcc_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - # Load dict - char_dict = {v: k for k, v in symbol_table.items()} - eos = len(char_dict) - 1 - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w') as fout: - for batch_idx, 
batch in enumerate(test_data_loader): - keys, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - if args.mode == 'attention': - hyps, _ = model.recognize( - feats, - feats_lengths, - beam_size=args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp.tolist() for hyp in hyps] - elif args.mode == 'ctc_greedy_search': - hyps, _ = model.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_greedy_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_beam_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.beam_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.search_ctc_weight, - transducer_weight=args.search_transducer_weight) - elif args.mode == 'rnnt_beam_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight) - elif args.mode == 'ctc_beam_td_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight, - beam_search_type='ctc') - # ctc_prefix_beam_search and attention_rescoring only return one - # result in List[int], change it to List[List[int]] for compatible - # with other batch decoding mode - elif args.mode == 'ctc_prefix_beam_search': - assert (feats.size(0) == 1) - hyp, _ = model.ctc_prefix_beam_search( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp] - elif args.mode == 'attention_rescoring': - assert (feats.size(0) == 1) - hyp, _ = model.attention_rescoring( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - 
ctc_weight=args.ctc_weight, - simulate_streaming=args.simulate_streaming, - reverse_weight=args.reverse_weight) - hyps = [hyp] - elif args.mode == 'hlg_onebest': - hyps = model.hlg_onebest( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - elif args.mode == 'hlg_rescore': - hyps = model.hlg_rescore( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - lm_scale=args.lm_scale, - decoder_scale=args.decoder_scale, - r_decoder_scale=args.r_decoder_scale, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - for i, key in enumerate(keys): - content = [] - for w in hyps[i]: - if w == eos: - break - content.append(char_dict[w]) - logging.info('{} {}'.format(key, args.connect_symbol.join(content))) - fout.write('{} {}\n'.format(key, args.connect_symbol.join(content))) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/recognize_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/recognize_onnx_gpu.py deleted file mode 100644 index 42f403bf55ac0bc51d9c754574d3479345948122..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/recognize_onnx_gpu.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is for testing exported onnx encoder and decoder from -export_onnx_gpu.py. The exported onnx models only support batch offline ASR inference. -It requires a python wrapped c++ ctc decoder. 
-Please install it by following: -https://github.com/Slyne/ctc_decoder.git -""" -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.common import IGNORE_ID -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.config import override_config - -import onnxruntime as rt -import multiprocessing -import numpy as np - -try: - from swig_decoders import map_batch, \ - ctc_beam_search_decoder_batch, \ - TrieVector, PathTrie -except ImportError: - print('Please install ctc decoders first by refering to\n' + - 'https://github.com/Slyne/ctc_decoder.git') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--encoder_onnx', required=True, help='encoder onnx file') - parser.add_argument('--decoder_onnx', required=True, help='decoder onnx file') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=32, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'ctc_greedy_search', 'ctc_prefix_beam_search', - 'attention_rescoring'], - default='attention_rescoring', - help='decoding mode') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - reverse_weight = configs["model_conf"].get("reverse_weight", 0.0) - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - test_conf['fbank_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - 
- # Init asr model from configs - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - if use_cuda: - EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - else: - EP_list = ['CPUExecutionProvider'] - - encoder_ort_session = rt.InferenceSession(args.encoder_onnx, providers=EP_list) - decoder_ort_session = None - if args.mode == "attention_rescoring": - decoder_ort_session = rt.InferenceSession(args.decoder_onnx, providers=EP_list) - - # Load dict - vocabulary = [] - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - vocabulary.append(arr[0]) - eos = sos = len(char_dict) - 1 - with torch.no_grad(), open(args.result_file, 'w') as fout: - for _, batch in enumerate(test_data_loader): - keys, feats, _, feats_lengths, _ = batch - feats, feats_lengths = feats.numpy(), feats_lengths.numpy() - if args.fp16: - feats = feats.astype(np.float16) - ort_inputs = { - encoder_ort_session.get_inputs()[0].name: feats, - encoder_ort_session.get_inputs()[1].name: feats_lengths} - ort_outs = encoder_ort_session.run(None, ort_inputs) - encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx = ort_outs - beam_size = beam_log_probs.shape[-1] - batch_size = beam_log_probs.shape[0] - num_processes = min(multiprocessing.cpu_count(), batch_size) - if args.mode == 'ctc_greedy_search': - if beam_size != 1: - log_probs_idx = beam_log_probs_idx[:, :, 0] - batch_sents = [] - for idx, seq in enumerate(log_probs_idx): - batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) - hyps = map_batch(batch_sents, vocabulary, num_processes, - True, 0) - elif args.mode in ('ctc_prefix_beam_search', "attention_rescoring"): - batch_log_probs_seq_list = beam_log_probs.tolist() - batch_log_probs_idx_list = beam_log_probs_idx.tolist() - batch_len_list = encoder_out_lens.tolist() - batch_log_probs_seq = [] - batch_log_probs_ids = [] - batch_start = [] # only effective in streaming deployment - batch_root = TrieVector() - root_dict = {} - for i in range(len(batch_len_list)): - num_sent = batch_len_list[i] - batch_log_probs_seq.append( - batch_log_probs_seq_list[i][0:num_sent]) - batch_log_probs_ids.append( - batch_log_probs_idx_list[i][0:num_sent]) - root_dict[i] = PathTrie() - batch_root.append(root_dict[i]) - batch_start.append(True) - score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq, - batch_log_probs_ids, - batch_root, - batch_start, - beam_size, - num_processes, - 0, -2, 0.99999) - if args.mode == 'ctc_prefix_beam_search': - hyps = [] - for cand_hyps in score_hyps: - hyps.append(cand_hyps[0][1]) - hyps = map_batch(hyps, vocabulary, num_processes, False, 0) - if args.mode == 'attention_rescoring': - ctc_score, all_hyps = [], [] - max_len = 0 - for hyps in score_hyps: - cur_len = len(hyps) - if len(hyps) < beam_size: - hyps += (beam_size - cur_len) * [(-float("INF"), (0,))] - cur_ctc_score = [] - for hyp in hyps: - cur_ctc_score.append(hyp[0]) - all_hyps.append(list(hyp[1])) - if len(hyp[1]) > max_len: - max_len = len(hyp[1]) - ctc_score.append(cur_ctc_score) - if args.fp16: - ctc_score = np.array(ctc_score, dtype=np.float16) - else: - ctc_score = np.array(ctc_score, dtype=np.float32) - hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - r_hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32) - k = 0 - for i in 
range(batch_size): - for j in range(beam_size): - cand = all_hyps[k] - l = len(cand) + 2 - hyps_pad_sos_eos[i][j][0:l] = [sos] + cand + [eos] - r_hyps_pad_sos_eos[i][j][0:l] = [sos] + cand[::-1] + [eos] - hyps_lens_sos[i][j] = len(cand) + 1 - k += 1 - decoder_ort_inputs = { - decoder_ort_session.get_inputs()[0].name: encoder_out, - decoder_ort_session.get_inputs()[1].name: encoder_out_lens, - decoder_ort_session.get_inputs()[2].name: hyps_pad_sos_eos, - decoder_ort_session.get_inputs()[3].name: hyps_lens_sos, - decoder_ort_session.get_inputs()[-1].name: ctc_score} - if reverse_weight > 0: - r_hyps_pad_sos_eos_name = decoder_ort_session.get_inputs()[4].name - decoder_ort_inputs[r_hyps_pad_sos_eos_name] = r_hyps_pad_sos_eos - best_index = decoder_ort_session.run(None, decoder_ort_inputs)[0] - best_sents = [] - k = 0 - for idx in best_index: - cur_best_sent = all_hyps[k: k + beam_size][idx] - best_sents.append(cur_best_sent) - k += beam_size - hyps = map_batch(best_sents, vocabulary, num_processes) - - for i, key in enumerate(keys): - content = hyps[i] - logging.info('{} {}'.format(key, content)) - fout.write('{} {}\n'.format(key, content)) - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/train.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/train.py deleted file mode 100644 index 70799b60790b31d73911770891f519f5473e2f4b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/bin/train.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
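# NOTE: minimal sketch (toy data) of how the attention-rescoring decoder inputs above
# are assembled: each beam hypothesis is framed with sos/eos, its reverse is framed the
# same way, and both are right-padded with IGNORE_ID. The sos/eos value is assumed
# (the script above uses len(char_dict) - 1); IGNORE_ID = -1 follows wenet's convention.
import numpy as np

IGNORE_ID = -1
sos = eos = 99                      # assumed: last index of a toy vocabulary

hyps = [[10, 11, 12], [10, 11]]     # toy beam of 2 hypotheses for one utterance
beam_size = len(hyps)
max_len = max(len(h) for h in hyps)

hyps_pad_sos_eos = np.full((1, beam_size, max_len + 2), IGNORE_ID, dtype=np.int64)
r_hyps_pad_sos_eos = np.full((1, beam_size, max_len + 2), IGNORE_ID, dtype=np.int64)
hyps_lens_sos = np.ones((1, beam_size), dtype=np.int32)

for j, cand in enumerate(hyps):
    l = len(cand) + 2
    hyps_pad_sos_eos[0, j, :l] = [sos] + cand + [eos]
    r_hyps_pad_sos_eos[0, j, :l] = [sos] + cand[::-1] + [eos]
    hyps_lens_sos[0, j] = len(cand) + 1   # sos + tokens, eos excluded

print(hyps_pad_sos_eos[0])
print(hyps_lens_sos)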
- -from __future__ import print_function - -import argparse -import copy -import logging -import os - -import torch -import torch.distributed as dist -import torch.optim as optim -import yaml -from tensorboardX import SummaryWriter -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import (load_checkpoint, save_checkpoint, - load_trained_modules) -from wenet.utils.executor import Executor -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.scheduler import WarmupLR, NoamHoldAnnealing -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--train_data', required=True, help='train data file') - parser.add_argument('--cv_data', required=True, help='cv data file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--model_dir', required=True, help='save model dir') - parser.add_argument('--checkpoint', help='checkpoint model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='tensorboard log dir') - parser.add_argument('--ddp.rank', - dest='rank', - default=0, - type=int, - help='global rank for distributed training') - parser.add_argument('--ddp.world_size', - dest='world_size', - default=-1, - type=int, - help='''number of total processes/gpus for - distributed training''') - parser.add_argument('--ddp.dist_backend', - dest='dist_backend', - default='nccl', - choices=['nccl', 'gloo'], - help='distributed backend') - parser.add_argument('--ddp.init_method', - dest='init_method', - default=None, - help='ddp init method') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for reading') - parser.add_argument('--pin_memory', - action='store_true', - default=False, - help='Use pinned memory buffers used for reading') - parser.add_argument('--use_amp', - action='store_true', - default=False, - help='Use automatic mixed precision training') - parser.add_argument('--fp16_grad_sync', - action='store_true', - default=False, - help='Use fp16 gradient sync for ddp') - parser.add_argument('--cmvn', default=None, help='global cmvn file') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. 
One symbol per line.") - parser.add_argument('--prefetch', - default=100, - type=int, - help='prefetch number') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument("--enc_init", - default=None, - type=str, - help="Pre-trained model to initialize encoder") - parser.add_argument("--enc_init_mods", - default="encoder.", - type=lambda s: [str(mod) for mod in s.split(",") if s != ""], - help="List of encoder modules \ - to initialize ,separated by a comma") - - - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Set random seed - torch.manual_seed(777) - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - distributed = args.world_size > 1 - if distributed: - logging.info('training on multiple gpus, this gpu {}'.format(args.gpu)) - dist.init_process_group(args.dist_backend, - init_method=args.init_method, - world_size=args.world_size, - rank=args.rank) - - symbol_table = read_symbol_table(args.symbol_table) - - train_conf = configs['dataset_conf'] - cv_conf = copy.deepcopy(train_conf) - cv_conf['speed_perturb'] = False - cv_conf['spec_aug'] = False - cv_conf['spec_sub'] = False - cv_conf['spec_trim'] = False - cv_conf['shuffle'] = False - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - train_dataset = Dataset(args.data_type, args.train_data, symbol_table, - train_conf, args.bpe_model, non_lang_syms, True) - cv_dataset = Dataset(args.data_type, - args.cv_data, - symbol_table, - cv_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - train_data_loader = DataLoader(train_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - cv_data_loader = DataLoader(cv_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - - if 'fbank_conf' in configs['dataset_conf']: - input_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - else: - input_dim = configs['dataset_conf']['mfcc_conf']['num_mel_bins'] - vocab_size = len(symbol_table) - - # Save configs to model_dir/train.yaml for inference and export - configs['input_dim'] = input_dim - configs['output_dim'] = vocab_size - configs['cmvn_file'] = args.cmvn - configs['is_json_cmvn'] = True - if args.rank == 0: - saved_config_path = os.path.join(args.model_dir, 'train.yaml') - with open(saved_config_path, 'w') as fout: - data = yaml.dump(configs) - fout.write(data) - - # Init asr model from configs - model = init_model(configs) - print(model) - num_params = sum(p.numel() for p in model.parameters()) - print('the number of model params: {:,d}'.format(num_params)) - - # !!!IMPORTANT!!! 
- # Try to export the model by script, if fails, we should refine - # the code to satisfy the script export requirements - if args.rank == 0: - script_model = torch.jit.script(model) - script_model.save(os.path.join(args.model_dir, 'init.zip')) - executor = Executor() - # If specify checkpoint, load some info from checkpoint - if args.checkpoint is not None: - infos = load_checkpoint(model, args.checkpoint) - elif args.enc_init is not None: - logging.info('load pretrained encoders: {}'.format(args.enc_init)) - infos = load_trained_modules(model, args) - else: - infos = {} - start_epoch = infos.get('epoch', -1) + 1 - cv_loss = infos.get('cv_loss', 0.0) - step = infos.get('step', -1) - - num_epochs = configs.get('max_epoch', 100) - model_dir = args.model_dir - writer = None - if args.rank == 0: - os.makedirs(model_dir, exist_ok=True) - exp_id = os.path.basename(model_dir) - writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id)) - - if distributed: - assert (torch.cuda.is_available()) - # cuda model is required for nn.parallel.DistributedDataParallel - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, find_unused_parameters=True) - device = torch.device("cuda") - if args.fp16_grad_sync: - from torch.distributed.algorithms.ddp_comm_hooks import ( - default as comm_hooks, - ) - model.register_comm_hook( - state=None, hook=comm_hooks.fp16_compress_hook - ) - else: - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - if configs['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['optim_conf']) - elif configs['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['optim']) - if configs['scheduler'] == 'warmuplr': - scheduler = WarmupLR(optimizer, **configs['scheduler_conf']) - elif configs['scheduler'] == 'NoamHoldAnnealing': - scheduler = NoamHoldAnnealing(optimizer, **configs['scheduler_conf']) - else: - raise ValueError("unknown scheduler: " + configs['scheduler']) - - final_epoch = None - configs['rank'] = args.rank - configs['is_distributed'] = distributed - configs['use_amp'] = args.use_amp - if start_epoch == 0 and args.rank == 0: - save_model_path = os.path.join(model_dir, 'init.pt') - save_checkpoint(model, save_model_path) - - # Start training loop - executor.step = step - scheduler.set_step(step) - # used for pytorch amp mixed precision training - scaler = None - if args.use_amp: - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(start_epoch, num_epochs): - train_dataset.set_epoch(epoch) - configs['epoch'] = epoch - lr = optimizer.param_groups[0]['lr'] - logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr)) - executor.train(model, optimizer, scheduler, train_data_loader, device, - writer, configs, scaler) - total_loss, num_seen_utts = executor.cv(model, cv_data_loader, device, - configs) - cv_loss = total_loss / num_seen_utts - - logging.info('Epoch {} CV info cv_loss {}'.format(epoch, cv_loss)) - if args.rank == 0: - save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch)) - save_checkpoint( - model, save_model_path, { - 'epoch': epoch, - 'lr': lr, - 'cv_loss': cv_loss, - 'step': executor.step - }) - writer.add_scalar('epoch/cv_loss', cv_loss, epoch) - writer.add_scalar('epoch/lr', lr, epoch) - final_epoch = epoch - - if final_epoch is not None and args.rank == 0: - final_model_path = os.path.join(model_dir, 'final.pt') 
- os.remove(final_model_path) if os.path.exists(final_model_path) else None - os.symlink('{}.pt'.format(final_epoch), final_model_path) - writer.close() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/dataset/dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/dataset/dataset.py deleted file mode 100644 index 6d799b5b5aea2d34546484b3fed5d45e2d5b6aa6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/dataset/dataset.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import torch -import torch.distributed as dist -from torch.utils.data import IterableDataset - -import wenet.dataset.processor as processor -from wenet.utils.file_utils import read_lists - - -class Processor(IterableDataset): - def __init__(self, source, f, *args, **kw): - assert callable(f) - self.source = source - self.f = f - self.args = args - self.kw = kw - - def set_epoch(self, epoch): - self.source.set_epoch(epoch) - - def __iter__(self): - """ Return an iterator over the source dataset processed by the - given processor. 
- """ - assert self.source is not None - assert callable(self.f) - return self.f(iter(self.source), *self.args, **self.kw) - - def apply(self, f): - assert callable(f) - return Processor(self, f, *self.args, **self.kw) - - -class DistributedSampler: - def __init__(self, shuffle=True, partition=True): - self.epoch = -1 - self.update() - self.shuffle = shuffle - self.partition = partition - - def update(self): - assert dist.is_available() - if dist.is_initialized(): - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() - else: - self.rank = 0 - self.world_size = 1 - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: - self.worker_id = 0 - self.num_workers = 1 - else: - self.worker_id = worker_info.id - self.num_workers = worker_info.num_workers - return dict(rank=self.rank, - world_size=self.world_size, - worker_id=self.worker_id, - num_workers=self.num_workers) - - def set_epoch(self, epoch): - self.epoch = epoch - - def sample(self, data): - """ Sample data according to rank/world_size/num_workers - - Args: - data(List): input data list - - Returns: - List: data list after sample - """ - data = list(range(len(data))) - # TODO(Binbin Zhang): fix this - # We can not handle uneven data for CV on DDP, so we don't - # sample data by rank, that means every GPU gets the same - # and all the CV data - if self.partition: - if self.shuffle: - random.Random(self.epoch).shuffle(data) - data = data[self.rank::self.world_size] - data = data[self.worker_id::self.num_workers] - return data - - -class DataList(IterableDataset): - def __init__(self, lists, shuffle=True, partition=True): - self.lists = lists - self.sampler = DistributedSampler(shuffle, partition) - - def set_epoch(self, epoch): - self.sampler.set_epoch(epoch) - - def __iter__(self): - sampler_info = self.sampler.update() - indexes = self.sampler.sample(self.lists) - for index in indexes: - # yield dict(src=src) - data = dict(src=self.lists[index]) - data.update(sampler_info) - yield data - - -def Dataset(data_type, - data_list_file, - symbol_table, - conf, - bpe_model=None, - non_lang_syms=None, - partition=True): - """ Construct dataset from arguments - - We have two shuffle stage in the Dataset. The first is global - shuffle at shards tar/raw file level. The second is global shuffle - at training samples level. 
- - Args: - data_type(str): raw/shard - bpe_model(str): model for english bpe part - partition(bool): whether to do data partition in terms of rank - """ - assert data_type in ['raw', 'shard'] - lists = read_lists(data_list_file) - shuffle = conf.get('shuffle', True) - dataset = DataList(lists, shuffle=shuffle, partition=partition) - if data_type == 'shard': - dataset = Processor(dataset, processor.url_opener) - dataset = Processor(dataset, processor.tar_file_and_group) - else: - dataset = Processor(dataset, processor.parse_raw) - - dataset = Processor(dataset, processor.tokenize, symbol_table, bpe_model, - non_lang_syms, conf.get('split_with_space', False)) - filter_conf = conf.get('filter_conf', {}) - dataset = Processor(dataset, processor.filter, **filter_conf) - - resample_conf = conf.get('resample_conf', {}) - dataset = Processor(dataset, processor.resample, **resample_conf) - - speed_perturb = conf.get('speed_perturb', False) - if speed_perturb: - dataset = Processor(dataset, processor.speed_perturb) - - feats_type = conf.get('feats_type', 'fbank') - assert feats_type in ['fbank', 'mfcc'] - if feats_type == 'fbank': - fbank_conf = conf.get('fbank_conf', {}) - dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) - elif feats_type == 'mfcc': - mfcc_conf = conf.get('mfcc_conf', {}) - dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf) - - spec_aug = conf.get('spec_aug', True) - spec_sub = conf.get('spec_sub', False) - spec_trim = conf.get('spec_trim', False) - if spec_aug: - spec_aug_conf = conf.get('spec_aug_conf', {}) - dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) - if spec_sub: - spec_sub_conf = conf.get('spec_sub_conf', {}) - dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf) - if spec_trim: - spec_trim_conf = conf.get('spec_trim_conf', {}) - dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf) - - if shuffle: - shuffle_conf = conf.get('shuffle_conf', {}) - dataset = Processor(dataset, processor.shuffle, **shuffle_conf) - - sort = conf.get('sort', True) - if sort: - sort_conf = conf.get('sort_conf', {}) - dataset = Processor(dataset, processor.sort, **sort_conf) - - batch_conf = conf.get('batch_conf', {}) - dataset = Processor(dataset, processor.batch, **batch_conf) - dataset = Processor(dataset, processor.padding) - return dataset diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/dataset/kaldi_io.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/dataset/kaldi_io.py deleted file mode 100644 index c9bef293c93d882147bb5b738e1fc49a7a19a484..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/dataset/kaldi_io.py +++ /dev/null @@ -1,666 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! 
To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. - """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. 
- """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int_scp(file_or_fd): - """ generator(key,vec) = read_vec_int_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_int_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:vec for key,mat in kaldi_io.read_vec_int_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_int(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
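For reference, the binary int-vector layout parsed by `read_vec_int` above can be exercised in isolation. Below is a minimal, self-contained sketch (the names `pack_vec_int`/`unpack_vec_int` are illustrative, not part of kaldi_io) that round-trips a small vector through the same byte layout: the '\0B' binary flag, an int32 dimension, then one `(int8 size, int32 value)` pair per element.

```python
import io
import numpy as np

def pack_vec_int(values):
    # '\0B' binary flag, '\4' int-size byte, int32 dimension,
    # then each element stored as an (int8 size == 4, int32 value) pair.
    buf = b'\0B\x04' + np.int32(len(values)).tobytes()
    for v in values:
        buf += b'\x04' + np.int32(v).tobytes()
    return buf

def unpack_vec_int(buf):
    fd = io.BytesIO(buf)
    assert fd.read(2) == b'\0B' and fd.read(1) == b'\x04'
    dim = int(np.frombuffer(fd.read(4), dtype='int32', count=1)[0])
    vec = np.frombuffer(fd.read(dim * 5), count=dim,
                        dtype=[('size', 'int8'), ('value', 'int32')])
    return vec['value']

print(unpack_vec_int(pack_vec_int([3, 1, 4, 1, 5])))  # [3 1 4 1 5]
```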
- # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! 
- if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Mapping for percentiles in col-headers, - def uint16_to_float(value, min, range): - return np.float32(min + range * 1.52590218966964e-05 * value) - - # Mapping for matrix elements, - def uint8_to_float_v2(vec, p0, p25, p75, p100): - # Split the vector by masks, - mask_0_64 = (vec <= 64); - mask_193_255 = (vec > 192); - mask_65_192 = (~(mask_0_64 | mask_193_255)); - # Sanity check (useful but slow...), - # assert(len(vec) == np.sum(np.hstack([mask_0_64,mask_65_192,mask_193_255]))) - # assert(len(vec) == np.sum(np.any([mask_0_64,mask_65_192,mask_193_255], axis=0))) - # Build the float vector, - ans = np.empty(len(vec), dtype='float32') - ans[mask_0_64] = p0 + (p25 - p0) / 64. * vec[mask_0_64] - ans[mask_65_192] = p25 + (p75 - p25) / 128. * (vec[mask_65_192] - 64) - ans[mask_193_255] = p75 + (p100 - p75) / 63. * (vec[mask_193_255] - 192) - return ans - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.empty((cols,rows), dtype='float32') - for i, col_header in enumerate(col_headers): - col_header_flt = [ uint16_to_float(percentile, globmin, globrange) for percentile in col_header ] - mat[i] = uint8_to_float_v2(data[i], *col_header_flt) - - return mat.T # transpose! col-major -> row-major, - -def write_ark_scp(key, mat, ark_fout, scp_out): - mat_offset = write_mat(ark_fout, mat, key) - scp_line = '{}\t{}:{}'.format(key, ark_fout.name, mat_offset) - scp_out.write(scp_line) - scp_out.write('\n') - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. 
- - Example of writing single matrix: - kaldi_io.write_mat(filename, mat) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,mat in dict.iteritems(): - kaldi_io.write_mat(f, mat, key=key) - """ - mat_offset = 0 - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - mat_offset = fd.tell() - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if m.dtype == 'float32': fd.write('FM '.encode()) - elif m.dtype == 'float64': fd.write('DM '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) - # Dims, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols - # Data, - fd.write(m.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - return mat_offset - -################################################# -# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) -# Corresponds to: vector > > -# - outer vector: time axis -# - inner vector: records at the time -# - tuple: int = index, float = value -# - -def read_cnet_ark(file_or_fd): - """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ - return read_post_ark(file_or_fd) - -def read_post_ark(file_or_fd): - """ generator(key,vec>) = read_post_ark(file) - Returns generator of (key,posterior) tuples, read from ark file. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Iterate the ark: - for key,post in kaldi_io.read_post_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - post = read_post(fd) - yield key, post - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_post(file_or_fd): - """ [post] = read_post(file_or_fd) - Reads single kaldi 'Posterior' in binary format. - - The 'Posterior' is C++ type 'vector > >', - the outer-vector is usually time axis, inner-vector are the records - at given time, and the tuple is composed of an 'index' (integer) - and a 'float-value'. The 'float-value' can represent a probability - or any other numeric value. - - Returns vector of vectors of tuples. - """ - fd = open_or_fd(file_or_fd) - ans=[] - binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - # Loop over 'outer-vector', - for i in range(outer_vec_size): - assert(fd.read(1).decode() == '\4'); # int-size - inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) - data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) - assert(data[0]['size_idx'] == 4) - assert(data[0]['size_post'] == 4) - ans.append(data[['idx','post']].tolist()) - - if fd is not file_or_fd: fd.close() - return ans - - -################################################# -# Kaldi Confusion Network bin begin/end times, -# (kaldi stores CNs time info separately from the Posterior). 
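A hedged usage sketch of the writer/reader pair defined in this file: write a couple of float32 feature matrices to an ark file with `write_mat`, then stream them back with `read_mat_ark`. It assumes the module is importable as `kaldi_io` and that writing a local `feats.ark` is acceptable.

```python
import numpy as np
import kaldi_io  # assumes this module is importable on PYTHONPATH

feats = {
    'utt001': np.random.rand(98, 80).astype('float32'),
    'utt002': np.random.rand(123, 80).astype('float32'),
}

# write_mat() asserts the file is opened in binary mode ('wb').
with open('feats.ark', 'wb') as f:
    for key, mat in feats.items():
        kaldi_io.write_mat(f, mat, key=key)

# Stream the ark back into a dict, as suggested by the docstrings above.
loaded = {key: mat for key, mat in kaldi_io.read_mat_ark('feats.ark')}
assert all(np.allclose(feats[k], loaded[k]) for k in feats)
```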
-# - -def read_cntime_ark(file_or_fd): - """ generator(key,vec>) = read_cntime_ark(file_or_fd) - Returns generator of (key,cntime) tuples, read from ark file. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Iterate the ark: - for key,time in kaldi_io.read_cntime_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:time for key,time in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - cntime = read_cntime(fd) - yield key, cntime - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_cntime(file_or_fd): - """ [cntime] = read_cntime(file_or_fd) - Reads single kaldi 'Confusion Network time info', in binary format: - C++ type: vector >. - (begin/end times of bins at the confusion network). - - Binary layout is ' ...' - - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Returns vector of tuples. - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary - - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) - assert(data[0]['size_beg'] == 4) - assert(data[0]['size_end'] == 4) - ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), - - if fd is not file_or_fd : fd.close() - return ans - - -################################################# -# Segments related, -# - -# Segments as 'Bool vectors' can be handy, -# - for 'superposing' the segmentations, -# - for frame-selection in Speaker-ID experiments, -def read_segments_as_bool_vec(segments_file): - """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) - using kaldi 'segments' file for 1 wav, format : ' ' - - t-beg, t-end is in seconds, - - assumed 100 frames/second, - """ - segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) - # Sanity checks, - assert(len(segs) > 0) # empty segmentation is an error, - assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, - # Convert time to frame-indexes, - start = np.rint([100 * rec[2] for rec in segs]).astype(int) - end = np.rint([100 * rec[3] for rec in segs]).astype(int) - # Taken from 'read_lab_to_bool_vec', htk.py, - frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], - np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) - assert np.sum(end-start) == np.sum(frms) - return frms - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/dataset/processor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/dataset/processor.py deleted file mode 100644 index b4bd07ce674eb3288cd1b13a09085eec48d40845..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/dataset/processor.py +++ /dev/null @@ -1,660 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
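As a worked example of the frame-mask construction in `read_segments_as_bool_vec` above, with made-up segment times (0.10 s to 0.50 s and 1.00 s to 1.20 s at the assumed 100 frames/second):

```python
import numpy as np

# Two segments of one wav, expressed in frames at 100 frames/second.
start = np.array([10, 100])   # segment starts
end = np.array([50, 120])     # segment ends

# Interleave gap/speech run lengths and expand them into a bool vector.
frms = np.repeat(np.r_[np.tile([False, True], len(end)), False],
                 np.r_[np.c_[start - np.r_[0, end[:-1]], end - start].flat, 0])
print(len(frms), int(frms.sum()))  # 120 60 -> 60 speech frames out of 120
```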
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import json -import random -import re -import tarfile -from subprocess import PIPE, Popen -from urllib.parse import urlparse - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.nn.utils.rnn import pad_sequence - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def url_opener(data): - """ Give url or local file, return file descriptor - Inplace operation. - - Args: - data(Iterable[str]): url or local file list - - Returns: - Iterable[{src, stream}] - """ - for sample in data: - assert 'src' in sample - # TODO(Binbin Zhang): support HTTP - url = sample['src'] - try: - pr = urlparse(url) - # local file - if pr.scheme == '' or pr.scheme == 'file': - stream = open(url, 'rb') - # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP - else: - cmd = f'wget -q -O - {url}' - process = Popen(cmd, shell=True, stdout=PIPE) - sample.update(process=process) - stream = process.stdout - sample.update(stream=stream) - yield sample - except Exception as ex: - logging.warning('Failed to open {}'.format(url)) - - -def tar_file_and_group(data): - """ Expand a stream of open tar files into a stream of tar file contents. - And groups the file with same prefix - - Args: - data: Iterable[{src, stream}] - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'stream' in sample - stream = tarfile.open(fileobj=sample['stream'], mode="r|*") - prev_prefix = None - example = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - example['key'] = prev_prefix - if valid: - yield example - example = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - example['txt'] = file_obj.read().decode('utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load(file_obj) - example['wav'] = waveform - example['sample_rate'] = sample_rate - else: - example[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning('error to parse {}'.format(name)) - prev_prefix = prefix - if prev_prefix is not None: - example['key'] = prev_prefix - yield example - stream.close() - if 'process' in sample: - sample['process'].communicate() - sample['stream'].close() - - -def parse_raw(data): - """ Parse key/wav/txt from json line - - Args: - data: Iterable[str], str is a json line has key/wav/txt - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'src' in sample - json_line = sample['src'] - obj = json.loads(json_line) - assert 'key' in obj - assert 'wav' in obj - assert 'txt' in obj - key = obj['key'] - wav_file = obj['wav'] - txt = obj['txt'] - try: - if 'start' in obj: - assert 'end' in obj - sample_rate = torchaudio.backend.sox_io_backend.info( - wav_file).sample_rate - start_frame = int(obj['start'] * sample_rate) - end_frame = int(obj['end'] * sample_rate) - waveform, _ = torchaudio.backend.sox_io_backend.load( - 
filepath=wav_file, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(wav_file) - example = dict(key=key, - txt=txt, - wav=waveform, - sample_rate=sample_rate) - yield example - except Exception as ex: - logging.warning('Failed to read {}'.format(wav_file)) - - -def filter(data, - max_length=10240, - min_length=10, - token_max_length=200, - token_min_length=1, - min_output_input_ratio=0.0005, - max_output_input_ratio=1): - """ Filter sample according to feature and label length - Inplace operation. - - Args:: - data: Iterable[{key, wav, label, sample_rate}] - max_length: drop utterance which is greater than max_length(10ms) - min_length: drop utterance which is less than min_length(10ms) - token_max_length: drop utterance which is greater than - token_max_length, especially when use char unit for - english modeling - token_min_length: drop utterance which is - less than token_max_length - min_output_input_ratio: minimal ration of - token_length / feats_length(10ms) - max_output_input_ratio: maximum ration of - token_length / feats_length(10ms) - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'label' in sample - # sample['wav'] is torch.Tensor, we have 100 frames every second - num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100 - if num_frames < min_length: - continue - if num_frames > max_length: - continue - if len(sample['label']) < token_min_length: - continue - if len(sample['label']) > token_max_length: - continue - if num_frames != 0: - if len(sample['label']) / num_frames < min_output_input_ratio: - continue - if len(sample['label']) / num_frames > max_output_input_ratio: - continue - yield sample - - -def resample(data, resample_rate=16000): - """ Resample data. - Inplace operation. - - Args: - data: Iterable[{key, wav, label, sample_rate}] - resample_rate: target resample rate - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - if sample_rate != resample_rate: - sample['sample_rate'] = resample_rate - sample['wav'] = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - yield sample - - -def speed_perturb(data, speeds=None): - """ Apply speed perturb to the data. - Inplace operation. 
- - Args: - data: Iterable[{key, wav, label, sample_rate}] - speeds(List[float]): optional speed - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - if speeds is None: - speeds = [0.9, 1.0, 1.1] - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - speed = random.choice(speeds) - if speed != 1.0: - wav, _ = torchaudio.sox_effects.apply_effects_tensor( - waveform, sample_rate, - [['speed', str(speed)], ['rate', str(sample_rate)]]) - sample['wav'] = wav - - yield sample - - -def compute_fbank(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0): - """ Extract fbank - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - energy_floor=0.0, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def compute_mfcc(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0, - num_ceps=40, - high_freq=0.0, - low_freq=20.0): - """ Extract mfcc - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.mfcc(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - num_ceps=num_ceps, - high_freq=high_freq, - low_freq=low_freq, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def __tokenize_by_bpe_model(sp, txt): - tokens = [] - # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - pattern = re.compile(r'([\u4e00-\u9fff])') - # Example: - # txt = "你好 ITS'S OKAY 的" - # chars = ["你", "好", " ITS'S OKAY ", "的"] - chars = pattern.split(txt.upper()) - mix_chars = [w for w in chars if len(w.strip()) > 0] - for ch_or_w in mix_chars: - # ch_or_w is a single CJK charater(i.e., "你"), do nothing. - if pattern.fullmatch(ch_or_w) is not None: - tokens.append(ch_or_w) - # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), - # encode ch_or_w using bpe_model. 
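The CJK-versus-BPE split used here can be checked in isolation. The snippet below reproduces the behaviour described in the comment above (the input string is illustrative); the non-CJK chunks are what would then be passed to the BPE model via `sp.encode_as_pieces`.

```python
import re

# Split on CJK characters (kept as single tokens); everything else is
# left as a chunk for the BPE model to segment.
pattern = re.compile(r'([\u4e00-\u9fff])')
txt = "你好 it's okay 的"
chars = [w for w in pattern.split(txt.upper()) if len(w.strip()) > 0]
print(chars)  # ['你', '好', " IT'S OKAY ", '的']
```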
- else: - for p in sp.encode_as_pieces(ch_or_w): - tokens.append(p) - - return tokens - - -def tokenize(data, - symbol_table, - bpe_model=None, - non_lang_syms=None, - split_with_space=False): - """ Decode text to chars or BPE - Inplace operation - - Args: - data: Iterable[{key, wav, txt, sample_rate}] - - Returns: - Iterable[{key, wav, txt, tokens, label, sample_rate}] - """ - if non_lang_syms is not None: - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - else: - non_lang_syms = {} - non_lang_syms_pattern = None - - if bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(bpe_model) - else: - sp = None - - for sample in data: - assert 'txt' in sample - txt = sample['txt'].strip() - if non_lang_syms_pattern is not None: - parts = non_lang_syms_pattern.split(txt.upper()) - parts = [w for w in parts if len(w.strip()) > 0] - else: - parts = [txt] - - label = [] - tokens = [] - for part in parts: - if part in non_lang_syms: - tokens.append(part) - else: - if bpe_model is not None: - tokens.extend(__tokenize_by_bpe_model(sp, part)) - else: - if split_with_space: - part = part.split(" ") - for ch in part: - if ch == ' ': - ch = "▁" - tokens.append(ch) - - for ch in tokens: - if ch in symbol_table: - label.append(symbol_table[ch]) - elif '' in symbol_table: - label.append(symbol_table['']) - - sample['tokens'] = tokens - sample['label'] = label - yield sample - - -def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80): - """ Do spec augmentation - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - num_t_mask: number of time mask to apply - num_f_mask: number of freq mask to apply - max_t: max width of time mask - max_f: max width of freq mask - max_w: max width of time warp - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - max_freq = y.size(1) - # time mask - for i in range(num_t_mask): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - y[start:end, :] = 0 - # freq mask - for i in range(num_f_mask): - start = random.randint(0, max_freq - 1) - length = random.randint(1, max_f) - end = min(max_freq, start + length) - y[:, start:end] = 0 - sample['feat'] = y - yield sample - - -def spec_sub(data, max_t=20, num_t_sub=3): - """ Do spec substitute - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of time substitute - num_t_sub: number of time substitute to apply - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - for i in range(num_t_sub): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - # only substitute the earlier time chosen randomly for current time - pos = random.randint(0, start) - y[start:end, :] = x[start - pos:end - pos, :] - sample['feat'] = y - yield sample - - -def spec_trim(data, max_t=20): - """ Trim tailing frames. Inplace operation. 
- ref: TrimTail [https://arxiv.org/abs/2211.00522] - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of length trimming - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - max_frames = x.size(0) - length = random.randint(1, max_t) - if length < max_frames / 2: - y = x.clone().detach()[:max_frames - length] - sample['feat'] = y - yield sample - - -def shuffle(data, shuffle_size=10000): - """ Local shuffle the data - - Args: - data: Iterable[{key, feat, label}] - shuffle_size: buffer size for shuffle - - Returns: - Iterable[{key, feat, label}] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= shuffle_size: - random.shuffle(buf) - for x in buf: - yield x - buf = [] - # The sample left over - random.shuffle(buf) - for x in buf: - yield x - - -def sort(data, sort_size=500): - """ Sort the data by feature length. - Sort is used after shuffle and before batch, so we can group - utts with similar lengths into a batch, and `sort_size` should - be less than `shuffle_size` - - Args: - data: Iterable[{key, feat, label}] - sort_size: buffer size for sort - - Returns: - Iterable[{key, feat, label}] - """ - - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= sort_size: - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - buf = [] - # The sample left over - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - - -def static_batch(data, batch_size=16): - """ Static batch the data by `batch_size` - - Args: - data: Iterable[{key, feat, label}] - batch_size: batch size - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= batch_size: - yield buf - buf = [] - if len(buf) > 0: - yield buf - - -def dynamic_batch(data, max_frames_in_batch=12000): - """ Dynamic batch the data until the total frames in batch - reach `max_frames_in_batch` - - Args: - data: Iterable[{key, feat, label}] - max_frames_in_batch: max_frames in one batch - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - longest_frames = 0 - for sample in data: - assert 'feat' in sample - assert isinstance(sample['feat'], torch.Tensor) - new_sample_frames = sample['feat'].size(0) - longest_frames = max(longest_frames, new_sample_frames) - frames_after_padding = longest_frames * (len(buf) + 1) - if frames_after_padding > max_frames_in_batch: - yield buf - buf = [sample] - longest_frames = new_sample_frames - else: - buf.append(sample) - if len(buf) > 0: - yield buf - - -def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000): - """ Wrapper for static/dynamic batch - """ - if batch_type == 'static': - return static_batch(data, batch_size) - elif batch_type == 'dynamic': - return dynamic_batch(data, max_frames_in_batch) - else: - logging.fatal('Unsupported batch type {}'.format(batch_type)) - - -def padding(data): - """ Padding the data into training data - - Args: - data: Iterable[List[{key, feat, label}]] - - Returns: - Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] - """ - for sample in data: - assert isinstance(sample, list) - feats_length = torch.tensor([x['feat'].size(0) for x in sample], - dtype=torch.int32) - order = torch.argsort(feats_length, descending=True) - feats_lengths = torch.tensor( - [sample[i]['feat'].size(0) for i in order], dtype=torch.int32) - sorted_feats = [sample[i]['feat'] for i in order] - sorted_keys 
= [sample[i]['key'] for i in order] - sorted_labels = [ - torch.tensor(sample[i]['label'], dtype=torch.int64) for i in order - ] - label_lengths = torch.tensor([x.size(0) for x in sorted_labels], - dtype=torch.int32) - - padded_feats = pad_sequence(sorted_feats, - batch_first=True, - padding_value=0) - - pad = (0, 0, 0, 0) - seq_len= padded_feats.shape[1] - if seq_len < 384: - pad = (0, 0, 0, 384-seq_len) - elif seq_len < 512: - pad = (0, 0, 0, 512-seq_len) - elif seq_len < 640: - pad = (0, 0, 0, 640-seq_len) - elif seq_len < 768: - pad = (0, 0, 0, 768-seq_len) - elif seq_len < 896: - pad = (0, 0, 0, 896-seq_len) - elif seq_len < 1024: - pad = (0, 0, 0, 1024-seq_len) - elif seq_len < 1280: - pad = (0, 0, 0, 1280-seq_len) - padded_feats = torch.nn.functional.pad(padded_feats, pad, mode='constant', value=0) - padding_labels = pad_sequence(sorted_labels, - batch_first=True, - padding_value=-1) - - yield (sorted_keys, padded_feats, padding_labels, feats_lengths, - label_lengths) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/dataset/wav_distortion.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/dataset/wav_distortion.py deleted file mode 100644 index 2917d3cc6cfb801935cb0885d0c42cd88f1833b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/dataset/wav_distortion.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import random -import math - -import torchaudio -import torch -torchaudio.set_audio_backend("sox_io") - - -def db2amp(db): - return pow(10, db / 20) - -def amp2db(amp): - return 20 * math.log10(amp) - -def make_poly_distortion(conf): - """Generate a db-domain ploynomial distortion function - - f(x) = a * x^m * (1-x)^n + x - - Args: - conf: a dict {'a': #int, 'm': #int, 'n': #int} - - Returns: - The ploynomial function, which could be applied on - a float amplitude value - """ - a = conf['a'] - m = conf['m'] - n = conf['n'] - - def poly_distortion(x): - abs_x = abs(x) - if abs_x < 0.000001: - x = x - else: - db_norm = amp2db(abs_x) / 100 + 1 - if db_norm < 0: - db_norm = 0 - db_norm = a * pow(db_norm, m) * pow((1 - db_norm), n) + db_norm - if db_norm > 1: - db_norm = 1 - db = (db_norm - 1) * 100 - amp = db2amp(db) - if amp >= 0.9997: - amp = 0.9997 - if x > 0: - x = amp - else: - x = -amp - return x - return poly_distortion - -def make_quad_distortion(): - return make_poly_distortion({'a' : 1, 'm' : 1, 'n' : 1}) - -# the amplitude are set to max for all non-zero point -def make_max_distortion(conf): - """Generate a max distortion function - - Args: - conf: a dict {'max_db': float } - 'max_db': the maxium value. 
- - Returns: - The max function, which could be applied on - a float amplitude value - """ - max_db = conf['max_db'] - if max_db: - max_amp = db2amp(max_db) # < 0.997 - else: - max_amp = 0.997 - - def max_distortion(x): - if x > 0: - x = max_amp - elif x < 0: - x = -max_amp - else: - x = 0.0 - return x - return max_distortion - - - -def make_amp_mask(db_mask=None): - """Get a amplitude domain mask from db domain mask - - Args: - db_mask: Optional. A list of tuple. if None, using default value. - - Returns: - A list of tuple. The amplitude domain mask - """ - if db_mask is None: - db_mask = [(-110, -95), (-90, -80), (-65, -60), (-50, -30), (-15, 0)] - amp_mask = [(db2amp(db[0]), db2amp(db[1])) for db in db_mask] - return amp_mask - -default_mask = make_amp_mask() - - -def generate_amp_mask(mask_num): - """Generate amplitude domain mask randomly in [-100db, 0db] - - Args: - mask_num: the slot number of the mask - - Returns: - A list of tuple. each tuple defines a slot. - e.g. [(-100, -80), (-65, -60), (-50, -30), (-15, 0)] - for #mask_num = 4 - """ - a = [0] * 2 * mask_num - a[0] = 0 - m = [] - for i in range(1, 2 * mask_num): - a[i] = a[i - 1] + random.uniform(0.5, 1) - max_val = a[2 * mask_num - 1] - for i in range(0, mask_num): - l = ((a[2 * i] - max_val) / max_val) * 100 - r = ((a[2 * i + 1] - max_val) / max_val) * 100 - m.append((l, r)) - return make_amp_mask(m) - - -def make_fence_distortion(conf): - """Generate a fence distortion function - - In this fence-like shape function, the values in mask slots are - set to maxium, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': int,'max_db': float } - 'mask_number': the slot number in mask. - 'max_db': the maxium value. - - Returns: - The fence function, which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - max_db = conf['max_db'] - max_amp = db2amp(max_db) # 0.997 - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def fence_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - return x - - return fence_distortion - -# -def make_jag_distortion(conf): - """Generate a jag distortion function - - In this jag-like shape function, the values in mask slots are - not changed, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': #int} - 'mask_number': the slot number in mask. 
- - Returns: - The jag function,which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def jag_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - return x - - return jag_distortion - -# gaining 20db means amp = amp * 10 -# gaining -20db means amp = amp / 10 -def make_gain_db(conf): - """Generate a db domain gain function - - Args: - conf: a dict {'db': #float} - 'db': the gaining value - - Returns: - The db gain function, which could be applied on - a float amplitude value - """ - db = conf['db'] - - def gain_db(x): - return min(0.997, x * pow(10, db / 20)) - - return gain_db - - -def distort(x, func, rate=0.8): - """Distort a waveform in sample point level - - Args: - x: the origin wavefrom - func: the distort function - rate: sample point-level distort probability - - Returns: - the distorted waveform - """ - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - x[0][i] = func(float(x[0][i])) - return x - -def distort_chain(x, funcs, rate=0.8): - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - for func in funcs: - x[0][i] = func(float(x[0][i])) - return x - -# x is numpy -def distort_wav_conf(x, distort_type, distort_conf, rate=0.1): - if distort_type == 'gain_db': - gain_db = make_gain_db(distort_conf) - x = distort(x, gain_db) - elif distort_type == 'max_distortion': - max_distortion = make_max_distortion(distort_conf) - x = distort(x, max_distortion, rate=rate) - elif distort_type == 'fence_distortion': - fence_distortion = make_fence_distortion(distort_conf) - x = distort(x, fence_distortion, rate=rate) - elif distort_type == 'jag_distortion': - jag_distortion = make_jag_distortion(distort_conf) - x = distort(x, jag_distortion, rate=rate) - elif distort_type == 'poly_distortion': - poly_distortion = make_poly_distortion(distort_conf) - x = distort(x, poly_distortion, rate=rate) - elif distort_type == 'quad_distortion': - quad_distortion = make_quad_distortion() - x = distort(x, quad_distortion, rate=rate) - elif distort_type == 'none_distortion': - pass - else: - print('unsupport type') - return x - -def distort_wav_conf_and_save(distort_type, distort_conf, rate, wav_in, wav_out): - x, sr = torchaudio.load(wav_in) - x = x.detach().numpy() - out = distort_wav_conf(x, distort_type, distort_conf, rate) - torchaudio.save(wav_out, torch.from_numpy(out), sr) - -if __name__ == "__main__": - distort_type = sys.argv[1] - wav_in = sys.argv[2] - wav_out = sys.argv[3] - conf = None - rate = 0.1 - if distort_type == 'new_jag_distortion': - conf = {'mask_number' : 4} - elif distort_type == 'new_fence_distortion': - conf = {'mask_number' : 1, 'max_db' : -30} - elif distort_type == 'poly_distortion': - conf = {'a' : 4, 'm' : 2, "n" : 2} - distort_wav_conf_and_save(distort_type, conf, rate, wav_in, wav_out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/efficient_conformer/attention.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/efficient_conformer/attention.py deleted file mode 100644 index 475131b15af92ffcaf91ad5e2e30d114d4d1a2a3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/efficient_conformer/attention.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple, Optional - -import torch -from torch import nn -import torch.nn.functional as F -from wenet.transformer.attention import MultiHeadedAttention - - -class GroupedRelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: - https://arxiv.org/abs/1901.02860 - https://arxiv.org/abs/2109.01163 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate, group_size=3): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - self.group_size = group_size - self.d_k = n_feat // n_head # for GroupedAttention - self.n_feat = n_feat - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. 
- """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def pad4group(self, Q, K, V, P, mask, group_size: int = 3): - """ - q: (#batch, time1, size) -> (#batch, head, time1, size/head) - k,v: (#batch, time2, size) -> (#batch, head, time2, size/head) - p: (#batch, time2, size) - """ - # Compute Overflows - overflow_Q = Q.size(2) % group_size - overflow_KV = K.size(2) % group_size - - # if-else for ONNX export - # 0 // 0.00000000000000001 = 0 - # 1 // 1.00000000000000001 = 1 - padding_Q = (group_size - overflow_Q) * int( - overflow_Q // (overflow_Q + 0.00000000000000001)) - padding_KV = (group_size - overflow_KV) * int( - overflow_KV // (overflow_KV + 0.00000000000000001)) - - batch_size, _, seq_len_KV, _ = K.size() - - # Input Padding (B, T, D) -> (B, T + P, D) - Q = F.pad(Q, (0, 0, 0, padding_Q), value=0.0) - K = F.pad(K, (0, 0, 0, padding_KV), value=0.0) - V = F.pad(V, (0, 0, 0, padding_KV), value=0.0) - - if mask is not None and mask.size(2) > 0 : # time2 > 0: - mask = mask[:, ::group_size, ::group_size] - - Q = Q.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - K = K.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - V = V.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - # process pos_emb - P_batch_size = P.size(0) - overflow_P = P.size(1) % group_size - padding_P = group_size - overflow_P if overflow_P else 0 - P = F.pad(P, (0, 0, 0, padding_P), value=0.0) - P = P.view(P_batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - return Q, K, V, P, mask, padding_Q - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - padding_q: Optional[int] = None - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - padding_q : for GroupedAttention in efficent conformer - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. 
jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - - # n_feat!=h*d_k may be happened in GroupAttention - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.n_feat) - ) # (batch, time1, d_model) - if padding_q is not None: - # for GroupedAttention in efficent conformer - x = x[:, :x.size(1) - padding_q] - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q = self.linear_q(query) - k = self.linear_k(key) # (#batch, time2, size) - v = self.linear_v(value) - p = self.linear_pos(pos_emb) # (#batch, time2, size) - - batch_size, seq_len_KV, _ = k.size() # seq_len_KV = time2 - - # (#batch, time2, size) -> (#batch, head, time2, size/head) - q = q.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - k = k.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - v = v.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - if cache.size(0) > 0: - # use attention cache - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - new_cache = torch.cat((k, v), dim=-1) - - # May be k and p does not match. eg. time2=18+18/2=27 > mask=36/2=18 - if mask is not None and mask.size(2) > 0: - time2 = mask.size(2) - k = k[:, :, -time2:, :] - v = v[:, :, -time2:, :] - - # q k v p: (batch, head, time1, d_k) - q, k, v, p, mask, padding_q = self.pad4group(q, k, v, p, mask, self.group_size) - - # q_with_bias_u & q_with_bias_v = (batch, head, time1, d_k) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. 
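For intuition, a standalone shape sketch of the grouping step performed by `pad4group` above; the modulo used here replaces the ONNX-friendly division trick in the original, and all sizes are made up for illustration.

```python
import torch
import torch.nn.functional as F

batch, head, time, d_k, group_size = 2, 4, 10, 16, 3

# Pad the time axis to a multiple of group_size, then fold group_size
# frames into the feature dimension, so attention scores are computed
# over time/group_size positions with d_k*group_size features per head.
x = torch.randn(batch, head, time, d_k)
padding = (group_size - time % group_size) % group_size      # 2 frames here
x = F.pad(x, (0, 0, 0, padding))                             # (2, 4, 12, 16)
x = x.transpose(1, 2).contiguous().view(
    batch, -1, head, d_k * group_size).transpose(1, 2)
print(x.shape)                                               # (2, 4, 4, 48)
```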
- # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k * self.group_size) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask, padding_q), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/efficient_conformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/efficient_conformer/convolution.py deleted file mode 100644 index 52d6c1c14c0812ab3957a60a135f644833c2ad95..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/efficient_conformer/convolution.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - stride: int = 1): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - stride (int): Stride Convolution, for efficient Conformer - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. 
- # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=stride, # for depthwise_conv in StrideConv - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - self.stride = stride - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - # When export ONNX,the first cache is not None but all-zero, - # cause shape error in residual block, - # eg. cache14 + x9 = 23, 23-7+1=17 != 9 - cache = cache[:, :, -self.lorder:] - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is requried, - # However, for JIT export, here we just fake one tensor instead of - # None. 
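For the causal branch, the module either left-pads the input by `lorder` frames or prepends a cached left context, and then keeps the last `lorder` frames as the cache for the next chunk. A standalone sketch of that bookkeeping with assumed sizes (plain tensors, not the module itself):

```python
# Minimal sketch (assumed sizes): left-context handling for a causal conv module.
import torch
import torch.nn as nn

kernel_size = 15
lorder = kernel_size - 1            # frames of left context a causal conv needs
x = torch.randn(1, 256, 9)          # (batch, channels, time) for the current chunk
cache = torch.zeros(1, 256, 0)      # first chunk: no cache yet

if cache.size(2) == 0:
    # no history: pad lorder zeros on the left
    x = nn.functional.pad(x, (lorder, 0), 'constant', 0.0)
else:
    # later chunks: prepend the cached left context instead of zeros
    x = torch.cat((cache[:, :, -lorder:], x), dim=2)

new_cache = x[:, :, -lorder:]       # keep the last lorder frames for the next chunk
print(x.shape, new_cache.shape)     # torch.Size([1, 256, 23]) torch.Size([1, 256, 14])
```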
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - if mask_pad.size(2) != x.size(2): - mask_pad = mask_pad[:, :, ::self.stride] - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/efficient_conformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/efficient_conformer/encoder.py deleted file mode 100644 index dbd37f53cac86be851e2bb194354fd07eb271f11..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/efficient_conformer/encoder.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
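The convolution module removed above is a pointwise-GLU, depthwise, norm-plus-activation, pointwise stack. A standalone shape sketch with assumed sizes (plain `nn` layers, not the WeNet class) makes the channel and time bookkeeping explicit:

```python
# Minimal sketch (assumed sizes): pointwise -> GLU -> depthwise -> norm -> pointwise.
import torch
import torch.nn as nn

channels, kernel_size, time = 256, 15, 100
x = torch.randn(2, time, channels).transpose(1, 2)    # (batch, channels, time)

pointwise_conv1 = nn.Conv1d(channels, 2 * channels, kernel_size=1)
depthwise_conv = nn.Conv1d(channels, channels, kernel_size,
                           padding=(kernel_size - 1) // 2, groups=channels)
norm = nn.BatchNorm1d(channels)
activation = nn.SiLU()                                 # "swish"
pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1)

x = pointwise_conv1(x)                  # (batch, 2*channels, time)
x = nn.functional.glu(x, dim=1)         # gating halves the channels back to `channels`
x = depthwise_conv(x)                   # per-channel conv, time length preserved
x = activation(norm(x))
x = pointwise_conv2(x)
print(x.transpose(1, 2).shape)          # back to (batch, time, channels) = (2, 100, 256)
```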
-# Modified from EfficientConformer(https://github.com/burchim/EfficientConformer) -# Paper(https://arxiv.org/abs/2109.01163) - -"""Encoder definition.""" -from typing import Tuple, Optional, List, Union - -import torch -import logging -from typeguard import check_argument_types -import torch.nn.functional as F - -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.encoder_layer import ConformerEncoderLayer - -from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 -from wenet.efficient_conformer.convolution import ConvolutionModule -from wenet.efficient_conformer.attention import GroupedRelPositionMultiHeadedAttention -from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer - -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class EfficientConformerEncoder(torch.nn.Module): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - macaron_style: bool = True, - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - stride_layer_idx: Optional[Union[int, List[int]]] = 3, - stride: Optional[Union[int, List[int]]] = 2, - group_layer_idx: Optional[Union[int, List[int], tuple]] = (0, 1, 2, 3), - group_size: int = 3, - stride_kernel: bool = True, - **kwargs - ): - """Construct Efficient Conformer Encoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - macaron_style (bool): Whether to use macaron style for - positionwise layer. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. - stride_layer_idx (list): layer id with StrideConv, start from 0 - stride (list): stride size of each StrideConv in efficient conformer - group_layer_idx (list): layer id with GroupedAttention, start from 0 - group_size (int): group size of every GroupedAttention layer - stride_kernel (bool): default True. True: recompute cnn kernels with stride. 
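With `stride_kernel` enabled, each stride layer shrinks the convolution kernel for the following blocks, and every block after a stride layer runs at a coarser frame rate. A minimal sketch with an assumed config reproduces that bookkeeping as it is done in the encoder defined here:

```python
# Minimal sketch (assumed config): kernel shrinking and cumulative downsampling.
cnn_module_kernel = 15
stride_layer_idx = [3]        # blocks that downsample
stride = [2]
num_blocks = 12
stride_kernel = True

cnn_module_kernels = [cnn_module_kernel]
for s in stride:
    cnn_module_kernels.append(cnn_module_kernels[-1] // s if stride_kernel
                              else cnn_module_kernels[-1])
print(cnn_module_kernels)     # [15, 7]

def downsampling_factor(i):
    # cumulative stride seen by block i (blocks after a stride layer run coarser)
    factor = 1
    for idx, layer_id in enumerate(stride_layer_idx):
        if i > layer_id:
            factor *= stride[idx]
    return factor

print([downsampling_factor(i) for i in range(num_blocks)])  # [1, 1, 1, 1, 2, 2, ...]
```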
- """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d2": - subsampling_class = Conv2dSubsampling2 - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - logging.info(f"input_layer = {input_layer}, " - f"subsampling_class = {subsampling_class}") - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - self.input_layer = input_layer - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - activation = get_activation(activation_type) - self.num_blocks = num_blocks - self.attention_heads = attention_heads - self.cnn_module_kernel = cnn_module_kernel - self.global_chunk_size = 0 - self.chunk_feature_map = 0 - - # efficient conformer configs - self.stride_layer_idx = [stride_layer_idx] \ - if type(stride_layer_idx) == int else stride_layer_idx - self.stride = [stride] \ - if type(stride) == int else stride - self.group_layer_idx = [group_layer_idx] \ - if type(group_layer_idx) == int else group_layer_idx - self.grouped_size = group_size # group size of every GroupedAttention layer - - assert len(self.stride) == len(self.stride_layer_idx) - self.cnn_module_kernels = [cnn_module_kernel] # kernel size of each StridedConv - for i in self.stride: - if stride_kernel: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1] // i) - else: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1]) - - logging.info(f"stride_layer_idx= {self.stride_layer_idx}, " - f"stride = {self.stride}, " - f"cnn_module_kernel = {self.cnn_module_kernels}, " - f"group_layer_idx = {self.group_layer_idx}, " - f"grouped_size = {self.grouped_size}") - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - - # encoder definition - index = 0 - layers = [] - for i in range(num_blocks): - # self-attention module definition - if i in self.group_layer_idx: - encoder_selfattn_layer = GroupedRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - self.grouped_size) - else: - if pos_enc_layer_type == "no_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate) - - # conformer module definition - if i in self.stride_layer_idx: - # conformer block with downsampling - convolution_layer_args_stride = ( - output_size, 
self.cnn_module_kernels[index], activation, - cnn_module_norm, causal, True, self.stride[index]) - layers.append(StrideConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_stride) if use_cnn_module else None, - torch.nn.AvgPool1d( - kernel_size=self.stride[index], stride=self.stride[index], - padding=0, ceil_mode=True, - count_include_pad=False), # pointwise_conv_layer - dropout_rate, - normalize_before, - concat_after, - )) - index = index + 1 - else: - # conformer block - convolution_layer_args_normal = ( - output_size, self.cnn_module_kernels[index], activation, - cnn_module_norm, causal) - layers.append(ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_normal) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - )) - - self.encoders = torch.nn.ModuleList(layers) - - def set_global_chunk_size(self, chunk_size): - """Used in ONNX export. - """ - logging.info(f"set global chunk size: {chunk_size}, default is 0.") - self.global_chunk_size = chunk_size - if self.embed.subsampling_rate == 2: - self.chunk_feature_map = 2 * self.global_chunk_size + 1 - elif self.embed.subsampling_rate == 6: - self.chunk_feature_map = 6 * self.global_chunk_size + 5 - elif self.embed.subsampling_rate == 8: - self.chunk_feature_map = 8 * self.global_chunk_size + 7 - else: - self.chunk_feature_map = 4 * self.global_chunk_size + 3 - - def output_size(self) -> int: - return self._output_size - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - index = 0 # traverse stride - for i, layer in enumerate(self.encoders): - # layer return : x, mask, new_att_cache, new_cnn_cache - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if i in self.stride_layer_idx: - masks = masks[:, :, ::self.stride[index]] - chunk_masks = chunk_masks[:, ::self.stride[index], - ::self.stride[index]] - mask_pad = masks - pos_emb = pos_emb[:, ::self.stride[index], :] - index = index + 1 - - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask : mask matrix of self attention - - Returns: - torch.Tensor: output of current input xs - torch.Tensor: subsampling cache required for next chunk computation - List[torch.Tensor]: encoder layers output cache required for next - chunk computation - List[torch.Tensor]: conformer cnn cache - - """ - assert xs.size(0) == 1 - - # using downsampling factor to recover offset - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - chunk_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - chunk_masks = chunk_masks.unsqueeze(1) # (1, 1, xs-time) - - real_len = 0 - if self.global_chunk_size > 0: - # for ONNX decode simulation, padding xs to chunk_size - real_len = xs.size(1) - pad_len = self.chunk_feature_map - real_len - xs = F.pad(xs, (0, 0, 0, pad_len), value=0.0) - chunk_masks = F.pad(chunk_masks, (0, pad_len), value=0.0) - - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, chunk_masks = self.embed(xs, chunk_masks, offset) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - # shape(pos_emb) = (b=1, chunk_size, emb_size=output_size=hidden-dim) - - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) # batchPad (b=1, 1, time=chunk_size) - - if self.global_chunk_size > 0: - # for ONNX decode simulation - pos_emb = self.embed.position_encoding( - offset=max(offset - cache_t1, 0), - size=cache_t1 + self.global_chunk_size) - att_mask[:, :, -self.global_chunk_size:] = chunk_masks - mask_pad = chunk_masks.to(torch.bool) - else: - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(2) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - att_cache_trunc = xs.size(1) + \ - att_cache.size(2) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i:i + 1, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # 
shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [1, batch, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(0) - - # use repeat_interleave to new_att_cache - new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :]) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.global_chunk_size > 0 and real_len: - chunk_real_len = real_len // self.embed.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - # Keeping 1 more timestep can mitigate information leakage - # from the encoder caused by the padding - xs = xs[:, :chunk_real_len + 1, :] - - return xs, r_att_cache, r_cnn_cache - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - use_onnx=False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. - Args: - xs (torch.Tensor): (1, max_len, dim) - decoding_chunk_size (int): decoding chunk size - num_decoding_left_chunks (int): - use_onnx (bool): True for simulating ONNX model inference. 
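The chunk-by-chunk simulation steps over the raw frames with a window and stride derived from the subsampling rate and its right context. A small sketch with assumed values (chosen to mimic a Conv2dSubsampling4-style front end) shows the windowing arithmetic:

```python
# Minimal sketch (assumed values): how chunk-by-chunk decoding walks the input frames.
subsampling = 4            # frame-rate reduction of the subsampling module (assumed)
right_context = 6          # lookahead frames of the subsampling module (assumed)
context = right_context + 1
decoding_chunk_size = 16   # encoder frames produced per chunk

stride = subsampling * decoding_chunk_size                           # input frames consumed per step
decoding_window = (decoding_chunk_size - 1) * subsampling + context  # input frames read per step

num_frames = 200
for cur in range(0, num_frames - context + 1, stride):
    end = min(cur + decoding_window, num_frames)
    print(f"feed frames [{cur}, {end}) -> ~{(end - cur) // subsampling} encoder frames")
```

Adjacent windows overlap by `context - subsampling` frames, which is how the subsampling module gets its right context without an extra cache.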
- """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if use_onnx: - logging.info("Simulating for ONNX runtime ...") - att_cache: torch.Tensor = torch.zeros( - (self.num_blocks, self.attention_heads, required_cache_size, - self.output_size() // self.attention_heads * 2), - device=xs.device) - cnn_cache: torch.Tensor = torch.zeros( - (self.num_blocks, 1, self.output_size(), self.cnn_module_kernel - 1), - device=xs.device) - self.set_global_chunk_size(chunk_size=decoding_chunk_size) - else: - logging.info("Simulating for JIT runtime ...") - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - logging.info(f"-->> frame chunk msg: cur={cur}, " - f"end={end}, num_frames={end-cur}, " - f"decoding_window={decoding_window}") - if use_onnx: - att_mask: torch.Tensor = torch.ones( - (1, 1, required_cache_size + decoding_chunk_size), - dtype=torch.bool, device=xs.device) - if cur == 0: - att_mask[:, :, :required_cache_size] = 0 - else: - att_mask: torch.Tensor = torch.ones( - (0, 0, 0), dtype=torch.bool, device=xs.device) - - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - outputs.append(y) - offset += y.size(1) - - ys = torch.cat(outputs, 1) - masks = torch.ones(1, 1, ys.size(1), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/efficient_conformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/efficient_conformer/encoder_layer.py deleted file mode 100644 index 3a88ec9fca9797664ce89566e6c1d28a8f0ad5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/efficient_conformer/encoder_layer.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple -import torch -from torch import nn - - -class StrideConformerEncoderLayer(nn.Module): - """Encoder layer module. 
- Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - pointwise_conv_layer: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.pointwise_conv_layer = pointwise_conv_layer - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
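The macaron-style feed-forward branches above contribute only half of their output to the residual stream (`ff_scale = 0.5`). A minimal pre-norm sketch with generic modules and assumed sizes, not the WeNet classes:

```python
# Minimal sketch (assumed sizes): macaron half-step feed-forward residual.
import torch
import torch.nn as nn

size, ff_dim, dropout_rate = 256, 2048, 0.1
ff_scale = 0.5                                # half-step residual for macaron FFN

norm_ff_macaron = nn.LayerNorm(size, eps=1e-5)
feed_forward_macaron = nn.Sequential(
    nn.Linear(size, ff_dim), nn.SiLU(), nn.Dropout(dropout_rate), nn.Linear(ff_dim, size))
dropout = nn.Dropout(dropout_rate)

x = torch.randn(2, 50, size)                  # (batch, time, size)
residual = x
x = norm_ff_macaron(x)                        # pre-norm
x = residual + ff_scale * dropout(feed_forward_macaron(x))
print(x.shape)                                # torch.Size([2, 50, 256])
```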
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - - # add pointwise_conv for efficient conformer - # pointwise_conv_layer does not change shape - if self.pointwise_conv_layer is not None: - residual = residual.transpose(1, 2) - residual = self.pointwise_conv_layer(residual) - residual = residual.transpose(1, 2) - assert residual.size(0) == x.size(0) - assert residual.size(1) == x.size(1) - assert residual.size(2) == x.size(2) - - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/efficient_conformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/efficient_conformer/subsampling.py deleted file mode 100644 index 98b2c2228eac8e77586110686c48a7b0141458c9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/efficient_conformer/subsampling.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch -from wenet.transformer.subsampling import BaseSubsampling - - -class Conv2dSubsampling2(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU() - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * ((idim - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 2 - # 2 = (3 - 1) * 1 - self.right_context = 2 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 2. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 2. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, :-2:2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/squeezeformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/squeezeformer/attention.py deleted file mode 100644 index 97412badbe8e2c5caec81c0636d15be3f80d6b84..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/squeezeformer/attention.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 Ximalaya Inc. (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -import torch -import torch.nn as nn -from wenet.transformer.attention import MultiHeadedAttention -from typing import Tuple - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
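The Conv2dSubsampling2 module above roughly halves the time axis with a single stride-2 Conv2d and then projects the flattened channel-frequency axes back to the model dimension. A shape sketch with assumed sizes (80 mel bins in, 256-d output):

```python
# Minimal sketch (assumed sizes): shape flow through a single 2x conv subsampling.
import torch
import torch.nn as nn

idim, odim = 80, 256                       # mel bins in, model dim out
conv = nn.Sequential(nn.Conv2d(1, odim, 3, 2), nn.ReLU())
out = nn.Linear(odim * ((idim - 1) // 2), odim)

x = torch.randn(4, 100, idim)              # (batch, time, idim)
x = x.unsqueeze(1)                         # (batch, 1, time, idim)
x = conv(x)                                # (batch, odim, (time-1)//2, (idim-1)//2)
b, c, t, f = x.size()
x = out(x.transpose(1, 2).contiguous().view(b, t, c * f))
print(x.shape)                             # torch.Size([4, 49, 256]) -- time roughly halved
```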
- """ - - def __init__(self, n_head, n_feat, dropout_rate, - do_rel_shift=False, adaptive_scale=False, init_weights=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.do_rel_shift = do_rel_shift - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - self.adaptive_scale = adaptive_scale - self.ada_scale = nn.Parameter( - torch.ones([1, 1, n_feat]), requires_grad=adaptive_scale) - self.ada_bias = nn.Parameter( - torch.zeros([1, 1, n_feat]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - input_max = (self.h * self.d_k) ** -0.5 - torch.nn.init.uniform_(self.linear_q.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_q.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_pos.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.bias, -input_max, input_max) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. 
pytorch training - if mask.size(2) > 0: # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - # (batch, head, time1, time2) - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - if self.adaptive_scale: - query = self.ada_scale * query + self.ada_bias - key = self.ada_scale * key + self.ada_bias - value = self.ada_scale * value + self.ada_bias - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
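The note above relies on `torch.cat` and `torch.split` behaving as no-ops on zero-sized cache tensors, which is what lets the ONNX path always concatenate and split the cache. The doctest-style lines in the comment run as-is:

```python
# Runnable version of the zero-shaped cache check from the note above.
import torch

a = torch.ones((1, 2, 0, 4))        # empty cache: cache_t == 0
b = torch.ones((1, 2, 3, 4))        # current keys/values
c = torch.cat((a, b), dim=2)        # concatenating an empty cache is a no-op
print(torch.equal(b, c))            # True

d = torch.split(a, 2, dim=-1)       # splitting the empty cache yields equal empty halves
print(torch.equal(d[0], d[1]))      # True
```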
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - if self.do_rel_shift: - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/squeezeformer/conv2d.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/squeezeformer/conv2d.py deleted file mode 100644 index c230263396392d72f36c56d645338f2d576db898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/squeezeformer/conv2d.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Conv2d Module with Valid Padding""" - -import torch.nn.functional as F -from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional - - -class Conv2dValid(_ConvNd): - """ - Conv2d operator for VALID mode padding. 
- """ - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', # TODO: refine this type - device=None, - dtype=None, - valid_trigx: bool = False, - valid_trigy: bool = False - ) -> None: - factory_kwargs = {'device': device, 'dtype': dtype} - kernel_size_ = _pair(kernel_size) - stride_ = _pair(stride) - padding_ = padding if isinstance(padding, str) else _pair(padding) - dilation_ = _pair(dilation) - super(Conv2dValid, self).__init__( - in_channels, out_channels, kernel_size_, - stride_, padding_, dilation_, False, _pair(0), - groups, bias, padding_mode, **factory_kwargs) - self.valid_trigx = valid_trigx - self.valid_trigy = valid_trigy - - def _conv_forward( - self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): - validx, validy = 0, 0 - if self.valid_trigx: - validx = (input.size(-2) * (self.stride[-2] - 1) - 1 - + self.kernel_size[-2]) // 2 - if self.valid_trigy: - validy = (input.size(-1) * (self.stride[-1] - 1) - 1 - + self.kernel_size[-1]) // 2 - return F.conv2d(input, weight, bias, self.stride, - (validx, validy), self.dilation, self.groups) - - def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight, self.bias) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/squeezeformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/squeezeformer/convolution.py deleted file mode 100644 index 6da2ee8c98ed58fae66d66c892041037f0d6bc3a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/squeezeformer/convolution.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. 
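The Conv2dValid module above derives its padding from the input size, stride, and kernel at call time. A numeric sketch with assumed sizes (just the arithmetic and an equivalent `F.conv2d` call, not the module) shows the computed padding and resulting output shape; with these particular values the padding keeps the stride-2 output at the input resolution:

```python
# Minimal numeric sketch (assumed sizes): the padding Conv2dValid derives before F.conv2d.
import torch
import torch.nn.functional as F

x = torch.randn(1, 256, 20, 10)          # (batch, channels, H, W)
weight = torch.randn(256, 256, 5, 5)     # (out_ch, in_ch, kH, kW)
stride = (2, 2)
kernel_size = (5, 5)

# valid_trigx / valid_trigy enabled: padding grows with input size and stride
validx = (x.size(-2) * (stride[-2] - 1) - 1 + kernel_size[-2]) // 2
validy = (x.size(-1) * (stride[-1] - 1) - 1 + kernel_size[-1]) // 2
print(validx, validy)                    # 12 7

y = F.conv2d(x, weight, None, stride, (validx, validy))
print(y.shape)                           # torch.Size([1, 256, 20, 10])
```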
- causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - self.bias = bias - self.channels = channels - self.kernel_size = kernel_size - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, channels]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, channels]), requires_grad=adaptive_scale) - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - if init_weights: - self.init_weights() - - def init_weights(self): - pw_max = self.channels ** -0.5 - dw_max = self.kernel_size ** -0.5 - torch.nn.init.uniform_(self.pointwise_conv1.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv1.bias.data, -pw_max, pw_max) - torch.nn.init.uniform_(self.depthwise_conv.weight.data, -dw_max, dw_max) - if self.bias: - torch.nn.init.uniform_(self.depthwise_conv.bias.data, -dw_max, dw_max) - torch.nn.init.uniform_(self.pointwise_conv2.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv2.bias.data, -pw_max, pw_max) - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - if self.adaptive_scale: - x = self.ada_scale * x + self.ada_bias - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/squeezeformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/squeezeformer/encoder.py deleted file mode 100644 index f13038321ae6c07d484a617aee7d83ed07742510..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/squeezeformer/encoder.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -import torch -import torch.nn as nn -from typing import Tuple, Union, Optional, List -from wenet.squeezeformer.subsampling \ - import DepthwiseConv2dSubsampling4, TimeReductionLayer1D, \ - TimeReductionLayer2D, TimeReductionLayerStream -from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.attention import MultiHeadedAttention -from wenet.squeezeformer.attention import RelPositionMultiHeadedAttention -from wenet.squeezeformer.positionwise_feed_forward \ - import PositionwiseFeedForward -from wenet.squeezeformer.convolution import ConvolutionModule -from wenet.utils.mask import make_pad_mask, add_optional_chunk_mask -from wenet.utils.common import get_activation - - -class SqueezeformerEncoder(nn.Module): - def __init__( - self, - input_size: int = 80, - encoder_dim: int = 256, - output_size: int = 256, - attention_heads: int = 4, - num_blocks: int = 12, - reduce_idx: Optional[Union[int, List[int]]] = 5, - recover_idx: Optional[Union[int, List[int]]] = 11, - feed_forward_expansion_factor: int = 4, - dw_stride: bool = False, - input_dropout_rate: float = 0.1, - pos_enc_layer_type: str = "rel_pos", - time_reduction_layer_type: str = "conv1d", - do_rel_shift: bool = True, - feed_forward_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.1, - cnn_module_kernel: int = 31, - cnn_norm_type: str = "batch_norm", - dropout: float = 0.1, - causal: bool = False, - adaptive_scale: bool = True, - activation_type: str = "swish", - init_weights: bool = True, - global_cmvn: torch.nn.Module = None, - normalize_before: bool = False, - use_dynamic_chunk: bool = False, - concat_after: bool = 
False, - static_chunk_size: int = 0, - use_dynamic_left_chunk: bool = False - ): - """Construct SqueezeformerEncoder - - Args: - input_size to use_dynamic_chunk, see in Transformer BaseEncoder. - encoder_dim (int): The hidden dimension of encoder layer. - output_size (int): The output dimension of final projection layer. - attention_heads (int): Num of attention head in attention module. - num_blocks (int): Num of encoder layers. - reduce_idx Optional[Union[int, List[int]]]: - reduce layer index, from 40ms to 80ms per frame. - recover_idx Optional[Union[int, List[int]]]: - recover layer index, from 80ms to 40ms per frame. - feed_forward_expansion_factor (int): Enlarge coefficient of FFN. - dw_stride (bool): Whether do depthwise convolution - on subsampling module. - input_dropout_rate (float): Dropout rate of input projection layer. - pos_enc_layer_type (str): Self attention type. - time_reduction_layer_type (str): Conv1d or Conv2d reduction layer. - do_rel_shift (bool): Whether to do relative shift - operation on rel-attention module. - cnn_module_kernel (int): Kernel size of CNN module. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - adaptive_scale (bool): Whether to use adaptive scale. - init_weights (bool): Whether to initialize weights. - causal (bool): whether to use causal convolution or not. - """ - super(SqueezeformerEncoder, self).__init__() - self.global_cmvn = global_cmvn - self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ - if type(reduce_idx) == int else reduce_idx - self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \ - if type(recover_idx) == int else recover_idx - self.check_ascending_list() - if reduce_idx is None: - self.time_reduce = None - else: - if recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - self.reduce_stride = 2 - self._output_size = output_size - self.normalize_before = normalize_before - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - self.pos_enc_layer_type = pos_enc_layer_type - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - encoder_dim, - attention_dropout_rate, - do_rel_shift, - adaptive_scale, - init_weights - ) - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - encoder_dim, - encoder_dim * feed_forward_expansion_factor, - feed_forward_dropout_rate, - activation, - adaptive_scale, - init_weights - ) - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = ( - encoder_dim, cnn_module_kernel, activation, - cnn_norm_type, causal, True, adaptive_scale, init_weights) - - self.embed = DepthwiseConv2dSubsampling4( - 1, encoder_dim, - RelPositionalEncoding(encoder_dim, dropout_rate=0.1), - dw_stride, - input_size, - input_dropout_rate, - init_weights - ) - - self.preln = nn.LayerNorm(encoder_dim) - self.encoders = 
torch.nn.ModuleList([SqueezeformerEncoderLayer( - encoder_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - convolution_layer(*convolution_layer_args), - positionwise_layer(*positionwise_layer_args), - normalize_before, - dropout, - concat_after) for _ in range(num_blocks) - ]) - if time_reduction_layer_type == 'conv1d': - time_reduction_layer = TimeReductionLayer1D - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - elif time_reduction_layer_type == 'stream': - time_reduction_layer = TimeReductionLayerStream - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - else: - time_reduction_layer = TimeReductionLayer2D - time_reduction_layer_args = {'encoder_dim': encoder_dim} - - self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args) - self.time_recover_layer = nn.Linear(encoder_dim, encoder_dim) - self.final_proj = None - if output_size != encoder_dim: - self.final_proj = nn.Linear(encoder_dim, output_size) - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - xs_lens = mask_pad.squeeze(1).sum(1) - xs = self.preln(xs) - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - for i, layer in enumerate(self.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, chunk_masks, pos_emb, mask_pad)) - xs, xs_lens, chunk_masks, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_chunk_masks, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - chunk_masks = recover_chunk_masks - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - xs = xs.masked_fill(~mask_pad[:, 0, :].unsqueeze(-1), 0.0) - - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return xs, masks - - def check_ascending_list(self): - if self.reduce_idx is not None: - assert self.reduce_idx == sorted(self.reduce_idx), \ - "reduce_idx should be int or ascending list" - if self.recover_idx is not None: - assert self.recover_idx == sorted(self.recover_idx), \ - "recover_idx should be int or ascending list" - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp 
= exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - recover_exp)) - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.preln(xs) - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, 
recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - if att_mask.size(1) != 0: - xs = xs.masked_fill(~att_mask[:, 0, :].unsqueeze(-1), 0.0) - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(0) - cached_att = cached_att.unsqueeze(3).\ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :]) - r_cnn_cache.append(cached_cnn) - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/squeezeformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/squeezeformer/encoder_layer.py deleted file mode 100644 index 3c6bdd44a20447cea91c0f965c666b844f4264be..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/squeezeformer/encoder_layer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""SqueezeformerEncoderLayer definition.""" - -import torch -import torch.nn as nn -from typing import Optional, Tuple - - -class SqueezeformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward1 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - feed_forward2 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. 
- """ - - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward1: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - feed_forward2: Optional[nn.Module] = None, - normalize_before: bool = False, - dropout_rate: float = 0.1, - concat_after: bool = False, - ): - super(SqueezeformerEncoderLayer, self).__init__() - self.size = size - self.self_attn = self_attn - self.layer_norm1 = nn.LayerNorm(size) - self.ffn1 = feed_forward1 - self.layer_norm2 = nn.LayerNorm(size) - self.conv_module = conv_module - self.layer_norm3 = nn.LayerNorm(size) - self.ffn2 = feed_forward2 - self.layer_norm4 = nn.LayerNorm(size) - self.normalize_before = normalize_before - self.dropout = nn.Dropout(dropout_rate) - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - # self attention module - residual = x - if self.normalize_before: - x = self.layer_norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.layer_norm1(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm2(x) - x = self.ffn1(x) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm2(x) - - # conv module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - residual = x - if self.normalize_before: - x = self.layer_norm3(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm3(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm4(x) - x = self.ffn2(x) - # we do not use dropout here since it is inside feed forward function - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm4(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/squeezeformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/squeezeformer/positionwise_feed_forward.py deleted file mode 100644 index 289062dcf3189f79a5ebb206990160d8665c613c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/squeezeformer/positionwise_feed_forward.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU(), - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.idim = idim - self.hidden_units = hidden_units - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - self.ada_scale = None - self.ada_bias = None - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, idim]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, idim]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - ffn1_max = self.idim ** -0.5 - ffn2_max = self.hidden_units ** -0.5 - torch.nn.init.uniform_(self.w_1.weight.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_1.bias.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_2.weight.data, -ffn2_max, ffn2_max) - torch.nn.init.uniform_(self.w_2.bias.data, -ffn2_max, ffn2_max) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - if self.adaptive_scale: - xs = self.ada_scale * xs + self.ada_bias - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/squeezeformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/squeezeformer/subsampling.py deleted file mode 100644 index fdb0101d6ebb54c42e710bbb0f35a6f7615ca567..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/squeezeformer/subsampling.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -"""DepthwiseConv2dSubsampling4 and TimeReductionLayer definition.""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -from wenet.transformer.subsampling import BaseSubsampling -from typing import Tuple -from wenet.squeezeformer.conv2d import Conv2dValid - - -class DepthwiseConv2dSubsampling4(BaseSubsampling): - """Depthwise Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - pos_enc_class (nn.Module): position encoding class. - dw_stride (int): Whether do depthwise convolution. - input_size (int): filter bank dimension. - - """ - - def __init__( - self, idim: int, odim: int, - pos_enc_class: torch.nn.Module, - dw_stride: bool = False, - input_size: int = 80, - input_dropout_rate: float = 0.1, - init_weights: bool = True - ): - super(DepthwiseConv2dSubsampling4, self).__init__() - self.idim = idim - self.odim = odim - self.pw_conv = nn.Conv2d( - in_channels=idim, out_channels=odim, kernel_size=3, stride=2) - self.act1 = nn.ReLU() - self.dw_conv = nn.Conv2d( - in_channels=odim, out_channels=odim, kernel_size=3, stride=2, - groups=odim if dw_stride else 1 - ) - self.act2 = nn.ReLU() - self.pos_enc = pos_enc_class - self.input_proj = nn.Sequential( - nn.Linear( - odim * (((input_size - 1) // 2 - 1) // 2), odim), - nn.Dropout(p=input_dropout_rate), - ) - if init_weights: - linear_max = (odim * input_size / 4) ** -0.5 - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.weight'], -linear_max, linear_max) - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.bias'], -linear_max, linear_max) - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: int = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.pw_conv(x) - x = self.act1(x) - x = self.dw_conv(x) - x = self.act2(x) - b, c, t, f = x.size() - x = x.permute(0, 2, 1, 3) - x = x.contiguous().view(b, t, c * f) - x, pos_emb = self.pos_enc(x, offset) - x = self.input_proj(x) - return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] - - -class TimeReductionLayer1D(nn.Module): - """ - Modified NeMo, - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. 
- """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 5, stride: int = 2): - super(TimeReductionLayer1D, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - self.padding = max(0, self.kernel_size - self.stride) - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. - if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayer2D(nn.Module): - def __init__( - self, kernel_size: int = 5, stride: int = 2, encoder_dim: int = 256): - super(TimeReductionLayer2D, self).__init__() - self.encoder_dim = encoder_dim - self.kernel_size = kernel_size - self.dw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=(kernel_size, 1), - stride=stride, - valid_trigy=True - ) - self.pw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=1, - stride=1, - valid_trigx=False, - valid_trigy=False, - ) - - self.kernel_size = kernel_size - self.stride = stride - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.encoder_dim ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward( - self, xs: torch.Tensor, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - xs = xs.masked_fill(mask_pad.transpose(1, 2).eq(0), 0.0) - xs = xs.unsqueeze(2) - padding1 = self.kernel_size - self.stride - xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), - mode='constant', value=0.) 
- xs = self.dw_conv(xs.permute(0, 3, 1, 2)) - xs = self.pw_conv(xs).permute(0, 3, 2, 1).squeeze(1).contiguous() - tmp_length = xs.size(1) - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - padding2 = max(0, (xs_lens.max() - tmp_length).data.item()) - batch_size, hidden = xs.size(0), xs.size(-1) - dummy_pad = torch.zeros(batch_size, padding2, hidden, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - mask = mask[:, ::2, ::2] - mask_pad = mask_pad[:, :, ::2] - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayerStream(nn.Module): - """ - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. - """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 1, stride: int = 2): - super(TimeReductionLayerStream, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=0, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. 
- if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transducer/joint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transducer/joint.py deleted file mode 100644 index f7cbaf62ee0bf4ffa127e5bbf4a49a64c2378495..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transducer/joint.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Optional - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation - - -class TransducerJoint(torch.nn.Module): - - def __init__(self, - voca_size: int, - enc_output_size: int, - pred_output_size: int, - join_dim: int, - prejoin_linear: bool = True, - postjoin_linear: bool = False, - joint_mode: str = 'add', - activation: str = "tanh"): - assert check_argument_types() - # TODO(Mddct): concat in future - assert joint_mode in ['add'] - super().__init__() - - self.activatoin = get_activation(activation) - self.prejoin_linear = prejoin_linear - self.postjoin_linear = postjoin_linear - self.joint_mode = joint_mode - - if not self.prejoin_linear and not self.postjoin_linear: - assert enc_output_size == pred_output_size == join_dim - # torchscript compatibility - self.enc_ffn: Optional[nn.Linear] = None - self.pred_ffn: Optional[nn.Linear] = None - if self.prejoin_linear: - self.enc_ffn = nn.Linear(enc_output_size, join_dim) - self.pred_ffn = nn.Linear(pred_output_size, join_dim) - # torchscript compatibility - self.post_ffn: Optional[nn.Linear] = None - if self.postjoin_linear: - self.post_ffn = nn.Linear(join_dim, join_dim) - - self.ffn_out = nn.Linear(join_dim, voca_size) - - def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor): - """ - Args: - enc_out (torch.Tensor): [B, T, E] - pred_out (torch.Tensor): [B, T, P] - Return: - [B,T,U,V] - """ - if (self.prejoin_linear and self.enc_ffn is not None - and self.pred_ffn is not None): - enc_out = self.enc_ffn(enc_out) # [B,T,E] -> [B,T,V] - pred_out = self.pred_ffn(pred_out) - - enc_out = enc_out.unsqueeze(2) # [B,T,V] -> [B,T,1,V] - pred_out = pred_out.unsqueeze(1) # [B,U,V] -> [B,1 U, V] - - # TODO(Mddct): concat joint - _ = self.joint_mode - out = enc_out + pred_out # [B,T,U,V] - - if self.postjoin_linear and self.post_ffn is not None: - out = self.post_ffn(out) - - out = self.activatoin(out) - out = self.ffn_out(out) - return out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transducer/predictor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transducer/predictor.py deleted file mode 100644 index 600e97a9d83646047ec3fc14f3087bd4df761c68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transducer/predictor.py +++ /dev/null @@ -1,482 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation, get_rnn - - -def ApplyPadding(input, padding, pad_value) -> torch.Tensor: - """ - Args: - input: [bs, max_time_step, dim] - padding: [bs, 
max_time_step] - """ - return padding * pad_value + input * (1 - padding) - - -class PredictorBase(torch.nn.Module): - - # NOTE(Mddct): We can use ABC abstract here, but - # keep this class simple enough for now - def __init__(self) -> None: - super().__init__() - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - _, _, _ = batch_size, method, device - raise NotImplementedError("this is a base precictor") - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ): - _, _, = input, cache - raise NotImplementedError("this is a base precictor") - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - _, _, _, = input, padding, cache - raise NotImplementedError("this is a base precictor") - - -class RNNPredictor(PredictorBase): - - def __init__(self, - voca_size: int, - embed_size: int, - output_size: int, - embed_dropout: float, - hidden_size: int, - num_layers: int, - bias: bool = True, - rnn_type: str = "lstm", - dropout: float = 0.1) -> None: - assert check_argument_types() - super().__init__() - self.n_layers = num_layers - self.hidden_size = hidden_size - # disable rnn base out projection - self.embed = nn.Embedding(voca_size, embed_size) - self.dropout = nn.Dropout(embed_dropout) - # NOTE(Mddct): rnn base from torch not support layer norm - # will add layer norm and prune value in cell and layer - # ref: https://github.com/Mddct/neural-lm/blob/main/models/gru_cell.py - self.rnn = get_rnn(rnn_type=rnn_type)(input_size=embed_size, - hidden_size=hidden_size, - num_layers=num_layers, - bias=bias, - batch_first=True, - dropout=dropout) - self.projection = nn.Linear(hidden_size, output_size) - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> torch.Tensor: - """ - Args: - input (torch.Tensor): [batch, max_time). - padding (torch.Tensor): [batch, max_time] - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - Returns: - output: [batch, max_time, output_size] - """ - - # NOTE(Mddct): we don't use pack input format - embed = self.embed(input) # [batch, max_time, emb_size] - embed = self.dropout(embed) - states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None - if cache is None: - state = self.init_state(batch_size=input.size(0), - device=input.device) - states = (state[0], state[1]) - else: - assert len(cache) == 2 - states = (cache[0], cache[1]) - out, (m, c) = self.rnn(embed, states) - out = self.projection(out) - - # NOTE(Mddct): Although we don't use staate in transducer - # training forward, we need make it right for padding value - # so we create forward_step for infering, forward for training - _, _ = m, c - return out - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache: [state_m, state_c] - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - Returns: - new_cache: [[state_m_1, state_c_1], [state_m_2, state_c_2]...] 
- """ - assert len(cache) == 2 - state_ms = cache[0] - state_cs = cache[1] - - assert state_ms.size(1) == state_cs.size(1) - - new_cache: List[List[torch.Tensor]] = [] - for state_m, state_c in zip(torch.split(state_ms, 1, dim=1), - torch.split(state_cs, 1, dim=1)): - new_cache.append([state_m, state_c]) - return new_cache - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[state_m_1, state_c_1], [state_m_1, state_c_1]...] - - Returns: - new_caceh: [state_ms, state_cs], - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - """ - state_ms = torch.cat([states[0] for states in cache], dim=1) - state_cs = torch.cat([states[1] for states in cache], dim=1) - return [state_ms, state_cs] - - def init_state( - self, - batch_size: int, - device: torch.device, - method: str = "zero", - ) -> List[torch.Tensor]: - assert batch_size > 0 - # TODO(Mddct): xavier init method - _ = method - return [ - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device), - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device) - ] - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - """ - assert len(cache) == 2 - state_m, state_c = cache[0], cache[1] - embed = self.embed(input) # [batch, 1, emb_size] - embed = self.dropout(embed) - out, (m, c) = self.rnn(embed, (state_m, state_c)) - - out = self.projection(out) - m = ApplyPadding(m, padding.unsqueeze(0), state_m) - c = ApplyPadding(c, padding.unsqueeze(0), state_c) - - return (out, [m, c]) - - -class EmbeddingPredictor(PredictorBase): - """Embedding predictor - - Described in: - https://arxiv.org/pdf/2109.07513.pdf - - embed-> proj -> layer norm -> swish - """ - - def __init__(self, - voca_size: int, - embed_size: int, - embed_dropout: float, - n_head: int, - history_size: int = 2, - activation: str = "swish", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - - assert check_argument_types() - super().__init__() - # multi head - self.num_heads = n_head - self.embed_size = embed_size - self.context_size = history_size + 1 - self.pos_embed = torch.nn.Linear(embed_size * self.context_size, - self.num_heads, - bias=bias) - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.ffn = nn.Linear(self.embed_size, self.embed_size) - self.norm = nn.LayerNorm(self.embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - _ = method - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device), - ] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] 
- """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - - input = input.unfold(1, self.context_size, 1).permute( - 0, 1, 3, 2) # [bs, seq_len, context_size, embed] - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - # broadcast dot attenton - input_expand = input.unsqueeze( - 2) # [bs, seq_len, 1, context_size, embed] - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - - # [bs, seq_len, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, seq_len, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, seq_len, num_heads, embed] - output = output.sum(dim=2) # [bs, seq_len, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - return output - - def forward_step( - self, - input: torch.Tensor, - padding: torch.Tensor, - cache: List[torch.Tensor], - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input_expand = context_input.unsqueeze(1).unsqueeze( - 2) # [bs, 1, 1, context_size, embed] - - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - # [bs, 1, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, 1, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, 1, num_heads, embed] - output = output.sum(dim=2) # [bs, 1, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - new_cache = context_input[:, 1:, :] - # TODO(Mddct): we need padding new_cache in future - # new_cache = ApplyPadding(history, padding, new_cache) - return (output, [new_cache]) - - -class ConvPredictor(PredictorBase): - - def __init__(self, - voca_size: 
int, - embed_size: int, - embed_dropout: float, - history_size: int = 2, - activation: str = "relu", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - assert check_argument_types() - super().__init__() - - assert history_size >= 0 - self.embed_size = embed_size - self.context_size = history_size + 1 - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.conv = nn.Conv1d(in_channels=embed_size, - out_channels=embed_size, - kernel_size=self.context_size, - padding=0, - groups=embed_size, - bias=bias) - self.norm = nn.LayerNorm(embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - assert method == "zero" - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device) - ] - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] - """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - input = input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - return out - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input = context_input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - - new_cache = context_input[:, 1:, :] - # TODO(Mddct): apply padding in future - return (out, [new_cache]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transducer/search/greedy_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transducer/search/greedy_search.py deleted file mode 100644 index ef7354562b6617b7be33bf32d673117eb1d3d547..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transducer/search/greedy_search.py +++ /dev/null @@ 
-1,54 +0,0 @@ -from typing import List - -import torch - - -def basic_greedy_search( - model: torch.nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - n_steps: int = 64, -) -> List[List[int]]: - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, - method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = n_steps - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, padding, - cache) # [1, 1, P] - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, - pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - - return [hyps] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transducer/search/prefix_beam_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transducer/search/prefix_beam_search.py deleted file mode 100644 index f00917717c16a73916586708ebfede54fa02a21f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transducer/search/prefix_beam_search.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import List, Tuple - -import torch -from wenet.utils.common import log_add - - -class Sequence(): - - __slots__ = {'hyp', 'score', 'cache'} - - def __init__( - self, - hyp: List[torch.Tensor], - score, - cache: List[torch.Tensor], - ): - self.hyp = hyp - self.score = score - self.cache = cache - - -class PrefixBeamSearch(): - - def __init__(self, encoder, predictor, joint, ctc, blank): - self.encoder = encoder - self.predictor = predictor - self.joint = joint - self.ctc = ctc - self.blank = blank - - def forward_decoder_one_step( - self, encoder_x: torch.Tensor, pre_t: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - padding = torch.zeros(pre_t.size(0), 1, device=encoder_x.device) - pre_t, new_cache = self.predictor.forward_step(pre_t.unsqueeze(-1), - padding, cache) - x = self.joint(encoder_x, pre_t) # [beam, 1, 1, vocab] - x = x.log_softmax(dim=-1) - return x, new_cache - - def prefix_beam_search(self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7): - """prefix beam search - also see wenet.transducer.transducer.beam_search - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - 
batch_size = speech.shape[0] - assert batch_size == 1 - - # 1. Encoder - encoder_out, _ = self.encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - - ctc_probs = self.ctc.log_softmax(encoder_out).squeeze(0) - beam_init: List[Sequence] = [] - - # 2. init beam using Sequence to save beam unit - cache = self.predictor.init_state(1, method="zero", device=device) - beam_init.append(Sequence(hyp=[self.blank], score=0.0, cache=cache)) - # 3. start decoding (notice: we use breathwise first searching) - # !!!! In this decoding method: one frame do not output multi units. !!!! - # !!!! Experiments show that this strategy has little impact !!!! - for i in range(maxlen): - # 3.1 building input - # decoder taking the last token to predict the next token - input_hyp = [s.hyp[-1] for s in beam_init] - input_hyp_tensor = torch.tensor(input_hyp, - dtype=torch.int, - device=device) - # building statement from beam - cache_batch = self.predictor.cache_to_batch( - [s.cache for s in beam_init]) - # build score tensor to do torch.add() function - scores = torch.tensor([s.score for s in beam_init]).to(device) - - # 3.2 forward decoder - logp, new_cache = self.forward_decoder_one_step( - encoder_out[:, i, :].unsqueeze(1), - input_hyp_tensor, - cache_batch, - ) # logp: (N, 1, 1, vocab_size) - logp = logp.squeeze(1).squeeze(1) # logp: (N, vocab_size) - new_cache = self.predictor.batch_to_cache(new_cache) - - # 3.3 shallow fusion for transducer score - # and ctc score where we can also add the LM score - logp = torch.log( - torch.add(transducer_weight * torch.exp(logp), - ctc_weight * torch.exp(ctc_probs[i].unsqueeze(0)))) - - # 3.4 first beam prune - top_k_logp, top_k_index = logp.topk(beam_size) # (N, N) - scores = torch.add(scores.unsqueeze(1), top_k_logp) - - # 3.5 generate new beam (N*N) - beam_A = [] - for j in range(len(beam_init)): - # update seq - base_seq = beam_init[j] - for t in range(beam_size): - # blank: only update the score - if top_k_index[j, t] == self.blank: - new_seq = Sequence(hyp=base_seq.hyp.copy(), - score=scores[j, t].item(), - cache=base_seq.cache) - - beam_A.append(new_seq) - # other unit: update hyp score statement and last - else: - hyp_new = base_seq.hyp.copy() - hyp_new.append(top_k_index[j, t].item()) - new_seq = Sequence(hyp=hyp_new, - score=scores[j, t].item(), - cache=new_cache[j]) - beam_A.append(new_seq) - - # 3.6 prefix fusion - fusion_A = [beam_A[0]] - for j in range(1, len(beam_A)): - s1 = beam_A[j] - if_do_append = True - for t in range(len(fusion_A)): - # notice: A_ can not fusion with A - if s1.hyp == fusion_A[t].hyp: - fusion_A[t].score = log_add( - [fusion_A[t].score, s1.score]) - if_do_append = False - break - if if_do_append: - fusion_A.append(s1) - - # 4. 
second pruned - fusion_A.sort(key=lambda x: x.score, reverse=True) - beam_init = fusion_A[:beam_size] - - return beam_init, encoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transducer/transducer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transducer/transducer.py deleted file mode 100644 index 821a0946e621353a18bededbd93a658e83b0e0e2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transducer/transducer.py +++ /dev/null @@ -1,453 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchaudio -from torch import nn -from torch.nn.utils.rnn import pad_sequence -from typeguard import check_argument_types -from wenet.transducer.predictor import PredictorBase -from wenet.transducer.search.greedy_search import basic_greedy_search -from wenet.transducer.search.prefix_beam_search import PrefixBeamSearch -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_blank, add_sos_eos, - reverse_pad_list) - - -class Transducer(ASRModel): - """Transducer-ctc-attention hybrid Encoder-Predictor-Decoder model""" - - def __init__( - self, - vocab_size: int, - blank: int, - encoder: nn.Module, - predictor: PredictorBase, - joint: nn.Module, - attention_decoder: Optional[Union[TransformerDecoder, - BiTransformerDecoder]] = None, - ctc: Optional[CTC] = None, - ctc_weight: float = 0, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - transducer_weight: float = 1.0, - attention_weight: float = 0.0, - ) -> None: - assert check_argument_types() - assert attention_weight + ctc_weight + transducer_weight == 1.0 - super().__init__(vocab_size, encoder, attention_decoder, ctc, - ctc_weight, ignore_id, reverse_weight, lsm_weight, - length_normalized_loss) - - self.blank = blank - self.transducer_weight = transducer_weight - self.attention_decoder_weight = 1 - self.transducer_weight - self.ctc_weight - - self.predictor = predictor - self.joint = joint - self.bs = None - - # Note(Mddct): decoder also means predictor in transducer, - # but here decoder is attention decoder - del self.criterion_att - if attention_decoder is not None: - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + predictor + joint + loss - - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - - # Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - # predictor - ys_in_pad = add_blank(text, self.blank, self.ignore_id) - predictor_out = self.predictor(ys_in_pad) - # joint - joint_out = self.joint(encoder_out, predictor_out) - # NOTE(Mddct): some loss implementation require pad valid is zero - # torch.int32 rnnt_loss required - rnnt_text = text.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - rnnt_text_lengths = text_lengths.to(torch.int32) - encoder_out_lens = encoder_out_lens.to(torch.int32) - loss = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - encoder_out_lens, - rnnt_text_lengths, - blank=self.blank, - reduction="mean") - loss_rnnt = loss - - loss = self.transducer_weight * loss - # optional attention decoder - loss_att: Optional[torch.Tensor] = None - if self.attention_decoder_weight != 0.0 and self.decoder is not None: - loss_att, _ = self._calc_att_loss(encoder_out, encoder_mask, text, - text_lengths) - - # optional ctc - loss_ctc: Optional[torch.Tensor] = None - if self.ctc_weight != 0.0 and self.ctc is not None: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is not None: - loss = loss + self.ctc_weight * loss_ctc.sum() - if loss_att is not None: - loss = loss + self.attention_decoder_weight * loss_att.sum() - # NOTE: 'loss' must be in dict - return { - 'loss': loss, - 'loss_att': loss_att, - 'loss_ctc': loss_ctc, - 'loss_rnnt': loss_rnnt, - } - - def init_bs(self): - if self.bs is None: - self.bs = PrefixBeamSearch(self.encoder, self.predictor, - self.joint, self.ctc, self.blank) - - def _cal_transducer_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_lens: torch.Tensor, - hyps_pad: torch.Tensor, - ): - # ignore id -> blank, add blank at head - hyps_pad_blank = add_blank(hyps_pad, self.blank, self.ignore_id) - xs_in_lens = encoder_mask.squeeze(1).sum(1).int() - - # 1. Forward predictor - predictor_out = self.predictor(hyps_pad_blank) - # 2. Forward joint - joint_out = self.joint(encoder_out, predictor_out) - rnnt_text = hyps_pad.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - # 3. 
Compute transducer loss - loss_td = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - xs_in_lens, - hyps_lens.int(), - blank=self.blank, - reduction='none') - return loss_td * -1 - - def _cal_attn_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_pad: torch.Tensor, - hyps_lens: torch.Tensor, - ): - # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - - # td_score = loss_td * -1 - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - self.reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - return decoder_out, r_decoder_out - - def beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7, - ): - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight in transducer - prefix beam search. - final_prob = ctc_weight * ctc_prob + transducer_weight * transducer_prob - transducer_weight (float): transducer probability weight in - prefix beam search - Returns: - List[List[int]]: best path result - - """ - self.init_bs() - beam, _ = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size, - beam_size, - num_decoding_left_chunks, - simulate_streaming, - ctc_weight, - transducer_weight, - ) - return beam[0].hyp[1:], beam[0].score - - def transducer_attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ctc_weight: float = 0.0, - attn_weight: float = 0.0, - transducer_weight: float = 0.0, - search_ctc_weight: float = 1.0, - search_transducer_weight: float = 0.0, - beam_search_type: str = 'transducer') -> List[List[int]]: - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight using in rescoring. - rescore_prob = ctc_weight * ctc_prob + - transducer_weight * (transducer_loss * -1) + - attn_weight * attn_prob - attn_weight (float): attn probability weight using in rescoring. - transducer_weight (float): transducer probability weight using in - rescoring - search_ctc_weight (float): ctc weight using - in rnnt beam search (seeing in self.beam_search) - search_transducer_weight (float): transducer weight using - in rnnt beam search (seeing in self.beam_search) - Returns: - List[List[int]]: best path result - - """ - - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - self.init_bs() - if beam_search_type == 'transducer': - beam, encoder_out = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - beam_size=beam_size, - num_decoding_left_chunks=num_decoding_left_chunks, - ctc_weight=search_ctc_weight, - transducer_weight=search_transducer_weight, - ) - beam_score = [s.score for s in beam] - hyps = [s.hyp[1:] for s in beam] - - elif beam_search_type == 'ctc': - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, - speech_lengths, - beam_size=beam_size, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) - beam_score = [hyp[1] for hyp in hyps] - hyps = [hyp[0] for hyp in hyps] - assert len(hyps) == beam_size - - # build hyps and encoder output - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - - # 2.1 calculate transducer score - td_score = self._cal_transducer_score( - encoder_out, - encoder_mask, - hyps_lens, - hyps_pad, - ) - # 2.2 calculate attention score - decoder_out, r_decoder_out = self._cal_attn_score( - encoder_out, - encoder_mask, - hyps_pad, - hyps_lens, - ) - - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp)][self.eos] - td_s = td_score[i] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp): - r_score += r_decoder_out[i][len(hyp) - j - 1][w] - r_score += r_decoder_out[i][len(hyp)][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score = score * attn_weight + \ - beam_score[i] * ctc_weight + \ - td_s * transducer_weight - if score > best_score: - best_score = score - best_index = i - - return hyps[best_index], best_score - - def greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, 
- num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - n_steps: int = 64, - ) -> List[List[int]]: - """ greedy search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - # TODO(Mddct): batch decode - assert speech.size(0) == 1 - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - # TODO(Mddct): forward chunk by chunk - _ = simulate_streaming - # Let's assume B = batch_size - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size, - num_decoding_left_chunks, - ) - encoder_out_lens = encoder_mask.squeeze(1).sum() - hyps = basic_greedy_search(self, - encoder_out, - encoder_out_lens, - n_steps=n_steps) - - return hyps - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def forward_predictor_step( - self, xs: torch.Tensor, cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - assert len(cache) == 2 - # fake padding - padding = torch.zeros(1, 1) - return self.predictor.forward_step(xs, padding, cache) - - @torch.jit.export - def forward_joint_step(self, enc_out: torch.Tensor, - pred_out: torch.Tensor) -> torch.Tensor: - return self.joint(enc_out, pred_out) - - @torch.jit.export - def forward_predictor_init_state(self) -> List[torch.Tensor]: - return self.predictor.init_state(1, device=torch.device("cpu")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/asr_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/asr_model.py deleted file mode 100644 index 4288f68472d63ce4bf270c5f377d62fa7408713e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/asr_model.py +++ /dev/null @@ -1,904 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -from collections import defaultdict -from typing import Dict, List, Optional, Tuple - -import torch - -from torch.nn.utils.rnn import pad_sequence - -try: - import k2 - from icefall.utils import get_texts - from icefall.decode import get_lattice, Nbest, one_best_decoding -except ImportError: - print('Failed to import k2 and icefall. \ - Notice that they are necessary for hlg_onebest and hlg_rescore') - -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import TransformerEncoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, - remove_duplicates_and_blank, th_accuracy, - reverse_pad_list) -from wenet.utils.mask import (make_pad_mask, mask_finished_preds, - mask_finished_scores, subsequent_mask) - - -class ASRModel(torch.nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - def __init__( - self, - vocab_size: int, - encoder: TransformerEncoder, - decoder: TransformerDecoder, - ctc: CTC, - ctc_weight: float = 0.5, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - ): - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - - self.encoder = encoder - self.decoder = decoder - self.ctc = ctc - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - # 1. Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # 2a. Attention-decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths) - else: - loss_att = None - - # 2b. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is None: - loss = loss_att - elif loss_att is None: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + (1 - - self.ctc_weight) * loss_att - return {"loss": loss, "loss_att": loss_att, "loss_ctc": loss_ctc} - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, float]: - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # reverse the seq, used for right to left decoder - r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) - r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, - self.ignore_id) - # 1. Forward decoder - decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, - ys_in_pad, ys_in_lens, - r_ys_in_pad, - self.reverse_weight) - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - r_loss_att = torch.tensor(0.0) - if self.reverse_weight > 0.0: - r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * ( - 1 - self.reverse_weight) + r_loss_att * self.reverse_weight - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - return loss_att, acc_att - - def _forward_encoder( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Let's assume B = batch_size - # 1. Encoder - if simulate_streaming and decoding_chunk_size > 0: - encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( - speech, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - else: - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - return encoder_out, encoder_mask - - def recognize( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int = 10, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> torch.Tensor: - """ Apply beam search on attention decoder - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - torch.Tensor: decoding result, (batch, max_result_len) - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = torch.ones([running_size, 1], dtype=torch.long, - device=device).fill_(self.sos) # (B*N, 1) - scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), - dtype=torch.float) - scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) - cache: Optional[List[torch.Tensor]] = None - # 2. Decoder forward step by step - for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: - break - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) - logp, cache = self.decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - # 2.3 Second beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - # Update cache to be consistent with new topk scores / hyps - cache_index = (offset_k_index // beam_size).view(-1) # (B*N) - base_cache_index = (torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) * beam_size).view(-1) # (B*N) - cache_index = base_cache_index + cache_index - cache = [torch.index_select(c, dim=0, index=cache_index) for c in cache] - scores = scores.view(-1, 1) # (B*N, 1) - # 2.4. Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = torch.index_select(top_k_index.view(-1), - dim=-1, - index=best_k_index) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = torch.index_select( - hyps, dim=0, index=best_hyps_index) # (B*N, i) - hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) - - # 3. 
Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_scores, best_index = scores.max(dim=-1) - best_hyps_index = best_index + torch.arange( - batch_size, dtype=torch.long, device=device) * beam_size - best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) - best_hyps = best_hyps[:, 1:] - return best_hyps, best_scores - - def ctc_greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[List[int]]: - """ Apply CTC greedy search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # Let's assume B = batch_size - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - return hyps, scores - - def _ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[List[List[int]], torch.Tensor]: - """ CTC prefix beam search inner implementation - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[List[int]]: nbest results - torch.Tensor: encoder output, (1, max_len, encoder_dim), - it will be used for rescoring in attention rescoring mode - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # For CTC prefix beam search, we only support batch_size=1 - assert batch_size == 1 - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder forward and get CTC score - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - ctc_probs = ctc_probs.squeeze(0) - # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) - cur_hyps = [(tuple(), (0.0, -float('inf')))] - # 2. CTC beam search step by step - for t in range(0, maxlen): - logp = ctc_probs[t] # (vocab_size,) - # key: prefix, value (pb, pnb), default value(-inf, -inf) - next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) - # 2.1 First beam prune: select topk best - top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) - for s in top_k_index: - s = s.item() - ps = logp[s].item() - for prefix, (pb, pnb) in cur_hyps: - last = prefix[-1] if len(prefix) > 0 else None - if s == 0: # blank - n_pb, n_pnb = next_hyps[prefix] - n_pb = log_add([n_pb, pb + ps, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - elif s == last: - # Update *ss -> *s; - n_pb, n_pnb = next_hyps[prefix] - n_pnb = log_add([n_pnb, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - # Update *s-s -> *ss, - is for blank - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - else: - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - - # 2.2 Second beam prune - next_hyps = sorted(next_hyps.items(), - key=lambda x: log_add(list(x[1])), - reverse=True) - cur_hyps = next_hyps[:beam_size] - hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] - return hyps, encoder_out - - def ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[int]: - """ Apply CTC prefix beam search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[int]: CTC prefix beam search nbest results - """ - hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, - beam_size, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) - return hyps[0] - - def attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - ctc_weight: float = 0.0, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ) -> List[int]: - """ Apply attention rescoring decoding, CTC prefix beam search - is applied first to get nbest, then we resoring the nbest on - attention decoder with corresponding encoder out - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - reverse_weight (float): right to left decoder weight - ctc_weight (float): ctc score weight - - Returns: - List[int]: Attention rescoring result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, speech_lengths, beam_size, decoding_chunk_size, - num_decoding_left_chunks, simulate_streaming) - - assert len(hyps) == beam_size - hyps_pad = pad_sequence([ - torch.tensor(hyp[0], device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp[0]): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp[0])][self.eos] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp[0]): - r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] - r_score += r_decoder_out[i][len(hyp[0])][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score += hyp[1] * ctc_weight - if score > best_score: - best_score = score - best_index = i - return hyps[best_index][0], best_score - - def load_hlg_resource_if_necessary(self, hlg, word): - if not hasattr(self, 'hlg'): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.hlg = k2.Fsa.from_dict(torch.load(hlg, map_location=device)) - if not hasattr(self.hlg, "lm_scores"): - self.hlg.lm_scores = self.hlg.scores.clone() - if not hasattr(self, 'word_table'): - self.word_table = {} - with open(word, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - self.word_table[int(arr[1])] = arr[0] - - @torch.no_grad() - def hlg_onebest( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - best_path = one_best_decoding(lattice=lattice, use_double_scores=True) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.no_grad() - def hlg_rescore( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - lm_scale: float = 0, - decoder_scale: float = 0, - r_decoder_scale: float = 0, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - device = speech.device - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - 
search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=100, - use_double_scores=True, - nbest_scale=0.5,) - nbest = nbest.intersect(lattice) - assert hasattr(nbest.fsa, "lm_scores") - assert hasattr(nbest.fsa, "tokens") - assert isinstance(nbest.fsa.tokens, torch.Tensor) - - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.tokens) - tokens = tokens.remove_values_leq(0) - hyps = tokens.tolist() - - # cal attention_score - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out_repeat = [] - tot_scores = nbest.tot_scores() - repeats = [tot_scores[i].shape[0] for i in range(tot_scores.dim0)] - for i in range(len(encoder_out)): - encoder_out_repeat.append(encoder_out[i: i + 1].repeat(repeats[i], 1, 1)) - encoder_out = torch.concat(encoder_out_repeat, dim=0) - encoder_mask = torch.ones(encoder_out.size(0), - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - reverse_weight = 0.5 - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out - - decoder_scores = torch.tensor([sum([decoder_out[i, j, hyps[i][j]] - for j in range(len(hyps[i]))]) - for i in range(len(hyps))], device=device) - r_decoder_scores = [] - for i in range(len(hyps)): - score = 0 - for j in range(len(hyps[i])): - score += r_decoder_out[i, len(hyps[i]) - j - 1, hyps[i][j]] - score += r_decoder_out[i, len(hyps[i]), self.eos] - r_decoder_scores.append(score) - r_decoder_scores = torch.tensor(r_decoder_scores, device=device) - - am_scores = nbest.compute_am_scores() - ngram_lm_scores = nbest.compute_lm_scores() - tot_scores = am_scores.values + lm_scale * ngram_lm_scores.values + \ - decoder_scale * decoder_scores + r_decoder_scale * r_decoder_scores - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.jit.export - def subsampling_rate(self) -> int: - """ Export interface for c++ call, return subsampling_rate of the - model - """ - return self.encoder.embed.subsampling_rate - - @torch.jit.export - def right_context(self) -> int: - """ Export interface for c++ call, return right_context of the model - """ - return self.encoder.embed.right_context - - @torch.jit.export - def sos_symbol(self) -> int: - """ Export interface for c++ call, return sos symbol id of the model - """ - return self.sos - - @torch.jit.export - def eos_symbol(self) -> int: - """ Export interface for c++ call, return eos symbol id of the model - """ - return self.eos - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, give input chunk xs, and return - output from time 0 to current chunk. - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor: - """ Export interface for c++ call, apply linear transform and log - softmax before ctc - Args: - xs (torch.Tensor): encoder output - - Returns: - torch.Tensor: activation before ctc - - """ - return self.ctc.log_softmax(xs) - - @torch.jit.export - def is_bidirectional_decoder(self) -> bool: - """ - Returns: - torch.Tensor: decoder output - """ - if hasattr(self.decoder, 'right_decoder'): - return True - else: - return False - - @torch.jit.export - def forward_attention_decoder( - self, - hyps: torch.Tensor, - hyps_lens: torch.Tensor, - encoder_out: torch.Tensor, - reverse_weight: float = 0, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, forward decoder with multiple - hypothesis from ctc prefix beam search and one encoder output - Args: - hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad sos at the begining - hyps_lens (torch.Tensor): length of each hyp in hyps - encoder_out (torch.Tensor): corresponding encoder output - r_hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad eos at the begining which is used fo right to left decoder - reverse_weight: used for verfing whether used right to left decoder, - > 0 will use. - - Returns: - torch.Tensor: decoder output - """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps - encoder_out = encoder_out.repeat(num_hyps, 1, 1) - encoder_mask = torch.ones(num_hyps, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=encoder_out.device) - - # input for right to left decoder - # this hyps_lens has count token, we need minus it. - r_hyps_lens = hyps_lens - 1 - # this hyps has included token, so it should be - # convert the original hyps. - r_hyps = hyps[:, 1:] - # >>> r_hyps - # >>> tensor([[ 1, 2, 3], - # >>> [ 9, 8, 4], - # >>> [ 2, -1, -1]]) - # >>> r_hyps_lens - # >>> tensor([3, 3, 1]) - - # NOTE(Mddct): `pad_sequence` is not supported by ONNX, it is used - # in `reverse_pad_list` thus we have to refine the below code. 
- # Issue: https://github.com/wenet-e2e/wenet/issues/1113 - # Equal to: - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = torch.max(r_hyps_lens) - index_range = torch.arange(0, max_len, 1).to(encoder_out.device) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - # >>> seq_mask - # >>> tensor([[ True, True, True], - # >>> [ True, True, True], - # >>> [ True, False, False]]) - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - r_hyps = torch.gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = torch.where(seq_mask, r_hyps, self.eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - r_hyps = torch.cat([hyps[:, 0:1], r_hyps], dim=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps, hyps_lens, r_hyps, - reverse_weight) # (num_hyps, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - - # right to left decoder may be not used during decoding process, - # which depends on reverse_weight param. - # r_dccoder_out will be 0.0, if reverse_weight is 0.0 - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - return decoder_out, r_decoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/attention.py deleted file mode 100644 index 6ee5e313edf2e88a844ce004c0f819b0bd3260f6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/attention.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple - -import torch -from torch import nn - - -class MultiHeadedAttention(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, n_head: int, n_feat: int, dropout_rate: float): - """Construct an MultiHeadedAttention object.""" - super().__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward_qkv( - self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor, size - (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor, size - (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor, size - (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. 
- - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - 1.When applying cross attention between decoder and encoder, - the batch padding mask for input is in (#batch, 1, T) shape. - 2.When applying self attention of encoder, - the mask is in (#batch, T, T) shape. - 3.When applying self attention of decoder, - the mask is in (#batch, L, L) shape. - 4.If the different position in decoder see different block - of the encoder, such as Mocha, the passed in mask could be - in (#batch, L, T) shape. But there is no such case in current - Wenet. - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - """ - q, k, v = self.forward_qkv(query, key, value) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. - new_cache = torch.cat((k, v), dim=-1) - - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - return self.forward_attention(v, scores, mask), new_cache - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). 
- zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/cmvn.py deleted file mode 100644 index 3a1e7457fd3788d9a7e031e96517505a65925102..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/cmvn.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -class GlobalCMVN(torch.nn.Module): - def __init__(self, - mean: torch.Tensor, - istd: torch.Tensor, - norm_var: bool = True): - """ - Args: - mean (torch.Tensor): mean stats - istd (torch.Tensor): inverse std, std which is 1.0 / std - """ - super().__init__() - assert mean.shape == istd.shape - self.norm_var = norm_var - # The buffer can be accessed from this module using self.mean - self.register_buffer("mean", mean) - self.register_buffer("istd", istd) - - def forward(self, x: torch.Tensor): - """ - Args: - x (torch.Tensor): (batch, max_len, feat_dim) - - Returns: - (torch.Tensor): normalized feature - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/convolution.py deleted file mode 100644 index 2cf9794e14ea7441ccd30ab52202ac02fb25c2b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/convolution.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). 
- """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. - new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/ctc.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/ctc.py deleted file mode 100644 index 3dfcbaa324ffc26afa9ceaeb75007eb312546326..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/ctc.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -import torch -import torch.nn.functional as F -from typeguard import check_argument_types - - -class CTC(torch.nn.Module): - """CTC module""" - def __init__( - self, - odim: int, - encoder_output_size: int, - dropout_rate: float = 0.0, - reduce: bool = True, - ): - """ Construct CTC module - Args: - odim: dimension of outputs - encoder_output_size: number of encoder projection units - dropout_rate: dropout rate (0.0 ~ 1.0) - reduce: reduce the CTC loss into a scalar - """ - assert check_argument_types() - super().__init__() - eprojs = encoder_output_size - self.dropout_rate = dropout_rate - self.ctc_lo = torch.nn.Linear(eprojs, odim) - - reduction_type = "sum" if reduce else "none" - self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) - - def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, - ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: - """Calculate CTC loss. 
- - Args: - hs_pad: batch of padded hidden state sequences (B, Tmax, D) - hlens: batch of lengths of hidden state sequences (B) - ys_pad: batch of padded character id sequence tensor (B, Lmax) - ys_lens: batch of lengths of character sequence (B) - """ - # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) - ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) - # ys_hat: (B, L, D) -> (L, B, D) - ys_hat = ys_hat.transpose(0, 1) - ys_hat = ys_hat.log_softmax(2) - loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) - # Batch-size average - loss = loss / ys_hat.size(1) - return loss - - def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """log_softmax of frame activations - - Args: - Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) - """ - return F.log_softmax(self.ctc_lo(hs_pad), dim=2) - - def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """argmax of frame activations - - Args: - torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: argmax applied 2d tensor (B, Tmax) - """ - return torch.argmax(self.ctc_lo(hs_pad), dim=2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/decoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/decoder.py deleted file mode 100644 index c31853d9e868c99290b8d597f53d9a680202c82c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/decoder.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Decoder definition.""" -from typing import Tuple, List, Optional - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.decoder_layer import DecoderLayer -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.utils.mask import (subsequent_mask, make_pad_mask) - - -class TransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. 
- concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__() - attention_dim = encoder_output_size - - if input_layer == "embed": - self.embed = torch.nn.Sequential( - torch.nn.Embedding(vocab_size, attention_dim), - PositionalEncoding(attention_dim, positional_dropout_rate), - ) - else: - raise ValueError(f"only 'embed' is supported: {input_layer}") - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) - self.use_output_layer = use_output_layer - self.output_layer = torch.nn.Linear(attention_dim, vocab_size) - self.num_blocks = num_blocks - self.decoders = torch.nn.ModuleList([ - DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(self.num_blocks) - ]) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor = torch.empty(0), - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: not used in transformer decoder, in order to unify api - with bidirectional decoder - reverse_weight: not used in transformer decoder, in order to unify - api with bidirectional decode - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - torch.tensor(0.0), in order to unify api with bidirectional decoder - olens: (batch, ) - """ - tgt = ys_in_pad - maxlen = tgt.size(1) - # tgt_mask: (B, 1, L) - tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) - tgt_mask = tgt_mask.to(tgt.device) - # m: (1, L, L) - m = subsequent_mask(tgt_mask.size(-1), - device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m - x, _ = self.embed(tgt) - for layer in self.decoders: - x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, - memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.use_output_layer: - x = self.output_layer(x) - olens = tgt_mask.sum(1) - return x, torch.tensor(0.0), olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - x, _ = self.embed(tgt) - new_cache = [] - for i, decoder in enumerate(self.decoders): - if cache is None: - c = None - else: - c = cache[i] - x, tgt_mask, memory, memory_mask = decoder(x, - tgt_mask, - memory, - memory_mask, - cache=c) - new_cache.append(x) - if self.normalize_before: - y = self.after_norm(x[:, -1]) - else: - y = x[:, -1] - if self.use_output_layer: - y = torch.log_softmax(self.output_layer(y), dim=-1) - return y, new_cache - - -class BiTransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - r_num_blocks: the number of right to left decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - r_num_blocks: int = 0, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - - assert check_argument_types() - super().__init__() - self.left_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - self.right_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - r_num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor, - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), - used for right to left decoder - reverse_weight: used for right to left decoder - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - r_x: x: decoded token score (right to left decoder) - before softmax (batch, maxlen_out, vocab_size) - if use_output_layer is True, - olens: (batch, ) - """ - l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, - ys_in_lens) - r_x = torch.tensor(0.0) - if reverse_weight > 0.0: - r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, - ys_in_lens) - return l_x, r_x, olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - return self.left_decoder.forward_one_step(memory, memory_mask, tgt, - tgt_mask, cache) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/decoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/decoder_layer.py deleted file mode 100644 index 6b52aa6ab730dc51b18f0787e8236ab10c1e9cad..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/decoder_layer.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoder self-attention layer definition.""" -from typing import Optional, Tuple - -import torch -from torch import nn - - -class DecoderLayer(nn.Module): - """Single decoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn (torch.nn.Module): Inter-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. 
- dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's inpu - and output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: nn.Module, - src_attn: nn.Module, - feed_forward: nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an DecoderLayer object.""" - super().__init__() - self.size = size - self.self_attn = self_attn - self.src_attn = src_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.norm3 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear1 = nn.Linear(size + size, size) - self.concat_linear2 = nn.Linear(size + size, size) - else: - self.concat_linear1 = nn.Identity() - self.concat_linear2 = nn.Identity() - - def forward( - self, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - memory: torch.Tensor, - memory_mask: torch.Tensor, - cache: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute decoded features. - - Args: - tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). - tgt_mask (torch.Tensor): Mask for input tensor - (#batch, maxlen_out). - memory (torch.Tensor): Encoded memory - (#batch, maxlen_in, size). - memory_mask (torch.Tensor): Encoded memory mask - (#batch, maxlen_in). - cache (torch.Tensor): cached tensors. - (#batch, maxlen_out - 1, size). - - Returns: - torch.Tensor: Output tensor (#batch, maxlen_out, size). - torch.Tensor: Mask for output tensor (#batch, maxlen_out). - torch.Tensor: Encoded memory (#batch, maxlen_in, size). - torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
- - """ - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - - if cache is None: - tgt_q = tgt - tgt_q_mask = tgt_mask - else: - # compute only the last frame query keeping dim: max_time_out -> 1 - assert cache.shape == ( - tgt.shape[0], - tgt.shape[1] - 1, - self.size, - ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" - tgt_q = tgt[:, -1:, :] - residual = residual[:, -1:, :] - tgt_q_mask = tgt_mask[:, -1:, :] - - if self.concat_after: - tgt_concat = torch.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) - x = residual + self.concat_linear1(tgt_concat) - else: - x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - if self.concat_after: - x_concat = torch.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) - x = residual + self.concat_linear2(x_concat) - else: - x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) - if not self.normalize_before: - x = self.norm2(x) - - residual = x - if self.normalize_before: - x = self.norm3(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm3(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, tgt_mask, memory, memory_mask diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/embedding.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/embedding.py deleted file mode 100644 index 611a927864d93c3ad8357f66c780bf537b2a4d67..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/embedding.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Positonal Encoding Module.""" - -import math -from typing import Tuple, Union - -import torch -import torch.nn.functional as F - -class PositionalEncoding(torch.nn.Module): - """Positional encoding. 
- - :param int d_model: embedding dim - :param float dropout_rate: dropout rate - :param int max_len: maximum input length - - PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) - PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) - """ - def __init__(self, - d_model: int, - dropout_rate: float, - max_len: int = 5000, - reverse: bool = False): - """Construct an PositionalEncoding object.""" - super().__init__() - self.d_model = d_model - self.xscale = math.sqrt(self.d_model) - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - - self.pe = torch.zeros(self.max_len, self.d_model) - position = torch.arange(0, self.max_len, - dtype=torch.float32).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = torch.sin(position * div_term) - self.pe[:, 1::2] = torch.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input. Its shape is (batch, time, ...) - offset (int, torch.tensor): position offset - - Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) - torch.Tensor: for compatibility to RelPositionalEncoding - """ - - self.pe = self.pe.to(x.device) - pos_emb = self.position_encoding(offset, x.size(1), False) - x = x * self.xscale + pos_emb - return self.dropout(x), self.dropout(pos_emb) - - def position_encoding(self, offset: Union[int, torch.Tensor], size: int, - apply_dropout: bool = True) -> torch.Tensor: - """ For getting encoding in a streaming fashion - - Attention!!!!! - we apply dropout only once at the whole utterance level in a none - streaming way, but will call this function several times with - increasing input size in a streaming scenario, so the dropout will - be applied several times. - - Args: - offset (int or torch.tensor): start offset - size (int): required size of position encoding - - Returns: - torch.Tensor: Corresponding encoding - """ - # How to subscript a Union type: - # https://github.com/pytorch/pytorch/issues/69434 - if isinstance(offset, int): - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - else: # for batched streaming decoding on GPU - assert torch.max(offset) + size < self.max_len - index = offset.unsqueeze(1) + \ - torch.arange(0, size).to(offset.device) # B X T - flag = index > 0 - # remove negative offset - index = index * flag - pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model - - if apply_dropout: - pos_emb = self.dropout(pos_emb) - return pos_emb - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding module. - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - """ - def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): - """Initialize class.""" - super().__init__(d_model, dropout_rate, max_len, reverse=True) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Compute positional encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). 
- Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Positional embedding tensor (1, time, `*`). - """ - self.pe = self.pe.to(x.device) - x = x * self.xscale - pos_emb = self.position_encoding(offset, x.size(1), False) - return self.dropout(x), self.dropout(pos_emb) - - -class NoPositionalEncoding(torch.nn.Module): - """ No position encoding - """ - def __init__(self, d_model: int, dropout_rate: float): - super().__init__() - self.d_model = d_model - self.dropout = torch.nn.Dropout(p=dropout_rate) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """ Just return zero vector for interface compatibility - """ - pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) - return self.dropout(x), pos_emb - - def position_encoding( - self, offset: Union[int, torch.Tensor], size: int) -> torch.Tensor: - return torch.zeros(1, size, self.d_model) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/encoder.py deleted file mode 100644 index bb2ec65827548bd1242cb3b367cb3983c2de6119..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/encoder.py +++ /dev/null @@ -1,462 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder definition.""" -from typing import Tuple - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.convolution import ConvolutionModule -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.encoder_layer import TransformerEncoderLayer -from wenet.transformer.encoder_layer import ConformerEncoderLayer -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class BaseEncoder(torch.nn.Module): - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ - Args: - input_size (int): input dim - output_size (int): dimension of attention - attention_heads (int): the number of heads of multi head attention - linear_units (int): the hidden units number of position-wise feed - forward - num_blocks (int): the number of decoder blocks - dropout_rate (float): dropout rate - attention_dropout_rate (float): dropout rate in attention - positional_dropout_rate (float): dropout rate after adding - positional encoding - input_layer (str): input layer type. - optional [linear, conv2d, conv2d6, conv2d8] - pos_enc_layer_type (str): Encoder positional encoding layer type. - opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] - normalize_before (bool): - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after (bool): whether to concat attention layer's input - and output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - static_chunk_size (int): chunk size for static chunk training and - decoding - use_dynamic_chunk (bool): whether use dynamic chunk size for - training or not, You can only use fixed chunk(chunk_size > 0) - or dyanmic chunk size(use_dynamic_chunk = True) - global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module - use_dynamic_left_chunk (bool): whether use dynamic left chunk in - dynamic chunk training - """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks - - -class TransformerEncoder(BaseEncoder): - """Transformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ Construct TransformerEncoder - - See Encoder for the meaning of each parameter. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - self.encoders = torch.nn.ModuleList([ - TransformerEncoderLayer( - output_size, - MultiHeadedAttention(attention_heads, output_size, - attention_dropout_rate), - PositionwiseFeedForward(output_size, linear_units, - dropout_rate), dropout_rate, - normalize_before, concat_after) for _ in range(num_blocks) - ]) - - -class ConformerEncoder(BaseEncoder): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - positionwise_conv_kernel_size: int = 1, - macaron_style: bool = True, - selfattention_layer_type: str = "rel_selfattn", - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - ): - """Construct ConformerEncoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - positionwise_conv_kernel_size (int): Kernel size of positionwise - conv1d layer. - macaron_style (bool): Whether to use macaron style for - positionwise layer. - selfattention_layer_type (str): Encoder attention layer type, - the parameter has no effect now, it's just for configure - compatibility. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, - cnn_module_norm, causal) - - self.encoders = torch.nn.ModuleList([ - ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(num_blocks) - ]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/encoder_layer.py deleted file mode 100644 index 6b4629a6802a90422fa1494f82f46488f2553c16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/encoder_layer.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple - -import torch -from torch import nn - - -class TransformerEncoderLayer(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: torch.nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): just for interface compatibility - to ConformerEncoderLayer - mask_pad (torch.Tensor): does not used in transformer layer, - just for unified api with conformer. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2), not used here, it's for interface - compatibility to ConformerEncoderLayer. - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). - - """ - residual = x - if self.normalize_before: - x = self.norm1(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, cache=att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - return x, mask, new_att_cache, fake_cnn_cache - - -class ConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/label_smoothing_loss.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/label_smoothing_loss.py deleted file mode 100644 index 428fedcb0eb4345cd1361c97008a9afcd94ac171..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/label_smoothing_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Label smoothing module.""" - -import torch -from torch import nn - - -class LabelSmoothingLoss(nn.Module): - """Label-smoothing loss. - - In a standard CE loss, the label's data distribution is: - [0,1,2] -> - [ - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0], - ] - - In the smoothing version CE Loss,some probabilities - are taken from the true label prob (1.0) and are divided - among other labels. - - e.g. 
- smoothing=0.1 - [0,1,2] -> - [ - [0.9, 0.05, 0.05], - [0.05, 0.9, 0.05], - [0.05, 0.05, 0.9], - ] - - Args: - size (int): the number of class - padding_idx (int): padding class id which will be ignored for loss - smoothing (float): smoothing rate (0.0 means the conventional CE) - normalize_length (bool): - normalize loss by sequence length if True - normalize loss by batch size if False - """ - def __init__(self, - size: int, - padding_idx: int, - smoothing: float, - normalize_length: bool = False): - """Construct an LabelSmoothingLoss object.""" - super(LabelSmoothingLoss, self).__init__() - self.criterion = nn.KLDivLoss(reduction="none") - self.padding_idx = padding_idx - self.confidence = 1.0 - smoothing - self.smoothing = smoothing - self.size = size - self.normalize_length = normalize_length - - def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """Compute loss between x and target. - - The model outputs and data labels tensors are flatten to - (batch*seqlen, class) shape and a mask is applied to the - padding part which should not be calculated for loss. - - Args: - x (torch.Tensor): prediction (batch, seqlen, class) - target (torch.Tensor): - target signal masked with self.padding_id (batch, seqlen) - Returns: - loss (torch.Tensor) : The KL loss, scalar float value - """ - assert x.size(2) == self.size - batch_size = x.size(0) - x = x.view(-1, self.size) - target = target.view(-1) - # use zeros_like instead of torch.no_grad() for true_dist, - # since no_grad() can not be exported by JIT - true_dist = torch.zeros_like(x) - true_dist.fill_(self.smoothing / (self.size - 1)) - ignore = target == self.padding_idx # (B,) - total = len(target) - ignore.sum().item() - target = target.masked_fill(ignore, 0) # avoid -1 index - true_dist.scatter_(1, target.unsqueeze(1), self.confidence) - kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) - denom = total if self.normalize_length else batch_size - return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/positionwise_feed_forward.py deleted file mode 100644 index 73ba239e3f1e68f65650961f2c4ee6758729a06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/positionwise_feed_forward.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. 
- dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU()): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/subsampling.py deleted file mode 100644 index 5f2823eedf0e623188d6af6680fa50ca44b47877..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/subsampling.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch - - -class BaseSubsampling(torch.nn.Module): - def __init__(self): - super().__init__() - self.right_context = 0 - self.subsampling_rate = 1 - - def position_encoding(self, offset: Union[int, torch.Tensor], - size: int) -> torch.Tensor: - return self.pos_enc.position_encoding(offset, size) - - -class LinearNoSubsampling(BaseSubsampling): - """Linear transform the input without subsampling - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an linear object.""" - super().__init__() - self.out = torch.nn.Sequential( - torch.nn.Linear(idim, odim), - torch.nn.LayerNorm(odim, eps=1e-5), - torch.nn.Dropout(dropout_rate), - ) - self.pos_enc = pos_enc_class - self.right_context = 0 - self.subsampling_rate = 1 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Input x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: linear input tensor (#batch, time', odim), - where time' = time . - torch.Tensor: linear input mask (#batch, 1, time'), - where time' = time . - - """ - x = self.out(x) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask - - -class Conv2dSubsampling4(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). 
- - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 4. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 4. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] - - -class Conv2dSubsampling6(BaseSubsampling): - """Convolutional 2D subsampling (to 1/6 length). - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (torch.nn.Module): Custom position encoding layer. - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling6 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 5, 3), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), - odim) - self.pos_enc = pos_enc_class - # 10 = (3 - 1) * 1 + (5 - 1) * 2 - self.subsampling_rate = 6 - self.right_context = 10 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 6. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 6. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] - - -class Conv2dSubsampling8(BaseSubsampling): - """Convolutional 2D subsampling (to 1/8 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling8 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear( - odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) - self.pos_enc = pos_enc_class - self.subsampling_rate = 8 - # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 - self.right_context = 14 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 8. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 8. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/swish.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/swish.py deleted file mode 100644 index b4250f5c93104f38958d145572e363256e03fcb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/transformer/swish.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) -# 2020 Northwestern Polytechnical University (Pengcheng Guo) -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Swish() activation function for Conformer.""" - -import torch - - -class Swish(torch.nn.Module): - """Construct an Swish object.""" - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Return Swish activation function.""" - return x * torch.sigmoid(x) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/checkpoint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/checkpoint.py deleted file mode 100644 index 8e0c413c79c34cd667240357d7ef9eab816a885c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/checkpoint.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import re - -import yaml -import torch -from collections import OrderedDict - -import datetime - - -def load_checkpoint(model: torch.nn.Module, path: str) -> dict: - if torch.cuda.is_available(): - logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) - checkpoint = torch.load(path) - else: - logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) - checkpoint = torch.load(path, map_location='cpu') - model.load_state_dict(checkpoint, strict=False) - info_path = re.sub('.pt$', '.yaml', path) - configs = {} - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - return configs - - -def save_checkpoint(model: torch.nn.Module, path: str, infos=None): - ''' - Args: - infos (dict or None): any info you want to save. - ''' - logging.info('Checkpoint: save to checkpoint %s' % path) - if isinstance(model, torch.nn.DataParallel): - state_dict = model.module.state_dict() - elif isinstance(model, torch.nn.parallel.DistributedDataParallel): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save(state_dict, path) - info_path = re.sub('.pt$', '.yaml', path) - if infos is None: - infos = {} - infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') - with open(info_path, 'w') as fout: - data = yaml.dump(infos) - fout.write(data) - - -def filter_modules(model_state_dict, modules): - new_mods = [] - incorrect_mods = [] - mods_model = model_state_dict.keys() - for mod in modules: - if any(key.startswith(mod) for key in mods_model): - new_mods += [mod] - else: - incorrect_mods += [mod] - if incorrect_mods: - logging.warning( - "module(s) %s don't match or (partially match) " - "available modules in model.", - incorrect_mods, - ) - logging.warning("for information, the existing modules in model are:") - logging.warning("%s", mods_model) - - return new_mods - - -def load_trained_modules(model: torch.nn.Module, args: None): - # Load encoder modules with pre-trained model(s). 
- enc_model_path = args.enc_init - enc_modules = args.enc_init_mods - main_state_dict = model.state_dict() - logging.warning("model(s) found for pre-initialization") - if os.path.isfile(enc_model_path): - logging.info('Checkpoint: loading from checkpoint %s for CPU' % - enc_model_path) - model_state_dict = torch.load(enc_model_path, map_location='cpu') - modules = filter_modules(model_state_dict, enc_modules) - partial_state_dict = OrderedDict() - for key, value in model_state_dict.items(): - if any(key.startswith(m) for m in modules): - partial_state_dict[key] = value - main_state_dict.update(partial_state_dict) - else: - logging.warning("model was not found : %s", enc_model_path) - - model.load_state_dict(main_state_dict) - configs = {} - return configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/cmvn.py deleted file mode 100644 index 3101c619f54991c947124f393f3459c317356a2f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/cmvn.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import math - -import numpy as np - - -def _load_json_cmvn(json_cmvn_file): - """ Load the json format cmvn stats file and calculate cmvn - - Args: - json_cmvn_file: cmvn stats file in json format - - Returns: - a numpy array of [means, vars] - """ - with open(json_cmvn_file) as f: - cmvn_stats = json.load(f) - - means = cmvn_stats['mean_stat'] - variance = cmvn_stats['var_stat'] - count = cmvn_stats['frame_num'] - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def _load_kaldi_cmvn(kaldi_cmvn_file): - """ Load the kaldi format cmvn stats file and calculate cmvn - - Args: - kaldi_cmvn_file: kaldi text style global cmvn file, which - is generated by: - compute-cmvn-stats --binary=false scp:feats.scp global_cmvn - - Returns: - a numpy array of [means, vars] - """ - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def load_cmvn(cmvn_file, is_json): - if is_json: - cmvn = _load_json_cmvn(cmvn_file) - else: - cmvn = _load_kaldi_cmvn(cmvn_file) - return cmvn[0], cmvn[1] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/common.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/common.py deleted file mode 100644 index 74238d59aefbf227fe6b811703af17550bc7f8f0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/common.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -"""Unility functions for Transformer.""" - -import math -from typing import List, Tuple - -import torch -from torch.nn.utils.rnn import pad_sequence - -IGNORE_ID = -1 - - -def pad_list(xs: List[torch.Tensor], pad_value: int): - """Perform padding for the list of tensors. - - Args: - xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 
- pad_value (float): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tmax, `*`). - - Examples: - >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) - - """ - n_batch = len(xs) - max_len = max([x.size(0) for x in xs]) - pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) - pad = pad.fill_(pad_value) - for i in range(n_batch): - pad[i, :xs[i].size(0)] = xs[i] - - return pad - - -def add_blank(ys_pad: torch.Tensor, blank: int, - ignore_id: int) -> torch.Tensor: - """ Prepad blank for transducer predictor - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - blank (int): index of - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> blank = 0 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in = add_blank(ys_pad, 0, -1) - >>> ys_in - tensor([[0, 1, 2, 3, 4, 5], - [0, 4, 5, 6, 0, 0], - [0, 7, 8, 9, 0, 0]]) - """ - bs = ys_pad.size(0) - _blank = torch.tensor([blank], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _blank = _blank.repeat(bs).unsqueeze(1) # [bs,1] - out = torch.cat([_blank, ys_pad], dim=1) # [bs, Lmax+1] - return torch.where(out == ignore_id, blank, out) - - -def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, - ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Add and labels. - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - sos (int): index of - eos (int): index of - ignore_id (int): index of padding - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - ys_out (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> sos_id = 10 - >>> eos_id = 11 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) - >>> ys_in - tensor([[10, 1, 2, 3, 4, 5], - [10, 4, 5, 6, 11, 11], - [10, 7, 8, 9, 11, 11]]) - >>> ys_out - tensor([[ 1, 2, 3, 4, 5, 11], - [ 4, 5, 6, 11, -1, -1], - [ 7, 8, 9, 11, -1, -1]]) - """ - _sos = torch.tensor([sos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _eos = torch.tensor([eos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys - ys_in = [torch.cat([_sos, y], dim=0) for y in ys] - ys_out = [torch.cat([y, _eos], dim=0) for y in ys] - return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) - - -def reverse_pad_list(ys_pad: torch.Tensor, - ys_lens: torch.Tensor, - pad_value: float = -1.0) -> torch.Tensor: - """Reverse padding for the list of tensors. - - Args: - ys_pad (tensor): The padded tensor (B, Tokenmax). - ys_lens (tensor): The lens of token seqs (B) - pad_value (int): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tokenmax). - - Examples: - >>> x - tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) - >>> pad_list(x, 0) - tensor([[4, 3, 2, 1], - [7, 6, 5, 0], - [9, 8, 0, 0]]) - - """ - r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) - for y, i in zip(ys_pad, ys_lens)], True, - pad_value) - return r_ys_pad - - -def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, - ignore_label: int) -> float: - """Calculate accuracy. - - Args: - pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 
- pad_targets (LongTensor): Target label tensors (B, Lmax). - ignore_label (int): Ignore label id. - - Returns: - float: Accuracy value (0.0 - 1.0). - - """ - pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), - pad_outputs.size(1)).argmax(2) - mask = pad_targets != ignore_label - numerator = torch.sum( - pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - denominator = torch.sum(mask) - return float(numerator) / float(denominator) - - -def get_rnn(rnn_type: str) -> torch.nn.Module: - assert rnn_type in ["rnn", "lstm", "gru"] - if rnn_type == "rnn": - return torch.nn.RNN - elif rnn_type == "lstm": - return torch.nn.LSTM - else: - return torch.nn.GRU - - -def get_activation(act): - """Return activation function.""" - # Lazy load to avoid unused import - from wenet.transformer.swish import Swish - - activation_funcs = { - "hardtanh": torch.nn.Hardtanh, - "tanh": torch.nn.Tanh, - "relu": torch.nn.ReLU, - "selu": torch.nn.SELU, - "swish": getattr(torch.nn, "SiLU", Swish), - "gelu": torch.nn.GELU - } - - return activation_funcs[act]() - - -def get_subsample(config): - input_layer = config["encoder_conf"]["input_layer"] - assert input_layer in ["conv2d", "conv2d6", "conv2d8"] - if input_layer == "conv2d": - return 4 - elif input_layer == "conv2d6": - return 6 - elif input_layer == "conv2d8": - return 8 - - -def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - if hyp[cur] != 0: - new_hyp.append(hyp[cur]) - prev = cur - while cur < len(hyp) and hyp[cur] == hyp[prev]: - cur += 1 - return new_hyp - - -def replace_duplicates_with_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - new_hyp.append(hyp[cur]) - prev = cur - cur += 1 - while cur < len(hyp) and hyp[cur] == hyp[prev] and hyp[cur] != 0: - new_hyp.append(0) - cur += 1 - return new_hyp - - -def log_add(args: List[int]) -> float: - """ - Stable log add - """ - if all(a == -float('inf') for a in args): - return -float('inf') - a_max = max(args) - lsp = math.log(sum(math.exp(a - a_max) for a in args)) - return a_max + lsp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/config.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/config.py deleted file mode 100644 index 50170ced44534d3ee6532a2f87fcd78c5148f7e7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 Shaoshang Qi -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import copy - -def override_config(configs, override_list): - new_configs = copy.deepcopy(configs) - for item in override_list: - arr = item.split() - if len(arr) != 2: - print(f"the overrive {item} format not correct, skip it") - continue - keys = arr[0].split('.') - s_configs = new_configs - for i, key in enumerate(keys): - if key not in s_configs: - print(f"the overrive {item} format not correct, skip it") - if i == len(keys) - 1: - param_type = type(s_configs[key]) - if param_type != bool: - s_configs[key] = param_type(arr[1]) - else: - s_configs[key] = arr[1] in ['true', 'True'] - print(f"override {arr[0]} with {arr[1]}") - else: - s_configs = s_configs[key] - return new_configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/ctc_util.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/ctc_util.py deleted file mode 100644 index 73b8fb272ac153dd6d05207f352ebcf1ad14890d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/ctc_util.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch - -def insert_blank(label, blank_id=0): - """Insert blank token between every two label token.""" - label = np.expand_dims(label, 1) - blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id - label = np.concatenate([blanks, label], axis=1) - label = label.reshape(-1) - label = np.append(label, label[0]) - return label - -def forced_align(ctc_probs: torch.Tensor, - y: torch.Tensor, - blank_id=0) -> list: - """ctc forced alignment. 
- - Args: - torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) - torch.Tensor y: id sequence tensor 1d tensor (L) - int blank_id: blank symbol index - Returns: - torch.Tensor: alignment result - """ - y_insert_blank = insert_blank(y, blank_id) - - log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) - log_alpha = log_alpha - float('inf') # log of zero - state_path = (torch.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 - ) # state path - - # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] - - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): - if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ - s] == y_insert_blank[s - 2]: - candidates = torch.tensor( - [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) - prev_state = [s, s - 1] - else: - candidates = torch.tensor([ - log_alpha[t - 1, s], - log_alpha[t - 1, s - 1], - log_alpha[t - 1, s - 2], - ]) - prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] - state_path[t, s] = prev_state[torch.argmax(candidates)] - - state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) - - candidates = torch.tensor([ - log_alpha[-1, len(y_insert_blank) - 1], - log_alpha[-1, len(y_insert_blank) - 2] - ]) - prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] - state_seq[-1] = prev_state[torch.argmax(candidates)] - for t in range(ctc_probs.size(0) - 2, -1, -1): - state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] - - output_alignment = [] - for t in range(0, ctc_probs.size(0)): - output_alignment.append(y_insert_blank[state_seq[t, 0]]) - - return output_alignment diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/executor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/executor.py deleted file mode 100644 index dc0b69e6e32055566a0e8c41945f6979276e5672..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/executor.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -from contextlib import nullcontext - -# if your python version < 3.7 use the below one -# from contextlib import suppress as nullcontext -import torch -from torch.nn.utils import clip_grad_norm_ - - -class Executor: - - def __init__(self): - self.step = 0 - - def train(self, model, optimizer, scheduler, data_loader, device, writer, - args, scaler): - ''' Train one epoch - ''' - model.train() - clip = args.get('grad_clip', 50.0) - log_interval = args.get('log_interval', 10) - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - accum_grad = args.get('accum_grad', 1) - is_distributed = args.get('is_distributed', True) - use_amp = args.get('use_amp', False) - logging.info('using accumulate grad, new batch size is {} times' - ' larger than before'.format(accum_grad)) - if use_amp: - assert scaler is not None - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - model_context = model.join - else: - model_context = nullcontext - num_seen_utts = 0 - with model_context(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - context = None - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if is_distributed and batch_idx % accum_grad != 0: - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - with context(): - # autocast context - # The more details about amp can be found in - # https://pytorch.org/docs/stable/notes/amp_examples.html - with torch.cuda.amp.autocast(scaler is not None): - loss_dict = model(feats, feats_lengths, target, - target_lengths) - loss = loss_dict['loss'] / accum_grad - if use_amp: - scaler.scale(loss).backward() - else: - loss.backward() - - num_seen_utts += num_utts - if batch_idx % accum_grad == 0: - if rank == 0 and writer is not None: - writer.add_scalar('train_loss', loss, self.step) - # Use mixed precision training - if use_amp: - scaler.unscale_(optimizer) - grad_norm = clip_grad_norm_(model.parameters(), clip) - # Must invoke scaler.update() if unscale_() is used in - # the iteration to avoid the following error: - # RuntimeError: unscale_() has already been called - # on this optimizer since the last update(). - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). 
- scaler.step(optimizer) - scaler.update() - else: - grad_norm = clip_grad_norm_(model.parameters(), clip) - if torch.isfinite(grad_norm): - optimizer.step() - optimizer.zero_grad() - scheduler.step() - self.step += 1 - if batch_idx % log_interval == 0: - lr = optimizer.param_groups[0]['lr'] - log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, - loss.item() * accum_grad) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'lr {:.8f} rank {}'.format(lr, rank) - logging.debug(log_str) - - def cv(self, model, data_loader, device, args): - ''' Cross validation on - ''' - model.eval() - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - log_interval = args.get('log_interval', 10) - # in order to avoid division by 0 - num_seen_utts = 1 - total_loss = 0.0 - with torch.no_grad(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - loss_dict = model(feats, feats_lengths, target, target_lengths) - loss = loss_dict['loss'] - if torch.isfinite(loss): - num_seen_utts += num_utts - total_loss += loss.item() * num_utts - if batch_idx % log_interval == 0: - log_str = 'CV Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, loss.item()) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'history loss {:.6f}'.format(total_loss / - num_seen_utts) - log_str += ' rank {}'.format(rank) - logging.debug(log_str) - return total_loss, num_seen_utts diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/file_utils.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/file_utils.py deleted file mode 100644 index 7b7e516cc61f759267f4ef09309ff0b45110a0c1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/file_utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re - - -def read_lists(list_file): - lists = [] - with open(list_file, 'r', encoding='utf8') as fin: - for line in fin: - lists.append(line.strip()) - return lists - - -def read_non_lang_symbols(non_lang_sym_path): - """read non-linguistic symbol from file. - - The file format is like below: - - {NOISE}\n - {BRK}\n - ... - - - Args: - non_lang_sym_path: non-linguistic symbol file path, None means no any - syms. 
- - """ - if non_lang_sym_path is None: - return None - else: - syms = read_lists(non_lang_sym_path) - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - for sym in syms: - if non_lang_syms_pattern.fullmatch(sym) is None: - class BadSymbolFormat(Exception): - pass - raise BadSymbolFormat( - "Non-linguistic symbols should be " - "formatted in {xxx}//[xxx], consider" - " modify '%s' to meet the requirment. " - "More details can be found in discussions here : " - "https://github.com/wenet-e2e/wenet/pull/819" % (sym)) - return syms - - -def read_symbol_table(symbol_table_file): - symbol_table = {} - with open(symbol_table_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - symbol_table[arr[0]] = int(arr[1]) - return symbol_table diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/init_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/init_model.py deleted file mode 100644 index 4a008183ee25cd88b2fa25d93bdc3f9e3a55d31a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/init_model.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -from wenet.transducer.joint import TransducerJoint -from wenet.transducer.predictor import (ConvPredictor, EmbeddingPredictor, - RNNPredictor) -from wenet.transducer.transducer import Transducer -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.cmvn import GlobalCMVN -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.encoder import ConformerEncoder, TransformerEncoder -from wenet.squeezeformer.encoder import SqueezeformerEncoder -from wenet.efficient_conformer.encoder import EfficientConformerEncoder -from wenet.utils.cmvn import load_cmvn - - -def init_model(configs): - if configs['cmvn_file'] is not None: - mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) - global_cmvn = GlobalCMVN( - torch.from_numpy(mean).float(), - torch.from_numpy(istd).float()) - else: - global_cmvn = None - - input_dim = configs['input_dim'] - vocab_size = configs['output_dim'] - - encoder_type = configs.get('encoder', 'conformer') - decoder_type = configs.get('decoder', 'bitransformer') - - if encoder_type == 'conformer': - encoder = ConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'squeezeformer': - encoder = SqueezeformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'efficientConformer': - encoder = EfficientConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf'], - **configs['encoder_conf']['efficient_conf'] - if 'efficient_conf' in - configs['encoder_conf'] else {}) - else: - encoder = TransformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - if decoder_type == 'transformer': - decoder = TransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - else: - assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 - assert configs['decoder_conf']['r_num_blocks'] > 0 - decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - ctc = CTC(vocab_size, encoder.output_size()) - - # Init joint CTC/Attention or Transducer model - if 'predictor' in configs: - predictor_type = configs.get('predictor', 'rnn') - if predictor_type == 'rnn': - predictor = RNNPredictor(vocab_size, **configs['predictor_conf']) - elif predictor_type == 'embedding': - predictor = EmbeddingPredictor(vocab_size, - **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - elif predictor_type == 'conv': - predictor = ConvPredictor(vocab_size, **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - else: - raise NotImplementedError( - "only rnn, embedding and conv type support now") - configs['joint_conf']['enc_output_size'] = configs['encoder_conf'][ - 'output_size'] - configs['joint_conf']['pred_output_size'] = configs['predictor_conf'][ - 'output_size'] - joint = TransducerJoint(vocab_size, **configs['joint_conf']) - model = Transducer(vocab_size=vocab_size, - blank=0, - predictor=predictor, - encoder=encoder, - attention_decoder=decoder, - joint=joint, - ctc=ctc, - **configs['model_conf']) - else: - model = ASRModel(vocab_size=vocab_size, - encoder=encoder, - decoder=decoder, - ctc=ctc, - **configs['model_conf']) - return model diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/mask.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/mask.py deleted file mode 100644 index 2985006ab2bc2d27a9b8adaeb863cc44ca6a0d24..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/mask.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -''' -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. - - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - ret = torch.ones(size, size, device=device, dtype=torch.bool) - return torch.tril(ret) -''' - -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. 
- - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - arange = torch.arange(size, device=device) - mask = arange.expand(size, size) - arange = arange.unsqueeze(-1) - mask = mask <= arange - return mask - - -def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size) with chunk size, - this is for streaming encoder - - Args: - size (int): size of mask - chunk_size (int): size of chunk - num_left_chunks (int): number of left chunks - <0: use full chunk - >=0: use num_left_chunks - device (torch.device): "cpu" or "cuda" or torch.Tensor.device - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_chunk_mask(4, 2) - [[1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]] - """ - ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) - ret[i, start:ending] = True - return ret - - -def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, - use_dynamic_chunk: bool, - use_dynamic_left_chunk: bool, - decoding_chunk_size: int, static_chunk_size: int, - num_decoding_left_chunks: int): - """ Apply optional mask for encoder. - - Args: - xs (torch.Tensor): padded input, (B, L, D), L for max length - mask (torch.Tensor): mask for xs, (B, 1, L) - use_dynamic_chunk (bool): whether to use dynamic chunk or not - use_dynamic_left_chunk (bool): whether to use dynamic left chunk for - training. - decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - static_chunk_size (int): chunk size for static chunk training/decoding - if it's greater than 0, if use_dynamic_chunk is true, - this parameter will be ignored - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. - >=0: use num_decoding_left_chunks - <0: use all left chunks - - Returns: - torch.Tensor: chunk mask of the input xs. - """ - # Whether to use chunk mask or not - if use_dynamic_chunk: - max_len = xs.size(1) - if decoding_chunk_size < 0: - chunk_size = max_len - num_left_chunks = -1 - elif decoding_chunk_size > 0: - chunk_size = decoding_chunk_size - num_left_chunks = num_decoding_left_chunks - else: - # chunk size is either [1, 25] or full context(max_len). - # Since we use 4 times subsampling and allow up to 1s(100 frames) - # delay, the maximum frame is 100 / 4 = 25. 
- chunk_size = torch.randint(1, max_len, (1, )).item() - num_left_chunks = -1 - if chunk_size > max_len // 2: - chunk_size = max_len - else: - chunk_size = chunk_size % 25 + 1 - if use_dynamic_left_chunk: - max_left_chunks = (max_len - 1) // chunk_size - num_left_chunks = torch.randint(0, max_left_chunks, - (1, )).item() - chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - elif static_chunk_size > 0: - num_left_chunks = num_decoding_left_chunks - chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - else: - chunk_masks = masks - return chunk_masks - - -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: Mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - batch_size = lengths.size(0) - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = torch.arange(0, - max_len, - dtype=torch.int64, - device=lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand - return mask - - -def make_non_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """Make mask tensor containing indices of non-padded part. - - The sequences in a batch may have different lengths. To enable - batch computing, padding is need to make all sequence in same - size. To avoid the padding part pass value to context dependent - block such as attention or convolution , this padding part is - masked. - - This pad_mask is used in both encoder and decoder. - - 1 for non-padded part and 0 for padded part. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] - """ - return ~make_pad_mask(lengths) - - -def mask_finished_scores(score: torch.Tensor, - flag: torch.Tensor) -> torch.Tensor: - """ - If a sequence is finished, we only allow one alive branch. This function - aims to give one branch a zero score and the rest -inf score. - - Args: - score (torch.Tensor): A real value array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size, beam_size). 
- """ - beam_size = score.size(-1) - zero_mask = torch.zeros_like(flag, dtype=torch.bool) - if beam_size > 1: - unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])), - dim=1) - finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])), - dim=1) - else: - unfinished = zero_mask - finished = flag - score.masked_fill_(unfinished, -float('inf')) - score.masked_fill_(finished, 0) - return score - - -def mask_finished_preds(pred: torch.Tensor, flag: torch.Tensor, - eos: int) -> torch.Tensor: - """ - If a sequence is finished, all of its branch should be - - Args: - pred (torch.Tensor): A int array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size). - """ - beam_size = pred.size(-1) - finished = flag.repeat([1, beam_size]) - return pred.masked_fill_(finished, eos) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/scheduler.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/scheduler.py deleted file mode 100644 index c418a731dec0041a238787bbba23102dba8db5e5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/openasr2021/s0/wenet/utils/scheduler.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -# NeMo(https://github.com/NVIDIA/NeMo) - -from typing import Union - -import math -import warnings -import torch -from torch.optim.lr_scheduler import _LRScheduler - -from typeguard import check_argument_types - - -class WarmupLR(_LRScheduler): - """The WarmupLR scheduler - - This scheduler is almost same as NoamLR Scheduler except for following - difference: - - NoamLR: - lr = optimizer.lr * model_size ** -0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - WarmupLR: - lr = optimizer.lr * warmup_step ** 0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - - Note that the maximum lr equals to optimizer.lr in this scheduler. 
- - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_steps: Union[int, float] = 25000, - last_epoch: int = -1, - ): - assert check_argument_types() - self.warmup_steps = warmup_steps - - # __init__() must be invoked before setting field - # because step() is also invoked in __init__() - super().__init__(optimizer, last_epoch) - - def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" - - def get_lr(self): - step_num = self.last_epoch + 1 - if self.warmup_steps == 0: - return [ - lr * step_num ** -0.5 - for lr in self.base_lrs - ] - else: - return [ - lr - * self.warmup_steps ** 0.5 - * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) - for lr in self.base_lrs - ] - - def set_step(self, step: int): - self.last_epoch = step - - -class WarmupPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1): - assert not (warmup_steps is not None and warmup_ratio is not None),\ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class SquareRootConstantPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use particular number of step or ratio" - assert constant_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. 
- self.max_steps = max_steps - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.constant_lr = 1 / (constant_steps ** 0.5) - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.constant_steps: - return [self.constant_lr for _ in self.base_lrs] - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class WarmupHoldPolicy(WarmupPolicy): - """Variant of WarmupPolicy which maintains high - learning rate for a defined number of steps. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - hold_steps=None, - hold_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (hold_steps is not None and hold_ratio is not None), \ - "Either use particular number of step or ratio" - assert hold_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - self.min_lr = min_lr - self._last_warmup_lr = 0.0 - - # Necessary to duplicate as class attributes are hidden in inner class - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if hold_steps is not None: - self.hold_steps = hold_steps + self.warmup_steps - elif hold_ratio is not None: - self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps - else: - self.hold_steps = 0 - - super().__init__( - optimizer, - warmup_steps=warmup_steps, - warmup_ratio=warmup_ratio, - max_steps=max_steps, - last_epoch=last_epoch, - min_lr=min_lr, - ) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed by the scheduler," - " " "please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup phase - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - # Hold phase - if (step >= self.warmup_steps) and (step < self.hold_steps): - return self.base_lrs - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - -class WarmupAnnealHoldPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - min_lr: Minimum lr to hold the learning rate after decay at. - constant_steps: Number of steps to keep lr constant at. 
- constant_ratio: Ratio of steps to keep lr constant. - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - constant_steps=None, - constant_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use constant_steps or constant_ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup steps - if self.warmup_steps > 0 and step <= self.warmup_steps: - return self._get_warmup_lr(step) - - # Constant steps after warmup and decay - if self.constant_steps > 0 and ( - self.warmup_steps + self.decay_steps) < step <= self.max_steps: - return self._get_constant_lr(step) - - # Min lr after max steps of updates - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_constant_lr(self, step): - return [self.min_lr for _ in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -def _squareroot_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 0.5 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _square_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 2 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _cosine_annealing(initial_lr, step, max_steps, min_lr): - mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) - out_lr = (initial_lr - min_lr) * mult + min_lr - return out_lr - - -def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, - decay_steps, min_lr): - assert max_lr > min_lr - # Use linear warmup for the initial part. - if warmup_steps > 0 and step <= warmup_steps: - return max_lr * float(step) / float(warmup_steps) - - # For any steps larger than `decay_steps`, use `min_lr`. - if step > warmup_steps + decay_steps: - return min_lr - - # If we are done with the warmup period, use the decay style. 
- num_steps_ = step - warmup_steps - decay_steps_ = decay_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = max_lr - min_lr - - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - - return min_lr + coeff * delta_lr - - -def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): - if cycle: - multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) - decay_steps *= multiplier - else: - step = min(step, decay_steps) - p = step / decay_steps - lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) - lr += min_lr - return lr - - -def _noam_hold_annealing(initial_lr, step, warmup_steps, - hold_steps, decay_rate, min_lr): - # hold_steps = total number of steps - # to hold the LR, not the warmup + hold steps. - T_warmup_decay = max(1, warmup_steps ** decay_rate) - T_hold_decay = max(1, (step - hold_steps) ** decay_rate) - lr = (initial_lr * T_warmup_decay) / T_hold_decay - lr = max(lr, min_lr) - return lr - - -class SquareAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _square_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class SquareRootAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _squareroot_annealing(initial_lr=initial_lr, step=step, - max_steps=self.max_steps, min_lr=self.min_lr) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - if self.constant_steps is None or self.constant_steps == 0: - new_lrs = [ - _cosine_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - else: - new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) - return new_lrs - - def _get_warmup_lr(self, step): - if self.constant_steps is None or self.constant_steps == 0: - return super()._get_warmup_lr(step) - else: - # Use linear warmup for the initial part. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_constant_lr(self, step): - # Only called when `constant_steps` > 0. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_linear_warmup_with_cosine_annealing_lr(self, step): - # Cosine Schedule for Megatron LM, - # slightly different warmup schedule + constant LR at the end. 
- new_lrs = [ - _linear_warmup_with_cosine_annealing( - max_lr=self.base_lrs[0], - warmup_steps=self.warmup_steps, - step=step, - decay_steps=self.decay_steps, - min_lr=self.min_lr, - ) - for _ in self.base_lrs - ] - return new_lrs - - -class NoamAnnealing(_LRScheduler): - def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - self._normalize = d_model ** (-0.5) - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = max(1, self.last_epoch) - - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - new_lrs = [self._noam_annealing(initial_lr=initial_lr, step=step) for - initial_lr in self.base_lrs] - return new_lrs - - def _noam_annealing(self, initial_lr, step): - if self.warmup_steps > 0: - mult = self._normalize * min(step ** (-0.5), - step * (self.warmup_steps ** (-1.5))) - else: - mult = self._normalize * step ** (-0.5) - - out_lr = initial_lr * mult - if step > self.warmup_steps: - out_lr = max(out_lr, self.min_lr) - return out_lr - - -class NoamHoldAnnealing(WarmupHoldPolicy): - def __init__(self, optimizer, *, max_steps, decay_rate=0.5, min_lr=0.0, - last_epoch=-1, **kwargs): - """ - From Nemo: - Implementation of the Noam Hold Annealing policy - from the SqueezeFormer paper. - - Unlike NoamAnnealing, the peak learning rate - can be explicitly set for this scheduler. - The schedule first performs linear warmup, - then holds the peak LR, then decays with some schedule for - the remainder of the steps. - Therefore the min-lr is still dependent - on the hyper parameters selected. - - It's schedule is determined by three factors- - - Warmup Steps: Initial stage, where linear warmup - occurs uptil the peak LR is reached. Unlike NoamAnnealing, - the peak LR is explicitly stated here instead of a scaling factor. - - Hold Steps: Intermediate stage, where the peak LR - is maintained for some number of steps. In this region, - the high peak LR allows the model to converge faster - if training is stable. However the high LR - may also cause instability during training. - Should usually be a significant fraction of training - steps (around 30-40% of the entire training steps). - - Decay Steps: Final stage, where the LR rapidly decays - with some scaling rate (set by decay rate). - To attain Noam decay, use 0.5, - for Squeezeformer recommended decay, use 1.0. - The fast decay after prolonged high LR during - hold phase allows for rapid convergence. 
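The docstring above describes the warmup, hold, and decay phases in prose; the decay itself is the `_noam_hold_annealing` formula defined earlier in this file. A minimal standalone sketch of the same schedule, with made-up hyperparameters rather than values from any recipe here, shows how the learning rate moves through the three phases:

```python
def noam_hold_lr(step, peak_lr=1e-3, warmup_steps=1000, hold_steps=3000,
                 decay_rate=0.5, min_lr=1e-5):
    """Illustrative only: peak/warmup/hold values are hypothetical."""
    if step <= warmup_steps:                      # linear warmup up to the peak LR
        return peak_lr * (step + 1) / (warmup_steps + 1)
    if step <= warmup_steps + hold_steps:         # hold the peak LR
        return peak_lr
    # polynomial decay, same form as _noam_hold_annealing above
    t_warmup = max(1, warmup_steps ** decay_rate)
    t_decay = max(1, (step - hold_steps) ** decay_rate)
    return max(peak_lr * t_warmup / t_decay, min_lr)

for s in (0, 500, 1000, 2500, 4000, 20000, 100000):
    print(f"step {s:>6d}  lr {noam_hold_lr(s):.6f}")
```

With `decay_rate=0.5` the tail matches ordinary Noam decay, while `decay_rate=1.0` gives the faster decay recommended for Squeezeformer, as noted above.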
- - References: - - [Squeezeformer: - An Efficient Transformer for Automatic Speech Recognition] - (https://arxiv.org/abs/2206.00888) - - Args: - optimizer: Pytorch compatible Optimizer object. - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - decay_rate: Float value describing the polynomial decay - after the hold period. Default value - of 0.5 corresponds to Noam decay. - min_lr: Minimum learning rate. - """ - self.decay_rate = decay_rate - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - if self.warmup_steps is None or self.warmup_steps == 0: - raise ValueError( - "Noam scheduler cannot be used without warmup steps") - - if self.hold_steps > 0: - hold_steps = self.hold_steps - self.warmup_steps - else: - hold_steps = 0 - - new_lrs = [ - _noam_hold_annealing( - initial_lr, - step=step, - warmup_steps=self.warmup_steps, - hold_steps=hold_steps, - decay_rate=self.decay_rate, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - def set_step(self, step: int): - self.last_epoch = step diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/README.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/README.md deleted file mode 100644 index 44c7164241b5e5ddcee4ef8bbc2f485d00daa321..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/README.md +++ /dev/null @@ -1,14 +0,0 @@ -# Performance Record - -## Conformer Result - -* Feature info: dither + specaug + speed perturb -* Training info: lr 0.001, warmup_steps 25000, batch size 16, 1 gpu, acc_grad 4, 240 epochs -* Decoding info: average_num 10 - -| decoding mode | eval2000 (wer) | -|:----------------------:|:----------------:| -| ctc_greedy_search | 32.39% | -| ctc_prefix_beam_search | 32.39% | -| attention | 31.28% | -| attention_rescoring | 31.36% | \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/conf/train_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/conf/train_conformer.yaml deleted file mode 100644 index a871cb11b7af4bffe2e8693e890d1bb11d0e8780..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/conf/train_conformer.yaml +++ /dev/null @@ -1,78 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 31 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - 
src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -dataset_conf: - filter_conf: - max_length: 2000 - min_length: 10 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 3 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - # batch_size: 32 - batch_size: 16 - -grad_clip: 5 -accum_grad: 4 -max_epoch: 240 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/MSU_single_letter.txt b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/MSU_single_letter.txt deleted file mode 100644 index 1f7b419cca7421b37cfa86507b19c1a23d793a6b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/MSU_single_letter.txt +++ /dev/null @@ -1,26 +0,0 @@ -A ey -B b iy -C s iy -D d iy -E iy -F eh f -G jh iy -H ey ch -I ay -J jh ey -K k ey -L eh l -M eh m -N eh n -O ow -P p iy -Q k y uw -R aa r -S eh s -T t iy -U y uw -V v iy -W d ah b ax l y uw -X eh k s -Y w ay -Z z iy diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/dict.patch b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/dict.patch deleted file mode 100644 index 12c63d6127cca5acebf70c90599ad8bb3258c150..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/dict.patch +++ /dev/null @@ -1,380 +0,0 @@ -1d0 -< file: $SWB/data/dictionary/sw-ms98-dict.text -8645a8646 -> uh-hum ah m hh ah m -9006c9007 -< April ey p r ih l ---- -> April ey p r ax l -9144d9144 -< B ay zh aa n iy z -9261c9261 -< Battle b ae t el ---- -> Battle b ae t ax l -10014a10015 -> Chevy sh eh v iy -10211a10213 -> Colorado k ao l ax r aa d ow -10212a10215 -> Colorado' k ao l ax r aa d ow z -10370c10373 -< Creek k r ih k ---- -> Creek k r iy k -10889a10893 -> Eleven ax l eh v ih n -10951c10955 -< Erie ih r iy ---- -> Erie iy r iy -11183c11187 -< Forever f ax r eh v er ---- -> Forever f er eh v er -11231a11236 -> Friday f r ay d iy -11744a11750 -> History hh ih s t r iy -12004a12011,12012 -> Israel ih z r ih l -> Israel's ih z r ih l z -12573a12582 -> Lincoln l ih ng k ih n -12574a12584 -> Lincolns l ih ng k ih n z -13268c13278 -< NAACP eh ey ey s iy p iy ---- -> NAACP eh n ey ey s iy p iy -13286c13296 -< NIT eh ay t iy ---- -> NIT eh n ay t iy -13292c13302 -< NTSC eh t iy eh s s iy ---- -> NTSC eh n t iy eh s s iy -14058a14069 -> Quarter k ow r t er -14059a14071 -> Quarterback k ow r t er b ae k -14060a14073 -> Quarters k ow r t er z -14569a14583 -> Science s ay n s -15087a15102 -> Sunday s ah n d iy -15088a15104 -> Sunday's s ah n d iy z -15089a15106 -> Sundays s ah n d iy z -15290,15291c15307,15308 -< Texan t eh k sh ih n -< Texan's t eh k sh ih n s ---- -> Texan t eh k s ih n -> Texan's t eh k s ih n s -15335a15353 -> Thousands th aw z ih n z -15739c15757 -< Waco w ae k ow ---- -> Waco w ey k ow -15841a15860 -> 
Weekends w iy k eh n z -16782a16802 -> acceptable eh k s eh p ax b ax l -16833a16854 -> accounting ax k aw n ih ng -16948a16970 -> address ax d r eh s -17281a17304 -> already aa r d iy -17315a17339 -> am m -17709a17734 -> asked ae s t -17847a17873 -> attorney ih t er n iy -17919a17946 -> autopilot ao t ow p ay l ih t -17960a17988 -> awfully ao f l iy -18221a18250 -> basketball b ae s k ax b ao l -18222a18252 -> basketball's b ae s k ax b ao l z -18302a18333 -> become b ah k ah m -18303a18335 -> becomes b iy k ah m z -18344a18377 -> began b ax g en n -18817c18850 -< bottle b aa t el ---- -> bottle b aa t ax l -19332,19333c19365,19367 -< camera's k ae m ax r ax z -< cameras k ae m ax r ax z ---- -> camera k ae m r ax -> camera's k ae m r ax z -> cameras k ae m r ax z -19411a19446 -> capital k ae p ax l -19505a19541 -> carrying k ae r ih ng -20316a20353,20354 -> combination k aa m ih n ey sh ih n -> combinations k aa m ih n ey sh ih n z -20831a20870 -> contracts k aa n t r ae k s -21010a21050 -> costs k ao s -21062a21103 -> county k aw n iy -21371a21413 -> cultural k ao l ch ax r ax l -21372a21415 -> culturally k ao l ch ax r ax l iy -21373a21417 -> culture k ao l ch er -21375a21420 -> cultures k ao l ch er z -21543a21589 -> data d ey t ax -22097a22144 -> differently d ih f ax r ih n t l iy -22972a23020 -> effects ax f eh k t s -23016a23065 -> election ax l eh k sh ih n -23018a23068 -> elections ax l eh k sh ih n z -23052a23103 -> eleven ax l eh v ih n -23242a23294 -> enjoyable ae n jh oy ax b ax l -23248a23301 -> enjoys ae n jh oy z -23293a23347 -> entire ih n t ay r -23295a23350,23351 -> entirely ih n t ay r l iy -> entirety ih n t ay r t iy -23745a23802 -> extra eh k s t er -23818a23876 -> facts f ae k s -24508c24566 -< forever f ax r eh v er ---- -> forever f er eh v er -24514c24572 -< forget f ow r g eh t ---- -> forget f er r g eh t -24521a24580 -> forgot f er r g aa t -24522a24582 -> forgotten f er r g aa t ax n -24563a24624 -> forward f ow er d -24680a24742 -> frightening f r ay t n ih ng -24742a24805 -> full-time f ax l t ay m -24862a24926 -> garage g r aa jh -25218a25283 -> grandmother g r ae m ah dh er -25790a25856 -> heavily hh eh v ax l iy -25949a26016 -> history hh ih s t r iy -26038a26106 -> honestly aa n ax s t l iy -26039a26108 -> honesty aa n ax s t iy -26099a26169 -> horror hh ow r -26155a26226 -> houses hh aw z ih z -26184c26255 -< huh-uh hh ah hh ah ---- -> huh-uh ah hh ah -26189c26260 -< hum-um hh m hh m ---- -> hum-um ah m hh ah m -26236a26308 -> hunting hh ah n ih ng -26307a26380,26381 -> ideal ay d iy l -> idealist ay d iy l ih s t -26369a26444 -> imagine m ae jh ih n -26628a26704 -> individuals ih n d ih v ih jh ax l z -26968a27045 -> interest ih n t r ih s t -27184a27262 -> it'd ih d -27702a27781 -> lead l iy d -28378a28458 -> mandatory m ae n d ih t ow r iy -28885a28966 -> minute m ih n ih t -29167a29249 -> mountains m aw t n z -29317a29400 -> mysteries m ih s t r iy z -29318a29402 -> mystery m ih s t r iy -29470a29555 -> nervous n er v ih s -29578,29580c29663,29665 -< nobody n ow b aa d iy -< nobody'll n ow b aa d iy l -< nobody's n ow b aa d iy z ---- -> nobody n ow b ah d iy -> nobody'll n ow b ah d iy l -> nobody's n ow b ah d iy z -29712a29798 -> nuclear n uw k l iy r -29938a30025 -> onto aa n t ax -30051a30139 -> originally ax r ih jh ax l iy -30507a30596 -> particularly p er t ih k y ax l iy -30755a30845 -> perfectly p er f ih k l iy -30820a30911 -> personally p er s n ax l iy -30915a31007 -> physically f ih z ih k l iy -30986a31079 -> pilot p ay l ih t 
-30987a31081 -> pilot's p ay l ih t s -31227a31322 -> police p l iy s -31513a31609 -> prefer p er f er -31553a31650 -> prepare p r ax p ey r -31578a31676 -> prescription p er s k r ih p sh ih n -31579a31678 -> prescriptions p er s k r ih p sh ih n z -31770a31870 -> products p r aa d ax k s -31821a31922 -> projects p r aa jh eh k s -31908a32010 -> protect p er t eh k t -31909a32012 -> protected p er t eh k t ih d -31911a32015 -> protection p er t eh k sh ih n -31914a32019 -> protection p er t eh k t ih v -32149a32255 -> quarter k ow r t er -32414a32521 -> read r iy d -32785a32893 -> rehabilitation r iy ax b ih l ih t ey sh ih n -33150a33259 -> resource r ih s ow r s -33151a33261 -> resources r iy s ow r s ih z -33539c33649 -< roots r uh t s ---- -> roots r uw t s -33929a34040 -> science s ay n s -34315a34427 -> seventy s eh v ih n iy -34319,34320c34431,34432 -< severe s ax v iy r -< severely s ax v iy r l iy ---- -> severe s ih v iy r -> severely s ih v iy r l iy -35060a35173 -> software s ao f w ey r -35083a35197 -> solid s ao l ih d -35084a35199 -> solidly s ao l ih d l iy -35750a35866 -> stood s t ih d -35854a35971 -> strictly s t r ih k l iy -35889c36006 -< stronger s t r ao ng er ---- -> stronger s t r ao ng g er -36192a36310,36311 -> supposed s p ow z -> supposed s p ow s -36510a36630 -> tastes t ey s -36856a36977 -> thoroughly th er r l iy -36866a36988 -> thousands th aw z ih n z -37081c37203 -< toots t uh t s ---- -> toots t uw t s -37157a37280 -> toward t w ow r d -37158a37282 -> towards t w ow r d z -37564a37689 -> twenties t w eh n iy z -37565a37691 -> twentieth t w eh n iy ih th -37637a37764 -> unacceptable ah n ae k s eh p ax b ax l -37728a37856 -> understand ah n d er s t ae n -37860a37989 -> unless ih n l eh s -38040a38170 -> use y uw z -38049a38180 -> uses y uw z ih z -38125a38257 -> various v ah r iy ih s -38202a38335 -> versus v er s ih z -38381c38514 -< wacko w ae k ow ---- -> wacko w ey k ow -38455c38588 -< wanna w aa n ax ---- -> wanna w ah n ax -38675c38808 -< whatnot w ah t n aa t ---- -> whatnot w aa t n aa t -38676a38810 -> whatsoever w aa t s ow eh v er -38890c39024 -< wok w aa k ---- -> wok w ao k -38910a39045 -> wondering w ah n d r ih ng diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/eval2000_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/eval2000_data_prep.sh deleted file mode 100644 index 0c08a92d1b3c764c30412f69f6626d5483a75e0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/eval2000_data_prep.sh +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env bash - -# Hub-5 Eval 2000 data preparation -# Author: Arnab Ghoshal (Jan 2013) - -# To be run from one directory above this script. - -# The input is two directory names (possibly the same) containing the -# 2000 Hub5 english evaluation test set and transcripts, which are -# respectively: LDC2002S09 LDC2002T43 -# e.g. see -# http://www.ldc.upenn.edu/Catalog/catalogEntry.jsp?catalogId=LDC2002S09 -# http://www.ldc.upenn.edu/Catalog/CatalogEntry.jsp?catalogId=LDC2002T43 -# -# Example usage: -# local/eval2000_data_prep_edin.sh /exports/work/inf_hcrc_cstr_general/corpora/hub5/2000 /exports/work/inf_hcrc_cstr_general/corpora/hub5/2000/transcr -# The first directory ($sdir) contains the speech data, and the directory -# $sdir/english/ must exist. 
-# The second directory ($tdir) contains the transcripts, and the directory -# $tdir/reference must exist; in particular we need the file -# $tdir/reference/hub5e00.english.000405.stm - -if [ $# -ne 2 ]; then - echo "Usage: "`basename $0`" " - echo "See comments in the script for more details" - exit 1 -fi - -sdir=$1 -tdir=$2 -[ ! -d $sdir/english ] \ - && echo Expecting directory $sdir/english to be present && exit 1; -[ -d $tdir/2000_hub5_eng_eval_tr ] \ - && tdir=$tdir/2000_hub5_eng_eval_tr -[ ! -d $tdir/reference ] \ - && echo Expecting directory $tdir/reference to be present && exit 1; - -. ./path.sh - -dir=data/local/eval2000 -mkdir -p $dir - -find -L $sdir/english -iname '*.sph' | sort > $dir/sph.flist -sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \ - > $dir/sph.scp - -# Get segments file... -# segments file format is: utt-id side-id start-time end-time, e.g.: -# sw02001-A_000098-001156 sw02001-A 0.98 11.56 -pem=$sdir/english/hub5e_00.pem -[ ! -f $pem ] && echo "$0: No such file $pem" && exit 1; -# pem file has lines like: -# en_4156 A unknown_speaker 301.85 302.48 - -# we ignore the warnings below for now, although they seem to indicate some problems -# with the data. -grep -v ';;' $pem \ - | awk '{ - spk=$1"-"$2; - utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100); - print utt,spk,$4,$5;}' \ - | sort -u | local/extend_segments.pl 0.1 > $dir/segments - -# stm file has lines like: -# en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER -# TODO(arnab): We should really be lowercasing this since the Edinburgh -# recipe uses lowercase. This is not used in the actual scoring. -grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \ - | awk '{ - spk=$1"-"$2; - utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100); - printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' \ - | sort > $dir/text.all - -# We'll use the stm file for sclite scoring. There seem to be various errors -# in the stm file that upset hubscr.pl, and we fix them here. -sed -e 's:((:(:' -e 's:::g' -e 's:::g' \ - $tdir/reference/hub5e00.english.000405.stm > $dir/stm -cp $tdir/reference/en20000405_hub5.glm $dir/glm - -# next line uses command substitution -# Just checking that the segments are the same in pem vs. stm. -! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) && \ - echo "Segments from pem file and stm file do not match." && exit 1; - -grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text - -# side A - channel 1, side B - channel 2 -bash tools/sph2wav.sh --nj 16 $dir/sph.scp $dir/segments $dir/wav.scp - -# create an utt2spk file that assumes each conversation side is -# a separate speaker. 
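The comments above spell out the `segments` convention (`utt-id side-id start-time end-time`, with the two 6-digit fields inside the utterance id holding times in hundredths of a second) and note that each conversation side is treated as its own speaker, which is what the `awk` one-liner below writes into `utt2spk`. A tiny illustrative parser, not part of the recipe, shows how those pieces fit together:

```python
def parse_utt_id(utt_id: str):
    """Split an id like 'sw02001-A_000098-001156' into side and times.

    The layout follows the comments above: the prefix is the conversation
    side (recording id plus channel) and the suffix holds start/end times
    in hundredths of a second.
    """
    side, times = utt_id.rsplit("_", 1)
    start, end = times.split("-")
    return side, int(start) / 100.0, int(end) / 100.0

utt = "sw02001-A_000098-001156"
side, start, end = parse_utt_id(utt)
print(side, start, end)   # sw02001-A 0.98 11.56
print(utt, side)          # the utt2spk entry: the side doubles as the speaker id
```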
-awk '{print $1,$2;}' $dir/segments > $dir/utt2spk -tools/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt - -# cp $dir/segments $dir/segments.tmp -# awk '{x=$3-0.05; if (x<0.0) x=0.0; y=$4+0.05; print $1, $2, x, y; }' \ -# $dir/segments.tmp > $dir/segments - -awk '{print $1}' $dir/wav_ori.scp \ - | perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; - print "$1-$2 $1 $2\n"; ' \ - > $dir/reco2file_and_channel || exit 1; - -dest=data/eval2000 -mkdir -p $dest -for x in wav.scp text utt2spk spk2utt; do - cp $dir/$x $dest/$x -done - -echo Data preparation and formatting completed for Eval 2000 -echo "(but not MFCC extraction)" - -tools/fix_data_dir.sh $dest -if [ $(wc -l < $dest/wav.scp) -ne 80 ]; then - echo "$0: error: expected 80 lines in wav.scp, got $(wc -l < $dest/wav.scp)" - exit 1; -fi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/extend_segments.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/extend_segments.pl deleted file mode 100644 index e8b4894d5f6cc4595cd0cf352200d045da8d87ff..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/extend_segments.pl +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter - -if (@ARGV != 1 || !($ARGV[0] =~ m/^-?\d+\.?\d*$/ && $ARGV[0] >= 0)) { - print STDERR "Usage: extend_segments.pl time-in-seconds segments.extended \n" . - "e.g. extend_segments.pl 0.25 segments.2\n" . - "This command modifies a segments file, with lines like\n" . - " \n" . - "by extending the beginning and end of each segment by a certain\n" . - "length of time. This script makes sure the output segments do not\n" . - "overlap as a result of this time-extension, and that there are no\n" . - "negative times in the output.\n"; - exit 1; -} - -$extend = $ARGV[0]; - -@all_lines = (); - -while () { - chop; - @A = split(" ", $_); - if (@A != 4) { - die "invalid line in segments file: $_"; - } - $line = @all_lines; # current number of lines. - ($utt_id, $reco_id, $start_time, $end_time) = @A; - - push @all_lines, [ $utt_id, $reco_id, $start_time, $end_time ]; # anonymous array. - if (! defined $lines_for_reco{$reco_id}) { - $lines_for_reco{$reco_id} = [ ]; # push new anonymous array. - } - push @{$lines_for_reco{$reco_id}}, $line; -} - -foreach $reco_id (keys %lines_for_reco) { - $ref = $lines_for_reco{$reco_id}; - @line_numbers = sort { ${$all_lines[$a]}[2] <=> ${$all_lines[$b]}[2] } @$ref; - - - { - # handle start of earliest segment as a special case. - $l0 = $line_numbers[0]; - $tstart = ${$all_lines[$l0]}[2] - $extend; - if ($tstart < 0.0) { $tstart = 0.0; } - ${$all_lines[$l0]}[2] = $tstart; - } - { - # handle end of latest segment as a special case. - $lN = $line_numbers[$#line_numbers]; - $tend = ${$all_lines[$lN]}[3] + $extend; - ${$all_lines[$lN]}[3] = $tend; - } - for ($i = 0; $i < $#line_numbers; $i++) { - $ln = $line_numbers[$i]; - $ln1 = $line_numbers[$i+1]; - $tend = ${$all_lines[$ln]}[3]; # end of earlier segment. - $tstart = ${$all_lines[$ln1]}[2]; # start of later segment. - if ($tend > $tstart) { - $utt1 = ${$all_lines[$ln]}[0]; - $utt2 = ${$all_lines[$ln1]}[0]; - print STDERR "Warning: for utterances $utt1 and $utt2, segments " . 
- "already overlap; leaving these times unchanged.\n"; - } else { - $my_extend = $extend; - $max_extend = 0.5 * ($tstart - $tend); - if ($my_extend > $max_extend) { $my_extend = $max_extend; } - $tend += $my_extend; - $tstart -= $my_extend; - ${$all_lines[$ln]}[3] = $tend; - ${$all_lines[$ln1]}[2] = $tstart; - } - } -} - -# leave the numbering of the lines unchanged. -for ($l = 0; $l < @all_lines; $l++) { - $ref = $all_lines[$l]; - ($utt_id, $reco_id, $start_time, $end_time) = @$ref; - printf("%s %s %.2f %.2f\n", $utt_id, $reco_id, $start_time, $end_time); -} - -__END__ - -# testing below. - -# ( echo a1 A 0 1; echo a2 A 3 4; echo b1 B 0 1; echo b2 B 2 3 ) | local/extend_segments.pl 1.0 -a1 A 0.00 2.00 -a2 A 2.00 5.00 -b1 B 0.00 1.50 -b2 B 1.50 4.00 -# ( echo a1 A 0 2; echo a2 A 1 3 ) | local/extend_segments.pl 1.0 -Warning: for utterances a1 and a2, segments already overlap; leaving these times unchanged. -a1 A 0.00 2.00 -a2 A 1.00 4.00 -# ( echo a1 A 0 2; echo a2 A 5 6; echo a3 A 3 4 ) | local/extend_segments.pl 1.0 -a1 A 0.00 2.50 -a2 A 4.50 7.00 -a3 A 2.50 4.50 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/format_acronyms_dict.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/format_acronyms_dict.py deleted file mode 100644 index fa598dd03c33540c46d1ec324199a30c15c184f1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/format_acronyms_dict.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2015 Minhua Wu -# Apache 2.0 - -# convert acronyms in swbd dict to fisher convention -# IBM to i._b._m. -# BBC to b._b._c. -# BBCs to b._b._c.s -# BBC's to b._b._c.'s - -import argparse -import re - -__author__ = "Minhua Wu" - -parser = argparse.ArgumentParser(description="format acronyms to a._b._c.") -parser.add_argument("-i", "--input", help="Input lexicon", required=True) -parser.add_argument("-o", "--output", help="Output lexicon", required=True) -parser.add_argument( - "-L", "--Letter", help="Input single letter pronunciation", required=True -) -parser.add_argument("-M", "--Map", help="Output acronyms mapping", required=True) -args = parser.parse_args() - - -fin_lex = open(args.input, "r") -fin_Letter = open(args.Letter, "r") -fout_lex = open(args.output, "w") -fout_map = open(args.Map, "w") - -# Initialise single letter dictionary -dict_letter = {} -for single_letter_lex in fin_Letter: - items = single_letter_lex.split() - dict_letter[items[0]] = single_letter_lex[len(items[0]) + 1 :].strip() -fin_Letter.close() -# print dict_letter - -for lex in fin_lex: - items = lex.split() - word = items[0] - lexicon = lex[len(items[0]) + 1 :].strip() - # find acronyms from words with only letters and ' - pre_match = re.match(r"^[A-Za-z]+$|^[A-Za-z]+\'s$|^[A-Za-z]+s$", word) - if pre_match: - # find if words in the form of xxx's is acronym - if word[-2:] == "'s" and (lexicon[-1] == "s" or lexicon[-1] == "z"): - actual_word = word[:-2] - actual_lexicon = lexicon[:-2] - acronym_lexicon = "" - for w in actual_word: - acronym_lexicon = acronym_lexicon + dict_letter[w.upper()] + " " - if acronym_lexicon.strip() == actual_lexicon: - acronym_mapped = "" - acronym_mapped_back = "" - for w in actual_word[:-1]: - acronym_mapped = acronym_mapped + w.lower() + "._" - acronym_mapped_back = acronym_mapped_back + w.lower() + " " - acronym_mapped = acronym_mapped + actual_word[-1].lower() + ".'s" - acronym_mapped_back = ( - acronym_mapped_back + 
actual_word[-1].lower() + "'s" - ) - fout_map.write( - word + "\t" + acronym_mapped + "\t" + acronym_mapped_back + "\n" - ) - fout_lex.write(acronym_mapped + " " + lexicon + "\n") - else: - fout_lex.write(lex) - - # find if words in the form of xxxs is acronym - elif word[-1] == "s" and (lexicon[-1] == "s" or lexicon[-1] == "z"): - actual_word = word[:-1] - actual_lexicon = lexicon[:-2] - acronym_lexicon = "" - for w in actual_word: - acronym_lexicon = acronym_lexicon + dict_letter[w.upper()] + " " - if acronym_lexicon.strip() == actual_lexicon: - acronym_mapped = "" - acronym_mapped_back = "" - for w in actual_word[:-1]: - acronym_mapped = acronym_mapped + w.lower() + "._" - acronym_mapped_back = acronym_mapped_back + w.lower() + " " - acronym_mapped = acronym_mapped + actual_word[-1].lower() + ".s" - acronym_mapped_back = ( - acronym_mapped_back + actual_word[-1].lower() + "'s" - ) - fout_map.write( - word + "\t" + acronym_mapped + "\t" + acronym_mapped_back + "\n" - ) - fout_lex.write(acronym_mapped + " " + lexicon + "\n") - else: - fout_lex.write(lex) - - # find if words in the form of xxx (not ended with 's or s) is acronym - elif word.find("'") == -1 and word[-1] != "s": - acronym_lexicon = "" - for w in word: - acronym_lexicon = acronym_lexicon + dict_letter[w.upper()] + " " - if acronym_lexicon.strip() == lexicon: - acronym_mapped = "" - acronym_mapped_back = "" - for w in word[:-1]: - acronym_mapped = acronym_mapped + w.lower() + "._" - acronym_mapped_back = acronym_mapped_back + w.lower() + " " - acronym_mapped = acronym_mapped + word[-1].lower() + "." - acronym_mapped_back = acronym_mapped_back + word[-1].lower() - fout_map.write( - word + "\t" + acronym_mapped + "\t" + acronym_mapped_back + "\n" - ) - fout_lex.write(acronym_mapped + " " + lexicon + "\n") - else: - fout_lex.write(lex) - else: - fout_lex.write(lex) - - else: - fout_lex.write(lex) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/map_acronyms_transcripts.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/map_acronyms_transcripts.py deleted file mode 100644 index ba02aaec34b5b3be7a8fb51dd31abed23c1bacf5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/map_acronyms_transcripts.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2015 Minhua Wu -# Apache 2.0 - -# convert acronyms in swbd transcript to fisher convention -# according to first two columns in the input acronyms mapping - -import argparse -import re - -__author__ = "Minhua Wu" - -parser = argparse.ArgumentParser(description="format acronyms to a._b._c.") -parser.add_argument("-i", "--input", help="Input transcripts", required=True) -parser.add_argument("-o", "--output", help="Output transcripts", required=True) -parser.add_argument("-M", "--Map", help="Input acronyms mapping", required=True) -args = parser.parse_args() - -fin_map = open(args.Map, "r") -dict_acronym = {} -dict_acronym_noi = {} # Mapping of acronyms without I, i -for pair in fin_map: - items = pair.split("\t") - dict_acronym[items[0]] = items[1] - dict_acronym_noi[items[0]] = items[1] -fin_map.close() -del dict_acronym_noi["I"] -del dict_acronym_noi["i"] - - -fin_trans = open(args.input, "r") -fout_trans = open(args.output, "w") -for line in fin_trans: - items = line.split() - L = len(items) - # First pass mapping to map I as part of acronym - for i in range(L): - if items[i] == "I": - x = 0 - while i - 1 - x >= 0 and 
re.match(r"^[A-Z]$", items[i - 1 - x]): - x += 1 - - y = 0 - while i + 1 + y < L and re.match(r"^[A-Z]$", items[i + 1 + y]): - y += 1 - - if x + y > 0: - for bias in range(-x, y + 1): - items[i + bias] = dict_acronym[items[i + bias]] - - # Second pass mapping (not mapping 'i' and 'I') - for i in range(len(items)): - if items[i] in dict_acronym_noi.keys(): - items[i] = dict_acronym_noi[items[i]] - sentence = " ".join(items[1:]) - fout_trans.write(items[0] + " " + sentence.lower() + "\n") - -fin_trans.close() -fout_trans.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/swbd1_data_download.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/swbd1_data_download.sh deleted file mode 100644 index ee27048bf2f4295f330b7d9dec3597f317e0cda1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/swbd1_data_download.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env bash - -# Switchboard-1 training data preparation customized for Edinburgh -# Author: Arnab Ghoshal (Jan 2013) - -# To be run from one directory above this script. - -## The input is some directory containing the switchboard-1 release 2 -## corpus (LDC97S62). Note: we don't make many assumptions about how -## you unpacked this. We are just doing a "find" command to locate -## the .sph files. - -. ./path.sh - -#check existing directories -if [ $# != 1 ]; then - echo "Usage: swbd1_data_download.sh /path/to/SWBD" - exit 1; -fi - -SWBD_DIR=$1 - -dir=data/local/train -mkdir -p $dir - -# Audio data directory check -if [ ! -d $SWBD_DIR ]; then - echo "Error: run.sh requires a directory argument" - exit 1; -fi - -# Trans directory check -if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then - ( - cd $dir; - if [ ! -d swb_ms98_transcriptions ]; then - echo " *** Downloading trascriptions and dictionary ***" - wget http://www.openslr.org/resources/5/switchboard_word_alignments.tar.gz || - wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz - tar -xf switchboard_word_alignments.tar.gz - fi - ) -else - echo "Directory with transcriptions exists, skipping downloading" - [ -f $dir/swb_ms98_transcriptions ] \ - || ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/ -fi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/swbd1_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/swbd1_data_prep.sh deleted file mode 100644 index 6dc8630863bb03e291ec50f7d7b497d8cdca4183..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/swbd1_data_prep.sh +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env bash - -# Switchboard-1 training data preparation customized for Edinburgh -# Author: Arnab Ghoshal (Jan 2013) - -# To be run from one directory above this script. - -## The input is some directory containing the switchboard-1 release 2 -## corpus (LDC97S62). Note: we don't make many assumptions about how -## you unpacked this. We are just doing a "find" command to locate -## the .sph files. - -## The second input is optional, which should point to a directory containing -## Switchboard transcriptions/documentations (specifically, the conv.tab file). -## If specified, the script will try to use the actual speaker PINs provided -## with the corpus instead of the conversation side ID (Kaldi default). 
We -## will be using "find" to locate this file so we don't make any assumptions -## on the directory structure. (Peng Qi, Aug 2014) - -. ./path.sh - -#check existing directories -if [ $# != 1 -a $# != 2 ]; then - echo "Usage: swbd1_data_prep.sh /path/to/SWBD [/path/to/SWBD_DOC]" - exit 1; -fi - -SWBD_DIR=$1 - -dir=data/local/train -mkdir -p $dir - - -# Audio data directory check -if [ ! -d $SWBD_DIR ]; then - echo "Error: run.sh requires a directory argument" - exit 1; -fi - -# Option A: SWBD dictionary file check -[ ! -f $dir/swb_ms98_transcriptions/sw-ms98-dict.text ] && \ - echo "SWBD dictionary file does not exist" && exit 1; - -# find sph audio files -find -L $SWBD_DIR -iname '*.sph' | sort > $dir/sph.flist - -n=`cat $dir/sph.flist | wc -l` -[ $n -ne 2435 ] && [ $n -ne 2438 ] && \ - echo Warning: expected 2435 or 2438 data data files, found $n - - -# (1a) Transcriptions preparation -# make basic transcription file (add segments info) -awk '{ - name=substr($1,1,6); gsub("^sw","sw0",name); side=substr($1,7,1); - stime=$2; etime=$3; - printf("%s-%s_%06.0f-%06.0f", - name, side, int(100*stime+0.5), int(100*etime+0.5)); - for(i=4;i<=NF;i++) printf(" %s", $i); printf "\n" -}' $dir/swb_ms98_transcriptions/*/*/*-trans.text > $dir/transcripts1.txt - -# test if trans. file is sorted -export LC_ALL=C; -sort -c $dir/transcripts1.txt || exit 1; # check it's sorted. - -# Remove SILENCE, and . - -# Note: we have [NOISE], [VOCALIZED-NOISE], [LAUGHTER], [SILENCE]. -# removing [SILENCE], and the and markers that mark -# speech to somone; we will give phones to the other three (NSN, SPN, LAU). -# There will also be a silence phone, SIL. -# **NOTE: modified the pattern matches to make them case insensitive -cat $dir/transcripts1.txt \ - | perl -ane 's:\s\[SILENCE\](\s|$):$1:gi; - s///gi; - s///gi; - print;' \ - | awk '{if(NF > 1) { print; } } ' > $dir/transcripts2.txt - -# **NOTE: swbd1_map_words.pl has been modified to make the pattern matches -# case insensitive -local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt > $dir/text - -# format acronyms in text -python3 local/map_acronyms_transcripts.py -i $dir/text -o $dir/text_map \ - -M data/local/dict_nosp/acronyms.map -mv $dir/text_map $dir/text - -# (1c) Make segment files from transcript -#segments file format is: utt-id side-id start-time end-time, e.g.: -#sw02001-A_000098-001156 sw02001-A 0.98 11.56 -awk '{ - segment=$1; - split(segment,S,"[_-]"); - side=S[2]; audioname=S[1]; startf=S[3]; endf=S[4]; - print segment " " audioname "-" side " " startf/100 " " endf/100 -}' < $dir/text > $dir/segments - -sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \ - > $dir/sph.scp - -# side A - channel 1, side B - channel 2 -bash tools/sph2wav.sh --nj 16 $dir/sph.scp $dir/segments $dir/wav.scp - -# this file reco2file_and_channel maps recording-id (e.g. sw02001-A) -# to the file name sw02001 and the A, e.g. -# sw02001-A sw02001 A -# In this case it's trivial, but in other corpora the information might -# be less obvious. Later it will be needed for ctm scoring. -awk '{print $1}' $dir/wav_ori.scp \ - | perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; - print "$1-$2 $1 $2\n"; ' \ - > $dir/reco2file_and_channel || exit 1; - -awk '{spk=substr($1,1,9); print $1 " " spk}' $dir/segments > $dir/utt2spk \ - || exit 1; -sort -k 2 $dir/utt2spk | tools/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; - -# We assume each conversation side is a separate speaker. This is a very -# reasonable assumption for Switchboard. 
The actual speaker info file is at: -# http://www.ldc.upenn.edu/Catalog/desc/addenda/swb-multi-annot.summary - -# Copy stuff into its final locations [this has been moved from the format_data -# script] -mkdir -p data/train -for f in spk2utt utt2spk wav.scp text; do - cp data/local/train/$f data/train/$f || exit 1; -done - -if [ $# == 2 ]; then # fix speaker IDs - find $2 -name conv.tab > $dir/conv.tab - local/swbd1_fix_speakerid.pl `cat $dir/conv.tab` data/train - tools/utt2spk_to_spk2utt.pl data/train/utt2spk.new > data/train/spk2utt.new - # patch files - for f in spk2utt utt2spk text segments; do - cp data/train/$f data/train/$f.old || exit 1; - cp data/train/$f.new data/train/$f || exit 1; - done - rm $dir/conv.tab -fi - -echo Switchboard-1 data preparation succeeded. - -utils/fix_data_dir.sh data/train diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/swbd1_fix_speakerid.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/swbd1_fix_speakerid.pl deleted file mode 100644 index 785493928949377787d816af954ab34f31e73edb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/swbd1_fix_speakerid.pl +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter - -# Author: Peng Qi (pengqi@cs.stanford.edu) -# This script maps Switchboard speaker IDs to the true physical speakers -# and fixes the utterances IDs accordingly. Expected to be run one level of -# directory above. - -sub trim { - (my $s = $_[0]) =~ s/^\s+|\s+$//g; - return $s; -} - -if ($#ARGV != 1) { - print "Usage: swbd1_fix_speakerid.pl \n"; - print "E.g.: swbd1_fix_speakerid.pl /datasets/SWBD1Transcripts/tables/conv.tab data/train\n"; -} - -$tab_file = $ARGV[0]; -$dir = $ARGV[1]; - -%conv_to_spk = (); - -open(my $conv_tab, '<', $tab_file) or die "Could not open '$tab_file' $!\n"; - -while (my $line = <$conv_tab>) { - chomp $line; - - my @fields = split "," , $line; - #$fields[0] = trim($fields[0]); - $fields[2] = trim($fields[2]); - $fields[3] = trim($fields[3]); - $conv_to_spk{'sw0' . $fields[0] . '-A'} = $fields[2]; - $conv_to_spk{'sw0' . $fields[0] . '-B'} = $fields[3]; -} - -close($conv_tab); - -# fix utt2spk - -%missingconv = (); - -open(my $utt2spk, '<', $dir . '/utt2spk') or die "Could not open '$dir/utt2spk' $!\n"; -open(my $utt2spk_new, '>', $dir . '/utt2spk.new'); - -while (my $line = <$utt2spk>) { - chomp $line; - - my @fields = split " " , $line; - my $convid = substr $fields[0], 0, 9; - - if (exists $conv_to_spk{ $convid }) { - my $spkid = $conv_to_spk{ $convid }; - $spkid = "sw" . $spkid; - my $newuttid = $spkid . '-' . (substr $fields[0], 2); - - print $utt2spk_new "$newuttid $spkid\n"; - } else { - my $convid = substr $convid, 3, 4; - $missingconv{$convid} = 1; - - print $utt2spk_new $fields[0]." ".$fields[1]."\n"; - } -} - -close($utt2spk); -close($utt2spk_new); - -foreach my $conv (keys %missingconv) { - print "Warning: Conversation ID '$conv' not found in conv.tab, retaining old speaker IDs\n" -} - -# fix segments and text - -foreach my $file ('segments','text') { - open(my $oldfile, '<', "$dir/$file") or die "Could not open '$dir/$file' $!\n"; - open(my $newfile, '>', "$dir/$file.new"); - - while (my $line = <$oldfile>) { - chomp $line; - - my $convid = substr $line, 0, 9; - if (exists $conv_to_spk{$convid}) { - my $spkid = $conv_to_spk{$convid}; - print $newfile "sw$spkid-" . (substr $line, 2) . 
"\n"; - } else { - print $newfile "$line\n"; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/swbd1_map_words.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/swbd1_map_words.pl deleted file mode 100644 index 4fb8d4ffe7292121d3e25dd11c2afb0c9386ea0e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/swbd1_map_words.pl +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env perl - -# Modified from swbd_map_words.pl in Kaldi s5 recipe to make pattern -# matches case-insensitive --Arnab (Jan 2013) - -if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } -} - - -while (<>) { - @A = split(" ", $_); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ( (!defined $field_begin || $n >= $field_begin) - && (!defined $field_end || $n <= $field_end)) { - # e.g. [LAUGHTER-STORY] -> STORY; - $a =~ s:(|\-)^\[LAUGHTER-(.+)\](|\-)$:$1$2$3:i; - # $1 and $3 relate to preserving trailing "-" - $a =~ s:^\[(.+)/.+\](|\-)$:$1$2:; # e.g. [IT'N/ISN'T] -> IT'N ... note, - # 1st part may include partial-word stuff, which we process further below, - # e.g. [LEM[GUINI]-/LINGUINI] - # the (|\_) at the end is to accept and preserve trailing -'s. - $a =~ s:^(|\-)\[[^][]+\](.+)$:-$2:; # e.g. -[AN]Y , note \047 is quote; - # let the leading - be optional on input, as sometimes omitted. - $a =~ s:^(.+)\[[^][]+\](|\-)$:$1-:; # e.g. AB[SOLUTE]- -> AB-; - # let the trailing - be optional on input, as sometimes omitted. - $a =~ s:([^][]+)\[.+\]$:$1:; # e.g. EX[SPECIALLY]-/ESPECIALLY] -> EX- - # which is a mistake in the input. - $a =~ s:^\{(.+)\}$:$1:; # e.g. {YUPPIEDOM} -> YUPPIEDOM - $a =~ s:[A-Z]\[([^][])+\][A-Z]:$1-$3:i; # e.g. AMMU[N]IT- -> AMMU-IT- - $a =~ s:_\d$::; # e.g. THEM_1 -> THEM - } - $A[$n] = $a; - } - print join(" ", @A) . "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/swbd1_prepare_dict.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/swbd1_prepare_dict.sh deleted file mode 100644 index 8b5962d1f3698d2e77cbdf3140badc4a66e867fd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/local/swbd1_prepare_dict.sh +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env bash - -# Formatting the Mississippi State dictionary for use in Edinburgh. Differs -# from the one in Kaldi s5 recipe in that it uses lower-case --Arnab (Jan 2013) - -# To be run from one directory above this script. - -. ./path.sh - -#check existing directories -[ $# != 0 ] && echo "Usage: local/swbd1_data_prep.sh" && exit 1; - -srcdir=data/local/train # This is where we downloaded some stuff.. -dir=data/local/dict_nosp -mkdir -p $dir -srcdict=$srcdir/swb_ms98_transcriptions/sw-ms98-dict.text - -# assume swbd_p1_data_prep.sh was done already. -[ ! 
-f "$srcdict" ] && echo "$0: No such file $srcdict" && exit 1; - -cp $srcdict $dir/lexicon0.txt || exit 1; -patch 0' | sort > $dir/lexicon1.txt || exit 1; - -cat $dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \ - grep -v sil > $dir/nonsilence_phones.txt || exit 1; - -( echo sil; echo spn; echo nsn; echo lau ) > $dir/silence_phones.txt - -echo sil > $dir/optional_silence.txt - -# No "extra questions" in the input to this setup, as we don't -# have stress or tone. -echo -n >$dir/extra_questions.txt - -cp local/MSU_single_letter.txt $dir/ -# Add to the lexicon the silences, noises etc. -# Add single letter lexicon -# The original swbd lexicon does not have precise single letter lexicion -# e.g. it does not have entry of W -( echo '!sil sil'; echo '[vocalized-noise] spn'; echo '[noise] nsn'; \ - echo '[laughter] lau'; echo ' spn' ) \ - | cat - $dir/lexicon1.txt $dir/MSU_single_letter.txt > $dir/lexicon2.txt || exit 1; - -# Map the words in the lexicon. That is-- for each word in the lexicon, we map it -# to a new written form. The transformations we do are: -# remove laughter markings, e.g. -# [LAUGHTER-STORY] -> STORY -# Remove partial-words, e.g. -# -[40]1K W AH N K EY -# becomes -1K -# and -# -[AN]Y IY -# becomes -# -Y -# -[A]B[OUT]- B -# becomes -# -B- -# Also, curly braces, which appear to be used for "nonstandard" -# words or non-words, are removed, e.g. -# {WOLMANIZED} W OW L M AX N AY Z D -# -> WOLMANIZED -# Also, mispronounced words, e.g. -# [YEAM/YEAH] Y AE M -# are changed to just e.g. YEAM, i.e. the orthography -# of the mispronounced version. -# Note-- this is only really to be used in training. The main practical -# reason is to avoid having tons of disambiguation symbols, which -# we otherwise would get because there are many partial words with -# the same phone sequences (most problematic: S). -# Also, map -# THEM_1 EH M -> THEM -# so that multiple pronunciations just have alternate entries -# in the lexicon. - -local/swbd1_map_words.pl -f 1 $dir/lexicon2.txt | sort -u \ - > $dir/lexicon3.txt || exit 1; - -python3 local/format_acronyms_dict.py -i $dir/lexicon3.txt -o $dir/lexicon4.txt \ - -L $dir/MSU_single_letter.txt -M $dir/acronyms_raw.map -cat $dir/acronyms_raw.map | sort -u > $dir/acronyms.map - -( echo 'i ay' )| cat - $dir/lexicon4.txt | tr '[A-Z]' '[a-z]' | sort -u > $dir/lexicon5.txt - -pushd $dir >&/dev/null -ln -sf lexicon5.txt lexicon.txt # This is the final lexicon. -popd >&/dev/null -rm $dir/lexiconp.txt 2>/dev/null -echo Prepared input dictionary and phone-sets for Switchboard phase 1. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/path.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/path.sh deleted file mode 100644 index 73fc1c56602086182f66201870e28d46a0cada55..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export WENET_DIR=$PWD/../../.. 
-export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build -export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix -export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/run.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/run.sh deleted file mode 100644 index 9b94c8cc1d54bfb71d164e8fca8d4d5b421029b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/run.sh +++ /dev/null @@ -1,250 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -# Use this to control how many gpu you use, It's 1-gpu training if you specify -# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0,1" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO -stage=0 # start from 0 if you need to start from data preparation -stop_stage=5 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training -num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 - -nj=16 -feat_dir=raw_wav -data_type=shard # raw or shard -num_utts_per_shard=1000 -prefetch=100 -# bpemode (unigram or bpe) -nbpe=2000 -bpemode=bpe - -# data directory -swbd1_dir=/home/backup_nfs2/hlyu/swbd/LDC97S62 -eval2000_dir="/home/backup_nfs2/hlyu/swbd/LDC2002S09/hub5e_00 /home/backup_nfs2/hlyu/swbd/LDC2002T43" - -train_set=train_nodup -train_config=conf/train_conformer.yaml -cmvn=true -dir=exp/conformer -checkpoint= - -# use average_checkpoint will get better result -average_checkpoint=true -decode_checkpoint=$dir/final.pt -average_num=10 -decode_modes="ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring" - -. 
tools/parse_options.sh || exit 1; - - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Data preparation - local/swbd1_data_download.sh ${swbd1_dir} - local/swbd1_prepare_dict.sh - local/swbd1_data_prep.sh ${swbd1_dir} - local/eval2000_data_prep.sh ${eval2000_dir} - # process the train set by - # 1) convert lower to upper - # 2) remove ._._ -1 symbols from text - # 3) subset training set and dev set - # 4) remove duplicated utterances - cp data/train/text data/train/text.org - paste -d" " <(cut -f 1 -d" " data/train/text.org) \ - <(cut -f 2- -d" " data/train/text.org | tr "[:lower:]" "[:upper:]") > data/train/text - sed -i 's/\._/ /g; s/\.//g; s/THEM_1/THEM/g' data/train/text - tools/subset_data_dir.sh --first data/train 4000 data/train_dev # 5hr 6min - n=$(($(wc -l < data/train/text) - 4000)) - tools/subset_data_dir.sh --last data/train ${n} data/train_nodev - tools/data/remove_dup_utts.sh 300 data/train_nodev data/train_nodup - # process eval2000 set by - # 1) remove tags (%AH) (%HESITATION) (%UH) - # 2) remove - # 3) remove "(" or ")" - # 4) remove file with empty text - cp data/eval2000/text data/eval2000/text.org - paste -d "" \ - <(cut -f 1 -d" " data/eval2000/text.org) \ - <(awk '{$1=""; print toupper($0)}' data/eval2000/text.org \ - | perl -pe 's| \(\%.*\)||g' | perl -pe 's| \<.*\>||g' \ - | sed -e "s/(//g" -e "s/)//g") \ - | sed -e 's/\s\+/ /g' > data/eval2000/text.org2 - awk -F ' ' '{if(length($2) != 0) print $0}' data/eval2000/text.org2 > data/eval2000/text - tools/fix_data_dir.sh data/eval2000 -fi - - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # For wav feature, just copy the data. Fbank extraction is done in training - mkdir -p ${feat_dir} - for x in ${train_set} train_dev eval2000; do - cp -r data/${x} ${feat_dir} - done - tools/compute_cmvn_stats.py --num_workers 16 --train_config ${train_config} \ - --in_scp data/${train_set}/wav.scp \ - --out_cmvn ${feat_dir}/${train_set}/global_cmvn - -fi - -dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt -bpemodel=data/lang_char/${train_set}_${bpemode}${nbpe} -echo "dictionary: ${dict}" -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - ### Task dependent. You have to check non-linguistic symbols used in the corpus. - echo "stage 2: Dictionary and Json Data Preparation" - mkdir -p data/lang_char/ - - echo " 0" > ${dict} # 0 will be used for "blank" in CTC - echo " 1" >> ${dict} # must be 1 - - # we borrowed these code and scripts which are related bpe from ESPnet. 
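The two `echo` lines above reserve ids 0 and 1 (the comment marks id 0 as the CTC blank; id 1 is, by the usual WeNet convention, the unknown token), the pipeline below appends the sorted BPE pieces with `awk '{print $0 " " NR+1}'` so their ids start at 2, and the final token is added with an id equal to the dictionary's line count at that point. A minimal sketch of the resulting layout; the special-token spellings are an assumption based on the common WeNet convention, only the numbering comes from the commands here:

```python
# Hypothetical pieces standing in for the sorted sentencepiece output.
pieces = ["▁THE", "▁A", "ING", "▁TO"]

dict_lines = ["<blank> 0",   # assumed spelling; id 0 is the CTC blank (see echo above)
              "<unk> 1"]     # assumed spelling; id 1 is the reserved unknown token
dict_lines += [f"{p} {i + 2}" for i, p in enumerate(pieces)]   # pieces get ids 2..N+1
dict_lines.append(f"<sos/eos> {len(dict_lines)}")              # assumed spelling; id = line count so far

print("\n".join(dict_lines))
```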
- cut -f 2- -d" " data/${train_set}/text > data/lang_char/input.txt - - tools/spm_train --input=data/lang_char/input.txt \ - --vocab_size=${nbpe} \ - --character_coverage=1.0 \ - --model_type=${bpemode} \ - --model_prefix=${bpemodel} \ - --input_sentence_size=100000000 \ - --user_defined_symbols="[LAUGHTER],[NOISE],[VOCALIZED-NOISE]" - tools/spm_encode --model=${bpemodel}.model \ - --output_format=piece < data/lang_char/input.txt | \ - tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict} - - num_token=$(cat ${dict} | wc -l) - echo " ${num_token}" >> ${dict} # - wc -l ${dict} -fi - - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - echo "Prepare data, prepare required format" - for x in ${train_set} train_dev eval2000; do - if [ ${data_type} == "shard" ]; then - tools/make_shard_list.py --num_utts_per_shard ${num_utts_per_shard} \ - --num_threads ${nj} ${feat_dir}/${x}/wav.scp ${feat_dir}/${x}/text \ - $(realpath ${feat_dir}/${x}/shards) ${feat_dir}/${x}/data.list - else - tools/make_raw_list.py ${feat_dir}/${x}/wav.scp ${feat_dir}/${x}/text \ - ${feat_dir}/${x}/data.list - fi - done -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - # Training - mkdir -p ${dir} - INIT_FILE=${dir}/ddp_init - # You had better rm it manually before you start run.sh on first node. - # rm -f $INIT_FILE # delete old one before starting - init_method=file://$(readlink -f ${INIT_FILE}) - echo "$0: init method is $init_method" - # The number of gpus runing on each node/machine - num_gpus=$(echo ${CUDA_VISIBLE_DEVICES} | awk -F "," '{print NF}') - # Use "nccl" if it works, otherwise use "gloo" - dist_backend="nccl" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr ${num_gpus} \* ${num_nodes}` - echo "total gpus is: ${world_size}" - cmvn_opts= - ${cmvn} && cp ${feat_dir}/${train_set}/global_cmvn ${dir} - ${cmvn} && cmvn_opts="--cmvn ${dir}/global_cmvn" - # train.py will write $train_config to $dir/train.yaml with model input - # and output dimension, train.yaml will be used for inference or model - # export later - for ((i = 0; i < ${num_gpus}; ++i)); do - { - gpu_id=$(echo ${CUDA_VISIBLE_DEVICES} | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr ${node_rank} \* ${num_gpus} + ${i}` - python wenet/bin/train.py --gpu ${gpu_id} \ - --config ${train_config} \ - --data_type ${data_type} \ - --symbol_table ${dict} \ - --prefetch ${prefetch} \ - --bpe_model ${bpemodel}.model \ - --train_data ${feat_dir}/${train_set}/data.list \ - --cv_data ${feat_dir}/train_dev/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir ${dir} \ - --ddp.init_method ${init_method} \ - --ddp.world_size ${world_size} \ - --ddp.rank ${rank} \ - --ddp.dist_backend ${dist_backend} \ - --num_workers 4 \ - ${cmvn_opts} \ - --pin_memory - } & - done - wait -fi - -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # Test model, please specify the model you want to test by --checkpoint - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=${dir}/avg_${average_num}.pt - echo "do model average and final checkpoint is ${decode_checkpoint}" - python wenet/bin/average_model.py \ - --dst_model ${decode_checkpoint} \ - --src_path $dir \ - --num ${average_num} \ - --val_best - fi - # Specify decoding_chunk_size if it's a unified dynamic chunk trained model - # -1 for full chunk - decoding_chunk_size= - ctc_weight=0.5 - reverse_weight=0.0 - for mode in ${decode_modes}; do - { - test_dir=${dir}/test_${mode} - mkdir -p ${test_dir} - python wenet/bin/recognize.py --gpu 0 \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type $data_type \ - --test_data $feat_dir/eval2000/data.list \ - --checkpoint $decode_checkpoint \ - --beam_size 10 \ - --batch_size 1 \ - --penalty 0.0 \ - --dict $dict \ - --bpe_model $bpemodel.model \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --result_file $test_dir/text \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - sed -i.bak -r 's/ //g' ${test_dir}/text - mv ${test_dir}/text ${test_dir}/text.bak2 - tools/spm_decode --model=${bpemodel}.model --input_format=piece \ - < ${test_dir}/text.bak2 | sed -e "s/▁/ /g" > ${test_dir}/text - python tools/compute-wer.py --char=1 --v=1 \ - $feat_dir/eval2000/text $test_dir/text > $test_dir/wer - } - done - wait -fi - -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # Export the best model you want - python wenet/bin/export_jit.py \ - --config $dir/train.yaml \ - --checkpoint $dir/avg_${average_num}.pt \ - --output_file $dir/final.zip -fi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/alignment.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/alignment.sh deleted file mode 100644 index 64d860bb61761cadca750c9baf91eddb49e56728..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/alignment.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -stage=0 # start from 0 if you need to start from data preparation -stop_stage=0 - -nj=16 -feat_dir=raw_wav -dict=data/dict/lang_char.txt - -dir=exp/ -config=$dir/train.yaml -checkpoint= -checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt -config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml -set= -ali_format=$feat_dir/$set/format.data -ali_format=format.data -ali_result=$dir/ali - -. 
tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - nj=32 - # Prepare required data for ctc alignment - echo "Prepare data, prepare required format" - for x in $set; do - tools/format_data.sh --nj ${nj} \ - --feat-type wav --feat $feat_dir/$x/wav.scp \ - $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp - - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Test model, please specify the model you want to use by --checkpoint - python wenet/bin/alignment_deprecated.py --gpu -1 \ - --config $config \ - --input_file $ali_format \ - --checkpoint $checkpoint \ - --batch_size 1 \ - --dict $dict \ - --result_file $ali_result \ - -fi - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/analyze_dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/analyze_dataset.py deleted file mode 100644 index d4373b065c301972fe0164b6df3591166000acfc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/analyze_dataset.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Analyze Dataset, Duration/TextLength/Speed etc. - -Usage: -. ./path.sh && python3 tools/analyze_dataset.py \ - --data_type "shard" \ - --data_list data/test/data.list \ - --output_dir exp/analyze_test \ - --num_thread 32 -""" - -import os -import json -import math -import time -import numpy -import logging -import librosa -import tarfile -import argparse -import torchaudio -import multiprocessing - -from wenet.utils.file_utils import read_lists -from wenet.dataset.processor import AUDIO_FORMAT_SETS - - -def get_args(): - parser = argparse.ArgumentParser(description='Analyze dataset') - parser.add_argument('--data_type', - default='wav_scp', - choices=['wav_scp', 'raw', 'shard'], - help='dataset type') - parser.add_argument('--output_dir', type=str, - default="exp", help='write info to output dir') - parser.add_argument('--data_list', default=None, - help='used in raw/shard mode') - parser.add_argument('--wav_scp', default=None, - help='used in wav_scp mode') - parser.add_argument('--text', default=None, - help='used in wav_scp mode') - parser.add_argument('--num_thread', type=int, - default=4, help='number of threads') - args = parser.parse_args() - print(args) - return args - - -def analyze(datas, output_file, thread_id): - with open(output_file, "w", encoding='utf8') as f: - for i, data in enumerate(datas): - if type(data['wav']) is numpy.ndarray: - y, sample_rate = data['wav'], data['sample_rate'] - data['wav'] = "None" # NOTE(xcsong): Do not save wav. 
- elif type(data['wav'] is str): - y, sample_rate = librosa.load(data['wav'], sr=16000) - data['dur'] = len(y) / sample_rate - data['txt_length'] = len(data['txt']) - data['speed'] = data['txt_length'] / data['dur'] - # Trim the beginning and ending silence - _, index = librosa.effects.trim(y, top_db=30) - data['leading_sil'] = librosa.get_duration( - y=y[:index[0]], sr=16000) * 1000 if index[0] > 0 else 0 - data['trailing_sil'] = librosa.get_duration( - y=y[index[1]:], sr=16000) * 1000 if index[1] < len(y) else 0 - data_str = json.dumps(data, ensure_ascii=False) - f.write("{}\n".format(data_str)) - if thread_id == 0 and i % 100 == 0: - logging.info("\tThread-{}: processed {}/{}".format( - thread_id, i, len(datas))) - - -def read_tar(file): - try: - with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream: - prev_prefix = None - data = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - data['key'] = prev_prefix - if valid: - yield data - data = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - data['txt'] = file_obj.read().decode( - 'utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load( - file_obj) - # single channel - data['wav'] = waveform.numpy()[0, :] - data['sample_rate'] = sample_rate - else: - data[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning( - 'error: {} when parse {}'.format(ex, name)) - prev_prefix = prefix - # The last data in tar - if prev_prefix is not None: - data['key'] = prev_prefix - yield data - except Exception as ex: - logging.warning( - 'tar_file error: {} when processing {}'.format(ex, file)) - - -def main(): - start_time = time.time() - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.makedirs(args.output_dir, exist_ok=True) - os.makedirs(args.output_dir + "/partition", exist_ok=True) - datas = [[] for i in range(args.num_thread)] - - logging.info("Stage-1: Loading data.list OR wav.scp...") - if args.data_type == "shard": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - total = 0 - for line in lists: - for data in read_tar(line): - datas[total % args.num_thread].append(data) - total = total + 1 - elif args.data_type == "raw": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - for i, line in enumerate(lists): - data = json.loads(line) - datas[i % args.num_thread].append(data) - elif args.data_type == "wav_scp": - assert args.wav_scp is not None - assert args.text is not None - wavs, texts = {}, {} - # wavs - for line in read_lists(args.wav_scp): - line = line.strip().split() - wavs[line[0]] = line[1] - # texts - for line in read_lists(args.text): - line = line.strip().split(maxsplit=1) - texts[line[0]] = line[1] - sorted(wavs) - sorted(texts) - # partition - for i, (key1, key2) in enumerate(zip(wavs, texts)): - assert key1 == key2 - datas[i % args.num_thread].append( - {'key': key1, "wav": wavs[key1], "txt": texts[key1]} - ) - - logging.info("Stage-2: Start Analyze") - # threads - pool = multiprocessing.Pool(processes=args.num_thread) - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - pool.apply_async(analyze, (datas[i], output_file, i)) - pool.close() - 
pool.join() - - logging.info("Stage-3: Sort and Write Result") - datas = [] - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - with open(output_file, "r", encoding='utf8') as f: - for line in f.readlines(): - data = json.loads(line) - datas.append(data) - total_dur = sum([x['dur'] for x in datas]) - total_len = sum([x['txt_length'] for x in datas]) - total_leading_sil = sum([x['leading_sil'] for x in datas]) - total_trailing_sil = sum([x['trailing_sil'] for x in datas]) - num_datas = len(datas) - names = ['key', 'dur', 'txt_length', 'speed', - 'leading_sil', 'trailing_sil'] - units = ['', 's', '', 'char/s', 'ms', 'ms'] - avgs = [0, total_dur / num_datas, total_len / num_datas, - total_len / total_dur, total_leading_sil / num_datas, - total_trailing_sil / num_datas] - stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]), - sum([(x['txt_length'] - avgs[2])**2 for x in datas]), - sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]), - sum([(x['leading_sil'] - avgs[4])**2 for x in datas]), - sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])] - stds = [math.sqrt(x / num_datas) for x in stds] - parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min'] - index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - - with open(args.output_dir + "/analyze_result_brief", - "w", encoding='utf8') as f: - for i, (name, unit, avg, std) in enumerate( - zip(names, units, avgs, stds)): - if name == 'key': - continue - f.write("==================\n") - - datas.sort(key=lambda x: x[name]) - for p, j in zip(parts, index): - f.write("{} {}: {:.3f} {} (wav_id: {})\n".format( - p, name, datas[j][name], unit, datas[j]['key'])) - f.write("avg {}: {:.3f} {}\n".format( - name, avg, unit)) - f.write("std {}: {:.3f}\n".format( - name, std)) - os.system("cat {}".format(args.output_dir + "/analyze_result_brief")) - - datas.sort(key=lambda x: x['dur']) - with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f: - for data in datas: - f.write("{}\n".format(json.dumps(data, ensure_ascii=False))) - - end_time = time.time() - logging.info("Time Cost: {:.3f}s".format(end_time - start_time)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/cmvn_kaldi2json.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/cmvn_kaldi2json.py deleted file mode 100644 index 9966046c95a9d50438c4857b785cb7985182e376..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/cmvn_kaldi2json.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import logging -import sys -import json - -def kaldi2json(kaldi_cmvn_file): - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - cmvn_info = {'mean_stat:' : 
means, - 'var_stat' : variance, - 'frame_num' : count} - return cmvn_info - -if __name__ == '__main__': - with open(sys.argv[2], 'w') as fout: - cmvn = kaldi2json(sys.argv[1]) - fout.write(json.dumps(cmvn)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/combine_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/combine_data.sh deleted file mode 100644 index 8a56c43f1a2a238d78270f94f3d22f1af540e912..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/combine_data.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2014 David Snyder - -# This script combines the data from multiple source directories into -# a single destination directory. - -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information -# about what these directories contain. - -# Begin configuration section. -extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." -skip_fix=false # skip the fix_data_dir.sh in the end -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi - -if [ $# -lt 2 ]; then - echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." - echo "Note, files that don't appear in all source dirs will not be combined," - echo "with the exception of utt2uniq and segments, which are created where necessary." - exit 1 -fi - -dest=$1; -shift; - -first_src=$1; - -rm -r $dest 2>/dev/null -mkdir -p $dest; - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/utt2spk ]; then - echo "$0: no such file $dir/utt2spk" - exit 1; - fi -done - -# Check that frame_shift are compatible, where present together with features. -dir_with_frame_shift= -for dir in $*; do - if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then - if [[ $dir_with_frame_shift ]] && - ! cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then - echo "$0:error: different frame_shift in directories $dir and " \ - "$dir_with_frame_shift. Cannot combine features." - exit 1; - fi - dir_with_frame_shift=$dir - fi -done - -# W.r.t. utt2uniq file the script has different behavior compared to other files -# it is not compulsary for it to exist in src directories, but if it exists in -# even one it should exist in all. We will create the files where necessary -has_utt2uniq=false -for in_dir in $*; do - if [ -f $in_dir/utt2uniq ]; then - has_utt2uniq=true - break - fi -done - -if $has_utt2uniq; then - # we are going to create an utt2uniq file in the destdir - for in_dir in $*; do - if [ ! -f $in_dir/utt2uniq ]; then - # we assume that utt2uniq is a one to one mapping - cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' - else - cat $in_dir/utt2uniq - fi - done | sort -k1 > $dest/utt2uniq - echo "$0: combined utt2uniq" -else - echo "$0 [info]: not combining utt2uniq as it does not exist" -fi -# some of the old scripts might provide utt2uniq as an extrafile, so just remove it -extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") - -# segments are treated similarly to utt2uniq. If it exists in some, but not all -# src directories, then we generate segments where necessary. 
-has_segments=false -for in_dir in $*; do - if [ -f $in_dir/segments ]; then - has_segments=true - break - fi -done - -if $has_segments; then - for in_dir in $*; do - if [ ! -f $in_dir/segments ]; then - echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 - utils/data/get_segments_for_data.sh $in_dir - else - cat $in_dir/segments - fi - done | sort -k1 > $dest/segments - echo "$0: combined segments" -else - echo "$0 [info]: not combining segments as it does not exist" -fi - -for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do - exists_somewhere=false - absent_somewhere=false - for d in $*; do - if [ -f $d/$file ]; then - exists_somewhere=true - else - absent_somewhere=true - fi - done - - if ! $absent_somewhere; then - set -o pipefail - ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; - set +o pipefail - echo "$0: combined $file" - else - if ! $exists_somewhere; then - echo "$0 [info]: not combining $file as it does not exist" - else - echo "$0 [info]: **not combining $file as it does not exist everywhere**" - fi - fi -done - -tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt - -if [[ $dir_with_frame_shift ]]; then - cp $dir_with_frame_shift/frame_shift $dest -fi - -if ! $skip_fix ; then - tools/fix_data_dir.sh $dest || exit 1; -fi - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/compute-cer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/compute-cer.py deleted file mode 100644 index a0a8f8fe1f59251c5d8fefeb62ef469276fc6063..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/compute-cer.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import sys -import unicodedata -import codecs - -remove_tag = True -spacelist = [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - # https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': - sep = '>' - j = i + 1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c == sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: - return '' - chars = [] - i = 0 - T = len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - if x.isalnum(): - for k in x: - new_sentence.append(k) - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i - 1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j - 1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, - 'ins': 0, 'del': 0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = 
result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i={i} , j={j} , \ - error={error}'. - format(i=i, j=j, error=self.space[i][j]['error'])) - return result - - def overall(self) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def cluster(self, data) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [unicodedata.name(char) for char in word] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names) - 1) : - if unicode_names[i] != unicode_names[i + 1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) \ - and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] \ - [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \ - [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose = 1 - padding_symbol = ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose = 0 - try: - verbose = int(b) - except Exception: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol = ' ' - elif b == 'underline': - padding_symbol = '_' - continue - if True or sys.argv[1].startswith('-'): - # ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig = set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, - case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : 
- if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array) == 0: - continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length - len_lab) - space['rec'].append(length - len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end=' ') - else: - print('lab:', end=' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['lab'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end=' ') - else: - print('rec:', end=' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['rec'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===================================================' - '========================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster(k for k in default_clusters[cluster_id]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 
100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif (token[0] == '<' and token[len(token) - 1] == '>' and - cluster_id == ''): - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('=======================================' - '====================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/compute-wer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/compute-wer.py deleted file mode 100644 index a3eefc0dc7b67f252e685da71a5189312e74ef85..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/compute-wer.py +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import re, sys, unicodedata -import codecs - -remove_tag = True -spacelist= [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - #https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. - sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} 
- for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) - -def width(string): - return 
sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . / - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if 
sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array)==0: continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - 
print('===========================================================================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster([ k for k in default_clusters[cluster_id] ]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif token[0] == '<' and token[len(token)-1] == '>' and \ - cluster_id == '' : - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... 
- else : - cluster.append(token) - print() - print('===========================================================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/compute_cmvn_stats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/compute_cmvn_stats.py deleted file mode 100644 index 9c89789c47be0c855939469e86040f10398e9d89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/compute_cmvn_stats.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys -import argparse -import json -import codecs -import yaml - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.utils.data import Dataset, DataLoader - -torchaudio.set_audio_backend("sox_io") - - -class CollateFunc(object): - ''' Collate function for AudioDataset - ''' - - def __init__(self, feat_dim, resample_rate): - self.feat_dim = feat_dim - self.resample_rate = resample_rate - pass - - def __call__(self, batch): - mean_stat = torch.zeros(self.feat_dim) - var_stat = torch.zeros(self.feat_dim) - number = 0 - for item in batch: - value = item[1].strip().split(",") - assert len(value) == 3 or len(value) == 1 - wav_path = value[0] - sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate - resample_rate = sample_rate - # len(value) == 3 means segmented wav.scp, - # len(value) == 1 means original wav.scp - if len(value) == 3: - start_frame = int(float(value[1]) * sample_rate) - end_frame = int(float(value[2]) * sample_rate) - waveform, sample_rate = torchaudio.backend.sox_io_backend.load( - filepath=wav_path, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(item[1]) - - waveform = waveform * (1 << 15) - if self.resample_rate != 0 and self.resample_rate != sample_rate: - resample_rate = self.resample_rate - waveform = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - - mat = kaldi.fbank(waveform, - num_mel_bins=self.feat_dim, - dither=0.0, - energy_floor=0.0, - sample_frequency=resample_rate) - mean_stat += torch.sum(mat, axis=0) - var_stat += torch.sum(torch.square(mat), axis=0) - number += mat.shape[0] - return number, mean_stat, var_stat - - -class AudioDataset(Dataset): - def __init__(self, data_file): - self.items = [] - with codecs.open(data_file, 'r', encoding='utf-8') as f: - for line in f: - arr = line.strip().split() - self.items.append((arr[0], arr[1])) - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - return self.items[idx] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='extract CMVN stats') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for processing') - parser.add_argument('--train_config', - default='', - help='training yaml conf') - parser.add_argument('--in_scp', default=None, help='wav scp file') - parser.add_argument('--out_cmvn', - default='global_cmvn', - help='global cmvn file') - - doc = "Print log after every log_interval audios are processed." 
- parser.add_argument("--log_interval", type=int, default=1000, help=doc) - args = parser.parse_args() - - with open(args.train_config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - resample_rate = 0 - if 'resample_conf' in configs['dataset_conf']: - resample_rate = configs['dataset_conf']['resample_conf']['resample_rate'] - print('using resample and new sample rate is {}'.format(resample_rate)) - - collate_func = CollateFunc(feat_dim, resample_rate) - dataset = AudioDataset(args.in_scp) - batch_size = 20 - data_loader = DataLoader(dataset, - batch_size=batch_size, - shuffle=True, - sampler=None, - num_workers=args.num_workers, - collate_fn=collate_func) - - with torch.no_grad(): - all_number = 0 - all_mean_stat = torch.zeros(feat_dim) - all_var_stat = torch.zeros(feat_dim) - wav_number = 0 - for i, batch in enumerate(data_loader): - number, mean_stat, var_stat = batch - all_mean_stat += mean_stat - all_var_stat += var_stat - all_number += number - wav_number += batch_size - - if wav_number % args.log_interval == 0: - print(f'processed {wav_number} wavs, {all_number} frames', - file=sys.stderr, - flush=True) - - cmvn_info = { - 'mean_stat': list(all_mean_stat.tolist()), - 'var_stat': list(all_var_stat.tolist()), - 'frame_num': all_number - } - - with open(args.out_cmvn, 'w') as fout: - fout.write(json.dumps(cmvn_info)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/compute_fbank_feats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/compute_fbank_feats.py deleted file mode 100644 index 4cc7dae54de6e8b24b14148bd3930d19b4d7b28c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/compute_fbank_feats.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging - -import torchaudio -import torchaudio.compliance.kaldi as kaldi - -import wenet.dataset.kaldi_io as kaldi_io - -# The "sox" backends are deprecated and will be removed in 0.9.0 release. 
-# So here we use sox_io backend -torchaudio.set_audio_backend("sox_io") - - -def parse_opts(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--num_mel_bins', - default=80, - type=int, - help='Number of triangular mel-frequency bins') - parser.add_argument('--frame_length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument('--frame_shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument('--dither', - type=int, - default=0.0, - help='Dithering constant (0.0 means no dither)') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_scp', help='wav scp file') - parser.add_argument('out_ark', help='output ark file') - parser.add_argument('out_scp', help='output scp file') - args = parser.parse_args() - return args - - -# wav format: -def load_wav_scp(wav_scp_file): - wav_list = [] - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_list.append((arr[0], arr[1])) - return wav_list - - -# wav format: -def load_wav_scp_dict(wav_scp_file): - wav_dict = {} - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_dict[arr[0]] = arr[1] - return wav_dict - - -# Segments format: -def load_wav_segments(wav_scp_file, segments_file): - wav_dict = load_wav_scp_dict(wav_scp_file) - audio_list = [] - with open(segments_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - key = arr[0] - wav_file = wav_dict[arr[1]] - start = float(arr[2]) - end = float(arr[3]) - audio_list.append((key, wav_file, start, end)) - return audio_list - - -if __name__ == '__main__': - args = parse_opts() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - if args.segments is None: - audio_list = load_wav_scp(args.wav_scp) - else: - audio_list = load_wav_segments(args.wav_scp, args.segments) - - count = 0 - with open(args.out_ark, 'wb') as ark_fout, \ - open(args.out_scp, 'w', encoding='utf8') as scp_fout: - for item in audio_list: - if len(item) == 2: - key, wav_path = item - waveform, sample_rate = torchaudio.load_wav(wav_path) - else: - assert len(item) == 4 - key, wav_path, start, end = item - sample_rate = torchaudio.info(wav_path).sample_rate - frame_offset = int(start * sample_rate) - num_frames = int((end - start) * sample_rate) - waveform, sample_rate = torchaudio.load_wav( - wav_path, frame_offset, num_frames) - - mat = kaldi.fbank(waveform, - num_mel_bins=args.num_mel_bins, - frame_length=args.frame_length, - frame_shift=args.frame_shift, - dither=args.dither, - energy_floor=0.0, - sample_frequency=sample_rate) - mat = mat.detach().numpy() - kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout) - count += 1 - if count % 10000 == 0: - logging.info('Progress {}/{}'.format(count, len(audio_list))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/copy_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/copy_data_dir.sh deleted file mode 100644 index ee880c4c3ca398a58a4e306467c639b0a76310bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/copy_data_dir.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) 
-# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# feats.scp -# wav.scp -# vad.scp -# spk2utt -# utt2spk -# text -# -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance and/or speaker names. Note, the recording-ids stay the same. -# - - -# begin configuration section -spk_prefix= -utt_prefix= -spk_suffix= -utt_suffix= -validate_opts= # should rarely be needed. -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" - echo "Options" - echo " --spk-prefix= # Prefix for speaker ids, default empty" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --spk-suffix= # Suffix for speaker ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -srcdir=$1 -destdir=$2 - -if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" - exit 1; -fi - -if [ "$destdir" == "$srcdir" ]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -set -e; - -mkdir -p $destdir - -cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map -cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map - -if [ ! -f $srcdir/utt2uniq ]; then - if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then - cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq - fi -else - cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq -fi - -cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ - utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk - -utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt - -if [ -f $srcdir/feats.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp -fi - -if [ -f $srcdir/vad.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp -fi - -if [ -f $srcdir/segments ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments - cp $srcdir/wav.scp $destdir -else # no segments->wav indexed by utt. 
- if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/utt2num_frames ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in frame_shift stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -echo $validate_opts -echo $destdir -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/decode.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/decode.sh deleted file mode 100644 index 1d49b0e48631f4818fb9c464df66904170275a33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/decode.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: binbinzhang@mobvoi.com (Binbin Zhang) -export GLOG_logtostderr=1 -export GLOG_v=2 - -set -e - -nj=1 -chunk_size=-1 -ctc_weight=0.0 -reverse_weight=0.0 -rescoring_weight=1.0 -# For CTC WFST based decoding -fst_path= -dict_path= -acoustic_scale=1.0 -beam=15.0 -lattice_beam=12.0 -min_active=200 -max_active=7000 -blank_skip_thresh=1.0 -length_penalty=0.0 - -. tools/parse_options.sh || exit 1; -if [ $# != 5 ]; then - echo "Usage: $0 [options] " - exit 1; -fi - -if ! which decoder_main > /dev/null; then - echo "decoder_main is not built, please go to runtime/libtorch to build it." - exit 1; -fi - -scp=$1 -label_file=$2 -model_file=$3 -unit_file=$4 -dir=$5 - -mkdir -p $dir/split${nj} - -# Step 1. Split wav.scp -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp" -done -tools/data/split_scp.pl ${scp} ${split_scps} - -# Step 2. Parallel decoding -wfst_decode_opts= -if [ ! 
-z $fst_path ]; then - wfst_decode_opts="--fst_path $fst_path" - wfst_decode_opts="$wfst_decode_opts --beam $beam" - wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path" - wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam" - wfst_decode_opts="$wfst_decode_opts --max_active $max_active" - wfst_decode_opts="$wfst_decode_opts --min_active $min_active" - wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale" - wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh" - wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty" - echo $wfst_decode_opts > $dir/config -fi -for n in $(seq ${nj}); do -{ - decoder_main \ - --rescoring_weight $rescoring_weight \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --chunk_size $chunk_size \ - --wav_scp ${dir}/split${nj}/wav.${n}.scp \ - --model_path $model_file \ - --unit_path $unit_file \ - $wfst_decode_opts \ - --result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log -} & -done -wait - -# Step 3. Merge files -for n in $(seq ${nj}); do - cat ${dir}/split${nj}/${n}.text -done > ${dir}/text -tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf - -# Step 4. Compute WER -python3 tools/compute-wer.py --char=1 --v=1 \ - $label_file $dir/text > $dir/wer diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/feat_to_shape.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/feat_to_shape.sh deleted file mode 100644 index ab6d45c60709dd05a38f8da269d617233d0d39f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/feat_to_shape.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Begin configuration section. -nj=4 -cmd=run.pl -verbose=0 -filetype="" -preprocess_conf="" -# End configuration section. - -help_message=$(cat << EOF -Usage: $0 [options] [] -e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) - -echo "$0 $*" 1>&2 # Print the command line for logging - -. parse_options.sh || exit 1; - -if [ $# -lt 2 ] || [ $# -gt 3 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -scp=$1 -outscp=$2 -data=$(dirname ${scp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${logdir}/feats.${n}.scp" -done - -utils/split_scp.pl ${scp} ${split_scps} - -if [ -n "${preprocess_conf}" ]; then - preprocess_opt="--preprocess-conf ${preprocess_conf}" -else - preprocess_opt="" -fi -if [ -n "${filetype}" ]; then - filetype_opt="--filetype ${filetype}" -else - filetype_opt="" -fi - -${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ - feat-to-len --verbose=${verbose} \ - scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp - -feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -) - -# concatenate the .scp files together. 
-for n in $(seq ${nj}); do - sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp -done > ${outscp} - -rm -f ${logdir}/feats.*.scp 2>/dev/null diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/filter_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/filter_scp.pl deleted file mode 100644 index b76d37f41be0886470281978bfacf97f6b8ae976..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/filter_scp.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose "n-th" field is an utterance id), printing -# out only those lines whose "n-th" field is in id_list. The index of -# the "n-th" field is 1, by default, but can be changed by using -# the -f switch - -$exclude = 0; -$field = 1; -$shifted = 0; - -do { - $shifted=0; - if ($ARGV[0] eq "--exclude") { - $exclude = 1; - shift @ARGV; - $shifted=1; - } - if ($ARGV[0] eq "-f") { - $field = $ARGV[1]; - shift @ARGV; shift @ARGV; - $shifted=1 - } -} while ($shifted); - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . - "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . - "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . - "only the lines that were *not* in id_list.\n" . - "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . - "If your older scripts (written before Oct 2014) stopped working and you used the\n" . - "-f option, add 1 to the argument.\n" . - "See also: utils/filter_scp.pl .\n"; -} - - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -if ($field == 1) { # Treat this as special case, since it is common. - while(<>) { - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { - print $_; - } - } -} else { - while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { - print $_; - } - } -} - -# tests: -# the following should print "foo 1" -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) -# the following should print "bar 2". 
-# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fix_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fix_data_dir.sh deleted file mode 100644 index d1644c1cac4264c78eae7d91b03c4126baf7ec4c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fix_data_dir.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# This script makes sure that only the segments present in -# all of "feats.scp", "wav.scp" [if present], segments [if present] -# text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into -# data-dir/.backup - -cmd="$@" - -utt_extra_files= -spk_extra_files= - -. tools/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: utils/data/fix_data_dir.sh " - echo "e.g.: utils/data/fix_data_dir.sh data/train" - echo "This script helps ensure that the various files in a data directory" - echo "are correctly sorted and filtered, for example removing utterances" - echo "that have no features (if feats.scp is present)" - exit 1 -fi - -data=$1 - -if [ -f $data/images.scp ]; then - image/fix_data_dir.sh $cmd - exit $? -fi - -mkdir -p $data/.backup - -[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; - -[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; - -set -e -o pipefail -u - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted { - file=$1 - sort -k1,1 -u <$file >$file.tmp - if ! cmp -s $file $file.tmp; then - echo "$0: file $1 is not in sorted order or not unique, sorting it" - mv $file.tmp $file - else - rm $file.tmp - fi -} - -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - check_sorted $data/$x - fi -done - - -function filter_file { - filter=$1 - file_to_filter=$2 - cp $file_to_filter ${file_to_filter}.tmp - tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter - if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=$(cat ${file_to_filter}.tmp | wc -l) - length2=$(cat ${file_to_filter} | wc -l) - if [ $length1 -ne $length2 ]; then - echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." - fi - fi - rm $file_to_filter.tmp -} - -function filter_recordings { - # We call this once before the stage when we filter on utterance-id, and once - # after. - - if [ -f $data/segments ]; then - # We have a segments file -> we need to filter this and the file wav.scp, and - # reco2file_and_utt, if it exists, to make sure they have the same list of - # recording-ids. - - if [ ! -f $data/wav.scp ]; then - echo "$0: $data/segments exists but not $data/wav.scp" - exit 1; - fi - awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=$(cat $tmpdir/recordings | wc -l) - [ ! -s $tmpdir/recordings ] && \ - echo "Empty list of recordings (bad file $data/segments)?" 
&& exit 1; - tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp - mv $tmpdir/recordings.tmp $tmpdir/recordings - - - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - filter_file $tmpdir/recordings $data/segments - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - rm $data/segments.tmp - - filter_file $tmpdir/recordings $data/wav.scp - [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur - true - fi -} - -function filter_speakers { - # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... - tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do - f=$data/$s - if [ -f $f ]; then - filter_file $f $tmpdir/speakers - fi - done - - filter_file $tmpdir/speakers $data/spk2utt - tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - - for s in cmvn.scp spk2gender $spk_extra_files; do - f=$data/$s - if [ -f $f ]; then - filter_file $tmpdir/speakers $f - fi - done -} - -function filter_utts { - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - echo "$(cat $tmpdir/utts | wc -l)" - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; - - ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; - - ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ - echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - - if [ -f $data/utt2uniq ]; then - ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ - echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; - fi - - maybe_wav= - maybe_reco2dur= - [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. - [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - - maybe_utt2dur= - if [ -f $data/utt2dur ]; then - cat $data/utt2dur | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1 - maybe_utt2dur=utt2dur.ok - fi - - maybe_utt2num_frames= - if [ -f $data/utt2num_frames ]; then - cat $data/utt2num_frames | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1 - maybe_utt2num_frames=utt2num_frames.ok - fi - - for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do - if [ -f $data/$x ]; then - tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp - echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)" - mv $tmpdir/utts.tmp $tmpdir/utts - # echo "$tmpdir/utts" - fi - done - rm $data/utt2dur.ok 2>/dev/null || true - rm $data/utt2num_frames.ok 2>/dev/null || true - - [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ - rm $tmpdir/utts && exit 1; - - - if [ -f $data/utt2spk ]; then - new_nutts=$(cat $tmpdir/utts | wc -l) - old_nutts=$(cat $data/utt2spk | wc -l) - if [ $new_nutts -ne $old_nutts ]; then - echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" - else - echo "fix_data_dir.sh: kept all $old_nutts utterances." 
- fi - fi - - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then - tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x - fi - fi - done - -} - -filter_recordings -filter_speakers -filter_utts -filter_speakers -filter_recordings - -tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - -echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/flake8_hook.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/flake8_hook.py deleted file mode 100644 index bbe21bf4aa8ab460aca0eba5a24785e4d6b2c39d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -import sys - -from flake8.main import git - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/format_data.sh deleted file mode 100644 index 51f4602dfa0bac7873541c7f621ef4bb9eb29c94..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/format_data.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Mobvoi Corporation (Author: Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -echo "$0 $*" >&2 # Print the command line for logging -. ./path.sh - -nj=1 -cmd=run.pl -nlsyms="" -lang="" -feat="" -feat_type="kaldi" -oov="" -bpecode="" -allow_one_column=false -raw="" -verbose=0 -trans_type=char -filetype="" -preprocess_conf="" -category="" -out="" # If omitted, write in stdout -help_message=$(cat << EOF -Usage: $0 -e.g. $0 data/train data/lang_1char/train_units.txt -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --feat # feat.scp or feat1.scp,feat2.scp,... - --feat-type # kaldi or wav - --oov # Default: - --out # If omitted, write in stdout - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) -. tools/parse_options.sh - -if [ $# != 2 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -dir=$1 -dic=$2 -tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) -#trap 'rm -rf ${tmpdir}' EXIT - -# 1. 
Create scp files for inputs -# These are not necessary for decoding mode, and make it as an option -input= -if [ -n "${feat}" ]; then - _feat_scps=$(echo "${feat}" | tr ',' ' ' ) - read -r -a feat_scps <<< $_feat_scps - num_feats=${#feat_scps[@]} - - for (( i=1; i<=num_feats; i++ )); do - feat=${feat_scps[$((i-1))]} - mkdir -p ${tmpdir}/input_${i} - input+="input_${i} " - cat ${feat} > ${tmpdir}/input_${i}/feat.scp - - # Dump in the "legacy" style JSON format - if [ -n "${filetype}" ]; then - awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ - > ${tmpdir}/input_${i}/filetype.scp - fi - - if [ ${feat_type} == "kaldi" ]; then - tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ - --filetype "${filetype}" \ - --preprocess-conf "${preprocess_conf}" \ - --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp - elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then - if [ -f $dir/segments ]; then - # used for segmented wav.scp - awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur - fi - if [ ! -f $dir/utt2dur ]; then - tools/wav_to_duration.sh --nj ${nj} \ - ${feat} ${tmpdir}/input_${i}/shape.scp - # use the existed utt2dur as shape.scp directly - else - cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp - fi - fi - done -fi - -# 2. Create scp files for outputs -mkdir -p ${tmpdir}/output -if [ -n "${bpecode}" ]; then - if [ "${trans_type}" == "cn_char_en_bpe" ]; then - tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp - else - paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \ - | tools/spm_encode --model=${bpecode} --output_format=piece) \ - > ${tmpdir}/output/token.scp - fi -elif [ -n "${nlsyms}" ]; then - tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -elif [ -n "${raw}" ]; then - cat $dir/text > ${tmpdir}/output/token.scp -else - tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -fi -< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp -odim=$(cat ${dic} | wc -l) -< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp - -cat ${dir}/text > ${tmpdir}/output/text.scp - -# 3. Create scp files for the others -mkdir -p ${tmpdir}/other -if [ -n "${lang}" ]; then - awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp -fi - -if [ -n "${category}" ]; then - awk -v category=${category} '{print $1 " " category}' ${dir}/text \ - > ${tmpdir}/other/category.scp -fi -#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp - -# 4. 
Merge scp files into a one file -opts="" -for intype in ${input} output other; do - if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then - continue - fi - - if [ ${intype} != other ]; then - opts+="--${intype%_*}-scps " - else - opts+="--scps " - fi - - for x in "${tmpdir}/${intype}"/*.scp; do - k=$(basename ${x} .scp) - if [ ${k} = shape ]; then - opts+="shape:${x}:shape " - else - opts+="${k}:${x} " - fi - done -done - -if ${allow_one_column}; then - opts+="--allow-one-column true " -else - opts+="--allow-one-column false " -fi - -if [ -n "${out}" ]; then - opts+="-O ${out}" -fi - -tools/merge_scp2txt.py --verbose ${verbose} ${opts} - -#rm -fr ${tmpdir} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/add_lex_disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/add_lex_disambig.pl deleted file mode 100644 index dd8a25de6e1140a6d19b1e876f2e76f528532edf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/add_lex_disambig.pl +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2013-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. -# With the --pron-probs option, expects the second field -# of each lexicon line to be a pron-prob. -# With the --sil-probs option, expects three additional -# fields after the pron-prob, representing various components -# of the silence probability model. - -$pron_probs = 0; -$sil_probs = 0; -$first_allowed_disambig = 1; - -for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { - if ($ARGV[0] eq "--pron-probs") { - $pron_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--sil-probs") { - $sil_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--first-allowed-disambig") { - $first_allowed_disambig = 0 + $ARGV[1]; - if ($first_allowed_disambig < 1) { - die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; - } - shift @ARGV; - shift @ARGV; - } -} - -if (@ARGV != 2) { - die "Usage: add_lex_disambig.pl [opts] \n" . - "This script adds disambiguation symbols to a lexicon in order to\n" . - "make decoding graphs determinizable; it adds pseudo-phone\n" . - "disambiguation symbols #1, #2 and so on at the ends of phones\n" . - "to ensure that all pronunciations are different, and that none\n" . - "is a prefix of another.\n" . - "It prints to the standard output the number of the largest-numbered" . - "disambiguation symbol that was used.\n" . - "\n" . - "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . 
- " --sil-probs [should be with --pron-probs option]\n" . - " Expect 3 extra fields after the pron-probs, for aspects of\n" . - " the silence probability model\n" . - " --first-allowed-disambig The number of the first disambiguation symbol\n" . - " that this script is allowed to add. By default this is\n" . - " #1, but you can set this to a larger value using this option.\n" . - "e.g.:\n" . - " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { - $p = shift @A; - if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } - } - if ($sil_probs) { - $silp = shift @A; - if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - } - if (!(@A)) { - die "Bad lexicon line $1, no phone in phone list"; - } - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that it exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { shift @A; } # remove pron-prob. - if ($sil_probs) { - shift @A; # Remove silprob - shift @A; # Remove silprob - } - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -# max_disambig will always be the highest-numbered disambiguation symbol that -# has been used so far. -$max_disambig = $first_allowed_disambig - 1; - -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - if ($pron_probs) { - $pron_prob = shift @A; - } - if ($sil_probs) { - $sil_word_prob = shift @A; - $word_sil_correction = shift @A; - $prev_nonsil_correction = shift @A - } - $phnseq = join(" ", @A); - if (!defined $issubseq{$phnseq} - && $count{$phnseq} == 1) { - ; # Do nothing. - } else { - if ($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved_for_the_empty_string{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; - if (!defined $cur_disambig) { - $cur_disambig = $first_allowed_disambig; - } else { - $cur_disambig++; # Get a number that has not been used yet for - # this phone sequence. - } - while (defined $reserved_for_the_empty_string{$cur_disambig}) { - $cur_disambig++; - } - if ($cur_disambig > $max_disambig) { - $max_disambig = $cur_disambig; - } - $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; - $phnseq = $phnseq . " #" . 
$cur_disambig; - } - } - if ($pron_probs) { - if ($sil_probs) { - print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; - } else { - print O "$word\t$pron_prob\t$phnseq\n"; - } - } else { - print O "$word\t$phnseq\n"; - } -} - -print $max_disambig . "\n"; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/compile_lexicon_token_fst.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/compile_lexicon_token_fst.sh deleted file mode 100644 index b67814fe3f3244b14b8e494bfe46c4829c4f8bd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/compile_lexicon_token_fst.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -# Copyright 2015 Yajie Miao (Carnegie Mellon University) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the -# phoneme and character-based lexicons. -set -eo pipefail -. tools/parse_options.sh - -if [ $# -ne 3 ]; then - echo "usage: tools/fst/compile_lexicon_token_fst.sh " - echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" - echo " should contain the following files:" - echo "lexicon.txt units.txt" - echo "options: " - exit 1; -fi - -srcdir=$1 -tmpdir=$2 -dir=$3 -mkdir -p $dir $tmpdir - -[ -f path.sh ] && . ./path.sh - -export LC_ALL=C - -cp $srcdir/units.txt $dir - -# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. -# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. -perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; - -# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. -# Without these symbols, determinization will fail. -ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` -ndisambig=$[$ndisambig+1]; - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list - -# Get the full list of CTC tokens used in FST. These tokens include , the blank , -# the actual model unit, and the disambiguation symbols. -cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list -(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt - -# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, -# so here use ctc_token_fst_compact -tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; - -# Encode the words with indices. 
Will be used in lexicon and language model FST compiling. -cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' - BEGIN { - print " 0"; - } - { - printf("%s %d\n", $1, NR); - } - END { - printf("#0 %d\n", NR+1); - printf(" %d\n", NR+2); - printf(" %d\n", NR+3); - }' > $dir/words.txt || exit 1; - -# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time. -token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` - -tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ - fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; - -echo "Lexicon and token FSTs compiling succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/ctc_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/ctc_token_fst.py deleted file mode 100644 index d81644b9cd216177a10a17772781d3293abe084f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/ctc_token_fst.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 1 ') -print('1 1 ') -print('2 2 ') -print('2 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 3 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(1, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 2, '', '')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/ctc_token_fst_compact.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/ctc_token_fst_compact.py deleted file mode 100644 index d3018d8b14ce25108cb1acc637cecded5d41be13..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/ctc_token_fst_compact.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 1 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 0, '', '')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/ctc_token_fst_corrected.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/ctc_token_fst_corrected.py deleted file mode 100644 index 81f7079eccb9e6447c46cdfdf6378aca7efe4a09..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/ctc_token_fst_corrected.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python - -import sys - - -def il(n): - return n + 1 - - -def ol(n): - return n + 1 - 
- -def s(n): - return n - - -if __name__ == "__main__": - with open(sys.argv[1]) as f: - lines = f.readlines() - phone_count = 0 - disambig_count = 0 - for line in lines: - sp = line.split() - phone = sp[0] - if phone == '' or phone == '': - continue - if phone.startswith('#'): - disambig_count += 1 - else: - phone_count += 1 - - # 1. add start state - print('0 0 {} 0'.format(il(0))) - - # 2. 0 -> i, i -> i, i -> 0 - for i in range(1, phone_count + 1): - print('0 {} {} {}'.format(s(i), il(i), ol(i))) - print('{} {} {} 0'.format(s(i), s(i), il(i))) - print('{} 0 {} 0'.format(s(i), il(0))) - - # 3. i -> other phone - for i in range(1, phone_count + 1): - for j in range(1, phone_count + 1): - if i != j: - print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) - - # 4. add disambiguous arcs on every final state - for i in range(0, phone_count + 1): - for j in range(phone_count + 2, phone_count + disambig_count + 2): - print('{} {} {} {}'.format(s(i), s(i), 0, j)) - - # 5. every i is final state - for i in range(0, phone_count + 1): - print(s(i)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/eps2disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/eps2disambig.pl deleted file mode 100644 index e1d84a6bf56703596a0e4552d184f7168f724bcb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/eps2disambig.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - if (/\s+#0\s+/) { - print STDERR "$0: ERROR: LM has word #0, " . - "which is reserved as disambiguation symbol\n"; - exit 1; - } - s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; - print; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/make_lexicon_fst.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/make_lexicon_fst.pl deleted file mode 100644 index f97129c05cb3ba6460be401e92001261acfaf746..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/make_lexicon_fst.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). - -$pron_probs = 0; - -if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { - $pron_probs = 1; - shift @ARGV; -} - -if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; - print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; - print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; - print STDERR " word phone1 phone2 ... phoneN;\n"; - print STDERR "if the --pron-probs option is used, each line is:\n"; - print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; - print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; - print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; - print STDERR "this is your responsibility.\n\n"; - print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; - print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; - print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; - exit(1); -} - -$lexfn = shift @ARGV; -if (@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2) { - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if ($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - -if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. - while () { - @A = split(" ", $_); - @A == 0 && die "Empty lexicon line."; - foreach $a (@A) { - if ($a eq "") { - die "Bad lexicon line $_ ( is forbidden)"; - } - } - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; # so we only print it on the first arc of the word. - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. 
- if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while () { - @A = split(" ", $_); - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. - $s = $ns; - } elsif (!defined($silphone) || $p ne $silphone) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/make_tlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/make_tlg.sh deleted file mode 100644 index 98694e5540968760f0c27eaf30a6668f4c46c50d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/make_tlg.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# - -if [ -f path.sh ]; then . path.sh; fi - -lm_dir=$1 -src_lang=$2 -tgt_lang=$3 - -arpa_lm=${lm_dir}/lm.arpa -[ ! 
-f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -rm -rf $tgt_lang -cp -r $src_lang $tgt_lang - -# Compose the language model to FST -cat $arpa_lm | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v -i '' | \ - grep -v -i '' | \ - arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \ - tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \ - --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst - - -echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic $tgt_lang/G.fst - -# Compose the token, lexicon and language-model FST into the final decoding graph -fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1; -fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1; - -echo "Composing decoding graph TLG.fst succeeded" -#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/prepare_dict.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/prepare_dict.py deleted file mode 100644 index 8a6a3cfe7cfded0c863637deef0bae2f2ede5557..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/prepare_dict.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -# sys.argv[1]: e2e model unit file(lang_char.txt) -# sys.argv[2]: raw lexicon file -# sys.argv[3]: output lexicon file -# sys.argv[4]: bpemodel - -unit_table = set() -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for line in fin: - unit = line.split()[0] - unit_table.add(unit) - - -def contain_oov(units): - for unit in units: - if unit not in unit_table: - return True - return False - - -bpemode = len(sys.argv) > 4 -if bpemode: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.Load(sys.argv[4]) -lexicon_table = set() -with open(sys.argv[2], 'r', encoding='utf8') as fin, \ - open(sys.argv[3], 'w', encoding='utf8') as fout: - for line in fin: - word = line.split()[0] - if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel - continue - elif word == '': - continue - else: - # each word only has one pronunciation for e2e system - if word in lexicon_table: - continue - if bpemode: - # We assume that the lexicon does not contain code-switch, - # i.e. the word contains both English and Chinese. - # see PR https://github.com/wenet-e2e/wenet/pull/1693 - # and Issue https://github.com/wenet-e2e/wenet/issues/1653 - if word.encode('utf8').isalpha(): - pieces = sp.EncodeAsPieces(word) - else: - pieces = word - if contain_oov(pieces): - print( - 'Ignoring words {}, which contains oov unit'.format( - ''.join(word).strip('▁')) - ) - continue - chars = ' '.join( - [p if p in unit_table else '' for p in pieces]) - else: - # ignore words with OOV - if contain_oov(word): - print('Ignoring words {}, which contains oov unit'.format(word)) - continue - # Optional, append ▁ in front of english word - # we assume the model unit of our e2e system is char now. 
- if word.encode('utf8').isalpha() and '▁' in unit_table: - word = '▁' + word - chars = ' '.join(word) # word is a char list - fout.write('{} {}\n'.format(word, chars)) - lexicon_table.add(word) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/remove_oovs.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/remove_oovs.pl deleted file mode 100644 index ac914c3bd9363eded791cdeb309fd05e980c4f2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/remove_oovs.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script removes lines that contain these OOVs on either the -# third or fourth fields of the line. It is intended to remove arcs -# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). - -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; -} - -$unklist = shift @ARGV; -open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; -while(){ - @A = split(" ", $_); - @A == 1 || die "Bad line in unknown-symbol list: $_"; - $unk{$A[0]} = 1; -} - -$num_removed = 0; -while(<>){ - @A = split(" ", $_); - if(defined $unk{$A[2]} || defined $unk{$A[3]}) { - $num_removed++; - } else { - print; - } -} -print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/rnnt_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/rnnt_token_fst.py deleted file mode 100644 index cc6def1703311ab700a4a01f22c1adda32db9b0d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/rnnt_token_fst.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, 0, phone, phone)) -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/s2eps.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/s2eps.pl deleted file mode 100644 index ffeeb8eb6af3c4f319f31ebff80be388d8f59e1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/fst/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces <s> and </s> with <eps> (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "<s>" || $A[2] eq "</s>") { $A[2] = "<eps>"; } - if ($A[3] eq "<s>" || $A[3] eq "</s>") { $A[3] = "<eps>"; } - } - print join("\t", @A) . "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/git-pre-commit b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/git-pre-commit deleted file mode 100644 index b6e448ed375a0ddf502ce332685de8a99e88dc08..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/git-pre-commit +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -e - -echo "Running pre-commit flake8" -python tools/flake8_hook.py diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/install_srilm.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/install_srilm.sh deleted file mode 100644 index 4aa113c14722a73fd3d3f84430025d44173c207b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/install_srilm.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2022 Binbin Zhang(binbzha@qq.com) - -current_path=`pwd` -current_dir=`basename "$current_path"` - -if [ "tools" != "$current_dir" ]; then - echo "You should run this script in tools/ directory!!" - exit 1 -fi - -! command -v gawk > /dev/null && \ - echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; - -srilm_url="https://github.com/BitSpeech/SRILM/archive/refs/tags/1.7.3.tar.gz" - -if [ ! -f ./srilm.tar.gz ]; then - if ! wget -O ./srilm.tar.gz "$srilm_url"; then - echo 'There was a problem downloading the file.' - echo 'Check you internet connection and try again.' - exit 1 - fi -fi - -tar -zxvf srilm.tar.gz -mv SRILM-1.7.3 srilm - -# set the SRILM variable in the top-level Makefile to this directory. -cd srilm -cp Makefile tmpf - -cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \ - > Makefile || exit 1 -rm tmpf - -make || exit -cd .. - -( - [ ! -z "${SRILM}" ] && \ - echo >&2 "SRILM variable is aleady defined. Undefining..." && \ - unset SRILM - - [ -f ./env.sh ] && . ./env.sh - - [ !
-z "${SRILM}" ] && \ - echo >&2 "SRILM config is already in env.sh" && exit - - wd=`pwd` - wd=`readlink -f $wd || pwd` - - echo "export SRILM=$wd/srilm" - dirs="\${PATH}" - for directory in $(cd srilm && find bin -type d ) ; do - dirs="$dirs:\${SRILM}/$directory" - done - echo "export PATH=$dirs" -) >> env.sh - -echo >&2 "Installation of SRILM finished successfully" -echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/k2/make_hlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/k2/make_hlg.sh deleted file mode 100644 index 18c2268487410824ae11b199cf06f37acd717c88..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/k2/make_hlg.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) - -lexion_dir=$1 -lm_dir=$2 -tgt_dir=$3 - -# k2 and icefall updates very fast. Below commits are veryfied in this script. -# k2 3dc222f981b9fdbc8061b3782c3b385514a2d444, icefall 499ac24ecba64f687ff244c7d66baa5c222ecf0f - -# For k2 installation, please refer to https://github.com/k2-fsa/k2/ -python -c "import k2; print(k2.__file__)" -python -c "import torch; import _k2; print(_k2.__file__)" - -# Prepare necessary icefall scripts -if [ ! -d tools/k2/icefall ]; then - git clone --depth 1 https://github.com/k2-fsa/icefall.git tools/k2/icefall -fi -pip3 install -r tools/k2/icefall/requirements.txt -export PYTHONPATH=`pwd`/tools/k2/icefall:`pwd`/tools/k2/icefall/egs/aishell/ASR/local:$PYTHONPATH - -# 8.1 Prepare char based lang -mkdir -p $tgt_dir -python tools/k2/prepare_char.py $lexion_dir/units.txt $lm_dir/wordlist $tgt_dir -echo "Compile lexicon L.pt L_disambig.pt succeeded" - -# 8.2 Prepare G -mkdir -p data/lm -python -m kaldilm \ - --read-symbol-table="$tgt_dir/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $lm_dir/lm.arpa > data/lm/G_3_gram.fst.txt - -# 8.3 Compile HLG -python tools/k2/icefall/egs/aishell/ASR/local/compile_hlg.py --lang-dir $tgt_dir -echo "Compile decoding graph HLG.pt succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/k2/prepare_char.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/k2/prepare_char.py deleted file mode 100644 index 6e05042c42eb280135f6be7cdb3566b185258b90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/k2/prepare_char.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -""" - -This script generates the following files in the directory sys.argv[3]: - - - lexicon.txt - - lexicon_disambig.txt - - L.pt - - L_disambig.pt - - tokens.txt - - words.txt -""" - -import sys -from pathlib import Path -from typing import Dict, List - -import k2 -import torch -from prepare_lang import ( - Lexicon, - add_disambig_symbols, - add_self_loops, - write_lexicon, - write_mapping, -) - - -def lexicon_to_fst_no_sil( - lexicon: Lexicon, - token2id: Dict[str, int], - word2id: Dict[str, int], - need_self_loops: bool = False, -) -> k2.Fsa: - """Convert a lexicon to an FST (in k2 format). - - Args: - lexicon: - The input lexicon. See also :func:`read_lexicon` - token2id: - A dict mapping tokens to IDs. - word2id: - A dict mapping words to IDs. - need_self_loops: - If True, add self-loop to states with non-epsilon output symbols - on at least one arc out of the state. The input label for this - self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. - Returns: - Return an instance of `k2.Fsa` representing the given lexicon. - """ - loop_state = 0 # words enter and leave from here - next_state = 1 # the next un-allocated state, will be incremented as we go - - arcs = [] - - # The blank symbol is defined in local/train_bpe_model.py - assert token2id[""] == 0 - assert word2id[""] == 0 - - eps = 0 - - for word, pieces in lexicon: - assert len(pieces) > 0, f"{word} has no pronunciations" - cur_state = loop_state - - word = word2id[word] - pieces = [ - token2id[i] if i in token2id else token2id[""] for i in pieces - ] - - for i in range(len(pieces) - 1): - w = word if i == 0 else eps - arcs.append([cur_state, next_state, pieces[i], w, 0]) - - cur_state = next_state - next_state += 1 - - # now for the last piece of this word - i = len(pieces) - 1 - w = word if i == 0 else eps - arcs.append([cur_state, loop_state, pieces[i], w, 0]) - - if need_self_loops: - disambig_token = token2id["#0"] - disambig_word = word2id["#0"] - arcs = add_self_loops( - arcs, - disambig_token=disambig_token, - disambig_word=disambig_word, - ) - - final_state = next_state - arcs.append([loop_state, final_state, -1, -1, 0]) - arcs.append([final_state]) - - arcs = sorted(arcs, key=lambda arc: arc[0]) - arcs = [[str(i) for i in arc] for arc in arcs] - arcs = [" ".join(arc) for arc in arcs] - arcs = "\n".join(arcs) - - fsa = k2.Fsa.from_str(arcs, acceptor=False) - return fsa - - -def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool: - """Check if all the given tokens are in token symbol table. - - Args: - token_sym_table: - Token symbol table that contains all the valid tokens. - tokens: - A list of tokens. - Returns: - Return True if there is any token not in the token_sym_table, - otherwise False. - """ - for tok in tokens: - if tok not in token_sym_table: - return True - return False - - -def generate_lexicon( - token_sym_table: Dict[str, int], words: List[str] -) -> Lexicon: - """Generate a lexicon from a word list and token_sym_table. - - Args: - token_sym_table: - Token symbol table that mapping token to token ids. - words: - A list of strings representing words. - Returns: - Return a dict whose keys are words and values are the corresponding - tokens. 
- """ - lexicon = [] - for word in words: - chars = list(word.strip(" \t")) - if contain_oov(token_sym_table, chars): - continue - lexicon.append((word, chars)) - - # The OOV word is - lexicon.append(("", [""])) - return lexicon - - -def generate_tokens(text_file: str) -> Dict[str, int]: - """Generate tokens from the given text file. - - Args: - text_file: - A file that contains text lines to generate tokens. - Returns: - Return a dict whose keys are tokens and values are token ids ranged - from 0 to len(keys) - 1. - """ - token2id: Dict[str, int] = dict() - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - char, index = line.replace('\n', '').split() - assert char not in token2id - token2id[char] = int(index) - assert token2id[''] == 0 - return token2id - - -def generate_words(text_file: str) -> Dict[str, int]: - """Generate words from the given text file. - - Args: - text_file: - A file that contains text lines to generate words. - Returns: - Return a dict whose keys are words and values are words ids ranged - from 0 to len(keys) - 1. - """ - words = [] - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - word = line.replace('\n', '') - assert word not in words - words.append(word) - words.sort() - - # We put '' '' at begining of word2id - # '#0', '', '' at end of word2id - words = [word for word in words - if word not in ['', '', '#0', '', '']] - words.insert(0, '') - words.insert(1, '') - words.append('#0') - words.append('') - words.append('') - word2id = {j: i for i, j in enumerate(words)} - return word2id - - -def main(): - token2id = generate_tokens(sys.argv[1]) - word2id = generate_words(sys.argv[2]) - tgt_dir = Path(sys.argv[3]) - - words = [word for word in word2id.keys() - if word not in - ["", "!SIL", "", "", "#0", "", ""]] - lexicon = generate_lexicon(token2id, words) - - lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) - next_token_id = max(token2id.values()) + 1 - for i in range(max_disambig + 1): - disambig = f"#{i}" - assert disambig not in token2id - token2id[disambig] = next_token_id - next_token_id += 1 - - write_mapping(tgt_dir / "tokens.txt", token2id) - write_mapping(tgt_dir / "words.txt", word2id) - write_lexicon(tgt_dir / "lexicon.txt", lexicon) - write_lexicon(tgt_dir / "lexicon_disambig.txt", lexicon_disambig) - - L = lexicon_to_fst_no_sil( - lexicon, - token2id=token2id, - word2id=word2id, - ) - L_disambig = lexicon_to_fst_no_sil( - lexicon_disambig, - token2id=token2id, - word2id=word2id, - need_self_loops=True, - ) - torch.save(L.as_dict(), tgt_dir / "L.pt") - torch.save(L_disambig.as_dict(), tgt_dir / "L_disambig.pt") - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/latency_metrics.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/latency_metrics.py deleted file mode 100644 index df2d8eee45f8e2d7c8536f208d44fafaeac3341f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/latency_metrics.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2022 Horizon Inc. (author: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import argparse -import logging -import librosa -import torch -import torchaudio -import yaml - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.font_manager as fm -import torchaudio.compliance.kaldi as kaldi - -from wenet.utils.init_model import init_model -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.mask import make_pad_mask -from wenet.utils.common import replace_duplicates_with_blank - - -def get_args(): - parser = argparse.ArgumentParser( - description='Analyze latency and plot CTC-Spike.') - parser.add_argument('--config', required=True, - type=str, help='configration') - parser.add_argument('--gpu', - type=int, - default=0, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--ckpt', required=True, - type=str, help='model checkpoint') - parser.add_argument('--tag', required=True, - type=str, help='image subtitle') - parser.add_argument('--wavscp', required=True, - type=str, help='wav.scp') - parser.add_argument('--alignment', required=True, - type=str, help='force alignment, generated by Kaldi.') - parser.add_argument('--chunk_size', required=True, - type=int, help='chunk size') - parser.add_argument('--left_chunks', default=-1, - type=int, help='left chunks') - parser.add_argument('--font', required=True, - type=str, help='font file') - parser.add_argument('--dict', required=True, - type=str, help='dict file') - parser.add_argument('--result_dir', required=True, - type=str, help='saving pdf') - parser.add_argument('--model_type', default='ctc', - choices=['ctc', 'transducer'], - help='show latency metrics from ctc models or rnn-t models') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - torch.manual_seed(777) - - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - symbol_table = read_symbol_table(args.dict) - char_dict = {v: k for k, v in symbol_table.items()} - - # 1. Load model - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - - model = init_model(conf) - load_checkpoint(model, args.ckpt) - model = model.eval().to(device) - - subsampling = model.encoder.embed.subsampling_rate - eos = model.eos_symbol() - - with open(args.wavscp, 'r') as fin: - wavs = fin.readlines() - - # 2. 
Forward model (get streaming_timestamps) - timestamps = {} - for idx, wav in enumerate(wavs): - if idx % 100 == 0: - logging.info("processed {}.".format(idx)) - key, wav = wav.strip().split(' ', 1) - waveform, sr = torchaudio.load(wav) - resample_rate = conf['dataset_conf']['resample_conf']['resample_rate'] - waveform = torchaudio.transforms.Resample( - orig_freq=sr, new_freq=resample_rate)(waveform) - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank( - waveform, - num_mel_bins=conf['dataset_conf']['fbank_conf']['num_mel_bins'], - frame_length=conf['dataset_conf']['fbank_conf']['frame_length'], - frame_shift=conf['dataset_conf']['fbank_conf']['frame_shift'], - dither=0.0, energy_floor=0.0, - sample_frequency=resample_rate, - ) - - speech = mat.unsqueeze(0).to(device) - speech_lengths = torch.tensor([mat.size(0)]).to(device) - - # Let's assume batch_size = 1 - encoder_out, encoder_mask = model.encoder( - speech, speech_lengths, args.chunk_size, args.left_chunks) - - maxlen = encoder_out.size(1) # (B, maxlen, encoder_dim) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # CTC greedy search - if args.model_type == 'ctc': - ctc_probs = model.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - topk_prob = topk_prob.view(1, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen) - topk_prob = topk_prob.masked_fill_(mask, 0.0) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - hyps = [replace_duplicates_with_blank(hyp) for hyp in hyps] - scores = [prob.tolist() for prob in topk_prob] - timestamps[key] = [hyps[0], scores[0], wav] - - if args.model_type == 'transducer': - hyps = [] - scores = [] - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = 1 - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, - padding, cache) - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - scores.append(torch.max(joint_out_probs).item()) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or \ - per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - hyps.append(model.blank) - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - timestamps[key] = [hyps, scores, wav] - - # 3. 
Analyze latency - with open(args.alignment, 'r') as fin: - aligns = fin.readlines() - not_found, len_unequal, ignored = 0, 0, 0 - datas = [] - for align in aligns: - key, align = align.strip().split(' ', 1) - if key not in timestamps: - not_found += 1 - continue - fa, st = [], [] # force_alignment, streaming_timestamps - text_fa, text_st = "", "" - for i, token in enumerate(align.split()): - if token != '': - text_fa += token - # NOTE(xcsong): W/O subsample - fa.append(i * 10) - # ignore alignment_errors >= 70ms - frames_fa = len(align.split()) - frames_st = len(timestamps[key][0]) * subsampling - if abs(frames_st - frames_fa) >= 7: - ignored += 1 - continue - for i, token_id in enumerate(timestamps[key][0]): - if token_id != 0: - text_st += char_dict[token_id] - # NOTE(xcsong): W subsample - st.append(i * subsampling * 10) - if len(fa) != len(st): - len_unequal += 1 - continue - # datas[i] = [key, text_fa, text_st, list_of_diff, - # FirstTokenDelay, LastTokenDelay, AvgTokenDelay, - # streaming_timestamps, force_alignment] - datas.append([key, text_fa, text_st, - [a - b for a, b in zip(st, fa)], - st[0] - fa[0], st[-1] - fa[-1], - (sum(st) - sum(fa)) / len(st), - timestamps[key], align.split()]) - - logging.info("not found: {}, length unequal: {}, ignored: {}, \ - valid samples: {}".format(not_found, len_unequal, ignored, len(datas))) - - # 4. Plot and print - num_datas = len(datas) - names = ['FirstTokenDelay', 'LastTokenDelay', 'AvgTokenDelay'] - names_index = [4, 5, 6] - parts = ['max', 'P90', 'P75', 'P50', 'P25', 'min'] - parts_index = [num_datas - 1, int(num_datas * 0.90), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - for name, name_idx in zip(names, names_index): - def f(name_idx=name_idx): - return name_idx - datas.sort(key=lambda x: x[f()]) - logging.info("==========================") - for p, i in zip(parts, parts_index): - data = datas[i] - # i.e., LastTokenDelay P90: 270.000 ms (wav_id: BAC009S0902W0144) - logging.info("{} {}: {:.3f} ms (wav_id: {})".format( - name, p, data[f()], datas[i][0])) - - font = fm.FontProperties(fname=args.font) - plt.rcParams['axes.unicode_minus'] = False - # we will have 2 sub-plots (force-align + streaming timestamps) - # plus one wav-plot - fig, axes = plt.subplots(figsize=(60, 60), nrows=3, ncols=1) - for j in range(2): - if j == 0: - # subplot-0: streaming_timestamps - plt_prefix = args.tag + "_" + name + "_" + p - x = np.arange(len(data[7][0])) * subsampling - hyps, scores = data[7][0], data[7][1] - else: - # subplot-1: force_alignments - plt_prefix = "force_alignment" - x = np.arange(len(data[8])) - hyps = [symbol_table[d] for d in data[8]] - scores = [0.0] * len(data[8]) - axes[j].set_title(plt_prefix, fontsize=30) - for frame, token, prob in zip(x, hyps, scores): - if char_dict[token] != '': - axes[j].bar( - frame, np.exp(prob), - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].text( - frame, np.exp(prob), - '{} {:.3f} {}'.format( - char_dict[token], np.exp(prob), frame), - fontdict=dict(fontsize=24), - fontproperties=font, - ) - else: - axes[j].bar( - frame, 0.01, - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].tick_params(labelsize=25) - - # subplot-2: wav - # wav, hardcode sample_rate to 16000 - samples, sr = librosa.load(data[7][2], sr=16000) - time = np.arange(0, len(samples)) * (1.0 / sr) - axes[-1].plot(time, samples) - - # i.e., RESULT_DIR/LTD_P90_120ms_BAC009S0768W0342.pdf - plt.savefig(args.result_dir + "/" + name + "_" + - p + "_" + str(data[f()]) 
+ "ms" + "_" + data[0] + ".pdf") - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/make_raw_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/make_raw_list.py deleted file mode 100644 index 2f84f015542bb38da027b8ea61e8638f873cec33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/make_raw_list.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('output_file', help='output list file') - args = parser.parse_args() - - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - if args.segments is not None: - segments_table = {} - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - with open(args.text_file, 'r', encoding='utf8') as fin, \ - open(args.output_file, 'w', encoding='utf8') as fout: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if args.segments is None: - assert key in wav_table - wav = wav_table[key] - line = dict(key=key, wav=wav, txt=txt) - else: - assert key in segments_table - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - line = dict(key=key, wav=wav, txt=txt, start=start, end=end) - json_line = json.dumps(line, ensure_ascii=False) - fout.write(json_line + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/make_shard_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/make_shard_list.py deleted file mode 100644 index 1f7d82829808c9cc181bbc5e0f60cccef8795bae..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/make_shard_list.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import io -import logging -import os -import tarfile -import time -import multiprocessing - -import torch -import torchaudio -import torchaudio.backend.sox_io_backend as sox - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def write_tar_file(data_list, - no_segments, - tar_file, - resample=16000, - index=0, - total=1): - logging.info('Processing {} {}/{}'.format(tar_file, index, total)) - read_time = 0.0 - save_time = 0.0 - write_time = 0.0 - with tarfile.open(tar_file, "w") as tar: - prev_wav = None - for item in data_list: - if no_segments: - key, txt, wav = item - else: - key, txt, wav, start, end = item - - suffix = wav.split('.')[-1] - assert suffix in AUDIO_FORMAT_SETS - if no_segments: - ts = time.time() - with open(wav, 'rb') as fin: - data = fin.read() - read_time += (time.time() - ts) - else: - if wav != prev_wav: - ts = time.time() - waveforms, sample_rate = sox.load(wav, normalize=False) - read_time += (time.time() - ts) - prev_wav = wav - start = int(start * sample_rate) - end = int(end * sample_rate) - audio = waveforms[:1, start:end] - - # resample - if sample_rate != resample: - if not audio.is_floating_point(): - # normalize the audio before resample - # because resample can't process int audio - audio = audio / (1 << 15) - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - audio = (audio * (1 << 15)).short() - else: - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - - ts = time.time() - f = io.BytesIO() - sox.save(f, audio, resample, format="wav", bits_per_sample=16) - # Save to wav for segments file - suffix = "wav" - f.seek(0) - data = f.read() - save_time += (time.time() - ts) - - assert isinstance(txt, str) - ts = time.time() - txt_file = key + '.txt' - txt = txt.encode('utf8') - txt_data = io.BytesIO(txt) - txt_info = tarfile.TarInfo(txt_file) - txt_info.size = len(txt) - tar.addfile(txt_info, txt_data) - - wav_file = key + '.' 
+ suffix - wav_data = io.BytesIO(data) - wav_info = tarfile.TarInfo(wav_file) - wav_info.size = len(data) - tar.addfile(wav_info, wav_data) - write_time += (time.time() - ts) - logging.info('read {} save {} write {}'.format(read_time, save_time, - write_time)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--num_utts_per_shard', - type=int, - default=1000, - help='num utts per shard') - parser.add_argument('--num_threads', - type=int, - default=1, - help='num threads for make shards') - parser.add_argument('--prefix', - default='shards', - help='prefix of shards tar file') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('--resample', - type=int, - default=16000, - help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('shards_dir', help='output shards dir') - parser.add_argument('shards_list', help='output shards list file') - args = parser.parse_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - torch.set_num_threads(1) - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - no_segments = True - segments_table = {} - if args.segments is not None: - no_segments = False - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - data = [] - with open(args.text_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if no_segments: - assert key in wav_table - wav = wav_table[key] - data.append((key, txt, wav)) - else: - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - data.append((key, txt, wav, start, end)) - - num = args.num_utts_per_shard - chunks = [data[i:i + num] for i in range(0, len(data), num)] - os.makedirs(args.shards_dir, exist_ok=True) - - # Using thread pool to speedup - pool = multiprocessing.Pool(processes=args.num_threads) - shards_list = [] - tasks_list = [] - num_chunks = len(chunks) - for i, chunk in enumerate(chunks): - tar_file = os.path.join(args.shards_dir, - '{}_{:09d}.tar'.format(args.prefix, i)) - shards_list.append(tar_file) - pool.apply_async( - write_tar_file, - (chunk, no_segments, tar_file, args.resample, i, num_chunks)) - - pool.close() - pool.join() - - with open(args.shards_list, 'w', encoding='utf8') as fout: - for name in shards_list: - fout.write(name + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/merge_scp2txt.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/merge_scp2txt.py deleted file mode 100644 index 51f1c42f272f0fd9fec0a7d69ee860d2f1eb6158..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/merge_scp2txt.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -from distutils.util import strtobool -from io import open -import logging -import sys - -PY2 = sys.version_info[0] == 2 -sys.stdin = codecs.getreader('utf-8')(sys.stdin if PY2 else 
sys.stdin.buffer) -sys.stdout = codecs.getwriter('utf-8')( - sys.stdout if PY2 else sys.stdout.buffer) - - -# Special types: -def shape(x): - """Change str to List[int] - - >>> shape('3,5') - [3, 5] - >>> shape(' [3, 5] ') - [3, 5] - - """ - - # x: ' [3, 5] ' -> '3, 5' - x = x.strip() - if x[0] == '[': - x = x[1:] - if x[-1] == ']': - x = x[:-1] - - return list(map(int, x.split(','))) - - -def get_parser(): - parser = argparse.ArgumentParser( - description='Given each file paths with such format as ' - '::. type> can be omitted and the default ' - 'is "str". e.g. {} ' - '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' - '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' - '--output-scps text:data/text shape:data/utt2text_shape:shape ' - '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--input-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the inputs') - parser.add_argument('--output-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the outputs') - parser.add_argument('--scps', - type=str, - nargs='+', - default=[], - help='The files except for the input and outputs') - parser.add_argument('--verbose', - '-V', - default=1, - type=int, - help='Verbose option') - parser.add_argument('--allow-one-column', - type=strtobool, - default=False, - help='Allow one column in input scp files. ' - 'In this case, the value will be empty string.') - parser.add_argument('--out', - '-O', - type=str, - help='The output filename. ' - 'If omitted, then output to sys.stdout') - return parser - - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - args.scps = [args.scps] - - # logging info - logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - if args.verbose > 0: - logging.basicConfig(level=logging.INFO, format=logfmt) - else: - logging.basicConfig(level=logging.WARN, format=logfmt) - - inputs = {} - assert (len(args.input_scps) == 1) - for f in args.input_scps[0]: - arr = f.strip().split(':') - inputs[arr[0]] = arr[1] - assert ('feat' in inputs) - assert ('shape' in inputs) - - outputs = {} - assert (len(args.output_scps) == 1) - for f in args.output_scps[0]: - arr = f.strip().split(':') - outputs[arr[0]] = arr[1] - assert ('shape' in outputs) - assert ('text' in outputs) - assert ('token' in outputs) - assert ('tokenid' in outputs) - - files = [ - inputs['feat'], inputs['shape'], outputs['text'], outputs['token'], - outputs['tokenid'], outputs['shape'] - ] - fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape'] - fids = [open(f, 'r', encoding='utf-8') for f in files] - - if args.out is None: - out = sys.stdout - else: - out = open(args.out, 'w', encoding='utf-8') - done = False - while not done: - for i, fid in enumerate(fids): - line = fid.readline() - if line == '': - done = True - break - arr = line.strip().split() - content = ' '.join(arr[1:]) - if i == 0: - out.write('utt:{}'.format(arr[0])) - out.write('\t') - out.write('{}:{}'.format(fields[i], content)) - out.write('\n') - - for f in fids: - f.close() - if args.out is not None: - out.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/onnx2horizonbin.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/onnx2horizonbin.py deleted file mode 100644 index 
a94b647fb19d1446d4bc506c399c85677dddde9f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/onnx2horizonbin.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. specific decoding method: ctc_greedy_search -""" - -import argparse -import copy -import logging -import os -import sys -import random -import torch -import yaml -import numpy as np - -from torch.utils.data import DataLoader - -from wenet.utils.common import remove_duplicates_and_blank -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import to_numpy -from wenet.bin.export_onnx_bpu import export_encoder, export_ctc - - -try: - import hbdk # noqa: F401 - import horizon_nn # noqa: F401 - from horizon_tc_ui import HB_ONNXRuntime -except ImportError: - print('Please install hbdk,horizon_nn,horizon_tc_ui !') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def save_data(tensor, dirs, prefix): - if tensor.requires_grad: - data = tensor.detach().numpy().astype(np.float32) - else: - data = tensor.numpy().astype(np.float32) - os.makedirs(dirs, exist_ok=True) - data.tofile(dirs + "/" + prefix + ".bin") - - -def make_calibration_data(enc, args, conf): - conf['shuffle'] = True - logger.info(conf) - dataset = Dataset( - "shard", args.cali_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - cal_data_dir = os.path.join(args.output_dir, 
'cal_data_dir') - for batch_idx, batch in enumerate(dataloader): - if batch_idx >= args.max_samples: - break - if batch_idx % 100 == 0: - logger.info("processed {} samples.".format(batch_idx)) - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - - # Feed forward overlap input step by step - random_high = (num_frames - context) // stride - num_rand = random.randint(0, random_high) - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - if i == num_rand: - save_data(chunk, "{}/chunk".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_cache, "{}/att_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(cnn_cache, "{}/cnn_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_mask, "{}/att_mask".format(cal_data_dir), - prefix + "." + str(i)) - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - # NOTE(xcsong): It's fast to calibrate ctc.onnx, - # so it's okay to save all chunks - save_data(y, "{}/hidden".format(cal_data_dir), - prefix + "." 
+ str(i)) - - -def check_wer(enc, ctc, args, conf): - conf['shuffle'] = False - dataset = Dataset( - "shard", args.wer_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - char_dict = {v: k for k, v in args.symbol_table.items()} - eos = len(char_dict) - 1 - - enc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_encoder/encoder_quantized_model.onnx") - ctc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_ctc/ctc_quantized_model.onnx") - torch_file = open(args.output_dir + "/torch_text", 'w', encoding="utf-8") - onnx_file = open(args.output_dir + "/onnx_text", 'w', encoding="utf-8") - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - for batch_idx, batch in enumerate(dataloader): - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - - # Feed forward overlap input step by step - torch_out, onnx_out = [], [] - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - # Torch model - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - torch_out.append(ctc.forward(y).transpose(1, 3).squeeze(2)) - # Quantized onnx model - ort_inputs = { - 'chunk': to_numpy(chunk), 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': to_numpy(att_mask)} - ort_outs = enc_session.run_feature( - enc_session.output_names, ort_inputs, input_offset=0) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_y = ctc_session.run_feature( - ctc_session.output_names, {'hidden': ort_outs[0]}, input_offset=0) - onnx_out.append(torch.from_numpy( - np.squeeze(onnx_y[0].transpose(0, 3, 2, 1), axis=2))) - - def post_process(list_out, file_obj, keys): - probs = torch.cat(list_out, dim=1) - maxlen = probs.size(1) - topk_prob, topk_index = probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - hyps = 
[hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - for i, key in enumerate(keys): - content = '' - for w in hyps[i]: - if w == eos: - break - content += char_dict[w] - file_obj.write('{} {}\n'.format(key, content)) - return key, content - - if len(torch_out) > 0 and len(onnx_out) > 0: - key, content = post_process(torch_out, torch_file, keys) - logger.info('torch: {} {}'.format(key, content)) - key, content = post_process(onnx_out, onnx_file, keys) - logger.info('onnx : {} {}'.format(key, content)) - torch_file.close() - onnx_file.close() - - -def generate_config(enc_session, ctc_session, args): - template = """ -# 模型参数组 -model_parameters: - # 原始Onnx浮点模型文件 - onnx_model: '{}' - # 转换的目标AI芯片架构 - march: 'bernoulli2' - # 模型转换输出的用于上板执行的模型文件的名称前缀 - output_model_file_prefix: '{}' - # 模型转换输出的结果的存放目录 - working_dir: '{}' - # 指定转换后混合异构模型是否保留输出各层的中间结果的能力 - layer_out_dump: False - # 转换过程中日志生成级别 - log_level: 'debug' -# 输入信息参数组 -input_parameters: - # 原始浮点模型的输入节点名称 - input_name: '{}' - # 原始浮点模型的输入数据格式(数量/顺序与input_name一致) - input_type_train: '{}' - # 原始浮点模型的输入数据排布(数量/顺序与input_name一致) - input_layout_train: '{}' - # 原始浮点模型的输入数据尺寸 - input_shape: '{}' - # 网络实际执行时,输入给网络的batch_size 默认值为1 - # input_batch: 1 - # 在模型中添加的输入数据预处理方法 - norm_type: '{}' - # 预处理方法的图像减去的均值; 如果是通道均值,value之间必须用空格分隔 - # mean_value: '' - # 预处理方法的图像缩放比例,如果是通道缩放比例,value之间必须用空格分隔 - # scale_value: '' - # 转换后混合异构模型需要适配的输入数据格式(数量/顺序与input_name一致) - input_type_rt: '{}' - # 输入数据格式的特殊制式 - input_space_and_range: '' - # 转换后混合异构模型需要适配的输入数据排布(数量/顺序与input_name一致) - input_layout_rt: '{}' -# 校准参数组 -calibration_parameters: - # 模型校准使用的标定样本的存放目录 - cal_data_dir: '{}' - # 开启图片校准样本自动处理(skimage read resize到输入节点尺寸) - preprocess_on: False - # 校准使用的算法类型 - calibration_type: '{}' - # max 校准方式的参数 - max_percentile: 1.0 - # 强制指定OP在CPU上运行 - run_on_cpu: '{}' - # 强制指定OP在BPU上运行 - run_on_bpu: '{}' -# 编译参数组 -compiler_parameters: - # 编译策略选择 - compile_mode: 'latency' - # 是否打开编译的debug信息 - debug: False - # 模型运行核心数 - core_num: 1 - # 模型编译的优化等级选择 - optimize_level: 'O3' -""" - output_dir = os.path.realpath(args.output_dir) - cal_data_dir = os.path.join(output_dir, 'cal_data_dir') - os.makedirs(cal_data_dir, exist_ok=True) - enc_dic = enc_session.get_modelmeta().custom_metadata_map - enc_onnx_path = os.path.join(output_dir, 'encoder.onnx') - enc_log_path = os.path.join(output_dir, 'hb_makertbin_output_encoder') - enc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in enc_dic['input_name'].split(';')]) - ctc_dic = ctc_session.get_modelmeta().custom_metadata_map - ctc_onnx_path = os.path.join(output_dir, 'ctc.onnx') - ctc_log_path = os.path.join(output_dir, 'hb_makertbin_output_ctc') - ctc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in ctc_dic['input_name'].split(';')]) - enc_config = template.format( - enc_onnx_path, "encoder", enc_log_path, - enc_dic['input_name'], enc_dic['input_type'], - enc_dic['input_layout_train'], enc_dic['input_shape'], - enc_dic['norm_type'], enc_dic['input_type'], enc_dic['input_layout_rt'], - enc_cal_data, args.calibration_type, args.extra_ops_run_on_cpu, "") - ctc_config = template.format( - ctc_onnx_path, "ctc", ctc_log_path, - ctc_dic['input_name'], ctc_dic['input_type'], - ctc_dic['input_layout_train'], ctc_dic['input_shape'], - ctc_dic['norm_type'], ctc_dic['input_type'], ctc_dic['input_layout_rt'], - ctc_cal_data, "default", "", "") - with open(output_dir + "/config_encoder.yaml", "w") as enc_yaml: - enc_yaml.write(enc_config) - with open(output_dir + 
"/config_ctc.yaml", "w") as ctc_yaml: - ctc_yaml.write(ctc_config) - - -def get_args(): - parser = argparse.ArgumentParser(description='convert onnx to horizon .bin') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - parser.add_argument('--dict', type=str, required=True, help='dict file') - parser.add_argument('--max_samples', type=int, required=True, - help='maximum samples') - parser.add_argument('--cali_datalist', type=str, default=None, - help='make calibration data') - parser.add_argument('--wer_datalist', type=str, default=None, - help='check wer') - parser.add_argument('--wer_text', type=str, default=None, - help='check wer') - parser.add_argument('--bpe_model', default=None, type=str, - help='bpe model for english part') - parser.add_argument('--ln_run_on_bpu', action='store_true', - help='layernorm running on bpu') - parser.add_argument('--extra_ops_run_on_cpu', type=str, default=None, - help='extra operations running on cpu.') - parser.add_argument('--calibration_type', type=str, default='default', - help='kl / max / default.') - return parser - - -if __name__ == '__main__': - random.seed(777) - parser = get_args() - args = parser.parse_args() - # NOTE(xcsong): X3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(conf) - load_checkpoint(model, args.checkpoint) - model.eval() - - symbol_table = read_symbol_table(args.dict) - args.symbol_table = symbol_table - args.feature_size = conf['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - logger.info("Stage-1: Export onnx") - enc, enc_session = export_encoder(model, args) - ctc, ctc_session = export_ctc(model, args) - - conf = copy.deepcopy(conf['dataset_conf']) - conf['filter_conf']['max_length'] = 102400 - conf['filter_conf']['min_length'] = 0 - conf['filter_conf']['token_max_length'] = 102400 - conf['filter_conf']['token_min_length'] = 0 - conf['filter_conf']['max_output_input_ratio'] = 102400 - conf['filter_conf']['min_output_input_ratio'] = 0 - conf['speed_perturb'] = False - conf['spec_aug'] = False - conf['spec_sub'] = False - conf['spec_trim'] = False - conf['shuffle'] = False - conf['sort'] = False - if 'fbank_conf' in conf: - conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in conf: - conf['mfcc_conf']['dither'] = 0.0 - conf['batch_conf']['batch_type'] = "static" - conf['batch_conf']['batch_size'] = 1 - - if args.cali_datalist is not None: - logger.info("Stage-2: Generate config") - # FIXME(xcsong): Remove hard code - logger.info("torch version: {}".format(torch.__version__)) - if int(torch.__version__[:4].split('.')[1]) >= 13: - args.extra_ops_run_on_cpu = "/Split;" + \ - "/encoders.0/self_attn/Split;/encoders.1/self_attn/Split;" + \ - 
"/encoders.2/self_attn/Split;/encoders.3/self_attn/Split;" + \ - "/encoders.4/self_attn/Split;/encoders.5/self_attn/Split;" + \ - "/encoders.6/self_attn/Split;/encoders.7/self_attn/Split;" + \ - "/encoders.8/self_attn/Split;/encoders.9/self_attn/Split;" + \ - "/encoders.10/self_attn/Split;/encoders.11/self_attn/Split;" + \ - "/encoders.0/self_attn/Mul;/encoders.1/self_attn/Mul;" + \ - "/encoders.2/self_attn/Mul;/encoders.3/self_attn/Mul;" + \ - "/encoders.4/self_attn/Mul;/encoders.5/self_attn/Mul;" + \ - "/encoders.6/self_attn/Mul;/encoders.7/self_attn/Mul;" + \ - "/encoders.8/self_attn/Mul;/encoders.9/self_attn/Mul;" + \ - "/encoders.10/self_attn/Mul;/encoders.11/self_attn/Mul;" - else: - args.extra_ops_run_on_cpu = "Split_17;Split_67;Split_209;" + \ - "Split_351;Split_493;Split_635;Split_777;Split_919;Split_1061;" + \ - "Split_1203;Split_1345;Split_1487;Split_1629;" + \ - "Mul_72;Mul_214;Mul_356;Mul_498;Mul_640;Mul_782;" + \ - "Mul_924;Mul_1066;Mul_1208;Mul_1350;Mul_1492;Mul_1634;" - generate_config(enc_session, ctc_session, args) - - logger.info("Stage-3: Make calibration data") - make_calibration_data(enc, args, conf) - - output_dir = os.path.realpath(args.output_dir) - logger.info("Stage-4: Make ctc.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_ctc".format(output_dir) + - " && cd hb_makertbin_log_ctc &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_ctc.yaml") - ) - logger.info("Stage-5: Make encoder.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_encoder ".format(output_dir) + - " && cd hb_makertbin_log_encoder &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_encoder.yaml") - ) - - if args.wer_datalist is not None: - logger.info("Stage-6: Check wer between torch model and quantized onnx") - assert args.wer_text is not None - check_wer(enc, ctc, args, conf) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/torch_text", - args.output_dir + "/torch_wer") - ) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/onnx_text", - args.output_dir + "/onnx_wer") - ) - os.system("tail {} {}".format( - args.output_dir + "/torch_wer", args.output_dir + "/onnx_wer")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/parse_options.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/parse_options.sh deleted file mode 100644 index 34476fdb37a4b14d5fe6e0edbebe97e760d2be5a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- - -# Parse command-line options. -# To be sourced by another script (as in ". parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/perturb_data_dir_speed.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/perturb_data_dir_speed.sh deleted file mode 100644 index 901a4882e6481ae269067b0fe7175dba62c4db9e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/perturb_data_dir_speed.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# 2020 @kamo-naoyuki -# This file was copied from Kaldi and -# I deleted parts related to wav duration -# because we shouldn't use kaldi's command here -# and we don't need the files actually. 
- -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 2014 Tom Ko -# 2018 Emotech LTD (author: Pawel Swietojanski) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# wav.scp -# spk2utt -# utt2spk -# text -# -# It generates the files which are used for perturbing the speed of the original data. - -export LC_ALL=C -set -euo pipefail - -if [[ $# != 3 ]]; then - echo "Usage: perturb_data_dir_speed.sh " - echo "e.g.:" - echo " $0 0.9 data/train_si284 data/train_si284p" - exit 1 -fi - -factor=$1 -srcdir=$2 -destdir=$3 -label="sp" -spk_prefix="${label}${factor}-" -utt_prefix="${label}${factor}-" - -#check is sox on the path - -! command -v sox &>/dev/null && echo "sox: command not found" && exit 1; - -if [[ ! -f ${srcdir}/utt2spk ]]; then - echo "$0: no such file ${srcdir}/utt2spk" - exit 1; -fi - -if [[ ${destdir} == "${srcdir}" ]]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -mkdir -p "${destdir}" - -<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map" -<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map" -<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map" -if [[ ! -f ${srcdir}/utt2uniq ]]; then - <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq" -else - <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq" -fi - - -<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \ - utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk - -utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt - -if [[ -f ${srcdir}/segments ]]; then - - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \ - utils/apply_map.pl -f 2 "${destdir}"/reco_map | \ - awk -v factor="${factor}" \ - '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \ - >"${destdir}"/segments - - utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - if [[ -f ${srcdir}/reco2file_and_channel ]]; then - utils/apply_map.pl -f 1 "${destdir}"/reco_map \ - <"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel - fi - -else # no segments->wav indexed by utterance. 
- if [[ -f ${srcdir}/wav.scp ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - fi -fi - -if [[ -f ${srcdir}/text ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text -fi -if [[ -f ${srcdir}/spk2gender ]]; then - utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender -fi -if [[ -f ${srcdir}/utt2lang ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang -fi - -rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null -echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}" - -utils/validate_data_dir.sh --no-feats --no-text "${destdir}" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/reduce_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/reduce_data_dir.sh deleted file mode 100644 index 16194dcc7309a646041181a698c53cd4f46e618b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/reduce_data_dir.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# koried, 10/29/2012 - -# Reduce a data set based on a list of turn-ids - -help_message="usage: $0 srcdir turnlist destdir" - -if [ $1 == "--help" ]; then - echo "${help_message}" - exit 0; -fi - -if [ $# != 3 ]; then - echo "${help_message}" - exit 1; -fi - -srcdir=$1 -reclist=$2 -destdir=$3 - -if [ ! -f ${srcdir}/utt2spk ]; then -echo "$0: no such file $srcdir/utt2spk" -exit 1; -fi - -function do_filtering { -# assumes the utt2spk and spk2utt files already exist. - [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp - [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text - [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames - [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender - [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp - if [ -f ${srcdir}/segments ]; then - utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments - awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. 
- [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/reco2file_and_channel ] && \ - utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel - - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm - rm ${destdir}/reco - fi - srcutts=$(wc -l < ${srcdir}/utt2spk) - destutts=$(wc -l < ${destdir}/utt2spk) - echo "Reduced #utt from $srcutts to $destutts" -} - -mkdir -p ${destdir} - -# filter the utt2spk based on the set of recordings -utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk - -utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt -do_filtering; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/remove_longshortdata.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/remove_longshortdata.py deleted file mode 100644 index 7e92f8a424d2d717acf6fc1db5503f79ba38a898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/remove_longshortdata.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='remove too long or too short data in format.data') - parser.add_argument('--data_file', - type=str, - help='input format data') - parser.add_argument('--output_data_file', - type=str, - help='output format data') - parser.add_argument('--min_input_len', type=float, - default=0, - help='minimum input seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--max_input_len', type=float, - default=20, - help='maximum output seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--min_output_len', type=float, - default=0, help='minimum input seq length, in modeling units') - parser.add_argument('--max_output_len', type=float, - default=500, - help='maximum output seq length, in modeling units') - parser.add_argument('--min_output_input_ratio', type=float, default=0.05, - help='minimum output seq length/output seq length ratio') - parser.add_argument('--max_output_input_ratio', type=float, default=10, - help='maximum output seq length/output seq length ratio') - args = parser.parse_args() - - data_file = args.data_file - output_data_file = args.output_data_file - min_input_len = args.min_input_len - max_input_len = args.max_input_len - min_output_len = args.min_output_len - max_output_len = args.max_output_len - min_output_input_ratio = args.min_output_input_ratio - max_output_input_ratio = args.max_output_input_ratio - - with open(data_file, 'r') as f, open(output_data_file, 'w') as fout: - for l in f: - l = l.strip() - if l: - items = l.strip().split('\t') - token_shape = items[6] - feature_shape = items[2] - feat_len = float(feature_shape.split(':')[1].split(',')[0]) - token_len = float(token_shape.split(':')[1].split(',')[0]) - condition = [feat_len > min_input_len, - feat_len < max_input_len, - token_len > min_output_len, - token_len < max_output_len, - token_len / feat_len > min_output_input_ratio, - token_len / feat_len < max_output_input_ratio, - ] - if all(condition): - fout.write('{}\n'.format(l)) - continue diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/segment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/segment.py deleted file mode 100644 index a1a7f93a05fbaf42ca09c26c0e5be6a7185f0d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/segment.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2021 Mobvoi Inc. (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='generate segmented wav.scp') - parser.add_argument('--segments', required=True, help='segments file') - parser.add_argument('--input', - required=True, - help='origin wav.scp that not segmented') - parser.add_argument('--output', - required=True, - help='output segmented wav.scp') - wav_dic = {} - args = parser.parse_args() - ori_wav = args.input - segment_file = args.segments - wav_scp = args.output - with open(ori_wav, 'r') as ori: - for l in ori: - item = l.strip().split() - wav_dic[item[0]] = item[1] - with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement: - for l in sgement: - item = l.strip().split() - if item[1] in wav_dic: - item[1] = wav_dic[item[1]] - f.write("{} {},{},{}\n".format(item[0], item[1], item[2], item[3])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/setup_anaconda.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/setup_anaconda.sh deleted file mode 100644 index f53ace9cc4c19994fc79d01e85d70f49d40d673f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/setup_anaconda.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env bash -# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet) -set -euo pipefail - -if [ -z "${PS1:-}" ]; then - PS1=__dummy__ -fi -CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - -if [ $# -gt 4 ]; then - echo "Usage: $0 [output] [conda-env-name] [python-version>]" - exit 1; -elif [ $# -eq 3 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="$3" -elif [ $# -eq 2 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="" -elif [ $# -eq 1 ]; then - output_dir="$1" - name="" - PYTHON_VERSION="" -elif [ $# -eq 0 ]; then - output_dir=venv - name="" - PYTHON_VERSION="" -fi - -if [ -e activate_python.sh ]; then - echo "Warning: activate_python.sh already exists. It will be overwritten" -fi - -if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then - if [ ! -e miniconda.sh ]; then - wget --tries=3 "${CONDA_URL}" -O miniconda.sh - fi - - bash miniconda.sh -b -p "${output_dir}" -fi - -# shellcheck disable=SC1090 -source "${output_dir}/etc/profile.d/conda.sh" -conda deactivate - -# If the env already exists, skip recreation -if [ -n "${name}" ] && ! conda activate ${name}; then - conda create -yn "${name}" -fi -conda activate ${name} - -if [ -n "${PYTHON_VERSION}" ]; then - conda install -y conda "python=${PYTHON_VERSION}" -else - conda install -y conda -fi - -conda install -y pip setuptools - -cat << EOF > activate_python.sh -#!/usr/bin/env bash -# THIS FILE IS GENERATED BY tools/setup_anaconda.sh -if [ -z "\${PS1:-}" ]; then - PS1=__dummy__ -fi -. 
$(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name} -EOF diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/sph2wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/sph2wav.sh deleted file mode 100644 index a8f0749e3be2ee69b5831da6699c303510ecbed4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/sph2wav.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -# convert sph scp to segmented wav scp -nj=1 -. tools/parse_options.sh || exit 1; - -inscp=$1 -segments=$2 -outscp=$3 -data=$(dirname ${inscp}) -if [ $# -eq 4 ]; then - logdir=$4 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe -[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; -sox=`which sox` -[ ! -x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2); - printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \ - sort > $data/wav_ori.scp || exit 1; - -tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp -sed -i 's/ /,/g' $data/wav_segments.scp -sed -i 's/#/ /g' $data/wav_segments.scp - -rm -f $logdir/wav_*.slice -rm -f $logdir/*.log -split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - mkdir -p ${data}/wavs/${name} - cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \ - -v logdir=$logdir -v name=$name '{ - during=$4-$3 - cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during; - system(cmd) - printf("%s %s/%s.wav\n", $1, data, $1); - }' | \ - sort > ${data}/wavs_${name}.scp || exit 1; -} & -done -wait -cat ${data}/wavs_*.scp > $outscp -rm ${data}/wavs_*.scp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/spk2utt_to_utt2spk.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/spk2utt_to_utt2spk.pl deleted file mode 100644 index 19fb89d501146e360912863d847d6eabb0194511..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/spm_decode b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/spm_decode deleted file mode 100644 index 882b4f966013d7708460f8d41696583ae59f8fa9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/spm_decode +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for decoding") - parser.add_argument("--input", default=None, help="input file to decode") - parser.add_argument("--input_format", choices=["piece", "id"], default="piece") - args = parser.parse_args() - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.input_format == "piece": - def decode(l): - return "".join(sp.DecodePieces(l)) - elif args.input_format == "id": - def decode(l): - return "".join(sp.DecodeIds(l)) - else: - raise NotImplementedError - - def tok2int(tok): - # remap reference-side (represented as <>) to 0 - return int(tok) if tok != "<>" else 0 - - if args.input is None: - h = sys.stdin - else: - h = open(args.input, "r", encoding="utf-8") - for line in h: - print(decode(line.split())) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/spm_encode b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/spm_encode deleted file mode 100644 index 4dd2e1004f9fe393c2d34b43bade881b84a31b1f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/spm_encode +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import contextlib -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for encoding") - parser.add_argument("--inputs", nargs="+", default=['-'], - help="input files to filter/encode") - parser.add_argument("--outputs", nargs="+", default=['-'], - help="path to save encoded outputs") - parser.add_argument("--output_format", choices=["piece", "id"], default="piece") - parser.add_argument("--min-len", type=int, metavar="N", - help="filter sentence pairs with fewer than N tokens") - parser.add_argument("--max-len", type=int, metavar="N", - help="filter sentence pairs with more than N tokens") - args = parser.parse_args() - - assert len(args.inputs) == len(args.outputs), \ - "number of input and output paths should match" - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.output_format == "piece": - def encode(l): - return sp.EncodeAsPieces(l) - elif args.output_format == "id": - def encode(l): - return list(map(str, sp.EncodeAsIds(l))) - else: - raise NotImplementedError - - if args.min_len is not None or args.max_len is not None: - def valid(line): - return ( - (args.min_len is None or len(line) >= args.min_len) and - (args.max_len is None or len(line) <= args.max_len) - ) - else: - def valid(lines): - return True - - with contextlib.ExitStack() as stack: - inputs = [ - stack.enter_context(open(input, "r", encoding="utf-8")) - if input != "-" else sys.stdin - for input in args.inputs - ] - outputs = [ - stack.enter_context(open(output, "w", encoding="utf-8")) - if output != "-" else sys.stdout - for output in args.outputs - ] - - stats = { - "num_empty": 0, - "num_filtered": 0, - } - - def encode_line(line): - line = line.strip() - if len(line) > 0: - line = encode(line) - if valid(line): - return line - else: - stats["num_filtered"] += 1 - else: - stats["num_empty"] += 1 - return None - - for i, lines in enumerate(zip(*inputs), start=1): - enc_lines = list(map(encode_line, lines)) - if not any(enc_line is None for enc_line in enc_lines): - for enc_line, output_h in zip(enc_lines, outputs): - print(" ".join(enc_line), file=output_h) - if i % 10000 == 0: - print("processed {} lines".format(i), file=sys.stderr) - - print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) - print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/spm_train b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/spm_train deleted file mode 100644 index 0b247aee0dc5fcaa7b6cf66d89602e896619c9bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/spm_train +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE -import sys - -import sentencepiece as spm - - -if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/subset_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/subset_data_dir.sh deleted file mode 100644 index c35bee62d8710facb8c42a9171ed3caf0171450f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/subset_data_dir.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2010-2011 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - - -# This script operates on a data directory, such as in data/train/. -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data -# for what these directories contain. - -# This script creates a subset of that data, consisting of some specified -# number of utterances. (The selected utterances are distributed evenly -# throughout the file, by the program ./subset_scp.pl). - -# There are six options, none compatible with any other. - -# If you give the --per-spk option, it will attempt to select the supplied -# number of utterances for each speaker (typically you would supply a much -# smaller number in this case). - -# If you give the --speakers option, it selects a subset of n randomly -# selected speakers. - -# If you give the --shortest option, it will give you the n shortest utterances. - -# If you give the --first option, it will just give you the n first utterances. - -# If you give the --last option, it will just give you the n last utterances. - -# If you give the --spk-list or --utt-list option, it reads the -# speakers/utterances to keep from /" (note, -# in this case there is no positional parameter; see usage message.) - - -shortest=false -perspk=false -speakers=false -first_opt= -spk_list= -utt_list= - -expect_args=3 -case $1 in - --first|--last) first_opt=$1; shift ;; - --per-spk) perspk=true; shift ;; - --shortest) shortest=true; shift ;; - --speakers) speakers=true; shift ;; - --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; - --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; - --*) echo "$0: invalid option '$1'"; exit 1 -esac - -if [ $# != $expect_args ]; then - echo "Usage:" - echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] " - echo " subset_data_dir.sh [--spk-list ] " - echo " subset_data_dir.sh [--utt-list ] " - echo "By default, randomly selects utterances from the data directory." - echo "With --speakers, randomly selects enough speakers that we have utterances" - echo "With --per-spk, selects utterances per speaker, if available." - echo "With --first, selects the first utterances" - echo "With --last, selects the last utterances" - echo "With --shortest, selects the shortest utterances." - echo "With --spk-list, reads the speakers to keep from " - echo "With --utt-list, reads the utterances to keep from " - exit 1; -fi - -srcdir=$1 -if [[ $spk_list || $utt_list ]]; then - numutt= - destdir=$2 -else - numutt=$2 - destdir=$3 -fi - -export LC_ALL=C - -if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" - exit 1 -fi - -if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then - echo "$0: cannot subset to more utterances than you originally had." 
- exit 1 -fi - -if $shortest && [ ! -f $srcdir/feats.scp ]; then - echo "$0: you selected --shortest but no feats.scp exist." - exit 1 -fi - -mkdir -p $destdir || exit 1 - -if [[ $spk_list ]]; then - tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; -elif [[ $utt_list ]]; then - tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; -elif $speakers; then - tools/shuffle_list.pl < $srcdir/spk2utt | - awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | - sort > $destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -elif $perspk; then - awk '{ n='$numutt'; printf("%s ",$1); - skip=1; while(n*(skip+1) <= NF-1) { skip++; } - for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); } - printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -else - if $shortest; then - # Select $numutt shortest utterances. - . ./path.sh - feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; - sort -n -k2 $destdir/tmp.len | - awk '{print $1}' | - head -$numutt >$destdir/tmp.uttlist - tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk - rm $destdir/tmp.uttlist $destdir/tmp.len - else - # Select $numutt random utterances. - tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; - fi - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt -fi - -# Perform filtering. utt2spk and spk2utt files already exist by this point. -# Filter by utterance. -[ -f $srcdir/feats.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp -[ -f $srcdir/vad.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp -[ -f $srcdir/utt2lang ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang -[ -f $srcdir/utt2dur ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur -[ -f $srcdir/utt2num_frames ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames -[ -f $srcdir/utt2uniq ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq -[ -f $srcdir/wav.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp -[ -f $srcdir/utt2warp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp -[ -f $srcdir/text ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - -# Filter by speaker. -[ -f $srcdir/spk2warp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp -[ -f $srcdir/spk2gender ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender -[ -f $srcdir/cmvn.scp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - -# Filter by recording-id. -if [ -f $srcdir/segments ]; then - tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - # Recording-ids are in segments. - awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco - # The next line overrides the command above for wav.scp, which would be incorrect. - #[ -f $srcdir/wav.scp ] && - # tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp -else - # No segments; recording-ids are in wav.scp. 
- awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco -fi - -[ -f $srcdir/reco2file_and_channel ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel -[ -f $srcdir/reco2dur ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur - -# Filter the STM file for proper sclite scoring. -# Copy over the comments from STM file. -[ -f $srcdir/stm ] && - (grep "^;;" $srcdir/stm - tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm - -rm $destdir/reco - -# Copy frame_shift if present. -[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir - -srcutts=$(wc -l <$srcdir/utt2spk) -destutts=$(wc -l <$destdir/utt2spk) -echo "$0: reducing #utt from $srcutts to $destutts" -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/subset_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/subset_scp.pl deleted file mode 100644 index 11fddc09a0f4e5fad8e5d63cf65e7e5e627e4af6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/subset_scp.pl +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This program selects a subset of N elements in the scp. - -# By default, it selects them evenly from throughout the scp, in order to avoid -# selecting too many from the same speaker. It prints them on the standard -# output. -# With the option --first, it just selects the N first utterances. -# With the option --last, it just selects the N last utterances. - -# Last modified by JHU & HKUST @2013 - - -$quiet = 0; -$first = 0; -$last = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--quiet") { - shift; - $quiet = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--first") { - shift; - $first = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--last") { - shift; - $last = 1; -} - -if(@ARGV < 2 ) { - die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . - " --quiet causes it to not die if N < num lines in scp.\n" . - " --first and --last make it equivalent to head or tail.\n" . 
- "See also: filter_scp.pl\n"; -} - -$N = shift @ARGV; -if($N == 0) { - die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; -} -$inscp = shift @ARGV; -open(I, "<$inscp") || die "Opening input scp file $inscp"; - -@F = (); -while() { - push @F, $_; -} -$numlines = @F; -if($N > $numlines) { - if ($quiet) { - $N = $numlines; - } else { - die "You requested from subset_scp.pl more elements than available: $N > $numlines"; - } -} - -sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if ($num_needed > $diff) { - die "select_n: code error"; - } - if ($diff == 1 ) { - if ($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); - } -} - -if ( ! $first && ! $last) { - if ($N > 0) { - select_n(0, $numlines, $N); - } -} else { - if ($first) { # --first option: same as head. - for ($n = 0; $n < $N; $n++) { - print $F[$n]; - } - } else { # --last option: same as tail. - for ($n = @F - $N; $n < @F; $n++) { - print $F[$n]; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/sym2int.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/sym2int.pl deleted file mode 100644 index cec097b6bdaefb5c3452e31fa334f0a7530b9a72..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/sym2int.pl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; - -for($x = 0; $x < 2; $x++) { - if ($ARGV[0] eq "--map-oov") { - shift @ARGV; - $map_oov = shift @ARGV; - if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { - # disallow '-f', the empty string and anything ending in words.txt as the - # OOV symbol because these are likely command-line errors. - die "the --map-oov option requires an argument"; - } - } - if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } - } -} - -$symtab = shift @ARGV; -if (!defined $symtab) { - print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . - "options: [--map-oov ] [-f ]\n" . 
- "note: can look like 4-5, or 4-, or 5-, or 1.\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up - if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } - $map_oov = $sym2int{$map_oov}; -} - -$num_warning = 0; -$max_warning = 20; - -while (<>) { - @A = split(" ", $_); - @B = (); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ( (!defined $field_begin || $n >= $field_begin) - && (!defined $field_end || $n <= $field_end)) { - $i = $sym2int{$a}; - if (!defined ($i)) { - if (defined $map_oov) { - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $map_oov; - } else { - $pos = $n+1; - die "sym2int.pl: undefined symbol $a (in position $pos)\n"; - } - } - $a = $i; - } - push @B, $a; - } - print join(" ", @B); - print "\n"; -} -if ($num_warning > 0) { - print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; -} - -exit(0); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/text2token.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/text2token.py deleted file mode 100644 index 4f4dcc901d436650695f0b80e0cf99e1e99269ee..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/text2token.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan) -# Copyright 2021 Mobvoi Inc. All Rights Reserved. (Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -import re -import sys - -is_python2 = sys.version_info[0] == 2 - - -def exist_or_not(i, match_pos): - start_pos = None - end_pos = None - for pos in match_pos: - if pos[0] <= i < pos[1]: - start_pos = pos[0] - end_pos = pos[1] - break - - return start_pos, end_pos - -def seg_char(sent): - pattern = re.compile(r'([\u4e00-\u9fa5])') - chars = pattern.split(sent) - chars = [w for w in chars if len(w.strip()) > 0] - return chars - -def get_parser(): - parser = argparse.ArgumentParser( - description='convert raw text to tokenized text', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--nchar', - '-n', - default=1, - type=int, - help='number of characters to split, i.e., \ - aabb -> a a b b with -n 1 and aa bb with -n 2') - parser.add_argument('--skip-ncols', - '-s', - default=0, - type=int, - help='skip first n columns') - parser.add_argument('--space', - default='', - type=str, - help='space symbol') - parser.add_argument('--bpe-model', - '-m', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--non-lang-syms', - '-l', - default=None, - type=str, - help='list of non-linguistic symobles,' - ' e.g., etc.') - parser.add_argument('text', - type=str, - default=False, - nargs='?', - help='input text') - parser.add_argument('--trans_type', - '-t', - type=str, - default="char", - choices=["char", "phn", "cn_char_en_bpe"], - help="""Transcript type. char/phn. 
e.g., for TIMIT - FADG0_SI1279 - - If trans_type is char, read from - SI1279.WRD file -> "bricks are an alternative" - Else if trans_type is phn, - read from SI1279.PHN file -> - "sil b r ih sil k s aa r er n aa l - sil t er n ih sil t ih v sil" """) - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - rs = [] - if args.non_lang_syms is not None: - with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f: - nls = [x.rstrip() for x in f.readlines()] - rs = [re.compile(re.escape(x)) for x in nls] - - if args.bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(args.bpe_model) - - if args.text: - f = codecs.open(args.text, encoding="utf-8") - else: - f = codecs.getreader("utf-8")( - sys.stdin if is_python2 else sys.stdin.buffer) - - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer) - line = f.readline() - n = args.nchar - while line: - x = line.split() - print(' '.join(x[:args.skip_ncols]), end=" ") - a = ' '.join(x[args.skip_ncols:]) - - # get all matched positions - match_pos = [] - for r in rs: - i = 0 - while i >= 0: - m = r.search(a, i) - if m: - match_pos.append([m.start(), m.end()]) - i = m.end() - else: - break - - if len(match_pos) > 0: - chars = [] - i = 0 - while i < len(a): - start_pos, end_pos = exist_or_not(i, match_pos) - if start_pos is not None: - chars.append(a[start_pos:end_pos]) - i = end_pos - else: - chars.append(a[i]) - i += 1 - a = chars - - if (args.trans_type == "phn"): - a = a.split(" ") - elif args.trans_type == "cn_char_en_bpe": - b = seg_char(a) - a = [] - for j in b: - # we use "▁" to instead of blanks among english words - # warning: here is "▁", not "_" - for l in j.strip().split("▁"): - if not l.encode('UTF-8').isalpha(): - a.append(l) - else: - for k in sp.encode_as_pieces(l): - a.append(k) - else: - a = [a[j:j + n] for j in range(0, len(a), n)] - - a_flat = [] - for z in a: - a_flat.append("".join(z)) - - a_chars = [z.replace(' ', args.space) for z in a_flat] - if (args.trans_type == "phn"): - a_chars = [z.replace("sil", args.space) for z in a_chars] - print(' '.join(a_chars)) - line = f.readline() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/utt2spk_to_spk2utt.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/utt2spk_to_spk2utt.pl deleted file mode 100644 index 5086699ff85fdcb8667bb9ab054700c53e35fd0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. 
- -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/validate_data_dir.sh deleted file mode 100644 index f4b4cbe1410111555d56380078e3d55381e7155a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/validate_data_dir.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/bin/bash - -cmd="$@" - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." - echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! -d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -if [ -f $data/images.scp ]; then - cmd=${cmd/--no-wav/} # remove --no-wav if supplied - image/validate_data_dir.sh $cmd - exit $? -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1; - ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! 
cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - tools/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." - exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! 
cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! 
cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/validate_dict_dir.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/validate_dict_dir.pl deleted file mode 100644 index 819fca7f03caff91f3f24f0b69876a0bfc0abbe9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/validate_dict_dir.pl +++ /dev/null @@ -1,531 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. 
-# Copyright 2012 Guoguo Chen -# 2015 Daniel Povey -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for 'dict' directories (e.g. data/local/dict) - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line($.) $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n"; - if ($has_invalid_whitespaces) { - print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n"; - return 0; - } else { - print "--> text contains only allowed whitespaces\n"; - } - } else { - print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n"; - } - return 1; -} - - -if(@ARGV != 1) { - die "Usage: validate_dict_dir.pl \n" . - "e.g.: validate_dict_dir.pl data/local/dict\n"; -} - -$dict = shift @ARGV; -$dict =~ s:/$::; - -$exit = 0; -$success = 1; # this is re-set each time we read a file. 
- -sub set_to_fail { $exit = 1; $success = 0; } - -# Checking silence_phones.txt ------------------------------- -print "Checking $dict/silence_phones.txt ...\n"; -if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} -$idx = 1; -%silence = (); -$crlf = 1; - -print "--> reading $dict/silence_phones.txt\n"; -check_allowed_whitespace(\*S) || set_to_fail(); -while() { - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($silence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; - } else { - $silence{$p} = 1; - } - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. - if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(S); -$success == 0 || print "--> $dict/silence_phones.txt is OK\n"; -print "\n"; - -# Checking optional_silence.txt ------------------------------- -print "Checking $dict/optional_silence.txt ...\n"; -if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;} -if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;} -$idx = 1; -$success = 1; -$crlf = 1; -print "--> reading $dict/optional_silence.txt\n"; -check_allowed_whitespace(\*OS) or exit 1; -while() { - chomp; - my @col = split(" ", $_); - if ($idx > 1 or @col > 1) { - set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; - } elsif (!$silence{$col[0]}) { - set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - $idx ++; -} -close(OS); -$success == 0 || print "--> $dict/optional_silence.txt is OK\n"; -print "\n"; - -# Checking nonsilence_phones.txt ------------------------------- -print "Checking $dict/nonsilence_phones.txt ...\n"; -if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;} -$idx = 1; -%nonsilence = (); -$success = 1; -$crlf = 1; -print "--> reading $dict/nonsilence_phones.txt\n"; -check_allowed_whitespace(\*NS) or set_to_fail(); -while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! 
s/\n$//) { - print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($nonsilence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; - } else { - $nonsilence{$p} = 1; - } - # phones that start with the pound sign/hash may be mistaken for - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. - if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(NS); -$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n"; -print "\n"; - -# Checking disjoint ------------------------------- -sub intersect { - my ($a, $b) = @_; - @itset = (); - %itset = (); - foreach(keys %$a) { - if(exists $b->{$_} and !$itset{$_}) { - push(@itset, $_); - $itset{$_} = 1; - } - } - return @itset; -} - -print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n"; -@itset = intersect(\%silence, \%nonsilence); -if(@itset == 0) {print "--> disjoint property is OK.\n";} -else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";} -print "\n"; - - -sub check_lexicon { - my ($lex, $num_prob_cols, $num_skipped_cols) = @_; - print "Checking $lex\n"; - !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail(); - my %seen_line = {}; - $idx = 1; $success = 1; $crlf = 1; - print "--> reading $lex\n"; - check_allowed_whitespace(\*L) or set_to_fail(); - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $lex contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (defined $seen_line{$_}) { - print "--> ERROR: line '$_' of $lex is repeated\n"; - set_to_fail(); - } - $seen_line{$_} = 1; - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $lex does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - $word = shift @col; - if (!defined $word) { - print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail(); - } - if ($word eq "" || $word eq "" || $word eq "" || $word eq "#0") { - print "--> ERROR: lexicon.txt contains forbidden word $word\n"; - set_to_fail(); - } - for ($n = 0; $n < $num_prob_cols; $n++) { - $prob = shift @col; - if (!($prob > 0.0 && $prob <= 1.0)) { - print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n"; - set_to_fail(); - } - } - for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; } - if (@col == 0) { - print "--> ERROR: lexicon.txt contains word $word with empty "; - print "pronunciation.\n"; - set_to_fail(); - } - foreach (0 .. @col-1) { - if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt "; - print "(line $idx)\n"; - set_to_fail(); - } - } - $idx ++; - } - close(L); - $success == 0 || print "--> $lex is OK\n"; - print "\n"; -} - -if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); } -if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); } -if (-f "$dict/lexiconp_silprob.txt") { - # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also - # exist. 
- check_lexicon("$dict/lexiconp_silprob.txt", 2, 2); - if (-f "$dict/silprob.txt") { - !open(SP, "<$dict/silprob.txt") && - print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail(); - $crlf = 1; - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - chomp; my @col = split; - @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail(); - if ($col[0] eq "" || $col[0] eq "overall") { - if (!($col[1] > 0.0 && $col[1] <= 1.0)) { - set_to_fail(); - print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n"; - } - } elsif ($col[0] eq "_s" || $col[0] eq "_n") { - if ($col[1] <= 0.0) { - set_to_fail(); - print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n"; - } - } else { - print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n"; - set_to_fail(); - } - } - close(SP); - } else { - set_to_fail(); - print "--> ERROR: expecting $dict/silprob.txt to exist\n"; - } -} - -if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) { - print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n"; - set_to_fail(); -} - -sub check_lexicon_pair { - my ($lex1, $num_prob_cols1, $num_skipped_cols1, - $lex2, $num_prob_cols2, $num_skipped_cols2) = @_; - # We have checked individual lexicons already. - open(L1, "<$lex1"); open(L2, "<$lex2"); - print "Checking lexicon pair $lex1 and $lex2\n"; - my $line_num = 0; - while() { - $line_num++; - @A = split; - $line_B = ; - if (!defined $line_B) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); last; - } - @B = split(" ", $line_B); - # Check if the word matches. - if ($A[0] ne $B[0]) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - shift @A; shift @B; - for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; } - for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; } - # Check if the pronunciation matches - if (join(" ", @A) ne join(" ", @B)) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - } - $line_B = ; - if (defined $line_B && $exit == 0) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); - } - $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n"; -} - -# If more than one lexicon exist, we have to check if they correspond to each -# other. It could be that the user overwrote one and we need to regenerate the -# other, but we do not know which is which. -if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") { - check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0); -} -if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") { - check_lexicon_pair("$dict/lexiconp.txt", 1, 0, - "$dict/lexiconp_silprob.txt", 2, 2); -} - -# Checking extra_questions.txt ------------------------------- -%distinguished = (); # Keep track of all phone-pairs including nonsilence that - # are distinguished (split apart) by extra_questions.txt, - # as $distinguished{$p1,$p2} = 1. This will be used to - # make sure that we don't have pairs of phones on the same - # line in nonsilence_phones.txt that can never be - # distinguished from each other by questions. 
(If any two - # phones appear on the same line in nonsilence_phones.txt, - # they share a tree root, and since the automatic - # question-building treats all phones that appear on the - # same line of nonsilence_phones.txt as being in the same - # group, we can never distinguish them without resorting to - # questions in extra_questions.txt. -print "Checking $dict/extra_questions.txt ...\n"; -if (-s "$dict/extra_questions.txt") { - if (!open(EX, "<$dict/extra_questions.txt")) { - set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n"; - } - $idx = 1; - $success = 1; - $crlf = 1; - print "--> reading $dict/extra_questions.txt\n"; - check_allowed_whitespace(\*EX) or set_to_fail(); - while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n"; - } - foreach (0 .. @col-1) { - if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n"; - } - $idx ++; - } - %col_hash = (); - foreach $p (@col) { $col_hash{$p} = 1; } - foreach $p1 (@col) { - # Update %distinguished hash. - foreach $p2 (keys %nonsilence) { - if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not - # in this question (and in nonsilence - # phones)... mark p1,p2 as being split apart - $distinguished{$p1,$p2} = 1; - $distinguished{$p2,$p1} = 1; - } - } - } - } - close(EX); - $success == 0 || print "--> $dict/extra_questions.txt is OK\n"; -} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";} - -if (-f "$dict/nonterminals.txt") { - open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt"; - my %nonterminals = (); - my $line_number = 1; - while () { - chop; - my @line = split(" ", $_); - if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) { - print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1; - } - $nonterminals{$line[0]} = 1; - $line_number++; - } - print "--> $dict/nonterminals.txt is OK\n"; -} - - -# check nonsilence_phones.txt again for phone-pairs that are never -# distnguishable. (note: this situation is normal and expected for silence -# phones, so we don't check it.) -if(!open(NS, "<$dict/nonsilence_phones.txt")) { - print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1; -} - -$num_warn_nosplit = 0; -$num_warn_nosplit_limit = 10; -while() { - my @col = split(" ", $_); - foreach $p1 (@col) { - foreach $p2 (@col) { - if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) { - set_to_fail(); - if ($num_warn_nosplit <= $num_warn_nosplit_limit) { - print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n"; - } - if ($num_warn_nosplit == $num_warn_nosplit_limit) { - print "... Not warning any more times about this issue.\n"; - } - if ($num_warn_nosplit == 0) { - print " (note: we started checking for this only recently. 
You can still build a system but\n"; - print " phones $p1 and $p2 will be acoustically indistinguishable).\n"; - } - $num_warn_nosplit++; - } - } - } -} - - -if ($exit == 1) { - print "--> ERROR validating dictionary directory $dict (see detailed error "; - print "messages above)\n\n"; - exit 1; -} else { - print "--> SUCCESS [validating dictionary directory $dict]\n\n"; -} - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/validate_text.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/validate_text.pl deleted file mode 100644 index 7f75cf12f20f6e22948682e8e726e628a72dac69..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/validate_text.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env perl -# -#=============================================================================== -# Copyright 2017 Johns Hopkins University (author: Yenda Trmal ) -# Johns Hopkins University (author: Daniel Povey) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# validation script for data//text -# to be called (preferably) from utils/validate_data_dir.sh -use strict; -use warnings; -use utf8; -use Fcntl qw< SEEK_SET >; - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). 
-sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl \n" . 
- "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/wav2dur.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/wav2dur.py deleted file mode 100644 index 1bcc1b693458b66c0e341e5d6b375cc81e6db8b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/wav2dur.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -import torchaudio -torchaudio.set_audio_backend("sox_io") - -scp = sys.argv[1] -dur_scp = sys.argv[2] - -with open(scp, 'r') as f, open(dur_scp, 'w') as fout: - cnt = 0 - total_duration = 0 - for l in f: - items = l.strip().split() - wav_id = items[0] - fname = items[1] - cnt += 1 - waveform, rate = torchaudio.load(fname) - frames = len(waveform[0]) - duration = frames / float(rate) - total_duration += duration - fout.write('{} {}\n'.format(wav_id, duration)) - print('process {} utts'.format(cnt)) - print('total {} s'.format(total_duration)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/wav_to_duration.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/wav_to_duration.sh deleted file mode 100644 index 51b055c633ac809b6b8d702925dc47875973403d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/wav_to_duration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# split the wav scp, calculate duration and merge -nj=4 -. tools/parse_options.sh || exit 1; - -inscp=$1 -outscp=$2 -data=$(dirname ${inscp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -rm -f $logdir/wav_*.slice -rm -f $logdir/wav_*.shape -split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log -} & -done -wait -cat $logdir/wav_*.shape > $outscp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/websocket/performance-ws.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/websocket/performance-ws.py deleted file mode 100644 index af77dea06bb41297b674b5b6dbfd0266bcff5d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/tools/websocket/performance-ws.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -# coding:utf-8 - -# Copyright (c) 2022 SDCI Co. Ltd (author: veelion) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import json -import time -import asyncio -import argparse -import websockets -import soundfile as sf -import statistics - - -WS_START = json.dumps({ - 'signal': 'start', - 'nbest': 1, - 'continuous_decoding': False, -}) -WS_END = json.dumps({ - 'signal': 'end' -}) - - -async def ws_rec(data, ws_uri): - begin = time.time() - conn = await websockets.connect(ws_uri, ping_timeout=200) - # step 1: send start - await conn.send(WS_START) - ret = await conn.recv() - # step 2: send audio data - await conn.send(data) - # step 3: send end - await conn.send(WS_END) - # step 4: receive result - texts = [] - while 1: - ret = await conn.recv() - ret = json.loads(ret) - if ret['type'] == 'final_result': - nbest = json.loads(ret['nbest']) - text = nbest[0]['sentence'] - texts.append(text) - elif ret['type'] == 'speech_end': - break - # step 5: close - try: - await conn.close() - except Exception as e: - # this except has no effect, just log as debug - # it seems the server does not send close info, maybe - print(e) - time_cost = time.time() - begin - return { - 'text': ''.join(texts), - 'time': time_cost, - } - - -def get_args(): - parser = argparse.ArgumentParser(description='') - parser.add_argument( - '-u', '--ws_uri', required=True, - help="websocket_server_main's uri, e.g. ws://127.0.0.1:10086") - parser.add_argument( - '-w', '--wav_scp', required=True, - help='path to wav_scp_file') - parser.add_argument( - '-t', '--trans', required=True, - help='path to trans_text_file of wavs') - parser.add_argument( - '-s', '--save_to', required=True, - help='path to save transcription') - parser.add_argument( - '-n', '--num_concurrence', type=int, required=True, - help='num of concurrence for query') - args = parser.parse_args() - return args - - -def print_result(info): - length = max([len(k) for k in info]) - for k, v in info.items(): - print(f'\t{k: >{length}} : {v}') - - -async def main(args): - wav_scp = [] - total_duration = 0 - with open(args.wav_scp) as f: - for line in f: - zz = line.strip().split() - assert len(zz) == 2 - data, sr = sf.read(zz[1], dtype='int16') - assert sr == 16000 - duration = (len(data)) / 16000 - total_duration += duration - wav_scp.append((zz[0], data.tobytes())) - print(f'{len(wav_scp) = }, {total_duration = }') - - tasks = [] - failed = 0 - texts = [] - request_times = [] - begin = time.time() - for i, (_uttid, data) in enumerate(wav_scp): - task = asyncio.create_task(ws_rec(data, args.ws_uri)) - tasks.append((_uttid, task)) - if len(tasks) < args.num_concurrence: - continue - print((f'{i=}, start {args.num_concurrence} ' - f'queries @ {time.strftime("%m-%d %H:%M:%S")}')) - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - tasks = [] - print(f'\tdone @ {time.strftime("%m-%d %H:%M:%S")}') - if tasks: - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - request_time = time.time() - begin - rtf = request_time / total_duration - print('For all concurrence:') - print_result({ - 'failed': failed, - 'total_duration': total_duration, - 'request_time': request_time, - 'RTF': rtf, - }) - print('For one request:') - print_result({ - 'mean': statistics.mean(request_times), - 'median': statistics.median(request_times), - 'max_time': max(request_times), - 'min_time': min(request_times), - }) - with open(args.save_to, 'w', encoding='utf8') as fsave: - fsave.write(''.join(texts)) - # caculate CER - cmd = (f'python 
../compute-wer.py --char=1 --v=1 ' - f'{args.trans} {args.save_to} > ' - f'{args.save_to}-test-{args.num_concurrence}.cer.txt') - print(cmd) - os.system(cmd) - print('done') - - -if __name__ == '__main__': - args = get_args() - asyncio.run(main(args)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/alignment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/alignment.py deleted file mode 100644 index 071691183e5af227e60fe06e4f8d4bf0f33b7f71..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/alignment.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Di Wu) -# 2022 Tinnove Inc (authors: Wei Ren) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader -from textgrid import TextGrid, IntervalTier - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.ctc_util import forced_align -from wenet.utils.common import get_subsample -from wenet.utils.init_model import init_model - - -def generator_textgrid(maxtime, lines, output): - # Download Praat: https://www.fon.hum.uva.nl/praat/ - interval = maxtime / (len(lines) + 1) - margin = 0.0001 - - tg = TextGrid(maxTime=maxtime) - linetier = IntervalTier(name="line", maxTime=maxtime) - - i = 0 - for l in lines: - s, e, w = l.split() - linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w) - - tg.append(linetier) - print("successfully generator {}".format(output)) - tg.write(output) - - -def get_frames_timestamp(alignment): - # convert alignment to a praat format, which is a doing phonetics - # by computer and helps analyzing alignment - timestamp = [] - # get frames level duration for each token - start = 0 - end = 0 - while end < len(alignment): - while end < len(alignment) and alignment[end] == 0: - end += 1 - if end == len(alignment): - timestamp[-1] += alignment[start:] - break - end += 1 - while end < len(alignment) and alignment[end - 1] == alignment[end]: - end += 1 - timestamp.append(alignment[start:end]) - start = end - return timestamp - - -def get_labformat(timestamp, subsample): - begin = 0 - duration = 0 - labformat = [] - for idx, t in enumerate(timestamp): - # 25ms frame_length,10ms hop_length, 1/subsample - subsample = get_subsample(configs) - # time duration - duration = len(t) * 0.01 * subsample - if idx < len(timestamp) - 1: - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[t[-1]])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[t[-1]])) - else: - non_blank = 0 - for i in t: - if i != 0: - token = i - break - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - 
char_dict[token])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[token])) - begin = begin + duration - return labformat - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='use ctc to generate alignment') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--input_file', required=True, help='format data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--non_lang_syms', - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--result_file', - required=True, - help='alignment result file') - parser.add_argument('--batch_size', type=int, default=1, help='batch size') - parser.add_argument('--gen_praat', - action='store_true', - help='convert alignment to a praat format') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - - args = parser.parse_args() - print(args) - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.batch_size > 1: - logging.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - # Load dict - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 - - symbol_table = read_symbol_table(args.dict) - - # Init dataset and data loader - ali_conf = copy.deepcopy(configs['dataset_conf']) - - ali_conf['filter_conf']['max_length'] = 102400 - ali_conf['filter_conf']['min_length'] = 0 - ali_conf['filter_conf']['token_max_length'] = 102400 - ali_conf['filter_conf']['token_min_length'] = 0 - ali_conf['filter_conf']['max_output_input_ratio'] = 102400 - ali_conf['filter_conf']['min_output_input_ratio'] = 0 - ali_conf['speed_perturb'] = False - ali_conf['spec_aug'] = False - ali_conf['shuffle'] = False - ali_conf['sort'] = False - ali_conf['fbank_conf']['dither'] = 0.0 - ali_conf['batch_conf']['batch_type'] = "static" - ali_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - ali_dataset = Dataset(args.data_type, - args.input_file, - symbol_table, - ali_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w', - encoding='utf-8') as fout: - for batch_idx, batch in enumerate(ali_data_loader): - print("#" * 80) - key, feat, target, feats_length, target_length = batch - print(key) - - feat = feat.to(device) - target = target.to(device) - feats_length = feats_length.to(device) - target_length = target_length.to(device) - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - # print(ctc_probs.size(1)) - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = forced_align(ctc_probs, target) - print(alignment) - fout.write('{} {}\n'.format(key[0], alignment)) - - if args.gen_praat: - timestamp = get_frames_timestamp(alignment) - print(timestamp) - subsample = get_subsample(configs) - labformat = get_labformat(timestamp, subsample) - - lab_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".lab") - with open(lab_path, 'w', encoding='utf-8') as f: - f.writelines(labformat) - - textgrid_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".TextGrid") - generator_textgrid(maxtime=(len(alignment) + 1) * 0.01 * - subsample, - lines=labformat, - output=textgrid_path) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/average_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/average_model.py deleted file mode 100644 index 01efa64b4b458bc931a86a9a304b9f330ce4aaa2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/average_model.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
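get_frames_timestamp and get_labformat above turn a frame-level CTC alignment into start/end times by assuming a 10 ms frame hop scaled by the encoder subsampling factor. The core arithmetic, as a small sketch (token grouping simplified, names illustrative):

```python
def frames_to_segments(timestamp, char_dict, subsample, hop_s=0.01):
    """timestamp: list of frame groups, one group per emitted token
    (as produced by get_frames_timestamp); each frame covers
    hop_s * subsample seconds of audio."""
    segments, begin = [], 0.0
    for frames in timestamp:
        duration = len(frames) * hop_s * subsample
        # the last non-blank id in the group is the emitted token
        token = next((i for i in reversed(frames) if i != 0), 0)
        segments.append((begin, begin + duration, char_dict[token]))
        begin += duration
    return segments
```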
- - -import os -import argparse -import glob - -import yaml -import numpy as np -import torch - - -def get_args(): - parser = argparse.ArgumentParser(description='average model') - parser.add_argument('--dst_model', required=True, help='averaged model') - parser.add_argument('--src_path', - required=True, - help='src model path for average') - parser.add_argument('--val_best', - action="store_true", - help='averaged model') - parser.add_argument('--num', - default=5, - type=int, - help='nums for averaged model') - parser.add_argument('--min_epoch', - default=0, - type=int, - help='min epoch used for averaging model') - parser.add_argument('--max_epoch', - default=65536, - type=int, - help='max epoch used for averaging model') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - checkpoints = [] - val_scores = [] - if args.val_best: - yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) - for y in yamls: - with open(y, 'r') as f: - dic_yaml = yaml.load(f, Loader=yaml.FullLoader) - loss = dic_yaml['cv_loss'] - epoch = dic_yaml['epoch'] - if epoch >= args.min_epoch and epoch <= args.max_epoch: - val_scores += [[epoch, loss]] - val_scores = np.array(val_scores) - sort_idx = np.argsort(val_scores[:, -1]) - sorted_val_scores = val_scores[sort_idx][::1] - print("best val scores = " + str(sorted_val_scores[:args.num, 1])) - print("selected epochs = " + - str(sorted_val_scores[:args.num, 0].astype(np.int64))) - path_list = [ - args.src_path + '/{}.pt'.format(int(epoch)) - for epoch in sorted_val_scores[:args.num, 0] - ] - else: - path_list = glob.glob('{}/[0-9]*.pt'.format(args.src_path)) - path_list = sorted(path_list, key=os.path.getmtime) - path_list = path_list[-args.num:] - print(path_list) - avg = None - num = args.num - assert num == len(path_list) - for path in path_list: - print('Processing {}'.format(path)) - states = torch.load(path, map_location=torch.device('cpu')) - if avg is None: - avg = states - else: - for k in avg.keys(): - avg[k] += states[k] - # average - for k in avg.keys(): - if avg[k] is not None: - # pytorch 1.6 use true_divide instead of /= - avg[k] = torch.true_divide(avg[k], num) - print('Saving to {}'.format(args.dst_model)) - torch.save(avg, args.dst_model) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/export_jit.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/export_jit.py deleted file mode 100644 index b2e5864e8382235c1cc800484ba5031ae22f3bd9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/export_jit.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
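average_model.py above selects the num best checkpoints by cv_loss (or the most recent ones) and averages their parameters elementwise, using torch.true_divide as the script's comment notes for PyTorch >= 1.6. The averaging step itself reduces to the following minimal sketch (the checkpoint paths in the usage line are assumptions):

```python
import torch

def average_checkpoints(paths):
    """Elementwise average of the state dicts stored at `paths`."""
    avg = None
    for path in paths:
        states = torch.load(path, map_location="cpu")
        if avg is None:
            avg = states
        else:
            for k in avg:
                avg[k] += states[k]
    for k in avg:
        # true_divide mirrors the original script's averaging behaviour
        avg[k] = torch.true_divide(avg[k], len(paths))
    return avg

# e.g. torch.save(average_checkpoints(["exp/30.pt", "exp/31.pt", "exp/32.pt"]), "exp/avg_3.pt")
```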
- -from __future__ import print_function - -import argparse -import os - -import torch -import yaml - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_file', default=None, help='output file') - parser.add_argument('--output_quant_file', - default=None, - help='output quantized model file') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - # No need gpu for model export - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - model = init_model(configs) - print(model) - - load_checkpoint(model, args.checkpoint) - # Export jit torch script model - - if args.output_file: - script_model = torch.jit.script(model) - script_model.save(args.output_file) - print('Export model successfully, see {}'.format(args.output_file)) - - # Export quantized jit torch script model - if args.output_quant_file: - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - script_quant_model = torch.jit.script(quantized_model) - script_quant_model.save(args.output_quant_file) - print('Export quantized model successfully, ' - 'see {}'.format(args.output_quant_file)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/export_onnx_bpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/export_onnx_bpu.py deleted file mode 100644 index 6462a69506f10778d08faae5fcf3067ad43d38bd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/export_onnx_bpu.py +++ /dev/null @@ -1,1019 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. 
specific decoding method: ctc_greedy_search -""" - - -from __future__ import print_function - -import os -import sys -import copy -import math -import yaml -import logging -from typing import Tuple - -import torch -import numpy as np - -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import (get_args, to_numpy, - print_input_output_info) - - -try: - import onnx - import onnxruntime -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class BPULayerNorm(torch.nn.Module): - """Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" - def __init__(self, module, chunk_size=8, run_on_bpu=False): - super().__init__() - original = copy.deepcopy(module) - self.hidden = module.weight.size(0) - self.chunk_size = chunk_size - self.run_on_bpu = run_on_bpu - - if self.run_on_bpu: - self.weight = torch.nn.Parameter( - module.weight.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.bias = torch.nn.Parameter( - module.bias.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.negtive = torch.nn.Parameter( - torch.ones((1, self.hidden, 1, chunk_size)) * -1.0) - self.eps = torch.nn.Parameter( - torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps) - self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_1.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_2.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - else: - self.norm = module - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.hidden) - orig_out = module(random_data) - new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) - np.testing.assert_allclose( - to_numpy(orig_out), to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.run_on_bpu: - u = self.mean_conv_1(x) # (1, h, 1, c) - numerator = x + u * self.negtive # (1, h, 1, c) - s = torch.pow(numerator, 2) # (1, h, 1, c) - s = self.mean_conv_2(s) # (1, h, 1, c) - denominator = torch.sqrt(s + self.eps) # (1, h, 1, c) - x = torch.div(numerator, denominator) # (1, h, 1, c) - x = x * self.weight + self.bias - else: - x = x.squeeze(2).transpose(1, 2).contiguous() - x = self.norm(x) - x = x.transpose(1, 2).contiguous().unsqueeze(2) - return x - - -class BPUIdentity(torch.nn.Module): - """Refactor torch.nn.Identity(). - For inserting BPU node whose input == output. - """ - def __init__(self, channels): - super().__init__() - self.channels = channels - self.identity_conv = torch.nn.Conv2d( - channels, channels, 1, groups=channels, bias=False) - torch.nn.init.dirac_( - self.identity_conv.weight.data, groups=channels) - - self.check_equal() - - def check_equal(self): - random_data = torch.randn(1, self.channels, 1, 10) - result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(random_data), to_numpy(result), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Identity with 4-D dataflow, input == output. 
- Args: - x (torch.Tensor): (batch, in_channel, 1, time) - - Returns: - (torch.Tensor): (batch, in_channel, 1, time). - """ - return self.identity_conv(x) - - -class BPULinear(torch.nn.Module): - """Refactor torch.nn.Linear or pointwise_conv""" - def __init__(self, module, is_pointwise_conv=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.weight.size(1) - self.odim = module.weight.size(0) - self.is_pointwise_conv = is_pointwise_conv - - # Modify weight & bias - self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) - if is_pointwise_conv: - # (odim, idim, kernel=1) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(-1)) - else: - # (odim, idim) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(2).unsqueeze(3)) - self.linear.bias = module.bias - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.idim) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = module(random_data) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = original_result.transpose(1, 2) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Linear with 4-D dataflow. - Args: - x (torch.Tensor): (batch, in_channel, 1, time) - Returns: - (torch.Tensor): (batch, out_channel, 1, time). - """ - return self.linear(x) - - -class BPUGlobalCMVN(torch.nn.Module): - """Refactor wenet/transformer/cmvn.py::GlobalCMVN""" - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - self.norm_var = module.norm_var - - # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1) - self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """CMVN with 4-D dataflow. - Args: - x (torch.Tensor): (batch, 1, mel_dim, time) - Returns: - (torch.Tensor): normalized feature with same shape. - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x - - -class BPUConv2dSubsampling8(torch.nn.Module): - """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 - - NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.right_context = module.right_context - self.subsampling_rate = module.subsampling_rate - assert isinstance(module.pos_enc, NoPositionalEncoding) - - # 1. Modify self.conv - # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim) - # to (1, 1, mel_dim, frames) for more efficient computation. - self.conv = module.conv - for idx in [0, 2, 4]: - self.conv[idx].weight = torch.nn.Parameter( - module.conv[idx].weight.transpose(2, 3) - ) - - # 2. 
Modify self.linear - # NOTE(xcsong): Split final projection to meet the requirment of - # maximum kernel_size (7 for XJ3) - self.linear = torch.nn.ModuleList() - odim = module.linear.weight.size(0) # 512, in this case - freq = module.linear.weight.size(1) // odim # 4608 // 512 == 9 - self.odim, self.freq = odim, freq - weight = module.linear.weight.reshape( - odim, odim, freq, 1) # (odim, odim * freq) -> (odim, odim, freq, 1) - self.split_size = [] - num_split = (freq - 1) // 7 + 1 # XJ3 requires kernel_size <= 7 - slice_begin = 0 - for idx in range(num_split): - kernel_size = min(freq, (idx + 1) * 7) - idx * 7 - conv_ele = torch.nn.Conv2d( - odim, odim, (kernel_size, 1), (kernel_size, 1)) - conv_ele.weight = torch.nn.Parameter( - weight[:, :, slice_begin:slice_begin + kernel_size, :] - ) - conv_ele.bias = torch.nn.Parameter( - torch.zeros_like(conv_ele.bias) - ) - self.linear.append(conv_ele) - self.split_size.append(kernel_size) - slice_begin += kernel_size - self.linear[0].bias = torch.nn.Parameter(module.linear.bias) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 67, 80) - mask = torch.zeros(1, 1, 67) - original_result, _, _ = module(random_data, mask) # (1, 8, 512) - random_data = random_data.transpose(1, 2).unsqueeze(0) # (1, 1, 80, 67) - new_result = self.forward(random_data) # (1, 512, 1, 8) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Subsample x with 4-D dataflow. - Args: - x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), - where time' = time // 8. - """ - x = self.conv(x) # (1, odim, freq, time') - x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) - x = torch.split(x, self.split_size, dim=2) - for idx, (x_part, layer) in enumerate(zip(x, self.linear)): - x_out += layer(x_part) - return x_out - - -class BPUMultiHeadedAttention(torch.nn.Module): - """Refactor wenet/transformer/attention.py::MultiHeadedAttention - - NOTE(xcsong): Only support attention_class == MultiHeadedAttention, - we do not consider RelPositionMultiHeadedAttention currently. - """ - def __init__(self, module, chunk_size, left_chunks): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.d_k = module.d_k - self.h = module.h - n_feat = self.d_k * self.h - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.time = chunk_size * (left_chunks + 1) - self.activation = torch.nn.Softmax(dim=-1) - - # 1. Modify self.linear_x - self.linear_q = BPULinear(module.linear_q) - self.linear_k = BPULinear(module.linear_k) - self.linear_v = BPULinear(module.linear_v) - self.linear_out = BPULinear(module.linear_out) - # 2. 
denom - self.register_buffer( - "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k))) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) - mask = torch.ones((1, self.h, self.chunk_size, self.time), - dtype=torch.bool) - cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, - self.d_k * 2) - original_out, original_cache = module( - random_data, random_data, random_data, - mask[:, 0, :, :], torch.empty(0), cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.reshape(1, self.h, self.d_k * 2, - self.chunk_size * self.left_chunks) - new_out, new_cache = self.forward( - random_data, random_data, random_data, mask, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - - def forward( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - mask: torch.Tensor, cache: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. - - Args: - q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). - k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). - v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). - mask (torch.Tensor): Mask tensor, - (#batch, head, chunk_size, cache_t + chunk_size). - cache (torch.Tensor): Cache tensor - (1, head, d_k * 2, cache_t), - where `cache_t == chunk_size * left_chunks`. - - - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: Cache tensor - (1, head, d_k * 2, cache_t + chunk_size) - where `cache_t == chunk_size * left_chunks` - """ - # 1. Forward QKV - q = self.linear_q(q) # (1, d, 1, c) d == size, c == chunk_size - k = self.linear_k(k) # (1, d, 1, c) - v = self.linear_v(v) # (1, d, 1, c) - q = q.view(1, self.h, self.d_k, self.chunk_size) - k = k.view(1, self.h, self.d_k, self.chunk_size) - v = v.view(1, self.h, self.d_k, self.chunk_size) - q = q.transpose(2, 3) # (batch, head, time1, d_k) - k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) - k = torch.cat((k_cache, k), dim=3) - v = torch.cat((v_cache, v), dim=3) - new_cache = torch.cat((k, v), dim=2) - # 2. (Q^T)K - scores = torch.matmul(q, k) * self.denom # (#b, n_head, time1, time2) - # 3. Forward attention - mask = mask.eq(0) - scores = scores.masked_fill(mask, -float('inf')) - attn = self.activation(scores).masked_fill(mask, 0.0) - attn = attn.transpose(2, 3) - x = torch.matmul(v, attn) - x = x.view(1, self.d_k * self.h, 1, self.chunk_size) - x_out = self.linear_out(x) - return x_out, new_cache - - -class BPUConvolution(torch.nn.Module): - """Refactor wenet/transformer/convolution.py::ConvolutionModule - - NOTE(xcsong): Only suport use_layer_norm == False - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.lorder = module.lorder - self.use_layer_norm = False - self.activation = module.activation - channels = module.pointwise_conv1.weight.size(1) - self.channels = channels - kernel_size = module.depthwise_conv.weight.size(2) - assert module.use_layer_norm is False - - # 1. Modify self.pointwise_conv1 - self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) - - # 2. 
Modify self.depthwise_conv - self.depthwise_conv = torch.nn.Conv2d( - channels, channels, (1, kernel_size), - stride=1, groups=channels) - self.depthwise_conv.weight = torch.nn.Parameter( - module.depthwise_conv.weight.unsqueeze(-2)) - self.depthwise_conv.bias = torch.nn.Parameter( - module.depthwise_conv.bias) - - # 3. Modify self.norm, Only support batchnorm2d - self.norm = torch.nn.BatchNorm2d(channels) - self.norm.training = False - self.norm.num_features = module.norm.num_features - self.norm.eps = module.norm.eps - self.norm.momentum = module.norm.momentum - self.norm.weight = torch.nn.Parameter(module.norm.weight) - self.norm.bias = torch.nn.Parameter(module.norm.bias) - self.norm.running_mean = module.norm.running_mean - self.norm.running_var = module.norm.running_var - - # 4. Modify self.pointwise_conv2 - self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) - - # 5. Identity conv, for running `concat` on BPU - self.identity = BPUIdentity(channels) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.channels) - cache = torch.zeros((1, self.channels, self.lorder)) - original_out, original_cache = module(random_data, cache=cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.unsqueeze(2) - new_out, new_cache = self.forward(random_data, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, 1, cache_t). - Returns: - torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). - torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). - """ - # Concat cache - x = torch.cat((self.identity(cache), self.identity(x)), dim=3) - new_cache = x[:, :, :, -self.lorder:] - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, 1, dim) - x = torch.nn.functional.glu(x, dim=1) # (b, channel, 1, dim) - - # Depthwise Conv - x = self.depthwise_conv(x) - x = self.activation(self.norm(x)) - x = self.pointwise_conv2(x) - return x, new_cache - - -class BPUFFN(torch.nn.Module): - """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.activation = module.activation - - # 1. Modify self.w_x - self.w_1 = BPULinear(module.w_1) - self.w_2 = BPULinear(module.w_2) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.w_1.idim) - original_out = module(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_out = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward function. 
- - Args: - xs: input tensor (B, D, 1, L) - Returns: - output tensor, (B, D, 1, L) - """ - return self.w_2(self.activation(self.w_1(x))) - - -class BPUConformerEncoderLayer(torch.nn.Module): - """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.size = module.size - assert module.normalize_before is True - assert module.concat_after is False - - # 1. Modify submodules - self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) - self.self_attn = BPUMultiHeadedAttention( - module.self_attn, chunk_size, left_chunks) - self.conv_module = BPUConvolution(module.conv_module) - self.feed_forward = BPUFFN(module.feed_forward) - - # 2. Modify norms - self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) - self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, ln_run_on_bpu) - self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, - chunk_size, ln_run_on_bpu) - self.norm_conv = BPULayerNorm(module.norm_conv, - chunk_size, ln_run_on_bpu) - self.norm_final = BPULayerNorm(module.norm_final, - chunk_size, ln_run_on_bpu) - - # 3. 4-D ff_scale - self.register_buffer( - "ff_scale", torch.full((1, self.size, 1, 1), module.ff_scale)) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.self_attn.chunk_size - time2 = self.self_attn.time - h, d_k = self.self_attn.h, self.self_attn.d_k - random_x = torch.randn(1, time1, self.size) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) - original_x, _, original_att_cache, original_cnn_cache = module( - random_x, att_mask[:, 0, :, :], torch.empty(0), - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.transpose(1, 2).unsqueeze(2) - att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.unsqueeze(2) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_mask, att_cache, cnn_cache - ) - np.testing.assert_allclose( - to_numpy(original_att_cache), - to_numpy(new_att_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cnn_cache), - to_numpy(new_cnn_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, att_mask: torch.Tensor, - att_cache: torch.Tensor, cnn_cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, size, 1, chunk_size) - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, d_k * 2, cache_t1), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, 1, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: att_cache tensor, - (1, head, d_k * 2, cache_t1 + chunk_size). - torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2). - """ - # 1. ffn_macaron - residual = x - x = self.norm_ff_macron(x) - x = residual + self.ff_scale * self.feed_forward_macaron(x) - - # 2. 
attention - residual = x - x = self.norm_mha(x) - x_att, new_att_cache = self.self_attn( - x, x, x, att_mask, att_cache) - x = residual + x_att - - # 3. convolution - residual = x - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, cnn_cache) - x = residual + x - - # 4. ffn - residual = x - x = self.norm_ff(x) - x = residual + self.ff_scale * self.feed_forward(x) - - # 5. final post-norm - x = self.norm_final(x) - - return x, new_att_cache, new_cnn_cache - - -class BPUConformerEncoder(torch.nn.Module): - """Refactor wenet/transformer/encoder.py::ConformerEncoder - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - output_size = module.output_size() - self._output_size = module.output_size() - self.after_norm = module.after_norm - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.head = module.encoders[0].self_attn.h - self.layers = len(module.encoders) - - # 1. Modify submodules - self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) - self.embed = BPUConv2dSubsampling8(module.embed) - self.encoders = torch.nn.ModuleList() - for layer in module.encoders: - self.encoders.append(BPUConformerEncoderLayer( - layer, chunk_size, left_chunks, ln_run_on_bpu)) - - # 2. Auxiliary conv - self.identity_cnncache = BPUIdentity(output_size) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.encoders[0].self_attn.chunk_size - time2 = self.encoders[0].self_attn.time - layers = self.layers - h, d_k = self.head, self.encoders[0].self_attn.d_k - decoding_window = (self.chunk_size - 1) * \ - module.embed.subsampling_rate + \ - module.embed.right_context + 1 - lorder = self.encoders[0].conv_module.lorder - random_x = torch.randn(1, decoding_window, 80) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) - orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( - random_x, 0, time2 - time1, att_mask=att_mask[:, 0, :, :], - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.unsqueeze(0) - att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_cache, cnn_cache, att_mask - ) - caches = torch.split(new_att_cache, h, dim=1) - caches = [c.transpose(2, 3) for c in caches] - np.testing.assert_allclose( - to_numpy(orig_att_cache), - to_numpy(torch.cat(caches, dim=0)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_cnn_cache), - to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, xs: torch.Tensor, att_cache: torch.Tensor, - cnn_cache: torch.Tensor, att_mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (1, head * elayers, d_k * 2, cache_t1), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (1, hidden-dim, elayers, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, hidden-dim, 1, chunk_size). - torch.Tensor: new attention cache required for next chunk, with - same shape as the original att_cache. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - """ - # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time) - xs = xs.transpose(2, 3) - xs = self.global_cmvn(xs) - # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size) - xs = self.embed(xs) - - att_cache = torch.split(att_cache, self.head, dim=1) - cnn_cache = self.identity_cnncache(cnn_cache) - cnn_cache = torch.split(cnn_cache, 1, dim=2) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - xs, new_att_cache, new_cnn_cache = layer( - xs, att_mask, att_cache=att_cache[i], cnn_cache=cnn_cache[i]) - r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:]) - r_cnn_cache.append(new_cnn_cache) - r_att_cache = torch.cat(r_att_cache, dim=1) - r_cnn_cache = self.identity_cnncache( - torch.cat(r_cnn_cache, dim=2)) - - xs = xs.squeeze(2).transpose(1, 2).contiguous() - xs = self.after_norm(xs) - # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input. - xs = xs.transpose(1, 2).contiguous().unsqueeze(2) # (B, C, 1, T) - - return (xs, r_att_cache, r_cnn_cache) - - -class BPUCTC(torch.nn.Module): - """Refactor wenet/transformer/ctc.py::CTC - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.ctc_lo.weight.size(1) - num_class = module.ctc_lo.weight.size(0) - - # 1. Modify self.ctc_lo, Split final projection to meet the - # requirment of maximum in/out channels (2048 for XJ3) - self.ctc_lo = torch.nn.ModuleList() - self.split_size = [] - num_split = (num_class - 1) // 2048 + 1 - for idx in range(num_split): - out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 - conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) - self.ctc_lo.append(conv_ele) - self.split_size.append(out_channel) - orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) - orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) - for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): - w = w.unsqueeze(2).unsqueeze(3) - self.ctc_lo[i].weight = torch.nn.Parameter(w) - self.ctc_lo[i].bias = torch.nn.Parameter(b) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 100, self.idim) - original_result = module.ctc_lo(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """frame activations, without softmax. 
- - Args: - Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) - Returns: - torch.Tensor: (B, num_class, 1, chunk_size) - """ - out = [] - for i, layer in enumerate(self.ctc_lo): - out.append(layer(x)) - out = torch.cat(out, dim=1) - return out - - -def export_encoder(asr_model, args): - logger.info("Stage-1: export encoder") - decode_window, mel_dim = args.decoding_window, args.feature_size - encoder = BPUConformerEncoder( - asr_model.encoder, args.chunk_size, args.num_decoding_left_chunks, - args.ln_run_on_bpu) - encoder.eval() - encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx') - - logger.info("Stage-1.1: prepare inputs for encoder") - chunk = torch.randn((1, 1, decode_window, mel_dim)) - required_cache_size = encoder.chunk_size * encoder.left_chunks - kv_time = required_cache_size + encoder.chunk_size - hidden, layers = encoder._output_size, len(encoder.encoders) - head = encoder.encoders[0].self_attn.h - d_k = hidden // head - lorder = encoder.encoders[0].conv_module.lorder - att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) - att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros((1, hidden, layers, lorder)) - inputs = (chunk, att_cache, cnn_cache, att_mask) - logger.info("chunk.size(): {} att_cache.size(): {} " - "cnn_cache.size(): {} att_mask.size(): {}".format( - list(chunk.size()), list(att_cache.size()), - list(cnn_cache.size()), list(att_mask.size()))) - - logger.info("Stage-1.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask" - attributes['output_name'] = "output;r_att_cache;r_cnn_cache" - attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap" - attributes['norm_type'] = \ - "no_preprocess;no_preprocess;no_preprocess;no_preprocess" - attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_shape'] = \ - "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( - chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3), - att_cache.size(0), att_cache.size(1), att_cache.size(2), - att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1), - cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0), - att_mask.size(1), att_mask.size(2), att_mask.size(3) - ) - torch.onnx.export( # NOTE(xcsong): only support opset==11 - encoder, inputs, encoder_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=attributes['input_name'].split(';'), - output_names=attributes['output_name'].split(';'), - dynamic_axes=None, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for k in vars(args): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - logger.info('Export onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - logger.info("Stage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - for i in range(10): - logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" - ", att_mask: {}".format( - i, list(torch_chunk.size()), - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), - list(torch_att_mask.size()))) - torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_output = torch.cat(torch_output, dim=-1) - - onnx_output = [] - onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - logger.info("onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," - " att_mask: {}".format( - i, onnx_chunk.shape, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': onnx_att_mask, - } - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_output = np.concatenate(onnx_output, axis=-1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_encoder, pass!") - return encoder, ort_session - - -def export_ctc(asr_model, args): - logger.info("Stage-2: export ctc") - ctc = BPUCTC(asr_model.ctc).eval() - ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx') - - logger.info("Stage-2.1: prepare inputs for ctc") - hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) - - logger.info("Stage-2.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'], attributes['input_type'] = "hidden", "featuremap" - attributes['norm_type'] = "no_preprocess" - attributes['input_layout_train'] = "NCHW" - attributes['input_layout_rt'] = "NCHW" - attributes['input_shape'] = "{}x{}x{}x{}".format( - hidden.size(0), hidden.size(1), hidden.size(2), hidden.size(3), - ) - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=None, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for k in vars(args): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - logger.info('Export onnx_ctc, done! 
see {}'.format(ctc_outpath))
-
-    logger.info("Stage-2.3: check onnx_ctc and torch_ctc")
-    torch_output = ctc(hidden)
-    ort_session = onnxruntime.InferenceSession(ctc_outpath)
-    onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)})
-
-    np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0],
-                               rtol=1e-03, atol=1e-04)
-    meta = ort_session.get_modelmeta()
-    logger.info("custom_metadata_map={}".format(meta.custom_metadata_map))
-    logger.info("Check onnx_ctc, pass!")
-    return ctc, ort_session
-
-
-def export_decoder(asr_model, args):
-    logger.info("Currently, Decoder is not supported.")
-
-
-if __name__ == '__main__':
-    torch.manual_seed(777)
-    args = get_args()
-    args.ln_run_on_bpu = False
-    # NOTE(xcsong): XJ3 BPU only support static shapes
-    assert args.chunk_size > 0
-    assert args.num_decoding_left_chunks > 0
-    os.system("mkdir -p " + args.output_dir)
-    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
-
-    with open(args.config, 'r') as fin:
-        configs = yaml.load(fin, Loader=yaml.FullLoader)
-
-    model = init_model(configs)
-    load_checkpoint(model, args.checkpoint)
-    model.eval()
-    print(model)
-
-    args.feature_size = configs['input_dim']
-    args.output_size = model.encoder.output_size()
-    args.decoding_window = (args.chunk_size - 1) * \
-        model.encoder.embed.subsampling_rate + \
-        model.encoder.embed.right_context + 1
-
-    export_encoder(model, args)
-    export_ctc(model, args)
-    export_decoder(model, args)
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/export_onnx_cpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/export_onnx_cpu.py
deleted file mode 100644
index a8009d2f606f753a5870eb754235d8d55e756b5d..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/export_onnx_cpu.py
+++ /dev/null
@@ -1,411 +0,0 @@
-# Copyright (c) 2022, Xingchen Song (sxc19@mails.tsinghua.edu.cn)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -from __future__ import print_function - -import argparse -import os -import copy -import sys - -import torch -import yaml -import numpy as np - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - -try: - import onnx - import onnxruntime - from onnxruntime.quantization import quantize_dynamic, QuantType -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - args = parser.parse_args() - return args - - -def to_numpy(tensor): - if tensor.requires_grad: - return tensor.detach().cpu().numpy() - else: - return tensor.cpu().numpy() - - -def print_input_output_info(onnx_model, name, prefix="\t\t"): - input_names = [node.name for node in onnx_model.graph.input] - input_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.input] - output_names = [node.name for node in onnx_model.graph.output] - output_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.output] - print("{}{} inputs : {}".format(prefix, name, input_names)) - print("{}{} input shapes : {}".format(prefix, name, input_shapes)) - print("{}{} outputs: {}".format(prefix, name, output_names)) - print("{}{} output shapes : {}".format(prefix, name, output_shapes)) - - -def export_encoder(asr_model, args): - print("Stage-1: export encoder") - encoder = asr_model.encoder - encoder.forward = encoder.forward_chunk - encoder_outpath = os.path.join(args['output_dir'], 'encoder.onnx') - - print("\tStage-1.1: prepare inputs for encoder") - chunk = torch.randn( - (args['batch'], args['decoding_window'], args['feature_size'])) - offset = 0 - # NOTE(xcsong): The uncertainty of `next_cache_start` only appears - # in the first few chunks, this is caused by dynamic att_cache shape, i,e - # (0, 0, 0, 0) for 1st chunk and (elayers, head, ?, d_k*2) for subsequent - # chunks. One way to ease the ONNX export is to keep `next_cache_start` - # as a fixed value. To do this, for the **first** chunk, if - # left_chunks > 0, we feed real cache & real mask to the model, otherwise - # fake cache & fake mask. In this way, we get: - # 1. 16/-1 mode: next_cache_start == 0 for all chunks - # 2. 16/4 mode: next_cache_start == chunk_size for all chunks - # 3. 16/0 mode: next_cache_start == chunk_size for all chunks - # 4. -1/-1 mode: next_cache_start == 0 for all chunks - # NO MORE DYNAMIC CHANGES!! - # - # NOTE(Mddct): We retain the current design for the convenience of supporting some - # inference frameworks without dynamic shapes. 
If you're interested in all-in-one - # model that supports different chunks please see: - # https://github.com/wenet-e2e/wenet/pull/1174 - - if args['left_chunks'] > 0: # 16/4 - required_cache_size = args['chunk_size'] * args['left_chunks'] - offset = required_cache_size - # Real cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], required_cache_size, - args['output_size'] // args['head'] * 2)) - # Real mask - att_mask = torch.ones( - (args['batch'], 1, required_cache_size + args['chunk_size']), - dtype=torch.bool) - att_mask[:, :, :required_cache_size] = 0 - elif args['left_chunks'] <= 0: # 16/-1, -1/-1, 16/0 - required_cache_size = -1 if args['left_chunks'] < 0 else 0 - # Fake cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], 0, - args['output_size'] // args['head'] * 2)) - # Fake mask - att_mask = torch.ones((0, 0, 0), dtype=torch.bool) - cnn_cache = torch.zeros( - (args['num_blocks'], args['batch'], - args['output_size'], args['cnn_module_kernel'] - 1)) - inputs = (chunk, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - print("\t\tchunk.size(): {}\n".format(chunk.size()), - "\t\toffset: {}\n".format(offset), - "\t\trequired_cache: {}\n".format(required_cache_size), - "\t\tatt_cache.size(): {}\n".format(att_cache.size()), - "\t\tcnn_cache.size(): {}\n".format(cnn_cache.size()), - "\t\tatt_mask.size(): {}\n".format(att_mask.size())) - - print("\tStage-1.2: torch.onnx.export") - dynamic_axes = { - 'chunk': {1: 'T'}, - 'att_cache': {2: 'T_CACHE'}, - 'att_mask': {2: 'T_ADD_T_CACHE'}, - 'output': {1: 'T'}, - 'r_att_cache': {2: 'T_CACHE'}, - } - # NOTE(xcsong): We keep dynamic axes even if in 16/4 mode, this is - # to avoid padding the last chunk (which usually contains less - # frames than required). For users who want static axes, just pop - # out specific axis. - # if args['chunk_size'] > 0: # 16/4, 16/-1, 16/0 - # dynamic_axes.pop('chunk') - # dynamic_axes.pop('output') - # if args['left_chunks'] >= 0: # 16/4, 16/0 - # # NOTE(xsong): since we feed real cache & real mask into the - # # model when left_chunks > 0, the shape of cache will never - # # be changed. - # dynamic_axes.pop('att_cache') - # dynamic_axes.pop('r_att_cache') - torch.onnx.export( - encoder, inputs, encoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=[ - 'chunk', 'offset', 'required_cache_size', - 'att_cache', 'cnn_cache', 'att_mask' - ], - output_names=['output', 'r_att_cache', 'r_cnn_cache'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for (k, v) in args.items(): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - # NOTE(xcsong): to add those metadatas we need to reopen - # the file and resave it. - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - # Dynamic quantization - model_fp32 = encoder_outpath - model_quant = os.path.join(args['output_dir'], 'encoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - print("\tStage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk = copy.deepcopy(chunk) - torch_offset = copy.deepcopy(offset) - torch_required_cache_size = copy.deepcopy(required_cache_size) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - torch_att_mask = copy.deepcopy(att_mask) - for i in range(10): - print("\t\ttorch chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, list(torch_chunk.size()), torch_offset, - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), list(torch_att_mask.size()))) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - torch_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_offset, torch_required_cache_size, - torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_offset += out.size(1) - torch_output = torch.cat(torch_output, dim=1) - - onnx_output = [] - onnx_chunk = to_numpy(chunk) - onnx_offset = np.array((offset)).astype(np.int64) - onnx_required_cache_size = np.array((required_cache_size)).astype(np.int64) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - onnx_att_mask = to_numpy(att_mask) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - print("\t\tonnx chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, onnx_chunk.shape, onnx_offset, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - onnx_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'offset': onnx_offset, - 'required_cache_size': onnx_required_cache_size, - 'att_cache': onnx_att_cache, 'cnn_cache': onnx_cnn_cache, - 'att_mask': onnx_att_mask - } - # NOTE(xcsong): If we use 16/-1, -1/-1 or 16/0 mode, `next_cache_start` - # will be hardcoded to 0 or chunk_size by ONNX, thus - # required_cache_size and att_mask are no more needed and they will - # be removed by ONNX automatically. 
- for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_offset += ort_outs[0].shape[1] - onnx_output = np.concatenate(onnx_output, axis=1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-05) - meta = ort_session.get_modelmeta() - print("\t\tcustom_metadata_map={}".format(meta.custom_metadata_map)) - print("\t\tCheck onnx_encoder, pass!") - - -def export_ctc(asr_model, args): - print("Stage-2: export ctc") - ctc = asr_model.ctc - ctc.forward = ctc.log_softmax - ctc_outpath = os.path.join(args['output_dir'], 'ctc.onnx') - - print("\tStage-2.1: prepare inputs for ctc") - hidden = torch.randn( - (args['batch'], args['chunk_size'] if args['chunk_size'] > 0 else 16, - args['output_size'])) - - print("\tStage-2.2: torch.onnx.export") - dynamic_axes = {'hidden': {1: 'T'}, 'probs': {1: 'T'}} - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for (k, v) in args.items(): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - # Dynamic quantization - model_fp32 = ctc_outpath - model_quant = os.path.join(args['output_dir'], 'ctc.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_ctc, done! see {}'.format(ctc_outpath)) - - print("\tStage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_ctc, pass!") - - -def export_decoder(asr_model, args): - print("Stage-3: export decoder") - decoder = asr_model - # NOTE(lzhin): parameters of encoder will be automatically removed - # since they are not used during rescoring. - decoder.forward = decoder.forward_attention_decoder - decoder_outpath = os.path.join(args['output_dir'], 'decoder.onnx') - - print("\tStage-3.1: prepare inputs for decoder") - # hardcode time->200 nbest->10 len->20, they are dynamic axes. 
- encoder_out = torch.randn((1, 200, args['output_size'])) - hyps = torch.randint(low=0, high=args['vocab_size'], - size=[10, 20]) - hyps[:, 0] = args['vocab_size'] - 1 # - hyps_lens = torch.randint(low=15, high=21, size=[10]) - - print("\tStage-3.2: torch.onnx.export") - dynamic_axes = { - 'hyps': {0: 'NBEST', 1: 'L'}, 'hyps_lens': {0: 'NBEST'}, - 'encoder_out': {1: 'T'}, - 'score': {0: 'NBEST', 1: 'L'}, 'r_score': {0: 'NBEST', 1: 'L'} - } - inputs = (hyps, hyps_lens, encoder_out, args['reverse_weight']) - torch.onnx.export( - decoder, inputs, decoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hyps', 'hyps_lens', 'encoder_out', 'reverse_weight'], - output_names=['score', 'r_score'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_decoder = onnx.load(decoder_outpath) - for (k, v) in args.items(): - meta = onnx_decoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_decoder) - onnx.helper.printable_graph(onnx_decoder.graph) - onnx.save(onnx_decoder, decoder_outpath) - print_input_output_info(onnx_decoder, "onnx_decoder") - model_fp32 = decoder_outpath - model_quant = os.path.join(args['output_dir'], 'decoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_decoder, done! see {}'.format( - decoder_outpath)) - - print("\tStage-3.3: check onnx_decoder and torch_decoder") - torch_score, torch_r_score = decoder( - hyps, hyps_lens, encoder_out, args['reverse_weight']) - ort_session = onnxruntime.InferenceSession(decoder_outpath) - input_names = [node.name for node in onnx_decoder.graph.input] - ort_inputs = { - 'hyps': to_numpy(hyps), - 'hyps_lens': to_numpy(hyps_lens), - 'encoder_out': to_numpy(encoder_out), - 'reverse_weight': np.array((args['reverse_weight'])), - } - for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - onnx_output = ort_session.run(None, ort_inputs) - - np.testing.assert_allclose(to_numpy(torch_score), onnx_output[0], - rtol=1e-03, atol=1e-05) - if args['is_bidirectional_decoder'] and args['reverse_weight'] > 0.0: - np.testing.assert_allclose(to_numpy(torch_r_score), onnx_output[1], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_decoder, pass!") - - -def main(): - torch.manual_seed(777) - args = get_args() - output_dir = args.output_dir - os.system("mkdir -p " + output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - arguments = {} - arguments['output_dir'] = output_dir - arguments['batch'] = 1 - arguments['chunk_size'] = args.chunk_size - arguments['left_chunks'] = args.num_decoding_left_chunks - arguments['reverse_weight'] = args.reverse_weight - arguments['output_size'] = configs['encoder_conf']['output_size'] - arguments['num_blocks'] = configs['encoder_conf']['num_blocks'] - arguments['cnn_module_kernel'] = configs['encoder_conf'].get('cnn_module_kernel', 1) - arguments['head'] = configs['encoder_conf']['attention_heads'] - arguments['feature_size'] = configs['input_dim'] - arguments['vocab_size'] = configs['output_dim'] - # NOTE(xcsong): if chunk_size == -1, hardcode to 67 - arguments['decoding_window'] = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 if args.chunk_size > 0 else 67 - arguments['encoder'] = configs['encoder'] - 
arguments['decoder'] = configs['decoder']
-    arguments['subsampling_rate'] = model.subsampling_rate()
-    arguments['right_context'] = model.right_context()
-    arguments['sos_symbol'] = model.sos_symbol()
-    arguments['eos_symbol'] = model.eos_symbol()
-    arguments['is_bidirectional_decoder'] = 1 \
-        if model.is_bidirectional_decoder() else 0
-
-    # NOTE(xcsong): Please note that -1/-1 means non-streaming model! It is
-    # not a [16/4 16/-1 16/0] all-in-one model and it should not be used in
-    # streaming mode (i.e., setting chunk_size=16 in `decoder_main`). If you
-    # want to use 16/-1 or any other streaming mode in `decoder_main`,
-    # please export onnx in the same config.
-    if arguments['left_chunks'] > 0:
-        assert arguments['chunk_size'] > 0  # -1/4 not supported
-
-    export_encoder(model, arguments)
-    export_ctc(model, arguments)
-    export_decoder(model, arguments)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/export_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/export_onnx_gpu.py
deleted file mode 100644
index 19f810c2804efdf74ff369f780fa3102e2e389fa..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/export_onnx_gpu.py
+++ /dev/null
@@ -1,1056 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import argparse
-import os
-import sys
-
-import torch
-import yaml
-import logging
-
-import torch.nn.functional as F
-from wenet.utils.checkpoint import load_checkpoint
-from wenet.transformer.ctc import CTC
-from wenet.transformer.decoder import TransformerDecoder
-from wenet.transformer.encoder import BaseEncoder
-from wenet.utils.init_model import init_model
-from wenet.utils.mask import make_pad_mask
-
-try:
-    import onnxruntime
-except ImportError:
-    print('Please install onnxruntime-gpu!')
-    sys.exit(1)
-
-logger = logging.getLogger(__file__)
-logger.setLevel(logging.INFO)
-
-
-class Encoder(torch.nn.Module):
-    def __init__(self,
-                 encoder: BaseEncoder,
-                 ctc: CTC,
-                 beam_size: int = 10):
-        super().__init__()
-        self.encoder = encoder
-        self.ctc = ctc
-        self.beam_size = beam_size
-
-    def forward(self, speech: torch.Tensor,
-                speech_lengths: torch.Tensor,):
-        """Encoder
-        Args:
-            speech: (Batch, Length, ...)
- speech_lengths: (Batch, ) - Returns: - encoder_out: B x T x F - encoder_out_lens: B - ctc_log_probs: B x T x V - beam_log_probs: B x T x beam_size - beam_log_probs_idx: B x T x beam_size - """ - encoder_out, encoder_mask = self.encoder(speech, - speech_lengths, - -1, -1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_log_probs = self.ctc.log_softmax(encoder_out) - encoder_out_lens = encoder_out_lens.int() - beam_log_probs, beam_log_probs_idx = torch.topk( - ctc_log_probs, self.beam_size, dim=2) - return encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx - - -class StreamingEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size, transformer=False): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.transformer = transformer - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoder.encoders): - xs, _, new_att_cache, new_cnn_cache = layer( - xs, masks, pos_emb, - att_cache=att_cache[i], - cnn_cache=cnn_cache[i]) - # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (B, hidden-dim, cache_t2) - r_att_cache.append( - new_att_cache[:, :, next_cache_start:, :].unsqueeze(1)) - if not self.transformer: - r_cnn_cache.append(new_cnn_cache.unsqueeze(1)) - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - if not self.transformer: - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingSqueezeformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.reduce_idx = model.encoder.reduce_idx - self.recover_idx = model.encoder.recover_idx - if self.reduce_idx is None: - self.time_reduce = None - else: - if self.recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - 
recover_exp)) - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - elayers, cache_size = att_cache.size(0), att_cache.size(3) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = att_mask[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.encoder.preln(xs) - for i, layer in enumerate(self.encoder.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append( - (xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.encoder.time_reduction_layer( - xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - if self.encoder.pos_enc_layer_type == "rel_pos_repaired": - pos_emb = 
pos_emb[:, :xs.size(1) * 2 - 1, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.encoder.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(1) - cached_att = cached_att.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1)) - r_cnn_cache.append(cached_cnn) - - chunk_out = xs - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingEfficientConformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - - # Efficient Conformer - self.stride_layer_idx = model.encoder.stride_layer_idx - self.stride = model.encoder.stride - self.num_blocks = model.encoder.num_blocks - self.cnn_module_kernel = model.encoder.cnn_module_kernel - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - chunk_lens (torch.Tensor): - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * 
num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) # (b, ) - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - # Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2) - # Shape(cnn_cache): (elayers, b, outsize, cnn_kernel) - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = chunk_mask.to(torch.bool) - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoder.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - # We propose to double the chunk_size. 
- att_cache_trunc = xs.size(1) + \ - att_cache.size(3) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i][:, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [batch, 1, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID - - # use repeat_interleave to new_att_cache - # new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - new_att_cache = new_att_cache.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :].unsqueeze(1)) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - # shape of r_att_cache: (b, elayers, head, time2, outdim) - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - # shape of r_cnn_cache: (b, elayers, outdim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - chunk_out_lens += 1 - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class Decoder(torch.nn.Module): - def __init__(self, - decoder: TransformerDecoder, - ctc_weight: float = 0.5, - reverse_weight: float = 0.0, - beam_size: int = 10, - decoder_fastertransformer: bool = False): - super().__init__() - self.decoder = decoder - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - self.beam_size = beam_size - self.decoder_fastertransformer = decoder_fastertransformer - - def forward(self, - encoder_out: torch.Tensor, - encoder_lens: torch.Tensor, - hyps_pad_sos_eos: torch.Tensor, - hyps_lens_sos: torch.Tensor, - r_hyps_pad_sos_eos: torch.Tensor, - ctc_score: torch.Tensor): - """Encoder - Args: - encoder_out: B x T x F - encoder_lens: B - hyps_pad_sos_eos: B x beam x (T2+1), - hyps with sos & eos and padded by ignore id - hyps_lens_sos: B x beam, length for each hyp with sos - r_hyps_pad_sos_eos: B 
x beam x (T2+1), - reversed hyps with sos & eos and padded by ignore id - ctc_score: B x beam, ctc score for each hyp - Returns: - decoder_out: B x beam x T2 x V - r_decoder_out: B x beam x T2 x V - best_index: B - """ - B, T, F = encoder_out.shape - bz = self.beam_size - B2 = B * bz - encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F) - encoder_mask = ~make_pad_mask(encoder_lens, T).unsqueeze(1) - encoder_mask = encoder_mask.repeat(1, bz, 1).view(B2, 1, T) - T2 = hyps_pad_sos_eos.shape[2] - 1 - hyps_pad = hyps_pad_sos_eos.view(B2, T2 + 1) - hyps_lens = hyps_lens_sos.view(B2,) - hyps_pad_sos = hyps_pad[:, :-1].contiguous() - hyps_pad_eos = hyps_pad[:, 1:].contiguous() - - r_hyps_pad = r_hyps_pad_sos_eos.view(B2, T2 + 1) - r_hyps_pad_sos = r_hyps_pad[:, :-1].contiguous() - r_hyps_pad_eos = r_hyps_pad[:, 1:].contiguous() - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad_sos, hyps_lens, r_hyps_pad_sos, - self.reverse_weight) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - V = decoder_out.shape[-1] - decoder_out = decoder_out.view(B2, T2, V) - mask = ~make_pad_mask(hyps_lens, T2) # B2 x T2 - # mask index, remove ignore id - index = torch.unsqueeze(hyps_pad_eos * mask, 2) - score = decoder_out.gather(2, index).squeeze(2) # B2 X T2 - # mask padded part - score = score * mask - decoder_out = decoder_out.view(B, bz, T2, V) - if self.reverse_weight > 0: - r_decoder_out = torch.nn.functional.log_softmax( - r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.view(B2, T2, V) - index = torch.unsqueeze(r_hyps_pad_eos * mask, 2) - r_score = r_decoder_out.gather(2, index).squeeze(2) - r_score = r_score * mask - score = score * (1 - self.reverse_weight) + \ - self.reverse_weight * r_score - r_decoder_out = r_decoder_out.view(B, bz, T2, V) - score = torch.sum(score, axis=1) # B2 - score = torch.reshape(score, (B, bz)) + self.ctc_weight * ctc_score - best_index = torch.argmax(score, dim=1) - if self.decoder_fastertransformer: - return decoder_out, best_index - else: - return best_index - - -def to_numpy(tensors): - out = [] - if type(tensors) == torch.tensor: - tensors = [tensors] - for tensor in tensors: - if tensor.requires_grad: - tensor = tensor.detach().cpu().numpy() - else: - tensor = tensor.cpu().numpy() - out.append(tensor) - return out - - -def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True): - for a, b in zip(xlist, blist): - try: - torch.testing.assert_allclose(a, b, rtol=rtol, atol=atol) - except AssertionError as error: - if tolerate_small_mismatch: - print(error) - else: - raise - - -def export_offline_encoder(model, configs, args, logger, encoder_onnx_path): - bz = 32 - seq_len = 100 - beam_size = args.beam_size - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint( - low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - dynamic_axes={ - 'speech': {0: 'B', 1: 'T'}, - 'speech_lengths': {0: 'B'}, - 'encoder_out': {0: 'B', 1: 'T_OUT'}, - 'encoder_out_lens': {0: 'B'}, - 'ctc_log_probs': {0: 'B', 1: 'T_OUT'}, - 'beam_log_probs': {0: 'B', 1: 
'T_OUT'}, - 'beam_log_probs_idx': {0: 'B', 1: 'T_OUT'}, - }, - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - -def export_offline_encoder_static(model, configs, args, logger, encoder_onnx_path): - bz = args.batch_size - seq_len = args.seq_len - beam_size = args.beam_size - - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint(low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - import os - file_name, file_ext = os.path.splitext(encoder_onnx_path) - encoder_onnx_path = file_name + "_bs" + str(bz) + "_seq" + str(seq_len) + "_static.onnx" - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CPUExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - - -def export_online_encoder(model, configs, args, logger, encoder_onnx_path): - decoding_chunk_size = args.decoding_chunk_size - subsampling = model.encoder.embed.subsampling_rate - context = model.encoder.embed.right_context + 1 - decoding_window = (decoding_chunk_size - 1) * subsampling + context - batch_size = 32 - audio_len = decoding_window - feature_size = configs["input_dim"] - output_size = configs["encoder_conf"]["output_size"] - num_layers = configs["encoder_conf"]["num_blocks"] - # in transformer the cnn module will not be available - transformer = False - cnn_module_kernel = configs["encoder_conf"].get("cnn_module_kernel", 1) - 1 - if not cnn_module_kernel: - transformer = True - num_decoding_left_chunks = args.num_decoding_left_chunks - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if configs['encoder'] == 'squeezeformer': - encoder = StreamingSqueezeformerEncoder( - model, required_cache_size, args.beam_size) - elif configs['encoder'] == 'efficientConformer': - encoder = StreamingEfficientConformerEncoder( - model, required_cache_size, args.beam_size) - else: - encoder = StreamingEncoder( - model, required_cache_size, args.beam_size, transformer) - encoder.eval() - - # begin to export encoder - chunk_xs = 
torch.randn(batch_size, audio_len, - feature_size, dtype=torch.float32) - chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len - - offset = torch.arange(0, batch_size).unsqueeze(1) - # (elayers, b, head, cache_t1, d_k * 2) - head = configs["encoder_conf"]["attention_heads"] - d_k = configs["encoder_conf"]["output_size"] // head - att_cache = torch.randn(batch_size, num_layers, head, - required_cache_size, d_k * 2, - dtype=torch.float32) - cnn_cache = torch.randn(batch_size, num_layers, output_size, - cnn_module_kernel, dtype=torch.float32) - - cache_mask = torch.ones( - batch_size, 1, required_cache_size, dtype=torch.float32) - input_names = ['chunk_xs', 'chunk_lens', 'offset', - 'att_cache', 'cnn_cache', 'cache_mask'] - output_names = ['log_probs', 'log_probs_idx', 'chunk_out', - 'chunk_out_lens', 'r_offset', 'r_att_cache', - 'r_cnn_cache', 'r_cache_mask'] - input_tensors = (chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - output_names.pop(6) - - all_names = input_names + output_names - dynamic_axes = {} - for name in all_names: - # only the first dimension is dynamic - # all other dimension is fixed - dynamic_axes[name] = {0: 'B'} - - torch.onnx.export(encoder, - input_tensors, - encoder_onnx_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - verbose=False) - - with torch.no_grad(): - torch_outs = encoder(chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - torch_outs = list(torch_outs).pop(6) - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=["CUDAExecutionProvider"]) - ort_inputs = {} - - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - if transformer: - del ort_inputs['cnn_cache'] - ort_outs = ort_session.run(None, ort_inputs) - test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx streaming encoder succeed!") - onnx_config = { - "subsampling_rate": subsampling, - "context": context, - "decoding_chunk_size": decoding_chunk_size, - "num_decoding_left_chunks": num_decoding_left_chunks, - "beam_size": args.beam_size, - "fp16": args.fp16, - "feat_size": feature_size, - "decoding_window": decoding_window, - "cnn_module_kernel_cache": cnn_module_kernel - } - return onnx_config - - -def export_rescoring_decoder(model, configs, args, - logger, decoder_onnx_path, decoder_fastertransformer): - bz, seq_len = 32, 100 - beam_size = args.beam_size - decoder = Decoder(model.decoder, - model.ctc_weight, - model.reverse_weight, - beam_size, - decoder_fastertransformer) - decoder.eval() - - hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - hyps_lens_sos = torch.randint(low=3, high=seq_len, size=(bz, beam_size), - dtype=torch.int32) - r_hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - - output_size = configs["encoder_conf"]["output_size"] - encoder_out = torch.randn(bz, seq_len, output_size, dtype=torch.float32) - encoder_out_lens = torch.randint( - low=3, high=seq_len, size=(bz,), dtype=torch.int32) - ctc_score = torch.randn(bz, beam_size, dtype=torch.float32) - - input_names = ['encoder_out', 'encoder_out_lens', - 'hyps_pad_sos_eos', 'hyps_lens_sos', - 'r_hyps_pad_sos_eos', 'ctc_score'] - output_names = ['best_index'] - if decoder_fastertransformer: - output_names.insert(0, 'decoder_out') - - 
torch.onnx.export(decoder, - (encoder_out, encoder_out_lens, - hyps_pad_sos_eos, hyps_lens_sos, - r_hyps_pad_sos_eos, ctc_score), - decoder_onnx_path, - export_params=True, - opset_version=13, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes={'encoder_out': {0: 'B', 1: 'T'}, - 'encoder_out_lens': {0: 'B'}, - 'hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'hyps_lens_sos': {0: 'B'}, - 'r_hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'ctc_score': {0: 'B'}, - 'best_index': {0: 'B'}, - }, - verbose=False - ) - with torch.no_grad(): - o0 = decoder(encoder_out, - encoder_out_lens, - hyps_pad_sos_eos, - hyps_lens_sos, - r_hyps_pad_sos_eos, - ctc_score) - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(decoder_onnx_path, - providers=providers) - - input_tensors = [encoder_out, encoder_out_lens, hyps_pad_sos_eos, - hyps_lens_sos, r_hyps_pad_sos_eos, ctc_score] - ort_inputs = {} - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - - # if model.reverse weight == 0, - # the r_hyps_pad will be removed - # from the onnx decoder since it doen't play any role - if model.reverse_weight == 0: - del ort_inputs['r_hyps_pad_sos_eos'] - ort_outs = ort_session.run(None, ort_inputs) - - # check decoder output - if decoder_fastertransformer: - test(to_numpy(o0), ort_outs, rtol=1e-03, atol=1e-05) - else: - test(to_numpy([o0]), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx decoder succeed!") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='export x86_gpu model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--cmvn_file', required=False, default='', type=str, - help='global_cmvn file, default path is in config file') - parser.add_argument('--reverse_weight', default=-1.0, type=float, - required=False, - help='reverse weight for bitransformer,' + - 'default value is in config file') - parser.add_argument('--ctc_weight', default=-1.0, type=float, - required=False, - help='ctc weight, default value is in config file') - parser.add_argument('--batch_size', type=int, default=24, help='encoder batch size') - parser.add_argument('--seq_len', default=512, type=int, required=False, - help="Encoder seq_len") - parser.add_argument('--beam_size', default=10, type=int, required=False, - help="beam size would be ctc output size") - parser.add_argument('--output_onnx_dir', - default="onnx_model", - help='output onnx encoder and decoder directory') - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - # arguments for streaming encoder - parser.add_argument('--streaming', - action='store_true', - help="whether to export streaming encoder, default false") - parser.add_argument('--decoding_chunk_size', - default=16, - type=int, - required=False, - help='the decoding chunk size, <=0 is not supported') - parser.add_argument('--num_decoding_left_chunks', - default=5, - type=int, - required=False, - help="number of left chunks, <= 0 is not supported") - parser.add_argument('--decoder_fastertransformer', - action='store_true', - help='return decoder_out and best_index for ft') - args = parser.parse_args() - - torch.manual_seed(0) - torch.set_printoptions(precision=10) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if 
args.cmvn_file and os.path.exists(args.cmvn_file): - configs['cmvn_file'] = args.cmvn_file - if args.reverse_weight != -1.0 and 'reverse_weight' in configs['model_conf']: - configs['model_conf']['reverse_weight'] = args.reverse_weight - print("Update reverse weight to", args.reverse_weight) - if args.ctc_weight != -1: - print("Update ctc weight to ", args.ctc_weight) - configs['model_conf']['ctc_weight'] = args.ctc_weight - configs["encoder_conf"]["use_dynamic_chunk"] = False - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - - if not os.path.exists(args.output_onnx_dir): - os.mkdir(args.output_onnx_dir) - encoder_onnx_path = os.path.join(args.output_onnx_dir, 'encoder.onnx') - export_enc_func = None - if args.streaming: - assert args.decoding_chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - export_enc_func = export_online_encoder - else: - export_enc_func = export_offline_encoder_static - - onnx_config = export_enc_func( - model, configs, args, logger, encoder_onnx_path) - - decoder_onnx_path = os.path.join(args.output_onnx_dir, 'decoder.onnx') - export_rescoring_decoder(model, configs, args, logger, - decoder_onnx_path, args.decoder_fastertransformer) - - if args.fp16: - try: - import onnxmltools - from onnxmltools.utils.float16_converter import convert_float_to_float16 - except ImportError: - print('Please install onnxmltools!') - sys.exit(1) - encoder_onnx_model = onnxmltools.utils.load_model(encoder_onnx_path) - encoder_onnx_model = convert_float_to_float16(encoder_onnx_model) - encoder_onnx_path = os.path.join( - args.output_onnx_dir, 'encoder_fp16.onnx') - onnxmltools.utils.save_model(encoder_onnx_model, encoder_onnx_path) - decoder_onnx_model = onnxmltools.utils.load_model(decoder_onnx_path) - decoder_onnx_model = convert_float_to_float16(decoder_onnx_model) - decoder_onnx_path = os.path.join( - args.output_onnx_dir, 'decoder_fp16.onnx') - onnxmltools.utils.save_model(decoder_onnx_model, decoder_onnx_path) - # dump configurations - - config_dir = os.path.join(args.output_onnx_dir, "config.yaml") - with open(config_dir, "w") as out: - yaml.dump(onnx_config, out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/recognize.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/recognize.py deleted file mode 100644 index 03b5dfd42cc098efacd20e08756a5300f6477cc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/recognize.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
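The export flow above wraps the WeNet checkpoint in the `Encoder`/`Decoder` modules, emits a static-shape encoder plus a rescoring decoder with `torch.onnx.export`, and optionally converts both to FP16 via onnxmltools. As a minimal sketch (not part of this repository), the exported static encoder can be exercised with onnxruntime; the file name and the batch/sequence/feature sizes below are illustrative assumptions, with the feature dimension standing in for `configs["input_dim"]`:

```python
import numpy as np
import onnxruntime as ort

ENCODER_ONNX = "encoder_bs24_seq384_static.onnx"   # assumed artifact name from the static export
BATCH, SEQ_LEN, FEAT_DIM = 24, 384, 80             # assumed static shapes used at export time

session = ort.InferenceSession(ENCODER_ONNX, providers=["CPUExecutionProvider"])

# Dummy fbank features and per-utterance lengths, mirroring the exporter's dummy inputs.
speech = np.random.randn(BATCH, SEQ_LEN, FEAT_DIM).astype(np.float32)
speech_lengths = np.full((BATCH,), SEQ_LEN, dtype=np.int32)

# Output order follows the export's declared output_names.
encoder_out, encoder_out_lens, ctc_log_probs, beam_log_probs, beam_log_probs_idx = session.run(
    None, {"speech": speech, "speech_lengths": speech_lengths})
print(encoder_out.shape, beam_log_probs_idx.shape)
```

Because the graph is exported with fixed shapes, the dummy batch and sequence length here have to match whatever values were passed to the export script.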
- -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--beam_size', - type=int, - default=10, - help='beam size for search') - parser.add_argument('--penalty', - type=float, - default=0.0, - help='length penalty') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=16, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'attention', 'ctc_greedy_search', - 'ctc_prefix_beam_search', 'attention_rescoring', - 'rnnt_greedy_search', 'rnnt_beam_search', - 'rnnt_beam_attn_rescoring', 'ctc_beam_td_attn_rescoring', - 'hlg_onebest', 'hlg_rescore' - ], - default='attention', - help='decoding mode') - - parser.add_argument('--search_ctc_weight', - type=float, - default=1.0, - help='ctc weight for nbest generation') - parser.add_argument('--search_transducer_weight', - type=float, - default=0.0, - help='transducer weight for nbest generation') - parser.add_argument('--ctc_weight', - type=float, - default=0.0, - help='ctc weight for rescoring weight in \ - attention rescoring decode mode \ - ctc weight for rescoring weight in \ - transducer attention rescore decode mode') - - parser.add_argument('--transducer_weight', - type=float, - default=0.0, - help='transducer weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--attn_weight', - type=float, - default=0.0, - help='attention weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--decoding_chunk_size', - type=int, - default=-1, - help='''decoding chunk size, - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here''') - parser.add_argument('--num_decoding_left_chunks', - type=int, - default=-1, - help='number of left chunks for decoding') - parser.add_argument('--simulate_streaming', - action='store_true', - help='simulate streaming inference') - parser.add_argument('--reverse_weight', - type=float, - default=0.0, - help='''right to left weight for attention rescoring - decode mode''') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--connect_symbol', - default='', - type=str, - help='used to connect the output characters') - - parser.add_argument('--word', - default='', - type=str, - help='word file, only used for hlg decode') - parser.add_argument('--hlg', - default='', - type=str, - help='hlg file, only used for hlg decode') - parser.add_argument('--lm_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--r_decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' - ] and args.batch_size > 1: - logging.fatal( - 'decoding mode {} must be running with batch_size == 1'.format( - args.mode)) - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_sub'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - if 'fbank_conf' in test_conf: - test_conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in test_conf: - test_conf['mfcc_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - # Load dict - char_dict = {v: k for k, v in symbol_table.items()} - eos = len(char_dict) - 1 - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w') as fout: - for batch_idx, 
batch in enumerate(test_data_loader): - keys, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - if args.mode == 'attention': - hyps, _ = model.recognize( - feats, - feats_lengths, - beam_size=args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp.tolist() for hyp in hyps] - elif args.mode == 'ctc_greedy_search': - hyps, _ = model.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_greedy_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_beam_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.beam_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.search_ctc_weight, - transducer_weight=args.search_transducer_weight) - elif args.mode == 'rnnt_beam_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight) - elif args.mode == 'ctc_beam_td_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight, - beam_search_type='ctc') - # ctc_prefix_beam_search and attention_rescoring only return one - # result in List[int], change it to List[List[int]] for compatible - # with other batch decoding mode - elif args.mode == 'ctc_prefix_beam_search': - assert (feats.size(0) == 1) - hyp, _ = model.ctc_prefix_beam_search( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp] - elif args.mode == 'attention_rescoring': - assert (feats.size(0) == 1) - hyp, _ = model.attention_rescoring( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - 
ctc_weight=args.ctc_weight, - simulate_streaming=args.simulate_streaming, - reverse_weight=args.reverse_weight) - hyps = [hyp] - elif args.mode == 'hlg_onebest': - hyps = model.hlg_onebest( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - elif args.mode == 'hlg_rescore': - hyps = model.hlg_rescore( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - lm_scale=args.lm_scale, - decoder_scale=args.decoder_scale, - r_decoder_scale=args.r_decoder_scale, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - for i, key in enumerate(keys): - content = [] - for w in hyps[i]: - if w == eos: - break - content.append(char_dict[w]) - logging.info('{} {}'.format(key, args.connect_symbol.join(content))) - fout.write('{} {}\n'.format(key, args.connect_symbol.join(content))) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/recognize_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/recognize_onnx_gpu.py deleted file mode 100644 index 42f403bf55ac0bc51d9c754574d3479345948122..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/recognize_onnx_gpu.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is for testing exported onnx encoder and decoder from -export_onnx_gpu.py. The exported onnx models only support batch offline ASR inference. -It requires a python wrapped c++ ctc decoder. 
-Please install it by following: -https://github.com/Slyne/ctc_decoder.git -""" -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.common import IGNORE_ID -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.config import override_config - -import onnxruntime as rt -import multiprocessing -import numpy as np - -try: - from swig_decoders import map_batch, \ - ctc_beam_search_decoder_batch, \ - TrieVector, PathTrie -except ImportError: - print('Please install ctc decoders first by refering to\n' + - 'https://github.com/Slyne/ctc_decoder.git') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--encoder_onnx', required=True, help='encoder onnx file') - parser.add_argument('--decoder_onnx', required=True, help='decoder onnx file') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=32, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'ctc_greedy_search', 'ctc_prefix_beam_search', - 'attention_rescoring'], - default='attention_rescoring', - help='decoding mode') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - reverse_weight = configs["model_conf"].get("reverse_weight", 0.0) - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - test_conf['fbank_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - 
- # Init asr model from configs - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - if use_cuda: - EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - else: - EP_list = ['CPUExecutionProvider'] - - encoder_ort_session = rt.InferenceSession(args.encoder_onnx, providers=EP_list) - decoder_ort_session = None - if args.mode == "attention_rescoring": - decoder_ort_session = rt.InferenceSession(args.decoder_onnx, providers=EP_list) - - # Load dict - vocabulary = [] - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - vocabulary.append(arr[0]) - eos = sos = len(char_dict) - 1 - with torch.no_grad(), open(args.result_file, 'w') as fout: - for _, batch in enumerate(test_data_loader): - keys, feats, _, feats_lengths, _ = batch - feats, feats_lengths = feats.numpy(), feats_lengths.numpy() - if args.fp16: - feats = feats.astype(np.float16) - ort_inputs = { - encoder_ort_session.get_inputs()[0].name: feats, - encoder_ort_session.get_inputs()[1].name: feats_lengths} - ort_outs = encoder_ort_session.run(None, ort_inputs) - encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx = ort_outs - beam_size = beam_log_probs.shape[-1] - batch_size = beam_log_probs.shape[0] - num_processes = min(multiprocessing.cpu_count(), batch_size) - if args.mode == 'ctc_greedy_search': - if beam_size != 1: - log_probs_idx = beam_log_probs_idx[:, :, 0] - batch_sents = [] - for idx, seq in enumerate(log_probs_idx): - batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) - hyps = map_batch(batch_sents, vocabulary, num_processes, - True, 0) - elif args.mode in ('ctc_prefix_beam_search', "attention_rescoring"): - batch_log_probs_seq_list = beam_log_probs.tolist() - batch_log_probs_idx_list = beam_log_probs_idx.tolist() - batch_len_list = encoder_out_lens.tolist() - batch_log_probs_seq = [] - batch_log_probs_ids = [] - batch_start = [] # only effective in streaming deployment - batch_root = TrieVector() - root_dict = {} - for i in range(len(batch_len_list)): - num_sent = batch_len_list[i] - batch_log_probs_seq.append( - batch_log_probs_seq_list[i][0:num_sent]) - batch_log_probs_ids.append( - batch_log_probs_idx_list[i][0:num_sent]) - root_dict[i] = PathTrie() - batch_root.append(root_dict[i]) - batch_start.append(True) - score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq, - batch_log_probs_ids, - batch_root, - batch_start, - beam_size, - num_processes, - 0, -2, 0.99999) - if args.mode == 'ctc_prefix_beam_search': - hyps = [] - for cand_hyps in score_hyps: - hyps.append(cand_hyps[0][1]) - hyps = map_batch(hyps, vocabulary, num_processes, False, 0) - if args.mode == 'attention_rescoring': - ctc_score, all_hyps = [], [] - max_len = 0 - for hyps in score_hyps: - cur_len = len(hyps) - if len(hyps) < beam_size: - hyps += (beam_size - cur_len) * [(-float("INF"), (0,))] - cur_ctc_score = [] - for hyp in hyps: - cur_ctc_score.append(hyp[0]) - all_hyps.append(list(hyp[1])) - if len(hyp[1]) > max_len: - max_len = len(hyp[1]) - ctc_score.append(cur_ctc_score) - if args.fp16: - ctc_score = np.array(ctc_score, dtype=np.float16) - else: - ctc_score = np.array(ctc_score, dtype=np.float32) - hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - r_hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32) - k = 0 - for i in 
range(batch_size): - for j in range(beam_size): - cand = all_hyps[k] - l = len(cand) + 2 - hyps_pad_sos_eos[i][j][0:l] = [sos] + cand + [eos] - r_hyps_pad_sos_eos[i][j][0:l] = [sos] + cand[::-1] + [eos] - hyps_lens_sos[i][j] = len(cand) + 1 - k += 1 - decoder_ort_inputs = { - decoder_ort_session.get_inputs()[0].name: encoder_out, - decoder_ort_session.get_inputs()[1].name: encoder_out_lens, - decoder_ort_session.get_inputs()[2].name: hyps_pad_sos_eos, - decoder_ort_session.get_inputs()[3].name: hyps_lens_sos, - decoder_ort_session.get_inputs()[-1].name: ctc_score} - if reverse_weight > 0: - r_hyps_pad_sos_eos_name = decoder_ort_session.get_inputs()[4].name - decoder_ort_inputs[r_hyps_pad_sos_eos_name] = r_hyps_pad_sos_eos - best_index = decoder_ort_session.run(None, decoder_ort_inputs)[0] - best_sents = [] - k = 0 - for idx in best_index: - cur_best_sent = all_hyps[k: k + beam_size][idx] - best_sents.append(cur_best_sent) - k += beam_size - hyps = map_batch(best_sents, vocabulary, num_processes) - - for i, key in enumerate(keys): - content = hyps[i] - logging.info('{} {}'.format(key, content)) - fout.write('{} {}\n'.format(key, content)) - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/train.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/train.py deleted file mode 100644 index 70799b60790b31d73911770891f519f5473e2f4b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/bin/train.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
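The attention-rescoring branch above packs every beam hypothesis with start/end-of-sentence tokens, builds a reversed copy for the right-to-left decoder, and pads all candidates to the longest one before running the decoder session. A small self-contained sketch of that packing; the `IGNORE_ID` value and the shared sos/eos index are assumptions for illustration:

```python
import numpy as np

IGNORE_ID = -1            # assumed padding id (WeNet uses -1)
sos = eos = 4232          # assumed: last index of the vocabulary

hyps = [[10, 11, 12], [10, 11]]            # beam_size = 2 hypotheses for one utterance
beam_size = len(hyps)
max_len = max(len(h) for h in hyps)

hyps_pad_sos_eos = np.full((1, beam_size, max_len + 2), IGNORE_ID, dtype=np.int64)
r_hyps_pad_sos_eos = np.full((1, beam_size, max_len + 2), IGNORE_ID, dtype=np.int64)
hyps_lens_sos = np.ones((1, beam_size), dtype=np.int32)

for j, cand in enumerate(hyps):
    length = len(cand) + 2                          # sos + tokens + eos
    hyps_pad_sos_eos[0, j, :length] = [sos] + cand + [eos]
    r_hyps_pad_sos_eos[0, j, :length] = [sos] + cand[::-1] + [eos]
    hyps_lens_sos[0, j] = len(cand) + 1             # length counting sos only

print(hyps_pad_sos_eos[0])
print(hyps_lens_sos)
```

The padded tensors have shape (batch, beam, max_len + 2), so the rescoring decoder can score every candidate of every utterance in a single batched call.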
- -from __future__ import print_function - -import argparse -import copy -import logging -import os - -import torch -import torch.distributed as dist -import torch.optim as optim -import yaml -from tensorboardX import SummaryWriter -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import (load_checkpoint, save_checkpoint, - load_trained_modules) -from wenet.utils.executor import Executor -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.scheduler import WarmupLR, NoamHoldAnnealing -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--train_data', required=True, help='train data file') - parser.add_argument('--cv_data', required=True, help='cv data file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--model_dir', required=True, help='save model dir') - parser.add_argument('--checkpoint', help='checkpoint model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='tensorboard log dir') - parser.add_argument('--ddp.rank', - dest='rank', - default=0, - type=int, - help='global rank for distributed training') - parser.add_argument('--ddp.world_size', - dest='world_size', - default=-1, - type=int, - help='''number of total processes/gpus for - distributed training''') - parser.add_argument('--ddp.dist_backend', - dest='dist_backend', - default='nccl', - choices=['nccl', 'gloo'], - help='distributed backend') - parser.add_argument('--ddp.init_method', - dest='init_method', - default=None, - help='ddp init method') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for reading') - parser.add_argument('--pin_memory', - action='store_true', - default=False, - help='Use pinned memory buffers used for reading') - parser.add_argument('--use_amp', - action='store_true', - default=False, - help='Use automatic mixed precision training') - parser.add_argument('--fp16_grad_sync', - action='store_true', - default=False, - help='Use fp16 gradient sync for ddp') - parser.add_argument('--cmvn', default=None, help='global cmvn file') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. 
One symbol per line.") - parser.add_argument('--prefetch', - default=100, - type=int, - help='prefetch number') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument("--enc_init", - default=None, - type=str, - help="Pre-trained model to initialize encoder") - parser.add_argument("--enc_init_mods", - default="encoder.", - type=lambda s: [str(mod) for mod in s.split(",") if s != ""], - help="List of encoder modules \ - to initialize ,separated by a comma") - - - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Set random seed - torch.manual_seed(777) - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - distributed = args.world_size > 1 - if distributed: - logging.info('training on multiple gpus, this gpu {}'.format(args.gpu)) - dist.init_process_group(args.dist_backend, - init_method=args.init_method, - world_size=args.world_size, - rank=args.rank) - - symbol_table = read_symbol_table(args.symbol_table) - - train_conf = configs['dataset_conf'] - cv_conf = copy.deepcopy(train_conf) - cv_conf['speed_perturb'] = False - cv_conf['spec_aug'] = False - cv_conf['spec_sub'] = False - cv_conf['spec_trim'] = False - cv_conf['shuffle'] = False - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - train_dataset = Dataset(args.data_type, args.train_data, symbol_table, - train_conf, args.bpe_model, non_lang_syms, True) - cv_dataset = Dataset(args.data_type, - args.cv_data, - symbol_table, - cv_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - train_data_loader = DataLoader(train_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - cv_data_loader = DataLoader(cv_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - - if 'fbank_conf' in configs['dataset_conf']: - input_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - else: - input_dim = configs['dataset_conf']['mfcc_conf']['num_mel_bins'] - vocab_size = len(symbol_table) - - # Save configs to model_dir/train.yaml for inference and export - configs['input_dim'] = input_dim - configs['output_dim'] = vocab_size - configs['cmvn_file'] = args.cmvn - configs['is_json_cmvn'] = True - if args.rank == 0: - saved_config_path = os.path.join(args.model_dir, 'train.yaml') - with open(saved_config_path, 'w') as fout: - data = yaml.dump(configs) - fout.write(data) - - # Init asr model from configs - model = init_model(configs) - print(model) - num_params = sum(p.numel() for p in model.parameters()) - print('the number of model params: {:,d}'.format(num_params)) - - # !!!IMPORTANT!!! 
- # Try to export the model by script, if fails, we should refine - # the code to satisfy the script export requirements - if args.rank == 0: - script_model = torch.jit.script(model) - script_model.save(os.path.join(args.model_dir, 'init.zip')) - executor = Executor() - # If specify checkpoint, load some info from checkpoint - if args.checkpoint is not None: - infos = load_checkpoint(model, args.checkpoint) - elif args.enc_init is not None: - logging.info('load pretrained encoders: {}'.format(args.enc_init)) - infos = load_trained_modules(model, args) - else: - infos = {} - start_epoch = infos.get('epoch', -1) + 1 - cv_loss = infos.get('cv_loss', 0.0) - step = infos.get('step', -1) - - num_epochs = configs.get('max_epoch', 100) - model_dir = args.model_dir - writer = None - if args.rank == 0: - os.makedirs(model_dir, exist_ok=True) - exp_id = os.path.basename(model_dir) - writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id)) - - if distributed: - assert (torch.cuda.is_available()) - # cuda model is required for nn.parallel.DistributedDataParallel - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, find_unused_parameters=True) - device = torch.device("cuda") - if args.fp16_grad_sync: - from torch.distributed.algorithms.ddp_comm_hooks import ( - default as comm_hooks, - ) - model.register_comm_hook( - state=None, hook=comm_hooks.fp16_compress_hook - ) - else: - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - if configs['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['optim_conf']) - elif configs['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['optim']) - if configs['scheduler'] == 'warmuplr': - scheduler = WarmupLR(optimizer, **configs['scheduler_conf']) - elif configs['scheduler'] == 'NoamHoldAnnealing': - scheduler = NoamHoldAnnealing(optimizer, **configs['scheduler_conf']) - else: - raise ValueError("unknown scheduler: " + configs['scheduler']) - - final_epoch = None - configs['rank'] = args.rank - configs['is_distributed'] = distributed - configs['use_amp'] = args.use_amp - if start_epoch == 0 and args.rank == 0: - save_model_path = os.path.join(model_dir, 'init.pt') - save_checkpoint(model, save_model_path) - - # Start training loop - executor.step = step - scheduler.set_step(step) - # used for pytorch amp mixed precision training - scaler = None - if args.use_amp: - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(start_epoch, num_epochs): - train_dataset.set_epoch(epoch) - configs['epoch'] = epoch - lr = optimizer.param_groups[0]['lr'] - logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr)) - executor.train(model, optimizer, scheduler, train_data_loader, device, - writer, configs, scaler) - total_loss, num_seen_utts = executor.cv(model, cv_data_loader, device, - configs) - cv_loss = total_loss / num_seen_utts - - logging.info('Epoch {} CV info cv_loss {}'.format(epoch, cv_loss)) - if args.rank == 0: - save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch)) - save_checkpoint( - model, save_model_path, { - 'epoch': epoch, - 'lr': lr, - 'cv_loss': cv_loss, - 'step': executor.step - }) - writer.add_scalar('epoch/cv_loss', cv_loss, epoch) - writer.add_scalar('epoch/lr', lr, epoch) - final_epoch = epoch - - if final_epoch is not None and args.rank == 0: - final_model_path = os.path.join(model_dir, 'final.pt') 
- os.remove(final_model_path) if os.path.exists(final_model_path) else None - os.symlink('{}.pt'.format(final_epoch), final_model_path) - writer.close() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/dataset/dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/dataset/dataset.py deleted file mode 100644 index 6d799b5b5aea2d34546484b3fed5d45e2d5b6aa6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/dataset/dataset.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import torch -import torch.distributed as dist -from torch.utils.data import IterableDataset - -import wenet.dataset.processor as processor -from wenet.utils.file_utils import read_lists - - -class Processor(IterableDataset): - def __init__(self, source, f, *args, **kw): - assert callable(f) - self.source = source - self.f = f - self.args = args - self.kw = kw - - def set_epoch(self, epoch): - self.source.set_epoch(epoch) - - def __iter__(self): - """ Return an iterator over the source dataset processed by the - given processor. 
- """ - assert self.source is not None - assert callable(self.f) - return self.f(iter(self.source), *self.args, **self.kw) - - def apply(self, f): - assert callable(f) - return Processor(self, f, *self.args, **self.kw) - - -class DistributedSampler: - def __init__(self, shuffle=True, partition=True): - self.epoch = -1 - self.update() - self.shuffle = shuffle - self.partition = partition - - def update(self): - assert dist.is_available() - if dist.is_initialized(): - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() - else: - self.rank = 0 - self.world_size = 1 - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: - self.worker_id = 0 - self.num_workers = 1 - else: - self.worker_id = worker_info.id - self.num_workers = worker_info.num_workers - return dict(rank=self.rank, - world_size=self.world_size, - worker_id=self.worker_id, - num_workers=self.num_workers) - - def set_epoch(self, epoch): - self.epoch = epoch - - def sample(self, data): - """ Sample data according to rank/world_size/num_workers - - Args: - data(List): input data list - - Returns: - List: data list after sample - """ - data = list(range(len(data))) - # TODO(Binbin Zhang): fix this - # We can not handle uneven data for CV on DDP, so we don't - # sample data by rank, that means every GPU gets the same - # and all the CV data - if self.partition: - if self.shuffle: - random.Random(self.epoch).shuffle(data) - data = data[self.rank::self.world_size] - data = data[self.worker_id::self.num_workers] - return data - - -class DataList(IterableDataset): - def __init__(self, lists, shuffle=True, partition=True): - self.lists = lists - self.sampler = DistributedSampler(shuffle, partition) - - def set_epoch(self, epoch): - self.sampler.set_epoch(epoch) - - def __iter__(self): - sampler_info = self.sampler.update() - indexes = self.sampler.sample(self.lists) - for index in indexes: - # yield dict(src=src) - data = dict(src=self.lists[index]) - data.update(sampler_info) - yield data - - -def Dataset(data_type, - data_list_file, - symbol_table, - conf, - bpe_model=None, - non_lang_syms=None, - partition=True): - """ Construct dataset from arguments - - We have two shuffle stage in the Dataset. The first is global - shuffle at shards tar/raw file level. The second is global shuffle - at training samples level. 
- - Args: - data_type(str): raw/shard - bpe_model(str): model for english bpe part - partition(bool): whether to do data partition in terms of rank - """ - assert data_type in ['raw', 'shard'] - lists = read_lists(data_list_file) - shuffle = conf.get('shuffle', True) - dataset = DataList(lists, shuffle=shuffle, partition=partition) - if data_type == 'shard': - dataset = Processor(dataset, processor.url_opener) - dataset = Processor(dataset, processor.tar_file_and_group) - else: - dataset = Processor(dataset, processor.parse_raw) - - dataset = Processor(dataset, processor.tokenize, symbol_table, bpe_model, - non_lang_syms, conf.get('split_with_space', False)) - filter_conf = conf.get('filter_conf', {}) - dataset = Processor(dataset, processor.filter, **filter_conf) - - resample_conf = conf.get('resample_conf', {}) - dataset = Processor(dataset, processor.resample, **resample_conf) - - speed_perturb = conf.get('speed_perturb', False) - if speed_perturb: - dataset = Processor(dataset, processor.speed_perturb) - - feats_type = conf.get('feats_type', 'fbank') - assert feats_type in ['fbank', 'mfcc'] - if feats_type == 'fbank': - fbank_conf = conf.get('fbank_conf', {}) - dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) - elif feats_type == 'mfcc': - mfcc_conf = conf.get('mfcc_conf', {}) - dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf) - - spec_aug = conf.get('spec_aug', True) - spec_sub = conf.get('spec_sub', False) - spec_trim = conf.get('spec_trim', False) - if spec_aug: - spec_aug_conf = conf.get('spec_aug_conf', {}) - dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) - if spec_sub: - spec_sub_conf = conf.get('spec_sub_conf', {}) - dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf) - if spec_trim: - spec_trim_conf = conf.get('spec_trim_conf', {}) - dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf) - - if shuffle: - shuffle_conf = conf.get('shuffle_conf', {}) - dataset = Processor(dataset, processor.shuffle, **shuffle_conf) - - sort = conf.get('sort', True) - if sort: - sort_conf = conf.get('sort_conf', {}) - dataset = Processor(dataset, processor.sort, **sort_conf) - - batch_conf = conf.get('batch_conf', {}) - dataset = Processor(dataset, processor.batch, **batch_conf) - dataset = Processor(dataset, processor.padding) - return dataset diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/dataset/kaldi_io.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/dataset/kaldi_io.py deleted file mode 100644 index c9bef293c93d882147bb5b738e1fc49a7a19a484..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/dataset/kaldi_io.py +++ /dev/null @@ -1,666 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! 
To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. - """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. 
- """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int_scp(file_or_fd): - """ generator(key,vec) = read_vec_int_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_int_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:vec for key,mat in kaldi_io.read_vec_int_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_int(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! 
- if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Mapping for percentiles in col-headers, - def uint16_to_float(value, min, range): - return np.float32(min + range * 1.52590218966964e-05 * value) - - # Mapping for matrix elements, - def uint8_to_float_v2(vec, p0, p25, p75, p100): - # Split the vector by masks, - mask_0_64 = (vec <= 64); - mask_193_255 = (vec > 192); - mask_65_192 = (~(mask_0_64 | mask_193_255)); - # Sanity check (useful but slow...), - # assert(len(vec) == np.sum(np.hstack([mask_0_64,mask_65_192,mask_193_255]))) - # assert(len(vec) == np.sum(np.any([mask_0_64,mask_65_192,mask_193_255], axis=0))) - # Build the float vector, - ans = np.empty(len(vec), dtype='float32') - ans[mask_0_64] = p0 + (p25 - p0) / 64. * vec[mask_0_64] - ans[mask_65_192] = p25 + (p75 - p25) / 128. * (vec[mask_65_192] - 64) - ans[mask_193_255] = p75 + (p100 - p75) / 63. * (vec[mask_193_255] - 192) - return ans - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.empty((cols,rows), dtype='float32') - for i, col_header in enumerate(col_headers): - col_header_flt = [ uint16_to_float(percentile, globmin, globrange) for percentile in col_header ] - mat[i] = uint8_to_float_v2(data[i], *col_header_flt) - - return mat.T # transpose! col-major -> row-major, - -def write_ark_scp(key, mat, ark_fout, scp_out): - mat_offset = write_mat(ark_fout, mat, key) - scp_line = '{}\t{}:{}'.format(key, ark_fout.name, mat_offset) - scp_out.write(scp_line) - scp_out.write('\n') - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. 
- - Example of writing single matrix: - kaldi_io.write_mat(filename, mat) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,mat in dict.iteritems(): - kaldi_io.write_mat(f, mat, key=key) - """ - mat_offset = 0 - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - mat_offset = fd.tell() - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if m.dtype == 'float32': fd.write('FM '.encode()) - elif m.dtype == 'float64': fd.write('DM '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) - # Dims, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols - # Data, - fd.write(m.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - return mat_offset - -################################################# -# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) -# Corresponds to: vector > > -# - outer vector: time axis -# - inner vector: records at the time -# - tuple: int = index, float = value -# - -def read_cnet_ark(file_or_fd): - """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ - return read_post_ark(file_or_fd) - -def read_post_ark(file_or_fd): - """ generator(key,vec>) = read_post_ark(file) - Returns generator of (key,posterior) tuples, read from ark file. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Iterate the ark: - for key,post in kaldi_io.read_post_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - post = read_post(fd) - yield key, post - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_post(file_or_fd): - """ [post] = read_post(file_or_fd) - Reads single kaldi 'Posterior' in binary format. - - The 'Posterior' is C++ type 'vector > >', - the outer-vector is usually time axis, inner-vector are the records - at given time, and the tuple is composed of an 'index' (integer) - and a 'float-value'. The 'float-value' can represent a probability - or any other numeric value. - - Returns vector of vectors of tuples. - """ - fd = open_or_fd(file_or_fd) - ans=[] - binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - # Loop over 'outer-vector', - for i in range(outer_vec_size): - assert(fd.read(1).decode() == '\4'); # int-size - inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) - data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) - assert(data[0]['size_idx'] == 4) - assert(data[0]['size_post'] == 4) - ans.append(data[['idx','post']].tolist()) - - if fd is not file_or_fd: fd.close() - return ans - - -################################################# -# Kaldi Confusion Network bin begin/end times, -# (kaldi stores CNs time info separately from the Posterior). 
-# - -def read_cntime_ark(file_or_fd): - """ generator(key,vec>) = read_cntime_ark(file_or_fd) - Returns generator of (key,cntime) tuples, read from ark file. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Iterate the ark: - for key,time in kaldi_io.read_cntime_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:time for key,time in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - cntime = read_cntime(fd) - yield key, cntime - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_cntime(file_or_fd): - """ [cntime] = read_cntime(file_or_fd) - Reads single kaldi 'Confusion Network time info', in binary format: - C++ type: vector >. - (begin/end times of bins at the confusion network). - - Binary layout is ' ...' - - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Returns vector of tuples. - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary - - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) - assert(data[0]['size_beg'] == 4) - assert(data[0]['size_end'] == 4) - ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), - - if fd is not file_or_fd : fd.close() - return ans - - -################################################# -# Segments related, -# - -# Segments as 'Bool vectors' can be handy, -# - for 'superposing' the segmentations, -# - for frame-selection in Speaker-ID experiments, -def read_segments_as_bool_vec(segments_file): - """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) - using kaldi 'segments' file for 1 wav, format : ' ' - - t-beg, t-end is in seconds, - - assumed 100 frames/second, - """ - segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) - # Sanity checks, - assert(len(segs) > 0) # empty segmentation is an error, - assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, - # Convert time to frame-indexes, - start = np.rint([100 * rec[2] for rec in segs]).astype(int) - end = np.rint([100 * rec[3] for rec in segs]).astype(int) - # Taken from 'read_lab_to_bool_vec', htk.py, - frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], - np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) - assert np.sum(end-start) == np.sum(frms) - return frms - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/dataset/processor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/dataset/processor.py deleted file mode 100644 index b4bd07ce674eb3288cd1b13a09085eec48d40845..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/dataset/processor.py +++ /dev/null @@ -1,660 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import json -import random -import re -import tarfile -from subprocess import PIPE, Popen -from urllib.parse import urlparse - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.nn.utils.rnn import pad_sequence - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def url_opener(data): - """ Give url or local file, return file descriptor - Inplace operation. - - Args: - data(Iterable[str]): url or local file list - - Returns: - Iterable[{src, stream}] - """ - for sample in data: - assert 'src' in sample - # TODO(Binbin Zhang): support HTTP - url = sample['src'] - try: - pr = urlparse(url) - # local file - if pr.scheme == '' or pr.scheme == 'file': - stream = open(url, 'rb') - # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP - else: - cmd = f'wget -q -O - {url}' - process = Popen(cmd, shell=True, stdout=PIPE) - sample.update(process=process) - stream = process.stdout - sample.update(stream=stream) - yield sample - except Exception as ex: - logging.warning('Failed to open {}'.format(url)) - - -def tar_file_and_group(data): - """ Expand a stream of open tar files into a stream of tar file contents. - And groups the file with same prefix - - Args: - data: Iterable[{src, stream}] - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'stream' in sample - stream = tarfile.open(fileobj=sample['stream'], mode="r|*") - prev_prefix = None - example = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - example['key'] = prev_prefix - if valid: - yield example - example = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - example['txt'] = file_obj.read().decode('utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load(file_obj) - example['wav'] = waveform - example['sample_rate'] = sample_rate - else: - example[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning('error to parse {}'.format(name)) - prev_prefix = prefix - if prev_prefix is not None: - example['key'] = prev_prefix - yield example - stream.close() - if 'process' in sample: - sample['process'].communicate() - sample['stream'].close() - - -def parse_raw(data): - """ Parse key/wav/txt from json line - - Args: - data: Iterable[str], str is a json line has key/wav/txt - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'src' in sample - json_line = sample['src'] - obj = json.loads(json_line) - assert 'key' in obj - assert 'wav' in obj - assert 'txt' in obj - key = obj['key'] - wav_file = obj['wav'] - txt = obj['txt'] - try: - if 'start' in obj: - assert 'end' in obj - sample_rate = torchaudio.backend.sox_io_backend.info( - wav_file).sample_rate - start_frame = int(obj['start'] * sample_rate) - end_frame = int(obj['end'] * sample_rate) - waveform, _ = torchaudio.backend.sox_io_backend.load( - 
filepath=wav_file, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(wav_file) - example = dict(key=key, - txt=txt, - wav=waveform, - sample_rate=sample_rate) - yield example - except Exception as ex: - logging.warning('Failed to read {}'.format(wav_file)) - - -def filter(data, - max_length=10240, - min_length=10, - token_max_length=200, - token_min_length=1, - min_output_input_ratio=0.0005, - max_output_input_ratio=1): - """ Filter sample according to feature and label length - Inplace operation. - - Args:: - data: Iterable[{key, wav, label, sample_rate}] - max_length: drop utterance which is greater than max_length(10ms) - min_length: drop utterance which is less than min_length(10ms) - token_max_length: drop utterance which is greater than - token_max_length, especially when use char unit for - english modeling - token_min_length: drop utterance which is - less than token_max_length - min_output_input_ratio: minimal ration of - token_length / feats_length(10ms) - max_output_input_ratio: maximum ration of - token_length / feats_length(10ms) - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'label' in sample - # sample['wav'] is torch.Tensor, we have 100 frames every second - num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100 - if num_frames < min_length: - continue - if num_frames > max_length: - continue - if len(sample['label']) < token_min_length: - continue - if len(sample['label']) > token_max_length: - continue - if num_frames != 0: - if len(sample['label']) / num_frames < min_output_input_ratio: - continue - if len(sample['label']) / num_frames > max_output_input_ratio: - continue - yield sample - - -def resample(data, resample_rate=16000): - """ Resample data. - Inplace operation. - - Args: - data: Iterable[{key, wav, label, sample_rate}] - resample_rate: target resample rate - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - if sample_rate != resample_rate: - sample['sample_rate'] = resample_rate - sample['wav'] = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - yield sample - - -def speed_perturb(data, speeds=None): - """ Apply speed perturb to the data. - Inplace operation. 
- - Args: - data: Iterable[{key, wav, label, sample_rate}] - speeds(List[float]): optional speed - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - if speeds is None: - speeds = [0.9, 1.0, 1.1] - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - speed = random.choice(speeds) - if speed != 1.0: - wav, _ = torchaudio.sox_effects.apply_effects_tensor( - waveform, sample_rate, - [['speed', str(speed)], ['rate', str(sample_rate)]]) - sample['wav'] = wav - - yield sample - - -def compute_fbank(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0): - """ Extract fbank - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - energy_floor=0.0, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def compute_mfcc(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0, - num_ceps=40, - high_freq=0.0, - low_freq=20.0): - """ Extract mfcc - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.mfcc(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - num_ceps=num_ceps, - high_freq=high_freq, - low_freq=low_freq, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def __tokenize_by_bpe_model(sp, txt): - tokens = [] - # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - pattern = re.compile(r'([\u4e00-\u9fff])') - # Example: - # txt = "你好 ITS'S OKAY 的" - # chars = ["你", "好", " ITS'S OKAY ", "的"] - chars = pattern.split(txt.upper()) - mix_chars = [w for w in chars if len(w.strip()) > 0] - for ch_or_w in mix_chars: - # ch_or_w is a single CJK charater(i.e., "你"), do nothing. - if pattern.fullmatch(ch_or_w) is not None: - tokens.append(ch_or_w) - # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), - # encode ch_or_w using bpe_model. 
- else: - for p in sp.encode_as_pieces(ch_or_w): - tokens.append(p) - - return tokens - - -def tokenize(data, - symbol_table, - bpe_model=None, - non_lang_syms=None, - split_with_space=False): - """ Decode text to chars or BPE - Inplace operation - - Args: - data: Iterable[{key, wav, txt, sample_rate}] - - Returns: - Iterable[{key, wav, txt, tokens, label, sample_rate}] - """ - if non_lang_syms is not None: - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - else: - non_lang_syms = {} - non_lang_syms_pattern = None - - if bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(bpe_model) - else: - sp = None - - for sample in data: - assert 'txt' in sample - txt = sample['txt'].strip() - if non_lang_syms_pattern is not None: - parts = non_lang_syms_pattern.split(txt.upper()) - parts = [w for w in parts if len(w.strip()) > 0] - else: - parts = [txt] - - label = [] - tokens = [] - for part in parts: - if part in non_lang_syms: - tokens.append(part) - else: - if bpe_model is not None: - tokens.extend(__tokenize_by_bpe_model(sp, part)) - else: - if split_with_space: - part = part.split(" ") - for ch in part: - if ch == ' ': - ch = "▁" - tokens.append(ch) - - for ch in tokens: - if ch in symbol_table: - label.append(symbol_table[ch]) - elif '' in symbol_table: - label.append(symbol_table['']) - - sample['tokens'] = tokens - sample['label'] = label - yield sample - - -def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80): - """ Do spec augmentation - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - num_t_mask: number of time mask to apply - num_f_mask: number of freq mask to apply - max_t: max width of time mask - max_f: max width of freq mask - max_w: max width of time warp - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - max_freq = y.size(1) - # time mask - for i in range(num_t_mask): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - y[start:end, :] = 0 - # freq mask - for i in range(num_f_mask): - start = random.randint(0, max_freq - 1) - length = random.randint(1, max_f) - end = min(max_freq, start + length) - y[:, start:end] = 0 - sample['feat'] = y - yield sample - - -def spec_sub(data, max_t=20, num_t_sub=3): - """ Do spec substitute - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of time substitute - num_t_sub: number of time substitute to apply - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - for i in range(num_t_sub): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - # only substitute the earlier time chosen randomly for current time - pos = random.randint(0, start) - y[start:end, :] = x[start - pos:end - pos, :] - sample['feat'] = y - yield sample - - -def spec_trim(data, max_t=20): - """ Trim tailing frames. Inplace operation. 
- ref: TrimTail [https://arxiv.org/abs/2211.00522] - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of length trimming - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - max_frames = x.size(0) - length = random.randint(1, max_t) - if length < max_frames / 2: - y = x.clone().detach()[:max_frames - length] - sample['feat'] = y - yield sample - - -def shuffle(data, shuffle_size=10000): - """ Local shuffle the data - - Args: - data: Iterable[{key, feat, label}] - shuffle_size: buffer size for shuffle - - Returns: - Iterable[{key, feat, label}] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= shuffle_size: - random.shuffle(buf) - for x in buf: - yield x - buf = [] - # The sample left over - random.shuffle(buf) - for x in buf: - yield x - - -def sort(data, sort_size=500): - """ Sort the data by feature length. - Sort is used after shuffle and before batch, so we can group - utts with similar lengths into a batch, and `sort_size` should - be less than `shuffle_size` - - Args: - data: Iterable[{key, feat, label}] - sort_size: buffer size for sort - - Returns: - Iterable[{key, feat, label}] - """ - - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= sort_size: - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - buf = [] - # The sample left over - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - - -def static_batch(data, batch_size=16): - """ Static batch the data by `batch_size` - - Args: - data: Iterable[{key, feat, label}] - batch_size: batch size - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= batch_size: - yield buf - buf = [] - if len(buf) > 0: - yield buf - - -def dynamic_batch(data, max_frames_in_batch=12000): - """ Dynamic batch the data until the total frames in batch - reach `max_frames_in_batch` - - Args: - data: Iterable[{key, feat, label}] - max_frames_in_batch: max_frames in one batch - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - longest_frames = 0 - for sample in data: - assert 'feat' in sample - assert isinstance(sample['feat'], torch.Tensor) - new_sample_frames = sample['feat'].size(0) - longest_frames = max(longest_frames, new_sample_frames) - frames_after_padding = longest_frames * (len(buf) + 1) - if frames_after_padding > max_frames_in_batch: - yield buf - buf = [sample] - longest_frames = new_sample_frames - else: - buf.append(sample) - if len(buf) > 0: - yield buf - - -def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000): - """ Wrapper for static/dynamic batch - """ - if batch_type == 'static': - return static_batch(data, batch_size) - elif batch_type == 'dynamic': - return dynamic_batch(data, max_frames_in_batch) - else: - logging.fatal('Unsupported batch type {}'.format(batch_type)) - - -def padding(data): - """ Padding the data into training data - - Args: - data: Iterable[List[{key, feat, label}]] - - Returns: - Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] - """ - for sample in data: - assert isinstance(sample, list) - feats_length = torch.tensor([x['feat'].size(0) for x in sample], - dtype=torch.int32) - order = torch.argsort(feats_length, descending=True) - feats_lengths = torch.tensor( - [sample[i]['feat'].size(0) for i in order], dtype=torch.int32) - sorted_feats = [sample[i]['feat'] for i in order] - sorted_keys 
= [sample[i]['key'] for i in order] - sorted_labels = [ - torch.tensor(sample[i]['label'], dtype=torch.int64) for i in order - ] - label_lengths = torch.tensor([x.size(0) for x in sorted_labels], - dtype=torch.int32) - - padded_feats = pad_sequence(sorted_feats, - batch_first=True, - padding_value=0) - - pad = (0, 0, 0, 0) - seq_len= padded_feats.shape[1] - if seq_len < 384: - pad = (0, 0, 0, 384-seq_len) - elif seq_len < 512: - pad = (0, 0, 0, 512-seq_len) - elif seq_len < 640: - pad = (0, 0, 0, 640-seq_len) - elif seq_len < 768: - pad = (0, 0, 0, 768-seq_len) - elif seq_len < 896: - pad = (0, 0, 0, 896-seq_len) - elif seq_len < 1024: - pad = (0, 0, 0, 1024-seq_len) - elif seq_len < 1280: - pad = (0, 0, 0, 1280-seq_len) - padded_feats = torch.nn.functional.pad(padded_feats, pad, mode='constant', value=0) - padding_labels = pad_sequence(sorted_labels, - batch_first=True, - padding_value=-1) - - yield (sorted_keys, padded_feats, padding_labels, feats_lengths, - label_lengths) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/dataset/wav_distortion.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/dataset/wav_distortion.py deleted file mode 100644 index 2917d3cc6cfb801935cb0885d0c42cd88f1833b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/dataset/wav_distortion.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import random -import math - -import torchaudio -import torch -torchaudio.set_audio_backend("sox_io") - - -def db2amp(db): - return pow(10, db / 20) - -def amp2db(amp): - return 20 * math.log10(amp) - -def make_poly_distortion(conf): - """Generate a db-domain ploynomial distortion function - - f(x) = a * x^m * (1-x)^n + x - - Args: - conf: a dict {'a': #int, 'm': #int, 'n': #int} - - Returns: - The ploynomial function, which could be applied on - a float amplitude value - """ - a = conf['a'] - m = conf['m'] - n = conf['n'] - - def poly_distortion(x): - abs_x = abs(x) - if abs_x < 0.000001: - x = x - else: - db_norm = amp2db(abs_x) / 100 + 1 - if db_norm < 0: - db_norm = 0 - db_norm = a * pow(db_norm, m) * pow((1 - db_norm), n) + db_norm - if db_norm > 1: - db_norm = 1 - db = (db_norm - 1) * 100 - amp = db2amp(db) - if amp >= 0.9997: - amp = 0.9997 - if x > 0: - x = amp - else: - x = -amp - return x - return poly_distortion - -def make_quad_distortion(): - return make_poly_distortion({'a' : 1, 'm' : 1, 'n' : 1}) - -# the amplitude are set to max for all non-zero point -def make_max_distortion(conf): - """Generate a max distortion function - - Args: - conf: a dict {'max_db': float } - 'max_db': the maxium value. 
- - Returns: - The max function, which could be applied on - a float amplitude value - """ - max_db = conf['max_db'] - if max_db: - max_amp = db2amp(max_db) # < 0.997 - else: - max_amp = 0.997 - - def max_distortion(x): - if x > 0: - x = max_amp - elif x < 0: - x = -max_amp - else: - x = 0.0 - return x - return max_distortion - - - -def make_amp_mask(db_mask=None): - """Get a amplitude domain mask from db domain mask - - Args: - db_mask: Optional. A list of tuple. if None, using default value. - - Returns: - A list of tuple. The amplitude domain mask - """ - if db_mask is None: - db_mask = [(-110, -95), (-90, -80), (-65, -60), (-50, -30), (-15, 0)] - amp_mask = [(db2amp(db[0]), db2amp(db[1])) for db in db_mask] - return amp_mask - -default_mask = make_amp_mask() - - -def generate_amp_mask(mask_num): - """Generate amplitude domain mask randomly in [-100db, 0db] - - Args: - mask_num: the slot number of the mask - - Returns: - A list of tuple. each tuple defines a slot. - e.g. [(-100, -80), (-65, -60), (-50, -30), (-15, 0)] - for #mask_num = 4 - """ - a = [0] * 2 * mask_num - a[0] = 0 - m = [] - for i in range(1, 2 * mask_num): - a[i] = a[i - 1] + random.uniform(0.5, 1) - max_val = a[2 * mask_num - 1] - for i in range(0, mask_num): - l = ((a[2 * i] - max_val) / max_val) * 100 - r = ((a[2 * i + 1] - max_val) / max_val) * 100 - m.append((l, r)) - return make_amp_mask(m) - - -def make_fence_distortion(conf): - """Generate a fence distortion function - - In this fence-like shape function, the values in mask slots are - set to maxium, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': int,'max_db': float } - 'mask_number': the slot number in mask. - 'max_db': the maxium value. - - Returns: - The fence function, which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - max_db = conf['max_db'] - max_amp = db2amp(max_db) # 0.997 - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def fence_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - return x - - return fence_distortion - -# -def make_jag_distortion(conf): - """Generate a jag distortion function - - In this jag-like shape function, the values in mask slots are - not changed, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': #int} - 'mask_number': the slot number in mask. 
- - Returns: - The jag function,which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def jag_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - return x - - return jag_distortion - -# gaining 20db means amp = amp * 10 -# gaining -20db means amp = amp / 10 -def make_gain_db(conf): - """Generate a db domain gain function - - Args: - conf: a dict {'db': #float} - 'db': the gaining value - - Returns: - The db gain function, which could be applied on - a float amplitude value - """ - db = conf['db'] - - def gain_db(x): - return min(0.997, x * pow(10, db / 20)) - - return gain_db - - -def distort(x, func, rate=0.8): - """Distort a waveform in sample point level - - Args: - x: the origin wavefrom - func: the distort function - rate: sample point-level distort probability - - Returns: - the distorted waveform - """ - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - x[0][i] = func(float(x[0][i])) - return x - -def distort_chain(x, funcs, rate=0.8): - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - for func in funcs: - x[0][i] = func(float(x[0][i])) - return x - -# x is numpy -def distort_wav_conf(x, distort_type, distort_conf, rate=0.1): - if distort_type == 'gain_db': - gain_db = make_gain_db(distort_conf) - x = distort(x, gain_db) - elif distort_type == 'max_distortion': - max_distortion = make_max_distortion(distort_conf) - x = distort(x, max_distortion, rate=rate) - elif distort_type == 'fence_distortion': - fence_distortion = make_fence_distortion(distort_conf) - x = distort(x, fence_distortion, rate=rate) - elif distort_type == 'jag_distortion': - jag_distortion = make_jag_distortion(distort_conf) - x = distort(x, jag_distortion, rate=rate) - elif distort_type == 'poly_distortion': - poly_distortion = make_poly_distortion(distort_conf) - x = distort(x, poly_distortion, rate=rate) - elif distort_type == 'quad_distortion': - quad_distortion = make_quad_distortion() - x = distort(x, quad_distortion, rate=rate) - elif distort_type == 'none_distortion': - pass - else: - print('unsupport type') - return x - -def distort_wav_conf_and_save(distort_type, distort_conf, rate, wav_in, wav_out): - x, sr = torchaudio.load(wav_in) - x = x.detach().numpy() - out = distort_wav_conf(x, distort_type, distort_conf, rate) - torchaudio.save(wav_out, torch.from_numpy(out), sr) - -if __name__ == "__main__": - distort_type = sys.argv[1] - wav_in = sys.argv[2] - wav_out = sys.argv[3] - conf = None - rate = 0.1 - if distort_type == 'new_jag_distortion': - conf = {'mask_number' : 4} - elif distort_type == 'new_fence_distortion': - conf = {'mask_number' : 1, 'max_db' : -30} - elif distort_type == 'poly_distortion': - conf = {'a' : 4, 'm' : 2, "n" : 2} - distort_wav_conf_and_save(distort_type, conf, rate, wav_in, wav_out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/efficient_conformer/attention.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/efficient_conformer/attention.py deleted file mode 100644 index 475131b15af92ffcaf91ad5e2e30d114d4d1a2a3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/efficient_conformer/attention.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple, Optional - -import torch -from torch import nn -import torch.nn.functional as F -from wenet.transformer.attention import MultiHeadedAttention - - -class GroupedRelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: - https://arxiv.org/abs/1901.02860 - https://arxiv.org/abs/2109.01163 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate, group_size=3): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - self.group_size = group_size - self.d_k = n_feat // n_head # for GroupedAttention - self.n_feat = n_feat - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. 
- """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def pad4group(self, Q, K, V, P, mask, group_size: int = 3): - """ - q: (#batch, time1, size) -> (#batch, head, time1, size/head) - k,v: (#batch, time2, size) -> (#batch, head, time2, size/head) - p: (#batch, time2, size) - """ - # Compute Overflows - overflow_Q = Q.size(2) % group_size - overflow_KV = K.size(2) % group_size - - # if-else for ONNX export - # 0 // 0.00000000000000001 = 0 - # 1 // 1.00000000000000001 = 1 - padding_Q = (group_size - overflow_Q) * int( - overflow_Q // (overflow_Q + 0.00000000000000001)) - padding_KV = (group_size - overflow_KV) * int( - overflow_KV // (overflow_KV + 0.00000000000000001)) - - batch_size, _, seq_len_KV, _ = K.size() - - # Input Padding (B, T, D) -> (B, T + P, D) - Q = F.pad(Q, (0, 0, 0, padding_Q), value=0.0) - K = F.pad(K, (0, 0, 0, padding_KV), value=0.0) - V = F.pad(V, (0, 0, 0, padding_KV), value=0.0) - - if mask is not None and mask.size(2) > 0 : # time2 > 0: - mask = mask[:, ::group_size, ::group_size] - - Q = Q.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - K = K.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - V = V.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - # process pos_emb - P_batch_size = P.size(0) - overflow_P = P.size(1) % group_size - padding_P = group_size - overflow_P if overflow_P else 0 - P = F.pad(P, (0, 0, 0, padding_P), value=0.0) - P = P.view(P_batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - return Q, K, V, P, mask, padding_Q - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - padding_q: Optional[int] = None - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - padding_q : for GroupedAttention in efficent conformer - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. 
jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - - # n_feat!=h*d_k may be happened in GroupAttention - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.n_feat) - ) # (batch, time1, d_model) - if padding_q is not None: - # for GroupedAttention in efficent conformer - x = x[:, :x.size(1) - padding_q] - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q = self.linear_q(query) - k = self.linear_k(key) # (#batch, time2, size) - v = self.linear_v(value) - p = self.linear_pos(pos_emb) # (#batch, time2, size) - - batch_size, seq_len_KV, _ = k.size() # seq_len_KV = time2 - - # (#batch, time2, size) -> (#batch, head, time2, size/head) - q = q.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - k = k.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - v = v.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - if cache.size(0) > 0: - # use attention cache - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - new_cache = torch.cat((k, v), dim=-1) - - # May be k and p does not match. eg. time2=18+18/2=27 > mask=36/2=18 - if mask is not None and mask.size(2) > 0: - time2 = mask.size(2) - k = k[:, :, -time2:, :] - v = v[:, :, -time2:, :] - - # q k v p: (batch, head, time1, d_k) - q, k, v, p, mask, padding_q = self.pad4group(q, k, v, p, mask, self.group_size) - - # q_with_bias_u & q_with_bias_v = (batch, head, time1, d_k) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. 
- # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k * self.group_size) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask, padding_q), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/efficient_conformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/efficient_conformer/convolution.py deleted file mode 100644 index 52d6c1c14c0812ab3957a60a135f644833c2ad95..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/efficient_conformer/convolution.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - stride: int = 1): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - stride (int): Stride Convolution, for efficient Conformer - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. 
- # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=stride, # for depthwise_conv in StrideConv - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - self.stride = stride - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - # When export ONNX,the first cache is not None but all-zero, - # cause shape error in residual block, - # eg. cache14 + x9 = 23, 23-7+1=17 != 9 - cache = cache[:, :, -self.lorder:] - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is requried, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - if mask_pad.size(2) != x.size(2): - mask_pad = mask_pad[:, :, ::self.stride] - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/efficient_conformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/efficient_conformer/encoder.py deleted file mode 100644 index dbd37f53cac86be851e2bb194354fd07eb271f11..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/efficient_conformer/encoder.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from EfficientConformer(https://github.com/burchim/EfficientConformer) -# Paper(https://arxiv.org/abs/2109.01163) - -"""Encoder definition.""" -from typing import Tuple, Optional, List, Union - -import torch -import logging -from typeguard import check_argument_types -import torch.nn.functional as F - -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.encoder_layer import ConformerEncoderLayer - -from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 -from wenet.efficient_conformer.convolution import ConvolutionModule -from wenet.efficient_conformer.attention import GroupedRelPositionMultiHeadedAttention -from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer - -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class EfficientConformerEncoder(torch.nn.Module): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - macaron_style: bool = True, - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - stride_layer_idx: Optional[Union[int, List[int]]] = 3, - stride: Optional[Union[int, List[int]]] = 2, - group_layer_idx: Optional[Union[int, List[int], tuple]] = (0, 1, 2, 3), - group_size: int = 3, - stride_kernel: bool = True, - **kwargs - ): - """Construct Efficient Conformer Encoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - macaron_style (bool): Whether to use macaron style for - positionwise layer. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. - stride_layer_idx (list): layer id with StrideConv, start from 0 - stride (list): stride size of each StrideConv in efficient conformer - group_layer_idx (list): layer id with GroupedAttention, start from 0 - group_size (int): group size of every GroupedAttention layer - stride_kernel (bool): default True. True: recompute cnn kernels with stride. 
- """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d2": - subsampling_class = Conv2dSubsampling2 - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - logging.info(f"input_layer = {input_layer}, " - f"subsampling_class = {subsampling_class}") - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - self.input_layer = input_layer - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - activation = get_activation(activation_type) - self.num_blocks = num_blocks - self.attention_heads = attention_heads - self.cnn_module_kernel = cnn_module_kernel - self.global_chunk_size = 0 - self.chunk_feature_map = 0 - - # efficient conformer configs - self.stride_layer_idx = [stride_layer_idx] \ - if type(stride_layer_idx) == int else stride_layer_idx - self.stride = [stride] \ - if type(stride) == int else stride - self.group_layer_idx = [group_layer_idx] \ - if type(group_layer_idx) == int else group_layer_idx - self.grouped_size = group_size # group size of every GroupedAttention layer - - assert len(self.stride) == len(self.stride_layer_idx) - self.cnn_module_kernels = [cnn_module_kernel] # kernel size of each StridedConv - for i in self.stride: - if stride_kernel: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1] // i) - else: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1]) - - logging.info(f"stride_layer_idx= {self.stride_layer_idx}, " - f"stride = {self.stride}, " - f"cnn_module_kernel = {self.cnn_module_kernels}, " - f"group_layer_idx = {self.group_layer_idx}, " - f"grouped_size = {self.grouped_size}") - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - - # encoder definition - index = 0 - layers = [] - for i in range(num_blocks): - # self-attention module definition - if i in self.group_layer_idx: - encoder_selfattn_layer = GroupedRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - self.grouped_size) - else: - if pos_enc_layer_type == "no_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate) - - # conformer module definition - if i in self.stride_layer_idx: - # conformer block with downsampling - convolution_layer_args_stride = ( - output_size, 
self.cnn_module_kernels[index], activation, - cnn_module_norm, causal, True, self.stride[index]) - layers.append(StrideConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_stride) if use_cnn_module else None, - torch.nn.AvgPool1d( - kernel_size=self.stride[index], stride=self.stride[index], - padding=0, ceil_mode=True, - count_include_pad=False), # pointwise_conv_layer - dropout_rate, - normalize_before, - concat_after, - )) - index = index + 1 - else: - # conformer block - convolution_layer_args_normal = ( - output_size, self.cnn_module_kernels[index], activation, - cnn_module_norm, causal) - layers.append(ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_normal) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - )) - - self.encoders = torch.nn.ModuleList(layers) - - def set_global_chunk_size(self, chunk_size): - """Used in ONNX export. - """ - logging.info(f"set global chunk size: {chunk_size}, default is 0.") - self.global_chunk_size = chunk_size - if self.embed.subsampling_rate == 2: - self.chunk_feature_map = 2 * self.global_chunk_size + 1 - elif self.embed.subsampling_rate == 6: - self.chunk_feature_map = 6 * self.global_chunk_size + 5 - elif self.embed.subsampling_rate == 8: - self.chunk_feature_map = 8 * self.global_chunk_size + 7 - else: - self.chunk_feature_map = 4 * self.global_chunk_size + 3 - - def output_size(self) -> int: - return self._output_size - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - index = 0 # traverse stride - for i, layer in enumerate(self.encoders): - # layer return : x, mask, new_att_cache, new_cnn_cache - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if i in self.stride_layer_idx: - masks = masks[:, :, ::self.stride[index]] - chunk_masks = chunk_masks[:, ::self.stride[index], - ::self.stride[index]] - mask_pad = masks - pos_emb = pos_emb[:, ::self.stride[index], :] - index = index + 1 - - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask : mask matrix of self attention - - Returns: - torch.Tensor: output of current input xs - torch.Tensor: subsampling cache required for next chunk computation - List[torch.Tensor]: encoder layers output cache required for next - chunk computation - List[torch.Tensor]: conformer cnn cache - - """ - assert xs.size(0) == 1 - - # using downsampling factor to recover offset - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - chunk_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - chunk_masks = chunk_masks.unsqueeze(1) # (1, 1, xs-time) - - real_len = 0 - if self.global_chunk_size > 0: - # for ONNX decode simulation, padding xs to chunk_size - real_len = xs.size(1) - pad_len = self.chunk_feature_map - real_len - xs = F.pad(xs, (0, 0, 0, pad_len), value=0.0) - chunk_masks = F.pad(chunk_masks, (0, pad_len), value=0.0) - - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, chunk_masks = self.embed(xs, chunk_masks, offset) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - # shape(pos_emb) = (b=1, chunk_size, emb_size=output_size=hidden-dim) - - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) # batchPad (b=1, 1, time=chunk_size) - - if self.global_chunk_size > 0: - # for ONNX decode simulation - pos_emb = self.embed.position_encoding( - offset=max(offset - cache_t1, 0), - size=cache_t1 + self.global_chunk_size) - att_mask[:, :, -self.global_chunk_size:] = chunk_masks - mask_pad = chunk_masks.to(torch.bool) - else: - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(2) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - att_cache_trunc = xs.size(1) + \ - att_cache.size(2) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i:i + 1, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # 
shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [1, batch, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(0) - - # use repeat_interleave to new_att_cache - new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :]) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.global_chunk_size > 0 and real_len: - chunk_real_len = real_len // self.embed.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - # Keeping 1 more timestep can mitigate information leakage - # from the encoder caused by the padding - xs = xs[:, :chunk_real_len + 1, :] - - return xs, r_att_cache, r_cnn_cache - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - use_onnx=False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. - Args: - xs (torch.Tensor): (1, max_len, dim) - decoding_chunk_size (int): decoding chunk size - num_decoding_left_chunks (int): - use_onnx (bool): True for simulating ONNX model inference. 
- """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if use_onnx: - logging.info("Simulating for ONNX runtime ...") - att_cache: torch.Tensor = torch.zeros( - (self.num_blocks, self.attention_heads, required_cache_size, - self.output_size() // self.attention_heads * 2), - device=xs.device) - cnn_cache: torch.Tensor = torch.zeros( - (self.num_blocks, 1, self.output_size(), self.cnn_module_kernel - 1), - device=xs.device) - self.set_global_chunk_size(chunk_size=decoding_chunk_size) - else: - logging.info("Simulating for JIT runtime ...") - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - logging.info(f"-->> frame chunk msg: cur={cur}, " - f"end={end}, num_frames={end-cur}, " - f"decoding_window={decoding_window}") - if use_onnx: - att_mask: torch.Tensor = torch.ones( - (1, 1, required_cache_size + decoding_chunk_size), - dtype=torch.bool, device=xs.device) - if cur == 0: - att_mask[:, :, :required_cache_size] = 0 - else: - att_mask: torch.Tensor = torch.ones( - (0, 0, 0), dtype=torch.bool, device=xs.device) - - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - outputs.append(y) - offset += y.size(1) - - ys = torch.cat(outputs, 1) - masks = torch.ones(1, 1, ys.size(1), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/efficient_conformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/efficient_conformer/encoder_layer.py deleted file mode 100644 index 3a88ec9fca9797664ce89566e6c1d28a8f0ad5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/efficient_conformer/encoder_layer.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple -import torch -from torch import nn - - -class StrideConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. 
- self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - pointwise_conv_layer: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.pointwise_conv_layer = pointwise_conv_layer - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - - # add pointwise_conv for efficient conformer - # pointwise_conv_layer does not change shape - if self.pointwise_conv_layer is not None: - residual = residual.transpose(1, 2) - residual = self.pointwise_conv_layer(residual) - residual = residual.transpose(1, 2) - assert residual.size(0) == x.size(0) - assert residual.size(1) == x.size(1) - assert residual.size(2) == x.size(2) - - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/efficient_conformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/efficient_conformer/subsampling.py deleted file mode 100644 index 98b2c2228eac8e77586110686c48a7b0141458c9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/efficient_conformer/subsampling.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch -from wenet.transformer.subsampling import BaseSubsampling - - -class Conv2dSubsampling2(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU() - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * ((idim - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 2 - # 2 = (3 - 1) * 1 - self.right_context = 2 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 2. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 2. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, :-2:2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/squeezeformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/squeezeformer/attention.py deleted file mode 100644 index 97412badbe8e2c5caec81c0636d15be3f80d6b84..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/squeezeformer/attention.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 Ximalaya Inc. (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -import torch -import torch.nn as nn -from wenet.transformer.attention import MultiHeadedAttention -from typing import Tuple - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- """ - - def __init__(self, n_head, n_feat, dropout_rate, - do_rel_shift=False, adaptive_scale=False, init_weights=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.do_rel_shift = do_rel_shift - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - self.adaptive_scale = adaptive_scale - self.ada_scale = nn.Parameter( - torch.ones([1, 1, n_feat]), requires_grad=adaptive_scale) - self.ada_bias = nn.Parameter( - torch.zeros([1, 1, n_feat]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - input_max = (self.h * self.d_k) ** -0.5 - torch.nn.init.uniform_(self.linear_q.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_q.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_pos.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.bias, -input_max, input_max) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. 
pytorch training - if mask.size(2) > 0: # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - # (batch, head, time1, time2) - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - if self.adaptive_scale: - query = self.ada_scale * query + self.ada_bias - key = self.ada_scale * key + self.ada_bias - value = self.ada_scale * value + self.ada_bias - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - if self.do_rel_shift: - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/squeezeformer/conv2d.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/squeezeformer/conv2d.py deleted file mode 100644 index c230263396392d72f36c56d645338f2d576db898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/squeezeformer/conv2d.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Conv2d Module with Valid Padding""" - -import torch.nn.functional as F -from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional - - -class Conv2dValid(_ConvNd): - """ - Conv2d operator for VALID mode padding. 
- """ - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', # TODO: refine this type - device=None, - dtype=None, - valid_trigx: bool = False, - valid_trigy: bool = False - ) -> None: - factory_kwargs = {'device': device, 'dtype': dtype} - kernel_size_ = _pair(kernel_size) - stride_ = _pair(stride) - padding_ = padding if isinstance(padding, str) else _pair(padding) - dilation_ = _pair(dilation) - super(Conv2dValid, self).__init__( - in_channels, out_channels, kernel_size_, - stride_, padding_, dilation_, False, _pair(0), - groups, bias, padding_mode, **factory_kwargs) - self.valid_trigx = valid_trigx - self.valid_trigy = valid_trigy - - def _conv_forward( - self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): - validx, validy = 0, 0 - if self.valid_trigx: - validx = (input.size(-2) * (self.stride[-2] - 1) - 1 - + self.kernel_size[-2]) // 2 - if self.valid_trigy: - validy = (input.size(-1) * (self.stride[-1] - 1) - 1 - + self.kernel_size[-1]) // 2 - return F.conv2d(input, weight, bias, self.stride, - (validx, validy), self.dilation, self.groups) - - def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight, self.bias) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/squeezeformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/squeezeformer/convolution.py deleted file mode 100644 index 6da2ee8c98ed58fae66d66c892041037f0d6bc3a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/squeezeformer/convolution.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. 
- causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - self.bias = bias - self.channels = channels - self.kernel_size = kernel_size - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, channels]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, channels]), requires_grad=adaptive_scale) - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - if init_weights: - self.init_weights() - - def init_weights(self): - pw_max = self.channels ** -0.5 - dw_max = self.kernel_size ** -0.5 - torch.nn.init.uniform_(self.pointwise_conv1.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv1.bias.data, -pw_max, pw_max) - torch.nn.init.uniform_(self.depthwise_conv.weight.data, -dw_max, dw_max) - if self.bias: - torch.nn.init.uniform_(self.depthwise_conv.bias.data, -dw_max, dw_max) - torch.nn.init.uniform_(self.pointwise_conv2.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv2.bias.data, -pw_max, pw_max) - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - if self.adaptive_scale: - x = self.ada_scale * x + self.ada_bias - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/squeezeformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/squeezeformer/encoder.py deleted file mode 100644 index f13038321ae6c07d484a617aee7d83ed07742510..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/squeezeformer/encoder.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -import torch -import torch.nn as nn -from typing import Tuple, Union, Optional, List -from wenet.squeezeformer.subsampling \ - import DepthwiseConv2dSubsampling4, TimeReductionLayer1D, \ - TimeReductionLayer2D, TimeReductionLayerStream -from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.attention import MultiHeadedAttention -from wenet.squeezeformer.attention import RelPositionMultiHeadedAttention -from wenet.squeezeformer.positionwise_feed_forward \ - import PositionwiseFeedForward -from wenet.squeezeformer.convolution import ConvolutionModule -from wenet.utils.mask import make_pad_mask, add_optional_chunk_mask -from wenet.utils.common import get_activation - - -class SqueezeformerEncoder(nn.Module): - def __init__( - self, - input_size: int = 80, - encoder_dim: int = 256, - output_size: int = 256, - attention_heads: int = 4, - num_blocks: int = 12, - reduce_idx: Optional[Union[int, List[int]]] = 5, - recover_idx: Optional[Union[int, List[int]]] = 11, - feed_forward_expansion_factor: int = 4, - dw_stride: bool = False, - input_dropout_rate: float = 0.1, - pos_enc_layer_type: str = "rel_pos", - time_reduction_layer_type: str = "conv1d", - do_rel_shift: bool = True, - feed_forward_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.1, - cnn_module_kernel: int = 31, - cnn_norm_type: str = "batch_norm", - dropout: float = 0.1, - causal: bool = False, - adaptive_scale: bool = True, - activation_type: str = "swish", - init_weights: bool = True, - global_cmvn: torch.nn.Module = None, - normalize_before: bool = False, - use_dynamic_chunk: bool = False, - concat_after: bool = False, - 
static_chunk_size: int = 0, - use_dynamic_left_chunk: bool = False - ): - """Construct SqueezeformerEncoder - - Args: - input_size to use_dynamic_chunk, see in Transformer BaseEncoder. - encoder_dim (int): The hidden dimension of encoder layer. - output_size (int): The output dimension of final projection layer. - attention_heads (int): Num of attention head in attention module. - num_blocks (int): Num of encoder layers. - reduce_idx Optional[Union[int, List[int]]]: - reduce layer index, from 40ms to 80ms per frame. - recover_idx Optional[Union[int, List[int]]]: - recover layer index, from 80ms to 40ms per frame. - feed_forward_expansion_factor (int): Enlarge coefficient of FFN. - dw_stride (bool): Whether do depthwise convolution - on subsampling module. - input_dropout_rate (float): Dropout rate of input projection layer. - pos_enc_layer_type (str): Self attention type. - time_reduction_layer_type (str): Conv1d or Conv2d reduction layer. - do_rel_shift (bool): Whether to do relative shift - operation on rel-attention module. - cnn_module_kernel (int): Kernel size of CNN module. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - adaptive_scale (bool): Whether to use adaptive scale. - init_weights (bool): Whether to initialize weights. - causal (bool): whether to use causal convolution or not. - """ - super(SqueezeformerEncoder, self).__init__() - self.global_cmvn = global_cmvn - self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ - if type(reduce_idx) == int else reduce_idx - self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \ - if type(recover_idx) == int else recover_idx - self.check_ascending_list() - if reduce_idx is None: - self.time_reduce = None - else: - if recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - self.reduce_stride = 2 - self._output_size = output_size - self.normalize_before = normalize_before - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - self.pos_enc_layer_type = pos_enc_layer_type - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - encoder_dim, - attention_dropout_rate, - do_rel_shift, - adaptive_scale, - init_weights - ) - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - encoder_dim, - encoder_dim * feed_forward_expansion_factor, - feed_forward_dropout_rate, - activation, - adaptive_scale, - init_weights - ) - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = ( - encoder_dim, cnn_module_kernel, activation, - cnn_norm_type, causal, True, adaptive_scale, init_weights) - - self.embed = DepthwiseConv2dSubsampling4( - 1, encoder_dim, - RelPositionalEncoding(encoder_dim, dropout_rate=0.1), - dw_stride, - input_size, - input_dropout_rate, - init_weights - ) - - self.preln = nn.LayerNorm(encoder_dim) - self.encoders = 
torch.nn.ModuleList([SqueezeformerEncoderLayer( - encoder_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - convolution_layer(*convolution_layer_args), - positionwise_layer(*positionwise_layer_args), - normalize_before, - dropout, - concat_after) for _ in range(num_blocks) - ]) - if time_reduction_layer_type == 'conv1d': - time_reduction_layer = TimeReductionLayer1D - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - elif time_reduction_layer_type == 'stream': - time_reduction_layer = TimeReductionLayerStream - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - else: - time_reduction_layer = TimeReductionLayer2D - time_reduction_layer_args = {'encoder_dim': encoder_dim} - - self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args) - self.time_recover_layer = nn.Linear(encoder_dim, encoder_dim) - self.final_proj = None - if output_size != encoder_dim: - self.final_proj = nn.Linear(encoder_dim, output_size) - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - xs_lens = mask_pad.squeeze(1).sum(1) - xs = self.preln(xs) - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - for i, layer in enumerate(self.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, chunk_masks, pos_emb, mask_pad)) - xs, xs_lens, chunk_masks, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_chunk_masks, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - chunk_masks = recover_chunk_masks - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - xs = xs.masked_fill(~mask_pad[:, 0, :].unsqueeze(-1), 0.0) - - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return xs, masks - - def check_ascending_list(self): - if self.reduce_idx is not None: - assert self.reduce_idx == sorted(self.reduce_idx), \ - "reduce_idx should be int or ascending list" - if self.recover_idx is not None: - assert self.recover_idx == sorted(self.recover_idx), \ - "recover_idx should be int or ascending list" - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp 
= exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - recover_exp)) - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.preln(xs) - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, 
recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - if att_mask.size(1) != 0: - xs = xs.masked_fill(~att_mask[:, 0, :].unsqueeze(-1), 0.0) - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(0) - cached_att = cached_att.unsqueeze(3).\ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :]) - r_cnn_cache.append(cached_cnn) - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/squeezeformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/squeezeformer/encoder_layer.py deleted file mode 100644 index 3c6bdd44a20447cea91c0f965c666b844f4264be..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/squeezeformer/encoder_layer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""SqueezeformerEncoderLayer definition.""" - -import torch -import torch.nn as nn -from typing import Optional, Tuple - - -class SqueezeformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward1 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - feed_forward2 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. 
-    """
-
-    def __init__(
-            self,
-            size: int,
-            self_attn: torch.nn.Module,
-            feed_forward1: Optional[nn.Module] = None,
-            conv_module: Optional[nn.Module] = None,
-            feed_forward2: Optional[nn.Module] = None,
-            normalize_before: bool = False,
-            dropout_rate: float = 0.1,
-            concat_after: bool = False,
-    ):
-        super(SqueezeformerEncoderLayer, self).__init__()
-        self.size = size
-        self.self_attn = self_attn
-        self.layer_norm1 = nn.LayerNorm(size)
-        self.ffn1 = feed_forward1
-        self.layer_norm2 = nn.LayerNorm(size)
-        self.conv_module = conv_module
-        self.layer_norm3 = nn.LayerNorm(size)
-        self.ffn2 = feed_forward2
-        self.layer_norm4 = nn.LayerNorm(size)
-        self.normalize_before = normalize_before
-        self.dropout = nn.Dropout(dropout_rate)
-        self.concat_after = concat_after
-        if concat_after:
-            self.concat_linear = nn.Linear(size + size, size)
-        else:
-            self.concat_linear = nn.Identity()
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        mask: torch.Tensor,
-        pos_emb: torch.Tensor,
-        mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
-        att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
-        cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        # self attention module
-        residual = x
-        if self.normalize_before:
-            x = self.layer_norm1(x)
-        x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache)
-        if self.concat_after:
-            x_concat = torch.cat((x, x_att), dim=-1)
-            x = residual + self.concat_linear(x_concat)
-        else:
-            x = residual + self.dropout(x_att)
-        if not self.normalize_before:
-            x = self.layer_norm1(x)
-
-        # ffn module
-        residual = x
-        if self.normalize_before:
-            x = self.layer_norm2(x)
-        x = self.ffn1(x)
-        x = residual + self.dropout(x)
-        if not self.normalize_before:
-            x = self.layer_norm2(x)
-
-        # conv module
-        new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
-        residual = x
-        if self.normalize_before:
-            x = self.layer_norm3(x)
-        x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache)
-        x = residual + self.dropout(x)
-        if not self.normalize_before:
-            x = self.layer_norm3(x)
-
-        # ffn module
-        residual = x
-        if self.normalize_before:
-            x = self.layer_norm4(x)
-        x = self.ffn2(x)
-        # we do not use dropout here since it is inside feed forward function
-        x = residual + self.dropout(x)
-        if not self.normalize_before:
-            x = self.layer_norm4(x)
-
-        return x, mask, new_att_cache, new_cnn_cache
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/squeezeformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/squeezeformer/positionwise_feed_forward.py
deleted file mode 100644
index 289062dcf3189f79a5ebb206990160d8665c613c..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/squeezeformer/positionwise_feed_forward.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright (c) 2019 Shigeki Karita
-#               2020 Mobvoi Inc (Binbin Zhang)
-#               2022 Ximalaya Inc (Yuguang Yang)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU(), - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.idim = idim - self.hidden_units = hidden_units - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - self.ada_scale = None - self.ada_bias = None - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, idim]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, idim]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - ffn1_max = self.idim ** -0.5 - ffn2_max = self.hidden_units ** -0.5 - torch.nn.init.uniform_(self.w_1.weight.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_1.bias.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_2.weight.data, -ffn2_max, ffn2_max) - torch.nn.init.uniform_(self.w_2.bias.data, -ffn2_max, ffn2_max) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - if self.adaptive_scale: - xs = self.ada_scale * xs + self.ada_bias - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/squeezeformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/squeezeformer/subsampling.py deleted file mode 100644 index fdb0101d6ebb54c42e710bbb0f35a6f7615ca567..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/squeezeformer/subsampling.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -"""DepthwiseConv2dSubsampling4 and TimeReductionLayer definition.""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -from wenet.transformer.subsampling import BaseSubsampling -from typing import Tuple -from wenet.squeezeformer.conv2d import Conv2dValid - - -class DepthwiseConv2dSubsampling4(BaseSubsampling): - """Depthwise Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - pos_enc_class (nn.Module): position encoding class. - dw_stride (int): Whether do depthwise convolution. - input_size (int): filter bank dimension. - - """ - - def __init__( - self, idim: int, odim: int, - pos_enc_class: torch.nn.Module, - dw_stride: bool = False, - input_size: int = 80, - input_dropout_rate: float = 0.1, - init_weights: bool = True - ): - super(DepthwiseConv2dSubsampling4, self).__init__() - self.idim = idim - self.odim = odim - self.pw_conv = nn.Conv2d( - in_channels=idim, out_channels=odim, kernel_size=3, stride=2) - self.act1 = nn.ReLU() - self.dw_conv = nn.Conv2d( - in_channels=odim, out_channels=odim, kernel_size=3, stride=2, - groups=odim if dw_stride else 1 - ) - self.act2 = nn.ReLU() - self.pos_enc = pos_enc_class - self.input_proj = nn.Sequential( - nn.Linear( - odim * (((input_size - 1) // 2 - 1) // 2), odim), - nn.Dropout(p=input_dropout_rate), - ) - if init_weights: - linear_max = (odim * input_size / 4) ** -0.5 - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.weight'], -linear_max, linear_max) - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.bias'], -linear_max, linear_max) - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: int = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.pw_conv(x) - x = self.act1(x) - x = self.dw_conv(x) - x = self.act2(x) - b, c, t, f = x.size() - x = x.permute(0, 2, 1, 3) - x = x.contiguous().view(b, t, c * f) - x, pos_emb = self.pos_enc(x, offset) - x = self.input_proj(x) - return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] - - -class TimeReductionLayer1D(nn.Module): - """ - Modified NeMo, - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. 
- """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 5, stride: int = 2): - super(TimeReductionLayer1D, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - self.padding = max(0, self.kernel_size - self.stride) - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. - if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayer2D(nn.Module): - def __init__( - self, kernel_size: int = 5, stride: int = 2, encoder_dim: int = 256): - super(TimeReductionLayer2D, self).__init__() - self.encoder_dim = encoder_dim - self.kernel_size = kernel_size - self.dw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=(kernel_size, 1), - stride=stride, - valid_trigy=True - ) - self.pw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=1, - stride=1, - valid_trigx=False, - valid_trigy=False, - ) - - self.kernel_size = kernel_size - self.stride = stride - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.encoder_dim ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward( - self, xs: torch.Tensor, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - xs = xs.masked_fill(mask_pad.transpose(1, 2).eq(0), 0.0) - xs = xs.unsqueeze(2) - padding1 = self.kernel_size - self.stride - xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), - mode='constant', value=0.) 
- xs = self.dw_conv(xs.permute(0, 3, 1, 2)) - xs = self.pw_conv(xs).permute(0, 3, 2, 1).squeeze(1).contiguous() - tmp_length = xs.size(1) - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - padding2 = max(0, (xs_lens.max() - tmp_length).data.item()) - batch_size, hidden = xs.size(0), xs.size(-1) - dummy_pad = torch.zeros(batch_size, padding2, hidden, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - mask = mask[:, ::2, ::2] - mask_pad = mask_pad[:, :, ::2] - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayerStream(nn.Module): - """ - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. - """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 1, stride: int = 2): - super(TimeReductionLayerStream, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=0, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. 
- if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transducer/joint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transducer/joint.py deleted file mode 100644 index f7cbaf62ee0bf4ffa127e5bbf4a49a64c2378495..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transducer/joint.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Optional - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation - - -class TransducerJoint(torch.nn.Module): - - def __init__(self, - voca_size: int, - enc_output_size: int, - pred_output_size: int, - join_dim: int, - prejoin_linear: bool = True, - postjoin_linear: bool = False, - joint_mode: str = 'add', - activation: str = "tanh"): - assert check_argument_types() - # TODO(Mddct): concat in future - assert joint_mode in ['add'] - super().__init__() - - self.activatoin = get_activation(activation) - self.prejoin_linear = prejoin_linear - self.postjoin_linear = postjoin_linear - self.joint_mode = joint_mode - - if not self.prejoin_linear and not self.postjoin_linear: - assert enc_output_size == pred_output_size == join_dim - # torchscript compatibility - self.enc_ffn: Optional[nn.Linear] = None - self.pred_ffn: Optional[nn.Linear] = None - if self.prejoin_linear: - self.enc_ffn = nn.Linear(enc_output_size, join_dim) - self.pred_ffn = nn.Linear(pred_output_size, join_dim) - # torchscript compatibility - self.post_ffn: Optional[nn.Linear] = None - if self.postjoin_linear: - self.post_ffn = nn.Linear(join_dim, join_dim) - - self.ffn_out = nn.Linear(join_dim, voca_size) - - def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor): - """ - Args: - enc_out (torch.Tensor): [B, T, E] - pred_out (torch.Tensor): [B, T, P] - Return: - [B,T,U,V] - """ - if (self.prejoin_linear and self.enc_ffn is not None - and self.pred_ffn is not None): - enc_out = self.enc_ffn(enc_out) # [B,T,E] -> [B,T,V] - pred_out = self.pred_ffn(pred_out) - - enc_out = enc_out.unsqueeze(2) # [B,T,V] -> [B,T,1,V] - pred_out = pred_out.unsqueeze(1) # [B,U,V] -> [B,1 U, V] - - # TODO(Mddct): concat joint - _ = self.joint_mode - out = enc_out + pred_out # [B,T,U,V] - - if self.postjoin_linear and self.post_ffn is not None: - out = self.post_ffn(out) - - out = self.activatoin(out) - out = self.ffn_out(out) - return out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transducer/predictor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transducer/predictor.py deleted file mode 100644 index 600e97a9d83646047ec3fc14f3087bd4df761c68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transducer/predictor.py +++ /dev/null @@ -1,482 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation, get_rnn - - -def ApplyPadding(input, padding, pad_value) -> torch.Tensor: - """ - Args: - input: [bs, max_time_step, dim] - padding: [bs, max_time_step] - """ - return padding * pad_value + 
input * (1 - padding) - - -class PredictorBase(torch.nn.Module): - - # NOTE(Mddct): We can use ABC abstract here, but - # keep this class simple enough for now - def __init__(self) -> None: - super().__init__() - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - _, _, _ = batch_size, method, device - raise NotImplementedError("this is a base precictor") - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ): - _, _, = input, cache - raise NotImplementedError("this is a base precictor") - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - _, _, _, = input, padding, cache - raise NotImplementedError("this is a base precictor") - - -class RNNPredictor(PredictorBase): - - def __init__(self, - voca_size: int, - embed_size: int, - output_size: int, - embed_dropout: float, - hidden_size: int, - num_layers: int, - bias: bool = True, - rnn_type: str = "lstm", - dropout: float = 0.1) -> None: - assert check_argument_types() - super().__init__() - self.n_layers = num_layers - self.hidden_size = hidden_size - # disable rnn base out projection - self.embed = nn.Embedding(voca_size, embed_size) - self.dropout = nn.Dropout(embed_dropout) - # NOTE(Mddct): rnn base from torch not support layer norm - # will add layer norm and prune value in cell and layer - # ref: https://github.com/Mddct/neural-lm/blob/main/models/gru_cell.py - self.rnn = get_rnn(rnn_type=rnn_type)(input_size=embed_size, - hidden_size=hidden_size, - num_layers=num_layers, - bias=bias, - batch_first=True, - dropout=dropout) - self.projection = nn.Linear(hidden_size, output_size) - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> torch.Tensor: - """ - Args: - input (torch.Tensor): [batch, max_time). - padding (torch.Tensor): [batch, max_time] - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - Returns: - output: [batch, max_time, output_size] - """ - - # NOTE(Mddct): we don't use pack input format - embed = self.embed(input) # [batch, max_time, emb_size] - embed = self.dropout(embed) - states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None - if cache is None: - state = self.init_state(batch_size=input.size(0), - device=input.device) - states = (state[0], state[1]) - else: - assert len(cache) == 2 - states = (cache[0], cache[1]) - out, (m, c) = self.rnn(embed, states) - out = self.projection(out) - - # NOTE(Mddct): Although we don't use staate in transducer - # training forward, we need make it right for padding value - # so we create forward_step for infering, forward for training - _, _ = m, c - return out - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache: [state_m, state_c] - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - Returns: - new_cache: [[state_m_1, state_c_1], [state_m_2, state_c_2]...] 
- """ - assert len(cache) == 2 - state_ms = cache[0] - state_cs = cache[1] - - assert state_ms.size(1) == state_cs.size(1) - - new_cache: List[List[torch.Tensor]] = [] - for state_m, state_c in zip(torch.split(state_ms, 1, dim=1), - torch.split(state_cs, 1, dim=1)): - new_cache.append([state_m, state_c]) - return new_cache - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[state_m_1, state_c_1], [state_m_1, state_c_1]...] - - Returns: - new_caceh: [state_ms, state_cs], - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - """ - state_ms = torch.cat([states[0] for states in cache], dim=1) - state_cs = torch.cat([states[1] for states in cache], dim=1) - return [state_ms, state_cs] - - def init_state( - self, - batch_size: int, - device: torch.device, - method: str = "zero", - ) -> List[torch.Tensor]: - assert batch_size > 0 - # TODO(Mddct): xavier init method - _ = method - return [ - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device), - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device) - ] - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - """ - assert len(cache) == 2 - state_m, state_c = cache[0], cache[1] - embed = self.embed(input) # [batch, 1, emb_size] - embed = self.dropout(embed) - out, (m, c) = self.rnn(embed, (state_m, state_c)) - - out = self.projection(out) - m = ApplyPadding(m, padding.unsqueeze(0), state_m) - c = ApplyPadding(c, padding.unsqueeze(0), state_c) - - return (out, [m, c]) - - -class EmbeddingPredictor(PredictorBase): - """Embedding predictor - - Described in: - https://arxiv.org/pdf/2109.07513.pdf - - embed-> proj -> layer norm -> swish - """ - - def __init__(self, - voca_size: int, - embed_size: int, - embed_dropout: float, - n_head: int, - history_size: int = 2, - activation: str = "swish", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - - assert check_argument_types() - super().__init__() - # multi head - self.num_heads = n_head - self.embed_size = embed_size - self.context_size = history_size + 1 - self.pos_embed = torch.nn.Linear(embed_size * self.context_size, - self.num_heads, - bias=bias) - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.ffn = nn.Linear(self.embed_size, self.embed_size) - self.norm = nn.LayerNorm(self.embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - _ = method - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device), - ] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] 
- """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - - input = input.unfold(1, self.context_size, 1).permute( - 0, 1, 3, 2) # [bs, seq_len, context_size, embed] - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - # broadcast dot attenton - input_expand = input.unsqueeze( - 2) # [bs, seq_len, 1, context_size, embed] - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - - # [bs, seq_len, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, seq_len, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, seq_len, num_heads, embed] - output = output.sum(dim=2) # [bs, seq_len, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - return output - - def forward_step( - self, - input: torch.Tensor, - padding: torch.Tensor, - cache: List[torch.Tensor], - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input_expand = context_input.unsqueeze(1).unsqueeze( - 2) # [bs, 1, 1, context_size, embed] - - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - # [bs, 1, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, 1, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, 1, num_heads, embed] - output = output.sum(dim=2) # [bs, 1, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - new_cache = context_input[:, 1:, :] - # TODO(Mddct): we need padding new_cache in future - # new_cache = ApplyPadding(history, padding, new_cache) - return (output, [new_cache]) - - -class ConvPredictor(PredictorBase): - - def __init__(self, - voca_size: 
int, - embed_size: int, - embed_dropout: float, - history_size: int = 2, - activation: str = "relu", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - assert check_argument_types() - super().__init__() - - assert history_size >= 0 - self.embed_size = embed_size - self.context_size = history_size + 1 - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.conv = nn.Conv1d(in_channels=embed_size, - out_channels=embed_size, - kernel_size=self.context_size, - padding=0, - groups=embed_size, - bias=bias) - self.norm = nn.LayerNorm(embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - assert method == "zero" - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device) - ] - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] - """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - input = input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - return out - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input = context_input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - - new_cache = context_input[:, 1:, :] - # TODO(Mddct): apply padding in future - return (out, [new_cache]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transducer/search/greedy_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transducer/search/greedy_search.py deleted file mode 100644 index ef7354562b6617b7be33bf32d673117eb1d3d547..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transducer/search/greedy_search.py +++ /dev/null @@ -1,54 +0,0 @@ -from 
typing import List - -import torch - - -def basic_greedy_search( - model: torch.nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - n_steps: int = 64, -) -> List[List[int]]: - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, - method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = n_steps - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, padding, - cache) # [1, 1, P] - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, - pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - - return [hyps] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transducer/search/prefix_beam_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transducer/search/prefix_beam_search.py deleted file mode 100644 index f00917717c16a73916586708ebfede54fa02a21f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transducer/search/prefix_beam_search.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import List, Tuple - -import torch -from wenet.utils.common import log_add - - -class Sequence(): - - __slots__ = {'hyp', 'score', 'cache'} - - def __init__( - self, - hyp: List[torch.Tensor], - score, - cache: List[torch.Tensor], - ): - self.hyp = hyp - self.score = score - self.cache = cache - - -class PrefixBeamSearch(): - - def __init__(self, encoder, predictor, joint, ctc, blank): - self.encoder = encoder - self.predictor = predictor - self.joint = joint - self.ctc = ctc - self.blank = blank - - def forward_decoder_one_step( - self, encoder_x: torch.Tensor, pre_t: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - padding = torch.zeros(pre_t.size(0), 1, device=encoder_x.device) - pre_t, new_cache = self.predictor.forward_step(pre_t.unsqueeze(-1), - padding, cache) - x = self.joint(encoder_x, pre_t) # [beam, 1, 1, vocab] - x = x.log_softmax(dim=-1) - return x, new_cache - - def prefix_beam_search(self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7): - """prefix beam search - also see wenet.transducer.transducer.beam_search - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - assert 
batch_size == 1 - - # 1. Encoder - encoder_out, _ = self.encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - - ctc_probs = self.ctc.log_softmax(encoder_out).squeeze(0) - beam_init: List[Sequence] = [] - - # 2. init beam using Sequence to save beam unit - cache = self.predictor.init_state(1, method="zero", device=device) - beam_init.append(Sequence(hyp=[self.blank], score=0.0, cache=cache)) - # 3. start decoding (notice: we use breathwise first searching) - # !!!! In this decoding method: one frame do not output multi units. !!!! - # !!!! Experiments show that this strategy has little impact !!!! - for i in range(maxlen): - # 3.1 building input - # decoder taking the last token to predict the next token - input_hyp = [s.hyp[-1] for s in beam_init] - input_hyp_tensor = torch.tensor(input_hyp, - dtype=torch.int, - device=device) - # building statement from beam - cache_batch = self.predictor.cache_to_batch( - [s.cache for s in beam_init]) - # build score tensor to do torch.add() function - scores = torch.tensor([s.score for s in beam_init]).to(device) - - # 3.2 forward decoder - logp, new_cache = self.forward_decoder_one_step( - encoder_out[:, i, :].unsqueeze(1), - input_hyp_tensor, - cache_batch, - ) # logp: (N, 1, 1, vocab_size) - logp = logp.squeeze(1).squeeze(1) # logp: (N, vocab_size) - new_cache = self.predictor.batch_to_cache(new_cache) - - # 3.3 shallow fusion for transducer score - # and ctc score where we can also add the LM score - logp = torch.log( - torch.add(transducer_weight * torch.exp(logp), - ctc_weight * torch.exp(ctc_probs[i].unsqueeze(0)))) - - # 3.4 first beam prune - top_k_logp, top_k_index = logp.topk(beam_size) # (N, N) - scores = torch.add(scores.unsqueeze(1), top_k_logp) - - # 3.5 generate new beam (N*N) - beam_A = [] - for j in range(len(beam_init)): - # update seq - base_seq = beam_init[j] - for t in range(beam_size): - # blank: only update the score - if top_k_index[j, t] == self.blank: - new_seq = Sequence(hyp=base_seq.hyp.copy(), - score=scores[j, t].item(), - cache=base_seq.cache) - - beam_A.append(new_seq) - # other unit: update hyp score statement and last - else: - hyp_new = base_seq.hyp.copy() - hyp_new.append(top_k_index[j, t].item()) - new_seq = Sequence(hyp=hyp_new, - score=scores[j, t].item(), - cache=new_cache[j]) - beam_A.append(new_seq) - - # 3.6 prefix fusion - fusion_A = [beam_A[0]] - for j in range(1, len(beam_A)): - s1 = beam_A[j] - if_do_append = True - for t in range(len(fusion_A)): - # notice: A_ can not fusion with A - if s1.hyp == fusion_A[t].hyp: - fusion_A[t].score = log_add( - [fusion_A[t].score, s1.score]) - if_do_append = False - break - if if_do_append: - fusion_A.append(s1) - - # 4. 
second pruned - fusion_A.sort(key=lambda x: x.score, reverse=True) - beam_init = fusion_A[:beam_size] - - return beam_init, encoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transducer/transducer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transducer/transducer.py deleted file mode 100644 index 821a0946e621353a18bededbd93a658e83b0e0e2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transducer/transducer.py +++ /dev/null @@ -1,453 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchaudio -from torch import nn -from torch.nn.utils.rnn import pad_sequence -from typeguard import check_argument_types -from wenet.transducer.predictor import PredictorBase -from wenet.transducer.search.greedy_search import basic_greedy_search -from wenet.transducer.search.prefix_beam_search import PrefixBeamSearch -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_blank, add_sos_eos, - reverse_pad_list) - - -class Transducer(ASRModel): - """Transducer-ctc-attention hybrid Encoder-Predictor-Decoder model""" - - def __init__( - self, - vocab_size: int, - blank: int, - encoder: nn.Module, - predictor: PredictorBase, - joint: nn.Module, - attention_decoder: Optional[Union[TransformerDecoder, - BiTransformerDecoder]] = None, - ctc: Optional[CTC] = None, - ctc_weight: float = 0, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - transducer_weight: float = 1.0, - attention_weight: float = 0.0, - ) -> None: - assert check_argument_types() - assert attention_weight + ctc_weight + transducer_weight == 1.0 - super().__init__(vocab_size, encoder, attention_decoder, ctc, - ctc_weight, ignore_id, reverse_weight, lsm_weight, - length_normalized_loss) - - self.blank = blank - self.transducer_weight = transducer_weight - self.attention_decoder_weight = 1 - self.transducer_weight - self.ctc_weight - - self.predictor = predictor - self.joint = joint - self.bs = None - - # Note(Mddct): decoder also means predictor in transducer, - # but here decoder is attention decoder - del self.criterion_att - if attention_decoder is not None: - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + predictor + joint + loss - - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - - # Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - # predictor - ys_in_pad = add_blank(text, self.blank, self.ignore_id) - predictor_out = self.predictor(ys_in_pad) - # joint - joint_out = self.joint(encoder_out, predictor_out) - # NOTE(Mddct): some loss implementation require pad valid is zero - # torch.int32 rnnt_loss required - rnnt_text = text.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - rnnt_text_lengths = text_lengths.to(torch.int32) - encoder_out_lens = encoder_out_lens.to(torch.int32) - loss = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - encoder_out_lens, - rnnt_text_lengths, - blank=self.blank, - reduction="mean") - loss_rnnt = loss - - loss = self.transducer_weight * loss - # optional attention decoder - loss_att: Optional[torch.Tensor] = None - if self.attention_decoder_weight != 0.0 and self.decoder is not None: - loss_att, _ = self._calc_att_loss(encoder_out, encoder_mask, text, - text_lengths) - - # optional ctc - loss_ctc: Optional[torch.Tensor] = None - if self.ctc_weight != 0.0 and self.ctc is not None: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is not None: - loss = loss + self.ctc_weight * loss_ctc.sum() - if loss_att is not None: - loss = loss + self.attention_decoder_weight * loss_att.sum() - # NOTE: 'loss' must be in dict - return { - 'loss': loss, - 'loss_att': loss_att, - 'loss_ctc': loss_ctc, - 'loss_rnnt': loss_rnnt, - } - - def init_bs(self): - if self.bs is None: - self.bs = PrefixBeamSearch(self.encoder, self.predictor, - self.joint, self.ctc, self.blank) - - def _cal_transducer_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_lens: torch.Tensor, - hyps_pad: torch.Tensor, - ): - # ignore id -> blank, add blank at head - hyps_pad_blank = add_blank(hyps_pad, self.blank, self.ignore_id) - xs_in_lens = encoder_mask.squeeze(1).sum(1).int() - - # 1. Forward predictor - predictor_out = self.predictor(hyps_pad_blank) - # 2. Forward joint - joint_out = self.joint(encoder_out, predictor_out) - rnnt_text = hyps_pad.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - # 3. 
Compute transducer loss - loss_td = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - xs_in_lens, - hyps_lens.int(), - blank=self.blank, - reduction='none') - return loss_td * -1 - - def _cal_attn_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_pad: torch.Tensor, - hyps_lens: torch.Tensor, - ): - # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - - # td_score = loss_td * -1 - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - self.reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - return decoder_out, r_decoder_out - - def beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7, - ): - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight in transducer - prefix beam search. - final_prob = ctc_weight * ctc_prob + transducer_weight * transducer_prob - transducer_weight (float): transducer probability weight in - prefix beam search - Returns: - List[List[int]]: best path result - - """ - self.init_bs() - beam, _ = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size, - beam_size, - num_decoding_left_chunks, - simulate_streaming, - ctc_weight, - transducer_weight, - ) - return beam[0].hyp[1:], beam[0].score - - def transducer_attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ctc_weight: float = 0.0, - attn_weight: float = 0.0, - transducer_weight: float = 0.0, - search_ctc_weight: float = 1.0, - search_transducer_weight: float = 0.0, - beam_search_type: str = 'transducer') -> List[List[int]]: - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight using in rescoring. - rescore_prob = ctc_weight * ctc_prob + - transducer_weight * (transducer_loss * -1) + - attn_weight * attn_prob - attn_weight (float): attn probability weight using in rescoring. - transducer_weight (float): transducer probability weight using in - rescoring - search_ctc_weight (float): ctc weight using - in rnnt beam search (seeing in self.beam_search) - search_transducer_weight (float): transducer weight using - in rnnt beam search (seeing in self.beam_search) - Returns: - List[List[int]]: best path result - - """ - - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - self.init_bs() - if beam_search_type == 'transducer': - beam, encoder_out = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - beam_size=beam_size, - num_decoding_left_chunks=num_decoding_left_chunks, - ctc_weight=search_ctc_weight, - transducer_weight=search_transducer_weight, - ) - beam_score = [s.score for s in beam] - hyps = [s.hyp[1:] for s in beam] - - elif beam_search_type == 'ctc': - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, - speech_lengths, - beam_size=beam_size, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) - beam_score = [hyp[1] for hyp in hyps] - hyps = [hyp[0] for hyp in hyps] - assert len(hyps) == beam_size - - # build hyps and encoder output - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - - # 2.1 calculate transducer score - td_score = self._cal_transducer_score( - encoder_out, - encoder_mask, - hyps_lens, - hyps_pad, - ) - # 2.2 calculate attention score - decoder_out, r_decoder_out = self._cal_attn_score( - encoder_out, - encoder_mask, - hyps_pad, - hyps_lens, - ) - - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp)][self.eos] - td_s = td_score[i] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp): - r_score += r_decoder_out[i][len(hyp) - j - 1][w] - r_score += r_decoder_out[i][len(hyp)][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score = score * attn_weight + \ - beam_score[i] * ctc_weight + \ - td_s * transducer_weight - if score > best_score: - best_score = score - best_index = i - - return hyps[best_index], best_score - - def greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, 
- num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - n_steps: int = 64, - ) -> List[List[int]]: - """ greedy search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - # TODO(Mddct): batch decode - assert speech.size(0) == 1 - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - # TODO(Mddct): forward chunk by chunk - _ = simulate_streaming - # Let's assume B = batch_size - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size, - num_decoding_left_chunks, - ) - encoder_out_lens = encoder_mask.squeeze(1).sum() - hyps = basic_greedy_search(self, - encoder_out, - encoder_out_lens, - n_steps=n_steps) - - return hyps - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def forward_predictor_step( - self, xs: torch.Tensor, cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - assert len(cache) == 2 - # fake padding - padding = torch.zeros(1, 1) - return self.predictor.forward_step(xs, padding, cache) - - @torch.jit.export - def forward_joint_step(self, enc_out: torch.Tensor, - pred_out: torch.Tensor) -> torch.Tensor: - return self.joint(enc_out, pred_out) - - @torch.jit.export - def forward_predictor_init_state(self) -> List[torch.Tensor]: - return self.predictor.init_state(1, device=torch.device("cpu")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/asr_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/asr_model.py deleted file mode 100644 index 4288f68472d63ce4bf270c5f377d62fa7408713e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/asr_model.py +++ /dev/null @@ -1,904 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -from collections import defaultdict -from typing import Dict, List, Optional, Tuple - -import torch - -from torch.nn.utils.rnn import pad_sequence - -try: - import k2 - from icefall.utils import get_texts - from icefall.decode import get_lattice, Nbest, one_best_decoding -except ImportError: - print('Failed to import k2 and icefall. \ - Notice that they are necessary for hlg_onebest and hlg_rescore') - -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import TransformerEncoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, - remove_duplicates_and_blank, th_accuracy, - reverse_pad_list) -from wenet.utils.mask import (make_pad_mask, mask_finished_preds, - mask_finished_scores, subsequent_mask) - - -class ASRModel(torch.nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - def __init__( - self, - vocab_size: int, - encoder: TransformerEncoder, - decoder: TransformerDecoder, - ctc: CTC, - ctc_weight: float = 0.5, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - ): - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - - self.encoder = encoder - self.decoder = decoder - self.ctc = ctc - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - # 1. Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # 2a. Attention-decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths) - else: - loss_att = None - - # 2b. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is None: - loss = loss_att - elif loss_att is None: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + (1 - - self.ctc_weight) * loss_att - return {"loss": loss, "loss_att": loss_att, "loss_ctc": loss_ctc} - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, float]: - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # reverse the seq, used for right to left decoder - r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) - r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, - self.ignore_id) - # 1. Forward decoder - decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, - ys_in_pad, ys_in_lens, - r_ys_in_pad, - self.reverse_weight) - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - r_loss_att = torch.tensor(0.0) - if self.reverse_weight > 0.0: - r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * ( - 1 - self.reverse_weight) + r_loss_att * self.reverse_weight - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - return loss_att, acc_att - - def _forward_encoder( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Let's assume B = batch_size - # 1. Encoder - if simulate_streaming and decoding_chunk_size > 0: - encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( - speech, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - else: - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - return encoder_out, encoder_mask - - def recognize( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int = 10, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> torch.Tensor: - """ Apply beam search on attention decoder - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - torch.Tensor: decoding result, (batch, max_result_len) - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = torch.ones([running_size, 1], dtype=torch.long, - device=device).fill_(self.sos) # (B*N, 1) - scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), - dtype=torch.float) - scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) - cache: Optional[List[torch.Tensor]] = None - # 2. Decoder forward step by step - for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: - break - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) - logp, cache = self.decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - # 2.3 Second beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - # Update cache to be consistent with new topk scores / hyps - cache_index = (offset_k_index // beam_size).view(-1) # (B*N) - base_cache_index = (torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) * beam_size).view(-1) # (B*N) - cache_index = base_cache_index + cache_index - cache = [torch.index_select(c, dim=0, index=cache_index) for c in cache] - scores = scores.view(-1, 1) # (B*N, 1) - # 2.4. Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = torch.index_select(top_k_index.view(-1), - dim=-1, - index=best_k_index) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = torch.index_select( - hyps, dim=0, index=best_hyps_index) # (B*N, i) - hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) - - # 3. 
Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_scores, best_index = scores.max(dim=-1) - best_hyps_index = best_index + torch.arange( - batch_size, dtype=torch.long, device=device) * beam_size - best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) - best_hyps = best_hyps[:, 1:] - return best_hyps, best_scores - - def ctc_greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[List[int]]: - """ Apply CTC greedy search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # Let's assume B = batch_size - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - return hyps, scores - - def _ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[List[List[int]], torch.Tensor]: - """ CTC prefix beam search inner implementation - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[List[int]]: nbest results - torch.Tensor: encoder output, (1, max_len, encoder_dim), - it will be used for rescoring in attention rescoring mode - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # For CTC prefix beam search, we only support batch_size=1 - assert batch_size == 1 - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder forward and get CTC score - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - ctc_probs = ctc_probs.squeeze(0) - # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) - cur_hyps = [(tuple(), (0.0, -float('inf')))] - # 2. CTC beam search step by step - for t in range(0, maxlen): - logp = ctc_probs[t] # (vocab_size,) - # key: prefix, value (pb, pnb), default value(-inf, -inf) - next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) - # 2.1 First beam prune: select topk best - top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) - for s in top_k_index: - s = s.item() - ps = logp[s].item() - for prefix, (pb, pnb) in cur_hyps: - last = prefix[-1] if len(prefix) > 0 else None - if s == 0: # blank - n_pb, n_pnb = next_hyps[prefix] - n_pb = log_add([n_pb, pb + ps, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - elif s == last: - # Update *ss -> *s; - n_pb, n_pnb = next_hyps[prefix] - n_pnb = log_add([n_pnb, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - # Update *s-s -> *ss, - is for blank - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - else: - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - - # 2.2 Second beam prune - next_hyps = sorted(next_hyps.items(), - key=lambda x: log_add(list(x[1])), - reverse=True) - cur_hyps = next_hyps[:beam_size] - hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] - return hyps, encoder_out - - def ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[int]: - """ Apply CTC prefix beam search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[int]: CTC prefix beam search nbest results - """ - hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, - beam_size, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) - return hyps[0] - - def attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - ctc_weight: float = 0.0, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ) -> List[int]: - """ Apply attention rescoring decoding, CTC prefix beam search - is applied first to get nbest, then we resoring the nbest on - attention decoder with corresponding encoder out - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - reverse_weight (float): right to left decoder weight - ctc_weight (float): ctc score weight - - Returns: - List[int]: Attention rescoring result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, speech_lengths, beam_size, decoding_chunk_size, - num_decoding_left_chunks, simulate_streaming) - - assert len(hyps) == beam_size - hyps_pad = pad_sequence([ - torch.tensor(hyp[0], device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp[0]): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp[0])][self.eos] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp[0]): - r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] - r_score += r_decoder_out[i][len(hyp[0])][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score += hyp[1] * ctc_weight - if score > best_score: - best_score = score - best_index = i - return hyps[best_index][0], best_score - - def load_hlg_resource_if_necessary(self, hlg, word): - if not hasattr(self, 'hlg'): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.hlg = k2.Fsa.from_dict(torch.load(hlg, map_location=device)) - if not hasattr(self.hlg, "lm_scores"): - self.hlg.lm_scores = self.hlg.scores.clone() - if not hasattr(self, 'word_table'): - self.word_table = {} - with open(word, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - self.word_table[int(arr[1])] = arr[0] - - @torch.no_grad() - def hlg_onebest( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - best_path = one_best_decoding(lattice=lattice, use_double_scores=True) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.no_grad() - def hlg_rescore( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - lm_scale: float = 0, - decoder_scale: float = 0, - r_decoder_scale: float = 0, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - device = speech.device - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - 
search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=100, - use_double_scores=True, - nbest_scale=0.5,) - nbest = nbest.intersect(lattice) - assert hasattr(nbest.fsa, "lm_scores") - assert hasattr(nbest.fsa, "tokens") - assert isinstance(nbest.fsa.tokens, torch.Tensor) - - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.tokens) - tokens = tokens.remove_values_leq(0) - hyps = tokens.tolist() - - # cal attention_score - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out_repeat = [] - tot_scores = nbest.tot_scores() - repeats = [tot_scores[i].shape[0] for i in range(tot_scores.dim0)] - for i in range(len(encoder_out)): - encoder_out_repeat.append(encoder_out[i: i + 1].repeat(repeats[i], 1, 1)) - encoder_out = torch.concat(encoder_out_repeat, dim=0) - encoder_mask = torch.ones(encoder_out.size(0), - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - reverse_weight = 0.5 - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out - - decoder_scores = torch.tensor([sum([decoder_out[i, j, hyps[i][j]] - for j in range(len(hyps[i]))]) - for i in range(len(hyps))], device=device) - r_decoder_scores = [] - for i in range(len(hyps)): - score = 0 - for j in range(len(hyps[i])): - score += r_decoder_out[i, len(hyps[i]) - j - 1, hyps[i][j]] - score += r_decoder_out[i, len(hyps[i]), self.eos] - r_decoder_scores.append(score) - r_decoder_scores = torch.tensor(r_decoder_scores, device=device) - - am_scores = nbest.compute_am_scores() - ngram_lm_scores = nbest.compute_lm_scores() - tot_scores = am_scores.values + lm_scale * ngram_lm_scores.values + \ - decoder_scale * decoder_scores + r_decoder_scale * r_decoder_scores - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.jit.export - def subsampling_rate(self) -> int: - """ Export interface for c++ call, return subsampling_rate of the - model - """ - return self.encoder.embed.subsampling_rate - - @torch.jit.export - def right_context(self) -> int: - """ Export interface for c++ call, return right_context of the model - """ - return self.encoder.embed.right_context - - @torch.jit.export - def sos_symbol(self) -> int: - """ Export interface for c++ call, return sos symbol id of the model - """ - return self.sos - - @torch.jit.export - def eos_symbol(self) -> int: - """ Export interface for c++ call, return eos symbol id of the model - """ - return self.eos - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, give input chunk xs, and return - output from time 0 to current chunk. - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor: - """ Export interface for c++ call, apply linear transform and log - softmax before ctc - Args: - xs (torch.Tensor): encoder output - - Returns: - torch.Tensor: activation before ctc - - """ - return self.ctc.log_softmax(xs) - - @torch.jit.export - def is_bidirectional_decoder(self) -> bool: - """ - Returns: - torch.Tensor: decoder output - """ - if hasattr(self.decoder, 'right_decoder'): - return True - else: - return False - - @torch.jit.export - def forward_attention_decoder( - self, - hyps: torch.Tensor, - hyps_lens: torch.Tensor, - encoder_out: torch.Tensor, - reverse_weight: float = 0, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, forward decoder with multiple - hypothesis from ctc prefix beam search and one encoder output - Args: - hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad sos at the begining - hyps_lens (torch.Tensor): length of each hyp in hyps - encoder_out (torch.Tensor): corresponding encoder output - r_hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad eos at the begining which is used fo right to left decoder - reverse_weight: used for verfing whether used right to left decoder, - > 0 will use. - - Returns: - torch.Tensor: decoder output - """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps - encoder_out = encoder_out.repeat(num_hyps, 1, 1) - encoder_mask = torch.ones(num_hyps, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=encoder_out.device) - - # input for right to left decoder - # this hyps_lens has count token, we need minus it. - r_hyps_lens = hyps_lens - 1 - # this hyps has included token, so it should be - # convert the original hyps. - r_hyps = hyps[:, 1:] - # >>> r_hyps - # >>> tensor([[ 1, 2, 3], - # >>> [ 9, 8, 4], - # >>> [ 2, -1, -1]]) - # >>> r_hyps_lens - # >>> tensor([3, 3, 1]) - - # NOTE(Mddct): `pad_sequence` is not supported by ONNX, it is used - # in `reverse_pad_list` thus we have to refine the below code. 
- # Issue: https://github.com/wenet-e2e/wenet/issues/1113 - # Equal to: - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = torch.max(r_hyps_lens) - index_range = torch.arange(0, max_len, 1).to(encoder_out.device) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - # >>> seq_mask - # >>> tensor([[ True, True, True], - # >>> [ True, True, True], - # >>> [ True, False, False]]) - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - r_hyps = torch.gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = torch.where(seq_mask, r_hyps, self.eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - r_hyps = torch.cat([hyps[:, 0:1], r_hyps], dim=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps, hyps_lens, r_hyps, - reverse_weight) # (num_hyps, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - - # right to left decoder may be not used during decoding process, - # which depends on reverse_weight param. - # r_dccoder_out will be 0.0, if reverse_weight is 0.0 - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - return decoder_out, r_decoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/attention.py deleted file mode 100644 index 6ee5e313edf2e88a844ce004c0f819b0bd3260f6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/attention.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple - -import torch -from torch import nn - - -class MultiHeadedAttention(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, n_head: int, n_feat: int, dropout_rate: float): - """Construct an MultiHeadedAttention object.""" - super().__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward_qkv( - self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor, size - (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor, size - (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor, size - (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. 
- - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - 1.When applying cross attention between decoder and encoder, - the batch padding mask for input is in (#batch, 1, T) shape. - 2.When applying self attention of encoder, - the mask is in (#batch, T, T) shape. - 3.When applying self attention of decoder, - the mask is in (#batch, L, L) shape. - 4.If the different position in decoder see different block - of the encoder, such as Mocha, the passed in mask could be - in (#batch, L, T) shape. But there is no such case in current - Wenet. - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - """ - q, k, v = self.forward_qkv(query, key, value) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. - new_cache = torch.cat((k, v), dim=-1) - - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - return self.forward_attention(v, scores, mask), new_cache - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). 
- zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/cmvn.py deleted file mode 100644 index 3a1e7457fd3788d9a7e031e96517505a65925102..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/cmvn.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -class GlobalCMVN(torch.nn.Module): - def __init__(self, - mean: torch.Tensor, - istd: torch.Tensor, - norm_var: bool = True): - """ - Args: - mean (torch.Tensor): mean stats - istd (torch.Tensor): inverse std, std which is 1.0 / std - """ - super().__init__() - assert mean.shape == istd.shape - self.norm_var = norm_var - # The buffer can be accessed from this module using self.mean - self.register_buffer("mean", mean) - self.register_buffer("istd", istd) - - def forward(self, x: torch.Tensor): - """ - Args: - x (torch.Tensor): (batch, max_len, feat_dim) - - Returns: - (torch.Tensor): normalized feature - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/convolution.py deleted file mode 100644 index 2cf9794e14ea7441ccd30ab52202ac02fb25c2b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/convolution.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). 
- """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. - new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/ctc.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/ctc.py deleted file mode 100644 index 3dfcbaa324ffc26afa9ceaeb75007eb312546326..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/ctc.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -import torch -import torch.nn.functional as F -from typeguard import check_argument_types - - -class CTC(torch.nn.Module): - """CTC module""" - def __init__( - self, - odim: int, - encoder_output_size: int, - dropout_rate: float = 0.0, - reduce: bool = True, - ): - """ Construct CTC module - Args: - odim: dimension of outputs - encoder_output_size: number of encoder projection units - dropout_rate: dropout rate (0.0 ~ 1.0) - reduce: reduce the CTC loss into a scalar - """ - assert check_argument_types() - super().__init__() - eprojs = encoder_output_size - self.dropout_rate = dropout_rate - self.ctc_lo = torch.nn.Linear(eprojs, odim) - - reduction_type = "sum" if reduce else "none" - self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) - - def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, - ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: - """Calculate CTC loss. 
- - Args: - hs_pad: batch of padded hidden state sequences (B, Tmax, D) - hlens: batch of lengths of hidden state sequences (B) - ys_pad: batch of padded character id sequence tensor (B, Lmax) - ys_lens: batch of lengths of character sequence (B) - """ - # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) - ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) - # ys_hat: (B, L, D) -> (L, B, D) - ys_hat = ys_hat.transpose(0, 1) - ys_hat = ys_hat.log_softmax(2) - loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) - # Batch-size average - loss = loss / ys_hat.size(1) - return loss - - def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """log_softmax of frame activations - - Args: - Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) - """ - return F.log_softmax(self.ctc_lo(hs_pad), dim=2) - - def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """argmax of frame activations - - Args: - torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: argmax applied 2d tensor (B, Tmax) - """ - return torch.argmax(self.ctc_lo(hs_pad), dim=2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/decoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/decoder.py deleted file mode 100644 index c31853d9e868c99290b8d597f53d9a680202c82c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/decoder.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Decoder definition.""" -from typing import Tuple, List, Optional - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.decoder_layer import DecoderLayer -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.utils.mask import (subsequent_mask, make_pad_mask) - - -class TransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. 
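The deleted CTC module is a thin wrapper around `torch.nn.CTCLoss`, which expects log-probabilities shaped `(T, B, C)` and accepts padded targets together with their true lengths. A self-contained sketch with made-up shapes (index 0 is assumed to be the blank, as in the PyTorch default):

```python
# How torch.nn.CTCLoss consumes tensors prepared like those above.
import torch

T, B, C, Lmax = 50, 3, 30, 12                      # frames, batch, vocab size, max label length
log_probs = torch.randn(T, B, C).log_softmax(2)    # (T, B, C), already log-softmaxed
targets = torch.randint(1, C, (B, Lmax))           # padded labels; 0 is reserved for blank
input_lengths = torch.full((B,), T, dtype=torch.long)
target_lengths = torch.tensor([12, 9, 7])

ctc = torch.nn.CTCLoss(reduction="sum")
loss = ctc(log_probs, targets, input_lengths, target_lengths) / B   # batch-size average
print(loss.item())
```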
- concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__() - attention_dim = encoder_output_size - - if input_layer == "embed": - self.embed = torch.nn.Sequential( - torch.nn.Embedding(vocab_size, attention_dim), - PositionalEncoding(attention_dim, positional_dropout_rate), - ) - else: - raise ValueError(f"only 'embed' is supported: {input_layer}") - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) - self.use_output_layer = use_output_layer - self.output_layer = torch.nn.Linear(attention_dim, vocab_size) - self.num_blocks = num_blocks - self.decoders = torch.nn.ModuleList([ - DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(self.num_blocks) - ]) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor = torch.empty(0), - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: not used in transformer decoder, in order to unify api - with bidirectional decoder - reverse_weight: not used in transformer decoder, in order to unify - api with bidirectional decode - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - torch.tensor(0.0), in order to unify api with bidirectional decoder - olens: (batch, ) - """ - tgt = ys_in_pad - maxlen = tgt.size(1) - # tgt_mask: (B, 1, L) - tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) - tgt_mask = tgt_mask.to(tgt.device) - # m: (1, L, L) - m = subsequent_mask(tgt_mask.size(-1), - device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m - x, _ = self.embed(tgt) - for layer in self.decoders: - x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, - memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.use_output_layer: - x = self.output_layer(x) - olens = tgt_mask.sum(1) - return x, torch.tensor(0.0), olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - x, _ = self.embed(tgt) - new_cache = [] - for i, decoder in enumerate(self.decoders): - if cache is None: - c = None - else: - c = cache[i] - x, tgt_mask, memory, memory_mask = decoder(x, - tgt_mask, - memory, - memory_mask, - cache=c) - new_cache.append(x) - if self.normalize_before: - y = self.after_norm(x[:, -1]) - else: - y = x[:, -1] - if self.use_output_layer: - y = torch.log_softmax(self.output_layer(y), dim=-1) - return y, new_cache - - -class BiTransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - r_num_blocks: the number of right to left decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - r_num_blocks: int = 0, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - - assert check_argument_types() - super().__init__() - self.left_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - self.right_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - r_num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor, - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), - used for right to left decoder - reverse_weight: used for right to left decoder - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - r_x: x: decoded token score (right to left decoder) - before softmax (batch, maxlen_out, vocab_size) - if use_output_layer is True, - olens: (batch, ) - """ - l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, - ys_in_lens) - r_x = torch.tensor(0.0) - if reverse_weight > 0.0: - r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, - ys_in_lens) - return l_x, r_x, olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - return self.left_decoder.forward_one_step(memory, memory_mask, tgt, - tgt_mask, cache) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/decoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/decoder_layer.py deleted file mode 100644 index 6b52aa6ab730dc51b18f0787e8236ab10c1e9cad..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/decoder_layer.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoder self-attention layer definition.""" -from typing import Optional, Tuple - -import torch -from torch import nn - - -class DecoderLayer(nn.Module): - """Single decoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn (torch.nn.Module): Inter-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. 
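The decoder forward removed above builds its target mask by AND-ing a padding mask with a lower-triangular causal mask. The helpers below are simplified stand-ins for `wenet.utils.mask.make_pad_mask` and `subsequent_mask`, just to show how the two masks broadcast into a `(B, L, L)` mask:

```python
# Simplified stand-ins for the wenet mask helpers, combined as in the decoder.
import torch

def make_pad_mask(lengths: torch.Tensor, max_len: int) -> torch.Tensor:
    # True where the position is padding
    return torch.arange(max_len).unsqueeze(0) >= lengths.unsqueeze(1)

def subsequent_mask(size: int) -> torch.Tensor:
    # True on and below the diagonal (positions a step may attend to)
    return torch.tril(torch.ones(size, size, dtype=torch.bool))

ys_in_lens = torch.tensor([4, 2])
maxlen = 4
tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1)   # (B, 1, L)
m = subsequent_mask(maxlen).unsqueeze(0)                     # (1, L, L)
tgt_mask = tgt_mask & m                                      # (B, L, L)
print(tgt_mask[1])   # causal mask clipped to the length-2 sequence
```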
- dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's inpu - and output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: nn.Module, - src_attn: nn.Module, - feed_forward: nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an DecoderLayer object.""" - super().__init__() - self.size = size - self.self_attn = self_attn - self.src_attn = src_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.norm3 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear1 = nn.Linear(size + size, size) - self.concat_linear2 = nn.Linear(size + size, size) - else: - self.concat_linear1 = nn.Identity() - self.concat_linear2 = nn.Identity() - - def forward( - self, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - memory: torch.Tensor, - memory_mask: torch.Tensor, - cache: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute decoded features. - - Args: - tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). - tgt_mask (torch.Tensor): Mask for input tensor - (#batch, maxlen_out). - memory (torch.Tensor): Encoded memory - (#batch, maxlen_in, size). - memory_mask (torch.Tensor): Encoded memory mask - (#batch, maxlen_in). - cache (torch.Tensor): cached tensors. - (#batch, maxlen_out - 1, size). - - Returns: - torch.Tensor: Output tensor (#batch, maxlen_out, size). - torch.Tensor: Mask for output tensor (#batch, maxlen_out). - torch.Tensor: Encoded memory (#batch, maxlen_in, size). - torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
- - """ - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - - if cache is None: - tgt_q = tgt - tgt_q_mask = tgt_mask - else: - # compute only the last frame query keeping dim: max_time_out -> 1 - assert cache.shape == ( - tgt.shape[0], - tgt.shape[1] - 1, - self.size, - ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" - tgt_q = tgt[:, -1:, :] - residual = residual[:, -1:, :] - tgt_q_mask = tgt_mask[:, -1:, :] - - if self.concat_after: - tgt_concat = torch.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) - x = residual + self.concat_linear1(tgt_concat) - else: - x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - if self.concat_after: - x_concat = torch.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) - x = residual + self.concat_linear2(x_concat) - else: - x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) - if not self.normalize_before: - x = self.norm2(x) - - residual = x - if self.normalize_before: - x = self.norm3(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm3(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, tgt_mask, memory, memory_mask diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/embedding.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/embedding.py deleted file mode 100644 index 611a927864d93c3ad8357f66c780bf537b2a4d67..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/embedding.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Positonal Encoding Module.""" - -import math -from typing import Tuple, Union - -import torch -import torch.nn.functional as F - -class PositionalEncoding(torch.nn.Module): - """Positional encoding. 
- - :param int d_model: embedding dim - :param float dropout_rate: dropout rate - :param int max_len: maximum input length - - PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) - PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) - """ - def __init__(self, - d_model: int, - dropout_rate: float, - max_len: int = 5000, - reverse: bool = False): - """Construct an PositionalEncoding object.""" - super().__init__() - self.d_model = d_model - self.xscale = math.sqrt(self.d_model) - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - - self.pe = torch.zeros(self.max_len, self.d_model) - position = torch.arange(0, self.max_len, - dtype=torch.float32).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = torch.sin(position * div_term) - self.pe[:, 1::2] = torch.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input. Its shape is (batch, time, ...) - offset (int, torch.tensor): position offset - - Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) - torch.Tensor: for compatibility to RelPositionalEncoding - """ - - self.pe = self.pe.to(x.device) - pos_emb = self.position_encoding(offset, x.size(1), False) - x = x * self.xscale + pos_emb - return self.dropout(x), self.dropout(pos_emb) - - def position_encoding(self, offset: Union[int, torch.Tensor], size: int, - apply_dropout: bool = True) -> torch.Tensor: - """ For getting encoding in a streaming fashion - - Attention!!!!! - we apply dropout only once at the whole utterance level in a none - streaming way, but will call this function several times with - increasing input size in a streaming scenario, so the dropout will - be applied several times. - - Args: - offset (int or torch.tensor): start offset - size (int): required size of position encoding - - Returns: - torch.Tensor: Corresponding encoding - """ - # How to subscript a Union type: - # https://github.com/pytorch/pytorch/issues/69434 - if isinstance(offset, int): - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - else: # for batched streaming decoding on GPU - assert torch.max(offset) + size < self.max_len - index = offset.unsqueeze(1) + \ - torch.arange(0, size).to(offset.device) # B X T - flag = index > 0 - # remove negative offset - index = index * flag - pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model - - if apply_dropout: - pos_emb = self.dropout(pos_emb) - return pos_emb - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding module. - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - """ - def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): - """Initialize class.""" - super().__init__(d_model, dropout_rate, max_len, reverse=True) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Compute positional encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). 
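The positional-encoding buffer deleted above is the standard sinusoidal table given by the docstring formula, scaled-and-added to the input in `forward`. A direct numeric sketch with a small `d_model` and `max_len`:

```python
# Sinusoidal positional encoding table, built as in PositionalEncoding.__init__.
import math
import torch

d_model, max_len = 8, 16
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32)
                     * -(math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0)                             # (1, max_len, d_model)

x = torch.randn(1, 10, d_model)
x = x * math.sqrt(d_model) + pe[:, :x.size(1)]   # scale-then-add, as in forward()
print(x.shape)  # torch.Size([1, 10, 8])
```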
- Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Positional embedding tensor (1, time, `*`). - """ - self.pe = self.pe.to(x.device) - x = x * self.xscale - pos_emb = self.position_encoding(offset, x.size(1), False) - return self.dropout(x), self.dropout(pos_emb) - - -class NoPositionalEncoding(torch.nn.Module): - """ No position encoding - """ - def __init__(self, d_model: int, dropout_rate: float): - super().__init__() - self.d_model = d_model - self.dropout = torch.nn.Dropout(p=dropout_rate) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """ Just return zero vector for interface compatibility - """ - pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) - return self.dropout(x), pos_emb - - def position_encoding( - self, offset: Union[int, torch.Tensor], size: int) -> torch.Tensor: - return torch.zeros(1, size, self.d_model) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/encoder.py deleted file mode 100644 index bb2ec65827548bd1242cb3b367cb3983c2de6119..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/encoder.py +++ /dev/null @@ -1,462 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder definition.""" -from typing import Tuple - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.convolution import ConvolutionModule -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.encoder_layer import TransformerEncoderLayer -from wenet.transformer.encoder_layer import ConformerEncoderLayer -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class BaseEncoder(torch.nn.Module): - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ - Args: - input_size (int): input dim - output_size (int): dimension of attention - attention_heads (int): the number of heads of multi head attention - linear_units (int): the hidden units number of position-wise feed - forward - num_blocks (int): the number of decoder blocks - dropout_rate (float): dropout rate - attention_dropout_rate (float): dropout rate in attention - positional_dropout_rate (float): dropout rate after adding - positional encoding - input_layer (str): input layer type. - optional [linear, conv2d, conv2d6, conv2d8] - pos_enc_layer_type (str): Encoder positional encoding layer type. - opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] - normalize_before (bool): - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after (bool): whether to concat attention layer's input - and output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - static_chunk_size (int): chunk size for static chunk training and - decoding - use_dynamic_chunk (bool): whether use dynamic chunk size for - training or not, You can only use fixed chunk(chunk_size > 0) - or dyanmic chunk size(use_dynamic_chunk = True) - global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module - use_dynamic_left_chunk (bool): whether use dynamic left chunk in - dynamic chunk training - """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks - - -class TransformerEncoder(BaseEncoder): - """Transformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ Construct TransformerEncoder - - See Encoder for the meaning of each parameter. 
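The chunk-by-chunk loop removed above feeds overlapping windows of `decoding_window` input frames every `stride` frames. Worked numbers, assuming the Conv2dSubsampling4 front end (`subsampling_rate = 4`, `right_context = 6`) and a decoding chunk size of 16:

```python
# Window arithmetic for streaming decoding with a 4x subsampling front end.
subsampling = 4
context = 6 + 1                                    # right_context + current frame
decoding_chunk_size = 16
stride = subsampling * decoding_chunk_size         # 64 input frames advanced per step
decoding_window = (decoding_chunk_size - 1) * subsampling + context   # 67 frames fed per step

num_frames = 200
starts = list(range(0, num_frames - context + 1, stride))
print(stride, decoding_window, starts)             # 64 67 [0, 64, 128, 192]
```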
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - self.encoders = torch.nn.ModuleList([ - TransformerEncoderLayer( - output_size, - MultiHeadedAttention(attention_heads, output_size, - attention_dropout_rate), - PositionwiseFeedForward(output_size, linear_units, - dropout_rate), dropout_rate, - normalize_before, concat_after) for _ in range(num_blocks) - ]) - - -class ConformerEncoder(BaseEncoder): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - positionwise_conv_kernel_size: int = 1, - macaron_style: bool = True, - selfattention_layer_type: str = "rel_selfattn", - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - ): - """Construct ConformerEncoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - positionwise_conv_kernel_size (int): Kernel size of positionwise - conv1d layer. - macaron_style (bool): Whether to use macaron style for - positionwise layer. - selfattention_layer_type (str): Encoder attention layer type, - the parameter has no effect now, it's just for configure - compatibility. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, - cnn_module_norm, causal) - - self.encoders = torch.nn.ModuleList([ - ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(num_blocks) - ]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/encoder_layer.py deleted file mode 100644 index 6b4629a6802a90422fa1494f82f46488f2553c16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/encoder_layer.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple - -import torch -from torch import nn - - -class TransformerEncoderLayer(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: torch.nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): just for interface compatibility - to ConformerEncoderLayer - mask_pad (torch.Tensor): does not used in transformer layer, - just for unified api with conformer. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2), not used here, it's for interface - compatibility to ConformerEncoderLayer. - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). - - """ - residual = x - if self.normalize_before: - x = self.norm1(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, cache=att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - return x, mask, new_att_cache, fake_cnn_cache - - -class ConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/label_smoothing_loss.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/label_smoothing_loss.py deleted file mode 100644 index 428fedcb0eb4345cd1361c97008a9afcd94ac171..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/label_smoothing_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Label smoothing module.""" - -import torch -from torch import nn - - -class LabelSmoothingLoss(nn.Module): - """Label-smoothing loss. - - In a standard CE loss, the label's data distribution is: - [0,1,2] -> - [ - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0], - ] - - In the smoothing version CE Loss,some probabilities - are taken from the true label prob (1.0) and are divided - among other labels. - - e.g. 
- smoothing=0.1 - [0,1,2] -> - [ - [0.9, 0.05, 0.05], - [0.05, 0.9, 0.05], - [0.05, 0.05, 0.9], - ] - - Args: - size (int): the number of class - padding_idx (int): padding class id which will be ignored for loss - smoothing (float): smoothing rate (0.0 means the conventional CE) - normalize_length (bool): - normalize loss by sequence length if True - normalize loss by batch size if False - """ - def __init__(self, - size: int, - padding_idx: int, - smoothing: float, - normalize_length: bool = False): - """Construct an LabelSmoothingLoss object.""" - super(LabelSmoothingLoss, self).__init__() - self.criterion = nn.KLDivLoss(reduction="none") - self.padding_idx = padding_idx - self.confidence = 1.0 - smoothing - self.smoothing = smoothing - self.size = size - self.normalize_length = normalize_length - - def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """Compute loss between x and target. - - The model outputs and data labels tensors are flatten to - (batch*seqlen, class) shape and a mask is applied to the - padding part which should not be calculated for loss. - - Args: - x (torch.Tensor): prediction (batch, seqlen, class) - target (torch.Tensor): - target signal masked with self.padding_id (batch, seqlen) - Returns: - loss (torch.Tensor) : The KL loss, scalar float value - """ - assert x.size(2) == self.size - batch_size = x.size(0) - x = x.view(-1, self.size) - target = target.view(-1) - # use zeros_like instead of torch.no_grad() for true_dist, - # since no_grad() can not be exported by JIT - true_dist = torch.zeros_like(x) - true_dist.fill_(self.smoothing / (self.size - 1)) - ignore = target == self.padding_idx # (B,) - total = len(target) - ignore.sum().item() - target = target.masked_fill(ignore, 0) # avoid -1 index - true_dist.scatter_(1, target.unsqueeze(1), self.confidence) - kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) - denom = total if self.normalize_length else batch_size - return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/positionwise_feed_forward.py deleted file mode 100644 index 73ba239e3f1e68f65650961f2c4ee6758729a06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/positionwise_feed_forward.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. 
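The deleted loss builds its smoothed target distribution exactly as the docstring example describes: every class receives `smoothing / (size - 1)` and the true label keeps the remaining confidence. A quick numeric check of that construction:

```python
# Smoothed target distribution, built with zeros_like-style fill + scatter_.
import torch

size, smoothing = 3, 0.1
confidence = 1.0 - smoothing
target = torch.tensor([0, 1, 2])

true_dist = torch.zeros(len(target), size)
true_dist.fill_(smoothing / (size - 1))
true_dist.scatter_(1, target.unsqueeze(1), confidence)
print(true_dist)
# tensor([[0.9000, 0.0500, 0.0500],
#         [0.0500, 0.9000, 0.0500],
#         [0.0500, 0.0500, 0.9000]])
```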
- dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU()): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/subsampling.py deleted file mode 100644 index 5f2823eedf0e623188d6af6680fa50ca44b47877..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/subsampling.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch - - -class BaseSubsampling(torch.nn.Module): - def __init__(self): - super().__init__() - self.right_context = 0 - self.subsampling_rate = 1 - - def position_encoding(self, offset: Union[int, torch.Tensor], - size: int) -> torch.Tensor: - return self.pos_enc.position_encoding(offset, size) - - -class LinearNoSubsampling(BaseSubsampling): - """Linear transform the input without subsampling - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an linear object.""" - super().__init__() - self.out = torch.nn.Sequential( - torch.nn.Linear(idim, odim), - torch.nn.LayerNorm(odim, eps=1e-5), - torch.nn.Dropout(dropout_rate), - ) - self.pos_enc = pos_enc_class - self.right_context = 0 - self.subsampling_rate = 1 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Input x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: linear input tensor (#batch, time', odim), - where time' = time . - torch.Tensor: linear input mask (#batch, 1, time'), - where time' = time . - - """ - x = self.out(x) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask - - -class Conv2dSubsampling4(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). 
- - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 4. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 4. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] - - -class Conv2dSubsampling6(BaseSubsampling): - """Convolutional 2D subsampling (to 1/6 length). - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (torch.nn.Module): Custom position encoding layer. - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling6 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 5, 3), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), - odim) - self.pos_enc = pos_enc_class - # 10 = (3 - 1) * 1 + (5 - 1) * 2 - self.subsampling_rate = 6 - self.right_context = 10 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 6. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 6. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] - - -class Conv2dSubsampling8(BaseSubsampling): - """Convolutional 2D subsampling (to 1/8 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling8 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear( - odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) - self.pos_enc = pos_enc_class - self.subsampling_rate = 8 - # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 - self.right_context = 14 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 8. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 8. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/swish.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/swish.py deleted file mode 100644 index b4250f5c93104f38958d145572e363256e03fcb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/transformer/swish.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) -# 2020 Northwestern Polytechnical University (Pengcheng Guo) -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Swish() activation function for Conformer.""" - -import torch - - -class Swish(torch.nn.Module): - """Construct an Swish object.""" - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Return Swish activation function.""" - return x * torch.sigmoid(x) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/checkpoint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/checkpoint.py deleted file mode 100644 index 8e0c413c79c34cd667240357d7ef9eab816a885c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/checkpoint.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import re - -import yaml -import torch -from collections import OrderedDict - -import datetime - - -def load_checkpoint(model: torch.nn.Module, path: str) -> dict: - if torch.cuda.is_available(): - logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) - checkpoint = torch.load(path) - else: - logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) - checkpoint = torch.load(path, map_location='cpu') - model.load_state_dict(checkpoint, strict=False) - info_path = re.sub('.pt$', '.yaml', path) - configs = {} - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - return configs - - -def save_checkpoint(model: torch.nn.Module, path: str, infos=None): - ''' - Args: - infos (dict or None): any info you want to save. - ''' - logging.info('Checkpoint: save to checkpoint %s' % path) - if isinstance(model, torch.nn.DataParallel): - state_dict = model.module.state_dict() - elif isinstance(model, torch.nn.parallel.DistributedDataParallel): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save(state_dict, path) - info_path = re.sub('.pt$', '.yaml', path) - if infos is None: - infos = {} - infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') - with open(info_path, 'w') as fout: - data = yaml.dump(infos) - fout.write(data) - - -def filter_modules(model_state_dict, modules): - new_mods = [] - incorrect_mods = [] - mods_model = model_state_dict.keys() - for mod in modules: - if any(key.startswith(mod) for key in mods_model): - new_mods += [mod] - else: - incorrect_mods += [mod] - if incorrect_mods: - logging.warning( - "module(s) %s don't match or (partially match) " - "available modules in model.", - incorrect_mods, - ) - logging.warning("for information, the existing modules in model are:") - logging.warning("%s", mods_model) - - return new_mods - - -def load_trained_modules(model: torch.nn.Module, args: None): - # Load encoder modules with pre-trained model(s). 
- enc_model_path = args.enc_init - enc_modules = args.enc_init_mods - main_state_dict = model.state_dict() - logging.warning("model(s) found for pre-initialization") - if os.path.isfile(enc_model_path): - logging.info('Checkpoint: loading from checkpoint %s for CPU' % - enc_model_path) - model_state_dict = torch.load(enc_model_path, map_location='cpu') - modules = filter_modules(model_state_dict, enc_modules) - partial_state_dict = OrderedDict() - for key, value in model_state_dict.items(): - if any(key.startswith(m) for m in modules): - partial_state_dict[key] = value - main_state_dict.update(partial_state_dict) - else: - logging.warning("model was not found : %s", enc_model_path) - - model.load_state_dict(main_state_dict) - configs = {} - return configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/cmvn.py deleted file mode 100644 index 3101c619f54991c947124f393f3459c317356a2f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/cmvn.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
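The checkpoint helpers above pair every saved `.pt` state dict with a `.yaml` sidecar carrying training metadata, and reload weights non-strictly. A minimal sketch of that pattern, assuming a generic `torch.nn.Module`; the helper names and file paths here are illustrative and not part of WeNet:

```python
import datetime
import re

import torch
import yaml


def save_with_info(model: torch.nn.Module, path: str, infos: dict = None) -> None:
    """Save the state dict plus a YAML sidecar, mirroring save_checkpoint above."""
    torch.save(model.state_dict(), path)
    infos = dict(infos or {})
    infos["save_time"] = datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    with open(re.sub(r"\.pt$", ".yaml", path), "w") as fout:
        yaml.safe_dump(infos, fout)


def load_with_info(model: torch.nn.Module, path: str) -> dict:
    """Load weights non-strictly and return the sidecar metadata if present."""
    model.load_state_dict(torch.load(path, map_location="cpu"), strict=False)
    try:
        with open(re.sub(r"\.pt$", ".yaml", path)) as fin:
            return yaml.safe_load(fin) or {}
    except FileNotFoundError:
        return {}


# Illustrative usage with a toy module and a hypothetical path.
net = torch.nn.Linear(4, 2)
save_with_info(net, "toy_model.pt", {"epoch": 0})
print(load_with_info(net, "toy_model.pt"))
```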
- -import json -import math - -import numpy as np - - -def _load_json_cmvn(json_cmvn_file): - """ Load the json format cmvn stats file and calculate cmvn - - Args: - json_cmvn_file: cmvn stats file in json format - - Returns: - a numpy array of [means, vars] - """ - with open(json_cmvn_file) as f: - cmvn_stats = json.load(f) - - means = cmvn_stats['mean_stat'] - variance = cmvn_stats['var_stat'] - count = cmvn_stats['frame_num'] - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def _load_kaldi_cmvn(kaldi_cmvn_file): - """ Load the kaldi format cmvn stats file and calculate cmvn - - Args: - kaldi_cmvn_file: kaldi text style global cmvn file, which - is generated by: - compute-cmvn-stats --binary=false scp:feats.scp global_cmvn - - Returns: - a numpy array of [means, vars] - """ - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def load_cmvn(cmvn_file, is_json): - if is_json: - cmvn = _load_json_cmvn(cmvn_file) - else: - cmvn = _load_kaldi_cmvn(cmvn_file) - return cmvn[0], cmvn[1] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/common.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/common.py deleted file mode 100644 index 74238d59aefbf227fe6b811703af17550bc7f8f0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/common.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -"""Unility functions for Transformer.""" - -import math -from typing import List, Tuple - -import torch -from torch.nn.utils.rnn import pad_sequence - -IGNORE_ID = -1 - - -def pad_list(xs: List[torch.Tensor], pad_value: int): - """Perform padding for the list of tensors. - - Args: - xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 
- pad_value (float): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tmax, `*`). - - Examples: - >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) - - """ - n_batch = len(xs) - max_len = max([x.size(0) for x in xs]) - pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) - pad = pad.fill_(pad_value) - for i in range(n_batch): - pad[i, :xs[i].size(0)] = xs[i] - - return pad - - -def add_blank(ys_pad: torch.Tensor, blank: int, - ignore_id: int) -> torch.Tensor: - """ Prepad blank for transducer predictor - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - blank (int): index of - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> blank = 0 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in = add_blank(ys_pad, 0, -1) - >>> ys_in - tensor([[0, 1, 2, 3, 4, 5], - [0, 4, 5, 6, 0, 0], - [0, 7, 8, 9, 0, 0]]) - """ - bs = ys_pad.size(0) - _blank = torch.tensor([blank], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _blank = _blank.repeat(bs).unsqueeze(1) # [bs,1] - out = torch.cat([_blank, ys_pad], dim=1) # [bs, Lmax+1] - return torch.where(out == ignore_id, blank, out) - - -def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, - ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Add and labels. - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - sos (int): index of - eos (int): index of - ignore_id (int): index of padding - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - ys_out (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> sos_id = 10 - >>> eos_id = 11 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) - >>> ys_in - tensor([[10, 1, 2, 3, 4, 5], - [10, 4, 5, 6, 11, 11], - [10, 7, 8, 9, 11, 11]]) - >>> ys_out - tensor([[ 1, 2, 3, 4, 5, 11], - [ 4, 5, 6, 11, -1, -1], - [ 7, 8, 9, 11, -1, -1]]) - """ - _sos = torch.tensor([sos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _eos = torch.tensor([eos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys - ys_in = [torch.cat([_sos, y], dim=0) for y in ys] - ys_out = [torch.cat([y, _eos], dim=0) for y in ys] - return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) - - -def reverse_pad_list(ys_pad: torch.Tensor, - ys_lens: torch.Tensor, - pad_value: float = -1.0) -> torch.Tensor: - """Reverse padding for the list of tensors. - - Args: - ys_pad (tensor): The padded tensor (B, Tokenmax). - ys_lens (tensor): The lens of token seqs (B) - pad_value (int): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tokenmax). - - Examples: - >>> x - tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) - >>> pad_list(x, 0) - tensor([[4, 3, 2, 1], - [7, 6, 5, 0], - [9, 8, 0, 0]]) - - """ - r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) - for y, i in zip(ys_pad, ys_lens)], True, - pad_value) - return r_ys_pad - - -def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, - ignore_label: int) -> float: - """Calculate accuracy. - - Args: - pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 
- pad_targets (LongTensor): Target label tensors (B, Lmax). - ignore_label (int): Ignore label id. - - Returns: - float: Accuracy value (0.0 - 1.0). - - """ - pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), - pad_outputs.size(1)).argmax(2) - mask = pad_targets != ignore_label - numerator = torch.sum( - pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - denominator = torch.sum(mask) - return float(numerator) / float(denominator) - - -def get_rnn(rnn_type: str) -> torch.nn.Module: - assert rnn_type in ["rnn", "lstm", "gru"] - if rnn_type == "rnn": - return torch.nn.RNN - elif rnn_type == "lstm": - return torch.nn.LSTM - else: - return torch.nn.GRU - - -def get_activation(act): - """Return activation function.""" - # Lazy load to avoid unused import - from wenet.transformer.swish import Swish - - activation_funcs = { - "hardtanh": torch.nn.Hardtanh, - "tanh": torch.nn.Tanh, - "relu": torch.nn.ReLU, - "selu": torch.nn.SELU, - "swish": getattr(torch.nn, "SiLU", Swish), - "gelu": torch.nn.GELU - } - - return activation_funcs[act]() - - -def get_subsample(config): - input_layer = config["encoder_conf"]["input_layer"] - assert input_layer in ["conv2d", "conv2d6", "conv2d8"] - if input_layer == "conv2d": - return 4 - elif input_layer == "conv2d6": - return 6 - elif input_layer == "conv2d8": - return 8 - - -def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - if hyp[cur] != 0: - new_hyp.append(hyp[cur]) - prev = cur - while cur < len(hyp) and hyp[cur] == hyp[prev]: - cur += 1 - return new_hyp - - -def replace_duplicates_with_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - new_hyp.append(hyp[cur]) - prev = cur - cur += 1 - while cur < len(hyp) and hyp[cur] == hyp[prev] and hyp[cur] != 0: - new_hyp.append(0) - cur += 1 - return new_hyp - - -def log_add(args: List[int]) -> float: - """ - Stable log add - """ - if all(a == -float('inf') for a in args): - return -float('inf') - a_max = max(args) - lsp = math.log(sum(math.exp(a - a_max) for a in args)) - return a_max + lsp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/config.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/config.py deleted file mode 100644 index 50170ced44534d3ee6532a2f87fcd78c5148f7e7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 Shaoshang Qi -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
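`remove_duplicates_and_blank` above implements the collapse step of greedy CTC decoding: runs of repeated frame labels are merged and the blank id (0) is dropped. A self-contained sketch of the same idea; the token ids in the usage line are made up for illustration:

```python
from typing import List


def collapse_ctc(hyp: List[int], blank_id: int = 0) -> List[int]:
    """Collapse repeated frame labels, then drop blanks (greedy CTC decoding)."""
    out: List[int] = []
    prev = None
    for tok in hyp:
        if tok != prev and tok != blank_id:
            out.append(tok)
        prev = tok
    return out


# Frame-level argmax ids -> label sequence.
print(collapse_ctc([3, 3, 0, 3, 5, 5, 0]))  # [3, 3, 5]
```

Because repeats are merged per frame run, it is the blank between the two `3`s that lets the label appear twice in the output.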
- - -import copy - -def override_config(configs, override_list): - new_configs = copy.deepcopy(configs) - for item in override_list: - arr = item.split() - if len(arr) != 2: - print(f"the overrive {item} format not correct, skip it") - continue - keys = arr[0].split('.') - s_configs = new_configs - for i, key in enumerate(keys): - if key not in s_configs: - print(f"the overrive {item} format not correct, skip it") - if i == len(keys) - 1: - param_type = type(s_configs[key]) - if param_type != bool: - s_configs[key] = param_type(arr[1]) - else: - s_configs[key] = arr[1] in ['true', 'True'] - print(f"override {arr[0]} with {arr[1]}") - else: - s_configs = s_configs[key] - return new_configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/ctc_util.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/ctc_util.py deleted file mode 100644 index 73b8fb272ac153dd6d05207f352ebcf1ad14890d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/ctc_util.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch - -def insert_blank(label, blank_id=0): - """Insert blank token between every two label token.""" - label = np.expand_dims(label, 1) - blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id - label = np.concatenate([blanks, label], axis=1) - label = label.reshape(-1) - label = np.append(label, label[0]) - return label - -def forced_align(ctc_probs: torch.Tensor, - y: torch.Tensor, - blank_id=0) -> list: - """ctc forced alignment. 
- - Args: - torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) - torch.Tensor y: id sequence tensor 1d tensor (L) - int blank_id: blank symbol index - Returns: - torch.Tensor: alignment result - """ - y_insert_blank = insert_blank(y, blank_id) - - log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) - log_alpha = log_alpha - float('inf') # log of zero - state_path = (torch.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 - ) # state path - - # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] - - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): - if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ - s] == y_insert_blank[s - 2]: - candidates = torch.tensor( - [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) - prev_state = [s, s - 1] - else: - candidates = torch.tensor([ - log_alpha[t - 1, s], - log_alpha[t - 1, s - 1], - log_alpha[t - 1, s - 2], - ]) - prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] - state_path[t, s] = prev_state[torch.argmax(candidates)] - - state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) - - candidates = torch.tensor([ - log_alpha[-1, len(y_insert_blank) - 1], - log_alpha[-1, len(y_insert_blank) - 2] - ]) - prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] - state_seq[-1] = prev_state[torch.argmax(candidates)] - for t in range(ctc_probs.size(0) - 2, -1, -1): - state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] - - output_alignment = [] - for t in range(0, ctc_probs.size(0)): - output_alignment.append(y_insert_blank[state_seq[t, 0]]) - - return output_alignment diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/executor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/executor.py deleted file mode 100644 index dc0b69e6e32055566a0e8c41945f6979276e5672..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/executor.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
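The forced-alignment routine above first interleaves blanks with the label sequence via `insert_blank`, producing the `2L + 1` CTC state topology that the dynamic program then walks. A short sketch of that preprocessing step, assuming blank id 0 and a toy label sequence:

```python
import numpy as np


def interleave_blanks(label: np.ndarray, blank_id: int = 0) -> np.ndarray:
    """Return [blank, l1, blank, l2, ..., blank, lL, blank] (length 2L + 1)."""
    out = np.full(2 * len(label) + 1, blank_id, dtype=np.int64)
    out[1::2] = label
    return out


# Toy label ids, chosen only for illustration.
print(interleave_blanks(np.array([7, 7, 2])))  # [0 7 0 7 0 2 0]
```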
- -import logging -from contextlib import nullcontext - -# if your python version < 3.7 use the below one -# from contextlib import suppress as nullcontext -import torch -from torch.nn.utils import clip_grad_norm_ - - -class Executor: - - def __init__(self): - self.step = 0 - - def train(self, model, optimizer, scheduler, data_loader, device, writer, - args, scaler): - ''' Train one epoch - ''' - model.train() - clip = args.get('grad_clip', 50.0) - log_interval = args.get('log_interval', 10) - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - accum_grad = args.get('accum_grad', 1) - is_distributed = args.get('is_distributed', True) - use_amp = args.get('use_amp', False) - logging.info('using accumulate grad, new batch size is {} times' - ' larger than before'.format(accum_grad)) - if use_amp: - assert scaler is not None - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - model_context = model.join - else: - model_context = nullcontext - num_seen_utts = 0 - with model_context(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - context = None - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if is_distributed and batch_idx % accum_grad != 0: - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - with context(): - # autocast context - # The more details about amp can be found in - # https://pytorch.org/docs/stable/notes/amp_examples.html - with torch.cuda.amp.autocast(scaler is not None): - loss_dict = model(feats, feats_lengths, target, - target_lengths) - loss = loss_dict['loss'] / accum_grad - if use_amp: - scaler.scale(loss).backward() - else: - loss.backward() - - num_seen_utts += num_utts - if batch_idx % accum_grad == 0: - if rank == 0 and writer is not None: - writer.add_scalar('train_loss', loss, self.step) - # Use mixed precision training - if use_amp: - scaler.unscale_(optimizer) - grad_norm = clip_grad_norm_(model.parameters(), clip) - # Must invoke scaler.update() if unscale_() is used in - # the iteration to avoid the following error: - # RuntimeError: unscale_() has already been called - # on this optimizer since the last update(). - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). 
- scaler.step(optimizer) - scaler.update() - else: - grad_norm = clip_grad_norm_(model.parameters(), clip) - if torch.isfinite(grad_norm): - optimizer.step() - optimizer.zero_grad() - scheduler.step() - self.step += 1 - if batch_idx % log_interval == 0: - lr = optimizer.param_groups[0]['lr'] - log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, - loss.item() * accum_grad) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'lr {:.8f} rank {}'.format(lr, rank) - logging.debug(log_str) - - def cv(self, model, data_loader, device, args): - ''' Cross validation on - ''' - model.eval() - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - log_interval = args.get('log_interval', 10) - # in order to avoid division by 0 - num_seen_utts = 1 - total_loss = 0.0 - with torch.no_grad(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - loss_dict = model(feats, feats_lengths, target, target_lengths) - loss = loss_dict['loss'] - if torch.isfinite(loss): - num_seen_utts += num_utts - total_loss += loss.item() * num_utts - if batch_idx % log_interval == 0: - log_str = 'CV Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, loss.item()) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'history loss {:.6f}'.format(total_loss / - num_seen_utts) - log_str += ' rank {}'.format(rank) - logging.debug(log_str) - return total_loss, num_seen_utts diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/file_utils.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/file_utils.py deleted file mode 100644 index 7b7e516cc61f759267f4ef09309ff0b45110a0c1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/file_utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re - - -def read_lists(list_file): - lists = [] - with open(list_file, 'r', encoding='utf8') as fin: - for line in fin: - lists.append(line.strip()) - return lists - - -def read_non_lang_symbols(non_lang_sym_path): - """read non-linguistic symbol from file. - - The file format is like below: - - {NOISE}\n - {BRK}\n - ... - - - Args: - non_lang_sym_path: non-linguistic symbol file path, None means no any - syms. 
- - """ - if non_lang_sym_path is None: - return None - else: - syms = read_lists(non_lang_sym_path) - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - for sym in syms: - if non_lang_syms_pattern.fullmatch(sym) is None: - class BadSymbolFormat(Exception): - pass - raise BadSymbolFormat( - "Non-linguistic symbols should be " - "formatted in {xxx}//[xxx], consider" - " modify '%s' to meet the requirment. " - "More details can be found in discussions here : " - "https://github.com/wenet-e2e/wenet/pull/819" % (sym)) - return syms - - -def read_symbol_table(symbol_table_file): - symbol_table = {} - with open(symbol_table_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - symbol_table[arr[0]] = int(arr[1]) - return symbol_table diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/init_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/init_model.py deleted file mode 100644 index 4a008183ee25cd88b2fa25d93bdc3f9e3a55d31a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/init_model.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
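`read_symbol_table` above expects one `token id` pair per line, i.e. the `units.txt`-style dictionary used by the data pipeline. A minimal sketch of the same parsing, fed from an in-memory string so the tokens and ids shown are purely illustrative:

```python
import io
from typing import Dict, TextIO


def parse_symbol_table(fin: TextIO) -> Dict[str, int]:
    """Parse 'token id' pairs into a dict, as the helper above does for a file."""
    table: Dict[str, int] = {}
    for line in fin:
        token, idx = line.strip().split()
        table[token] = int(idx)
    return table


# Illustrative dictionary content.
units = io.StringIO("<blank> 0\n<unk> 1\n你 2\n好 3\n<sos/eos> 4\n")
print(parse_symbol_table(units))
```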
- -import torch -from wenet.transducer.joint import TransducerJoint -from wenet.transducer.predictor import (ConvPredictor, EmbeddingPredictor, - RNNPredictor) -from wenet.transducer.transducer import Transducer -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.cmvn import GlobalCMVN -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.encoder import ConformerEncoder, TransformerEncoder -from wenet.squeezeformer.encoder import SqueezeformerEncoder -from wenet.efficient_conformer.encoder import EfficientConformerEncoder -from wenet.utils.cmvn import load_cmvn - - -def init_model(configs): - if configs['cmvn_file'] is not None: - mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) - global_cmvn = GlobalCMVN( - torch.from_numpy(mean).float(), - torch.from_numpy(istd).float()) - else: - global_cmvn = None - - input_dim = configs['input_dim'] - vocab_size = configs['output_dim'] - - encoder_type = configs.get('encoder', 'conformer') - decoder_type = configs.get('decoder', 'bitransformer') - - if encoder_type == 'conformer': - encoder = ConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'squeezeformer': - encoder = SqueezeformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'efficientConformer': - encoder = EfficientConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf'], - **configs['encoder_conf']['efficient_conf'] - if 'efficient_conf' in - configs['encoder_conf'] else {}) - else: - encoder = TransformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - if decoder_type == 'transformer': - decoder = TransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - else: - assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 - assert configs['decoder_conf']['r_num_blocks'] > 0 - decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - ctc = CTC(vocab_size, encoder.output_size()) - - # Init joint CTC/Attention or Transducer model - if 'predictor' in configs: - predictor_type = configs.get('predictor', 'rnn') - if predictor_type == 'rnn': - predictor = RNNPredictor(vocab_size, **configs['predictor_conf']) - elif predictor_type == 'embedding': - predictor = EmbeddingPredictor(vocab_size, - **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - elif predictor_type == 'conv': - predictor = ConvPredictor(vocab_size, **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - else: - raise NotImplementedError( - "only rnn, embedding and conv type support now") - configs['joint_conf']['enc_output_size'] = configs['encoder_conf'][ - 'output_size'] - configs['joint_conf']['pred_output_size'] = configs['predictor_conf'][ - 'output_size'] - joint = TransducerJoint(vocab_size, **configs['joint_conf']) - model = Transducer(vocab_size=vocab_size, - blank=0, - predictor=predictor, - encoder=encoder, - attention_decoder=decoder, - joint=joint, - ctc=ctc, - **configs['model_conf']) - else: - model = ASRModel(vocab_size=vocab_size, - encoder=encoder, - decoder=decoder, - ctc=ctc, - **configs['model_conf']) - return model diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/mask.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/mask.py deleted file mode 100644 index 2985006ab2bc2d27a9b8adaeb863cc44ca6a0d24..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/mask.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -''' -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. - - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - ret = torch.ones(size, size, device=device, dtype=torch.bool) - return torch.tril(ret) -''' - -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. 
- - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - arange = torch.arange(size, device=device) - mask = arange.expand(size, size) - arange = arange.unsqueeze(-1) - mask = mask <= arange - return mask - - -def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size) with chunk size, - this is for streaming encoder - - Args: - size (int): size of mask - chunk_size (int): size of chunk - num_left_chunks (int): number of left chunks - <0: use full chunk - >=0: use num_left_chunks - device (torch.device): "cpu" or "cuda" or torch.Tensor.device - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_chunk_mask(4, 2) - [[1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]] - """ - ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) - ret[i, start:ending] = True - return ret - - -def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, - use_dynamic_chunk: bool, - use_dynamic_left_chunk: bool, - decoding_chunk_size: int, static_chunk_size: int, - num_decoding_left_chunks: int): - """ Apply optional mask for encoder. - - Args: - xs (torch.Tensor): padded input, (B, L, D), L for max length - mask (torch.Tensor): mask for xs, (B, 1, L) - use_dynamic_chunk (bool): whether to use dynamic chunk or not - use_dynamic_left_chunk (bool): whether to use dynamic left chunk for - training. - decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - static_chunk_size (int): chunk size for static chunk training/decoding - if it's greater than 0, if use_dynamic_chunk is true, - this parameter will be ignored - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. - >=0: use num_decoding_left_chunks - <0: use all left chunks - - Returns: - torch.Tensor: chunk mask of the input xs. - """ - # Whether to use chunk mask or not - if use_dynamic_chunk: - max_len = xs.size(1) - if decoding_chunk_size < 0: - chunk_size = max_len - num_left_chunks = -1 - elif decoding_chunk_size > 0: - chunk_size = decoding_chunk_size - num_left_chunks = num_decoding_left_chunks - else: - # chunk size is either [1, 25] or full context(max_len). - # Since we use 4 times subsampling and allow up to 1s(100 frames) - # delay, the maximum frame is 100 / 4 = 25. 
- chunk_size = torch.randint(1, max_len, (1, )).item() - num_left_chunks = -1 - if chunk_size > max_len // 2: - chunk_size = max_len - else: - chunk_size = chunk_size % 25 + 1 - if use_dynamic_left_chunk: - max_left_chunks = (max_len - 1) // chunk_size - num_left_chunks = torch.randint(0, max_left_chunks, - (1, )).item() - chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - elif static_chunk_size > 0: - num_left_chunks = num_decoding_left_chunks - chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - else: - chunk_masks = masks - return chunk_masks - - -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: Mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - batch_size = lengths.size(0) - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = torch.arange(0, - max_len, - dtype=torch.int64, - device=lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand - return mask - - -def make_non_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """Make mask tensor containing indices of non-padded part. - - The sequences in a batch may have different lengths. To enable - batch computing, padding is need to make all sequence in same - size. To avoid the padding part pass value to context dependent - block such as attention or convolution , this padding part is - masked. - - This pad_mask is used in both encoder and decoder. - - 1 for non-padded part and 0 for padded part. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] - """ - return ~make_pad_mask(lengths) - - -def mask_finished_scores(score: torch.Tensor, - flag: torch.Tensor) -> torch.Tensor: - """ - If a sequence is finished, we only allow one alive branch. This function - aims to give one branch a zero score and the rest -inf score. - - Args: - score (torch.Tensor): A real value array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size, beam_size). 
- """ - beam_size = score.size(-1) - zero_mask = torch.zeros_like(flag, dtype=torch.bool) - if beam_size > 1: - unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])), - dim=1) - finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])), - dim=1) - else: - unfinished = zero_mask - finished = flag - score.masked_fill_(unfinished, -float('inf')) - score.masked_fill_(finished, 0) - return score - - -def mask_finished_preds(pred: torch.Tensor, flag: torch.Tensor, - eos: int) -> torch.Tensor: - """ - If a sequence is finished, all of its branch should be - - Args: - pred (torch.Tensor): A int array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size). - """ - beam_size = pred.size(-1) - finished = flag.repeat([1, beam_size]) - return pred.masked_fill_(finished, eos) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/scheduler.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/scheduler.py deleted file mode 100644 index c418a731dec0041a238787bbba23102dba8db5e5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/swbd/s0/wenet/utils/scheduler.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -# NeMo(https://github.com/NVIDIA/NeMo) - -from typing import Union - -import math -import warnings -import torch -from torch.optim.lr_scheduler import _LRScheduler - -from typeguard import check_argument_types - - -class WarmupLR(_LRScheduler): - """The WarmupLR scheduler - - This scheduler is almost same as NoamLR Scheduler except for following - difference: - - NoamLR: - lr = optimizer.lr * model_size ** -0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - WarmupLR: - lr = optimizer.lr * warmup_step ** 0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - - Note that the maximum lr equals to optimizer.lr in this scheduler. 
- - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_steps: Union[int, float] = 25000, - last_epoch: int = -1, - ): - assert check_argument_types() - self.warmup_steps = warmup_steps - - # __init__() must be invoked before setting field - # because step() is also invoked in __init__() - super().__init__(optimizer, last_epoch) - - def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" - - def get_lr(self): - step_num = self.last_epoch + 1 - if self.warmup_steps == 0: - return [ - lr * step_num ** -0.5 - for lr in self.base_lrs - ] - else: - return [ - lr - * self.warmup_steps ** 0.5 - * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) - for lr in self.base_lrs - ] - - def set_step(self, step: int): - self.last_epoch = step - - -class WarmupPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1): - assert not (warmup_steps is not None and warmup_ratio is not None),\ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class SquareRootConstantPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use particular number of step or ratio" - assert constant_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. 
- self.max_steps = max_steps - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.constant_lr = 1 / (constant_steps ** 0.5) - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.constant_steps: - return [self.constant_lr for _ in self.base_lrs] - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class WarmupHoldPolicy(WarmupPolicy): - """Variant of WarmupPolicy which maintains high - learning rate for a defined number of steps. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - hold_steps=None, - hold_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (hold_steps is not None and hold_ratio is not None), \ - "Either use particular number of step or ratio" - assert hold_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - self.min_lr = min_lr - self._last_warmup_lr = 0.0 - - # Necessary to duplicate as class attributes are hidden in inner class - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if hold_steps is not None: - self.hold_steps = hold_steps + self.warmup_steps - elif hold_ratio is not None: - self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps - else: - self.hold_steps = 0 - - super().__init__( - optimizer, - warmup_steps=warmup_steps, - warmup_ratio=warmup_ratio, - max_steps=max_steps, - last_epoch=last_epoch, - min_lr=min_lr, - ) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed by the scheduler," - " " "please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup phase - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - # Hold phase - if (step >= self.warmup_steps) and (step < self.hold_steps): - return self.base_lrs - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - -class WarmupAnnealHoldPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - min_lr: Minimum lr to hold the learning rate after decay at. - constant_steps: Number of steps to keep lr constant at. 
- constant_ratio: Ratio of steps to keep lr constant. - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - constant_steps=None, - constant_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use constant_steps or constant_ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup steps - if self.warmup_steps > 0 and step <= self.warmup_steps: - return self._get_warmup_lr(step) - - # Constant steps after warmup and decay - if self.constant_steps > 0 and ( - self.warmup_steps + self.decay_steps) < step <= self.max_steps: - return self._get_constant_lr(step) - - # Min lr after max steps of updates - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_constant_lr(self, step): - return [self.min_lr for _ in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -def _squareroot_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 0.5 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _square_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 2 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _cosine_annealing(initial_lr, step, max_steps, min_lr): - mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) - out_lr = (initial_lr - min_lr) * mult + min_lr - return out_lr - - -def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, - decay_steps, min_lr): - assert max_lr > min_lr - # Use linear warmup for the initial part. - if warmup_steps > 0 and step <= warmup_steps: - return max_lr * float(step) / float(warmup_steps) - - # For any steps larger than `decay_steps`, use `min_lr`. - if step > warmup_steps + decay_steps: - return min_lr - - # If we are done with the warmup period, use the decay style. 
- num_steps_ = step - warmup_steps - decay_steps_ = decay_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = max_lr - min_lr - - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - - return min_lr + coeff * delta_lr - - -def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): - if cycle: - multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) - decay_steps *= multiplier - else: - step = min(step, decay_steps) - p = step / decay_steps - lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) - lr += min_lr - return lr - - -def _noam_hold_annealing(initial_lr, step, warmup_steps, - hold_steps, decay_rate, min_lr): - # hold_steps = total number of steps - # to hold the LR, not the warmup + hold steps. - T_warmup_decay = max(1, warmup_steps ** decay_rate) - T_hold_decay = max(1, (step - hold_steps) ** decay_rate) - lr = (initial_lr * T_warmup_decay) / T_hold_decay - lr = max(lr, min_lr) - return lr - - -class SquareAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _square_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class SquareRootAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _squareroot_annealing(initial_lr=initial_lr, step=step, - max_steps=self.max_steps, min_lr=self.min_lr) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - if self.constant_steps is None or self.constant_steps == 0: - new_lrs = [ - _cosine_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - else: - new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) - return new_lrs - - def _get_warmup_lr(self, step): - if self.constant_steps is None or self.constant_steps == 0: - return super()._get_warmup_lr(step) - else: - # Use linear warmup for the initial part. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_constant_lr(self, step): - # Only called when `constant_steps` > 0. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_linear_warmup_with_cosine_annealing_lr(self, step): - # Cosine Schedule for Megatron LM, - # slightly different warmup schedule + constant LR at the end. 
- new_lrs = [ - _linear_warmup_with_cosine_annealing( - max_lr=self.base_lrs[0], - warmup_steps=self.warmup_steps, - step=step, - decay_steps=self.decay_steps, - min_lr=self.min_lr, - ) - for _ in self.base_lrs - ] - return new_lrs - - -class NoamAnnealing(_LRScheduler): - def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - self._normalize = d_model ** (-0.5) - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = max(1, self.last_epoch) - - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - new_lrs = [self._noam_annealing(initial_lr=initial_lr, step=step) for - initial_lr in self.base_lrs] - return new_lrs - - def _noam_annealing(self, initial_lr, step): - if self.warmup_steps > 0: - mult = self._normalize * min(step ** (-0.5), - step * (self.warmup_steps ** (-1.5))) - else: - mult = self._normalize * step ** (-0.5) - - out_lr = initial_lr * mult - if step > self.warmup_steps: - out_lr = max(out_lr, self.min_lr) - return out_lr - - -class NoamHoldAnnealing(WarmupHoldPolicy): - def __init__(self, optimizer, *, max_steps, decay_rate=0.5, min_lr=0.0, - last_epoch=-1, **kwargs): - """ - From Nemo: - Implementation of the Noam Hold Annealing policy - from the SqueezeFormer paper. - - Unlike NoamAnnealing, the peak learning rate - can be explicitly set for this scheduler. - The schedule first performs linear warmup, - then holds the peak LR, then decays with some schedule for - the remainder of the steps. - Therefore the min-lr is still dependent - on the hyper parameters selected. - - It's schedule is determined by three factors- - - Warmup Steps: Initial stage, where linear warmup - occurs uptil the peak LR is reached. Unlike NoamAnnealing, - the peak LR is explicitly stated here instead of a scaling factor. - - Hold Steps: Intermediate stage, where the peak LR - is maintained for some number of steps. In this region, - the high peak LR allows the model to converge faster - if training is stable. However the high LR - may also cause instability during training. - Should usually be a significant fraction of training - steps (around 30-40% of the entire training steps). - - Decay Steps: Final stage, where the LR rapidly decays - with some scaling rate (set by decay rate). - To attain Noam decay, use 0.5, - for Squeezeformer recommended decay, use 1.0. - The fast decay after prolonged high LR during - hold phase allows for rapid convergence. 
- - References: - - [Squeezeformer: - An Efficient Transformer for Automatic Speech Recognition] - (https://arxiv.org/abs/2206.00888) - - Args: - optimizer: Pytorch compatible Optimizer object. - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - decay_rate: Float value describing the polynomial decay - after the hold period. Default value - of 0.5 corresponds to Noam decay. - min_lr: Minimum learning rate. - """ - self.decay_rate = decay_rate - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - if self.warmup_steps is None or self.warmup_steps == 0: - raise ValueError( - "Noam scheduler cannot be used without warmup steps") - - if self.hold_steps > 0: - hold_steps = self.hold_steps - self.warmup_steps - else: - hold_steps = 0 - - new_lrs = [ - _noam_hold_annealing( - initial_lr, - step=step, - warmup_steps=self.warmup_steps, - hold_steps=hold_steps, - decay_rate=self.decay_rate, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - def set_step(self, step: int): - self.last_epoch = step diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/README.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/README.md deleted file mode 100644 index 1555e465ef036790ae2067bf2a54a6a3e0f8b7a9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/README.md +++ /dev/null @@ -1,12 +0,0 @@ -# Performance Record - -## Conformer Result - -* Feature info: using fbank feature, dither, cmvn, without speed perturb (not supported segments yet) -* Training info: lr 0.001, batch size 20, 8 gpu, acc_grad 1, 240 epochs, dither 0.1 -* Decoding info: ctc_weight 0.5, average_num 10 - - -| decoding mode | Dev WER | Test WER | -|---------------------|---------|----------| -| attention rescoring | 9.54% | 8.66% | \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/conf/train_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/conf/train_conformer.yaml deleted file mode 100644 index a610ea5fb7829e44a201f45845d415a14f16bd38..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/conf/train_conformer.yaml +++ /dev/null @@ -1,77 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 31 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - 
src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -dataset_conf: - filter_conf: - max_length: 2000 - min_length: 10 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 3 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 20 - -grad_clip: 5 -accum_grad: 1 -max_epoch: 240 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/local/download_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/local/download_data.sh deleted file mode 100644 index 0ac85fcbd14a257ae73b6a2b81dbf6cef021bfea..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/local/download_data.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2014 Nickolay V. Shmyrev -# 2014 Brno University of Technology (Author: Karel Vesely) -# 2016 John Hopkins University (author: Daniel Povey) -# Apache 2.0 - -mkdir -p db - -cd db ### Note: the rest of this script is executed from the directory 'db'. - -# TED-LIUM database: -if [[ $(hostname -f) == *.clsp.jhu.edu ]] ; then - if [ ! -e TEDLIUM_release-3 ]; then - ln -sf /export/corpora5/TEDLIUM_release-3 - fi - echo "$0: linking the TEDLIUM data from /export/corpora5/TEDLIUM_release-3" -else - if [ ! -e TEDLIUM_release-3 ]; then - echo "$0: downloading TEDLIUM_release2 data (it won't re-download if it was already downloaded.)" - # the following command won't re-get it if it's already there - # because of the --continue switch. - wget --continue http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz || exit 1 - tar xf "TEDLIUM_release-3.tgz" - else - echo "$0: not downloading or un-tarring TEDLIUM_release2 because it already exists." - fi -fi - - -num_sph=$(find -L TEDLIUM_release-3/legacy -name '*.sph' | wc -l) -# We mainly use TED-LIUM 3 "legacy" distribution, on which the dev and test datasets are the same as in TED-LIUM 2 (and TED-LIUM1). -# It contains 2351 sph files for training and 19 sph files for dev/test (total 2370). -# Because the "legacy" contains symbolic links to "data", we use `find -L`. -if [ "$num_sph" != 2370 ]; then - echo "$0: expected to find 2370 .sph files in the directory db/TEDLIUM_release3/legacy, found $num_sph" - exit 1 -fi - -exit 0 - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/local/join_suffix.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/local/join_suffix.py deleted file mode 100644 index e496c4d074144d6d99a2affc5ee72286d35542ef..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/local/join_suffix.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python3 -# -# Copyright 2014 Nickolay V. 
Shmyrev -# 2016 Johns Hopkins University (author: Daniel Povey) -# Apache 2.0 - - -import sys - -# This script joins together pairs of split-up words like "you 're" -> "you're". -# The TEDLIUM transcripts are normalized in a way that's not traditional for -# speech recognition. - -prev_line = "" -for line in sys.stdin: - if line == prev_line: - continue - items = line.split() - new_items = [] - i = 0 - while i < len(items): - if i < len(items) - 1 and items[i + 1][0] == "'": - new_items.append(items[i] + items[i + 1]) - i = i + 1 - else: - new_items.append(items[i]) - i = i + 1 - print(" ".join(new_items)) - prev_line = line diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/local/prepare_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/local/prepare_data.sh deleted file mode 100644 index f5a22c2c2913a9eb916c2347aad9b2282ec3cf3c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/local/prepare_data.sh +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env bash -# -# Copyright 2014 Nickolay V. Shmyrev -# 2014 Brno University of Technology (Author: Karel Vesely) -# 2016 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - -# To be run from one directory above this script. - -. ./path.sh - -export LC_ALL=C - -sph2pipe=sph2pipe - -data_type=$1 - -# Prepare: test, train, -for set in dev test train; do - dir=data/$set.orig - mkdir -p $dir - - # Merge transcripts into a single 'stm' file, do some mappings: - # - -> : map dev stm labels to be coherent with train + test, - # - -> : --||-- - # - (2) -> null : remove pronunciation variants in transcripts, keep in dictionary - # - -> null : remove marked , it is modelled implicitly (in kaldi) - # - (...) 
-> null : remove utterance names from end-lines of train - # - it 's -> it's : merge words that contain apostrophe (if compound in dictionary, local/join_suffix.py) - { # Add STM header, so sclite can prepare the '.lur' file - echo ';; -;; LABEL "o" "Overall" "Overall results" -;; LABEL "f0" "f0" "Wideband channel" -;; LABEL "f2" "f2" "Telephone channel" -;; LABEL "male" "Male" "Male Talkers" -;; LABEL "female" "Female" "Female Talkers" -;;' - # Process the STMs - cat db/TEDLIUM_release-3/${data_type}/$set/stm/*.stm | sort -k1,1 -k2,2 -k4,4n | \ - sed -e 's:::' \ - -e 's:::' \ - -e 's:([0-9])::g' \ - -e 's:::g' \ - -e 's:([^ ]*)$::' | \ - awk '{ $2 = "A"; print $0; }' - } | local/join_suffix.py > data/$set.orig/stm - - # Prepare 'text' file - # - {NOISE} -> [NOISE] : map the tags to match symbols in dictionary - cat $dir/stm | grep -v -e 'ignore_time_segment_in_scoring' -e ';;' | \ - awk '{ printf ("%s-%07d-%07d", $1, $4*100, $5*100); - for (i=7;i<=NF;i++) { printf(" %s", $i); } - printf("\n"); - }' | tr '{}' '[]' | sort -k1,1 > $dir/text || exit 1 - - # Prepare 'segments', 'utt2spk', 'spk2utt' - cat $dir/text | cut -d" " -f 1 | awk -F"-" '{printf("%s %s %07.2f %07.2f\n", $0, $1, $2/100.0, $3/100.0)}' > $dir/segments - cat $dir/segments | awk '{print $1, $2}' > $dir/utt2spk - cat $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt - - # Prepare 'wav.scp', 'reco2file_and_channel' - cat $dir/spk2utt | awk -v data_type=$data_type -v set=$set -v pwd=$PWD '{ printf("%s %s/db/TEDLIUM_release-3/%s/%s/sph/%s.sph\n", $1, pwd, data_type, set, $1); }' > $dir/wav.scp - cat $dir/wav.scp | awk '{ print $1, $1, "A"; }' > $dir/reco2file_and_channel - - # Create empty 'glm' file - echo ';; empty.glm - [FAKE] => %HESITATION / [ ] __ [ ] ;; hesitation token - ' > data/$set.orig/glm - - # The training set seems to not have enough silence padding in the segmentations, - # especially at the beginning of segments. Extend the times. - if [ $set == "train" ]; then - mv data/$set.orig/segments data/$set.orig/segments.temp - utils/data/extend_segment_times.py --start-padding=0.15 \ - --end-padding=0.1 data/$set.orig/segments || exit 1 - rm data/$set.orig/segments.temp - fi - - # Check that data dirs are okay! - utils/validate_data_dir.sh --no-feats $dir || exit 1 -done - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/path.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/path.sh deleted file mode 100644 index 73fc1c56602086182f66201870e28d46a0cada55..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export WENET_DIR=$PWD/../../.. 
-export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build -export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix -export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/run.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/run.sh deleted file mode 100644 index 9697e02f1af3249d2de70fcbb42aa13e3dcef483..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/run.sh +++ /dev/null @@ -1,216 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -# Use this to control how many gpu you use, It's 1-gpu training if you specify -# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO -stage=0 # start from 0 if you need to start from data preparation -stop_stage=5 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training -num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. Default 0 -node_rank=0 - - -nj=16 -feat_dir=raw_wav - -data_type=raw # raw or shard -num_utts_per_shard=1000 - -data_cat=legacy - -train_set=train -train_config=conf/train_conformer.yaml -cmvn=true -dir=exp/conformer -checkpoint= - -# bpemode (unigram or bpe) -nbpe=500 -bpemode=unigram - - -# use average_checkpoint will get better result -average_checkpoint=true -decode_checkpoint=$dir/final.pt -average_num=10 -decode_modes="ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring" - -. tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - echo "stage -1: Data Download" - local/download_data.sh # make soft link by yourself if you already have the dataset -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Data preparation - local/prepare_data.sh $data_cat - for dset in dev test train; do - utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 \ - data/${dset}.orig data/${dset} - done -fi - - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # For wav feature, just copy the data. Fbank extraction is done in training - mkdir -p $feat_dir - for x in ${train_set} dev test; do - cp -r data/$x $feat_dir - done - tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \ - --in_scp data/${train_set}/wav.scp \ - --out_cmvn $feat_dir/$train_set/global_cmvn - -fi - -dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt -bpemodel=data/lang_char/${train_set}_${bpemode}${nbpe} -echo "dictionary: ${dict}" -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - ### Task dependent. You have to check non-linguistic symbols used in the corpus. 
- echo "stage 2: Dictionary and Json Data Preparation" - mkdir -p data/lang_char/ - - echo " 0" > ${dict} # 0 will be used for "blank" in CTC - echo " 1" >> ${dict} # must be 1 - - # we borrowed these code and scripts which are related bpe from ESPnet. - cut -f 2- -d" " data/${train_set}/text > data/lang_char/input.txt - tools/spm_train --input=data/lang_char/input.txt --vocab_size=${nbpe} \ - --model_type=${bpemode} --model_prefix=${bpemodel} \ - --input_sentence_size=100000000 - tools/spm_encode --model=${bpemodel}.model \ - --output_format=piece < data/lang_char/input.txt | \ - tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict} - num_token=$(cat $dict | wc -l) - echo " $num_token" >> $dict # - wc -l ${dict} -fi - - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - echo "Prepare data, prepare required format" - if [ ! -f $feat_dir/$train_set/segments ]; then - echo "$0: No such file segments" && exit 1; - else - for x in dev test ${train_set}; do - tools/make_raw_list.py --segments $feat_dir/$x/segments \ - $feat_dir/$x/wav.scp $feat_dir/$x/text $feat_dir/$x/data.list - done - fi -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - # Training - mkdir -p $dir - INIT_FILE=$dir/ddp_init - # You had better rm it manually before you start run.sh on first node. - # rm -f $INIT_FILE # delete old one before starting - init_method=file://$(readlink -f $INIT_FILE) - echo "$0: init method is $init_method" - # The number of gpus runing on each node/machine - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - # Use "nccl" if it works, otherwise use "gloo" - dist_backend="nccl" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" - cmvn_opts= - $cmvn && cp ${feat_dir}/${train_set}/global_cmvn $dir - $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" - # train.py will write $train_config to $dir/train.yaml with model input - # and output dimension, train.yaml will be used for inference or model - # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type $data_type \ - --symbol_table $dict \ - --bpe_model $bpemodel.model \ - --train_data $feat_dir/$train_set/data.list \ - --cv_data $feat_dir/dev/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ - --ddp.dist_backend $dist_backend \ - --num_workers 8 \ - $cmvn_opts \ - --pin_memory - } & - done - wait -fi - -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # Test model, please specify the model you want to test by --checkpoint - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$dir/avg_${average_num}.pt - echo "do model average and final checkpoint is $decode_checkpoint" - python wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $dir \ - --num ${average_num} \ - --val_best - fi - # Specify decoding_chunk_size if it's a unified dynamic chunk trained model - # -1 for full chunk - decoding_chunk_size= - ctc_weight=0.5 - reverse_weight=0.0 - for mode in ${decode_modes}; do - { - test_dir=$dir/test_${mode} - mkdir -p $test_dir - python wenet/bin/recognize.py --gpu 0 \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type $data_type \ - --test_data $feat_dir/test/data.list \ - --checkpoint $decode_checkpoint \ - --beam_size 10 \ - --batch_size 1 \ - --penalty 0.0 \ - --dict $dict \ - --bpe_model $bpemodel.model \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --result_file $test_dir/text \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - python tools/compute-wer.py --char=1 --v=1 \ - $feat_dir/test/text $test_dir/text > $test_dir/wer - } & - done - wait -fi - -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # Export the best model you want - python wenet/bin/export_jit.py \ - --config $dir/train.yaml \ - --checkpoint $dir/avg_${average_num}.pt \ - --output_file $dir/final.zip -fi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/alignment.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/alignment.sh deleted file mode 100644 index 64d860bb61761cadca750c9baf91eddb49e56728..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/alignment.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -stage=0 # start from 0 if you need to start from data preparation -stop_stage=0 - -nj=16 -feat_dir=raw_wav -dict=data/dict/lang_char.txt - -dir=exp/ -config=$dir/train.yaml -checkpoint= -checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt -config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml -set= -ali_format=$feat_dir/$set/format.data -ali_format=format.data -ali_result=$dir/ali - -. 
tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - nj=32 - # Prepare required data for ctc alignment - echo "Prepare data, prepare required format" - for x in $set; do - tools/format_data.sh --nj ${nj} \ - --feat-type wav --feat $feat_dir/$x/wav.scp \ - $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp - - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Test model, please specify the model you want to use by --checkpoint - python wenet/bin/alignment_deprecated.py --gpu -1 \ - --config $config \ - --input_file $ali_format \ - --checkpoint $checkpoint \ - --batch_size 1 \ - --dict $dict \ - --result_file $ali_result \ - -fi - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/analyze_dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/analyze_dataset.py deleted file mode 100644 index d4373b065c301972fe0164b6df3591166000acfc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/analyze_dataset.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Analyze Dataset, Duration/TextLength/Speed etc. - -Usage: -. ./path.sh && python3 tools/analyze_dataset.py \ - --data_type "shard" \ - --data_list data/test/data.list \ - --output_dir exp/analyze_test \ - --num_thread 32 -""" - -import os -import json -import math -import time -import numpy -import logging -import librosa -import tarfile -import argparse -import torchaudio -import multiprocessing - -from wenet.utils.file_utils import read_lists -from wenet.dataset.processor import AUDIO_FORMAT_SETS - - -def get_args(): - parser = argparse.ArgumentParser(description='Analyze dataset') - parser.add_argument('--data_type', - default='wav_scp', - choices=['wav_scp', 'raw', 'shard'], - help='dataset type') - parser.add_argument('--output_dir', type=str, - default="exp", help='write info to output dir') - parser.add_argument('--data_list', default=None, - help='used in raw/shard mode') - parser.add_argument('--wav_scp', default=None, - help='used in wav_scp mode') - parser.add_argument('--text', default=None, - help='used in wav_scp mode') - parser.add_argument('--num_thread', type=int, - default=4, help='number of threads') - args = parser.parse_args() - print(args) - return args - - -def analyze(datas, output_file, thread_id): - with open(output_file, "w", encoding='utf8') as f: - for i, data in enumerate(datas): - if type(data['wav']) is numpy.ndarray: - y, sample_rate = data['wav'], data['sample_rate'] - data['wav'] = "None" # NOTE(xcsong): Do not save wav. 
- elif type(data['wav'] is str): - y, sample_rate = librosa.load(data['wav'], sr=16000) - data['dur'] = len(y) / sample_rate - data['txt_length'] = len(data['txt']) - data['speed'] = data['txt_length'] / data['dur'] - # Trim the beginning and ending silence - _, index = librosa.effects.trim(y, top_db=30) - data['leading_sil'] = librosa.get_duration( - y=y[:index[0]], sr=16000) * 1000 if index[0] > 0 else 0 - data['trailing_sil'] = librosa.get_duration( - y=y[index[1]:], sr=16000) * 1000 if index[1] < len(y) else 0 - data_str = json.dumps(data, ensure_ascii=False) - f.write("{}\n".format(data_str)) - if thread_id == 0 and i % 100 == 0: - logging.info("\tThread-{}: processed {}/{}".format( - thread_id, i, len(datas))) - - -def read_tar(file): - try: - with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream: - prev_prefix = None - data = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - data['key'] = prev_prefix - if valid: - yield data - data = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - data['txt'] = file_obj.read().decode( - 'utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load( - file_obj) - # single channel - data['wav'] = waveform.numpy()[0, :] - data['sample_rate'] = sample_rate - else: - data[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning( - 'error: {} when parse {}'.format(ex, name)) - prev_prefix = prefix - # The last data in tar - if prev_prefix is not None: - data['key'] = prev_prefix - yield data - except Exception as ex: - logging.warning( - 'tar_file error: {} when processing {}'.format(ex, file)) - - -def main(): - start_time = time.time() - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.makedirs(args.output_dir, exist_ok=True) - os.makedirs(args.output_dir + "/partition", exist_ok=True) - datas = [[] for i in range(args.num_thread)] - - logging.info("Stage-1: Loading data.list OR wav.scp...") - if args.data_type == "shard": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - total = 0 - for line in lists: - for data in read_tar(line): - datas[total % args.num_thread].append(data) - total = total + 1 - elif args.data_type == "raw": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - for i, line in enumerate(lists): - data = json.loads(line) - datas[i % args.num_thread].append(data) - elif args.data_type == "wav_scp": - assert args.wav_scp is not None - assert args.text is not None - wavs, texts = {}, {} - # wavs - for line in read_lists(args.wav_scp): - line = line.strip().split() - wavs[line[0]] = line[1] - # texts - for line in read_lists(args.text): - line = line.strip().split(maxsplit=1) - texts[line[0]] = line[1] - sorted(wavs) - sorted(texts) - # partition - for i, (key1, key2) in enumerate(zip(wavs, texts)): - assert key1 == key2 - datas[i % args.num_thread].append( - {'key': key1, "wav": wavs[key1], "txt": texts[key1]} - ) - - logging.info("Stage-2: Start Analyze") - # threads - pool = multiprocessing.Pool(processes=args.num_thread) - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - pool.apply_async(analyze, (datas[i], output_file, i)) - pool.close() - 
pool.join() - - logging.info("Stage-3: Sort and Write Result") - datas = [] - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - with open(output_file, "r", encoding='utf8') as f: - for line in f.readlines(): - data = json.loads(line) - datas.append(data) - total_dur = sum([x['dur'] for x in datas]) - total_len = sum([x['txt_length'] for x in datas]) - total_leading_sil = sum([x['leading_sil'] for x in datas]) - total_trailing_sil = sum([x['trailing_sil'] for x in datas]) - num_datas = len(datas) - names = ['key', 'dur', 'txt_length', 'speed', - 'leading_sil', 'trailing_sil'] - units = ['', 's', '', 'char/s', 'ms', 'ms'] - avgs = [0, total_dur / num_datas, total_len / num_datas, - total_len / total_dur, total_leading_sil / num_datas, - total_trailing_sil / num_datas] - stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]), - sum([(x['txt_length'] - avgs[2])**2 for x in datas]), - sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]), - sum([(x['leading_sil'] - avgs[4])**2 for x in datas]), - sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])] - stds = [math.sqrt(x / num_datas) for x in stds] - parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min'] - index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - - with open(args.output_dir + "/analyze_result_brief", - "w", encoding='utf8') as f: - for i, (name, unit, avg, std) in enumerate( - zip(names, units, avgs, stds)): - if name == 'key': - continue - f.write("==================\n") - - datas.sort(key=lambda x: x[name]) - for p, j in zip(parts, index): - f.write("{} {}: {:.3f} {} (wav_id: {})\n".format( - p, name, datas[j][name], unit, datas[j]['key'])) - f.write("avg {}: {:.3f} {}\n".format( - name, avg, unit)) - f.write("std {}: {:.3f}\n".format( - name, std)) - os.system("cat {}".format(args.output_dir + "/analyze_result_brief")) - - datas.sort(key=lambda x: x['dur']) - with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f: - for data in datas: - f.write("{}\n".format(json.dumps(data, ensure_ascii=False))) - - end_time = time.time() - logging.info("Time Cost: {:.3f}s".format(end_time - start_time)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/cmvn_kaldi2json.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/cmvn_kaldi2json.py deleted file mode 100644 index 9966046c95a9d50438c4857b785cb7985182e376..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/cmvn_kaldi2json.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import logging -import sys -import json - -def kaldi2json(kaldi_cmvn_file): - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - cmvn_info = 
{'mean_stat:' : means, - 'var_stat' : variance, - 'frame_num' : count} - return cmvn_info - -if __name__ == '__main__': - with open(sys.argv[2], 'w') as fout: - cmvn = kaldi2json(sys.argv[1]) - fout.write(json.dumps(cmvn)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/combine_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/combine_data.sh deleted file mode 100644 index 8a56c43f1a2a238d78270f94f3d22f1af540e912..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/combine_data.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2014 David Snyder - -# This script combines the data from multiple source directories into -# a single destination directory. - -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information -# about what these directories contain. - -# Begin configuration section. -extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." -skip_fix=false # skip the fix_data_dir.sh in the end -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi - -if [ $# -lt 2 ]; then - echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." - echo "Note, files that don't appear in all source dirs will not be combined," - echo "with the exception of utt2uniq and segments, which are created where necessary." - exit 1 -fi - -dest=$1; -shift; - -first_src=$1; - -rm -r $dest 2>/dev/null -mkdir -p $dest; - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/utt2spk ]; then - echo "$0: no such file $dir/utt2spk" - exit 1; - fi -done - -# Check that frame_shift are compatible, where present together with features. -dir_with_frame_shift= -for dir in $*; do - if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then - if [[ $dir_with_frame_shift ]] && - ! cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then - echo "$0:error: different frame_shift in directories $dir and " \ - "$dir_with_frame_shift. Cannot combine features." - exit 1; - fi - dir_with_frame_shift=$dir - fi -done - -# W.r.t. utt2uniq file the script has different behavior compared to other files -# it is not compulsary for it to exist in src directories, but if it exists in -# even one it should exist in all. We will create the files where necessary -has_utt2uniq=false -for in_dir in $*; do - if [ -f $in_dir/utt2uniq ]; then - has_utt2uniq=true - break - fi -done - -if $has_utt2uniq; then - # we are going to create an utt2uniq file in the destdir - for in_dir in $*; do - if [ ! -f $in_dir/utt2uniq ]; then - # we assume that utt2uniq is a one to one mapping - cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' - else - cat $in_dir/utt2uniq - fi - done | sort -k1 > $dest/utt2uniq - echo "$0: combined utt2uniq" -else - echo "$0 [info]: not combining utt2uniq as it does not exist" -fi -# some of the old scripts might provide utt2uniq as an extrafile, so just remove it -extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") - -# segments are treated similarly to utt2uniq. If it exists in some, but not all -# src directories, then we generate segments where necessary. 
-has_segments=false -for in_dir in $*; do - if [ -f $in_dir/segments ]; then - has_segments=true - break - fi -done - -if $has_segments; then - for in_dir in $*; do - if [ ! -f $in_dir/segments ]; then - echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 - utils/data/get_segments_for_data.sh $in_dir - else - cat $in_dir/segments - fi - done | sort -k1 > $dest/segments - echo "$0: combined segments" -else - echo "$0 [info]: not combining segments as it does not exist" -fi - -for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do - exists_somewhere=false - absent_somewhere=false - for d in $*; do - if [ -f $d/$file ]; then - exists_somewhere=true - else - absent_somewhere=true - fi - done - - if ! $absent_somewhere; then - set -o pipefail - ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; - set +o pipefail - echo "$0: combined $file" - else - if ! $exists_somewhere; then - echo "$0 [info]: not combining $file as it does not exist" - else - echo "$0 [info]: **not combining $file as it does not exist everywhere**" - fi - fi -done - -tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt - -if [[ $dir_with_frame_shift ]]; then - cp $dir_with_frame_shift/frame_shift $dest -fi - -if ! $skip_fix ; then - tools/fix_data_dir.sh $dest || exit 1; -fi - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/compute-cer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/compute-cer.py deleted file mode 100644 index a0a8f8fe1f59251c5d8fefeb62ef469276fc6063..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/compute-cer.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import sys -import unicodedata -import codecs - -remove_tag = True -spacelist = [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - # https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': - sep = '>' - j = i + 1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c == sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: - return '' - chars = [] - i = 0 - T = len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - if x.isalnum(): - for k in x: - new_sentence.append(k) - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i - 1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j - 1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, - 'ins': 0, 'del': 0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = 
result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i={i} , j={j} , \ - error={error}'. - format(i=i, j=j, error=self.space[i][j]['error'])) - return result - - def overall(self) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def cluster(self, data) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [unicodedata.name(char) for char in word] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names) - 1) : - if unicode_names[i] != unicode_names[i + 1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) \ - and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] \ - [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \ - [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose = 1 - padding_symbol = ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose = 0 - try: - verbose = int(b) - except Exception: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol = ' ' - elif b == 'underline': - padding_symbol = '_' - continue - if True or sys.argv[1].startswith('-'): - # ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig = set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, - case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : 
- if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array) == 0: - continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length - len_lab) - space['rec'].append(length - len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end=' ') - else: - print('lab:', end=' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['lab'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end=' ') - else: - print('rec:', end=' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['rec'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===================================================' - '========================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster(k for k in default_clusters[cluster_id]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 
100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif (token[0] == '<' and token[len(token) - 1] == '>' and - cluster_id == ''): - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('=======================================' - '====================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/compute-wer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/compute-wer.py deleted file mode 100644 index a3eefc0dc7b67f252e685da71a5189312e74ef85..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/compute-wer.py +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import re, sys, unicodedata -import codecs - -remove_tag = True -spacelist= [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - #https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - 
j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array)==0: continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) 
- else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===========================================================================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster([ k for k in default_clusters[cluster_id] ]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - 
print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif token[0] == '<' and token[len(token)-1] == '>' and \ - cluster_id == '' : - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('===========================================================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/compute_cmvn_stats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/compute_cmvn_stats.py deleted file mode 100644 index 9c89789c47be0c855939469e86040f10398e9d89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/compute_cmvn_stats.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys -import argparse -import json -import codecs -import yaml - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.utils.data import Dataset, DataLoader - -torchaudio.set_audio_backend("sox_io") - - -class CollateFunc(object): - ''' Collate function for AudioDataset - ''' - - def __init__(self, feat_dim, resample_rate): - self.feat_dim = feat_dim - self.resample_rate = resample_rate - pass - - def __call__(self, batch): - mean_stat = torch.zeros(self.feat_dim) - var_stat = torch.zeros(self.feat_dim) - number = 0 - for item in batch: - value = item[1].strip().split(",") - assert len(value) == 3 or len(value) == 1 - wav_path = value[0] - sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate - resample_rate = sample_rate - # len(value) == 3 means segmented wav.scp, - # len(value) == 1 means original wav.scp - if len(value) == 3: - start_frame = int(float(value[1]) * sample_rate) - end_frame = int(float(value[2]) * sample_rate) - waveform, sample_rate = torchaudio.backend.sox_io_backend.load( - filepath=wav_path, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(item[1]) - - waveform = waveform * (1 << 15) - if self.resample_rate != 0 and self.resample_rate != sample_rate: - resample_rate = self.resample_rate - waveform = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - - mat = kaldi.fbank(waveform, - num_mel_bins=self.feat_dim, - dither=0.0, - energy_floor=0.0, - sample_frequency=resample_rate) - mean_stat += torch.sum(mat, axis=0) - var_stat += torch.sum(torch.square(mat), axis=0) - number += mat.shape[0] - return number, mean_stat, var_stat - - -class AudioDataset(Dataset): - def __init__(self, data_file): - self.items = [] - with codecs.open(data_file, 'r', encoding='utf-8') as f: - for line in f: - arr = line.strip().split() - self.items.append((arr[0], arr[1])) - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - return self.items[idx] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='extract CMVN stats') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for processing') - parser.add_argument('--train_config', - default='', - help='training yaml conf') - parser.add_argument('--in_scp', default=None, help='wav scp file') - parser.add_argument('--out_cmvn', - 
default='global_cmvn', - help='global cmvn file') - - doc = "Print log after every log_interval audios are processed." - parser.add_argument("--log_interval", type=int, default=1000, help=doc) - args = parser.parse_args() - - with open(args.train_config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - resample_rate = 0 - if 'resample_conf' in configs['dataset_conf']: - resample_rate = configs['dataset_conf']['resample_conf']['resample_rate'] - print('using resample and new sample rate is {}'.format(resample_rate)) - - collate_func = CollateFunc(feat_dim, resample_rate) - dataset = AudioDataset(args.in_scp) - batch_size = 20 - data_loader = DataLoader(dataset, - batch_size=batch_size, - shuffle=True, - sampler=None, - num_workers=args.num_workers, - collate_fn=collate_func) - - with torch.no_grad(): - all_number = 0 - all_mean_stat = torch.zeros(feat_dim) - all_var_stat = torch.zeros(feat_dim) - wav_number = 0 - for i, batch in enumerate(data_loader): - number, mean_stat, var_stat = batch - all_mean_stat += mean_stat - all_var_stat += var_stat - all_number += number - wav_number += batch_size - - if wav_number % args.log_interval == 0: - print(f'processed {wav_number} wavs, {all_number} frames', - file=sys.stderr, - flush=True) - - cmvn_info = { - 'mean_stat': list(all_mean_stat.tolist()), - 'var_stat': list(all_var_stat.tolist()), - 'frame_num': all_number - } - - with open(args.out_cmvn, 'w') as fout: - fout.write(json.dumps(cmvn_info)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/compute_fbank_feats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/compute_fbank_feats.py deleted file mode 100644 index 4cc7dae54de6e8b24b14148bd3930d19b4d7b28c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/compute_fbank_feats.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging - -import torchaudio -import torchaudio.compliance.kaldi as kaldi - -import wenet.dataset.kaldi_io as kaldi_io - -# The "sox" backends are deprecated and will be removed in 0.9.0 release. 
-# So here we use sox_io backend -torchaudio.set_audio_backend("sox_io") - - -def parse_opts(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--num_mel_bins', - default=80, - type=int, - help='Number of triangular mel-frequency bins') - parser.add_argument('--frame_length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument('--frame_shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument('--dither', - type=int, - default=0.0, - help='Dithering constant (0.0 means no dither)') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_scp', help='wav scp file') - parser.add_argument('out_ark', help='output ark file') - parser.add_argument('out_scp', help='output scp file') - args = parser.parse_args() - return args - - -# wav format: -def load_wav_scp(wav_scp_file): - wav_list = [] - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_list.append((arr[0], arr[1])) - return wav_list - - -# wav format: -def load_wav_scp_dict(wav_scp_file): - wav_dict = {} - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_dict[arr[0]] = arr[1] - return wav_dict - - -# Segments format: -def load_wav_segments(wav_scp_file, segments_file): - wav_dict = load_wav_scp_dict(wav_scp_file) - audio_list = [] - with open(segments_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - key = arr[0] - wav_file = wav_dict[arr[1]] - start = float(arr[2]) - end = float(arr[3]) - audio_list.append((key, wav_file, start, end)) - return audio_list - - -if __name__ == '__main__': - args = parse_opts() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - if args.segments is None: - audio_list = load_wav_scp(args.wav_scp) - else: - audio_list = load_wav_segments(args.wav_scp, args.segments) - - count = 0 - with open(args.out_ark, 'wb') as ark_fout, \ - open(args.out_scp, 'w', encoding='utf8') as scp_fout: - for item in audio_list: - if len(item) == 2: - key, wav_path = item - waveform, sample_rate = torchaudio.load_wav(wav_path) - else: - assert len(item) == 4 - key, wav_path, start, end = item - sample_rate = torchaudio.info(wav_path).sample_rate - frame_offset = int(start * sample_rate) - num_frames = int((end - start) * sample_rate) - waveform, sample_rate = torchaudio.load_wav( - wav_path, frame_offset, num_frames) - - mat = kaldi.fbank(waveform, - num_mel_bins=args.num_mel_bins, - frame_length=args.frame_length, - frame_shift=args.frame_shift, - dither=args.dither, - energy_floor=0.0, - sample_frequency=sample_rate) - mat = mat.detach().numpy() - kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout) - count += 1 - if count % 10000 == 0: - logging.info('Progress {}/{}'.format(count, len(audio_list))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/copy_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/copy_data_dir.sh deleted file mode 100644 index ee880c4c3ca398a58a4e306467c639b0a76310bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/copy_data_dir.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins University (author: 
Daniel Povey) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# feats.scp -# wav.scp -# vad.scp -# spk2utt -# utt2spk -# text -# -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance and/or speaker names. Note, the recording-ids stay the same. -# - - -# begin configuration section -spk_prefix= -utt_prefix= -spk_suffix= -utt_suffix= -validate_opts= # should rarely be needed. -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" - echo "Options" - echo " --spk-prefix= # Prefix for speaker ids, default empty" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --spk-suffix= # Suffix for speaker ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -srcdir=$1 -destdir=$2 - -if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" - exit 1; -fi - -if [ "$destdir" == "$srcdir" ]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -set -e; - -mkdir -p $destdir - -cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map -cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map - -if [ ! -f $srcdir/utt2uniq ]; then - if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then - cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq - fi -else - cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq -fi - -cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ - utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk - -utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt - -if [ -f $srcdir/feats.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp -fi - -if [ -f $srcdir/vad.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp -fi - -if [ -f $srcdir/segments ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments - cp $srcdir/wav.scp $destdir -else # no segments->wav indexed by utt. 
- if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/utt2num_frames ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in frame_shift stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -echo $validate_opts -echo $destdir -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/decode.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/decode.sh deleted file mode 100644 index 1d49b0e48631f4818fb9c464df66904170275a33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/decode.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: binbinzhang@mobvoi.com (Binbin Zhang) -export GLOG_logtostderr=1 -export GLOG_v=2 - -set -e - -nj=1 -chunk_size=-1 -ctc_weight=0.0 -reverse_weight=0.0 -rescoring_weight=1.0 -# For CTC WFST based decoding -fst_path= -dict_path= -acoustic_scale=1.0 -beam=15.0 -lattice_beam=12.0 -min_active=200 -max_active=7000 -blank_skip_thresh=1.0 -length_penalty=0.0 - -. tools/parse_options.sh || exit 1; -if [ $# != 5 ]; then - echo "Usage: $0 [options] " - exit 1; -fi - -if ! which decoder_main > /dev/null; then - echo "decoder_main is not built, please go to runtime/libtorch to build it." - exit 1; -fi - -scp=$1 -label_file=$2 -model_file=$3 -unit_file=$4 -dir=$5 - -mkdir -p $dir/split${nj} - -# Step 1. Split wav.scp -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp" -done -tools/data/split_scp.pl ${scp} ${split_scps} - -# Step 2. Parallel decoding -wfst_decode_opts= -if [ ! 
-z $fst_path ]; then - wfst_decode_opts="--fst_path $fst_path" - wfst_decode_opts="$wfst_decode_opts --beam $beam" - wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path" - wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam" - wfst_decode_opts="$wfst_decode_opts --max_active $max_active" - wfst_decode_opts="$wfst_decode_opts --min_active $min_active" - wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale" - wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh" - wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty" - echo $wfst_decode_opts > $dir/config -fi -for n in $(seq ${nj}); do -{ - decoder_main \ - --rescoring_weight $rescoring_weight \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --chunk_size $chunk_size \ - --wav_scp ${dir}/split${nj}/wav.${n}.scp \ - --model_path $model_file \ - --unit_path $unit_file \ - $wfst_decode_opts \ - --result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log -} & -done -wait - -# Step 3. Merge files -for n in $(seq ${nj}); do - cat ${dir}/split${nj}/${n}.text -done > ${dir}/text -tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf - -# Step 4. Compute WER -python3 tools/compute-wer.py --char=1 --v=1 \ - $label_file $dir/text > $dir/wer diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/feat_to_shape.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/feat_to_shape.sh deleted file mode 100644 index ab6d45c60709dd05a38f8da269d617233d0d39f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/feat_to_shape.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Begin configuration section. -nj=4 -cmd=run.pl -verbose=0 -filetype="" -preprocess_conf="" -# End configuration section. - -help_message=$(cat << EOF -Usage: $0 [options] [] -e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) - -echo "$0 $*" 1>&2 # Print the command line for logging - -. parse_options.sh || exit 1; - -if [ $# -lt 2 ] || [ $# -gt 3 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -scp=$1 -outscp=$2 -data=$(dirname ${scp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${logdir}/feats.${n}.scp" -done - -utils/split_scp.pl ${scp} ${split_scps} - -if [ -n "${preprocess_conf}" ]; then - preprocess_opt="--preprocess-conf ${preprocess_conf}" -else - preprocess_opt="" -fi -if [ -n "${filetype}" ]; then - filetype_opt="--filetype ${filetype}" -else - filetype_opt="" -fi - -${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ - feat-to-len --verbose=${verbose} \ - scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp - -feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -) - -# concatenate the .scp files together. 
-for n in $(seq ${nj}); do - sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp -done > ${outscp} - -rm -f ${logdir}/feats.*.scp 2>/dev/null diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/filter_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/filter_scp.pl deleted file mode 100644 index b76d37f41be0886470281978bfacf97f6b8ae976..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/filter_scp.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose "n-th" field is an utterance id), printing -# out only those lines whose "n-th" field is in id_list. The index of -# the "n-th" field is 1, by default, but can be changed by using -# the -f switch - -$exclude = 0; -$field = 1; -$shifted = 0; - -do { - $shifted=0; - if ($ARGV[0] eq "--exclude") { - $exclude = 1; - shift @ARGV; - $shifted=1; - } - if ($ARGV[0] eq "-f") { - $field = $ARGV[1]; - shift @ARGV; shift @ARGV; - $shifted=1 - } -} while ($shifted); - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . - "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . - "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . - "only the lines that were *not* in id_list.\n" . - "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . - "If your older scripts (written before Oct 2014) stopped working and you used the\n" . - "-f option, add 1 to the argument.\n" . - "See also: utils/filter_scp.pl .\n"; -} - - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -if ($field == 1) { # Treat this as special case, since it is common. - while(<>) { - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { - print $_; - } - } -} else { - while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { - print $_; - } - } -} - -# tests: -# the following should print "foo 1" -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) -# the following should print "bar 2". 
-# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fix_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fix_data_dir.sh deleted file mode 100644 index d1644c1cac4264c78eae7d91b03c4126baf7ec4c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fix_data_dir.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# This script makes sure that only the segments present in -# all of "feats.scp", "wav.scp" [if present], segments [if present] -# text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into -# data-dir/.backup - -cmd="$@" - -utt_extra_files= -spk_extra_files= - -. tools/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: utils/data/fix_data_dir.sh " - echo "e.g.: utils/data/fix_data_dir.sh data/train" - echo "This script helps ensure that the various files in a data directory" - echo "are correctly sorted and filtered, for example removing utterances" - echo "that have no features (if feats.scp is present)" - exit 1 -fi - -data=$1 - -if [ -f $data/images.scp ]; then - image/fix_data_dir.sh $cmd - exit $? -fi - -mkdir -p $data/.backup - -[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; - -[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; - -set -e -o pipefail -u - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted { - file=$1 - sort -k1,1 -u <$file >$file.tmp - if ! cmp -s $file $file.tmp; then - echo "$0: file $1 is not in sorted order or not unique, sorting it" - mv $file.tmp $file - else - rm $file.tmp - fi -} - -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - check_sorted $data/$x - fi -done - - -function filter_file { - filter=$1 - file_to_filter=$2 - cp $file_to_filter ${file_to_filter}.tmp - tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter - if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=$(cat ${file_to_filter}.tmp | wc -l) - length2=$(cat ${file_to_filter} | wc -l) - if [ $length1 -ne $length2 ]; then - echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." - fi - fi - rm $file_to_filter.tmp -} - -function filter_recordings { - # We call this once before the stage when we filter on utterance-id, and once - # after. - - if [ -f $data/segments ]; then - # We have a segments file -> we need to filter this and the file wav.scp, and - # reco2file_and_utt, if it exists, to make sure they have the same list of - # recording-ids. - - if [ ! -f $data/wav.scp ]; then - echo "$0: $data/segments exists but not $data/wav.scp" - exit 1; - fi - awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=$(cat $tmpdir/recordings | wc -l) - [ ! -s $tmpdir/recordings ] && \ - echo "Empty list of recordings (bad file $data/segments)?" 
&& exit 1; - tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp - mv $tmpdir/recordings.tmp $tmpdir/recordings - - - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - filter_file $tmpdir/recordings $data/segments - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - rm $data/segments.tmp - - filter_file $tmpdir/recordings $data/wav.scp - [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur - true - fi -} - -function filter_speakers { - # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... - tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do - f=$data/$s - if [ -f $f ]; then - filter_file $f $tmpdir/speakers - fi - done - - filter_file $tmpdir/speakers $data/spk2utt - tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - - for s in cmvn.scp spk2gender $spk_extra_files; do - f=$data/$s - if [ -f $f ]; then - filter_file $tmpdir/speakers $f - fi - done -} - -function filter_utts { - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - echo "$(cat $tmpdir/utts | wc -l)" - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; - - ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; - - ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ - echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - - if [ -f $data/utt2uniq ]; then - ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ - echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; - fi - - maybe_wav= - maybe_reco2dur= - [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. - [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - - maybe_utt2dur= - if [ -f $data/utt2dur ]; then - cat $data/utt2dur | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1 - maybe_utt2dur=utt2dur.ok - fi - - maybe_utt2num_frames= - if [ -f $data/utt2num_frames ]; then - cat $data/utt2num_frames | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1 - maybe_utt2num_frames=utt2num_frames.ok - fi - - for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do - if [ -f $data/$x ]; then - tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp - echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)" - mv $tmpdir/utts.tmp $tmpdir/utts - # echo "$tmpdir/utts" - fi - done - rm $data/utt2dur.ok 2>/dev/null || true - rm $data/utt2num_frames.ok 2>/dev/null || true - - [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ - rm $tmpdir/utts && exit 1; - - - if [ -f $data/utt2spk ]; then - new_nutts=$(cat $tmpdir/utts | wc -l) - old_nutts=$(cat $data/utt2spk | wc -l) - if [ $new_nutts -ne $old_nutts ]; then - echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" - else - echo "fix_data_dir.sh: kept all $old_nutts utterances." 
- fi - fi - - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then - tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x - fi - fi - done - -} - -filter_recordings -filter_speakers -filter_utts -filter_speakers -filter_recordings - -tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - -echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/flake8_hook.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/flake8_hook.py deleted file mode 100644 index bbe21bf4aa8ab460aca0eba5a24785e4d6b2c39d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -import sys - -from flake8.main import git - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/format_data.sh deleted file mode 100644 index 51f4602dfa0bac7873541c7f621ef4bb9eb29c94..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/format_data.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Mobvoi Corporation (Author: Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -echo "$0 $*" >&2 # Print the command line for logging -. ./path.sh - -nj=1 -cmd=run.pl -nlsyms="" -lang="" -feat="" -feat_type="kaldi" -oov="" -bpecode="" -allow_one_column=false -raw="" -verbose=0 -trans_type=char -filetype="" -preprocess_conf="" -category="" -out="" # If omitted, write in stdout -help_message=$(cat << EOF -Usage: $0 -e.g. $0 data/train data/lang_1char/train_units.txt -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --feat # feat.scp or feat1.scp,feat2.scp,... - --feat-type # kaldi or wav - --oov # Default: - --out # If omitted, write in stdout - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) -. tools/parse_options.sh - -if [ $# != 2 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -dir=$1 -dic=$2 -tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) -#trap 'rm -rf ${tmpdir}' EXIT - -# 1. 
Create scp files for inputs -# These are not necessary for decoding mode, and make it as an option -input= -if [ -n "${feat}" ]; then - _feat_scps=$(echo "${feat}" | tr ',' ' ' ) - read -r -a feat_scps <<< $_feat_scps - num_feats=${#feat_scps[@]} - - for (( i=1; i<=num_feats; i++ )); do - feat=${feat_scps[$((i-1))]} - mkdir -p ${tmpdir}/input_${i} - input+="input_${i} " - cat ${feat} > ${tmpdir}/input_${i}/feat.scp - - # Dump in the "legacy" style JSON format - if [ -n "${filetype}" ]; then - awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ - > ${tmpdir}/input_${i}/filetype.scp - fi - - if [ ${feat_type} == "kaldi" ]; then - tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ - --filetype "${filetype}" \ - --preprocess-conf "${preprocess_conf}" \ - --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp - elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then - if [ -f $dir/segments ]; then - # used for segmented wav.scp - awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur - fi - if [ ! -f $dir/utt2dur ]; then - tools/wav_to_duration.sh --nj ${nj} \ - ${feat} ${tmpdir}/input_${i}/shape.scp - # use the existed utt2dur as shape.scp directly - else - cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp - fi - fi - done -fi - -# 2. Create scp files for outputs -mkdir -p ${tmpdir}/output -if [ -n "${bpecode}" ]; then - if [ "${trans_type}" == "cn_char_en_bpe" ]; then - tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp - else - paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \ - | tools/spm_encode --model=${bpecode} --output_format=piece) \ - > ${tmpdir}/output/token.scp - fi -elif [ -n "${nlsyms}" ]; then - tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -elif [ -n "${raw}" ]; then - cat $dir/text > ${tmpdir}/output/token.scp -else - tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -fi -< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp -odim=$(cat ${dic} | wc -l) -< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp - -cat ${dir}/text > ${tmpdir}/output/text.scp - -# 3. Create scp files for the others -mkdir -p ${tmpdir}/other -if [ -n "${lang}" ]; then - awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp -fi - -if [ -n "${category}" ]; then - awk -v category=${category} '{print $1 " " category}' ${dir}/text \ - > ${tmpdir}/other/category.scp -fi -#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp - -# 4. 
Merge scp files into a one file -opts="" -for intype in ${input} output other; do - if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then - continue - fi - - if [ ${intype} != other ]; then - opts+="--${intype%_*}-scps " - else - opts+="--scps " - fi - - for x in "${tmpdir}/${intype}"/*.scp; do - k=$(basename ${x} .scp) - if [ ${k} = shape ]; then - opts+="shape:${x}:shape " - else - opts+="${k}:${x} " - fi - done -done - -if ${allow_one_column}; then - opts+="--allow-one-column true " -else - opts+="--allow-one-column false " -fi - -if [ -n "${out}" ]; then - opts+="-O ${out}" -fi - -tools/merge_scp2txt.py --verbose ${verbose} ${opts} - -#rm -fr ${tmpdir} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/add_lex_disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/add_lex_disambig.pl deleted file mode 100644 index dd8a25de6e1140a6d19b1e876f2e76f528532edf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/add_lex_disambig.pl +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2013-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. -# With the --pron-probs option, expects the second field -# of each lexicon line to be a pron-prob. -# With the --sil-probs option, expects three additional -# fields after the pron-prob, representing various components -# of the silence probability model. - -$pron_probs = 0; -$sil_probs = 0; -$first_allowed_disambig = 1; - -for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { - if ($ARGV[0] eq "--pron-probs") { - $pron_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--sil-probs") { - $sil_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--first-allowed-disambig") { - $first_allowed_disambig = 0 + $ARGV[1]; - if ($first_allowed_disambig < 1) { - die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; - } - shift @ARGV; - shift @ARGV; - } -} - -if (@ARGV != 2) { - die "Usage: add_lex_disambig.pl [opts] \n" . - "This script adds disambiguation symbols to a lexicon in order to\n" . - "make decoding graphs determinizable; it adds pseudo-phone\n" . - "disambiguation symbols #1, #2 and so on at the ends of phones\n" . - "to ensure that all pronunciations are different, and that none\n" . - "is a prefix of another.\n" . - "It prints to the standard output the number of the largest-numbered" . - "disambiguation symbol that was used.\n" . - "\n" . - "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . 
- " --sil-probs [should be with --pron-probs option]\n" . - " Expect 3 extra fields after the pron-probs, for aspects of\n" . - " the silence probability model\n" . - " --first-allowed-disambig The number of the first disambiguation symbol\n" . - " that this script is allowed to add. By default this is\n" . - " #1, but you can set this to a larger value using this option.\n" . - "e.g.:\n" . - " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { - $p = shift @A; - if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } - } - if ($sil_probs) { - $silp = shift @A; - if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - } - if (!(@A)) { - die "Bad lexicon line $1, no phone in phone list"; - } - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that it exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { shift @A; } # remove pron-prob. - if ($sil_probs) { - shift @A; # Remove silprob - shift @A; # Remove silprob - } - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -# max_disambig will always be the highest-numbered disambiguation symbol that -# has been used so far. -$max_disambig = $first_allowed_disambig - 1; - -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - if ($pron_probs) { - $pron_prob = shift @A; - } - if ($sil_probs) { - $sil_word_prob = shift @A; - $word_sil_correction = shift @A; - $prev_nonsil_correction = shift @A - } - $phnseq = join(" ", @A); - if (!defined $issubseq{$phnseq} - && $count{$phnseq} == 1) { - ; # Do nothing. - } else { - if ($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved_for_the_empty_string{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; - if (!defined $cur_disambig) { - $cur_disambig = $first_allowed_disambig; - } else { - $cur_disambig++; # Get a number that has not been used yet for - # this phone sequence. - } - while (defined $reserved_for_the_empty_string{$cur_disambig}) { - $cur_disambig++; - } - if ($cur_disambig > $max_disambig) { - $max_disambig = $cur_disambig; - } - $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; - $phnseq = $phnseq . " #" . 
$cur_disambig; - } - } - if ($pron_probs) { - if ($sil_probs) { - print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; - } else { - print O "$word\t$pron_prob\t$phnseq\n"; - } - } else { - print O "$word\t$phnseq\n"; - } -} - -print $max_disambig . "\n"; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/compile_lexicon_token_fst.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/compile_lexicon_token_fst.sh deleted file mode 100644 index b67814fe3f3244b14b8e494bfe46c4829c4f8bd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/compile_lexicon_token_fst.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -# Copyright 2015 Yajie Miao (Carnegie Mellon University) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the -# phoneme and character-based lexicons. -set -eo pipefail -. tools/parse_options.sh - -if [ $# -ne 3 ]; then - echo "usage: tools/fst/compile_lexicon_token_fst.sh " - echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" - echo " should contain the following files:" - echo "lexicon.txt units.txt" - echo "options: " - exit 1; -fi - -srcdir=$1 -tmpdir=$2 -dir=$3 -mkdir -p $dir $tmpdir - -[ -f path.sh ] && . ./path.sh - -export LC_ALL=C - -cp $srcdir/units.txt $dir - -# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. -# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. -perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; - -# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. -# Without these symbols, determinization will fail. -ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` -ndisambig=$[$ndisambig+1]; - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list - -# Get the full list of CTC tokens used in FST. These tokens include , the blank , -# the actual model unit, and the disambiguation symbols. -cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list -(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt - -# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, -# so here use ctc_token_fst_compact -tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; - -# Encode the words with indices. 
Will be used in lexicon and language model FST compiling. -cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' - BEGIN { - print "<eps> 0"; - } - { - printf("%s %d\n", $1, NR); - } - END { - printf("#0 %d\n", NR+1); - printf("<s> %d\n", NR+2); - printf("</s> %d\n", NR+3); - }' > $dir/words.txt || exit 1; - -# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time. -token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` - -tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ - fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; - -echo "Lexicon and token FSTs compiling succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/ctc_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/ctc_token_fst.py deleted file mode 100644 index d81644b9cd216177a10a17772781d3293abe084f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/ctc_token_fst.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 1 <eps> <eps>') -print('1 1 <blank> <eps>') -print('2 2 <blank> <eps>') -print('2 0 <eps> <eps>') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 3 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '<eps>' or phone == '<blank>': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '<eps>', phone)) - else: - print('{} {} {} {}'.format(1, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '<eps>')) - print('{} {} {} {}'.format(node, 2, '<eps>', '<eps>')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/ctc_token_fst_compact.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/ctc_token_fst_compact.py deleted file mode 100644 index d3018d8b14ce25108cb1acc637cecded5d41be13..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/ctc_token_fst_compact.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 <blank> <eps>') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 1 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '<eps>' or phone == '<blank>': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '<eps>', phone)) - else: - print('{} {} {} {}'.format(0, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '<eps>')) - print('{} {} {} {}'.format(node, 0, '<eps>', '<eps>')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/ctc_token_fst_corrected.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/ctc_token_fst_corrected.py deleted file mode 100644 index 81f7079eccb9e6447c46cdfdf6378aca7efe4a09..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/ctc_token_fst_corrected.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python - -import sys - - -def il(n): - return n
+ 1 - - -def ol(n): - return n + 1 - - -def s(n): - return n - - -if __name__ == "__main__": - with open(sys.argv[1]) as f: - lines = f.readlines() - phone_count = 0 - disambig_count = 0 - for line in lines: - sp = line.split() - phone = sp[0] - if phone == '' or phone == '': - continue - if phone.startswith('#'): - disambig_count += 1 - else: - phone_count += 1 - - # 1. add start state - print('0 0 {} 0'.format(il(0))) - - # 2. 0 -> i, i -> i, i -> 0 - for i in range(1, phone_count + 1): - print('0 {} {} {}'.format(s(i), il(i), ol(i))) - print('{} {} {} 0'.format(s(i), s(i), il(i))) - print('{} 0 {} 0'.format(s(i), il(0))) - - # 3. i -> other phone - for i in range(1, phone_count + 1): - for j in range(1, phone_count + 1): - if i != j: - print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) - - # 4. add disambiguous arcs on every final state - for i in range(0, phone_count + 1): - for j in range(phone_count + 2, phone_count + disambig_count + 2): - print('{} {} {} {}'.format(s(i), s(i), 0, j)) - - # 5. every i is final state - for i in range(0, phone_count + 1): - print(s(i)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/eps2disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/eps2disambig.pl deleted file mode 100644 index e1d84a6bf56703596a0e4552d184f7168f724bcb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/eps2disambig.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - if (/\s+#0\s+/) { - print STDERR "$0: ERROR: LM has word #0, " . - "which is reserved as disambiguation symbol\n"; - exit 1; - } - s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; - print; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/make_lexicon_fst.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/make_lexicon_fst.pl deleted file mode 100644 index f97129c05cb3ba6460be401e92001261acfaf746..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/make_lexicon_fst.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). - -$pron_probs = 0; - -if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { - $pron_probs = 1; - shift @ARGV; -} - -if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; - print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; - print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; - print STDERR " word phone1 phone2 ... phoneN;\n"; - print STDERR "if the --pron-probs option is used, each line is:\n"; - print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; - print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; - print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; - print STDERR "this is your responsibility.\n\n"; - print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; - print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; - print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; - exit(1); -} - -$lexfn = shift @ARGV; -if (@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2) { - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if ($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - -if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. - while () { - @A = split(" ", $_); - @A == 0 && die "Empty lexicon line."; - foreach $a (@A) { - if ($a eq "") { - die "Bad lexicon line $_ ( is forbidden)"; - } - } - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; # so we only print it on the first arc of the word. - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. 
- if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while () { - @A = split(" ", $_); - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. - $s = $ns; - } elsif (!defined($silphone) || $p ne $silphone) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/make_tlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/make_tlg.sh deleted file mode 100644 index 98694e5540968760f0c27eaf30a6668f4c46c50d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/make_tlg.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# - -if [ -f path.sh ]; then . path.sh; fi - -lm_dir=$1 -src_lang=$2 -tgt_lang=$3 - -arpa_lm=${lm_dir}/lm.arpa -[ ! 
-f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -rm -rf $tgt_lang -cp -r $src_lang $tgt_lang - -# Compose the language model to FST -cat $arpa_lm | \ - grep -v '<s> <s>' | \ - grep -v '</s> <s>' | \ - grep -v '</s> </s>' | \ - grep -v -i '<unk>' | \ - grep -v -i '<spoken_noise>' | \ - arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \ - tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \ - --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst - - -echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic $tgt_lang/G.fst - -# Compose the token, lexicon and language-model FST into the final decoding graph -fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1; -fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1; - -echo "Composing decoding graph TLG.fst succeeded" -#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/prepare_dict.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/prepare_dict.py deleted file mode 100644 index 8a6a3cfe7cfded0c863637deef0bae2f2ede5557..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/prepare_dict.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -# sys.argv[1]: e2e model unit file(lang_char.txt) -# sys.argv[2]: raw lexicon file -# sys.argv[3]: output lexicon file -# sys.argv[4]: bpemodel - -unit_table = set() -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for line in fin: - unit = line.split()[0] - unit_table.add(unit) - - -def contain_oov(units): - for unit in units: - if unit not in unit_table: - return True - return False - - -bpemode = len(sys.argv) > 4 -if bpemode: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.Load(sys.argv[4]) -lexicon_table = set() -with open(sys.argv[2], 'r', encoding='utf8') as fin, \ - open(sys.argv[3], 'w', encoding='utf8') as fout: - for line in fin: - word = line.split()[0] - if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel - continue - elif word == '<SPOKEN_NOISE>': - continue - else: - # each word only has one pronunciation for e2e system - if word in lexicon_table: - continue - if bpemode: - # We assume that the lexicon does not contain code-switch, - # i.e. the word contains both English and Chinese. - # see PR https://github.com/wenet-e2e/wenet/pull/1693 - # and Issue https://github.com/wenet-e2e/wenet/issues/1653 - if word.encode('utf8').isalpha(): - pieces = sp.EncodeAsPieces(word) - else: - pieces = word - if contain_oov(pieces): - print( - 'Ignoring words {}, which contains oov unit'.format( - ''.join(word).strip('▁')) - ) - continue - chars = ' '.join( - [p if p in unit_table else '<unk>' for p in pieces]) - else: - # ignore words with OOV - if contain_oov(word): - print('Ignoring words {}, which contains oov unit'.format(word)) - continue - # Optional, append ▁ in front of english word - # we assume the model unit of our e2e system is char now.
- if word.encode('utf8').isalpha() and '▁' in unit_table: - word = '▁' + word - chars = ' '.join(word) # word is a char list - fout.write('{} {}\n'.format(word, chars)) - lexicon_table.add(word) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/remove_oovs.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/remove_oovs.pl deleted file mode 100644 index ac914c3bd9363eded791cdeb309fd05e980c4f2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/remove_oovs.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script removes lines that contain these OOVs on either the -# third or fourth fields of the line. It is intended to remove arcs -# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). - -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; -} - -$unklist = shift @ARGV; -open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; -while(){ - @A = split(" ", $_); - @A == 1 || die "Bad line in unknown-symbol list: $_"; - $unk{$A[0]} = 1; -} - -$num_removed = 0; -while(<>){ - @A = split(" ", $_); - if(defined $unk{$A[2]} || defined $unk{$A[3]}) { - $num_removed++; - } else { - print; - } -} -print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/rnnt_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/rnnt_token_fst.py deleted file mode 100644 index cc6def1703311ab700a4a01f22c1adda32db9b0d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/rnnt_token_fst.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, 0, phone, phone)) -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/s2eps.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/s2eps.pl deleted file mode 100644 index ffeeb8eb6af3c4f319f31ebff80be388d8f59e1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/fst/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file 
except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces and with (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } - if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } - } - print join("\t", @A) . "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/git-pre-commit b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/git-pre-commit deleted file mode 100644 index b6e448ed375a0ddf502ce332685de8a99e88dc08..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/git-pre-commit +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -e - -echo "Running pre-commit flake8" -python tools/flake8_hook.py diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/install_srilm.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/install_srilm.sh deleted file mode 100644 index 4aa113c14722a73fd3d3f84430025d44173c207b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/install_srilm.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2022 Binbin Zhang(binbzha@qq.com) - -current_path=`pwd` -current_dir=`basename "$current_path"` - -if [ "tools" != "$current_dir" ]; then - echo "You should run this script in tools/ directory!!" - exit 1 -fi - -! command -v gawk > /dev/null && \ - echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; - -srilm_url="https://github.com/BitSpeech/SRILM/archive/refs/tags/1.7.3.tar.gz" - -if [ ! -f ./srilm.tar.gz ]; then - if ! wget -O ./srilm.tar.gz "$srilm_url"; then - echo 'There was a problem downloading the file.' - echo 'Check you internet connection and try again.' - exit 1 - fi -fi - -tar -zxvf srilm.tar.gz -mv SRILM-1.7.3 srilm - -# set the SRILM variable in the top-level Makefile to this directory. -cd srilm -cp Makefile tmpf - -cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \ - > Makefile || exit 1 -rm tmpf - -make || exit -cd .. - -( - [ ! -z "${SRILM}" ] && \ - echo >&2 "SRILM variable is aleady defined. Undefining..." && \ - unset SRILM - - [ -f ./env.sh ] && . ./env.sh - - [ ! 
-z "${SRILM}" ] && \ - echo >&2 "SRILM config is already in env.sh" && exit - - wd=`pwd` - wd=`readlink -f $wd || pwd` - - echo "export SRILM=$wd/srilm" - dirs="\${PATH}" - for directory in $(cd srilm && find bin -type d ) ; do - dirs="$dirs:\${SRILM}/$directory" - done - echo "export PATH=$dirs" -) >> env.sh - -echo >&2 "Installation of SRILM finished successfully" -echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/k2/make_hlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/k2/make_hlg.sh deleted file mode 100644 index 18c2268487410824ae11b199cf06f37acd717c88..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/k2/make_hlg.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) - -lexion_dir=$1 -lm_dir=$2 -tgt_dir=$3 - -# k2 and icefall updates very fast. Below commits are veryfied in this script. -# k2 3dc222f981b9fdbc8061b3782c3b385514a2d444, icefall 499ac24ecba64f687ff244c7d66baa5c222ecf0f - -# For k2 installation, please refer to https://github.com/k2-fsa/k2/ -python -c "import k2; print(k2.__file__)" -python -c "import torch; import _k2; print(_k2.__file__)" - -# Prepare necessary icefall scripts -if [ ! -d tools/k2/icefall ]; then - git clone --depth 1 https://github.com/k2-fsa/icefall.git tools/k2/icefall -fi -pip3 install -r tools/k2/icefall/requirements.txt -export PYTHONPATH=`pwd`/tools/k2/icefall:`pwd`/tools/k2/icefall/egs/aishell/ASR/local:$PYTHONPATH - -# 8.1 Prepare char based lang -mkdir -p $tgt_dir -python tools/k2/prepare_char.py $lexion_dir/units.txt $lm_dir/wordlist $tgt_dir -echo "Compile lexicon L.pt L_disambig.pt succeeded" - -# 8.2 Prepare G -mkdir -p data/lm -python -m kaldilm \ - --read-symbol-table="$tgt_dir/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $lm_dir/lm.arpa > data/lm/G_3_gram.fst.txt - -# 8.3 Compile HLG -python tools/k2/icefall/egs/aishell/ASR/local/compile_hlg.py --lang-dir $tgt_dir -echo "Compile decoding graph HLG.pt succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/k2/prepare_char.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/k2/prepare_char.py deleted file mode 100644 index 6e05042c42eb280135f6be7cdb3566b185258b90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/k2/prepare_char.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -""" - -This script generates the following files in the directory sys.argv[3]: - - - lexicon.txt - - lexicon_disambig.txt - - L.pt - - L_disambig.pt - - tokens.txt - - words.txt -""" - -import sys -from pathlib import Path -from typing import Dict, List - -import k2 -import torch -from prepare_lang import ( - Lexicon, - add_disambig_symbols, - add_self_loops, - write_lexicon, - write_mapping, -) - - -def lexicon_to_fst_no_sil( - lexicon: Lexicon, - token2id: Dict[str, int], - word2id: Dict[str, int], - need_self_loops: bool = False, -) -> k2.Fsa: - """Convert a lexicon to an FST (in k2 format). - - Args: - lexicon: - The input lexicon. See also :func:`read_lexicon` - token2id: - A dict mapping tokens to IDs. - word2id: - A dict mapping words to IDs. - need_self_loops: - If True, add self-loop to states with non-epsilon output symbols - on at least one arc out of the state. The input label for this - self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. - Returns: - Return an instance of `k2.Fsa` representing the given lexicon. - """ - loop_state = 0 # words enter and leave from here - next_state = 1 # the next un-allocated state, will be incremented as we go - - arcs = [] - - # The blank symbol is defined in local/train_bpe_model.py - assert token2id[""] == 0 - assert word2id[""] == 0 - - eps = 0 - - for word, pieces in lexicon: - assert len(pieces) > 0, f"{word} has no pronunciations" - cur_state = loop_state - - word = word2id[word] - pieces = [ - token2id[i] if i in token2id else token2id[""] for i in pieces - ] - - for i in range(len(pieces) - 1): - w = word if i == 0 else eps - arcs.append([cur_state, next_state, pieces[i], w, 0]) - - cur_state = next_state - next_state += 1 - - # now for the last piece of this word - i = len(pieces) - 1 - w = word if i == 0 else eps - arcs.append([cur_state, loop_state, pieces[i], w, 0]) - - if need_self_loops: - disambig_token = token2id["#0"] - disambig_word = word2id["#0"] - arcs = add_self_loops( - arcs, - disambig_token=disambig_token, - disambig_word=disambig_word, - ) - - final_state = next_state - arcs.append([loop_state, final_state, -1, -1, 0]) - arcs.append([final_state]) - - arcs = sorted(arcs, key=lambda arc: arc[0]) - arcs = [[str(i) for i in arc] for arc in arcs] - arcs = [" ".join(arc) for arc in arcs] - arcs = "\n".join(arcs) - - fsa = k2.Fsa.from_str(arcs, acceptor=False) - return fsa - - -def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool: - """Check if all the given tokens are in token symbol table. - - Args: - token_sym_table: - Token symbol table that contains all the valid tokens. - tokens: - A list of tokens. - Returns: - Return True if there is any token not in the token_sym_table, - otherwise False. - """ - for tok in tokens: - if tok not in token_sym_table: - return True - return False - - -def generate_lexicon( - token_sym_table: Dict[str, int], words: List[str] -) -> Lexicon: - """Generate a lexicon from a word list and token_sym_table. - - Args: - token_sym_table: - Token symbol table that mapping token to token ids. - words: - A list of strings representing words. - Returns: - Return a dict whose keys are words and values are the corresponding - tokens. 
- """ - lexicon = [] - for word in words: - chars = list(word.strip(" \t")) - if contain_oov(token_sym_table, chars): - continue - lexicon.append((word, chars)) - - # The OOV word is - lexicon.append(("", [""])) - return lexicon - - -def generate_tokens(text_file: str) -> Dict[str, int]: - """Generate tokens from the given text file. - - Args: - text_file: - A file that contains text lines to generate tokens. - Returns: - Return a dict whose keys are tokens and values are token ids ranged - from 0 to len(keys) - 1. - """ - token2id: Dict[str, int] = dict() - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - char, index = line.replace('\n', '').split() - assert char not in token2id - token2id[char] = int(index) - assert token2id[''] == 0 - return token2id - - -def generate_words(text_file: str) -> Dict[str, int]: - """Generate words from the given text file. - - Args: - text_file: - A file that contains text lines to generate words. - Returns: - Return a dict whose keys are words and values are words ids ranged - from 0 to len(keys) - 1. - """ - words = [] - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - word = line.replace('\n', '') - assert word not in words - words.append(word) - words.sort() - - # We put '' '' at begining of word2id - # '#0', '', '' at end of word2id - words = [word for word in words - if word not in ['', '', '#0', '', '']] - words.insert(0, '') - words.insert(1, '') - words.append('#0') - words.append('') - words.append('') - word2id = {j: i for i, j in enumerate(words)} - return word2id - - -def main(): - token2id = generate_tokens(sys.argv[1]) - word2id = generate_words(sys.argv[2]) - tgt_dir = Path(sys.argv[3]) - - words = [word for word in word2id.keys() - if word not in - ["", "!SIL", "", "", "#0", "", ""]] - lexicon = generate_lexicon(token2id, words) - - lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) - next_token_id = max(token2id.values()) + 1 - for i in range(max_disambig + 1): - disambig = f"#{i}" - assert disambig not in token2id - token2id[disambig] = next_token_id - next_token_id += 1 - - write_mapping(tgt_dir / "tokens.txt", token2id) - write_mapping(tgt_dir / "words.txt", word2id) - write_lexicon(tgt_dir / "lexicon.txt", lexicon) - write_lexicon(tgt_dir / "lexicon_disambig.txt", lexicon_disambig) - - L = lexicon_to_fst_no_sil( - lexicon, - token2id=token2id, - word2id=word2id, - ) - L_disambig = lexicon_to_fst_no_sil( - lexicon_disambig, - token2id=token2id, - word2id=word2id, - need_self_loops=True, - ) - torch.save(L.as_dict(), tgt_dir / "L.pt") - torch.save(L_disambig.as_dict(), tgt_dir / "L_disambig.pt") - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/latency_metrics.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/latency_metrics.py deleted file mode 100644 index df2d8eee45f8e2d7c8536f208d44fafaeac3341f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/latency_metrics.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2022 Horizon Inc. (author: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import argparse -import logging -import librosa -import torch -import torchaudio -import yaml - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.font_manager as fm -import torchaudio.compliance.kaldi as kaldi - -from wenet.utils.init_model import init_model -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.mask import make_pad_mask -from wenet.utils.common import replace_duplicates_with_blank - - -def get_args(): - parser = argparse.ArgumentParser( - description='Analyze latency and plot CTC-Spike.') - parser.add_argument('--config', required=True, - type=str, help='configration') - parser.add_argument('--gpu', - type=int, - default=0, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--ckpt', required=True, - type=str, help='model checkpoint') - parser.add_argument('--tag', required=True, - type=str, help='image subtitle') - parser.add_argument('--wavscp', required=True, - type=str, help='wav.scp') - parser.add_argument('--alignment', required=True, - type=str, help='force alignment, generated by Kaldi.') - parser.add_argument('--chunk_size', required=True, - type=int, help='chunk size') - parser.add_argument('--left_chunks', default=-1, - type=int, help='left chunks') - parser.add_argument('--font', required=True, - type=str, help='font file') - parser.add_argument('--dict', required=True, - type=str, help='dict file') - parser.add_argument('--result_dir', required=True, - type=str, help='saving pdf') - parser.add_argument('--model_type', default='ctc', - choices=['ctc', 'transducer'], - help='show latency metrics from ctc models or rnn-t models') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - torch.manual_seed(777) - - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - symbol_table = read_symbol_table(args.dict) - char_dict = {v: k for k, v in symbol_table.items()} - - # 1. Load model - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - - model = init_model(conf) - load_checkpoint(model, args.ckpt) - model = model.eval().to(device) - - subsampling = model.encoder.embed.subsampling_rate - eos = model.eos_symbol() - - with open(args.wavscp, 'r') as fin: - wavs = fin.readlines() - - # 2. 
Forward model (get streaming_timestamps) - timestamps = {} - for idx, wav in enumerate(wavs): - if idx % 100 == 0: - logging.info("processed {}.".format(idx)) - key, wav = wav.strip().split(' ', 1) - waveform, sr = torchaudio.load(wav) - resample_rate = conf['dataset_conf']['resample_conf']['resample_rate'] - waveform = torchaudio.transforms.Resample( - orig_freq=sr, new_freq=resample_rate)(waveform) - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank( - waveform, - num_mel_bins=conf['dataset_conf']['fbank_conf']['num_mel_bins'], - frame_length=conf['dataset_conf']['fbank_conf']['frame_length'], - frame_shift=conf['dataset_conf']['fbank_conf']['frame_shift'], - dither=0.0, energy_floor=0.0, - sample_frequency=resample_rate, - ) - - speech = mat.unsqueeze(0).to(device) - speech_lengths = torch.tensor([mat.size(0)]).to(device) - - # Let's assume batch_size = 1 - encoder_out, encoder_mask = model.encoder( - speech, speech_lengths, args.chunk_size, args.left_chunks) - - maxlen = encoder_out.size(1) # (B, maxlen, encoder_dim) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # CTC greedy search - if args.model_type == 'ctc': - ctc_probs = model.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - topk_prob = topk_prob.view(1, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen) - topk_prob = topk_prob.masked_fill_(mask, 0.0) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - hyps = [replace_duplicates_with_blank(hyp) for hyp in hyps] - scores = [prob.tolist() for prob in topk_prob] - timestamps[key] = [hyps[0], scores[0], wav] - - if args.model_type == 'transducer': - hyps = [] - scores = [] - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = 1 - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, - padding, cache) - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - scores.append(torch.max(joint_out_probs).item()) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or \ - per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - hyps.append(model.blank) - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - timestamps[key] = [hyps, scores, wav] - - # 3. 
Analyze latency - with open(args.alignment, 'r') as fin: - aligns = fin.readlines() - not_found, len_unequal, ignored = 0, 0, 0 - datas = [] - for align in aligns: - key, align = align.strip().split(' ', 1) - if key not in timestamps: - not_found += 1 - continue - fa, st = [], [] # force_alignment, streaming_timestamps - text_fa, text_st = "", "" - for i, token in enumerate(align.split()): - if token != '': - text_fa += token - # NOTE(xcsong): W/O subsample - fa.append(i * 10) - # ignore alignment_errors >= 70ms - frames_fa = len(align.split()) - frames_st = len(timestamps[key][0]) * subsampling - if abs(frames_st - frames_fa) >= 7: - ignored += 1 - continue - for i, token_id in enumerate(timestamps[key][0]): - if token_id != 0: - text_st += char_dict[token_id] - # NOTE(xcsong): W subsample - st.append(i * subsampling * 10) - if len(fa) != len(st): - len_unequal += 1 - continue - # datas[i] = [key, text_fa, text_st, list_of_diff, - # FirstTokenDelay, LastTokenDelay, AvgTokenDelay, - # streaming_timestamps, force_alignment] - datas.append([key, text_fa, text_st, - [a - b for a, b in zip(st, fa)], - st[0] - fa[0], st[-1] - fa[-1], - (sum(st) - sum(fa)) / len(st), - timestamps[key], align.split()]) - - logging.info("not found: {}, length unequal: {}, ignored: {}, \ - valid samples: {}".format(not_found, len_unequal, ignored, len(datas))) - - # 4. Plot and print - num_datas = len(datas) - names = ['FirstTokenDelay', 'LastTokenDelay', 'AvgTokenDelay'] - names_index = [4, 5, 6] - parts = ['max', 'P90', 'P75', 'P50', 'P25', 'min'] - parts_index = [num_datas - 1, int(num_datas * 0.90), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - for name, name_idx in zip(names, names_index): - def f(name_idx=name_idx): - return name_idx - datas.sort(key=lambda x: x[f()]) - logging.info("==========================") - for p, i in zip(parts, parts_index): - data = datas[i] - # i.e., LastTokenDelay P90: 270.000 ms (wav_id: BAC009S0902W0144) - logging.info("{} {}: {:.3f} ms (wav_id: {})".format( - name, p, data[f()], datas[i][0])) - - font = fm.FontProperties(fname=args.font) - plt.rcParams['axes.unicode_minus'] = False - # we will have 2 sub-plots (force-align + streaming timestamps) - # plus one wav-plot - fig, axes = plt.subplots(figsize=(60, 60), nrows=3, ncols=1) - for j in range(2): - if j == 0: - # subplot-0: streaming_timestamps - plt_prefix = args.tag + "_" + name + "_" + p - x = np.arange(len(data[7][0])) * subsampling - hyps, scores = data[7][0], data[7][1] - else: - # subplot-1: force_alignments - plt_prefix = "force_alignment" - x = np.arange(len(data[8])) - hyps = [symbol_table[d] for d in data[8]] - scores = [0.0] * len(data[8]) - axes[j].set_title(plt_prefix, fontsize=30) - for frame, token, prob in zip(x, hyps, scores): - if char_dict[token] != '': - axes[j].bar( - frame, np.exp(prob), - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].text( - frame, np.exp(prob), - '{} {:.3f} {}'.format( - char_dict[token], np.exp(prob), frame), - fontdict=dict(fontsize=24), - fontproperties=font, - ) - else: - axes[j].bar( - frame, 0.01, - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].tick_params(labelsize=25) - - # subplot-2: wav - # wav, hardcode sample_rate to 16000 - samples, sr = librosa.load(data[7][2], sr=16000) - time = np.arange(0, len(samples)) * (1.0 / sr) - axes[-1].plot(time, samples) - - # i.e., RESULT_DIR/LTD_P90_120ms_BAC009S0768W0342.pdf - plt.savefig(args.result_dir + "/" + name + "_" + - p + "_" + str(data[f()]) 
+ "ms" + "_" + data[0] + ".pdf") - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/make_raw_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/make_raw_list.py deleted file mode 100644 index 2f84f015542bb38da027b8ea61e8638f873cec33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/make_raw_list.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('output_file', help='output list file') - args = parser.parse_args() - - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - if args.segments is not None: - segments_table = {} - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - with open(args.text_file, 'r', encoding='utf8') as fin, \ - open(args.output_file, 'w', encoding='utf8') as fout: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if args.segments is None: - assert key in wav_table - wav = wav_table[key] - line = dict(key=key, wav=wav, txt=txt) - else: - assert key in segments_table - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - line = dict(key=key, wav=wav, txt=txt, start=start, end=end) - json_line = json.dumps(line, ensure_ascii=False) - fout.write(json_line + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/make_shard_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/make_shard_list.py deleted file mode 100644 index 1f7d82829808c9cc181bbc5e0f60cccef8795bae..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/make_shard_list.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import io -import logging -import os -import tarfile -import time -import multiprocessing - -import torch -import torchaudio -import torchaudio.backend.sox_io_backend as sox - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def write_tar_file(data_list, - no_segments, - tar_file, - resample=16000, - index=0, - total=1): - logging.info('Processing {} {}/{}'.format(tar_file, index, total)) - read_time = 0.0 - save_time = 0.0 - write_time = 0.0 - with tarfile.open(tar_file, "w") as tar: - prev_wav = None - for item in data_list: - if no_segments: - key, txt, wav = item - else: - key, txt, wav, start, end = item - - suffix = wav.split('.')[-1] - assert suffix in AUDIO_FORMAT_SETS - if no_segments: - ts = time.time() - with open(wav, 'rb') as fin: - data = fin.read() - read_time += (time.time() - ts) - else: - if wav != prev_wav: - ts = time.time() - waveforms, sample_rate = sox.load(wav, normalize=False) - read_time += (time.time() - ts) - prev_wav = wav - start = int(start * sample_rate) - end = int(end * sample_rate) - audio = waveforms[:1, start:end] - - # resample - if sample_rate != resample: - if not audio.is_floating_point(): - # normalize the audio before resample - # because resample can't process int audio - audio = audio / (1 << 15) - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - audio = (audio * (1 << 15)).short() - else: - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - - ts = time.time() - f = io.BytesIO() - sox.save(f, audio, resample, format="wav", bits_per_sample=16) - # Save to wav for segments file - suffix = "wav" - f.seek(0) - data = f.read() - save_time += (time.time() - ts) - - assert isinstance(txt, str) - ts = time.time() - txt_file = key + '.txt' - txt = txt.encode('utf8') - txt_data = io.BytesIO(txt) - txt_info = tarfile.TarInfo(txt_file) - txt_info.size = len(txt) - tar.addfile(txt_info, txt_data) - - wav_file = key + '.' 
+ suffix - wav_data = io.BytesIO(data) - wav_info = tarfile.TarInfo(wav_file) - wav_info.size = len(data) - tar.addfile(wav_info, wav_data) - write_time += (time.time() - ts) - logging.info('read {} save {} write {}'.format(read_time, save_time, - write_time)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--num_utts_per_shard', - type=int, - default=1000, - help='num utts per shard') - parser.add_argument('--num_threads', - type=int, - default=1, - help='num threads for make shards') - parser.add_argument('--prefix', - default='shards', - help='prefix of shards tar file') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('--resample', - type=int, - default=16000, - help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('shards_dir', help='output shards dir') - parser.add_argument('shards_list', help='output shards list file') - args = parser.parse_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - torch.set_num_threads(1) - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - no_segments = True - segments_table = {} - if args.segments is not None: - no_segments = False - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - data = [] - with open(args.text_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if no_segments: - assert key in wav_table - wav = wav_table[key] - data.append((key, txt, wav)) - else: - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - data.append((key, txt, wav, start, end)) - - num = args.num_utts_per_shard - chunks = [data[i:i + num] for i in range(0, len(data), num)] - os.makedirs(args.shards_dir, exist_ok=True) - - # Using thread pool to speedup - pool = multiprocessing.Pool(processes=args.num_threads) - shards_list = [] - tasks_list = [] - num_chunks = len(chunks) - for i, chunk in enumerate(chunks): - tar_file = os.path.join(args.shards_dir, - '{}_{:09d}.tar'.format(args.prefix, i)) - shards_list.append(tar_file) - pool.apply_async( - write_tar_file, - (chunk, no_segments, tar_file, args.resample, i, num_chunks)) - - pool.close() - pool.join() - - with open(args.shards_list, 'w', encoding='utf8') as fout: - for name in shards_list: - fout.write(name + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/merge_scp2txt.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/merge_scp2txt.py deleted file mode 100644 index 51f1c42f272f0fd9fec0a7d69ee860d2f1eb6158..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/merge_scp2txt.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -from distutils.util import strtobool -from io import open -import logging -import sys - -PY2 = sys.version_info[0] == 2 -sys.stdin = codecs.getreader('utf-8')(sys.stdin if PY2 
else sys.stdin.buffer) -sys.stdout = codecs.getwriter('utf-8')( - sys.stdout if PY2 else sys.stdout.buffer) - - -# Special types: -def shape(x): - """Change str to List[int] - - >>> shape('3,5') - [3, 5] - >>> shape(' [3, 5] ') - [3, 5] - - """ - - # x: ' [3, 5] ' -> '3, 5' - x = x.strip() - if x[0] == '[': - x = x[1:] - if x[-1] == ']': - x = x[:-1] - - return list(map(int, x.split(','))) - - -def get_parser(): - parser = argparse.ArgumentParser( - description='Given each file paths with such format as ' - '::. type> can be omitted and the default ' - 'is "str". e.g. {} ' - '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' - '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' - '--output-scps text:data/text shape:data/utt2text_shape:shape ' - '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--input-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the inputs') - parser.add_argument('--output-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the outputs') - parser.add_argument('--scps', - type=str, - nargs='+', - default=[], - help='The files except for the input and outputs') - parser.add_argument('--verbose', - '-V', - default=1, - type=int, - help='Verbose option') - parser.add_argument('--allow-one-column', - type=strtobool, - default=False, - help='Allow one column in input scp files. ' - 'In this case, the value will be empty string.') - parser.add_argument('--out', - '-O', - type=str, - help='The output filename. ' - 'If omitted, then output to sys.stdout') - return parser - - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - args.scps = [args.scps] - - # logging info - logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - if args.verbose > 0: - logging.basicConfig(level=logging.INFO, format=logfmt) - else: - logging.basicConfig(level=logging.WARN, format=logfmt) - - inputs = {} - assert (len(args.input_scps) == 1) - for f in args.input_scps[0]: - arr = f.strip().split(':') - inputs[arr[0]] = arr[1] - assert ('feat' in inputs) - assert ('shape' in inputs) - - outputs = {} - assert (len(args.output_scps) == 1) - for f in args.output_scps[0]: - arr = f.strip().split(':') - outputs[arr[0]] = arr[1] - assert ('shape' in outputs) - assert ('text' in outputs) - assert ('token' in outputs) - assert ('tokenid' in outputs) - - files = [ - inputs['feat'], inputs['shape'], outputs['text'], outputs['token'], - outputs['tokenid'], outputs['shape'] - ] - fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape'] - fids = [open(f, 'r', encoding='utf-8') for f in files] - - if args.out is None: - out = sys.stdout - else: - out = open(args.out, 'w', encoding='utf-8') - done = False - while not done: - for i, fid in enumerate(fids): - line = fid.readline() - if line == '': - done = True - break - arr = line.strip().split() - content = ' '.join(arr[1:]) - if i == 0: - out.write('utt:{}'.format(arr[0])) - out.write('\t') - out.write('{}:{}'.format(fields[i], content)) - out.write('\n') - - for f in fids: - f.close() - if args.out is not None: - out.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/onnx2horizonbin.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/onnx2horizonbin.py deleted file mode 100644 index 
a94b647fb19d1446d4bc506c399c85677dddde9f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/onnx2horizonbin.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. specific decoding method: ctc_greedy_search -""" - -import argparse -import copy -import logging -import os -import sys -import random -import torch -import yaml -import numpy as np - -from torch.utils.data import DataLoader - -from wenet.utils.common import remove_duplicates_and_blank -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import to_numpy -from wenet.bin.export_onnx_bpu import export_encoder, export_ctc - - -try: - import hbdk # noqa: F401 - import horizon_nn # noqa: F401 - from horizon_tc_ui import HB_ONNXRuntime -except ImportError: - print('Please install hbdk,horizon_nn,horizon_tc_ui !') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def save_data(tensor, dirs, prefix): - if tensor.requires_grad: - data = tensor.detach().numpy().astype(np.float32) - else: - data = tensor.numpy().astype(np.float32) - os.makedirs(dirs, exist_ok=True) - data.tofile(dirs + "/" + prefix + ".bin") - - -def make_calibration_data(enc, args, conf): - conf['shuffle'] = True - logger.info(conf) - dataset = Dataset( - "shard", args.cali_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - cal_data_dir = os.path.join(args.output_dir, 
'cal_data_dir') - for batch_idx, batch in enumerate(dataloader): - if batch_idx >= args.max_samples: - break - if batch_idx % 100 == 0: - logger.info("processed {} samples.".format(batch_idx)) - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - - # Feed forward overlap input step by step - random_high = (num_frames - context) // stride - num_rand = random.randint(0, random_high) - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - if i == num_rand: - save_data(chunk, "{}/chunk".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_cache, "{}/att_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(cnn_cache, "{}/cnn_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_mask, "{}/att_mask".format(cal_data_dir), - prefix + "." + str(i)) - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - # NOTE(xcsong): It's fast to calibrate ctc.onnx, - # so it's okay to save all chunks - save_data(y, "{}/hidden".format(cal_data_dir), - prefix + "." 
+ str(i)) - - -def check_wer(enc, ctc, args, conf): - conf['shuffle'] = False - dataset = Dataset( - "shard", args.wer_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - char_dict = {v: k for k, v in args.symbol_table.items()} - eos = len(char_dict) - 1 - - enc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_encoder/encoder_quantized_model.onnx") - ctc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_ctc/ctc_quantized_model.onnx") - torch_file = open(args.output_dir + "/torch_text", 'w', encoding="utf-8") - onnx_file = open(args.output_dir + "/onnx_text", 'w', encoding="utf-8") - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - for batch_idx, batch in enumerate(dataloader): - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - - # Feed forward overlap input step by step - torch_out, onnx_out = [], [] - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - # Torch model - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - torch_out.append(ctc.forward(y).transpose(1, 3).squeeze(2)) - # Quantized onnx model - ort_inputs = { - 'chunk': to_numpy(chunk), 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': to_numpy(att_mask)} - ort_outs = enc_session.run_feature( - enc_session.output_names, ort_inputs, input_offset=0) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_y = ctc_session.run_feature( - ctc_session.output_names, {'hidden': ort_outs[0]}, input_offset=0) - onnx_out.append(torch.from_numpy( - np.squeeze(onnx_y[0].transpose(0, 3, 2, 1), axis=2))) - - def post_process(list_out, file_obj, keys): - probs = torch.cat(list_out, dim=1) - maxlen = probs.size(1) - topk_prob, topk_index = probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - hyps = 
[hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - for i, key in enumerate(keys): - content = '' - for w in hyps[i]: - if w == eos: - break - content += char_dict[w] - file_obj.write('{} {}\n'.format(key, content)) - return key, content - - if len(torch_out) > 0 and len(onnx_out) > 0: - key, content = post_process(torch_out, torch_file, keys) - logger.info('torch: {} {}'.format(key, content)) - key, content = post_process(onnx_out, onnx_file, keys) - logger.info('onnx : {} {}'.format(key, content)) - torch_file.close() - onnx_file.close() - - -def generate_config(enc_session, ctc_session, args): - template = """ -# 模型参数组 -model_parameters: - # 原始Onnx浮点模型文件 - onnx_model: '{}' - # 转换的目标AI芯片架构 - march: 'bernoulli2' - # 模型转换输出的用于上板执行的模型文件的名称前缀 - output_model_file_prefix: '{}' - # 模型转换输出的结果的存放目录 - working_dir: '{}' - # 指定转换后混合异构模型是否保留输出各层的中间结果的能力 - layer_out_dump: False - # 转换过程中日志生成级别 - log_level: 'debug' -# 输入信息参数组 -input_parameters: - # 原始浮点模型的输入节点名称 - input_name: '{}' - # 原始浮点模型的输入数据格式(数量/顺序与input_name一致) - input_type_train: '{}' - # 原始浮点模型的输入数据排布(数量/顺序与input_name一致) - input_layout_train: '{}' - # 原始浮点模型的输入数据尺寸 - input_shape: '{}' - # 网络实际执行时,输入给网络的batch_size 默认值为1 - # input_batch: 1 - # 在模型中添加的输入数据预处理方法 - norm_type: '{}' - # 预处理方法的图像减去的均值; 如果是通道均值,value之间必须用空格分隔 - # mean_value: '' - # 预处理方法的图像缩放比例,如果是通道缩放比例,value之间必须用空格分隔 - # scale_value: '' - # 转换后混合异构模型需要适配的输入数据格式(数量/顺序与input_name一致) - input_type_rt: '{}' - # 输入数据格式的特殊制式 - input_space_and_range: '' - # 转换后混合异构模型需要适配的输入数据排布(数量/顺序与input_name一致) - input_layout_rt: '{}' -# 校准参数组 -calibration_parameters: - # 模型校准使用的标定样本的存放目录 - cal_data_dir: '{}' - # 开启图片校准样本自动处理(skimage read resize到输入节点尺寸) - preprocess_on: False - # 校准使用的算法类型 - calibration_type: '{}' - # max 校准方式的参数 - max_percentile: 1.0 - # 强制指定OP在CPU上运行 - run_on_cpu: '{}' - # 强制指定OP在BPU上运行 - run_on_bpu: '{}' -# 编译参数组 -compiler_parameters: - # 编译策略选择 - compile_mode: 'latency' - # 是否打开编译的debug信息 - debug: False - # 模型运行核心数 - core_num: 1 - # 模型编译的优化等级选择 - optimize_level: 'O3' -""" - output_dir = os.path.realpath(args.output_dir) - cal_data_dir = os.path.join(output_dir, 'cal_data_dir') - os.makedirs(cal_data_dir, exist_ok=True) - enc_dic = enc_session.get_modelmeta().custom_metadata_map - enc_onnx_path = os.path.join(output_dir, 'encoder.onnx') - enc_log_path = os.path.join(output_dir, 'hb_makertbin_output_encoder') - enc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in enc_dic['input_name'].split(';')]) - ctc_dic = ctc_session.get_modelmeta().custom_metadata_map - ctc_onnx_path = os.path.join(output_dir, 'ctc.onnx') - ctc_log_path = os.path.join(output_dir, 'hb_makertbin_output_ctc') - ctc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in ctc_dic['input_name'].split(';')]) - enc_config = template.format( - enc_onnx_path, "encoder", enc_log_path, - enc_dic['input_name'], enc_dic['input_type'], - enc_dic['input_layout_train'], enc_dic['input_shape'], - enc_dic['norm_type'], enc_dic['input_type'], enc_dic['input_layout_rt'], - enc_cal_data, args.calibration_type, args.extra_ops_run_on_cpu, "") - ctc_config = template.format( - ctc_onnx_path, "ctc", ctc_log_path, - ctc_dic['input_name'], ctc_dic['input_type'], - ctc_dic['input_layout_train'], ctc_dic['input_shape'], - ctc_dic['norm_type'], ctc_dic['input_type'], ctc_dic['input_layout_rt'], - ctc_cal_data, "default", "", "") - with open(output_dir + "/config_encoder.yaml", "w") as enc_yaml: - enc_yaml.write(enc_config) - with open(output_dir + 
"/config_ctc.yaml", "w") as ctc_yaml: - ctc_yaml.write(ctc_config) - - -def get_args(): - parser = argparse.ArgumentParser(description='convert onnx to horizon .bin') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - parser.add_argument('--dict', type=str, required=True, help='dict file') - parser.add_argument('--max_samples', type=int, required=True, - help='maximum samples') - parser.add_argument('--cali_datalist', type=str, default=None, - help='make calibration data') - parser.add_argument('--wer_datalist', type=str, default=None, - help='check wer') - parser.add_argument('--wer_text', type=str, default=None, - help='check wer') - parser.add_argument('--bpe_model', default=None, type=str, - help='bpe model for english part') - parser.add_argument('--ln_run_on_bpu', action='store_true', - help='layernorm running on bpu') - parser.add_argument('--extra_ops_run_on_cpu', type=str, default=None, - help='extra operations running on cpu.') - parser.add_argument('--calibration_type', type=str, default='default', - help='kl / max / default.') - return parser - - -if __name__ == '__main__': - random.seed(777) - parser = get_args() - args = parser.parse_args() - # NOTE(xcsong): X3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(conf) - load_checkpoint(model, args.checkpoint) - model.eval() - - symbol_table = read_symbol_table(args.dict) - args.symbol_table = symbol_table - args.feature_size = conf['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - logger.info("Stage-1: Export onnx") - enc, enc_session = export_encoder(model, args) - ctc, ctc_session = export_ctc(model, args) - - conf = copy.deepcopy(conf['dataset_conf']) - conf['filter_conf']['max_length'] = 102400 - conf['filter_conf']['min_length'] = 0 - conf['filter_conf']['token_max_length'] = 102400 - conf['filter_conf']['token_min_length'] = 0 - conf['filter_conf']['max_output_input_ratio'] = 102400 - conf['filter_conf']['min_output_input_ratio'] = 0 - conf['speed_perturb'] = False - conf['spec_aug'] = False - conf['spec_sub'] = False - conf['spec_trim'] = False - conf['shuffle'] = False - conf['sort'] = False - if 'fbank_conf' in conf: - conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in conf: - conf['mfcc_conf']['dither'] = 0.0 - conf['batch_conf']['batch_type'] = "static" - conf['batch_conf']['batch_size'] = 1 - - if args.cali_datalist is not None: - logger.info("Stage-2: Generate config") - # FIXME(xcsong): Remove hard code - logger.info("torch version: {}".format(torch.__version__)) - if int(torch.__version__[:4].split('.')[1]) >= 13: - args.extra_ops_run_on_cpu = "/Split;" + \ - "/encoders.0/self_attn/Split;/encoders.1/self_attn/Split;" + \ - 
"/encoders.2/self_attn/Split;/encoders.3/self_attn/Split;" + \ - "/encoders.4/self_attn/Split;/encoders.5/self_attn/Split;" + \ - "/encoders.6/self_attn/Split;/encoders.7/self_attn/Split;" + \ - "/encoders.8/self_attn/Split;/encoders.9/self_attn/Split;" + \ - "/encoders.10/self_attn/Split;/encoders.11/self_attn/Split;" + \ - "/encoders.0/self_attn/Mul;/encoders.1/self_attn/Mul;" + \ - "/encoders.2/self_attn/Mul;/encoders.3/self_attn/Mul;" + \ - "/encoders.4/self_attn/Mul;/encoders.5/self_attn/Mul;" + \ - "/encoders.6/self_attn/Mul;/encoders.7/self_attn/Mul;" + \ - "/encoders.8/self_attn/Mul;/encoders.9/self_attn/Mul;" + \ - "/encoders.10/self_attn/Mul;/encoders.11/self_attn/Mul;" - else: - args.extra_ops_run_on_cpu = "Split_17;Split_67;Split_209;" + \ - "Split_351;Split_493;Split_635;Split_777;Split_919;Split_1061;" + \ - "Split_1203;Split_1345;Split_1487;Split_1629;" + \ - "Mul_72;Mul_214;Mul_356;Mul_498;Mul_640;Mul_782;" + \ - "Mul_924;Mul_1066;Mul_1208;Mul_1350;Mul_1492;Mul_1634;" - generate_config(enc_session, ctc_session, args) - - logger.info("Stage-3: Make calibration data") - make_calibration_data(enc, args, conf) - - output_dir = os.path.realpath(args.output_dir) - logger.info("Stage-4: Make ctc.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_ctc".format(output_dir) + - " && cd hb_makertbin_log_ctc &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_ctc.yaml") - ) - logger.info("Stage-5: Make encoder.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_encoder ".format(output_dir) + - " && cd hb_makertbin_log_encoder &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_encoder.yaml") - ) - - if args.wer_datalist is not None: - logger.info("Stage-6: Check wer between torch model and quantized onnx") - assert args.wer_text is not None - check_wer(enc, ctc, args, conf) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/torch_text", - args.output_dir + "/torch_wer") - ) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/onnx_text", - args.output_dir + "/onnx_wer") - ) - os.system("tail {} {}".format( - args.output_dir + "/torch_wer", args.output_dir + "/onnx_wer")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/parse_options.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/parse_options.sh deleted file mode 100644 index 34476fdb37a4b14d5fe6e0edbebe97e760d2be5a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- - -# Parse command-line options. -# To be sourced by another script (as in ". parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/perturb_data_dir_speed.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/perturb_data_dir_speed.sh deleted file mode 100644 index 901a4882e6481ae269067b0fe7175dba62c4db9e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/perturb_data_dir_speed.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# 2020 @kamo-naoyuki -# This file was copied from Kaldi and -# I deleted parts related to wav duration -# because we shouldn't use kaldi's command here -# and we don't need the files actually. 
- -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 2014 Tom Ko -# 2018 Emotech LTD (author: Pawel Swietojanski) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# wav.scp -# spk2utt -# utt2spk -# text -# -# It generates the files which are used for perturbing the speed of the original data. - -export LC_ALL=C -set -euo pipefail - -if [[ $# != 3 ]]; then - echo "Usage: perturb_data_dir_speed.sh " - echo "e.g.:" - echo " $0 0.9 data/train_si284 data/train_si284p" - exit 1 -fi - -factor=$1 -srcdir=$2 -destdir=$3 -label="sp" -spk_prefix="${label}${factor}-" -utt_prefix="${label}${factor}-" - -#check is sox on the path - -! command -v sox &>/dev/null && echo "sox: command not found" && exit 1; - -if [[ ! -f ${srcdir}/utt2spk ]]; then - echo "$0: no such file ${srcdir}/utt2spk" - exit 1; -fi - -if [[ ${destdir} == "${srcdir}" ]]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -mkdir -p "${destdir}" - -<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map" -<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map" -<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map" -if [[ ! -f ${srcdir}/utt2uniq ]]; then - <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq" -else - <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq" -fi - - -<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \ - utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk - -utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt - -if [[ -f ${srcdir}/segments ]]; then - - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \ - utils/apply_map.pl -f 2 "${destdir}"/reco_map | \ - awk -v factor="${factor}" \ - '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \ - >"${destdir}"/segments - - utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - if [[ -f ${srcdir}/reco2file_and_channel ]]; then - utils/apply_map.pl -f 1 "${destdir}"/reco_map \ - <"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel - fi - -else # no segments->wav indexed by utterance. 
- if [[ -f ${srcdir}/wav.scp ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - fi -fi - -if [[ -f ${srcdir}/text ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text -fi -if [[ -f ${srcdir}/spk2gender ]]; then - utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender -fi -if [[ -f ${srcdir}/utt2lang ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang -fi - -rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null -echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}" - -utils/validate_data_dir.sh --no-feats --no-text "${destdir}" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/reduce_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/reduce_data_dir.sh deleted file mode 100644 index 16194dcc7309a646041181a698c53cd4f46e618b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/reduce_data_dir.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# koried, 10/29/2012 - -# Reduce a data set based on a list of turn-ids - -help_message="usage: $0 srcdir turnlist destdir" - -if [ $1 == "--help" ]; then - echo "${help_message}" - exit 0; -fi - -if [ $# != 3 ]; then - echo "${help_message}" - exit 1; -fi - -srcdir=$1 -reclist=$2 -destdir=$3 - -if [ ! -f ${srcdir}/utt2spk ]; then -echo "$0: no such file $srcdir/utt2spk" -exit 1; -fi - -function do_filtering { -# assumes the utt2spk and spk2utt files already exist. - [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp - [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text - [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames - [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender - [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp - if [ -f ${srcdir}/segments ]; then - utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments - awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. 
- [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/reco2file_and_channel ] && \ - utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel - - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm - rm ${destdir}/reco - fi - srcutts=$(wc -l < ${srcdir}/utt2spk) - destutts=$(wc -l < ${destdir}/utt2spk) - echo "Reduced #utt from $srcutts to $destutts" -} - -mkdir -p ${destdir} - -# filter the utt2spk based on the set of recordings -utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk - -utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt -do_filtering; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/remove_longshortdata.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/remove_longshortdata.py deleted file mode 100644 index 7e92f8a424d2d717acf6fc1db5503f79ba38a898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/remove_longshortdata.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='remove too long or too short data in format.data') - parser.add_argument('--data_file', - type=str, - help='input format data') - parser.add_argument('--output_data_file', - type=str, - help='output format data') - parser.add_argument('--min_input_len', type=float, - default=0, - help='minimum input seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--max_input_len', type=float, - default=20, - help='maximum output seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--min_output_len', type=float, - default=0, help='minimum input seq length, in modeling units') - parser.add_argument('--max_output_len', type=float, - default=500, - help='maximum output seq length, in modeling units') - parser.add_argument('--min_output_input_ratio', type=float, default=0.05, - help='minimum output seq length/output seq length ratio') - parser.add_argument('--max_output_input_ratio', type=float, default=10, - help='maximum output seq length/output seq length ratio') - args = parser.parse_args() - - data_file = args.data_file - output_data_file = args.output_data_file - min_input_len = args.min_input_len - max_input_len = args.max_input_len - min_output_len = args.min_output_len - max_output_len = args.max_output_len - min_output_input_ratio = args.min_output_input_ratio - max_output_input_ratio = args.max_output_input_ratio - - with open(data_file, 'r') as f, open(output_data_file, 'w') as fout: - for l in f: - l = l.strip() - if l: - items = l.strip().split('\t') - token_shape = items[6] - feature_shape = items[2] - feat_len = float(feature_shape.split(':')[1].split(',')[0]) - token_len = float(token_shape.split(':')[1].split(',')[0]) - condition = [feat_len > min_input_len, - feat_len < max_input_len, - token_len > min_output_len, - token_len < max_output_len, - token_len / feat_len > min_output_input_ratio, - token_len / feat_len < max_output_input_ratio, - ] - if all(condition): - fout.write('{}\n'.format(l)) - continue diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/segment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/segment.py deleted file mode 100644 index a1a7f93a05fbaf42ca09c26c0e5be6a7185f0d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/segment.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2021 Mobvoi Inc. (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='generate segmented wav.scp') - parser.add_argument('--segments', required=True, help='segments file') - parser.add_argument('--input', - required=True, - help='origin wav.scp that not segmented') - parser.add_argument('--output', - required=True, - help='output segmented wav.scp') - wav_dic = {} - args = parser.parse_args() - ori_wav = args.input - segment_file = args.segments - wav_scp = args.output - with open(ori_wav, 'r') as ori: - for l in ori: - item = l.strip().split() - wav_dic[item[0]] = item[1] - with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement: - for l in sgement: - item = l.strip().split() - if item[1] in wav_dic: - item[1] = wav_dic[item[1]] - f.write("{} {},{},{}\n".format(item[0], item[1], item[2], item[3])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/setup_anaconda.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/setup_anaconda.sh deleted file mode 100644 index f53ace9cc4c19994fc79d01e85d70f49d40d673f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/setup_anaconda.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env bash -# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet) -set -euo pipefail - -if [ -z "${PS1:-}" ]; then - PS1=__dummy__ -fi -CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - -if [ $# -gt 4 ]; then - echo "Usage: $0 [output] [conda-env-name] [python-version>]" - exit 1; -elif [ $# -eq 3 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="$3" -elif [ $# -eq 2 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="" -elif [ $# -eq 1 ]; then - output_dir="$1" - name="" - PYTHON_VERSION="" -elif [ $# -eq 0 ]; then - output_dir=venv - name="" - PYTHON_VERSION="" -fi - -if [ -e activate_python.sh ]; then - echo "Warning: activate_python.sh already exists. It will be overwritten" -fi - -if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then - if [ ! -e miniconda.sh ]; then - wget --tries=3 "${CONDA_URL}" -O miniconda.sh - fi - - bash miniconda.sh -b -p "${output_dir}" -fi - -# shellcheck disable=SC1090 -source "${output_dir}/etc/profile.d/conda.sh" -conda deactivate - -# If the env already exists, skip recreation -if [ -n "${name}" ] && ! 
conda activate ${name}; then - conda create -yn "${name}" -fi -conda activate ${name} - -if [ -n "${PYTHON_VERSION}" ]; then - conda install -y conda "python=${PYTHON_VERSION}" -else - conda install -y conda -fi - -conda install -y pip setuptools - -cat << EOF > activate_python.sh -#!/usr/bin/env bash -# THIS FILE IS GENERATED BY tools/setup_anaconda.sh -if [ -z "\${PS1:-}" ]; then - PS1=__dummy__ -fi -. $(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name} -EOF diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/sph2wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/sph2wav.sh deleted file mode 100644 index a8f0749e3be2ee69b5831da6699c303510ecbed4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/sph2wav.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -# convert sph scp to segmented wav scp -nj=1 -. tools/parse_options.sh || exit 1; - -inscp=$1 -segments=$2 -outscp=$3 -data=$(dirname ${inscp}) -if [ $# -eq 4 ]; then - logdir=$4 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe -[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; -sox=`which sox` -[ ! 
-x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2); - printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \ - sort > $data/wav_ori.scp || exit 1; - -tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp -sed -i 's/ /,/g' $data/wav_segments.scp -sed -i 's/#/ /g' $data/wav_segments.scp - -rm -f $logdir/wav_*.slice -rm -f $logdir/*.log -split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - mkdir -p ${data}/wavs/${name} - cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \ - -v logdir=$logdir -v name=$name '{ - during=$4-$3 - cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during; - system(cmd) - printf("%s %s/%s.wav\n", $1, data, $1); - }' | \ - sort > ${data}/wavs_${name}.scp || exit 1; -} & -done -wait -cat ${data}/wavs_*.scp > $outscp -rm ${data}/wavs_*.scp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/spk2utt_to_utt2spk.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/spk2utt_to_utt2spk.pl deleted file mode 100644 index 19fb89d501146e360912863d847d6eabb0194511..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/spm_decode b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/spm_decode deleted file mode 100644 index 882b4f966013d7708460f8d41696583ae59f8fa9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/spm_decode +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for decoding") - parser.add_argument("--input", default=None, help="input file to decode") - parser.add_argument("--input_format", choices=["piece", "id"], default="piece") - args = parser.parse_args() - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.input_format == "piece": - def decode(l): - return "".join(sp.DecodePieces(l)) - elif args.input_format == "id": - def decode(l): - return "".join(sp.DecodeIds(l)) - else: - raise NotImplementedError - - def tok2int(tok): - # remap reference-side (represented as <>) to 0 - return int(tok) if tok != "<>" else 0 - - if args.input is None: - h = sys.stdin - else: - h = open(args.input, "r", encoding="utf-8") - for line in h: - print(decode(line.split())) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/spm_encode b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/spm_encode deleted file mode 100644 index 4dd2e1004f9fe393c2d34b43bade881b84a31b1f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/spm_encode +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import contextlib -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for encoding") - parser.add_argument("--inputs", nargs="+", default=['-'], - help="input files to filter/encode") - parser.add_argument("--outputs", nargs="+", default=['-'], - help="path to save encoded outputs") - parser.add_argument("--output_format", choices=["piece", "id"], default="piece") - parser.add_argument("--min-len", type=int, metavar="N", - help="filter sentence pairs with fewer than N tokens") - parser.add_argument("--max-len", type=int, metavar="N", - help="filter sentence pairs with more than N tokens") - args = parser.parse_args() - - assert len(args.inputs) == len(args.outputs), \ - "number of input and output paths should match" - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.output_format == "piece": - def encode(l): - return sp.EncodeAsPieces(l) - elif args.output_format == "id": - def encode(l): - return list(map(str, sp.EncodeAsIds(l))) - else: - raise NotImplementedError - - if args.min_len is not None or args.max_len is not None: - def valid(line): - return ( - (args.min_len is None or len(line) >= args.min_len) and - (args.max_len is None or len(line) <= args.max_len) - ) - else: - def valid(lines): - return True - - with contextlib.ExitStack() as stack: - inputs = [ - stack.enter_context(open(input, "r", encoding="utf-8")) - if input != "-" else sys.stdin - for input in args.inputs - ] - outputs = [ - stack.enter_context(open(output, "w", 
encoding="utf-8")) - if output != "-" else sys.stdout - for output in args.outputs - ] - - stats = { - "num_empty": 0, - "num_filtered": 0, - } - - def encode_line(line): - line = line.strip() - if len(line) > 0: - line = encode(line) - if valid(line): - return line - else: - stats["num_filtered"] += 1 - else: - stats["num_empty"] += 1 - return None - - for i, lines in enumerate(zip(*inputs), start=1): - enc_lines = list(map(encode_line, lines)) - if not any(enc_line is None for enc_line in enc_lines): - for enc_line, output_h in zip(enc_lines, outputs): - print(" ".join(enc_line), file=output_h) - if i % 10000 == 0: - print("processed {} lines".format(i), file=sys.stderr) - - print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) - print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/spm_train b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/spm_train deleted file mode 100644 index 0b247aee0dc5fcaa7b6cf66d89602e896619c9bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/spm_train +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE -import sys - -import sentencepiece as spm - - -if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/subset_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/subset_data_dir.sh deleted file mode 100644 index c35bee62d8710facb8c42a9171ed3caf0171450f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/subset_data_dir.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2010-2011 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - - -# This script operates on a data directory, such as in data/train/. -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data -# for what these directories contain. - -# This script creates a subset of that data, consisting of some specified -# number of utterances. (The selected utterances are distributed evenly -# throughout the file, by the program ./subset_scp.pl). - -# There are six options, none compatible with any other. - -# If you give the --per-spk option, it will attempt to select the supplied -# number of utterances for each speaker (typically you would supply a much -# smaller number in this case). - -# If you give the --speakers option, it selects a subset of n randomly -# selected speakers. - -# If you give the --shortest option, it will give you the n shortest utterances. - -# If you give the --first option, it will just give you the n first utterances. - -# If you give the --last option, it will just give you the n last utterances. - -# If you give the --spk-list or --utt-list option, it reads the -# speakers/utterances to keep from /" (note, -# in this case there is no positional parameter; see usage message.) 
- - -shortest=false -perspk=false -speakers=false -first_opt= -spk_list= -utt_list= - -expect_args=3 -case $1 in - --first|--last) first_opt=$1; shift ;; - --per-spk) perspk=true; shift ;; - --shortest) shortest=true; shift ;; - --speakers) speakers=true; shift ;; - --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; - --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; - --*) echo "$0: invalid option '$1'"; exit 1 -esac - -if [ $# != $expect_args ]; then - echo "Usage:" - echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] " - echo " subset_data_dir.sh [--spk-list ] " - echo " subset_data_dir.sh [--utt-list ] " - echo "By default, randomly selects utterances from the data directory." - echo "With --speakers, randomly selects enough speakers that we have utterances" - echo "With --per-spk, selects utterances per speaker, if available." - echo "With --first, selects the first utterances" - echo "With --last, selects the last utterances" - echo "With --shortest, selects the shortest utterances." - echo "With --spk-list, reads the speakers to keep from " - echo "With --utt-list, reads the utterances to keep from " - exit 1; -fi - -srcdir=$1 -if [[ $spk_list || $utt_list ]]; then - numutt= - destdir=$2 -else - numutt=$2 - destdir=$3 -fi - -export LC_ALL=C - -if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" - exit 1 -fi - -if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then - echo "$0: cannot subset to more utterances than you originally had." - exit 1 -fi - -if $shortest && [ ! -f $srcdir/feats.scp ]; then - echo "$0: you selected --shortest but no feats.scp exist." - exit 1 -fi - -mkdir -p $destdir || exit 1 - -if [[ $spk_list ]]; then - tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; -elif [[ $utt_list ]]; then - tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; -elif $speakers; then - tools/shuffle_list.pl < $srcdir/spk2utt | - awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | - sort > $destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -elif $perspk; then - awk '{ n='$numutt'; printf("%s ",$1); - skip=1; while(n*(skip+1) <= NF-1) { skip++; } - for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); } - printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -else - if $shortest; then - # Select $numutt shortest utterances. - . ./path.sh - feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; - sort -n -k2 $destdir/tmp.len | - awk '{print $1}' | - head -$numutt >$destdir/tmp.uttlist - tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk - rm $destdir/tmp.uttlist $destdir/tmp.len - else - # Select $numutt random utterances. - tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; - fi - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt -fi - -# Perform filtering. utt2spk and spk2utt files already exist by this point. -# Filter by utterance. 
-[ -f $srcdir/feats.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp -[ -f $srcdir/vad.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp -[ -f $srcdir/utt2lang ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang -[ -f $srcdir/utt2dur ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur -[ -f $srcdir/utt2num_frames ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames -[ -f $srcdir/utt2uniq ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq -[ -f $srcdir/wav.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp -[ -f $srcdir/utt2warp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp -[ -f $srcdir/text ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - -# Filter by speaker. -[ -f $srcdir/spk2warp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp -[ -f $srcdir/spk2gender ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender -[ -f $srcdir/cmvn.scp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - -# Filter by recording-id. -if [ -f $srcdir/segments ]; then - tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - # Recording-ids are in segments. - awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco - # The next line overrides the command above for wav.scp, which would be incorrect. - #[ -f $srcdir/wav.scp ] && - # tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp -else - # No segments; recording-ids are in wav.scp. - awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco -fi - -[ -f $srcdir/reco2file_and_channel ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel -[ -f $srcdir/reco2dur ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur - -# Filter the STM file for proper sclite scoring. -# Copy over the comments from STM file. -[ -f $srcdir/stm ] && - (grep "^;;" $srcdir/stm - tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm - -rm $destdir/reco - -# Copy frame_shift if present. -[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir - -srcutts=$(wc -l <$srcdir/utt2spk) -destutts=$(wc -l <$destdir/utt2spk) -echo "$0: reducing #utt from $srcutts to $destutts" -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/subset_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/subset_scp.pl deleted file mode 100644 index 11fddc09a0f4e5fad8e5d63cf65e7e5e627e4af6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/subset_scp.pl +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This program selects a subset of N elements in the scp. - -# By default, it selects them evenly from throughout the scp, in order to avoid -# selecting too many from the same speaker. It prints them on the standard -# output. -# With the option --first, it just selects the N first utterances. -# With the option --last, it just selects the N last utterances. - -# Last modified by JHU & HKUST @2013 - - -$quiet = 0; -$first = 0; -$last = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--quiet") { - shift; - $quiet = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--first") { - shift; - $first = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--last") { - shift; - $last = 1; -} - -if(@ARGV < 2 ) { - die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . - " --quiet causes it to not die if N < num lines in scp.\n" . - " --first and --last make it equivalent to head or tail.\n" . - "See also: filter_scp.pl\n"; -} - -$N = shift @ARGV; -if($N == 0) { - die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; -} -$inscp = shift @ARGV; -open(I, "<$inscp") || die "Opening input scp file $inscp"; - -@F = (); -while() { - push @F, $_; -} -$numlines = @F; -if($N > $numlines) { - if ($quiet) { - $N = $numlines; - } else { - die "You requested from subset_scp.pl more elements than available: $N > $numlines"; - } -} - -sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if ($num_needed > $diff) { - die "select_n: code error"; - } - if ($diff == 1 ) { - if ($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); - } -} - -if ( ! $first && ! $last) { - if ($N > 0) { - select_n(0, $numlines, $N); - } -} else { - if ($first) { # --first option: same as head. - for ($n = 0; $n < $N; $n++) { - print $F[$n]; - } - } else { # --last option: same as tail. - for ($n = @F - $N; $n < @F; $n++) { - print $F[$n]; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/sym2int.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/sym2int.pl deleted file mode 100644 index cec097b6bdaefb5c3452e31fa334f0a7530b9a72..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/sym2int.pl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-$ignore_oov = 0;
-
-for($x = 0; $x < 2; $x++) {
-  if ($ARGV[0] eq "--map-oov") {
-    shift @ARGV;
-    $map_oov = shift @ARGV;
-    if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") {
-      # disallow '-f', the empty string and anything ending in words.txt as the
-      # OOV symbol because these are likely command-line errors.
-      die "the --map-oov option requires an argument";
-    }
-  }
-  if ($ARGV[0] eq "-f") {
-    shift @ARGV;
-    $field_spec = shift @ARGV;
-    if ($field_spec =~ m/^\d+$/) {
-      $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
-    }
-    if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10)
-      if ($1 ne "") {
-        $field_begin = $1 - 1;  # Change to zero-based indexing.
-      }
-      if ($2 ne "") {
-        $field_end = $2 - 1;  # Change to zero-based indexing.
-      }
-    }
-    if (!defined $field_begin && !defined $field_end) {
-      die "Bad argument to -f option: $field_spec";
-    }
-  }
-}
-
-$symtab = shift @ARGV;
-if (!defined $symtab) {
-  print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" .
-    "options: [--map-oov <oov-symbol> ] [-f <field-range> ]\n" .
-    "note: <field-range> can look like 4-5, or 4-, or 5-, or 1.\n";
-}
-open(F, "<$symtab") || die "Error opening symbol table file $symtab";
-while(<F>) {
-  @A = split(" ", $_);
-  @A == 2 || die "bad line in symbol table file: $_";
-  $sym2int{$A[0]} = $A[1] + 0;
-}
-
-if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up
-  if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; }
-  $map_oov = $sym2int{$map_oov};
-}
-
-$num_warning = 0;
-$max_warning = 20;
-
-while (<>) {
-  @A = split(" ", $_);
-  @B = ();
-  for ($n = 0; $n < @A; $n++) {
-    $a = $A[$n];
-    if ( (!defined $field_begin || $n >= $field_begin)
-         && (!defined $field_end || $n <= $field_end)) {
-      $i = $sym2int{$a};
-      if (!defined ($i)) {
-        if (defined $map_oov) {
-          if ($num_warning++ < $max_warning) {
-            print STDERR "sym2int.pl: replacing $a with $map_oov\n";
-            if ($num_warning == $max_warning) {
-              print STDERR "sym2int.pl: not warning for OOVs any more times\n";
-            }
-          }
-          $i = $map_oov;
-        } else {
-          $pos = $n+1;
-          die "sym2int.pl: undefined symbol $a (in position $pos)\n";
-        }
-      }
-      $a = $i;
-    }
-    push @B, $a;
-  }
-  print join(" ", @B);
-  print "\n";
-}
-if ($num_warning > 0) {
-  print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n";
-}
-
-exit(0);
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/text2token.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/text2token.py
deleted file mode 100644
index 4f4dcc901d436650695f0b80e0cf99e1e99269ee..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/text2token.py
+++ /dev/null
@@ -1,171 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
-# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan)
-# Copyright 2021 Mobvoi Inc. All Rights Reserved.
(Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -import re -import sys - -is_python2 = sys.version_info[0] == 2 - - -def exist_or_not(i, match_pos): - start_pos = None - end_pos = None - for pos in match_pos: - if pos[0] <= i < pos[1]: - start_pos = pos[0] - end_pos = pos[1] - break - - return start_pos, end_pos - -def seg_char(sent): - pattern = re.compile(r'([\u4e00-\u9fa5])') - chars = pattern.split(sent) - chars = [w for w in chars if len(w.strip()) > 0] - return chars - -def get_parser(): - parser = argparse.ArgumentParser( - description='convert raw text to tokenized text', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--nchar', - '-n', - default=1, - type=int, - help='number of characters to split, i.e., \ - aabb -> a a b b with -n 1 and aa bb with -n 2') - parser.add_argument('--skip-ncols', - '-s', - default=0, - type=int, - help='skip first n columns') - parser.add_argument('--space', - default='', - type=str, - help='space symbol') - parser.add_argument('--bpe-model', - '-m', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--non-lang-syms', - '-l', - default=None, - type=str, - help='list of non-linguistic symobles,' - ' e.g., etc.') - parser.add_argument('text', - type=str, - default=False, - nargs='?', - help='input text') - parser.add_argument('--trans_type', - '-t', - type=str, - default="char", - choices=["char", "phn", "cn_char_en_bpe"], - help="""Transcript type. char/phn. e.g., for TIMIT - FADG0_SI1279 - - If trans_type is char, read from - SI1279.WRD file -> "bricks are an alternative" - Else if trans_type is phn, - read from SI1279.PHN file -> - "sil b r ih sil k s aa r er n aa l - sil t er n ih sil t ih v sil" """) - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - rs = [] - if args.non_lang_syms is not None: - with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f: - nls = [x.rstrip() for x in f.readlines()] - rs = [re.compile(re.escape(x)) for x in nls] - - if args.bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(args.bpe_model) - - if args.text: - f = codecs.open(args.text, encoding="utf-8") - else: - f = codecs.getreader("utf-8")( - sys.stdin if is_python2 else sys.stdin.buffer) - - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer) - line = f.readline() - n = args.nchar - while line: - x = line.split() - print(' '.join(x[:args.skip_ncols]), end=" ") - a = ' '.join(x[args.skip_ncols:]) - - # get all matched positions - match_pos = [] - for r in rs: - i = 0 - while i >= 0: - m = r.search(a, i) - if m: - match_pos.append([m.start(), m.end()]) - i = m.end() - else: - break - - if len(match_pos) > 0: - chars = [] - i = 0 - while i < len(a): - start_pos, end_pos = exist_or_not(i, match_pos) - if start_pos is not None: - chars.append(a[start_pos:end_pos]) - i = end_pos - else: - chars.append(a[i]) - i += 1 - a = chars - - if (args.trans_type == "phn"): - a = a.split(" ") - elif args.trans_type == "cn_char_en_bpe": - b = seg_char(a) - a = [] - for j in b: - # we use "▁" to instead of blanks among english words - # warning: here is "▁", not "_" - for l in j.strip().split("▁"): - if not l.encode('UTF-8').isalpha(): - a.append(l) - else: - for k in sp.encode_as_pieces(l): - a.append(k) - else: - a = [a[j:j + n] for j in range(0, 
len(a), n)] - - a_flat = [] - for z in a: - a_flat.append("".join(z)) - - a_chars = [z.replace(' ', args.space) for z in a_flat] - if (args.trans_type == "phn"): - a_chars = [z.replace("sil", args.space) for z in a_chars] - print(' '.join(a_chars)) - line = f.readline() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/utt2spk_to_spk2utt.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/utt2spk_to_spk2utt.pl deleted file mode 100644 index 5086699ff85fdcb8667bb9ab054700c53e35fd0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. - -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/validate_data_dir.sh deleted file mode 100644 index f4b4cbe1410111555d56380078e3d55381e7155a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/validate_data_dir.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/bin/bash - -cmd="$@" - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." - echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! 
-d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -if [ -f $data/images.scp ]; then - cmd=${cmd/--no-wav/} # remove --no-wav if supplied - image/validate_data_dir.sh $cmd - exit $? -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1; - ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - tools/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." 
- exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! 
cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! 
cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/validate_dict_dir.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/validate_dict_dir.pl deleted file mode 100644 index 819fca7f03caff91f3f24f0b69876a0bfc0abbe9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/validate_dict_dir.pl +++ /dev/null @@ -1,531 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. -# Copyright 2012 Guoguo Chen -# 2015 Daniel Povey -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for 'dict' directories (e.g. data/local/dict) - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line($.) 
$raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n"; - if ($has_invalid_whitespaces) { - print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n"; - return 0; - } else { - print "--> text contains only allowed whitespaces\n"; - } - } else { - print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n"; - } - return 1; -} - - -if(@ARGV != 1) { - die "Usage: validate_dict_dir.pl \n" . - "e.g.: validate_dict_dir.pl data/local/dict\n"; -} - -$dict = shift @ARGV; -$dict =~ s:/$::; - -$exit = 0; -$success = 1; # this is re-set each time we read a file. - -sub set_to_fail { $exit = 1; $success = 0; } - -# Checking silence_phones.txt ------------------------------- -print "Checking $dict/silence_phones.txt ...\n"; -if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} -$idx = 1; -%silence = (); -$crlf = 1; - -print "--> reading $dict/silence_phones.txt\n"; -check_allowed_whitespace(\*S) || set_to_fail(); -while() { - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; - } - foreach(0 .. 
@col-1) { - my $p = $col[$_]; - if($silence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; - } else { - $silence{$p} = 1; - } - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. - if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(S); -$success == 0 || print "--> $dict/silence_phones.txt is OK\n"; -print "\n"; - -# Checking optional_silence.txt ------------------------------- -print "Checking $dict/optional_silence.txt ...\n"; -if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;} -if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;} -$idx = 1; -$success = 1; -$crlf = 1; -print "--> reading $dict/optional_silence.txt\n"; -check_allowed_whitespace(\*OS) or exit 1; -while() { - chomp; - my @col = split(" ", $_); - if ($idx > 1 or @col > 1) { - set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; - } elsif (!$silence{$col[0]}) { - set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - $idx ++; -} -close(OS); -$success == 0 || print "--> $dict/optional_silence.txt is OK\n"; -print "\n"; - -# Checking nonsilence_phones.txt ------------------------------- -print "Checking $dict/nonsilence_phones.txt ...\n"; -if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;} -$idx = 1; -%nonsilence = (); -$success = 1; -$crlf = 1; -print "--> reading $dict/nonsilence_phones.txt\n"; -check_allowed_whitespace(\*NS) or set_to_fail(); -while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($nonsilence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; - } else { - $nonsilence{$p} = 1; - } - # phones that start with the pound sign/hash may be mistaken for - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. 
-    if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>"){
-      set_to_fail();
-      print "--> ERROR: phone \"$p\" has disallowed written form\n";
-    }
-  }
-  $idx ++;
-}
-close(NS);
-$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n";
-print "\n";
-
-# Checking disjoint -------------------------------
-sub intersect {
-  my ($a, $b) = @_;
-  @itset = ();
-  %itset = ();
-  foreach(keys %$a) {
-    if(exists $b->{$_} and !$itset{$_}) {
-      push(@itset, $_);
-      $itset{$_} = 1;
-    }
-  }
-  return @itset;
-}
-
-print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n";
-@itset = intersect(\%silence, \%nonsilence);
-if(@itset == 0) {print "--> disjoint property is OK.\n";}
-else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";}
-print "\n";
-
-
-sub check_lexicon {
-  my ($lex, $num_prob_cols, $num_skipped_cols) = @_;
-  print "Checking $lex\n";
-  !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail();
-  my %seen_line = {};
-  $idx = 1; $success = 1; $crlf = 1;
-  print "--> reading $lex\n";
-  check_allowed_whitespace(\*L) or set_to_fail();
-  while (<L>) {
-    if ($crlf == 1 && m/\r/) {
-      print "--> ERROR: $lex contains Carriage Return (^M) characters.\n";
-      set_to_fail();
-      $crlf = 0;
-    }
-    if (defined $seen_line{$_}) {
-      print "--> ERROR: line '$_' of $lex is repeated\n";
-      set_to_fail();
-    }
-    $seen_line{$_} = 1;
-    if (! s/\n$//) {
-      print "--> ERROR: last line '$_' of $lex does not end in newline.\n";
-      set_to_fail();
-    }
-    my @col = split(" ", $_);
-    $word = shift @col;
-    if (!defined $word) {
-      print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail();
-    }
-    if ($word eq "<s>" || $word eq "</s>" || $word eq "<eps>" || $word eq "#0") {
-      print "--> ERROR: lexicon.txt contains forbidden word $word\n";
-      set_to_fail();
-    }
-    for ($n = 0; $n < $num_prob_cols; $n++) {
-      $prob = shift @col;
-      if (!($prob > 0.0 && $prob <= 1.0)) {
-        print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n";
-        set_to_fail();
-      }
-    }
-    for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; }
-    if (@col == 0) {
-      print "--> ERROR: lexicon.txt contains word $word with empty ";
-      print "pronunciation.\n";
-      set_to_fail();
-    }
-    foreach (0 .. @col-1) {
-      if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
-        print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt ";
-        print "(line $idx)\n";
-        set_to_fail();
-      }
-    }
-    $idx ++;
-  }
-  close(L);
-  $success == 0 || print "--> $lex is OK\n";
-  print "\n";
-}
-
-if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); }
-if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); }
-if (-f "$dict/lexiconp_silprob.txt") {
-  # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also
-  # exist.
- check_lexicon("$dict/lexiconp_silprob.txt", 2, 2); - if (-f "$dict/silprob.txt") { - !open(SP, "<$dict/silprob.txt") && - print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail(); - $crlf = 1; - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - chomp; my @col = split; - @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail(); - if ($col[0] eq "" || $col[0] eq "overall") { - if (!($col[1] > 0.0 && $col[1] <= 1.0)) { - set_to_fail(); - print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n"; - } - } elsif ($col[0] eq "_s" || $col[0] eq "_n") { - if ($col[1] <= 0.0) { - set_to_fail(); - print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n"; - } - } else { - print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n"; - set_to_fail(); - } - } - close(SP); - } else { - set_to_fail(); - print "--> ERROR: expecting $dict/silprob.txt to exist\n"; - } -} - -if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) { - print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n"; - set_to_fail(); -} - -sub check_lexicon_pair { - my ($lex1, $num_prob_cols1, $num_skipped_cols1, - $lex2, $num_prob_cols2, $num_skipped_cols2) = @_; - # We have checked individual lexicons already. - open(L1, "<$lex1"); open(L2, "<$lex2"); - print "Checking lexicon pair $lex1 and $lex2\n"; - my $line_num = 0; - while() { - $line_num++; - @A = split; - $line_B = ; - if (!defined $line_B) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); last; - } - @B = split(" ", $line_B); - # Check if the word matches. - if ($A[0] ne $B[0]) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - shift @A; shift @B; - for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; } - for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; } - # Check if the pronunciation matches - if (join(" ", @A) ne join(" ", @B)) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - } - $line_B = ; - if (defined $line_B && $exit == 0) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); - } - $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n"; -} - -# If more than one lexicon exist, we have to check if they correspond to each -# other. It could be that the user overwrote one and we need to regenerate the -# other, but we do not know which is which. -if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") { - check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0); -} -if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") { - check_lexicon_pair("$dict/lexiconp.txt", 1, 0, - "$dict/lexiconp_silprob.txt", 2, 2); -} - -# Checking extra_questions.txt ------------------------------- -%distinguished = (); # Keep track of all phone-pairs including nonsilence that - # are distinguished (split apart) by extra_questions.txt, - # as $distinguished{$p1,$p2} = 1. This will be used to - # make sure that we don't have pairs of phones on the same - # line in nonsilence_phones.txt that can never be - # distinguished from each other by questions. 
(If any two - # phones appear on the same line in nonsilence_phones.txt, - # they share a tree root, and since the automatic - # question-building treats all phones that appear on the - # same line of nonsilence_phones.txt as being in the same - # group, we can never distinguish them without resorting to - # questions in extra_questions.txt. -print "Checking $dict/extra_questions.txt ...\n"; -if (-s "$dict/extra_questions.txt") { - if (!open(EX, "<$dict/extra_questions.txt")) { - set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n"; - } - $idx = 1; - $success = 1; - $crlf = 1; - print "--> reading $dict/extra_questions.txt\n"; - check_allowed_whitespace(\*EX) or set_to_fail(); - while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n"; - } - foreach (0 .. @col-1) { - if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n"; - } - $idx ++; - } - %col_hash = (); - foreach $p (@col) { $col_hash{$p} = 1; } - foreach $p1 (@col) { - # Update %distinguished hash. - foreach $p2 (keys %nonsilence) { - if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not - # in this question (and in nonsilence - # phones)... mark p1,p2 as being split apart - $distinguished{$p1,$p2} = 1; - $distinguished{$p2,$p1} = 1; - } - } - } - } - close(EX); - $success == 0 || print "--> $dict/extra_questions.txt is OK\n"; -} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";} - -if (-f "$dict/nonterminals.txt") { - open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt"; - my %nonterminals = (); - my $line_number = 1; - while () { - chop; - my @line = split(" ", $_); - if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) { - print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1; - } - $nonterminals{$line[0]} = 1; - $line_number++; - } - print "--> $dict/nonterminals.txt is OK\n"; -} - - -# check nonsilence_phones.txt again for phone-pairs that are never -# distnguishable. (note: this situation is normal and expected for silence -# phones, so we don't check it.) -if(!open(NS, "<$dict/nonsilence_phones.txt")) { - print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1; -} - -$num_warn_nosplit = 0; -$num_warn_nosplit_limit = 10; -while() { - my @col = split(" ", $_); - foreach $p1 (@col) { - foreach $p2 (@col) { - if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) { - set_to_fail(); - if ($num_warn_nosplit <= $num_warn_nosplit_limit) { - print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n"; - } - if ($num_warn_nosplit == $num_warn_nosplit_limit) { - print "... Not warning any more times about this issue.\n"; - } - if ($num_warn_nosplit == 0) { - print " (note: we started checking for this only recently. 
You can still build a system but\n"; - print " phones $p1 and $p2 will be acoustically indistinguishable).\n"; - } - $num_warn_nosplit++; - } - } - } -} - - -if ($exit == 1) { - print "--> ERROR validating dictionary directory $dict (see detailed error "; - print "messages above)\n\n"; - exit 1; -} else { - print "--> SUCCESS [validating dictionary directory $dict]\n\n"; -} - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/validate_text.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/validate_text.pl deleted file mode 100644 index 7f75cf12f20f6e22948682e8e726e628a72dac69..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/validate_text.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env perl -# -#=============================================================================== -# Copyright 2017 Johns Hopkins University (author: Yenda Trmal ) -# Johns Hopkins University (author: Daniel Povey) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# validation script for data//text -# to be called (preferably) from utils/validate_data_dir.sh -use strict; -use warnings; -use utf8; -use Fcntl qw< SEEK_SET >; - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). 
-sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl \n" . 
- "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/wav2dur.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/wav2dur.py deleted file mode 100644 index 1bcc1b693458b66c0e341e5d6b375cc81e6db8b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/wav2dur.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -import torchaudio -torchaudio.set_audio_backend("sox_io") - -scp = sys.argv[1] -dur_scp = sys.argv[2] - -with open(scp, 'r') as f, open(dur_scp, 'w') as fout: - cnt = 0 - total_duration = 0 - for l in f: - items = l.strip().split() - wav_id = items[0] - fname = items[1] - cnt += 1 - waveform, rate = torchaudio.load(fname) - frames = len(waveform[0]) - duration = frames / float(rate) - total_duration += duration - fout.write('{} {}\n'.format(wav_id, duration)) - print('process {} utts'.format(cnt)) - print('total {} s'.format(total_duration)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/wav_to_duration.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/wav_to_duration.sh deleted file mode 100644 index 51b055c633ac809b6b8d702925dc47875973403d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/wav_to_duration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# split the wav scp, calculate duration and merge -nj=4 -. tools/parse_options.sh || exit 1; - -inscp=$1 -outscp=$2 -data=$(dirname ${inscp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -rm -f $logdir/wav_*.slice -rm -f $logdir/wav_*.shape -split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log -} & -done -wait -cat $logdir/wav_*.shape > $outscp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/websocket/performance-ws.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/websocket/performance-ws.py deleted file mode 100644 index af77dea06bb41297b674b5b6dbfd0266bcff5d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/tools/websocket/performance-ws.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -# coding:utf-8 - -# Copyright (c) 2022 SDCI Co. Ltd (author: veelion) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import json -import time -import asyncio -import argparse -import websockets -import soundfile as sf -import statistics - - -WS_START = json.dumps({ - 'signal': 'start', - 'nbest': 1, - 'continuous_decoding': False, -}) -WS_END = json.dumps({ - 'signal': 'end' -}) - - -async def ws_rec(data, ws_uri): - begin = time.time() - conn = await websockets.connect(ws_uri, ping_timeout=200) - # step 1: send start - await conn.send(WS_START) - ret = await conn.recv() - # step 2: send audio data - await conn.send(data) - # step 3: send end - await conn.send(WS_END) - # step 4: receive result - texts = [] - while 1: - ret = await conn.recv() - ret = json.loads(ret) - if ret['type'] == 'final_result': - nbest = json.loads(ret['nbest']) - text = nbest[0]['sentence'] - texts.append(text) - elif ret['type'] == 'speech_end': - break - # step 5: close - try: - await conn.close() - except Exception as e: - # this except has no effect, just log as debug - # it seems the server does not send close info, maybe - print(e) - time_cost = time.time() - begin - return { - 'text': ''.join(texts), - 'time': time_cost, - } - - -def get_args(): - parser = argparse.ArgumentParser(description='') - parser.add_argument( - '-u', '--ws_uri', required=True, - help="websocket_server_main's uri, e.g. ws://127.0.0.1:10086") - parser.add_argument( - '-w', '--wav_scp', required=True, - help='path to wav_scp_file') - parser.add_argument( - '-t', '--trans', required=True, - help='path to trans_text_file of wavs') - parser.add_argument( - '-s', '--save_to', required=True, - help='path to save transcription') - parser.add_argument( - '-n', '--num_concurrence', type=int, required=True, - help='num of concurrence for query') - args = parser.parse_args() - return args - - -def print_result(info): - length = max([len(k) for k in info]) - for k, v in info.items(): - print(f'\t{k: >{length}} : {v}') - - -async def main(args): - wav_scp = [] - total_duration = 0 - with open(args.wav_scp) as f: - for line in f: - zz = line.strip().split() - assert len(zz) == 2 - data, sr = sf.read(zz[1], dtype='int16') - assert sr == 16000 - duration = (len(data)) / 16000 - total_duration += duration - wav_scp.append((zz[0], data.tobytes())) - print(f'{len(wav_scp) = }, {total_duration = }') - - tasks = [] - failed = 0 - texts = [] - request_times = [] - begin = time.time() - for i, (_uttid, data) in enumerate(wav_scp): - task = asyncio.create_task(ws_rec(data, args.ws_uri)) - tasks.append((_uttid, task)) - if len(tasks) < args.num_concurrence: - continue - print((f'{i=}, start {args.num_concurrence} ' - f'queries @ {time.strftime("%m-%d %H:%M:%S")}')) - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - tasks = [] - print(f'\tdone @ {time.strftime("%m-%d %H:%M:%S")}') - if tasks: - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - request_time = time.time() - begin - rtf = request_time / total_duration - print('For all concurrence:') - print_result({ - 'failed': failed, - 'total_duration': total_duration, - 'request_time': request_time, - 'RTF': rtf, - }) - print('For one request:') - print_result({ - 'mean': statistics.mean(request_times), - 'median': statistics.median(request_times), - 'max_time': max(request_times), - 'min_time': min(request_times), - }) - with 
open(args.save_to, 'w', encoding='utf8') as fsave: - fsave.write(''.join(texts)) - # caculate CER - cmd = (f'python ../compute-wer.py --char=1 --v=1 ' - f'{args.trans} {args.save_to} > ' - f'{args.save_to}-test-{args.num_concurrence}.cer.txt') - print(cmd) - os.system(cmd) - print('done') - - -if __name__ == '__main__': - args = get_args() - asyncio.run(main(args)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/alignment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/alignment.py deleted file mode 100644 index 071691183e5af227e60fe06e4f8d4bf0f33b7f71..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/alignment.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Di Wu) -# 2022 Tinnove Inc (authors: Wei Ren) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader -from textgrid import TextGrid, IntervalTier - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.ctc_util import forced_align -from wenet.utils.common import get_subsample -from wenet.utils.init_model import init_model - - -def generator_textgrid(maxtime, lines, output): - # Download Praat: https://www.fon.hum.uva.nl/praat/ - interval = maxtime / (len(lines) + 1) - margin = 0.0001 - - tg = TextGrid(maxTime=maxtime) - linetier = IntervalTier(name="line", maxTime=maxtime) - - i = 0 - for l in lines: - s, e, w = l.split() - linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w) - - tg.append(linetier) - print("successfully generator {}".format(output)) - tg.write(output) - - -def get_frames_timestamp(alignment): - # convert alignment to a praat format, which is a doing phonetics - # by computer and helps analyzing alignment - timestamp = [] - # get frames level duration for each token - start = 0 - end = 0 - while end < len(alignment): - while end < len(alignment) and alignment[end] == 0: - end += 1 - if end == len(alignment): - timestamp[-1] += alignment[start:] - break - end += 1 - while end < len(alignment) and alignment[end - 1] == alignment[end]: - end += 1 - timestamp.append(alignment[start:end]) - start = end - return timestamp - - -def get_labformat(timestamp, subsample): - begin = 0 - duration = 0 - labformat = [] - for idx, t in enumerate(timestamp): - # 25ms frame_length,10ms hop_length, 1/subsample - subsample = get_subsample(configs) - # time duration - duration = len(t) * 0.01 * subsample - if idx < len(timestamp) - 1: - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[t[-1]])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[t[-1]])) - else: - non_blank = 
0 - for i in t: - if i != 0: - token = i - break - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[token])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[token])) - begin = begin + duration - return labformat - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='use ctc to generate alignment') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--input_file', required=True, help='format data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--non_lang_syms', - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--result_file', - required=True, - help='alignment result file') - parser.add_argument('--batch_size', type=int, default=1, help='batch size') - parser.add_argument('--gen_praat', - action='store_true', - help='convert alignment to a praat format') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - - args = parser.parse_args() - print(args) - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.batch_size > 1: - logging.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - # Load dict - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 - - symbol_table = read_symbol_table(args.dict) - - # Init dataset and data loader - ali_conf = copy.deepcopy(configs['dataset_conf']) - - ali_conf['filter_conf']['max_length'] = 102400 - ali_conf['filter_conf']['min_length'] = 0 - ali_conf['filter_conf']['token_max_length'] = 102400 - ali_conf['filter_conf']['token_min_length'] = 0 - ali_conf['filter_conf']['max_output_input_ratio'] = 102400 - ali_conf['filter_conf']['min_output_input_ratio'] = 0 - ali_conf['speed_perturb'] = False - ali_conf['spec_aug'] = False - ali_conf['shuffle'] = False - ali_conf['sort'] = False - ali_conf['fbank_conf']['dither'] = 0.0 - ali_conf['batch_conf']['batch_type'] = "static" - ali_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - ali_dataset = Dataset(args.data_type, - args.input_file, - symbol_table, - ali_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w', - encoding='utf-8') as fout: - for batch_idx, batch in enumerate(ali_data_loader): - print("#" * 80) - key, feat, target, feats_length, target_length = batch - print(key) - - feat = feat.to(device) - target = target.to(device) - feats_length = feats_length.to(device) - 
target_length = target_length.to(device) - # Let's assume B = batch_size and N = beam_size - # 1. Encoder - encoder_out, encoder_mask = model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - # print(ctc_probs.size(1)) - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = forced_align(ctc_probs, target) - print(alignment) - fout.write('{} {}\n'.format(key[0], alignment)) - - if args.gen_praat: - timestamp = get_frames_timestamp(alignment) - print(timestamp) - subsample = get_subsample(configs) - labformat = get_labformat(timestamp, subsample) - - lab_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".lab") - with open(lab_path, 'w', encoding='utf-8') as f: - f.writelines(labformat) - - textgrid_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".TextGrid") - generator_textgrid(maxtime=(len(alignment) + 1) * 0.01 * - subsample, - lines=labformat, - output=textgrid_path) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/average_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/average_model.py deleted file mode 100644 index 01efa64b4b458bc931a86a9a304b9f330ce4aaa2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/average_model.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import os -import argparse -import glob - -import yaml -import numpy as np -import torch - - -def get_args(): - parser = argparse.ArgumentParser(description='average model') - parser.add_argument('--dst_model', required=True, help='averaged model') - parser.add_argument('--src_path', - required=True, - help='src model path for average') - parser.add_argument('--val_best', - action="store_true", - help='averaged model') - parser.add_argument('--num', - default=5, - type=int, - help='nums for averaged model') - parser.add_argument('--min_epoch', - default=0, - type=int, - help='min epoch used for averaging model') - parser.add_argument('--max_epoch', - default=65536, - type=int, - help='max epoch used for averaging model') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - checkpoints = [] - val_scores = [] - if args.val_best: - yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) - for y in yamls: - with open(y, 'r') as f: - dic_yaml = yaml.load(f, Loader=yaml.FullLoader) - loss = dic_yaml['cv_loss'] - epoch = dic_yaml['epoch'] - if epoch >= args.min_epoch and epoch <= args.max_epoch: - val_scores += [[epoch, loss]] - val_scores = np.array(val_scores) - sort_idx = np.argsort(val_scores[:, -1]) - sorted_val_scores = val_scores[sort_idx][::1] - print("best val scores = " + str(sorted_val_scores[:args.num, 1])) - print("selected epochs = " + - str(sorted_val_scores[:args.num, 0].astype(np.int64))) - path_list = [ - args.src_path + '/{}.pt'.format(int(epoch)) - for epoch in sorted_val_scores[:args.num, 0] - ] - else: - path_list = glob.glob('{}/[0-9]*.pt'.format(args.src_path)) - path_list = sorted(path_list, key=os.path.getmtime) - path_list = path_list[-args.num:] - print(path_list) - avg = None - num = args.num - assert num == len(path_list) - for path in path_list: - print('Processing {}'.format(path)) - states = torch.load(path, map_location=torch.device('cpu')) - if avg is None: - avg = states - else: - for k in avg.keys(): - avg[k] += states[k] - # average - for k in avg.keys(): - if avg[k] is not None: - # pytorch 1.6 use true_divide instead of /= - avg[k] = torch.true_divide(avg[k], num) - print('Saving to {}'.format(args.dst_model)) - torch.save(avg, args.dst_model) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/export_jit.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/export_jit.py deleted file mode 100644 index b2e5864e8382235c1cc800484ba5031ae22f3bd9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/export_jit.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os - -import torch -import yaml - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_file', default=None, help='output file') - parser.add_argument('--output_quant_file', - default=None, - help='output quantized model file') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - # No need gpu for model export - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - model = init_model(configs) - print(model) - - load_checkpoint(model, args.checkpoint) - # Export jit torch script model - - if args.output_file: - script_model = torch.jit.script(model) - script_model.save(args.output_file) - print('Export model successfully, see {}'.format(args.output_file)) - - # Export quantized jit torch script model - if args.output_quant_file: - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - script_quant_model = torch.jit.script(quantized_model) - script_quant_model.save(args.output_quant_file) - print('Export quantized model successfully, ' - 'see {}'.format(args.output_quant_file)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/export_onnx_bpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/export_onnx_bpu.py deleted file mode 100644 index 6462a69506f10778d08faae5fcf3067ad43d38bd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/export_onnx_bpu.py +++ /dev/null @@ -1,1019 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. 
specific decoding method: ctc_greedy_search -""" - - -from __future__ import print_function - -import os -import sys -import copy -import math -import yaml -import logging -from typing import Tuple - -import torch -import numpy as np - -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import (get_args, to_numpy, - print_input_output_info) - - -try: - import onnx - import onnxruntime -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class BPULayerNorm(torch.nn.Module): - """Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" - def __init__(self, module, chunk_size=8, run_on_bpu=False): - super().__init__() - original = copy.deepcopy(module) - self.hidden = module.weight.size(0) - self.chunk_size = chunk_size - self.run_on_bpu = run_on_bpu - - if self.run_on_bpu: - self.weight = torch.nn.Parameter( - module.weight.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.bias = torch.nn.Parameter( - module.bias.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.negtive = torch.nn.Parameter( - torch.ones((1, self.hidden, 1, chunk_size)) * -1.0) - self.eps = torch.nn.Parameter( - torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps) - self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_1.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_2.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - else: - self.norm = module - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.hidden) - orig_out = module(random_data) - new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) - np.testing.assert_allclose( - to_numpy(orig_out), to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.run_on_bpu: - u = self.mean_conv_1(x) # (1, h, 1, c) - numerator = x + u * self.negtive # (1, h, 1, c) - s = torch.pow(numerator, 2) # (1, h, 1, c) - s = self.mean_conv_2(s) # (1, h, 1, c) - denominator = torch.sqrt(s + self.eps) # (1, h, 1, c) - x = torch.div(numerator, denominator) # (1, h, 1, c) - x = x * self.weight + self.bias - else: - x = x.squeeze(2).transpose(1, 2).contiguous() - x = self.norm(x) - x = x.transpose(1, 2).contiguous().unsqueeze(2) - return x - - -class BPUIdentity(torch.nn.Module): - """Refactor torch.nn.Identity(). - For inserting BPU node whose input == output. - """ - def __init__(self, channels): - super().__init__() - self.channels = channels - self.identity_conv = torch.nn.Conv2d( - channels, channels, 1, groups=channels, bias=False) - torch.nn.init.dirac_( - self.identity_conv.weight.data, groups=channels) - - self.check_equal() - - def check_equal(self): - random_data = torch.randn(1, self.channels, 1, 10) - result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(random_data), to_numpy(result), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Identity with 4-D dataflow, input == output. 
- Args: - x (torch.Tensor): (batch, in_channel, 1, time) - - Returns: - (torch.Tensor): (batch, in_channel, 1, time). - """ - return self.identity_conv(x) - - -class BPULinear(torch.nn.Module): - """Refactor torch.nn.Linear or pointwise_conv""" - def __init__(self, module, is_pointwise_conv=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.weight.size(1) - self.odim = module.weight.size(0) - self.is_pointwise_conv = is_pointwise_conv - - # Modify weight & bias - self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) - if is_pointwise_conv: - # (odim, idim, kernel=1) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(-1)) - else: - # (odim, idim) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(2).unsqueeze(3)) - self.linear.bias = module.bias - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.idim) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = module(random_data) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = original_result.transpose(1, 2) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Linear with 4-D dataflow. - Args: - x (torch.Tensor): (batch, in_channel, 1, time) - Returns: - (torch.Tensor): (batch, out_channel, 1, time). - """ - return self.linear(x) - - -class BPUGlobalCMVN(torch.nn.Module): - """Refactor wenet/transformer/cmvn.py::GlobalCMVN""" - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - self.norm_var = module.norm_var - - # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1) - self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """CMVN with 4-D dataflow. - Args: - x (torch.Tensor): (batch, 1, mel_dim, time) - Returns: - (torch.Tensor): normalized feature with same shape. - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x - - -class BPUConv2dSubsampling8(torch.nn.Module): - """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 - - NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.right_context = module.right_context - self.subsampling_rate = module.subsampling_rate - assert isinstance(module.pos_enc, NoPositionalEncoding) - - # 1. Modify self.conv - # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim) - # to (1, 1, mel_dim, frames) for more efficient computation. - self.conv = module.conv - for idx in [0, 2, 4]: - self.conv[idx].weight = torch.nn.Parameter( - module.conv[idx].weight.transpose(2, 3) - ) - - # 2. 
Modify self.linear - # NOTE(xcsong): Split final projection to meet the requirment of - # maximum kernel_size (7 for XJ3) - self.linear = torch.nn.ModuleList() - odim = module.linear.weight.size(0) # 512, in this case - freq = module.linear.weight.size(1) // odim # 4608 // 512 == 9 - self.odim, self.freq = odim, freq - weight = module.linear.weight.reshape( - odim, odim, freq, 1) # (odim, odim * freq) -> (odim, odim, freq, 1) - self.split_size = [] - num_split = (freq - 1) // 7 + 1 # XJ3 requires kernel_size <= 7 - slice_begin = 0 - for idx in range(num_split): - kernel_size = min(freq, (idx + 1) * 7) - idx * 7 - conv_ele = torch.nn.Conv2d( - odim, odim, (kernel_size, 1), (kernel_size, 1)) - conv_ele.weight = torch.nn.Parameter( - weight[:, :, slice_begin:slice_begin + kernel_size, :] - ) - conv_ele.bias = torch.nn.Parameter( - torch.zeros_like(conv_ele.bias) - ) - self.linear.append(conv_ele) - self.split_size.append(kernel_size) - slice_begin += kernel_size - self.linear[0].bias = torch.nn.Parameter(module.linear.bias) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 67, 80) - mask = torch.zeros(1, 1, 67) - original_result, _, _ = module(random_data, mask) # (1, 8, 512) - random_data = random_data.transpose(1, 2).unsqueeze(0) # (1, 1, 80, 67) - new_result = self.forward(random_data) # (1, 512, 1, 8) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Subsample x with 4-D dataflow. - Args: - x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), - where time' = time // 8. - """ - x = self.conv(x) # (1, odim, freq, time') - x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) - x = torch.split(x, self.split_size, dim=2) - for idx, (x_part, layer) in enumerate(zip(x, self.linear)): - x_out += layer(x_part) - return x_out - - -class BPUMultiHeadedAttention(torch.nn.Module): - """Refactor wenet/transformer/attention.py::MultiHeadedAttention - - NOTE(xcsong): Only support attention_class == MultiHeadedAttention, - we do not consider RelPositionMultiHeadedAttention currently. - """ - def __init__(self, module, chunk_size, left_chunks): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.d_k = module.d_k - self.h = module.h - n_feat = self.d_k * self.h - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.time = chunk_size * (left_chunks + 1) - self.activation = torch.nn.Softmax(dim=-1) - - # 1. Modify self.linear_x - self.linear_q = BPULinear(module.linear_q) - self.linear_k = BPULinear(module.linear_k) - self.linear_v = BPULinear(module.linear_v) - self.linear_out = BPULinear(module.linear_out) - # 2. 
denom - self.register_buffer( - "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k))) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) - mask = torch.ones((1, self.h, self.chunk_size, self.time), - dtype=torch.bool) - cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, - self.d_k * 2) - original_out, original_cache = module( - random_data, random_data, random_data, - mask[:, 0, :, :], torch.empty(0), cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.reshape(1, self.h, self.d_k * 2, - self.chunk_size * self.left_chunks) - new_out, new_cache = self.forward( - random_data, random_data, random_data, mask, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - - def forward( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - mask: torch.Tensor, cache: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. - - Args: - q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). - k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). - v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). - mask (torch.Tensor): Mask tensor, - (#batch, head, chunk_size, cache_t + chunk_size). - cache (torch.Tensor): Cache tensor - (1, head, d_k * 2, cache_t), - where `cache_t == chunk_size * left_chunks`. - - - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: Cache tensor - (1, head, d_k * 2, cache_t + chunk_size) - where `cache_t == chunk_size * left_chunks` - """ - # 1. Forward QKV - q = self.linear_q(q) # (1, d, 1, c) d == size, c == chunk_size - k = self.linear_k(k) # (1, d, 1, c) - v = self.linear_v(v) # (1, d, 1, c) - q = q.view(1, self.h, self.d_k, self.chunk_size) - k = k.view(1, self.h, self.d_k, self.chunk_size) - v = v.view(1, self.h, self.d_k, self.chunk_size) - q = q.transpose(2, 3) # (batch, head, time1, d_k) - k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) - k = torch.cat((k_cache, k), dim=3) - v = torch.cat((v_cache, v), dim=3) - new_cache = torch.cat((k, v), dim=2) - # 2. (Q^T)K - scores = torch.matmul(q, k) * self.denom # (#b, n_head, time1, time2) - # 3. Forward attention - mask = mask.eq(0) - scores = scores.masked_fill(mask, -float('inf')) - attn = self.activation(scores).masked_fill(mask, 0.0) - attn = attn.transpose(2, 3) - x = torch.matmul(v, attn) - x = x.view(1, self.d_k * self.h, 1, self.chunk_size) - x_out = self.linear_out(x) - return x_out, new_cache - - -class BPUConvolution(torch.nn.Module): - """Refactor wenet/transformer/convolution.py::ConvolutionModule - - NOTE(xcsong): Only suport use_layer_norm == False - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.lorder = module.lorder - self.use_layer_norm = False - self.activation = module.activation - channels = module.pointwise_conv1.weight.size(1) - self.channels = channels - kernel_size = module.depthwise_conv.weight.size(2) - assert module.use_layer_norm is False - - # 1. Modify self.pointwise_conv1 - self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) - - # 2. 
Modify self.depthwise_conv - self.depthwise_conv = torch.nn.Conv2d( - channels, channels, (1, kernel_size), - stride=1, groups=channels) - self.depthwise_conv.weight = torch.nn.Parameter( - module.depthwise_conv.weight.unsqueeze(-2)) - self.depthwise_conv.bias = torch.nn.Parameter( - module.depthwise_conv.bias) - - # 3. Modify self.norm, Only support batchnorm2d - self.norm = torch.nn.BatchNorm2d(channels) - self.norm.training = False - self.norm.num_features = module.norm.num_features - self.norm.eps = module.norm.eps - self.norm.momentum = module.norm.momentum - self.norm.weight = torch.nn.Parameter(module.norm.weight) - self.norm.bias = torch.nn.Parameter(module.norm.bias) - self.norm.running_mean = module.norm.running_mean - self.norm.running_var = module.norm.running_var - - # 4. Modify self.pointwise_conv2 - self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) - - # 5. Identity conv, for running `concat` on BPU - self.identity = BPUIdentity(channels) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.channels) - cache = torch.zeros((1, self.channels, self.lorder)) - original_out, original_cache = module(random_data, cache=cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.unsqueeze(2) - new_out, new_cache = self.forward(random_data, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, 1, cache_t). - Returns: - torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). - torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). - """ - # Concat cache - x = torch.cat((self.identity(cache), self.identity(x)), dim=3) - new_cache = x[:, :, :, -self.lorder:] - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, 1, dim) - x = torch.nn.functional.glu(x, dim=1) # (b, channel, 1, dim) - - # Depthwise Conv - x = self.depthwise_conv(x) - x = self.activation(self.norm(x)) - x = self.pointwise_conv2(x) - return x, new_cache - - -class BPUFFN(torch.nn.Module): - """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.activation = module.activation - - # 1. Modify self.w_x - self.w_1 = BPULinear(module.w_1) - self.w_2 = BPULinear(module.w_2) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.w_1.idim) - original_out = module(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_out = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward function. 
- - Args: - xs: input tensor (B, D, 1, L) - Returns: - output tensor, (B, D, 1, L) - """ - return self.w_2(self.activation(self.w_1(x))) - - -class BPUConformerEncoderLayer(torch.nn.Module): - """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.size = module.size - assert module.normalize_before is True - assert module.concat_after is False - - # 1. Modify submodules - self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) - self.self_attn = BPUMultiHeadedAttention( - module.self_attn, chunk_size, left_chunks) - self.conv_module = BPUConvolution(module.conv_module) - self.feed_forward = BPUFFN(module.feed_forward) - - # 2. Modify norms - self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) - self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, ln_run_on_bpu) - self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, - chunk_size, ln_run_on_bpu) - self.norm_conv = BPULayerNorm(module.norm_conv, - chunk_size, ln_run_on_bpu) - self.norm_final = BPULayerNorm(module.norm_final, - chunk_size, ln_run_on_bpu) - - # 3. 4-D ff_scale - self.register_buffer( - "ff_scale", torch.full((1, self.size, 1, 1), module.ff_scale)) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.self_attn.chunk_size - time2 = self.self_attn.time - h, d_k = self.self_attn.h, self.self_attn.d_k - random_x = torch.randn(1, time1, self.size) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) - original_x, _, original_att_cache, original_cnn_cache = module( - random_x, att_mask[:, 0, :, :], torch.empty(0), - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.transpose(1, 2).unsqueeze(2) - att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.unsqueeze(2) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_mask, att_cache, cnn_cache - ) - np.testing.assert_allclose( - to_numpy(original_att_cache), - to_numpy(new_att_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cnn_cache), - to_numpy(new_cnn_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, att_mask: torch.Tensor, - att_cache: torch.Tensor, cnn_cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, size, 1, chunk_size) - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, d_k * 2, cache_t1), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, 1, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: att_cache tensor, - (1, head, d_k * 2, cache_t1 + chunk_size). - torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2). - """ - # 1. ffn_macaron - residual = x - x = self.norm_ff_macron(x) - x = residual + self.ff_scale * self.feed_forward_macaron(x) - - # 2. 
attention - residual = x - x = self.norm_mha(x) - x_att, new_att_cache = self.self_attn( - x, x, x, att_mask, att_cache) - x = residual + x_att - - # 3. convolution - residual = x - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, cnn_cache) - x = residual + x - - # 4. ffn - residual = x - x = self.norm_ff(x) - x = residual + self.ff_scale * self.feed_forward(x) - - # 5. final post-norm - x = self.norm_final(x) - - return x, new_att_cache, new_cnn_cache - - -class BPUConformerEncoder(torch.nn.Module): - """Refactor wenet/transformer/encoder.py::ConformerEncoder - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - output_size = module.output_size() - self._output_size = module.output_size() - self.after_norm = module.after_norm - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.head = module.encoders[0].self_attn.h - self.layers = len(module.encoders) - - # 1. Modify submodules - self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) - self.embed = BPUConv2dSubsampling8(module.embed) - self.encoders = torch.nn.ModuleList() - for layer in module.encoders: - self.encoders.append(BPUConformerEncoderLayer( - layer, chunk_size, left_chunks, ln_run_on_bpu)) - - # 2. Auxiliary conv - self.identity_cnncache = BPUIdentity(output_size) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.encoders[0].self_attn.chunk_size - time2 = self.encoders[0].self_attn.time - layers = self.layers - h, d_k = self.head, self.encoders[0].self_attn.d_k - decoding_window = (self.chunk_size - 1) * \ - module.embed.subsampling_rate + \ - module.embed.right_context + 1 - lorder = self.encoders[0].conv_module.lorder - random_x = torch.randn(1, decoding_window, 80) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) - orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( - random_x, 0, time2 - time1, att_mask=att_mask[:, 0, :, :], - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.unsqueeze(0) - att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_cache, cnn_cache, att_mask - ) - caches = torch.split(new_att_cache, h, dim=1) - caches = [c.transpose(2, 3) for c in caches] - np.testing.assert_allclose( - to_numpy(orig_att_cache), - to_numpy(torch.cat(caches, dim=0)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_cnn_cache), - to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, xs: torch.Tensor, att_cache: torch.Tensor, - cnn_cache: torch.Tensor, att_mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (1, head * elayers, d_k * 2, cache_t1), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (1, hidden-dim, elayers, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, hidden-dim, 1, chunk_size). - torch.Tensor: new attention cache required for next chunk, with - same shape as the original att_cache. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - """ - # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time) - xs = xs.transpose(2, 3) - xs = self.global_cmvn(xs) - # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size) - xs = self.embed(xs) - - att_cache = torch.split(att_cache, self.head, dim=1) - cnn_cache = self.identity_cnncache(cnn_cache) - cnn_cache = torch.split(cnn_cache, 1, dim=2) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - xs, new_att_cache, new_cnn_cache = layer( - xs, att_mask, att_cache=att_cache[i], cnn_cache=cnn_cache[i]) - r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:]) - r_cnn_cache.append(new_cnn_cache) - r_att_cache = torch.cat(r_att_cache, dim=1) - r_cnn_cache = self.identity_cnncache( - torch.cat(r_cnn_cache, dim=2)) - - xs = xs.squeeze(2).transpose(1, 2).contiguous() - xs = self.after_norm(xs) - # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input. - xs = xs.transpose(1, 2).contiguous().unsqueeze(2) # (B, C, 1, T) - - return (xs, r_att_cache, r_cnn_cache) - - -class BPUCTC(torch.nn.Module): - """Refactor wenet/transformer/ctc.py::CTC - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.ctc_lo.weight.size(1) - num_class = module.ctc_lo.weight.size(0) - - # 1. Modify self.ctc_lo, Split final projection to meet the - # requirment of maximum in/out channels (2048 for XJ3) - self.ctc_lo = torch.nn.ModuleList() - self.split_size = [] - num_split = (num_class - 1) // 2048 + 1 - for idx in range(num_split): - out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 - conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) - self.ctc_lo.append(conv_ele) - self.split_size.append(out_channel) - orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) - orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) - for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): - w = w.unsqueeze(2).unsqueeze(3) - self.ctc_lo[i].weight = torch.nn.Parameter(w) - self.ctc_lo[i].bias = torch.nn.Parameter(b) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 100, self.idim) - original_result = module.ctc_lo(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """frame activations, without softmax. 
- - Args: - Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) - Returns: - torch.Tensor: (B, num_class, 1, chunk_size) - """ - out = [] - for i, layer in enumerate(self.ctc_lo): - out.append(layer(x)) - out = torch.cat(out, dim=1) - return out - - -def export_encoder(asr_model, args): - logger.info("Stage-1: export encoder") - decode_window, mel_dim = args.decoding_window, args.feature_size - encoder = BPUConformerEncoder( - asr_model.encoder, args.chunk_size, args.num_decoding_left_chunks, - args.ln_run_on_bpu) - encoder.eval() - encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx') - - logger.info("Stage-1.1: prepare inputs for encoder") - chunk = torch.randn((1, 1, decode_window, mel_dim)) - required_cache_size = encoder.chunk_size * encoder.left_chunks - kv_time = required_cache_size + encoder.chunk_size - hidden, layers = encoder._output_size, len(encoder.encoders) - head = encoder.encoders[0].self_attn.h - d_k = hidden // head - lorder = encoder.encoders[0].conv_module.lorder - att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) - att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros((1, hidden, layers, lorder)) - inputs = (chunk, att_cache, cnn_cache, att_mask) - logger.info("chunk.size(): {} att_cache.size(): {} " - "cnn_cache.size(): {} att_mask.size(): {}".format( - list(chunk.size()), list(att_cache.size()), - list(cnn_cache.size()), list(att_mask.size()))) - - logger.info("Stage-1.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask" - attributes['output_name'] = "output;r_att_cache;r_cnn_cache" - attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap" - attributes['norm_type'] = \ - "no_preprocess;no_preprocess;no_preprocess;no_preprocess" - attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_shape'] = \ - "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( - chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3), - att_cache.size(0), att_cache.size(1), att_cache.size(2), - att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1), - cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0), - att_mask.size(1), att_mask.size(2), att_mask.size(3) - ) - torch.onnx.export( # NOTE(xcsong): only support opset==11 - encoder, inputs, encoder_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=attributes['input_name'].split(';'), - output_names=attributes['output_name'].split(';'), - dynamic_axes=None, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for k in vars(args): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - logger.info('Export onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - logger.info("Stage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - for i in range(10): - logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" - ", att_mask: {}".format( - i, list(torch_chunk.size()), - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), - list(torch_att_mask.size()))) - torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_output = torch.cat(torch_output, dim=-1) - - onnx_output = [] - onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - logger.info("onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," - " att_mask: {}".format( - i, onnx_chunk.shape, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': onnx_att_mask, - } - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_output = np.concatenate(onnx_output, axis=-1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_encoder, pass!") - return encoder, ort_session - - -def export_ctc(asr_model, args): - logger.info("Stage-2: export ctc") - ctc = BPUCTC(asr_model.ctc).eval() - ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx') - - logger.info("Stage-2.1: prepare inputs for ctc") - hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) - - logger.info("Stage-2.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'], attributes['input_type'] = "hidden", "featuremap" - attributes['norm_type'] = "no_preprocess" - attributes['input_layout_train'] = "NCHW" - attributes['input_layout_rt'] = "NCHW" - attributes['input_shape'] = "{}x{}x{}x{}".format( - hidden.size(0), hidden.size(1), hidden.size(2), hidden.size(3), - ) - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=None, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for k in vars(args): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - logger.info('Export onnx_ctc, done! 
see {}'.format(ctc_outpath)) - - logger.info("Stage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_ctc, pass!") - return ctc, ort_session - - -def export_decoder(asr_model, args): - logger.info("Currently, Decoder is not supported.") - - -if __name__ == '__main__': - torch.manual_seed(777) - args = get_args() - args.ln_run_on_bpu = False - # NOTE(xcsong): XJ3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - args.feature_size = configs['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - export_encoder(model, args) - export_ctc(model, args) - export_decoder(model, args) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/export_onnx_cpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/export_onnx_cpu.py deleted file mode 100644 index a8009d2f606f753a5870eb754235d8d55e756b5d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/export_onnx_cpu.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright (c) 2022, Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os -import copy -import sys - -import torch -import yaml -import numpy as np - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - -try: - import onnx - import onnxruntime - from onnxruntime.quantization import quantize_dynamic, QuantType -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - args = parser.parse_args() - return args - - -def to_numpy(tensor): - if tensor.requires_grad: - return tensor.detach().cpu().numpy() - else: - return tensor.cpu().numpy() - - -def print_input_output_info(onnx_model, name, prefix="\t\t"): - input_names = [node.name for node in onnx_model.graph.input] - input_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.input] - output_names = [node.name for node in onnx_model.graph.output] - output_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.output] - print("{}{} inputs : {}".format(prefix, name, input_names)) - print("{}{} input shapes : {}".format(prefix, name, input_shapes)) - print("{}{} outputs: {}".format(prefix, name, output_names)) - print("{}{} output shapes : {}".format(prefix, name, output_shapes)) - - -def export_encoder(asr_model, args): - print("Stage-1: export encoder") - encoder = asr_model.encoder - encoder.forward = encoder.forward_chunk - encoder_outpath = os.path.join(args['output_dir'], 'encoder.onnx') - - print("\tStage-1.1: prepare inputs for encoder") - chunk = torch.randn( - (args['batch'], args['decoding_window'], args['feature_size'])) - offset = 0 - # NOTE(xcsong): The uncertainty of `next_cache_start` only appears - # in the first few chunks, this is caused by dynamic att_cache shape, i,e - # (0, 0, 0, 0) for 1st chunk and (elayers, head, ?, d_k*2) for subsequent - # chunks. One way to ease the ONNX export is to keep `next_cache_start` - # as a fixed value. To do this, for the **first** chunk, if - # left_chunks > 0, we feed real cache & real mask to the model, otherwise - # fake cache & fake mask. In this way, we get: - # 1. 16/-1 mode: next_cache_start == 0 for all chunks - # 2. 16/4 mode: next_cache_start == chunk_size for all chunks - # 3. 16/0 mode: next_cache_start == chunk_size for all chunks - # 4. -1/-1 mode: next_cache_start == 0 for all chunks - # NO MORE DYNAMIC CHANGES!! - # - # NOTE(Mddct): We retain the current design for the convenience of supporting some - # inference frameworks without dynamic shapes. 
If you're interested in all-in-one - # model that supports different chunks please see: - # https://github.com/wenet-e2e/wenet/pull/1174 - - if args['left_chunks'] > 0: # 16/4 - required_cache_size = args['chunk_size'] * args['left_chunks'] - offset = required_cache_size - # Real cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], required_cache_size, - args['output_size'] // args['head'] * 2)) - # Real mask - att_mask = torch.ones( - (args['batch'], 1, required_cache_size + args['chunk_size']), - dtype=torch.bool) - att_mask[:, :, :required_cache_size] = 0 - elif args['left_chunks'] <= 0: # 16/-1, -1/-1, 16/0 - required_cache_size = -1 if args['left_chunks'] < 0 else 0 - # Fake cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], 0, - args['output_size'] // args['head'] * 2)) - # Fake mask - att_mask = torch.ones((0, 0, 0), dtype=torch.bool) - cnn_cache = torch.zeros( - (args['num_blocks'], args['batch'], - args['output_size'], args['cnn_module_kernel'] - 1)) - inputs = (chunk, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - print("\t\tchunk.size(): {}\n".format(chunk.size()), - "\t\toffset: {}\n".format(offset), - "\t\trequired_cache: {}\n".format(required_cache_size), - "\t\tatt_cache.size(): {}\n".format(att_cache.size()), - "\t\tcnn_cache.size(): {}\n".format(cnn_cache.size()), - "\t\tatt_mask.size(): {}\n".format(att_mask.size())) - - print("\tStage-1.2: torch.onnx.export") - dynamic_axes = { - 'chunk': {1: 'T'}, - 'att_cache': {2: 'T_CACHE'}, - 'att_mask': {2: 'T_ADD_T_CACHE'}, - 'output': {1: 'T'}, - 'r_att_cache': {2: 'T_CACHE'}, - } - # NOTE(xcsong): We keep dynamic axes even if in 16/4 mode, this is - # to avoid padding the last chunk (which usually contains less - # frames than required). For users who want static axes, just pop - # out specific axis. - # if args['chunk_size'] > 0: # 16/4, 16/-1, 16/0 - # dynamic_axes.pop('chunk') - # dynamic_axes.pop('output') - # if args['left_chunks'] >= 0: # 16/4, 16/0 - # # NOTE(xsong): since we feed real cache & real mask into the - # # model when left_chunks > 0, the shape of cache will never - # # be changed. - # dynamic_axes.pop('att_cache') - # dynamic_axes.pop('r_att_cache') - torch.onnx.export( - encoder, inputs, encoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=[ - 'chunk', 'offset', 'required_cache_size', - 'att_cache', 'cnn_cache', 'att_mask' - ], - output_names=['output', 'r_att_cache', 'r_cnn_cache'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for (k, v) in args.items(): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - # NOTE(xcsong): to add those metadatas we need to reopen - # the file and resave it. - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - # Dynamic quantization - model_fp32 = encoder_outpath - model_quant = os.path.join(args['output_dir'], 'encoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - print("\tStage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk = copy.deepcopy(chunk) - torch_offset = copy.deepcopy(offset) - torch_required_cache_size = copy.deepcopy(required_cache_size) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - torch_att_mask = copy.deepcopy(att_mask) - for i in range(10): - print("\t\ttorch chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, list(torch_chunk.size()), torch_offset, - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), list(torch_att_mask.size()))) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - torch_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_offset, torch_required_cache_size, - torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_offset += out.size(1) - torch_output = torch.cat(torch_output, dim=1) - - onnx_output = [] - onnx_chunk = to_numpy(chunk) - onnx_offset = np.array((offset)).astype(np.int64) - onnx_required_cache_size = np.array((required_cache_size)).astype(np.int64) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - onnx_att_mask = to_numpy(att_mask) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - print("\t\tonnx chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, onnx_chunk.shape, onnx_offset, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - onnx_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'offset': onnx_offset, - 'required_cache_size': onnx_required_cache_size, - 'att_cache': onnx_att_cache, 'cnn_cache': onnx_cnn_cache, - 'att_mask': onnx_att_mask - } - # NOTE(xcsong): If we use 16/-1, -1/-1 or 16/0 mode, `next_cache_start` - # will be hardcoded to 0 or chunk_size by ONNX, thus - # required_cache_size and att_mask are no more needed and they will - # be removed by ONNX automatically. 
- for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_offset += ort_outs[0].shape[1] - onnx_output = np.concatenate(onnx_output, axis=1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-05) - meta = ort_session.get_modelmeta() - print("\t\tcustom_metadata_map={}".format(meta.custom_metadata_map)) - print("\t\tCheck onnx_encoder, pass!") - - -def export_ctc(asr_model, args): - print("Stage-2: export ctc") - ctc = asr_model.ctc - ctc.forward = ctc.log_softmax - ctc_outpath = os.path.join(args['output_dir'], 'ctc.onnx') - - print("\tStage-2.1: prepare inputs for ctc") - hidden = torch.randn( - (args['batch'], args['chunk_size'] if args['chunk_size'] > 0 else 16, - args['output_size'])) - - print("\tStage-2.2: torch.onnx.export") - dynamic_axes = {'hidden': {1: 'T'}, 'probs': {1: 'T'}} - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for (k, v) in args.items(): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - # Dynamic quantization - model_fp32 = ctc_outpath - model_quant = os.path.join(args['output_dir'], 'ctc.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_ctc, done! see {}'.format(ctc_outpath)) - - print("\tStage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_ctc, pass!") - - -def export_decoder(asr_model, args): - print("Stage-3: export decoder") - decoder = asr_model - # NOTE(lzhin): parameters of encoder will be automatically removed - # since they are not used during rescoring. - decoder.forward = decoder.forward_attention_decoder - decoder_outpath = os.path.join(args['output_dir'], 'decoder.onnx') - - print("\tStage-3.1: prepare inputs for decoder") - # hardcode time->200 nbest->10 len->20, they are dynamic axes. 
- encoder_out = torch.randn((1, 200, args['output_size'])) - hyps = torch.randint(low=0, high=args['vocab_size'], - size=[10, 20]) - hyps[:, 0] = args['vocab_size'] - 1 # - hyps_lens = torch.randint(low=15, high=21, size=[10]) - - print("\tStage-3.2: torch.onnx.export") - dynamic_axes = { - 'hyps': {0: 'NBEST', 1: 'L'}, 'hyps_lens': {0: 'NBEST'}, - 'encoder_out': {1: 'T'}, - 'score': {0: 'NBEST', 1: 'L'}, 'r_score': {0: 'NBEST', 1: 'L'} - } - inputs = (hyps, hyps_lens, encoder_out, args['reverse_weight']) - torch.onnx.export( - decoder, inputs, decoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hyps', 'hyps_lens', 'encoder_out', 'reverse_weight'], - output_names=['score', 'r_score'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_decoder = onnx.load(decoder_outpath) - for (k, v) in args.items(): - meta = onnx_decoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_decoder) - onnx.helper.printable_graph(onnx_decoder.graph) - onnx.save(onnx_decoder, decoder_outpath) - print_input_output_info(onnx_decoder, "onnx_decoder") - model_fp32 = decoder_outpath - model_quant = os.path.join(args['output_dir'], 'decoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_decoder, done! see {}'.format( - decoder_outpath)) - - print("\tStage-3.3: check onnx_decoder and torch_decoder") - torch_score, torch_r_score = decoder( - hyps, hyps_lens, encoder_out, args['reverse_weight']) - ort_session = onnxruntime.InferenceSession(decoder_outpath) - input_names = [node.name for node in onnx_decoder.graph.input] - ort_inputs = { - 'hyps': to_numpy(hyps), - 'hyps_lens': to_numpy(hyps_lens), - 'encoder_out': to_numpy(encoder_out), - 'reverse_weight': np.array((args['reverse_weight'])), - } - for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - onnx_output = ort_session.run(None, ort_inputs) - - np.testing.assert_allclose(to_numpy(torch_score), onnx_output[0], - rtol=1e-03, atol=1e-05) - if args['is_bidirectional_decoder'] and args['reverse_weight'] > 0.0: - np.testing.assert_allclose(to_numpy(torch_r_score), onnx_output[1], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_decoder, pass!") - - -def main(): - torch.manual_seed(777) - args = get_args() - output_dir = args.output_dir - os.system("mkdir -p " + output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - arguments = {} - arguments['output_dir'] = output_dir - arguments['batch'] = 1 - arguments['chunk_size'] = args.chunk_size - arguments['left_chunks'] = args.num_decoding_left_chunks - arguments['reverse_weight'] = args.reverse_weight - arguments['output_size'] = configs['encoder_conf']['output_size'] - arguments['num_blocks'] = configs['encoder_conf']['num_blocks'] - arguments['cnn_module_kernel'] = configs['encoder_conf'].get('cnn_module_kernel', 1) - arguments['head'] = configs['encoder_conf']['attention_heads'] - arguments['feature_size'] = configs['input_dim'] - arguments['vocab_size'] = configs['output_dim'] - # NOTE(xcsong): if chunk_size == -1, hardcode to 67 - arguments['decoding_window'] = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 if args.chunk_size > 0 else 67 - arguments['encoder'] = configs['encoder'] - 
arguments['decoder'] = configs['decoder'] - arguments['subsampling_rate'] = model.subsampling_rate() - arguments['right_context'] = model.right_context() - arguments['sos_symbol'] = model.sos_symbol() - arguments['eos_symbol'] = model.eos_symbol() - arguments['is_bidirectional_decoder'] = 1 \ - if model.is_bidirectional_decoder() else 0 - - # NOTE(xcsong): Please note that -1/-1 means non-streaming model! It is - # not a [16/4 16/-1 16/0] all-in-one model and it should not be used in - # streaming mode (i.e., setting chunk_size=16 in `decoder_main`). If you - # want to use 16/-1 or any other streaming mode in `decoder_main`, - # please export onnx in the same config. - if arguments['left_chunks'] > 0: - assert arguments['chunk_size'] > 0 # -1/4 not supported - - export_encoder(model, arguments) - export_ctc(model, arguments) - export_decoder(model, arguments) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/export_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/export_onnx_gpu.py deleted file mode 100644 index 19f810c2804efdf74ff369f780fa3102e2e389fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/export_onnx_gpu.py +++ /dev/null @@ -1,1056 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import os -import sys - -import torch -import yaml -import logging - -import torch.nn.functional as F -from wenet.utils.checkpoint import load_checkpoint -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import BaseEncoder -from wenet.utils.init_model import init_model -from wenet.utils.mask import make_pad_mask - -try: - import onnxruntime -except ImportError: - print('Please install onnxruntime-gpu!') - sys.exit(1) - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class Encoder(torch.nn.Module): - def __init__(self, - encoder: BaseEncoder, - ctc: CTC, - beam_size: int = 10): - super().__init__() - self.encoder = encoder - self.ctc = ctc - self.beam_size = beam_size - - def forward(self, speech: torch.Tensor, - speech_lengths: torch.Tensor,): - """Encoder - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - Returns: - encoder_out: B x T x F - encoder_out_lens: B - ctc_log_probs: B x T x V - beam_log_probs: B x T x beam_size - beam_log_probs_idx: B x T x beam_size - """ - encoder_out, encoder_mask = self.encoder(speech, - speech_lengths, - -1, -1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_log_probs = self.ctc.log_softmax(encoder_out) - encoder_out_lens = encoder_out_lens.int() - beam_log_probs, beam_log_probs_idx = torch.topk( - ctc_log_probs, self.beam_size, dim=2) - return encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx - - -class StreamingEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size, transformer=False): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.transformer = transformer - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoder.encoders): - xs, _, new_att_cache, new_cnn_cache = layer( - xs, masks, pos_emb, - att_cache=att_cache[i], - cnn_cache=cnn_cache[i]) - # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (B, hidden-dim, cache_t2) - r_att_cache.append( - new_att_cache[:, :, next_cache_start:, :].unsqueeze(1)) - if not self.transformer: - r_cnn_cache.append(new_cnn_cache.unsqueeze(1)) - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - if not self.transformer: - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingSqueezeformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.reduce_idx = model.encoder.reduce_idx - self.recover_idx = model.encoder.recover_idx - if self.reduce_idx is None: - self.time_reduce = None - else: - if self.recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - 
recover_exp)) - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - elayers, cache_size = att_cache.size(0), att_cache.size(3) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = att_mask[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.encoder.preln(xs) - for i, layer in enumerate(self.encoder.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append( - (xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.encoder.time_reduction_layer( - xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - if self.encoder.pos_enc_layer_type == "rel_pos_repaired": - pos_emb = 
pos_emb[:, :xs.size(1) * 2 - 1, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.encoder.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(1) - cached_att = cached_att.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1)) - r_cnn_cache.append(cached_cnn) - - chunk_out = xs - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingEfficientConformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - - # Efficient Conformer - self.stride_layer_idx = model.encoder.stride_layer_idx - self.stride = model.encoder.stride - self.num_blocks = model.encoder.num_blocks - self.cnn_module_kernel = model.encoder.cnn_module_kernel - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - chunk_lens (torch.Tensor): - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * 
num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) # (b, ) - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - # Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2) - # Shape(cnn_cache): (elayers, b, outsize, cnn_kernel) - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = chunk_mask.to(torch.bool) - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoder.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - # We propose to double the chunk_size. 
- att_cache_trunc = xs.size(1) + \ - att_cache.size(3) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i][:, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [batch, 1, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID - - # use repeat_interleave to new_att_cache - # new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - new_att_cache = new_att_cache.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :].unsqueeze(1)) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - # shape of r_att_cache: (b, elayers, head, time2, outdim) - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - # shape of r_cnn_cache: (b, elayers, outdim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - chunk_out_lens += 1 - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class Decoder(torch.nn.Module): - def __init__(self, - decoder: TransformerDecoder, - ctc_weight: float = 0.5, - reverse_weight: float = 0.0, - beam_size: int = 10, - decoder_fastertransformer: bool = False): - super().__init__() - self.decoder = decoder - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - self.beam_size = beam_size - self.decoder_fastertransformer = decoder_fastertransformer - - def forward(self, - encoder_out: torch.Tensor, - encoder_lens: torch.Tensor, - hyps_pad_sos_eos: torch.Tensor, - hyps_lens_sos: torch.Tensor, - r_hyps_pad_sos_eos: torch.Tensor, - ctc_score: torch.Tensor): - """Encoder - Args: - encoder_out: B x T x F - encoder_lens: B - hyps_pad_sos_eos: B x beam x (T2+1), - hyps with sos & eos and padded by ignore id - hyps_lens_sos: B x beam, length for each hyp with sos - r_hyps_pad_sos_eos: B 
x beam x (T2+1), - reversed hyps with sos & eos and padded by ignore id - ctc_score: B x beam, ctc score for each hyp - Returns: - decoder_out: B x beam x T2 x V - r_decoder_out: B x beam x T2 x V - best_index: B - """ - B, T, F = encoder_out.shape - bz = self.beam_size - B2 = B * bz - encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F) - encoder_mask = ~make_pad_mask(encoder_lens, T).unsqueeze(1) - encoder_mask = encoder_mask.repeat(1, bz, 1).view(B2, 1, T) - T2 = hyps_pad_sos_eos.shape[2] - 1 - hyps_pad = hyps_pad_sos_eos.view(B2, T2 + 1) - hyps_lens = hyps_lens_sos.view(B2,) - hyps_pad_sos = hyps_pad[:, :-1].contiguous() - hyps_pad_eos = hyps_pad[:, 1:].contiguous() - - r_hyps_pad = r_hyps_pad_sos_eos.view(B2, T2 + 1) - r_hyps_pad_sos = r_hyps_pad[:, :-1].contiguous() - r_hyps_pad_eos = r_hyps_pad[:, 1:].contiguous() - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad_sos, hyps_lens, r_hyps_pad_sos, - self.reverse_weight) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - V = decoder_out.shape[-1] - decoder_out = decoder_out.view(B2, T2, V) - mask = ~make_pad_mask(hyps_lens, T2) # B2 x T2 - # mask index, remove ignore id - index = torch.unsqueeze(hyps_pad_eos * mask, 2) - score = decoder_out.gather(2, index).squeeze(2) # B2 X T2 - # mask padded part - score = score * mask - decoder_out = decoder_out.view(B, bz, T2, V) - if self.reverse_weight > 0: - r_decoder_out = torch.nn.functional.log_softmax( - r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.view(B2, T2, V) - index = torch.unsqueeze(r_hyps_pad_eos * mask, 2) - r_score = r_decoder_out.gather(2, index).squeeze(2) - r_score = r_score * mask - score = score * (1 - self.reverse_weight) + \ - self.reverse_weight * r_score - r_decoder_out = r_decoder_out.view(B, bz, T2, V) - score = torch.sum(score, axis=1) # B2 - score = torch.reshape(score, (B, bz)) + self.ctc_weight * ctc_score - best_index = torch.argmax(score, dim=1) - if self.decoder_fastertransformer: - return decoder_out, best_index - else: - return best_index - - -def to_numpy(tensors): - out = [] - if type(tensors) == torch.tensor: - tensors = [tensors] - for tensor in tensors: - if tensor.requires_grad: - tensor = tensor.detach().cpu().numpy() - else: - tensor = tensor.cpu().numpy() - out.append(tensor) - return out - - -def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True): - for a, b in zip(xlist, blist): - try: - torch.testing.assert_allclose(a, b, rtol=rtol, atol=atol) - except AssertionError as error: - if tolerate_small_mismatch: - print(error) - else: - raise - - -def export_offline_encoder(model, configs, args, logger, encoder_onnx_path): - bz = 32 - seq_len = 100 - beam_size = args.beam_size - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint( - low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - dynamic_axes={ - 'speech': {0: 'B', 1: 'T'}, - 'speech_lengths': {0: 'B'}, - 'encoder_out': {0: 'B', 1: 'T_OUT'}, - 'encoder_out_lens': {0: 'B'}, - 'ctc_log_probs': {0: 'B', 1: 'T_OUT'}, - 'beam_log_probs': {0: 'B', 1: 
'T_OUT'}, - 'beam_log_probs_idx': {0: 'B', 1: 'T_OUT'}, - }, - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - -def export_offline_encoder_static(model, configs, args, logger, encoder_onnx_path): - bz = args.batch_size - seq_len = args.seq_len - beam_size = args.beam_size - - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint(low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - import os - file_name, file_ext = os.path.splitext(encoder_onnx_path) - encoder_onnx_path = file_name + "_bs" + str(bz) + "_seq" + str(seq_len) + "_static.onnx" - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CPUExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - - -def export_online_encoder(model, configs, args, logger, encoder_onnx_path): - decoding_chunk_size = args.decoding_chunk_size - subsampling = model.encoder.embed.subsampling_rate - context = model.encoder.embed.right_context + 1 - decoding_window = (decoding_chunk_size - 1) * subsampling + context - batch_size = 32 - audio_len = decoding_window - feature_size = configs["input_dim"] - output_size = configs["encoder_conf"]["output_size"] - num_layers = configs["encoder_conf"]["num_blocks"] - # in transformer the cnn module will not be available - transformer = False - cnn_module_kernel = configs["encoder_conf"].get("cnn_module_kernel", 1) - 1 - if not cnn_module_kernel: - transformer = True - num_decoding_left_chunks = args.num_decoding_left_chunks - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if configs['encoder'] == 'squeezeformer': - encoder = StreamingSqueezeformerEncoder( - model, required_cache_size, args.beam_size) - elif configs['encoder'] == 'efficientConformer': - encoder = StreamingEfficientConformerEncoder( - model, required_cache_size, args.beam_size) - else: - encoder = StreamingEncoder( - model, required_cache_size, args.beam_size, transformer) - encoder.eval() - - # begin to export encoder - chunk_xs = 
torch.randn(batch_size, audio_len, - feature_size, dtype=torch.float32) - chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len - - offset = torch.arange(0, batch_size).unsqueeze(1) - # (elayers, b, head, cache_t1, d_k * 2) - head = configs["encoder_conf"]["attention_heads"] - d_k = configs["encoder_conf"]["output_size"] // head - att_cache = torch.randn(batch_size, num_layers, head, - required_cache_size, d_k * 2, - dtype=torch.float32) - cnn_cache = torch.randn(batch_size, num_layers, output_size, - cnn_module_kernel, dtype=torch.float32) - - cache_mask = torch.ones( - batch_size, 1, required_cache_size, dtype=torch.float32) - input_names = ['chunk_xs', 'chunk_lens', 'offset', - 'att_cache', 'cnn_cache', 'cache_mask'] - output_names = ['log_probs', 'log_probs_idx', 'chunk_out', - 'chunk_out_lens', 'r_offset', 'r_att_cache', - 'r_cnn_cache', 'r_cache_mask'] - input_tensors = (chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - output_names.pop(6) - - all_names = input_names + output_names - dynamic_axes = {} - for name in all_names: - # only the first dimension is dynamic - # all other dimension is fixed - dynamic_axes[name] = {0: 'B'} - - torch.onnx.export(encoder, - input_tensors, - encoder_onnx_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - verbose=False) - - with torch.no_grad(): - torch_outs = encoder(chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - torch_outs = list(torch_outs).pop(6) - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=["CUDAExecutionProvider"]) - ort_inputs = {} - - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - if transformer: - del ort_inputs['cnn_cache'] - ort_outs = ort_session.run(None, ort_inputs) - test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx streaming encoder succeed!") - onnx_config = { - "subsampling_rate": subsampling, - "context": context, - "decoding_chunk_size": decoding_chunk_size, - "num_decoding_left_chunks": num_decoding_left_chunks, - "beam_size": args.beam_size, - "fp16": args.fp16, - "feat_size": feature_size, - "decoding_window": decoding_window, - "cnn_module_kernel_cache": cnn_module_kernel - } - return onnx_config - - -def export_rescoring_decoder(model, configs, args, - logger, decoder_onnx_path, decoder_fastertransformer): - bz, seq_len = 32, 100 - beam_size = args.beam_size - decoder = Decoder(model.decoder, - model.ctc_weight, - model.reverse_weight, - beam_size, - decoder_fastertransformer) - decoder.eval() - - hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - hyps_lens_sos = torch.randint(low=3, high=seq_len, size=(bz, beam_size), - dtype=torch.int32) - r_hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - - output_size = configs["encoder_conf"]["output_size"] - encoder_out = torch.randn(bz, seq_len, output_size, dtype=torch.float32) - encoder_out_lens = torch.randint( - low=3, high=seq_len, size=(bz,), dtype=torch.int32) - ctc_score = torch.randn(bz, beam_size, dtype=torch.float32) - - input_names = ['encoder_out', 'encoder_out_lens', - 'hyps_pad_sos_eos', 'hyps_lens_sos', - 'r_hyps_pad_sos_eos', 'ctc_score'] - output_names = ['best_index'] - if decoder_fastertransformer: - output_names.insert(0, 'decoder_out') - - 
torch.onnx.export(decoder, - (encoder_out, encoder_out_lens, - hyps_pad_sos_eos, hyps_lens_sos, - r_hyps_pad_sos_eos, ctc_score), - decoder_onnx_path, - export_params=True, - opset_version=13, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes={'encoder_out': {0: 'B', 1: 'T'}, - 'encoder_out_lens': {0: 'B'}, - 'hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'hyps_lens_sos': {0: 'B'}, - 'r_hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'ctc_score': {0: 'B'}, - 'best_index': {0: 'B'}, - }, - verbose=False - ) - with torch.no_grad(): - o0 = decoder(encoder_out, - encoder_out_lens, - hyps_pad_sos_eos, - hyps_lens_sos, - r_hyps_pad_sos_eos, - ctc_score) - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(decoder_onnx_path, - providers=providers) - - input_tensors = [encoder_out, encoder_out_lens, hyps_pad_sos_eos, - hyps_lens_sos, r_hyps_pad_sos_eos, ctc_score] - ort_inputs = {} - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - - # if model.reverse weight == 0, - # the r_hyps_pad will be removed - # from the onnx decoder since it doen't play any role - if model.reverse_weight == 0: - del ort_inputs['r_hyps_pad_sos_eos'] - ort_outs = ort_session.run(None, ort_inputs) - - # check decoder output - if decoder_fastertransformer: - test(to_numpy(o0), ort_outs, rtol=1e-03, atol=1e-05) - else: - test(to_numpy([o0]), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx decoder succeed!") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='export x86_gpu model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--cmvn_file', required=False, default='', type=str, - help='global_cmvn file, default path is in config file') - parser.add_argument('--reverse_weight', default=-1.0, type=float, - required=False, - help='reverse weight for bitransformer,' + - 'default value is in config file') - parser.add_argument('--ctc_weight', default=-1.0, type=float, - required=False, - help='ctc weight, default value is in config file') - parser.add_argument('--batch_size', type=int, default=24, help='encoder batch size') - parser.add_argument('--seq_len', default=512, type=int, required=False, - help="Encoder seq_len") - parser.add_argument('--beam_size', default=10, type=int, required=False, - help="beam size would be ctc output size") - parser.add_argument('--output_onnx_dir', - default="onnx_model", - help='output onnx encoder and decoder directory') - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - # arguments for streaming encoder - parser.add_argument('--streaming', - action='store_true', - help="whether to export streaming encoder, default false") - parser.add_argument('--decoding_chunk_size', - default=16, - type=int, - required=False, - help='the decoding chunk size, <=0 is not supported') - parser.add_argument('--num_decoding_left_chunks', - default=5, - type=int, - required=False, - help="number of left chunks, <= 0 is not supported") - parser.add_argument('--decoder_fastertransformer', - action='store_true', - help='return decoder_out and best_index for ft') - args = parser.parse_args() - - torch.manual_seed(0) - torch.set_printoptions(precision=10) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if 
args.cmvn_file and os.path.exists(args.cmvn_file): - configs['cmvn_file'] = args.cmvn_file - if args.reverse_weight != -1.0 and 'reverse_weight' in configs['model_conf']: - configs['model_conf']['reverse_weight'] = args.reverse_weight - print("Update reverse weight to", args.reverse_weight) - if args.ctc_weight != -1: - print("Update ctc weight to ", args.ctc_weight) - configs['model_conf']['ctc_weight'] = args.ctc_weight - configs["encoder_conf"]["use_dynamic_chunk"] = False - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - - if not os.path.exists(args.output_onnx_dir): - os.mkdir(args.output_onnx_dir) - encoder_onnx_path = os.path.join(args.output_onnx_dir, 'encoder.onnx') - export_enc_func = None - if args.streaming: - assert args.decoding_chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - export_enc_func = export_online_encoder - else: - export_enc_func = export_offline_encoder_static - - onnx_config = export_enc_func( - model, configs, args, logger, encoder_onnx_path) - - decoder_onnx_path = os.path.join(args.output_onnx_dir, 'decoder.onnx') - export_rescoring_decoder(model, configs, args, logger, - decoder_onnx_path, args.decoder_fastertransformer) - - if args.fp16: - try: - import onnxmltools - from onnxmltools.utils.float16_converter import convert_float_to_float16 - except ImportError: - print('Please install onnxmltools!') - sys.exit(1) - encoder_onnx_model = onnxmltools.utils.load_model(encoder_onnx_path) - encoder_onnx_model = convert_float_to_float16(encoder_onnx_model) - encoder_onnx_path = os.path.join( - args.output_onnx_dir, 'encoder_fp16.onnx') - onnxmltools.utils.save_model(encoder_onnx_model, encoder_onnx_path) - decoder_onnx_model = onnxmltools.utils.load_model(decoder_onnx_path) - decoder_onnx_model = convert_float_to_float16(decoder_onnx_model) - decoder_onnx_path = os.path.join( - args.output_onnx_dir, 'decoder_fp16.onnx') - onnxmltools.utils.save_model(decoder_onnx_model, decoder_onnx_path) - # dump configurations - - config_dir = os.path.join(args.output_onnx_dir, "config.yaml") - with open(config_dir, "w") as out: - yaml.dump(onnx_config, out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/recognize.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/recognize.py deleted file mode 100644 index 03b5dfd42cc098efacd20e08756a5300f6477cc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/recognize.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--beam_size', - type=int, - default=10, - help='beam size for search') - parser.add_argument('--penalty', - type=float, - default=0.0, - help='length penalty') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=16, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'attention', 'ctc_greedy_search', - 'ctc_prefix_beam_search', 'attention_rescoring', - 'rnnt_greedy_search', 'rnnt_beam_search', - 'rnnt_beam_attn_rescoring', 'ctc_beam_td_attn_rescoring', - 'hlg_onebest', 'hlg_rescore' - ], - default='attention', - help='decoding mode') - - parser.add_argument('--search_ctc_weight', - type=float, - default=1.0, - help='ctc weight for nbest generation') - parser.add_argument('--search_transducer_weight', - type=float, - default=0.0, - help='transducer weight for nbest generation') - parser.add_argument('--ctc_weight', - type=float, - default=0.0, - help='ctc weight for rescoring weight in \ - attention rescoring decode mode \ - ctc weight for rescoring weight in \ - transducer attention rescore decode mode') - - parser.add_argument('--transducer_weight', - type=float, - default=0.0, - help='transducer weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--attn_weight', - type=float, - default=0.0, - help='attention weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--decoding_chunk_size', - type=int, - default=-1, - help='''decoding chunk size, - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here''') - parser.add_argument('--num_decoding_left_chunks', - type=int, - default=-1, - help='number of left chunks for decoding') - parser.add_argument('--simulate_streaming', - action='store_true', - help='simulate streaming inference') - parser.add_argument('--reverse_weight', - type=float, - default=0.0, - help='''right to left weight for attention rescoring - decode mode''') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--connect_symbol', - default='', - type=str, - help='used to connect the output characters') - - parser.add_argument('--word', - default='', - type=str, - help='word file, only used for hlg decode') - parser.add_argument('--hlg', - default='', - type=str, - help='hlg file, only used for hlg decode') - parser.add_argument('--lm_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--r_decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' - ] and args.batch_size > 1: - logging.fatal( - 'decoding mode {} must be running with batch_size == 1'.format( - args.mode)) - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_sub'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - if 'fbank_conf' in test_conf: - test_conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in test_conf: - test_conf['mfcc_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - # Load dict - char_dict = {v: k for k, v in symbol_table.items()} - eos = len(char_dict) - 1 - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w') as fout: - for batch_idx, 
batch in enumerate(test_data_loader): - keys, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - if args.mode == 'attention': - hyps, _ = model.recognize( - feats, - feats_lengths, - beam_size=args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp.tolist() for hyp in hyps] - elif args.mode == 'ctc_greedy_search': - hyps, _ = model.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_greedy_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_beam_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.beam_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.search_ctc_weight, - transducer_weight=args.search_transducer_weight) - elif args.mode == 'rnnt_beam_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight) - elif args.mode == 'ctc_beam_td_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight, - beam_search_type='ctc') - # ctc_prefix_beam_search and attention_rescoring only return one - # result in List[int], change it to List[List[int]] for compatible - # with other batch decoding mode - elif args.mode == 'ctc_prefix_beam_search': - assert (feats.size(0) == 1) - hyp, _ = model.ctc_prefix_beam_search( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp] - elif args.mode == 'attention_rescoring': - assert (feats.size(0) == 1) - hyp, _ = model.attention_rescoring( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - 
ctc_weight=args.ctc_weight, - simulate_streaming=args.simulate_streaming, - reverse_weight=args.reverse_weight) - hyps = [hyp] - elif args.mode == 'hlg_onebest': - hyps = model.hlg_onebest( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - elif args.mode == 'hlg_rescore': - hyps = model.hlg_rescore( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - lm_scale=args.lm_scale, - decoder_scale=args.decoder_scale, - r_decoder_scale=args.r_decoder_scale, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - for i, key in enumerate(keys): - content = [] - for w in hyps[i]: - if w == eos: - break - content.append(char_dict[w]) - logging.info('{} {}'.format(key, args.connect_symbol.join(content))) - fout.write('{} {}\n'.format(key, args.connect_symbol.join(content))) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/recognize_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/recognize_onnx_gpu.py deleted file mode 100644 index 42f403bf55ac0bc51d9c754574d3479345948122..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/recognize_onnx_gpu.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is for testing exported onnx encoder and decoder from -export_onnx_gpu.py. The exported onnx models only support batch offline ASR inference. -It requires a python wrapped c++ ctc decoder. 
-Please install it by following: -https://github.com/Slyne/ctc_decoder.git -""" -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.common import IGNORE_ID -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.config import override_config - -import onnxruntime as rt -import multiprocessing -import numpy as np - -try: - from swig_decoders import map_batch, \ - ctc_beam_search_decoder_batch, \ - TrieVector, PathTrie -except ImportError: - print('Please install ctc decoders first by refering to\n' + - 'https://github.com/Slyne/ctc_decoder.git') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--encoder_onnx', required=True, help='encoder onnx file') - parser.add_argument('--decoder_onnx', required=True, help='decoder onnx file') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=32, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'ctc_greedy_search', 'ctc_prefix_beam_search', - 'attention_rescoring'], - default='attention_rescoring', - help='decoding mode') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - reverse_weight = configs["model_conf"].get("reverse_weight", 0.0) - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - test_conf['fbank_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - 
- # Init asr model from configs - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - if use_cuda: - EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - else: - EP_list = ['CPUExecutionProvider'] - - encoder_ort_session = rt.InferenceSession(args.encoder_onnx, providers=EP_list) - decoder_ort_session = None - if args.mode == "attention_rescoring": - decoder_ort_session = rt.InferenceSession(args.decoder_onnx, providers=EP_list) - - # Load dict - vocabulary = [] - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - vocabulary.append(arr[0]) - eos = sos = len(char_dict) - 1 - with torch.no_grad(), open(args.result_file, 'w') as fout: - for _, batch in enumerate(test_data_loader): - keys, feats, _, feats_lengths, _ = batch - feats, feats_lengths = feats.numpy(), feats_lengths.numpy() - if args.fp16: - feats = feats.astype(np.float16) - ort_inputs = { - encoder_ort_session.get_inputs()[0].name: feats, - encoder_ort_session.get_inputs()[1].name: feats_lengths} - ort_outs = encoder_ort_session.run(None, ort_inputs) - encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx = ort_outs - beam_size = beam_log_probs.shape[-1] - batch_size = beam_log_probs.shape[0] - num_processes = min(multiprocessing.cpu_count(), batch_size) - if args.mode == 'ctc_greedy_search': - if beam_size != 1: - log_probs_idx = beam_log_probs_idx[:, :, 0] - batch_sents = [] - for idx, seq in enumerate(log_probs_idx): - batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) - hyps = map_batch(batch_sents, vocabulary, num_processes, - True, 0) - elif args.mode in ('ctc_prefix_beam_search', "attention_rescoring"): - batch_log_probs_seq_list = beam_log_probs.tolist() - batch_log_probs_idx_list = beam_log_probs_idx.tolist() - batch_len_list = encoder_out_lens.tolist() - batch_log_probs_seq = [] - batch_log_probs_ids = [] - batch_start = [] # only effective in streaming deployment - batch_root = TrieVector() - root_dict = {} - for i in range(len(batch_len_list)): - num_sent = batch_len_list[i] - batch_log_probs_seq.append( - batch_log_probs_seq_list[i][0:num_sent]) - batch_log_probs_ids.append( - batch_log_probs_idx_list[i][0:num_sent]) - root_dict[i] = PathTrie() - batch_root.append(root_dict[i]) - batch_start.append(True) - score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq, - batch_log_probs_ids, - batch_root, - batch_start, - beam_size, - num_processes, - 0, -2, 0.99999) - if args.mode == 'ctc_prefix_beam_search': - hyps = [] - for cand_hyps in score_hyps: - hyps.append(cand_hyps[0][1]) - hyps = map_batch(hyps, vocabulary, num_processes, False, 0) - if args.mode == 'attention_rescoring': - ctc_score, all_hyps = [], [] - max_len = 0 - for hyps in score_hyps: - cur_len = len(hyps) - if len(hyps) < beam_size: - hyps += (beam_size - cur_len) * [(-float("INF"), (0,))] - cur_ctc_score = [] - for hyp in hyps: - cur_ctc_score.append(hyp[0]) - all_hyps.append(list(hyp[1])) - if len(hyp[1]) > max_len: - max_len = len(hyp[1]) - ctc_score.append(cur_ctc_score) - if args.fp16: - ctc_score = np.array(ctc_score, dtype=np.float16) - else: - ctc_score = np.array(ctc_score, dtype=np.float32) - hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - r_hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32) - k = 0 - for i in 
range(batch_size): - for j in range(beam_size): - cand = all_hyps[k] - l = len(cand) + 2 - hyps_pad_sos_eos[i][j][0:l] = [sos] + cand + [eos] - r_hyps_pad_sos_eos[i][j][0:l] = [sos] + cand[::-1] + [eos] - hyps_lens_sos[i][j] = len(cand) + 1 - k += 1 - decoder_ort_inputs = { - decoder_ort_session.get_inputs()[0].name: encoder_out, - decoder_ort_session.get_inputs()[1].name: encoder_out_lens, - decoder_ort_session.get_inputs()[2].name: hyps_pad_sos_eos, - decoder_ort_session.get_inputs()[3].name: hyps_lens_sos, - decoder_ort_session.get_inputs()[-1].name: ctc_score} - if reverse_weight > 0: - r_hyps_pad_sos_eos_name = decoder_ort_session.get_inputs()[4].name - decoder_ort_inputs[r_hyps_pad_sos_eos_name] = r_hyps_pad_sos_eos - best_index = decoder_ort_session.run(None, decoder_ort_inputs)[0] - best_sents = [] - k = 0 - for idx in best_index: - cur_best_sent = all_hyps[k: k + beam_size][idx] - best_sents.append(cur_best_sent) - k += beam_size - hyps = map_batch(best_sents, vocabulary, num_processes) - - for i, key in enumerate(keys): - content = hyps[i] - logging.info('{} {}'.format(key, content)) - fout.write('{} {}\n'.format(key, content)) - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/train.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/train.py deleted file mode 100644 index 70799b60790b31d73911770891f519f5473e2f4b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/bin/train.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os - -import torch -import torch.distributed as dist -import torch.optim as optim -import yaml -from tensorboardX import SummaryWriter -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import (load_checkpoint, save_checkpoint, - load_trained_modules) -from wenet.utils.executor import Executor -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.scheduler import WarmupLR, NoamHoldAnnealing -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--train_data', required=True, help='train data file') - parser.add_argument('--cv_data', required=True, help='cv data file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--model_dir', required=True, help='save model dir') - parser.add_argument('--checkpoint', help='checkpoint model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='tensorboard log dir') - parser.add_argument('--ddp.rank', - dest='rank', - default=0, - type=int, - help='global rank for distributed training') - parser.add_argument('--ddp.world_size', - dest='world_size', - default=-1, - type=int, - help='''number of total processes/gpus for - distributed training''') - parser.add_argument('--ddp.dist_backend', - dest='dist_backend', - default='nccl', - choices=['nccl', 'gloo'], - help='distributed backend') - parser.add_argument('--ddp.init_method', - dest='init_method', - default=None, - help='ddp init method') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for reading') - parser.add_argument('--pin_memory', - action='store_true', - default=False, - help='Use pinned memory buffers used for reading') - parser.add_argument('--use_amp', - action='store_true', - default=False, - help='Use automatic mixed precision training') - parser.add_argument('--fp16_grad_sync', - action='store_true', - default=False, - help='Use fp16 gradient sync for ddp') - parser.add_argument('--cmvn', default=None, help='global cmvn file') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. 
One symbol per line.") - parser.add_argument('--prefetch', - default=100, - type=int, - help='prefetch number') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument("--enc_init", - default=None, - type=str, - help="Pre-trained model to initialize encoder") - parser.add_argument("--enc_init_mods", - default="encoder.", - type=lambda s: [str(mod) for mod in s.split(",") if s != ""], - help="List of encoder modules \ - to initialize ,separated by a comma") - - - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Set random seed - torch.manual_seed(777) - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - distributed = args.world_size > 1 - if distributed: - logging.info('training on multiple gpus, this gpu {}'.format(args.gpu)) - dist.init_process_group(args.dist_backend, - init_method=args.init_method, - world_size=args.world_size, - rank=args.rank) - - symbol_table = read_symbol_table(args.symbol_table) - - train_conf = configs['dataset_conf'] - cv_conf = copy.deepcopy(train_conf) - cv_conf['speed_perturb'] = False - cv_conf['spec_aug'] = False - cv_conf['spec_sub'] = False - cv_conf['spec_trim'] = False - cv_conf['shuffle'] = False - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - train_dataset = Dataset(args.data_type, args.train_data, symbol_table, - train_conf, args.bpe_model, non_lang_syms, True) - cv_dataset = Dataset(args.data_type, - args.cv_data, - symbol_table, - cv_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - train_data_loader = DataLoader(train_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - cv_data_loader = DataLoader(cv_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - - if 'fbank_conf' in configs['dataset_conf']: - input_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - else: - input_dim = configs['dataset_conf']['mfcc_conf']['num_mel_bins'] - vocab_size = len(symbol_table) - - # Save configs to model_dir/train.yaml for inference and export - configs['input_dim'] = input_dim - configs['output_dim'] = vocab_size - configs['cmvn_file'] = args.cmvn - configs['is_json_cmvn'] = True - if args.rank == 0: - saved_config_path = os.path.join(args.model_dir, 'train.yaml') - with open(saved_config_path, 'w') as fout: - data = yaml.dump(configs) - fout.write(data) - - # Init asr model from configs - model = init_model(configs) - print(model) - num_params = sum(p.numel() for p in model.parameters()) - print('the number of model params: {:,d}'.format(num_params)) - - # !!!IMPORTANT!!! 
- # Try to export the model by script, if fails, we should refine - # the code to satisfy the script export requirements - if args.rank == 0: - script_model = torch.jit.script(model) - script_model.save(os.path.join(args.model_dir, 'init.zip')) - executor = Executor() - # If specify checkpoint, load some info from checkpoint - if args.checkpoint is not None: - infos = load_checkpoint(model, args.checkpoint) - elif args.enc_init is not None: - logging.info('load pretrained encoders: {}'.format(args.enc_init)) - infos = load_trained_modules(model, args) - else: - infos = {} - start_epoch = infos.get('epoch', -1) + 1 - cv_loss = infos.get('cv_loss', 0.0) - step = infos.get('step', -1) - - num_epochs = configs.get('max_epoch', 100) - model_dir = args.model_dir - writer = None - if args.rank == 0: - os.makedirs(model_dir, exist_ok=True) - exp_id = os.path.basename(model_dir) - writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id)) - - if distributed: - assert (torch.cuda.is_available()) - # cuda model is required for nn.parallel.DistributedDataParallel - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, find_unused_parameters=True) - device = torch.device("cuda") - if args.fp16_grad_sync: - from torch.distributed.algorithms.ddp_comm_hooks import ( - default as comm_hooks, - ) - model.register_comm_hook( - state=None, hook=comm_hooks.fp16_compress_hook - ) - else: - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - if configs['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['optim_conf']) - elif configs['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['optim']) - if configs['scheduler'] == 'warmuplr': - scheduler = WarmupLR(optimizer, **configs['scheduler_conf']) - elif configs['scheduler'] == 'NoamHoldAnnealing': - scheduler = NoamHoldAnnealing(optimizer, **configs['scheduler_conf']) - else: - raise ValueError("unknown scheduler: " + configs['scheduler']) - - final_epoch = None - configs['rank'] = args.rank - configs['is_distributed'] = distributed - configs['use_amp'] = args.use_amp - if start_epoch == 0 and args.rank == 0: - save_model_path = os.path.join(model_dir, 'init.pt') - save_checkpoint(model, save_model_path) - - # Start training loop - executor.step = step - scheduler.set_step(step) - # used for pytorch amp mixed precision training - scaler = None - if args.use_amp: - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(start_epoch, num_epochs): - train_dataset.set_epoch(epoch) - configs['epoch'] = epoch - lr = optimizer.param_groups[0]['lr'] - logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr)) - executor.train(model, optimizer, scheduler, train_data_loader, device, - writer, configs, scaler) - total_loss, num_seen_utts = executor.cv(model, cv_data_loader, device, - configs) - cv_loss = total_loss / num_seen_utts - - logging.info('Epoch {} CV info cv_loss {}'.format(epoch, cv_loss)) - if args.rank == 0: - save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch)) - save_checkpoint( - model, save_model_path, { - 'epoch': epoch, - 'lr': lr, - 'cv_loss': cv_loss, - 'step': executor.step - }) - writer.add_scalar('epoch/cv_loss', cv_loss, epoch) - writer.add_scalar('epoch/lr', lr, epoch) - final_epoch = epoch - - if final_epoch is not None and args.rank == 0: - final_model_path = os.path.join(model_dir, 'final.pt') 
- os.remove(final_model_path) if os.path.exists(final_model_path) else None - os.symlink('{}.pt'.format(final_epoch), final_model_path) - writer.close() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/dataset/dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/dataset/dataset.py deleted file mode 100644 index 6d799b5b5aea2d34546484b3fed5d45e2d5b6aa6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/dataset/dataset.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import torch -import torch.distributed as dist -from torch.utils.data import IterableDataset - -import wenet.dataset.processor as processor -from wenet.utils.file_utils import read_lists - - -class Processor(IterableDataset): - def __init__(self, source, f, *args, **kw): - assert callable(f) - self.source = source - self.f = f - self.args = args - self.kw = kw - - def set_epoch(self, epoch): - self.source.set_epoch(epoch) - - def __iter__(self): - """ Return an iterator over the source dataset processed by the - given processor. 
- """ - assert self.source is not None - assert callable(self.f) - return self.f(iter(self.source), *self.args, **self.kw) - - def apply(self, f): - assert callable(f) - return Processor(self, f, *self.args, **self.kw) - - -class DistributedSampler: - def __init__(self, shuffle=True, partition=True): - self.epoch = -1 - self.update() - self.shuffle = shuffle - self.partition = partition - - def update(self): - assert dist.is_available() - if dist.is_initialized(): - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() - else: - self.rank = 0 - self.world_size = 1 - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: - self.worker_id = 0 - self.num_workers = 1 - else: - self.worker_id = worker_info.id - self.num_workers = worker_info.num_workers - return dict(rank=self.rank, - world_size=self.world_size, - worker_id=self.worker_id, - num_workers=self.num_workers) - - def set_epoch(self, epoch): - self.epoch = epoch - - def sample(self, data): - """ Sample data according to rank/world_size/num_workers - - Args: - data(List): input data list - - Returns: - List: data list after sample - """ - data = list(range(len(data))) - # TODO(Binbin Zhang): fix this - # We can not handle uneven data for CV on DDP, so we don't - # sample data by rank, that means every GPU gets the same - # and all the CV data - if self.partition: - if self.shuffle: - random.Random(self.epoch).shuffle(data) - data = data[self.rank::self.world_size] - data = data[self.worker_id::self.num_workers] - return data - - -class DataList(IterableDataset): - def __init__(self, lists, shuffle=True, partition=True): - self.lists = lists - self.sampler = DistributedSampler(shuffle, partition) - - def set_epoch(self, epoch): - self.sampler.set_epoch(epoch) - - def __iter__(self): - sampler_info = self.sampler.update() - indexes = self.sampler.sample(self.lists) - for index in indexes: - # yield dict(src=src) - data = dict(src=self.lists[index]) - data.update(sampler_info) - yield data - - -def Dataset(data_type, - data_list_file, - symbol_table, - conf, - bpe_model=None, - non_lang_syms=None, - partition=True): - """ Construct dataset from arguments - - We have two shuffle stage in the Dataset. The first is global - shuffle at shards tar/raw file level. The second is global shuffle - at training samples level. 
- - Args: - data_type(str): raw/shard - bpe_model(str): model for english bpe part - partition(bool): whether to do data partition in terms of rank - """ - assert data_type in ['raw', 'shard'] - lists = read_lists(data_list_file) - shuffle = conf.get('shuffle', True) - dataset = DataList(lists, shuffle=shuffle, partition=partition) - if data_type == 'shard': - dataset = Processor(dataset, processor.url_opener) - dataset = Processor(dataset, processor.tar_file_and_group) - else: - dataset = Processor(dataset, processor.parse_raw) - - dataset = Processor(dataset, processor.tokenize, symbol_table, bpe_model, - non_lang_syms, conf.get('split_with_space', False)) - filter_conf = conf.get('filter_conf', {}) - dataset = Processor(dataset, processor.filter, **filter_conf) - - resample_conf = conf.get('resample_conf', {}) - dataset = Processor(dataset, processor.resample, **resample_conf) - - speed_perturb = conf.get('speed_perturb', False) - if speed_perturb: - dataset = Processor(dataset, processor.speed_perturb) - - feats_type = conf.get('feats_type', 'fbank') - assert feats_type in ['fbank', 'mfcc'] - if feats_type == 'fbank': - fbank_conf = conf.get('fbank_conf', {}) - dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) - elif feats_type == 'mfcc': - mfcc_conf = conf.get('mfcc_conf', {}) - dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf) - - spec_aug = conf.get('spec_aug', True) - spec_sub = conf.get('spec_sub', False) - spec_trim = conf.get('spec_trim', False) - if spec_aug: - spec_aug_conf = conf.get('spec_aug_conf', {}) - dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) - if spec_sub: - spec_sub_conf = conf.get('spec_sub_conf', {}) - dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf) - if spec_trim: - spec_trim_conf = conf.get('spec_trim_conf', {}) - dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf) - - if shuffle: - shuffle_conf = conf.get('shuffle_conf', {}) - dataset = Processor(dataset, processor.shuffle, **shuffle_conf) - - sort = conf.get('sort', True) - if sort: - sort_conf = conf.get('sort_conf', {}) - dataset = Processor(dataset, processor.sort, **sort_conf) - - batch_conf = conf.get('batch_conf', {}) - dataset = Processor(dataset, processor.batch, **batch_conf) - dataset = Processor(dataset, processor.padding) - return dataset diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/dataset/kaldi_io.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/dataset/kaldi_io.py deleted file mode 100644 index c9bef293c93d882147bb5b738e1fc49a7a19a484..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/dataset/kaldi_io.py +++ /dev/null @@ -1,666 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! 
To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. - """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. 
- """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int_scp(file_or_fd): - """ generator(key,vec) = read_vec_int_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_int_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:vec for key,mat in kaldi_io.read_vec_int_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_int(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! 
- if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Mapping for percentiles in col-headers, - def uint16_to_float(value, min, range): - return np.float32(min + range * 1.52590218966964e-05 * value) - - # Mapping for matrix elements, - def uint8_to_float_v2(vec, p0, p25, p75, p100): - # Split the vector by masks, - mask_0_64 = (vec <= 64); - mask_193_255 = (vec > 192); - mask_65_192 = (~(mask_0_64 | mask_193_255)); - # Sanity check (useful but slow...), - # assert(len(vec) == np.sum(np.hstack([mask_0_64,mask_65_192,mask_193_255]))) - # assert(len(vec) == np.sum(np.any([mask_0_64,mask_65_192,mask_193_255], axis=0))) - # Build the float vector, - ans = np.empty(len(vec), dtype='float32') - ans[mask_0_64] = p0 + (p25 - p0) / 64. * vec[mask_0_64] - ans[mask_65_192] = p25 + (p75 - p25) / 128. * (vec[mask_65_192] - 64) - ans[mask_193_255] = p75 + (p100 - p75) / 63. * (vec[mask_193_255] - 192) - return ans - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.empty((cols,rows), dtype='float32') - for i, col_header in enumerate(col_headers): - col_header_flt = [ uint16_to_float(percentile, globmin, globrange) for percentile in col_header ] - mat[i] = uint8_to_float_v2(data[i], *col_header_flt) - - return mat.T # transpose! col-major -> row-major, - -def write_ark_scp(key, mat, ark_fout, scp_out): - mat_offset = write_mat(ark_fout, mat, key) - scp_line = '{}\t{}:{}'.format(key, ark_fout.name, mat_offset) - scp_out.write(scp_line) - scp_out.write('\n') - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. 
- - Example of writing single matrix: - kaldi_io.write_mat(filename, mat) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,mat in dict.iteritems(): - kaldi_io.write_mat(f, mat, key=key) - """ - mat_offset = 0 - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - mat_offset = fd.tell() - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if m.dtype == 'float32': fd.write('FM '.encode()) - elif m.dtype == 'float64': fd.write('DM '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) - # Dims, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols - # Data, - fd.write(m.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - return mat_offset - -################################################# -# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) -# Corresponds to: vector > > -# - outer vector: time axis -# - inner vector: records at the time -# - tuple: int = index, float = value -# - -def read_cnet_ark(file_or_fd): - """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ - return read_post_ark(file_or_fd) - -def read_post_ark(file_or_fd): - """ generator(key,vec>) = read_post_ark(file) - Returns generator of (key,posterior) tuples, read from ark file. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Iterate the ark: - for key,post in kaldi_io.read_post_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - post = read_post(fd) - yield key, post - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_post(file_or_fd): - """ [post] = read_post(file_or_fd) - Reads single kaldi 'Posterior' in binary format. - - The 'Posterior' is C++ type 'vector > >', - the outer-vector is usually time axis, inner-vector are the records - at given time, and the tuple is composed of an 'index' (integer) - and a 'float-value'. The 'float-value' can represent a probability - or any other numeric value. - - Returns vector of vectors of tuples. - """ - fd = open_or_fd(file_or_fd) - ans=[] - binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - # Loop over 'outer-vector', - for i in range(outer_vec_size): - assert(fd.read(1).decode() == '\4'); # int-size - inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) - data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) - assert(data[0]['size_idx'] == 4) - assert(data[0]['size_post'] == 4) - ans.append(data[['idx','post']].tolist()) - - if fd is not file_or_fd: fd.close() - return ans - - -################################################# -# Kaldi Confusion Network bin begin/end times, -# (kaldi stores CNs time info separately from the Posterior). 
-# - -def read_cntime_ark(file_or_fd): - """ generator(key,vec>) = read_cntime_ark(file_or_fd) - Returns generator of (key,cntime) tuples, read from ark file. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Iterate the ark: - for key,time in kaldi_io.read_cntime_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:time for key,time in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - cntime = read_cntime(fd) - yield key, cntime - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_cntime(file_or_fd): - """ [cntime] = read_cntime(file_or_fd) - Reads single kaldi 'Confusion Network time info', in binary format: - C++ type: vector >. - (begin/end times of bins at the confusion network). - - Binary layout is ' ...' - - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Returns vector of tuples. - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary - - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) - assert(data[0]['size_beg'] == 4) - assert(data[0]['size_end'] == 4) - ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), - - if fd is not file_or_fd : fd.close() - return ans - - -################################################# -# Segments related, -# - -# Segments as 'Bool vectors' can be handy, -# - for 'superposing' the segmentations, -# - for frame-selection in Speaker-ID experiments, -def read_segments_as_bool_vec(segments_file): - """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) - using kaldi 'segments' file for 1 wav, format : ' ' - - t-beg, t-end is in seconds, - - assumed 100 frames/second, - """ - segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) - # Sanity checks, - assert(len(segs) > 0) # empty segmentation is an error, - assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, - # Convert time to frame-indexes, - start = np.rint([100 * rec[2] for rec in segs]).astype(int) - end = np.rint([100 * rec[3] for rec in segs]).astype(int) - # Taken from 'read_lab_to_bool_vec', htk.py, - frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], - np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) - assert np.sum(end-start) == np.sum(frms) - return frms - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/dataset/processor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/dataset/processor.py deleted file mode 100644 index b4bd07ce674eb3288cd1b13a09085eec48d40845..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/dataset/processor.py +++ /dev/null @@ -1,660 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import json -import random -import re -import tarfile -from subprocess import PIPE, Popen -from urllib.parse import urlparse - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.nn.utils.rnn import pad_sequence - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def url_opener(data): - """ Give url or local file, return file descriptor - Inplace operation. - - Args: - data(Iterable[str]): url or local file list - - Returns: - Iterable[{src, stream}] - """ - for sample in data: - assert 'src' in sample - # TODO(Binbin Zhang): support HTTP - url = sample['src'] - try: - pr = urlparse(url) - # local file - if pr.scheme == '' or pr.scheme == 'file': - stream = open(url, 'rb') - # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP - else: - cmd = f'wget -q -O - {url}' - process = Popen(cmd, shell=True, stdout=PIPE) - sample.update(process=process) - stream = process.stdout - sample.update(stream=stream) - yield sample - except Exception as ex: - logging.warning('Failed to open {}'.format(url)) - - -def tar_file_and_group(data): - """ Expand a stream of open tar files into a stream of tar file contents. - And groups the file with same prefix - - Args: - data: Iterable[{src, stream}] - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'stream' in sample - stream = tarfile.open(fileobj=sample['stream'], mode="r|*") - prev_prefix = None - example = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - example['key'] = prev_prefix - if valid: - yield example - example = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - example['txt'] = file_obj.read().decode('utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load(file_obj) - example['wav'] = waveform - example['sample_rate'] = sample_rate - else: - example[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning('error to parse {}'.format(name)) - prev_prefix = prefix - if prev_prefix is not None: - example['key'] = prev_prefix - yield example - stream.close() - if 'process' in sample: - sample['process'].communicate() - sample['stream'].close() - - -def parse_raw(data): - """ Parse key/wav/txt from json line - - Args: - data: Iterable[str], str is a json line has key/wav/txt - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'src' in sample - json_line = sample['src'] - obj = json.loads(json_line) - assert 'key' in obj - assert 'wav' in obj - assert 'txt' in obj - key = obj['key'] - wav_file = obj['wav'] - txt = obj['txt'] - try: - if 'start' in obj: - assert 'end' in obj - sample_rate = torchaudio.backend.sox_io_backend.info( - wav_file).sample_rate - start_frame = int(obj['start'] * sample_rate) - end_frame = int(obj['end'] * sample_rate) - waveform, _ = torchaudio.backend.sox_io_backend.load( - 
filepath=wav_file, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(wav_file) - example = dict(key=key, - txt=txt, - wav=waveform, - sample_rate=sample_rate) - yield example - except Exception as ex: - logging.warning('Failed to read {}'.format(wav_file)) - - -def filter(data, - max_length=10240, - min_length=10, - token_max_length=200, - token_min_length=1, - min_output_input_ratio=0.0005, - max_output_input_ratio=1): - """ Filter sample according to feature and label length - Inplace operation. - - Args:: - data: Iterable[{key, wav, label, sample_rate}] - max_length: drop utterance which is greater than max_length(10ms) - min_length: drop utterance which is less than min_length(10ms) - token_max_length: drop utterance which is greater than - token_max_length, especially when use char unit for - english modeling - token_min_length: drop utterance which is - less than token_max_length - min_output_input_ratio: minimal ration of - token_length / feats_length(10ms) - max_output_input_ratio: maximum ration of - token_length / feats_length(10ms) - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'label' in sample - # sample['wav'] is torch.Tensor, we have 100 frames every second - num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100 - if num_frames < min_length: - continue - if num_frames > max_length: - continue - if len(sample['label']) < token_min_length: - continue - if len(sample['label']) > token_max_length: - continue - if num_frames != 0: - if len(sample['label']) / num_frames < min_output_input_ratio: - continue - if len(sample['label']) / num_frames > max_output_input_ratio: - continue - yield sample - - -def resample(data, resample_rate=16000): - """ Resample data. - Inplace operation. - - Args: - data: Iterable[{key, wav, label, sample_rate}] - resample_rate: target resample rate - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - if sample_rate != resample_rate: - sample['sample_rate'] = resample_rate - sample['wav'] = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - yield sample - - -def speed_perturb(data, speeds=None): - """ Apply speed perturb to the data. - Inplace operation. 
- - Args: - data: Iterable[{key, wav, label, sample_rate}] - speeds(List[float]): optional speed - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - if speeds is None: - speeds = [0.9, 1.0, 1.1] - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - speed = random.choice(speeds) - if speed != 1.0: - wav, _ = torchaudio.sox_effects.apply_effects_tensor( - waveform, sample_rate, - [['speed', str(speed)], ['rate', str(sample_rate)]]) - sample['wav'] = wav - - yield sample - - -def compute_fbank(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0): - """ Extract fbank - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - energy_floor=0.0, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def compute_mfcc(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0, - num_ceps=40, - high_freq=0.0, - low_freq=20.0): - """ Extract mfcc - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.mfcc(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - num_ceps=num_ceps, - high_freq=high_freq, - low_freq=low_freq, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def __tokenize_by_bpe_model(sp, txt): - tokens = [] - # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - pattern = re.compile(r'([\u4e00-\u9fff])') - # Example: - # txt = "你好 ITS'S OKAY 的" - # chars = ["你", "好", " ITS'S OKAY ", "的"] - chars = pattern.split(txt.upper()) - mix_chars = [w for w in chars if len(w.strip()) > 0] - for ch_or_w in mix_chars: - # ch_or_w is a single CJK charater(i.e., "你"), do nothing. - if pattern.fullmatch(ch_or_w) is not None: - tokens.append(ch_or_w) - # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), - # encode ch_or_w using bpe_model. 
- else: - for p in sp.encode_as_pieces(ch_or_w): - tokens.append(p) - - return tokens - - -def tokenize(data, - symbol_table, - bpe_model=None, - non_lang_syms=None, - split_with_space=False): - """ Decode text to chars or BPE - Inplace operation - - Args: - data: Iterable[{key, wav, txt, sample_rate}] - - Returns: - Iterable[{key, wav, txt, tokens, label, sample_rate}] - """ - if non_lang_syms is not None: - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - else: - non_lang_syms = {} - non_lang_syms_pattern = None - - if bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(bpe_model) - else: - sp = None - - for sample in data: - assert 'txt' in sample - txt = sample['txt'].strip() - if non_lang_syms_pattern is not None: - parts = non_lang_syms_pattern.split(txt.upper()) - parts = [w for w in parts if len(w.strip()) > 0] - else: - parts = [txt] - - label = [] - tokens = [] - for part in parts: - if part in non_lang_syms: - tokens.append(part) - else: - if bpe_model is not None: - tokens.extend(__tokenize_by_bpe_model(sp, part)) - else: - if split_with_space: - part = part.split(" ") - for ch in part: - if ch == ' ': - ch = "▁" - tokens.append(ch) - - for ch in tokens: - if ch in symbol_table: - label.append(symbol_table[ch]) - elif '' in symbol_table: - label.append(symbol_table['']) - - sample['tokens'] = tokens - sample['label'] = label - yield sample - - -def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80): - """ Do spec augmentation - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - num_t_mask: number of time mask to apply - num_f_mask: number of freq mask to apply - max_t: max width of time mask - max_f: max width of freq mask - max_w: max width of time warp - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - max_freq = y.size(1) - # time mask - for i in range(num_t_mask): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - y[start:end, :] = 0 - # freq mask - for i in range(num_f_mask): - start = random.randint(0, max_freq - 1) - length = random.randint(1, max_f) - end = min(max_freq, start + length) - y[:, start:end] = 0 - sample['feat'] = y - yield sample - - -def spec_sub(data, max_t=20, num_t_sub=3): - """ Do spec substitute - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of time substitute - num_t_sub: number of time substitute to apply - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - for i in range(num_t_sub): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - # only substitute the earlier time chosen randomly for current time - pos = random.randint(0, start) - y[start:end, :] = x[start - pos:end - pos, :] - sample['feat'] = y - yield sample - - -def spec_trim(data, max_t=20): - """ Trim tailing frames. Inplace operation. 
- ref: TrimTail [https://arxiv.org/abs/2211.00522] - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of length trimming - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - max_frames = x.size(0) - length = random.randint(1, max_t) - if length < max_frames / 2: - y = x.clone().detach()[:max_frames - length] - sample['feat'] = y - yield sample - - -def shuffle(data, shuffle_size=10000): - """ Local shuffle the data - - Args: - data: Iterable[{key, feat, label}] - shuffle_size: buffer size for shuffle - - Returns: - Iterable[{key, feat, label}] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= shuffle_size: - random.shuffle(buf) - for x in buf: - yield x - buf = [] - # The sample left over - random.shuffle(buf) - for x in buf: - yield x - - -def sort(data, sort_size=500): - """ Sort the data by feature length. - Sort is used after shuffle and before batch, so we can group - utts with similar lengths into a batch, and `sort_size` should - be less than `shuffle_size` - - Args: - data: Iterable[{key, feat, label}] - sort_size: buffer size for sort - - Returns: - Iterable[{key, feat, label}] - """ - - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= sort_size: - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - buf = [] - # The sample left over - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - - -def static_batch(data, batch_size=16): - """ Static batch the data by `batch_size` - - Args: - data: Iterable[{key, feat, label}] - batch_size: batch size - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= batch_size: - yield buf - buf = [] - if len(buf) > 0: - yield buf - - -def dynamic_batch(data, max_frames_in_batch=12000): - """ Dynamic batch the data until the total frames in batch - reach `max_frames_in_batch` - - Args: - data: Iterable[{key, feat, label}] - max_frames_in_batch: max_frames in one batch - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - longest_frames = 0 - for sample in data: - assert 'feat' in sample - assert isinstance(sample['feat'], torch.Tensor) - new_sample_frames = sample['feat'].size(0) - longest_frames = max(longest_frames, new_sample_frames) - frames_after_padding = longest_frames * (len(buf) + 1) - if frames_after_padding > max_frames_in_batch: - yield buf - buf = [sample] - longest_frames = new_sample_frames - else: - buf.append(sample) - if len(buf) > 0: - yield buf - - -def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000): - """ Wrapper for static/dynamic batch - """ - if batch_type == 'static': - return static_batch(data, batch_size) - elif batch_type == 'dynamic': - return dynamic_batch(data, max_frames_in_batch) - else: - logging.fatal('Unsupported batch type {}'.format(batch_type)) - - -def padding(data): - """ Padding the data into training data - - Args: - data: Iterable[List[{key, feat, label}]] - - Returns: - Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] - """ - for sample in data: - assert isinstance(sample, list) - feats_length = torch.tensor([x['feat'].size(0) for x in sample], - dtype=torch.int32) - order = torch.argsort(feats_length, descending=True) - feats_lengths = torch.tensor( - [sample[i]['feat'].size(0) for i in order], dtype=torch.int32) - sorted_feats = [sample[i]['feat'] for i in order] - sorted_keys 
= [sample[i]['key'] for i in order] - sorted_labels = [ - torch.tensor(sample[i]['label'], dtype=torch.int64) for i in order - ] - label_lengths = torch.tensor([x.size(0) for x in sorted_labels], - dtype=torch.int32) - - padded_feats = pad_sequence(sorted_feats, - batch_first=True, - padding_value=0) - - pad = (0, 0, 0, 0) - seq_len= padded_feats.shape[1] - if seq_len < 384: - pad = (0, 0, 0, 384-seq_len) - elif seq_len < 512: - pad = (0, 0, 0, 512-seq_len) - elif seq_len < 640: - pad = (0, 0, 0, 640-seq_len) - elif seq_len < 768: - pad = (0, 0, 0, 768-seq_len) - elif seq_len < 896: - pad = (0, 0, 0, 896-seq_len) - elif seq_len < 1024: - pad = (0, 0, 0, 1024-seq_len) - elif seq_len < 1280: - pad = (0, 0, 0, 1280-seq_len) - padded_feats = torch.nn.functional.pad(padded_feats, pad, mode='constant', value=0) - padding_labels = pad_sequence(sorted_labels, - batch_first=True, - padding_value=-1) - - yield (sorted_keys, padded_feats, padding_labels, feats_lengths, - label_lengths) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/dataset/wav_distortion.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/dataset/wav_distortion.py deleted file mode 100644 index 2917d3cc6cfb801935cb0885d0c42cd88f1833b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/dataset/wav_distortion.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import random -import math - -import torchaudio -import torch -torchaudio.set_audio_backend("sox_io") - - -def db2amp(db): - return pow(10, db / 20) - -def amp2db(amp): - return 20 * math.log10(amp) - -def make_poly_distortion(conf): - """Generate a db-domain ploynomial distortion function - - f(x) = a * x^m * (1-x)^n + x - - Args: - conf: a dict {'a': #int, 'm': #int, 'n': #int} - - Returns: - The ploynomial function, which could be applied on - a float amplitude value - """ - a = conf['a'] - m = conf['m'] - n = conf['n'] - - def poly_distortion(x): - abs_x = abs(x) - if abs_x < 0.000001: - x = x - else: - db_norm = amp2db(abs_x) / 100 + 1 - if db_norm < 0: - db_norm = 0 - db_norm = a * pow(db_norm, m) * pow((1 - db_norm), n) + db_norm - if db_norm > 1: - db_norm = 1 - db = (db_norm - 1) * 100 - amp = db2amp(db) - if amp >= 0.9997: - amp = 0.9997 - if x > 0: - x = amp - else: - x = -amp - return x - return poly_distortion - -def make_quad_distortion(): - return make_poly_distortion({'a' : 1, 'm' : 1, 'n' : 1}) - -# the amplitude are set to max for all non-zero point -def make_max_distortion(conf): - """Generate a max distortion function - - Args: - conf: a dict {'max_db': float } - 'max_db': the maxium value. 
- - Returns: - The max function, which could be applied on - a float amplitude value - """ - max_db = conf['max_db'] - if max_db: - max_amp = db2amp(max_db) # < 0.997 - else: - max_amp = 0.997 - - def max_distortion(x): - if x > 0: - x = max_amp - elif x < 0: - x = -max_amp - else: - x = 0.0 - return x - return max_distortion - - - -def make_amp_mask(db_mask=None): - """Get a amplitude domain mask from db domain mask - - Args: - db_mask: Optional. A list of tuple. if None, using default value. - - Returns: - A list of tuple. The amplitude domain mask - """ - if db_mask is None: - db_mask = [(-110, -95), (-90, -80), (-65, -60), (-50, -30), (-15, 0)] - amp_mask = [(db2amp(db[0]), db2amp(db[1])) for db in db_mask] - return amp_mask - -default_mask = make_amp_mask() - - -def generate_amp_mask(mask_num): - """Generate amplitude domain mask randomly in [-100db, 0db] - - Args: - mask_num: the slot number of the mask - - Returns: - A list of tuple. each tuple defines a slot. - e.g. [(-100, -80), (-65, -60), (-50, -30), (-15, 0)] - for #mask_num = 4 - """ - a = [0] * 2 * mask_num - a[0] = 0 - m = [] - for i in range(1, 2 * mask_num): - a[i] = a[i - 1] + random.uniform(0.5, 1) - max_val = a[2 * mask_num - 1] - for i in range(0, mask_num): - l = ((a[2 * i] - max_val) / max_val) * 100 - r = ((a[2 * i + 1] - max_val) / max_val) * 100 - m.append((l, r)) - return make_amp_mask(m) - - -def make_fence_distortion(conf): - """Generate a fence distortion function - - In this fence-like shape function, the values in mask slots are - set to maxium, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': int,'max_db': float } - 'mask_number': the slot number in mask. - 'max_db': the maxium value. - - Returns: - The fence function, which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - max_db = conf['max_db'] - max_amp = db2amp(max_db) # 0.997 - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def fence_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - return x - - return fence_distortion - -# -def make_jag_distortion(conf): - """Generate a jag distortion function - - In this jag-like shape function, the values in mask slots are - not changed, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': #int} - 'mask_number': the slot number in mask. 
- - Returns: - The jag function,which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def jag_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - return x - - return jag_distortion - -# gaining 20db means amp = amp * 10 -# gaining -20db means amp = amp / 10 -def make_gain_db(conf): - """Generate a db domain gain function - - Args: - conf: a dict {'db': #float} - 'db': the gaining value - - Returns: - The db gain function, which could be applied on - a float amplitude value - """ - db = conf['db'] - - def gain_db(x): - return min(0.997, x * pow(10, db / 20)) - - return gain_db - - -def distort(x, func, rate=0.8): - """Distort a waveform in sample point level - - Args: - x: the origin wavefrom - func: the distort function - rate: sample point-level distort probability - - Returns: - the distorted waveform - """ - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - x[0][i] = func(float(x[0][i])) - return x - -def distort_chain(x, funcs, rate=0.8): - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - for func in funcs: - x[0][i] = func(float(x[0][i])) - return x - -# x is numpy -def distort_wav_conf(x, distort_type, distort_conf, rate=0.1): - if distort_type == 'gain_db': - gain_db = make_gain_db(distort_conf) - x = distort(x, gain_db) - elif distort_type == 'max_distortion': - max_distortion = make_max_distortion(distort_conf) - x = distort(x, max_distortion, rate=rate) - elif distort_type == 'fence_distortion': - fence_distortion = make_fence_distortion(distort_conf) - x = distort(x, fence_distortion, rate=rate) - elif distort_type == 'jag_distortion': - jag_distortion = make_jag_distortion(distort_conf) - x = distort(x, jag_distortion, rate=rate) - elif distort_type == 'poly_distortion': - poly_distortion = make_poly_distortion(distort_conf) - x = distort(x, poly_distortion, rate=rate) - elif distort_type == 'quad_distortion': - quad_distortion = make_quad_distortion() - x = distort(x, quad_distortion, rate=rate) - elif distort_type == 'none_distortion': - pass - else: - print('unsupport type') - return x - -def distort_wav_conf_and_save(distort_type, distort_conf, rate, wav_in, wav_out): - x, sr = torchaudio.load(wav_in) - x = x.detach().numpy() - out = distort_wav_conf(x, distort_type, distort_conf, rate) - torchaudio.save(wav_out, torch.from_numpy(out), sr) - -if __name__ == "__main__": - distort_type = sys.argv[1] - wav_in = sys.argv[2] - wav_out = sys.argv[3] - conf = None - rate = 0.1 - if distort_type == 'new_jag_distortion': - conf = {'mask_number' : 4} - elif distort_type == 'new_fence_distortion': - conf = {'mask_number' : 1, 'max_db' : -30} - elif distort_type == 'poly_distortion': - conf = {'a' : 4, 'm' : 2, "n" : 2} - distort_wav_conf_and_save(distort_type, conf, rate, wav_in, wav_out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/efficient_conformer/attention.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/efficient_conformer/attention.py deleted file mode 100644 index 475131b15af92ffcaf91ad5e2e30d114d4d1a2a3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/efficient_conformer/attention.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple, Optional - -import torch -from torch import nn -import torch.nn.functional as F -from wenet.transformer.attention import MultiHeadedAttention - - -class GroupedRelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: - https://arxiv.org/abs/1901.02860 - https://arxiv.org/abs/2109.01163 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate, group_size=3): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - self.group_size = group_size - self.d_k = n_feat // n_head # for GroupedAttention - self.n_feat = n_feat - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. 
- """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def pad4group(self, Q, K, V, P, mask, group_size: int = 3): - """ - q: (#batch, time1, size) -> (#batch, head, time1, size/head) - k,v: (#batch, time2, size) -> (#batch, head, time2, size/head) - p: (#batch, time2, size) - """ - # Compute Overflows - overflow_Q = Q.size(2) % group_size - overflow_KV = K.size(2) % group_size - - # if-else for ONNX export - # 0 // 0.00000000000000001 = 0 - # 1 // 1.00000000000000001 = 1 - padding_Q = (group_size - overflow_Q) * int( - overflow_Q // (overflow_Q + 0.00000000000000001)) - padding_KV = (group_size - overflow_KV) * int( - overflow_KV // (overflow_KV + 0.00000000000000001)) - - batch_size, _, seq_len_KV, _ = K.size() - - # Input Padding (B, T, D) -> (B, T + P, D) - Q = F.pad(Q, (0, 0, 0, padding_Q), value=0.0) - K = F.pad(K, (0, 0, 0, padding_KV), value=0.0) - V = F.pad(V, (0, 0, 0, padding_KV), value=0.0) - - if mask is not None and mask.size(2) > 0 : # time2 > 0: - mask = mask[:, ::group_size, ::group_size] - - Q = Q.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - K = K.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - V = V.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - # process pos_emb - P_batch_size = P.size(0) - overflow_P = P.size(1) % group_size - padding_P = group_size - overflow_P if overflow_P else 0 - P = F.pad(P, (0, 0, 0, padding_P), value=0.0) - P = P.view(P_batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - return Q, K, V, P, mask, padding_Q - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - padding_q: Optional[int] = None - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - padding_q : for GroupedAttention in efficent conformer - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. 
jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - - # n_feat!=h*d_k may be happened in GroupAttention - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.n_feat) - ) # (batch, time1, d_model) - if padding_q is not None: - # for GroupedAttention in efficent conformer - x = x[:, :x.size(1) - padding_q] - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q = self.linear_q(query) - k = self.linear_k(key) # (#batch, time2, size) - v = self.linear_v(value) - p = self.linear_pos(pos_emb) # (#batch, time2, size) - - batch_size, seq_len_KV, _ = k.size() # seq_len_KV = time2 - - # (#batch, time2, size) -> (#batch, head, time2, size/head) - q = q.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - k = k.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - v = v.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - if cache.size(0) > 0: - # use attention cache - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - new_cache = torch.cat((k, v), dim=-1) - - # May be k and p does not match. eg. time2=18+18/2=27 > mask=36/2=18 - if mask is not None and mask.size(2) > 0: - time2 = mask.size(2) - k = k[:, :, -time2:, :] - v = v[:, :, -time2:, :] - - # q k v p: (batch, head, time1, d_k) - q, k, v, p, mask, padding_q = self.pad4group(q, k, v, p, mask, self.group_size) - - # q_with_bias_u & q_with_bias_v = (batch, head, time1, d_k) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. 
- # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k * self.group_size) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask, padding_q), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/efficient_conformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/efficient_conformer/convolution.py deleted file mode 100644 index 52d6c1c14c0812ab3957a60a135f644833c2ad95..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/efficient_conformer/convolution.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - stride: int = 1): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - stride (int): Stride Convolution, for efficient Conformer - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. 
- # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=stride, # for depthwise_conv in StrideConv - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - self.stride = stride - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - # When export ONNX,the first cache is not None but all-zero, - # cause shape error in residual block, - # eg. cache14 + x9 = 23, 23-7+1=17 != 9 - cache = cache[:, :, -self.lorder:] - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is requried, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - if mask_pad.size(2) != x.size(2): - mask_pad = mask_pad[:, :, ::self.stride] - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/efficient_conformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/efficient_conformer/encoder.py deleted file mode 100644 index dbd37f53cac86be851e2bb194354fd07eb271f11..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/efficient_conformer/encoder.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from EfficientConformer(https://github.com/burchim/EfficientConformer) -# Paper(https://arxiv.org/abs/2109.01163) - -"""Encoder definition.""" -from typing import Tuple, Optional, List, Union - -import torch -import logging -from typeguard import check_argument_types -import torch.nn.functional as F - -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.encoder_layer import ConformerEncoderLayer - -from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 -from wenet.efficient_conformer.convolution import ConvolutionModule -from wenet.efficient_conformer.attention import GroupedRelPositionMultiHeadedAttention -from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer - -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class EfficientConformerEncoder(torch.nn.Module): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - macaron_style: bool = True, - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - stride_layer_idx: Optional[Union[int, List[int]]] = 3, - stride: Optional[Union[int, List[int]]] = 2, - group_layer_idx: Optional[Union[int, List[int], tuple]] = (0, 1, 2, 3), - group_size: int = 3, - stride_kernel: bool = True, - **kwargs - ): - """Construct Efficient Conformer Encoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - macaron_style (bool): Whether to use macaron style for - positionwise layer. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. - stride_layer_idx (list): layer id with StrideConv, start from 0 - stride (list): stride size of each StrideConv in efficient conformer - group_layer_idx (list): layer id with GroupedAttention, start from 0 - group_size (int): group size of every GroupedAttention layer - stride_kernel (bool): default True. True: recompute cnn kernels with stride. 
- """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d2": - subsampling_class = Conv2dSubsampling2 - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - logging.info(f"input_layer = {input_layer}, " - f"subsampling_class = {subsampling_class}") - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - self.input_layer = input_layer - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - activation = get_activation(activation_type) - self.num_blocks = num_blocks - self.attention_heads = attention_heads - self.cnn_module_kernel = cnn_module_kernel - self.global_chunk_size = 0 - self.chunk_feature_map = 0 - - # efficient conformer configs - self.stride_layer_idx = [stride_layer_idx] \ - if type(stride_layer_idx) == int else stride_layer_idx - self.stride = [stride] \ - if type(stride) == int else stride - self.group_layer_idx = [group_layer_idx] \ - if type(group_layer_idx) == int else group_layer_idx - self.grouped_size = group_size # group size of every GroupedAttention layer - - assert len(self.stride) == len(self.stride_layer_idx) - self.cnn_module_kernels = [cnn_module_kernel] # kernel size of each StridedConv - for i in self.stride: - if stride_kernel: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1] // i) - else: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1]) - - logging.info(f"stride_layer_idx= {self.stride_layer_idx}, " - f"stride = {self.stride}, " - f"cnn_module_kernel = {self.cnn_module_kernels}, " - f"group_layer_idx = {self.group_layer_idx}, " - f"grouped_size = {self.grouped_size}") - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - - # encoder definition - index = 0 - layers = [] - for i in range(num_blocks): - # self-attention module definition - if i in self.group_layer_idx: - encoder_selfattn_layer = GroupedRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - self.grouped_size) - else: - if pos_enc_layer_type == "no_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate) - - # conformer module definition - if i in self.stride_layer_idx: - # conformer block with downsampling - convolution_layer_args_stride = ( - output_size, 
self.cnn_module_kernels[index], activation, - cnn_module_norm, causal, True, self.stride[index]) - layers.append(StrideConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_stride) if use_cnn_module else None, - torch.nn.AvgPool1d( - kernel_size=self.stride[index], stride=self.stride[index], - padding=0, ceil_mode=True, - count_include_pad=False), # pointwise_conv_layer - dropout_rate, - normalize_before, - concat_after, - )) - index = index + 1 - else: - # conformer block - convolution_layer_args_normal = ( - output_size, self.cnn_module_kernels[index], activation, - cnn_module_norm, causal) - layers.append(ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_normal) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - )) - - self.encoders = torch.nn.ModuleList(layers) - - def set_global_chunk_size(self, chunk_size): - """Used in ONNX export. - """ - logging.info(f"set global chunk size: {chunk_size}, default is 0.") - self.global_chunk_size = chunk_size - if self.embed.subsampling_rate == 2: - self.chunk_feature_map = 2 * self.global_chunk_size + 1 - elif self.embed.subsampling_rate == 6: - self.chunk_feature_map = 6 * self.global_chunk_size + 5 - elif self.embed.subsampling_rate == 8: - self.chunk_feature_map = 8 * self.global_chunk_size + 7 - else: - self.chunk_feature_map = 4 * self.global_chunk_size + 3 - - def output_size(self) -> int: - return self._output_size - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - index = 0 # traverse stride - for i, layer in enumerate(self.encoders): - # layer return : x, mask, new_att_cache, new_cnn_cache - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if i in self.stride_layer_idx: - masks = masks[:, :, ::self.stride[index]] - chunk_masks = chunk_masks[:, ::self.stride[index], - ::self.stride[index]] - mask_pad = masks - pos_emb = pos_emb[:, ::self.stride[index], :] - index = index + 1 - - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask : mask matrix of self attention - - Returns: - torch.Tensor: output of current input xs - torch.Tensor: subsampling cache required for next chunk computation - List[torch.Tensor]: encoder layers output cache required for next - chunk computation - List[torch.Tensor]: conformer cnn cache - - """ - assert xs.size(0) == 1 - - # using downsampling factor to recover offset - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - chunk_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - chunk_masks = chunk_masks.unsqueeze(1) # (1, 1, xs-time) - - real_len = 0 - if self.global_chunk_size > 0: - # for ONNX decode simulation, padding xs to chunk_size - real_len = xs.size(1) - pad_len = self.chunk_feature_map - real_len - xs = F.pad(xs, (0, 0, 0, pad_len), value=0.0) - chunk_masks = F.pad(chunk_masks, (0, pad_len), value=0.0) - - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, chunk_masks = self.embed(xs, chunk_masks, offset) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - # shape(pos_emb) = (b=1, chunk_size, emb_size=output_size=hidden-dim) - - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) # batchPad (b=1, 1, time=chunk_size) - - if self.global_chunk_size > 0: - # for ONNX decode simulation - pos_emb = self.embed.position_encoding( - offset=max(offset - cache_t1, 0), - size=cache_t1 + self.global_chunk_size) - att_mask[:, :, -self.global_chunk_size:] = chunk_masks - mask_pad = chunk_masks.to(torch.bool) - else: - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(2) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - att_cache_trunc = xs.size(1) + \ - att_cache.size(2) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i:i + 1, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # 
shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [1, batch, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(0) - - # use repeat_interleave to new_att_cache - new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :]) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.global_chunk_size > 0 and real_len: - chunk_real_len = real_len // self.embed.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - # Keeping 1 more timestep can mitigate information leakage - # from the encoder caused by the padding - xs = xs[:, :chunk_real_len + 1, :] - - return xs, r_att_cache, r_cnn_cache - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - use_onnx=False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. - Args: - xs (torch.Tensor): (1, max_len, dim) - decoding_chunk_size (int): decoding chunk size - num_decoding_left_chunks (int): - use_onnx (bool): True for simulating ONNX model inference. 
- """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if use_onnx: - logging.info("Simulating for ONNX runtime ...") - att_cache: torch.Tensor = torch.zeros( - (self.num_blocks, self.attention_heads, required_cache_size, - self.output_size() // self.attention_heads * 2), - device=xs.device) - cnn_cache: torch.Tensor = torch.zeros( - (self.num_blocks, 1, self.output_size(), self.cnn_module_kernel - 1), - device=xs.device) - self.set_global_chunk_size(chunk_size=decoding_chunk_size) - else: - logging.info("Simulating for JIT runtime ...") - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - logging.info(f"-->> frame chunk msg: cur={cur}, " - f"end={end}, num_frames={end-cur}, " - f"decoding_window={decoding_window}") - if use_onnx: - att_mask: torch.Tensor = torch.ones( - (1, 1, required_cache_size + decoding_chunk_size), - dtype=torch.bool, device=xs.device) - if cur == 0: - att_mask[:, :, :required_cache_size] = 0 - else: - att_mask: torch.Tensor = torch.ones( - (0, 0, 0), dtype=torch.bool, device=xs.device) - - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - outputs.append(y) - offset += y.size(1) - - ys = torch.cat(outputs, 1) - masks = torch.ones(1, 1, ys.size(1), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/efficient_conformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/efficient_conformer/encoder_layer.py deleted file mode 100644 index 3a88ec9fca9797664ce89566e6c1d28a8f0ad5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/efficient_conformer/encoder_layer.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple -import torch -from torch import nn - - -class StrideConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. 
- self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - pointwise_conv_layer: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.pointwise_conv_layer = pointwise_conv_layer - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - - # add pointwise_conv for efficient conformer - # pointwise_conv_layer does not change shape - if self.pointwise_conv_layer is not None: - residual = residual.transpose(1, 2) - residual = self.pointwise_conv_layer(residual) - residual = residual.transpose(1, 2) - assert residual.size(0) == x.size(0) - assert residual.size(1) == x.size(1) - assert residual.size(2) == x.size(2) - - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/efficient_conformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/efficient_conformer/subsampling.py deleted file mode 100644 index 98b2c2228eac8e77586110686c48a7b0141458c9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/efficient_conformer/subsampling.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch -from wenet.transformer.subsampling import BaseSubsampling - - -class Conv2dSubsampling2(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU() - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * ((idim - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 2 - # 2 = (3 - 1) * 1 - self.right_context = 2 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 2. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 2. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, :-2:2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/squeezeformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/squeezeformer/attention.py deleted file mode 100644 index 97412badbe8e2c5caec81c0636d15be3f80d6b84..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/squeezeformer/attention.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 Ximalaya Inc. (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -import torch -import torch.nn as nn -from wenet.transformer.attention import MultiHeadedAttention -from typing import Tuple - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- """ - - def __init__(self, n_head, n_feat, dropout_rate, - do_rel_shift=False, adaptive_scale=False, init_weights=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.do_rel_shift = do_rel_shift - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - self.adaptive_scale = adaptive_scale - self.ada_scale = nn.Parameter( - torch.ones([1, 1, n_feat]), requires_grad=adaptive_scale) - self.ada_bias = nn.Parameter( - torch.zeros([1, 1, n_feat]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - input_max = (self.h * self.d_k) ** -0.5 - torch.nn.init.uniform_(self.linear_q.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_q.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_pos.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.bias, -input_max, input_max) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. 
pytorch training - if mask.size(2) > 0: # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - # (batch, head, time1, time2) - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - if self.adaptive_scale: - query = self.ada_scale * query + self.ada_bias - key = self.ada_scale * key + self.ada_bias - value = self.ada_scale * value + self.ada_bias - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - if self.do_rel_shift: - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/squeezeformer/conv2d.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/squeezeformer/conv2d.py deleted file mode 100644 index c230263396392d72f36c56d645338f2d576db898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/squeezeformer/conv2d.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Conv2d Module with Valid Padding""" - -import torch.nn.functional as F -from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional - - -class Conv2dValid(_ConvNd): - """ - Conv2d operator for VALID mode padding. 
- """ - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', # TODO: refine this type - device=None, - dtype=None, - valid_trigx: bool = False, - valid_trigy: bool = False - ) -> None: - factory_kwargs = {'device': device, 'dtype': dtype} - kernel_size_ = _pair(kernel_size) - stride_ = _pair(stride) - padding_ = padding if isinstance(padding, str) else _pair(padding) - dilation_ = _pair(dilation) - super(Conv2dValid, self).__init__( - in_channels, out_channels, kernel_size_, - stride_, padding_, dilation_, False, _pair(0), - groups, bias, padding_mode, **factory_kwargs) - self.valid_trigx = valid_trigx - self.valid_trigy = valid_trigy - - def _conv_forward( - self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): - validx, validy = 0, 0 - if self.valid_trigx: - validx = (input.size(-2) * (self.stride[-2] - 1) - 1 - + self.kernel_size[-2]) // 2 - if self.valid_trigy: - validy = (input.size(-1) * (self.stride[-1] - 1) - 1 - + self.kernel_size[-1]) // 2 - return F.conv2d(input, weight, bias, self.stride, - (validx, validy), self.dilation, self.groups) - - def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight, self.bias) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/squeezeformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/squeezeformer/convolution.py deleted file mode 100644 index 6da2ee8c98ed58fae66d66c892041037f0d6bc3a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/squeezeformer/convolution.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. 
- causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - self.bias = bias - self.channels = channels - self.kernel_size = kernel_size - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, channels]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, channels]), requires_grad=adaptive_scale) - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - if init_weights: - self.init_weights() - - def init_weights(self): - pw_max = self.channels ** -0.5 - dw_max = self.kernel_size ** -0.5 - torch.nn.init.uniform_(self.pointwise_conv1.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv1.bias.data, -pw_max, pw_max) - torch.nn.init.uniform_(self.depthwise_conv.weight.data, -dw_max, dw_max) - if self.bias: - torch.nn.init.uniform_(self.depthwise_conv.bias.data, -dw_max, dw_max) - torch.nn.init.uniform_(self.pointwise_conv2.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv2.bias.data, -pw_max, pw_max) - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - if self.adaptive_scale: - x = self.ada_scale * x + self.ada_bias - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/squeezeformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/squeezeformer/encoder.py deleted file mode 100644 index f13038321ae6c07d484a617aee7d83ed07742510..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/squeezeformer/encoder.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -import torch -import torch.nn as nn -from typing import Tuple, Union, Optional, List -from wenet.squeezeformer.subsampling \ - import DepthwiseConv2dSubsampling4, TimeReductionLayer1D, \ - TimeReductionLayer2D, TimeReductionLayerStream -from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.attention import MultiHeadedAttention -from wenet.squeezeformer.attention import RelPositionMultiHeadedAttention -from wenet.squeezeformer.positionwise_feed_forward \ - import PositionwiseFeedForward -from wenet.squeezeformer.convolution import ConvolutionModule -from wenet.utils.mask import make_pad_mask, add_optional_chunk_mask -from wenet.utils.common import get_activation - - -class SqueezeformerEncoder(nn.Module): - def __init__( - self, - input_size: int = 80, - encoder_dim: int = 256, - output_size: int = 256, - attention_heads: int = 4, - num_blocks: int = 12, - reduce_idx: Optional[Union[int, List[int]]] = 5, - recover_idx: Optional[Union[int, List[int]]] = 11, - feed_forward_expansion_factor: int = 4, - dw_stride: bool = False, - input_dropout_rate: float = 0.1, - pos_enc_layer_type: str = "rel_pos", - time_reduction_layer_type: str = "conv1d", - do_rel_shift: bool = True, - feed_forward_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.1, - cnn_module_kernel: int = 31, - cnn_norm_type: str = "batch_norm", - dropout: float = 0.1, - causal: bool = False, - adaptive_scale: bool = True, - activation_type: str = "swish", - init_weights: bool = True, - global_cmvn: torch.nn.Module = None, - normalize_before: bool = False, - use_dynamic_chunk: bool = False, - concat_after: bool = False, - 
static_chunk_size: int = 0, - use_dynamic_left_chunk: bool = False - ): - """Construct SqueezeformerEncoder - - Args: - input_size to use_dynamic_chunk, see in Transformer BaseEncoder. - encoder_dim (int): The hidden dimension of encoder layer. - output_size (int): The output dimension of final projection layer. - attention_heads (int): Num of attention head in attention module. - num_blocks (int): Num of encoder layers. - reduce_idx Optional[Union[int, List[int]]]: - reduce layer index, from 40ms to 80ms per frame. - recover_idx Optional[Union[int, List[int]]]: - recover layer index, from 80ms to 40ms per frame. - feed_forward_expansion_factor (int): Enlarge coefficient of FFN. - dw_stride (bool): Whether do depthwise convolution - on subsampling module. - input_dropout_rate (float): Dropout rate of input projection layer. - pos_enc_layer_type (str): Self attention type. - time_reduction_layer_type (str): Conv1d or Conv2d reduction layer. - do_rel_shift (bool): Whether to do relative shift - operation on rel-attention module. - cnn_module_kernel (int): Kernel size of CNN module. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - adaptive_scale (bool): Whether to use adaptive scale. - init_weights (bool): Whether to initialize weights. - causal (bool): whether to use causal convolution or not. - """ - super(SqueezeformerEncoder, self).__init__() - self.global_cmvn = global_cmvn - self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ - if type(reduce_idx) == int else reduce_idx - self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \ - if type(recover_idx) == int else recover_idx - self.check_ascending_list() - if reduce_idx is None: - self.time_reduce = None - else: - if recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - self.reduce_stride = 2 - self._output_size = output_size - self.normalize_before = normalize_before - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - self.pos_enc_layer_type = pos_enc_layer_type - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - encoder_dim, - attention_dropout_rate, - do_rel_shift, - adaptive_scale, - init_weights - ) - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - encoder_dim, - encoder_dim * feed_forward_expansion_factor, - feed_forward_dropout_rate, - activation, - adaptive_scale, - init_weights - ) - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = ( - encoder_dim, cnn_module_kernel, activation, - cnn_norm_type, causal, True, adaptive_scale, init_weights) - - self.embed = DepthwiseConv2dSubsampling4( - 1, encoder_dim, - RelPositionalEncoding(encoder_dim, dropout_rate=0.1), - dw_stride, - input_size, - input_dropout_rate, - init_weights - ) - - self.preln = nn.LayerNorm(encoder_dim) - self.encoders = 
torch.nn.ModuleList([SqueezeformerEncoderLayer( - encoder_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - convolution_layer(*convolution_layer_args), - positionwise_layer(*positionwise_layer_args), - normalize_before, - dropout, - concat_after) for _ in range(num_blocks) - ]) - if time_reduction_layer_type == 'conv1d': - time_reduction_layer = TimeReductionLayer1D - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - elif time_reduction_layer_type == 'stream': - time_reduction_layer = TimeReductionLayerStream - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - else: - time_reduction_layer = TimeReductionLayer2D - time_reduction_layer_args = {'encoder_dim': encoder_dim} - - self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args) - self.time_recover_layer = nn.Linear(encoder_dim, encoder_dim) - self.final_proj = None - if output_size != encoder_dim: - self.final_proj = nn.Linear(encoder_dim, output_size) - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - xs_lens = mask_pad.squeeze(1).sum(1) - xs = self.preln(xs) - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - for i, layer in enumerate(self.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, chunk_masks, pos_emb, mask_pad)) - xs, xs_lens, chunk_masks, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_chunk_masks, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - chunk_masks = recover_chunk_masks - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - xs = xs.masked_fill(~mask_pad[:, 0, :].unsqueeze(-1), 0.0) - - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return xs, masks - - def check_ascending_list(self): - if self.reduce_idx is not None: - assert self.reduce_idx == sorted(self.reduce_idx), \ - "reduce_idx should be int or ascending list" - if self.recover_idx is not None: - assert self.recover_idx == sorted(self.recover_idx), \ - "recover_idx should be int or ascending list" - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp 
= exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - recover_exp)) - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.preln(xs) - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, 
recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - if att_mask.size(1) != 0: - xs = xs.masked_fill(~att_mask[:, 0, :].unsqueeze(-1), 0.0) - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(0) - cached_att = cached_att.unsqueeze(3).\ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :]) - r_cnn_cache.append(cached_cnn) - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/squeezeformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/squeezeformer/encoder_layer.py deleted file mode 100644 index 3c6bdd44a20447cea91c0f965c666b844f4264be..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/squeezeformer/encoder_layer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""SqueezeformerEncoderLayer definition.""" - -import torch -import torch.nn as nn -from typing import Optional, Tuple - - -class SqueezeformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward1 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - feed_forward2 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. 
- """ - - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward1: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - feed_forward2: Optional[nn.Module] = None, - normalize_before: bool = False, - dropout_rate: float = 0.1, - concat_after: bool = False, - ): - super(SqueezeformerEncoderLayer, self).__init__() - self.size = size - self.self_attn = self_attn - self.layer_norm1 = nn.LayerNorm(size) - self.ffn1 = feed_forward1 - self.layer_norm2 = nn.LayerNorm(size) - self.conv_module = conv_module - self.layer_norm3 = nn.LayerNorm(size) - self.ffn2 = feed_forward2 - self.layer_norm4 = nn.LayerNorm(size) - self.normalize_before = normalize_before - self.dropout = nn.Dropout(dropout_rate) - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - # self attention module - residual = x - if self.normalize_before: - x = self.layer_norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.layer_norm1(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm2(x) - x = self.ffn1(x) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm2(x) - - # conv module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - residual = x - if self.normalize_before: - x = self.layer_norm3(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm3(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm4(x) - x = self.ffn2(x) - # we do not use dropout here since it is inside feed forward function - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm4(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/squeezeformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/squeezeformer/positionwise_feed_forward.py deleted file mode 100644 index 289062dcf3189f79a5ebb206990160d8665c613c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/squeezeformer/positionwise_feed_forward.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU(), - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.idim = idim - self.hidden_units = hidden_units - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - self.ada_scale = None - self.ada_bias = None - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, idim]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, idim]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - ffn1_max = self.idim ** -0.5 - ffn2_max = self.hidden_units ** -0.5 - torch.nn.init.uniform_(self.w_1.weight.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_1.bias.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_2.weight.data, -ffn2_max, ffn2_max) - torch.nn.init.uniform_(self.w_2.bias.data, -ffn2_max, ffn2_max) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - if self.adaptive_scale: - xs = self.ada_scale * xs + self.ada_bias - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/squeezeformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/squeezeformer/subsampling.py deleted file mode 100644 index fdb0101d6ebb54c42e710bbb0f35a6f7615ca567..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/squeezeformer/subsampling.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -"""DepthwiseConv2dSubsampling4 and TimeReductionLayer definition.""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -from wenet.transformer.subsampling import BaseSubsampling -from typing import Tuple -from wenet.squeezeformer.conv2d import Conv2dValid - - -class DepthwiseConv2dSubsampling4(BaseSubsampling): - """Depthwise Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - pos_enc_class (nn.Module): position encoding class. - dw_stride (int): Whether do depthwise convolution. - input_size (int): filter bank dimension. - - """ - - def __init__( - self, idim: int, odim: int, - pos_enc_class: torch.nn.Module, - dw_stride: bool = False, - input_size: int = 80, - input_dropout_rate: float = 0.1, - init_weights: bool = True - ): - super(DepthwiseConv2dSubsampling4, self).__init__() - self.idim = idim - self.odim = odim - self.pw_conv = nn.Conv2d( - in_channels=idim, out_channels=odim, kernel_size=3, stride=2) - self.act1 = nn.ReLU() - self.dw_conv = nn.Conv2d( - in_channels=odim, out_channels=odim, kernel_size=3, stride=2, - groups=odim if dw_stride else 1 - ) - self.act2 = nn.ReLU() - self.pos_enc = pos_enc_class - self.input_proj = nn.Sequential( - nn.Linear( - odim * (((input_size - 1) // 2 - 1) // 2), odim), - nn.Dropout(p=input_dropout_rate), - ) - if init_weights: - linear_max = (odim * input_size / 4) ** -0.5 - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.weight'], -linear_max, linear_max) - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.bias'], -linear_max, linear_max) - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: int = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.pw_conv(x) - x = self.act1(x) - x = self.dw_conv(x) - x = self.act2(x) - b, c, t, f = x.size() - x = x.permute(0, 2, 1, 3) - x = x.contiguous().view(b, t, c * f) - x, pos_emb = self.pos_enc(x, offset) - x = self.input_proj(x) - return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] - - -class TimeReductionLayer1D(nn.Module): - """ - Modified NeMo, - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. 
- """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 5, stride: int = 2): - super(TimeReductionLayer1D, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - self.padding = max(0, self.kernel_size - self.stride) - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. - if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayer2D(nn.Module): - def __init__( - self, kernel_size: int = 5, stride: int = 2, encoder_dim: int = 256): - super(TimeReductionLayer2D, self).__init__() - self.encoder_dim = encoder_dim - self.kernel_size = kernel_size - self.dw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=(kernel_size, 1), - stride=stride, - valid_trigy=True - ) - self.pw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=1, - stride=1, - valid_trigx=False, - valid_trigy=False, - ) - - self.kernel_size = kernel_size - self.stride = stride - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.encoder_dim ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward( - self, xs: torch.Tensor, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - xs = xs.masked_fill(mask_pad.transpose(1, 2).eq(0), 0.0) - xs = xs.unsqueeze(2) - padding1 = self.kernel_size - self.stride - xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), - mode='constant', value=0.) 
- xs = self.dw_conv(xs.permute(0, 3, 1, 2)) - xs = self.pw_conv(xs).permute(0, 3, 2, 1).squeeze(1).contiguous() - tmp_length = xs.size(1) - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - padding2 = max(0, (xs_lens.max() - tmp_length).data.item()) - batch_size, hidden = xs.size(0), xs.size(-1) - dummy_pad = torch.zeros(batch_size, padding2, hidden, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - mask = mask[:, ::2, ::2] - mask_pad = mask_pad[:, :, ::2] - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayerStream(nn.Module): - """ - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. - """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 1, stride: int = 2): - super(TimeReductionLayerStream, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=0, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. 
- if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transducer/joint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transducer/joint.py deleted file mode 100644 index f7cbaf62ee0bf4ffa127e5bbf4a49a64c2378495..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transducer/joint.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Optional - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation - - -class TransducerJoint(torch.nn.Module): - - def __init__(self, - voca_size: int, - enc_output_size: int, - pred_output_size: int, - join_dim: int, - prejoin_linear: bool = True, - postjoin_linear: bool = False, - joint_mode: str = 'add', - activation: str = "tanh"): - assert check_argument_types() - # TODO(Mddct): concat in future - assert joint_mode in ['add'] - super().__init__() - - self.activatoin = get_activation(activation) - self.prejoin_linear = prejoin_linear - self.postjoin_linear = postjoin_linear - self.joint_mode = joint_mode - - if not self.prejoin_linear and not self.postjoin_linear: - assert enc_output_size == pred_output_size == join_dim - # torchscript compatibility - self.enc_ffn: Optional[nn.Linear] = None - self.pred_ffn: Optional[nn.Linear] = None - if self.prejoin_linear: - self.enc_ffn = nn.Linear(enc_output_size, join_dim) - self.pred_ffn = nn.Linear(pred_output_size, join_dim) - # torchscript compatibility - self.post_ffn: Optional[nn.Linear] = None - if self.postjoin_linear: - self.post_ffn = nn.Linear(join_dim, join_dim) - - self.ffn_out = nn.Linear(join_dim, voca_size) - - def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor): - """ - Args: - enc_out (torch.Tensor): [B, T, E] - pred_out (torch.Tensor): [B, T, P] - Return: - [B,T,U,V] - """ - if (self.prejoin_linear and self.enc_ffn is not None - and self.pred_ffn is not None): - enc_out = self.enc_ffn(enc_out) # [B,T,E] -> [B,T,V] - pred_out = self.pred_ffn(pred_out) - - enc_out = enc_out.unsqueeze(2) # [B,T,V] -> [B,T,1,V] - pred_out = pred_out.unsqueeze(1) # [B,U,V] -> [B,1 U, V] - - # TODO(Mddct): concat joint - _ = self.joint_mode - out = enc_out + pred_out # [B,T,U,V] - - if self.postjoin_linear and self.post_ffn is not None: - out = self.post_ffn(out) - - out = self.activatoin(out) - out = self.ffn_out(out) - return out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transducer/predictor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transducer/predictor.py deleted file mode 100644 index 600e97a9d83646047ec3fc14f3087bd4df761c68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transducer/predictor.py +++ /dev/null @@ -1,482 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation, get_rnn - - -def ApplyPadding(input, padding, pad_value) -> torch.Tensor: - """ - Args: - input: [bs, max_time_step, dim] - padding: [bs, max_time_step] - """ - return 
padding * pad_value + input * (1 - padding) - - -class PredictorBase(torch.nn.Module): - - # NOTE(Mddct): We can use ABC abstract here, but - # keep this class simple enough for now - def __init__(self) -> None: - super().__init__() - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - _, _, _ = batch_size, method, device - raise NotImplementedError("this is a base precictor") - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ): - _, _, = input, cache - raise NotImplementedError("this is a base precictor") - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - _, _, _, = input, padding, cache - raise NotImplementedError("this is a base precictor") - - -class RNNPredictor(PredictorBase): - - def __init__(self, - voca_size: int, - embed_size: int, - output_size: int, - embed_dropout: float, - hidden_size: int, - num_layers: int, - bias: bool = True, - rnn_type: str = "lstm", - dropout: float = 0.1) -> None: - assert check_argument_types() - super().__init__() - self.n_layers = num_layers - self.hidden_size = hidden_size - # disable rnn base out projection - self.embed = nn.Embedding(voca_size, embed_size) - self.dropout = nn.Dropout(embed_dropout) - # NOTE(Mddct): rnn base from torch not support layer norm - # will add layer norm and prune value in cell and layer - # ref: https://github.com/Mddct/neural-lm/blob/main/models/gru_cell.py - self.rnn = get_rnn(rnn_type=rnn_type)(input_size=embed_size, - hidden_size=hidden_size, - num_layers=num_layers, - bias=bias, - batch_first=True, - dropout=dropout) - self.projection = nn.Linear(hidden_size, output_size) - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> torch.Tensor: - """ - Args: - input (torch.Tensor): [batch, max_time). - padding (torch.Tensor): [batch, max_time] - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - Returns: - output: [batch, max_time, output_size] - """ - - # NOTE(Mddct): we don't use pack input format - embed = self.embed(input) # [batch, max_time, emb_size] - embed = self.dropout(embed) - states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None - if cache is None: - state = self.init_state(batch_size=input.size(0), - device=input.device) - states = (state[0], state[1]) - else: - assert len(cache) == 2 - states = (cache[0], cache[1]) - out, (m, c) = self.rnn(embed, states) - out = self.projection(out) - - # NOTE(Mddct): Although we don't use staate in transducer - # training forward, we need make it right for padding value - # so we create forward_step for infering, forward for training - _, _ = m, c - return out - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache: [state_m, state_c] - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - Returns: - new_cache: [[state_m_1, state_c_1], [state_m_2, state_c_2]...] 
- """ - assert len(cache) == 2 - state_ms = cache[0] - state_cs = cache[1] - - assert state_ms.size(1) == state_cs.size(1) - - new_cache: List[List[torch.Tensor]] = [] - for state_m, state_c in zip(torch.split(state_ms, 1, dim=1), - torch.split(state_cs, 1, dim=1)): - new_cache.append([state_m, state_c]) - return new_cache - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[state_m_1, state_c_1], [state_m_1, state_c_1]...] - - Returns: - new_caceh: [state_ms, state_cs], - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - """ - state_ms = torch.cat([states[0] for states in cache], dim=1) - state_cs = torch.cat([states[1] for states in cache], dim=1) - return [state_ms, state_cs] - - def init_state( - self, - batch_size: int, - device: torch.device, - method: str = "zero", - ) -> List[torch.Tensor]: - assert batch_size > 0 - # TODO(Mddct): xavier init method - _ = method - return [ - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device), - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device) - ] - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - """ - assert len(cache) == 2 - state_m, state_c = cache[0], cache[1] - embed = self.embed(input) # [batch, 1, emb_size] - embed = self.dropout(embed) - out, (m, c) = self.rnn(embed, (state_m, state_c)) - - out = self.projection(out) - m = ApplyPadding(m, padding.unsqueeze(0), state_m) - c = ApplyPadding(c, padding.unsqueeze(0), state_c) - - return (out, [m, c]) - - -class EmbeddingPredictor(PredictorBase): - """Embedding predictor - - Described in: - https://arxiv.org/pdf/2109.07513.pdf - - embed-> proj -> layer norm -> swish - """ - - def __init__(self, - voca_size: int, - embed_size: int, - embed_dropout: float, - n_head: int, - history_size: int = 2, - activation: str = "swish", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - - assert check_argument_types() - super().__init__() - # multi head - self.num_heads = n_head - self.embed_size = embed_size - self.context_size = history_size + 1 - self.pos_embed = torch.nn.Linear(embed_size * self.context_size, - self.num_heads, - bias=bias) - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.ffn = nn.Linear(self.embed_size, self.embed_size) - self.norm = nn.LayerNorm(self.embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - _ = method - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device), - ] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] 
- """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - - input = input.unfold(1, self.context_size, 1).permute( - 0, 1, 3, 2) # [bs, seq_len, context_size, embed] - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - # broadcast dot attenton - input_expand = input.unsqueeze( - 2) # [bs, seq_len, 1, context_size, embed] - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - - # [bs, seq_len, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, seq_len, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, seq_len, num_heads, embed] - output = output.sum(dim=2) # [bs, seq_len, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - return output - - def forward_step( - self, - input: torch.Tensor, - padding: torch.Tensor, - cache: List[torch.Tensor], - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input_expand = context_input.unsqueeze(1).unsqueeze( - 2) # [bs, 1, 1, context_size, embed] - - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - # [bs, 1, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, 1, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, 1, num_heads, embed] - output = output.sum(dim=2) # [bs, 1, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - new_cache = context_input[:, 1:, :] - # TODO(Mddct): we need padding new_cache in future - # new_cache = ApplyPadding(history, padding, new_cache) - return (output, [new_cache]) - - -class ConvPredictor(PredictorBase): - - def __init__(self, - voca_size: 
int, - embed_size: int, - embed_dropout: float, - history_size: int = 2, - activation: str = "relu", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - assert check_argument_types() - super().__init__() - - assert history_size >= 0 - self.embed_size = embed_size - self.context_size = history_size + 1 - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.conv = nn.Conv1d(in_channels=embed_size, - out_channels=embed_size, - kernel_size=self.context_size, - padding=0, - groups=embed_size, - bias=bias) - self.norm = nn.LayerNorm(embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - assert method == "zero" - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device) - ] - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] - """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - input = input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - return out - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input = context_input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - - new_cache = context_input[:, 1:, :] - # TODO(Mddct): apply padding in future - return (out, [new_cache]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transducer/search/greedy_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transducer/search/greedy_search.py deleted file mode 100644 index ef7354562b6617b7be33bf32d673117eb1d3d547..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transducer/search/greedy_search.py +++ /dev/null @@ -1,54 +0,0 
@@ -from typing import List - -import torch - - -def basic_greedy_search( - model: torch.nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - n_steps: int = 64, -) -> List[List[int]]: - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, - method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = n_steps - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, padding, - cache) # [1, 1, P] - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, - pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - - return [hyps] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transducer/search/prefix_beam_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transducer/search/prefix_beam_search.py deleted file mode 100644 index f00917717c16a73916586708ebfede54fa02a21f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transducer/search/prefix_beam_search.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import List, Tuple - -import torch -from wenet.utils.common import log_add - - -class Sequence(): - - __slots__ = {'hyp', 'score', 'cache'} - - def __init__( - self, - hyp: List[torch.Tensor], - score, - cache: List[torch.Tensor], - ): - self.hyp = hyp - self.score = score - self.cache = cache - - -class PrefixBeamSearch(): - - def __init__(self, encoder, predictor, joint, ctc, blank): - self.encoder = encoder - self.predictor = predictor - self.joint = joint - self.ctc = ctc - self.blank = blank - - def forward_decoder_one_step( - self, encoder_x: torch.Tensor, pre_t: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - padding = torch.zeros(pre_t.size(0), 1, device=encoder_x.device) - pre_t, new_cache = self.predictor.forward_step(pre_t.unsqueeze(-1), - padding, cache) - x = self.joint(encoder_x, pre_t) # [beam, 1, 1, vocab] - x = x.log_softmax(dim=-1) - return x, new_cache - - def prefix_beam_search(self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7): - """prefix beam search - also see wenet.transducer.transducer.beam_search - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = 
speech.shape[0] - assert batch_size == 1 - - # 1. Encoder - encoder_out, _ = self.encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - - ctc_probs = self.ctc.log_softmax(encoder_out).squeeze(0) - beam_init: List[Sequence] = [] - - # 2. init beam using Sequence to save beam unit - cache = self.predictor.init_state(1, method="zero", device=device) - beam_init.append(Sequence(hyp=[self.blank], score=0.0, cache=cache)) - # 3. start decoding (notice: we use breathwise first searching) - # !!!! In this decoding method: one frame do not output multi units. !!!! - # !!!! Experiments show that this strategy has little impact !!!! - for i in range(maxlen): - # 3.1 building input - # decoder taking the last token to predict the next token - input_hyp = [s.hyp[-1] for s in beam_init] - input_hyp_tensor = torch.tensor(input_hyp, - dtype=torch.int, - device=device) - # building statement from beam - cache_batch = self.predictor.cache_to_batch( - [s.cache for s in beam_init]) - # build score tensor to do torch.add() function - scores = torch.tensor([s.score for s in beam_init]).to(device) - - # 3.2 forward decoder - logp, new_cache = self.forward_decoder_one_step( - encoder_out[:, i, :].unsqueeze(1), - input_hyp_tensor, - cache_batch, - ) # logp: (N, 1, 1, vocab_size) - logp = logp.squeeze(1).squeeze(1) # logp: (N, vocab_size) - new_cache = self.predictor.batch_to_cache(new_cache) - - # 3.3 shallow fusion for transducer score - # and ctc score where we can also add the LM score - logp = torch.log( - torch.add(transducer_weight * torch.exp(logp), - ctc_weight * torch.exp(ctc_probs[i].unsqueeze(0)))) - - # 3.4 first beam prune - top_k_logp, top_k_index = logp.topk(beam_size) # (N, N) - scores = torch.add(scores.unsqueeze(1), top_k_logp) - - # 3.5 generate new beam (N*N) - beam_A = [] - for j in range(len(beam_init)): - # update seq - base_seq = beam_init[j] - for t in range(beam_size): - # blank: only update the score - if top_k_index[j, t] == self.blank: - new_seq = Sequence(hyp=base_seq.hyp.copy(), - score=scores[j, t].item(), - cache=base_seq.cache) - - beam_A.append(new_seq) - # other unit: update hyp score statement and last - else: - hyp_new = base_seq.hyp.copy() - hyp_new.append(top_k_index[j, t].item()) - new_seq = Sequence(hyp=hyp_new, - score=scores[j, t].item(), - cache=new_cache[j]) - beam_A.append(new_seq) - - # 3.6 prefix fusion - fusion_A = [beam_A[0]] - for j in range(1, len(beam_A)): - s1 = beam_A[j] - if_do_append = True - for t in range(len(fusion_A)): - # notice: A_ can not fusion with A - if s1.hyp == fusion_A[t].hyp: - fusion_A[t].score = log_add( - [fusion_A[t].score, s1.score]) - if_do_append = False - break - if if_do_append: - fusion_A.append(s1) - - # 4. 
second pruned - fusion_A.sort(key=lambda x: x.score, reverse=True) - beam_init = fusion_A[:beam_size] - - return beam_init, encoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transducer/transducer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transducer/transducer.py deleted file mode 100644 index 821a0946e621353a18bededbd93a658e83b0e0e2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transducer/transducer.py +++ /dev/null @@ -1,453 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchaudio -from torch import nn -from torch.nn.utils.rnn import pad_sequence -from typeguard import check_argument_types -from wenet.transducer.predictor import PredictorBase -from wenet.transducer.search.greedy_search import basic_greedy_search -from wenet.transducer.search.prefix_beam_search import PrefixBeamSearch -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_blank, add_sos_eos, - reverse_pad_list) - - -class Transducer(ASRModel): - """Transducer-ctc-attention hybrid Encoder-Predictor-Decoder model""" - - def __init__( - self, - vocab_size: int, - blank: int, - encoder: nn.Module, - predictor: PredictorBase, - joint: nn.Module, - attention_decoder: Optional[Union[TransformerDecoder, - BiTransformerDecoder]] = None, - ctc: Optional[CTC] = None, - ctc_weight: float = 0, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - transducer_weight: float = 1.0, - attention_weight: float = 0.0, - ) -> None: - assert check_argument_types() - assert attention_weight + ctc_weight + transducer_weight == 1.0 - super().__init__(vocab_size, encoder, attention_decoder, ctc, - ctc_weight, ignore_id, reverse_weight, lsm_weight, - length_normalized_loss) - - self.blank = blank - self.transducer_weight = transducer_weight - self.attention_decoder_weight = 1 - self.transducer_weight - self.ctc_weight - - self.predictor = predictor - self.joint = joint - self.bs = None - - # Note(Mddct): decoder also means predictor in transducer, - # but here decoder is attention decoder - del self.criterion_att - if attention_decoder is not None: - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + predictor + joint + loss - - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - - # Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - # predictor - ys_in_pad = add_blank(text, self.blank, self.ignore_id) - predictor_out = self.predictor(ys_in_pad) - # joint - joint_out = self.joint(encoder_out, predictor_out) - # NOTE(Mddct): some loss implementation require pad valid is zero - # torch.int32 rnnt_loss required - rnnt_text = text.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - rnnt_text_lengths = text_lengths.to(torch.int32) - encoder_out_lens = encoder_out_lens.to(torch.int32) - loss = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - encoder_out_lens, - rnnt_text_lengths, - blank=self.blank, - reduction="mean") - loss_rnnt = loss - - loss = self.transducer_weight * loss - # optional attention decoder - loss_att: Optional[torch.Tensor] = None - if self.attention_decoder_weight != 0.0 and self.decoder is not None: - loss_att, _ = self._calc_att_loss(encoder_out, encoder_mask, text, - text_lengths) - - # optional ctc - loss_ctc: Optional[torch.Tensor] = None - if self.ctc_weight != 0.0 and self.ctc is not None: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is not None: - loss = loss + self.ctc_weight * loss_ctc.sum() - if loss_att is not None: - loss = loss + self.attention_decoder_weight * loss_att.sum() - # NOTE: 'loss' must be in dict - return { - 'loss': loss, - 'loss_att': loss_att, - 'loss_ctc': loss_ctc, - 'loss_rnnt': loss_rnnt, - } - - def init_bs(self): - if self.bs is None: - self.bs = PrefixBeamSearch(self.encoder, self.predictor, - self.joint, self.ctc, self.blank) - - def _cal_transducer_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_lens: torch.Tensor, - hyps_pad: torch.Tensor, - ): - # ignore id -> blank, add blank at head - hyps_pad_blank = add_blank(hyps_pad, self.blank, self.ignore_id) - xs_in_lens = encoder_mask.squeeze(1).sum(1).int() - - # 1. Forward predictor - predictor_out = self.predictor(hyps_pad_blank) - # 2. Forward joint - joint_out = self.joint(encoder_out, predictor_out) - rnnt_text = hyps_pad.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - # 3. 
Compute transducer loss - loss_td = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - xs_in_lens, - hyps_lens.int(), - blank=self.blank, - reduction='none') - return loss_td * -1 - - def _cal_attn_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_pad: torch.Tensor, - hyps_lens: torch.Tensor, - ): - # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - - # td_score = loss_td * -1 - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - self.reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - return decoder_out, r_decoder_out - - def beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7, - ): - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight in transducer - prefix beam search. - final_prob = ctc_weight * ctc_prob + transducer_weight * transducer_prob - transducer_weight (float): transducer probability weight in - prefix beam search - Returns: - List[List[int]]: best path result - - """ - self.init_bs() - beam, _ = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size, - beam_size, - num_decoding_left_chunks, - simulate_streaming, - ctc_weight, - transducer_weight, - ) - return beam[0].hyp[1:], beam[0].score - - def transducer_attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ctc_weight: float = 0.0, - attn_weight: float = 0.0, - transducer_weight: float = 0.0, - search_ctc_weight: float = 1.0, - search_transducer_weight: float = 0.0, - beam_search_type: str = 'transducer') -> List[List[int]]: - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight using in rescoring. - rescore_prob = ctc_weight * ctc_prob + - transducer_weight * (transducer_loss * -1) + - attn_weight * attn_prob - attn_weight (float): attn probability weight using in rescoring. - transducer_weight (float): transducer probability weight using in - rescoring - search_ctc_weight (float): ctc weight using - in rnnt beam search (seeing in self.beam_search) - search_transducer_weight (float): transducer weight using - in rnnt beam search (seeing in self.beam_search) - Returns: - List[List[int]]: best path result - - """ - - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - self.init_bs() - if beam_search_type == 'transducer': - beam, encoder_out = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - beam_size=beam_size, - num_decoding_left_chunks=num_decoding_left_chunks, - ctc_weight=search_ctc_weight, - transducer_weight=search_transducer_weight, - ) - beam_score = [s.score for s in beam] - hyps = [s.hyp[1:] for s in beam] - - elif beam_search_type == 'ctc': - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, - speech_lengths, - beam_size=beam_size, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) - beam_score = [hyp[1] for hyp in hyps] - hyps = [hyp[0] for hyp in hyps] - assert len(hyps) == beam_size - - # build hyps and encoder output - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - - # 2.1 calculate transducer score - td_score = self._cal_transducer_score( - encoder_out, - encoder_mask, - hyps_lens, - hyps_pad, - ) - # 2.2 calculate attention score - decoder_out, r_decoder_out = self._cal_attn_score( - encoder_out, - encoder_mask, - hyps_pad, - hyps_lens, - ) - - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp)][self.eos] - td_s = td_score[i] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp): - r_score += r_decoder_out[i][len(hyp) - j - 1][w] - r_score += r_decoder_out[i][len(hyp)][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score = score * attn_weight + \ - beam_score[i] * ctc_weight + \ - td_s * transducer_weight - if score > best_score: - best_score = score - best_index = i - - return hyps[best_index], best_score - - def greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, 
- num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - n_steps: int = 64, - ) -> List[List[int]]: - """ greedy search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - # TODO(Mddct): batch decode - assert speech.size(0) == 1 - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - # TODO(Mddct): forward chunk by chunk - _ = simulate_streaming - # Let's assume B = batch_size - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size, - num_decoding_left_chunks, - ) - encoder_out_lens = encoder_mask.squeeze(1).sum() - hyps = basic_greedy_search(self, - encoder_out, - encoder_out_lens, - n_steps=n_steps) - - return hyps - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def forward_predictor_step( - self, xs: torch.Tensor, cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - assert len(cache) == 2 - # fake padding - padding = torch.zeros(1, 1) - return self.predictor.forward_step(xs, padding, cache) - - @torch.jit.export - def forward_joint_step(self, enc_out: torch.Tensor, - pred_out: torch.Tensor) -> torch.Tensor: - return self.joint(enc_out, pred_out) - - @torch.jit.export - def forward_predictor_init_state(self) -> List[torch.Tensor]: - return self.predictor.init_state(1, device=torch.device("cpu")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/asr_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/asr_model.py deleted file mode 100644 index 4288f68472d63ce4bf270c5f377d62fa7408713e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/asr_model.py +++ /dev/null @@ -1,904 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
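# ASRModel below is the hybrid CTC / attention encoder-decoder: forward()
# runs the shared encoder once, then combines the two branch losses as
#     loss = ctc_weight * loss_ctc + (1 - ctc_weight) * loss_att
# (a branch is skipped entirely when its weight makes it unused); the
# attention loss is label-smoothed and, when reverse_weight > 0, blended
# with a right-to-left decoder loss in the same proportional way.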
-# Modified from ESPnet(https://github.com/espnet/espnet) - -from collections import defaultdict -from typing import Dict, List, Optional, Tuple - -import torch - -from torch.nn.utils.rnn import pad_sequence - -try: - import k2 - from icefall.utils import get_texts - from icefall.decode import get_lattice, Nbest, one_best_decoding -except ImportError: - print('Failed to import k2 and icefall. \ - Notice that they are necessary for hlg_onebest and hlg_rescore') - -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import TransformerEncoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, - remove_duplicates_and_blank, th_accuracy, - reverse_pad_list) -from wenet.utils.mask import (make_pad_mask, mask_finished_preds, - mask_finished_scores, subsequent_mask) - - -class ASRModel(torch.nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - def __init__( - self, - vocab_size: int, - encoder: TransformerEncoder, - decoder: TransformerDecoder, - ctc: CTC, - ctc_weight: float = 0.5, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - ): - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - - self.encoder = encoder - self.decoder = decoder - self.ctc = ctc - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - # 1. Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # 2a. Attention-decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths) - else: - loss_att = None - - # 2b. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is None: - loss = loss_att - elif loss_att is None: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + (1 - - self.ctc_weight) * loss_att - return {"loss": loss, "loss_att": loss_att, "loss_ctc": loss_ctc} - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, float]: - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # reverse the seq, used for right to left decoder - r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) - r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, - self.ignore_id) - # 1. Forward decoder - decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, - ys_in_pad, ys_in_lens, - r_ys_in_pad, - self.reverse_weight) - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - r_loss_att = torch.tensor(0.0) - if self.reverse_weight > 0.0: - r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * ( - 1 - self.reverse_weight) + r_loss_att * self.reverse_weight - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - return loss_att, acc_att - - def _forward_encoder( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Let's assume B = batch_size - # 1. Encoder - if simulate_streaming and decoding_chunk_size > 0: - encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( - speech, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - else: - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - return encoder_out, encoder_mask - - def recognize( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int = 10, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> torch.Tensor: - """ Apply beam search on attention decoder - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - torch.Tensor: decoding result, (batch, max_result_len) - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = torch.ones([running_size, 1], dtype=torch.long, - device=device).fill_(self.sos) # (B*N, 1) - scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), - dtype=torch.float) - scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) - cache: Optional[List[torch.Tensor]] = None - # 2. Decoder forward step by step - for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: - break - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) - logp, cache = self.decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - # 2.3 Second beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - # Update cache to be consistent with new topk scores / hyps - cache_index = (offset_k_index // beam_size).view(-1) # (B*N) - base_cache_index = (torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) * beam_size).view(-1) # (B*N) - cache_index = base_cache_index + cache_index - cache = [torch.index_select(c, dim=0, index=cache_index) for c in cache] - scores = scores.view(-1, 1) # (B*N, 1) - # 2.4. Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = torch.index_select(top_k_index.view(-1), - dim=-1, - index=best_k_index) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = torch.index_select( - hyps, dim=0, index=best_hyps_index) # (B*N, i) - hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) - - # 3. 
Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_scores, best_index = scores.max(dim=-1) - best_hyps_index = best_index + torch.arange( - batch_size, dtype=torch.long, device=device) * beam_size - best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) - best_hyps = best_hyps[:, 1:] - return best_hyps, best_scores - - def ctc_greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[List[int]]: - """ Apply CTC greedy search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # Let's assume B = batch_size - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - return hyps, scores - - def _ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[List[List[int]], torch.Tensor]: - """ CTC prefix beam search inner implementation - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[List[int]]: nbest results - torch.Tensor: encoder output, (1, max_len, encoder_dim), - it will be used for rescoring in attention rescoring mode - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # For CTC prefix beam search, we only support batch_size=1 - assert batch_size == 1 - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder forward and get CTC score - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - ctc_probs = ctc_probs.squeeze(0) - # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) - cur_hyps = [(tuple(), (0.0, -float('inf')))] - # 2. CTC beam search step by step - for t in range(0, maxlen): - logp = ctc_probs[t] # (vocab_size,) - # key: prefix, value (pb, pnb), default value(-inf, -inf) - next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) - # 2.1 First beam prune: select topk best - top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) - for s in top_k_index: - s = s.item() - ps = logp[s].item() - for prefix, (pb, pnb) in cur_hyps: - last = prefix[-1] if len(prefix) > 0 else None - if s == 0: # blank - n_pb, n_pnb = next_hyps[prefix] - n_pb = log_add([n_pb, pb + ps, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - elif s == last: - # Update *ss -> *s; - n_pb, n_pnb = next_hyps[prefix] - n_pnb = log_add([n_pnb, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - # Update *s-s -> *ss, - is for blank - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - else: - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - - # 2.2 Second beam prune - next_hyps = sorted(next_hyps.items(), - key=lambda x: log_add(list(x[1])), - reverse=True) - cur_hyps = next_hyps[:beam_size] - hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] - return hyps, encoder_out - - def ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[int]: - """ Apply CTC prefix beam search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[int]: CTC prefix beam search nbest results - """ - hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, - beam_size, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) - return hyps[0] - - def attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - ctc_weight: float = 0.0, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ) -> List[int]: - """ Apply attention rescoring decoding, CTC prefix beam search - is applied first to get nbest, then we resoring the nbest on - attention decoder with corresponding encoder out - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - reverse_weight (float): right to left decoder weight - ctc_weight (float): ctc score weight - - Returns: - List[int]: Attention rescoring result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, speech_lengths, beam_size, decoding_chunk_size, - num_decoding_left_chunks, simulate_streaming) - - assert len(hyps) == beam_size - hyps_pad = pad_sequence([ - torch.tensor(hyp[0], device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
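        # Rescoring (below): for each CTC n-best hypothesis i, the attention
        # score is the sum of decoder_out log-probs over its tokens plus the
        # <eos> term; when reverse_weight > 0 it is blended with the
        # right-to-left decoder score as
        #     score * (1 - reverse_weight) + r_score * reverse_weight,
        # then the CTC prefix-beam score hyp[1] is added with ctc_weight and
        # the best-scoring hypothesis is returned.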
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp[0]): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp[0])][self.eos] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp[0]): - r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] - r_score += r_decoder_out[i][len(hyp[0])][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score += hyp[1] * ctc_weight - if score > best_score: - best_score = score - best_index = i - return hyps[best_index][0], best_score - - def load_hlg_resource_if_necessary(self, hlg, word): - if not hasattr(self, 'hlg'): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.hlg = k2.Fsa.from_dict(torch.load(hlg, map_location=device)) - if not hasattr(self.hlg, "lm_scores"): - self.hlg.lm_scores = self.hlg.scores.clone() - if not hasattr(self, 'word_table'): - self.word_table = {} - with open(word, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - self.word_table[int(arr[1])] = arr[0] - - @torch.no_grad() - def hlg_onebest( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - best_path = one_best_decoding(lattice=lattice, use_double_scores=True) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.no_grad() - def hlg_rescore( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - lm_scale: float = 0, - decoder_scale: float = 0, - r_decoder_scale: float = 0, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - device = speech.device - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - 
search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=100, - use_double_scores=True, - nbest_scale=0.5,) - nbest = nbest.intersect(lattice) - assert hasattr(nbest.fsa, "lm_scores") - assert hasattr(nbest.fsa, "tokens") - assert isinstance(nbest.fsa.tokens, torch.Tensor) - - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.tokens) - tokens = tokens.remove_values_leq(0) - hyps = tokens.tolist() - - # cal attention_score - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out_repeat = [] - tot_scores = nbest.tot_scores() - repeats = [tot_scores[i].shape[0] for i in range(tot_scores.dim0)] - for i in range(len(encoder_out)): - encoder_out_repeat.append(encoder_out[i: i + 1].repeat(repeats[i], 1, 1)) - encoder_out = torch.concat(encoder_out_repeat, dim=0) - encoder_mask = torch.ones(encoder_out.size(0), - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - reverse_weight = 0.5 - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
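# [Editor's note, not part of the deleted file] The remainder of hlg_rescore
# ranks each n-best lattice path by a weighted sum of four scores; a minimal
# sketch of that rule (illustrative names, not from the original code):
def _combine_hlg_scores(am: float, ngram_lm: float, att: float, r_att: float,
                        lm_scale: float, decoder_scale: float,
                        r_decoder_scale: float) -> float:
    # am and ngram_lm come from the k2 lattice (acoustic and HLG LM scores);
    # att / r_att are the left-to-right and right-to-left attention-decoder
    # log-prob sums computed below. The best path is then mapped back to text
    # via word_table and symbol_table.
    return am + lm_scale * ngram_lm + decoder_scale * att + r_decoder_scale * r_att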
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out - - decoder_scores = torch.tensor([sum([decoder_out[i, j, hyps[i][j]] - for j in range(len(hyps[i]))]) - for i in range(len(hyps))], device=device) - r_decoder_scores = [] - for i in range(len(hyps)): - score = 0 - for j in range(len(hyps[i])): - score += r_decoder_out[i, len(hyps[i]) - j - 1, hyps[i][j]] - score += r_decoder_out[i, len(hyps[i]), self.eos] - r_decoder_scores.append(score) - r_decoder_scores = torch.tensor(r_decoder_scores, device=device) - - am_scores = nbest.compute_am_scores() - ngram_lm_scores = nbest.compute_lm_scores() - tot_scores = am_scores.values + lm_scale * ngram_lm_scores.values + \ - decoder_scale * decoder_scores + r_decoder_scale * r_decoder_scores - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.jit.export - def subsampling_rate(self) -> int: - """ Export interface for c++ call, return subsampling_rate of the - model - """ - return self.encoder.embed.subsampling_rate - - @torch.jit.export - def right_context(self) -> int: - """ Export interface for c++ call, return right_context of the model - """ - return self.encoder.embed.right_context - - @torch.jit.export - def sos_symbol(self) -> int: - """ Export interface for c++ call, return sos symbol id of the model - """ - return self.sos - - @torch.jit.export - def eos_symbol(self) -> int: - """ Export interface for c++ call, return eos symbol id of the model - """ - return self.eos - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, give input chunk xs, and return - output from time 0 to current chunk. - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor: - """ Export interface for c++ call, apply linear transform and log - softmax before ctc - Args: - xs (torch.Tensor): encoder output - - Returns: - torch.Tensor: activation before ctc - - """ - return self.ctc.log_softmax(xs) - - @torch.jit.export - def is_bidirectional_decoder(self) -> bool: - """ - Returns: - torch.Tensor: decoder output - """ - if hasattr(self.decoder, 'right_decoder'): - return True - else: - return False - - @torch.jit.export - def forward_attention_decoder( - self, - hyps: torch.Tensor, - hyps_lens: torch.Tensor, - encoder_out: torch.Tensor, - reverse_weight: float = 0, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, forward decoder with multiple - hypothesis from ctc prefix beam search and one encoder output - Args: - hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad sos at the begining - hyps_lens (torch.Tensor): length of each hyp in hyps - encoder_out (torch.Tensor): corresponding encoder output - r_hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad eos at the begining which is used fo right to left decoder - reverse_weight: used for verfing whether used right to left decoder, - > 0 will use. - - Returns: - torch.Tensor: decoder output - """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps - encoder_out = encoder_out.repeat(num_hyps, 1, 1) - encoder_mask = torch.ones(num_hyps, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=encoder_out.device) - - # input for right to left decoder - # this hyps_lens has count token, we need minus it. - r_hyps_lens = hyps_lens - 1 - # this hyps has included token, so it should be - # convert the original hyps. - r_hyps = hyps[:, 1:] - # >>> r_hyps - # >>> tensor([[ 1, 2, 3], - # >>> [ 9, 8, 4], - # >>> [ 2, -1, -1]]) - # >>> r_hyps_lens - # >>> tensor([3, 3, 1]) - - # NOTE(Mddct): `pad_sequence` is not supported by ONNX, it is used - # in `reverse_pad_list` thus we have to refine the below code. 
- # Issue: https://github.com/wenet-e2e/wenet/issues/1113 - # Equal to: - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = torch.max(r_hyps_lens) - index_range = torch.arange(0, max_len, 1).to(encoder_out.device) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - # >>> seq_mask - # >>> tensor([[ True, True, True], - # >>> [ True, True, True], - # >>> [ True, False, False]]) - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - r_hyps = torch.gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = torch.where(seq_mask, r_hyps, self.eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - r_hyps = torch.cat([hyps[:, 0:1], r_hyps], dim=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps, hyps_lens, r_hyps, - reverse_weight) # (num_hyps, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - - # right to left decoder may be not used during decoding process, - # which depends on reverse_weight param. - # r_dccoder_out will be 0.0, if reverse_weight is 0.0 - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - return decoder_out, r_decoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/attention.py deleted file mode 100644 index 6ee5e313edf2e88a844ce004c0f819b0bd3260f6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/attention.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple - -import torch -from torch import nn - - -class MultiHeadedAttention(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, n_head: int, n_feat: int, dropout_rate: float): - """Construct an MultiHeadedAttention object.""" - super().__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward_qkv( - self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor, size - (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor, size - (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor, size - (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. 
- - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - 1.When applying cross attention between decoder and encoder, - the batch padding mask for input is in (#batch, 1, T) shape. - 2.When applying self attention of encoder, - the mask is in (#batch, T, T) shape. - 3.When applying self attention of decoder, - the mask is in (#batch, L, L) shape. - 4.If the different position in decoder see different block - of the encoder, such as Mocha, the passed in mask could be - in (#batch, L, T) shape. But there is no such case in current - Wenet. - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - """ - q, k, v = self.forward_qkv(query, key, value) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. - new_cache = torch.cat((k, v), dim=-1) - - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - return self.forward_attention(v, scores, mask), new_cache - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). 
- zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
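# [Editor's note, not part of the deleted file] The score computation that
# follows is Transformer-XL style relative-position attention
# (https://arxiv.org/abs/1901.02860, Section 3.3); note that the original code
# deliberately skips rel_shift() because it complicates streaming. A minimal
# sketch, assuming q, k, p are already shaped (batch, head, time, d_k) and the
# biases are (head, d_k):
import math
import torch
def _rel_pos_scores(q, k, p, pos_bias_u, pos_bias_v, d_k: int) -> torch.Tensor:
    # content term: query plus content bias against the keys
    matrix_ac = torch.matmul(q + pos_bias_u.unsqueeze(1), k.transpose(-2, -1))
    # position term: query plus position bias against the projected pos. embeddings
    matrix_bd = torch.matmul(q + pos_bias_v.unsqueeze(1), p.transpose(-2, -1))
    return (matrix_ac + matrix_bd) / math.sqrt(d_k)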
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/cmvn.py deleted file mode 100644 index 3a1e7457fd3788d9a7e031e96517505a65925102..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/cmvn.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -class GlobalCMVN(torch.nn.Module): - def __init__(self, - mean: torch.Tensor, - istd: torch.Tensor, - norm_var: bool = True): - """ - Args: - mean (torch.Tensor): mean stats - istd (torch.Tensor): inverse std, std which is 1.0 / std - """ - super().__init__() - assert mean.shape == istd.shape - self.norm_var = norm_var - # The buffer can be accessed from this module using self.mean - self.register_buffer("mean", mean) - self.register_buffer("istd", istd) - - def forward(self, x: torch.Tensor): - """ - Args: - x (torch.Tensor): (batch, max_len, feat_dim) - - Returns: - (torch.Tensor): normalized feature - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/convolution.py deleted file mode 100644 index 2cf9794e14ea7441ccd30ab52202ac02fb25c2b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/convolution.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). 
- """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. - new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/ctc.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/ctc.py deleted file mode 100644 index 3dfcbaa324ffc26afa9ceaeb75007eb312546326..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/ctc.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -import torch -import torch.nn.functional as F -from typeguard import check_argument_types - - -class CTC(torch.nn.Module): - """CTC module""" - def __init__( - self, - odim: int, - encoder_output_size: int, - dropout_rate: float = 0.0, - reduce: bool = True, - ): - """ Construct CTC module - Args: - odim: dimension of outputs - encoder_output_size: number of encoder projection units - dropout_rate: dropout rate (0.0 ~ 1.0) - reduce: reduce the CTC loss into a scalar - """ - assert check_argument_types() - super().__init__() - eprojs = encoder_output_size - self.dropout_rate = dropout_rate - self.ctc_lo = torch.nn.Linear(eprojs, odim) - - reduction_type = "sum" if reduce else "none" - self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) - - def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, - ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: - """Calculate CTC loss. 
- - Args: - hs_pad: batch of padded hidden state sequences (B, Tmax, D) - hlens: batch of lengths of hidden state sequences (B) - ys_pad: batch of padded character id sequence tensor (B, Lmax) - ys_lens: batch of lengths of character sequence (B) - """ - # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) - ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) - # ys_hat: (B, L, D) -> (L, B, D) - ys_hat = ys_hat.transpose(0, 1) - ys_hat = ys_hat.log_softmax(2) - loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) - # Batch-size average - loss = loss / ys_hat.size(1) - return loss - - def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """log_softmax of frame activations - - Args: - Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) - """ - return F.log_softmax(self.ctc_lo(hs_pad), dim=2) - - def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """argmax of frame activations - - Args: - torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: argmax applied 2d tensor (B, Tmax) - """ - return torch.argmax(self.ctc_lo(hs_pad), dim=2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/decoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/decoder.py deleted file mode 100644 index c31853d9e868c99290b8d597f53d9a680202c82c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/decoder.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Decoder definition.""" -from typing import Tuple, List, Optional - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.decoder_layer import DecoderLayer -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.utils.mask import (subsequent_mask, make_pad_mask) - - -class TransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. 
- concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__() - attention_dim = encoder_output_size - - if input_layer == "embed": - self.embed = torch.nn.Sequential( - torch.nn.Embedding(vocab_size, attention_dim), - PositionalEncoding(attention_dim, positional_dropout_rate), - ) - else: - raise ValueError(f"only 'embed' is supported: {input_layer}") - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) - self.use_output_layer = use_output_layer - self.output_layer = torch.nn.Linear(attention_dim, vocab_size) - self.num_blocks = num_blocks - self.decoders = torch.nn.ModuleList([ - DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(self.num_blocks) - ]) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor = torch.empty(0), - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: not used in transformer decoder, in order to unify api - with bidirectional decoder - reverse_weight: not used in transformer decoder, in order to unify - api with bidirectional decode - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - torch.tensor(0.0), in order to unify api with bidirectional decoder - olens: (batch, ) - """ - tgt = ys_in_pad - maxlen = tgt.size(1) - # tgt_mask: (B, 1, L) - tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) - tgt_mask = tgt_mask.to(tgt.device) - # m: (1, L, L) - m = subsequent_mask(tgt_mask.size(-1), - device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m - x, _ = self.embed(tgt) - for layer in self.decoders: - x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, - memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.use_output_layer: - x = self.output_layer(x) - olens = tgt_mask.sum(1) - return x, torch.tensor(0.0), olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - x, _ = self.embed(tgt) - new_cache = [] - for i, decoder in enumerate(self.decoders): - if cache is None: - c = None - else: - c = cache[i] - x, tgt_mask, memory, memory_mask = decoder(x, - tgt_mask, - memory, - memory_mask, - cache=c) - new_cache.append(x) - if self.normalize_before: - y = self.after_norm(x[:, -1]) - else: - y = x[:, -1] - if self.use_output_layer: - y = torch.log_softmax(self.output_layer(y), dim=-1) - return y, new_cache - - -class BiTransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - r_num_blocks: the number of right to left decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - r_num_blocks: int = 0, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - - assert check_argument_types() - super().__init__() - self.left_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - self.right_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - r_num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor, - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), - used for right to left decoder - reverse_weight: used for right to left decoder - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - r_x: x: decoded token score (right to left decoder) - before softmax (batch, maxlen_out, vocab_size) - if use_output_layer is True, - olens: (batch, ) - """ - l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, - ys_in_lens) - r_x = torch.tensor(0.0) - if reverse_weight > 0.0: - r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, - ys_in_lens) - return l_x, r_x, olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - return self.left_decoder.forward_one_step(memory, memory_mask, tgt, - tgt_mask, cache) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/decoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/decoder_layer.py deleted file mode 100644 index 6b52aa6ab730dc51b18f0787e8236ab10c1e9cad..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/decoder_layer.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoder self-attention layer definition.""" -from typing import Optional, Tuple - -import torch -from torch import nn - - -class DecoderLayer(nn.Module): - """Single decoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn (torch.nn.Module): Inter-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. 
- dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's inpu - and output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: nn.Module, - src_attn: nn.Module, - feed_forward: nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an DecoderLayer object.""" - super().__init__() - self.size = size - self.self_attn = self_attn - self.src_attn = src_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.norm3 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear1 = nn.Linear(size + size, size) - self.concat_linear2 = nn.Linear(size + size, size) - else: - self.concat_linear1 = nn.Identity() - self.concat_linear2 = nn.Identity() - - def forward( - self, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - memory: torch.Tensor, - memory_mask: torch.Tensor, - cache: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute decoded features. - - Args: - tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). - tgt_mask (torch.Tensor): Mask for input tensor - (#batch, maxlen_out). - memory (torch.Tensor): Encoded memory - (#batch, maxlen_in, size). - memory_mask (torch.Tensor): Encoded memory mask - (#batch, maxlen_in). - cache (torch.Tensor): cached tensors. - (#batch, maxlen_out - 1, size). - - Returns: - torch.Tensor: Output tensor (#batch, maxlen_out, size). - torch.Tensor: Mask for output tensor (#batch, maxlen_out). - torch.Tensor: Encoded memory (#batch, maxlen_in, size). - torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
- - """ - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - - if cache is None: - tgt_q = tgt - tgt_q_mask = tgt_mask - else: - # compute only the last frame query keeping dim: max_time_out -> 1 - assert cache.shape == ( - tgt.shape[0], - tgt.shape[1] - 1, - self.size, - ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" - tgt_q = tgt[:, -1:, :] - residual = residual[:, -1:, :] - tgt_q_mask = tgt_mask[:, -1:, :] - - if self.concat_after: - tgt_concat = torch.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) - x = residual + self.concat_linear1(tgt_concat) - else: - x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - if self.concat_after: - x_concat = torch.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) - x = residual + self.concat_linear2(x_concat) - else: - x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) - if not self.normalize_before: - x = self.norm2(x) - - residual = x - if self.normalize_before: - x = self.norm3(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm3(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, tgt_mask, memory, memory_mask diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/embedding.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/embedding.py deleted file mode 100644 index 611a927864d93c3ad8357f66c780bf537b2a4d67..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/embedding.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Positonal Encoding Module.""" - -import math -from typing import Tuple, Union - -import torch -import torch.nn.functional as F - -class PositionalEncoding(torch.nn.Module): - """Positional encoding. 
- - :param int d_model: embedding dim - :param float dropout_rate: dropout rate - :param int max_len: maximum input length - - PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) - PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) - """ - def __init__(self, - d_model: int, - dropout_rate: float, - max_len: int = 5000, - reverse: bool = False): - """Construct an PositionalEncoding object.""" - super().__init__() - self.d_model = d_model - self.xscale = math.sqrt(self.d_model) - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - - self.pe = torch.zeros(self.max_len, self.d_model) - position = torch.arange(0, self.max_len, - dtype=torch.float32).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = torch.sin(position * div_term) - self.pe[:, 1::2] = torch.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input. Its shape is (batch, time, ...) - offset (int, torch.tensor): position offset - - Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) - torch.Tensor: for compatibility to RelPositionalEncoding - """ - - self.pe = self.pe.to(x.device) - pos_emb = self.position_encoding(offset, x.size(1), False) - x = x * self.xscale + pos_emb - return self.dropout(x), self.dropout(pos_emb) - - def position_encoding(self, offset: Union[int, torch.Tensor], size: int, - apply_dropout: bool = True) -> torch.Tensor: - """ For getting encoding in a streaming fashion - - Attention!!!!! - we apply dropout only once at the whole utterance level in a none - streaming way, but will call this function several times with - increasing input size in a streaming scenario, so the dropout will - be applied several times. - - Args: - offset (int or torch.tensor): start offset - size (int): required size of position encoding - - Returns: - torch.Tensor: Corresponding encoding - """ - # How to subscript a Union type: - # https://github.com/pytorch/pytorch/issues/69434 - if isinstance(offset, int): - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - else: # for batched streaming decoding on GPU - assert torch.max(offset) + size < self.max_len - index = offset.unsqueeze(1) + \ - torch.arange(0, size).to(offset.device) # B X T - flag = index > 0 - # remove negative offset - index = index * flag - pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model - - if apply_dropout: - pos_emb = self.dropout(pos_emb) - return pos_emb - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding module. - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - """ - def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): - """Initialize class.""" - super().__init__(d_model, dropout_rate, max_len, reverse=True) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Compute positional encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). 
- Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Positional embedding tensor (1, time, `*`). - """ - self.pe = self.pe.to(x.device) - x = x * self.xscale - pos_emb = self.position_encoding(offset, x.size(1), False) - return self.dropout(x), self.dropout(pos_emb) - - -class NoPositionalEncoding(torch.nn.Module): - """ No position encoding - """ - def __init__(self, d_model: int, dropout_rate: float): - super().__init__() - self.d_model = d_model - self.dropout = torch.nn.Dropout(p=dropout_rate) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """ Just return zero vector for interface compatibility - """ - pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) - return self.dropout(x), pos_emb - - def position_encoding( - self, offset: Union[int, torch.Tensor], size: int) -> torch.Tensor: - return torch.zeros(1, size, self.d_model) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/encoder.py deleted file mode 100644 index bb2ec65827548bd1242cb3b367cb3983c2de6119..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/encoder.py +++ /dev/null @@ -1,462 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder definition.""" -from typing import Tuple - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.convolution import ConvolutionModule -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.encoder_layer import TransformerEncoderLayer -from wenet.transformer.encoder_layer import ConformerEncoderLayer -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class BaseEncoder(torch.nn.Module): - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ - Args: - input_size (int): input dim - output_size (int): dimension of attention - attention_heads (int): the number of heads of multi head attention - linear_units (int): the hidden units number of position-wise feed - forward - num_blocks (int): the number of decoder blocks - dropout_rate (float): dropout rate - attention_dropout_rate (float): dropout rate in attention - positional_dropout_rate (float): dropout rate after adding - positional encoding - input_layer (str): input layer type. - optional [linear, conv2d, conv2d6, conv2d8] - pos_enc_layer_type (str): Encoder positional encoding layer type. - opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] - normalize_before (bool): - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after (bool): whether to concat attention layer's input - and output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - static_chunk_size (int): chunk size for static chunk training and - decoding - use_dynamic_chunk (bool): whether use dynamic chunk size for - training or not, You can only use fixed chunk(chunk_size > 0) - or dyanmic chunk size(use_dynamic_chunk = True) - global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module - use_dynamic_left_chunk (bool): whether use dynamic left chunk in - dynamic chunk training - """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
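BaseEncoder.forward above derives its frame mask as `~make_pad_mask(xs_lens, T)`. For readers without the rest of wenet at hand, here is a self-contained sketch of that padding-mask semantics (my own reimplementation, assuming wenet's `make_pad_mask` returns True at padded positions):

```python
import torch

def make_pad_mask_sketch(lengths: torch.Tensor, max_len: int) -> torch.Tensor:
    """True at padded frames, False at real frames (assumed wenet semantics)."""
    steps = torch.arange(max_len, device=lengths.device).unsqueeze(0)  # (1, T)
    return steps >= lengths.unsqueeze(1)                               # (B, T)

xs_lens = torch.tensor([4, 2])
masks = ~make_pad_mask_sketch(xs_lens, 5).unsqueeze(1)  # (B, 1, T), True on valid frames
print(masks.int())
# tensor([[[1, 1, 1, 1, 0]],
#         [[1, 1, 0, 0, 0]]], dtype=torch.int32)
```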
- - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks - - -class TransformerEncoder(BaseEncoder): - """Transformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ Construct TransformerEncoder - - See Encoder for the meaning of each parameter. 
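The streaming loop in `forward_chunk_by_chunk` above slides a window of `decoding_window` input frames forward with a stride of `subsampling * decoding_chunk_size`. A small worked example of that arithmetic (values assumed for a Conv2dSubsampling4 front end: subsampling rate 4, right context 6):

```python
subsampling = 4
context = 6 + 1                      # right_context + 1 current frame
decoding_chunk_size = 16

stride = subsampling * decoding_chunk_size                            # input frames advanced per step
decoding_window = (decoding_chunk_size - 1) * subsampling + context   # input frames fed per step

num_frames = 200
windows = [(cur, min(cur + decoding_window, num_frames))
           for cur in range(0, num_frames - context + 1, stride)]
print(stride, decoding_window)   # 64 67
print(windows)                   # [(0, 67), (64, 131), (128, 195), (192, 200)]
```

Each window overlaps its neighbour by `context - subsampling` frames; that overlap is what lets the subsampling convolutions run without a cache of their own, as the deleted comments explain.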
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - self.encoders = torch.nn.ModuleList([ - TransformerEncoderLayer( - output_size, - MultiHeadedAttention(attention_heads, output_size, - attention_dropout_rate), - PositionwiseFeedForward(output_size, linear_units, - dropout_rate), dropout_rate, - normalize_before, concat_after) for _ in range(num_blocks) - ]) - - -class ConformerEncoder(BaseEncoder): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - positionwise_conv_kernel_size: int = 1, - macaron_style: bool = True, - selfattention_layer_type: str = "rel_selfattn", - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - ): - """Construct ConformerEncoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - positionwise_conv_kernel_size (int): Kernel size of positionwise - conv1d layer. - macaron_style (bool): Whether to use macaron style for - positionwise layer. - selfattention_layer_type (str): Encoder attention layer type, - the parameter has no effect now, it's just for configure - compatibility. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, - cnn_module_norm, causal) - - self.encoders = torch.nn.ModuleList([ - ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(num_blocks) - ]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/encoder_layer.py deleted file mode 100644 index 6b4629a6802a90422fa1494f82f46488f2553c16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/encoder_layer.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple - -import torch -from torch import nn - - -class TransformerEncoderLayer(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: torch.nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): just for interface compatibility - to ConformerEncoderLayer - mask_pad (torch.Tensor): does not used in transformer layer, - just for unified api with conformer. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2), not used here, it's for interface - compatibility to ConformerEncoderLayer. - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). - - """ - residual = x - if self.normalize_before: - x = self.norm1(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, cache=att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - return x, mask, new_att_cache, fake_cnn_cache - - -class ConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/label_smoothing_loss.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/label_smoothing_loss.py deleted file mode 100644 index 428fedcb0eb4345cd1361c97008a9afcd94ac171..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/label_smoothing_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Label smoothing module.""" - -import torch -from torch import nn - - -class LabelSmoothingLoss(nn.Module): - """Label-smoothing loss. - - In a standard CE loss, the label's data distribution is: - [0,1,2] -> - [ - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0], - ] - - In the smoothing version CE Loss,some probabilities - are taken from the true label prob (1.0) and are divided - among other labels. - - e.g. 
- smoothing=0.1 - [0,1,2] -> - [ - [0.9, 0.05, 0.05], - [0.05, 0.9, 0.05], - [0.05, 0.05, 0.9], - ] - - Args: - size (int): the number of class - padding_idx (int): padding class id which will be ignored for loss - smoothing (float): smoothing rate (0.0 means the conventional CE) - normalize_length (bool): - normalize loss by sequence length if True - normalize loss by batch size if False - """ - def __init__(self, - size: int, - padding_idx: int, - smoothing: float, - normalize_length: bool = False): - """Construct an LabelSmoothingLoss object.""" - super(LabelSmoothingLoss, self).__init__() - self.criterion = nn.KLDivLoss(reduction="none") - self.padding_idx = padding_idx - self.confidence = 1.0 - smoothing - self.smoothing = smoothing - self.size = size - self.normalize_length = normalize_length - - def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """Compute loss between x and target. - - The model outputs and data labels tensors are flatten to - (batch*seqlen, class) shape and a mask is applied to the - padding part which should not be calculated for loss. - - Args: - x (torch.Tensor): prediction (batch, seqlen, class) - target (torch.Tensor): - target signal masked with self.padding_id (batch, seqlen) - Returns: - loss (torch.Tensor) : The KL loss, scalar float value - """ - assert x.size(2) == self.size - batch_size = x.size(0) - x = x.view(-1, self.size) - target = target.view(-1) - # use zeros_like instead of torch.no_grad() for true_dist, - # since no_grad() can not be exported by JIT - true_dist = torch.zeros_like(x) - true_dist.fill_(self.smoothing / (self.size - 1)) - ignore = target == self.padding_idx # (B,) - total = len(target) - ignore.sum().item() - target = target.masked_fill(ignore, 0) # avoid -1 index - true_dist.scatter_(1, target.unsqueeze(1), self.confidence) - kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) - denom = total if self.normalize_length else batch_size - return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/positionwise_feed_forward.py deleted file mode 100644 index 73ba239e3f1e68f65650961f2c4ee6758729a06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/positionwise_feed_forward.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. 
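The LabelSmoothingLoss deleted above replaces each one-hot target with confidence `1 - smoothing` on the true class and `smoothing / (size - 1)` spread over the remaining classes, then takes a masked KL divergence against the log-softmax outputs. A toy sketch of just the smoothed target distribution it builds (the helper name is mine):

```python
import torch

def smoothed_targets(labels: torch.Tensor, size: int, smoothing: float) -> torch.Tensor:
    """One smoothed row per label: 1 - smoothing on the label, the rest shared equally."""
    dist = torch.full((labels.size(0), size), smoothing / (size - 1))
    dist.scatter_(1, labels.unsqueeze(1), 1.0 - smoothing)
    return dist

print(smoothed_targets(torch.tensor([0, 1, 2]), size=3, smoothing=0.1))
# tensor([[0.9000, 0.0500, 0.0500],
#         [0.0500, 0.9000, 0.0500],
#         [0.0500, 0.0500, 0.9000]])
```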
- dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU()): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/subsampling.py deleted file mode 100644 index 5f2823eedf0e623188d6af6680fa50ca44b47877..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/subsampling.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch - - -class BaseSubsampling(torch.nn.Module): - def __init__(self): - super().__init__() - self.right_context = 0 - self.subsampling_rate = 1 - - def position_encoding(self, offset: Union[int, torch.Tensor], - size: int) -> torch.Tensor: - return self.pos_enc.position_encoding(offset, size) - - -class LinearNoSubsampling(BaseSubsampling): - """Linear transform the input without subsampling - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an linear object.""" - super().__init__() - self.out = torch.nn.Sequential( - torch.nn.Linear(idim, odim), - torch.nn.LayerNorm(odim, eps=1e-5), - torch.nn.Dropout(dropout_rate), - ) - self.pos_enc = pos_enc_class - self.right_context = 0 - self.subsampling_rate = 1 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Input x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: linear input tensor (#batch, time', odim), - where time' = time . - torch.Tensor: linear input mask (#batch, 1, time'), - where time' = time . - - """ - x = self.out(x) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask - - -class Conv2dSubsampling4(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). 
- - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 4. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 4. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] - - -class Conv2dSubsampling6(BaseSubsampling): - """Convolutional 2D subsampling (to 1/6 length). - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (torch.nn.Module): Custom position encoding layer. - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling6 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 5, 3), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), - odim) - self.pos_enc = pos_enc_class - # 10 = (3 - 1) * 1 + (5 - 1) * 2 - self.subsampling_rate = 6 - self.right_context = 10 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 6. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 6. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] - - -class Conv2dSubsampling8(BaseSubsampling): - """Convolutional 2D subsampling (to 1/8 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling8 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear( - odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) - self.pos_enc = pos_enc_class - self.subsampling_rate = 8 - # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 - self.right_context = 14 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 8. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 8. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/swish.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/swish.py deleted file mode 100644 index b4250f5c93104f38958d145572e363256e03fcb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/transformer/swish.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) -# 2020 Northwestern Polytechnical University (Pengcheng Guo) -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Swish() activation function for Conformer.""" - -import torch - - -class Swish(torch.nn.Module): - """Construct an Swish object.""" - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Return Swish activation function.""" - return x * torch.sigmoid(x) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/checkpoint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/checkpoint.py deleted file mode 100644 index 8e0c413c79c34cd667240357d7ef9eab816a885c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/checkpoint.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
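The Conv2dSubsampling modules deleted above shrink the time axis and the feature axis with kernel-3, stride-2 convolutions (plus a kernel-5, stride-3 one in the 1/6 variant), and slice the frame mask to match. A quick sanity check of the 1/4 case, with 80-dim fbank features and 100 input frames assumed:

```python
idim, time = 80, 100                     # fbank dim, input frames

# two Conv2d(kernel=3, stride=2) layers: L_out = (L_in - 3) // 2 + 1, applied twice,
# which matches the ((L - 1) // 2 - 1) // 2 expression used by the deleted code
t_out = ((time - 1) // 2 - 1) // 2       # frames left after 1/4 time subsampling
f_out = ((idim - 1) // 2 - 1) // 2       # frequency bins feeding Linear(odim * f_out, odim)

# the mask is sliced the same way: x_mask[:, :, 2::2][:, :, 2::2] keeps t_out positions
mask_len = len(range(time)[2::2][2::2])
print(t_out, f_out, mask_len)            # 24 19 24
```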
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import re - -import yaml -import torch -from collections import OrderedDict - -import datetime - - -def load_checkpoint(model: torch.nn.Module, path: str) -> dict: - if torch.cuda.is_available(): - logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) - checkpoint = torch.load(path) - else: - logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) - checkpoint = torch.load(path, map_location='cpu') - model.load_state_dict(checkpoint, strict=False) - info_path = re.sub('.pt$', '.yaml', path) - configs = {} - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - return configs - - -def save_checkpoint(model: torch.nn.Module, path: str, infos=None): - ''' - Args: - infos (dict or None): any info you want to save. - ''' - logging.info('Checkpoint: save to checkpoint %s' % path) - if isinstance(model, torch.nn.DataParallel): - state_dict = model.module.state_dict() - elif isinstance(model, torch.nn.parallel.DistributedDataParallel): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save(state_dict, path) - info_path = re.sub('.pt$', '.yaml', path) - if infos is None: - infos = {} - infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') - with open(info_path, 'w') as fout: - data = yaml.dump(infos) - fout.write(data) - - -def filter_modules(model_state_dict, modules): - new_mods = [] - incorrect_mods = [] - mods_model = model_state_dict.keys() - for mod in modules: - if any(key.startswith(mod) for key in mods_model): - new_mods += [mod] - else: - incorrect_mods += [mod] - if incorrect_mods: - logging.warning( - "module(s) %s don't match or (partially match) " - "available modules in model.", - incorrect_mods, - ) - logging.warning("for information, the existing modules in model are:") - logging.warning("%s", mods_model) - - return new_mods - - -def load_trained_modules(model: torch.nn.Module, args: None): - # Load encoder modules with pre-trained model(s). 
- enc_model_path = args.enc_init - enc_modules = args.enc_init_mods - main_state_dict = model.state_dict() - logging.warning("model(s) found for pre-initialization") - if os.path.isfile(enc_model_path): - logging.info('Checkpoint: loading from checkpoint %s for CPU' % - enc_model_path) - model_state_dict = torch.load(enc_model_path, map_location='cpu') - modules = filter_modules(model_state_dict, enc_modules) - partial_state_dict = OrderedDict() - for key, value in model_state_dict.items(): - if any(key.startswith(m) for m in modules): - partial_state_dict[key] = value - main_state_dict.update(partial_state_dict) - else: - logging.warning("model was not found : %s", enc_model_path) - - model.load_state_dict(main_state_dict) - configs = {} - return configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/cmvn.py deleted file mode 100644 index 3101c619f54991c947124f393f3459c317356a2f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/cmvn.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import math - -import numpy as np - - -def _load_json_cmvn(json_cmvn_file): - """ Load the json format cmvn stats file and calculate cmvn - - Args: - json_cmvn_file: cmvn stats file in json format - - Returns: - a numpy array of [means, vars] - """ - with open(json_cmvn_file) as f: - cmvn_stats = json.load(f) - - means = cmvn_stats['mean_stat'] - variance = cmvn_stats['var_stat'] - count = cmvn_stats['frame_num'] - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def _load_kaldi_cmvn(kaldi_cmvn_file): - """ Load the kaldi format cmvn stats file and calculate cmvn - - Args: - kaldi_cmvn_file: kaldi text style global cmvn file, which - is generated by: - compute-cmvn-stats --binary=false scp:feats.scp global_cmvn - - Returns: - a numpy array of [means, vars] - """ - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def load_cmvn(cmvn_file, is_json): - if is_json: - cmvn = _load_json_cmvn(cmvn_file) - else: - cmvn = _load_kaldi_cmvn(cmvn_file) - return cmvn[0], cmvn[1] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/common.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/common.py deleted file mode 100644 index 74238d59aefbf227fe6b811703af17550bc7f8f0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/common.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -"""Unility functions for Transformer.""" - -import math -from typing import List, Tuple - -import torch -from torch.nn.utils.rnn import pad_sequence - -IGNORE_ID = -1 - - -def pad_list(xs: List[torch.Tensor], pad_value: int): - """Perform padding for the list of tensors. - - Args: - xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 
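The deleted `load_cmvn` helpers above turn accumulated feature statistics into a per-dimension mean and inverse standard deviation, so normalization reduces to `(x - mean) * istd`. A tiny numeric sketch of that conversion (toy statistics, json-style fields assumed):

```python
import numpy as np

mean_stat = np.array([10.0, 20.0])    # sum of features per dimension
var_stat = np.array([60.0, 250.0])    # sum of squared features per dimension
frame_num = 5                         # number of accumulated frames

means = mean_stat / frame_num                      # [2., 4.]
variance = var_stat / frame_num - means ** 2       # [8., 34.]
variance = np.maximum(variance, 1.0e-20)           # floor, as in the deleted code
istd = 1.0 / np.sqrt(variance)

print(means)   # [2. 4.]
print(istd)    # [0.35355339 0.17149859]
```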
- pad_value (float): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tmax, `*`). - - Examples: - >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) - - """ - n_batch = len(xs) - max_len = max([x.size(0) for x in xs]) - pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) - pad = pad.fill_(pad_value) - for i in range(n_batch): - pad[i, :xs[i].size(0)] = xs[i] - - return pad - - -def add_blank(ys_pad: torch.Tensor, blank: int, - ignore_id: int) -> torch.Tensor: - """ Prepad blank for transducer predictor - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - blank (int): index of - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> blank = 0 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in = add_blank(ys_pad, 0, -1) - >>> ys_in - tensor([[0, 1, 2, 3, 4, 5], - [0, 4, 5, 6, 0, 0], - [0, 7, 8, 9, 0, 0]]) - """ - bs = ys_pad.size(0) - _blank = torch.tensor([blank], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _blank = _blank.repeat(bs).unsqueeze(1) # [bs,1] - out = torch.cat([_blank, ys_pad], dim=1) # [bs, Lmax+1] - return torch.where(out == ignore_id, blank, out) - - -def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, - ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Add and labels. - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - sos (int): index of - eos (int): index of - ignore_id (int): index of padding - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - ys_out (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> sos_id = 10 - >>> eos_id = 11 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) - >>> ys_in - tensor([[10, 1, 2, 3, 4, 5], - [10, 4, 5, 6, 11, 11], - [10, 7, 8, 9, 11, 11]]) - >>> ys_out - tensor([[ 1, 2, 3, 4, 5, 11], - [ 4, 5, 6, 11, -1, -1], - [ 7, 8, 9, 11, -1, -1]]) - """ - _sos = torch.tensor([sos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _eos = torch.tensor([eos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys - ys_in = [torch.cat([_sos, y], dim=0) for y in ys] - ys_out = [torch.cat([y, _eos], dim=0) for y in ys] - return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) - - -def reverse_pad_list(ys_pad: torch.Tensor, - ys_lens: torch.Tensor, - pad_value: float = -1.0) -> torch.Tensor: - """Reverse padding for the list of tensors. - - Args: - ys_pad (tensor): The padded tensor (B, Tokenmax). - ys_lens (tensor): The lens of token seqs (B) - pad_value (int): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tokenmax). - - Examples: - >>> x - tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) - >>> pad_list(x, 0) - tensor([[4, 3, 2, 1], - [7, 6, 5, 0], - [9, 8, 0, 0]]) - - """ - r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) - for y, i in zip(ys_pad, ys_lens)], True, - pad_value) - return r_ys_pad - - -def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, - ignore_label: int) -> float: - """Calculate accuracy. - - Args: - pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 
- pad_targets (LongTensor): Target label tensors (B, Lmax). - ignore_label (int): Ignore label id. - - Returns: - float: Accuracy value (0.0 - 1.0). - - """ - pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), - pad_outputs.size(1)).argmax(2) - mask = pad_targets != ignore_label - numerator = torch.sum( - pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - denominator = torch.sum(mask) - return float(numerator) / float(denominator) - - -def get_rnn(rnn_type: str) -> torch.nn.Module: - assert rnn_type in ["rnn", "lstm", "gru"] - if rnn_type == "rnn": - return torch.nn.RNN - elif rnn_type == "lstm": - return torch.nn.LSTM - else: - return torch.nn.GRU - - -def get_activation(act): - """Return activation function.""" - # Lazy load to avoid unused import - from wenet.transformer.swish import Swish - - activation_funcs = { - "hardtanh": torch.nn.Hardtanh, - "tanh": torch.nn.Tanh, - "relu": torch.nn.ReLU, - "selu": torch.nn.SELU, - "swish": getattr(torch.nn, "SiLU", Swish), - "gelu": torch.nn.GELU - } - - return activation_funcs[act]() - - -def get_subsample(config): - input_layer = config["encoder_conf"]["input_layer"] - assert input_layer in ["conv2d", "conv2d6", "conv2d8"] - if input_layer == "conv2d": - return 4 - elif input_layer == "conv2d6": - return 6 - elif input_layer == "conv2d8": - return 8 - - -def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - if hyp[cur] != 0: - new_hyp.append(hyp[cur]) - prev = cur - while cur < len(hyp) and hyp[cur] == hyp[prev]: - cur += 1 - return new_hyp - - -def replace_duplicates_with_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - new_hyp.append(hyp[cur]) - prev = cur - cur += 1 - while cur < len(hyp) and hyp[cur] == hyp[prev] and hyp[cur] != 0: - new_hyp.append(0) - cur += 1 - return new_hyp - - -def log_add(args: List[int]) -> float: - """ - Stable log add - """ - if all(a == -float('inf') for a in args): - return -float('inf') - a_max = max(args) - lsp = math.log(sum(math.exp(a - a_max) for a in args)) - return a_max + lsp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/config.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/config.py deleted file mode 100644 index 50170ced44534d3ee6532a2f87fcd78c5148f7e7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 Shaoshang Qi -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
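`log_add` above is a numerically stable log-sum-exp, typically used when merging CTC prefix scores in log space: log Σ exp(aᵢ) = a_max + log Σ exp(aᵢ - a_max). A standalone check of that identity, reusing the same formula outside the deleted module:

```python
import math
from typing import List

def log_add(args: List[float]) -> float:
    """Stable log(sum(exp(args))), same formula as the deleted helper."""
    if all(a == -float('inf') for a in args):
        return -float('inf')
    a_max = max(args)
    return a_max + math.log(sum(math.exp(a - a_max) for a in args))

probs = [0.2, 0.3]
print(log_add([math.log(p) for p in probs]))    # -0.693147... == log(0.5)
print(log_add([-float('inf'), -float('inf')]))  # -inf, handled explicitly instead of propagating NaN
```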
- - -import copy - -def override_config(configs, override_list): - new_configs = copy.deepcopy(configs) - for item in override_list: - arr = item.split() - if len(arr) != 2: - print(f"the overrive {item} format not correct, skip it") - continue - keys = arr[0].split('.') - s_configs = new_configs - for i, key in enumerate(keys): - if key not in s_configs: - print(f"the overrive {item} format not correct, skip it") - if i == len(keys) - 1: - param_type = type(s_configs[key]) - if param_type != bool: - s_configs[key] = param_type(arr[1]) - else: - s_configs[key] = arr[1] in ['true', 'True'] - print(f"override {arr[0]} with {arr[1]}") - else: - s_configs = s_configs[key] - return new_configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/ctc_util.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/ctc_util.py deleted file mode 100644 index 73b8fb272ac153dd6d05207f352ebcf1ad14890d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/ctc_util.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch - -def insert_blank(label, blank_id=0): - """Insert blank token between every two label token.""" - label = np.expand_dims(label, 1) - blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id - label = np.concatenate([blanks, label], axis=1) - label = label.reshape(-1) - label = np.append(label, label[0]) - return label - -def forced_align(ctc_probs: torch.Tensor, - y: torch.Tensor, - blank_id=0) -> list: - """ctc forced alignment. 
- - Args: - torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) - torch.Tensor y: id sequence tensor 1d tensor (L) - int blank_id: blank symbol index - Returns: - torch.Tensor: alignment result - """ - y_insert_blank = insert_blank(y, blank_id) - - log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) - log_alpha = log_alpha - float('inf') # log of zero - state_path = (torch.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 - ) # state path - - # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] - - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): - if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ - s] == y_insert_blank[s - 2]: - candidates = torch.tensor( - [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) - prev_state = [s, s - 1] - else: - candidates = torch.tensor([ - log_alpha[t - 1, s], - log_alpha[t - 1, s - 1], - log_alpha[t - 1, s - 2], - ]) - prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] - state_path[t, s] = prev_state[torch.argmax(candidates)] - - state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) - - candidates = torch.tensor([ - log_alpha[-1, len(y_insert_blank) - 1], - log_alpha[-1, len(y_insert_blank) - 2] - ]) - prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] - state_seq[-1] = prev_state[torch.argmax(candidates)] - for t in range(ctc_probs.size(0) - 2, -1, -1): - state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] - - output_alignment = [] - for t in range(0, ctc_probs.size(0)): - output_alignment.append(y_insert_blank[state_seq[t, 0]]) - - return output_alignment diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/executor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/executor.py deleted file mode 100644 index dc0b69e6e32055566a0e8c41945f6979276e5672..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/executor.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -from contextlib import nullcontext - -# if your python version < 3.7 use the below one -# from contextlib import suppress as nullcontext -import torch -from torch.nn.utils import clip_grad_norm_ - - -class Executor: - - def __init__(self): - self.step = 0 - - def train(self, model, optimizer, scheduler, data_loader, device, writer, - args, scaler): - ''' Train one epoch - ''' - model.train() - clip = args.get('grad_clip', 50.0) - log_interval = args.get('log_interval', 10) - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - accum_grad = args.get('accum_grad', 1) - is_distributed = args.get('is_distributed', True) - use_amp = args.get('use_amp', False) - logging.info('using accumulate grad, new batch size is {} times' - ' larger than before'.format(accum_grad)) - if use_amp: - assert scaler is not None - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - model_context = model.join - else: - model_context = nullcontext - num_seen_utts = 0 - with model_context(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - context = None - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if is_distributed and batch_idx % accum_grad != 0: - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - with context(): - # autocast context - # The more details about amp can be found in - # https://pytorch.org/docs/stable/notes/amp_examples.html - with torch.cuda.amp.autocast(scaler is not None): - loss_dict = model(feats, feats_lengths, target, - target_lengths) - loss = loss_dict['loss'] / accum_grad - if use_amp: - scaler.scale(loss).backward() - else: - loss.backward() - - num_seen_utts += num_utts - if batch_idx % accum_grad == 0: - if rank == 0 and writer is not None: - writer.add_scalar('train_loss', loss, self.step) - # Use mixed precision training - if use_amp: - scaler.unscale_(optimizer) - grad_norm = clip_grad_norm_(model.parameters(), clip) - # Must invoke scaler.update() if unscale_() is used in - # the iteration to avoid the following error: - # RuntimeError: unscale_() has already been called - # on this optimizer since the last update(). - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). 
- scaler.step(optimizer) - scaler.update() - else: - grad_norm = clip_grad_norm_(model.parameters(), clip) - if torch.isfinite(grad_norm): - optimizer.step() - optimizer.zero_grad() - scheduler.step() - self.step += 1 - if batch_idx % log_interval == 0: - lr = optimizer.param_groups[0]['lr'] - log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, - loss.item() * accum_grad) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'lr {:.8f} rank {}'.format(lr, rank) - logging.debug(log_str) - - def cv(self, model, data_loader, device, args): - ''' Cross validation on - ''' - model.eval() - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - log_interval = args.get('log_interval', 10) - # in order to avoid division by 0 - num_seen_utts = 1 - total_loss = 0.0 - with torch.no_grad(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - loss_dict = model(feats, feats_lengths, target, target_lengths) - loss = loss_dict['loss'] - if torch.isfinite(loss): - num_seen_utts += num_utts - total_loss += loss.item() * num_utts - if batch_idx % log_interval == 0: - log_str = 'CV Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, loss.item()) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'history loss {:.6f}'.format(total_loss / - num_seen_utts) - log_str += ' rank {}'.format(rank) - logging.debug(log_str) - return total_loss, num_seen_utts diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/file_utils.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/file_utils.py deleted file mode 100644 index 7b7e516cc61f759267f4ef09309ff0b45110a0c1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/file_utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re - - -def read_lists(list_file): - lists = [] - with open(list_file, 'r', encoding='utf8') as fin: - for line in fin: - lists.append(line.strip()) - return lists - - -def read_non_lang_symbols(non_lang_sym_path): - """read non-linguistic symbol from file. - - The file format is like below: - - {NOISE}\n - {BRK}\n - ... - - - Args: - non_lang_sym_path: non-linguistic symbol file path, None means no any - syms. 
- - """ - if non_lang_sym_path is None: - return None - else: - syms = read_lists(non_lang_sym_path) - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - for sym in syms: - if non_lang_syms_pattern.fullmatch(sym) is None: - class BadSymbolFormat(Exception): - pass - raise BadSymbolFormat( - "Non-linguistic symbols should be " - "formatted in {xxx}//[xxx], consider" - " modify '%s' to meet the requirment. " - "More details can be found in discussions here : " - "https://github.com/wenet-e2e/wenet/pull/819" % (sym)) - return syms - - -def read_symbol_table(symbol_table_file): - symbol_table = {} - with open(symbol_table_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - symbol_table[arr[0]] = int(arr[1]) - return symbol_table diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/init_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/init_model.py deleted file mode 100644 index 4a008183ee25cd88b2fa25d93bdc3f9e3a55d31a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/init_model.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -from wenet.transducer.joint import TransducerJoint -from wenet.transducer.predictor import (ConvPredictor, EmbeddingPredictor, - RNNPredictor) -from wenet.transducer.transducer import Transducer -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.cmvn import GlobalCMVN -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.encoder import ConformerEncoder, TransformerEncoder -from wenet.squeezeformer.encoder import SqueezeformerEncoder -from wenet.efficient_conformer.encoder import EfficientConformerEncoder -from wenet.utils.cmvn import load_cmvn - - -def init_model(configs): - if configs['cmvn_file'] is not None: - mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) - global_cmvn = GlobalCMVN( - torch.from_numpy(mean).float(), - torch.from_numpy(istd).float()) - else: - global_cmvn = None - - input_dim = configs['input_dim'] - vocab_size = configs['output_dim'] - - encoder_type = configs.get('encoder', 'conformer') - decoder_type = configs.get('decoder', 'bitransformer') - - if encoder_type == 'conformer': - encoder = ConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'squeezeformer': - encoder = SqueezeformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'efficientConformer': - encoder = EfficientConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf'], - **configs['encoder_conf']['efficient_conf'] - if 'efficient_conf' in - configs['encoder_conf'] else {}) - else: - encoder = TransformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - if decoder_type == 'transformer': - decoder = TransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - else: - assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 - assert configs['decoder_conf']['r_num_blocks'] > 0 - decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - ctc = CTC(vocab_size, encoder.output_size()) - - # Init joint CTC/Attention or Transducer model - if 'predictor' in configs: - predictor_type = configs.get('predictor', 'rnn') - if predictor_type == 'rnn': - predictor = RNNPredictor(vocab_size, **configs['predictor_conf']) - elif predictor_type == 'embedding': - predictor = EmbeddingPredictor(vocab_size, - **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - elif predictor_type == 'conv': - predictor = ConvPredictor(vocab_size, **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - else: - raise NotImplementedError( - "only rnn, embedding and conv type support now") - configs['joint_conf']['enc_output_size'] = configs['encoder_conf'][ - 'output_size'] - configs['joint_conf']['pred_output_size'] = configs['predictor_conf'][ - 'output_size'] - joint = TransducerJoint(vocab_size, **configs['joint_conf']) - model = Transducer(vocab_size=vocab_size, - blank=0, - predictor=predictor, - encoder=encoder, - attention_decoder=decoder, - joint=joint, - ctc=ctc, - **configs['model_conf']) - else: - model = ASRModel(vocab_size=vocab_size, - encoder=encoder, - decoder=decoder, - ctc=ctc, - **configs['model_conf']) - return model diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/mask.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/mask.py deleted file mode 100644 index 2985006ab2bc2d27a9b8adaeb863cc44ca6a0d24..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/mask.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -''' -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. - - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - ret = torch.ones(size, size, device=device, dtype=torch.bool) - return torch.tril(ret) -''' - -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. 
- - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - arange = torch.arange(size, device=device) - mask = arange.expand(size, size) - arange = arange.unsqueeze(-1) - mask = mask <= arange - return mask - - -def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size) with chunk size, - this is for streaming encoder - - Args: - size (int): size of mask - chunk_size (int): size of chunk - num_left_chunks (int): number of left chunks - <0: use full chunk - >=0: use num_left_chunks - device (torch.device): "cpu" or "cuda" or torch.Tensor.device - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_chunk_mask(4, 2) - [[1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]] - """ - ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) - ret[i, start:ending] = True - return ret - - -def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, - use_dynamic_chunk: bool, - use_dynamic_left_chunk: bool, - decoding_chunk_size: int, static_chunk_size: int, - num_decoding_left_chunks: int): - """ Apply optional mask for encoder. - - Args: - xs (torch.Tensor): padded input, (B, L, D), L for max length - mask (torch.Tensor): mask for xs, (B, 1, L) - use_dynamic_chunk (bool): whether to use dynamic chunk or not - use_dynamic_left_chunk (bool): whether to use dynamic left chunk for - training. - decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - static_chunk_size (int): chunk size for static chunk training/decoding - if it's greater than 0, if use_dynamic_chunk is true, - this parameter will be ignored - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. - >=0: use num_decoding_left_chunks - <0: use all left chunks - - Returns: - torch.Tensor: chunk mask of the input xs. - """ - # Whether to use chunk mask or not - if use_dynamic_chunk: - max_len = xs.size(1) - if decoding_chunk_size < 0: - chunk_size = max_len - num_left_chunks = -1 - elif decoding_chunk_size > 0: - chunk_size = decoding_chunk_size - num_left_chunks = num_decoding_left_chunks - else: - # chunk size is either [1, 25] or full context(max_len). - # Since we use 4 times subsampling and allow up to 1s(100 frames) - # delay, the maximum frame is 100 / 4 = 25. 
- chunk_size = torch.randint(1, max_len, (1, )).item() - num_left_chunks = -1 - if chunk_size > max_len // 2: - chunk_size = max_len - else: - chunk_size = chunk_size % 25 + 1 - if use_dynamic_left_chunk: - max_left_chunks = (max_len - 1) // chunk_size - num_left_chunks = torch.randint(0, max_left_chunks, - (1, )).item() - chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - elif static_chunk_size > 0: - num_left_chunks = num_decoding_left_chunks - chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - else: - chunk_masks = masks - return chunk_masks - - -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: Mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - batch_size = lengths.size(0) - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = torch.arange(0, - max_len, - dtype=torch.int64, - device=lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand - return mask - - -def make_non_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """Make mask tensor containing indices of non-padded part. - - The sequences in a batch may have different lengths. To enable - batch computing, padding is need to make all sequence in same - size. To avoid the padding part pass value to context dependent - block such as attention or convolution , this padding part is - masked. - - This pad_mask is used in both encoder and decoder. - - 1 for non-padded part and 0 for padded part. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] - """ - return ~make_pad_mask(lengths) - - -def mask_finished_scores(score: torch.Tensor, - flag: torch.Tensor) -> torch.Tensor: - """ - If a sequence is finished, we only allow one alive branch. This function - aims to give one branch a zero score and the rest -inf score. - - Args: - score (torch.Tensor): A real value array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size, beam_size). 
- """ - beam_size = score.size(-1) - zero_mask = torch.zeros_like(flag, dtype=torch.bool) - if beam_size > 1: - unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])), - dim=1) - finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])), - dim=1) - else: - unfinished = zero_mask - finished = flag - score.masked_fill_(unfinished, -float('inf')) - score.masked_fill_(finished, 0) - return score - - -def mask_finished_preds(pred: torch.Tensor, flag: torch.Tensor, - eos: int) -> torch.Tensor: - """ - If a sequence is finished, all of its branch should be - - Args: - pred (torch.Tensor): A int array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size). - """ - beam_size = pred.size(-1) - finished = flag.repeat([1, beam_size]) - return pred.masked_fill_(finished, eos) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/scheduler.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/scheduler.py deleted file mode 100644 index c418a731dec0041a238787bbba23102dba8db5e5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/tedlium3/s0/wenet/utils/scheduler.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -# NeMo(https://github.com/NVIDIA/NeMo) - -from typing import Union - -import math -import warnings -import torch -from torch.optim.lr_scheduler import _LRScheduler - -from typeguard import check_argument_types - - -class WarmupLR(_LRScheduler): - """The WarmupLR scheduler - - This scheduler is almost same as NoamLR Scheduler except for following - difference: - - NoamLR: - lr = optimizer.lr * model_size ** -0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - WarmupLR: - lr = optimizer.lr * warmup_step ** 0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - - Note that the maximum lr equals to optimizer.lr in this scheduler. 
- - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_steps: Union[int, float] = 25000, - last_epoch: int = -1, - ): - assert check_argument_types() - self.warmup_steps = warmup_steps - - # __init__() must be invoked before setting field - # because step() is also invoked in __init__() - super().__init__(optimizer, last_epoch) - - def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" - - def get_lr(self): - step_num = self.last_epoch + 1 - if self.warmup_steps == 0: - return [ - lr * step_num ** -0.5 - for lr in self.base_lrs - ] - else: - return [ - lr - * self.warmup_steps ** 0.5 - * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) - for lr in self.base_lrs - ] - - def set_step(self, step: int): - self.last_epoch = step - - -class WarmupPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1): - assert not (warmup_steps is not None and warmup_ratio is not None),\ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class SquareRootConstantPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use particular number of step or ratio" - assert constant_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. 
- self.max_steps = max_steps - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.constant_lr = 1 / (constant_steps ** 0.5) - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.constant_steps: - return [self.constant_lr for _ in self.base_lrs] - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class WarmupHoldPolicy(WarmupPolicy): - """Variant of WarmupPolicy which maintains high - learning rate for a defined number of steps. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - hold_steps=None, - hold_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (hold_steps is not None and hold_ratio is not None), \ - "Either use particular number of step or ratio" - assert hold_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - self.min_lr = min_lr - self._last_warmup_lr = 0.0 - - # Necessary to duplicate as class attributes are hidden in inner class - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if hold_steps is not None: - self.hold_steps = hold_steps + self.warmup_steps - elif hold_ratio is not None: - self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps - else: - self.hold_steps = 0 - - super().__init__( - optimizer, - warmup_steps=warmup_steps, - warmup_ratio=warmup_ratio, - max_steps=max_steps, - last_epoch=last_epoch, - min_lr=min_lr, - ) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed by the scheduler," - " " "please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup phase - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - # Hold phase - if (step >= self.warmup_steps) and (step < self.hold_steps): - return self.base_lrs - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - -class WarmupAnnealHoldPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - min_lr: Minimum lr to hold the learning rate after decay at. - constant_steps: Number of steps to keep lr constant at. 
- constant_ratio: Ratio of steps to keep lr constant. - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - constant_steps=None, - constant_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use constant_steps or constant_ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup steps - if self.warmup_steps > 0 and step <= self.warmup_steps: - return self._get_warmup_lr(step) - - # Constant steps after warmup and decay - if self.constant_steps > 0 and ( - self.warmup_steps + self.decay_steps) < step <= self.max_steps: - return self._get_constant_lr(step) - - # Min lr after max steps of updates - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_constant_lr(self, step): - return [self.min_lr for _ in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -def _squareroot_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 0.5 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _square_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 2 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _cosine_annealing(initial_lr, step, max_steps, min_lr): - mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) - out_lr = (initial_lr - min_lr) * mult + min_lr - return out_lr - - -def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, - decay_steps, min_lr): - assert max_lr > min_lr - # Use linear warmup for the initial part. - if warmup_steps > 0 and step <= warmup_steps: - return max_lr * float(step) / float(warmup_steps) - - # For any steps larger than `decay_steps`, use `min_lr`. - if step > warmup_steps + decay_steps: - return min_lr - - # If we are done with the warmup period, use the decay style. 
- num_steps_ = step - warmup_steps - decay_steps_ = decay_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = max_lr - min_lr - - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - - return min_lr + coeff * delta_lr - - -def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): - if cycle: - multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) - decay_steps *= multiplier - else: - step = min(step, decay_steps) - p = step / decay_steps - lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) - lr += min_lr - return lr - - -def _noam_hold_annealing(initial_lr, step, warmup_steps, - hold_steps, decay_rate, min_lr): - # hold_steps = total number of steps - # to hold the LR, not the warmup + hold steps. - T_warmup_decay = max(1, warmup_steps ** decay_rate) - T_hold_decay = max(1, (step - hold_steps) ** decay_rate) - lr = (initial_lr * T_warmup_decay) / T_hold_decay - lr = max(lr, min_lr) - return lr - - -class SquareAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _square_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class SquareRootAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _squareroot_annealing(initial_lr=initial_lr, step=step, - max_steps=self.max_steps, min_lr=self.min_lr) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - if self.constant_steps is None or self.constant_steps == 0: - new_lrs = [ - _cosine_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - else: - new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) - return new_lrs - - def _get_warmup_lr(self, step): - if self.constant_steps is None or self.constant_steps == 0: - return super()._get_warmup_lr(step) - else: - # Use linear warmup for the initial part. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_constant_lr(self, step): - # Only called when `constant_steps` > 0. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_linear_warmup_with_cosine_annealing_lr(self, step): - # Cosine Schedule for Megatron LM, - # slightly different warmup schedule + constant LR at the end. 
- new_lrs = [ - _linear_warmup_with_cosine_annealing( - max_lr=self.base_lrs[0], - warmup_steps=self.warmup_steps, - step=step, - decay_steps=self.decay_steps, - min_lr=self.min_lr, - ) - for _ in self.base_lrs - ] - return new_lrs - - -class NoamAnnealing(_LRScheduler): - def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - self._normalize = d_model ** (-0.5) - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = max(1, self.last_epoch) - - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - new_lrs = [self._noam_annealing(initial_lr=initial_lr, step=step) for - initial_lr in self.base_lrs] - return new_lrs - - def _noam_annealing(self, initial_lr, step): - if self.warmup_steps > 0: - mult = self._normalize * min(step ** (-0.5), - step * (self.warmup_steps ** (-1.5))) - else: - mult = self._normalize * step ** (-0.5) - - out_lr = initial_lr * mult - if step > self.warmup_steps: - out_lr = max(out_lr, self.min_lr) - return out_lr - - -class NoamHoldAnnealing(WarmupHoldPolicy): - def __init__(self, optimizer, *, max_steps, decay_rate=0.5, min_lr=0.0, - last_epoch=-1, **kwargs): - """ - From Nemo: - Implementation of the Noam Hold Annealing policy - from the SqueezeFormer paper. - - Unlike NoamAnnealing, the peak learning rate - can be explicitly set for this scheduler. - The schedule first performs linear warmup, - then holds the peak LR, then decays with some schedule for - the remainder of the steps. - Therefore the min-lr is still dependent - on the hyper parameters selected. - - It's schedule is determined by three factors- - - Warmup Steps: Initial stage, where linear warmup - occurs uptil the peak LR is reached. Unlike NoamAnnealing, - the peak LR is explicitly stated here instead of a scaling factor. - - Hold Steps: Intermediate stage, where the peak LR - is maintained for some number of steps. In this region, - the high peak LR allows the model to converge faster - if training is stable. However the high LR - may also cause instability during training. - Should usually be a significant fraction of training - steps (around 30-40% of the entire training steps). - - Decay Steps: Final stage, where the LR rapidly decays - with some scaling rate (set by decay rate). - To attain Noam decay, use 0.5, - for Squeezeformer recommended decay, use 1.0. - The fast decay after prolonged high LR during - hold phase allows for rapid convergence. 
- - References: - - [Squeezeformer: - An Efficient Transformer for Automatic Speech Recognition] - (https://arxiv.org/abs/2206.00888) - - Args: - optimizer: Pytorch compatible Optimizer object. - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - decay_rate: Float value describing the polynomial decay - after the hold period. Default value - of 0.5 corresponds to Noam decay. - min_lr: Minimum learning rate. - """ - self.decay_rate = decay_rate - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - if self.warmup_steps is None or self.warmup_steps == 0: - raise ValueError( - "Noam scheduler cannot be used without warmup steps") - - if self.hold_steps > 0: - hold_steps = self.hold_steps - self.warmup_steps - else: - hold_steps = 0 - - new_lrs = [ - _noam_hold_annealing( - initial_lr, - step=step, - warmup_steps=self.warmup_steps, - hold_steps=hold_steps, - decay_rate=self.decay_rate, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - def set_step(self, step: int): - self.last_epoch = step diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/README.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/README.md deleted file mode 100644 index 811581f4881478ac430e59bf15d4c7c4f9f6eb83..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/README.md +++ /dev/null @@ -1,31 +0,0 @@ -# Performance Record - -## Conformer Result - -* Feature info: dither + specaug + speed perturb -* Training info: lr 0.002, warmup_steps 5000 batch size 16, 1 gpu, acc_grad 4, 120 epochs -* Decoding info: average_num 20 -* trans_type: phn - - -| decoding mode | test (wer) | -| :--------------------: | :---------: | -| ctc_greedy_search | 16.70% | -| ctc_prefix_beam_search | 16.60% | -| attention | 22.37% | -| attention_rescoring | 16.60% | - -## transformer Result - -* Feature info: dither + specaug + speed perturb -* Training info: lr 0.002, warmup_steps 5000 batch size 16, 1 gpu, acc_grad 4, 120 epochs -* Decoding info: average_num 20 -* trans_type: phn - - -| decoding mode | test (wer) | -| :--------------------: | :---------: | -| ctc_greedy_search | 17.78% | -| ctc_prefix_beam_search | 17.46% | -| attention | 21.77% | -| attention_rescoring | 17.06% | \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/conf/train_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/conf/train_conformer.yaml deleted file mode 100644 index 92c8bfbedf1acf7525370b67d4c138bece38b523..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/conf/train_conformer.yaml +++ /dev/null @@ -1,78 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, 
conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -dataset_conf: - split_with_space: true - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 16 - -grad_clip: 5 -accum_grad: 4 # -max_epoch: 120 -log_interval: 10 - -optim: adam -optim_conf: - lr: 0.002 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 5000 # 20000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/conf/train_transformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/conf/train_transformer.yaml deleted file mode 100644 index 13d7a5887c9b0dff3f47822bec360be12b758742..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/conf/train_transformer.yaml +++ /dev/null @@ -1,73 +0,0 @@ -# network architecture -# encoder related -encoder: transformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.2 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder architecture type - normalize_before: true - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.2 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -dataset_conf: - split_with_space: true - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 16 - -grad_clip: 5 -accum_grad: 4 -max_epoch: 120 -log_interval: 10 - -optim: adam -optim_conf: - lr: 0.002 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 5000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/dev_spk.list 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/dev_spk.list deleted file mode 100644 index 564da1f1ec672839fcd1531766800c774f0d1398..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/dev_spk.list +++ /dev/null @@ -1,50 +0,0 @@ -faks0 -fdac1 -fjem0 -mgwt0 -mjar0 -mmdb1 -mmdm2 -mpdf0 -fcmh0 -fkms0 -mbdg0 -mbwm0 -mcsh0 -fadg0 -fdms0 -fedw0 -mgjf0 -mglb0 -mrtk0 -mtaa0 -mtdt0 -mthc0 -mwjg0 -fnmr0 -frew0 -fsem0 -mbns0 -mmjr0 -mdls0 -mdlf0 -mdvc0 -mers0 -fmah0 -fdrw0 -mrcs0 -mrjm4 -fcal1 -mmwh0 -fjsj0 -majc0 -mjsw0 -mreb0 -fgjd0 -fjmg0 -mroa0 -mteb0 -mjfc0 -mrjr0 -fmml0 -mrws1 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/phones.60-48-39.map b/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/phones.60-48-39.map deleted file mode 100644 index 6d24f094d198df6cc33190a835d5c19a2122827b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/phones.60-48-39.map +++ /dev/null @@ -1,61 +0,0 @@ -aa aa aa -ae ae ae -ah ah ah -ao ao aa -aw aw aw -ax ax ah -ax-h ax ah -axr er er -ay ay ay -b b b -bcl vcl sil -ch ch ch -d d d -dcl vcl sil -dh dh dh -dx dx dx -eh eh eh -el el l -em m m -en en n -eng ng ng -epi epi sil -er er er -ey ey ey -f f f -g g g -gcl vcl sil -h# sil sil -hh hh hh -hv hh hh -ih ih ih -ix ix ih -iy iy iy -jh jh jh -k k k -kcl cl sil -l l l -m m m -n n n -ng ng ng -nx n n -ow ow ow -oy oy oy -p p p -pau sil sil -pcl cl sil -q -r r r -s s s -sh sh sh -t t t -tcl cl sil -th th th -uh uh uh -uw uw uw -ux uw uw -v v v -w w w -y y y -z z z -zh zh sh diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/sph2pipe_process.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/sph2pipe_process.py deleted file mode 100644 index 25195407f54f2693f79d21f2febb533b0e6196ef..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/sph2pipe_process.py +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import sys -import os - - -def sph2pipe_wav(in_wav, tmp_out_wav, out_wav): - with open(in_wav, 'r', encoding='utf-8') as in_f: - with open(tmp_out_wav, 'w', encoding='utf-8') as tmp_out_f: - with open(out_wav, 'w', encoding='utf-8') as out_f: - for line in in_f: - _tmp = line.strip().split(' ') - wav_out_path = _tmp[4] - wav_out_path = wav_out_path.split('/') - wav_out_path[-4] = wav_out_path[-4] + '_pipe' - if not os.path.exists('/'.join(wav_out_path[:-1])): - os.makedirs('/'.join(wav_out_path[:-1])) - wav_out_path = '/'.join(wav_out_path) - tmp_out_f.write(' '.join(_tmp[1:5]) + ' ' + wav_out_path + - '\n') - out_f.write(_tmp[0] + ' ' + wav_out_path + '\n') - - -if __name__ == '__main__': - if len(sys.argv) != 4: - print('wrong input parameter') - raise NotImplementedError(len(sys.argv)) - in_wav = sys.argv[1] - tmp_out_wav = sys.argv[2] - out_wav = sys.argv[3] - sph2pipe_wav(in_wav, tmp_out_wav, out_wav) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/test_spk.list b/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/test_spk.list deleted file mode 100644 index 47f6653d64d412e61bdd2e7a10646f581fdbf96d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/test_spk.list +++ /dev/null @@ -1,24 +0,0 @@ 
-mdab0 -mwbt0 -felc0 -mtas1 -mwew0 -fpas0 -mjmp0 -mlnt0 -fpkt0 -mlll0 -mtls0 -fjlm0 -mbpm0 -mklt0 -fnlp0 -mcmj0 -mjdh0 -fmgd0 -mgrt0 -mnjm0 -fdhc0 -mjln0 -mpam0 -fmld0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/timit_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/timit_data_prep.sh deleted file mode 100644 index 080fa1b9426962df22e019f436baf803568075db..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/timit_data_prep.sh +++ /dev/null @@ -1,181 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2013 (Authors: Bagher BabaAli, Daniel Povey, Arnab Ghoshal) -# 2014 Brno University of Technology (Author: Karel Vesely) -# 2019 IIIT-Bangalore (Shreekantha Nadig) -# Apache 2.0. - - -create_glm_stm=false - -if [ $# -le 0 ]; then - echo "Argument should be the Timit directory, see ../run.sh for example." - exit 1; -fi - -dir=`pwd`/data/local/data -lmdir=`pwd`/data/local/nist_lm -mkdir -p $dir $lmdir -local=`pwd`/local -utils=`pwd`/utils -conf=`pwd`/conf - -if [ $2 ]; then - if [[ $2 = "char" || $2 = "phn" ]]; then - trans_type=$2 - else - echo "Transcript type must be one of [phn, char]" - echo $2 - fi -else - trans_type=phn -fi - -. ./path.sh - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe - -if ! command -v "${sph2pipe}" &> /dev/null; then - echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; - exit 1; -fi - -[ -f $local/test_spk.list ] || error_exit "$PROG: Eval-set speaker list not found."; -[ -f $local/dev_spk.list ] || error_exit "$PROG: dev-set speaker list not found."; - -# First check if the train & test directories exist (these can either be upper- -# or lower-cased -if [ ! -d $1/TRAIN -o ! -d $1/TEST ] && [ ! -d $1/train -o ! -d $1/test ]; then - echo "timit_data_prep.sh: Spot check of command line argument failed" - echo "Command line argument must be absolute pathname to TIMIT directory" - echo "with name like /export/corpora5/LDC/LDC93S1/timit/TIMIT" - exit 1; -fi - -# Now check what case the directory structure is -uppercased=false -train_dir=train -test_dir=test -if [ -d $1/TRAIN ]; then - uppercased=true - train_dir=TRAIN - test_dir=TEST -fi - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT - -# Get the list of speakers. The list of speakers in the 24-speaker core test -# set and the 50-speaker development set must be supplied to the script. All -# speakers in the 'train' directory are used for training. 
-if $uppercased; then - tr '[:lower:]' '[:upper:]' < $local/dev_spk.list > $tmpdir/dev_spk - tr '[:lower:]' '[:upper:]' < $local/test_spk.list > $tmpdir/test_spk - ls -d "$1"/TRAIN/DR*/* | sed -e "s:^.*/::" > $tmpdir/train_spk -else - tr '[:upper:]' '[:lower:]' < $local/dev_spk.list > $tmpdir/dev_spk - tr '[:upper:]' '[:lower:]' < $local/test_spk.list > $tmpdir/test_spk - ls -d "$1"/train/dr*/* | sed -e "s:^.*/::" > $tmpdir/train_spk -fi - -cd $dir -for x in train dev test; do - # First, find the list of audio files (use only si & sx utterances). - # Note: train & test sets are under different directories, but doing find on - # both and grepping for the speakers will work correctly. - find $1/{$train_dir,$test_dir} -not \( -iname 'SA*' \) -iname '*.WAV' \ - | grep -f $tmpdir/${x}_spk > ${x}_sph.flist - - sed -e 's:.*/\(.*\)/\(.*\).WAV$:\1_\2:i' ${x}_sph.flist \ - > $tmpdir/${x}_sph.uttids - paste $tmpdir/${x}_sph.uttids ${x}_sph.flist \ - | sort -k1,1 > ${x}_sph.scp - - cat ${x}_sph.scp | awk '{print $1}' > ${x}.uttids - - # Now, Convert the transcripts into our format (no normalization yet) - # Get the transcripts: each line of the output contains an utterance - # ID followed by the transcript. - - if [ $trans_type = "phn" ] - then - echo "phone transcript!" - find $1/{$train_dir,$test_dir} -not \( -iname 'SA*' \) -iname '*.PHN' \ - | grep -f $tmpdir/${x}_spk > $tmpdir/${x}_phn.flist - sed -e 's:.*/\(.*\)/\(.*\).PHN$:\1_\2:i' $tmpdir/${x}_phn.flist \ - > $tmpdir/${x}_phn.uttids - while read line; do - [ -f $line ] || error_exit "Cannot find transcription file '$line'"; - cut -f3 -d' ' "$line" | tr '\n' ' ' | perl -ape 's: *$:\n:;' - done < $tmpdir/${x}_phn.flist > $tmpdir/${x}_phn.trans - paste $tmpdir/${x}_phn.uttids $tmpdir/${x}_phn.trans \ - | sort -k1,1 > ${x}.trans - - elif [ $trans_type = "char" ] - then - echo "char transcript!" - find $1/{$train_dir,$test_dir} -not \( -iname 'SA*' \) -iname '*.WRD' \ - | grep -f $tmpdir/${x}_spk > $tmpdir/${x}_wrd.flist - sed -e 's:.*/\(.*\)/\(.*\).WRD$:\1_\2:i' $tmpdir/${x}_wrd.flist \ - > $tmpdir/${x}_wrd.uttids - while read line; do - [ -f $line ] || error_exit "Cannot find transcription file '$line'"; - cut -f3 -d' ' "$line" | tr '\n' ' ' | perl -ape 's: *$:\n:;' | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z A-Z]//g' - done < $tmpdir/${x}_wrd.flist > $tmpdir/${x}_wrd.trans - paste $tmpdir/${x}_wrd.uttids $tmpdir/${x}_wrd.trans \ - | sort -k1,1 > ${x}.trans - else - echo "WRONG!" - echo $trans_type - exit 0; - fi - - # Do normalization steps. - cat ${x}.trans | $local/timit_norm_trans.pl -i - -m $local/phones.60-48-39.map -to 39 | sort > $x.text || exit 1; - # cat ${x}.trans | sort > $x.text || exit 1; - - # Create wav.scp - awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp > ${x}_wav.scp - - # Make the utt2spk and spk2utt files. 
- cut -f1 -d'_' $x.uttids | paste -d' ' $x.uttids - > $x.utt2spk - cat $x.utt2spk | $local/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1; - - # Prepare gender mapping - cat $x.spk2utt | awk '{print $1}' | perl -ane 'chop; m:^.:; $g = lc($&); print "$_ $g\n";' > $x.spk2gender - - - if "${create_glm_stm}"; then - # Prepare STM file for sclite: - wav-to-duration --read-entire-file=true scp:${x}_wav.scp ark,t:${x}_dur.ark || exit 1 - awk -v dur=${x}_dur.ark \ - 'BEGIN{ - while(getline < dur) { durH[$1]=$2; } - print ";; LABEL \"O\" \"Overall\" \"Overall\""; - print ";; LABEL \"F\" \"Female\" \"Female speakers\""; - print ";; LABEL \"M\" \"Male\" \"Male speakers\""; - } - { wav=$1; spk=wav; sub(/_.*/,"",spk); $1=""; ref=$0; - gender=(substr(spk,0,1) == "f" ? "F" : "M"); - printf("%s 1 %s 0.0 %f %s\n", wav, spk, durH[wav], gender, ref); - } - ' ${x}.text >${x}.stm || exit 1 - - # Create dummy GLM file for sclite: - echo ';; empty.glm - [FAKE] => %HESITATION / [ ] __ [ ] ;; hesitation token - ' > ${x}.glm - fi -done - -echo "Data preparation succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/timit_format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/timit_format_data.sh deleted file mode 100644 index 8cb531f36fdcff697f99cdc4cbddec552d7a5013..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/timit_format_data.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2013 (Author: Daniel Povey) -# Apache 2.0 - -# This script takes data prepared in a corpus-dependent way -# in data/local/, and converts it into the "canonical" form, -# in various subdirectories of data/, e.g. data/lang, data/train, etc. - -. ./path.sh || exit 1; - -echo "Preparing train, dev and test data" -srcdir=data/local/data - - -for x in train dev test; do - mkdir -p data/$x - # cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; - local/sph2pipe_process.py $srcdir/${x}_wav.scp data/${x}/tmp_wav.scp data/${x}/wav.scp || exit 1; - while read line - do - echo $line - $line - done < data/${x}/tmp_wav.scp - rm data/${x}/tmp_wav.scp - - cp $srcdir/$x.text data/$x/text || exit 1; - cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1; - cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1; - tools/filter_scp.pl data/$x/spk2utt $srcdir/$x.spk2gender > data/$x/spk2gender || exit 1; - [ -e $srcdir/${x}.stm ] && cp $srcdir/${x}.stm data/$x/stm - [ -e $srcdir/${x}.glm ] && cp $srcdir/${x}.glm data/$x/glm - # tools/validate_data_dir.sh --no-feats data/$x || exit 1 -done \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/timit_norm_trans.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/timit_norm_trans.pl deleted file mode 100644 index 566eb7693ac8901e3eeacbd795eddd2b0502a002..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/timit_norm_trans.pl +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter - -# Copyright 2012 Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script normalizes the TIMIT phonetic transcripts that have been -# extracted in a format where each line contains an utterance ID followed by -# the transcript, e.g.: -# fcke0_si1111 h# hh ah dx ux w iy dcl d ix f ay n ih q h# - -my $usage = "Usage: timit_norm_trans.pl -i transcript -m phone_map -from [60|48] -to [48|39] > normalized\n -Normalizes phonetic transcriptions for TIMIT, by mapping the phones to a -smaller set defined by the -m option. This script assumes that the mapping is -done in the \"standard\" fashion, i.e. to 48 or 39 phones. The input is -assumed to have 60 phones (+1 for glottal stop, which is deleted), but that can -be changed using the -from option. The input format is assumed to be utterance -ID followed by transcript on the same line.\n"; - -use strict; -use Getopt::Long; -die "$usage" unless(@ARGV >= 1); -my ($in_trans, $phone_map, $num_phones_out); -my $num_phones_in = 60; -GetOptions ("i=s" => \$in_trans, # Input transcription - "m=s" => \$phone_map, # File containing phone mappings - "from=i" => \$num_phones_in, # Input #phones: must be 60 or 48 - "to=i" => \$num_phones_out ); # Output #phones: must be 48 or 39 - -die $usage unless(defined($in_trans) && defined($phone_map) && - defined($num_phones_out)); -if ($num_phones_in != 60 && $num_phones_in != 48) { - die "Can only used 60 or 48 for -from (used $num_phones_in)." -} -if ($num_phones_out != 48 && $num_phones_out != 39) { - die "Can only used 48 or 39 for -to (used $num_phones_out)." -} -unless ($num_phones_out < $num_phones_in) { - die "Argument to -from ($num_phones_in) must be greater than that to -to ($num_phones_out)." -} - - -open(M, "<$phone_map") or die "Cannot open mappings file '$phone_map': $!"; -my (%phonemap, %seen_phones); -my $num_seen_phones = 0; -while () { - chomp; - next if ($_ =~ /^q\s*.*$/); # Ignore glottal stops. - m:^(\S+)\s+(\S+)\s+(\S+)$: or die "Bad line: $_"; - my $mapped_from = ($num_phones_in == 60)? $1 : $2; - my $mapped_to = ($num_phones_out == 48)? $2 : $3; - if (!defined($seen_phones{$mapped_to})) { - $seen_phones{$mapped_to} = 1; - $num_seen_phones += 1; - } - $phonemap{$mapped_from} = $mapped_to; -} -if ($num_seen_phones != $num_phones_out) { - die "Trying to map to $num_phones_out phones, but seen only $num_seen_phones"; -} - -open(T, "<$in_trans") or die "Cannot open transcription file '$in_trans': $!"; -while () { - chomp; - $_ =~ m:^(\S+)\s+(.+): or die "Bad line: $_"; - my $utt_id = $1; - my $trans = $2; - - $trans =~ s/q//g; # Remove glottal stops. 
- $trans =~ s/^\s*//; $trans =~ s/\s*$//; # Normalize spaces - - print $utt_id; - for my $phone (split(/\s+/, $trans)) { - if(exists $phonemap{$phone}) { print " $phonemap{$phone}"; } - if(not exists $phonemap{$phone}) { print " $phone"; } - } - print "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/utt2spk_to_spk2utt.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/utt2spk_to_spk2utt.pl deleted file mode 100644 index 5086699ff85fdcb8667bb9ab054700c53e35fd0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. - -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/validate_data_dir.sh deleted file mode 100644 index 9c0e350eeef428dd29501dc3368d373dd749b437..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/local/validate_data_dir.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/usr/bin/env bash - -cmd="$@" - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." - echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! 
-d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -if [ -f $data/images.scp ]; then - cmd=${cmd/--no-wav/} # remove --no-wav if supplied - image/validate_data_dir.sh $cmd - exit $? -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1; - ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(utils/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - utils/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." 
- exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! 
cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! 
cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/path.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/path.sh deleted file mode 100644 index 73fc1c56602086182f66201870e28d46a0cada55..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export WENET_DIR=$PWD/../../.. -export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build -export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix -export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/run.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/run.sh deleted file mode 100644 index 01a08a8ffb9f04a49b6dde37552006909f178042..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/run.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -# Use this to control how many gpu you use, It's 1-gpu training if you specify -# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO -stage=0 # start from 0 if you need to start from data preparation -stop_stage=4 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training -num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. 
Default 0 -node_rank=0 -# data -timit_data=/home/Liangcd/data/timit -# path to save preproecssed data -# export data=data - - -nj=16 - -# data_type can be `raw` or `shard`. Typically, raw is used for small dataset, -# `shard` is used for large dataset which is over 1k hours, and `shard` is -# faster on reading data and training. -data_type=raw -num_utts_per_shard=1000 - -train_set=train -# Optional train_config -# 1. conf/train_transformer.yaml: Standard transformer -# 2. conf/train_conformer.yaml: Standard conformer -# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer -# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer -# 5. conf/train_conformer_no_pos.yaml: Conformer without relative positional encoding -# 6. conf/train_u2++_conformer.yaml: U2++ conformer -# 7. conf/train_u2++_transformer.yaml: U2++ transformer -train_config=conf/train_transformer.yaml -cmvn=true -dir=exp/transformer_phn_5k_acc4_bs16 -checkpoint= - - -# use average_checkpoint will get better result -average_checkpoint=true -decode_checkpoint=$dir/final.pt -average_num=20 -decode_modes="ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring" -# choose in [phn] -trans_type=phn - -dict=data/dict/${trans_type}_units.txt - - -. tools/parse_options.sh || exit 1; - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - - echo "stage 0: Data preparation" - echo "preparing data for TIMIT for ${trans_type} level transcripts" - local/timit_data_prep.sh ${timit_data} ${trans_type} || exit 1; - local/timit_format_data.sh - echo "Finish stage 0" -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - echo "stage 1: compute global cmvn" - # compute cmvn - tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \ - --in_scp data/${train_set}/wav.scp \ - --out_cmvn data/${train_set}/global_cmvn - echo "Finish stage 1" -fi - - - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - echo "stage 2: make train dict" - # Make train dict - echo "Make a dictionary" - mkdir -p $(dirname $dict) - echo " 0" > ${dict} # 0 will be used for "blank" in CTC - echo " 1" >> ${dict} # must be 1 - - tools/text2token.py -s 1 -n 1 --space sil --trans_type ${trans_type} data/${train_set}/text \ - | cut -f 2- -d" " | tr " " "\n" | sort | uniq | grep -v -e '^\s*$' | \ - awk '{print $0 " " NR+1}' >> ${dict} - wc -l ${dict} - num_token=$(cat $dict | wc -l) - echo " $num_token" >> $dict # - echo "Finish stage 2" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - echo "stage 3: Prepare data, prepare required format" - for x in dev test ${train_set}; do - if [ $data_type == "shard" ]; then - tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \ - --num_threads 16 data/$x/wav.scp data/$x/text \ - $(realpath data/$x/shards) data/$x/data.list - else - tools/make_raw_list.py data/$x/wav.scp data/$x/text \ - data/$x/data.list - fi - done - echo "Finish stage 3" -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - mkdir -p $dir - # You have to rm `INIT_FILE` manually when you resume or restart a - # multi-machine training. 
- INIT_FILE=$dir/ddp_init - init_method=file://$(readlink -f $INIT_FILE) - echo "$0: init method is $init_method" - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - # Use "nccl" if it works, otherwise use "gloo" - dist_backend="gloo" - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" - cmvn_opts= - $cmvn && cp data/${train_set}/global_cmvn $dir - $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" - - # train.py rewrite $train_config to $dir/train.yaml with model input - # and output dimension, and $dir/train.yaml will be used for inference - # and export. - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. - rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type $data_type \ - --symbol_table $dict \ - --train_data data/$train_set/data.list \ - --cv_data data/dev/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ - --ddp.dist_backend $dist_backend \ - --num_workers 1 \ - $cmvn_opts \ - --pin_memory - } & - done - wait -fi - -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # Test model, please specify the model you want to test by --checkpoint - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$dir/avg_${average_num}.pt - echo "do model average and final checkpoint is $decode_checkpoint" - python wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $dir \ - --num ${average_num} \ - --val_best - fi - # Please specify decoding_chunk_size for unified streaming and - # non-streaming model. The default value is -1, which is full chunk - # for non-streaming inference. 
- decoding_chunk_size= - ctc_weight=0.5 - reverse_weight=0.0 - for mode in ${decode_modes}; do - { - test_dir=$dir/test_${mode} - mkdir -p $test_dir - python wenet/bin/recognize.py --gpu 0 \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type $data_type \ - --test_data data/test/data.list \ - --checkpoint $decode_checkpoint \ - --beam_size 10 \ - --batch_size 1 \ - --penalty 0.0 \ - --dict $dict \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --result_file $test_dir/text \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} \ - --connect_symbol ▁ - python tools/compute-wer.py --char=1 --v=1 \ - data/test/text $test_dir/text > $test_dir/wer - } & - done - wait -fi - -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # compute wer - for mode in ${decode_modes}; do - for test_set in test; do - test_dir=$dir/test_${mode} - sed 's:▁: :g' $test_dir/text > $test_dir/text.norm - python tools/compute-wer.py --char=1 --v=1 \ - data/$test_set/text $test_dir/text.norm > $test_dir/wer - done - done -fi - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/tools b/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/tools deleted file mode 100644 index 570c2efd663fd0125c0f115baf89e9f0c3c4433f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/tools +++ /dev/null @@ -1 +0,0 @@ -../../../tools/ \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/wenet b/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/wenet deleted file mode 100644 index 5f46eee4df1252a868b5524c689353f264df6921..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/timit/wenet +++ /dev/null @@ -1 +0,0 @@ -../../../wenet/ \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/README.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/README.md deleted file mode 100644 index eccf85f6497da528ed3c6516a2424e7e86ad6a05..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# conformer based end-to-end model for VKW challenge - -## Standard E2E Results - -Conformer without speed perpurb and lm -* config: conf/train_train_vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char.yaml -* beam: 10 -* num of gpu: 8 -* num of averaged model: 5 -* ctc weight (used for attention rescoring): 0.5 - -dev set results trained only with training set (785 keywords, 1505 hour train set) - -| scenario | Precision | Recall | F1 | ATWV | -|----------|-----------|----------|--------|--------| -| lgv | 0.9281 | 0.6420 | 0.7590 | 0.5183 | -| liv | 0.8886 | 0.6515 | 0.7518 | 0.6050 | -| stv | 0.9120 | 0.7471 | 0.8213 | 0.6256 | - -dev set results trained with training set and finetune set (785 keywords, 1505 hour train set + 15 hour finetune set) - -| scenario | Precision | Recall | F1 | ATWV | -|----------|-----------|----------|--------|--------| -| lgv | 0.9478 | 0.7311 | 0.8255 | 0.6352 | -| liv | 0.9177 | 0.8398 | 0.8770 | 0.7412 | -| stv | 0.9320 | 0.8207 | 0.8729 | 0.7120 | - -test set results trained only with training set (384 keywords, 1505 hour train set) - -| scenario | Precision | Recall | F1 | ATWV | -|----------|-----------|----------|--------|--------| -| lgv | 0.6262 | 
0.5648 | 0.5939 | 0.5825 | -| liv | 0.8797 | 0.6282 | 0.7330 | 0.6061 | -| stv | 0.9102 | 0.7221 | 0.8053 | 0.6682 | - -test set results trained with training set and finetune set (384 keywords, 1505 hour train set + 15 hour finetune set) - -| scenario | Precision | Recall | F1 | ATWV | -|----------|-----------|----------|--------|--------| -| lgv | 0.6469 | 0.6276 | 0.6371 | 0.6116 | -| liv | 0.9278 | 0.7560 | 0.8331 | 0.6927 | -| stv | 0.9434 | 0.8061 | 0.8693 | 0.7275 | diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/conf/combine_finetune_5h_vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/conf/combine_finetune_5h_vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char.yaml deleted file mode 100644 index dc1d25a27ae4c161e77088356be9dc9a7549b586..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/conf/combine_finetune_5h_vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char.yaml +++ /dev/null @@ -1,82 +0,0 @@ -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 8 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: false - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - -# decoder related -decoder: bitransformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 3 - r_num_blocks: 3 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 16 - -grad_clip: 5 -accum_grad: 1 -max_epoch: 100 -log_interval: 400 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/conf/train_vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/conf/train_vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char.yaml deleted file mode 100644 index d2e6c4b6fb1af08da5cfae1a8dd3831d47801d0e..0000000000000000000000000000000000000000 --- 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/conf/train_vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char.yaml +++ /dev/null @@ -1,85 +0,0 @@ -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 8 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: false - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - -# decoder related -decoder: bitransformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 3 - r_num_blocks: 3 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -# use raw_wav or kaldi feature -raw_wav: false - -dataset_conf: - filter_conf: - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: true - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 0.1 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 500 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 16 - -grad_clip: 5 -accum_grad: 1 -max_epoch: 100 -log_interval: 400 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/local/run_finetune_5h.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/local/run_finetune_5h.sh deleted file mode 100644 index 4d6a8a1eda3e26ea797cfed9cb85be25bf5f1a9b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/local/run_finetune_5h.sh +++ /dev/null @@ -1,196 +0,0 @@ -#!/bin/bash -# Copyright 2021 Tencent Inc. (Author: Yougen Yuan). -# Apach 2.0 - -. ./path.sh || exit 1; - -# Use this to control how many gpu you use, It's 1-gpu training if you specify -# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -stage=-1 -stop_stage=0 - -# The num of nodes -num_nodes=1 -# The rank of current node -node_rank=0 - -# data -data=data -dict=data/dict/lang_char.txt -data_type=raw # raw or shard - -train_set=train -dev_set=combine_dev -finetune2_set=combine_finetune_5h -# Optional train_config -name=vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char -train_config=conf/${finetune2_set}_${name}.yaml -cmvn=true -dir=exp/${finetune2_set}_${name}_new -checkpoint= #$dir/0.pt - -# use average_checkpoint will get better result -average_checkpoint=true -decode_checkpoint=$dir/final.pt -average_num=10 - -. 
tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - # Data preparation - local/vkw_data_prep.sh -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - x=finetune_5h - for z in lgv liv stv; do - [ ! -f data/vkw/label/lab_${z}/${x}/wav_ori.scp ] && \ - mv data/vkw/label/lab_${z}/${x}/wav.scp \ - data/vkw/label/lab_${z}/${x}/wav_ori.scp && \ - cut -d " " -f 1,4 data/vkw/label/lab_${z}/${x}/wav_ori.scp \ - > data/vkw/label/lab_${z}/${x}/wav.scp - done - y=`echo $x | cut -d "_" -f 1` - mkdir -p combine_${y} - for f in text wav.scp segments; do - for z in lgv liv stv; do - cat data/vkw/label/lab_${z}/${x}/$f - done > combine_${y}/$f - done - # remove the space between the text labels for Mandarin dataset - # download and transfer to wav.scp - cp data/${finetune2_set}/text data/${finetune2_set}/text.org - paste -d " " <(cut -f 1 -d" " data/${finetune2_set}/text.org) \ - <(cut -f 2- -d" " data/${finetune2_set}/text.org | tr -d " ") \ - > data/${finetune2_set}/text - rm data/${finetune2_set}/text.org -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - echo "stage 1: generate segmented wav.scp and compute cmvn" - ## For wav feature, just copy the data. Fbank extraction is done in training - [ ! -f $data/$finetune2_set/segmentd_wav.scp ] && \ - python tools/segment.py --segments $data/$finetune2_set/segments \ - --input $data/$finetune2_set/wav.scp \ - --output $data/$finetune2_set/segmented_wav.scp -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - echo "Prepare data, prepare required format" - tools/make_raw_list.py --segments $data/$finetune2_set/segments \ - $data/$finetune2_set/wav.scp $data/$finetune2_set/text $data/$finetune2_set/data.list -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - # Training - mkdir -p $dir - INIT_FILE=$dir/ddp_init - # You had better rm it manually before you start run.sh on first node. - # rm -f $INIT_FILE # delete old one before starting - init_method=file://$(readlink -f $INIT_FILE) - echo "$0: init method is $init_method" - # The number of gpus runing on each node/machine - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - # Use "nccl" if it works, otherwise use "gloo" - dist_backend="gloo" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" - cmvn_opts= - $cmvn && cp ${data}/${train_set}/global_cmvn $dir - $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" - # train.py will write $train_config to $dir/train.yaml with model input - # and output dimension, train.yaml will be used for inference or model - # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. - rank=$i ###`expr $node_rank \* $num_gpus + $i` - echo "start training" - [ ! -f exp/train_vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char_new/avg_5.pt ] && \ - echo "Please use a pretrained model for finetuning" && exit 0 - [ ! 
-f $checkpoint ] && \ - cp exp/train_vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char_new/avg_5.pt $checkpoint && \ - cp exp/train_vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char_new/0.yaml $dir/0.yaml - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type $data_type \ - --symbol_table $dict \ - --train_data $data/${finetune2_set}/data.list \ - --cv_data $data/${dev_set}/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ - --ddp.dist_backend $dist_backend \ - --num_workers 4 \ - $cmvn_opts \ - --pin_memory - } & - done - wait -fi - -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$dir/avg_${average_num}.pt - echo "do model average and final checkpoint is $decode_checkpoint" - [ ! -f $decode_checkpoint ] && \ - python3 wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $dir \ - --num ${average_num} \ - --val_best - fi - # Test model, please specify the model you want to use by --checkpoint - sets=${dev_set} - keywords_list=$data/vkw/keyword/kwlist - input_data=$feat_dir/${sets}/data.list - checkpoint=$dir/avg_${average_num}.pt - keyword_results=$dir/keyword_results_${sets} - ctc_results=$dir/ctc_results_${sets} - python3 local/vkw_kws_results.py --gpu 0 \ - --config $dir/train.yaml \ - --data_type $data_type \ - --symbol_table $dict \ - --num_workers 4 \ - --prefetch 32 \ - --input_data $input_data \ - --checkpoint $checkpoint \ - --keyword_unit_dict $keywords_list \ - --keyword_results $keyword_results \ - --ctc_results $ctc_results - - [ ! -f scripts/bin/results_to_score.sh ] && \ - ln -sf data/vkw/scripts scripts && chmod -R 755 scripts - ### attention: install the F4DE tool before testing - for y in "stv" "lgv" "liv"; do - mkdir -p $dir/dev_${y} - #[ ! -f data/vkw/score/dev_${y}/utter_map ] && \ - if [ $y == "lgv" ]; then - grep "TV1" $keyword_results > $dir/dev_${y}/kws_results - elif [ $y == "liv" ]; then - grep "sph_live" $keyword_results > $dir/dev_${y}/kws_results - elif [ $y == "stv" ]; then - grep "sph_video" $keyword_results > $dir/dev_${y}/kws_results - else - "invalid $y" - fi - ./data/vkw/scripts/bin/results_to_score.sh \ - data/vkw/score/dev_${y}/ecf \ - data/vkw/label/lab_${y}/dev_5h/segments \ - data/vkw/score/dev_${y}/utter_map \ - $dir/dev_${y}/kws_results \ - data/vkw/keyword/kwlist.xml \ - data/vkw/score/dev_${y}/rttm - ./data/vkw/scripts/bin/F1.sh \ - $dir/dev_${y}/kws_outputs/f4de_scores_unnormalized/alignment.csv - done -fi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/local/vkw_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/local/vkw_data_prep.sh deleted file mode 100644 index 58eb552e5ba51d6651b12e8505b5f53bd3518fc8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/local/vkw_data_prep.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# Copyright 2021 Tencent Inc. (Author: Yougen Yuan). -# Apach 2.0 - -current_dir=$(pwd) -stage=0 -stop_stage=0 -. ./path.sh || exit 1; - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - cd $current_dir/data/ - [ ! -z vkw_v1.1.zip ] && echo "wget vkw challenge data to this directory" && exit 0 - [ ! 
-z vkw ] && unzip vkw_v1.1.zip - cd $current_dir -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - x=train - [ ! -f data/${x}/text ] && echo "vkw trainset is missing, wget to this directory" && exit 0 -fi - -echo "$0: vkw data preparation succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/local/vkw_kws_results.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/local/vkw_kws_results.py deleted file mode 100644 index 45ac39e7d0e95e69fc7fd30011c9b191b3a5fef0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/local/vkw_kws_results.py +++ /dev/null @@ -1,266 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# Tencent (Yougen Yuan) -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import copy -import logging -import os - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.transformer.asr_model import init_asr_model -from wenet.utils.checkpoint import load_checkpoint - -from wenet.utils.common import get_subsample -from wenet.utils.common import remove_duplicates_and_blank -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.mask import make_pad_mask - - -def map_words2char(word_list_file): - word_unit_dict = {} - word_id_dict = {} - for line in open(word_list_file, mode="r", encoding="utf8"): - ids, keyword = line.split("\n")[0].split() - keyword_char = [] - for i in keyword: - keyword_char.append(i) - word_unit_dict[keyword] = keyword_char - word_id_dict[keyword] = ids - return word_id_dict, word_unit_dict - - -def get_frames_timestamp(alignment): - # convert alignment to a praat format, which is a doing phonetics - # by computer and helps analyzing alignment - timestamp = [] - # get frames level duration for each token - start = 0 - end = 0 - while end < len(alignment): - while end < len(alignment) and alignment[end] == 0: - end += 1 - - if end == len(alignment) and start < end: - if start == 0: - timestamp.append(alignment[start:]) - else: - timestamp[-1] += alignment[start:] - break - - end += 1 - while end < len(alignment) and alignment[end - 1] == alignment[end]: - end += 1 - - timestamp.append(alignment[start:end]) - start = end - return timestamp - - -def get_labformat_frames(timestamp, subsample, char_dict): - begin = 0 - duration = 0 - word_seq = [] - word_time = [] - for idx, t in enumerate(timestamp): - duration = len(t) * subsample - if idx < len(timestamp) - 1: - word_seq.append(char_dict[t[-1]]) - word_time.append([begin, begin + duration]) - else: - non_blank = 0 - token = 0 - for i in t: - if i != 0: - token = i - break - word_seq.append(char_dict[token]) - word_time.append([begin, begin + duration]) - begin = begin + duration - return word_seq, word_time - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='training your network') - 
parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--input_data', required=True, help='cv data file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--ddp.rank', - dest='rank', - default=0, - type=int, - help='global rank for distributed training') - parser.add_argument('--ddp.world_size', - dest='world_size', - default=-1, - type=int, - help='''number of total processes/gpus for - distributed training''') - parser.add_argument('--ddp.dist_backend', - dest='dist_backend', - default='nccl', - choices=['nccl', 'gloo'], - help='distributed backend') - parser.add_argument('--ddp.init_method', - dest='init_method', - default=None, - help='ddp init method') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for reading') - parser.add_argument('--pin_memory', - action='store_true', - default=False, - help='Use pinned memory buffers used for reading') - parser.add_argument('--prefetch', - default=100, - type=int, - help='prefetch number') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument('--keyword_unit_dict', - required=True, - help='keyword id') - parser.add_argument('--keyword_results', - required=True, - help='keyword results') - parser.add_argument('--ctc_results', required=True, help='ctc results') - - args = parser.parse_args() - - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Set random seed - torch.manual_seed(777) - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - distributed = args.world_size > 1 - if distributed: - logging.info('training on multiple gpus, this gpu {}'.format(args.gpu)) - dist.init_process_group(args.dist_backend, - init_method=args.init_method, - world_size=args.world_size, - rank=args.rank) - - symbol_table = read_symbol_table(args.symbol_table) - # Load dict - char_dict = {} - with open(args.symbol_table, mode='r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 - - train_conf = configs['dataset_conf'] - cv_conf = copy.deepcopy(train_conf) - cv_conf['speed_perturb'] = False - cv_conf['spec_aug'] = False - - cv_dataset = Dataset(args.data_type, - args.input_data, - symbol_table, - cv_conf, - None, - partition=False) - - cv_data_loader = DataLoader(cv_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - - print("Reading: ", args.keyword_unit_dict) - word_id_dict, word_unit_dict = map_words2char(args.keyword_unit_dict) - word_unit_list = list(word_unit_dict.keys()) - print("word_unit_list has the size of %d" % (len(word_unit_list))) - - # Init asr model from configs - model = init_asr_model(configs) - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - f_keyword_results = open(args.keyword_results, 'w', encoding='utf-8') - f_ctc_results = open(args.ctc_results, 'w', encoding='utf-8') - with 
torch.no_grad(): - for batch_idx, batch in enumerate(cv_data_loader): - key, feat, target, feats_length, target_length = batch - feat = feat.to(device) - target = target.to(device) - feats_length = feats_length.to(device) - target_length = target_length.to(device) - # Let's assume B = batch_size and N = beam_size - # 1. Encoder - encoder_out, encoder_mask = model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - batch_size = encoder_out.size(0) - ctc_probs = model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen) - alignment = [hyp.tolist() for hyp in topk_index] - hyps = [remove_duplicates_and_blank(hyp) for hyp in alignment] - for index, i in enumerate(key): - content = [] - if len(hyps[index]) > 0: - for w in hyps[index]: - if w == eos: - break - content.append(char_dict[w]) - f_ctc_results.write('{} {}\n'.format(i, " ".join(content))) - f_ctc_results.flush() - for index, i in enumerate(key): - timestamp = get_frames_timestamp(alignment[index]) - subsample = get_subsample(configs) - word_seq, word_time = get_labformat_frames( - timestamp, subsample, char_dict) - for index_j in range(len(word_seq)): - for keyword in word_unit_list: - keyword_len = len(word_unit_dict[keyword]) - if index_j + keyword_len > len(word_seq): - continue - if (word_seq[index_j:index_j + - keyword_len] == word_unit_dict[keyword]): - f_keyword_results.write("{} {} {} {} {}\n".format( - word_id_dict[keyword], i, - word_time[index_j][0], - word_time[index_j + keyword_len - 1][1], 0.0)) - f_keyword_results.flush() - f_keyword_results.close() - f_ctc_results.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/path.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/path.sh deleted file mode 100644 index 73fc1c56602086182f66201870e28d46a0cada55..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export WENET_DIR=$PWD/../../.. -export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build -export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix -export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/run.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/run.sh deleted file mode 100644 index 4c3cf1bcfd42e909f090b0d159cbb1c4fb10a385..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/run.sh +++ /dev/null @@ -1,220 +0,0 @@ -#!/bin/bash -# Copyright 2021 Tencent Inc. (Author: Yougen Yuan). -# Apach 2.0 - -. 
./path.sh || exit 1; - -# Use this to control how many gpu you use, It's 1-gpu training if you specify -# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -stage=-1 -stop_stage=0 - -# The num of nodes -num_nodes=1 -# The rank of current node -node_rank=0 - -# data -data=data -dict=data/dict/lang_char.txt -data_type=raw # raw or shard - -train_set=train -dev_set=combine_dev -# Optional train_config -name=vkw_bidirect_12conformer_hs2048_output256_att4_conv2d_char -train_config=conf/train_${name}.yaml -cmvn=true -dir=exp/train_${name}_new -checkpoint= #$dir/0.pt - -# use average_checkpoint will get better result -average_checkpoint=true -decode_checkpoint=$dir/final.pt -average_num=10 - -. tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - # Data preparation - local/vkw_data_prep.sh -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - x=dev_5h - for z in lgv liv stv; do - [ ! -f data/vkw/label/lab_${z}/${x}/wav_ori.scp ] && \ - mv data/vkw/label/lab_${z}/${x}/wav.scp \ - data/vkw/label/lab_${z}/${x}/wav_ori.scp && \ - cut -d " " -f 1,4 data/vkw/label/lab_${z}/${x}/wav_ori.scp \ - > data/vkw/label/lab_${z}/${x}/wav.scp - done - y=`echo $x | cut -d "_" -f 1` - mkdir -p combine_${y} - for f in text wav.scp segments; do - for z in lgv liv stv; do - cat data/vkw/label/lab_${z}/${x}/$f - done > combine_${y}/$f - done - # remove the space between the text labels for Mandarin dataset - # download and transfer to wav.scp - for x in ${dev_set} ${train_set}; do - cp data/${x}/text data/${x}/text.org - paste -d " " <(cut -f 1 -d" " data/${x}/text.org) <(cut -f 2- -d" " \ - data/${x}/text.org | tr -d " ") > data/${x}/text - rm data/${x}/text.org - done -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - echo "stage 1: generate segmented wav.scp and compute cmvn" - ## For wav feature, just copy the data. Fbank extraction is done in training - for x in ${dev_set} ${train_set}; do - [ ! -f $data/$x/segmentd_wav.scp ] && \ - python tools/segment.py --segments $data/$x/segments \ - --input $data/$x/wav.scp \ - --output $data/$x/segmented_wav.scp - done - - ### generate global_cmvn using training set - tools/compute_cmvn_stats.py --num_workers 12 --train_config $train_config \ - --in_scp $data/${train_set}/segmented_wav.scp \ - --out_cmvn $data/$train_set/global_cmvn -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # Make train dict - echo "Make a dictionary" - mkdir -p $(dirname $dict) - echo " 0" > ${dict} # 0 will be used for "blank" in CTC - echo " 1" >> ${dict} # must be 1 - - tools/text2token.py -s 1 -n 1 $data/${train_set}/text | cut -f 2- -d" " | \ - tr " " "\n" | sort | uniq | grep -a -v -e '^\s*$' | grep -P '[\p{Han}]'\ - | awk '{print $0 " " NR+1}' >> ${dict} - - num_token=$(cat $dict | wc -l) - echo " $num_token" >> $dict # -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - echo "Prepare data, prepare required format" - for x in ${dev_set} ${train_set}; do - tools/make_raw_list.py --segments $data/$x/segments \ - $data/$x/wav.scp $data/$x/text $data/$x/data.list - done -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - # Training - mkdir -p $dir - INIT_FILE=$dir/ddp_init - # You had better rm it manually before you start run.sh on first node. 
- # rm -f $INIT_FILE # delete old one before starting - init_method=file://$(readlink -f $INIT_FILE) - echo "$0: init method is $init_method" - # The number of gpus runing on each node/machine - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - # Use "nccl" if it works, otherwise use "gloo" - dist_backend="gloo" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" - cmvn_opts= - $cmvn && cp ${data}/${train_set}/global_cmvn $dir - $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" - # train.py will write $train_config to $dir/train.yaml with model input - # and output dimension, train.yaml will be used for inference or model - # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. - rank=$i ###`expr $node_rank \* $num_gpus + $i` - echo "start training" - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type $data_type \ - --symbol_table $dict \ - --train_data $data/$train_set/data.list \ - --cv_data $data/${dev_set}/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ - --ddp.dist_backend $dist_backend \ - --num_workers 4 \ - $cmvn_opts \ - --pin_memory - } & - done - wait -fi - -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$dir/avg_${average_num}.pt - echo "do model average and final checkpoint is $decode_checkpoint" - [ ! -f $decode_checkpoint ] && \ - python3 wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $dir \ - --num ${average_num} \ - --val_best - fi - # Test model, please specify the model you want to use by --checkpoint - sets=${dev_set} - keywords_list=$data/vkw/keyword/kwlist - input_data=$feat_dir/${sets}/data.list - checkpoint=$dir/avg_${average_num}.pt - keyword_results=$dir/keyword_results_${sets} - ctc_results=$dir/ctc_results_${sets} - python3 local/vkw_kws_results.py --gpu 0 \ - --config $dir/train.yaml \ - --data_type $data_type \ - --symbol_table $dict \ - --num_workers 4 \ - --prefetch 32 \ - --input_data $input_data \ - --checkpoint $checkpoint \ - --keyword_unit_dict $keywords_list \ - --keyword_results $keyword_results \ - --ctc_results $ctc_results - - [ ! -f scripts/bin/results_to_score.sh ] && \ - ln -sf data/vkw/scripts scripts && chmod -R 755 scripts - ### attention: install the F4DE tool before testing - for y in "stv" "lgv" "liv"; do - mkdir -p $dir/dev_${y} - #[ ! 
-f data/vkw/score/dev_${y}/utter_map ] && \ - if [ $y == "lgv" ]; then - grep "TV1" $keyword_results > $dir/dev_${y}/kws_results - elif [ $y == "liv" ]; then - grep "sph_live" $keyword_results > $dir/dev_${y}/kws_results - elif [ $y == "stv" ]; then - grep "sph_video" $keyword_results > $dir/dev_${y}/kws_results - else - "invalid $y" - fi - ./data/vkw/scripts/bin/results_to_score.sh \ - data/vkw/score/dev_${y}/ecf \ - data/vkw/label/lab_${y}/dev_5h/segments \ - data/vkw/score/dev_${y}/utter_map \ - $dir/dev_${y}/kws_results \ - data/vkw/keyword/kwlist.xml \ - data/vkw/score/dev_${y}/rttm - ./data/vkw/scripts/bin/F1.sh \ - $dir/dev_${y}/kws_outputs/f4de_scores_unnormalized/alignment.csv - done -fi - -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - echo "adding 5h finetune data for each scenario to obtain better results" - local/run_finetune_5h.sh -fi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/alignment.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/alignment.sh deleted file mode 100644 index 64d860bb61761cadca750c9baf91eddb49e56728..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/alignment.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -stage=0 # start from 0 if you need to start from data preparation -stop_stage=0 - -nj=16 -feat_dir=raw_wav -dict=data/dict/lang_char.txt - -dir=exp/ -config=$dir/train.yaml -checkpoint= -checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt -config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml -set= -ali_format=$feat_dir/$set/format.data -ali_format=format.data -ali_result=$dir/ali - -. tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - nj=32 - # Prepare required data for ctc alignment - echo "Prepare data, prepare required format" - for x in $set; do - tools/format_data.sh --nj ${nj} \ - --feat-type wav --feat $feat_dir/$x/wav.scp \ - $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp - - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Test model, please specify the model you want to use by --checkpoint - python wenet/bin/alignment_deprecated.py --gpu -1 \ - --config $config \ - --input_file $ali_format \ - --checkpoint $checkpoint \ - --batch_size 1 \ - --dict $dict \ - --result_file $ali_result \ - -fi - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/analyze_dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/analyze_dataset.py deleted file mode 100644 index d4373b065c301972fe0164b6df3591166000acfc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/analyze_dataset.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Analyze Dataset, Duration/TextLength/Speed etc. - -Usage: -. ./path.sh && python3 tools/analyze_dataset.py \ - --data_type "shard" \ - --data_list data/test/data.list \ - --output_dir exp/analyze_test \ - --num_thread 32 -""" - -import os -import json -import math -import time -import numpy -import logging -import librosa -import tarfile -import argparse -import torchaudio -import multiprocessing - -from wenet.utils.file_utils import read_lists -from wenet.dataset.processor import AUDIO_FORMAT_SETS - - -def get_args(): - parser = argparse.ArgumentParser(description='Analyze dataset') - parser.add_argument('--data_type', - default='wav_scp', - choices=['wav_scp', 'raw', 'shard'], - help='dataset type') - parser.add_argument('--output_dir', type=str, - default="exp", help='write info to output dir') - parser.add_argument('--data_list', default=None, - help='used in raw/shard mode') - parser.add_argument('--wav_scp', default=None, - help='used in wav_scp mode') - parser.add_argument('--text', default=None, - help='used in wav_scp mode') - parser.add_argument('--num_thread', type=int, - default=4, help='number of threads') - args = parser.parse_args() - print(args) - return args - - -def analyze(datas, output_file, thread_id): - with open(output_file, "w", encoding='utf8') as f: - for i, data in enumerate(datas): - if type(data['wav']) is numpy.ndarray: - y, sample_rate = data['wav'], data['sample_rate'] - data['wav'] = "None" # NOTE(xcsong): Do not save wav. 
- elif type(data['wav'] is str): - y, sample_rate = librosa.load(data['wav'], sr=16000) - data['dur'] = len(y) / sample_rate - data['txt_length'] = len(data['txt']) - data['speed'] = data['txt_length'] / data['dur'] - # Trim the beginning and ending silence - _, index = librosa.effects.trim(y, top_db=30) - data['leading_sil'] = librosa.get_duration( - y=y[:index[0]], sr=16000) * 1000 if index[0] > 0 else 0 - data['trailing_sil'] = librosa.get_duration( - y=y[index[1]:], sr=16000) * 1000 if index[1] < len(y) else 0 - data_str = json.dumps(data, ensure_ascii=False) - f.write("{}\n".format(data_str)) - if thread_id == 0 and i % 100 == 0: - logging.info("\tThread-{}: processed {}/{}".format( - thread_id, i, len(datas))) - - -def read_tar(file): - try: - with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream: - prev_prefix = None - data = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - data['key'] = prev_prefix - if valid: - yield data - data = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - data['txt'] = file_obj.read().decode( - 'utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load( - file_obj) - # single channel - data['wav'] = waveform.numpy()[0, :] - data['sample_rate'] = sample_rate - else: - data[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning( - 'error: {} when parse {}'.format(ex, name)) - prev_prefix = prefix - # The last data in tar - if prev_prefix is not None: - data['key'] = prev_prefix - yield data - except Exception as ex: - logging.warning( - 'tar_file error: {} when processing {}'.format(ex, file)) - - -def main(): - start_time = time.time() - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.makedirs(args.output_dir, exist_ok=True) - os.makedirs(args.output_dir + "/partition", exist_ok=True) - datas = [[] for i in range(args.num_thread)] - - logging.info("Stage-1: Loading data.list OR wav.scp...") - if args.data_type == "shard": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - total = 0 - for line in lists: - for data in read_tar(line): - datas[total % args.num_thread].append(data) - total = total + 1 - elif args.data_type == "raw": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - for i, line in enumerate(lists): - data = json.loads(line) - datas[i % args.num_thread].append(data) - elif args.data_type == "wav_scp": - assert args.wav_scp is not None - assert args.text is not None - wavs, texts = {}, {} - # wavs - for line in read_lists(args.wav_scp): - line = line.strip().split() - wavs[line[0]] = line[1] - # texts - for line in read_lists(args.text): - line = line.strip().split(maxsplit=1) - texts[line[0]] = line[1] - sorted(wavs) - sorted(texts) - # partition - for i, (key1, key2) in enumerate(zip(wavs, texts)): - assert key1 == key2 - datas[i % args.num_thread].append( - {'key': key1, "wav": wavs[key1], "txt": texts[key1]} - ) - - logging.info("Stage-2: Start Analyze") - # threads - pool = multiprocessing.Pool(processes=args.num_thread) - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - pool.apply_async(analyze, (datas[i], output_file, i)) - pool.close() - 
pool.join() - - logging.info("Stage-3: Sort and Write Result") - datas = [] - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - with open(output_file, "r", encoding='utf8') as f: - for line in f.readlines(): - data = json.loads(line) - datas.append(data) - total_dur = sum([x['dur'] for x in datas]) - total_len = sum([x['txt_length'] for x in datas]) - total_leading_sil = sum([x['leading_sil'] for x in datas]) - total_trailing_sil = sum([x['trailing_sil'] for x in datas]) - num_datas = len(datas) - names = ['key', 'dur', 'txt_length', 'speed', - 'leading_sil', 'trailing_sil'] - units = ['', 's', '', 'char/s', 'ms', 'ms'] - avgs = [0, total_dur / num_datas, total_len / num_datas, - total_len / total_dur, total_leading_sil / num_datas, - total_trailing_sil / num_datas] - stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]), - sum([(x['txt_length'] - avgs[2])**2 for x in datas]), - sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]), - sum([(x['leading_sil'] - avgs[4])**2 for x in datas]), - sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])] - stds = [math.sqrt(x / num_datas) for x in stds] - parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min'] - index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - - with open(args.output_dir + "/analyze_result_brief", - "w", encoding='utf8') as f: - for i, (name, unit, avg, std) in enumerate( - zip(names, units, avgs, stds)): - if name == 'key': - continue - f.write("==================\n") - - datas.sort(key=lambda x: x[name]) - for p, j in zip(parts, index): - f.write("{} {}: {:.3f} {} (wav_id: {})\n".format( - p, name, datas[j][name], unit, datas[j]['key'])) - f.write("avg {}: {:.3f} {}\n".format( - name, avg, unit)) - f.write("std {}: {:.3f}\n".format( - name, std)) - os.system("cat {}".format(args.output_dir + "/analyze_result_brief")) - - datas.sort(key=lambda x: x['dur']) - with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f: - for data in datas: - f.write("{}\n".format(json.dumps(data, ensure_ascii=False))) - - end_time = time.time() - logging.info("Time Cost: {:.3f}s".format(end_time - start_time)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/cmvn_kaldi2json.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/cmvn_kaldi2json.py deleted file mode 100644 index 9966046c95a9d50438c4857b785cb7985182e376..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/cmvn_kaldi2json.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import logging -import sys -import json - -def kaldi2json(kaldi_cmvn_file): - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - cmvn_info = 
{'mean_stat:' : means, - 'var_stat' : variance, - 'frame_num' : count} - return cmvn_info - -if __name__ == '__main__': - with open(sys.argv[2], 'w') as fout: - cmvn = kaldi2json(sys.argv[1]) - fout.write(json.dumps(cmvn)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/combine_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/combine_data.sh deleted file mode 100644 index 8a56c43f1a2a238d78270f94f3d22f1af540e912..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/combine_data.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2014 David Snyder - -# This script combines the data from multiple source directories into -# a single destination directory. - -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information -# about what these directories contain. - -# Begin configuration section. -extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." -skip_fix=false # skip the fix_data_dir.sh in the end -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi - -if [ $# -lt 2 ]; then - echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." - echo "Note, files that don't appear in all source dirs will not be combined," - echo "with the exception of utt2uniq and segments, which are created where necessary." - exit 1 -fi - -dest=$1; -shift; - -first_src=$1; - -rm -r $dest 2>/dev/null -mkdir -p $dest; - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/utt2spk ]; then - echo "$0: no such file $dir/utt2spk" - exit 1; - fi -done - -# Check that frame_shift are compatible, where present together with features. -dir_with_frame_shift= -for dir in $*; do - if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then - if [[ $dir_with_frame_shift ]] && - ! cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then - echo "$0:error: different frame_shift in directories $dir and " \ - "$dir_with_frame_shift. Cannot combine features." - exit 1; - fi - dir_with_frame_shift=$dir - fi -done - -# W.r.t. utt2uniq file the script has different behavior compared to other files -# it is not compulsary for it to exist in src directories, but if it exists in -# even one it should exist in all. We will create the files where necessary -has_utt2uniq=false -for in_dir in $*; do - if [ -f $in_dir/utt2uniq ]; then - has_utt2uniq=true - break - fi -done - -if $has_utt2uniq; then - # we are going to create an utt2uniq file in the destdir - for in_dir in $*; do - if [ ! -f $in_dir/utt2uniq ]; then - # we assume that utt2uniq is a one to one mapping - cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' - else - cat $in_dir/utt2uniq - fi - done | sort -k1 > $dest/utt2uniq - echo "$0: combined utt2uniq" -else - echo "$0 [info]: not combining utt2uniq as it does not exist" -fi -# some of the old scripts might provide utt2uniq as an extrafile, so just remove it -extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") - -# segments are treated similarly to utt2uniq. If it exists in some, but not all -# src directories, then we generate segments where necessary. 
-has_segments=false -for in_dir in $*; do - if [ -f $in_dir/segments ]; then - has_segments=true - break - fi -done - -if $has_segments; then - for in_dir in $*; do - if [ ! -f $in_dir/segments ]; then - echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 - utils/data/get_segments_for_data.sh $in_dir - else - cat $in_dir/segments - fi - done | sort -k1 > $dest/segments - echo "$0: combined segments" -else - echo "$0 [info]: not combining segments as it does not exist" -fi - -for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do - exists_somewhere=false - absent_somewhere=false - for d in $*; do - if [ -f $d/$file ]; then - exists_somewhere=true - else - absent_somewhere=true - fi - done - - if ! $absent_somewhere; then - set -o pipefail - ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; - set +o pipefail - echo "$0: combined $file" - else - if ! $exists_somewhere; then - echo "$0 [info]: not combining $file as it does not exist" - else - echo "$0 [info]: **not combining $file as it does not exist everywhere**" - fi - fi -done - -tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt - -if [[ $dir_with_frame_shift ]]; then - cp $dir_with_frame_shift/frame_shift $dest -fi - -if ! $skip_fix ; then - tools/fix_data_dir.sh $dest || exit 1; -fi - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/compute-cer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/compute-cer.py deleted file mode 100644 index a0a8f8fe1f59251c5d8fefeb62ef469276fc6063..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/compute-cer.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import sys -import unicodedata -import codecs - -remove_tag = True -spacelist = [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - # https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': - sep = '>' - j = i + 1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c == sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: - return '' - chars = [] - i = 0 - T = len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - if x.isalnum(): - for k in x: - new_sentence.append(k) - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i - 1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j - 1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, - 'ins': 0, 'del': 0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = 
result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i={i} , j={j} , \ - error={error}'. - format(i=i, j=j, error=self.space[i][j]['error'])) - return result - - def overall(self) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def cluster(self, data) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [unicodedata.name(char) for char in word] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names) - 1) : - if unicode_names[i] != unicode_names[i + 1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) \ - and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] \ - [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \ - [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose = 1 - padding_symbol = ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose = 0 - try: - verbose = int(b) - except Exception: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol = ' ' - elif b == 'underline': - padding_symbol = '_' - continue - if True or sys.argv[1].startswith('-'): - # ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig = set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, - case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : 
- if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array) == 0: - continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length - len_lab) - space['rec'].append(length - len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end=' ') - else: - print('lab:', end=' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['lab'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end=' ') - else: - print('rec:', end=' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['rec'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===================================================' - '========================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster(k for k in default_clusters[cluster_id]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 
100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif (token[0] == '<' and token[len(token) - 1] == '>' and - cluster_id == ''): - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('=======================================' - '====================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/compute-wer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/compute-wer.py deleted file mode 100644 index a3eefc0dc7b67f252e685da71a5189312e74ef85..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/compute-wer.py +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import re, sys, unicodedata -import codecs - -remove_tag = True -spacelist= [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - #https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - 
j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array)==0: continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) 
- else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===========================================================================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster([ k for k in default_clusters[cluster_id] ]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - 
print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif token[0] == '<' and token[len(token)-1] == '>' and \ - cluster_id == '' : - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('===========================================================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/compute_cmvn_stats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/compute_cmvn_stats.py deleted file mode 100644 index 9c89789c47be0c855939469e86040f10398e9d89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/compute_cmvn_stats.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys -import argparse -import json -import codecs -import yaml - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.utils.data import Dataset, DataLoader - -torchaudio.set_audio_backend("sox_io") - - -class CollateFunc(object): - ''' Collate function for AudioDataset - ''' - - def __init__(self, feat_dim, resample_rate): - self.feat_dim = feat_dim - self.resample_rate = resample_rate - pass - - def __call__(self, batch): - mean_stat = torch.zeros(self.feat_dim) - var_stat = torch.zeros(self.feat_dim) - number = 0 - for item in batch: - value = item[1].strip().split(",") - assert len(value) == 3 or len(value) == 1 - wav_path = value[0] - sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate - resample_rate = sample_rate - # len(value) == 3 means segmented wav.scp, - # len(value) == 1 means original wav.scp - if len(value) == 3: - start_frame = int(float(value[1]) * sample_rate) - end_frame = int(float(value[2]) * sample_rate) - waveform, sample_rate = torchaudio.backend.sox_io_backend.load( - filepath=wav_path, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(item[1]) - - waveform = waveform * (1 << 15) - if self.resample_rate != 0 and self.resample_rate != sample_rate: - resample_rate = self.resample_rate - waveform = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - - mat = kaldi.fbank(waveform, - num_mel_bins=self.feat_dim, - dither=0.0, - energy_floor=0.0, - sample_frequency=resample_rate) - mean_stat += torch.sum(mat, axis=0) - var_stat += torch.sum(torch.square(mat), axis=0) - number += mat.shape[0] - return number, mean_stat, var_stat - - -class AudioDataset(Dataset): - def __init__(self, data_file): - self.items = [] - with codecs.open(data_file, 'r', encoding='utf-8') as f: - for line in f: - arr = line.strip().split() - self.items.append((arr[0], arr[1])) - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - return self.items[idx] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='extract CMVN stats') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for processing') - parser.add_argument('--train_config', - default='', - help='training yaml conf') - parser.add_argument('--in_scp', default=None, help='wav scp file') - parser.add_argument('--out_cmvn', - 
default='global_cmvn', - help='global cmvn file') - - doc = "Print log after every log_interval audios are processed." - parser.add_argument("--log_interval", type=int, default=1000, help=doc) - args = parser.parse_args() - - with open(args.train_config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - resample_rate = 0 - if 'resample_conf' in configs['dataset_conf']: - resample_rate = configs['dataset_conf']['resample_conf']['resample_rate'] - print('using resample and new sample rate is {}'.format(resample_rate)) - - collate_func = CollateFunc(feat_dim, resample_rate) - dataset = AudioDataset(args.in_scp) - batch_size = 20 - data_loader = DataLoader(dataset, - batch_size=batch_size, - shuffle=True, - sampler=None, - num_workers=args.num_workers, - collate_fn=collate_func) - - with torch.no_grad(): - all_number = 0 - all_mean_stat = torch.zeros(feat_dim) - all_var_stat = torch.zeros(feat_dim) - wav_number = 0 - for i, batch in enumerate(data_loader): - number, mean_stat, var_stat = batch - all_mean_stat += mean_stat - all_var_stat += var_stat - all_number += number - wav_number += batch_size - - if wav_number % args.log_interval == 0: - print(f'processed {wav_number} wavs, {all_number} frames', - file=sys.stderr, - flush=True) - - cmvn_info = { - 'mean_stat': list(all_mean_stat.tolist()), - 'var_stat': list(all_var_stat.tolist()), - 'frame_num': all_number - } - - with open(args.out_cmvn, 'w') as fout: - fout.write(json.dumps(cmvn_info)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/compute_fbank_feats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/compute_fbank_feats.py deleted file mode 100644 index 4cc7dae54de6e8b24b14148bd3930d19b4d7b28c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/compute_fbank_feats.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging - -import torchaudio -import torchaudio.compliance.kaldi as kaldi - -import wenet.dataset.kaldi_io as kaldi_io - -# The "sox" backends are deprecated and will be removed in 0.9.0 release. 
-# So here we use sox_io backend -torchaudio.set_audio_backend("sox_io") - - -def parse_opts(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--num_mel_bins', - default=80, - type=int, - help='Number of triangular mel-frequency bins') - parser.add_argument('--frame_length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument('--frame_shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument('--dither', - type=int, - default=0.0, - help='Dithering constant (0.0 means no dither)') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_scp', help='wav scp file') - parser.add_argument('out_ark', help='output ark file') - parser.add_argument('out_scp', help='output scp file') - args = parser.parse_args() - return args - - -# wav format: -def load_wav_scp(wav_scp_file): - wav_list = [] - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_list.append((arr[0], arr[1])) - return wav_list - - -# wav format: -def load_wav_scp_dict(wav_scp_file): - wav_dict = {} - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_dict[arr[0]] = arr[1] - return wav_dict - - -# Segments format: -def load_wav_segments(wav_scp_file, segments_file): - wav_dict = load_wav_scp_dict(wav_scp_file) - audio_list = [] - with open(segments_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - key = arr[0] - wav_file = wav_dict[arr[1]] - start = float(arr[2]) - end = float(arr[3]) - audio_list.append((key, wav_file, start, end)) - return audio_list - - -if __name__ == '__main__': - args = parse_opts() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - if args.segments is None: - audio_list = load_wav_scp(args.wav_scp) - else: - audio_list = load_wav_segments(args.wav_scp, args.segments) - - count = 0 - with open(args.out_ark, 'wb') as ark_fout, \ - open(args.out_scp, 'w', encoding='utf8') as scp_fout: - for item in audio_list: - if len(item) == 2: - key, wav_path = item - waveform, sample_rate = torchaudio.load_wav(wav_path) - else: - assert len(item) == 4 - key, wav_path, start, end = item - sample_rate = torchaudio.info(wav_path).sample_rate - frame_offset = int(start * sample_rate) - num_frames = int((end - start) * sample_rate) - waveform, sample_rate = torchaudio.load_wav( - wav_path, frame_offset, num_frames) - - mat = kaldi.fbank(waveform, - num_mel_bins=args.num_mel_bins, - frame_length=args.frame_length, - frame_shift=args.frame_shift, - dither=args.dither, - energy_floor=0.0, - sample_frequency=sample_rate) - mat = mat.detach().numpy() - kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout) - count += 1 - if count % 10000 == 0: - logging.info('Progress {}/{}'.format(count, len(audio_list))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/copy_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/copy_data_dir.sh deleted file mode 100644 index ee880c4c3ca398a58a4e306467c639b0a76310bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/copy_data_dir.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins University (author: 
Daniel Povey) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# feats.scp -# wav.scp -# vad.scp -# spk2utt -# utt2spk -# text -# -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance and/or speaker names. Note, the recording-ids stay the same. -# - - -# begin configuration section -spk_prefix= -utt_prefix= -spk_suffix= -utt_suffix= -validate_opts= # should rarely be needed. -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" - echo "Options" - echo " --spk-prefix= # Prefix for speaker ids, default empty" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --spk-suffix= # Suffix for speaker ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -srcdir=$1 -destdir=$2 - -if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" - exit 1; -fi - -if [ "$destdir" == "$srcdir" ]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -set -e; - -mkdir -p $destdir - -cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map -cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map - -if [ ! -f $srcdir/utt2uniq ]; then - if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then - cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq - fi -else - cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq -fi - -cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ - utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk - -utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt - -if [ -f $srcdir/feats.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp -fi - -if [ -f $srcdir/vad.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp -fi - -if [ -f $srcdir/segments ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments - cp $srcdir/wav.scp $destdir -else # no segments->wav indexed by utt. 
- if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/utt2num_frames ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in frame_shift stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -echo $validate_opts -echo $destdir -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/decode.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/decode.sh deleted file mode 100644 index 1d49b0e48631f4818fb9c464df66904170275a33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/decode.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: binbinzhang@mobvoi.com (Binbin Zhang) -export GLOG_logtostderr=1 -export GLOG_v=2 - -set -e - -nj=1 -chunk_size=-1 -ctc_weight=0.0 -reverse_weight=0.0 -rescoring_weight=1.0 -# For CTC WFST based decoding -fst_path= -dict_path= -acoustic_scale=1.0 -beam=15.0 -lattice_beam=12.0 -min_active=200 -max_active=7000 -blank_skip_thresh=1.0 -length_penalty=0.0 - -. tools/parse_options.sh || exit 1; -if [ $# != 5 ]; then - echo "Usage: $0 [options] " - exit 1; -fi - -if ! which decoder_main > /dev/null; then - echo "decoder_main is not built, please go to runtime/libtorch to build it." - exit 1; -fi - -scp=$1 -label_file=$2 -model_file=$3 -unit_file=$4 -dir=$5 - -mkdir -p $dir/split${nj} - -# Step 1. Split wav.scp -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp" -done -tools/data/split_scp.pl ${scp} ${split_scps} - -# Step 2. Parallel decoding -wfst_decode_opts= -if [ ! 
-z $fst_path ]; then - wfst_decode_opts="--fst_path $fst_path" - wfst_decode_opts="$wfst_decode_opts --beam $beam" - wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path" - wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam" - wfst_decode_opts="$wfst_decode_opts --max_active $max_active" - wfst_decode_opts="$wfst_decode_opts --min_active $min_active" - wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale" - wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh" - wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty" - echo $wfst_decode_opts > $dir/config -fi -for n in $(seq ${nj}); do -{ - decoder_main \ - --rescoring_weight $rescoring_weight \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --chunk_size $chunk_size \ - --wav_scp ${dir}/split${nj}/wav.${n}.scp \ - --model_path $model_file \ - --unit_path $unit_file \ - $wfst_decode_opts \ - --result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log -} & -done -wait - -# Step 3. Merge files -for n in $(seq ${nj}); do - cat ${dir}/split${nj}/${n}.text -done > ${dir}/text -tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf - -# Step 4. Compute WER -python3 tools/compute-wer.py --char=1 --v=1 \ - $label_file $dir/text > $dir/wer diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/feat_to_shape.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/feat_to_shape.sh deleted file mode 100644 index ab6d45c60709dd05a38f8da269d617233d0d39f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/feat_to_shape.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Begin configuration section. -nj=4 -cmd=run.pl -verbose=0 -filetype="" -preprocess_conf="" -# End configuration section. - -help_message=$(cat << EOF -Usage: $0 [options] [] -e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) - -echo "$0 $*" 1>&2 # Print the command line for logging - -. parse_options.sh || exit 1; - -if [ $# -lt 2 ] || [ $# -gt 3 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -scp=$1 -outscp=$2 -data=$(dirname ${scp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${logdir}/feats.${n}.scp" -done - -utils/split_scp.pl ${scp} ${split_scps} - -if [ -n "${preprocess_conf}" ]; then - preprocess_opt="--preprocess-conf ${preprocess_conf}" -else - preprocess_opt="" -fi -if [ -n "${filetype}" ]; then - filetype_opt="--filetype ${filetype}" -else - filetype_opt="" -fi - -${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ - feat-to-len --verbose=${verbose} \ - scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp - -feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -) - -# concatenate the .scp files together. 
-for n in $(seq ${nj}); do - sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp -done > ${outscp} - -rm -f ${logdir}/feats.*.scp 2>/dev/null diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/filter_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/filter_scp.pl deleted file mode 100644 index b76d37f41be0886470281978bfacf97f6b8ae976..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/filter_scp.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose "n-th" field is an utterance id), printing -# out only those lines whose "n-th" field is in id_list. The index of -# the "n-th" field is 1, by default, but can be changed by using -# the -f switch - -$exclude = 0; -$field = 1; -$shifted = 0; - -do { - $shifted=0; - if ($ARGV[0] eq "--exclude") { - $exclude = 1; - shift @ARGV; - $shifted=1; - } - if ($ARGV[0] eq "-f") { - $field = $ARGV[1]; - shift @ARGV; shift @ARGV; - $shifted=1 - } -} while ($shifted); - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . - "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . - "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . - "only the lines that were *not* in id_list.\n" . - "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . - "If your older scripts (written before Oct 2014) stopped working and you used the\n" . - "-f option, add 1 to the argument.\n" . - "See also: utils/filter_scp.pl .\n"; -} - - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -if ($field == 1) { # Treat this as special case, since it is common. - while(<>) { - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { - print $_; - } - } -} else { - while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { - print $_; - } - } -} - -# tests: -# the following should print "foo 1" -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) -# the following should print "bar 2". 
-# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fix_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fix_data_dir.sh deleted file mode 100644 index d1644c1cac4264c78eae7d91b03c4126baf7ec4c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fix_data_dir.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# This script makes sure that only the segments present in -# all of "feats.scp", "wav.scp" [if present], segments [if present] -# text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into -# data-dir/.backup - -cmd="$@" - -utt_extra_files= -spk_extra_files= - -. tools/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: utils/data/fix_data_dir.sh " - echo "e.g.: utils/data/fix_data_dir.sh data/train" - echo "This script helps ensure that the various files in a data directory" - echo "are correctly sorted and filtered, for example removing utterances" - echo "that have no features (if feats.scp is present)" - exit 1 -fi - -data=$1 - -if [ -f $data/images.scp ]; then - image/fix_data_dir.sh $cmd - exit $? -fi - -mkdir -p $data/.backup - -[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; - -[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; - -set -e -o pipefail -u - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted { - file=$1 - sort -k1,1 -u <$file >$file.tmp - if ! cmp -s $file $file.tmp; then - echo "$0: file $1 is not in sorted order or not unique, sorting it" - mv $file.tmp $file - else - rm $file.tmp - fi -} - -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - check_sorted $data/$x - fi -done - - -function filter_file { - filter=$1 - file_to_filter=$2 - cp $file_to_filter ${file_to_filter}.tmp - tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter - if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=$(cat ${file_to_filter}.tmp | wc -l) - length2=$(cat ${file_to_filter} | wc -l) - if [ $length1 -ne $length2 ]; then - echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." - fi - fi - rm $file_to_filter.tmp -} - -function filter_recordings { - # We call this once before the stage when we filter on utterance-id, and once - # after. - - if [ -f $data/segments ]; then - # We have a segments file -> we need to filter this and the file wav.scp, and - # reco2file_and_utt, if it exists, to make sure they have the same list of - # recording-ids. - - if [ ! -f $data/wav.scp ]; then - echo "$0: $data/segments exists but not $data/wav.scp" - exit 1; - fi - awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=$(cat $tmpdir/recordings | wc -l) - [ ! -s $tmpdir/recordings ] && \ - echo "Empty list of recordings (bad file $data/segments)?" 
&& exit 1; - tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp - mv $tmpdir/recordings.tmp $tmpdir/recordings - - - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - filter_file $tmpdir/recordings $data/segments - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - rm $data/segments.tmp - - filter_file $tmpdir/recordings $data/wav.scp - [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur - true - fi -} - -function filter_speakers { - # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... - tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do - f=$data/$s - if [ -f $f ]; then - filter_file $f $tmpdir/speakers - fi - done - - filter_file $tmpdir/speakers $data/spk2utt - tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - - for s in cmvn.scp spk2gender $spk_extra_files; do - f=$data/$s - if [ -f $f ]; then - filter_file $tmpdir/speakers $f - fi - done -} - -function filter_utts { - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - echo "$(cat $tmpdir/utts | wc -l)" - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; - - ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; - - ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ - echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - - if [ -f $data/utt2uniq ]; then - ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ - echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; - fi - - maybe_wav= - maybe_reco2dur= - [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. - [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - - maybe_utt2dur= - if [ -f $data/utt2dur ]; then - cat $data/utt2dur | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1 - maybe_utt2dur=utt2dur.ok - fi - - maybe_utt2num_frames= - if [ -f $data/utt2num_frames ]; then - cat $data/utt2num_frames | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1 - maybe_utt2num_frames=utt2num_frames.ok - fi - - for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do - if [ -f $data/$x ]; then - tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp - echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)" - mv $tmpdir/utts.tmp $tmpdir/utts - # echo "$tmpdir/utts" - fi - done - rm $data/utt2dur.ok 2>/dev/null || true - rm $data/utt2num_frames.ok 2>/dev/null || true - - [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ - rm $tmpdir/utts && exit 1; - - - if [ -f $data/utt2spk ]; then - new_nutts=$(cat $tmpdir/utts | wc -l) - old_nutts=$(cat $data/utt2spk | wc -l) - if [ $new_nutts -ne $old_nutts ]; then - echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" - else - echo "fix_data_dir.sh: kept all $old_nutts utterances." 
- fi - fi - - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then - tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x - fi - fi - done - -} - -filter_recordings -filter_speakers -filter_utts -filter_speakers -filter_recordings - -tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - -echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/flake8_hook.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/flake8_hook.py deleted file mode 100644 index bbe21bf4aa8ab460aca0eba5a24785e4d6b2c39d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -import sys - -from flake8.main import git - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/format_data.sh deleted file mode 100644 index 51f4602dfa0bac7873541c7f621ef4bb9eb29c94..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/format_data.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Mobvoi Corporation (Author: Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -echo "$0 $*" >&2 # Print the command line for logging -. ./path.sh - -nj=1 -cmd=run.pl -nlsyms="" -lang="" -feat="" -feat_type="kaldi" -oov="" -bpecode="" -allow_one_column=false -raw="" -verbose=0 -trans_type=char -filetype="" -preprocess_conf="" -category="" -out="" # If omitted, write in stdout -help_message=$(cat << EOF -Usage: $0 -e.g. $0 data/train data/lang_1char/train_units.txt -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --feat # feat.scp or feat1.scp,feat2.scp,... - --feat-type # kaldi or wav - --oov # Default: - --out # If omitted, write in stdout - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) -. tools/parse_options.sh - -if [ $# != 2 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -dir=$1 -dic=$2 -tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) -#trap 'rm -rf ${tmpdir}' EXIT - -# 1. 
Create scp files for inputs -# These are not necessary for decoding mode, and make it as an option -input= -if [ -n "${feat}" ]; then - _feat_scps=$(echo "${feat}" | tr ',' ' ' ) - read -r -a feat_scps <<< $_feat_scps - num_feats=${#feat_scps[@]} - - for (( i=1; i<=num_feats; i++ )); do - feat=${feat_scps[$((i-1))]} - mkdir -p ${tmpdir}/input_${i} - input+="input_${i} " - cat ${feat} > ${tmpdir}/input_${i}/feat.scp - - # Dump in the "legacy" style JSON format - if [ -n "${filetype}" ]; then - awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ - > ${tmpdir}/input_${i}/filetype.scp - fi - - if [ ${feat_type} == "kaldi" ]; then - tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ - --filetype "${filetype}" \ - --preprocess-conf "${preprocess_conf}" \ - --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp - elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then - if [ -f $dir/segments ]; then - # used for segmented wav.scp - awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur - fi - if [ ! -f $dir/utt2dur ]; then - tools/wav_to_duration.sh --nj ${nj} \ - ${feat} ${tmpdir}/input_${i}/shape.scp - # use the existed utt2dur as shape.scp directly - else - cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp - fi - fi - done -fi - -# 2. Create scp files for outputs -mkdir -p ${tmpdir}/output -if [ -n "${bpecode}" ]; then - if [ "${trans_type}" == "cn_char_en_bpe" ]; then - tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp - else - paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \ - | tools/spm_encode --model=${bpecode} --output_format=piece) \ - > ${tmpdir}/output/token.scp - fi -elif [ -n "${nlsyms}" ]; then - tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -elif [ -n "${raw}" ]; then - cat $dir/text > ${tmpdir}/output/token.scp -else - tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -fi -< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp -odim=$(cat ${dic} | wc -l) -< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp - -cat ${dir}/text > ${tmpdir}/output/text.scp - -# 3. Create scp files for the others -mkdir -p ${tmpdir}/other -if [ -n "${lang}" ]; then - awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp -fi - -if [ -n "${category}" ]; then - awk -v category=${category} '{print $1 " " category}' ${dir}/text \ - > ${tmpdir}/other/category.scp -fi -#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp - -# 4. 
Merge scp files into a one file -opts="" -for intype in ${input} output other; do - if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then - continue - fi - - if [ ${intype} != other ]; then - opts+="--${intype%_*}-scps " - else - opts+="--scps " - fi - - for x in "${tmpdir}/${intype}"/*.scp; do - k=$(basename ${x} .scp) - if [ ${k} = shape ]; then - opts+="shape:${x}:shape " - else - opts+="${k}:${x} " - fi - done -done - -if ${allow_one_column}; then - opts+="--allow-one-column true " -else - opts+="--allow-one-column false " -fi - -if [ -n "${out}" ]; then - opts+="-O ${out}" -fi - -tools/merge_scp2txt.py --verbose ${verbose} ${opts} - -#rm -fr ${tmpdir} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/add_lex_disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/add_lex_disambig.pl deleted file mode 100644 index dd8a25de6e1140a6d19b1e876f2e76f528532edf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/add_lex_disambig.pl +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2013-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. -# With the --pron-probs option, expects the second field -# of each lexicon line to be a pron-prob. -# With the --sil-probs option, expects three additional -# fields after the pron-prob, representing various components -# of the silence probability model. - -$pron_probs = 0; -$sil_probs = 0; -$first_allowed_disambig = 1; - -for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { - if ($ARGV[0] eq "--pron-probs") { - $pron_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--sil-probs") { - $sil_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--first-allowed-disambig") { - $first_allowed_disambig = 0 + $ARGV[1]; - if ($first_allowed_disambig < 1) { - die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; - } - shift @ARGV; - shift @ARGV; - } -} - -if (@ARGV != 2) { - die "Usage: add_lex_disambig.pl [opts] \n" . - "This script adds disambiguation symbols to a lexicon in order to\n" . - "make decoding graphs determinizable; it adds pseudo-phone\n" . - "disambiguation symbols #1, #2 and so on at the ends of phones\n" . - "to ensure that all pronunciations are different, and that none\n" . - "is a prefix of another.\n" . - "It prints to the standard output the number of the largest-numbered" . - "disambiguation symbol that was used.\n" . - "\n" . - "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . 
- " --sil-probs [should be with --pron-probs option]\n" . - " Expect 3 extra fields after the pron-probs, for aspects of\n" . - " the silence probability model\n" . - " --first-allowed-disambig The number of the first disambiguation symbol\n" . - " that this script is allowed to add. By default this is\n" . - " #1, but you can set this to a larger value using this option.\n" . - "e.g.:\n" . - " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { - $p = shift @A; - if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } - } - if ($sil_probs) { - $silp = shift @A; - if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - } - if (!(@A)) { - die "Bad lexicon line $1, no phone in phone list"; - } - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that it exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { shift @A; } # remove pron-prob. - if ($sil_probs) { - shift @A; # Remove silprob - shift @A; # Remove silprob - } - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -# max_disambig will always be the highest-numbered disambiguation symbol that -# has been used so far. -$max_disambig = $first_allowed_disambig - 1; - -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - if ($pron_probs) { - $pron_prob = shift @A; - } - if ($sil_probs) { - $sil_word_prob = shift @A; - $word_sil_correction = shift @A; - $prev_nonsil_correction = shift @A - } - $phnseq = join(" ", @A); - if (!defined $issubseq{$phnseq} - && $count{$phnseq} == 1) { - ; # Do nothing. - } else { - if ($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved_for_the_empty_string{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; - if (!defined $cur_disambig) { - $cur_disambig = $first_allowed_disambig; - } else { - $cur_disambig++; # Get a number that has not been used yet for - # this phone sequence. - } - while (defined $reserved_for_the_empty_string{$cur_disambig}) { - $cur_disambig++; - } - if ($cur_disambig > $max_disambig) { - $max_disambig = $cur_disambig; - } - $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; - $phnseq = $phnseq . " #" . 
$cur_disambig; - } - } - if ($pron_probs) { - if ($sil_probs) { - print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; - } else { - print O "$word\t$pron_prob\t$phnseq\n"; - } - } else { - print O "$word\t$phnseq\n"; - } -} - -print $max_disambig . "\n"; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/compile_lexicon_token_fst.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/compile_lexicon_token_fst.sh deleted file mode 100644 index b67814fe3f3244b14b8e494bfe46c4829c4f8bd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/compile_lexicon_token_fst.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -# Copyright 2015 Yajie Miao (Carnegie Mellon University) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the -# phoneme and character-based lexicons. -set -eo pipefail -. tools/parse_options.sh - -if [ $# -ne 3 ]; then - echo "usage: tools/fst/compile_lexicon_token_fst.sh " - echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" - echo " should contain the following files:" - echo "lexicon.txt units.txt" - echo "options: " - exit 1; -fi - -srcdir=$1 -tmpdir=$2 -dir=$3 -mkdir -p $dir $tmpdir - -[ -f path.sh ] && . ./path.sh - -export LC_ALL=C - -cp $srcdir/units.txt $dir - -# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. -# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. -perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; - -# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. -# Without these symbols, determinization will fail. -ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` -ndisambig=$[$ndisambig+1]; - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list - -# Get the full list of CTC tokens used in FST. These tokens include , the blank , -# the actual model unit, and the disambiguation symbols. -cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list -(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt - -# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, -# so here use ctc_token_fst_compact -tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; - -# Encode the words with indices. 
Will be used in lexicon and language model FST compiling. -cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' - BEGIN { - print "<eps> 0"; - } - { - printf("%s %d\n", $1, NR); - } - END { - printf("#0 %d\n", NR+1); - printf("<s> %d\n", NR+2); - printf("</s> %d\n", NR+3); - }' > $dir/words.txt || exit 1; - -# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time. -token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` - -tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ - fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; - -echo "Lexicon and token FSTs compiling succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/ctc_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/ctc_token_fst.py deleted file mode 100644 index d81644b9cd216177a10a17772781d3293abe084f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/ctc_token_fst.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 1 <eps> <eps>') -print('1 1 <blank> <eps>') -print('2 2 <blank> <eps>') -print('2 0 <eps> <eps>') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 3 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '<eps>' or phone == '<blank>': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '<eps>', phone)) - else: - print('{} {} {} {}'.format(1, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '<eps>')) - print('{} {} {} {}'.format(node, 2, '<eps>', '<eps>')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/ctc_token_fst_compact.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/ctc_token_fst_compact.py deleted file mode 100644 index d3018d8b14ce25108cb1acc637cecded5d41be13..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/ctc_token_fst_compact.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 <blank> <eps>') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 1 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '<eps>' or phone == '<blank>': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '<eps>', phone)) - else: - print('{} {} {} {}'.format(0, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '<eps>')) - print('{} {} {} {}'.format(node, 0, '<eps>', '<eps>')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/ctc_token_fst_corrected.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/ctc_token_fst_corrected.py deleted file mode 100644 index 81f7079eccb9e6447c46cdfdf6378aca7efe4a09..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/ctc_token_fst_corrected.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python - -import sys - - -def il(n): - return n + 1 - -
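The `il`/`ol` helpers being defined here convert a 0-based position in the token list into the numeric label id used in the emitted text FST: id 0 stays reserved for `<eps>`, so every real symbol is shifted up by one. A minimal sketch of that convention (the toy token list is illustrative, not taken from this repository):

```python
# Illustrative only: a tiny stand-in for tokens.txt, which lists <eps> at id 0,
# <blank> at id 1 and the real units afterwards.
toy_tokens = ["<eps>", "<blank>", "a", "b", "#0"]


def il(n):
    # Same shift as in ctc_token_fst_corrected.py: entry n of the unit list
    # (counting <blank> as entry 0) gets symbol id n + 1.
    return n + 1


real_units = [t for t in toy_tokens
              if t not in ("<eps>", "<blank>") and not t.startswith("#")]
print("<blank> -> input label", il(0))        # the blank self-loop uses label 1
for i, unit in enumerate(real_units, start=1):
    print(unit, "-> input label", il(i))      # 'a' -> 2, 'b' -> 3, matching tokens.txt
```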
-def ol(n): - return n + 1 - - -def s(n): - return n - - -if __name__ == "__main__": - with open(sys.argv[1]) as f: - lines = f.readlines() - phone_count = 0 - disambig_count = 0 - for line in lines: - sp = line.split() - phone = sp[0] - if phone == '' or phone == '': - continue - if phone.startswith('#'): - disambig_count += 1 - else: - phone_count += 1 - - # 1. add start state - print('0 0 {} 0'.format(il(0))) - - # 2. 0 -> i, i -> i, i -> 0 - for i in range(1, phone_count + 1): - print('0 {} {} {}'.format(s(i), il(i), ol(i))) - print('{} {} {} 0'.format(s(i), s(i), il(i))) - print('{} 0 {} 0'.format(s(i), il(0))) - - # 3. i -> other phone - for i in range(1, phone_count + 1): - for j in range(1, phone_count + 1): - if i != j: - print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) - - # 4. add disambiguous arcs on every final state - for i in range(0, phone_count + 1): - for j in range(phone_count + 2, phone_count + disambig_count + 2): - print('{} {} {} {}'.format(s(i), s(i), 0, j)) - - # 5. every i is final state - for i in range(0, phone_count + 1): - print(s(i)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/eps2disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/eps2disambig.pl deleted file mode 100644 index e1d84a6bf56703596a0e4552d184f7168f724bcb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/eps2disambig.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - if (/\s+#0\s+/) { - print STDERR "$0: ERROR: LM has word #0, " . - "which is reserved as disambiguation symbol\n"; - exit 1; - } - s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; - print; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/make_lexicon_fst.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/make_lexicon_fst.pl deleted file mode 100644 index f97129c05cb3ba6460be401e92001261acfaf746..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/make_lexicon_fst.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). - -$pron_probs = 0; - -if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { - $pron_probs = 1; - shift @ARGV; -} - -if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; - print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; - print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; - print STDERR " word phone1 phone2 ... phoneN;\n"; - print STDERR "if the --pron-probs option is used, each line is:\n"; - print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; - print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; - print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; - print STDERR "this is your responsibility.\n\n"; - print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; - print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; - print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; - exit(1); -} - -$lexfn = shift @ARGV; -if (@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2) { - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if ($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - -if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. - while () { - @A = split(" ", $_); - @A == 0 && die "Empty lexicon line."; - foreach $a (@A) { - if ($a eq "") { - die "Bad lexicon line $_ ( is forbidden)"; - } - } - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; # so we only print it on the first arc of the word. - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. 
- if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while () { - @A = split(" ", $_); - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. - $s = $ns; - } elsif (!defined($silphone) || $p ne $silphone) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/make_tlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/make_tlg.sh deleted file mode 100644 index 98694e5540968760f0c27eaf30a6668f4c46c50d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/make_tlg.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# - -if [ -f path.sh ]; then . path.sh; fi - -lm_dir=$1 -src_lang=$2 -tgt_lang=$3 - -arpa_lm=${lm_dir}/lm.arpa -[ ! 
-f $arpa_lm ] && echo No such file $arpa_lm && exit 1; -rm -rf $tgt_lang -cp -r $src_lang $tgt_lang - -# Compose the language model to FST -cat $arpa_lm | \ - grep -v '<s> <s>' | \ - grep -v '</s> <s>' | \ - grep -v '</s> </s>' | \ - grep -v -i '<unk>' | \ - grep -v -i '<spoken_noise>' | \ - arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \ - tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \ - --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst - - -echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic $tgt_lang/G.fst - -# Compose the token, lexicon and language-model FST into the final decoding graph -fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1; -fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1; - -echo "Composing decoding graph TLG.fst succeeded" -#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/prepare_dict.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/prepare_dict.py deleted file mode 100644 index 8a6a3cfe7cfded0c863637deef0bae2f2ede5557..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/prepare_dict.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -# sys.argv[1]: e2e model unit file(lang_char.txt) -# sys.argv[2]: raw lexicon file -# sys.argv[3]: output lexicon file -# sys.argv[4]: bpemodel - -unit_table = set() -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for line in fin: - unit = line.split()[0] - unit_table.add(unit) - - -def contain_oov(units): - for unit in units: - if unit not in unit_table: - return True - return False - - -bpemode = len(sys.argv) > 4 -if bpemode: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.Load(sys.argv[4]) -lexicon_table = set() -with open(sys.argv[2], 'r', encoding='utf8') as fin, \ - open(sys.argv[3], 'w', encoding='utf8') as fout: - for line in fin: - word = line.split()[0] - if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel - continue - elif word == '<SPOKEN_NOISE>': - continue - else: - # each word only has one pronunciation for e2e system - if word in lexicon_table: - continue - if bpemode: - # We assume that the lexicon does not contain code-switch, - # i.e. the word contains both English and Chinese. - # see PR https://github.com/wenet-e2e/wenet/pull/1693 - # and Issue https://github.com/wenet-e2e/wenet/issues/1653 - if word.encode('utf8').isalpha(): - pieces = sp.EncodeAsPieces(word) - else: - pieces = word - if contain_oov(pieces): - print( - 'Ignoring words {}, which contains oov unit'.format( - ''.join(word).strip('▁')) - ) - continue - chars = ' '.join( - [p if p in unit_table else '<unk>' for p in pieces]) - else: - # ignore words with OOV - if contain_oov(word): - print('Ignoring words {}, which contains oov unit'.format(word)) - continue - # Optional, append ▁ in front of english word - # we assume the model unit of our e2e system is char now.
- if word.encode('utf8').isalpha() and '▁' in unit_table: - word = '▁' + word - chars = ' '.join(word) # word is a char list - fout.write('{} {}\n'.format(word, chars)) - lexicon_table.add(word) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/remove_oovs.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/remove_oovs.pl deleted file mode 100644 index ac914c3bd9363eded791cdeb309fd05e980c4f2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/remove_oovs.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script removes lines that contain these OOVs on either the -# third or fourth fields of the line. It is intended to remove arcs -# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). - -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; -} - -$unklist = shift @ARGV; -open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; -while(){ - @A = split(" ", $_); - @A == 1 || die "Bad line in unknown-symbol list: $_"; - $unk{$A[0]} = 1; -} - -$num_removed = 0; -while(<>){ - @A = split(" ", $_); - if(defined $unk{$A[2]} || defined $unk{$A[3]}) { - $num_removed++; - } else { - print; - } -} -print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/rnnt_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/rnnt_token_fst.py deleted file mode 100644 index cc6def1703311ab700a4a01f22c1adda32db9b0d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/rnnt_token_fst.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, 0, phone, phone)) -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/s2eps.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/s2eps.pl deleted file mode 100644 index ffeeb8eb6af3c4f319f31ebff80be388d8f59e1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/fst/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in 
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-# This script replaces <s> and </s> with <eps> (on both input and output sides),
-# for the G.fst acceptor.
-
-while(<>){
-  @A = split(" ", $_);
-  if ( @A >= 4 ) {
-    if ($A[2] eq "<s>" || $A[2] eq "</s>") { $A[2] = "<eps>"; }
-    if ($A[3] eq "<s>" || $A[3] eq "</s>") { $A[3] = "<eps>"; }
-  }
-  print join("\t", @A) . "\n";
-}
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/git-pre-commit b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/git-pre-commit
deleted file mode 100644
index b6e448ed375a0ddf502ce332685de8a99e88dc08..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/git-pre-commit
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-set -e
-
-echo "Running pre-commit flake8"
-python tools/flake8_hook.py
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/install_srilm.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/install_srilm.sh
deleted file mode 100644
index 4aa113c14722a73fd3d3f84430025d44173c207b..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/install_srilm.sh
+++ /dev/null
@@ -1,62 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
-# 2022 Binbin Zhang(binbzha@qq.com)
-
-current_path=`pwd`
-current_dir=`basename "$current_path"`
-
-if [ "tools" != "$current_dir" ]; then
-  echo "You should run this script in tools/ directory!!"
-  exit 1
-fi
-
-! command -v gawk > /dev/null && \
-  echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1;
-
-srilm_url="https://github.com/BitSpeech/SRILM/archive/refs/tags/1.7.3.tar.gz"
-
-if [ ! -f ./srilm.tar.gz ]; then
-  if ! wget -O ./srilm.tar.gz "$srilm_url"; then
-    echo 'There was a problem downloading the file.'
-    echo 'Check you internet connection and try again.'
-    exit 1
-  fi
-fi
-
-tar -zxvf srilm.tar.gz
-mv SRILM-1.7.3 srilm
-
-# set the SRILM variable in the top-level Makefile to this directory.
-cd srilm
-cp Makefile tmpf
-
-cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \
-  > Makefile || exit 1
-rm tmpf
-
-make || exit
-cd ..
-
-(
-  [ ! -z "${SRILM}" ] && \
-    echo >&2 "SRILM variable is aleady defined. Undefining..." && \
-    unset SRILM
-
-  [ -f ./env.sh ] && . ./env.sh
-
-  [ !
-z "${SRILM}" ] && \ - echo >&2 "SRILM config is already in env.sh" && exit - - wd=`pwd` - wd=`readlink -f $wd || pwd` - - echo "export SRILM=$wd/srilm" - dirs="\${PATH}" - for directory in $(cd srilm && find bin -type d ) ; do - dirs="$dirs:\${SRILM}/$directory" - done - echo "export PATH=$dirs" -) >> env.sh - -echo >&2 "Installation of SRILM finished successfully" -echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/k2/make_hlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/k2/make_hlg.sh deleted file mode 100644 index 18c2268487410824ae11b199cf06f37acd717c88..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/k2/make_hlg.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) - -lexion_dir=$1 -lm_dir=$2 -tgt_dir=$3 - -# k2 and icefall updates very fast. Below commits are veryfied in this script. -# k2 3dc222f981b9fdbc8061b3782c3b385514a2d444, icefall 499ac24ecba64f687ff244c7d66baa5c222ecf0f - -# For k2 installation, please refer to https://github.com/k2-fsa/k2/ -python -c "import k2; print(k2.__file__)" -python -c "import torch; import _k2; print(_k2.__file__)" - -# Prepare necessary icefall scripts -if [ ! -d tools/k2/icefall ]; then - git clone --depth 1 https://github.com/k2-fsa/icefall.git tools/k2/icefall -fi -pip3 install -r tools/k2/icefall/requirements.txt -export PYTHONPATH=`pwd`/tools/k2/icefall:`pwd`/tools/k2/icefall/egs/aishell/ASR/local:$PYTHONPATH - -# 8.1 Prepare char based lang -mkdir -p $tgt_dir -python tools/k2/prepare_char.py $lexion_dir/units.txt $lm_dir/wordlist $tgt_dir -echo "Compile lexicon L.pt L_disambig.pt succeeded" - -# 8.2 Prepare G -mkdir -p data/lm -python -m kaldilm \ - --read-symbol-table="$tgt_dir/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $lm_dir/lm.arpa > data/lm/G_3_gram.fst.txt - -# 8.3 Compile HLG -python tools/k2/icefall/egs/aishell/ASR/local/compile_hlg.py --lang-dir $tgt_dir -echo "Compile decoding graph HLG.pt succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/k2/prepare_char.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/k2/prepare_char.py deleted file mode 100644 index 6e05042c42eb280135f6be7cdb3566b185258b90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/k2/prepare_char.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -""" - -This script generates the following files in the directory sys.argv[3]: - - - lexicon.txt - - lexicon_disambig.txt - - L.pt - - L_disambig.pt - - tokens.txt - - words.txt -""" - -import sys -from pathlib import Path -from typing import Dict, List - -import k2 -import torch -from prepare_lang import ( - Lexicon, - add_disambig_symbols, - add_self_loops, - write_lexicon, - write_mapping, -) - - -def lexicon_to_fst_no_sil( - lexicon: Lexicon, - token2id: Dict[str, int], - word2id: Dict[str, int], - need_self_loops: bool = False, -) -> k2.Fsa: - """Convert a lexicon to an FST (in k2 format). - - Args: - lexicon: - The input lexicon. See also :func:`read_lexicon` - token2id: - A dict mapping tokens to IDs. - word2id: - A dict mapping words to IDs. - need_self_loops: - If True, add self-loop to states with non-epsilon output symbols - on at least one arc out of the state. The input label for this - self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. - Returns: - Return an instance of `k2.Fsa` representing the given lexicon. - """ - loop_state = 0 # words enter and leave from here - next_state = 1 # the next un-allocated state, will be incremented as we go - - arcs = [] - - # The blank symbol is defined in local/train_bpe_model.py - assert token2id[""] == 0 - assert word2id[""] == 0 - - eps = 0 - - for word, pieces in lexicon: - assert len(pieces) > 0, f"{word} has no pronunciations" - cur_state = loop_state - - word = word2id[word] - pieces = [ - token2id[i] if i in token2id else token2id[""] for i in pieces - ] - - for i in range(len(pieces) - 1): - w = word if i == 0 else eps - arcs.append([cur_state, next_state, pieces[i], w, 0]) - - cur_state = next_state - next_state += 1 - - # now for the last piece of this word - i = len(pieces) - 1 - w = word if i == 0 else eps - arcs.append([cur_state, loop_state, pieces[i], w, 0]) - - if need_self_loops: - disambig_token = token2id["#0"] - disambig_word = word2id["#0"] - arcs = add_self_loops( - arcs, - disambig_token=disambig_token, - disambig_word=disambig_word, - ) - - final_state = next_state - arcs.append([loop_state, final_state, -1, -1, 0]) - arcs.append([final_state]) - - arcs = sorted(arcs, key=lambda arc: arc[0]) - arcs = [[str(i) for i in arc] for arc in arcs] - arcs = [" ".join(arc) for arc in arcs] - arcs = "\n".join(arcs) - - fsa = k2.Fsa.from_str(arcs, acceptor=False) - return fsa - - -def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool: - """Check if all the given tokens are in token symbol table. - - Args: - token_sym_table: - Token symbol table that contains all the valid tokens. - tokens: - A list of tokens. - Returns: - Return True if there is any token not in the token_sym_table, - otherwise False. - """ - for tok in tokens: - if tok not in token_sym_table: - return True - return False - - -def generate_lexicon( - token_sym_table: Dict[str, int], words: List[str] -) -> Lexicon: - """Generate a lexicon from a word list and token_sym_table. - - Args: - token_sym_table: - Token symbol table that mapping token to token ids. - words: - A list of strings representing words. - Returns: - Return a dict whose keys are words and values are the corresponding - tokens. 
- """ - lexicon = [] - for word in words: - chars = list(word.strip(" \t")) - if contain_oov(token_sym_table, chars): - continue - lexicon.append((word, chars)) - - # The OOV word is - lexicon.append(("", [""])) - return lexicon - - -def generate_tokens(text_file: str) -> Dict[str, int]: - """Generate tokens from the given text file. - - Args: - text_file: - A file that contains text lines to generate tokens. - Returns: - Return a dict whose keys are tokens and values are token ids ranged - from 0 to len(keys) - 1. - """ - token2id: Dict[str, int] = dict() - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - char, index = line.replace('\n', '').split() - assert char not in token2id - token2id[char] = int(index) - assert token2id[''] == 0 - return token2id - - -def generate_words(text_file: str) -> Dict[str, int]: - """Generate words from the given text file. - - Args: - text_file: - A file that contains text lines to generate words. - Returns: - Return a dict whose keys are words and values are words ids ranged - from 0 to len(keys) - 1. - """ - words = [] - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - word = line.replace('\n', '') - assert word not in words - words.append(word) - words.sort() - - # We put '' '' at begining of word2id - # '#0', '', '' at end of word2id - words = [word for word in words - if word not in ['', '', '#0', '', '']] - words.insert(0, '') - words.insert(1, '') - words.append('#0') - words.append('') - words.append('') - word2id = {j: i for i, j in enumerate(words)} - return word2id - - -def main(): - token2id = generate_tokens(sys.argv[1]) - word2id = generate_words(sys.argv[2]) - tgt_dir = Path(sys.argv[3]) - - words = [word for word in word2id.keys() - if word not in - ["", "!SIL", "", "", "#0", "", ""]] - lexicon = generate_lexicon(token2id, words) - - lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) - next_token_id = max(token2id.values()) + 1 - for i in range(max_disambig + 1): - disambig = f"#{i}" - assert disambig not in token2id - token2id[disambig] = next_token_id - next_token_id += 1 - - write_mapping(tgt_dir / "tokens.txt", token2id) - write_mapping(tgt_dir / "words.txt", word2id) - write_lexicon(tgt_dir / "lexicon.txt", lexicon) - write_lexicon(tgt_dir / "lexicon_disambig.txt", lexicon_disambig) - - L = lexicon_to_fst_no_sil( - lexicon, - token2id=token2id, - word2id=word2id, - ) - L_disambig = lexicon_to_fst_no_sil( - lexicon_disambig, - token2id=token2id, - word2id=word2id, - need_self_loops=True, - ) - torch.save(L.as_dict(), tgt_dir / "L.pt") - torch.save(L_disambig.as_dict(), tgt_dir / "L_disambig.pt") - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/latency_metrics.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/latency_metrics.py deleted file mode 100644 index df2d8eee45f8e2d7c8536f208d44fafaeac3341f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/latency_metrics.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2022 Horizon Inc. (author: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import argparse -import logging -import librosa -import torch -import torchaudio -import yaml - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.font_manager as fm -import torchaudio.compliance.kaldi as kaldi - -from wenet.utils.init_model import init_model -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.mask import make_pad_mask -from wenet.utils.common import replace_duplicates_with_blank - - -def get_args(): - parser = argparse.ArgumentParser( - description='Analyze latency and plot CTC-Spike.') - parser.add_argument('--config', required=True, - type=str, help='configration') - parser.add_argument('--gpu', - type=int, - default=0, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--ckpt', required=True, - type=str, help='model checkpoint') - parser.add_argument('--tag', required=True, - type=str, help='image subtitle') - parser.add_argument('--wavscp', required=True, - type=str, help='wav.scp') - parser.add_argument('--alignment', required=True, - type=str, help='force alignment, generated by Kaldi.') - parser.add_argument('--chunk_size', required=True, - type=int, help='chunk size') - parser.add_argument('--left_chunks', default=-1, - type=int, help='left chunks') - parser.add_argument('--font', required=True, - type=str, help='font file') - parser.add_argument('--dict', required=True, - type=str, help='dict file') - parser.add_argument('--result_dir', required=True, - type=str, help='saving pdf') - parser.add_argument('--model_type', default='ctc', - choices=['ctc', 'transducer'], - help='show latency metrics from ctc models or rnn-t models') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - torch.manual_seed(777) - - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - symbol_table = read_symbol_table(args.dict) - char_dict = {v: k for k, v in symbol_table.items()} - - # 1. Load model - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - - model = init_model(conf) - load_checkpoint(model, args.ckpt) - model = model.eval().to(device) - - subsampling = model.encoder.embed.subsampling_rate - eos = model.eos_symbol() - - with open(args.wavscp, 'r') as fin: - wavs = fin.readlines() - - # 2. 
Forward model (get streaming_timestamps) - timestamps = {} - for idx, wav in enumerate(wavs): - if idx % 100 == 0: - logging.info("processed {}.".format(idx)) - key, wav = wav.strip().split(' ', 1) - waveform, sr = torchaudio.load(wav) - resample_rate = conf['dataset_conf']['resample_conf']['resample_rate'] - waveform = torchaudio.transforms.Resample( - orig_freq=sr, new_freq=resample_rate)(waveform) - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank( - waveform, - num_mel_bins=conf['dataset_conf']['fbank_conf']['num_mel_bins'], - frame_length=conf['dataset_conf']['fbank_conf']['frame_length'], - frame_shift=conf['dataset_conf']['fbank_conf']['frame_shift'], - dither=0.0, energy_floor=0.0, - sample_frequency=resample_rate, - ) - - speech = mat.unsqueeze(0).to(device) - speech_lengths = torch.tensor([mat.size(0)]).to(device) - - # Let's assume batch_size = 1 - encoder_out, encoder_mask = model.encoder( - speech, speech_lengths, args.chunk_size, args.left_chunks) - - maxlen = encoder_out.size(1) # (B, maxlen, encoder_dim) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # CTC greedy search - if args.model_type == 'ctc': - ctc_probs = model.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - topk_prob = topk_prob.view(1, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen) - topk_prob = topk_prob.masked_fill_(mask, 0.0) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - hyps = [replace_duplicates_with_blank(hyp) for hyp in hyps] - scores = [prob.tolist() for prob in topk_prob] - timestamps[key] = [hyps[0], scores[0], wav] - - if args.model_type == 'transducer': - hyps = [] - scores = [] - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = 1 - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, - padding, cache) - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - scores.append(torch.max(joint_out_probs).item()) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or \ - per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - hyps.append(model.blank) - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - timestamps[key] = [hyps, scores, wav] - - # 3. 
Analyze latency - with open(args.alignment, 'r') as fin: - aligns = fin.readlines() - not_found, len_unequal, ignored = 0, 0, 0 - datas = [] - for align in aligns: - key, align = align.strip().split(' ', 1) - if key not in timestamps: - not_found += 1 - continue - fa, st = [], [] # force_alignment, streaming_timestamps - text_fa, text_st = "", "" - for i, token in enumerate(align.split()): - if token != '': - text_fa += token - # NOTE(xcsong): W/O subsample - fa.append(i * 10) - # ignore alignment_errors >= 70ms - frames_fa = len(align.split()) - frames_st = len(timestamps[key][0]) * subsampling - if abs(frames_st - frames_fa) >= 7: - ignored += 1 - continue - for i, token_id in enumerate(timestamps[key][0]): - if token_id != 0: - text_st += char_dict[token_id] - # NOTE(xcsong): W subsample - st.append(i * subsampling * 10) - if len(fa) != len(st): - len_unequal += 1 - continue - # datas[i] = [key, text_fa, text_st, list_of_diff, - # FirstTokenDelay, LastTokenDelay, AvgTokenDelay, - # streaming_timestamps, force_alignment] - datas.append([key, text_fa, text_st, - [a - b for a, b in zip(st, fa)], - st[0] - fa[0], st[-1] - fa[-1], - (sum(st) - sum(fa)) / len(st), - timestamps[key], align.split()]) - - logging.info("not found: {}, length unequal: {}, ignored: {}, \ - valid samples: {}".format(not_found, len_unequal, ignored, len(datas))) - - # 4. Plot and print - num_datas = len(datas) - names = ['FirstTokenDelay', 'LastTokenDelay', 'AvgTokenDelay'] - names_index = [4, 5, 6] - parts = ['max', 'P90', 'P75', 'P50', 'P25', 'min'] - parts_index = [num_datas - 1, int(num_datas * 0.90), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - for name, name_idx in zip(names, names_index): - def f(name_idx=name_idx): - return name_idx - datas.sort(key=lambda x: x[f()]) - logging.info("==========================") - for p, i in zip(parts, parts_index): - data = datas[i] - # i.e., LastTokenDelay P90: 270.000 ms (wav_id: BAC009S0902W0144) - logging.info("{} {}: {:.3f} ms (wav_id: {})".format( - name, p, data[f()], datas[i][0])) - - font = fm.FontProperties(fname=args.font) - plt.rcParams['axes.unicode_minus'] = False - # we will have 2 sub-plots (force-align + streaming timestamps) - # plus one wav-plot - fig, axes = plt.subplots(figsize=(60, 60), nrows=3, ncols=1) - for j in range(2): - if j == 0: - # subplot-0: streaming_timestamps - plt_prefix = args.tag + "_" + name + "_" + p - x = np.arange(len(data[7][0])) * subsampling - hyps, scores = data[7][0], data[7][1] - else: - # subplot-1: force_alignments - plt_prefix = "force_alignment" - x = np.arange(len(data[8])) - hyps = [symbol_table[d] for d in data[8]] - scores = [0.0] * len(data[8]) - axes[j].set_title(plt_prefix, fontsize=30) - for frame, token, prob in zip(x, hyps, scores): - if char_dict[token] != '': - axes[j].bar( - frame, np.exp(prob), - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].text( - frame, np.exp(prob), - '{} {:.3f} {}'.format( - char_dict[token], np.exp(prob), frame), - fontdict=dict(fontsize=24), - fontproperties=font, - ) - else: - axes[j].bar( - frame, 0.01, - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].tick_params(labelsize=25) - - # subplot-2: wav - # wav, hardcode sample_rate to 16000 - samples, sr = librosa.load(data[7][2], sr=16000) - time = np.arange(0, len(samples)) * (1.0 / sr) - axes[-1].plot(time, samples) - - # i.e., RESULT_DIR/LTD_P90_120ms_BAC009S0768W0342.pdf - plt.savefig(args.result_dir + "/" + name + "_" + - p + "_" + str(data[f()]) 
+ "ms" + "_" + data[0] + ".pdf") - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/make_raw_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/make_raw_list.py deleted file mode 100644 index 2f84f015542bb38da027b8ea61e8638f873cec33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/make_raw_list.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('output_file', help='output list file') - args = parser.parse_args() - - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - if args.segments is not None: - segments_table = {} - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - with open(args.text_file, 'r', encoding='utf8') as fin, \ - open(args.output_file, 'w', encoding='utf8') as fout: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if args.segments is None: - assert key in wav_table - wav = wav_table[key] - line = dict(key=key, wav=wav, txt=txt) - else: - assert key in segments_table - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - line = dict(key=key, wav=wav, txt=txt, start=start, end=end) - json_line = json.dumps(line, ensure_ascii=False) - fout.write(json_line + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/make_shard_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/make_shard_list.py deleted file mode 100644 index 1f7d82829808c9cc181bbc5e0f60cccef8795bae..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/make_shard_list.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import io -import logging -import os -import tarfile -import time -import multiprocessing - -import torch -import torchaudio -import torchaudio.backend.sox_io_backend as sox - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def write_tar_file(data_list, - no_segments, - tar_file, - resample=16000, - index=0, - total=1): - logging.info('Processing {} {}/{}'.format(tar_file, index, total)) - read_time = 0.0 - save_time = 0.0 - write_time = 0.0 - with tarfile.open(tar_file, "w") as tar: - prev_wav = None - for item in data_list: - if no_segments: - key, txt, wav = item - else: - key, txt, wav, start, end = item - - suffix = wav.split('.')[-1] - assert suffix in AUDIO_FORMAT_SETS - if no_segments: - ts = time.time() - with open(wav, 'rb') as fin: - data = fin.read() - read_time += (time.time() - ts) - else: - if wav != prev_wav: - ts = time.time() - waveforms, sample_rate = sox.load(wav, normalize=False) - read_time += (time.time() - ts) - prev_wav = wav - start = int(start * sample_rate) - end = int(end * sample_rate) - audio = waveforms[:1, start:end] - - # resample - if sample_rate != resample: - if not audio.is_floating_point(): - # normalize the audio before resample - # because resample can't process int audio - audio = audio / (1 << 15) - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - audio = (audio * (1 << 15)).short() - else: - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - - ts = time.time() - f = io.BytesIO() - sox.save(f, audio, resample, format="wav", bits_per_sample=16) - # Save to wav for segments file - suffix = "wav" - f.seek(0) - data = f.read() - save_time += (time.time() - ts) - - assert isinstance(txt, str) - ts = time.time() - txt_file = key + '.txt' - txt = txt.encode('utf8') - txt_data = io.BytesIO(txt) - txt_info = tarfile.TarInfo(txt_file) - txt_info.size = len(txt) - tar.addfile(txt_info, txt_data) - - wav_file = key + '.' 
+ suffix - wav_data = io.BytesIO(data) - wav_info = tarfile.TarInfo(wav_file) - wav_info.size = len(data) - tar.addfile(wav_info, wav_data) - write_time += (time.time() - ts) - logging.info('read {} save {} write {}'.format(read_time, save_time, - write_time)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--num_utts_per_shard', - type=int, - default=1000, - help='num utts per shard') - parser.add_argument('--num_threads', - type=int, - default=1, - help='num threads for make shards') - parser.add_argument('--prefix', - default='shards', - help='prefix of shards tar file') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('--resample', - type=int, - default=16000, - help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('shards_dir', help='output shards dir') - parser.add_argument('shards_list', help='output shards list file') - args = parser.parse_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - torch.set_num_threads(1) - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - no_segments = True - segments_table = {} - if args.segments is not None: - no_segments = False - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - data = [] - with open(args.text_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if no_segments: - assert key in wav_table - wav = wav_table[key] - data.append((key, txt, wav)) - else: - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - data.append((key, txt, wav, start, end)) - - num = args.num_utts_per_shard - chunks = [data[i:i + num] for i in range(0, len(data), num)] - os.makedirs(args.shards_dir, exist_ok=True) - - # Using thread pool to speedup - pool = multiprocessing.Pool(processes=args.num_threads) - shards_list = [] - tasks_list = [] - num_chunks = len(chunks) - for i, chunk in enumerate(chunks): - tar_file = os.path.join(args.shards_dir, - '{}_{:09d}.tar'.format(args.prefix, i)) - shards_list.append(tar_file) - pool.apply_async( - write_tar_file, - (chunk, no_segments, tar_file, args.resample, i, num_chunks)) - - pool.close() - pool.join() - - with open(args.shards_list, 'w', encoding='utf8') as fout: - for name in shards_list: - fout.write(name + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/merge_scp2txt.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/merge_scp2txt.py deleted file mode 100644 index 51f1c42f272f0fd9fec0a7d69ee860d2f1eb6158..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/merge_scp2txt.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -from distutils.util import strtobool -from io import open -import logging -import sys - -PY2 = sys.version_info[0] == 2 -sys.stdin = codecs.getreader('utf-8')(sys.stdin if PY2 else 
sys.stdin.buffer) -sys.stdout = codecs.getwriter('utf-8')( - sys.stdout if PY2 else sys.stdout.buffer) - - -# Special types: -def shape(x): - """Change str to List[int] - - >>> shape('3,5') - [3, 5] - >>> shape(' [3, 5] ') - [3, 5] - - """ - - # x: ' [3, 5] ' -> '3, 5' - x = x.strip() - if x[0] == '[': - x = x[1:] - if x[-1] == ']': - x = x[:-1] - - return list(map(int, x.split(','))) - - -def get_parser(): - parser = argparse.ArgumentParser( - description='Given each file paths with such format as ' - '::. type> can be omitted and the default ' - 'is "str". e.g. {} ' - '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' - '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' - '--output-scps text:data/text shape:data/utt2text_shape:shape ' - '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--input-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the inputs') - parser.add_argument('--output-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the outputs') - parser.add_argument('--scps', - type=str, - nargs='+', - default=[], - help='The files except for the input and outputs') - parser.add_argument('--verbose', - '-V', - default=1, - type=int, - help='Verbose option') - parser.add_argument('--allow-one-column', - type=strtobool, - default=False, - help='Allow one column in input scp files. ' - 'In this case, the value will be empty string.') - parser.add_argument('--out', - '-O', - type=str, - help='The output filename. ' - 'If omitted, then output to sys.stdout') - return parser - - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - args.scps = [args.scps] - - # logging info - logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - if args.verbose > 0: - logging.basicConfig(level=logging.INFO, format=logfmt) - else: - logging.basicConfig(level=logging.WARN, format=logfmt) - - inputs = {} - assert (len(args.input_scps) == 1) - for f in args.input_scps[0]: - arr = f.strip().split(':') - inputs[arr[0]] = arr[1] - assert ('feat' in inputs) - assert ('shape' in inputs) - - outputs = {} - assert (len(args.output_scps) == 1) - for f in args.output_scps[0]: - arr = f.strip().split(':') - outputs[arr[0]] = arr[1] - assert ('shape' in outputs) - assert ('text' in outputs) - assert ('token' in outputs) - assert ('tokenid' in outputs) - - files = [ - inputs['feat'], inputs['shape'], outputs['text'], outputs['token'], - outputs['tokenid'], outputs['shape'] - ] - fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape'] - fids = [open(f, 'r', encoding='utf-8') for f in files] - - if args.out is None: - out = sys.stdout - else: - out = open(args.out, 'w', encoding='utf-8') - done = False - while not done: - for i, fid in enumerate(fids): - line = fid.readline() - if line == '': - done = True - break - arr = line.strip().split() - content = ' '.join(arr[1:]) - if i == 0: - out.write('utt:{}'.format(arr[0])) - out.write('\t') - out.write('{}:{}'.format(fields[i], content)) - out.write('\n') - - for f in fids: - f.close() - if args.out is not None: - out.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/onnx2horizonbin.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/onnx2horizonbin.py deleted file mode 100644 index 
a94b647fb19d1446d4bc506c399c85677dddde9f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/onnx2horizonbin.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. specific decoding method: ctc_greedy_search -""" - -import argparse -import copy -import logging -import os -import sys -import random -import torch -import yaml -import numpy as np - -from torch.utils.data import DataLoader - -from wenet.utils.common import remove_duplicates_and_blank -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import to_numpy -from wenet.bin.export_onnx_bpu import export_encoder, export_ctc - - -try: - import hbdk # noqa: F401 - import horizon_nn # noqa: F401 - from horizon_tc_ui import HB_ONNXRuntime -except ImportError: - print('Please install hbdk,horizon_nn,horizon_tc_ui !') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def save_data(tensor, dirs, prefix): - if tensor.requires_grad: - data = tensor.detach().numpy().astype(np.float32) - else: - data = tensor.numpy().astype(np.float32) - os.makedirs(dirs, exist_ok=True) - data.tofile(dirs + "/" + prefix + ".bin") - - -def make_calibration_data(enc, args, conf): - conf['shuffle'] = True - logger.info(conf) - dataset = Dataset( - "shard", args.cali_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - cal_data_dir = os.path.join(args.output_dir, 
'cal_data_dir') - for batch_idx, batch in enumerate(dataloader): - if batch_idx >= args.max_samples: - break - if batch_idx % 100 == 0: - logger.info("processed {} samples.".format(batch_idx)) - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - - # Feed forward overlap input step by step - random_high = (num_frames - context) // stride - num_rand = random.randint(0, random_high) - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - if i == num_rand: - save_data(chunk, "{}/chunk".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_cache, "{}/att_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(cnn_cache, "{}/cnn_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_mask, "{}/att_mask".format(cal_data_dir), - prefix + "." + str(i)) - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - # NOTE(xcsong): It's fast to calibrate ctc.onnx, - # so it's okay to save all chunks - save_data(y, "{}/hidden".format(cal_data_dir), - prefix + "." 
+ str(i)) - - -def check_wer(enc, ctc, args, conf): - conf['shuffle'] = False - dataset = Dataset( - "shard", args.wer_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - char_dict = {v: k for k, v in args.symbol_table.items()} - eos = len(char_dict) - 1 - - enc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_encoder/encoder_quantized_model.onnx") - ctc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_ctc/ctc_quantized_model.onnx") - torch_file = open(args.output_dir + "/torch_text", 'w', encoding="utf-8") - onnx_file = open(args.output_dir + "/onnx_text", 'w', encoding="utf-8") - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - for batch_idx, batch in enumerate(dataloader): - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - - # Feed forward overlap input step by step - torch_out, onnx_out = [], [] - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - # Torch model - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - torch_out.append(ctc.forward(y).transpose(1, 3).squeeze(2)) - # Quantized onnx model - ort_inputs = { - 'chunk': to_numpy(chunk), 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': to_numpy(att_mask)} - ort_outs = enc_session.run_feature( - enc_session.output_names, ort_inputs, input_offset=0) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_y = ctc_session.run_feature( - ctc_session.output_names, {'hidden': ort_outs[0]}, input_offset=0) - onnx_out.append(torch.from_numpy( - np.squeeze(onnx_y[0].transpose(0, 3, 2, 1), axis=2))) - - def post_process(list_out, file_obj, keys): - probs = torch.cat(list_out, dim=1) - maxlen = probs.size(1) - topk_prob, topk_index = probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - hyps = 
[hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - for i, key in enumerate(keys): - content = '' - for w in hyps[i]: - if w == eos: - break - content += char_dict[w] - file_obj.write('{} {}\n'.format(key, content)) - return key, content - - if len(torch_out) > 0 and len(onnx_out) > 0: - key, content = post_process(torch_out, torch_file, keys) - logger.info('torch: {} {}'.format(key, content)) - key, content = post_process(onnx_out, onnx_file, keys) - logger.info('onnx : {} {}'.format(key, content)) - torch_file.close() - onnx_file.close() - - -def generate_config(enc_session, ctc_session, args): - template = """ -# 模型参数组 -model_parameters: - # 原始Onnx浮点模型文件 - onnx_model: '{}' - # 转换的目标AI芯片架构 - march: 'bernoulli2' - # 模型转换输出的用于上板执行的模型文件的名称前缀 - output_model_file_prefix: '{}' - # 模型转换输出的结果的存放目录 - working_dir: '{}' - # 指定转换后混合异构模型是否保留输出各层的中间结果的能力 - layer_out_dump: False - # 转换过程中日志生成级别 - log_level: 'debug' -# 输入信息参数组 -input_parameters: - # 原始浮点模型的输入节点名称 - input_name: '{}' - # 原始浮点模型的输入数据格式(数量/顺序与input_name一致) - input_type_train: '{}' - # 原始浮点模型的输入数据排布(数量/顺序与input_name一致) - input_layout_train: '{}' - # 原始浮点模型的输入数据尺寸 - input_shape: '{}' - # 网络实际执行时,输入给网络的batch_size 默认值为1 - # input_batch: 1 - # 在模型中添加的输入数据预处理方法 - norm_type: '{}' - # 预处理方法的图像减去的均值; 如果是通道均值,value之间必须用空格分隔 - # mean_value: '' - # 预处理方法的图像缩放比例,如果是通道缩放比例,value之间必须用空格分隔 - # scale_value: '' - # 转换后混合异构模型需要适配的输入数据格式(数量/顺序与input_name一致) - input_type_rt: '{}' - # 输入数据格式的特殊制式 - input_space_and_range: '' - # 转换后混合异构模型需要适配的输入数据排布(数量/顺序与input_name一致) - input_layout_rt: '{}' -# 校准参数组 -calibration_parameters: - # 模型校准使用的标定样本的存放目录 - cal_data_dir: '{}' - # 开启图片校准样本自动处理(skimage read resize到输入节点尺寸) - preprocess_on: False - # 校准使用的算法类型 - calibration_type: '{}' - # max 校准方式的参数 - max_percentile: 1.0 - # 强制指定OP在CPU上运行 - run_on_cpu: '{}' - # 强制指定OP在BPU上运行 - run_on_bpu: '{}' -# 编译参数组 -compiler_parameters: - # 编译策略选择 - compile_mode: 'latency' - # 是否打开编译的debug信息 - debug: False - # 模型运行核心数 - core_num: 1 - # 模型编译的优化等级选择 - optimize_level: 'O3' -""" - output_dir = os.path.realpath(args.output_dir) - cal_data_dir = os.path.join(output_dir, 'cal_data_dir') - os.makedirs(cal_data_dir, exist_ok=True) - enc_dic = enc_session.get_modelmeta().custom_metadata_map - enc_onnx_path = os.path.join(output_dir, 'encoder.onnx') - enc_log_path = os.path.join(output_dir, 'hb_makertbin_output_encoder') - enc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in enc_dic['input_name'].split(';')]) - ctc_dic = ctc_session.get_modelmeta().custom_metadata_map - ctc_onnx_path = os.path.join(output_dir, 'ctc.onnx') - ctc_log_path = os.path.join(output_dir, 'hb_makertbin_output_ctc') - ctc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in ctc_dic['input_name'].split(';')]) - enc_config = template.format( - enc_onnx_path, "encoder", enc_log_path, - enc_dic['input_name'], enc_dic['input_type'], - enc_dic['input_layout_train'], enc_dic['input_shape'], - enc_dic['norm_type'], enc_dic['input_type'], enc_dic['input_layout_rt'], - enc_cal_data, args.calibration_type, args.extra_ops_run_on_cpu, "") - ctc_config = template.format( - ctc_onnx_path, "ctc", ctc_log_path, - ctc_dic['input_name'], ctc_dic['input_type'], - ctc_dic['input_layout_train'], ctc_dic['input_shape'], - ctc_dic['norm_type'], ctc_dic['input_type'], ctc_dic['input_layout_rt'], - ctc_cal_data, "default", "", "") - with open(output_dir + "/config_encoder.yaml", "w") as enc_yaml: - enc_yaml.write(enc_config) - with open(output_dir + 
"/config_ctc.yaml", "w") as ctc_yaml: - ctc_yaml.write(ctc_config) - - -def get_args(): - parser = argparse.ArgumentParser(description='convert onnx to horizon .bin') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - parser.add_argument('--dict', type=str, required=True, help='dict file') - parser.add_argument('--max_samples', type=int, required=True, - help='maximum samples') - parser.add_argument('--cali_datalist', type=str, default=None, - help='make calibration data') - parser.add_argument('--wer_datalist', type=str, default=None, - help='check wer') - parser.add_argument('--wer_text', type=str, default=None, - help='check wer') - parser.add_argument('--bpe_model', default=None, type=str, - help='bpe model for english part') - parser.add_argument('--ln_run_on_bpu', action='store_true', - help='layernorm running on bpu') - parser.add_argument('--extra_ops_run_on_cpu', type=str, default=None, - help='extra operations running on cpu.') - parser.add_argument('--calibration_type', type=str, default='default', - help='kl / max / default.') - return parser - - -if __name__ == '__main__': - random.seed(777) - parser = get_args() - args = parser.parse_args() - # NOTE(xcsong): X3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(conf) - load_checkpoint(model, args.checkpoint) - model.eval() - - symbol_table = read_symbol_table(args.dict) - args.symbol_table = symbol_table - args.feature_size = conf['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - logger.info("Stage-1: Export onnx") - enc, enc_session = export_encoder(model, args) - ctc, ctc_session = export_ctc(model, args) - - conf = copy.deepcopy(conf['dataset_conf']) - conf['filter_conf']['max_length'] = 102400 - conf['filter_conf']['min_length'] = 0 - conf['filter_conf']['token_max_length'] = 102400 - conf['filter_conf']['token_min_length'] = 0 - conf['filter_conf']['max_output_input_ratio'] = 102400 - conf['filter_conf']['min_output_input_ratio'] = 0 - conf['speed_perturb'] = False - conf['spec_aug'] = False - conf['spec_sub'] = False - conf['spec_trim'] = False - conf['shuffle'] = False - conf['sort'] = False - if 'fbank_conf' in conf: - conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in conf: - conf['mfcc_conf']['dither'] = 0.0 - conf['batch_conf']['batch_type'] = "static" - conf['batch_conf']['batch_size'] = 1 - - if args.cali_datalist is not None: - logger.info("Stage-2: Generate config") - # FIXME(xcsong): Remove hard code - logger.info("torch version: {}".format(torch.__version__)) - if int(torch.__version__[:4].split('.')[1]) >= 13: - args.extra_ops_run_on_cpu = "/Split;" + \ - "/encoders.0/self_attn/Split;/encoders.1/self_attn/Split;" + \ - 
"/encoders.2/self_attn/Split;/encoders.3/self_attn/Split;" + \ - "/encoders.4/self_attn/Split;/encoders.5/self_attn/Split;" + \ - "/encoders.6/self_attn/Split;/encoders.7/self_attn/Split;" + \ - "/encoders.8/self_attn/Split;/encoders.9/self_attn/Split;" + \ - "/encoders.10/self_attn/Split;/encoders.11/self_attn/Split;" + \ - "/encoders.0/self_attn/Mul;/encoders.1/self_attn/Mul;" + \ - "/encoders.2/self_attn/Mul;/encoders.3/self_attn/Mul;" + \ - "/encoders.4/self_attn/Mul;/encoders.5/self_attn/Mul;" + \ - "/encoders.6/self_attn/Mul;/encoders.7/self_attn/Mul;" + \ - "/encoders.8/self_attn/Mul;/encoders.9/self_attn/Mul;" + \ - "/encoders.10/self_attn/Mul;/encoders.11/self_attn/Mul;" - else: - args.extra_ops_run_on_cpu = "Split_17;Split_67;Split_209;" + \ - "Split_351;Split_493;Split_635;Split_777;Split_919;Split_1061;" + \ - "Split_1203;Split_1345;Split_1487;Split_1629;" + \ - "Mul_72;Mul_214;Mul_356;Mul_498;Mul_640;Mul_782;" + \ - "Mul_924;Mul_1066;Mul_1208;Mul_1350;Mul_1492;Mul_1634;" - generate_config(enc_session, ctc_session, args) - - logger.info("Stage-3: Make calibration data") - make_calibration_data(enc, args, conf) - - output_dir = os.path.realpath(args.output_dir) - logger.info("Stage-4: Make ctc.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_ctc".format(output_dir) + - " && cd hb_makertbin_log_ctc &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_ctc.yaml") - ) - logger.info("Stage-5: Make encoder.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_encoder ".format(output_dir) + - " && cd hb_makertbin_log_encoder &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_encoder.yaml") - ) - - if args.wer_datalist is not None: - logger.info("Stage-6: Check wer between torch model and quantized onnx") - assert args.wer_text is not None - check_wer(enc, ctc, args, conf) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/torch_text", - args.output_dir + "/torch_wer") - ) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/onnx_text", - args.output_dir + "/onnx_wer") - ) - os.system("tail {} {}".format( - args.output_dir + "/torch_wer", args.output_dir + "/onnx_wer")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/parse_options.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/parse_options.sh deleted file mode 100644 index 34476fdb37a4b14d5fe6e0edbebe97e760d2be5a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- - -# Parse command-line options. -# To be sourced by another script (as in ". parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/perturb_data_dir_speed.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/perturb_data_dir_speed.sh deleted file mode 100644 index 901a4882e6481ae269067b0fe7175dba62c4db9e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/perturb_data_dir_speed.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# 2020 @kamo-naoyuki -# This file was copied from Kaldi and -# I deleted parts related to wav duration -# because we shouldn't use kaldi's command here -# and we don't need the files actually. 
- -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 2014 Tom Ko -# 2018 Emotech LTD (author: Pawel Swietojanski) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# wav.scp -# spk2utt -# utt2spk -# text -# -# It generates the files which are used for perturbing the speed of the original data. - -export LC_ALL=C -set -euo pipefail - -if [[ $# != 3 ]]; then - echo "Usage: perturb_data_dir_speed.sh " - echo "e.g.:" - echo " $0 0.9 data/train_si284 data/train_si284p" - exit 1 -fi - -factor=$1 -srcdir=$2 -destdir=$3 -label="sp" -spk_prefix="${label}${factor}-" -utt_prefix="${label}${factor}-" - -#check is sox on the path - -! command -v sox &>/dev/null && echo "sox: command not found" && exit 1; - -if [[ ! -f ${srcdir}/utt2spk ]]; then - echo "$0: no such file ${srcdir}/utt2spk" - exit 1; -fi - -if [[ ${destdir} == "${srcdir}" ]]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -mkdir -p "${destdir}" - -<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map" -<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map" -<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map" -if [[ ! -f ${srcdir}/utt2uniq ]]; then - <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq" -else - <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq" -fi - - -<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \ - utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk - -utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt - -if [[ -f ${srcdir}/segments ]]; then - - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \ - utils/apply_map.pl -f 2 "${destdir}"/reco_map | \ - awk -v factor="${factor}" \ - '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \ - >"${destdir}"/segments - - utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - if [[ -f ${srcdir}/reco2file_and_channel ]]; then - utils/apply_map.pl -f 1 "${destdir}"/reco_map \ - <"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel - fi - -else # no segments->wav indexed by utterance. 
- if [[ -f ${srcdir}/wav.scp ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - fi -fi - -if [[ -f ${srcdir}/text ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text -fi -if [[ -f ${srcdir}/spk2gender ]]; then - utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender -fi -if [[ -f ${srcdir}/utt2lang ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang -fi - -rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null -echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}" - -utils/validate_data_dir.sh --no-feats --no-text "${destdir}" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/reduce_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/reduce_data_dir.sh deleted file mode 100644 index 16194dcc7309a646041181a698c53cd4f46e618b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/reduce_data_dir.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# koried, 10/29/2012 - -# Reduce a data set based on a list of turn-ids - -help_message="usage: $0 srcdir turnlist destdir" - -if [ $1 == "--help" ]; then - echo "${help_message}" - exit 0; -fi - -if [ $# != 3 ]; then - echo "${help_message}" - exit 1; -fi - -srcdir=$1 -reclist=$2 -destdir=$3 - -if [ ! -f ${srcdir}/utt2spk ]; then -echo "$0: no such file $srcdir/utt2spk" -exit 1; -fi - -function do_filtering { -# assumes the utt2spk and spk2utt files already exist. - [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp - [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text - [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames - [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender - [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp - if [ -f ${srcdir}/segments ]; then - utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments - awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. 
- [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/reco2file_and_channel ] && \ - utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel - - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm - rm ${destdir}/reco - fi - srcutts=$(wc -l < ${srcdir}/utt2spk) - destutts=$(wc -l < ${destdir}/utt2spk) - echo "Reduced #utt from $srcutts to $destutts" -} - -mkdir -p ${destdir} - -# filter the utt2spk based on the set of recordings -utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk - -utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt -do_filtering; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/remove_longshortdata.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/remove_longshortdata.py deleted file mode 100644 index 7e92f8a424d2d717acf6fc1db5503f79ba38a898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/remove_longshortdata.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='remove too long or too short data in format.data') - parser.add_argument('--data_file', - type=str, - help='input format data') - parser.add_argument('--output_data_file', - type=str, - help='output format data') - parser.add_argument('--min_input_len', type=float, - default=0, - help='minimum input seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--max_input_len', type=float, - default=20, - help='maximum output seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--min_output_len', type=float, - default=0, help='minimum input seq length, in modeling units') - parser.add_argument('--max_output_len', type=float, - default=500, - help='maximum output seq length, in modeling units') - parser.add_argument('--min_output_input_ratio', type=float, default=0.05, - help='minimum output seq length/output seq length ratio') - parser.add_argument('--max_output_input_ratio', type=float, default=10, - help='maximum output seq length/output seq length ratio') - args = parser.parse_args() - - data_file = args.data_file - output_data_file = args.output_data_file - min_input_len = args.min_input_len - max_input_len = args.max_input_len - min_output_len = args.min_output_len - max_output_len = args.max_output_len - min_output_input_ratio = args.min_output_input_ratio - max_output_input_ratio = args.max_output_input_ratio - - with open(data_file, 'r') as f, open(output_data_file, 'w') as fout: - for l in f: - l = l.strip() - if l: - items = l.strip().split('\t') - token_shape = items[6] - feature_shape = items[2] - feat_len = float(feature_shape.split(':')[1].split(',')[0]) - token_len = float(token_shape.split(':')[1].split(',')[0]) - condition = [feat_len > min_input_len, - feat_len < max_input_len, - token_len > min_output_len, - token_len < max_output_len, - token_len / feat_len > min_output_input_ratio, - token_len / feat_len < max_output_input_ratio, - ] - if all(condition): - fout.write('{}\n'.format(l)) - continue diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/segment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/segment.py deleted file mode 100644 index a1a7f93a05fbaf42ca09c26c0e5be6a7185f0d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/segment.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2021 Mobvoi Inc. (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='generate segmented wav.scp') - parser.add_argument('--segments', required=True, help='segments file') - parser.add_argument('--input', - required=True, - help='origin wav.scp that not segmented') - parser.add_argument('--output', - required=True, - help='output segmented wav.scp') - wav_dic = {} - args = parser.parse_args() - ori_wav = args.input - segment_file = args.segments - wav_scp = args.output - with open(ori_wav, 'r') as ori: - for l in ori: - item = l.strip().split() - wav_dic[item[0]] = item[1] - with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement: - for l in sgement: - item = l.strip().split() - if item[1] in wav_dic: - item[1] = wav_dic[item[1]] - f.write("{} {},{},{}\n".format(item[0], item[1], item[2], item[3])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/setup_anaconda.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/setup_anaconda.sh deleted file mode 100644 index f53ace9cc4c19994fc79d01e85d70f49d40d673f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/setup_anaconda.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env bash -# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet) -set -euo pipefail - -if [ -z "${PS1:-}" ]; then - PS1=__dummy__ -fi -CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - -if [ $# -gt 4 ]; then - echo "Usage: $0 [output] [conda-env-name] [python-version>]" - exit 1; -elif [ $# -eq 3 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="$3" -elif [ $# -eq 2 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="" -elif [ $# -eq 1 ]; then - output_dir="$1" - name="" - PYTHON_VERSION="" -elif [ $# -eq 0 ]; then - output_dir=venv - name="" - PYTHON_VERSION="" -fi - -if [ -e activate_python.sh ]; then - echo "Warning: activate_python.sh already exists. It will be overwritten" -fi - -if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then - if [ ! -e miniconda.sh ]; then - wget --tries=3 "${CONDA_URL}" -O miniconda.sh - fi - - bash miniconda.sh -b -p "${output_dir}" -fi - -# shellcheck disable=SC1090 -source "${output_dir}/etc/profile.d/conda.sh" -conda deactivate - -# If the env already exists, skip recreation -if [ -n "${name}" ] && ! 
conda activate ${name}; then - conda create -yn "${name}" -fi -conda activate ${name} - -if [ -n "${PYTHON_VERSION}" ]; then - conda install -y conda "python=${PYTHON_VERSION}" -else - conda install -y conda -fi - -conda install -y pip setuptools - -cat << EOF > activate_python.sh -#!/usr/bin/env bash -# THIS FILE IS GENERATED BY tools/setup_anaconda.sh -if [ -z "\${PS1:-}" ]; then - PS1=__dummy__ -fi -. $(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name} -EOF diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/sph2wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/sph2wav.sh deleted file mode 100644 index a8f0749e3be2ee69b5831da6699c303510ecbed4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/sph2wav.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -# convert sph scp to segmented wav scp -nj=1 -. tools/parse_options.sh || exit 1; - -inscp=$1 -segments=$2 -outscp=$3 -data=$(dirname ${inscp}) -if [ $# -eq 4 ]; then - logdir=$4 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe -[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; -sox=`which sox` -[ ! 
-x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2); - printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \ - sort > $data/wav_ori.scp || exit 1; - -tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp -sed -i 's/ /,/g' $data/wav_segments.scp -sed -i 's/#/ /g' $data/wav_segments.scp - -rm -f $logdir/wav_*.slice -rm -f $logdir/*.log -split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - mkdir -p ${data}/wavs/${name} - cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \ - -v logdir=$logdir -v name=$name '{ - during=$4-$3 - cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during; - system(cmd) - printf("%s %s/%s.wav\n", $1, data, $1); - }' | \ - sort > ${data}/wavs_${name}.scp || exit 1; -} & -done -wait -cat ${data}/wavs_*.scp > $outscp -rm ${data}/wavs_*.scp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/spk2utt_to_utt2spk.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/spk2utt_to_utt2spk.pl deleted file mode 100644 index 19fb89d501146e360912863d847d6eabb0194511..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/spm_decode b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/spm_decode deleted file mode 100644 index 882b4f966013d7708460f8d41696583ae59f8fa9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/spm_decode +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for decoding") - parser.add_argument("--input", default=None, help="input file to decode") - parser.add_argument("--input_format", choices=["piece", "id"], default="piece") - args = parser.parse_args() - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.input_format == "piece": - def decode(l): - return "".join(sp.DecodePieces(l)) - elif args.input_format == "id": - def decode(l): - return "".join(sp.DecodeIds(l)) - else: - raise NotImplementedError - - def tok2int(tok): - # remap reference-side (represented as <>) to 0 - return int(tok) if tok != "<>" else 0 - - if args.input is None: - h = sys.stdin - else: - h = open(args.input, "r", encoding="utf-8") - for line in h: - print(decode(line.split())) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/spm_encode b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/spm_encode deleted file mode 100644 index 4dd2e1004f9fe393c2d34b43bade881b84a31b1f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/spm_encode +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import contextlib -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for encoding") - parser.add_argument("--inputs", nargs="+", default=['-'], - help="input files to filter/encode") - parser.add_argument("--outputs", nargs="+", default=['-'], - help="path to save encoded outputs") - parser.add_argument("--output_format", choices=["piece", "id"], default="piece") - parser.add_argument("--min-len", type=int, metavar="N", - help="filter sentence pairs with fewer than N tokens") - parser.add_argument("--max-len", type=int, metavar="N", - help="filter sentence pairs with more than N tokens") - args = parser.parse_args() - - assert len(args.inputs) == len(args.outputs), \ - "number of input and output paths should match" - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.output_format == "piece": - def encode(l): - return sp.EncodeAsPieces(l) - elif args.output_format == "id": - def encode(l): - return list(map(str, sp.EncodeAsIds(l))) - else: - raise NotImplementedError - - if args.min_len is not None or args.max_len is not None: - def valid(line): - return ( - (args.min_len is None or len(line) >= args.min_len) and - (args.max_len is None or len(line) <= args.max_len) - ) - else: - def valid(lines): - return True - - with contextlib.ExitStack() as stack: - inputs = [ - stack.enter_context(open(input, "r", encoding="utf-8")) - if input != "-" else sys.stdin - for input in args.inputs - ] - outputs = [ - stack.enter_context(open(output, "w", 
encoding="utf-8")) - if output != "-" else sys.stdout - for output in args.outputs - ] - - stats = { - "num_empty": 0, - "num_filtered": 0, - } - - def encode_line(line): - line = line.strip() - if len(line) > 0: - line = encode(line) - if valid(line): - return line - else: - stats["num_filtered"] += 1 - else: - stats["num_empty"] += 1 - return None - - for i, lines in enumerate(zip(*inputs), start=1): - enc_lines = list(map(encode_line, lines)) - if not any(enc_line is None for enc_line in enc_lines): - for enc_line, output_h in zip(enc_lines, outputs): - print(" ".join(enc_line), file=output_h) - if i % 10000 == 0: - print("processed {} lines".format(i), file=sys.stderr) - - print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) - print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/spm_train b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/spm_train deleted file mode 100644 index 0b247aee0dc5fcaa7b6cf66d89602e896619c9bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/spm_train +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE -import sys - -import sentencepiece as spm - - -if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/subset_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/subset_data_dir.sh deleted file mode 100644 index c35bee62d8710facb8c42a9171ed3caf0171450f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/subset_data_dir.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2010-2011 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - - -# This script operates on a data directory, such as in data/train/. -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data -# for what these directories contain. - -# This script creates a subset of that data, consisting of some specified -# number of utterances. (The selected utterances are distributed evenly -# throughout the file, by the program ./subset_scp.pl). - -# There are six options, none compatible with any other. - -# If you give the --per-spk option, it will attempt to select the supplied -# number of utterances for each speaker (typically you would supply a much -# smaller number in this case). - -# If you give the --speakers option, it selects a subset of n randomly -# selected speakers. - -# If you give the --shortest option, it will give you the n shortest utterances. - -# If you give the --first option, it will just give you the n first utterances. - -# If you give the --last option, it will just give you the n last utterances. - -# If you give the --spk-list or --utt-list option, it reads the -# speakers/utterances to keep from /" (note, -# in this case there is no positional parameter; see usage message.) 
- - -shortest=false -perspk=false -speakers=false -first_opt= -spk_list= -utt_list= - -expect_args=3 -case $1 in - --first|--last) first_opt=$1; shift ;; - --per-spk) perspk=true; shift ;; - --shortest) shortest=true; shift ;; - --speakers) speakers=true; shift ;; - --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; - --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; - --*) echo "$0: invalid option '$1'"; exit 1 -esac - -if [ $# != $expect_args ]; then - echo "Usage:" - echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] " - echo " subset_data_dir.sh [--spk-list ] " - echo " subset_data_dir.sh [--utt-list ] " - echo "By default, randomly selects utterances from the data directory." - echo "With --speakers, randomly selects enough speakers that we have utterances" - echo "With --per-spk, selects utterances per speaker, if available." - echo "With --first, selects the first utterances" - echo "With --last, selects the last utterances" - echo "With --shortest, selects the shortest utterances." - echo "With --spk-list, reads the speakers to keep from " - echo "With --utt-list, reads the utterances to keep from " - exit 1; -fi - -srcdir=$1 -if [[ $spk_list || $utt_list ]]; then - numutt= - destdir=$2 -else - numutt=$2 - destdir=$3 -fi - -export LC_ALL=C - -if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" - exit 1 -fi - -if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then - echo "$0: cannot subset to more utterances than you originally had." - exit 1 -fi - -if $shortest && [ ! -f $srcdir/feats.scp ]; then - echo "$0: you selected --shortest but no feats.scp exist." - exit 1 -fi - -mkdir -p $destdir || exit 1 - -if [[ $spk_list ]]; then - tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; -elif [[ $utt_list ]]; then - tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; -elif $speakers; then - tools/shuffle_list.pl < $srcdir/spk2utt | - awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | - sort > $destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -elif $perspk; then - awk '{ n='$numutt'; printf("%s ",$1); - skip=1; while(n*(skip+1) <= NF-1) { skip++; } - for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); } - printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -else - if $shortest; then - # Select $numutt shortest utterances. - . ./path.sh - feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; - sort -n -k2 $destdir/tmp.len | - awk '{print $1}' | - head -$numutt >$destdir/tmp.uttlist - tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk - rm $destdir/tmp.uttlist $destdir/tmp.len - else - # Select $numutt random utterances. - tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; - fi - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt -fi - -# Perform filtering. utt2spk and spk2utt files already exist by this point. -# Filter by utterance. 
-[ -f $srcdir/feats.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp -[ -f $srcdir/vad.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp -[ -f $srcdir/utt2lang ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang -[ -f $srcdir/utt2dur ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur -[ -f $srcdir/utt2num_frames ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames -[ -f $srcdir/utt2uniq ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq -[ -f $srcdir/wav.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp -[ -f $srcdir/utt2warp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp -[ -f $srcdir/text ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - -# Filter by speaker. -[ -f $srcdir/spk2warp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp -[ -f $srcdir/spk2gender ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender -[ -f $srcdir/cmvn.scp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - -# Filter by recording-id. -if [ -f $srcdir/segments ]; then - tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - # Recording-ids are in segments. - awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco - # The next line overrides the command above for wav.scp, which would be incorrect. - #[ -f $srcdir/wav.scp ] && - # tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp -else - # No segments; recording-ids are in wav.scp. - awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco -fi - -[ -f $srcdir/reco2file_and_channel ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel -[ -f $srcdir/reco2dur ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur - -# Filter the STM file for proper sclite scoring. -# Copy over the comments from STM file. -[ -f $srcdir/stm ] && - (grep "^;;" $srcdir/stm - tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm - -rm $destdir/reco - -# Copy frame_shift if present. -[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir - -srcutts=$(wc -l <$srcdir/utt2spk) -destutts=$(wc -l <$destdir/utt2spk) -echo "$0: reducing #utt from $srcutts to $destutts" -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/subset_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/subset_scp.pl deleted file mode 100644 index 11fddc09a0f4e5fad8e5d63cf65e7e5e627e4af6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/subset_scp.pl +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This program selects a subset of N elements in the scp. - -# By default, it selects them evenly from throughout the scp, in order to avoid -# selecting too many from the same speaker. It prints them on the standard -# output. -# With the option --first, it just selects the N first utterances. -# With the option --last, it just selects the N last utterances. - -# Last modified by JHU & HKUST @2013 - - -$quiet = 0; -$first = 0; -$last = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--quiet") { - shift; - $quiet = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--first") { - shift; - $first = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--last") { - shift; - $last = 1; -} - -if(@ARGV < 2 ) { - die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . - " --quiet causes it to not die if N < num lines in scp.\n" . - " --first and --last make it equivalent to head or tail.\n" . - "See also: filter_scp.pl\n"; -} - -$N = shift @ARGV; -if($N == 0) { - die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; -} -$inscp = shift @ARGV; -open(I, "<$inscp") || die "Opening input scp file $inscp"; - -@F = (); -while() { - push @F, $_; -} -$numlines = @F; -if($N > $numlines) { - if ($quiet) { - $N = $numlines; - } else { - die "You requested from subset_scp.pl more elements than available: $N > $numlines"; - } -} - -sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if ($num_needed > $diff) { - die "select_n: code error"; - } - if ($diff == 1 ) { - if ($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); - } -} - -if ( ! $first && ! $last) { - if ($N > 0) { - select_n(0, $numlines, $N); - } -} else { - if ($first) { # --first option: same as head. - for ($n = 0; $n < $N; $n++) { - print $F[$n]; - } - } else { # --last option: same as tail. - for ($n = @F - $N; $n < @F; $n++) { - print $F[$n]; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/sym2int.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/sym2int.pl deleted file mode 100644 index cec097b6bdaefb5c3452e31fa334f0a7530b9a72..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/sym2int.pl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; - -for($x = 0; $x < 2; $x++) { - if ($ARGV[0] eq "--map-oov") { - shift @ARGV; - $map_oov = shift @ARGV; - if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { - # disallow '-f', the empty string and anything ending in words.txt as the - # OOV symbol because these are likely command-line errors. - die "the --map-oov option requires an argument"; - } - } - if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } - } -} - -$symtab = shift @ARGV; -if (!defined $symtab) { - print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . - "options: [--map-oov ] [-f ]\n" . - "note: can look like 4-5, or 4-, or 5-, or 1.\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up - if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } - $map_oov = $sym2int{$map_oov}; -} - -$num_warning = 0; -$max_warning = 20; - -while (<>) { - @A = split(" ", $_); - @B = (); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ( (!defined $field_begin || $n >= $field_begin) - && (!defined $field_end || $n <= $field_end)) { - $i = $sym2int{$a}; - if (!defined ($i)) { - if (defined $map_oov) { - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $map_oov; - } else { - $pos = $n+1; - die "sym2int.pl: undefined symbol $a (in position $pos)\n"; - } - } - $a = $i; - } - push @B, $a; - } - print join(" ", @B); - print "\n"; -} -if ($num_warning > 0) { - print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; -} - -exit(0); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/text2token.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/text2token.py deleted file mode 100644 index 4f4dcc901d436650695f0b80e0cf99e1e99269ee..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/text2token.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan) -# Copyright 2021 Mobvoi Inc. All Rights Reserved. 
(Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -import re -import sys - -is_python2 = sys.version_info[0] == 2 - - -def exist_or_not(i, match_pos): - start_pos = None - end_pos = None - for pos in match_pos: - if pos[0] <= i < pos[1]: - start_pos = pos[0] - end_pos = pos[1] - break - - return start_pos, end_pos - -def seg_char(sent): - pattern = re.compile(r'([\u4e00-\u9fa5])') - chars = pattern.split(sent) - chars = [w for w in chars if len(w.strip()) > 0] - return chars - -def get_parser(): - parser = argparse.ArgumentParser( - description='convert raw text to tokenized text', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--nchar', - '-n', - default=1, - type=int, - help='number of characters to split, i.e., \ - aabb -> a a b b with -n 1 and aa bb with -n 2') - parser.add_argument('--skip-ncols', - '-s', - default=0, - type=int, - help='skip first n columns') - parser.add_argument('--space', - default='', - type=str, - help='space symbol') - parser.add_argument('--bpe-model', - '-m', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--non-lang-syms', - '-l', - default=None, - type=str, - help='list of non-linguistic symobles,' - ' e.g., etc.') - parser.add_argument('text', - type=str, - default=False, - nargs='?', - help='input text') - parser.add_argument('--trans_type', - '-t', - type=str, - default="char", - choices=["char", "phn", "cn_char_en_bpe"], - help="""Transcript type. char/phn. e.g., for TIMIT - FADG0_SI1279 - - If trans_type is char, read from - SI1279.WRD file -> "bricks are an alternative" - Else if trans_type is phn, - read from SI1279.PHN file -> - "sil b r ih sil k s aa r er n aa l - sil t er n ih sil t ih v sil" """) - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - rs = [] - if args.non_lang_syms is not None: - with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f: - nls = [x.rstrip() for x in f.readlines()] - rs = [re.compile(re.escape(x)) for x in nls] - - if args.bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(args.bpe_model) - - if args.text: - f = codecs.open(args.text, encoding="utf-8") - else: - f = codecs.getreader("utf-8")( - sys.stdin if is_python2 else sys.stdin.buffer) - - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer) - line = f.readline() - n = args.nchar - while line: - x = line.split() - print(' '.join(x[:args.skip_ncols]), end=" ") - a = ' '.join(x[args.skip_ncols:]) - - # get all matched positions - match_pos = [] - for r in rs: - i = 0 - while i >= 0: - m = r.search(a, i) - if m: - match_pos.append([m.start(), m.end()]) - i = m.end() - else: - break - - if len(match_pos) > 0: - chars = [] - i = 0 - while i < len(a): - start_pos, end_pos = exist_or_not(i, match_pos) - if start_pos is not None: - chars.append(a[start_pos:end_pos]) - i = end_pos - else: - chars.append(a[i]) - i += 1 - a = chars - - if (args.trans_type == "phn"): - a = a.split(" ") - elif args.trans_type == "cn_char_en_bpe": - b = seg_char(a) - a = [] - for j in b: - # we use "▁" to instead of blanks among english words - # warning: here is "▁", not "_" - for l in j.strip().split("▁"): - if not l.encode('UTF-8').isalpha(): - a.append(l) - else: - for k in sp.encode_as_pieces(l): - a.append(k) - else: - a = [a[j:j + n] for j in range(0, 
len(a), n)] - - a_flat = [] - for z in a: - a_flat.append("".join(z)) - - a_chars = [z.replace(' ', args.space) for z in a_flat] - if (args.trans_type == "phn"): - a_chars = [z.replace("sil", args.space) for z in a_chars] - print(' '.join(a_chars)) - line = f.readline() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/utt2spk_to_spk2utt.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/utt2spk_to_spk2utt.pl deleted file mode 100644 index 5086699ff85fdcb8667bb9ab054700c53e35fd0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. - -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/validate_data_dir.sh deleted file mode 100644 index f4b4cbe1410111555d56380078e3d55381e7155a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/validate_data_dir.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/bin/bash - -cmd="$@" - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." - echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! 
-d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -if [ -f $data/images.scp ]; then - cmd=${cmd/--no-wav/} # remove --no-wav if supplied - image/validate_data_dir.sh $cmd - exit $? -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1; - ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - tools/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." 
- exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! 
cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! 
cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/validate_dict_dir.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/validate_dict_dir.pl deleted file mode 100644 index 819fca7f03caff91f3f24f0b69876a0bfc0abbe9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/validate_dict_dir.pl +++ /dev/null @@ -1,531 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. -# Copyright 2012 Guoguo Chen -# 2015 Daniel Povey -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for 'dict' directories (e.g. data/local/dict) - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line($.) 
$raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n"; - if ($has_invalid_whitespaces) { - print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n"; - return 0; - } else { - print "--> text contains only allowed whitespaces\n"; - } - } else { - print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n"; - } - return 1; -} - - -if(@ARGV != 1) { - die "Usage: validate_dict_dir.pl \n" . - "e.g.: validate_dict_dir.pl data/local/dict\n"; -} - -$dict = shift @ARGV; -$dict =~ s:/$::; - -$exit = 0; -$success = 1; # this is re-set each time we read a file. - -sub set_to_fail { $exit = 1; $success = 0; } - -# Checking silence_phones.txt ------------------------------- -print "Checking $dict/silence_phones.txt ...\n"; -if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} -$idx = 1; -%silence = (); -$crlf = 1; - -print "--> reading $dict/silence_phones.txt\n"; -check_allowed_whitespace(\*S) || set_to_fail(); -while() { - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; - } - foreach(0 .. 
@col-1) { - my $p = $col[$_]; - if($silence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; - } else { - $silence{$p} = 1; - } - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. - if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(S); -$success == 0 || print "--> $dict/silence_phones.txt is OK\n"; -print "\n"; - -# Checking optional_silence.txt ------------------------------- -print "Checking $dict/optional_silence.txt ...\n"; -if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;} -if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;} -$idx = 1; -$success = 1; -$crlf = 1; -print "--> reading $dict/optional_silence.txt\n"; -check_allowed_whitespace(\*OS) or exit 1; -while() { - chomp; - my @col = split(" ", $_); - if ($idx > 1 or @col > 1) { - set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; - } elsif (!$silence{$col[0]}) { - set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - $idx ++; -} -close(OS); -$success == 0 || print "--> $dict/optional_silence.txt is OK\n"; -print "\n"; - -# Checking nonsilence_phones.txt ------------------------------- -print "Checking $dict/nonsilence_phones.txt ...\n"; -if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;} -$idx = 1; -%nonsilence = (); -$success = 1; -$crlf = 1; -print "--> reading $dict/nonsilence_phones.txt\n"; -check_allowed_whitespace(\*NS) or set_to_fail(); -while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($nonsilence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; - } else { - $nonsilence{$p} = 1; - } - # phones that start with the pound sign/hash may be mistaken for - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. 
- if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(NS); -$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n"; -print "\n"; - -# Checking disjoint ------------------------------- -sub intersect { - my ($a, $b) = @_; - @itset = (); - %itset = (); - foreach(keys %$a) { - if(exists $b->{$_} and !$itset{$_}) { - push(@itset, $_); - $itset{$_} = 1; - } - } - return @itset; -} - -print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n"; -@itset = intersect(\%silence, \%nonsilence); -if(@itset == 0) {print "--> disjoint property is OK.\n";} -else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";} -print "\n"; - - -sub check_lexicon { - my ($lex, $num_prob_cols, $num_skipped_cols) = @_; - print "Checking $lex\n"; - !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail(); - my %seen_line = {}; - $idx = 1; $success = 1; $crlf = 1; - print "--> reading $lex\n"; - check_allowed_whitespace(\*L) or set_to_fail(); - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $lex contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (defined $seen_line{$_}) { - print "--> ERROR: line '$_' of $lex is repeated\n"; - set_to_fail(); - } - $seen_line{$_} = 1; - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $lex does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - $word = shift @col; - if (!defined $word) { - print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail(); - } - if ($word eq "" || $word eq "" || $word eq "" || $word eq "#0") { - print "--> ERROR: lexicon.txt contains forbidden word $word\n"; - set_to_fail(); - } - for ($n = 0; $n < $num_prob_cols; $n++) { - $prob = shift @col; - if (!($prob > 0.0 && $prob <= 1.0)) { - print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n"; - set_to_fail(); - } - } - for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; } - if (@col == 0) { - print "--> ERROR: lexicon.txt contains word $word with empty "; - print "pronunciation.\n"; - set_to_fail(); - } - foreach (0 .. @col-1) { - if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt "; - print "(line $idx)\n"; - set_to_fail(); - } - } - $idx ++; - } - close(L); - $success == 0 || print "--> $lex is OK\n"; - print "\n"; -} - -if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); } -if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); } -if (-f "$dict/lexiconp_silprob.txt") { - # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also - # exist. 
- check_lexicon("$dict/lexiconp_silprob.txt", 2, 2); - if (-f "$dict/silprob.txt") { - !open(SP, "<$dict/silprob.txt") && - print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail(); - $crlf = 1; - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - chomp; my @col = split; - @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail(); - if ($col[0] eq "" || $col[0] eq "overall") { - if (!($col[1] > 0.0 && $col[1] <= 1.0)) { - set_to_fail(); - print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n"; - } - } elsif ($col[0] eq "_s" || $col[0] eq "_n") { - if ($col[1] <= 0.0) { - set_to_fail(); - print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n"; - } - } else { - print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n"; - set_to_fail(); - } - } - close(SP); - } else { - set_to_fail(); - print "--> ERROR: expecting $dict/silprob.txt to exist\n"; - } -} - -if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) { - print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n"; - set_to_fail(); -} - -sub check_lexicon_pair { - my ($lex1, $num_prob_cols1, $num_skipped_cols1, - $lex2, $num_prob_cols2, $num_skipped_cols2) = @_; - # We have checked individual lexicons already. - open(L1, "<$lex1"); open(L2, "<$lex2"); - print "Checking lexicon pair $lex1 and $lex2\n"; - my $line_num = 0; - while() { - $line_num++; - @A = split; - $line_B = ; - if (!defined $line_B) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); last; - } - @B = split(" ", $line_B); - # Check if the word matches. - if ($A[0] ne $B[0]) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - shift @A; shift @B; - for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; } - for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; } - # Check if the pronunciation matches - if (join(" ", @A) ne join(" ", @B)) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - } - $line_B = ; - if (defined $line_B && $exit == 0) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); - } - $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n"; -} - -# If more than one lexicon exist, we have to check if they correspond to each -# other. It could be that the user overwrote one and we need to regenerate the -# other, but we do not know which is which. -if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") { - check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0); -} -if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") { - check_lexicon_pair("$dict/lexiconp.txt", 1, 0, - "$dict/lexiconp_silprob.txt", 2, 2); -} - -# Checking extra_questions.txt ------------------------------- -%distinguished = (); # Keep track of all phone-pairs including nonsilence that - # are distinguished (split apart) by extra_questions.txt, - # as $distinguished{$p1,$p2} = 1. This will be used to - # make sure that we don't have pairs of phones on the same - # line in nonsilence_phones.txt that can never be - # distinguished from each other by questions. 
(If any two - # phones appear on the same line in nonsilence_phones.txt, - # they share a tree root, and since the automatic - # question-building treats all phones that appear on the - # same line of nonsilence_phones.txt as being in the same - # group, we can never distinguish them without resorting to - # questions in extra_questions.txt. -print "Checking $dict/extra_questions.txt ...\n"; -if (-s "$dict/extra_questions.txt") { - if (!open(EX, "<$dict/extra_questions.txt")) { - set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n"; - } - $idx = 1; - $success = 1; - $crlf = 1; - print "--> reading $dict/extra_questions.txt\n"; - check_allowed_whitespace(\*EX) or set_to_fail(); - while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n"; - } - foreach (0 .. @col-1) { - if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n"; - } - $idx ++; - } - %col_hash = (); - foreach $p (@col) { $col_hash{$p} = 1; } - foreach $p1 (@col) { - # Update %distinguished hash. - foreach $p2 (keys %nonsilence) { - if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not - # in this question (and in nonsilence - # phones)... mark p1,p2 as being split apart - $distinguished{$p1,$p2} = 1; - $distinguished{$p2,$p1} = 1; - } - } - } - } - close(EX); - $success == 0 || print "--> $dict/extra_questions.txt is OK\n"; -} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";} - -if (-f "$dict/nonterminals.txt") { - open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt"; - my %nonterminals = (); - my $line_number = 1; - while () { - chop; - my @line = split(" ", $_); - if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) { - print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1; - } - $nonterminals{$line[0]} = 1; - $line_number++; - } - print "--> $dict/nonterminals.txt is OK\n"; -} - - -# check nonsilence_phones.txt again for phone-pairs that are never -# distnguishable. (note: this situation is normal and expected for silence -# phones, so we don't check it.) -if(!open(NS, "<$dict/nonsilence_phones.txt")) { - print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1; -} - -$num_warn_nosplit = 0; -$num_warn_nosplit_limit = 10; -while() { - my @col = split(" ", $_); - foreach $p1 (@col) { - foreach $p2 (@col) { - if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) { - set_to_fail(); - if ($num_warn_nosplit <= $num_warn_nosplit_limit) { - print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n"; - } - if ($num_warn_nosplit == $num_warn_nosplit_limit) { - print "... Not warning any more times about this issue.\n"; - } - if ($num_warn_nosplit == 0) { - print " (note: we started checking for this only recently. 
You can still build a system but\n"; - print " phones $p1 and $p2 will be acoustically indistinguishable).\n"; - } - $num_warn_nosplit++; - } - } - } -} - - -if ($exit == 1) { - print "--> ERROR validating dictionary directory $dict (see detailed error "; - print "messages above)\n\n"; - exit 1; -} else { - print "--> SUCCESS [validating dictionary directory $dict]\n\n"; -} - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/validate_text.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/validate_text.pl deleted file mode 100644 index 7f75cf12f20f6e22948682e8e726e628a72dac69..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/validate_text.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env perl -# -#=============================================================================== -# Copyright 2017 Johns Hopkins University (author: Yenda Trmal ) -# Johns Hopkins University (author: Daniel Povey) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# validation script for data//text -# to be called (preferably) from utils/validate_data_dir.sh -use strict; -use warnings; -use utf8; -use Fcntl qw< SEEK_SET >; - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). 
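The comment above (the same helper also appears in validate_dict_dir.pl earlier in this diff) describes a "decode as UTF-8 if possible, otherwise fall back to raw bytes" reading strategy. A minimal Python sketch of that idea follows; the helper name and the path are illustrative only and are not part of the scripts being removed here:

```python
# Hypothetical helper mirroring the UTF-8-or-bytestream fallback described
# above; an illustration only, not part of the wenet/Kaldi tooling.
from typing import List, Tuple


def utf8_or_bytestream(path: str) -> Tuple[bool, List]:
    """Return (True, decoded_lines) if every line is valid UTF-8,
    otherwise (False, raw_byte_lines)."""
    with open(path, "rb") as f:
        raw_lines = f.readlines()
    try:
        return True, [line.decode("utf-8") for line in raw_lines]
    except UnicodeDecodeError:
        # Keep the raw bytes: the caller only needs the line contents and
        # lengths for formatting, not a specific single-byte encoding.
        return False, raw_lines


if __name__ == "__main__":
    ok, lines = utf8_or_bytestream("data/train/text")  # illustrative path
    print("utf8" if ok else "bytes", len(lines), "lines")
```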
-sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl \n" . 
- "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/wav2dur.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/wav2dur.py deleted file mode 100644 index 1bcc1b693458b66c0e341e5d6b375cc81e6db8b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/wav2dur.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -import torchaudio -torchaudio.set_audio_backend("sox_io") - -scp = sys.argv[1] -dur_scp = sys.argv[2] - -with open(scp, 'r') as f, open(dur_scp, 'w') as fout: - cnt = 0 - total_duration = 0 - for l in f: - items = l.strip().split() - wav_id = items[0] - fname = items[1] - cnt += 1 - waveform, rate = torchaudio.load(fname) - frames = len(waveform[0]) - duration = frames / float(rate) - total_duration += duration - fout.write('{} {}\n'.format(wav_id, duration)) - print('process {} utts'.format(cnt)) - print('total {} s'.format(total_duration)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/wav_to_duration.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/wav_to_duration.sh deleted file mode 100644 index 51b055c633ac809b6b8d702925dc47875973403d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/wav_to_duration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# split the wav scp, calculate duration and merge -nj=4 -. tools/parse_options.sh || exit 1; - -inscp=$1 -outscp=$2 -data=$(dirname ${inscp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -rm -f $logdir/wav_*.slice -rm -f $logdir/wav_*.shape -split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log -} & -done -wait -cat $logdir/wav_*.shape > $outscp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/websocket/performance-ws.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/websocket/performance-ws.py deleted file mode 100644 index af77dea06bb41297b674b5b6dbfd0266bcff5d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/tools/websocket/performance-ws.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -# coding:utf-8 - -# Copyright (c) 2022 SDCI Co. Ltd (author: veelion) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import json -import time -import asyncio -import argparse -import websockets -import soundfile as sf -import statistics - - -WS_START = json.dumps({ - 'signal': 'start', - 'nbest': 1, - 'continuous_decoding': False, -}) -WS_END = json.dumps({ - 'signal': 'end' -}) - - -async def ws_rec(data, ws_uri): - begin = time.time() - conn = await websockets.connect(ws_uri, ping_timeout=200) - # step 1: send start - await conn.send(WS_START) - ret = await conn.recv() - # step 2: send audio data - await conn.send(data) - # step 3: send end - await conn.send(WS_END) - # step 4: receive result - texts = [] - while 1: - ret = await conn.recv() - ret = json.loads(ret) - if ret['type'] == 'final_result': - nbest = json.loads(ret['nbest']) - text = nbest[0]['sentence'] - texts.append(text) - elif ret['type'] == 'speech_end': - break - # step 5: close - try: - await conn.close() - except Exception as e: - # this except has no effect, just log as debug - # it seems the server does not send close info, maybe - print(e) - time_cost = time.time() - begin - return { - 'text': ''.join(texts), - 'time': time_cost, - } - - -def get_args(): - parser = argparse.ArgumentParser(description='') - parser.add_argument( - '-u', '--ws_uri', required=True, - help="websocket_server_main's uri, e.g. ws://127.0.0.1:10086") - parser.add_argument( - '-w', '--wav_scp', required=True, - help='path to wav_scp_file') - parser.add_argument( - '-t', '--trans', required=True, - help='path to trans_text_file of wavs') - parser.add_argument( - '-s', '--save_to', required=True, - help='path to save transcription') - parser.add_argument( - '-n', '--num_concurrence', type=int, required=True, - help='num of concurrence for query') - args = parser.parse_args() - return args - - -def print_result(info): - length = max([len(k) for k in info]) - for k, v in info.items(): - print(f'\t{k: >{length}} : {v}') - - -async def main(args): - wav_scp = [] - total_duration = 0 - with open(args.wav_scp) as f: - for line in f: - zz = line.strip().split() - assert len(zz) == 2 - data, sr = sf.read(zz[1], dtype='int16') - assert sr == 16000 - duration = (len(data)) / 16000 - total_duration += duration - wav_scp.append((zz[0], data.tobytes())) - print(f'{len(wav_scp) = }, {total_duration = }') - - tasks = [] - failed = 0 - texts = [] - request_times = [] - begin = time.time() - for i, (_uttid, data) in enumerate(wav_scp): - task = asyncio.create_task(ws_rec(data, args.ws_uri)) - tasks.append((_uttid, task)) - if len(tasks) < args.num_concurrence: - continue - print((f'{i=}, start {args.num_concurrence} ' - f'queries @ {time.strftime("%m-%d %H:%M:%S")}')) - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - tasks = [] - print(f'\tdone @ {time.strftime("%m-%d %H:%M:%S")}') - if tasks: - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - request_time = time.time() - begin - rtf = request_time / total_duration - print('For all concurrence:') - print_result({ - 'failed': failed, - 'total_duration': total_duration, - 'request_time': request_time, - 'RTF': rtf, - }) - print('For one request:') - print_result({ - 'mean': statistics.mean(request_times), - 'median': statistics.median(request_times), - 'max_time': max(request_times), - 'min_time': min(request_times), - }) - with open(args.save_to, 'w', encoding='utf8') as fsave: - fsave.write(''.join(texts)) - # caculate CER - cmd = (f'python 
../compute-wer.py --char=1 --v=1 ' - f'{args.trans} {args.save_to} > ' - f'{args.save_to}-test-{args.num_concurrence}.cer.txt') - print(cmd) - os.system(cmd) - print('done') - - -if __name__ == '__main__': - args = get_args() - asyncio.run(main(args)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/alignment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/alignment.py deleted file mode 100644 index 071691183e5af227e60fe06e4f8d4bf0f33b7f71..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/alignment.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Di Wu) -# 2022 Tinnove Inc (authors: Wei Ren) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader -from textgrid import TextGrid, IntervalTier - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.ctc_util import forced_align -from wenet.utils.common import get_subsample -from wenet.utils.init_model import init_model - - -def generator_textgrid(maxtime, lines, output): - # Download Praat: https://www.fon.hum.uva.nl/praat/ - interval = maxtime / (len(lines) + 1) - margin = 0.0001 - - tg = TextGrid(maxTime=maxtime) - linetier = IntervalTier(name="line", maxTime=maxtime) - - i = 0 - for l in lines: - s, e, w = l.split() - linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w) - - tg.append(linetier) - print("successfully generator {}".format(output)) - tg.write(output) - - -def get_frames_timestamp(alignment): - # convert alignment to a praat format, which is a doing phonetics - # by computer and helps analyzing alignment - timestamp = [] - # get frames level duration for each token - start = 0 - end = 0 - while end < len(alignment): - while end < len(alignment) and alignment[end] == 0: - end += 1 - if end == len(alignment): - timestamp[-1] += alignment[start:] - break - end += 1 - while end < len(alignment) and alignment[end - 1] == alignment[end]: - end += 1 - timestamp.append(alignment[start:end]) - start = end - return timestamp - - -def get_labformat(timestamp, subsample): - begin = 0 - duration = 0 - labformat = [] - for idx, t in enumerate(timestamp): - # 25ms frame_length,10ms hop_length, 1/subsample - subsample = get_subsample(configs) - # time duration - duration = len(t) * 0.01 * subsample - if idx < len(timestamp) - 1: - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[t[-1]])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[t[-1]])) - else: - non_blank = 0 - for i in t: - if i != 0: - token = i - break - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - 
char_dict[token])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[token])) - begin = begin + duration - return labformat - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='use ctc to generate alignment') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--input_file', required=True, help='format data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--non_lang_syms', - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--result_file', - required=True, - help='alignment result file') - parser.add_argument('--batch_size', type=int, default=1, help='batch size') - parser.add_argument('--gen_praat', - action='store_true', - help='convert alignment to a praat format') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - - args = parser.parse_args() - print(args) - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.batch_size > 1: - logging.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - # Load dict - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 - - symbol_table = read_symbol_table(args.dict) - - # Init dataset and data loader - ali_conf = copy.deepcopy(configs['dataset_conf']) - - ali_conf['filter_conf']['max_length'] = 102400 - ali_conf['filter_conf']['min_length'] = 0 - ali_conf['filter_conf']['token_max_length'] = 102400 - ali_conf['filter_conf']['token_min_length'] = 0 - ali_conf['filter_conf']['max_output_input_ratio'] = 102400 - ali_conf['filter_conf']['min_output_input_ratio'] = 0 - ali_conf['speed_perturb'] = False - ali_conf['spec_aug'] = False - ali_conf['shuffle'] = False - ali_conf['sort'] = False - ali_conf['fbank_conf']['dither'] = 0.0 - ali_conf['batch_conf']['batch_type'] = "static" - ali_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - ali_dataset = Dataset(args.data_type, - args.input_file, - symbol_table, - ali_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w', - encoding='utf-8') as fout: - for batch_idx, batch in enumerate(ali_data_loader): - print("#" * 80) - key, feat, target, feats_length, target_length = batch - print(key) - - feat = feat.to(device) - target = target.to(device) - feats_length = feats_length.to(device) - target_length = target_length.to(device) - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - # print(ctc_probs.size(1)) - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = forced_align(ctc_probs, target) - print(alignment) - fout.write('{} {}\n'.format(key[0], alignment)) - - if args.gen_praat: - timestamp = get_frames_timestamp(alignment) - print(timestamp) - subsample = get_subsample(configs) - labformat = get_labformat(timestamp, subsample) - - lab_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".lab") - with open(lab_path, 'w', encoding='utf-8') as f: - f.writelines(labformat) - - textgrid_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".TextGrid") - generator_textgrid(maxtime=(len(alignment) + 1) * 0.01 * - subsample, - lines=labformat, - output=textgrid_path) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/average_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/average_model.py deleted file mode 100644 index 01efa64b4b458bc931a86a9a304b9f330ce4aaa2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/average_model.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
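The alignment script above derives its time stamps from the fact that each encoder output frame covers the 10 ms hop length times the subsampling factor, i.e. duration = num_frames * 0.01 * subsample. A small worked example with assumed values (the tokens, frame counts, and subsample factor are made up for illustration):

```python
# Worked example of the time-stamp arithmetic used by the alignment script
# above. All values below are assumptions for illustration, not from the diff.
frame_shift_s = 0.01   # 10 ms hop length
subsample = 4          # e.g. a conv2d front end that subsamples by 4

# Frame spans per token, as get_frames_timestamp() would produce them:
token_spans = [("ni", 3), ("hao", 5)]   # (token, number of subsampled frames)

begin = 0.0
for token, n_frames in token_spans:
    duration = n_frames * frame_shift_s * subsample
    print(f"{begin:.2f} {begin + duration:.2f} {token}")   # begin end token
    begin += duration
```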
- - -import os -import argparse -import glob - -import yaml -import numpy as np -import torch - - -def get_args(): - parser = argparse.ArgumentParser(description='average model') - parser.add_argument('--dst_model', required=True, help='averaged model') - parser.add_argument('--src_path', - required=True, - help='src model path for average') - parser.add_argument('--val_best', - action="store_true", - help='averaged model') - parser.add_argument('--num', - default=5, - type=int, - help='nums for averaged model') - parser.add_argument('--min_epoch', - default=0, - type=int, - help='min epoch used for averaging model') - parser.add_argument('--max_epoch', - default=65536, - type=int, - help='max epoch used for averaging model') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - checkpoints = [] - val_scores = [] - if args.val_best: - yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) - for y in yamls: - with open(y, 'r') as f: - dic_yaml = yaml.load(f, Loader=yaml.FullLoader) - loss = dic_yaml['cv_loss'] - epoch = dic_yaml['epoch'] - if epoch >= args.min_epoch and epoch <= args.max_epoch: - val_scores += [[epoch, loss]] - val_scores = np.array(val_scores) - sort_idx = np.argsort(val_scores[:, -1]) - sorted_val_scores = val_scores[sort_idx][::1] - print("best val scores = " + str(sorted_val_scores[:args.num, 1])) - print("selected epochs = " + - str(sorted_val_scores[:args.num, 0].astype(np.int64))) - path_list = [ - args.src_path + '/{}.pt'.format(int(epoch)) - for epoch in sorted_val_scores[:args.num, 0] - ] - else: - path_list = glob.glob('{}/[0-9]*.pt'.format(args.src_path)) - path_list = sorted(path_list, key=os.path.getmtime) - path_list = path_list[-args.num:] - print(path_list) - avg = None - num = args.num - assert num == len(path_list) - for path in path_list: - print('Processing {}'.format(path)) - states = torch.load(path, map_location=torch.device('cpu')) - if avg is None: - avg = states - else: - for k in avg.keys(): - avg[k] += states[k] - # average - for k in avg.keys(): - if avg[k] is not None: - # pytorch 1.6 use true_divide instead of /= - avg[k] = torch.true_divide(avg[k], num) - print('Saving to {}'.format(args.dst_model)) - torch.save(avg, args.dst_model) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/export_jit.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/export_jit.py deleted file mode 100644 index b2e5864e8382235c1cc800484ba5031ae22f3bd9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/export_jit.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
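The checkpoint averaging performed by average_model.py above reduces to summing the selected state dicts and dividing by their count. A condensed sketch with hypothetical checkpoint paths (the real script selects checkpoints by cv_loss from yaml metadata or by modification time):

```python
# Condensed sketch of the state-dict averaging done by average_model.py above.
# The checkpoint paths are hypothetical.
import torch

paths = ["exp/30.pt", "exp/31.pt", "exp/32.pt"]   # assumed checkpoint files

avg = None
for path in paths:
    states = torch.load(path, map_location="cpu")
    if avg is None:
        avg = states
    else:
        for k in avg:
            avg[k] += states[k]

for k in avg:
    # true_divide avoids integer floor division for integer-typed entries
    avg[k] = torch.true_divide(avg[k], len(paths))

torch.save(avg, "exp/avg3.pt")
```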
- -from __future__ import print_function - -import argparse -import os - -import torch -import yaml - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_file', default=None, help='output file') - parser.add_argument('--output_quant_file', - default=None, - help='output quantized model file') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - # No need gpu for model export - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - model = init_model(configs) - print(model) - - load_checkpoint(model, args.checkpoint) - # Export jit torch script model - - if args.output_file: - script_model = torch.jit.script(model) - script_model.save(args.output_file) - print('Export model successfully, see {}'.format(args.output_file)) - - # Export quantized jit torch script model - if args.output_quant_file: - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - script_quant_model = torch.jit.script(quantized_model) - script_quant_model.save(args.output_quant_file) - print('Export quantized model successfully, ' - 'see {}'.format(args.output_quant_file)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/export_onnx_bpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/export_onnx_bpu.py deleted file mode 100644 index 6462a69506f10778d08faae5fcf3067ad43d38bd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/export_onnx_bpu.py +++ /dev/null @@ -1,1019 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. 
specific decoding method: ctc_greedy_search -""" - - -from __future__ import print_function - -import os -import sys -import copy -import math -import yaml -import logging -from typing import Tuple - -import torch -import numpy as np - -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import (get_args, to_numpy, - print_input_output_info) - - -try: - import onnx - import onnxruntime -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class BPULayerNorm(torch.nn.Module): - """Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" - def __init__(self, module, chunk_size=8, run_on_bpu=False): - super().__init__() - original = copy.deepcopy(module) - self.hidden = module.weight.size(0) - self.chunk_size = chunk_size - self.run_on_bpu = run_on_bpu - - if self.run_on_bpu: - self.weight = torch.nn.Parameter( - module.weight.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.bias = torch.nn.Parameter( - module.bias.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.negtive = torch.nn.Parameter( - torch.ones((1, self.hidden, 1, chunk_size)) * -1.0) - self.eps = torch.nn.Parameter( - torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps) - self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_1.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_2.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - else: - self.norm = module - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.hidden) - orig_out = module(random_data) - new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) - np.testing.assert_allclose( - to_numpy(orig_out), to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.run_on_bpu: - u = self.mean_conv_1(x) # (1, h, 1, c) - numerator = x + u * self.negtive # (1, h, 1, c) - s = torch.pow(numerator, 2) # (1, h, 1, c) - s = self.mean_conv_2(s) # (1, h, 1, c) - denominator = torch.sqrt(s + self.eps) # (1, h, 1, c) - x = torch.div(numerator, denominator) # (1, h, 1, c) - x = x * self.weight + self.bias - else: - x = x.squeeze(2).transpose(1, 2).contiguous() - x = self.norm(x) - x = x.transpose(1, 2).contiguous().unsqueeze(2) - return x - - -class BPUIdentity(torch.nn.Module): - """Refactor torch.nn.Identity(). - For inserting BPU node whose input == output. - """ - def __init__(self, channels): - super().__init__() - self.channels = channels - self.identity_conv = torch.nn.Conv2d( - channels, channels, 1, groups=channels, bias=False) - torch.nn.init.dirac_( - self.identity_conv.weight.data, groups=channels) - - self.check_equal() - - def check_equal(self): - random_data = torch.randn(1, self.channels, 1, 10) - result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(random_data), to_numpy(result), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Identity with 4-D dataflow, input == output. 
- Args: - x (torch.Tensor): (batch, in_channel, 1, time) - - Returns: - (torch.Tensor): (batch, in_channel, 1, time). - """ - return self.identity_conv(x) - - -class BPULinear(torch.nn.Module): - """Refactor torch.nn.Linear or pointwise_conv""" - def __init__(self, module, is_pointwise_conv=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.weight.size(1) - self.odim = module.weight.size(0) - self.is_pointwise_conv = is_pointwise_conv - - # Modify weight & bias - self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) - if is_pointwise_conv: - # (odim, idim, kernel=1) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(-1)) - else: - # (odim, idim) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(2).unsqueeze(3)) - self.linear.bias = module.bias - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.idim) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = module(random_data) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = original_result.transpose(1, 2) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Linear with 4-D dataflow. - Args: - x (torch.Tensor): (batch, in_channel, 1, time) - Returns: - (torch.Tensor): (batch, out_channel, 1, time). - """ - return self.linear(x) - - -class BPUGlobalCMVN(torch.nn.Module): - """Refactor wenet/transformer/cmvn.py::GlobalCMVN""" - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - self.norm_var = module.norm_var - - # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1) - self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """CMVN with 4-D dataflow. - Args: - x (torch.Tensor): (batch, 1, mel_dim, time) - Returns: - (torch.Tensor): normalized feature with same shape. - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x - - -class BPUConv2dSubsampling8(torch.nn.Module): - """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 - - NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.right_context = module.right_context - self.subsampling_rate = module.subsampling_rate - assert isinstance(module.pos_enc, NoPositionalEncoding) - - # 1. Modify self.conv - # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim) - # to (1, 1, mel_dim, frames) for more efficient computation. - self.conv = module.conv - for idx in [0, 2, 4]: - self.conv[idx].weight = torch.nn.Parameter( - module.conv[idx].weight.transpose(2, 3) - ) - - # 2. 
Modify self.linear - # NOTE(xcsong): Split final projection to meet the requirment of - # maximum kernel_size (7 for XJ3) - self.linear = torch.nn.ModuleList() - odim = module.linear.weight.size(0) # 512, in this case - freq = module.linear.weight.size(1) // odim # 4608 // 512 == 9 - self.odim, self.freq = odim, freq - weight = module.linear.weight.reshape( - odim, odim, freq, 1) # (odim, odim * freq) -> (odim, odim, freq, 1) - self.split_size = [] - num_split = (freq - 1) // 7 + 1 # XJ3 requires kernel_size <= 7 - slice_begin = 0 - for idx in range(num_split): - kernel_size = min(freq, (idx + 1) * 7) - idx * 7 - conv_ele = torch.nn.Conv2d( - odim, odim, (kernel_size, 1), (kernel_size, 1)) - conv_ele.weight = torch.nn.Parameter( - weight[:, :, slice_begin:slice_begin + kernel_size, :] - ) - conv_ele.bias = torch.nn.Parameter( - torch.zeros_like(conv_ele.bias) - ) - self.linear.append(conv_ele) - self.split_size.append(kernel_size) - slice_begin += kernel_size - self.linear[0].bias = torch.nn.Parameter(module.linear.bias) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 67, 80) - mask = torch.zeros(1, 1, 67) - original_result, _, _ = module(random_data, mask) # (1, 8, 512) - random_data = random_data.transpose(1, 2).unsqueeze(0) # (1, 1, 80, 67) - new_result = self.forward(random_data) # (1, 512, 1, 8) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Subsample x with 4-D dataflow. - Args: - x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), - where time' = time // 8. - """ - x = self.conv(x) # (1, odim, freq, time') - x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) - x = torch.split(x, self.split_size, dim=2) - for idx, (x_part, layer) in enumerate(zip(x, self.linear)): - x_out += layer(x_part) - return x_out - - -class BPUMultiHeadedAttention(torch.nn.Module): - """Refactor wenet/transformer/attention.py::MultiHeadedAttention - - NOTE(xcsong): Only support attention_class == MultiHeadedAttention, - we do not consider RelPositionMultiHeadedAttention currently. - """ - def __init__(self, module, chunk_size, left_chunks): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.d_k = module.d_k - self.h = module.h - n_feat = self.d_k * self.h - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.time = chunk_size * (left_chunks + 1) - self.activation = torch.nn.Softmax(dim=-1) - - # 1. Modify self.linear_x - self.linear_q = BPULinear(module.linear_q) - self.linear_k = BPULinear(module.linear_k) - self.linear_v = BPULinear(module.linear_v) - self.linear_out = BPULinear(module.linear_out) - # 2. 
denom - self.register_buffer( - "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k))) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) - mask = torch.ones((1, self.h, self.chunk_size, self.time), - dtype=torch.bool) - cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, - self.d_k * 2) - original_out, original_cache = module( - random_data, random_data, random_data, - mask[:, 0, :, :], torch.empty(0), cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.reshape(1, self.h, self.d_k * 2, - self.chunk_size * self.left_chunks) - new_out, new_cache = self.forward( - random_data, random_data, random_data, mask, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - - def forward( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - mask: torch.Tensor, cache: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. - - Args: - q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). - k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). - v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). - mask (torch.Tensor): Mask tensor, - (#batch, head, chunk_size, cache_t + chunk_size). - cache (torch.Tensor): Cache tensor - (1, head, d_k * 2, cache_t), - where `cache_t == chunk_size * left_chunks`. - - - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: Cache tensor - (1, head, d_k * 2, cache_t + chunk_size) - where `cache_t == chunk_size * left_chunks` - """ - # 1. Forward QKV - q = self.linear_q(q) # (1, d, 1, c) d == size, c == chunk_size - k = self.linear_k(k) # (1, d, 1, c) - v = self.linear_v(v) # (1, d, 1, c) - q = q.view(1, self.h, self.d_k, self.chunk_size) - k = k.view(1, self.h, self.d_k, self.chunk_size) - v = v.view(1, self.h, self.d_k, self.chunk_size) - q = q.transpose(2, 3) # (batch, head, time1, d_k) - k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) - k = torch.cat((k_cache, k), dim=3) - v = torch.cat((v_cache, v), dim=3) - new_cache = torch.cat((k, v), dim=2) - # 2. (Q^T)K - scores = torch.matmul(q, k) * self.denom # (#b, n_head, time1, time2) - # 3. Forward attention - mask = mask.eq(0) - scores = scores.masked_fill(mask, -float('inf')) - attn = self.activation(scores).masked_fill(mask, 0.0) - attn = attn.transpose(2, 3) - x = torch.matmul(v, attn) - x = x.view(1, self.d_k * self.h, 1, self.chunk_size) - x_out = self.linear_out(x) - return x_out, new_cache - - -class BPUConvolution(torch.nn.Module): - """Refactor wenet/transformer/convolution.py::ConvolutionModule - - NOTE(xcsong): Only suport use_layer_norm == False - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.lorder = module.lorder - self.use_layer_norm = False - self.activation = module.activation - channels = module.pointwise_conv1.weight.size(1) - self.channels = channels - kernel_size = module.depthwise_conv.weight.size(2) - assert module.use_layer_norm is False - - # 1. Modify self.pointwise_conv1 - self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) - - # 2. 
Modify self.depthwise_conv - self.depthwise_conv = torch.nn.Conv2d( - channels, channels, (1, kernel_size), - stride=1, groups=channels) - self.depthwise_conv.weight = torch.nn.Parameter( - module.depthwise_conv.weight.unsqueeze(-2)) - self.depthwise_conv.bias = torch.nn.Parameter( - module.depthwise_conv.bias) - - # 3. Modify self.norm, Only support batchnorm2d - self.norm = torch.nn.BatchNorm2d(channels) - self.norm.training = False - self.norm.num_features = module.norm.num_features - self.norm.eps = module.norm.eps - self.norm.momentum = module.norm.momentum - self.norm.weight = torch.nn.Parameter(module.norm.weight) - self.norm.bias = torch.nn.Parameter(module.norm.bias) - self.norm.running_mean = module.norm.running_mean - self.norm.running_var = module.norm.running_var - - # 4. Modify self.pointwise_conv2 - self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) - - # 5. Identity conv, for running `concat` on BPU - self.identity = BPUIdentity(channels) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.channels) - cache = torch.zeros((1, self.channels, self.lorder)) - original_out, original_cache = module(random_data, cache=cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.unsqueeze(2) - new_out, new_cache = self.forward(random_data, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, 1, cache_t). - Returns: - torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). - torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). - """ - # Concat cache - x = torch.cat((self.identity(cache), self.identity(x)), dim=3) - new_cache = x[:, :, :, -self.lorder:] - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, 1, dim) - x = torch.nn.functional.glu(x, dim=1) # (b, channel, 1, dim) - - # Depthwise Conv - x = self.depthwise_conv(x) - x = self.activation(self.norm(x)) - x = self.pointwise_conv2(x) - return x, new_cache - - -class BPUFFN(torch.nn.Module): - """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.activation = module.activation - - # 1. Modify self.w_x - self.w_1 = BPULinear(module.w_1) - self.w_2 = BPULinear(module.w_2) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.w_1.idim) - original_out = module(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_out = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward function. 
- - Args: - xs: input tensor (B, D, 1, L) - Returns: - output tensor, (B, D, 1, L) - """ - return self.w_2(self.activation(self.w_1(x))) - - -class BPUConformerEncoderLayer(torch.nn.Module): - """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.size = module.size - assert module.normalize_before is True - assert module.concat_after is False - - # 1. Modify submodules - self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) - self.self_attn = BPUMultiHeadedAttention( - module.self_attn, chunk_size, left_chunks) - self.conv_module = BPUConvolution(module.conv_module) - self.feed_forward = BPUFFN(module.feed_forward) - - # 2. Modify norms - self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) - self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, ln_run_on_bpu) - self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, - chunk_size, ln_run_on_bpu) - self.norm_conv = BPULayerNorm(module.norm_conv, - chunk_size, ln_run_on_bpu) - self.norm_final = BPULayerNorm(module.norm_final, - chunk_size, ln_run_on_bpu) - - # 3. 4-D ff_scale - self.register_buffer( - "ff_scale", torch.full((1, self.size, 1, 1), module.ff_scale)) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.self_attn.chunk_size - time2 = self.self_attn.time - h, d_k = self.self_attn.h, self.self_attn.d_k - random_x = torch.randn(1, time1, self.size) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) - original_x, _, original_att_cache, original_cnn_cache = module( - random_x, att_mask[:, 0, :, :], torch.empty(0), - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.transpose(1, 2).unsqueeze(2) - att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.unsqueeze(2) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_mask, att_cache, cnn_cache - ) - np.testing.assert_allclose( - to_numpy(original_att_cache), - to_numpy(new_att_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cnn_cache), - to_numpy(new_cnn_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, att_mask: torch.Tensor, - att_cache: torch.Tensor, cnn_cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, size, 1, chunk_size) - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, d_k * 2, cache_t1), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, 1, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: att_cache tensor, - (1, head, d_k * 2, cache_t1 + chunk_size). - torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2). - """ - # 1. ffn_macaron - residual = x - x = self.norm_ff_macron(x) - x = residual + self.ff_scale * self.feed_forward_macaron(x) - - # 2. 
attention - residual = x - x = self.norm_mha(x) - x_att, new_att_cache = self.self_attn( - x, x, x, att_mask, att_cache) - x = residual + x_att - - # 3. convolution - residual = x - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, cnn_cache) - x = residual + x - - # 4. ffn - residual = x - x = self.norm_ff(x) - x = residual + self.ff_scale * self.feed_forward(x) - - # 5. final post-norm - x = self.norm_final(x) - - return x, new_att_cache, new_cnn_cache - - -class BPUConformerEncoder(torch.nn.Module): - """Refactor wenet/transformer/encoder.py::ConformerEncoder - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - output_size = module.output_size() - self._output_size = module.output_size() - self.after_norm = module.after_norm - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.head = module.encoders[0].self_attn.h - self.layers = len(module.encoders) - - # 1. Modify submodules - self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) - self.embed = BPUConv2dSubsampling8(module.embed) - self.encoders = torch.nn.ModuleList() - for layer in module.encoders: - self.encoders.append(BPUConformerEncoderLayer( - layer, chunk_size, left_chunks, ln_run_on_bpu)) - - # 2. Auxiliary conv - self.identity_cnncache = BPUIdentity(output_size) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.encoders[0].self_attn.chunk_size - time2 = self.encoders[0].self_attn.time - layers = self.layers - h, d_k = self.head, self.encoders[0].self_attn.d_k - decoding_window = (self.chunk_size - 1) * \ - module.embed.subsampling_rate + \ - module.embed.right_context + 1 - lorder = self.encoders[0].conv_module.lorder - random_x = torch.randn(1, decoding_window, 80) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) - orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( - random_x, 0, time2 - time1, att_mask=att_mask[:, 0, :, :], - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.unsqueeze(0) - att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_cache, cnn_cache, att_mask - ) - caches = torch.split(new_att_cache, h, dim=1) - caches = [c.transpose(2, 3) for c in caches] - np.testing.assert_allclose( - to_numpy(orig_att_cache), - to_numpy(torch.cat(caches, dim=0)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_cnn_cache), - to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, xs: torch.Tensor, att_cache: torch.Tensor, - cnn_cache: torch.Tensor, att_mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (1, head * elayers, d_k * 2, cache_t1), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (1, hidden-dim, elayers, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, hidden-dim, 1, chunk_size). - torch.Tensor: new attention cache required for next chunk, with - same shape as the original att_cache. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - """ - # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time) - xs = xs.transpose(2, 3) - xs = self.global_cmvn(xs) - # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size) - xs = self.embed(xs) - - att_cache = torch.split(att_cache, self.head, dim=1) - cnn_cache = self.identity_cnncache(cnn_cache) - cnn_cache = torch.split(cnn_cache, 1, dim=2) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - xs, new_att_cache, new_cnn_cache = layer( - xs, att_mask, att_cache=att_cache[i], cnn_cache=cnn_cache[i]) - r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:]) - r_cnn_cache.append(new_cnn_cache) - r_att_cache = torch.cat(r_att_cache, dim=1) - r_cnn_cache = self.identity_cnncache( - torch.cat(r_cnn_cache, dim=2)) - - xs = xs.squeeze(2).transpose(1, 2).contiguous() - xs = self.after_norm(xs) - # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input. - xs = xs.transpose(1, 2).contiguous().unsqueeze(2) # (B, C, 1, T) - - return (xs, r_att_cache, r_cnn_cache) - - -class BPUCTC(torch.nn.Module): - """Refactor wenet/transformer/ctc.py::CTC - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.ctc_lo.weight.size(1) - num_class = module.ctc_lo.weight.size(0) - - # 1. Modify self.ctc_lo, Split final projection to meet the - # requirment of maximum in/out channels (2048 for XJ3) - self.ctc_lo = torch.nn.ModuleList() - self.split_size = [] - num_split = (num_class - 1) // 2048 + 1 - for idx in range(num_split): - out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 - conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) - self.ctc_lo.append(conv_ele) - self.split_size.append(out_channel) - orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) - orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) - for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): - w = w.unsqueeze(2).unsqueeze(3) - self.ctc_lo[i].weight = torch.nn.Parameter(w) - self.ctc_lo[i].bias = torch.nn.Parameter(b) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 100, self.idim) - original_result = module.ctc_lo(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """frame activations, without softmax. 
- - Args: - Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) - Returns: - torch.Tensor: (B, num_class, 1, chunk_size) - """ - out = [] - for i, layer in enumerate(self.ctc_lo): - out.append(layer(x)) - out = torch.cat(out, dim=1) - return out - - -def export_encoder(asr_model, args): - logger.info("Stage-1: export encoder") - decode_window, mel_dim = args.decoding_window, args.feature_size - encoder = BPUConformerEncoder( - asr_model.encoder, args.chunk_size, args.num_decoding_left_chunks, - args.ln_run_on_bpu) - encoder.eval() - encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx') - - logger.info("Stage-1.1: prepare inputs for encoder") - chunk = torch.randn((1, 1, decode_window, mel_dim)) - required_cache_size = encoder.chunk_size * encoder.left_chunks - kv_time = required_cache_size + encoder.chunk_size - hidden, layers = encoder._output_size, len(encoder.encoders) - head = encoder.encoders[0].self_attn.h - d_k = hidden // head - lorder = encoder.encoders[0].conv_module.lorder - att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) - att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros((1, hidden, layers, lorder)) - inputs = (chunk, att_cache, cnn_cache, att_mask) - logger.info("chunk.size(): {} att_cache.size(): {} " - "cnn_cache.size(): {} att_mask.size(): {}".format( - list(chunk.size()), list(att_cache.size()), - list(cnn_cache.size()), list(att_mask.size()))) - - logger.info("Stage-1.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask" - attributes['output_name'] = "output;r_att_cache;r_cnn_cache" - attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap" - attributes['norm_type'] = \ - "no_preprocess;no_preprocess;no_preprocess;no_preprocess" - attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_shape'] = \ - "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( - chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3), - att_cache.size(0), att_cache.size(1), att_cache.size(2), - att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1), - cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0), - att_mask.size(1), att_mask.size(2), att_mask.size(3) - ) - torch.onnx.export( # NOTE(xcsong): only support opset==11 - encoder, inputs, encoder_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=attributes['input_name'].split(';'), - output_names=attributes['output_name'].split(';'), - dynamic_axes=None, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for k in vars(args): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - logger.info('Export onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - logger.info("Stage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - for i in range(10): - logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" - ", att_mask: {}".format( - i, list(torch_chunk.size()), - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), - list(torch_att_mask.size()))) - torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_output = torch.cat(torch_output, dim=-1) - - onnx_output = [] - onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - logger.info("onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," - " att_mask: {}".format( - i, onnx_chunk.shape, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': onnx_att_mask, - } - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_output = np.concatenate(onnx_output, axis=-1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_encoder, pass!") - return encoder, ort_session - - -def export_ctc(asr_model, args): - logger.info("Stage-2: export ctc") - ctc = BPUCTC(asr_model.ctc).eval() - ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx') - - logger.info("Stage-2.1: prepare inputs for ctc") - hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) - - logger.info("Stage-2.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'], attributes['input_type'] = "hidden", "featuremap" - attributes['norm_type'] = "no_preprocess" - attributes['input_layout_train'] = "NCHW" - attributes['input_layout_rt'] = "NCHW" - attributes['input_shape'] = "{}x{}x{}x{}".format( - hidden.size(0), hidden.size(1), hidden.size(2), hidden.size(3), - ) - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=None, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for k in vars(args): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - logger.info('Export onnx_ctc, done! 
see {}'.format(ctc_outpath)) - - logger.info("Stage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_ctc, pass!") - return ctc, ort_session - - -def export_decoder(asr_model, args): - logger.info("Currently, Decoder is not supported.") - - -if __name__ == '__main__': - torch.manual_seed(777) - args = get_args() - args.ln_run_on_bpu = False - # NOTE(xcsong): XJ3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - args.feature_size = configs['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - export_encoder(model, args) - export_ctc(model, args) - export_decoder(model, args) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/export_onnx_cpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/export_onnx_cpu.py deleted file mode 100644 index a8009d2f606f753a5870eb754235d8d55e756b5d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/export_onnx_cpu.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright (c) 2022, Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
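Both export scripts in this diff copy every export argument into the ONNX model's `metadata_props`, so downstream tooling (e.g. `onnx2horizonbin.py::generate_config()` or the runtime) can recover the chunk and cache configuration without re-reading the training yaml. A minimal sketch of that pattern; the `model.onnx` path and the argument dict below are placeholders, not artifacts from this repo:

```python
import onnx
import onnxruntime

# Attach export arguments to the model as metadata, then read them back.
# "model.onnx" and the args dict are placeholders for an already exported model.
args = {"chunk_size": 16, "num_decoding_left_chunks": 4, "output_size": 256}

m = onnx.load("model.onnx")
for k, v in args.items():
    meta = m.metadata_props.add()
    meta.key, meta.value = str(k), str(v)
onnx.checker.check_model(m)
onnx.save(m, "model.onnx")  # re-save so the metadata is persisted

# Any consumer can recover the values later without the training yaml:
sess = onnxruntime.InferenceSession("model.onnx")
print(sess.get_modelmeta().custom_metadata_map)
```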
- -from __future__ import print_function - -import argparse -import os -import copy -import sys - -import torch -import yaml -import numpy as np - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - -try: - import onnx - import onnxruntime - from onnxruntime.quantization import quantize_dynamic, QuantType -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - args = parser.parse_args() - return args - - -def to_numpy(tensor): - if tensor.requires_grad: - return tensor.detach().cpu().numpy() - else: - return tensor.cpu().numpy() - - -def print_input_output_info(onnx_model, name, prefix="\t\t"): - input_names = [node.name for node in onnx_model.graph.input] - input_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.input] - output_names = [node.name for node in onnx_model.graph.output] - output_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.output] - print("{}{} inputs : {}".format(prefix, name, input_names)) - print("{}{} input shapes : {}".format(prefix, name, input_shapes)) - print("{}{} outputs: {}".format(prefix, name, output_names)) - print("{}{} output shapes : {}".format(prefix, name, output_shapes)) - - -def export_encoder(asr_model, args): - print("Stage-1: export encoder") - encoder = asr_model.encoder - encoder.forward = encoder.forward_chunk - encoder_outpath = os.path.join(args['output_dir'], 'encoder.onnx') - - print("\tStage-1.1: prepare inputs for encoder") - chunk = torch.randn( - (args['batch'], args['decoding_window'], args['feature_size'])) - offset = 0 - # NOTE(xcsong): The uncertainty of `next_cache_start` only appears - # in the first few chunks, this is caused by dynamic att_cache shape, i,e - # (0, 0, 0, 0) for 1st chunk and (elayers, head, ?, d_k*2) for subsequent - # chunks. One way to ease the ONNX export is to keep `next_cache_start` - # as a fixed value. To do this, for the **first** chunk, if - # left_chunks > 0, we feed real cache & real mask to the model, otherwise - # fake cache & fake mask. In this way, we get: - # 1. 16/-1 mode: next_cache_start == 0 for all chunks - # 2. 16/4 mode: next_cache_start == chunk_size for all chunks - # 3. 16/0 mode: next_cache_start == chunk_size for all chunks - # 4. -1/-1 mode: next_cache_start == 0 for all chunks - # NO MORE DYNAMIC CHANGES!! - # - # NOTE(Mddct): We retain the current design for the convenience of supporting some - # inference frameworks without dynamic shapes. 
If you're interested in all-in-one - # model that supports different chunks please see: - # https://github.com/wenet-e2e/wenet/pull/1174 - - if args['left_chunks'] > 0: # 16/4 - required_cache_size = args['chunk_size'] * args['left_chunks'] - offset = required_cache_size - # Real cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], required_cache_size, - args['output_size'] // args['head'] * 2)) - # Real mask - att_mask = torch.ones( - (args['batch'], 1, required_cache_size + args['chunk_size']), - dtype=torch.bool) - att_mask[:, :, :required_cache_size] = 0 - elif args['left_chunks'] <= 0: # 16/-1, -1/-1, 16/0 - required_cache_size = -1 if args['left_chunks'] < 0 else 0 - # Fake cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], 0, - args['output_size'] // args['head'] * 2)) - # Fake mask - att_mask = torch.ones((0, 0, 0), dtype=torch.bool) - cnn_cache = torch.zeros( - (args['num_blocks'], args['batch'], - args['output_size'], args['cnn_module_kernel'] - 1)) - inputs = (chunk, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - print("\t\tchunk.size(): {}\n".format(chunk.size()), - "\t\toffset: {}\n".format(offset), - "\t\trequired_cache: {}\n".format(required_cache_size), - "\t\tatt_cache.size(): {}\n".format(att_cache.size()), - "\t\tcnn_cache.size(): {}\n".format(cnn_cache.size()), - "\t\tatt_mask.size(): {}\n".format(att_mask.size())) - - print("\tStage-1.2: torch.onnx.export") - dynamic_axes = { - 'chunk': {1: 'T'}, - 'att_cache': {2: 'T_CACHE'}, - 'att_mask': {2: 'T_ADD_T_CACHE'}, - 'output': {1: 'T'}, - 'r_att_cache': {2: 'T_CACHE'}, - } - # NOTE(xcsong): We keep dynamic axes even if in 16/4 mode, this is - # to avoid padding the last chunk (which usually contains less - # frames than required). For users who want static axes, just pop - # out specific axis. - # if args['chunk_size'] > 0: # 16/4, 16/-1, 16/0 - # dynamic_axes.pop('chunk') - # dynamic_axes.pop('output') - # if args['left_chunks'] >= 0: # 16/4, 16/0 - # # NOTE(xsong): since we feed real cache & real mask into the - # # model when left_chunks > 0, the shape of cache will never - # # be changed. - # dynamic_axes.pop('att_cache') - # dynamic_axes.pop('r_att_cache') - torch.onnx.export( - encoder, inputs, encoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=[ - 'chunk', 'offset', 'required_cache_size', - 'att_cache', 'cnn_cache', 'att_mask' - ], - output_names=['output', 'r_att_cache', 'r_cnn_cache'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for (k, v) in args.items(): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - # NOTE(xcsong): to add those metadatas we need to reopen - # the file and resave it. - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - # Dynamic quantization - model_fp32 = encoder_outpath - model_quant = os.path.join(args['output_dir'], 'encoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - print("\tStage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk = copy.deepcopy(chunk) - torch_offset = copy.deepcopy(offset) - torch_required_cache_size = copy.deepcopy(required_cache_size) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - torch_att_mask = copy.deepcopy(att_mask) - for i in range(10): - print("\t\ttorch chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, list(torch_chunk.size()), torch_offset, - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), list(torch_att_mask.size()))) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - torch_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_offset, torch_required_cache_size, - torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_offset += out.size(1) - torch_output = torch.cat(torch_output, dim=1) - - onnx_output = [] - onnx_chunk = to_numpy(chunk) - onnx_offset = np.array((offset)).astype(np.int64) - onnx_required_cache_size = np.array((required_cache_size)).astype(np.int64) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - onnx_att_mask = to_numpy(att_mask) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - print("\t\tonnx chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, onnx_chunk.shape, onnx_offset, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - onnx_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'offset': onnx_offset, - 'required_cache_size': onnx_required_cache_size, - 'att_cache': onnx_att_cache, 'cnn_cache': onnx_cnn_cache, - 'att_mask': onnx_att_mask - } - # NOTE(xcsong): If we use 16/-1, -1/-1 or 16/0 mode, `next_cache_start` - # will be hardcoded to 0 or chunk_size by ONNX, thus - # required_cache_size and att_mask are no more needed and they will - # be removed by ONNX automatically. 
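The NOTE above keeps the `chunk`/`output` time axis dynamic even in 16/4 mode so the final, shorter chunk never has to be padded to `decoding_window`. A self-contained sketch of that `dynamic_axes` export pattern, using a stand-in `TinyEncoder` module and output path rather than the real wenet encoder:

```python
import torch

class TinyEncoder(torch.nn.Module):
    """Stand-in module, only here to demonstrate the dynamic_axes pattern."""
    def __init__(self, idim: int = 80, odim: int = 256):
        super().__init__()
        self.proj = torch.nn.Linear(idim, odim)

    def forward(self, chunk: torch.Tensor) -> torch.Tensor:
        return self.proj(chunk)

model = TinyEncoder().eval()
chunk = torch.randn(1, 67, 80)  # (batch, time, mel-dim)
torch.onnx.export(
    model, (chunk,), "tiny_encoder.onnx",
    opset_version=13, do_constant_folding=True,
    input_names=["chunk"], output_names=["output"],
    # Keep the time axis dynamic so a shorter last chunk needs no padding.
    dynamic_axes={"chunk": {1: "T"}, "output": {1: "T"}},
)
```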
- for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_offset += ort_outs[0].shape[1] - onnx_output = np.concatenate(onnx_output, axis=1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-05) - meta = ort_session.get_modelmeta() - print("\t\tcustom_metadata_map={}".format(meta.custom_metadata_map)) - print("\t\tCheck onnx_encoder, pass!") - - -def export_ctc(asr_model, args): - print("Stage-2: export ctc") - ctc = asr_model.ctc - ctc.forward = ctc.log_softmax - ctc_outpath = os.path.join(args['output_dir'], 'ctc.onnx') - - print("\tStage-2.1: prepare inputs for ctc") - hidden = torch.randn( - (args['batch'], args['chunk_size'] if args['chunk_size'] > 0 else 16, - args['output_size'])) - - print("\tStage-2.2: torch.onnx.export") - dynamic_axes = {'hidden': {1: 'T'}, 'probs': {1: 'T'}} - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for (k, v) in args.items(): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - # Dynamic quantization - model_fp32 = ctc_outpath - model_quant = os.path.join(args['output_dir'], 'ctc.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_ctc, done! see {}'.format(ctc_outpath)) - - print("\tStage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_ctc, pass!") - - -def export_decoder(asr_model, args): - print("Stage-3: export decoder") - decoder = asr_model - # NOTE(lzhin): parameters of encoder will be automatically removed - # since they are not used during rescoring. - decoder.forward = decoder.forward_attention_decoder - decoder_outpath = os.path.join(args['output_dir'], 'decoder.onnx') - - print("\tStage-3.1: prepare inputs for decoder") - # hardcode time->200 nbest->10 len->20, they are dynamic axes. 
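The Stage-x.3 checks above all follow the same recipe: run identical inputs through the torch module and the exported graph, then compare with `np.testing.assert_allclose`. A condensed sketch of that check for a single-input model; `torch_module`, `example`, and the model path are placeholders:

```python
import numpy as np
import onnxruntime
import torch

def check_parity(torch_module: torch.nn.Module, onnx_path: str,
                 example: torch.Tensor, rtol: float = 1e-3, atol: float = 1e-5):
    """Compare a torch module against its exported ONNX graph on one input."""
    torch_module.eval()
    with torch.no_grad():
        torch_out = torch_module(example)
    sess = onnxruntime.InferenceSession(onnx_path)
    feed = {sess.get_inputs()[0].name: example.numpy()}
    onnx_out = sess.run(None, feed)[0]
    np.testing.assert_allclose(torch_out.numpy(), onnx_out, rtol=rtol, atol=atol)
```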
- encoder_out = torch.randn((1, 200, args['output_size'])) - hyps = torch.randint(low=0, high=args['vocab_size'], - size=[10, 20]) - hyps[:, 0] = args['vocab_size'] - 1 # - hyps_lens = torch.randint(low=15, high=21, size=[10]) - - print("\tStage-3.2: torch.onnx.export") - dynamic_axes = { - 'hyps': {0: 'NBEST', 1: 'L'}, 'hyps_lens': {0: 'NBEST'}, - 'encoder_out': {1: 'T'}, - 'score': {0: 'NBEST', 1: 'L'}, 'r_score': {0: 'NBEST', 1: 'L'} - } - inputs = (hyps, hyps_lens, encoder_out, args['reverse_weight']) - torch.onnx.export( - decoder, inputs, decoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hyps', 'hyps_lens', 'encoder_out', 'reverse_weight'], - output_names=['score', 'r_score'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_decoder = onnx.load(decoder_outpath) - for (k, v) in args.items(): - meta = onnx_decoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_decoder) - onnx.helper.printable_graph(onnx_decoder.graph) - onnx.save(onnx_decoder, decoder_outpath) - print_input_output_info(onnx_decoder, "onnx_decoder") - model_fp32 = decoder_outpath - model_quant = os.path.join(args['output_dir'], 'decoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_decoder, done! see {}'.format( - decoder_outpath)) - - print("\tStage-3.3: check onnx_decoder and torch_decoder") - torch_score, torch_r_score = decoder( - hyps, hyps_lens, encoder_out, args['reverse_weight']) - ort_session = onnxruntime.InferenceSession(decoder_outpath) - input_names = [node.name for node in onnx_decoder.graph.input] - ort_inputs = { - 'hyps': to_numpy(hyps), - 'hyps_lens': to_numpy(hyps_lens), - 'encoder_out': to_numpy(encoder_out), - 'reverse_weight': np.array((args['reverse_weight'])), - } - for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - onnx_output = ort_session.run(None, ort_inputs) - - np.testing.assert_allclose(to_numpy(torch_score), onnx_output[0], - rtol=1e-03, atol=1e-05) - if args['is_bidirectional_decoder'] and args['reverse_weight'] > 0.0: - np.testing.assert_allclose(to_numpy(torch_r_score), onnx_output[1], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_decoder, pass!") - - -def main(): - torch.manual_seed(777) - args = get_args() - output_dir = args.output_dir - os.system("mkdir -p " + output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - arguments = {} - arguments['output_dir'] = output_dir - arguments['batch'] = 1 - arguments['chunk_size'] = args.chunk_size - arguments['left_chunks'] = args.num_decoding_left_chunks - arguments['reverse_weight'] = args.reverse_weight - arguments['output_size'] = configs['encoder_conf']['output_size'] - arguments['num_blocks'] = configs['encoder_conf']['num_blocks'] - arguments['cnn_module_kernel'] = configs['encoder_conf'].get('cnn_module_kernel', 1) - arguments['head'] = configs['encoder_conf']['attention_heads'] - arguments['feature_size'] = configs['input_dim'] - arguments['vocab_size'] = configs['output_dim'] - # NOTE(xcsong): if chunk_size == -1, hardcode to 67 - arguments['decoding_window'] = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 if args.chunk_size > 0 else 67 - arguments['encoder'] = configs['encoder'] - 
arguments['decoder'] = configs['decoder'] - arguments['subsampling_rate'] = model.subsampling_rate() - arguments['right_context'] = model.right_context() - arguments['sos_symbol'] = model.sos_symbol() - arguments['eos_symbol'] = model.eos_symbol() - arguments['is_bidirectional_decoder'] = 1 \ - if model.is_bidirectional_decoder() else 0 - - # NOTE(xcsong): Please note that -1/-1 means non-streaming model! It is - # not a [16/4 16/-1 16/0] all-in-one model and it should not be used in - # streaming mode (i.e., setting chunk_size=16 in `decoder_main`). If you - # want to use 16/-1 or any other streaming mode in `decoder_main`, - # please export onnx in the same config. - if arguments['left_chunks'] > 0: - assert arguments['chunk_size'] > 0 # -1/4 not supported - - export_encoder(model, arguments) - export_ctc(model, arguments) - export_decoder(model, arguments) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/export_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/export_onnx_gpu.py deleted file mode 100644 index 19f810c2804efdf74ff369f780fa3102e2e389fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/export_onnx_gpu.py +++ /dev/null @@ -1,1056 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import os -import sys - -import torch -import yaml -import logging - -import torch.nn.functional as F -from wenet.utils.checkpoint import load_checkpoint -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import BaseEncoder -from wenet.utils.init_model import init_model -from wenet.utils.mask import make_pad_mask - -try: - import onnxruntime -except ImportError: - print('Please install onnxruntime-gpu!') - sys.exit(1) - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class Encoder(torch.nn.Module): - def __init__(self, - encoder: BaseEncoder, - ctc: CTC, - beam_size: int = 10): - super().__init__() - self.encoder = encoder - self.ctc = ctc - self.beam_size = beam_size - - def forward(self, speech: torch.Tensor, - speech_lengths: torch.Tensor,): - """Encoder - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - Returns: - encoder_out: B x T x F - encoder_out_lens: B - ctc_log_probs: B x T x V - beam_log_probs: B x T x beam_size - beam_log_probs_idx: B x T x beam_size - """ - encoder_out, encoder_mask = self.encoder(speech, - speech_lengths, - -1, -1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_log_probs = self.ctc.log_softmax(encoder_out) - encoder_out_lens = encoder_out_lens.int() - beam_log_probs, beam_log_probs_idx = torch.topk( - ctc_log_probs, self.beam_size, dim=2) - return encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx - - -class StreamingEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size, transformer=False): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.transformer = transformer - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoder.encoders): - xs, _, new_att_cache, new_cnn_cache = layer( - xs, masks, pos_emb, - att_cache=att_cache[i], - cnn_cache=cnn_cache[i]) - # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (B, hidden-dim, cache_t2) - r_att_cache.append( - new_att_cache[:, :, next_cache_start:, :].unsqueeze(1)) - if not self.transformer: - r_cnn_cache.append(new_cnn_cache.unsqueeze(1)) - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - if not self.transformer: - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingSqueezeformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.reduce_idx = model.encoder.reduce_idx - self.recover_idx = model.encoder.recover_idx - if self.reduce_idx is None: - self.time_reduce = None - else: - if self.recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - 
recover_exp)) - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - elayers, cache_size = att_cache.size(0), att_cache.size(3) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = att_mask[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.encoder.preln(xs) - for i, layer in enumerate(self.encoder.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append( - (xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.encoder.time_reduction_layer( - xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - if self.encoder.pos_enc_layer_type == "rel_pos_repaired": - pos_emb = 
pos_emb[:, :xs.size(1) * 2 - 1, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.encoder.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(1) - cached_att = cached_att.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1)) - r_cnn_cache.append(cached_cnn) - - chunk_out = xs - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingEfficientConformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - - # Efficient Conformer - self.stride_layer_idx = model.encoder.stride_layer_idx - self.stride = model.encoder.stride - self.num_blocks = model.encoder.num_blocks - self.cnn_module_kernel = model.encoder.cnn_module_kernel - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - chunk_lens (torch.Tensor): - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * 
num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) # (b, ) - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - # Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2) - # Shape(cnn_cache): (elayers, b, outsize, cnn_kernel) - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = chunk_mask.to(torch.bool) - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoder.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - # We propose to double the chunk_size. 
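For the efficient-conformer encoder, the cache slicing above hinges on `calculate_downsampling_factor`: every stride layer the block index has already passed multiplies the effective downsampling factor. A standalone sketch of that rule, with illustrative stride positions rather than values from a shipped config:

```python
from typing import Sequence

def downsampling_factor(layer_idx: int,
                        stride_layer_idx: Sequence[int] = (3, 7),
                        stride: Sequence[int] = (2, 2)) -> int:
    # Multiply the strides of all stride layers that sit before layer_idx.
    factor = 1
    for pos, s in zip(stride_layer_idx, stride):
        if layer_idx > pos:
            factor *= s
    return factor

assert downsampling_factor(0) == 1
assert downsampling_factor(5) == 2   # past the first stride layer
assert downsampling_factor(10) == 4  # past both stride layers
```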
- att_cache_trunc = xs.size(1) + \ - att_cache.size(3) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i][:, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [batch, 1, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID - - # use repeat_interleave to new_att_cache - # new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - new_att_cache = new_att_cache.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :].unsqueeze(1)) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - # shape of r_att_cache: (b, elayers, head, time2, outdim) - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - # shape of r_cnn_cache: (b, elayers, outdim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - chunk_out_lens += 1 - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class Decoder(torch.nn.Module): - def __init__(self, - decoder: TransformerDecoder, - ctc_weight: float = 0.5, - reverse_weight: float = 0.0, - beam_size: int = 10, - decoder_fastertransformer: bool = False): - super().__init__() - self.decoder = decoder - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - self.beam_size = beam_size - self.decoder_fastertransformer = decoder_fastertransformer - - def forward(self, - encoder_out: torch.Tensor, - encoder_lens: torch.Tensor, - hyps_pad_sos_eos: torch.Tensor, - hyps_lens_sos: torch.Tensor, - r_hyps_pad_sos_eos: torch.Tensor, - ctc_score: torch.Tensor): - """Encoder - Args: - encoder_out: B x T x F - encoder_lens: B - hyps_pad_sos_eos: B x beam x (T2+1), - hyps with sos & eos and padded by ignore id - hyps_lens_sos: B x beam, length for each hyp with sos - r_hyps_pad_sos_eos: B 
x beam x (T2+1), - reversed hyps with sos & eos and padded by ignore id - ctc_score: B x beam, ctc score for each hyp - Returns: - decoder_out: B x beam x T2 x V - r_decoder_out: B x beam x T2 x V - best_index: B - """ - B, T, F = encoder_out.shape - bz = self.beam_size - B2 = B * bz - encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F) - encoder_mask = ~make_pad_mask(encoder_lens, T).unsqueeze(1) - encoder_mask = encoder_mask.repeat(1, bz, 1).view(B2, 1, T) - T2 = hyps_pad_sos_eos.shape[2] - 1 - hyps_pad = hyps_pad_sos_eos.view(B2, T2 + 1) - hyps_lens = hyps_lens_sos.view(B2,) - hyps_pad_sos = hyps_pad[:, :-1].contiguous() - hyps_pad_eos = hyps_pad[:, 1:].contiguous() - - r_hyps_pad = r_hyps_pad_sos_eos.view(B2, T2 + 1) - r_hyps_pad_sos = r_hyps_pad[:, :-1].contiguous() - r_hyps_pad_eos = r_hyps_pad[:, 1:].contiguous() - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad_sos, hyps_lens, r_hyps_pad_sos, - self.reverse_weight) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - V = decoder_out.shape[-1] - decoder_out = decoder_out.view(B2, T2, V) - mask = ~make_pad_mask(hyps_lens, T2) # B2 x T2 - # mask index, remove ignore id - index = torch.unsqueeze(hyps_pad_eos * mask, 2) - score = decoder_out.gather(2, index).squeeze(2) # B2 X T2 - # mask padded part - score = score * mask - decoder_out = decoder_out.view(B, bz, T2, V) - if self.reverse_weight > 0: - r_decoder_out = torch.nn.functional.log_softmax( - r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.view(B2, T2, V) - index = torch.unsqueeze(r_hyps_pad_eos * mask, 2) - r_score = r_decoder_out.gather(2, index).squeeze(2) - r_score = r_score * mask - score = score * (1 - self.reverse_weight) + \ - self.reverse_weight * r_score - r_decoder_out = r_decoder_out.view(B, bz, T2, V) - score = torch.sum(score, axis=1) # B2 - score = torch.reshape(score, (B, bz)) + self.ctc_weight * ctc_score - best_index = torch.argmax(score, dim=1) - if self.decoder_fastertransformer: - return decoder_out, best_index - else: - return best_index - - -def to_numpy(tensors): - out = [] - if type(tensors) == torch.tensor: - tensors = [tensors] - for tensor in tensors: - if tensor.requires_grad: - tensor = tensor.detach().cpu().numpy() - else: - tensor = tensor.cpu().numpy() - out.append(tensor) - return out - - -def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True): - for a, b in zip(xlist, blist): - try: - torch.testing.assert_allclose(a, b, rtol=rtol, atol=atol) - except AssertionError as error: - if tolerate_small_mismatch: - print(error) - else: - raise - - -def export_offline_encoder(model, configs, args, logger, encoder_onnx_path): - bz = 32 - seq_len = 100 - beam_size = args.beam_size - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint( - low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - dynamic_axes={ - 'speech': {0: 'B', 1: 'T'}, - 'speech_lengths': {0: 'B'}, - 'encoder_out': {0: 'B', 1: 'T_OUT'}, - 'encoder_out_lens': {0: 'B'}, - 'ctc_log_probs': {0: 'B', 1: 'T_OUT'}, - 'beam_log_probs': {0: 'B', 1: 
'T_OUT'}, - 'beam_log_probs_idx': {0: 'B', 1: 'T_OUT'}, - }, - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - -def export_offline_encoder_static(model, configs, args, logger, encoder_onnx_path): - bz = args.batch_size - seq_len = args.seq_len - beam_size = args.beam_size - - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint(low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - import os - file_name, file_ext = os.path.splitext(encoder_onnx_path) - encoder_onnx_path = file_name + "_bs" + str(bz) + "_seq" + str(seq_len) + "_static.onnx" - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CPUExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - - -def export_online_encoder(model, configs, args, logger, encoder_onnx_path): - decoding_chunk_size = args.decoding_chunk_size - subsampling = model.encoder.embed.subsampling_rate - context = model.encoder.embed.right_context + 1 - decoding_window = (decoding_chunk_size - 1) * subsampling + context - batch_size = 32 - audio_len = decoding_window - feature_size = configs["input_dim"] - output_size = configs["encoder_conf"]["output_size"] - num_layers = configs["encoder_conf"]["num_blocks"] - # in transformer the cnn module will not be available - transformer = False - cnn_module_kernel = configs["encoder_conf"].get("cnn_module_kernel", 1) - 1 - if not cnn_module_kernel: - transformer = True - num_decoding_left_chunks = args.num_decoding_left_chunks - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if configs['encoder'] == 'squeezeformer': - encoder = StreamingSqueezeformerEncoder( - model, required_cache_size, args.beam_size) - elif configs['encoder'] == 'efficientConformer': - encoder = StreamingEfficientConformerEncoder( - model, required_cache_size, args.beam_size) - else: - encoder = StreamingEncoder( - model, required_cache_size, args.beam_size, transformer) - encoder.eval() - - # begin to export encoder - chunk_xs = 
torch.randn(batch_size, audio_len, - feature_size, dtype=torch.float32) - chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len - - offset = torch.arange(0, batch_size).unsqueeze(1) - # (elayers, b, head, cache_t1, d_k * 2) - head = configs["encoder_conf"]["attention_heads"] - d_k = configs["encoder_conf"]["output_size"] // head - att_cache = torch.randn(batch_size, num_layers, head, - required_cache_size, d_k * 2, - dtype=torch.float32) - cnn_cache = torch.randn(batch_size, num_layers, output_size, - cnn_module_kernel, dtype=torch.float32) - - cache_mask = torch.ones( - batch_size, 1, required_cache_size, dtype=torch.float32) - input_names = ['chunk_xs', 'chunk_lens', 'offset', - 'att_cache', 'cnn_cache', 'cache_mask'] - output_names = ['log_probs', 'log_probs_idx', 'chunk_out', - 'chunk_out_lens', 'r_offset', 'r_att_cache', - 'r_cnn_cache', 'r_cache_mask'] - input_tensors = (chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - output_names.pop(6) - - all_names = input_names + output_names - dynamic_axes = {} - for name in all_names: - # only the first dimension is dynamic - # all other dimension is fixed - dynamic_axes[name] = {0: 'B'} - - torch.onnx.export(encoder, - input_tensors, - encoder_onnx_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - verbose=False) - - with torch.no_grad(): - torch_outs = encoder(chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - torch_outs = list(torch_outs).pop(6) - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=["CUDAExecutionProvider"]) - ort_inputs = {} - - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - if transformer: - del ort_inputs['cnn_cache'] - ort_outs = ort_session.run(None, ort_inputs) - test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx streaming encoder succeed!") - onnx_config = { - "subsampling_rate": subsampling, - "context": context, - "decoding_chunk_size": decoding_chunk_size, - "num_decoding_left_chunks": num_decoding_left_chunks, - "beam_size": args.beam_size, - "fp16": args.fp16, - "feat_size": feature_size, - "decoding_window": decoding_window, - "cnn_module_kernel_cache": cnn_module_kernel - } - return onnx_config - - -def export_rescoring_decoder(model, configs, args, - logger, decoder_onnx_path, decoder_fastertransformer): - bz, seq_len = 32, 100 - beam_size = args.beam_size - decoder = Decoder(model.decoder, - model.ctc_weight, - model.reverse_weight, - beam_size, - decoder_fastertransformer) - decoder.eval() - - hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - hyps_lens_sos = torch.randint(low=3, high=seq_len, size=(bz, beam_size), - dtype=torch.int32) - r_hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - - output_size = configs["encoder_conf"]["output_size"] - encoder_out = torch.randn(bz, seq_len, output_size, dtype=torch.float32) - encoder_out_lens = torch.randint( - low=3, high=seq_len, size=(bz,), dtype=torch.int32) - ctc_score = torch.randn(bz, beam_size, dtype=torch.float32) - - input_names = ['encoder_out', 'encoder_out_lens', - 'hyps_pad_sos_eos', 'hyps_lens_sos', - 'r_hyps_pad_sos_eos', 'ctc_score'] - output_names = ['best_index'] - if decoder_fastertransformer: - output_names.insert(0, 'decoder_out') - - 
torch.onnx.export(decoder, - (encoder_out, encoder_out_lens, - hyps_pad_sos_eos, hyps_lens_sos, - r_hyps_pad_sos_eos, ctc_score), - decoder_onnx_path, - export_params=True, - opset_version=13, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes={'encoder_out': {0: 'B', 1: 'T'}, - 'encoder_out_lens': {0: 'B'}, - 'hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'hyps_lens_sos': {0: 'B'}, - 'r_hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'ctc_score': {0: 'B'}, - 'best_index': {0: 'B'}, - }, - verbose=False - ) - with torch.no_grad(): - o0 = decoder(encoder_out, - encoder_out_lens, - hyps_pad_sos_eos, - hyps_lens_sos, - r_hyps_pad_sos_eos, - ctc_score) - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(decoder_onnx_path, - providers=providers) - - input_tensors = [encoder_out, encoder_out_lens, hyps_pad_sos_eos, - hyps_lens_sos, r_hyps_pad_sos_eos, ctc_score] - ort_inputs = {} - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - - # if model.reverse weight == 0, - # the r_hyps_pad will be removed - # from the onnx decoder since it doen't play any role - if model.reverse_weight == 0: - del ort_inputs['r_hyps_pad_sos_eos'] - ort_outs = ort_session.run(None, ort_inputs) - - # check decoder output - if decoder_fastertransformer: - test(to_numpy(o0), ort_outs, rtol=1e-03, atol=1e-05) - else: - test(to_numpy([o0]), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx decoder succeed!") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='export x86_gpu model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--cmvn_file', required=False, default='', type=str, - help='global_cmvn file, default path is in config file') - parser.add_argument('--reverse_weight', default=-1.0, type=float, - required=False, - help='reverse weight for bitransformer,' + - 'default value is in config file') - parser.add_argument('--ctc_weight', default=-1.0, type=float, - required=False, - help='ctc weight, default value is in config file') - parser.add_argument('--batch_size', type=int, default=24, help='encoder batch size') - parser.add_argument('--seq_len', default=512, type=int, required=False, - help="Encoder seq_len") - parser.add_argument('--beam_size', default=10, type=int, required=False, - help="beam size would be ctc output size") - parser.add_argument('--output_onnx_dir', - default="onnx_model", - help='output onnx encoder and decoder directory') - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - # arguments for streaming encoder - parser.add_argument('--streaming', - action='store_true', - help="whether to export streaming encoder, default false") - parser.add_argument('--decoding_chunk_size', - default=16, - type=int, - required=False, - help='the decoding chunk size, <=0 is not supported') - parser.add_argument('--num_decoding_left_chunks', - default=5, - type=int, - required=False, - help="number of left chunks, <= 0 is not supported") - parser.add_argument('--decoder_fastertransformer', - action='store_true', - help='return decoder_out and best_index for ft') - args = parser.parse_args() - - torch.manual_seed(0) - torch.set_printoptions(precision=10) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if 
args.cmvn_file and os.path.exists(args.cmvn_file): - configs['cmvn_file'] = args.cmvn_file - if args.reverse_weight != -1.0 and 'reverse_weight' in configs['model_conf']: - configs['model_conf']['reverse_weight'] = args.reverse_weight - print("Update reverse weight to", args.reverse_weight) - if args.ctc_weight != -1: - print("Update ctc weight to ", args.ctc_weight) - configs['model_conf']['ctc_weight'] = args.ctc_weight - configs["encoder_conf"]["use_dynamic_chunk"] = False - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - - if not os.path.exists(args.output_onnx_dir): - os.mkdir(args.output_onnx_dir) - encoder_onnx_path = os.path.join(args.output_onnx_dir, 'encoder.onnx') - export_enc_func = None - if args.streaming: - assert args.decoding_chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - export_enc_func = export_online_encoder - else: - export_enc_func = export_offline_encoder_static - - onnx_config = export_enc_func( - model, configs, args, logger, encoder_onnx_path) - - decoder_onnx_path = os.path.join(args.output_onnx_dir, 'decoder.onnx') - export_rescoring_decoder(model, configs, args, logger, - decoder_onnx_path, args.decoder_fastertransformer) - - if args.fp16: - try: - import onnxmltools - from onnxmltools.utils.float16_converter import convert_float_to_float16 - except ImportError: - print('Please install onnxmltools!') - sys.exit(1) - encoder_onnx_model = onnxmltools.utils.load_model(encoder_onnx_path) - encoder_onnx_model = convert_float_to_float16(encoder_onnx_model) - encoder_onnx_path = os.path.join( - args.output_onnx_dir, 'encoder_fp16.onnx') - onnxmltools.utils.save_model(encoder_onnx_model, encoder_onnx_path) - decoder_onnx_model = onnxmltools.utils.load_model(decoder_onnx_path) - decoder_onnx_model = convert_float_to_float16(decoder_onnx_model) - decoder_onnx_path = os.path.join( - args.output_onnx_dir, 'decoder_fp16.onnx') - onnxmltools.utils.save_model(decoder_onnx_model, decoder_onnx_path) - # dump configurations - - config_dir = os.path.join(args.output_onnx_dir, "config.yaml") - with open(config_dir, "w") as out: - yaml.dump(onnx_config, out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/recognize.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/recognize.py deleted file mode 100644 index 03b5dfd42cc098efacd20e08756a5300f6477cc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/recognize.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
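# A minimal sketch of the FP16 conversion step used by the export script above:
# onnxmltools loads the exported ONNX graph, casts its float32 tensors to float16,
# and the converted model is saved next to the FP32 one. The path handling here is
# illustrative only; actual file names come from --output_onnx_dir.
import os
import onnxmltools
from onnxmltools.utils.float16_converter import convert_float_to_float16

def convert_to_fp16(onnx_path: str) -> str:
    model = onnxmltools.utils.load_model(onnx_path)
    model = convert_float_to_float16(model)
    fp16_path = os.path.splitext(onnx_path)[0] + "_fp16.onnx"
    onnxmltools.utils.save_model(model, fp16_path)
    return fp16_path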
- -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--beam_size', - type=int, - default=10, - help='beam size for search') - parser.add_argument('--penalty', - type=float, - default=0.0, - help='length penalty') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=16, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'attention', 'ctc_greedy_search', - 'ctc_prefix_beam_search', 'attention_rescoring', - 'rnnt_greedy_search', 'rnnt_beam_search', - 'rnnt_beam_attn_rescoring', 'ctc_beam_td_attn_rescoring', - 'hlg_onebest', 'hlg_rescore' - ], - default='attention', - help='decoding mode') - - parser.add_argument('--search_ctc_weight', - type=float, - default=1.0, - help='ctc weight for nbest generation') - parser.add_argument('--search_transducer_weight', - type=float, - default=0.0, - help='transducer weight for nbest generation') - parser.add_argument('--ctc_weight', - type=float, - default=0.0, - help='ctc weight for rescoring weight in \ - attention rescoring decode mode \ - ctc weight for rescoring weight in \ - transducer attention rescore decode mode') - - parser.add_argument('--transducer_weight', - type=float, - default=0.0, - help='transducer weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--attn_weight', - type=float, - default=0.0, - help='attention weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--decoding_chunk_size', - type=int, - default=-1, - help='''decoding chunk size, - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here''') - parser.add_argument('--num_decoding_left_chunks', - type=int, - default=-1, - help='number of left chunks for decoding') - parser.add_argument('--simulate_streaming', - action='store_true', - help='simulate streaming inference') - parser.add_argument('--reverse_weight', - type=float, - default=0.0, - help='''right to left weight for attention rescoring - decode mode''') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--connect_symbol', - default='', - type=str, - help='used to connect the output characters') - - parser.add_argument('--word', - default='', - type=str, - help='word file, only used for hlg decode') - parser.add_argument('--hlg', - default='', - type=str, - help='hlg file, only used for hlg decode') - parser.add_argument('--lm_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--r_decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' - ] and args.batch_size > 1: - logging.fatal( - 'decoding mode {} must be running with batch_size == 1'.format( - args.mode)) - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_sub'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - if 'fbank_conf' in test_conf: - test_conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in test_conf: - test_conf['mfcc_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - # Load dict - char_dict = {v: k for k, v in symbol_table.items()} - eos = len(char_dict) - 1 - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w') as fout: - for batch_idx, 
batch in enumerate(test_data_loader): - keys, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - if args.mode == 'attention': - hyps, _ = model.recognize( - feats, - feats_lengths, - beam_size=args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp.tolist() for hyp in hyps] - elif args.mode == 'ctc_greedy_search': - hyps, _ = model.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_greedy_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_beam_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.beam_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.search_ctc_weight, - transducer_weight=args.search_transducer_weight) - elif args.mode == 'rnnt_beam_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight) - elif args.mode == 'ctc_beam_td_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight, - beam_search_type='ctc') - # ctc_prefix_beam_search and attention_rescoring only return one - # result in List[int], change it to List[List[int]] for compatible - # with other batch decoding mode - elif args.mode == 'ctc_prefix_beam_search': - assert (feats.size(0) == 1) - hyp, _ = model.ctc_prefix_beam_search( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp] - elif args.mode == 'attention_rescoring': - assert (feats.size(0) == 1) - hyp, _ = model.attention_rescoring( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - 
ctc_weight=args.ctc_weight, - simulate_streaming=args.simulate_streaming, - reverse_weight=args.reverse_weight) - hyps = [hyp] - elif args.mode == 'hlg_onebest': - hyps = model.hlg_onebest( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - elif args.mode == 'hlg_rescore': - hyps = model.hlg_rescore( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - lm_scale=args.lm_scale, - decoder_scale=args.decoder_scale, - r_decoder_scale=args.r_decoder_scale, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - for i, key in enumerate(keys): - content = [] - for w in hyps[i]: - if w == eos: - break - content.append(char_dict[w]) - logging.info('{} {}'.format(key, args.connect_symbol.join(content))) - fout.write('{} {}\n'.format(key, args.connect_symbol.join(content))) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/recognize_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/recognize_onnx_gpu.py deleted file mode 100644 index 42f403bf55ac0bc51d9c754574d3479345948122..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/recognize_onnx_gpu.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is for testing exported onnx encoder and decoder from -export_onnx_gpu.py. The exported onnx models only support batch offline ASR inference. -It requires a python wrapped c++ ctc decoder. 
-Please install it by following: -https://github.com/Slyne/ctc_decoder.git -""" -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.common import IGNORE_ID -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.config import override_config - -import onnxruntime as rt -import multiprocessing -import numpy as np - -try: - from swig_decoders import map_batch, \ - ctc_beam_search_decoder_batch, \ - TrieVector, PathTrie -except ImportError: - print('Please install ctc decoders first by refering to\n' + - 'https://github.com/Slyne/ctc_decoder.git') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--encoder_onnx', required=True, help='encoder onnx file') - parser.add_argument('--decoder_onnx', required=True, help='decoder onnx file') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=32, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'ctc_greedy_search', 'ctc_prefix_beam_search', - 'attention_rescoring'], - default='attention_rescoring', - help='decoding mode') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - reverse_weight = configs["model_conf"].get("reverse_weight", 0.0) - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - test_conf['fbank_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - 
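# A minimal sketch of the ONNX Runtime setup used below: prefer the CUDA execution
# provider when it is available, otherwise fall back to CPU, then feed the exported
# encoder a padded feature batch. The model path and tensor shapes are placeholders.
import numpy as np
import onnxruntime as rt

def run_encoder(encoder_onnx: str, feats: np.ndarray, feats_lengths: np.ndarray):
    available = rt.get_available_providers()
    providers = (["CUDAExecutionProvider", "CPUExecutionProvider"]
                 if "CUDAExecutionProvider" in available
                 else ["CPUExecutionProvider"])
    session = rt.InferenceSession(encoder_onnx, providers=providers)
    feed = {
        session.get_inputs()[0].name: feats,          # (B, T, feat_dim), float32 or float16
        session.get_inputs()[1].name: feats_lengths,  # (B,), int32
    }
    # For an encoder exported with the output names used in this script, run()
    # returns encoder_out, encoder_out_lens, ctc_log_probs, beam_log_probs
    # and beam_log_probs_idx.
    return session.run(None, feed)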
- # Init asr model from configs - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - if use_cuda: - EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - else: - EP_list = ['CPUExecutionProvider'] - - encoder_ort_session = rt.InferenceSession(args.encoder_onnx, providers=EP_list) - decoder_ort_session = None - if args.mode == "attention_rescoring": - decoder_ort_session = rt.InferenceSession(args.decoder_onnx, providers=EP_list) - - # Load dict - vocabulary = [] - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - vocabulary.append(arr[0]) - eos = sos = len(char_dict) - 1 - with torch.no_grad(), open(args.result_file, 'w') as fout: - for _, batch in enumerate(test_data_loader): - keys, feats, _, feats_lengths, _ = batch - feats, feats_lengths = feats.numpy(), feats_lengths.numpy() - if args.fp16: - feats = feats.astype(np.float16) - ort_inputs = { - encoder_ort_session.get_inputs()[0].name: feats, - encoder_ort_session.get_inputs()[1].name: feats_lengths} - ort_outs = encoder_ort_session.run(None, ort_inputs) - encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx = ort_outs - beam_size = beam_log_probs.shape[-1] - batch_size = beam_log_probs.shape[0] - num_processes = min(multiprocessing.cpu_count(), batch_size) - if args.mode == 'ctc_greedy_search': - if beam_size != 1: - log_probs_idx = beam_log_probs_idx[:, :, 0] - batch_sents = [] - for idx, seq in enumerate(log_probs_idx): - batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) - hyps = map_batch(batch_sents, vocabulary, num_processes, - True, 0) - elif args.mode in ('ctc_prefix_beam_search', "attention_rescoring"): - batch_log_probs_seq_list = beam_log_probs.tolist() - batch_log_probs_idx_list = beam_log_probs_idx.tolist() - batch_len_list = encoder_out_lens.tolist() - batch_log_probs_seq = [] - batch_log_probs_ids = [] - batch_start = [] # only effective in streaming deployment - batch_root = TrieVector() - root_dict = {} - for i in range(len(batch_len_list)): - num_sent = batch_len_list[i] - batch_log_probs_seq.append( - batch_log_probs_seq_list[i][0:num_sent]) - batch_log_probs_ids.append( - batch_log_probs_idx_list[i][0:num_sent]) - root_dict[i] = PathTrie() - batch_root.append(root_dict[i]) - batch_start.append(True) - score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq, - batch_log_probs_ids, - batch_root, - batch_start, - beam_size, - num_processes, - 0, -2, 0.99999) - if args.mode == 'ctc_prefix_beam_search': - hyps = [] - for cand_hyps in score_hyps: - hyps.append(cand_hyps[0][1]) - hyps = map_batch(hyps, vocabulary, num_processes, False, 0) - if args.mode == 'attention_rescoring': - ctc_score, all_hyps = [], [] - max_len = 0 - for hyps in score_hyps: - cur_len = len(hyps) - if len(hyps) < beam_size: - hyps += (beam_size - cur_len) * [(-float("INF"), (0,))] - cur_ctc_score = [] - for hyp in hyps: - cur_ctc_score.append(hyp[0]) - all_hyps.append(list(hyp[1])) - if len(hyp[1]) > max_len: - max_len = len(hyp[1]) - ctc_score.append(cur_ctc_score) - if args.fp16: - ctc_score = np.array(ctc_score, dtype=np.float16) - else: - ctc_score = np.array(ctc_score, dtype=np.float32) - hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - r_hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32) - k = 0 - for i in 
range(batch_size): - for j in range(beam_size): - cand = all_hyps[k] - l = len(cand) + 2 - hyps_pad_sos_eos[i][j][0:l] = [sos] + cand + [eos] - r_hyps_pad_sos_eos[i][j][0:l] = [sos] + cand[::-1] + [eos] - hyps_lens_sos[i][j] = len(cand) + 1 - k += 1 - decoder_ort_inputs = { - decoder_ort_session.get_inputs()[0].name: encoder_out, - decoder_ort_session.get_inputs()[1].name: encoder_out_lens, - decoder_ort_session.get_inputs()[2].name: hyps_pad_sos_eos, - decoder_ort_session.get_inputs()[3].name: hyps_lens_sos, - decoder_ort_session.get_inputs()[-1].name: ctc_score} - if reverse_weight > 0: - r_hyps_pad_sos_eos_name = decoder_ort_session.get_inputs()[4].name - decoder_ort_inputs[r_hyps_pad_sos_eos_name] = r_hyps_pad_sos_eos - best_index = decoder_ort_session.run(None, decoder_ort_inputs)[0] - best_sents = [] - k = 0 - for idx in best_index: - cur_best_sent = all_hyps[k: k + beam_size][idx] - best_sents.append(cur_best_sent) - k += beam_size - hyps = map_batch(best_sents, vocabulary, num_processes) - - for i, key in enumerate(keys): - content = hyps[i] - logging.info('{} {}'.format(key, content)) - fout.write('{} {}\n'.format(key, content)) - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/train.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/train.py deleted file mode 100644 index 70799b60790b31d73911770891f519f5473e2f4b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/bin/train.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
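# A minimal sketch of the hypothesis padding performed in the attention_rescoring
# branch above: each CTC beam candidate is wrapped with <sos>/<eos>, a reversed
# copy is built for the right-to-left decoder, and all candidates are padded to a
# common length with an ignore id (assumed -1, matching wenet's IGNORE_ID).
import numpy as np

def pad_hyps(all_hyps, batch_size, beam_size, sos, eos, ignore_id=-1):
    max_len = max(len(h) for h in all_hyps)
    hyps = np.full((batch_size, beam_size, max_len + 2), ignore_id, dtype=np.int64)
    r_hyps = np.full((batch_size, beam_size, max_len + 2), ignore_id, dtype=np.int64)
    hyps_lens = np.ones((batch_size, beam_size), dtype=np.int32)
    k = 0
    for i in range(batch_size):
        for j in range(beam_size):
            cand = list(all_hyps[k])
            hyps[i, j, :len(cand) + 2] = [sos] + cand + [eos]
            r_hyps[i, j, :len(cand) + 2] = [sos] + cand[::-1] + [eos]
            hyps_lens[i, j] = len(cand) + 1
            k += 1
    return hyps, r_hyps, hyps_lens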
- -from __future__ import print_function - -import argparse -import copy -import logging -import os - -import torch -import torch.distributed as dist -import torch.optim as optim -import yaml -from tensorboardX import SummaryWriter -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import (load_checkpoint, save_checkpoint, - load_trained_modules) -from wenet.utils.executor import Executor -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.scheduler import WarmupLR, NoamHoldAnnealing -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--train_data', required=True, help='train data file') - parser.add_argument('--cv_data', required=True, help='cv data file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--model_dir', required=True, help='save model dir') - parser.add_argument('--checkpoint', help='checkpoint model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='tensorboard log dir') - parser.add_argument('--ddp.rank', - dest='rank', - default=0, - type=int, - help='global rank for distributed training') - parser.add_argument('--ddp.world_size', - dest='world_size', - default=-1, - type=int, - help='''number of total processes/gpus for - distributed training''') - parser.add_argument('--ddp.dist_backend', - dest='dist_backend', - default='nccl', - choices=['nccl', 'gloo'], - help='distributed backend') - parser.add_argument('--ddp.init_method', - dest='init_method', - default=None, - help='ddp init method') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for reading') - parser.add_argument('--pin_memory', - action='store_true', - default=False, - help='Use pinned memory buffers used for reading') - parser.add_argument('--use_amp', - action='store_true', - default=False, - help='Use automatic mixed precision training') - parser.add_argument('--fp16_grad_sync', - action='store_true', - default=False, - help='Use fp16 gradient sync for ddp') - parser.add_argument('--cmvn', default=None, help='global cmvn file') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. 
One symbol per line.") - parser.add_argument('--prefetch', - default=100, - type=int, - help='prefetch number') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument("--enc_init", - default=None, - type=str, - help="Pre-trained model to initialize encoder") - parser.add_argument("--enc_init_mods", - default="encoder.", - type=lambda s: [str(mod) for mod in s.split(",") if s != ""], - help="List of encoder modules \ - to initialize ,separated by a comma") - - - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Set random seed - torch.manual_seed(777) - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - distributed = args.world_size > 1 - if distributed: - logging.info('training on multiple gpus, this gpu {}'.format(args.gpu)) - dist.init_process_group(args.dist_backend, - init_method=args.init_method, - world_size=args.world_size, - rank=args.rank) - - symbol_table = read_symbol_table(args.symbol_table) - - train_conf = configs['dataset_conf'] - cv_conf = copy.deepcopy(train_conf) - cv_conf['speed_perturb'] = False - cv_conf['spec_aug'] = False - cv_conf['spec_sub'] = False - cv_conf['spec_trim'] = False - cv_conf['shuffle'] = False - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - train_dataset = Dataset(args.data_type, args.train_data, symbol_table, - train_conf, args.bpe_model, non_lang_syms, True) - cv_dataset = Dataset(args.data_type, - args.cv_data, - symbol_table, - cv_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - train_data_loader = DataLoader(train_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - cv_data_loader = DataLoader(cv_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - - if 'fbank_conf' in configs['dataset_conf']: - input_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - else: - input_dim = configs['dataset_conf']['mfcc_conf']['num_mel_bins'] - vocab_size = len(symbol_table) - - # Save configs to model_dir/train.yaml for inference and export - configs['input_dim'] = input_dim - configs['output_dim'] = vocab_size - configs['cmvn_file'] = args.cmvn - configs['is_json_cmvn'] = True - if args.rank == 0: - saved_config_path = os.path.join(args.model_dir, 'train.yaml') - with open(saved_config_path, 'w') as fout: - data = yaml.dump(configs) - fout.write(data) - - # Init asr model from configs - model = init_model(configs) - print(model) - num_params = sum(p.numel() for p in model.parameters()) - print('the number of model params: {:,d}'.format(num_params)) - - # !!!IMPORTANT!!! 
- # Try to export the model by script, if fails, we should refine - # the code to satisfy the script export requirements - if args.rank == 0: - script_model = torch.jit.script(model) - script_model.save(os.path.join(args.model_dir, 'init.zip')) - executor = Executor() - # If specify checkpoint, load some info from checkpoint - if args.checkpoint is not None: - infos = load_checkpoint(model, args.checkpoint) - elif args.enc_init is not None: - logging.info('load pretrained encoders: {}'.format(args.enc_init)) - infos = load_trained_modules(model, args) - else: - infos = {} - start_epoch = infos.get('epoch', -1) + 1 - cv_loss = infos.get('cv_loss', 0.0) - step = infos.get('step', -1) - - num_epochs = configs.get('max_epoch', 100) - model_dir = args.model_dir - writer = None - if args.rank == 0: - os.makedirs(model_dir, exist_ok=True) - exp_id = os.path.basename(model_dir) - writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id)) - - if distributed: - assert (torch.cuda.is_available()) - # cuda model is required for nn.parallel.DistributedDataParallel - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, find_unused_parameters=True) - device = torch.device("cuda") - if args.fp16_grad_sync: - from torch.distributed.algorithms.ddp_comm_hooks import ( - default as comm_hooks, - ) - model.register_comm_hook( - state=None, hook=comm_hooks.fp16_compress_hook - ) - else: - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - if configs['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['optim_conf']) - elif configs['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['optim']) - if configs['scheduler'] == 'warmuplr': - scheduler = WarmupLR(optimizer, **configs['scheduler_conf']) - elif configs['scheduler'] == 'NoamHoldAnnealing': - scheduler = NoamHoldAnnealing(optimizer, **configs['scheduler_conf']) - else: - raise ValueError("unknown scheduler: " + configs['scheduler']) - - final_epoch = None - configs['rank'] = args.rank - configs['is_distributed'] = distributed - configs['use_amp'] = args.use_amp - if start_epoch == 0 and args.rank == 0: - save_model_path = os.path.join(model_dir, 'init.pt') - save_checkpoint(model, save_model_path) - - # Start training loop - executor.step = step - scheduler.set_step(step) - # used for pytorch amp mixed precision training - scaler = None - if args.use_amp: - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(start_epoch, num_epochs): - train_dataset.set_epoch(epoch) - configs['epoch'] = epoch - lr = optimizer.param_groups[0]['lr'] - logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr)) - executor.train(model, optimizer, scheduler, train_data_loader, device, - writer, configs, scaler) - total_loss, num_seen_utts = executor.cv(model, cv_data_loader, device, - configs) - cv_loss = total_loss / num_seen_utts - - logging.info('Epoch {} CV info cv_loss {}'.format(epoch, cv_loss)) - if args.rank == 0: - save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch)) - save_checkpoint( - model, save_model_path, { - 'epoch': epoch, - 'lr': lr, - 'cv_loss': cv_loss, - 'step': executor.step - }) - writer.add_scalar('epoch/cv_loss', cv_loss, epoch) - writer.add_scalar('epoch/lr', lr, epoch) - final_epoch = epoch - - if final_epoch is not None and args.rank == 0: - final_model_path = os.path.join(model_dir, 'final.pt') 
- os.remove(final_model_path) if os.path.exists(final_model_path) else None - os.symlink('{}.pt'.format(final_epoch), final_model_path) - writer.close() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/dataset/dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/dataset/dataset.py deleted file mode 100644 index 6d799b5b5aea2d34546484b3fed5d45e2d5b6aa6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/dataset/dataset.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import torch -import torch.distributed as dist -from torch.utils.data import IterableDataset - -import wenet.dataset.processor as processor -from wenet.utils.file_utils import read_lists - - -class Processor(IterableDataset): - def __init__(self, source, f, *args, **kw): - assert callable(f) - self.source = source - self.f = f - self.args = args - self.kw = kw - - def set_epoch(self, epoch): - self.source.set_epoch(epoch) - - def __iter__(self): - """ Return an iterator over the source dataset processed by the - given processor. 
- """ - assert self.source is not None - assert callable(self.f) - return self.f(iter(self.source), *self.args, **self.kw) - - def apply(self, f): - assert callable(f) - return Processor(self, f, *self.args, **self.kw) - - -class DistributedSampler: - def __init__(self, shuffle=True, partition=True): - self.epoch = -1 - self.update() - self.shuffle = shuffle - self.partition = partition - - def update(self): - assert dist.is_available() - if dist.is_initialized(): - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() - else: - self.rank = 0 - self.world_size = 1 - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: - self.worker_id = 0 - self.num_workers = 1 - else: - self.worker_id = worker_info.id - self.num_workers = worker_info.num_workers - return dict(rank=self.rank, - world_size=self.world_size, - worker_id=self.worker_id, - num_workers=self.num_workers) - - def set_epoch(self, epoch): - self.epoch = epoch - - def sample(self, data): - """ Sample data according to rank/world_size/num_workers - - Args: - data(List): input data list - - Returns: - List: data list after sample - """ - data = list(range(len(data))) - # TODO(Binbin Zhang): fix this - # We can not handle uneven data for CV on DDP, so we don't - # sample data by rank, that means every GPU gets the same - # and all the CV data - if self.partition: - if self.shuffle: - random.Random(self.epoch).shuffle(data) - data = data[self.rank::self.world_size] - data = data[self.worker_id::self.num_workers] - return data - - -class DataList(IterableDataset): - def __init__(self, lists, shuffle=True, partition=True): - self.lists = lists - self.sampler = DistributedSampler(shuffle, partition) - - def set_epoch(self, epoch): - self.sampler.set_epoch(epoch) - - def __iter__(self): - sampler_info = self.sampler.update() - indexes = self.sampler.sample(self.lists) - for index in indexes: - # yield dict(src=src) - data = dict(src=self.lists[index]) - data.update(sampler_info) - yield data - - -def Dataset(data_type, - data_list_file, - symbol_table, - conf, - bpe_model=None, - non_lang_syms=None, - partition=True): - """ Construct dataset from arguments - - We have two shuffle stage in the Dataset. The first is global - shuffle at shards tar/raw file level. The second is global shuffle - at training samples level. 
- - Args: - data_type(str): raw/shard - bpe_model(str): model for english bpe part - partition(bool): whether to do data partition in terms of rank - """ - assert data_type in ['raw', 'shard'] - lists = read_lists(data_list_file) - shuffle = conf.get('shuffle', True) - dataset = DataList(lists, shuffle=shuffle, partition=partition) - if data_type == 'shard': - dataset = Processor(dataset, processor.url_opener) - dataset = Processor(dataset, processor.tar_file_and_group) - else: - dataset = Processor(dataset, processor.parse_raw) - - dataset = Processor(dataset, processor.tokenize, symbol_table, bpe_model, - non_lang_syms, conf.get('split_with_space', False)) - filter_conf = conf.get('filter_conf', {}) - dataset = Processor(dataset, processor.filter, **filter_conf) - - resample_conf = conf.get('resample_conf', {}) - dataset = Processor(dataset, processor.resample, **resample_conf) - - speed_perturb = conf.get('speed_perturb', False) - if speed_perturb: - dataset = Processor(dataset, processor.speed_perturb) - - feats_type = conf.get('feats_type', 'fbank') - assert feats_type in ['fbank', 'mfcc'] - if feats_type == 'fbank': - fbank_conf = conf.get('fbank_conf', {}) - dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) - elif feats_type == 'mfcc': - mfcc_conf = conf.get('mfcc_conf', {}) - dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf) - - spec_aug = conf.get('spec_aug', True) - spec_sub = conf.get('spec_sub', False) - spec_trim = conf.get('spec_trim', False) - if spec_aug: - spec_aug_conf = conf.get('spec_aug_conf', {}) - dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) - if spec_sub: - spec_sub_conf = conf.get('spec_sub_conf', {}) - dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf) - if spec_trim: - spec_trim_conf = conf.get('spec_trim_conf', {}) - dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf) - - if shuffle: - shuffle_conf = conf.get('shuffle_conf', {}) - dataset = Processor(dataset, processor.shuffle, **shuffle_conf) - - sort = conf.get('sort', True) - if sort: - sort_conf = conf.get('sort_conf', {}) - dataset = Processor(dataset, processor.sort, **sort_conf) - - batch_conf = conf.get('batch_conf', {}) - dataset = Processor(dataset, processor.batch, **batch_conf) - dataset = Processor(dataset, processor.padding) - return dataset diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/dataset/kaldi_io.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/dataset/kaldi_io.py deleted file mode 100644 index c9bef293c93d882147bb5b738e1fc49a7a19a484..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/dataset/kaldi_io.py +++ /dev/null @@ -1,666 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! 
To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. - """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. 
- """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int_scp(file_or_fd): - """ generator(key,vec) = read_vec_int_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_int_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:vec for key,mat in kaldi_io.read_vec_int_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_int(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! 
- if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Mapping for percentiles in col-headers, - def uint16_to_float(value, min, range): - return np.float32(min + range * 1.52590218966964e-05 * value) - - # Mapping for matrix elements, - def uint8_to_float_v2(vec, p0, p25, p75, p100): - # Split the vector by masks, - mask_0_64 = (vec <= 64); - mask_193_255 = (vec > 192); - mask_65_192 = (~(mask_0_64 | mask_193_255)); - # Sanity check (useful but slow...), - # assert(len(vec) == np.sum(np.hstack([mask_0_64,mask_65_192,mask_193_255]))) - # assert(len(vec) == np.sum(np.any([mask_0_64,mask_65_192,mask_193_255], axis=0))) - # Build the float vector, - ans = np.empty(len(vec), dtype='float32') - ans[mask_0_64] = p0 + (p25 - p0) / 64. * vec[mask_0_64] - ans[mask_65_192] = p25 + (p75 - p25) / 128. * (vec[mask_65_192] - 64) - ans[mask_193_255] = p75 + (p100 - p75) / 63. * (vec[mask_193_255] - 192) - return ans - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.empty((cols,rows), dtype='float32') - for i, col_header in enumerate(col_headers): - col_header_flt = [ uint16_to_float(percentile, globmin, globrange) for percentile in col_header ] - mat[i] = uint8_to_float_v2(data[i], *col_header_flt) - - return mat.T # transpose! col-major -> row-major, - -def write_ark_scp(key, mat, ark_fout, scp_out): - mat_offset = write_mat(ark_fout, mat, key) - scp_line = '{}\t{}:{}'.format(key, ark_fout.name, mat_offset) - scp_out.write(scp_line) - scp_out.write('\n') - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. 
- - Example of writing single matrix: - kaldi_io.write_mat(filename, mat) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,mat in dict.iteritems(): - kaldi_io.write_mat(f, mat, key=key) - """ - mat_offset = 0 - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - mat_offset = fd.tell() - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if m.dtype == 'float32': fd.write('FM '.encode()) - elif m.dtype == 'float64': fd.write('DM '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) - # Dims, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols - # Data, - fd.write(m.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - return mat_offset - -################################################# -# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) -# Corresponds to: vector > > -# - outer vector: time axis -# - inner vector: records at the time -# - tuple: int = index, float = value -# - -def read_cnet_ark(file_or_fd): - """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ - return read_post_ark(file_or_fd) - -def read_post_ark(file_or_fd): - """ generator(key,vec>) = read_post_ark(file) - Returns generator of (key,posterior) tuples, read from ark file. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Iterate the ark: - for key,post in kaldi_io.read_post_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - post = read_post(fd) - yield key, post - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_post(file_or_fd): - """ [post] = read_post(file_or_fd) - Reads single kaldi 'Posterior' in binary format. - - The 'Posterior' is C++ type 'vector > >', - the outer-vector is usually time axis, inner-vector are the records - at given time, and the tuple is composed of an 'index' (integer) - and a 'float-value'. The 'float-value' can represent a probability - or any other numeric value. - - Returns vector of vectors of tuples. - """ - fd = open_or_fd(file_or_fd) - ans=[] - binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - # Loop over 'outer-vector', - for i in range(outer_vec_size): - assert(fd.read(1).decode() == '\4'); # int-size - inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) - data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) - assert(data[0]['size_idx'] == 4) - assert(data[0]['size_post'] == 4) - ans.append(data[['idx','post']].tolist()) - - if fd is not file_or_fd: fd.close() - return ans - - -################################################# -# Kaldi Confusion Network bin begin/end times, -# (kaldi stores CNs time info separately from the Posterior). 
-# - -def read_cntime_ark(file_or_fd): - """ generator(key,vec>) = read_cntime_ark(file_or_fd) - Returns generator of (key,cntime) tuples, read from ark file. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Iterate the ark: - for key,time in kaldi_io.read_cntime_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:time for key,time in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - cntime = read_cntime(fd) - yield key, cntime - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_cntime(file_or_fd): - """ [cntime] = read_cntime(file_or_fd) - Reads single kaldi 'Confusion Network time info', in binary format: - C++ type: vector >. - (begin/end times of bins at the confusion network). - - Binary layout is ' ...' - - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Returns vector of tuples. - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary - - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) - assert(data[0]['size_beg'] == 4) - assert(data[0]['size_end'] == 4) - ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), - - if fd is not file_or_fd : fd.close() - return ans - - -################################################# -# Segments related, -# - -# Segments as 'Bool vectors' can be handy, -# - for 'superposing' the segmentations, -# - for frame-selection in Speaker-ID experiments, -def read_segments_as_bool_vec(segments_file): - """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) - using kaldi 'segments' file for 1 wav, format : ' ' - - t-beg, t-end is in seconds, - - assumed 100 frames/second, - """ - segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) - # Sanity checks, - assert(len(segs) > 0) # empty segmentation is an error, - assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, - # Convert time to frame-indexes, - start = np.rint([100 * rec[2] for rec in segs]).astype(int) - end = np.rint([100 * rec[3] for rec in segs]).astype(int) - # Taken from 'read_lab_to_bool_vec', htk.py, - frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], - np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) - assert np.sum(end-start) == np.sum(frms) - return frms - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/dataset/processor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/dataset/processor.py deleted file mode 100644 index b4bd07ce674eb3288cd1b13a09085eec48d40845..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/dataset/processor.py +++ /dev/null @@ -1,660 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import json -import random -import re -import tarfile -from subprocess import PIPE, Popen -from urllib.parse import urlparse - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.nn.utils.rnn import pad_sequence - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def url_opener(data): - """ Give url or local file, return file descriptor - Inplace operation. - - Args: - data(Iterable[str]): url or local file list - - Returns: - Iterable[{src, stream}] - """ - for sample in data: - assert 'src' in sample - # TODO(Binbin Zhang): support HTTP - url = sample['src'] - try: - pr = urlparse(url) - # local file - if pr.scheme == '' or pr.scheme == 'file': - stream = open(url, 'rb') - # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP - else: - cmd = f'wget -q -O - {url}' - process = Popen(cmd, shell=True, stdout=PIPE) - sample.update(process=process) - stream = process.stdout - sample.update(stream=stream) - yield sample - except Exception as ex: - logging.warning('Failed to open {}'.format(url)) - - -def tar_file_and_group(data): - """ Expand a stream of open tar files into a stream of tar file contents. - And groups the file with same prefix - - Args: - data: Iterable[{src, stream}] - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'stream' in sample - stream = tarfile.open(fileobj=sample['stream'], mode="r|*") - prev_prefix = None - example = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - example['key'] = prev_prefix - if valid: - yield example - example = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - example['txt'] = file_obj.read().decode('utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load(file_obj) - example['wav'] = waveform - example['sample_rate'] = sample_rate - else: - example[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning('error to parse {}'.format(name)) - prev_prefix = prefix - if prev_prefix is not None: - example['key'] = prev_prefix - yield example - stream.close() - if 'process' in sample: - sample['process'].communicate() - sample['stream'].close() - - -def parse_raw(data): - """ Parse key/wav/txt from json line - - Args: - data: Iterable[str], str is a json line has key/wav/txt - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'src' in sample - json_line = sample['src'] - obj = json.loads(json_line) - assert 'key' in obj - assert 'wav' in obj - assert 'txt' in obj - key = obj['key'] - wav_file = obj['wav'] - txt = obj['txt'] - try: - if 'start' in obj: - assert 'end' in obj - sample_rate = torchaudio.backend.sox_io_backend.info( - wav_file).sample_rate - start_frame = int(obj['start'] * sample_rate) - end_frame = int(obj['end'] * sample_rate) - waveform, _ = torchaudio.backend.sox_io_backend.load( - 
filepath=wav_file, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(wav_file) - example = dict(key=key, - txt=txt, - wav=waveform, - sample_rate=sample_rate) - yield example - except Exception as ex: - logging.warning('Failed to read {}'.format(wav_file)) - - -def filter(data, - max_length=10240, - min_length=10, - token_max_length=200, - token_min_length=1, - min_output_input_ratio=0.0005, - max_output_input_ratio=1): - """ Filter sample according to feature and label length - Inplace operation. - - Args:: - data: Iterable[{key, wav, label, sample_rate}] - max_length: drop utterance which is greater than max_length(10ms) - min_length: drop utterance which is less than min_length(10ms) - token_max_length: drop utterance which is greater than - token_max_length, especially when use char unit for - english modeling - token_min_length: drop utterance which is - less than token_max_length - min_output_input_ratio: minimal ration of - token_length / feats_length(10ms) - max_output_input_ratio: maximum ration of - token_length / feats_length(10ms) - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'label' in sample - # sample['wav'] is torch.Tensor, we have 100 frames every second - num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100 - if num_frames < min_length: - continue - if num_frames > max_length: - continue - if len(sample['label']) < token_min_length: - continue - if len(sample['label']) > token_max_length: - continue - if num_frames != 0: - if len(sample['label']) / num_frames < min_output_input_ratio: - continue - if len(sample['label']) / num_frames > max_output_input_ratio: - continue - yield sample - - -def resample(data, resample_rate=16000): - """ Resample data. - Inplace operation. - - Args: - data: Iterable[{key, wav, label, sample_rate}] - resample_rate: target resample rate - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - if sample_rate != resample_rate: - sample['sample_rate'] = resample_rate - sample['wav'] = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - yield sample - - -def speed_perturb(data, speeds=None): - """ Apply speed perturb to the data. - Inplace operation. 
- - Args: - data: Iterable[{key, wav, label, sample_rate}] - speeds(List[float]): optional speed - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - if speeds is None: - speeds = [0.9, 1.0, 1.1] - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - speed = random.choice(speeds) - if speed != 1.0: - wav, _ = torchaudio.sox_effects.apply_effects_tensor( - waveform, sample_rate, - [['speed', str(speed)], ['rate', str(sample_rate)]]) - sample['wav'] = wav - - yield sample - - -def compute_fbank(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0): - """ Extract fbank - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - energy_floor=0.0, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def compute_mfcc(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0, - num_ceps=40, - high_freq=0.0, - low_freq=20.0): - """ Extract mfcc - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.mfcc(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - num_ceps=num_ceps, - high_freq=high_freq, - low_freq=low_freq, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def __tokenize_by_bpe_model(sp, txt): - tokens = [] - # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - pattern = re.compile(r'([\u4e00-\u9fff])') - # Example: - # txt = "你好 ITS'S OKAY 的" - # chars = ["你", "好", " ITS'S OKAY ", "的"] - chars = pattern.split(txt.upper()) - mix_chars = [w for w in chars if len(w.strip()) > 0] - for ch_or_w in mix_chars: - # ch_or_w is a single CJK charater(i.e., "你"), do nothing. - if pattern.fullmatch(ch_or_w) is not None: - tokens.append(ch_or_w) - # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), - # encode ch_or_w using bpe_model. 
- else: - for p in sp.encode_as_pieces(ch_or_w): - tokens.append(p) - - return tokens - - -def tokenize(data, - symbol_table, - bpe_model=None, - non_lang_syms=None, - split_with_space=False): - """ Decode text to chars or BPE - Inplace operation - - Args: - data: Iterable[{key, wav, txt, sample_rate}] - - Returns: - Iterable[{key, wav, txt, tokens, label, sample_rate}] - """ - if non_lang_syms is not None: - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - else: - non_lang_syms = {} - non_lang_syms_pattern = None - - if bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(bpe_model) - else: - sp = None - - for sample in data: - assert 'txt' in sample - txt = sample['txt'].strip() - if non_lang_syms_pattern is not None: - parts = non_lang_syms_pattern.split(txt.upper()) - parts = [w for w in parts if len(w.strip()) > 0] - else: - parts = [txt] - - label = [] - tokens = [] - for part in parts: - if part in non_lang_syms: - tokens.append(part) - else: - if bpe_model is not None: - tokens.extend(__tokenize_by_bpe_model(sp, part)) - else: - if split_with_space: - part = part.split(" ") - for ch in part: - if ch == ' ': - ch = "▁" - tokens.append(ch) - - for ch in tokens: - if ch in symbol_table: - label.append(symbol_table[ch]) - elif '' in symbol_table: - label.append(symbol_table['']) - - sample['tokens'] = tokens - sample['label'] = label - yield sample - - -def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80): - """ Do spec augmentation - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - num_t_mask: number of time mask to apply - num_f_mask: number of freq mask to apply - max_t: max width of time mask - max_f: max width of freq mask - max_w: max width of time warp - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - max_freq = y.size(1) - # time mask - for i in range(num_t_mask): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - y[start:end, :] = 0 - # freq mask - for i in range(num_f_mask): - start = random.randint(0, max_freq - 1) - length = random.randint(1, max_f) - end = min(max_freq, start + length) - y[:, start:end] = 0 - sample['feat'] = y - yield sample - - -def spec_sub(data, max_t=20, num_t_sub=3): - """ Do spec substitute - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of time substitute - num_t_sub: number of time substitute to apply - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - for i in range(num_t_sub): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - # only substitute the earlier time chosen randomly for current time - pos = random.randint(0, start) - y[start:end, :] = x[start - pos:end - pos, :] - sample['feat'] = y - yield sample - - -def spec_trim(data, max_t=20): - """ Trim tailing frames. Inplace operation. 
- ref: TrimTail [https://arxiv.org/abs/2211.00522] - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of length trimming - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - max_frames = x.size(0) - length = random.randint(1, max_t) - if length < max_frames / 2: - y = x.clone().detach()[:max_frames - length] - sample['feat'] = y - yield sample - - -def shuffle(data, shuffle_size=10000): - """ Local shuffle the data - - Args: - data: Iterable[{key, feat, label}] - shuffle_size: buffer size for shuffle - - Returns: - Iterable[{key, feat, label}] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= shuffle_size: - random.shuffle(buf) - for x in buf: - yield x - buf = [] - # The sample left over - random.shuffle(buf) - for x in buf: - yield x - - -def sort(data, sort_size=500): - """ Sort the data by feature length. - Sort is used after shuffle and before batch, so we can group - utts with similar lengths into a batch, and `sort_size` should - be less than `shuffle_size` - - Args: - data: Iterable[{key, feat, label}] - sort_size: buffer size for sort - - Returns: - Iterable[{key, feat, label}] - """ - - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= sort_size: - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - buf = [] - # The sample left over - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - - -def static_batch(data, batch_size=16): - """ Static batch the data by `batch_size` - - Args: - data: Iterable[{key, feat, label}] - batch_size: batch size - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= batch_size: - yield buf - buf = [] - if len(buf) > 0: - yield buf - - -def dynamic_batch(data, max_frames_in_batch=12000): - """ Dynamic batch the data until the total frames in batch - reach `max_frames_in_batch` - - Args: - data: Iterable[{key, feat, label}] - max_frames_in_batch: max_frames in one batch - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - longest_frames = 0 - for sample in data: - assert 'feat' in sample - assert isinstance(sample['feat'], torch.Tensor) - new_sample_frames = sample['feat'].size(0) - longest_frames = max(longest_frames, new_sample_frames) - frames_after_padding = longest_frames * (len(buf) + 1) - if frames_after_padding > max_frames_in_batch: - yield buf - buf = [sample] - longest_frames = new_sample_frames - else: - buf.append(sample) - if len(buf) > 0: - yield buf - - -def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000): - """ Wrapper for static/dynamic batch - """ - if batch_type == 'static': - return static_batch(data, batch_size) - elif batch_type == 'dynamic': - return dynamic_batch(data, max_frames_in_batch) - else: - logging.fatal('Unsupported batch type {}'.format(batch_type)) - - -def padding(data): - """ Padding the data into training data - - Args: - data: Iterable[List[{key, feat, label}]] - - Returns: - Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] - """ - for sample in data: - assert isinstance(sample, list) - feats_length = torch.tensor([x['feat'].size(0) for x in sample], - dtype=torch.int32) - order = torch.argsort(feats_length, descending=True) - feats_lengths = torch.tensor( - [sample[i]['feat'].size(0) for i in order], dtype=torch.int32) - sorted_feats = [sample[i]['feat'] for i in order] - sorted_keys 
= [sample[i]['key'] for i in order] - sorted_labels = [ - torch.tensor(sample[i]['label'], dtype=torch.int64) for i in order - ] - label_lengths = torch.tensor([x.size(0) for x in sorted_labels], - dtype=torch.int32) - - padded_feats = pad_sequence(sorted_feats, - batch_first=True, - padding_value=0) - - pad = (0, 0, 0, 0) - seq_len= padded_feats.shape[1] - if seq_len < 384: - pad = (0, 0, 0, 384-seq_len) - elif seq_len < 512: - pad = (0, 0, 0, 512-seq_len) - elif seq_len < 640: - pad = (0, 0, 0, 640-seq_len) - elif seq_len < 768: - pad = (0, 0, 0, 768-seq_len) - elif seq_len < 896: - pad = (0, 0, 0, 896-seq_len) - elif seq_len < 1024: - pad = (0, 0, 0, 1024-seq_len) - elif seq_len < 1280: - pad = (0, 0, 0, 1280-seq_len) - padded_feats = torch.nn.functional.pad(padded_feats, pad, mode='constant', value=0) - padding_labels = pad_sequence(sorted_labels, - batch_first=True, - padding_value=-1) - - yield (sorted_keys, padded_feats, padding_labels, feats_lengths, - label_lengths) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/dataset/wav_distortion.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/dataset/wav_distortion.py deleted file mode 100644 index 2917d3cc6cfb801935cb0885d0c42cd88f1833b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/dataset/wav_distortion.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import random -import math - -import torchaudio -import torch -torchaudio.set_audio_backend("sox_io") - - -def db2amp(db): - return pow(10, db / 20) - -def amp2db(amp): - return 20 * math.log10(amp) - -def make_poly_distortion(conf): - """Generate a db-domain ploynomial distortion function - - f(x) = a * x^m * (1-x)^n + x - - Args: - conf: a dict {'a': #int, 'm': #int, 'n': #int} - - Returns: - The ploynomial function, which could be applied on - a float amplitude value - """ - a = conf['a'] - m = conf['m'] - n = conf['n'] - - def poly_distortion(x): - abs_x = abs(x) - if abs_x < 0.000001: - x = x - else: - db_norm = amp2db(abs_x) / 100 + 1 - if db_norm < 0: - db_norm = 0 - db_norm = a * pow(db_norm, m) * pow((1 - db_norm), n) + db_norm - if db_norm > 1: - db_norm = 1 - db = (db_norm - 1) * 100 - amp = db2amp(db) - if amp >= 0.9997: - amp = 0.9997 - if x > 0: - x = amp - else: - x = -amp - return x - return poly_distortion - -def make_quad_distortion(): - return make_poly_distortion({'a' : 1, 'm' : 1, 'n' : 1}) - -# the amplitude are set to max for all non-zero point -def make_max_distortion(conf): - """Generate a max distortion function - - Args: - conf: a dict {'max_db': float } - 'max_db': the maxium value. 
- - Returns: - The max function, which could be applied on - a float amplitude value - """ - max_db = conf['max_db'] - if max_db: - max_amp = db2amp(max_db) # < 0.997 - else: - max_amp = 0.997 - - def max_distortion(x): - if x > 0: - x = max_amp - elif x < 0: - x = -max_amp - else: - x = 0.0 - return x - return max_distortion - - - -def make_amp_mask(db_mask=None): - """Get a amplitude domain mask from db domain mask - - Args: - db_mask: Optional. A list of tuple. if None, using default value. - - Returns: - A list of tuple. The amplitude domain mask - """ - if db_mask is None: - db_mask = [(-110, -95), (-90, -80), (-65, -60), (-50, -30), (-15, 0)] - amp_mask = [(db2amp(db[0]), db2amp(db[1])) for db in db_mask] - return amp_mask - -default_mask = make_amp_mask() - - -def generate_amp_mask(mask_num): - """Generate amplitude domain mask randomly in [-100db, 0db] - - Args: - mask_num: the slot number of the mask - - Returns: - A list of tuple. each tuple defines a slot. - e.g. [(-100, -80), (-65, -60), (-50, -30), (-15, 0)] - for #mask_num = 4 - """ - a = [0] * 2 * mask_num - a[0] = 0 - m = [] - for i in range(1, 2 * mask_num): - a[i] = a[i - 1] + random.uniform(0.5, 1) - max_val = a[2 * mask_num - 1] - for i in range(0, mask_num): - l = ((a[2 * i] - max_val) / max_val) * 100 - r = ((a[2 * i + 1] - max_val) / max_val) * 100 - m.append((l, r)) - return make_amp_mask(m) - - -def make_fence_distortion(conf): - """Generate a fence distortion function - - In this fence-like shape function, the values in mask slots are - set to maxium, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': int,'max_db': float } - 'mask_number': the slot number in mask. - 'max_db': the maxium value. - - Returns: - The fence function, which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - max_db = conf['max_db'] - max_amp = db2amp(max_db) # 0.997 - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def fence_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - return x - - return fence_distortion - -# -def make_jag_distortion(conf): - """Generate a jag distortion function - - In this jag-like shape function, the values in mask slots are - not changed, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': #int} - 'mask_number': the slot number in mask. 
- - Returns: - The jag function,which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def jag_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - return x - - return jag_distortion - -# gaining 20db means amp = amp * 10 -# gaining -20db means amp = amp / 10 -def make_gain_db(conf): - """Generate a db domain gain function - - Args: - conf: a dict {'db': #float} - 'db': the gaining value - - Returns: - The db gain function, which could be applied on - a float amplitude value - """ - db = conf['db'] - - def gain_db(x): - return min(0.997, x * pow(10, db / 20)) - - return gain_db - - -def distort(x, func, rate=0.8): - """Distort a waveform in sample point level - - Args: - x: the origin wavefrom - func: the distort function - rate: sample point-level distort probability - - Returns: - the distorted waveform - """ - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - x[0][i] = func(float(x[0][i])) - return x - -def distort_chain(x, funcs, rate=0.8): - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - for func in funcs: - x[0][i] = func(float(x[0][i])) - return x - -# x is numpy -def distort_wav_conf(x, distort_type, distort_conf, rate=0.1): - if distort_type == 'gain_db': - gain_db = make_gain_db(distort_conf) - x = distort(x, gain_db) - elif distort_type == 'max_distortion': - max_distortion = make_max_distortion(distort_conf) - x = distort(x, max_distortion, rate=rate) - elif distort_type == 'fence_distortion': - fence_distortion = make_fence_distortion(distort_conf) - x = distort(x, fence_distortion, rate=rate) - elif distort_type == 'jag_distortion': - jag_distortion = make_jag_distortion(distort_conf) - x = distort(x, jag_distortion, rate=rate) - elif distort_type == 'poly_distortion': - poly_distortion = make_poly_distortion(distort_conf) - x = distort(x, poly_distortion, rate=rate) - elif distort_type == 'quad_distortion': - quad_distortion = make_quad_distortion() - x = distort(x, quad_distortion, rate=rate) - elif distort_type == 'none_distortion': - pass - else: - print('unsupport type') - return x - -def distort_wav_conf_and_save(distort_type, distort_conf, rate, wav_in, wav_out): - x, sr = torchaudio.load(wav_in) - x = x.detach().numpy() - out = distort_wav_conf(x, distort_type, distort_conf, rate) - torchaudio.save(wav_out, torch.from_numpy(out), sr) - -if __name__ == "__main__": - distort_type = sys.argv[1] - wav_in = sys.argv[2] - wav_out = sys.argv[3] - conf = None - rate = 0.1 - if distort_type == 'new_jag_distortion': - conf = {'mask_number' : 4} - elif distort_type == 'new_fence_distortion': - conf = {'mask_number' : 1, 'max_db' : -30} - elif distort_type == 'poly_distortion': - conf = {'a' : 4, 'm' : 2, "n" : 2} - distort_wav_conf_and_save(distort_type, conf, rate, wav_in, wav_out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/efficient_conformer/attention.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/efficient_conformer/attention.py deleted file mode 100644 index 475131b15af92ffcaf91ad5e2e30d114d4d1a2a3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/efficient_conformer/attention.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple, Optional - -import torch -from torch import nn -import torch.nn.functional as F -from wenet.transformer.attention import MultiHeadedAttention - - -class GroupedRelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: - https://arxiv.org/abs/1901.02860 - https://arxiv.org/abs/2109.01163 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate, group_size=3): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - self.group_size = group_size - self.d_k = n_feat // n_head # for GroupedAttention - self.n_feat = n_feat - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. 
- """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def pad4group(self, Q, K, V, P, mask, group_size: int = 3): - """ - q: (#batch, time1, size) -> (#batch, head, time1, size/head) - k,v: (#batch, time2, size) -> (#batch, head, time2, size/head) - p: (#batch, time2, size) - """ - # Compute Overflows - overflow_Q = Q.size(2) % group_size - overflow_KV = K.size(2) % group_size - - # if-else for ONNX export - # 0 // 0.00000000000000001 = 0 - # 1 // 1.00000000000000001 = 1 - padding_Q = (group_size - overflow_Q) * int( - overflow_Q // (overflow_Q + 0.00000000000000001)) - padding_KV = (group_size - overflow_KV) * int( - overflow_KV // (overflow_KV + 0.00000000000000001)) - - batch_size, _, seq_len_KV, _ = K.size() - - # Input Padding (B, T, D) -> (B, T + P, D) - Q = F.pad(Q, (0, 0, 0, padding_Q), value=0.0) - K = F.pad(K, (0, 0, 0, padding_KV), value=0.0) - V = F.pad(V, (0, 0, 0, padding_KV), value=0.0) - - if mask is not None and mask.size(2) > 0 : # time2 > 0: - mask = mask[:, ::group_size, ::group_size] - - Q = Q.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - K = K.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - V = V.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - # process pos_emb - P_batch_size = P.size(0) - overflow_P = P.size(1) % group_size - padding_P = group_size - overflow_P if overflow_P else 0 - P = F.pad(P, (0, 0, 0, padding_P), value=0.0) - P = P.view(P_batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - return Q, K, V, P, mask, padding_Q - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - padding_q: Optional[int] = None - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - padding_q : for GroupedAttention in efficent conformer - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. 
jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - - # n_feat!=h*d_k may be happened in GroupAttention - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.n_feat) - ) # (batch, time1, d_model) - if padding_q is not None: - # for GroupedAttention in efficent conformer - x = x[:, :x.size(1) - padding_q] - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q = self.linear_q(query) - k = self.linear_k(key) # (#batch, time2, size) - v = self.linear_v(value) - p = self.linear_pos(pos_emb) # (#batch, time2, size) - - batch_size, seq_len_KV, _ = k.size() # seq_len_KV = time2 - - # (#batch, time2, size) -> (#batch, head, time2, size/head) - q = q.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - k = k.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - v = v.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - if cache.size(0) > 0: - # use attention cache - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - new_cache = torch.cat((k, v), dim=-1) - - # May be k and p does not match. eg. time2=18+18/2=27 > mask=36/2=18 - if mask is not None and mask.size(2) > 0: - time2 = mask.size(2) - k = k[:, :, -time2:, :] - v = v[:, :, -time2:, :] - - # q k v p: (batch, head, time1, d_k) - q, k, v, p, mask, padding_q = self.pad4group(q, k, v, p, mask, self.group_size) - - # q_with_bias_u & q_with_bias_v = (batch, head, time1, d_k) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. 
- # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k * self.group_size) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask, padding_q), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/efficient_conformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/efficient_conformer/convolution.py deleted file mode 100644 index 52d6c1c14c0812ab3957a60a135f644833c2ad95..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/efficient_conformer/convolution.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - stride: int = 1): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - stride (int): Stride Convolution, for efficient Conformer - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. 
- # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=stride, # for depthwise_conv in StrideConv - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - self.stride = stride - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - # When export ONNX,the first cache is not None but all-zero, - # cause shape error in residual block, - # eg. cache14 + x9 = 23, 23-7+1=17 != 9 - cache = cache[:, :, -self.lorder:] - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is requried, - # However, for JIT export, here we just fake one tensor instead of - # None. 
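For the causal branch above, the left context of `lorder = kernel_size - 1` frames comes either from zero left-padding (first chunk) or from the cached frames of the previous chunk, and the last `lorder` frames of the padded input become the next cache. A small standalone sketch of just that bookkeeping (the function name and toy shapes are mine, for illustration only):

```python
import torch
import torch.nn.functional as F

def causal_left_context(x: torch.Tensor, cache: torch.Tensor, lorder: int):
    """x: (batch, channels, time); cache: (batch, channels, cache_t), possibly zero-length.
    Returns the input with left context attached and the cache for the next chunk."""
    if cache.size(2) == 0:
        # First chunk: fake the left context with zeros.
        x = F.pad(x, (lorder, 0), 'constant', 0.0)
    else:
        # Later chunks: reuse the last `lorder` cached frames as real left context.
        x = torch.cat((cache[:, :, -lorder:], x), dim=2)
    new_cache = x[:, :, -lorder:]
    return x, new_cache

# Toy usage with kernel_size=15 -> lorder=14
x = torch.randn(1, 256, 16)
empty_cache = torch.zeros(1, 256, 0)
padded, cache = causal_left_context(x, empty_cache, lorder=14)
print(padded.shape, cache.shape)  # torch.Size([1, 256, 30]) torch.Size([1, 256, 14])
```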
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - if mask_pad.size(2) != x.size(2): - mask_pad = mask_pad[:, :, ::self.stride] - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/efficient_conformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/efficient_conformer/encoder.py deleted file mode 100644 index dbd37f53cac86be851e2bb194354fd07eb271f11..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/efficient_conformer/encoder.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from EfficientConformer(https://github.com/burchim/EfficientConformer) -# Paper(https://arxiv.org/abs/2109.01163) - -"""Encoder definition.""" -from typing import Tuple, Optional, List, Union - -import torch -import logging -from typeguard import check_argument_types -import torch.nn.functional as F - -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.encoder_layer import ConformerEncoderLayer - -from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 -from wenet.efficient_conformer.convolution import ConvolutionModule -from wenet.efficient_conformer.attention import GroupedRelPositionMultiHeadedAttention -from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer - -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class EfficientConformerEncoder(torch.nn.Module): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - macaron_style: bool = True, - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - stride_layer_idx: Optional[Union[int, List[int]]] = 3, - stride: Optional[Union[int, List[int]]] = 2, - group_layer_idx: Optional[Union[int, List[int], tuple]] = (0, 1, 2, 3), - group_size: int = 3, - stride_kernel: bool = True, - **kwargs - ): - """Construct Efficient Conformer Encoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - macaron_style (bool): Whether to use macaron style for - positionwise layer. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. - stride_layer_idx (list): layer id with StrideConv, start from 0 - stride (list): stride size of each StrideConv in efficient conformer - group_layer_idx (list): layer id with GroupedAttention, start from 0 - group_size (int): group size of every GroupedAttention layer - stride_kernel (bool): default True. True: recompute cnn kernels with stride. 
- """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d2": - subsampling_class = Conv2dSubsampling2 - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - logging.info(f"input_layer = {input_layer}, " - f"subsampling_class = {subsampling_class}") - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - self.input_layer = input_layer - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - activation = get_activation(activation_type) - self.num_blocks = num_blocks - self.attention_heads = attention_heads - self.cnn_module_kernel = cnn_module_kernel - self.global_chunk_size = 0 - self.chunk_feature_map = 0 - - # efficient conformer configs - self.stride_layer_idx = [stride_layer_idx] \ - if type(stride_layer_idx) == int else stride_layer_idx - self.stride = [stride] \ - if type(stride) == int else stride - self.group_layer_idx = [group_layer_idx] \ - if type(group_layer_idx) == int else group_layer_idx - self.grouped_size = group_size # group size of every GroupedAttention layer - - assert len(self.stride) == len(self.stride_layer_idx) - self.cnn_module_kernels = [cnn_module_kernel] # kernel size of each StridedConv - for i in self.stride: - if stride_kernel: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1] // i) - else: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1]) - - logging.info(f"stride_layer_idx= {self.stride_layer_idx}, " - f"stride = {self.stride}, " - f"cnn_module_kernel = {self.cnn_module_kernels}, " - f"group_layer_idx = {self.group_layer_idx}, " - f"grouped_size = {self.grouped_size}") - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - - # encoder definition - index = 0 - layers = [] - for i in range(num_blocks): - # self-attention module definition - if i in self.group_layer_idx: - encoder_selfattn_layer = GroupedRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - self.grouped_size) - else: - if pos_enc_layer_type == "no_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate) - - # conformer module definition - if i in self.stride_layer_idx: - # conformer block with downsampling - convolution_layer_args_stride = ( - output_size, 
self.cnn_module_kernels[index], activation, - cnn_module_norm, causal, True, self.stride[index]) - layers.append(StrideConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_stride) if use_cnn_module else None, - torch.nn.AvgPool1d( - kernel_size=self.stride[index], stride=self.stride[index], - padding=0, ceil_mode=True, - count_include_pad=False), # pointwise_conv_layer - dropout_rate, - normalize_before, - concat_after, - )) - index = index + 1 - else: - # conformer block - convolution_layer_args_normal = ( - output_size, self.cnn_module_kernels[index], activation, - cnn_module_norm, causal) - layers.append(ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_normal) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - )) - - self.encoders = torch.nn.ModuleList(layers) - - def set_global_chunk_size(self, chunk_size): - """Used in ONNX export. - """ - logging.info(f"set global chunk size: {chunk_size}, default is 0.") - self.global_chunk_size = chunk_size - if self.embed.subsampling_rate == 2: - self.chunk_feature_map = 2 * self.global_chunk_size + 1 - elif self.embed.subsampling_rate == 6: - self.chunk_feature_map = 6 * self.global_chunk_size + 5 - elif self.embed.subsampling_rate == 8: - self.chunk_feature_map = 8 * self.global_chunk_size + 7 - else: - self.chunk_feature_map = 4 * self.global_chunk_size + 3 - - def output_size(self) -> int: - return self._output_size - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - index = 0 # traverse stride - for i, layer in enumerate(self.encoders): - # layer return : x, mask, new_att_cache, new_cnn_cache - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if i in self.stride_layer_idx: - masks = masks[:, :, ::self.stride[index]] - chunk_masks = chunk_masks[:, ::self.stride[index], - ::self.stride[index]] - mask_pad = masks - pos_emb = pos_emb[:, ::self.stride[index], :] - index = index + 1 - - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. 
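`calculate_downsampling_factor` above simply multiplies the strides of every StrideConv layer sitting before layer `i`, and `forward()` thins the masks and positional embeddings with the same strides. A brief re-statement as a free function, using the constructor defaults (`stride_layer_idx=3`, `stride=2`) as example values:

```python
from typing import List

import torch

def downsampling_factor(i: int, stride_layer_idx: List[int], stride: List[int]) -> int:
    """Cumulative time-axis downsampling seen by encoder layer i."""
    factor = 1
    for idx, stride_idx in enumerate(stride_layer_idx):
        if i > stride_idx:
            factor *= stride[idx]
    return factor

# One StrideConv at layer index 3 with stride 2:
print([downsampling_factor(i, [3], [2]) for i in range(6)])   # [1, 1, 1, 1, 2, 2]

# The forward pass applies the same stride to the padding mask (and pos_emb):
masks = torch.ones(1, 1, 96, dtype=torch.bool)
print(masks[:, :, ::2].shape)                                  # torch.Size([1, 1, 48])
```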
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask : mask matrix of self attention - - Returns: - torch.Tensor: output of current input xs - torch.Tensor: subsampling cache required for next chunk computation - List[torch.Tensor]: encoder layers output cache required for next - chunk computation - List[torch.Tensor]: conformer cnn cache - - """ - assert xs.size(0) == 1 - - # using downsampling factor to recover offset - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - chunk_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - chunk_masks = chunk_masks.unsqueeze(1) # (1, 1, xs-time) - - real_len = 0 - if self.global_chunk_size > 0: - # for ONNX decode simulation, padding xs to chunk_size - real_len = xs.size(1) - pad_len = self.chunk_feature_map - real_len - xs = F.pad(xs, (0, 0, 0, pad_len), value=0.0) - chunk_masks = F.pad(chunk_masks, (0, pad_len), value=0.0) - - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, chunk_masks = self.embed(xs, chunk_masks, offset) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - # shape(pos_emb) = (b=1, chunk_size, emb_size=output_size=hidden-dim) - - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) # batchPad (b=1, 1, time=chunk_size) - - if self.global_chunk_size > 0: - # for ONNX decode simulation - pos_emb = self.embed.position_encoding( - offset=max(offset - cache_t1, 0), - size=cache_t1 + self.global_chunk_size) - att_mask[:, :, -self.global_chunk_size:] = chunk_masks - mask_pad = chunk_masks.to(torch.bool) - else: - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(2) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - att_cache_trunc = xs.size(1) + \ - att_cache.size(2) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i:i + 1, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # 
shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [1, batch, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(0) - - # use repeat_interleave to new_att_cache - new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :]) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.global_chunk_size > 0 and real_len: - chunk_real_len = real_len // self.embed.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - # Keeping 1 more timestep can mitigate information leakage - # from the encoder caused by the padding - xs = xs[:, :chunk_real_len + 1, :] - - return xs, r_att_cache, r_cnn_cache - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - use_onnx=False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. - Args: - xs (torch.Tensor): (1, max_len, dim) - decoding_chunk_size (int): decoding chunk size - num_decoding_left_chunks (int): - use_onnx (bool): True for simulating ONNX model inference. 
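The chunk-by-chunk simulation below slides a window of `(decoding_chunk_size - 1) * subsampling + context` input frames forward by `subsampling * decoding_chunk_size` frames per step, so every chunk carries the right context the subsampling front end needs. A hedged sketch of just that window arithmetic (subsampling rate 4 and right context 6 are typical Conv2dSubsampling4 values, assumed here for illustration):

```python
def chunk_windows(num_frames: int, decoding_chunk_size: int,
                  subsampling: int = 4, right_context: int = 6):
    """Yield the (start, end) input-frame ranges fed to forward_chunk, one per chunk."""
    context = right_context + 1                      # add the current frame
    stride = subsampling * decoding_chunk_size       # frames consumed per chunk
    decoding_window = (decoding_chunk_size - 1) * subsampling + context
    for cur in range(0, num_frames - context + 1, stride):
        yield cur, min(cur + decoding_window, num_frames)

# e.g. 16-frame decoding chunks over a 200-frame utterance
print(list(chunk_windows(200, 16)))
# [(0, 67), (64, 131), (128, 195), (192, 200)]
```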
- """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if use_onnx: - logging.info("Simulating for ONNX runtime ...") - att_cache: torch.Tensor = torch.zeros( - (self.num_blocks, self.attention_heads, required_cache_size, - self.output_size() // self.attention_heads * 2), - device=xs.device) - cnn_cache: torch.Tensor = torch.zeros( - (self.num_blocks, 1, self.output_size(), self.cnn_module_kernel - 1), - device=xs.device) - self.set_global_chunk_size(chunk_size=decoding_chunk_size) - else: - logging.info("Simulating for JIT runtime ...") - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - logging.info(f"-->> frame chunk msg: cur={cur}, " - f"end={end}, num_frames={end-cur}, " - f"decoding_window={decoding_window}") - if use_onnx: - att_mask: torch.Tensor = torch.ones( - (1, 1, required_cache_size + decoding_chunk_size), - dtype=torch.bool, device=xs.device) - if cur == 0: - att_mask[:, :, :required_cache_size] = 0 - else: - att_mask: torch.Tensor = torch.ones( - (0, 0, 0), dtype=torch.bool, device=xs.device) - - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - outputs.append(y) - offset += y.size(1) - - ys = torch.cat(outputs, 1) - masks = torch.ones(1, 1, ys.size(1), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/efficient_conformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/efficient_conformer/encoder_layer.py deleted file mode 100644 index 3a88ec9fca9797664ce89566e6c1d28a8f0ad5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/efficient_conformer/encoder_layer.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple -import torch -from torch import nn - - -class StrideConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. 
- self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - pointwise_conv_layer: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.pointwise_conv_layer = pointwise_conv_layer - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
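Condensing the docstring above: the layer runs a half-weighted macaron feed-forward, self-attention, the (possibly strided) convolution module, a second half-weighted feed-forward, and a closing LayerNorm, each with a pre-norm residual. A toy sketch of that ordering only (Linear layers stand in for the real attention/convolution sub-modules, and the pooled residual used when the conv module strides is omitted):

```python
import torch
from torch import nn

class MacaronBlockSketch(nn.Module):
    """Illustrative residual ordering of the conformer-style layer; not the real sub-modules."""
    def __init__(self, size: int = 256):
        super().__init__()
        self.ffn_macaron = nn.Linear(size, size)
        self.attn = nn.Linear(size, size)    # stand-in for multi-head self-attention
        self.conv = nn.Linear(size, size)    # stand-in for the convolution module
        self.ffn = nn.Linear(size, size)
        self.norms = nn.ModuleList(nn.LayerNorm(size) for _ in range(5))
        self.ff_scale = 0.5                  # macaron half-step weighting

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.ff_scale * self.ffn_macaron(self.norms[0](x))
        x = x + self.attn(self.norms[1](x))
        x = x + self.conv(self.norms[2](x))
        x = x + self.ff_scale * self.ffn(self.norms[3](x))
        return self.norms[4](x)              # final LayerNorm of the block

print(MacaronBlockSketch()(torch.randn(2, 10, 256)).shape)  # torch.Size([2, 10, 256])
```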
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - - # add pointwise_conv for efficient conformer - # pointwise_conv_layer does not change shape - if self.pointwise_conv_layer is not None: - residual = residual.transpose(1, 2) - residual = self.pointwise_conv_layer(residual) - residual = residual.transpose(1, 2) - assert residual.size(0) == x.size(0) - assert residual.size(1) == x.size(1) - assert residual.size(2) == x.size(2) - - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/efficient_conformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/efficient_conformer/subsampling.py deleted file mode 100644 index 98b2c2228eac8e77586110686c48a7b0141458c9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/efficient_conformer/subsampling.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch -from wenet.transformer.subsampling import BaseSubsampling - - -class Conv2dSubsampling2(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU() - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * ((idim - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 2 - # 2 = (3 - 1) * 1 - self.right_context = 2 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 2. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 2. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, :-2:2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/squeezeformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/squeezeformer/attention.py deleted file mode 100644 index 97412badbe8e2c5caec81c0636d15be3f80d6b84..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/squeezeformer/attention.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 Ximalaya Inc. (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -import torch -import torch.nn as nn -from wenet.transformer.attention import MultiHeadedAttention -from typing import Tuple - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- """ - - def __init__(self, n_head, n_feat, dropout_rate, - do_rel_shift=False, adaptive_scale=False, init_weights=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.do_rel_shift = do_rel_shift - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - self.adaptive_scale = adaptive_scale - self.ada_scale = nn.Parameter( - torch.ones([1, 1, n_feat]), requires_grad=adaptive_scale) - self.ada_bias = nn.Parameter( - torch.zeros([1, 1, n_feat]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - input_max = (self.h * self.d_k) ** -0.5 - torch.nn.init.uniform_(self.linear_q.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_q.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_pos.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.bias, -input_max, input_max) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. 
pytorch training - if mask.size(2) > 0: # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - # (batch, head, time1, time2) - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - if self.adaptive_scale: - query = self.ada_scale * query + self.ada_bias - key = self.ada_scale * key + self.ada_bias - value = self.ada_scale * value + self.ada_bias - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
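The cache described in the comment above packs keys and values into a single tensor along the last dimension; every step splits it in half, prepends the halves to the fresh k/v, and re-concatenates the result for the next step. Because concatenating a zero-length cache is a no-op, the same split-and-concat path also works for the very first ONNX chunk. A tiny demonstration of that bookkeeping:

```python
import torch

head, d_k = 4, 8
k_new = torch.randn(1, head, 5, d_k)      # keys for the current chunk
v_new = torch.randn(1, head, 5, d_k)      # values for the current chunk

# First chunk: a zero-length cache, so splitting/concatenating it changes nothing.
cache = torch.zeros(1, head, 0, 2 * d_k)

for step in range(3):
    key_cache, value_cache = torch.split(cache, cache.size(-1) // 2, dim=-1)
    k = torch.cat([key_cache, k_new], dim=2)
    v = torch.cat([value_cache, v_new], dim=2)
    cache = torch.cat((k, v), dim=-1)      # new_cache, packed as (K | V) on the last dim
    print(step, cache.shape)
# 0 torch.Size([1, 4, 5, 16])
# 1 torch.Size([1, 4, 10, 16])
# 2 torch.Size([1, 4, 15, 16])
```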
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - if self.do_rel_shift: - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/squeezeformer/conv2d.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/squeezeformer/conv2d.py deleted file mode 100644 index c230263396392d72f36c56d645338f2d576db898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/squeezeformer/conv2d.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Conv2d Module with Valid Padding""" - -import torch.nn.functional as F -from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional - - -class Conv2dValid(_ConvNd): - """ - Conv2d operator for VALID mode padding. 
- """ - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', # TODO: refine this type - device=None, - dtype=None, - valid_trigx: bool = False, - valid_trigy: bool = False - ) -> None: - factory_kwargs = {'device': device, 'dtype': dtype} - kernel_size_ = _pair(kernel_size) - stride_ = _pair(stride) - padding_ = padding if isinstance(padding, str) else _pair(padding) - dilation_ = _pair(dilation) - super(Conv2dValid, self).__init__( - in_channels, out_channels, kernel_size_, - stride_, padding_, dilation_, False, _pair(0), - groups, bias, padding_mode, **factory_kwargs) - self.valid_trigx = valid_trigx - self.valid_trigy = valid_trigy - - def _conv_forward( - self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): - validx, validy = 0, 0 - if self.valid_trigx: - validx = (input.size(-2) * (self.stride[-2] - 1) - 1 - + self.kernel_size[-2]) // 2 - if self.valid_trigy: - validy = (input.size(-1) * (self.stride[-1] - 1) - 1 - + self.kernel_size[-1]) // 2 - return F.conv2d(input, weight, bias, self.stride, - (validx, validy), self.dilation, self.groups) - - def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight, self.bias) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/squeezeformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/squeezeformer/convolution.py deleted file mode 100644 index 6da2ee8c98ed58fae66d66c892041037f0d6bc3a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/squeezeformer/convolution.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. 
- causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - self.bias = bias - self.channels = channels - self.kernel_size = kernel_size - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, channels]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, channels]), requires_grad=adaptive_scale) - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - if init_weights: - self.init_weights() - - def init_weights(self): - pw_max = self.channels ** -0.5 - dw_max = self.kernel_size ** -0.5 - torch.nn.init.uniform_(self.pointwise_conv1.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv1.bias.data, -pw_max, pw_max) - torch.nn.init.uniform_(self.depthwise_conv.weight.data, -dw_max, dw_max) - if self.bias: - torch.nn.init.uniform_(self.depthwise_conv.bias.data, -dw_max, dw_max) - torch.nn.init.uniform_(self.pointwise_conv2.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv2.bias.data, -pw_max, pw_max) - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - if self.adaptive_scale: - x = self.ada_scale * x + self.ada_bias - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. 
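The Squeezeformer variant first rescales its input with the learnable `ada_scale`/`ada_bias` pair and then runs the usual conformer conv pipeline, which the code just below completes: pointwise conv to 2x channels, GLU gating back down, depthwise conv, norm plus activation, and a final pointwise conv. A minimal functional sketch of that data path with random weights (no masking, cache handling, or weight init):

```python
import torch
from torch import nn
import torch.nn.functional as F

channels, kernel_size = 256, 31
ada_scale = torch.ones(1, 1, channels)        # learnable parameters in the real module
ada_bias = torch.zeros(1, 1, channels)
pointwise_conv1 = nn.Conv1d(channels, 2 * channels, kernel_size=1)
depthwise_conv = nn.Conv1d(channels, channels, kernel_size,
                           padding=(kernel_size - 1) // 2, groups=channels)
norm = nn.BatchNorm1d(channels)
pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1)

x = torch.randn(2, 50, channels)              # (batch, time, channels)
x = ada_scale * x + ada_bias                  # adaptive input scaling
x = x.transpose(1, 2)                         # (batch, channels, time) for Conv1d
x = F.glu(pointwise_conv1(x), dim=1)          # 2*channels -> channels via gating
x = F.silu(norm(depthwise_conv(x)))           # depthwise conv + norm + swish activation
x = pointwise_conv2(x)
print(x.transpose(1, 2).shape)                # torch.Size([2, 50, 256])
```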
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/squeezeformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/squeezeformer/encoder.py deleted file mode 100644 index f13038321ae6c07d484a617aee7d83ed07742510..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/squeezeformer/encoder.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -import torch -import torch.nn as nn -from typing import Tuple, Union, Optional, List -from wenet.squeezeformer.subsampling \ - import DepthwiseConv2dSubsampling4, TimeReductionLayer1D, \ - TimeReductionLayer2D, TimeReductionLayerStream -from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.attention import MultiHeadedAttention -from wenet.squeezeformer.attention import RelPositionMultiHeadedAttention -from wenet.squeezeformer.positionwise_feed_forward \ - import PositionwiseFeedForward -from wenet.squeezeformer.convolution import ConvolutionModule -from wenet.utils.mask import make_pad_mask, add_optional_chunk_mask -from wenet.utils.common import get_activation - - -class SqueezeformerEncoder(nn.Module): - def __init__( - self, - input_size: int = 80, - encoder_dim: int = 256, - output_size: int = 256, - attention_heads: int = 4, - num_blocks: int = 12, - reduce_idx: Optional[Union[int, List[int]]] = 5, - recover_idx: Optional[Union[int, List[int]]] = 11, - feed_forward_expansion_factor: int = 4, - dw_stride: bool = False, - input_dropout_rate: float = 0.1, - pos_enc_layer_type: str = "rel_pos", - time_reduction_layer_type: str = "conv1d", - do_rel_shift: bool = True, - feed_forward_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.1, - cnn_module_kernel: int = 31, - cnn_norm_type: str = "batch_norm", - dropout: float = 0.1, - causal: bool = False, - adaptive_scale: bool = True, - activation_type: str = "swish", - init_weights: bool = True, - global_cmvn: torch.nn.Module = None, - normalize_before: bool = False, - use_dynamic_chunk: bool = False, - concat_after: bool = False, - 
static_chunk_size: int = 0, - use_dynamic_left_chunk: bool = False - ): - """Construct SqueezeformerEncoder - - Args: - input_size to use_dynamic_chunk, see in Transformer BaseEncoder. - encoder_dim (int): The hidden dimension of encoder layer. - output_size (int): The output dimension of final projection layer. - attention_heads (int): Num of attention head in attention module. - num_blocks (int): Num of encoder layers. - reduce_idx Optional[Union[int, List[int]]]: - reduce layer index, from 40ms to 80ms per frame. - recover_idx Optional[Union[int, List[int]]]: - recover layer index, from 80ms to 40ms per frame. - feed_forward_expansion_factor (int): Enlarge coefficient of FFN. - dw_stride (bool): Whether do depthwise convolution - on subsampling module. - input_dropout_rate (float): Dropout rate of input projection layer. - pos_enc_layer_type (str): Self attention type. - time_reduction_layer_type (str): Conv1d or Conv2d reduction layer. - do_rel_shift (bool): Whether to do relative shift - operation on rel-attention module. - cnn_module_kernel (int): Kernel size of CNN module. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - adaptive_scale (bool): Whether to use adaptive scale. - init_weights (bool): Whether to initialize weights. - causal (bool): whether to use causal convolution or not. - """ - super(SqueezeformerEncoder, self).__init__() - self.global_cmvn = global_cmvn - self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ - if type(reduce_idx) == int else reduce_idx - self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \ - if type(recover_idx) == int else recover_idx - self.check_ascending_list() - if reduce_idx is None: - self.time_reduce = None - else: - if recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - self.reduce_stride = 2 - self._output_size = output_size - self.normalize_before = normalize_before - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - self.pos_enc_layer_type = pos_enc_layer_type - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - encoder_dim, - attention_dropout_rate, - do_rel_shift, - adaptive_scale, - init_weights - ) - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - encoder_dim, - encoder_dim * feed_forward_expansion_factor, - feed_forward_dropout_rate, - activation, - adaptive_scale, - init_weights - ) - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = ( - encoder_dim, cnn_module_kernel, activation, - cnn_norm_type, causal, True, adaptive_scale, init_weights) - - self.embed = DepthwiseConv2dSubsampling4( - 1, encoder_dim, - RelPositionalEncoding(encoder_dim, dropout_rate=0.1), - dw_stride, - input_size, - input_dropout_rate, - init_weights - ) - - self.preln = nn.LayerNorm(encoder_dim) - self.encoders = 
torch.nn.ModuleList([SqueezeformerEncoderLayer( - encoder_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - convolution_layer(*convolution_layer_args), - positionwise_layer(*positionwise_layer_args), - normalize_before, - dropout, - concat_after) for _ in range(num_blocks) - ]) - if time_reduction_layer_type == 'conv1d': - time_reduction_layer = TimeReductionLayer1D - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - elif time_reduction_layer_type == 'stream': - time_reduction_layer = TimeReductionLayerStream - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - else: - time_reduction_layer = TimeReductionLayer2D - time_reduction_layer_args = {'encoder_dim': encoder_dim} - - self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args) - self.time_recover_layer = nn.Linear(encoder_dim, encoder_dim) - self.final_proj = None - if output_size != encoder_dim: - self.final_proj = nn.Linear(encoder_dim, output_size) - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - xs_lens = mask_pad.squeeze(1).sum(1) - xs = self.preln(xs) - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - for i, layer in enumerate(self.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, chunk_masks, pos_emb, mask_pad)) - xs, xs_lens, chunk_masks, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_chunk_masks, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - chunk_masks = recover_chunk_masks - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - xs = xs.masked_fill(~mask_pad[:, 0, :].unsqueeze(-1), 0.0) - - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return xs, masks - - def check_ascending_list(self): - if self.reduce_idx is not None: - assert self.reduce_idx == sorted(self.reduce_idx), \ - "reduce_idx should be int or ascending list" - if self.recover_idx is not None: - assert self.recover_idx == sorted(self.recover_idx), \ - "recover_idx should be int or ascending list" - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp 
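The reduce/recover bookkeeping in the forward pass above comes down to a couple of tensor operations. A shape-level sketch follows; batch, length, and feature sizes are assumed, and the real reduction is the strided TimeReductionLayer, approximated here by plain slicing for clarity.

```python
import torch

# Illustrative shapes only: batch=2, 12 frames, dim=4.
xs = torch.randn(2, 12, 4)
pos_emb = torch.randn(1, 12, 4)

# Reduce: keep every second frame (the deleted code uses a strided depthwise conv).
xs_red = xs[:, ::2, :]              # (2, 6, 4), roughly 40 ms -> 80 ms per frame
pos_emb_red = pos_emb[:, ::2, :]    # positional encoding is thinned with the same stride

# Recover: repeat each frame twice, trim to the saved length, add the saved activation.
recovered = xs_red.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2)   # (2, 12, 4)
recovered = xs + recovered[:, :xs.size(1), :]

# Net downsampling factor at layer i is 2 ** (reduce_exp - recover_exp).
print(xs_red.shape, recovered.shape)
```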
= exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - recover_exp)) - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.preln(xs) - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, 
recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - if att_mask.size(1) != 0: - xs = xs.masked_fill(~att_mask[:, 0, :].unsqueeze(-1), 0.0) - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(0) - cached_att = cached_att.unsqueeze(3).\ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :]) - r_cnn_cache.append(cached_cnn) - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
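The overlapping-window strategy described in the list above can be checked with simple arithmetic. The subsampling rate 4 and right context 6 match the deleted front-end; the chunk size and total frame count are assumed for illustration.

```python
# How the overlapping input windows are laid out.
subsampling = 4            # Conv2d subsampling rate of the encoder front-end
context = 6 + 1            # right_context + 1 current frame
decoding_chunk_size = 16   # encoder output frames produced per chunk (assumed)

stride = subsampling * decoding_chunk_size                            # 64 input frames advanced per step
decoding_window = (decoding_chunk_size - 1) * subsampling + context   # 67 input frames read per step

num_frames = 200
starts = list(range(0, num_frames - context + 1, stride))
windows = [(cur, min(cur + decoding_window, num_frames)) for cur in starts]
print(stride, decoding_window, windows)
# 64 67 [(0, 67), (64, 131), (128, 195), (192, 200)]
```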
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/squeezeformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/squeezeformer/encoder_layer.py deleted file mode 100644 index 3c6bdd44a20447cea91c0f965c666b844f4264be..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/squeezeformer/encoder_layer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""SqueezeformerEncoderLayer definition.""" - -import torch -import torch.nn as nn -from typing import Optional, Tuple - - -class SqueezeformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward1 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - feed_forward2 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. 
- """ - - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward1: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - feed_forward2: Optional[nn.Module] = None, - normalize_before: bool = False, - dropout_rate: float = 0.1, - concat_after: bool = False, - ): - super(SqueezeformerEncoderLayer, self).__init__() - self.size = size - self.self_attn = self_attn - self.layer_norm1 = nn.LayerNorm(size) - self.ffn1 = feed_forward1 - self.layer_norm2 = nn.LayerNorm(size) - self.conv_module = conv_module - self.layer_norm3 = nn.LayerNorm(size) - self.ffn2 = feed_forward2 - self.layer_norm4 = nn.LayerNorm(size) - self.normalize_before = normalize_before - self.dropout = nn.Dropout(dropout_rate) - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - # self attention module - residual = x - if self.normalize_before: - x = self.layer_norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.layer_norm1(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm2(x) - x = self.ffn1(x) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm2(x) - - # conv module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - residual = x - if self.normalize_before: - x = self.layer_norm3(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm3(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm4(x) - x = self.ffn2(x) - # we do not use dropout here since it is inside feed forward function - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm4(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/squeezeformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/squeezeformer/positionwise_feed_forward.py deleted file mode 100644 index 289062dcf3189f79a5ebb206990160d8665c613c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/squeezeformer/positionwise_feed_forward.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU(), - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.idim = idim - self.hidden_units = hidden_units - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - self.ada_scale = None - self.ada_bias = None - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, idim]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, idim]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - ffn1_max = self.idim ** -0.5 - ffn2_max = self.hidden_units ** -0.5 - torch.nn.init.uniform_(self.w_1.weight.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_1.bias.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_2.weight.data, -ffn2_max, ffn2_max) - torch.nn.init.uniform_(self.w_2.bias.data, -ffn2_max, ffn2_max) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - if self.adaptive_scale: - xs = self.ada_scale * xs + self.ada_bias - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/squeezeformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/squeezeformer/subsampling.py deleted file mode 100644 index fdb0101d6ebb54c42e710bbb0f35a6f7615ca567..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/squeezeformer/subsampling.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
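The feed-forward block above differs from the vanilla Transformer FFN only by the learned pre-scale and pre-bias. A compact sketch follows; sizes are assumed and SiLU stands in for the swish activation.

```python
import torch
import torch.nn as nn

class AdaScaleFFN(nn.Module):
    """Sketch of the adaptive-scale feed-forward block (assumed sizes)."""
    def __init__(self, idim=8, hidden=32, dropout=0.1, adaptive_scale=True):
        super().__init__()
        self.ada_scale = nn.Parameter(torch.ones(1, 1, idim), requires_grad=adaptive_scale)
        self.ada_bias = nn.Parameter(torch.zeros(1, 1, idim), requires_grad=adaptive_scale)
        self.adaptive_scale = adaptive_scale
        self.net = nn.Sequential(nn.Linear(idim, hidden), nn.SiLU(),
                                 nn.Dropout(dropout), nn.Linear(hidden, idim))

    def forward(self, xs):          # xs: (B, L, D) -> (B, L, D)
        if self.adaptive_scale:     # learned per-feature scale/bias before the FFN stack
            xs = self.ada_scale * xs + self.ada_bias
        return self.net(xs)

print(AdaScaleFFN()(torch.randn(2, 5, 8)).shape)   # torch.Size([2, 5, 8])
```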
-# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -"""DepthwiseConv2dSubsampling4 and TimeReductionLayer definition.""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -from wenet.transformer.subsampling import BaseSubsampling -from typing import Tuple -from wenet.squeezeformer.conv2d import Conv2dValid - - -class DepthwiseConv2dSubsampling4(BaseSubsampling): - """Depthwise Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - pos_enc_class (nn.Module): position encoding class. - dw_stride (int): Whether do depthwise convolution. - input_size (int): filter bank dimension. - - """ - - def __init__( - self, idim: int, odim: int, - pos_enc_class: torch.nn.Module, - dw_stride: bool = False, - input_size: int = 80, - input_dropout_rate: float = 0.1, - init_weights: bool = True - ): - super(DepthwiseConv2dSubsampling4, self).__init__() - self.idim = idim - self.odim = odim - self.pw_conv = nn.Conv2d( - in_channels=idim, out_channels=odim, kernel_size=3, stride=2) - self.act1 = nn.ReLU() - self.dw_conv = nn.Conv2d( - in_channels=odim, out_channels=odim, kernel_size=3, stride=2, - groups=odim if dw_stride else 1 - ) - self.act2 = nn.ReLU() - self.pos_enc = pos_enc_class - self.input_proj = nn.Sequential( - nn.Linear( - odim * (((input_size - 1) // 2 - 1) // 2), odim), - nn.Dropout(p=input_dropout_rate), - ) - if init_weights: - linear_max = (odim * input_size / 4) ** -0.5 - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.weight'], -linear_max, linear_max) - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.bias'], -linear_max, linear_max) - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: int = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.pw_conv(x) - x = self.act1(x) - x = self.dw_conv(x) - x = self.act2(x) - b, c, t, f = x.size() - x = x.permute(0, 2, 1, 3) - x = x.contiguous().view(b, t, c * f) - x, pos_emb = self.pos_enc(x, offset) - x = self.input_proj(x) - return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] - - -class TimeReductionLayer1D(nn.Module): - """ - Modified NeMo, - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. 
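The 1/4 subsampling above is two valid stride-2 convolutions in a row, and the same arithmetic fixes the input width of the projection layer. A quick check with assumed frame and feature counts:

```python
# Frame/feature bookkeeping for the 1/4 subsampling module
# (input_size=80 mel bins, T=100 frames, odim=256 are assumed for illustration).
input_size, T, odim = 80, 100, 256

def conv_out(n, kernel=3, stride=2):            # valid convolution, no padding
    return (n - kernel) // stride + 1

t_after = conv_out(conv_out(T))                 # two k=3, s=2 convs: 100 -> 49 -> 24
f_after = ((input_size - 1) // 2 - 1) // 2      # same reduction along the mel axis: 19
proj_in = odim * f_after                        # flattened (channels * mel) fed to the Linear
print(t_after, f_after, proj_in)                # 24 19 4864
```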
- """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 5, stride: int = 2): - super(TimeReductionLayer1D, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - self.padding = max(0, self.kernel_size - self.stride) - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. - if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayer2D(nn.Module): - def __init__( - self, kernel_size: int = 5, stride: int = 2, encoder_dim: int = 256): - super(TimeReductionLayer2D, self).__init__() - self.encoder_dim = encoder_dim - self.kernel_size = kernel_size - self.dw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=(kernel_size, 1), - stride=stride, - valid_trigy=True - ) - self.pw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=1, - stride=1, - valid_trigx=False, - valid_trigy=False, - ) - - self.kernel_size = kernel_size - self.stride = stride - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.encoder_dim ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward( - self, xs: torch.Tensor, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - xs = xs.masked_fill(mask_pad.transpose(1, 2).eq(0), 0.0) - xs = xs.unsqueeze(2) - padding1 = self.kernel_size - self.stride - xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), - mode='constant', value=0.) 
- xs = self.dw_conv(xs.permute(0, 3, 1, 2)) - xs = self.pw_conv(xs).permute(0, 3, 2, 1).squeeze(1).contiguous() - tmp_length = xs.size(1) - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - padding2 = max(0, (xs_lens.max() - tmp_length).data.item()) - batch_size, hidden = xs.size(0), xs.size(-1) - dummy_pad = torch.zeros(batch_size, padding2, hidden, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - mask = mask[:, ::2, ::2] - mask_pad = mask_pad[:, :, ::2] - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayerStream(nn.Module): - """ - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. - """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 1, stride: int = 2): - super(TimeReductionLayerStream, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=0, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. 
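All of the time-reduction variants above share the same core: a stride-2 depthwise convolution followed by a pointwise convolution, with masks thinned by the same stride and lengths re-aligned afterwards. A minimal sketch with assumed sizes:

```python
import torch
import torch.nn as nn

# Assumed sizes; padding follows kernel_size - stride as in the 1D variant.
channel, kernel_size, stride = 8, 5, 2
dw = nn.Conv1d(channel, channel, kernel_size, stride=stride,
               padding=kernel_size - stride, groups=channel)
pw = nn.Conv1d(channel, channel, kernel_size=1)

xs = torch.randn(2, 11, channel)                  # (B, T, C)
mask_pad = torch.ones(2, 1, 11, dtype=torch.bool)

ys = pw(dw(xs.transpose(1, 2))).transpose(1, 2)   # (B, T', C)
mask_pad = mask_pad[:, :, ::stride]               # masks thinned with the same stride
L, T2 = mask_pad.size(-1), ys.size(1)
ys = ys[:, :L, :] if T2 >= L else torch.cat(      # trim or pad so lengths stay aligned
    [ys, torch.zeros(2, L - T2, channel)], dim=1)
print(ys.shape, mask_pad.shape)                   # torch.Size([2, 6, 8]) torch.Size([2, 1, 6])
```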
- if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transducer/joint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transducer/joint.py deleted file mode 100644 index f7cbaf62ee0bf4ffa127e5bbf4a49a64c2378495..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transducer/joint.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Optional - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation - - -class TransducerJoint(torch.nn.Module): - - def __init__(self, - voca_size: int, - enc_output_size: int, - pred_output_size: int, - join_dim: int, - prejoin_linear: bool = True, - postjoin_linear: bool = False, - joint_mode: str = 'add', - activation: str = "tanh"): - assert check_argument_types() - # TODO(Mddct): concat in future - assert joint_mode in ['add'] - super().__init__() - - self.activatoin = get_activation(activation) - self.prejoin_linear = prejoin_linear - self.postjoin_linear = postjoin_linear - self.joint_mode = joint_mode - - if not self.prejoin_linear and not self.postjoin_linear: - assert enc_output_size == pred_output_size == join_dim - # torchscript compatibility - self.enc_ffn: Optional[nn.Linear] = None - self.pred_ffn: Optional[nn.Linear] = None - if self.prejoin_linear: - self.enc_ffn = nn.Linear(enc_output_size, join_dim) - self.pred_ffn = nn.Linear(pred_output_size, join_dim) - # torchscript compatibility - self.post_ffn: Optional[nn.Linear] = None - if self.postjoin_linear: - self.post_ffn = nn.Linear(join_dim, join_dim) - - self.ffn_out = nn.Linear(join_dim, voca_size) - - def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor): - """ - Args: - enc_out (torch.Tensor): [B, T, E] - pred_out (torch.Tensor): [B, T, P] - Return: - [B,T,U,V] - """ - if (self.prejoin_linear and self.enc_ffn is not None - and self.pred_ffn is not None): - enc_out = self.enc_ffn(enc_out) # [B,T,E] -> [B,T,V] - pred_out = self.pred_ffn(pred_out) - - enc_out = enc_out.unsqueeze(2) # [B,T,V] -> [B,T,1,V] - pred_out = pred_out.unsqueeze(1) # [B,U,V] -> [B,1 U, V] - - # TODO(Mddct): concat joint - _ = self.joint_mode - out = enc_out + pred_out # [B,T,U,V] - - if self.postjoin_linear and self.post_ffn is not None: - out = self.post_ffn(out) - - out = self.activatoin(out) - out = self.ffn_out(out) - return out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transducer/predictor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transducer/predictor.py deleted file mode 100644 index 600e97a9d83646047ec3fc14f3087bd4df761c68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transducer/predictor.py +++ /dev/null @@ -1,482 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation, get_rnn - - -def ApplyPadding(input, padding, pad_value) -> torch.Tensor: - """ - Args: - input: [bs, max_time_step, dim] - padding: [bs, max_time_step] - """ - return 
padding * pad_value + input * (1 - padding) - - -class PredictorBase(torch.nn.Module): - - # NOTE(Mddct): We can use ABC abstract here, but - # keep this class simple enough for now - def __init__(self) -> None: - super().__init__() - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - _, _, _ = batch_size, method, device - raise NotImplementedError("this is a base precictor") - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ): - _, _, = input, cache - raise NotImplementedError("this is a base precictor") - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - _, _, _, = input, padding, cache - raise NotImplementedError("this is a base precictor") - - -class RNNPredictor(PredictorBase): - - def __init__(self, - voca_size: int, - embed_size: int, - output_size: int, - embed_dropout: float, - hidden_size: int, - num_layers: int, - bias: bool = True, - rnn_type: str = "lstm", - dropout: float = 0.1) -> None: - assert check_argument_types() - super().__init__() - self.n_layers = num_layers - self.hidden_size = hidden_size - # disable rnn base out projection - self.embed = nn.Embedding(voca_size, embed_size) - self.dropout = nn.Dropout(embed_dropout) - # NOTE(Mddct): rnn base from torch not support layer norm - # will add layer norm and prune value in cell and layer - # ref: https://github.com/Mddct/neural-lm/blob/main/models/gru_cell.py - self.rnn = get_rnn(rnn_type=rnn_type)(input_size=embed_size, - hidden_size=hidden_size, - num_layers=num_layers, - bias=bias, - batch_first=True, - dropout=dropout) - self.projection = nn.Linear(hidden_size, output_size) - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> torch.Tensor: - """ - Args: - input (torch.Tensor): [batch, max_time). - padding (torch.Tensor): [batch, max_time] - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - Returns: - output: [batch, max_time, output_size] - """ - - # NOTE(Mddct): we don't use pack input format - embed = self.embed(input) # [batch, max_time, emb_size] - embed = self.dropout(embed) - states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None - if cache is None: - state = self.init_state(batch_size=input.size(0), - device=input.device) - states = (state[0], state[1]) - else: - assert len(cache) == 2 - states = (cache[0], cache[1]) - out, (m, c) = self.rnn(embed, states) - out = self.projection(out) - - # NOTE(Mddct): Although we don't use staate in transducer - # training forward, we need make it right for padding value - # so we create forward_step for infering, forward for training - _, _ = m, c - return out - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache: [state_m, state_c] - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - Returns: - new_cache: [[state_m_1, state_c_1], [state_m_2, state_c_2]...] 
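The `TransducerJoint` deleted above combines encoder frames and predictor states by a broadcast add over a (T, U) lattice. A shape-level sketch with assumed dimensions:

```python
import torch
import torch.nn as nn

# Assumed sizes: E = P = join_dim = 8, vocab = 10.
B, T, U, D, V = 2, 6, 4, 8, 10
enc_out = torch.randn(B, T, D)                 # encoder frames
pred_out = torch.randn(B, U, D)                # predictor states, one per emitted label

enc_ffn, pred_ffn, ffn_out = nn.Linear(D, D), nn.Linear(D, D), nn.Linear(D, V)
x = enc_ffn(enc_out).unsqueeze(2) + pred_ffn(pred_out).unsqueeze(1)   # broadcast: (B, T, U, D)
logits = ffn_out(torch.tanh(x))                # (B, T, U, V) lattice scored by the RNNT loss
print(logits.shape)                            # torch.Size([2, 6, 4, 10])
```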
- """ - assert len(cache) == 2 - state_ms = cache[0] - state_cs = cache[1] - - assert state_ms.size(1) == state_cs.size(1) - - new_cache: List[List[torch.Tensor]] = [] - for state_m, state_c in zip(torch.split(state_ms, 1, dim=1), - torch.split(state_cs, 1, dim=1)): - new_cache.append([state_m, state_c]) - return new_cache - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[state_m_1, state_c_1], [state_m_1, state_c_1]...] - - Returns: - new_caceh: [state_ms, state_cs], - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - """ - state_ms = torch.cat([states[0] for states in cache], dim=1) - state_cs = torch.cat([states[1] for states in cache], dim=1) - return [state_ms, state_cs] - - def init_state( - self, - batch_size: int, - device: torch.device, - method: str = "zero", - ) -> List[torch.Tensor]: - assert batch_size > 0 - # TODO(Mddct): xavier init method - _ = method - return [ - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device), - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device) - ] - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - """ - assert len(cache) == 2 - state_m, state_c = cache[0], cache[1] - embed = self.embed(input) # [batch, 1, emb_size] - embed = self.dropout(embed) - out, (m, c) = self.rnn(embed, (state_m, state_c)) - - out = self.projection(out) - m = ApplyPadding(m, padding.unsqueeze(0), state_m) - c = ApplyPadding(c, padding.unsqueeze(0), state_c) - - return (out, [m, c]) - - -class EmbeddingPredictor(PredictorBase): - """Embedding predictor - - Described in: - https://arxiv.org/pdf/2109.07513.pdf - - embed-> proj -> layer norm -> swish - """ - - def __init__(self, - voca_size: int, - embed_size: int, - embed_dropout: float, - n_head: int, - history_size: int = 2, - activation: str = "swish", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - - assert check_argument_types() - super().__init__() - # multi head - self.num_heads = n_head - self.embed_size = embed_size - self.context_size = history_size + 1 - self.pos_embed = torch.nn.Linear(embed_size * self.context_size, - self.num_heads, - bias=bias) - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.ffn = nn.Linear(self.embed_size, self.embed_size) - self.norm = nn.LayerNorm(self.embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - _ = method - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device), - ] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] 
- """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - - input = input.unfold(1, self.context_size, 1).permute( - 0, 1, 3, 2) # [bs, seq_len, context_size, embed] - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - # broadcast dot attenton - input_expand = input.unsqueeze( - 2) # [bs, seq_len, 1, context_size, embed] - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - - # [bs, seq_len, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, seq_len, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, seq_len, num_heads, embed] - output = output.sum(dim=2) # [bs, seq_len, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - return output - - def forward_step( - self, - input: torch.Tensor, - padding: torch.Tensor, - cache: List[torch.Tensor], - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input_expand = context_input.unsqueeze(1).unsqueeze( - 2) # [bs, 1, 1, context_size, embed] - - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - # [bs, 1, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, 1, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, 1, num_heads, embed] - output = output.sum(dim=2) # [bs, 1, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - new_cache = context_input[:, 1:, :] - # TODO(Mddct): we need padding new_cache in future - # new_cache = ApplyPadding(history, padding, new_cache) - return (output, [new_cache]) - - -class ConvPredictor(PredictorBase): - - def __init__(self, - voca_size: 
int, - embed_size: int, - embed_dropout: float, - history_size: int = 2, - activation: str = "relu", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - assert check_argument_types() - super().__init__() - - assert history_size >= 0 - self.embed_size = embed_size - self.context_size = history_size + 1 - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.conv = nn.Conv1d(in_channels=embed_size, - out_channels=embed_size, - kernel_size=self.context_size, - padding=0, - groups=embed_size, - bias=bias) - self.norm = nn.LayerNorm(embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - assert method == "zero" - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device) - ] - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] - """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - input = input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - return out - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input = context_input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - - new_cache = context_input[:, 1:, :] - # TODO(Mddct): apply padding in future - return (out, [new_cache]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transducer/search/greedy_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transducer/search/greedy_search.py deleted file mode 100644 index ef7354562b6617b7be33bf32d673117eb1d3d547..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transducer/search/greedy_search.py +++ /dev/null @@ -1,54 +0,0 @@ 
-from typing import List - -import torch - - -def basic_greedy_search( - model: torch.nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - n_steps: int = 64, -) -> List[List[int]]: - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, - method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = n_steps - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, padding, - cache) # [1, 1, P] - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, - pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - - return [hyps] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transducer/search/prefix_beam_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transducer/search/prefix_beam_search.py deleted file mode 100644 index f00917717c16a73916586708ebfede54fa02a21f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transducer/search/prefix_beam_search.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import List, Tuple - -import torch -from wenet.utils.common import log_add - - -class Sequence(): - - __slots__ = {'hyp', 'score', 'cache'} - - def __init__( - self, - hyp: List[torch.Tensor], - score, - cache: List[torch.Tensor], - ): - self.hyp = hyp - self.score = score - self.cache = cache - - -class PrefixBeamSearch(): - - def __init__(self, encoder, predictor, joint, ctc, blank): - self.encoder = encoder - self.predictor = predictor - self.joint = joint - self.ctc = ctc - self.blank = blank - - def forward_decoder_one_step( - self, encoder_x: torch.Tensor, pre_t: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - padding = torch.zeros(pre_t.size(0), 1, device=encoder_x.device) - pre_t, new_cache = self.predictor.forward_step(pre_t.unsqueeze(-1), - padding, cache) - x = self.joint(encoder_x, pre_t) # [beam, 1, 1, vocab] - x = x.log_softmax(dim=-1) - return x, new_cache - - def prefix_beam_search(self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7): - """prefix beam search - also see wenet.transducer.transducer.beam_search - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - 
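The greedy search above interleaves frame steps and predictor steps, advancing the predictor only on non-blank emissions. Below is a runnable shape-level sketch; the stub predictor and joint modules, the blank id, and all sizes are assumptions, and it emits at most one symbol per frame for brevity.

```python
import torch

torch.manual_seed(0)
blank, V, E, P = 0, 6, 4, 4
enc_out = torch.randn(1, 8, E)                    # (1, T, E) encoder output
predictor = torch.nn.GRU(1, P, batch_first=True)  # stand-in for the label predictor
joint = torch.nn.Linear(E + P, V)                 # stand-in for the joint network

hyp, state = [], None
pred_in = torch.zeros(1, 1, 1)                    # blank / sos input to the predictor
pred_out, state = predictor(pred_in, state)
for t in range(enc_out.size(1)):                  # walk the encoder frames left to right
    logits = joint(torch.cat([enc_out[:, t:t + 1, :], pred_out], dim=-1))
    k = logits.log_softmax(-1).argmax(-1).item()
    if k != blank:                                # emit and advance the predictor state
        hyp.append(k)
        pred_out, state = predictor(torch.full((1, 1, 1), float(k)), state)
print(hyp)
```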
assert batch_size == 1 - - # 1. Encoder - encoder_out, _ = self.encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - - ctc_probs = self.ctc.log_softmax(encoder_out).squeeze(0) - beam_init: List[Sequence] = [] - - # 2. init beam using Sequence to save beam unit - cache = self.predictor.init_state(1, method="zero", device=device) - beam_init.append(Sequence(hyp=[self.blank], score=0.0, cache=cache)) - # 3. start decoding (notice: we use breathwise first searching) - # !!!! In this decoding method: one frame do not output multi units. !!!! - # !!!! Experiments show that this strategy has little impact !!!! - for i in range(maxlen): - # 3.1 building input - # decoder taking the last token to predict the next token - input_hyp = [s.hyp[-1] for s in beam_init] - input_hyp_tensor = torch.tensor(input_hyp, - dtype=torch.int, - device=device) - # building statement from beam - cache_batch = self.predictor.cache_to_batch( - [s.cache for s in beam_init]) - # build score tensor to do torch.add() function - scores = torch.tensor([s.score for s in beam_init]).to(device) - - # 3.2 forward decoder - logp, new_cache = self.forward_decoder_one_step( - encoder_out[:, i, :].unsqueeze(1), - input_hyp_tensor, - cache_batch, - ) # logp: (N, 1, 1, vocab_size) - logp = logp.squeeze(1).squeeze(1) # logp: (N, vocab_size) - new_cache = self.predictor.batch_to_cache(new_cache) - - # 3.3 shallow fusion for transducer score - # and ctc score where we can also add the LM score - logp = torch.log( - torch.add(transducer_weight * torch.exp(logp), - ctc_weight * torch.exp(ctc_probs[i].unsqueeze(0)))) - - # 3.4 first beam prune - top_k_logp, top_k_index = logp.topk(beam_size) # (N, N) - scores = torch.add(scores.unsqueeze(1), top_k_logp) - - # 3.5 generate new beam (N*N) - beam_A = [] - for j in range(len(beam_init)): - # update seq - base_seq = beam_init[j] - for t in range(beam_size): - # blank: only update the score - if top_k_index[j, t] == self.blank: - new_seq = Sequence(hyp=base_seq.hyp.copy(), - score=scores[j, t].item(), - cache=base_seq.cache) - - beam_A.append(new_seq) - # other unit: update hyp score statement and last - else: - hyp_new = base_seq.hyp.copy() - hyp_new.append(top_k_index[j, t].item()) - new_seq = Sequence(hyp=hyp_new, - score=scores[j, t].item(), - cache=new_cache[j]) - beam_A.append(new_seq) - - # 3.6 prefix fusion - fusion_A = [beam_A[0]] - for j in range(1, len(beam_A)): - s1 = beam_A[j] - if_do_append = True - for t in range(len(fusion_A)): - # notice: A_ can not fusion with A - if s1.hyp == fusion_A[t].hyp: - fusion_A[t].score = log_add( - [fusion_A[t].score, s1.score]) - if_do_append = False - break - if if_do_append: - fusion_A.append(s1) - - # 4. 
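Two ingredients of the prefix beam search above are worth isolating: the shallow fusion of CTC and transducer posteriors, and the log-domain merge of beams that share a prefix. A small sketch; the weights and scores are made up, and `log_add` only mirrors the role of the wenet helper.

```python
import math

def log_add(args):
    """Stable log-sum-exp over a list of log-scores."""
    if all(a == -float('inf') for a in args):
        return -float('inf')
    a_max = max(args)
    return a_max + math.log(sum(math.exp(a - a_max) for a in args))

# Shallow fusion of the per-frame posteriors (transducer_weight=0.7, ctc_weight=0.3 assumed).
ctc_logp, td_logp = -1.2, -0.7
fused = math.log(0.7 * math.exp(td_logp) + 0.3 * math.exp(ctc_logp))

# Prefix fusion: two beam entries with identical hypotheses merge their scores.
scores_for_same_prefix = [-2.3, -3.1]
print(round(fused, 4), round(log_add(scores_for_same_prefix), 4))
```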
second pruned - fusion_A.sort(key=lambda x: x.score, reverse=True) - beam_init = fusion_A[:beam_size] - - return beam_init, encoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transducer/transducer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transducer/transducer.py deleted file mode 100644 index 821a0946e621353a18bededbd93a658e83b0e0e2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transducer/transducer.py +++ /dev/null @@ -1,453 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchaudio -from torch import nn -from torch.nn.utils.rnn import pad_sequence -from typeguard import check_argument_types -from wenet.transducer.predictor import PredictorBase -from wenet.transducer.search.greedy_search import basic_greedy_search -from wenet.transducer.search.prefix_beam_search import PrefixBeamSearch -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_blank, add_sos_eos, - reverse_pad_list) - - -class Transducer(ASRModel): - """Transducer-ctc-attention hybrid Encoder-Predictor-Decoder model""" - - def __init__( - self, - vocab_size: int, - blank: int, - encoder: nn.Module, - predictor: PredictorBase, - joint: nn.Module, - attention_decoder: Optional[Union[TransformerDecoder, - BiTransformerDecoder]] = None, - ctc: Optional[CTC] = None, - ctc_weight: float = 0, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - transducer_weight: float = 1.0, - attention_weight: float = 0.0, - ) -> None: - assert check_argument_types() - assert attention_weight + ctc_weight + transducer_weight == 1.0 - super().__init__(vocab_size, encoder, attention_decoder, ctc, - ctc_weight, ignore_id, reverse_weight, lsm_weight, - length_normalized_loss) - - self.blank = blank - self.transducer_weight = transducer_weight - self.attention_decoder_weight = 1 - self.transducer_weight - self.ctc_weight - - self.predictor = predictor - self.joint = joint - self.bs = None - - # Note(Mddct): decoder also means predictor in transducer, - # but here decoder is attention decoder - del self.criterion_att - if attention_decoder is not None: - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + predictor + joint + loss - - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - - # Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - # predictor - ys_in_pad = add_blank(text, self.blank, self.ignore_id) - predictor_out = self.predictor(ys_in_pad) - # joint - joint_out = self.joint(encoder_out, predictor_out) - # NOTE(Mddct): some loss implementation require pad valid is zero - # torch.int32 rnnt_loss required - rnnt_text = text.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - rnnt_text_lengths = text_lengths.to(torch.int32) - encoder_out_lens = encoder_out_lens.to(torch.int32) - loss = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - encoder_out_lens, - rnnt_text_lengths, - blank=self.blank, - reduction="mean") - loss_rnnt = loss - - loss = self.transducer_weight * loss - # optional attention decoder - loss_att: Optional[torch.Tensor] = None - if self.attention_decoder_weight != 0.0 and self.decoder is not None: - loss_att, _ = self._calc_att_loss(encoder_out, encoder_mask, text, - text_lengths) - - # optional ctc - loss_ctc: Optional[torch.Tensor] = None - if self.ctc_weight != 0.0 and self.ctc is not None: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is not None: - loss = loss + self.ctc_weight * loss_ctc.sum() - if loss_att is not None: - loss = loss + self.attention_decoder_weight * loss_att.sum() - # NOTE: 'loss' must be in dict - return { - 'loss': loss, - 'loss_att': loss_att, - 'loss_ctc': loss_ctc, - 'loss_rnnt': loss_rnnt, - } - - def init_bs(self): - if self.bs is None: - self.bs = PrefixBeamSearch(self.encoder, self.predictor, - self.joint, self.ctc, self.blank) - - def _cal_transducer_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_lens: torch.Tensor, - hyps_pad: torch.Tensor, - ): - # ignore id -> blank, add blank at head - hyps_pad_blank = add_blank(hyps_pad, self.blank, self.ignore_id) - xs_in_lens = encoder_mask.squeeze(1).sum(1).int() - - # 1. Forward predictor - predictor_out = self.predictor(hyps_pad_blank) - # 2. Forward joint - joint_out = self.joint(encoder_out, predictor_out) - rnnt_text = hyps_pad.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - # 3. 
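The `rnnt_loss` call above fixes a shape contract between the joint output and the label tensors. A stand-alone check with assumed sizes; note the int32 lengths and the `U + 1` blank row in the lattice.

```python
import torch
import torchaudio

B, T, U, V, blank = 2, 6, 3, 5, 0
joint_out = torch.randn(B, T, U + 1, V, requires_grad=True)   # joint lattice, U + 1 includes the blank row
targets = torch.randint(1, V, (B, U), dtype=torch.int32)      # labels with ignore_id already mapped to 0
logit_lengths = torch.tensor([T, T - 2], dtype=torch.int32)   # per-utterance encoder lengths
target_lengths = torch.tensor([U, U - 1], dtype=torch.int32)

loss = torchaudio.functional.rnnt_loss(joint_out, targets, logit_lengths,
                                       target_lengths, blank=blank, reduction="mean")
print(loss)   # scalar; combined with the CTC and attention losses by their weights
```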
Compute transducer loss - loss_td = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - xs_in_lens, - hyps_lens.int(), - blank=self.blank, - reduction='none') - return loss_td * -1 - - def _cal_attn_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_pad: torch.Tensor, - hyps_lens: torch.Tensor, - ): - # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - - # td_score = loss_td * -1 - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - self.reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - return decoder_out, r_decoder_out - - def beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7, - ): - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight in transducer - prefix beam search. - final_prob = ctc_weight * ctc_prob + transducer_weight * transducer_prob - transducer_weight (float): transducer probability weight in - prefix beam search - Returns: - List[List[int]]: best path result - - """ - self.init_bs() - beam, _ = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size, - beam_size, - num_decoding_left_chunks, - simulate_streaming, - ctc_weight, - transducer_weight, - ) - return beam[0].hyp[1:], beam[0].score - - def transducer_attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ctc_weight: float = 0.0, - attn_weight: float = 0.0, - transducer_weight: float = 0.0, - search_ctc_weight: float = 1.0, - search_transducer_weight: float = 0.0, - beam_search_type: str = 'transducer') -> List[List[int]]: - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight using in rescoring. - rescore_prob = ctc_weight * ctc_prob + - transducer_weight * (transducer_loss * -1) + - attn_weight * attn_prob - attn_weight (float): attn probability weight using in rescoring. - transducer_weight (float): transducer probability weight using in - rescoring - search_ctc_weight (float): ctc weight using - in rnnt beam search (seeing in self.beam_search) - search_transducer_weight (float): transducer weight using - in rnnt beam search (seeing in self.beam_search) - Returns: - List[List[int]]: best path result - - """ - - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - self.init_bs() - if beam_search_type == 'transducer': - beam, encoder_out = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - beam_size=beam_size, - num_decoding_left_chunks=num_decoding_left_chunks, - ctc_weight=search_ctc_weight, - transducer_weight=search_transducer_weight, - ) - beam_score = [s.score for s in beam] - hyps = [s.hyp[1:] for s in beam] - - elif beam_search_type == 'ctc': - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, - speech_lengths, - beam_size=beam_size, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) - beam_score = [hyp[1] for hyp in hyps] - hyps = [hyp[0] for hyp in hyps] - assert len(hyps) == beam_size - - # build hyps and encoder output - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - - # 2.1 calculate transducer score - td_score = self._cal_transducer_score( - encoder_out, - encoder_mask, - hyps_lens, - hyps_pad, - ) - # 2.2 calculate attention score - decoder_out, r_decoder_out = self._cal_attn_score( - encoder_out, - encoder_mask, - hyps_pad, - hyps_lens, - ) - - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp)][self.eos] - td_s = td_score[i] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp): - r_score += r_decoder_out[i][len(hyp) - j - 1][w] - r_score += r_decoder_out[i][len(hyp)][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score = score * attn_weight + \ - beam_score[i] * ctc_weight + \ - td_s * transducer_weight - if score > best_score: - best_score = score - best_index = i - - return hyps[best_index], best_score - - def greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, 
- num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - n_steps: int = 64, - ) -> List[List[int]]: - """ greedy search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - # TODO(Mddct): batch decode - assert speech.size(0) == 1 - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - # TODO(Mddct): forward chunk by chunk - _ = simulate_streaming - # Let's assume B = batch_size - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size, - num_decoding_left_chunks, - ) - encoder_out_lens = encoder_mask.squeeze(1).sum() - hyps = basic_greedy_search(self, - encoder_out, - encoder_out_lens, - n_steps=n_steps) - - return hyps - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def forward_predictor_step( - self, xs: torch.Tensor, cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - assert len(cache) == 2 - # fake padding - padding = torch.zeros(1, 1) - return self.predictor.forward_step(xs, padding, cache) - - @torch.jit.export - def forward_joint_step(self, enc_out: torch.Tensor, - pred_out: torch.Tensor) -> torch.Tensor: - return self.joint(enc_out, pred_out) - - @torch.jit.export - def forward_predictor_init_state(self) -> List[torch.Tensor]: - return self.predictor.init_state(1, device=torch.device("cpu")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/asr_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/asr_model.py deleted file mode 100644 index 4288f68472d63ce4bf270c5f377d62fa7408713e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/asr_model.py +++ /dev/null @@ -1,904 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -from collections import defaultdict -from typing import Dict, List, Optional, Tuple - -import torch - -from torch.nn.utils.rnn import pad_sequence - -try: - import k2 - from icefall.utils import get_texts - from icefall.decode import get_lattice, Nbest, one_best_decoding -except ImportError: - print('Failed to import k2 and icefall. \ - Notice that they are necessary for hlg_onebest and hlg_rescore') - -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import TransformerEncoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, - remove_duplicates_and_blank, th_accuracy, - reverse_pad_list) -from wenet.utils.mask import (make_pad_mask, mask_finished_preds, - mask_finished_scores, subsequent_mask) - - -class ASRModel(torch.nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - def __init__( - self, - vocab_size: int, - encoder: TransformerEncoder, - decoder: TransformerDecoder, - ctc: CTC, - ctc_weight: float = 0.5, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - ): - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - - self.encoder = encoder - self.decoder = decoder - self.ctc = ctc - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - # 1. Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # 2a. Attention-decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths) - else: - loss_att = None - - # 2b. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is None: - loss = loss_att - elif loss_att is None: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + (1 - - self.ctc_weight) * loss_att - return {"loss": loss, "loss_att": loss_att, "loss_ctc": loss_ctc} - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, float]: - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # reverse the seq, used for right to left decoder - r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) - r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, - self.ignore_id) - # 1. Forward decoder - decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, - ys_in_pad, ys_in_lens, - r_ys_in_pad, - self.reverse_weight) - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - r_loss_att = torch.tensor(0.0) - if self.reverse_weight > 0.0: - r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * ( - 1 - self.reverse_weight) + r_loss_att * self.reverse_weight - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - return loss_att, acc_att - - def _forward_encoder( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Let's assume B = batch_size - # 1. Encoder - if simulate_streaming and decoding_chunk_size > 0: - encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( - speech, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - else: - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - return encoder_out, encoder_mask - - def recognize( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int = 10, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> torch.Tensor: - """ Apply beam search on attention decoder - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - torch.Tensor: decoding result, (batch, max_result_len) - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = torch.ones([running_size, 1], dtype=torch.long, - device=device).fill_(self.sos) # (B*N, 1) - scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), - dtype=torch.float) - scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) - cache: Optional[List[torch.Tensor]] = None - # 2. Decoder forward step by step - for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: - break - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) - logp, cache = self.decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - # 2.3 Second beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - # Update cache to be consistent with new topk scores / hyps - cache_index = (offset_k_index // beam_size).view(-1) # (B*N) - base_cache_index = (torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) * beam_size).view(-1) # (B*N) - cache_index = base_cache_index + cache_index - cache = [torch.index_select(c, dim=0, index=cache_index) for c in cache] - scores = scores.view(-1, 1) # (B*N, 1) - # 2.4. Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = torch.index_select(top_k_index.view(-1), - dim=-1, - index=best_k_index) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = torch.index_select( - hyps, dim=0, index=best_hyps_index) # (B*N, i) - hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) - - # 3. 
Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_scores, best_index = scores.max(dim=-1) - best_hyps_index = best_index + torch.arange( - batch_size, dtype=torch.long, device=device) * beam_size - best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) - best_hyps = best_hyps[:, 1:] - return best_hyps, best_scores - - def ctc_greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[List[int]]: - """ Apply CTC greedy search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # Let's assume B = batch_size - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - return hyps, scores - - def _ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[List[List[int]], torch.Tensor]: - """ CTC prefix beam search inner implementation - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[List[int]]: nbest results - torch.Tensor: encoder output, (1, max_len, encoder_dim), - it will be used for rescoring in attention rescoring mode - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # For CTC prefix beam search, we only support batch_size=1 - assert batch_size == 1 - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder forward and get CTC score - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - ctc_probs = ctc_probs.squeeze(0) - # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) - cur_hyps = [(tuple(), (0.0, -float('inf')))] - # 2. CTC beam search step by step - for t in range(0, maxlen): - logp = ctc_probs[t] # (vocab_size,) - # key: prefix, value (pb, pnb), default value(-inf, -inf) - next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) - # 2.1 First beam prune: select topk best - top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) - for s in top_k_index: - s = s.item() - ps = logp[s].item() - for prefix, (pb, pnb) in cur_hyps: - last = prefix[-1] if len(prefix) > 0 else None - if s == 0: # blank - n_pb, n_pnb = next_hyps[prefix] - n_pb = log_add([n_pb, pb + ps, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - elif s == last: - # Update *ss -> *s; - n_pb, n_pnb = next_hyps[prefix] - n_pnb = log_add([n_pnb, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - # Update *s-s -> *ss, - is for blank - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - else: - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - - # 2.2 Second beam prune - next_hyps = sorted(next_hyps.items(), - key=lambda x: log_add(list(x[1])), - reverse=True) - cur_hyps = next_hyps[:beam_size] - hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] - return hyps, encoder_out - - def ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[int]: - """ Apply CTC prefix beam search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[int]: CTC prefix beam search nbest results - """ - hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, - beam_size, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) - return hyps[0] - - def attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - ctc_weight: float = 0.0, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ) -> List[int]: - """ Apply attention rescoring decoding, CTC prefix beam search - is applied first to get nbest, then we resoring the nbest on - attention decoder with corresponding encoder out - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - reverse_weight (float): right to left decoder weight - ctc_weight (float): ctc score weight - - Returns: - List[int]: Attention rescoring result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, speech_lengths, beam_size, decoding_chunk_size, - num_decoding_left_chunks, simulate_streaming) - - assert len(hyps) == beam_size - hyps_pad = pad_sequence([ - torch.tensor(hyp[0], device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp[0]): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp[0])][self.eos] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp[0]): - r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] - r_score += r_decoder_out[i][len(hyp[0])][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score += hyp[1] * ctc_weight - if score > best_score: - best_score = score - best_index = i - return hyps[best_index][0], best_score - - def load_hlg_resource_if_necessary(self, hlg, word): - if not hasattr(self, 'hlg'): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.hlg = k2.Fsa.from_dict(torch.load(hlg, map_location=device)) - if not hasattr(self.hlg, "lm_scores"): - self.hlg.lm_scores = self.hlg.scores.clone() - if not hasattr(self, 'word_table'): - self.word_table = {} - with open(word, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - self.word_table[int(arr[1])] = arr[0] - - @torch.no_grad() - def hlg_onebest( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - best_path = one_best_decoding(lattice=lattice, use_double_scores=True) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.no_grad() - def hlg_rescore( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - lm_scale: float = 0, - decoder_scale: float = 0, - r_decoder_scale: float = 0, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - device = speech.device - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - 
search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=100, - use_double_scores=True, - nbest_scale=0.5,) - nbest = nbest.intersect(lattice) - assert hasattr(nbest.fsa, "lm_scores") - assert hasattr(nbest.fsa, "tokens") - assert isinstance(nbest.fsa.tokens, torch.Tensor) - - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.tokens) - tokens = tokens.remove_values_leq(0) - hyps = tokens.tolist() - - # cal attention_score - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out_repeat = [] - tot_scores = nbest.tot_scores() - repeats = [tot_scores[i].shape[0] for i in range(tot_scores.dim0)] - for i in range(len(encoder_out)): - encoder_out_repeat.append(encoder_out[i: i + 1].repeat(repeats[i], 1, 1)) - encoder_out = torch.concat(encoder_out_repeat, dim=0) - encoder_mask = torch.ones(encoder_out.size(0), - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - reverse_weight = 0.5 - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out - - decoder_scores = torch.tensor([sum([decoder_out[i, j, hyps[i][j]] - for j in range(len(hyps[i]))]) - for i in range(len(hyps))], device=device) - r_decoder_scores = [] - for i in range(len(hyps)): - score = 0 - for j in range(len(hyps[i])): - score += r_decoder_out[i, len(hyps[i]) - j - 1, hyps[i][j]] - score += r_decoder_out[i, len(hyps[i]), self.eos] - r_decoder_scores.append(score) - r_decoder_scores = torch.tensor(r_decoder_scores, device=device) - - am_scores = nbest.compute_am_scores() - ngram_lm_scores = nbest.compute_lm_scores() - tot_scores = am_scores.values + lm_scale * ngram_lm_scores.values + \ - decoder_scale * decoder_scores + r_decoder_scale * r_decoder_scores - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.jit.export - def subsampling_rate(self) -> int: - """ Export interface for c++ call, return subsampling_rate of the - model - """ - return self.encoder.embed.subsampling_rate - - @torch.jit.export - def right_context(self) -> int: - """ Export interface for c++ call, return right_context of the model - """ - return self.encoder.embed.right_context - - @torch.jit.export - def sos_symbol(self) -> int: - """ Export interface for c++ call, return sos symbol id of the model - """ - return self.sos - - @torch.jit.export - def eos_symbol(self) -> int: - """ Export interface for c++ call, return eos symbol id of the model - """ - return self.eos - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, give input chunk xs, and return - output from time 0 to current chunk. - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor: - """ Export interface for c++ call, apply linear transform and log - softmax before ctc - Args: - xs (torch.Tensor): encoder output - - Returns: - torch.Tensor: activation before ctc - - """ - return self.ctc.log_softmax(xs) - - @torch.jit.export - def is_bidirectional_decoder(self) -> bool: - """ - Returns: - torch.Tensor: decoder output - """ - if hasattr(self.decoder, 'right_decoder'): - return True - else: - return False - - @torch.jit.export - def forward_attention_decoder( - self, - hyps: torch.Tensor, - hyps_lens: torch.Tensor, - encoder_out: torch.Tensor, - reverse_weight: float = 0, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, forward decoder with multiple - hypothesis from ctc prefix beam search and one encoder output - Args: - hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad sos at the begining - hyps_lens (torch.Tensor): length of each hyp in hyps - encoder_out (torch.Tensor): corresponding encoder output - r_hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad eos at the begining which is used fo right to left decoder - reverse_weight: used for verfing whether used right to left decoder, - > 0 will use. - - Returns: - torch.Tensor: decoder output - """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps - encoder_out = encoder_out.repeat(num_hyps, 1, 1) - encoder_mask = torch.ones(num_hyps, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=encoder_out.device) - - # input for right to left decoder - # this hyps_lens has count token, we need minus it. - r_hyps_lens = hyps_lens - 1 - # this hyps has included token, so it should be - # convert the original hyps. - r_hyps = hyps[:, 1:] - # >>> r_hyps - # >>> tensor([[ 1, 2, 3], - # >>> [ 9, 8, 4], - # >>> [ 2, -1, -1]]) - # >>> r_hyps_lens - # >>> tensor([3, 3, 1]) - - # NOTE(Mddct): `pad_sequence` is not supported by ONNX, it is used - # in `reverse_pad_list` thus we have to refine the below code. 
- # Issue: https://github.com/wenet-e2e/wenet/issues/1113 - # Equal to: - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = torch.max(r_hyps_lens) - index_range = torch.arange(0, max_len, 1).to(encoder_out.device) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - # >>> seq_mask - # >>> tensor([[ True, True, True], - # >>> [ True, True, True], - # >>> [ True, False, False]]) - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - r_hyps = torch.gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = torch.where(seq_mask, r_hyps, self.eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - r_hyps = torch.cat([hyps[:, 0:1], r_hyps], dim=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps, hyps_lens, r_hyps, - reverse_weight) # (num_hyps, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - - # right to left decoder may be not used during decoding process, - # which depends on reverse_weight param. - # r_dccoder_out will be 0.0, if reverse_weight is 0.0 - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - return decoder_out, r_decoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/attention.py deleted file mode 100644 index 6ee5e313edf2e88a844ce004c0f819b0bd3260f6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/attention.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple - -import torch -from torch import nn - - -class MultiHeadedAttention(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, n_head: int, n_feat: int, dropout_rate: float): - """Construct an MultiHeadedAttention object.""" - super().__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward_qkv( - self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor, size - (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor, size - (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor, size - (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. 
- - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - 1.When applying cross attention between decoder and encoder, - the batch padding mask for input is in (#batch, 1, T) shape. - 2.When applying self attention of encoder, - the mask is in (#batch, T, T) shape. - 3.When applying self attention of decoder, - the mask is in (#batch, L, L) shape. - 4.If the different position in decoder see different block - of the encoder, such as Mocha, the passed in mask could be - in (#batch, L, T) shape. But there is no such case in current - Wenet. - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - """ - q, k, v = self.forward_qkv(query, key, value) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. - new_cache = torch.cat((k, v), dim=-1) - - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - return self.forward_attention(v, scores, mask), new_cache - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). 
- zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/cmvn.py deleted file mode 100644 index 3a1e7457fd3788d9a7e031e96517505a65925102..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/cmvn.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -class GlobalCMVN(torch.nn.Module): - def __init__(self, - mean: torch.Tensor, - istd: torch.Tensor, - norm_var: bool = True): - """ - Args: - mean (torch.Tensor): mean stats - istd (torch.Tensor): inverse std, std which is 1.0 / std - """ - super().__init__() - assert mean.shape == istd.shape - self.norm_var = norm_var - # The buffer can be accessed from this module using self.mean - self.register_buffer("mean", mean) - self.register_buffer("istd", istd) - - def forward(self, x: torch.Tensor): - """ - Args: - x (torch.Tensor): (batch, max_len, feat_dim) - - Returns: - (torch.Tensor): normalized feature - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/convolution.py deleted file mode 100644 index 2cf9794e14ea7441ccd30ab52202ac02fb25c2b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/convolution.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). 
- """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. - new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/ctc.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/ctc.py deleted file mode 100644 index 3dfcbaa324ffc26afa9ceaeb75007eb312546326..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/ctc.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -import torch -import torch.nn.functional as F -from typeguard import check_argument_types - - -class CTC(torch.nn.Module): - """CTC module""" - def __init__( - self, - odim: int, - encoder_output_size: int, - dropout_rate: float = 0.0, - reduce: bool = True, - ): - """ Construct CTC module - Args: - odim: dimension of outputs - encoder_output_size: number of encoder projection units - dropout_rate: dropout rate (0.0 ~ 1.0) - reduce: reduce the CTC loss into a scalar - """ - assert check_argument_types() - super().__init__() - eprojs = encoder_output_size - self.dropout_rate = dropout_rate - self.ctc_lo = torch.nn.Linear(eprojs, odim) - - reduction_type = "sum" if reduce else "none" - self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) - - def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, - ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: - """Calculate CTC loss. 
- - Args: - hs_pad: batch of padded hidden state sequences (B, Tmax, D) - hlens: batch of lengths of hidden state sequences (B) - ys_pad: batch of padded character id sequence tensor (B, Lmax) - ys_lens: batch of lengths of character sequence (B) - """ - # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) - ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) - # ys_hat: (B, L, D) -> (L, B, D) - ys_hat = ys_hat.transpose(0, 1) - ys_hat = ys_hat.log_softmax(2) - loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) - # Batch-size average - loss = loss / ys_hat.size(1) - return loss - - def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """log_softmax of frame activations - - Args: - Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) - """ - return F.log_softmax(self.ctc_lo(hs_pad), dim=2) - - def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """argmax of frame activations - - Args: - torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: argmax applied 2d tensor (B, Tmax) - """ - return torch.argmax(self.ctc_lo(hs_pad), dim=2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/decoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/decoder.py deleted file mode 100644 index c31853d9e868c99290b8d597f53d9a680202c82c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/decoder.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Decoder definition.""" -from typing import Tuple, List, Optional - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.decoder_layer import DecoderLayer -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.utils.mask import (subsequent_mask, make_pad_mask) - - -class TransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. 
- concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__() - attention_dim = encoder_output_size - - if input_layer == "embed": - self.embed = torch.nn.Sequential( - torch.nn.Embedding(vocab_size, attention_dim), - PositionalEncoding(attention_dim, positional_dropout_rate), - ) - else: - raise ValueError(f"only 'embed' is supported: {input_layer}") - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) - self.use_output_layer = use_output_layer - self.output_layer = torch.nn.Linear(attention_dim, vocab_size) - self.num_blocks = num_blocks - self.decoders = torch.nn.ModuleList([ - DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(self.num_blocks) - ]) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor = torch.empty(0), - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: not used in transformer decoder, in order to unify api - with bidirectional decoder - reverse_weight: not used in transformer decoder, in order to unify - api with bidirectional decode - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - torch.tensor(0.0), in order to unify api with bidirectional decoder - olens: (batch, ) - """ - tgt = ys_in_pad - maxlen = tgt.size(1) - # tgt_mask: (B, 1, L) - tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) - tgt_mask = tgt_mask.to(tgt.device) - # m: (1, L, L) - m = subsequent_mask(tgt_mask.size(-1), - device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m - x, _ = self.embed(tgt) - for layer in self.decoders: - x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, - memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.use_output_layer: - x = self.output_layer(x) - olens = tgt_mask.sum(1) - return x, torch.tensor(0.0), olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - x, _ = self.embed(tgt) - new_cache = [] - for i, decoder in enumerate(self.decoders): - if cache is None: - c = None - else: - c = cache[i] - x, tgt_mask, memory, memory_mask = decoder(x, - tgt_mask, - memory, - memory_mask, - cache=c) - new_cache.append(x) - if self.normalize_before: - y = self.after_norm(x[:, -1]) - else: - y = x[:, -1] - if self.use_output_layer: - y = torch.log_softmax(self.output_layer(y), dim=-1) - return y, new_cache - - -class BiTransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - r_num_blocks: the number of right to left decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - r_num_blocks: int = 0, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - - assert check_argument_types() - super().__init__() - self.left_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - self.right_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - r_num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor, - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), - used for right to left decoder - reverse_weight: used for right to left decoder - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - r_x: x: decoded token score (right to left decoder) - before softmax (batch, maxlen_out, vocab_size) - if use_output_layer is True, - olens: (batch, ) - """ - l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, - ys_in_lens) - r_x = torch.tensor(0.0) - if reverse_weight > 0.0: - r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, - ys_in_lens) - return l_x, r_x, olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - return self.left_decoder.forward_one_step(memory, memory_mask, tgt, - tgt_mask, cache) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/decoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/decoder_layer.py deleted file mode 100644 index 6b52aa6ab730dc51b18f0787e8236ab10c1e9cad..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/decoder_layer.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoder self-attention layer definition.""" -from typing import Optional, Tuple - -import torch -from torch import nn - - -class DecoderLayer(nn.Module): - """Single decoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn (torch.nn.Module): Inter-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. 
- dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's inpu - and output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: nn.Module, - src_attn: nn.Module, - feed_forward: nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an DecoderLayer object.""" - super().__init__() - self.size = size - self.self_attn = self_attn - self.src_attn = src_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.norm3 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear1 = nn.Linear(size + size, size) - self.concat_linear2 = nn.Linear(size + size, size) - else: - self.concat_linear1 = nn.Identity() - self.concat_linear2 = nn.Identity() - - def forward( - self, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - memory: torch.Tensor, - memory_mask: torch.Tensor, - cache: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute decoded features. - - Args: - tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). - tgt_mask (torch.Tensor): Mask for input tensor - (#batch, maxlen_out). - memory (torch.Tensor): Encoded memory - (#batch, maxlen_in, size). - memory_mask (torch.Tensor): Encoded memory mask - (#batch, maxlen_in). - cache (torch.Tensor): cached tensors. - (#batch, maxlen_out - 1, size). - - Returns: - torch.Tensor: Output tensor (#batch, maxlen_out, size). - torch.Tensor: Mask for output tensor (#batch, maxlen_out). - torch.Tensor: Encoded memory (#batch, maxlen_in, size). - torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
- - """ - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - - if cache is None: - tgt_q = tgt - tgt_q_mask = tgt_mask - else: - # compute only the last frame query keeping dim: max_time_out -> 1 - assert cache.shape == ( - tgt.shape[0], - tgt.shape[1] - 1, - self.size, - ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" - tgt_q = tgt[:, -1:, :] - residual = residual[:, -1:, :] - tgt_q_mask = tgt_mask[:, -1:, :] - - if self.concat_after: - tgt_concat = torch.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) - x = residual + self.concat_linear1(tgt_concat) - else: - x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - if self.concat_after: - x_concat = torch.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) - x = residual + self.concat_linear2(x_concat) - else: - x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) - if not self.normalize_before: - x = self.norm2(x) - - residual = x - if self.normalize_before: - x = self.norm3(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm3(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, tgt_mask, memory, memory_mask diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/embedding.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/embedding.py deleted file mode 100644 index 611a927864d93c3ad8357f66c780bf537b2a4d67..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/embedding.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Positonal Encoding Module.""" - -import math -from typing import Tuple, Union - -import torch -import torch.nn.functional as F - -class PositionalEncoding(torch.nn.Module): - """Positional encoding. 
- - :param int d_model: embedding dim - :param float dropout_rate: dropout rate - :param int max_len: maximum input length - - PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) - PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) - """ - def __init__(self, - d_model: int, - dropout_rate: float, - max_len: int = 5000, - reverse: bool = False): - """Construct an PositionalEncoding object.""" - super().__init__() - self.d_model = d_model - self.xscale = math.sqrt(self.d_model) - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - - self.pe = torch.zeros(self.max_len, self.d_model) - position = torch.arange(0, self.max_len, - dtype=torch.float32).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = torch.sin(position * div_term) - self.pe[:, 1::2] = torch.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input. Its shape is (batch, time, ...) - offset (int, torch.tensor): position offset - - Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) - torch.Tensor: for compatibility to RelPositionalEncoding - """ - - self.pe = self.pe.to(x.device) - pos_emb = self.position_encoding(offset, x.size(1), False) - x = x * self.xscale + pos_emb - return self.dropout(x), self.dropout(pos_emb) - - def position_encoding(self, offset: Union[int, torch.Tensor], size: int, - apply_dropout: bool = True) -> torch.Tensor: - """ For getting encoding in a streaming fashion - - Attention!!!!! - we apply dropout only once at the whole utterance level in a none - streaming way, but will call this function several times with - increasing input size in a streaming scenario, so the dropout will - be applied several times. - - Args: - offset (int or torch.tensor): start offset - size (int): required size of position encoding - - Returns: - torch.Tensor: Corresponding encoding - """ - # How to subscript a Union type: - # https://github.com/pytorch/pytorch/issues/69434 - if isinstance(offset, int): - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - else: # for batched streaming decoding on GPU - assert torch.max(offset) + size < self.max_len - index = offset.unsqueeze(1) + \ - torch.arange(0, size).to(offset.device) # B X T - flag = index > 0 - # remove negative offset - index = index * flag - pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model - - if apply_dropout: - pos_emb = self.dropout(pos_emb) - return pos_emb - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding module. - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - """ - def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): - """Initialize class.""" - super().__init__(d_model, dropout_rate, max_len, reverse=True) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Compute positional encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). 
- Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Positional embedding tensor (1, time, `*`). - """ - self.pe = self.pe.to(x.device) - x = x * self.xscale - pos_emb = self.position_encoding(offset, x.size(1), False) - return self.dropout(x), self.dropout(pos_emb) - - -class NoPositionalEncoding(torch.nn.Module): - """ No position encoding - """ - def __init__(self, d_model: int, dropout_rate: float): - super().__init__() - self.d_model = d_model - self.dropout = torch.nn.Dropout(p=dropout_rate) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """ Just return zero vector for interface compatibility - """ - pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) - return self.dropout(x), pos_emb - - def position_encoding( - self, offset: Union[int, torch.Tensor], size: int) -> torch.Tensor: - return torch.zeros(1, size, self.d_model) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/encoder.py deleted file mode 100644 index bb2ec65827548bd1242cb3b367cb3983c2de6119..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/encoder.py +++ /dev/null @@ -1,462 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder definition.""" -from typing import Tuple - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.convolution import ConvolutionModule -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.encoder_layer import TransformerEncoderLayer -from wenet.transformer.encoder_layer import ConformerEncoderLayer -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class BaseEncoder(torch.nn.Module): - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ - Args: - input_size (int): input dim - output_size (int): dimension of attention - attention_heads (int): the number of heads of multi head attention - linear_units (int): the hidden units number of position-wise feed - forward - num_blocks (int): the number of decoder blocks - dropout_rate (float): dropout rate - attention_dropout_rate (float): dropout rate in attention - positional_dropout_rate (float): dropout rate after adding - positional encoding - input_layer (str): input layer type. - optional [linear, conv2d, conv2d6, conv2d8] - pos_enc_layer_type (str): Encoder positional encoding layer type. - opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] - normalize_before (bool): - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after (bool): whether to concat attention layer's input - and output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - static_chunk_size (int): chunk size for static chunk training and - decoding - use_dynamic_chunk (bool): whether use dynamic chunk size for - training or not, You can only use fixed chunk(chunk_size > 0) - or dyanmic chunk size(use_dynamic_chunk = True) - global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module - use_dynamic_left_chunk (bool): whether use dynamic left chunk in - dynamic chunk training - """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks - - -class TransformerEncoder(BaseEncoder): - """Transformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ Construct TransformerEncoder - - See Encoder for the meaning of each parameter. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - self.encoders = torch.nn.ModuleList([ - TransformerEncoderLayer( - output_size, - MultiHeadedAttention(attention_heads, output_size, - attention_dropout_rate), - PositionwiseFeedForward(output_size, linear_units, - dropout_rate), dropout_rate, - normalize_before, concat_after) for _ in range(num_blocks) - ]) - - -class ConformerEncoder(BaseEncoder): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - positionwise_conv_kernel_size: int = 1, - macaron_style: bool = True, - selfattention_layer_type: str = "rel_selfattn", - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - ): - """Construct ConformerEncoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - positionwise_conv_kernel_size (int): Kernel size of positionwise - conv1d layer. - macaron_style (bool): Whether to use macaron style for - positionwise layer. - selfattention_layer_type (str): Encoder attention layer type, - the parameter has no effect now, it's just for configure - compatibility. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, - cnn_module_norm, causal) - - self.encoders = torch.nn.ModuleList([ - ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(num_blocks) - ]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/encoder_layer.py deleted file mode 100644 index 6b4629a6802a90422fa1494f82f46488f2553c16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/encoder_layer.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple - -import torch -from torch import nn - - -class TransformerEncoderLayer(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: torch.nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): just for interface compatibility - to ConformerEncoderLayer - mask_pad (torch.Tensor): does not used in transformer layer, - just for unified api with conformer. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2), not used here, it's for interface - compatibility to ConformerEncoderLayer. - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). - - """ - residual = x - if self.normalize_before: - x = self.norm1(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, cache=att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - return x, mask, new_att_cache, fake_cnn_cache - - -class ConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/label_smoothing_loss.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/label_smoothing_loss.py deleted file mode 100644 index 428fedcb0eb4345cd1361c97008a9afcd94ac171..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/label_smoothing_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Label smoothing module.""" - -import torch -from torch import nn - - -class LabelSmoothingLoss(nn.Module): - """Label-smoothing loss. - - In a standard CE loss, the label's data distribution is: - [0,1,2] -> - [ - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0], - ] - - In the smoothing version CE Loss,some probabilities - are taken from the true label prob (1.0) and are divided - among other labels. - - e.g. 
- smoothing=0.1 - [0,1,2] -> - [ - [0.9, 0.05, 0.05], - [0.05, 0.9, 0.05], - [0.05, 0.05, 0.9], - ] - - Args: - size (int): the number of class - padding_idx (int): padding class id which will be ignored for loss - smoothing (float): smoothing rate (0.0 means the conventional CE) - normalize_length (bool): - normalize loss by sequence length if True - normalize loss by batch size if False - """ - def __init__(self, - size: int, - padding_idx: int, - smoothing: float, - normalize_length: bool = False): - """Construct an LabelSmoothingLoss object.""" - super(LabelSmoothingLoss, self).__init__() - self.criterion = nn.KLDivLoss(reduction="none") - self.padding_idx = padding_idx - self.confidence = 1.0 - smoothing - self.smoothing = smoothing - self.size = size - self.normalize_length = normalize_length - - def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """Compute loss between x and target. - - The model outputs and data labels tensors are flatten to - (batch*seqlen, class) shape and a mask is applied to the - padding part which should not be calculated for loss. - - Args: - x (torch.Tensor): prediction (batch, seqlen, class) - target (torch.Tensor): - target signal masked with self.padding_id (batch, seqlen) - Returns: - loss (torch.Tensor) : The KL loss, scalar float value - """ - assert x.size(2) == self.size - batch_size = x.size(0) - x = x.view(-1, self.size) - target = target.view(-1) - # use zeros_like instead of torch.no_grad() for true_dist, - # since no_grad() can not be exported by JIT - true_dist = torch.zeros_like(x) - true_dist.fill_(self.smoothing / (self.size - 1)) - ignore = target == self.padding_idx # (B,) - total = len(target) - ignore.sum().item() - target = target.masked_fill(ignore, 0) # avoid -1 index - true_dist.scatter_(1, target.unsqueeze(1), self.confidence) - kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) - denom = total if self.normalize_length else batch_size - return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/positionwise_feed_forward.py deleted file mode 100644 index 73ba239e3f1e68f65650961f2c4ee6758729a06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/positionwise_feed_forward.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. 
- dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU()): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/subsampling.py deleted file mode 100644 index 5f2823eedf0e623188d6af6680fa50ca44b47877..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/subsampling.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch - - -class BaseSubsampling(torch.nn.Module): - def __init__(self): - super().__init__() - self.right_context = 0 - self.subsampling_rate = 1 - - def position_encoding(self, offset: Union[int, torch.Tensor], - size: int) -> torch.Tensor: - return self.pos_enc.position_encoding(offset, size) - - -class LinearNoSubsampling(BaseSubsampling): - """Linear transform the input without subsampling - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an linear object.""" - super().__init__() - self.out = torch.nn.Sequential( - torch.nn.Linear(idim, odim), - torch.nn.LayerNorm(odim, eps=1e-5), - torch.nn.Dropout(dropout_rate), - ) - self.pos_enc = pos_enc_class - self.right_context = 0 - self.subsampling_rate = 1 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Input x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: linear input tensor (#batch, time', odim), - where time' = time . - torch.Tensor: linear input mask (#batch, 1, time'), - where time' = time . - - """ - x = self.out(x) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask - - -class Conv2dSubsampling4(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). 
- - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 4. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 4. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] - - -class Conv2dSubsampling6(BaseSubsampling): - """Convolutional 2D subsampling (to 1/6 length). - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (torch.nn.Module): Custom position encoding layer. - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling6 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 5, 3), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), - odim) - self.pos_enc = pos_enc_class - # 10 = (3 - 1) * 1 + (5 - 1) * 2 - self.subsampling_rate = 6 - self.right_context = 10 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 6. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 6. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] - - -class Conv2dSubsampling8(BaseSubsampling): - """Convolutional 2D subsampling (to 1/8 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
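# Editor's note: illustrative sketch, not part of the original patch or of the
# deleted file. It checks the length arithmetic of Conv2dSubsampling4 above:
# two Conv2d(kernel=3, stride=2) layers reduce both the time axis and the
# feature axis to ((n - 1) // 2 - 1) // 2, which is why the Linear layer takes
# odim * (((idim - 1) // 2 - 1) // 2) inputs and the mask is sliced twice with
# [:, :, 2::2]. Conv2dSubsampling6/8 follow the same pattern with their strides.
import torch

idim, time = 80, 100
x = torch.randn(1, 1, time, idim)                    # (b, c=1, t, f)
conv = torch.nn.Sequential(
    torch.nn.Conv2d(1, 8, 3, 2), torch.nn.ReLU(),
    torch.nn.Conv2d(8, 8, 3, 2), torch.nn.ReLU(),
)
b, c, t, f = conv(x).size()
assert t == ((time - 1) // 2 - 1) // 2 == 24         # roughly time // 4
assert f == ((idim - 1) // 2 - 1) // 2 == 19         # matches the Linear in_features
mask = torch.ones(1, 1, time, dtype=torch.bool)
assert mask[:, :, 2::2][:, :, 2::2].size(2) == t     # mask subsampled the same way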
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling8 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear( - odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) - self.pos_enc = pos_enc_class - self.subsampling_rate = 8 - # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 - self.right_context = 14 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 8. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 8. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/swish.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/swish.py deleted file mode 100644 index b4250f5c93104f38958d145572e363256e03fcb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/transformer/swish.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) -# 2020 Northwestern Polytechnical University (Pengcheng Guo) -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Swish() activation function for Conformer.""" - -import torch - - -class Swish(torch.nn.Module): - """Construct an Swish object.""" - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Return Swish activation function.""" - return x * torch.sigmoid(x) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/checkpoint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/checkpoint.py deleted file mode 100644 index 8e0c413c79c34cd667240357d7ef9eab816a885c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/checkpoint.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import re - -import yaml -import torch -from collections import OrderedDict - -import datetime - - -def load_checkpoint(model: torch.nn.Module, path: str) -> dict: - if torch.cuda.is_available(): - logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) - checkpoint = torch.load(path) - else: - logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) - checkpoint = torch.load(path, map_location='cpu') - model.load_state_dict(checkpoint, strict=False) - info_path = re.sub('.pt$', '.yaml', path) - configs = {} - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - return configs - - -def save_checkpoint(model: torch.nn.Module, path: str, infos=None): - ''' - Args: - infos (dict or None): any info you want to save. - ''' - logging.info('Checkpoint: save to checkpoint %s' % path) - if isinstance(model, torch.nn.DataParallel): - state_dict = model.module.state_dict() - elif isinstance(model, torch.nn.parallel.DistributedDataParallel): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save(state_dict, path) - info_path = re.sub('.pt$', '.yaml', path) - if infos is None: - infos = {} - infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') - with open(info_path, 'w') as fout: - data = yaml.dump(infos) - fout.write(data) - - -def filter_modules(model_state_dict, modules): - new_mods = [] - incorrect_mods = [] - mods_model = model_state_dict.keys() - for mod in modules: - if any(key.startswith(mod) for key in mods_model): - new_mods += [mod] - else: - incorrect_mods += [mod] - if incorrect_mods: - logging.warning( - "module(s) %s don't match or (partially match) " - "available modules in model.", - incorrect_mods, - ) - logging.warning("for information, the existing modules in model are:") - logging.warning("%s", mods_model) - - return new_mods - - -def load_trained_modules(model: torch.nn.Module, args: None): - # Load encoder modules with pre-trained model(s). 
- enc_model_path = args.enc_init - enc_modules = args.enc_init_mods - main_state_dict = model.state_dict() - logging.warning("model(s) found for pre-initialization") - if os.path.isfile(enc_model_path): - logging.info('Checkpoint: loading from checkpoint %s for CPU' % - enc_model_path) - model_state_dict = torch.load(enc_model_path, map_location='cpu') - modules = filter_modules(model_state_dict, enc_modules) - partial_state_dict = OrderedDict() - for key, value in model_state_dict.items(): - if any(key.startswith(m) for m in modules): - partial_state_dict[key] = value - main_state_dict.update(partial_state_dict) - else: - logging.warning("model was not found : %s", enc_model_path) - - model.load_state_dict(main_state_dict) - configs = {} - return configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/cmvn.py deleted file mode 100644 index 3101c619f54991c947124f393f3459c317356a2f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/cmvn.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import math - -import numpy as np - - -def _load_json_cmvn(json_cmvn_file): - """ Load the json format cmvn stats file and calculate cmvn - - Args: - json_cmvn_file: cmvn stats file in json format - - Returns: - a numpy array of [means, vars] - """ - with open(json_cmvn_file) as f: - cmvn_stats = json.load(f) - - means = cmvn_stats['mean_stat'] - variance = cmvn_stats['var_stat'] - count = cmvn_stats['frame_num'] - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def _load_kaldi_cmvn(kaldi_cmvn_file): - """ Load the kaldi format cmvn stats file and calculate cmvn - - Args: - kaldi_cmvn_file: kaldi text style global cmvn file, which - is generated by: - compute-cmvn-stats --binary=false scp:feats.scp global_cmvn - - Returns: - a numpy array of [means, vars] - """ - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def load_cmvn(cmvn_file, is_json): - if is_json: - cmvn = _load_json_cmvn(cmvn_file) - else: - cmvn = _load_kaldi_cmvn(cmvn_file) - return cmvn[0], cmvn[1] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/common.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/common.py deleted file mode 100644 index 74238d59aefbf227fe6b811703af17550bc7f8f0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/common.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -"""Unility functions for Transformer.""" - -import math -from typing import List, Tuple - -import torch -from torch.nn.utils.rnn import pad_sequence - -IGNORE_ID = -1 - - -def pad_list(xs: List[torch.Tensor], pad_value: int): - """Perform padding for the list of tensors. - - Args: - xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 
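# Editor's note: illustrative sketch, not part of the original patch or of the
# deleted file. It applies the reduction performed by _load_json_cmvn above to
# made-up statistics: accumulated sums become a per-dimension mean and an
# inverse standard deviation, with the same 1e-20 variance floor.
import numpy as np

mean_stat = np.array([10.0, 20.0])     # per-dimension sum of features
var_stat = np.array([60.0, 250.0])     # per-dimension sum of squared features
frame_num = 10
means = mean_stat / frame_num                          # [1.0, 2.0]
variance = var_stat / frame_num - means ** 2           # [5.0, 21.0]
istd = 1.0 / np.sqrt(np.maximum(variance, 1.0e-20))    # inverse std, floored
# Downstream (GlobalCMVN, constructed in init_model further below) normalizes
# features roughly as (x - means) * istd.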
- pad_value (float): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tmax, `*`). - - Examples: - >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) - - """ - n_batch = len(xs) - max_len = max([x.size(0) for x in xs]) - pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) - pad = pad.fill_(pad_value) - for i in range(n_batch): - pad[i, :xs[i].size(0)] = xs[i] - - return pad - - -def add_blank(ys_pad: torch.Tensor, blank: int, - ignore_id: int) -> torch.Tensor: - """ Prepad blank for transducer predictor - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - blank (int): index of - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> blank = 0 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in = add_blank(ys_pad, 0, -1) - >>> ys_in - tensor([[0, 1, 2, 3, 4, 5], - [0, 4, 5, 6, 0, 0], - [0, 7, 8, 9, 0, 0]]) - """ - bs = ys_pad.size(0) - _blank = torch.tensor([blank], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _blank = _blank.repeat(bs).unsqueeze(1) # [bs,1] - out = torch.cat([_blank, ys_pad], dim=1) # [bs, Lmax+1] - return torch.where(out == ignore_id, blank, out) - - -def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, - ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Add and labels. - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - sos (int): index of - eos (int): index of - ignore_id (int): index of padding - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - ys_out (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> sos_id = 10 - >>> eos_id = 11 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) - >>> ys_in - tensor([[10, 1, 2, 3, 4, 5], - [10, 4, 5, 6, 11, 11], - [10, 7, 8, 9, 11, 11]]) - >>> ys_out - tensor([[ 1, 2, 3, 4, 5, 11], - [ 4, 5, 6, 11, -1, -1], - [ 7, 8, 9, 11, -1, -1]]) - """ - _sos = torch.tensor([sos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _eos = torch.tensor([eos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys - ys_in = [torch.cat([_sos, y], dim=0) for y in ys] - ys_out = [torch.cat([y, _eos], dim=0) for y in ys] - return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) - - -def reverse_pad_list(ys_pad: torch.Tensor, - ys_lens: torch.Tensor, - pad_value: float = -1.0) -> torch.Tensor: - """Reverse padding for the list of tensors. - - Args: - ys_pad (tensor): The padded tensor (B, Tokenmax). - ys_lens (tensor): The lens of token seqs (B) - pad_value (int): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tokenmax). - - Examples: - >>> x - tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) - >>> pad_list(x, 0) - tensor([[4, 3, 2, 1], - [7, 6, 5, 0], - [9, 8, 0, 0]]) - - """ - r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) - for y, i in zip(ys_pad, ys_lens)], True, - pad_value) - return r_ys_pad - - -def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, - ignore_label: int) -> float: - """Calculate accuracy. - - Args: - pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 
- pad_targets (LongTensor): Target label tensors (B, Lmax). - ignore_label (int): Ignore label id. - - Returns: - float: Accuracy value (0.0 - 1.0). - - """ - pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), - pad_outputs.size(1)).argmax(2) - mask = pad_targets != ignore_label - numerator = torch.sum( - pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - denominator = torch.sum(mask) - return float(numerator) / float(denominator) - - -def get_rnn(rnn_type: str) -> torch.nn.Module: - assert rnn_type in ["rnn", "lstm", "gru"] - if rnn_type == "rnn": - return torch.nn.RNN - elif rnn_type == "lstm": - return torch.nn.LSTM - else: - return torch.nn.GRU - - -def get_activation(act): - """Return activation function.""" - # Lazy load to avoid unused import - from wenet.transformer.swish import Swish - - activation_funcs = { - "hardtanh": torch.nn.Hardtanh, - "tanh": torch.nn.Tanh, - "relu": torch.nn.ReLU, - "selu": torch.nn.SELU, - "swish": getattr(torch.nn, "SiLU", Swish), - "gelu": torch.nn.GELU - } - - return activation_funcs[act]() - - -def get_subsample(config): - input_layer = config["encoder_conf"]["input_layer"] - assert input_layer in ["conv2d", "conv2d6", "conv2d8"] - if input_layer == "conv2d": - return 4 - elif input_layer == "conv2d6": - return 6 - elif input_layer == "conv2d8": - return 8 - - -def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - if hyp[cur] != 0: - new_hyp.append(hyp[cur]) - prev = cur - while cur < len(hyp) and hyp[cur] == hyp[prev]: - cur += 1 - return new_hyp - - -def replace_duplicates_with_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - new_hyp.append(hyp[cur]) - prev = cur - cur += 1 - while cur < len(hyp) and hyp[cur] == hyp[prev] and hyp[cur] != 0: - new_hyp.append(0) - cur += 1 - return new_hyp - - -def log_add(args: List[int]) -> float: - """ - Stable log add - """ - if all(a == -float('inf') for a in args): - return -float('inf') - a_max = max(args) - lsp = math.log(sum(math.exp(a - a_max) for a in args)) - return a_max + lsp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/config.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/config.py deleted file mode 100644 index 50170ced44534d3ee6532a2f87fcd78c5148f7e7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 Shaoshang Qi -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
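# Editor's note: illustrative sketch, not part of the original patch or of the
# deleted file. It walks remove_duplicates_and_blank (defined in common.py
# above) on a toy CTC path: repeated tokens are merged, then blanks (id 0)
# are dropped.
hyp = [0, 3, 3, 0, 4, 4, 0]
collapsed, cur = [], 0
while cur < len(hyp):
    if hyp[cur] != 0:          # keep the first token of every run, unless blank
        collapsed.append(hyp[cur])
    prev = cur
    while cur < len(hyp) and hyp[cur] == hyp[prev]:
        cur += 1               # skip the rest of the run
# collapsed -> [3, 4]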
- - -import copy - -def override_config(configs, override_list): - new_configs = copy.deepcopy(configs) - for item in override_list: - arr = item.split() - if len(arr) != 2: - print(f"the overrive {item} format not correct, skip it") - continue - keys = arr[0].split('.') - s_configs = new_configs - for i, key in enumerate(keys): - if key not in s_configs: - print(f"the overrive {item} format not correct, skip it") - if i == len(keys) - 1: - param_type = type(s_configs[key]) - if param_type != bool: - s_configs[key] = param_type(arr[1]) - else: - s_configs[key] = arr[1] in ['true', 'True'] - print(f"override {arr[0]} with {arr[1]}") - else: - s_configs = s_configs[key] - return new_configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/ctc_util.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/ctc_util.py deleted file mode 100644 index 73b8fb272ac153dd6d05207f352ebcf1ad14890d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/ctc_util.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch - -def insert_blank(label, blank_id=0): - """Insert blank token between every two label token.""" - label = np.expand_dims(label, 1) - blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id - label = np.concatenate([blanks, label], axis=1) - label = label.reshape(-1) - label = np.append(label, label[0]) - return label - -def forced_align(ctc_probs: torch.Tensor, - y: torch.Tensor, - blank_id=0) -> list: - """ctc forced alignment. 
- - Args: - torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) - torch.Tensor y: id sequence tensor 1d tensor (L) - int blank_id: blank symbol index - Returns: - torch.Tensor: alignment result - """ - y_insert_blank = insert_blank(y, blank_id) - - log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) - log_alpha = log_alpha - float('inf') # log of zero - state_path = (torch.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 - ) # state path - - # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] - - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): - if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ - s] == y_insert_blank[s - 2]: - candidates = torch.tensor( - [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) - prev_state = [s, s - 1] - else: - candidates = torch.tensor([ - log_alpha[t - 1, s], - log_alpha[t - 1, s - 1], - log_alpha[t - 1, s - 2], - ]) - prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] - state_path[t, s] = prev_state[torch.argmax(candidates)] - - state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) - - candidates = torch.tensor([ - log_alpha[-1, len(y_insert_blank) - 1], - log_alpha[-1, len(y_insert_blank) - 2] - ]) - prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] - state_seq[-1] = prev_state[torch.argmax(candidates)] - for t in range(ctc_probs.size(0) - 2, -1, -1): - state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] - - output_alignment = [] - for t in range(0, ctc_probs.size(0)): - output_alignment.append(y_insert_blank[state_seq[t, 0]]) - - return output_alignment diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/executor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/executor.py deleted file mode 100644 index dc0b69e6e32055566a0e8c41945f6979276e5672..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/executor.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
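# Editor's note: illustrative sketch, not part of the original patch or of the
# deleted file. It shows what insert_blank (in ctc_util.py above) produces:
# the label sequence interleaved with the blank id, which is the state
# sequence that forced_align's dynamic program walks over.
import numpy as np

label = np.array([3, 5])
blanks = np.zeros((len(label), 1), dtype=np.int64)        # blank_id = 0
interleaved = np.concatenate([blanks, label[:, None]], axis=1).reshape(-1)
interleaved = np.append(interleaved, interleaved[0])      # trailing blank
# interleaved -> array([0, 3, 0, 5, 0])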
- -import logging -from contextlib import nullcontext - -# if your python version < 3.7 use the below one -# from contextlib import suppress as nullcontext -import torch -from torch.nn.utils import clip_grad_norm_ - - -class Executor: - - def __init__(self): - self.step = 0 - - def train(self, model, optimizer, scheduler, data_loader, device, writer, - args, scaler): - ''' Train one epoch - ''' - model.train() - clip = args.get('grad_clip', 50.0) - log_interval = args.get('log_interval', 10) - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - accum_grad = args.get('accum_grad', 1) - is_distributed = args.get('is_distributed', True) - use_amp = args.get('use_amp', False) - logging.info('using accumulate grad, new batch size is {} times' - ' larger than before'.format(accum_grad)) - if use_amp: - assert scaler is not None - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - model_context = model.join - else: - model_context = nullcontext - num_seen_utts = 0 - with model_context(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - context = None - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if is_distributed and batch_idx % accum_grad != 0: - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - with context(): - # autocast context - # The more details about amp can be found in - # https://pytorch.org/docs/stable/notes/amp_examples.html - with torch.cuda.amp.autocast(scaler is not None): - loss_dict = model(feats, feats_lengths, target, - target_lengths) - loss = loss_dict['loss'] / accum_grad - if use_amp: - scaler.scale(loss).backward() - else: - loss.backward() - - num_seen_utts += num_utts - if batch_idx % accum_grad == 0: - if rank == 0 and writer is not None: - writer.add_scalar('train_loss', loss, self.step) - # Use mixed precision training - if use_amp: - scaler.unscale_(optimizer) - grad_norm = clip_grad_norm_(model.parameters(), clip) - # Must invoke scaler.update() if unscale_() is used in - # the iteration to avoid the following error: - # RuntimeError: unscale_() has already been called - # on this optimizer since the last update(). - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). 
- scaler.step(optimizer) - scaler.update() - else: - grad_norm = clip_grad_norm_(model.parameters(), clip) - if torch.isfinite(grad_norm): - optimizer.step() - optimizer.zero_grad() - scheduler.step() - self.step += 1 - if batch_idx % log_interval == 0: - lr = optimizer.param_groups[0]['lr'] - log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, - loss.item() * accum_grad) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'lr {:.8f} rank {}'.format(lr, rank) - logging.debug(log_str) - - def cv(self, model, data_loader, device, args): - ''' Cross validation on - ''' - model.eval() - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - log_interval = args.get('log_interval', 10) - # in order to avoid division by 0 - num_seen_utts = 1 - total_loss = 0.0 - with torch.no_grad(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - loss_dict = model(feats, feats_lengths, target, target_lengths) - loss = loss_dict['loss'] - if torch.isfinite(loss): - num_seen_utts += num_utts - total_loss += loss.item() * num_utts - if batch_idx % log_interval == 0: - log_str = 'CV Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, loss.item()) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'history loss {:.6f}'.format(total_loss / - num_seen_utts) - log_str += ' rank {}'.format(rank) - logging.debug(log_str) - return total_loss, num_seen_utts diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/file_utils.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/file_utils.py deleted file mode 100644 index 7b7e516cc61f759267f4ef09309ff0b45110a0c1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/file_utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re - - -def read_lists(list_file): - lists = [] - with open(list_file, 'r', encoding='utf8') as fin: - for line in fin: - lists.append(line.strip()) - return lists - - -def read_non_lang_symbols(non_lang_sym_path): - """read non-linguistic symbol from file. - - The file format is like below: - - {NOISE}\n - {BRK}\n - ... - - - Args: - non_lang_sym_path: non-linguistic symbol file path, None means no any - syms. 
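# Editor's note: illustrative sketch, not part of the original patch or of the
# deleted file. It strips Executor.train above down to its gradient
# accumulation core (no DDP, no AMP): the loss is divided by accum_grad and
# the optimizer only steps on every accum_grad-th batch.
import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
accum_grad, clip = 4, 50.0
for batch_idx in range(8):
    x, y = torch.randn(3, 4), torch.randn(3, 2)
    loss = torch.nn.functional.mse_loss(model(x), y) / accum_grad
    loss.backward()                              # gradients accumulate in .grad
    if batch_idx % accum_grad == 0:              # same stepping condition as above
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        if torch.isfinite(grad_norm):
            optimizer.step()
        optimizer.zero_grad()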
- - """ - if non_lang_sym_path is None: - return None - else: - syms = read_lists(non_lang_sym_path) - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - for sym in syms: - if non_lang_syms_pattern.fullmatch(sym) is None: - class BadSymbolFormat(Exception): - pass - raise BadSymbolFormat( - "Non-linguistic symbols should be " - "formatted in {xxx}//[xxx], consider" - " modify '%s' to meet the requirment. " - "More details can be found in discussions here : " - "https://github.com/wenet-e2e/wenet/pull/819" % (sym)) - return syms - - -def read_symbol_table(symbol_table_file): - symbol_table = {} - with open(symbol_table_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - symbol_table[arr[0]] = int(arr[1]) - return symbol_table diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/init_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/init_model.py deleted file mode 100644 index 4a008183ee25cd88b2fa25d93bdc3f9e3a55d31a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/init_model.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -from wenet.transducer.joint import TransducerJoint -from wenet.transducer.predictor import (ConvPredictor, EmbeddingPredictor, - RNNPredictor) -from wenet.transducer.transducer import Transducer -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.cmvn import GlobalCMVN -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.encoder import ConformerEncoder, TransformerEncoder -from wenet.squeezeformer.encoder import SqueezeformerEncoder -from wenet.efficient_conformer.encoder import EfficientConformerEncoder -from wenet.utils.cmvn import load_cmvn - - -def init_model(configs): - if configs['cmvn_file'] is not None: - mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) - global_cmvn = GlobalCMVN( - torch.from_numpy(mean).float(), - torch.from_numpy(istd).float()) - else: - global_cmvn = None - - input_dim = configs['input_dim'] - vocab_size = configs['output_dim'] - - encoder_type = configs.get('encoder', 'conformer') - decoder_type = configs.get('decoder', 'bitransformer') - - if encoder_type == 'conformer': - encoder = ConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'squeezeformer': - encoder = SqueezeformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'efficientConformer': - encoder = EfficientConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf'], - **configs['encoder_conf']['efficient_conf'] - if 'efficient_conf' in - configs['encoder_conf'] else {}) - else: - encoder = TransformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - if decoder_type == 'transformer': - decoder = TransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - else: - assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 - assert configs['decoder_conf']['r_num_blocks'] > 0 - decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - ctc = CTC(vocab_size, encoder.output_size()) - - # Init joint CTC/Attention or Transducer model - if 'predictor' in configs: - predictor_type = configs.get('predictor', 'rnn') - if predictor_type == 'rnn': - predictor = RNNPredictor(vocab_size, **configs['predictor_conf']) - elif predictor_type == 'embedding': - predictor = EmbeddingPredictor(vocab_size, - **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - elif predictor_type == 'conv': - predictor = ConvPredictor(vocab_size, **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - else: - raise NotImplementedError( - "only rnn, embedding and conv type support now") - configs['joint_conf']['enc_output_size'] = configs['encoder_conf'][ - 'output_size'] - configs['joint_conf']['pred_output_size'] = configs['predictor_conf'][ - 'output_size'] - joint = TransducerJoint(vocab_size, **configs['joint_conf']) - model = Transducer(vocab_size=vocab_size, - blank=0, - predictor=predictor, - encoder=encoder, - attention_decoder=decoder, - joint=joint, - ctc=ctc, - **configs['model_conf']) - else: - model = ASRModel(vocab_size=vocab_size, - encoder=encoder, - decoder=decoder, - ctc=ctc, - **configs['model_conf']) - return model diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/mask.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/mask.py deleted file mode 100644 index 2985006ab2bc2d27a9b8adaeb863cc44ca6a0d24..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/mask.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -''' -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. - - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - ret = torch.ones(size, size, device=device, dtype=torch.bool) - return torch.tril(ret) -''' - -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. 
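# Editor's note: illustrative sketch, not part of the original patch or of the
# deleted file. It checks that the arange-based comparison used in the
# subsequent_mask defined just below is equivalent to the torch.tril version
# kept commented out at the top of this file.
import torch

size = 3
arange = torch.arange(size)
mask = arange.expand(size, size) <= arange.unsqueeze(-1)
assert torch.equal(mask, torch.tril(torch.ones(size, size, dtype=torch.bool)))
# mask -> [[1, 0, 0],
#          [1, 1, 0],
#          [1, 1, 1]]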
- - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - arange = torch.arange(size, device=device) - mask = arange.expand(size, size) - arange = arange.unsqueeze(-1) - mask = mask <= arange - return mask - - -def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size) with chunk size, - this is for streaming encoder - - Args: - size (int): size of mask - chunk_size (int): size of chunk - num_left_chunks (int): number of left chunks - <0: use full chunk - >=0: use num_left_chunks - device (torch.device): "cpu" or "cuda" or torch.Tensor.device - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_chunk_mask(4, 2) - [[1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]] - """ - ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) - ret[i, start:ending] = True - return ret - - -def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, - use_dynamic_chunk: bool, - use_dynamic_left_chunk: bool, - decoding_chunk_size: int, static_chunk_size: int, - num_decoding_left_chunks: int): - """ Apply optional mask for encoder. - - Args: - xs (torch.Tensor): padded input, (B, L, D), L for max length - mask (torch.Tensor): mask for xs, (B, 1, L) - use_dynamic_chunk (bool): whether to use dynamic chunk or not - use_dynamic_left_chunk (bool): whether to use dynamic left chunk for - training. - decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - static_chunk_size (int): chunk size for static chunk training/decoding - if it's greater than 0, if use_dynamic_chunk is true, - this parameter will be ignored - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. - >=0: use num_decoding_left_chunks - <0: use all left chunks - - Returns: - torch.Tensor: chunk mask of the input xs. - """ - # Whether to use chunk mask or not - if use_dynamic_chunk: - max_len = xs.size(1) - if decoding_chunk_size < 0: - chunk_size = max_len - num_left_chunks = -1 - elif decoding_chunk_size > 0: - chunk_size = decoding_chunk_size - num_left_chunks = num_decoding_left_chunks - else: - # chunk size is either [1, 25] or full context(max_len). - # Since we use 4 times subsampling and allow up to 1s(100 frames) - # delay, the maximum frame is 100 / 4 = 25. 
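# Editor's note (illustrative comment, not part of the original patch): the
# sampling just below first draws chunk_size uniformly from [1, max_len), then
# remaps it so that roughly half of the draws mean "use the full context" and
# the remaining draws land in [1, 25] frames, e.g. with max_len = 200:
#   draw 180  ->  180 > 100, so chunk_size = 200 (full context)
#   draw  60  ->  60 % 25 + 1 = 11 frames per chunk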
- chunk_size = torch.randint(1, max_len, (1, )).item() - num_left_chunks = -1 - if chunk_size > max_len // 2: - chunk_size = max_len - else: - chunk_size = chunk_size % 25 + 1 - if use_dynamic_left_chunk: - max_left_chunks = (max_len - 1) // chunk_size - num_left_chunks = torch.randint(0, max_left_chunks, - (1, )).item() - chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - elif static_chunk_size > 0: - num_left_chunks = num_decoding_left_chunks - chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - else: - chunk_masks = masks - return chunk_masks - - -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: Mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - batch_size = lengths.size(0) - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = torch.arange(0, - max_len, - dtype=torch.int64, - device=lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand - return mask - - -def make_non_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """Make mask tensor containing indices of non-padded part. - - The sequences in a batch may have different lengths. To enable - batch computing, padding is need to make all sequence in same - size. To avoid the padding part pass value to context dependent - block such as attention or convolution , this padding part is - masked. - - This pad_mask is used in both encoder and decoder. - - 1 for non-padded part and 0 for padded part. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] - """ - return ~make_pad_mask(lengths) - - -def mask_finished_scores(score: torch.Tensor, - flag: torch.Tensor) -> torch.Tensor: - """ - If a sequence is finished, we only allow one alive branch. This function - aims to give one branch a zero score and the rest -inf score. - - Args: - score (torch.Tensor): A real value array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size, beam_size). 
- """ - beam_size = score.size(-1) - zero_mask = torch.zeros_like(flag, dtype=torch.bool) - if beam_size > 1: - unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])), - dim=1) - finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])), - dim=1) - else: - unfinished = zero_mask - finished = flag - score.masked_fill_(unfinished, -float('inf')) - score.masked_fill_(finished, 0) - return score - - -def mask_finished_preds(pred: torch.Tensor, flag: torch.Tensor, - eos: int) -> torch.Tensor: - """ - If a sequence is finished, all of its branch should be - - Args: - pred (torch.Tensor): A int array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size). - """ - beam_size = pred.size(-1) - finished = flag.repeat([1, beam_size]) - return pred.masked_fill_(finished, eos) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/scheduler.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/scheduler.py deleted file mode 100644 index c418a731dec0041a238787bbba23102dba8db5e5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/vkw2021/s0/wenet/utils/scheduler.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -# NeMo(https://github.com/NVIDIA/NeMo) - -from typing import Union - -import math -import warnings -import torch -from torch.optim.lr_scheduler import _LRScheduler - -from typeguard import check_argument_types - - -class WarmupLR(_LRScheduler): - """The WarmupLR scheduler - - This scheduler is almost same as NoamLR Scheduler except for following - difference: - - NoamLR: - lr = optimizer.lr * model_size ** -0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - WarmupLR: - lr = optimizer.lr * warmup_step ** 0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - - Note that the maximum lr equals to optimizer.lr in this scheduler. 
- - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_steps: Union[int, float] = 25000, - last_epoch: int = -1, - ): - assert check_argument_types() - self.warmup_steps = warmup_steps - - # __init__() must be invoked before setting field - # because step() is also invoked in __init__() - super().__init__(optimizer, last_epoch) - - def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" - - def get_lr(self): - step_num = self.last_epoch + 1 - if self.warmup_steps == 0: - return [ - lr * step_num ** -0.5 - for lr in self.base_lrs - ] - else: - return [ - lr - * self.warmup_steps ** 0.5 - * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) - for lr in self.base_lrs - ] - - def set_step(self, step: int): - self.last_epoch = step - - -class WarmupPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1): - assert not (warmup_steps is not None and warmup_ratio is not None),\ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class SquareRootConstantPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use particular number of step or ratio" - assert constant_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. 
- self.max_steps = max_steps - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.constant_lr = 1 / (constant_steps ** 0.5) - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.constant_steps: - return [self.constant_lr for _ in self.base_lrs] - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class WarmupHoldPolicy(WarmupPolicy): - """Variant of WarmupPolicy which maintains high - learning rate for a defined number of steps. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - hold_steps=None, - hold_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (hold_steps is not None and hold_ratio is not None), \ - "Either use particular number of step or ratio" - assert hold_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - self.min_lr = min_lr - self._last_warmup_lr = 0.0 - - # Necessary to duplicate as class attributes are hidden in inner class - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if hold_steps is not None: - self.hold_steps = hold_steps + self.warmup_steps - elif hold_ratio is not None: - self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps - else: - self.hold_steps = 0 - - super().__init__( - optimizer, - warmup_steps=warmup_steps, - warmup_ratio=warmup_ratio, - max_steps=max_steps, - last_epoch=last_epoch, - min_lr=min_lr, - ) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed by the scheduler," - " " "please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup phase - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - # Hold phase - if (step >= self.warmup_steps) and (step < self.hold_steps): - return self.base_lrs - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - -class WarmupAnnealHoldPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - min_lr: Minimum lr to hold the learning rate after decay at. - constant_steps: Number of steps to keep lr constant at. 
- constant_ratio: Ratio of steps to keep lr constant. - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - constant_steps=None, - constant_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use constant_steps or constant_ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup steps - if self.warmup_steps > 0 and step <= self.warmup_steps: - return self._get_warmup_lr(step) - - # Constant steps after warmup and decay - if self.constant_steps > 0 and ( - self.warmup_steps + self.decay_steps) < step <= self.max_steps: - return self._get_constant_lr(step) - - # Min lr after max steps of updates - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_constant_lr(self, step): - return [self.min_lr for _ in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -def _squareroot_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 0.5 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _square_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 2 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _cosine_annealing(initial_lr, step, max_steps, min_lr): - mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) - out_lr = (initial_lr - min_lr) * mult + min_lr - return out_lr - - -def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, - decay_steps, min_lr): - assert max_lr > min_lr - # Use linear warmup for the initial part. - if warmup_steps > 0 and step <= warmup_steps: - return max_lr * float(step) / float(warmup_steps) - - # For any steps larger than `decay_steps`, use `min_lr`. - if step > warmup_steps + decay_steps: - return min_lr - - # If we are done with the warmup period, use the decay style. 
- num_steps_ = step - warmup_steps - decay_steps_ = decay_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = max_lr - min_lr - - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - - return min_lr + coeff * delta_lr - - -def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): - if cycle: - multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) - decay_steps *= multiplier - else: - step = min(step, decay_steps) - p = step / decay_steps - lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) - lr += min_lr - return lr - - -def _noam_hold_annealing(initial_lr, step, warmup_steps, - hold_steps, decay_rate, min_lr): - # hold_steps = total number of steps - # to hold the LR, not the warmup + hold steps. - T_warmup_decay = max(1, warmup_steps ** decay_rate) - T_hold_decay = max(1, (step - hold_steps) ** decay_rate) - lr = (initial_lr * T_warmup_decay) / T_hold_decay - lr = max(lr, min_lr) - return lr - - -class SquareAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _square_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class SquareRootAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _squareroot_annealing(initial_lr=initial_lr, step=step, - max_steps=self.max_steps, min_lr=self.min_lr) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - if self.constant_steps is None or self.constant_steps == 0: - new_lrs = [ - _cosine_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - else: - new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) - return new_lrs - - def _get_warmup_lr(self, step): - if self.constant_steps is None or self.constant_steps == 0: - return super()._get_warmup_lr(step) - else: - # Use linear warmup for the initial part. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_constant_lr(self, step): - # Only called when `constant_steps` > 0. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_linear_warmup_with_cosine_annealing_lr(self, step): - # Cosine Schedule for Megatron LM, - # slightly different warmup schedule + constant LR at the end. 
- new_lrs = [ - _linear_warmup_with_cosine_annealing( - max_lr=self.base_lrs[0], - warmup_steps=self.warmup_steps, - step=step, - decay_steps=self.decay_steps, - min_lr=self.min_lr, - ) - for _ in self.base_lrs - ] - return new_lrs - - -class NoamAnnealing(_LRScheduler): - def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - self._normalize = d_model ** (-0.5) - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = max(1, self.last_epoch) - - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - new_lrs = [self._noam_annealing(initial_lr=initial_lr, step=step) for - initial_lr in self.base_lrs] - return new_lrs - - def _noam_annealing(self, initial_lr, step): - if self.warmup_steps > 0: - mult = self._normalize * min(step ** (-0.5), - step * (self.warmup_steps ** (-1.5))) - else: - mult = self._normalize * step ** (-0.5) - - out_lr = initial_lr * mult - if step > self.warmup_steps: - out_lr = max(out_lr, self.min_lr) - return out_lr - - -class NoamHoldAnnealing(WarmupHoldPolicy): - def __init__(self, optimizer, *, max_steps, decay_rate=0.5, min_lr=0.0, - last_epoch=-1, **kwargs): - """ - From Nemo: - Implementation of the Noam Hold Annealing policy - from the SqueezeFormer paper. - - Unlike NoamAnnealing, the peak learning rate - can be explicitly set for this scheduler. - The schedule first performs linear warmup, - then holds the peak LR, then decays with some schedule for - the remainder of the steps. - Therefore the min-lr is still dependent - on the hyper parameters selected. - - It's schedule is determined by three factors- - - Warmup Steps: Initial stage, where linear warmup - occurs uptil the peak LR is reached. Unlike NoamAnnealing, - the peak LR is explicitly stated here instead of a scaling factor. - - Hold Steps: Intermediate stage, where the peak LR - is maintained for some number of steps. In this region, - the high peak LR allows the model to converge faster - if training is stable. However the high LR - may also cause instability during training. - Should usually be a significant fraction of training - steps (around 30-40% of the entire training steps). - - Decay Steps: Final stage, where the LR rapidly decays - with some scaling rate (set by decay rate). - To attain Noam decay, use 0.5, - for Squeezeformer recommended decay, use 1.0. - The fast decay after prolonged high LR during - hold phase allows for rapid convergence. 
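Putting the three stages described above together, the schedule can be summarized in one small standalone function. This is only an illustrative sketch, mirroring the `_get_warmup_lr` and `_noam_hold_annealing` logic defined earlier in this file; the peak LR, warmup/hold lengths, and minimum LR below are made-up example values:

```python
# Illustrative sketch of the NoamHoldAnnealing phases; parameter values are made up.
def noam_hold_schedule(step, peak_lr=1e-3, warmup_steps=1000,
                       hold_steps=4000, decay_rate=0.5, min_lr=1e-5):
    # `step` is the global training step; `hold_steps` counts only the hold phase.
    if step <= warmup_steps:                   # 1) linear warmup to the peak LR
        return peak_lr * (step + 1) / (warmup_steps + 1)
    if step < warmup_steps + hold_steps:       # 2) hold the peak LR
        return peak_lr
    # 3) polynomial decay; decay_rate=0.5 gives Noam-style step**-0.5 decay and
    #    joins the hold phase continuously at step == warmup_steps + hold_steps.
    t_warmup_decay = max(1, warmup_steps ** decay_rate)
    t_hold_decay = max(1, (step - hold_steps) ** decay_rate)
    return max(peak_lr * t_warmup_decay / t_hold_decay, min_lr)
```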
- - References: - - [Squeezeformer: - An Efficient Transformer for Automatic Speech Recognition] - (https://arxiv.org/abs/2206.00888) - - Args: - optimizer: Pytorch compatible Optimizer object. - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - decay_rate: Float value describing the polynomial decay - after the hold period. Default value - of 0.5 corresponds to Noam decay. - min_lr: Minimum learning rate. - """ - self.decay_rate = decay_rate - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - if self.warmup_steps is None or self.warmup_steps == 0: - raise ValueError( - "Noam scheduler cannot be used without warmup steps") - - if self.hold_steps > 0: - hold_steps = self.hold_steps - self.warmup_steps - else: - hold_steps = 0 - - new_lrs = [ - _noam_hold_annealing( - initial_lr, - step=step, - warmup_steps=self.warmup_steps, - hold_steps=hold_steps, - decay_rate=self.decay_rate, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - def set_step(self, step: int): - self.last_epoch = step diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/README.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/README.md deleted file mode 100644 index 0b84381c047a11f19950ced7d839951c764506cf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/README.md +++ /dev/null @@ -1,40 +0,0 @@ -# Performance Record - -## Conformer - -* Feature info: using fbank feature, with dither 1.0, with cmvn -* Training info: lr 0.001, batch size 32, 24 gpus on V100, acc_grad 16, 26 epochs -* Decoding info: ctc_weight 0.5, average_num 10 - -| decoding_method | Dev | Test\_Net | Test\_Meeting | -|:-------------------:|:----:|:---------:|:-------------:| -| ctc_greedy_search | 8.88 | 10.29 | 15.96 | -| attention | 9.38 | 10.12 | 17.28 | -| attention_rescoring | 8.69 | 9.7 | 15.59 | - -## Conformer bidecoder - -* Feature info: using fbank feature, with dither 1.0, with cmvn -* Training info: lr 0.001, batch size 32, 24 gpus on V100, acc_grad 16, 26 epochs -* Decoding info: ctc_weight 0.5, average_num 10 - -| decoding_method | Dev | Test\_Net | Test\_Meeting | -|:-------------------:|:----:|:---------:|:-------------:| -| ctc_greedy_search | 8.98 | 9.55 | 16.48 | -| attention | 9.42 | 10.57 | 18.05 | -| attention_rescoring | 8.85 | 9.25 | 16.18 | - -## U2++ conformer - -* Feature info: using fbank feature, with dither 1.0, with cmvn -* Training info: lr 0.001, batch size 48, 8 gpus on A100, acc_grad 16, 50 epochs -* Decoding info: ctc_weight 0.5, reverse_weight 0.3, average_num 10 - -| Decoding mode - Chunk size | Dev | Test\_Net | Test\_Meeting | -|:-----------------------------:|:----:|:---------:|:-------------:| -| ctc greedy search - full | 8.85 | 9.78 | 17.77 | -| ctc greedy search - 16 | 9.32 | 11.02 | 18.79 | -| ctc prefix beam search - full | 8.80 | 9.73 | 17.57 | -| ctc prefix beam search - 16 | 9.25 | 10.96 | 18.62 | -| attention rescoring - full | 8.60 | 9.26 | 17.34 | -| attention rescoring - 16 | 8.87 | 10.22 | 18.11 | diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/conf/train_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/conf/train_conformer.yaml deleted file mode 100644 index aaa0d0ab7080fe3d64ab17eaf6c9695820ce6431..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/conf/train_conformer.yaml +++ /dev/null @@ -1,78 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 512 # dimension of attention - attention_heads: 8 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - cnn_module_norm: 'layer_norm' - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 8 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -dataset_conf: - filter_conf: - max_length: 1200 - min_length: 10 - token_max_length: 100 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: false - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 1.0 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 30 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 1000 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 32 - -grad_clip: 5 -accum_grad: 16 -max_epoch: 26 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 5000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/conf/train_conformer_bidecoder.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/conf/train_conformer_bidecoder.yaml deleted file mode 100644 index 6451e71ae684a76b5774010e84a2a38634514019..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/conf/train_conformer_bidecoder.yaml +++ /dev/null @@ -1,80 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 512 # dimension of attention - attention_heads: 8 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - cnn_module_norm: 'layer_norm' - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - -# decoder related -decoder: bitransformer -decoder_conf: - attention_heads: 8 - linear_units: 2048 - num_blocks: 3 - r_num_blocks: 3 - dropout_rate: 0.1 - positional_dropout_rate: 
0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -dataset_conf: - filter_conf: - max_length: 1200 - min_length: 10 - token_max_length: 100 - token_min_length: 1 - resample_conf: - resample_rate: 16000 - speed_perturb: false - fbank_conf: - num_mel_bins: 80 - frame_shift: 10 - frame_length: 25 - dither: 1.0 - spec_aug: true - spec_aug_conf: - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 30 - shuffle: true - shuffle_conf: - shuffle_size: 1500 - sort: true - sort_conf: - sort_size: 1000 # sort_size should be less than shuffle_size - batch_conf: - batch_type: 'static' # static or dynamic - batch_size: 32 - -grad_clip: 5 -accum_grad: 16 -max_epoch: 26 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 5000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/local/extract_meta.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/local/extract_meta.py deleted file mode 100644 index ce2871d0b8e5cf14a552175cfe5d1699d8bf226d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/local/extract_meta.py +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang) -# Mobvoi Inc(Author: Di Wu, Binbin Zhang) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import os -import argparse -import json - - -def get_args(): - parser = argparse.ArgumentParser(description=""" - This script is used to process raw json dataset of WenetSpeech, - where the long wav is splitinto segments and - data of wenet format is generated. 
- """) - parser.add_argument('input_json', help="""Input json file of WenetSpeech""") - parser.add_argument('output_dir', help="""Output dir for prepared data""") - - args = parser.parse_args() - return args - - -def meta_analysis(input_json, output_dir): - input_dir = os.path.dirname(input_json) - - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - try: - with open(input_json, 'r') as injson: - json_data = json.load(injson) - except Exception: - sys.exit(f'Failed to load input json file: {input_json}') - else: - if json_data['audios'] is not None: - with open(f'{output_dir}/text', 'w') as utt2text, \ - open(f'{output_dir}/segments', 'w') as segments, \ - open(f'{output_dir}/utt2dur', 'w') as utt2dur, \ - open(f'{output_dir}/wav.scp', 'w') as wavscp, \ - open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \ - open(f'{output_dir}/reco2dur', 'w') as reco2dur: - for long_audio in json_data['audios']: - try: - long_audio_path = os.path.realpath( - os.path.join(input_dir, long_audio['path'])) - aid = long_audio['aid'] - segments_lists = long_audio['segments'] - duration = long_audio['duration'] - assert (os.path.exists(long_audio_path)) - except AssertionError: - print(f'''Warning: {aid} something is wrong, - maybe AssertionError, skipped''') - continue - except Exception: - print(f'''Warning: {aid} something is wrong, maybe the - error path: {long_audio_path}, skipped''') - continue - else: - wavscp.write(f'{aid}\t{long_audio_path}\n') - reco2dur.write(f'{aid}\t{duration}\n') - for segment_file in segments_lists: - try: - sid = segment_file['sid'] - start_time = segment_file['begin_time'] - end_time = segment_file['end_time'] - dur = end_time - start_time - text = segment_file['text'] - segment_subsets = segment_file["subsets"] - except Exception: - print(f'''Warning: {segment_file} something - is wrong, skipped''') - continue - else: - utt2text.write(f'{sid}\t{text}\n') - segments.write( - f'{sid}\t{aid}\t{start_time}\t{end_time}\n' - ) - utt2dur.write(f'{sid}\t{dur}\n') - segment_sub_names = " ".join(segment_subsets) - utt2subsets.write( - f'{sid}\t{segment_sub_names}\n') - -def main(): - args = get_args() - - meta_analysis(args.input_json, args.output_dir) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/local/process_opus.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/local/process_opus.py deleted file mode 100644 index 9f71eb1a62509739d318b564a3deb2e7acc3347f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/local/process_opus.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright 2021 NPU, ASLP Group (Author: Qijie Shao) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# process_opus.py: segmentation and downsampling of opus audio - -# usage: python3 process_opus.py wav.scp segments output_wav.scp - -from pydub import AudioSegment -import sys -import os - - -def read_file(wav_scp, segments): - wav_scp_dict = {} - with open(wav_scp, 'r', encoding='UTF-8') as fin: - for line_str in fin: - wav_id, path = line_str.strip().split() - wav_scp_dict[wav_id] = path - - utt_list = [] - seg_path_list = [] - start_time_list = [] - end_time_list = [] - with open(segments, 'r', encoding='UTF-8') as fin: - for line_str in fin: - arr = line_str.strip().split() - assert len(arr) == 4 - utt_list.append(arr[0]) - seg_path_list.append(wav_scp_dict[arr[1]]) - start_time_list.append(float(arr[2])) - end_time_list.append(float(arr[3])) - return utt_list, seg_path_list, start_time_list, end_time_list - - -# TODO(Qijie): Fix the process logic -def output(output_wav_scp, utt_list, seg_path_list, start_time_list, - end_time_list): - num_utts = len(utt_list) - step = int(num_utts * 0.01) - with open(output_wav_scp, 'w', encoding='UTF-8') as fout: - previous_wav_path = "" - for i in range(num_utts): - utt_id = utt_list[i] - current_wav_path = seg_path_list[i] - output_dir = (os.path.dirname(current_wav_path)) \ - .replace("audio", 'audio_seg') - seg_wav_path = os.path.join(output_dir, utt_id + '.wav') - - # if not os.path.exists(output_dir): - # os.makedirs(output_dir) - - if current_wav_path != previous_wav_path: - source_wav = AudioSegment.from_file(current_wav_path) - previous_wav_path = current_wav_path - - start = int(start_time_list[i] * 1000) - end = int(end_time_list[i] * 1000) - target_audio = source_wav[start:end].set_frame_rate(16000) \ - .set_sample_width(2) - target_audio.export(seg_wav_path, format="wav") - - fout.write("{} {}\n".format(utt_id, seg_wav_path)) - if i % step == 0: - print("seg wav finished: {}%".format(int(i / step))) - - -def main(): - wav_scp = sys.argv[1] - segments = sys.argv[2] - output_wav_scp = sys.argv[3] - - utt_list, seg_path_list, start_time_list, end_time_list \ - = read_file(wav_scp, segments) - output(output_wav_scp, utt_list, seg_path_list, start_time_list, - end_time_list) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/local/wenetspeech_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/local/wenetspeech_data_prep.sh deleted file mode 100644 index 0fd3b5bc3893f7ef534010203dc3c97337277df7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/local/wenetspeech_data_prep.sh +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang) -# Seasalt AI, Inc (Author: Guoguo Chen) -# Mobvoi Inc(Author: Di Wu, Binbin Zhang) -# NPU, ASLP Group (Author: Qijie Shao) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -e -set -o pipefail - -stage=1 -prefix= -train_subset=L - -. 
./tools/parse_options.sh || exit 1; - -filter_by_id () { - idlist=$1 - input=$2 - output=$3 - field=1 - if [ $# -eq 4 ]; then - field=$4 - fi - cat $input | perl -se ' - open(F, "<$idlist") || die "Could not open id-list file $idlist"; - while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; - } - while(<>) { - @A = split; - @A > 0 || die "Invalid file line $_"; - @A >= $field || die "Invalid file line $_"; - if ($seen{$A[$field-1]}) { - print $_; - } - }' -- -idlist="$idlist" -field="$field" > $output ||\ - (echo "$0: filter_by_id() error: $input" && exit 1) || exit 1; -} - -subset_data_dir () { - utt_list=$1 - src_dir=$2 - dest_dir=$3 - mkdir -p $dest_dir || exit 1; - # wav.scp text segments utt2dur - filter_by_id $utt_list $src_dir/utt2dur $dest_dir/utt2dur ||\ - (echo "$0: subset_data_dir() error: $src_dir/utt2dur" && exit 1) || exit 1; - filter_by_id $utt_list $src_dir/text $dest_dir/text ||\ - (echo "$0: subset_data_dir() error: $src_dir/text" && exit 1) || exit 1; - filter_by_id $utt_list $src_dir/segments $dest_dir/segments ||\ - (echo "$0: subset_data_dir() error: $src_dir/segments" && exit 1) || exit 1; - awk '{print $2}' $dest_dir/segments | sort | uniq > $dest_dir/reco - filter_by_id $dest_dir/reco $src_dir/wav.scp $dest_dir/wav.scp ||\ - (echo "$0: subset_data_dir() error: $src_dir/wav.scp" && exit 1) || exit 1; - rm -f $dest_dir/reco -} - -if [ $# -ne 2 ]; then - echo "Usage: $0 [options] " - echo " e.g.: $0 --train-subset L /disk1/audio_data/wenetspeech/ data/" - echo "" - echo "This script takes the WenetSpeech source directory, and prepares the" - echo "WeNet format data directory." - echo " --prefix # Prefix for output data directory." - echo " --stage # Processing stage." - echo " --train-subset # Train subset to be created." - exit 1 -fi - -wenetspeech_dir=$1 -data_dir=$2 - -declare -A subsets -subsets=( - [L]="train_l" - [M]="train_m" - [S]="train_s" - [W]="train_w" - [DEV]="dev" - [TEST_NET]="test_net" - [TEST_MEETING]="test_meeting") - -prefix=${prefix:+${prefix}_} - -corpus_dir=$data_dir/${prefix}corpus/ -if [ $stage -le 1 ]; then - echo "$0: Extract meta into $corpus_dir" - # Sanity check. - [ ! -f $wenetspeech_dir/WenetSpeech.json ] &&\ - echo "$0: Please download $wenetspeech_dir/WenetSpeech.json!" && exit 1; - [ ! -d $wenetspeech_dir/audio ] &&\ - echo "$0: Please download $wenetspeech_dir/audio!" && exit 1; - - [ ! -d $corpus_dir ] && mkdir -p $corpus_dir - - # Files to be created: - # wav.scp text segments utt2dur - python3 local/extract_meta.py \ - $wenetspeech_dir/WenetSpeech.json $corpus_dir || exit 1; -fi - -if [ $stage -le 2 ]; then - echo "$0: Split data to train, dev, test_net, and test_meeting" - [ ! -f $corpus_dir/utt2subsets ] &&\ - echo "$0: No such file $corpus_dir/utt2subsets!" && exit 1; - for label in $train_subset DEV TEST_NET TEST_MEETING; do - if [ ! ${subsets[$label]+set} ]; then - echo "$0: Subset $label is not defined in WenetSpeech.json." && exit 1; - fi - subset=${subsets[$label]} - [ ! 
-d $data_dir/${prefix}$subset ] && mkdir -p $data_dir/${prefix}$subset - cat $corpus_dir/utt2subsets | \ - awk -v s=$label '{for (i=2;i<=NF;i++) if($i==s) print $0;}' \ - > $corpus_dir/${prefix}${subset}_utt_list|| exit 1; - subset_data_dir $corpus_dir/${prefix}${subset}_utt_list \ - $corpus_dir $data_dir/${prefix}$subset || exit 1; - done -fi - -echo "$0: Done" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/path.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/path.sh deleted file mode 100644 index 73fc1c56602086182f66201870e28d46a0cada55..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export WENET_DIR=$PWD/../../.. -export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build -export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix -export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/run.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/run.sh deleted file mode 100644 index 52d288375503321a1a2dd702ee8806a213bb44e7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/run.sh +++ /dev/null @@ -1,208 +0,0 @@ -#!/bin/bash - -# Copyright 2021 Mobvoi Inc(Author: Di Wu, Binbin Zhang) -# NPU, ASLP Group (Author: Qijie Shao) - -. ./path.sh || exit 1; - -# Use this to control how many gpu you use, It's 1-gpu training if you specify -# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -stage=0 -stop_stage=5 - -# The num of nodes -num_nodes=1 -# The rank of current node -node_rank=0 - -# Use your own data path. You need to download the WenetSpeech dataset by yourself. -wenetspeech_data_dir=/ssd/nfs07/binbinzhang/wenetspeech -# Make sure you have 1.2T for ${shards_dir} -shards_dir=/ssd/nfs06/unified_data/wenetspeech_shards - -# WenetSpeech training set -set=L -train_set=train_`echo $set | tr 'A-Z' 'a-z'` -dev_set=dev -test_sets="test_net test_meeting" - -train_config=conf/train_conformer.yaml -checkpoint= -cmvn=true -cmvn_sampling_divisor=20 # 20 means 5% of the training data to estimate cmvn -dir=exp/conformer - -decode_checkpoint= -average_checkpoint=true -average_num=10 -decode_modes="attention_rescoring ctc_greedy_search" - -. tools/parse_options.sh || exit 1; - -set -u -set -o pipefail - -# Data download -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - echo "Please follow https://github.com/wenet-e2e/WenetSpeech to download the data." 
- exit 0; -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - echo "Data preparation" - local/wenetspeech_data_prep.sh \ - --train-subset $set \ - $wenetspeech_data_dir \ - data || exit 1; -fi - -dict=data/dict/lang_char.txt -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - echo "Make a dictionary" - echo "dictionary: ${dict}" - mkdir -p $(dirname $dict) - echo " 0" > ${dict} # 0 will be used for "blank" in CTC - echo " 1" >> ${dict} # must be 1 - echo "▁ 2" >> ${dict} # ▁ is for space - tools/text2token.py -s 1 -n 1 --space "▁" data/${train_set}/text \ - | cut -f 2- -d" " | tr " " "\n" \ - | sort | uniq | grep -a -v -e '^\s*$' \ - | grep -v "▁" \ - | awk '{print $0 " " NR+2}' >> ${dict} \ - || exit 1; - num_token=$(cat $dict | wc -l) - echo " $num_token" >> $dict -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - echo "Compute cmvn" - # Here we use all the training data, you can sample some some data to save time - # BUG!!! We should use the segmented data for CMVN - if $cmvn; then - full_size=`cat data/${train_set}/wav.scp | wc -l` - sampling_size=$((full_size / cmvn_sampling_divisor)) - shuf -n $sampling_size data/$train_set/wav.scp \ - > data/$train_set/wav.scp.sampled - python3 tools/compute_cmvn_stats.py \ - --num_workers 16 \ - --train_config $train_config \ - --in_scp data/$train_set/wav.scp.sampled \ - --out_cmvn data/$train_set/global_cmvn \ - || exit 1; - fi -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - echo "Making shards, please wait..." - RED='\033[0;31m' - NOCOLOR='\033[0m' - echo -e "It requires ${RED}1.2T ${NOCOLOR}space for $shards_dir, please make sure you have enough space" - echo -e "It takes about ${RED}12 ${NOCOLOR}hours with 32 threads" - for x in $dev_set $test_sets ${train_set}; do - dst=$shards_dir/$x - mkdir -p $dst - tools/make_shard_list.py --resample 16000 --num_utts_per_shard 1000 \ - --num_threads 32 --segments data/$x/segments \ - data/$x/wav.scp data/$x/text \ - $(realpath $dst) data/$x/data.list - done -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - echo "Start training" - mkdir -p $dir - # INIT_FILE is for DDP synchronization - INIT_FILE=$dir/ddp_init - init_method=file://$(readlink -f $INIT_FILE) - echo "$0: init method is $init_method" - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - # Use "nccl" if it works, otherwise use "gloo" - dist_backend="nccl" - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" - cmvn_opts= - $cmvn && cp data/${train_set}/global_cmvn $dir - $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" - # train.py will write $train_config to $dir/train.yaml with model input - # and output dimension, train.yaml will be used for inference or model - # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. 
- rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type "shard" \ - --symbol_table $dict \ - --train_data data/$train_set/data.list \ - --cv_data data/$dev_set/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ - --ddp.dist_backend $dist_backend \ - $cmvn_opts \ - --num_workers 8 \ - --pin_memory - } & - done - wait -fi - -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - echo "Test model" - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$dir/avg${average_num}.pt - echo "do model average and final checkpoint is $decode_checkpoint" - python wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $dir \ - --num ${average_num} \ - --val_best - fi - # Specify decoding_chunk_size if it's a unified dynamic chunk trained model - # -1 for full chunk - decoding_chunk_size= - ctc_weight=0.5 - reverse_weight=0.0 - for testset in ${test_sets} ${dev_set}; do - { - for mode in ${decode_modes}; do - { - base=$(basename $decode_checkpoint) - result_dir=$dir/${testset}_${mode}_${base} - mkdir -p $result_dir - python wenet/bin/recognize.py --gpu 0 \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type "shard" \ - --test_data data/$testset/data.list \ - --checkpoint $decode_checkpoint \ - --beam_size 10 \ - --batch_size 1 \ - --penalty 0.0 \ - --dict $dict \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --result_file $result_dir/text \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - python tools/compute-wer.py --char=1 --v=1 \ - data/$testset/text $result_dir/text > $result_dir/wer - } - done - wait - } - done -fi - -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - echo "Export the best model you want" - python wenet/bin/export_jit.py \ - --config $dir/train.yaml \ - --checkpoint $dir/avg_${average_num}.pt \ - --output_file $dir/final.zip -fi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/alignment.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/alignment.sh deleted file mode 100644 index 64d860bb61761cadca750c9baf91eddb49e56728..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/alignment.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -stage=0 # start from 0 if you need to start from data preparation -stop_stage=0 - -nj=16 -feat_dir=raw_wav -dict=data/dict/lang_char.txt - -dir=exp/ -config=$dir/train.yaml -checkpoint= -checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt -config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml -set= -ali_format=$feat_dir/$set/format.data -ali_format=format.data -ali_result=$dir/ali - -. 
tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - nj=32 - # Prepare required data for ctc alignment - echo "Prepare data, prepare required format" - for x in $set; do - tools/format_data.sh --nj ${nj} \ - --feat-type wav --feat $feat_dir/$x/wav.scp \ - $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp - - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Test model, please specify the model you want to use by --checkpoint - python wenet/bin/alignment_deprecated.py --gpu -1 \ - --config $config \ - --input_file $ali_format \ - --checkpoint $checkpoint \ - --batch_size 1 \ - --dict $dict \ - --result_file $ali_result \ - -fi - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/analyze_dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/analyze_dataset.py deleted file mode 100644 index d4373b065c301972fe0164b6df3591166000acfc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/analyze_dataset.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Analyze Dataset, Duration/TextLength/Speed etc. - -Usage: -. ./path.sh && python3 tools/analyze_dataset.py \ - --data_type "shard" \ - --data_list data/test/data.list \ - --output_dir exp/analyze_test \ - --num_thread 32 -""" - -import os -import json -import math -import time -import numpy -import logging -import librosa -import tarfile -import argparse -import torchaudio -import multiprocessing - -from wenet.utils.file_utils import read_lists -from wenet.dataset.processor import AUDIO_FORMAT_SETS - - -def get_args(): - parser = argparse.ArgumentParser(description='Analyze dataset') - parser.add_argument('--data_type', - default='wav_scp', - choices=['wav_scp', 'raw', 'shard'], - help='dataset type') - parser.add_argument('--output_dir', type=str, - default="exp", help='write info to output dir') - parser.add_argument('--data_list', default=None, - help='used in raw/shard mode') - parser.add_argument('--wav_scp', default=None, - help='used in wav_scp mode') - parser.add_argument('--text', default=None, - help='used in wav_scp mode') - parser.add_argument('--num_thread', type=int, - default=4, help='number of threads') - args = parser.parse_args() - print(args) - return args - - -def analyze(datas, output_file, thread_id): - with open(output_file, "w", encoding='utf8') as f: - for i, data in enumerate(datas): - if type(data['wav']) is numpy.ndarray: - y, sample_rate = data['wav'], data['sample_rate'] - data['wav'] = "None" # NOTE(xcsong): Do not save wav. 
- elif type(data['wav'] is str): - y, sample_rate = librosa.load(data['wav'], sr=16000) - data['dur'] = len(y) / sample_rate - data['txt_length'] = len(data['txt']) - data['speed'] = data['txt_length'] / data['dur'] - # Trim the beginning and ending silence - _, index = librosa.effects.trim(y, top_db=30) - data['leading_sil'] = librosa.get_duration( - y=y[:index[0]], sr=16000) * 1000 if index[0] > 0 else 0 - data['trailing_sil'] = librosa.get_duration( - y=y[index[1]:], sr=16000) * 1000 if index[1] < len(y) else 0 - data_str = json.dumps(data, ensure_ascii=False) - f.write("{}\n".format(data_str)) - if thread_id == 0 and i % 100 == 0: - logging.info("\tThread-{}: processed {}/{}".format( - thread_id, i, len(datas))) - - -def read_tar(file): - try: - with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream: - prev_prefix = None - data = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - data['key'] = prev_prefix - if valid: - yield data - data = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - data['txt'] = file_obj.read().decode( - 'utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load( - file_obj) - # single channel - data['wav'] = waveform.numpy()[0, :] - data['sample_rate'] = sample_rate - else: - data[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning( - 'error: {} when parse {}'.format(ex, name)) - prev_prefix = prefix - # The last data in tar - if prev_prefix is not None: - data['key'] = prev_prefix - yield data - except Exception as ex: - logging.warning( - 'tar_file error: {} when processing {}'.format(ex, file)) - - -def main(): - start_time = time.time() - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.makedirs(args.output_dir, exist_ok=True) - os.makedirs(args.output_dir + "/partition", exist_ok=True) - datas = [[] for i in range(args.num_thread)] - - logging.info("Stage-1: Loading data.list OR wav.scp...") - if args.data_type == "shard": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - total = 0 - for line in lists: - for data in read_tar(line): - datas[total % args.num_thread].append(data) - total = total + 1 - elif args.data_type == "raw": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - for i, line in enumerate(lists): - data = json.loads(line) - datas[i % args.num_thread].append(data) - elif args.data_type == "wav_scp": - assert args.wav_scp is not None - assert args.text is not None - wavs, texts = {}, {} - # wavs - for line in read_lists(args.wav_scp): - line = line.strip().split() - wavs[line[0]] = line[1] - # texts - for line in read_lists(args.text): - line = line.strip().split(maxsplit=1) - texts[line[0]] = line[1] - sorted(wavs) - sorted(texts) - # partition - for i, (key1, key2) in enumerate(zip(wavs, texts)): - assert key1 == key2 - datas[i % args.num_thread].append( - {'key': key1, "wav": wavs[key1], "txt": texts[key1]} - ) - - logging.info("Stage-2: Start Analyze") - # threads - pool = multiprocessing.Pool(processes=args.num_thread) - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - pool.apply_async(analyze, (datas[i], output_file, i)) - pool.close() - 
pool.join() - - logging.info("Stage-3: Sort and Write Result") - datas = [] - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - with open(output_file, "r", encoding='utf8') as f: - for line in f.readlines(): - data = json.loads(line) - datas.append(data) - total_dur = sum([x['dur'] for x in datas]) - total_len = sum([x['txt_length'] for x in datas]) - total_leading_sil = sum([x['leading_sil'] for x in datas]) - total_trailing_sil = sum([x['trailing_sil'] for x in datas]) - num_datas = len(datas) - names = ['key', 'dur', 'txt_length', 'speed', - 'leading_sil', 'trailing_sil'] - units = ['', 's', '', 'char/s', 'ms', 'ms'] - avgs = [0, total_dur / num_datas, total_len / num_datas, - total_len / total_dur, total_leading_sil / num_datas, - total_trailing_sil / num_datas] - stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]), - sum([(x['txt_length'] - avgs[2])**2 for x in datas]), - sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]), - sum([(x['leading_sil'] - avgs[4])**2 for x in datas]), - sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])] - stds = [math.sqrt(x / num_datas) for x in stds] - parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min'] - index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - - with open(args.output_dir + "/analyze_result_brief", - "w", encoding='utf8') as f: - for i, (name, unit, avg, std) in enumerate( - zip(names, units, avgs, stds)): - if name == 'key': - continue - f.write("==================\n") - - datas.sort(key=lambda x: x[name]) - for p, j in zip(parts, index): - f.write("{} {}: {:.3f} {} (wav_id: {})\n".format( - p, name, datas[j][name], unit, datas[j]['key'])) - f.write("avg {}: {:.3f} {}\n".format( - name, avg, unit)) - f.write("std {}: {:.3f}\n".format( - name, std)) - os.system("cat {}".format(args.output_dir + "/analyze_result_brief")) - - datas.sort(key=lambda x: x['dur']) - with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f: - for data in datas: - f.write("{}\n".format(json.dumps(data, ensure_ascii=False))) - - end_time = time.time() - logging.info("Time Cost: {:.3f}s".format(end_time - start_time)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/cmvn_kaldi2json.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/cmvn_kaldi2json.py deleted file mode 100644 index 9966046c95a9d50438c4857b785cb7985182e376..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/cmvn_kaldi2json.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import logging -import sys -import json - -def kaldi2json(kaldi_cmvn_file): - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - cmvn_info = 
{'mean_stat:' : means, - 'var_stat' : variance, - 'frame_num' : count} - return cmvn_info - -if __name__ == '__main__': - with open(sys.argv[2], 'w') as fout: - cmvn = kaldi2json(sys.argv[1]) - fout.write(json.dumps(cmvn)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/combine_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/combine_data.sh deleted file mode 100644 index 8a56c43f1a2a238d78270f94f3d22f1af540e912..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/combine_data.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2014 David Snyder - -# This script combines the data from multiple source directories into -# a single destination directory. - -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information -# about what these directories contain. - -# Begin configuration section. -extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." -skip_fix=false # skip the fix_data_dir.sh in the end -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi - -if [ $# -lt 2 ]; then - echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." - echo "Note, files that don't appear in all source dirs will not be combined," - echo "with the exception of utt2uniq and segments, which are created where necessary." - exit 1 -fi - -dest=$1; -shift; - -first_src=$1; - -rm -r $dest 2>/dev/null -mkdir -p $dest; - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/utt2spk ]; then - echo "$0: no such file $dir/utt2spk" - exit 1; - fi -done - -# Check that frame_shift are compatible, where present together with features. -dir_with_frame_shift= -for dir in $*; do - if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then - if [[ $dir_with_frame_shift ]] && - ! cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then - echo "$0:error: different frame_shift in directories $dir and " \ - "$dir_with_frame_shift. Cannot combine features." - exit 1; - fi - dir_with_frame_shift=$dir - fi -done - -# W.r.t. utt2uniq file the script has different behavior compared to other files -# it is not compulsary for it to exist in src directories, but if it exists in -# even one it should exist in all. We will create the files where necessary -has_utt2uniq=false -for in_dir in $*; do - if [ -f $in_dir/utt2uniq ]; then - has_utt2uniq=true - break - fi -done - -if $has_utt2uniq; then - # we are going to create an utt2uniq file in the destdir - for in_dir in $*; do - if [ ! -f $in_dir/utt2uniq ]; then - # we assume that utt2uniq is a one to one mapping - cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' - else - cat $in_dir/utt2uniq - fi - done | sort -k1 > $dest/utt2uniq - echo "$0: combined utt2uniq" -else - echo "$0 [info]: not combining utt2uniq as it does not exist" -fi -# some of the old scripts might provide utt2uniq as an extrafile, so just remove it -extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") - -# segments are treated similarly to utt2uniq. If it exists in some, but not all -# src directories, then we generate segments where necessary. 
-has_segments=false -for in_dir in $*; do - if [ -f $in_dir/segments ]; then - has_segments=true - break - fi -done - -if $has_segments; then - for in_dir in $*; do - if [ ! -f $in_dir/segments ]; then - echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 - utils/data/get_segments_for_data.sh $in_dir - else - cat $in_dir/segments - fi - done | sort -k1 > $dest/segments - echo "$0: combined segments" -else - echo "$0 [info]: not combining segments as it does not exist" -fi - -for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do - exists_somewhere=false - absent_somewhere=false - for d in $*; do - if [ -f $d/$file ]; then - exists_somewhere=true - else - absent_somewhere=true - fi - done - - if ! $absent_somewhere; then - set -o pipefail - ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; - set +o pipefail - echo "$0: combined $file" - else - if ! $exists_somewhere; then - echo "$0 [info]: not combining $file as it does not exist" - else - echo "$0 [info]: **not combining $file as it does not exist everywhere**" - fi - fi -done - -tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt - -if [[ $dir_with_frame_shift ]]; then - cp $dir_with_frame_shift/frame_shift $dest -fi - -if ! $skip_fix ; then - tools/fix_data_dir.sh $dest || exit 1; -fi - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/compute-cer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/compute-cer.py deleted file mode 100644 index a0a8f8fe1f59251c5d8fefeb62ef469276fc6063..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/compute-cer.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import sys -import unicodedata -import codecs - -remove_tag = True -spacelist = [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - # https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': - sep = '>' - j = i + 1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c == sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: - return '' - chars = [] - i = 0 - T = len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - if x.isalnum(): - for k in x: - new_sentence.append(k) - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i - 1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j - 1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, - 'ins': 0, 'del': 0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = 
result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i={i} , j={j} , \ - error={error}'. - format(i=i, j=j, error=self.space[i][j]['error'])) - return result - - def overall(self) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def cluster(self, data) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [unicodedata.name(char) for char in word] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names) - 1) : - if unicode_names[i] != unicode_names[i + 1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) \ - and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] \ - [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \ - [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose = 1 - padding_symbol = ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose = 0 - try: - verbose = int(b) - except Exception: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol = ' ' - elif b == 'underline': - padding_symbol = '_' - continue - if True or sys.argv[1].startswith('-'): - # ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig = set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, - case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : 
- if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array) == 0: - continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length - len_lab) - space['rec'].append(length - len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end=' ') - else: - print('lab:', end=' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['lab'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end=' ') - else: - print('rec:', end=' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['rec'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===================================================' - '========================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster(k for k in default_clusters[cluster_id]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 
100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif (token[0] == '<' and token[len(token) - 1] == '>' and - cluster_id == ''): - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('=======================================' - '====================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/compute-wer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/compute-wer.py deleted file mode 100644 index a3eefc0dc7b67f252e685da71a5189312e74ef85..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/compute-wer.py +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import re, sys, unicodedata -import codecs - -remove_tag = True -spacelist= [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - #https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. 
- sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - 
j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array)==0: continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) 
- else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===========================================================================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster([ k for k in default_clusters[cluster_id] ]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - 
print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif token[0] == '<' and token[len(token)-1] == '>' and \ - cluster_id == '' : - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('===========================================================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/compute_cmvn_stats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/compute_cmvn_stats.py deleted file mode 100644 index 9c89789c47be0c855939469e86040f10398e9d89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/compute_cmvn_stats.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys -import argparse -import json -import codecs -import yaml - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.utils.data import Dataset, DataLoader - -torchaudio.set_audio_backend("sox_io") - - -class CollateFunc(object): - ''' Collate function for AudioDataset - ''' - - def __init__(self, feat_dim, resample_rate): - self.feat_dim = feat_dim - self.resample_rate = resample_rate - pass - - def __call__(self, batch): - mean_stat = torch.zeros(self.feat_dim) - var_stat = torch.zeros(self.feat_dim) - number = 0 - for item in batch: - value = item[1].strip().split(",") - assert len(value) == 3 or len(value) == 1 - wav_path = value[0] - sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate - resample_rate = sample_rate - # len(value) == 3 means segmented wav.scp, - # len(value) == 1 means original wav.scp - if len(value) == 3: - start_frame = int(float(value[1]) * sample_rate) - end_frame = int(float(value[2]) * sample_rate) - waveform, sample_rate = torchaudio.backend.sox_io_backend.load( - filepath=wav_path, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(item[1]) - - waveform = waveform * (1 << 15) - if self.resample_rate != 0 and self.resample_rate != sample_rate: - resample_rate = self.resample_rate - waveform = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - - mat = kaldi.fbank(waveform, - num_mel_bins=self.feat_dim, - dither=0.0, - energy_floor=0.0, - sample_frequency=resample_rate) - mean_stat += torch.sum(mat, axis=0) - var_stat += torch.sum(torch.square(mat), axis=0) - number += mat.shape[0] - return number, mean_stat, var_stat - - -class AudioDataset(Dataset): - def __init__(self, data_file): - self.items = [] - with codecs.open(data_file, 'r', encoding='utf-8') as f: - for line in f: - arr = line.strip().split() - self.items.append((arr[0], arr[1])) - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - return self.items[idx] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='extract CMVN stats') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for processing') - parser.add_argument('--train_config', - default='', - help='training yaml conf') - parser.add_argument('--in_scp', default=None, help='wav scp file') - 
parser.add_argument('--out_cmvn', - default='global_cmvn', - help='global cmvn file') - - doc = "Print log after every log_interval audios are processed." - parser.add_argument("--log_interval", type=int, default=1000, help=doc) - args = parser.parse_args() - - with open(args.train_config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - resample_rate = 0 - if 'resample_conf' in configs['dataset_conf']: - resample_rate = configs['dataset_conf']['resample_conf']['resample_rate'] - print('using resample and new sample rate is {}'.format(resample_rate)) - - collate_func = CollateFunc(feat_dim, resample_rate) - dataset = AudioDataset(args.in_scp) - batch_size = 20 - data_loader = DataLoader(dataset, - batch_size=batch_size, - shuffle=True, - sampler=None, - num_workers=args.num_workers, - collate_fn=collate_func) - - with torch.no_grad(): - all_number = 0 - all_mean_stat = torch.zeros(feat_dim) - all_var_stat = torch.zeros(feat_dim) - wav_number = 0 - for i, batch in enumerate(data_loader): - number, mean_stat, var_stat = batch - all_mean_stat += mean_stat - all_var_stat += var_stat - all_number += number - wav_number += batch_size - - if wav_number % args.log_interval == 0: - print(f'processed {wav_number} wavs, {all_number} frames', - file=sys.stderr, - flush=True) - - cmvn_info = { - 'mean_stat': list(all_mean_stat.tolist()), - 'var_stat': list(all_var_stat.tolist()), - 'frame_num': all_number - } - - with open(args.out_cmvn, 'w') as fout: - fout.write(json.dumps(cmvn_info)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/compute_fbank_feats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/compute_fbank_feats.py deleted file mode 100644 index 4cc7dae54de6e8b24b14148bd3930d19b4d7b28c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/compute_fbank_feats.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging - -import torchaudio -import torchaudio.compliance.kaldi as kaldi - -import wenet.dataset.kaldi_io as kaldi_io - -# The "sox" backends are deprecated and will be removed in 0.9.0 release. 
-# So here we use sox_io backend -torchaudio.set_audio_backend("sox_io") - - -def parse_opts(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--num_mel_bins', - default=80, - type=int, - help='Number of triangular mel-frequency bins') - parser.add_argument('--frame_length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument('--frame_shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument('--dither', - type=int, - default=0.0, - help='Dithering constant (0.0 means no dither)') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_scp', help='wav scp file') - parser.add_argument('out_ark', help='output ark file') - parser.add_argument('out_scp', help='output scp file') - args = parser.parse_args() - return args - - -# wav format: -def load_wav_scp(wav_scp_file): - wav_list = [] - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_list.append((arr[0], arr[1])) - return wav_list - - -# wav format: -def load_wav_scp_dict(wav_scp_file): - wav_dict = {} - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_dict[arr[0]] = arr[1] - return wav_dict - - -# Segments format: -def load_wav_segments(wav_scp_file, segments_file): - wav_dict = load_wav_scp_dict(wav_scp_file) - audio_list = [] - with open(segments_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - key = arr[0] - wav_file = wav_dict[arr[1]] - start = float(arr[2]) - end = float(arr[3]) - audio_list.append((key, wav_file, start, end)) - return audio_list - - -if __name__ == '__main__': - args = parse_opts() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - if args.segments is None: - audio_list = load_wav_scp(args.wav_scp) - else: - audio_list = load_wav_segments(args.wav_scp, args.segments) - - count = 0 - with open(args.out_ark, 'wb') as ark_fout, \ - open(args.out_scp, 'w', encoding='utf8') as scp_fout: - for item in audio_list: - if len(item) == 2: - key, wav_path = item - waveform, sample_rate = torchaudio.load_wav(wav_path) - else: - assert len(item) == 4 - key, wav_path, start, end = item - sample_rate = torchaudio.info(wav_path).sample_rate - frame_offset = int(start * sample_rate) - num_frames = int((end - start) * sample_rate) - waveform, sample_rate = torchaudio.load_wav( - wav_path, frame_offset, num_frames) - - mat = kaldi.fbank(waveform, - num_mel_bins=args.num_mel_bins, - frame_length=args.frame_length, - frame_shift=args.frame_shift, - dither=args.dither, - energy_floor=0.0, - sample_frequency=sample_rate) - mat = mat.detach().numpy() - kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout) - count += 1 - if count % 10000 == 0: - logging.info('Progress {}/{}'.format(count, len(audio_list))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/copy_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/copy_data_dir.sh deleted file mode 100644 index ee880c4c3ca398a58a4e306467c639b0a76310bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/copy_data_dir.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins University 
(author: Daniel Povey) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# feats.scp -# wav.scp -# vad.scp -# spk2utt -# utt2spk -# text -# -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance and/or speaker names. Note, the recording-ids stay the same. -# - - -# begin configuration section -spk_prefix= -utt_prefix= -spk_suffix= -utt_suffix= -validate_opts= # should rarely be needed. -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" - echo "Options" - echo " --spk-prefix= # Prefix for speaker ids, default empty" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --spk-suffix= # Suffix for speaker ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -srcdir=$1 -destdir=$2 - -if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" - exit 1; -fi - -if [ "$destdir" == "$srcdir" ]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -set -e; - -mkdir -p $destdir - -cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map -cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map - -if [ ! -f $srcdir/utt2uniq ]; then - if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then - cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq - fi -else - cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq -fi - -cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ - utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk - -utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt - -if [ -f $srcdir/feats.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp -fi - -if [ -f $srcdir/vad.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp -fi - -if [ -f $srcdir/segments ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments - cp $srcdir/wav.scp $destdir -else # no segments->wav indexed by utt. 
- if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/utt2num_frames ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in frame_shift stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -echo $validate_opts -echo $destdir -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/decode.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/decode.sh deleted file mode 100644 index 1d49b0e48631f4818fb9c464df66904170275a33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/decode.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: binbinzhang@mobvoi.com (Binbin Zhang) -export GLOG_logtostderr=1 -export GLOG_v=2 - -set -e - -nj=1 -chunk_size=-1 -ctc_weight=0.0 -reverse_weight=0.0 -rescoring_weight=1.0 -# For CTC WFST based decoding -fst_path= -dict_path= -acoustic_scale=1.0 -beam=15.0 -lattice_beam=12.0 -min_active=200 -max_active=7000 -blank_skip_thresh=1.0 -length_penalty=0.0 - -. tools/parse_options.sh || exit 1; -if [ $# != 5 ]; then - echo "Usage: $0 [options] " - exit 1; -fi - -if ! which decoder_main > /dev/null; then - echo "decoder_main is not built, please go to runtime/libtorch to build it." - exit 1; -fi - -scp=$1 -label_file=$2 -model_file=$3 -unit_file=$4 -dir=$5 - -mkdir -p $dir/split${nj} - -# Step 1. Split wav.scp -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp" -done -tools/data/split_scp.pl ${scp} ${split_scps} - -# Step 2. Parallel decoding -wfst_decode_opts= -if [ ! 
-z $fst_path ]; then - wfst_decode_opts="--fst_path $fst_path" - wfst_decode_opts="$wfst_decode_opts --beam $beam" - wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path" - wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam" - wfst_decode_opts="$wfst_decode_opts --max_active $max_active" - wfst_decode_opts="$wfst_decode_opts --min_active $min_active" - wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale" - wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh" - wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty" - echo $wfst_decode_opts > $dir/config -fi -for n in $(seq ${nj}); do -{ - decoder_main \ - --rescoring_weight $rescoring_weight \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --chunk_size $chunk_size \ - --wav_scp ${dir}/split${nj}/wav.${n}.scp \ - --model_path $model_file \ - --unit_path $unit_file \ - $wfst_decode_opts \ - --result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log -} & -done -wait - -# Step 3. Merge files -for n in $(seq ${nj}); do - cat ${dir}/split${nj}/${n}.text -done > ${dir}/text -tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf - -# Step 4. Compute WER -python3 tools/compute-wer.py --char=1 --v=1 \ - $label_file $dir/text > $dir/wer diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/feat_to_shape.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/feat_to_shape.sh deleted file mode 100644 index ab6d45c60709dd05a38f8da269d617233d0d39f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/feat_to_shape.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Begin configuration section. -nj=4 -cmd=run.pl -verbose=0 -filetype="" -preprocess_conf="" -# End configuration section. - -help_message=$(cat << EOF -Usage: $0 [options] [] -e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) - -echo "$0 $*" 1>&2 # Print the command line for logging - -. parse_options.sh || exit 1; - -if [ $# -lt 2 ] || [ $# -gt 3 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -scp=$1 -outscp=$2 -data=$(dirname ${scp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${logdir}/feats.${n}.scp" -done - -utils/split_scp.pl ${scp} ${split_scps} - -if [ -n "${preprocess_conf}" ]; then - preprocess_opt="--preprocess-conf ${preprocess_conf}" -else - preprocess_opt="" -fi -if [ -n "${filetype}" ]; then - filetype_opt="--filetype ${filetype}" -else - filetype_opt="" -fi - -${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ - feat-to-len --verbose=${verbose} \ - scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp - -feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -) - -# concatenate the .scp files together. 
-for n in $(seq ${nj}); do - sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp -done > ${outscp} - -rm -f ${logdir}/feats.*.scp 2>/dev/null diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/filter_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/filter_scp.pl deleted file mode 100644 index b76d37f41be0886470281978bfacf97f6b8ae976..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/filter_scp.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose "n-th" field is an utterance id), printing -# out only those lines whose "n-th" field is in id_list. The index of -# the "n-th" field is 1, by default, but can be changed by using -# the -f switch - -$exclude = 0; -$field = 1; -$shifted = 0; - -do { - $shifted=0; - if ($ARGV[0] eq "--exclude") { - $exclude = 1; - shift @ARGV; - $shifted=1; - } - if ($ARGV[0] eq "-f") { - $field = $ARGV[1]; - shift @ARGV; shift @ARGV; - $shifted=1 - } -} while ($shifted); - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . - "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . - "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . - "only the lines that were *not* in id_list.\n" . - "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . - "If your older scripts (written before Oct 2014) stopped working and you used the\n" . - "-f option, add 1 to the argument.\n" . - "See also: utils/filter_scp.pl .\n"; -} - - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -if ($field == 1) { # Treat this as special case, since it is common. - while(<>) { - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { - print $_; - } - } -} else { - while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { - print $_; - } - } -} - -# tests: -# the following should print "foo 1" -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) -# the following should print "bar 2". 
-# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fix_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fix_data_dir.sh deleted file mode 100644 index d1644c1cac4264c78eae7d91b03c4126baf7ec4c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fix_data_dir.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# This script makes sure that only the segments present in -# all of "feats.scp", "wav.scp" [if present], segments [if present] -# text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into -# data-dir/.backup - -cmd="$@" - -utt_extra_files= -spk_extra_files= - -. tools/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: utils/data/fix_data_dir.sh " - echo "e.g.: utils/data/fix_data_dir.sh data/train" - echo "This script helps ensure that the various files in a data directory" - echo "are correctly sorted and filtered, for example removing utterances" - echo "that have no features (if feats.scp is present)" - exit 1 -fi - -data=$1 - -if [ -f $data/images.scp ]; then - image/fix_data_dir.sh $cmd - exit $? -fi - -mkdir -p $data/.backup - -[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; - -[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; - -set -e -o pipefail -u - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted { - file=$1 - sort -k1,1 -u <$file >$file.tmp - if ! cmp -s $file $file.tmp; then - echo "$0: file $1 is not in sorted order or not unique, sorting it" - mv $file.tmp $file - else - rm $file.tmp - fi -} - -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - check_sorted $data/$x - fi -done - - -function filter_file { - filter=$1 - file_to_filter=$2 - cp $file_to_filter ${file_to_filter}.tmp - tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter - if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=$(cat ${file_to_filter}.tmp | wc -l) - length2=$(cat ${file_to_filter} | wc -l) - if [ $length1 -ne $length2 ]; then - echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." - fi - fi - rm $file_to_filter.tmp -} - -function filter_recordings { - # We call this once before the stage when we filter on utterance-id, and once - # after. - - if [ -f $data/segments ]; then - # We have a segments file -> we need to filter this and the file wav.scp, and - # reco2file_and_utt, if it exists, to make sure they have the same list of - # recording-ids. - - if [ ! -f $data/wav.scp ]; then - echo "$0: $data/segments exists but not $data/wav.scp" - exit 1; - fi - awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=$(cat $tmpdir/recordings | wc -l) - [ ! -s $tmpdir/recordings ] && \ - echo "Empty list of recordings (bad file $data/segments)?" 
&& exit 1; - tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp - mv $tmpdir/recordings.tmp $tmpdir/recordings - - - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - filter_file $tmpdir/recordings $data/segments - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - rm $data/segments.tmp - - filter_file $tmpdir/recordings $data/wav.scp - [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur - true - fi -} - -function filter_speakers { - # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... - tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do - f=$data/$s - if [ -f $f ]; then - filter_file $f $tmpdir/speakers - fi - done - - filter_file $tmpdir/speakers $data/spk2utt - tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - - for s in cmvn.scp spk2gender $spk_extra_files; do - f=$data/$s - if [ -f $f ]; then - filter_file $tmpdir/speakers $f - fi - done -} - -function filter_utts { - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - echo "$(cat $tmpdir/utts | wc -l)" - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; - - ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; - - ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ - echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - - if [ -f $data/utt2uniq ]; then - ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ - echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; - fi - - maybe_wav= - maybe_reco2dur= - [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. - [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - - maybe_utt2dur= - if [ -f $data/utt2dur ]; then - cat $data/utt2dur | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1 - maybe_utt2dur=utt2dur.ok - fi - - maybe_utt2num_frames= - if [ -f $data/utt2num_frames ]; then - cat $data/utt2num_frames | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1 - maybe_utt2num_frames=utt2num_frames.ok - fi - - for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do - if [ -f $data/$x ]; then - tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp - echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)" - mv $tmpdir/utts.tmp $tmpdir/utts - # echo "$tmpdir/utts" - fi - done - rm $data/utt2dur.ok 2>/dev/null || true - rm $data/utt2num_frames.ok 2>/dev/null || true - - [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ - rm $tmpdir/utts && exit 1; - - - if [ -f $data/utt2spk ]; then - new_nutts=$(cat $tmpdir/utts | wc -l) - old_nutts=$(cat $data/utt2spk | wc -l) - if [ $new_nutts -ne $old_nutts ]; then - echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" - else - echo "fix_data_dir.sh: kept all $old_nutts utterances." 
- fi - fi - - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then - tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x - fi - fi - done - -} - -filter_recordings -filter_speakers -filter_utts -filter_speakers -filter_recordings - -tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - -echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/flake8_hook.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/flake8_hook.py deleted file mode 100644 index bbe21bf4aa8ab460aca0eba5a24785e4d6b2c39d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -import sys - -from flake8.main import git - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/format_data.sh deleted file mode 100644 index 51f4602dfa0bac7873541c7f621ef4bb9eb29c94..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/format_data.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Mobvoi Corporation (Author: Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -echo "$0 $*" >&2 # Print the command line for logging -. ./path.sh - -nj=1 -cmd=run.pl -nlsyms="" -lang="" -feat="" -feat_type="kaldi" -oov="" -bpecode="" -allow_one_column=false -raw="" -verbose=0 -trans_type=char -filetype="" -preprocess_conf="" -category="" -out="" # If omitted, write in stdout -help_message=$(cat << EOF -Usage: $0 -e.g. $0 data/train data/lang_1char/train_units.txt -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --feat # feat.scp or feat1.scp,feat2.scp,... - --feat-type # kaldi or wav - --oov # Default: - --out # If omitted, write in stdout - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) -. tools/parse_options.sh - -if [ $# != 2 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -dir=$1 -dic=$2 -tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) -#trap 'rm -rf ${tmpdir}' EXIT - -# 1. 
Create scp files for inputs -# These are not necessary for decoding mode, and make it as an option -input= -if [ -n "${feat}" ]; then - _feat_scps=$(echo "${feat}" | tr ',' ' ' ) - read -r -a feat_scps <<< $_feat_scps - num_feats=${#feat_scps[@]} - - for (( i=1; i<=num_feats; i++ )); do - feat=${feat_scps[$((i-1))]} - mkdir -p ${tmpdir}/input_${i} - input+="input_${i} " - cat ${feat} > ${tmpdir}/input_${i}/feat.scp - - # Dump in the "legacy" style JSON format - if [ -n "${filetype}" ]; then - awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ - > ${tmpdir}/input_${i}/filetype.scp - fi - - if [ ${feat_type} == "kaldi" ]; then - tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ - --filetype "${filetype}" \ - --preprocess-conf "${preprocess_conf}" \ - --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp - elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then - if [ -f $dir/segments ]; then - # used for segmented wav.scp - awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur - fi - if [ ! -f $dir/utt2dur ]; then - tools/wav_to_duration.sh --nj ${nj} \ - ${feat} ${tmpdir}/input_${i}/shape.scp - # use the existed utt2dur as shape.scp directly - else - cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp - fi - fi - done -fi - -# 2. Create scp files for outputs -mkdir -p ${tmpdir}/output -if [ -n "${bpecode}" ]; then - if [ "${trans_type}" == "cn_char_en_bpe" ]; then - tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp - else - paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \ - | tools/spm_encode --model=${bpecode} --output_format=piece) \ - > ${tmpdir}/output/token.scp - fi -elif [ -n "${nlsyms}" ]; then - tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -elif [ -n "${raw}" ]; then - cat $dir/text > ${tmpdir}/output/token.scp -else - tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -fi -< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp -odim=$(cat ${dic} | wc -l) -< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp - -cat ${dir}/text > ${tmpdir}/output/text.scp - -# 3. Create scp files for the others -mkdir -p ${tmpdir}/other -if [ -n "${lang}" ]; then - awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp -fi - -if [ -n "${category}" ]; then - awk -v category=${category} '{print $1 " " category}' ${dir}/text \ - > ${tmpdir}/other/category.scp -fi -#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp - -# 4. 
Merge scp files into a one file -opts="" -for intype in ${input} output other; do - if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then - continue - fi - - if [ ${intype} != other ]; then - opts+="--${intype%_*}-scps " - else - opts+="--scps " - fi - - for x in "${tmpdir}/${intype}"/*.scp; do - k=$(basename ${x} .scp) - if [ ${k} = shape ]; then - opts+="shape:${x}:shape " - else - opts+="${k}:${x} " - fi - done -done - -if ${allow_one_column}; then - opts+="--allow-one-column true " -else - opts+="--allow-one-column false " -fi - -if [ -n "${out}" ]; then - opts+="-O ${out}" -fi - -tools/merge_scp2txt.py --verbose ${verbose} ${opts} - -#rm -fr ${tmpdir} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/add_lex_disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/add_lex_disambig.pl deleted file mode 100644 index dd8a25de6e1140a6d19b1e876f2e76f528532edf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/add_lex_disambig.pl +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2013-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. -# With the --pron-probs option, expects the second field -# of each lexicon line to be a pron-prob. -# With the --sil-probs option, expects three additional -# fields after the pron-prob, representing various components -# of the silence probability model. - -$pron_probs = 0; -$sil_probs = 0; -$first_allowed_disambig = 1; - -for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { - if ($ARGV[0] eq "--pron-probs") { - $pron_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--sil-probs") { - $sil_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--first-allowed-disambig") { - $first_allowed_disambig = 0 + $ARGV[1]; - if ($first_allowed_disambig < 1) { - die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; - } - shift @ARGV; - shift @ARGV; - } -} - -if (@ARGV != 2) { - die "Usage: add_lex_disambig.pl [opts] \n" . - "This script adds disambiguation symbols to a lexicon in order to\n" . - "make decoding graphs determinizable; it adds pseudo-phone\n" . - "disambiguation symbols #1, #2 and so on at the ends of phones\n" . - "to ensure that all pronunciations are different, and that none\n" . - "is a prefix of another.\n" . - "It prints to the standard output the number of the largest-numbered" . - "disambiguation symbol that was used.\n" . - "\n" . 
- "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . - " --sil-probs [should be with --pron-probs option]\n" . - " Expect 3 extra fields after the pron-probs, for aspects of\n" . - " the silence probability model\n" . - " --first-allowed-disambig The number of the first disambiguation symbol\n" . - " that this script is allowed to add. By default this is\n" . - " #1, but you can set this to a larger value using this option.\n" . - "e.g.:\n" . - " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { - $p = shift @A; - if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } - } - if ($sil_probs) { - $silp = shift @A; - if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - } - if (!(@A)) { - die "Bad lexicon line $1, no phone in phone list"; - } - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that it exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { shift @A; } # remove pron-prob. - if ($sil_probs) { - shift @A; # Remove silprob - shift @A; # Remove silprob - } - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -# max_disambig will always be the highest-numbered disambiguation symbol that -# has been used so far. -$max_disambig = $first_allowed_disambig - 1; - -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - if ($pron_probs) { - $pron_prob = shift @A; - } - if ($sil_probs) { - $sil_word_prob = shift @A; - $word_sil_correction = shift @A; - $prev_nonsil_correction = shift @A - } - $phnseq = join(" ", @A); - if (!defined $issubseq{$phnseq} - && $count{$phnseq} == 1) { - ; # Do nothing. - } else { - if ($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved_for_the_empty_string{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; - if (!defined $cur_disambig) { - $cur_disambig = $first_allowed_disambig; - } else { - $cur_disambig++; # Get a number that has not been used yet for - # this phone sequence. 
- } - while (defined $reserved_for_the_empty_string{$cur_disambig}) { - $cur_disambig++; - } - if ($cur_disambig > $max_disambig) { - $max_disambig = $cur_disambig; - } - $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; - $phnseq = $phnseq . " #" . $cur_disambig; - } - } - if ($pron_probs) { - if ($sil_probs) { - print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; - } else { - print O "$word\t$pron_prob\t$phnseq\n"; - } - } else { - print O "$word\t$phnseq\n"; - } -} - -print $max_disambig . "\n"; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/compile_lexicon_token_fst.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/compile_lexicon_token_fst.sh deleted file mode 100644 index b67814fe3f3244b14b8e494bfe46c4829c4f8bd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/compile_lexicon_token_fst.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -# Copyright 2015 Yajie Miao (Carnegie Mellon University) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the -# phoneme and character-based lexicons. -set -eo pipefail -. tools/parse_options.sh - -if [ $# -ne 3 ]; then - echo "usage: tools/fst/compile_lexicon_token_fst.sh " - echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" - echo " should contain the following files:" - echo "lexicon.txt units.txt" - echo "options: " - exit 1; -fi - -srcdir=$1 -tmpdir=$2 -dir=$3 -mkdir -p $dir $tmpdir - -[ -f path.sh ] && . ./path.sh - -export LC_ALL=C - -cp $srcdir/units.txt $dir - -# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. -# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. -perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; - -# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. -# Without these symbols, determinization will fail. -ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` -ndisambig=$[$ndisambig+1]; - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list - -# Get the full list of CTC tokens used in FST. These tokens include , the blank , -# the actual model unit, and the disambiguation symbols. 
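
Before compiling T.fst, the step just below numbers an epsilon symbol, the model units from units.txt, and the disambiguation symbols into a tokens.txt symbol table. A rough Python equivalent of that numbering is given here; the special symbol names are conventional assumptions, since the angle-bracket tokens were dropped from this copy of the script.

```python
# Rough equivalent of the tokens.txt construction below: epsilon first, then
# the model units, then the disambiguation symbols, each with a 0-based index.
# The special symbol names ("<eps>", "<blank>") are conventional assumptions.
units = ["<blank>", "a", "b", "c"]      # first column of units.txt (illustrative)
disambig = ["#0", "#1"]                 # emitted by add_lex_disambig.pl
tokens = ["<eps>"] + units + disambig

with open("tokens.txt", "w", encoding="utf-8") as f:
    for idx, sym in enumerate(tokens):
        f.write(f"{sym} {idx}\n")       # e.g. "<eps> 0", "<blank> 1", ...
```
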
-cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list -(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt - -# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, -# so here use ctc_token_fst_compact -tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; - -# Encode the words with indices. Will be used in lexicon and language model FST compiling. -cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' - BEGIN { - print " 0"; - } - { - printf("%s %d\n", $1, NR); - } - END { - printf("#0 %d\n", NR+1); - printf(" %d\n", NR+2); - printf(" %d\n", NR+3); - }' > $dir/words.txt || exit 1; - -# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time. -token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` - -tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ - fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; - -echo "Lexicon and token FSTs compiling succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/ctc_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/ctc_token_fst.py deleted file mode 100644 index d81644b9cd216177a10a17772781d3293abe084f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/ctc_token_fst.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 1 ') -print('1 1 ') -print('2 2 ') -print('2 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 3 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(1, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 2, '', '')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/ctc_token_fst_compact.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/ctc_token_fst_compact.py deleted file mode 100644 index d3018d8b14ce25108cb1acc637cecded5d41be13..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/ctc_token_fst_compact.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 1 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 0, '', '')) - node 
+= 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/ctc_token_fst_corrected.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/ctc_token_fst_corrected.py deleted file mode 100644 index 81f7079eccb9e6447c46cdfdf6378aca7efe4a09..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/ctc_token_fst_corrected.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python - -import sys - - -def il(n): - return n + 1 - - -def ol(n): - return n + 1 - - -def s(n): - return n - - -if __name__ == "__main__": - with open(sys.argv[1]) as f: - lines = f.readlines() - phone_count = 0 - disambig_count = 0 - for line in lines: - sp = line.split() - phone = sp[0] - if phone == '' or phone == '': - continue - if phone.startswith('#'): - disambig_count += 1 - else: - phone_count += 1 - - # 1. add start state - print('0 0 {} 0'.format(il(0))) - - # 2. 0 -> i, i -> i, i -> 0 - for i in range(1, phone_count + 1): - print('0 {} {} {}'.format(s(i), il(i), ol(i))) - print('{} {} {} 0'.format(s(i), s(i), il(i))) - print('{} 0 {} 0'.format(s(i), il(0))) - - # 3. i -> other phone - for i in range(1, phone_count + 1): - for j in range(1, phone_count + 1): - if i != j: - print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) - - # 4. add disambiguous arcs on every final state - for i in range(0, phone_count + 1): - for j in range(phone_count + 2, phone_count + disambig_count + 2): - print('{} {} {} {}'.format(s(i), s(i), 0, j)) - - # 5. every i is final state - for i in range(0, phone_count + 1): - print(s(i)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/eps2disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/eps2disambig.pl deleted file mode 100644 index e1d84a6bf56703596a0e4552d184f7168f724bcb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/eps2disambig.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - if (/\s+#0\s+/) { - print STDERR "$0: ERROR: LM has word #0, " . 
- "which is reserved as disambiguation symbol\n"; - exit 1; - } - s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; - print; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/make_lexicon_fst.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/make_lexicon_fst.pl deleted file mode 100644 index f97129c05cb3ba6460be401e92001261acfaf746..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/make_lexicon_fst.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). - -$pron_probs = 0; - -if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { - $pron_probs = 1; - shift @ARGV; -} - -if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; - print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; - print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; - print STDERR " word phone1 phone2 ... phoneN;\n"; - print STDERR "if the --pron-probs option is used, each line is:\n"; - print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; - print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; - print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; - print STDERR "this is your responsibility.\n\n"; - print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; - print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; - print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; - exit(1); -} - -$lexfn = shift @ARGV; -if (@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2) { - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if ($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - -if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. - while () { - @A = split(" ", $_); - @A == 0 && die "Empty lexicon line."; - foreach $a (@A) { - if ($a eq "") { - die "Bad lexicon line $_ ( is forbidden)"; - } - } - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! 
defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; # so we only print it on the first arc of the word. - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. - if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while () { - @A = split(" ", $_); - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. - $s = $ns; - } elsif (!defined($silphone) || $p ne $silphone) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/make_tlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/make_tlg.sh deleted file mode 100644 index 98694e5540968760f0c27eaf30a6668f4c46c50d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/make_tlg.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# - -if [ -f path.sh ]; then . path.sh; fi - -lm_dir=$1 -src_lang=$2 -tgt_lang=$3 - -arpa_lm=${lm_dir}/lm.arpa -[ ! 
-f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -rm -rf $tgt_lang -cp -r $src_lang $tgt_lang - -# Compose the language model to FST -cat $arpa_lm | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v -i '' | \ - grep -v -i '' | \ - arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \ - tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \ - --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst - - -echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic $tgt_lang/G.fst - -# Compose the token, lexicon and language-model FST into the final decoding graph -fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1; -fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1; - -echo "Composing decoding graph TLG.fst succeeded" -#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/prepare_dict.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/prepare_dict.py deleted file mode 100644 index 8a6a3cfe7cfded0c863637deef0bae2f2ede5557..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/prepare_dict.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -# sys.argv[1]: e2e model unit file(lang_char.txt) -# sys.argv[2]: raw lexicon file -# sys.argv[3]: output lexicon file -# sys.argv[4]: bpemodel - -unit_table = set() -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for line in fin: - unit = line.split()[0] - unit_table.add(unit) - - -def contain_oov(units): - for unit in units: - if unit not in unit_table: - return True - return False - - -bpemode = len(sys.argv) > 4 -if bpemode: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.Load(sys.argv[4]) -lexicon_table = set() -with open(sys.argv[2], 'r', encoding='utf8') as fin, \ - open(sys.argv[3], 'w', encoding='utf8') as fout: - for line in fin: - word = line.split()[0] - if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel - continue - elif word == '': - continue - else: - # each word only has one pronunciation for e2e system - if word in lexicon_table: - continue - if bpemode: - # We assume that the lexicon does not contain code-switch, - # i.e. the word contains both English and Chinese. - # see PR https://github.com/wenet-e2e/wenet/pull/1693 - # and Issue https://github.com/wenet-e2e/wenet/issues/1653 - if word.encode('utf8').isalpha(): - pieces = sp.EncodeAsPieces(word) - else: - pieces = word - if contain_oov(pieces): - print( - 'Ignoring words {}, which contains oov unit'.format( - ''.join(word).strip('▁')) - ) - continue - chars = ' '.join( - [p if p in unit_table else '' for p in pieces]) - else: - # ignore words with OOV - if contain_oov(word): - print('Ignoring words {}, which contains oov unit'.format(word)) - continue - # Optional, append ▁ in front of english word - # we assume the model unit of our e2e system is char now. 
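
As a preface to the char-level mapping implemented just below: an English word is optionally prefixed with "▁" (when that piece is a known unit) and then split into characters, while Chinese words are split directly. A toy re-statement in Python, with an illustrative unit table that is not taken from this model's dictionary:

```python
# Toy version of the char-level mapping applied just below: prefix English
# words with "▁" when that piece is in the unit table, then split into chars.
# The unit_table contents are illustrative only.
unit_table = {"▁", "h", "i", "你", "好"}

def word_to_chars(word):
    if word.encode("utf-8").isalpha() and "▁" in unit_table:
        word = "▁" + word
    return " ".join(word)

print(word_to_chars("hi"))    # ▁ h i
print(word_to_chars("你好"))   # 你 好
```
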
- if word.encode('utf8').isalpha() and '▁' in unit_table: - word = '▁' + word - chars = ' '.join(word) # word is a char list - fout.write('{} {}\n'.format(word, chars)) - lexicon_table.add(word) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/remove_oovs.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/remove_oovs.pl deleted file mode 100644 index ac914c3bd9363eded791cdeb309fd05e980c4f2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/remove_oovs.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script removes lines that contain these OOVs on either the -# third or fourth fields of the line. It is intended to remove arcs -# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). - -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; -} - -$unklist = shift @ARGV; -open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; -while(){ - @A = split(" ", $_); - @A == 1 || die "Bad line in unknown-symbol list: $_"; - $unk{$A[0]} = 1; -} - -$num_removed = 0; -while(<>){ - @A = split(" ", $_); - if(defined $unk{$A[2]} || defined $unk{$A[3]}) { - $num_removed++; - } else { - print; - } -} -print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/rnnt_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/rnnt_token_fst.py deleted file mode 100644 index cc6def1703311ab700a4a01f22c1adda32db9b0d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/rnnt_token_fst.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, 0, phone, phone)) -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/s2eps.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/s2eps.pl deleted file mode 100644 index ffeeb8eb6af3c4f319f31ebff80be388d8f59e1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/fst/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you 
may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces and with (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } - if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } - } - print join("\t", @A) . "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/git-pre-commit b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/git-pre-commit deleted file mode 100644 index b6e448ed375a0ddf502ce332685de8a99e88dc08..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/git-pre-commit +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -e - -echo "Running pre-commit flake8" -python tools/flake8_hook.py diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/install_srilm.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/install_srilm.sh deleted file mode 100644 index 4aa113c14722a73fd3d3f84430025d44173c207b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/install_srilm.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2022 Binbin Zhang(binbzha@qq.com) - -current_path=`pwd` -current_dir=`basename "$current_path"` - -if [ "tools" != "$current_dir" ]; then - echo "You should run this script in tools/ directory!!" - exit 1 -fi - -! command -v gawk > /dev/null && \ - echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; - -srilm_url="https://github.com/BitSpeech/SRILM/archive/refs/tags/1.7.3.tar.gz" - -if [ ! -f ./srilm.tar.gz ]; then - if ! wget -O ./srilm.tar.gz "$srilm_url"; then - echo 'There was a problem downloading the file.' - echo 'Check you internet connection and try again.' - exit 1 - fi -fi - -tar -zxvf srilm.tar.gz -mv SRILM-1.7.3 srilm - -# set the SRILM variable in the top-level Makefile to this directory. -cd srilm -cp Makefile tmpf - -cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \ - > Makefile || exit 1 -rm tmpf - -make || exit -cd .. - -( - [ ! -z "${SRILM}" ] && \ - echo >&2 "SRILM variable is aleady defined. Undefining..." && \ - unset SRILM - - [ -f ./env.sh ] && . ./env.sh - - [ ! 
-z "${SRILM}" ] && \ - echo >&2 "SRILM config is already in env.sh" && exit - - wd=`pwd` - wd=`readlink -f $wd || pwd` - - echo "export SRILM=$wd/srilm" - dirs="\${PATH}" - for directory in $(cd srilm && find bin -type d ) ; do - dirs="$dirs:\${SRILM}/$directory" - done - echo "export PATH=$dirs" -) >> env.sh - -echo >&2 "Installation of SRILM finished successfully" -echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/k2/make_hlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/k2/make_hlg.sh deleted file mode 100644 index 18c2268487410824ae11b199cf06f37acd717c88..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/k2/make_hlg.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) - -lexion_dir=$1 -lm_dir=$2 -tgt_dir=$3 - -# k2 and icefall updates very fast. Below commits are veryfied in this script. -# k2 3dc222f981b9fdbc8061b3782c3b385514a2d444, icefall 499ac24ecba64f687ff244c7d66baa5c222ecf0f - -# For k2 installation, please refer to https://github.com/k2-fsa/k2/ -python -c "import k2; print(k2.__file__)" -python -c "import torch; import _k2; print(_k2.__file__)" - -# Prepare necessary icefall scripts -if [ ! -d tools/k2/icefall ]; then - git clone --depth 1 https://github.com/k2-fsa/icefall.git tools/k2/icefall -fi -pip3 install -r tools/k2/icefall/requirements.txt -export PYTHONPATH=`pwd`/tools/k2/icefall:`pwd`/tools/k2/icefall/egs/aishell/ASR/local:$PYTHONPATH - -# 8.1 Prepare char based lang -mkdir -p $tgt_dir -python tools/k2/prepare_char.py $lexion_dir/units.txt $lm_dir/wordlist $tgt_dir -echo "Compile lexicon L.pt L_disambig.pt succeeded" - -# 8.2 Prepare G -mkdir -p data/lm -python -m kaldilm \ - --read-symbol-table="$tgt_dir/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $lm_dir/lm.arpa > data/lm/G_3_gram.fst.txt - -# 8.3 Compile HLG -python tools/k2/icefall/egs/aishell/ASR/local/compile_hlg.py --lang-dir $tgt_dir -echo "Compile decoding graph HLG.pt succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/k2/prepare_char.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/k2/prepare_char.py deleted file mode 100644 index 6e05042c42eb280135f6be7cdb3566b185258b90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/k2/prepare_char.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -""" - -This script generates the following files in the directory sys.argv[3]: - - - lexicon.txt - - lexicon_disambig.txt - - L.pt - - L_disambig.pt - - tokens.txt - - words.txt -""" - -import sys -from pathlib import Path -from typing import Dict, List - -import k2 -import torch -from prepare_lang import ( - Lexicon, - add_disambig_symbols, - add_self_loops, - write_lexicon, - write_mapping, -) - - -def lexicon_to_fst_no_sil( - lexicon: Lexicon, - token2id: Dict[str, int], - word2id: Dict[str, int], - need_self_loops: bool = False, -) -> k2.Fsa: - """Convert a lexicon to an FST (in k2 format). - - Args: - lexicon: - The input lexicon. See also :func:`read_lexicon` - token2id: - A dict mapping tokens to IDs. - word2id: - A dict mapping words to IDs. - need_self_loops: - If True, add self-loop to states with non-epsilon output symbols - on at least one arc out of the state. The input label for this - self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. - Returns: - Return an instance of `k2.Fsa` representing the given lexicon. - """ - loop_state = 0 # words enter and leave from here - next_state = 1 # the next un-allocated state, will be incremented as we go - - arcs = [] - - # The blank symbol is defined in local/train_bpe_model.py - assert token2id[""] == 0 - assert word2id[""] == 0 - - eps = 0 - - for word, pieces in lexicon: - assert len(pieces) > 0, f"{word} has no pronunciations" - cur_state = loop_state - - word = word2id[word] - pieces = [ - token2id[i] if i in token2id else token2id[""] for i in pieces - ] - - for i in range(len(pieces) - 1): - w = word if i == 0 else eps - arcs.append([cur_state, next_state, pieces[i], w, 0]) - - cur_state = next_state - next_state += 1 - - # now for the last piece of this word - i = len(pieces) - 1 - w = word if i == 0 else eps - arcs.append([cur_state, loop_state, pieces[i], w, 0]) - - if need_self_loops: - disambig_token = token2id["#0"] - disambig_word = word2id["#0"] - arcs = add_self_loops( - arcs, - disambig_token=disambig_token, - disambig_word=disambig_word, - ) - - final_state = next_state - arcs.append([loop_state, final_state, -1, -1, 0]) - arcs.append([final_state]) - - arcs = sorted(arcs, key=lambda arc: arc[0]) - arcs = [[str(i) for i in arc] for arc in arcs] - arcs = [" ".join(arc) for arc in arcs] - arcs = "\n".join(arcs) - - fsa = k2.Fsa.from_str(arcs, acceptor=False) - return fsa - - -def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool: - """Check if all the given tokens are in token symbol table. - - Args: - token_sym_table: - Token symbol table that contains all the valid tokens. - tokens: - A list of tokens. - Returns: - Return True if there is any token not in the token_sym_table, - otherwise False. - """ - for tok in tokens: - if tok not in token_sym_table: - return True - return False - - -def generate_lexicon( - token_sym_table: Dict[str, int], words: List[str] -) -> Lexicon: - """Generate a lexicon from a word list and token_sym_table. - - Args: - token_sym_table: - Token symbol table that mapping token to token ids. - words: - A list of strings representing words. - Returns: - Return a dict whose keys are words and values are the corresponding - tokens. 
- """ - lexicon = [] - for word in words: - chars = list(word.strip(" \t")) - if contain_oov(token_sym_table, chars): - continue - lexicon.append((word, chars)) - - # The OOV word is - lexicon.append(("", [""])) - return lexicon - - -def generate_tokens(text_file: str) -> Dict[str, int]: - """Generate tokens from the given text file. - - Args: - text_file: - A file that contains text lines to generate tokens. - Returns: - Return a dict whose keys are tokens and values are token ids ranged - from 0 to len(keys) - 1. - """ - token2id: Dict[str, int] = dict() - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - char, index = line.replace('\n', '').split() - assert char not in token2id - token2id[char] = int(index) - assert token2id[''] == 0 - return token2id - - -def generate_words(text_file: str) -> Dict[str, int]: - """Generate words from the given text file. - - Args: - text_file: - A file that contains text lines to generate words. - Returns: - Return a dict whose keys are words and values are words ids ranged - from 0 to len(keys) - 1. - """ - words = [] - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - word = line.replace('\n', '') - assert word not in words - words.append(word) - words.sort() - - # We put '' '' at begining of word2id - # '#0', '', '' at end of word2id - words = [word for word in words - if word not in ['', '', '#0', '', '']] - words.insert(0, '') - words.insert(1, '') - words.append('#0') - words.append('') - words.append('') - word2id = {j: i for i, j in enumerate(words)} - return word2id - - -def main(): - token2id = generate_tokens(sys.argv[1]) - word2id = generate_words(sys.argv[2]) - tgt_dir = Path(sys.argv[3]) - - words = [word for word in word2id.keys() - if word not in - ["", "!SIL", "", "", "#0", "", ""]] - lexicon = generate_lexicon(token2id, words) - - lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) - next_token_id = max(token2id.values()) + 1 - for i in range(max_disambig + 1): - disambig = f"#{i}" - assert disambig not in token2id - token2id[disambig] = next_token_id - next_token_id += 1 - - write_mapping(tgt_dir / "tokens.txt", token2id) - write_mapping(tgt_dir / "words.txt", word2id) - write_lexicon(tgt_dir / "lexicon.txt", lexicon) - write_lexicon(tgt_dir / "lexicon_disambig.txt", lexicon_disambig) - - L = lexicon_to_fst_no_sil( - lexicon, - token2id=token2id, - word2id=word2id, - ) - L_disambig = lexicon_to_fst_no_sil( - lexicon_disambig, - token2id=token2id, - word2id=word2id, - need_self_loops=True, - ) - torch.save(L.as_dict(), tgt_dir / "L.pt") - torch.save(L_disambig.as_dict(), tgt_dir / "L_disambig.pt") - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/latency_metrics.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/latency_metrics.py deleted file mode 100644 index df2d8eee45f8e2d7c8536f208d44fafaeac3341f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/latency_metrics.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2022 Horizon Inc. (author: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import argparse -import logging -import librosa -import torch -import torchaudio -import yaml - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.font_manager as fm -import torchaudio.compliance.kaldi as kaldi - -from wenet.utils.init_model import init_model -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.mask import make_pad_mask -from wenet.utils.common import replace_duplicates_with_blank - - -def get_args(): - parser = argparse.ArgumentParser( - description='Analyze latency and plot CTC-Spike.') - parser.add_argument('--config', required=True, - type=str, help='configration') - parser.add_argument('--gpu', - type=int, - default=0, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--ckpt', required=True, - type=str, help='model checkpoint') - parser.add_argument('--tag', required=True, - type=str, help='image subtitle') - parser.add_argument('--wavscp', required=True, - type=str, help='wav.scp') - parser.add_argument('--alignment', required=True, - type=str, help='force alignment, generated by Kaldi.') - parser.add_argument('--chunk_size', required=True, - type=int, help='chunk size') - parser.add_argument('--left_chunks', default=-1, - type=int, help='left chunks') - parser.add_argument('--font', required=True, - type=str, help='font file') - parser.add_argument('--dict', required=True, - type=str, help='dict file') - parser.add_argument('--result_dir', required=True, - type=str, help='saving pdf') - parser.add_argument('--model_type', default='ctc', - choices=['ctc', 'transducer'], - help='show latency metrics from ctc models or rnn-t models') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - torch.manual_seed(777) - - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - symbol_table = read_symbol_table(args.dict) - char_dict = {v: k for k, v in symbol_table.items()} - - # 1. Load model - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - - model = init_model(conf) - load_checkpoint(model, args.ckpt) - model = model.eval().to(device) - - subsampling = model.encoder.embed.subsampling_rate - eos = model.eos_symbol() - - with open(args.wavscp, 'r') as fin: - wavs = fin.readlines() - - # 2. 
Forward model (get streaming_timestamps) - timestamps = {} - for idx, wav in enumerate(wavs): - if idx % 100 == 0: - logging.info("processed {}.".format(idx)) - key, wav = wav.strip().split(' ', 1) - waveform, sr = torchaudio.load(wav) - resample_rate = conf['dataset_conf']['resample_conf']['resample_rate'] - waveform = torchaudio.transforms.Resample( - orig_freq=sr, new_freq=resample_rate)(waveform) - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank( - waveform, - num_mel_bins=conf['dataset_conf']['fbank_conf']['num_mel_bins'], - frame_length=conf['dataset_conf']['fbank_conf']['frame_length'], - frame_shift=conf['dataset_conf']['fbank_conf']['frame_shift'], - dither=0.0, energy_floor=0.0, - sample_frequency=resample_rate, - ) - - speech = mat.unsqueeze(0).to(device) - speech_lengths = torch.tensor([mat.size(0)]).to(device) - - # Let's assume batch_size = 1 - encoder_out, encoder_mask = model.encoder( - speech, speech_lengths, args.chunk_size, args.left_chunks) - - maxlen = encoder_out.size(1) # (B, maxlen, encoder_dim) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # CTC greedy search - if args.model_type == 'ctc': - ctc_probs = model.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - topk_prob = topk_prob.view(1, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen) - topk_prob = topk_prob.masked_fill_(mask, 0.0) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - hyps = [replace_duplicates_with_blank(hyp) for hyp in hyps] - scores = [prob.tolist() for prob in topk_prob] - timestamps[key] = [hyps[0], scores[0], wav] - - if args.model_type == 'transducer': - hyps = [] - scores = [] - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = 1 - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, - padding, cache) - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - scores.append(torch.max(joint_out_probs).item()) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or \ - per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - hyps.append(model.blank) - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - timestamps[key] = [hyps, scores, wav] - - # 3. 
Analyze latency - with open(args.alignment, 'r') as fin: - aligns = fin.readlines() - not_found, len_unequal, ignored = 0, 0, 0 - datas = [] - for align in aligns: - key, align = align.strip().split(' ', 1) - if key not in timestamps: - not_found += 1 - continue - fa, st = [], [] # force_alignment, streaming_timestamps - text_fa, text_st = "", "" - for i, token in enumerate(align.split()): - if token != '': - text_fa += token - # NOTE(xcsong): W/O subsample - fa.append(i * 10) - # ignore alignment_errors >= 70ms - frames_fa = len(align.split()) - frames_st = len(timestamps[key][0]) * subsampling - if abs(frames_st - frames_fa) >= 7: - ignored += 1 - continue - for i, token_id in enumerate(timestamps[key][0]): - if token_id != 0: - text_st += char_dict[token_id] - # NOTE(xcsong): W subsample - st.append(i * subsampling * 10) - if len(fa) != len(st): - len_unequal += 1 - continue - # datas[i] = [key, text_fa, text_st, list_of_diff, - # FirstTokenDelay, LastTokenDelay, AvgTokenDelay, - # streaming_timestamps, force_alignment] - datas.append([key, text_fa, text_st, - [a - b for a, b in zip(st, fa)], - st[0] - fa[0], st[-1] - fa[-1], - (sum(st) - sum(fa)) / len(st), - timestamps[key], align.split()]) - - logging.info("not found: {}, length unequal: {}, ignored: {}, \ - valid samples: {}".format(not_found, len_unequal, ignored, len(datas))) - - # 4. Plot and print - num_datas = len(datas) - names = ['FirstTokenDelay', 'LastTokenDelay', 'AvgTokenDelay'] - names_index = [4, 5, 6] - parts = ['max', 'P90', 'P75', 'P50', 'P25', 'min'] - parts_index = [num_datas - 1, int(num_datas * 0.90), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - for name, name_idx in zip(names, names_index): - def f(name_idx=name_idx): - return name_idx - datas.sort(key=lambda x: x[f()]) - logging.info("==========================") - for p, i in zip(parts, parts_index): - data = datas[i] - # i.e., LastTokenDelay P90: 270.000 ms (wav_id: BAC009S0902W0144) - logging.info("{} {}: {:.3f} ms (wav_id: {})".format( - name, p, data[f()], datas[i][0])) - - font = fm.FontProperties(fname=args.font) - plt.rcParams['axes.unicode_minus'] = False - # we will have 2 sub-plots (force-align + streaming timestamps) - # plus one wav-plot - fig, axes = plt.subplots(figsize=(60, 60), nrows=3, ncols=1) - for j in range(2): - if j == 0: - # subplot-0: streaming_timestamps - plt_prefix = args.tag + "_" + name + "_" + p - x = np.arange(len(data[7][0])) * subsampling - hyps, scores = data[7][0], data[7][1] - else: - # subplot-1: force_alignments - plt_prefix = "force_alignment" - x = np.arange(len(data[8])) - hyps = [symbol_table[d] for d in data[8]] - scores = [0.0] * len(data[8]) - axes[j].set_title(plt_prefix, fontsize=30) - for frame, token, prob in zip(x, hyps, scores): - if char_dict[token] != '': - axes[j].bar( - frame, np.exp(prob), - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].text( - frame, np.exp(prob), - '{} {:.3f} {}'.format( - char_dict[token], np.exp(prob), frame), - fontdict=dict(fontsize=24), - fontproperties=font, - ) - else: - axes[j].bar( - frame, 0.01, - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].tick_params(labelsize=25) - - # subplot-2: wav - # wav, hardcode sample_rate to 16000 - samples, sr = librosa.load(data[7][2], sr=16000) - time = np.arange(0, len(samples)) * (1.0 / sr) - axes[-1].plot(time, samples) - - # i.e., RESULT_DIR/LTD_P90_120ms_BAC009S0768W0342.pdf - plt.savefig(args.result_dir + "/" + name + "_" + - p + "_" + str(data[f()]) 
+ "ms" + "_" + data[0] + ".pdf") - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/make_raw_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/make_raw_list.py deleted file mode 100644 index 2f84f015542bb38da027b8ea61e8638f873cec33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/make_raw_list.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('output_file', help='output list file') - args = parser.parse_args() - - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - if args.segments is not None: - segments_table = {} - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - with open(args.text_file, 'r', encoding='utf8') as fin, \ - open(args.output_file, 'w', encoding='utf8') as fout: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if args.segments is None: - assert key in wav_table - wav = wav_table[key] - line = dict(key=key, wav=wav, txt=txt) - else: - assert key in segments_table - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - line = dict(key=key, wav=wav, txt=txt, start=start, end=end) - json_line = json.dumps(line, ensure_ascii=False) - fout.write(json_line + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/make_shard_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/make_shard_list.py deleted file mode 100644 index 1f7d82829808c9cc181bbc5e0f60cccef8795bae..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/make_shard_list.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import io -import logging -import os -import tarfile -import time -import multiprocessing - -import torch -import torchaudio -import torchaudio.backend.sox_io_backend as sox - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def write_tar_file(data_list, - no_segments, - tar_file, - resample=16000, - index=0, - total=1): - logging.info('Processing {} {}/{}'.format(tar_file, index, total)) - read_time = 0.0 - save_time = 0.0 - write_time = 0.0 - with tarfile.open(tar_file, "w") as tar: - prev_wav = None - for item in data_list: - if no_segments: - key, txt, wav = item - else: - key, txt, wav, start, end = item - - suffix = wav.split('.')[-1] - assert suffix in AUDIO_FORMAT_SETS - if no_segments: - ts = time.time() - with open(wav, 'rb') as fin: - data = fin.read() - read_time += (time.time() - ts) - else: - if wav != prev_wav: - ts = time.time() - waveforms, sample_rate = sox.load(wav, normalize=False) - read_time += (time.time() - ts) - prev_wav = wav - start = int(start * sample_rate) - end = int(end * sample_rate) - audio = waveforms[:1, start:end] - - # resample - if sample_rate != resample: - if not audio.is_floating_point(): - # normalize the audio before resample - # because resample can't process int audio - audio = audio / (1 << 15) - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - audio = (audio * (1 << 15)).short() - else: - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - - ts = time.time() - f = io.BytesIO() - sox.save(f, audio, resample, format="wav", bits_per_sample=16) - # Save to wav for segments file - suffix = "wav" - f.seek(0) - data = f.read() - save_time += (time.time() - ts) - - assert isinstance(txt, str) - ts = time.time() - txt_file = key + '.txt' - txt = txt.encode('utf8') - txt_data = io.BytesIO(txt) - txt_info = tarfile.TarInfo(txt_file) - txt_info.size = len(txt) - tar.addfile(txt_info, txt_data) - - wav_file = key + '.' 
+ suffix - wav_data = io.BytesIO(data) - wav_info = tarfile.TarInfo(wav_file) - wav_info.size = len(data) - tar.addfile(wav_info, wav_data) - write_time += (time.time() - ts) - logging.info('read {} save {} write {}'.format(read_time, save_time, - write_time)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--num_utts_per_shard', - type=int, - default=1000, - help='num utts per shard') - parser.add_argument('--num_threads', - type=int, - default=1, - help='num threads for make shards') - parser.add_argument('--prefix', - default='shards', - help='prefix of shards tar file') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('--resample', - type=int, - default=16000, - help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('shards_dir', help='output shards dir') - parser.add_argument('shards_list', help='output shards list file') - args = parser.parse_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - torch.set_num_threads(1) - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - no_segments = True - segments_table = {} - if args.segments is not None: - no_segments = False - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - data = [] - with open(args.text_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if no_segments: - assert key in wav_table - wav = wav_table[key] - data.append((key, txt, wav)) - else: - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - data.append((key, txt, wav, start, end)) - - num = args.num_utts_per_shard - chunks = [data[i:i + num] for i in range(0, len(data), num)] - os.makedirs(args.shards_dir, exist_ok=True) - - # Using thread pool to speedup - pool = multiprocessing.Pool(processes=args.num_threads) - shards_list = [] - tasks_list = [] - num_chunks = len(chunks) - for i, chunk in enumerate(chunks): - tar_file = os.path.join(args.shards_dir, - '{}_{:09d}.tar'.format(args.prefix, i)) - shards_list.append(tar_file) - pool.apply_async( - write_tar_file, - (chunk, no_segments, tar_file, args.resample, i, num_chunks)) - - pool.close() - pool.join() - - with open(args.shards_list, 'w', encoding='utf8') as fout: - for name in shards_list: - fout.write(name + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/merge_scp2txt.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/merge_scp2txt.py deleted file mode 100644 index 51f1c42f272f0fd9fec0a7d69ee860d2f1eb6158..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/merge_scp2txt.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -from distutils.util import strtobool -from io import open -import logging -import sys - -PY2 = sys.version_info[0] == 2 -sys.stdin = codecs.getreader('utf-8')(sys.stdin 
if PY2 else sys.stdin.buffer) -sys.stdout = codecs.getwriter('utf-8')( - sys.stdout if PY2 else sys.stdout.buffer) - - -# Special types: -def shape(x): - """Change str to List[int] - - >>> shape('3,5') - [3, 5] - >>> shape(' [3, 5] ') - [3, 5] - - """ - - # x: ' [3, 5] ' -> '3, 5' - x = x.strip() - if x[0] == '[': - x = x[1:] - if x[-1] == ']': - x = x[:-1] - - return list(map(int, x.split(','))) - - -def get_parser(): - parser = argparse.ArgumentParser( - description='Given each file paths with such format as ' - '::. type> can be omitted and the default ' - 'is "str". e.g. {} ' - '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' - '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' - '--output-scps text:data/text shape:data/utt2text_shape:shape ' - '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--input-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the inputs') - parser.add_argument('--output-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the outputs') - parser.add_argument('--scps', - type=str, - nargs='+', - default=[], - help='The files except for the input and outputs') - parser.add_argument('--verbose', - '-V', - default=1, - type=int, - help='Verbose option') - parser.add_argument('--allow-one-column', - type=strtobool, - default=False, - help='Allow one column in input scp files. ' - 'In this case, the value will be empty string.') - parser.add_argument('--out', - '-O', - type=str, - help='The output filename. ' - 'If omitted, then output to sys.stdout') - return parser - - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - args.scps = [args.scps] - - # logging info - logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - if args.verbose > 0: - logging.basicConfig(level=logging.INFO, format=logfmt) - else: - logging.basicConfig(level=logging.WARN, format=logfmt) - - inputs = {} - assert (len(args.input_scps) == 1) - for f in args.input_scps[0]: - arr = f.strip().split(':') - inputs[arr[0]] = arr[1] - assert ('feat' in inputs) - assert ('shape' in inputs) - - outputs = {} - assert (len(args.output_scps) == 1) - for f in args.output_scps[0]: - arr = f.strip().split(':') - outputs[arr[0]] = arr[1] - assert ('shape' in outputs) - assert ('text' in outputs) - assert ('token' in outputs) - assert ('tokenid' in outputs) - - files = [ - inputs['feat'], inputs['shape'], outputs['text'], outputs['token'], - outputs['tokenid'], outputs['shape'] - ] - fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape'] - fids = [open(f, 'r', encoding='utf-8') for f in files] - - if args.out is None: - out = sys.stdout - else: - out = open(args.out, 'w', encoding='utf-8') - done = False - while not done: - for i, fid in enumerate(fids): - line = fid.readline() - if line == '': - done = True - break - arr = line.strip().split() - content = ' '.join(arr[1:]) - if i == 0: - out.write('utt:{}'.format(arr[0])) - out.write('\t') - out.write('{}:{}'.format(fields[i], content)) - out.write('\n') - - for f in fids: - f.close() - if args.out is not None: - out.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/onnx2horizonbin.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/onnx2horizonbin.py deleted file mode 100644 index 
a94b647fb19d1446d4bc506c399c85677dddde9f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/onnx2horizonbin.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. specific decoding method: ctc_greedy_search -""" - -import argparse -import copy -import logging -import os -import sys -import random -import torch -import yaml -import numpy as np - -from torch.utils.data import DataLoader - -from wenet.utils.common import remove_duplicates_and_blank -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import to_numpy -from wenet.bin.export_onnx_bpu import export_encoder, export_ctc - - -try: - import hbdk # noqa: F401 - import horizon_nn # noqa: F401 - from horizon_tc_ui import HB_ONNXRuntime -except ImportError: - print('Please install hbdk,horizon_nn,horizon_tc_ui !') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def save_data(tensor, dirs, prefix): - if tensor.requires_grad: - data = tensor.detach().numpy().astype(np.float32) - else: - data = tensor.numpy().astype(np.float32) - os.makedirs(dirs, exist_ok=True) - data.tofile(dirs + "/" + prefix + ".bin") - - -def make_calibration_data(enc, args, conf): - conf['shuffle'] = True - logger.info(conf) - dataset = Dataset( - "shard", args.cali_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - cal_data_dir = os.path.join(args.output_dir, 
'cal_data_dir') - for batch_idx, batch in enumerate(dataloader): - if batch_idx >= args.max_samples: - break - if batch_idx % 100 == 0: - logger.info("processed {} samples.".format(batch_idx)) - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - - # Feed forward overlap input step by step - random_high = (num_frames - context) // stride - num_rand = random.randint(0, random_high) - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - if i == num_rand: - save_data(chunk, "{}/chunk".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_cache, "{}/att_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(cnn_cache, "{}/cnn_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_mask, "{}/att_mask".format(cal_data_dir), - prefix + "." + str(i)) - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - # NOTE(xcsong): It's fast to calibrate ctc.onnx, - # so it's okay to save all chunks - save_data(y, "{}/hidden".format(cal_data_dir), - prefix + "." 
+ str(i)) - - -def check_wer(enc, ctc, args, conf): - conf['shuffle'] = False - dataset = Dataset( - "shard", args.wer_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - char_dict = {v: k for k, v in args.symbol_table.items()} - eos = len(char_dict) - 1 - - enc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_encoder/encoder_quantized_model.onnx") - ctc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_ctc/ctc_quantized_model.onnx") - torch_file = open(args.output_dir + "/torch_text", 'w', encoding="utf-8") - onnx_file = open(args.output_dir + "/onnx_text", 'w', encoding="utf-8") - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - for batch_idx, batch in enumerate(dataloader): - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - - # Feed forward overlap input step by step - torch_out, onnx_out = [], [] - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - # Torch model - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - torch_out.append(ctc.forward(y).transpose(1, 3).squeeze(2)) - # Quantized onnx model - ort_inputs = { - 'chunk': to_numpy(chunk), 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': to_numpy(att_mask)} - ort_outs = enc_session.run_feature( - enc_session.output_names, ort_inputs, input_offset=0) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_y = ctc_session.run_feature( - ctc_session.output_names, {'hidden': ort_outs[0]}, input_offset=0) - onnx_out.append(torch.from_numpy( - np.squeeze(onnx_y[0].transpose(0, 3, 2, 1), axis=2))) - - def post_process(list_out, file_obj, keys): - probs = torch.cat(list_out, dim=1) - maxlen = probs.size(1) - topk_prob, topk_index = probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - hyps = 
[hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - for i, key in enumerate(keys): - content = '' - for w in hyps[i]: - if w == eos: - break - content += char_dict[w] - file_obj.write('{} {}\n'.format(key, content)) - return key, content - - if len(torch_out) > 0 and len(onnx_out) > 0: - key, content = post_process(torch_out, torch_file, keys) - logger.info('torch: {} {}'.format(key, content)) - key, content = post_process(onnx_out, onnx_file, keys) - logger.info('onnx : {} {}'.format(key, content)) - torch_file.close() - onnx_file.close() - - -def generate_config(enc_session, ctc_session, args): - template = """ -# 模型参数组 -model_parameters: - # 原始Onnx浮点模型文件 - onnx_model: '{}' - # 转换的目标AI芯片架构 - march: 'bernoulli2' - # 模型转换输出的用于上板执行的模型文件的名称前缀 - output_model_file_prefix: '{}' - # 模型转换输出的结果的存放目录 - working_dir: '{}' - # 指定转换后混合异构模型是否保留输出各层的中间结果的能力 - layer_out_dump: False - # 转换过程中日志生成级别 - log_level: 'debug' -# 输入信息参数组 -input_parameters: - # 原始浮点模型的输入节点名称 - input_name: '{}' - # 原始浮点模型的输入数据格式(数量/顺序与input_name一致) - input_type_train: '{}' - # 原始浮点模型的输入数据排布(数量/顺序与input_name一致) - input_layout_train: '{}' - # 原始浮点模型的输入数据尺寸 - input_shape: '{}' - # 网络实际执行时,输入给网络的batch_size 默认值为1 - # input_batch: 1 - # 在模型中添加的输入数据预处理方法 - norm_type: '{}' - # 预处理方法的图像减去的均值; 如果是通道均值,value之间必须用空格分隔 - # mean_value: '' - # 预处理方法的图像缩放比例,如果是通道缩放比例,value之间必须用空格分隔 - # scale_value: '' - # 转换后混合异构模型需要适配的输入数据格式(数量/顺序与input_name一致) - input_type_rt: '{}' - # 输入数据格式的特殊制式 - input_space_and_range: '' - # 转换后混合异构模型需要适配的输入数据排布(数量/顺序与input_name一致) - input_layout_rt: '{}' -# 校准参数组 -calibration_parameters: - # 模型校准使用的标定样本的存放目录 - cal_data_dir: '{}' - # 开启图片校准样本自动处理(skimage read resize到输入节点尺寸) - preprocess_on: False - # 校准使用的算法类型 - calibration_type: '{}' - # max 校准方式的参数 - max_percentile: 1.0 - # 强制指定OP在CPU上运行 - run_on_cpu: '{}' - # 强制指定OP在BPU上运行 - run_on_bpu: '{}' -# 编译参数组 -compiler_parameters: - # 编译策略选择 - compile_mode: 'latency' - # 是否打开编译的debug信息 - debug: False - # 模型运行核心数 - core_num: 1 - # 模型编译的优化等级选择 - optimize_level: 'O3' -""" - output_dir = os.path.realpath(args.output_dir) - cal_data_dir = os.path.join(output_dir, 'cal_data_dir') - os.makedirs(cal_data_dir, exist_ok=True) - enc_dic = enc_session.get_modelmeta().custom_metadata_map - enc_onnx_path = os.path.join(output_dir, 'encoder.onnx') - enc_log_path = os.path.join(output_dir, 'hb_makertbin_output_encoder') - enc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in enc_dic['input_name'].split(';')]) - ctc_dic = ctc_session.get_modelmeta().custom_metadata_map - ctc_onnx_path = os.path.join(output_dir, 'ctc.onnx') - ctc_log_path = os.path.join(output_dir, 'hb_makertbin_output_ctc') - ctc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in ctc_dic['input_name'].split(';')]) - enc_config = template.format( - enc_onnx_path, "encoder", enc_log_path, - enc_dic['input_name'], enc_dic['input_type'], - enc_dic['input_layout_train'], enc_dic['input_shape'], - enc_dic['norm_type'], enc_dic['input_type'], enc_dic['input_layout_rt'], - enc_cal_data, args.calibration_type, args.extra_ops_run_on_cpu, "") - ctc_config = template.format( - ctc_onnx_path, "ctc", ctc_log_path, - ctc_dic['input_name'], ctc_dic['input_type'], - ctc_dic['input_layout_train'], ctc_dic['input_shape'], - ctc_dic['norm_type'], ctc_dic['input_type'], ctc_dic['input_layout_rt'], - ctc_cal_data, "default", "", "") - with open(output_dir + "/config_encoder.yaml", "w") as enc_yaml: - enc_yaml.write(enc_config) - with open(output_dir + 
"/config_ctc.yaml", "w") as ctc_yaml: - ctc_yaml.write(ctc_config) - - -def get_args(): - parser = argparse.ArgumentParser(description='convert onnx to horizon .bin') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - parser.add_argument('--dict', type=str, required=True, help='dict file') - parser.add_argument('--max_samples', type=int, required=True, - help='maximum samples') - parser.add_argument('--cali_datalist', type=str, default=None, - help='make calibration data') - parser.add_argument('--wer_datalist', type=str, default=None, - help='check wer') - parser.add_argument('--wer_text', type=str, default=None, - help='check wer') - parser.add_argument('--bpe_model', default=None, type=str, - help='bpe model for english part') - parser.add_argument('--ln_run_on_bpu', action='store_true', - help='layernorm running on bpu') - parser.add_argument('--extra_ops_run_on_cpu', type=str, default=None, - help='extra operations running on cpu.') - parser.add_argument('--calibration_type', type=str, default='default', - help='kl / max / default.') - return parser - - -if __name__ == '__main__': - random.seed(777) - parser = get_args() - args = parser.parse_args() - # NOTE(xcsong): X3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(conf) - load_checkpoint(model, args.checkpoint) - model.eval() - - symbol_table = read_symbol_table(args.dict) - args.symbol_table = symbol_table - args.feature_size = conf['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - logger.info("Stage-1: Export onnx") - enc, enc_session = export_encoder(model, args) - ctc, ctc_session = export_ctc(model, args) - - conf = copy.deepcopy(conf['dataset_conf']) - conf['filter_conf']['max_length'] = 102400 - conf['filter_conf']['min_length'] = 0 - conf['filter_conf']['token_max_length'] = 102400 - conf['filter_conf']['token_min_length'] = 0 - conf['filter_conf']['max_output_input_ratio'] = 102400 - conf['filter_conf']['min_output_input_ratio'] = 0 - conf['speed_perturb'] = False - conf['spec_aug'] = False - conf['spec_sub'] = False - conf['spec_trim'] = False - conf['shuffle'] = False - conf['sort'] = False - if 'fbank_conf' in conf: - conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in conf: - conf['mfcc_conf']['dither'] = 0.0 - conf['batch_conf']['batch_type'] = "static" - conf['batch_conf']['batch_size'] = 1 - - if args.cali_datalist is not None: - logger.info("Stage-2: Generate config") - # FIXME(xcsong): Remove hard code - logger.info("torch version: {}".format(torch.__version__)) - if int(torch.__version__[:4].split('.')[1]) >= 13: - args.extra_ops_run_on_cpu = "/Split;" + \ - "/encoders.0/self_attn/Split;/encoders.1/self_attn/Split;" + \ - 
"/encoders.2/self_attn/Split;/encoders.3/self_attn/Split;" + \ - "/encoders.4/self_attn/Split;/encoders.5/self_attn/Split;" + \ - "/encoders.6/self_attn/Split;/encoders.7/self_attn/Split;" + \ - "/encoders.8/self_attn/Split;/encoders.9/self_attn/Split;" + \ - "/encoders.10/self_attn/Split;/encoders.11/self_attn/Split;" + \ - "/encoders.0/self_attn/Mul;/encoders.1/self_attn/Mul;" + \ - "/encoders.2/self_attn/Mul;/encoders.3/self_attn/Mul;" + \ - "/encoders.4/self_attn/Mul;/encoders.5/self_attn/Mul;" + \ - "/encoders.6/self_attn/Mul;/encoders.7/self_attn/Mul;" + \ - "/encoders.8/self_attn/Mul;/encoders.9/self_attn/Mul;" + \ - "/encoders.10/self_attn/Mul;/encoders.11/self_attn/Mul;" - else: - args.extra_ops_run_on_cpu = "Split_17;Split_67;Split_209;" + \ - "Split_351;Split_493;Split_635;Split_777;Split_919;Split_1061;" + \ - "Split_1203;Split_1345;Split_1487;Split_1629;" + \ - "Mul_72;Mul_214;Mul_356;Mul_498;Mul_640;Mul_782;" + \ - "Mul_924;Mul_1066;Mul_1208;Mul_1350;Mul_1492;Mul_1634;" - generate_config(enc_session, ctc_session, args) - - logger.info("Stage-3: Make calibration data") - make_calibration_data(enc, args, conf) - - output_dir = os.path.realpath(args.output_dir) - logger.info("Stage-4: Make ctc.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_ctc".format(output_dir) + - " && cd hb_makertbin_log_ctc &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_ctc.yaml") - ) - logger.info("Stage-5: Make encoder.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_encoder ".format(output_dir) + - " && cd hb_makertbin_log_encoder &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_encoder.yaml") - ) - - if args.wer_datalist is not None: - logger.info("Stage-6: Check wer between torch model and quantized onnx") - assert args.wer_text is not None - check_wer(enc, ctc, args, conf) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/torch_text", - args.output_dir + "/torch_wer") - ) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/onnx_text", - args.output_dir + "/onnx_wer") - ) - os.system("tail {} {}".format( - args.output_dir + "/torch_wer", args.output_dir + "/onnx_wer")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/parse_options.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/parse_options.sh deleted file mode 100644 index 34476fdb37a4b14d5fe6e0edbebe97e760d2be5a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- - -# Parse command-line options. -# To be sourced by another script (as in ". parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/perturb_data_dir_speed.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/perturb_data_dir_speed.sh deleted file mode 100644 index 901a4882e6481ae269067b0fe7175dba62c4db9e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/perturb_data_dir_speed.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# 2020 @kamo-naoyuki -# This file was copied from Kaldi and -# I deleted parts related to wav duration -# because we shouldn't use kaldi's command here -# and we don't need the files actually. 
- -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 2014 Tom Ko -# 2018 Emotech LTD (author: Pawel Swietojanski) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# wav.scp -# spk2utt -# utt2spk -# text -# -# It generates the files which are used for perturbing the speed of the original data. - -export LC_ALL=C -set -euo pipefail - -if [[ $# != 3 ]]; then - echo "Usage: perturb_data_dir_speed.sh " - echo "e.g.:" - echo " $0 0.9 data/train_si284 data/train_si284p" - exit 1 -fi - -factor=$1 -srcdir=$2 -destdir=$3 -label="sp" -spk_prefix="${label}${factor}-" -utt_prefix="${label}${factor}-" - -#check is sox on the path - -! command -v sox &>/dev/null && echo "sox: command not found" && exit 1; - -if [[ ! -f ${srcdir}/utt2spk ]]; then - echo "$0: no such file ${srcdir}/utt2spk" - exit 1; -fi - -if [[ ${destdir} == "${srcdir}" ]]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -mkdir -p "${destdir}" - -<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map" -<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map" -<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map" -if [[ ! -f ${srcdir}/utt2uniq ]]; then - <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq" -else - <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq" -fi - - -<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \ - utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk - -utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt - -if [[ -f ${srcdir}/segments ]]; then - - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \ - utils/apply_map.pl -f 2 "${destdir}"/reco_map | \ - awk -v factor="${factor}" \ - '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \ - >"${destdir}"/segments - - utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - if [[ -f ${srcdir}/reco2file_and_channel ]]; then - utils/apply_map.pl -f 1 "${destdir}"/reco_map \ - <"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel - fi - -else # no segments->wav indexed by utterance. 
- if [[ -f ${srcdir}/wav.scp ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - fi -fi - -if [[ -f ${srcdir}/text ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text -fi -if [[ -f ${srcdir}/spk2gender ]]; then - utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender -fi -if [[ -f ${srcdir}/utt2lang ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang -fi - -rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null -echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}" - -utils/validate_data_dir.sh --no-feats --no-text "${destdir}" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/reduce_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/reduce_data_dir.sh deleted file mode 100644 index 16194dcc7309a646041181a698c53cd4f46e618b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/reduce_data_dir.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# koried, 10/29/2012 - -# Reduce a data set based on a list of turn-ids - -help_message="usage: $0 srcdir turnlist destdir" - -if [ $1 == "--help" ]; then - echo "${help_message}" - exit 0; -fi - -if [ $# != 3 ]; then - echo "${help_message}" - exit 1; -fi - -srcdir=$1 -reclist=$2 -destdir=$3 - -if [ ! -f ${srcdir}/utt2spk ]; then -echo "$0: no such file $srcdir/utt2spk" -exit 1; -fi - -function do_filtering { -# assumes the utt2spk and spk2utt files already exist. - [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp - [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text - [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames - [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender - [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp - if [ -f ${srcdir}/segments ]; then - utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments - awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. 
- [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/reco2file_and_channel ] && \ - utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel - - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm - rm ${destdir}/reco - fi - srcutts=$(wc -l < ${srcdir}/utt2spk) - destutts=$(wc -l < ${destdir}/utt2spk) - echo "Reduced #utt from $srcutts to $destutts" -} - -mkdir -p ${destdir} - -# filter the utt2spk based on the set of recordings -utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk - -utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt -do_filtering; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/remove_longshortdata.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/remove_longshortdata.py deleted file mode 100644 index 7e92f8a424d2d717acf6fc1db5503f79ba38a898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/remove_longshortdata.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='remove too long or too short data in format.data') - parser.add_argument('--data_file', - type=str, - help='input format data') - parser.add_argument('--output_data_file', - type=str, - help='output format data') - parser.add_argument('--min_input_len', type=float, - default=0, - help='minimum input seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--max_input_len', type=float, - default=20, - help='maximum output seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--min_output_len', type=float, - default=0, help='minimum input seq length, in modeling units') - parser.add_argument('--max_output_len', type=float, - default=500, - help='maximum output seq length, in modeling units') - parser.add_argument('--min_output_input_ratio', type=float, default=0.05, - help='minimum output seq length/output seq length ratio') - parser.add_argument('--max_output_input_ratio', type=float, default=10, - help='maximum output seq length/output seq length ratio') - args = parser.parse_args() - - data_file = args.data_file - output_data_file = args.output_data_file - min_input_len = args.min_input_len - max_input_len = args.max_input_len - min_output_len = args.min_output_len - max_output_len = args.max_output_len - min_output_input_ratio = args.min_output_input_ratio - max_output_input_ratio = args.max_output_input_ratio - - with open(data_file, 'r') as f, open(output_data_file, 'w') as fout: - for l in f: - l = l.strip() - if l: - items = l.strip().split('\t') - token_shape = items[6] - feature_shape = items[2] - feat_len = float(feature_shape.split(':')[1].split(',')[0]) - token_len = float(token_shape.split(':')[1].split(',')[0]) - condition = [feat_len > min_input_len, - feat_len < max_input_len, - token_len > min_output_len, - token_len < max_output_len, - token_len / feat_len > min_output_input_ratio, - token_len / feat_len < max_output_input_ratio, - ] - if all(condition): - fout.write('{}\n'.format(l)) - continue diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/segment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/segment.py deleted file mode 100644 index a1a7f93a05fbaf42ca09c26c0e5be6a7185f0d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/segment.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2021 Mobvoi Inc. (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='generate segmented wav.scp') - parser.add_argument('--segments', required=True, help='segments file') - parser.add_argument('--input', - required=True, - help='origin wav.scp that not segmented') - parser.add_argument('--output', - required=True, - help='output segmented wav.scp') - wav_dic = {} - args = parser.parse_args() - ori_wav = args.input - segment_file = args.segments - wav_scp = args.output - with open(ori_wav, 'r') as ori: - for l in ori: - item = l.strip().split() - wav_dic[item[0]] = item[1] - with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement: - for l in sgement: - item = l.strip().split() - if item[1] in wav_dic: - item[1] = wav_dic[item[1]] - f.write("{} {},{},{}\n".format(item[0], item[1], item[2], item[3])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/setup_anaconda.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/setup_anaconda.sh deleted file mode 100644 index f53ace9cc4c19994fc79d01e85d70f49d40d673f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/setup_anaconda.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env bash -# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet) -set -euo pipefail - -if [ -z "${PS1:-}" ]; then - PS1=__dummy__ -fi -CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - -if [ $# -gt 4 ]; then - echo "Usage: $0 [output] [conda-env-name] [python-version>]" - exit 1; -elif [ $# -eq 3 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="$3" -elif [ $# -eq 2 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="" -elif [ $# -eq 1 ]; then - output_dir="$1" - name="" - PYTHON_VERSION="" -elif [ $# -eq 0 ]; then - output_dir=venv - name="" - PYTHON_VERSION="" -fi - -if [ -e activate_python.sh ]; then - echo "Warning: activate_python.sh already exists. It will be overwritten" -fi - -if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then - if [ ! -e miniconda.sh ]; then - wget --tries=3 "${CONDA_URL}" -O miniconda.sh - fi - - bash miniconda.sh -b -p "${output_dir}" -fi - -# shellcheck disable=SC1090 -source "${output_dir}/etc/profile.d/conda.sh" -conda deactivate - -# If the env already exists, skip recreation -if [ -n "${name}" ] && ! 
conda activate ${name}; then - conda create -yn "${name}" -fi -conda activate ${name} - -if [ -n "${PYTHON_VERSION}" ]; then - conda install -y conda "python=${PYTHON_VERSION}" -else - conda install -y conda -fi - -conda install -y pip setuptools - -cat << EOF > activate_python.sh -#!/usr/bin/env bash -# THIS FILE IS GENERATED BY tools/setup_anaconda.sh -if [ -z "\${PS1:-}" ]; then - PS1=__dummy__ -fi -. $(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name} -EOF diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/sph2wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/sph2wav.sh deleted file mode 100644 index a8f0749e3be2ee69b5831da6699c303510ecbed4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/sph2wav.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -# convert sph scp to segmented wav scp -nj=1 -. tools/parse_options.sh || exit 1; - -inscp=$1 -segments=$2 -outscp=$3 -data=$(dirname ${inscp}) -if [ $# -eq 4 ]; then - logdir=$4 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe -[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; -sox=`which sox` -[ ! 
-x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2); - printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \ - sort > $data/wav_ori.scp || exit 1; - -tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp -sed -i 's/ /,/g' $data/wav_segments.scp -sed -i 's/#/ /g' $data/wav_segments.scp - -rm -f $logdir/wav_*.slice -rm -f $logdir/*.log -split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - mkdir -p ${data}/wavs/${name} - cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \ - -v logdir=$logdir -v name=$name '{ - during=$4-$3 - cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during; - system(cmd) - printf("%s %s/%s.wav\n", $1, data, $1); - }' | \ - sort > ${data}/wavs_${name}.scp || exit 1; -} & -done -wait -cat ${data}/wavs_*.scp > $outscp -rm ${data}/wavs_*.scp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/spk2utt_to_utt2spk.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/spk2utt_to_utt2spk.pl deleted file mode 100644 index 19fb89d501146e360912863d847d6eabb0194511..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/spm_decode b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/spm_decode deleted file mode 100644 index 882b4f966013d7708460f8d41696583ae59f8fa9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/spm_decode +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for decoding") - parser.add_argument("--input", default=None, help="input file to decode") - parser.add_argument("--input_format", choices=["piece", "id"], default="piece") - args = parser.parse_args() - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.input_format == "piece": - def decode(l): - return "".join(sp.DecodePieces(l)) - elif args.input_format == "id": - def decode(l): - return "".join(sp.DecodeIds(l)) - else: - raise NotImplementedError - - def tok2int(tok): - # remap reference-side (represented as <>) to 0 - return int(tok) if tok != "<>" else 0 - - if args.input is None: - h = sys.stdin - else: - h = open(args.input, "r", encoding="utf-8") - for line in h: - print(decode(line.split())) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/spm_encode b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/spm_encode deleted file mode 100644 index 4dd2e1004f9fe393c2d34b43bade881b84a31b1f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/spm_encode +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import contextlib -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for encoding") - parser.add_argument("--inputs", nargs="+", default=['-'], - help="input files to filter/encode") - parser.add_argument("--outputs", nargs="+", default=['-'], - help="path to save encoded outputs") - parser.add_argument("--output_format", choices=["piece", "id"], default="piece") - parser.add_argument("--min-len", type=int, metavar="N", - help="filter sentence pairs with fewer than N tokens") - parser.add_argument("--max-len", type=int, metavar="N", - help="filter sentence pairs with more than N tokens") - args = parser.parse_args() - - assert len(args.inputs) == len(args.outputs), \ - "number of input and output paths should match" - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.output_format == "piece": - def encode(l): - return sp.EncodeAsPieces(l) - elif args.output_format == "id": - def encode(l): - return list(map(str, sp.EncodeAsIds(l))) - else: - raise NotImplementedError - - if args.min_len is not None or args.max_len is not None: - def valid(line): - return ( - (args.min_len is None or len(line) >= args.min_len) and - (args.max_len is None or len(line) <= args.max_len) - ) - else: - def valid(lines): - return True - - with contextlib.ExitStack() as stack: - inputs = [ - stack.enter_context(open(input, "r", encoding="utf-8")) - if input != "-" else sys.stdin - for input in args.inputs - ] - outputs = [ - 
stack.enter_context(open(output, "w", encoding="utf-8")) - if output != "-" else sys.stdout - for output in args.outputs - ] - - stats = { - "num_empty": 0, - "num_filtered": 0, - } - - def encode_line(line): - line = line.strip() - if len(line) > 0: - line = encode(line) - if valid(line): - return line - else: - stats["num_filtered"] += 1 - else: - stats["num_empty"] += 1 - return None - - for i, lines in enumerate(zip(*inputs), start=1): - enc_lines = list(map(encode_line, lines)) - if not any(enc_line is None for enc_line in enc_lines): - for enc_line, output_h in zip(enc_lines, outputs): - print(" ".join(enc_line), file=output_h) - if i % 10000 == 0: - print("processed {} lines".format(i), file=sys.stderr) - - print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) - print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/spm_train b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/spm_train deleted file mode 100644 index 0b247aee0dc5fcaa7b6cf66d89602e896619c9bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/spm_train +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE -import sys - -import sentencepiece as spm - - -if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/subset_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/subset_data_dir.sh deleted file mode 100644 index c35bee62d8710facb8c42a9171ed3caf0171450f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/subset_data_dir.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2010-2011 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - - -# This script operates on a data directory, such as in data/train/. -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data -# for what these directories contain. - -# This script creates a subset of that data, consisting of some specified -# number of utterances. (The selected utterances are distributed evenly -# throughout the file, by the program ./subset_scp.pl). - -# There are six options, none compatible with any other. - -# If you give the --per-spk option, it will attempt to select the supplied -# number of utterances for each speaker (typically you would supply a much -# smaller number in this case). - -# If you give the --speakers option, it selects a subset of n randomly -# selected speakers. - -# If you give the --shortest option, it will give you the n shortest utterances. - -# If you give the --first option, it will just give you the n first utterances. - -# If you give the --last option, it will just give you the n last utterances. - -# If you give the --spk-list or --utt-list option, it reads the -# speakers/utterances to keep from /" (note, -# in this case there is no positional parameter; see usage message.) 
- - -shortest=false -perspk=false -speakers=false -first_opt= -spk_list= -utt_list= - -expect_args=3 -case $1 in - --first|--last) first_opt=$1; shift ;; - --per-spk) perspk=true; shift ;; - --shortest) shortest=true; shift ;; - --speakers) speakers=true; shift ;; - --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; - --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; - --*) echo "$0: invalid option '$1'"; exit 1 -esac - -if [ $# != $expect_args ]; then - echo "Usage:" - echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] " - echo " subset_data_dir.sh [--spk-list ] " - echo " subset_data_dir.sh [--utt-list ] " - echo "By default, randomly selects utterances from the data directory." - echo "With --speakers, randomly selects enough speakers that we have utterances" - echo "With --per-spk, selects utterances per speaker, if available." - echo "With --first, selects the first utterances" - echo "With --last, selects the last utterances" - echo "With --shortest, selects the shortest utterances." - echo "With --spk-list, reads the speakers to keep from " - echo "With --utt-list, reads the utterances to keep from " - exit 1; -fi - -srcdir=$1 -if [[ $spk_list || $utt_list ]]; then - numutt= - destdir=$2 -else - numutt=$2 - destdir=$3 -fi - -export LC_ALL=C - -if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" - exit 1 -fi - -if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then - echo "$0: cannot subset to more utterances than you originally had." - exit 1 -fi - -if $shortest && [ ! -f $srcdir/feats.scp ]; then - echo "$0: you selected --shortest but no feats.scp exist." - exit 1 -fi - -mkdir -p $destdir || exit 1 - -if [[ $spk_list ]]; then - tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; -elif [[ $utt_list ]]; then - tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; -elif $speakers; then - tools/shuffle_list.pl < $srcdir/spk2utt | - awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | - sort > $destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -elif $perspk; then - awk '{ n='$numutt'; printf("%s ",$1); - skip=1; while(n*(skip+1) <= NF-1) { skip++; } - for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); } - printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -else - if $shortest; then - # Select $numutt shortest utterances. - . ./path.sh - feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; - sort -n -k2 $destdir/tmp.len | - awk '{print $1}' | - head -$numutt >$destdir/tmp.uttlist - tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk - rm $destdir/tmp.uttlist $destdir/tmp.len - else - # Select $numutt random utterances. - tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; - fi - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt -fi - -# Perform filtering. utt2spk and spk2utt files already exist by this point. -# Filter by utterance. 
-[ -f $srcdir/feats.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp -[ -f $srcdir/vad.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp -[ -f $srcdir/utt2lang ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang -[ -f $srcdir/utt2dur ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur -[ -f $srcdir/utt2num_frames ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames -[ -f $srcdir/utt2uniq ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq -[ -f $srcdir/wav.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp -[ -f $srcdir/utt2warp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp -[ -f $srcdir/text ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - -# Filter by speaker. -[ -f $srcdir/spk2warp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp -[ -f $srcdir/spk2gender ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender -[ -f $srcdir/cmvn.scp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - -# Filter by recording-id. -if [ -f $srcdir/segments ]; then - tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - # Recording-ids are in segments. - awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco - # The next line overrides the command above for wav.scp, which would be incorrect. - #[ -f $srcdir/wav.scp ] && - # tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp -else - # No segments; recording-ids are in wav.scp. - awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco -fi - -[ -f $srcdir/reco2file_and_channel ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel -[ -f $srcdir/reco2dur ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur - -# Filter the STM file for proper sclite scoring. -# Copy over the comments from STM file. -[ -f $srcdir/stm ] && - (grep "^;;" $srcdir/stm - tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm - -rm $destdir/reco - -# Copy frame_shift if present. -[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir - -srcutts=$(wc -l <$srcdir/utt2spk) -destutts=$(wc -l <$destdir/utt2spk) -echo "$0: reducing #utt from $srcutts to $destutts" -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/subset_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/subset_scp.pl deleted file mode 100644 index 11fddc09a0f4e5fad8e5d63cf65e7e5e627e4af6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/subset_scp.pl +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This program selects a subset of N elements in the scp. - -# By default, it selects them evenly from throughout the scp, in order to avoid -# selecting too many from the same speaker. It prints them on the standard -# output. -# With the option --first, it just selects the N first utterances. -# With the option --last, it just selects the N last utterances. - -# Last modified by JHU & HKUST @2013 - - -$quiet = 0; -$first = 0; -$last = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--quiet") { - shift; - $quiet = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--first") { - shift; - $first = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--last") { - shift; - $last = 1; -} - -if(@ARGV < 2 ) { - die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . - " --quiet causes it to not die if N < num lines in scp.\n" . - " --first and --last make it equivalent to head or tail.\n" . - "See also: filter_scp.pl\n"; -} - -$N = shift @ARGV; -if($N == 0) { - die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; -} -$inscp = shift @ARGV; -open(I, "<$inscp") || die "Opening input scp file $inscp"; - -@F = (); -while() { - push @F, $_; -} -$numlines = @F; -if($N > $numlines) { - if ($quiet) { - $N = $numlines; - } else { - die "You requested from subset_scp.pl more elements than available: $N > $numlines"; - } -} - -sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if ($num_needed > $diff) { - die "select_n: code error"; - } - if ($diff == 1 ) { - if ($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); - } -} - -if ( ! $first && ! $last) { - if ($N > 0) { - select_n(0, $numlines, $N); - } -} else { - if ($first) { # --first option: same as head. - for ($n = 0; $n < $N; $n++) { - print $F[$n]; - } - } else { # --last option: same as tail. - for ($n = @F - $N; $n < @F; $n++) { - print $F[$n]; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/sym2int.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/sym2int.pl deleted file mode 100644 index cec097b6bdaefb5c3452e31fa334f0a7530b9a72..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/sym2int.pl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; - -for($x = 0; $x < 2; $x++) { - if ($ARGV[0] eq "--map-oov") { - shift @ARGV; - $map_oov = shift @ARGV; - if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { - # disallow '-f', the empty string and anything ending in words.txt as the - # OOV symbol because these are likely command-line errors. - die "the --map-oov option requires an argument"; - } - } - if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } - } -} - -$symtab = shift @ARGV; -if (!defined $symtab) { - print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . - "options: [--map-oov ] [-f ]\n" . - "note: can look like 4-5, or 4-, or 5-, or 1.\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up - if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } - $map_oov = $sym2int{$map_oov}; -} - -$num_warning = 0; -$max_warning = 20; - -while (<>) { - @A = split(" ", $_); - @B = (); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ( (!defined $field_begin || $n >= $field_begin) - && (!defined $field_end || $n <= $field_end)) { - $i = $sym2int{$a}; - if (!defined ($i)) { - if (defined $map_oov) { - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $map_oov; - } else { - $pos = $n+1; - die "sym2int.pl: undefined symbol $a (in position $pos)\n"; - } - } - $a = $i; - } - push @B, $a; - } - print join(" ", @B); - print "\n"; -} -if ($num_warning > 0) { - print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; -} - -exit(0); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/text2token.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/text2token.py deleted file mode 100644 index 4f4dcc901d436650695f0b80e0cf99e1e99269ee..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/text2token.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan) -# Copyright 2021 Mobvoi Inc. All Rights Reserved. 
(Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -import re -import sys - -is_python2 = sys.version_info[0] == 2 - - -def exist_or_not(i, match_pos): - start_pos = None - end_pos = None - for pos in match_pos: - if pos[0] <= i < pos[1]: - start_pos = pos[0] - end_pos = pos[1] - break - - return start_pos, end_pos - -def seg_char(sent): - pattern = re.compile(r'([\u4e00-\u9fa5])') - chars = pattern.split(sent) - chars = [w for w in chars if len(w.strip()) > 0] - return chars - -def get_parser(): - parser = argparse.ArgumentParser( - description='convert raw text to tokenized text', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--nchar', - '-n', - default=1, - type=int, - help='number of characters to split, i.e., \ - aabb -> a a b b with -n 1 and aa bb with -n 2') - parser.add_argument('--skip-ncols', - '-s', - default=0, - type=int, - help='skip first n columns') - parser.add_argument('--space', - default='', - type=str, - help='space symbol') - parser.add_argument('--bpe-model', - '-m', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--non-lang-syms', - '-l', - default=None, - type=str, - help='list of non-linguistic symobles,' - ' e.g., etc.') - parser.add_argument('text', - type=str, - default=False, - nargs='?', - help='input text') - parser.add_argument('--trans_type', - '-t', - type=str, - default="char", - choices=["char", "phn", "cn_char_en_bpe"], - help="""Transcript type. char/phn. e.g., for TIMIT - FADG0_SI1279 - - If trans_type is char, read from - SI1279.WRD file -> "bricks are an alternative" - Else if trans_type is phn, - read from SI1279.PHN file -> - "sil b r ih sil k s aa r er n aa l - sil t er n ih sil t ih v sil" """) - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - rs = [] - if args.non_lang_syms is not None: - with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f: - nls = [x.rstrip() for x in f.readlines()] - rs = [re.compile(re.escape(x)) for x in nls] - - if args.bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(args.bpe_model) - - if args.text: - f = codecs.open(args.text, encoding="utf-8") - else: - f = codecs.getreader("utf-8")( - sys.stdin if is_python2 else sys.stdin.buffer) - - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer) - line = f.readline() - n = args.nchar - while line: - x = line.split() - print(' '.join(x[:args.skip_ncols]), end=" ") - a = ' '.join(x[args.skip_ncols:]) - - # get all matched positions - match_pos = [] - for r in rs: - i = 0 - while i >= 0: - m = r.search(a, i) - if m: - match_pos.append([m.start(), m.end()]) - i = m.end() - else: - break - - if len(match_pos) > 0: - chars = [] - i = 0 - while i < len(a): - start_pos, end_pos = exist_or_not(i, match_pos) - if start_pos is not None: - chars.append(a[start_pos:end_pos]) - i = end_pos - else: - chars.append(a[i]) - i += 1 - a = chars - - if (args.trans_type == "phn"): - a = a.split(" ") - elif args.trans_type == "cn_char_en_bpe": - b = seg_char(a) - a = [] - for j in b: - # we use "▁" to instead of blanks among english words - # warning: here is "▁", not "_" - for l in j.strip().split("▁"): - if not l.encode('UTF-8').isalpha(): - a.append(l) - else: - for k in sp.encode_as_pieces(l): - a.append(k) - else: - a = [a[j:j + n] for j in range(0, 
len(a), n)] - - a_flat = [] - for z in a: - a_flat.append("".join(z)) - - a_chars = [z.replace(' ', args.space) for z in a_flat] - if (args.trans_type == "phn"): - a_chars = [z.replace("sil", args.space) for z in a_chars] - print(' '.join(a_chars)) - line = f.readline() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/utt2spk_to_spk2utt.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/utt2spk_to_spk2utt.pl deleted file mode 100644 index 5086699ff85fdcb8667bb9ab054700c53e35fd0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. - -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/validate_data_dir.sh deleted file mode 100644 index f4b4cbe1410111555d56380078e3d55381e7155a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/validate_data_dir.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/bin/bash - -cmd="$@" - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." - echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! 
-d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -if [ -f $data/images.scp ]; then - cmd=${cmd/--no-wav/} # remove --no-wav if supplied - image/validate_data_dir.sh $cmd - exit $? -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1; - ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - tools/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." 
- exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! 
cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! 
cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/validate_dict_dir.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/validate_dict_dir.pl deleted file mode 100644 index 819fca7f03caff91f3f24f0b69876a0bfc0abbe9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/validate_dict_dir.pl +++ /dev/null @@ -1,531 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. -# Copyright 2012 Guoguo Chen -# 2015 Daniel Povey -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for 'dict' directories (e.g. data/local/dict) - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line($.) 
$raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n"; - if ($has_invalid_whitespaces) { - print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n"; - return 0; - } else { - print "--> text contains only allowed whitespaces\n"; - } - } else { - print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n"; - } - return 1; -} - - -if(@ARGV != 1) { - die "Usage: validate_dict_dir.pl \n" . - "e.g.: validate_dict_dir.pl data/local/dict\n"; -} - -$dict = shift @ARGV; -$dict =~ s:/$::; - -$exit = 0; -$success = 1; # this is re-set each time we read a file. - -sub set_to_fail { $exit = 1; $success = 0; } - -# Checking silence_phones.txt ------------------------------- -print "Checking $dict/silence_phones.txt ...\n"; -if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} -$idx = 1; -%silence = (); -$crlf = 1; - -print "--> reading $dict/silence_phones.txt\n"; -check_allowed_whitespace(\*S) || set_to_fail(); -while() { - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; - } - foreach(0 .. 
@col-1) { - my $p = $col[$_]; - if($silence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; - } else { - $silence{$p} = 1; - } - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. - if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(S); -$success == 0 || print "--> $dict/silence_phones.txt is OK\n"; -print "\n"; - -# Checking optional_silence.txt ------------------------------- -print "Checking $dict/optional_silence.txt ...\n"; -if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;} -if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;} -$idx = 1; -$success = 1; -$crlf = 1; -print "--> reading $dict/optional_silence.txt\n"; -check_allowed_whitespace(\*OS) or exit 1; -while() { - chomp; - my @col = split(" ", $_); - if ($idx > 1 or @col > 1) { - set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; - } elsif (!$silence{$col[0]}) { - set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - $idx ++; -} -close(OS); -$success == 0 || print "--> $dict/optional_silence.txt is OK\n"; -print "\n"; - -# Checking nonsilence_phones.txt ------------------------------- -print "Checking $dict/nonsilence_phones.txt ...\n"; -if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;} -$idx = 1; -%nonsilence = (); -$success = 1; -$crlf = 1; -print "--> reading $dict/nonsilence_phones.txt\n"; -check_allowed_whitespace(\*NS) or set_to_fail(); -while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($nonsilence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; - } else { - $nonsilence{$p} = 1; - } - # phones that start with the pound sign/hash may be mistaken for - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. 
- if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(NS); -$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n"; -print "\n"; - -# Checking disjoint ------------------------------- -sub intersect { - my ($a, $b) = @_; - @itset = (); - %itset = (); - foreach(keys %$a) { - if(exists $b->{$_} and !$itset{$_}) { - push(@itset, $_); - $itset{$_} = 1; - } - } - return @itset; -} - -print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n"; -@itset = intersect(\%silence, \%nonsilence); -if(@itset == 0) {print "--> disjoint property is OK.\n";} -else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";} -print "\n"; - - -sub check_lexicon { - my ($lex, $num_prob_cols, $num_skipped_cols) = @_; - print "Checking $lex\n"; - !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail(); - my %seen_line = {}; - $idx = 1; $success = 1; $crlf = 1; - print "--> reading $lex\n"; - check_allowed_whitespace(\*L) or set_to_fail(); - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $lex contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (defined $seen_line{$_}) { - print "--> ERROR: line '$_' of $lex is repeated\n"; - set_to_fail(); - } - $seen_line{$_} = 1; - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $lex does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - $word = shift @col; - if (!defined $word) { - print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail(); - } - if ($word eq "" || $word eq "" || $word eq "" || $word eq "#0") { - print "--> ERROR: lexicon.txt contains forbidden word $word\n"; - set_to_fail(); - } - for ($n = 0; $n < $num_prob_cols; $n++) { - $prob = shift @col; - if (!($prob > 0.0 && $prob <= 1.0)) { - print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n"; - set_to_fail(); - } - } - for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; } - if (@col == 0) { - print "--> ERROR: lexicon.txt contains word $word with empty "; - print "pronunciation.\n"; - set_to_fail(); - } - foreach (0 .. @col-1) { - if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt "; - print "(line $idx)\n"; - set_to_fail(); - } - } - $idx ++; - } - close(L); - $success == 0 || print "--> $lex is OK\n"; - print "\n"; -} - -if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); } -if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); } -if (-f "$dict/lexiconp_silprob.txt") { - # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also - # exist. 
- check_lexicon("$dict/lexiconp_silprob.txt", 2, 2); - if (-f "$dict/silprob.txt") { - !open(SP, "<$dict/silprob.txt") && - print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail(); - $crlf = 1; - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - chomp; my @col = split; - @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail(); - if ($col[0] eq "" || $col[0] eq "overall") { - if (!($col[1] > 0.0 && $col[1] <= 1.0)) { - set_to_fail(); - print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n"; - } - } elsif ($col[0] eq "_s" || $col[0] eq "_n") { - if ($col[1] <= 0.0) { - set_to_fail(); - print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n"; - } - } else { - print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n"; - set_to_fail(); - } - } - close(SP); - } else { - set_to_fail(); - print "--> ERROR: expecting $dict/silprob.txt to exist\n"; - } -} - -if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) { - print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n"; - set_to_fail(); -} - -sub check_lexicon_pair { - my ($lex1, $num_prob_cols1, $num_skipped_cols1, - $lex2, $num_prob_cols2, $num_skipped_cols2) = @_; - # We have checked individual lexicons already. - open(L1, "<$lex1"); open(L2, "<$lex2"); - print "Checking lexicon pair $lex1 and $lex2\n"; - my $line_num = 0; - while() { - $line_num++; - @A = split; - $line_B = ; - if (!defined $line_B) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); last; - } - @B = split(" ", $line_B); - # Check if the word matches. - if ($A[0] ne $B[0]) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - shift @A; shift @B; - for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; } - for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; } - # Check if the pronunciation matches - if (join(" ", @A) ne join(" ", @B)) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - } - $line_B = ; - if (defined $line_B && $exit == 0) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); - } - $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n"; -} - -# If more than one lexicon exist, we have to check if they correspond to each -# other. It could be that the user overwrote one and we need to regenerate the -# other, but we do not know which is which. -if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") { - check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0); -} -if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") { - check_lexicon_pair("$dict/lexiconp.txt", 1, 0, - "$dict/lexiconp_silprob.txt", 2, 2); -} - -# Checking extra_questions.txt ------------------------------- -%distinguished = (); # Keep track of all phone-pairs including nonsilence that - # are distinguished (split apart) by extra_questions.txt, - # as $distinguished{$p1,$p2} = 1. This will be used to - # make sure that we don't have pairs of phones on the same - # line in nonsilence_phones.txt that can never be - # distinguished from each other by questions. 
(If any two - # phones appear on the same line in nonsilence_phones.txt, - # they share a tree root, and since the automatic - # question-building treats all phones that appear on the - # same line of nonsilence_phones.txt as being in the same - # group, we can never distinguish them without resorting to - # questions in extra_questions.txt. -print "Checking $dict/extra_questions.txt ...\n"; -if (-s "$dict/extra_questions.txt") { - if (!open(EX, "<$dict/extra_questions.txt")) { - set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n"; - } - $idx = 1; - $success = 1; - $crlf = 1; - print "--> reading $dict/extra_questions.txt\n"; - check_allowed_whitespace(\*EX) or set_to_fail(); - while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n"; - } - foreach (0 .. @col-1) { - if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n"; - } - $idx ++; - } - %col_hash = (); - foreach $p (@col) { $col_hash{$p} = 1; } - foreach $p1 (@col) { - # Update %distinguished hash. - foreach $p2 (keys %nonsilence) { - if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not - # in this question (and in nonsilence - # phones)... mark p1,p2 as being split apart - $distinguished{$p1,$p2} = 1; - $distinguished{$p2,$p1} = 1; - } - } - } - } - close(EX); - $success == 0 || print "--> $dict/extra_questions.txt is OK\n"; -} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";} - -if (-f "$dict/nonterminals.txt") { - open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt"; - my %nonterminals = (); - my $line_number = 1; - while () { - chop; - my @line = split(" ", $_); - if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) { - print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1; - } - $nonterminals{$line[0]} = 1; - $line_number++; - } - print "--> $dict/nonterminals.txt is OK\n"; -} - - -# check nonsilence_phones.txt again for phone-pairs that are never -# distnguishable. (note: this situation is normal and expected for silence -# phones, so we don't check it.) -if(!open(NS, "<$dict/nonsilence_phones.txt")) { - print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1; -} - -$num_warn_nosplit = 0; -$num_warn_nosplit_limit = 10; -while() { - my @col = split(" ", $_); - foreach $p1 (@col) { - foreach $p2 (@col) { - if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) { - set_to_fail(); - if ($num_warn_nosplit <= $num_warn_nosplit_limit) { - print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n"; - } - if ($num_warn_nosplit == $num_warn_nosplit_limit) { - print "... Not warning any more times about this issue.\n"; - } - if ($num_warn_nosplit == 0) { - print " (note: we started checking for this only recently. 
You can still build a system but\n"; - print " phones $p1 and $p2 will be acoustically indistinguishable).\n"; - } - $num_warn_nosplit++; - } - } - } -} - - -if ($exit == 1) { - print "--> ERROR validating dictionary directory $dict (see detailed error "; - print "messages above)\n\n"; - exit 1; -} else { - print "--> SUCCESS [validating dictionary directory $dict]\n\n"; -} - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/validate_text.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/validate_text.pl deleted file mode 100644 index 7f75cf12f20f6e22948682e8e726e628a72dac69..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/validate_text.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env perl -# -#=============================================================================== -# Copyright 2017 Johns Hopkins University (author: Yenda Trmal ) -# Johns Hopkins University (author: Daniel Povey) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# validation script for data//text -# to be called (preferably) from utils/validate_data_dir.sh -use strict; -use warnings; -use utf8; -use Fcntl qw< SEEK_SET >; - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). 
-sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl \n" . 
- "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/wav2dur.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/wav2dur.py deleted file mode 100644 index 1bcc1b693458b66c0e341e5d6b375cc81e6db8b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/wav2dur.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -import torchaudio -torchaudio.set_audio_backend("sox_io") - -scp = sys.argv[1] -dur_scp = sys.argv[2] - -with open(scp, 'r') as f, open(dur_scp, 'w') as fout: - cnt = 0 - total_duration = 0 - for l in f: - items = l.strip().split() - wav_id = items[0] - fname = items[1] - cnt += 1 - waveform, rate = torchaudio.load(fname) - frames = len(waveform[0]) - duration = frames / float(rate) - total_duration += duration - fout.write('{} {}\n'.format(wav_id, duration)) - print('process {} utts'.format(cnt)) - print('total {} s'.format(total_duration)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/wav_to_duration.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/wav_to_duration.sh deleted file mode 100644 index 51b055c633ac809b6b8d702925dc47875973403d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/wav_to_duration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# split the wav scp, calculate duration and merge -nj=4 -. tools/parse_options.sh || exit 1; - -inscp=$1 -outscp=$2 -data=$(dirname ${inscp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -rm -f $logdir/wav_*.slice -rm -f $logdir/wav_*.shape -split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log -} & -done -wait -cat $logdir/wav_*.shape > $outscp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/websocket/performance-ws.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/websocket/performance-ws.py deleted file mode 100644 index af77dea06bb41297b674b5b6dbfd0266bcff5d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/tools/websocket/performance-ws.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -# coding:utf-8 - -# Copyright (c) 2022 SDCI Co. Ltd (author: veelion) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import json -import time -import asyncio -import argparse -import websockets -import soundfile as sf -import statistics - - -WS_START = json.dumps({ - 'signal': 'start', - 'nbest': 1, - 'continuous_decoding': False, -}) -WS_END = json.dumps({ - 'signal': 'end' -}) - - -async def ws_rec(data, ws_uri): - begin = time.time() - conn = await websockets.connect(ws_uri, ping_timeout=200) - # step 1: send start - await conn.send(WS_START) - ret = await conn.recv() - # step 2: send audio data - await conn.send(data) - # step 3: send end - await conn.send(WS_END) - # step 4: receive result - texts = [] - while 1: - ret = await conn.recv() - ret = json.loads(ret) - if ret['type'] == 'final_result': - nbest = json.loads(ret['nbest']) - text = nbest[0]['sentence'] - texts.append(text) - elif ret['type'] == 'speech_end': - break - # step 5: close - try: - await conn.close() - except Exception as e: - # this except has no effect, just log as debug - # it seems the server does not send close info, maybe - print(e) - time_cost = time.time() - begin - return { - 'text': ''.join(texts), - 'time': time_cost, - } - - -def get_args(): - parser = argparse.ArgumentParser(description='') - parser.add_argument( - '-u', '--ws_uri', required=True, - help="websocket_server_main's uri, e.g. ws://127.0.0.1:10086") - parser.add_argument( - '-w', '--wav_scp', required=True, - help='path to wav_scp_file') - parser.add_argument( - '-t', '--trans', required=True, - help='path to trans_text_file of wavs') - parser.add_argument( - '-s', '--save_to', required=True, - help='path to save transcription') - parser.add_argument( - '-n', '--num_concurrence', type=int, required=True, - help='num of concurrence for query') - args = parser.parse_args() - return args - - -def print_result(info): - length = max([len(k) for k in info]) - for k, v in info.items(): - print(f'\t{k: >{length}} : {v}') - - -async def main(args): - wav_scp = [] - total_duration = 0 - with open(args.wav_scp) as f: - for line in f: - zz = line.strip().split() - assert len(zz) == 2 - data, sr = sf.read(zz[1], dtype='int16') - assert sr == 16000 - duration = (len(data)) / 16000 - total_duration += duration - wav_scp.append((zz[0], data.tobytes())) - print(f'{len(wav_scp) = }, {total_duration = }') - - tasks = [] - failed = 0 - texts = [] - request_times = [] - begin = time.time() - for i, (_uttid, data) in enumerate(wav_scp): - task = asyncio.create_task(ws_rec(data, args.ws_uri)) - tasks.append((_uttid, task)) - if len(tasks) < args.num_concurrence: - continue - print((f'{i=}, start {args.num_concurrence} ' - f'queries @ {time.strftime("%m-%d %H:%M:%S")}')) - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - tasks = [] - print(f'\tdone @ {time.strftime("%m-%d %H:%M:%S")}') - if tasks: - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - request_time = time.time() - begin - rtf = request_time / total_duration - print('For all concurrence:') - print_result({ - 'failed': failed, - 'total_duration': total_duration, - 'request_time': request_time, - 'RTF': rtf, - }) - print('For one request:') - print_result({ - 'mean': statistics.mean(request_times), - 'median': statistics.median(request_times), - 'max_time': max(request_times), - 'min_time': min(request_times), - }) - with 
open(args.save_to, 'w', encoding='utf8') as fsave: - fsave.write(''.join(texts)) - # caculate CER - cmd = (f'python ../compute-wer.py --char=1 --v=1 ' - f'{args.trans} {args.save_to} > ' - f'{args.save_to}-test-{args.num_concurrence}.cer.txt') - print(cmd) - os.system(cmd) - print('done') - - -if __name__ == '__main__': - args = get_args() - asyncio.run(main(args)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/alignment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/alignment.py deleted file mode 100644 index 071691183e5af227e60fe06e4f8d4bf0f33b7f71..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/alignment.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Di Wu) -# 2022 Tinnove Inc (authors: Wei Ren) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader -from textgrid import TextGrid, IntervalTier - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.ctc_util import forced_align -from wenet.utils.common import get_subsample -from wenet.utils.init_model import init_model - - -def generator_textgrid(maxtime, lines, output): - # Download Praat: https://www.fon.hum.uva.nl/praat/ - interval = maxtime / (len(lines) + 1) - margin = 0.0001 - - tg = TextGrid(maxTime=maxtime) - linetier = IntervalTier(name="line", maxTime=maxtime) - - i = 0 - for l in lines: - s, e, w = l.split() - linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w) - - tg.append(linetier) - print("successfully generator {}".format(output)) - tg.write(output) - - -def get_frames_timestamp(alignment): - # convert alignment to a praat format, which is a doing phonetics - # by computer and helps analyzing alignment - timestamp = [] - # get frames level duration for each token - start = 0 - end = 0 - while end < len(alignment): - while end < len(alignment) and alignment[end] == 0: - end += 1 - if end == len(alignment): - timestamp[-1] += alignment[start:] - break - end += 1 - while end < len(alignment) and alignment[end - 1] == alignment[end]: - end += 1 - timestamp.append(alignment[start:end]) - start = end - return timestamp - - -def get_labformat(timestamp, subsample): - begin = 0 - duration = 0 - labformat = [] - for idx, t in enumerate(timestamp): - # 25ms frame_length,10ms hop_length, 1/subsample - subsample = get_subsample(configs) - # time duration - duration = len(t) * 0.01 * subsample - if idx < len(timestamp) - 1: - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[t[-1]])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[t[-1]])) - else: - 
non_blank = 0 - for i in t: - if i != 0: - token = i - break - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[token])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[token])) - begin = begin + duration - return labformat - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='use ctc to generate alignment') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--input_file', required=True, help='format data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--non_lang_syms', - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--result_file', - required=True, - help='alignment result file') - parser.add_argument('--batch_size', type=int, default=1, help='batch size') - parser.add_argument('--gen_praat', - action='store_true', - help='convert alignment to a praat format') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - - args = parser.parse_args() - print(args) - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.batch_size > 1: - logging.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - # Load dict - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 - - symbol_table = read_symbol_table(args.dict) - - # Init dataset and data loader - ali_conf = copy.deepcopy(configs['dataset_conf']) - - ali_conf['filter_conf']['max_length'] = 102400 - ali_conf['filter_conf']['min_length'] = 0 - ali_conf['filter_conf']['token_max_length'] = 102400 - ali_conf['filter_conf']['token_min_length'] = 0 - ali_conf['filter_conf']['max_output_input_ratio'] = 102400 - ali_conf['filter_conf']['min_output_input_ratio'] = 0 - ali_conf['speed_perturb'] = False - ali_conf['spec_aug'] = False - ali_conf['shuffle'] = False - ali_conf['sort'] = False - ali_conf['fbank_conf']['dither'] = 0.0 - ali_conf['batch_conf']['batch_type'] = "static" - ali_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - ali_dataset = Dataset(args.data_type, - args.input_file, - symbol_table, - ali_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w', - encoding='utf-8') as fout: - for batch_idx, batch in enumerate(ali_data_loader): - print("#" * 80) - key, feat, target, feats_length, target_length = batch - print(key) - - feat = feat.to(device) - target = target.to(device) - feats_length = 
feats_length.to(device) - target_length = target_length.to(device) - # Let's assume B = batch_size and N = beam_size - # 1. Encoder - encoder_out, encoder_mask = model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - # print(ctc_probs.size(1)) - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = forced_align(ctc_probs, target) - print(alignment) - fout.write('{} {}\n'.format(key[0], alignment)) - - if args.gen_praat: - timestamp = get_frames_timestamp(alignment) - print(timestamp) - subsample = get_subsample(configs) - labformat = get_labformat(timestamp, subsample) - - lab_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".lab") - with open(lab_path, 'w', encoding='utf-8') as f: - f.writelines(labformat) - - textgrid_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".TextGrid") - generator_textgrid(maxtime=(len(alignment) + 1) * 0.01 * - subsample, - lines=labformat, - output=textgrid_path) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/average_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/average_model.py deleted file mode 100644 index 01efa64b4b458bc931a86a9a304b9f330ce4aaa2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/average_model.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import os -import argparse -import glob - -import yaml -import numpy as np -import torch - - -def get_args(): - parser = argparse.ArgumentParser(description='average model') - parser.add_argument('--dst_model', required=True, help='averaged model') - parser.add_argument('--src_path', - required=True, - help='src model path for average') - parser.add_argument('--val_best', - action="store_true", - help='averaged model') - parser.add_argument('--num', - default=5, - type=int, - help='nums for averaged model') - parser.add_argument('--min_epoch', - default=0, - type=int, - help='min epoch used for averaging model') - parser.add_argument('--max_epoch', - default=65536, - type=int, - help='max epoch used for averaging model') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - checkpoints = [] - val_scores = [] - if args.val_best: - yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) - for y in yamls: - with open(y, 'r') as f: - dic_yaml = yaml.load(f, Loader=yaml.FullLoader) - loss = dic_yaml['cv_loss'] - epoch = dic_yaml['epoch'] - if epoch >= args.min_epoch and epoch <= args.max_epoch: - val_scores += [[epoch, loss]] - val_scores = np.array(val_scores) - sort_idx = np.argsort(val_scores[:, -1]) - sorted_val_scores = val_scores[sort_idx][::1] - print("best val scores = " + str(sorted_val_scores[:args.num, 1])) - print("selected epochs = " + - str(sorted_val_scores[:args.num, 0].astype(np.int64))) - path_list = [ - args.src_path + '/{}.pt'.format(int(epoch)) - for epoch in sorted_val_scores[:args.num, 0] - ] - else: - path_list = glob.glob('{}/[0-9]*.pt'.format(args.src_path)) - path_list = sorted(path_list, key=os.path.getmtime) - path_list = path_list[-args.num:] - print(path_list) - avg = None - num = args.num - assert num == len(path_list) - for path in path_list: - print('Processing {}'.format(path)) - states = torch.load(path, map_location=torch.device('cpu')) - if avg is None: - avg = states - else: - for k in avg.keys(): - avg[k] += states[k] - # average - for k in avg.keys(): - if avg[k] is not None: - # pytorch 1.6 use true_divide instead of /= - avg[k] = torch.true_divide(avg[k], num) - print('Saving to {}'.format(args.dst_model)) - torch.save(avg, args.dst_model) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/export_jit.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/export_jit.py deleted file mode 100644 index b2e5864e8382235c1cc800484ba5031ae22f3bd9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/export_jit.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os - -import torch -import yaml - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_file', default=None, help='output file') - parser.add_argument('--output_quant_file', - default=None, - help='output quantized model file') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - # No need gpu for model export - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - model = init_model(configs) - print(model) - - load_checkpoint(model, args.checkpoint) - # Export jit torch script model - - if args.output_file: - script_model = torch.jit.script(model) - script_model.save(args.output_file) - print('Export model successfully, see {}'.format(args.output_file)) - - # Export quantized jit torch script model - if args.output_quant_file: - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - script_quant_model = torch.jit.script(quantized_model) - script_quant_model.save(args.output_quant_file) - print('Export quantized model successfully, ' - 'see {}'.format(args.output_quant_file)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/export_onnx_bpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/export_onnx_bpu.py deleted file mode 100644 index 6462a69506f10778d08faae5fcf3067ad43d38bd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/export_onnx_bpu.py +++ /dev/null @@ -1,1019 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. 
specific decoding method: ctc_greedy_search -""" - - -from __future__ import print_function - -import os -import sys -import copy -import math -import yaml -import logging -from typing import Tuple - -import torch -import numpy as np - -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import (get_args, to_numpy, - print_input_output_info) - - -try: - import onnx - import onnxruntime -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class BPULayerNorm(torch.nn.Module): - """Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" - def __init__(self, module, chunk_size=8, run_on_bpu=False): - super().__init__() - original = copy.deepcopy(module) - self.hidden = module.weight.size(0) - self.chunk_size = chunk_size - self.run_on_bpu = run_on_bpu - - if self.run_on_bpu: - self.weight = torch.nn.Parameter( - module.weight.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.bias = torch.nn.Parameter( - module.bias.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.negtive = torch.nn.Parameter( - torch.ones((1, self.hidden, 1, chunk_size)) * -1.0) - self.eps = torch.nn.Parameter( - torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps) - self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_1.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_2.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - else: - self.norm = module - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.hidden) - orig_out = module(random_data) - new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) - np.testing.assert_allclose( - to_numpy(orig_out), to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.run_on_bpu: - u = self.mean_conv_1(x) # (1, h, 1, c) - numerator = x + u * self.negtive # (1, h, 1, c) - s = torch.pow(numerator, 2) # (1, h, 1, c) - s = self.mean_conv_2(s) # (1, h, 1, c) - denominator = torch.sqrt(s + self.eps) # (1, h, 1, c) - x = torch.div(numerator, denominator) # (1, h, 1, c) - x = x * self.weight + self.bias - else: - x = x.squeeze(2).transpose(1, 2).contiguous() - x = self.norm(x) - x = x.transpose(1, 2).contiguous().unsqueeze(2) - return x - - -class BPUIdentity(torch.nn.Module): - """Refactor torch.nn.Identity(). - For inserting BPU node whose input == output. - """ - def __init__(self, channels): - super().__init__() - self.channels = channels - self.identity_conv = torch.nn.Conv2d( - channels, channels, 1, groups=channels, bias=False) - torch.nn.init.dirac_( - self.identity_conv.weight.data, groups=channels) - - self.check_equal() - - def check_equal(self): - random_data = torch.randn(1, self.channels, 1, 10) - result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(random_data), to_numpy(result), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Identity with 4-D dataflow, input == output. 
- Args: - x (torch.Tensor): (batch, in_channel, 1, time) - - Returns: - (torch.Tensor): (batch, in_channel, 1, time). - """ - return self.identity_conv(x) - - -class BPULinear(torch.nn.Module): - """Refactor torch.nn.Linear or pointwise_conv""" - def __init__(self, module, is_pointwise_conv=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.weight.size(1) - self.odim = module.weight.size(0) - self.is_pointwise_conv = is_pointwise_conv - - # Modify weight & bias - self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) - if is_pointwise_conv: - # (odim, idim, kernel=1) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(-1)) - else: - # (odim, idim) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(2).unsqueeze(3)) - self.linear.bias = module.bias - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.idim) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = module(random_data) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = original_result.transpose(1, 2) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Linear with 4-D dataflow. - Args: - x (torch.Tensor): (batch, in_channel, 1, time) - Returns: - (torch.Tensor): (batch, out_channel, 1, time). - """ - return self.linear(x) - - -class BPUGlobalCMVN(torch.nn.Module): - """Refactor wenet/transformer/cmvn.py::GlobalCMVN""" - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - self.norm_var = module.norm_var - - # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1) - self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """CMVN with 4-D dataflow. - Args: - x (torch.Tensor): (batch, 1, mel_dim, time) - Returns: - (torch.Tensor): normalized feature with same shape. - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x - - -class BPUConv2dSubsampling8(torch.nn.Module): - """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 - - NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.right_context = module.right_context - self.subsampling_rate = module.subsampling_rate - assert isinstance(module.pos_enc, NoPositionalEncoding) - - # 1. Modify self.conv - # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim) - # to (1, 1, mel_dim, frames) for more efficient computation. - self.conv = module.conv - for idx in [0, 2, 4]: - self.conv[idx].weight = torch.nn.Parameter( - module.conv[idx].weight.transpose(2, 3) - ) - - # 2. 
Modify self.linear - # NOTE(xcsong): Split final projection to meet the requirment of - # maximum kernel_size (7 for XJ3) - self.linear = torch.nn.ModuleList() - odim = module.linear.weight.size(0) # 512, in this case - freq = module.linear.weight.size(1) // odim # 4608 // 512 == 9 - self.odim, self.freq = odim, freq - weight = module.linear.weight.reshape( - odim, odim, freq, 1) # (odim, odim * freq) -> (odim, odim, freq, 1) - self.split_size = [] - num_split = (freq - 1) // 7 + 1 # XJ3 requires kernel_size <= 7 - slice_begin = 0 - for idx in range(num_split): - kernel_size = min(freq, (idx + 1) * 7) - idx * 7 - conv_ele = torch.nn.Conv2d( - odim, odim, (kernel_size, 1), (kernel_size, 1)) - conv_ele.weight = torch.nn.Parameter( - weight[:, :, slice_begin:slice_begin + kernel_size, :] - ) - conv_ele.bias = torch.nn.Parameter( - torch.zeros_like(conv_ele.bias) - ) - self.linear.append(conv_ele) - self.split_size.append(kernel_size) - slice_begin += kernel_size - self.linear[0].bias = torch.nn.Parameter(module.linear.bias) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 67, 80) - mask = torch.zeros(1, 1, 67) - original_result, _, _ = module(random_data, mask) # (1, 8, 512) - random_data = random_data.transpose(1, 2).unsqueeze(0) # (1, 1, 80, 67) - new_result = self.forward(random_data) # (1, 512, 1, 8) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Subsample x with 4-D dataflow. - Args: - x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), - where time' = time // 8. - """ - x = self.conv(x) # (1, odim, freq, time') - x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) - x = torch.split(x, self.split_size, dim=2) - for idx, (x_part, layer) in enumerate(zip(x, self.linear)): - x_out += layer(x_part) - return x_out - - -class BPUMultiHeadedAttention(torch.nn.Module): - """Refactor wenet/transformer/attention.py::MultiHeadedAttention - - NOTE(xcsong): Only support attention_class == MultiHeadedAttention, - we do not consider RelPositionMultiHeadedAttention currently. - """ - def __init__(self, module, chunk_size, left_chunks): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.d_k = module.d_k - self.h = module.h - n_feat = self.d_k * self.h - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.time = chunk_size * (left_chunks + 1) - self.activation = torch.nn.Softmax(dim=-1) - - # 1. Modify self.linear_x - self.linear_q = BPULinear(module.linear_q) - self.linear_k = BPULinear(module.linear_k) - self.linear_v = BPULinear(module.linear_v) - self.linear_out = BPULinear(module.linear_out) - # 2. 
denom - self.register_buffer( - "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k))) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) - mask = torch.ones((1, self.h, self.chunk_size, self.time), - dtype=torch.bool) - cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, - self.d_k * 2) - original_out, original_cache = module( - random_data, random_data, random_data, - mask[:, 0, :, :], torch.empty(0), cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.reshape(1, self.h, self.d_k * 2, - self.chunk_size * self.left_chunks) - new_out, new_cache = self.forward( - random_data, random_data, random_data, mask, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - - def forward( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - mask: torch.Tensor, cache: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. - - Args: - q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). - k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). - v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). - mask (torch.Tensor): Mask tensor, - (#batch, head, chunk_size, cache_t + chunk_size). - cache (torch.Tensor): Cache tensor - (1, head, d_k * 2, cache_t), - where `cache_t == chunk_size * left_chunks`. - - - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: Cache tensor - (1, head, d_k * 2, cache_t + chunk_size) - where `cache_t == chunk_size * left_chunks` - """ - # 1. Forward QKV - q = self.linear_q(q) # (1, d, 1, c) d == size, c == chunk_size - k = self.linear_k(k) # (1, d, 1, c) - v = self.linear_v(v) # (1, d, 1, c) - q = q.view(1, self.h, self.d_k, self.chunk_size) - k = k.view(1, self.h, self.d_k, self.chunk_size) - v = v.view(1, self.h, self.d_k, self.chunk_size) - q = q.transpose(2, 3) # (batch, head, time1, d_k) - k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) - k = torch.cat((k_cache, k), dim=3) - v = torch.cat((v_cache, v), dim=3) - new_cache = torch.cat((k, v), dim=2) - # 2. (Q^T)K - scores = torch.matmul(q, k) * self.denom # (#b, n_head, time1, time2) - # 3. Forward attention - mask = mask.eq(0) - scores = scores.masked_fill(mask, -float('inf')) - attn = self.activation(scores).masked_fill(mask, 0.0) - attn = attn.transpose(2, 3) - x = torch.matmul(v, attn) - x = x.view(1, self.d_k * self.h, 1, self.chunk_size) - x_out = self.linear_out(x) - return x_out, new_cache - - -class BPUConvolution(torch.nn.Module): - """Refactor wenet/transformer/convolution.py::ConvolutionModule - - NOTE(xcsong): Only suport use_layer_norm == False - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.lorder = module.lorder - self.use_layer_norm = False - self.activation = module.activation - channels = module.pointwise_conv1.weight.size(1) - self.channels = channels - kernel_size = module.depthwise_conv.weight.size(2) - assert module.use_layer_norm is False - - # 1. Modify self.pointwise_conv1 - self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) - - # 2. 
Modify self.depthwise_conv - self.depthwise_conv = torch.nn.Conv2d( - channels, channels, (1, kernel_size), - stride=1, groups=channels) - self.depthwise_conv.weight = torch.nn.Parameter( - module.depthwise_conv.weight.unsqueeze(-2)) - self.depthwise_conv.bias = torch.nn.Parameter( - module.depthwise_conv.bias) - - # 3. Modify self.norm, Only support batchnorm2d - self.norm = torch.nn.BatchNorm2d(channels) - self.norm.training = False - self.norm.num_features = module.norm.num_features - self.norm.eps = module.norm.eps - self.norm.momentum = module.norm.momentum - self.norm.weight = torch.nn.Parameter(module.norm.weight) - self.norm.bias = torch.nn.Parameter(module.norm.bias) - self.norm.running_mean = module.norm.running_mean - self.norm.running_var = module.norm.running_var - - # 4. Modify self.pointwise_conv2 - self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) - - # 5. Identity conv, for running `concat` on BPU - self.identity = BPUIdentity(channels) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.channels) - cache = torch.zeros((1, self.channels, self.lorder)) - original_out, original_cache = module(random_data, cache=cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.unsqueeze(2) - new_out, new_cache = self.forward(random_data, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, 1, cache_t). - Returns: - torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). - torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). - """ - # Concat cache - x = torch.cat((self.identity(cache), self.identity(x)), dim=3) - new_cache = x[:, :, :, -self.lorder:] - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, 1, dim) - x = torch.nn.functional.glu(x, dim=1) # (b, channel, 1, dim) - - # Depthwise Conv - x = self.depthwise_conv(x) - x = self.activation(self.norm(x)) - x = self.pointwise_conv2(x) - return x, new_cache - - -class BPUFFN(torch.nn.Module): - """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.activation = module.activation - - # 1. Modify self.w_x - self.w_1 = BPULinear(module.w_1) - self.w_2 = BPULinear(module.w_2) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.w_1.idim) - original_out = module(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_out = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward function. 
- - Args: - xs: input tensor (B, D, 1, L) - Returns: - output tensor, (B, D, 1, L) - """ - return self.w_2(self.activation(self.w_1(x))) - - -class BPUConformerEncoderLayer(torch.nn.Module): - """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.size = module.size - assert module.normalize_before is True - assert module.concat_after is False - - # 1. Modify submodules - self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) - self.self_attn = BPUMultiHeadedAttention( - module.self_attn, chunk_size, left_chunks) - self.conv_module = BPUConvolution(module.conv_module) - self.feed_forward = BPUFFN(module.feed_forward) - - # 2. Modify norms - self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) - self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, ln_run_on_bpu) - self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, - chunk_size, ln_run_on_bpu) - self.norm_conv = BPULayerNorm(module.norm_conv, - chunk_size, ln_run_on_bpu) - self.norm_final = BPULayerNorm(module.norm_final, - chunk_size, ln_run_on_bpu) - - # 3. 4-D ff_scale - self.register_buffer( - "ff_scale", torch.full((1, self.size, 1, 1), module.ff_scale)) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.self_attn.chunk_size - time2 = self.self_attn.time - h, d_k = self.self_attn.h, self.self_attn.d_k - random_x = torch.randn(1, time1, self.size) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) - original_x, _, original_att_cache, original_cnn_cache = module( - random_x, att_mask[:, 0, :, :], torch.empty(0), - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.transpose(1, 2).unsqueeze(2) - att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.unsqueeze(2) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_mask, att_cache, cnn_cache - ) - np.testing.assert_allclose( - to_numpy(original_att_cache), - to_numpy(new_att_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cnn_cache), - to_numpy(new_cnn_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, att_mask: torch.Tensor, - att_cache: torch.Tensor, cnn_cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, size, 1, chunk_size) - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, d_k * 2, cache_t1), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, 1, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: att_cache tensor, - (1, head, d_k * 2, cache_t1 + chunk_size). - torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2). - """ - # 1. ffn_macaron - residual = x - x = self.norm_ff_macron(x) - x = residual + self.ff_scale * self.feed_forward_macaron(x) - - # 2. 
attention - residual = x - x = self.norm_mha(x) - x_att, new_att_cache = self.self_attn( - x, x, x, att_mask, att_cache) - x = residual + x_att - - # 3. convolution - residual = x - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, cnn_cache) - x = residual + x - - # 4. ffn - residual = x - x = self.norm_ff(x) - x = residual + self.ff_scale * self.feed_forward(x) - - # 5. final post-norm - x = self.norm_final(x) - - return x, new_att_cache, new_cnn_cache - - -class BPUConformerEncoder(torch.nn.Module): - """Refactor wenet/transformer/encoder.py::ConformerEncoder - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - output_size = module.output_size() - self._output_size = module.output_size() - self.after_norm = module.after_norm - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.head = module.encoders[0].self_attn.h - self.layers = len(module.encoders) - - # 1. Modify submodules - self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) - self.embed = BPUConv2dSubsampling8(module.embed) - self.encoders = torch.nn.ModuleList() - for layer in module.encoders: - self.encoders.append(BPUConformerEncoderLayer( - layer, chunk_size, left_chunks, ln_run_on_bpu)) - - # 2. Auxiliary conv - self.identity_cnncache = BPUIdentity(output_size) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.encoders[0].self_attn.chunk_size - time2 = self.encoders[0].self_attn.time - layers = self.layers - h, d_k = self.head, self.encoders[0].self_attn.d_k - decoding_window = (self.chunk_size - 1) * \ - module.embed.subsampling_rate + \ - module.embed.right_context + 1 - lorder = self.encoders[0].conv_module.lorder - random_x = torch.randn(1, decoding_window, 80) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) - orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( - random_x, 0, time2 - time1, att_mask=att_mask[:, 0, :, :], - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.unsqueeze(0) - att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_cache, cnn_cache, att_mask - ) - caches = torch.split(new_att_cache, h, dim=1) - caches = [c.transpose(2, 3) for c in caches] - np.testing.assert_allclose( - to_numpy(orig_att_cache), - to_numpy(torch.cat(caches, dim=0)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_cnn_cache), - to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, xs: torch.Tensor, att_cache: torch.Tensor, - cnn_cache: torch.Tensor, att_mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (1, head * elayers, d_k * 2, cache_t1), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (1, hidden-dim, elayers, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, hidden-dim, 1, chunk_size). - torch.Tensor: new attention cache required for next chunk, with - same shape as the original att_cache. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - """ - # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time) - xs = xs.transpose(2, 3) - xs = self.global_cmvn(xs) - # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size) - xs = self.embed(xs) - - att_cache = torch.split(att_cache, self.head, dim=1) - cnn_cache = self.identity_cnncache(cnn_cache) - cnn_cache = torch.split(cnn_cache, 1, dim=2) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - xs, new_att_cache, new_cnn_cache = layer( - xs, att_mask, att_cache=att_cache[i], cnn_cache=cnn_cache[i]) - r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:]) - r_cnn_cache.append(new_cnn_cache) - r_att_cache = torch.cat(r_att_cache, dim=1) - r_cnn_cache = self.identity_cnncache( - torch.cat(r_cnn_cache, dim=2)) - - xs = xs.squeeze(2).transpose(1, 2).contiguous() - xs = self.after_norm(xs) - # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input. - xs = xs.transpose(1, 2).contiguous().unsqueeze(2) # (B, C, 1, T) - - return (xs, r_att_cache, r_cnn_cache) - - -class BPUCTC(torch.nn.Module): - """Refactor wenet/transformer/ctc.py::CTC - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.ctc_lo.weight.size(1) - num_class = module.ctc_lo.weight.size(0) - - # 1. Modify self.ctc_lo, Split final projection to meet the - # requirment of maximum in/out channels (2048 for XJ3) - self.ctc_lo = torch.nn.ModuleList() - self.split_size = [] - num_split = (num_class - 1) // 2048 + 1 - for idx in range(num_split): - out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 - conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) - self.ctc_lo.append(conv_ele) - self.split_size.append(out_channel) - orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) - orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) - for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): - w = w.unsqueeze(2).unsqueeze(3) - self.ctc_lo[i].weight = torch.nn.Parameter(w) - self.ctc_lo[i].bias = torch.nn.Parameter(b) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 100, self.idim) - original_result = module.ctc_lo(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """frame activations, without softmax. 
- - Args: - Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) - Returns: - torch.Tensor: (B, num_class, 1, chunk_size) - """ - out = [] - for i, layer in enumerate(self.ctc_lo): - out.append(layer(x)) - out = torch.cat(out, dim=1) - return out - - -def export_encoder(asr_model, args): - logger.info("Stage-1: export encoder") - decode_window, mel_dim = args.decoding_window, args.feature_size - encoder = BPUConformerEncoder( - asr_model.encoder, args.chunk_size, args.num_decoding_left_chunks, - args.ln_run_on_bpu) - encoder.eval() - encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx') - - logger.info("Stage-1.1: prepare inputs for encoder") - chunk = torch.randn((1, 1, decode_window, mel_dim)) - required_cache_size = encoder.chunk_size * encoder.left_chunks - kv_time = required_cache_size + encoder.chunk_size - hidden, layers = encoder._output_size, len(encoder.encoders) - head = encoder.encoders[0].self_attn.h - d_k = hidden // head - lorder = encoder.encoders[0].conv_module.lorder - att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) - att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros((1, hidden, layers, lorder)) - inputs = (chunk, att_cache, cnn_cache, att_mask) - logger.info("chunk.size(): {} att_cache.size(): {} " - "cnn_cache.size(): {} att_mask.size(): {}".format( - list(chunk.size()), list(att_cache.size()), - list(cnn_cache.size()), list(att_mask.size()))) - - logger.info("Stage-1.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask" - attributes['output_name'] = "output;r_att_cache;r_cnn_cache" - attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap" - attributes['norm_type'] = \ - "no_preprocess;no_preprocess;no_preprocess;no_preprocess" - attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_shape'] = \ - "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( - chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3), - att_cache.size(0), att_cache.size(1), att_cache.size(2), - att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1), - cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0), - att_mask.size(1), att_mask.size(2), att_mask.size(3) - ) - torch.onnx.export( # NOTE(xcsong): only support opset==11 - encoder, inputs, encoder_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=attributes['input_name'].split(';'), - output_names=attributes['output_name'].split(';'), - dynamic_axes=None, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for k in vars(args): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - logger.info('Export onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - logger.info("Stage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - for i in range(10): - logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" - ", att_mask: {}".format( - i, list(torch_chunk.size()), - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), - list(torch_att_mask.size()))) - torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_output = torch.cat(torch_output, dim=-1) - - onnx_output = [] - onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - logger.info("onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," - " att_mask: {}".format( - i, onnx_chunk.shape, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': onnx_att_mask, - } - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_output = np.concatenate(onnx_output, axis=-1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_encoder, pass!") - return encoder, ort_session - - -def export_ctc(asr_model, args): - logger.info("Stage-2: export ctc") - ctc = BPUCTC(asr_model.ctc).eval() - ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx') - - logger.info("Stage-2.1: prepare inputs for ctc") - hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) - - logger.info("Stage-2.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'], attributes['input_type'] = "hidden", "featuremap" - attributes['norm_type'] = "no_preprocess" - attributes['input_layout_train'] = "NCHW" - attributes['input_layout_rt'] = "NCHW" - attributes['input_shape'] = "{}x{}x{}x{}".format( - hidden.size(0), hidden.size(1), hidden.size(2), hidden.size(3), - ) - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=None, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for k in vars(args): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - logger.info('Export onnx_ctc, done! 
see {}'.format(ctc_outpath)) - - logger.info("Stage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_ctc, pass!") - return ctc, ort_session - - -def export_decoder(asr_model, args): - logger.info("Currently, Decoder is not supported.") - - -if __name__ == '__main__': - torch.manual_seed(777) - args = get_args() - args.ln_run_on_bpu = False - # NOTE(xcsong): XJ3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - args.feature_size = configs['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - export_encoder(model, args) - export_ctc(model, args) - export_decoder(model, args) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/export_onnx_cpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/export_onnx_cpu.py deleted file mode 100644 index a8009d2f606f753a5870eb754235d8d55e756b5d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/export_onnx_cpu.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright (c) 2022, Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os -import copy -import sys - -import torch -import yaml -import numpy as np - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - -try: - import onnx - import onnxruntime - from onnxruntime.quantization import quantize_dynamic, QuantType -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - args = parser.parse_args() - return args - - -def to_numpy(tensor): - if tensor.requires_grad: - return tensor.detach().cpu().numpy() - else: - return tensor.cpu().numpy() - - -def print_input_output_info(onnx_model, name, prefix="\t\t"): - input_names = [node.name for node in onnx_model.graph.input] - input_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.input] - output_names = [node.name for node in onnx_model.graph.output] - output_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.output] - print("{}{} inputs : {}".format(prefix, name, input_names)) - print("{}{} input shapes : {}".format(prefix, name, input_shapes)) - print("{}{} outputs: {}".format(prefix, name, output_names)) - print("{}{} output shapes : {}".format(prefix, name, output_shapes)) - - -def export_encoder(asr_model, args): - print("Stage-1: export encoder") - encoder = asr_model.encoder - encoder.forward = encoder.forward_chunk - encoder_outpath = os.path.join(args['output_dir'], 'encoder.onnx') - - print("\tStage-1.1: prepare inputs for encoder") - chunk = torch.randn( - (args['batch'], args['decoding_window'], args['feature_size'])) - offset = 0 - # NOTE(xcsong): The uncertainty of `next_cache_start` only appears - # in the first few chunks, this is caused by dynamic att_cache shape, i,e - # (0, 0, 0, 0) for 1st chunk and (elayers, head, ?, d_k*2) for subsequent - # chunks. One way to ease the ONNX export is to keep `next_cache_start` - # as a fixed value. To do this, for the **first** chunk, if - # left_chunks > 0, we feed real cache & real mask to the model, otherwise - # fake cache & fake mask. In this way, we get: - # 1. 16/-1 mode: next_cache_start == 0 for all chunks - # 2. 16/4 mode: next_cache_start == chunk_size for all chunks - # 3. 16/0 mode: next_cache_start == chunk_size for all chunks - # 4. -1/-1 mode: next_cache_start == 0 for all chunks - # NO MORE DYNAMIC CHANGES!! - # - # NOTE(Mddct): We retain the current design for the convenience of supporting some - # inference frameworks without dynamic shapes. 
If you're interested in all-in-one - # model that supports different chunks please see: - # https://github.com/wenet-e2e/wenet/pull/1174 - - if args['left_chunks'] > 0: # 16/4 - required_cache_size = args['chunk_size'] * args['left_chunks'] - offset = required_cache_size - # Real cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], required_cache_size, - args['output_size'] // args['head'] * 2)) - # Real mask - att_mask = torch.ones( - (args['batch'], 1, required_cache_size + args['chunk_size']), - dtype=torch.bool) - att_mask[:, :, :required_cache_size] = 0 - elif args['left_chunks'] <= 0: # 16/-1, -1/-1, 16/0 - required_cache_size = -1 if args['left_chunks'] < 0 else 0 - # Fake cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], 0, - args['output_size'] // args['head'] * 2)) - # Fake mask - att_mask = torch.ones((0, 0, 0), dtype=torch.bool) - cnn_cache = torch.zeros( - (args['num_blocks'], args['batch'], - args['output_size'], args['cnn_module_kernel'] - 1)) - inputs = (chunk, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - print("\t\tchunk.size(): {}\n".format(chunk.size()), - "\t\toffset: {}\n".format(offset), - "\t\trequired_cache: {}\n".format(required_cache_size), - "\t\tatt_cache.size(): {}\n".format(att_cache.size()), - "\t\tcnn_cache.size(): {}\n".format(cnn_cache.size()), - "\t\tatt_mask.size(): {}\n".format(att_mask.size())) - - print("\tStage-1.2: torch.onnx.export") - dynamic_axes = { - 'chunk': {1: 'T'}, - 'att_cache': {2: 'T_CACHE'}, - 'att_mask': {2: 'T_ADD_T_CACHE'}, - 'output': {1: 'T'}, - 'r_att_cache': {2: 'T_CACHE'}, - } - # NOTE(xcsong): We keep dynamic axes even if in 16/4 mode, this is - # to avoid padding the last chunk (which usually contains less - # frames than required). For users who want static axes, just pop - # out specific axis. - # if args['chunk_size'] > 0: # 16/4, 16/-1, 16/0 - # dynamic_axes.pop('chunk') - # dynamic_axes.pop('output') - # if args['left_chunks'] >= 0: # 16/4, 16/0 - # # NOTE(xsong): since we feed real cache & real mask into the - # # model when left_chunks > 0, the shape of cache will never - # # be changed. - # dynamic_axes.pop('att_cache') - # dynamic_axes.pop('r_att_cache') - torch.onnx.export( - encoder, inputs, encoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=[ - 'chunk', 'offset', 'required_cache_size', - 'att_cache', 'cnn_cache', 'att_mask' - ], - output_names=['output', 'r_att_cache', 'r_cnn_cache'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for (k, v) in args.items(): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - # NOTE(xcsong): to add those metadatas we need to reopen - # the file and resave it. - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - # Dynamic quantization - model_fp32 = encoder_outpath - model_quant = os.path.join(args['output_dir'], 'encoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - print("\tStage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk = copy.deepcopy(chunk) - torch_offset = copy.deepcopy(offset) - torch_required_cache_size = copy.deepcopy(required_cache_size) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - torch_att_mask = copy.deepcopy(att_mask) - for i in range(10): - print("\t\ttorch chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, list(torch_chunk.size()), torch_offset, - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), list(torch_att_mask.size()))) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - torch_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_offset, torch_required_cache_size, - torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_offset += out.size(1) - torch_output = torch.cat(torch_output, dim=1) - - onnx_output = [] - onnx_chunk = to_numpy(chunk) - onnx_offset = np.array((offset)).astype(np.int64) - onnx_required_cache_size = np.array((required_cache_size)).astype(np.int64) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - onnx_att_mask = to_numpy(att_mask) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - print("\t\tonnx chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, onnx_chunk.shape, onnx_offset, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - onnx_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'offset': onnx_offset, - 'required_cache_size': onnx_required_cache_size, - 'att_cache': onnx_att_cache, 'cnn_cache': onnx_cnn_cache, - 'att_mask': onnx_att_mask - } - # NOTE(xcsong): If we use 16/-1, -1/-1 or 16/0 mode, `next_cache_start` - # will be hardcoded to 0 or chunk_size by ONNX, thus - # required_cache_size and att_mask are no more needed and they will - # be removed by ONNX automatically. 
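The loop that follows drops any feed that the exported graph no longer declares, for exactly the reason given in the NOTE above. A standalone sketch of the same filtering pattern, with a placeholder model path and made-up feed shapes:

```python
# Standalone sketch of the feed-filtering pattern used by the consistency
# checks: ONNX export can fold away inputs such as `required_cache_size` or
# `att_mask`, so only the inputs the session actually declares are fed.
# "encoder.onnx" and the feed shapes below are placeholders.
import numpy as np
import onnxruntime

session = onnxruntime.InferenceSession("encoder.onnx")
declared = {inp.name for inp in session.get_inputs()}

candidate_feeds = {
    "chunk": np.zeros((1, 67, 80), dtype=np.float32),
    "required_cache_size": np.array(64, dtype=np.int64),  # may have been folded away
}
feeds = {name: value for name, value in candidate_feeds.items() if name in declared}
outputs = session.run(None, feeds)
```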
- for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_offset += ort_outs[0].shape[1] - onnx_output = np.concatenate(onnx_output, axis=1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-05) - meta = ort_session.get_modelmeta() - print("\t\tcustom_metadata_map={}".format(meta.custom_metadata_map)) - print("\t\tCheck onnx_encoder, pass!") - - -def export_ctc(asr_model, args): - print("Stage-2: export ctc") - ctc = asr_model.ctc - ctc.forward = ctc.log_softmax - ctc_outpath = os.path.join(args['output_dir'], 'ctc.onnx') - - print("\tStage-2.1: prepare inputs for ctc") - hidden = torch.randn( - (args['batch'], args['chunk_size'] if args['chunk_size'] > 0 else 16, - args['output_size'])) - - print("\tStage-2.2: torch.onnx.export") - dynamic_axes = {'hidden': {1: 'T'}, 'probs': {1: 'T'}} - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for (k, v) in args.items(): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - # Dynamic quantization - model_fp32 = ctc_outpath - model_quant = os.path.join(args['output_dir'], 'ctc.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_ctc, done! see {}'.format(ctc_outpath)) - - print("\tStage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_ctc, pass!") - - -def export_decoder(asr_model, args): - print("Stage-3: export decoder") - decoder = asr_model - # NOTE(lzhin): parameters of encoder will be automatically removed - # since they are not used during rescoring. - decoder.forward = decoder.forward_attention_decoder - decoder_outpath = os.path.join(args['output_dir'], 'decoder.onnx') - - print("\tStage-3.1: prepare inputs for decoder") - # hardcode time->200 nbest->10 len->20, they are dynamic axes. 
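Each exported graph above is additionally quantized with `quantize_dynamic`, which stores the weights in uint8 while activation scales are computed on the fly at inference time. A minimal sketch of that step on its own, with placeholder file names:

```python
# Sketch of the dynamic-quantization step applied to each exported graph.
# "ctc.onnx" / "ctc.quant.onnx" are placeholder paths.
from onnxruntime.quantization import quantize_dynamic, QuantType

quantize_dynamic(
    model_input="ctc.onnx",
    model_output="ctc.quant.onnx",
    weight_type=QuantType.QUInt8,
)
```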
- encoder_out = torch.randn((1, 200, args['output_size'])) - hyps = torch.randint(low=0, high=args['vocab_size'], - size=[10, 20]) - hyps[:, 0] = args['vocab_size'] - 1 # - hyps_lens = torch.randint(low=15, high=21, size=[10]) - - print("\tStage-3.2: torch.onnx.export") - dynamic_axes = { - 'hyps': {0: 'NBEST', 1: 'L'}, 'hyps_lens': {0: 'NBEST'}, - 'encoder_out': {1: 'T'}, - 'score': {0: 'NBEST', 1: 'L'}, 'r_score': {0: 'NBEST', 1: 'L'} - } - inputs = (hyps, hyps_lens, encoder_out, args['reverse_weight']) - torch.onnx.export( - decoder, inputs, decoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hyps', 'hyps_lens', 'encoder_out', 'reverse_weight'], - output_names=['score', 'r_score'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_decoder = onnx.load(decoder_outpath) - for (k, v) in args.items(): - meta = onnx_decoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_decoder) - onnx.helper.printable_graph(onnx_decoder.graph) - onnx.save(onnx_decoder, decoder_outpath) - print_input_output_info(onnx_decoder, "onnx_decoder") - model_fp32 = decoder_outpath - model_quant = os.path.join(args['output_dir'], 'decoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_decoder, done! see {}'.format( - decoder_outpath)) - - print("\tStage-3.3: check onnx_decoder and torch_decoder") - torch_score, torch_r_score = decoder( - hyps, hyps_lens, encoder_out, args['reverse_weight']) - ort_session = onnxruntime.InferenceSession(decoder_outpath) - input_names = [node.name for node in onnx_decoder.graph.input] - ort_inputs = { - 'hyps': to_numpy(hyps), - 'hyps_lens': to_numpy(hyps_lens), - 'encoder_out': to_numpy(encoder_out), - 'reverse_weight': np.array((args['reverse_weight'])), - } - for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - onnx_output = ort_session.run(None, ort_inputs) - - np.testing.assert_allclose(to_numpy(torch_score), onnx_output[0], - rtol=1e-03, atol=1e-05) - if args['is_bidirectional_decoder'] and args['reverse_weight'] > 0.0: - np.testing.assert_allclose(to_numpy(torch_r_score), onnx_output[1], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_decoder, pass!") - - -def main(): - torch.manual_seed(777) - args = get_args() - output_dir = args.output_dir - os.system("mkdir -p " + output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - arguments = {} - arguments['output_dir'] = output_dir - arguments['batch'] = 1 - arguments['chunk_size'] = args.chunk_size - arguments['left_chunks'] = args.num_decoding_left_chunks - arguments['reverse_weight'] = args.reverse_weight - arguments['output_size'] = configs['encoder_conf']['output_size'] - arguments['num_blocks'] = configs['encoder_conf']['num_blocks'] - arguments['cnn_module_kernel'] = configs['encoder_conf'].get('cnn_module_kernel', 1) - arguments['head'] = configs['encoder_conf']['attention_heads'] - arguments['feature_size'] = configs['input_dim'] - arguments['vocab_size'] = configs['output_dim'] - # NOTE(xcsong): if chunk_size == -1, hardcode to 67 - arguments['decoding_window'] = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 if args.chunk_size > 0 else 67 - arguments['encoder'] = configs['encoder'] - 
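Every entry of this `arguments` dict is attached to the exported graphs as ONNX metadata and read back through `get_modelmeta()` when the models are checked. A minimal sketch of that round trip, using a placeholder path and made-up key/value pairs:

```python
# Sketch of the metadata round trip used by the export functions: each entry
# is stored in the ONNX file and recovered from the runtime session.
# Path and values are placeholders.
import onnx
import onnxruntime

model = onnx.load("encoder.onnx")
for key, value in {"chunk_size": 16, "left_chunks": 4}.items():
    entry = model.metadata_props.add()
    entry.key, entry.value = str(key), str(value)
onnx.save(model, "encoder.onnx")

meta = onnxruntime.InferenceSession("encoder.onnx").get_modelmeta()
print(meta.custom_metadata_map)  # {'chunk_size': '16', 'left_chunks': '4'}
```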
arguments['decoder'] = configs['decoder'] - arguments['subsampling_rate'] = model.subsampling_rate() - arguments['right_context'] = model.right_context() - arguments['sos_symbol'] = model.sos_symbol() - arguments['eos_symbol'] = model.eos_symbol() - arguments['is_bidirectional_decoder'] = 1 \ - if model.is_bidirectional_decoder() else 0 - - # NOTE(xcsong): Please note that -1/-1 means non-streaming model! It is - # not a [16/4 16/-1 16/0] all-in-one model and it should not be used in - # streaming mode (i.e., setting chunk_size=16 in `decoder_main`). If you - # want to use 16/-1 or any other streaming mode in `decoder_main`, - # please export onnx in the same config. - if arguments['left_chunks'] > 0: - assert arguments['chunk_size'] > 0 # -1/4 not supported - - export_encoder(model, arguments) - export_ctc(model, arguments) - export_decoder(model, arguments) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/export_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/export_onnx_gpu.py deleted file mode 100644 index 19f810c2804efdf74ff369f780fa3102e2e389fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/export_onnx_gpu.py +++ /dev/null @@ -1,1056 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import os -import sys - -import torch -import yaml -import logging - -import torch.nn.functional as F -from wenet.utils.checkpoint import load_checkpoint -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import BaseEncoder -from wenet.utils.init_model import init_model -from wenet.utils.mask import make_pad_mask - -try: - import onnxruntime -except ImportError: - print('Please install onnxruntime-gpu!') - sys.exit(1) - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class Encoder(torch.nn.Module): - def __init__(self, - encoder: BaseEncoder, - ctc: CTC, - beam_size: int = 10): - super().__init__() - self.encoder = encoder - self.ctc = ctc - self.beam_size = beam_size - - def forward(self, speech: torch.Tensor, - speech_lengths: torch.Tensor,): - """Encoder - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - Returns: - encoder_out: B x T x F - encoder_out_lens: B - ctc_log_probs: B x T x V - beam_log_probs: B x T x beam_size - beam_log_probs_idx: B x T x beam_size - """ - encoder_out, encoder_mask = self.encoder(speech, - speech_lengths, - -1, -1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_log_probs = self.ctc.log_softmax(encoder_out) - encoder_out_lens = encoder_out_lens.int() - beam_log_probs, beam_log_probs_idx = torch.topk( - ctc_log_probs, self.beam_size, dim=2) - return encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx - - -class StreamingEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size, transformer=False): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.transformer = transformer - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoder.encoders): - xs, _, new_att_cache, new_cnn_cache = layer( - xs, masks, pos_emb, - att_cache=att_cache[i], - cnn_cache=cnn_cache[i]) - # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (B, hidden-dim, cache_t2) - r_att_cache.append( - new_att_cache[:, :, next_cache_start:, :].unsqueeze(1)) - if not self.transformer: - r_cnn_cache.append(new_cnn_cache.unsqueeze(1)) - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - if not self.transformer: - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingSqueezeformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.reduce_idx = model.encoder.reduce_idx - self.recover_idx = model.encoder.recover_idx - if self.reduce_idx is None: - self.time_reduce = None - else: - if self.recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - 
recover_exp)) - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - elayers, cache_size = att_cache.size(0), att_cache.size(3) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = att_mask[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.encoder.preln(xs) - for i, layer in enumerate(self.encoder.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append( - (xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.encoder.time_reduction_layer( - xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - if self.encoder.pos_enc_layer_type == "rel_pos_repaired": - pos_emb = 
pos_emb[:, :xs.size(1) * 2 - 1, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.encoder.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(1) - cached_att = cached_att.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1)) - r_cnn_cache.append(cached_cnn) - - chunk_out = xs - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingEfficientConformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - - # Efficient Conformer - self.stride_layer_idx = model.encoder.stride_layer_idx - self.stride = model.encoder.stride - self.num_blocks = model.encoder.num_blocks - self.cnn_module_kernel = model.encoder.cnn_module_kernel - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - chunk_lens (torch.Tensor): - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * 
num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) # (b, ) - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - # Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2) - # Shape(cnn_cache): (elayers, b, outsize, cnn_kernel) - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = chunk_mask.to(torch.bool) - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoder.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - # We propose to double the chunk_size. 
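The `::factor` cache slicing in this loop relies on `calculate_downsampling_factor` defined above. A standalone worked example, assuming illustrative Efficient Conformer settings (`stride_layer_idx=[3, 7]` and `stride=[2, 2]` are not taken from any config here):

```python
# Standalone version of calculate_downsampling_factor with illustrative
# Efficient Conformer settings (assumed, not read from this repository).
stride_layer_idx = [3, 7]
stride = [2, 2]

def downsampling_factor(i: int) -> int:
    factor = 1
    for idx, stride_idx in enumerate(stride_layer_idx):
        if i > stride_idx:
            factor *= stride[idx]
    return factor

print([downsampling_factor(i) for i in range(10)])
# [1, 1, 1, 1, 2, 2, 2, 2, 4, 4] -> layers after each stride layer see a coarser cache
```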
- att_cache_trunc = xs.size(1) + \ - att_cache.size(3) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i][:, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [batch, 1, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID - - # use repeat_interleave to new_att_cache - # new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - new_att_cache = new_att_cache.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :].unsqueeze(1)) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - # shape of r_att_cache: (b, elayers, head, time2, outdim) - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - # shape of r_cnn_cache: (b, elayers, outdim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - chunk_out_lens += 1 - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class Decoder(torch.nn.Module): - def __init__(self, - decoder: TransformerDecoder, - ctc_weight: float = 0.5, - reverse_weight: float = 0.0, - beam_size: int = 10, - decoder_fastertransformer: bool = False): - super().__init__() - self.decoder = decoder - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - self.beam_size = beam_size - self.decoder_fastertransformer = decoder_fastertransformer - - def forward(self, - encoder_out: torch.Tensor, - encoder_lens: torch.Tensor, - hyps_pad_sos_eos: torch.Tensor, - hyps_lens_sos: torch.Tensor, - r_hyps_pad_sos_eos: torch.Tensor, - ctc_score: torch.Tensor): - """Encoder - Args: - encoder_out: B x T x F - encoder_lens: B - hyps_pad_sos_eos: B x beam x (T2+1), - hyps with sos & eos and padded by ignore id - hyps_lens_sos: B x beam, length for each hyp with sos - r_hyps_pad_sos_eos: B 
x beam x (T2+1), - reversed hyps with sos & eos and padded by ignore id - ctc_score: B x beam, ctc score for each hyp - Returns: - decoder_out: B x beam x T2 x V - r_decoder_out: B x beam x T2 x V - best_index: B - """ - B, T, F = encoder_out.shape - bz = self.beam_size - B2 = B * bz - encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F) - encoder_mask = ~make_pad_mask(encoder_lens, T).unsqueeze(1) - encoder_mask = encoder_mask.repeat(1, bz, 1).view(B2, 1, T) - T2 = hyps_pad_sos_eos.shape[2] - 1 - hyps_pad = hyps_pad_sos_eos.view(B2, T2 + 1) - hyps_lens = hyps_lens_sos.view(B2,) - hyps_pad_sos = hyps_pad[:, :-1].contiguous() - hyps_pad_eos = hyps_pad[:, 1:].contiguous() - - r_hyps_pad = r_hyps_pad_sos_eos.view(B2, T2 + 1) - r_hyps_pad_sos = r_hyps_pad[:, :-1].contiguous() - r_hyps_pad_eos = r_hyps_pad[:, 1:].contiguous() - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad_sos, hyps_lens, r_hyps_pad_sos, - self.reverse_weight) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - V = decoder_out.shape[-1] - decoder_out = decoder_out.view(B2, T2, V) - mask = ~make_pad_mask(hyps_lens, T2) # B2 x T2 - # mask index, remove ignore id - index = torch.unsqueeze(hyps_pad_eos * mask, 2) - score = decoder_out.gather(2, index).squeeze(2) # B2 X T2 - # mask padded part - score = score * mask - decoder_out = decoder_out.view(B, bz, T2, V) - if self.reverse_weight > 0: - r_decoder_out = torch.nn.functional.log_softmax( - r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.view(B2, T2, V) - index = torch.unsqueeze(r_hyps_pad_eos * mask, 2) - r_score = r_decoder_out.gather(2, index).squeeze(2) - r_score = r_score * mask - score = score * (1 - self.reverse_weight) + \ - self.reverse_weight * r_score - r_decoder_out = r_decoder_out.view(B, bz, T2, V) - score = torch.sum(score, axis=1) # B2 - score = torch.reshape(score, (B, bz)) + self.ctc_weight * ctc_score - best_index = torch.argmax(score, dim=1) - if self.decoder_fastertransformer: - return decoder_out, best_index - else: - return best_index - - -def to_numpy(tensors): - out = [] - if type(tensors) == torch.tensor: - tensors = [tensors] - for tensor in tensors: - if tensor.requires_grad: - tensor = tensor.detach().cpu().numpy() - else: - tensor = tensor.cpu().numpy() - out.append(tensor) - return out - - -def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True): - for a, b in zip(xlist, blist): - try: - torch.testing.assert_allclose(a, b, rtol=rtol, atol=atol) - except AssertionError as error: - if tolerate_small_mismatch: - print(error) - else: - raise - - -def export_offline_encoder(model, configs, args, logger, encoder_onnx_path): - bz = 32 - seq_len = 100 - beam_size = args.beam_size - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint( - low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - dynamic_axes={ - 'speech': {0: 'B', 1: 'T'}, - 'speech_lengths': {0: 'B'}, - 'encoder_out': {0: 'B', 1: 'T_OUT'}, - 'encoder_out_lens': {0: 'B'}, - 'ctc_log_probs': {0: 'B', 1: 'T_OUT'}, - 'beam_log_probs': {0: 'B', 1: 
'T_OUT'}, - 'beam_log_probs_idx': {0: 'B', 1: 'T_OUT'}, - }, - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - -def export_offline_encoder_static(model, configs, args, logger, encoder_onnx_path): - bz = args.batch_size - seq_len = args.seq_len - beam_size = args.beam_size - - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint(low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - import os - file_name, file_ext = os.path.splitext(encoder_onnx_path) - encoder_onnx_path = file_name + "_bs" + str(bz) + "_seq" + str(seq_len) + "_static.onnx" - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CPUExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - - -def export_online_encoder(model, configs, args, logger, encoder_onnx_path): - decoding_chunk_size = args.decoding_chunk_size - subsampling = model.encoder.embed.subsampling_rate - context = model.encoder.embed.right_context + 1 - decoding_window = (decoding_chunk_size - 1) * subsampling + context - batch_size = 32 - audio_len = decoding_window - feature_size = configs["input_dim"] - output_size = configs["encoder_conf"]["output_size"] - num_layers = configs["encoder_conf"]["num_blocks"] - # in transformer the cnn module will not be available - transformer = False - cnn_module_kernel = configs["encoder_conf"].get("cnn_module_kernel", 1) - 1 - if not cnn_module_kernel: - transformer = True - num_decoding_left_chunks = args.num_decoding_left_chunks - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if configs['encoder'] == 'squeezeformer': - encoder = StreamingSqueezeformerEncoder( - model, required_cache_size, args.beam_size) - elif configs['encoder'] == 'efficientConformer': - encoder = StreamingEfficientConformerEncoder( - model, required_cache_size, args.beam_size) - else: - encoder = StreamingEncoder( - model, required_cache_size, args.beam_size, transformer) - encoder.eval() - - # begin to export encoder - chunk_xs = 
torch.randn(batch_size, audio_len, - feature_size, dtype=torch.float32) - chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len - - offset = torch.arange(0, batch_size).unsqueeze(1) - # (elayers, b, head, cache_t1, d_k * 2) - head = configs["encoder_conf"]["attention_heads"] - d_k = configs["encoder_conf"]["output_size"] // head - att_cache = torch.randn(batch_size, num_layers, head, - required_cache_size, d_k * 2, - dtype=torch.float32) - cnn_cache = torch.randn(batch_size, num_layers, output_size, - cnn_module_kernel, dtype=torch.float32) - - cache_mask = torch.ones( - batch_size, 1, required_cache_size, dtype=torch.float32) - input_names = ['chunk_xs', 'chunk_lens', 'offset', - 'att_cache', 'cnn_cache', 'cache_mask'] - output_names = ['log_probs', 'log_probs_idx', 'chunk_out', - 'chunk_out_lens', 'r_offset', 'r_att_cache', - 'r_cnn_cache', 'r_cache_mask'] - input_tensors = (chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - output_names.pop(6) - - all_names = input_names + output_names - dynamic_axes = {} - for name in all_names: - # only the first dimension is dynamic - # all other dimension is fixed - dynamic_axes[name] = {0: 'B'} - - torch.onnx.export(encoder, - input_tensors, - encoder_onnx_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - verbose=False) - - with torch.no_grad(): - torch_outs = encoder(chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - torch_outs = list(torch_outs).pop(6) - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=["CUDAExecutionProvider"]) - ort_inputs = {} - - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - if transformer: - del ort_inputs['cnn_cache'] - ort_outs = ort_session.run(None, ort_inputs) - test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx streaming encoder succeed!") - onnx_config = { - "subsampling_rate": subsampling, - "context": context, - "decoding_chunk_size": decoding_chunk_size, - "num_decoding_left_chunks": num_decoding_left_chunks, - "beam_size": args.beam_size, - "fp16": args.fp16, - "feat_size": feature_size, - "decoding_window": decoding_window, - "cnn_module_kernel_cache": cnn_module_kernel - } - return onnx_config - - -def export_rescoring_decoder(model, configs, args, - logger, decoder_onnx_path, decoder_fastertransformer): - bz, seq_len = 32, 100 - beam_size = args.beam_size - decoder = Decoder(model.decoder, - model.ctc_weight, - model.reverse_weight, - beam_size, - decoder_fastertransformer) - decoder.eval() - - hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - hyps_lens_sos = torch.randint(low=3, high=seq_len, size=(bz, beam_size), - dtype=torch.int32) - r_hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - - output_size = configs["encoder_conf"]["output_size"] - encoder_out = torch.randn(bz, seq_len, output_size, dtype=torch.float32) - encoder_out_lens = torch.randint( - low=3, high=seq_len, size=(bz,), dtype=torch.int32) - ctc_score = torch.randn(bz, beam_size, dtype=torch.float32) - - input_names = ['encoder_out', 'encoder_out_lens', - 'hyps_pad_sos_eos', 'hyps_lens_sos', - 'r_hyps_pad_sos_eos', 'ctc_score'] - output_names = ['best_index'] - if decoder_fastertransformer: - output_names.insert(0, 'decoder_out') - - 
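The `Decoder` module exported just below fuses left-to-right and right-to-left attention scores with `reverse_weight`, adds the CTC prefix scores weighted by `ctc_weight`, and picks the best hypothesis per utterance. A tiny numeric illustration with made-up scores:

```python
# Tiny numeric illustration of the rescoring fusion performed by the Decoder
# module. The scores below are per-hypothesis summed log-probs and are made up.
import numpy as np

ctc_weight, reverse_weight = 0.5, 0.3
attn_score = np.array([[-3.2, -4.1]])    # (batch=1, beam=2) left-to-right decoder scores
r_attn_score = np.array([[-3.5, -3.9]])  # right-to-left decoder scores
ctc_score = np.array([[-4.0, -1.0]])     # CTC prefix scores per hypothesis

fused = (1 - reverse_weight) * attn_score + reverse_weight * r_attn_score
fused = fused + ctc_weight * ctc_score
best_index = fused.argmax(axis=1)
print(best_index)  # [1]: the second hypothesis has the highest fused score
```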
torch.onnx.export(decoder, - (encoder_out, encoder_out_lens, - hyps_pad_sos_eos, hyps_lens_sos, - r_hyps_pad_sos_eos, ctc_score), - decoder_onnx_path, - export_params=True, - opset_version=13, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes={'encoder_out': {0: 'B', 1: 'T'}, - 'encoder_out_lens': {0: 'B'}, - 'hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'hyps_lens_sos': {0: 'B'}, - 'r_hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'ctc_score': {0: 'B'}, - 'best_index': {0: 'B'}, - }, - verbose=False - ) - with torch.no_grad(): - o0 = decoder(encoder_out, - encoder_out_lens, - hyps_pad_sos_eos, - hyps_lens_sos, - r_hyps_pad_sos_eos, - ctc_score) - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(decoder_onnx_path, - providers=providers) - - input_tensors = [encoder_out, encoder_out_lens, hyps_pad_sos_eos, - hyps_lens_sos, r_hyps_pad_sos_eos, ctc_score] - ort_inputs = {} - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - - # if model.reverse weight == 0, - # the r_hyps_pad will be removed - # from the onnx decoder since it doen't play any role - if model.reverse_weight == 0: - del ort_inputs['r_hyps_pad_sos_eos'] - ort_outs = ort_session.run(None, ort_inputs) - - # check decoder output - if decoder_fastertransformer: - test(to_numpy(o0), ort_outs, rtol=1e-03, atol=1e-05) - else: - test(to_numpy([o0]), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx decoder succeed!") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='export x86_gpu model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--cmvn_file', required=False, default='', type=str, - help='global_cmvn file, default path is in config file') - parser.add_argument('--reverse_weight', default=-1.0, type=float, - required=False, - help='reverse weight for bitransformer,' + - 'default value is in config file') - parser.add_argument('--ctc_weight', default=-1.0, type=float, - required=False, - help='ctc weight, default value is in config file') - parser.add_argument('--batch_size', type=int, default=24, help='encoder batch size') - parser.add_argument('--seq_len', default=512, type=int, required=False, - help="Encoder seq_len") - parser.add_argument('--beam_size', default=10, type=int, required=False, - help="beam size would be ctc output size") - parser.add_argument('--output_onnx_dir', - default="onnx_model", - help='output onnx encoder and decoder directory') - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - # arguments for streaming encoder - parser.add_argument('--streaming', - action='store_true', - help="whether to export streaming encoder, default false") - parser.add_argument('--decoding_chunk_size', - default=16, - type=int, - required=False, - help='the decoding chunk size, <=0 is not supported') - parser.add_argument('--num_decoding_left_chunks', - default=5, - type=int, - required=False, - help="number of left chunks, <= 0 is not supported") - parser.add_argument('--decoder_fastertransformer', - action='store_true', - help='return decoder_out and best_index for ft') - args = parser.parse_args() - - torch.manual_seed(0) - torch.set_printoptions(precision=10) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if 
args.cmvn_file and os.path.exists(args.cmvn_file): - configs['cmvn_file'] = args.cmvn_file - if args.reverse_weight != -1.0 and 'reverse_weight' in configs['model_conf']: - configs['model_conf']['reverse_weight'] = args.reverse_weight - print("Update reverse weight to", args.reverse_weight) - if args.ctc_weight != -1: - print("Update ctc weight to ", args.ctc_weight) - configs['model_conf']['ctc_weight'] = args.ctc_weight - configs["encoder_conf"]["use_dynamic_chunk"] = False - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - - if not os.path.exists(args.output_onnx_dir): - os.mkdir(args.output_onnx_dir) - encoder_onnx_path = os.path.join(args.output_onnx_dir, 'encoder.onnx') - export_enc_func = None - if args.streaming: - assert args.decoding_chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - export_enc_func = export_online_encoder - else: - export_enc_func = export_offline_encoder_static - - onnx_config = export_enc_func( - model, configs, args, logger, encoder_onnx_path) - - decoder_onnx_path = os.path.join(args.output_onnx_dir, 'decoder.onnx') - export_rescoring_decoder(model, configs, args, logger, - decoder_onnx_path, args.decoder_fastertransformer) - - if args.fp16: - try: - import onnxmltools - from onnxmltools.utils.float16_converter import convert_float_to_float16 - except ImportError: - print('Please install onnxmltools!') - sys.exit(1) - encoder_onnx_model = onnxmltools.utils.load_model(encoder_onnx_path) - encoder_onnx_model = convert_float_to_float16(encoder_onnx_model) - encoder_onnx_path = os.path.join( - args.output_onnx_dir, 'encoder_fp16.onnx') - onnxmltools.utils.save_model(encoder_onnx_model, encoder_onnx_path) - decoder_onnx_model = onnxmltools.utils.load_model(decoder_onnx_path) - decoder_onnx_model = convert_float_to_float16(decoder_onnx_model) - decoder_onnx_path = os.path.join( - args.output_onnx_dir, 'decoder_fp16.onnx') - onnxmltools.utils.save_model(decoder_onnx_model, decoder_onnx_path) - # dump configurations - - config_dir = os.path.join(args.output_onnx_dir, "config.yaml") - with open(config_dir, "w") as out: - yaml.dump(onnx_config, out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/recognize.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/recognize.py deleted file mode 100644 index 03b5dfd42cc098efacd20e08756a5300f6477cc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/recognize.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
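The optional FP16 step at the end of the export script above converts the saved FP32 encoder and decoder graphs with onnxmltools. A minimal sketch of that conversion, with placeholder file names:

```python
# Sketch of the optional FP16 conversion applied to the exported graphs.
# "encoder.onnx" / "encoder_fp16.onnx" are placeholder paths.
import onnxmltools
from onnxmltools.utils.float16_converter import convert_float_to_float16

model_fp32 = onnxmltools.utils.load_model("encoder.onnx")
model_fp16 = convert_float_to_float16(model_fp32)
onnxmltools.utils.save_model(model_fp16, "encoder_fp16.onnx")
```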
- -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--beam_size', - type=int, - default=10, - help='beam size for search') - parser.add_argument('--penalty', - type=float, - default=0.0, - help='length penalty') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=16, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'attention', 'ctc_greedy_search', - 'ctc_prefix_beam_search', 'attention_rescoring', - 'rnnt_greedy_search', 'rnnt_beam_search', - 'rnnt_beam_attn_rescoring', 'ctc_beam_td_attn_rescoring', - 'hlg_onebest', 'hlg_rescore' - ], - default='attention', - help='decoding mode') - - parser.add_argument('--search_ctc_weight', - type=float, - default=1.0, - help='ctc weight for nbest generation') - parser.add_argument('--search_transducer_weight', - type=float, - default=0.0, - help='transducer weight for nbest generation') - parser.add_argument('--ctc_weight', - type=float, - default=0.0, - help='ctc weight for rescoring weight in \ - attention rescoring decode mode \ - ctc weight for rescoring weight in \ - transducer attention rescore decode mode') - - parser.add_argument('--transducer_weight', - type=float, - default=0.0, - help='transducer weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--attn_weight', - type=float, - default=0.0, - help='attention weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--decoding_chunk_size', - type=int, - default=-1, - help='''decoding chunk size, - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here''') - parser.add_argument('--num_decoding_left_chunks', - type=int, - default=-1, - help='number of left chunks for decoding') - parser.add_argument('--simulate_streaming', - action='store_true', - help='simulate streaming inference') - parser.add_argument('--reverse_weight', - type=float, - default=0.0, - help='''right to left weight for attention rescoring - decode mode''') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--connect_symbol', - default='', - type=str, - help='used to connect the output characters') - - parser.add_argument('--word', - default='', - type=str, - help='word file, only used for hlg decode') - parser.add_argument('--hlg', - default='', - type=str, - help='hlg file, only used for hlg decode') - parser.add_argument('--lm_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--r_decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' - ] and args.batch_size > 1: - logging.fatal( - 'decoding mode {} must be running with batch_size == 1'.format( - args.mode)) - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_sub'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - if 'fbank_conf' in test_conf: - test_conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in test_conf: - test_conf['mfcc_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - # Load dict - char_dict = {v: k for k, v in symbol_table.items()} - eos = len(char_dict) - 1 - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w') as fout: - for batch_idx, 
batch in enumerate(test_data_loader): - keys, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - if args.mode == 'attention': - hyps, _ = model.recognize( - feats, - feats_lengths, - beam_size=args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp.tolist() for hyp in hyps] - elif args.mode == 'ctc_greedy_search': - hyps, _ = model.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_greedy_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_beam_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.beam_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.search_ctc_weight, - transducer_weight=args.search_transducer_weight) - elif args.mode == 'rnnt_beam_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight) - elif args.mode == 'ctc_beam_td_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight, - beam_search_type='ctc') - # ctc_prefix_beam_search and attention_rescoring only return one - # result in List[int], change it to List[List[int]] for compatible - # with other batch decoding mode - elif args.mode == 'ctc_prefix_beam_search': - assert (feats.size(0) == 1) - hyp, _ = model.ctc_prefix_beam_search( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp] - elif args.mode == 'attention_rescoring': - assert (feats.size(0) == 1) - hyp, _ = model.attention_rescoring( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - 
ctc_weight=args.ctc_weight, - simulate_streaming=args.simulate_streaming, - reverse_weight=args.reverse_weight) - hyps = [hyp] - elif args.mode == 'hlg_onebest': - hyps = model.hlg_onebest( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - elif args.mode == 'hlg_rescore': - hyps = model.hlg_rescore( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - lm_scale=args.lm_scale, - decoder_scale=args.decoder_scale, - r_decoder_scale=args.r_decoder_scale, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - for i, key in enumerate(keys): - content = [] - for w in hyps[i]: - if w == eos: - break - content.append(char_dict[w]) - logging.info('{} {}'.format(key, args.connect_symbol.join(content))) - fout.write('{} {}\n'.format(key, args.connect_symbol.join(content))) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/recognize_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/recognize_onnx_gpu.py deleted file mode 100644 index 42f403bf55ac0bc51d9c754574d3479345948122..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/recognize_onnx_gpu.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is for testing exported onnx encoder and decoder from -export_onnx_gpu.py. The exported onnx models only support batch offline ASR inference. -It requires a python wrapped c++ ctc decoder. 
-Please install it by following: -https://github.com/Slyne/ctc_decoder.git -""" -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.common import IGNORE_ID -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.config import override_config - -import onnxruntime as rt -import multiprocessing -import numpy as np - -try: - from swig_decoders import map_batch, \ - ctc_beam_search_decoder_batch, \ - TrieVector, PathTrie -except ImportError: - print('Please install ctc decoders first by refering to\n' + - 'https://github.com/Slyne/ctc_decoder.git') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--encoder_onnx', required=True, help='encoder onnx file') - parser.add_argument('--decoder_onnx', required=True, help='decoder onnx file') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=32, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'ctc_greedy_search', 'ctc_prefix_beam_search', - 'attention_rescoring'], - default='attention_rescoring', - help='decoding mode') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - reverse_weight = configs["model_conf"].get("reverse_weight", 0.0) - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - test_conf['fbank_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - 
- # Init asr model from configs - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - if use_cuda: - EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - else: - EP_list = ['CPUExecutionProvider'] - - encoder_ort_session = rt.InferenceSession(args.encoder_onnx, providers=EP_list) - decoder_ort_session = None - if args.mode == "attention_rescoring": - decoder_ort_session = rt.InferenceSession(args.decoder_onnx, providers=EP_list) - - # Load dict - vocabulary = [] - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - vocabulary.append(arr[0]) - eos = sos = len(char_dict) - 1 - with torch.no_grad(), open(args.result_file, 'w') as fout: - for _, batch in enumerate(test_data_loader): - keys, feats, _, feats_lengths, _ = batch - feats, feats_lengths = feats.numpy(), feats_lengths.numpy() - if args.fp16: - feats = feats.astype(np.float16) - ort_inputs = { - encoder_ort_session.get_inputs()[0].name: feats, - encoder_ort_session.get_inputs()[1].name: feats_lengths} - ort_outs = encoder_ort_session.run(None, ort_inputs) - encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx = ort_outs - beam_size = beam_log_probs.shape[-1] - batch_size = beam_log_probs.shape[0] - num_processes = min(multiprocessing.cpu_count(), batch_size) - if args.mode == 'ctc_greedy_search': - if beam_size != 1: - log_probs_idx = beam_log_probs_idx[:, :, 0] - batch_sents = [] - for idx, seq in enumerate(log_probs_idx): - batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) - hyps = map_batch(batch_sents, vocabulary, num_processes, - True, 0) - elif args.mode in ('ctc_prefix_beam_search', "attention_rescoring"): - batch_log_probs_seq_list = beam_log_probs.tolist() - batch_log_probs_idx_list = beam_log_probs_idx.tolist() - batch_len_list = encoder_out_lens.tolist() - batch_log_probs_seq = [] - batch_log_probs_ids = [] - batch_start = [] # only effective in streaming deployment - batch_root = TrieVector() - root_dict = {} - for i in range(len(batch_len_list)): - num_sent = batch_len_list[i] - batch_log_probs_seq.append( - batch_log_probs_seq_list[i][0:num_sent]) - batch_log_probs_ids.append( - batch_log_probs_idx_list[i][0:num_sent]) - root_dict[i] = PathTrie() - batch_root.append(root_dict[i]) - batch_start.append(True) - score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq, - batch_log_probs_ids, - batch_root, - batch_start, - beam_size, - num_processes, - 0, -2, 0.99999) - if args.mode == 'ctc_prefix_beam_search': - hyps = [] - for cand_hyps in score_hyps: - hyps.append(cand_hyps[0][1]) - hyps = map_batch(hyps, vocabulary, num_processes, False, 0) - if args.mode == 'attention_rescoring': - ctc_score, all_hyps = [], [] - max_len = 0 - for hyps in score_hyps: - cur_len = len(hyps) - if len(hyps) < beam_size: - hyps += (beam_size - cur_len) * [(-float("INF"), (0,))] - cur_ctc_score = [] - for hyp in hyps: - cur_ctc_score.append(hyp[0]) - all_hyps.append(list(hyp[1])) - if len(hyp[1]) > max_len: - max_len = len(hyp[1]) - ctc_score.append(cur_ctc_score) - if args.fp16: - ctc_score = np.array(ctc_score, dtype=np.float16) - else: - ctc_score = np.array(ctc_score, dtype=np.float32) - hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - r_hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32) - k = 0 - for i in 
range(batch_size): - for j in range(beam_size): - cand = all_hyps[k] - l = len(cand) + 2 - hyps_pad_sos_eos[i][j][0:l] = [sos] + cand + [eos] - r_hyps_pad_sos_eos[i][j][0:l] = [sos] + cand[::-1] + [eos] - hyps_lens_sos[i][j] = len(cand) + 1 - k += 1 - decoder_ort_inputs = { - decoder_ort_session.get_inputs()[0].name: encoder_out, - decoder_ort_session.get_inputs()[1].name: encoder_out_lens, - decoder_ort_session.get_inputs()[2].name: hyps_pad_sos_eos, - decoder_ort_session.get_inputs()[3].name: hyps_lens_sos, - decoder_ort_session.get_inputs()[-1].name: ctc_score} - if reverse_weight > 0: - r_hyps_pad_sos_eos_name = decoder_ort_session.get_inputs()[4].name - decoder_ort_inputs[r_hyps_pad_sos_eos_name] = r_hyps_pad_sos_eos - best_index = decoder_ort_session.run(None, decoder_ort_inputs)[0] - best_sents = [] - k = 0 - for idx in best_index: - cur_best_sent = all_hyps[k: k + beam_size][idx] - best_sents.append(cur_best_sent) - k += beam_size - hyps = map_batch(best_sents, vocabulary, num_processes) - - for i, key in enumerate(keys): - content = hyps[i] - logging.info('{} {}'.format(key, content)) - fout.write('{} {}\n'.format(key, content)) - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/train.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/train.py deleted file mode 100644 index 70799b60790b31d73911770891f519f5473e2f4b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/bin/train.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os - -import torch -import torch.distributed as dist -import torch.optim as optim -import yaml -from tensorboardX import SummaryWriter -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import (load_checkpoint, save_checkpoint, - load_trained_modules) -from wenet.utils.executor import Executor -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.scheduler import WarmupLR, NoamHoldAnnealing -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--train_data', required=True, help='train data file') - parser.add_argument('--cv_data', required=True, help='cv data file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--model_dir', required=True, help='save model dir') - parser.add_argument('--checkpoint', help='checkpoint model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='tensorboard log dir') - parser.add_argument('--ddp.rank', - dest='rank', - default=0, - type=int, - help='global rank for distributed training') - parser.add_argument('--ddp.world_size', - dest='world_size', - default=-1, - type=int, - help='''number of total processes/gpus for - distributed training''') - parser.add_argument('--ddp.dist_backend', - dest='dist_backend', - default='nccl', - choices=['nccl', 'gloo'], - help='distributed backend') - parser.add_argument('--ddp.init_method', - dest='init_method', - default=None, - help='ddp init method') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for reading') - parser.add_argument('--pin_memory', - action='store_true', - default=False, - help='Use pinned memory buffers used for reading') - parser.add_argument('--use_amp', - action='store_true', - default=False, - help='Use automatic mixed precision training') - parser.add_argument('--fp16_grad_sync', - action='store_true', - default=False, - help='Use fp16 gradient sync for ddp') - parser.add_argument('--cmvn', default=None, help='global cmvn file') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. 
One symbol per line.") - parser.add_argument('--prefetch', - default=100, - type=int, - help='prefetch number') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument("--enc_init", - default=None, - type=str, - help="Pre-trained model to initialize encoder") - parser.add_argument("--enc_init_mods", - default="encoder.", - type=lambda s: [str(mod) for mod in s.split(",") if s != ""], - help="List of encoder modules \ - to initialize ,separated by a comma") - - - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Set random seed - torch.manual_seed(777) - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - distributed = args.world_size > 1 - if distributed: - logging.info('training on multiple gpus, this gpu {}'.format(args.gpu)) - dist.init_process_group(args.dist_backend, - init_method=args.init_method, - world_size=args.world_size, - rank=args.rank) - - symbol_table = read_symbol_table(args.symbol_table) - - train_conf = configs['dataset_conf'] - cv_conf = copy.deepcopy(train_conf) - cv_conf['speed_perturb'] = False - cv_conf['spec_aug'] = False - cv_conf['spec_sub'] = False - cv_conf['spec_trim'] = False - cv_conf['shuffle'] = False - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - train_dataset = Dataset(args.data_type, args.train_data, symbol_table, - train_conf, args.bpe_model, non_lang_syms, True) - cv_dataset = Dataset(args.data_type, - args.cv_data, - symbol_table, - cv_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - train_data_loader = DataLoader(train_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - cv_data_loader = DataLoader(cv_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - - if 'fbank_conf' in configs['dataset_conf']: - input_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - else: - input_dim = configs['dataset_conf']['mfcc_conf']['num_mel_bins'] - vocab_size = len(symbol_table) - - # Save configs to model_dir/train.yaml for inference and export - configs['input_dim'] = input_dim - configs['output_dim'] = vocab_size - configs['cmvn_file'] = args.cmvn - configs['is_json_cmvn'] = True - if args.rank == 0: - saved_config_path = os.path.join(args.model_dir, 'train.yaml') - with open(saved_config_path, 'w') as fout: - data = yaml.dump(configs) - fout.write(data) - - # Init asr model from configs - model = init_model(configs) - print(model) - num_params = sum(p.numel() for p in model.parameters()) - print('the number of model params: {:,d}'.format(num_params)) - - # !!!IMPORTANT!!! 
- # Try to export the model by script, if fails, we should refine - # the code to satisfy the script export requirements - if args.rank == 0: - script_model = torch.jit.script(model) - script_model.save(os.path.join(args.model_dir, 'init.zip')) - executor = Executor() - # If specify checkpoint, load some info from checkpoint - if args.checkpoint is not None: - infos = load_checkpoint(model, args.checkpoint) - elif args.enc_init is not None: - logging.info('load pretrained encoders: {}'.format(args.enc_init)) - infos = load_trained_modules(model, args) - else: - infos = {} - start_epoch = infos.get('epoch', -1) + 1 - cv_loss = infos.get('cv_loss', 0.0) - step = infos.get('step', -1) - - num_epochs = configs.get('max_epoch', 100) - model_dir = args.model_dir - writer = None - if args.rank == 0: - os.makedirs(model_dir, exist_ok=True) - exp_id = os.path.basename(model_dir) - writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id)) - - if distributed: - assert (torch.cuda.is_available()) - # cuda model is required for nn.parallel.DistributedDataParallel - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, find_unused_parameters=True) - device = torch.device("cuda") - if args.fp16_grad_sync: - from torch.distributed.algorithms.ddp_comm_hooks import ( - default as comm_hooks, - ) - model.register_comm_hook( - state=None, hook=comm_hooks.fp16_compress_hook - ) - else: - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - if configs['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['optim_conf']) - elif configs['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['optim']) - if configs['scheduler'] == 'warmuplr': - scheduler = WarmupLR(optimizer, **configs['scheduler_conf']) - elif configs['scheduler'] == 'NoamHoldAnnealing': - scheduler = NoamHoldAnnealing(optimizer, **configs['scheduler_conf']) - else: - raise ValueError("unknown scheduler: " + configs['scheduler']) - - final_epoch = None - configs['rank'] = args.rank - configs['is_distributed'] = distributed - configs['use_amp'] = args.use_amp - if start_epoch == 0 and args.rank == 0: - save_model_path = os.path.join(model_dir, 'init.pt') - save_checkpoint(model, save_model_path) - - # Start training loop - executor.step = step - scheduler.set_step(step) - # used for pytorch amp mixed precision training - scaler = None - if args.use_amp: - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(start_epoch, num_epochs): - train_dataset.set_epoch(epoch) - configs['epoch'] = epoch - lr = optimizer.param_groups[0]['lr'] - logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr)) - executor.train(model, optimizer, scheduler, train_data_loader, device, - writer, configs, scaler) - total_loss, num_seen_utts = executor.cv(model, cv_data_loader, device, - configs) - cv_loss = total_loss / num_seen_utts - - logging.info('Epoch {} CV info cv_loss {}'.format(epoch, cv_loss)) - if args.rank == 0: - save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch)) - save_checkpoint( - model, save_model_path, { - 'epoch': epoch, - 'lr': lr, - 'cv_loss': cv_loss, - 'step': executor.step - }) - writer.add_scalar('epoch/cv_loss', cv_loss, epoch) - writer.add_scalar('epoch/lr', lr, epoch) - final_epoch = epoch - - if final_epoch is not None and args.rank == 0: - final_model_path = os.path.join(model_dir, 'final.pt') 
- os.remove(final_model_path) if os.path.exists(final_model_path) else None - os.symlink('{}.pt'.format(final_epoch), final_model_path) - writer.close() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/dataset/dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/dataset/dataset.py deleted file mode 100644 index 6d799b5b5aea2d34546484b3fed5d45e2d5b6aa6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/dataset/dataset.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import torch -import torch.distributed as dist -from torch.utils.data import IterableDataset - -import wenet.dataset.processor as processor -from wenet.utils.file_utils import read_lists - - -class Processor(IterableDataset): - def __init__(self, source, f, *args, **kw): - assert callable(f) - self.source = source - self.f = f - self.args = args - self.kw = kw - - def set_epoch(self, epoch): - self.source.set_epoch(epoch) - - def __iter__(self): - """ Return an iterator over the source dataset processed by the - given processor. 
- """ - assert self.source is not None - assert callable(self.f) - return self.f(iter(self.source), *self.args, **self.kw) - - def apply(self, f): - assert callable(f) - return Processor(self, f, *self.args, **self.kw) - - -class DistributedSampler: - def __init__(self, shuffle=True, partition=True): - self.epoch = -1 - self.update() - self.shuffle = shuffle - self.partition = partition - - def update(self): - assert dist.is_available() - if dist.is_initialized(): - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() - else: - self.rank = 0 - self.world_size = 1 - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: - self.worker_id = 0 - self.num_workers = 1 - else: - self.worker_id = worker_info.id - self.num_workers = worker_info.num_workers - return dict(rank=self.rank, - world_size=self.world_size, - worker_id=self.worker_id, - num_workers=self.num_workers) - - def set_epoch(self, epoch): - self.epoch = epoch - - def sample(self, data): - """ Sample data according to rank/world_size/num_workers - - Args: - data(List): input data list - - Returns: - List: data list after sample - """ - data = list(range(len(data))) - # TODO(Binbin Zhang): fix this - # We can not handle uneven data for CV on DDP, so we don't - # sample data by rank, that means every GPU gets the same - # and all the CV data - if self.partition: - if self.shuffle: - random.Random(self.epoch).shuffle(data) - data = data[self.rank::self.world_size] - data = data[self.worker_id::self.num_workers] - return data - - -class DataList(IterableDataset): - def __init__(self, lists, shuffle=True, partition=True): - self.lists = lists - self.sampler = DistributedSampler(shuffle, partition) - - def set_epoch(self, epoch): - self.sampler.set_epoch(epoch) - - def __iter__(self): - sampler_info = self.sampler.update() - indexes = self.sampler.sample(self.lists) - for index in indexes: - # yield dict(src=src) - data = dict(src=self.lists[index]) - data.update(sampler_info) - yield data - - -def Dataset(data_type, - data_list_file, - symbol_table, - conf, - bpe_model=None, - non_lang_syms=None, - partition=True): - """ Construct dataset from arguments - - We have two shuffle stage in the Dataset. The first is global - shuffle at shards tar/raw file level. The second is global shuffle - at training samples level. 
- - Args: - data_type(str): raw/shard - bpe_model(str): model for english bpe part - partition(bool): whether to do data partition in terms of rank - """ - assert data_type in ['raw', 'shard'] - lists = read_lists(data_list_file) - shuffle = conf.get('shuffle', True) - dataset = DataList(lists, shuffle=shuffle, partition=partition) - if data_type == 'shard': - dataset = Processor(dataset, processor.url_opener) - dataset = Processor(dataset, processor.tar_file_and_group) - else: - dataset = Processor(dataset, processor.parse_raw) - - dataset = Processor(dataset, processor.tokenize, symbol_table, bpe_model, - non_lang_syms, conf.get('split_with_space', False)) - filter_conf = conf.get('filter_conf', {}) - dataset = Processor(dataset, processor.filter, **filter_conf) - - resample_conf = conf.get('resample_conf', {}) - dataset = Processor(dataset, processor.resample, **resample_conf) - - speed_perturb = conf.get('speed_perturb', False) - if speed_perturb: - dataset = Processor(dataset, processor.speed_perturb) - - feats_type = conf.get('feats_type', 'fbank') - assert feats_type in ['fbank', 'mfcc'] - if feats_type == 'fbank': - fbank_conf = conf.get('fbank_conf', {}) - dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) - elif feats_type == 'mfcc': - mfcc_conf = conf.get('mfcc_conf', {}) - dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf) - - spec_aug = conf.get('spec_aug', True) - spec_sub = conf.get('spec_sub', False) - spec_trim = conf.get('spec_trim', False) - if spec_aug: - spec_aug_conf = conf.get('spec_aug_conf', {}) - dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) - if spec_sub: - spec_sub_conf = conf.get('spec_sub_conf', {}) - dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf) - if spec_trim: - spec_trim_conf = conf.get('spec_trim_conf', {}) - dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf) - - if shuffle: - shuffle_conf = conf.get('shuffle_conf', {}) - dataset = Processor(dataset, processor.shuffle, **shuffle_conf) - - sort = conf.get('sort', True) - if sort: - sort_conf = conf.get('sort_conf', {}) - dataset = Processor(dataset, processor.sort, **sort_conf) - - batch_conf = conf.get('batch_conf', {}) - dataset = Processor(dataset, processor.batch, **batch_conf) - dataset = Processor(dataset, processor.padding) - return dataset diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/dataset/kaldi_io.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/dataset/kaldi_io.py deleted file mode 100644 index c9bef293c93d882147bb5b738e1fc49a7a19a484..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/dataset/kaldi_io.py +++ /dev/null @@ -1,666 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! 
To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. - """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. 
- """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int_scp(file_or_fd): - """ generator(key,vec) = read_vec_int_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_int_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:vec for key,mat in kaldi_io.read_vec_int_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_int(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! 
- if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Mapping for percentiles in col-headers, - def uint16_to_float(value, min, range): - return np.float32(min + range * 1.52590218966964e-05 * value) - - # Mapping for matrix elements, - def uint8_to_float_v2(vec, p0, p25, p75, p100): - # Split the vector by masks, - mask_0_64 = (vec <= 64); - mask_193_255 = (vec > 192); - mask_65_192 = (~(mask_0_64 | mask_193_255)); - # Sanity check (useful but slow...), - # assert(len(vec) == np.sum(np.hstack([mask_0_64,mask_65_192,mask_193_255]))) - # assert(len(vec) == np.sum(np.any([mask_0_64,mask_65_192,mask_193_255], axis=0))) - # Build the float vector, - ans = np.empty(len(vec), dtype='float32') - ans[mask_0_64] = p0 + (p25 - p0) / 64. * vec[mask_0_64] - ans[mask_65_192] = p25 + (p75 - p25) / 128. * (vec[mask_65_192] - 64) - ans[mask_193_255] = p75 + (p100 - p75) / 63. * (vec[mask_193_255] - 192) - return ans - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.empty((cols,rows), dtype='float32') - for i, col_header in enumerate(col_headers): - col_header_flt = [ uint16_to_float(percentile, globmin, globrange) for percentile in col_header ] - mat[i] = uint8_to_float_v2(data[i], *col_header_flt) - - return mat.T # transpose! col-major -> row-major, - -def write_ark_scp(key, mat, ark_fout, scp_out): - mat_offset = write_mat(ark_fout, mat, key) - scp_line = '{}\t{}:{}'.format(key, ark_fout.name, mat_offset) - scp_out.write(scp_line) - scp_out.write('\n') - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. 
- - Example of writing single matrix: - kaldi_io.write_mat(filename, mat) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,mat in dict.iteritems(): - kaldi_io.write_mat(f, mat, key=key) - """ - mat_offset = 0 - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - mat_offset = fd.tell() - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if m.dtype == 'float32': fd.write('FM '.encode()) - elif m.dtype == 'float64': fd.write('DM '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) - # Dims, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols - # Data, - fd.write(m.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - return mat_offset - -################################################# -# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) -# Corresponds to: vector > > -# - outer vector: time axis -# - inner vector: records at the time -# - tuple: int = index, float = value -# - -def read_cnet_ark(file_or_fd): - """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ - return read_post_ark(file_or_fd) - -def read_post_ark(file_or_fd): - """ generator(key,vec>) = read_post_ark(file) - Returns generator of (key,posterior) tuples, read from ark file. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Iterate the ark: - for key,post in kaldi_io.read_post_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - post = read_post(fd) - yield key, post - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_post(file_or_fd): - """ [post] = read_post(file_or_fd) - Reads single kaldi 'Posterior' in binary format. - - The 'Posterior' is C++ type 'vector > >', - the outer-vector is usually time axis, inner-vector are the records - at given time, and the tuple is composed of an 'index' (integer) - and a 'float-value'. The 'float-value' can represent a probability - or any other numeric value. - - Returns vector of vectors of tuples. - """ - fd = open_or_fd(file_or_fd) - ans=[] - binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - # Loop over 'outer-vector', - for i in range(outer_vec_size): - assert(fd.read(1).decode() == '\4'); # int-size - inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) - data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) - assert(data[0]['size_idx'] == 4) - assert(data[0]['size_post'] == 4) - ans.append(data[['idx','post']].tolist()) - - if fd is not file_or_fd: fd.close() - return ans - - -################################################# -# Kaldi Confusion Network bin begin/end times, -# (kaldi stores CNs time info separately from the Posterior). 
-# - -def read_cntime_ark(file_or_fd): - """ generator(key,vec>) = read_cntime_ark(file_or_fd) - Returns generator of (key,cntime) tuples, read from ark file. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Iterate the ark: - for key,time in kaldi_io.read_cntime_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:time for key,time in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - cntime = read_cntime(fd) - yield key, cntime - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_cntime(file_or_fd): - """ [cntime] = read_cntime(file_or_fd) - Reads single kaldi 'Confusion Network time info', in binary format: - C++ type: vector >. - (begin/end times of bins at the confusion network). - - Binary layout is ' ...' - - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Returns vector of tuples. - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary - - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) - assert(data[0]['size_beg'] == 4) - assert(data[0]['size_end'] == 4) - ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), - - if fd is not file_or_fd : fd.close() - return ans - - -################################################# -# Segments related, -# - -# Segments as 'Bool vectors' can be handy, -# - for 'superposing' the segmentations, -# - for frame-selection in Speaker-ID experiments, -def read_segments_as_bool_vec(segments_file): - """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) - using kaldi 'segments' file for 1 wav, format : ' ' - - t-beg, t-end is in seconds, - - assumed 100 frames/second, - """ - segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) - # Sanity checks, - assert(len(segs) > 0) # empty segmentation is an error, - assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, - # Convert time to frame-indexes, - start = np.rint([100 * rec[2] for rec in segs]).astype(int) - end = np.rint([100 * rec[3] for rec in segs]).astype(int) - # Taken from 'read_lab_to_bool_vec', htk.py, - frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], - np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) - assert np.sum(end-start) == np.sum(frms) - return frms - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/dataset/processor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/dataset/processor.py deleted file mode 100644 index b4bd07ce674eb3288cd1b13a09085eec48d40845..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/dataset/processor.py +++ /dev/null @@ -1,660 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import json -import random -import re -import tarfile -from subprocess import PIPE, Popen -from urllib.parse import urlparse - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.nn.utils.rnn import pad_sequence - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def url_opener(data): - """ Give url or local file, return file descriptor - Inplace operation. - - Args: - data(Iterable[str]): url or local file list - - Returns: - Iterable[{src, stream}] - """ - for sample in data: - assert 'src' in sample - # TODO(Binbin Zhang): support HTTP - url = sample['src'] - try: - pr = urlparse(url) - # local file - if pr.scheme == '' or pr.scheme == 'file': - stream = open(url, 'rb') - # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP - else: - cmd = f'wget -q -O - {url}' - process = Popen(cmd, shell=True, stdout=PIPE) - sample.update(process=process) - stream = process.stdout - sample.update(stream=stream) - yield sample - except Exception as ex: - logging.warning('Failed to open {}'.format(url)) - - -def tar_file_and_group(data): - """ Expand a stream of open tar files into a stream of tar file contents. - And groups the file with same prefix - - Args: - data: Iterable[{src, stream}] - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'stream' in sample - stream = tarfile.open(fileobj=sample['stream'], mode="r|*") - prev_prefix = None - example = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - example['key'] = prev_prefix - if valid: - yield example - example = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - example['txt'] = file_obj.read().decode('utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load(file_obj) - example['wav'] = waveform - example['sample_rate'] = sample_rate - else: - example[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning('error to parse {}'.format(name)) - prev_prefix = prefix - if prev_prefix is not None: - example['key'] = prev_prefix - yield example - stream.close() - if 'process' in sample: - sample['process'].communicate() - sample['stream'].close() - - -def parse_raw(data): - """ Parse key/wav/txt from json line - - Args: - data: Iterable[str], str is a json line has key/wav/txt - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'src' in sample - json_line = sample['src'] - obj = json.loads(json_line) - assert 'key' in obj - assert 'wav' in obj - assert 'txt' in obj - key = obj['key'] - wav_file = obj['wav'] - txt = obj['txt'] - try: - if 'start' in obj: - assert 'end' in obj - sample_rate = torchaudio.backend.sox_io_backend.info( - wav_file).sample_rate - start_frame = int(obj['start'] * sample_rate) - end_frame = int(obj['end'] * sample_rate) - waveform, _ = torchaudio.backend.sox_io_backend.load( - 
filepath=wav_file, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(wav_file) - example = dict(key=key, - txt=txt, - wav=waveform, - sample_rate=sample_rate) - yield example - except Exception as ex: - logging.warning('Failed to read {}'.format(wav_file)) - - -def filter(data, - max_length=10240, - min_length=10, - token_max_length=200, - token_min_length=1, - min_output_input_ratio=0.0005, - max_output_input_ratio=1): - """ Filter sample according to feature and label length - Inplace operation. - - Args:: - data: Iterable[{key, wav, label, sample_rate}] - max_length: drop utterance which is greater than max_length(10ms) - min_length: drop utterance which is less than min_length(10ms) - token_max_length: drop utterance which is greater than - token_max_length, especially when use char unit for - english modeling - token_min_length: drop utterance which is - less than token_max_length - min_output_input_ratio: minimal ration of - token_length / feats_length(10ms) - max_output_input_ratio: maximum ration of - token_length / feats_length(10ms) - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'label' in sample - # sample['wav'] is torch.Tensor, we have 100 frames every second - num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100 - if num_frames < min_length: - continue - if num_frames > max_length: - continue - if len(sample['label']) < token_min_length: - continue - if len(sample['label']) > token_max_length: - continue - if num_frames != 0: - if len(sample['label']) / num_frames < min_output_input_ratio: - continue - if len(sample['label']) / num_frames > max_output_input_ratio: - continue - yield sample - - -def resample(data, resample_rate=16000): - """ Resample data. - Inplace operation. - - Args: - data: Iterable[{key, wav, label, sample_rate}] - resample_rate: target resample rate - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - if sample_rate != resample_rate: - sample['sample_rate'] = resample_rate - sample['wav'] = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - yield sample - - -def speed_perturb(data, speeds=None): - """ Apply speed perturb to the data. - Inplace operation. 
- - Args: - data: Iterable[{key, wav, label, sample_rate}] - speeds(List[float]): optional speed - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - if speeds is None: - speeds = [0.9, 1.0, 1.1] - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - speed = random.choice(speeds) - if speed != 1.0: - wav, _ = torchaudio.sox_effects.apply_effects_tensor( - waveform, sample_rate, - [['speed', str(speed)], ['rate', str(sample_rate)]]) - sample['wav'] = wav - - yield sample - - -def compute_fbank(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0): - """ Extract fbank - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - energy_floor=0.0, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def compute_mfcc(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0, - num_ceps=40, - high_freq=0.0, - low_freq=20.0): - """ Extract mfcc - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.mfcc(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - num_ceps=num_ceps, - high_freq=high_freq, - low_freq=low_freq, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def __tokenize_by_bpe_model(sp, txt): - tokens = [] - # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - pattern = re.compile(r'([\u4e00-\u9fff])') - # Example: - # txt = "你好 ITS'S OKAY 的" - # chars = ["你", "好", " ITS'S OKAY ", "的"] - chars = pattern.split(txt.upper()) - mix_chars = [w for w in chars if len(w.strip()) > 0] - for ch_or_w in mix_chars: - # ch_or_w is a single CJK charater(i.e., "你"), do nothing. - if pattern.fullmatch(ch_or_w) is not None: - tokens.append(ch_or_w) - # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), - # encode ch_or_w using bpe_model. 
-        else:
-            for p in sp.encode_as_pieces(ch_or_w):
-                tokens.append(p)
-
-    return tokens
-
-
-def tokenize(data,
-             symbol_table,
-             bpe_model=None,
-             non_lang_syms=None,
-             split_with_space=False):
-    """ Decode text to chars or BPE
-        Inplace operation
-
-        Args:
-            data: Iterable[{key, wav, txt, sample_rate}]
-
-        Returns:
-            Iterable[{key, wav, txt, tokens, label, sample_rate}]
-    """
-    if non_lang_syms is not None:
-        non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})")
-    else:
-        non_lang_syms = {}
-        non_lang_syms_pattern = None
-
-    if bpe_model is not None:
-        import sentencepiece as spm
-        sp = spm.SentencePieceProcessor()
-        sp.load(bpe_model)
-    else:
-        sp = None
-
-    for sample in data:
-        assert 'txt' in sample
-        txt = sample['txt'].strip()
-        if non_lang_syms_pattern is not None:
-            parts = non_lang_syms_pattern.split(txt.upper())
-            parts = [w for w in parts if len(w.strip()) > 0]
-        else:
-            parts = [txt]
-
-        label = []
-        tokens = []
-        for part in parts:
-            if part in non_lang_syms:
-                tokens.append(part)
-            else:
-                if bpe_model is not None:
-                    tokens.extend(__tokenize_by_bpe_model(sp, part))
-                else:
-                    if split_with_space:
-                        part = part.split(" ")
-                    for ch in part:
-                        if ch == ' ':
-                            ch = "▁"
-                        tokens.append(ch)
-
-        for ch in tokens:
-            if ch in symbol_table:
-                label.append(symbol_table[ch])
-            elif '<unk>' in symbol_table:
-                label.append(symbol_table['<unk>'])
-
-        sample['tokens'] = tokens
-        sample['label'] = label
-        yield sample
-
-
-def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80):
-    """ Do spec augmentation
-        Inplace operation
-
-        Args:
-            data: Iterable[{key, feat, label}]
-            num_t_mask: number of time mask to apply
-            num_f_mask: number of freq mask to apply
-            max_t: max width of time mask
-            max_f: max width of freq mask
-            max_w: max width of time warp
-
-        Returns
-            Iterable[{key, feat, label}]
-    """
-    for sample in data:
-        assert 'feat' in sample
-        x = sample['feat']
-        assert isinstance(x, torch.Tensor)
-        y = x.clone().detach()
-        max_frames = y.size(0)
-        max_freq = y.size(1)
-        # time mask
-        for i in range(num_t_mask):
-            start = random.randint(0, max_frames - 1)
-            length = random.randint(1, max_t)
-            end = min(max_frames, start + length)
-            y[start:end, :] = 0
-        # freq mask
-        for i in range(num_f_mask):
-            start = random.randint(0, max_freq - 1)
-            length = random.randint(1, max_f)
-            end = min(max_freq, start + length)
-            y[:, start:end] = 0
-        sample['feat'] = y
-        yield sample
-
-
-def spec_sub(data, max_t=20, num_t_sub=3):
-    """ Do spec substitute
-        Inplace operation
-
-        Args:
-            data: Iterable[{key, feat, label}]
-            max_t: max width of time substitute
-            num_t_sub: number of time substitute to apply
-
-        Returns
-            Iterable[{key, feat, label}]
-    """
-    for sample in data:
-        assert 'feat' in sample
-        x = sample['feat']
-        assert isinstance(x, torch.Tensor)
-        y = x.clone().detach()
-        max_frames = y.size(0)
-        for i in range(num_t_sub):
-            start = random.randint(0, max_frames - 1)
-            length = random.randint(1, max_t)
-            end = min(max_frames, start + length)
-            # only substitute the earlier time chosen randomly for current time
-            pos = random.randint(0, start)
-            y[start:end, :] = x[start - pos:end - pos, :]
-        sample['feat'] = y
-        yield sample
-
-
-def spec_trim(data, max_t=20):
-    """ Trim tailing frames. Inplace operation.
- ref: TrimTail [https://arxiv.org/abs/2211.00522] - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of length trimming - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - max_frames = x.size(0) - length = random.randint(1, max_t) - if length < max_frames / 2: - y = x.clone().detach()[:max_frames - length] - sample['feat'] = y - yield sample - - -def shuffle(data, shuffle_size=10000): - """ Local shuffle the data - - Args: - data: Iterable[{key, feat, label}] - shuffle_size: buffer size for shuffle - - Returns: - Iterable[{key, feat, label}] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= shuffle_size: - random.shuffle(buf) - for x in buf: - yield x - buf = [] - # The sample left over - random.shuffle(buf) - for x in buf: - yield x - - -def sort(data, sort_size=500): - """ Sort the data by feature length. - Sort is used after shuffle and before batch, so we can group - utts with similar lengths into a batch, and `sort_size` should - be less than `shuffle_size` - - Args: - data: Iterable[{key, feat, label}] - sort_size: buffer size for sort - - Returns: - Iterable[{key, feat, label}] - """ - - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= sort_size: - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - buf = [] - # The sample left over - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - - -def static_batch(data, batch_size=16): - """ Static batch the data by `batch_size` - - Args: - data: Iterable[{key, feat, label}] - batch_size: batch size - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= batch_size: - yield buf - buf = [] - if len(buf) > 0: - yield buf - - -def dynamic_batch(data, max_frames_in_batch=12000): - """ Dynamic batch the data until the total frames in batch - reach `max_frames_in_batch` - - Args: - data: Iterable[{key, feat, label}] - max_frames_in_batch: max_frames in one batch - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - longest_frames = 0 - for sample in data: - assert 'feat' in sample - assert isinstance(sample['feat'], torch.Tensor) - new_sample_frames = sample['feat'].size(0) - longest_frames = max(longest_frames, new_sample_frames) - frames_after_padding = longest_frames * (len(buf) + 1) - if frames_after_padding > max_frames_in_batch: - yield buf - buf = [sample] - longest_frames = new_sample_frames - else: - buf.append(sample) - if len(buf) > 0: - yield buf - - -def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000): - """ Wrapper for static/dynamic batch - """ - if batch_type == 'static': - return static_batch(data, batch_size) - elif batch_type == 'dynamic': - return dynamic_batch(data, max_frames_in_batch) - else: - logging.fatal('Unsupported batch type {}'.format(batch_type)) - - -def padding(data): - """ Padding the data into training data - - Args: - data: Iterable[List[{key, feat, label}]] - - Returns: - Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] - """ - for sample in data: - assert isinstance(sample, list) - feats_length = torch.tensor([x['feat'].size(0) for x in sample], - dtype=torch.int32) - order = torch.argsort(feats_length, descending=True) - feats_lengths = torch.tensor( - [sample[i]['feat'].size(0) for i in order], dtype=torch.int32) - sorted_feats = [sample[i]['feat'] for i in order] - sorted_keys 
= [sample[i]['key'] for i in order] - sorted_labels = [ - torch.tensor(sample[i]['label'], dtype=torch.int64) for i in order - ] - label_lengths = torch.tensor([x.size(0) for x in sorted_labels], - dtype=torch.int32) - - padded_feats = pad_sequence(sorted_feats, - batch_first=True, - padding_value=0) - - pad = (0, 0, 0, 0) - seq_len= padded_feats.shape[1] - if seq_len < 384: - pad = (0, 0, 0, 384-seq_len) - elif seq_len < 512: - pad = (0, 0, 0, 512-seq_len) - elif seq_len < 640: - pad = (0, 0, 0, 640-seq_len) - elif seq_len < 768: - pad = (0, 0, 0, 768-seq_len) - elif seq_len < 896: - pad = (0, 0, 0, 896-seq_len) - elif seq_len < 1024: - pad = (0, 0, 0, 1024-seq_len) - elif seq_len < 1280: - pad = (0, 0, 0, 1280-seq_len) - padded_feats = torch.nn.functional.pad(padded_feats, pad, mode='constant', value=0) - padding_labels = pad_sequence(sorted_labels, - batch_first=True, - padding_value=-1) - - yield (sorted_keys, padded_feats, padding_labels, feats_lengths, - label_lengths) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/dataset/wav_distortion.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/dataset/wav_distortion.py deleted file mode 100644 index 2917d3cc6cfb801935cb0885d0c42cd88f1833b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/dataset/wav_distortion.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import random -import math - -import torchaudio -import torch -torchaudio.set_audio_backend("sox_io") - - -def db2amp(db): - return pow(10, db / 20) - -def amp2db(amp): - return 20 * math.log10(amp) - -def make_poly_distortion(conf): - """Generate a db-domain ploynomial distortion function - - f(x) = a * x^m * (1-x)^n + x - - Args: - conf: a dict {'a': #int, 'm': #int, 'n': #int} - - Returns: - The ploynomial function, which could be applied on - a float amplitude value - """ - a = conf['a'] - m = conf['m'] - n = conf['n'] - - def poly_distortion(x): - abs_x = abs(x) - if abs_x < 0.000001: - x = x - else: - db_norm = amp2db(abs_x) / 100 + 1 - if db_norm < 0: - db_norm = 0 - db_norm = a * pow(db_norm, m) * pow((1 - db_norm), n) + db_norm - if db_norm > 1: - db_norm = 1 - db = (db_norm - 1) * 100 - amp = db2amp(db) - if amp >= 0.9997: - amp = 0.9997 - if x > 0: - x = amp - else: - x = -amp - return x - return poly_distortion - -def make_quad_distortion(): - return make_poly_distortion({'a' : 1, 'm' : 1, 'n' : 1}) - -# the amplitude are set to max for all non-zero point -def make_max_distortion(conf): - """Generate a max distortion function - - Args: - conf: a dict {'max_db': float } - 'max_db': the maxium value. 
- - Returns: - The max function, which could be applied on - a float amplitude value - """ - max_db = conf['max_db'] - if max_db: - max_amp = db2amp(max_db) # < 0.997 - else: - max_amp = 0.997 - - def max_distortion(x): - if x > 0: - x = max_amp - elif x < 0: - x = -max_amp - else: - x = 0.0 - return x - return max_distortion - - - -def make_amp_mask(db_mask=None): - """Get a amplitude domain mask from db domain mask - - Args: - db_mask: Optional. A list of tuple. if None, using default value. - - Returns: - A list of tuple. The amplitude domain mask - """ - if db_mask is None: - db_mask = [(-110, -95), (-90, -80), (-65, -60), (-50, -30), (-15, 0)] - amp_mask = [(db2amp(db[0]), db2amp(db[1])) for db in db_mask] - return amp_mask - -default_mask = make_amp_mask() - - -def generate_amp_mask(mask_num): - """Generate amplitude domain mask randomly in [-100db, 0db] - - Args: - mask_num: the slot number of the mask - - Returns: - A list of tuple. each tuple defines a slot. - e.g. [(-100, -80), (-65, -60), (-50, -30), (-15, 0)] - for #mask_num = 4 - """ - a = [0] * 2 * mask_num - a[0] = 0 - m = [] - for i in range(1, 2 * mask_num): - a[i] = a[i - 1] + random.uniform(0.5, 1) - max_val = a[2 * mask_num - 1] - for i in range(0, mask_num): - l = ((a[2 * i] - max_val) / max_val) * 100 - r = ((a[2 * i + 1] - max_val) / max_val) * 100 - m.append((l, r)) - return make_amp_mask(m) - - -def make_fence_distortion(conf): - """Generate a fence distortion function - - In this fence-like shape function, the values in mask slots are - set to maxium, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': int,'max_db': float } - 'mask_number': the slot number in mask. - 'max_db': the maxium value. - - Returns: - The fence function, which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - max_db = conf['max_db'] - max_amp = db2amp(max_db) # 0.997 - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def fence_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - return x - - return fence_distortion - -# -def make_jag_distortion(conf): - """Generate a jag distortion function - - In this jag-like shape function, the values in mask slots are - not changed, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': #int} - 'mask_number': the slot number in mask. 
- - Returns: - The jag function,which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def jag_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - return x - - return jag_distortion - -# gaining 20db means amp = amp * 10 -# gaining -20db means amp = amp / 10 -def make_gain_db(conf): - """Generate a db domain gain function - - Args: - conf: a dict {'db': #float} - 'db': the gaining value - - Returns: - The db gain function, which could be applied on - a float amplitude value - """ - db = conf['db'] - - def gain_db(x): - return min(0.997, x * pow(10, db / 20)) - - return gain_db - - -def distort(x, func, rate=0.8): - """Distort a waveform in sample point level - - Args: - x: the origin wavefrom - func: the distort function - rate: sample point-level distort probability - - Returns: - the distorted waveform - """ - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - x[0][i] = func(float(x[0][i])) - return x - -def distort_chain(x, funcs, rate=0.8): - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - for func in funcs: - x[0][i] = func(float(x[0][i])) - return x - -# x is numpy -def distort_wav_conf(x, distort_type, distort_conf, rate=0.1): - if distort_type == 'gain_db': - gain_db = make_gain_db(distort_conf) - x = distort(x, gain_db) - elif distort_type == 'max_distortion': - max_distortion = make_max_distortion(distort_conf) - x = distort(x, max_distortion, rate=rate) - elif distort_type == 'fence_distortion': - fence_distortion = make_fence_distortion(distort_conf) - x = distort(x, fence_distortion, rate=rate) - elif distort_type == 'jag_distortion': - jag_distortion = make_jag_distortion(distort_conf) - x = distort(x, jag_distortion, rate=rate) - elif distort_type == 'poly_distortion': - poly_distortion = make_poly_distortion(distort_conf) - x = distort(x, poly_distortion, rate=rate) - elif distort_type == 'quad_distortion': - quad_distortion = make_quad_distortion() - x = distort(x, quad_distortion, rate=rate) - elif distort_type == 'none_distortion': - pass - else: - print('unsupport type') - return x - -def distort_wav_conf_and_save(distort_type, distort_conf, rate, wav_in, wav_out): - x, sr = torchaudio.load(wav_in) - x = x.detach().numpy() - out = distort_wav_conf(x, distort_type, distort_conf, rate) - torchaudio.save(wav_out, torch.from_numpy(out), sr) - -if __name__ == "__main__": - distort_type = sys.argv[1] - wav_in = sys.argv[2] - wav_out = sys.argv[3] - conf = None - rate = 0.1 - if distort_type == 'new_jag_distortion': - conf = {'mask_number' : 4} - elif distort_type == 'new_fence_distortion': - conf = {'mask_number' : 1, 'max_db' : -30} - elif distort_type == 'poly_distortion': - conf = {'a' : 4, 'm' : 2, "n" : 2} - distort_wav_conf_and_save(distort_type, conf, rate, wav_in, wav_out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/efficient_conformer/attention.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/efficient_conformer/attention.py deleted file mode 100644 index 475131b15af92ffcaf91ad5e2e30d114d4d1a2a3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/efficient_conformer/attention.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple, Optional - -import torch -from torch import nn -import torch.nn.functional as F -from wenet.transformer.attention import MultiHeadedAttention - - -class GroupedRelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: - https://arxiv.org/abs/1901.02860 - https://arxiv.org/abs/2109.01163 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate, group_size=3): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - self.group_size = group_size - self.d_k = n_feat // n_head # for GroupedAttention - self.n_feat = n_feat - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. 
- """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def pad4group(self, Q, K, V, P, mask, group_size: int = 3): - """ - q: (#batch, time1, size) -> (#batch, head, time1, size/head) - k,v: (#batch, time2, size) -> (#batch, head, time2, size/head) - p: (#batch, time2, size) - """ - # Compute Overflows - overflow_Q = Q.size(2) % group_size - overflow_KV = K.size(2) % group_size - - # if-else for ONNX export - # 0 // 0.00000000000000001 = 0 - # 1 // 1.00000000000000001 = 1 - padding_Q = (group_size - overflow_Q) * int( - overflow_Q // (overflow_Q + 0.00000000000000001)) - padding_KV = (group_size - overflow_KV) * int( - overflow_KV // (overflow_KV + 0.00000000000000001)) - - batch_size, _, seq_len_KV, _ = K.size() - - # Input Padding (B, T, D) -> (B, T + P, D) - Q = F.pad(Q, (0, 0, 0, padding_Q), value=0.0) - K = F.pad(K, (0, 0, 0, padding_KV), value=0.0) - V = F.pad(V, (0, 0, 0, padding_KV), value=0.0) - - if mask is not None and mask.size(2) > 0 : # time2 > 0: - mask = mask[:, ::group_size, ::group_size] - - Q = Q.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - K = K.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - V = V.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - # process pos_emb - P_batch_size = P.size(0) - overflow_P = P.size(1) % group_size - padding_P = group_size - overflow_P if overflow_P else 0 - P = F.pad(P, (0, 0, 0, padding_P), value=0.0) - P = P.view(P_batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - return Q, K, V, P, mask, padding_Q - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - padding_q: Optional[int] = None - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - padding_q : for GroupedAttention in efficent conformer - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. 
jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - - # n_feat!=h*d_k may be happened in GroupAttention - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.n_feat) - ) # (batch, time1, d_model) - if padding_q is not None: - # for GroupedAttention in efficent conformer - x = x[:, :x.size(1) - padding_q] - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q = self.linear_q(query) - k = self.linear_k(key) # (#batch, time2, size) - v = self.linear_v(value) - p = self.linear_pos(pos_emb) # (#batch, time2, size) - - batch_size, seq_len_KV, _ = k.size() # seq_len_KV = time2 - - # (#batch, time2, size) -> (#batch, head, time2, size/head) - q = q.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - k = k.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - v = v.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - if cache.size(0) > 0: - # use attention cache - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - new_cache = torch.cat((k, v), dim=-1) - - # May be k and p does not match. eg. time2=18+18/2=27 > mask=36/2=18 - if mask is not None and mask.size(2) > 0: - time2 = mask.size(2) - k = k[:, :, -time2:, :] - v = v[:, :, -time2:, :] - - # q k v p: (batch, head, time1, d_k) - q, k, v, p, mask, padding_q = self.pad4group(q, k, v, p, mask, self.group_size) - - # q_with_bias_u & q_with_bias_v = (batch, head, time1, d_k) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. 
- # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k * self.group_size) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask, padding_q), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/efficient_conformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/efficient_conformer/convolution.py deleted file mode 100644 index 52d6c1c14c0812ab3957a60a135f644833c2ad95..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/efficient_conformer/convolution.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - stride: int = 1): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - stride (int): Stride Convolution, for efficient Conformer - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. 
- # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=stride, # for depthwise_conv in StrideConv - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - self.stride = stride - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - # When export ONNX,the first cache is not None but all-zero, - # cause shape error in residual block, - # eg. cache14 + x9 = 23, 23-7+1=17 != 9 - cache = cache[:, :, -self.lorder:] - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is requried, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - if mask_pad.size(2) != x.size(2): - mask_pad = mask_pad[:, :, ::self.stride] - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/efficient_conformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/efficient_conformer/encoder.py deleted file mode 100644 index dbd37f53cac86be851e2bb194354fd07eb271f11..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/efficient_conformer/encoder.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from EfficientConformer(https://github.com/burchim/EfficientConformer) -# Paper(https://arxiv.org/abs/2109.01163) - -"""Encoder definition.""" -from typing import Tuple, Optional, List, Union - -import torch -import logging -from typeguard import check_argument_types -import torch.nn.functional as F - -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.encoder_layer import ConformerEncoderLayer - -from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 -from wenet.efficient_conformer.convolution import ConvolutionModule -from wenet.efficient_conformer.attention import GroupedRelPositionMultiHeadedAttention -from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer - -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class EfficientConformerEncoder(torch.nn.Module): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - macaron_style: bool = True, - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - stride_layer_idx: Optional[Union[int, List[int]]] = 3, - stride: Optional[Union[int, List[int]]] = 2, - group_layer_idx: Optional[Union[int, List[int], tuple]] = (0, 1, 2, 3), - group_size: int = 3, - stride_kernel: bool = True, - **kwargs - ): - """Construct Efficient Conformer Encoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - macaron_style (bool): Whether to use macaron style for - positionwise layer. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. - stride_layer_idx (list): layer id with StrideConv, start from 0 - stride (list): stride size of each StrideConv in efficient conformer - group_layer_idx (list): layer id with GroupedAttention, start from 0 - group_size (int): group size of every GroupedAttention layer - stride_kernel (bool): default True. True: recompute cnn kernels with stride. 
- """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d2": - subsampling_class = Conv2dSubsampling2 - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - logging.info(f"input_layer = {input_layer}, " - f"subsampling_class = {subsampling_class}") - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - self.input_layer = input_layer - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - activation = get_activation(activation_type) - self.num_blocks = num_blocks - self.attention_heads = attention_heads - self.cnn_module_kernel = cnn_module_kernel - self.global_chunk_size = 0 - self.chunk_feature_map = 0 - - # efficient conformer configs - self.stride_layer_idx = [stride_layer_idx] \ - if type(stride_layer_idx) == int else stride_layer_idx - self.stride = [stride] \ - if type(stride) == int else stride - self.group_layer_idx = [group_layer_idx] \ - if type(group_layer_idx) == int else group_layer_idx - self.grouped_size = group_size # group size of every GroupedAttention layer - - assert len(self.stride) == len(self.stride_layer_idx) - self.cnn_module_kernels = [cnn_module_kernel] # kernel size of each StridedConv - for i in self.stride: - if stride_kernel: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1] // i) - else: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1]) - - logging.info(f"stride_layer_idx= {self.stride_layer_idx}, " - f"stride = {self.stride}, " - f"cnn_module_kernel = {self.cnn_module_kernels}, " - f"group_layer_idx = {self.group_layer_idx}, " - f"grouped_size = {self.grouped_size}") - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - - # encoder definition - index = 0 - layers = [] - for i in range(num_blocks): - # self-attention module definition - if i in self.group_layer_idx: - encoder_selfattn_layer = GroupedRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - self.grouped_size) - else: - if pos_enc_layer_type == "no_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate) - - # conformer module definition - if i in self.stride_layer_idx: - # conformer block with downsampling - convolution_layer_args_stride = ( - output_size, 
self.cnn_module_kernels[index], activation, - cnn_module_norm, causal, True, self.stride[index]) - layers.append(StrideConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_stride) if use_cnn_module else None, - torch.nn.AvgPool1d( - kernel_size=self.stride[index], stride=self.stride[index], - padding=0, ceil_mode=True, - count_include_pad=False), # pointwise_conv_layer - dropout_rate, - normalize_before, - concat_after, - )) - index = index + 1 - else: - # conformer block - convolution_layer_args_normal = ( - output_size, self.cnn_module_kernels[index], activation, - cnn_module_norm, causal) - layers.append(ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_normal) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - )) - - self.encoders = torch.nn.ModuleList(layers) - - def set_global_chunk_size(self, chunk_size): - """Used in ONNX export. - """ - logging.info(f"set global chunk size: {chunk_size}, default is 0.") - self.global_chunk_size = chunk_size - if self.embed.subsampling_rate == 2: - self.chunk_feature_map = 2 * self.global_chunk_size + 1 - elif self.embed.subsampling_rate == 6: - self.chunk_feature_map = 6 * self.global_chunk_size + 5 - elif self.embed.subsampling_rate == 8: - self.chunk_feature_map = 8 * self.global_chunk_size + 7 - else: - self.chunk_feature_map = 4 * self.global_chunk_size + 3 - - def output_size(self) -> int: - return self._output_size - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - index = 0 # traverse stride - for i, layer in enumerate(self.encoders): - # layer return : x, mask, new_att_cache, new_cnn_cache - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if i in self.stride_layer_idx: - masks = masks[:, :, ::self.stride[index]] - chunk_masks = chunk_masks[:, ::self.stride[index], - ::self.stride[index]] - mask_pad = masks - pos_emb = pos_emb[:, ::self.stride[index], :] - index = index + 1 - - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask : mask matrix of self attention - - Returns: - torch.Tensor: output of current input xs - torch.Tensor: subsampling cache required for next chunk computation - List[torch.Tensor]: encoder layers output cache required for next - chunk computation - List[torch.Tensor]: conformer cnn cache - - """ - assert xs.size(0) == 1 - - # using downsampling factor to recover offset - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - chunk_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - chunk_masks = chunk_masks.unsqueeze(1) # (1, 1, xs-time) - - real_len = 0 - if self.global_chunk_size > 0: - # for ONNX decode simulation, padding xs to chunk_size - real_len = xs.size(1) - pad_len = self.chunk_feature_map - real_len - xs = F.pad(xs, (0, 0, 0, pad_len), value=0.0) - chunk_masks = F.pad(chunk_masks, (0, pad_len), value=0.0) - - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, chunk_masks = self.embed(xs, chunk_masks, offset) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - # shape(pos_emb) = (b=1, chunk_size, emb_size=output_size=hidden-dim) - - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) # batchPad (b=1, 1, time=chunk_size) - - if self.global_chunk_size > 0: - # for ONNX decode simulation - pos_emb = self.embed.position_encoding( - offset=max(offset - cache_t1, 0), - size=cache_t1 + self.global_chunk_size) - att_mask[:, :, -self.global_chunk_size:] = chunk_masks - mask_pad = chunk_masks.to(torch.bool) - else: - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(2) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - att_cache_trunc = xs.size(1) + \ - att_cache.size(2) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i:i + 1, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # 
shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [1, batch, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(0) - - # use repeat_interleave to new_att_cache - new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :]) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.global_chunk_size > 0 and real_len: - chunk_real_len = real_len // self.embed.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - # Keeping 1 more timestep can mitigate information leakage - # from the encoder caused by the padding - xs = xs[:, :chunk_real_len + 1, :] - - return xs, r_att_cache, r_cnn_cache - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - use_onnx=False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. - Args: - xs (torch.Tensor): (1, max_len, dim) - decoding_chunk_size (int): decoding chunk size - num_decoding_left_chunks (int): - use_onnx (bool): True for simulating ONNX model inference. 
- """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if use_onnx: - logging.info("Simulating for ONNX runtime ...") - att_cache: torch.Tensor = torch.zeros( - (self.num_blocks, self.attention_heads, required_cache_size, - self.output_size() // self.attention_heads * 2), - device=xs.device) - cnn_cache: torch.Tensor = torch.zeros( - (self.num_blocks, 1, self.output_size(), self.cnn_module_kernel - 1), - device=xs.device) - self.set_global_chunk_size(chunk_size=decoding_chunk_size) - else: - logging.info("Simulating for JIT runtime ...") - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - logging.info(f"-->> frame chunk msg: cur={cur}, " - f"end={end}, num_frames={end-cur}, " - f"decoding_window={decoding_window}") - if use_onnx: - att_mask: torch.Tensor = torch.ones( - (1, 1, required_cache_size + decoding_chunk_size), - dtype=torch.bool, device=xs.device) - if cur == 0: - att_mask[:, :, :required_cache_size] = 0 - else: - att_mask: torch.Tensor = torch.ones( - (0, 0, 0), dtype=torch.bool, device=xs.device) - - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - outputs.append(y) - offset += y.size(1) - - ys = torch.cat(outputs, 1) - masks = torch.ones(1, 1, ys.size(1), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/efficient_conformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/efficient_conformer/encoder_layer.py deleted file mode 100644 index 3a88ec9fca9797664ce89566e6c1d28a8f0ad5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/efficient_conformer/encoder_layer.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple -import torch -from torch import nn - - -class StrideConformerEncoderLayer(nn.Module): - """Encoder layer module. 
- Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - pointwise_conv_layer: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.pointwise_conv_layer = pointwise_conv_layer - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - - # add pointwise_conv for efficient conformer - # pointwise_conv_layer does not change shape - if self.pointwise_conv_layer is not None: - residual = residual.transpose(1, 2) - residual = self.pointwise_conv_layer(residual) - residual = residual.transpose(1, 2) - assert residual.size(0) == x.size(0) - assert residual.size(1) == x.size(1) - assert residual.size(2) == x.size(2) - - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/efficient_conformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/efficient_conformer/subsampling.py deleted file mode 100644 index 98b2c2228eac8e77586110686c48a7b0141458c9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/efficient_conformer/subsampling.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch -from wenet.transformer.subsampling import BaseSubsampling - - -class Conv2dSubsampling2(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU() - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * ((idim - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 2 - # 2 = (3 - 1) * 1 - self.right_context = 2 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 2. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 2. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, :-2:2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/squeezeformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/squeezeformer/attention.py deleted file mode 100644 index 97412badbe8e2c5caec81c0636d15be3f80d6b84..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/squeezeformer/attention.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 Ximalaya Inc. (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -import torch -import torch.nn as nn -from wenet.transformer.attention import MultiHeadedAttention -from typing import Tuple - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- """ - - def __init__(self, n_head, n_feat, dropout_rate, - do_rel_shift=False, adaptive_scale=False, init_weights=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.do_rel_shift = do_rel_shift - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - self.adaptive_scale = adaptive_scale - self.ada_scale = nn.Parameter( - torch.ones([1, 1, n_feat]), requires_grad=adaptive_scale) - self.ada_bias = nn.Parameter( - torch.zeros([1, 1, n_feat]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - input_max = (self.h * self.d_k) ** -0.5 - torch.nn.init.uniform_(self.linear_q.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_q.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_pos.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.bias, -input_max, input_max) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. 
pytorch training - if mask.size(2) > 0: # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - # (batch, head, time1, time2) - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - if self.adaptive_scale: - query = self.ada_scale * query + self.ada_bias - key = self.ada_scale * key + self.ada_bias - value = self.ada_scale * value + self.ada_bias - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
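# --- Editor's illustrative sketch (not part of the deleted wenet sources or of this patch) ---
# A minimal, self-contained demonstration of the KV-cache bookkeeping described in the
# NOTE above: the cache stores K and V concatenated on the last dimension, each new chunk
# is appended on the time axis, and splitting/concatenating a zero-length cache is a
# harmless no-op (which is what keeps the ONNX export path uniform for the first chunk).
# Shapes and names here are hypothetical.
import torch

head, d_k, chunk = 4, 64, 16
cache = torch.zeros((1, head, 0, d_k * 2))            # empty cache fed for the 1st chunk

for _ in range(3):
    k = torch.randn(1, head, chunk, d_k)              # new keys for this chunk
    v = torch.randn(1, head, chunk, d_k)              # new values for this chunk
    key_cache, value_cache = torch.split(cache, cache.size(-1) // 2, dim=-1)
    k = torch.cat([key_cache, k], dim=2)              # prepend cached keys (no-op when empty)
    v = torch.cat([value_cache, v], dim=2)            # prepend cached values
    cache = torch.cat((k, v), dim=-1)                 # store K|V for the next chunk

assert cache.shape == (1, head, 3 * chunk, d_k * 2)
# --- end of editor's sketch ---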
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - if self.do_rel_shift: - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/squeezeformer/conv2d.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/squeezeformer/conv2d.py deleted file mode 100644 index c230263396392d72f36c56d645338f2d576db898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/squeezeformer/conv2d.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Conv2d Module with Valid Padding""" - -import torch.nn.functional as F -from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional - - -class Conv2dValid(_ConvNd): - """ - Conv2d operator for VALID mode padding. 
- """ - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', # TODO: refine this type - device=None, - dtype=None, - valid_trigx: bool = False, - valid_trigy: bool = False - ) -> None: - factory_kwargs = {'device': device, 'dtype': dtype} - kernel_size_ = _pair(kernel_size) - stride_ = _pair(stride) - padding_ = padding if isinstance(padding, str) else _pair(padding) - dilation_ = _pair(dilation) - super(Conv2dValid, self).__init__( - in_channels, out_channels, kernel_size_, - stride_, padding_, dilation_, False, _pair(0), - groups, bias, padding_mode, **factory_kwargs) - self.valid_trigx = valid_trigx - self.valid_trigy = valid_trigy - - def _conv_forward( - self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): - validx, validy = 0, 0 - if self.valid_trigx: - validx = (input.size(-2) * (self.stride[-2] - 1) - 1 - + self.kernel_size[-2]) // 2 - if self.valid_trigy: - validy = (input.size(-1) * (self.stride[-1] - 1) - 1 - + self.kernel_size[-1]) // 2 - return F.conv2d(input, weight, bias, self.stride, - (validx, validy), self.dilation, self.groups) - - def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight, self.bias) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/squeezeformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/squeezeformer/convolution.py deleted file mode 100644 index 6da2ee8c98ed58fae66d66c892041037f0d6bc3a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/squeezeformer/convolution.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. 
- causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - self.bias = bias - self.channels = channels - self.kernel_size = kernel_size - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, channels]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, channels]), requires_grad=adaptive_scale) - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - if init_weights: - self.init_weights() - - def init_weights(self): - pw_max = self.channels ** -0.5 - dw_max = self.kernel_size ** -0.5 - torch.nn.init.uniform_(self.pointwise_conv1.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv1.bias.data, -pw_max, pw_max) - torch.nn.init.uniform_(self.depthwise_conv.weight.data, -dw_max, dw_max) - if self.bias: - torch.nn.init.uniform_(self.depthwise_conv.bias.data, -dw_max, dw_max) - torch.nn.init.uniform_(self.pointwise_conv2.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv2.bias.data, -pw_max, pw_max) - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - if self.adaptive_scale: - x = self.ada_scale * x + self.ada_bias - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. 
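# --- Editor's illustrative sketch (not part of the deleted wenet sources or of this patch) ---
# A hypothetical demo of the causal-convolution cache described above: a causal depthwise
# conv needs `lorder = kernel_size - 1` frames of left context, supplied either by zero
# padding (first chunk) or by the cached tail of the previous chunk (later chunks). The
# streaming output then matches running the conv over the full signal. Names are assumed.
import torch
import torch.nn as nn

channels, kernel_size = 8, 15
lorder = kernel_size - 1
conv = nn.Conv1d(channels, channels, kernel_size, groups=channels)   # depthwise, no padding

chunk1 = torch.randn(1, channels, 20)
chunk2 = torch.randn(1, channels, 20)

# Chunk 1: no history yet, pad with zeros on the left.
x1 = nn.functional.pad(chunk1, (lorder, 0))
y1 = conv(x1)
cache = x1[:, :, -lorder:]                     # keep the last lorder frames as left context

# Chunk 2: reuse the cached left context instead of zero padding.
y2 = conv(torch.cat((cache, chunk2), dim=2))

# Streaming result equals the non-streaming causal convolution over both chunks.
full = conv(nn.functional.pad(torch.cat((chunk1, chunk2), dim=2), (lorder, 0)))
assert torch.allclose(torch.cat((y1, y2), dim=2), full, atol=1e-6)
# --- end of editor's sketch ---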
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/squeezeformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/squeezeformer/encoder.py deleted file mode 100644 index f13038321ae6c07d484a617aee7d83ed07742510..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/squeezeformer/encoder.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -import torch -import torch.nn as nn -from typing import Tuple, Union, Optional, List -from wenet.squeezeformer.subsampling \ - import DepthwiseConv2dSubsampling4, TimeReductionLayer1D, \ - TimeReductionLayer2D, TimeReductionLayerStream -from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.attention import MultiHeadedAttention -from wenet.squeezeformer.attention import RelPositionMultiHeadedAttention -from wenet.squeezeformer.positionwise_feed_forward \ - import PositionwiseFeedForward -from wenet.squeezeformer.convolution import ConvolutionModule -from wenet.utils.mask import make_pad_mask, add_optional_chunk_mask -from wenet.utils.common import get_activation - - -class SqueezeformerEncoder(nn.Module): - def __init__( - self, - input_size: int = 80, - encoder_dim: int = 256, - output_size: int = 256, - attention_heads: int = 4, - num_blocks: int = 12, - reduce_idx: Optional[Union[int, List[int]]] = 5, - recover_idx: Optional[Union[int, List[int]]] = 11, - feed_forward_expansion_factor: int = 4, - dw_stride: bool = False, - input_dropout_rate: float = 0.1, - pos_enc_layer_type: str = "rel_pos", - time_reduction_layer_type: str = "conv1d", - do_rel_shift: bool = True, - feed_forward_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.1, - cnn_module_kernel: int = 31, - cnn_norm_type: str = "batch_norm", - dropout: float = 0.1, - causal: bool = False, - adaptive_scale: bool = True, - activation_type: str = "swish", - init_weights: bool = True, - global_cmvn: torch.nn.Module = None, - normalize_before: bool = False, - use_dynamic_chunk: bool = False, - concat_after: bool = 
False, - static_chunk_size: int = 0, - use_dynamic_left_chunk: bool = False - ): - """Construct SqueezeformerEncoder - - Args: - input_size to use_dynamic_chunk, see in Transformer BaseEncoder. - encoder_dim (int): The hidden dimension of encoder layer. - output_size (int): The output dimension of final projection layer. - attention_heads (int): Num of attention head in attention module. - num_blocks (int): Num of encoder layers. - reduce_idx Optional[Union[int, List[int]]]: - reduce layer index, from 40ms to 80ms per frame. - recover_idx Optional[Union[int, List[int]]]: - recover layer index, from 80ms to 40ms per frame. - feed_forward_expansion_factor (int): Enlarge coefficient of FFN. - dw_stride (bool): Whether do depthwise convolution - on subsampling module. - input_dropout_rate (float): Dropout rate of input projection layer. - pos_enc_layer_type (str): Self attention type. - time_reduction_layer_type (str): Conv1d or Conv2d reduction layer. - do_rel_shift (bool): Whether to do relative shift - operation on rel-attention module. - cnn_module_kernel (int): Kernel size of CNN module. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - adaptive_scale (bool): Whether to use adaptive scale. - init_weights (bool): Whether to initialize weights. - causal (bool): whether to use causal convolution or not. - """ - super(SqueezeformerEncoder, self).__init__() - self.global_cmvn = global_cmvn - self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ - if type(reduce_idx) == int else reduce_idx - self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \ - if type(recover_idx) == int else recover_idx - self.check_ascending_list() - if reduce_idx is None: - self.time_reduce = None - else: - if recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - self.reduce_stride = 2 - self._output_size = output_size - self.normalize_before = normalize_before - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - self.pos_enc_layer_type = pos_enc_layer_type - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - encoder_dim, - attention_dropout_rate, - do_rel_shift, - adaptive_scale, - init_weights - ) - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - encoder_dim, - encoder_dim * feed_forward_expansion_factor, - feed_forward_dropout_rate, - activation, - adaptive_scale, - init_weights - ) - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = ( - encoder_dim, cnn_module_kernel, activation, - cnn_norm_type, causal, True, adaptive_scale, init_weights) - - self.embed = DepthwiseConv2dSubsampling4( - 1, encoder_dim, - RelPositionalEncoding(encoder_dim, dropout_rate=0.1), - dw_stride, - input_size, - input_dropout_rate, - init_weights - ) - - self.preln = nn.LayerNorm(encoder_dim) - self.encoders = 
torch.nn.ModuleList([SqueezeformerEncoderLayer( - encoder_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - convolution_layer(*convolution_layer_args), - positionwise_layer(*positionwise_layer_args), - normalize_before, - dropout, - concat_after) for _ in range(num_blocks) - ]) - if time_reduction_layer_type == 'conv1d': - time_reduction_layer = TimeReductionLayer1D - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - elif time_reduction_layer_type == 'stream': - time_reduction_layer = TimeReductionLayerStream - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - else: - time_reduction_layer = TimeReductionLayer2D - time_reduction_layer_args = {'encoder_dim': encoder_dim} - - self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args) - self.time_recover_layer = nn.Linear(encoder_dim, encoder_dim) - self.final_proj = None - if output_size != encoder_dim: - self.final_proj = nn.Linear(encoder_dim, output_size) - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - xs_lens = mask_pad.squeeze(1).sum(1) - xs = self.preln(xs) - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - for i, layer in enumerate(self.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, chunk_masks, pos_emb, mask_pad)) - xs, xs_lens, chunk_masks, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_chunk_masks, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - chunk_masks = recover_chunk_masks - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - xs = xs.masked_fill(~mask_pad[:, 0, :].unsqueeze(-1), 0.0) - - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return xs, masks - - def check_ascending_list(self): - if self.reduce_idx is not None: - assert self.reduce_idx == sorted(self.reduce_idx), \ - "reduce_idx should be int or ascending list" - if self.recover_idx is not None: - assert self.recover_idx == sorted(self.recover_idx), \ - "recover_idx should be int or ascending list" - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp 
= exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - recover_exp)) - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.preln(xs) - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, 
recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - if att_mask.size(1) != 0: - xs = xs.masked_fill(~att_mask[:, 0, :].unsqueeze(-1), 0.0) - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(0) - cached_att = cached_att.unsqueeze(3).\ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :]) - r_cnn_cache.append(cached_cnn) - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/squeezeformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/squeezeformer/encoder_layer.py deleted file mode 100644 index 3c6bdd44a20447cea91c0f965c666b844f4264be..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/squeezeformer/encoder_layer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""SqueezeformerEncoderLayer definition.""" - -import torch -import torch.nn as nn -from typing import Optional, Tuple - - -class SqueezeformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward1 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - feed_forward2 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. 
- """ - - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward1: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - feed_forward2: Optional[nn.Module] = None, - normalize_before: bool = False, - dropout_rate: float = 0.1, - concat_after: bool = False, - ): - super(SqueezeformerEncoderLayer, self).__init__() - self.size = size - self.self_attn = self_attn - self.layer_norm1 = nn.LayerNorm(size) - self.ffn1 = feed_forward1 - self.layer_norm2 = nn.LayerNorm(size) - self.conv_module = conv_module - self.layer_norm3 = nn.LayerNorm(size) - self.ffn2 = feed_forward2 - self.layer_norm4 = nn.LayerNorm(size) - self.normalize_before = normalize_before - self.dropout = nn.Dropout(dropout_rate) - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - # self attention module - residual = x - if self.normalize_before: - x = self.layer_norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.layer_norm1(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm2(x) - x = self.ffn1(x) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm2(x) - - # conv module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - residual = x - if self.normalize_before: - x = self.layer_norm3(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm3(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm4(x) - x = self.ffn2(x) - # we do not use dropout here since it is inside feed forward function - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm4(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/squeezeformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/squeezeformer/positionwise_feed_forward.py deleted file mode 100644 index 289062dcf3189f79a5ebb206990160d8665c613c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/squeezeformer/positionwise_feed_forward.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU(), - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.idim = idim - self.hidden_units = hidden_units - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - self.ada_scale = None - self.ada_bias = None - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, idim]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, idim]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - ffn1_max = self.idim ** -0.5 - ffn2_max = self.hidden_units ** -0.5 - torch.nn.init.uniform_(self.w_1.weight.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_1.bias.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_2.weight.data, -ffn2_max, ffn2_max) - torch.nn.init.uniform_(self.w_2.bias.data, -ffn2_max, ffn2_max) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - if self.adaptive_scale: - xs = self.ada_scale * xs + self.ada_bias - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/squeezeformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/squeezeformer/subsampling.py deleted file mode 100644 index fdb0101d6ebb54c42e710bbb0f35a6f7615ca567..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/squeezeformer/subsampling.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -"""DepthwiseConv2dSubsampling4 and TimeReductionLayer definition.""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -from wenet.transformer.subsampling import BaseSubsampling -from typing import Tuple -from wenet.squeezeformer.conv2d import Conv2dValid - - -class DepthwiseConv2dSubsampling4(BaseSubsampling): - """Depthwise Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - pos_enc_class (nn.Module): position encoding class. - dw_stride (int): Whether do depthwise convolution. - input_size (int): filter bank dimension. - - """ - - def __init__( - self, idim: int, odim: int, - pos_enc_class: torch.nn.Module, - dw_stride: bool = False, - input_size: int = 80, - input_dropout_rate: float = 0.1, - init_weights: bool = True - ): - super(DepthwiseConv2dSubsampling4, self).__init__() - self.idim = idim - self.odim = odim - self.pw_conv = nn.Conv2d( - in_channels=idim, out_channels=odim, kernel_size=3, stride=2) - self.act1 = nn.ReLU() - self.dw_conv = nn.Conv2d( - in_channels=odim, out_channels=odim, kernel_size=3, stride=2, - groups=odim if dw_stride else 1 - ) - self.act2 = nn.ReLU() - self.pos_enc = pos_enc_class - self.input_proj = nn.Sequential( - nn.Linear( - odim * (((input_size - 1) // 2 - 1) // 2), odim), - nn.Dropout(p=input_dropout_rate), - ) - if init_weights: - linear_max = (odim * input_size / 4) ** -0.5 - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.weight'], -linear_max, linear_max) - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.bias'], -linear_max, linear_max) - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: int = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.pw_conv(x) - x = self.act1(x) - x = self.dw_conv(x) - x = self.act2(x) - b, c, t, f = x.size() - x = x.permute(0, 2, 1, 3) - x = x.contiguous().view(b, t, c * f) - x, pos_emb = self.pos_enc(x, offset) - x = self.input_proj(x) - return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] - - -class TimeReductionLayer1D(nn.Module): - """ - Modified NeMo, - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. 
- """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 5, stride: int = 2): - super(TimeReductionLayer1D, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - self.padding = max(0, self.kernel_size - self.stride) - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. - if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayer2D(nn.Module): - def __init__( - self, kernel_size: int = 5, stride: int = 2, encoder_dim: int = 256): - super(TimeReductionLayer2D, self).__init__() - self.encoder_dim = encoder_dim - self.kernel_size = kernel_size - self.dw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=(kernel_size, 1), - stride=stride, - valid_trigy=True - ) - self.pw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=1, - stride=1, - valid_trigx=False, - valid_trigy=False, - ) - - self.kernel_size = kernel_size - self.stride = stride - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.encoder_dim ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward( - self, xs: torch.Tensor, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - xs = xs.masked_fill(mask_pad.transpose(1, 2).eq(0), 0.0) - xs = xs.unsqueeze(2) - padding1 = self.kernel_size - self.stride - xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), - mode='constant', value=0.) 
- xs = self.dw_conv(xs.permute(0, 3, 1, 2)) - xs = self.pw_conv(xs).permute(0, 3, 2, 1).squeeze(1).contiguous() - tmp_length = xs.size(1) - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - padding2 = max(0, (xs_lens.max() - tmp_length).data.item()) - batch_size, hidden = xs.size(0), xs.size(-1) - dummy_pad = torch.zeros(batch_size, padding2, hidden, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - mask = mask[:, ::2, ::2] - mask_pad = mask_pad[:, :, ::2] - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayerStream(nn.Module): - """ - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. - """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 1, stride: int = 2): - super(TimeReductionLayerStream, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=0, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. 
- if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transducer/joint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transducer/joint.py deleted file mode 100644 index f7cbaf62ee0bf4ffa127e5bbf4a49a64c2378495..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transducer/joint.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Optional - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation - - -class TransducerJoint(torch.nn.Module): - - def __init__(self, - voca_size: int, - enc_output_size: int, - pred_output_size: int, - join_dim: int, - prejoin_linear: bool = True, - postjoin_linear: bool = False, - joint_mode: str = 'add', - activation: str = "tanh"): - assert check_argument_types() - # TODO(Mddct): concat in future - assert joint_mode in ['add'] - super().__init__() - - self.activatoin = get_activation(activation) - self.prejoin_linear = prejoin_linear - self.postjoin_linear = postjoin_linear - self.joint_mode = joint_mode - - if not self.prejoin_linear and not self.postjoin_linear: - assert enc_output_size == pred_output_size == join_dim - # torchscript compatibility - self.enc_ffn: Optional[nn.Linear] = None - self.pred_ffn: Optional[nn.Linear] = None - if self.prejoin_linear: - self.enc_ffn = nn.Linear(enc_output_size, join_dim) - self.pred_ffn = nn.Linear(pred_output_size, join_dim) - # torchscript compatibility - self.post_ffn: Optional[nn.Linear] = None - if self.postjoin_linear: - self.post_ffn = nn.Linear(join_dim, join_dim) - - self.ffn_out = nn.Linear(join_dim, voca_size) - - def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor): - """ - Args: - enc_out (torch.Tensor): [B, T, E] - pred_out (torch.Tensor): [B, T, P] - Return: - [B,T,U,V] - """ - if (self.prejoin_linear and self.enc_ffn is not None - and self.pred_ffn is not None): - enc_out = self.enc_ffn(enc_out) # [B,T,E] -> [B,T,V] - pred_out = self.pred_ffn(pred_out) - - enc_out = enc_out.unsqueeze(2) # [B,T,V] -> [B,T,1,V] - pred_out = pred_out.unsqueeze(1) # [B,U,V] -> [B,1 U, V] - - # TODO(Mddct): concat joint - _ = self.joint_mode - out = enc_out + pred_out # [B,T,U,V] - - if self.postjoin_linear and self.post_ffn is not None: - out = self.post_ffn(out) - - out = self.activatoin(out) - out = self.ffn_out(out) - return out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transducer/predictor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transducer/predictor.py deleted file mode 100644 index 600e97a9d83646047ec3fc14f3087bd4df761c68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transducer/predictor.py +++ /dev/null @@ -1,482 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation, get_rnn - - -def ApplyPadding(input, padding, pad_value) -> torch.Tensor: - """ - Args: - input: [bs, max_time_step, dim] - padding: [bs, 
max_time_step] - """ - return padding * pad_value + input * (1 - padding) - - -class PredictorBase(torch.nn.Module): - - # NOTE(Mddct): We can use ABC abstract here, but - # keep this class simple enough for now - def __init__(self) -> None: - super().__init__() - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - _, _, _ = batch_size, method, device - raise NotImplementedError("this is a base precictor") - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ): - _, _, = input, cache - raise NotImplementedError("this is a base precictor") - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - _, _, _, = input, padding, cache - raise NotImplementedError("this is a base precictor") - - -class RNNPredictor(PredictorBase): - - def __init__(self, - voca_size: int, - embed_size: int, - output_size: int, - embed_dropout: float, - hidden_size: int, - num_layers: int, - bias: bool = True, - rnn_type: str = "lstm", - dropout: float = 0.1) -> None: - assert check_argument_types() - super().__init__() - self.n_layers = num_layers - self.hidden_size = hidden_size - # disable rnn base out projection - self.embed = nn.Embedding(voca_size, embed_size) - self.dropout = nn.Dropout(embed_dropout) - # NOTE(Mddct): rnn base from torch not support layer norm - # will add layer norm and prune value in cell and layer - # ref: https://github.com/Mddct/neural-lm/blob/main/models/gru_cell.py - self.rnn = get_rnn(rnn_type=rnn_type)(input_size=embed_size, - hidden_size=hidden_size, - num_layers=num_layers, - bias=bias, - batch_first=True, - dropout=dropout) - self.projection = nn.Linear(hidden_size, output_size) - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> torch.Tensor: - """ - Args: - input (torch.Tensor): [batch, max_time). - padding (torch.Tensor): [batch, max_time] - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - Returns: - output: [batch, max_time, output_size] - """ - - # NOTE(Mddct): we don't use pack input format - embed = self.embed(input) # [batch, max_time, emb_size] - embed = self.dropout(embed) - states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None - if cache is None: - state = self.init_state(batch_size=input.size(0), - device=input.device) - states = (state[0], state[1]) - else: - assert len(cache) == 2 - states = (cache[0], cache[1]) - out, (m, c) = self.rnn(embed, states) - out = self.projection(out) - - # NOTE(Mddct): Although we don't use staate in transducer - # training forward, we need make it right for padding value - # so we create forward_step for infering, forward for training - _, _ = m, c - return out - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache: [state_m, state_c] - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - Returns: - new_cache: [[state_m_1, state_c_1], [state_m_2, state_c_2]...] 
- """ - assert len(cache) == 2 - state_ms = cache[0] - state_cs = cache[1] - - assert state_ms.size(1) == state_cs.size(1) - - new_cache: List[List[torch.Tensor]] = [] - for state_m, state_c in zip(torch.split(state_ms, 1, dim=1), - torch.split(state_cs, 1, dim=1)): - new_cache.append([state_m, state_c]) - return new_cache - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[state_m_1, state_c_1], [state_m_1, state_c_1]...] - - Returns: - new_caceh: [state_ms, state_cs], - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - """ - state_ms = torch.cat([states[0] for states in cache], dim=1) - state_cs = torch.cat([states[1] for states in cache], dim=1) - return [state_ms, state_cs] - - def init_state( - self, - batch_size: int, - device: torch.device, - method: str = "zero", - ) -> List[torch.Tensor]: - assert batch_size > 0 - # TODO(Mddct): xavier init method - _ = method - return [ - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device), - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device) - ] - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - """ - assert len(cache) == 2 - state_m, state_c = cache[0], cache[1] - embed = self.embed(input) # [batch, 1, emb_size] - embed = self.dropout(embed) - out, (m, c) = self.rnn(embed, (state_m, state_c)) - - out = self.projection(out) - m = ApplyPadding(m, padding.unsqueeze(0), state_m) - c = ApplyPadding(c, padding.unsqueeze(0), state_c) - - return (out, [m, c]) - - -class EmbeddingPredictor(PredictorBase): - """Embedding predictor - - Described in: - https://arxiv.org/pdf/2109.07513.pdf - - embed-> proj -> layer norm -> swish - """ - - def __init__(self, - voca_size: int, - embed_size: int, - embed_dropout: float, - n_head: int, - history_size: int = 2, - activation: str = "swish", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - - assert check_argument_types() - super().__init__() - # multi head - self.num_heads = n_head - self.embed_size = embed_size - self.context_size = history_size + 1 - self.pos_embed = torch.nn.Linear(embed_size * self.context_size, - self.num_heads, - bias=bias) - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.ffn = nn.Linear(self.embed_size, self.embed_size) - self.norm = nn.LayerNorm(self.embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - _ = method - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device), - ] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] 
- """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - - input = input.unfold(1, self.context_size, 1).permute( - 0, 1, 3, 2) # [bs, seq_len, context_size, embed] - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - # broadcast dot attenton - input_expand = input.unsqueeze( - 2) # [bs, seq_len, 1, context_size, embed] - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - - # [bs, seq_len, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, seq_len, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, seq_len, num_heads, embed] - output = output.sum(dim=2) # [bs, seq_len, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - return output - - def forward_step( - self, - input: torch.Tensor, - padding: torch.Tensor, - cache: List[torch.Tensor], - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input_expand = context_input.unsqueeze(1).unsqueeze( - 2) # [bs, 1, 1, context_size, embed] - - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - # [bs, 1, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, 1, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, 1, num_heads, embed] - output = output.sum(dim=2) # [bs, 1, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - new_cache = context_input[:, 1:, :] - # TODO(Mddct): we need padding new_cache in future - # new_cache = ApplyPadding(history, padding, new_cache) - return (output, [new_cache]) - - -class ConvPredictor(PredictorBase): - - def __init__(self, - voca_size: 
int, - embed_size: int, - embed_dropout: float, - history_size: int = 2, - activation: str = "relu", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - assert check_argument_types() - super().__init__() - - assert history_size >= 0 - self.embed_size = embed_size - self.context_size = history_size + 1 - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.conv = nn.Conv1d(in_channels=embed_size, - out_channels=embed_size, - kernel_size=self.context_size, - padding=0, - groups=embed_size, - bias=bias) - self.norm = nn.LayerNorm(embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - assert method == "zero" - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device) - ] - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] - """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - input = input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - return out - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input = context_input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - - new_cache = context_input[:, 1:, :] - # TODO(Mddct): apply padding in future - return (out, [new_cache]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transducer/search/greedy_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transducer/search/greedy_search.py deleted file mode 100644 index ef7354562b6617b7be33bf32d673117eb1d3d547..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transducer/search/greedy_search.py +++ /dev/null @@ 
-1,54 +0,0 @@ -from typing import List - -import torch - - -def basic_greedy_search( - model: torch.nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - n_steps: int = 64, -) -> List[List[int]]: - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, - method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = n_steps - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, padding, - cache) # [1, 1, P] - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, - pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - - return [hyps] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transducer/search/prefix_beam_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transducer/search/prefix_beam_search.py deleted file mode 100644 index f00917717c16a73916586708ebfede54fa02a21f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transducer/search/prefix_beam_search.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import List, Tuple - -import torch -from wenet.utils.common import log_add - - -class Sequence(): - - __slots__ = {'hyp', 'score', 'cache'} - - def __init__( - self, - hyp: List[torch.Tensor], - score, - cache: List[torch.Tensor], - ): - self.hyp = hyp - self.score = score - self.cache = cache - - -class PrefixBeamSearch(): - - def __init__(self, encoder, predictor, joint, ctc, blank): - self.encoder = encoder - self.predictor = predictor - self.joint = joint - self.ctc = ctc - self.blank = blank - - def forward_decoder_one_step( - self, encoder_x: torch.Tensor, pre_t: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - padding = torch.zeros(pre_t.size(0), 1, device=encoder_x.device) - pre_t, new_cache = self.predictor.forward_step(pre_t.unsqueeze(-1), - padding, cache) - x = self.joint(encoder_x, pre_t) # [beam, 1, 1, vocab] - x = x.log_softmax(dim=-1) - return x, new_cache - - def prefix_beam_search(self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7): - """prefix beam search - also see wenet.transducer.transducer.beam_search - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - 
batch_size = speech.shape[0] - assert batch_size == 1 - - # 1. Encoder - encoder_out, _ = self.encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - - ctc_probs = self.ctc.log_softmax(encoder_out).squeeze(0) - beam_init: List[Sequence] = [] - - # 2. init beam using Sequence to save beam unit - cache = self.predictor.init_state(1, method="zero", device=device) - beam_init.append(Sequence(hyp=[self.blank], score=0.0, cache=cache)) - # 3. start decoding (notice: we use breathwise first searching) - # !!!! In this decoding method: one frame do not output multi units. !!!! - # !!!! Experiments show that this strategy has little impact !!!! - for i in range(maxlen): - # 3.1 building input - # decoder taking the last token to predict the next token - input_hyp = [s.hyp[-1] for s in beam_init] - input_hyp_tensor = torch.tensor(input_hyp, - dtype=torch.int, - device=device) - # building statement from beam - cache_batch = self.predictor.cache_to_batch( - [s.cache for s in beam_init]) - # build score tensor to do torch.add() function - scores = torch.tensor([s.score for s in beam_init]).to(device) - - # 3.2 forward decoder - logp, new_cache = self.forward_decoder_one_step( - encoder_out[:, i, :].unsqueeze(1), - input_hyp_tensor, - cache_batch, - ) # logp: (N, 1, 1, vocab_size) - logp = logp.squeeze(1).squeeze(1) # logp: (N, vocab_size) - new_cache = self.predictor.batch_to_cache(new_cache) - - # 3.3 shallow fusion for transducer score - # and ctc score where we can also add the LM score - logp = torch.log( - torch.add(transducer_weight * torch.exp(logp), - ctc_weight * torch.exp(ctc_probs[i].unsqueeze(0)))) - - # 3.4 first beam prune - top_k_logp, top_k_index = logp.topk(beam_size) # (N, N) - scores = torch.add(scores.unsqueeze(1), top_k_logp) - - # 3.5 generate new beam (N*N) - beam_A = [] - for j in range(len(beam_init)): - # update seq - base_seq = beam_init[j] - for t in range(beam_size): - # blank: only update the score - if top_k_index[j, t] == self.blank: - new_seq = Sequence(hyp=base_seq.hyp.copy(), - score=scores[j, t].item(), - cache=base_seq.cache) - - beam_A.append(new_seq) - # other unit: update hyp score statement and last - else: - hyp_new = base_seq.hyp.copy() - hyp_new.append(top_k_index[j, t].item()) - new_seq = Sequence(hyp=hyp_new, - score=scores[j, t].item(), - cache=new_cache[j]) - beam_A.append(new_seq) - - # 3.6 prefix fusion - fusion_A = [beam_A[0]] - for j in range(1, len(beam_A)): - s1 = beam_A[j] - if_do_append = True - for t in range(len(fusion_A)): - # notice: A_ can not fusion with A - if s1.hyp == fusion_A[t].hyp: - fusion_A[t].score = log_add( - [fusion_A[t].score, s1.score]) - if_do_append = False - break - if if_do_append: - fusion_A.append(s1) - - # 4. 
second pruned - fusion_A.sort(key=lambda x: x.score, reverse=True) - beam_init = fusion_A[:beam_size] - - return beam_init, encoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transducer/transducer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transducer/transducer.py deleted file mode 100644 index 821a0946e621353a18bededbd93a658e83b0e0e2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transducer/transducer.py +++ /dev/null @@ -1,453 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchaudio -from torch import nn -from torch.nn.utils.rnn import pad_sequence -from typeguard import check_argument_types -from wenet.transducer.predictor import PredictorBase -from wenet.transducer.search.greedy_search import basic_greedy_search -from wenet.transducer.search.prefix_beam_search import PrefixBeamSearch -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_blank, add_sos_eos, - reverse_pad_list) - - -class Transducer(ASRModel): - """Transducer-ctc-attention hybrid Encoder-Predictor-Decoder model""" - - def __init__( - self, - vocab_size: int, - blank: int, - encoder: nn.Module, - predictor: PredictorBase, - joint: nn.Module, - attention_decoder: Optional[Union[TransformerDecoder, - BiTransformerDecoder]] = None, - ctc: Optional[CTC] = None, - ctc_weight: float = 0, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - transducer_weight: float = 1.0, - attention_weight: float = 0.0, - ) -> None: - assert check_argument_types() - assert attention_weight + ctc_weight + transducer_weight == 1.0 - super().__init__(vocab_size, encoder, attention_decoder, ctc, - ctc_weight, ignore_id, reverse_weight, lsm_weight, - length_normalized_loss) - - self.blank = blank - self.transducer_weight = transducer_weight - self.attention_decoder_weight = 1 - self.transducer_weight - self.ctc_weight - - self.predictor = predictor - self.joint = joint - self.bs = None - - # Note(Mddct): decoder also means predictor in transducer, - # but here decoder is attention decoder - del self.criterion_att - if attention_decoder is not None: - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + predictor + joint + loss - - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - - # Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - # predictor - ys_in_pad = add_blank(text, self.blank, self.ignore_id) - predictor_out = self.predictor(ys_in_pad) - # joint - joint_out = self.joint(encoder_out, predictor_out) - # NOTE(Mddct): some loss implementation require pad valid is zero - # torch.int32 rnnt_loss required - rnnt_text = text.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - rnnt_text_lengths = text_lengths.to(torch.int32) - encoder_out_lens = encoder_out_lens.to(torch.int32) - loss = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - encoder_out_lens, - rnnt_text_lengths, - blank=self.blank, - reduction="mean") - loss_rnnt = loss - - loss = self.transducer_weight * loss - # optional attention decoder - loss_att: Optional[torch.Tensor] = None - if self.attention_decoder_weight != 0.0 and self.decoder is not None: - loss_att, _ = self._calc_att_loss(encoder_out, encoder_mask, text, - text_lengths) - - # optional ctc - loss_ctc: Optional[torch.Tensor] = None - if self.ctc_weight != 0.0 and self.ctc is not None: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is not None: - loss = loss + self.ctc_weight * loss_ctc.sum() - if loss_att is not None: - loss = loss + self.attention_decoder_weight * loss_att.sum() - # NOTE: 'loss' must be in dict - return { - 'loss': loss, - 'loss_att': loss_att, - 'loss_ctc': loss_ctc, - 'loss_rnnt': loss_rnnt, - } - - def init_bs(self): - if self.bs is None: - self.bs = PrefixBeamSearch(self.encoder, self.predictor, - self.joint, self.ctc, self.blank) - - def _cal_transducer_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_lens: torch.Tensor, - hyps_pad: torch.Tensor, - ): - # ignore id -> blank, add blank at head - hyps_pad_blank = add_blank(hyps_pad, self.blank, self.ignore_id) - xs_in_lens = encoder_mask.squeeze(1).sum(1).int() - - # 1. Forward predictor - predictor_out = self.predictor(hyps_pad_blank) - # 2. Forward joint - joint_out = self.joint(encoder_out, predictor_out) - rnnt_text = hyps_pad.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - # 3. 
Compute transducer loss - loss_td = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - xs_in_lens, - hyps_lens.int(), - blank=self.blank, - reduction='none') - return loss_td * -1 - - def _cal_attn_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_pad: torch.Tensor, - hyps_lens: torch.Tensor, - ): - # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - - # td_score = loss_td * -1 - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - self.reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - return decoder_out, r_decoder_out - - def beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7, - ): - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight in transducer - prefix beam search. - final_prob = ctc_weight * ctc_prob + transducer_weight * transducer_prob - transducer_weight (float): transducer probability weight in - prefix beam search - Returns: - List[List[int]]: best path result - - """ - self.init_bs() - beam, _ = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size, - beam_size, - num_decoding_left_chunks, - simulate_streaming, - ctc_weight, - transducer_weight, - ) - return beam[0].hyp[1:], beam[0].score - - def transducer_attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ctc_weight: float = 0.0, - attn_weight: float = 0.0, - transducer_weight: float = 0.0, - search_ctc_weight: float = 1.0, - search_transducer_weight: float = 0.0, - beam_search_type: str = 'transducer') -> List[List[int]]: - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight using in rescoring. - rescore_prob = ctc_weight * ctc_prob + - transducer_weight * (transducer_loss * -1) + - attn_weight * attn_prob - attn_weight (float): attn probability weight using in rescoring. - transducer_weight (float): transducer probability weight using in - rescoring - search_ctc_weight (float): ctc weight using - in rnnt beam search (seeing in self.beam_search) - search_transducer_weight (float): transducer weight using - in rnnt beam search (seeing in self.beam_search) - Returns: - List[List[int]]: best path result - - """ - - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - self.init_bs() - if beam_search_type == 'transducer': - beam, encoder_out = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - beam_size=beam_size, - num_decoding_left_chunks=num_decoding_left_chunks, - ctc_weight=search_ctc_weight, - transducer_weight=search_transducer_weight, - ) - beam_score = [s.score for s in beam] - hyps = [s.hyp[1:] for s in beam] - - elif beam_search_type == 'ctc': - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, - speech_lengths, - beam_size=beam_size, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) - beam_score = [hyp[1] for hyp in hyps] - hyps = [hyp[0] for hyp in hyps] - assert len(hyps) == beam_size - - # build hyps and encoder output - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - - # 2.1 calculate transducer score - td_score = self._cal_transducer_score( - encoder_out, - encoder_mask, - hyps_lens, - hyps_pad, - ) - # 2.2 calculate attention score - decoder_out, r_decoder_out = self._cal_attn_score( - encoder_out, - encoder_mask, - hyps_pad, - hyps_lens, - ) - - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp)][self.eos] - td_s = td_score[i] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp): - r_score += r_decoder_out[i][len(hyp) - j - 1][w] - r_score += r_decoder_out[i][len(hyp)][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score = score * attn_weight + \ - beam_score[i] * ctc_weight + \ - td_s * transducer_weight - if score > best_score: - best_score = score - best_index = i - - return hyps[best_index], best_score - - def greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, 
- num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - n_steps: int = 64, - ) -> List[List[int]]: - """ greedy search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - # TODO(Mddct): batch decode - assert speech.size(0) == 1 - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - # TODO(Mddct): forward chunk by chunk - _ = simulate_streaming - # Let's assume B = batch_size - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size, - num_decoding_left_chunks, - ) - encoder_out_lens = encoder_mask.squeeze(1).sum() - hyps = basic_greedy_search(self, - encoder_out, - encoder_out_lens, - n_steps=n_steps) - - return hyps - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def forward_predictor_step( - self, xs: torch.Tensor, cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - assert len(cache) == 2 - # fake padding - padding = torch.zeros(1, 1) - return self.predictor.forward_step(xs, padding, cache) - - @torch.jit.export - def forward_joint_step(self, enc_out: torch.Tensor, - pred_out: torch.Tensor) -> torch.Tensor: - return self.joint(enc_out, pred_out) - - @torch.jit.export - def forward_predictor_init_state(self) -> List[torch.Tensor]: - return self.predictor.init_state(1, device=torch.device("cpu")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/asr_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/asr_model.py deleted file mode 100644 index 4288f68472d63ce4bf270c5f377d62fa7408713e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/asr_model.py +++ /dev/null @@ -1,904 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -from collections import defaultdict -from typing import Dict, List, Optional, Tuple - -import torch - -from torch.nn.utils.rnn import pad_sequence - -try: - import k2 - from icefall.utils import get_texts - from icefall.decode import get_lattice, Nbest, one_best_decoding -except ImportError: - print('Failed to import k2 and icefall. \ - Notice that they are necessary for hlg_onebest and hlg_rescore') - -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import TransformerEncoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, - remove_duplicates_and_blank, th_accuracy, - reverse_pad_list) -from wenet.utils.mask import (make_pad_mask, mask_finished_preds, - mask_finished_scores, subsequent_mask) - - -class ASRModel(torch.nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - def __init__( - self, - vocab_size: int, - encoder: TransformerEncoder, - decoder: TransformerDecoder, - ctc: CTC, - ctc_weight: float = 0.5, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - ): - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - - self.encoder = encoder - self.decoder = decoder - self.ctc = ctc - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - # 1. Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # 2a. Attention-decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths) - else: - loss_att = None - - # 2b. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is None: - loss = loss_att - elif loss_att is None: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + (1 - - self.ctc_weight) * loss_att - return {"loss": loss, "loss_att": loss_att, "loss_ctc": loss_ctc} - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, float]: - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # reverse the seq, used for right to left decoder - r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) - r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, - self.ignore_id) - # 1. Forward decoder - decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, - ys_in_pad, ys_in_lens, - r_ys_in_pad, - self.reverse_weight) - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - r_loss_att = torch.tensor(0.0) - if self.reverse_weight > 0.0: - r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * ( - 1 - self.reverse_weight) + r_loss_att * self.reverse_weight - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - return loss_att, acc_att - - def _forward_encoder( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Let's assume B = batch_size - # 1. Encoder - if simulate_streaming and decoding_chunk_size > 0: - encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( - speech, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - else: - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - return encoder_out, encoder_mask - - def recognize( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int = 10, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> torch.Tensor: - """ Apply beam search on attention decoder - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - torch.Tensor: decoding result, (batch, max_result_len) - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = torch.ones([running_size, 1], dtype=torch.long, - device=device).fill_(self.sos) # (B*N, 1) - scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), - dtype=torch.float) - scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) - cache: Optional[List[torch.Tensor]] = None - # 2. Decoder forward step by step - for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: - break - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) - logp, cache = self.decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - # 2.3 Second beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - # Update cache to be consistent with new topk scores / hyps - cache_index = (offset_k_index // beam_size).view(-1) # (B*N) - base_cache_index = (torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) * beam_size).view(-1) # (B*N) - cache_index = base_cache_index + cache_index - cache = [torch.index_select(c, dim=0, index=cache_index) for c in cache] - scores = scores.view(-1, 1) # (B*N, 1) - # 2.4. Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = torch.index_select(top_k_index.view(-1), - dim=-1, - index=best_k_index) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = torch.index_select( - hyps, dim=0, index=best_hyps_index) # (B*N, i) - hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) - - # 3. 
Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_scores, best_index = scores.max(dim=-1) - best_hyps_index = best_index + torch.arange( - batch_size, dtype=torch.long, device=device) * beam_size - best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) - best_hyps = best_hyps[:, 1:] - return best_hyps, best_scores - - def ctc_greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[List[int]]: - """ Apply CTC greedy search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # Let's assume B = batch_size - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - return hyps, scores - - def _ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[List[List[int]], torch.Tensor]: - """ CTC prefix beam search inner implementation - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[List[int]]: nbest results - torch.Tensor: encoder output, (1, max_len, encoder_dim), - it will be used for rescoring in attention rescoring mode - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # For CTC prefix beam search, we only support batch_size=1 - assert batch_size == 1 - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder forward and get CTC score - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - ctc_probs = ctc_probs.squeeze(0) - # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) - cur_hyps = [(tuple(), (0.0, -float('inf')))] - # 2. CTC beam search step by step - for t in range(0, maxlen): - logp = ctc_probs[t] # (vocab_size,) - # key: prefix, value (pb, pnb), default value(-inf, -inf) - next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) - # 2.1 First beam prune: select topk best - top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) - for s in top_k_index: - s = s.item() - ps = logp[s].item() - for prefix, (pb, pnb) in cur_hyps: - last = prefix[-1] if len(prefix) > 0 else None - if s == 0: # blank - n_pb, n_pnb = next_hyps[prefix] - n_pb = log_add([n_pb, pb + ps, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - elif s == last: - # Update *ss -> *s; - n_pb, n_pnb = next_hyps[prefix] - n_pnb = log_add([n_pnb, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - # Update *s-s -> *ss, - is for blank - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - else: - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - - # 2.2 Second beam prune - next_hyps = sorted(next_hyps.items(), - key=lambda x: log_add(list(x[1])), - reverse=True) - cur_hyps = next_hyps[:beam_size] - hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] - return hyps, encoder_out - - def ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[int]: - """ Apply CTC prefix beam search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[int]: CTC prefix beam search nbest results - """ - hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, - beam_size, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) - return hyps[0] - - def attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - ctc_weight: float = 0.0, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ) -> List[int]: - """ Apply attention rescoring decoding, CTC prefix beam search - is applied first to get nbest, then we resoring the nbest on - attention decoder with corresponding encoder out - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - reverse_weight (float): right to left decoder weight - ctc_weight (float): ctc score weight - - Returns: - List[int]: Attention rescoring result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, speech_lengths, beam_size, decoding_chunk_size, - num_decoding_left_chunks, simulate_streaming) - - assert len(hyps) == beam_size - hyps_pad = pad_sequence([ - torch.tensor(hyp[0], device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp[0]): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp[0])][self.eos] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp[0]): - r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] - r_score += r_decoder_out[i][len(hyp[0])][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score += hyp[1] * ctc_weight - if score > best_score: - best_score = score - best_index = i - return hyps[best_index][0], best_score - - def load_hlg_resource_if_necessary(self, hlg, word): - if not hasattr(self, 'hlg'): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.hlg = k2.Fsa.from_dict(torch.load(hlg, map_location=device)) - if not hasattr(self.hlg, "lm_scores"): - self.hlg.lm_scores = self.hlg.scores.clone() - if not hasattr(self, 'word_table'): - self.word_table = {} - with open(word, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - self.word_table[int(arr[1])] = arr[0] - - @torch.no_grad() - def hlg_onebest( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - best_path = one_best_decoding(lattice=lattice, use_double_scores=True) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.no_grad() - def hlg_rescore( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - lm_scale: float = 0, - decoder_scale: float = 0, - r_decoder_scale: float = 0, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - device = speech.device - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - 
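The rescoring rule in `attention_rescoring` reduces to a weighted sum of three log-scores per hypothesis: the left-to-right decoder score, an optional right-to-left decoder score, and the CTC prefix score carried over from beam search. A framework-free restatement of just that combination follows; the function name and the example numbers are illustrative only.

```python
def combine_rescoring_score(attn_score: float,
                            r_attn_score: float,
                            ctc_score: float,
                            ctc_weight: float,
                            reverse_weight: float) -> float:
    """Rank score for one n-best hypothesis in attention rescoring.

    attn_score / r_attn_score are summed log-probs from the left-to-right and
    right-to-left decoders (the latter only matters with a bidirectional
    decoder); ctc_score is the prefix beam search score of the hypothesis.
    """
    score = attn_score
    if reverse_weight > 0:
        score = attn_score * (1 - reverse_weight) + r_attn_score * reverse_weight
    return score + ctc_weight * ctc_score

# Example: pick the best of three (attn, r_attn, ctc) score triples.
hyps = [(-12.3, -13.1, -20.5), (-11.8, -14.0, -22.0), (-13.0, -12.7, -19.9)]
best = max(range(len(hyps)),
           key=lambda i: combine_rescoring_score(*hyps[i], ctc_weight=0.5, reverse_weight=0.3))
```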
search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=100, - use_double_scores=True, - nbest_scale=0.5,) - nbest = nbest.intersect(lattice) - assert hasattr(nbest.fsa, "lm_scores") - assert hasattr(nbest.fsa, "tokens") - assert isinstance(nbest.fsa.tokens, torch.Tensor) - - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.tokens) - tokens = tokens.remove_values_leq(0) - hyps = tokens.tolist() - - # cal attention_score - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out_repeat = [] - tot_scores = nbest.tot_scores() - repeats = [tot_scores[i].shape[0] for i in range(tot_scores.dim0)] - for i in range(len(encoder_out)): - encoder_out_repeat.append(encoder_out[i: i + 1].repeat(repeats[i], 1, 1)) - encoder_out = torch.concat(encoder_out_repeat, dim=0) - encoder_mask = torch.ones(encoder_out.size(0), - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - reverse_weight = 0.5 - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out - - decoder_scores = torch.tensor([sum([decoder_out[i, j, hyps[i][j]] - for j in range(len(hyps[i]))]) - for i in range(len(hyps))], device=device) - r_decoder_scores = [] - for i in range(len(hyps)): - score = 0 - for j in range(len(hyps[i])): - score += r_decoder_out[i, len(hyps[i]) - j - 1, hyps[i][j]] - score += r_decoder_out[i, len(hyps[i]), self.eos] - r_decoder_scores.append(score) - r_decoder_scores = torch.tensor(r_decoder_scores, device=device) - - am_scores = nbest.compute_am_scores() - ngram_lm_scores = nbest.compute_lm_scores() - tot_scores = am_scores.values + lm_scale * ngram_lm_scores.values + \ - decoder_scale * decoder_scores + r_decoder_scale * r_decoder_scores - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.jit.export - def subsampling_rate(self) -> int: - """ Export interface for c++ call, return subsampling_rate of the - model - """ - return self.encoder.embed.subsampling_rate - - @torch.jit.export - def right_context(self) -> int: - """ Export interface for c++ call, return right_context of the model - """ - return self.encoder.embed.right_context - - @torch.jit.export - def sos_symbol(self) -> int: - """ Export interface for c++ call, return sos symbol id of the model - """ - return self.sos - - @torch.jit.export - def eos_symbol(self) -> int: - """ Export interface for c++ call, return eos symbol id of the model - """ - return self.eos - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, give input chunk xs, and return - output from time 0 to current chunk. - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor: - """ Export interface for c++ call, apply linear transform and log - softmax before ctc - Args: - xs (torch.Tensor): encoder output - - Returns: - torch.Tensor: activation before ctc - - """ - return self.ctc.log_softmax(xs) - - @torch.jit.export - def is_bidirectional_decoder(self) -> bool: - """ - Returns: - torch.Tensor: decoder output - """ - if hasattr(self.decoder, 'right_decoder'): - return True - else: - return False - - @torch.jit.export - def forward_attention_decoder( - self, - hyps: torch.Tensor, - hyps_lens: torch.Tensor, - encoder_out: torch.Tensor, - reverse_weight: float = 0, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, forward decoder with multiple - hypothesis from ctc prefix beam search and one encoder output - Args: - hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad sos at the begining - hyps_lens (torch.Tensor): length of each hyp in hyps - encoder_out (torch.Tensor): corresponding encoder output - r_hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad eos at the begining which is used fo right to left decoder - reverse_weight: used for verfing whether used right to left decoder, - > 0 will use. - - Returns: - torch.Tensor: decoder output - """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps - encoder_out = encoder_out.repeat(num_hyps, 1, 1) - encoder_mask = torch.ones(num_hyps, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=encoder_out.device) - - # input for right to left decoder - # this hyps_lens has count token, we need minus it. - r_hyps_lens = hyps_lens - 1 - # this hyps has included token, so it should be - # convert the original hyps. - r_hyps = hyps[:, 1:] - # >>> r_hyps - # >>> tensor([[ 1, 2, 3], - # >>> [ 9, 8, 4], - # >>> [ 2, -1, -1]]) - # >>> r_hyps_lens - # >>> tensor([3, 3, 1]) - - # NOTE(Mddct): `pad_sequence` is not supported by ONNX, it is used - # in `reverse_pad_list` thus we have to refine the below code. 
- # Issue: https://github.com/wenet-e2e/wenet/issues/1113 - # Equal to: - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = torch.max(r_hyps_lens) - index_range = torch.arange(0, max_len, 1).to(encoder_out.device) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - # >>> seq_mask - # >>> tensor([[ True, True, True], - # >>> [ True, True, True], - # >>> [ True, False, False]]) - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - r_hyps = torch.gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = torch.where(seq_mask, r_hyps, self.eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - r_hyps = torch.cat([hyps[:, 0:1], r_hyps], dim=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps, hyps_lens, r_hyps, - reverse_weight) # (num_hyps, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - - # right to left decoder may be not used during decoding process, - # which depends on reverse_weight param. - # r_dccoder_out will be 0.0, if reverse_weight is 0.0 - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - return decoder_out, r_decoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/attention.py deleted file mode 100644 index 6ee5e313edf2e88a844ce004c0f819b0bd3260f6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/attention.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple - -import torch -from torch import nn - - -class MultiHeadedAttention(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
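The index-gather construction above, which replaces `reverse_pad_list` so the graph stays ONNX-exportable, is easy to verify in isolation. Below is a small standalone reproduction of the same trick using the example values from the comments; the `sos=10` / `eos=11` ids and the helper name are illustrative assumptions.

```python
import torch

def reverse_hyps_for_r_decoder(hyps: torch.Tensor, hyps_lens: torch.Tensor,
                               sos: int, eos: int) -> torch.Tensor:
    """ONNX-friendly reversal of padded hypotheses (no pad_sequence needed)."""
    r_hyps = hyps[:, 1:]                      # drop the leading <sos>
    r_hyps_lens = hyps_lens - 1
    max_len = torch.max(r_hyps_lens)
    index_range = torch.arange(0, max_len, 1)
    seq_len_expand = r_hyps_lens.unsqueeze(1)
    seq_mask = seq_len_expand > index_range   # True on real tokens, False on padding
    index = (seq_len_expand - 1) - index_range
    index = index * seq_mask                  # clamp negative indices to 0
    r_hyps = torch.gather(r_hyps, 1, index)   # reverse each sequence in place
    r_hyps = torch.where(seq_mask, r_hyps, torch.tensor(eos))
    return torch.cat([hyps[:, 0:1], r_hyps], dim=1)

hyps = torch.tensor([[10, 1, 2, 3], [10, 9, 8, 4], [10, 2, 11, 11]])
lens = torch.tensor([4, 4, 2])
print(reverse_hyps_for_r_decoder(hyps, lens, sos=10, eos=11))
# tensor([[10,  3,  2,  1], [10,  4,  8,  9], [10,  2, 11, 11]])
```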
- - """ - def __init__(self, n_head: int, n_feat: int, dropout_rate: float): - """Construct an MultiHeadedAttention object.""" - super().__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward_qkv( - self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor, size - (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor, size - (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor, size - (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. 
- - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - 1.When applying cross attention between decoder and encoder, - the batch padding mask for input is in (#batch, 1, T) shape. - 2.When applying self attention of encoder, - the mask is in (#batch, T, T) shape. - 3.When applying self attention of decoder, - the mask is in (#batch, L, L) shape. - 4.If the different position in decoder see different block - of the encoder, such as Mocha, the passed in mask could be - in (#batch, L, T) shape. But there is no such case in current - Wenet. - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - """ - q, k, v = self.forward_qkv(query, key, value) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. - new_cache = torch.cat((k, v), dim=-1) - - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - return self.forward_attention(v, scores, mask), new_cache - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). 
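The cache handling in `MultiHeadedAttention.forward` relies on the fact that concatenating a zero-length tensor along the time axis is a no-op, which keeps the exported graph free of data-dependent branches. A minimal demonstration of that key/value cache pattern, detached from the attention math (the helper name is illustrative):

```python
import torch

def update_kv_cache(k: torch.Tensor, v: torch.Tensor, cache: torch.Tensor):
    """Prepend cached keys/values along time and return the new cache.

    k, v:   (batch, head, time, d_k)
    cache:  (batch, head, cache_t, d_k * 2), where cache_t may be 0 for the 1st chunk
    """
    if cache.size(0) > 0:
        key_cache, value_cache = torch.split(cache, cache.size(-1) // 2, dim=-1)
        k = torch.cat([key_cache, k], dim=2)
        v = torch.cat([value_cache, v], dim=2)
    new_cache = torch.cat((k, v), dim=-1)
    return k, v, new_cache

# Concatenating an empty cache leaves k/v untouched, the property the export relies on.
k = torch.randn(1, 4, 16, 64)
v = torch.randn(1, 4, 16, 64)
empty = torch.zeros(1, 4, 0, 128)
k1, v1, cache1 = update_kv_cache(k, v, empty)
assert torch.equal(k1, k) and cache1.shape == (1, 4, 16, 128)
```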
- zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/cmvn.py deleted file mode 100644 index 3a1e7457fd3788d9a7e031e96517505a65925102..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/cmvn.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -class GlobalCMVN(torch.nn.Module): - def __init__(self, - mean: torch.Tensor, - istd: torch.Tensor, - norm_var: bool = True): - """ - Args: - mean (torch.Tensor): mean stats - istd (torch.Tensor): inverse std, std which is 1.0 / std - """ - super().__init__() - assert mean.shape == istd.shape - self.norm_var = norm_var - # The buffer can be accessed from this module using self.mean - self.register_buffer("mean", mean) - self.register_buffer("istd", istd) - - def forward(self, x: torch.Tensor): - """ - Args: - x (torch.Tensor): (batch, max_len, feat_dim) - - Returns: - (torch.Tensor): normalized feature - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/convolution.py deleted file mode 100644 index 2cf9794e14ea7441ccd30ab52202ac02fb25c2b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/convolution.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
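The scoring step at the end of `RelPositionMultiHeadedAttention.forward` is the Transformer-XL formulation with the `rel_shift` term deliberately dropped: a content term and a position term are computed separately and summed before scaling. A compact restatement of just that score, with randomly initialised tensors standing in for the projected inputs (shapes follow the comments in the original code):

```python
import math
import torch

batch, head, d_k, t1, t2 = 2, 4, 64, 10, 10
q = torch.randn(batch, t1, head, d_k)        # queries before the transpose in forward_qkv
k = torch.randn(batch, head, t2, d_k)        # projected keys
p = torch.randn(batch, head, t2, d_k)        # projected relative position embeddings
pos_bias_u = torch.randn(head, d_k)          # learned bias for the content term (matrices a, c)
pos_bias_v = torch.randn(head, d_k)          # learned bias for the position term (matrices b, d)

q_with_bias_u = (q + pos_bias_u).transpose(1, 2)              # (batch, head, t1, d_k)
q_with_bias_v = (q + pos_bias_v).transpose(1, 2)

matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))  # content-content / content-pos
matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))  # pos-content / pos-pos
scores = (matrix_ac + matrix_bd) / math.sqrt(d_k)             # (batch, head, t1, t2)
```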
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). 
- """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. - new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/ctc.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/ctc.py deleted file mode 100644 index 3dfcbaa324ffc26afa9ceaeb75007eb312546326..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/ctc.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -import torch -import torch.nn.functional as F -from typeguard import check_argument_types - - -class CTC(torch.nn.Module): - """CTC module""" - def __init__( - self, - odim: int, - encoder_output_size: int, - dropout_rate: float = 0.0, - reduce: bool = True, - ): - """ Construct CTC module - Args: - odim: dimension of outputs - encoder_output_size: number of encoder projection units - dropout_rate: dropout rate (0.0 ~ 1.0) - reduce: reduce the CTC loss into a scalar - """ - assert check_argument_types() - super().__init__() - eprojs = encoder_output_size - self.dropout_rate = dropout_rate - self.ctc_lo = torch.nn.Linear(eprojs, odim) - - reduction_type = "sum" if reduce else "none" - self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) - - def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, - ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: - """Calculate CTC loss. 
- - Args: - hs_pad: batch of padded hidden state sequences (B, Tmax, D) - hlens: batch of lengths of hidden state sequences (B) - ys_pad: batch of padded character id sequence tensor (B, Lmax) - ys_lens: batch of lengths of character sequence (B) - """ - # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) - ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) - # ys_hat: (B, L, D) -> (L, B, D) - ys_hat = ys_hat.transpose(0, 1) - ys_hat = ys_hat.log_softmax(2) - loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) - # Batch-size average - loss = loss / ys_hat.size(1) - return loss - - def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """log_softmax of frame activations - - Args: - Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) - """ - return F.log_softmax(self.ctc_lo(hs_pad), dim=2) - - def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """argmax of frame activations - - Args: - torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: argmax applied 2d tensor (B, Tmax) - """ - return torch.argmax(self.ctc_lo(hs_pad), dim=2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/decoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/decoder.py deleted file mode 100644 index c31853d9e868c99290b8d597f53d9a680202c82c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/decoder.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Decoder definition.""" -from typing import Tuple, List, Optional - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.decoder_layer import DecoderLayer -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.utils.mask import (subsequent_mask, make_pad_mask) - - -class TransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. 
- concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__() - attention_dim = encoder_output_size - - if input_layer == "embed": - self.embed = torch.nn.Sequential( - torch.nn.Embedding(vocab_size, attention_dim), - PositionalEncoding(attention_dim, positional_dropout_rate), - ) - else: - raise ValueError(f"only 'embed' is supported: {input_layer}") - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) - self.use_output_layer = use_output_layer - self.output_layer = torch.nn.Linear(attention_dim, vocab_size) - self.num_blocks = num_blocks - self.decoders = torch.nn.ModuleList([ - DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(self.num_blocks) - ]) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor = torch.empty(0), - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: not used in transformer decoder, in order to unify api - with bidirectional decoder - reverse_weight: not used in transformer decoder, in order to unify - api with bidirectional decode - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - torch.tensor(0.0), in order to unify api with bidirectional decoder - olens: (batch, ) - """ - tgt = ys_in_pad - maxlen = tgt.size(1) - # tgt_mask: (B, 1, L) - tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) - tgt_mask = tgt_mask.to(tgt.device) - # m: (1, L, L) - m = subsequent_mask(tgt_mask.size(-1), - device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m - x, _ = self.embed(tgt) - for layer in self.decoders: - x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, - memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.use_output_layer: - x = self.output_layer(x) - olens = tgt_mask.sum(1) - return x, torch.tensor(0.0), olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. 
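The mask construction at the top of `TransformerDecoder.forward` combines a padding mask with a causal (subsequent) mask before the decoder layers run. A framework-free sketch of the same combination, with the two WeNet mask helpers inlined for clarity; the helper names here are illustrative stand-ins, not the originals.

```python
import torch

def make_non_pad_mask(lengths: torch.Tensor, max_len: int) -> torch.Tensor:
    """(batch, 1, max_len): True on real tokens, False on padding."""
    seq = torch.arange(max_len, device=lengths.device)
    return (seq.unsqueeze(0) < lengths.unsqueeze(1)).unsqueeze(1)

def subsequent_mask(size: int, device=torch.device('cpu')) -> torch.Tensor:
    """(1, size, size): lower-triangular causal mask."""
    return torch.tril(torch.ones(size, size, dtype=torch.bool, device=device)).unsqueeze(0)

ys_in_lens = torch.tensor([4, 2])
maxlen = 4
tgt_mask = make_non_pad_mask(ys_in_lens, maxlen)      # (B, 1, L)
tgt_mask = tgt_mask & subsequent_mask(maxlen)         # (B, L, L) after broadcasting
# Row t of each (L, L) slice marks which positions token t may attend to.
print(tgt_mask[1].int())
# tensor([[1, 0, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 0, 0]])
```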
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - x, _ = self.embed(tgt) - new_cache = [] - for i, decoder in enumerate(self.decoders): - if cache is None: - c = None - else: - c = cache[i] - x, tgt_mask, memory, memory_mask = decoder(x, - tgt_mask, - memory, - memory_mask, - cache=c) - new_cache.append(x) - if self.normalize_before: - y = self.after_norm(x[:, -1]) - else: - y = x[:, -1] - if self.use_output_layer: - y = torch.log_softmax(self.output_layer(y), dim=-1) - return y, new_cache - - -class BiTransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - r_num_blocks: the number of right to left decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - r_num_blocks: int = 0, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - - assert check_argument_types() - super().__init__() - self.left_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - self.right_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - r_num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor, - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), - used for right to left decoder - reverse_weight: used for right to left decoder - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - r_x: x: decoded token score (right to left decoder) - before softmax (batch, maxlen_out, vocab_size) - if use_output_layer is True, - olens: (batch, ) - """ - l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, - ys_in_lens) - r_x = torch.tensor(0.0) - if reverse_weight > 0.0: - r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, - ys_in_lens) - return l_x, r_x, olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - return self.left_decoder.forward_one_step(memory, memory_mask, tgt, - tgt_mask, cache) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/decoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/decoder_layer.py deleted file mode 100644 index 6b52aa6ab730dc51b18f0787e8236ab10c1e9cad..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/decoder_layer.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoder self-attention layer definition.""" -from typing import Optional, Tuple - -import torch -from torch import nn - - -class DecoderLayer(nn.Module): - """Single decoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn (torch.nn.Module): Inter-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. 
- dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's inpu - and output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: nn.Module, - src_attn: nn.Module, - feed_forward: nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an DecoderLayer object.""" - super().__init__() - self.size = size - self.self_attn = self_attn - self.src_attn = src_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.norm3 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear1 = nn.Linear(size + size, size) - self.concat_linear2 = nn.Linear(size + size, size) - else: - self.concat_linear1 = nn.Identity() - self.concat_linear2 = nn.Identity() - - def forward( - self, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - memory: torch.Tensor, - memory_mask: torch.Tensor, - cache: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute decoded features. - - Args: - tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). - tgt_mask (torch.Tensor): Mask for input tensor - (#batch, maxlen_out). - memory (torch.Tensor): Encoded memory - (#batch, maxlen_in, size). - memory_mask (torch.Tensor): Encoded memory mask - (#batch, maxlen_in). - cache (torch.Tensor): cached tensors. - (#batch, maxlen_out - 1, size). - - Returns: - torch.Tensor: Output tensor (#batch, maxlen_out, size). - torch.Tensor: Mask for output tensor (#batch, maxlen_out). - torch.Tensor: Encoded memory (#batch, maxlen_in, size). - torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
- - """ - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - - if cache is None: - tgt_q = tgt - tgt_q_mask = tgt_mask - else: - # compute only the last frame query keeping dim: max_time_out -> 1 - assert cache.shape == ( - tgt.shape[0], - tgt.shape[1] - 1, - self.size, - ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" - tgt_q = tgt[:, -1:, :] - residual = residual[:, -1:, :] - tgt_q_mask = tgt_mask[:, -1:, :] - - if self.concat_after: - tgt_concat = torch.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) - x = residual + self.concat_linear1(tgt_concat) - else: - x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - if self.concat_after: - x_concat = torch.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) - x = residual + self.concat_linear2(x_concat) - else: - x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) - if not self.normalize_before: - x = self.norm2(x) - - residual = x - if self.normalize_before: - x = self.norm3(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm3(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, tgt_mask, memory, memory_mask diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/embedding.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/embedding.py deleted file mode 100644 index 611a927864d93c3ad8357f66c780bf537b2a4d67..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/embedding.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Positonal Encoding Module.""" - -import math -from typing import Tuple, Union - -import torch -import torch.nn.functional as F - -class PositionalEncoding(torch.nn.Module): - """Positional encoding. 
- - :param int d_model: embedding dim - :param float dropout_rate: dropout rate - :param int max_len: maximum input length - - PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) - PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) - """ - def __init__(self, - d_model: int, - dropout_rate: float, - max_len: int = 5000, - reverse: bool = False): - """Construct an PositionalEncoding object.""" - super().__init__() - self.d_model = d_model - self.xscale = math.sqrt(self.d_model) - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - - self.pe = torch.zeros(self.max_len, self.d_model) - position = torch.arange(0, self.max_len, - dtype=torch.float32).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = torch.sin(position * div_term) - self.pe[:, 1::2] = torch.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input. Its shape is (batch, time, ...) - offset (int, torch.tensor): position offset - - Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) - torch.Tensor: for compatibility to RelPositionalEncoding - """ - - self.pe = self.pe.to(x.device) - pos_emb = self.position_encoding(offset, x.size(1), False) - x = x * self.xscale + pos_emb - return self.dropout(x), self.dropout(pos_emb) - - def position_encoding(self, offset: Union[int, torch.Tensor], size: int, - apply_dropout: bool = True) -> torch.Tensor: - """ For getting encoding in a streaming fashion - - Attention!!!!! - we apply dropout only once at the whole utterance level in a none - streaming way, but will call this function several times with - increasing input size in a streaming scenario, so the dropout will - be applied several times. - - Args: - offset (int or torch.tensor): start offset - size (int): required size of position encoding - - Returns: - torch.Tensor: Corresponding encoding - """ - # How to subscript a Union type: - # https://github.com/pytorch/pytorch/issues/69434 - if isinstance(offset, int): - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - else: # for batched streaming decoding on GPU - assert torch.max(offset) + size < self.max_len - index = offset.unsqueeze(1) + \ - torch.arange(0, size).to(offset.device) # B X T - flag = index > 0 - # remove negative offset - index = index * flag - pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model - - if apply_dropout: - pos_emb = self.dropout(pos_emb) - return pos_emb - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding module. - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - """ - def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): - """Initialize class.""" - super().__init__(d_model, dropout_rate, max_len, reverse=True) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Compute positional encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). 
- Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Positional embedding tensor (1, time, `*`). - """ - self.pe = self.pe.to(x.device) - x = x * self.xscale - pos_emb = self.position_encoding(offset, x.size(1), False) - return self.dropout(x), self.dropout(pos_emb) - - -class NoPositionalEncoding(torch.nn.Module): - """ No position encoding - """ - def __init__(self, d_model: int, dropout_rate: float): - super().__init__() - self.d_model = d_model - self.dropout = torch.nn.Dropout(p=dropout_rate) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """ Just return zero vector for interface compatibility - """ - pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) - return self.dropout(x), pos_emb - - def position_encoding( - self, offset: Union[int, torch.Tensor], size: int) -> torch.Tensor: - return torch.zeros(1, size, self.d_model) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/encoder.py deleted file mode 100644 index bb2ec65827548bd1242cb3b367cb3983c2de6119..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/encoder.py +++ /dev/null @@ -1,462 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
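# Illustrative sketch (not from the wenet sources above): the PositionalEncoding
# class deleted in embedding.py precomputes a (1, max_len, d_model) sinusoidal table
# and adds a slice of it, starting at `offset`, to the input scaled by sqrt(d_model).
# The snippet rebuilds that table standalone and checks one entry against the
# closed-form PE(pos, 2i) = sin(pos / 10000^(2i/d_model)); all sizes are toy values.
import math
import torch

d_model, max_len = 8, 100
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)       # (max_len, 1)
div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32)
                     * -(math.log(10000.0) / d_model))                      # (d_model/2,)
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0)                                                        # (1, max_len, d_model)

pos, i = 5, 2                                                               # arbitrary entry
expected = math.sin(pos / (10000 ** (2 * i / d_model)))
assert abs(pe[0, pos, 2 * i].item() - expected) < 1e-6

# Adding the encoding to a batch, as PositionalEncoding.forward does (dropout omitted):
x = torch.randn(2, 10, d_model)                                             # (batch, time, d_model)
offset = 0
out = x * math.sqrt(d_model) + pe[:, offset:offset + x.size(1)]
print(out.shape)                                                            # torch.Size([2, 10, 8])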
-# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder definition.""" -from typing import Tuple - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.convolution import ConvolutionModule -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.encoder_layer import TransformerEncoderLayer -from wenet.transformer.encoder_layer import ConformerEncoderLayer -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class BaseEncoder(torch.nn.Module): - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ - Args: - input_size (int): input dim - output_size (int): dimension of attention - attention_heads (int): the number of heads of multi head attention - linear_units (int): the hidden units number of position-wise feed - forward - num_blocks (int): the number of decoder blocks - dropout_rate (float): dropout rate - attention_dropout_rate (float): dropout rate in attention - positional_dropout_rate (float): dropout rate after adding - positional encoding - input_layer (str): input layer type. - optional [linear, conv2d, conv2d6, conv2d8] - pos_enc_layer_type (str): Encoder positional encoding layer type. - opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] - normalize_before (bool): - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after (bool): whether to concat attention layer's input - and output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - static_chunk_size (int): chunk size for static chunk training and - decoding - use_dynamic_chunk (bool): whether use dynamic chunk size for - training or not, You can only use fixed chunk(chunk_size > 0) - or dyanmic chunk size(use_dynamic_chunk = True) - global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module - use_dynamic_left_chunk (bool): whether use dynamic left chunk in - dynamic chunk training - """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
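# Illustrative sketch (not from the wenet sources above): a minimal reconstruction of
# the chunk-style attention mask that add_optional_chunk_mask() applies for streaming
# training/decoding, assuming a fixed chunk_size and num_left_chunks. Frame t may
# attend to every frame in its own chunk and in up to `num_left_chunks` previous
# chunks; the real wenet helper also handles random dynamic chunks and padding.
import torch

def chunk_mask(size: int, chunk_size: int, num_left_chunks: int = -1) -> torch.Tensor:
    """Return a (size, size) boolean mask; True means "may attend"."""
    mask = torch.zeros(size, size, dtype=torch.bool)
    for t in range(size):
        if num_left_chunks < 0:
            start = 0                                        # use all left chunks
        else:
            start = max((t // chunk_size - num_left_chunks) * chunk_size, 0)
        end = min((t // chunk_size + 1) * chunk_size, size)  # end of current chunk
        mask[t, start:end] = True
    return mask

print(chunk_mask(6, chunk_size=2, num_left_chunks=1).int())
# tensor([[1, 1, 0, 0, 0, 0],
#         [1, 1, 0, 0, 0, 0],
#         [1, 1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 0, 0],
#         [0, 0, 1, 1, 1, 1],
#         [0, 0, 1, 1, 1, 1]])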
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
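# Illustrative sketch (not from the wenet sources above): BaseEncoder.forward()
# derives its (B, 1, T) attention mask from the raw lengths with
# `~make_pad_mask(xs_lens, T)`. A standalone equivalent, assuming make_pad_mask
# marks padded frames with True:
import torch

def pad_mask(lengths: torch.Tensor, max_len: int) -> torch.Tensor:
    """True at padded positions, shape (B, max_len)."""
    idx = torch.arange(max_len, device=lengths.device)        # (max_len,)
    return idx.unsqueeze(0) >= lengths.unsqueeze(1)           # broadcast to (B, max_len)

xs_lens = torch.tensor([4, 2, 3])
masks = ~pad_mask(xs_lens, max_len=4)                         # True = valid frame
masks = masks.unsqueeze(1)                                    # (B, 1, T), as in the encoder
print(masks.int().squeeze(1))
# tensor([[1, 1, 1, 1],
#         [1, 1, 0, 0],
#         [1, 1, 1, 0]])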
- - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks - - -class TransformerEncoder(BaseEncoder): - """Transformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ Construct TransformerEncoder - - See Encoder for the meaning of each parameter. 
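# Illustrative sketch (not from the wenet sources above): forward_chunk_by_chunk()
# slides an overlapping window over the raw feature frames instead of caching the
# subsampling module. The window arithmetic below uses the Conv2dSubsampling4 values
# (subsampling_rate=4, right_context=6) and a decoding chunk of 16 subsampled frames;
# the utterance length is only an example.
subsampling, right_context = 4, 6
decoding_chunk_size = 16                       # in subsampled frames
context = right_context + 1                    # current frame + right context, in raw frames
stride = subsampling * decoding_chunk_size     # raw frames consumed per chunk
decoding_window = (decoding_chunk_size - 1) * subsampling + context

num_frames = 200                               # toy utterance length (raw frames)
windows = []
for cur in range(0, num_frames - context + 1, stride):
    end = min(cur + decoding_window, num_frames)
    windows.append((cur, end))

print(stride, decoding_window)                 # 64 67
print(windows)                                 # [(0, 67), (64, 131), (128, 195), (192, 200)]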
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - self.encoders = torch.nn.ModuleList([ - TransformerEncoderLayer( - output_size, - MultiHeadedAttention(attention_heads, output_size, - attention_dropout_rate), - PositionwiseFeedForward(output_size, linear_units, - dropout_rate), dropout_rate, - normalize_before, concat_after) for _ in range(num_blocks) - ]) - - -class ConformerEncoder(BaseEncoder): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - positionwise_conv_kernel_size: int = 1, - macaron_style: bool = True, - selfattention_layer_type: str = "rel_selfattn", - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - ): - """Construct ConformerEncoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - positionwise_conv_kernel_size (int): Kernel size of positionwise - conv1d layer. - macaron_style (bool): Whether to use macaron style for - positionwise layer. - selfattention_layer_type (str): Encoder attention layer type, - the parameter has no effect now, it's just for configure - compatibility. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, - cnn_module_norm, causal) - - self.encoders = torch.nn.ModuleList([ - ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(num_blocks) - ]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/encoder_layer.py deleted file mode 100644 index 6b4629a6802a90422fa1494f82f46488f2553c16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/encoder_layer.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple - -import torch -from torch import nn - - -class TransformerEncoderLayer(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: torch.nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): just for interface compatibility - to ConformerEncoderLayer - mask_pad (torch.Tensor): does not used in transformer layer, - just for unified api with conformer. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2), not used here, it's for interface - compatibility to ConformerEncoderLayer. - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). - - """ - residual = x - if self.normalize_before: - x = self.norm1(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, cache=att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - return x, mask, new_att_cache, fake_cnn_cache - - -class ConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
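# Illustrative sketch (not from the wenet sources above): the ConformerEncoderLayer
# applies its sub-modules in macaron order, half-step FFN -> self-attention ->
# convolution -> half-step FFN -> final LayerNorm, each with a pre-norm residual and
# ff_scale = 0.5. The toy block below reproduces only that residual wiring; the FFN,
# attention and convolution here are simple stand-ins, not the real wenet modules,
# and dropout/caches/masks are omitted.
import torch
from torch import nn

class ToyMacaronBlock(nn.Module):
    def __init__(self, size: int, hidden: int):
        super().__init__()
        self.ffn1 = nn.Sequential(nn.Linear(size, hidden), nn.SiLU(), nn.Linear(hidden, size))
        self.ffn2 = nn.Sequential(nn.Linear(size, hidden), nn.SiLU(), nn.Linear(hidden, size))
        self.attn = nn.MultiheadAttention(size, num_heads=4, batch_first=True)
        self.conv = nn.Conv1d(size, size, kernel_size=15, padding=7, groups=size)  # depthwise
        self.norm_ff1, self.norm_mha = nn.LayerNorm(size), nn.LayerNorm(size)
        self.norm_conv, self.norm_ff2 = nn.LayerNorm(size), nn.LayerNorm(size)
        self.norm_final = nn.LayerNorm(size)
        self.ff_scale = 0.5                                    # macaron half-step FFNs

    def forward(self, x: torch.Tensor) -> torch.Tensor:       # x: (batch, time, size)
        x = x + self.ff_scale * self.ffn1(self.norm_ff1(x))   # first half FFN
        y = self.norm_mha(x)
        x = x + self.attn(y, y, y, need_weights=False)[0]     # self-attention
        y = self.norm_conv(x).transpose(1, 2)                  # (B, size, time) for Conv1d
        x = x + self.conv(y).transpose(1, 2)                   # convolution module
        x = x + self.ff_scale * self.ffn2(self.norm_ff2(x))   # second half FFN
        return self.norm_final(x)

block = ToyMacaronBlock(size=64, hidden=256)
print(block(torch.randn(2, 30, 64)).shape)                     # torch.Size([2, 30, 64])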
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/label_smoothing_loss.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/label_smoothing_loss.py deleted file mode 100644 index 428fedcb0eb4345cd1361c97008a9afcd94ac171..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/label_smoothing_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Label smoothing module.""" - -import torch -from torch import nn - - -class LabelSmoothingLoss(nn.Module): - """Label-smoothing loss. - - In a standard CE loss, the label's data distribution is: - [0,1,2] -> - [ - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0], - ] - - In the smoothing version CE Loss,some probabilities - are taken from the true label prob (1.0) and are divided - among other labels. - - e.g. 
- smoothing=0.1 - [0,1,2] -> - [ - [0.9, 0.05, 0.05], - [0.05, 0.9, 0.05], - [0.05, 0.05, 0.9], - ] - - Args: - size (int): the number of class - padding_idx (int): padding class id which will be ignored for loss - smoothing (float): smoothing rate (0.0 means the conventional CE) - normalize_length (bool): - normalize loss by sequence length if True - normalize loss by batch size if False - """ - def __init__(self, - size: int, - padding_idx: int, - smoothing: float, - normalize_length: bool = False): - """Construct an LabelSmoothingLoss object.""" - super(LabelSmoothingLoss, self).__init__() - self.criterion = nn.KLDivLoss(reduction="none") - self.padding_idx = padding_idx - self.confidence = 1.0 - smoothing - self.smoothing = smoothing - self.size = size - self.normalize_length = normalize_length - - def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """Compute loss between x and target. - - The model outputs and data labels tensors are flatten to - (batch*seqlen, class) shape and a mask is applied to the - padding part which should not be calculated for loss. - - Args: - x (torch.Tensor): prediction (batch, seqlen, class) - target (torch.Tensor): - target signal masked with self.padding_id (batch, seqlen) - Returns: - loss (torch.Tensor) : The KL loss, scalar float value - """ - assert x.size(2) == self.size - batch_size = x.size(0) - x = x.view(-1, self.size) - target = target.view(-1) - # use zeros_like instead of torch.no_grad() for true_dist, - # since no_grad() can not be exported by JIT - true_dist = torch.zeros_like(x) - true_dist.fill_(self.smoothing / (self.size - 1)) - ignore = target == self.padding_idx # (B,) - total = len(target) - ignore.sum().item() - target = target.masked_fill(ignore, 0) # avoid -1 index - true_dist.scatter_(1, target.unsqueeze(1), self.confidence) - kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) - denom = total if self.normalize_length else batch_size - return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/positionwise_feed_forward.py deleted file mode 100644 index 73ba239e3f1e68f65650961f2c4ee6758729a06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/positionwise_feed_forward.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. 
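# Illustrative sketch (not from the wenet sources above): LabelSmoothingLoss replaces
# each one-hot row with confidence = 1 - smoothing on the true class and
# smoothing / (size - 1) on the rest, then measures KL divergence against the
# log-softmax outputs. Rebuilding the smoothed target for the docstring example
# (3 classes, smoothing = 0.1):
import torch

size, smoothing = 3, 0.1
target = torch.tensor([0, 1, 2])
true_dist = torch.full((target.size(0), size), smoothing / (size - 1))
true_dist.scatter_(1, target.unsqueeze(1), 1.0 - smoothing)
print(true_dist)
# tensor([[0.9000, 0.0500, 0.0500],
#         [0.0500, 0.9000, 0.0500],
#         [0.0500, 0.0500, 0.9000]])

# The loss itself is the masked, normalized KL divergence (padding handling omitted):
logits = torch.randn(target.size(0), size)
kl = torch.nn.KLDivLoss(reduction="none")(torch.log_softmax(logits, dim=1), true_dist)
loss = kl.sum() / target.size(0)   # here averaged over rows; wenet divides by batch or token count
print(loss.item())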
- dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU()): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/subsampling.py deleted file mode 100644 index 5f2823eedf0e623188d6af6680fa50ca44b47877..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/subsampling.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch - - -class BaseSubsampling(torch.nn.Module): - def __init__(self): - super().__init__() - self.right_context = 0 - self.subsampling_rate = 1 - - def position_encoding(self, offset: Union[int, torch.Tensor], - size: int) -> torch.Tensor: - return self.pos_enc.position_encoding(offset, size) - - -class LinearNoSubsampling(BaseSubsampling): - """Linear transform the input without subsampling - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an linear object.""" - super().__init__() - self.out = torch.nn.Sequential( - torch.nn.Linear(idim, odim), - torch.nn.LayerNorm(odim, eps=1e-5), - torch.nn.Dropout(dropout_rate), - ) - self.pos_enc = pos_enc_class - self.right_context = 0 - self.subsampling_rate = 1 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Input x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: linear input tensor (#batch, time', odim), - where time' = time . - torch.Tensor: linear input mask (#batch, 1, time'), - where time' = time . - - """ - x = self.out(x) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask - - -class Conv2dSubsampling4(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). 
- - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 4. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 4. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] - - -class Conv2dSubsampling6(BaseSubsampling): - """Convolutional 2D subsampling (to 1/6 length). - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (torch.nn.Module): Custom position encoding layer. - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling6 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 5, 3), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), - odim) - self.pos_enc = pos_enc_class - # 10 = (3 - 1) * 1 + (5 - 1) * 2 - self.subsampling_rate = 6 - self.right_context = 10 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 6. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 6. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] - - -class Conv2dSubsampling8(BaseSubsampling): - """Convolutional 2D subsampling (to 1/8 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling8 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear( - odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) - self.pos_enc = pos_enc_class - self.subsampling_rate = 8 - # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 - self.right_context = 14 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 8. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 8. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/swish.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/swish.py deleted file mode 100644 index b4250f5c93104f38958d145572e363256e03fcb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/transformer/swish.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) -# 2020 Northwestern Polytechnical University (Pengcheng Guo) -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Swish() activation function for Conformer.""" - -import torch - - -class Swish(torch.nn.Module): - """Construct an Swish object.""" - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Return Swish activation function.""" - return x * torch.sigmoid(x) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/checkpoint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/checkpoint.py deleted file mode 100644 index 8e0c413c79c34cd667240357d7ef9eab816a885c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/checkpoint.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import re - -import yaml -import torch -from collections import OrderedDict - -import datetime - - -def load_checkpoint(model: torch.nn.Module, path: str) -> dict: - if torch.cuda.is_available(): - logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) - checkpoint = torch.load(path) - else: - logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) - checkpoint = torch.load(path, map_location='cpu') - model.load_state_dict(checkpoint, strict=False) - info_path = re.sub('.pt$', '.yaml', path) - configs = {} - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - return configs - - -def save_checkpoint(model: torch.nn.Module, path: str, infos=None): - ''' - Args: - infos (dict or None): any info you want to save. - ''' - logging.info('Checkpoint: save to checkpoint %s' % path) - if isinstance(model, torch.nn.DataParallel): - state_dict = model.module.state_dict() - elif isinstance(model, torch.nn.parallel.DistributedDataParallel): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save(state_dict, path) - info_path = re.sub('.pt$', '.yaml', path) - if infos is None: - infos = {} - infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') - with open(info_path, 'w') as fout: - data = yaml.dump(infos) - fout.write(data) - - -def filter_modules(model_state_dict, modules): - new_mods = [] - incorrect_mods = [] - mods_model = model_state_dict.keys() - for mod in modules: - if any(key.startswith(mod) for key in mods_model): - new_mods += [mod] - else: - incorrect_mods += [mod] - if incorrect_mods: - logging.warning( - "module(s) %s don't match or (partially match) " - "available modules in model.", - incorrect_mods, - ) - logging.warning("for information, the existing modules in model are:") - logging.warning("%s", mods_model) - - return new_mods - - -def load_trained_modules(model: torch.nn.Module, args: None): - # Load encoder modules with pre-trained model(s). 
- enc_model_path = args.enc_init - enc_modules = args.enc_init_mods - main_state_dict = model.state_dict() - logging.warning("model(s) found for pre-initialization") - if os.path.isfile(enc_model_path): - logging.info('Checkpoint: loading from checkpoint %s for CPU' % - enc_model_path) - model_state_dict = torch.load(enc_model_path, map_location='cpu') - modules = filter_modules(model_state_dict, enc_modules) - partial_state_dict = OrderedDict() - for key, value in model_state_dict.items(): - if any(key.startswith(m) for m in modules): - partial_state_dict[key] = value - main_state_dict.update(partial_state_dict) - else: - logging.warning("model was not found : %s", enc_model_path) - - model.load_state_dict(main_state_dict) - configs = {} - return configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/cmvn.py deleted file mode 100644 index 3101c619f54991c947124f393f3459c317356a2f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/cmvn.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
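# Illustrative sketch (not from the wenet sources above): save_checkpoint() and
# load_checkpoint() pair every `foo.pt` weights file with a `foo.yaml` side file
# holding metadata (epoch, cv loss, save time, ...). A minimal round trip with a toy
# model; the paths and metadata values here are purely illustrative, and PyYAML is
# assumed to be installed.
import os
import re
import tempfile
import yaml
import torch

model = torch.nn.Linear(4, 2)
with tempfile.TemporaryDirectory() as work_dir:
    path = os.path.join(work_dir, "epoch_1.pt")
    torch.save(model.state_dict(), path)                         # weights
    with open(re.sub(r"\.pt$", ".yaml", path), "w") as fout:      # metadata side file
        fout.write(yaml.dump({"epoch": 1, "cv_loss": 3.21}))

    # Loading mirrors load_checkpoint(): weights into the module, yaml into a dict.
    model.load_state_dict(torch.load(path, map_location="cpu"), strict=False)
    with open(re.sub(r"\.pt$", ".yaml", path)) as fin:
        infos = yaml.safe_load(fin)
    print(infos)                                                  # {'cv_loss': 3.21, 'epoch': 1}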
- -import json -import math - -import numpy as np - - -def _load_json_cmvn(json_cmvn_file): - """ Load the json format cmvn stats file and calculate cmvn - - Args: - json_cmvn_file: cmvn stats file in json format - - Returns: - a numpy array of [means, vars] - """ - with open(json_cmvn_file) as f: - cmvn_stats = json.load(f) - - means = cmvn_stats['mean_stat'] - variance = cmvn_stats['var_stat'] - count = cmvn_stats['frame_num'] - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def _load_kaldi_cmvn(kaldi_cmvn_file): - """ Load the kaldi format cmvn stats file and calculate cmvn - - Args: - kaldi_cmvn_file: kaldi text style global cmvn file, which - is generated by: - compute-cmvn-stats --binary=false scp:feats.scp global_cmvn - - Returns: - a numpy array of [means, vars] - """ - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def load_cmvn(cmvn_file, is_json): - if is_json: - cmvn = _load_json_cmvn(cmvn_file) - else: - cmvn = _load_kaldi_cmvn(cmvn_file) - return cmvn[0], cmvn[1] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/common.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/common.py deleted file mode 100644 index 74238d59aefbf227fe6b811703af17550bc7f8f0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/common.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -"""Unility functions for Transformer.""" - -import math -from typing import List, Tuple - -import torch -from torch.nn.utils.rnn import pad_sequence - -IGNORE_ID = -1 - - -def pad_list(xs: List[torch.Tensor], pad_value: int): - """Perform padding for the list of tensors. - - Args: - xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 
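# Illustrative sketch (not from the wenet sources above): _load_json_cmvn() turns
# accumulated statistics (sum, sum of squares, frame count) into a per-dimension mean
# and inverse standard deviation. The snippet reproduces that computation from raw
# toy frames and checks it against numpy; the GlobalCMVN module (not part of this
# file) would then normalize features as (x - mean) * istd.
import numpy as np

frames = np.random.randn(1000, 4) * 2.0 + 5.0              # toy features: 1000 frames, 4 dims
mean_stat = frames.sum(axis=0)
var_stat = (frames ** 2).sum(axis=0)
count = frames.shape[0]

mean = mean_stat / count
var = np.maximum(var_stat / count - mean ** 2, 1.0e-20)    # floor tiny variances, as above
istd = 1.0 / np.sqrt(var)

assert np.allclose(mean, frames.mean(axis=0))
assert np.allclose(istd, 1.0 / frames.std(axis=0))
normalized = (frames - mean) * istd
print(normalized.mean(axis=0).round(6), normalized.std(axis=0).round(6))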
- pad_value (float): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tmax, `*`). - - Examples: - >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) - - """ - n_batch = len(xs) - max_len = max([x.size(0) for x in xs]) - pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) - pad = pad.fill_(pad_value) - for i in range(n_batch): - pad[i, :xs[i].size(0)] = xs[i] - - return pad - - -def add_blank(ys_pad: torch.Tensor, blank: int, - ignore_id: int) -> torch.Tensor: - """ Prepad blank for transducer predictor - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - blank (int): index of - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> blank = 0 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in = add_blank(ys_pad, 0, -1) - >>> ys_in - tensor([[0, 1, 2, 3, 4, 5], - [0, 4, 5, 6, 0, 0], - [0, 7, 8, 9, 0, 0]]) - """ - bs = ys_pad.size(0) - _blank = torch.tensor([blank], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _blank = _blank.repeat(bs).unsqueeze(1) # [bs,1] - out = torch.cat([_blank, ys_pad], dim=1) # [bs, Lmax+1] - return torch.where(out == ignore_id, blank, out) - - -def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, - ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Add and labels. - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - sos (int): index of - eos (int): index of - ignore_id (int): index of padding - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - ys_out (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> sos_id = 10 - >>> eos_id = 11 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) - >>> ys_in - tensor([[10, 1, 2, 3, 4, 5], - [10, 4, 5, 6, 11, 11], - [10, 7, 8, 9, 11, 11]]) - >>> ys_out - tensor([[ 1, 2, 3, 4, 5, 11], - [ 4, 5, 6, 11, -1, -1], - [ 7, 8, 9, 11, -1, -1]]) - """ - _sos = torch.tensor([sos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _eos = torch.tensor([eos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys - ys_in = [torch.cat([_sos, y], dim=0) for y in ys] - ys_out = [torch.cat([y, _eos], dim=0) for y in ys] - return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) - - -def reverse_pad_list(ys_pad: torch.Tensor, - ys_lens: torch.Tensor, - pad_value: float = -1.0) -> torch.Tensor: - """Reverse padding for the list of tensors. - - Args: - ys_pad (tensor): The padded tensor (B, Tokenmax). - ys_lens (tensor): The lens of token seqs (B) - pad_value (int): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tokenmax). - - Examples: - >>> x - tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) - >>> pad_list(x, 0) - tensor([[4, 3, 2, 1], - [7, 6, 5, 0], - [9, 8, 0, 0]]) - - """ - r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) - for y, i in zip(ys_pad, ys_lens)], True, - pad_value) - return r_ys_pad - - -def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, - ignore_label: int) -> float: - """Calculate accuracy. - - Args: - pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 
- pad_targets (LongTensor): Target label tensors (B, Lmax). - ignore_label (int): Ignore label id. - - Returns: - float: Accuracy value (0.0 - 1.0). - - """ - pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), - pad_outputs.size(1)).argmax(2) - mask = pad_targets != ignore_label - numerator = torch.sum( - pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - denominator = torch.sum(mask) - return float(numerator) / float(denominator) - - -def get_rnn(rnn_type: str) -> torch.nn.Module: - assert rnn_type in ["rnn", "lstm", "gru"] - if rnn_type == "rnn": - return torch.nn.RNN - elif rnn_type == "lstm": - return torch.nn.LSTM - else: - return torch.nn.GRU - - -def get_activation(act): - """Return activation function.""" - # Lazy load to avoid unused import - from wenet.transformer.swish import Swish - - activation_funcs = { - "hardtanh": torch.nn.Hardtanh, - "tanh": torch.nn.Tanh, - "relu": torch.nn.ReLU, - "selu": torch.nn.SELU, - "swish": getattr(torch.nn, "SiLU", Swish), - "gelu": torch.nn.GELU - } - - return activation_funcs[act]() - - -def get_subsample(config): - input_layer = config["encoder_conf"]["input_layer"] - assert input_layer in ["conv2d", "conv2d6", "conv2d8"] - if input_layer == "conv2d": - return 4 - elif input_layer == "conv2d6": - return 6 - elif input_layer == "conv2d8": - return 8 - - -def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - if hyp[cur] != 0: - new_hyp.append(hyp[cur]) - prev = cur - while cur < len(hyp) and hyp[cur] == hyp[prev]: - cur += 1 - return new_hyp - - -def replace_duplicates_with_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - new_hyp.append(hyp[cur]) - prev = cur - cur += 1 - while cur < len(hyp) and hyp[cur] == hyp[prev] and hyp[cur] != 0: - new_hyp.append(0) - cur += 1 - return new_hyp - - -def log_add(args: List[int]) -> float: - """ - Stable log add - """ - if all(a == -float('inf') for a in args): - return -float('inf') - a_max = max(args) - lsp = math.log(sum(math.exp(a - a_max) for a in args)) - return a_max + lsp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/config.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/config.py deleted file mode 100644 index 50170ced44534d3ee6532a2f87fcd78c5148f7e7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 Shaoshang Qi -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
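# Illustrative sketch (not from the wenet sources above): remove_duplicates_and_blank()
# implements the standard CTC collapse rule (merge repeated ids, then drop the blank
# id 0), and log_add() is a numerically stable log-sum-exp. Both are easy to
# sanity-check with the equivalent helpers below.
import math
from typing import List

def ctc_collapse(ids: List[int], blank: int = 0) -> List[int]:
    out, prev = [], None
    for i in ids:
        if i != blank and i != prev:      # keep the first of each run, skip blanks
            out.append(i)
        prev = i
    return out

# frame-level argmax ids -> token ids (blank = 0)
print(ctc_collapse([0, 3, 3, 0, 0, 5, 5, 5, 0, 3]))          # [3, 5, 3]

def log_add(args: List[float]) -> float:
    if all(a == -float("inf") for a in args):
        return -float("inf")
    a_max = max(args)
    return a_max + math.log(sum(math.exp(a - a_max) for a in args))

print(round(log_add([math.log(0.25), math.log(0.25)]), 6))   # log(0.5) = -0.693147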
- - -import copy - -def override_config(configs, override_list): - new_configs = copy.deepcopy(configs) - for item in override_list: - arr = item.split() - if len(arr) != 2: - print(f"the overrive {item} format not correct, skip it") - continue - keys = arr[0].split('.') - s_configs = new_configs - for i, key in enumerate(keys): - if key not in s_configs: - print(f"the overrive {item} format not correct, skip it") - if i == len(keys) - 1: - param_type = type(s_configs[key]) - if param_type != bool: - s_configs[key] = param_type(arr[1]) - else: - s_configs[key] = arr[1] in ['true', 'True'] - print(f"override {arr[0]} with {arr[1]}") - else: - s_configs = s_configs[key] - return new_configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/ctc_util.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/ctc_util.py deleted file mode 100644 index 73b8fb272ac153dd6d05207f352ebcf1ad14890d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/ctc_util.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch - -def insert_blank(label, blank_id=0): - """Insert blank token between every two label token.""" - label = np.expand_dims(label, 1) - blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id - label = np.concatenate([blanks, label], axis=1) - label = label.reshape(-1) - label = np.append(label, label[0]) - return label - -def forced_align(ctc_probs: torch.Tensor, - y: torch.Tensor, - blank_id=0) -> list: - """ctc forced alignment. 
- - Args: - torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) - torch.Tensor y: id sequence tensor 1d tensor (L) - int blank_id: blank symbol index - Returns: - torch.Tensor: alignment result - """ - y_insert_blank = insert_blank(y, blank_id) - - log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) - log_alpha = log_alpha - float('inf') # log of zero - state_path = (torch.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 - ) # state path - - # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] - - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): - if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ - s] == y_insert_blank[s - 2]: - candidates = torch.tensor( - [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) - prev_state = [s, s - 1] - else: - candidates = torch.tensor([ - log_alpha[t - 1, s], - log_alpha[t - 1, s - 1], - log_alpha[t - 1, s - 2], - ]) - prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] - state_path[t, s] = prev_state[torch.argmax(candidates)] - - state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) - - candidates = torch.tensor([ - log_alpha[-1, len(y_insert_blank) - 1], - log_alpha[-1, len(y_insert_blank) - 2] - ]) - prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] - state_seq[-1] = prev_state[torch.argmax(candidates)] - for t in range(ctc_probs.size(0) - 2, -1, -1): - state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] - - output_alignment = [] - for t in range(0, ctc_probs.size(0)): - output_alignment.append(y_insert_blank[state_seq[t, 0]]) - - return output_alignment diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/executor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/executor.py deleted file mode 100644 index dc0b69e6e32055566a0e8c41945f6979276e5672..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/executor.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
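The `forced_align` helper in the `ctc_util.py` removed above is driven with per-frame CTC log-posteriors and a reference label sequence; a rough sketch with toy inputs (illustrative only; assumes the bundled `wenet` package is importable):

```python
import torch

from wenet.utils.ctc_util import forced_align

T, D = 6, 5                                        # 6 frames, vocab of 5 units (0 = blank)
ctc_probs = torch.randn(T, D).log_softmax(dim=-1)  # per-frame log-posteriors (T, D)
y = torch.tensor([2, 3, 2])                        # reference label sequence (L,)

# alignment[t] is the unit (blank or label) assigned to frame t; len(alignment) == T.
alignment = forced_align(ctc_probs, y, blank_id=0)
print(alignment)
```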
- -import logging -from contextlib import nullcontext - -# if your python version < 3.7 use the below one -# from contextlib import suppress as nullcontext -import torch -from torch.nn.utils import clip_grad_norm_ - - -class Executor: - - def __init__(self): - self.step = 0 - - def train(self, model, optimizer, scheduler, data_loader, device, writer, - args, scaler): - ''' Train one epoch - ''' - model.train() - clip = args.get('grad_clip', 50.0) - log_interval = args.get('log_interval', 10) - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - accum_grad = args.get('accum_grad', 1) - is_distributed = args.get('is_distributed', True) - use_amp = args.get('use_amp', False) - logging.info('using accumulate grad, new batch size is {} times' - ' larger than before'.format(accum_grad)) - if use_amp: - assert scaler is not None - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - model_context = model.join - else: - model_context = nullcontext - num_seen_utts = 0 - with model_context(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - context = None - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if is_distributed and batch_idx % accum_grad != 0: - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - with context(): - # autocast context - # The more details about amp can be found in - # https://pytorch.org/docs/stable/notes/amp_examples.html - with torch.cuda.amp.autocast(scaler is not None): - loss_dict = model(feats, feats_lengths, target, - target_lengths) - loss = loss_dict['loss'] / accum_grad - if use_amp: - scaler.scale(loss).backward() - else: - loss.backward() - - num_seen_utts += num_utts - if batch_idx % accum_grad == 0: - if rank == 0 and writer is not None: - writer.add_scalar('train_loss', loss, self.step) - # Use mixed precision training - if use_amp: - scaler.unscale_(optimizer) - grad_norm = clip_grad_norm_(model.parameters(), clip) - # Must invoke scaler.update() if unscale_() is used in - # the iteration to avoid the following error: - # RuntimeError: unscale_() has already been called - # on this optimizer since the last update(). - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). 
- scaler.step(optimizer) - scaler.update() - else: - grad_norm = clip_grad_norm_(model.parameters(), clip) - if torch.isfinite(grad_norm): - optimizer.step() - optimizer.zero_grad() - scheduler.step() - self.step += 1 - if batch_idx % log_interval == 0: - lr = optimizer.param_groups[0]['lr'] - log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, - loss.item() * accum_grad) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'lr {:.8f} rank {}'.format(lr, rank) - logging.debug(log_str) - - def cv(self, model, data_loader, device, args): - ''' Cross validation on - ''' - model.eval() - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - log_interval = args.get('log_interval', 10) - # in order to avoid division by 0 - num_seen_utts = 1 - total_loss = 0.0 - with torch.no_grad(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - loss_dict = model(feats, feats_lengths, target, target_lengths) - loss = loss_dict['loss'] - if torch.isfinite(loss): - num_seen_utts += num_utts - total_loss += loss.item() * num_utts - if batch_idx % log_interval == 0: - log_str = 'CV Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, loss.item()) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'history loss {:.6f}'.format(total_loss / - num_seen_utts) - log_str += ' rank {}'.format(rank) - logging.debug(log_str) - return total_loss, num_seen_utts diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/file_utils.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/file_utils.py deleted file mode 100644 index 7b7e516cc61f759267f4ef09309ff0b45110a0c1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/file_utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re - - -def read_lists(list_file): - lists = [] - with open(list_file, 'r', encoding='utf8') as fin: - for line in fin: - lists.append(line.strip()) - return lists - - -def read_non_lang_symbols(non_lang_sym_path): - """read non-linguistic symbol from file. - - The file format is like below: - - {NOISE}\n - {BRK}\n - ... - - - Args: - non_lang_sym_path: non-linguistic symbol file path, None means no any - syms. 
-
-    """
-    if non_lang_sym_path is None:
-        return None
-    else:
-        syms = read_lists(non_lang_sym_path)
-        non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})")
-        for sym in syms:
-            if non_lang_syms_pattern.fullmatch(sym) is None:
-                class BadSymbolFormat(Exception):
-                    pass
-                raise BadSymbolFormat(
-                    "Non-linguistic symbols should be "
-                    "formatted in {xxx}/<xxx>/[xxx], consider"
-                    " modifying '%s' to meet the requirement. "
-                    "More details can be found in discussions here : "
-                    "https://github.com/wenet-e2e/wenet/pull/819" % (sym))
-        return syms
-
-
-def read_symbol_table(symbol_table_file):
-    symbol_table = {}
-    with open(symbol_table_file, 'r', encoding='utf8') as fin:
-        for line in fin:
-            arr = line.strip().split()
-            assert len(arr) == 2
-            symbol_table[arr[0]] = int(arr[1])
-    return symbol_table
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/init_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/init_model.py
deleted file mode 100644
index 4a008183ee25cd88b2fa25d93bdc3f9e3a55d31a..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/init_model.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Copyright (c) 2022 Binbin Zhang (binbzha@qq.com)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
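For reference, the `file_utils.py` helpers removed above read simple line-oriented files; a sketch with hypothetical paths (assumes the bundled `wenet` package is importable):

```python
from wenet.utils.file_utils import read_lists, read_symbol_table

# units.txt holds "<token> <id>" pairs, e.g. "<blank> 0" on the first line.
symbol_table = read_symbol_table("data/dict/units.txt")  # hypothetical path
print(symbol_table.get("<blank>"))                       # typically 0

# data.list simply holds one entry per line.
entries = read_lists("data/train/data.list")             # hypothetical path
print(len(entries))
```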
- -import torch -from wenet.transducer.joint import TransducerJoint -from wenet.transducer.predictor import (ConvPredictor, EmbeddingPredictor, - RNNPredictor) -from wenet.transducer.transducer import Transducer -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.cmvn import GlobalCMVN -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.encoder import ConformerEncoder, TransformerEncoder -from wenet.squeezeformer.encoder import SqueezeformerEncoder -from wenet.efficient_conformer.encoder import EfficientConformerEncoder -from wenet.utils.cmvn import load_cmvn - - -def init_model(configs): - if configs['cmvn_file'] is not None: - mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) - global_cmvn = GlobalCMVN( - torch.from_numpy(mean).float(), - torch.from_numpy(istd).float()) - else: - global_cmvn = None - - input_dim = configs['input_dim'] - vocab_size = configs['output_dim'] - - encoder_type = configs.get('encoder', 'conformer') - decoder_type = configs.get('decoder', 'bitransformer') - - if encoder_type == 'conformer': - encoder = ConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'squeezeformer': - encoder = SqueezeformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'efficientConformer': - encoder = EfficientConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf'], - **configs['encoder_conf']['efficient_conf'] - if 'efficient_conf' in - configs['encoder_conf'] else {}) - else: - encoder = TransformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - if decoder_type == 'transformer': - decoder = TransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - else: - assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 - assert configs['decoder_conf']['r_num_blocks'] > 0 - decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - ctc = CTC(vocab_size, encoder.output_size()) - - # Init joint CTC/Attention or Transducer model - if 'predictor' in configs: - predictor_type = configs.get('predictor', 'rnn') - if predictor_type == 'rnn': - predictor = RNNPredictor(vocab_size, **configs['predictor_conf']) - elif predictor_type == 'embedding': - predictor = EmbeddingPredictor(vocab_size, - **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - elif predictor_type == 'conv': - predictor = ConvPredictor(vocab_size, **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - else: - raise NotImplementedError( - "only rnn, embedding and conv type support now") - configs['joint_conf']['enc_output_size'] = configs['encoder_conf'][ - 'output_size'] - configs['joint_conf']['pred_output_size'] = configs['predictor_conf'][ - 'output_size'] - joint = TransducerJoint(vocab_size, **configs['joint_conf']) - model = Transducer(vocab_size=vocab_size, - blank=0, - predictor=predictor, - encoder=encoder, - attention_decoder=decoder, - joint=joint, - ctc=ctc, - **configs['model_conf']) - else: - model = ASRModel(vocab_size=vocab_size, - encoder=encoder, - decoder=decoder, - ctc=ctc, - **configs['model_conf']) - return model diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/mask.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/mask.py deleted file mode 100644 index 2985006ab2bc2d27a9b8adaeb863cc44ca6a0d24..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/mask.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -''' -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. - - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - ret = torch.ones(size, size, device=device, dtype=torch.bool) - return torch.tril(ret) -''' - -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. 
- - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - arange = torch.arange(size, device=device) - mask = arange.expand(size, size) - arange = arange.unsqueeze(-1) - mask = mask <= arange - return mask - - -def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size) with chunk size, - this is for streaming encoder - - Args: - size (int): size of mask - chunk_size (int): size of chunk - num_left_chunks (int): number of left chunks - <0: use full chunk - >=0: use num_left_chunks - device (torch.device): "cpu" or "cuda" or torch.Tensor.device - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_chunk_mask(4, 2) - [[1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]] - """ - ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) - ret[i, start:ending] = True - return ret - - -def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, - use_dynamic_chunk: bool, - use_dynamic_left_chunk: bool, - decoding_chunk_size: int, static_chunk_size: int, - num_decoding_left_chunks: int): - """ Apply optional mask for encoder. - - Args: - xs (torch.Tensor): padded input, (B, L, D), L for max length - mask (torch.Tensor): mask for xs, (B, 1, L) - use_dynamic_chunk (bool): whether to use dynamic chunk or not - use_dynamic_left_chunk (bool): whether to use dynamic left chunk for - training. - decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - static_chunk_size (int): chunk size for static chunk training/decoding - if it's greater than 0, if use_dynamic_chunk is true, - this parameter will be ignored - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. - >=0: use num_decoding_left_chunks - <0: use all left chunks - - Returns: - torch.Tensor: chunk mask of the input xs. - """ - # Whether to use chunk mask or not - if use_dynamic_chunk: - max_len = xs.size(1) - if decoding_chunk_size < 0: - chunk_size = max_len - num_left_chunks = -1 - elif decoding_chunk_size > 0: - chunk_size = decoding_chunk_size - num_left_chunks = num_decoding_left_chunks - else: - # chunk size is either [1, 25] or full context(max_len). - # Since we use 4 times subsampling and allow up to 1s(100 frames) - # delay, the maximum frame is 100 / 4 = 25. 
- chunk_size = torch.randint(1, max_len, (1, )).item() - num_left_chunks = -1 - if chunk_size > max_len // 2: - chunk_size = max_len - else: - chunk_size = chunk_size % 25 + 1 - if use_dynamic_left_chunk: - max_left_chunks = (max_len - 1) // chunk_size - num_left_chunks = torch.randint(0, max_left_chunks, - (1, )).item() - chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - elif static_chunk_size > 0: - num_left_chunks = num_decoding_left_chunks - chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - else: - chunk_masks = masks - return chunk_masks - - -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: Mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - batch_size = lengths.size(0) - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = torch.arange(0, - max_len, - dtype=torch.int64, - device=lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand - return mask - - -def make_non_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """Make mask tensor containing indices of non-padded part. - - The sequences in a batch may have different lengths. To enable - batch computing, padding is need to make all sequence in same - size. To avoid the padding part pass value to context dependent - block such as attention or convolution , this padding part is - masked. - - This pad_mask is used in both encoder and decoder. - - 1 for non-padded part and 0 for padded part. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] - """ - return ~make_pad_mask(lengths) - - -def mask_finished_scores(score: torch.Tensor, - flag: torch.Tensor) -> torch.Tensor: - """ - If a sequence is finished, we only allow one alive branch. This function - aims to give one branch a zero score and the rest -inf score. - - Args: - score (torch.Tensor): A real value array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size, beam_size). 
- """ - beam_size = score.size(-1) - zero_mask = torch.zeros_like(flag, dtype=torch.bool) - if beam_size > 1: - unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])), - dim=1) - finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])), - dim=1) - else: - unfinished = zero_mask - finished = flag - score.masked_fill_(unfinished, -float('inf')) - score.masked_fill_(finished, 0) - return score - - -def mask_finished_preds(pred: torch.Tensor, flag: torch.Tensor, - eos: int) -> torch.Tensor: - """ - If a sequence is finished, all of its branch should be - - Args: - pred (torch.Tensor): A int array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size). - """ - beam_size = pred.size(-1) - finished = flag.repeat([1, beam_size]) - return pred.masked_fill_(finished, eos) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/scheduler.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/scheduler.py deleted file mode 100644 index c418a731dec0041a238787bbba23102dba8db5e5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wenetspeech/s0/wenet/utils/scheduler.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -# NeMo(https://github.com/NVIDIA/NeMo) - -from typing import Union - -import math -import warnings -import torch -from torch.optim.lr_scheduler import _LRScheduler - -from typeguard import check_argument_types - - -class WarmupLR(_LRScheduler): - """The WarmupLR scheduler - - This scheduler is almost same as NoamLR Scheduler except for following - difference: - - NoamLR: - lr = optimizer.lr * model_size ** -0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - WarmupLR: - lr = optimizer.lr * warmup_step ** 0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - - Note that the maximum lr equals to optimizer.lr in this scheduler. 
- - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_steps: Union[int, float] = 25000, - last_epoch: int = -1, - ): - assert check_argument_types() - self.warmup_steps = warmup_steps - - # __init__() must be invoked before setting field - # because step() is also invoked in __init__() - super().__init__(optimizer, last_epoch) - - def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" - - def get_lr(self): - step_num = self.last_epoch + 1 - if self.warmup_steps == 0: - return [ - lr * step_num ** -0.5 - for lr in self.base_lrs - ] - else: - return [ - lr - * self.warmup_steps ** 0.5 - * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) - for lr in self.base_lrs - ] - - def set_step(self, step: int): - self.last_epoch = step - - -class WarmupPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1): - assert not (warmup_steps is not None and warmup_ratio is not None),\ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class SquareRootConstantPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use particular number of step or ratio" - assert constant_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. 
- self.max_steps = max_steps - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.constant_lr = 1 / (constant_steps ** 0.5) - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.constant_steps: - return [self.constant_lr for _ in self.base_lrs] - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class WarmupHoldPolicy(WarmupPolicy): - """Variant of WarmupPolicy which maintains high - learning rate for a defined number of steps. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - hold_steps=None, - hold_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (hold_steps is not None and hold_ratio is not None), \ - "Either use particular number of step or ratio" - assert hold_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - self.min_lr = min_lr - self._last_warmup_lr = 0.0 - - # Necessary to duplicate as class attributes are hidden in inner class - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if hold_steps is not None: - self.hold_steps = hold_steps + self.warmup_steps - elif hold_ratio is not None: - self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps - else: - self.hold_steps = 0 - - super().__init__( - optimizer, - warmup_steps=warmup_steps, - warmup_ratio=warmup_ratio, - max_steps=max_steps, - last_epoch=last_epoch, - min_lr=min_lr, - ) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed by the scheduler," - " " "please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup phase - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - # Hold phase - if (step >= self.warmup_steps) and (step < self.hold_steps): - return self.base_lrs - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - -class WarmupAnnealHoldPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - min_lr: Minimum lr to hold the learning rate after decay at. - constant_steps: Number of steps to keep lr constant at. 
- constant_ratio: Ratio of steps to keep lr constant. - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - constant_steps=None, - constant_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use constant_steps or constant_ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup steps - if self.warmup_steps > 0 and step <= self.warmup_steps: - return self._get_warmup_lr(step) - - # Constant steps after warmup and decay - if self.constant_steps > 0 and ( - self.warmup_steps + self.decay_steps) < step <= self.max_steps: - return self._get_constant_lr(step) - - # Min lr after max steps of updates - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_constant_lr(self, step): - return [self.min_lr for _ in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -def _squareroot_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 0.5 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _square_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 2 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _cosine_annealing(initial_lr, step, max_steps, min_lr): - mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) - out_lr = (initial_lr - min_lr) * mult + min_lr - return out_lr - - -def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, - decay_steps, min_lr): - assert max_lr > min_lr - # Use linear warmup for the initial part. - if warmup_steps > 0 and step <= warmup_steps: - return max_lr * float(step) / float(warmup_steps) - - # For any steps larger than `decay_steps`, use `min_lr`. - if step > warmup_steps + decay_steps: - return min_lr - - # If we are done with the warmup period, use the decay style. 
- num_steps_ = step - warmup_steps - decay_steps_ = decay_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = max_lr - min_lr - - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - - return min_lr + coeff * delta_lr - - -def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): - if cycle: - multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) - decay_steps *= multiplier - else: - step = min(step, decay_steps) - p = step / decay_steps - lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) - lr += min_lr - return lr - - -def _noam_hold_annealing(initial_lr, step, warmup_steps, - hold_steps, decay_rate, min_lr): - # hold_steps = total number of steps - # to hold the LR, not the warmup + hold steps. - T_warmup_decay = max(1, warmup_steps ** decay_rate) - T_hold_decay = max(1, (step - hold_steps) ** decay_rate) - lr = (initial_lr * T_warmup_decay) / T_hold_decay - lr = max(lr, min_lr) - return lr - - -class SquareAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _square_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class SquareRootAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _squareroot_annealing(initial_lr=initial_lr, step=step, - max_steps=self.max_steps, min_lr=self.min_lr) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - if self.constant_steps is None or self.constant_steps == 0: - new_lrs = [ - _cosine_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - else: - new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) - return new_lrs - - def _get_warmup_lr(self, step): - if self.constant_steps is None or self.constant_steps == 0: - return super()._get_warmup_lr(step) - else: - # Use linear warmup for the initial part. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_constant_lr(self, step): - # Only called when `constant_steps` > 0. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_linear_warmup_with_cosine_annealing_lr(self, step): - # Cosine Schedule for Megatron LM, - # slightly different warmup schedule + constant LR at the end. 
- new_lrs = [ - _linear_warmup_with_cosine_annealing( - max_lr=self.base_lrs[0], - warmup_steps=self.warmup_steps, - step=step, - decay_steps=self.decay_steps, - min_lr=self.min_lr, - ) - for _ in self.base_lrs - ] - return new_lrs - - -class NoamAnnealing(_LRScheduler): - def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - self._normalize = d_model ** (-0.5) - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = max(1, self.last_epoch) - - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - new_lrs = [self._noam_annealing(initial_lr=initial_lr, step=step) for - initial_lr in self.base_lrs] - return new_lrs - - def _noam_annealing(self, initial_lr, step): - if self.warmup_steps > 0: - mult = self._normalize * min(step ** (-0.5), - step * (self.warmup_steps ** (-1.5))) - else: - mult = self._normalize * step ** (-0.5) - - out_lr = initial_lr * mult - if step > self.warmup_steps: - out_lr = max(out_lr, self.min_lr) - return out_lr - - -class NoamHoldAnnealing(WarmupHoldPolicy): - def __init__(self, optimizer, *, max_steps, decay_rate=0.5, min_lr=0.0, - last_epoch=-1, **kwargs): - """ - From Nemo: - Implementation of the Noam Hold Annealing policy - from the SqueezeFormer paper. - - Unlike NoamAnnealing, the peak learning rate - can be explicitly set for this scheduler. - The schedule first performs linear warmup, - then holds the peak LR, then decays with some schedule for - the remainder of the steps. - Therefore the min-lr is still dependent - on the hyper parameters selected. - - It's schedule is determined by three factors- - - Warmup Steps: Initial stage, where linear warmup - occurs uptil the peak LR is reached. Unlike NoamAnnealing, - the peak LR is explicitly stated here instead of a scaling factor. - - Hold Steps: Intermediate stage, where the peak LR - is maintained for some number of steps. In this region, - the high peak LR allows the model to converge faster - if training is stable. However the high LR - may also cause instability during training. - Should usually be a significant fraction of training - steps (around 30-40% of the entire training steps). - - Decay Steps: Final stage, where the LR rapidly decays - with some scaling rate (set by decay rate). - To attain Noam decay, use 0.5, - for Squeezeformer recommended decay, use 1.0. - The fast decay after prolonged high LR during - hold phase allows for rapid convergence. 
- - References: - - [Squeezeformer: - An Efficient Transformer for Automatic Speech Recognition] - (https://arxiv.org/abs/2206.00888) - - Args: - optimizer: Pytorch compatible Optimizer object. - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - decay_rate: Float value describing the polynomial decay - after the hold period. Default value - of 0.5 corresponds to Noam decay. - min_lr: Minimum learning rate. - """ - self.decay_rate = decay_rate - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - if self.warmup_steps is None or self.warmup_steps == 0: - raise ValueError( - "Noam scheduler cannot be used without warmup steps") - - if self.hold_steps > 0: - hold_steps = self.hold_steps - self.warmup_steps - else: - hold_steps = 0 - - new_lrs = [ - _noam_hold_annealing( - initial_lr, - step=step, - warmup_steps=self.warmup_steps, - hold_steps=hold_steps, - decay_rate=self.decay_rate, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - def set_step(self, step: int): - self.last_epoch = step diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/README.md b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/README.md deleted file mode 100644 index d5a7b6da20712f81354ea6f25b309804d4df3e71..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# Performance Record - -## Conformer Result - -* Feature info: dither + specaug + speed perturb -* Training info: lr 0.002, warmup_steps 20000 batch size 16, 1 gpu, acc_grad 4, 120 epochs -* Decoding info: average_num 20 - -| decoding mode | dev93 (cer) | dev93 (wer) | -|:----------------------:|:-------------:|:-------------:| -| ctc_greedy_search | 5.25% | 13.16% | -| ctc_prefix_beam_search | 5.17% | 13.10% | -| attention_rescoring | 5.11% | 12.17% | \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/conf/train_conformer.yaml b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/conf/train_conformer.yaml deleted file mode 100644 index 785911d09e74d84f516915dc11354c164d5e0554..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/conf/train_conformer.yaml +++ /dev/null @@ -1,77 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - 
src_attention_dropout_rate: 0.0
-
-# hybrid CTC/attention
-model_conf:
-    ctc_weight: 0.3
-    lsm_weight: 0.1     # label smoothing option
-    length_normalized_loss: false
-
-dataset_conf:
-    filter_conf:
-        max_length: 40960
-        min_length: 0
-        token_max_length: 200
-        token_min_length: 1
-    resample_conf:
-        resample_rate: 16000
-    speed_perturb: true
-    fbank_conf:
-        num_mel_bins: 80
-        frame_shift: 10
-        frame_length: 25
-        dither: 0.1
-    spec_aug: true
-    spec_aug_conf:
-        num_t_mask: 2
-        num_f_mask: 2
-        max_t: 50
-        max_f: 10
-    shuffle: true
-    shuffle_conf:
-        shuffle_size: 1500
-    sort: true
-    sort_conf:
-        sort_size: 500  # sort_size should be less than shuffle_size
-    batch_conf:
-        batch_type: 'static' # static or dynamic
-        batch_size: 16
-
-grad_clip: 5
-accum_grad: 4
-max_epoch: 120
-log_interval: 100
-
-optim: adam
-optim_conf:
-    lr: 0.002
-scheduler: warmuplr     # pytorch v1.1.0+ required
-scheduler_conf:
-    warmup_steps: 20000
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/local/find_transcripts.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/local/find_transcripts.pl
deleted file mode 100644
index e6d93027c5a29dd20293d9eada3bdaee192457f4..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/local/find_transcripts.pl
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/usr/bin/env perl
-# Copyright 2010-2011 Microsoft Corporation
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-
-# This program takes on its standard input a list of utterance
-# id's, one for each line. (e.g. 4k0c030a is an utterance id).
-# It takes as its argument a list of the .dot files, and extracts
-# from the dot files the transcripts for a given
-# dataset (represented by a file list).
-
-
-@ARGV == 1 || die "find_transcripts.pl dot_files_flist < utterance_ids > transcripts";
-$dot_flist = shift @ARGV;
-
-open(L, "<$dot_flist") || die "Opening file list of dot files: $dot_flist\n";
-while(<L>){
-    chop;
-    m:\S+/(\w{6})00.dot: || die "Bad line in dot file list: $_";
-    $spk = $1;
-    $spk2dot{$spk} = $_;
-}
-
-
-while(<STDIN>){
-    chop;
-    $uttid = $_;
-    $uttid =~ m:(\w{6})\w\w: || die "Bad utterance id $_";
-    $spk = $1;
-    if($spk ne $curspk) {
-        %utt2trans = { }; # Don't keep all the transcripts in memory...
-        $curspk = $spk;
-        $dotfile = $spk2dot{$spk};
-        defined $dotfile || die "No dot file for speaker $spk\n";
-        open(F, "<$dotfile") || die "Error opening dot file $dotfile\n";
-        while(<F>) {
-            $_ =~ m:(.+)\((\w{8})\)\s*$: || die "Bad line $_ in dot file $dotfile (line $.)\n";
-            $trans = $1;
-            $utt = $2;
-            $utt2trans{$utt} = $trans;
-        }
-    }
-    if(!defined $utt2trans{$uttid}) {
-        print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n";
-    } else {
-        print "$uttid $utt2trans{$uttid}\n";
-    }
-}
\ No newline at end of file
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/local/flist2scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/local/flist2scp.pl
deleted file mode 100644
index 7edf1e3f1f44e4ac3b97b39361a46ba8c453c88d..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/local/flist2scp.pl
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env perl
-# Copyright 2010-2011 Microsoft Corporation
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-# takes in a file list with lines like
-# /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
-# and outputs an scp in kaldi format with lines like
-# 4k0c030a /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
-# (the first thing is the utterance-id, which is the same as the basename of the file.)
-
-
-while(<>){
-    m:^\S+/(\w+)\.[wW][vV]1$: || die "Bad line $_";
-    $id = $1;
-    $id =~ tr/A-Z/a-z/; # Necessary because of weirdness on disk 13-16.1 (uppercase filenames)
-    print "$id $_";
-}
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/local/ndx2flist.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/local/ndx2flist.pl
deleted file mode 100644
index 0dae1556858250654016920ed98a71fea5440a02..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/local/ndx2flist.pl
+++ /dev/null
@@ -1,62 +0,0 @@
-#!/usr/bin/env perl
-# Copyright 2010-2011 Microsoft Corporation
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-# This program takes as its standard input an .ndx file from the WSJ corpus that looks
-# like this:
-#;; File: tr_s_wv1.ndx, updated 04/26/94
-#;;
-#;; Index for WSJ0 SI-short Sennheiser training data
-#;; Data is read WSJ sentences, Sennheiser mic.
-#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts
-#;; per speaker TI) = 7236 utts
-#;;
-#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
-#11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1
-#11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1
-
-#and as command-line arguments it takes the names of the WSJ disk locations, e.g.:
-#/mnt/matylda2/data/WSJ0/11-1.1 /mnt/matylda2/data/WSJ0/11-10.1 ... etc.
-# It outputs a list of absolute pathnames (it does this by replacing e.g. 11_1_1 with
-# /mnt/matylda2/data/WSJ0/11-1.1.
-# It also does a slight fix because one of the WSJ disks (WSJ1/13-16.1) was distributed with
-# uppercase rather than lower case filenames.
-
-foreach $fn (@ARGV) {
-    $fn =~ m:.+/([0-9\.\-]+)/?$: || die "Bad command-line argument $fn\n";
-    $disk_id=$1;
-    $disk_id =~ tr/-\./__/; # replace - and . with _ so 11-10.1 becomes 11_10_1
-    $fn =~ s:/$::; # Remove final slash, just in case it is present.
-    $disk2fn{$disk_id} = $fn;
-}
-
-while(<STDIN>){
-    if(m/^;/){ next; } # Comment. Ignore it.
-    else {
-        m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_";
-        $disk=$1;
-        if(!defined $disk2fn{$disk}) {
-            die "Disk id $disk not found";
-        }
-        $filename = $2; # as a subdirectory of the distributed disk.
-        if($disk eq "13_16_1" && `hostname` =~ m/fit.vutbr.cz/) {
-            # The disk 13-16.1 has been uppercased for some reason, on the
-            # BUT system. This is a fix specifically for that case.
-            $filename =~ tr/a-z/A-Z/; # This disk contains all uppercase filenames. Why?
-        }
-        print "$disk2fn{$disk}/$filename\n";
-    }
-}
\ No newline at end of file
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/local/normalize_transcript.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/local/normalize_transcript.pl
deleted file mode 100644
index 7a696956cebd4e8de4281b57eabbf48f62b751a6..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/local/normalize_transcript.pl
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/usr/bin/env perl
-# Copyright 2010-2011 Microsoft Corporation
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-# This takes data from the standard input that's unnormalized transcripts in the format
-# 4k2c0308 Of course there isn\'t any guarantee the company will keep its hot hand [misc_noise]
-# 4k2c030a [loud_breath] And new hardware such as the set of personal computers I\. B\. M\. introduced last week can lead to unexpected changes in the software business [door_slam]
-# and outputs normalized transcripts.
-# c.f. /mnt/matylda2/data/WSJ0/11-10.1/wsj0/transcrp/doc/dot_spec.doc
-
-@ARGV == 1 || die "usage: normalize_transcript.pl noise_word < transcript > transcript2";
-$noise_word = shift @ARGV;
-
-while(<STDIN>) {
-    $_ =~ m:^(\S+) (.+): || die "bad line $_";
-    $utt = $1;
-    $trans = $2;
-    print "$utt";
-    foreach $w (split (" ",$trans)) {
-        $w =~ tr:a-z:A-Z:; # Upcase everything to match the CMU dictionary.
-        $w =~ s:\\::g; # Remove backslashes. We don't need the quoting.
-        $w =~ s:^\%PERCENT$:PERCENT:; # Normalization for Nov'93 test transcripts.
-        $w =~ s:^\.POINT$:POINT:; # Normalization for Nov'93 test transcripts.
-        if($w =~ m:^\[\<\w+\]$: || # E.g. [<door_slam], this means a door slammed in the preceding word. Delete.
-           $w =~ m:^\[\w+\>\]$: || # E.g. [door_slam>], this means a door slammed in the next word. Delete.
-           $w =~ m:\[\w+/\]$: || # E.g. [phone_ring/], which indicates the start of this phenomenon.
-           $w =~ m:\[\/\w+]$: || # E.g. [/phone_ring], which indicates the end of this phenomenon.
-           $w eq "~" || # This is used to indicate truncation of an utterance. Not a word.
-           $w eq ".") { # "." is used to indicate a pause. Silence is optional anyway so not much
-            # point including this in the transcript.
-            next; # we won't print this word.
-        } elsif($w =~ m:\[\w+\]:) { # Other noises, e.g. [loud_breath].
-            print " $noise_word";
-        } elsif($w =~ m:^\<([\w\']+)\>$:) {
-            # e.g. replace <and> with and. (the <> means verbal deletion of a word).. but it's pronounced.
-            print " $1";
-        } elsif($w eq "--DASH") {
-            print " -DASH"; # This is a common issue; the CMU dictionary has it as -DASH.
-#        } elsif($w =~ m:(.+)\-DASH$:) { # E.g. INCORPORATED-DASH... seems the DASH gets combined with previous word
-#            print " $1 -DASH";
-        } else {
-            print " $w";
-        }
-    }
-    print "\n";
-}
\ No newline at end of file
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/local/wsj_data_prep.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/local/wsj_data_prep.sh
deleted file mode 100644
index 9272cdc3208011c9538a1f8cd1ba2e333d1533bb..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/local/wsj_data_prep.sh
+++ /dev/null
@@ -1,80 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright 2009-2012  Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
-# Apache 2.0.
-
-
-# set -eu
-
-if [ $# -le 3 ]; then
-  echo "Arguments should be a list of WSJ directories, see ../run.sh for example."
-  exit 1;
-fi
-
-dir=`pwd`/data/local/data
-mkdir -p $dir
-local=`pwd`/local
-
-cd $dir
-# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command
-# line arguments being absolute pathnames.
-rm -r links/ 2>/dev/null
-mkdir links/
-ln -s $* links
-
-# Do some basic checks that we have what we expected.
-if [ ! -d links/11-13.1 -o ! -d links/13-34.1 -o ! -d links/11-2.1 ]; then
-  echo "wsj_data_prep.sh: Spot check of command line arguments failed"
-  echo "Command line arguments must be absolute pathnames to WSJ directories"
-  echo "with names like 11-13.1."
-  echo "Note: if you have old-style WSJ distribution,"
-  echo "local/cstr_wsj_data_prep.sh may work instead, see run.sh for example."
- exit 1; -fi - -# This version for SI-284 -cat links/13-34.1/wsj1/doc/indices/si_tr_s.ndx \ - links/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx | \ - $local/ndx2flist.pl $* | sort | \ - grep -v -i 11-2.1/wsj0/si_tr_s/401 > train_si284.flist - -nl=`cat train_si284.flist | wc -l` -[ "$nl" -eq 37416 ] || echo "Warning: expected 37416 lines in train_si284.flist, got $nl" - - -# Nov'92 (333 utts) -# These index files have a slightly different format; -# have to add .wv1, which is done in cstr_ndx2flist.pl -cat links/11-13.1/wsj0/doc/indices/test/nvp/si_et_20.ndx | \ - $local/ndx2flist.pl $* | awk '{printf("%s.wv1\n", $1)}' | \ - sort > test_eval92.flist - -# Dev-set for Nov'93 (503 utts) -cat links/13-34.1/wsj1/doc/indices/h1_p0.ndx | \ - $local/ndx2flist.pl $* | sort > test_dev93.flist - -# Finding the transcript files: -for x in $*; do find -L $x -iname '*.dot'; done > dot_files.flist - -# Convert the transcripts into our format (no normalization yet) -for x in train_si284 test_eval92 test_dev93; do - $local/flist2scp.pl $x.flist | sort > ${x}_sph.scp - cat ${x}_sph.scp | awk '{print $1}' | $local/find_transcripts.pl dot_files.flist > $x.trans1 -done - -# Do some basic normalization steps. At this point we don't remove OOVs-- -# that will be done inside the training scripts, as we'd like to make the -# data-preparation stage independent of the specific lexicon used. -noiseword=""; -for x in train_si284 test_eval92 test_dev93; do - cat $x.trans1 | $local/normalize_transcript.pl $noiseword | sort > $x.txt || exit 1; -done - -# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.) - -sph2pipe=/home/lsq/kaldi/tools/sph2pipe_v2.5/sph2pipe -for x in train_si284 test_eval92 test_dev93; do - awk '{printf("%s '$sph2pipe' -f wav %s \n", $1, $2);}' < ${x}_sph.scp > ${x}_wav.scp -done - -echo "Data preparation succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/local/wsj_format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/local/wsj_format_data.sh deleted file mode 100644 index 7e2096b9061e0996824c541f11de7952691bcbe8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/local/wsj_format_data.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) -# 2015 Guoguo Chen -# Apache 2.0 - -# This script takes data prepared in a corpus-dependent way -# in data/local/, and converts it into the "canonical" form, -# in various subdirectories of data/, e.g. data/lang, data/lang_test_ug, -# data/train_si284, data/train_si84, etc. - -# Don't bother doing train_si84 separately (although we have the file lists -# in data/local/) because it's just the first 7138 utterances in train_si284. -# We'll create train_si84 after doing the feature extraction. - -echo "$0 $@" # Print the command line for logging -. ./tools/parse_options.sh || exit 1; - -. ./path.sh || exit 1; - -echo "Preparing train and test data" -srcdir=data/local/data - -for x in train_si284 test_eval92 test_dev93; do - mkdir -p data/$x - cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; - cp $srcdir/$x.txt data/$x/text || exit 1; -done - -echo "Succeeded in formatting data." 
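
The deleted `wsj_data_prep.sh` / `wsj_format_data.sh` scripts above ultimately emit Kaldi-style `wav.scp` and `text` files keyed by utterance ID (one `<utt-id> <value>` pair per line). As a rough, standalone illustration only — not part of this patch or of the removed scripts — the sketch below shows the consistency check that layout implies; the `read_kv`/`check_data_dir` helpers and the `data/test_eval92` path are hypothetical.

```python
# Illustrative sketch only -- not part of this repository or the deleted scripts.
# Assumes the Kaldi-style layout produced above: each line of wav.scp is
# "<utt-id> <wav-path-or-command>" and each line of text is "<utt-id> <transcript>".
from pathlib import Path


def read_kv(path: Path) -> dict:
    """Read a space-separated 'key rest-of-line' file into a dict."""
    table = {}
    for line in path.read_text(encoding="utf-8").splitlines():
        if not line.strip():
            continue
        key, _, value = line.partition(" ")
        table[key] = value.strip()
    return table


def check_data_dir(data_dir: str) -> None:
    """Verify that wav.scp and text cover the same utterance IDs."""
    d = Path(data_dir)                      # e.g. data/test_eval92 (hypothetical)
    wavs = read_kv(d / "wav.scp")
    texts = read_kv(d / "text")
    missing_text = sorted(set(wavs) - set(texts))
    missing_wav = sorted(set(texts) - set(wavs))
    if missing_text or missing_wav:
        raise ValueError(f"mismatch: {len(missing_text)} utts lack text, "
                         f"{len(missing_wav)} utts lack audio")
    print(f"{len(wavs)} utterances OK in {data_dir}")


if __name__ == "__main__":
    check_data_dir("data/test_eval92")      # hypothetical path
```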
\ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/local/wsj_gen_wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/local/wsj_gen_wav.sh deleted file mode 100644 index 056c4be418e64f761bf319b1c62e9321996f37f0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/local/wsj_gen_wav.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -set -eu - -[ $# -ne 2 ] && echo "Script format error: $0 " && exit 0 - -data_dir=$1 -dump_dir=$2 - -mkdir -p $dump_dir - -num_utts=$(cat $data_dir/wav.scp | wc -l) -echo "Orginal utterances (.wav + .wv1): $num_utts" - -# cat $data_dir/wav.scp | grep "sph2pipe" | \ -# awk -v dir=$dump_dir '{printf("%s -f wav %s %s/%s.wav\n", $2, $5, dir, $1)}' | bash - -awk '{print $1,$5}' $data_dir/wav.scp > $data_dir/raw_wav.scp -find $dump_dir -name "*.wav" | awk -F '/' '{printf("%s %s\n", $NF, $0)}' | \ - sed 's:\.wav::' > $data_dir/wav.scp - -num_utts=$(cat $data_dir/wav.scp | wc -l) -echo "Wave utterances (.wav): $num_utts" - -echo "$0: Generate wav => $dump_dir done" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/path.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/path.sh deleted file mode 100644 index 73fc1c56602086182f66201870e28d46a0cada55..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export WENET_DIR=$PWD/../../.. -export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build -export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix -export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/run.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/run.sh deleted file mode 100644 index 1b240d7186ba5da3ec9936accb1a7e55afa1db1c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/run.sh +++ /dev/null @@ -1,227 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -# Use this to control how many gpu you use, It's 1-gpu training if you specify -# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO -stage=0 # start from 0 if you need to start from data preparation -stop_stage=4 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want run multi-machine training -num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes -1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1 -# the third one set node_rank 2, and so on. 
Default 0 -node_rank=0 -# data -WSJ0=/home/lsq/corpus/WSJ/wsj0 -WSJ1=/home/lsq/corpus/WSJ/wsj1 - - -nj=16 - -train_set=train_si284 -valid_set=test_dev93 -test_sets="test_dev93" - -data_type=raw -# for lm training -other_text=data/local/other_text/text - -# Optional train_config -# 1. conf/train_transformer.yaml: Standard transformer -# 2. conf/train_conformer.yaml: Standard conformer -# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer -# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer -# 5. conf/train_conformer_no_pos.yaml: Conformer without relative positional encoding -# 6. conf/train_u2++_conformer.yaml: U2++ conformer -# 7. conf/train_u2++_transformer.yaml: U2++ transformer -train_config=conf/train_conformer.yaml -cmvn=true -dir=/home/lsq/exp_dir/exp_wenet/wsj/conformer_1202 -dump_wav_dir=/home/lsq/corpus/wsj_wav -checkpoint= - - -# use average_checkpoint will get better result -average_checkpoint=true -decode_checkpoint=$dir/final.pt -average_num=20 -decode_modes="ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring" - -. tools/parse_options.sh || exit 1; - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - - echo "stage 0: Data preparation" - local/wsj_data_prep.sh ${WSJ0}/??-{?,??}.? ${WSJ1}/??-{?,??}.? - local/wsj_format_data.sh - - for x in ${valid_set} ${train_set}; do - { - ./local/wsj_gen_wav.sh data/$x $dump_wav_dir/$x - } - done - - echo "Prepare text from lng_modl dir: ${WSJ1}/13-32.1/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z -> ${other_text}" - mkdir -p "$(dirname ${other_text})" - # NOTE(kamo): Give utterance id to each texts. - zcat ${WSJ1}/13-32.1/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z | \ - grep -v "<" | tr "[:lower:]" "[:upper:]" | \ - awk '{ printf("wsj1_lng_%07d %s\n",NR,$0) } ' > ${other_text} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - - # compute cmvn - tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \ - --in_scp data/${train_set}/wav.scp \ - --out_cmvn data/${train_set}/global_cmvn - -fi - -dict=data/dict/${train_set}_units.txt -nlsyms=data/nlsyms.txt - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # Make train dict - echo "Make a dictionary" - mkdir -p $(dirname $dict) - echo " 0" > ${dict} # 0 will be used for "blank" in CTC - echo " 1" >> ${dict} # must be 1 - - echo "make a non-linguistic symbol list" - cut -f 2- data/${train_set}/text | tr " " "\n" | sort | uniq | grep "<" > ${nlsyms} - cat ${nlsyms} - - tools/text2token.py -s 1 -n 1 -l ${nlsyms} --space ▁ data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \ - | sort | uniq | grep -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict} - wc -l ${dict} - num_token=$(cat $dict | wc -l) - echo " $num_token" >> $dict # -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - echo "Prepare data, prepare required format" - for x in ${valid_set} ${train_set}; do - if [ $data_type == "shard" ]; then - tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \ - --num_threads 16 data/$x/wav.scp data/$x/text \ - $(realpath data/$x/shards) data/$x/data.list - else - tools/make_raw_list.py data/$x/wav.scp data/$x/text \ - data/$x/data.list - fi - done -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - mkdir -p $dir - # You have to rm `INIT_FILE` manually when you resume or restart a - # multi-machine training. 
- INIT_FILE=$dir/ddp_init - init_method=file://$(readlink -f $INIT_FILE) - echo "$0: init method is $init_method" - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - # Use "nccl" if it works, otherwise use "gloo" - dist_backend="gloo" - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" - cmvn_opts= - $cmvn && cp data/${train_set}/global_cmvn $dir - $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" - - # train.py rewrite $train_config to $dir/train.yaml with model input - # and output dimension, and $dir/train.yaml will be used for inference - # and export. - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master of a worker. - rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train.py --gpu $gpu_id \ - --config $train_config \ - --data_type $data_type \ - --symbol_table $dict \ - --train_data data/$train_set/data.list \ - --cv_data data/$valid_set/data.list \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ - --ddp.dist_backend $dist_backend \ - --num_workers 1 \ - $cmvn_opts \ - --pin_memory \ - --non_lang_syms ${nlsyms} - } & - done - wait -fi - -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # Test model, please specify the model you want to test by --checkpoint - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$dir/avg_${average_num}.pt - echo "do model average and final checkpoint is $decode_checkpoint" - python wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $dir \ - --num ${average_num} \ - --val_best - fi - # Please specify decoding_chunk_size for unified streaming and - # non-streaming model. The default value is -1, which is full chunk - # for non-streaming inference. - decoding_chunk_size= - ctc_weight=0.5 - reverse_weight=0.0 - for mode in ${decode_modes}; do - { - test_dir=$dir/test_${mode} - result_text=$test_dir/text - mkdir -p $(dirname $result_text) - python wenet/bin/recognize.py --gpu 3 \ - --mode $mode \ - --config $dir/train.yaml \ - --data_type $data_type \ - --test_data data/test_dev93/data.list \ - --checkpoint $decode_checkpoint \ - --beam_size 10 \ - --batch_size 1 \ - --penalty 0.0 \ - --dict $dict \ - --non_lang_syms $nlsyms \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --result_file $result_text \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - python tools/compute-wer.py --char=1 --v=1 \ - data/test_dev93/text $test_dir/text > $test_dir/wer - } & - done - wait -fi - -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # compute wer - for mode in ${decode_modes}; do - for test_set in $test_sets; do - test_dir=$dir/test_${mode} - sed 's:▁: :g' $test_dir/text > $test_dir/text.norm - python tools/compute-wer.py --char=1 --v=1 \ - data/$test_set/text $test_dir/text.norm > $test_dir/wer - done - done -fi - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/alignment.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/alignment.sh deleted file mode 100644 index 64d860bb61761cadca750c9baf91eddb49e56728..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/alignment.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. 
All Rights Reserved. -. ./path.sh || exit 1; - -stage=0 # start from 0 if you need to start from data preparation -stop_stage=0 - -nj=16 -feat_dir=raw_wav -dict=data/dict/lang_char.txt - -dir=exp/ -config=$dir/train.yaml -checkpoint= -checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt -config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml -set= -ali_format=$feat_dir/$set/format.data -ali_format=format.data -ali_result=$dir/ali - -. tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - nj=32 - # Prepare required data for ctc alignment - echo "Prepare data, prepare required format" - for x in $set; do - tools/format_data.sh --nj ${nj} \ - --feat-type wav --feat $feat_dir/$x/wav.scp \ - $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp - - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Test model, please specify the model you want to use by --checkpoint - python wenet/bin/alignment_deprecated.py --gpu -1 \ - --config $config \ - --input_file $ali_format \ - --checkpoint $checkpoint \ - --batch_size 1 \ - --dict $dict \ - --result_file $ali_result \ - -fi - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/analyze_dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/analyze_dataset.py deleted file mode 100644 index d4373b065c301972fe0164b6df3591166000acfc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/analyze_dataset.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Analyze Dataset, Duration/TextLength/Speed etc. - -Usage: -. 
./path.sh && python3 tools/analyze_dataset.py \ - --data_type "shard" \ - --data_list data/test/data.list \ - --output_dir exp/analyze_test \ - --num_thread 32 -""" - -import os -import json -import math -import time -import numpy -import logging -import librosa -import tarfile -import argparse -import torchaudio -import multiprocessing - -from wenet.utils.file_utils import read_lists -from wenet.dataset.processor import AUDIO_FORMAT_SETS - - -def get_args(): - parser = argparse.ArgumentParser(description='Analyze dataset') - parser.add_argument('--data_type', - default='wav_scp', - choices=['wav_scp', 'raw', 'shard'], - help='dataset type') - parser.add_argument('--output_dir', type=str, - default="exp", help='write info to output dir') - parser.add_argument('--data_list', default=None, - help='used in raw/shard mode') - parser.add_argument('--wav_scp', default=None, - help='used in wav_scp mode') - parser.add_argument('--text', default=None, - help='used in wav_scp mode') - parser.add_argument('--num_thread', type=int, - default=4, help='number of threads') - args = parser.parse_args() - print(args) - return args - - -def analyze(datas, output_file, thread_id): - with open(output_file, "w", encoding='utf8') as f: - for i, data in enumerate(datas): - if type(data['wav']) is numpy.ndarray: - y, sample_rate = data['wav'], data['sample_rate'] - data['wav'] = "None" # NOTE(xcsong): Do not save wav. - elif type(data['wav'] is str): - y, sample_rate = librosa.load(data['wav'], sr=16000) - data['dur'] = len(y) / sample_rate - data['txt_length'] = len(data['txt']) - data['speed'] = data['txt_length'] / data['dur'] - # Trim the beginning and ending silence - _, index = librosa.effects.trim(y, top_db=30) - data['leading_sil'] = librosa.get_duration( - y=y[:index[0]], sr=16000) * 1000 if index[0] > 0 else 0 - data['trailing_sil'] = librosa.get_duration( - y=y[index[1]:], sr=16000) * 1000 if index[1] < len(y) else 0 - data_str = json.dumps(data, ensure_ascii=False) - f.write("{}\n".format(data_str)) - if thread_id == 0 and i % 100 == 0: - logging.info("\tThread-{}: processed {}/{}".format( - thread_id, i, len(datas))) - - -def read_tar(file): - try: - with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream: - prev_prefix = None - data = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - data['key'] = prev_prefix - if valid: - yield data - data = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - data['txt'] = file_obj.read().decode( - 'utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load( - file_obj) - # single channel - data['wav'] = waveform.numpy()[0, :] - data['sample_rate'] = sample_rate - else: - data[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning( - 'error: {} when parse {}'.format(ex, name)) - prev_prefix = prefix - # The last data in tar - if prev_prefix is not None: - data['key'] = prev_prefix - yield data - except Exception as ex: - logging.warning( - 'tar_file error: {} when processing {}'.format(ex, file)) - - -def main(): - start_time = time.time() - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.makedirs(args.output_dir, exist_ok=True) - os.makedirs(args.output_dir + "/partition", exist_ok=True) - datas = [[] for i in 
range(args.num_thread)] - - logging.info("Stage-1: Loading data.list OR wav.scp...") - if args.data_type == "shard": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - total = 0 - for line in lists: - for data in read_tar(line): - datas[total % args.num_thread].append(data) - total = total + 1 - elif args.data_type == "raw": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - for i, line in enumerate(lists): - data = json.loads(line) - datas[i % args.num_thread].append(data) - elif args.data_type == "wav_scp": - assert args.wav_scp is not None - assert args.text is not None - wavs, texts = {}, {} - # wavs - for line in read_lists(args.wav_scp): - line = line.strip().split() - wavs[line[0]] = line[1] - # texts - for line in read_lists(args.text): - line = line.strip().split(maxsplit=1) - texts[line[0]] = line[1] - sorted(wavs) - sorted(texts) - # partition - for i, (key1, key2) in enumerate(zip(wavs, texts)): - assert key1 == key2 - datas[i % args.num_thread].append( - {'key': key1, "wav": wavs[key1], "txt": texts[key1]} - ) - - logging.info("Stage-2: Start Analyze") - # threads - pool = multiprocessing.Pool(processes=args.num_thread) - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - pool.apply_async(analyze, (datas[i], output_file, i)) - pool.close() - pool.join() - - logging.info("Stage-3: Sort and Write Result") - datas = [] - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - with open(output_file, "r", encoding='utf8') as f: - for line in f.readlines(): - data = json.loads(line) - datas.append(data) - total_dur = sum([x['dur'] for x in datas]) - total_len = sum([x['txt_length'] for x in datas]) - total_leading_sil = sum([x['leading_sil'] for x in datas]) - total_trailing_sil = sum([x['trailing_sil'] for x in datas]) - num_datas = len(datas) - names = ['key', 'dur', 'txt_length', 'speed', - 'leading_sil', 'trailing_sil'] - units = ['', 's', '', 'char/s', 'ms', 'ms'] - avgs = [0, total_dur / num_datas, total_len / num_datas, - total_len / total_dur, total_leading_sil / num_datas, - total_trailing_sil / num_datas] - stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]), - sum([(x['txt_length'] - avgs[2])**2 for x in datas]), - sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]), - sum([(x['leading_sil'] - avgs[4])**2 for x in datas]), - sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])] - stds = [math.sqrt(x / num_datas) for x in stds] - parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min'] - index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - - with open(args.output_dir + "/analyze_result_brief", - "w", encoding='utf8') as f: - for i, (name, unit, avg, std) in enumerate( - zip(names, units, avgs, stds)): - if name == 'key': - continue - f.write("==================\n") - - datas.sort(key=lambda x: x[name]) - for p, j in zip(parts, index): - f.write("{} {}: {:.3f} {} (wav_id: {})\n".format( - p, name, datas[j][name], unit, datas[j]['key'])) - f.write("avg {}: {:.3f} {}\n".format( - name, avg, unit)) - f.write("std {}: {:.3f}\n".format( - name, std)) - os.system("cat {}".format(args.output_dir + "/analyze_result_brief")) - - datas.sort(key=lambda x: x['dur']) - with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f: - for data in datas: - f.write("{}\n".format(json.dumps(data, 
ensure_ascii=False))) - - end_time = time.time() - logging.info("Time Cost: {:.3f}s".format(end_time - start_time)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/cmvn_kaldi2json.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/cmvn_kaldi2json.py deleted file mode 100644 index 9966046c95a9d50438c4857b785cb7985182e376..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/cmvn_kaldi2json.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import logging -import sys -import json - -def kaldi2json(kaldi_cmvn_file): - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - cmvn_info = {'mean_stat:' : means, - 'var_stat' : variance, - 'frame_num' : count} - return cmvn_info - -if __name__ == '__main__': - with open(sys.argv[2], 'w') as fout: - cmvn = kaldi2json(sys.argv[1]) - fout.write(json.dumps(cmvn)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/combine_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/combine_data.sh deleted file mode 100644 index 8a56c43f1a2a238d78270f94f3d22f1af540e912..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/combine_data.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2014 David Snyder - -# This script combines the data from multiple source directories into -# a single destination directory. - -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information -# about what these directories contain. - -# Begin configuration section. -extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." -skip_fix=false # skip the fix_data_dir.sh in the end -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi - -if [ $# -lt 2 ]; then - echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." - echo "Note, files that don't appear in all source dirs will not be combined," - echo "with the exception of utt2uniq and segments, which are created where necessary." - exit 1 -fi - -dest=$1; -shift; - -first_src=$1; - -rm -r $dest 2>/dev/null -mkdir -p $dest; - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/utt2spk ]; then - echo "$0: no such file $dir/utt2spk" - exit 1; - fi -done - -# Check that frame_shift are compatible, where present together with features. -dir_with_frame_shift= -for dir in $*; do - if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then - if [[ $dir_with_frame_shift ]] && - ! 
cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then - echo "$0:error: different frame_shift in directories $dir and " \ - "$dir_with_frame_shift. Cannot combine features." - exit 1; - fi - dir_with_frame_shift=$dir - fi -done - -# W.r.t. utt2uniq file the script has different behavior compared to other files -# it is not compulsary for it to exist in src directories, but if it exists in -# even one it should exist in all. We will create the files where necessary -has_utt2uniq=false -for in_dir in $*; do - if [ -f $in_dir/utt2uniq ]; then - has_utt2uniq=true - break - fi -done - -if $has_utt2uniq; then - # we are going to create an utt2uniq file in the destdir - for in_dir in $*; do - if [ ! -f $in_dir/utt2uniq ]; then - # we assume that utt2uniq is a one to one mapping - cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' - else - cat $in_dir/utt2uniq - fi - done | sort -k1 > $dest/utt2uniq - echo "$0: combined utt2uniq" -else - echo "$0 [info]: not combining utt2uniq as it does not exist" -fi -# some of the old scripts might provide utt2uniq as an extrafile, so just remove it -extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") - -# segments are treated similarly to utt2uniq. If it exists in some, but not all -# src directories, then we generate segments where necessary. -has_segments=false -for in_dir in $*; do - if [ -f $in_dir/segments ]; then - has_segments=true - break - fi -done - -if $has_segments; then - for in_dir in $*; do - if [ ! -f $in_dir/segments ]; then - echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 - utils/data/get_segments_for_data.sh $in_dir - else - cat $in_dir/segments - fi - done | sort -k1 > $dest/segments - echo "$0: combined segments" -else - echo "$0 [info]: not combining segments as it does not exist" -fi - -for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do - exists_somewhere=false - absent_somewhere=false - for d in $*; do - if [ -f $d/$file ]; then - exists_somewhere=true - else - absent_somewhere=true - fi - done - - if ! $absent_somewhere; then - set -o pipefail - ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; - set +o pipefail - echo "$0: combined $file" - else - if ! $exists_somewhere; then - echo "$0 [info]: not combining $file as it does not exist" - else - echo "$0 [info]: **not combining $file as it does not exist everywhere**" - fi - fi -done - -tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt - -if [[ $dir_with_frame_shift ]]; then - cp $dir_with_frame_shift/frame_shift $dest -fi - -if ! 
$skip_fix ; then - tools/fix_data_dir.sh $dest || exit 1; -fi - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/compute-cer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/compute-cer.py deleted file mode 100644 index a0a8f8fe1f59251c5d8fefeb62ef469276fc6063..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/compute-cer.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import sys -import unicodedata -import codecs - -remove_tag = True -spacelist = [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - # https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. - sep = ' ' - if char == '<': - sep = '>' - j = i + 1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c == sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: - return '' - chars = [] - i = 0 - T = len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - if x.isalnum(): - for k in x: - new_sentence.append(k) - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i - 1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - 
min_error = error - dist = self.space[i][j - 1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, - 'ins': 0, 'del': 0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i={i} , j={j} , \ - error={error}'. 
- format(i=i, j=j, error=self.space[i][j]['error'])) - return result - - def overall(self) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def cluster(self, data) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [unicodedata.name(char) for char in word] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names) - 1) : - if unicode_names[i] != unicode_names[i + 1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) \ - and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] \ - [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \ - [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose = 1 - padding_symbol = ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose = 0 - try: - verbose = int(b) - except Exception: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol = ' ' - elif b == 'underline': - padding_symbol = '_' - continue - if True or sys.argv[1].startswith('-'): - # ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig = set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, - case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : 
- if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array) == 0: - continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length - len_lab) - space['rec'].append(length - len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end=' ') - else: - print('lab:', end=' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['lab'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end=' ') - else: - print('rec:', end=' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['rec'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===================================================' - '========================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster(k for k in default_clusters[cluster_id]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 
100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif (token[0] == '<' and token[len(token) - 1] == '>' and - cluster_id == ''): - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('=======================================' - '====================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/compute-wer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/compute-wer.py deleted file mode 100644 index a3eefc0dc7b67f252e685da71a5189312e74ef85..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/compute-wer.py +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import re, sys, unicodedata -import codecs - -remove_tag = True -spacelist= [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - #https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. - sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - 
for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) - -def width(string): - return 
sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . / - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if 
sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array)==0: continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - 
print('===========================================================================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster([ k for k in default_clusters[cluster_id] ]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif token[0] == '<' and token[len(token)-1] == '>' and \ - cluster_id == '' : - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... 
- else : - cluster.append(token) - print() - print('===========================================================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/compute_cmvn_stats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/compute_cmvn_stats.py deleted file mode 100644 index 9c89789c47be0c855939469e86040f10398e9d89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/compute_cmvn_stats.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys -import argparse -import json -import codecs -import yaml - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.utils.data import Dataset, DataLoader - -torchaudio.set_audio_backend("sox_io") - - -class CollateFunc(object): - ''' Collate function for AudioDataset - ''' - - def __init__(self, feat_dim, resample_rate): - self.feat_dim = feat_dim - self.resample_rate = resample_rate - pass - - def __call__(self, batch): - mean_stat = torch.zeros(self.feat_dim) - var_stat = torch.zeros(self.feat_dim) - number = 0 - for item in batch: - value = item[1].strip().split(",") - assert len(value) == 3 or len(value) == 1 - wav_path = value[0] - sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate - resample_rate = sample_rate - # len(value) == 3 means segmented wav.scp, - # len(value) == 1 means original wav.scp - if len(value) == 3: - start_frame = int(float(value[1]) * sample_rate) - end_frame = int(float(value[2]) * sample_rate) - waveform, sample_rate = torchaudio.backend.sox_io_backend.load( - filepath=wav_path, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(item[1]) - - waveform = waveform * (1 << 15) - if self.resample_rate != 0 and self.resample_rate != sample_rate: - resample_rate = self.resample_rate - waveform = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - - mat = kaldi.fbank(waveform, - num_mel_bins=self.feat_dim, - dither=0.0, - energy_floor=0.0, - sample_frequency=resample_rate) - mean_stat += torch.sum(mat, axis=0) - var_stat += torch.sum(torch.square(mat), axis=0) - number += mat.shape[0] - return number, mean_stat, var_stat - - -class AudioDataset(Dataset): - def __init__(self, data_file): - self.items = [] - with codecs.open(data_file, 'r', encoding='utf-8') as f: - for line in f: - arr = line.strip().split() - self.items.append((arr[0], arr[1])) - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - return self.items[idx] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='extract CMVN stats') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for processing') - parser.add_argument('--train_config', - default='', - help='training yaml conf') - parser.add_argument('--in_scp', default=None, help='wav scp file') - parser.add_argument('--out_cmvn', - default='global_cmvn', - help='global cmvn file') - - doc = "Print log after every log_interval audios are processed." 
- parser.add_argument("--log_interval", type=int, default=1000, help=doc) - args = parser.parse_args() - - with open(args.train_config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - resample_rate = 0 - if 'resample_conf' in configs['dataset_conf']: - resample_rate = configs['dataset_conf']['resample_conf']['resample_rate'] - print('using resample and new sample rate is {}'.format(resample_rate)) - - collate_func = CollateFunc(feat_dim, resample_rate) - dataset = AudioDataset(args.in_scp) - batch_size = 20 - data_loader = DataLoader(dataset, - batch_size=batch_size, - shuffle=True, - sampler=None, - num_workers=args.num_workers, - collate_fn=collate_func) - - with torch.no_grad(): - all_number = 0 - all_mean_stat = torch.zeros(feat_dim) - all_var_stat = torch.zeros(feat_dim) - wav_number = 0 - for i, batch in enumerate(data_loader): - number, mean_stat, var_stat = batch - all_mean_stat += mean_stat - all_var_stat += var_stat - all_number += number - wav_number += batch_size - - if wav_number % args.log_interval == 0: - print(f'processed {wav_number} wavs, {all_number} frames', - file=sys.stderr, - flush=True) - - cmvn_info = { - 'mean_stat': list(all_mean_stat.tolist()), - 'var_stat': list(all_var_stat.tolist()), - 'frame_num': all_number - } - - with open(args.out_cmvn, 'w') as fout: - fout.write(json.dumps(cmvn_info)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/compute_fbank_feats.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/compute_fbank_feats.py deleted file mode 100644 index 4cc7dae54de6e8b24b14148bd3930d19b4d7b28c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/compute_fbank_feats.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging - -import torchaudio -import torchaudio.compliance.kaldi as kaldi - -import wenet.dataset.kaldi_io as kaldi_io - -# The "sox" backends are deprecated and will be removed in 0.9.0 release. 
-# So here we use sox_io backend -torchaudio.set_audio_backend("sox_io") - - -def parse_opts(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--num_mel_bins', - default=80, - type=int, - help='Number of triangular mel-frequency bins') - parser.add_argument('--frame_length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument('--frame_shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument('--dither', - type=int, - default=0.0, - help='Dithering constant (0.0 means no dither)') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_scp', help='wav scp file') - parser.add_argument('out_ark', help='output ark file') - parser.add_argument('out_scp', help='output scp file') - args = parser.parse_args() - return args - - -# wav format: -def load_wav_scp(wav_scp_file): - wav_list = [] - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_list.append((arr[0], arr[1])) - return wav_list - - -# wav format: -def load_wav_scp_dict(wav_scp_file): - wav_dict = {} - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_dict[arr[0]] = arr[1] - return wav_dict - - -# Segments format: -def load_wav_segments(wav_scp_file, segments_file): - wav_dict = load_wav_scp_dict(wav_scp_file) - audio_list = [] - with open(segments_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - key = arr[0] - wav_file = wav_dict[arr[1]] - start = float(arr[2]) - end = float(arr[3]) - audio_list.append((key, wav_file, start, end)) - return audio_list - - -if __name__ == '__main__': - args = parse_opts() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - if args.segments is None: - audio_list = load_wav_scp(args.wav_scp) - else: - audio_list = load_wav_segments(args.wav_scp, args.segments) - - count = 0 - with open(args.out_ark, 'wb') as ark_fout, \ - open(args.out_scp, 'w', encoding='utf8') as scp_fout: - for item in audio_list: - if len(item) == 2: - key, wav_path = item - waveform, sample_rate = torchaudio.load_wav(wav_path) - else: - assert len(item) == 4 - key, wav_path, start, end = item - sample_rate = torchaudio.info(wav_path).sample_rate - frame_offset = int(start * sample_rate) - num_frames = int((end - start) * sample_rate) - waveform, sample_rate = torchaudio.load_wav( - wav_path, frame_offset, num_frames) - - mat = kaldi.fbank(waveform, - num_mel_bins=args.num_mel_bins, - frame_length=args.frame_length, - frame_shift=args.frame_shift, - dither=args.dither, - energy_floor=0.0, - sample_frequency=sample_rate) - mat = mat.detach().numpy() - kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout) - count += 1 - if count % 10000 == 0: - logging.info('Progress {}/{}'.format(count, len(audio_list))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/copy_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/copy_data_dir.sh deleted file mode 100644 index ee880c4c3ca398a58a4e306467c639b0a76310bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/copy_data_dir.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 
Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# feats.scp -# wav.scp -# vad.scp -# spk2utt -# utt2spk -# text -# -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance and/or speaker names. Note, the recording-ids stay the same. -# - - -# begin configuration section -spk_prefix= -utt_prefix= -spk_suffix= -utt_suffix= -validate_opts= # should rarely be needed. -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" - echo "Options" - echo " --spk-prefix= # Prefix for speaker ids, default empty" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --spk-suffix= # Suffix for speaker ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -srcdir=$1 -destdir=$2 - -if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" - exit 1; -fi - -if [ "$destdir" == "$srcdir" ]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -set -e; - -mkdir -p $destdir - -cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map -cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map - -if [ ! -f $srcdir/utt2uniq ]; then - if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then - cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq - fi -else - cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq -fi - -cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ - utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk - -utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt - -if [ -f $srcdir/feats.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp -fi - -if [ -f $srcdir/vad.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp -fi - -if [ -f $srcdir/segments ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments - cp $srcdir/wav.scp $destdir -else # no segments->wav indexed by utt. 
- if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/utt2num_frames ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in frame_shift stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -echo $validate_opts -echo $destdir -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/decode.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/decode.sh deleted file mode 100644 index 1d49b0e48631f4818fb9c464df66904170275a33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/decode.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: binbinzhang@mobvoi.com (Binbin Zhang) -export GLOG_logtostderr=1 -export GLOG_v=2 - -set -e - -nj=1 -chunk_size=-1 -ctc_weight=0.0 -reverse_weight=0.0 -rescoring_weight=1.0 -# For CTC WFST based decoding -fst_path= -dict_path= -acoustic_scale=1.0 -beam=15.0 -lattice_beam=12.0 -min_active=200 -max_active=7000 -blank_skip_thresh=1.0 -length_penalty=0.0 - -. tools/parse_options.sh || exit 1; -if [ $# != 5 ]; then - echo "Usage: $0 [options] " - exit 1; -fi - -if ! which decoder_main > /dev/null; then - echo "decoder_main is not built, please go to runtime/libtorch to build it." - exit 1; -fi - -scp=$1 -label_file=$2 -model_file=$3 -unit_file=$4 -dir=$5 - -mkdir -p $dir/split${nj} - -# Step 1. Split wav.scp -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp" -done -tools/data/split_scp.pl ${scp} ${split_scps} - -# Step 2. Parallel decoding -wfst_decode_opts= -if [ ! 
-z $fst_path ]; then - wfst_decode_opts="--fst_path $fst_path" - wfst_decode_opts="$wfst_decode_opts --beam $beam" - wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path" - wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam" - wfst_decode_opts="$wfst_decode_opts --max_active $max_active" - wfst_decode_opts="$wfst_decode_opts --min_active $min_active" - wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale" - wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh" - wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty" - echo $wfst_decode_opts > $dir/config -fi -for n in $(seq ${nj}); do -{ - decoder_main \ - --rescoring_weight $rescoring_weight \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --chunk_size $chunk_size \ - --wav_scp ${dir}/split${nj}/wav.${n}.scp \ - --model_path $model_file \ - --unit_path $unit_file \ - $wfst_decode_opts \ - --result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log -} & -done -wait - -# Step 3. Merge files -for n in $(seq ${nj}); do - cat ${dir}/split${nj}/${n}.text -done > ${dir}/text -tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf - -# Step 4. Compute WER -python3 tools/compute-wer.py --char=1 --v=1 \ - $label_file $dir/text > $dir/wer diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/feat_to_shape.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/feat_to_shape.sh deleted file mode 100644 index ab6d45c60709dd05a38f8da269d617233d0d39f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/feat_to_shape.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Begin configuration section. -nj=4 -cmd=run.pl -verbose=0 -filetype="" -preprocess_conf="" -# End configuration section. - -help_message=$(cat << EOF -Usage: $0 [options] [] -e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) - -echo "$0 $*" 1>&2 # Print the command line for logging - -. parse_options.sh || exit 1; - -if [ $# -lt 2 ] || [ $# -gt 3 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -scp=$1 -outscp=$2 -data=$(dirname ${scp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${logdir}/feats.${n}.scp" -done - -utils/split_scp.pl ${scp} ${split_scps} - -if [ -n "${preprocess_conf}" ]; then - preprocess_opt="--preprocess-conf ${preprocess_conf}" -else - preprocess_opt="" -fi -if [ -n "${filetype}" ]; then - filetype_opt="--filetype ${filetype}" -else - filetype_opt="" -fi - -${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ - feat-to-len --verbose=${verbose} \ - scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp - -feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -) - -# concatenate the .scp files together. 
-for n in $(seq ${nj}); do - sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp -done > ${outscp} - -rm -f ${logdir}/feats.*.scp 2>/dev/null diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/filter_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/filter_scp.pl deleted file mode 100644 index b76d37f41be0886470281978bfacf97f6b8ae976..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/filter_scp.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose "n-th" field is an utterance id), printing -# out only those lines whose "n-th" field is in id_list. The index of -# the "n-th" field is 1, by default, but can be changed by using -# the -f switch - -$exclude = 0; -$field = 1; -$shifted = 0; - -do { - $shifted=0; - if ($ARGV[0] eq "--exclude") { - $exclude = 1; - shift @ARGV; - $shifted=1; - } - if ($ARGV[0] eq "-f") { - $field = $ARGV[1]; - shift @ARGV; shift @ARGV; - $shifted=1 - } -} while ($shifted); - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . - "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . - "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . - "only the lines that were *not* in id_list.\n" . - "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . - "If your older scripts (written before Oct 2014) stopped working and you used the\n" . - "-f option, add 1 to the argument.\n" . - "See also: utils/filter_scp.pl .\n"; -} - - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -if ($field == 1) { # Treat this as special case, since it is common. - while(<>) { - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { - print $_; - } - } -} else { - while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { - print $_; - } - } -} - -# tests: -# the following should print "foo 1" -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) -# the following should print "bar 2". 
-# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fix_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fix_data_dir.sh deleted file mode 100644 index d1644c1cac4264c78eae7d91b03c4126baf7ec4c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fix_data_dir.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# This script makes sure that only the segments present in -# all of "feats.scp", "wav.scp" [if present], segments [if present] -# text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into -# data-dir/.backup - -cmd="$@" - -utt_extra_files= -spk_extra_files= - -. tools/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: utils/data/fix_data_dir.sh " - echo "e.g.: utils/data/fix_data_dir.sh data/train" - echo "This script helps ensure that the various files in a data directory" - echo "are correctly sorted and filtered, for example removing utterances" - echo "that have no features (if feats.scp is present)" - exit 1 -fi - -data=$1 - -if [ -f $data/images.scp ]; then - image/fix_data_dir.sh $cmd - exit $? -fi - -mkdir -p $data/.backup - -[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; - -[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; - -set -e -o pipefail -u - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted { - file=$1 - sort -k1,1 -u <$file >$file.tmp - if ! cmp -s $file $file.tmp; then - echo "$0: file $1 is not in sorted order or not unique, sorting it" - mv $file.tmp $file - else - rm $file.tmp - fi -} - -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - check_sorted $data/$x - fi -done - - -function filter_file { - filter=$1 - file_to_filter=$2 - cp $file_to_filter ${file_to_filter}.tmp - tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter - if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=$(cat ${file_to_filter}.tmp | wc -l) - length2=$(cat ${file_to_filter} | wc -l) - if [ $length1 -ne $length2 ]; then - echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." - fi - fi - rm $file_to_filter.tmp -} - -function filter_recordings { - # We call this once before the stage when we filter on utterance-id, and once - # after. - - if [ -f $data/segments ]; then - # We have a segments file -> we need to filter this and the file wav.scp, and - # reco2file_and_utt, if it exists, to make sure they have the same list of - # recording-ids. - - if [ ! -f $data/wav.scp ]; then - echo "$0: $data/segments exists but not $data/wav.scp" - exit 1; - fi - awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=$(cat $tmpdir/recordings | wc -l) - [ ! -s $tmpdir/recordings ] && \ - echo "Empty list of recordings (bad file $data/segments)?" 
&& exit 1; - tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp - mv $tmpdir/recordings.tmp $tmpdir/recordings - - - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - filter_file $tmpdir/recordings $data/segments - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - rm $data/segments.tmp - - filter_file $tmpdir/recordings $data/wav.scp - [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur - true - fi -} - -function filter_speakers { - # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... - tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do - f=$data/$s - if [ -f $f ]; then - filter_file $f $tmpdir/speakers - fi - done - - filter_file $tmpdir/speakers $data/spk2utt - tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - - for s in cmvn.scp spk2gender $spk_extra_files; do - f=$data/$s - if [ -f $f ]; then - filter_file $tmpdir/speakers $f - fi - done -} - -function filter_utts { - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - echo "$(cat $tmpdir/utts | wc -l)" - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; - - ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; - - ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ - echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - - if [ -f $data/utt2uniq ]; then - ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ - echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; - fi - - maybe_wav= - maybe_reco2dur= - [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. - [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - - maybe_utt2dur= - if [ -f $data/utt2dur ]; then - cat $data/utt2dur | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1 - maybe_utt2dur=utt2dur.ok - fi - - maybe_utt2num_frames= - if [ -f $data/utt2num_frames ]; then - cat $data/utt2num_frames | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1 - maybe_utt2num_frames=utt2num_frames.ok - fi - - for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do - if [ -f $data/$x ]; then - tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp - echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)" - mv $tmpdir/utts.tmp $tmpdir/utts - # echo "$tmpdir/utts" - fi - done - rm $data/utt2dur.ok 2>/dev/null || true - rm $data/utt2num_frames.ok 2>/dev/null || true - - [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ - rm $tmpdir/utts && exit 1; - - - if [ -f $data/utt2spk ]; then - new_nutts=$(cat $tmpdir/utts | wc -l) - old_nutts=$(cat $data/utt2spk | wc -l) - if [ $new_nutts -ne $old_nutts ]; then - echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" - else - echo "fix_data_dir.sh: kept all $old_nutts utterances." 
- fi - fi - - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then - tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x - fi - fi - done - -} - -filter_recordings -filter_speakers -filter_utts -filter_speakers -filter_recordings - -tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - -echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/flake8_hook.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/flake8_hook.py deleted file mode 100644 index bbe21bf4aa8ab460aca0eba5a24785e4d6b2c39d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -import sys - -from flake8.main import git - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/format_data.sh deleted file mode 100644 index 51f4602dfa0bac7873541c7f621ef4bb9eb29c94..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/format_data.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Mobvoi Corporation (Author: Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -echo "$0 $*" >&2 # Print the command line for logging -. ./path.sh - -nj=1 -cmd=run.pl -nlsyms="" -lang="" -feat="" -feat_type="kaldi" -oov="" -bpecode="" -allow_one_column=false -raw="" -verbose=0 -trans_type=char -filetype="" -preprocess_conf="" -category="" -out="" # If omitted, write in stdout -help_message=$(cat << EOF -Usage: $0 -e.g. $0 data/train data/lang_1char/train_units.txt -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --feat # feat.scp or feat1.scp,feat2.scp,... - --feat-type # kaldi or wav - --oov # Default: - --out # If omitted, write in stdout - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) -. tools/parse_options.sh - -if [ $# != 2 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -dir=$1 -dic=$2 -tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) -#trap 'rm -rf ${tmpdir}' EXIT - -# 1. 
Create scp files for inputs -# These are not necessary for decoding mode, and make it as an option -input= -if [ -n "${feat}" ]; then - _feat_scps=$(echo "${feat}" | tr ',' ' ' ) - read -r -a feat_scps <<< $_feat_scps - num_feats=${#feat_scps[@]} - - for (( i=1; i<=num_feats; i++ )); do - feat=${feat_scps[$((i-1))]} - mkdir -p ${tmpdir}/input_${i} - input+="input_${i} " - cat ${feat} > ${tmpdir}/input_${i}/feat.scp - - # Dump in the "legacy" style JSON format - if [ -n "${filetype}" ]; then - awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ - > ${tmpdir}/input_${i}/filetype.scp - fi - - if [ ${feat_type} == "kaldi" ]; then - tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ - --filetype "${filetype}" \ - --preprocess-conf "${preprocess_conf}" \ - --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp - elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then - if [ -f $dir/segments ]; then - # used for segmented wav.scp - awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur - fi - if [ ! -f $dir/utt2dur ]; then - tools/wav_to_duration.sh --nj ${nj} \ - ${feat} ${tmpdir}/input_${i}/shape.scp - # use the existed utt2dur as shape.scp directly - else - cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp - fi - fi - done -fi - -# 2. Create scp files for outputs -mkdir -p ${tmpdir}/output -if [ -n "${bpecode}" ]; then - if [ "${trans_type}" == "cn_char_en_bpe" ]; then - tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp - else - paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \ - | tools/spm_encode --model=${bpecode} --output_format=piece) \ - > ${tmpdir}/output/token.scp - fi -elif [ -n "${nlsyms}" ]; then - tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -elif [ -n "${raw}" ]; then - cat $dir/text > ${tmpdir}/output/token.scp -else - tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -fi -< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp -odim=$(cat ${dic} | wc -l) -< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp - -cat ${dir}/text > ${tmpdir}/output/text.scp - -# 3. Create scp files for the others -mkdir -p ${tmpdir}/other -if [ -n "${lang}" ]; then - awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp -fi - -if [ -n "${category}" ]; then - awk -v category=${category} '{print $1 " " category}' ${dir}/text \ - > ${tmpdir}/other/category.scp -fi -#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp - -# 4. 
Merge scp files into a one file -opts="" -for intype in ${input} output other; do - if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then - continue - fi - - if [ ${intype} != other ]; then - opts+="--${intype%_*}-scps " - else - opts+="--scps " - fi - - for x in "${tmpdir}/${intype}"/*.scp; do - k=$(basename ${x} .scp) - if [ ${k} = shape ]; then - opts+="shape:${x}:shape " - else - opts+="${k}:${x} " - fi - done -done - -if ${allow_one_column}; then - opts+="--allow-one-column true " -else - opts+="--allow-one-column false " -fi - -if [ -n "${out}" ]; then - opts+="-O ${out}" -fi - -tools/merge_scp2txt.py --verbose ${verbose} ${opts} - -#rm -fr ${tmpdir} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/add_lex_disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/add_lex_disambig.pl deleted file mode 100644 index dd8a25de6e1140a6d19b1e876f2e76f528532edf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/add_lex_disambig.pl +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2013-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. -# With the --pron-probs option, expects the second field -# of each lexicon line to be a pron-prob. -# With the --sil-probs option, expects three additional -# fields after the pron-prob, representing various components -# of the silence probability model. - -$pron_probs = 0; -$sil_probs = 0; -$first_allowed_disambig = 1; - -for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { - if ($ARGV[0] eq "--pron-probs") { - $pron_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--sil-probs") { - $sil_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--first-allowed-disambig") { - $first_allowed_disambig = 0 + $ARGV[1]; - if ($first_allowed_disambig < 1) { - die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; - } - shift @ARGV; - shift @ARGV; - } -} - -if (@ARGV != 2) { - die "Usage: add_lex_disambig.pl [opts] \n" . - "This script adds disambiguation symbols to a lexicon in order to\n" . - "make decoding graphs determinizable; it adds pseudo-phone\n" . - "disambiguation symbols #1, #2 and so on at the ends of phones\n" . - "to ensure that all pronunciations are different, and that none\n" . - "is a prefix of another.\n" . - "It prints to the standard output the number of the largest-numbered" . - "disambiguation symbol that was used.\n" . - "\n" . - "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . 
- " --sil-probs [should be with --pron-probs option]\n" . - " Expect 3 extra fields after the pron-probs, for aspects of\n" . - " the silence probability model\n" . - " --first-allowed-disambig The number of the first disambiguation symbol\n" . - " that this script is allowed to add. By default this is\n" . - " #1, but you can set this to a larger value using this option.\n" . - "e.g.:\n" . - " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { - $p = shift @A; - if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } - } - if ($sil_probs) { - $silp = shift @A; - if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - } - if (!(@A)) { - die "Bad lexicon line $1, no phone in phone list"; - } - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that it exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { shift @A; } # remove pron-prob. - if ($sil_probs) { - shift @A; # Remove silprob - shift @A; # Remove silprob - } - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -# max_disambig will always be the highest-numbered disambiguation symbol that -# has been used so far. -$max_disambig = $first_allowed_disambig - 1; - -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - if ($pron_probs) { - $pron_prob = shift @A; - } - if ($sil_probs) { - $sil_word_prob = shift @A; - $word_sil_correction = shift @A; - $prev_nonsil_correction = shift @A - } - $phnseq = join(" ", @A); - if (!defined $issubseq{$phnseq} - && $count{$phnseq} == 1) { - ; # Do nothing. - } else { - if ($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved_for_the_empty_string{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; - if (!defined $cur_disambig) { - $cur_disambig = $first_allowed_disambig; - } else { - $cur_disambig++; # Get a number that has not been used yet for - # this phone sequence. - } - while (defined $reserved_for_the_empty_string{$cur_disambig}) { - $cur_disambig++; - } - if ($cur_disambig > $max_disambig) { - $max_disambig = $cur_disambig; - } - $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; - $phnseq = $phnseq . " #" . 
$cur_disambig; - } - } - if ($pron_probs) { - if ($sil_probs) { - print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; - } else { - print O "$word\t$pron_prob\t$phnseq\n"; - } - } else { - print O "$word\t$phnseq\n"; - } -} - -print $max_disambig . "\n"; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/compile_lexicon_token_fst.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/compile_lexicon_token_fst.sh deleted file mode 100644 index b67814fe3f3244b14b8e494bfe46c4829c4f8bd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/compile_lexicon_token_fst.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -# Copyright 2015 Yajie Miao (Carnegie Mellon University) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the -# phoneme and character-based lexicons. -set -eo pipefail -. tools/parse_options.sh - -if [ $# -ne 3 ]; then - echo "usage: tools/fst/compile_lexicon_token_fst.sh " - echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" - echo " should contain the following files:" - echo "lexicon.txt units.txt" - echo "options: " - exit 1; -fi - -srcdir=$1 -tmpdir=$2 -dir=$3 -mkdir -p $dir $tmpdir - -[ -f path.sh ] && . ./path.sh - -export LC_ALL=C - -cp $srcdir/units.txt $dir - -# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. -# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. -perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; - -# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. -# Without these symbols, determinization will fail. -ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` -ndisambig=$[$ndisambig+1]; - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list - -# Get the full list of CTC tokens used in FST. These tokens include , the blank , -# the actual model unit, and the disambiguation symbols. -cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list -(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt - -# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, -# so here use ctc_token_fst_compact -tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; - -# Encode the words with indices. Will be used in lexicon and language model FST compiling. 
-cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' - BEGIN { - print " 0"; - } - { - printf("%s %d\n", $1, NR); - } - END { - printf("#0 %d\n", NR+1); - printf(" %d\n", NR+2); - printf(" %d\n", NR+3); - }' > $dir/words.txt || exit 1; - -# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time. -token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` - -tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ - fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; - -echo "Lexicon and token FSTs compiling succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/ctc_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/ctc_token_fst.py deleted file mode 100644 index d81644b9cd216177a10a17772781d3293abe084f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/ctc_token_fst.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 1 ') -print('1 1 ') -print('2 2 ') -print('2 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 3 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(1, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 2, '', '')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/ctc_token_fst_compact.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/ctc_token_fst_compact.py deleted file mode 100644 index d3018d8b14ce25108cb1acc637cecded5d41be13..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/ctc_token_fst_compact.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 1 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 0, '', '')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/ctc_token_fst_corrected.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/ctc_token_fst_corrected.py deleted file mode 100644 index 81f7079eccb9e6447c46cdfdf6378aca7efe4a09..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/ctc_token_fst_corrected.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python - -import sys - - -def il(n): - return n + 1 - - -def ol(n): - return n + 1 - - -def s(n): - return n - - -if __name__ == "__main__": - with 
open(sys.argv[1]) as f: - lines = f.readlines() - phone_count = 0 - disambig_count = 0 - for line in lines: - sp = line.split() - phone = sp[0] - if phone == '' or phone == '': - continue - if phone.startswith('#'): - disambig_count += 1 - else: - phone_count += 1 - - # 1. add start state - print('0 0 {} 0'.format(il(0))) - - # 2. 0 -> i, i -> i, i -> 0 - for i in range(1, phone_count + 1): - print('0 {} {} {}'.format(s(i), il(i), ol(i))) - print('{} {} {} 0'.format(s(i), s(i), il(i))) - print('{} 0 {} 0'.format(s(i), il(0))) - - # 3. i -> other phone - for i in range(1, phone_count + 1): - for j in range(1, phone_count + 1): - if i != j: - print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) - - # 4. add disambiguous arcs on every final state - for i in range(0, phone_count + 1): - for j in range(phone_count + 2, phone_count + disambig_count + 2): - print('{} {} {} {}'.format(s(i), s(i), 0, j)) - - # 5. every i is final state - for i in range(0, phone_count + 1): - print(s(i)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/eps2disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/eps2disambig.pl deleted file mode 100644 index e1d84a6bf56703596a0e4552d184f7168f724bcb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/eps2disambig.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - if (/\s+#0\s+/) { - print STDERR "$0: ERROR: LM has word #0, " . - "which is reserved as disambiguation symbol\n"; - exit 1; - } - s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; - print; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/make_lexicon_fst.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/make_lexicon_fst.pl deleted file mode 100644 index f97129c05cb3ba6460be401e92001261acfaf746..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/make_lexicon_fst.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). - -$pron_probs = 0; - -if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { - $pron_probs = 1; - shift @ARGV; -} - -if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; - print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; - print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; - print STDERR " word phone1 phone2 ... phoneN;\n"; - print STDERR "if the --pron-probs option is used, each line is:\n"; - print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; - print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; - print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; - print STDERR "this is your responsibility.\n\n"; - print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; - print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; - print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; - exit(1); -} - -$lexfn = shift @ARGV; -if (@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2) { - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if ($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - -if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. - while () { - @A = split(" ", $_); - @A == 0 && die "Empty lexicon line."; - foreach $a (@A) { - if ($a eq "") { - die "Bad lexicon line $_ ( is forbidden)"; - } - } - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; # so we only print it on the first arc of the word. - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. 
- if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while () { - @A = split(" ", $_); - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. - $s = $ns; - } elsif (!defined($silphone) || $p ne $silphone) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/make_tlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/make_tlg.sh deleted file mode 100644 index 98694e5540968760f0c27eaf30a6668f4c46c50d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/make_tlg.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# - -if [ -f path.sh ]; then . path.sh; fi - -lm_dir=$1 -src_lang=$2 -tgt_lang=$3 - -arpa_lm=${lm_dir}/lm.arpa -[ ! 
-f $arpa_lm ] && echo No such file $arpa_lm && exit 1;
-
-rm -rf $tgt_lang
-cp -r $src_lang $tgt_lang
-
-# Compose the language model to FST
-cat $arpa_lm | \
-   grep -v '<s> <s>' | \
-   grep -v '</s> <s>' | \
-   grep -v '</s> </s>' | \
-   grep -v -i '<unk>' | \
-   grep -v -i '<spoken_noise>' | \
-   arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \
-   tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \
-     --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
-   fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst
-
-
-echo "Checking how stochastic G is (the first of these numbers should be small):"
-fstisstochastic $tgt_lang/G.fst
-
-# Compose the token, lexicon and language-model FST into the final decoding graph
-fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \
-  fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1;
-fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1;
-
-echo "Composing decoding graph TLG.fst succeeded"
-#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/prepare_dict.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/prepare_dict.py
deleted file mode 100644
index 8a6a3cfe7cfded0c863637deef0bae2f2ede5557..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/prepare_dict.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env python3
-# encoding: utf-8
-
-import sys
-
-# sys.argv[1]: e2e model unit file(lang_char.txt)
-# sys.argv[2]: raw lexicon file
-# sys.argv[3]: output lexicon file
-# sys.argv[4]: bpemodel
-
-unit_table = set()
-with open(sys.argv[1], 'r', encoding='utf8') as fin:
-    for line in fin:
-        unit = line.split()[0]
-        unit_table.add(unit)
-
-
-def contain_oov(units):
-    for unit in units:
-        if unit not in unit_table:
-            return True
-    return False
-
-
-bpemode = len(sys.argv) > 4
-if bpemode:
-    import sentencepiece as spm
-    sp = spm.SentencePieceProcessor()
-    sp.Load(sys.argv[4])
-lexicon_table = set()
-with open(sys.argv[2], 'r', encoding='utf8') as fin, \
-        open(sys.argv[3], 'w', encoding='utf8') as fout:
-    for line in fin:
-        word = line.split()[0]
-        if word == 'SIL' and not bpemode:  # `sil` might be a valid piece in bpemodel
-            continue
-        elif word == '<SPOKEN_NOISE>':
-            continue
-        else:
-            # each word only has one pronunciation for e2e system
-            if word in lexicon_table:
-                continue
-            if bpemode:
-                # We assume that the lexicon does not contain code-switch,
-                # i.e. the word contains both English and Chinese.
-                # see PR https://github.com/wenet-e2e/wenet/pull/1693
-                # and Issue https://github.com/wenet-e2e/wenet/issues/1653
-                if word.encode('utf8').isalpha():
-                    pieces = sp.EncodeAsPieces(word)
-                else:
-                    pieces = word
-                if contain_oov(pieces):
-                    print(
-                        'Ignoring words {}, which contains oov unit'.format(
-                            ''.join(word).strip('▁'))
-                    )
-                    continue
-                chars = ' '.join(
-                    [p if p in unit_table else '<unk>' for p in pieces])
-            else:
-                # ignore words with OOV
-                if contain_oov(word):
-                    print('Ignoring words {}, which contains oov unit'.format(word))
-                    continue
-                # Optional, append ▁ in front of english word
-                # we assume the model unit of our e2e system is char now.
- if word.encode('utf8').isalpha() and '▁' in unit_table: - word = '▁' + word - chars = ' '.join(word) # word is a char list - fout.write('{} {}\n'.format(word, chars)) - lexicon_table.add(word) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/remove_oovs.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/remove_oovs.pl deleted file mode 100644 index ac914c3bd9363eded791cdeb309fd05e980c4f2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/remove_oovs.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script removes lines that contain these OOVs on either the -# third or fourth fields of the line. It is intended to remove arcs -# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). - -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; -} - -$unklist = shift @ARGV; -open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; -while(){ - @A = split(" ", $_); - @A == 1 || die "Bad line in unknown-symbol list: $_"; - $unk{$A[0]} = 1; -} - -$num_removed = 0; -while(<>){ - @A = split(" ", $_); - if(defined $unk{$A[2]} || defined $unk{$A[3]}) { - $num_removed++; - } else { - print; - } -} -print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/rnnt_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/rnnt_token_fst.py deleted file mode 100644 index cc6def1703311ab700a4a01f22c1adda32db9b0d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/rnnt_token_fst.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, 0, phone, phone)) -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/s2eps.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/s2eps.pl deleted file mode 100644 index ffeeb8eb6af3c4f319f31ebff80be388d8f59e1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/fst/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces and with (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } - if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } - } - print join("\t", @A) . "\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/git-pre-commit b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/git-pre-commit deleted file mode 100644 index b6e448ed375a0ddf502ce332685de8a99e88dc08..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/git-pre-commit +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -e - -echo "Running pre-commit flake8" -python tools/flake8_hook.py diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/install_srilm.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/install_srilm.sh deleted file mode 100644 index 4aa113c14722a73fd3d3f84430025d44173c207b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/install_srilm.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2022 Binbin Zhang(binbzha@qq.com) - -current_path=`pwd` -current_dir=`basename "$current_path"` - -if [ "tools" != "$current_dir" ]; then - echo "You should run this script in tools/ directory!!" - exit 1 -fi - -! command -v gawk > /dev/null && \ - echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; - -srilm_url="https://github.com/BitSpeech/SRILM/archive/refs/tags/1.7.3.tar.gz" - -if [ ! -f ./srilm.tar.gz ]; then - if ! wget -O ./srilm.tar.gz "$srilm_url"; then - echo 'There was a problem downloading the file.' - echo 'Check you internet connection and try again.' - exit 1 - fi -fi - -tar -zxvf srilm.tar.gz -mv SRILM-1.7.3 srilm - -# set the SRILM variable in the top-level Makefile to this directory. -cd srilm -cp Makefile tmpf - -cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \ - > Makefile || exit 1 -rm tmpf - -make || exit -cd .. - -( - [ ! -z "${SRILM}" ] && \ - echo >&2 "SRILM variable is aleady defined. Undefining..." && \ - unset SRILM - - [ -f ./env.sh ] && . ./env.sh - - [ ! 
-z "${SRILM}" ] && \ - echo >&2 "SRILM config is already in env.sh" && exit - - wd=`pwd` - wd=`readlink -f $wd || pwd` - - echo "export SRILM=$wd/srilm" - dirs="\${PATH}" - for directory in $(cd srilm && find bin -type d ) ; do - dirs="$dirs:\${SRILM}/$directory" - done - echo "export PATH=$dirs" -) >> env.sh - -echo >&2 "Installation of SRILM finished successfully" -echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/k2/make_hlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/k2/make_hlg.sh deleted file mode 100644 index 18c2268487410824ae11b199cf06f37acd717c88..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/k2/make_hlg.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) - -lexion_dir=$1 -lm_dir=$2 -tgt_dir=$3 - -# k2 and icefall updates very fast. Below commits are veryfied in this script. -# k2 3dc222f981b9fdbc8061b3782c3b385514a2d444, icefall 499ac24ecba64f687ff244c7d66baa5c222ecf0f - -# For k2 installation, please refer to https://github.com/k2-fsa/k2/ -python -c "import k2; print(k2.__file__)" -python -c "import torch; import _k2; print(_k2.__file__)" - -# Prepare necessary icefall scripts -if [ ! -d tools/k2/icefall ]; then - git clone --depth 1 https://github.com/k2-fsa/icefall.git tools/k2/icefall -fi -pip3 install -r tools/k2/icefall/requirements.txt -export PYTHONPATH=`pwd`/tools/k2/icefall:`pwd`/tools/k2/icefall/egs/aishell/ASR/local:$PYTHONPATH - -# 8.1 Prepare char based lang -mkdir -p $tgt_dir -python tools/k2/prepare_char.py $lexion_dir/units.txt $lm_dir/wordlist $tgt_dir -echo "Compile lexicon L.pt L_disambig.pt succeeded" - -# 8.2 Prepare G -mkdir -p data/lm -python -m kaldilm \ - --read-symbol-table="$tgt_dir/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $lm_dir/lm.arpa > data/lm/G_3_gram.fst.txt - -# 8.3 Compile HLG -python tools/k2/icefall/egs/aishell/ASR/local/compile_hlg.py --lang-dir $tgt_dir -echo "Compile decoding graph HLG.pt succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/k2/prepare_char.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/k2/prepare_char.py deleted file mode 100644 index 6e05042c42eb280135f6be7cdb3566b185258b90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/k2/prepare_char.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -""" - -This script generates the following files in the directory sys.argv[3]: - - - lexicon.txt - - lexicon_disambig.txt - - L.pt - - L_disambig.pt - - tokens.txt - - words.txt -""" - -import sys -from pathlib import Path -from typing import Dict, List - -import k2 -import torch -from prepare_lang import ( - Lexicon, - add_disambig_symbols, - add_self_loops, - write_lexicon, - write_mapping, -) - - -def lexicon_to_fst_no_sil( - lexicon: Lexicon, - token2id: Dict[str, int], - word2id: Dict[str, int], - need_self_loops: bool = False, -) -> k2.Fsa: - """Convert a lexicon to an FST (in k2 format). - - Args: - lexicon: - The input lexicon. See also :func:`read_lexicon` - token2id: - A dict mapping tokens to IDs. - word2id: - A dict mapping words to IDs. - need_self_loops: - If True, add self-loop to states with non-epsilon output symbols - on at least one arc out of the state. The input label for this - self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. - Returns: - Return an instance of `k2.Fsa` representing the given lexicon. - """ - loop_state = 0 # words enter and leave from here - next_state = 1 # the next un-allocated state, will be incremented as we go - - arcs = [] - - # The blank symbol is defined in local/train_bpe_model.py - assert token2id[""] == 0 - assert word2id[""] == 0 - - eps = 0 - - for word, pieces in lexicon: - assert len(pieces) > 0, f"{word} has no pronunciations" - cur_state = loop_state - - word = word2id[word] - pieces = [ - token2id[i] if i in token2id else token2id[""] for i in pieces - ] - - for i in range(len(pieces) - 1): - w = word if i == 0 else eps - arcs.append([cur_state, next_state, pieces[i], w, 0]) - - cur_state = next_state - next_state += 1 - - # now for the last piece of this word - i = len(pieces) - 1 - w = word if i == 0 else eps - arcs.append([cur_state, loop_state, pieces[i], w, 0]) - - if need_self_loops: - disambig_token = token2id["#0"] - disambig_word = word2id["#0"] - arcs = add_self_loops( - arcs, - disambig_token=disambig_token, - disambig_word=disambig_word, - ) - - final_state = next_state - arcs.append([loop_state, final_state, -1, -1, 0]) - arcs.append([final_state]) - - arcs = sorted(arcs, key=lambda arc: arc[0]) - arcs = [[str(i) for i in arc] for arc in arcs] - arcs = [" ".join(arc) for arc in arcs] - arcs = "\n".join(arcs) - - fsa = k2.Fsa.from_str(arcs, acceptor=False) - return fsa - - -def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool: - """Check if all the given tokens are in token symbol table. - - Args: - token_sym_table: - Token symbol table that contains all the valid tokens. - tokens: - A list of tokens. - Returns: - Return True if there is any token not in the token_sym_table, - otherwise False. - """ - for tok in tokens: - if tok not in token_sym_table: - return True - return False - - -def generate_lexicon( - token_sym_table: Dict[str, int], words: List[str] -) -> Lexicon: - """Generate a lexicon from a word list and token_sym_table. - - Args: - token_sym_table: - Token symbol table that mapping token to token ids. - words: - A list of strings representing words. - Returns: - Return a dict whose keys are words and values are the corresponding - tokens. 
- """ - lexicon = [] - for word in words: - chars = list(word.strip(" \t")) - if contain_oov(token_sym_table, chars): - continue - lexicon.append((word, chars)) - - # The OOV word is - lexicon.append(("", [""])) - return lexicon - - -def generate_tokens(text_file: str) -> Dict[str, int]: - """Generate tokens from the given text file. - - Args: - text_file: - A file that contains text lines to generate tokens. - Returns: - Return a dict whose keys are tokens and values are token ids ranged - from 0 to len(keys) - 1. - """ - token2id: Dict[str, int] = dict() - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - char, index = line.replace('\n', '').split() - assert char not in token2id - token2id[char] = int(index) - assert token2id[''] == 0 - return token2id - - -def generate_words(text_file: str) -> Dict[str, int]: - """Generate words from the given text file. - - Args: - text_file: - A file that contains text lines to generate words. - Returns: - Return a dict whose keys are words and values are words ids ranged - from 0 to len(keys) - 1. - """ - words = [] - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - word = line.replace('\n', '') - assert word not in words - words.append(word) - words.sort() - - # We put '' '' at begining of word2id - # '#0', '', '' at end of word2id - words = [word for word in words - if word not in ['', '', '#0', '', '']] - words.insert(0, '') - words.insert(1, '') - words.append('#0') - words.append('') - words.append('') - word2id = {j: i for i, j in enumerate(words)} - return word2id - - -def main(): - token2id = generate_tokens(sys.argv[1]) - word2id = generate_words(sys.argv[2]) - tgt_dir = Path(sys.argv[3]) - - words = [word for word in word2id.keys() - if word not in - ["", "!SIL", "", "", "#0", "", ""]] - lexicon = generate_lexicon(token2id, words) - - lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) - next_token_id = max(token2id.values()) + 1 - for i in range(max_disambig + 1): - disambig = f"#{i}" - assert disambig not in token2id - token2id[disambig] = next_token_id - next_token_id += 1 - - write_mapping(tgt_dir / "tokens.txt", token2id) - write_mapping(tgt_dir / "words.txt", word2id) - write_lexicon(tgt_dir / "lexicon.txt", lexicon) - write_lexicon(tgt_dir / "lexicon_disambig.txt", lexicon_disambig) - - L = lexicon_to_fst_no_sil( - lexicon, - token2id=token2id, - word2id=word2id, - ) - L_disambig = lexicon_to_fst_no_sil( - lexicon_disambig, - token2id=token2id, - word2id=word2id, - need_self_loops=True, - ) - torch.save(L.as_dict(), tgt_dir / "L.pt") - torch.save(L_disambig.as_dict(), tgt_dir / "L_disambig.pt") - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/latency_metrics.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/latency_metrics.py deleted file mode 100644 index df2d8eee45f8e2d7c8536f208d44fafaeac3341f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/latency_metrics.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2022 Horizon Inc. (author: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import argparse -import logging -import librosa -import torch -import torchaudio -import yaml - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.font_manager as fm -import torchaudio.compliance.kaldi as kaldi - -from wenet.utils.init_model import init_model -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.mask import make_pad_mask -from wenet.utils.common import replace_duplicates_with_blank - - -def get_args(): - parser = argparse.ArgumentParser( - description='Analyze latency and plot CTC-Spike.') - parser.add_argument('--config', required=True, - type=str, help='configration') - parser.add_argument('--gpu', - type=int, - default=0, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--ckpt', required=True, - type=str, help='model checkpoint') - parser.add_argument('--tag', required=True, - type=str, help='image subtitle') - parser.add_argument('--wavscp', required=True, - type=str, help='wav.scp') - parser.add_argument('--alignment', required=True, - type=str, help='force alignment, generated by Kaldi.') - parser.add_argument('--chunk_size', required=True, - type=int, help='chunk size') - parser.add_argument('--left_chunks', default=-1, - type=int, help='left chunks') - parser.add_argument('--font', required=True, - type=str, help='font file') - parser.add_argument('--dict', required=True, - type=str, help='dict file') - parser.add_argument('--result_dir', required=True, - type=str, help='saving pdf') - parser.add_argument('--model_type', default='ctc', - choices=['ctc', 'transducer'], - help='show latency metrics from ctc models or rnn-t models') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - torch.manual_seed(777) - - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - symbol_table = read_symbol_table(args.dict) - char_dict = {v: k for k, v in symbol_table.items()} - - # 1. Load model - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - - model = init_model(conf) - load_checkpoint(model, args.ckpt) - model = model.eval().to(device) - - subsampling = model.encoder.embed.subsampling_rate - eos = model.eos_symbol() - - with open(args.wavscp, 'r') as fin: - wavs = fin.readlines() - - # 2. 
Forward model (get streaming_timestamps) - timestamps = {} - for idx, wav in enumerate(wavs): - if idx % 100 == 0: - logging.info("processed {}.".format(idx)) - key, wav = wav.strip().split(' ', 1) - waveform, sr = torchaudio.load(wav) - resample_rate = conf['dataset_conf']['resample_conf']['resample_rate'] - waveform = torchaudio.transforms.Resample( - orig_freq=sr, new_freq=resample_rate)(waveform) - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank( - waveform, - num_mel_bins=conf['dataset_conf']['fbank_conf']['num_mel_bins'], - frame_length=conf['dataset_conf']['fbank_conf']['frame_length'], - frame_shift=conf['dataset_conf']['fbank_conf']['frame_shift'], - dither=0.0, energy_floor=0.0, - sample_frequency=resample_rate, - ) - - speech = mat.unsqueeze(0).to(device) - speech_lengths = torch.tensor([mat.size(0)]).to(device) - - # Let's assume batch_size = 1 - encoder_out, encoder_mask = model.encoder( - speech, speech_lengths, args.chunk_size, args.left_chunks) - - maxlen = encoder_out.size(1) # (B, maxlen, encoder_dim) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # CTC greedy search - if args.model_type == 'ctc': - ctc_probs = model.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - topk_prob = topk_prob.view(1, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen) - topk_prob = topk_prob.masked_fill_(mask, 0.0) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - hyps = [replace_duplicates_with_blank(hyp) for hyp in hyps] - scores = [prob.tolist() for prob in topk_prob] - timestamps[key] = [hyps[0], scores[0], wav] - - if args.model_type == 'transducer': - hyps = [] - scores = [] - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = 1 - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, - padding, cache) - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - scores.append(torch.max(joint_out_probs).item()) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or \ - per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - hyps.append(model.blank) - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - timestamps[key] = [hyps, scores, wav] - - # 3. 
Analyze latency - with open(args.alignment, 'r') as fin: - aligns = fin.readlines() - not_found, len_unequal, ignored = 0, 0, 0 - datas = [] - for align in aligns: - key, align = align.strip().split(' ', 1) - if key not in timestamps: - not_found += 1 - continue - fa, st = [], [] # force_alignment, streaming_timestamps - text_fa, text_st = "", "" - for i, token in enumerate(align.split()): - if token != '': - text_fa += token - # NOTE(xcsong): W/O subsample - fa.append(i * 10) - # ignore alignment_errors >= 70ms - frames_fa = len(align.split()) - frames_st = len(timestamps[key][0]) * subsampling - if abs(frames_st - frames_fa) >= 7: - ignored += 1 - continue - for i, token_id in enumerate(timestamps[key][0]): - if token_id != 0: - text_st += char_dict[token_id] - # NOTE(xcsong): W subsample - st.append(i * subsampling * 10) - if len(fa) != len(st): - len_unequal += 1 - continue - # datas[i] = [key, text_fa, text_st, list_of_diff, - # FirstTokenDelay, LastTokenDelay, AvgTokenDelay, - # streaming_timestamps, force_alignment] - datas.append([key, text_fa, text_st, - [a - b for a, b in zip(st, fa)], - st[0] - fa[0], st[-1] - fa[-1], - (sum(st) - sum(fa)) / len(st), - timestamps[key], align.split()]) - - logging.info("not found: {}, length unequal: {}, ignored: {}, \ - valid samples: {}".format(not_found, len_unequal, ignored, len(datas))) - - # 4. Plot and print - num_datas = len(datas) - names = ['FirstTokenDelay', 'LastTokenDelay', 'AvgTokenDelay'] - names_index = [4, 5, 6] - parts = ['max', 'P90', 'P75', 'P50', 'P25', 'min'] - parts_index = [num_datas - 1, int(num_datas * 0.90), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - for name, name_idx in zip(names, names_index): - def f(name_idx=name_idx): - return name_idx - datas.sort(key=lambda x: x[f()]) - logging.info("==========================") - for p, i in zip(parts, parts_index): - data = datas[i] - # i.e., LastTokenDelay P90: 270.000 ms (wav_id: BAC009S0902W0144) - logging.info("{} {}: {:.3f} ms (wav_id: {})".format( - name, p, data[f()], datas[i][0])) - - font = fm.FontProperties(fname=args.font) - plt.rcParams['axes.unicode_minus'] = False - # we will have 2 sub-plots (force-align + streaming timestamps) - # plus one wav-plot - fig, axes = plt.subplots(figsize=(60, 60), nrows=3, ncols=1) - for j in range(2): - if j == 0: - # subplot-0: streaming_timestamps - plt_prefix = args.tag + "_" + name + "_" + p - x = np.arange(len(data[7][0])) * subsampling - hyps, scores = data[7][0], data[7][1] - else: - # subplot-1: force_alignments - plt_prefix = "force_alignment" - x = np.arange(len(data[8])) - hyps = [symbol_table[d] for d in data[8]] - scores = [0.0] * len(data[8]) - axes[j].set_title(plt_prefix, fontsize=30) - for frame, token, prob in zip(x, hyps, scores): - if char_dict[token] != '': - axes[j].bar( - frame, np.exp(prob), - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].text( - frame, np.exp(prob), - '{} {:.3f} {}'.format( - char_dict[token], np.exp(prob), frame), - fontdict=dict(fontsize=24), - fontproperties=font, - ) - else: - axes[j].bar( - frame, 0.01, - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].tick_params(labelsize=25) - - # subplot-2: wav - # wav, hardcode sample_rate to 16000 - samples, sr = librosa.load(data[7][2], sr=16000) - time = np.arange(0, len(samples)) * (1.0 / sr) - axes[-1].plot(time, samples) - - # i.e., RESULT_DIR/LTD_P90_120ms_BAC009S0768W0342.pdf - plt.savefig(args.result_dir + "/" + name + "_" + - p + "_" + str(data[f()]) 
+ "ms" + "_" + data[0] + ".pdf") - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/make_raw_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/make_raw_list.py deleted file mode 100644 index 2f84f015542bb38da027b8ea61e8638f873cec33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/make_raw_list.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('output_file', help='output list file') - args = parser.parse_args() - - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - if args.segments is not None: - segments_table = {} - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - with open(args.text_file, 'r', encoding='utf8') as fin, \ - open(args.output_file, 'w', encoding='utf8') as fout: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if args.segments is None: - assert key in wav_table - wav = wav_table[key] - line = dict(key=key, wav=wav, txt=txt) - else: - assert key in segments_table - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - line = dict(key=key, wav=wav, txt=txt, start=start, end=end) - json_line = json.dumps(line, ensure_ascii=False) - fout.write(json_line + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/make_shard_list.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/make_shard_list.py deleted file mode 100644 index 1f7d82829808c9cc181bbc5e0f60cccef8795bae..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/make_shard_list.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import io -import logging -import os -import tarfile -import time -import multiprocessing - -import torch -import torchaudio -import torchaudio.backend.sox_io_backend as sox - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def write_tar_file(data_list, - no_segments, - tar_file, - resample=16000, - index=0, - total=1): - logging.info('Processing {} {}/{}'.format(tar_file, index, total)) - read_time = 0.0 - save_time = 0.0 - write_time = 0.0 - with tarfile.open(tar_file, "w") as tar: - prev_wav = None - for item in data_list: - if no_segments: - key, txt, wav = item - else: - key, txt, wav, start, end = item - - suffix = wav.split('.')[-1] - assert suffix in AUDIO_FORMAT_SETS - if no_segments: - ts = time.time() - with open(wav, 'rb') as fin: - data = fin.read() - read_time += (time.time() - ts) - else: - if wav != prev_wav: - ts = time.time() - waveforms, sample_rate = sox.load(wav, normalize=False) - read_time += (time.time() - ts) - prev_wav = wav - start = int(start * sample_rate) - end = int(end * sample_rate) - audio = waveforms[:1, start:end] - - # resample - if sample_rate != resample: - if not audio.is_floating_point(): - # normalize the audio before resample - # because resample can't process int audio - audio = audio / (1 << 15) - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - audio = (audio * (1 << 15)).short() - else: - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - - ts = time.time() - f = io.BytesIO() - sox.save(f, audio, resample, format="wav", bits_per_sample=16) - # Save to wav for segments file - suffix = "wav" - f.seek(0) - data = f.read() - save_time += (time.time() - ts) - - assert isinstance(txt, str) - ts = time.time() - txt_file = key + '.txt' - txt = txt.encode('utf8') - txt_data = io.BytesIO(txt) - txt_info = tarfile.TarInfo(txt_file) - txt_info.size = len(txt) - tar.addfile(txt_info, txt_data) - - wav_file = key + '.' 
+ suffix - wav_data = io.BytesIO(data) - wav_info = tarfile.TarInfo(wav_file) - wav_info.size = len(data) - tar.addfile(wav_info, wav_data) - write_time += (time.time() - ts) - logging.info('read {} save {} write {}'.format(read_time, save_time, - write_time)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--num_utts_per_shard', - type=int, - default=1000, - help='num utts per shard') - parser.add_argument('--num_threads', - type=int, - default=1, - help='num threads for make shards') - parser.add_argument('--prefix', - default='shards', - help='prefix of shards tar file') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('--resample', - type=int, - default=16000, - help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('shards_dir', help='output shards dir') - parser.add_argument('shards_list', help='output shards list file') - args = parser.parse_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - torch.set_num_threads(1) - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - no_segments = True - segments_table = {} - if args.segments is not None: - no_segments = False - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - data = [] - with open(args.text_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if no_segments: - assert key in wav_table - wav = wav_table[key] - data.append((key, txt, wav)) - else: - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - data.append((key, txt, wav, start, end)) - - num = args.num_utts_per_shard - chunks = [data[i:i + num] for i in range(0, len(data), num)] - os.makedirs(args.shards_dir, exist_ok=True) - - # Using thread pool to speedup - pool = multiprocessing.Pool(processes=args.num_threads) - shards_list = [] - tasks_list = [] - num_chunks = len(chunks) - for i, chunk in enumerate(chunks): - tar_file = os.path.join(args.shards_dir, - '{}_{:09d}.tar'.format(args.prefix, i)) - shards_list.append(tar_file) - pool.apply_async( - write_tar_file, - (chunk, no_segments, tar_file, args.resample, i, num_chunks)) - - pool.close() - pool.join() - - with open(args.shards_list, 'w', encoding='utf8') as fout: - for name in shards_list: - fout.write(name + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/merge_scp2txt.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/merge_scp2txt.py deleted file mode 100644 index 51f1c42f272f0fd9fec0a7d69ee860d2f1eb6158..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/merge_scp2txt.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -from distutils.util import strtobool -from io import open -import logging -import sys - -PY2 = sys.version_info[0] == 2 -sys.stdin = codecs.getreader('utf-8')(sys.stdin if PY2 else 
sys.stdin.buffer) -sys.stdout = codecs.getwriter('utf-8')( - sys.stdout if PY2 else sys.stdout.buffer) - - -# Special types: -def shape(x): - """Change str to List[int] - - >>> shape('3,5') - [3, 5] - >>> shape(' [3, 5] ') - [3, 5] - - """ - - # x: ' [3, 5] ' -> '3, 5' - x = x.strip() - if x[0] == '[': - x = x[1:] - if x[-1] == ']': - x = x[:-1] - - return list(map(int, x.split(','))) - - -def get_parser(): - parser = argparse.ArgumentParser( - description='Given each file paths with such format as ' - '::. type> can be omitted and the default ' - 'is "str". e.g. {} ' - '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' - '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' - '--output-scps text:data/text shape:data/utt2text_shape:shape ' - '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--input-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the inputs') - parser.add_argument('--output-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the outputs') - parser.add_argument('--scps', - type=str, - nargs='+', - default=[], - help='The files except for the input and outputs') - parser.add_argument('--verbose', - '-V', - default=1, - type=int, - help='Verbose option') - parser.add_argument('--allow-one-column', - type=strtobool, - default=False, - help='Allow one column in input scp files. ' - 'In this case, the value will be empty string.') - parser.add_argument('--out', - '-O', - type=str, - help='The output filename. ' - 'If omitted, then output to sys.stdout') - return parser - - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - args.scps = [args.scps] - - # logging info - logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - if args.verbose > 0: - logging.basicConfig(level=logging.INFO, format=logfmt) - else: - logging.basicConfig(level=logging.WARN, format=logfmt) - - inputs = {} - assert (len(args.input_scps) == 1) - for f in args.input_scps[0]: - arr = f.strip().split(':') - inputs[arr[0]] = arr[1] - assert ('feat' in inputs) - assert ('shape' in inputs) - - outputs = {} - assert (len(args.output_scps) == 1) - for f in args.output_scps[0]: - arr = f.strip().split(':') - outputs[arr[0]] = arr[1] - assert ('shape' in outputs) - assert ('text' in outputs) - assert ('token' in outputs) - assert ('tokenid' in outputs) - - files = [ - inputs['feat'], inputs['shape'], outputs['text'], outputs['token'], - outputs['tokenid'], outputs['shape'] - ] - fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape'] - fids = [open(f, 'r', encoding='utf-8') for f in files] - - if args.out is None: - out = sys.stdout - else: - out = open(args.out, 'w', encoding='utf-8') - done = False - while not done: - for i, fid in enumerate(fids): - line = fid.readline() - if line == '': - done = True - break - arr = line.strip().split() - content = ' '.join(arr[1:]) - if i == 0: - out.write('utt:{}'.format(arr[0])) - out.write('\t') - out.write('{}:{}'.format(fields[i], content)) - out.write('\n') - - for f in fids: - f.close() - if args.out is not None: - out.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/onnx2horizonbin.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/onnx2horizonbin.py deleted file mode 100644 index 
a94b647fb19d1446d4bc506c399c85677dddde9f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/onnx2horizonbin.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. specific decoding method: ctc_greedy_search -""" - -import argparse -import copy -import logging -import os -import sys -import random -import torch -import yaml -import numpy as np - -from torch.utils.data import DataLoader - -from wenet.utils.common import remove_duplicates_and_blank -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import to_numpy -from wenet.bin.export_onnx_bpu import export_encoder, export_ctc - - -try: - import hbdk # noqa: F401 - import horizon_nn # noqa: F401 - from horizon_tc_ui import HB_ONNXRuntime -except ImportError: - print('Please install hbdk,horizon_nn,horizon_tc_ui !') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def save_data(tensor, dirs, prefix): - if tensor.requires_grad: - data = tensor.detach().numpy().astype(np.float32) - else: - data = tensor.numpy().astype(np.float32) - os.makedirs(dirs, exist_ok=True) - data.tofile(dirs + "/" + prefix + ".bin") - - -def make_calibration_data(enc, args, conf): - conf['shuffle'] = True - logger.info(conf) - dataset = Dataset( - "shard", args.cali_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - cal_data_dir = os.path.join(args.output_dir, 
'cal_data_dir') - for batch_idx, batch in enumerate(dataloader): - if batch_idx >= args.max_samples: - break - if batch_idx % 100 == 0: - logger.info("processed {} samples.".format(batch_idx)) - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - - # Feed forward overlap input step by step - random_high = (num_frames - context) // stride - num_rand = random.randint(0, random_high) - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - if i == num_rand: - save_data(chunk, "{}/chunk".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_cache, "{}/att_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(cnn_cache, "{}/cnn_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_mask, "{}/att_mask".format(cal_data_dir), - prefix + "." + str(i)) - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - # NOTE(xcsong): It's fast to calibrate ctc.onnx, - # so it's okay to save all chunks - save_data(y, "{}/hidden".format(cal_data_dir), - prefix + "." 
+ str(i)) - - -def check_wer(enc, ctc, args, conf): - conf['shuffle'] = False - dataset = Dataset( - "shard", args.wer_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - char_dict = {v: k for k, v in args.symbol_table.items()} - eos = len(char_dict) - 1 - - enc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_encoder/encoder_quantized_model.onnx") - ctc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_ctc/ctc_quantized_model.onnx") - torch_file = open(args.output_dir + "/torch_text", 'w', encoding="utf-8") - onnx_file = open(args.output_dir + "/onnx_text", 'w', encoding="utf-8") - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - for batch_idx, batch in enumerate(dataloader): - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - - # Feed forward overlap input step by step - torch_out, onnx_out = [], [] - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - # Torch model - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - torch_out.append(ctc.forward(y).transpose(1, 3).squeeze(2)) - # Quantized onnx model - ort_inputs = { - 'chunk': to_numpy(chunk), 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': to_numpy(att_mask)} - ort_outs = enc_session.run_feature( - enc_session.output_names, ort_inputs, input_offset=0) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_y = ctc_session.run_feature( - ctc_session.output_names, {'hidden': ort_outs[0]}, input_offset=0) - onnx_out.append(torch.from_numpy( - np.squeeze(onnx_y[0].transpose(0, 3, 2, 1), axis=2))) - - def post_process(list_out, file_obj, keys): - probs = torch.cat(list_out, dim=1) - maxlen = probs.size(1) - topk_prob, topk_index = probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - hyps = 
[hyp.tolist() for hyp in topk_index]
-            scores = topk_prob.max(1)
-            hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps]
-            for i, key in enumerate(keys):
-                content = ''
-                for w in hyps[i]:
-                    if w == eos:
-                        break
-                    content += char_dict[w]
-                file_obj.write('{} {}\n'.format(key, content))
-            return key, content
-
-        if len(torch_out) > 0 and len(onnx_out) > 0:
-            key, content = post_process(torch_out, torch_file, keys)
-            logger.info('torch: {} {}'.format(key, content))
-            key, content = post_process(onnx_out, onnx_file, keys)
-            logger.info('onnx : {} {}'.format(key, content))
-    torch_file.close()
-    onnx_file.close()
-
-
-def generate_config(enc_session, ctc_session, args):
-    template = """
-# Model parameters
-model_parameters:
-  # Original ONNX floating-point model file
-  onnx_model: '{}'
-  # Target AI chip architecture for conversion
-  march: 'bernoulli2'
-  # Name prefix of the converted model file used for on-board execution
-  output_model_file_prefix: '{}'
-  # Directory where the conversion results are stored
-  working_dir: '{}'
-  # Whether the converted hybrid model keeps the ability to dump per-layer intermediate outputs
-  layer_out_dump: False
-  # Log level during conversion
-  log_level: 'debug'
-# Input parameters
-input_parameters:
-  # Input node names of the original float model
-  input_name: '{}'
-  # Input data types of the original float model (count/order must match input_name)
-  input_type_train: '{}'
-  # Input data layouts of the original float model (count/order must match input_name)
-  input_layout_train: '{}'
-  # Input data shapes of the original float model
-  input_shape: '{}'
-  # batch_size fed to the network at runtime, defaults to 1
-  # input_batch: 1
-  # Input preprocessing method added to the model
-  norm_type: '{}'
-  # Mean value subtracted in preprocessing; per-channel values must be separated by spaces
-  # mean_value: ''
-  # Scale factor applied in preprocessing; per-channel values must be separated by spaces
-  # scale_value: ''
-  # Input data types expected by the converted hybrid model (count/order must match input_name)
-  input_type_rt: '{}'
-  # Special format of the input data
-  input_space_and_range: ''
-  # Input data layouts expected by the converted hybrid model (count/order must match input_name)
-  input_layout_rt: '{}'
-# Calibration parameters
-calibration_parameters:
-  # Directory containing the calibration samples
-  cal_data_dir: '{}'
-  # Automatically preprocess calibration images (skimage read + resize to the input node size)
-  preprocess_on: False
-  # Calibration algorithm
-  calibration_type: '{}'
-  # Parameter for the max calibration method
-  max_percentile: 1.0
-  # Force the listed ops to run on CPU
-  run_on_cpu: '{}'
-  # Force the listed ops to run on BPU
-  run_on_bpu: '{}'
-# Compiler parameters
-compiler_parameters:
-  # Compilation strategy
-  compile_mode: 'latency'
-  # Whether to enable compiler debug info
-  debug: False
-  # Number of cores used when running the model
-  core_num: 1
-  # Compiler optimization level
-  optimize_level: 'O3'
-"""
-    output_dir = os.path.realpath(args.output_dir)
-    cal_data_dir = os.path.join(output_dir, 'cal_data_dir')
-    os.makedirs(cal_data_dir, exist_ok=True)
-    enc_dic = enc_session.get_modelmeta().custom_metadata_map
-    enc_onnx_path = os.path.join(output_dir, 'encoder.onnx')
-    enc_log_path = os.path.join(output_dir, 'hb_makertbin_output_encoder')
-    enc_cal_data = ";".join(
-        [cal_data_dir + "/" + x for x in enc_dic['input_name'].split(';')])
-    ctc_dic = ctc_session.get_modelmeta().custom_metadata_map
-    ctc_onnx_path = os.path.join(output_dir, 'ctc.onnx')
-    ctc_log_path = os.path.join(output_dir, 'hb_makertbin_output_ctc')
-    ctc_cal_data = ";".join(
-        [cal_data_dir + "/" + x for x in ctc_dic['input_name'].split(';')])
-    enc_config = template.format(
-        enc_onnx_path, "encoder", enc_log_path,
-        enc_dic['input_name'], enc_dic['input_type'],
-        enc_dic['input_layout_train'], enc_dic['input_shape'],
-        enc_dic['norm_type'], enc_dic['input_type'], enc_dic['input_layout_rt'],
-        enc_cal_data, args.calibration_type, args.extra_ops_run_on_cpu, "")
-    ctc_config = template.format(
-        ctc_onnx_path, "ctc", ctc_log_path,
-        ctc_dic['input_name'], ctc_dic['input_type'],
-        ctc_dic['input_layout_train'], ctc_dic['input_shape'],
-        ctc_dic['norm_type'], ctc_dic['input_type'], ctc_dic['input_layout_rt'],
-        ctc_cal_data, "default", "", "")
-    with open(output_dir + "/config_encoder.yaml", "w") as enc_yaml:
-        enc_yaml.write(enc_config)
-    with open(output_dir +
"/config_ctc.yaml", "w") as ctc_yaml: - ctc_yaml.write(ctc_config) - - -def get_args(): - parser = argparse.ArgumentParser(description='convert onnx to horizon .bin') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - parser.add_argument('--dict', type=str, required=True, help='dict file') - parser.add_argument('--max_samples', type=int, required=True, - help='maximum samples') - parser.add_argument('--cali_datalist', type=str, default=None, - help='make calibration data') - parser.add_argument('--wer_datalist', type=str, default=None, - help='check wer') - parser.add_argument('--wer_text', type=str, default=None, - help='check wer') - parser.add_argument('--bpe_model', default=None, type=str, - help='bpe model for english part') - parser.add_argument('--ln_run_on_bpu', action='store_true', - help='layernorm running on bpu') - parser.add_argument('--extra_ops_run_on_cpu', type=str, default=None, - help='extra operations running on cpu.') - parser.add_argument('--calibration_type', type=str, default='default', - help='kl / max / default.') - return parser - - -if __name__ == '__main__': - random.seed(777) - parser = get_args() - args = parser.parse_args() - # NOTE(xcsong): X3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(conf) - load_checkpoint(model, args.checkpoint) - model.eval() - - symbol_table = read_symbol_table(args.dict) - args.symbol_table = symbol_table - args.feature_size = conf['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - logger.info("Stage-1: Export onnx") - enc, enc_session = export_encoder(model, args) - ctc, ctc_session = export_ctc(model, args) - - conf = copy.deepcopy(conf['dataset_conf']) - conf['filter_conf']['max_length'] = 102400 - conf['filter_conf']['min_length'] = 0 - conf['filter_conf']['token_max_length'] = 102400 - conf['filter_conf']['token_min_length'] = 0 - conf['filter_conf']['max_output_input_ratio'] = 102400 - conf['filter_conf']['min_output_input_ratio'] = 0 - conf['speed_perturb'] = False - conf['spec_aug'] = False - conf['spec_sub'] = False - conf['spec_trim'] = False - conf['shuffle'] = False - conf['sort'] = False - if 'fbank_conf' in conf: - conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in conf: - conf['mfcc_conf']['dither'] = 0.0 - conf['batch_conf']['batch_type'] = "static" - conf['batch_conf']['batch_size'] = 1 - - if args.cali_datalist is not None: - logger.info("Stage-2: Generate config") - # FIXME(xcsong): Remove hard code - logger.info("torch version: {}".format(torch.__version__)) - if int(torch.__version__[:4].split('.')[1]) >= 13: - args.extra_ops_run_on_cpu = "/Split;" + \ - "/encoders.0/self_attn/Split;/encoders.1/self_attn/Split;" + \ - 
"/encoders.2/self_attn/Split;/encoders.3/self_attn/Split;" + \ - "/encoders.4/self_attn/Split;/encoders.5/self_attn/Split;" + \ - "/encoders.6/self_attn/Split;/encoders.7/self_attn/Split;" + \ - "/encoders.8/self_attn/Split;/encoders.9/self_attn/Split;" + \ - "/encoders.10/self_attn/Split;/encoders.11/self_attn/Split;" + \ - "/encoders.0/self_attn/Mul;/encoders.1/self_attn/Mul;" + \ - "/encoders.2/self_attn/Mul;/encoders.3/self_attn/Mul;" + \ - "/encoders.4/self_attn/Mul;/encoders.5/self_attn/Mul;" + \ - "/encoders.6/self_attn/Mul;/encoders.7/self_attn/Mul;" + \ - "/encoders.8/self_attn/Mul;/encoders.9/self_attn/Mul;" + \ - "/encoders.10/self_attn/Mul;/encoders.11/self_attn/Mul;" - else: - args.extra_ops_run_on_cpu = "Split_17;Split_67;Split_209;" + \ - "Split_351;Split_493;Split_635;Split_777;Split_919;Split_1061;" + \ - "Split_1203;Split_1345;Split_1487;Split_1629;" + \ - "Mul_72;Mul_214;Mul_356;Mul_498;Mul_640;Mul_782;" + \ - "Mul_924;Mul_1066;Mul_1208;Mul_1350;Mul_1492;Mul_1634;" - generate_config(enc_session, ctc_session, args) - - logger.info("Stage-3: Make calibration data") - make_calibration_data(enc, args, conf) - - output_dir = os.path.realpath(args.output_dir) - logger.info("Stage-4: Make ctc.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_ctc".format(output_dir) + - " && cd hb_makertbin_log_ctc &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_ctc.yaml") - ) - logger.info("Stage-5: Make encoder.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_encoder ".format(output_dir) + - " && cd hb_makertbin_log_encoder &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_encoder.yaml") - ) - - if args.wer_datalist is not None: - logger.info("Stage-6: Check wer between torch model and quantized onnx") - assert args.wer_text is not None - check_wer(enc, ctc, args, conf) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/torch_text", - args.output_dir + "/torch_wer") - ) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/onnx_text", - args.output_dir + "/onnx_wer") - ) - os.system("tail {} {}".format( - args.output_dir + "/torch_wer", args.output_dir + "/onnx_wer")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/parse_options.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/parse_options.sh deleted file mode 100644 index 34476fdb37a4b14d5fe6e0edbebe97e760d2be5a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Parse command-line options. 
-# To be sourced by another script (as in ". parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/perturb_data_dir_speed.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/perturb_data_dir_speed.sh deleted file mode 100644 index 901a4882e6481ae269067b0fe7175dba62c4db9e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/perturb_data_dir_speed.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# 2020 @kamo-naoyuki -# This file was copied from Kaldi and -# I deleted parts related to wav duration -# because we shouldn't use kaldi's command here -# and we don't need the files actually. 
- -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 2014 Tom Ko -# 2018 Emotech LTD (author: Pawel Swietojanski) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# wav.scp -# spk2utt -# utt2spk -# text -# -# It generates the files which are used for perturbing the speed of the original data. - -export LC_ALL=C -set -euo pipefail - -if [[ $# != 3 ]]; then - echo "Usage: perturb_data_dir_speed.sh " - echo "e.g.:" - echo " $0 0.9 data/train_si284 data/train_si284p" - exit 1 -fi - -factor=$1 -srcdir=$2 -destdir=$3 -label="sp" -spk_prefix="${label}${factor}-" -utt_prefix="${label}${factor}-" - -#check is sox on the path - -! command -v sox &>/dev/null && echo "sox: command not found" && exit 1; - -if [[ ! -f ${srcdir}/utt2spk ]]; then - echo "$0: no such file ${srcdir}/utt2spk" - exit 1; -fi - -if [[ ${destdir} == "${srcdir}" ]]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -mkdir -p "${destdir}" - -<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map" -<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map" -<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map" -if [[ ! -f ${srcdir}/utt2uniq ]]; then - <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq" -else - <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq" -fi - - -<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \ - utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk - -utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt - -if [[ -f ${srcdir}/segments ]]; then - - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \ - utils/apply_map.pl -f 2 "${destdir}"/reco_map | \ - awk -v factor="${factor}" \ - '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \ - >"${destdir}"/segments - - utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - if [[ -f ${srcdir}/reco2file_and_channel ]]; then - utils/apply_map.pl -f 1 "${destdir}"/reco_map \ - <"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel - fi - -else # no segments->wav indexed by utterance. 
- if [[ -f ${srcdir}/wav.scp ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - fi -fi - -if [[ -f ${srcdir}/text ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text -fi -if [[ -f ${srcdir}/spk2gender ]]; then - utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender -fi -if [[ -f ${srcdir}/utt2lang ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang -fi - -rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null -echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}" - -utils/validate_data_dir.sh --no-feats --no-text "${destdir}" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/reduce_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/reduce_data_dir.sh deleted file mode 100644 index 16194dcc7309a646041181a698c53cd4f46e618b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/reduce_data_dir.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# koried, 10/29/2012 - -# Reduce a data set based on a list of turn-ids - -help_message="usage: $0 srcdir turnlist destdir" - -if [ $1 == "--help" ]; then - echo "${help_message}" - exit 0; -fi - -if [ $# != 3 ]; then - echo "${help_message}" - exit 1; -fi - -srcdir=$1 -reclist=$2 -destdir=$3 - -if [ ! -f ${srcdir}/utt2spk ]; then -echo "$0: no such file $srcdir/utt2spk" -exit 1; -fi - -function do_filtering { -# assumes the utt2spk and spk2utt files already exist. - [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp - [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text - [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames - [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender - [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp - if [ -f ${srcdir}/segments ]; then - utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments - awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. 
- [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/reco2file_and_channel ] && \ - utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel - - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm - rm ${destdir}/reco - fi - srcutts=$(wc -l < ${srcdir}/utt2spk) - destutts=$(wc -l < ${destdir}/utt2spk) - echo "Reduced #utt from $srcutts to $destutts" -} - -mkdir -p ${destdir} - -# filter the utt2spk based on the set of recordings -utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk - -utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt -do_filtering; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/remove_longshortdata.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/remove_longshortdata.py deleted file mode 100644 index 7e92f8a424d2d717acf6fc1db5503f79ba38a898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/remove_longshortdata.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='remove too long or too short data in format.data') - parser.add_argument('--data_file', - type=str, - help='input format data') - parser.add_argument('--output_data_file', - type=str, - help='output format data') - parser.add_argument('--min_input_len', type=float, - default=0, - help='minimum input seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--max_input_len', type=float, - default=20, - help='maximum output seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--min_output_len', type=float, - default=0, help='minimum input seq length, in modeling units') - parser.add_argument('--max_output_len', type=float, - default=500, - help='maximum output seq length, in modeling units') - parser.add_argument('--min_output_input_ratio', type=float, default=0.05, - help='minimum output seq length/output seq length ratio') - parser.add_argument('--max_output_input_ratio', type=float, default=10, - help='maximum output seq length/output seq length ratio') - args = parser.parse_args() - - data_file = args.data_file - output_data_file = args.output_data_file - min_input_len = args.min_input_len - max_input_len = args.max_input_len - min_output_len = args.min_output_len - max_output_len = args.max_output_len - min_output_input_ratio = args.min_output_input_ratio - max_output_input_ratio = args.max_output_input_ratio - - with open(data_file, 'r') as f, open(output_data_file, 'w') as fout: - for l in f: - l = l.strip() - if l: - items = l.strip().split('\t') - token_shape = items[6] - feature_shape = items[2] - feat_len = float(feature_shape.split(':')[1].split(',')[0]) - token_len = float(token_shape.split(':')[1].split(',')[0]) - condition = [feat_len > min_input_len, - feat_len < max_input_len, - token_len > min_output_len, - token_len < max_output_len, - token_len / feat_len > min_output_input_ratio, - token_len / feat_len < max_output_input_ratio, - ] - if all(condition): - fout.write('{}\n'.format(l)) - continue diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/segment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/segment.py deleted file mode 100644 index a1a7f93a05fbaf42ca09c26c0e5be6a7185f0d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/segment.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2021 Mobvoi Inc. (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='generate segmented wav.scp') - parser.add_argument('--segments', required=True, help='segments file') - parser.add_argument('--input', - required=True, - help='origin wav.scp that not segmented') - parser.add_argument('--output', - required=True, - help='output segmented wav.scp') - wav_dic = {} - args = parser.parse_args() - ori_wav = args.input - segment_file = args.segments - wav_scp = args.output - with open(ori_wav, 'r') as ori: - for l in ori: - item = l.strip().split() - wav_dic[item[0]] = item[1] - with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement: - for l in sgement: - item = l.strip().split() - if item[1] in wav_dic: - item[1] = wav_dic[item[1]] - f.write("{} {},{},{}\n".format(item[0], item[1], item[2], item[3])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/setup_anaconda.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/setup_anaconda.sh deleted file mode 100644 index f53ace9cc4c19994fc79d01e85d70f49d40d673f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/setup_anaconda.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env bash -# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet) -set -euo pipefail - -if [ -z "${PS1:-}" ]; then - PS1=__dummy__ -fi -CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - -if [ $# -gt 4 ]; then - echo "Usage: $0 [output] [conda-env-name] [python-version>]" - exit 1; -elif [ $# -eq 3 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="$3" -elif [ $# -eq 2 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="" -elif [ $# -eq 1 ]; then - output_dir="$1" - name="" - PYTHON_VERSION="" -elif [ $# -eq 0 ]; then - output_dir=venv - name="" - PYTHON_VERSION="" -fi - -if [ -e activate_python.sh ]; then - echo "Warning: activate_python.sh already exists. It will be overwritten" -fi - -if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then - if [ ! -e miniconda.sh ]; then - wget --tries=3 "${CONDA_URL}" -O miniconda.sh - fi - - bash miniconda.sh -b -p "${output_dir}" -fi - -# shellcheck disable=SC1090 -source "${output_dir}/etc/profile.d/conda.sh" -conda deactivate - -# If the env already exists, skip recreation -if [ -n "${name}" ] && ! conda activate ${name}; then - conda create -yn "${name}" -fi -conda activate ${name} - -if [ -n "${PYTHON_VERSION}" ]; then - conda install -y conda "python=${PYTHON_VERSION}" -else - conda install -y conda -fi - -conda install -y pip setuptools - -cat << EOF > activate_python.sh -#!/usr/bin/env bash -# THIS FILE IS GENERATED BY tools/setup_anaconda.sh -if [ -z "\${PS1:-}" ]; then - PS1=__dummy__ -fi -. 
$(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name} -EOF diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/sph2wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/sph2wav.sh deleted file mode 100644 index a8f0749e3be2ee69b5831da6699c303510ecbed4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/sph2wav.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -# convert sph scp to segmented wav scp -nj=1 -. tools/parse_options.sh || exit 1; - -inscp=$1 -segments=$2 -outscp=$3 -data=$(dirname ${inscp}) -if [ $# -eq 4 ]; then - logdir=$4 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe -[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; -sox=`which sox` -[ ! -x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2); - printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \ - sort > $data/wav_ori.scp || exit 1; - -tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp -sed -i 's/ /,/g' $data/wav_segments.scp -sed -i 's/#/ /g' $data/wav_segments.scp - -rm -f $logdir/wav_*.slice -rm -f $logdir/*.log -split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - mkdir -p ${data}/wavs/${name} - cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \ - -v logdir=$logdir -v name=$name '{ - during=$4-$3 - cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during; - system(cmd) - printf("%s %s/%s.wav\n", $1, data, $1); - }' | \ - sort > ${data}/wavs_${name}.scp || exit 1; -} & -done -wait -cat ${data}/wavs_*.scp > $outscp -rm ${data}/wavs_*.scp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/spk2utt_to_utt2spk.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/spk2utt_to_utt2spk.pl deleted file mode 100644 index 19fb89d501146e360912863d847d6eabb0194511..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/spm_decode b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/spm_decode deleted file mode 100644 index 882b4f966013d7708460f8d41696583ae59f8fa9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/spm_decode +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for decoding") - parser.add_argument("--input", default=None, help="input file to decode") - parser.add_argument("--input_format", choices=["piece", "id"], default="piece") - args = parser.parse_args() - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.input_format == "piece": - def decode(l): - return "".join(sp.DecodePieces(l)) - elif args.input_format == "id": - def decode(l): - return "".join(sp.DecodeIds(l)) - else: - raise NotImplementedError - - def tok2int(tok): - # remap reference-side (represented as <>) to 0 - return int(tok) if tok != "<>" else 0 - - if args.input is None: - h = sys.stdin - else: - h = open(args.input, "r", encoding="utf-8") - for line in h: - print(decode(line.split())) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/spm_encode b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/spm_encode deleted file mode 100644 index 4dd2e1004f9fe393c2d34b43bade881b84a31b1f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/spm_encode +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in -# https://github.com/pytorch/fairseq/blob/master/LICENSE - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import contextlib -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, - help="sentencepiece model to use for encoding") - parser.add_argument("--inputs", nargs="+", default=['-'], - help="input files to filter/encode") - parser.add_argument("--outputs", nargs="+", default=['-'], - help="path to save encoded outputs") - parser.add_argument("--output_format", choices=["piece", "id"], default="piece") - parser.add_argument("--min-len", type=int, metavar="N", - help="filter sentence pairs with fewer than N tokens") - parser.add_argument("--max-len", type=int, metavar="N", - help="filter sentence pairs with more than N tokens") - args = parser.parse_args() - - assert len(args.inputs) == len(args.outputs), \ - "number of input and output paths should match" - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.output_format == "piece": - def encode(l): - return sp.EncodeAsPieces(l) - elif args.output_format == "id": - def encode(l): - return list(map(str, sp.EncodeAsIds(l))) - else: - raise NotImplementedError - - if args.min_len is not None or args.max_len is not None: - def valid(line): - return ( - (args.min_len is None or len(line) >= args.min_len) and - (args.max_len is None or len(line) <= args.max_len) - ) - else: - def valid(lines): - return True - - with contextlib.ExitStack() as stack: - inputs = [ - stack.enter_context(open(input, "r", encoding="utf-8")) - if input != "-" else sys.stdin - for input in args.inputs - ] - outputs = [ - stack.enter_context(open(output, "w", encoding="utf-8")) - if output != "-" else sys.stdout - for output in args.outputs - ] - - stats = { - "num_empty": 0, - "num_filtered": 0, - } - - def encode_line(line): - line = line.strip() - if len(line) > 0: - line = encode(line) - if valid(line): - return line - else: - stats["num_filtered"] += 1 - else: - stats["num_empty"] += 1 - return None - - for i, lines in enumerate(zip(*inputs), start=1): - enc_lines = list(map(encode_line, lines)) - if not any(enc_line is None for enc_line in enc_lines): - for enc_line, output_h in zip(enc_lines, outputs): - print(" ".join(enc_line), file=output_h) - if i % 10000 == 0: - print("processed {} lines".format(i), file=sys.stderr) - - print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) - print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/spm_train b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/spm_train deleted file mode 100644 index 0b247aee0dc5fcaa7b6cf66d89602e896619c9bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/spm_train +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE -import sys - -import sentencepiece as spm - - -if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/subset_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/subset_data_dir.sh deleted file mode 100644 index c35bee62d8710facb8c42a9171ed3caf0171450f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/subset_data_dir.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2010-2011 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - - -# This script operates on a data directory, such as in data/train/. -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data -# for what these directories contain. - -# This script creates a subset of that data, consisting of some specified -# number of utterances. (The selected utterances are distributed evenly -# throughout the file, by the program ./subset_scp.pl). - -# There are six options, none compatible with any other. - -# If you give the --per-spk option, it will attempt to select the supplied -# number of utterances for each speaker (typically you would supply a much -# smaller number in this case). - -# If you give the --speakers option, it selects a subset of n randomly -# selected speakers. - -# If you give the --shortest option, it will give you the n shortest utterances. - -# If you give the --first option, it will just give you the n first utterances. - -# If you give the --last option, it will just give you the n last utterances. - -# If you give the --spk-list or --utt-list option, it reads the -# speakers/utterances to keep from /" (note, -# in this case there is no positional parameter; see usage message.) - - -shortest=false -perspk=false -speakers=false -first_opt= -spk_list= -utt_list= - -expect_args=3 -case $1 in - --first|--last) first_opt=$1; shift ;; - --per-spk) perspk=true; shift ;; - --shortest) shortest=true; shift ;; - --speakers) speakers=true; shift ;; - --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; - --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; - --*) echo "$0: invalid option '$1'"; exit 1 -esac - -if [ $# != $expect_args ]; then - echo "Usage:" - echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] " - echo " subset_data_dir.sh [--spk-list ] " - echo " subset_data_dir.sh [--utt-list ] " - echo "By default, randomly selects utterances from the data directory." - echo "With --speakers, randomly selects enough speakers that we have utterances" - echo "With --per-spk, selects utterances per speaker, if available." - echo "With --first, selects the first utterances" - echo "With --last, selects the last utterances" - echo "With --shortest, selects the shortest utterances." - echo "With --spk-list, reads the speakers to keep from " - echo "With --utt-list, reads the utterances to keep from " - exit 1; -fi - -srcdir=$1 -if [[ $spk_list || $utt_list ]]; then - numutt= - destdir=$2 -else - numutt=$2 - destdir=$3 -fi - -export LC_ALL=C - -if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" - exit 1 -fi - -if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then - echo "$0: cannot subset to more utterances than you originally had." 
- exit 1 -fi - -if $shortest && [ ! -f $srcdir/feats.scp ]; then - echo "$0: you selected --shortest but no feats.scp exist." - exit 1 -fi - -mkdir -p $destdir || exit 1 - -if [[ $spk_list ]]; then - tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; -elif [[ $utt_list ]]; then - tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; -elif $speakers; then - tools/shuffle_list.pl < $srcdir/spk2utt | - awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | - sort > $destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -elif $perspk; then - awk '{ n='$numutt'; printf("%s ",$1); - skip=1; while(n*(skip+1) <= NF-1) { skip++; } - for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); } - printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -else - if $shortest; then - # Select $numutt shortest utterances. - . ./path.sh - feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; - sort -n -k2 $destdir/tmp.len | - awk '{print $1}' | - head -$numutt >$destdir/tmp.uttlist - tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk - rm $destdir/tmp.uttlist $destdir/tmp.len - else - # Select $numutt random utterances. - tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; - fi - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt -fi - -# Perform filtering. utt2spk and spk2utt files already exist by this point. -# Filter by utterance. -[ -f $srcdir/feats.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp -[ -f $srcdir/vad.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp -[ -f $srcdir/utt2lang ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang -[ -f $srcdir/utt2dur ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur -[ -f $srcdir/utt2num_frames ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames -[ -f $srcdir/utt2uniq ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq -[ -f $srcdir/wav.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp -[ -f $srcdir/utt2warp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp -[ -f $srcdir/text ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - -# Filter by speaker. -[ -f $srcdir/spk2warp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp -[ -f $srcdir/spk2gender ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender -[ -f $srcdir/cmvn.scp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - -# Filter by recording-id. -if [ -f $srcdir/segments ]; then - tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - # Recording-ids are in segments. - awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco - # The next line overrides the command above for wav.scp, which would be incorrect. - #[ -f $srcdir/wav.scp ] && - # tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp -else - # No segments; recording-ids are in wav.scp. 
- awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco -fi - -[ -f $srcdir/reco2file_and_channel ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel -[ -f $srcdir/reco2dur ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur - -# Filter the STM file for proper sclite scoring. -# Copy over the comments from STM file. -[ -f $srcdir/stm ] && - (grep "^;;" $srcdir/stm - tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm - -rm $destdir/reco - -# Copy frame_shift if present. -[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir - -srcutts=$(wc -l <$srcdir/utt2spk) -destutts=$(wc -l <$destdir/utt2spk) -echo "$0: reducing #utt from $srcutts to $destutts" -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/subset_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/subset_scp.pl deleted file mode 100644 index 11fddc09a0f4e5fad8e5d63cf65e7e5e627e4af6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/subset_scp.pl +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This program selects a subset of N elements in the scp. - -# By default, it selects them evenly from throughout the scp, in order to avoid -# selecting too many from the same speaker. It prints them on the standard -# output. -# With the option --first, it just selects the N first utterances. -# With the option --last, it just selects the N last utterances. - -# Last modified by JHU & HKUST @2013 - - -$quiet = 0; -$first = 0; -$last = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--quiet") { - shift; - $quiet = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--first") { - shift; - $first = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--last") { - shift; - $last = 1; -} - -if(@ARGV < 2 ) { - die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . - " --quiet causes it to not die if N < num lines in scp.\n" . - " --first and --last make it equivalent to head or tail.\n" . 
- "See also: filter_scp.pl\n"; -} - -$N = shift @ARGV; -if($N == 0) { - die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; -} -$inscp = shift @ARGV; -open(I, "<$inscp") || die "Opening input scp file $inscp"; - -@F = (); -while() { - push @F, $_; -} -$numlines = @F; -if($N > $numlines) { - if ($quiet) { - $N = $numlines; - } else { - die "You requested from subset_scp.pl more elements than available: $N > $numlines"; - } -} - -sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if ($num_needed > $diff) { - die "select_n: code error"; - } - if ($diff == 1 ) { - if ($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); - } -} - -if ( ! $first && ! $last) { - if ($N > 0) { - select_n(0, $numlines, $N); - } -} else { - if ($first) { # --first option: same as head. - for ($n = 0; $n < $N; $n++) { - print $F[$n]; - } - } else { # --last option: same as tail. - for ($n = @F - $N; $n < @F; $n++) { - print $F[$n]; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/sym2int.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/sym2int.pl deleted file mode 100644 index cec097b6bdaefb5c3452e31fa334f0a7530b9a72..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/sym2int.pl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; - -for($x = 0; $x < 2; $x++) { - if ($ARGV[0] eq "--map-oov") { - shift @ARGV; - $map_oov = shift @ARGV; - if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { - # disallow '-f', the empty string and anything ending in words.txt as the - # OOV symbol because these are likely command-line errors. - die "the --map-oov option requires an argument"; - } - } - if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } - } -} - -$symtab = shift @ARGV; -if (!defined $symtab) { - print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . - "options: [--map-oov ] [-f ]\n" . 
- "note: can look like 4-5, or 4-, or 5-, or 1.\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up - if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } - $map_oov = $sym2int{$map_oov}; -} - -$num_warning = 0; -$max_warning = 20; - -while (<>) { - @A = split(" ", $_); - @B = (); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ( (!defined $field_begin || $n >= $field_begin) - && (!defined $field_end || $n <= $field_end)) { - $i = $sym2int{$a}; - if (!defined ($i)) { - if (defined $map_oov) { - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $map_oov; - } else { - $pos = $n+1; - die "sym2int.pl: undefined symbol $a (in position $pos)\n"; - } - } - $a = $i; - } - push @B, $a; - } - print join(" ", @B); - print "\n"; -} -if ($num_warning > 0) { - print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; -} - -exit(0); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/text2token.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/text2token.py deleted file mode 100644 index 4f4dcc901d436650695f0b80e0cf99e1e99269ee..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/text2token.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan) -# Copyright 2021 Mobvoi Inc. All Rights Reserved. (Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -import re -import sys - -is_python2 = sys.version_info[0] == 2 - - -def exist_or_not(i, match_pos): - start_pos = None - end_pos = None - for pos in match_pos: - if pos[0] <= i < pos[1]: - start_pos = pos[0] - end_pos = pos[1] - break - - return start_pos, end_pos - -def seg_char(sent): - pattern = re.compile(r'([\u4e00-\u9fa5])') - chars = pattern.split(sent) - chars = [w for w in chars if len(w.strip()) > 0] - return chars - -def get_parser(): - parser = argparse.ArgumentParser( - description='convert raw text to tokenized text', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--nchar', - '-n', - default=1, - type=int, - help='number of characters to split, i.e., \ - aabb -> a a b b with -n 1 and aa bb with -n 2') - parser.add_argument('--skip-ncols', - '-s', - default=0, - type=int, - help='skip first n columns') - parser.add_argument('--space', - default='', - type=str, - help='space symbol') - parser.add_argument('--bpe-model', - '-m', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--non-lang-syms', - '-l', - default=None, - type=str, - help='list of non-linguistic symobles,' - ' e.g., etc.') - parser.add_argument('text', - type=str, - default=False, - nargs='?', - help='input text') - parser.add_argument('--trans_type', - '-t', - type=str, - default="char", - choices=["char", "phn", "cn_char_en_bpe"], - help="""Transcript type. char/phn. 
e.g., for TIMIT - FADG0_SI1279 - - If trans_type is char, read from - SI1279.WRD file -> "bricks are an alternative" - Else if trans_type is phn, - read from SI1279.PHN file -> - "sil b r ih sil k s aa r er n aa l - sil t er n ih sil t ih v sil" """) - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - rs = [] - if args.non_lang_syms is not None: - with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f: - nls = [x.rstrip() for x in f.readlines()] - rs = [re.compile(re.escape(x)) for x in nls] - - if args.bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(args.bpe_model) - - if args.text: - f = codecs.open(args.text, encoding="utf-8") - else: - f = codecs.getreader("utf-8")( - sys.stdin if is_python2 else sys.stdin.buffer) - - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer) - line = f.readline() - n = args.nchar - while line: - x = line.split() - print(' '.join(x[:args.skip_ncols]), end=" ") - a = ' '.join(x[args.skip_ncols:]) - - # get all matched positions - match_pos = [] - for r in rs: - i = 0 - while i >= 0: - m = r.search(a, i) - if m: - match_pos.append([m.start(), m.end()]) - i = m.end() - else: - break - - if len(match_pos) > 0: - chars = [] - i = 0 - while i < len(a): - start_pos, end_pos = exist_or_not(i, match_pos) - if start_pos is not None: - chars.append(a[start_pos:end_pos]) - i = end_pos - else: - chars.append(a[i]) - i += 1 - a = chars - - if (args.trans_type == "phn"): - a = a.split(" ") - elif args.trans_type == "cn_char_en_bpe": - b = seg_char(a) - a = [] - for j in b: - # we use "▁" to instead of blanks among english words - # warning: here is "▁", not "_" - for l in j.strip().split("▁"): - if not l.encode('UTF-8').isalpha(): - a.append(l) - else: - for k in sp.encode_as_pieces(l): - a.append(k) - else: - a = [a[j:j + n] for j in range(0, len(a), n)] - - a_flat = [] - for z in a: - a_flat.append("".join(z)) - - a_chars = [z.replace(' ', args.space) for z in a_flat] - if (args.trans_type == "phn"): - a_chars = [z.replace("sil", args.space) for z in a_chars] - print(' '.join(a_chars)) - line = f.readline() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/utt2spk_to_spk2utt.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/utt2spk_to_spk2utt.pl deleted file mode 100644 index 5086699ff85fdcb8667bb9ab054700c53e35fd0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. 
- -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/validate_data_dir.sh deleted file mode 100644 index f4b4cbe1410111555d56380078e3d55381e7155a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/validate_data_dir.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/bin/bash - -cmd="$@" - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." - echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! -d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -if [ -f $data/images.scp ]; then - cmd=${cmd/--no-wav/} # remove --no-wav if supplied - image/validate_data_dir.sh $cmd - exit $? -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1; - ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6) - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! 
cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - tools/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." - exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! 
cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! 
cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/validate_dict_dir.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/validate_dict_dir.pl deleted file mode 100644 index 819fca7f03caff91f3f24f0b69876a0bfc0abbe9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/validate_dict_dir.pl +++ /dev/null @@ -1,531 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. 
-# Copyright 2012 Guoguo Chen -# 2015 Daniel Povey -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for 'dict' directories (e.g. data/local/dict) - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line($.) $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n"; - if ($has_invalid_whitespaces) { - print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n"; - return 0; - } else { - print "--> text contains only allowed whitespaces\n"; - } - } else { - print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n"; - } - return 1; -} - - -if(@ARGV != 1) { - die "Usage: validate_dict_dir.pl \n" . - "e.g.: validate_dict_dir.pl data/local/dict\n"; -} - -$dict = shift @ARGV; -$dict =~ s:/$::; - -$exit = 0; -$success = 1; # this is re-set each time we read a file. 
-
-sub set_to_fail { $exit = 1; $success = 0; }
-
-# Checking silence_phones.txt -------------------------------
-print "Checking $dict/silence_phones.txt ...\n";
-if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;}
-if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;}
-$idx = 1;
-%silence = ();
-$crlf = 1;
-
-print "--> reading $dict/silence_phones.txt\n";
-check_allowed_whitespace(\*S) || set_to_fail();
-while(<S>) {
-  if (! s/\n$//) {
-    print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n";
-    set_to_fail();
-  }
-  if ($crlf == 1 && m/\r/) {
-    print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n";
-    set_to_fail();
-    $crlf = 0;
-  }
-  my @col = split(" ", $_);
-  if (@col == 0) {
-    set_to_fail();
-    print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n";
-  }
-  foreach(0 .. @col-1) {
-    my $p = $col[$_];
-    if($silence{$p}) {
-      set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n";
-    } else {
-      $silence{$p} = 1;
-    }
-    # disambiguation symbols; phones ending in _B, _E, _S or _I will cause
-    # problems with word-position-dependent systems, and is obviously
-    # confusable with epsilon.
-    if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>"){
-      set_to_fail();
-      print "--> ERROR: phone \"$p\" has disallowed written form\n";
-    }
-  }
-  $idx ++;
-}
-close(S);
-$success == 0 || print "--> $dict/silence_phones.txt is OK\n";
-print "\n";
-
-# Checking optional_silence.txt -------------------------------
-print "Checking $dict/optional_silence.txt ...\n";
-if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;}
-if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;}
-$idx = 1;
-$success = 1;
-$crlf = 1;
-print "--> reading $dict/optional_silence.txt\n";
-check_allowed_whitespace(\*OS) or exit 1;
-while(<OS>) {
-  chomp;
-  my @col = split(" ", $_);
-  if ($idx > 1 or @col > 1) {
-    set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n";
-  } elsif (!$silence{$col[0]}) {
-    set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n";
-  }
-  if ($crlf == 1 && m/\r/) {
-    print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n";
-    set_to_fail();
-    $crlf = 0;
-  }
-  $idx ++;
-}
-close(OS);
-$success == 0 || print "--> $dict/optional_silence.txt is OK\n";
-print "\n";
-
-# Checking nonsilence_phones.txt -------------------------------
-print "Checking $dict/nonsilence_phones.txt ...\n";
-if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;}
-if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;}
-$idx = 1;
-%nonsilence = ();
-$success = 1;
-$crlf = 1;
-print "--> reading $dict/nonsilence_phones.txt\n";
-check_allowed_whitespace(\*NS) or set_to_fail();
-while(<NS>) {
-  if ($crlf == 1 && m/\r/) {
-    print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n";
-    set_to_fail();
-    $crlf = 0;
-  }
-  if (! s/\n$//) {
-    print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n";
-    set_to_fail();
-  }
-  my @col = split(" ", $_);
-  if (@col == 0) {
-    set_to_fail();
-    print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n";
-  }
-  foreach(0 .. @col-1) {
-    my $p = $col[$_];
-    if($nonsilence{$p}) {
-      set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n";
-    } else {
-      $nonsilence{$p} = 1;
-    }
-    # phones that start with the pound sign/hash may be mistaken for
-    # disambiguation symbols; phones ending in _B, _E, _S or _I will cause
-    # problems with word-position-dependent systems, and is obviously
-    # confusable with epsilon.
-    if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>"){
-      set_to_fail();
-      print "--> ERROR: phone \"$p\" has disallowed written form\n";
-    }
-  }
-  $idx ++;
-}
-close(NS);
-$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n";
-print "\n";
-
-# Checking disjoint -------------------------------
-sub intersect {
-  my ($a, $b) = @_;
-  @itset = ();
-  %itset = ();
-  foreach(keys %$a) {
-    if(exists $b->{$_} and !$itset{$_}) {
-      push(@itset, $_);
-      $itset{$_} = 1;
-    }
-  }
-  return @itset;
-}
-
-print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n";
-@itset = intersect(\%silence, \%nonsilence);
-if(@itset == 0) {print "--> disjoint property is OK.\n";}
-else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";}
-print "\n";
-
-
-sub check_lexicon {
-  my ($lex, $num_prob_cols, $num_skipped_cols) = @_;
-  print "Checking $lex\n";
-  !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail();
-  my %seen_line = {};
-  $idx = 1; $success = 1; $crlf = 1;
-  print "--> reading $lex\n";
-  check_allowed_whitespace(\*L) or set_to_fail();
-  while (<L>) {
-    if ($crlf == 1 && m/\r/) {
-      print "--> ERROR: $lex contains Carriage Return (^M) characters.\n";
-      set_to_fail();
-      $crlf = 0;
-    }
-    if (defined $seen_line{$_}) {
-      print "--> ERROR: line '$_' of $lex is repeated\n";
-      set_to_fail();
-    }
-    $seen_line{$_} = 1;
-    if (! s/\n$//) {
-      print "--> ERROR: last line '$_' of $lex does not end in newline.\n";
-      set_to_fail();
-    }
-    my @col = split(" ", $_);
-    $word = shift @col;
-    if (!defined $word) {
-      print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail();
-    }
-    if ($word eq "<s>" || $word eq "</s>" || $word eq "<eps>" || $word eq "#0") {
-      print "--> ERROR: lexicon.txt contains forbidden word $word\n";
-      set_to_fail();
-    }
-    for ($n = 0; $n < $num_prob_cols; $n++) {
-      $prob = shift @col;
-      if (!($prob > 0.0 && $prob <= 1.0)) {
-        print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n";
-        set_to_fail();
-      }
-    }
-    for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; }
-    if (@col == 0) {
-      print "--> ERROR: lexicon.txt contains word $word with empty ";
-      print "pronunciation.\n";
-      set_to_fail();
-    }
-    foreach (0 .. @col-1) {
-      if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
-        print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt ";
-        print "(line $idx)\n";
-        set_to_fail();
-      }
-    }
-    $idx ++;
-  }
-  close(L);
-  $success == 0 || print "--> $lex is OK\n";
-  print "\n";
-}
-
-if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); }
-if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); }
-if (-f "$dict/lexiconp_silprob.txt") {
-  # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also
-  # exist.
-  check_lexicon("$dict/lexiconp_silprob.txt", 2, 2);
-  if (-f "$dict/silprob.txt") {
-    !open(SP, "<$dict/silprob.txt") &&
-      print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail();
-    $crlf = 1;
-    while (<SP>) {
-      if ($crlf == 1 && m/\r/) {
-        print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n";
-        set_to_fail();
-        $crlf = 0;
-      }
-      chomp; my @col = split;
-      @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail();
-      if ($col[0] eq "<s>" || $col[0] eq "overall") {
-        if (!($col[1] > 0.0 && $col[1] <= 1.0)) {
-          set_to_fail();
-          print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n";
-        }
-      } elsif ($col[0] eq "</s>_s" || $col[0] eq "</s>_n") {
-        if ($col[1] <= 0.0) {
-          set_to_fail();
-          print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n";
-        }
-      } else {
-        print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n";
-        set_to_fail();
-      }
-    }
-    close(SP);
-  } else {
-    set_to_fail();
-    print "--> ERROR: expecting $dict/silprob.txt to exist\n";
-  }
-}
-
-if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) {
-  print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n";
-  set_to_fail();
-}
-
-sub check_lexicon_pair {
-  my ($lex1, $num_prob_cols1, $num_skipped_cols1,
-      $lex2, $num_prob_cols2, $num_skipped_cols2) = @_;
-  # We have checked individual lexicons already.
-  open(L1, "<$lex1"); open(L2, "<$lex2");
-  print "Checking lexicon pair $lex1 and $lex2\n";
-  my $line_num = 0;
-  while(<L1>) {
-    $line_num++;
-    @A = split;
-    $line_B = <L2>;
-    if (!defined $line_B) {
-      print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
-      set_to_fail(); last;
-    }
-    @B = split(" ", $line_B);
-    # Check if the word matches.
-    if ($A[0] ne $B[0]) {
-      print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n";
-      set_to_fail(); last;
-    }
-    shift @A; shift @B;
-    for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; }
-    for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; }
-    # Check if the pronunciation matches
-    if (join(" ", @A) ne join(" ", @B)) {
-      print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n";
-      set_to_fail(); last;
-    }
-  }
-  $line_B = <L2>;
-  if (defined $line_B && $exit == 0) {
-    print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
-    set_to_fail();
-  }
-  $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n";
-}
-
-# If more than one lexicon exist, we have to check if they correspond to each
-# other. It could be that the user overwrote one and we need to regenerate the
-# other, but we do not know which is which.
-if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") {
-  check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0);
-}
-if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") {
-  check_lexicon_pair("$dict/lexiconp.txt", 1, 0,
-                     "$dict/lexiconp_silprob.txt", 2, 2);
-}
-
-# Checking extra_questions.txt -------------------------------
-%distinguished = (); # Keep track of all phone-pairs including nonsilence that
-                     # are distinguished (split apart) by extra_questions.txt,
-                     # as $distinguished{$p1,$p2} = 1. This will be used to
-                     # make sure that we don't have pairs of phones on the same
-                     # line in nonsilence_phones.txt that can never be
-                     # distinguished from each other by questions. (If any two
(If any two - # phones appear on the same line in nonsilence_phones.txt, - # they share a tree root, and since the automatic - # question-building treats all phones that appear on the - # same line of nonsilence_phones.txt as being in the same - # group, we can never distinguish them without resorting to - # questions in extra_questions.txt. -print "Checking $dict/extra_questions.txt ...\n"; -if (-s "$dict/extra_questions.txt") { - if (!open(EX, "<$dict/extra_questions.txt")) { - set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n"; - } - $idx = 1; - $success = 1; - $crlf = 1; - print "--> reading $dict/extra_questions.txt\n"; - check_allowed_whitespace(\*EX) or set_to_fail(); - while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n"; - } - foreach (0 .. @col-1) { - if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n"; - } - $idx ++; - } - %col_hash = (); - foreach $p (@col) { $col_hash{$p} = 1; } - foreach $p1 (@col) { - # Update %distinguished hash. - foreach $p2 (keys %nonsilence) { - if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not - # in this question (and in nonsilence - # phones)... mark p1,p2 as being split apart - $distinguished{$p1,$p2} = 1; - $distinguished{$p2,$p1} = 1; - } - } - } - } - close(EX); - $success == 0 || print "--> $dict/extra_questions.txt is OK\n"; -} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";} - -if (-f "$dict/nonterminals.txt") { - open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt"; - my %nonterminals = (); - my $line_number = 1; - while () { - chop; - my @line = split(" ", $_); - if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) { - print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1; - } - $nonterminals{$line[0]} = 1; - $line_number++; - } - print "--> $dict/nonterminals.txt is OK\n"; -} - - -# check nonsilence_phones.txt again for phone-pairs that are never -# distnguishable. (note: this situation is normal and expected for silence -# phones, so we don't check it.) -if(!open(NS, "<$dict/nonsilence_phones.txt")) { - print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1; -} - -$num_warn_nosplit = 0; -$num_warn_nosplit_limit = 10; -while() { - my @col = split(" ", $_); - foreach $p1 (@col) { - foreach $p2 (@col) { - if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) { - set_to_fail(); - if ($num_warn_nosplit <= $num_warn_nosplit_limit) { - print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n"; - } - if ($num_warn_nosplit == $num_warn_nosplit_limit) { - print "... Not warning any more times about this issue.\n"; - } - if ($num_warn_nosplit == 0) { - print " (note: we started checking for this only recently. 
You can still build a system but\n"; - print " phones $p1 and $p2 will be acoustically indistinguishable).\n"; - } - $num_warn_nosplit++; - } - } - } -} - - -if ($exit == 1) { - print "--> ERROR validating dictionary directory $dict (see detailed error "; - print "messages above)\n\n"; - exit 1; -} else { - print "--> SUCCESS [validating dictionary directory $dict]\n\n"; -} - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/validate_text.pl b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/validate_text.pl deleted file mode 100644 index 7f75cf12f20f6e22948682e8e726e628a72dac69..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/validate_text.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env perl -# -#=============================================================================== -# Copyright 2017 Johns Hopkins University (author: Yenda Trmal ) -# Johns Hopkins University (author: Daniel Povey) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# validation script for data//text -# to be called (preferably) from utils/validate_data_dir.sh -use strict; -use warnings; -use utf8; -use Fcntl qw< SEEK_SET >; - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). 
-sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl \n" . 
- "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/wav2dur.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/wav2dur.py deleted file mode 100644 index 1bcc1b693458b66c0e341e5d6b375cc81e6db8b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/wav2dur.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -import torchaudio -torchaudio.set_audio_backend("sox_io") - -scp = sys.argv[1] -dur_scp = sys.argv[2] - -with open(scp, 'r') as f, open(dur_scp, 'w') as fout: - cnt = 0 - total_duration = 0 - for l in f: - items = l.strip().split() - wav_id = items[0] - fname = items[1] - cnt += 1 - waveform, rate = torchaudio.load(fname) - frames = len(waveform[0]) - duration = frames / float(rate) - total_duration += duration - fout.write('{} {}\n'.format(wav_id, duration)) - print('process {} utts'.format(cnt)) - print('total {} s'.format(total_duration)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/wav_to_duration.sh b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/wav_to_duration.sh deleted file mode 100644 index 51b055c633ac809b6b8d702925dc47875973403d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/wav_to_duration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# split the wav scp, calculate duration and merge -nj=4 -. tools/parse_options.sh || exit 1; - -inscp=$1 -outscp=$2 -data=$(dirname ${inscp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -rm -f $logdir/wav_*.slice -rm -f $logdir/wav_*.shape -split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log -} & -done -wait -cat $logdir/wav_*.shape > $outscp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/websocket/performance-ws.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/websocket/performance-ws.py deleted file mode 100644 index af77dea06bb41297b674b5b6dbfd0266bcff5d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/tools/websocket/performance-ws.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -# coding:utf-8 - -# Copyright (c) 2022 SDCI Co. Ltd (author: veelion) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import json -import time -import asyncio -import argparse -import websockets -import soundfile as sf -import statistics - - -WS_START = json.dumps({ - 'signal': 'start', - 'nbest': 1, - 'continuous_decoding': False, -}) -WS_END = json.dumps({ - 'signal': 'end' -}) - - -async def ws_rec(data, ws_uri): - begin = time.time() - conn = await websockets.connect(ws_uri, ping_timeout=200) - # step 1: send start - await conn.send(WS_START) - ret = await conn.recv() - # step 2: send audio data - await conn.send(data) - # step 3: send end - await conn.send(WS_END) - # step 4: receive result - texts = [] - while 1: - ret = await conn.recv() - ret = json.loads(ret) - if ret['type'] == 'final_result': - nbest = json.loads(ret['nbest']) - text = nbest[0]['sentence'] - texts.append(text) - elif ret['type'] == 'speech_end': - break - # step 5: close - try: - await conn.close() - except Exception as e: - # this except has no effect, just log as debug - # it seems the server does not send close info, maybe - print(e) - time_cost = time.time() - begin - return { - 'text': ''.join(texts), - 'time': time_cost, - } - - -def get_args(): - parser = argparse.ArgumentParser(description='') - parser.add_argument( - '-u', '--ws_uri', required=True, - help="websocket_server_main's uri, e.g. ws://127.0.0.1:10086") - parser.add_argument( - '-w', '--wav_scp', required=True, - help='path to wav_scp_file') - parser.add_argument( - '-t', '--trans', required=True, - help='path to trans_text_file of wavs') - parser.add_argument( - '-s', '--save_to', required=True, - help='path to save transcription') - parser.add_argument( - '-n', '--num_concurrence', type=int, required=True, - help='num of concurrence for query') - args = parser.parse_args() - return args - - -def print_result(info): - length = max([len(k) for k in info]) - for k, v in info.items(): - print(f'\t{k: >{length}} : {v}') - - -async def main(args): - wav_scp = [] - total_duration = 0 - with open(args.wav_scp) as f: - for line in f: - zz = line.strip().split() - assert len(zz) == 2 - data, sr = sf.read(zz[1], dtype='int16') - assert sr == 16000 - duration = (len(data)) / 16000 - total_duration += duration - wav_scp.append((zz[0], data.tobytes())) - print(f'{len(wav_scp) = }, {total_duration = }') - - tasks = [] - failed = 0 - texts = [] - request_times = [] - begin = time.time() - for i, (_uttid, data) in enumerate(wav_scp): - task = asyncio.create_task(ws_rec(data, args.ws_uri)) - tasks.append((_uttid, task)) - if len(tasks) < args.num_concurrence: - continue - print((f'{i=}, start {args.num_concurrence} ' - f'queries @ {time.strftime("%m-%d %H:%M:%S")}')) - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - tasks = [] - print(f'\tdone @ {time.strftime("%m-%d %H:%M:%S")}') - if tasks: - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - request_time = time.time() - begin - rtf = request_time / total_duration - print('For all concurrence:') - print_result({ - 'failed': failed, - 'total_duration': total_duration, - 'request_time': request_time, - 'RTF': rtf, - }) - print('For one request:') - print_result({ - 'mean': statistics.mean(request_times), - 'median': statistics.median(request_times), - 'max_time': max(request_times), - 'min_time': min(request_times), - }) - with open(args.save_to, 'w', encoding='utf8') as fsave: - fsave.write(''.join(texts)) - # caculate CER - cmd = (f'python 
../compute-wer.py --char=1 --v=1 ' - f'{args.trans} {args.save_to} > ' - f'{args.save_to}-test-{args.num_concurrence}.cer.txt') - print(cmd) - os.system(cmd) - print('done') - - -if __name__ == '__main__': - args = get_args() - asyncio.run(main(args)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/alignment.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/alignment.py deleted file mode 100644 index 071691183e5af227e60fe06e4f8d4bf0f33b7f71..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/alignment.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Di Wu) -# 2022 Tinnove Inc (authors: Wei Ren) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader -from textgrid import TextGrid, IntervalTier - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.ctc_util import forced_align -from wenet.utils.common import get_subsample -from wenet.utils.init_model import init_model - - -def generator_textgrid(maxtime, lines, output): - # Download Praat: https://www.fon.hum.uva.nl/praat/ - interval = maxtime / (len(lines) + 1) - margin = 0.0001 - - tg = TextGrid(maxTime=maxtime) - linetier = IntervalTier(name="line", maxTime=maxtime) - - i = 0 - for l in lines: - s, e, w = l.split() - linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w) - - tg.append(linetier) - print("successfully generator {}".format(output)) - tg.write(output) - - -def get_frames_timestamp(alignment): - # convert alignment to a praat format, which is a doing phonetics - # by computer and helps analyzing alignment - timestamp = [] - # get frames level duration for each token - start = 0 - end = 0 - while end < len(alignment): - while end < len(alignment) and alignment[end] == 0: - end += 1 - if end == len(alignment): - timestamp[-1] += alignment[start:] - break - end += 1 - while end < len(alignment) and alignment[end - 1] == alignment[end]: - end += 1 - timestamp.append(alignment[start:end]) - start = end - return timestamp - - -def get_labformat(timestamp, subsample): - begin = 0 - duration = 0 - labformat = [] - for idx, t in enumerate(timestamp): - # 25ms frame_length,10ms hop_length, 1/subsample - subsample = get_subsample(configs) - # time duration - duration = len(t) * 0.01 * subsample - if idx < len(timestamp) - 1: - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[t[-1]])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[t[-1]])) - else: - non_blank = 0 - for i in t: - if i != 0: - token = i - break - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[token])) - 
labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[token])) - begin = begin + duration - return labformat - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='use ctc to generate alignment') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--input_file', required=True, help='format data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--non_lang_syms', - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--result_file', - required=True, - help='alignment result file') - parser.add_argument('--batch_size', type=int, default=1, help='batch size') - parser.add_argument('--gen_praat', - action='store_true', - help='convert alignment to a praat format') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - - args = parser.parse_args() - print(args) - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.batch_size > 1: - logging.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - # Load dict - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 - - symbol_table = read_symbol_table(args.dict) - - # Init dataset and data loader - ali_conf = copy.deepcopy(configs['dataset_conf']) - - ali_conf['filter_conf']['max_length'] = 102400 - ali_conf['filter_conf']['min_length'] = 0 - ali_conf['filter_conf']['token_max_length'] = 102400 - ali_conf['filter_conf']['token_min_length'] = 0 - ali_conf['filter_conf']['max_output_input_ratio'] = 102400 - ali_conf['filter_conf']['min_output_input_ratio'] = 0 - ali_conf['speed_perturb'] = False - ali_conf['spec_aug'] = False - ali_conf['shuffle'] = False - ali_conf['sort'] = False - ali_conf['fbank_conf']['dither'] = 0.0 - ali_conf['batch_conf']['batch_type'] = "static" - ali_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - ali_dataset = Dataset(args.data_type, - args.input_file, - symbol_table, - ali_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w', - encoding='utf-8') as fout: - for batch_idx, batch in enumerate(ali_data_loader): - print("#" * 80) - key, feat, target, feats_length, target_length = batch - print(key) - - feat = feat.to(device) - target = target.to(device) - feats_length = feats_length.to(device) - target_length = target_length.to(device) - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - # print(ctc_probs.size(1)) - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = forced_align(ctc_probs, target) - print(alignment) - fout.write('{} {}\n'.format(key[0], alignment)) - - if args.gen_praat: - timestamp = get_frames_timestamp(alignment) - print(timestamp) - subsample = get_subsample(configs) - labformat = get_labformat(timestamp, subsample) - - lab_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".lab") - with open(lab_path, 'w', encoding='utf-8') as f: - f.writelines(labformat) - - textgrid_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".TextGrid") - generator_textgrid(maxtime=(len(alignment) + 1) * 0.01 * - subsample, - lines=labformat, - output=textgrid_path) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/average_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/average_model.py deleted file mode 100644 index 01efa64b4b458bc931a86a9a304b9f330ce4aaa2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/average_model.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import os -import argparse -import glob - -import yaml -import numpy as np -import torch - - -def get_args(): - parser = argparse.ArgumentParser(description='average model') - parser.add_argument('--dst_model', required=True, help='averaged model') - parser.add_argument('--src_path', - required=True, - help='src model path for average') - parser.add_argument('--val_best', - action="store_true", - help='averaged model') - parser.add_argument('--num', - default=5, - type=int, - help='nums for averaged model') - parser.add_argument('--min_epoch', - default=0, - type=int, - help='min epoch used for averaging model') - parser.add_argument('--max_epoch', - default=65536, - type=int, - help='max epoch used for averaging model') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - checkpoints = [] - val_scores = [] - if args.val_best: - yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) - for y in yamls: - with open(y, 'r') as f: - dic_yaml = yaml.load(f, Loader=yaml.FullLoader) - loss = dic_yaml['cv_loss'] - epoch = dic_yaml['epoch'] - if epoch >= args.min_epoch and epoch <= args.max_epoch: - val_scores += [[epoch, loss]] - val_scores = np.array(val_scores) - sort_idx = np.argsort(val_scores[:, -1]) - sorted_val_scores = val_scores[sort_idx][::1] - print("best val scores = " + str(sorted_val_scores[:args.num, 1])) - print("selected epochs = " + - str(sorted_val_scores[:args.num, 0].astype(np.int64))) - path_list = [ - args.src_path + '/{}.pt'.format(int(epoch)) - for epoch in sorted_val_scores[:args.num, 0] - ] - else: - path_list = glob.glob('{}/[0-9]*.pt'.format(args.src_path)) - path_list = sorted(path_list, key=os.path.getmtime) - path_list = path_list[-args.num:] - print(path_list) - avg = None - num = args.num - assert num == len(path_list) - for path in path_list: - print('Processing {}'.format(path)) - states = torch.load(path, map_location=torch.device('cpu')) - if avg is None: - avg = states - else: - for k in avg.keys(): - avg[k] += states[k] - # average - for k in avg.keys(): - if avg[k] is not None: - # pytorch 1.6 use true_divide instead of /= - avg[k] = torch.true_divide(avg[k], num) - print('Saving to {}'.format(args.dst_model)) - torch.save(avg, args.dst_model) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/export_jit.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/export_jit.py deleted file mode 100644 index b2e5864e8382235c1cc800484ba5031ae22f3bd9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/export_jit.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os - -import torch -import yaml - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_file', default=None, help='output file') - parser.add_argument('--output_quant_file', - default=None, - help='output quantized model file') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - # No need gpu for model export - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - model = init_model(configs) - print(model) - - load_checkpoint(model, args.checkpoint) - # Export jit torch script model - - if args.output_file: - script_model = torch.jit.script(model) - script_model.save(args.output_file) - print('Export model successfully, see {}'.format(args.output_file)) - - # Export quantized jit torch script model - if args.output_quant_file: - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - script_quant_model = torch.jit.script(quantized_model) - script_quant_model.save(args.output_quant_file) - print('Export quantized model successfully, ' - 'see {}'.format(args.output_quant_file)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/export_onnx_bpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/export_onnx_bpu.py deleted file mode 100644 index 6462a69506f10778d08faae5fcf3067ad43d38bd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/export_onnx_bpu.py +++ /dev/null @@ -1,1019 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. 
specific decoding method: ctc_greedy_search -""" - - -from __future__ import print_function - -import os -import sys -import copy -import math -import yaml -import logging -from typing import Tuple - -import torch -import numpy as np - -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import (get_args, to_numpy, - print_input_output_info) - - -try: - import onnx - import onnxruntime -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class BPULayerNorm(torch.nn.Module): - """Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" - def __init__(self, module, chunk_size=8, run_on_bpu=False): - super().__init__() - original = copy.deepcopy(module) - self.hidden = module.weight.size(0) - self.chunk_size = chunk_size - self.run_on_bpu = run_on_bpu - - if self.run_on_bpu: - self.weight = torch.nn.Parameter( - module.weight.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.bias = torch.nn.Parameter( - module.bias.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.negtive = torch.nn.Parameter( - torch.ones((1, self.hidden, 1, chunk_size)) * -1.0) - self.eps = torch.nn.Parameter( - torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps) - self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_1.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_2.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - else: - self.norm = module - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.hidden) - orig_out = module(random_data) - new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) - np.testing.assert_allclose( - to_numpy(orig_out), to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.run_on_bpu: - u = self.mean_conv_1(x) # (1, h, 1, c) - numerator = x + u * self.negtive # (1, h, 1, c) - s = torch.pow(numerator, 2) # (1, h, 1, c) - s = self.mean_conv_2(s) # (1, h, 1, c) - denominator = torch.sqrt(s + self.eps) # (1, h, 1, c) - x = torch.div(numerator, denominator) # (1, h, 1, c) - x = x * self.weight + self.bias - else: - x = x.squeeze(2).transpose(1, 2).contiguous() - x = self.norm(x) - x = x.transpose(1, 2).contiguous().unsqueeze(2) - return x - - -class BPUIdentity(torch.nn.Module): - """Refactor torch.nn.Identity(). - For inserting BPU node whose input == output. - """ - def __init__(self, channels): - super().__init__() - self.channels = channels - self.identity_conv = torch.nn.Conv2d( - channels, channels, 1, groups=channels, bias=False) - torch.nn.init.dirac_( - self.identity_conv.weight.data, groups=channels) - - self.check_equal() - - def check_equal(self): - random_data = torch.randn(1, self.channels, 1, 10) - result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(random_data), to_numpy(result), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Identity with 4-D dataflow, input == output. 
- Args: - x (torch.Tensor): (batch, in_channel, 1, time) - - Returns: - (torch.Tensor): (batch, in_channel, 1, time). - """ - return self.identity_conv(x) - - -class BPULinear(torch.nn.Module): - """Refactor torch.nn.Linear or pointwise_conv""" - def __init__(self, module, is_pointwise_conv=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.weight.size(1) - self.odim = module.weight.size(0) - self.is_pointwise_conv = is_pointwise_conv - - # Modify weight & bias - self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) - if is_pointwise_conv: - # (odim, idim, kernel=1) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(-1)) - else: - # (odim, idim) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(2).unsqueeze(3)) - self.linear.bias = module.bias - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.idim) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = module(random_data) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = original_result.transpose(1, 2) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Linear with 4-D dataflow. - Args: - x (torch.Tensor): (batch, in_channel, 1, time) - Returns: - (torch.Tensor): (batch, out_channel, 1, time). - """ - return self.linear(x) - - -class BPUGlobalCMVN(torch.nn.Module): - """Refactor wenet/transformer/cmvn.py::GlobalCMVN""" - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - self.norm_var = module.norm_var - - # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1) - self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """CMVN with 4-D dataflow. - Args: - x (torch.Tensor): (batch, 1, mel_dim, time) - Returns: - (torch.Tensor): normalized feature with same shape. - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x - - -class BPUConv2dSubsampling8(torch.nn.Module): - """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 - - NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.right_context = module.right_context - self.subsampling_rate = module.subsampling_rate - assert isinstance(module.pos_enc, NoPositionalEncoding) - - # 1. Modify self.conv - # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim) - # to (1, 1, mel_dim, frames) for more efficient computation. - self.conv = module.conv - for idx in [0, 2, 4]: - self.conv[idx].weight = torch.nn.Parameter( - module.conv[idx].weight.transpose(2, 3) - ) - - # 2. 
Modify self.linear - # NOTE(xcsong): Split final projection to meet the requirment of - # maximum kernel_size (7 for XJ3) - self.linear = torch.nn.ModuleList() - odim = module.linear.weight.size(0) # 512, in this case - freq = module.linear.weight.size(1) // odim # 4608 // 512 == 9 - self.odim, self.freq = odim, freq - weight = module.linear.weight.reshape( - odim, odim, freq, 1) # (odim, odim * freq) -> (odim, odim, freq, 1) - self.split_size = [] - num_split = (freq - 1) // 7 + 1 # XJ3 requires kernel_size <= 7 - slice_begin = 0 - for idx in range(num_split): - kernel_size = min(freq, (idx + 1) * 7) - idx * 7 - conv_ele = torch.nn.Conv2d( - odim, odim, (kernel_size, 1), (kernel_size, 1)) - conv_ele.weight = torch.nn.Parameter( - weight[:, :, slice_begin:slice_begin + kernel_size, :] - ) - conv_ele.bias = torch.nn.Parameter( - torch.zeros_like(conv_ele.bias) - ) - self.linear.append(conv_ele) - self.split_size.append(kernel_size) - slice_begin += kernel_size - self.linear[0].bias = torch.nn.Parameter(module.linear.bias) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 67, 80) - mask = torch.zeros(1, 1, 67) - original_result, _, _ = module(random_data, mask) # (1, 8, 512) - random_data = random_data.transpose(1, 2).unsqueeze(0) # (1, 1, 80, 67) - new_result = self.forward(random_data) # (1, 512, 1, 8) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Subsample x with 4-D dataflow. - Args: - x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), - where time' = time // 8. - """ - x = self.conv(x) # (1, odim, freq, time') - x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) - x = torch.split(x, self.split_size, dim=2) - for idx, (x_part, layer) in enumerate(zip(x, self.linear)): - x_out += layer(x_part) - return x_out - - -class BPUMultiHeadedAttention(torch.nn.Module): - """Refactor wenet/transformer/attention.py::MultiHeadedAttention - - NOTE(xcsong): Only support attention_class == MultiHeadedAttention, - we do not consider RelPositionMultiHeadedAttention currently. - """ - def __init__(self, module, chunk_size, left_chunks): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.d_k = module.d_k - self.h = module.h - n_feat = self.d_k * self.h - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.time = chunk_size * (left_chunks + 1) - self.activation = torch.nn.Softmax(dim=-1) - - # 1. Modify self.linear_x - self.linear_q = BPULinear(module.linear_q) - self.linear_k = BPULinear(module.linear_k) - self.linear_v = BPULinear(module.linear_v) - self.linear_out = BPULinear(module.linear_out) - # 2. 
denom - self.register_buffer( - "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k))) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) - mask = torch.ones((1, self.h, self.chunk_size, self.time), - dtype=torch.bool) - cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, - self.d_k * 2) - original_out, original_cache = module( - random_data, random_data, random_data, - mask[:, 0, :, :], torch.empty(0), cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.reshape(1, self.h, self.d_k * 2, - self.chunk_size * self.left_chunks) - new_out, new_cache = self.forward( - random_data, random_data, random_data, mask, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - - def forward( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - mask: torch.Tensor, cache: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. - - Args: - q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). - k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). - v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). - mask (torch.Tensor): Mask tensor, - (#batch, head, chunk_size, cache_t + chunk_size). - cache (torch.Tensor): Cache tensor - (1, head, d_k * 2, cache_t), - where `cache_t == chunk_size * left_chunks`. - - - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: Cache tensor - (1, head, d_k * 2, cache_t + chunk_size) - where `cache_t == chunk_size * left_chunks` - """ - # 1. Forward QKV - q = self.linear_q(q) # (1, d, 1, c) d == size, c == chunk_size - k = self.linear_k(k) # (1, d, 1, c) - v = self.linear_v(v) # (1, d, 1, c) - q = q.view(1, self.h, self.d_k, self.chunk_size) - k = k.view(1, self.h, self.d_k, self.chunk_size) - v = v.view(1, self.h, self.d_k, self.chunk_size) - q = q.transpose(2, 3) # (batch, head, time1, d_k) - k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) - k = torch.cat((k_cache, k), dim=3) - v = torch.cat((v_cache, v), dim=3) - new_cache = torch.cat((k, v), dim=2) - # 2. (Q^T)K - scores = torch.matmul(q, k) * self.denom # (#b, n_head, time1, time2) - # 3. Forward attention - mask = mask.eq(0) - scores = scores.masked_fill(mask, -float('inf')) - attn = self.activation(scores).masked_fill(mask, 0.0) - attn = attn.transpose(2, 3) - x = torch.matmul(v, attn) - x = x.view(1, self.d_k * self.h, 1, self.chunk_size) - x_out = self.linear_out(x) - return x_out, new_cache - - -class BPUConvolution(torch.nn.Module): - """Refactor wenet/transformer/convolution.py::ConvolutionModule - - NOTE(xcsong): Only suport use_layer_norm == False - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.lorder = module.lorder - self.use_layer_norm = False - self.activation = module.activation - channels = module.pointwise_conv1.weight.size(1) - self.channels = channels - kernel_size = module.depthwise_conv.weight.size(2) - assert module.use_layer_norm is False - - # 1. Modify self.pointwise_conv1 - self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) - - # 2. 
Modify self.depthwise_conv - self.depthwise_conv = torch.nn.Conv2d( - channels, channels, (1, kernel_size), - stride=1, groups=channels) - self.depthwise_conv.weight = torch.nn.Parameter( - module.depthwise_conv.weight.unsqueeze(-2)) - self.depthwise_conv.bias = torch.nn.Parameter( - module.depthwise_conv.bias) - - # 3. Modify self.norm, Only support batchnorm2d - self.norm = torch.nn.BatchNorm2d(channels) - self.norm.training = False - self.norm.num_features = module.norm.num_features - self.norm.eps = module.norm.eps - self.norm.momentum = module.norm.momentum - self.norm.weight = torch.nn.Parameter(module.norm.weight) - self.norm.bias = torch.nn.Parameter(module.norm.bias) - self.norm.running_mean = module.norm.running_mean - self.norm.running_var = module.norm.running_var - - # 4. Modify self.pointwise_conv2 - self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) - - # 5. Identity conv, for running `concat` on BPU - self.identity = BPUIdentity(channels) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.channels) - cache = torch.zeros((1, self.channels, self.lorder)) - original_out, original_cache = module(random_data, cache=cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.unsqueeze(2) - new_out, new_cache = self.forward(random_data, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, 1, cache_t). - Returns: - torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). - torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). - """ - # Concat cache - x = torch.cat((self.identity(cache), self.identity(x)), dim=3) - new_cache = x[:, :, :, -self.lorder:] - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, 1, dim) - x = torch.nn.functional.glu(x, dim=1) # (b, channel, 1, dim) - - # Depthwise Conv - x = self.depthwise_conv(x) - x = self.activation(self.norm(x)) - x = self.pointwise_conv2(x) - return x, new_cache - - -class BPUFFN(torch.nn.Module): - """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.activation = module.activation - - # 1. Modify self.w_x - self.w_1 = BPULinear(module.w_1) - self.w_2 = BPULinear(module.w_2) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.w_1.idim) - original_out = module(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_out = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward function. 
- - Args: - xs: input tensor (B, D, 1, L) - Returns: - output tensor, (B, D, 1, L) - """ - return self.w_2(self.activation(self.w_1(x))) - - -class BPUConformerEncoderLayer(torch.nn.Module): - """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.size = module.size - assert module.normalize_before is True - assert module.concat_after is False - - # 1. Modify submodules - self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) - self.self_attn = BPUMultiHeadedAttention( - module.self_attn, chunk_size, left_chunks) - self.conv_module = BPUConvolution(module.conv_module) - self.feed_forward = BPUFFN(module.feed_forward) - - # 2. Modify norms - self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) - self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, ln_run_on_bpu) - self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, - chunk_size, ln_run_on_bpu) - self.norm_conv = BPULayerNorm(module.norm_conv, - chunk_size, ln_run_on_bpu) - self.norm_final = BPULayerNorm(module.norm_final, - chunk_size, ln_run_on_bpu) - - # 3. 4-D ff_scale - self.register_buffer( - "ff_scale", torch.full((1, self.size, 1, 1), module.ff_scale)) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.self_attn.chunk_size - time2 = self.self_attn.time - h, d_k = self.self_attn.h, self.self_attn.d_k - random_x = torch.randn(1, time1, self.size) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) - original_x, _, original_att_cache, original_cnn_cache = module( - random_x, att_mask[:, 0, :, :], torch.empty(0), - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.transpose(1, 2).unsqueeze(2) - att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.unsqueeze(2) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_mask, att_cache, cnn_cache - ) - np.testing.assert_allclose( - to_numpy(original_att_cache), - to_numpy(new_att_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cnn_cache), - to_numpy(new_cnn_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, att_mask: torch.Tensor, - att_cache: torch.Tensor, cnn_cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, size, 1, chunk_size) - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, d_k * 2, cache_t1), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, 1, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: att_cache tensor, - (1, head, d_k * 2, cache_t1 + chunk_size). - torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2). - """ - # 1. ffn_macaron - residual = x - x = self.norm_ff_macron(x) - x = residual + self.ff_scale * self.feed_forward_macaron(x) - - # 2. 
attention - residual = x - x = self.norm_mha(x) - x_att, new_att_cache = self.self_attn( - x, x, x, att_mask, att_cache) - x = residual + x_att - - # 3. convolution - residual = x - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, cnn_cache) - x = residual + x - - # 4. ffn - residual = x - x = self.norm_ff(x) - x = residual + self.ff_scale * self.feed_forward(x) - - # 5. final post-norm - x = self.norm_final(x) - - return x, new_att_cache, new_cnn_cache - - -class BPUConformerEncoder(torch.nn.Module): - """Refactor wenet/transformer/encoder.py::ConformerEncoder - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - output_size = module.output_size() - self._output_size = module.output_size() - self.after_norm = module.after_norm - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.head = module.encoders[0].self_attn.h - self.layers = len(module.encoders) - - # 1. Modify submodules - self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) - self.embed = BPUConv2dSubsampling8(module.embed) - self.encoders = torch.nn.ModuleList() - for layer in module.encoders: - self.encoders.append(BPUConformerEncoderLayer( - layer, chunk_size, left_chunks, ln_run_on_bpu)) - - # 2. Auxiliary conv - self.identity_cnncache = BPUIdentity(output_size) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.encoders[0].self_attn.chunk_size - time2 = self.encoders[0].self_attn.time - layers = self.layers - h, d_k = self.head, self.encoders[0].self_attn.d_k - decoding_window = (self.chunk_size - 1) * \ - module.embed.subsampling_rate + \ - module.embed.right_context + 1 - lorder = self.encoders[0].conv_module.lorder - random_x = torch.randn(1, decoding_window, 80) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) - orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( - random_x, 0, time2 - time1, att_mask=att_mask[:, 0, :, :], - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.unsqueeze(0) - att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_cache, cnn_cache, att_mask - ) - caches = torch.split(new_att_cache, h, dim=1) - caches = [c.transpose(2, 3) for c in caches] - np.testing.assert_allclose( - to_numpy(orig_att_cache), - to_numpy(torch.cat(caches, dim=0)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_cnn_cache), - to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, xs: torch.Tensor, att_cache: torch.Tensor, - cnn_cache: torch.Tensor, att_mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (1, head * elayers, d_k * 2, cache_t1), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (1, hidden-dim, elayers, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, hidden-dim, 1, chunk_size). - torch.Tensor: new attention cache required for next chunk, with - same shape as the original att_cache. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - """ - # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time) - xs = xs.transpose(2, 3) - xs = self.global_cmvn(xs) - # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size) - xs = self.embed(xs) - - att_cache = torch.split(att_cache, self.head, dim=1) - cnn_cache = self.identity_cnncache(cnn_cache) - cnn_cache = torch.split(cnn_cache, 1, dim=2) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - xs, new_att_cache, new_cnn_cache = layer( - xs, att_mask, att_cache=att_cache[i], cnn_cache=cnn_cache[i]) - r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:]) - r_cnn_cache.append(new_cnn_cache) - r_att_cache = torch.cat(r_att_cache, dim=1) - r_cnn_cache = self.identity_cnncache( - torch.cat(r_cnn_cache, dim=2)) - - xs = xs.squeeze(2).transpose(1, 2).contiguous() - xs = self.after_norm(xs) - # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input. - xs = xs.transpose(1, 2).contiguous().unsqueeze(2) # (B, C, 1, T) - - return (xs, r_att_cache, r_cnn_cache) - - -class BPUCTC(torch.nn.Module): - """Refactor wenet/transformer/ctc.py::CTC - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.ctc_lo.weight.size(1) - num_class = module.ctc_lo.weight.size(0) - - # 1. Modify self.ctc_lo, Split final projection to meet the - # requirment of maximum in/out channels (2048 for XJ3) - self.ctc_lo = torch.nn.ModuleList() - self.split_size = [] - num_split = (num_class - 1) // 2048 + 1 - for idx in range(num_split): - out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 - conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) - self.ctc_lo.append(conv_ele) - self.split_size.append(out_channel) - orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) - orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) - for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): - w = w.unsqueeze(2).unsqueeze(3) - self.ctc_lo[i].weight = torch.nn.Parameter(w) - self.ctc_lo[i].bias = torch.nn.Parameter(b) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 100, self.idim) - original_result = module.ctc_lo(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """frame activations, without softmax. 
- - Args: - Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) - Returns: - torch.Tensor: (B, num_class, 1, chunk_size) - """ - out = [] - for i, layer in enumerate(self.ctc_lo): - out.append(layer(x)) - out = torch.cat(out, dim=1) - return out - - -def export_encoder(asr_model, args): - logger.info("Stage-1: export encoder") - decode_window, mel_dim = args.decoding_window, args.feature_size - encoder = BPUConformerEncoder( - asr_model.encoder, args.chunk_size, args.num_decoding_left_chunks, - args.ln_run_on_bpu) - encoder.eval() - encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx') - - logger.info("Stage-1.1: prepare inputs for encoder") - chunk = torch.randn((1, 1, decode_window, mel_dim)) - required_cache_size = encoder.chunk_size * encoder.left_chunks - kv_time = required_cache_size + encoder.chunk_size - hidden, layers = encoder._output_size, len(encoder.encoders) - head = encoder.encoders[0].self_attn.h - d_k = hidden // head - lorder = encoder.encoders[0].conv_module.lorder - att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) - att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros((1, hidden, layers, lorder)) - inputs = (chunk, att_cache, cnn_cache, att_mask) - logger.info("chunk.size(): {} att_cache.size(): {} " - "cnn_cache.size(): {} att_mask.size(): {}".format( - list(chunk.size()), list(att_cache.size()), - list(cnn_cache.size()), list(att_mask.size()))) - - logger.info("Stage-1.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask" - attributes['output_name'] = "output;r_att_cache;r_cnn_cache" - attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap" - attributes['norm_type'] = \ - "no_preprocess;no_preprocess;no_preprocess;no_preprocess" - attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_shape'] = \ - "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( - chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3), - att_cache.size(0), att_cache.size(1), att_cache.size(2), - att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1), - cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0), - att_mask.size(1), att_mask.size(2), att_mask.size(3) - ) - torch.onnx.export( # NOTE(xcsong): only support opset==11 - encoder, inputs, encoder_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=attributes['input_name'].split(';'), - output_names=attributes['output_name'].split(';'), - dynamic_axes=None, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for k in vars(args): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - logger.info('Export onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - logger.info("Stage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - for i in range(10): - logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" - ", att_mask: {}".format( - i, list(torch_chunk.size()), - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), - list(torch_att_mask.size()))) - torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_output = torch.cat(torch_output, dim=-1) - - onnx_output = [] - onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - logger.info("onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," - " att_mask: {}".format( - i, onnx_chunk.shape, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': onnx_att_mask, - } - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_output = np.concatenate(onnx_output, axis=-1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_encoder, pass!") - return encoder, ort_session - - -def export_ctc(asr_model, args): - logger.info("Stage-2: export ctc") - ctc = BPUCTC(asr_model.ctc).eval() - ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx') - - logger.info("Stage-2.1: prepare inputs for ctc") - hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) - - logger.info("Stage-2.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'], attributes['input_type'] = "hidden", "featuremap" - attributes['norm_type'] = "no_preprocess" - attributes['input_layout_train'] = "NCHW" - attributes['input_layout_rt'] = "NCHW" - attributes['input_shape'] = "{}x{}x{}x{}".format( - hidden.size(0), hidden.size(1), hidden.size(2), hidden.size(3), - ) - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=None, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for k in vars(args): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - logger.info('Export onnx_ctc, done! 
see {}'.format(ctc_outpath)) - - logger.info("Stage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_ctc, pass!") - return ctc, ort_session - - -def export_decoder(asr_model, args): - logger.info("Currently, Decoder is not supported.") - - -if __name__ == '__main__': - torch.manual_seed(777) - args = get_args() - args.ln_run_on_bpu = False - # NOTE(xcsong): XJ3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - args.feature_size = configs['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - export_encoder(model, args) - export_ctc(model, args) - export_decoder(model, args) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/export_onnx_cpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/export_onnx_cpu.py deleted file mode 100644 index a8009d2f606f753a5870eb754235d8d55e756b5d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/export_onnx_cpu.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright (c) 2022, Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os -import copy -import sys - -import torch -import yaml -import numpy as np - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - -try: - import onnx - import onnxruntime - from onnxruntime.quantization import quantize_dynamic, QuantType -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - args = parser.parse_args() - return args - - -def to_numpy(tensor): - if tensor.requires_grad: - return tensor.detach().cpu().numpy() - else: - return tensor.cpu().numpy() - - -def print_input_output_info(onnx_model, name, prefix="\t\t"): - input_names = [node.name for node in onnx_model.graph.input] - input_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.input] - output_names = [node.name for node in onnx_model.graph.output] - output_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.output] - print("{}{} inputs : {}".format(prefix, name, input_names)) - print("{}{} input shapes : {}".format(prefix, name, input_shapes)) - print("{}{} outputs: {}".format(prefix, name, output_names)) - print("{}{} output shapes : {}".format(prefix, name, output_shapes)) - - -def export_encoder(asr_model, args): - print("Stage-1: export encoder") - encoder = asr_model.encoder - encoder.forward = encoder.forward_chunk - encoder_outpath = os.path.join(args['output_dir'], 'encoder.onnx') - - print("\tStage-1.1: prepare inputs for encoder") - chunk = torch.randn( - (args['batch'], args['decoding_window'], args['feature_size'])) - offset = 0 - # NOTE(xcsong): The uncertainty of `next_cache_start` only appears - # in the first few chunks, this is caused by dynamic att_cache shape, i,e - # (0, 0, 0, 0) for 1st chunk and (elayers, head, ?, d_k*2) for subsequent - # chunks. One way to ease the ONNX export is to keep `next_cache_start` - # as a fixed value. To do this, for the **first** chunk, if - # left_chunks > 0, we feed real cache & real mask to the model, otherwise - # fake cache & fake mask. In this way, we get: - # 1. 16/-1 mode: next_cache_start == 0 for all chunks - # 2. 16/4 mode: next_cache_start == chunk_size for all chunks - # 3. 16/0 mode: next_cache_start == chunk_size for all chunks - # 4. -1/-1 mode: next_cache_start == 0 for all chunks - # NO MORE DYNAMIC CHANGES!! - # - # NOTE(Mddct): We retain the current design for the convenience of supporting some - # inference frameworks without dynamic shapes. 
If you're interested in all-in-one - # model that supports different chunks please see: - # https://github.com/wenet-e2e/wenet/pull/1174 - - if args['left_chunks'] > 0: # 16/4 - required_cache_size = args['chunk_size'] * args['left_chunks'] - offset = required_cache_size - # Real cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], required_cache_size, - args['output_size'] // args['head'] * 2)) - # Real mask - att_mask = torch.ones( - (args['batch'], 1, required_cache_size + args['chunk_size']), - dtype=torch.bool) - att_mask[:, :, :required_cache_size] = 0 - elif args['left_chunks'] <= 0: # 16/-1, -1/-1, 16/0 - required_cache_size = -1 if args['left_chunks'] < 0 else 0 - # Fake cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], 0, - args['output_size'] // args['head'] * 2)) - # Fake mask - att_mask = torch.ones((0, 0, 0), dtype=torch.bool) - cnn_cache = torch.zeros( - (args['num_blocks'], args['batch'], - args['output_size'], args['cnn_module_kernel'] - 1)) - inputs = (chunk, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - print("\t\tchunk.size(): {}\n".format(chunk.size()), - "\t\toffset: {}\n".format(offset), - "\t\trequired_cache: {}\n".format(required_cache_size), - "\t\tatt_cache.size(): {}\n".format(att_cache.size()), - "\t\tcnn_cache.size(): {}\n".format(cnn_cache.size()), - "\t\tatt_mask.size(): {}\n".format(att_mask.size())) - - print("\tStage-1.2: torch.onnx.export") - dynamic_axes = { - 'chunk': {1: 'T'}, - 'att_cache': {2: 'T_CACHE'}, - 'att_mask': {2: 'T_ADD_T_CACHE'}, - 'output': {1: 'T'}, - 'r_att_cache': {2: 'T_CACHE'}, - } - # NOTE(xcsong): We keep dynamic axes even if in 16/4 mode, this is - # to avoid padding the last chunk (which usually contains less - # frames than required). For users who want static axes, just pop - # out specific axis. - # if args['chunk_size'] > 0: # 16/4, 16/-1, 16/0 - # dynamic_axes.pop('chunk') - # dynamic_axes.pop('output') - # if args['left_chunks'] >= 0: # 16/4, 16/0 - # # NOTE(xsong): since we feed real cache & real mask into the - # # model when left_chunks > 0, the shape of cache will never - # # be changed. - # dynamic_axes.pop('att_cache') - # dynamic_axes.pop('r_att_cache') - torch.onnx.export( - encoder, inputs, encoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=[ - 'chunk', 'offset', 'required_cache_size', - 'att_cache', 'cnn_cache', 'att_mask' - ], - output_names=['output', 'r_att_cache', 'r_cnn_cache'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for (k, v) in args.items(): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - # NOTE(xcsong): to add those metadatas we need to reopen - # the file and resave it. - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - # Dynamic quantization - model_fp32 = encoder_outpath - model_quant = os.path.join(args['output_dir'], 'encoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - print("\tStage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk = copy.deepcopy(chunk) - torch_offset = copy.deepcopy(offset) - torch_required_cache_size = copy.deepcopy(required_cache_size) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - torch_att_mask = copy.deepcopy(att_mask) - for i in range(10): - print("\t\ttorch chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, list(torch_chunk.size()), torch_offset, - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), list(torch_att_mask.size()))) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - torch_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_offset, torch_required_cache_size, - torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_offset += out.size(1) - torch_output = torch.cat(torch_output, dim=1) - - onnx_output = [] - onnx_chunk = to_numpy(chunk) - onnx_offset = np.array((offset)).astype(np.int64) - onnx_required_cache_size = np.array((required_cache_size)).astype(np.int64) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - onnx_att_mask = to_numpy(att_mask) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - print("\t\tonnx chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, onnx_chunk.shape, onnx_offset, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - onnx_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'offset': onnx_offset, - 'required_cache_size': onnx_required_cache_size, - 'att_cache': onnx_att_cache, 'cnn_cache': onnx_cnn_cache, - 'att_mask': onnx_att_mask - } - # NOTE(xcsong): If we use 16/-1, -1/-1 or 16/0 mode, `next_cache_start` - # will be hardcoded to 0 or chunk_size by ONNX, thus - # required_cache_size and att_mask are no more needed and they will - # be removed by ONNX automatically. 
- for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_offset += ort_outs[0].shape[1] - onnx_output = np.concatenate(onnx_output, axis=1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-05) - meta = ort_session.get_modelmeta() - print("\t\tcustom_metadata_map={}".format(meta.custom_metadata_map)) - print("\t\tCheck onnx_encoder, pass!") - - -def export_ctc(asr_model, args): - print("Stage-2: export ctc") - ctc = asr_model.ctc - ctc.forward = ctc.log_softmax - ctc_outpath = os.path.join(args['output_dir'], 'ctc.onnx') - - print("\tStage-2.1: prepare inputs for ctc") - hidden = torch.randn( - (args['batch'], args['chunk_size'] if args['chunk_size'] > 0 else 16, - args['output_size'])) - - print("\tStage-2.2: torch.onnx.export") - dynamic_axes = {'hidden': {1: 'T'}, 'probs': {1: 'T'}} - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for (k, v) in args.items(): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - # Dynamic quantization - model_fp32 = ctc_outpath - model_quant = os.path.join(args['output_dir'], 'ctc.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_ctc, done! see {}'.format(ctc_outpath)) - - print("\tStage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_ctc, pass!") - - -def export_decoder(asr_model, args): - print("Stage-3: export decoder") - decoder = asr_model - # NOTE(lzhin): parameters of encoder will be automatically removed - # since they are not used during rescoring. - decoder.forward = decoder.forward_attention_decoder - decoder_outpath = os.path.join(args['output_dir'], 'decoder.onnx') - - print("\tStage-3.1: prepare inputs for decoder") - # hardcode time->200 nbest->10 len->20, they are dynamic axes. 
- encoder_out = torch.randn((1, 200, args['output_size'])) - hyps = torch.randint(low=0, high=args['vocab_size'], - size=[10, 20]) - hyps[:, 0] = args['vocab_size'] - 1 # - hyps_lens = torch.randint(low=15, high=21, size=[10]) - - print("\tStage-3.2: torch.onnx.export") - dynamic_axes = { - 'hyps': {0: 'NBEST', 1: 'L'}, 'hyps_lens': {0: 'NBEST'}, - 'encoder_out': {1: 'T'}, - 'score': {0: 'NBEST', 1: 'L'}, 'r_score': {0: 'NBEST', 1: 'L'} - } - inputs = (hyps, hyps_lens, encoder_out, args['reverse_weight']) - torch.onnx.export( - decoder, inputs, decoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hyps', 'hyps_lens', 'encoder_out', 'reverse_weight'], - output_names=['score', 'r_score'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_decoder = onnx.load(decoder_outpath) - for (k, v) in args.items(): - meta = onnx_decoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_decoder) - onnx.helper.printable_graph(onnx_decoder.graph) - onnx.save(onnx_decoder, decoder_outpath) - print_input_output_info(onnx_decoder, "onnx_decoder") - model_fp32 = decoder_outpath - model_quant = os.path.join(args['output_dir'], 'decoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_decoder, done! see {}'.format( - decoder_outpath)) - - print("\tStage-3.3: check onnx_decoder and torch_decoder") - torch_score, torch_r_score = decoder( - hyps, hyps_lens, encoder_out, args['reverse_weight']) - ort_session = onnxruntime.InferenceSession(decoder_outpath) - input_names = [node.name for node in onnx_decoder.graph.input] - ort_inputs = { - 'hyps': to_numpy(hyps), - 'hyps_lens': to_numpy(hyps_lens), - 'encoder_out': to_numpy(encoder_out), - 'reverse_weight': np.array((args['reverse_weight'])), - } - for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - onnx_output = ort_session.run(None, ort_inputs) - - np.testing.assert_allclose(to_numpy(torch_score), onnx_output[0], - rtol=1e-03, atol=1e-05) - if args['is_bidirectional_decoder'] and args['reverse_weight'] > 0.0: - np.testing.assert_allclose(to_numpy(torch_r_score), onnx_output[1], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_decoder, pass!") - - -def main(): - torch.manual_seed(777) - args = get_args() - output_dir = args.output_dir - os.system("mkdir -p " + output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - arguments = {} - arguments['output_dir'] = output_dir - arguments['batch'] = 1 - arguments['chunk_size'] = args.chunk_size - arguments['left_chunks'] = args.num_decoding_left_chunks - arguments['reverse_weight'] = args.reverse_weight - arguments['output_size'] = configs['encoder_conf']['output_size'] - arguments['num_blocks'] = configs['encoder_conf']['num_blocks'] - arguments['cnn_module_kernel'] = configs['encoder_conf'].get('cnn_module_kernel', 1) - arguments['head'] = configs['encoder_conf']['attention_heads'] - arguments['feature_size'] = configs['input_dim'] - arguments['vocab_size'] = configs['output_dim'] - # NOTE(xcsong): if chunk_size == -1, hardcode to 67 - arguments['decoding_window'] = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 if args.chunk_size > 0 else 67 - arguments['encoder'] = configs['encoder'] - 
arguments['decoder'] = configs['decoder'] - arguments['subsampling_rate'] = model.subsampling_rate() - arguments['right_context'] = model.right_context() - arguments['sos_symbol'] = model.sos_symbol() - arguments['eos_symbol'] = model.eos_symbol() - arguments['is_bidirectional_decoder'] = 1 \ - if model.is_bidirectional_decoder() else 0 - - # NOTE(xcsong): Please note that -1/-1 means non-streaming model! It is - # not a [16/4 16/-1 16/0] all-in-one model and it should not be used in - # streaming mode (i.e., setting chunk_size=16 in `decoder_main`). If you - # want to use 16/-1 or any other streaming mode in `decoder_main`, - # please export onnx in the same config. - if arguments['left_chunks'] > 0: - assert arguments['chunk_size'] > 0 # -1/4 not supported - - export_encoder(model, arguments) - export_ctc(model, arguments) - export_decoder(model, arguments) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/export_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/export_onnx_gpu.py deleted file mode 100644 index 19f810c2804efdf74ff369f780fa3102e2e389fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/export_onnx_gpu.py +++ /dev/null @@ -1,1056 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import os -import sys - -import torch -import yaml -import logging - -import torch.nn.functional as F -from wenet.utils.checkpoint import load_checkpoint -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import BaseEncoder -from wenet.utils.init_model import init_model -from wenet.utils.mask import make_pad_mask - -try: - import onnxruntime -except ImportError: - print('Please install onnxruntime-gpu!') - sys.exit(1) - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class Encoder(torch.nn.Module): - def __init__(self, - encoder: BaseEncoder, - ctc: CTC, - beam_size: int = 10): - super().__init__() - self.encoder = encoder - self.ctc = ctc - self.beam_size = beam_size - - def forward(self, speech: torch.Tensor, - speech_lengths: torch.Tensor,): - """Encoder - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - Returns: - encoder_out: B x T x F - encoder_out_lens: B - ctc_log_probs: B x T x V - beam_log_probs: B x T x beam_size - beam_log_probs_idx: B x T x beam_size - """ - encoder_out, encoder_mask = self.encoder(speech, - speech_lengths, - -1, -1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_log_probs = self.ctc.log_softmax(encoder_out) - encoder_out_lens = encoder_out_lens.int() - beam_log_probs, beam_log_probs_idx = torch.topk( - ctc_log_probs, self.beam_size, dim=2) - return encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx - - -class StreamingEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size, transformer=False): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.transformer = transformer - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoder.encoders): - xs, _, new_att_cache, new_cnn_cache = layer( - xs, masks, pos_emb, - att_cache=att_cache[i], - cnn_cache=cnn_cache[i]) - # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (B, hidden-dim, cache_t2) - r_att_cache.append( - new_att_cache[:, :, next_cache_start:, :].unsqueeze(1)) - if not self.transformer: - r_cnn_cache.append(new_cnn_cache.unsqueeze(1)) - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - if not self.transformer: - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingSqueezeformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.reduce_idx = model.encoder.reduce_idx - self.recover_idx = model.encoder.recover_idx - if self.reduce_idx is None: - self.time_reduce = None - else: - if self.recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - 
recover_exp)) - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - elayers, cache_size = att_cache.size(0), att_cache.size(3) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = att_mask[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.encoder.preln(xs) - for i, layer in enumerate(self.encoder.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append( - (xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.encoder.time_reduction_layer( - xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - if self.encoder.pos_enc_layer_type == "rel_pos_repaired": - pos_emb = 
pos_emb[:, :xs.size(1) * 2 - 1, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.encoder.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(1) - cached_att = cached_att.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1)) - r_cnn_cache.append(cached_cnn) - - chunk_out = xs - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingEfficientConformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - - # Efficient Conformer - self.stride_layer_idx = model.encoder.stride_layer_idx - self.stride = model.encoder.stride - self.num_blocks = model.encoder.num_blocks - self.cnn_module_kernel = model.encoder.cnn_module_kernel - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - chunk_lens (torch.Tensor): - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * 
num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) # (b, ) - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - # Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2) - # Shape(cnn_cache): (elayers, b, outsize, cnn_kernel) - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = chunk_mask.to(torch.bool) - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoder.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - # We propose to double the chunk_size. 
- att_cache_trunc = xs.size(1) + \ - att_cache.size(3) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i][:, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [batch, 1, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID - - # use repeat_interleave to new_att_cache - # new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - new_att_cache = new_att_cache.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :].unsqueeze(1)) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - # shape of r_att_cache: (b, elayers, head, time2, outdim) - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - # shape of r_cnn_cache: (b, elayers, outdim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - chunk_out_lens += 1 - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class Decoder(torch.nn.Module): - def __init__(self, - decoder: TransformerDecoder, - ctc_weight: float = 0.5, - reverse_weight: float = 0.0, - beam_size: int = 10, - decoder_fastertransformer: bool = False): - super().__init__() - self.decoder = decoder - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - self.beam_size = beam_size - self.decoder_fastertransformer = decoder_fastertransformer - - def forward(self, - encoder_out: torch.Tensor, - encoder_lens: torch.Tensor, - hyps_pad_sos_eos: torch.Tensor, - hyps_lens_sos: torch.Tensor, - r_hyps_pad_sos_eos: torch.Tensor, - ctc_score: torch.Tensor): - """Encoder - Args: - encoder_out: B x T x F - encoder_lens: B - hyps_pad_sos_eos: B x beam x (T2+1), - hyps with sos & eos and padded by ignore id - hyps_lens_sos: B x beam, length for each hyp with sos - r_hyps_pad_sos_eos: B 
x beam x (T2+1), - reversed hyps with sos & eos and padded by ignore id - ctc_score: B x beam, ctc score for each hyp - Returns: - decoder_out: B x beam x T2 x V - r_decoder_out: B x beam x T2 x V - best_index: B - """ - B, T, F = encoder_out.shape - bz = self.beam_size - B2 = B * bz - encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F) - encoder_mask = ~make_pad_mask(encoder_lens, T).unsqueeze(1) - encoder_mask = encoder_mask.repeat(1, bz, 1).view(B2, 1, T) - T2 = hyps_pad_sos_eos.shape[2] - 1 - hyps_pad = hyps_pad_sos_eos.view(B2, T2 + 1) - hyps_lens = hyps_lens_sos.view(B2,) - hyps_pad_sos = hyps_pad[:, :-1].contiguous() - hyps_pad_eos = hyps_pad[:, 1:].contiguous() - - r_hyps_pad = r_hyps_pad_sos_eos.view(B2, T2 + 1) - r_hyps_pad_sos = r_hyps_pad[:, :-1].contiguous() - r_hyps_pad_eos = r_hyps_pad[:, 1:].contiguous() - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad_sos, hyps_lens, r_hyps_pad_sos, - self.reverse_weight) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - V = decoder_out.shape[-1] - decoder_out = decoder_out.view(B2, T2, V) - mask = ~make_pad_mask(hyps_lens, T2) # B2 x T2 - # mask index, remove ignore id - index = torch.unsqueeze(hyps_pad_eos * mask, 2) - score = decoder_out.gather(2, index).squeeze(2) # B2 X T2 - # mask padded part - score = score * mask - decoder_out = decoder_out.view(B, bz, T2, V) - if self.reverse_weight > 0: - r_decoder_out = torch.nn.functional.log_softmax( - r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.view(B2, T2, V) - index = torch.unsqueeze(r_hyps_pad_eos * mask, 2) - r_score = r_decoder_out.gather(2, index).squeeze(2) - r_score = r_score * mask - score = score * (1 - self.reverse_weight) + \ - self.reverse_weight * r_score - r_decoder_out = r_decoder_out.view(B, bz, T2, V) - score = torch.sum(score, axis=1) # B2 - score = torch.reshape(score, (B, bz)) + self.ctc_weight * ctc_score - best_index = torch.argmax(score, dim=1) - if self.decoder_fastertransformer: - return decoder_out, best_index - else: - return best_index - - -def to_numpy(tensors): - out = [] - if type(tensors) == torch.tensor: - tensors = [tensors] - for tensor in tensors: - if tensor.requires_grad: - tensor = tensor.detach().cpu().numpy() - else: - tensor = tensor.cpu().numpy() - out.append(tensor) - return out - - -def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True): - for a, b in zip(xlist, blist): - try: - torch.testing.assert_allclose(a, b, rtol=rtol, atol=atol) - except AssertionError as error: - if tolerate_small_mismatch: - print(error) - else: - raise - - -def export_offline_encoder(model, configs, args, logger, encoder_onnx_path): - bz = 32 - seq_len = 100 - beam_size = args.beam_size - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint( - low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - dynamic_axes={ - 'speech': {0: 'B', 1: 'T'}, - 'speech_lengths': {0: 'B'}, - 'encoder_out': {0: 'B', 1: 'T_OUT'}, - 'encoder_out_lens': {0: 'B'}, - 'ctc_log_probs': {0: 'B', 1: 'T_OUT'}, - 'beam_log_probs': {0: 'B', 1: 
'T_OUT'}, - 'beam_log_probs_idx': {0: 'B', 1: 'T_OUT'}, - }, - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - -def export_offline_encoder_static(model, configs, args, logger, encoder_onnx_path): - bz = args.batch_size - seq_len = args.seq_len - beam_size = args.beam_size - - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint(low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - import os - file_name, file_ext = os.path.splitext(encoder_onnx_path) - encoder_onnx_path = file_name + "_bs" + str(bz) + "_seq" + str(seq_len) + "_static.onnx" - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CPUExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - - -def export_online_encoder(model, configs, args, logger, encoder_onnx_path): - decoding_chunk_size = args.decoding_chunk_size - subsampling = model.encoder.embed.subsampling_rate - context = model.encoder.embed.right_context + 1 - decoding_window = (decoding_chunk_size - 1) * subsampling + context - batch_size = 32 - audio_len = decoding_window - feature_size = configs["input_dim"] - output_size = configs["encoder_conf"]["output_size"] - num_layers = configs["encoder_conf"]["num_blocks"] - # in transformer the cnn module will not be available - transformer = False - cnn_module_kernel = configs["encoder_conf"].get("cnn_module_kernel", 1) - 1 - if not cnn_module_kernel: - transformer = True - num_decoding_left_chunks = args.num_decoding_left_chunks - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if configs['encoder'] == 'squeezeformer': - encoder = StreamingSqueezeformerEncoder( - model, required_cache_size, args.beam_size) - elif configs['encoder'] == 'efficientConformer': - encoder = StreamingEfficientConformerEncoder( - model, required_cache_size, args.beam_size) - else: - encoder = StreamingEncoder( - model, required_cache_size, args.beam_size, transformer) - encoder.eval() - - # begin to export encoder - chunk_xs = 
torch.randn(batch_size, audio_len, - feature_size, dtype=torch.float32) - chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len - - offset = torch.arange(0, batch_size).unsqueeze(1) - # (elayers, b, head, cache_t1, d_k * 2) - head = configs["encoder_conf"]["attention_heads"] - d_k = configs["encoder_conf"]["output_size"] // head - att_cache = torch.randn(batch_size, num_layers, head, - required_cache_size, d_k * 2, - dtype=torch.float32) - cnn_cache = torch.randn(batch_size, num_layers, output_size, - cnn_module_kernel, dtype=torch.float32) - - cache_mask = torch.ones( - batch_size, 1, required_cache_size, dtype=torch.float32) - input_names = ['chunk_xs', 'chunk_lens', 'offset', - 'att_cache', 'cnn_cache', 'cache_mask'] - output_names = ['log_probs', 'log_probs_idx', 'chunk_out', - 'chunk_out_lens', 'r_offset', 'r_att_cache', - 'r_cnn_cache', 'r_cache_mask'] - input_tensors = (chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - output_names.pop(6) - - all_names = input_names + output_names - dynamic_axes = {} - for name in all_names: - # only the first dimension is dynamic - # all other dimension is fixed - dynamic_axes[name] = {0: 'B'} - - torch.onnx.export(encoder, - input_tensors, - encoder_onnx_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - verbose=False) - - with torch.no_grad(): - torch_outs = encoder(chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - torch_outs = list(torch_outs).pop(6) - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=["CUDAExecutionProvider"]) - ort_inputs = {} - - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - if transformer: - del ort_inputs['cnn_cache'] - ort_outs = ort_session.run(None, ort_inputs) - test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx streaming encoder succeed!") - onnx_config = { - "subsampling_rate": subsampling, - "context": context, - "decoding_chunk_size": decoding_chunk_size, - "num_decoding_left_chunks": num_decoding_left_chunks, - "beam_size": args.beam_size, - "fp16": args.fp16, - "feat_size": feature_size, - "decoding_window": decoding_window, - "cnn_module_kernel_cache": cnn_module_kernel - } - return onnx_config - - -def export_rescoring_decoder(model, configs, args, - logger, decoder_onnx_path, decoder_fastertransformer): - bz, seq_len = 32, 100 - beam_size = args.beam_size - decoder = Decoder(model.decoder, - model.ctc_weight, - model.reverse_weight, - beam_size, - decoder_fastertransformer) - decoder.eval() - - hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - hyps_lens_sos = torch.randint(low=3, high=seq_len, size=(bz, beam_size), - dtype=torch.int32) - r_hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - - output_size = configs["encoder_conf"]["output_size"] - encoder_out = torch.randn(bz, seq_len, output_size, dtype=torch.float32) - encoder_out_lens = torch.randint( - low=3, high=seq_len, size=(bz,), dtype=torch.int32) - ctc_score = torch.randn(bz, beam_size, dtype=torch.float32) - - input_names = ['encoder_out', 'encoder_out_lens', - 'hyps_pad_sos_eos', 'hyps_lens_sos', - 'r_hyps_pad_sos_eos', 'ctc_score'] - output_names = ['best_index'] - if decoder_fastertransformer: - output_names.insert(0, 'decoder_out') - - 
torch.onnx.export(decoder, - (encoder_out, encoder_out_lens, - hyps_pad_sos_eos, hyps_lens_sos, - r_hyps_pad_sos_eos, ctc_score), - decoder_onnx_path, - export_params=True, - opset_version=13, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes={'encoder_out': {0: 'B', 1: 'T'}, - 'encoder_out_lens': {0: 'B'}, - 'hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'hyps_lens_sos': {0: 'B'}, - 'r_hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'ctc_score': {0: 'B'}, - 'best_index': {0: 'B'}, - }, - verbose=False - ) - with torch.no_grad(): - o0 = decoder(encoder_out, - encoder_out_lens, - hyps_pad_sos_eos, - hyps_lens_sos, - r_hyps_pad_sos_eos, - ctc_score) - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(decoder_onnx_path, - providers=providers) - - input_tensors = [encoder_out, encoder_out_lens, hyps_pad_sos_eos, - hyps_lens_sos, r_hyps_pad_sos_eos, ctc_score] - ort_inputs = {} - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - - # if model.reverse weight == 0, - # the r_hyps_pad will be removed - # from the onnx decoder since it doen't play any role - if model.reverse_weight == 0: - del ort_inputs['r_hyps_pad_sos_eos'] - ort_outs = ort_session.run(None, ort_inputs) - - # check decoder output - if decoder_fastertransformer: - test(to_numpy(o0), ort_outs, rtol=1e-03, atol=1e-05) - else: - test(to_numpy([o0]), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx decoder succeed!") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='export x86_gpu model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--cmvn_file', required=False, default='', type=str, - help='global_cmvn file, default path is in config file') - parser.add_argument('--reverse_weight', default=-1.0, type=float, - required=False, - help='reverse weight for bitransformer,' + - 'default value is in config file') - parser.add_argument('--ctc_weight', default=-1.0, type=float, - required=False, - help='ctc weight, default value is in config file') - parser.add_argument('--batch_size', type=int, default=24, help='encoder batch size') - parser.add_argument('--seq_len', default=512, type=int, required=False, - help="Encoder seq_len") - parser.add_argument('--beam_size', default=10, type=int, required=False, - help="beam size would be ctc output size") - parser.add_argument('--output_onnx_dir', - default="onnx_model", - help='output onnx encoder and decoder directory') - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - # arguments for streaming encoder - parser.add_argument('--streaming', - action='store_true', - help="whether to export streaming encoder, default false") - parser.add_argument('--decoding_chunk_size', - default=16, - type=int, - required=False, - help='the decoding chunk size, <=0 is not supported') - parser.add_argument('--num_decoding_left_chunks', - default=5, - type=int, - required=False, - help="number of left chunks, <= 0 is not supported") - parser.add_argument('--decoder_fastertransformer', - action='store_true', - help='return decoder_out and best_index for ft') - args = parser.parse_args() - - torch.manual_seed(0) - torch.set_printoptions(precision=10) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if 
args.cmvn_file and os.path.exists(args.cmvn_file): - configs['cmvn_file'] = args.cmvn_file - if args.reverse_weight != -1.0 and 'reverse_weight' in configs['model_conf']: - configs['model_conf']['reverse_weight'] = args.reverse_weight - print("Update reverse weight to", args.reverse_weight) - if args.ctc_weight != -1: - print("Update ctc weight to ", args.ctc_weight) - configs['model_conf']['ctc_weight'] = args.ctc_weight - configs["encoder_conf"]["use_dynamic_chunk"] = False - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - - if not os.path.exists(args.output_onnx_dir): - os.mkdir(args.output_onnx_dir) - encoder_onnx_path = os.path.join(args.output_onnx_dir, 'encoder.onnx') - export_enc_func = None - if args.streaming: - assert args.decoding_chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - export_enc_func = export_online_encoder - else: - export_enc_func = export_offline_encoder_static - - onnx_config = export_enc_func( - model, configs, args, logger, encoder_onnx_path) - - decoder_onnx_path = os.path.join(args.output_onnx_dir, 'decoder.onnx') - export_rescoring_decoder(model, configs, args, logger, - decoder_onnx_path, args.decoder_fastertransformer) - - if args.fp16: - try: - import onnxmltools - from onnxmltools.utils.float16_converter import convert_float_to_float16 - except ImportError: - print('Please install onnxmltools!') - sys.exit(1) - encoder_onnx_model = onnxmltools.utils.load_model(encoder_onnx_path) - encoder_onnx_model = convert_float_to_float16(encoder_onnx_model) - encoder_onnx_path = os.path.join( - args.output_onnx_dir, 'encoder_fp16.onnx') - onnxmltools.utils.save_model(encoder_onnx_model, encoder_onnx_path) - decoder_onnx_model = onnxmltools.utils.load_model(decoder_onnx_path) - decoder_onnx_model = convert_float_to_float16(decoder_onnx_model) - decoder_onnx_path = os.path.join( - args.output_onnx_dir, 'decoder_fp16.onnx') - onnxmltools.utils.save_model(decoder_onnx_model, decoder_onnx_path) - # dump configurations - - config_dir = os.path.join(args.output_onnx_dir, "config.yaml") - with open(config_dir, "w") as out: - yaml.dump(onnx_config, out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/recognize.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/recognize.py deleted file mode 100644 index 03b5dfd42cc098efacd20e08756a5300f6477cc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/recognize.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--beam_size', - type=int, - default=10, - help='beam size for search') - parser.add_argument('--penalty', - type=float, - default=0.0, - help='length penalty') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=16, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'attention', 'ctc_greedy_search', - 'ctc_prefix_beam_search', 'attention_rescoring', - 'rnnt_greedy_search', 'rnnt_beam_search', - 'rnnt_beam_attn_rescoring', 'ctc_beam_td_attn_rescoring', - 'hlg_onebest', 'hlg_rescore' - ], - default='attention', - help='decoding mode') - - parser.add_argument('--search_ctc_weight', - type=float, - default=1.0, - help='ctc weight for nbest generation') - parser.add_argument('--search_transducer_weight', - type=float, - default=0.0, - help='transducer weight for nbest generation') - parser.add_argument('--ctc_weight', - type=float, - default=0.0, - help='ctc weight for rescoring weight in \ - attention rescoring decode mode \ - ctc weight for rescoring weight in \ - transducer attention rescore decode mode') - - parser.add_argument('--transducer_weight', - type=float, - default=0.0, - help='transducer weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--attn_weight', - type=float, - default=0.0, - help='attention weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--decoding_chunk_size', - type=int, - default=-1, - help='''decoding chunk size, - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here''') - parser.add_argument('--num_decoding_left_chunks', - type=int, - default=-1, - help='number of left chunks for decoding') - parser.add_argument('--simulate_streaming', - action='store_true', - help='simulate streaming inference') - parser.add_argument('--reverse_weight', - type=float, - default=0.0, - help='''right to left weight for attention rescoring - decode mode''') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--connect_symbol', - default='', - type=str, - help='used to connect the output characters') - - parser.add_argument('--word', - default='', - type=str, - help='word file, only used for hlg decode') - parser.add_argument('--hlg', - default='', - type=str, - help='hlg file, only used for hlg decode') - parser.add_argument('--lm_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--r_decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' - ] and args.batch_size > 1: - logging.fatal( - 'decoding mode {} must be running with batch_size == 1'.format( - args.mode)) - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_sub'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - if 'fbank_conf' in test_conf: - test_conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in test_conf: - test_conf['mfcc_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - # Load dict - char_dict = {v: k for k, v in symbol_table.items()} - eos = len(char_dict) - 1 - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w') as fout: - for batch_idx, 
batch in enumerate(test_data_loader): - keys, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - if args.mode == 'attention': - hyps, _ = model.recognize( - feats, - feats_lengths, - beam_size=args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp.tolist() for hyp in hyps] - elif args.mode == 'ctc_greedy_search': - hyps, _ = model.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_greedy_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_beam_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.beam_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.search_ctc_weight, - transducer_weight=args.search_transducer_weight) - elif args.mode == 'rnnt_beam_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight) - elif args.mode == 'ctc_beam_td_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight, - beam_search_type='ctc') - # ctc_prefix_beam_search and attention_rescoring only return one - # result in List[int], change it to List[List[int]] for compatible - # with other batch decoding mode - elif args.mode == 'ctc_prefix_beam_search': - assert (feats.size(0) == 1) - hyp, _ = model.ctc_prefix_beam_search( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp] - elif args.mode == 'attention_rescoring': - assert (feats.size(0) == 1) - hyp, _ = model.attention_rescoring( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - 
ctc_weight=args.ctc_weight, - simulate_streaming=args.simulate_streaming, - reverse_weight=args.reverse_weight) - hyps = [hyp] - elif args.mode == 'hlg_onebest': - hyps = model.hlg_onebest( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - elif args.mode == 'hlg_rescore': - hyps = model.hlg_rescore( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - lm_scale=args.lm_scale, - decoder_scale=args.decoder_scale, - r_decoder_scale=args.r_decoder_scale, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - for i, key in enumerate(keys): - content = [] - for w in hyps[i]: - if w == eos: - break - content.append(char_dict[w]) - logging.info('{} {}'.format(key, args.connect_symbol.join(content))) - fout.write('{} {}\n'.format(key, args.connect_symbol.join(content))) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/recognize_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/recognize_onnx_gpu.py deleted file mode 100644 index 42f403bf55ac0bc51d9c754574d3479345948122..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/recognize_onnx_gpu.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is for testing exported onnx encoder and decoder from -export_onnx_gpu.py. The exported onnx models only support batch offline ASR inference. -It requires a python wrapped c++ ctc decoder. 
-Please install it by following: -https://github.com/Slyne/ctc_decoder.git -""" -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.common import IGNORE_ID -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.config import override_config - -import onnxruntime as rt -import multiprocessing -import numpy as np - -try: - from swig_decoders import map_batch, \ - ctc_beam_search_decoder_batch, \ - TrieVector, PathTrie -except ImportError: - print('Please install ctc decoders first by refering to\n' + - 'https://github.com/Slyne/ctc_decoder.git') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--encoder_onnx', required=True, help='encoder onnx file') - parser.add_argument('--decoder_onnx', required=True, help='decoder onnx file') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=32, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'ctc_greedy_search', 'ctc_prefix_beam_search', - 'attention_rescoring'], - default='attention_rescoring', - help='decoding mode') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - reverse_weight = configs["model_conf"].get("reverse_weight", 0.0) - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - test_conf['fbank_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - 
- # Init asr model from configs - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - if use_cuda: - EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - else: - EP_list = ['CPUExecutionProvider'] - - encoder_ort_session = rt.InferenceSession(args.encoder_onnx, providers=EP_list) - decoder_ort_session = None - if args.mode == "attention_rescoring": - decoder_ort_session = rt.InferenceSession(args.decoder_onnx, providers=EP_list) - - # Load dict - vocabulary = [] - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - vocabulary.append(arr[0]) - eos = sos = len(char_dict) - 1 - with torch.no_grad(), open(args.result_file, 'w') as fout: - for _, batch in enumerate(test_data_loader): - keys, feats, _, feats_lengths, _ = batch - feats, feats_lengths = feats.numpy(), feats_lengths.numpy() - if args.fp16: - feats = feats.astype(np.float16) - ort_inputs = { - encoder_ort_session.get_inputs()[0].name: feats, - encoder_ort_session.get_inputs()[1].name: feats_lengths} - ort_outs = encoder_ort_session.run(None, ort_inputs) - encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx = ort_outs - beam_size = beam_log_probs.shape[-1] - batch_size = beam_log_probs.shape[0] - num_processes = min(multiprocessing.cpu_count(), batch_size) - if args.mode == 'ctc_greedy_search': - if beam_size != 1: - log_probs_idx = beam_log_probs_idx[:, :, 0] - batch_sents = [] - for idx, seq in enumerate(log_probs_idx): - batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) - hyps = map_batch(batch_sents, vocabulary, num_processes, - True, 0) - elif args.mode in ('ctc_prefix_beam_search', "attention_rescoring"): - batch_log_probs_seq_list = beam_log_probs.tolist() - batch_log_probs_idx_list = beam_log_probs_idx.tolist() - batch_len_list = encoder_out_lens.tolist() - batch_log_probs_seq = [] - batch_log_probs_ids = [] - batch_start = [] # only effective in streaming deployment - batch_root = TrieVector() - root_dict = {} - for i in range(len(batch_len_list)): - num_sent = batch_len_list[i] - batch_log_probs_seq.append( - batch_log_probs_seq_list[i][0:num_sent]) - batch_log_probs_ids.append( - batch_log_probs_idx_list[i][0:num_sent]) - root_dict[i] = PathTrie() - batch_root.append(root_dict[i]) - batch_start.append(True) - score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq, - batch_log_probs_ids, - batch_root, - batch_start, - beam_size, - num_processes, - 0, -2, 0.99999) - if args.mode == 'ctc_prefix_beam_search': - hyps = [] - for cand_hyps in score_hyps: - hyps.append(cand_hyps[0][1]) - hyps = map_batch(hyps, vocabulary, num_processes, False, 0) - if args.mode == 'attention_rescoring': - ctc_score, all_hyps = [], [] - max_len = 0 - for hyps in score_hyps: - cur_len = len(hyps) - if len(hyps) < beam_size: - hyps += (beam_size - cur_len) * [(-float("INF"), (0,))] - cur_ctc_score = [] - for hyp in hyps: - cur_ctc_score.append(hyp[0]) - all_hyps.append(list(hyp[1])) - if len(hyp[1]) > max_len: - max_len = len(hyp[1]) - ctc_score.append(cur_ctc_score) - if args.fp16: - ctc_score = np.array(ctc_score, dtype=np.float16) - else: - ctc_score = np.array(ctc_score, dtype=np.float32) - hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - r_hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32) - k = 0 - for i in 
range(batch_size): - for j in range(beam_size): - cand = all_hyps[k] - l = len(cand) + 2 - hyps_pad_sos_eos[i][j][0:l] = [sos] + cand + [eos] - r_hyps_pad_sos_eos[i][j][0:l] = [sos] + cand[::-1] + [eos] - hyps_lens_sos[i][j] = len(cand) + 1 - k += 1 - decoder_ort_inputs = { - decoder_ort_session.get_inputs()[0].name: encoder_out, - decoder_ort_session.get_inputs()[1].name: encoder_out_lens, - decoder_ort_session.get_inputs()[2].name: hyps_pad_sos_eos, - decoder_ort_session.get_inputs()[3].name: hyps_lens_sos, - decoder_ort_session.get_inputs()[-1].name: ctc_score} - if reverse_weight > 0: - r_hyps_pad_sos_eos_name = decoder_ort_session.get_inputs()[4].name - decoder_ort_inputs[r_hyps_pad_sos_eos_name] = r_hyps_pad_sos_eos - best_index = decoder_ort_session.run(None, decoder_ort_inputs)[0] - best_sents = [] - k = 0 - for idx in best_index: - cur_best_sent = all_hyps[k: k + beam_size][idx] - best_sents.append(cur_best_sent) - k += beam_size - hyps = map_batch(best_sents, vocabulary, num_processes) - - for i, key in enumerate(keys): - content = hyps[i] - logging.info('{} {}'.format(key, content)) - fout.write('{} {}\n'.format(key, content)) - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/train.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/train.py deleted file mode 100644 index 70799b60790b31d73911770891f519f5473e2f4b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/bin/train.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os - -import torch -import torch.distributed as dist -import torch.optim as optim -import yaml -from tensorboardX import SummaryWriter -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import (load_checkpoint, save_checkpoint, - load_trained_modules) -from wenet.utils.executor import Executor -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.scheduler import WarmupLR, NoamHoldAnnealing -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--train_data', required=True, help='train data file') - parser.add_argument('--cv_data', required=True, help='cv data file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--model_dir', required=True, help='save model dir') - parser.add_argument('--checkpoint', help='checkpoint model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='tensorboard log dir') - parser.add_argument('--ddp.rank', - dest='rank', - default=0, - type=int, - help='global rank for distributed training') - parser.add_argument('--ddp.world_size', - dest='world_size', - default=-1, - type=int, - help='''number of total processes/gpus for - distributed training''') - parser.add_argument('--ddp.dist_backend', - dest='dist_backend', - default='nccl', - choices=['nccl', 'gloo'], - help='distributed backend') - parser.add_argument('--ddp.init_method', - dest='init_method', - default=None, - help='ddp init method') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for reading') - parser.add_argument('--pin_memory', - action='store_true', - default=False, - help='Use pinned memory buffers used for reading') - parser.add_argument('--use_amp', - action='store_true', - default=False, - help='Use automatic mixed precision training') - parser.add_argument('--fp16_grad_sync', - action='store_true', - default=False, - help='Use fp16 gradient sync for ddp') - parser.add_argument('--cmvn', default=None, help='global cmvn file') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. 
One symbol per line.") - parser.add_argument('--prefetch', - default=100, - type=int, - help='prefetch number') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument("--enc_init", - default=None, - type=str, - help="Pre-trained model to initialize encoder") - parser.add_argument("--enc_init_mods", - default="encoder.", - type=lambda s: [str(mod) for mod in s.split(",") if s != ""], - help="List of encoder modules \ - to initialize ,separated by a comma") - - - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Set random seed - torch.manual_seed(777) - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - distributed = args.world_size > 1 - if distributed: - logging.info('training on multiple gpus, this gpu {}'.format(args.gpu)) - dist.init_process_group(args.dist_backend, - init_method=args.init_method, - world_size=args.world_size, - rank=args.rank) - - symbol_table = read_symbol_table(args.symbol_table) - - train_conf = configs['dataset_conf'] - cv_conf = copy.deepcopy(train_conf) - cv_conf['speed_perturb'] = False - cv_conf['spec_aug'] = False - cv_conf['spec_sub'] = False - cv_conf['spec_trim'] = False - cv_conf['shuffle'] = False - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - train_dataset = Dataset(args.data_type, args.train_data, symbol_table, - train_conf, args.bpe_model, non_lang_syms, True) - cv_dataset = Dataset(args.data_type, - args.cv_data, - symbol_table, - cv_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - train_data_loader = DataLoader(train_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - cv_data_loader = DataLoader(cv_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - - if 'fbank_conf' in configs['dataset_conf']: - input_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - else: - input_dim = configs['dataset_conf']['mfcc_conf']['num_mel_bins'] - vocab_size = len(symbol_table) - - # Save configs to model_dir/train.yaml for inference and export - configs['input_dim'] = input_dim - configs['output_dim'] = vocab_size - configs['cmvn_file'] = args.cmvn - configs['is_json_cmvn'] = True - if args.rank == 0: - saved_config_path = os.path.join(args.model_dir, 'train.yaml') - with open(saved_config_path, 'w') as fout: - data = yaml.dump(configs) - fout.write(data) - - # Init asr model from configs - model = init_model(configs) - print(model) - num_params = sum(p.numel() for p in model.parameters()) - print('the number of model params: {:,d}'.format(num_params)) - - # !!!IMPORTANT!!! 
- # Try to export the model by script, if fails, we should refine - # the code to satisfy the script export requirements - if args.rank == 0: - script_model = torch.jit.script(model) - script_model.save(os.path.join(args.model_dir, 'init.zip')) - executor = Executor() - # If specify checkpoint, load some info from checkpoint - if args.checkpoint is not None: - infos = load_checkpoint(model, args.checkpoint) - elif args.enc_init is not None: - logging.info('load pretrained encoders: {}'.format(args.enc_init)) - infos = load_trained_modules(model, args) - else: - infos = {} - start_epoch = infos.get('epoch', -1) + 1 - cv_loss = infos.get('cv_loss', 0.0) - step = infos.get('step', -1) - - num_epochs = configs.get('max_epoch', 100) - model_dir = args.model_dir - writer = None - if args.rank == 0: - os.makedirs(model_dir, exist_ok=True) - exp_id = os.path.basename(model_dir) - writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id)) - - if distributed: - assert (torch.cuda.is_available()) - # cuda model is required for nn.parallel.DistributedDataParallel - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, find_unused_parameters=True) - device = torch.device("cuda") - if args.fp16_grad_sync: - from torch.distributed.algorithms.ddp_comm_hooks import ( - default as comm_hooks, - ) - model.register_comm_hook( - state=None, hook=comm_hooks.fp16_compress_hook - ) - else: - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - if configs['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['optim_conf']) - elif configs['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['optim']) - if configs['scheduler'] == 'warmuplr': - scheduler = WarmupLR(optimizer, **configs['scheduler_conf']) - elif configs['scheduler'] == 'NoamHoldAnnealing': - scheduler = NoamHoldAnnealing(optimizer, **configs['scheduler_conf']) - else: - raise ValueError("unknown scheduler: " + configs['scheduler']) - - final_epoch = None - configs['rank'] = args.rank - configs['is_distributed'] = distributed - configs['use_amp'] = args.use_amp - if start_epoch == 0 and args.rank == 0: - save_model_path = os.path.join(model_dir, 'init.pt') - save_checkpoint(model, save_model_path) - - # Start training loop - executor.step = step - scheduler.set_step(step) - # used for pytorch amp mixed precision training - scaler = None - if args.use_amp: - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(start_epoch, num_epochs): - train_dataset.set_epoch(epoch) - configs['epoch'] = epoch - lr = optimizer.param_groups[0]['lr'] - logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr)) - executor.train(model, optimizer, scheduler, train_data_loader, device, - writer, configs, scaler) - total_loss, num_seen_utts = executor.cv(model, cv_data_loader, device, - configs) - cv_loss = total_loss / num_seen_utts - - logging.info('Epoch {} CV info cv_loss {}'.format(epoch, cv_loss)) - if args.rank == 0: - save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch)) - save_checkpoint( - model, save_model_path, { - 'epoch': epoch, - 'lr': lr, - 'cv_loss': cv_loss, - 'step': executor.step - }) - writer.add_scalar('epoch/cv_loss', cv_loss, epoch) - writer.add_scalar('epoch/lr', lr, epoch) - final_epoch = epoch - - if final_epoch is not None and args.rank == 0: - final_model_path = os.path.join(model_dir, 'final.pt') 
- os.remove(final_model_path) if os.path.exists(final_model_path) else None - os.symlink('{}.pt'.format(final_epoch), final_model_path) - writer.close() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/dataset/dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/dataset/dataset.py deleted file mode 100644 index 6d799b5b5aea2d34546484b3fed5d45e2d5b6aa6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/dataset/dataset.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import torch -import torch.distributed as dist -from torch.utils.data import IterableDataset - -import wenet.dataset.processor as processor -from wenet.utils.file_utils import read_lists - - -class Processor(IterableDataset): - def __init__(self, source, f, *args, **kw): - assert callable(f) - self.source = source - self.f = f - self.args = args - self.kw = kw - - def set_epoch(self, epoch): - self.source.set_epoch(epoch) - - def __iter__(self): - """ Return an iterator over the source dataset processed by the - given processor. 
- """ - assert self.source is not None - assert callable(self.f) - return self.f(iter(self.source), *self.args, **self.kw) - - def apply(self, f): - assert callable(f) - return Processor(self, f, *self.args, **self.kw) - - -class DistributedSampler: - def __init__(self, shuffle=True, partition=True): - self.epoch = -1 - self.update() - self.shuffle = shuffle - self.partition = partition - - def update(self): - assert dist.is_available() - if dist.is_initialized(): - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() - else: - self.rank = 0 - self.world_size = 1 - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: - self.worker_id = 0 - self.num_workers = 1 - else: - self.worker_id = worker_info.id - self.num_workers = worker_info.num_workers - return dict(rank=self.rank, - world_size=self.world_size, - worker_id=self.worker_id, - num_workers=self.num_workers) - - def set_epoch(self, epoch): - self.epoch = epoch - - def sample(self, data): - """ Sample data according to rank/world_size/num_workers - - Args: - data(List): input data list - - Returns: - List: data list after sample - """ - data = list(range(len(data))) - # TODO(Binbin Zhang): fix this - # We can not handle uneven data for CV on DDP, so we don't - # sample data by rank, that means every GPU gets the same - # and all the CV data - if self.partition: - if self.shuffle: - random.Random(self.epoch).shuffle(data) - data = data[self.rank::self.world_size] - data = data[self.worker_id::self.num_workers] - return data - - -class DataList(IterableDataset): - def __init__(self, lists, shuffle=True, partition=True): - self.lists = lists - self.sampler = DistributedSampler(shuffle, partition) - - def set_epoch(self, epoch): - self.sampler.set_epoch(epoch) - - def __iter__(self): - sampler_info = self.sampler.update() - indexes = self.sampler.sample(self.lists) - for index in indexes: - # yield dict(src=src) - data = dict(src=self.lists[index]) - data.update(sampler_info) - yield data - - -def Dataset(data_type, - data_list_file, - symbol_table, - conf, - bpe_model=None, - non_lang_syms=None, - partition=True): - """ Construct dataset from arguments - - We have two shuffle stage in the Dataset. The first is global - shuffle at shards tar/raw file level. The second is global shuffle - at training samples level. 
- - Args: - data_type(str): raw/shard - bpe_model(str): model for english bpe part - partition(bool): whether to do data partition in terms of rank - """ - assert data_type in ['raw', 'shard'] - lists = read_lists(data_list_file) - shuffle = conf.get('shuffle', True) - dataset = DataList(lists, shuffle=shuffle, partition=partition) - if data_type == 'shard': - dataset = Processor(dataset, processor.url_opener) - dataset = Processor(dataset, processor.tar_file_and_group) - else: - dataset = Processor(dataset, processor.parse_raw) - - dataset = Processor(dataset, processor.tokenize, symbol_table, bpe_model, - non_lang_syms, conf.get('split_with_space', False)) - filter_conf = conf.get('filter_conf', {}) - dataset = Processor(dataset, processor.filter, **filter_conf) - - resample_conf = conf.get('resample_conf', {}) - dataset = Processor(dataset, processor.resample, **resample_conf) - - speed_perturb = conf.get('speed_perturb', False) - if speed_perturb: - dataset = Processor(dataset, processor.speed_perturb) - - feats_type = conf.get('feats_type', 'fbank') - assert feats_type in ['fbank', 'mfcc'] - if feats_type == 'fbank': - fbank_conf = conf.get('fbank_conf', {}) - dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) - elif feats_type == 'mfcc': - mfcc_conf = conf.get('mfcc_conf', {}) - dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf) - - spec_aug = conf.get('spec_aug', True) - spec_sub = conf.get('spec_sub', False) - spec_trim = conf.get('spec_trim', False) - if spec_aug: - spec_aug_conf = conf.get('spec_aug_conf', {}) - dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) - if spec_sub: - spec_sub_conf = conf.get('spec_sub_conf', {}) - dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf) - if spec_trim: - spec_trim_conf = conf.get('spec_trim_conf', {}) - dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf) - - if shuffle: - shuffle_conf = conf.get('shuffle_conf', {}) - dataset = Processor(dataset, processor.shuffle, **shuffle_conf) - - sort = conf.get('sort', True) - if sort: - sort_conf = conf.get('sort_conf', {}) - dataset = Processor(dataset, processor.sort, **sort_conf) - - batch_conf = conf.get('batch_conf', {}) - dataset = Processor(dataset, processor.batch, **batch_conf) - dataset = Processor(dataset, processor.padding) - return dataset diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/dataset/kaldi_io.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/dataset/kaldi_io.py deleted file mode 100644 index c9bef293c93d882147bb5b738e1fc49a7a19a484..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/dataset/kaldi_io.py +++ /dev/null @@ -1,666 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! 
To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. - """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. 
- """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int_scp(file_or_fd): - """ generator(key,vec) = read_vec_int_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_int_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:vec for key,mat in kaldi_io.read_vec_int_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_int(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! 
- # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! 
- if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Mapping for percentiles in col-headers, - def uint16_to_float(value, min, range): - return np.float32(min + range * 1.52590218966964e-05 * value) - - # Mapping for matrix elements, - def uint8_to_float_v2(vec, p0, p25, p75, p100): - # Split the vector by masks, - mask_0_64 = (vec <= 64); - mask_193_255 = (vec > 192); - mask_65_192 = (~(mask_0_64 | mask_193_255)); - # Sanity check (useful but slow...), - # assert(len(vec) == np.sum(np.hstack([mask_0_64,mask_65_192,mask_193_255]))) - # assert(len(vec) == np.sum(np.any([mask_0_64,mask_65_192,mask_193_255], axis=0))) - # Build the float vector, - ans = np.empty(len(vec), dtype='float32') - ans[mask_0_64] = p0 + (p25 - p0) / 64. * vec[mask_0_64] - ans[mask_65_192] = p25 + (p75 - p25) / 128. * (vec[mask_65_192] - 64) - ans[mask_193_255] = p75 + (p100 - p75) / 63. * (vec[mask_193_255] - 192) - return ans - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.empty((cols,rows), dtype='float32') - for i, col_header in enumerate(col_headers): - col_header_flt = [ uint16_to_float(percentile, globmin, globrange) for percentile in col_header ] - mat[i] = uint8_to_float_v2(data[i], *col_header_flt) - - return mat.T # transpose! col-major -> row-major, - -def write_ark_scp(key, mat, ark_fout, scp_out): - mat_offset = write_mat(ark_fout, mat, key) - scp_line = '{}\t{}:{}'.format(key, ark_fout.name, mat_offset) - scp_out.write(scp_line) - scp_out.write('\n') - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. 
- - Example of writing single matrix: - kaldi_io.write_mat(filename, mat) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,mat in dict.iteritems(): - kaldi_io.write_mat(f, mat, key=key) - """ - mat_offset = 0 - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - mat_offset = fd.tell() - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if m.dtype == 'float32': fd.write('FM '.encode()) - elif m.dtype == 'float64': fd.write('DM '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) - # Dims, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols - # Data, - fd.write(m.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - return mat_offset - -################################################# -# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) -# Corresponds to: vector > > -# - outer vector: time axis -# - inner vector: records at the time -# - tuple: int = index, float = value -# - -def read_cnet_ark(file_or_fd): - """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ - return read_post_ark(file_or_fd) - -def read_post_ark(file_or_fd): - """ generator(key,vec>) = read_post_ark(file) - Returns generator of (key,posterior) tuples, read from ark file. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Iterate the ark: - for key,post in kaldi_io.read_post_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - post = read_post(fd) - yield key, post - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_post(file_or_fd): - """ [post] = read_post(file_or_fd) - Reads single kaldi 'Posterior' in binary format. - - The 'Posterior' is C++ type 'vector > >', - the outer-vector is usually time axis, inner-vector are the records - at given time, and the tuple is composed of an 'index' (integer) - and a 'float-value'. The 'float-value' can represent a probability - or any other numeric value. - - Returns vector of vectors of tuples. - """ - fd = open_or_fd(file_or_fd) - ans=[] - binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - # Loop over 'outer-vector', - for i in range(outer_vec_size): - assert(fd.read(1).decode() == '\4'); # int-size - inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) - data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) - assert(data[0]['size_idx'] == 4) - assert(data[0]['size_post'] == 4) - ans.append(data[['idx','post']].tolist()) - - if fd is not file_or_fd: fd.close() - return ans - - -################################################# -# Kaldi Confusion Network bin begin/end times, -# (kaldi stores CNs time info separately from the Posterior). 
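# A minimal sketch (illustrative, not from the original file) of the
# 'Posterior' structure produced by read_post() above: a plain nested list,
# one inner list of (index, value) tuples per frame.
example_post = [
    [(3, 0.9), (7, 0.1)],   # frame 0: two active indices with their values
    [(3, 1.0)],             # frame 1: a single active index
]
num_frames = len(example_post)                          # -> 2
frame0_best = max(example_post[0], key=lambda t: t[1])  # -> (3, 0.9)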
-# - -def read_cntime_ark(file_or_fd): - """ generator(key,vec>) = read_cntime_ark(file_or_fd) - Returns generator of (key,cntime) tuples, read from ark file. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Iterate the ark: - for key,time in kaldi_io.read_cntime_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:time for key,time in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - cntime = read_cntime(fd) - yield key, cntime - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_cntime(file_or_fd): - """ [cntime] = read_cntime(file_or_fd) - Reads single kaldi 'Confusion Network time info', in binary format: - C++ type: vector >. - (begin/end times of bins at the confusion network). - - Binary layout is ' ...' - - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Returns vector of tuples. - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary - - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) - assert(data[0]['size_beg'] == 4) - assert(data[0]['size_end'] == 4) - ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), - - if fd is not file_or_fd : fd.close() - return ans - - -################################################# -# Segments related, -# - -# Segments as 'Bool vectors' can be handy, -# - for 'superposing' the segmentations, -# - for frame-selection in Speaker-ID experiments, -def read_segments_as_bool_vec(segments_file): - """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) - using kaldi 'segments' file for 1 wav, format : ' ' - - t-beg, t-end is in seconds, - - assumed 100 frames/second, - """ - segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) - # Sanity checks, - assert(len(segs) > 0) # empty segmentation is an error, - assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, - # Convert time to frame-indexes, - start = np.rint([100 * rec[2] for rec in segs]).astype(int) - end = np.rint([100 * rec[3] for rec in segs]).astype(int) - # Taken from 'read_lab_to_bool_vec', htk.py, - frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], - np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) - assert np.sum(end-start) == np.sum(frms) - return frms - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/dataset/processor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/dataset/processor.py deleted file mode 100644 index b4bd07ce674eb3288cd1b13a09085eec48d40845..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/dataset/processor.py +++ /dev/null @@ -1,660 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import json -import random -import re -import tarfile -from subprocess import PIPE, Popen -from urllib.parse import urlparse - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.nn.utils.rnn import pad_sequence - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def url_opener(data): - """ Give url or local file, return file descriptor - Inplace operation. - - Args: - data(Iterable[str]): url or local file list - - Returns: - Iterable[{src, stream}] - """ - for sample in data: - assert 'src' in sample - # TODO(Binbin Zhang): support HTTP - url = sample['src'] - try: - pr = urlparse(url) - # local file - if pr.scheme == '' or pr.scheme == 'file': - stream = open(url, 'rb') - # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP - else: - cmd = f'wget -q -O - {url}' - process = Popen(cmd, shell=True, stdout=PIPE) - sample.update(process=process) - stream = process.stdout - sample.update(stream=stream) - yield sample - except Exception as ex: - logging.warning('Failed to open {}'.format(url)) - - -def tar_file_and_group(data): - """ Expand a stream of open tar files into a stream of tar file contents. - And groups the file with same prefix - - Args: - data: Iterable[{src, stream}] - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'stream' in sample - stream = tarfile.open(fileobj=sample['stream'], mode="r|*") - prev_prefix = None - example = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - example['key'] = prev_prefix - if valid: - yield example - example = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - example['txt'] = file_obj.read().decode('utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load(file_obj) - example['wav'] = waveform - example['sample_rate'] = sample_rate - else: - example[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning('error to parse {}'.format(name)) - prev_prefix = prefix - if prev_prefix is not None: - example['key'] = prev_prefix - yield example - stream.close() - if 'process' in sample: - sample['process'].communicate() - sample['stream'].close() - - -def parse_raw(data): - """ Parse key/wav/txt from json line - - Args: - data: Iterable[str], str is a json line has key/wav/txt - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'src' in sample - json_line = sample['src'] - obj = json.loads(json_line) - assert 'key' in obj - assert 'wav' in obj - assert 'txt' in obj - key = obj['key'] - wav_file = obj['wav'] - txt = obj['txt'] - try: - if 'start' in obj: - assert 'end' in obj - sample_rate = torchaudio.backend.sox_io_backend.info( - wav_file).sample_rate - start_frame = int(obj['start'] * sample_rate) - end_frame = int(obj['end'] * sample_rate) - waveform, _ = torchaudio.backend.sox_io_backend.load( - 
filepath=wav_file, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(wav_file) - example = dict(key=key, - txt=txt, - wav=waveform, - sample_rate=sample_rate) - yield example - except Exception as ex: - logging.warning('Failed to read {}'.format(wav_file)) - - -def filter(data, - max_length=10240, - min_length=10, - token_max_length=200, - token_min_length=1, - min_output_input_ratio=0.0005, - max_output_input_ratio=1): - """ Filter sample according to feature and label length - Inplace operation. - - Args:: - data: Iterable[{key, wav, label, sample_rate}] - max_length: drop utterance which is greater than max_length(10ms) - min_length: drop utterance which is less than min_length(10ms) - token_max_length: drop utterance which is greater than - token_max_length, especially when use char unit for - english modeling - token_min_length: drop utterance which is - less than token_max_length - min_output_input_ratio: minimal ration of - token_length / feats_length(10ms) - max_output_input_ratio: maximum ration of - token_length / feats_length(10ms) - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'label' in sample - # sample['wav'] is torch.Tensor, we have 100 frames every second - num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100 - if num_frames < min_length: - continue - if num_frames > max_length: - continue - if len(sample['label']) < token_min_length: - continue - if len(sample['label']) > token_max_length: - continue - if num_frames != 0: - if len(sample['label']) / num_frames < min_output_input_ratio: - continue - if len(sample['label']) / num_frames > max_output_input_ratio: - continue - yield sample - - -def resample(data, resample_rate=16000): - """ Resample data. - Inplace operation. - - Args: - data: Iterable[{key, wav, label, sample_rate}] - resample_rate: target resample rate - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - if sample_rate != resample_rate: - sample['sample_rate'] = resample_rate - sample['wav'] = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - yield sample - - -def speed_perturb(data, speeds=None): - """ Apply speed perturb to the data. - Inplace operation. 
- - Args: - data: Iterable[{key, wav, label, sample_rate}] - speeds(List[float]): optional speed - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - if speeds is None: - speeds = [0.9, 1.0, 1.1] - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - speed = random.choice(speeds) - if speed != 1.0: - wav, _ = torchaudio.sox_effects.apply_effects_tensor( - waveform, sample_rate, - [['speed', str(speed)], ['rate', str(sample_rate)]]) - sample['wav'] = wav - - yield sample - - -def compute_fbank(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0): - """ Extract fbank - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - energy_floor=0.0, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def compute_mfcc(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0, - num_ceps=40, - high_freq=0.0, - low_freq=20.0): - """ Extract mfcc - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.mfcc(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - num_ceps=num_ceps, - high_freq=high_freq, - low_freq=low_freq, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def __tokenize_by_bpe_model(sp, txt): - tokens = [] - # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - pattern = re.compile(r'([\u4e00-\u9fff])') - # Example: - # txt = "你好 ITS'S OKAY 的" - # chars = ["你", "好", " ITS'S OKAY ", "的"] - chars = pattern.split(txt.upper()) - mix_chars = [w for w in chars if len(w.strip()) > 0] - for ch_or_w in mix_chars: - # ch_or_w is a single CJK charater(i.e., "你"), do nothing. - if pattern.fullmatch(ch_or_w) is not None: - tokens.append(ch_or_w) - # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), - # encode ch_or_w using bpe_model. 
- else: - for p in sp.encode_as_pieces(ch_or_w): - tokens.append(p) - - return tokens - - -def tokenize(data, - symbol_table, - bpe_model=None, - non_lang_syms=None, - split_with_space=False): - """ Decode text to chars or BPE - Inplace operation - - Args: - data: Iterable[{key, wav, txt, sample_rate}] - - Returns: - Iterable[{key, wav, txt, tokens, label, sample_rate}] - """ - if non_lang_syms is not None: - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - else: - non_lang_syms = {} - non_lang_syms_pattern = None - - if bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(bpe_model) - else: - sp = None - - for sample in data: - assert 'txt' in sample - txt = sample['txt'].strip() - if non_lang_syms_pattern is not None: - parts = non_lang_syms_pattern.split(txt.upper()) - parts = [w for w in parts if len(w.strip()) > 0] - else: - parts = [txt] - - label = [] - tokens = [] - for part in parts: - if part in non_lang_syms: - tokens.append(part) - else: - if bpe_model is not None: - tokens.extend(__tokenize_by_bpe_model(sp, part)) - else: - if split_with_space: - part = part.split(" ") - for ch in part: - if ch == ' ': - ch = "▁" - tokens.append(ch) - - for ch in tokens: - if ch in symbol_table: - label.append(symbol_table[ch]) - elif '' in symbol_table: - label.append(symbol_table['']) - - sample['tokens'] = tokens - sample['label'] = label - yield sample - - -def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80): - """ Do spec augmentation - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - num_t_mask: number of time mask to apply - num_f_mask: number of freq mask to apply - max_t: max width of time mask - max_f: max width of freq mask - max_w: max width of time warp - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - max_freq = y.size(1) - # time mask - for i in range(num_t_mask): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - y[start:end, :] = 0 - # freq mask - for i in range(num_f_mask): - start = random.randint(0, max_freq - 1) - length = random.randint(1, max_f) - end = min(max_freq, start + length) - y[:, start:end] = 0 - sample['feat'] = y - yield sample - - -def spec_sub(data, max_t=20, num_t_sub=3): - """ Do spec substitute - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of time substitute - num_t_sub: number of time substitute to apply - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - for i in range(num_t_sub): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - # only substitute the earlier time chosen randomly for current time - pos = random.randint(0, start) - y[start:end, :] = x[start - pos:end - pos, :] - sample['feat'] = y - yield sample - - -def spec_trim(data, max_t=20): - """ Trim tailing frames. Inplace operation. 
- ref: TrimTail [https://arxiv.org/abs/2211.00522] - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of length trimming - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - max_frames = x.size(0) - length = random.randint(1, max_t) - if length < max_frames / 2: - y = x.clone().detach()[:max_frames - length] - sample['feat'] = y - yield sample - - -def shuffle(data, shuffle_size=10000): - """ Local shuffle the data - - Args: - data: Iterable[{key, feat, label}] - shuffle_size: buffer size for shuffle - - Returns: - Iterable[{key, feat, label}] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= shuffle_size: - random.shuffle(buf) - for x in buf: - yield x - buf = [] - # The sample left over - random.shuffle(buf) - for x in buf: - yield x - - -def sort(data, sort_size=500): - """ Sort the data by feature length. - Sort is used after shuffle and before batch, so we can group - utts with similar lengths into a batch, and `sort_size` should - be less than `shuffle_size` - - Args: - data: Iterable[{key, feat, label}] - sort_size: buffer size for sort - - Returns: - Iterable[{key, feat, label}] - """ - - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= sort_size: - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - buf = [] - # The sample left over - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - - -def static_batch(data, batch_size=16): - """ Static batch the data by `batch_size` - - Args: - data: Iterable[{key, feat, label}] - batch_size: batch size - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= batch_size: - yield buf - buf = [] - if len(buf) > 0: - yield buf - - -def dynamic_batch(data, max_frames_in_batch=12000): - """ Dynamic batch the data until the total frames in batch - reach `max_frames_in_batch` - - Args: - data: Iterable[{key, feat, label}] - max_frames_in_batch: max_frames in one batch - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - longest_frames = 0 - for sample in data: - assert 'feat' in sample - assert isinstance(sample['feat'], torch.Tensor) - new_sample_frames = sample['feat'].size(0) - longest_frames = max(longest_frames, new_sample_frames) - frames_after_padding = longest_frames * (len(buf) + 1) - if frames_after_padding > max_frames_in_batch: - yield buf - buf = [sample] - longest_frames = new_sample_frames - else: - buf.append(sample) - if len(buf) > 0: - yield buf - - -def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000): - """ Wrapper for static/dynamic batch - """ - if batch_type == 'static': - return static_batch(data, batch_size) - elif batch_type == 'dynamic': - return dynamic_batch(data, max_frames_in_batch) - else: - logging.fatal('Unsupported batch type {}'.format(batch_type)) - - -def padding(data): - """ Padding the data into training data - - Args: - data: Iterable[List[{key, feat, label}]] - - Returns: - Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] - """ - for sample in data: - assert isinstance(sample, list) - feats_length = torch.tensor([x['feat'].size(0) for x in sample], - dtype=torch.int32) - order = torch.argsort(feats_length, descending=True) - feats_lengths = torch.tensor( - [sample[i]['feat'].size(0) for i in order], dtype=torch.int32) - sorted_feats = [sample[i]['feat'] for i in order] - sorted_keys 
= [sample[i]['key'] for i in order] - sorted_labels = [ - torch.tensor(sample[i]['label'], dtype=torch.int64) for i in order - ] - label_lengths = torch.tensor([x.size(0) for x in sorted_labels], - dtype=torch.int32) - - padded_feats = pad_sequence(sorted_feats, - batch_first=True, - padding_value=0) - - pad = (0, 0, 0, 0) - seq_len= padded_feats.shape[1] - if seq_len < 384: - pad = (0, 0, 0, 384-seq_len) - elif seq_len < 512: - pad = (0, 0, 0, 512-seq_len) - elif seq_len < 640: - pad = (0, 0, 0, 640-seq_len) - elif seq_len < 768: - pad = (0, 0, 0, 768-seq_len) - elif seq_len < 896: - pad = (0, 0, 0, 896-seq_len) - elif seq_len < 1024: - pad = (0, 0, 0, 1024-seq_len) - elif seq_len < 1280: - pad = (0, 0, 0, 1280-seq_len) - padded_feats = torch.nn.functional.pad(padded_feats, pad, mode='constant', value=0) - padding_labels = pad_sequence(sorted_labels, - batch_first=True, - padding_value=-1) - - yield (sorted_keys, padded_feats, padding_labels, feats_lengths, - label_lengths) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/dataset/wav_distortion.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/dataset/wav_distortion.py deleted file mode 100644 index 2917d3cc6cfb801935cb0885d0c42cd88f1833b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/dataset/wav_distortion.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import random -import math - -import torchaudio -import torch -torchaudio.set_audio_backend("sox_io") - - -def db2amp(db): - return pow(10, db / 20) - -def amp2db(amp): - return 20 * math.log10(amp) - -def make_poly_distortion(conf): - """Generate a db-domain ploynomial distortion function - - f(x) = a * x^m * (1-x)^n + x - - Args: - conf: a dict {'a': #int, 'm': #int, 'n': #int} - - Returns: - The ploynomial function, which could be applied on - a float amplitude value - """ - a = conf['a'] - m = conf['m'] - n = conf['n'] - - def poly_distortion(x): - abs_x = abs(x) - if abs_x < 0.000001: - x = x - else: - db_norm = amp2db(abs_x) / 100 + 1 - if db_norm < 0: - db_norm = 0 - db_norm = a * pow(db_norm, m) * pow((1 - db_norm), n) + db_norm - if db_norm > 1: - db_norm = 1 - db = (db_norm - 1) * 100 - amp = db2amp(db) - if amp >= 0.9997: - amp = 0.9997 - if x > 0: - x = amp - else: - x = -amp - return x - return poly_distortion - -def make_quad_distortion(): - return make_poly_distortion({'a' : 1, 'm' : 1, 'n' : 1}) - -# the amplitude are set to max for all non-zero point -def make_max_distortion(conf): - """Generate a max distortion function - - Args: - conf: a dict {'max_db': float } - 'max_db': the maxium value. 
- - Returns: - The max function, which could be applied on - a float amplitude value - """ - max_db = conf['max_db'] - if max_db: - max_amp = db2amp(max_db) # < 0.997 - else: - max_amp = 0.997 - - def max_distortion(x): - if x > 0: - x = max_amp - elif x < 0: - x = -max_amp - else: - x = 0.0 - return x - return max_distortion - - - -def make_amp_mask(db_mask=None): - """Get a amplitude domain mask from db domain mask - - Args: - db_mask: Optional. A list of tuple. if None, using default value. - - Returns: - A list of tuple. The amplitude domain mask - """ - if db_mask is None: - db_mask = [(-110, -95), (-90, -80), (-65, -60), (-50, -30), (-15, 0)] - amp_mask = [(db2amp(db[0]), db2amp(db[1])) for db in db_mask] - return amp_mask - -default_mask = make_amp_mask() - - -def generate_amp_mask(mask_num): - """Generate amplitude domain mask randomly in [-100db, 0db] - - Args: - mask_num: the slot number of the mask - - Returns: - A list of tuple. each tuple defines a slot. - e.g. [(-100, -80), (-65, -60), (-50, -30), (-15, 0)] - for #mask_num = 4 - """ - a = [0] * 2 * mask_num - a[0] = 0 - m = [] - for i in range(1, 2 * mask_num): - a[i] = a[i - 1] + random.uniform(0.5, 1) - max_val = a[2 * mask_num - 1] - for i in range(0, mask_num): - l = ((a[2 * i] - max_val) / max_val) * 100 - r = ((a[2 * i + 1] - max_val) / max_val) * 100 - m.append((l, r)) - return make_amp_mask(m) - - -def make_fence_distortion(conf): - """Generate a fence distortion function - - In this fence-like shape function, the values in mask slots are - set to maxium, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': int,'max_db': float } - 'mask_number': the slot number in mask. - 'max_db': the maxium value. - - Returns: - The fence function, which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - max_db = conf['max_db'] - max_amp = db2amp(max_db) # 0.997 - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def fence_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - return x - - return fence_distortion - -# -def make_jag_distortion(conf): - """Generate a jag distortion function - - In this jag-like shape function, the values in mask slots are - not changed, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': #int} - 'mask_number': the slot number in mask. 
- - Returns: - The jag function,which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def jag_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - return x - - return jag_distortion - -# gaining 20db means amp = amp * 10 -# gaining -20db means amp = amp / 10 -def make_gain_db(conf): - """Generate a db domain gain function - - Args: - conf: a dict {'db': #float} - 'db': the gaining value - - Returns: - The db gain function, which could be applied on - a float amplitude value - """ - db = conf['db'] - - def gain_db(x): - return min(0.997, x * pow(10, db / 20)) - - return gain_db - - -def distort(x, func, rate=0.8): - """Distort a waveform in sample point level - - Args: - x: the origin wavefrom - func: the distort function - rate: sample point-level distort probability - - Returns: - the distorted waveform - """ - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - x[0][i] = func(float(x[0][i])) - return x - -def distort_chain(x, funcs, rate=0.8): - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - for func in funcs: - x[0][i] = func(float(x[0][i])) - return x - -# x is numpy -def distort_wav_conf(x, distort_type, distort_conf, rate=0.1): - if distort_type == 'gain_db': - gain_db = make_gain_db(distort_conf) - x = distort(x, gain_db) - elif distort_type == 'max_distortion': - max_distortion = make_max_distortion(distort_conf) - x = distort(x, max_distortion, rate=rate) - elif distort_type == 'fence_distortion': - fence_distortion = make_fence_distortion(distort_conf) - x = distort(x, fence_distortion, rate=rate) - elif distort_type == 'jag_distortion': - jag_distortion = make_jag_distortion(distort_conf) - x = distort(x, jag_distortion, rate=rate) - elif distort_type == 'poly_distortion': - poly_distortion = make_poly_distortion(distort_conf) - x = distort(x, poly_distortion, rate=rate) - elif distort_type == 'quad_distortion': - quad_distortion = make_quad_distortion() - x = distort(x, quad_distortion, rate=rate) - elif distort_type == 'none_distortion': - pass - else: - print('unsupport type') - return x - -def distort_wav_conf_and_save(distort_type, distort_conf, rate, wav_in, wav_out): - x, sr = torchaudio.load(wav_in) - x = x.detach().numpy() - out = distort_wav_conf(x, distort_type, distort_conf, rate) - torchaudio.save(wav_out, torch.from_numpy(out), sr) - -if __name__ == "__main__": - distort_type = sys.argv[1] - wav_in = sys.argv[2] - wav_out = sys.argv[3] - conf = None - rate = 0.1 - if distort_type == 'new_jag_distortion': - conf = {'mask_number' : 4} - elif distort_type == 'new_fence_distortion': - conf = {'mask_number' : 1, 'max_db' : -30} - elif distort_type == 'poly_distortion': - conf = {'a' : 4, 'm' : 2, "n" : 2} - distort_wav_conf_and_save(distort_type, conf, rate, wav_in, wav_out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/efficient_conformer/attention.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/efficient_conformer/attention.py deleted file mode 100644 index 475131b15af92ffcaf91ad5e2e30d114d4d1a2a3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/efficient_conformer/attention.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple, Optional - -import torch -from torch import nn -import torch.nn.functional as F -from wenet.transformer.attention import MultiHeadedAttention - - -class GroupedRelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: - https://arxiv.org/abs/1901.02860 - https://arxiv.org/abs/2109.01163 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate, group_size=3): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - self.group_size = group_size - self.d_k = n_feat // n_head # for GroupedAttention - self.n_feat = n_feat - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. 
- """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def pad4group(self, Q, K, V, P, mask, group_size: int = 3): - """ - q: (#batch, time1, size) -> (#batch, head, time1, size/head) - k,v: (#batch, time2, size) -> (#batch, head, time2, size/head) - p: (#batch, time2, size) - """ - # Compute Overflows - overflow_Q = Q.size(2) % group_size - overflow_KV = K.size(2) % group_size - - # if-else for ONNX export - # 0 // 0.00000000000000001 = 0 - # 1 // 1.00000000000000001 = 1 - padding_Q = (group_size - overflow_Q) * int( - overflow_Q // (overflow_Q + 0.00000000000000001)) - padding_KV = (group_size - overflow_KV) * int( - overflow_KV // (overflow_KV + 0.00000000000000001)) - - batch_size, _, seq_len_KV, _ = K.size() - - # Input Padding (B, T, D) -> (B, T + P, D) - Q = F.pad(Q, (0, 0, 0, padding_Q), value=0.0) - K = F.pad(K, (0, 0, 0, padding_KV), value=0.0) - V = F.pad(V, (0, 0, 0, padding_KV), value=0.0) - - if mask is not None and mask.size(2) > 0 : # time2 > 0: - mask = mask[:, ::group_size, ::group_size] - - Q = Q.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - K = K.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - V = V.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - # process pos_emb - P_batch_size = P.size(0) - overflow_P = P.size(1) % group_size - padding_P = group_size - overflow_P if overflow_P else 0 - P = F.pad(P, (0, 0, 0, padding_P), value=0.0) - P = P.view(P_batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - return Q, K, V, P, mask, padding_Q - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - padding_q: Optional[int] = None - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - padding_q : for GroupedAttention in efficent conformer - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. 
jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - - # n_feat!=h*d_k may be happened in GroupAttention - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.n_feat) - ) # (batch, time1, d_model) - if padding_q is not None: - # for GroupedAttention in efficent conformer - x = x[:, :x.size(1) - padding_q] - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q = self.linear_q(query) - k = self.linear_k(key) # (#batch, time2, size) - v = self.linear_v(value) - p = self.linear_pos(pos_emb) # (#batch, time2, size) - - batch_size, seq_len_KV, _ = k.size() # seq_len_KV = time2 - - # (#batch, time2, size) -> (#batch, head, time2, size/head) - q = q.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - k = k.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - v = v.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - if cache.size(0) > 0: - # use attention cache - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - new_cache = torch.cat((k, v), dim=-1) - - # May be k and p does not match. eg. time2=18+18/2=27 > mask=36/2=18 - if mask is not None and mask.size(2) > 0: - time2 = mask.size(2) - k = k[:, :, -time2:, :] - v = v[:, :, -time2:, :] - - # q k v p: (batch, head, time1, d_k) - q, k, v, p, mask, padding_q = self.pad4group(q, k, v, p, mask, self.group_size) - - # q_with_bias_u & q_with_bias_v = (batch, head, time1, d_k) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. 
- # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k * self.group_size) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask, padding_q), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/efficient_conformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/efficient_conformer/convolution.py deleted file mode 100644 index 52d6c1c14c0812ab3957a60a135f644833c2ad95..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/efficient_conformer/convolution.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - stride: int = 1): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - stride (int): Stride Convolution, for efficient Conformer - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. 
- # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=stride, # for depthwise_conv in StrideConv - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - self.stride = stride - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - # When export ONNX,the first cache is not None but all-zero, - # cause shape error in residual block, - # eg. cache14 + x9 = 23, 23-7+1=17 != 9 - cache = cache[:, :, -self.lorder:] - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is requried, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - if mask_pad.size(2) != x.size(2): - mask_pad = mask_pad[:, :, ::self.stride] - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/efficient_conformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/efficient_conformer/encoder.py deleted file mode 100644 index dbd37f53cac86be851e2bb194354fd07eb271f11..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/efficient_conformer/encoder.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from EfficientConformer(https://github.com/burchim/EfficientConformer) -# Paper(https://arxiv.org/abs/2109.01163) - -"""Encoder definition.""" -from typing import Tuple, Optional, List, Union - -import torch -import logging -from typeguard import check_argument_types -import torch.nn.functional as F - -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.encoder_layer import ConformerEncoderLayer - -from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 -from wenet.efficient_conformer.convolution import ConvolutionModule -from wenet.efficient_conformer.attention import GroupedRelPositionMultiHeadedAttention -from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer - -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class EfficientConformerEncoder(torch.nn.Module): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - macaron_style: bool = True, - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - stride_layer_idx: Optional[Union[int, List[int]]] = 3, - stride: Optional[Union[int, List[int]]] = 2, - group_layer_idx: Optional[Union[int, List[int], tuple]] = (0, 1, 2, 3), - group_size: int = 3, - stride_kernel: bool = True, - **kwargs - ): - """Construct Efficient Conformer Encoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - macaron_style (bool): Whether to use macaron style for - positionwise layer. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. - stride_layer_idx (list): layer id with StrideConv, start from 0 - stride (list): stride size of each StrideConv in efficient conformer - group_layer_idx (list): layer id with GroupedAttention, start from 0 - group_size (int): group size of every GroupedAttention layer - stride_kernel (bool): default True. True: recompute cnn kernels with stride. 
- """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d2": - subsampling_class = Conv2dSubsampling2 - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - logging.info(f"input_layer = {input_layer}, " - f"subsampling_class = {subsampling_class}") - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - self.input_layer = input_layer - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - activation = get_activation(activation_type) - self.num_blocks = num_blocks - self.attention_heads = attention_heads - self.cnn_module_kernel = cnn_module_kernel - self.global_chunk_size = 0 - self.chunk_feature_map = 0 - - # efficient conformer configs - self.stride_layer_idx = [stride_layer_idx] \ - if type(stride_layer_idx) == int else stride_layer_idx - self.stride = [stride] \ - if type(stride) == int else stride - self.group_layer_idx = [group_layer_idx] \ - if type(group_layer_idx) == int else group_layer_idx - self.grouped_size = group_size # group size of every GroupedAttention layer - - assert len(self.stride) == len(self.stride_layer_idx) - self.cnn_module_kernels = [cnn_module_kernel] # kernel size of each StridedConv - for i in self.stride: - if stride_kernel: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1] // i) - else: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1]) - - logging.info(f"stride_layer_idx= {self.stride_layer_idx}, " - f"stride = {self.stride}, " - f"cnn_module_kernel = {self.cnn_module_kernels}, " - f"group_layer_idx = {self.group_layer_idx}, " - f"grouped_size = {self.grouped_size}") - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - - # encoder definition - index = 0 - layers = [] - for i in range(num_blocks): - # self-attention module definition - if i in self.group_layer_idx: - encoder_selfattn_layer = GroupedRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - self.grouped_size) - else: - if pos_enc_layer_type == "no_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate) - - # conformer module definition - if i in self.stride_layer_idx: - # conformer block with downsampling - convolution_layer_args_stride = ( - output_size, 
self.cnn_module_kernels[index], activation, - cnn_module_norm, causal, True, self.stride[index]) - layers.append(StrideConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_stride) if use_cnn_module else None, - torch.nn.AvgPool1d( - kernel_size=self.stride[index], stride=self.stride[index], - padding=0, ceil_mode=True, - count_include_pad=False), # pointwise_conv_layer - dropout_rate, - normalize_before, - concat_after, - )) - index = index + 1 - else: - # conformer block - convolution_layer_args_normal = ( - output_size, self.cnn_module_kernels[index], activation, - cnn_module_norm, causal) - layers.append(ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_normal) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - )) - - self.encoders = torch.nn.ModuleList(layers) - - def set_global_chunk_size(self, chunk_size): - """Used in ONNX export. - """ - logging.info(f"set global chunk size: {chunk_size}, default is 0.") - self.global_chunk_size = chunk_size - if self.embed.subsampling_rate == 2: - self.chunk_feature_map = 2 * self.global_chunk_size + 1 - elif self.embed.subsampling_rate == 6: - self.chunk_feature_map = 6 * self.global_chunk_size + 5 - elif self.embed.subsampling_rate == 8: - self.chunk_feature_map = 8 * self.global_chunk_size + 7 - else: - self.chunk_feature_map = 4 * self.global_chunk_size + 3 - - def output_size(self) -> int: - return self._output_size - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - index = 0 # traverse stride - for i, layer in enumerate(self.encoders): - # layer return : x, mask, new_att_cache, new_cnn_cache - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if i in self.stride_layer_idx: - masks = masks[:, :, ::self.stride[index]] - chunk_masks = chunk_masks[:, ::self.stride[index], - ::self.stride[index]] - mask_pad = masks - pos_emb = pos_emb[:, ::self.stride[index], :] - index = index + 1 - - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask : mask matrix of self attention - - Returns: - torch.Tensor: output of current input xs - torch.Tensor: subsampling cache required for next chunk computation - List[torch.Tensor]: encoder layers output cache required for next - chunk computation - List[torch.Tensor]: conformer cnn cache - - """ - assert xs.size(0) == 1 - - # using downsampling factor to recover offset - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - chunk_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - chunk_masks = chunk_masks.unsqueeze(1) # (1, 1, xs-time) - - real_len = 0 - if self.global_chunk_size > 0: - # for ONNX decode simulation, padding xs to chunk_size - real_len = xs.size(1) - pad_len = self.chunk_feature_map - real_len - xs = F.pad(xs, (0, 0, 0, pad_len), value=0.0) - chunk_masks = F.pad(chunk_masks, (0, pad_len), value=0.0) - - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, chunk_masks = self.embed(xs, chunk_masks, offset) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - # shape(pos_emb) = (b=1, chunk_size, emb_size=output_size=hidden-dim) - - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) # batchPad (b=1, 1, time=chunk_size) - - if self.global_chunk_size > 0: - # for ONNX decode simulation - pos_emb = self.embed.position_encoding( - offset=max(offset - cache_t1, 0), - size=cache_t1 + self.global_chunk_size) - att_mask[:, :, -self.global_chunk_size:] = chunk_masks - mask_pad = chunk_masks.to(torch.bool) - else: - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(2) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - att_cache_trunc = xs.size(1) + \ - att_cache.size(2) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i:i + 1, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # 
shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [1, batch, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(0) - - # use repeat_interleave to new_att_cache - new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :]) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.global_chunk_size > 0 and real_len: - chunk_real_len = real_len // self.embed.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - # Keeping 1 more timestep can mitigate information leakage - # from the encoder caused by the padding - xs = xs[:, :chunk_real_len + 1, :] - - return xs, r_att_cache, r_cnn_cache - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - use_onnx=False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. - Args: - xs (torch.Tensor): (1, max_len, dim) - decoding_chunk_size (int): decoding chunk size - num_decoding_left_chunks (int): - use_onnx (bool): True for simulating ONNX model inference. 
- """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if use_onnx: - logging.info("Simulating for ONNX runtime ...") - att_cache: torch.Tensor = torch.zeros( - (self.num_blocks, self.attention_heads, required_cache_size, - self.output_size() // self.attention_heads * 2), - device=xs.device) - cnn_cache: torch.Tensor = torch.zeros( - (self.num_blocks, 1, self.output_size(), self.cnn_module_kernel - 1), - device=xs.device) - self.set_global_chunk_size(chunk_size=decoding_chunk_size) - else: - logging.info("Simulating for JIT runtime ...") - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - logging.info(f"-->> frame chunk msg: cur={cur}, " - f"end={end}, num_frames={end-cur}, " - f"decoding_window={decoding_window}") - if use_onnx: - att_mask: torch.Tensor = torch.ones( - (1, 1, required_cache_size + decoding_chunk_size), - dtype=torch.bool, device=xs.device) - if cur == 0: - att_mask[:, :, :required_cache_size] = 0 - else: - att_mask: torch.Tensor = torch.ones( - (0, 0, 0), dtype=torch.bool, device=xs.device) - - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - outputs.append(y) - offset += y.size(1) - - ys = torch.cat(outputs, 1) - masks = torch.ones(1, 1, ys.size(1), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/efficient_conformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/efficient_conformer/encoder_layer.py deleted file mode 100644 index 3a88ec9fca9797664ce89566e6c1d28a8f0ad5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/efficient_conformer/encoder_layer.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple -import torch -from torch import nn - - -class StrideConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. 
- self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - pointwise_conv_layer: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.pointwise_conv_layer = pointwise_conv_layer - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - - # add pointwise_conv for efficient conformer - # pointwise_conv_layer does not change shape - if self.pointwise_conv_layer is not None: - residual = residual.transpose(1, 2) - residual = self.pointwise_conv_layer(residual) - residual = residual.transpose(1, 2) - assert residual.size(0) == x.size(0) - assert residual.size(1) == x.size(1) - assert residual.size(2) == x.size(2) - - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/efficient_conformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/efficient_conformer/subsampling.py deleted file mode 100644 index 98b2c2228eac8e77586110686c48a7b0141458c9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/efficient_conformer/subsampling.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch -from wenet.transformer.subsampling import BaseSubsampling - - -class Conv2dSubsampling2(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU() - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * ((idim - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 2 - # 2 = (3 - 1) * 1 - self.right_context = 2 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 2. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 2. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, :-2:2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/squeezeformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/squeezeformer/attention.py deleted file mode 100644 index 97412badbe8e2c5caec81c0636d15be3f80d6b84..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/squeezeformer/attention.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 Ximalaya Inc. (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -import torch -import torch.nn as nn -from wenet.transformer.attention import MultiHeadedAttention -from typing import Tuple - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- """ - - def __init__(self, n_head, n_feat, dropout_rate, - do_rel_shift=False, adaptive_scale=False, init_weights=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.do_rel_shift = do_rel_shift - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - self.adaptive_scale = adaptive_scale - self.ada_scale = nn.Parameter( - torch.ones([1, 1, n_feat]), requires_grad=adaptive_scale) - self.ada_bias = nn.Parameter( - torch.zeros([1, 1, n_feat]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - input_max = (self.h * self.d_k) ** -0.5 - torch.nn.init.uniform_(self.linear_q.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_q.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_pos.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.bias, -input_max, input_max) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. 
pytorch training - if mask.size(2) > 0: # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - # (batch, head, time1, time2) - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - if self.adaptive_scale: - query = self.ada_scale * query + self.ada_bias - key = self.ada_scale * key + self.ada_bias - value = self.ada_scale * value + self.ada_bias - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - if self.do_rel_shift: - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/squeezeformer/conv2d.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/squeezeformer/conv2d.py deleted file mode 100644 index c230263396392d72f36c56d645338f2d576db898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/squeezeformer/conv2d.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Conv2d Module with Valid Padding""" - -import torch.nn.functional as F -from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional - - -class Conv2dValid(_ConvNd): - """ - Conv2d operator for VALID mode padding. 
- """ - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', # TODO: refine this type - device=None, - dtype=None, - valid_trigx: bool = False, - valid_trigy: bool = False - ) -> None: - factory_kwargs = {'device': device, 'dtype': dtype} - kernel_size_ = _pair(kernel_size) - stride_ = _pair(stride) - padding_ = padding if isinstance(padding, str) else _pair(padding) - dilation_ = _pair(dilation) - super(Conv2dValid, self).__init__( - in_channels, out_channels, kernel_size_, - stride_, padding_, dilation_, False, _pair(0), - groups, bias, padding_mode, **factory_kwargs) - self.valid_trigx = valid_trigx - self.valid_trigy = valid_trigy - - def _conv_forward( - self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): - validx, validy = 0, 0 - if self.valid_trigx: - validx = (input.size(-2) * (self.stride[-2] - 1) - 1 - + self.kernel_size[-2]) // 2 - if self.valid_trigy: - validy = (input.size(-1) * (self.stride[-1] - 1) - 1 - + self.kernel_size[-1]) // 2 - return F.conv2d(input, weight, bias, self.stride, - (validx, validy), self.dilation, self.groups) - - def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight, self.bias) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/squeezeformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/squeezeformer/convolution.py deleted file mode 100644 index 6da2ee8c98ed58fae66d66c892041037f0d6bc3a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/squeezeformer/convolution.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. 
- causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - self.bias = bias - self.channels = channels - self.kernel_size = kernel_size - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, channels]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, channels]), requires_grad=adaptive_scale) - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - if init_weights: - self.init_weights() - - def init_weights(self): - pw_max = self.channels ** -0.5 - dw_max = self.kernel_size ** -0.5 - torch.nn.init.uniform_(self.pointwise_conv1.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv1.bias.data, -pw_max, pw_max) - torch.nn.init.uniform_(self.depthwise_conv.weight.data, -dw_max, dw_max) - if self.bias: - torch.nn.init.uniform_(self.depthwise_conv.bias.data, -dw_max, dw_max) - torch.nn.init.uniform_(self.pointwise_conv2.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv2.bias.data, -pw_max, pw_max) - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - if self.adaptive_scale: - x = self.ada_scale * x + self.ada_bias - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. 
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/squeezeformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/squeezeformer/encoder.py deleted file mode 100644 index f13038321ae6c07d484a617aee7d83ed07742510..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/squeezeformer/encoder.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -import torch -import torch.nn as nn -from typing import Tuple, Union, Optional, List -from wenet.squeezeformer.subsampling \ - import DepthwiseConv2dSubsampling4, TimeReductionLayer1D, \ - TimeReductionLayer2D, TimeReductionLayerStream -from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.attention import MultiHeadedAttention -from wenet.squeezeformer.attention import RelPositionMultiHeadedAttention -from wenet.squeezeformer.positionwise_feed_forward \ - import PositionwiseFeedForward -from wenet.squeezeformer.convolution import ConvolutionModule -from wenet.utils.mask import make_pad_mask, add_optional_chunk_mask -from wenet.utils.common import get_activation - - -class SqueezeformerEncoder(nn.Module): - def __init__( - self, - input_size: int = 80, - encoder_dim: int = 256, - output_size: int = 256, - attention_heads: int = 4, - num_blocks: int = 12, - reduce_idx: Optional[Union[int, List[int]]] = 5, - recover_idx: Optional[Union[int, List[int]]] = 11, - feed_forward_expansion_factor: int = 4, - dw_stride: bool = False, - input_dropout_rate: float = 0.1, - pos_enc_layer_type: str = "rel_pos", - time_reduction_layer_type: str = "conv1d", - do_rel_shift: bool = True, - feed_forward_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.1, - cnn_module_kernel: int = 31, - cnn_norm_type: str = "batch_norm", - dropout: float = 0.1, - causal: bool = False, - adaptive_scale: bool = True, - activation_type: str = "swish", - init_weights: bool = True, - global_cmvn: torch.nn.Module = None, - normalize_before: bool = False, - use_dynamic_chunk: bool = False, - concat_after: bool = False, - 
static_chunk_size: int = 0, - use_dynamic_left_chunk: bool = False - ): - """Construct SqueezeformerEncoder - - Args: - input_size to use_dynamic_chunk, see in Transformer BaseEncoder. - encoder_dim (int): The hidden dimension of encoder layer. - output_size (int): The output dimension of final projection layer. - attention_heads (int): Num of attention head in attention module. - num_blocks (int): Num of encoder layers. - reduce_idx Optional[Union[int, List[int]]]: - reduce layer index, from 40ms to 80ms per frame. - recover_idx Optional[Union[int, List[int]]]: - recover layer index, from 80ms to 40ms per frame. - feed_forward_expansion_factor (int): Enlarge coefficient of FFN. - dw_stride (bool): Whether do depthwise convolution - on subsampling module. - input_dropout_rate (float): Dropout rate of input projection layer. - pos_enc_layer_type (str): Self attention type. - time_reduction_layer_type (str): Conv1d or Conv2d reduction layer. - do_rel_shift (bool): Whether to do relative shift - operation on rel-attention module. - cnn_module_kernel (int): Kernel size of CNN module. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - adaptive_scale (bool): Whether to use adaptive scale. - init_weights (bool): Whether to initialize weights. - causal (bool): whether to use causal convolution or not. - """ - super(SqueezeformerEncoder, self).__init__() - self.global_cmvn = global_cmvn - self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ - if type(reduce_idx) == int else reduce_idx - self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \ - if type(recover_idx) == int else recover_idx - self.check_ascending_list() - if reduce_idx is None: - self.time_reduce = None - else: - if recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - self.reduce_stride = 2 - self._output_size = output_size - self.normalize_before = normalize_before - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - self.pos_enc_layer_type = pos_enc_layer_type - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - encoder_dim, - attention_dropout_rate, - do_rel_shift, - adaptive_scale, - init_weights - ) - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - encoder_dim, - encoder_dim * feed_forward_expansion_factor, - feed_forward_dropout_rate, - activation, - adaptive_scale, - init_weights - ) - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = ( - encoder_dim, cnn_module_kernel, activation, - cnn_norm_type, causal, True, adaptive_scale, init_weights) - - self.embed = DepthwiseConv2dSubsampling4( - 1, encoder_dim, - RelPositionalEncoding(encoder_dim, dropout_rate=0.1), - dw_stride, - input_size, - input_dropout_rate, - init_weights - ) - - self.preln = nn.LayerNorm(encoder_dim) - self.encoders = 
torch.nn.ModuleList([SqueezeformerEncoderLayer( - encoder_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - convolution_layer(*convolution_layer_args), - positionwise_layer(*positionwise_layer_args), - normalize_before, - dropout, - concat_after) for _ in range(num_blocks) - ]) - if time_reduction_layer_type == 'conv1d': - time_reduction_layer = TimeReductionLayer1D - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - elif time_reduction_layer_type == 'stream': - time_reduction_layer = TimeReductionLayerStream - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - else: - time_reduction_layer = TimeReductionLayer2D - time_reduction_layer_args = {'encoder_dim': encoder_dim} - - self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args) - self.time_recover_layer = nn.Linear(encoder_dim, encoder_dim) - self.final_proj = None - if output_size != encoder_dim: - self.final_proj = nn.Linear(encoder_dim, output_size) - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - xs_lens = mask_pad.squeeze(1).sum(1) - xs = self.preln(xs) - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - for i, layer in enumerate(self.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, chunk_masks, pos_emb, mask_pad)) - xs, xs_lens, chunk_masks, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_chunk_masks, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - chunk_masks = recover_chunk_masks - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - xs = xs.masked_fill(~mask_pad[:, 0, :].unsqueeze(-1), 0.0) - - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return xs, masks - - def check_ascending_list(self): - if self.reduce_idx is not None: - assert self.reduce_idx == sorted(self.reduce_idx), \ - "reduce_idx should be int or ascending list" - if self.recover_idx is not None: - assert self.recover_idx == sorted(self.recover_idx), \ - "recover_idx should be int or ascending list" - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp 
= exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - recover_exp)) - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.preln(xs) - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, 
recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - if att_mask.size(1) != 0: - xs = xs.masked_fill(~att_mask[:, 0, :].unsqueeze(-1), 0.0) - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(0) - cached_att = cached_att.unsqueeze(3).\ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :]) - r_cnn_cache.append(cached_cnn) - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/squeezeformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/squeezeformer/encoder_layer.py deleted file mode 100644 index 3c6bdd44a20447cea91c0f965c666b844f4264be..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/squeezeformer/encoder_layer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""SqueezeformerEncoderLayer definition.""" - -import torch -import torch.nn as nn -from typing import Optional, Tuple - - -class SqueezeformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward1 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - feed_forward2 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. 
- """ - - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward1: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - feed_forward2: Optional[nn.Module] = None, - normalize_before: bool = False, - dropout_rate: float = 0.1, - concat_after: bool = False, - ): - super(SqueezeformerEncoderLayer, self).__init__() - self.size = size - self.self_attn = self_attn - self.layer_norm1 = nn.LayerNorm(size) - self.ffn1 = feed_forward1 - self.layer_norm2 = nn.LayerNorm(size) - self.conv_module = conv_module - self.layer_norm3 = nn.LayerNorm(size) - self.ffn2 = feed_forward2 - self.layer_norm4 = nn.LayerNorm(size) - self.normalize_before = normalize_before - self.dropout = nn.Dropout(dropout_rate) - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - # self attention module - residual = x - if self.normalize_before: - x = self.layer_norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.layer_norm1(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm2(x) - x = self.ffn1(x) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm2(x) - - # conv module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - residual = x - if self.normalize_before: - x = self.layer_norm3(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm3(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm4(x) - x = self.ffn2(x) - # we do not use dropout here since it is inside feed forward function - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm4(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/squeezeformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/squeezeformer/positionwise_feed_forward.py deleted file mode 100644 index 289062dcf3189f79a5ebb206990160d8665c613c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/squeezeformer/positionwise_feed_forward.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU(), - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.idim = idim - self.hidden_units = hidden_units - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - self.ada_scale = None - self.ada_bias = None - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, idim]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, idim]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - ffn1_max = self.idim ** -0.5 - ffn2_max = self.hidden_units ** -0.5 - torch.nn.init.uniform_(self.w_1.weight.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_1.bias.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_2.weight.data, -ffn2_max, ffn2_max) - torch.nn.init.uniform_(self.w_2.bias.data, -ffn2_max, ffn2_max) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - if self.adaptive_scale: - xs = self.ada_scale * xs + self.ada_bias - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/squeezeformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/squeezeformer/subsampling.py deleted file mode 100644 index fdb0101d6ebb54c42e710bbb0f35a6f7615ca567..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/squeezeformer/subsampling.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -"""DepthwiseConv2dSubsampling4 and TimeReductionLayer definition.""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -from wenet.transformer.subsampling import BaseSubsampling -from typing import Tuple -from wenet.squeezeformer.conv2d import Conv2dValid - - -class DepthwiseConv2dSubsampling4(BaseSubsampling): - """Depthwise Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - pos_enc_class (nn.Module): position encoding class. - dw_stride (int): Whether do depthwise convolution. - input_size (int): filter bank dimension. - - """ - - def __init__( - self, idim: int, odim: int, - pos_enc_class: torch.nn.Module, - dw_stride: bool = False, - input_size: int = 80, - input_dropout_rate: float = 0.1, - init_weights: bool = True - ): - super(DepthwiseConv2dSubsampling4, self).__init__() - self.idim = idim - self.odim = odim - self.pw_conv = nn.Conv2d( - in_channels=idim, out_channels=odim, kernel_size=3, stride=2) - self.act1 = nn.ReLU() - self.dw_conv = nn.Conv2d( - in_channels=odim, out_channels=odim, kernel_size=3, stride=2, - groups=odim if dw_stride else 1 - ) - self.act2 = nn.ReLU() - self.pos_enc = pos_enc_class - self.input_proj = nn.Sequential( - nn.Linear( - odim * (((input_size - 1) // 2 - 1) // 2), odim), - nn.Dropout(p=input_dropout_rate), - ) - if init_weights: - linear_max = (odim * input_size / 4) ** -0.5 - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.weight'], -linear_max, linear_max) - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.bias'], -linear_max, linear_max) - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: int = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.pw_conv(x) - x = self.act1(x) - x = self.dw_conv(x) - x = self.act2(x) - b, c, t, f = x.size() - x = x.permute(0, 2, 1, 3) - x = x.contiguous().view(b, t, c * f) - x, pos_emb = self.pos_enc(x, offset) - x = self.input_proj(x) - return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] - - -class TimeReductionLayer1D(nn.Module): - """ - Modified NeMo, - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. 
- """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 5, stride: int = 2): - super(TimeReductionLayer1D, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - self.padding = max(0, self.kernel_size - self.stride) - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. - if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayer2D(nn.Module): - def __init__( - self, kernel_size: int = 5, stride: int = 2, encoder_dim: int = 256): - super(TimeReductionLayer2D, self).__init__() - self.encoder_dim = encoder_dim - self.kernel_size = kernel_size - self.dw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=(kernel_size, 1), - stride=stride, - valid_trigy=True - ) - self.pw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=1, - stride=1, - valid_trigx=False, - valid_trigy=False, - ) - - self.kernel_size = kernel_size - self.stride = stride - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.encoder_dim ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward( - self, xs: torch.Tensor, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - xs = xs.masked_fill(mask_pad.transpose(1, 2).eq(0), 0.0) - xs = xs.unsqueeze(2) - padding1 = self.kernel_size - self.stride - xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), - mode='constant', value=0.) 
- xs = self.dw_conv(xs.permute(0, 3, 1, 2)) - xs = self.pw_conv(xs).permute(0, 3, 2, 1).squeeze(1).contiguous() - tmp_length = xs.size(1) - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - padding2 = max(0, (xs_lens.max() - tmp_length).data.item()) - batch_size, hidden = xs.size(0), xs.size(-1) - dummy_pad = torch.zeros(batch_size, padding2, hidden, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - mask = mask[:, ::2, ::2] - mask_pad = mask_pad[:, :, ::2] - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayerStream(nn.Module): - """ - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. - """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 1, stride: int = 2): - super(TimeReductionLayerStream, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=0, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. 
- if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transducer/joint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transducer/joint.py deleted file mode 100644 index f7cbaf62ee0bf4ffa127e5bbf4a49a64c2378495..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transducer/joint.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Optional - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation - - -class TransducerJoint(torch.nn.Module): - - def __init__(self, - voca_size: int, - enc_output_size: int, - pred_output_size: int, - join_dim: int, - prejoin_linear: bool = True, - postjoin_linear: bool = False, - joint_mode: str = 'add', - activation: str = "tanh"): - assert check_argument_types() - # TODO(Mddct): concat in future - assert joint_mode in ['add'] - super().__init__() - - self.activatoin = get_activation(activation) - self.prejoin_linear = prejoin_linear - self.postjoin_linear = postjoin_linear - self.joint_mode = joint_mode - - if not self.prejoin_linear and not self.postjoin_linear: - assert enc_output_size == pred_output_size == join_dim - # torchscript compatibility - self.enc_ffn: Optional[nn.Linear] = None - self.pred_ffn: Optional[nn.Linear] = None - if self.prejoin_linear: - self.enc_ffn = nn.Linear(enc_output_size, join_dim) - self.pred_ffn = nn.Linear(pred_output_size, join_dim) - # torchscript compatibility - self.post_ffn: Optional[nn.Linear] = None - if self.postjoin_linear: - self.post_ffn = nn.Linear(join_dim, join_dim) - - self.ffn_out = nn.Linear(join_dim, voca_size) - - def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor): - """ - Args: - enc_out (torch.Tensor): [B, T, E] - pred_out (torch.Tensor): [B, T, P] - Return: - [B,T,U,V] - """ - if (self.prejoin_linear and self.enc_ffn is not None - and self.pred_ffn is not None): - enc_out = self.enc_ffn(enc_out) # [B,T,E] -> [B,T,V] - pred_out = self.pred_ffn(pred_out) - - enc_out = enc_out.unsqueeze(2) # [B,T,V] -> [B,T,1,V] - pred_out = pred_out.unsqueeze(1) # [B,U,V] -> [B,1 U, V] - - # TODO(Mddct): concat joint - _ = self.joint_mode - out = enc_out + pred_out # [B,T,U,V] - - if self.postjoin_linear and self.post_ffn is not None: - out = self.post_ffn(out) - - out = self.activatoin(out) - out = self.ffn_out(out) - return out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transducer/predictor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transducer/predictor.py deleted file mode 100644 index 600e97a9d83646047ec3fc14f3087bd4df761c68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transducer/predictor.py +++ /dev/null @@ -1,482 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation, get_rnn - - -def ApplyPadding(input, padding, pad_value) -> torch.Tensor: - """ - Args: - input: [bs, max_time_step, dim] - padding: [bs, max_time_step] - """ - return padding * pad_value + input * 
(1 - padding) - - -class PredictorBase(torch.nn.Module): - - # NOTE(Mddct): We can use ABC abstract here, but - # keep this class simple enough for now - def __init__(self) -> None: - super().__init__() - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - _, _, _ = batch_size, method, device - raise NotImplementedError("this is a base precictor") - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ): - _, _, = input, cache - raise NotImplementedError("this is a base precictor") - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - _, _, _, = input, padding, cache - raise NotImplementedError("this is a base precictor") - - -class RNNPredictor(PredictorBase): - - def __init__(self, - voca_size: int, - embed_size: int, - output_size: int, - embed_dropout: float, - hidden_size: int, - num_layers: int, - bias: bool = True, - rnn_type: str = "lstm", - dropout: float = 0.1) -> None: - assert check_argument_types() - super().__init__() - self.n_layers = num_layers - self.hidden_size = hidden_size - # disable rnn base out projection - self.embed = nn.Embedding(voca_size, embed_size) - self.dropout = nn.Dropout(embed_dropout) - # NOTE(Mddct): rnn base from torch not support layer norm - # will add layer norm and prune value in cell and layer - # ref: https://github.com/Mddct/neural-lm/blob/main/models/gru_cell.py - self.rnn = get_rnn(rnn_type=rnn_type)(input_size=embed_size, - hidden_size=hidden_size, - num_layers=num_layers, - bias=bias, - batch_first=True, - dropout=dropout) - self.projection = nn.Linear(hidden_size, output_size) - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> torch.Tensor: - """ - Args: - input (torch.Tensor): [batch, max_time). - padding (torch.Tensor): [batch, max_time] - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - Returns: - output: [batch, max_time, output_size] - """ - - # NOTE(Mddct): we don't use pack input format - embed = self.embed(input) # [batch, max_time, emb_size] - embed = self.dropout(embed) - states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None - if cache is None: - state = self.init_state(batch_size=input.size(0), - device=input.device) - states = (state[0], state[1]) - else: - assert len(cache) == 2 - states = (cache[0], cache[1]) - out, (m, c) = self.rnn(embed, states) - out = self.projection(out) - - # NOTE(Mddct): Although we don't use staate in transducer - # training forward, we need make it right for padding value - # so we create forward_step for infering, forward for training - _, _ = m, c - return out - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache: [state_m, state_c] - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - Returns: - new_cache: [[state_m_1, state_c_1], [state_m_2, state_c_2]...] 
- """ - assert len(cache) == 2 - state_ms = cache[0] - state_cs = cache[1] - - assert state_ms.size(1) == state_cs.size(1) - - new_cache: List[List[torch.Tensor]] = [] - for state_m, state_c in zip(torch.split(state_ms, 1, dim=1), - torch.split(state_cs, 1, dim=1)): - new_cache.append([state_m, state_c]) - return new_cache - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[state_m_1, state_c_1], [state_m_1, state_c_1]...] - - Returns: - new_caceh: [state_ms, state_cs], - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - """ - state_ms = torch.cat([states[0] for states in cache], dim=1) - state_cs = torch.cat([states[1] for states in cache], dim=1) - return [state_ms, state_cs] - - def init_state( - self, - batch_size: int, - device: torch.device, - method: str = "zero", - ) -> List[torch.Tensor]: - assert batch_size > 0 - # TODO(Mddct): xavier init method - _ = method - return [ - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device), - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device) - ] - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - """ - assert len(cache) == 2 - state_m, state_c = cache[0], cache[1] - embed = self.embed(input) # [batch, 1, emb_size] - embed = self.dropout(embed) - out, (m, c) = self.rnn(embed, (state_m, state_c)) - - out = self.projection(out) - m = ApplyPadding(m, padding.unsqueeze(0), state_m) - c = ApplyPadding(c, padding.unsqueeze(0), state_c) - - return (out, [m, c]) - - -class EmbeddingPredictor(PredictorBase): - """Embedding predictor - - Described in: - https://arxiv.org/pdf/2109.07513.pdf - - embed-> proj -> layer norm -> swish - """ - - def __init__(self, - voca_size: int, - embed_size: int, - embed_dropout: float, - n_head: int, - history_size: int = 2, - activation: str = "swish", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - - assert check_argument_types() - super().__init__() - # multi head - self.num_heads = n_head - self.embed_size = embed_size - self.context_size = history_size + 1 - self.pos_embed = torch.nn.Linear(embed_size * self.context_size, - self.num_heads, - bias=bias) - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.ffn = nn.Linear(self.embed_size, self.embed_size) - self.norm = nn.LayerNorm(self.embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - _ = method - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device), - ] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] 
- """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - - input = input.unfold(1, self.context_size, 1).permute( - 0, 1, 3, 2) # [bs, seq_len, context_size, embed] - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - # broadcast dot attenton - input_expand = input.unsqueeze( - 2) # [bs, seq_len, 1, context_size, embed] - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - - # [bs, seq_len, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, seq_len, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, seq_len, num_heads, embed] - output = output.sum(dim=2) # [bs, seq_len, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - return output - - def forward_step( - self, - input: torch.Tensor, - padding: torch.Tensor, - cache: List[torch.Tensor], - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input_expand = context_input.unsqueeze(1).unsqueeze( - 2) # [bs, 1, 1, context_size, embed] - - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - # [bs, 1, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, 1, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, 1, num_heads, embed] - output = output.sum(dim=2) # [bs, 1, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - new_cache = context_input[:, 1:, :] - # TODO(Mddct): we need padding new_cache in future - # new_cache = ApplyPadding(history, padding, new_cache) - return (output, [new_cache]) - - -class ConvPredictor(PredictorBase): - - def __init__(self, - voca_size: 
int, - embed_size: int, - embed_dropout: float, - history_size: int = 2, - activation: str = "relu", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - assert check_argument_types() - super().__init__() - - assert history_size >= 0 - self.embed_size = embed_size - self.context_size = history_size + 1 - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.conv = nn.Conv1d(in_channels=embed_size, - out_channels=embed_size, - kernel_size=self.context_size, - padding=0, - groups=embed_size, - bias=bias) - self.norm = nn.LayerNorm(embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - assert method == "zero" - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device) - ] - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] - """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - input = input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - return out - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input = context_input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - - new_cache = context_input[:, 1:, :] - # TODO(Mddct): apply padding in future - return (out, [new_cache]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transducer/search/greedy_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transducer/search/greedy_search.py deleted file mode 100644 index ef7354562b6617b7be33bf32d673117eb1d3d547..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transducer/search/greedy_search.py +++ /dev/null @@ -1,54 +0,0 @@ -from 
typing import List - -import torch - - -def basic_greedy_search( - model: torch.nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - n_steps: int = 64, -) -> List[List[int]]: - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, - method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = n_steps - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, padding, - cache) # [1, 1, P] - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, - pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - - return [hyps] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transducer/search/prefix_beam_search.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transducer/search/prefix_beam_search.py deleted file mode 100644 index f00917717c16a73916586708ebfede54fa02a21f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transducer/search/prefix_beam_search.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import List, Tuple - -import torch -from wenet.utils.common import log_add - - -class Sequence(): - - __slots__ = {'hyp', 'score', 'cache'} - - def __init__( - self, - hyp: List[torch.Tensor], - score, - cache: List[torch.Tensor], - ): - self.hyp = hyp - self.score = score - self.cache = cache - - -class PrefixBeamSearch(): - - def __init__(self, encoder, predictor, joint, ctc, blank): - self.encoder = encoder - self.predictor = predictor - self.joint = joint - self.ctc = ctc - self.blank = blank - - def forward_decoder_one_step( - self, encoder_x: torch.Tensor, pre_t: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - padding = torch.zeros(pre_t.size(0), 1, device=encoder_x.device) - pre_t, new_cache = self.predictor.forward_step(pre_t.unsqueeze(-1), - padding, cache) - x = self.joint(encoder_x, pre_t) # [beam, 1, 1, vocab] - x = x.log_softmax(dim=-1) - return x, new_cache - - def prefix_beam_search(self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7): - """prefix beam search - also see wenet.transducer.transducer.beam_search - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - assert batch_size 
== 1 - - # 1. Encoder - encoder_out, _ = self.encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - - ctc_probs = self.ctc.log_softmax(encoder_out).squeeze(0) - beam_init: List[Sequence] = [] - - # 2. init beam using Sequence to save beam unit - cache = self.predictor.init_state(1, method="zero", device=device) - beam_init.append(Sequence(hyp=[self.blank], score=0.0, cache=cache)) - # 3. start decoding (notice: we use breathwise first searching) - # !!!! In this decoding method: one frame do not output multi units. !!!! - # !!!! Experiments show that this strategy has little impact !!!! - for i in range(maxlen): - # 3.1 building input - # decoder taking the last token to predict the next token - input_hyp = [s.hyp[-1] for s in beam_init] - input_hyp_tensor = torch.tensor(input_hyp, - dtype=torch.int, - device=device) - # building statement from beam - cache_batch = self.predictor.cache_to_batch( - [s.cache for s in beam_init]) - # build score tensor to do torch.add() function - scores = torch.tensor([s.score for s in beam_init]).to(device) - - # 3.2 forward decoder - logp, new_cache = self.forward_decoder_one_step( - encoder_out[:, i, :].unsqueeze(1), - input_hyp_tensor, - cache_batch, - ) # logp: (N, 1, 1, vocab_size) - logp = logp.squeeze(1).squeeze(1) # logp: (N, vocab_size) - new_cache = self.predictor.batch_to_cache(new_cache) - - # 3.3 shallow fusion for transducer score - # and ctc score where we can also add the LM score - logp = torch.log( - torch.add(transducer_weight * torch.exp(logp), - ctc_weight * torch.exp(ctc_probs[i].unsqueeze(0)))) - - # 3.4 first beam prune - top_k_logp, top_k_index = logp.topk(beam_size) # (N, N) - scores = torch.add(scores.unsqueeze(1), top_k_logp) - - # 3.5 generate new beam (N*N) - beam_A = [] - for j in range(len(beam_init)): - # update seq - base_seq = beam_init[j] - for t in range(beam_size): - # blank: only update the score - if top_k_index[j, t] == self.blank: - new_seq = Sequence(hyp=base_seq.hyp.copy(), - score=scores[j, t].item(), - cache=base_seq.cache) - - beam_A.append(new_seq) - # other unit: update hyp score statement and last - else: - hyp_new = base_seq.hyp.copy() - hyp_new.append(top_k_index[j, t].item()) - new_seq = Sequence(hyp=hyp_new, - score=scores[j, t].item(), - cache=new_cache[j]) - beam_A.append(new_seq) - - # 3.6 prefix fusion - fusion_A = [beam_A[0]] - for j in range(1, len(beam_A)): - s1 = beam_A[j] - if_do_append = True - for t in range(len(fusion_A)): - # notice: A_ can not fusion with A - if s1.hyp == fusion_A[t].hyp: - fusion_A[t].score = log_add( - [fusion_A[t].score, s1.score]) - if_do_append = False - break - if if_do_append: - fusion_A.append(s1) - - # 4. 
second pruned - fusion_A.sort(key=lambda x: x.score, reverse=True) - beam_init = fusion_A[:beam_size] - - return beam_init, encoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transducer/transducer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transducer/transducer.py deleted file mode 100644 index 821a0946e621353a18bededbd93a658e83b0e0e2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transducer/transducer.py +++ /dev/null @@ -1,453 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchaudio -from torch import nn -from torch.nn.utils.rnn import pad_sequence -from typeguard import check_argument_types -from wenet.transducer.predictor import PredictorBase -from wenet.transducer.search.greedy_search import basic_greedy_search -from wenet.transducer.search.prefix_beam_search import PrefixBeamSearch -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_blank, add_sos_eos, - reverse_pad_list) - - -class Transducer(ASRModel): - """Transducer-ctc-attention hybrid Encoder-Predictor-Decoder model""" - - def __init__( - self, - vocab_size: int, - blank: int, - encoder: nn.Module, - predictor: PredictorBase, - joint: nn.Module, - attention_decoder: Optional[Union[TransformerDecoder, - BiTransformerDecoder]] = None, - ctc: Optional[CTC] = None, - ctc_weight: float = 0, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - transducer_weight: float = 1.0, - attention_weight: float = 0.0, - ) -> None: - assert check_argument_types() - assert attention_weight + ctc_weight + transducer_weight == 1.0 - super().__init__(vocab_size, encoder, attention_decoder, ctc, - ctc_weight, ignore_id, reverse_weight, lsm_weight, - length_normalized_loss) - - self.blank = blank - self.transducer_weight = transducer_weight - self.attention_decoder_weight = 1 - self.transducer_weight - self.ctc_weight - - self.predictor = predictor - self.joint = joint - self.bs = None - - # Note(Mddct): decoder also means predictor in transducer, - # but here decoder is attention decoder - del self.criterion_att - if attention_decoder is not None: - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + predictor + joint + loss - - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - - # Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - # predictor - ys_in_pad = add_blank(text, self.blank, self.ignore_id) - predictor_out = self.predictor(ys_in_pad) - # joint - joint_out = self.joint(encoder_out, predictor_out) - # NOTE(Mddct): some loss implementation require pad valid is zero - # torch.int32 rnnt_loss required - rnnt_text = text.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - rnnt_text_lengths = text_lengths.to(torch.int32) - encoder_out_lens = encoder_out_lens.to(torch.int32) - loss = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - encoder_out_lens, - rnnt_text_lengths, - blank=self.blank, - reduction="mean") - loss_rnnt = loss - - loss = self.transducer_weight * loss - # optional attention decoder - loss_att: Optional[torch.Tensor] = None - if self.attention_decoder_weight != 0.0 and self.decoder is not None: - loss_att, _ = self._calc_att_loss(encoder_out, encoder_mask, text, - text_lengths) - - # optional ctc - loss_ctc: Optional[torch.Tensor] = None - if self.ctc_weight != 0.0 and self.ctc is not None: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is not None: - loss = loss + self.ctc_weight * loss_ctc.sum() - if loss_att is not None: - loss = loss + self.attention_decoder_weight * loss_att.sum() - # NOTE: 'loss' must be in dict - return { - 'loss': loss, - 'loss_att': loss_att, - 'loss_ctc': loss_ctc, - 'loss_rnnt': loss_rnnt, - } - - def init_bs(self): - if self.bs is None: - self.bs = PrefixBeamSearch(self.encoder, self.predictor, - self.joint, self.ctc, self.blank) - - def _cal_transducer_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_lens: torch.Tensor, - hyps_pad: torch.Tensor, - ): - # ignore id -> blank, add blank at head - hyps_pad_blank = add_blank(hyps_pad, self.blank, self.ignore_id) - xs_in_lens = encoder_mask.squeeze(1).sum(1).int() - - # 1. Forward predictor - predictor_out = self.predictor(hyps_pad_blank) - # 2. Forward joint - joint_out = self.joint(encoder_out, predictor_out) - rnnt_text = hyps_pad.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - # 3. 
Compute transducer loss - loss_td = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - xs_in_lens, - hyps_lens.int(), - blank=self.blank, - reduction='none') - return loss_td * -1 - - def _cal_attn_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_pad: torch.Tensor, - hyps_lens: torch.Tensor, - ): - # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - - # td_score = loss_td * -1 - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - self.reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - return decoder_out, r_decoder_out - - def beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7, - ): - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight in transducer - prefix beam search. - final_prob = ctc_weight * ctc_prob + transducer_weight * transducer_prob - transducer_weight (float): transducer probability weight in - prefix beam search - Returns: - List[List[int]]: best path result - - """ - self.init_bs() - beam, _ = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size, - beam_size, - num_decoding_left_chunks, - simulate_streaming, - ctc_weight, - transducer_weight, - ) - return beam[0].hyp[1:], beam[0].score - - def transducer_attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ctc_weight: float = 0.0, - attn_weight: float = 0.0, - transducer_weight: float = 0.0, - search_ctc_weight: float = 1.0, - search_transducer_weight: float = 0.0, - beam_search_type: str = 'transducer') -> List[List[int]]: - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight using in rescoring. - rescore_prob = ctc_weight * ctc_prob + - transducer_weight * (transducer_loss * -1) + - attn_weight * attn_prob - attn_weight (float): attn probability weight using in rescoring. - transducer_weight (float): transducer probability weight using in - rescoring - search_ctc_weight (float): ctc weight using - in rnnt beam search (seeing in self.beam_search) - search_transducer_weight (float): transducer weight using - in rnnt beam search (seeing in self.beam_search) - Returns: - List[List[int]]: best path result - - """ - - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - self.init_bs() - if beam_search_type == 'transducer': - beam, encoder_out = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - beam_size=beam_size, - num_decoding_left_chunks=num_decoding_left_chunks, - ctc_weight=search_ctc_weight, - transducer_weight=search_transducer_weight, - ) - beam_score = [s.score for s in beam] - hyps = [s.hyp[1:] for s in beam] - - elif beam_search_type == 'ctc': - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, - speech_lengths, - beam_size=beam_size, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) - beam_score = [hyp[1] for hyp in hyps] - hyps = [hyp[0] for hyp in hyps] - assert len(hyps) == beam_size - - # build hyps and encoder output - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - - # 2.1 calculate transducer score - td_score = self._cal_transducer_score( - encoder_out, - encoder_mask, - hyps_lens, - hyps_pad, - ) - # 2.2 calculate attention score - decoder_out, r_decoder_out = self._cal_attn_score( - encoder_out, - encoder_mask, - hyps_pad, - hyps_lens, - ) - - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp)][self.eos] - td_s = td_score[i] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp): - r_score += r_decoder_out[i][len(hyp) - j - 1][w] - r_score += r_decoder_out[i][len(hyp)][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score = score * attn_weight + \ - beam_score[i] * ctc_weight + \ - td_s * transducer_weight - if score > best_score: - best_score = score - best_index = i - - return hyps[best_index], best_score - - def greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, 
- num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - n_steps: int = 64, - ) -> List[List[int]]: - """ greedy search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - # TODO(Mddct): batch decode - assert speech.size(0) == 1 - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - # TODO(Mddct): forward chunk by chunk - _ = simulate_streaming - # Let's assume B = batch_size - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size, - num_decoding_left_chunks, - ) - encoder_out_lens = encoder_mask.squeeze(1).sum() - hyps = basic_greedy_search(self, - encoder_out, - encoder_out_lens, - n_steps=n_steps) - - return hyps - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def forward_predictor_step( - self, xs: torch.Tensor, cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - assert len(cache) == 2 - # fake padding - padding = torch.zeros(1, 1) - return self.predictor.forward_step(xs, padding, cache) - - @torch.jit.export - def forward_joint_step(self, enc_out: torch.Tensor, - pred_out: torch.Tensor) -> torch.Tensor: - return self.joint(enc_out, pred_out) - - @torch.jit.export - def forward_predictor_init_state(self) -> List[torch.Tensor]: - return self.predictor.init_state(1, device=torch.device("cpu")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/asr_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/asr_model.py deleted file mode 100644 index 4288f68472d63ce4bf270c5f377d62fa7408713e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/asr_model.py +++ /dev/null @@ -1,904 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -from collections import defaultdict -from typing import Dict, List, Optional, Tuple - -import torch - -from torch.nn.utils.rnn import pad_sequence - -try: - import k2 - from icefall.utils import get_texts - from icefall.decode import get_lattice, Nbest, one_best_decoding -except ImportError: - print('Failed to import k2 and icefall. \ - Notice that they are necessary for hlg_onebest and hlg_rescore') - -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import TransformerEncoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, - remove_duplicates_and_blank, th_accuracy, - reverse_pad_list) -from wenet.utils.mask import (make_pad_mask, mask_finished_preds, - mask_finished_scores, subsequent_mask) - - -class ASRModel(torch.nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - def __init__( - self, - vocab_size: int, - encoder: TransformerEncoder, - decoder: TransformerDecoder, - ctc: CTC, - ctc_weight: float = 0.5, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - ): - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - - self.encoder = encoder - self.decoder = decoder - self.ctc = ctc - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - # 1. Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # 2a. Attention-decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths) - else: - loss_att = None - - # 2b. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is None: - loss = loss_att - elif loss_att is None: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + (1 - - self.ctc_weight) * loss_att - return {"loss": loss, "loss_att": loss_att, "loss_ctc": loss_ctc} - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, float]: - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # reverse the seq, used for right to left decoder - r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) - r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, - self.ignore_id) - # 1. Forward decoder - decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, - ys_in_pad, ys_in_lens, - r_ys_in_pad, - self.reverse_weight) - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - r_loss_att = torch.tensor(0.0) - if self.reverse_weight > 0.0: - r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * ( - 1 - self.reverse_weight) + r_loss_att * self.reverse_weight - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - return loss_att, acc_att - - def _forward_encoder( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Let's assume B = batch_size - # 1. Encoder - if simulate_streaming and decoding_chunk_size > 0: - encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( - speech, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - else: - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - return encoder_out, encoder_mask - - def recognize( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int = 10, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> torch.Tensor: - """ Apply beam search on attention decoder - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - torch.Tensor: decoding result, (batch, max_result_len) - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = torch.ones([running_size, 1], dtype=torch.long, - device=device).fill_(self.sos) # (B*N, 1) - scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), - dtype=torch.float) - scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) - cache: Optional[List[torch.Tensor]] = None - # 2. Decoder forward step by step - for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: - break - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) - logp, cache = self.decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - # 2.3 Second beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - # Update cache to be consistent with new topk scores / hyps - cache_index = (offset_k_index // beam_size).view(-1) # (B*N) - base_cache_index = (torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) * beam_size).view(-1) # (B*N) - cache_index = base_cache_index + cache_index - cache = [torch.index_select(c, dim=0, index=cache_index) for c in cache] - scores = scores.view(-1, 1) # (B*N, 1) - # 2.4. Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = torch.index_select(top_k_index.view(-1), - dim=-1, - index=best_k_index) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = torch.index_select( - hyps, dim=0, index=best_hyps_index) # (B*N, i) - hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) - - # 3. 
Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_scores, best_index = scores.max(dim=-1) - best_hyps_index = best_index + torch.arange( - batch_size, dtype=torch.long, device=device) * beam_size - best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) - best_hyps = best_hyps[:, 1:] - return best_hyps, best_scores - - def ctc_greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[List[int]]: - """ Apply CTC greedy search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # Let's assume B = batch_size - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - return hyps, scores - - def _ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[List[List[int]], torch.Tensor]: - """ CTC prefix beam search inner implementation - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[List[int]]: nbest results - torch.Tensor: encoder output, (1, max_len, encoder_dim), - it will be used for rescoring in attention rescoring mode - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # For CTC prefix beam search, we only support batch_size=1 - assert batch_size == 1 - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder forward and get CTC score - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - ctc_probs = ctc_probs.squeeze(0) - # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) - cur_hyps = [(tuple(), (0.0, -float('inf')))] - # 2. CTC beam search step by step - for t in range(0, maxlen): - logp = ctc_probs[t] # (vocab_size,) - # key: prefix, value (pb, pnb), default value(-inf, -inf) - next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) - # 2.1 First beam prune: select topk best - top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) - for s in top_k_index: - s = s.item() - ps = logp[s].item() - for prefix, (pb, pnb) in cur_hyps: - last = prefix[-1] if len(prefix) > 0 else None - if s == 0: # blank - n_pb, n_pnb = next_hyps[prefix] - n_pb = log_add([n_pb, pb + ps, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - elif s == last: - # Update *ss -> *s; - n_pb, n_pnb = next_hyps[prefix] - n_pnb = log_add([n_pnb, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - # Update *s-s -> *ss, - is for blank - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - else: - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - - # 2.2 Second beam prune - next_hyps = sorted(next_hyps.items(), - key=lambda x: log_add(list(x[1])), - reverse=True) - cur_hyps = next_hyps[:beam_size] - hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] - return hyps, encoder_out - - def ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[int]: - """ Apply CTC prefix beam search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[int]: CTC prefix beam search nbest results - """ - hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, - beam_size, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) - return hyps[0] - - def attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - ctc_weight: float = 0.0, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ) -> List[int]: - """ Apply attention rescoring decoding, CTC prefix beam search - is applied first to get nbest, then we resoring the nbest on - attention decoder with corresponding encoder out - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - reverse_weight (float): right to left decoder weight - ctc_weight (float): ctc score weight - - Returns: - List[int]: Attention rescoring result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, speech_lengths, beam_size, decoding_chunk_size, - num_decoding_left_chunks, simulate_streaming) - - assert len(hyps) == beam_size - hyps_pad = pad_sequence([ - torch.tensor(hyp[0], device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
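The rescoring loop that follows sums the attention decoder's log-probabilities over each n-best hypothesis (plus the end-of-sentence term), optionally blends in the right-to-left decoder via `reverse_weight`, and finally adds the CTC prefix-beam score scaled by `ctc_weight`. A minimal standalone sketch of that score fusion, assuming the inputs are already-computed log-prob arrays (the function and argument names here are illustrative, not part of the wenet API):

```python
def fuse_scores(hyps, decoder_out, r_decoder_out, ctc_scores,
                eos: int, ctc_weight: float = 0.5, reverse_weight: float = 0.0):
    """Pick the best hypothesis after attention rescoring.

    hyps:          list of token-id lists (one per beam entry)
    decoder_out:   (beam, max_len + 1, vocab) log-probs, left-to-right decoder
    r_decoder_out: (beam, max_len + 1, vocab) log-probs, right-to-left decoder
    ctc_scores:    (beam,) scores from CTC prefix beam search
    """
    best_score, best_index = -float('inf'), 0
    for i, hyp in enumerate(hyps):
        # left-to-right decoder score: per-token log-probs plus the eos term
        score = sum(decoder_out[i][j][w] for j, w in enumerate(hyp))
        score += decoder_out[i][len(hyp)][eos]
        if reverse_weight > 0:
            # right-to-left decoder reads the hypothesis backwards
            r_score = sum(r_decoder_out[i][len(hyp) - j - 1][w]
                          for j, w in enumerate(hyp))
            r_score += r_decoder_out[i][len(hyp)][eos]
            score = score * (1 - reverse_weight) + r_score * reverse_weight
        score += ctc_scores[i] * ctc_weight
        if score > best_score:
            best_score, best_index = score, i
    return hyps[best_index], best_score
```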
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp[0]): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp[0])][self.eos] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp[0]): - r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] - r_score += r_decoder_out[i][len(hyp[0])][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score += hyp[1] * ctc_weight - if score > best_score: - best_score = score - best_index = i - return hyps[best_index][0], best_score - - def load_hlg_resource_if_necessary(self, hlg, word): - if not hasattr(self, 'hlg'): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.hlg = k2.Fsa.from_dict(torch.load(hlg, map_location=device)) - if not hasattr(self.hlg, "lm_scores"): - self.hlg.lm_scores = self.hlg.scores.clone() - if not hasattr(self, 'word_table'): - self.word_table = {} - with open(word, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - self.word_table[int(arr[1])] = arr[0] - - @torch.no_grad() - def hlg_onebest( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - best_path = one_best_decoding(lattice=lattice, use_double_scores=True) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.no_grad() - def hlg_rescore( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - lm_scale: float = 0, - decoder_scale: float = 0, - r_decoder_scale: float = 0, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - device = speech.device - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - 
search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=100, - use_double_scores=True, - nbest_scale=0.5,) - nbest = nbest.intersect(lattice) - assert hasattr(nbest.fsa, "lm_scores") - assert hasattr(nbest.fsa, "tokens") - assert isinstance(nbest.fsa.tokens, torch.Tensor) - - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.tokens) - tokens = tokens.remove_values_leq(0) - hyps = tokens.tolist() - - # cal attention_score - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out_repeat = [] - tot_scores = nbest.tot_scores() - repeats = [tot_scores[i].shape[0] for i in range(tot_scores.dim0)] - for i in range(len(encoder_out)): - encoder_out_repeat.append(encoder_out[i: i + 1].repeat(repeats[i], 1, 1)) - encoder_out = torch.concat(encoder_out_repeat, dim=0) - encoder_mask = torch.ones(encoder_out.size(0), - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - reverse_weight = 0.5 - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
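Further down, the per-path total is a weighted sum of the lattice acoustic score, the n-gram LM score, and the two attention-decoder scores. A toy illustration of that combination for a three-path n-best list (all numbers below are made up purely for the example):

```python
import torch

am_scores        = torch.tensor([-12.0, -13.5, -11.8])  # acoustic / lattice scores
ngram_lm_scores  = torch.tensor([ -4.2,  -3.9,  -5.1])  # n-gram LM scores
decoder_scores   = torch.tensor([ -6.0,  -7.2,  -6.4])  # left-to-right decoder scores
r_decoder_scores = torch.tensor([ -6.1,  -7.0,  -6.6])  # right-to-left decoder scores

lm_scale, decoder_scale, r_decoder_scale = 0.5, 0.3, 0.2
tot_scores = (am_scores
              + lm_scale * ngram_lm_scores
              + decoder_scale * decoder_scores
              + r_decoder_scale * r_decoder_scores)
print(int(tot_scores.argmax()))  # index of the best path
```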
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out - - decoder_scores = torch.tensor([sum([decoder_out[i, j, hyps[i][j]] - for j in range(len(hyps[i]))]) - for i in range(len(hyps))], device=device) - r_decoder_scores = [] - for i in range(len(hyps)): - score = 0 - for j in range(len(hyps[i])): - score += r_decoder_out[i, len(hyps[i]) - j - 1, hyps[i][j]] - score += r_decoder_out[i, len(hyps[i]), self.eos] - r_decoder_scores.append(score) - r_decoder_scores = torch.tensor(r_decoder_scores, device=device) - - am_scores = nbest.compute_am_scores() - ngram_lm_scores = nbest.compute_lm_scores() - tot_scores = am_scores.values + lm_scale * ngram_lm_scores.values + \ - decoder_scale * decoder_scores + r_decoder_scale * r_decoder_scores - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.jit.export - def subsampling_rate(self) -> int: - """ Export interface for c++ call, return subsampling_rate of the - model - """ - return self.encoder.embed.subsampling_rate - - @torch.jit.export - def right_context(self) -> int: - """ Export interface for c++ call, return right_context of the model - """ - return self.encoder.embed.right_context - - @torch.jit.export - def sos_symbol(self) -> int: - """ Export interface for c++ call, return sos symbol id of the model - """ - return self.sos - - @torch.jit.export - def eos_symbol(self) -> int: - """ Export interface for c++ call, return eos symbol id of the model - """ - return self.eos - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, give input chunk xs, and return - output from time 0 to current chunk. - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor: - """ Export interface for c++ call, apply linear transform and log - softmax before ctc - Args: - xs (torch.Tensor): encoder output - - Returns: - torch.Tensor: activation before ctc - - """ - return self.ctc.log_softmax(xs) - - @torch.jit.export - def is_bidirectional_decoder(self) -> bool: - """ - Returns: - torch.Tensor: decoder output - """ - if hasattr(self.decoder, 'right_decoder'): - return True - else: - return False - - @torch.jit.export - def forward_attention_decoder( - self, - hyps: torch.Tensor, - hyps_lens: torch.Tensor, - encoder_out: torch.Tensor, - reverse_weight: float = 0, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, forward decoder with multiple - hypothesis from ctc prefix beam search and one encoder output - Args: - hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad sos at the begining - hyps_lens (torch.Tensor): length of each hyp in hyps - encoder_out (torch.Tensor): corresponding encoder output - r_hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad eos at the begining which is used fo right to left decoder - reverse_weight: used for verfing whether used right to left decoder, - > 0 will use. - - Returns: - torch.Tensor: decoder output - """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps - encoder_out = encoder_out.repeat(num_hyps, 1, 1) - encoder_mask = torch.ones(num_hyps, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=encoder_out.device) - - # input for right to left decoder - # this hyps_lens has count token, we need minus it. - r_hyps_lens = hyps_lens - 1 - # this hyps has included token, so it should be - # convert the original hyps. - r_hyps = hyps[:, 1:] - # >>> r_hyps - # >>> tensor([[ 1, 2, 3], - # >>> [ 9, 8, 4], - # >>> [ 2, -1, -1]]) - # >>> r_hyps_lens - # >>> tensor([3, 3, 1]) - - # NOTE(Mddct): `pad_sequence` is not supported by ONNX, it is used - # in `reverse_pad_list` thus we have to refine the below code. 
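The gather-based reversal implemented below avoids `pad_sequence`/`reverse_pad_list` so the decoder remains exportable to ONNX. The same trick, extracted into a small self-contained function and exercised with the toy values from the inline comments (the eos id used here is illustrative):

```python
import torch

def reverse_padded(r_hyps: torch.Tensor, r_hyps_lens: torch.Tensor, eos: int) -> torch.Tensor:
    """Reverse each right-padded hypothesis without pad_sequence."""
    max_len = torch.max(r_hyps_lens)
    index_range = torch.arange(0, max_len, 1)
    seq_len_expand = r_hyps_lens.unsqueeze(1)
    seq_mask = seq_len_expand > index_range        # True on valid positions
    index = (seq_len_expand - 1) - index_range     # reversed positions (may go negative)
    index = index * seq_mask                       # padding positions clamped to 0
    reversed_hyps = torch.gather(r_hyps, 1, index)
    return torch.where(seq_mask, reversed_hyps, torch.tensor(eos))

r_hyps = torch.tensor([[1, 2, 3], [9, 8, 4], [2, -1, -1]])
r_hyps_lens = torch.tensor([3, 3, 1])
print(reverse_padded(r_hyps, r_hyps_lens, eos=5000))
# tensor([[   3,    2,    1],
#         [   4,    8,    9],
#         [   2, 5000, 5000]])
```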
- # Issue: https://github.com/wenet-e2e/wenet/issues/1113 - # Equal to: - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = torch.max(r_hyps_lens) - index_range = torch.arange(0, max_len, 1).to(encoder_out.device) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - # >>> seq_mask - # >>> tensor([[ True, True, True], - # >>> [ True, True, True], - # >>> [ True, False, False]]) - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - r_hyps = torch.gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = torch.where(seq_mask, r_hyps, self.eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - r_hyps = torch.cat([hyps[:, 0:1], r_hyps], dim=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps, hyps_lens, r_hyps, - reverse_weight) # (num_hyps, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - - # right to left decoder may be not used during decoding process, - # which depends on reverse_weight param. - # r_dccoder_out will be 0.0, if reverse_weight is 0.0 - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - return decoder_out, r_decoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/attention.py deleted file mode 100644 index 6ee5e313edf2e88a844ce004c0f819b0bd3260f6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/attention.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple - -import torch -from torch import nn - - -class MultiHeadedAttention(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, n_head: int, n_feat: int, dropout_rate: float): - """Construct an MultiHeadedAttention object.""" - super().__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward_qkv( - self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor, size - (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor, size - (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor, size - (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. 
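The cache described for this forward pass packs keys and values along the last dimension, and for the first chunk its time axis may be zero-length; concatenating it with the new keys and values still works, which is what lets the ONNX export always take the same code path. A quick standalone check of that behavior with toy shapes:

```python
import torch

head, d_k, chunk = 4, 16, 8
cache = torch.zeros(1, head, 0, d_k * 2)   # empty cache fed for the first chunk
k_new = torch.randn(1, head, chunk, d_k)
v_new = torch.randn(1, head, chunk, d_k)

# Unpack the cache into key/value halves, extend them, and re-pack.
key_cache, value_cache = torch.split(cache, cache.size(-1) // 2, dim=-1)
k = torch.cat([key_cache, k_new], dim=2)   # (1, head, cache_t + chunk, d_k)
v = torch.cat([value_cache, v_new], dim=2)
new_cache = torch.cat((k, v), dim=-1)
print(new_cache.shape)                     # torch.Size([1, 4, 8, 32])
```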
- - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - 1.When applying cross attention between decoder and encoder, - the batch padding mask for input is in (#batch, 1, T) shape. - 2.When applying self attention of encoder, - the mask is in (#batch, T, T) shape. - 3.When applying self attention of decoder, - the mask is in (#batch, L, L) shape. - 4.If the different position in decoder see different block - of the encoder, such as Mocha, the passed in mask could be - in (#batch, L, T) shape. But there is no such case in current - Wenet. - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - """ - q, k, v = self.forward_qkv(query, key, value) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. - new_cache = torch.cat((k, v), dim=-1) - - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - return self.forward_attention(v, scores, mask), new_cache - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). 
- zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
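The `NOTE(xcsong)` comments above lean on concatenating and splitting zero-sized cache tensors being safe no-ops; a short self-contained check of that behaviour with assumed shapes:

```python
import torch

empty_cache = torch.zeros((1, 2, 0, 4))      # (batch, head, cache_t=0, d_k * 2)
new_kv = torch.ones((1, 2, 3, 4))            # (batch, head, time, d_k * 2)

merged = torch.cat((empty_cache, new_kv), dim=2)
assert torch.equal(merged, new_kv)           # concatenating an empty cache changes nothing

key_cache, value_cache = torch.split(empty_cache, empty_cache.size(-1) // 2, dim=-1)
assert key_cache.shape == value_cache.shape == (1, 2, 0, 2)   # splitting it is also safe
```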
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/cmvn.py deleted file mode 100644 index 3a1e7457fd3788d9a7e031e96517505a65925102..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/cmvn.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -class GlobalCMVN(torch.nn.Module): - def __init__(self, - mean: torch.Tensor, - istd: torch.Tensor, - norm_var: bool = True): - """ - Args: - mean (torch.Tensor): mean stats - istd (torch.Tensor): inverse std, std which is 1.0 / std - """ - super().__init__() - assert mean.shape == istd.shape - self.norm_var = norm_var - # The buffer can be accessed from this module using self.mean - self.register_buffer("mean", mean) - self.register_buffer("istd", istd) - - def forward(self, x: torch.Tensor): - """ - Args: - x (torch.Tensor): (batch, max_len, feat_dim) - - Returns: - (torch.Tensor): normalized feature - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/convolution.py deleted file mode 100644 index 2cf9794e14ea7441ccd30ab52202ac02fb25c2b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/convolution.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). 
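A hedged usage sketch of the module defined above (import path as used by the encoder in this tree; default ReLU activation and batch norm assumed). The block runs pointwise conv, GLU, depthwise conv, norm plus activation, and a final pointwise conv, preserving the (batch, time, channels) shape:

```python
import torch
from wenet.transformer.convolution import ConvolutionModule

conv = ConvolutionModule(channels=256, kernel_size=15)   # non-causal, batch_norm
x = torch.randn(2, 100, 256)                             # (batch, time, channels)
y, new_cache = conv(x)                                   # fake mask_pad/cache defaults
assert y.shape == x.shape                                # time and channels preserved
assert new_cache.numel() == 0                            # no cache for non-causal conv
```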
- """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. - new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/ctc.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/ctc.py deleted file mode 100644 index 3dfcbaa324ffc26afa9ceaeb75007eb312546326..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/ctc.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -import torch -import torch.nn.functional as F -from typeguard import check_argument_types - - -class CTC(torch.nn.Module): - """CTC module""" - def __init__( - self, - odim: int, - encoder_output_size: int, - dropout_rate: float = 0.0, - reduce: bool = True, - ): - """ Construct CTC module - Args: - odim: dimension of outputs - encoder_output_size: number of encoder projection units - dropout_rate: dropout rate (0.0 ~ 1.0) - reduce: reduce the CTC loss into a scalar - """ - assert check_argument_types() - super().__init__() - eprojs = encoder_output_size - self.dropout_rate = dropout_rate - self.ctc_lo = torch.nn.Linear(eprojs, odim) - - reduction_type = "sum" if reduce else "none" - self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) - - def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, - ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: - """Calculate CTC loss. 
- - Args: - hs_pad: batch of padded hidden state sequences (B, Tmax, D) - hlens: batch of lengths of hidden state sequences (B) - ys_pad: batch of padded character id sequence tensor (B, Lmax) - ys_lens: batch of lengths of character sequence (B) - """ - # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) - ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) - # ys_hat: (B, L, D) -> (L, B, D) - ys_hat = ys_hat.transpose(0, 1) - ys_hat = ys_hat.log_softmax(2) - loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) - # Batch-size average - loss = loss / ys_hat.size(1) - return loss - - def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """log_softmax of frame activations - - Args: - Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) - """ - return F.log_softmax(self.ctc_lo(hs_pad), dim=2) - - def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """argmax of frame activations - - Args: - torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: argmax applied 2d tensor (B, Tmax) - """ - return torch.argmax(self.ctc_lo(hs_pad), dim=2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/decoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/decoder.py deleted file mode 100644 index c31853d9e868c99290b8d597f53d9a680202c82c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/decoder.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Decoder definition.""" -from typing import Tuple, List, Optional - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.decoder_layer import DecoderLayer -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.utils.mask import (subsequent_mask, make_pad_mask) - - -class TransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. 
- concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__() - attention_dim = encoder_output_size - - if input_layer == "embed": - self.embed = torch.nn.Sequential( - torch.nn.Embedding(vocab_size, attention_dim), - PositionalEncoding(attention_dim, positional_dropout_rate), - ) - else: - raise ValueError(f"only 'embed' is supported: {input_layer}") - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) - self.use_output_layer = use_output_layer - self.output_layer = torch.nn.Linear(attention_dim, vocab_size) - self.num_blocks = num_blocks - self.decoders = torch.nn.ModuleList([ - DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(self.num_blocks) - ]) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor = torch.empty(0), - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: not used in transformer decoder, in order to unify api - with bidirectional decoder - reverse_weight: not used in transformer decoder, in order to unify - api with bidirectional decode - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - torch.tensor(0.0), in order to unify api with bidirectional decoder - olens: (batch, ) - """ - tgt = ys_in_pad - maxlen = tgt.size(1) - # tgt_mask: (B, 1, L) - tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) - tgt_mask = tgt_mask.to(tgt.device) - # m: (1, L, L) - m = subsequent_mask(tgt_mask.size(-1), - device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m - x, _ = self.embed(tgt) - for layer in self.decoders: - x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, - memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.use_output_layer: - x = self.output_layer(x) - olens = tgt_mask.sum(1) - return x, torch.tensor(0.0), olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - x, _ = self.embed(tgt) - new_cache = [] - for i, decoder in enumerate(self.decoders): - if cache is None: - c = None - else: - c = cache[i] - x, tgt_mask, memory, memory_mask = decoder(x, - tgt_mask, - memory, - memory_mask, - cache=c) - new_cache.append(x) - if self.normalize_before: - y = self.after_norm(x[:, -1]) - else: - y = x[:, -1] - if self.use_output_layer: - y = torch.log_softmax(self.output_layer(y), dim=-1) - return y, new_cache - - -class BiTransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - r_num_blocks: the number of right to left decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - r_num_blocks: int = 0, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - - assert check_argument_types() - super().__init__() - self.left_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - self.right_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - r_num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor, - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), - used for right to left decoder - reverse_weight: used for right to left decoder - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - r_x: x: decoded token score (right to left decoder) - before softmax (batch, maxlen_out, vocab_size) - if use_output_layer is True, - olens: (batch, ) - """ - l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, - ys_in_lens) - r_x = torch.tensor(0.0) - if reverse_weight > 0.0: - r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, - ys_in_lens) - return l_x, r_x, olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - return self.left_decoder.forward_one_step(memory, memory_mask, tgt, - tgt_mask, cache) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/decoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/decoder_layer.py deleted file mode 100644 index 6b52aa6ab730dc51b18f0787e8236ab10c1e9cad..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/decoder_layer.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoder self-attention layer definition.""" -from typing import Optional, Tuple - -import torch -from torch import nn - - -class DecoderLayer(nn.Module): - """Single decoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn (torch.nn.Module): Inter-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. 
- normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's inpu - and output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: nn.Module, - src_attn: nn.Module, - feed_forward: nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an DecoderLayer object.""" - super().__init__() - self.size = size - self.self_attn = self_attn - self.src_attn = src_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.norm3 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear1 = nn.Linear(size + size, size) - self.concat_linear2 = nn.Linear(size + size, size) - else: - self.concat_linear1 = nn.Identity() - self.concat_linear2 = nn.Identity() - - def forward( - self, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - memory: torch.Tensor, - memory_mask: torch.Tensor, - cache: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute decoded features. - - Args: - tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). - tgt_mask (torch.Tensor): Mask for input tensor - (#batch, maxlen_out). - memory (torch.Tensor): Encoded memory - (#batch, maxlen_in, size). - memory_mask (torch.Tensor): Encoded memory mask - (#batch, maxlen_in). - cache (torch.Tensor): cached tensors. - (#batch, maxlen_out - 1, size). - - Returns: - torch.Tensor: Output tensor (#batch, maxlen_out, size). - torch.Tensor: Mask for output tensor (#batch, maxlen_out). - torch.Tensor: Encoded memory (#batch, maxlen_in, size). - torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
- - """ - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - - if cache is None: - tgt_q = tgt - tgt_q_mask = tgt_mask - else: - # compute only the last frame query keeping dim: max_time_out -> 1 - assert cache.shape == ( - tgt.shape[0], - tgt.shape[1] - 1, - self.size, - ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" - tgt_q = tgt[:, -1:, :] - residual = residual[:, -1:, :] - tgt_q_mask = tgt_mask[:, -1:, :] - - if self.concat_after: - tgt_concat = torch.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) - x = residual + self.concat_linear1(tgt_concat) - else: - x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - if self.concat_after: - x_concat = torch.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) - x = residual + self.concat_linear2(x_concat) - else: - x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) - if not self.normalize_before: - x = self.norm2(x) - - residual = x - if self.normalize_before: - x = self.norm3(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm3(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, tgt_mask, memory, memory_mask diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/embedding.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/embedding.py deleted file mode 100644 index 611a927864d93c3ad8357f66c780bf537b2a4d67..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/embedding.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Positonal Encoding Module.""" - -import math -from typing import Tuple, Union - -import torch -import torch.nn.functional as F - -class PositionalEncoding(torch.nn.Module): - """Positional encoding. 
- - :param int d_model: embedding dim - :param float dropout_rate: dropout rate - :param int max_len: maximum input length - - PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) - PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) - """ - def __init__(self, - d_model: int, - dropout_rate: float, - max_len: int = 5000, - reverse: bool = False): - """Construct an PositionalEncoding object.""" - super().__init__() - self.d_model = d_model - self.xscale = math.sqrt(self.d_model) - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - - self.pe = torch.zeros(self.max_len, self.d_model) - position = torch.arange(0, self.max_len, - dtype=torch.float32).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = torch.sin(position * div_term) - self.pe[:, 1::2] = torch.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input. Its shape is (batch, time, ...) - offset (int, torch.tensor): position offset - - Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) - torch.Tensor: for compatibility to RelPositionalEncoding - """ - - self.pe = self.pe.to(x.device) - pos_emb = self.position_encoding(offset, x.size(1), False) - x = x * self.xscale + pos_emb - return self.dropout(x), self.dropout(pos_emb) - - def position_encoding(self, offset: Union[int, torch.Tensor], size: int, - apply_dropout: bool = True) -> torch.Tensor: - """ For getting encoding in a streaming fashion - - Attention!!!!! - we apply dropout only once at the whole utterance level in a none - streaming way, but will call this function several times with - increasing input size in a streaming scenario, so the dropout will - be applied several times. - - Args: - offset (int or torch.tensor): start offset - size (int): required size of position encoding - - Returns: - torch.Tensor: Corresponding encoding - """ - # How to subscript a Union type: - # https://github.com/pytorch/pytorch/issues/69434 - if isinstance(offset, int): - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - else: # for batched streaming decoding on GPU - assert torch.max(offset) + size < self.max_len - index = offset.unsqueeze(1) + \ - torch.arange(0, size).to(offset.device) # B X T - flag = index > 0 - # remove negative offset - index = index * flag - pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model - - if apply_dropout: - pos_emb = self.dropout(pos_emb) - return pos_emb - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding module. - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - """ - def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): - """Initialize class.""" - super().__init__(d_model, dropout_rate, max_len, reverse=True) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Compute positional encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). 
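A quick numeric check, with assumed small dimensions, that the `div_term` construction above realises the stated formula PE(pos, 2i) = sin(pos / 10000^(2i / d_model)):

```python
import math
import torch

d_model, max_len = 8, 16
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32)
                     * -(math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)   # even dims: sine
pe[:, 1::2] = torch.cos(position * div_term)   # odd dims: cosine

pos, i = 5, 1
assert abs(pe[pos, 2 * i].item() - math.sin(pos / 10000 ** (2 * i / d_model))) < 1e-5
```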
- Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Positional embedding tensor (1, time, `*`). - """ - self.pe = self.pe.to(x.device) - x = x * self.xscale - pos_emb = self.position_encoding(offset, x.size(1), False) - return self.dropout(x), self.dropout(pos_emb) - - -class NoPositionalEncoding(torch.nn.Module): - """ No position encoding - """ - def __init__(self, d_model: int, dropout_rate: float): - super().__init__() - self.d_model = d_model - self.dropout = torch.nn.Dropout(p=dropout_rate) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """ Just return zero vector for interface compatibility - """ - pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) - return self.dropout(x), pos_emb - - def position_encoding( - self, offset: Union[int, torch.Tensor], size: int) -> torch.Tensor: - return torch.zeros(1, size, self.d_model) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/encoder.py deleted file mode 100644 index bb2ec65827548bd1242cb3b367cb3983c2de6119..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/encoder.py +++ /dev/null @@ -1,462 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder definition.""" -from typing import Tuple - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.convolution import ConvolutionModule -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.encoder_layer import TransformerEncoderLayer -from wenet.transformer.encoder_layer import ConformerEncoderLayer -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class BaseEncoder(torch.nn.Module): - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ - Args: - input_size (int): input dim - output_size (int): dimension of attention - attention_heads (int): the number of heads of multi head attention - linear_units (int): the hidden units number of position-wise feed - forward - num_blocks (int): the number of decoder blocks - dropout_rate (float): dropout rate - attention_dropout_rate (float): dropout rate in attention - positional_dropout_rate (float): dropout rate after adding - positional encoding - input_layer (str): input layer type. - optional [linear, conv2d, conv2d6, conv2d8] - pos_enc_layer_type (str): Encoder positional encoding layer type. - opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] - normalize_before (bool): - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after (bool): whether to concat attention layer's input - and output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - static_chunk_size (int): chunk size for static chunk training and - decoding - use_dynamic_chunk (bool): whether use dynamic chunk size for - training or not, You can only use fixed chunk(chunk_size > 0) - or dyanmic chunk size(use_dynamic_chunk = True) - global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module - use_dynamic_left_chunk (bool): whether use dynamic left chunk in - dynamic chunk training - """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
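A hedged sketch of the padding-mask convention used in `forward` above; `make_pad_mask` is re-implemented here in simplified form for illustration (the real helper lives in `wenet.utils.mask`), marking padded frames True so the encoder keeps its negation:

```python
import torch

def make_pad_mask(lengths: torch.Tensor, max_len: int) -> torch.Tensor:
    # Simplified for illustration: True marks padded positions.
    steps = torch.arange(max_len, device=lengths.device).unsqueeze(0)   # (1, T)
    return steps >= lengths.unsqueeze(1)                                # (B, T)

xs_lens = torch.tensor([3, 5])
masks = ~make_pad_mask(xs_lens, 5).unsqueeze(1)   # (B, 1, T), True = valid frame
# masks[0]: [[ True,  True,  True, False, False]]
# masks[1]: [[ True,  True,  True,  True,  True]]
```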
- - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks - - -class TransformerEncoder(BaseEncoder): - """Transformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ Construct TransformerEncoder - - See Encoder for the meaning of each parameter. 
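The stride and window arithmetic from `forward_chunk_by_chunk` above, worked through with assumed `Conv2dSubsampling4` values (subsampling rate 4, right context 6):

```python
subsampling = 4                      # assumed Conv2dSubsampling4 rate
context = 6 + 1                      # assumed right_context + the current frame
decoding_chunk_size = 16             # encoder-output frames produced per chunk

stride = subsampling * decoding_chunk_size                            # 64 input frames per step
decoding_window = (decoding_chunk_size - 1) * subsampling + context   # 67 input frames read

num_frames = 300
chunk_starts = list(range(0, num_frames - context + 1, stride))       # [0, 64, 128, 192, 256]
```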
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - self.encoders = torch.nn.ModuleList([ - TransformerEncoderLayer( - output_size, - MultiHeadedAttention(attention_heads, output_size, - attention_dropout_rate), - PositionwiseFeedForward(output_size, linear_units, - dropout_rate), dropout_rate, - normalize_before, concat_after) for _ in range(num_blocks) - ]) - - -class ConformerEncoder(BaseEncoder): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - positionwise_conv_kernel_size: int = 1, - macaron_style: bool = True, - selfattention_layer_type: str = "rel_selfattn", - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - ): - """Construct ConformerEncoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - positionwise_conv_kernel_size (int): Kernel size of positionwise - conv1d layer. - macaron_style (bool): Whether to use macaron style for - positionwise layer. - selfattention_layer_type (str): Encoder attention layer type, - the parameter has no effect now, it's just for configure - compatibility. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, - cnn_module_norm, causal) - - self.encoders = torch.nn.ModuleList([ - ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(num_blocks) - ]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/encoder_layer.py deleted file mode 100644 index 6b4629a6802a90422fa1494f82f46488f2553c16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/encoder_layer.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple - -import torch -from torch import nn - - -class TransformerEncoderLayer(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: torch.nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): just for interface compatibility - to ConformerEncoderLayer - mask_pad (torch.Tensor): does not used in transformer layer, - just for unified api with conformer. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2), not used here, it's for interface - compatibility to ConformerEncoderLayer. - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). - - """ - residual = x - if self.normalize_before: - x = self.norm1(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, cache=att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - return x, mask, new_att_cache, fake_cnn_cache - - -class ConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/label_smoothing_loss.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/label_smoothing_loss.py deleted file mode 100644 index 428fedcb0eb4345cd1361c97008a9afcd94ac171..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/label_smoothing_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Label smoothing module.""" - -import torch -from torch import nn - - -class LabelSmoothingLoss(nn.Module): - """Label-smoothing loss. - - In a standard CE loss, the label's data distribution is: - [0,1,2] -> - [ - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0], - ] - - In the smoothing version CE Loss,some probabilities - are taken from the true label prob (1.0) and are divided - among other labels. - - e.g. 
- smoothing=0.1 - [0,1,2] -> - [ - [0.9, 0.05, 0.05], - [0.05, 0.9, 0.05], - [0.05, 0.05, 0.9], - ] - - Args: - size (int): the number of class - padding_idx (int): padding class id which will be ignored for loss - smoothing (float): smoothing rate (0.0 means the conventional CE) - normalize_length (bool): - normalize loss by sequence length if True - normalize loss by batch size if False - """ - def __init__(self, - size: int, - padding_idx: int, - smoothing: float, - normalize_length: bool = False): - """Construct an LabelSmoothingLoss object.""" - super(LabelSmoothingLoss, self).__init__() - self.criterion = nn.KLDivLoss(reduction="none") - self.padding_idx = padding_idx - self.confidence = 1.0 - smoothing - self.smoothing = smoothing - self.size = size - self.normalize_length = normalize_length - - def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """Compute loss between x and target. - - The model outputs and data labels tensors are flatten to - (batch*seqlen, class) shape and a mask is applied to the - padding part which should not be calculated for loss. - - Args: - x (torch.Tensor): prediction (batch, seqlen, class) - target (torch.Tensor): - target signal masked with self.padding_id (batch, seqlen) - Returns: - loss (torch.Tensor) : The KL loss, scalar float value - """ - assert x.size(2) == self.size - batch_size = x.size(0) - x = x.view(-1, self.size) - target = target.view(-1) - # use zeros_like instead of torch.no_grad() for true_dist, - # since no_grad() can not be exported by JIT - true_dist = torch.zeros_like(x) - true_dist.fill_(self.smoothing / (self.size - 1)) - ignore = target == self.padding_idx # (B,) - total = len(target) - ignore.sum().item() - target = target.masked_fill(ignore, 0) # avoid -1 index - true_dist.scatter_(1, target.unsqueeze(1), self.confidence) - kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) - denom = total if self.normalize_length else batch_size - return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/positionwise_feed_forward.py deleted file mode 100644 index 73ba239e3f1e68f65650961f2c4ee6758729a06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/positionwise_feed_forward.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. 
- dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU()): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/subsampling.py deleted file mode 100644 index 5f2823eedf0e623188d6af6680fa50ca44b47877..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/subsampling.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch - - -class BaseSubsampling(torch.nn.Module): - def __init__(self): - super().__init__() - self.right_context = 0 - self.subsampling_rate = 1 - - def position_encoding(self, offset: Union[int, torch.Tensor], - size: int) -> torch.Tensor: - return self.pos_enc.position_encoding(offset, size) - - -class LinearNoSubsampling(BaseSubsampling): - """Linear transform the input without subsampling - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an linear object.""" - super().__init__() - self.out = torch.nn.Sequential( - torch.nn.Linear(idim, odim), - torch.nn.LayerNorm(odim, eps=1e-5), - torch.nn.Dropout(dropout_rate), - ) - self.pos_enc = pos_enc_class - self.right_context = 0 - self.subsampling_rate = 1 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Input x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: linear input tensor (#batch, time', odim), - where time' = time . - torch.Tensor: linear input mask (#batch, 1, time'), - where time' = time . - - """ - x = self.out(x) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask - - -class Conv2dSubsampling4(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. 
- odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 4. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 4. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] - - -class Conv2dSubsampling6(BaseSubsampling): - """Convolutional 2D subsampling (to 1/6 length). - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (torch.nn.Module): Custom position encoding layer. - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling6 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 5, 3), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), - odim) - self.pos_enc = pos_enc_class - # 10 = (3 - 1) * 1 + (5 - 1) * 2 - self.subsampling_rate = 6 - self.right_context = 10 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 6. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 6. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] - - -class Conv2dSubsampling8(BaseSubsampling): - """Convolutional 2D subsampling (to 1/8 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling8 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear( - odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) - self.pos_enc = pos_enc_class - self.subsampling_rate = 8 - # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 - self.right_context = 14 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 8. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 8. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/swish.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/swish.py deleted file mode 100644 index b4250f5c93104f38958d145572e363256e03fcb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/transformer/swish.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) -# 2020 Northwestern Polytechnical University (Pengcheng Guo) -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Swish() activation function for Conformer.""" - -import torch - - -class Swish(torch.nn.Module): - """Construct an Swish object.""" - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Return Swish activation function.""" - return x * torch.sigmoid(x) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/checkpoint.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/checkpoint.py deleted file mode 100644 index 8e0c413c79c34cd667240357d7ef9eab816a885c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/checkpoint.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import re - -import yaml -import torch -from collections import OrderedDict - -import datetime - - -def load_checkpoint(model: torch.nn.Module, path: str) -> dict: - if torch.cuda.is_available(): - logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) - checkpoint = torch.load(path) - else: - logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) - checkpoint = torch.load(path, map_location='cpu') - model.load_state_dict(checkpoint, strict=False) - info_path = re.sub('.pt$', '.yaml', path) - configs = {} - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - return configs - - -def save_checkpoint(model: torch.nn.Module, path: str, infos=None): - ''' - Args: - infos (dict or None): any info you want to save. - ''' - logging.info('Checkpoint: save to checkpoint %s' % path) - if isinstance(model, torch.nn.DataParallel): - state_dict = model.module.state_dict() - elif isinstance(model, torch.nn.parallel.DistributedDataParallel): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save(state_dict, path) - info_path = re.sub('.pt$', '.yaml', path) - if infos is None: - infos = {} - infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') - with open(info_path, 'w') as fout: - data = yaml.dump(infos) - fout.write(data) - - -def filter_modules(model_state_dict, modules): - new_mods = [] - incorrect_mods = [] - mods_model = model_state_dict.keys() - for mod in modules: - if any(key.startswith(mod) for key in mods_model): - new_mods += [mod] - else: - incorrect_mods += [mod] - if incorrect_mods: - logging.warning( - "module(s) %s don't match or (partially match) " - "available modules in model.", - incorrect_mods, - ) - logging.warning("for information, the existing modules in model are:") - logging.warning("%s", mods_model) - - return new_mods - - -def load_trained_modules(model: torch.nn.Module, args: None): - # Load encoder modules with pre-trained model(s). 
- enc_model_path = args.enc_init - enc_modules = args.enc_init_mods - main_state_dict = model.state_dict() - logging.warning("model(s) found for pre-initialization") - if os.path.isfile(enc_model_path): - logging.info('Checkpoint: loading from checkpoint %s for CPU' % - enc_model_path) - model_state_dict = torch.load(enc_model_path, map_location='cpu') - modules = filter_modules(model_state_dict, enc_modules) - partial_state_dict = OrderedDict() - for key, value in model_state_dict.items(): - if any(key.startswith(m) for m in modules): - partial_state_dict[key] = value - main_state_dict.update(partial_state_dict) - else: - logging.warning("model was not found : %s", enc_model_path) - - model.load_state_dict(main_state_dict) - configs = {} - return configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/cmvn.py deleted file mode 100644 index 3101c619f54991c947124f393f3459c317356a2f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/cmvn.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import math - -import numpy as np - - -def _load_json_cmvn(json_cmvn_file): - """ Load the json format cmvn stats file and calculate cmvn - - Args: - json_cmvn_file: cmvn stats file in json format - - Returns: - a numpy array of [means, vars] - """ - with open(json_cmvn_file) as f: - cmvn_stats = json.load(f) - - means = cmvn_stats['mean_stat'] - variance = cmvn_stats['var_stat'] - count = cmvn_stats['frame_num'] - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def _load_kaldi_cmvn(kaldi_cmvn_file): - """ Load the kaldi format cmvn stats file and calculate cmvn - - Args: - kaldi_cmvn_file: kaldi text style global cmvn file, which - is generated by: - compute-cmvn-stats --binary=false scp:feats.scp global_cmvn - - Returns: - a numpy array of [means, vars] - """ - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def load_cmvn(cmvn_file, is_json): - if is_json: - cmvn = _load_json_cmvn(cmvn_file) - else: - cmvn = _load_kaldi_cmvn(cmvn_file) - return cmvn[0], cmvn[1] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/common.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/common.py deleted file mode 100644 index 74238d59aefbf227fe6b811703af17550bc7f8f0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/common.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -"""Unility functions for Transformer.""" - -import math -from typing import List, Tuple - -import torch -from torch.nn.utils.rnn import pad_sequence - -IGNORE_ID = -1 - - -def pad_list(xs: List[torch.Tensor], pad_value: int): - """Perform padding for the list of tensors. - - Args: - xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. 
- pad_value (float): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tmax, `*`). - - Examples: - >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) - - """ - n_batch = len(xs) - max_len = max([x.size(0) for x in xs]) - pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) - pad = pad.fill_(pad_value) - for i in range(n_batch): - pad[i, :xs[i].size(0)] = xs[i] - - return pad - - -def add_blank(ys_pad: torch.Tensor, blank: int, - ignore_id: int) -> torch.Tensor: - """ Prepad blank for transducer predictor - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - blank (int): index of - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> blank = 0 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in = add_blank(ys_pad, 0, -1) - >>> ys_in - tensor([[0, 1, 2, 3, 4, 5], - [0, 4, 5, 6, 0, 0], - [0, 7, 8, 9, 0, 0]]) - """ - bs = ys_pad.size(0) - _blank = torch.tensor([blank], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _blank = _blank.repeat(bs).unsqueeze(1) # [bs,1] - out = torch.cat([_blank, ys_pad], dim=1) # [bs, Lmax+1] - return torch.where(out == ignore_id, blank, out) - - -def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, - ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Add and labels. - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - sos (int): index of - eos (int): index of - ignore_id (int): index of padding - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - ys_out (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> sos_id = 10 - >>> eos_id = 11 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) - >>> ys_in - tensor([[10, 1, 2, 3, 4, 5], - [10, 4, 5, 6, 11, 11], - [10, 7, 8, 9, 11, 11]]) - >>> ys_out - tensor([[ 1, 2, 3, 4, 5, 11], - [ 4, 5, 6, 11, -1, -1], - [ 7, 8, 9, 11, -1, -1]]) - """ - _sos = torch.tensor([sos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _eos = torch.tensor([eos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys - ys_in = [torch.cat([_sos, y], dim=0) for y in ys] - ys_out = [torch.cat([y, _eos], dim=0) for y in ys] - return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) - - -def reverse_pad_list(ys_pad: torch.Tensor, - ys_lens: torch.Tensor, - pad_value: float = -1.0) -> torch.Tensor: - """Reverse padding for the list of tensors. - - Args: - ys_pad (tensor): The padded tensor (B, Tokenmax). - ys_lens (tensor): The lens of token seqs (B) - pad_value (int): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tokenmax). - - Examples: - >>> x - tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) - >>> pad_list(x, 0) - tensor([[4, 3, 2, 1], - [7, 6, 5, 0], - [9, 8, 0, 0]]) - - """ - r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) - for y, i in zip(ys_pad, ys_lens)], True, - pad_value) - return r_ys_pad - - -def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, - ignore_label: int) -> float: - """Calculate accuracy. - - Args: - pad_outputs (Tensor): Prediction tensors (B * Lmax, D). 
- pad_targets (LongTensor): Target label tensors (B, Lmax). - ignore_label (int): Ignore label id. - - Returns: - float: Accuracy value (0.0 - 1.0). - - """ - pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), - pad_outputs.size(1)).argmax(2) - mask = pad_targets != ignore_label - numerator = torch.sum( - pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - denominator = torch.sum(mask) - return float(numerator) / float(denominator) - - -def get_rnn(rnn_type: str) -> torch.nn.Module: - assert rnn_type in ["rnn", "lstm", "gru"] - if rnn_type == "rnn": - return torch.nn.RNN - elif rnn_type == "lstm": - return torch.nn.LSTM - else: - return torch.nn.GRU - - -def get_activation(act): - """Return activation function.""" - # Lazy load to avoid unused import - from wenet.transformer.swish import Swish - - activation_funcs = { - "hardtanh": torch.nn.Hardtanh, - "tanh": torch.nn.Tanh, - "relu": torch.nn.ReLU, - "selu": torch.nn.SELU, - "swish": getattr(torch.nn, "SiLU", Swish), - "gelu": torch.nn.GELU - } - - return activation_funcs[act]() - - -def get_subsample(config): - input_layer = config["encoder_conf"]["input_layer"] - assert input_layer in ["conv2d", "conv2d6", "conv2d8"] - if input_layer == "conv2d": - return 4 - elif input_layer == "conv2d6": - return 6 - elif input_layer == "conv2d8": - return 8 - - -def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - if hyp[cur] != 0: - new_hyp.append(hyp[cur]) - prev = cur - while cur < len(hyp) and hyp[cur] == hyp[prev]: - cur += 1 - return new_hyp - - -def replace_duplicates_with_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - new_hyp.append(hyp[cur]) - prev = cur - cur += 1 - while cur < len(hyp) and hyp[cur] == hyp[prev] and hyp[cur] != 0: - new_hyp.append(0) - cur += 1 - return new_hyp - - -def log_add(args: List[int]) -> float: - """ - Stable log add - """ - if all(a == -float('inf') for a in args): - return -float('inf') - a_max = max(args) - lsp = math.log(sum(math.exp(a - a_max) for a in args)) - return a_max + lsp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/config.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/config.py deleted file mode 100644 index 50170ced44534d3ee6532a2f87fcd78c5148f7e7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 Shaoshang Qi -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import copy - -def override_config(configs, override_list): - new_configs = copy.deepcopy(configs) - for item in override_list: - arr = item.split() - if len(arr) != 2: - print(f"the overrive {item} format not correct, skip it") - continue - keys = arr[0].split('.') - s_configs = new_configs - for i, key in enumerate(keys): - if key not in s_configs: - print(f"the overrive {item} format not correct, skip it") - if i == len(keys) - 1: - param_type = type(s_configs[key]) - if param_type != bool: - s_configs[key] = param_type(arr[1]) - else: - s_configs[key] = arr[1] in ['true', 'True'] - print(f"override {arr[0]} with {arr[1]}") - else: - s_configs = s_configs[key] - return new_configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/ctc_util.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/ctc_util.py deleted file mode 100644 index 73b8fb272ac153dd6d05207f352ebcf1ad14890d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/ctc_util.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch - -def insert_blank(label, blank_id=0): - """Insert blank token between every two label token.""" - label = np.expand_dims(label, 1) - blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id - label = np.concatenate([blanks, label], axis=1) - label = label.reshape(-1) - label = np.append(label, label[0]) - return label - -def forced_align(ctc_probs: torch.Tensor, - y: torch.Tensor, - blank_id=0) -> list: - """ctc forced alignment. 
- - Args: - torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) - torch.Tensor y: id sequence tensor 1d tensor (L) - int blank_id: blank symbol index - Returns: - torch.Tensor: alignment result - """ - y_insert_blank = insert_blank(y, blank_id) - - log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) - log_alpha = log_alpha - float('inf') # log of zero - state_path = (torch.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 - ) # state path - - # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] - - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): - if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ - s] == y_insert_blank[s - 2]: - candidates = torch.tensor( - [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) - prev_state = [s, s - 1] - else: - candidates = torch.tensor([ - log_alpha[t - 1, s], - log_alpha[t - 1, s - 1], - log_alpha[t - 1, s - 2], - ]) - prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] - state_path[t, s] = prev_state[torch.argmax(candidates)] - - state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) - - candidates = torch.tensor([ - log_alpha[-1, len(y_insert_blank) - 1], - log_alpha[-1, len(y_insert_blank) - 2] - ]) - prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] - state_seq[-1] = prev_state[torch.argmax(candidates)] - for t in range(ctc_probs.size(0) - 2, -1, -1): - state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] - - output_alignment = [] - for t in range(0, ctc_probs.size(0)): - output_alignment.append(y_insert_blank[state_seq[t, 0]]) - - return output_alignment diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/executor.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/executor.py deleted file mode 100644 index dc0b69e6e32055566a0e8c41945f6979276e5672..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/executor.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -from contextlib import nullcontext - -# if your python version < 3.7 use the below one -# from contextlib import suppress as nullcontext -import torch -from torch.nn.utils import clip_grad_norm_ - - -class Executor: - - def __init__(self): - self.step = 0 - - def train(self, model, optimizer, scheduler, data_loader, device, writer, - args, scaler): - ''' Train one epoch - ''' - model.train() - clip = args.get('grad_clip', 50.0) - log_interval = args.get('log_interval', 10) - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - accum_grad = args.get('accum_grad', 1) - is_distributed = args.get('is_distributed', True) - use_amp = args.get('use_amp', False) - logging.info('using accumulate grad, new batch size is {} times' - ' larger than before'.format(accum_grad)) - if use_amp: - assert scaler is not None - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - model_context = model.join - else: - model_context = nullcontext - num_seen_utts = 0 - with model_context(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - context = None - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if is_distributed and batch_idx % accum_grad != 0: - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - with context(): - # autocast context - # The more details about amp can be found in - # https://pytorch.org/docs/stable/notes/amp_examples.html - with torch.cuda.amp.autocast(scaler is not None): - loss_dict = model(feats, feats_lengths, target, - target_lengths) - loss = loss_dict['loss'] / accum_grad - if use_amp: - scaler.scale(loss).backward() - else: - loss.backward() - - num_seen_utts += num_utts - if batch_idx % accum_grad == 0: - if rank == 0 and writer is not None: - writer.add_scalar('train_loss', loss, self.step) - # Use mixed precision training - if use_amp: - scaler.unscale_(optimizer) - grad_norm = clip_grad_norm_(model.parameters(), clip) - # Must invoke scaler.update() if unscale_() is used in - # the iteration to avoid the following error: - # RuntimeError: unscale_() has already been called - # on this optimizer since the last update(). - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). 
- scaler.step(optimizer) - scaler.update() - else: - grad_norm = clip_grad_norm_(model.parameters(), clip) - if torch.isfinite(grad_norm): - optimizer.step() - optimizer.zero_grad() - scheduler.step() - self.step += 1 - if batch_idx % log_interval == 0: - lr = optimizer.param_groups[0]['lr'] - log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, - loss.item() * accum_grad) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'lr {:.8f} rank {}'.format(lr, rank) - logging.debug(log_str) - - def cv(self, model, data_loader, device, args): - ''' Cross validation on - ''' - model.eval() - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - log_interval = args.get('log_interval', 10) - # in order to avoid division by 0 - num_seen_utts = 1 - total_loss = 0.0 - with torch.no_grad(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - loss_dict = model(feats, feats_lengths, target, target_lengths) - loss = loss_dict['loss'] - if torch.isfinite(loss): - num_seen_utts += num_utts - total_loss += loss.item() * num_utts - if batch_idx % log_interval == 0: - log_str = 'CV Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, loss.item()) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'history loss {:.6f}'.format(total_loss / - num_seen_utts) - log_str += ' rank {}'.format(rank) - logging.debug(log_str) - return total_loss, num_seen_utts diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/file_utils.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/file_utils.py deleted file mode 100644 index 7b7e516cc61f759267f4ef09309ff0b45110a0c1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/file_utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re - - -def read_lists(list_file): - lists = [] - with open(list_file, 'r', encoding='utf8') as fin: - for line in fin: - lists.append(line.strip()) - return lists - - -def read_non_lang_symbols(non_lang_sym_path): - """read non-linguistic symbol from file. - - The file format is like below: - - {NOISE}\n - {BRK}\n - ... - - - Args: - non_lang_sym_path: non-linguistic symbol file path, None means no any - syms. 
- - """ - if non_lang_sym_path is None: - return None - else: - syms = read_lists(non_lang_sym_path) - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - for sym in syms: - if non_lang_syms_pattern.fullmatch(sym) is None: - class BadSymbolFormat(Exception): - pass - raise BadSymbolFormat( - "Non-linguistic symbols should be " - "formatted in {xxx}//[xxx], consider" - " modify '%s' to meet the requirment. " - "More details can be found in discussions here : " - "https://github.com/wenet-e2e/wenet/pull/819" % (sym)) - return syms - - -def read_symbol_table(symbol_table_file): - symbol_table = {} - with open(symbol_table_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - symbol_table[arr[0]] = int(arr[1]) - return symbol_table diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/init_model.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/init_model.py deleted file mode 100644 index 4a008183ee25cd88b2fa25d93bdc3f9e3a55d31a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/init_model.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -from wenet.transducer.joint import TransducerJoint -from wenet.transducer.predictor import (ConvPredictor, EmbeddingPredictor, - RNNPredictor) -from wenet.transducer.transducer import Transducer -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.cmvn import GlobalCMVN -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.encoder import ConformerEncoder, TransformerEncoder -from wenet.squeezeformer.encoder import SqueezeformerEncoder -from wenet.efficient_conformer.encoder import EfficientConformerEncoder -from wenet.utils.cmvn import load_cmvn - - -def init_model(configs): - if configs['cmvn_file'] is not None: - mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) - global_cmvn = GlobalCMVN( - torch.from_numpy(mean).float(), - torch.from_numpy(istd).float()) - else: - global_cmvn = None - - input_dim = configs['input_dim'] - vocab_size = configs['output_dim'] - - encoder_type = configs.get('encoder', 'conformer') - decoder_type = configs.get('decoder', 'bitransformer') - - if encoder_type == 'conformer': - encoder = ConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'squeezeformer': - encoder = SqueezeformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'efficientConformer': - encoder = EfficientConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf'], - **configs['encoder_conf']['efficient_conf'] - if 'efficient_conf' in - configs['encoder_conf'] else {}) - else: - encoder = TransformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - if decoder_type == 'transformer': - decoder = TransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - else: - assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 - assert configs['decoder_conf']['r_num_blocks'] > 0 - decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - ctc = CTC(vocab_size, encoder.output_size()) - - # Init joint CTC/Attention or Transducer model - if 'predictor' in configs: - predictor_type = configs.get('predictor', 'rnn') - if predictor_type == 'rnn': - predictor = RNNPredictor(vocab_size, **configs['predictor_conf']) - elif predictor_type == 'embedding': - predictor = EmbeddingPredictor(vocab_size, - **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - elif predictor_type == 'conv': - predictor = ConvPredictor(vocab_size, **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - else: - raise NotImplementedError( - "only rnn, embedding and conv type support now") - configs['joint_conf']['enc_output_size'] = configs['encoder_conf'][ - 'output_size'] - configs['joint_conf']['pred_output_size'] = configs['predictor_conf'][ - 'output_size'] - joint = TransducerJoint(vocab_size, **configs['joint_conf']) - model = Transducer(vocab_size=vocab_size, - blank=0, - predictor=predictor, - encoder=encoder, - attention_decoder=decoder, - joint=joint, - ctc=ctc, - **configs['model_conf']) - else: - model = ASRModel(vocab_size=vocab_size, - encoder=encoder, - decoder=decoder, - ctc=ctc, - **configs['model_conf']) - return model diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/mask.py 
b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/mask.py deleted file mode 100644 index 2985006ab2bc2d27a9b8adaeb863cc44ca6a0d24..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/mask.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -''' -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. - - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - ret = torch.ones(size, size, device=device, dtype=torch.bool) - return torch.tril(ret) -''' - -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. 
- - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - arange = torch.arange(size, device=device) - mask = arange.expand(size, size) - arange = arange.unsqueeze(-1) - mask = mask <= arange - return mask - - -def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size) with chunk size, - this is for streaming encoder - - Args: - size (int): size of mask - chunk_size (int): size of chunk - num_left_chunks (int): number of left chunks - <0: use full chunk - >=0: use num_left_chunks - device (torch.device): "cpu" or "cuda" or torch.Tensor.device - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_chunk_mask(4, 2) - [[1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]] - """ - ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) - ret[i, start:ending] = True - return ret - - -def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, - use_dynamic_chunk: bool, - use_dynamic_left_chunk: bool, - decoding_chunk_size: int, static_chunk_size: int, - num_decoding_left_chunks: int): - """ Apply optional mask for encoder. - - Args: - xs (torch.Tensor): padded input, (B, L, D), L for max length - mask (torch.Tensor): mask for xs, (B, 1, L) - use_dynamic_chunk (bool): whether to use dynamic chunk or not - use_dynamic_left_chunk (bool): whether to use dynamic left chunk for - training. - decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - static_chunk_size (int): chunk size for static chunk training/decoding - if it's greater than 0, if use_dynamic_chunk is true, - this parameter will be ignored - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. - >=0: use num_decoding_left_chunks - <0: use all left chunks - - Returns: - torch.Tensor: chunk mask of the input xs. - """ - # Whether to use chunk mask or not - if use_dynamic_chunk: - max_len = xs.size(1) - if decoding_chunk_size < 0: - chunk_size = max_len - num_left_chunks = -1 - elif decoding_chunk_size > 0: - chunk_size = decoding_chunk_size - num_left_chunks = num_decoding_left_chunks - else: - # chunk size is either [1, 25] or full context(max_len). - # Since we use 4 times subsampling and allow up to 1s(100 frames) - # delay, the maximum frame is 100 / 4 = 25. 
- chunk_size = torch.randint(1, max_len, (1, )).item() - num_left_chunks = -1 - if chunk_size > max_len // 2: - chunk_size = max_len - else: - chunk_size = chunk_size % 25 + 1 - if use_dynamic_left_chunk: - max_left_chunks = (max_len - 1) // chunk_size - num_left_chunks = torch.randint(0, max_left_chunks, - (1, )).item() - chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - elif static_chunk_size > 0: - num_left_chunks = num_decoding_left_chunks - chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - else: - chunk_masks = masks - return chunk_masks - - -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: Mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - batch_size = lengths.size(0) - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = torch.arange(0, - max_len, - dtype=torch.int64, - device=lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand - return mask - - -def make_non_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """Make mask tensor containing indices of non-padded part. - - The sequences in a batch may have different lengths. To enable - batch computing, padding is need to make all sequence in same - size. To avoid the padding part pass value to context dependent - block such as attention or convolution , this padding part is - masked. - - This pad_mask is used in both encoder and decoder. - - 1 for non-padded part and 0 for padded part. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] - """ - return ~make_pad_mask(lengths) - - -def mask_finished_scores(score: torch.Tensor, - flag: torch.Tensor) -> torch.Tensor: - """ - If a sequence is finished, we only allow one alive branch. This function - aims to give one branch a zero score and the rest -inf score. - - Args: - score (torch.Tensor): A real value array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size, beam_size). 
- """ - beam_size = score.size(-1) - zero_mask = torch.zeros_like(flag, dtype=torch.bool) - if beam_size > 1: - unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])), - dim=1) - finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])), - dim=1) - else: - unfinished = zero_mask - finished = flag - score.masked_fill_(unfinished, -float('inf')) - score.masked_fill_(finished, 0) - return score - - -def mask_finished_preds(pred: torch.Tensor, flag: torch.Tensor, - eos: int) -> torch.Tensor: - """ - If a sequence is finished, all of its branch should be - - Args: - pred (torch.Tensor): A int array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size). - """ - beam_size = pred.size(-1) - finished = flag.repeat([1, beam_size]) - return pred.masked_fill_(finished, eos) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/scheduler.py b/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/scheduler.py deleted file mode 100644 index c418a731dec0041a238787bbba23102dba8db5e5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/wsj/s0/wenet/utils/scheduler.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -# NeMo(https://github.com/NVIDIA/NeMo) - -from typing import Union - -import math -import warnings -import torch -from torch.optim.lr_scheduler import _LRScheduler - -from typeguard import check_argument_types - - -class WarmupLR(_LRScheduler): - """The WarmupLR scheduler - - This scheduler is almost same as NoamLR Scheduler except for following - difference: - - NoamLR: - lr = optimizer.lr * model_size ** -0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - WarmupLR: - lr = optimizer.lr * warmup_step ** 0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - - Note that the maximum lr equals to optimizer.lr in this scheduler. 
- - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_steps: Union[int, float] = 25000, - last_epoch: int = -1, - ): - assert check_argument_types() - self.warmup_steps = warmup_steps - - # __init__() must be invoked before setting field - # because step() is also invoked in __init__() - super().__init__(optimizer, last_epoch) - - def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" - - def get_lr(self): - step_num = self.last_epoch + 1 - if self.warmup_steps == 0: - return [ - lr * step_num ** -0.5 - for lr in self.base_lrs - ] - else: - return [ - lr - * self.warmup_steps ** 0.5 - * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) - for lr in self.base_lrs - ] - - def set_step(self, step: int): - self.last_epoch = step - - -class WarmupPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1): - assert not (warmup_steps is not None and warmup_ratio is not None),\ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class SquareRootConstantPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use particular number of step or ratio" - assert constant_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. 
- self.max_steps = max_steps - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.constant_lr = 1 / (constant_steps ** 0.5) - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.constant_steps: - return [self.constant_lr for _ in self.base_lrs] - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class WarmupHoldPolicy(WarmupPolicy): - """Variant of WarmupPolicy which maintains high - learning rate for a defined number of steps. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - hold_steps=None, - hold_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (hold_steps is not None and hold_ratio is not None), \ - "Either use particular number of step or ratio" - assert hold_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - self.min_lr = min_lr - self._last_warmup_lr = 0.0 - - # Necessary to duplicate as class attributes are hidden in inner class - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if hold_steps is not None: - self.hold_steps = hold_steps + self.warmup_steps - elif hold_ratio is not None: - self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps - else: - self.hold_steps = 0 - - super().__init__( - optimizer, - warmup_steps=warmup_steps, - warmup_ratio=warmup_ratio, - max_steps=max_steps, - last_epoch=last_epoch, - min_lr=min_lr, - ) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed by the scheduler," - " " "please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup phase - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - # Hold phase - if (step >= self.warmup_steps) and (step < self.hold_steps): - return self.base_lrs - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - -class WarmupAnnealHoldPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - min_lr: Minimum lr to hold the learning rate after decay at. - constant_steps: Number of steps to keep lr constant at. 
- constant_ratio: Ratio of steps to keep lr constant. - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - constant_steps=None, - constant_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use constant_steps or constant_ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup steps - if self.warmup_steps > 0 and step <= self.warmup_steps: - return self._get_warmup_lr(step) - - # Constant steps after warmup and decay - if self.constant_steps > 0 and ( - self.warmup_steps + self.decay_steps) < step <= self.max_steps: - return self._get_constant_lr(step) - - # Min lr after max steps of updates - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_constant_lr(self, step): - return [self.min_lr for _ in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -def _squareroot_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 0.5 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _square_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 2 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _cosine_annealing(initial_lr, step, max_steps, min_lr): - mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) - out_lr = (initial_lr - min_lr) * mult + min_lr - return out_lr - - -def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, - decay_steps, min_lr): - assert max_lr > min_lr - # Use linear warmup for the initial part. - if warmup_steps > 0 and step <= warmup_steps: - return max_lr * float(step) / float(warmup_steps) - - # For any steps larger than `decay_steps`, use `min_lr`. - if step > warmup_steps + decay_steps: - return min_lr - - # If we are done with the warmup period, use the decay style. 
- num_steps_ = step - warmup_steps - decay_steps_ = decay_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = max_lr - min_lr - - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - - return min_lr + coeff * delta_lr - - -def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): - if cycle: - multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) - decay_steps *= multiplier - else: - step = min(step, decay_steps) - p = step / decay_steps - lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) - lr += min_lr - return lr - - -def _noam_hold_annealing(initial_lr, step, warmup_steps, - hold_steps, decay_rate, min_lr): - # hold_steps = total number of steps - # to hold the LR, not the warmup + hold steps. - T_warmup_decay = max(1, warmup_steps ** decay_rate) - T_hold_decay = max(1, (step - hold_steps) ** decay_rate) - lr = (initial_lr * T_warmup_decay) / T_hold_decay - lr = max(lr, min_lr) - return lr - - -class SquareAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _square_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class SquareRootAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _squareroot_annealing(initial_lr=initial_lr, step=step, - max_steps=self.max_steps, min_lr=self.min_lr) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - if self.constant_steps is None or self.constant_steps == 0: - new_lrs = [ - _cosine_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - else: - new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) - return new_lrs - - def _get_warmup_lr(self, step): - if self.constant_steps is None or self.constant_steps == 0: - return super()._get_warmup_lr(step) - else: - # Use linear warmup for the initial part. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_constant_lr(self, step): - # Only called when `constant_steps` > 0. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_linear_warmup_with_cosine_annealing_lr(self, step): - # Cosine Schedule for Megatron LM, - # slightly different warmup schedule + constant LR at the end. 
- new_lrs = [ - _linear_warmup_with_cosine_annealing( - max_lr=self.base_lrs[0], - warmup_steps=self.warmup_steps, - step=step, - decay_steps=self.decay_steps, - min_lr=self.min_lr, - ) - for _ in self.base_lrs - ] - return new_lrs - - -class NoamAnnealing(_LRScheduler): - def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - self._normalize = d_model ** (-0.5) - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = max(1, self.last_epoch) - - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - new_lrs = [self._noam_annealing(initial_lr=initial_lr, step=step) for - initial_lr in self.base_lrs] - return new_lrs - - def _noam_annealing(self, initial_lr, step): - if self.warmup_steps > 0: - mult = self._normalize * min(step ** (-0.5), - step * (self.warmup_steps ** (-1.5))) - else: - mult = self._normalize * step ** (-0.5) - - out_lr = initial_lr * mult - if step > self.warmup_steps: - out_lr = max(out_lr, self.min_lr) - return out_lr - - -class NoamHoldAnnealing(WarmupHoldPolicy): - def __init__(self, optimizer, *, max_steps, decay_rate=0.5, min_lr=0.0, - last_epoch=-1, **kwargs): - """ - From Nemo: - Implementation of the Noam Hold Annealing policy - from the SqueezeFormer paper. - - Unlike NoamAnnealing, the peak learning rate - can be explicitly set for this scheduler. - The schedule first performs linear warmup, - then holds the peak LR, then decays with some schedule for - the remainder of the steps. - Therefore the min-lr is still dependent - on the hyper parameters selected. - - It's schedule is determined by three factors- - - Warmup Steps: Initial stage, where linear warmup - occurs uptil the peak LR is reached. Unlike NoamAnnealing, - the peak LR is explicitly stated here instead of a scaling factor. - - Hold Steps: Intermediate stage, where the peak LR - is maintained for some number of steps. In this region, - the high peak LR allows the model to converge faster - if training is stable. However the high LR - may also cause instability during training. - Should usually be a significant fraction of training - steps (around 30-40% of the entire training steps). - - Decay Steps: Final stage, where the LR rapidly decays - with some scaling rate (set by decay rate). - To attain Noam decay, use 0.5, - for Squeezeformer recommended decay, use 1.0. - The fast decay after prolonged high LR during - hold phase allows for rapid convergence. 
- - References: - - [Squeezeformer: - An Efficient Transformer for Automatic Speech Recognition] - (https://arxiv.org/abs/2206.00888) - - Args: - optimizer: Pytorch compatible Optimizer object. - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - decay_rate: Float value describing the polynomial decay - after the hold period. Default value - of 0.5 corresponds to Noam decay. - min_lr: Minimum learning rate. - """ - self.decay_rate = decay_rate - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - if self.warmup_steps is None or self.warmup_steps == 0: - raise ValueError( - "Noam scheduler cannot be used without warmup steps") - - if self.hold_steps > 0: - hold_steps = self.hold_steps - self.warmup_steps - else: - hold_steps = 0 - - new_lrs = [ - _noam_hold_annealing( - initial_lr, - step=step, - warmup_steps=self.warmup_steps, - hold_steps=hold_steps, - decay_rate=self.decay_rate, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - def set_step(self, step: int): - self.last_epoch = step diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/file_utils.py b/models/audio/speech_recognition/conformer/igie/wenet/file_utils.py similarity index 100% rename from models/audio/speech_recognition/conformer/igie/wenet/examples/aishell/rnnt/wenet/utils/file_utils.py rename to models/audio/speech_recognition/conformer/igie/wenet/file_utils.py diff --git a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/dataset/processor.py b/models/audio/speech_recognition/conformer/igie/wenet/processor.py similarity index 83% rename from models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/dataset/processor.py rename to models/audio/speech_recognition/conformer/igie/wenet/processor.py index b4bd07ce674eb3288cd1b13a09085eec48d40845..9a542a3d204cdb3def8cf61ce0b0fd8bb31af32e 100644 --- a/models/audio/speech_recognition/conformer/igie/wenet/examples/aishell2/rnnt/wenet/dataset/processor.py +++ b/models/audio/speech_recognition/conformer/igie/wenet/processor.py @@ -49,7 +49,7 @@ def url_opener(data): stream = open(url, 'rb') # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP else: - cmd = f'wget -q -O - {url}' + cmd = f'curl -s -L {url}' process = Popen(cmd, shell=True, stdout=PIPE) sample.update(process=process) stream = process.stdout @@ -282,43 +282,6 @@ def compute_fbank(data, yield dict(key=sample['key'], label=sample['label'], feat=mat) -def compute_mfcc(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0, - num_ceps=40, - high_freq=0.0, - low_freq=20.0): - """ Extract mfcc - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.mfcc(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - num_ceps=num_ceps, - 
high_freq=high_freq, - low_freq=low_freq, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - def __tokenize_by_bpe_model(sp, txt): tokens = [] # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: @@ -342,10 +305,7 @@ def __tokenize_by_bpe_model(sp, txt): return tokens -def tokenize(data, - symbol_table, - bpe_model=None, - non_lang_syms=None, +def tokenize(data, symbol_table, bpe_model=None, non_lang_syms=None, split_with_space=False): """ Decode text to chars or BPE Inplace operation @@ -443,58 +403,6 @@ def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80): yield sample -def spec_sub(data, max_t=20, num_t_sub=3): - """ Do spec substitute - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of time substitute - num_t_sub: number of time substitute to apply - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - for i in range(num_t_sub): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - # only substitute the earlier time chosen randomly for current time - pos = random.randint(0, start) - y[start:end, :] = x[start - pos:end - pos, :] - sample['feat'] = y - yield sample - - -def spec_trim(data, max_t=20): - """ Trim tailing frames. Inplace operation. - ref: TrimTail [https://arxiv.org/abs/2211.00522] - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of length trimming - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - max_frames = x.size(0) - length = random.randint(1, max_t) - if length < max_frames / 2: - y = x.clone().detach()[:max_frames - length] - sample['feat'] = y - yield sample - - def shuffle(data, shuffle_size=10000): """ Local shuffle the data @@ -634,24 +542,6 @@ def padding(data): padded_feats = pad_sequence(sorted_feats, batch_first=True, padding_value=0) - - pad = (0, 0, 0, 0) - seq_len= padded_feats.shape[1] - if seq_len < 384: - pad = (0, 0, 0, 384-seq_len) - elif seq_len < 512: - pad = (0, 0, 0, 512-seq_len) - elif seq_len < 640: - pad = (0, 0, 0, 640-seq_len) - elif seq_len < 768: - pad = (0, 0, 0, 768-seq_len) - elif seq_len < 896: - pad = (0, 0, 0, 896-seq_len) - elif seq_len < 1024: - pad = (0, 0, 0, 1024-seq_len) - elif seq_len < 1280: - pad = (0, 0, 0, 1280-seq_len) - padded_feats = torch.nn.functional.pad(padded_feats, pad, mode='constant', value=0) padding_labels = pad_sequence(sorted_labels, batch_first=True, padding_value=-1) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/requirements.txt b/models/audio/speech_recognition/conformer/igie/wenet/requirements.txt deleted file mode 100644 index 72c857232732836576c12ccf4a7e590f907370fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/requirements.txt +++ /dev/null @@ -1,16 +0,0 @@ -Pillow -pyyaml>=5.1 -sentencepiece -tensorboard -tensorboardX -typeguard -textgrid -pytest -flake8==3.8.2 -flake8-bugbear -flake8-comprehensions -flake8-executable -flake8-pyi==20.5.0 -mccabe -pycodestyle==2.6.0 -pyflakes==2.2.0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/README.md b/models/audio/speech_recognition/conformer/igie/wenet/runtime/README.md deleted file mode 100644 index 
8275416dcf01de842d9969c8fe666c4eed06ecf4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# Runtime on WeNet - -This is the runtime of WeNet. - -We are going to support the following platforms: - -1. Various deep learning inference engines, such as LibTorch, ONNX, OpenVINO, TVM, and so on. -2. Various OS, such as android, iOS, Harmony, and so on. -3. Various AI chips, such as GPU, Horzion BPU, and so on. -4. Various hardware platforms, such as Raspberry Pi. -5. Various language binding, such as python and go. - -Feel free to volunteer yourself if you are interested in trying out some items(they do not have to be on the list). - -## Introduction - -Here is a brief summary of all platforms and OSs. please note the corresponding working `OS` and `inference engine`. - -| runtime | OS | inference engine | Description | -|-----------------|---------------------|----------------------|--------------------------------------------------------------------------------------------------| -| core | / | / | common core code of all runtime | -| android | android | libtorch | android demo, [English demo](https://www.youtube.com/shorts/viEnvmZf03s ), [Chinese demo](TODO) | -| bingding/python | linux, windows, mac | libtorch | python binding of wenet, mac M1/M2 are is not supported now. | -| gpu | linux | onnxruntime/tensorrt | GPU inference with NV's Triton and TensorRT | -| horizonbpu | linux | bpu runtime | Horizon BPU runtime | -| ios | ios | libtorch | ios demo, [link](TODO) | -| kunlun | linux | xpu runtime | Kunlun XPU runtime | -| libtorch | linux, windows, mac | libtorch | c++ build with libtorch | -| onnxrutnime | linux, windows, mac | onnxruntime | c++ build with onnxruntime | -| raspberrypi | linux | onnxruntime | c++ build on raspberrypi with onnxruntime | -| web | linux, windows, mac | libtorch | web demo with gradio and python binding, [link]() | - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/README.md b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/README.md deleted file mode 100644 index 44fbf619a43c687e0c8132d2f79b3f3ce8bbdfe2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/README.md +++ /dev/null @@ -1,55 +0,0 @@ -# WeNet On-device ASR Android Demo - -This Android demo shows we can run on-device streaming ASR with WeNet. You can download our prebuilt APK or build your APK from source code. - -## Prebuilt APK - -* [Chinese ASR Demo APK, with model trained on AIShell data](http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/aishell/20210202_app.apk) -* [English ASR Demo APK, with model trained on GigaSpeech data](http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/gigaspeech/20210823_app.apk) - -## Build your APK from source code - -### 1) Build model - -You can use our pretrained model (click the following link to download): - -[中文(WenetSpeech)](https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/wenetspeech/wenetspeech_u2pp_conformer_libtorch_quant.tar.gz) -| [English(GigaSpeech)](https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/gigaspeech/gigaspeech_u2pp_conformer_libtorch_quant.tar.gz) - -Or you can train your own model using WeNet training pipeline on your data. - -### 2) Build APK - -When your model is ready, put `final.zip` and `units.txt` into Android assets (`app/src/main/assets`) folder, -then just build and run the APK. 
Here is a gif demo, which shows how our on-device streaming e2e ASR runs with low latency. -Please note the wifi and data has been disabled in the demo so there is no network connection ^\_^. - -![Runtime android demo](../../../../docs/images/runtime_android.gif) - -## Compute the RTF - -Step 1, connect your Android phone, and use `adb push` command to push your model, wav scp, and waves to the sdcard. - -Step 2, build the binary and the APK with Android Studio directly, or with the commands as follows: - -``` sh -cd runtime/android -./gradlew build -``` - -Step 3, push your binary and the dynamic library to `/data/local/tmp` as follows: - -``` sh -adb push app/.cxx/cmake/release/arm64-v8a/decoder_main /data/local/tmp -adb push app/build/pytorch_android-1.10.0.aar/jni/arm64-v8a/* /data/local/tmp -``` - -Step 4, change to the directory `/data/local/tmp` of your phone, and export the library path by: - -``` sh -adb shell -cd /data/local/tmp -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:. -``` - -Step 5, execute the same command as the [x86 demo](../../../libtorch) to run the binary to decode and compute the RTF. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/build.gradle b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/build.gradle deleted file mode 100644 index 8f760052d552cb6eff57419761b8cff9d8cd93d2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/build.gradle +++ /dev/null @@ -1,103 +0,0 @@ -plugins { - id 'com.android.application' -} - -repositories { - jcenter() - maven { - url "https://oss.sonatype.org/content/repositories/snapshots" - } -} - -android { - signingConfigs { - release { - storeFile file('wenet.keystore') - storePassword '123456' - keyAlias 'wenet' - keyPassword '123456' - } - } - packagingOptions { - pickFirst 'lib/arm64-v8a/libc++_shared.so' - } - configurations { - extractForNativeBuild - } - compileSdkVersion 30 - buildToolsVersion "30.0.3" - - defaultConfig { - applicationId "com.mobvoi.wenet" - minSdkVersion 21 - targetSdkVersion 30 - versionCode 1 - versionName "1.0" - - testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner" - externalNativeBuild { - cmake { - targets "wenet", "decoder_main" - cppFlags "-std=c++14", "-DC10_USE_GLOG", "-DC10_USE_MINIMAL_GLOG", "-DANDROID", "-Wno-c++11-narrowing", "-fexceptions" - } - } - - ndkVersion '21.1.6352462' - ndk { - abiFilters 'armeabi-v7a', 'arm64-v8a', 'x86', 'x86_64' - } - } - - buildTypes { - release { - minifyEnabled false - signingConfig signingConfigs.release - proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro' - } - } - externalNativeBuild { - cmake { - path "src/main/cpp/CMakeLists.txt" - } - } - compileOptions { - sourceCompatibility JavaVersion.VERSION_1_8 - targetCompatibility JavaVersion.VERSION_1_8 - } -} - -dependencies { - - implementation 'androidx.appcompat:appcompat:1.2.0' - implementation 'com.google.android.material:material:1.2.1' - implementation 'androidx.constraintlayout:constraintlayout:2.0.4' - testImplementation 'junit:junit:4.+' - androidTestImplementation 'androidx.test.ext:junit:1.1.2' - androidTestImplementation 'androidx.test.espresso:espresso-core:3.3.0' - - implementation 'org.pytorch:pytorch_android:1.10.0' - extractForNativeBuild 'org.pytorch:pytorch_android:1.10.0' - - implementation 'com.github.pengzhendong:wenet-openfst-android:1.0.2' - extractForNativeBuild 
'com.github.pengzhendong:wenet-openfst-android:1.0.2' -} - -task extractAARForNativeBuild { - doLast { - configurations.extractForNativeBuild.files.each { - def file = it.absoluteFile - copy { - from zipTree(file) - into "$buildDir/$file.name" - include "headers/**" - include "jni/**" - } - } - } -} - -tasks.whenTaskAdded { task -> - if (task.name.contains('externalNativeBuild')) { - task.dependsOn(extractAARForNativeBuild) - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/proguard-rules.pro b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/proguard-rules.pro deleted file mode 100644 index 481bb434814107eb79d7a30b676d344b0df2f8ce..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/proguard-rules.pro +++ /dev/null @@ -1,21 +0,0 @@ -# Add project specific ProGuard rules here. -# You can control the set of applied configuration files using the -# proguardFiles setting in build.gradle. -# -# For more details, see -# http://developer.android.com/guide/developing/tools/proguard.html - -# If your project uses WebView with JS, uncomment the following -# and specify the fully qualified class name to the JavaScript interface -# class: -#-keepclassmembers class fqcn.of.javascript.interface.for.webview { -# public *; -#} - -# Uncomment this to preserve the line number information for -# debugging stack traces. -#-keepattributes SourceFile,LineNumberTable - -# If you keep the line number information, uncomment this to -# hide the original source file name. -#-renamesourcefileattribute SourceFile \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/androidTest/java/com/mobvoi/wenet/ExampleInstrumentedTest.java b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/androidTest/java/com/mobvoi/wenet/ExampleInstrumentedTest.java deleted file mode 100644 index e1943606a656b95647a415d66aee79f0c97c2232..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/androidTest/java/com/mobvoi/wenet/ExampleInstrumentedTest.java +++ /dev/null @@ -1,26 +0,0 @@ -package com.mobvoi.wenet; - -import android.content.Context; - -import androidx.test.platform.app.InstrumentationRegistry; -import androidx.test.ext.junit.runners.AndroidJUnit4; - -import org.junit.Test; -import org.junit.runner.RunWith; - -import static org.junit.Assert.*; - -/** - * Instrumented test, which will execute on an Android device. - * - * @see Testing documentation - */ -@RunWith(AndroidJUnit4.class) -public class ExampleInstrumentedTest { - @Test - public void useAppContext() { - // Context of the app under test. 
- Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext(); - assertEquals("com.mobvoi.wenet", appContext.getPackageName()); - } -} \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/AndroidManifest.xml b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/AndroidManifest.xml deleted file mode 100644 index 6da613a4c20b330f30853241ac1a30501e6d5cc9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/AndroidManifest.xml +++ /dev/null @@ -1,23 +0,0 @@ - - - - - - - - - - - - - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/assets/README.md b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/assets/README.md deleted file mode 100644 index 2d78237d0f670185692a1a292c7acdedd4730e90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/assets/README.md +++ /dev/null @@ -1 +0,0 @@ -put final.zip and units.txt here. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/CMakeLists.txt deleted file mode 100644 index d548d38dde926240ede8c090bd2aa4663b166102..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/CMakeLists.txt +++ /dev/null @@ -1,32 +0,0 @@ -cmake_minimum_required(VERSION 3.4.1) -set(TARGET wenet) -project(${TARGET} CXX) -set(CMAKE_CXX_STANDARD 14) -include(ExternalProject) - -option(TORCH "whether to build with Torch" ON) -option(ONNX "whether to build with ONNX" OFF) -set(CMAKE_VERBOSE_MAKEFILE on) -set(build_DIR ${CMAKE_SOURCE_DIR}/../../../build) -list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) -string(REPLACE "-Wl,--exclude-libs,libgcc_real.a" "" CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") - -include(libtorch) -include(openfst) - -include_directories( - ${CMAKE_SOURCE_DIR} - ${CMAKE_SOURCE_DIR}/kaldi -) - -add_subdirectory(utils) -add_subdirectory(frontend) -add_subdirectory(post_processor) -add_subdirectory(kaldi) # kaldi: wfst based decoder -add_subdirectory(decoder) - -link_libraries(frontend decoder android) -add_library(${TARGET} SHARED wenet.cc) - -add_executable(decoder_main bin/decoder_main.cc) -target_link_libraries(decoder_main PUBLIC libc++_shared.so) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/CMakeLists.txt deleted file mode 100644 index a117b8bcb580c8738a7ce72f88bc10ff0a450e98..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -add_executable(decoder_main decoder_main.cc) -target_link_libraries(decoder_main PUBLIC decoder) - -add_executable(label_checker_main label_checker_main.cc) -target_link_libraries(label_checker_main PUBLIC decoder) - -# if(TORCH) -# add_executable(api_main api_main.cc) -# target_link_libraries(api_main PUBLIC wenet_api) -# endif() - -if(WEBSOCKET) - add_executable(websocket_client_main websocket_client_main.cc) - 
target_link_libraries(websocket_client_main PUBLIC websocket) - add_executable(websocket_server_main websocket_server_main.cc) - target_link_libraries(websocket_server_main PUBLIC websocket) -endif() - -if(GRPC) - add_executable(grpc_server_main grpc_server_main.cc) - target_link_libraries(grpc_server_main PUBLIC wenet_grpc) - add_executable(grpc_client_main grpc_client_main.cc) - target_link_libraries(grpc_client_main PUBLIC wenet_grpc) -endif() - -if(HTTP) - add_executable(http_client_main http_client_main.cc) - target_link_libraries(http_client_main PUBLIC http) - add_executable(http_server_main http_server_main.cc) - target_link_libraries(http_server_main PUBLIC http) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/api_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/api_main.cc deleted file mode 100644 index 94b20d52a7b8eee5c39a12af4e1e25324d7d880f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/api_main.cc +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "api/wenet_api.h" -#include "frontend/wav.h" -#include "utils/flags.h" - -DEFINE_string(model_dir, "", "model dir path"); -DEFINE_string(wav_path, "", "single wave path"); -DEFINE_bool(enable_timestamp, false, "enable timestamps"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - wenet_set_log_level(2); - - void* decoder = wenet_init(FLAGS_model_dir.c_str()); - wenet_set_timestamp(decoder, FLAGS_enable_timestamp == true ? 1 : 0); - wenet::WavReader wav_reader(FLAGS_wav_path); - std::vector data(wav_reader.num_samples()); - for (int i = 0; i < wav_reader.num_samples(); i++) { - data[i] = static_cast(*(wav_reader.data() + i)); - } - - for (int i = 0; i < 10; i++) { - // Return the final result when last is 1 - wenet_decode(decoder, reinterpret_cast(data.data()), - data.size() * 2, 1); - const char* result = wenet_get_result(decoder); - LOG(INFO) << i << " " << result; - wenet_reset(decoder); - } - wenet_free(decoder); - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/decoder_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/decoder_main.cc deleted file mode 100644 index b8f1dbae6b88390504cc9ce63f33dc9bd54a2d6a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/decoder_main.cc +++ /dev/null @@ -1,185 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include - -#include "decoder/params.h" -#include "frontend/wav.h" -#include "utils/flags.h" -#include "utils/string.h" -#include "utils/thread_pool.h" -#include "utils/timer.h" -#include "utils/utils.h" - -DEFINE_bool(simulate_streaming, false, "simulate streaming input"); -DEFINE_bool(output_nbest, false, "output n-best of decode result"); -DEFINE_string(wav_path, "", "single wave path"); -DEFINE_string(wav_scp, "", "input wav scp"); -DEFINE_string(result, "", "result output file"); -DEFINE_bool(continuous_decoding, false, "continuous decoding mode"); -DEFINE_int32(thread_num, 1, "num of decode thread"); -DEFINE_int32(warmup, 0, "num of warmup decode, 0 means no warmup"); - -std::shared_ptr g_decode_config; -std::shared_ptr g_feature_config; -std::shared_ptr g_decode_resource; - -std::ofstream g_result; -std::mutex g_mutex; -int g_total_waves_dur = 0; -int g_total_decode_time = 0; - -void decode(std::pair wav, bool warmup = false) { - wenet::WavReader wav_reader(wav.second); - int num_samples = wav_reader.num_samples(); - CHECK_EQ(wav_reader.sample_rate(), FLAGS_sample_rate); - - auto feature_pipeline = - std::make_shared(*g_feature_config); - feature_pipeline->AcceptWaveform(wav_reader.data(), num_samples); - feature_pipeline->set_input_finished(); - LOG(INFO) << "num frames " << feature_pipeline->num_frames(); - - wenet::AsrDecoder decoder(feature_pipeline, g_decode_resource, - *g_decode_config); - - int wave_dur = static_cast(static_cast(num_samples) / - wav_reader.sample_rate() * 1000); - int decode_time = 0; - std::string final_result; - while (true) { - wenet::Timer timer; - wenet::DecodeState state = decoder.Decode(); - if (state == wenet::DecodeState::kEndFeats) { - decoder.Rescoring(); - } - int chunk_decode_time = timer.Elapsed(); - decode_time += chunk_decode_time; - if (decoder.DecodedSomething()) { - LOG(INFO) << "Partial result: " << decoder.result()[0].sentence; - } - - if (FLAGS_continuous_decoding && state == wenet::DecodeState::kEndpoint) { - if (decoder.DecodedSomething()) { - decoder.Rescoring(); - LOG(INFO) << "Final result (continuous decoding): " - << decoder.result()[0].sentence; - final_result.append(decoder.result()[0].sentence); - } - decoder.ResetContinuousDecoding(); - } - - if (state == wenet::DecodeState::kEndFeats) { - break; - } else if (FLAGS_chunk_size > 0 && FLAGS_simulate_streaming) { - float frame_shift_in_ms = - static_cast(g_feature_config->frame_shift) / - wav_reader.sample_rate() * 1000; - auto wait_time = - decoder.num_frames_in_current_chunk() * frame_shift_in_ms - - chunk_decode_time; - if (wait_time > 0) { - LOG(INFO) << "Simulate streaming, waiting for " << wait_time << "ms"; - std::this_thread::sleep_for( - std::chrono::milliseconds(static_cast(wait_time))); - } - } - } - if (decoder.DecodedSomething()) { - final_result.append(decoder.result()[0].sentence); - } - LOG(INFO) << wav.first << " Final result: " << final_result << std::endl; - LOG(INFO) << "Decoded " << wave_dur << "ms audio taken " << decode_time - << "ms."; - - if (!warmup) { - g_mutex.lock(); - std::ostream& buffer = FLAGS_result.empty() 
? std::cout : g_result; - if (!FLAGS_output_nbest) { - buffer << wav.first << " " << final_result << std::endl; - } else { - buffer << "wav " << wav.first << std::endl; - auto& results = decoder.result(); - for (auto& r : results) { - if (r.sentence.empty()) continue; - buffer << "candidate " << r.score << " " << r.sentence << std::endl; - } - } - g_total_waves_dur += wave_dur; - g_total_decode_time += decode_time; - g_mutex.unlock(); - } -} - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - g_decode_config = wenet::InitDecodeOptionsFromFlags(); - g_feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - g_decode_resource = wenet::InitDecodeResourceFromFlags(); - - if (FLAGS_wav_path.empty() && FLAGS_wav_scp.empty()) { - LOG(FATAL) << "Please provide the wave path or the wav scp."; - } - std::vector> waves; - if (!FLAGS_wav_path.empty()) { - waves.emplace_back(make_pair("test", FLAGS_wav_path)); - } else { - std::ifstream wav_scp(FLAGS_wav_scp); - std::string line; - while (getline(wav_scp, line)) { - std::vector strs; - wenet::SplitString(line, &strs); - CHECK_GE(strs.size(), 2); - waves.emplace_back(make_pair(strs[0], strs[1])); - } - - if (waves.empty()) { - LOG(FATAL) << "Please provide non-empty wav scp."; - } - } - - if (!FLAGS_result.empty()) { - g_result.open(FLAGS_result, std::ios::out); - } - - // Warmup - if (FLAGS_warmup > 0) { - LOG(INFO) << "Warming up..."; - { - ThreadPool pool(FLAGS_thread_num); - auto wav = waves[0]; - for (int i = 0; i < FLAGS_warmup; i++) { - pool.enqueue(decode, wav, true); - } - } - LOG(INFO) << "Warmup done."; - } - - { - ThreadPool pool(FLAGS_thread_num); - for (auto& wav : waves) { - pool.enqueue(decode, wav, false); - } - } - - LOG(INFO) << "Total: decoded " << g_total_waves_dur << "ms audio taken " - << g_total_decode_time << "ms."; - LOG(INFO) << "RTF: " << std::setprecision(4) - << static_cast(g_total_decode_time) / g_total_waves_dur; - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/grpc_client_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/grpc_client_main.cc deleted file mode 100644 index f2d226d48d3757c5f095335eff3288f5d227282b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/grpc_client_main.cc +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "frontend/wav.h" -#include "grpc/grpc_client.h" -#include "utils/flags.h" -#include "utils/timer.h" - -DEFINE_string(hostname, "127.0.0.1", "hostname of websocket server"); -DEFINE_int32(port, 10086, "port of websocket server"); -DEFINE_int32(nbest, 1, "n-best of decode result"); -DEFINE_string(wav_path, "", "test wav file path"); -DEFINE_bool(continuous_decoding, false, "continuous decoding mode"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - wenet::GrpcClient client(FLAGS_hostname, FLAGS_port, FLAGS_nbest, - FLAGS_continuous_decoding); - - wenet::WavReader wav_reader(FLAGS_wav_path); - const int sample_rate = 16000; - // Only support 16K - CHECK_EQ(wav_reader.sample_rate(), sample_rate); - const int num_samples = wav_reader.num_samples(); - std::vector pcm_data(wav_reader.data(), - wav_reader.data() + num_samples); - // Send data every 0.5 second - const float interval = 0.5; - const int sample_interval = interval * sample_rate; - for (int start = 0; start < num_samples; start += sample_interval) { - if (client.done()) { - break; - } - int end = std::min(start + sample_interval, num_samples); - // Convert to short - std::vector data; - data.reserve(end - start); - for (int j = start; j < end; j++) { - data.push_back(static_cast(pcm_data[j])); - } - // Send PCM data - client.SendBinaryData(data.data(), data.size() * sizeof(int16_t)); - VLOG(2) << "Send " << data.size() << " samples"; - std::this_thread::sleep_for( - std::chrono::milliseconds(static_cast(interval * 1000))); - } - wenet::Timer timer; - - client.Join(); - VLOG(2) << "Total latency: " << timer.Elapsed() << "ms."; - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/grpc_server_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/grpc_server_main.cc deleted file mode 100644 index b00f3cbade1ee70dadfb49829e9ca73fd50c2be2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/grpc_server_main.cc +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include - -#include "decoder/params.h" -#include "grpc/grpc_server.h" -#include "utils/log.h" - -DEFINE_int32(port, 10086, "grpc listening port"); -DEFINE_int32(workers, 4, "grpc num workers"); - -using grpc::Server; -using grpc::ServerBuilder; - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - auto decode_config = wenet::InitDecodeOptionsFromFlags(); - auto feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - auto decode_resource = wenet::InitDecodeResourceFromFlags(); - - wenet::GrpcServer service(feature_config, decode_config, decode_resource); - grpc::EnableDefaultHealthCheckService(true); - grpc::reflection::InitProtoReflectionServerBuilderPlugin(); - ServerBuilder builder; - std::string address("0.0.0.0:" + std::to_string(FLAGS_port)); - builder.AddListeningPort(address, grpc::InsecureServerCredentials()); - builder.RegisterService(&service); - builder.SetSyncServerOption(ServerBuilder::SyncServerOption::NUM_CQS, - FLAGS_workers); - std::unique_ptr server(builder.BuildAndStart()); - LOG(INFO) << "Listening at port " << FLAGS_port; - server->Wait(); - google::ShutdownGoogleLogging(); - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/http_client_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/http_client_main.cc deleted file mode 100644 index b59ee3f5f32bf08552416b183802029ac5d5afa5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/http_client_main.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2023 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "frontend/wav.h" -#include "utils/flags.h" -#include "utils/timer.h" -#include "http/http_client.h" - -DEFINE_string(hostname, "127.0.0.1", "hostname of http server"); -DEFINE_int32(port, 10086, "port of http server"); -DEFINE_int32(nbest, 1, "n-best of decode result"); -DEFINE_string(wav_path, "", "test wav file path"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - wenet::WavReader wav_reader(FLAGS_wav_path); - const int sample_rate = 16000; - // Only support 16K - CHECK_EQ(wav_reader.sample_rate(), sample_rate); - const int num_samples = wav_reader.num_samples(); - // Convert to short - std::vector data; - data.reserve(num_samples); - for (int j = 0; j < num_samples; j++) { - data.push_back(static_cast(wav_reader.data()[j])); - } - // Send data - wenet::HttpClient client(FLAGS_hostname, FLAGS_port); - client.set_nbest(FLAGS_nbest); - wenet::Timer timer; - VLOG(2) << "Send " << data.size() << " samples"; - client.SendBinaryData(data.data(), data.size() * sizeof(int16_t)); - VLOG(2) << "Total latency: " << timer.Elapsed() << "ms."; - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/http_server_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/http_server_main.cc deleted file mode 100644 index e30cf2bcdf746c2072f023e90f470ccba5467c2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/http_server_main.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2023 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "decoder/params.h" -#include "utils/log.h" -#include "http/http_server.h" - -DEFINE_int32(port, 10086, "http listening port"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - auto decode_config = wenet::InitDecodeOptionsFromFlags(); - auto feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - auto decode_resource = wenet::InitDecodeResourceFromFlags(); - - wenet::HttpServer server(FLAGS_port, feature_config, decode_config, - decode_resource); - LOG(INFO) << "Listening at port " << FLAGS_port; - server.Start(); - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/label_checker_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/label_checker_main.cc deleted file mode 100644 index e36e3d5c29a38a7ebee80606ebd8e69ae8b1eb96..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/label_checker_main.cc +++ /dev/null @@ -1,237 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include -#include -#include -#include - -#include "decoder/params.h" -#include "frontend/wav.h" -#include "utils/flags.h" -#include "utils/string.h" - -DEFINE_string(text, "", "kaldi style text input file"); -DEFINE_string(wav_scp, "", "kaldi style wav scp"); -DEFINE_double(is_penalty, 1.0, - "insertion/substitution penalty for align insertion"); -DEFINE_double(del_penalty, 1.0, "deletion penalty for align insertion"); -DEFINE_string(result, "", "result output file"); -DEFINE_string(timestamp, "", "timestamp output file"); - -namespace wenet { - -const char* kDeletion = ""; -// Is: Insertion and substitution -const char* kIsStart = ""; -const char* kIsEnd = ""; - -bool MapToLabel(const std::string& text, - std::shared_ptr symbol_table, - std::vector* labels) { - labels->clear(); - // Split label to char sequence - std::vector chars; - SplitUTF8StringToChars(text, &chars); - for (size_t i = 0; i < chars.size(); i++) { - // ▁ is special symbol for white space - std::string label = chars[i] != " " ? 
chars[i] : "▁"; - int id = symbol_table->Find(label); - if (id != -1) { // fst::kNoSymbol - // LOG(INFO) << label << " " << id; - labels->push_back(id); - } - } - return true; -} - -std::shared_ptr MakeSymbolTableForFst( - std::shared_ptr isymbol_table) { - LOG(INFO) << isymbol_table; - CHECK(isymbol_table != nullptr); - auto osymbol_table = std::make_shared(); - osymbol_table->AddSymbol("", 0); - CHECK_EQ(isymbol_table->Find(""), 0); - osymbol_table->AddSymbol("", 1); - for (int i = 1; i < isymbol_table->NumSymbols(); i++) { - std::string symbol = isymbol_table->Find(i); - osymbol_table->AddSymbol(symbol, i + 1); - } - osymbol_table->AddSymbol(kDeletion, isymbol_table->NumSymbols() + 1); - osymbol_table->AddSymbol(kIsStart, isymbol_table->NumSymbols() + 2); - osymbol_table->AddSymbol(kIsEnd, isymbol_table->NumSymbols() + 3); - return osymbol_table; -} - -void CompileCtcFst(std::shared_ptr symbol_table, - fst::StdVectorFst* ofst) { - ofst->DeleteStates(); - int start = ofst->AddState(); - ofst->SetStart(start); - CHECK_EQ(symbol_table->Find(""), 0); - CHECK_EQ(symbol_table->Find(""), 1); - ofst->AddArc(start, fst::StdArc(1, 0, 0.0, start)); - // Exclude kDeletion and kInsertion - for (int i = 2; i < symbol_table->NumSymbols() - 3; i++) { - int s = ofst->AddState(); - ofst->AddArc(start, fst::StdArc(i, i, 0.0, s)); - ofst->AddArc(s, fst::StdArc(i, 0, 0.0, s)); - ofst->AddArc(s, fst::StdArc(0, 0, 0.0, start)); - } - ofst->SetFinal(start, fst::StdArc::Weight::One()); - fst::ArcSort(ofst, fst::StdOLabelCompare()); -} - -void CompileAlignFst(std::vector labels, - std::shared_ptr symbol_table, - fst::StdVectorFst* ofst) { - ofst->DeleteStates(); - int deletion = symbol_table->Find(kDeletion); - int insertion_start = symbol_table->Find(kIsStart); - int insertion_end = symbol_table->Find(kIsEnd); - - int start = ofst->AddState(); - ofst->SetStart(start); - // Filler State - int filler_start = ofst->AddState(); - int filler_end = ofst->AddState(); - for (int i = 2; i < symbol_table->NumSymbols() - 3; i++) { - ofst->AddArc(filler_start, fst::StdArc(i, i, FLAGS_is_penalty, filler_end)); - } - ofst->AddArc(filler_end, fst::StdArc(0, 0, 0.0, filler_start)); - - int prev = start; - // Alignment path and optional filler - for (size_t i = 0; i < labels.size(); i++) { - int cur = ofst->AddState(); - // 1. Insertion or Substitution - ofst->AddArc(prev, fst::StdArc(0, insertion_start, 0.0, filler_start)); - ofst->AddArc(filler_end, fst::StdArc(0, insertion_end, 0.0, prev)); - // 2. Correct - ofst->AddArc(prev, fst::StdArc(labels[i], labels[i], 0.0, cur)); - // 3. 
Deletion - ofst->AddArc(prev, fst::StdArc(0, deletion, FLAGS_del_penalty, cur)); - - prev = cur; - } - // Optional add endding filler - ofst->AddArc(prev, fst::StdArc(0, insertion_start, 0.0, filler_start)); - ofst->AddArc(filler_end, fst::StdArc(0, insertion_end, 0.0, prev)); - ofst->SetFinal(prev, fst::StdArc::Weight::One()); - fst::ArcSort(ofst, fst::StdILabelCompare()); -} - -} // namespace wenet - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - auto decode_config = wenet::InitDecodeOptionsFromFlags(); - auto feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - auto decode_resource = wenet::InitDecodeResourceFromFlags(); - CHECK(decode_resource->unit_table != nullptr); - - auto wfst_symbol_table = - wenet::MakeSymbolTableForFst(decode_resource->unit_table); - // wfst_symbol_table->WriteText("fst.txt"); - // Reset symbol_table to on-the-fly generated wfst_symbol_table - decode_resource->symbol_table = wfst_symbol_table; - - // Compile ctc FST - fst::StdVectorFst ctc_fst; - wenet::CompileCtcFst(wfst_symbol_table, &ctc_fst); - // ctc_fst.Write("ctc.fst"); - - std::unordered_map wav_table; - std::ifstream wav_is(FLAGS_wav_scp); - std::string line; - while (std::getline(wav_is, line)) { - std::vector strs; - wenet::SplitString(line, &strs); - CHECK_EQ(strs.size(), 2); - wav_table[strs[0]] = strs[1]; - } - - std::ifstream text_is(FLAGS_text); - std::ofstream result_os(FLAGS_result, std::ios::out); - std::ofstream timestamp_out; - if (!FLAGS_timestamp.empty()) { - timestamp_out.open(FLAGS_timestamp, std::ios::out); - } - std::ostream& timestamp_os = - FLAGS_timestamp.empty() ? std::cout : timestamp_out; - - while (std::getline(text_is, line)) { - std::vector strs; - wenet::SplitString(line, &strs); - if (strs.size() < 2) continue; - std::string key = strs[0]; - LOG(INFO) << "Processing " << key; - if (wav_table.find(key) != wav_table.end()) { - strs.erase(strs.begin()); - std::string text = wenet::JoinString(" ", strs); - std::vector labels; - wenet::MapToLabel(text, wfst_symbol_table, &labels); - // Prepare FST for alignment decoding - fst::StdVectorFst align_fst; - wenet::CompileAlignFst(labels, wfst_symbol_table, &align_fst); - // align_fst.Write("align.fst"); - auto decoding_fst = std::make_shared(); - fst::Compose(ctc_fst, align_fst, decoding_fst.get()); - // decoding_fst->Write("decoding.fst"); - // Preapre feature pipeline - wenet::WavReader wav_reader; - if (!wav_reader.Open(wav_table[key])) { - LOG(WARNING) << "Error in reading " << wav_table[key]; - continue; - } - int num_samples = wav_reader.num_samples(); - CHECK_EQ(wav_reader.sample_rate(), FLAGS_sample_rate); - auto feature_pipeline = - std::make_shared(*feature_config); - feature_pipeline->AcceptWaveform(wav_reader.data(), num_samples); - feature_pipeline->set_input_finished(); - decode_resource->fst = decoding_fst; - LOG(INFO) << "num frames " << feature_pipeline->num_frames(); - wenet::AsrDecoder decoder(feature_pipeline, decode_resource, - *decode_config); - while (true) { - wenet::DecodeState state = decoder.Decode(); - if (state == wenet::DecodeState::kEndFeats) { - decoder.Rescoring(); - break; - } - } - std::string final_result; - std::string timestamp_str; - if (decoder.DecodedSomething()) { - const wenet::DecodeResult& result = decoder.result()[0]; - final_result = result.sentence; - std::stringstream ss; - for (const auto& w : result.word_pieces) { - ss << " " << w.word << " " << w.start << " " << w.end; - } - timestamp_str 
= ss.str(); - } - result_os << key << " " << final_result << std::endl; - timestamp_os << key << " " << timestamp_str << std::endl; - LOG(INFO) << key << " " << final_result; - } else { - LOG(WARNING) << "No wav file for " << key; - } - } - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/websocket_client_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/websocket_client_main.cc deleted file mode 100644 index 3eaa96069dc5f57673fbb2819bf7d4883e0d5ffa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/websocket_client_main.cc +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "frontend/wav.h" -#include "utils/flags.h" -#include "utils/timer.h" -#include "websocket/websocket_client.h" - -DEFINE_string(hostname, "127.0.0.1", "hostname of websocket server"); -DEFINE_int32(port, 10086, "port of websocket server"); -DEFINE_int32(nbest, 1, "n-best of decode result"); -DEFINE_string(wav_path, "", "test wav file path"); -DEFINE_bool(continuous_decoding, false, "continuous decoding mode"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - wenet::WebSocketClient client(FLAGS_hostname, FLAGS_port); - client.set_nbest(FLAGS_nbest); - client.set_continuous_decoding(FLAGS_continuous_decoding); - client.SendStartSignal(); - - wenet::WavReader wav_reader(FLAGS_wav_path); - const int sample_rate = 16000; - // Only support 16K - CHECK_EQ(wav_reader.sample_rate(), sample_rate); - const int num_samples = wav_reader.num_samples(); - // Send data every 0.5 second - const float interval = 0.5; - const int sample_interval = interval * sample_rate; - for (int start = 0; start < num_samples; start += sample_interval) { - if (client.done()) { - break; - } - int end = std::min(start + sample_interval, num_samples); - // Convert to short - std::vector data; - data.reserve(end - start); - for (int j = start; j < end; j++) { - data.push_back(static_cast(wav_reader.data()[j])); - } - // TODO(Binbin Zhang): Network order? 
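The websocket_client_main.cc removed here (its send loop continues just below) streams the waveform in 0.5 s chunks and sleeps between sends so the server receives audio at roughly real-time rate. A standalone sketch of that pacing loop, with the network call stubbed out behind a callback:

```cpp
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <thread>
#include <vector>

// Illustrative pacing loop: send 0.5 s of 16 kHz PCM, then sleep for the same
// wall-clock interval. `send` is a stand-in for client.SendBinaryData().
void StreamInChunks(const std::vector<int16_t>& samples,
                    const std::function<void(const int16_t*, std::size_t)>& send) {
  const int sample_rate = 16000;
  const float interval = 0.5f;
  const std::size_t sample_interval =
      static_cast<std::size_t>(interval * sample_rate);
  for (std::size_t start = 0; start < samples.size(); start += sample_interval) {
    std::size_t end = std::min(start + sample_interval, samples.size());
    send(samples.data() + start, end - start);
    std::this_thread::sleep_for(
        std::chrono::milliseconds(static_cast<int>(interval * 1000)));
  }
}
```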
- // Send PCM data - client.SendBinaryData(data.data(), data.size() * sizeof(int16_t)); - VLOG(2) << "Send " << data.size() << " samples"; - std::this_thread::sleep_for( - std::chrono::milliseconds(static_cast(interval * 1000))); - } - wenet::Timer timer; - client.SendEndSignal(); - client.Join(); - VLOG(2) << "Total latency: " << timer.Elapsed() << "ms."; - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/websocket_server_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/websocket_server_main.cc deleted file mode 100644 index 796d9d2e6d151f7c08b43d66b7245c58ee086cc2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/bin/websocket_server_main.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "decoder/params.h" -#include "utils/log.h" -#include "websocket/websocket_server.h" - -DEFINE_int32(port, 10086, "websocket listening port"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - auto decode_config = wenet::InitDecodeOptionsFromFlags(); - auto feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - auto decode_resource = wenet::InitDecodeResourceFromFlags(); - - wenet::WebSocketServer server(FLAGS_port, feature_config, decode_config, - decode_resource); - LOG(INFO) << "Listening at port " << FLAGS_port; - server.Start(); - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/boost.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/boost.cmake deleted file mode 100644 index 8684c0ec43960da213da923dc57416f04301ea2b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/boost.cmake +++ /dev/null @@ -1,10 +0,0 @@ -FetchContent_Declare(boost - URL https://boostorg.jfrog.io/artifactory/main/release/1.75.0/source/boost_1_75_0.tar.gz - URL_HASH SHA256=aeb26f80e80945e82ee93e5939baebdca47b9dee80a07d3144be1e1a6a66dd6a -) -FetchContent_MakeAvailable(boost) -include_directories(${boost_SOURCE_DIR}) - -if(MSVC) - add_definitions(-DBOOST_ALL_DYN_LINK -DBOOST_ALL_NO_LIB) -endif() \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/bpu.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/bpu.cmake deleted file mode 100644 index 350d76c19d6f656fb130de09877d649cf49972a4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/bpu.cmake +++ /dev/null @@ -1,30 +0,0 @@ -if(BPU) - if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - 
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") - set(EASY_DNN_URL "https://github.com/xingchensong/toolchain_pkg/releases/download/easy_dnn/easy_dnn.0.4.11.tar.gz") - set(URL_HASH "SHA256=a1a6f77d1baae7181d75ec5d37a2ee529ac4e1c4400babd6ceb1c007392a4904") - else() - message(FATAL_ERROR "Unsupported CMake System Processor '${CMAKE_SYSTEM_PROCESSOR}' (expected 'aarch64')") - endif() - else() - message(FATAL_ERROR "Unsupported CMake System Name '${CMAKE_SYSTEM_NAME}' (expected 'Linux')") - endif() - - FetchContent_Declare(easy_dnn - URL ${EASY_DNN_URL} - URL_HASH ${URL_HASH} - ) - FetchContent_MakeAvailable(easy_dnn) - include_directories(${easy_dnn_SOURCE_DIR}/easy_dnn/0.4.11_linux_aarch64-j3_hobot_gcc6.5.0/files/easy_dnn/include) - include_directories(${easy_dnn_SOURCE_DIR}/dnn/1.7.0_linux_aarch64-j3_hobot_gcc6.5.0/files/dnn/include) - include_directories(${easy_dnn_SOURCE_DIR}/hlog/0.4.7_linux_aarch64-j3_hobot_gcc6.5.0/files/hlog/include) - link_directories(${easy_dnn_SOURCE_DIR}/easy_dnn/0.4.11_linux_aarch64-j3_hobot_gcc6.5.0/files/easy_dnn/lib) - link_directories(${easy_dnn_SOURCE_DIR}/dnn/1.7.0_linux_aarch64-j3_hobot_gcc6.5.0/files/dnn/lib) - link_directories(${easy_dnn_SOURCE_DIR}/hlog/0.4.7_linux_aarch64-j3_hobot_gcc6.5.0/files/hlog/lib) - - add_definitions(-DUSE_BPU) - # NOTE(xcsong): Reasons for adding flag `-fuse-ld=gold`: - # https://stackoverflow.com/questions/59915966/unknown-gcc-linker-error-but-builds-sucessfully/59916438#59916438 - # https://github.com/tensorflow/tensorflow/issues/47849 - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fuse-ld=gold") -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/gflags.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/gflags.cmake deleted file mode 100644 index 53ae5763b5a8c860b7e64d35b380eee5429f539d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/gflags.cmake +++ /dev/null @@ -1,6 +0,0 @@ -FetchContent_Declare(gflags - URL https://github.com/gflags/gflags/archive/v2.2.2.zip - URL_HASH SHA256=19713a36c9f32b33df59d1c79b4958434cb005b5b47dc5400a7a4b078111d9b5 -) -FetchContent_MakeAvailable(gflags) -include_directories(${gflags_BINARY_DIR}/include) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/glog.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/glog.cmake deleted file mode 100644 index 447ab4132f669ee2c3a52c37959dd684a39ff21b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/glog.cmake +++ /dev/null @@ -1,6 +0,0 @@ -FetchContent_Declare(glog - URL https://github.com/google/glog/archive/v0.4.0.zip - URL_HASH SHA256=9e1b54eb2782f53cd8af107ecf08d2ab64b8d0dc2b7f5594472f3bd63ca85cdc -) -FetchContent_MakeAvailable(glog) -include_directories(${glog_SOURCE_DIR}/src ${glog_BINARY_DIR}) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/grpc.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/grpc.cmake deleted file mode 100644 index 644093a4bf8191f3a45b0df0a72c000981c48f58..0000000000000000000000000000000000000000 --- 
a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/grpc.cmake +++ /dev/null @@ -1,9 +0,0 @@ -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/grpc) -# third_party: grpc -# On how to build grpc, you may refer to https://github.com/grpc/grpc -# We recommend manually recursive clone the repo to avoid internet connection problem -FetchContent_Declare(gRPC - GIT_REPOSITORY https://github.com/grpc/grpc - GIT_TAG v1.37.1 -) -FetchContent_MakeAvailable(gRPC) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/gtest.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/gtest.cmake deleted file mode 100644 index 30dc7c1a31d8b83991841a4dc33f61ed078b532a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/gtest.cmake +++ /dev/null @@ -1,8 +0,0 @@ -FetchContent_Declare(googletest - URL https://github.com/google/googletest/archive/release-1.11.0.zip - URL_HASH SHA256=353571c2440176ded91c2de6d6cd88ddd41401d14692ec1f99e35d013feda55a -) -if(MSVC) - set(gtest_force_shared_crt ON CACHE BOOL "Always use msvcrt.dll" FORCE) -endif() -FetchContent_MakeAvailable(googletest) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/libtorch.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/libtorch.cmake deleted file mode 100644 index 3cd9245b2da52f8be206d27164de5f411bff171b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/libtorch.cmake +++ /dev/null @@ -1,79 +0,0 @@ -if(TORCH) - add_definitions(-DUSE_TORCH) - if(NOT ANDROID) - if(GPU) - if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - message(FATAL_ERROR "GPU is supported only Linux, you can use CPU version") - else() - add_definitions(-DUSE_GPU) - endif() - endif() - - if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") - if(${CMAKE_BUILD_TYPE} MATCHES "Release") - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-win-shared-with-deps-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=bece54d36377990257e9d028c687c5b6759c5cfec0a0153da83cf6f0f71f648f") - else() - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-win-shared-with-deps-debug-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=3cc7ba3c3865d86f03d78c2f0878fdbed8b764359476397a5c95cf3bba0d665a") - endif() - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - if(CXX11_ABI) - if(NOT GPU) - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=d52f63577a07adb0bfd6d77c90f7da21896e94f71eb7dcd55ed7835ccb3b2b59") - else() - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu113/libtorch-cxx11-abi-shared-with-deps-1.12.0%2Bcu113.zip") - set(URL_HASH "SHA256=80f089939de20e68e3fcad4dfa72a26c8bf91b5e77b11042f671f39ebac35865") - endif() - else() - if(NOT GPU) - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=bee1b7be308792aa60fc95a4f5274d9658cb7248002d0e333d49eb81ec88430c") - else() - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.11.0%2Bcu113.zip") - set(URL_HASH 
"SHA256=90159ecce3ff451f3ef3f657493b6c7c96759c3b74bbd70c1695f2ea2f81e1ad") - endif() - endif() - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-macos-1.13.0.zip") - set(URL_HASH "SHA256=a8f80050b95489b4e002547910410c2c230e9f590ffab2482e19e809afe4f7aa") - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "iOS") - add_definitions(-DIOS) - else() - message(FATAL_ERROR "Unsupported System '${CMAKE_SYSTEM_NAME}' (expected 'Windows', 'Linux', 'Darwin' or 'iOS')") - endif() - - # iOS use LibTorch from pod install - if(NOT IOS) - FetchContent_Declare(libtorch - URL ${LIBTORCH_URL} - URL_HASH ${URL_HASH} - ) - FetchContent_MakeAvailable(libtorch) - find_package(Torch REQUIRED PATHS ${libtorch_SOURCE_DIR} NO_DEFAULT_PATH) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS} -DC10_USE_GLOG") - endif() - - if(MSVC) - file(GLOB TORCH_DLLS "${TORCH_INSTALL_PREFIX}/lib/*.dll") - file(COPY ${TORCH_DLLS} DESTINATION ${CMAKE_BINARY_DIR}) - endif() - else() - # Change version in runtime/android/app/build.gradle. - file(GLOB PYTORCH_INCLUDE_DIRS "${build_DIR}/pytorch_android*.aar/headers") - file(GLOB PYTORCH_LINK_DIRS "${build_DIR}/pytorch_android*.aar/jni/${ANDROID_ABI}") - find_library(PYTORCH_LIBRARY pytorch_jni - PATHS ${PYTORCH_LINK_DIRS} - NO_CMAKE_FIND_ROOT_PATH - ) - find_library(FBJNI_LIBRARY fbjni - PATHS ${PYTORCH_LINK_DIRS} - NO_CMAKE_FIND_ROOT_PATH - ) - include_directories( - ${PYTORCH_INCLUDE_DIRS} - ${PYTORCH_INCLUDE_DIRS}/torch/csrc/api/include - ) - endif() -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/onnx.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/onnx.cmake deleted file mode 100644 index bd55402cb2a6024620fa6ff8b5c413207041adfa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/onnx.cmake +++ /dev/null @@ -1,35 +0,0 @@ -if(ONNX) - set(ONNX_VERSION "1.12.0") - if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-win-x64-${ONNX_VERSION}.zip") - set(URL_HASH "SHA256=8b5d61204989350b7904ac277f5fbccd3e6736ddbb6ec001e412723d71c9c176") - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-linux-aarch64-${ONNX_VERSION}.tgz") - set(URL_HASH "SHA256=5820d9f343df73c63b6b2b174a1ff62575032e171c9564bcf92060f46827d0ac") - else() - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-linux-x64-${ONNX_VERSION}.tgz") - set(URL_HASH "SHA256=5d503ce8540358b59be26c675e42081be14a3e833a5301926f555451046929c5") - endif() - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-osx-x86_64-${ONNX_VERSION}.tgz") - set(URL_HASH "SHA256=09b17f712f8c6f19bb63da35d508815b443cbb473e16c6192abfaa297c02f600") - else() - message(FATAL_ERROR "Unsupported CMake System Name '${CMAKE_SYSTEM_NAME}' (expected 'Windows', 'Linux' or 'Darwin')") - endif() - - FetchContent_Declare(onnxruntime - URL ${ONNX_URL} - URL_HASH ${URL_HASH} - ) - FetchContent_MakeAvailable(onnxruntime) - include_directories(${onnxruntime_SOURCE_DIR}/include) - 
link_directories(${onnxruntime_SOURCE_DIR}/lib) - - if(MSVC) - file(GLOB ONNX_DLLS "${onnxruntime_SOURCE_DIR}/lib/*.dll") - file(COPY ${ONNX_DLLS} DESTINATION ${CMAKE_BINARY_DIR}/bin/${CMAKE_BUILD_TYPE}) - endif() - - add_definitions(-DUSE_ONNX) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/openfst.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/openfst.cmake deleted file mode 100644 index 490a3da6b571ec228114167fb9c0d9e9b4043bd2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/openfst.cmake +++ /dev/null @@ -1,45 +0,0 @@ -if(NOT ANDROID) - include(gflags) - # We can't build glog with gflags, unless gflags is pre-installed. - # If build glog with pre-installed gflags, there will be conflict. - set(WITH_GFLAGS OFF CACHE BOOL "whether build glog with gflags" FORCE) - include(glog) - - if(NOT GRAPH_TOOLS) - set(HAVE_BIN OFF CACHE BOOL "Build the fst binaries" FORCE) - set(HAVE_SCRIPT OFF CACHE BOOL "Build the fstscript" FORCE) - endif() - set(HAVE_COMPACT OFF CACHE BOOL "Build compact" FORCE) - set(HAVE_CONST OFF CACHE BOOL "Build const" FORCE) - set(HAVE_GRM OFF CACHE BOOL "Build grm" FORCE) - set(HAVE_FAR OFF CACHE BOOL "Build far" FORCE) - set(HAVE_PDT OFF CACHE BOOL "Build pdt" FORCE) - set(HAVE_MPDT OFF CACHE BOOL "Build mpdt" FORCE) - set(HAVE_LINEAR OFF CACHE BOOL "Build linear" FORCE) - set(HAVE_LOOKAHEAD OFF CACHE BOOL "Build lookahead" FORCE) - set(HAVE_NGRAM OFF CACHE BOOL "Build ngram" FORCE) - set(HAVE_SPECIAL OFF CACHE BOOL "Build special" FORCE) - - if(MSVC) - add_compile_options(/W0 /wd4244 /wd4267) - endif() - - # "OpenFST port for Windows" builds openfst with cmake for multiple platforms. - # Openfst is compiled with glog/gflags to avoid log and flag conflicts with log and flags in wenet/libtorch. - # To build openfst with gflags and glog, we comment out some vars of {flags, log}.h and flags.cc. 
- set(openfst_SOURCE_DIR ${fc_base}/openfst-src CACHE PATH "OpenFST source directory") - FetchContent_Declare(openfst - URL https://github.com/kkm000/openfst/archive/refs/tags/win/1.6.5.1.tar.gz - URL_HASH SHA256=02c49b559c3976a536876063369efc0e41ab374be1035918036474343877046e - PATCH_COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/patch/openfst ${openfst_SOURCE_DIR} - ) - FetchContent_MakeAvailable(openfst) - add_dependencies(fst gflags glog) - target_link_libraries(fst PUBLIC gflags_nothreads_static glog) - include_directories(${openfst_SOURCE_DIR}/src/include) -else() - set(openfst_BINARY_DIR ${build_DIR}/wenet-openfst-android-1.0.2.aar/jni) - include_directories(${openfst_BINARY_DIR}/include) - link_directories(${openfst_BINARY_DIR}/${ANDROID_ABI}) - link_libraries(log gflags_nothreads glog fst) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/pybind11.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/pybind11.cmake deleted file mode 100644 index 6bdae202c1c4d94228e5f92dab051c118dba7d3b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/pybind11.cmake +++ /dev/null @@ -1,7 +0,0 @@ -FetchContent_Declare(pybind11 - URL https://github.com/pybind/pybind11/archive/refs/tags/v2.9.2.zip - URL_HASH SHA256=d1646e6f70d8a3acb2ddd85ce1ed543b5dd579c68b8fb8e9638282af20edead8 -) -FetchContent_MakeAvailable(pybind11) - -add_subdirectory(${pybind11_SOURCE_DIR}) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/xpu.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/xpu.cmake deleted file mode 100644 index 38418671b0237550cd01d4d95e8743067e113e56..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/cmake/xpu.cmake +++ /dev/null @@ -1,37 +0,0 @@ -if(NOT WIN32) - string(ASCII 27 Esc) - set(ColourReset "${Esc}[m") - set(ColourBold "${Esc}[1m") - set(Red "${Esc}[31m") - set(Green "${Esc}[32m") - set(Yellow "${Esc}[33m") - set(Blue "${Esc}[34m") - set(Magenta "${Esc}[35m") - set(Cyan "${Esc}[36m") - set(White "${Esc}[37m") - set(BoldRed "${Esc}[1;31m") - set(BoldGreen "${Esc}[1;32m") - set(BoldYellow "${Esc}[1;33m") - set(BoldBlue "${Esc}[1;34m") - set(BoldMagenta "${Esc}[1;35m") - set(BoldCyan "${Esc}[1;36m") - set(BoldWhite "${Esc}[1;37m") -endif() - -if(XPU) - set(RUNTIME_KUNLUN_PATH ${CMAKE_CURRENT_SOURCE_DIR}) - message(STATUS "RUNTIME_KUNLUN_PATH is ${RUNTIME_KUNLUN_PATH} .\n") - set(KUNLUN_XPU_PATH ${RUNTIME_KUNLUN_PATH}/xpu) - if(NOT DEFINED ENV{XPU_API_PATH}) - message(FATAL_ERROR "${BoldRed}NO ENV{XPU_API_PATH} in your env. Please set XPU_API_PATH.${ColourReset}\n") - else() - set(XPU_API_PATH $ENV{XPU_API_PATH}) - message("set XPU_API_PATH from env_var. 
Val is $ENV{XPU_API_PATH}.") - endif() - - include_directories(${RUNTIME_KUNLUN_PATH} ${KUNLUN_XPU_PATH}/ - ${XPU_API_PATH}/output/include ${XPU_API_PATH}/../runtime/include) - link_directories(${XPU_API_PATH}/output/so/ ${XPU_API_PATH}/../runtime/output/so/) - - add_definitions(-DUSE_XPU) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/CMakeLists.txt deleted file mode 100644 index fe03efb288eb1c7ae3d05e896e95855e5865472f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/CMakeLists.txt +++ /dev/null @@ -1,39 +0,0 @@ -set(decoder_srcs - asr_decoder.cc - asr_model.cc - context_graph.cc - ctc_prefix_beam_search.cc - ctc_wfst_beam_search.cc - ctc_endpoint.cc -) - -if(NOT TORCH AND NOT ONNX AND NOT XPU AND NOT IOS AND NOT BPU) - message(FATAL_ERROR "Please build with TORCH or ONNX or XPU or IOS or BPU!!!") -endif() -if(TORCH OR IOS) - list(APPEND decoder_srcs torch_asr_model.cc) -endif() -if(ONNX) - list(APPEND decoder_srcs onnx_asr_model.cc) -endif() - -add_library(decoder STATIC ${decoder_srcs}) -target_link_libraries(decoder PUBLIC kaldi-decoder frontend - post_processor utils) - -if(ANDROID) - target_link_libraries(decoder PUBLIC ${PYTORCH_LIBRARY} ${FBJNI_LIBRARY}) -else() - if(TORCH) - target_link_libraries(decoder PUBLIC ${TORCH_LIBRARIES}) - endif() - if(ONNX) - target_link_libraries(decoder PUBLIC onnxruntime) - endif() - if(BPU) - target_link_libraries(decoder PUBLIC bpu_asr_model) - endif() - if(XPU) - target_link_libraries(decoder PUBLIC xpu_conformer) - endif() -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/asr_decoder.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/asr_decoder.cc deleted file mode 100644 index 34de7550ea287b37d2cb707e148f5d6853b3d804..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/asr_decoder.cc +++ /dev/null @@ -1,231 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include "decoder/asr_decoder.h" - -#include - -#include -#include -#include - -#include "utils/timer.h" - -namespace wenet { - -AsrDecoder::AsrDecoder(std::shared_ptr feature_pipeline, - std::shared_ptr resource, - const DecodeOptions& opts) - : feature_pipeline_(std::move(feature_pipeline)), - // Make a copy of the model ASR model since we will change the inner - // status of the model - model_(resource->model->Copy()), - post_processor_(resource->post_processor), - symbol_table_(resource->symbol_table), - fst_(resource->fst), - unit_table_(resource->unit_table), - opts_(opts), - ctc_endpointer_(new CtcEndpoint(opts.ctc_endpoint_config)) { - if (opts_.reverse_weight > 0) { - // Check if model has a right to left decoder - CHECK(model_->is_bidirectional_decoder()); - } - if (nullptr == fst_) { - searcher_.reset(new CtcPrefixBeamSearch(opts.ctc_prefix_search_opts, - resource->context_graph)); - } else { - searcher_.reset(new CtcWfstBeamSearch(*fst_, opts.ctc_wfst_search_opts, - resource->context_graph)); - } - ctc_endpointer_->frame_shift_in_ms(frame_shift_in_ms()); -} - -void AsrDecoder::Reset() { - start_ = false; - result_.clear(); - num_frames_ = 0; - global_frame_offset_ = 0; - model_->Reset(); - searcher_->Reset(); - feature_pipeline_->Reset(); - ctc_endpointer_->Reset(); -} - -void AsrDecoder::ResetContinuousDecoding() { - global_frame_offset_ = num_frames_; - start_ = false; - result_.clear(); - model_->Reset(); - searcher_->Reset(); - ctc_endpointer_->Reset(); -} - -DecodeState AsrDecoder::Decode(bool block) { - return this->AdvanceDecoding(block); -} - -void AsrDecoder::Rescoring() { - // Do attention rescoring - Timer timer; - AttentionRescoring(); - VLOG(2) << "Rescoring cost latency: " << timer.Elapsed() << "ms."; -} - -DecodeState AsrDecoder::AdvanceDecoding(bool block) { - DecodeState state = DecodeState::kEndBatch; - model_->set_chunk_size(opts_.chunk_size); - model_->set_num_left_chunks(opts_.num_left_chunks); - int num_required_frames = model_->num_frames_for_chunk(start_); - std::vector> chunk_feats; - // Return immediately if we do not want to block - if (!block && !feature_pipeline_->input_finished() && - feature_pipeline_->NumQueuedFrames() < num_required_frames) { - return DecodeState::kWaitFeats; - } - // If not okay, that means we reach the end of the input - if (!feature_pipeline_->Read(num_required_frames, &chunk_feats)) { - state = DecodeState::kEndFeats; - } - - num_frames_ += chunk_feats.size(); - VLOG(2) << "Required " << num_required_frames << " get " - << chunk_feats.size(); - Timer timer; - std::vector> ctc_log_probs; - model_->ForwardEncoder(chunk_feats, &ctc_log_probs); - int forward_time = timer.Elapsed(); - if (opts_.ctc_wfst_search_opts.blank_scale != 1.0) { - for (int i = 0; i < ctc_log_probs.size(); i++) { - ctc_log_probs[i][0] = ctc_log_probs[i][0] - + std::log(opts_.ctc_wfst_search_opts.blank_scale); - } - } - timer.Reset(); - searcher_->Search(ctc_log_probs); - int search_time = timer.Elapsed(); - VLOG(3) << "forward takes " << forward_time << " ms, search takes " - << search_time << " ms"; - UpdateResult(); - - if (state != DecodeState::kEndFeats) { - if (ctc_endpointer_->IsEndpoint(ctc_log_probs, DecodedSomething())) { - VLOG(1) << "Endpoint is detected at " << num_frames_; - state = DecodeState::kEndpoint; - } - } - - start_ = true; - return state; -} - -void AsrDecoder::UpdateResult(bool finish) { - const auto& hypotheses = searcher_->Outputs(); - const auto& inputs = searcher_->Inputs(); - const auto& likelihood = 
searcher_->Likelihood(); - const auto& times = searcher_->Times(); - result_.clear(); - - CHECK_EQ(hypotheses.size(), likelihood.size()); - for (size_t i = 0; i < hypotheses.size(); i++) { - const std::vector& hypothesis = hypotheses[i]; - - DecodeResult path; - path.score = likelihood[i]; - int offset = global_frame_offset_ * feature_frame_shift_in_ms(); - for (size_t j = 0; j < hypothesis.size(); j++) { - std::string word = symbol_table_->Find(hypothesis[j]); - // A detailed explanation of this if-else branch can be found in - // https://github.com/wenet-e2e/wenet/issues/583#issuecomment-907994058 - if (searcher_->Type() == kWfstBeamSearch) { - path.sentence += (' ' + word); - } else { - path.sentence += (word); - } - } - - // TimeStamp is only supported in final result - // TimeStamp of the output of CtcWfstBeamSearch may be inaccurate due to - // various FST operations when building the decoding graph. So here we use - // time stamp of the input(e2e model unit), which is more accurate, and it - // requires the symbol table of the e2e model used in training. - if (unit_table_ != nullptr && finish) { - const std::vector& input = inputs[i]; - const std::vector& time_stamp = times[i]; - CHECK_EQ(input.size(), time_stamp.size()); - for (size_t j = 0; j < input.size(); j++) { - std::string word = unit_table_->Find(input[j]); - int start = time_stamp[j] * frame_shift_in_ms() - time_stamp_gap_ > 0 - ? time_stamp[j] * frame_shift_in_ms() - time_stamp_gap_ - : 0; - if (j > 0) { - start = (time_stamp[j] - time_stamp[j - 1]) * frame_shift_in_ms() < - time_stamp_gap_ - ? (time_stamp[j - 1] + time_stamp[j]) / 2 * - frame_shift_in_ms() - : start; - } - int end = time_stamp[j] * frame_shift_in_ms(); - if (j < input.size() - 1) { - end = (time_stamp[j + 1] - time_stamp[j]) * frame_shift_in_ms() < - time_stamp_gap_ - ? 
(time_stamp[j + 1] + time_stamp[j]) / 2 * - frame_shift_in_ms() - : end; - } - WordPiece word_piece(word, offset + start, offset + end); - path.word_pieces.emplace_back(word_piece); - } - } - - if (post_processor_ != nullptr) { - path.sentence = post_processor_->Process(path.sentence, finish); - } - result_.emplace_back(path); - } - - if (DecodedSomething()) { - VLOG(1) << "Partial CTC result " << result_[0].sentence; - } -} - -void AsrDecoder::AttentionRescoring() { - searcher_->FinalizeSearch(); - UpdateResult(true); - // No need to do rescoring - if (0.0 == opts_.rescoring_weight) { - return; - } - // Inputs() returns N-best input ids, which is the basic unit for rescoring - // In CtcPrefixBeamSearch, inputs are the same to outputs - const auto& hypotheses = searcher_->Inputs(); - int num_hyps = hypotheses.size(); - if (num_hyps <= 0) { - return; - } - - std::vector rescoring_score; - model_->AttentionRescoring(hypotheses, opts_.reverse_weight, - &rescoring_score); - - // Combine ctc score and rescoring score - for (size_t i = 0; i < num_hyps; ++i) { - result_[i].score = opts_.rescoring_weight * rescoring_score[i] + - opts_.ctc_weight * result_[i].score; - } - std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/asr_decoder.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/asr_decoder.h deleted file mode 100644 index df71f5b7bad7b2ffdc69bbd7ab11f576bed464d2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/asr_decoder.h +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_ASR_DECODER_H_ -#define DECODER_ASR_DECODER_H_ - -#include -#include -#include -#include - -#include "fst/fstlib.h" -#include "fst/symbol-table.h" - -#include "decoder/asr_model.h" -#include "decoder/context_graph.h" -#include "decoder/ctc_endpoint.h" -#include "decoder/ctc_prefix_beam_search.h" -#include "decoder/ctc_wfst_beam_search.h" -#include "decoder/search_interface.h" -#include "frontend/feature_pipeline.h" -#include "post_processor/post_processor.h" -#include "utils/utils.h" - -namespace wenet { - -struct DecodeOptions { - // chunk_size is the frame number of one chunk after subsampling. - // e.g. if subsample rate is 4 and chunk_size = 16, the frames in - // one chunk are 64 = 16*4 - int chunk_size = 16; - int num_left_chunks = -1; - - // final_score = rescoring_weight * rescoring_score + ctc_weight * ctc_score; - // rescoring_score = left_to_right_score * (1 - reverse_weight) + - // right_to_left_score * reverse_weight - // Please note the concept of ctc_scores in the following two search - // methods are different. 
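The DecodeOptions comment above summarizes how the final score mixes CTC and attention-rescoring scores (it continues below with how ctc_score differs between the two search methods), and AttentionRescoring() earlier applies exactly this mix. A self-contained sketch of the arithmetic, using the DecodeOptions defaults; the struct and field names are illustrative:

```cpp
#include <vector>

struct Hyp {
  float ctc_score;  // score from the CTC search
  float l2r_score;  // left-to-right attention decoder score
  float r2l_score;  // right-to-left score (only with a bidirectional decoder)
};

// rescoring_score = l2r * (1 - reverse_weight) + r2l * reverse_weight
// final_score     = rescoring_weight * rescoring_score + ctc_weight * ctc_score
std::vector<float> CombineScores(const std::vector<Hyp>& hyps,
                                 float rescoring_weight = 1.0f,
                                 float ctc_weight = 0.5f,
                                 float reverse_weight = 0.0f) {
  std::vector<float> final_scores;
  final_scores.reserve(hyps.size());
  for (const auto& h : hyps) {
    float rescoring = h.l2r_score * (1.0f - reverse_weight) +
                      h.r2l_score * reverse_weight;
    final_scores.push_back(rescoring_weight * rescoring +
                           ctc_weight * h.ctc_score);
  }
  return final_scores;
}
```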
- // For CtcPrefixBeamSearch, it's a sum(prefix) score + context score - // For CtcWfstBeamSearch, it's a max(viterbi) path score + context score - // So we should carefully set ctc_weight according to the search methods. - float ctc_weight = 0.5; - float rescoring_weight = 1.0; - float reverse_weight = 0.0; - CtcEndpointConfig ctc_endpoint_config; - CtcPrefixBeamSearchOptions ctc_prefix_search_opts; - CtcWfstBeamSearchOptions ctc_wfst_search_opts; -}; - -struct WordPiece { - std::string word; - int start = -1; - int end = -1; - - WordPiece(std::string word, int start, int end) - : word(std::move(word)), start(start), end(end) {} -}; - -struct DecodeResult { - float score = -kFloatMax; - std::string sentence; - std::vector word_pieces; - - static bool CompareFunc(const DecodeResult& a, const DecodeResult& b) { - return a.score > b.score; - } -}; - -enum DecodeState { - kEndBatch = 0x00, // End of current decoding batch, normal case - kEndpoint = 0x01, // Endpoint is detected - kEndFeats = 0x02, // All feature is decoded - kWaitFeats = 0x03 // Feat is not enough for one chunk inference, wait -}; - -// DecodeResource is thread safe, which can be shared for multiple -// decoding threads -struct DecodeResource { - std::shared_ptr model = nullptr; - std::shared_ptr symbol_table = nullptr; - std::shared_ptr> fst = nullptr; - std::shared_ptr unit_table = nullptr; - std::shared_ptr context_graph = nullptr; - std::shared_ptr post_processor = nullptr; -}; - -// Torch ASR decoder -class AsrDecoder { - public: - AsrDecoder(std::shared_ptr feature_pipeline, - std::shared_ptr resource, - const DecodeOptions& opts); - // @param block: if true, block when feature is not enough for one chunk - // inference. Otherwise, return kWaitFeats. - DecodeState Decode(bool block = true); - void Rescoring(); - void Reset(); - void ResetContinuousDecoding(); - bool DecodedSomething() const { - return !result_.empty() && !result_[0].sentence.empty(); - } - - // This method is used for time benchmark - int num_frames_in_current_chunk() const { - return num_frames_in_current_chunk_; - } - int frame_shift_in_ms() const { - return model_->subsampling_rate() * - feature_pipeline_->config().frame_shift * 1000 / - feature_pipeline_->config().sample_rate; - } - int feature_frame_shift_in_ms() const { - return feature_pipeline_->config().frame_shift * 1000 / - feature_pipeline_->config().sample_rate; - } - const std::vector& result() const { return result_; } - - private: - DecodeState AdvanceDecoding(bool block = true); - void AttentionRescoring(); - - void UpdateResult(bool finish = false); - - std::shared_ptr feature_pipeline_; - std::shared_ptr model_; - std::shared_ptr post_processor_; - - std::shared_ptr> fst_ = nullptr; - // output symbol table - std::shared_ptr symbol_table_; - // e2e unit symbol table - std::shared_ptr unit_table_ = nullptr; - const DecodeOptions& opts_; - // cache feature - bool start_ = false; - // For continuous decoding - int num_frames_ = 0; - int global_frame_offset_ = 0; - const int time_stamp_gap_ = 100; // timestamp gap between words in a sentence - - std::unique_ptr searcher_; - std::unique_ptr ctc_endpointer_; - - int num_frames_in_current_chunk_ = 0; - std::vector result_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(AsrDecoder); -}; - -} // namespace wenet - -#endif // DECODER_ASR_DECODER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/asr_model.cc 
b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/asr_model.cc deleted file mode 100644 index 8c7b0fb1195cf07bac6c3ff1bb8cb0e187e977da..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/asr_model.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2022 Horizon Robotics. All Rights Reserved. -// Author: binbin.zhang@horizon.ai (Binbin Zhang) - -#include "decoder/asr_model.h" - -#include -#include - -namespace wenet { - -int AsrModel::num_frames_for_chunk(bool start) const { - int num_required_frames = 0; - if (chunk_size_ > 0) { - if (!start) { // First batch - int context = right_context_ + 1; // Add current frame - num_required_frames = (chunk_size_ - 1) * subsampling_rate_ + context; - } else { - num_required_frames = chunk_size_ * subsampling_rate_; - } - } else { - num_required_frames = std::numeric_limits::max(); - } - return num_required_frames; -} - -void AsrModel::CacheFeature( - const std::vector>& chunk_feats) { - // Cache feature for next chunk - const int cached_feature_size = 1 + right_context_ - subsampling_rate_; - if (chunk_feats.size() >= cached_feature_size) { - // TODO(Binbin Zhang): Only deal the case when - // chunk_feats.size() > cached_feature_size here, and it's consistent - // with our current model, refine it later if we have new model or - // new requirements - cached_feature_.resize(cached_feature_size); - for (int i = 0; i < cached_feature_size; ++i) { - cached_feature_[i] = - chunk_feats[chunk_feats.size() - cached_feature_size + i]; - } - } -} - -void AsrModel::ForwardEncoder( - const std::vector>& chunk_feats, - std::vector>* ctc_prob) { - ctc_prob->clear(); - int num_frames = cached_feature_.size() + chunk_feats.size(); - if (num_frames >= right_context_ + 1) { - this->ForwardEncoderFunc(chunk_feats, ctc_prob); - this->CacheFeature(chunk_feats); - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/asr_model.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/asr_model.h deleted file mode 100644 index d100dd818551014fa4769c1766bc3b1b626e8453..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/asr_model.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2022 Horizon Robotics. All Rights Reserved. -// Author: binbin.zhang@horizon.ai (Binbin Zhang) - -#ifndef DECODER_ASR_MODEL_H_ -#define DECODER_ASR_MODEL_H_ - -#include -#include -#include -#include - -#include "utils/timer.h" -#include "utils/utils.h" - -namespace wenet { - -class AsrModel { - public: - virtual int right_context() const { return right_context_; } - virtual int subsampling_rate() const { return subsampling_rate_; } - virtual int sos() const { return sos_; } - virtual int eos() const { return eos_; } - virtual bool is_bidirectional_decoder() const { - return is_bidirectional_decoder_; - } - virtual int offset() const { return offset_; } - - // If chunk_size > 0, streaming case. 
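AsrModel::num_frames_for_chunk() above decides how many raw feature frames a chunk needs before the encoder can run (the header continues below). A sketch of that arithmetic; the example values for subsampling rate and right context are assumptions typical of a Conformer front end, not values read from this model:

```cpp
#include <cassert>
#include <limits>

// Mirrors num_frames_for_chunk(): the first chunk of an utterance also needs
// the lookahead (right) context, later chunks only chunk_size * subsampling.
int NumFramesForChunk(int chunk_size, int subsampling_rate, int right_context,
                      bool start) {
  if (chunk_size <= 0) {  // non-streaming: consume everything at once
    return std::numeric_limits<int>::max();
  }
  if (!start) {  // first chunk of the utterance
    int context = right_context + 1;  // +1 for the current frame
    return (chunk_size - 1) * subsampling_rate + context;
  }
  return chunk_size * subsampling_rate;  // subsequent chunks
}

int main() {
  // Assumed values: chunk_size = 16, subsampling = 4, right_context = 6.
  assert(NumFramesForChunk(16, 4, 6, /*start=*/false) == 67);  // first chunk
  assert(NumFramesForChunk(16, 4, 6, /*start=*/true) == 64);   // later chunks
  return 0;
}
```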
Otherwise, none streaming case - virtual void set_chunk_size(int chunk_size) { chunk_size_ = chunk_size; } - virtual void set_num_left_chunks(int num_left_chunks) { - num_left_chunks_ = num_left_chunks; - } - // start: if it is the start chunk of one sentence - virtual int num_frames_for_chunk(bool start) const; - - virtual void Reset() = 0; - - virtual void ForwardEncoder( - const std::vector>& chunk_feats, - std::vector>* ctc_prob); - - virtual void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) = 0; - - virtual std::shared_ptr Copy() const = 0; - - protected: - virtual void ForwardEncoderFunc( - const std::vector>& chunk_feats, - std::vector>* ctc_prob) = 0; - virtual void CacheFeature(const std::vector>& chunk_feats); - - int right_context_ = 1; - int subsampling_rate_ = 1; - int sos_ = 0; - int eos_ = 0; - bool is_bidirectional_decoder_ = false; - int chunk_size_ = 16; - int num_left_chunks_ = -1; // -1 means all left chunks - int offset_ = 0; - - std::vector> cached_feature_; -}; - -} // namespace wenet - -#endif // DECODER_ASR_MODEL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/context_graph.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/context_graph.cc deleted file mode 100644 index adc59c506de2afa7087815887295e4d8735d2a35..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/context_graph.cc +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "decoder/context_graph.h" - -#include - -#include "fst/determinize.h" - -#include "utils/string.h" -#include "utils/utils.h" - -namespace wenet { - -ContextGraph::ContextGraph(ContextConfig config) : config_(config) {} - -void ContextGraph::BuildContextGraph( - const std::vector& query_contexts, - const std::shared_ptr& symbol_table) { - CHECK(symbol_table != nullptr) << "Symbols table should not be nullptr!"; - start_tag_id_ = symbol_table->AddSymbol(""); - end_tag_id_ = symbol_table->AddSymbol(""); - symbol_table_ = symbol_table; - if (query_contexts.empty()) { - if (graph_ != nullptr) graph_.reset(); - return; - } - - std::unique_ptr ofst(new fst::StdVectorFst()); - // State 0 is the start state and the final state. - int start_state = ofst->AddState(); - ofst->SetStart(start_state); - ofst->SetFinal(start_state, fst::StdArc::Weight::One()); - - LOG(INFO) << "Contexts count size: " << query_contexts.size(); - int count = 0; - for (const auto& context : query_contexts) { - if (context.size() > config_.max_context_length) { - LOG(INFO) << "Skip long context: " << context; - continue; - } - if (++count > config_.max_contexts) break; - - std::vector words; - // Split context to words by symbol table, and build the context graph. 
- bool no_oov = SplitUTF8StringToWords(Trim(context), symbol_table, &words); - if (!no_oov) { - LOG(WARNING) << "Ignore unknown word found during compilation."; - continue; - } - - int prev_state = start_state; - int next_state = start_state; - float escape_score = 0; - for (size_t i = 0; i < words.size(); ++i) { - int word_id = symbol_table_->Find(words[i]); - float score = (i * config_.incremental_context_score - + config_.context_score) * UTF8StringLength(words[i]); - next_state = (i < words.size() - 1) ? ofst->AddState() : start_state; - ofst->AddArc(prev_state, - fst::StdArc(word_id, word_id, score, next_state)); - // Add escape arc to clean the previous context score. - if (i > 0) { - // ilabel and olabel of the escape arc is 0 (). - ofst->AddArc(prev_state, fst::StdArc(0, 0, -escape_score, start_state)); - } - prev_state = next_state; - escape_score += score; - } - } - std::unique_ptr det_fst(new fst::StdVectorFst()); - fst::Determinize(*ofst, det_fst.get()); - graph_ = std::move(det_fst); -} - -int ContextGraph::GetNextState(int cur_state, int word_id, float* score, - bool* is_start_boundary, bool* is_end_boundary) { - int next_state = 0; - for (fst::ArcIterator aiter(*graph_, cur_state); !aiter.Done(); - aiter.Next()) { - const fst::StdArc& arc = aiter.Value(); - if (arc.ilabel == 0) { - // escape score, will be overwritten when ilabel equals to word id. - *score = arc.weight.Value(); - } else if (arc.ilabel == word_id) { - next_state = arc.nextstate; - *score = arc.weight.Value(); - if (cur_state == 0) { - *is_start_boundary = true; - } - if (graph_->Final(arc.nextstate) == fst::StdArc::Weight::One()) { - *is_end_boundary = true; - } - break; - } - } - return next_state; -} - -bool ContextGraph::SplitUTF8StringToWords( - const std::string& str, - const std::shared_ptr& symbol_table, - std::vector* words) { - std::vector chars; - SplitUTF8StringToChars(Trim(str), &chars); - - bool no_oov = true; - for (size_t start = 0; start < chars.size();) { - for (size_t end = chars.size(); end > start; --end) { - std::string word; - for (size_t i = start; i < end; i++) { - word += chars[i]; - } - // Skip space. - if (word == " ") { - start = end; - continue; - } - // Add '▁' at the beginning of English word. - if (IsAlpha(word)) { - word = kSpaceSymbol + word; - } - - if (symbol_table->Find(word) != -1) { - words->emplace_back(word); - start = end; - continue; - } - if (end == start + 1) { - ++start; - no_oov = false; - LOG(WARNING) << word << " is oov."; - } - } - } - return no_oov; -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/context_graph.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/context_graph.h deleted file mode 100644 index 41b59206987cfe22d421f40506057830b6311f8e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/context_graph.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
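BuildContextGraph() above weights each matched word and adds escape arcs so that abandoning a phrase part-way gives back the boost collected so far; only fully matched phrases keep their score. A standalone sketch of those arc weights, using the default ContextConfig scores declared in the header below; word lengths stand in for UTF8StringLength():

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

struct ContextArcWeights {
  std::vector<float> word_boost;      // weight for matching word i
  std::vector<float> escape_penalty;  // escape weight after matching i+1 words
};

ContextArcWeights BuildWeights(const std::vector<int>& word_lengths,
                               float context_score = 3.0f,
                               float incremental_context_score = 0.0f) {
  ContextArcWeights w;
  float earned = 0.0f;
  for (std::size_t i = 0; i < word_lengths.size(); ++i) {
    float boost = (i * incremental_context_score + context_score) *
                  word_lengths[i];
    w.word_boost.push_back(boost);
    earned += boost;
    // The state after the last word loops back to start, so no escape arc.
    if (i + 1 < word_lengths.size()) {
      w.escape_penalty.push_back(-earned);
    }
  }
  return w;
}

int main() {
  ContextArcWeights w = BuildWeights({2, 1, 3});  // a 3-word phrase
  for (float b : w.word_boost) std::cout << b << ' ';      // 6 3 9
  std::cout << "| escapes: ";
  for (float e : w.escape_penalty) std::cout << e << ' ';  // -6 -9
  std::cout << '\n';
  return 0;
}
```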
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_CONTEXT_GRAPH_H_ -#define DECODER_CONTEXT_GRAPH_H_ - -#include -#include -#include - -#include "fst/compose.h" -#include "fst/fst.h" -#include "fst/vector-fst.h" - -namespace wenet { - -using StateId = fst::StdArc::StateId; - -struct ContextConfig { - int max_contexts = 5000; - int max_context_length = 100; - float context_score = 3.0; - float incremental_context_score = 0.0; -}; - -class ContextGraph { - public: - explicit ContextGraph(ContextConfig config); - void BuildContextGraph(const std::vector& query_context, - const std::shared_ptr& symbol_table); - int GetNextState(int cur_state, int word_id, float* score, - bool* is_start_boundary, bool* is_end_boundary); - - int start_tag_id() { return start_tag_id_; } - int end_tag_id() { return end_tag_id_; } - - private: - bool SplitUTF8StringToWords( - const std::string& str, - const std::shared_ptr& symbol_table, - std::vector* words); - - int start_tag_id_ = -1; - int end_tag_id_ = -1; - ContextConfig config_; - std::shared_ptr symbol_table_ = nullptr; - std::unique_ptr graph_ = nullptr; - DISALLOW_COPY_AND_ASSIGN(ContextGraph); -}; - -} // namespace wenet - -#endif // DECODER_CONTEXT_GRAPH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/ctc_endpoint.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/ctc_endpoint.cc deleted file mode 100644 index 4a64dd048f32401ab0dca468836cfac8be943d26..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/ctc_endpoint.cc +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "decoder/ctc_endpoint.h" - -#include - -#include -#include - -#include "utils/log.h" - -namespace wenet { - -CtcEndpoint::CtcEndpoint(const CtcEndpointConfig& config) : config_(config) { - Reset(); -} - -void CtcEndpoint::Reset() { - num_frames_decoded_ = 0; - num_frames_trailing_blank_ = 0; -} - -static bool RuleActivated(const CtcEndpointRule& rule, - const std::string& rule_name, bool decoded_sth, - int trailing_silence, int utterance_length) { - bool ans = (decoded_sth || !rule.must_decoded_sth) && - trailing_silence >= rule.min_trailing_silence && - utterance_length >= rule.min_utterance_length; - if (ans) { - VLOG(2) << "Endpointing rule " << rule_name - << " activated: " << (decoded_sth ? 
"true" : "false") << ',' - << trailing_silence << ',' << utterance_length; - } - return ans; -} - -bool CtcEndpoint::IsEndpoint( - const std::vector>& ctc_log_probs, - bool decoded_something) { - for (int t = 0; t < ctc_log_probs.size(); ++t) { - const auto& logp_t = ctc_log_probs[t]; - float blank_prob = expf(logp_t[config_.blank]); - - num_frames_decoded_++; - if (blank_prob > config_.blank_threshold) { - num_frames_trailing_blank_++; - } else { - num_frames_trailing_blank_ = 0; - } - } - CHECK_GE(num_frames_decoded_, num_frames_trailing_blank_); - CHECK_GT(frame_shift_in_ms_, 0); - int utterance_length = num_frames_decoded_ * frame_shift_in_ms_; - int trailing_silence = num_frames_trailing_blank_ * frame_shift_in_ms_; - if (RuleActivated(config_.rule1, "rule1", decoded_something, trailing_silence, - utterance_length)) - return true; - if (RuleActivated(config_.rule2, "rule2", decoded_something, trailing_silence, - utterance_length)) - return true; - if (RuleActivated(config_.rule3, "rule3", decoded_something, trailing_silence, - utterance_length)) - return true; - return false; -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/ctc_endpoint.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/ctc_endpoint.h deleted file mode 100644 index 56d9e08e7d3fab5562028e956f7b1d6ebac7b9e4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/ctc_endpoint.h +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_CTC_ENDPOINT_H_ -#define DECODER_CTC_ENDPOINT_H_ - -#include - -namespace wenet { - -struct CtcEndpointRule { - bool must_decoded_sth; - int min_trailing_silence; - int min_utterance_length; - - CtcEndpointRule(bool must_decoded_sth = true, int min_trailing_silence = 1000, - int min_utterance_length = 0) - : must_decoded_sth(must_decoded_sth), - min_trailing_silence(min_trailing_silence), - min_utterance_length(min_utterance_length) {} -}; - -struct CtcEndpointConfig { - /// We consider blank as silence for purposes of endpointing. - int blank = 0; // blank id - float blank_threshold = 0.8; // blank threshold to be silence - /// We support three rules. We terminate decoding if ANY of these rules - /// evaluates to "true". If you want to add more rules, do it by changing this - /// code. If you want to disable a rule, you can set the silence-timeout for - /// that rule to a very large number. - - /// rule1 times out after 5000 ms of silence, even if we decoded nothing. - CtcEndpointRule rule1; - /// rule2 times out after 1000 ms of silence after decoding something. - CtcEndpointRule rule2; - /// rule3 times out after the utterance is 20000 ms long, regardless of - /// anything else. 
- CtcEndpointRule rule3; - - CtcEndpointConfig() - : rule1(false, 5000, 0), rule2(true, 1000, 0), rule3(false, 0, 20000) {} -}; - -class CtcEndpoint { - public: - explicit CtcEndpoint(const CtcEndpointConfig& config); - - void Reset(); - /// This function returns true if this set of endpointing rules thinks we - /// should terminate decoding. - bool IsEndpoint(const std::vector>& ctc_log_probs, - bool decoded_something); - - void frame_shift_in_ms(int frame_shift_in_ms) { - frame_shift_in_ms_ = frame_shift_in_ms; - } - - private: - CtcEndpointConfig config_; - int frame_shift_in_ms_ = -1; - int num_frames_decoded_ = 0; - int num_frames_trailing_blank_ = 0; -}; - -} // namespace wenet - -#endif // DECODER_CTC_ENDPOINT_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/ctc_prefix_beam_search.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/ctc_prefix_beam_search.cc deleted file mode 100644 index 154c8864ba98255528a33a80a35b18eee8fa5dc9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/ctc_prefix_beam_search.cc +++ /dev/null @@ -1,235 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
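The endpoint detector deleted just above counts trailing frames whose blank posterior exceeds `blank_threshold` and declares an endpoint as soon as any rule in `CtcEndpointConfig` fires. Below is a compact Python restatement of that rule check, using the default settings from the header (rule1: 5000 ms of silence even with nothing decoded, rule2: 1000 ms of trailing silence after something was decoded, rule3: a 20000 ms cap on utterance length). It is an illustrative sketch, not code from the repository.

```python
# Illustrative restatement of the endpoint rules from ctc_endpoint.h above.
# Frame counts are assumed to be converted to milliseconds with the model's
# frame shift before these checks run.

def rule_activated(decoded_sth, trailing_silence_ms, utterance_ms,
                   must_decoded_sth, min_trailing_silence, min_utterance_length):
    return ((decoded_sth or not must_decoded_sth)
            and trailing_silence_ms >= min_trailing_silence
            and utterance_ms >= min_utterance_length)

def is_endpoint(decoded_sth, trailing_silence_ms, utterance_ms):
    # rule1: long silence even if nothing was decoded yet.
    if rule_activated(decoded_sth, trailing_silence_ms, utterance_ms, False, 5000, 0):
        return True
    # rule2: shorter silence once something has been decoded.
    if rule_activated(decoded_sth, trailing_silence_ms, utterance_ms, True, 1000, 0):
        return True
    # rule3: hard cap on utterance length, regardless of silence.
    return rule_activated(decoded_sth, trailing_silence_ms, utterance_ms, False, 0, 20000)

print(is_endpoint(decoded_sth=True, trailing_silence_ms=1200, utterance_ms=4000))  # True via rule2
```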
- - -#include "decoder/ctc_prefix_beam_search.h" - -#include -#include -#include -#include - -#include "utils/log.h" -#include "utils/utils.h" - -namespace wenet { - -CtcPrefixBeamSearch::CtcPrefixBeamSearch( - const CtcPrefixBeamSearchOptions& opts, - const std::shared_ptr& context_graph) - : opts_(opts), context_graph_(context_graph) { - Reset(); -} - -void CtcPrefixBeamSearch::Reset() { - hypotheses_.clear(); - likelihood_.clear(); - cur_hyps_.clear(); - viterbi_likelihood_.clear(); - times_.clear(); - outputs_.clear(); - abs_time_step_ = 0; - PrefixScore prefix_score; - prefix_score.s = 0.0; - prefix_score.ns = -kFloatMax; - prefix_score.v_s = 0.0; - prefix_score.v_ns = 0.0; - std::vector empty; - cur_hyps_[empty] = prefix_score; - outputs_.emplace_back(empty); - hypotheses_.emplace_back(empty); - likelihood_.emplace_back(prefix_score.total_score()); - times_.emplace_back(empty); -} - -static bool PrefixScoreCompare( - const std::pair, PrefixScore>& a, - const std::pair, PrefixScore>& b) { - return a.second.total_score() > b.second.total_score(); -} - -void CtcPrefixBeamSearch::UpdateOutputs( - const std::pair, PrefixScore>& prefix) { - const std::vector& input = prefix.first; - const std::vector& start_boundaries = prefix.second.start_boundaries; - const std::vector& end_boundaries = prefix.second.end_boundaries; - - std::vector output; - int s = 0; - int e = 0; - for (int i = 0; i < input.size(); ++i) { - if (s < start_boundaries.size() && i == start_boundaries[s]) { - output.emplace_back(context_graph_->start_tag_id()); - ++s; - } - output.emplace_back(input[i]); - if (e < end_boundaries.size() && i == end_boundaries[e]) { - output.emplace_back(context_graph_->end_tag_id()); - ++e; - } - } - outputs_.emplace_back(output); -} - -void CtcPrefixBeamSearch::UpdateHypotheses( - const std::vector, PrefixScore>>& hpys) { - cur_hyps_.clear(); - outputs_.clear(); - hypotheses_.clear(); - likelihood_.clear(); - viterbi_likelihood_.clear(); - times_.clear(); - for (auto& item : hpys) { - cur_hyps_[item.first] = item.second; - UpdateOutputs(item); - hypotheses_.emplace_back(std::move(item.first)); - likelihood_.emplace_back(item.second.total_score()); - viterbi_likelihood_.emplace_back(item.second.viterbi_score()); - times_.emplace_back(item.second.times()); - } -} - -// Please refer https://robin1001.github.io/2020/12/11/ctc-search -// for how CTC prefix beam search works, and there is a simple graph demo in -// it. -void CtcPrefixBeamSearch::Search(const std::vector>& logp) { - if (logp.size() == 0) return; - int first_beam_size = - std::min(static_cast(logp[0].size()), opts_.first_beam_size); - for (int t = 0; t < logp.size(); ++t, ++abs_time_step_) { - const std::vector& logp_t = logp[t]; - std::unordered_map, PrefixScore, PrefixHash> next_hyps; - // 1. First beam prune, only select topk candidates - std::vector topk_score; - std::vector topk_index; - TopK(logp_t, first_beam_size, &topk_score, &topk_index); - - // 2. Token passing - for (int i = 0; i < topk_index.size(); ++i) { - int id = topk_index[i]; - auto prob = topk_score[i]; - for (const auto& it : cur_hyps_) { - const std::vector& prefix = it.first; - const PrefixScore& prefix_score = it.second; - // If prefix doesn't exist in next_hyps, next_hyps[prefix] will insert - // PrefixScore(-inf, -inf) by default, since the default constructor - // of PrefixScore will set fields s(blank ending score) and - // ns(none blank ending score) to -inf, respectively. 
- if (id == opts_.blank) { - // Case 0: *a + ε => *a - PrefixScore& next_score = next_hyps[prefix]; - next_score.s = LogAdd(next_score.s, prefix_score.score() + prob); - next_score.v_s = prefix_score.viterbi_score() + prob; - next_score.times_s = prefix_score.times(); - // Prefix not changed, copy the context from prefix. - if (context_graph_ && !next_score.has_context) { - next_score.CopyContext(prefix_score); - next_score.has_context = true; - } - } else if (!prefix.empty() && id == prefix.back()) { - // Case 1: *a + a => *a - PrefixScore& next_score1 = next_hyps[prefix]; - next_score1.ns = LogAdd(next_score1.ns, prefix_score.ns + prob); - if (next_score1.v_ns < prefix_score.v_ns + prob) { - next_score1.v_ns = prefix_score.v_ns + prob; - if (next_score1.cur_token_prob < prob) { - next_score1.cur_token_prob = prob; - next_score1.times_ns = prefix_score.times_ns; - CHECK_GT(next_score1.times_ns.size(), 0); - next_score1.times_ns.back() = abs_time_step_; - } - } - if (context_graph_ && !next_score1.has_context) { - next_score1.CopyContext(prefix_score); - next_score1.has_context = true; - } - - // Case 2: *aε + a => *aa - std::vector new_prefix(prefix); - new_prefix.emplace_back(id); - PrefixScore& next_score2 = next_hyps[new_prefix]; - next_score2.ns = LogAdd(next_score2.ns, prefix_score.s + prob); - if (next_score2.v_ns < prefix_score.v_s + prob) { - next_score2.v_ns = prefix_score.v_s + prob; - next_score2.cur_token_prob = prob; - next_score2.times_ns = prefix_score.times_s; - next_score2.times_ns.emplace_back(abs_time_step_); - } - if (context_graph_ && !next_score2.has_context) { - // Prefix changed, calculate the context score. - next_score2.UpdateContext(context_graph_, prefix_score, id, - prefix.size()); - next_score2.has_context = true; - } - } else { - // Case 3: *a + b => *ab, *aε + b => *ab - std::vector new_prefix(prefix); - new_prefix.emplace_back(id); - PrefixScore& next_score = next_hyps[new_prefix]; - next_score.ns = LogAdd(next_score.ns, prefix_score.score() + prob); - if (next_score.v_ns < prefix_score.viterbi_score() + prob) { - next_score.v_ns = prefix_score.viterbi_score() + prob; - next_score.cur_token_prob = prob; - next_score.times_ns = prefix_score.times(); - next_score.times_ns.emplace_back(abs_time_step_); - } - if (context_graph_ && !next_score.has_context) { - // Calculate the context score. - next_score.UpdateContext(context_graph_, prefix_score, id, - prefix.size()); - next_score.has_context = true; - } - } - } - } - - // 3. Second beam prune, only keep top n best paths - std::vector, PrefixScore>> arr(next_hyps.begin(), - next_hyps.end()); - int second_beam_size = - std::min(static_cast(arr.size()), opts_.second_beam_size); - std::nth_element(arr.begin(), arr.begin() + second_beam_size, arr.end(), - PrefixScoreCompare); - arr.resize(second_beam_size); - std::sort(arr.begin(), arr.end(), PrefixScoreCompare); - - // 4. Update cur_hyps_ and get new result - UpdateHypotheses(arr); - } -} - -void CtcPrefixBeamSearch::FinalizeSearch() { UpdateFinalContext(); } - -void CtcPrefixBeamSearch::UpdateFinalContext() { - if (context_graph_ == nullptr) return; - CHECK_EQ(hypotheses_.size(), cur_hyps_.size()); - CHECK_EQ(hypotheses_.size(), likelihood_.size()); - // We should backoff the context score/state when the context is - // not fully matched at the last time. 
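The `Search` loop above is the standard CTC prefix beam search recursion described in the blog post linked in its comments: every surviving prefix keeps a blank-ending log score `s` and a non-blank-ending log score `ns`, and each frame either merges a token into the prefix or extends it. The sketch below condenses that recursion in Python, dropping the first beam prune, the Viterbi time stamps, and the context scoring that the C++ class also maintains; it is illustrative only.

```python
import math
from collections import defaultdict

NEG_INF = -float("inf")

def log_add(a, b):
    if a == NEG_INF: return b
    if b == NEG_INF: return a
    m = max(a, b)
    return m + math.log(math.exp(a - m) + math.exp(b - m))

def ctc_prefix_beam_search(log_probs, beam_size=10, blank=0):
    """log_probs: T x V per-frame log posteriors; returns (prefix, score) pairs."""
    # prefix -> (log p ending in blank, log p ending in non-blank)
    cur = {(): (0.0, NEG_INF)}
    for logp_t in log_probs:
        nxt = defaultdict(lambda: (NEG_INF, NEG_INF))
        for prefix, (s, ns) in cur.items():
            for token, p in enumerate(logp_t):
                if token == blank:
                    # Case 0: *a + blank -> *a (blank ending)
                    ps, pns = nxt[prefix]
                    nxt[prefix] = (log_add(ps, log_add(s, ns) + p), pns)
                elif prefix and token == prefix[-1]:
                    # Case 1: *a + a -> *a (merge the repeat, non-blank ending)
                    ps, pns = nxt[prefix]
                    nxt[prefix] = (ps, log_add(pns, ns + p))
                    # Case 2: *a[blank] + a -> *aa (repeat only after a blank)
                    new = prefix + (token,)
                    ps2, pns2 = nxt[new]
                    nxt[new] = (ps2, log_add(pns2, s + p))
                else:
                    # Case 3: *a + b -> *ab
                    new = prefix + (token,)
                    ps, pns = nxt[new]
                    nxt[new] = (ps, log_add(pns, log_add(s, ns) + p))
        # Second beam prune: keep the best prefixes by total score.
        cur = dict(sorted(nxt.items(),
                          key=lambda kv: log_add(*kv[1]),
                          reverse=True)[:beam_size])
    return [(list(p), log_add(s, ns)) for p, (s, ns) in cur.items()]
```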
- for (const auto& prefix : hypotheses_) { - PrefixScore& prefix_score = cur_hyps_[prefix]; - if (prefix_score.context_state != 0) { - prefix_score.UpdateContext(context_graph_, prefix_score, 0, - prefix.size()); - } - } - std::vector, PrefixScore>> arr(cur_hyps_.begin(), - cur_hyps_.end()); - std::sort(arr.begin(), arr.end(), PrefixScoreCompare); - - // Update cur_hyps_ and get new result - UpdateHypotheses(arr); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/ctc_prefix_beam_search.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/ctc_prefix_beam_search.h deleted file mode 100644 index f44ec23c37af517c9e45140f89ef7346768f5d35..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/ctc_prefix_beam_search.h +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_CTC_PREFIX_BEAM_SEARCH_H_ -#define DECODER_CTC_PREFIX_BEAM_SEARCH_H_ - -#include -#include -#include -#include - -#include "decoder/context_graph.h" -#include "decoder/search_interface.h" -#include "utils/utils.h" - -namespace wenet { - -struct CtcPrefixBeamSearchOptions { - int blank = 0; // blank id - int first_beam_size = 10; - int second_beam_size = 10; -}; - -struct PrefixScore { - float s = -kFloatMax; // blank ending score - float ns = -kFloatMax; // none blank ending score - float v_s = -kFloatMax; // viterbi blank ending score - float v_ns = -kFloatMax; // viterbi none blank ending score - float cur_token_prob = -kFloatMax; // prob of current token - std::vector times_s; // times of viterbi blank path - std::vector times_ns; // times of viterbi none blank path - - float score() const { return LogAdd(s, ns); } - float viterbi_score() const { return v_s > v_ns ? v_s : v_ns; } - const std::vector& times() const { - return v_s > v_ns ? 
times_s : times_ns; - } - - bool has_context = false; - int context_state = 0; - float context_score = 0; - std::vector start_boundaries; - std::vector end_boundaries; - - void CopyContext(const PrefixScore& prefix_score) { - context_state = prefix_score.context_state; - context_score = prefix_score.context_score; - start_boundaries = prefix_score.start_boundaries; - end_boundaries = prefix_score.end_boundaries; - } - - void UpdateContext(const std::shared_ptr& context_graph, - const PrefixScore& prefix_score, int word_id, - int prefix_len) { - this->CopyContext(prefix_score); - - float score = 0; - bool is_start_boundary = false; - bool is_end_boundary = false; - - context_state = - context_graph->GetNextState(prefix_score.context_state, word_id, &score, - &is_start_boundary, &is_end_boundary); - context_score += score; - if (is_start_boundary) start_boundaries.emplace_back(prefix_len); - if (is_end_boundary) end_boundaries.emplace_back(prefix_len); - } - - float total_score() const { return score() + context_score; } -}; - -struct PrefixHash { - size_t operator()(const std::vector& prefix) const { - size_t hash_code = 0; - // here we use KB&DR hash code - for (int id : prefix) { - hash_code = id + 31 * hash_code; - } - return hash_code; - } -}; - -class CtcPrefixBeamSearch : public SearchInterface { - public: - explicit CtcPrefixBeamSearch( - const CtcPrefixBeamSearchOptions& opts, - const std::shared_ptr& context_graph = nullptr); - - void Search(const std::vector>& logp) override; - void Reset() override; - void FinalizeSearch() override; - SearchType Type() const override { return SearchType::kPrefixBeamSearch; } - void UpdateOutputs(const std::pair, PrefixScore>& prefix); - void UpdateHypotheses( - const std::vector, PrefixScore>>& hpys); - void UpdateFinalContext(); - - const std::vector& viterbi_likelihood() const { - return viterbi_likelihood_; - } - const std::vector>& Inputs() const override { - return hypotheses_; - } - const std::vector>& Outputs() const override { - return outputs_; - } - const std::vector& Likelihood() const override { return likelihood_; } - const std::vector>& Times() const override { return times_; } - - private: - int abs_time_step_ = 0; - - // N-best list and corresponding likelihood_, in sorted order - std::vector> hypotheses_; - std::vector likelihood_; - std::vector viterbi_likelihood_; - std::vector> times_; - - std::unordered_map, PrefixScore, PrefixHash> cur_hyps_; - std::shared_ptr context_graph_ = nullptr; - // Outputs contain the hypotheses_ and tags like: and - std::vector> outputs_; - const CtcPrefixBeamSearchOptions& opts_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(CtcPrefixBeamSearch); -}; - -} // namespace wenet - -#endif // DECODER_CTC_PREFIX_BEAM_SEARCH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/ctc_wfst_beam_search.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/ctc_wfst_beam_search.cc deleted file mode 100644 index 10e93f387e87b5f16fb7784d7060c50f227bf58e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/ctc_wfst_beam_search.cc +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "decoder/ctc_wfst_beam_search.h" - -#include - -namespace wenet { - -void DecodableTensorScaled::Reset() { - num_frames_ready_ = 0; - done_ = false; - // Give an empty initialization, will throw error when - // AcceptLoglikes is not called - logp_.clear(); -} - -void DecodableTensorScaled::AcceptLoglikes(const std::vector& logp) { - ++num_frames_ready_; - // TODO(Binbin Zhang): Avoid copy here - logp_ = logp; -} - -float DecodableTensorScaled::LogLikelihood(int32 frame, int32 index) { - CHECK_GT(index, 0); - CHECK_LT(frame, num_frames_ready_); - return scale_ * logp_[index - 1]; -} - -bool DecodableTensorScaled::IsLastFrame(int32 frame) const { - CHECK_LT(frame, num_frames_ready_); - return done_ && (frame == num_frames_ready_ - 1); -} - -int32 DecodableTensorScaled::NumIndices() const { - LOG(FATAL) << "Not implement"; - return 0; -} - -CtcWfstBeamSearch::CtcWfstBeamSearch( - const fst::Fst& fst, const CtcWfstBeamSearchOptions& opts, - const std::shared_ptr& context_graph) - : decodable_(opts.acoustic_scale), - decoder_(fst, opts, context_graph), - context_graph_(context_graph), - opts_(opts) { - Reset(); -} - -void CtcWfstBeamSearch::Reset() { - num_frames_ = 0; - decoded_frames_mapping_.clear(); - is_last_frame_blank_ = false; - last_best_ = 0; - inputs_.clear(); - outputs_.clear(); - likelihood_.clear(); - times_.clear(); - decodable_.Reset(); - decoder_.InitDecoding(); -} - -void CtcWfstBeamSearch::Search(const std::vector>& logp) { - if (0 == logp.size()) { - return; - } - // Every time we get the log posterior, we decode it all before return - for (int i = 0; i < logp.size(); i++) { - float blank_score = std::exp(logp[i][0]); - if (blank_score > opts_.blank_skip_thresh * opts_.blank_scale) { - VLOG(3) << "skipping frame " << num_frames_ << " score " << blank_score; - is_last_frame_blank_ = true; - last_frame_prob_ = logp[i]; - } else { - // Get the best symbol - int cur_best = - std::max_element(logp[i].begin(), logp[i].end()) - logp[i].begin(); - // Optional, adding one blank frame if we has skipped it in two same - // symbols - if (cur_best != 0 && is_last_frame_blank_ && cur_best == last_best_) { - decodable_.AcceptLoglikes(last_frame_prob_); - decoder_.AdvanceDecoding(&decodable_, 1); - decoded_frames_mapping_.push_back(num_frames_ - 1); - VLOG(2) << "Adding blank frame at symbol " << cur_best; - } - last_best_ = cur_best; - - decodable_.AcceptLoglikes(logp[i]); - decoder_.AdvanceDecoding(&decodable_, 1); - decoded_frames_mapping_.push_back(num_frames_); - is_last_frame_blank_ = false; - } - num_frames_++; - } - // Get the best path - inputs_.clear(); - outputs_.clear(); - likelihood_.clear(); - if (decoded_frames_mapping_.size() > 0) { - inputs_.resize(1); - outputs_.resize(1); - likelihood_.resize(1); - kaldi::Lattice lat; - decoder_.GetBestPath(&lat, false); - std::vector alignment; - kaldi::LatticeWeight weight; - fst::GetLinearSymbolSequence(lat, &alignment, &outputs_[0], &weight); - ConvertToInputs(alignment, &inputs_[0]); - RemoveContinuousTags(&outputs_[0]); - VLOG(3) << weight.Value1() << " " << weight.Value2(); - likelihood_[0] = 
-(weight.Value1() + weight.Value2()); - } -} - -void CtcWfstBeamSearch::FinalizeSearch() { - decodable_.SetFinish(); - decoder_.FinalizeDecoding(); - inputs_.clear(); - outputs_.clear(); - likelihood_.clear(); - times_.clear(); - if (decoded_frames_mapping_.size() > 0) { - std::vector nbest_lats; - if (opts_.nbest == 1) { - kaldi::Lattice lat; - decoder_.GetBestPath(&lat, true); - nbest_lats.push_back(std::move(lat)); - } else { - // Get N-best path by lattice(CompactLattice) - kaldi::CompactLattice clat; - decoder_.GetLattice(&clat, true); - kaldi::Lattice lat, nbest_lat; - fst::ConvertLattice(clat, &lat); - // TODO(Binbin Zhang): it's n-best word lists here, not character n-best - fst::ShortestPath(lat, &nbest_lat, opts_.nbest); - fst::ConvertNbestToVector(nbest_lat, &nbest_lats); - } - int nbest = nbest_lats.size(); - inputs_.resize(nbest); - outputs_.resize(nbest); - likelihood_.resize(nbest); - times_.resize(nbest); - for (int i = 0; i < nbest; i++) { - kaldi::LatticeWeight weight; - std::vector alignment; - fst::GetLinearSymbolSequence(nbest_lats[i], &alignment, &outputs_[i], - &weight); - ConvertToInputs(alignment, &inputs_[i], ×_[i]); - RemoveContinuousTags(&outputs_[i]); - likelihood_[i] = -(weight.Value1() + weight.Value2()); - } - } -} - -void CtcWfstBeamSearch::ConvertToInputs(const std::vector& alignment, - std::vector* input, - std::vector* time) { - input->clear(); - if (time != nullptr) time->clear(); - for (int cur = 0; cur < alignment.size(); ++cur) { - // ignore blank - if (alignment[cur] - 1 == 0) continue; - // merge continuous same label - if (cur > 0 && alignment[cur] == alignment[cur - 1]) continue; - - input->push_back(alignment[cur] - 1); - if (time != nullptr) { - time->push_back(decoded_frames_mapping_[cur]); - } - } -} - -void CtcWfstBeamSearch::RemoveContinuousTags(std::vector* output) { - if (context_graph_) { - for (auto it = output->begin(); it != output->end();) { - if (*it == context_graph_->start_tag_id() || - *it == context_graph_->end_tag_id()) { - if (it + 1 != output->end() && *it == *(it + 1)) { - it = output->erase(it); - continue; - } - } - ++it; - } - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/ctc_wfst_beam_search.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/ctc_wfst_beam_search.h deleted file mode 100644 index 204a0c8db1254035b7e3bd4a6e02b65d66b756f3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/ctc_wfst_beam_search.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
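`CtcWfstBeamSearch::Search` above avoids running the WFST decoder on frames that are almost certainly blank: a frame is skipped when its blank probability exceeds `blank_skip_thresh * blank_scale`, and one skipped blank frame is put back whenever the same non-blank symbol would otherwise repeat across the gap. The Python sketch below isolates just that frame-filtering step; the Kaldi lattice decoding it feeds is omitted and the function name is hypothetical.

```python
import math

def filter_frames_for_decoder(log_probs, blank=0, blank_skip_thresh=0.98, blank_scale=1.0):
    """Return (frames_to_decode, frame_indices) after blank skipping,
    mirroring the pre-filtering in CtcWfstBeamSearch::Search above."""
    kept, mapping = [], []
    last_frame_blank = False
    last_frame_logp = None
    last_best = 0
    for t, logp in enumerate(log_probs):
        blank_prob = math.exp(logp[blank])
        if blank_prob > blank_skip_thresh * blank_scale:
            # Frame is dominated by blank: skip it, but remember it.
            last_frame_blank = True
            last_frame_logp = logp
            continue
        best = max(range(len(logp)), key=lambda i: logp[i])
        # If the same symbol repeats across a skipped blank, put one blank
        # frame back so the decoder does not merge the two occurrences.
        if best != blank and last_frame_blank and best == last_best:
            kept.append(last_frame_logp)
            mapping.append(t - 1)
        last_best = best
        kept.append(logp)
        mapping.append(t)
        last_frame_blank = False
    return kept, mapping
```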
- - -#ifndef DECODER_CTC_WFST_BEAM_SEARCH_H_ -#define DECODER_CTC_WFST_BEAM_SEARCH_H_ - -#include -#include - -#include "decoder/context_graph.h" -#include "decoder/search_interface.h" -#include "kaldi/decoder/lattice-faster-online-decoder.h" -#include "utils/utils.h" - -namespace wenet { - -class DecodableTensorScaled : public kaldi::DecodableInterface { - public: - explicit DecodableTensorScaled(float scale = 1.0) : scale_(scale) { Reset(); } - - void Reset(); - int32 NumFramesReady() const override { return num_frames_ready_; } - bool IsLastFrame(int32 frame) const override; - float LogLikelihood(int32 frame, int32 index) override; - int32 NumIndices() const override; - void AcceptLoglikes(const std::vector& logp); - void SetFinish() { done_ = true; } - - private: - int num_frames_ready_ = 0; - float scale_ = 1.0; - bool done_ = false; - std::vector logp_; -}; - -// LatticeFasterDecoderConfig has the following key members -// beam: decoding beam -// max_active: Decoder max active states -// lattice_beam: Lattice generation beam -struct CtcWfstBeamSearchOptions : public kaldi::LatticeFasterDecoderConfig { - float acoustic_scale = 1.0; - float nbest = 10; - // When blank score is greater than this thresh, skip the frame in viterbi - // search - float blank_skip_thresh = 0.98; - float blank_scale = 1.0; -}; - -class CtcWfstBeamSearch : public SearchInterface { - public: - explicit CtcWfstBeamSearch( - const fst::Fst& fst, const CtcWfstBeamSearchOptions& opts, - const std::shared_ptr& context_graph); - void Search(const std::vector>& logp) override; - void Reset() override; - void FinalizeSearch() override; - SearchType Type() const override { return SearchType::kWfstBeamSearch; } - // For CTC prefix beam search, both inputs and outputs are hypotheses_ - const std::vector>& Inputs() const override { - return inputs_; - } - const std::vector>& Outputs() const override { - return outputs_; - } - const std::vector& Likelihood() const override { return likelihood_; } - const std::vector>& Times() const override { return times_; } - - private: - // Sub one and remove - void ConvertToInputs(const std::vector& alignment, - std::vector* input, - std::vector* time = nullptr); - void RemoveContinuousTags(std::vector* output); - - int num_frames_ = 0; - std::vector decoded_frames_mapping_; - - int last_best_ = 0; // last none blank best id - std::vector last_frame_prob_; - bool is_last_frame_blank_ = false; - std::vector> inputs_, outputs_; - std::vector likelihood_; - std::vector> times_; - DecodableTensorScaled decodable_; - kaldi::LatticeFasterOnlineDecoder decoder_; - std::shared_ptr context_graph_; - const CtcWfstBeamSearchOptions& opts_; -}; - -} // namespace wenet - -#endif // DECODER_CTC_WFST_BEAM_SEARCH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/onnx_asr_model.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/onnx_asr_model.cc deleted file mode 100644 index fc7afc704febbde3b7e350e392dc46763c453e74..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/onnx_asr_model.cc +++ /dev/null @@ -1,430 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 ZeXuan Li (lizexuan@huya.com) -// Xingchen Song(sxc19@mails.tsinghua.edu.cn) -// hamddct@gmail.com (Mddct) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance 
with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "decoder/onnx_asr_model.h" - -#include -#include -#include - -#include "utils/string.h" - -namespace wenet { - -Ort::Env OnnxAsrModel::env_ = Ort::Env(ORT_LOGGING_LEVEL_WARNING, ""); -Ort::SessionOptions OnnxAsrModel::session_options_ = Ort::SessionOptions(); - -void OnnxAsrModel::InitEngineThreads(int num_threads) { - session_options_.SetIntraOpNumThreads(num_threads); -} - -void OnnxAsrModel::GetInputOutputInfo( - const std::shared_ptr& session, - std::vector* in_names, std::vector* out_names) { - Ort::AllocatorWithDefaultOptions allocator; - // Input info - int num_nodes = session->GetInputCount(); - in_names->resize(num_nodes); - for (int i = 0; i < num_nodes; ++i) { - char* name = session->GetInputName(i, allocator); - Ort::TypeInfo type_info = session->GetInputTypeInfo(i); - auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); - ONNXTensorElementDataType type = tensor_info.GetElementType(); - std::vector node_dims = tensor_info.GetShape(); - std::stringstream shape; - for (auto j : node_dims) { - shape << j; - shape << " "; - } - LOG(INFO) << "\tInput " << i << " : name=" << name << " type=" << type - << " dims=" << shape.str(); - (*in_names)[i] = name; - } - // Output info - num_nodes = session->GetOutputCount(); - out_names->resize(num_nodes); - for (int i = 0; i < num_nodes; ++i) { - char* name = session->GetOutputName(i, allocator); - Ort::TypeInfo type_info = session->GetOutputTypeInfo(i); - auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); - ONNXTensorElementDataType type = tensor_info.GetElementType(); - std::vector node_dims = tensor_info.GetShape(); - std::stringstream shape; - for (auto j : node_dims) { - shape << j; - shape << " "; - } - LOG(INFO) << "\tOutput " << i << " : name=" << name << " type=" << type - << " dims=" << shape.str(); - (*out_names)[i] = name; - } -} - -void OnnxAsrModel::Read(const std::string& model_dir) { - std::string encoder_onnx_path = model_dir + "/encoder.onnx"; - std::string rescore_onnx_path = model_dir + "/decoder.onnx"; - std::string ctc_onnx_path = model_dir + "/ctc.onnx"; - - // 1. Load sessions - try { -#ifdef _MSC_VER - encoder_session_ = std::make_shared( - env_, ToWString(encoder_onnx_path).c_str(), session_options_); - rescore_session_ = std::make_shared( - env_, ToWString(rescore_onnx_path).c_str(), session_options_); - ctc_session_ = std::make_shared( - env_, ToWString(ctc_onnx_path).c_str(), session_options_); -#else - encoder_session_ = std::make_shared( - env_, encoder_onnx_path.c_str(), session_options_); - rescore_session_ = std::make_shared( - env_, rescore_onnx_path.c_str(), session_options_); - ctc_session_ = std::make_shared(env_, ctc_onnx_path.c_str(), - session_options_); -#endif - } catch (std::exception const& e) { - LOG(ERROR) << "error when load onnx model: " << e.what(); - exit(0); - } - - // 2. 
Read metadata - auto model_metadata = encoder_session_->GetModelMetadata(); - - Ort::AllocatorWithDefaultOptions allocator; - encoder_output_size_ = - atoi(model_metadata.LookupCustomMetadataMap("output_size", allocator)); - num_blocks_ = - atoi(model_metadata.LookupCustomMetadataMap("num_blocks", allocator)); - head_ = atoi(model_metadata.LookupCustomMetadataMap("head", allocator)); - cnn_module_kernel_ = atoi( - model_metadata.LookupCustomMetadataMap("cnn_module_kernel", allocator)); - subsampling_rate_ = atoi( - model_metadata.LookupCustomMetadataMap("subsampling_rate", allocator)); - right_context_ = - atoi(model_metadata.LookupCustomMetadataMap("right_context", allocator)); - sos_ = atoi(model_metadata.LookupCustomMetadataMap("sos_symbol", allocator)); - eos_ = atoi(model_metadata.LookupCustomMetadataMap("eos_symbol", allocator)); - is_bidirectional_decoder_ = atoi(model_metadata.LookupCustomMetadataMap( - "is_bidirectional_decoder", allocator)); - chunk_size_ = - atoi(model_metadata.LookupCustomMetadataMap("chunk_size", allocator)); - num_left_chunks_ = - atoi(model_metadata.LookupCustomMetadataMap("left_chunks", allocator)); - - LOG(INFO) << "Onnx Model Info:"; - LOG(INFO) << "\tencoder_output_size " << encoder_output_size_; - LOG(INFO) << "\tnum_blocks " << num_blocks_; - LOG(INFO) << "\thead " << head_; - LOG(INFO) << "\tcnn_module_kernel " << cnn_module_kernel_; - LOG(INFO) << "\tsubsampling_rate " << subsampling_rate_; - LOG(INFO) << "\tright_context " << right_context_; - LOG(INFO) << "\tsos " << sos_; - LOG(INFO) << "\teos " << eos_; - LOG(INFO) << "\tis bidirectional decoder " << is_bidirectional_decoder_; - LOG(INFO) << "\tchunk_size " << chunk_size_; - LOG(INFO) << "\tnum_left_chunks " << num_left_chunks_; - - // 3. Read model nodes - LOG(INFO) << "Onnx Encoder:"; - GetInputOutputInfo(encoder_session_, &encoder_in_names_, &encoder_out_names_); - LOG(INFO) << "Onnx CTC:"; - GetInputOutputInfo(ctc_session_, &ctc_in_names_, &ctc_out_names_); - LOG(INFO) << "Onnx Rescore:"; - GetInputOutputInfo(rescore_session_, &rescore_in_names_, &rescore_out_names_); -} - -OnnxAsrModel::OnnxAsrModel(const OnnxAsrModel& other) { - // metadatas - encoder_output_size_ = other.encoder_output_size_; - num_blocks_ = other.num_blocks_; - head_ = other.head_; - cnn_module_kernel_ = other.cnn_module_kernel_; - right_context_ = other.right_context_; - subsampling_rate_ = other.subsampling_rate_; - sos_ = other.sos_; - eos_ = other.eos_; - is_bidirectional_decoder_ = other.is_bidirectional_decoder_; - chunk_size_ = other.chunk_size_; - num_left_chunks_ = other.num_left_chunks_; - offset_ = other.offset_; - - // sessions - encoder_session_ = other.encoder_session_; - ctc_session_ = other.ctc_session_; - rescore_session_ = other.rescore_session_; - - // node names - encoder_in_names_ = other.encoder_in_names_; - encoder_out_names_ = other.encoder_out_names_; - ctc_in_names_ = other.ctc_in_names_; - ctc_out_names_ = other.ctc_out_names_; - rescore_in_names_ = other.rescore_in_names_; - rescore_out_names_ = other.rescore_out_names_; -} - -std::shared_ptr OnnxAsrModel::Copy() const { - auto asr_model = std::make_shared(*this); - // Reset the inner states for new decoding - asr_model->Reset(); - return asr_model; -} - -void OnnxAsrModel::Reset() { - offset_ = 0; - encoder_outs_.clear(); - cached_feature_.clear(); - // Reset att_cache - Ort::MemoryInfo memory_info = - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); - if (num_left_chunks_ > 0) { - int required_cache_size = chunk_size_ * 
num_left_chunks_; - offset_ = required_cache_size; - att_cache_.resize(num_blocks_ * head_ * required_cache_size * - encoder_output_size_ / head_ * 2, - 0.0); - const int64_t att_cache_shape[] = {num_blocks_, head_, required_cache_size, - encoder_output_size_ / head_ * 2}; - att_cache_ort_ = Ort::Value::CreateTensor( - memory_info, att_cache_.data(), att_cache_.size(), att_cache_shape, 4); - } else { - att_cache_.resize(0, 0.0); - const int64_t att_cache_shape[] = {num_blocks_, head_, 0, - encoder_output_size_ / head_ * 2}; - att_cache_ort_ = Ort::Value::CreateTensor( - memory_info, att_cache_.data(), att_cache_.size(), att_cache_shape, 4); - } - - // Reset cnn_cache - cnn_cache_.resize( - num_blocks_ * encoder_output_size_ * (cnn_module_kernel_ - 1), 0.0); - const int64_t cnn_cache_shape[] = {num_blocks_, 1, encoder_output_size_, - cnn_module_kernel_ - 1}; - cnn_cache_ort_ = Ort::Value::CreateTensor( - memory_info, cnn_cache_.data(), cnn_cache_.size(), cnn_cache_shape, 4); -} - -void OnnxAsrModel::ForwardEncoderFunc( - const std::vector>& chunk_feats, - std::vector>* out_prob) { - Ort::MemoryInfo memory_info = - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); - // 1. Prepare onnx required data, splice cached_feature_ and chunk_feats - // chunk - int num_frames = cached_feature_.size() + chunk_feats.size(); - const int feature_dim = chunk_feats[0].size(); - std::vector feats; - for (size_t i = 0; i < cached_feature_.size(); ++i) { - feats.insert(feats.end(), cached_feature_[i].begin(), - cached_feature_[i].end()); - } - for (size_t i = 0; i < chunk_feats.size(); ++i) { - feats.insert(feats.end(), chunk_feats[i].begin(), chunk_feats[i].end()); - } - const int64_t feats_shape[3] = {1, num_frames, feature_dim}; - Ort::Value feats_ort = Ort::Value::CreateTensor( - memory_info, feats.data(), feats.size(), feats_shape, 3); - // offset - int64_t offset_int64 = static_cast(offset_); - Ort::Value offset_ort = Ort::Value::CreateTensor( - memory_info, &offset_int64, 1, std::vector{}.data(), 0); - // required_cache_size - int64_t required_cache_size = chunk_size_ * num_left_chunks_; - Ort::Value required_cache_size_ort = Ort::Value::CreateTensor( - memory_info, &required_cache_size, 1, std::vector{}.data(), 0); - // att_mask - Ort::Value att_mask_ort{nullptr}; - std::vector att_mask(required_cache_size + chunk_size_, 1); - if (num_left_chunks_ > 0) { - int chunk_idx = offset_ / chunk_size_ - num_left_chunks_; - if (chunk_idx < num_left_chunks_) { - for (int i = 0; i < (num_left_chunks_ - chunk_idx) * chunk_size_; ++i) { - att_mask[i] = 0; - } - } - const int64_t att_mask_shape[] = {1, 1, required_cache_size + chunk_size_}; - att_mask_ort = Ort::Value::CreateTensor( - memory_info, reinterpret_cast(att_mask.data()), att_mask.size(), - att_mask_shape, 3); - } - - // 2. 
Encoder chunk forward - std::vector inputs; - for (auto name : encoder_in_names_) { - if (!strcmp(name, "chunk")) { - inputs.emplace_back(std::move(feats_ort)); - } else if (!strcmp(name, "offset")) { - inputs.emplace_back(std::move(offset_ort)); - } else if (!strcmp(name, "required_cache_size")) { - inputs.emplace_back(std::move(required_cache_size_ort)); - } else if (!strcmp(name, "att_cache")) { - inputs.emplace_back(std::move(att_cache_ort_)); - } else if (!strcmp(name, "cnn_cache")) { - inputs.emplace_back(std::move(cnn_cache_ort_)); - } else if (!strcmp(name, "att_mask")) { - inputs.emplace_back(std::move(att_mask_ort)); - } - } - - std::vector ort_outputs = encoder_session_->Run( - Ort::RunOptions{nullptr}, encoder_in_names_.data(), inputs.data(), - inputs.size(), encoder_out_names_.data(), encoder_out_names_.size()); - - offset_ += static_cast( - ort_outputs[0].GetTensorTypeAndShapeInfo().GetShape()[1]); - att_cache_ort_ = std::move(ort_outputs[1]); - cnn_cache_ort_ = std::move(ort_outputs[2]); - - std::vector ctc_inputs; - ctc_inputs.emplace_back(std::move(ort_outputs[0])); - - std::vector ctc_ort_outputs = ctc_session_->Run( - Ort::RunOptions{nullptr}, ctc_in_names_.data(), ctc_inputs.data(), - ctc_inputs.size(), ctc_out_names_.data(), ctc_out_names_.size()); - encoder_outs_.push_back(std::move(ctc_inputs[0])); - - float* logp_data = ctc_ort_outputs[0].GetTensorMutableData(); - auto type_info = ctc_ort_outputs[0].GetTensorTypeAndShapeInfo(); - - int num_outputs = type_info.GetShape()[1]; - int output_dim = type_info.GetShape()[2]; - out_prob->resize(num_outputs); - for (int i = 0; i < num_outputs; i++) { - (*out_prob)[i].resize(output_dim); - memcpy((*out_prob)[i].data(), logp_data + i * output_dim, - sizeof(float) * output_dim); - } -} - -float OnnxAsrModel::ComputeAttentionScore(const float* prob, - const std::vector& hyp, int eos, - int decode_out_len) { - float score = 0.0f; - for (size_t j = 0; j < hyp.size(); ++j) { - score += *(prob + j * decode_out_len + hyp[j]); - } - score += *(prob + hyp.size() * decode_out_len + eos); - return score; -} - -void OnnxAsrModel::AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) { - Ort::MemoryInfo memory_info = - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); - CHECK(rescoring_score != nullptr); - int num_hyps = hyps.size(); - rescoring_score->resize(num_hyps, 0.0f); - - if (num_hyps == 0) { - return; - } - // No encoder output - if (encoder_outs_.size() == 0) { - return; - } - - std::vector hyps_lens; - int max_hyps_len = 0; - for (size_t i = 0; i < num_hyps; ++i) { - int length = hyps[i].size() + 1; - max_hyps_len = std::max(length, max_hyps_len); - hyps_lens.emplace_back(static_cast(length)); - } - - std::vector rescore_input; - int encoder_len = 0; - for (int i = 0; i < encoder_outs_.size(); i++) { - float* encoder_outs_data = encoder_outs_[i].GetTensorMutableData(); - auto type_info = encoder_outs_[i].GetTensorTypeAndShapeInfo(); - for (int j = 0; j < type_info.GetElementCount(); j++) { - rescore_input.emplace_back(encoder_outs_data[j]); - } - encoder_len += type_info.GetShape()[1]; - } - - const int64_t decode_input_shape[] = {1, encoder_len, encoder_output_size_}; - - std::vector hyps_pad; - - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - hyps_pad.emplace_back(sos_); - size_t j = 0; - for (; j < hyp.size(); ++j) { - hyps_pad.emplace_back(hyp[j]); - } - if (j == max_hyps_len - 1) { - continue; - } - for (; j < max_hyps_len - 1; ++j) { 
- hyps_pad.emplace_back(0); - } - } - - const int64_t hyps_pad_shape[] = {num_hyps, max_hyps_len}; - - const int64_t hyps_lens_shape[] = {num_hyps}; - - Ort::Value decode_input_tensor_ = Ort::Value::CreateTensor( - memory_info, rescore_input.data(), rescore_input.size(), - decode_input_shape, 3); - Ort::Value hyps_pad_tensor_ = Ort::Value::CreateTensor( - memory_info, hyps_pad.data(), hyps_pad.size(), hyps_pad_shape, 2); - Ort::Value hyps_lens_tensor_ = Ort::Value::CreateTensor( - memory_info, hyps_lens.data(), hyps_lens.size(), hyps_lens_shape, 1); - - std::vector rescore_inputs; - - rescore_inputs.emplace_back(std::move(hyps_pad_tensor_)); - rescore_inputs.emplace_back(std::move(hyps_lens_tensor_)); - rescore_inputs.emplace_back(std::move(decode_input_tensor_)); - - std::vector rescore_outputs = rescore_session_->Run( - Ort::RunOptions{nullptr}, rescore_in_names_.data(), rescore_inputs.data(), - rescore_inputs.size(), rescore_out_names_.data(), - rescore_out_names_.size()); - - float* decoder_outs_data = rescore_outputs[0].GetTensorMutableData(); - float* r_decoder_outs_data = rescore_outputs[1].GetTensorMutableData(); - - auto type_info = rescore_outputs[0].GetTensorTypeAndShapeInfo(); - int decode_out_len = type_info.GetShape()[2]; - - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - float score = 0.0f; - // left to right decoder score - score = ComputeAttentionScore( - decoder_outs_data + max_hyps_len * decode_out_len * i, hyp, eos_, - decode_out_len); - // Optional: Used for right to left score - float r_score = 0.0f; - if (is_bidirectional_decoder_ && reverse_weight > 0) { - std::vector r_hyp(hyp.size()); - std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin()); - // right to left decoder score - r_score = ComputeAttentionScore( - r_decoder_outs_data + max_hyps_len * decode_out_len * i, r_hyp, eos_, - decode_out_len); - } - // combined left-to-right and right-to-left score - (*rescoring_score)[i] = - score * (1 - reverse_weight) + r_score * reverse_weight; - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/onnx_asr_model.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/onnx_asr_model.h deleted file mode 100644 index f5d9e9a0c61d728f2fb6d45d1428234abae98c90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/onnx_asr_model.h +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 ZeXuan Li (lizexuan@huya.com) -// Xingchen Song(sxc19@mails.tsinghua.edu.cn) -// hamddct@gmail.com (Mddct) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
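`AttentionRescoring` above pads the CTC n-best hypotheses with `sos`/`eos`, runs them through the attention decoder, and, for a bidirectional decoder, blends the left-to-right and right-to-left scores with `reverse_weight`. Below is a short Python sketch of that scoring step, assuming `decoder_logp` and `r_decoder_logp` are the per-position log-probability rows produced for one hypothesis; it is illustrative only.

```python
def attention_score(decoder_logp, hyp, eos):
    """Sum log p(token_j) along the hypothesis, plus log p(eos) at the end,
    as in OnnxAsrModel::ComputeAttentionScore above."""
    score = sum(decoder_logp[j][tok] for j, tok in enumerate(hyp))
    return score + decoder_logp[len(hyp)][eos]

def rescore(hyp, decoder_logp, r_decoder_logp, eos, reverse_weight=0.0,
            is_bidirectional=True):
    score = attention_score(decoder_logp, hyp, eos)
    r_score = 0.0
    if is_bidirectional and reverse_weight > 0:
        # The right-to-left decoder scores the reversed hypothesis.
        r_score = attention_score(r_decoder_logp, list(reversed(hyp)), eos)
    # Combined left-to-right and right-to-left score, as in the loop above.
    return score * (1 - reverse_weight) + r_score * reverse_weight
```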
- -#ifndef DECODER_ONNX_ASR_MODEL_H_ -#define DECODER_ONNX_ASR_MODEL_H_ - -#include -#include -#include - -#include "onnxruntime_cxx_api.h" // NOLINT - -#include "decoder/asr_model.h" -#include "utils/log.h" -#include "utils/utils.h" - -namespace wenet { - -class OnnxAsrModel : public AsrModel { - public: - static void InitEngineThreads(int num_threads = 1); - - public: - OnnxAsrModel() = default; - OnnxAsrModel(const OnnxAsrModel& other); - void Read(const std::string& model_dir); - void Reset() override; - void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) override; - std::shared_ptr Copy() const override; - void GetInputOutputInfo(const std::shared_ptr& session, - std::vector* in_names, - std::vector* out_names); - - protected: - void ForwardEncoderFunc(const std::vector>& chunk_feats, - std::vector>* ctc_prob) override; - - float ComputeAttentionScore(const float* prob, const std::vector& hyp, - int eos, int decode_out_len); - - private: - int encoder_output_size_ = 0; - int num_blocks_ = 0; - int cnn_module_kernel_ = 0; - int head_ = 0; - - // sessions - // NOTE(Mddct): The Env holds the logging state used by all other objects. - // One Env must be created before using any other Onnxruntime functionality. - static Ort::Env env_; // shared environment across threads. - static Ort::SessionOptions session_options_; - std::shared_ptr encoder_session_ = nullptr; - std::shared_ptr rescore_session_ = nullptr; - std::shared_ptr ctc_session_ = nullptr; - - // node names - std::vector encoder_in_names_, encoder_out_names_; - std::vector ctc_in_names_, ctc_out_names_; - std::vector rescore_in_names_, rescore_out_names_; - - // caches - Ort::Value att_cache_ort_{nullptr}; - Ort::Value cnn_cache_ort_{nullptr}; - std::vector encoder_outs_; - // NOTE: Instead of making a copy of the xx_cache, ONNX only maintains - // its data pointer when initializing xx_cache_ort (see https://github.com/ - // microsoft/onnxruntime/blob/master/onnxruntime/core/framework - // /tensor.cc#L102-L129), so we need the following variables to keep - // our data "alive" during the lifetime of decoder. - std::vector att_cache_; - std::vector cnn_cache_; -}; - -} // namespace wenet - -#endif // DECODER_ONNX_ASR_MODEL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/params.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/params.h deleted file mode 100644 index 3edc877f1bb6d876ca087cab8e4ed00d42e97e63..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/params.h +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef DECODER_PARAMS_H_ -#define DECODER_PARAMS_H_ - -#include -#include -#include -#include - -#include "decoder/asr_decoder.h" -#ifdef USE_ONNX -#include "decoder/onnx_asr_model.h" -#endif -#ifdef USE_TORCH -#include "decoder/torch_asr_model.h" -#endif -#ifdef USE_XPU -#include "xpu/xpu_asr_model.h" -#endif -#ifdef USE_BPU -#include "bpu/bpu_asr_model.h" -#endif -#include "frontend/feature_pipeline.h" -#include "post_processor/post_processor.h" -#include "utils/flags.h" -#include "utils/string.h" - -DEFINE_int32(device_id, 0, "set XPU DeviceID for ASR model"); - -// TorchAsrModel flags -DEFINE_string(model_path, "", "pytorch exported model path"); -// OnnxAsrModel flags -DEFINE_string(onnx_dir, "", "directory where the onnx model is saved"); -// XPUAsrModel flags -DEFINE_string(xpu_model_dir, "", - "directory where the XPU model and weights is saved"); -// BPUAsrModel flags -DEFINE_string(bpu_model_dir, "", - "directory where the HORIZON BPU model is saved"); - -// FeaturePipelineConfig flags -DEFINE_int32(num_bins, 80, "num mel bins for fbank feature"); -DEFINE_int32(sample_rate, 16000, "sample rate for audio"); - -// TLG fst -DEFINE_string(fst_path, "", "TLG fst path"); - -// DecodeOptions flags -DEFINE_int32(chunk_size, 16, "decoding chunk size"); -DEFINE_int32(num_left_chunks, -1, "left chunks in decoding"); -DEFINE_double(ctc_weight, 0.5, - "ctc weight when combining ctc score and rescoring score"); -DEFINE_double(rescoring_weight, 1.0, - "rescoring weight when combining ctc score and rescoring score"); -DEFINE_double(reverse_weight, 0.0, - "used for bitransformer rescoring. it must be 0.0 if decoder is" - "conventional transformer decoder, and only reverse_weight > 0.0" - "dose the right to left decoder will be calculated and used"); -DEFINE_int32(max_active, 7000, "max active states in ctc wfst search"); -DEFINE_int32(min_active, 200, "min active states in ctc wfst search"); -DEFINE_double(beam, 16.0, "beam in ctc wfst search"); -DEFINE_double(lattice_beam, 10.0, "lattice beam in ctc wfst search"); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale for ctc wfst search"); -DEFINE_double(blank_skip_thresh, 1.0, - "blank skip thresh for ctc wfst search, 1.0 means no skip"); -DEFINE_double(blank_scale, 1.0, "blank scale for ctc wfst search"); -DEFINE_double(length_penalty, 0.0, - "length penalty ctc wfst search, will not" - "apply on self-loop arc, for balancing the del/ins ratio, " - "suggest set to -3.0"); -DEFINE_int32(nbest, 10, "nbest for ctc wfst or prefix search"); - -// SymbolTable flags -DEFINE_string(dict_path, "", - "dict symbol table path, required when LM is enabled"); -DEFINE_string(unit_path, "", - "e2e model unit symbol table, it is used in both " - "with/without LM scenarios for context/timestamp"); - -// Context flags -DEFINE_string(context_path, "", "context path, is used to build context graph"); -DEFINE_double(context_score, 3.0, "is used to rescore the decoded result"); - -// PostProcessOptions flags -DEFINE_int32(language_type, 0, - "remove spaces according to language type" - "0x00 = kMandarinEnglish, " - "0x01 = kIndoEuropean"); -DEFINE_bool(lowercase, true, "lowercase final result if needed"); - -namespace wenet { -std::shared_ptr InitFeaturePipelineConfigFromFlags() { - auto feature_config = std::make_shared( - FLAGS_num_bins, FLAGS_sample_rate); - return feature_config; -} - -std::shared_ptr InitDecodeOptionsFromFlags() { - auto decode_config = std::make_shared(); - decode_config->chunk_size = FLAGS_chunk_size; - decode_config->num_left_chunks = 
FLAGS_num_left_chunks; - decode_config->ctc_weight = FLAGS_ctc_weight; - decode_config->reverse_weight = FLAGS_reverse_weight; - decode_config->rescoring_weight = FLAGS_rescoring_weight; - decode_config->ctc_wfst_search_opts.max_active = FLAGS_max_active; - decode_config->ctc_wfst_search_opts.min_active = FLAGS_min_active; - decode_config->ctc_wfst_search_opts.beam = FLAGS_beam; - decode_config->ctc_wfst_search_opts.lattice_beam = FLAGS_lattice_beam; - decode_config->ctc_wfst_search_opts.acoustic_scale = FLAGS_acoustic_scale; - decode_config->ctc_wfst_search_opts.blank_skip_thresh = - FLAGS_blank_skip_thresh; - decode_config->ctc_wfst_search_opts.blank_scale = FLAGS_blank_scale; - decode_config->ctc_wfst_search_opts.length_penalty = FLAGS_length_penalty; - decode_config->ctc_wfst_search_opts.nbest = FLAGS_nbest; - decode_config->ctc_prefix_search_opts.first_beam_size = FLAGS_nbest; - decode_config->ctc_prefix_search_opts.second_beam_size = FLAGS_nbest; - return decode_config; -} - -std::shared_ptr InitDecodeResourceFromFlags() { - auto resource = std::make_shared(); - const int kNumGemmThreads = 1; - if (!FLAGS_onnx_dir.empty()) { -#ifdef USE_ONNX - LOG(INFO) << "Reading onnx model "; - OnnxAsrModel::InitEngineThreads(kNumGemmThreads); - auto model = std::make_shared(); - model->Read(FLAGS_onnx_dir); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DONNX=ON'."; -#endif - } else if (!FLAGS_model_path.empty()) { -#ifdef USE_TORCH - LOG(INFO) << "Reading torch model " << FLAGS_model_path; - TorchAsrModel::InitEngineThreads(kNumGemmThreads); - auto model = std::make_shared(); - model->Read(FLAGS_model_path); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DTORCH=ON'."; -#endif - } else if (!FLAGS_xpu_model_dir.empty()) { -#ifdef USE_XPU - LOG(INFO) << "Reading XPU WeNet model weight from " << FLAGS_xpu_model_dir; - auto model = std::make_shared(); - model->SetEngineThreads(kNumGemmThreads); - model->SetDeviceId(FLAGS_device_id); - model->Read(FLAGS_xpu_model_dir); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DXPU=ON'."; -#endif - } else if (!FLAGS_bpu_model_dir.empty()) { -#ifdef USE_BPU - LOG(INFO) << "Reading Horizon BPU model from " << FLAGS_bpu_model_dir; - auto model = std::make_shared(); - model->Read(FLAGS_bpu_model_dir); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DBPU=ON'."; -#endif - } else { - LOG(FATAL) << "Please set ONNX, TORCH, XPU or BPU model path!!!"; - } - - LOG(INFO) << "Reading unit table " << FLAGS_unit_path; - auto unit_table = std::shared_ptr( - fst::SymbolTable::ReadText(FLAGS_unit_path)); - CHECK(unit_table != nullptr); - resource->unit_table = unit_table; - - if (!FLAGS_fst_path.empty()) { // With LM - CHECK(!FLAGS_dict_path.empty()); - LOG(INFO) << "Reading fst " << FLAGS_fst_path; - auto fst = std::shared_ptr>( - fst::Fst::Read(FLAGS_fst_path)); - CHECK(fst != nullptr); - resource->fst = fst; - - LOG(INFO) << "Reading symbol table " << FLAGS_dict_path; - auto symbol_table = std::shared_ptr( - fst::SymbolTable::ReadText(FLAGS_dict_path)); - CHECK(symbol_table != nullptr); - resource->symbol_table = symbol_table; - } else { // Without LM, symbol_table is the same as unit_table - resource->symbol_table = unit_table; - } - - if (!FLAGS_context_path.empty()) { - LOG(INFO) << "Reading context " << FLAGS_context_path; - std::vector contexts; - std::ifstream infile(FLAGS_context_path); - std::string context; - 
while (getline(infile, context)) { - contexts.emplace_back(Trim(context)); - } - ContextConfig config; - config.context_score = FLAGS_context_score; - resource->context_graph = std::make_shared(config); - resource->context_graph->BuildContextGraph(contexts, - resource->symbol_table); - } - - PostProcessOptions post_process_opts; - post_process_opts.language_type = - FLAGS_language_type == 0 ? kMandarinEnglish : kIndoEuropean; - post_process_opts.lowercase = FLAGS_lowercase; - resource->post_processor = - std::make_shared(std::move(post_process_opts)); - return resource; -} - -} // namespace wenet - -#endif // DECODER_PARAMS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/search_interface.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/search_interface.h deleted file mode 100644 index 25bad26705f8be44561d2c686f50a63035b14bbf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/search_interface.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_SEARCH_INTERFACE_H_ -#define DECODER_SEARCH_INTERFACE_H_ - -namespace wenet { - -#include - -enum SearchType { - kPrefixBeamSearch = 0x00, - kWfstBeamSearch = 0x01, -}; - -class SearchInterface { - public: - virtual ~SearchInterface() {} - virtual void Search(const std::vector>& logp) = 0; - virtual void Reset() = 0; - virtual void FinalizeSearch() = 0; - - virtual SearchType Type() const = 0; - // N-best inputs id - virtual const std::vector>& Inputs() const = 0; - // N-best outputs id - virtual const std::vector>& Outputs() const = 0; - // N-best likelihood - virtual const std::vector& Likelihood() const = 0; - // N-best timestamp - virtual const std::vector>& Times() const = 0; -}; - -} // namespace wenet - -#endif // DECODER_SEARCH_INTERFACE_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/torch_asr_model.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/torch_asr_model.cc deleted file mode 100644 index 3abca283e12f5c173c9511707229ea82b31f26d8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/torch_asr_model.cc +++ /dev/null @@ -1,278 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
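The `--ctc_weight` and `--rescoring_weight` flags defined in params.h above describe how the CTC search score and the attention rescoring score are combined when picking the final hypothesis; the combination itself happens in `asr_decoder.cc`, which is outside this hunk. The one-line Python sketch below shows the weighted sum those flag descriptions imply, with the flag defaults; treat it as an assumption rather than the exact formula.

```python
def combined_score(ctc_score, rescoring_score, ctc_weight=0.5, rescoring_weight=1.0):
    # Weighted sum implied by the --ctc_weight / --rescoring_weight descriptions
    # above; the authoritative formula lives in asr_decoder.cc (not in this diff).
    return rescoring_weight * rescoring_score + ctc_weight * ctc_score
```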
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "decoder/torch_asr_model.h" - -#include -#include -#include -#include - -#include "torch/script.h" -#ifndef IOS -#include "torch/torch.h" -#endif - -namespace wenet { - -#ifndef IOS -void TorchAsrModel::InitEngineThreads(int num_threads) { - // For multi-thread performance - at::set_num_threads(num_threads); - VLOG(1) << "Num intra-op threads: " << at::get_num_threads(); -} -#endif - -void TorchAsrModel::Read(const std::string& model_path) { - torch::DeviceType device = at::kCPU; -#ifdef USE_GPU - if (!torch::cuda::is_available()) { - VLOG(1) << "CUDA is not available! Please check your GPU settings"; - throw std::runtime_error("CUDA is not available!"); - } else { - VLOG(1) << "CUDA available! Running on GPU"; - device = at::kCUDA; - } -#endif - torch::jit::script::Module model = torch::jit::load(model_path, device); - model_ = std::make_shared(std::move(model)); - torch::NoGradGuard no_grad; - model_->eval(); - torch::jit::IValue o1 = model_->run_method("subsampling_rate"); - CHECK_EQ(o1.isInt(), true); - subsampling_rate_ = o1.toInt(); - torch::jit::IValue o2 = model_->run_method("right_context"); - CHECK_EQ(o2.isInt(), true); - right_context_ = o2.toInt(); - torch::jit::IValue o3 = model_->run_method("sos_symbol"); - CHECK_EQ(o3.isInt(), true); - sos_ = o3.toInt(); - torch::jit::IValue o4 = model_->run_method("eos_symbol"); - CHECK_EQ(o4.isInt(), true); - eos_ = o4.toInt(); - torch::jit::IValue o5 = model_->run_method("is_bidirectional_decoder"); - CHECK_EQ(o5.isBool(), true); - is_bidirectional_decoder_ = o5.toBool(); - - VLOG(1) << "Torch Model Info:"; - VLOG(1) << "\tsubsampling_rate " << subsampling_rate_; - VLOG(1) << "\tright context " << right_context_; - VLOG(1) << "\tsos " << sos_; - VLOG(1) << "\teos " << eos_; - VLOG(1) << "\tis bidirectional decoder " << is_bidirectional_decoder_; -} - -TorchAsrModel::TorchAsrModel(const TorchAsrModel& other) { - // 1. Init the model info - right_context_ = other.right_context_; - subsampling_rate_ = other.subsampling_rate_; - sos_ = other.sos_; - eos_ = other.eos_; - is_bidirectional_decoder_ = other.is_bidirectional_decoder_; - chunk_size_ = other.chunk_size_; - num_left_chunks_ = other.num_left_chunks_; - offset_ = other.offset_; - // 2. Model copy, just copy the model ptr since: - // PyTorch allows using multiple CPU threads during TorchScript model - // inference, please see https://pytorch.org/docs/stable/notes/cpu_ - // threading_torchscript_inference.html - model_ = other.model_; - - // NOTE(Binbin Zhang): - // inner states for forward are not copied here. -} - -std::shared_ptr TorchAsrModel::Copy() const { - auto asr_model = std::make_shared(*this); - // Reset the inner states for new decoding - asr_model->Reset(); - return asr_model; -} - -void TorchAsrModel::Reset() { - offset_ = 0; - att_cache_ = std::move(torch::zeros({0, 0, 0, 0})); - cnn_cache_ = std::move(torch::zeros({0, 0, 0, 0})); - encoder_outs_.clear(); - cached_feature_.clear(); -} - -void TorchAsrModel::ForwardEncoderFunc( - const std::vector>& chunk_feats, - std::vector>* out_prob) { - // 1. 
Prepare libtorch required data, splice cached_feature_ and chunk_feats - // The first dimension is for batchsize, which is 1. - int num_frames = cached_feature_.size() + chunk_feats.size(); - const int feature_dim = chunk_feats[0].size(); - torch::Tensor feats = - torch::zeros({1, num_frames, feature_dim}, torch::kFloat); - for (size_t i = 0; i < cached_feature_.size(); ++i) { - torch::Tensor row = - torch::from_blob(const_cast(cached_feature_[i].data()), - {feature_dim}, torch::kFloat) - .clone(); - feats[0][i] = std::move(row); - } - for (size_t i = 0; i < chunk_feats.size(); ++i) { - torch::Tensor row = - torch::from_blob(const_cast(chunk_feats[i].data()), - {feature_dim}, torch::kFloat) - .clone(); - feats[0][cached_feature_.size() + i] = std::move(row); - } - - // 2. Encoder chunk forward -#ifdef USE_GPU - feats = feats.to(at::kCUDA); - att_cache_ = att_cache_.to(at::kCUDA); - cnn_cache_ = cnn_cache_.to(at::kCUDA); -#endif - int required_cache_size = chunk_size_ * num_left_chunks_; - torch::NoGradGuard no_grad; - std::vector inputs = {feats, offset_, required_cache_size, - att_cache_, cnn_cache_}; - - // Refer interfaces in wenet/transformer/asr_model.py - auto outputs = - model_->get_method("forward_encoder_chunk")(inputs).toTuple()->elements(); - CHECK_EQ(outputs.size(), 3); -#ifdef USE_GPU - torch::Tensor chunk_out = outputs[0].toTensor().to(at::kCPU); - att_cache_ = outputs[1].toTensor().to(at::kCPU); - cnn_cache_ = outputs[2].toTensor().to(at::kCPU); -#else - torch::Tensor chunk_out = outputs[0].toTensor(); - att_cache_ = outputs[1].toTensor(); - cnn_cache_ = outputs[2].toTensor(); -#endif - offset_ += chunk_out.size(1); - - // The first dimension of returned value is for batchsize, which is 1 -#ifdef USE_GPU - chunk_out = chunk_out.to(at::kCUDA); - torch::Tensor ctc_log_probs = - model_->run_method("ctc_activation", chunk_out).toTensor(); - ctc_log_probs = ctc_log_probs.to(at::kCPU)[0]; - encoder_outs_.push_back(std::move(chunk_out.to(at::kCPU))); -#else - torch::Tensor ctc_log_probs = - model_->run_method("ctc_activation", chunk_out).toTensor()[0]; - encoder_outs_.push_back(std::move(chunk_out)); -#endif - - // Copy to output - int num_outputs = ctc_log_probs.size(0); - int output_dim = ctc_log_probs.size(1); - out_prob->resize(num_outputs); - for (int i = 0; i < num_outputs; i++) { - (*out_prob)[i].resize(output_dim); - memcpy((*out_prob)[i].data(), ctc_log_probs[i].data_ptr(), - sizeof(float) * output_dim); - } -} - -float TorchAsrModel::ComputeAttentionScore(const torch::Tensor& prob, - const std::vector& hyp, - int eos) { - float score = 0.0f; - auto accessor = prob.accessor(); - for (size_t j = 0; j < hyp.size(); ++j) { - score += accessor[j][hyp[j]]; - } - score += accessor[hyp.size()][eos]; - return score; -} - -void TorchAsrModel::AttentionRescoring( - const std::vector>& hyps, float reverse_weight, - std::vector* rescoring_score) { - CHECK(rescoring_score != nullptr); - int num_hyps = hyps.size(); - rescoring_score->resize(num_hyps, 0.0f); - - if (num_hyps == 0) { - return; - } - // No encoder output - if (encoder_outs_.size() == 0) { - return; - } - - torch::NoGradGuard no_grad; - // Step 1: Prepare input for libtorch - torch::Tensor hyps_length = torch::zeros({num_hyps}, torch::kLong); - int max_hyps_len = 0; - for (size_t i = 0; i < num_hyps; ++i) { - int length = hyps[i].size() + 1; - max_hyps_len = std::max(length, max_hyps_len); - hyps_length[i] = static_cast(length); - } - torch::Tensor hyps_tensor = - torch::zeros({num_hyps, max_hyps_len}, torch::kLong); - for 
(size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - hyps_tensor[i][0] = sos_; - for (size_t j = 0; j < hyp.size(); ++j) { - hyps_tensor[i][j + 1] = hyp[j]; - } - } - - // Step 2: Forward attention decoder by hyps and corresponding encoder_outs_ - torch::Tensor encoder_out = torch::cat(encoder_outs_, 1); -#ifdef USE_GPU - hyps_tensor = hyps_tensor.to(at::kCUDA); - hyps_length = hyps_length.to(at::kCUDA); - encoder_out = encoder_out.to(at::kCUDA); -#endif - auto outputs = model_ - ->run_method("forward_attention_decoder", hyps_tensor, - hyps_length, encoder_out, reverse_weight) - .toTuple() - ->elements(); -#ifdef USE_GPU - auto probs = outputs[0].toTensor().to(at::kCPU); - auto r_probs = outputs[1].toTensor().to(at::kCPU); -#else - auto probs = outputs[0].toTensor(); - auto r_probs = outputs[1].toTensor(); -#endif - CHECK_EQ(probs.size(0), num_hyps); - CHECK_EQ(probs.size(1), max_hyps_len); - - // Step 3: Compute rescoring score - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - float score = 0.0f; - // left-to-right decoder score - score = ComputeAttentionScore(probs[i], hyp, eos_); - // Optional: Used for right to left score - float r_score = 0.0f; - if (is_bidirectional_decoder_ && reverse_weight > 0) { - // right-to-left score - CHECK_EQ(r_probs.size(0), num_hyps); - CHECK_EQ(r_probs.size(1), max_hyps_len); - std::vector r_hyp(hyp.size()); - std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin()); - // right to left decoder score - r_score = ComputeAttentionScore(r_probs[i], r_hyp, eos_); - } - - // combined left-to-right and right-to-left score - (*rescoring_score)[i] = - score * (1 - reverse_weight) + r_score * reverse_weight; - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/torch_asr_model.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/torch_asr_model.h deleted file mode 100644 index a3cebe08798f1cad60ca4cd73c7b2488173b6114..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/decoder/torch_asr_model.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef DECODER_TORCH_ASR_MODEL_H_ -#define DECODER_TORCH_ASR_MODEL_H_ - -#include -#include -#include - -#include "torch/script.h" -#ifndef IOS -#include "torch/torch.h" -#endif - -#include "decoder/asr_model.h" -#include "utils/utils.h" - -namespace wenet { - -class TorchAsrModel : public AsrModel { - public: -#ifndef IOS - static void InitEngineThreads(int num_threads = 1); -#endif - - public: - using TorchModule = torch::jit::script::Module; - TorchAsrModel() = default; - TorchAsrModel(const TorchAsrModel& other); - void Read(const std::string& model_path); - std::shared_ptr torch_model() const { return model_; } - void Reset() override; - void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) override; - std::shared_ptr Copy() const override; - - protected: - void ForwardEncoderFunc(const std::vector>& chunk_feats, - std::vector>* ctc_prob) override; - - float ComputeAttentionScore(const torch::Tensor& prob, - const std::vector& hyp, int eos); - - private: - std::shared_ptr model_ = nullptr; - std::vector encoder_outs_; - // transformer/conformer attention cache - torch::Tensor att_cache_ = torch::zeros({0, 0, 0, 0}); - // conformer-only conv_module cache - torch::Tensor cnn_cache_ = torch::zeros({0, 0, 0, 0}); -}; - -} // namespace wenet - -#endif // DECODER_TORCH_ASR_MODEL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/frontend/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/frontend/CMakeLists.txt deleted file mode 100644 index 78872257e43bb9a6ffcedaae977bf0173817ae50..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/frontend/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -add_library(frontend STATIC - feature_pipeline.cc - fft.cc -) -target_link_libraries(frontend PUBLIC utils) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/frontend/fbank.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/frontend/fbank.h deleted file mode 100644 index 5a650dc035b8e244388cc1f2e0b9512654de7fda..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/frontend/fbank.h +++ /dev/null @@ -1,218 +0,0 @@ -// Copyright (c) 2017 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef FRONTEND_FBANK_H_ -#define FRONTEND_FBANK_H_ - -#include -#include -#include -#include -#include - -#include "frontend/fft.h" -#include "utils/log.h" - -namespace wenet { - -// This code is based on kaldi Fbank implementation, please see -// https://github.com/kaldi-asr/kaldi/blob/master/src/feat/feature-fbank.cc -class Fbank { - public: - Fbank(int num_bins, int sample_rate, int frame_length, int frame_shift) - : num_bins_(num_bins), - sample_rate_(sample_rate), - frame_length_(frame_length), - frame_shift_(frame_shift), - use_log_(true), - remove_dc_offset_(true), - generator_(0), - distribution_(0, 1.0), - dither_(0.0) { - fft_points_ = UpperPowerOfTwo(frame_length_); - // generate bit reversal table and trigonometric function table - const int fft_points_4 = fft_points_ / 4; - bitrev_.resize(fft_points_); - sintbl_.resize(fft_points_ + fft_points_4); - make_sintbl(fft_points_, sintbl_.data()); - make_bitrev(fft_points_, bitrev_.data()); - - int num_fft_bins = fft_points_ / 2; - float fft_bin_width = static_cast(sample_rate_) / fft_points_; - int low_freq = 20, high_freq = sample_rate_ / 2; - float mel_low_freq = MelScale(low_freq); - float mel_high_freq = MelScale(high_freq); - float mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1); - bins_.resize(num_bins_); - center_freqs_.resize(num_bins_); - for (int bin = 0; bin < num_bins; ++bin) { - float left_mel = mel_low_freq + bin * mel_freq_delta, - center_mel = mel_low_freq + (bin + 1) * mel_freq_delta, - right_mel = mel_low_freq + (bin + 2) * mel_freq_delta; - center_freqs_[bin] = InverseMelScale(center_mel); - std::vector this_bin(num_fft_bins); - int first_index = -1, last_index = -1; - for (int i = 0; i < num_fft_bins; ++i) { - float freq = (fft_bin_width * i); // Center frequency of this fft - // bin. 
- float mel = MelScale(freq); - if (mel > left_mel && mel < right_mel) { - float weight; - if (mel <= center_mel) - weight = (mel - left_mel) / (center_mel - left_mel); - else - weight = (right_mel - mel) / (right_mel - center_mel); - this_bin[i] = weight; - if (first_index == -1) first_index = i; - last_index = i; - } - } - CHECK(first_index != -1 && last_index >= first_index); - bins_[bin].first = first_index; - int size = last_index + 1 - first_index; - bins_[bin].second.resize(size); - for (int i = 0; i < size; ++i) { - bins_[bin].second[i] = this_bin[first_index + i]; - } - } - - // povey window - povey_window_.resize(frame_length_); - double a = M_2PI / (frame_length - 1); - for (int i = 0; i < frame_length; ++i) { - povey_window_[i] = pow(0.5 - 0.5 * cos(a * i), 0.85); - } - } - - void set_use_log(bool use_log) { use_log_ = use_log; } - - void set_remove_dc_offset(bool remove_dc_offset) { - remove_dc_offset_ = remove_dc_offset; - } - - void set_dither(float dither) { dither_ = dither; } - - int num_bins() const { return num_bins_; } - - static inline float InverseMelScale(float mel_freq) { - return 700.0f * (expf(mel_freq / 1127.0f) - 1.0f); - } - - static inline float MelScale(float freq) { - return 1127.0f * logf(1.0f + freq / 700.0f); - } - - static int UpperPowerOfTwo(int n) { - return static_cast(pow(2, ceil(log(n) / log(2)))); - } - - // pre emphasis - void PreEmphasis(float coeff, std::vector* data) const { - if (coeff == 0.0) return; - for (int i = data->size() - 1; i > 0; i--) - (*data)[i] -= coeff * (*data)[i - 1]; - (*data)[0] -= coeff * (*data)[0]; - } - - // Apply povey window on data in place - void Povey(std::vector* data) const { - CHECK_GE(data->size(), povey_window_.size()); - for (size_t i = 0; i < povey_window_.size(); ++i) { - (*data)[i] *= povey_window_[i]; - } - } - - // Compute fbank feat, return num frames - int Compute(const std::vector& wave, - std::vector>* feat) { - int num_samples = wave.size(); - if (num_samples < frame_length_) return 0; - int num_frames = 1 + ((num_samples - frame_length_) / frame_shift_); - feat->resize(num_frames); - std::vector fft_real(fft_points_, 0), fft_img(fft_points_, 0); - std::vector power(fft_points_ / 2); - for (int i = 0; i < num_frames; ++i) { - std::vector data(wave.data() + i * frame_shift_, - wave.data() + i * frame_shift_ + frame_length_); - // optional add noise - if (dither_ != 0.0) { - for (size_t j = 0; j < data.size(); ++j) - data[j] += dither_ * distribution_(generator_); - } - // optinal remove dc offset - if (remove_dc_offset_) { - float mean = 0.0; - for (size_t j = 0; j < data.size(); ++j) mean += data[j]; - mean /= data.size(); - for (size_t j = 0; j < data.size(); ++j) data[j] -= mean; - } - - PreEmphasis(0.97, &data); - Povey(&data); - // copy data to fft_real - memset(fft_img.data(), 0, sizeof(float) * fft_points_); - memset(fft_real.data() + frame_length_, 0, - sizeof(float) * (fft_points_ - frame_length_)); - memcpy(fft_real.data(), data.data(), sizeof(float) * frame_length_); - fft(bitrev_.data(), sintbl_.data(), fft_real.data(), fft_img.data(), - fft_points_); - // power - for (int j = 0; j < fft_points_ / 2; ++j) { - power[j] = fft_real[j] * fft_real[j] + fft_img[j] * fft_img[j]; - } - - (*feat)[i].resize(num_bins_); - // cepstral coefficients, triangle filter array - for (int j = 0; j < num_bins_; ++j) { - float mel_energy = 0.0; - int s = bins_[j].first; - for (size_t k = 0; k < bins_[j].second.size(); ++k) { - mel_energy += bins_[j].second[k] * power[s + k]; - } - // optional use log - if 
(use_log_) { - if (mel_energy < std::numeric_limits::epsilon()) - mel_energy = std::numeric_limits::epsilon(); - mel_energy = logf(mel_energy); - } - - (*feat)[i][j] = mel_energy; - } - } - return num_frames; - } - - private: - int num_bins_; - int sample_rate_; - int frame_length_, frame_shift_; - int fft_points_; - bool use_log_; - bool remove_dc_offset_; - std::vector center_freqs_; - std::vector>> bins_; - std::vector povey_window_; - std::default_random_engine generator_; - std::normal_distribution distribution_; - float dither_; - - // bit reversal table - std::vector bitrev_; - // trigonometric function table - std::vector sintbl_; -}; - -} // namespace wenet - -#endif // FRONTEND_FBANK_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/frontend/feature_pipeline.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/frontend/feature_pipeline.cc deleted file mode 100644 index ab450b15cd35ebd8101a3bcdec4f963a73bed10c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/frontend/feature_pipeline.cc +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2017 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "frontend/feature_pipeline.h" - -#include -#include - -namespace wenet { - -FeaturePipeline::FeaturePipeline(const FeaturePipelineConfig& config) - : config_(config), - feature_dim_(config.num_bins), - fbank_(config.num_bins, config.sample_rate, config.frame_length, - config.frame_shift), - num_frames_(0), - input_finished_(false) {} - -void FeaturePipeline::AcceptWaveform(const float* pcm, const int size) { - std::vector> feats; - std::vector waves; - waves.insert(waves.end(), remained_wav_.begin(), remained_wav_.end()); - waves.insert(waves.end(), pcm, pcm + size); - int num_frames = fbank_.Compute(waves, &feats); - feature_queue_.Push(std::move(feats)); - num_frames_ += num_frames; - - int left_samples = waves.size() - config_.frame_shift * num_frames; - remained_wav_.resize(left_samples); - std::copy(waves.begin() + config_.frame_shift * num_frames, waves.end(), - remained_wav_.begin()); - // We are still adding wave, notify input is not finished - finish_condition_.notify_one(); -} - -void FeaturePipeline::AcceptWaveform(const int16_t* pcm, const int size) { - auto* float_pcm = new float[size]; - for (size_t i = 0; i < size; i++) { - float_pcm[i] = static_cast(pcm[i]); - } - this->AcceptWaveform(float_pcm, size); - delete[] float_pcm; -} - -void FeaturePipeline::set_input_finished() { - CHECK(!input_finished_); - { - std::lock_guard lock(mutex_); - input_finished_ = true; - } - finish_condition_.notify_one(); -} - -bool FeaturePipeline::ReadOne(std::vector* feat) { - if (!feature_queue_.Empty()) { - *feat = std::move(feature_queue_.Pop()); - return true; - } else { - std::unique_lock lock(mutex_); - while (!input_finished_) { - // This will release the lock and wait for notify_one() - // from AcceptWaveform() or set_input_finished() - finish_condition_.wait(lock); - if (!feature_queue_.Empty()) { - *feat = std::move(feature_queue_.Pop()); - return true; - } - } - CHECK(input_finished_); - // Double check queue.empty, see issue#893 for detailed discussions. - if (!feature_queue_.Empty()) { - *feat = std::move(feature_queue_.Pop()); - return true; - } else { - return false; - } - } -} - -bool FeaturePipeline::Read(int num_frames, - std::vector>* feats) { - feats->clear(); - if (feature_queue_.Size() >= num_frames) { - *feats = std::move(feature_queue_.Pop(num_frames)); - return true; - } else { - std::unique_lock lock(mutex_); - while (!input_finished_) { - // This will release the lock and wait for notify_one() - // from AcceptWaveform() or set_input_finished() - finish_condition_.wait(lock); - if (feature_queue_.Size() >= num_frames) { - *feats = std::move(feature_queue_.Pop(num_frames)); - return true; - } - } - CHECK(input_finished_); - // Double check queue.empty, see issue#893 for detailed discussions. 
- if (feature_queue_.Size() >= num_frames) { - *feats = std::move(feature_queue_.Pop(num_frames)); - return true; - } else { - *feats = std::move(feature_queue_.Pop(feature_queue_.Size())); - return false; - } - } -} - -void FeaturePipeline::Reset() { - input_finished_ = false; - num_frames_ = 0; - remained_wav_.clear(); - feature_queue_.Clear(); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/frontend/feature_pipeline.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/frontend/feature_pipeline.h deleted file mode 100644 index 9918d6b573255795e0e665f0a9598c44be625c19..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/frontend/feature_pipeline.h +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) 2017 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef FRONTEND_FEATURE_PIPELINE_H_ -#define FRONTEND_FEATURE_PIPELINE_H_ - -#include -#include -#include -#include - -#include "frontend/fbank.h" -#include "utils/blocking_queue.h" -#include "utils/log.h" - -namespace wenet { - -struct FeaturePipelineConfig { - int num_bins; - int sample_rate; - int frame_length; - int frame_shift; - FeaturePipelineConfig(int num_bins, int sample_rate) - : num_bins(num_bins), // 80 dim fbank - sample_rate(sample_rate) { // 16k sample rate - frame_length = sample_rate / 1000 * 25; // frame length 25ms - frame_shift = sample_rate / 1000 * 10; // frame shift 10ms - } - - void Info() const { - LOG(INFO) << "feature pipeline config" - << " num_bins " << num_bins << " frame_length " << frame_length - << " frame_shift " << frame_shift; - } -}; - -// Typically, FeaturePipeline is used in two threads: one thread A calls -// AcceptWaveform() to add raw wav data and set_input_finished() to notice -// the end of input wav, another thread B (decoder thread) calls Read() to -// consume features.So a BlockingQueue is used to make this class thread safe. - -// The Read() is designed as a blocking method when there is no feature -// in feature_queue_ and the input is not finished. - -// See bin/decoder_main.cc, websocket/websocket_server.cc and -// decoder/torch_asr_decoder.cc for usage - -class FeaturePipeline { - public: - explicit FeaturePipeline(const FeaturePipelineConfig& config); - - // The feature extraction is done in AcceptWaveform(). - void AcceptWaveform(const float* pcm, const int size); - void AcceptWaveform(const int16_t* pcm, const int size); - - // Current extracted frames number. - int num_frames() const { return num_frames_; } - int feature_dim() const { return feature_dim_; } - const FeaturePipelineConfig& config() const { return config_; } - - // The caller should call this method when speech input is end. - // Never call AcceptWaveform() after calling set_input_finished() ! 
- void set_input_finished(); - bool input_finished() const { return input_finished_; } - - // Return False if input is finished and no feature could be read. - // Return True if a feature is read. - // This function is a blocking method. It will block the thread when - // there is no feature in feature_queue_ and the input is not finished. - bool ReadOne(std::vector* feat); - - // Read #num_frames frame features. - // Return False if less than #num_frames features are read and the - // input is finished. - // Return True if #num_frames features are read. - // This function is a blocking method when there is no feature - // in feature_queue_ and the input is not finished. - bool Read(int num_frames, std::vector>* feats); - - void Reset(); - bool IsLastFrame(int frame) const { - return input_finished_ && (frame == num_frames_ - 1); - } - - int NumQueuedFrames() const { return feature_queue_.Size(); } - - private: - const FeaturePipelineConfig& config_; - int feature_dim_; - Fbank fbank_; - - BlockingQueue> feature_queue_; - int num_frames_; - bool input_finished_; - - // The feature extraction is done in AcceptWaveform(). - // This waveform sample points are consumed by frame size. - // The residual waveform sample points after framing are - // kept to be used in next AcceptWaveform() calling. - std::vector remained_wav_; - - // Used to block the Read when there is no feature in feature_queue_ - // and the input is not finished. - mutable std::mutex mutex_; - std::condition_variable finish_condition_; -}; - -} // namespace wenet - -#endif // FRONTEND_FEATURE_PIPELINE_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/frontend/fft.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/frontend/fft.cc deleted file mode 100644 index 9e05f854e79ea733d0411045385e924c2670b7f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/frontend/fft.cc +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright (c) 2016 Network -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include -#include -#include - -#include "frontend/fft.h" - -namespace wenet { - -void make_sintbl(int n, float* sintbl) { - int i, n2, n4, n8; - float c, s, dc, ds, t; - - n2 = n / 2; - n4 = n / 4; - n8 = n / 8; - t = sin(M_PI / n); - dc = 2 * t * t; - ds = sqrt(dc * (2 - dc)); - t = 2 * dc; - c = sintbl[n4] = 1; - s = sintbl[0] = 0; - for (i = 1; i < n8; ++i) { - c -= dc; - dc += t * c; - s += ds; - ds -= t * s; - sintbl[i] = s; - sintbl[n4 - i] = c; - } - if (n8 != 0) sintbl[n8] = sqrt(0.5); - for (i = 0; i < n4; ++i) sintbl[n2 - i] = sintbl[i]; - for (i = 0; i < n2 + n4; ++i) sintbl[i + n2] = -sintbl[i]; -} - -void make_bitrev(int n, int* bitrev) { - int i, j, k, n2; - - n2 = n / 2; - i = j = 0; - for (;;) { - bitrev[i] = j; - if (++i >= n) break; - k = n2; - while (k <= j) { - j -= k; - k /= 2; - } - j += k; - } -} - -// bitrev: bit reversal table -// sintbl: trigonometric function table -// x:real part -// y:image part -// n: fft length -int fft(const int* bitrev, const float* sintbl, float* x, float* y, int n) { - int i, j, k, ik, h, d, k2, n4, inverse; - float t, s, c, dx, dy; - - /* preparation */ - if (n < 0) { - n = -n; - inverse = 1; /* inverse transform */ - } else { - inverse = 0; - } - n4 = n / 4; - if (n == 0) { - return 0; - } - - /* bit reversal */ - for (i = 0; i < n; ++i) { - j = bitrev[i]; - if (i < j) { - t = x[i]; - x[i] = x[j]; - x[j] = t; - t = y[i]; - y[i] = y[j]; - y[j] = t; - } - } - - /* transformation */ - for (k = 1; k < n; k = k2) { - h = 0; - k2 = k + k; - d = n / k2; - for (j = 0; j < k; ++j) { - c = sintbl[h + n4]; - if (inverse) - s = -sintbl[h]; - else - s = sintbl[h]; - for (i = j; i < n; i += k2) { - ik = i + k; - dx = s * y[ik] + c * x[ik]; - dy = c * y[ik] - s * x[ik]; - x[ik] = x[i] - dx; - x[i] += dx; - y[ik] = y[i] - dy; - y[i] += dy; - } - h += d; - } - } - if (inverse) { - /* divide by n in case of the inverse transformation */ - for (i = 0; i < n; ++i) { - x[i] /= n; - y[i] /= n; - } - } - return 0; /* finished successfully */ -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/frontend/fft.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/frontend/fft.h deleted file mode 100644 index 6b92e406c44b4768eaee6e734f55bb39cd9af28b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/frontend/fft.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2016 Network -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#ifndef FRONTEND_FFT_H_ -#define FRONTEND_FFT_H_ - -#ifndef M_PI -#define M_PI 3.1415926535897932384626433832795 -#endif -#ifndef M_2PI -#define M_2PI 6.283185307179586476925286766559005 -#endif - -namespace wenet { - -// Fast Fourier Transform - -void make_sintbl(int n, float* sintbl); - -void make_bitrev(int n, int* bitrev); - -int fft(const int* bitrev, const float* sintbl, float* x, float* y, int n); - -} // namespace wenet - -#endif // FRONTEND_FFT_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/frontend/wav.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/frontend/wav.h deleted file mode 100644 index 688a049a940ebbdc83f24e59134fff22b7b09bfd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/frontend/wav.h +++ /dev/null @@ -1,241 +0,0 @@ -// Copyright (c) 2016 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef FRONTEND_WAV_H_ -#define FRONTEND_WAV_H_ - -#include -#include -#include -#include -#include - -#include - -#include "utils/log.h" - -namespace wenet { - -struct WavHeader { - char riff[4] = {'R', 'I', 'F', 'F'}; - unsigned int size = 0; - char wav[4] = {'W', 'A', 'V', 'E'}; - char fmt[4] = {'f', 'm', 't', ' '}; - unsigned int fmt_size = 16; - uint16_t format = 1; - uint16_t channels = 0; - unsigned int sample_rate = 0; - unsigned int bytes_per_second = 0; - uint16_t block_size = 0; - uint16_t bit = 0; - char data[4] = {'d', 'a', 't', 'a'}; - unsigned int data_size = 0; - - WavHeader() {} - - WavHeader(int num_samples, int num_channel, int sample_rate, - int bits_per_sample) { - data_size = num_samples * num_channel * (bits_per_sample / 8); - size = sizeof(WavHeader) - 8 + data_size; - channels = num_channel; - this->sample_rate = sample_rate; - bytes_per_second = sample_rate * num_channel * (bits_per_sample / 8); - block_size = num_channel * (bits_per_sample / 8); - bit = bits_per_sample; - } -}; - -class WavReader { - public: - WavReader() : data_(nullptr) {} - explicit WavReader(const std::string& filename) { Open(filename); } - - bool Open(const std::string& filename) { - FILE* fp = fopen(filename.c_str(), "rb"); - if (NULL == fp) { - LOG(WARNING) << "Error in read " << filename; - return false; - } - - WavHeader header; - fread(&header, 1, sizeof(header), fp); - if (header.fmt_size < 16) { - fprintf(stderr, - "WaveData: expect PCM format data " - "to have fmt chunk of at least size 16.\n"); - return false; - } else if (header.fmt_size > 16) { - int offset = 44 - 8 + header.fmt_size - 16; - fseek(fp, offset, SEEK_SET); - fread(header.data, 8, sizeof(char), fp); - } - // check "RIFF" "WAVE" "fmt " "data" - - // Skip any sub-chunks between "fmt" and "data". Usually there will - // be a single "fact" sub chunk, but on Windows there can also be a - // "list" sub chunk. 
- while (0 != strncmp(header.data, "data", 4)) { - // We will just ignore the data in these chunks. - fseek(fp, header.data_size, SEEK_CUR); - // read next sub chunk - fread(header.data, 8, sizeof(char), fp); - } - - num_channel_ = header.channels; - sample_rate_ = header.sample_rate; - bits_per_sample_ = header.bit; - int num_data = header.data_size / (bits_per_sample_ / 8); - data_ = new float[num_data]; - num_samples_ = num_data / num_channel_; - - for (int i = 0; i < num_data; ++i) { - switch (bits_per_sample_) { - case 8: { - char sample; - fread(&sample, 1, sizeof(char), fp); - data_[i] = static_cast(sample); - break; - } - case 16: { - int16_t sample; - fread(&sample, 1, sizeof(int16_t), fp); - data_[i] = static_cast(sample); - break; - } - case 32: { - int sample; - fread(&sample, 1, sizeof(int), fp); - data_[i] = static_cast(sample); - break; - } - default: - fprintf(stderr, "unsupported quantization bits"); - exit(1); - } - } - fclose(fp); - return true; - } - - int num_channel() const { return num_channel_; } - int sample_rate() const { return sample_rate_; } - int bits_per_sample() const { return bits_per_sample_; } - int num_samples() const { return num_samples_; } - - ~WavReader() { - delete[] data_; - } - - const float* data() const { return data_; } - - private: - int num_channel_; - int sample_rate_; - int bits_per_sample_; - int num_samples_; // sample points per channel - float* data_; -}; - -class WavWriter { - public: - WavWriter(const float* data, int num_samples, int num_channel, - int sample_rate, int bits_per_sample) - : data_(data), - num_samples_(num_samples), - num_channel_(num_channel), - sample_rate_(sample_rate), - bits_per_sample_(bits_per_sample) {} - - void Write(const std::string& filename) { - FILE* fp = fopen(filename.c_str(), "wb"); - WavHeader header(num_samples_, num_channel_, sample_rate_, - bits_per_sample_); - fwrite(&header, 1, sizeof(header), fp); - - for (int i = 0; i < num_samples_; ++i) { - for (int j = 0; j < num_channel_; ++j) { - switch (bits_per_sample_) { - case 8: { - char sample = static_cast(data_[i * num_channel_ + j]); - fwrite(&sample, 1, sizeof(sample), fp); - break; - } - case 16: { - int16_t sample = static_cast(data_[i * num_channel_ + j]); - fwrite(&sample, 1, sizeof(sample), fp); - break; - } - case 32: { - int sample = static_cast(data_[i * num_channel_ + j]); - fwrite(&sample, 1, sizeof(sample), fp); - break; - } - } - } - } - fclose(fp); - } - - private: - const float* data_; - int num_samples_; // total float points in data_ - int num_channel_; - int sample_rate_; - int bits_per_sample_; -}; - -class StreamWavWriter { - public: - StreamWavWriter(int num_channel, int sample_rate, int bits_per_sample) - : num_channel_(num_channel), - sample_rate_(sample_rate), - bits_per_sample_(bits_per_sample), - total_num_samples_(0) {} - - StreamWavWriter(const std::string& filename, int num_channel, - int sample_rate, int bits_per_sample) - : StreamWavWriter(num_channel, sample_rate, bits_per_sample) { - Open(filename); - } - - void Open(const std::string& filename) { - fp_ = fopen(filename.c_str(), "wb"); - fseek(fp_, sizeof(WavHeader), SEEK_SET); - } - - void Write(const int16_t* sample_data, size_t num_samples) { - fwrite(sample_data, sizeof(int16_t), num_samples, fp_); - total_num_samples_ += num_samples; - } - - void Close() { - WavHeader header(total_num_samples_, num_channel_, sample_rate_, - bits_per_sample_); - fseek(fp_, 0L, SEEK_SET); - fwrite(&header, 1, sizeof(header), fp_); - fclose(fp_); - } - - private: - FILE* fp_; - 
int num_channel_; - int sample_rate_; - int bits_per_sample_; - size_t total_num_samples_; -}; - -} // namespace wenet - -#endif // FRONTEND_WAV_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/CMakeLists.txt deleted file mode 100644 index b072309e44b90dcee44ea31e9bcbc1741e73f151..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/CMakeLists.txt +++ /dev/null @@ -1,54 +0,0 @@ -cmake_minimum_required(VERSION 3.10 FATAL_ERROR) - -project(kaldi) - -# include_directories() is called in the root CMakeLists.txt - -add_library(kaldi-util - base/kaldi-error.cc - base/kaldi-math.cc - util/kaldi-io.cc - util/parse-options.cc - util/simple-io-funcs.cc - util/text-utils.cc -) -target_link_libraries(kaldi-util PUBLIC utils) - -add_library(kaldi-decoder - lat/determinize-lattice-pruned.cc - lat/lattice-functions.cc - decoder/lattice-faster-decoder.cc - decoder/lattice-faster-online-decoder.cc -) -target_link_libraries(kaldi-decoder PUBLIC kaldi-util) - -if(GRAPH_TOOLS) - # Arpa binary - add_executable(arpa2fst - lm/arpa-file-parser.cc - lm/arpa-lm-compiler.cc - lmbin/arpa2fst.cc - ) - target_link_libraries(arpa2fst PUBLIC kaldi-util) - - # FST tools binary - set(FST_BINS - fstaddselfloops - fstdeterminizestar - fstisstochastic - fstminimizeencoded - fsttablecompose - ) - - if(NOT MSVC) - # dl is for dynamic linking, otherwise there is a linking error on linux - link_libraries(dl) - endif() - foreach(name IN LISTS FST_BINS) - add_executable(${name} - fstbin/${name}.cc - fstext/kaldi-fst-io.cc - ) - target_link_libraries(${name} PUBLIC kaldi-util) - endforeach() -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/README.md b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/README.md deleted file mode 100644 index 4eb9c9173b747686f00b658afc5e1e0dfdc17e68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/README.md +++ /dev/null @@ -1,21 +0,0 @@ -We use Kaldi decoder to implement TLG based language model integration, -so we copied related files to this directory. -The main changes are: - -1. To minimize the change, we use the same directories tree as Kaldi. - -2. We replace Kaldi log system with glog in the following way. - -``` c++ -#define KALDI_WARN \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_WARNING).stream() -#define KALDI_ERR \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_ERROR).stream() -#define KALDI_INFO \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_INFO).stream() -#define KALDI_VLOG(v) VLOG(v) - -#define KALDI_ASSERT(condition) CHECK(condition) -``` - -3. We lint all the files to satisfy the lint in WeNet. 
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/io-funcs-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/io-funcs-inl.h deleted file mode 100644 index 9397400833676b323492321183c989cec2f41c3f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/io-funcs-inl.h +++ /dev/null @@ -1,329 +0,0 @@ -// base/io-funcs-inl.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University; -// Jan Silovsky; Yanmin Qian; -// Johns Hopkins University (Author: Daniel Povey) -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_IO_FUNCS_INL_H_ -#define KALDI_BASE_IO_FUNCS_INL_H_ 1 - -// Do not include this file directly. It is included by base/io-funcs.h - -#include -#include -#include - -namespace kaldi { - -// Template that covers integers. -template -void WriteBasicType(std::ostream &os, bool binary, T t) { - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - char len_c = (std::numeric_limits::is_signed ? 1 : -1) * - static_cast(sizeof(t)); - os.put(len_c); - os.write(reinterpret_cast(&t), sizeof(t)); - } else { - if (sizeof(t) == 1) - os << static_cast(t) << " "; - else - os << t << " "; - } - if (os.fail()) { - KALDI_ERR << "Write failure in WriteBasicType."; - } -} - -// Template that covers integers. -template -inline void ReadBasicType(std::istream &is, bool binary, T *t) { - KALDI_PARANOID_ASSERT(t != NULL); - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - int len_c_in = is.get(); - if (len_c_in == -1) - KALDI_ERR << "ReadBasicType: encountered end of stream."; - char len_c = static_cast(len_c_in), - len_c_expected = (std::numeric_limits::is_signed ? 1 : -1) * - static_cast(sizeof(*t)); - if (len_c != len_c_expected) { - KALDI_ERR << "ReadBasicType: did not get expected integer type, " - << static_cast(len_c) << " vs. " - << static_cast(len_c_expected) - << ". You can change this code to successfully" - << " read it later, if needed."; - // insert code here to read "wrong" type. Might have a switch statement. - } - is.read(reinterpret_cast(t), sizeof(*t)); - } else { - if (sizeof(*t) == 1) { - int16 i; - is >> i; - *t = i; - } else { - is >> *t; - } - } - if (is.fail()) { - KALDI_ERR << "Read failure in ReadBasicType, file position is " - << is.tellg() << ", next char is " << is.peek(); - } -} - -// Template that covers integers. -template -inline void WriteIntegerPairVector(std::ostream &os, bool binary, - const std::vector > &v) { - // Compile time assertion that this is not called with a wrong type. 
- KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - char sz = sizeof(T); // this is currently just a check. - os.write(&sz, 1); - int32 vecsz = static_cast(v.size()); - KALDI_ASSERT((size_t)vecsz == v.size()); - os.write(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (vecsz != 0) { - os.write(reinterpret_cast(&(v[0])), sizeof(T) * vecsz * 2); - } - } else { - // focus here is on prettiness of text form rather than - // efficiency of reading-in. - // reading-in is dominated by low-level operations anyway: - // for efficiency use binary. - os << "[ "; - typename std::vector >::const_iterator iter = v.begin(), - end = v.end(); - for (; iter != end; ++iter) { - if (sizeof(T) == 1) - os << static_cast(iter->first) << ',' - << static_cast(iter->second) << ' '; - else - os << iter->first << ',' << iter->second << ' '; - } - os << "]\n"; - } - if (os.fail()) { - KALDI_ERR << "Write failure in WriteIntegerPairVector."; - } -} - -// Template that covers integers. -template -inline void ReadIntegerPairVector(std::istream &is, bool binary, - std::vector > *v) { - KALDI_ASSERT_IS_INTEGER_TYPE(T); - KALDI_ASSERT(v != NULL); - if (binary) { - int sz = is.peek(); - if (sz == sizeof(T)) { - is.get(); - } else { // this is currently just a check. - KALDI_ERR << "ReadIntegerPairVector: expected to see type of size " - << sizeof(T) << ", saw instead " << sz << ", at file position " - << is.tellg(); - } - int32 vecsz; - is.read(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (is.fail() || vecsz < 0) goto bad; - v->resize(vecsz); - if (vecsz > 0) { - is.read(reinterpret_cast(&((*v)[0])), sizeof(T) * vecsz * 2); - } - } else { - std::vector > tmp_v; // use temporary so v doesn't use - // extra memory due to resizing. - is >> std::ws; - if (is.peek() != static_cast('[')) { - KALDI_ERR << "ReadIntegerPairVector: expected to see [, saw " << is.peek() - << ", at file position " << is.tellg(); - } - is.get(); // consume the '['. - is >> std::ws; // consume whitespace. - while (is.peek() != static_cast(']')) { - if (sizeof(T) == 1) { // read/write chars as numbers. - int16 next_t1, next_t2; - is >> next_t1; - if (is.fail()) goto bad; - if (is.peek() != static_cast(',')) - KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " - << is.peek() << ", at file position " << is.tellg(); - is.get(); // consume the ','. - is >> next_t2 >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back(std::make_pair((T)next_t1, (T)next_t2)); - } else { - T next_t1, next_t2; - is >> next_t1; - if (is.fail()) goto bad; - if (is.peek() != static_cast(',')) - KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " - << is.peek() << ", at file position " << is.tellg(); - is.get(); // consume the ','. - is >> next_t2 >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back(std::pair(next_t1, next_t2)); - } - } - is.get(); // get the final ']'. - *v = tmp_v; // could use std::swap to use less temporary memory, but this - // uses less permanent memory. - } - if (!is.fail()) return; -bad: - KALDI_ERR << "ReadIntegerPairVector: read failure at file position " - << is.tellg(); -} - -template -inline void WriteIntegerVector(std::ostream &os, bool binary, - const std::vector &v) { - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - char sz = sizeof(T); // this is currently just a check. 
- os.write(&sz, 1); - int32 vecsz = static_cast(v.size()); - KALDI_ASSERT((size_t)vecsz == v.size()); - os.write(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (vecsz != 0) { - os.write(reinterpret_cast(&(v[0])), sizeof(T) * vecsz); - } - } else { - // focus here is on prettiness of text form rather than - // efficiency of reading-in. - // reading-in is dominated by low-level operations anyway: - // for efficiency use binary. - os << "[ "; - typename std::vector::const_iterator iter = v.begin(), end = v.end(); - for (; iter != end; ++iter) { - if (sizeof(T) == 1) - os << static_cast(*iter) << " "; - else - os << *iter << " "; - } - os << "]\n"; - } - if (os.fail()) { - KALDI_ERR << "Write failure in WriteIntegerVector."; - } -} - -template -inline void ReadIntegerVector(std::istream &is, bool binary, - std::vector *v) { - KALDI_ASSERT_IS_INTEGER_TYPE(T); - KALDI_ASSERT(v != NULL); - if (binary) { - int sz = is.peek(); - if (sz == sizeof(T)) { - is.get(); - } else { // this is currently just a check. - KALDI_ERR << "ReadIntegerVector: expected to see type of size " - << sizeof(T) << ", saw instead " << sz << ", at file position " - << is.tellg(); - } - int32 vecsz; - is.read(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (is.fail() || vecsz < 0) goto bad; - v->resize(vecsz); - if (vecsz > 0) { - is.read(reinterpret_cast(&((*v)[0])), sizeof(T) * vecsz); - } - } else { - std::vector tmp_v; // use temporary so v doesn't use extra memory - // due to resizing. - is >> std::ws; - if (is.peek() != static_cast('[')) { - KALDI_ERR << "ReadIntegerVector: expected to see [, saw " << is.peek() - << ", at file position " << is.tellg(); - } - is.get(); // consume the '['. - is >> std::ws; // consume whitespace. - while (is.peek() != static_cast(']')) { - if (sizeof(T) == 1) { // read/write chars as numbers. - int16 next_t; - is >> next_t >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back((T)next_t); - } else { - T next_t; - is >> next_t >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back(next_t); - } - } - is.get(); // get the final ']'. - *v = tmp_v; // could use std::swap to use less temporary memory, but this - // uses less permanent memory. - } - if (!is.fail()) return; -bad: - KALDI_ERR << "ReadIntegerVector: read failure at file position " - << is.tellg(); -} - -// Initialize an opened stream for writing by writing an optional binary -// header and modifying the floating-point precision. -inline void InitKaldiOutputStream(std::ostream &os, bool binary) { - // This does not throw exceptions (does not check for errors). - if (binary) { - os.put('\0'); - os.put('B'); - } - // Note, in non-binary mode we may at some point want to mess with - // the precision a bit. - // 7 is a bit more than the precision of float.. - if (os.precision() < 7) os.precision(7); -} - -/// Initialize an opened stream for reading by detecting the binary header and -// setting the "binary" value appropriately. -inline bool InitKaldiInputStream(std::istream &is, bool *binary) { - // Sets the 'binary' variable. - // Throws exception in the very unusual situation that stream - // starts with '\0' but not then 'B'. - - if (is.peek() == '\0') { // seems to be binary - is.get(); - if (is.peek() != 'B') { - return false; - } - is.get(); - *binary = true; - return true; - } else { - *binary = false; - return true; - } -} - -} // end namespace kaldi. 
- -#endif // KALDI_BASE_IO_FUNCS_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/io-funcs.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/io-funcs.cc deleted file mode 100644 index bd6c350780d1096ff8c452fd00864aa07a30ac65..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/io-funcs.cc +++ /dev/null @@ -1,215 +0,0 @@ -// base/io-funcs.cc - -// Copyright 2009-2011 Microsoft Corporation; Saarland University - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/io-funcs.h" -#include "base/kaldi-math.h" - -namespace kaldi { - -template <> -void WriteBasicType(std::ostream &os, bool binary, bool b) { - os << (b ? "T" : "F"); - if (!binary) os << " "; - if (os.fail()) KALDI_ERR << "Write failure in WriteBasicType"; -} - -template <> -void ReadBasicType(std::istream &is, bool binary, bool *b) { - KALDI_PARANOID_ASSERT(b != NULL); - if (!binary) is >> std::ws; // eat up whitespace. - char c = is.peek(); - if (c == 'T') { - *b = true; - is.get(); - } else if (c == 'F') { - *b = false; - is.get(); - } else { - KALDI_ERR << "Read failure in ReadBasicType, file position is " - << is.tellg() << ", next char is " << CharToString(c); - } -} - -template <> -void WriteBasicType(std::ostream &os, bool binary, float f) { - if (binary) { - char c = sizeof(f); - os.put(c); - os.write(reinterpret_cast(&f), sizeof(f)); - } else { - os << f << " "; - } -} - -template <> -void WriteBasicType(std::ostream &os, bool binary, double f) { - if (binary) { - char c = sizeof(f); - os.put(c); - os.write(reinterpret_cast(&f), sizeof(f)); - } else { - os << f << " "; - } -} - -template <> -void ReadBasicType(std::istream &is, bool binary, float *f) { - KALDI_PARANOID_ASSERT(f != NULL); - if (binary) { - double d; - int c = is.peek(); - if (c == sizeof(*f)) { - is.get(); - is.read(reinterpret_cast(f), sizeof(*f)); - } else if (c == sizeof(d)) { - ReadBasicType(is, binary, &d); - *f = d; - } else { - KALDI_ERR << "ReadBasicType: expected float, saw " << is.peek() - << ", at file position " << is.tellg(); - } - } else { - is >> *f; - } - if (is.fail()) { - KALDI_ERR << "ReadBasicType: failed to read, at file position " - << is.tellg(); - } -} - -template <> -void ReadBasicType(std::istream &is, bool binary, double *d) { - KALDI_PARANOID_ASSERT(d != NULL); - if (binary) { - float f; - int c = is.peek(); - if (c == sizeof(*d)) { - is.get(); - is.read(reinterpret_cast(d), sizeof(*d)); - } else if (c == sizeof(f)) { - ReadBasicType(is, binary, &f); - *d = f; - } else { - KALDI_ERR << "ReadBasicType: expected float, saw " << is.peek() - << ", at file position " << is.tellg(); - } - } else { - is >> *d; - } - if (is.fail()) { - KALDI_ERR << "ReadBasicType: failed to read, at 
file position " - << is.tellg(); - } -} - -void CheckToken(const char *token) { - if (*token == '\0') KALDI_ERR << "Token is empty (not a valid token)"; - const char *orig_token = token; - while (*token != '\0') { - if (::isspace(*token)) - KALDI_ERR << "Token is not a valid token (contains space): '" - << orig_token << "'"; - token++; - } -} - -void WriteToken(std::ostream &os, bool binary, const char *token) { - // binary mode is ignored; - // we use space as termination character in either case. - KALDI_ASSERT(token != NULL); - CheckToken(token); // make sure it's valid (can be read back) - os << token << " "; - if (os.fail()) { - KALDI_ERR << "Write failure in WriteToken."; - } -} - -int Peek(std::istream &is, bool binary) { - if (!binary) is >> std::ws; // eat up whitespace. - return is.peek(); -} - -void WriteToken(std::ostream &os, bool binary, const std::string &token) { - WriteToken(os, binary, token.c_str()); -} - -void ReadToken(std::istream &is, bool binary, std::string *str) { - KALDI_ASSERT(str != NULL); - if (!binary) is >> std::ws; // consume whitespace. - is >> *str; - if (is.fail()) { - KALDI_ERR << "ReadToken, failed to read token at file position " - << is.tellg(); - } - if (!isspace(is.peek())) { - KALDI_ERR << "ReadToken, expected space after token, saw instead " - << CharToString(static_cast(is.peek())) - << ", at file position " << is.tellg(); - } - is.get(); // consume the space. -} - -int PeekToken(std::istream &is, bool binary) { - if (!binary) is >> std::ws; // consume whitespace. - bool read_bracket; - if (static_cast(is.peek()) == '<') { - read_bracket = true; - is.get(); - } else { - read_bracket = false; - } - int ans = is.peek(); - if (read_bracket) { - if (!is.unget()) { - // Clear the bad bit. This code can be (and is in fact) reached, since the - // C++ standard does not guarantee that a call to unget() must succeed. - is.clear(); - } - } - return ans; -} - -void ExpectToken(std::istream &is, bool binary, const char *token) { - int pos_at_start = is.tellg(); - KALDI_ASSERT(token != NULL); - CheckToken(token); // make sure it's valid (can be read back) - if (!binary) is >> std::ws; // consume whitespace. - std::string str; - is >> str; - is.get(); // consume the space. - if (is.fail()) { - KALDI_ERR << "Failed to read token [started at file position " - << pos_at_start << "], expected " << token; - } - // The second half of the '&&' expression below is so that if we're expecting - // "", we will accept "Foo>" instead. This is so that the model-reading - // code will tolerate errors in PeekToken where is.unget() failed; search for - // is.clear() in PeekToken() for an explanation. 
- if (strcmp(str.c_str(), token) != 0 && - !(token[0] == '<' && strcmp(str.c_str(), token + 1) == 0)) { - KALDI_ERR << "Expected token \"" << token << "\", got instead \"" << str - << "\"."; - } -} - -void ExpectToken(std::istream &is, bool binary, const std::string &token) { - ExpectToken(is, binary, token.c_str()); -} - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/io-funcs.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/io-funcs.h deleted file mode 100644 index 06ad1e3d2d8dc8385886a7c6653f620642c7c05a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/io-funcs.h +++ /dev/null @@ -1,246 +0,0 @@ -// base/io-funcs.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University; -// Jan Silovsky; Yanmin Qian -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_IO_FUNCS_H_ -#define KALDI_BASE_IO_FUNCS_H_ - -// This header only contains some relatively low-level I/O functions. -// The full Kaldi I/O declarations are in ../util/kaldi-io.h -// and ../util/kaldi-table.h -// They were put in util/ in order to avoid making the Matrix library -// dependent on them. - -#include -#include -#include -#include - -#include "base/io-funcs-inl.h" -#include "base/kaldi-common.h" - -namespace kaldi { - -/* - This comment describes the Kaldi approach to I/O. All objects can be written - and read in two modes: binary and text. In addition we want to make the I/O - work if we redefine the typedef "BaseFloat" between floats and doubles. - We also want to have control over whitespace in text mode without affecting - the meaning of the file, for pretty-printing purposes. - - Errors are handled by throwing a KaldiFatalError exception. - - For integer and floating-point types (and boolean values): - - WriteBasicType(std::ostream &, bool binary, const T&); - ReadBasicType(std::istream &, bool binary, T*); - - and we expect these functions to be defined in such a way that they work when - the type T changes between float and double, so you can read float into double - and vice versa]. Note that for efficiency and space-saving reasons, the - Vector and Matrix classes do not use these functions [but they preserve the - type interchangeability in their own way] - - For a class (or struct) C: - class C { - .. - Write(std::ostream &, bool binary, [possibly extra optional args for - specific classes]) const; Read(std::istream &, bool binary, [possibly extra - optional args for specific classes]); - .. - } - NOTE: The only actual optional args we used are the "add" arguments in - Vector/Matrix classes, which specify whether we should sum the data already - in the class with the data being read. 
- - For types which are typedef's involving stl classes, I/O is as follows: - typedef std::vector > MyTypedefName; - - The user should define something like: - - WriteMyTypedefName(std::ostream &, bool binary, const MyTypedefName &t); - ReadMyTypedefName(std::ostream &, bool binary, MyTypedefName *t); - - The user would have to write these functions. - - For a type std::vector: - - void WriteIntegerVector(std::ostream &os, bool binary, const std::vector - &v); void ReadIntegerVector(std::istream &is, bool binary, std::vector *v); - - For other types, e.g. vectors of pairs, the user should create a routine of - the type WriteMyTypedefName. This is to avoid introducing confusing templated - functions; we could easily create templated functions to handle most of these - cases but they would have to share the same name. - - It also often happens that the user needs to write/read special tokens as part - of a file. These might be class headers, or separators/identifiers in the - class. We provide special functions for manipulating these. These special - tokens must be nonempty and must not contain any whitespace. - - void WriteToken(std::ostream &os, bool binary, const char*); - void WriteToken(std::ostream &os, bool binary, const std::string & token); - int Peek(std::istream &is, bool binary); - void ReadToken(std::istream &is, bool binary, std::string *str); - void PeekToken(std::istream &is, bool binary, std::string *str); - - WriteToken writes the token and one space (whether in binary or text mode). - - Peek returns the first character of the next token, by consuming whitespace - (in text mode) and then returning the peek() character. It returns -1 at EOF; - it doesn't throw. It's useful if a class can have various forms based on - typedefs and virtual classes, and wants to know which version to read. - - ReadToken allows the caller to obtain the next token. PeekToken works just - like ReadToken, but seeks back to the beginning of the token. A subsequent - call to ReadToken will read the same token again. This is useful when - different object types are written to the same file; using PeekToken one can - decide which of the objects to read. - - There is currently no special functionality for writing/reading strings (where - the strings contain data rather than "special tokens" that are whitespace-free - and nonempty). This is because Kaldi is structured in such a way that strings - don't appear, except as OpenFst symbol table entries (and these have their own - format). - - - NOTE: you should not call ReadIntegerType and WriteIntegerType with types, - such as int and size_t, that are machine-independent -- at least not - if you want your file formats to port between machines. Use int32 and - int64 where necessary. There is no way to detect this using compile-time - assertions because C++ only keeps track of the internal representation of - the type. -*/ - -/// \addtogroup io_funcs_basic -/// @{ - -/// WriteBasicType is the name of the write function for bool, integer types, -/// and floating-point types. They all throw on error. -template -void WriteBasicType(std::ostream &os, bool binary, T t); - -/// ReadBasicType is the name of the read function for bool, integer types, -/// and floating-point types. They all throw on error. -template -void ReadBasicType(std::istream &is, bool binary, T *t); - -// Declare specialization for bool. 
-template <> -void WriteBasicType(std::ostream &os, bool binary, bool b); - -template <> -void ReadBasicType(std::istream &is, bool binary, bool *b); - -// Declare specializations for float and double. -template <> -void WriteBasicType(std::ostream &os, bool binary, float f); - -template <> -void WriteBasicType(std::ostream &os, bool binary, double f); - -template <> -void ReadBasicType(std::istream &is, bool binary, float *f); - -template <> -void ReadBasicType(std::istream &is, bool binary, double *f); - -// Define ReadBasicType that accepts an "add" parameter to add to -// the destination. Caution: if used in Read functions, be careful -// to initialize the parameters concerned to zero in the default -// constructor. -template -inline void ReadBasicType(std::istream &is, bool binary, T *t, bool add) { - if (!add) { - ReadBasicType(is, binary, t); - } else { - T tmp = T(0); - ReadBasicType(is, binary, &tmp); - *t += tmp; - } -} - -/// Function for writing STL vectors of integer types. -template -inline void WriteIntegerVector(std::ostream &os, bool binary, - const std::vector &v); - -/// Function for reading STL vector of integer types. -template -inline void ReadIntegerVector(std::istream &is, bool binary, std::vector *v); - -/// Function for writing STL vectors of pairs of integer types. -template -inline void WriteIntegerPairVector(std::ostream &os, bool binary, - const std::vector > &v); - -/// Function for reading STL vector of pairs of integer types. -template -inline void ReadIntegerPairVector(std::istream &is, bool binary, - std::vector > *v); - -/// The WriteToken functions are for writing nonempty sequences of non-space -/// characters. They are not for general strings. -void WriteToken(std::ostream &os, bool binary, const char *token); -void WriteToken(std::ostream &os, bool binary, const std::string &token); - -/// Peek consumes whitespace (if binary == false) and then returns the peek() -/// value of the stream. -int Peek(std::istream &is, bool binary); - -/// ReadToken gets the next token and puts it in str (exception on failure). If -/// PeekToken() had been previously called, it is possible that the stream had -/// failed to unget the starting '<' character. In this case ReadToken() returns -/// the token string without the leading '<'. You must be prepared to handle -/// this case. ExpectToken() handles this internally, and is not affected. -void ReadToken(std::istream &is, bool binary, std::string *token); - -/// PeekToken will return the first character of the next token, or -1 if end of -/// file. It's the same as Peek(), except if the first character is '<' it will -/// skip over it and will return the next character. It will attempt to unget -/// the '<' so the stream is where it was before you did PeekToken(), however, -/// this is not guaranteed (see ReadToken()). -int PeekToken(std::istream &is, bool binary); - -/// ExpectToken tries to read in the given token, and throws an exception -/// on failure. -void ExpectToken(std::istream &is, bool binary, const char *token); -void ExpectToken(std::istream &is, bool binary, const std::string &token); - -/// ExpectPretty attempts to read the text in "token", but only in non-binary -/// mode. Throws exception on failure. It expects an exact match except that -/// arbitrary whitespace matches arbitrary whitespace. 
-void ExpectPretty(std::istream &is, bool binary, const char *token); -void ExpectPretty(std::istream &is, bool binary, const std::string &token); - -/// @} end "addtogroup io_funcs_basic" - -/// InitKaldiOutputStream initializes an opened stream for writing by writing an -/// optional binary header and modifying the floating-point precision; it will -/// typically not be called by users directly. -inline void InitKaldiOutputStream(std::ostream &os, bool binary); - -/// InitKaldiInputStream initializes an opened stream for reading by detecting -/// the binary header and setting the "binary" value appropriately; -/// It will typically not be called by users directly. -inline bool InitKaldiInputStream(std::istream &is, bool *binary); - -} // end namespace kaldi. -#endif // KALDI_BASE_IO_FUNCS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/kaldi-common.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/kaldi-common.h deleted file mode 100644 index eee5f34d7234e7c029e6bb59584d3ee65ff5a875..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/kaldi-common.h +++ /dev/null @@ -1,41 +0,0 @@ -// base/kaldi-common.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_KALDI_COMMON_H_ -#define KALDI_BASE_KALDI_COMMON_H_ 1 - -#include -#include -#include // C string stuff like strcpy -#include -#include -#include -#include -#include -#include -#include - -#include "base/kaldi-utils.h" -#include "base/kaldi-error.h" -#include "base/kaldi-types.h" -// #include "base/io-funcs.h" -#include "base/kaldi-math.h" -// #include "base/timer.h" - -#endif // KALDI_BASE_KALDI_COMMON_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/kaldi-error.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/kaldi-error.cc deleted file mode 100644 index 77edc6af6e56bb8fa3431d519e58fda9ee0bac6a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/kaldi-error.cc +++ /dev/null @@ -1,42 +0,0 @@ -// base/kaldi-error.cc - -// Copyright 2019 LAIX (Yi Sun) -// Copyright 2019 SmartAction LLC (kkm) -// Copyright 2016 Brno University of Technology (author: Karel Vesely) -// Copyright 2009-2011 Microsoft Corporation; Lukas Burget; Ondrej Glembek - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-error.h" - -#include - -namespace kaldi { - -/***** GLOBAL VARIABLES FOR LOGGING *****/ - -int32 g_kaldi_verbose_level = 0; -static std::string program_name; // NOLINT - -void SetProgramName(const char *basename) { - // Using the 'static std::string' for the program name is mostly harmless, - // because (a) Kaldi logging is undefined before main(), and (b) no stdc++ - // string implementation has been found in the wild that would not be just - // an empty string when zero-initialized but not yet constructed. - program_name = basename; -} - -} // namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/kaldi-error.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/kaldi-error.h deleted file mode 100644 index 0f65db372b5f05a8017433eed7c95badc819a0a6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/kaldi-error.h +++ /dev/null @@ -1,57 +0,0 @@ -// base/kaldi-error.h - -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_KALDI_ERROR_H_ -#define KALDI_BASE_KALDI_ERROR_H_ 1 - -#include "utils/log.h" - -namespace kaldi { - -#define KALDI_WARN \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_WARNING).stream() -#define KALDI_ERR \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_ERROR).stream() -#define KALDI_LOG \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_INFO).stream() -#define KALDI_VLOG(v) VLOG(v) - -#define KALDI_ASSERT(condition) CHECK(condition) - - -/***** PROGRAM NAME AND VERBOSITY LEVEL *****/ - -/// Called by ParseOptions to set base name (no directory) of the executing -/// program. The name is printed in logging code along with every message, -/// because in our scripts, we often mix together the stderr of many programs. -/// This function is very thread-unsafe. -void SetProgramName(const char *basename); - -/// This is set by util/parse-options.{h,cc} if you set --verbose=? option. -/// Do not use directly, prefer {Get,Set}VerboseLevel(). -extern int32 g_kaldi_verbose_level; - -/// Get verbosity level, usually set via command line '--verbose=' switch. -inline int32 GetVerboseLevel() { return g_kaldi_verbose_level; } - -/// This should be rarely used, except by programs using Kaldi as library; -/// command-line programs set the verbose level automatically from ParseOptions. 
-inline void SetVerboseLevel(int32 i) { g_kaldi_verbose_level = i; } - -} // namespace kaldi - -#endif // KALDI_BASE_KALDI_ERROR_H_ - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/kaldi-math.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/kaldi-math.cc deleted file mode 100644 index 175d9f49b6c5216645e90e146f4e2eab5572c342..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/kaldi-math.cc +++ /dev/null @@ -1,164 +0,0 @@ -// base/kaldi-math.cc - -// Copyright 2009-2011 Microsoft Corporation; Yanmin Qian; -// Saarland University; Jan Silovsky - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-math.h" -#ifndef _MSC_VER -#include -#include -#endif -#include -#include - -namespace kaldi { -// These routines are tested in matrix/matrix-test.cc - -int32 RoundUpToNearestPowerOfTwo(int32 n) { - KALDI_ASSERT(n > 0); - n--; - n |= n >> 1; - n |= n >> 2; - n |= n >> 4; - n |= n >> 8; - n |= n >> 16; - return n+1; -} - -static std::mutex _RandMutex; - -int Rand(struct RandomState* state) { -#if !defined(_POSIX_THREAD_SAFE_FUNCTIONS) - // On Windows and Cygwin, just call Rand() - return rand(); -#else - if (state) { - return rand_r(&(state->seed)); - } else { - std::lock_guard lock(_RandMutex); - return rand(); - } -#endif -} - -RandomState::RandomState() { - // we initialize it as Rand() + 27437 instead of just Rand(), because on some - // systems, e.g. at the very least Mac OSX Yosemite and later, it seems to be - // the case that rand_r when initialized with rand() will give you the exact - // same sequence of numbers that rand() will give if you keep calling rand() - // after that initial call. This can cause problems with repeated sequences. - // For example if you initialize two RandomState structs one after the other - // without calling rand() in between, they would give you the same sequence - // offset by one (if we didn't have the "+ 27437" in the code). 27437 is just - // a randomly chosen prime number. - seed = unsigned(Rand()) + 27437; -} - -bool WithProb(BaseFloat prob, struct RandomState* state) { - KALDI_ASSERT(prob >= 0 && prob <= 1.1); // prob should be <= 1.0, - // but we allow slightly larger values that could arise from roundoff in - // previous calculations. - KALDI_COMPILE_TIME_ASSERT(RAND_MAX > 128 * 128); - if (prob == 0) { - return false; - } else if (prob == 1.0) { - return true; - } else if (prob * RAND_MAX < 128.0) { - // prob is very small but nonzero, and the "main algorithm" - // wouldn't work that well. So: with probability 1/128, we - // return WithProb (prob * 128), else return false. - if (Rand(state) < RAND_MAX / 128) { // with probability 128... 
- // Note: we know that prob * 128.0 < 1.0, because - // we asserted RAND_MAX > 128 * 128. - return WithProb(prob * 128.0); - } else { - return false; - } - } else { - return (Rand(state) < ((RAND_MAX + static_cast(1.0)) * prob)); - } -} - -int32 RandInt(int32 min_val, int32 max_val, struct RandomState* state) { - // This is not exact. - KALDI_ASSERT(max_val >= min_val); - if (max_val == min_val) return min_val; - -#ifdef _MSC_VER - // RAND_MAX is quite small on Windows -> may need to handle larger numbers. - if (RAND_MAX > (max_val-min_val)*8) { - // *8 to avoid large inaccuracies in probability, from the modulus... - return min_val + - ((unsigned int)Rand(state) % (unsigned int)(max_val+1-min_val)); - } else { - if ((unsigned int)(RAND_MAX*RAND_MAX) > - (unsigned int)((max_val+1-min_val)*8)) { - // *8 to avoid inaccuracies in probability, from the modulus... - return min_val + ( (unsigned int)( (Rand(state)+RAND_MAX*Rand(state))) - % (unsigned int)(max_val+1-min_val)); - } else { - KALDI_ERR << "rand_int failed because we do not support such large " - "random numbers. (Extend this function)."; - } - } -#else - return min_val + - (static_cast(Rand(state)) % static_cast(max_val+1-min_val)); -#endif -} - -// Returns poisson-distributed random number. -// Take care: this takes time proportional -// to lambda. Faster algorithms exist but are more complex. -int32 RandPoisson(float lambda, struct RandomState* state) { - // Knuth's algorithm. - KALDI_ASSERT(lambda >= 0); - float L = expf(-lambda), p = 1.0; - int32 k = 0; - do { - k++; - float u = RandUniform(state); - p *= u; - } while (p > L); - return k-1; -} - -void RandGauss2(float *a, float *b, RandomState *state) { - KALDI_ASSERT(a); - KALDI_ASSERT(b); - float u1 = RandUniform(state); - float u2 = RandUniform(state); - u1 = sqrtf(-2.0f * logf(u1)); - u2 = 2.0f * M_PI * u2; - *a = u1 * cosf(u2); - *b = u1 * sinf(u2); -} - -void RandGauss2(double *a, double *b, RandomState *state) { - KALDI_ASSERT(a); - KALDI_ASSERT(b); - float a_float, b_float; - // Just because we're using doubles doesn't mean we need super-high-quality - // random numbers, so we just use the floating-point version internally. - RandGauss2(&a_float, &b_float, state); - *a = a_float; - *b = b_float; -} - - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/kaldi-math.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/kaldi-math.h deleted file mode 100644 index 93c265ee96e704893da26b9083a44a9e60c6c192..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/kaldi-math.h +++ /dev/null @@ -1,363 +0,0 @@ -// base/kaldi-math.h - -// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; Yanmin Qian; -// Jan Silovsky; Saarland University -// -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. 
-// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_KALDI_MATH_H_ -#define KALDI_BASE_KALDI_MATH_H_ 1 - -#ifdef _MSC_VER -#include -#endif - -#include -#include -#include - -#include "base/kaldi-types.h" -#include "base/kaldi-common.h" - - -#ifndef DBL_EPSILON -#define DBL_EPSILON 2.2204460492503131e-16 -#endif -#ifndef FLT_EPSILON -#define FLT_EPSILON 1.19209290e-7f -#endif - -#ifndef M_PI -#define M_PI 3.1415926535897932384626433832795 -#endif - -#ifndef M_SQRT2 -#define M_SQRT2 1.4142135623730950488016887 -#endif - -#ifndef M_2PI -#define M_2PI 6.283185307179586476925286766559005 -#endif - -#ifndef M_SQRT1_2 -#define M_SQRT1_2 0.7071067811865475244008443621048490 -#endif - -#ifndef M_LOG_2PI -#define M_LOG_2PI 1.8378770664093454835606594728112 -#endif - -#ifndef M_LN2 -#define M_LN2 0.693147180559945309417232121458 -#endif - -#ifndef M_LN10 -#define M_LN10 2.302585092994045684017991454684 -#endif - - -#define KALDI_ISNAN std::isnan -#define KALDI_ISINF std::isinf -#define KALDI_ISFINITE(x) std::isfinite(x) - -#if !defined(KALDI_SQR) -# define KALDI_SQR(x) ((x) * (x)) -#endif - -namespace kaldi { - -#if !defined(_MSC_VER) || (_MSC_VER >= 1900) -inline double Exp(double x) { return exp(x); } -#ifndef KALDI_NO_EXPF -inline float Exp(float x) { return expf(x); } -#else -inline float Exp(float x) { return exp(static_cast(x)); } -#endif // KALDI_NO_EXPF -#else -inline double Exp(double x) { return exp(x); } -#if !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64) -// Microsoft CL v18.0 buggy 64-bit implementation of -// expf() incorrectly returns -inf for exp(-inf). -inline float Exp(float x) { return exp(static_cast(x)); } -#else -inline float Exp(float x) { return expf(x); } -#endif // !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64) -#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900) - -inline double Log(double x) { return log(x); } -inline float Log(float x) { return logf(x); } - -#if !defined(_MSC_VER) || (_MSC_VER >= 1700) -inline double Log1p(double x) { return log1p(x); } -inline float Log1p(float x) { return log1pf(x); } -#else -inline double Log1p(double x) { - const double cutoff = 1.0e-08; - if (x < cutoff) - return x - 0.5 * x * x; - else - return Log(1.0 + x); -} - -inline float Log1p(float x) { - const float cutoff = 1.0e-07; - if (x < cutoff) - return x - 0.5 * x * x; - else - return Log(1.0 + x); -} -#endif - -static const double kMinLogDiffDouble = Log(DBL_EPSILON); // negative! -static const float kMinLogDiffFloat = Log(FLT_EPSILON); // negative! - -// -infinity -const float kLogZeroFloat = -std::numeric_limits::infinity(); -const double kLogZeroDouble = -std::numeric_limits::infinity(); -const BaseFloat kLogZeroBaseFloat = -std::numeric_limits::infinity(); - -// Returns a random integer between 0 and RAND_MAX, inclusive -int Rand(struct RandomState* state = NULL); - -// State for thread-safe random number generator -struct RandomState { - RandomState(); - unsigned seed; -}; - -// Returns a random integer between first and last inclusive. -int32 RandInt(int32 first, int32 last, struct RandomState* state = NULL); - -// Returns true with probability "prob", -bool WithProb(BaseFloat prob, struct RandomState* state = NULL); -// with 0 <= prob <= 1 [we check this]. -// Internally calls Rand(). This function is carefully implemented so -// that it should work even if prob is very small. - -/// Returns a random number strictly between 0 and 1. 
-inline float RandUniform(struct RandomState* state = NULL) { - return static_cast((Rand(state) + 1.0) / (RAND_MAX+2.0)); -} - -inline float RandGauss(struct RandomState* state = NULL) { - return static_cast(sqrtf (-2 * Log(RandUniform(state))) - * cosf(2*M_PI*RandUniform(state))); -} - -// Returns poisson-distributed random number. Uses Knuth's algorithm. -// Take care: this takes time proportional -// to lambda. Faster algorithms exist but are more complex. -int32 RandPoisson(float lambda, struct RandomState* state = NULL); - -// Returns a pair of gaussian random numbers. Uses Box-Muller transform -void RandGauss2(float *a, float *b, RandomState *state = NULL); -void RandGauss2(double *a, double *b, RandomState *state = NULL); - -// Also see Vector::RandCategorical(). - -// This is a randomized pruning mechanism that preserves expectations, -// that we typically use to prune posteriors. -template -inline Float RandPrune(Float post, BaseFloat prune_thresh, - struct RandomState* state = NULL) { - KALDI_ASSERT(prune_thresh >= 0.0); - if (post == 0.0 || std::abs(post) >= prune_thresh) - return post; - return (post >= 0 ? 1.0 : -1.0) * - (RandUniform(state) <= fabs(post)/prune_thresh ? prune_thresh : 0.0); -} - -// returns log(exp(x) + exp(y)). -inline double LogAdd(double x, double y) { - double diff; - - if (x < y) { - diff = x - y; - x = y; - } else { - diff = y - x; - } - // diff is negative. x is now the larger one. - - if (diff >= kMinLogDiffDouble) { - double res; - res = x + Log1p(Exp(diff)); - return res; - } else { - return x; // return the larger one. - } -} - - -// returns log(exp(x) + exp(y)). -inline float LogAdd(float x, float y) { - float diff; - - if (x < y) { - diff = x - y; - x = y; - } else { - diff = y - x; - } - // diff is negative. x is now the larger one. - - if (diff >= kMinLogDiffFloat) { - float res; - res = x + Log1p(Exp(diff)); - return res; - } else { - return x; // return the larger one. - } -} - - -// returns log(exp(x) - exp(y)). -inline double LogSub(double x, double y) { - if (y >= x) { // Throws exception if y>=x. - if (y == x) - return kLogZeroDouble; - else - KALDI_ERR << "Cannot subtract a larger from a smaller number."; - } - - double diff = y - x; // Will be negative. - double res = x + Log(1.0 - Exp(diff)); - - // res might be NAN if diff ~0.0, and 1.0-exp(diff) == 0 to machine precision - if (KALDI_ISNAN(res)) - return kLogZeroDouble; - return res; -} - - -// returns log(exp(x) - exp(y)). -inline float LogSub(float x, float y) { - if (y >= x) { // Throws exception if y>=x. - if (y == x) - return kLogZeroDouble; - else - KALDI_ERR << "Cannot subtract a larger from a smaller number."; - } - - float diff = y - x; // Will be negative. - float res = x + Log(1.0f - Exp(diff)); - - // res might be NAN if diff ~0.0, and 1.0-exp(diff) == 0 to machine precision - if (KALDI_ISNAN(res)) - return kLogZeroFloat; - return res; -} - -/// return abs(a - b) <= relative_tolerance * (abs(a)+abs(b)). -static inline bool ApproxEqual(float a, float b, - float relative_tolerance = 0.001) { - // a==b handles infinities. - if (a == b) return true; - float diff = std::abs(a-b); - if (diff == std::numeric_limits::infinity() - || diff != diff) return false; // diff is +inf or nan. - return (diff <= relative_tolerance*(std::abs(a)+std::abs(b))); -} - -/// assert abs(a - b) <= relative_tolerance * (abs(a)+abs(b)) -static inline void AssertEqual(float a, float b, - float relative_tolerance = 0.001) { - // a==b handles infinities. 
- KALDI_ASSERT(ApproxEqual(a, b, relative_tolerance)); -} - - -// RoundUpToNearestPowerOfTwo does the obvious thing. It crashes if n <= 0. -int32 RoundUpToNearestPowerOfTwo(int32 n); - -/// Returns a / b, rounding towards negative infinity in all cases. -static inline int32 DivideRoundingDown(int32 a, int32 b) { - KALDI_ASSERT(b != 0); - if (a * b >= 0) - return a / b; - else if (a < 0) - return (a - b + 1) / b; - else - return (a - b - 1) / b; -} - -template I Gcd(I m, I n) { - if (m == 0 || n == 0) { - if (m == 0 && n == 0) { // gcd not defined, as all integers are divisors. - KALDI_ERR << "Undefined GCD since m = 0, n = 0."; - } - return (m == 0 ? (n > 0 ? n : -n) : ( m > 0 ? m : -m)); - // return absolute value of whichever is nonzero - } - // could use compile-time assertion - // but involves messing with complex template stuff. - KALDI_ASSERT(std::numeric_limits::is_integer); - while (1) { - m %= n; - if (m == 0) return (n > 0 ? n : -n); - n %= m; - if (n == 0) return (m > 0 ? m : -m); - } -} - -/// Returns the least common multiple of two integers. Will -/// crash unless the inputs are positive. -template I Lcm(I m, I n) { - KALDI_ASSERT(m > 0 && n > 0); - I gcd = Gcd(m, n); - return gcd * (m/gcd) * (n/gcd); -} - - -template void Factorize(I m, std::vector *factors) { - // Splits a number into its prime factors, in sorted order from - // least to greatest, with duplication. A very inefficient - // algorithm, which is mainly intended for use in the - // mixed-radix FFT computation (where we assume most factors - // are small). - KALDI_ASSERT(factors != NULL); - KALDI_ASSERT(m >= 1); // Doesn't work for zero or negative numbers. - factors->clear(); - I small_factors[10] = { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29 }; - - // First try small factors. - for (I i = 0; i < 10; i++) { - if (m == 1) return; // We're done. - while (m % small_factors[i] == 0) { - m /= small_factors[i]; - factors->push_back(small_factors[i]); - } - } - // Next try all odd numbers starting from 31. - for (I j = 31;; j += 2) { - if (m == 1) return; - while (m % j == 0) { - m /= j; - factors->push_back(j); - } - } -} - -inline double Hypot(double x, double y) { return hypot(x, y); } -inline float Hypot(float x, float y) { return hypotf(x, y); } - - - - -} // namespace kaldi - - -#endif // KALDI_BASE_KALDI_MATH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/kaldi-types.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/kaldi-types.h deleted file mode 100644 index 7ebf4f85386192a65e176d8f0ecde9bb348af4a0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/kaldi-types.h +++ /dev/null @@ -1,75 +0,0 @@ -// base/kaldi-types.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University; -// Jan Silovsky; Yanmin Qian - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. 
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_BASE_KALDI_TYPES_H_
-#define KALDI_BASE_KALDI_TYPES_H_ 1
-
-namespace kaldi {
-// TYPEDEFS ..................................................................
-#if (KALDI_DOUBLEPRECISION != 0)
-typedef double BaseFloat;
-#else
-typedef float BaseFloat;
-#endif
-}
-
-#ifdef _MSC_VER
-#include <basetsd.h>
-#define ssize_t SSIZE_T
-#endif
-
-// we can do this a different way if some platform
-// we find in the future lacks stdint.h
-#include <stdint.h>
-
-// for discussion on what to do if you need compile kaldi
-// without OpenFST, see the bottom of this this file
-#include <fst/types.h>
-
-namespace kaldi {
-  using ::int16;
-  using ::int32;
-  using ::int64;
-  using ::uint16;
-  using ::uint32;
-  using ::uint64;
-  typedef float float32;
-  typedef double double64;
-}  // end namespace kaldi
-
-// In a theoretical case you decide compile Kaldi without the OpenFST
-// comment the previous namespace statement and uncomment the following
-/*
-namespace kaldi {
-  typedef int8_t int8;
-  typedef int16_t int16;
-  typedef int32_t int32;
-  typedef int64_t int64;
-
-  typedef uint8_t uint8;
-  typedef uint16_t uint16;
-  typedef uint32_t uint32;
-  typedef uint64_t uint64;
-  typedef float float32;
-  typedef double double64;
-}  // end namespace kaldi
-*/
-
-#endif  // KALDI_BASE_KALDI_TYPES_H_
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/kaldi-utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/kaldi-utils.h
deleted file mode 100644
index bd434d09ed92ec94bc4208f53a4416f941edfdb0..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/base/kaldi-utils.h
+++ /dev/null
@@ -1,155 +0,0 @@
-// base/kaldi-utils.h
-
-// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation;
-// Saarland University; Karel Vesely; Yanmin Qian
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
- -#ifndef KALDI_BASE_KALDI_UTILS_H_ -#define KALDI_BASE_KALDI_UTILS_H_ 1 - -#if defined(_MSC_VER) -# define WIN32_LEAN_AND_MEAN -# define NOMINMAX -# include -#endif - -#ifdef _MSC_VER -#include -#define unlink _unlink -#else -#include -#endif - -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4056 4305 4800 4267 4996 4756 4661) -#if _MSC_VER < 1400 -#define __restrict__ -#else -#define __restrict__ __restrict -#endif -#endif - -#if defined(_MSC_VER) -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (*(pp_orig) = _aligned_malloc(size, align)) -# define KALDI_MEMALIGN_FREE(x) _aligned_free(x) -#elif defined(__CYGWIN__) -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (*(pp_orig) = aligned_alloc(align, size)) -# define KALDI_MEMALIGN_FREE(x) free(x) -#else -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (!posix_memalign(pp_orig, align, size) ? *(pp_orig) : NULL) -# define KALDI_MEMALIGN_FREE(x) free(x) -#endif - -#ifdef __ICC -#pragma warning(disable: 383) // ICPC remark we don't want. -#pragma warning(disable: 810) // ICPC remark we don't want. -#pragma warning(disable: 981) // ICPC remark we don't want. -#pragma warning(disable: 1418) // ICPC remark we don't want. -#pragma warning(disable: 444) // ICPC remark we don't want. -#pragma warning(disable: 869) // ICPC remark we don't want. -#pragma warning(disable: 1287) // ICPC remark we don't want. -#pragma warning(disable: 279) // ICPC remark we don't want. -#pragma warning(disable: 981) // ICPC remark we don't want. -#endif - - -namespace kaldi { - - -// CharToString prints the character in a human-readable form, for debugging. -std::string CharToString(const char &c); - - -inline int MachineIsLittleEndian() { - int check = 1; - return (*reinterpret_cast(&check) != 0); -} - -// This function kaldi::Sleep() provides a portable way -// to sleep for a possibly fractional -// number of seconds. On Windows it's only accurate to microseconds. -void Sleep(float seconds); -} // namespace kaldi - -#define KALDI_SWAP8(a) do { \ - int t = (reinterpret_cast(&a))[0];\ - (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[7];\ - (reinterpret_cast(&a))[7] = t;\ - t = (reinterpret_cast(&a))[1];\ - (reinterpret_cast(&a))[1]=(reinterpret_cast(&a))[6];\ - (reinterpret_cast(&a))[6] = t;\ - t = (reinterpret_cast(&a))[2];\ - (reinterpret_cast(&a))[2]=(reinterpret_cast(&a))[5];\ - (reinterpret_cast(&a))[5] = t;\ - t = (reinterpret_cast(&a))[3];\ - (reinterpret_cast(&a))[3]=(reinterpret_cast(&a))[4];\ - (reinterpret_cast(&a))[4] = t;} while (0) -#define KALDI_SWAP4(a) do { \ - int t = (reinterpret_cast(&a))[0];\ - (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[3];\ - (reinterpret_cast(&a))[3] = t;\ - t = (reinterpret_cast(&a))[1];\ - (reinterpret_cast(&a))[1]=(reinterpret_cast(&a))[2];\ - (reinterpret_cast(&a))[2]=t;} while (0) -#define KALDI_SWAP2(a) do { \ - int t = (reinterpret_cast(&a))[0];\ - (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[1];\ - (reinterpret_cast(&a))[1] = t;} while (0) - - -// Makes copy constructor and operator= private. 
-#define KALDI_DISALLOW_COPY_AND_ASSIGN(type) \ - type(const type&); \ - void operator = (const type&) - -template class KaldiCompileTimeAssert { }; -template<> class KaldiCompileTimeAssert { - public: - static inline void Check() { } -}; - -#define KALDI_COMPILE_TIME_ASSERT(b) KaldiCompileTimeAssert<(b)>::Check() - -#define KALDI_ASSERT_IS_INTEGER_TYPE(I) \ - KaldiCompileTimeAssert::is_specialized \ - && std::numeric_limits::is_integer>::Check() - -#define KALDI_ASSERT_IS_FLOATING_TYPE(F) \ - KaldiCompileTimeAssert::is_specialized \ - && !std::numeric_limits::is_integer>::Check() - -#if defined(_MSC_VER) -#define KALDI_STRCASECMP _stricmp -#elif defined(__CYGWIN__) -#include -#define KALDI_STRCASECMP strcasecmp -#else -#define KALDI_STRCASECMP strcasecmp -#endif -#ifdef _MSC_VER -# define KALDI_STRTOLL(cur_cstr, end_cstr) _strtoi64(cur_cstr, end_cstr, 10); -#else -# define KALDI_STRTOLL(cur_cstr, end_cstr) strtoll(cur_cstr, end_cstr, 10); -#endif - -#endif // KALDI_BASE_KALDI_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/decoder/lattice-faster-decoder.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/decoder/lattice-faster-decoder.cc deleted file mode 100644 index 06f77557fa49a23f6a44d07c327a1b3b081c6dec..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/decoder/lattice-faster-decoder.cc +++ /dev/null @@ -1,1101 +0,0 @@ -// decoder/lattice-faster-decoder.cc - -// Copyright 2009-2012 Microsoft Corporation Mirko Hannemann -// 2013-2018 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2018 Zhehuai Chen -// 2021 Binbin Zhang, Zhendong Peng - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "decoder/lattice-faster-decoder.h" -// #include "lat/lattice-functions.h" - -namespace kaldi { - -// instantiate this class once for each thing you have to decode. -template -LatticeFasterDecoderTpl::LatticeFasterDecoderTpl( - const FST &fst, const LatticeFasterDecoderConfig &config, - const std::shared_ptr &context_graph) - : fst_(&fst), - delete_fst_(false), - config_(config), - num_toks_(0), - context_graph_(context_graph) { - config.Check(); - toks_.SetSize( - 1000); // just so on the first frame we do something reasonable. -} - -template -LatticeFasterDecoderTpl::LatticeFasterDecoderTpl( - const LatticeFasterDecoderConfig &config, FST *fst) - : fst_(fst), delete_fst_(true), config_(config), num_toks_(0) { - config.Check(); - toks_.SetSize( - 1000); // just so on the first frame we do something reasonable. 
-} - -template -LatticeFasterDecoderTpl::~LatticeFasterDecoderTpl() { - DeleteElems(toks_.Clear()); - ClearActiveTokens(); - if (delete_fst_) delete fst_; -} - -template -void LatticeFasterDecoderTpl::InitDecoding() { - // clean up from last time: - DeleteElems(toks_.Clear()); - cost_offsets_.clear(); - ClearActiveTokens(); - warned_ = false; - num_toks_ = 0; - decoding_finalized_ = false; - final_costs_.clear(); - StateId start_state = fst_->Start(); - KALDI_ASSERT(start_state != fst::kNoStateId); - active_toks_.resize(1); - Token *start_tok = new Token(0.0, 0.0, NULL, NULL, NULL); - active_toks_[0].toks = start_tok; - toks_.Insert(start_state, start_tok); - num_toks_++; - ProcessNonemitting(config_.beam); -} - -// Returns true if any kind of traceback is available (not necessarily from -// a final state). It should only very rarely return false; this indicates -// an unusual search error. -template -bool LatticeFasterDecoderTpl::Decode( - DecodableInterface *decodable) { - InitDecoding(); - // We use 1-based indexing for frames in this decoder (if you view it in - // terms of features), but note that the decodable object uses zero-based - // numbering, which we have to correct for when we call it. - AdvanceDecoding(decodable); - FinalizeDecoding(); - - // Returns true if we have any kind of traceback available (not necessarily - // to the end state; query ReachedFinal() for that). - return !active_toks_.empty() && active_toks_.back().toks != NULL; -} - -// Outputs an FST corresponding to the single best path through the lattice. -template -bool LatticeFasterDecoderTpl::GetBestPath( - Lattice *olat, bool use_final_probs) const { - Lattice raw_lat; - GetRawLattice(&raw_lat, use_final_probs); - ShortestPath(raw_lat, olat); - return (olat->NumStates() != 0); -} - -// Outputs an FST corresponding to the raw, state-level lattice -template -bool LatticeFasterDecoderTpl::GetRawLattice( - Lattice *ofst, bool use_final_probs) const { - typedef LatticeArc Arc; - typedef Arc::StateId StateId; - typedef Arc::Weight Weight; - typedef Arc::Label Label; - - // Note: you can't use the old interface (Decode()) if you want to - // get the lattice with use_final_probs = false. You'd have to do - // InitDecoding() and then AdvanceDecoding(). - if (decoding_finalized_ && !use_final_probs) - KALDI_ERR << "You cannot call FinalizeDecoding() and then call " - << "GetRawLattice() with use_final_probs == false"; - - unordered_map final_costs_local; - - const unordered_map &final_costs = - (decoding_finalized_ ? final_costs_ : final_costs_local); - if (!decoding_finalized_ && use_final_probs) - ComputeFinalCosts(&final_costs_local, NULL, NULL); - - ofst->DeleteStates(); - // num-frames plus one (since frames are one-based, and we have - // an extra frame for the start-state). - int32 num_frames = active_toks_.size() - 1; - KALDI_ASSERT(num_frames > 0); - const int32 bucket_count = num_toks_ / 2 + 3; - unordered_map tok_map(bucket_count); - // First create all states. - std::vector token_list; - for (int32 f = 0; f <= num_frames; f++) { - if (active_toks_[f].toks == NULL) { - KALDI_WARN << "GetRawLattice: no tokens active on frame " << f - << ": not producing lattice.\n"; - return false; - } - TopSortTokens(active_toks_[f].toks, &token_list); - for (size_t i = 0; i < token_list.size(); i++) - if (token_list[i] != NULL) tok_map[token_list[i]] = ofst->AddState(); - } - // The next statement sets the start state of the output FST. Because we - // topologically sorted the tokens, state zero must be the start-state. 
- ofst->SetStart(0); - - KALDI_VLOG(4) << "init:" << num_toks_ / 2 + 3 - << " buckets:" << tok_map.bucket_count() - << " load:" << tok_map.load_factor() - << " max:" << tok_map.max_load_factor(); - // Now create all arcs. - for (int32 f = 0; f <= num_frames; f++) { - for (Token *tok = active_toks_[f].toks; tok != NULL; tok = tok->next) { - StateId cur_state = tok_map[tok]; - for (ForwardLinkT *l = tok->links; l != NULL; l = l->next) { - typename unordered_map::const_iterator iter = - tok_map.find(l->next_tok); - StateId nextstate = iter->second; - KALDI_ASSERT(iter != tok_map.end()); - BaseFloat cost_offset = 0.0; - if (l->ilabel != 0) { // emitting.. - KALDI_ASSERT(f >= 0 && f < cost_offsets_.size()); - cost_offset = cost_offsets_[f]; - } - - StateId state = cur_state; - if (l->is_start_boundary) { - StateId tmp = ofst->AddState(); - Arc arc(0, context_graph_->start_tag_id(), Weight(0, 0), tmp); - ofst->AddArc(state, arc); - state = tmp; - } - if (l->is_end_boundary) { - StateId tmp = ofst->AddState(); - Arc arc(0, context_graph_->end_tag_id(), Weight(0, 0), nextstate); - ofst->AddArc(tmp, arc); - nextstate = tmp; - } - - Arc arc(l->ilabel, l->olabel, - Weight(l->graph_cost, l->acoustic_cost - cost_offset), - nextstate); - ofst->AddArc(state, arc); - } - if (f == num_frames) { - if (use_final_probs && !final_costs.empty()) { - typename unordered_map::const_iterator iter = - final_costs.find(tok); - if (iter != final_costs.end()) - ofst->SetFinal(cur_state, LatticeWeight(iter->second, 0)); - } else { - ofst->SetFinal(cur_state, LatticeWeight::One()); - } - } - } - } - - fst::TopSort(ofst); - return (ofst->NumStates() > 0); -} - -// This function is now deprecated, since now we do determinization from outside -// the LatticeFasterDecoder class. Outputs an FST corresponding to the -// lattice-determinized lattice (one path per word sequence). -template -bool LatticeFasterDecoderTpl::GetLattice( - CompactLattice *ofst, bool use_final_probs) const { - Lattice raw_fst; - GetRawLattice(&raw_fst, use_final_probs); - Invert(&raw_fst); // make it so word labels are on the input. - // (in phase where we get backward-costs). - fst::ILabelCompare ilabel_comp; - ArcSort(&raw_fst, ilabel_comp); // sort on ilabel; makes - // lattice-determinization more efficient. - - fst::DeterminizeLatticePrunedOptions lat_opts; - lat_opts.max_mem = config_.det_opts.max_mem; - - DeterminizeLatticePruned(raw_fst, config_.lattice_beam, ofst, lat_opts); - raw_fst.DeleteStates(); // Free memory-- raw_fst no longer needed. - Connect(ofst); // Remove unreachable states... there might be - // a small number of these, in some cases. - // Note: if something went wrong and the raw lattice was empty, - // we should still get to this point in the code without warnings or failures. - return (ofst->NumStates() != 0); -} - -template -void LatticeFasterDecoderTpl::PossiblyResizeHash(size_t num_toks) { - size_t new_sz = static_cast(static_cast(num_toks) * - config_.hash_ratio); - if (new_sz > toks_.Size()) { - toks_.SetSize(new_sz); - } -} - -/* - A note on the definition of extra_cost. - - extra_cost is used in pruning tokens, to save memory. - - extra_cost can be thought of as a beta (backward) cost assuming - we had set the betas on currently-active tokens to all be the negative - of the alphas for those tokens. (So all currently active tokens would - be on (tied) best paths). - - We can use the extra_cost to accurately prune away tokens that we know will - never appear in the lattice. 
If the extra_cost is greater than the desired - lattice beam, the token would provably never appear in the lattice, so we can - prune away the token. - - (Note: we don't update all the extra_costs every time we update a frame; we - only do it every 'config_.prune_interval' frames). - */ - -// FindOrAddToken either locates a token in hash of toks_, -// or if necessary inserts a new, empty token (i.e. with no forward links) -// for the current frame. [note: it's inserted if necessary into hash toks_ -// and also into the singly linked list of tokens active on this frame -// (whose head is at active_toks_[frame]). -template -inline typename LatticeFasterDecoderTpl::Elem * -LatticeFasterDecoderTpl::FindOrAddToken(StateId state, - int32 frame_plus_one, - BaseFloat tot_cost, - Token *backpointer, - bool *changed) { - // Returns the Token pointer. Sets "changed" (if non-NULL) to true - // if the token was newly created or the cost changed. - KALDI_ASSERT(frame_plus_one < active_toks_.size()); - Token *&toks = active_toks_[frame_plus_one].toks; - Elem *e_found = toks_.Insert(state, NULL); - if (e_found->val == NULL) { // no such token presently. - const BaseFloat extra_cost = 0.0; - // tokens on the currently final frame have zero extra_cost - // as any of them could end up - // on the winning path. - Token *new_tok = new Token(tot_cost, extra_cost, NULL, toks, backpointer); - // NULL: no forward links yet - toks = new_tok; - num_toks_++; - e_found->val = new_tok; - if (changed) *changed = true; - return e_found; - } else { - Token *tok = e_found->val; // There is an existing Token for this state. - if (tok->tot_cost > tot_cost) { // replace old token - tok->tot_cost = tot_cost; - // SetBackpointer() just does tok->backpointer = backpointer in - // the case where Token == BackpointerToken, else nothing. - tok->SetBackpointer(backpointer); - // we don't allocate a new token, the old stays linked in active_toks_ - // we only replace the tot_cost - // in the current frame, there are no forward links (and no extra_cost) - // only in ProcessNonemitting we have to delete forward links - // in case we visit a state for the second time - // those forward links, that lead to this replaced token before: - // they remain and will hopefully be pruned later (PruneForwardLinks...) - if (changed) *changed = true; - } else { - if (changed) *changed = false; - } - return e_found; - } -} - -// prunes outgoing links for all tokens in active_toks_[frame] -// it's called by PruneActiveTokens -// all links, that have link_extra_cost > lattice_beam are pruned -template -void LatticeFasterDecoderTpl::PruneForwardLinks( - int32 frame_plus_one, bool *extra_costs_changed, bool *links_pruned, - BaseFloat delta) { - // delta is the amount by which the extra_costs must change - // If delta is larger, we'll tend to go back less far - // toward the beginning of the file. - // extra_costs_changed is set to true if extra_cost was changed for any token - // links_pruned is set to true if any link in any token was pruned - - *extra_costs_changed = false; - *links_pruned = false; - KALDI_ASSERT(frame_plus_one >= 0 && frame_plus_one < active_toks_.size()); - if (active_toks_[frame_plus_one].toks == - NULL) { // empty list; should not happen. - if (!warned_) { - KALDI_WARN << "No tokens alive [doing pruning].. warning first " - "time only for each utterance\n"; - warned_ = true; - } - } - - // We have to iterate until there is no more change, because the links - // are not guaranteed to be in topological order. 
- bool changed = true; // difference new minus old extra cost >= delta ? - while (changed) { - changed = false; - for (Token *tok = active_toks_[frame_plus_one].toks; tok != NULL; - tok = tok->next) { - ForwardLinkT *link, *prev_link = NULL; - // will recompute tok_extra_cost for tok. - BaseFloat tok_extra_cost = std::numeric_limits::infinity(); - // tok_extra_cost is the best (min) of link_extra_cost of outgoing links - for (link = tok->links; link != NULL;) { - // See if we need to excise this link... - Token *next_tok = link->next_tok; - BaseFloat link_extra_cost = - next_tok->extra_cost + - ((tok->tot_cost + link->acoustic_cost + link->graph_cost) - - next_tok->tot_cost); // difference in brackets is >= 0 - // link_exta_cost is the difference in score between the best paths - // through link source state and through link destination state - KALDI_ASSERT(link_extra_cost == link_extra_cost); // check for NaN - // the graph_cost contatins the context score - // if it's the score of the backoff arc, it should be removed. - if (link->context_score < 0) { - link_extra_cost += link->context_score; - } - if (link_extra_cost > config_.lattice_beam) { // excise link - ForwardLinkT *next_link = link->next; - if (prev_link != NULL) - prev_link->next = next_link; - else - tok->links = next_link; - delete link; - link = next_link; // advance link but leave prev_link the same. - *links_pruned = true; - } else { // keep the link and update the tok_extra_cost if needed. - if (link_extra_cost < 0.0) { // this is just a precaution. - // if (link_extra_cost < -0.01) - // KALDI_WARN << "Negative extra_cost: " << link_extra_cost; - link_extra_cost = 0.0; - } - if (link_extra_cost < tok_extra_cost) - tok_extra_cost = link_extra_cost; - prev_link = link; // move to next link - link = link->next; - } - } // for all outgoing links - if (fabs(tok_extra_cost - tok->extra_cost) > delta) - changed = true; // difference new minus old is bigger than delta - tok->extra_cost = tok_extra_cost; - // will be +infinity or <= lattice_beam_. - // infinity indicates, that no forward link survived pruning - } // for all Token on active_toks_[frame] - if (changed) *extra_costs_changed = true; - - // Note: it's theoretically possible that aggressive compiler - // optimizations could cause an infinite loop here for small delta and - // high-dynamic-range scores. - } // while changed -} - -// PruneForwardLinksFinal is a version of PruneForwardLinks that we call -// on the final frame. If there are final tokens active, it uses -// the final-probs for pruning, otherwise it treats all tokens as final. -template -void LatticeFasterDecoderTpl::PruneForwardLinksFinal() { - KALDI_ASSERT(!active_toks_.empty()); - int32 frame_plus_one = active_toks_.size() - 1; - - if (active_toks_[frame_plus_one].toks == - NULL) // empty list; should not happen. - KALDI_WARN << "No tokens alive at end of file"; - - typedef typename unordered_map::const_iterator IterType; - ComputeFinalCosts(&final_costs_, &final_relative_cost_, &final_best_cost_); - decoding_finalized_ = true; - // We call DeleteElems() as a nicety, not because it's really necessary; - // otherwise there would be a time, after calling PruneTokensForFrame() on the - // final frame, when toks_.GetList() or toks_.Clear() would contain pointers - // to nonexistent tokens. - DeleteElems(toks_.Clear()); - - // Now go through tokens on this frame, pruning forward links... may have to - // iterate a few times until there is no more change, because the list is not - // in topological order. 
This is a modified version of the code in - // PruneForwardLinks, but here we also take account of the final-probs. - bool changed = true; - BaseFloat delta = 1.0e-05; - while (changed) { - changed = false; - for (Token *tok = active_toks_[frame_plus_one].toks; tok != NULL; - tok = tok->next) { - ForwardLinkT *link, *prev_link = NULL; - // will recompute tok_extra_cost. It has a term in it that corresponds - // to the "final-prob", so instead of initializing tok_extra_cost to - // infinity below we set it to the difference between the - // (score+final_prob) of this token, and the best such (score+final_prob). - BaseFloat final_cost; - if (final_costs_.empty()) { - final_cost = 0.0; - } else { - IterType iter = final_costs_.find(tok); - if (iter != final_costs_.end()) - final_cost = iter->second; - else - final_cost = std::numeric_limits::infinity(); - } - BaseFloat tok_extra_cost = tok->tot_cost + final_cost - final_best_cost_; - // tok_extra_cost will be a "min" over either directly being final, or - // being indirectly final through other links, and the loop below may - // decrease its value: - for (link = tok->links; link != NULL;) { - // See if we need to excise this link... - Token *next_tok = link->next_tok; - BaseFloat link_extra_cost = - next_tok->extra_cost + - ((tok->tot_cost + link->acoustic_cost + link->graph_cost) - - next_tok->tot_cost); - if (link_extra_cost > config_.lattice_beam) { // excise link - ForwardLinkT *next_link = link->next; - if (prev_link != NULL) - prev_link->next = next_link; - else - tok->links = next_link; - delete link; - link = next_link; // advance link but leave prev_link the same. - } else { // keep the link and update the tok_extra_cost if needed. - if (link_extra_cost < 0.0) { // this is just a precaution. - // if (link_extra_cost < -0.01) - // KALDI_WARN << "Negative extra_cost: " << link_extra_cost; - link_extra_cost = 0.0; - } - if (link_extra_cost < tok_extra_cost) - tok_extra_cost = link_extra_cost; - prev_link = link; - link = link->next; - } - } - // prune away tokens worse than lattice_beam above best path. This step - // was not necessary in the non-final case because then, this case - // showed up as having no forward links. Here, the tok_extra_cost has - // an extra component relating to the final-prob. - if (tok_extra_cost > config_.lattice_beam) - tok_extra_cost = std::numeric_limits::infinity(); - // to be pruned in PruneTokensForFrame - - if (!ApproxEqual(tok->extra_cost, tok_extra_cost, delta)) changed = true; - tok->extra_cost = - tok_extra_cost; // will be +infinity or <= lattice_beam_. - } - } // while changed -} - -template -BaseFloat LatticeFasterDecoderTpl::FinalRelativeCost() const { - if (!decoding_finalized_) { - BaseFloat relative_cost; - ComputeFinalCosts(NULL, &relative_cost, NULL); - return relative_cost; - } else { - // we're not allowed to call that function if FinalizeDecoding() has - // been called; return a cached value. - return final_relative_cost_; - } -} - -// Prune away any tokens on this frame that have no forward links. -// [we don't do this in PruneForwardLinks because it would give us -// a problem with dangling pointers]. 
-// It's called by PruneActiveTokens if any forward links have been pruned -template -void LatticeFasterDecoderTpl::PruneTokensForFrame( - int32 frame_plus_one) { - KALDI_ASSERT(frame_plus_one >= 0 && frame_plus_one < active_toks_.size()); - Token *&toks = active_toks_[frame_plus_one].toks; - if (toks == NULL) KALDI_WARN << "No tokens alive [doing pruning]"; - Token *tok, *next_tok, *prev_tok = NULL; - for (tok = toks; tok != NULL; tok = next_tok) { - next_tok = tok->next; - if (tok->extra_cost == std::numeric_limits::infinity()) { - // token is unreachable from end of graph; (no forward links survived) - // excise tok from list and delete tok. - if (prev_tok != NULL) - prev_tok->next = tok->next; - else - toks = tok->next; - delete tok; - num_toks_--; - } else { // fetch next Token - prev_tok = tok; - } - } -} - -// Go backwards through still-alive tokens, pruning them, starting not from -// the current frame (where we want to keep all tokens) but from the frame -// before that. We go backwards through the frames and stop when we reach a -// point where the delta-costs are not changing (and the delta controls when we -// consider a cost to have "not changed"). -template -void LatticeFasterDecoderTpl::PruneActiveTokens(BaseFloat delta) { - int32 cur_frame_plus_one = NumFramesDecoded(); - int32 num_toks_begin = num_toks_; - // The index "f" below represents a "frame plus one", i.e. you'd have to - // subtract one to get the corresponding index for the decodable object. - for (int32 f = cur_frame_plus_one - 1; f >= 0; f--) { - // Reason why we need to prune forward links in this situation: - // (1) we have never pruned them (new TokenList) - // (2) we have not yet pruned the forward links to the next f, - // after any of those tokens have changed their extra_cost. 
- if (active_toks_[f].must_prune_forward_links) { - bool extra_costs_changed = false, links_pruned = false; - PruneForwardLinks(f, &extra_costs_changed, &links_pruned, delta); - if (extra_costs_changed && f > 0) // any token has changed extra_cost - active_toks_[f - 1].must_prune_forward_links = true; - if (links_pruned) // any link was pruned - active_toks_[f].must_prune_tokens = true; - active_toks_[f].must_prune_forward_links = false; // job done - } - if (f + 1 < cur_frame_plus_one && // except for last f (no forward links) - active_toks_[f + 1].must_prune_tokens) { - PruneTokensForFrame(f + 1); - active_toks_[f + 1].must_prune_tokens = false; - } - } - KALDI_VLOG(4) << "PruneActiveTokens: pruned tokens from " << num_toks_begin - << " to " << num_toks_; -} - -template -void LatticeFasterDecoderTpl::ComputeFinalCosts( - unordered_map *final_costs, - BaseFloat *final_relative_cost, BaseFloat *final_best_cost) const { - KALDI_ASSERT(!decoding_finalized_); - if (final_costs != NULL) final_costs->clear(); - const Elem *final_toks = toks_.GetList(); - BaseFloat infinity = std::numeric_limits::infinity(); - BaseFloat best_cost = infinity, best_cost_with_final = infinity; - - while (final_toks != NULL) { - StateId state = final_toks->key; - Token *tok = final_toks->val; - const Elem *next = final_toks->tail; - BaseFloat final_cost = fst_->Final(state).Value(); - BaseFloat cost = tok->tot_cost, cost_with_final = cost + final_cost; - best_cost = std::min(cost, best_cost); - best_cost_with_final = std::min(cost_with_final, best_cost_with_final); - if (final_costs != NULL && final_cost != infinity) - (*final_costs)[tok] = final_cost; - final_toks = next; - } - if (final_relative_cost != NULL) { - if (best_cost == infinity && best_cost_with_final == infinity) { - // Likely this will only happen if there are no tokens surviving. - // This seems the least bad way to handle it. - *final_relative_cost = infinity; - } else { - *final_relative_cost = best_cost_with_final - best_cost; - } - } - if (final_best_cost != NULL) { - if (best_cost_with_final != infinity) { // final-state exists. - *final_best_cost = best_cost_with_final; - } else { // no final-state exists. - *final_best_cost = best_cost; - } - } -} - -template -void LatticeFasterDecoderTpl::AdvanceDecoding( - DecodableInterface *decodable, int32 max_num_frames) { - if (std::is_same >::value) { - // if the type 'FST' is the FST base-class, then see if the FST type of fst_ - // is actually VectorFst or ConstFst. If so, call the AdvanceDecoding() - // function after casting *this to the more specific type. - if (fst_->Type() == "const") { - LatticeFasterDecoderTpl, Token> *this_cast = - reinterpret_cast< - LatticeFasterDecoderTpl, Token> *>( - this); - this_cast->AdvanceDecoding(decodable, max_num_frames); - return; - } else if (fst_->Type() == "vector") { - LatticeFasterDecoderTpl, Token> *this_cast = - reinterpret_cast< - LatticeFasterDecoderTpl, Token> *>( - this); - this_cast->AdvanceDecoding(decodable, max_num_frames); - return; - } - } - - KALDI_ASSERT(!active_toks_.empty() && !decoding_finalized_ && - "You must call InitDecoding() before AdvanceDecoding"); - int32 num_frames_ready = decodable->NumFramesReady(); - // num_frames_ready must be >= num_frames_decoded, or else - // the number of frames ready must have decreased (which doesn't - // make sense) or the decodable object changed between calls - // (which isn't allowed). 
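// --- editorial aside (not part of the deleted file) -------------------------
// ComputeFinalCosts() above tracks two minima over the surviving tokens: the
// best forward cost, and the best forward cost plus the final-prob of the
// token's state. Their difference is the "final relative cost" consulted by
// ReachedFinal(); infinity means no token reached a final state. A minimal
// sketch of that reduction over a plain vector (FinalCandidate and
// FinalRelativeCost are illustrative names, not the decoder's types).
#include <algorithm>
#include <limits>
#include <vector>

namespace editorial_sketch {

struct FinalCandidate {
  float forward_cost;  // tok->tot_cost in the decoder
  float final_cost;    // fst.Final(state).Value(); +inf if the state is not final
};

inline float FinalRelativeCost(const std::vector<FinalCandidate> &toks) {
  const float inf = std::numeric_limits<float>::infinity();
  float best = inf, best_with_final = inf;
  for (const FinalCandidate &t : toks) {
    best = std::min(best, t.forward_cost);
    best_with_final = std::min(best_with_final, t.forward_cost + t.final_cost);
  }
  if (best == inf) return inf;      // no surviving tokens at all
  return best_with_final - best;    // +inf if no final state was reached
}

}  // namespace editorial_sketch
// --- end editorial aside -----------------------------------------------------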
- KALDI_ASSERT(num_frames_ready >= NumFramesDecoded()); - int32 target_frames_decoded = num_frames_ready; - if (max_num_frames >= 0) - target_frames_decoded = - std::min(target_frames_decoded, NumFramesDecoded() + max_num_frames); - while (NumFramesDecoded() < target_frames_decoded) { - if (NumFramesDecoded() % config_.prune_interval == 0) { - PruneActiveTokens(config_.lattice_beam * config_.prune_scale); - } - BaseFloat cost_cutoff = ProcessEmitting(decodable); - ProcessNonemitting(cost_cutoff); - } -} - -// FinalizeDecoding() is a version of PruneActiveTokens that we call -// (optionally) on the final frame. Takes into account the final-prob of -// tokens. This function used to be called PruneActiveTokensFinal(). -template -void LatticeFasterDecoderTpl::FinalizeDecoding() { - int32 final_frame_plus_one = NumFramesDecoded(); - int32 num_toks_begin = num_toks_; - // PruneForwardLinksFinal() prunes final frame (with final-probs), and - // sets decoding_finalized_. - PruneForwardLinksFinal(); - for (int32 f = final_frame_plus_one - 1; f >= 0; f--) { - bool b1, b2; // values not used. - BaseFloat dontcare = 0.0; // delta of zero means we must always update - PruneForwardLinks(f, &b1, &b2, dontcare); - PruneTokensForFrame(f + 1); - } - PruneTokensForFrame(0); - KALDI_VLOG(4) << "pruned tokens from " << num_toks_begin << " to " - << num_toks_; -} - -/// Gets the weight cutoff. Also counts the active tokens. -template -BaseFloat LatticeFasterDecoderTpl::GetCutoff( - Elem *list_head, size_t *tok_count, BaseFloat *adaptive_beam, - Elem **best_elem) { - BaseFloat best_weight = std::numeric_limits::infinity(); - // positive == high cost == bad. - size_t count = 0; - if (config_.max_active == std::numeric_limits::max() && - config_.min_active == 0) { - for (Elem *e = list_head; e != NULL; e = e->tail, count++) { - BaseFloat w = static_cast(e->val->tot_cost); - if (w < best_weight) { - best_weight = w; - if (best_elem) *best_elem = e; - } - } - if (tok_count != NULL) *tok_count = count; - if (adaptive_beam != NULL) *adaptive_beam = config_.beam; - return best_weight + config_.beam; - } else { - tmp_array_.clear(); - for (Elem *e = list_head; e != NULL; e = e->tail, count++) { - BaseFloat w = e->val->tot_cost; - tmp_array_.push_back(w); - if (w < best_weight) { - best_weight = w; - if (best_elem) *best_elem = e; - } - } - if (tok_count != NULL) *tok_count = count; - - BaseFloat beam_cutoff = best_weight + config_.beam, - min_active_cutoff = std::numeric_limits::infinity(), - max_active_cutoff = std::numeric_limits::infinity(); - - KALDI_VLOG(6) << "Number of tokens active on frame " << NumFramesDecoded() - << " is " << tmp_array_.size(); - - if (tmp_array_.size() > static_cast(config_.max_active)) { - std::nth_element(tmp_array_.begin(), - tmp_array_.begin() + config_.max_active, - tmp_array_.end()); - max_active_cutoff = tmp_array_[config_.max_active]; - } - if (max_active_cutoff < beam_cutoff) { // max_active is tighter than beam. - if (adaptive_beam) - *adaptive_beam = max_active_cutoff - best_weight + config_.beam_delta; - return max_active_cutoff; - } - if (tmp_array_.size() > static_cast(config_.min_active)) { - if (config_.min_active == 0) { - min_active_cutoff = best_weight; - } else { - std::nth_element( - tmp_array_.begin(), tmp_array_.begin() + config_.min_active, - tmp_array_.size() > static_cast(config_.max_active) - ? 
tmp_array_.begin() + config_.max_active - : tmp_array_.end()); - min_active_cutoff = tmp_array_[config_.min_active]; - } - } - if (min_active_cutoff > beam_cutoff) { // min_active is looser than beam. - if (adaptive_beam) - *adaptive_beam = min_active_cutoff - best_weight + config_.beam_delta; - return min_active_cutoff; - } else { - *adaptive_beam = config_.beam; - return beam_cutoff; - } - } -} - -template -BaseFloat LatticeFasterDecoderTpl::ProcessEmitting( - DecodableInterface *decodable) { - KALDI_ASSERT(active_toks_.size() > 0); - int32 frame = - active_toks_.size() - 1; // frame is the frame-index - // (zero-based) used to get likelihoods - // from the decodable object. - active_toks_.resize(active_toks_.size() + 1); - - Elem *final_toks = - toks_.Clear(); // analogous to swapping prev_toks_ / cur_toks_ - // in simple-decoder.h. Removes the Elems from - // being indexed in the hash in toks_. - Elem *best_elem = NULL; - BaseFloat adaptive_beam; - size_t tok_cnt; - BaseFloat cur_cutoff = - GetCutoff(final_toks, &tok_cnt, &adaptive_beam, &best_elem); - KALDI_VLOG(6) << "Adaptive beam on frame " << NumFramesDecoded() << " is " - << adaptive_beam; - - PossiblyResizeHash( - tok_cnt); // This makes sure the hash is always big enough. - - BaseFloat next_cutoff = std::numeric_limits::infinity(); - // pruning "online" before having seen all tokens - - BaseFloat cost_offset = 0.0; // Used to keep probabilities in a good - // dynamic range. - - // First process the best token to get a hopefully - // reasonably tight bound on the next cutoff. The only - // products of the next block are "next_cutoff" and "cost_offset". - if (best_elem) { - StateId state = best_elem->key; - Token *tok = best_elem->val; - cost_offset = -tok->tot_cost; - for (fst::ArcIterator aiter(*fst_, state); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - if (arc.ilabel != 0) { // propagate.. - BaseFloat new_weight = arc.weight.Value() + cost_offset - - decodable->LogLikelihood(frame, arc.ilabel) + - tok->tot_cost; - if (state != arc.nextstate) { - new_weight += config_.length_penalty; - } - if (new_weight + adaptive_beam < next_cutoff) - next_cutoff = new_weight + adaptive_beam; - } - } - } - - // Store the offset on the acoustic likelihoods that we're applying. - // Could just do cost_offsets_.push_back(cost_offset), but we - // do it this way as it's more robust to future code changes. - cost_offsets_.resize(frame + 1, 0.0); - cost_offsets_[frame] = cost_offset; - - // the tokens are now owned here, in final_toks, and the hash is empty. - // 'owned' is a complex thing here; the point is we need to call DeleteElem - // on each elem 'e' to let toks_ know we're done with them. - for (Elem *e = final_toks, *e_tail; e != NULL; e = e_tail) { - // loop this way because we delete "e" as we go. - StateId state = e->key; - Token *tok = e->val; - if (tok->tot_cost <= cur_cutoff) { - for (fst::ArcIterator aiter(*fst_, state); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - if (arc.ilabel != 0) { // propagate.. 
- BaseFloat ac_cost = cost_offset - - decodable->LogLikelihood(frame, arc.ilabel), - graph_cost = arc.weight.Value(); - if (state != arc.nextstate) { - graph_cost += config_.length_penalty; - } - BaseFloat cur_cost = tok->tot_cost, - tot_cost = cur_cost + ac_cost + graph_cost; - if (tot_cost >= next_cutoff) - continue; - else if (tot_cost + adaptive_beam < next_cutoff) - next_cutoff = - tot_cost + adaptive_beam; // prune by best current token - // Note: the frame indexes into active_toks_ are one-based, - // hence the + 1. - Elem *e_next = - FindOrAddToken(arc.nextstate, frame + 1, tot_cost, tok, NULL); - // NULL: no change indicator needed - - bool is_start_boundary = false; - bool is_end_boundary = false; - float context_score = 0; - if (context_graph_) { - if (arc.olabel == 0) { - e_next->val->context_state = tok->context_state; - } else { - e_next->val->context_state = context_graph_->GetNextState( - tok->context_state, arc.olabel, &context_score, - &is_start_boundary, &is_end_boundary); - graph_cost -= context_score; - } - } - // Add ForwardLink from tok to next_tok (put on head of list - // tok->links) - tok->links = new ForwardLinkT(e_next->val, arc.ilabel, arc.olabel, - graph_cost, ac_cost, is_start_boundary, - is_end_boundary, tok->links); - tok->links->context_score = context_score; - } - } // for all arcs - } - e_tail = e->tail; - toks_.Delete(e); // delete Elem - } - return next_cutoff; -} - -// static inline -template -void LatticeFasterDecoderTpl::DeleteForwardLinks(Token *tok) { - ForwardLinkT *l = tok->links, *m; - while (l != NULL) { - m = l->next; - delete l; - l = m; - } - tok->links = NULL; -} - -template -void LatticeFasterDecoderTpl::ProcessNonemitting(BaseFloat cutoff) { - KALDI_ASSERT(!active_toks_.empty()); - int32 frame = static_cast(active_toks_.size()) - 2; - // Note: "frame" is the time-index we just processed, or -1 if - // we are processing the nonemitting transitions before the - // first frame (called from InitDecoding()). - - // Processes nonemitting arcs for one frame. Propagates within toks_. - // Note-- this queue structure is not very optimal as - // it may cause us to process states unnecessarily (e.g. more than once), - // but in the baseline code, turning this vector into a set to fix this - // problem did not improve overall speed. - - KALDI_ASSERT(queue_.empty()); - - if (toks_.GetList() == NULL) { - if (!warned_) { - KALDI_WARN << "Error, no surviving tokens: frame is " << frame; - warned_ = true; - } - } - - int before = 0, after = 0; - for (const Elem *e = toks_.GetList(); e != NULL; e = e->tail) { - StateId state = e->key; - if (fst_->NumInputEpsilons(state) != 0) queue_.push_back(e); - ++before; - } - - while (!queue_.empty()) { - ++after; - const Elem *e = queue_.back(); - queue_.pop_back(); - - StateId state = e->key; - Token *tok = - e->val; // would segfault if e is a NULL pointer but this can't happen. - BaseFloat cur_cost = tok->tot_cost; - if (cur_cost >= cutoff) // Don't bother processing successors. - continue; - // If "tok" has any existing forward links, delete them, - // because we're about to regenerate them. This is a kind - // of non-optimality (remember, this is the simple decoder), - // but since most states are emitting it's not a huge issue. - DeleteForwardLinks(tok); // necessary when re-visiting - tok->links = NULL; - for (fst::ArcIterator aiter(*fst_, state); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - if (arc.ilabel == 0) { // propagate nonemitting only... 
- BaseFloat graph_cost = arc.weight.Value(), - tot_cost = cur_cost + graph_cost; - if (tot_cost < cutoff) { - bool changed; - - Elem *e_new = - FindOrAddToken(arc.nextstate, frame + 1, tot_cost, tok, &changed); - - bool is_start_boundary = false; - bool is_end_boundary = false; - float context_score = 0; - if (context_graph_) { - if (arc.olabel == 0) { - e_new->val->context_state = tok->context_state; - } else { - e_new->val->context_state = context_graph_->GetNextState( - tok->context_state, arc.olabel, &context_score, - &is_start_boundary, &is_end_boundary); - graph_cost -= context_score; - } - } - - tok->links = - new ForwardLinkT(e_new->val, 0, arc.olabel, graph_cost, 0, - is_start_boundary, is_end_boundary, tok->links); - tok->links->context_score = context_score; - - // "changed" tells us whether the new token has a different - // cost from before, or is new [if so, add into queue]. - if (changed && fst_->NumInputEpsilons(arc.nextstate) != 0) - queue_.push_back(e_new); - } - } - } // for all arcs - } // while queue not empty - KALDI_VLOG(3) << "ProcessNonemitting " << before << " " << after; -} - -template -void LatticeFasterDecoderTpl::DeleteElems(Elem *list) { - for (Elem *e = list, *e_tail; e != NULL; e = e_tail) { - e_tail = e->tail; - toks_.Delete(e); - } -} - -template -void LatticeFasterDecoderTpl< - FST, Token>::ClearActiveTokens() { // a cleanup routine, at utt end/begin - for (size_t i = 0; i < active_toks_.size(); i++) { - // Delete all tokens alive on this frame, and any forward - // links they may have. - for (Token *tok = active_toks_[i].toks; tok != NULL;) { - DeleteForwardLinks(tok); - Token *next_tok = tok->next; - delete tok; - num_toks_--; - tok = next_tok; - } - } - active_toks_.clear(); - KALDI_ASSERT(num_toks_ == 0); -} - -// static -template -void LatticeFasterDecoderTpl::TopSortTokens( - Token *tok_list, std::vector *topsorted_list) { - unordered_map token2pos; - using std::unordered_set; - typedef typename unordered_map::iterator IterType; - int32 num_toks = 0; - for (Token *tok = tok_list; tok != NULL; tok = tok->next) num_toks++; - int32 cur_pos = 0; - // We assign the tokens numbers num_toks - 1, ... , 2, 1, 0. - // This is likely to be in closer to topological order than - // if we had given them ascending order, because of the way - // new tokens are put at the front of the list. - for (Token *tok = tok_list; tok != NULL; tok = tok->next) - token2pos[tok] = num_toks - ++cur_pos; - - unordered_set reprocess; - - for (IterType iter = token2pos.begin(); iter != token2pos.end(); ++iter) { - Token *tok = iter->first; - int32 pos = iter->second; - for (ForwardLinkT *link = tok->links; link != NULL; link = link->next) { - if (link->ilabel == 0) { - // We only need to consider epsilon links, since non-epsilon links - // transition between frames and this function only needs to sort a list - // of tokens from a single frame. - IterType following_iter = token2pos.find(link->next_tok); - if (following_iter != token2pos.end()) { // another token on this - // frame, so must consider it. - int32 next_pos = following_iter->second; - if (next_pos < pos) { // reassign the position of the next Token. - following_iter->second = cur_pos++; - reprocess.insert(link->next_tok); - } - } - } - } - // In case we had previously assigned this token to be reprocessed, we can - // erase it from that set because it's "happy now" (we just processed it). - reprocess.erase(tok); - } - - size_t max_loop = 1000000, - loop_count; // max_loop is to detect epsilon cycles. 
- for (loop_count = 0; !reprocess.empty() && loop_count < max_loop; - ++loop_count) { - std::vector reprocess_vec; - for (typename unordered_set::iterator iter = reprocess.begin(); - iter != reprocess.end(); ++iter) - reprocess_vec.push_back(*iter); - reprocess.clear(); - for (typename std::vector::iterator iter = reprocess_vec.begin(); - iter != reprocess_vec.end(); ++iter) { - Token *tok = *iter; - int32 pos = token2pos[tok]; - // Repeat the processing we did above (for comments, see above). - for (ForwardLinkT *link = tok->links; link != NULL; link = link->next) { - if (link->ilabel == 0) { - IterType following_iter = token2pos.find(link->next_tok); - if (following_iter != token2pos.end()) { - int32 next_pos = following_iter->second; - if (next_pos < pos) { - following_iter->second = cur_pos++; - reprocess.insert(link->next_tok); - } - } - } - } - } - } - KALDI_ASSERT(loop_count < max_loop && - "Epsilon loops exist in your decoding " - "graph (this is not allowed!)"); - - topsorted_list->clear(); - topsorted_list->resize(cur_pos, - NULL); // create a list with NULLs in between. - for (IterType iter = token2pos.begin(); iter != token2pos.end(); ++iter) - (*topsorted_list)[iter->second] = iter->first; -} - -// Instantiate the template for the combination of token types and FST types -// that we'll need. -template class LatticeFasterDecoderTpl, - decoder::StdToken>; -template class LatticeFasterDecoderTpl, - decoder::StdToken>; -template class LatticeFasterDecoderTpl, - decoder::StdToken>; - -// template class LatticeFasterDecoderTpl; template class -// LatticeFasterDecoderTpl; - -template class LatticeFasterDecoderTpl, - decoder::BackpointerToken>; -template class LatticeFasterDecoderTpl, - decoder::BackpointerToken>; -template class LatticeFasterDecoderTpl, - decoder::BackpointerToken>; -// template class LatticeFasterDecoderTpl; template class -// LatticeFasterDecoderTpl; - -} // end namespace kaldi. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/decoder/lattice-faster-decoder.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/decoder/lattice-faster-decoder.h deleted file mode 100644 index 0152b85447e354b770745b748d266b1ca2d57024..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/decoder/lattice-faster-decoder.h +++ /dev/null @@ -1,558 +0,0 @@ -// decoder/lattice-faster-decoder.h - -// Copyright 2009-2013 Microsoft Corporation; Mirko Hannemann; -// 2013-2014 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2018 Zhehuai Chen -// 2021 Binbin Zhang, Zhendong Peng - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
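// --- editorial aside (not part of the deleted file) -------------------------
// TopSortTokens() above orders the tokens of one frame so that every epsilon
// (input-label 0) link goes from an earlier position to a later one, bailing
// out if epsilon cycles are detected. The decoder does this with an in-place
// renumber-and-reprocess scheme over its intrusive token list; the same
// ordering problem on an explicit adjacency list is classically solved with
// Kahn's algorithm, sketched below. This is the textbook algorithm, not the
// decoder's exact code.
#include <cstddef>
#include <queue>
#include <utility>
#include <vector>

namespace editorial_sketch {

// Returns a topological order of nodes 0..n-1, or an empty vector if the
// epsilon graph contains a cycle (which the decoder treats as a fatal error).
inline std::vector<int> TopoSort(int n,
                                 const std::vector<std::pair<int, int>> &edges) {
  std::vector<std::vector<int>> adj(n);
  std::vector<int> indegree(n, 0);
  for (const auto &e : edges) {
    adj[e.first].push_back(e.second);
    ++indegree[e.second];
  }
  std::queue<int> ready;
  for (int v = 0; v < n; ++v)
    if (indegree[v] == 0) ready.push(v);
  std::vector<int> order;
  while (!ready.empty()) {
    int v = ready.front();
    ready.pop();
    order.push_back(v);
    for (int w : adj[v])
      if (--indegree[w] == 0) ready.push(w);
  }
  if (static_cast<int>(order.size()) != n) order.clear();  // cycle detected
  return order;
}

}  // namespace editorial_sketch
// --- end editorial aside -----------------------------------------------------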
- -#ifndef KALDI_DECODER_LATTICE_FASTER_DECODER_H_ -#define KALDI_DECODER_LATTICE_FASTER_DECODER_H_ - -#include -#include -#include -#include - -#include "base/kaldi-common.h" -#include "decoder/context_graph.h" -#include "fst/fstlib.h" -#include "fstext/fstext-lib.h" -#include "itf/decodable-itf.h" -#include "lat/determinize-lattice-pruned.h" -#include "lat/kaldi-lattice.h" -#include "util/hash-list.h" - -namespace kaldi { - -struct LatticeFasterDecoderConfig { - BaseFloat beam; - int32 max_active; - int32 min_active; - BaseFloat lattice_beam; - int32 prune_interval; - bool determinize_lattice; // not inspected by this class... used in - // command-line program. - BaseFloat beam_delta; - BaseFloat hash_ratio; - // Note: we don't make prune_scale configurable on the command line, it's not - // a very important parameter. It affects the algorithm that prunes the - // tokens as we go. - BaseFloat prune_scale; - BaseFloat length_penalty; // for balancing the del/ins ratio, suggested -3.0 - - // Most of the options inside det_opts are not actually queried by the - // LatticeFasterDecoder class itself, but by the code that calls it, for - // example in the function DecodeUtteranceLatticeFaster. - fst::DeterminizeLatticePhonePrunedOptions det_opts; - - LatticeFasterDecoderConfig() - : beam(16.0), - max_active(std::numeric_limits::max()), - min_active(200), - lattice_beam(10.0), - prune_interval(25), - determinize_lattice(true), - beam_delta(0.5), - hash_ratio(2.0), - prune_scale(0.1), - length_penalty(0.0) {} - void Register(OptionsItf *opts) { - det_opts.Register(opts); - opts->Register("beam", &beam, - "Decoding beam. Larger->slower, more accurate."); - opts->Register("max-active", &max_active, - "Decoder max active states. Larger->slower; " - "more accurate"); - opts->Register("min-active", &min_active, - "Decoder minimum #active states."); - opts->Register("lattice-beam", &lattice_beam, - "Lattice generation beam. Larger->slower, " - "and deeper lattices"); - opts->Register("prune-interval", &prune_interval, - "Interval (in frames) at " - "which to prune tokens"); - opts->Register( - "determinize-lattice", &determinize_lattice, - "If true, " - "determinize the lattice (lattice-determinization, keeping only " - "best pdf-sequence for each word-sequence)."); - opts->Register( - "beam-delta", &beam_delta, - "Increment used in decoding-- this " - "parameter is obscure and relates to a speedup in the way the " - "max-active constraint is applied. Larger is more accurate."); - opts->Register("hash-ratio", &hash_ratio, - "Setting used in decoder to " - "control hash behavior"); - } - void Check() const { - KALDI_ASSERT(beam > 0.0 && max_active > 1 && lattice_beam > 0.0 && - min_active <= max_active && prune_interval > 0 && - beam_delta > 0.0 && hash_ratio >= 1.0 && prune_scale > 0.0 && - prune_scale < 1.0); - } -}; - -namespace decoder { -// We will template the decoder on the token type as well as the FST type; this -// is a mechanism so that we can use the same underlying decoder code for -// versions of the decoder that support quickly getting the best path -// (LatticeFasterOnlineDecoder, see lattice-faster-online-decoder.h) and also -// those that do not (LatticeFasterDecoder). - -// ForwardLinks are the links from a token to a token on the next frame. -// or sometimes on the current frame (for input-epsilon links). 
-template -struct ForwardLink { - using Label = fst::StdArc::Label; - - Token *next_tok; // the next token [or NULL if represents final-state] - Label ilabel; // ilabel on arc - Label olabel; // olabel on arc - BaseFloat graph_cost; // graph cost of traversing arc (contains LM, etc.) - BaseFloat acoustic_cost; // acoustic cost (pre-scaled) of traversing arc - bool is_start_boundary; - bool is_end_boundary; - float context_score; - ForwardLink *next; // next in singly-linked list of forward arcs (arcs - // in the state-level lattice) from a token. - inline ForwardLink(Token *next_tok, Label ilabel, Label olabel, - BaseFloat graph_cost, BaseFloat acoustic_cost, - bool is_start_boundary, bool is_end_boundary, - ForwardLink *next) - : next_tok(next_tok), - ilabel(ilabel), - olabel(olabel), - graph_cost(graph_cost), - acoustic_cost(acoustic_cost), - is_start_boundary(is_start_boundary), - is_end_boundary(is_end_boundary), - context_score(0), - next(next) {} -}; - -struct StdToken { - using ForwardLinkT = ForwardLink; - using Token = StdToken; - - // Standard token type for LatticeFasterDecoder. Each active HCLG - // (decoding-graph) state on each frame has one token. - - // tot_cost is the total (LM + acoustic) cost from the beginning of the - // utterance up to this point. (but see cost_offset_, which is subtracted - // to keep it in a good numerical range). - BaseFloat tot_cost; - - // exta_cost is >= 0. After calling PruneForwardLinks, this equals the - // minimum difference between the cost of the best path that this link is a - // part of, and the cost of the absolute best path, under the assumption that - // any of the currently active states at the decoding front may eventually - // succeed (e.g. if you were to take the currently active states one by one - // and compute this difference, and then take the minimum). - BaseFloat extra_cost; - - int context_state = 0; - - // 'links' is the head of singly-linked list of ForwardLinks, which is what we - // use for lattice generation. - ForwardLinkT *links; - - // 'next' is the next in the singly-linked list of tokens for this frame. - Token *next; - - // This function does nothing and should be optimized out; it's needed - // so we can share the regular LatticeFasterDecoderTpl code and the code - // for LatticeFasterOnlineDecoder that supports fast traceback. - inline void SetBackpointer(Token *backpointer) {} - - // This constructor just ignores the 'backpointer' argument. That argument is - // needed so that we can use the same decoder code for LatticeFasterDecoderTpl - // and LatticeFasterOnlineDecoderTpl (which needs backpointers to support a - // fast way to obtain the best path). - inline StdToken(BaseFloat tot_cost, BaseFloat extra_cost, ForwardLinkT *links, - Token *next, Token *backpointer) - : tot_cost(tot_cost), - extra_cost(extra_cost), - links(links), - context_state(0), - next(next) {} -}; - -struct BackpointerToken { - using ForwardLinkT = ForwardLink; - using Token = BackpointerToken; - - // BackpointerToken is like Token but also - // Standard token type for LatticeFasterDecoder. Each active HCLG - // (decoding-graph) state on each frame has one token. - - // tot_cost is the total (LM + acoustic) cost from the beginning of the - // utterance up to this point. (but see cost_offset_, which is subtracted - // to keep it in a good numerical range). - BaseFloat tot_cost; - - // exta_cost is >= 0. 
After calling PruneForwardLinks, this equals - // the minimum difference between the cost of the best path, and the cost of - // this is on, and the cost of the absolute best path, under the assumption - // that any of the currently active states at the decoding front may - // eventually succeed (e.g. if you were to take the currently active states - // one by one and compute this difference, and then take the minimum). - BaseFloat extra_cost; - - int context_state = 0; - - // 'links' is the head of singly-linked list of ForwardLinks, which is what we - // use for lattice generation. - ForwardLinkT *links; - - // 'next' is the next in the singly-linked list of tokens for this frame. - BackpointerToken *next; - - // Best preceding BackpointerToken (could be a on this frame, connected to - // this via an epsilon transition, or on a previous frame). This is only - // required for an efficient GetBestPath function in - // LatticeFasterOnlineDecoderTpl; it plays no part in the lattice generation - // (the "links" list is what stores the forward links, for that). - Token *backpointer; - - inline void SetBackpointer(Token *backpointer) { - this->backpointer = backpointer; - } - - inline BackpointerToken(BaseFloat tot_cost, BaseFloat extra_cost, - ForwardLinkT *links, Token *next, Token *backpointer) - : tot_cost(tot_cost), - extra_cost(extra_cost), - links(links), - next(next), - backpointer(backpointer), - context_state(0) {} -}; - -} // namespace decoder - -/** This is the "normal" lattice-generating decoder. - See \ref lattices_generation \ref decoders_faster and \ref decoders_simple - for more information. - - The decoder is templated on the FST type and the token type. The token type - will normally be StdToken, but also may be BackpointerToken which is to - support quick lookup of the current best path (see - lattice-faster-online-decoder.h) - - The FST you invoke this decoder which is expected to equal - Fst::Fst, a.k.a. StdFst, or GrammarFst. If you invoke it with - FST == StdFst and it notices that the actual FST type is - fst::VectorFst or fst::ConstFst, the decoder object - will internally cast itself to one that is templated on those more specific - types; this is an optimization for speed. - */ -template -class LatticeFasterDecoderTpl { - public: - using Arc = typename FST::Arc; - using Label = typename Arc::Label; - using StateId = typename Arc::StateId; - using Weight = typename Arc::Weight; - using ForwardLinkT = decoder::ForwardLink; - - // Instantiate this class once for each thing you have to decode. - // This version of the constructor does not take ownership of - // 'fst'. - LatticeFasterDecoderTpl( - const FST &fst, const LatticeFasterDecoderConfig &config, - const std::shared_ptr &context_graph); - - // This version of the constructor takes ownership of the fst, and will delete - // it when this object is destroyed. - LatticeFasterDecoderTpl(const LatticeFasterDecoderConfig &config, FST *fst); - - void SetOptions(const LatticeFasterDecoderConfig &config) { - config_ = config; - } - - const LatticeFasterDecoderConfig &GetOptions() const { return config_; } - - ~LatticeFasterDecoderTpl(); - - /// Decodes until there are no more frames left in the "decodable" object.. - /// note, this may block waiting for input if the "decodable" object blocks. - /// Returns true if any kind of traceback is available (not necessarily from a - /// final state). - bool Decode(DecodableInterface *decodable); - - /// says whether a final-state was active on the last frame. 
If it was not, - /// the lattice (or traceback) will end with states that are not final-states. - bool ReachedFinal() const { - return FinalRelativeCost() != std::numeric_limits::infinity(); - } - - /// Outputs an FST corresponding to the single best path through the lattice. - /// Returns true if result is nonempty (using the return status is deprecated, - /// it will become void). If "use_final_probs" is true AND we reached the - /// final-state of the graph then it will include those as final-probs, else - /// it will treat all final-probs as one. Note: this just calls - /// GetRawLattice() and figures out the shortest path. - bool GetBestPath(Lattice *ofst, bool use_final_probs = true) const; - - /// Outputs an FST corresponding to the raw, state-level - /// tracebacks. Returns true if result is nonempty. - /// If "use_final_probs" is true AND we reached the final-state - /// of the graph then it will include those as final-probs, else - /// it will treat all final-probs as one. - /// The raw lattice will be topologically sorted. - /// - /// See also GetRawLatticePruned in lattice-faster-online-decoder.h, - /// which also supports a pruning beam, in case for some reason - /// you want it pruned tighter than the regular lattice beam. - /// We could put that here in future needed. - bool GetRawLattice(Lattice *ofst, bool use_final_probs = true) const; - - /// [Deprecated, users should now use GetRawLattice and determinize it - /// themselves, e.g. using DeterminizeLatticePhonePrunedWrapper]. - /// Outputs an FST corresponding to the lattice-determinized - /// lattice (one path per word sequence). Returns true if result is - /// nonempty. If "use_final_probs" is true AND we reached the final-state of - /// the graph then it will include those as final-probs, else it will treat - /// all final-probs as one. - bool GetLattice(CompactLattice *ofst, bool use_final_probs = true) const; - - /// InitDecoding initializes the decoding, and should only be used if you - /// intend to call AdvanceDecoding(). If you call Decode(), you don't need to - /// call this. You can also call InitDecoding if you have already decoded an - /// utterance and want to start with a new utterance. - void InitDecoding(); - - /// This will decode until there are no more frames ready in the decodable - /// object. You can keep calling it each time more frames become available. - /// If max_num_frames is specified, it specifies the maximum number of frames - /// the function will decode before returning. - void AdvanceDecoding(DecodableInterface *decodable, - int32 max_num_frames = -1); - - /// This function may be optionally called after AdvanceDecoding(), when you - /// do not plan to decode any further. It does an extra pruning step that - /// will help to prune the lattices output by GetLattice and (particularly) - /// GetRawLattice more completely, particularly toward the end of the - /// utterance. If you call this, you cannot call AdvanceDecoding again (it - /// will fail), and you cannot call GetLattice() and related functions with - /// use_final_probs = false. Used to be called PruneActiveTokensFinal(). - void FinalizeDecoding(); - - /// FinalRelativeCost() serves the same purpose as ReachedFinal(), but gives - /// more information. It returns the difference between the best (final-cost - /// plus cost) of any token on the final frame, and the best cost of any token - /// on the final frame. If it is infinity it means no final-states were - /// present on the final frame. It will usually be nonnegative. 
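// --- editorial aside (not part of the deleted file) -------------------------
// Pulling the public methods declared above together: a caller constructs the
// decoder over an HCLG-style FST, feeds it a DecodableInterface, and reads the
// best path back out. The sketch below is an illustrative usage outline, not
// code from the deleted sources; it assumes a Kaldi/WeNet build where this
// header is on the include path (the path may differ per build), where
// `decodable` wraps per-frame acoustic scores, and where passing a null
// context graph is acceptable (the decoder guards every context_graph_ use).
#include <memory>
#include "decoder/lattice-faster-decoder.h"  // path as laid out in this vendored tree

namespace editorial_sketch {

inline bool DecodeOneUtterance(const fst::Fst<fst::StdArc> &hclg,
                               kaldi::DecodableInterface *decodable,
                               kaldi::Lattice *best_path) {
  kaldi::LatticeFasterDecoderConfig config;  // defaults: beam 16, lattice-beam 10, ...
  kaldi::LatticeFasterDecoder decoder(hclg, config, /*context_graph=*/nullptr);
  decoder.InitDecoding();
  decoder.AdvanceDecoding(decodable, /*max_num_frames=*/-1);  // consume all ready frames
  decoder.FinalizeDecoding();  // optional extra pruning pass on the last frame
  return decoder.GetBestPath(best_path, /*use_final_probs=*/true);
}

}  // namespace editorial_sketch
// --- end editorial aside -----------------------------------------------------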
If it not - /// too positive (e.g. < 5 is my first guess, but this is not tested) you can - /// take it as a good indication that we reached the final-state with - /// reasonable likelihood. - BaseFloat FinalRelativeCost() const; - - // Returns the number of frames decoded so far. The value returned changes - // whenever we call ProcessEmitting(). - inline int32 NumFramesDecoded() const { return active_toks_.size() - 1; } - - protected: - // we make things protected instead of private, as code in - // LatticeFasterOnlineDecoderTpl, which inherits from this, also uses the - // internals. - - // Deletes the elements of the singly linked list tok->links. - inline static void DeleteForwardLinks(Token *tok); - - // head of per-frame list of Tokens (list is in topological order), - // and something saying whether we ever pruned it using PruneForwardLinks. - struct TokenList { - Token *toks; - bool must_prune_forward_links; - bool must_prune_tokens; - TokenList() - : toks(NULL), must_prune_forward_links(true), must_prune_tokens(true) {} - }; - - using Elem = typename HashList::Elem; - // Equivalent to: - // struct Elem { - // StateId key; - // Token *val; - // Elem *tail; - // }; - - void PossiblyResizeHash(size_t num_toks); - - // FindOrAddToken either locates a token in hash of toks_, or if necessary - // inserts a new, empty token (i.e. with no forward links) for the current - // frame. [note: it's inserted if necessary into hash toks_ and also into the - // singly linked list of tokens active on this frame (whose head is at - // active_toks_[frame]). The frame_plus_one argument is the acoustic frame - // index plus one, which is used to index into the active_toks_ array. - // Returns the Token pointer. Sets "changed" (if non-NULL) to true if the - // token was newly created or the cost changed. - // If Token == StdToken, the 'backpointer' argument has no purpose (and will - // hopefully be optimized out). - inline Elem *FindOrAddToken(StateId state, int32 frame_plus_one, - BaseFloat tot_cost, Token *backpointer, - bool *changed); - - // prunes outgoing links for all tokens in active_toks_[frame] - // it's called by PruneActiveTokens - // all links, that have link_extra_cost > lattice_beam are pruned - // delta is the amount by which the extra_costs must change - // before we set *extra_costs_changed = true. - // If delta is larger, we'll tend to go back less far - // toward the beginning of the file. - // extra_costs_changed is set to true if extra_cost was changed for any token - // links_pruned is set to true if any link in any token was pruned - void PruneForwardLinks(int32 frame_plus_one, bool *extra_costs_changed, - bool *links_pruned, BaseFloat delta); - - // This function computes the final-costs for tokens active on the final - // frame. It outputs to final-costs, if non-NULL, a map from the Token* - // pointer to the final-prob of the corresponding state, for all Tokens - // that correspond to states that have final-probs. This map will be - // empty if there were no final-probs. It outputs to - // final_relative_cost, if non-NULL, the difference between the best - // forward-cost including the final-prob cost, and the best forward-cost - // without including the final-prob cost (this will usually be positive), or - // infinity if there were no final-probs. [c.f. FinalRelativeCost(), which - // outputs this quanitity]. 
It outputs to final_best_cost, if - // non-NULL, the lowest for any token t active on the final frame, of - // forward-cost[t] + final-cost[t], where final-cost[t] is the final-cost in - // the graph of the state corresponding to token t, or the best of - // forward-cost[t] if there were no final-probs active on the final frame. - // You cannot call this after FinalizeDecoding() has been called; in that - // case you should get the answer from class-member variables. - void ComputeFinalCosts(unordered_map *final_costs, - BaseFloat *final_relative_cost, - BaseFloat *final_best_cost) const; - - // PruneForwardLinksFinal is a version of PruneForwardLinks that we call - // on the final frame. If there are final tokens active, it uses - // the final-probs for pruning, otherwise it treats all tokens as final. - void PruneForwardLinksFinal(); - - // Prune away any tokens on this frame that have no forward links. - // [we don't do this in PruneForwardLinks because it would give us - // a problem with dangling pointers]. - // It's called by PruneActiveTokens if any forward links have been pruned - void PruneTokensForFrame(int32 frame_plus_one); - - // Go backwards through still-alive tokens, pruning them if the - // forward+backward cost is more than lat_beam away from the best path. It's - // possible to prove that this is "correct" in the sense that we won't lose - // anything outside of lat_beam, regardless of what happens in the future. - // delta controls when it considers a cost to have changed enough to continue - // going backward and propagating the change. larger delta -> will recurse - // less far. - void PruneActiveTokens(BaseFloat delta); - - /// Gets the weight cutoff. Also counts the active tokens. - BaseFloat GetCutoff(Elem *list_head, size_t *tok_count, - BaseFloat *adaptive_beam, Elem **best_elem); - - /// Processes emitting arcs for one frame. Propagates from prev_toks_ to - /// cur_toks_. Returns the cost cutoff for subsequent ProcessNonemitting() to - /// use. - BaseFloat ProcessEmitting(DecodableInterface *decodable); - - /// Processes nonemitting (epsilon) arcs for one frame. Called after - /// ProcessEmitting() on each frame. The cost cutoff is computed by the - /// preceding ProcessEmitting(). - void ProcessNonemitting(BaseFloat cost_cutoff); - - // HashList defined in ../util/hash-list.h. It actually allows us to maintain - // more than one list (e.g. for current and previous frames), but only one of - // them at a time can be indexed by StateId. It is indexed by frame-index - // plus one, where the frame-index is zero-based, as used in decodable object. - // That is, the emitting probs of frame t are accounted for in tokens at - // toks_[t+1]. The zeroth frame is for nonemitting transition at the start of - // the graph. - HashList toks_; - - std::vector active_toks_; // Lists of tokens, indexed by - // frame (members of TokenList are toks, must_prune_forward_links, - // must_prune_tokens). - std::vector - queue_; // temp variable used in ProcessNonemitting, - std::vector tmp_array_; // used in GetCutoff. - - // fst_ is a pointer to the FST we are decoding from. - const FST *fst_; - // delete_fst_ is true if the pointer fst_ needs to be deleted when this - // object is destroyed. - bool delete_fst_; - - std::vector cost_offsets_; // This contains, for each - // frame, an offset that was added to the acoustic log-likelihoods on that - // frame in order to keep everything in a nice dynamic range i.e. close to - // zero, to reduce roundoff errors. 
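// --- editorial aside (not part of the deleted file) -------------------------
// The cost_offsets_ member above records, per frame, the constant folded into
// the acoustic costs (ProcessEmitting uses the negated cost of the best token)
// so that accumulated path costs stay near zero instead of growing with the
// utterance length; the offset is undone again when acoustic costs are read
// back out during lattice/best-path extraction. A toy illustration with plain
// floats; the names and numbers are illustrative only.
#include <cassert>
#include <cmath>
#include <vector>

namespace editorial_sketch {

inline void CostOffsetExample() {
  // Raw per-frame acoustic costs along one path (negated log-likelihoods).
  std::vector<float> raw_ac = {123.7f, 118.2f, 131.9f};
  // Per-frame offsets, e.g. minus the best token cost on each frame.
  std::vector<float> offsets = {-123.0f, -118.0f, -131.0f};

  float shifted_total = 0.0f, recovered_total = 0.0f;
  for (size_t t = 0; t < raw_ac.size(); ++t) {
    float shifted = raw_ac[t] + offsets[t];    // what the decoder accumulates
    shifted_total += shifted;                  // stays close to zero
    recovered_total += shifted - offsets[t];   // what extraction re-adds
  }
  assert(std::fabs(recovered_total - (123.7f + 118.2f + 131.9f)) < 1e-3f);
  assert(std::fabs(shifted_total - 1.8f) < 1e-3f);  // 0.7 + 0.2 + 0.9
}

}  // namespace editorial_sketch
// --- end editorial aside -----------------------------------------------------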
- LatticeFasterDecoderConfig config_; - int32 num_toks_; // current total #toks allocated... - bool warned_; - - /// decoding_finalized_ is true if someone called FinalizeDecoding(). [note, - /// calling this is optional]. If true, it's forbidden to decode more. Also, - /// if this is set, then the output of ComputeFinalCosts() is in the next - /// three variables. The reason we need to do this is that after - /// FinalizeDecoding() calls PruneTokensForFrame() for the final frame, some - /// of the tokens on the last frame are freed, so we free the list from toks_ - /// to avoid having dangling pointers hanging around. - bool decoding_finalized_; - /// For the meaning of the next 3 variables, see the comment for - /// decoding_finalized_ above., and ComputeFinalCosts(). - unordered_map final_costs_; - BaseFloat final_relative_cost_; - BaseFloat final_best_cost_; - - std::shared_ptr context_graph_ = nullptr; - - // There are various cleanup tasks... the toks_ structure contains - // singly linked lists of Token pointers, where Elem is the list type. - // It also indexes them in a hash, indexed by state (this hash is only - // maintained for the most recent frame). toks_.Clear() - // deletes them from the hash and returns the list of Elems. The - // function DeleteElems calls toks_.Delete(elem) for each elem in - // the list, which returns ownership of the Elem to the toks_ structure - // for reuse, but does not delete the Token pointer. The Token pointers - // are reference-counted and are ultimately deleted in PruneTokensForFrame, - // but are also linked together on each frame by their own linked-list, - // using the "next" pointer. We delete them manually. - void DeleteElems(Elem *list); - - // This function takes a singly linked list of tokens for a single frame, and - // outputs a list of them in topological order (it will crash if no such order - // can be found, which will typically be due to decoding graphs with epsilon - // cycles, which are not allowed). Note: the output list may contain NULLs, - // which the caller should pass over; it just happens to be more efficient for - // the algorithm to output a list that contains NULLs. - static void TopSortTokens(Token *tok_list, - std::vector *topsorted_list); - - void ClearActiveTokens(); - - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeFasterDecoderTpl); -}; - -typedef LatticeFasterDecoderTpl - LatticeFasterDecoder; - -} // end namespace kaldi. - -#endif // KALDI_DECODER_LATTICE_FASTER_DECODER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/decoder/lattice-faster-online-decoder.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/decoder/lattice-faster-online-decoder.cc deleted file mode 100644 index 2345b4d129ff905784762e973bad279f2fb55d31..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/decoder/lattice-faster-online-decoder.cc +++ /dev/null @@ -1,278 +0,0 @@ -// decoder/lattice-faster-online-decoder.cc - -// Copyright 2009-2012 Microsoft Corporation Mirko Hannemann -// 2013-2014 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2014 IMSL, PKU-HKUST (author: Wei Shi) -// 2018 Zhehuai Chen - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -// see note at the top of lattice-faster-decoder.cc, about how to maintain this -// file in sync with lattice-faster-decoder.cc - -#include -#include -#include -#include - -#include "decoder/lattice-faster-online-decoder.h" - -namespace kaldi { - -template -bool LatticeFasterOnlineDecoderTpl::TestGetBestPath( - bool use_final_probs) const { - Lattice lat1; - { - Lattice raw_lat; - this->GetRawLattice(&raw_lat, use_final_probs); - ShortestPath(raw_lat, &lat1); - } - Lattice lat2; - GetBestPath(&lat2, use_final_probs); - BaseFloat delta = 0.1; - int32 num_paths = 1; - if (!fst::RandEquivalent(lat1, lat2, num_paths, delta, rand())) { - KALDI_WARN << "Best-path test failed"; - return false; - } else { - return true; - } -} - -// Outputs an FST corresponding to the single best path through the lattice. -template -bool LatticeFasterOnlineDecoderTpl::GetBestPath( - Lattice *olat, bool use_final_probs) const { - olat->DeleteStates(); - BaseFloat final_graph_cost; - BestPathIterator iter = BestPathEnd(use_final_probs, &final_graph_cost); - if (iter.Done()) return false; // would have printed warning. - StateId state = olat->AddState(); - olat->SetFinal(state, LatticeWeight(final_graph_cost, 0.0)); - while (!iter.Done()) { - LatticeArc arc; - iter = TraceBackBestPath(iter, &arc); - arc.nextstate = state; - StateId new_state = olat->AddState(); - olat->AddArc(new_state, arc); - state = new_state; - } - olat->SetStart(state); - return true; -} - -template -typename LatticeFasterOnlineDecoderTpl::BestPathIterator -LatticeFasterOnlineDecoderTpl::BestPathEnd( - bool use_final_probs, BaseFloat *final_cost_out) const { - if (this->decoding_finalized_ && !use_final_probs) - KALDI_ERR << "You cannot call FinalizeDecoding() and then call " - << "BestPathEnd() with use_final_probs == false"; - KALDI_ASSERT(this->NumFramesDecoded() > 0 && - "You cannot call BestPathEnd if no frames were decoded."); - - unordered_map final_costs_local; - - const unordered_map &final_costs = - (this->decoding_finalized_ ? this->final_costs_ : final_costs_local); - if (!this->decoding_finalized_ && use_final_probs) - this->ComputeFinalCosts(&final_costs_local, NULL, NULL); - - // Singly linked list of tokens on last frame (access list through "next" - // pointer). - BaseFloat best_cost = std::numeric_limits::infinity(); - BaseFloat best_final_cost = 0; - Token *best_tok = NULL; - for (Token *tok = this->active_toks_.back().toks; tok != NULL; - tok = tok->next) { - BaseFloat cost = tok->tot_cost, final_cost = 0.0; - if (use_final_probs && !final_costs.empty()) { - // if we are instructed to use final-probs, and any final tokens were - // active on final frame, include the final-prob in the cost of the token. 
- typename unordered_map::const_iterator iter = - final_costs.find(tok); - if (iter != final_costs.end()) { - final_cost = iter->second; - cost += final_cost; - } else { - cost = std::numeric_limits::infinity(); - } - } - if (cost < best_cost) { - best_cost = cost; - best_tok = tok; - best_final_cost = final_cost; - } - } - if (best_tok == - NULL) { // this should not happen, and is likely a code error or - // caused by infinities in likelihoods, but I'm not making - // it a fatal error for now. - KALDI_WARN << "No final token found."; - } - if (final_cost_out) *final_cost_out = best_final_cost; - return BestPathIterator(best_tok, this->NumFramesDecoded() - 1); -} - -template -typename LatticeFasterOnlineDecoderTpl::BestPathIterator -LatticeFasterOnlineDecoderTpl::TraceBackBestPath(BestPathIterator iter, - LatticeArc *oarc) const { - KALDI_ASSERT(!iter.Done() && oarc != NULL); - Token *tok = static_cast(iter.tok); - int32 cur_t = iter.frame, step_t = 0; - if (tok->backpointer != NULL) { - // retrieve the correct forward link(with the best link cost) - BaseFloat best_cost = std::numeric_limits::infinity(); - ForwardLinkT *link; - for (link = tok->backpointer->links; link != NULL; link = link->next) { - if (link->next_tok == tok) { // this is a link to "tok" - BaseFloat graph_cost = link->graph_cost, - acoustic_cost = link->acoustic_cost; - BaseFloat cost = graph_cost + acoustic_cost; - if (cost < best_cost) { - oarc->ilabel = link->ilabel; - oarc->olabel = link->olabel; - if (link->ilabel != 0) { - KALDI_ASSERT(static_cast(cur_t) < - this->cost_offsets_.size()); - acoustic_cost -= this->cost_offsets_[cur_t]; - step_t = -1; - } else { - step_t = 0; - } - oarc->weight = LatticeWeight(graph_cost, acoustic_cost); - best_cost = cost; - } - } - } - if (link == NULL && - best_cost == - std::numeric_limits::infinity()) { // Did not find - // correct link. - KALDI_ERR << "Error tracing best-path back (likely " - << "bug in token-pruning algorithm)"; - } - } else { - oarc->ilabel = 0; - oarc->olabel = 0; - oarc->weight = LatticeWeight::One(); // zero costs. - } - return BestPathIterator(tok->backpointer, cur_t + step_t); -} - -template -bool LatticeFasterOnlineDecoderTpl::GetRawLatticePruned( - Lattice *ofst, bool use_final_probs, BaseFloat beam) const { - typedef LatticeArc Arc; - typedef Arc::StateId StateId; - typedef Arc::Weight Weight; - typedef Arc::Label Label; - - // Note: you can't use the old interface (Decode()) if you want to - // get the lattice with use_final_probs = false. You'd have to do - // InitDecoding() and then AdvanceDecoding(). - if (this->decoding_finalized_ && !use_final_probs) - KALDI_ERR << "You cannot call FinalizeDecoding() and then call " - << "GetRawLattice() with use_final_probs == false"; - - unordered_map final_costs_local; - - const unordered_map &final_costs = - (this->decoding_finalized_ ? this->final_costs_ : final_costs_local); - if (!this->decoding_finalized_ && use_final_probs) - this->ComputeFinalCosts(&final_costs_local, NULL, NULL); - - ofst->DeleteStates(); - // num-frames plus one (since frames are one-based, and we have - // an extra frame for the start-state). - int32 num_frames = this->active_toks_.size() - 1; - KALDI_ASSERT(num_frames > 0); - for (int32 f = 0; f <= num_frames; f++) { - if (this->active_toks_[f].toks == NULL) { - KALDI_WARN << "No tokens active on frame " << f - << ": not producing lattice.\n"; - return false; - } - } - unordered_map tok_map; - std::queue > tok_queue; - // First initialize the queue and states. 
Put the initial state on the queue; - // this is the last token in the list active_toks_[0].toks. - for (Token *tok = this->active_toks_[0].toks; tok != NULL; tok = tok->next) { - if (tok->next == NULL) { - tok_map[tok] = ofst->AddState(); - ofst->SetStart(tok_map[tok]); - std::pair tok_pair(tok, 0); // #frame = 0 - tok_queue.push(tok_pair); - } - } - - // Next create states for "good" tokens - while (!tok_queue.empty()) { - std::pair cur_tok_pair = tok_queue.front(); - tok_queue.pop(); - Token *cur_tok = cur_tok_pair.first; - int32 cur_frame = cur_tok_pair.second; - KALDI_ASSERT(cur_frame >= 0 && cur_frame <= this->cost_offsets_.size()); - - typename unordered_map::const_iterator iter = - tok_map.find(cur_tok); - KALDI_ASSERT(iter != tok_map.end()); - StateId cur_state = iter->second; - - for (ForwardLinkT *l = cur_tok->links; l != NULL; l = l->next) { - Token *next_tok = l->next_tok; - if (next_tok->extra_cost < beam) { - // so both the current and the next token are good; create the arc - int32 next_frame = l->ilabel == 0 ? cur_frame : cur_frame + 1; - StateId nextstate; - if (tok_map.find(next_tok) == tok_map.end()) { - nextstate = tok_map[next_tok] = ofst->AddState(); - tok_queue.push(std::pair(next_tok, next_frame)); - } else { - nextstate = tok_map[next_tok]; - } - BaseFloat cost_offset = - (l->ilabel != 0 ? this->cost_offsets_[cur_frame] : 0); - Arc arc(l->ilabel, l->olabel, - Weight(l->graph_cost, l->acoustic_cost - cost_offset), - nextstate); - ofst->AddArc(cur_state, arc); - } - } - if (cur_frame == num_frames) { - if (use_final_probs && !final_costs.empty()) { - typename unordered_map::const_iterator iter = - final_costs.find(cur_tok); - if (iter != final_costs.end()) - ofst->SetFinal(cur_state, LatticeWeight(iter->second, 0)); - } else { - ofst->SetFinal(cur_state, LatticeWeight::One()); - } - } - } - return (ofst->NumStates() != 0); -} - -// Instantiate the template for the FST types that we'll need. -template class LatticeFasterOnlineDecoderTpl >; -template class LatticeFasterOnlineDecoderTpl >; -template class LatticeFasterOnlineDecoderTpl >; - -} // end namespace kaldi. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/decoder/lattice-faster-online-decoder.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/decoder/lattice-faster-online-decoder.h deleted file mode 100644 index dc50cfa73e6574e9625eda9045c47f674fcbc1e3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/decoder/lattice-faster-online-decoder.h +++ /dev/null @@ -1,131 +0,0 @@ -// decoder/lattice-faster-online-decoder.h - -// Copyright 2009-2013 Microsoft Corporation; Mirko Hannemann; -// 2013-2014 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2018 Zhehuai Chen - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. 
-// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -// see note at the top of lattice-faster-decoder.h, about how to maintain this -// file in sync with lattice-faster-decoder.h - -#ifndef KALDI_DECODER_LATTICE_FASTER_ONLINE_DECODER_H_ -#define KALDI_DECODER_LATTICE_FASTER_ONLINE_DECODER_H_ - -#include "decoder/lattice-faster-decoder.h" - -#include - -namespace kaldi { - -/** LatticeFasterOnlineDecoderTpl is as LatticeFasterDecoderTpl but also - supports an efficient way to get the best path (see the function - BestPathEnd()), which is useful in endpointing and in situations where you - might want to frequently access the best path. - - This is only templated on the FST type, since the Token type is required to - be BackpointerToken. Actually it only makes sense to instantiate - LatticeFasterDecoderTpl with Token == BackpointerToken if you do so - indirectly via this child class. - */ -template -class LatticeFasterOnlineDecoderTpl - : public LatticeFasterDecoderTpl { - public: - using Arc = typename FST::Arc; - using Label = typename Arc::Label; - using StateId = typename Arc::StateId; - using Weight = typename Arc::Weight; - using Token = decoder::BackpointerToken; - using ForwardLinkT = decoder::ForwardLink; - - // Instantiate this class once for each thing you have to decode. - // This version of the constructor does not take ownership of - // 'fst'. - LatticeFasterOnlineDecoderTpl( - const FST &fst, const LatticeFasterDecoderConfig &config, - const std::shared_ptr &context_graph) - : LatticeFasterDecoderTpl(fst, config, context_graph) {} - - // This version of the initializer takes ownership of 'fst', and will delete - // it when this object is destroyed. - LatticeFasterOnlineDecoderTpl(const LatticeFasterDecoderConfig &config, - FST *fst) - : LatticeFasterDecoderTpl(config, fst) {} - - struct BestPathIterator { - void *tok; - int32 frame; - // note, "frame" is the frame-index of the frame you'll get the - // transition-id for next time, if you call TraceBackBestPath on this - // iterator (assuming it's not an epsilon transition). Note that this - // is one less than you might reasonably expect, e.g. it's -1 for - // the nonemitting transitions before the first frame. - BestPathIterator(void *t, int32 f) : tok(t), frame(f) {} - bool Done() const { return tok == NULL; } - }; - - /// Outputs an FST corresponding to the single best path through the lattice. - /// This is quite efficient because it doesn't get the entire raw lattice and - /// find the best path through it; instead, it uses the BestPathEnd and - /// BestPathIterator so it basically traces it back through the lattice. - /// Returns true if result is nonempty (using the return status is deprecated, - /// it will become void). If "use_final_probs" is true AND we reached the - /// final-state of the graph then it will include those as final-probs, else - /// it will treat all final-probs as one. - bool GetBestPath(Lattice *ofst, bool use_final_probs = true) const; - - /// This function does a self-test of GetBestPath(). Returns true on - /// success; returns false and prints a warning on failure. - bool TestGetBestPath(bool use_final_probs = true) const; - - /// This function returns an iterator that can be used to trace back - /// the best path. 
If use_final_probs == true and at least one final state - /// survived till the end, it will use the final-probs in working out the best - /// final Token, and will output the final cost to *final_cost (if non-NULL), - /// else it will use only the forward likelihood, and will put zero in - /// *final_cost (if non-NULL). - /// Requires that NumFramesDecoded() > 0. - BestPathIterator BestPathEnd(bool use_final_probs, - BaseFloat *final_cost = NULL) const; - - /// This function can be used in conjunction with BestPathEnd() to trace back - /// the best path one link at a time (e.g. this can be useful in endpoint - /// detection). By "link" we mean a link in the graph; not all links cross - /// frame boundaries, but each time you see a nonzero ilabel you can interpret - /// that as a frame. The return value is the updated iterator. It outputs - /// the ilabel and olabel, and the (graph and acoustic) weight to the "arc" - /// pointer, while leaving its "nextstate" variable unchanged. - BestPathIterator TraceBackBestPath(BestPathIterator iter, - LatticeArc *arc) const; - - /// Behaves the same as GetRawLattice but only processes tokens whose - /// extra_cost is smaller than the best-cost plus the specified beam. - /// It is only worthwhile to call this function if beam is less than - /// the lattice_beam specified in the config; otherwise, it would - /// return essentially the same thing as GetRawLattice, but more slowly. - bool GetRawLatticePruned(Lattice *ofst, bool use_final_probs, - BaseFloat beam) const; - - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeFasterOnlineDecoderTpl); -}; - -typedef LatticeFasterOnlineDecoderTpl LatticeFasterOnlineDecoder; - -} // end namespace kaldi. - -#endif // KALDI_DECODER_LATTICE_FASTER_ONLINE_DECODER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/fstbin/fstaddselfloops.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/fstbin/fstaddselfloops.cc deleted file mode 100644 index 145bf006f2324136c5fea4a8d0012a7a4126c646..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/fstbin/fstaddselfloops.cc +++ /dev/null @@ -1,100 +0,0 @@ -// fstbin/fstaddselfloops.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/determinize-star.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/parse-options.h" -#include "util/simple-io-funcs.h" - -/* some test examples: - pushd ~/tmpdir - ( echo 3; echo 4) > in.list - ( echo 5; echo 6) > out.list - ( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstaddselfloops in.list out.list - | fstprint ( echo "0 1 0 1"; echo " 0 2 1 0"; echo "1 0"; echo "2 0"; ) | - fstcompile | fstaddselfloops in.list out.list | fstprint -*/ - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Adds self-loops to states of an FST to propagate disambiguation " - "symbols through it\n" - "They are added on each final state and each state with non-epsilon " - "output symbols\n" - "on at least one arc out of the state. Useful in conjunction with " - "predeterminize\n" - "\n" - "Usage: fstaddselfloops in-disambig-list out-disambig-list [in.fst " - "[out.fst] ]\n" - "E.g: fstaddselfloops in.list out.list < in.fst > withloops.fst\n" - "in.list and out.list are lists of integers, one per line, of the\n" - "same length.\n"; - - ParseOptions po(usage); - po.Read(argc, argv); - - if (po.NumArgs() < 2 || po.NumArgs() > 4) { - po.PrintUsage(); - exit(1); - } - - std::string disambig_in_rxfilename = po.GetArg(1), - disambig_out_rxfilename = po.GetArg(2), - fst_in_filename = po.GetOptArg(3), - fst_out_filename = po.GetOptArg(4); - - VectorFst *fst = ReadFstKaldi(fst_in_filename); - - std::vector disambig_in; - if (!ReadIntegerVectorSimple(disambig_in_rxfilename, &disambig_in)) - KALDI_ERR - << "fstaddselfloops: Could not read disambiguation symbols from " - << kaldi::PrintableRxfilename(disambig_in_rxfilename); - - std::vector disambig_out; - if (!ReadIntegerVectorSimple(disambig_out_rxfilename, &disambig_out)) - KALDI_ERR - << "fstaddselfloops: Could not read disambiguation symbols from " - << kaldi::PrintableRxfilename(disambig_out_rxfilename); - - if (disambig_in.size() != disambig_out.size()) - KALDI_ERR - << "fstaddselfloops: mismatch in size of disambiguation symbols"; - - AddSelfLoops(fst, disambig_in, disambig_out); - - WriteFstKaldi(*fst, fst_out_filename); - - delete fst; - - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/fstbin/fstdeterminizestar.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/fstbin/fstdeterminizestar.cc deleted file mode 100644 index e818143025c0fd5d389c28c77715d65711fe63f1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/fstbin/fstdeterminizestar.cc +++ /dev/null @@ -1,114 +0,0 @@ -// fstbin/fstdeterminizestar.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/determinize-star.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/parse-options.h" -#if !defined(_MSC_VER) && !defined(__APPLE__) -#include // Comment this line and the call to signal below if -// it causes compilation problems. It is only to enable a debugging procedure -// when determinization does not terminate. We are disabling this code if -// compiling on Windows because signal.h is not available there, and on -// MacOS due to a problem with in the initial release of Sierra. -#endif - -/* some test examples: - ( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint - ( echo "0 0 1 0"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint - ( echo "0 0 1 0"; echo "0 1 1 0"; echo "0 0" ) | fstcompile | - fstdeterminizestar | fstprint # this last one fails [correctly]: ( echo "0 0 0 - 1"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint - - cd ~/tmpdir - while true; do - fstrand > 1.fst - fstpredeterminize out.lst 1.fst | fstdeterminizestar | fstrmsymbols out.lst - > 2.fst fstequivalent --random=true 1.fst 2.fst || echo "Test failed" echo -n - "." done - - Test of debugging [with non-determinizable input]: - ( echo " 0 0 1 0 1.0"; echo "0 1 1 0"; echo "1 1 1 0 0"; echo "0 2 2 0"; echo - "2"; echo "1" ) | fstcompile | fstdeterminizestar kill -SIGUSR1 [the process-id - of fstdeterminizestar] # prints out a bunch of debugging output showing the - mess it got itself into. -*/ - -bool debug_location = false; -void signal_handler(int) { debug_location = true; } - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Removes epsilons and determinizes in one step\n" - "\n" - "Usage: fstdeterminizestar [in.fst [out.fst] ]\n" - "\n" - "See also: fstdeterminizelog, lattice-determinize\n"; - - float delta = kDelta; - int max_states = -1; - bool use_log = false; - ParseOptions po(usage); - po.Register("use-log", &use_log, "Determinize in log semiring."); - po.Register("delta", &delta, - "Delta value used to determine equivalence of weights."); - po.Register( - "max-states", &max_states, - "Maximum number of states in determinized FST before it will abort."); - po.Read(argc, argv); - - if (po.NumArgs() > 2) { - po.PrintUsage(); - exit(1); - } - - std::string fst_in_str = po.GetOptArg(1), fst_out_str = po.GetOptArg(2); - - // This enables us to get traceback info from determinization that is - // not seeming to terminate. -#if !defined(_MSC_VER) && !defined(__APPLE__) - signal(SIGUSR1, signal_handler); -#endif - // Normal case: just files. - VectorFst *fst = ReadFstKaldi(fst_in_str); - - ArcSort(fst, ILabelCompare()); // improves speed. 
- if (use_log) { - DeterminizeStarInLog(fst, delta, &debug_location, max_states); - } else { - VectorFst det_fst; - DeterminizeStar(*fst, &det_fst, delta, &debug_location, max_states); - *fst = det_fst; // will do shallow copy and then det_fst goes - // out of scope anyway. - } - WriteFstKaldi(*fst, fst_out_str); - delete fst; - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/fstbin/fstisstochastic.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/fstbin/fstisstochastic.cc deleted file mode 100644 index 468ed0daa7d37cb9a25cf25264f86e48e137b975..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/fstbin/fstisstochastic.cc +++ /dev/null @@ -1,91 +0,0 @@ -// fstbin/fstisstochastic.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/kaldi-io.h" -#include "util/parse-options.h" - -// e.g. of test: -// echo " 0 0" | fstcompile | fstisstochastic -// should return 0 and print "0 0" [meaning, min and -// max weight are one = exp(0)] -// echo " 0 1" | fstcompile | fstisstochastic -// should return 1, not stochastic, and print 1 1 -// (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo "1 0" ) | -// fstcompile | fstisstochastic should return 0, stochastic; it prints "0 -// -1.78e-07" for me (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo -// "1 0" ) | fstcompile | fstisstochastic --test-in-log=false should return 1, -// not stochastic in tropical; it prints "0 0.693147" for me (echo "0 0 0 0 0 "; -// echo "0 1 0 0 0 "; echo "1 0" ) | fstcompile | fstisstochastic -// --test-in-log=false should return 0, stochastic in tropical; it prints "0 0" -// for me (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo "1 0" ) | -// fstcompile | fstisstochastic --test-in-log=false --delta=1 returns 0 even -// though not stochastic because we gave it an absurdly large delta. 
- -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Checks whether an FST is stochastic and exits with success if so.\n" - "Prints out maximum error (in log units).\n" - "\n" - "Usage: fstisstochastic [ in.fst ]\n"; - - float delta = 0.01; - bool test_in_log = true; - - ParseOptions po(usage); - po.Register("delta", &delta, "Maximum error to accept."); - po.Register("test-in-log", &test_in_log, - "Test stochasticity in log semiring."); - po.Read(argc, argv); - - if (po.NumArgs() > 1) { - po.PrintUsage(); - exit(1); - } - - std::string fst_in_filename = po.GetOptArg(1); - - Fst *fst = ReadFstKaldiGeneric(fst_in_filename); - - bool ans; - StdArc::Weight min, max; - if (test_in_log) - ans = IsStochasticFstInLog(*fst, delta, &min, &max); - else - ans = IsStochasticFst(*fst, delta, &min, &max); - - std::cout << min.Value() << " " << max.Value() << '\n'; - delete fst; - if (ans) - return 0; // success; - else - return 1; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/fstbin/fstminimizeencoded.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/fstbin/fstminimizeencoded.cc deleted file mode 100644 index ae9ca6d75abe67d9a195572dd6d91ec3c7b44851..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/fstbin/fstminimizeencoded.cc +++ /dev/null @@ -1,74 +0,0 @@ -// fstbin/fstminimizeencoded.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/determinize-star.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/kaldi-io.h" -#include "util/parse-options.h" -#include "util/text-utils.h" - -/* some test examples: - ( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstminimizeencoded | fstprint - ( echo "0 1 0 0"; echo " 0 2 0 0"; echo "1 0"; echo "2 0"; ) | fstcompile | - fstminimizeencoded | fstprint -*/ - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Minimizes FST after encoding [similar to fstminimize, but no " - "weight-pushing]\n" - "\n" - "Usage: fstminimizeencoded [in.fst [out.fst] ]\n"; - - float delta = kDelta; - ParseOptions po(usage); - po.Register("delta", &delta, - "Delta likelihood used for quantization of weights"); - po.Read(argc, argv); - - if (po.NumArgs() > 2) { - po.PrintUsage(); - exit(1); - } - - std::string fst_in_filename = po.GetOptArg(1), - fst_out_filename = po.GetOptArg(2); - - VectorFst *fst = ReadFstKaldi(fst_in_filename); - - MinimizeEncoded(fst, delta); - - WriteFstKaldi(*fst, fst_out_filename); - - delete fst; - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/fstbin/fsttablecompose.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/fstbin/fsttablecompose.cc deleted file mode 100644 index bdd476da78b8cb8823c60abf33b5278e05bfd92c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/fstbin/fsttablecompose.cc +++ /dev/null @@ -1,133 +0,0 @@ -// fstbin/fsttablecompose.cc - -// Copyright 2009-2011 Microsoft Corporation -// 2013 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "fstext/table-matcher.h" -#include "util/parse-options.h" - -/* - cd ~/tmpdir - while true; do - fstrand | fstarcsort --sort_type=olabel > 1.fst; fstrand | fstarcsort - > 2.fst fstcompose 1.fst 2.fst > 3a.fst fsttablecompose 1.fst 2.fst > 3b.fst - fstequivalent --random=true 3a.fst 3b.fst || echo "Test failed" - echo -n "." - done - -*/ - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - /* - fsttablecompose should always give equivalent results to compose, - but it is more efficient for certain kinds of inputs. 
- In particular, it is useful when, say, the left FST has states - that typically either have epsilon olabels, or - one transition out for each of the possible symbols (as the - olabel). The same with the input symbols of the right-hand FST - is possible. - */ - - const char *usage = - "Composition algorithm [between two FSTs of standard type, in " - "tropical\n" - "semiring] that is more efficient for certain cases-- in particular,\n" - "where one of the FSTs (the left one, if --match-side=left) has large\n" - "out-degree\n" - "\n" - "Usage: fsttablecompose (fst1-rxfilename|fst1-rspecifier) " - "(fst2-rxfilename|fst2-rspecifier) [(out-rxfilename|out-rspecifier)]\n"; - - ParseOptions po(usage); - - TableComposeOptions opts; - std::string match_side = "left"; - std::string compose_filter = "sequence"; - - po.Register("connect", &opts.connect, "If true, trim FST before output."); - po.Register("match-side", &match_side, - "Side of composition to do table " - "match, one of: \"left\" or \"right\"."); - po.Register("compose-filter", &compose_filter, - "Composition filter to use, " - "one of: \"alt_sequence\", \"auto\", \"match\", \"sequence\""); - - po.Read(argc, argv); - - if (match_side == "left") { - opts.table_match_type = MATCH_OUTPUT; - } else if (match_side == "right") { - opts.table_match_type = MATCH_INPUT; - } else { - KALDI_ERR << "Invalid match-side option: " << match_side; - } - - if (compose_filter == "alt_sequence") { - opts.filter_type = ALT_SEQUENCE_FILTER; - } else if (compose_filter == "auto") { - opts.filter_type = AUTO_FILTER; - } else if (compose_filter == "match") { - opts.filter_type = MATCH_FILTER; - } else if (compose_filter == "sequence") { - opts.filter_type = SEQUENCE_FILTER; - } else { - KALDI_ERR << "Invalid compose-filter option: " << compose_filter; - } - - if (po.NumArgs() < 2 || po.NumArgs() > 3) { - po.PrintUsage(); - exit(1); - } - - std::string fst1_in_str = po.GetArg(1), fst2_in_str = po.GetArg(2), - fst_out_str = po.GetOptArg(3); - - VectorFst *fst1 = ReadFstKaldi(fst1_in_str); - - VectorFst *fst2 = ReadFstKaldi(fst2_in_str); - - // Checks if is olabel sorted and is ilabel sorted. - if (fst1->Properties(fst::kOLabelSorted, true) == 0) { - KALDI_WARN << "The first FST is not olabel sorted."; - } - if (fst2->Properties(fst::kILabelSorted, true) == 0) { - KALDI_WARN << "The second FST is not ilabel sorted."; - } - - VectorFst composed_fst; - - TableCompose(*fst1, *fst2, &composed_fst, opts); - - delete fst1; - delete fst2; - - WriteFstKaldi(composed_fst, fst_out_str); - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/fstext/determinize-lattice-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/fstext/determinize-lattice-inl.h deleted file mode 100644 index 0bfbc8f41c7e439b1fac037f60490e04fdcbdd8b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/fstext/determinize-lattice-inl.h +++ /dev/null @@ -1,1357 +0,0 @@ -// fstext/determinize-lattice-inl.h - -// Copyright 2009-2012 Microsoft Corporation -// 2012-2013 Johns Hopkins University (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_FSTEXT_DETERMINIZE_LATTICE_INL_H_ -#define KALDI_FSTEXT_DETERMINIZE_LATTICE_INL_H_ -// Do not include this file directly. It is included by determinize-lattice.h - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace fst { - -// This class maps back and forth from/to integer id's to sequences of strings. -// used in determinization algorithm. It is constructed in such a way that -// finding the string-id of the successor of (string, next-label) has constant -// time. - -// Note: class IntType, typically int32, is the type of the element in the -// string (typically a template argument of the CompactLatticeWeightTpl). - -template -class LatticeStringRepository { - public: - struct Entry { - const Entry *parent; // NULL for empty string. - IntType i; - inline bool operator==(const Entry &other) const { - return (parent == other.parent && i == other.i); - } - Entry() {} - Entry(const Entry &e) : parent(e.parent), i(e.i) {} - }; - // Note: all Entry* pointers returned in function calls are - // owned by the repository itself, not by the caller! - - // Interface guarantees empty string is NULL. - inline const Entry *EmptyString() { return NULL; } - - // Returns string of "parent" with i appended. Pointer - // owned by repository - const Entry *Successor(const Entry *parent, IntType i) { - new_entry_->parent = parent; - new_entry_->i = i; - - std::pair pr = set_.insert(new_entry_); - if (pr.second) { // Was successfully inserted (was not there). We need to - // replace the element we inserted, which resides on the - // stack, with one from the heap. - const Entry *ans = new_entry_; - new_entry_ = new Entry(); - return ans; - } else { // Was not inserted because an equivalent Entry already - // existed. - return *pr.first; - } - } - - const Entry *Concatenate(const Entry *a, const Entry *b) { - if (a == NULL) - return b; - else if (b == NULL) - return a; - std::vector v; - ConvertToVector(b, &v); - const Entry *ans = a; - for (size_t i = 0; i < v.size(); i++) ans = Successor(ans, v[i]); - return ans; - } - const Entry *CommonPrefix(const Entry *a, const Entry *b) { - std::vector a_vec, b_vec; - ConvertToVector(a, &a_vec); - ConvertToVector(b, &b_vec); - const Entry *ans = NULL; - for (size_t i = 0; - i < a_vec.size() && i < b_vec.size() && a_vec[i] == b_vec[i]; i++) - ans = Successor(ans, a_vec[i]); - return ans; - } - - // removes any elements from b that are not part of - // a common prefix with a. - void ReduceToCommonPrefix(const Entry *a, std::vector *b) { - size_t a_size = Size(a), b_size = b->size(); - while (a_size > b_size) { - a = a->parent; - a_size--; - } - if (b_size > a_size) b_size = a_size; - typename std::vector::iterator b_begin = b->begin(); - while (a_size != 0) { - if (a->i != *(b_begin + a_size - 1)) b_size = a_size - 1; - a = a->parent; - a_size--; - } - if (b_size != b->size()) b->resize(b_size); - } - - // removes the first n elements of a. 
- const Entry *RemovePrefix(const Entry *a, size_t n) { - if (n == 0) return a; - std::vector a_vec; - ConvertToVector(a, &a_vec); - assert(a_vec.size() >= n); - const Entry *ans = NULL; - for (size_t i = n; i < a_vec.size(); i++) ans = Successor(ans, a_vec[i]); - return ans; - } - - // Returns true if a is a prefix of b. If a is prefix of b, - // time taken is |b| - |a|. Else, time taken is |b|. - bool IsPrefixOf(const Entry *a, const Entry *b) const { - if (a == NULL) return true; // empty string prefix of all. - if (a == b) return true; - if (b == NULL) return false; - return IsPrefixOf(a, b->parent); - } - - inline size_t Size(const Entry *entry) const { - size_t ans = 0; - while (entry != NULL) { - ans++; - entry = entry->parent; - } - return ans; - } - - void ConvertToVector(const Entry *entry, std::vector *out) const { - size_t length = Size(entry); - out->resize(length); - if (entry != NULL) { - typename std::vector::reverse_iterator iter = out->rbegin(); - while (entry != NULL) { - *iter = entry->i; - entry = entry->parent; - ++iter; - } - } - } - - const Entry *ConvertFromVector(const std::vector &vec) { - const Entry *e = NULL; - for (size_t i = 0; i < vec.size(); i++) e = Successor(e, vec[i]); - return e; - } - - LatticeStringRepository() { new_entry_ = new Entry; } - - void Destroy() { - for (typename SetType::iterator iter = set_.begin(); iter != set_.end(); - ++iter) - delete *iter; - SetType tmp; - tmp.swap(set_); - if (new_entry_) { - delete new_entry_; - new_entry_ = NULL; - } - } - - // Rebuild will rebuild this object, guaranteeing only - // to preserve the Entry values that are in the vector pointed - // to (this list does not have to be unique). The point of - // this is to save memory. - void Rebuild(const std::vector &to_keep) { - SetType tmp_set; - for (typename std::vector::const_iterator iter = - to_keep.begin(); - iter != to_keep.end(); ++iter) - RebuildHelper(*iter, &tmp_set); - // Now delete all elems not in tmp_set. - for (typename SetType::iterator iter = set_.begin(); iter != set_.end(); - ++iter) { - if (tmp_set.count(*iter) == 0) - delete (*iter); // delete the Entry; not needed. - } - set_.swap(tmp_set); - } - - ~LatticeStringRepository() { Destroy(); } - int32 MemSize() const { - return set_.size() * sizeof(Entry) * 2; // this is a lower bound - // on the size this structure might take. - } - - private: - class EntryKey { // Hash function object. - public: - inline size_t operator()(const Entry *entry) const { - size_t prime = 49109; - return static_cast(entry->i) + - prime * reinterpret_cast(entry->parent); - } - }; - class EntryEqual { - public: - inline bool operator()(const Entry *e1, const Entry *e2) const { - return (*e1 == *e2); - } - }; - typedef std::unordered_set SetType; - - void RebuildHelper(const Entry *to_add, SetType *tmp_set) { - while (true) { - if (to_add == NULL) return; - typename SetType::iterator iter = tmp_set->find(to_add); - if (iter == tmp_set->end()) { // not in tmp_set. - tmp_set->insert(to_add); - to_add = to_add->parent; // and loop. - } else { - return; - } - } - } - - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeStringRepository); - Entry *new_entry_; // We always have a pre-allocated Entry ready to use, - // to avoid unnecessary news and deletes. - SetType set_; -}; - -// class LatticeDeterminizer is templated on the same types that -// CompactLatticeWeight is templated on: the base weight (Weight), typically -// LatticeWeightTpl etc. but could also be e.g. 
TropicalWeight, and the -// IntType, typically int32, used for the output symbols in the compact -// representation of strings [note: the output symbols would usually be -// p.d.f. id's in the anticipated use of this code] It has a special requirement -// on the Weight type: that there should be a Compare function on the weights -// such that Compare(w1, w2) returns -1 if w1 < w2, 0 if w1 == w2, and +1 if w1 -// > w2. This requires that there be a total order on the weights. - -template -class LatticeDeterminizer { - public: - // Output to Gallic acceptor (so the strings go on weights, and there is a 1-1 - // correspondence between our states and the states in ofst. If destroy == - // true, release memory as we go (but we cannot output again). - - typedef CompactLatticeWeightTpl CompactWeight; - typedef ArcTpl - CompactArc; // arc in compact, acceptor form of lattice - typedef ArcTpl Arc; // arc in non-compact version of lattice - - // Output to standard FST with CompactWeightTpl as its weight type - // (the weight stores the original output-symbol strings). If destroy == - // true, release memory as we go (but we cannot output again). - void Output(MutableFst *ofst, bool destroy = true) { - assert(determinized_); - typedef typename Arc::StateId StateId; - StateId nStates = static_cast(output_arcs_.size()); - if (destroy) FreeMostMemory(); - ofst->DeleteStates(); - ofst->SetStart(kNoStateId); - if (nStates == 0) { - return; - } - for (StateId s = 0; s < nStates; s++) { - OutputStateId news = ofst->AddState(); - assert(news == s); - } - ofst->SetStart(0); - // now process transitions. - for (StateId this_state = 0; this_state < nStates; this_state++) { - std::vector &this_vec(output_arcs_[this_state]); - typename std::vector::const_iterator iter = this_vec.begin(), - end = this_vec.end(); - - for (; iter != end; ++iter) { - const TempArc &temp_arc(*iter); - CompactArc new_arc; - std::vector is not treated as epsilon, create a common end state for - // all transitions accepting the , since they do not back off. This small - // optimization saves about 2% states in an average grammar. - if (sub_eps_ == 0) { - eos_state_ = fst_->AddState(); - fst_->SetFinal(eos_state_, 0); - } -} - -template -void ArpaLmCompilerImpl::ConsumeNGram(const NGram& ngram, - bool is_highest) { - // Generally, we do the following. Suppose we are adding an n-gram "A B - // C". Then find the node for "A B", add a new node for "A B C", and connect - // them with the arc accepting "C" with the specified weight. Also, add a - // backoff arc from the new "A B C" node to its backoff state "B C". - // - // Two notable exceptions are the highest order n-grams, and final n-grams. - // - // When adding a highest order n-gram (e. g., our "A B C" is in a 3-gram LM), - // the following optimization is performed. There is no point adding a node - // for "A B C" with a "C" arc from "A B", since there will be no other - // arcs ingoing to this node, and an epsilon backoff arc into the backoff - // model "B C", with the weight of \bar{1}. To save a node, create an arc - // accepting "C" directly from "A B" to "B C". This saves as many nodes - // as there are the highest order n-grams, which is typically about half - // the size of a large 3-gram model. - // - // Indeed, this does not apply to n-grams ending in EOS, since they do not - // back off. These are special, as they do not have a back-off state, and - // the node for "(..anything..) " is always final. 
These are handled - // in one of the two possible ways, If symbols and are being - // replaced by epsilons, neither node nor arc is created, and the logprob - // of the n-gram is applied to its source node as final weight. If and - // are preserved, then a special final node for is allocated and - // used as the destination of the "" acceptor arc. - HistKey heads(ngram.words.begin(), ngram.words.end() - 1); - typename HistoryMap::iterator source_it = history_.find(heads); - if (source_it == history_.end()) { - // There was no "A B", therefore the probability of "A B C" is zero. - // Print a warning and discard current n-gram. - if (parent_->ShouldWarn()) - KALDI_WARN << parent_->LineReference() - << " skipped: no parent (n-1)-gram exists"; - return; - } - - StateId source = source_it->second; - StateId dest; - Symbol sym = ngram.words.back(); - float weight = -ngram.logprob; - if (sym == sub_eps_ || sym == 0) { - KALDI_ERR << " or disambiguation symbol " << sym - << "found in the ARPA file. "; - } - if (sym == eos_symbol_) { - if (sub_eps_ == 0) { - // Keep as a real symbol when not substituting. - dest = eos_state_; - } else { - // Treat as if it was epsilon: mark source final, with the weight - // of the n-gram. - fst_->SetFinal(source, weight); - return; - } - } else { - // For the highest order n-gram, this may find an existing state, for - // non-highest, will create one (unless there are duplicate n-grams - // in the grammar, which cannot be reliably detected if highest order, - // so we better do not do that at all). - dest = AddStateWithBackoff( - HistKey(ngram.words.begin() + (is_highest ? 1 : 0), ngram.words.end()), - -ngram.backoff); - } - - if (sym == bos_symbol_) { - weight = 0; // Accepting is always free. - if (sub_eps_ == 0) { - // is as a real symbol, only accepted in the start state. - source = fst_->AddState(); - fst_->SetStart(source); - } else { - // The new state for unigram history *is* the start state. - fst_->SetStart(dest); - return; - } - } - - // Add arc from source to dest, whichever way it was found. - fst_->AddArc(source, fst::StdArc(sym, sym, weight, dest)); - return; -} - -// Find or create a new state for n-gram defined by key, and ensure it has a -// backoff transition. The key is either the current n-gram for all but -// highest orders, or the tails of the n-gram for the highest order. The -// latter arises from the chain-collapsing optimization described above. -template -StateId ArpaLmCompilerImpl::AddStateWithBackoff(HistKey key, - float backoff) { - typename HistoryMap::iterator dest_it = history_.find(key); - if (dest_it != history_.end()) { - // Found an existing state in the history map. Invariant: if the state in - // the map, then its backoff arc is in the FST. We are done. - return dest_it->second; - } - // Otherwise create a new state and its backoff arc, and register in the map. - StateId dest = fst_->AddState(); - history_[key] = dest; - CreateBackoff(key.Tails(), dest, backoff); - return dest; -} - -// Create a backoff arc for a state. Key is a backoff destination that may or -// may not exist. When the destination is not found, naturally fall back to -// the lower order model, and all the way down until one is found (since the -// 0-gram model is always present, the search is guaranteed to terminate). 
-template -inline void ArpaLmCompilerImpl::CreateBackoff(HistKey key, - StateId state, - float weight) { - typename HistoryMap::iterator dest_it = history_.find(key); - while (dest_it == history_.end()) { - key = key.Tails(); - dest_it = history_.find(key); - } - - // The arc should transduce either or #0 to , depending on the - // epsilon substitution mode. This is the only case when input and output - // label may differ. - fst_->AddArc(state, fst::StdArc(sub_eps_, 0, weight, dest_it->second)); -} - -ArpaLmCompiler::~ArpaLmCompiler() { - if (impl_ != NULL) delete impl_; -} - -void ArpaLmCompiler::HeaderAvailable() { - KALDI_ASSERT(impl_ == NULL); - // Use optimized implementation if the grammar is 4-gram or less, and the - // maximum attained symbol id will fit into the optimized range. - int64 max_symbol = 0; - if (Symbols() != NULL) max_symbol = Symbols()->AvailableKey() - 1; - // If augmenting the symbol table, assume the worst case when all words in - // the model being read are novel. - if (Options().oov_handling == ArpaParseOptions::kAddToSymbols) - max_symbol += NgramCounts()[0]; - - if (NgramCounts().size() <= 4 && max_symbol < OptimizedHistKey::kMaxData) { - impl_ = new ArpaLmCompilerImpl(this, &fst_, sub_eps_); - } else { - impl_ = new ArpaLmCompilerImpl(this, &fst_, sub_eps_); - KALDI_LOG << "Reverting to slower state tracking because model is large: " - << NgramCounts().size() << "-gram with symbols up to " - << max_symbol; - } -} - -void ArpaLmCompiler::ConsumeNGram(const NGram& ngram) { - // is invalid in tails, in heads of an n-gram. - for (int i = 0; i < ngram.words.size(); ++i) { - if ((i > 0 && ngram.words[i] == Options().bos_symbol) || - (i + 1 < ngram.words.size() && - ngram.words[i] == Options().eos_symbol)) { - if (ShouldWarn()) - KALDI_WARN << LineReference() - << " skipped: n-gram has invalid BOS/EOS placement"; - return; - } - } - - bool is_highest = ngram.words.size() == NgramCounts().size(); - impl_->ConsumeNGram(ngram, is_highest); -} - -void ArpaLmCompiler::RemoveRedundantStates() { - fst::StdArc::Label backoff_symbol = sub_eps_; - if (backoff_symbol == 0) { - // The method of removing redundant states implemented in this function - // leads to slow determinization of L o G when people use the older style of - // usage of arpa2fst where the --disambig-symbol option was not specified. - // The issue seems to be that it creates a non-deterministic FST, while G is - // supposed to be deterministic. By 'return'ing below, we just disable this - // method if people were using an older script. This method isn't really - // that consequential anyway, and people will move to the newer-style - // scripts (see current utils/format_lm.sh), so this isn't much of a - // problem. - return; - } - - fst::StdArc::StateId num_states = fst_.NumStates(); - - // replace the #0 symbols on the input of arcs out of redundant states (states - // that are not final and have only a backoff arc leaving them), with . 
- for (fst::StdArc::StateId state = 0; state < num_states; state++) { - if (fst_.NumArcs(state) == 1 && - fst_.Final(state) == fst::TropicalWeight::Zero()) { - fst::MutableArcIterator iter(&fst_, state); - fst::StdArc arc = iter.Value(); - if (arc.ilabel == backoff_symbol) { - arc.ilabel = 0; - iter.SetValue(arc); - } - } - } - - // we could call fst::RemoveEps, and it would have the same effect in normal - // cases, where backoff_symbol != 0 and there are no epsilons in unexpected - // places, but RemoveEpsLocal is a bit safer in case something weird is going - // on; it guarantees not to blow up the FST. - fst::RemoveEpsLocal(&fst_); - KALDI_LOG << "Reduced num-states from " << num_states << " to " - << fst_.NumStates(); -} - -void ArpaLmCompiler::Check() const { - if (fst_.Start() == fst::kNoStateId) { - KALDI_ERR << "Arpa file did not contain the beginning-of-sentence symbol " - << Symbols()->Find(Options().bos_symbol) << "."; - } -} - -void ArpaLmCompiler::ReadComplete() { - fst_.SetInputSymbols(Symbols()); - fst_.SetOutputSymbols(Symbols()); - RemoveRedundantStates(); - Check(); -} - -} // namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/lm/arpa-lm-compiler.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/lm/arpa-lm-compiler.h deleted file mode 100644 index 069c71bd0e6f5acf0b9521ec1ef46796eb31fe4d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/lm/arpa-lm-compiler.h +++ /dev/null @@ -1,63 +0,0 @@ -// lm/arpa-lm-compiler.h - -// Copyright 2009-2011 Gilles Boulianne -// Copyright 2016 Smart Action LLC (kkm) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_LM_ARPA_LM_COMPILER_H_ -#define KALDI_LM_ARPA_LM_COMPILER_H_ - -#include - -#include "lm/arpa-file-parser.h" - -namespace kaldi { - -class ArpaLmCompilerImplInterface; - -class ArpaLmCompiler : public ArpaFileParser { - public: - ArpaLmCompiler(const ArpaParseOptions& options, int sub_eps, - fst::SymbolTable* symbols) - : ArpaFileParser(options, symbols), sub_eps_(sub_eps), impl_(NULL) {} - ~ArpaLmCompiler(); - - const fst::StdVectorFst& Fst() const { return fst_; } - fst::StdVectorFst* MutableFst() { return &fst_; } - - protected: - // ArpaFileParser overrides. - virtual void HeaderAvailable(); - virtual void ConsumeNGram(const NGram& ngram); - virtual void ReadComplete(); - - private: - // this function removes states that only have a backoff arc coming - // out of them. - void RemoveRedundantStates(); - void Check() const; - - int sub_eps_; - ArpaLmCompilerImplInterface* impl_; // Owned. 
- fst::StdVectorFst fst_; - template - friend class ArpaLmCompilerImpl; -}; - -} // namespace kaldi - -#endif // KALDI_LM_ARPA_LM_COMPILER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/lmbin/arpa2fst.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/lmbin/arpa2fst.cc deleted file mode 100644 index 881a45c5b37810247ea38dae56237f59b5554a9c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/lmbin/arpa2fst.cc +++ /dev/null @@ -1,145 +0,0 @@ -// bin/arpa2fst.cc -// -// Copyright 2009-2011 Gilles Boulianne. -// -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABILITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "lm/arpa-lm-compiler.h" -#include "util/kaldi-io.h" -#include "util/parse-options.h" - -int main(int argc, char *argv[]) { - using namespace kaldi; // NOLINT - try { - const char *usage = - "Convert an ARPA format language model into an FST\n" - "Usage: arpa2fst [opts] \n" - " e.g.: arpa2fst --disambig-symbol=#0 --read-symbol-table=" - "data/lang/words.txt lm/input.arpa G.fst\n\n" - "Note: When called without switches, the output G.fst will contain\n" - "an embedded symbol table. This is compatible with the way a previous\n" - "version of arpa2fst worked.\n"; - - ParseOptions po(usage); - - ArpaParseOptions options; - options.Register(&po); - - // Option flags. - std::string bos_symbol = ""; - std::string eos_symbol = ""; - std::string disambig_symbol; - std::string read_syms_filename; - std::string write_syms_filename; - bool keep_symbols = false; - bool ilabel_sort = true; - - po.Register("bos-symbol", &bos_symbol, "Beginning of sentence symbol"); - po.Register("eos-symbol", &eos_symbol, "End of sentence symbol"); - po.Register("disambig-symbol", &disambig_symbol, - "Disambiguator. If provided (e. g. #0), used on input side of " - "backoff links, and and are replaced with epsilons"); - po.Register("read-symbol-table", &read_syms_filename, - "Use existing symbol table"); - po.Register("write-symbol-table", &write_syms_filename, - "Write generated symbol table to a file"); - po.Register("keep-symbols", &keep_symbols, - "Store symbol table with FST. Symbols always saved to FST if " - "symbol tables are neither read or written (otherwise symbols " - "would be lost entirely)"); - po.Register("ilabel-sort", &ilabel_sort, "Ilabel-sort the output FST"); - - po.Read(argc, argv); - - if (po.NumArgs() != 1 && po.NumArgs() != 2) { - po.PrintUsage(); - exit(1); - } - std::string arpa_rxfilename = po.GetArg(1), - fst_wxfilename = po.GetOptArg(2); - - int64 disambig_symbol_id = 0; - - fst::SymbolTable *symbols; - if (!read_syms_filename.empty()) { - // Use existing symbols. Required symbols must be in the table. 
- kaldi::Input kisym(read_syms_filename); - symbols = fst::SymbolTable::ReadText( - kisym.Stream(), PrintableWxfilename(read_syms_filename)); - if (symbols == NULL) - KALDI_ERR << "Could not read symbol table from file " - << read_syms_filename; - - options.oov_handling = ArpaParseOptions::kSkipNGram; - if (!disambig_symbol.empty()) { - disambig_symbol_id = symbols->Find(disambig_symbol); - if (disambig_symbol_id == -1) // fst::kNoSymbol - KALDI_ERR << "Symbol table " << read_syms_filename - << " has no symbol for " << disambig_symbol; - } - } else { - // Create a new symbol table and populate it from ARPA file. - symbols = new fst::SymbolTable(PrintableWxfilename(fst_wxfilename)); - options.oov_handling = ArpaParseOptions::kAddToSymbols; - symbols->AddSymbol("", 0); - if (!disambig_symbol.empty()) { - disambig_symbol_id = symbols->AddSymbol(disambig_symbol); - } - } - - // Add or use existing BOS and EOS. - options.bos_symbol = symbols->AddSymbol(bos_symbol); - options.eos_symbol = symbols->AddSymbol(eos_symbol); - - // If producing new (not reading existing) symbols and not saving them, - // need to keep symbols with FST, otherwise they would be lost. - if (read_syms_filename.empty() && write_syms_filename.empty()) - keep_symbols = true; - - // Actually compile LM. - KALDI_ASSERT(symbols != NULL); - ArpaLmCompiler lm_compiler(options, disambig_symbol_id, symbols); - { - Input ki(arpa_rxfilename); - lm_compiler.Read(ki.Stream()); - } - - // Sort the FST in-place if requested by options. - if (ilabel_sort) { - fst::ArcSort(lm_compiler.MutableFst(), fst::StdILabelCompare()); - } - - // Write symbols if requested. - if (!write_syms_filename.empty()) { - kaldi::Output kosym(write_syms_filename, false); - symbols->WriteText(kosym.Stream()); - } - - // Write LM FST. - bool write_binary = true, write_header = false; - kaldi::Output kofst(fst_wxfilename, write_binary, write_header); - fst::FstWriteOptions wopts(PrintableWxfilename(fst_wxfilename)); - wopts.write_isymbols = wopts.write_osymbols = keep_symbols; - lm_compiler.Fst().Write(kofst.Stream(), wopts); - - delete symbols; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/basic-filebuf.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/basic-filebuf.h deleted file mode 100644 index 22ec891064d5955c8b1d255e0d34781a9f505a38..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/basic-filebuf.h +++ /dev/null @@ -1,952 +0,0 @@ -/////////////////////////////////////////////////////////////////////////////// -// This is a modified version of the std::basic_filebuf from libc++ -// Copyright 20XX LLVM -// (http://libcxx.llvm.org/). -// It allows one to create basic_filebuf from an existing FILE* handle or file -// descriptor. -// -// This file is dual licensed under the MIT and the University of Illinois Open -// Source License licenses. See LICENSE.TXT for details (included at the -// bottom). 
-/////////////////////////////////////////////////////////////////////////////// -#ifndef KALDI_UTIL_BASIC_FILEBUF_H_ -#define KALDI_UTIL_BASIC_FILEBUF_H_ - -/////////////////////////////////////////////////////////////////////////////// -#include -#include -#include -#include -#include -#include -#include - -/////////////////////////////////////////////////////////////////////////////// -namespace kaldi { -/////////////////////////////////////////////////////////////////////////////// -template > -class basic_filebuf : public std::basic_streambuf { - public: - typedef CharT char_type; - typedef Traits traits_type; - typedef typename traits_type::int_type int_type; - typedef typename traits_type::pos_type pos_type; - typedef typename traits_type::off_type off_type; - typedef typename traits_type::state_type state_type; - - basic_filebuf(); - basic_filebuf(basic_filebuf&& rhs); - virtual ~basic_filebuf(); - - basic_filebuf& operator=(basic_filebuf&& rhs); - void swap(basic_filebuf& rhs); - - bool is_open() const; - basic_filebuf* open(const char* s, std::ios_base::openmode mode); - basic_filebuf* open(const std::string& s, std::ios_base::openmode mode); - basic_filebuf* open(int fd, std::ios_base::openmode mode); - basic_filebuf* open(FILE* f, std::ios_base::openmode mode); - basic_filebuf* close(); - - FILE* file() { return this->_M_file; } - int fd() { return fileno(this->_M_file); } - - protected: - int_type underflow() override; - int_type pbackfail(int_type c = traits_type::eof()) override; - int_type overflow(int_type c = traits_type::eof()) override; - std::basic_streambuf* setbuf( - char_type* s, std::streamsize n) override; - pos_type seekoff(off_type off, std::ios_base::seekdir way, - std::ios_base::openmode wch = std::ios_base::in | - std::ios_base::out) override; - pos_type seekpos(pos_type sp, - std::ios_base::openmode wch = std::ios_base::in | - std::ios_base::out) override; - int sync() override; - void imbue(const std::locale& loc) override; - - protected: - char* _M_extbuf; - const char* _M_extbufnext; - const char* _M_extbufend; - char _M_extbuf_min[8]; - size_t _M_ebs; - char_type* _M_intbuf; - size_t _M_ibs; - FILE* _M_file; - const std::codecvt* _M_cv; - state_type _M_st; - state_type _M_st_last; - std::ios_base::openmode _M_om; - std::ios_base::openmode _M_cm; - bool _M_owns_eb; - bool _M_owns_ib; - bool _M_always_noconv; - - const char* _M_get_mode(std::ios_base::openmode mode); - bool _M_read_mode(); - void _M_write_mode(); -}; - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf::basic_filebuf() - : _M_extbuf(nullptr), - _M_extbufnext(nullptr), - _M_extbufend(nullptr), - _M_ebs(0), - _M_intbuf(nullptr), - _M_ibs(0), - _M_file(nullptr), - _M_cv(nullptr), - _M_st(), - _M_st_last(), - _M_om(std::ios_base::openmode(0)), - _M_cm(std::ios_base::openmode(0)), - _M_owns_eb(false), - _M_owns_ib(false), - _M_always_noconv(false) { - if (std::has_facet >( - this->getloc())) { - _M_cv = &std::use_facet >( - this->getloc()); - _M_always_noconv = _M_cv->always_noconv(); - } - setbuf(0, 4096); -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf::basic_filebuf(basic_filebuf&& rhs) - : std::basic_streambuf(rhs) { - if (rhs._M_extbuf == rhs._M_extbuf_min) { - _M_extbuf = _M_extbuf_min; - _M_extbufnext = _M_extbuf + (rhs._M_extbufnext - rhs._M_extbuf); - _M_extbufend = _M_extbuf + (rhs._M_extbufend - rhs._M_extbuf); - } else { - _M_extbuf = rhs._M_extbuf; - _M_extbufnext = 
rhs._M_extbufnext; - _M_extbufend = rhs._M_extbufend; - } - _M_ebs = rhs._M_ebs; - _M_intbuf = rhs._M_intbuf; - _M_ibs = rhs._M_ibs; - _M_file = rhs._M_file; - _M_cv = rhs._M_cv; - _M_st = rhs._M_st; - _M_st_last = rhs._M_st_last; - _M_om = rhs._M_om; - _M_cm = rhs._M_cm; - _M_owns_eb = rhs._M_owns_eb; - _M_owns_ib = rhs._M_owns_ib; - _M_always_noconv = rhs._M_always_noconv; - if (rhs.pbase()) { - if (rhs.pbase() == rhs._M_intbuf) - this->setp(_M_intbuf, _M_intbuf + (rhs.epptr() - rhs.pbase())); - else - this->setp(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + - (rhs.epptr() - rhs.pbase())); - this->pbump(rhs.pptr() - rhs.pbase()); - } else if (rhs.eback()) { - if (rhs.eback() == rhs._M_intbuf) - this->setg(_M_intbuf, _M_intbuf + (rhs.gptr() - rhs.eback()), - _M_intbuf + (rhs.egptr() - rhs.eback())); - else - this->setg( - reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + (rhs.gptr() - rhs.eback()), - reinterpret_cast(_M_extbuf) + - (rhs.egptr() - rhs.eback())); - } - rhs._M_extbuf = nullptr; - rhs._M_extbufnext = nullptr; - rhs._M_extbufend = nullptr; - rhs._M_ebs = 0; - rhs._M_intbuf = nullptr; - rhs._M_ibs = 0; - rhs._M_file = nullptr; - rhs._M_st = state_type(); - rhs._M_st_last = state_type(); - rhs._M_om = std::ios_base::openmode(0); - rhs._M_cm = std::ios_base::openmode(0); - rhs._M_owns_eb = false; - rhs._M_owns_ib = false; - rhs.setg(0, 0, 0); - rhs.setp(0, 0); -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline basic_filebuf& basic_filebuf::operator=( - basic_filebuf&& rhs) { - close(); - swap(rhs); - return *this; -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf::~basic_filebuf() { - // try - // { - // close(); - // } - // catch (...) 
- // { - // } - if (_M_owns_eb) delete[] _M_extbuf; - if (_M_owns_ib) delete[] _M_intbuf; -} - -/////////////////////////////////////////////////////////////////////////////// -template -void basic_filebuf::swap(basic_filebuf& rhs) { - std::basic_streambuf::swap(rhs); - if (_M_extbuf != _M_extbuf_min && rhs._M_extbuf != rhs._M_extbuf_min) { - std::swap(_M_extbuf, rhs._M_extbuf); - std::swap(_M_extbufnext, rhs._M_extbufnext); - std::swap(_M_extbufend, rhs._M_extbufend); - } else { - ptrdiff_t ln = _M_extbufnext - _M_extbuf; - ptrdiff_t le = _M_extbufend - _M_extbuf; - ptrdiff_t rn = rhs._M_extbufnext - rhs._M_extbuf; - ptrdiff_t re = rhs._M_extbufend - rhs._M_extbuf; - if (_M_extbuf == _M_extbuf_min && rhs._M_extbuf != rhs._M_extbuf_min) { - _M_extbuf = rhs._M_extbuf; - rhs._M_extbuf = rhs._M_extbuf_min; - } else if (_M_extbuf != _M_extbuf_min && - rhs._M_extbuf == rhs._M_extbuf_min) { - rhs._M_extbuf = _M_extbuf; - _M_extbuf = _M_extbuf_min; - } - _M_extbufnext = _M_extbuf + rn; - _M_extbufend = _M_extbuf + re; - rhs._M_extbufnext = rhs._M_extbuf + ln; - rhs._M_extbufend = rhs._M_extbuf + le; - } - std::swap(_M_ebs, rhs._M_ebs); - std::swap(_M_intbuf, rhs._M_intbuf); - std::swap(_M_ibs, rhs._M_ibs); - std::swap(_M_file, rhs._M_file); - std::swap(_M_cv, rhs._M_cv); - std::swap(_M_st, rhs._M_st); - std::swap(_M_st_last, rhs._M_st_last); - std::swap(_M_om, rhs._M_om); - std::swap(_M_cm, rhs._M_cm); - std::swap(_M_owns_eb, rhs._M_owns_eb); - std::swap(_M_owns_ib, rhs._M_owns_ib); - std::swap(_M_always_noconv, rhs._M_always_noconv); - if (this->eback() == reinterpret_cast(rhs._M_extbuf_min)) { - ptrdiff_t n = this->gptr() - this->eback(); - ptrdiff_t e = this->egptr() - this->eback(); - this->setg(reinterpret_cast(_M_extbuf_min), - reinterpret_cast(_M_extbuf_min) + n, - reinterpret_cast(_M_extbuf_min) + e); - } else if (this->pbase() == reinterpret_cast(rhs._M_extbuf_min)) { - ptrdiff_t n = this->pptr() - this->pbase(); - ptrdiff_t e = this->epptr() - this->pbase(); - this->setp(reinterpret_cast(_M_extbuf_min), - reinterpret_cast(_M_extbuf_min) + e); - this->pbump(n); - } - if (rhs.eback() == reinterpret_cast(_M_extbuf_min)) { - ptrdiff_t n = rhs.gptr() - rhs.eback(); - ptrdiff_t e = rhs.egptr() - rhs.eback(); - rhs.setg(reinterpret_cast(rhs._M_extbuf_min), - reinterpret_cast(rhs._M_extbuf_min) + n, - reinterpret_cast(rhs._M_extbuf_min) + e); - } else if (rhs.pbase() == reinterpret_cast(_M_extbuf_min)) { - ptrdiff_t n = rhs.pptr() - rhs.pbase(); - ptrdiff_t e = rhs.epptr() - rhs.pbase(); - rhs.setp(reinterpret_cast(rhs._M_extbuf_min), - reinterpret_cast(rhs._M_extbuf_min) + e); - rhs.pbump(n); - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline void swap(basic_filebuf& x, - basic_filebuf& y) { - x.swap(y); -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline bool basic_filebuf::is_open() const { - return _M_file != nullptr; -} - -/////////////////////////////////////////////////////////////////////////////// -template -const char* basic_filebuf::_M_get_mode( - std::ios_base::openmode mode) { - switch ((mode & ~std::ios_base::ate) | 0) { - case std::ios_base::out: - case std::ios_base::out | std::ios_base::trunc: - return "w"; - case std::ios_base::out | std::ios_base::app: - case std::ios_base::app: - return "a"; - break; - case std::ios_base::in: - return "r"; - case std::ios_base::in | std::ios_base::out: - return "r+"; - case std::ios_base::in | std::ios_base::out | 
std::ios_base::trunc: - return "w+"; - case std::ios_base::in | std::ios_base::out | std::ios_base::app: - case std::ios_base::in | std::ios_base::app: - return "a+"; - case std::ios_base::out | std::ios_base::binary: - case std::ios_base::out | std::ios_base::trunc | std::ios_base::binary: - return "wb"; - case std::ios_base::out | std::ios_base::app | std::ios_base::binary: - case std::ios_base::app | std::ios_base::binary: - return "ab"; - case std::ios_base::in | std::ios_base::binary: - return "rb"; - case std::ios_base::in | std::ios_base::out | std::ios_base::binary: - return "r+b"; - case std::ios_base::in | std::ios_base::out | std::ios_base::trunc | - std::ios_base::binary: - return "w+b"; - case std::ios_base::in | std::ios_base::out | std::ios_base::app | - std::ios_base::binary: - case std::ios_base::in | std::ios_base::app | std::ios_base::binary: - return "a+b"; - default: - return nullptr; - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::open( - const char* s, std::ios_base::openmode mode) { - basic_filebuf* rt = nullptr; - if (_M_file == nullptr) { - const char* md = _M_get_mode(mode); - if (md) { - _M_file = fopen(s, md); - if (_M_file) { - rt = this; - _M_om = mode; - if (mode & std::ios_base::ate) { - if (fseek(_M_file, 0, SEEK_END)) { - fclose(_M_file); - _M_file = nullptr; - rt = nullptr; - } - } - } - } - } - return rt; -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline basic_filebuf* basic_filebuf::open( - const std::string& s, std::ios_base::openmode mode) { - return open(s.c_str(), mode); -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::open( - int fd, std::ios_base::openmode mode) { - const char* md = this->_M_get_mode(mode); - if (md) { - this->_M_file = fdopen(fd, md); - this->_M_om = mode; - return this; - } else { - return nullptr; - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::open( - FILE* f, std::ios_base::openmode mode) { - this->_M_file = f; - this->_M_om = mode; - return this; -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::close() { - basic_filebuf* rt = nullptr; - if (_M_file) { - rt = this; - std::unique_ptr h(_M_file, fclose); - if (sync()) rt = nullptr; - if (fclose(h.release()) == 0) - _M_file = nullptr; - else - rt = nullptr; - } - return rt; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::int_type -basic_filebuf::underflow() { - if (_M_file == nullptr) return traits_type::eof(); - bool initial = _M_read_mode(); - char_type buf; - if (this->gptr() == nullptr) this->setg(&buf, &buf + 1, &buf + 1); - const size_t unget_sz = - initial ? 
0 : std::min((this->egptr() - this->eback()) / 2, 4); - int_type c = traits_type::eof(); - if (this->gptr() == this->egptr()) { - memmove(this->eback(), this->egptr() - unget_sz, - unget_sz * sizeof(char_type)); - if (_M_always_noconv) { - size_t nmemb = - static_cast(this->egptr() - this->eback() - unget_sz); - nmemb = fread(this->eback() + unget_sz, 1, nmemb, _M_file); - if (nmemb != 0) { - this->setg(this->eback(), this->eback() + unget_sz, - this->eback() + unget_sz + nmemb); - c = traits_type::to_int_type(*this->gptr()); - } - } else { - memmove(_M_extbuf, _M_extbufnext, _M_extbufend - _M_extbufnext); - _M_extbufnext = _M_extbuf + (_M_extbufend - _M_extbufnext); - _M_extbufend = - _M_extbuf + - (_M_extbuf == _M_extbuf_min ? sizeof(_M_extbuf_min) : _M_ebs); - size_t nmemb = - std::min(static_cast(_M_ibs - unget_sz), - static_cast(_M_extbufend - _M_extbufnext)); - std::codecvt_base::result r; - _M_st_last = _M_st; - size_t nr = - fread(reinterpret_cast(const_cast(_M_extbufnext)), - 1, nmemb, _M_file); - if (nr != 0) { - if (!_M_cv) throw std::bad_cast(); - _M_extbufend = _M_extbufnext + nr; - char_type* inext; - r = _M_cv->in(_M_st, _M_extbuf, _M_extbufend, _M_extbufnext, - this->eback() + unget_sz, this->eback() + _M_ibs, inext); - if (r == std::codecvt_base::noconv) { - this->setg(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf), - const_cast(_M_extbufend)); - c = traits_type::to_int_type(*this->gptr()); - } else if (inext != this->eback() + unget_sz) { - this->setg(this->eback(), this->eback() + unget_sz, inext); - c = traits_type::to_int_type(*this->gptr()); - } - } - } - } else { - c = traits_type::to_int_type(*this->gptr()); - } - if (this->eback() == &buf) this->setg(0, 0, 0); - return c; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::int_type -basic_filebuf::pbackfail(int_type c) { - if (_M_file && this->eback() < this->gptr()) { - if (traits_type::eq_int_type(c, traits_type::eof())) { - this->gbump(-1); - return traits_type::not_eof(c); - } - if ((_M_om & std::ios_base::out) || - traits_type::eq(traits_type::to_char_type(c), this->gptr()[-1])) { - this->gbump(-1); - *this->gptr() = traits_type::to_char_type(c); - return c; - } - } - return traits_type::eof(); -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::int_type -basic_filebuf::overflow(int_type c) { - if (_M_file == nullptr) return traits_type::eof(); - _M_write_mode(); - char_type buf; - char_type* pb_save = this->pbase(); - char_type* epb_save = this->epptr(); - if (!traits_type::eq_int_type(c, traits_type::eof())) { - if (this->pptr() == nullptr) this->setp(&buf, &buf + 1); - *this->pptr() = traits_type::to_char_type(c); - this->pbump(1); - } - if (this->pptr() != this->pbase()) { - if (_M_always_noconv) { - size_t nmemb = static_cast(this->pptr() - this->pbase()); - if (fwrite(this->pbase(), sizeof(char_type), nmemb, _M_file) != nmemb) - return traits_type::eof(); - } else { - char* extbe = _M_extbuf; - std::codecvt_base::result r; - do { - if (!_M_cv) throw std::bad_cast(); - const char_type* e; - r = _M_cv->out(_M_st, this->pbase(), this->pptr(), e, _M_extbuf, - _M_extbuf + _M_ebs, extbe); - if (e == this->pbase()) return traits_type::eof(); - if (r == std::codecvt_base::noconv) { - size_t nmemb = static_cast(this->pptr() - this->pbase()); - if (fwrite(this->pbase(), 1, nmemb, _M_file) != nmemb) - return traits_type::eof(); - } else if (r == std::codecvt_base::ok 
|| - r == std::codecvt_base::partial) { - size_t nmemb = static_cast(extbe - _M_extbuf); - if (fwrite(_M_extbuf, 1, nmemb, _M_file) != nmemb) - return traits_type::eof(); - if (r == std::codecvt_base::partial) { - this->setp(const_cast(e), this->pptr()); - this->pbump(this->epptr() - this->pbase()); - } - } else { - return traits_type::eof(); - } - } while (r == std::codecvt_base::partial); - } - this->setp(pb_save, epb_save); - } - return traits_type::not_eof(c); -} - -/////////////////////////////////////////////////////////////////////////////// -template -std::basic_streambuf* basic_filebuf::setbuf( - char_type* s, std::streamsize n) { - this->setg(0, 0, 0); - this->setp(0, 0); - if (_M_owns_eb) delete[] _M_extbuf; - if (_M_owns_ib) delete[] _M_intbuf; - _M_ebs = n; - if (_M_ebs > sizeof(_M_extbuf_min)) { - if (_M_always_noconv && s) { - _M_extbuf = reinterpret_cast(s); - _M_owns_eb = false; - } else { - _M_extbuf = new char[_M_ebs]; - _M_owns_eb = true; - } - } else { - _M_extbuf = _M_extbuf_min; - _M_ebs = sizeof(_M_extbuf_min); - _M_owns_eb = false; - } - if (!_M_always_noconv) { - _M_ibs = std::max(n, sizeof(_M_extbuf_min)); - if (s && _M_ibs >= sizeof(_M_extbuf_min)) { - _M_intbuf = s; - _M_owns_ib = false; - } else { - _M_intbuf = new char_type[_M_ibs]; - _M_owns_ib = true; - } - } else { - _M_ibs = 0; - _M_intbuf = 0; - _M_owns_ib = false; - } - return this; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::pos_type -basic_filebuf::seekoff(off_type off, std::ios_base::seekdir way, - std::ios_base::openmode) { - if (!_M_cv) throw std::bad_cast(); - int width = _M_cv->encoding(); - if (_M_file == nullptr || (width <= 0 && off != 0) || sync()) - return pos_type(off_type(-1)); - // width > 0 || off == 0 - int whence; - switch (way) { - case std::ios_base::beg: - whence = SEEK_SET; - break; - case std::ios_base::cur: - whence = SEEK_CUR; - break; - case std::ios_base::end: - whence = SEEK_END; - break; - default: - return pos_type(off_type(-1)); - } -#if _WIN32 - if (fseek(_M_file, width > 0 ? width * off : 0, whence)) - return pos_type(off_type(-1)); - pos_type r = ftell(_M_file); -#else - if (fseeko(_M_file, width > 0 ? 
width * off : 0, whence)) - return pos_type(off_type(-1)); - pos_type r = ftello(_M_file); -#endif - r.state(_M_st); - return r; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::pos_type -basic_filebuf::seekpos(pos_type sp, std::ios_base::openmode) { - if (_M_file == nullptr || sync()) return pos_type(off_type(-1)); -#if _WIN32 - if (fseek(_M_file, sp, SEEK_SET)) return pos_type(off_type(-1)); -#else - if (fseeko(_M_file, sp, SEEK_SET)) return pos_type(off_type(-1)); -#endif - _M_st = sp.state(); - return sp; -} - -/////////////////////////////////////////////////////////////////////////////// -template -int basic_filebuf::sync() { - if (_M_file == nullptr) return 0; - if (!_M_cv) throw std::bad_cast(); - if (_M_cm & std::ios_base::out) { - if (this->pptr() != this->pbase()) - if (overflow() == traits_type::eof()) return -1; - std::codecvt_base::result r; - do { - char* extbe; - r = _M_cv->unshift(_M_st, _M_extbuf, _M_extbuf + _M_ebs, extbe); - size_t nmemb = static_cast(extbe - _M_extbuf); - if (fwrite(_M_extbuf, 1, nmemb, _M_file) != nmemb) return -1; - } while (r == std::codecvt_base::partial); - if (r == std::codecvt_base::error) return -1; - if (fflush(_M_file)) return -1; - } else if (_M_cm & std::ios_base::in) { - off_type c; - state_type state = _M_st_last; - bool update_st = false; - if (_M_always_noconv) { - c = this->egptr() - this->gptr(); - } else { - int width = _M_cv->encoding(); - c = _M_extbufend - _M_extbufnext; - if (width > 0) { - c += width * (this->egptr() - this->gptr()); - } else { - if (this->gptr() != this->egptr()) { - const int off = _M_cv->length(state, _M_extbuf, _M_extbufnext, - this->gptr() - this->eback()); - c += _M_extbufnext - _M_extbuf - off; - update_st = true; - } - } - } -#if _WIN32 - if (fseek(_M_file_, -c, SEEK_CUR)) return -1; -#else - if (fseeko(_M_file, -c, SEEK_CUR)) return -1; -#endif - if (update_st) _M_st = state; - _M_extbufnext = _M_extbufend = _M_extbuf; - this->setg(0, 0, 0); - _M_cm = std::ios_base::openmode(0); - } - return 0; -} - -/////////////////////////////////////////////////////////////////////////////// -template -void basic_filebuf::imbue(const std::locale& loc) { - sync(); - _M_cv = &std::use_facet >(loc); - bool old_anc = _M_always_noconv; - _M_always_noconv = _M_cv->always_noconv(); - if (old_anc != _M_always_noconv) { - this->setg(0, 0, 0); - this->setp(0, 0); - // invariant, char_type is char, else we couldn't get here - // need to dump _M_intbuf - if (_M_always_noconv) { - if (_M_owns_eb) delete[] _M_extbuf; - _M_owns_eb = _M_owns_ib; - _M_ebs = _M_ibs; - _M_extbuf = reinterpret_cast(_M_intbuf); - _M_ibs = 0; - _M_intbuf = nullptr; - _M_owns_ib = false; - } else { // need to obtain an _M_intbuf. 
- // If _M_extbuf is user-supplied, use it, else new _M_intbuf - if (!_M_owns_eb && _M_extbuf != _M_extbuf_min) { - _M_ibs = _M_ebs; - _M_intbuf = reinterpret_cast(_M_extbuf); - _M_owns_ib = false; - _M_extbuf = new char[_M_ebs]; - _M_owns_eb = true; - } else { - _M_ibs = _M_ebs; - _M_intbuf = new char_type[_M_ibs]; - _M_owns_ib = true; - } - } - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -bool basic_filebuf::_M_read_mode() { - if (!(_M_cm & std::ios_base::in)) { - this->setp(0, 0); - if (_M_always_noconv) - this->setg(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + _M_ebs, - reinterpret_cast(_M_extbuf) + _M_ebs); - else - this->setg(_M_intbuf, _M_intbuf + _M_ibs, _M_intbuf + _M_ibs); - _M_cm = std::ios_base::in; - return true; - } - return false; -} - -/////////////////////////////////////////////////////////////////////////////// -template -void basic_filebuf::_M_write_mode() { - if (!(_M_cm & std::ios_base::out)) { - this->setg(0, 0, 0); - if (_M_ebs > sizeof(_M_extbuf_min)) { - if (_M_always_noconv) - this->setp(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + (_M_ebs - 1)); - else - this->setp(_M_intbuf, _M_intbuf + (_M_ibs - 1)); - } else { - this->setp(0, 0); - } - _M_cm = std::ios_base::out; - } -} - -/////////////////////////////////////////////////////////////////////////////// -} // namespace kaldi - -/////////////////////////////////////////////////////////////////////////////// -#endif // KALDI_UTIL_BASIC_FILEBUF_H_ - -/////////////////////////////////////////////////////////////////////////////// - -/* - * ============================================================================ - * libc++ License - * ============================================================================ - * - * The libc++ library is dual licensed under both the University of Illinois - * "BSD-Like" license and the MIT license. As a user of this code you may - * choose to use it under either license. As a contributor, you agree to allow - * your code to be used under both. - * - * Full text of the relevant licenses is included below. - * - * ============================================================================ - * - * University of Illinois/NCSA - * Open Source License - * - * Copyright (c) 2009-2014 by the contributors listed in CREDITS.TXT (included - * below) - * - * All rights reserved. - * - * Developed by: - * - * LLVM Team - * - * University of Illinois at Urbana-Champaign - * - * http://llvm.org - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * with the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in the - * documentation and/or other materials provided with the distribution. 
- * - * * Neither the names of the LLVM Team, University of Illinois at - * Urbana-Champaign, nor the names of its contributors may be used to - * endorse or promote products derived from this Software without specific - * prior written permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH - * THE SOFTWARE. - * - * ============================================================================== - * - * Copyright (c) 2009-2014 by the contributors listed in CREDITS.TXT (included - * below) - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - * ============================================================================== - * - * This file is a partial list of people who have contributed to the LLVM/libc++ - * project. If you have contributed a patch or made some other contribution to - * LLVM/libc++, please submit a patch to this file to add yourself, and it will - * be done! - * - * The list is sorted by surname and formatted to allow easy grepping and - * beautification by scripts. The fields are: name (N), email (E), web-address - * (W), PGP key ID and fingerprint (P), description (D), and snail-mail address - * (S). - * - * N: Saleem Abdulrasool - * E: compnerd@compnerd.org - * D: Minor patches and Linux fixes. - * - * N: Dimitry Andric - * E: dimitry@andric.com - * D: Visibility fixes, minor FreeBSD portability patches. - * - * N: Holger Arnold - * E: holgerar@gmail.com - * D: Minor fix. - * - * N: Ruben Van Boxem - * E: vanboxem dot ruben at gmail dot com - * D: Initial Windows patches. - * - * N: David Chisnall - * E: theraven at theravensnest dot org - * D: FreeBSD and Solaris ports, libcxxrt support, some atomics work. - * - * N: Marshall Clow - * E: mclow.lists@gmail.com - * E: marshall@idio.com - * D: C++14 support, patches and bug fixes. - * - * N: Bill Fisher - * E: william.w.fisher@gmail.com - * D: Regex bug fixes. - * - * N: Matthew Dempsky - * E: matthew@dempsky.org - * D: Minor patches and bug fixes. - * - * N: Google Inc. 
- * D: Copyright owner and contributor of the CityHash algorithm - * - * N: Howard Hinnant - * E: hhinnant@apple.com - * D: Architect and primary author of libc++ - * - * N: Hyeon-bin Jeong - * E: tuhertz@gmail.com - * D: Minor patches and bug fixes. - * - * N: Argyrios Kyrtzidis - * E: kyrtzidis@apple.com - * D: Bug fixes. - * - * N: Bruce Mitchener, Jr. - * E: bruce.mitchener@gmail.com - * D: Emscripten-related changes. - * - * N: Michel Morin - * E: mimomorin@gmail.com - * D: Minor patches to is_convertible. - * - * N: Andrew Morrow - * E: andrew.c.morrow@gmail.com - * D: Minor patches and Linux fixes. - * - * N: Arvid Picciani - * E: aep at exys dot org - * D: Minor patches and musl port. - * - * N: Bjorn Reese - * E: breese@users.sourceforge.net - * D: Initial regex prototype - * - * N: Nico Rieck - * E: nico.rieck@gmail.com - * D: Windows fixes - * - * N: Jonathan Sauer - * D: Minor patches, mostly related to constexpr - * - * N: Craig Silverstein - * E: csilvers@google.com - * D: Implemented Cityhash as the string hash function on 64-bit machines - * - * N: Richard Smith - * D: Minor patches. - * - * N: Joerg Sonnenberger - * E: joerg@NetBSD.org - * D: NetBSD port. - * - * N: Stephan Tolksdorf - * E: st@quanttec.com - * D: Minor fix - * - * N: Michael van der Westhuizen - * E: r1mikey at gmail dot com - * - * N: Klaas de Vries - * E: klaas at klaasgaaf dot nl - * D: Minor bug fix. - * - * N: Zhang Xiongpang - * E: zhangxiongpang@gmail.com - * D: Minor patches and bug fixes. - * - * N: Xing Xue - * E: xingxue@ca.ibm.com - * D: AIX port - * - * N: Zhihao Yuan - * E: lichray@gmail.com - * D: Standard compatibility fixes. - * - * N: Jeffrey Yasskin - * E: jyasskin@gmail.com - * E: jyasskin@google.com - * D: Linux fixes. - */ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/const-integer-set-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/const-integer-set-inl.h deleted file mode 100644 index b93846148a3e4595774507f638396ce13393ac0e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/const-integer-set-inl.h +++ /dev/null @@ -1,87 +0,0 @@ -// util/const-integer-set-inl.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_CONST_INTEGER_SET_INL_H_ -#define KALDI_UTIL_CONST_INTEGER_SET_INL_H_ - -// Do not include this file directly. It is included by const-integer-set.h - -namespace kaldi { - -template -void ConstIntegerSet::InitInternal() { - KALDI_ASSERT_IS_INTEGER_TYPE(I); - quick_set_.clear(); // just in case we previously had data. 
- if (slow_set_.size() == 0) { - lowest_member_ = (I)1; - highest_member_ = (I)0; - contiguous_ = false; - quick_ = false; - } else { - lowest_member_ = slow_set_.front(); - highest_member_ = slow_set_.back(); - size_t range = highest_member_ + 1 - lowest_member_; - if (range == slow_set_.size()) { - contiguous_ = true; - quick_ = false; - } else { - contiguous_ = false; - // If it would be more compact to store as bool - if (range < slow_set_.size() * 8 * sizeof(I)) { - // (assuming 1 bit per element)... - quick_set_.resize(range, false); - for (size_t i = 0; i < slow_set_.size(); i++) - quick_set_[slow_set_[i] - lowest_member_] = true; - quick_ = true; - } else { - quick_ = false; - } - } - } -} - -template -int ConstIntegerSet::count(I i) const { - if (i < lowest_member_ || i > highest_member_) { - return 0; - } else { - if (contiguous_) return true; - if (quick_) { - return (quick_set_[i - lowest_member_] ? 1 : 0); - } else { - bool ans = std::binary_search(slow_set_.begin(), slow_set_.end(), i); - return (ans ? 1 : 0); - } - } -} - -template -void ConstIntegerSet::Write(std::ostream &os, bool binary) const { - WriteIntegerVector(os, binary, slow_set_); -} - -template -void ConstIntegerSet::Read(std::istream &is, bool binary) { - ReadIntegerVector(is, binary, &slow_set_); - InitInternal(); -} - -} // end namespace kaldi - -#endif // KALDI_UTIL_CONST_INTEGER_SET_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/const-integer-set.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/const-integer-set.h deleted file mode 100644 index 809a56a7c83804bfaa4badb5e28059734bfcad1e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/const-integer-set.h +++ /dev/null @@ -1,96 +0,0 @@ -// util/const-integer-set.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_CONST_INTEGER_SET_H_ -#define KALDI_UTIL_CONST_INTEGER_SET_H_ -#include -#include -#include -#include -#include -#include "util/stl-utils.h" - -/* ConstIntegerSet is a way to efficiently test whether something is in a - supplied set of integers. It can be initialized from a vector or set, but - never changed after that. It either uses a sorted vector or an array of - bool, depending on the input. It behaves like a const version of an STL set, - with only a subset of the functionality, except all the member functions are - upper-case. - - Note that we could get rid of the member slow_set_, but we'd have to - do more work to implement an iterator type. This would save memory. 
-*/ - -namespace kaldi { - -template -class ConstIntegerSet { - public: - ConstIntegerSet() : lowest_member_(1), highest_member_(0) {} - - void Init(const std::vector &input) { - slow_set_ = input; - SortAndUniq(&slow_set_); - InitInternal(); - } - - void Init(const std::set &input) { - CopySetToVector(input, &slow_set_); - InitInternal(); - } - - explicit ConstIntegerSet(const std::vector &input) : slow_set_(input) { - SortAndUniq(&slow_set_); - InitInternal(); - } - explicit ConstIntegerSet(const std::set &input) { - CopySetToVector(input, &slow_set_); - InitInternal(); - } - explicit ConstIntegerSet(const ConstIntegerSet &other) - : slow_set_(other.slow_set_) { - InitInternal(); - } - - int count(I i) const; // returns 1 or 0. - - typedef typename std::vector::const_iterator iterator; - iterator begin() const { return slow_set_.begin(); } - iterator end() const { return slow_set_.end(); } - size_t size() const { return slow_set_.size(); } - bool empty() const { return slow_set_.empty(); } - - void Write(std::ostream &os, bool binary) const; - void Read(std::istream &is, bool binary); - - private: - I lowest_member_; - I highest_member_; - bool contiguous_; - bool quick_; - std::vector quick_set_; - std::vector slow_set_; - void InitInternal(); -}; - -} // end namespace kaldi - -#include "util/const-integer-set-inl.h" - -#endif // KALDI_UTIL_CONST_INTEGER_SET_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/hash-list-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/hash-list-inl.h deleted file mode 100644 index 063fa7131ec618f0aae9dc30f4edd26c9dcce7fe..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/hash-list-inl.h +++ /dev/null @@ -1,193 +0,0 @@ -// util/hash-list-inl.h - -// Copyright 2009-2011 Microsoft Corporation -// 2013 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_HASH_LIST_INL_H_ -#define KALDI_UTIL_HASH_LIST_INL_H_ - -// Do not include this file directly. It is included by fast-hash.h - -namespace kaldi { - -template -HashList::HashList() { - list_head_ = NULL; - bucket_list_tail_ = static_cast(-1); // invalid. - hash_size_ = 0; - freed_head_ = NULL; -} - -template -void HashList::SetSize(size_t size) { - hash_size_ = size; - KALDI_ASSERT(list_head_ == NULL && - bucket_list_tail_ == - static_cast(-1)); // make sure empty. - if (size > buckets_.size()) buckets_.resize(size, HashBucket(0, NULL)); -} - -template -typename HashList::Elem *HashList::Clear() { - // Clears the hashtable and gives ownership of the currently contained list - // to the user. 
- for (size_t cur_bucket = bucket_list_tail_; - cur_bucket != static_cast(-1); - cur_bucket = buckets_[cur_bucket].prev_bucket) { - buckets_[cur_bucket].last_elem = NULL; // this is how we indicate "empty". - } - bucket_list_tail_ = static_cast(-1); - Elem *ans = list_head_; - list_head_ = NULL; - return ans; -} - -template -const typename HashList::Elem *HashList::GetList() const { - return list_head_; -} - -template -inline void HashList::Delete(Elem *e) { - e->tail = freed_head_; - freed_head_ = e; -} - -template -inline typename HashList::Elem *HashList::Find(I key) { - size_t index = (static_cast(key) % hash_size_); - HashBucket &bucket = buckets_[index]; - if (bucket.last_elem == NULL) { - return NULL; // empty bucket. - } else { - Elem *head = (bucket.prev_bucket == static_cast(-1) - ? list_head_ - : buckets_[bucket.prev_bucket].last_elem->tail), - *tail = bucket.last_elem->tail; - for (Elem *e = head; e != tail; e = e->tail) - if (e->key == key) return e; - return NULL; // Not found. - } -} - -template -inline typename HashList::Elem *HashList::New() { - if (freed_head_) { - Elem *ans = freed_head_; - freed_head_ = freed_head_->tail; - return ans; - } else { - Elem *tmp = new Elem[allocate_block_size_]; - for (size_t i = 0; i + 1 < allocate_block_size_; i++) - tmp[i].tail = tmp + i + 1; - tmp[allocate_block_size_ - 1].tail = NULL; - freed_head_ = tmp; - allocated_.push_back(tmp); - return this->New(); - } -} - -template -HashList::~HashList() { - // First test whether we had any memory leak within the - // HashList, i.e. things for which the user did not call Delete(). - size_t num_in_list = 0, num_allocated = 0; - for (Elem *e = freed_head_; e != NULL; e = e->tail) num_in_list++; - for (size_t i = 0; i < allocated_.size(); i++) { - num_allocated += allocate_block_size_; - delete[] allocated_[i]; - } - if (num_in_list != num_allocated) { - KALDI_WARN << "Possible memory leak: " << num_in_list - << " != " << num_allocated - << ": you might have forgotten to call Delete on " - << "some Elems"; - } -} - -template -inline typename HashList::Elem *HashList::Insert(I key, T val) { - size_t index = (static_cast(key) % hash_size_); - HashBucket &bucket = buckets_[index]; - // Check the element is existing or not. - if (bucket.last_elem != NULL) { - Elem *head = (bucket.prev_bucket == static_cast(-1) - ? list_head_ - : buckets_[bucket.prev_bucket].last_elem->tail), - *tail = bucket.last_elem->tail; - for (Elem *e = head; e != tail; e = e->tail) - if (e->key == key) return e; - } - - // This is a new element. Insert it. - Elem *elem = New(); - elem->key = key; - elem->val = val; - if (bucket.last_elem == NULL) { // Unoccupied bucket. Insert at - // head of bucket list (which is tail of regular list, they go in - // opposite directions). - if (bucket_list_tail_ == static_cast(-1)) { - // list was empty so this is the first elem. - KALDI_ASSERT(list_head_ == NULL); - list_head_ = elem; - } else { - // link in to the chain of Elems - buckets_[bucket_list_tail_].last_elem->tail = elem; - } - elem->tail = NULL; - bucket.last_elem = elem; - bucket.prev_bucket = bucket_list_tail_; - bucket_list_tail_ = index; - } else { - // Already-occupied bucket. Insert at tail of list of elements within - // the bucket. 
- elem->tail = bucket.last_elem->tail; - bucket.last_elem->tail = elem; - bucket.last_elem = elem; - } - return elem; -} - -template -void HashList::InsertMore(I key, T val) { - size_t index = (static_cast(key) % hash_size_); - HashBucket &bucket = buckets_[index]; - Elem *elem = New(); - elem->key = key; - elem->val = val; - - KALDI_ASSERT(bucket.last_elem != NULL); // assume one element is already here - if (bucket.last_elem->key == key) { // standard behavior: add as last element - elem->tail = bucket.last_elem->tail; - bucket.last_elem->tail = elem; - bucket.last_elem = elem; - return; - } - Elem *e = (bucket.prev_bucket == static_cast(-1) - ? list_head_ - : buckets_[bucket.prev_bucket].last_elem->tail); - // find place to insert in linked list - while (e != bucket.last_elem->tail && e->key != key) e = e->tail; - KALDI_ASSERT(e->key == key); // not found? - should not happen - elem->tail = e->tail; - e->tail = elem; -} - -} // end namespace kaldi - -#endif // KALDI_UTIL_HASH_LIST_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/hash-list.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/hash-list.h deleted file mode 100644 index 31cc9bdc4870773475f8c5139539e320746bf5fe..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/hash-list.h +++ /dev/null @@ -1,146 +0,0 @@ -// util/hash-list.h - -// Copyright 2009-2011 Microsoft Corporation -// 2013 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_HASH_LIST_H_ -#define KALDI_UTIL_HASH_LIST_H_ - -#include -#include -#include -#include -#include - -#include "base/kaldi-error.h" - -/* This header provides utilities for a structure that's used in a decoder (but - is quite generic in nature so we implement and test it separately). - Basically it's a singly-linked list, but implemented in such a way that we - can quickly search for elements in the list. We give it a slightly richer - interface than just a hash and a list. The idea is that we want to separate - the hash part and the list part: basically, in the decoder, we want to have a - single hash for the current frame and the next frame, because by the time we - need to access the hash for the next frame we no longer need the hash for the - previous frame. So we have an operation that clears the hash but leaves the - list structure intact. We also control memory management inside this object, - to avoid repeated new's/deletes. - - See hash-list-test.cc for an example of how to use this object. -*/ - -namespace kaldi { - -template -class HashList { - public: - struct Elem { - I key; - T val; - Elem *tail; - }; - - /// Constructor takes no arguments. 
- /// Call SetSize to inform it of the likely size. - HashList(); - - /// Clears the hash and gives the head of the current list to the user; - /// ownership is transferred to the user (the user must call Delete() - /// for each element in the list, at his/her leisure). - Elem *Clear(); - - /// Gives the head of the current list to the user. Ownership retained in the - /// class. Caution: in December 2013 the return type was changed to const - /// Elem* and this function was made const. You may need to change some types - /// of local Elem* variables to const if this produces compilation errors. - const Elem *GetList() const; - - /// Think of this like delete(). It is to be called for each Elem in turn - /// after you "obtained ownership" by doing Clear(). This is not the opposite - /// of. Insert, it is the opposite of New. It's really a memory operation. - inline void Delete(Elem *e); - - /// This should probably not be needed to be called directly by the user. - /// Think of it as opposite - /// to Delete(); - inline Elem *New(); - - /// Find tries to find this element in the current list using the hashtable. - /// It returns NULL if not present. The Elem it returns is not owned by the - /// user, it is part of the internal list owned by this object, but the user - /// is free to modify the "val" element. - inline Elem *Find(I key); - - /// Insert inserts a new element into the hashtable/stored list. - /// Because element keys in a hashtable are unique, this operation checks - /// whether each inserted element has a key equivalent to the one of an - /// element already in the hashtable. If so, the element is not inserted, - /// returning an pointer to this existing element. - inline Elem *Insert(I key, T val); - - /// Insert inserts another element with same key into the hashtable/ - /// stored list. - /// By calling this, the user asserts that one element with that key is - /// already present. - /// We insert it that way, that all elements with the same key - /// follow each other. - /// Find() will return the first one of the elements with the same key. - inline void InsertMore(I key, T val); - - /// SetSize tells the object how many hash buckets to allocate (should - /// typically be at least twice the number of objects we expect to go in the - /// structure, for fastest performance). It must be called while the hash - /// is empty (e.g. after Clear() or after initializing the object, but before - /// adding anything to the hash. - void SetSize(size_t sz); - - /// Returns current number of hash buckets. - inline size_t Size() { return hash_size_; } - - ~HashList(); - - private: - struct HashBucket { - size_t prev_bucket; // index to next bucket (-1 if list tail). Note: - // list of buckets goes in opposite direction to list of Elems. - Elem *last_elem; // pointer to last element in this bucket (NULL if empty) - inline HashBucket(size_t i, Elem *e) : prev_bucket(i), last_elem(e) {} - }; - - Elem *list_head_; // head of currently stored list. - size_t bucket_list_tail_; // tail of list of active hash buckets. - - size_t hash_size_; // number of hash buckets. - - std::vector buckets_; - - Elem *freed_head_; // head of list of currently freed elements. [ready for - // allocation] - - std::vector allocated_; // list of allocated blocks. - - static const size_t allocate_block_size_ = 1024; // Number of Elements to - // allocate in one block. Must be largish so storing allocated_ doesn't - // become a problem. 
-}; - -} // end namespace kaldi - -#include "util/hash-list-inl.h" - -#endif // KALDI_UTIL_HASH_LIST_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/kaldi-io-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/kaldi-io-inl.h deleted file mode 100644 index 8b0c92131c4af2113eb33da6f3cfa9dc4dee83e1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/kaldi-io-inl.h +++ /dev/null @@ -1,40 +0,0 @@ -// util/kaldi-io-inl.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. -#ifndef KALDI_UTIL_KALDI_IO_INL_H_ -#define KALDI_UTIL_KALDI_IO_INL_H_ - -#include - -namespace kaldi { - -bool Input::Open(const std::string &rxfilename, bool *binary) { - return OpenInternal(rxfilename, true, binary); -} - -bool Input::OpenTextMode(const std::string &rxfilename) { - return OpenInternal(rxfilename, false, NULL); -} - -bool Input::IsOpen() { return impl_ != NULL; } - -bool Output::IsOpen() { return impl_ != NULL; } - -} // end namespace kaldi. - -#endif // KALDI_UTIL_KALDI_IO_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/kaldi-io.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/kaldi-io.cc deleted file mode 100644 index 5f8ec4870138df32f6aca9c12383cf3885411741..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/kaldi-io.cc +++ /dev/null @@ -1,898 +0,0 @@ -// util/kaldi-io.cc - -// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
-#include "util/kaldi-io.h" - -#include -#include -#include - -#include - -#include "base/io-funcs.h" -#include "base/kaldi-math.h" -#include "util/kaldi-pipebuf.h" -#include "util/parse-options.h" -#include "util/text-utils.h" - -#ifdef KALDI_CYGWIN_COMPAT -#include "util/kaldi-cygwin-io-inl.h" -#define MapOsPath(x) MapCygwinPath(x) -#else // KALDI_CYGWIN_COMPAT -#define MapOsPath(x) x -#endif // KALDI_CYGWIN_COMPAT - -#if defined(_MSC_VER) -static FILE *popen(const char *command, const char *mode) { -#ifdef KALDI_CYGWIN_COMPAT - return kaldi::CygwinCompatPopen(command, mode); -#else // KALDI_CYGWIN_COMPAT - return _popen(command, mode); -#endif // KALDI_CYGWIN_COMPAT -} -#endif // _MSC_VER - -namespace kaldi { - -#ifndef _MSC_VER // on VS, we don't need this type. -// could replace basic_pipebuf with stdio_filebuf on some platforms. -// Would mean we could use less of our own code. -typedef basic_pipebuf PipebufType; -#endif -} // namespace kaldi - -namespace kaldi { - -std::string PrintableRxfilename(const std::string &rxfilename) { - if (rxfilename == "" || rxfilename == "-") { - return "standard input"; - } else { - // If this call to Escape later causes compilation issues, - // just replace it with "return rxfilename"; it's only a - // pretty-printing issue. - return ParseOptions::Escape(rxfilename); - } -} - -std::string PrintableWxfilename(const std::string &wxfilename) { - if (wxfilename == "" || wxfilename == "-") { - return "standard output"; - } else { - // If this call to Escape later causes compilation issues, - // just replace it with "return wxfilename"; it's only a - // pretty-printing issue. - return ParseOptions::Escape(wxfilename); - } -} - -OutputType ClassifyWxfilename(const std::string &filename) { - const char *c = filename.c_str(); - size_t length = filename.length(); - char first_char = c[0], - last_char = (length == 0 ? '\0' : c[filename.length() - 1]); - - // if 'filename' is "" or "-", return kStandardOutput. - if (length == 0 || (length == 1 && first_char == '-')) { - return kStandardOutput; - } else if (first_char == '|') { - return kPipeOutput; // An output pipe like "|blah". - } else if (isspace(first_char) || isspace(last_char) || last_char == '|') { - return kNoOutput; // Leading or trailing space: can't interpret this. - // Final '|' would represent an input pipe, not an - // output pipe. - // } else if ((first_char == 'a' || first_char == 's') && - // strchr(c, ':') != NULL && - // (ClassifyWspecifier(filename, NULL, NULL, NULL) != - // kNoWspecifier || - // ClassifyRspecifier(filename, NULL, NULL) != kNoRspecifier)) { - // // e.g. ark:something or scp:something... this is almost certainly a - // // scripting error, so call it an error rather than treating it as a - // file. - // // In practice in modern kaldi scripts all (r,w)filenames begin with - // "ark" - // // or "scp", even though technically speaking options like "b", "t", - // "s" or - // // "cs" can appear before the ark or scp, like "b,ark". For - // efficiency, - // // and because this code is really just a nicety to catch errors - // earlier - // // than they would otherwise be caught, we only call those extra - // functions - // // for filenames beginning with 'a' or 's'. - // return kNoOutput; - } else if (isdigit(last_char)) { - // This could be a file, but we have to see if it's an offset into a file - // (like foo.ark:4314328), which is not allowed for writing (but is - // allowed for reaching). 
This eliminates some things which would be - // valid UNIX filenames but are not allowed by Kaldi. (Even if we allowed - // such filenames for writing, we woudln't be able to correctly read them). - const char *d = c + length - 1; - while (isdigit(*d) && d > c) d--; - if (*d == ':') return kNoOutput; - // else it could still be a filename; continue to the next check. - } - - // At this point it matched no other pattern so we assume a filename, but we - // check for internal '|' as it's a common source of errors to have pipe - // commands without the pipe in the right place. Say that it can't be - // classified. - if (strchr(c, '|') != NULL) { - KALDI_WARN << "Trying to classify wxfilename with pipe symbol in the" - " wrong place (pipe without | at the beginning?): " - << filename; - return kNoOutput; - } - return kFileOutput; // It matched no other pattern: assume it's a filename. -} - -InputType ClassifyRxfilename(const std::string &filename) { - const char *c = filename.c_str(); - size_t length = filename.length(); - char first_char = c[0], - last_char = (length == 0 ? '\0' : c[filename.length() - 1]); - - // if 'filename' is "" or "-", return kStandardInput. - if (length == 0 || (length == 1 && first_char == '-')) { - return kStandardInput; - } else if (first_char == '|') { - return kNoInput; // An output pipe like "|blah": not - // valid for input. - } else if (last_char == '|') { - return kPipeInput; - } else if (isspace(first_char) || isspace(last_char)) { - return kNoInput; // We don't allow leading or trailing space in a filename. - // } else if ((first_char == 'a' || first_char == 's') && - // strchr(c, ':') != NULL && - // (ClassifyWspecifier(filename, NULL, NULL, NULL) != - // kNoWspecifier || - // ClassifyRspecifier(filename, NULL, NULL) != kNoRspecifier)) { - // // e.g. ark:something or scp:something... this is almost certainly a - // // scripting error, so call it an error rather than treating it as a - // file. - // // In practice in modern kaldi scripts all (r,w)filenames begin with - // "ark" - // // or "scp", even though technically speaking options like "b", "t", - // "s" or - // // "cs" can appear before the ark or scp, like "b,ark". For - // efficiency, - // // and because this code is really just a nicety to catch errors - // earlier - // // than they would otherwise be caught, we only call those extra - // functions - // // for filenames beginning with 'a' or 's'. - // return kNoInput; - } else if (isdigit(last_char)) { - const char *d = c + length - 1; - while (isdigit(*d) && d > c) d--; - if (*d == ':') - return kOffsetFileInput; // Filename is like - // some_file:12345 - // otherwise it could still be a filename; continue to the next check. - } - - // At this point it matched no other pattern so we assume a filename, but - // we check for '|' as it's a common source of errors to have pipe - // commands without the pipe in the right place. Say that it can't be - // classified in this case. - if (strchr(c, '|') != NULL) { - KALDI_WARN << "Trying to classify rxfilename with pipe symbol in the" - " wrong place (pipe without | at the end?): " - << filename; - return kNoInput; - } - return kFileInput; // It matched no other pattern: assume it's a filename. -} - -class OutputImplBase { - public: - // Open will open it as a file (no header), and return true - // on success. It cannot be called on an already open stream. 
- virtual bool Open(const std::string &filename, bool binary) = 0; - virtual std::ostream &Stream() = 0; - virtual bool Close() = 0; - virtual ~OutputImplBase() {} -}; - -class FileOutputImpl : public OutputImplBase { - public: - virtual bool Open(const std::string &filename, bool binary) { - if (os_.is_open()) - KALDI_ERR << "FileOutputImpl::Open(), " - << "open called on already open file."; - filename_ = filename; - os_.open(MapOsPath(filename_).c_str(), - binary ? std::ios_base::out | std::ios_base::binary - : std::ios_base::out); - return os_.is_open(); - } - - virtual std::ostream &Stream() { - if (!os_.is_open()) - KALDI_ERR << "FileOutputImpl::Stream(), file is not open."; - // I believe this error can only arise from coding error. - return os_; - } - - virtual bool Close() { - if (!os_.is_open()) - KALDI_ERR << "FileOutputImpl::Close(), file is not open."; - // I believe this error can only arise from coding error. - os_.close(); - return !(os_.fail()); - } - virtual ~FileOutputImpl() { - if (os_.is_open()) { - os_.close(); - if (os_.fail()) KALDI_ERR << "Error closing output file " << filename_; - } - } - - private: - std::string filename_; - std::ofstream os_; -}; - -class StandardOutputImpl : public OutputImplBase { - public: - StandardOutputImpl() : is_open_(false) {} - - virtual bool Open(const std::string &filename, bool binary) { - if (is_open_) - KALDI_ERR << "StandardOutputImpl::Open(), " - "open called on already open file."; -#ifdef _MSC_VER - _setmode(_fileno(stdout), binary ? _O_BINARY : _O_TEXT); -#endif - is_open_ = std::cout.good(); - return is_open_; - } - - virtual std::ostream &Stream() { - if (!is_open_) - KALDI_ERR << "StandardOutputImpl::Stream(), object not initialized."; - // I believe this error can only arise from coding error. - return std::cout; - } - - virtual bool Close() { - if (!is_open_) - KALDI_ERR << "StandardOutputImpl::Close(), file is not open."; - is_open_ = false; - std::cout << std::flush; - return !(std::cout.fail()); - } - virtual ~StandardOutputImpl() { - if (is_open_) { - std::cout << std::flush; - if (std::cout.fail()) KALDI_ERR << "Error writing to standard output"; - } - } - - private: - bool is_open_; -}; - -class PipeOutputImpl : public OutputImplBase { - public: - PipeOutputImpl() : f_(NULL), os_(NULL) {} - - virtual bool Open(const std::string &wxfilename, bool binary) { - filename_ = wxfilename; - KALDI_ASSERT(f_ == NULL); // Make sure closed. - KALDI_ASSERT(wxfilename.length() != 0 && wxfilename[0] == '|'); // should - // start with '|' - std::string cmd_name(wxfilename, 1); -#if defined(_MSC_VER) || defined(__CYGWIN__) - f_ = popen(cmd_name.c_str(), (binary ? "wb" : "w")); -#else - f_ = popen(cmd_name.c_str(), "w"); -#endif - if (!f_) { // Failure. - KALDI_WARN << "Failed opening pipe for writing, command is: " << cmd_name - << ", errno is " << strerror(errno); - return false; - } else { -#ifndef _MSC_VER - fb_ = new PipebufType(f_, // Using this constructor won't make the - // destructor try to close the stream when - // we're done. - (binary ? std::ios_base::out | std::ios_base::binary - : std::ios_base::out)); - KALDI_ASSERT(fb_ != NULL); // or would be alloc error. - os_ = new std::ostream(fb_); -#else - os_ = new std::ofstream(f_); -#endif - return os_->good(); - } - } - - virtual std::ostream &Stream() { - if (os_ == NULL) - KALDI_ERR << "PipeOutputImpl::Stream()," - " object not initialized."; - // I believe this error can only arise from coding error. 
- return *os_; - } - - virtual bool Close() { - if (os_ == NULL) KALDI_ERR << "PipeOutputImpl::Close(), file is not open."; - bool ok = true; - os_->flush(); - if (os_->fail()) ok = false; - delete os_; - os_ = NULL; - int status; -#ifdef _MSC_VER - status = _pclose(f_); -#else - status = pclose(f_); -#endif - if (status) - KALDI_WARN << "Pipe " << filename_ << " had nonzero return status " - << status; - f_ = NULL; -#ifndef _MSC_VER - delete fb_; - fb_ = NULL; -#endif - return ok; - } - virtual ~PipeOutputImpl() { - if (os_) { - if (!Close()) - KALDI_ERR << "Error writing to pipe " << PrintableWxfilename(filename_); - } - } - - private: - std::string filename_; - FILE *f_; -#ifndef _MSC_VER - PipebufType *fb_; -#endif - std::ostream *os_; -}; - -class InputImplBase { - public: - // Open will open it as a file, and return true on success. - // May be called twice only for kOffsetFileInput (otherwise, - // if called twice, we just create a new Input object, to avoid - // having to deal with the extra hassle of reopening with the - // same object. - // Note that we will to call Open with true (binary) for - // for text-mode Kaldi files; the only actual text-mode input - // is for non-Kaldi files. - virtual bool Open(const std::string &filename, bool binary) = 0; - virtual std::istream &Stream() = 0; - virtual int32 Close() = 0; // We only need to check failure in the case of - // kPipeInput. - // on close for input streams. - virtual InputType MyType() = 0; // Because if it's kOffsetFileInput, we may - // call Open twice - // (has efficiency benefits). - - virtual ~InputImplBase() {} -}; - -class FileInputImpl : public InputImplBase { - public: - virtual bool Open(const std::string &filename, bool binary) { - if (is_.is_open()) - KALDI_ERR << "FileInputImpl::Open(), " - << "open called on already open file."; - is_.open( - MapOsPath(filename).c_str(), - binary ? std::ios_base::in | std::ios_base::binary : std::ios_base::in); - return is_.is_open(); - } - - virtual std::istream &Stream() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Stream(), file is not open."; - // I believe this error can only arise from coding error. - return is_; - } - - virtual int32 Close() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Close(), file is not open."; - // I believe this error can only arise from coding error. - is_.close(); - // Don't check status. - return 0; - } - - virtual InputType MyType() { return kFileInput; } - - virtual ~FileInputImpl() { - // Stream will automatically be closed, and we don't care about - // whether it fails. - } - - private: - std::ifstream is_; -}; - -class StandardInputImpl : public InputImplBase { - public: - StandardInputImpl() : is_open_(false) {} - - virtual bool Open(const std::string &filename, bool binary) { - if (is_open_) - KALDI_ERR << "StandardInputImpl::Open(), " - "open called on already open file."; - is_open_ = true; -#ifdef _MSC_VER - _setmode(_fileno(stdin), binary ? _O_BINARY : _O_TEXT); -#endif - return true; // Don't check good() because would be false if - // eof, which may be valid input. - } - - virtual std::istream &Stream() { - if (!is_open_) - KALDI_ERR << "StandardInputImpl::Stream(), object not initialized."; - // I believe this error can only arise from coding error. 
- return std::cin; - } - - virtual InputType MyType() { return kStandardInput; } - - virtual int32 Close() { - if (!is_open_) KALDI_ERR << "StandardInputImpl::Close(), file is not open."; - is_open_ = false; - return 0; - } - virtual ~StandardInputImpl() {} - - private: - bool is_open_; -}; - -class PipeInputImpl : public InputImplBase { - public: - PipeInputImpl() : f_(NULL), is_(NULL) {} - - virtual bool Open(const std::string &rxfilename, bool binary) { - filename_ = rxfilename; - KALDI_ASSERT(f_ == NULL); // Make sure closed. - KALDI_ASSERT(rxfilename.length() != 0 && - rxfilename[rxfilename.length() - 1] == - '|'); // should end with '|' - std::string cmd_name(rxfilename, 0, rxfilename.length() - 1); -#if defined(_MSC_VER) || defined(__CYGWIN__) - f_ = popen(cmd_name.c_str(), (binary ? "rb" : "r")); -#else - f_ = popen(cmd_name.c_str(), "r"); -#endif - - if (!f_) { // Failure. - KALDI_WARN << "Failed opening pipe for reading, command is: " << cmd_name - << ", errno is " << strerror(errno); - return false; - } else { -#ifndef _MSC_VER - fb_ = new PipebufType(f_, // Using this constructor won't lead the - // destructor to close the stream. - (binary ? std::ios_base::in | std::ios_base::binary - : std::ios_base::in)); - KALDI_ASSERT(fb_ != NULL); // or would be alloc error. - is_ = new std::istream(fb_); -#else - is_ = new std::ifstream(f_); -#endif - if (is_->fail() || is_->bad()) return false; - if (is_->eof()) { - KALDI_WARN << "Pipe opened with command " - << PrintableRxfilename(rxfilename) << " is empty."; - // don't return false: empty may be valid. - } - return true; - } - } - - virtual std::istream &Stream() { - if (is_ == NULL) - KALDI_ERR << "PipeInputImpl::Stream(), object not initialized."; - // I believe this error can only arise from coding error. - return *is_; - } - - virtual int32 Close() { - if (is_ == NULL) KALDI_ERR << "PipeInputImpl::Close(), file is not open."; - delete is_; - is_ = NULL; - int32 status; -#ifdef _MSC_VER - status = _pclose(f_); -#else - status = pclose(f_); -#endif - if (status) - KALDI_WARN << "Pipe " << filename_ << " had nonzero return status " - << status; - f_ = NULL; -#ifndef _MSC_VER - delete fb_; - fb_ = NULL; -#endif - return status; - } - virtual ~PipeInputImpl() { - if (is_) Close(); - } - virtual InputType MyType() { return kPipeInput; } - - private: - std::string filename_; - FILE *f_; -#ifndef _MSC_VER - PipebufType *fb_; -#endif - std::istream *is_; -}; - -/* -#else - -// Just have an empty implementation of the pipe input that crashes if -// called. -class PipeInputImpl: public InputImplBase { - public: - PipeInputImpl() { KALDI_ASSERT(0 && "Pipe input not yet supported on this - platform."); } - virtual bool Open(const std::string, bool) { return 0; } - virtual std::istream &Stream() const { return NULL; } - virtual void Close() {} - virtual InputType MyType() { return kPipeInput; } -}; - -#endif -*/ - -class OffsetFileInputImpl : public InputImplBase { - // This class is a bit more complicated than the - - public: - // splits a filename like /my/file:123 into /my/file and the - // number 123. Crashes if not this format. - static void SplitFilename(const std::string &rxfilename, - std::string *filename, size_t *offset) { - size_t pos = rxfilename.find_last_of(':'); - KALDI_ASSERT(pos != std::string::npos); // would indicate error in calling - // code, as the filename is supposed to be of the correct form at this - // point. 
- *filename = std::string(rxfilename, 0, pos); - std::string number(rxfilename, pos + 1); - bool ans = ConvertStringToInteger(number, offset); - if (!ans) - KALDI_ERR << "Cannot get offset from filename " << rxfilename - << " (possibly you compiled in 32-bit and have a >32-bit" - << " byte offset into a file; you'll have to compile 64-bit."; - } - - bool Seek(size_t offset) { - size_t cur_pos = is_.tellg(); - if (cur_pos == offset) { - return true; - } else if (cur_pos < offset && cur_pos + 100 > offset) { - // We're close enough that it may be faster to just - // read that data, rather than seek. - for (size_t i = cur_pos; i < offset; i++) is_.get(); - return (is_.tellg() == std::streampos(offset)); - } - // Try to actually seek. - is_.seekg(offset, std::ios_base::beg); - if (is_.fail()) { // failbit or badbit is set [error happened] - is_.close(); - return false; // failure. - } else { - is_.clear(); // Clear any failure bits (e.g. eof). - return true; // success. - } - } - - // This Open routine is unusual in that it is designed to work even - // if it was already open. This for efficiency when seeking multiple - // times. - virtual bool Open(const std::string &rxfilename, bool binary) { - if (is_.is_open()) { - // We are opening when we have an already-open file. - // We may have to seek within this file, or else close it and - // open a different one. - std::string tmp_filename; - size_t offset; - SplitFilename(rxfilename, &tmp_filename, &offset); - if (tmp_filename == filename_ && binary == binary_) { // Just seek - is_.clear(); // clear fail bit, etc. - return Seek(offset); - } else { - is_.close(); // don't bother checking error status of is_. - filename_ = tmp_filename; - is_.open(MapOsPath(filename_).c_str(), - binary ? std::ios_base::in | std::ios_base::binary - : std::ios_base::in); - if (!is_.is_open()) - return false; - else - return Seek(offset); - } - } else { - size_t offset; - SplitFilename(rxfilename, &filename_, &offset); - binary_ = binary; - is_.open(MapOsPath(filename_).c_str(), - binary ? std::ios_base::in | std::ios_base::binary - : std::ios_base::in); - if (!is_.is_open()) - return false; - else - return Seek(offset); - } - } - - virtual std::istream &Stream() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Stream(), file is not open."; - // I believe this error can only arise from coding error. - return is_; - } - - virtual int32 Close() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Close(), file is not open."; - // I believe this error can only arise from coding error. - is_.close(); - // Don't check status. - return 0; - } - - virtual InputType MyType() { return kOffsetFileInput; } - - virtual ~OffsetFileInputImpl() { - // Stream will automatically be closed, and we don't care about - // whether it fails. - } - - private: - std::string filename_; // the actual filename - bool binary_; // true if was opened in binary mode. - std::ifstream is_; -}; - -Output::Output(const std::string &wxfilename, bool binary, bool write_header) - : impl_(NULL) { - if (!Open(wxfilename, binary, write_header)) { - if (impl_) { - delete impl_; - impl_ = NULL; - } - KALDI_ERR << "Error opening output stream " - << PrintableWxfilename(wxfilename); - } -} - -bool Output::Close() { - if (!impl_) { - return false; // error to call Close if not open. 
- } else { - bool ans = impl_->Close(); - delete impl_; - impl_ = NULL; - return ans; - } -} - -Output::~Output() { - if (impl_) { - bool ok = impl_->Close(); - delete impl_; - impl_ = NULL; - if (!ok) - KALDI_ERR << "Error closing output file " - << PrintableWxfilename(filename_) - << (ClassifyWxfilename(filename_) == kFileOutput - ? " (disk full?)" - : ""); - } -} - -std::ostream &Output::Stream() { // will throw if not open; else returns - // stream. - if (!impl_) KALDI_ERR << "Output::Stream() called but not open."; - return impl_->Stream(); -} - -bool Output::Open(const std::string &wxfn, bool binary, bool header) { - if (IsOpen()) { - if (!Close()) { // Throw here rather than return status, as it's an error - // about something else: if the user wanted to avoid the exception he/she - // could have called Close(). - KALDI_ERR << "Output::Open(), failed to close output stream: " - << PrintableWxfilename(filename_); - } - } - - filename_ = wxfn; - - OutputType type = ClassifyWxfilename(wxfn); - KALDI_ASSERT(impl_ == NULL); - - if (type == kFileOutput) { - impl_ = new FileOutputImpl(); - } else if (type == kStandardOutput) { - impl_ = new StandardOutputImpl(); - } else if (type == kPipeOutput) { - impl_ = new PipeOutputImpl(); - } else { // type == kNoOutput - KALDI_WARN << "Invalid output filename format " - << PrintableWxfilename(wxfn); - return false; - } - if (!impl_->Open(wxfn, binary)) { - delete impl_; - impl_ = NULL; - return false; // failed to open. - } else { // successfully opened it. - if (header) { - InitKaldiOutputStream(impl_->Stream(), binary); - bool ok = impl_->Stream().good(); // still OK? - if (!ok) { - delete impl_; - impl_ = NULL; - return false; - } - return true; - } else { - return true; - } - } -} - -Input::Input(const std::string &rxfilename, bool *binary) : impl_(NULL) { - if (!Open(rxfilename, binary)) { - KALDI_ERR << "Error opening input stream " - << PrintableRxfilename(rxfilename); - } -} - -int32 Input::Close() { - if (impl_) { - int32 ans = impl_->Close(); - delete impl_; - impl_ = NULL; - return ans; - } else { - return 0; - } -} - -bool Input::OpenInternal(const std::string &rxfilename, bool file_binary, - bool *contents_binary) { - InputType type = ClassifyRxfilename(rxfilename); - if (IsOpen()) { - // May have to close the stream first. - if (type == kOffsetFileInput && impl_->MyType() == kOffsetFileInput) { - // We want to use the same object to Open... this is in case - // the files are the same, so we can just seek. - if (!impl_->Open(rxfilename, file_binary)) { // true is binary mode-- - // always open in binary. - delete impl_; - impl_ = NULL; - return false; - } - // read the binary header, if requested. - if (contents_binary != NULL) - return InitKaldiInputStream(impl_->Stream(), contents_binary); - else - return true; - } else { - Close(); - // and fall through to code below which actually opens the file. - } - } - if (type == kFileInput) { - impl_ = new FileInputImpl(); - } else if (type == kStandardInput) { - impl_ = new StandardInputImpl(); - } else if (type == kPipeInput) { - impl_ = new PipeInputImpl(); - } else if (type == kOffsetFileInput) { - impl_ = new OffsetFileInputImpl(); - } else { // type == kNoInput - KALDI_WARN << "Invalid input filename format " - << PrintableRxfilename(rxfilename); - return false; - } - if (!impl_->Open(rxfilename, file_binary)) { // true is binary mode-- - // always read in binary. 
- delete impl_; - impl_ = NULL; - return false; - } - if (contents_binary != NULL) - return InitKaldiInputStream(impl_->Stream(), contents_binary); - else - return true; -} - -Input::~Input() { - if (impl_) Close(); -} - -std::istream &Input::Stream() { - if (!IsOpen()) KALDI_ERR << "Input::Stream(), not open."; - return impl_->Stream(); -} - -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m) { -// if (!filename.empty() && filename[filename.size() - 1] == ']') { -// // This filename seems to have a 'range'... like foo.ark:4312423[20:30]. -// // (the bit in square brackets is the range). -// std::string rxfilename, range; -// if (!ExtractRangeSpecifier(filename, &rxfilename, &range)) { -// KALDI_ERR << "Could not make sense of possible range specifier in -// filename " -// << "while reading matrix: " << filename; -// } -// Matrix temp; -// bool binary_in; -// Input ki(rxfilename, &binary_in); -// temp.Read(ki.Stream(), binary_in); -// if (!ExtractObjectRange(temp, range, m)) { -// KALDI_ERR << "Error extracting range of object: " << filename; -// } -// } else { -// // The normal case, there is no range. -// bool binary_in; -// Input ki(filename, &binary_in); -// m->Read(ki.Stream(), binary_in); -// } -// } -// -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m) { -// if (!filename.empty() && filename[filename.size() - 1] == ']') { -// // This filename seems to have a 'range'... like foo.ark:4312423[20:30]. -// // (the bit in square brackets is the range). -// std::string rxfilename, range; -// if (!ExtractRangeSpecifier(filename, &rxfilename, &range)) { -// KALDI_ERR << "Could not make sense of possible range specifier in -// filename " -// << "while reading matrix: " << filename; -// } -// Matrix temp; -// bool binary_in; -// Input ki(rxfilename, &binary_in); -// temp.Read(ki.Stream(), binary_in); -// if (!ExtractObjectRange(temp, range, m)) { -// KALDI_ERR << "Error extracting range of object: " << filename; -// } -// } else { -// // The normal case, there is no range. -// bool binary_in; -// Input ki(filename, &binary_in); -// m->Read(ki.Stream(), binary_in); -// } -// } - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/kaldi-io.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/kaldi-io.h deleted file mode 100644 index 2175ca8f89ed5f3e3bade26528e924208df692c6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/kaldi-io.h +++ /dev/null @@ -1,266 +0,0 @@ -// util/kaldi-io.h - -// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
-#ifndef KALDI_UTIL_KALDI_IO_H_ -#define KALDI_UTIL_KALDI_IO_H_ - -#ifdef _MSC_VER -#include -#include -#endif -#include // For isspace. -#include -#include -#include "base/kaldi-common.h" -// #include "matrix/kaldi-matrix.h" - -namespace kaldi { - -class OutputImplBase; // Forward decl; defined in a .cc file -class InputImplBase; // Forward decl; defined in a .cc file - -/// \addtogroup io_group -/// @{ - -// The Output and Input classes handle stream-opening for "extended" filenames -// that include actual files, standard-input/standard-output, pipes, and -// offsets into actual files. They also handle reading and writing the -// binary-mode headers for Kaldi files, where applicable. The classes have -// versions of the Open routines that throw and do not throw, depending whether -// the calling code wants to catch the errors or not; there are also versions -// that write (or do not write) the Kaldi binary-mode header that says if it's -// binary mode. Generally files that contain Kaldi objects will have the header -// on, so we know upon reading them whether they have the header. So you would -// use the OpenWithHeader routines for these (or the constructor); but other -// types of objects (e.g. FSTs) would have files without a header so you would -// use OpenNoHeader. - -// We now document the types of extended filenames that we use. -// -// A "wxfilename" is an extended filename for writing. It can take three forms: -// (1) Filename: e.g. "/some/filename", "./a/b/c", "c:\Users\dpovey\My -// Documents\\boo" -// (whatever the actual file-system interprets) -// (2) Standard output: "" or "-" -// (3) A pipe: e.g. "| gzip -c > /tmp/abc.gz" -// -// -// A "rxfilename" is an extended filename for reading. It can take four forms: -// (1) An actual filename, whatever the file-system can read, e.g. "/my/file". -// (2) Standard input: "" or "-" -// (3) A pipe: e.g. "gunzip -c /tmp/abc.gz |" -// (4) An offset into a file, e.g.: "/mnt/blah/data/1.ark:24871" -// [these are created by the Table and TableWriter classes; I may also write -// a program that creates them for arbitrary files] -// - -// Typical usage: -// ... -// bool binary; -// MyObject.Write(Output(some_filename, binary).Stream(), binary); -// -// ... more extensive example: -// { -// Output ko(some_filename, binary); -// MyObject1.Write(ko.Stream(), binary); -// MyObject2.Write(ko.Stream(), binary); -// } - -enum OutputType { kNoOutput, kFileOutput, kStandardOutput, kPipeOutput }; - -/// ClassifyWxfilename interprets filenames as follows: -/// - kNoOutput: invalid filenames (leading or trailing space, things that look -/// like wspecifiers and rspecifiers or like pipes to read from with leading -/// |. -/// - kFileOutput: Normal filenames -/// - kStandardOutput: The empty string or "-", interpreted as standard output -/// - kPipeOutput: pipes, e.g. "| gzip -c > /tmp/abc.gz" -OutputType ClassifyWxfilename(const std::string &wxfilename); - -enum InputType { - kNoInput, - kFileInput, - kStandardInput, - kOffsetFileInput, - kPipeInput -}; - -/// ClassifyRxfilenames interprets filenames for reading as follows: -/// - kNoInput: invalid filenames (leading or trailing space, things that -/// look like wspecifiers and rspecifiers or pipes to write to -/// with trailing |. -/// - kFileInput: normal filenames -/// - kStandardInput: the empty string or "-" -/// - kPipeInput: e.g. "gunzip -c /tmp/abc.gz |" -/// - kOffsetFileInput: offsets into files, e.g. 
/some/filename:12970 -InputType ClassifyRxfilename(const std::string &rxfilename); - -class Output { - public: - // The normal constructor, provided for convenience. - // Equivalent to calling with default constructor then Open() - // with these arguments. - Output(const std::string &filename, bool binary, bool write_header = true); - - Output() : impl_(NULL) {} - - /// This opens the stream, with the given mode (binary or text). It returns - /// true on success and false on failure. However, it will throw if something - /// was already open and could not be closed (to avoid this, call Close() - /// first. if write_header == true and binary == true, it writes the Kaldi - /// binary-mode header ('\0' then 'B'). You may call Open even if it is - /// already open; it will close the existing stream and reopen (however if - /// closing the old stream failed it will throw). - bool Open(const std::string &wxfilename, bool binary, bool write_header); - - inline bool IsOpen(); // return true if we have an open stream. Does not - // imply stream is good for writing. - - std::ostream &Stream(); // will throw if not open; else returns stream. - - // Close closes the stream. Calling Close is never necessary unless you - // want to avoid exceptions being thrown. There are times when calling - // Close will hurt efficiency (basically, when using offsets into files, - // and using the same Input object), - // but most of the time the user won't be doing this directly, it will - // be done in kaldi-table.{h, cc}, so you don't have to worry about it. - bool Close(); - - // This will throw if stream could not be closed (to check error status, - // call Close()). - ~Output(); - - private: - OutputImplBase *impl_; // non-NULL if open. - std::string filename_; - KALDI_DISALLOW_COPY_AND_ASSIGN(Output); -}; - -// bool binary_in; -// Input ki(some_filename, &binary_in); -// MyObject.Read(ki.Stream(), binary_in); -// -// ... more extensive example: -// -// { -// bool binary_in; -// Input ki(some_filename, &binary_in); -// MyObject1.Read(ki.Stream(), &binary_in); -// MyObject2.Write(ki.Stream(), &binary_in); -// } -// Note that to catch errors you need to use try.. catch. -// Input communicates errors by throwing exceptions. - -// Input interprets four kinds of filenames: -// (1) Normal filenames -// (2) The empty string or "-", interpreted as standard output -// (3) A pipe: e.g. "gunzip -c /tmp/abc.gz |" -// (4) Offsets into [real] files, e.g. "/my/filename:12049" -// The last one has no correspondence in Output. - -class Input { - public: - /// The normal constructor. Opens the stream in binary mode. - /// Equivalent to calling the default constructor followed by Open(); then, if - /// binary != NULL, it calls ReadHeader(), putting the output in "binary"; it - /// throws on error. - explicit Input(const std::string &rxfilename, bool *contents_binary = NULL); - - Input() : impl_(NULL) {} - - // Open opens the stream for reading (the mode, where relevant, is binary; use - // OpenTextMode for text-mode, we made this a separate function rather than a - // boolean argument, to avoid confusion with Kaldi's text/binary distinction, - // since reading in the file system's text mode is unusual.) If - // contents_binary != NULL, it reads the binary-mode header and puts it in the - // "binary" variable. Returns true on success. If it returns false it will - // not be open. 
You may call Open even if it is already open; it will close - // the existing stream and reopen (however if closing the old stream failed it - // will throw). - inline bool Open(const std::string &rxfilename, bool *contents_binary = NULL); - - // As Open but (if the file system has text/binary modes) opens in text mode; - // you shouldn't ever have to use this as in Kaldi we read even text files in - // binary mode (and ignore the \r). - inline bool OpenTextMode(const std::string &rxfilename); - - // Return true if currently open for reading and Stream() will - // succeed. Does not guarantee that the stream is good. - inline bool IsOpen(); - - // It is never necessary or helpful to call Close, except if - // you are concerned about to many filehandles being open. - // Close does not throw. It returns the exit code as int32 - // in the case of a pipe [kPipeInput], and always zero otherwise. - int32 Close(); - - // Returns the underlying stream. Throws if !IsOpen() - std::istream &Stream(); - - // Destructor does not throw: input streams may legitimately fail so we - // don't worry about the status when we close them. - ~Input(); - - private: - bool OpenInternal(const std::string &rxfilename, bool file_binary, - bool *contents_binary); - InputImplBase *impl_; - KALDI_DISALLOW_COPY_AND_ASSIGN(Input); -}; - -template -void ReadKaldiObject(const std::string &filename, C *c) { - bool binary_in; - Input ki(filename, &binary_in); - c->Read(ki.Stream(), binary_in); -} - -// Specialize the template for reading matrices, because we want to be able to -// support reading 'ranges' (row and column ranges), like foo.mat[10:20]. -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m); -// -// -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m); - -template -inline void WriteKaldiObject(const C &c, const std::string &filename, - bool binary) { - Output ko(filename, binary); - c.Write(ko.Stream(), binary); -} - -/// PrintableRxfilename turns the rxfilename into a more human-readable -/// form for error reporting, i.e. it does quoting and escaping and -/// replaces "" or "-" with "standard input". -std::string PrintableRxfilename(const std::string &rxfilename); - -/// PrintableWxfilename turns the wxfilename into a more human-readable -/// form for error reporting, i.e. it does quoting and escaping and -/// replaces "" or "-" with "standard output". -std::string PrintableWxfilename(const std::string &wxfilename); - -/// @} - -} // end namespace kaldi. - -#include "util/kaldi-io-inl.h" - -#endif // KALDI_UTIL_KALDI_IO_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/kaldi-pipebuf.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/kaldi-pipebuf.h deleted file mode 100644 index bcee80ccb1a6fa8ce3195483ac144c5ff66d2f89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/kaldi-pipebuf.h +++ /dev/null @@ -1,86 +0,0 @@ -// util/kaldi-pipebuf.h - -// Copyright 2009-2011 Ondrej Glembek - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -/** @file kaldi-pipebuf.h - * This is an Kaldi C++ Library header. - */ - -#ifndef KALDI_UTIL_KALDI_PIPEBUF_H_ -#define KALDI_UTIL_KALDI_PIPEBUF_H_ - -#include -#if !defined(_LIBCPP_VERSION) // libc++ -#include -#else -#include "util/basic-filebuf.h" -#endif - -namespace kaldi { -// This class provides a way to initialize a filebuf with a FILE* pointer -// directly; it will not close the file pointer when it is deleted. -// The C++ standard does not allow implementations of C++ to provide -// this constructor within basic_filebuf, which makes it hard to deal -// with pipes using completely native C++. This is a workaround - -#ifdef _MSC_VER -#elif defined(_LIBCPP_VERSION) // libc++ -template > -class basic_pipebuf : public basic_filebuf { - public: - typedef basic_pipebuf ThisType; - - public: - basic_pipebuf(FILE *fptr, std::ios_base::openmode mode) - : basic_filebuf() { - this->open(fptr, mode); - if (!this->is_open()) { - KALDI_WARN << "Error initializing pipebuf"; // probably indicates - // code error, if the fptr was good. - return; - } - } -}; // class basic_pipebuf -#else -template > -class basic_pipebuf : public std::basic_filebuf { - public: - typedef basic_pipebuf ThisType; - - public: - basic_pipebuf(FILE *fptr, std::ios_base::openmode mode) - : std::basic_filebuf() { - this->_M_file.sys_open(fptr, mode); - if (!this->is_open()) { - KALDI_WARN << "Error initializing pipebuf"; // probably indicates - // code error, if the fptr was good. - return; - } - this->_M_mode = mode; - this->_M_buf_size = BUFSIZ; - this->_M_allocate_internal_buffer(); - this->_M_reading = false; - this->_M_writing = false; - this->_M_set_buffer(-1); - } -}; // class basic_pipebuf -#endif // _MSC_VER - -} // namespace kaldi - -#endif // KALDI_UTIL_KALDI_PIPEBUF_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/parse-options.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/parse-options.cc deleted file mode 100644 index 1f2ef844d28d67ed58d2e0c9d7c7b674e8209df8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/parse-options.cc +++ /dev/null @@ -1,636 +0,0 @@ -// util/parse-options.cc - -// Copyright 2009-2011 Karel Vesely; Microsoft Corporation; -// Saarland University (Author: Arnab Ghoshal); -// Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey); -// Frantisek Skala; Arnab Ghoshal -// Copyright 2013 Tanel Alumae -// -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include - -#include "base/kaldi-common.h" -#include "util/parse-options.h" -#include "util/text-utils.h" - -namespace kaldi { - -ParseOptions::ParseOptions(const std::string &prefix, OptionsItf *other) - : print_args_(false), help_(false), usage_(""), argc_(0), argv_(NULL) { - ParseOptions *po = dynamic_cast(other); - if (po != NULL && po->other_parser_ != NULL) { - // we get here if this constructor is used twice, recursively. - other_parser_ = po->other_parser_; - } else { - other_parser_ = other; - } - if (po != NULL && po->prefix_ != "") { - prefix_ = po->prefix_ + std::string(".") + prefix; - } else { - prefix_ = prefix; - } -} - -void ParseOptions::Register(const std::string &name, bool *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, int32 *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, uint32 *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, float *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, double *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, std::string *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -// old-style, used for registering application-specific parameters -template -void ParseOptions::RegisterTmpl(const std::string &name, T *ptr, - const std::string &doc) { - if (other_parser_ == NULL) { - this->RegisterCommon(name, ptr, doc, false); - } else { - KALDI_ASSERT(prefix_ != "" && - "Cannot use empty prefix when registering with prefix."); - std::string new_name = prefix_ + '.' + name; // name becomes prefix.name - other_parser_->Register(new_name, ptr, doc); - } -} - -// does the common part of the job of registering a parameter -template -void ParseOptions::RegisterCommon(const std::string &name, T *ptr, - const std::string &doc, bool is_standard) { - KALDI_ASSERT(ptr != NULL); - std::string idx = name; - NormalizeArgName(&idx); - if (doc_map_.find(idx) != doc_map_.end()) - KALDI_WARN << "Registering option twice, ignoring second time: " << name; - this->RegisterSpecific(name, idx, ptr, doc, is_standard); -} - -// used to register standard parameters (those that are present in all of the -// applications) -template -void ParseOptions::RegisterStandard(const std::string &name, T *ptr, - const std::string &doc) { - this->RegisterCommon(name, ptr, doc, true); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, bool *b, - const std::string &doc, bool is_standard) { - bool_map_[idx] = b; - doc_map_[idx] = - DocInfo(name, doc + " (bool, default = " + ((*b) ? 
"true)" : "false)"), - is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, int32 *i, - const std::string &doc, bool is_standard) { - int_map_[idx] = i; - std::ostringstream ss; - ss << doc << " (int, default = " << *i << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, uint32 *u, - const std::string &doc, bool is_standard) { - uint_map_[idx] = u; - std::ostringstream ss; - ss << doc << " (uint, default = " << *u << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, float *f, - const std::string &doc, bool is_standard) { - float_map_[idx] = f; - std::ostringstream ss; - ss << doc << " (float, default = " << *f << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, double *f, - const std::string &doc, bool is_standard) { - double_map_[idx] = f; - std::ostringstream ss; - ss << doc << " (double, default = " << *f << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, std::string *s, - const std::string &doc, bool is_standard) { - string_map_[idx] = s; - doc_map_[idx] = - DocInfo(name, doc + " (string, default = \"" + *s + "\")", is_standard); -} -void ParseOptions::DisableOption(const std::string &name) { - if (argv_ != NULL) - KALDI_ERR << "DisableOption must not be called after calling Read()."; - if (doc_map_.erase(name) == 0) - KALDI_ERR << "Option " << name - << " was not registered so cannot be disabled: "; - bool_map_.erase(name); - int_map_.erase(name); - uint_map_.erase(name); - float_map_.erase(name); - double_map_.erase(name); - string_map_.erase(name); -} - -int ParseOptions::NumArgs() const { return positional_args_.size(); } - -std::string ParseOptions::GetArg(int i) const { - // use KALDI_ERR if code error - if (i < 1 || i > static_cast(positional_args_.size())) - KALDI_ERR << "ParseOptions::GetArg, invalid index " << i; - return positional_args_[i - 1]; -} - -// We currently do not support any other options. -enum ShellType { kBash = 0 }; - -// This can be changed in the code if it ever does need to be changed (as it's -// unlikely that one compilation of this tool-set would use both shells). -static ShellType kShellType = kBash; - -// Returns true if we need to escape a string before putting it into -// a shell (mainly thinking of bash shell, but should work for others) -// This is for the convenience of the user so command-lines that are -// printed out by ParseOptions::Read (with --print-args=true) are -// paste-able into the shell and will run. If you use a different type of -// shell, it might be necessary to change this function. -// But it's mostly a cosmetic issue as it basically affects how -// the program echoes its command-line arguments to the screen. -static bool MustBeQuoted(const std::string &str, ShellType st) { - // Only Bash is supported (for the moment). - KALDI_ASSERT(st == kBash && "Invalid shell type."); - - const char *c = str.c_str(); - if (*c == '\0') { - return true; // Must quote empty string - } else { - const char *ok_chars[2]; - - // These seem not to be interpreted as long as there are no other "bad" - // characters involved (e.g. 
"," would be interpreted as part of something - // like a{b,c}, but not on its own. - ok_chars[kBash] = "[]~#^_-+=:.,/"; - - // Just want to make sure that a space character doesn't get automatically - // inserted here via an automated style-checking script, like it did before. - KALDI_ASSERT(!strchr(ok_chars[kBash], ' ')); - - for (; *c != '\0'; c++) { - // For non-alphanumeric characters we have a list of characters which - // are OK. All others are forbidden (this is easier since the shell - // interprets most non-alphanumeric characters). - if (!isalnum(*c)) { - const char *d; - for (d = ok_chars[st]; *d != '\0'; d++) - if (*c == *d) break; - // If not alphanumeric or one of the "ok_chars", it must be escaped. - if (*d == '\0') return true; - } - } - return false; // The string was OK. No quoting or escaping. - } -} - -// Returns a quoted and escaped version of "str" -// which has previously been determined to need escaping. -// Our aim is to print out the command line in such a way that if it's -// pasted into a shell of ShellType "st" (only bash for now), it -// will get passed to the program in the same way. -static std::string QuoteAndEscape(const std::string &str, ShellType st) { - // Only Bash is supported (for the moment). - KALDI_ASSERT(st == kBash && "Invalid shell type."); - - // For now we use the following rules: - // In the normal case, we quote with single-quote "'", and to escape - // a single-quote we use the string: '\'' (interpreted as closing the - // single-quote, putting an escaped single-quote from the shell, and - // then reopening the single quote). - char quote_char = '\''; - const char *escape_str = "'\\''"; // e.g. echo 'a'\''b' returns a'b - - // If the string contains single-quotes that would need escaping this - // way, and we determine that the string could be safely double-quoted - // without requiring any escaping, then we double-quote the string. - // This is the case if the characters "`$\ do not appear in the string. - // e.g. see http://www.redhat.com/mirrors/LDP/LDP/abs/html/quotingvar.html - const char *c_str = str.c_str(); - if (strchr(c_str, '\'') && !strpbrk(c_str, "\"`$\\")) { - quote_char = '"'; - escape_str = "\\\""; // should never be accessed. - } - - char buf[2]; - buf[1] = '\0'; - - buf[0] = quote_char; - std::string ans = buf; - const char *c = str.c_str(); - for (; *c != '\0'; c++) { - if (*c == quote_char) { - ans += escape_str; - } else { - buf[0] = *c; - ans += buf; - } - } - buf[0] = quote_char; - ans += buf; - return ans; -} - -// static function -std::string ParseOptions::Escape(const std::string &str) { - return MustBeQuoted(str, kShellType) ? QuoteAndEscape(str, kShellType) : str; -} - -int ParseOptions::Read(int argc, const char *const argv[]) { - argc_ = argc; - argv_ = argv; - std::string key, value; - int i; - if (argc > 0) { - // set global "const char*" g_program_name (name of the program) - // so it can be printed out in error messages; - // it's useful because often the stderr of different programs will - // be mixed together in the same log file. -#ifdef _MSC_VER - const char *c = strrchr(argv[0], '\\'); -#else - const char *c = strrchr(argv[0], '/'); -#endif - SetProgramName(c == NULL ? 
argv[0] : c + 1); - } - // first pass: look for config parameter, look for priority - for (i = 1; i < argc; i++) { - if (std::strncmp(argv[i], "--", 2) == 0) { - if (std::strcmp(argv[i], "--") == 0) { - // a lone "--" marks the end of named options - break; - } - bool has_equal_sign; - SplitLongArg(argv[i], &key, &value, &has_equal_sign); - NormalizeArgName(&key); - Trim(&value); - if (key.compare("config") == 0) { - ReadConfigFile(value); - } - if (key.compare("help") == 0) { - PrintUsage(); - exit(0); - } - } - } - bool double_dash_seen = false; - // second pass: add the command line options - for (i = 1; i < argc; i++) { - if (std::strncmp(argv[i], "--", 2) == 0) { - if (std::strcmp(argv[i], "--") == 0) { - // A lone "--" marks the end of named options. - // Skip that option and break the processing of named options - i += 1; - double_dash_seen = true; - break; - } - bool has_equal_sign; - SplitLongArg(argv[i], &key, &value, &has_equal_sign); - NormalizeArgName(&key); - Trim(&value); - if (!SetOption(key, value, has_equal_sign)) { - PrintUsage(true); - KALDI_ERR << "Invalid option " << argv[i]; - } - } else { - break; - } - } - - // process remaining arguments as positional - for (; i < argc; i++) { - if ((std::strcmp(argv[i], "--") == 0) && !double_dash_seen) { - double_dash_seen = true; - } else { - positional_args_.push_back(std::string(argv[i])); - } - } - - // if the user did not suppress this with --print-args = false.... - if (print_args_) { - std::ostringstream strm; - for (int j = 0; j < argc; j++) strm << Escape(argv[j]) << " "; - strm << '\n'; - std::cerr << strm.str() << std::flush; - } - return i; -} - -void ParseOptions::PrintUsage(bool print_command_line) { - std::cerr << '\n' << usage_ << '\n'; - DocMapType::iterator it; - // first we print application-specific options - bool app_specific_header_printed = false; - for (it = doc_map_.begin(); it != doc_map_.end(); ++it) { - if (it->second.is_standard_ == false) { // application-specific option - if (app_specific_header_printed == false) { // header was not yet printed - std::cerr << "Options:" << '\n'; - app_specific_header_printed = true; - } - std::cerr << " --" << std::setw(25) << std::left << it->second.name_ - << " : " << it->second.use_msg_ << '\n'; - } - } - if (app_specific_header_printed == true) { - std::cerr << '\n'; - } - - // then the standard options - std::cerr << "Standard options:" << '\n'; - for (it = doc_map_.begin(); it != doc_map_.end(); ++it) { - if (it->second.is_standard_ == true) { // we have standard option - std::cerr << " --" << std::setw(25) << std::left << it->second.name_ - << " : " << it->second.use_msg_ << '\n'; - } - } - std::cerr << '\n'; - if (print_command_line) { - std::ostringstream strm; - strm << "Command line was: "; - for (int j = 0; j < argc_; j++) strm << Escape(argv_[j]) << " "; - strm << '\n'; - std::cerr << strm.str() << std::flush; - } -} - -void ParseOptions::PrintConfig(std::ostream &os) { - os << '\n' << "[[ Configuration of UI-Registered options ]]" << '\n'; - std::string key; - DocMapType::iterator it; - for (it = doc_map_.begin(); it != doc_map_.end(); ++it) { - key = it->first; - os << it->second.name_ << " = "; - if (bool_map_.end() != bool_map_.find(key)) { - os << (*bool_map_[key] ? 
"true" : "false"); - } else if (int_map_.end() != int_map_.find(key)) { - os << (*int_map_[key]); - } else if (uint_map_.end() != uint_map_.find(key)) { - os << (*uint_map_[key]); - } else if (float_map_.end() != float_map_.find(key)) { - os << (*float_map_[key]); - } else if (double_map_.end() != double_map_.find(key)) { - os << (*double_map_[key]); - } else if (string_map_.end() != string_map_.find(key)) { - os << "'" << *string_map_[key] << "'"; - } else { - KALDI_ERR << "PrintConfig: unrecognized option " << key << "[code error]"; - } - os << '\n'; - } - os << '\n'; -} - -void ParseOptions::ReadConfigFile(const std::string &filename) { - std::ifstream is(filename.c_str(), std::ifstream::in); - if (!is.good()) { - KALDI_ERR << "Cannot open config file: " << filename; - } - - std::string line, key, value; - int32 line_number = 0; - while (std::getline(is, line)) { - line_number++; - // trim out the comments - size_t pos; - if ((pos = line.find_first_of('#')) != std::string::npos) { - line.erase(pos); - } - // skip empty lines - Trim(&line); - if (line.length() == 0) continue; - - if (line.substr(0, 2) != "--") { - KALDI_ERR << "Reading config file " << filename << ": line " - << line_number << " does not look like a line " - << "from a Kaldi command-line program's config file: should " - << "be of the form --x=y. Note: config files intended to " - << "be sourced by shell scripts lack the '--'."; - } - - // parse option - bool has_equal_sign; - SplitLongArg(line, &key, &value, &has_equal_sign); - NormalizeArgName(&key); - Trim(&value); - if (!SetOption(key, value, has_equal_sign)) { - PrintUsage(true); - KALDI_ERR << "Invalid option " << line << " in config file " << filename; - } - } -} - -void ParseOptions::SplitLongArg(const std::string &in, std::string *key, - std::string *value, bool *has_equal_sign) { - KALDI_ASSERT(in.substr(0, 2) == "--"); // precondition. - size_t pos = in.find_first_of('=', 0); - if (pos == std::string::npos) { // we allow --option for bools - // defaults to empty. We handle this differently in different cases. - *key = in.substr(2, in.size() - 2); // 2 because starts with --. - *value = ""; - *has_equal_sign = false; - } else if (pos == 2) { // we also don't allow empty keys: --=value - PrintUsage(true); - KALDI_ERR << "Invalid option (no key): " << in; - } else { // normal case: --option=value - *key = in.substr(2, pos - 2); // 2 because starts with --. 
- *value = in.substr(pos + 1); - *has_equal_sign = true; - } -} - -void ParseOptions::NormalizeArgName(std::string *str) { - std::string out; - std::string::iterator it; - - for (it = str->begin(); it != str->end(); ++it) { - if (*it == '_') - out += '-'; // convert _ to - - else - out += std::tolower(*it); - } - *str = out; - - KALDI_ASSERT(str->length() > 0); -} - -bool ParseOptions::SetOption(const std::string &key, const std::string &value, - bool has_equal_sign) { - if (bool_map_.end() != bool_map_.find(key)) { - if (has_equal_sign && value == "") - KALDI_ERR << "Invalid option --" << key << "="; - *(bool_map_[key]) = ToBool(value); - } else if (int_map_.end() != int_map_.find(key)) { - *(int_map_[key]) = ToInt(value); - } else if (uint_map_.end() != uint_map_.find(key)) { - *(uint_map_[key]) = ToUint(value); - } else if (float_map_.end() != float_map_.find(key)) { - *(float_map_[key]) = ToFloat(value); - } else if (double_map_.end() != double_map_.find(key)) { - *(double_map_[key]) = ToDouble(value); - } else if (string_map_.end() != string_map_.find(key)) { - if (!has_equal_sign) - KALDI_ERR << "Invalid option --" << key << " (option format is --x=y)."; - *(string_map_[key]) = value; - } else { - return false; - } - return true; -} - -bool ParseOptions::ToBool(std::string str) { - std::transform(str.begin(), str.end(), str.begin(), ::tolower); - - // allow "" as a valid option for "true", so that --x is the same as --x=true - if ((str.compare("true") == 0) || (str.compare("t") == 0) || - (str.compare("1") == 0) || (str.compare("") == 0)) { - return true; - } - if ((str.compare("false") == 0) || (str.compare("f") == 0) || - (str.compare("0") == 0)) { - return false; - } - // if it is neither true nor false: - PrintUsage(true); - KALDI_ERR << "Invalid format for boolean argument [expected true or false]: " - << str; - return false; // never reached -} - -int32 ParseOptions::ToInt(const std::string &str) { - int32 ret; - if (!ConvertStringToInteger(str, &ret)) - KALDI_ERR << "Invalid integer option \"" << str << "\""; - return ret; -} - -uint32 ParseOptions::ToUint(const std::string &str) { - uint32 ret; - if (!ConvertStringToInteger(str, &ret)) - KALDI_ERR << "Invalid integer option \"" << str << "\""; - return ret; -} - -float ParseOptions::ToFloat(const std::string &str) { - float ret; - if (!ConvertStringToReal(str, &ret)) - KALDI_ERR << "Invalid floating-point option \"" << str << "\""; - return ret; -} - -double ParseOptions::ToDouble(const std::string &str) { - double ret; - if (!ConvertStringToReal(str, &ret)) - KALDI_ERR << "Invalid floating-point option \"" << str << "\""; - return ret; -} - -// instantiate templates -template void ParseOptions::RegisterTmpl(const std::string &name, bool *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, int32 *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, uint32 *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, float *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, double *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, - std::string *ptr, - const std::string &doc); - -template void ParseOptions::RegisterStandard(const std::string &name, bool *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - int32 *ptr, - const std::string &doc); 
-template void ParseOptions::RegisterStandard(const std::string &name, - uint32 *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - float *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - double *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - std::string *ptr, - const std::string &doc); - -template void ParseOptions::RegisterCommon(const std::string &name, bool *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, int32 *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, uint32 *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, float *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, double *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, - std::string *ptr, - const std::string &doc, - bool is_standard); - -} // namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/parse-options.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/parse-options.h deleted file mode 100644 index 93a060f4a411dfd63298a91bb313e0b66d337a75..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/parse-options.h +++ /dev/null @@ -1,265 +0,0 @@ -// util/parse-options.h - -// Copyright 2009-2011 Karel Vesely; Microsoft Corporation; -// Saarland University (Author: Arnab Ghoshal); -// Copyright 2012-2013 Frantisek Skala; Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_PARSE_OPTIONS_H_ -#define KALDI_UTIL_PARSE_OPTIONS_H_ - -#include -#include -#include - -#include "base/kaldi-common.h" -#include "itf/options-itf.h" - -namespace kaldi { - -/// The class ParseOptions is for parsing command-line options; see -/// \ref parse_options for more documentation. -class ParseOptions : public OptionsItf { - public: - explicit ParseOptions(const char *usage) - : print_args_(true), - help_(false), - usage_(usage), - argc_(0), - argv_(NULL), - prefix_(""), - other_parser_(NULL) { -#if !defined(_MSC_VER) && \ - !defined(__CYGWIN__) // This is just a convenient place to set the stderr - // to line - setlinebuf(stderr); // buffering mode, since it's called at program start. -#endif // This helps ensure different programs' output is not mixed up. 
- RegisterStandard("config", &config_, - "Configuration file to read (this " - "option may be repeated)"); - RegisterStandard("print-args", &print_args_, - "Print the command line arguments (to stderr)"); - RegisterStandard("help", &help_, "Print out usage message"); - RegisterStandard("verbose", &g_kaldi_verbose_level, - "Verbose level (higher->more logging)"); - } - - /** - This is a constructor for the special case where some options are - registered with a prefix to avoid conflicts. The object thus created will - only be used temporarily to register an options class with the original - options parser (which is passed as the *other pointer) using the given - prefix. It should not be used for any other purpose, and the prefix must - not be the empty string. It seems to be the least bad way of implementing - options with prefixes at this point. - Example of usage is: - ParseOptions po; // original ParseOptions object - ParseOptions po_mfcc("mfcc", &po); // object with prefix. - MfccOptions mfcc_opts; - mfcc_opts.Register(&po_mfcc); - The options will now get registered as, e.g., --mfcc.frame-shift=10.0 - instead of just --frame-shift=10.0 - */ - ParseOptions(const std::string &prefix, OptionsItf *other); - - ~ParseOptions() {} - - // Methods from the interface - void Register(const std::string &name, bool *ptr, const std::string &doc); - void Register(const std::string &name, int32 *ptr, const std::string &doc); - void Register(const std::string &name, uint32 *ptr, const std::string &doc); - void Register(const std::string &name, float *ptr, const std::string &doc); - void Register(const std::string &name, double *ptr, const std::string &doc); - void Register(const std::string &name, std::string *ptr, - const std::string &doc); - - /// If called after registering an option and before calling - /// Read(), disables that option from being used. Will crash - /// at runtime if that option had not been registered. - void DisableOption(const std::string &name); - - /// This one is used for registering standard parameters of all the programs - template - void RegisterStandard(const std::string &name, T *ptr, - const std::string &doc); - - /** - Parses the command line options and fills the ParseOptions-registered - variables. This must be called after all the variables were registered!!! - - Initially the variables have implicit values, - then the config file values are set-up, - finally the command line values given. - Returns the first position in argv that was not used. - [typically not useful: use NumParams() and GetParam(). ] - */ - int Read(int argc, const char *const *argv); - - /// Prints the usage documentation [provided in the constructor]. - void PrintUsage(bool print_command_line = false); - /// Prints the actual configuration of all the registered variables - void PrintConfig(std::ostream &os); - - /// Reads the options values from a config file. Must be called after - /// registering all options. This is usually used internally after the - /// standard --config option is used, but it may also be called from a - /// program. - void ReadConfigFile(const std::string &filename); - - /// Number of positional parameters (c.f. argc-1). - int NumArgs() const; - - /// Returns one of the positional parameters; 1-based indexing for argc/argv - /// compatibility. Will crash if param is not >=1 and <=NumArgs(). - std::string GetArg(int param) const; - - std::string GetOptArg(int param) const { - return (param <= NumArgs() ? 
GetArg(param) : ""); - } - - /// The following function will return a possibly quoted and escaped - /// version of "str", according to the current shell. Currently - /// this is just hardwired to bash. It's useful for debug output. - static std::string Escape(const std::string &str); - - private: - /// Template to register various variable types, - /// used for program-specific parameters - template - void RegisterTmpl(const std::string &name, T *ptr, const std::string &doc); - - // Following functions do just the datatype-specific part of the job - /// Register boolean variable - void RegisterSpecific(const std::string &name, const std::string &idx, - bool *b, const std::string &doc, bool is_standard); - /// Register int32 variable - void RegisterSpecific(const std::string &name, const std::string &idx, - int32 *i, const std::string &doc, bool is_standard); - /// Register unsinged int32 variable - void RegisterSpecific(const std::string &name, const std::string &idx, - uint32 *u, const std::string &doc, bool is_standard); - /// Register float variable - void RegisterSpecific(const std::string &name, const std::string &idx, - float *f, const std::string &doc, bool is_standard); - /// Register double variable [useful as we change BaseFloat type]. - void RegisterSpecific(const std::string &name, const std::string &idx, - double *f, const std::string &doc, bool is_standard); - /// Register string variable - void RegisterSpecific(const std::string &name, const std::string &idx, - std::string *s, const std::string &doc, - bool is_standard); - - /// Does the actual job for both kinds of parameters - /// Does the common part of the job for all datatypes, - /// then calls RegisterSpecific - template - void RegisterCommon(const std::string &name, T *ptr, const std::string &doc, - bool is_standard); - - /// Set option with name "key" to "value"; will crash if can't do it. - /// "has_equal_sign" is used to allow --x for a boolean option x, - /// and --y=, for a string option y. - bool SetOption(const std::string &key, const std::string &value, - bool has_equal_sign); - - bool ToBool(std::string str); - int32 ToInt(const std::string &str); - uint32 ToUint(const std::string &str); - float ToFloat(const std::string &str); - double ToDouble(const std::string &str); - - // maps for option variables - std::map bool_map_; - std::map int_map_; - std::map uint_map_; - std::map float_map_; - std::map double_map_; - std::map string_map_; - - /** - Structure for options' documentation - */ - struct DocInfo { - DocInfo() {} - DocInfo(const std::string &name, const std::string &usemsg) - : name_(name), use_msg_(usemsg), is_standard_(false) {} - DocInfo(const std::string &name, const std::string &usemsg, - bool is_standard) - : name_(name), use_msg_(usemsg), is_standard_(is_standard) {} - - std::string name_; - std::string use_msg_; - bool is_standard_; - }; - typedef std::map DocMapType; - DocMapType doc_map_; ///< map for the documentation - - bool print_args_; ///< variable for the implicit --print-args parameter - bool help_; ///< variable for the implicit --help parameter - std::string config_; ///< variable for the implicit --config parameter - std::vector positional_args_; - const char *usage_; - int argc_; - const char *const *argv_; - - /// These members are not normally used. 
They are only used when the object - /// is constructed with a prefix - std::string prefix_; - OptionsItf *other_parser_; - - protected: - /// SplitLongArg parses an argument of the form --a=b, --a=, or --a, - /// and sets "has_equal_sign" to true if an equals-sign was parsed.. - /// this is needed in order to correctly allow --x for a boolean option - /// x, and --y= for a string option y, and to disallow --x= and --y. - void SplitLongArg(const std::string &in, std::string *key, std::string *value, - bool *has_equal_sign); - - void NormalizeArgName(std::string *str); -}; - -/// This template is provided for convenience in reading config classes from -/// files; this is not the standard way to read configuration options, but may -/// occasionally be needed. This function assumes the config has a function -/// "void Register(OptionsItf *opts)" which it can call to register the -/// ParseOptions object. -template -void ReadConfigFromFile(const std::string &config_filename, C *c) { - std::ostringstream usage_str; - usage_str << "Parsing config from " - << "from '" << config_filename << "'"; - ParseOptions po(usage_str.str().c_str()); - c->Register(&po); - po.ReadConfigFile(config_filename); -} - -/// This variant of the template ReadConfigFromFile is for if you need to read -/// two config classes from the same file. -template -void ReadConfigsFromFile(const std::string &conf, C1 *c1, C2 *c2) { - std::ostringstream usage_str; - usage_str << "Parsing config from " - << "from '" << conf << "'"; - ParseOptions po(usage_str.str().c_str()); - c1->Register(&po); - c2->Register(&po); - po.ReadConfigFile(conf); -} - -} // namespace kaldi - -#endif // KALDI_UTIL_PARSE_OPTIONS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/simple-io-funcs.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/simple-io-funcs.cc deleted file mode 100644 index 5ace601b6a2bb186dec78b0b25cb5a3227c48bc9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/simple-io-funcs.cc +++ /dev/null @@ -1,80 +0,0 @@ -// util/simple-io-funcs.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. -#include "util/simple-io-funcs.h" -#include "util/text-utils.h" - -namespace kaldi { - -bool WriteIntegerVectorSimple(const std::string &wxfilename, - const std::vector &list) { - kaldi::Output ko; - // false, false is: text-mode, no Kaldi header. 
- if (!ko.Open(wxfilename, false, false)) return false; - for (size_t i = 0; i < list.size(); i++) ko.Stream() << list[i] << '\n'; - return ko.Close(); -} - -bool ReadIntegerVectorSimple(const std::string &rxfilename, - std::vector *list) { - kaldi::Input ki; - if (!ki.OpenTextMode(rxfilename)) return false; - std::istream &is = ki.Stream(); - int32 i; - list->clear(); - while (!(is >> i).fail()) list->push_back(i); - is >> std::ws; - return is.eof(); // should be eof, or junk at end of file. -} - -bool WriteIntegerVectorVectorSimple( - const std::string &wxfilename, - const std::vector > &list) { - kaldi::Output ko; - // false, false is: text-mode, no Kaldi header. - if (!ko.Open(wxfilename, false, false)) return false; - std::ostream &os = ko.Stream(); - for (size_t i = 0; i < list.size(); i++) { - for (size_t j = 0; j < list[i].size(); j++) { - os << list[i][j]; - if (j + 1 < list[i].size()) os << ' '; - } - os << '\n'; - } - return ko.Close(); -} - -bool ReadIntegerVectorVectorSimple(const std::string &rxfilename, - std::vector > *list) { - kaldi::Input ki; - if (!ki.OpenTextMode(rxfilename)) return false; - std::istream &is = ki.Stream(); - list->clear(); - std::string line; - while (std::getline(is, line)) { - std::vector v; - if (!SplitStringToIntegers(line, " \t\r", true, &v)) { - list->clear(); - return false; - } - list->push_back(v); - } - return is.eof(); // if we're not at EOF, something weird happened. -} - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/simple-io-funcs.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/simple-io-funcs.h deleted file mode 100644 index 1ead12790ba9bd6a44ccdff855918270191b8ebd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/simple-io-funcs.h +++ /dev/null @@ -1,61 +0,0 @@ -// util/simple-io-funcs.h - -// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. -#ifndef KALDI_UTIL_SIMPLE_IO_FUNCS_H_ -#define KALDI_UTIL_SIMPLE_IO_FUNCS_H_ - -#include -#include -#include "util/kaldi-io.h" - -// This header contains some utilities for reading some common, simple text -// formats:integers in files, one per line, and integers in files, possibly -// multiple per line. these are not really fully native Kaldi formats; they are -// mostly for small files that might be generated by scripts, and can be read -// all at one time. for longer files of this type, we would probably use the -// Table code. - -namespace kaldi { - -/// WriteToList attempts to write this list of integers, one per line, -/// to the given file, in text format. -/// returns true if succeeded. 
-bool WriteIntegerVectorSimple(const std::string &wxfilename, - const std::vector &v); - -/// ReadFromList attempts to read this list of integers, one per line, -/// from the given file, in text format. -/// returns true if succeeded. -bool ReadIntegerVectorSimple(const std::string &rxfilename, - std::vector *v); - -// This is a file format like: -// 1 2 -// 3 -// -// 4 5 6 -// etc. -bool WriteIntegerVectorVectorSimple(const std::string &wxfilename, - const std::vector > &v); - -bool ReadIntegerVectorVectorSimple(const std::string &rxfilename, - std::vector > *v); - -} // end namespace kaldi. - -#endif // KALDI_UTIL_SIMPLE_IO_FUNCS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/stl-utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/stl-utils.h deleted file mode 100644 index 8a29cd582c77b3078277aa9713b8676032bbc5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/stl-utils.h +++ /dev/null @@ -1,310 +0,0 @@ -// util/stl-utils.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_STL_UTILS_H_ -#define KALDI_UTIL_STL_UTILS_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -using std::unordered_map; -using std::unordered_set; - -#include "base/kaldi-common.h" - -namespace kaldi { - -/// Sorts and uniq's (removes duplicates) from a vector. -template -inline void SortAndUniq(std::vector *vec) { - std::sort(vec->begin(), vec->end()); - vec->erase(std::unique(vec->begin(), vec->end()), vec->end()); -} - -/// Returns true if the vector is sorted. -template -inline bool IsSorted(const std::vector &vec) { - typename std::vector::const_iterator iter = vec.begin(), end = vec.end(); - if (iter == end) return true; - while (1) { - typename std::vector::const_iterator next_iter = iter; - ++next_iter; - if (next_iter == end) return true; // end of loop and nothing out of order - if (*next_iter < *iter) return false; - iter = next_iter; - } -} - -/// Returns true if the vector is sorted and contains each element -/// only once. -template -inline bool IsSortedAndUniq(const std::vector &vec) { - typename std::vector::const_iterator iter = vec.begin(), end = vec.end(); - if (iter == end) return true; - while (1) { - typename std::vector::const_iterator next_iter = iter; - ++next_iter; - if (next_iter == end) return true; // end of loop and nothing out of order - if (*next_iter <= *iter) return false; - iter = next_iter; - } -} - -/// Removes duplicate elements from a sorted list. -template -inline void Uniq(std::vector *vec) { // must be already sorted. 
- KALDI_PARANOID_ASSERT(IsSorted(*vec)); - KALDI_ASSERT(vec); - vec->erase(std::unique(vec->begin(), vec->end()), vec->end()); -} - -/// Copies the elements of a set to a vector. -template -void CopySetToVector(const std::set &s, std::vector *v) { - // copies members of s into v, in sorted order from lowest to highest - // (because the set was in sorted order). - KALDI_ASSERT(v != NULL); - v->resize(s.size()); - typename std::set::const_iterator siter = s.begin(), send = s.end(); - typename std::vector::iterator viter = v->begin(); - for (; siter != send; ++siter, ++viter) { - *viter = *siter; - } -} - -template -void CopySetToVector(const unordered_set &s, std::vector *v) { - KALDI_ASSERT(v != NULL); - v->resize(s.size()); - typename unordered_set::const_iterator siter = s.begin(), send = s.end(); - typename std::vector::iterator viter = v->begin(); - for (; siter != send; ++siter, ++viter) { - *viter = *siter; - } -} - -/// Copies the (key, value) pairs in a map to a vector of pairs. -template -void CopyMapToVector(const std::map &m, - std::vector > *v) { - KALDI_ASSERT(v != NULL); - v->resize(m.size()); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - typename std::vector >::iterator viter = v->begin(); - for (; miter != mend; ++miter, ++viter) { - *viter = std::make_pair(miter->first, miter->second); - // do it like this because of const casting. - } -} - -/// Copies the keys in a map to a vector. -template -void CopyMapKeysToVector(const std::map &m, std::vector *v) { - KALDI_ASSERT(v != NULL); - v->resize(m.size()); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - typename std::vector::iterator viter = v->begin(); - for (; miter != mend; ++miter, ++viter) { - *viter = miter->first; - } -} - -/// Copies the values in a map to a vector. -template -void CopyMapValuesToVector(const std::map &m, std::vector *v) { - KALDI_ASSERT(v != NULL); - v->resize(m.size()); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - typename std::vector::iterator viter = v->begin(); - for (; miter != mend; ++miter, ++viter) { - *viter = miter->second; - } -} - -/// Copies the keys in a map to a set. -template -void CopyMapKeysToSet(const std::map &m, std::set *s) { - KALDI_ASSERT(s != NULL); - s->clear(); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - for (; miter != mend; ++miter) { - s->insert(s->end(), miter->first); - } -} - -/// Copies the values in a map to a set. -template -void CopyMapValuesToSet(const std::map &m, std::set *s) { - KALDI_ASSERT(s != NULL); - s->clear(); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - for (; miter != mend; ++miter) s->insert(s->end(), miter->second); -} - -/// Copies the contents of a vector to a set. -template -void CopyVectorToSet(const std::vector &v, std::set *s) { - KALDI_ASSERT(s != NULL); - s->clear(); - typename std::vector::const_iterator iter = v.begin(), end = v.end(); - for (; iter != end; ++iter) s->insert(s->end(), *iter); - // s->end() is a hint in case v was sorted. will work regardless. -} - -/// Deletes any non-NULL pointers in the vector v, and sets -/// the corresponding entries of v to NULL -template -void DeletePointers(std::vector *v) { - KALDI_ASSERT(v != NULL); - typename std::vector::iterator iter = v->begin(), end = v->end(); - for (; iter != end; ++iter) { - if (*iter != NULL) { - delete *iter; - *iter = NULL; // set to NULL for extra safety. - } - } -} - -/// Returns true if the vector of pointers contains NULL pointers. 
-template <class A>
-bool ContainsNullPointers(const std::vector<A*> &v) {
-  typename std::vector<A*>::const_iterator iter = v.begin(), end = v.end();
-  for (; iter != end; ++iter)
-    if (*iter == static_cast<void*>(NULL)) return true;
-  return false;
-}
-
-/// Copies the contents a vector of one type to a vector
-/// of another type.
-template <typename A, typename B>
-void CopyVectorToVector(const std::vector<A> &vec_in, std::vector<B> *vec_out) {
-  KALDI_ASSERT(vec_out != NULL);
-  vec_out->resize(vec_in.size());
-  for (size_t i = 0; i < vec_in.size(); i++)
-    (*vec_out)[i] = static_cast<B>(vec_in[i]);
-}
-
-/// A hashing function-object for vectors.
-template <typename Int>
-struct VectorHasher {  // hashing function for vector<Int>.
-  size_t operator()(const std::vector<Int> &x) const noexcept {
-    size_t ans = 0;
-    typename std::vector<Int>::const_iterator iter = x.begin(), end = x.end();
-    for (; iter != end; ++iter) {
-      ans *= kPrime;
-      ans += *iter;
-    }
-    return ans;
-  }
-  VectorHasher() {  // Check we're instantiated with an integer type.
-    KALDI_ASSERT_IS_INTEGER_TYPE(Int);
-  }
-
- private:
-  static const int kPrime = 7853;
-};
-
-/// A hashing function-object for pairs of ints
-template <typename Int1, typename Int2 = Int1>
-struct PairHasher {  // hashing function for pair<int>
-  size_t operator()(const std::pair<Int1, Int2> &x) const noexcept {
-    // 7853 was chosen at random from a list of primes.
-    return x.first + x.second * 7853;
-  }
-  PairHasher() {  // Check we're instantiated with an integer type.
-    KALDI_ASSERT_IS_INTEGER_TYPE(Int1);
-    KALDI_ASSERT_IS_INTEGER_TYPE(Int2);
-  }
-};
-
-/// A hashing function object for strings.
-struct StringHasher {  // hashing function for std::string
-  size_t operator()(const std::string &str) const noexcept {
-    size_t ans = 0, len = str.length();
-    const char *c = str.c_str(), *end = c + len;
-    for (; c != end; c++) {
-      ans *= kPrime;
-      ans += *c;
-    }
-    return ans;
-  }
-
- private:
-  static const int kPrime = 7853;
-};
-
-/// Reverses the contents of a vector.
-template <typename T>
-inline void ReverseVector(std::vector<T> *vec) {
-  KALDI_ASSERT(vec != NULL);
-  size_t sz = vec->size();
-  for (size_t i = 0; i < sz / 2; i++) std::swap((*vec)[i], (*vec)[sz - 1 - i]);
-}
-
-/// Comparator object for pairs that compares only the first pair.
-template <class A, class B>
-struct CompareFirstMemberOfPair {
-  inline bool operator()(const std::pair<A, B> &p1, const std::pair<A, B> &p2) {
-    return p1.first < p2.first;
-  }
-};
-
-/// For a vector of pair<I, F> where I is an integer and F a floating-point or
-/// integer type, this function sorts a vector of type vector<pair<I, F> > on
-/// the I value and then merges elements with equal I values, summing these over
-/// the F component and then removing any F component with zero value. This
-/// is for where the vector of pairs represents a map from the integer to float
-/// component, with an "adding" type of semantics for combining the elements.
-template <typename I, typename F>
-inline void MergePairVectorSumming(std::vector<std::pair<I, F> > *vec) {
-  KALDI_ASSERT_IS_INTEGER_TYPE(I);
-  CompareFirstMemberOfPair<I, F> c;
-  std::sort(vec->begin(), vec->end(), c);  // sort on 1st element.
-  typename std::vector<std::pair<I, F> >::iterator out = vec->begin(),
-                                                   in = vec->begin(),
-                                                   end = vec->end();
-  // special case: while there is nothing to be changed, skip over
-  // initial input (avoids unnecessary copying).
-  while (in + 1 < end && in[0].first != in[1].first && in[0].second != 0.0) {
-    in++;
-    out++;
-  }
-  while (in < end) {
-    // We reach this point only at the first element of
-    // each stretch of identical .first elements.
-    *out = *in;
-    ++in;
-    while (in < end && in->first == out->first) {
-      out->second += in->second;  // this is the merge operation.
- ++in; - } - if (out->second != static_cast(0)) // Don't keep zero elements. - out++; - } - vec->erase(out, end); -} - -} // namespace kaldi - -#endif // KALDI_UTIL_STL_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/text-utils.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/text-utils.cc deleted file mode 100644 index fd70889644f6b4e14793ddd4f5b0d71a66768699..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/text-utils.cc +++ /dev/null @@ -1,580 +0,0 @@ -// util/text-utils.cc - -// Copyright 2009-2011 Saarland University; Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "util/text-utils.h" - -#include -#include -#include -#include - -#include "base/kaldi-common.h" - -namespace kaldi { - -template -bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, // typically false - std::vector *out) { - KALDI_ASSERT(out != NULL); - if (*(full.c_str()) == '\0') { - out->clear(); - return true; - } - std::vector split; - SplitStringToVector(full, delim, omit_empty_strings, &split); - out->resize(split.size()); - for (size_t i = 0; i < split.size(); i++) { - F f = 0; - if (!ConvertStringToReal(split[i], &f)) return false; - (*out)[i] = f; - } - return true; -} - -// Instantiate the template above for float and double. 
-template bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out); -template bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out); - -void SplitStringToVector(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out) { - size_t start = 0, found = 0, end = full.size(); - out->clear(); - while (found != std::string::npos) { - found = full.find_first_of(delim, start); - // start != end condition is for when the delimiter is at the end - if (!omit_empty_strings || (found != start && start != end)) - out->push_back(full.substr(start, found - start)); - start = found + 1; - } -} - -void JoinVectorToString(const std::vector &vec_in, - const char *delim, bool omit_empty_strings, - std::string *str_out) { - std::string tmp_str; - for (size_t i = 0; i < vec_in.size(); i++) { - if (!omit_empty_strings || !vec_in[i].empty()) { - tmp_str.append(vec_in[i]); - if (i < vec_in.size() - 1) - if (!omit_empty_strings || !vec_in[i + 1].empty()) - tmp_str.append(delim); - } - } - str_out->swap(tmp_str); -} - -void Trim(std::string *str) { - const char *white_chars = " \t\n\r\f\v"; - - std::string::size_type pos = str->find_last_not_of(white_chars); - if (pos != std::string::npos) { - str->erase(pos + 1); - pos = str->find_first_not_of(white_chars); - if (pos != std::string::npos) str->erase(0, pos); - } else { - str->erase(str->begin(), str->end()); - } -} - -bool IsToken(const std::string &token) { - size_t l = token.length(); - if (l == 0) return false; - for (size_t i = 0; i < l; i++) { - unsigned char c = token[i]; - if ((!isprint(c) || isspace(c)) && (isascii(c) || c == (unsigned char)255)) - return false; - // The "&& (isascii(c) || c == 255)" was added so that we won't reject - // non-ASCII characters such as French characters with accents [except for - // 255 which is "nbsp", a form of space]. - } - return true; -} - -void SplitStringOnFirstSpace(const std::string &str, std::string *first, - std::string *rest) { - const char *white_chars = " \t\n\r\f\v"; - typedef std::string::size_type I; - const I npos = std::string::npos; - I first_nonwhite = str.find_first_not_of(white_chars); - if (first_nonwhite == npos) { - first->clear(); - rest->clear(); - return; - } - // next_white is first whitespace after first nonwhitespace. - I next_white = str.find_first_of(white_chars, first_nonwhite); - - if (next_white == npos) { // no more whitespace... - *first = std::string(str, first_nonwhite); - rest->clear(); - return; - } - I next_nonwhite = str.find_first_not_of(white_chars, next_white); - if (next_nonwhite == npos) { - *first = std::string(str, first_nonwhite, next_white - first_nonwhite); - rest->clear(); - return; - } - - I last_nonwhite = str.find_last_not_of(white_chars); - KALDI_ASSERT(last_nonwhite != npos); // or coding error. 
- - *first = std::string(str, first_nonwhite, next_white - first_nonwhite); - *rest = std::string(str, next_nonwhite, last_nonwhite + 1 - next_nonwhite); -} - -bool IsLine(const std::string &line) { - if (line.find('\n') != std::string::npos) return false; - if (line.empty()) return true; - if (isspace(*(line.begin()))) return false; - if (isspace(*(line.rbegin()))) return false; - std::string::const_iterator iter = line.begin(), end = line.end(); - for (; iter != end; iter++) - if (!isprint(*iter)) return false; - return true; -} - -template -class NumberIstream { - public: - explicit NumberIstream(std::istream &i) : in_(i) {} - - NumberIstream &operator>>(T &x) { - if (!in_.good()) return *this; - in_ >> x; - if (!in_.fail() && RemainderIsOnlySpaces()) return *this; - return ParseOnFail(&x); - } - - private: - std::istream &in_; - - bool RemainderIsOnlySpaces() { - if (in_.tellg() != std::istream::pos_type(-1)) { - std::string rem; - in_ >> rem; - - if (rem.find_first_not_of(' ') != std::string::npos) { - // there is not only spaces - return false; - } - } - - in_.clear(); - return true; - } - - NumberIstream &ParseOnFail(T *x) { - std::string str; - in_.clear(); - in_.seekg(0); - // If the stream is broken even before trying - // to read from it or if there are many tokens, - // it's pointless to try. - if (!(in_ >> str) || !RemainderIsOnlySpaces()) { - in_.setstate(std::ios_base::failbit); - return *this; - } - - std::map inf_nan_map; - // we'll keep just uppercase values. - inf_nan_map["INF"] = std::numeric_limits::infinity(); - inf_nan_map["+INF"] = std::numeric_limits::infinity(); - inf_nan_map["-INF"] = -std::numeric_limits::infinity(); - inf_nan_map["INFINITY"] = std::numeric_limits::infinity(); - inf_nan_map["+INFINITY"] = std::numeric_limits::infinity(); - inf_nan_map["-INFINITY"] = -std::numeric_limits::infinity(); - inf_nan_map["NAN"] = std::numeric_limits::quiet_NaN(); - inf_nan_map["+NAN"] = std::numeric_limits::quiet_NaN(); - inf_nan_map["-NAN"] = -std::numeric_limits::quiet_NaN(); - // MSVC - inf_nan_map["1.#INF"] = std::numeric_limits::infinity(); - inf_nan_map["-1.#INF"] = -std::numeric_limits::infinity(); - inf_nan_map["1.#QNAN"] = std::numeric_limits::quiet_NaN(); - inf_nan_map["-1.#QNAN"] = -std::numeric_limits::quiet_NaN(); - - std::transform(str.begin(), str.end(), str.begin(), ::toupper); - - if (inf_nan_map.find(str) != inf_nan_map.end()) { - *x = inf_nan_map[str]; - } else { - in_.setstate(std::ios_base::failbit); - } - - return *this; - } -}; - -template -bool ConvertStringToReal(const std::string &str, T *out) { - std::istringstream iss(str); - - NumberIstream i(iss); - - i >> *out; - - if (iss.fail()) { - // Number conversion failed. - return false; - } - - return true; -} - -template bool ConvertStringToReal(const std::string &str, float *out); -template bool ConvertStringToReal(const std::string &str, double *out); - -/* - This function is a helper function of StringsApproxEqual. It should be - thought of as a recursive function-- it was designed that way-- but rather - than actually recursing (which would cause problems with stack overflow), we - just set the args and return to the start. - - The 'decimal_places_tolerance' argument is just passed in from outside, - see the documentation for StringsApproxEqual in text-utils.h to see an - explanation. The argument 'places_into_number' provides some information - about the strings 'a' and 'b' that precedes the current pointers. 
- For purposes of this comment, let's define the 'decimal' of a number - as the part that comes after the decimal point, e.g. in '99.123', - '123' would be the decimal. If 'places_into_number' is -1, it means - we're not currently inside some place like that (i.e. it's not the - case that we're pointing to the '1' or the '2' or the '3'). - If it's 0, then we'd be pointing to the first place after the decimal, - '1' in this case. Note if one of the numbers is shorter than the - other, like '99.123' versus '99.1234' and 'a' points to the first '3' - while 'b' points to the second '4', 'places_into_number' referes to the - shorter of the two, i.e. it would be 2 in this example. - - - */ -bool StringsApproxEqualInternal(const char *a, const char *b, - int32 decimal_places_tolerance, - int32 places_into_number) { -start: - char ca = *a, cb = *b; - if (ca == cb) { - if (ca == '\0') { - return true; - } else { - if (places_into_number >= 0) { - if (isdigit(ca)) { - places_into_number++; - } else { - places_into_number = -1; - } - } else { - if (ca == '.') { - places_into_number = 0; - } - } - a++; - b++; - goto start; - } - } else { - if (places_into_number >= decimal_places_tolerance && - (isdigit(ca) || isdigit(cb))) { - // we're potentially willing to accept this difference between the - // strings. - if (isdigit(ca)) a++; - if (isdigit(cb)) b++; - // we'll have advanced at least one of the two strings. - goto start; - } else if (places_into_number >= 0 && - ((ca == '0' && !isdigit(cb)) || (cb == '0' && !isdigit(ca)))) { - // this clause is designed to ensure that, for example, - // "0.1" would count the same as "0.100001". - if (ca == '0') - a++; - else - b++; - places_into_number++; - goto start; - } else { - return false; - } - } -} - -bool StringsApproxEqual(const std::string &a, const std::string &b, - int32 decimal_places_tolerance) { - return StringsApproxEqualInternal(a.c_str(), b.c_str(), - decimal_places_tolerance, -1); -} - -bool ConfigLine::ParseLine(const std::string &line) { - data_.clear(); - whole_line_ = line; - if (line.size() == 0) return false; // Empty line - size_t pos = 0, size = line.size(); - while (isspace(line[pos]) && pos < size) pos++; - if (pos == size) return false; // whitespace-only line - size_t first_token_start_pos = pos; - // first get first_token_. - while (!isspace(line[pos]) && pos < size) { - if (line[pos] == '=') { - // If the first block of non-whitespace looks like "foo-bar=...", - // then we ignore it: there is no initial token, and FirstToken() - // is empty. - pos = first_token_start_pos; - break; - } - pos++; - } - first_token_ = - std::string(line, first_token_start_pos, pos - first_token_start_pos); - // first_token_ is expected to be either empty or something like - // "component-node", which actually is a slightly more restrictive set of - // strings than IsValidName() checks for this is a convenient way to check it. - if (!first_token_.empty() && !IsValidName(first_token_)) return false; - - while (pos < size) { - if (isspace(line[pos])) { - pos++; - continue; - } - - // OK, at this point we know that we are pointing at nonspace. - size_t next_equals_sign = line.find_first_of("=", pos); - if (next_equals_sign == pos || next_equals_sign == std::string::npos) { - // we're looking for something like 'key=value'. If there is no equals - // sign, or it's not preceded by something, it's a parsing failure. - return false; - } - std::string key(line, pos, next_equals_sign - pos); - if (!IsValidName(key)) return false; - - // handle any quotes. 
we support key='blah blah' or key="foo bar". - // no escaping is supported. - if (line[next_equals_sign + 1] == '\'' || - line[next_equals_sign + 1] == '"') { - char my_quote = line[next_equals_sign + 1]; - size_t next_quote = line.find_first_of(my_quote, next_equals_sign + 2); - if (next_quote == std::string::npos) { // no matching quote was found. - KALDI_WARN << "No matching quote for " << my_quote - << " in config line '" << line << "'"; - return false; - } else { - std::string value(line, next_equals_sign + 2, - next_quote - next_equals_sign - 2); - data_.insert(std::make_pair(key, std::make_pair(value, false))); - pos = next_quote + 1; - continue; - } - } else { - // we want to be able to parse something like "... input=Offset(a, -1) - // foo=bar": in general, config values with spaces in them, even without - // quoting. - - size_t next_next_equals_sign = - line.find_first_of("=", next_equals_sign + 1), - terminating_space = size; - - if (next_next_equals_sign != - std::string::npos) { // found a later equals sign. - size_t preceding_space = - line.find_last_of(" \t", next_next_equals_sign); - if (preceding_space != std::string::npos && - preceding_space > next_equals_sign) - terminating_space = preceding_space; - } - while (isspace(line[terminating_space - 1]) && terminating_space > 0) - terminating_space--; - - std::string value(line, next_equals_sign + 1, - terminating_space - (next_equals_sign + 1)); - data_.insert(std::make_pair(key, std::make_pair(value, false))); - pos = terminating_space; - } - } - return true; -} - -bool ConfigLine::GetValue(const std::string &key, std::string *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - *value = (it->second).first; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, BaseFloat *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!ConvertStringToReal((it->second).first, value)) return false; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, int32 *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!ConvertStringToInteger((it->second).first, value)) return false; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, std::vector *value) { - KALDI_ASSERT(value != NULL); - value->clear(); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!SplitStringToIntegers((it->second).first, ":,", true, value)) { - // KALDI_WARN << "Bad option " << (it->second).first; - return false; - } - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, bool *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if ((it->second).first.size() == 0) return false; - switch (((it->second).first)[0]) { - case 'F': - case 'f': - *value = false; - break; - case 'T': - case 't': - *value = true; - break; - default: - return false; - } - (it->second).second = true; - return true; - } - } - return false; -} - -bool 
ConfigLine::HasUnusedValues() const { - std::map >::const_iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (!(it->second).second) return true; - } - return false; -} - -std::string ConfigLine::UnusedValues() const { - std::string unused_str; - std::map >::const_iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (!(it->second).second) { - if (unused_str == "") - unused_str = it->first + "=" + (it->second).first; - else - unused_str += " " + it->first + "=" + (it->second).first; - } - } - return unused_str; -} - -// This is like ExpectToken but for two tokens, and it -// will either accept token1 and then token2, or just token2. -// This is useful in Read functions where the first token -// may already have been consumed. -// void ExpectOneOrTwoTokens(std::istream &is, bool binary, -// const std::string &token1, -// const std::string &token2) { -// KALDI_ASSERT(token1 != token2); -// std::string temp; -// ReadToken(is, binary, &temp); -// if (temp == token1) { -// ExpectToken(is, binary, token2); -// } else { -// if (temp != token2) { -// KALDI_ERR << "Expecting token " << token1 << " or " << token2 -// << " but got " << temp; -// } -// } -// } - -bool IsValidName(const std::string &name) { - if (name.size() == 0) return false; - for (size_t i = 0; i < name.size(); i++) { - if (i == 0 && !isalpha(name[i]) && name[i] != '_') return false; - if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-' && name[i] != '.') - return false; - } - return true; -} - -void ReadConfigLines(std::istream &is, std::vector *lines) { - KALDI_ASSERT(lines != NULL); - std::string line; - while (std::getline(is, line)) { - if (line.size() == 0) continue; - size_t start = line.find_first_not_of(" \t"); - size_t end = line.find_first_of('#'); - if (start == std::string::npos || start == end) continue; - end = line.find_last_not_of(" \t", end - 1); - KALDI_ASSERT(end >= start); - lines->push_back(line.substr(start, end - start + 1)); - } -} - -void ParseConfigLines(const std::vector &lines, - std::vector *config_lines) { - config_lines->resize(lines.size()); - for (size_t i = 0; i < lines.size(); i++) { - bool ret = (*config_lines)[i].ParseLine(lines[i]); - if (!ret) { - KALDI_ERR << "Error parsing config line: " << lines[i]; - } - } -} - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/text-utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/text-utils.h deleted file mode 100644 index bc7763c4aff38214d97cbeda3b29c8717dd65318..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/kaldi/util/text-utils.h +++ /dev/null @@ -1,264 +0,0 @@ -// util/text-utils.h - -// Copyright 2009-2011 Saarland University; Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. 
-// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_TEXT_UTILS_H_ -#define KALDI_UTIL_TEXT_UTILS_H_ - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "base/kaldi-common.h" - -namespace kaldi { - -/// Split a string using any of the single character delimiters. -/// If omit_empty_strings == true, the output will contain any -/// nonempty strings after splitting on any of the -/// characters in the delimiter. If omit_empty_strings == false, -/// the output will contain n+1 strings if there are n characters -/// in the set "delim" within the input string. In this case -/// the empty string is split to a single empty string. -void SplitStringToVector(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out); - -/// Joins the elements of a vector of strings into a single string using -/// "delim" as the delimiter. If omit_empty_strings == true, any empty strings -/// in the vector are skipped. A vector of empty strings results in an empty -/// string on the output. -void JoinVectorToString(const std::vector &vec_in, - const char *delim, bool omit_empty_strings, - std::string *str_out); - -/** - \brief Split a string (e.g. 1:2:3) into a vector of integers. - - \param [in] delim String containing a list of characters, any of which - is allowed as a delimiter. - \param [in] omit_empty_strings If true, empty strings between delimiters are - allowed and will not produce an output integer; if false, - instances of characters in 'delim' that are consecutive or - at the start or end of the string would be an error. - You'll normally want this to be true if 'delim' consists - of spaces, and false otherwise. - \param [out] out The output list of integers. -*/ -template -bool SplitStringToIntegers(const std::string &full, const char *delim, - bool omit_empty_strings, // typically false [but - // should probably be true - // if "delim" is spaces]. - std::vector *out) { - KALDI_ASSERT(out != NULL); - KALDI_ASSERT_IS_INTEGER_TYPE(I); - if (*(full.c_str()) == '\0') { - out->clear(); - return true; - } - std::vector split; - SplitStringToVector(full, delim, omit_empty_strings, &split); - out->resize(split.size()); - for (size_t i = 0; i < split.size(); i++) { - const char *this_str = split[i].c_str(); - char *end = NULL; - int64 j = 0; - j = KALDI_STRTOLL(this_str, &end); - if (end == this_str || *end != '\0') { - out->clear(); - return false; - } else { - I jI = static_cast(j); - if (static_cast(jI) != j) { - // output type cannot fit this integer. - out->clear(); - return false; - } - (*out)[i] = jI; - } - } - return true; -} - -// This is defined for F = float and double. -template -bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, // typically false - std::vector *out); - -/// Converts a string into an integer via strtoll and returns false if there was -/// any kind of problem (i.e. the string was not an integer or contained extra -/// non-whitespace junk, or the integer was too large to fit into the type it is -/// being converted into). Only sets *out if everything was OK and it returns -/// true. 
-template -bool ConvertStringToInteger(const std::string &str, Int *out) { - KALDI_ASSERT_IS_INTEGER_TYPE(Int); - const char *this_str = str.c_str(); - char *end = NULL; - errno = 0; - int64 i = KALDI_STRTOLL(this_str, &end); - if (end != this_str) - while (isspace(*end)) end++; - if (end == this_str || *end != '\0' || errno != 0) return false; - Int iInt = static_cast(i); - if (static_cast(iInt) != i || - (i < 0 && !std::numeric_limits::is_signed)) { - return false; - } - *out = iInt; - return true; -} - -/// ConvertStringToReal converts a string into either float or double -/// and returns false if there was any kind of problem (i.e. the string -/// was not a floating point number or contained extra non-whitespace junk). -/// Be careful- this function will successfully read inf's or nan's. -template -bool ConvertStringToReal(const std::string &str, T *out); - -/// Removes the beginning and trailing whitespaces from a string -void Trim(std::string *str); - -/// Removes leading and trailing white space from the string, then splits on the -/// first section of whitespace found (if present), putting the part before the -/// whitespace in "first" and the rest in "rest". If there is no such space, -/// everything that remains after removing leading and trailing whitespace goes -/// in "first". -void SplitStringOnFirstSpace(const std::string &line, std::string *first, - std::string *rest); - -/// Returns true if "token" is nonempty, and all characters are -/// printable and whitespace-free. -bool IsToken(const std::string &token); - -/// Returns true if "line" is free of \n characters and unprintable -/// characters, and does not contain leading or trailing whitespace. -bool IsLine(const std::string &line); - -/** - This function returns true when two text strings are approximately equal, and - false when they are not. The definition of 'equal' is normal string - equality, except that two substrings like "0.31134" and "0.311341" would be - considered equal. 'decimal_places_tolerance' controls how many digits after - the '.' have to match up. - E.g. StringsApproxEqual("hello 0.23 there", "hello 0.24 there", 2) would - return false because there is a difference in the 2nd decimal, but with - an argument of 1 it would return true. - */ -bool StringsApproxEqual(const std::string &a, const std::string &b, - int32 decimal_places_check = 2); - -/** - This class is responsible for parsing input like - hi-there xx=yyy a=b c empty= f-oo=Append(bar, sss) ba_z=123 bing='a b c' - baz="a b c d='a b' e" and giving you access to the fields, in this case - - FirstToken() == "hi-there", and key->value pairs: - - xx->yyy, a->"b c", empty->"", f-oo->"Append(bar, sss)", ba_z->"123", - bing->"a b c", baz->"a b c d='a b' e" - - The first token is optional, if the line started with a key-value pair then - FirstValue() will be empty. - - Note: it can parse value fields with space inside them only if they are free - of the '=' character. If values are going to contain the '=' character, you - need to quote them with either single or double quotes. - - Key values may contain -_a-zA-Z0-9, but must begin with a-zA-Z_. - */ -class ConfigLine { - public: - // Tries to parse the line as a config-file line. Returns false - // if it could not for some reason, e.g. parsing failure. In most cases - // prints no warnings; the user should do this. Does not expect comments. - bool ParseLine(const std::string &line); - - // the GetValue functions are overloaded for various types. 
They return true - // if the key exists with value that can be converted to that type, and false - // otherwise. They also mark the key-value pair as having been read. It is - // not an error to read values twice. - bool GetValue(const std::string &key, std::string *value); - bool GetValue(const std::string &key, BaseFloat *value); - bool GetValue(const std::string &key, int32 *value); - // Values may be separated by ":" or by ",". - bool GetValue(const std::string &key, std::vector *value); - bool GetValue(const std::string &key, bool *value); - - bool HasUnusedValues() const; - /// returns e.g. foo=bar xxx=yyy if foo and xxx were not consumed by one - /// of the GetValue() functions. - std::string UnusedValues() const; - - const std::string &FirstToken() const { return first_token_; } - - const std::string WholeLine() { return whole_line_; } - // use default assignment operator and copy constructor. - private: - std::string whole_line_; - // the first token of the line, e.g. if line is - // foo-bar baz=bing - // then first_token_ would be "foo-bar". - std::string first_token_; - - // data_ maps from key to (value, is-this-value-consumed?). - std::map > data_; -}; - -/// This function is like ExpectToken but for two tokens, and it will either -/// accept token1 and then token2, or just token2. This is useful in Read -/// functions where the first token may already have been consumed. -void ExpectOneOrTwoTokens(std::istream &is, bool binary, - const std::string &token1, const std::string &token2); - -/** - This function reads in a config file and *appends* its contents to a vector - of lines; it is responsible for removing comments (anything after '#') and - stripping out any lines that contain only whitespace after comment removal. - */ -void ReadConfigLines(std::istream &is, std::vector *lines); - -/** - This function converts config-lines from a simple sequence of strings - as output by ReadConfigLines(), into a sequence of first-tokens and - name-value pairs. The general format is: - "command-type bar=baz xx=yyy" - etc., although there are subtleties as to what exactly is allowed, see - documentation for class ConfigLine for details. - This function will die if there was a parsing failure. - */ -void ParseConfigLines(const std::vector &lines, - std::vector *config_lines); - -/// Returns true if 'name' would be a valid name for a component or node in a -/// nnet3Nnet. This is a nonempty string beginning with A-Za-z_, and containing -/// only -/// '-', '_', '.', A-Z, a-z, or 0-9. 
-bool IsValidName(const std::string &name); - -} // namespace kaldi - -#endif // KALDI_UTIL_TEXT_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/patch/CPPLINT.cfg b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/patch/CPPLINT.cfg deleted file mode 100644 index 51ff339c18435a6c3a3be03131080d7b8ab8de86..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/patch/CPPLINT.cfg +++ /dev/null @@ -1 +0,0 @@ -exclude_files=.* diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/patch/openfst/src/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/patch/openfst/src/CMakeLists.txt deleted file mode 100644 index 04051ef5ae46c04a40c1ffccc98c37fa594ad13e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/patch/openfst/src/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ - -#-DHAVE_CONFIG_H -I./../include -fno-exceptions -funsigned-char -std=c++11 -MT symbol-table.lo -MD -MP -MF .deps/symbol-table.Tpo -c symbol-table.cc -fno-common -DPIC -o .libs/symbol-table.o - -include_directories(./include/) -install(DIRECTORY include/ DESTINATION include/ - FILES_MATCHING PATTERN "*.h") - -add_subdirectory(lib) - -if(HAVE_SCRIPT) - add_subdirectory(script) -endif(HAVE_SCRIPT) - -if(HAVE_BIN) - add_subdirectory(bin) -endif(HAVE_BIN) - -add_subdirectory(extensions) - -if(BUILD_TESTING) - enable_testing() - add_subdirectory(test) -endif(BUILD_TESTING) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/patch/openfst/src/extensions/special/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/patch/openfst/src/extensions/special/CMakeLists.txt deleted file mode 100644 index 9c71b750a72ffe3c2dafde657273361c3dbae409..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/patch/openfst/src/extensions/special/CMakeLists.txt +++ /dev/null @@ -1,67 +0,0 @@ -file(GLOB HEADER_FILES ../../include/fst/extensions/special/*.h) -message(STATUS "${HEADER_FILES}") - -if(HAVE_BIN) - add_executable(fstspecial-bin - ../../bin/fstconvert.cc - ../../bin/fstconvert-main.cc - phi-fst.cc - rho-fst.cc - sigma-fst.cc - ) - - set_target_properties(fstspecial-bin PROPERTIES - FOLDER special/bin - OUTPUT_NAME fstspecial - ) - - target_link_libraries(fstspecial-bin - fstscript - fst - ${CMAKE_DL_LIBS} - ) -endif(HAVE_BIN) - - -add_library(fstspecial - phi-fst.cc - rho-fst.cc - sigma-fst.cc - ${HEADER_FILES} -) - -set_target_properties(fstspecial PROPERTIES - SOVERSION "${SOVERSION}" - FOLDER special -) -target_link_libraries(fstspecial - fst -) - -set(FST_SPECIAL_INSTALL_TARGETS fstspecial) -if(HAVE_BIN) - list(APPEND FST_SPECIAL_INSTALL_TARGETS fstspecial-bin) -endif() - -install(TARGETS ${FST_SPECIAL_INSTALL_TARGETS} - LIBRARY DESTINATION lib - RUNTIME DESTINATION bin - ARCHIVE DESTINATION lib -) - -function (add_module _name) - add_library(${ARGV}) - if (TARGET ${_name}) - target_link_libraries(${_name} fst) - set_target_properties(${_name} - PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true - FOLDER special/modules - ) - endif() - - install(TARGETS ${_name} LIBRARY DESTINATION lib/fst) -endfunction() - -add_module(phi-fst MODULE 
phi-fst.cc) -add_module(rho-fst MODULE rho-fst.cc) -add_module(sigma-fst MODULE sigma-fst.cc) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/patch/openfst/src/include/fst/flags.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/patch/openfst/src/include/fst/flags.h deleted file mode 100644 index b5ec8ff7416774a0612ae0fe7e008a630b289dd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/patch/openfst/src/include/fst/flags.h +++ /dev/null @@ -1,228 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// See www.openfst.org for extensive documentation on this weighted -// finite-state transducer library. -// -// Google-style flag handling declarations and inline definitions. - -#ifndef FST_LIB_FLAGS_H_ -#define FST_LIB_FLAGS_H_ - -#include - -#include -#include -#include -#include -#include - -#include -#include - -#include "gflags/gflags.h" -#include "glog/logging.h" - -using std::string; - -// FLAGS USAGE: -// -// Definition example: -// -// DEFINE_int32(length, 0, "length"); -// -// This defines variable FLAGS_length, initialized to 0. -// -// Declaration example: -// -// DECLARE_int32(length); -// -// SET_FLAGS() can be used to set flags from the command line -// using, for example, '--length=2'. -// -// ShowUsage() can be used to print out command and flag usage. - -// #define DECLARE_bool(name) extern bool FLAGS_ ## name -// #define DECLARE_string(name) extern string FLAGS_ ## name -// #define DECLARE_int32(name) extern int32 FLAGS_ ## name -// #define DECLARE_int64(name) extern int64 FLAGS_ ## name -// #define DECLARE_double(name) extern double FLAGS_ ## name - -template -struct FlagDescription { - FlagDescription(T *addr, const char *doc, const char *type, - const char *file, const T val) - : address(addr), - doc_string(doc), - type_name(type), - file_name(file), - default_value(val) {} - - T *address; - const char *doc_string; - const char *type_name; - const char *file_name; - const T default_value; -}; - -template -class FlagRegister { - public: - static FlagRegister *GetRegister() { - static auto reg = new FlagRegister; - return reg; - } - - const FlagDescription &GetFlagDescription(const string &name) const { - fst::MutexLock l(&flag_lock_); - auto it = flag_table_.find(name); - return it != flag_table_.end() ? 
it->second : 0; - } - - void SetDescription(const string &name, - const FlagDescription &desc) { - fst::MutexLock l(&flag_lock_); - flag_table_.insert(make_pair(name, desc)); - } - - bool SetFlag(const string &val, bool *address) const { - if (val == "true" || val == "1" || val.empty()) { - *address = true; - return true; - } else if (val == "false" || val == "0") { - *address = false; - return true; - } - else { - return false; - } - } - - bool SetFlag(const string &val, string *address) const { - *address = val; - return true; - } - - bool SetFlag(const string &val, int32 *address) const { - char *p = 0; - *address = strtol(val.c_str(), &p, 0); - return !val.empty() && *p == '\0'; - } - - bool SetFlag(const string &val, int64 *address) const { - char *p = 0; - *address = strtoll(val.c_str(), &p, 0); - return !val.empty() && *p == '\0'; - } - - bool SetFlag(const string &val, double *address) const { - char *p = 0; - *address = strtod(val.c_str(), &p); - return !val.empty() && *p == '\0'; - } - - bool SetFlag(const string &arg, const string &val) const { - for (typename std::map< string, FlagDescription >::const_iterator it = - flag_table_.begin(); - it != flag_table_.end(); - ++it) { - const string &name = it->first; - const FlagDescription &desc = it->second; - if (arg == name) - return SetFlag(val, desc.address); - } - return false; - } - - void GetUsage(std::set> *usage_set) const { - for (auto it = flag_table_.begin(); it != flag_table_.end(); ++it) { - const string &name = it->first; - const FlagDescription &desc = it->second; - string usage = " --" + name; - usage += ": type = "; - usage += desc.type_name; - usage += ", default = "; - usage += GetDefault(desc.default_value) + "\n "; - usage += desc.doc_string; - usage_set->insert(make_pair(desc.file_name, usage)); - } - } - - private: - string GetDefault(bool default_value) const { - return default_value ? "true" : "false"; - } - - string GetDefault(const string &default_value) const { - return "\"" + default_value + "\""; - } - - template - string GetDefault(const V &default_value) const { - std::ostringstream strm; - strm << default_value; - return strm.str(); - } - - mutable fst::Mutex flag_lock_; // Multithreading lock. - std::map> flag_table_; -}; - -template -class FlagRegisterer { - public: - FlagRegisterer(const string &name, const FlagDescription &desc) { - auto registr = FlagRegister::GetRegister(); - registr->SetDescription(name, desc); - } - - private: - FlagRegisterer(const FlagRegisterer &) = delete; - FlagRegisterer &operator=(const FlagRegisterer &) = delete; -}; - - -#define DEFINE_VAR(type, name, value, doc) \ - type FLAGS_ ## name = value; \ - static FlagRegisterer \ - name ## _flags_registerer(#name, FlagDescription(&FLAGS_ ## name, \ - doc, \ - #type, \ - __FILE__, \ - value)) - -// #define DEFINE_bool(name, value, doc) DEFINE_VAR(bool, name, value, doc) -// #define DEFINE_string(name, value, doc) \ -// DEFINE_VAR(string, name, value, doc) -// #define DEFINE_int32(name, value, doc) DEFINE_VAR(int32, name, value, doc) -// #define DEFINE_int64(name, value, doc) DEFINE_VAR(int64, name, value, doc) -// #define DEFINE_double(name, value, doc) DEFINE_VAR(double, name, value, doc) - - -// Temporary directory. 
-DECLARE_string(tmpdir); - -void SetFlags(const char *usage, int *argc, char ***argv, bool remove_flags, - const char *src = ""); - -#define SET_FLAGS(usage, argc, argv, rmflags) \ -gflags::ParseCommandLineFlags(argc, argv, true) -// SetFlags(usage, argc, argv, rmflags, __FILE__) - -// Deprecated; for backward compatibility. -inline void InitFst(const char *usage, int *argc, char ***argv, bool rmflags) { - return SetFlags(usage, argc, argv, rmflags); -} - -void ShowUsage(bool long_usage = true); - -#endif // FST_LIB_FLAGS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/patch/openfst/src/include/fst/log.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/patch/openfst/src/include/fst/log.h deleted file mode 100644 index bf041c58ebfab73d03bb14adf28c7c7916a2217d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/patch/openfst/src/include/fst/log.h +++ /dev/null @@ -1,82 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// See www.openfst.org for extensive documentation on this weighted -// finite-state transducer library. -// -// Google-style logging declarations and inline definitions. 
- -#ifndef FST_LIB_LOG_H_ -#define FST_LIB_LOG_H_ - -#include -#include -#include - -#include -#include - -using std::string; - -DECLARE_int32(v); - -class LogMessage { - public: - LogMessage(const string &type) : fatal_(type == "FATAL") { - std::cerr << type << ": "; - } - ~LogMessage() { - std::cerr << std::endl; - if(fatal_) - exit(1); - } - std::ostream &stream() { return std::cerr; } - - private: - bool fatal_; -}; - -// #define LOG(type) LogMessage(#type).stream() -// #define VLOG(level) if ((level) <= FLAGS_v) LOG(INFO) - -// Checks -inline void FstCheck(bool x, const char* expr, - const char *file, int line) { - if (!x) { - LOG(FATAL) << "Check failed: \"" << expr - << "\" file: " << file - << " line: " << line; - } -} - -// #define CHECK(x) FstCheck(static_cast(x), #x, __FILE__, __LINE__) -// #define CHECK_EQ(x, y) CHECK((x) == (y)) -// #define CHECK_LT(x, y) CHECK((x) < (y)) -// #define CHECK_GT(x, y) CHECK((x) > (y)) -// #define CHECK_LE(x, y) CHECK((x) <= (y)) -// #define CHECK_GE(x, y) CHECK((x) >= (y)) -// #define CHECK_NE(x, y) CHECK((x) != (y)) - -// Debug checks -// #define DCHECK(x) assert(x) -// #define DCHECK_EQ(x, y) DCHECK((x) == (y)) -// #define DCHECK_LT(x, y) DCHECK((x) < (y)) -// #define DCHECK_GT(x, y) DCHECK((x) > (y)) -// #define DCHECK_LE(x, y) DCHECK((x) <= (y)) -// #define DCHECK_GE(x, y) DCHECK((x) >= (y)) -// #define DCHECK_NE(x, y) DCHECK((x) != (y)) - - -// Ports -#define ATTRIBUTE_DEPRECATED __attribute__((deprecated)) - -#endif // FST_LIB_LOG_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/post_processor/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/post_processor/CMakeLists.txt deleted file mode 100644 index 6113bbc26eb8fe35e4e17ffd1cab382f0fb0f1f8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/post_processor/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -add_library(post_processor STATIC - post_processor.cc -) -target_link_libraries(post_processor PUBLIC utils) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/post_processor/post_processor.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/post_processor/post_processor.cc deleted file mode 100644 index 315f62d34cbc441ecbaf7c07667eb35ee61c2c8d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/post_processor/post_processor.cc +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License - -#include "post_processor/post_processor.h" - -#include -#include - -#include "utils/string.h" - -namespace wenet { - -std::string PostProcessor::ProcessSpace(const std::string& str) { - std::string result = str; - // 1. 
remove ' ' if needed - // only spaces between mandarin words need to be removed, please note that - // if str contains '_', we assume that the decoding type must be - // `CtcPrefixBeamSearch` and this branch will do nothing since str must be - // obtained via "".join() (in function `AsrDecoder::UpdateResult()`) - if (opts_.language_type == kMandarinEnglish && !str.empty()) { - result.clear(); - // split str by ' ' - std::vector words; - std::stringstream ss(str); - std::string tmp; - while (ss >> tmp) { - words.push_back(tmp); - } - // check english word - bool is_englishword_prev = false; - bool is_englishword_now = false; - for (std::string& w : words) { - is_englishword_now = CheckEnglishWord(w); - if (is_englishword_prev && is_englishword_now) { - result += (' ' + w); - } else { - result += (w); - } - is_englishword_prev = is_englishword_now; - } - } - // 2. replace '_' with ' ' - // this should be done for all cases (both kMandarinEnglish and kIndoEuropean) - result = ProcessBlank(result, opts_.lowercase); - return result; -} - -std::string PostProcessor::Process(const std::string& str, bool finish) { - std::string result; - result = ProcessSpace(str); - // TODO(xcsong): do itn/punctuation if finish == true - return result; -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/post_processor/post_processor.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/post_processor/post_processor.h deleted file mode 100644 index 54597845ebc88ad22e1244d2e693e2088cff6d21..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/post_processor/post_processor.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License - -#ifndef POST_PROCESSOR_POST_PROCESSOR_H_ -#define POST_PROCESSOR_POST_PROCESSOR_H_ - -#include -#include -#include - -#include "utils/utils.h" - -namespace wenet { - -enum LanguageType { - // spaces between **mandarin words** should be removed. - // cases of processing spaces with mandarin-only, english-only - // and mandarin-english code-switch can be found in post_processor_test.cc - kMandarinEnglish = 0x00, - // spaces should be kept for most of the - // Indo-European languages (i.e., deutsch or english-deutsch code-switch). - // cases of those languages can be found in post_processor_test.cc - kIndoEuropean = 0x01 -}; - -struct PostProcessOptions { - // space options - // The decoded result may contain spaces (' ' or '_'), - // we will process those spaces according to language_type. 
More details can - // be found in - // https://github.com/wenet-e2e/wenet/issues/583#issuecomment-907994058 - LanguageType language_type = kMandarinEnglish; - // whether lowercase letters are required - bool lowercase = true; -}; - -// TODO(xcsong): add itn/punctuation related resource -struct PostProcessResource {}; - -// Post Processor -class PostProcessor { - public: - explicit PostProcessor(PostProcessOptions&& opts) : opts_(std::move(opts)) {} - explicit PostProcessor(const PostProcessOptions& opts) : opts_(opts) {} - // call other functions to do post processing - std::string Process(const std::string& str, bool finish); - // process spaces according to configurations - std::string ProcessSpace(const std::string& str); - // TODO(xcsong): add itn/punctuation - // void InverseTN(const std::string& str); - // void Punctuate(const std::string& str); - - private: - const PostProcessOptions opts_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(PostProcessor); -}; - -} // namespace wenet - -#endif // POST_PROCESSOR_POST_PROCESSOR_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/CMakeLists.txt deleted file mode 100644 index 686362688c050d48224ca0a01e0d24b03d94758a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -add_library(utils STATIC - string.cc - utils.cc -) - -if(NOT ANDROID) - if(MSVC) - target_link_libraries(utils PUBLIC fst) - else() - target_link_libraries(utils PUBLIC fst dl) - endif() -endif() \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/blocking_queue.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/blocking_queue.h deleted file mode 100644 index 9bf0127d9298fbfae2eeebb9431c680fc5dd7647..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/blocking_queue.h +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef UTILS_BLOCKING_QUEUE_H_ -#define UTILS_BLOCKING_QUEUE_H_ - -#include -#include -#include -#include -#include -#include - -#include "utils/utils.h" - -namespace wenet { - -template -class BlockingQueue { - public: - explicit BlockingQueue(size_t capacity = std::numeric_limits::max()) - : capacity_(capacity) {} - - void Push(const T& value) { - { - std::unique_lock lock(mutex_); - while (queue_.size() >= capacity_) { - not_full_condition_.wait(lock); - } - queue_.push(value); - } - not_empty_condition_.notify_one(); - } - - void Push(T&& value) { - { - std::unique_lock lock(mutex_); - while (queue_.size() >= capacity_) { - not_full_condition_.wait(lock); - } - queue_.push(std::move(value)); - } - not_empty_condition_.notify_one(); - } - - void Push(const std::vector& values) { - { - std::unique_lock lock(mutex_); - for (auto& value : values) { - while (queue_.size() >= capacity_) { - not_empty_condition_.notify_one(); - not_full_condition_.wait(lock); - } - queue_.push(value); - } - } - not_empty_condition_.notify_one(); - } - - void Push(std::vector&& values) { - std::unique_lock lock(mutex_); - for (auto& value : values) { - while (queue_.size() >= capacity_) { - not_empty_condition_.notify_one(); - not_full_condition_.wait(lock); - } - queue_.push(std::move(value)); - } - not_empty_condition_.notify_one(); - } - - T Pop() { - std::unique_lock lock(mutex_); - while (queue_.empty()) { - not_empty_condition_.wait(lock); - } - T t(std::move(queue_.front())); - queue_.pop(); - not_full_condition_.notify_one(); - return t; - } - - // num can be greater than capacity,but it needs to be used with care - std::vector Pop(size_t num) { - std::unique_lock lock(mutex_); - std::vector block_data; - while (block_data.size() < num) { - while (queue_.empty()) { - not_full_condition_.notify_one(); - not_empty_condition_.wait(lock); - } - block_data.push_back(std::move(queue_.front())); - queue_.pop(); - } - not_full_condition_.notify_one(); - return block_data; - } - - bool Empty() const { - std::lock_guard lock(mutex_); - return queue_.empty(); - } - - size_t Size() const { - std::lock_guard lock(mutex_); - return queue_.size(); - } - - void Clear() { - while (!Empty()) { - Pop(); - } - } - - private: - size_t capacity_; - mutable std::mutex mutex_; - std::condition_variable not_full_condition_; - std::condition_variable not_empty_condition_; - std::queue queue_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(BlockingQueue); -}; - -} // namespace wenet - -#endif // UTILS_BLOCKING_QUEUE_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/file.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/file.h deleted file mode 100644 index 83ad9c8c52fecd334b3549285bf39cd4f59b9f2b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/file.h +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_FILE_H_ -#define UTILS_FILE_H_ - -#include -#include - -namespace wenet { - -inline bool FileExists(const std::string& path) { - std::ifstream f(path.c_str()); - return f.good(); -} - -} // namespace wenet - -#endif // UTILS_FILE_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/flags.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/flags.h deleted file mode 100644 index 3432aa78847322edec8d6d2aec59ed7ca5352fcd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/flags.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_FLAGS_H_ -#define UTILS_FLAGS_H_ - -// Because openfst is a dynamic library compiled with gflags/glog, we must use -// the gflags/glog from openfst to avoid them linked both statically and -// dynamically into the executable. -#include "fst/flags.h" - -#endif // UTILS_FLAGS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/json.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/json.h deleted file mode 100644 index bf8d94a3e42504139b10daa39b8f8e7a8b2d93cc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/json.h +++ /dev/null @@ -1,754 +0,0 @@ -// Copyright (c) From https://github.com/nbsdx/SimpleJSON -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef UTILS_JSON_H_ -#define UTILS_JSON_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace json { - -using std::deque; -using std::enable_if; -using std::initializer_list; -using std::is_convertible; -using std::is_floating_point; -using std::is_integral; -using std::is_same; -using std::map; -using std::string; - -namespace { // NOLINT -string json_escape(const string& str) { - string output; - for (unsigned i = 0; i < str.length(); ++i) switch (str[i]) { - case '\"': - output += "\\\""; - break; - case '\\': - output += "\\\\"; - break; - case '\b': - output += "\\b"; - break; - case '\f': - output += "\\f"; - break; - case '\n': - output += "\\n"; - break; - case '\r': - output += "\\r"; - break; - case '\t': - output += "\\t"; - break; - default: - output += str[i]; - break; - } - return std::move(output); -} -} // namespace - -class JSON { - union BackingData { - BackingData(double d) : Float(d) {} - BackingData(int l) : Int(l) {} - BackingData(bool b) : Bool(b) {} - BackingData(string s) : String(new string(s)) {} - BackingData() : Int(0) {} - - deque* List; - map* Map; - string* String; - double Float; - int Int; - bool Bool; - } Internal; - - public: - enum class Class { Null, Object, Array, String, Floating, Integral, Boolean }; - - template - class JSONWrapper { - Container* object; - - public: - explicit JSONWrapper(Container* val) : object(val) {} - explicit JSONWrapper(std::nullptr_t) : object(nullptr) {} - - typename Container::iterator begin() { - return object ? object->begin() : typename Container::iterator(); - } - typename Container::iterator end() { - return object ? object->end() : typename Container::iterator(); - } - typename Container::const_iterator begin() const { - return object ? object->begin() : typename Container::iterator(); - } - typename Container::const_iterator end() const { - return object ? object->end() : typename Container::iterator(); - } - }; - - template - class JSONConstWrapper { - const Container* object; - - public: - explicit JSONConstWrapper(const Container* val) : object(val) {} - explicit JSONConstWrapper(std::nullptr_t) : object(nullptr) {} - - typename Container::const_iterator begin() const { - return object ? object->begin() : typename Container::const_iterator(); - } - typename Container::const_iterator end() const { - return object ? 
object->end() : typename Container::const_iterator(); - } - }; - - JSON() : Internal(), Type(Class::Null) {} - - explicit JSON(initializer_list list) : JSON() { - SetType(Class::Object); - for (auto i = list.begin(), e = list.end(); i != e; ++i, ++i) - operator[](i->ToString()) = *std::next(i); - } - - JSON(JSON&& other) : Internal(other.Internal), Type(other.Type) { - other.Type = Class::Null; - other.Internal.Map = nullptr; - } - - JSON& operator=(JSON&& other) { - ClearInternal(); - Internal = other.Internal; - Type = other.Type; - other.Internal.Map = nullptr; - other.Type = Class::Null; - return *this; - } - - JSON(const JSON& other) { - switch (other.Type) { - case Class::Object: - Internal.Map = new map(other.Internal.Map->begin(), - other.Internal.Map->end()); - break; - case Class::Array: - Internal.List = new deque(other.Internal.List->begin(), - other.Internal.List->end()); - break; - case Class::String: - Internal.String = new string(*other.Internal.String); - break; - default: - Internal = other.Internal; - } - Type = other.Type; - } - - JSON& operator=(const JSON& other) { - ClearInternal(); - switch (other.Type) { - case Class::Object: - Internal.Map = new map(other.Internal.Map->begin(), - other.Internal.Map->end()); - break; - case Class::Array: - Internal.List = new deque(other.Internal.List->begin(), - other.Internal.List->end()); - break; - case Class::String: - Internal.String = new string(*other.Internal.String); - break; - default: - Internal = other.Internal; - } - Type = other.Type; - return *this; - } - - ~JSON() { - switch (Type) { - case Class::Array: - delete Internal.List; - break; - case Class::Object: - delete Internal.Map; - break; - case Class::String: - delete Internal.String; - break; - default: { - }; - } - } - - template - explicit JSON(T b, typename enable_if::value>::type* = 0) - : Internal(b), Type(Class::Boolean) {} - - template - explicit JSON(T i, typename enable_if::value && - !is_same::value>::type* = 0) - : Internal(static_cast(i)), Type(Class::Integral) {} - - template - explicit JSON(T f, typename enable_if::value>::type* = 0) - : Internal(static_cast(f)), Type(Class::Floating) {} - - template - explicit JSON(T s, - typename enable_if::value>::type* = 0) - : Internal(string(s)), Type(Class::String) {} - - explicit JSON(std::nullptr_t) : Internal(), Type(Class::Null) {} - - static JSON Make(Class type) { - JSON ret; - ret.SetType(type); - return ret; - } - - static JSON Load(const string&); - - template - void append(T arg) { - SetType(Class::Array); - Internal.List->emplace_back(arg); - } - - template - void append(T arg, U... 
args) { - append(arg); - append(args...); - } - - template - typename enable_if::value, JSON&>::type operator=(T b) { - SetType(Class::Boolean); - Internal.Bool = b; - return *this; - } - - template - typename enable_if::value && !is_same::value, - JSON&>::type - operator=(T i) { - SetType(Class::Integral); - Internal.Int = i; - return *this; - } - - template - typename enable_if::value, JSON&>::type operator=(T f) { - SetType(Class::Floating); - Internal.Float = f; - return *this; - } - - template - typename enable_if::value, JSON&>::type operator=( - T s) { - SetType(Class::String); - *Internal.String = string(s); - return *this; - } - - JSON& operator[](const string& key) { - SetType(Class::Object); - return Internal.Map->operator[](key); - } - - JSON& operator[](unsigned index) { - SetType(Class::Array); - if (index >= Internal.List->size()) Internal.List->resize(index + 1); - return Internal.List->operator[](index); - } - - JSON& at(const string& key) { return operator[](key); } - - const JSON& at(const string& key) const { return Internal.Map->at(key); } - - JSON& at(unsigned index) { return operator[](index); } - - const JSON& at(unsigned index) const { return Internal.List->at(index); } - - int length() const { - if (Type == Class::Array) - return Internal.List->size(); - else - return -1; - } - - bool hasKey(const string& key) const { - if (Type == Class::Object) - return Internal.Map->find(key) != Internal.Map->end(); - return false; - } - - int size() const { - if (Type == Class::Object) - return Internal.Map->size(); - else if (Type == Class::Array) - return Internal.List->size(); - else - return -1; - } - - Class JSONType() const { return Type; } - - /// Functions for getting primitives from the JSON object. - bool IsNull() const { return Type == Class::Null; } - - string ToString() const { - bool b; - return std::move(ToString(&b)); - } - string ToString(bool* ok) const { - *ok = (Type == Class::String); - return *ok ? std::move(json_escape(*Internal.String)) : string(""); - } - - double ToFloat() const { - bool b; - return ToFloat(&b); - } - double ToFloat(bool* ok) const { - *ok = (Type == Class::Floating); - return *ok ? Internal.Float : 0.0; - } - - int ToInt() const { - bool b; - return ToInt(&b); - } - int ToInt(bool* ok) const { - *ok = (Type == Class::Integral); - return *ok ? Internal.Int : 0; - } - - bool ToBool() const { - bool b; - return ToBool(&b); - } - bool ToBool(bool* ok) const { - *ok = (Type == Class::Boolean); - return *ok ? 
Internal.Bool : false; - } - - JSONWrapper> ObjectRange() { - if (Type == Class::Object) - return JSONWrapper>(Internal.Map); - return JSONWrapper>(nullptr); - } - - JSONWrapper> ArrayRange() { - if (Type == Class::Array) return JSONWrapper>(Internal.List); - return JSONWrapper>(nullptr); - } - - JSONConstWrapper> ObjectRange() const { - if (Type == Class::Object) - return JSONConstWrapper>(Internal.Map); - return JSONConstWrapper>(nullptr); - } - - JSONConstWrapper> ArrayRange() const { - if (Type == Class::Array) - return JSONConstWrapper>(Internal.List); - return JSONConstWrapper>(nullptr); - } - - string dump(int depth = 1, string tab = " ") const { - string pad = ""; - for (int i = 0; i < depth; ++i, pad += tab) { - } - - switch (Type) { - case Class::Null: - return "null"; - case Class::Object: { - string s = "{\n"; - bool skip = true; - for (auto& p : *Internal.Map) { - if (!skip) s += ",\n"; - s += (pad + "\"" + p.first + "\" : " + p.second.dump(depth + 1, tab)); - skip = false; - } - s += ("\n" + pad.erase(0, 2) + "}"); - return s; - } - case Class::Array: { - string s = "["; - bool skip = true; - for (auto& p : *Internal.List) { - if (!skip) s += ", "; - s += p.dump(depth + 1, tab); - skip = false; - } - s += "]"; - return s; - } - case Class::String: - return "\"" + json_escape(*Internal.String) + "\""; - case Class::Floating: - return std::to_string(Internal.Float); - case Class::Integral: - return std::to_string(Internal.Int); - case Class::Boolean: - return Internal.Bool ? "true" : "false"; - default: - return ""; - } - return ""; - } - - friend std::ostream& operator<<(std::ostream&, const JSON&); - - private: - void SetType(Class type) { - if (type == Type) return; - - ClearInternal(); - - switch (type) { - case Class::Null: - Internal.Map = nullptr; - break; - case Class::Object: - Internal.Map = new map(); - break; - case Class::Array: - Internal.List = new deque(); - break; - case Class::String: - Internal.String = new string(); - break; - case Class::Floating: - Internal.Float = 0.0; - break; - case Class::Integral: - Internal.Int = 0; - break; - case Class::Boolean: - Internal.Bool = false; - break; - } - - Type = type; - } - - private: - /* beware: only call if YOU know that Internal is allocated. No checks - performed here. This function should be called in a constructed JSON just - before you are going to overwrite Internal... -*/ - void ClearInternal() { - switch (Type) { - case Class::Object: - delete Internal.Map; - break; - case Class::Array: - delete Internal.List; - break; - case Class::String: - delete Internal.String; - break; - default: { - }; - } - } - - private: - Class Type = Class::Null; -}; - -JSON Array() { return std::move(JSON::Make(JSON::Class::Array)); } - -template -JSON Array(T... 
args) { - JSON arr = JSON::Make(JSON::Class::Array); - arr.append(args...); - return std::move(arr); -} - -JSON Object() { return std::move(JSON::Make(JSON::Class::Object)); } - -std::ostream& operator<<(std::ostream& os, const JSON& json) { - os << json.dump(); - return os; -} - -namespace { // NOLINT -JSON parse_next(const string&, size_t&); - -void consume_ws(const string& str, size_t& offset) { // NOLINT - while (isspace(str[offset])) ++offset; -} - -JSON parse_object(const string& str, size_t& offset) { // NOLINT - JSON Object = JSON::Make(JSON::Class::Object); - - ++offset; - consume_ws(str, offset); - if (str[offset] == '}') { - ++offset; - return std::move(Object); - } - - while (true) { - JSON Key = parse_next(str, offset); - consume_ws(str, offset); - if (str[offset] != ':') { - std::cerr << "Error: Object: Expected colon, found '" << str[offset] - << "'\n"; - break; - } - consume_ws(str, ++offset); - JSON Value = parse_next(str, offset); - Object[Key.ToString()] = Value; - - consume_ws(str, offset); - if (str[offset] == ',') { - ++offset; - continue; - } else if (str[offset] == '}') { - ++offset; - break; - } else { - std::cerr << "ERROR: Object: Expected comma, found '" << str[offset] - << "'\n"; - break; - } - } - - return std::move(Object); -} - -JSON parse_array(const string& str, size_t& offset) { // NOLINT - JSON Array = JSON::Make(JSON::Class::Array); - unsigned index = 0; - - ++offset; - consume_ws(str, offset); - if (str[offset] == ']') { - ++offset; - return std::move(Array); - } - - while (true) { - Array[index++] = parse_next(str, offset); - consume_ws(str, offset); - - if (str[offset] == ',') { - ++offset; - continue; - } else if (str[offset] == ']') { - ++offset; - break; - } else { - std::cerr << "ERROR: Array: Expected ',' or ']', found '" << str[offset] - << "'\n"; - return std::move(JSON::Make(JSON::Class::Array)); - } - } - - return std::move(Array); -} - -JSON parse_string(const string& str, size_t& offset) { // NOLINT - JSON String; - string val; - for (char c = str[++offset]; c != '\"'; c = str[++offset]) { - if (c == '\\') { - switch (str[++offset]) { - case '\"': - val += '\"'; - break; - case '\\': - val += '\\'; - break; - case '/': - val += '/'; - break; - case 'b': - val += '\b'; - break; - case 'f': - val += '\f'; - break; - case 'n': - val += '\n'; - break; - case 'r': - val += '\r'; - break; - case 't': - val += '\t'; - break; - case 'u': { - val += "\\u"; - for (unsigned i = 1; i <= 4; ++i) { - c = str[offset + i]; - if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || - (c >= 'A' && c <= 'F')) { - val += c; - } else { - std::cerr << "ERROR: String: Expected hex character in unicode " - "escape, found '" - << c << "'\n"; - return std::move(JSON::Make(JSON::Class::String)); - } - } - offset += 4; - } break; - default: - val += '\\'; - break; - } - } else { - val += c; - } - } - ++offset; - String = val; - return std::move(String); -} - -JSON parse_number(const string& str, size_t& offset) { // NOLINT - JSON Number; - string val, exp_str; - char c; - bool isDouble = false; - int exp = 0; - while (true) { - c = str[offset++]; - if ((c == '-') || (c >= '0' && c <= '9')) { - val += c; - } else if (c == '.') { - val += c; - isDouble = true; - } else { - break; - } - } - if (c == 'E' || c == 'e') { - c = str[offset++]; - if (c == '-') { - ++offset; - exp_str += '-'; - } - while (true) { - c = str[offset++]; - if (c >= '0' && c <= '9') { - exp_str += c; - } else if (!isspace(c) && c != ',' && c != ']' && c != '}') { - std::cerr << "ERROR: Number: 
Expected a number for exponent, found '" - << c << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } else { - break; - } - } - exp = std::stol(exp_str); - } else if (!isspace(c) && c != ',' && c != ']' && c != '}') { - std::cerr << "ERROR: Number: unexpected character '" << c << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } - --offset; - - if (isDouble) { - Number = std::stod(val) * std::pow(10, exp); - } else { - if (!exp_str.empty()) - Number = std::stol(val) * std::pow(10, exp); - else - Number = std::stol(val); - } - return std::move(Number); -} - -JSON parse_bool(const string& str, size_t& offset) { // NOLINT - JSON Bool; - if (str.substr(offset, 4) == "true") { - Bool = true; - } else if (str.substr(offset, 5) == "false") { - Bool = false; - } else { - std::cerr << "ERROR: Bool: Expected 'true' or 'false', found '" - << str.substr(offset, 5) << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } - offset += (Bool.ToBool() ? 4 : 5); - return std::move(Bool); -} - -JSON parse_null(const string& str, size_t& offset) { // NOLINT - JSON Null; - if (str.substr(offset, 4) != "null") { - std::cerr << "ERROR: Null: Expected 'null', found '" - << str.substr(offset, 4) << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } - offset += 4; - return std::move(Null); -} - -JSON parse_next(const string& str, size_t& offset) { // NOLINT - char value; - consume_ws(str, offset); - value = str[offset]; - switch (value) { - case '[': - return std::move(parse_array(str, offset)); - case '{': - return std::move(parse_object(str, offset)); - case '\"': - return std::move(parse_string(str, offset)); - case 't': - case 'f': - return std::move(parse_bool(str, offset)); - case 'n': - return std::move(parse_null(str, offset)); - default: - if ((value <= '9' && value >= '0') || value == '-') - return std::move(parse_number(str, offset)); - } - std::cerr << "ERROR: Parse: Unknown starting character '" << value << "'\n"; - return JSON(); -} -} // namespace - -JSON JSON::Load(const string& str) { - size_t offset = 0; - return std::move(parse_next(str, offset)); -} - -} // namespace json - -#endif // UTILS_JSON_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/log.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/log.h deleted file mode 100644 index c2bf03f261a8711f74da819d80d68e8eb9fb124a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/log.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_LOG_H_ -#define UTILS_LOG_H_ - -// Because openfst is a dynamic library compiled with gflags/glog, we must use -// the gflags/glog from openfst to avoid them linked both statically and -// dynamically into the executable. 
-#include "fst/log.h" - -#endif // UTILS_LOG_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/string.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/string.cc deleted file mode 100644 index 1ab93adf3cac1bc5a42c0b8c6cadbde399678fef..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/string.cc +++ /dev/null @@ -1,195 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "utils/string.h" - -#include -#include -#include - -#include "utils/log.h" -#include "utils/utils.h" - -namespace wenet { - -void SplitString(const std::string& str, std::vector* strs) { - SplitStringToVector(Trim(str), " \t", true, strs); -} - -void SplitStringToVector(const std::string& full, const char* delim, - bool omit_empty_strings, - std::vector* out) { - size_t start = 0, found = 0, end = full.size(); - out->clear(); - while (found != std::string::npos) { - found = full.find_first_of(delim, start); - // start != end condition is for when the delimiter is at the end - if (!omit_empty_strings || (found != start && start != end)) - out->push_back(full.substr(start, found - start)); - start = found + 1; - } -} - -void SplitUTF8StringToChars(const std::string& str, - std::vector* chars) { - chars->clear(); - int bytes = 1; - for (size_t i = 0; i < str.length(); i += bytes) { - assert((str[i] & 0xF8) <= 0xF0); - if ((str[i] & 0x80) == 0x00) { - // The first 128 characters (US-ASCII) in UTF-8 format only need one byte. - bytes = 1; - } else if ((str[i] & 0xE0) == 0xC0) { - // The next 1,920 characters need two bytes to encode, - // which covers the remainder of almost all Latin-script alphabets. - bytes = 2; - } else if ((str[i] & 0xF0) == 0xE0) { - // Three bytes are needed for characters in the rest of - // the Basic Multilingual Plane, which contains virtually all characters - // in common use, including most Chinese, Japanese and Korean characters. - bytes = 3; - } else if ((str[i] & 0xF8) == 0xF0) { - // Four bytes are needed for characters in the other planes of Unicode, - // which include less common CJK characters, various historic scripts, - // mathematical symbols, and emoji (pictographic symbols). 
- bytes = 4; - } - chars->push_back(str.substr(i, bytes)); - } -} - -int UTF8StringLength(const std::string& str) { - int len = 0; - int bytes = 1; - for (size_t i = 0; i < str.length(); i += bytes) { - if ((str[i] & 0x80) == 0x00) { - bytes = 1; - } else if ((str[i] & 0xE0) == 0xC0) { - bytes = 2; - } else if ((str[i] & 0xF0) == 0xE0) { - bytes = 3; - } else if ((str[i] & 0xF8) == 0xF0) { - bytes = 4; - } - ++len; - } - return len; -} - -bool CheckEnglishChar(const std::string& ch) { - // all english characters should be encoded in one byte - if (ch.size() != 1) return false; - // english words may contain apostrophe, i.e., "He's" - return isalpha(ch[0]) || ch[0] == '\''; -} - -bool CheckEnglishWord(const std::string& word) { - std::vector chars; - SplitUTF8StringToChars(word, &chars); - for (size_t k = 0; k < chars.size(); k++) { - if (!CheckEnglishChar(chars[k])) { - return false; - } - } - return true; -} - -std::string JoinString(const std::string& c, - const std::vector& strs) { - std::string result; - if (strs.size() > 0) { - for (int i = 0; i < strs.size() - 1; i++) { - result += (strs[i] + c); - } - result += strs.back(); - } - return result; -} - -bool IsAlpha(const std::string& str) { - for (size_t i = 0; i < str.size(); i++) { - if (!isalpha(str[i])) { - return false; - } - } - return true; -} - -std::string ProcessBlank(const std::string& str, bool lowercase) { - std::string result; - if (!str.empty()) { - std::vector chars; - SplitUTF8StringToChars(Trim(str), &chars); - - for (std::string& ch : chars) { - if (ch != kSpaceSymbol) { - result.append(ch); - } else { - // Ignore consecutive space or located in head - if (!result.empty() && result.back() != ' ') { - result.push_back(' '); - } - } - } - // Ignore tailing space - if (!result.empty() && result.back() == ' ') { - result.pop_back(); - } - // NOTE: convert string to wstring - // see issue 745: https://github.com/wenet-e2e/wenet/issues/745 - std::locale loc(""); - std::wstring_convert, wchar_t> converter; - std::wstring wsresult = converter.from_bytes(result); - for (auto& c : wsresult) { - c = lowercase ? tolower(c, loc) : toupper(c, loc); - } - result = converter.to_bytes(wsresult); - } - return result; -} - -std::string Ltrim(const std::string& str) { - size_t start = str.find_first_not_of(WHITESPACE); - return (start == std::string::npos) ? "" : str.substr(start); -} - -std::string Rtrim(const std::string& str) { - size_t end = str.find_last_not_of(WHITESPACE); - return (end == std::string::npos) ? 
"" : str.substr(0, end + 1); -} - -std::string Trim(const std::string& str) { return Rtrim(Ltrim(str)); } - -std::string JoinPath(const std::string& left, const std::string& right) { - std::string path(left); - if (path.size() && path.back() != '/') { - path.push_back('/'); - } - path.append(right); - return path; -} - -#ifdef _MSC_VER -std::wstring ToWString(const std::string& str) { - unsigned len = str.size() * 2; - setlocale(LC_CTYPE, ""); - wchar_t* p = new wchar_t[len]; - mbstowcs(p, str.c_str(), len); - std::wstring wstr(p); - delete[] p; - return wstr; -} -#endif - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/string.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/string.h deleted file mode 100644 index bf7a52ae09bce45ab7e34a5277652d7ae91bae1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/string.h +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_STRING_H_ -#define UTILS_STRING_H_ - -#include -#include -#include -#include -#include - -#include "fst/symbol-table.h" - -namespace wenet { - -const char WHITESPACE[] = " \n\r\t\f\v"; - -// Split the string with space or tab. -void SplitString(const std::string& str, std::vector* strs); - -void SplitStringToVector(const std::string& full, const char* delim, - bool omit_empty_strings, - std::vector* out); - -// NOTE(Xingchen Song): we add this function to make it possible to -// support multilingual recipe in the future, in which characters of -// different languages are all encoded in UTF-8 format. -// UTF-8 REF: https://en.wikipedia.org/wiki/UTF-8#Encoding -// Split the UTF-8 string into chars. -void SplitUTF8StringToChars(const std::string& str, - std::vector* chars); - -int UTF8StringLength(const std::string& str); - -// Check whether the UTF-8 char is alphabet or '. -bool CheckEnglishChar(const std::string& ch); - -// Check whether the UTF-8 word is only contains alphabet or '. -bool CheckEnglishWord(const std::string& word); - -std::string JoinString(const std::string& c, - const std::vector& strs); - -bool IsAlpha(const std::string& str); - -// Split the UTF-8 string into words by symbol table. -// Return whether not contains oov. -bool SplitUTF8StringToWords( - const std::string& str, - const std::shared_ptr& symbol_table, - std::vector* words); - -// Replace ▁ with space, then remove head, tail and consecutive space. 
-std::string ProcessBlank(const std::string& str, bool lowercase); - -std::string Ltrim(const std::string& str); - -std::string Rtrim(const std::string& str); - -std::string Trim(const std::string& str); - -std::string JoinPath(const std::string& left, const std::string& right); - -#ifdef _MSC_VER -std::wstring ToWString(const std::string& str); -#endif - -} // namespace wenet - -#endif // UTILS_STRING_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/thread_pool.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/thread_pool.h deleted file mode 100644 index a78162995d90bf079ad091cf14cb9f2cd4476d05..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/thread_pool.h +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright (c) 2012 Jakob Progsch, Václav Zeman - -// This software is provided 'as-is', without any express or implied -// warranty. In no event will the authors be held liable for any damages -// arising from the use of this software. - -// Permission is granted to anyone to use this software for any purpose, -// including commercial applications, and to alter it and redistribute it -// freely, subject to the following restrictions: - -// 1. The origin of this software must not be misrepresented; you must not -// claim that you wrote the original software. If you use this software -// in a product, an acknowledgment in the product documentation would be -// appreciated but is not required. - -// 2. Altered source versions must be plainly marked as such, and must not be -// misrepresented as being the original software. - -// 3. This notice may not be removed or altered from any source -// distribution. - -#ifndef UTILS_THREAD_POOL_H_ -#define UTILS_THREAD_POOL_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -class ThreadPool { - public: - explicit ThreadPool(size_t); - template - auto enqueue(F&& f, Args&&... args) - -> std::future::type>; - ~ThreadPool(); - - private: - // need to keep track of threads so we can join them - std::vector workers; - // the task queue - std::queue > tasks; - - // synchronization - std::mutex queue_mutex; - std::condition_variable condition; - bool stop; -}; - -// the constructor just launches some amount of workers -inline ThreadPool::ThreadPool(size_t threads) : stop(false) { - for (size_t i = 0; i < threads; ++i) - workers.emplace_back([this] { - for (;;) { - std::function task; - - { - std::unique_lock lock(this->queue_mutex); - this->condition.wait( - lock, [this] { return this->stop || !this->tasks.empty(); }); - if (this->stop && this->tasks.empty()) return; - task = std::move(this->tasks.front()); - this->tasks.pop(); - } - - task(); - } - }); -} - -// add new work item to the pool -template -auto ThreadPool::enqueue(F&& f, Args&&... 
args) - -> std::future::type> { - using return_type = typename std::result_of::type; - - auto task = std::make_shared >( - std::bind(std::forward(f), std::forward(args)...)); - - std::future res = task->get_future(); - { - std::unique_lock lock(queue_mutex); - - // don't allow enqueueing after stopping the pool - if (stop) { - throw std::runtime_error("enqueue on stopped ThreadPool"); - } - - tasks.emplace([task]() { (*task)(); }); - } - condition.notify_one(); - return res; -} - -// the destructor joins all threads -inline ThreadPool::~ThreadPool() { - { - std::unique_lock lock(queue_mutex); - stop = true; - } - condition.notify_all(); - for (std::thread& worker : workers) { - worker.join(); - } -} - -#endif // UTILS_THREAD_POOL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/timer.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/timer.h deleted file mode 100644 index 068519f98d140ba0eef68babe2ad2fdcb798c074..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/timer.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_TIMER_H_ -#define UTILS_TIMER_H_ - -#include - -namespace wenet { - -class Timer { - public: - Timer() : time_start_(std::chrono::steady_clock::now()) {} - void Reset() { time_start_ = std::chrono::steady_clock::now(); } - // return int in milliseconds - int Elapsed() const { - auto time_now = std::chrono::steady_clock::now(); - return std::chrono::duration_cast(time_now - - time_start_) - .count(); - } - - private: - std::chrono::time_point time_start_; -}; -} // namespace wenet - -#endif // UTILS_TIMER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/utils.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/utils.cc deleted file mode 100644 index c37e36c6e9f629e0a4b11cf21a791aefd58b659f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/utils.cc +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "utils/utils.h" - -#include -#include -#include -#include -#include -#include - -#include "utils/log.h" - -namespace wenet { - -float LogAdd(float x, float y) { - static float num_min = -std::numeric_limits::max(); - if (x <= num_min) return y; - if (y <= num_min) return x; - float xmax = std::max(x, y); - return std::log(std::exp(x - xmax) + std::exp(y - xmax)) + xmax; -} - -template -struct ValueComp { - bool operator()(const std::pair& lhs, - const std::pair& rhs) const { - return lhs.first > rhs.first || - (lhs.first == rhs.first && lhs.second < rhs.second); - } -}; - -// We refer the pytorch topk implementation -// https://github.com/pytorch/pytorch/blob/master/caffe2/operators/top_k.cc -template -void TopK(const std::vector& data, int32_t k, std::vector* values, - std::vector* indices) { - std::vector> heap_data; - int n = data.size(); - for (int32_t i = 0; i < k && i < n; ++i) { - heap_data.emplace_back(data[i], i); - } - std::priority_queue, std::vector>, - ValueComp> - pq(ValueComp(), std::move(heap_data)); - for (int32_t i = k; i < n; ++i) { - if (pq.top().first < data[i]) { - pq.pop(); - pq.emplace(data[i], i); - } - } - - values->resize(std::min(k, n)); - indices->resize(std::min(k, n)); - int32_t cur = values->size() - 1; - while (!pq.empty()) { - const auto& item = pq.top(); - (*values)[cur] = item.first; - (*indices)[cur] = item.second; - pq.pop(); - cur -= 1; - } -} - -template void TopK(const std::vector& data, int32_t k, - std::vector* values, - std::vector* indices); - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/utils.h deleted file mode 100644 index f9957c0b6e8ae27d9260e75cf55e786055827801..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/utils/utils.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef UTILS_UTILS_H_ -#define UTILS_UTILS_H_ - -#include -#include -#include - -namespace wenet { - -#define WENET_DISALLOW_COPY_AND_ASSIGN(Type) \ - Type(const Type&) = delete; \ - Type& operator=(const Type&) = delete; - -const float kFloatMax = std::numeric_limits::max(); -// kSpaceSymbol in UTF-8 is: ▁ -const char kSpaceSymbol[] = "\xe2\x96\x81"; - -// Return the sum of two probabilities in log scale -float LogAdd(float x, float y); - -template -void TopK(const std::vector& data, int32_t k, std::vector* values, - std::vector* indices); - -} // namespace wenet - -#endif // UTILS_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/wenet.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/wenet.cc deleted file mode 100644 index 7c8e92a37336cd0bd647b213784a86f8366f9b60..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/cpp/wenet.cc +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (authors: Xiaoyu Chen) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include - -#include "torch/script.h" -#include "torch/torch.h" - -#include "decoder/asr_decoder.h" -#include "decoder/torch_asr_model.h" -#include "frontend/feature_pipeline.h" -#include "frontend/wav.h" -#include "post_processor/post_processor.h" -#include "utils/log.h" -#include "utils/string.h" - -namespace wenet { - -std::shared_ptr decode_config; -std::shared_ptr feature_config; -std::shared_ptr feature_pipeline; -std::shared_ptr decoder; -std::shared_ptr resource; -DecodeState state = kEndBatch; -std::string total_result; // NOLINT - -void init(JNIEnv* env, jobject, jstring jModelDir) { - const char* pModelDir = env->GetStringUTFChars(jModelDir, nullptr); - - std::string modelPath = std::string(pModelDir) + "/final.zip"; - std::string dictPath = std::string(pModelDir) + "/units.txt"; - auto model = std::make_shared(); - model->Read(modelPath); - LOG(INFO) << "model path: " << modelPath; - - resource = std::make_shared(); - resource->model = model; - resource->symbol_table = std::shared_ptr( - fst::SymbolTable::ReadText(dictPath)); - LOG(INFO) << "dict path: " << dictPath; - - PostProcessOptions post_process_opts; - resource->post_processor = - std::make_shared(post_process_opts); - - feature_config = std::make_shared(80, 16000); - feature_pipeline = std::make_shared(*feature_config); - - decode_config = std::make_shared(); - decode_config->chunk_size = 16; - decoder = std::make_shared(feature_pipeline, resource, - *decode_config); -} - -void reset(JNIEnv *env, jobject) { - LOG(INFO) << "wenet reset"; - decoder->Reset(); - state = kEndBatch; - total_result = ""; -} - -void accept_waveform(JNIEnv *env, jobject, jshortArray jWaveform) { - jsize size = env->GetArrayLength(jWaveform); - int16_t* waveform = env->GetShortArrayElements(jWaveform, 0); - feature_pipeline->AcceptWaveform(waveform, size); - LOG(INFO) << "wenet accept waveform in 
ms: " << int(size / 16); -} - -void set_input_finished() { - LOG(INFO) << "wenet input finished"; - feature_pipeline->set_input_finished(); -} - -void decode_thread_func() { - while (true) { - state = decoder->Decode(); - if (state == kEndFeats || state == kEndpoint) { - decoder->Rescoring(); - } - - std::string result; - if (decoder->DecodedSomething()) { - result = decoder->result()[0].sentence; - } - - if (state == kEndFeats) { - LOG(INFO) << "wenet endfeats final result: " << result; - total_result += result; - break; - } else if (state == kEndpoint) { - LOG(INFO) << "wenet endpoint final result: " << result; - total_result += result + ","; - decoder->ResetContinuousDecoding(); - } else { - if (decoder->DecodedSomething()) { - LOG(INFO) << "wenet partial result: " << result; - } - } - } -} - -void start_decode() { - std::thread decode_thread(decode_thread_func); - decode_thread.detach(); -} - -jboolean get_finished(JNIEnv *env, jobject) { - if (state == kEndFeats) { - LOG(INFO) << "wenet recognize finished"; - return JNI_TRUE; - } - return JNI_FALSE; -} - -jstring get_result(JNIEnv *env, jobject) { - std::string result; - if (decoder->DecodedSomething()) { - result = decoder->result()[0].sentence; - } - LOG(INFO) << "wenet ui result: " << total_result + result; - return env->NewStringUTF((total_result + result).c_str()); -} -} // namespace wenet - -JNIEXPORT jint JNI_OnLoad(JavaVM *vm, void *) { - JNIEnv *env; - if (vm->GetEnv(reinterpret_cast(&env), JNI_VERSION_1_6) != JNI_OK) { - return JNI_ERR; - } - - jclass c = env->FindClass("com/mobvoi/wenet/Recognize"); - if (c == nullptr) { - return JNI_ERR; - } - - static const JNINativeMethod methods[] = { - {"init", "(Ljava/lang/String;)V", reinterpret_cast(wenet::init)}, - {"reset", "()V", reinterpret_cast(wenet::reset)}, - {"acceptWaveform", "([S)V", - reinterpret_cast(wenet::accept_waveform)}, - {"setInputFinished", "()V", - reinterpret_cast(wenet::set_input_finished)}, - {"getFinished", "()Z", reinterpret_cast(wenet::get_finished)}, - {"startDecode", "()V", reinterpret_cast(wenet::start_decode)}, - {"getResult", "()Ljava/lang/String;", - reinterpret_cast(wenet::get_result)}, - }; - int rc = env->RegisterNatives(c, methods, - sizeof(methods) / sizeof(JNINativeMethod)); - - if (rc != JNI_OK) { - return rc; - } - - return JNI_VERSION_1_6; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/java/com/mobvoi/wenet/MainActivity.java b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/java/com/mobvoi/wenet/MainActivity.java deleted file mode 100644 index 715170326149c614ab518343535adc1c180b96d5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/java/com/mobvoi/wenet/MainActivity.java +++ /dev/null @@ -1,220 +0,0 @@ -package com.mobvoi.wenet; - -import android.Manifest; -import android.content.Context; -import android.content.pm.PackageManager; -import android.content.res.AssetManager; -import android.media.AudioFormat; -import android.media.AudioRecord; -import android.media.MediaRecorder; -import android.os.Bundle; -import android.os.Process; -import android.util.Log; -import android.widget.Button; -import android.widget.TextView; -import android.widget.Toast; -import androidx.appcompat.app.AppCompatActivity; -import androidx.core.app.ActivityCompat; -import androidx.core.content.ContextCompat; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import 
java.io.InputStream; -import java.io.OutputStream; -import java.util.Arrays; -import java.util.List; -import java.util.concurrent.ArrayBlockingQueue; -import java.util.concurrent.BlockingQueue; - -public class MainActivity extends AppCompatActivity { - - private final int MY_PERMISSIONS_RECORD_AUDIO = 1; - private static final String LOG_TAG = "WENET"; - private static final int SAMPLE_RATE = 16000; // The sampling rate - private static final int MAX_QUEUE_SIZE = 2500; // 100 seconds audio, 1 / 0.04 * 100 - private static final List resource = Arrays.asList( - "final.zip", "units.txt", "ctc.ort", "decoder.ort", "encoder.ort" - ); - - private boolean startRecord = false; - private AudioRecord record = null; - private int miniBufferSize = 0; // 1280 bytes 648 byte 40ms, 0.04s - private final BlockingQueue bufferQueue = new ArrayBlockingQueue<>(MAX_QUEUE_SIZE); - - public static void assetsInit(Context context) throws IOException { - AssetManager assetMgr = context.getAssets(); - // Unzip all files in resource from assets to context. - // Note: Uninstall the APP will remove the resource files in the context. - for (String file : assetMgr.list("")) { - if (resource.contains(file)) { - File dst = new File(context.getFilesDir(), file); - if (!dst.exists() || dst.length() == 0) { - Log.i(LOG_TAG, "Unzipping " + file + " to " + dst.getAbsolutePath()); - InputStream is = assetMgr.open(file); - OutputStream os = new FileOutputStream(dst); - byte[] buffer = new byte[4 * 1024]; - int read; - while ((read = is.read(buffer)) != -1) { - os.write(buffer, 0, read); - } - os.flush(); - } - } - } - } - - @Override - public void onRequestPermissionsResult(int requestCode, - String[] permissions, int[] grantResults) { - if (requestCode == MY_PERMISSIONS_RECORD_AUDIO) { - if (grantResults.length > 0 && grantResults[0] == PackageManager.PERMISSION_GRANTED) { - Log.i(LOG_TAG, "record permission is granted"); - initRecorder(); - } else { - Toast.makeText(this, "Permissions denied to record audio", Toast.LENGTH_LONG).show(); - Button button = findViewById(R.id.button); - button.setEnabled(false); - } - } - } - - @Override - protected void onCreate(Bundle savedInstanceState) { - super.onCreate(savedInstanceState); - setContentView(R.layout.activity_main); - requestAudioPermissions(); - try { - assetsInit(this); - } catch (IOException e) { - Log.e(LOG_TAG, "Error process asset files to file path"); - } - - TextView textView = findViewById(R.id.textView); - textView.setText(""); - Recognize.init(getFilesDir().getPath()); - - Button button = findViewById(R.id.button); - button.setText("Start Record"); - button.setOnClickListener(view -> { - if (!startRecord) { - startRecord = true; - Recognize.reset(); - startRecordThread(); - startAsrThread(); - Recognize.startDecode(); - button.setText("Stop Record"); - } else { - startRecord = false; - Recognize.setInputFinished(); - button.setText("Start Record"); - } - button.setEnabled(false); - }); - } - - private void requestAudioPermissions() { - if (ContextCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO) - != PackageManager.PERMISSION_GRANTED) { - ActivityCompat.requestPermissions(this, - new String[]{Manifest.permission.RECORD_AUDIO}, - MY_PERMISSIONS_RECORD_AUDIO); - } else { - initRecorder(); - } - } - - private void initRecorder() { - // buffer size in bytes 1280 - miniBufferSize = AudioRecord.getMinBufferSize(SAMPLE_RATE, - AudioFormat.CHANNEL_IN_MONO, - AudioFormat.ENCODING_PCM_16BIT); - if (miniBufferSize == AudioRecord.ERROR || miniBufferSize == 
AudioRecord.ERROR_BAD_VALUE) { - Log.e(LOG_TAG, "Audio buffer can't initialize!"); - return; - } - record = new AudioRecord(MediaRecorder.AudioSource.DEFAULT, - SAMPLE_RATE, - AudioFormat.CHANNEL_IN_MONO, - AudioFormat.ENCODING_PCM_16BIT, - miniBufferSize); - if (record.getState() != AudioRecord.STATE_INITIALIZED) { - Log.e(LOG_TAG, "Audio Record can't initialize!"); - return; - } - Log.i(LOG_TAG, "Record init okay"); - } - - private void startRecordThread() { - new Thread(() -> { - VoiceRectView voiceView = findViewById(R.id.voiceRectView); - record.startRecording(); - Process.setThreadPriority(Process.THREAD_PRIORITY_AUDIO); - while (startRecord) { - short[] buffer = new short[miniBufferSize / 2]; - int read = record.read(buffer, 0, buffer.length); - voiceView.add(calculateDb(buffer)); - try { - if (AudioRecord.ERROR_INVALID_OPERATION != read) { - bufferQueue.put(buffer); - } - } catch (InterruptedException e) { - Log.e(LOG_TAG, e.getMessage()); - } - Button button = findViewById(R.id.button); - if (!button.isEnabled() && startRecord) { - runOnUiThread(() -> button.setEnabled(true)); - } - } - record.stop(); - voiceView.zero(); - }).start(); - } - - private double calculateDb(short[] buffer) { - double energy = 0.0; - for (short value : buffer) { - energy += value * value; - } - energy /= buffer.length; - energy = (10 * Math.log10(1 + energy)) / 100; - energy = Math.min(energy, 1.0); - return energy; - } - - private void startAsrThread() { - new Thread(() -> { - // Send all data - while (startRecord || bufferQueue.size() > 0) { - try { - short[] data = bufferQueue.take(); - // 1. add data to C++ interface - Recognize.acceptWaveform(data); - // 2. get partial result - runOnUiThread(() -> { - TextView textView = findViewById(R.id.textView); - textView.setText(Recognize.getResult()); - }); - } catch (InterruptedException e) { - Log.e(LOG_TAG, e.getMessage()); - } - } - - // Wait for final result - while (true) { - // get result - if (!Recognize.getFinished()) { - runOnUiThread(() -> { - TextView textView = findViewById(R.id.textView); - textView.setText(Recognize.getResult()); - }); - } else { - runOnUiThread(() -> { - Button button = findViewById(R.id.button); - button.setEnabled(true); - }); - break; - } - } - }).start(); - } -} \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/java/com/mobvoi/wenet/Recognize.java b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/java/com/mobvoi/wenet/Recognize.java deleted file mode 100644 index 31cafcf8a31685216e1510b9b7b43812624b5ea8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/java/com/mobvoi/wenet/Recognize.java +++ /dev/null @@ -1,16 +0,0 @@ -package com.mobvoi.wenet; - -public class Recognize { - - static { - System.loadLibrary("wenet"); - } - - public static native void init(String modelDir); - public static native void reset(); - public static native void acceptWaveform(short[] waveform); - public static native void setInputFinished(); - public static native boolean getFinished(); - public static native void startDecode(); - public static native String getResult(); -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/java/com/mobvoi/wenet/VoiceRectView.java b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/java/com/mobvoi/wenet/VoiceRectView.java deleted file mode 100644 index 
fd1c832b28536b918f26969ce987898870ad584f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/java/com/mobvoi/wenet/VoiceRectView.java +++ /dev/null @@ -1,134 +0,0 @@ -package com.mobvoi.wenet; - -import android.content.Context; -import android.content.res.TypedArray; -import android.graphics.Canvas; -import android.graphics.LinearGradient; -import android.graphics.Paint; -import android.graphics.Shader; -import android.util.AttributeSet; -import android.view.View; -import androidx.core.content.ContextCompat; -import java.util.Arrays; - -/** - * 自定义的音频模拟条形图 Created by shize on 2016/9/5. - */ -public class VoiceRectView extends View { - - // 音频矩形的数量 - private int mRectCount; - // 音频矩形的画笔 - private Paint mRectPaint; - // 渐变颜色的两种 - private int topColor, downColor; - // 音频矩形的宽和高 - private int mRectWidth, mRectHeight; - // 偏移量 - private int offset; - // 频率速度 - private int mSpeed; - - private double[] mEnergyBuffer = null; - - public VoiceRectView(Context context) { - this(context, null); - } - - public VoiceRectView(Context context, AttributeSet attrs) { - this(context, attrs, 0); - } - - public VoiceRectView(Context context, AttributeSet attrs, int defStyleAttr) { - super(context, attrs, defStyleAttr); - setPaint(context, attrs); - } - - public void setPaint(Context context, AttributeSet attrs) { - // 将属性存储到TypedArray中 - TypedArray ta = context.obtainStyledAttributes(attrs, R.styleable.VoiceRect); - mRectPaint = new Paint(); - // 添加矩形画笔的基础颜色 - mRectPaint.setColor(ta.getColor(R.styleable.VoiceRect_RectTopColor, - ContextCompat.getColor(context, R.color.top_color))); - // 添加矩形渐变色的上面部分 - topColor = ta.getColor(R.styleable.VoiceRect_RectTopColor, - ContextCompat.getColor(context, R.color.top_color)); - // 添加矩形渐变色的下面部分 - downColor = ta.getColor(R.styleable.VoiceRect_RectDownColor, - ContextCompat.getColor(context, R.color.down_color)); - // 设置矩形的数量 - mRectCount = ta.getInt(R.styleable.VoiceRect_RectCount, 10); - mEnergyBuffer = new double[mRectCount]; - - // 设置重绘的时间间隔,也就是变化速度 - mSpeed = ta.getInt(R.styleable.VoiceRect_RectSpeed, 300); - // 每个矩形的间隔 - offset = ta.getInt(R.styleable.VoiceRect_RectOffset, 0); - // 回收TypeArray - ta.recycle(); - } - - @Override - protected void onSizeChanged(int w, int h, int oldW, int oldH) { - super.onSizeChanged(w, h, oldW, oldH); - // 渐变效果 - LinearGradient mLinearGradient; - // 画布的宽 - int mWidth; - // 获取画布的宽 - mWidth = getWidth(); - // 获取矩形的最大高度 - mRectHeight = getHeight(); - // 获取单个矩形的宽度(减去的部分为到右边界的间距) - mRectWidth = (mWidth - offset) / mRectCount; - // 实例化一个线性渐变 - mLinearGradient = new LinearGradient( - 0, - 0, - mRectWidth, - mRectHeight, - topColor, - downColor, - Shader.TileMode.CLAMP - ); - // 添加进画笔的着色器 - mRectPaint.setShader(mLinearGradient); - } - - public void add(double energy) { - if (mEnergyBuffer.length - 1 >= 0) { - System.arraycopy(mEnergyBuffer, 1, mEnergyBuffer, 0, mEnergyBuffer.length - 1); - } - mEnergyBuffer[mEnergyBuffer.length - 1] = energy; - } - - public void zero() { - Arrays.fill(mEnergyBuffer, 0); - } - - @Override - protected void onDraw(Canvas canvas) { - super.onDraw(canvas); - double mRandom; - float currentHeight; - for (int i = 0; i < mRectCount; i++) { - // 由于只是简单的案例就不监听音频输入,随机模拟一些数字即可 - mRandom = Math.random(); - - //if (i < 1 || i > mRectCount - 2) mRandom = 0; - currentHeight = (float) (mRectHeight * mEnergyBuffer[i]); - - // 矩形的绘制是从左边开始到上、右、下边(左右边距离左边画布边界的距离,上下边距离上边画布边界的距离) - canvas.drawRect( - (float) (mRectWidth * i + offset), - (mRectHeight - 
currentHeight) / 2, - (float) (mRectWidth * (i + 1)), - mRectHeight / 2 + currentHeight / 2, - mRectPaint - ); - } - // 使得view延迟重绘 - postInvalidateDelayed(mSpeed); - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/res/drawable-v24/ic_launcher_foreground.xml b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/res/drawable-v24/ic_launcher_foreground.xml deleted file mode 100644 index 2b068d11462a4b96669193de13a711a3a36220a0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/res/drawable-v24/ic_launcher_foreground.xml +++ /dev/null @@ -1,30 +0,0 @@ - - - - - - - - - - - \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/res/drawable/ic_launcher_background.xml b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/res/drawable/ic_launcher_background.xml deleted file mode 100644 index 07d5da9cbf141911847041df5d7b87f0dd5ef9d4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/res/drawable/ic_launcher_background.xml +++ /dev/null @@ -1,170 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/res/layout/activity_main.xml b/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/res/layout/activity_main.xml deleted file mode 100644 index 3ef1e0d3b27b4c19aa30a4c79ad2bf557d1ecb65..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/android/app/src/main/res/layout/activity_main.xml +++ /dev/null @@ -1,50 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/WenetDemo/WenetDemo/Info.plist b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/WenetDemo/WenetDemo/Info.plist deleted file mode 100644 index 16ad1537f36c818128d96ec61f63df0c2f45498c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/WenetDemo/WenetDemo/Info.plist +++ /dev/null @@ -1,27 +0,0 @@ - - - - - UIApplicationSceneManifest - - UIApplicationSupportsMultipleScenes - - UISceneConfigurations - - UIWindowSceneSessionRoleApplication - - - UISceneConfigurationName - Default Configuration - UISceneDelegateClassName - $(PRODUCT_MODULE_NAME).SceneDelegate - UISceneStoryboardFile - Main - - - - - NSMicrophoneUsageDescription - Need microphone access for recording speech - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/WenetDemo/WenetDemo/SceneDelegate.swift b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/WenetDemo/WenetDemo/SceneDelegate.swift deleted file mode 100644 index 1c61b7853fce34cb4bd1cea152fb3013ef47015c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/WenetDemo/WenetDemo/SceneDelegate.swift +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2022 Dan Ma (1067837450@qq.com) -// -// SceneDelegate.swift -// WenetDemo -// - -import UIKit - -class SceneDelegate: UIResponder, UIWindowSceneDelegate { - - var window: UIWindow? 
- - - func scene(_ scene: UIScene, willConnectTo session: UISceneSession, - options connectionOptions: UIScene.ConnectionOptions) { - // Use this method to optionally configure and attach the UIWindow - // `window` to the provided UIWindowScene `scene`. - // If using a storyboard, the `window` property will - // automatically be initialized and attached to the scene. - // This delegate does not imply the connecting scene or session - // are new (see - // `application:configurationForConnectingSceneSession` instead). - guard let _ = (scene as? UIWindowScene) else { return } - } - - func sceneDidDisconnect(_ scene: UIScene) { - // Called as the scene is being released by the system. - // This occurs shortly after the scene enters the background, or when - // its session is discarded. - // Release any resources associated with this scene that can be - // re-created the next time the scene connects. - // The scene may re-connect later, as its session was not necessarily - // discarded (see `application:didDiscardSceneSessions` instead). - } - - func sceneDidBecomeActive(_ scene: UIScene) { - // Called when the scene has moved from an inactive state - // to an active state. - // Use this method to restart any tasks that were - // paused (or not yet started) when the scene was inactive. - } - - func sceneWillResignActive(_ scene: UIScene) { - // Called when the scene will move from an active state to - // an inactive state. - // This may occur due to temporary interruptions - // (ex. an incoming phone call). - } - - func sceneWillEnterForeground(_ scene: UIScene) { - // Called as the scene transitions from the background - // to the foreground. - // Use this method to undo the changes made on - // entering the background. - } - - func sceneDidEnterBackground(_ scene: UIScene) { - // Called as the scene transitions from the foreground to - // the background. - // Use this method to save data, release shared resources, - // and store enough scene-specific state information - // to restore the scene back to its current state. - } - - -} - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/WenetDemo/WenetDemo/ViewController.swift b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/WenetDemo/WenetDemo/ViewController.swift deleted file mode 100644 index 707073a2d271c22faa6e4aa604148079e9b458e9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/WenetDemo/WenetDemo/ViewController.swift +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (c) 2022 Dan Ma (1067837450@qq.com) -// -// ViewController.swift -// WenetDemo -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -import UIKit -import AVFoundation - -class ViewController: UIViewController { - - @IBOutlet weak var label: UILabel! - @IBOutlet weak var button: UIButton! - - var wenetModel: Wenet? - var audioEngine: AVAudioEngine? - var startRecord: Bool? - private var workItem: DispatchWorkItem? 
- - override func viewDidLoad() { - super.viewDidLoad() - // Do any additional setup after loading the view. - - initModel() - - initRecorder() - } - - func initModel() { - let modelPath = Bundle.main.path(forResource: "final", ofType: "zip") - let dictPath = Bundle.main.path(forResource: "units", ofType: "txt") - wenetModel = Wenet(modelPath:modelPath, dictPath:dictPath)! - - wenetModel?.reset() - } - - func initRecorder() { - startRecord = false - - audioEngine = AVAudioEngine() - let inputNode = self.audioEngine?.inputNode - let bus = 0 - let inputFormat = inputNode?.outputFormat(forBus: bus) - let outputFormat = AVAudioFormat(commonFormat: .pcmFormatFloat32, - sampleRate: 16000, channels: 1, - interleaved: false)! - let converter = AVAudioConverter(from: inputFormat!, to: outputFormat)! - inputNode!.installTap(onBus: bus, - bufferSize: 1024, - format: inputFormat) { - (buffer: AVAudioPCMBuffer, when: AVAudioTime) in - var newBufferAvailable = true - - let inputCallback: AVAudioConverterInputBlock = { - inNumPackets, outStatus in - if newBufferAvailable { - outStatus.pointee = .haveData - newBufferAvailable = false - - return buffer - } else { - outStatus.pointee = .noDataNow - return nil - } - } - - let convertedBuffer = AVAudioPCMBuffer( - pcmFormat: outputFormat, - frameCapacity: - AVAudioFrameCount(outputFormat.sampleRate) - * buffer.frameLength - / AVAudioFrameCount(buffer.format.sampleRate))! - - var error: NSError? - let status = converter.convert( - to: convertedBuffer, - error: &error, withInputFrom: inputCallback) - - // 16000 Hz buffer - let actualSampleCount = Int(convertedBuffer.frameLength) - guard let floatChannelData = convertedBuffer.floatChannelData - else { return } - - self.wenetModel?.acceptWaveForm(floatChannelData[0], - Int32(actualSampleCount)) - } - } - - @IBAction func btnClicked(_ sender: Any) { - if(!startRecord!) { - //Clear result - self.setResult(text: "") - - //Reset model - self.wenetModel?.reset() - - //Start record - do { - try self.audioEngine?.start() - } catch let error as NSError { - print("Got an error starting audioEngine: \(error.domain), \(error)") - } - - //Start decode thread - workItem = DispatchWorkItem { - while(!self.workItem!.isCancelled) { - self.wenetModel?.decode() - DispatchQueue.main.sync { - self.setResult(text: (self.wenetModel?.get_result())!) - } - } - } - DispatchQueue.global().async(execute: workItem!) 
- - startRecord = true - button.setTitle("Stop Record", for: UIControl.State.normal) - } else { - //Stop record - self.audioEngine?.stop() - - //Stop decode thread - workItem!.cancel() - - startRecord = false - button.setTitle("Start Record", for: UIControl.State.normal) - } - } - - @objc func setResult(text: String) { - label.text = text - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/WenetDemo/WenetDemo/model/.gitkeep b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/WenetDemo/WenetDemo/model/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/WenetDemo/WenetDemo/wenet/WenetDemo-Bridging-Header.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/WenetDemo/WenetDemo/wenet/WenetDemo-Bridging-Header.h deleted file mode 100644 index 5cec9898b6b7826f988892d28b22daf6080e96f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/WenetDemo/WenetDemo/wenet/WenetDemo-Bridging-Header.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2022 Dan Ma (1067837450@qq.com) -// -// Use this file to import your target's public headers -// that you would like to expose to Swift. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef RUNTIME_IOS_WENETDEMO_WENETDEMO_WENET_WENETDEMO_BRIDGING_HEADER_H_ -#define RUNTIME_IOS_WENETDEMO_WENETDEMO_WENET_WENETDEMO_BRIDGING_HEADER_H_ - -#import "wenet.h" - -#endif // RUNTIME_IOS_WENETDEMO_WENETDEMO_WENET_WENETDEMO_BRIDGING_HEADER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/WenetDemo/WenetDemo/wenet/wenet.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/WenetDemo/WenetDemo/wenet/wenet.h deleted file mode 100644 index 0d430e3577e2d5fa56dfab0925c9131b9197aefa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/WenetDemo/WenetDemo/wenet/wenet.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2022 Dan Ma (1067837450@qq.com) -// -// wenet.h -// WenetDemo -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef RUNTIME_IOS_WENETDEMO_WENETDEMO_WENET_WENET_H_ -#define RUNTIME_IOS_WENETDEMO_WENETDEMO_WENET_WENET_H_ - -#include - -#import - -@interface Wenet : NSObject - -- (nullable instancetype)initWithModelPath: -(NSString*)modelPath DictPath:(NSString*)dictPath; // NOLINT - -- (void)reset; - -- (void)acceptWaveForm: (float*)pcm: (int)size; // NOLINT - -- (void)decode; - -- (NSString*)get_result; // NOLINT - -@end - -#endif // RUNTIME_IOS_WENETDEMO_WENETDEMO_WENET_WENET_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/WenetDemo/WenetDemo/wenet/wenet.mm b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/WenetDemo/WenetDemo/wenet/wenet.mm deleted file mode 100644 index bab9a085ca47fa8004ec8995b144fd224d5c926d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/WenetDemo/WenetDemo/wenet/wenet.mm +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright (c) 2022 Dan Ma (1067837450@qq.com) -// -// wenet.mm -// WenetDemo -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "wenet.h" - -#define IOS - -#include "decoder/asr_decoder.h" -#include "decoder/torch_asr_model.h" -#include "frontend/feature_pipeline.h" -#include "frontend/wav.h" -#include "post_processor/post_processor.h" -#include "utils/log.h" -#include "utils/string.h" - -using namespace wenet; - -@implementation Wenet { -@protected - std::shared_ptr decode_config; - std::shared_ptr feature_config; - std::shared_ptr feature_pipeline; - std::shared_ptr decoder; - std::shared_ptr resource; - DecodeState state; - std::string total_result; -} - -- (nullable instancetype)initWithModelPath: -(NSString*)modelPath DictPath:(NSString*)dictPath { - self = [super init]; - if (self) { - try { - auto qengines = at::globalContext().supportedQEngines(); - if (std::find(qengines.begin(), qengines.end(), at::QEngine::QNNPACK) - != qengines.end()) { - at::globalContext().setQEngine(at::QEngine::QNNPACK); - } - auto model = std::make_shared(); - model->Read(modelPath.UTF8String); - resource = std::make_shared(); - resource->model = model; - resource->symbol_table = std::shared_ptr - (fst::SymbolTable::ReadText(dictPath.UTF8String)); - - PostProcessOptions post_process_opts; - resource->post_processor = - std::make_shared(post_process_opts); - - feature_config = std::make_shared(80, 16000); - feature_pipeline = std::make_shared(*feature_config); - - decode_config = std::make_shared(); - decode_config->chunk_size = 16; - decoder = std::make_shared(feature_pipeline, - resource, - *decode_config); - - state = kEndBatch; - } catch (const std::exception& exception) { - NSLog(@"%s", exception.what()); - return nil; - } - } - - return self; -} - -- (void)reset { - decoder->Reset(); - state = kEndBatch; - total_result = ""; -} - -- (void)acceptWaveForm: (float*)pcm: (int)size { - auto* float_pcm = new float[size]; - for (size_t i = 0; i < size; i++) { - float_pcm[i] = pcm[i] * 65535; - } - feature_pipeline->AcceptWaveform(float_pcm, size); -} - -- 
(void)decode { - state = decoder->Decode(); - if (state == kEndFeats || state == kEndpoint) { - decoder->Rescoring(); - } - - std::string result; - if (decoder->DecodedSomething()) { - result = decoder->result()[0].sentence; - } - - if (state == kEndFeats) { - LOG(INFO) << "wenet endfeats final result: " << result; - NSLog(@"wenet endfeats final result: %s", result.c_str()); - total_result += result; - } else if (state == kEndpoint) { - LOG(INFO) << "wenet endpoint final result: " << result; - NSLog(@"wenet endpoint final result: %s", result.c_str()); - total_result += result + ","; - decoder->ResetContinuousDecoding(); - } else { - if (decoder->DecodedSomething()) { - LOG(INFO) << "wenet partial result: " << result; - NSLog(@"wenet partial result: %s", result.c_str()); - } - } -} - -- (NSString*)get_result { - std::string result; - if (decoder->DecodedSomething()) { - result = decoder->result()[0].sentence; - } - LOG(INFO) << "wenet ui result: " << total_result + result; - NSLog(@"wenet ui result: %s", (total_result + result).c_str()); - return [NSString stringWithUTF8String:(total_result + result).c_str()]; -} - -@end diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/boost.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/boost.cmake deleted file mode 100644 index 8684c0ec43960da213da923dc57416f04301ea2b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/boost.cmake +++ /dev/null @@ -1,10 +0,0 @@ -FetchContent_Declare(boost - URL https://boostorg.jfrog.io/artifactory/main/release/1.75.0/source/boost_1_75_0.tar.gz - URL_HASH SHA256=aeb26f80e80945e82ee93e5939baebdca47b9dee80a07d3144be1e1a6a66dd6a -) -FetchContent_MakeAvailable(boost) -include_directories(${boost_SOURCE_DIR}) - -if(MSVC) - add_definitions(-DBOOST_ALL_DYN_LINK -DBOOST_ALL_NO_LIB) -endif() \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/bpu.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/bpu.cmake deleted file mode 100644 index 350d76c19d6f656fb130de09877d649cf49972a4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/bpu.cmake +++ /dev/null @@ -1,30 +0,0 @@ -if(BPU) - if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") - set(EASY_DNN_URL "https://github.com/xingchensong/toolchain_pkg/releases/download/easy_dnn/easy_dnn.0.4.11.tar.gz") - set(URL_HASH "SHA256=a1a6f77d1baae7181d75ec5d37a2ee529ac4e1c4400babd6ceb1c007392a4904") - else() - message(FATAL_ERROR "Unsupported CMake System Processor '${CMAKE_SYSTEM_PROCESSOR}' (expected 'aarch64')") - endif() - else() - message(FATAL_ERROR "Unsupported CMake System Name '${CMAKE_SYSTEM_NAME}' (expected 'Linux')") - endif() - - FetchContent_Declare(easy_dnn - URL ${EASY_DNN_URL} - URL_HASH ${URL_HASH} - ) - FetchContent_MakeAvailable(easy_dnn) - include_directories(${easy_dnn_SOURCE_DIR}/easy_dnn/0.4.11_linux_aarch64-j3_hobot_gcc6.5.0/files/easy_dnn/include) - include_directories(${easy_dnn_SOURCE_DIR}/dnn/1.7.0_linux_aarch64-j3_hobot_gcc6.5.0/files/dnn/include) - include_directories(${easy_dnn_SOURCE_DIR}/hlog/0.4.7_linux_aarch64-j3_hobot_gcc6.5.0/files/hlog/include) - link_directories(${easy_dnn_SOURCE_DIR}/easy_dnn/0.4.11_linux_aarch64-j3_hobot_gcc6.5.0/files/easy_dnn/lib) - 
link_directories(${easy_dnn_SOURCE_DIR}/dnn/1.7.0_linux_aarch64-j3_hobot_gcc6.5.0/files/dnn/lib) - link_directories(${easy_dnn_SOURCE_DIR}/hlog/0.4.7_linux_aarch64-j3_hobot_gcc6.5.0/files/hlog/lib) - - add_definitions(-DUSE_BPU) - # NOTE(xcsong): Reasons for adding flag `-fuse-ld=gold`: - # https://stackoverflow.com/questions/59915966/unknown-gcc-linker-error-but-builds-sucessfully/59916438#59916438 - # https://github.com/tensorflow/tensorflow/issues/47849 - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fuse-ld=gold") -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/gflags.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/gflags.cmake deleted file mode 100644 index 53ae5763b5a8c860b7e64d35b380eee5429f539d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/gflags.cmake +++ /dev/null @@ -1,6 +0,0 @@ -FetchContent_Declare(gflags - URL https://github.com/gflags/gflags/archive/v2.2.2.zip - URL_HASH SHA256=19713a36c9f32b33df59d1c79b4958434cb005b5b47dc5400a7a4b078111d9b5 -) -FetchContent_MakeAvailable(gflags) -include_directories(${gflags_BINARY_DIR}/include) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/glog.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/glog.cmake deleted file mode 100644 index 447ab4132f669ee2c3a52c37959dd684a39ff21b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/glog.cmake +++ /dev/null @@ -1,6 +0,0 @@ -FetchContent_Declare(glog - URL https://github.com/google/glog/archive/v0.4.0.zip - URL_HASH SHA256=9e1b54eb2782f53cd8af107ecf08d2ab64b8d0dc2b7f5594472f3bd63ca85cdc -) -FetchContent_MakeAvailable(glog) -include_directories(${glog_SOURCE_DIR}/src ${glog_BINARY_DIR}) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/grpc.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/grpc.cmake deleted file mode 100644 index 644093a4bf8191f3a45b0df0a72c000981c48f58..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/grpc.cmake +++ /dev/null @@ -1,9 +0,0 @@ -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/grpc) -# third_party: grpc -# On how to build grpc, you may refer to https://github.com/grpc/grpc -# We recommend manually recursive clone the repo to avoid internet connection problem -FetchContent_Declare(gRPC - GIT_REPOSITORY https://github.com/grpc/grpc - GIT_TAG v1.37.1 -) -FetchContent_MakeAvailable(gRPC) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/gtest.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/gtest.cmake deleted file mode 100644 index 30dc7c1a31d8b83991841a4dc33f61ed078b532a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/gtest.cmake +++ /dev/null @@ -1,8 +0,0 @@ -FetchContent_Declare(googletest - URL https://github.com/google/googletest/archive/release-1.11.0.zip - URL_HASH SHA256=353571c2440176ded91c2de6d6cd88ddd41401d14692ec1f99e35d013feda55a -) -if(MSVC) - set(gtest_force_shared_crt ON CACHE BOOL "Always use msvcrt.dll" FORCE) -endif() -FetchContent_MakeAvailable(googletest) \ No newline at end of file diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/libtorch.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/libtorch.cmake deleted file mode 100644 index 3cd9245b2da52f8be206d27164de5f411bff171b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/libtorch.cmake +++ /dev/null @@ -1,79 +0,0 @@ -if(TORCH) - add_definitions(-DUSE_TORCH) - if(NOT ANDROID) - if(GPU) - if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - message(FATAL_ERROR "GPU is supported only Linux, you can use CPU version") - else() - add_definitions(-DUSE_GPU) - endif() - endif() - - if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") - if(${CMAKE_BUILD_TYPE} MATCHES "Release") - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-win-shared-with-deps-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=bece54d36377990257e9d028c687c5b6759c5cfec0a0153da83cf6f0f71f648f") - else() - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-win-shared-with-deps-debug-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=3cc7ba3c3865d86f03d78c2f0878fdbed8b764359476397a5c95cf3bba0d665a") - endif() - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - if(CXX11_ABI) - if(NOT GPU) - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=d52f63577a07adb0bfd6d77c90f7da21896e94f71eb7dcd55ed7835ccb3b2b59") - else() - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu113/libtorch-cxx11-abi-shared-with-deps-1.12.0%2Bcu113.zip") - set(URL_HASH "SHA256=80f089939de20e68e3fcad4dfa72a26c8bf91b5e77b11042f671f39ebac35865") - endif() - else() - if(NOT GPU) - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=bee1b7be308792aa60fc95a4f5274d9658cb7248002d0e333d49eb81ec88430c") - else() - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.11.0%2Bcu113.zip") - set(URL_HASH "SHA256=90159ecce3ff451f3ef3f657493b6c7c96759c3b74bbd70c1695f2ea2f81e1ad") - endif() - endif() - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-macos-1.13.0.zip") - set(URL_HASH "SHA256=a8f80050b95489b4e002547910410c2c230e9f590ffab2482e19e809afe4f7aa") - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "iOS") - add_definitions(-DIOS) - else() - message(FATAL_ERROR "Unsupported System '${CMAKE_SYSTEM_NAME}' (expected 'Windows', 'Linux', 'Darwin' or 'iOS')") - endif() - - # iOS use LibTorch from pod install - if(NOT IOS) - FetchContent_Declare(libtorch - URL ${LIBTORCH_URL} - URL_HASH ${URL_HASH} - ) - FetchContent_MakeAvailable(libtorch) - find_package(Torch REQUIRED PATHS ${libtorch_SOURCE_DIR} NO_DEFAULT_PATH) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS} -DC10_USE_GLOG") - endif() - - if(MSVC) - file(GLOB TORCH_DLLS "${TORCH_INSTALL_PREFIX}/lib/*.dll") - file(COPY ${TORCH_DLLS} DESTINATION ${CMAKE_BINARY_DIR}) - endif() - else() - # Change version in runtime/android/app/build.gradle. 
- file(GLOB PYTORCH_INCLUDE_DIRS "${build_DIR}/pytorch_android*.aar/headers") - file(GLOB PYTORCH_LINK_DIRS "${build_DIR}/pytorch_android*.aar/jni/${ANDROID_ABI}") - find_library(PYTORCH_LIBRARY pytorch_jni - PATHS ${PYTORCH_LINK_DIRS} - NO_CMAKE_FIND_ROOT_PATH - ) - find_library(FBJNI_LIBRARY fbjni - PATHS ${PYTORCH_LINK_DIRS} - NO_CMAKE_FIND_ROOT_PATH - ) - include_directories( - ${PYTORCH_INCLUDE_DIRS} - ${PYTORCH_INCLUDE_DIRS}/torch/csrc/api/include - ) - endif() -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/onnx.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/onnx.cmake deleted file mode 100644 index bd55402cb2a6024620fa6ff8b5c413207041adfa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/onnx.cmake +++ /dev/null @@ -1,35 +0,0 @@ -if(ONNX) - set(ONNX_VERSION "1.12.0") - if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-win-x64-${ONNX_VERSION}.zip") - set(URL_HASH "SHA256=8b5d61204989350b7904ac277f5fbccd3e6736ddbb6ec001e412723d71c9c176") - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-linux-aarch64-${ONNX_VERSION}.tgz") - set(URL_HASH "SHA256=5820d9f343df73c63b6b2b174a1ff62575032e171c9564bcf92060f46827d0ac") - else() - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-linux-x64-${ONNX_VERSION}.tgz") - set(URL_HASH "SHA256=5d503ce8540358b59be26c675e42081be14a3e833a5301926f555451046929c5") - endif() - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-osx-x86_64-${ONNX_VERSION}.tgz") - set(URL_HASH "SHA256=09b17f712f8c6f19bb63da35d508815b443cbb473e16c6192abfaa297c02f600") - else() - message(FATAL_ERROR "Unsupported CMake System Name '${CMAKE_SYSTEM_NAME}' (expected 'Windows', 'Linux' or 'Darwin')") - endif() - - FetchContent_Declare(onnxruntime - URL ${ONNX_URL} - URL_HASH ${URL_HASH} - ) - FetchContent_MakeAvailable(onnxruntime) - include_directories(${onnxruntime_SOURCE_DIR}/include) - link_directories(${onnxruntime_SOURCE_DIR}/lib) - - if(MSVC) - file(GLOB ONNX_DLLS "${onnxruntime_SOURCE_DIR}/lib/*.dll") - file(COPY ${ONNX_DLLS} DESTINATION ${CMAKE_BINARY_DIR}/bin/${CMAKE_BUILD_TYPE}) - endif() - - add_definitions(-DUSE_ONNX) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/openfst.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/openfst.cmake deleted file mode 100644 index 490a3da6b571ec228114167fb9c0d9e9b4043bd2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/openfst.cmake +++ /dev/null @@ -1,45 +0,0 @@ -if(NOT ANDROID) - include(gflags) - # We can't build glog with gflags, unless gflags is pre-installed. - # If build glog with pre-installed gflags, there will be conflict. 
- set(WITH_GFLAGS OFF CACHE BOOL "whether build glog with gflags" FORCE) - include(glog) - - if(NOT GRAPH_TOOLS) - set(HAVE_BIN OFF CACHE BOOL "Build the fst binaries" FORCE) - set(HAVE_SCRIPT OFF CACHE BOOL "Build the fstscript" FORCE) - endif() - set(HAVE_COMPACT OFF CACHE BOOL "Build compact" FORCE) - set(HAVE_CONST OFF CACHE BOOL "Build const" FORCE) - set(HAVE_GRM OFF CACHE BOOL "Build grm" FORCE) - set(HAVE_FAR OFF CACHE BOOL "Build far" FORCE) - set(HAVE_PDT OFF CACHE BOOL "Build pdt" FORCE) - set(HAVE_MPDT OFF CACHE BOOL "Build mpdt" FORCE) - set(HAVE_LINEAR OFF CACHE BOOL "Build linear" FORCE) - set(HAVE_LOOKAHEAD OFF CACHE BOOL "Build lookahead" FORCE) - set(HAVE_NGRAM OFF CACHE BOOL "Build ngram" FORCE) - set(HAVE_SPECIAL OFF CACHE BOOL "Build special" FORCE) - - if(MSVC) - add_compile_options(/W0 /wd4244 /wd4267) - endif() - - # "OpenFST port for Windows" builds openfst with cmake for multiple platforms. - # Openfst is compiled with glog/gflags to avoid log and flag conflicts with log and flags in wenet/libtorch. - # To build openfst with gflags and glog, we comment out some vars of {flags, log}.h and flags.cc. - set(openfst_SOURCE_DIR ${fc_base}/openfst-src CACHE PATH "OpenFST source directory") - FetchContent_Declare(openfst - URL https://github.com/kkm000/openfst/archive/refs/tags/win/1.6.5.1.tar.gz - URL_HASH SHA256=02c49b559c3976a536876063369efc0e41ab374be1035918036474343877046e - PATCH_COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/patch/openfst ${openfst_SOURCE_DIR} - ) - FetchContent_MakeAvailable(openfst) - add_dependencies(fst gflags glog) - target_link_libraries(fst PUBLIC gflags_nothreads_static glog) - include_directories(${openfst_SOURCE_DIR}/src/include) -else() - set(openfst_BINARY_DIR ${build_DIR}/wenet-openfst-android-1.0.2.aar/jni) - include_directories(${openfst_BINARY_DIR}/include) - link_directories(${openfst_BINARY_DIR}/${ANDROID_ABI}) - link_libraries(log gflags_nothreads glog fst) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/pybind11.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/pybind11.cmake deleted file mode 100644 index 6bdae202c1c4d94228e5f92dab051c118dba7d3b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/pybind11.cmake +++ /dev/null @@ -1,7 +0,0 @@ -FetchContent_Declare(pybind11 - URL https://github.com/pybind/pybind11/archive/refs/tags/v2.9.2.zip - URL_HASH SHA256=d1646e6f70d8a3acb2ddd85ce1ed543b5dd579c68b8fb8e9638282af20edead8 -) -FetchContent_MakeAvailable(pybind11) - -add_subdirectory(${pybind11_SOURCE_DIR}) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/xpu.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/xpu.cmake deleted file mode 100644 index 38418671b0237550cd01d4d95e8743067e113e56..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/cmake/xpu.cmake +++ /dev/null @@ -1,37 +0,0 @@ -if(NOT WIN32) - string(ASCII 27 Esc) - set(ColourReset "${Esc}[m") - set(ColourBold "${Esc}[1m") - set(Red "${Esc}[31m") - set(Green "${Esc}[32m") - set(Yellow "${Esc}[33m") - set(Blue "${Esc}[34m") - set(Magenta "${Esc}[35m") - set(Cyan "${Esc}[36m") - set(White "${Esc}[37m") - set(BoldRed "${Esc}[1;31m") - set(BoldGreen "${Esc}[1;32m") - set(BoldYellow "${Esc}[1;33m") - set(BoldBlue "${Esc}[1;34m") - set(BoldMagenta "${Esc}[1;35m") - 
set(BoldCyan "${Esc}[1;36m") - set(BoldWhite "${Esc}[1;37m") -endif() - -if(XPU) - set(RUNTIME_KUNLUN_PATH ${CMAKE_CURRENT_SOURCE_DIR}) - message(STATUS "RUNTIME_KUNLUN_PATH is ${RUNTIME_KUNLUN_PATH} .\n") - set(KUNLUN_XPU_PATH ${RUNTIME_KUNLUN_PATH}/xpu) - if(NOT DEFINED ENV{XPU_API_PATH}) - message(FATAL_ERROR "${BoldRed}NO ENV{XPU_API_PATH} in your env. Please set XPU_API_PATH.${ColourReset}\n") - else() - set(XPU_API_PATH $ENV{XPU_API_PATH}) - message("set XPU_API_PATH from env_var. Val is $ENV{XPU_API_PATH}.") - endif() - - include_directories(${RUNTIME_KUNLUN_PATH} ${KUNLUN_XPU_PATH}/ - ${XPU_API_PATH}/output/include ${XPU_API_PATH}/../runtime/include) - link_directories(${XPU_API_PATH}/output/so/ ${XPU_API_PATH}/../runtime/output/so/) - - add_definitions(-DUSE_XPU) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/CMakeLists.txt deleted file mode 100644 index fe03efb288eb1c7ae3d05e896e95855e5865472f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/CMakeLists.txt +++ /dev/null @@ -1,39 +0,0 @@ -set(decoder_srcs - asr_decoder.cc - asr_model.cc - context_graph.cc - ctc_prefix_beam_search.cc - ctc_wfst_beam_search.cc - ctc_endpoint.cc -) - -if(NOT TORCH AND NOT ONNX AND NOT XPU AND NOT IOS AND NOT BPU) - message(FATAL_ERROR "Please build with TORCH or ONNX or XPU or IOS or BPU!!!") -endif() -if(TORCH OR IOS) - list(APPEND decoder_srcs torch_asr_model.cc) -endif() -if(ONNX) - list(APPEND decoder_srcs onnx_asr_model.cc) -endif() - -add_library(decoder STATIC ${decoder_srcs}) -target_link_libraries(decoder PUBLIC kaldi-decoder frontend - post_processor utils) - -if(ANDROID) - target_link_libraries(decoder PUBLIC ${PYTORCH_LIBRARY} ${FBJNI_LIBRARY}) -else() - if(TORCH) - target_link_libraries(decoder PUBLIC ${TORCH_LIBRARIES}) - endif() - if(ONNX) - target_link_libraries(decoder PUBLIC onnxruntime) - endif() - if(BPU) - target_link_libraries(decoder PUBLIC bpu_asr_model) - endif() - if(XPU) - target_link_libraries(decoder PUBLIC xpu_conformer) - endif() -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/asr_decoder.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/asr_decoder.cc deleted file mode 100644 index 34de7550ea287b37d2cb707e148f5d6853b3d804..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/asr_decoder.cc +++ /dev/null @@ -1,231 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include "decoder/asr_decoder.h" - -#include - -#include -#include -#include - -#include "utils/timer.h" - -namespace wenet { - -AsrDecoder::AsrDecoder(std::shared_ptr feature_pipeline, - std::shared_ptr resource, - const DecodeOptions& opts) - : feature_pipeline_(std::move(feature_pipeline)), - // Make a copy of the model ASR model since we will change the inner - // status of the model - model_(resource->model->Copy()), - post_processor_(resource->post_processor), - symbol_table_(resource->symbol_table), - fst_(resource->fst), - unit_table_(resource->unit_table), - opts_(opts), - ctc_endpointer_(new CtcEndpoint(opts.ctc_endpoint_config)) { - if (opts_.reverse_weight > 0) { - // Check if model has a right to left decoder - CHECK(model_->is_bidirectional_decoder()); - } - if (nullptr == fst_) { - searcher_.reset(new CtcPrefixBeamSearch(opts.ctc_prefix_search_opts, - resource->context_graph)); - } else { - searcher_.reset(new CtcWfstBeamSearch(*fst_, opts.ctc_wfst_search_opts, - resource->context_graph)); - } - ctc_endpointer_->frame_shift_in_ms(frame_shift_in_ms()); -} - -void AsrDecoder::Reset() { - start_ = false; - result_.clear(); - num_frames_ = 0; - global_frame_offset_ = 0; - model_->Reset(); - searcher_->Reset(); - feature_pipeline_->Reset(); - ctc_endpointer_->Reset(); -} - -void AsrDecoder::ResetContinuousDecoding() { - global_frame_offset_ = num_frames_; - start_ = false; - result_.clear(); - model_->Reset(); - searcher_->Reset(); - ctc_endpointer_->Reset(); -} - -DecodeState AsrDecoder::Decode(bool block) { - return this->AdvanceDecoding(block); -} - -void AsrDecoder::Rescoring() { - // Do attention rescoring - Timer timer; - AttentionRescoring(); - VLOG(2) << "Rescoring cost latency: " << timer.Elapsed() << "ms."; -} - -DecodeState AsrDecoder::AdvanceDecoding(bool block) { - DecodeState state = DecodeState::kEndBatch; - model_->set_chunk_size(opts_.chunk_size); - model_->set_num_left_chunks(opts_.num_left_chunks); - int num_required_frames = model_->num_frames_for_chunk(start_); - std::vector> chunk_feats; - // Return immediately if we do not want to block - if (!block && !feature_pipeline_->input_finished() && - feature_pipeline_->NumQueuedFrames() < num_required_frames) { - return DecodeState::kWaitFeats; - } - // If not okay, that means we reach the end of the input - if (!feature_pipeline_->Read(num_required_frames, &chunk_feats)) { - state = DecodeState::kEndFeats; - } - - num_frames_ += chunk_feats.size(); - VLOG(2) << "Required " << num_required_frames << " get " - << chunk_feats.size(); - Timer timer; - std::vector> ctc_log_probs; - model_->ForwardEncoder(chunk_feats, &ctc_log_probs); - int forward_time = timer.Elapsed(); - if (opts_.ctc_wfst_search_opts.blank_scale != 1.0) { - for (int i = 0; i < ctc_log_probs.size(); i++) { - ctc_log_probs[i][0] = ctc_log_probs[i][0] - + std::log(opts_.ctc_wfst_search_opts.blank_scale); - } - } - timer.Reset(); - searcher_->Search(ctc_log_probs); - int search_time = timer.Elapsed(); - VLOG(3) << "forward takes " << forward_time << " ms, search takes " - << search_time << " ms"; - UpdateResult(); - - if (state != DecodeState::kEndFeats) { - if (ctc_endpointer_->IsEndpoint(ctc_log_probs, DecodedSomething())) { - VLOG(1) << "Endpoint is detected at " << num_frames_; - state = DecodeState::kEndpoint; - } - } - - start_ = true; - return state; -} - -void AsrDecoder::UpdateResult(bool finish) { - const auto& hypotheses = searcher_->Outputs(); - const auto& inputs = searcher_->Inputs(); - const auto& likelihood = 
searcher_->Likelihood(); - const auto& times = searcher_->Times(); - result_.clear(); - - CHECK_EQ(hypotheses.size(), likelihood.size()); - for (size_t i = 0; i < hypotheses.size(); i++) { - const std::vector& hypothesis = hypotheses[i]; - - DecodeResult path; - path.score = likelihood[i]; - int offset = global_frame_offset_ * feature_frame_shift_in_ms(); - for (size_t j = 0; j < hypothesis.size(); j++) { - std::string word = symbol_table_->Find(hypothesis[j]); - // A detailed explanation of this if-else branch can be found in - // https://github.com/wenet-e2e/wenet/issues/583#issuecomment-907994058 - if (searcher_->Type() == kWfstBeamSearch) { - path.sentence += (' ' + word); - } else { - path.sentence += (word); - } - } - - // TimeStamp is only supported in final result - // TimeStamp of the output of CtcWfstBeamSearch may be inaccurate due to - // various FST operations when building the decoding graph. So here we use - // time stamp of the input(e2e model unit), which is more accurate, and it - // requires the symbol table of the e2e model used in training. - if (unit_table_ != nullptr && finish) { - const std::vector& input = inputs[i]; - const std::vector& time_stamp = times[i]; - CHECK_EQ(input.size(), time_stamp.size()); - for (size_t j = 0; j < input.size(); j++) { - std::string word = unit_table_->Find(input[j]); - int start = time_stamp[j] * frame_shift_in_ms() - time_stamp_gap_ > 0 - ? time_stamp[j] * frame_shift_in_ms() - time_stamp_gap_ - : 0; - if (j > 0) { - start = (time_stamp[j] - time_stamp[j - 1]) * frame_shift_in_ms() < - time_stamp_gap_ - ? (time_stamp[j - 1] + time_stamp[j]) / 2 * - frame_shift_in_ms() - : start; - } - int end = time_stamp[j] * frame_shift_in_ms(); - if (j < input.size() - 1) { - end = (time_stamp[j + 1] - time_stamp[j]) * frame_shift_in_ms() < - time_stamp_gap_ - ? 
(time_stamp[j + 1] + time_stamp[j]) / 2 * - frame_shift_in_ms() - : end; - } - WordPiece word_piece(word, offset + start, offset + end); - path.word_pieces.emplace_back(word_piece); - } - } - - if (post_processor_ != nullptr) { - path.sentence = post_processor_->Process(path.sentence, finish); - } - result_.emplace_back(path); - } - - if (DecodedSomething()) { - VLOG(1) << "Partial CTC result " << result_[0].sentence; - } -} - -void AsrDecoder::AttentionRescoring() { - searcher_->FinalizeSearch(); - UpdateResult(true); - // No need to do rescoring - if (0.0 == opts_.rescoring_weight) { - return; - } - // Inputs() returns N-best input ids, which is the basic unit for rescoring - // In CtcPrefixBeamSearch, inputs are the same to outputs - const auto& hypotheses = searcher_->Inputs(); - int num_hyps = hypotheses.size(); - if (num_hyps <= 0) { - return; - } - - std::vector rescoring_score; - model_->AttentionRescoring(hypotheses, opts_.reverse_weight, - &rescoring_score); - - // Combine ctc score and rescoring score - for (size_t i = 0; i < num_hyps; ++i) { - result_[i].score = opts_.rescoring_weight * rescoring_score[i] + - opts_.ctc_weight * result_[i].score; - } - std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/asr_decoder.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/asr_decoder.h deleted file mode 100644 index df71f5b7bad7b2ffdc69bbd7ab11f576bed464d2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/asr_decoder.h +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_ASR_DECODER_H_ -#define DECODER_ASR_DECODER_H_ - -#include -#include -#include -#include - -#include "fst/fstlib.h" -#include "fst/symbol-table.h" - -#include "decoder/asr_model.h" -#include "decoder/context_graph.h" -#include "decoder/ctc_endpoint.h" -#include "decoder/ctc_prefix_beam_search.h" -#include "decoder/ctc_wfst_beam_search.h" -#include "decoder/search_interface.h" -#include "frontend/feature_pipeline.h" -#include "post_processor/post_processor.h" -#include "utils/utils.h" - -namespace wenet { - -struct DecodeOptions { - // chunk_size is the frame number of one chunk after subsampling. - // e.g. if subsample rate is 4 and chunk_size = 16, the frames in - // one chunk are 64 = 16*4 - int chunk_size = 16; - int num_left_chunks = -1; - - // final_score = rescoring_weight * rescoring_score + ctc_weight * ctc_score; - // rescoring_score = left_to_right_score * (1 - reverse_weight) + - // right_to_left_score * reverse_weight - // Please note the concept of ctc_scores in the following two search - // methods are different. 
- // For CtcPrefixBeamSearch, it's a sum(prefix) score + context score - // For CtcWfstBeamSearch, it's a max(viterbi) path score + context score - // So we should carefully set ctc_weight according to the search methods. - float ctc_weight = 0.5; - float rescoring_weight = 1.0; - float reverse_weight = 0.0; - CtcEndpointConfig ctc_endpoint_config; - CtcPrefixBeamSearchOptions ctc_prefix_search_opts; - CtcWfstBeamSearchOptions ctc_wfst_search_opts; -}; - -struct WordPiece { - std::string word; - int start = -1; - int end = -1; - - WordPiece(std::string word, int start, int end) - : word(std::move(word)), start(start), end(end) {} -}; - -struct DecodeResult { - float score = -kFloatMax; - std::string sentence; - std::vector word_pieces; - - static bool CompareFunc(const DecodeResult& a, const DecodeResult& b) { - return a.score > b.score; - } -}; - -enum DecodeState { - kEndBatch = 0x00, // End of current decoding batch, normal case - kEndpoint = 0x01, // Endpoint is detected - kEndFeats = 0x02, // All feature is decoded - kWaitFeats = 0x03 // Feat is not enough for one chunk inference, wait -}; - -// DecodeResource is thread safe, which can be shared for multiple -// decoding threads -struct DecodeResource { - std::shared_ptr model = nullptr; - std::shared_ptr symbol_table = nullptr; - std::shared_ptr> fst = nullptr; - std::shared_ptr unit_table = nullptr; - std::shared_ptr context_graph = nullptr; - std::shared_ptr post_processor = nullptr; -}; - -// Torch ASR decoder -class AsrDecoder { - public: - AsrDecoder(std::shared_ptr feature_pipeline, - std::shared_ptr resource, - const DecodeOptions& opts); - // @param block: if true, block when feature is not enough for one chunk - // inference. Otherwise, return kWaitFeats. - DecodeState Decode(bool block = true); - void Rescoring(); - void Reset(); - void ResetContinuousDecoding(); - bool DecodedSomething() const { - return !result_.empty() && !result_[0].sentence.empty(); - } - - // This method is used for time benchmark - int num_frames_in_current_chunk() const { - return num_frames_in_current_chunk_; - } - int frame_shift_in_ms() const { - return model_->subsampling_rate() * - feature_pipeline_->config().frame_shift * 1000 / - feature_pipeline_->config().sample_rate; - } - int feature_frame_shift_in_ms() const { - return feature_pipeline_->config().frame_shift * 1000 / - feature_pipeline_->config().sample_rate; - } - const std::vector& result() const { return result_; } - - private: - DecodeState AdvanceDecoding(bool block = true); - void AttentionRescoring(); - - void UpdateResult(bool finish = false); - - std::shared_ptr feature_pipeline_; - std::shared_ptr model_; - std::shared_ptr post_processor_; - - std::shared_ptr> fst_ = nullptr; - // output symbol table - std::shared_ptr symbol_table_; - // e2e unit symbol table - std::shared_ptr unit_table_ = nullptr; - const DecodeOptions& opts_; - // cache feature - bool start_ = false; - // For continuous decoding - int num_frames_ = 0; - int global_frame_offset_ = 0; - const int time_stamp_gap_ = 100; // timestamp gap between words in a sentence - - std::unique_ptr searcher_; - std::unique_ptr ctc_endpointer_; - - int num_frames_in_current_chunk_ = 0; - std::vector result_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(AsrDecoder); -}; - -} // namespace wenet - -#endif // DECODER_ASR_DECODER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/asr_model.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/asr_model.cc deleted file 
mode 100644 index 8c7b0fb1195cf07bac6c3ff1bb8cb0e187e977da..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/asr_model.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2022 Horizon Robotics. All Rights Reserved. -// Author: binbin.zhang@horizon.ai (Binbin Zhang) - -#include "decoder/asr_model.h" - -#include -#include - -namespace wenet { - -int AsrModel::num_frames_for_chunk(bool start) const { - int num_required_frames = 0; - if (chunk_size_ > 0) { - if (!start) { // First batch - int context = right_context_ + 1; // Add current frame - num_required_frames = (chunk_size_ - 1) * subsampling_rate_ + context; - } else { - num_required_frames = chunk_size_ * subsampling_rate_; - } - } else { - num_required_frames = std::numeric_limits::max(); - } - return num_required_frames; -} - -void AsrModel::CacheFeature( - const std::vector>& chunk_feats) { - // Cache feature for next chunk - const int cached_feature_size = 1 + right_context_ - subsampling_rate_; - if (chunk_feats.size() >= cached_feature_size) { - // TODO(Binbin Zhang): Only deal the case when - // chunk_feats.size() > cached_feature_size here, and it's consistent - // with our current model, refine it later if we have new model or - // new requirements - cached_feature_.resize(cached_feature_size); - for (int i = 0; i < cached_feature_size; ++i) { - cached_feature_[i] = - chunk_feats[chunk_feats.size() - cached_feature_size + i]; - } - } -} - -void AsrModel::ForwardEncoder( - const std::vector>& chunk_feats, - std::vector>* ctc_prob) { - ctc_prob->clear(); - int num_frames = cached_feature_.size() + chunk_feats.size(); - if (num_frames >= right_context_ + 1) { - this->ForwardEncoderFunc(chunk_feats, ctc_prob); - this->CacheFeature(chunk_feats); - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/asr_model.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/asr_model.h deleted file mode 100644 index d100dd818551014fa4769c1766bc3b1b626e8453..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/asr_model.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2022 Horizon Robotics. All Rights Reserved. -// Author: binbin.zhang@horizon.ai (Binbin Zhang) - -#ifndef DECODER_ASR_MODEL_H_ -#define DECODER_ASR_MODEL_H_ - -#include -#include -#include -#include - -#include "utils/timer.h" -#include "utils/utils.h" - -namespace wenet { - -class AsrModel { - public: - virtual int right_context() const { return right_context_; } - virtual int subsampling_rate() const { return subsampling_rate_; } - virtual int sos() const { return sos_; } - virtual int eos() const { return eos_; } - virtual bool is_bidirectional_decoder() const { - return is_bidirectional_decoder_; - } - virtual int offset() const { return offset_; } - - // If chunk_size > 0, streaming case. 
Otherwise, none streaming case - virtual void set_chunk_size(int chunk_size) { chunk_size_ = chunk_size; } - virtual void set_num_left_chunks(int num_left_chunks) { - num_left_chunks_ = num_left_chunks; - } - // start: if it is the start chunk of one sentence - virtual int num_frames_for_chunk(bool start) const; - - virtual void Reset() = 0; - - virtual void ForwardEncoder( - const std::vector>& chunk_feats, - std::vector>* ctc_prob); - - virtual void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) = 0; - - virtual std::shared_ptr Copy() const = 0; - - protected: - virtual void ForwardEncoderFunc( - const std::vector>& chunk_feats, - std::vector>* ctc_prob) = 0; - virtual void CacheFeature(const std::vector>& chunk_feats); - - int right_context_ = 1; - int subsampling_rate_ = 1; - int sos_ = 0; - int eos_ = 0; - bool is_bidirectional_decoder_ = false; - int chunk_size_ = 16; - int num_left_chunks_ = -1; // -1 means all left chunks - int offset_ = 0; - - std::vector> cached_feature_; -}; - -} // namespace wenet - -#endif // DECODER_ASR_MODEL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/context_graph.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/context_graph.cc deleted file mode 100644 index adc59c506de2afa7087815887295e4d8735d2a35..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/context_graph.cc +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "decoder/context_graph.h" - -#include - -#include "fst/determinize.h" - -#include "utils/string.h" -#include "utils/utils.h" - -namespace wenet { - -ContextGraph::ContextGraph(ContextConfig config) : config_(config) {} - -void ContextGraph::BuildContextGraph( - const std::vector& query_contexts, - const std::shared_ptr& symbol_table) { - CHECK(symbol_table != nullptr) << "Symbols table should not be nullptr!"; - start_tag_id_ = symbol_table->AddSymbol(""); - end_tag_id_ = symbol_table->AddSymbol(""); - symbol_table_ = symbol_table; - if (query_contexts.empty()) { - if (graph_ != nullptr) graph_.reset(); - return; - } - - std::unique_ptr ofst(new fst::StdVectorFst()); - // State 0 is the start state and the final state. - int start_state = ofst->AddState(); - ofst->SetStart(start_state); - ofst->SetFinal(start_state, fst::StdArc::Weight::One()); - - LOG(INFO) << "Contexts count size: " << query_contexts.size(); - int count = 0; - for (const auto& context : query_contexts) { - if (context.size() > config_.max_context_length) { - LOG(INFO) << "Skip long context: " << context; - continue; - } - if (++count > config_.max_contexts) break; - - std::vector words; - // Split context to words by symbol table, and build the context graph. 
- bool no_oov = SplitUTF8StringToWords(Trim(context), symbol_table, &words); - if (!no_oov) { - LOG(WARNING) << "Ignore unknown word found during compilation."; - continue; - } - - int prev_state = start_state; - int next_state = start_state; - float escape_score = 0; - for (size_t i = 0; i < words.size(); ++i) { - int word_id = symbol_table_->Find(words[i]); - float score = (i * config_.incremental_context_score - + config_.context_score) * UTF8StringLength(words[i]); - next_state = (i < words.size() - 1) ? ofst->AddState() : start_state; - ofst->AddArc(prev_state, - fst::StdArc(word_id, word_id, score, next_state)); - // Add escape arc to clean the previous context score. - if (i > 0) { - // ilabel and olabel of the escape arc is 0 (). - ofst->AddArc(prev_state, fst::StdArc(0, 0, -escape_score, start_state)); - } - prev_state = next_state; - escape_score += score; - } - } - std::unique_ptr det_fst(new fst::StdVectorFst()); - fst::Determinize(*ofst, det_fst.get()); - graph_ = std::move(det_fst); -} - -int ContextGraph::GetNextState(int cur_state, int word_id, float* score, - bool* is_start_boundary, bool* is_end_boundary) { - int next_state = 0; - for (fst::ArcIterator aiter(*graph_, cur_state); !aiter.Done(); - aiter.Next()) { - const fst::StdArc& arc = aiter.Value(); - if (arc.ilabel == 0) { - // escape score, will be overwritten when ilabel equals to word id. - *score = arc.weight.Value(); - } else if (arc.ilabel == word_id) { - next_state = arc.nextstate; - *score = arc.weight.Value(); - if (cur_state == 0) { - *is_start_boundary = true; - } - if (graph_->Final(arc.nextstate) == fst::StdArc::Weight::One()) { - *is_end_boundary = true; - } - break; - } - } - return next_state; -} - -bool ContextGraph::SplitUTF8StringToWords( - const std::string& str, - const std::shared_ptr& symbol_table, - std::vector* words) { - std::vector chars; - SplitUTF8StringToChars(Trim(str), &chars); - - bool no_oov = true; - for (size_t start = 0; start < chars.size();) { - for (size_t end = chars.size(); end > start; --end) { - std::string word; - for (size_t i = start; i < end; i++) { - word += chars[i]; - } - // Skip space. - if (word == " ") { - start = end; - continue; - } - // Add '▁' at the beginning of English word. - if (IsAlpha(word)) { - word = kSpaceSymbol + word; - } - - if (symbol_table->Find(word) != -1) { - words->emplace_back(word); - start = end; - continue; - } - if (end == start + 1) { - ++start; - no_oov = false; - LOG(WARNING) << word << " is oov."; - } - } - } - return no_oov; -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/context_graph.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/context_graph.h deleted file mode 100644 index 41b59206987cfe22d421f40506057830b6311f8e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/context_graph.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_CONTEXT_GRAPH_H_ -#define DECODER_CONTEXT_GRAPH_H_ - -#include -#include -#include - -#include "fst/compose.h" -#include "fst/fst.h" -#include "fst/vector-fst.h" - -namespace wenet { - -using StateId = fst::StdArc::StateId; - -struct ContextConfig { - int max_contexts = 5000; - int max_context_length = 100; - float context_score = 3.0; - float incremental_context_score = 0.0; -}; - -class ContextGraph { - public: - explicit ContextGraph(ContextConfig config); - void BuildContextGraph(const std::vector& query_context, - const std::shared_ptr& symbol_table); - int GetNextState(int cur_state, int word_id, float* score, - bool* is_start_boundary, bool* is_end_boundary); - - int start_tag_id() { return start_tag_id_; } - int end_tag_id() { return end_tag_id_; } - - private: - bool SplitUTF8StringToWords( - const std::string& str, - const std::shared_ptr& symbol_table, - std::vector* words); - - int start_tag_id_ = -1; - int end_tag_id_ = -1; - ContextConfig config_; - std::shared_ptr symbol_table_ = nullptr; - std::unique_ptr graph_ = nullptr; - DISALLOW_COPY_AND_ASSIGN(ContextGraph); -}; - -} // namespace wenet - -#endif // DECODER_CONTEXT_GRAPH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/ctc_endpoint.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/ctc_endpoint.cc deleted file mode 100644 index 4a64dd048f32401ab0dca468836cfac8be943d26..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/ctc_endpoint.cc +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "decoder/ctc_endpoint.h" - -#include - -#include -#include - -#include "utils/log.h" - -namespace wenet { - -CtcEndpoint::CtcEndpoint(const CtcEndpointConfig& config) : config_(config) { - Reset(); -} - -void CtcEndpoint::Reset() { - num_frames_decoded_ = 0; - num_frames_trailing_blank_ = 0; -} - -static bool RuleActivated(const CtcEndpointRule& rule, - const std::string& rule_name, bool decoded_sth, - int trailing_silence, int utterance_length) { - bool ans = (decoded_sth || !rule.must_decoded_sth) && - trailing_silence >= rule.min_trailing_silence && - utterance_length >= rule.min_utterance_length; - if (ans) { - VLOG(2) << "Endpointing rule " << rule_name - << " activated: " << (decoded_sth ? 
"true" : "false") << ',' - << trailing_silence << ',' << utterance_length; - } - return ans; -} - -bool CtcEndpoint::IsEndpoint( - const std::vector>& ctc_log_probs, - bool decoded_something) { - for (int t = 0; t < ctc_log_probs.size(); ++t) { - const auto& logp_t = ctc_log_probs[t]; - float blank_prob = expf(logp_t[config_.blank]); - - num_frames_decoded_++; - if (blank_prob > config_.blank_threshold) { - num_frames_trailing_blank_++; - } else { - num_frames_trailing_blank_ = 0; - } - } - CHECK_GE(num_frames_decoded_, num_frames_trailing_blank_); - CHECK_GT(frame_shift_in_ms_, 0); - int utterance_length = num_frames_decoded_ * frame_shift_in_ms_; - int trailing_silence = num_frames_trailing_blank_ * frame_shift_in_ms_; - if (RuleActivated(config_.rule1, "rule1", decoded_something, trailing_silence, - utterance_length)) - return true; - if (RuleActivated(config_.rule2, "rule2", decoded_something, trailing_silence, - utterance_length)) - return true; - if (RuleActivated(config_.rule3, "rule3", decoded_something, trailing_silence, - utterance_length)) - return true; - return false; -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/ctc_endpoint.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/ctc_endpoint.h deleted file mode 100644 index 56d9e08e7d3fab5562028e956f7b1d6ebac7b9e4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/ctc_endpoint.h +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_CTC_ENDPOINT_H_ -#define DECODER_CTC_ENDPOINT_H_ - -#include - -namespace wenet { - -struct CtcEndpointRule { - bool must_decoded_sth; - int min_trailing_silence; - int min_utterance_length; - - CtcEndpointRule(bool must_decoded_sth = true, int min_trailing_silence = 1000, - int min_utterance_length = 0) - : must_decoded_sth(must_decoded_sth), - min_trailing_silence(min_trailing_silence), - min_utterance_length(min_utterance_length) {} -}; - -struct CtcEndpointConfig { - /// We consider blank as silence for purposes of endpointing. - int blank = 0; // blank id - float blank_threshold = 0.8; // blank threshold to be silence - /// We support three rules. We terminate decoding if ANY of these rules - /// evaluates to "true". If you want to add more rules, do it by changing this - /// code. If you want to disable a rule, you can set the silence-timeout for - /// that rule to a very large number. - - /// rule1 times out after 5000 ms of silence, even if we decoded nothing. - CtcEndpointRule rule1; - /// rule2 times out after 1000 ms of silence after decoding something. - CtcEndpointRule rule2; - /// rule3 times out after the utterance is 20000 ms long, regardless of - /// anything else. 
- CtcEndpointRule rule3; - - CtcEndpointConfig() - : rule1(false, 5000, 0), rule2(true, 1000, 0), rule3(false, 0, 20000) {} -}; - -class CtcEndpoint { - public: - explicit CtcEndpoint(const CtcEndpointConfig& config); - - void Reset(); - /// This function returns true if this set of endpointing rules thinks we - /// should terminate decoding. - bool IsEndpoint(const std::vector>& ctc_log_probs, - bool decoded_something); - - void frame_shift_in_ms(int frame_shift_in_ms) { - frame_shift_in_ms_ = frame_shift_in_ms; - } - - private: - CtcEndpointConfig config_; - int frame_shift_in_ms_ = -1; - int num_frames_decoded_ = 0; - int num_frames_trailing_blank_ = 0; -}; - -} // namespace wenet - -#endif // DECODER_CTC_ENDPOINT_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/ctc_prefix_beam_search.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/ctc_prefix_beam_search.cc deleted file mode 100644 index 154c8864ba98255528a33a80a35b18eee8fa5dc9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/ctc_prefix_beam_search.cc +++ /dev/null @@ -1,235 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
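
> Aside (not part of the patch above): the three default endpoint rules documented in the deleted `ctc_endpoint.h` all share the activation condition used by `RuleActivated`. A minimal, self-contained sketch of how they interact is below; the 40 ms frame shift and the frame counts are assumed example values, not taken from the source.

```cpp
// Sketch of the default CTC endpoint rules and their shared activation check.
#include <iostream>

struct Rule {
  bool must_decoded_sth;
  int min_trailing_silence_ms;
  int min_utterance_length_ms;
};

// Same condition as RuleActivated in the deleted ctc_endpoint.cc.
bool Activated(const Rule& r, bool decoded_sth, int trailing_silence_ms,
               int utterance_length_ms) {
  return (decoded_sth || !r.must_decoded_sth) &&
         trailing_silence_ms >= r.min_trailing_silence_ms &&
         utterance_length_ms >= r.min_utterance_length_ms;
}

int main() {
  Rule rule1{false, 5000, 0};   // long silence, even if nothing was decoded
  Rule rule2{true, 1000, 0};    // shorter silence once something was decoded
  Rule rule3{false, 0, 20000};  // hard cap on utterance length

  // Assumed example: 120 decoded frames, 30 trailing blank frames, 40 ms/frame.
  const int frame_shift_ms = 40;
  const int utterance_ms = 120 * frame_shift_ms;        // 4800 ms
  const int trailing_silence_ms = 30 * frame_shift_ms;  // 1200 ms
  const bool decoded_something = true;

  // Decoding terminates if ANY rule fires; here rule2 does.
  bool endpoint =
      Activated(rule1, decoded_something, trailing_silence_ms, utterance_ms) ||
      Activated(rule2, decoded_something, trailing_silence_ms, utterance_ms) ||
      Activated(rule3, decoded_something, trailing_silence_ms, utterance_ms);
  std::cout << "endpoint detected: " << std::boolalpha << endpoint << "\n";
  return 0;
}
```
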
- - -#include "decoder/ctc_prefix_beam_search.h" - -#include -#include -#include -#include - -#include "utils/log.h" -#include "utils/utils.h" - -namespace wenet { - -CtcPrefixBeamSearch::CtcPrefixBeamSearch( - const CtcPrefixBeamSearchOptions& opts, - const std::shared_ptr& context_graph) - : opts_(opts), context_graph_(context_graph) { - Reset(); -} - -void CtcPrefixBeamSearch::Reset() { - hypotheses_.clear(); - likelihood_.clear(); - cur_hyps_.clear(); - viterbi_likelihood_.clear(); - times_.clear(); - outputs_.clear(); - abs_time_step_ = 0; - PrefixScore prefix_score; - prefix_score.s = 0.0; - prefix_score.ns = -kFloatMax; - prefix_score.v_s = 0.0; - prefix_score.v_ns = 0.0; - std::vector empty; - cur_hyps_[empty] = prefix_score; - outputs_.emplace_back(empty); - hypotheses_.emplace_back(empty); - likelihood_.emplace_back(prefix_score.total_score()); - times_.emplace_back(empty); -} - -static bool PrefixScoreCompare( - const std::pair, PrefixScore>& a, - const std::pair, PrefixScore>& b) { - return a.second.total_score() > b.second.total_score(); -} - -void CtcPrefixBeamSearch::UpdateOutputs( - const std::pair, PrefixScore>& prefix) { - const std::vector& input = prefix.first; - const std::vector& start_boundaries = prefix.second.start_boundaries; - const std::vector& end_boundaries = prefix.second.end_boundaries; - - std::vector output; - int s = 0; - int e = 0; - for (int i = 0; i < input.size(); ++i) { - if (s < start_boundaries.size() && i == start_boundaries[s]) { - output.emplace_back(context_graph_->start_tag_id()); - ++s; - } - output.emplace_back(input[i]); - if (e < end_boundaries.size() && i == end_boundaries[e]) { - output.emplace_back(context_graph_->end_tag_id()); - ++e; - } - } - outputs_.emplace_back(output); -} - -void CtcPrefixBeamSearch::UpdateHypotheses( - const std::vector, PrefixScore>>& hpys) { - cur_hyps_.clear(); - outputs_.clear(); - hypotheses_.clear(); - likelihood_.clear(); - viterbi_likelihood_.clear(); - times_.clear(); - for (auto& item : hpys) { - cur_hyps_[item.first] = item.second; - UpdateOutputs(item); - hypotheses_.emplace_back(std::move(item.first)); - likelihood_.emplace_back(item.second.total_score()); - viterbi_likelihood_.emplace_back(item.second.viterbi_score()); - times_.emplace_back(item.second.times()); - } -} - -// Please refer https://robin1001.github.io/2020/12/11/ctc-search -// for how CTC prefix beam search works, and there is a simple graph demo in -// it. -void CtcPrefixBeamSearch::Search(const std::vector>& logp) { - if (logp.size() == 0) return; - int first_beam_size = - std::min(static_cast(logp[0].size()), opts_.first_beam_size); - for (int t = 0; t < logp.size(); ++t, ++abs_time_step_) { - const std::vector& logp_t = logp[t]; - std::unordered_map, PrefixScore, PrefixHash> next_hyps; - // 1. First beam prune, only select topk candidates - std::vector topk_score; - std::vector topk_index; - TopK(logp_t, first_beam_size, &topk_score, &topk_index); - - // 2. Token passing - for (int i = 0; i < topk_index.size(); ++i) { - int id = topk_index[i]; - auto prob = topk_score[i]; - for (const auto& it : cur_hyps_) { - const std::vector& prefix = it.first; - const PrefixScore& prefix_score = it.second; - // If prefix doesn't exist in next_hyps, next_hyps[prefix] will insert - // PrefixScore(-inf, -inf) by default, since the default constructor - // of PrefixScore will set fields s(blank ending score) and - // ns(none blank ending score) to -inf, respectively. 
- if (id == opts_.blank) { - // Case 0: *a + ε => *a - PrefixScore& next_score = next_hyps[prefix]; - next_score.s = LogAdd(next_score.s, prefix_score.score() + prob); - next_score.v_s = prefix_score.viterbi_score() + prob; - next_score.times_s = prefix_score.times(); - // Prefix not changed, copy the context from prefix. - if (context_graph_ && !next_score.has_context) { - next_score.CopyContext(prefix_score); - next_score.has_context = true; - } - } else if (!prefix.empty() && id == prefix.back()) { - // Case 1: *a + a => *a - PrefixScore& next_score1 = next_hyps[prefix]; - next_score1.ns = LogAdd(next_score1.ns, prefix_score.ns + prob); - if (next_score1.v_ns < prefix_score.v_ns + prob) { - next_score1.v_ns = prefix_score.v_ns + prob; - if (next_score1.cur_token_prob < prob) { - next_score1.cur_token_prob = prob; - next_score1.times_ns = prefix_score.times_ns; - CHECK_GT(next_score1.times_ns.size(), 0); - next_score1.times_ns.back() = abs_time_step_; - } - } - if (context_graph_ && !next_score1.has_context) { - next_score1.CopyContext(prefix_score); - next_score1.has_context = true; - } - - // Case 2: *aε + a => *aa - std::vector new_prefix(prefix); - new_prefix.emplace_back(id); - PrefixScore& next_score2 = next_hyps[new_prefix]; - next_score2.ns = LogAdd(next_score2.ns, prefix_score.s + prob); - if (next_score2.v_ns < prefix_score.v_s + prob) { - next_score2.v_ns = prefix_score.v_s + prob; - next_score2.cur_token_prob = prob; - next_score2.times_ns = prefix_score.times_s; - next_score2.times_ns.emplace_back(abs_time_step_); - } - if (context_graph_ && !next_score2.has_context) { - // Prefix changed, calculate the context score. - next_score2.UpdateContext(context_graph_, prefix_score, id, - prefix.size()); - next_score2.has_context = true; - } - } else { - // Case 3: *a + b => *ab, *aε + b => *ab - std::vector new_prefix(prefix); - new_prefix.emplace_back(id); - PrefixScore& next_score = next_hyps[new_prefix]; - next_score.ns = LogAdd(next_score.ns, prefix_score.score() + prob); - if (next_score.v_ns < prefix_score.viterbi_score() + prob) { - next_score.v_ns = prefix_score.viterbi_score() + prob; - next_score.cur_token_prob = prob; - next_score.times_ns = prefix_score.times(); - next_score.times_ns.emplace_back(abs_time_step_); - } - if (context_graph_ && !next_score.has_context) { - // Calculate the context score. - next_score.UpdateContext(context_graph_, prefix_score, id, - prefix.size()); - next_score.has_context = true; - } - } - } - } - - // 3. Second beam prune, only keep top n best paths - std::vector, PrefixScore>> arr(next_hyps.begin(), - next_hyps.end()); - int second_beam_size = - std::min(static_cast(arr.size()), opts_.second_beam_size); - std::nth_element(arr.begin(), arr.begin() + second_beam_size, arr.end(), - PrefixScoreCompare); - arr.resize(second_beam_size); - std::sort(arr.begin(), arr.end(), PrefixScoreCompare); - - // 4. Update cur_hyps_ and get new result - UpdateHypotheses(arr); - } -} - -void CtcPrefixBeamSearch::FinalizeSearch() { UpdateFinalContext(); } - -void CtcPrefixBeamSearch::UpdateFinalContext() { - if (context_graph_ == nullptr) return; - CHECK_EQ(hypotheses_.size(), cur_hyps_.size()); - CHECK_EQ(hypotheses_.size(), likelihood_.size()); - // We should backoff the context score/state when the context is - // not fully matched at the last time. 
- for (const auto& prefix : hypotheses_) { - PrefixScore& prefix_score = cur_hyps_[prefix]; - if (prefix_score.context_state != 0) { - prefix_score.UpdateContext(context_graph_, prefix_score, 0, - prefix.size()); - } - } - std::vector, PrefixScore>> arr(cur_hyps_.begin(), - cur_hyps_.end()); - std::sort(arr.begin(), arr.end(), PrefixScoreCompare); - - // Update cur_hyps_ and get new result - UpdateHypotheses(arr); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/ctc_prefix_beam_search.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/ctc_prefix_beam_search.h deleted file mode 100644 index f44ec23c37af517c9e45140f89ef7346768f5d35..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/ctc_prefix_beam_search.h +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_CTC_PREFIX_BEAM_SEARCH_H_ -#define DECODER_CTC_PREFIX_BEAM_SEARCH_H_ - -#include -#include -#include -#include - -#include "decoder/context_graph.h" -#include "decoder/search_interface.h" -#include "utils/utils.h" - -namespace wenet { - -struct CtcPrefixBeamSearchOptions { - int blank = 0; // blank id - int first_beam_size = 10; - int second_beam_size = 10; -}; - -struct PrefixScore { - float s = -kFloatMax; // blank ending score - float ns = -kFloatMax; // none blank ending score - float v_s = -kFloatMax; // viterbi blank ending score - float v_ns = -kFloatMax; // viterbi none blank ending score - float cur_token_prob = -kFloatMax; // prob of current token - std::vector times_s; // times of viterbi blank path - std::vector times_ns; // times of viterbi none blank path - - float score() const { return LogAdd(s, ns); } - float viterbi_score() const { return v_s > v_ns ? v_s : v_ns; } - const std::vector& times() const { - return v_s > v_ns ? 
times_s : times_ns; - } - - bool has_context = false; - int context_state = 0; - float context_score = 0; - std::vector start_boundaries; - std::vector end_boundaries; - - void CopyContext(const PrefixScore& prefix_score) { - context_state = prefix_score.context_state; - context_score = prefix_score.context_score; - start_boundaries = prefix_score.start_boundaries; - end_boundaries = prefix_score.end_boundaries; - } - - void UpdateContext(const std::shared_ptr& context_graph, - const PrefixScore& prefix_score, int word_id, - int prefix_len) { - this->CopyContext(prefix_score); - - float score = 0; - bool is_start_boundary = false; - bool is_end_boundary = false; - - context_state = - context_graph->GetNextState(prefix_score.context_state, word_id, &score, - &is_start_boundary, &is_end_boundary); - context_score += score; - if (is_start_boundary) start_boundaries.emplace_back(prefix_len); - if (is_end_boundary) end_boundaries.emplace_back(prefix_len); - } - - float total_score() const { return score() + context_score; } -}; - -struct PrefixHash { - size_t operator()(const std::vector& prefix) const { - size_t hash_code = 0; - // here we use KB&DR hash code - for (int id : prefix) { - hash_code = id + 31 * hash_code; - } - return hash_code; - } -}; - -class CtcPrefixBeamSearch : public SearchInterface { - public: - explicit CtcPrefixBeamSearch( - const CtcPrefixBeamSearchOptions& opts, - const std::shared_ptr& context_graph = nullptr); - - void Search(const std::vector>& logp) override; - void Reset() override; - void FinalizeSearch() override; - SearchType Type() const override { return SearchType::kPrefixBeamSearch; } - void UpdateOutputs(const std::pair, PrefixScore>& prefix); - void UpdateHypotheses( - const std::vector, PrefixScore>>& hpys); - void UpdateFinalContext(); - - const std::vector& viterbi_likelihood() const { - return viterbi_likelihood_; - } - const std::vector>& Inputs() const override { - return hypotheses_; - } - const std::vector>& Outputs() const override { - return outputs_; - } - const std::vector& Likelihood() const override { return likelihood_; } - const std::vector>& Times() const override { return times_; } - - private: - int abs_time_step_ = 0; - - // N-best list and corresponding likelihood_, in sorted order - std::vector> hypotheses_; - std::vector likelihood_; - std::vector viterbi_likelihood_; - std::vector> times_; - - std::unordered_map, PrefixScore, PrefixHash> cur_hyps_; - std::shared_ptr context_graph_ = nullptr; - // Outputs contain the hypotheses_ and tags like: and - std::vector> outputs_; - const CtcPrefixBeamSearchOptions& opts_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(CtcPrefixBeamSearch); -}; - -} // namespace wenet - -#endif // DECODER_CTC_PREFIX_BEAM_SEARCH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/ctc_wfst_beam_search.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/ctc_wfst_beam_search.cc deleted file mode 100644 index 10e93f387e87b5f16fb7784d7060c50f227bf58e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/ctc_wfst_beam_search.cc +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "decoder/ctc_wfst_beam_search.h" - -#include - -namespace wenet { - -void DecodableTensorScaled::Reset() { - num_frames_ready_ = 0; - done_ = false; - // Give an empty initialization, will throw error when - // AcceptLoglikes is not called - logp_.clear(); -} - -void DecodableTensorScaled::AcceptLoglikes(const std::vector& logp) { - ++num_frames_ready_; - // TODO(Binbin Zhang): Avoid copy here - logp_ = logp; -} - -float DecodableTensorScaled::LogLikelihood(int32 frame, int32 index) { - CHECK_GT(index, 0); - CHECK_LT(frame, num_frames_ready_); - return scale_ * logp_[index - 1]; -} - -bool DecodableTensorScaled::IsLastFrame(int32 frame) const { - CHECK_LT(frame, num_frames_ready_); - return done_ && (frame == num_frames_ready_ - 1); -} - -int32 DecodableTensorScaled::NumIndices() const { - LOG(FATAL) << "Not implement"; - return 0; -} - -CtcWfstBeamSearch::CtcWfstBeamSearch( - const fst::Fst& fst, const CtcWfstBeamSearchOptions& opts, - const std::shared_ptr& context_graph) - : decodable_(opts.acoustic_scale), - decoder_(fst, opts, context_graph), - context_graph_(context_graph), - opts_(opts) { - Reset(); -} - -void CtcWfstBeamSearch::Reset() { - num_frames_ = 0; - decoded_frames_mapping_.clear(); - is_last_frame_blank_ = false; - last_best_ = 0; - inputs_.clear(); - outputs_.clear(); - likelihood_.clear(); - times_.clear(); - decodable_.Reset(); - decoder_.InitDecoding(); -} - -void CtcWfstBeamSearch::Search(const std::vector>& logp) { - if (0 == logp.size()) { - return; - } - // Every time we get the log posterior, we decode it all before return - for (int i = 0; i < logp.size(); i++) { - float blank_score = std::exp(logp[i][0]); - if (blank_score > opts_.blank_skip_thresh * opts_.blank_scale) { - VLOG(3) << "skipping frame " << num_frames_ << " score " << blank_score; - is_last_frame_blank_ = true; - last_frame_prob_ = logp[i]; - } else { - // Get the best symbol - int cur_best = - std::max_element(logp[i].begin(), logp[i].end()) - logp[i].begin(); - // Optional, adding one blank frame if we has skipped it in two same - // symbols - if (cur_best != 0 && is_last_frame_blank_ && cur_best == last_best_) { - decodable_.AcceptLoglikes(last_frame_prob_); - decoder_.AdvanceDecoding(&decodable_, 1); - decoded_frames_mapping_.push_back(num_frames_ - 1); - VLOG(2) << "Adding blank frame at symbol " << cur_best; - } - last_best_ = cur_best; - - decodable_.AcceptLoglikes(logp[i]); - decoder_.AdvanceDecoding(&decodable_, 1); - decoded_frames_mapping_.push_back(num_frames_); - is_last_frame_blank_ = false; - } - num_frames_++; - } - // Get the best path - inputs_.clear(); - outputs_.clear(); - likelihood_.clear(); - if (decoded_frames_mapping_.size() > 0) { - inputs_.resize(1); - outputs_.resize(1); - likelihood_.resize(1); - kaldi::Lattice lat; - decoder_.GetBestPath(&lat, false); - std::vector alignment; - kaldi::LatticeWeight weight; - fst::GetLinearSymbolSequence(lat, &alignment, &outputs_[0], &weight); - ConvertToInputs(alignment, &inputs_[0]); - RemoveContinuousTags(&outputs_[0]); - VLOG(3) << weight.Value1() << " " << weight.Value2(); - likelihood_[0] = 
-(weight.Value1() + weight.Value2()); - } -} - -void CtcWfstBeamSearch::FinalizeSearch() { - decodable_.SetFinish(); - decoder_.FinalizeDecoding(); - inputs_.clear(); - outputs_.clear(); - likelihood_.clear(); - times_.clear(); - if (decoded_frames_mapping_.size() > 0) { - std::vector nbest_lats; - if (opts_.nbest == 1) { - kaldi::Lattice lat; - decoder_.GetBestPath(&lat, true); - nbest_lats.push_back(std::move(lat)); - } else { - // Get N-best path by lattice(CompactLattice) - kaldi::CompactLattice clat; - decoder_.GetLattice(&clat, true); - kaldi::Lattice lat, nbest_lat; - fst::ConvertLattice(clat, &lat); - // TODO(Binbin Zhang): it's n-best word lists here, not character n-best - fst::ShortestPath(lat, &nbest_lat, opts_.nbest); - fst::ConvertNbestToVector(nbest_lat, &nbest_lats); - } - int nbest = nbest_lats.size(); - inputs_.resize(nbest); - outputs_.resize(nbest); - likelihood_.resize(nbest); - times_.resize(nbest); - for (int i = 0; i < nbest; i++) { - kaldi::LatticeWeight weight; - std::vector alignment; - fst::GetLinearSymbolSequence(nbest_lats[i], &alignment, &outputs_[i], - &weight); - ConvertToInputs(alignment, &inputs_[i], ×_[i]); - RemoveContinuousTags(&outputs_[i]); - likelihood_[i] = -(weight.Value1() + weight.Value2()); - } - } -} - -void CtcWfstBeamSearch::ConvertToInputs(const std::vector& alignment, - std::vector* input, - std::vector* time) { - input->clear(); - if (time != nullptr) time->clear(); - for (int cur = 0; cur < alignment.size(); ++cur) { - // ignore blank - if (alignment[cur] - 1 == 0) continue; - // merge continuous same label - if (cur > 0 && alignment[cur] == alignment[cur - 1]) continue; - - input->push_back(alignment[cur] - 1); - if (time != nullptr) { - time->push_back(decoded_frames_mapping_[cur]); - } - } -} - -void CtcWfstBeamSearch::RemoveContinuousTags(std::vector* output) { - if (context_graph_) { - for (auto it = output->begin(); it != output->end();) { - if (*it == context_graph_->start_tag_id() || - *it == context_graph_->end_tag_id()) { - if (it + 1 != output->end() && *it == *(it + 1)) { - it = output->erase(it); - continue; - } - } - ++it; - } - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/ctc_wfst_beam_search.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/ctc_wfst_beam_search.h deleted file mode 100644 index 204a0c8db1254035b7e3bd4a6e02b65d66b756f3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/ctc_wfst_beam_search.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
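
> Aside (not part of the patch above): `CtcWfstBeamSearch::ConvertToInputs` in the deleted `ctc_wfst_beam_search.cc` collapses a frame-level WFST alignment back to CTC tokens by subtracting one from each label (the WFST side reserves 0 for epsilon, so label 1 is the CTC blank), dropping blanks, and merging consecutive repeats. A standalone sketch of that collapsing, with an assumed example alignment:

```cpp
// Sketch of the alignment-to-token collapsing done by ConvertToInputs.
#include <iostream>
#include <vector>

std::vector<int> ConvertToInputs(const std::vector<int>& alignment) {
  std::vector<int> input;
  for (size_t cur = 0; cur < alignment.size(); ++cur) {
    if (alignment[cur] - 1 == 0) continue;  // ignore blank
    if (cur > 0 && alignment[cur] == alignment[cur - 1]) continue;  // merge repeats
    input.push_back(alignment[cur] - 1);    // shift back to CTC token ids
  }
  return input;
}

int main() {
  // Assumed example: blanks (label 1) and adjacent repeats collapse away,
  // but a blank between two identical symbols keeps both of them.
  std::vector<int> alignment = {1, 6, 6, 1, 6, 8, 8, 1};
  for (int id : ConvertToInputs(alignment)) std::cout << id << ' ';  // 5 5 7
  std::cout << '\n';
  return 0;
}
```
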
- - -#ifndef DECODER_CTC_WFST_BEAM_SEARCH_H_ -#define DECODER_CTC_WFST_BEAM_SEARCH_H_ - -#include -#include - -#include "decoder/context_graph.h" -#include "decoder/search_interface.h" -#include "kaldi/decoder/lattice-faster-online-decoder.h" -#include "utils/utils.h" - -namespace wenet { - -class DecodableTensorScaled : public kaldi::DecodableInterface { - public: - explicit DecodableTensorScaled(float scale = 1.0) : scale_(scale) { Reset(); } - - void Reset(); - int32 NumFramesReady() const override { return num_frames_ready_; } - bool IsLastFrame(int32 frame) const override; - float LogLikelihood(int32 frame, int32 index) override; - int32 NumIndices() const override; - void AcceptLoglikes(const std::vector& logp); - void SetFinish() { done_ = true; } - - private: - int num_frames_ready_ = 0; - float scale_ = 1.0; - bool done_ = false; - std::vector logp_; -}; - -// LatticeFasterDecoderConfig has the following key members -// beam: decoding beam -// max_active: Decoder max active states -// lattice_beam: Lattice generation beam -struct CtcWfstBeamSearchOptions : public kaldi::LatticeFasterDecoderConfig { - float acoustic_scale = 1.0; - float nbest = 10; - // When blank score is greater than this thresh, skip the frame in viterbi - // search - float blank_skip_thresh = 0.98; - float blank_scale = 1.0; -}; - -class CtcWfstBeamSearch : public SearchInterface { - public: - explicit CtcWfstBeamSearch( - const fst::Fst& fst, const CtcWfstBeamSearchOptions& opts, - const std::shared_ptr& context_graph); - void Search(const std::vector>& logp) override; - void Reset() override; - void FinalizeSearch() override; - SearchType Type() const override { return SearchType::kWfstBeamSearch; } - // For CTC prefix beam search, both inputs and outputs are hypotheses_ - const std::vector>& Inputs() const override { - return inputs_; - } - const std::vector>& Outputs() const override { - return outputs_; - } - const std::vector& Likelihood() const override { return likelihood_; } - const std::vector>& Times() const override { return times_; } - - private: - // Sub one and remove - void ConvertToInputs(const std::vector& alignment, - std::vector* input, - std::vector* time = nullptr); - void RemoveContinuousTags(std::vector* output); - - int num_frames_ = 0; - std::vector decoded_frames_mapping_; - - int last_best_ = 0; // last none blank best id - std::vector last_frame_prob_; - bool is_last_frame_blank_ = false; - std::vector> inputs_, outputs_; - std::vector likelihood_; - std::vector> times_; - DecodableTensorScaled decodable_; - kaldi::LatticeFasterOnlineDecoder decoder_; - std::shared_ptr context_graph_; - const CtcWfstBeamSearchOptions& opts_; -}; - -} // namespace wenet - -#endif // DECODER_CTC_WFST_BEAM_SEARCH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/onnx_asr_model.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/onnx_asr_model.cc deleted file mode 100644 index fc7afc704febbde3b7e350e392dc46763c453e74..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/onnx_asr_model.cc +++ /dev/null @@ -1,430 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 ZeXuan Li (lizexuan@huya.com) -// Xingchen Song(sxc19@mails.tsinghua.edu.cn) -// hamddct@gmail.com (Mddct) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "decoder/onnx_asr_model.h" - -#include -#include -#include - -#include "utils/string.h" - -namespace wenet { - -Ort::Env OnnxAsrModel::env_ = Ort::Env(ORT_LOGGING_LEVEL_WARNING, ""); -Ort::SessionOptions OnnxAsrModel::session_options_ = Ort::SessionOptions(); - -void OnnxAsrModel::InitEngineThreads(int num_threads) { - session_options_.SetIntraOpNumThreads(num_threads); -} - -void OnnxAsrModel::GetInputOutputInfo( - const std::shared_ptr& session, - std::vector* in_names, std::vector* out_names) { - Ort::AllocatorWithDefaultOptions allocator; - // Input info - int num_nodes = session->GetInputCount(); - in_names->resize(num_nodes); - for (int i = 0; i < num_nodes; ++i) { - char* name = session->GetInputName(i, allocator); - Ort::TypeInfo type_info = session->GetInputTypeInfo(i); - auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); - ONNXTensorElementDataType type = tensor_info.GetElementType(); - std::vector node_dims = tensor_info.GetShape(); - std::stringstream shape; - for (auto j : node_dims) { - shape << j; - shape << " "; - } - LOG(INFO) << "\tInput " << i << " : name=" << name << " type=" << type - << " dims=" << shape.str(); - (*in_names)[i] = name; - } - // Output info - num_nodes = session->GetOutputCount(); - out_names->resize(num_nodes); - for (int i = 0; i < num_nodes; ++i) { - char* name = session->GetOutputName(i, allocator); - Ort::TypeInfo type_info = session->GetOutputTypeInfo(i); - auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); - ONNXTensorElementDataType type = tensor_info.GetElementType(); - std::vector node_dims = tensor_info.GetShape(); - std::stringstream shape; - for (auto j : node_dims) { - shape << j; - shape << " "; - } - LOG(INFO) << "\tOutput " << i << " : name=" << name << " type=" << type - << " dims=" << shape.str(); - (*out_names)[i] = name; - } -} - -void OnnxAsrModel::Read(const std::string& model_dir) { - std::string encoder_onnx_path = model_dir + "/encoder.onnx"; - std::string rescore_onnx_path = model_dir + "/decoder.onnx"; - std::string ctc_onnx_path = model_dir + "/ctc.onnx"; - - // 1. Load sessions - try { -#ifdef _MSC_VER - encoder_session_ = std::make_shared( - env_, ToWString(encoder_onnx_path).c_str(), session_options_); - rescore_session_ = std::make_shared( - env_, ToWString(rescore_onnx_path).c_str(), session_options_); - ctc_session_ = std::make_shared( - env_, ToWString(ctc_onnx_path).c_str(), session_options_); -#else - encoder_session_ = std::make_shared( - env_, encoder_onnx_path.c_str(), session_options_); - rescore_session_ = std::make_shared( - env_, rescore_onnx_path.c_str(), session_options_); - ctc_session_ = std::make_shared(env_, ctc_onnx_path.c_str(), - session_options_); -#endif - } catch (std::exception const& e) { - LOG(ERROR) << "error when load onnx model: " << e.what(); - exit(0); - } - - // 2. 
Read metadata - auto model_metadata = encoder_session_->GetModelMetadata(); - - Ort::AllocatorWithDefaultOptions allocator; - encoder_output_size_ = - atoi(model_metadata.LookupCustomMetadataMap("output_size", allocator)); - num_blocks_ = - atoi(model_metadata.LookupCustomMetadataMap("num_blocks", allocator)); - head_ = atoi(model_metadata.LookupCustomMetadataMap("head", allocator)); - cnn_module_kernel_ = atoi( - model_metadata.LookupCustomMetadataMap("cnn_module_kernel", allocator)); - subsampling_rate_ = atoi( - model_metadata.LookupCustomMetadataMap("subsampling_rate", allocator)); - right_context_ = - atoi(model_metadata.LookupCustomMetadataMap("right_context", allocator)); - sos_ = atoi(model_metadata.LookupCustomMetadataMap("sos_symbol", allocator)); - eos_ = atoi(model_metadata.LookupCustomMetadataMap("eos_symbol", allocator)); - is_bidirectional_decoder_ = atoi(model_metadata.LookupCustomMetadataMap( - "is_bidirectional_decoder", allocator)); - chunk_size_ = - atoi(model_metadata.LookupCustomMetadataMap("chunk_size", allocator)); - num_left_chunks_ = - atoi(model_metadata.LookupCustomMetadataMap("left_chunks", allocator)); - - LOG(INFO) << "Onnx Model Info:"; - LOG(INFO) << "\tencoder_output_size " << encoder_output_size_; - LOG(INFO) << "\tnum_blocks " << num_blocks_; - LOG(INFO) << "\thead " << head_; - LOG(INFO) << "\tcnn_module_kernel " << cnn_module_kernel_; - LOG(INFO) << "\tsubsampling_rate " << subsampling_rate_; - LOG(INFO) << "\tright_context " << right_context_; - LOG(INFO) << "\tsos " << sos_; - LOG(INFO) << "\teos " << eos_; - LOG(INFO) << "\tis bidirectional decoder " << is_bidirectional_decoder_; - LOG(INFO) << "\tchunk_size " << chunk_size_; - LOG(INFO) << "\tnum_left_chunks " << num_left_chunks_; - - // 3. Read model nodes - LOG(INFO) << "Onnx Encoder:"; - GetInputOutputInfo(encoder_session_, &encoder_in_names_, &encoder_out_names_); - LOG(INFO) << "Onnx CTC:"; - GetInputOutputInfo(ctc_session_, &ctc_in_names_, &ctc_out_names_); - LOG(INFO) << "Onnx Rescore:"; - GetInputOutputInfo(rescore_session_, &rescore_in_names_, &rescore_out_names_); -} - -OnnxAsrModel::OnnxAsrModel(const OnnxAsrModel& other) { - // metadatas - encoder_output_size_ = other.encoder_output_size_; - num_blocks_ = other.num_blocks_; - head_ = other.head_; - cnn_module_kernel_ = other.cnn_module_kernel_; - right_context_ = other.right_context_; - subsampling_rate_ = other.subsampling_rate_; - sos_ = other.sos_; - eos_ = other.eos_; - is_bidirectional_decoder_ = other.is_bidirectional_decoder_; - chunk_size_ = other.chunk_size_; - num_left_chunks_ = other.num_left_chunks_; - offset_ = other.offset_; - - // sessions - encoder_session_ = other.encoder_session_; - ctc_session_ = other.ctc_session_; - rescore_session_ = other.rescore_session_; - - // node names - encoder_in_names_ = other.encoder_in_names_; - encoder_out_names_ = other.encoder_out_names_; - ctc_in_names_ = other.ctc_in_names_; - ctc_out_names_ = other.ctc_out_names_; - rescore_in_names_ = other.rescore_in_names_; - rescore_out_names_ = other.rescore_out_names_; -} - -std::shared_ptr OnnxAsrModel::Copy() const { - auto asr_model = std::make_shared(*this); - // Reset the inner states for new decoding - asr_model->Reset(); - return asr_model; -} - -void OnnxAsrModel::Reset() { - offset_ = 0; - encoder_outs_.clear(); - cached_feature_.clear(); - // Reset att_cache - Ort::MemoryInfo memory_info = - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); - if (num_left_chunks_ > 0) { - int required_cache_size = chunk_size_ * 
num_left_chunks_; - offset_ = required_cache_size; - att_cache_.resize(num_blocks_ * head_ * required_cache_size * - encoder_output_size_ / head_ * 2, - 0.0); - const int64_t att_cache_shape[] = {num_blocks_, head_, required_cache_size, - encoder_output_size_ / head_ * 2}; - att_cache_ort_ = Ort::Value::CreateTensor( - memory_info, att_cache_.data(), att_cache_.size(), att_cache_shape, 4); - } else { - att_cache_.resize(0, 0.0); - const int64_t att_cache_shape[] = {num_blocks_, head_, 0, - encoder_output_size_ / head_ * 2}; - att_cache_ort_ = Ort::Value::CreateTensor( - memory_info, att_cache_.data(), att_cache_.size(), att_cache_shape, 4); - } - - // Reset cnn_cache - cnn_cache_.resize( - num_blocks_ * encoder_output_size_ * (cnn_module_kernel_ - 1), 0.0); - const int64_t cnn_cache_shape[] = {num_blocks_, 1, encoder_output_size_, - cnn_module_kernel_ - 1}; - cnn_cache_ort_ = Ort::Value::CreateTensor( - memory_info, cnn_cache_.data(), cnn_cache_.size(), cnn_cache_shape, 4); -} - -void OnnxAsrModel::ForwardEncoderFunc( - const std::vector>& chunk_feats, - std::vector>* out_prob) { - Ort::MemoryInfo memory_info = - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); - // 1. Prepare onnx required data, splice cached_feature_ and chunk_feats - // chunk - int num_frames = cached_feature_.size() + chunk_feats.size(); - const int feature_dim = chunk_feats[0].size(); - std::vector feats; - for (size_t i = 0; i < cached_feature_.size(); ++i) { - feats.insert(feats.end(), cached_feature_[i].begin(), - cached_feature_[i].end()); - } - for (size_t i = 0; i < chunk_feats.size(); ++i) { - feats.insert(feats.end(), chunk_feats[i].begin(), chunk_feats[i].end()); - } - const int64_t feats_shape[3] = {1, num_frames, feature_dim}; - Ort::Value feats_ort = Ort::Value::CreateTensor( - memory_info, feats.data(), feats.size(), feats_shape, 3); - // offset - int64_t offset_int64 = static_cast(offset_); - Ort::Value offset_ort = Ort::Value::CreateTensor( - memory_info, &offset_int64, 1, std::vector{}.data(), 0); - // required_cache_size - int64_t required_cache_size = chunk_size_ * num_left_chunks_; - Ort::Value required_cache_size_ort = Ort::Value::CreateTensor( - memory_info, &required_cache_size, 1, std::vector{}.data(), 0); - // att_mask - Ort::Value att_mask_ort{nullptr}; - std::vector att_mask(required_cache_size + chunk_size_, 1); - if (num_left_chunks_ > 0) { - int chunk_idx = offset_ / chunk_size_ - num_left_chunks_; - if (chunk_idx < num_left_chunks_) { - for (int i = 0; i < (num_left_chunks_ - chunk_idx) * chunk_size_; ++i) { - att_mask[i] = 0; - } - } - const int64_t att_mask_shape[] = {1, 1, required_cache_size + chunk_size_}; - att_mask_ort = Ort::Value::CreateTensor( - memory_info, reinterpret_cast(att_mask.data()), att_mask.size(), - att_mask_shape, 3); - } - - // 2. 
Encoder chunk forward - std::vector inputs; - for (auto name : encoder_in_names_) { - if (!strcmp(name, "chunk")) { - inputs.emplace_back(std::move(feats_ort)); - } else if (!strcmp(name, "offset")) { - inputs.emplace_back(std::move(offset_ort)); - } else if (!strcmp(name, "required_cache_size")) { - inputs.emplace_back(std::move(required_cache_size_ort)); - } else if (!strcmp(name, "att_cache")) { - inputs.emplace_back(std::move(att_cache_ort_)); - } else if (!strcmp(name, "cnn_cache")) { - inputs.emplace_back(std::move(cnn_cache_ort_)); - } else if (!strcmp(name, "att_mask")) { - inputs.emplace_back(std::move(att_mask_ort)); - } - } - - std::vector ort_outputs = encoder_session_->Run( - Ort::RunOptions{nullptr}, encoder_in_names_.data(), inputs.data(), - inputs.size(), encoder_out_names_.data(), encoder_out_names_.size()); - - offset_ += static_cast( - ort_outputs[0].GetTensorTypeAndShapeInfo().GetShape()[1]); - att_cache_ort_ = std::move(ort_outputs[1]); - cnn_cache_ort_ = std::move(ort_outputs[2]); - - std::vector ctc_inputs; - ctc_inputs.emplace_back(std::move(ort_outputs[0])); - - std::vector ctc_ort_outputs = ctc_session_->Run( - Ort::RunOptions{nullptr}, ctc_in_names_.data(), ctc_inputs.data(), - ctc_inputs.size(), ctc_out_names_.data(), ctc_out_names_.size()); - encoder_outs_.push_back(std::move(ctc_inputs[0])); - - float* logp_data = ctc_ort_outputs[0].GetTensorMutableData(); - auto type_info = ctc_ort_outputs[0].GetTensorTypeAndShapeInfo(); - - int num_outputs = type_info.GetShape()[1]; - int output_dim = type_info.GetShape()[2]; - out_prob->resize(num_outputs); - for (int i = 0; i < num_outputs; i++) { - (*out_prob)[i].resize(output_dim); - memcpy((*out_prob)[i].data(), logp_data + i * output_dim, - sizeof(float) * output_dim); - } -} - -float OnnxAsrModel::ComputeAttentionScore(const float* prob, - const std::vector& hyp, int eos, - int decode_out_len) { - float score = 0.0f; - for (size_t j = 0; j < hyp.size(); ++j) { - score += *(prob + j * decode_out_len + hyp[j]); - } - score += *(prob + hyp.size() * decode_out_len + eos); - return score; -} - -void OnnxAsrModel::AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) { - Ort::MemoryInfo memory_info = - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); - CHECK(rescoring_score != nullptr); - int num_hyps = hyps.size(); - rescoring_score->resize(num_hyps, 0.0f); - - if (num_hyps == 0) { - return; - } - // No encoder output - if (encoder_outs_.size() == 0) { - return; - } - - std::vector hyps_lens; - int max_hyps_len = 0; - for (size_t i = 0; i < num_hyps; ++i) { - int length = hyps[i].size() + 1; - max_hyps_len = std::max(length, max_hyps_len); - hyps_lens.emplace_back(static_cast(length)); - } - - std::vector rescore_input; - int encoder_len = 0; - for (int i = 0; i < encoder_outs_.size(); i++) { - float* encoder_outs_data = encoder_outs_[i].GetTensorMutableData(); - auto type_info = encoder_outs_[i].GetTensorTypeAndShapeInfo(); - for (int j = 0; j < type_info.GetElementCount(); j++) { - rescore_input.emplace_back(encoder_outs_data[j]); - } - encoder_len += type_info.GetShape()[1]; - } - - const int64_t decode_input_shape[] = {1, encoder_len, encoder_output_size_}; - - std::vector hyps_pad; - - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - hyps_pad.emplace_back(sos_); - size_t j = 0; - for (; j < hyp.size(); ++j) { - hyps_pad.emplace_back(hyp[j]); - } - if (j == max_hyps_len - 1) { - continue; - } - for (; j < max_hyps_len - 1; ++j) { 
- hyps_pad.emplace_back(0); - } - } - - const int64_t hyps_pad_shape[] = {num_hyps, max_hyps_len}; - - const int64_t hyps_lens_shape[] = {num_hyps}; - - Ort::Value decode_input_tensor_ = Ort::Value::CreateTensor( - memory_info, rescore_input.data(), rescore_input.size(), - decode_input_shape, 3); - Ort::Value hyps_pad_tensor_ = Ort::Value::CreateTensor( - memory_info, hyps_pad.data(), hyps_pad.size(), hyps_pad_shape, 2); - Ort::Value hyps_lens_tensor_ = Ort::Value::CreateTensor( - memory_info, hyps_lens.data(), hyps_lens.size(), hyps_lens_shape, 1); - - std::vector rescore_inputs; - - rescore_inputs.emplace_back(std::move(hyps_pad_tensor_)); - rescore_inputs.emplace_back(std::move(hyps_lens_tensor_)); - rescore_inputs.emplace_back(std::move(decode_input_tensor_)); - - std::vector rescore_outputs = rescore_session_->Run( - Ort::RunOptions{nullptr}, rescore_in_names_.data(), rescore_inputs.data(), - rescore_inputs.size(), rescore_out_names_.data(), - rescore_out_names_.size()); - - float* decoder_outs_data = rescore_outputs[0].GetTensorMutableData(); - float* r_decoder_outs_data = rescore_outputs[1].GetTensorMutableData(); - - auto type_info = rescore_outputs[0].GetTensorTypeAndShapeInfo(); - int decode_out_len = type_info.GetShape()[2]; - - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - float score = 0.0f; - // left to right decoder score - score = ComputeAttentionScore( - decoder_outs_data + max_hyps_len * decode_out_len * i, hyp, eos_, - decode_out_len); - // Optional: Used for right to left score - float r_score = 0.0f; - if (is_bidirectional_decoder_ && reverse_weight > 0) { - std::vector r_hyp(hyp.size()); - std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin()); - // right to left decoder score - r_score = ComputeAttentionScore( - r_decoder_outs_data + max_hyps_len * decode_out_len * i, r_hyp, eos_, - decode_out_len); - } - // combined left-to-right and right-to-left score - (*rescoring_score)[i] = - score * (1 - reverse_weight) + r_score * reverse_weight; - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/onnx_asr_model.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/onnx_asr_model.h deleted file mode 100644 index f5d9e9a0c61d728f2fb6d45d1428234abae98c90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/onnx_asr_model.h +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 ZeXuan Li (lizexuan@huya.com) -// Xingchen Song(sxc19@mails.tsinghua.edu.cn) -// hamddct@gmail.com (Mddct) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
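
> Aside (not part of the patch above): the last loop of `OnnxAsrModel::AttentionRescoring` mixes the left-to-right and (optional) right-to-left decoder scores with `reverse_weight`. A small arithmetic sketch of that combination; the hypothesis scores and `reverse_weight = 0.3` are assumed example values and presuppose a bidirectional decoder.

```cpp
// Sketch of the reverse_weight score mixing in AttentionRescoring.
#include <cstdio>

int main() {
  const float reverse_weight = 0.3f;
  const float left_to_right = -4.2f;  // attention score of the hypothesis
  const float right_to_left = -4.8f;  // attention score of the reversed hypothesis

  // Same combination as in the deleted code:
  // rescoring_score = score * (1 - reverse_weight) + r_score * reverse_weight
  const float rescoring_score =
      left_to_right * (1.0f - reverse_weight) + right_to_left * reverse_weight;
  std::printf("rescoring_score = %.3f\n", rescoring_score);  // -4.380
  return 0;
}
```
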
- -#ifndef DECODER_ONNX_ASR_MODEL_H_ -#define DECODER_ONNX_ASR_MODEL_H_ - -#include -#include -#include - -#include "onnxruntime_cxx_api.h" // NOLINT - -#include "decoder/asr_model.h" -#include "utils/log.h" -#include "utils/utils.h" - -namespace wenet { - -class OnnxAsrModel : public AsrModel { - public: - static void InitEngineThreads(int num_threads = 1); - - public: - OnnxAsrModel() = default; - OnnxAsrModel(const OnnxAsrModel& other); - void Read(const std::string& model_dir); - void Reset() override; - void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) override; - std::shared_ptr Copy() const override; - void GetInputOutputInfo(const std::shared_ptr& session, - std::vector* in_names, - std::vector* out_names); - - protected: - void ForwardEncoderFunc(const std::vector>& chunk_feats, - std::vector>* ctc_prob) override; - - float ComputeAttentionScore(const float* prob, const std::vector& hyp, - int eos, int decode_out_len); - - private: - int encoder_output_size_ = 0; - int num_blocks_ = 0; - int cnn_module_kernel_ = 0; - int head_ = 0; - - // sessions - // NOTE(Mddct): The Env holds the logging state used by all other objects. - // One Env must be created before using any other Onnxruntime functionality. - static Ort::Env env_; // shared environment across threads. - static Ort::SessionOptions session_options_; - std::shared_ptr encoder_session_ = nullptr; - std::shared_ptr rescore_session_ = nullptr; - std::shared_ptr ctc_session_ = nullptr; - - // node names - std::vector encoder_in_names_, encoder_out_names_; - std::vector ctc_in_names_, ctc_out_names_; - std::vector rescore_in_names_, rescore_out_names_; - - // caches - Ort::Value att_cache_ort_{nullptr}; - Ort::Value cnn_cache_ort_{nullptr}; - std::vector encoder_outs_; - // NOTE: Instead of making a copy of the xx_cache, ONNX only maintains - // its data pointer when initializing xx_cache_ort (see https://github.com/ - // microsoft/onnxruntime/blob/master/onnxruntime/core/framework - // /tensor.cc#L102-L129), so we need the following variables to keep - // our data "alive" during the lifetime of decoder. - std::vector att_cache_; - std::vector cnn_cache_; -}; - -} // namespace wenet - -#endif // DECODER_ONNX_ASR_MODEL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/params.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/params.h deleted file mode 100644 index 3edc877f1bb6d876ca087cab8e4ed00d42e97e63..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/params.h +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef DECODER_PARAMS_H_ -#define DECODER_PARAMS_H_ - -#include -#include -#include -#include - -#include "decoder/asr_decoder.h" -#ifdef USE_ONNX -#include "decoder/onnx_asr_model.h" -#endif -#ifdef USE_TORCH -#include "decoder/torch_asr_model.h" -#endif -#ifdef USE_XPU -#include "xpu/xpu_asr_model.h" -#endif -#ifdef USE_BPU -#include "bpu/bpu_asr_model.h" -#endif -#include "frontend/feature_pipeline.h" -#include "post_processor/post_processor.h" -#include "utils/flags.h" -#include "utils/string.h" - -DEFINE_int32(device_id, 0, "set XPU DeviceID for ASR model"); - -// TorchAsrModel flags -DEFINE_string(model_path, "", "pytorch exported model path"); -// OnnxAsrModel flags -DEFINE_string(onnx_dir, "", "directory where the onnx model is saved"); -// XPUAsrModel flags -DEFINE_string(xpu_model_dir, "", - "directory where the XPU model and weights is saved"); -// BPUAsrModel flags -DEFINE_string(bpu_model_dir, "", - "directory where the HORIZON BPU model is saved"); - -// FeaturePipelineConfig flags -DEFINE_int32(num_bins, 80, "num mel bins for fbank feature"); -DEFINE_int32(sample_rate, 16000, "sample rate for audio"); - -// TLG fst -DEFINE_string(fst_path, "", "TLG fst path"); - -// DecodeOptions flags -DEFINE_int32(chunk_size, 16, "decoding chunk size"); -DEFINE_int32(num_left_chunks, -1, "left chunks in decoding"); -DEFINE_double(ctc_weight, 0.5, - "ctc weight when combining ctc score and rescoring score"); -DEFINE_double(rescoring_weight, 1.0, - "rescoring weight when combining ctc score and rescoring score"); -DEFINE_double(reverse_weight, 0.0, - "used for bitransformer rescoring. it must be 0.0 if decoder is" - "conventional transformer decoder, and only reverse_weight > 0.0" - "dose the right to left decoder will be calculated and used"); -DEFINE_int32(max_active, 7000, "max active states in ctc wfst search"); -DEFINE_int32(min_active, 200, "min active states in ctc wfst search"); -DEFINE_double(beam, 16.0, "beam in ctc wfst search"); -DEFINE_double(lattice_beam, 10.0, "lattice beam in ctc wfst search"); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale for ctc wfst search"); -DEFINE_double(blank_skip_thresh, 1.0, - "blank skip thresh for ctc wfst search, 1.0 means no skip"); -DEFINE_double(blank_scale, 1.0, "blank scale for ctc wfst search"); -DEFINE_double(length_penalty, 0.0, - "length penalty ctc wfst search, will not" - "apply on self-loop arc, for balancing the del/ins ratio, " - "suggest set to -3.0"); -DEFINE_int32(nbest, 10, "nbest for ctc wfst or prefix search"); - -// SymbolTable flags -DEFINE_string(dict_path, "", - "dict symbol table path, required when LM is enabled"); -DEFINE_string(unit_path, "", - "e2e model unit symbol table, it is used in both " - "with/without LM scenarios for context/timestamp"); - -// Context flags -DEFINE_string(context_path, "", "context path, is used to build context graph"); -DEFINE_double(context_score, 3.0, "is used to rescore the decoded result"); - -// PostProcessOptions flags -DEFINE_int32(language_type, 0, - "remove spaces according to language type" - "0x00 = kMandarinEnglish, " - "0x01 = kIndoEuropean"); -DEFINE_bool(lowercase, true, "lowercase final result if needed"); - -namespace wenet { -std::shared_ptr InitFeaturePipelineConfigFromFlags() { - auto feature_config = std::make_shared( - FLAGS_num_bins, FLAGS_sample_rate); - return feature_config; -} - -std::shared_ptr InitDecodeOptionsFromFlags() { - auto decode_config = std::make_shared(); - decode_config->chunk_size = FLAGS_chunk_size; - decode_config->num_left_chunks = 
FLAGS_num_left_chunks; - decode_config->ctc_weight = FLAGS_ctc_weight; - decode_config->reverse_weight = FLAGS_reverse_weight; - decode_config->rescoring_weight = FLAGS_rescoring_weight; - decode_config->ctc_wfst_search_opts.max_active = FLAGS_max_active; - decode_config->ctc_wfst_search_opts.min_active = FLAGS_min_active; - decode_config->ctc_wfst_search_opts.beam = FLAGS_beam; - decode_config->ctc_wfst_search_opts.lattice_beam = FLAGS_lattice_beam; - decode_config->ctc_wfst_search_opts.acoustic_scale = FLAGS_acoustic_scale; - decode_config->ctc_wfst_search_opts.blank_skip_thresh = - FLAGS_blank_skip_thresh; - decode_config->ctc_wfst_search_opts.blank_scale = FLAGS_blank_scale; - decode_config->ctc_wfst_search_opts.length_penalty = FLAGS_length_penalty; - decode_config->ctc_wfst_search_opts.nbest = FLAGS_nbest; - decode_config->ctc_prefix_search_opts.first_beam_size = FLAGS_nbest; - decode_config->ctc_prefix_search_opts.second_beam_size = FLAGS_nbest; - return decode_config; -} - -std::shared_ptr InitDecodeResourceFromFlags() { - auto resource = std::make_shared(); - const int kNumGemmThreads = 1; - if (!FLAGS_onnx_dir.empty()) { -#ifdef USE_ONNX - LOG(INFO) << "Reading onnx model "; - OnnxAsrModel::InitEngineThreads(kNumGemmThreads); - auto model = std::make_shared(); - model->Read(FLAGS_onnx_dir); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DONNX=ON'."; -#endif - } else if (!FLAGS_model_path.empty()) { -#ifdef USE_TORCH - LOG(INFO) << "Reading torch model " << FLAGS_model_path; - TorchAsrModel::InitEngineThreads(kNumGemmThreads); - auto model = std::make_shared(); - model->Read(FLAGS_model_path); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DTORCH=ON'."; -#endif - } else if (!FLAGS_xpu_model_dir.empty()) { -#ifdef USE_XPU - LOG(INFO) << "Reading XPU WeNet model weight from " << FLAGS_xpu_model_dir; - auto model = std::make_shared(); - model->SetEngineThreads(kNumGemmThreads); - model->SetDeviceId(FLAGS_device_id); - model->Read(FLAGS_xpu_model_dir); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DXPU=ON'."; -#endif - } else if (!FLAGS_bpu_model_dir.empty()) { -#ifdef USE_BPU - LOG(INFO) << "Reading Horizon BPU model from " << FLAGS_bpu_model_dir; - auto model = std::make_shared(); - model->Read(FLAGS_bpu_model_dir); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DBPU=ON'."; -#endif - } else { - LOG(FATAL) << "Please set ONNX, TORCH, XPU or BPU model path!!!"; - } - - LOG(INFO) << "Reading unit table " << FLAGS_unit_path; - auto unit_table = std::shared_ptr( - fst::SymbolTable::ReadText(FLAGS_unit_path)); - CHECK(unit_table != nullptr); - resource->unit_table = unit_table; - - if (!FLAGS_fst_path.empty()) { // With LM - CHECK(!FLAGS_dict_path.empty()); - LOG(INFO) << "Reading fst " << FLAGS_fst_path; - auto fst = std::shared_ptr>( - fst::Fst::Read(FLAGS_fst_path)); - CHECK(fst != nullptr); - resource->fst = fst; - - LOG(INFO) << "Reading symbol table " << FLAGS_dict_path; - auto symbol_table = std::shared_ptr( - fst::SymbolTable::ReadText(FLAGS_dict_path)); - CHECK(symbol_table != nullptr); - resource->symbol_table = symbol_table; - } else { // Without LM, symbol_table is the same as unit_table - resource->symbol_table = unit_table; - } - - if (!FLAGS_context_path.empty()) { - LOG(INFO) << "Reading context " << FLAGS_context_path; - std::vector contexts; - std::ifstream infile(FLAGS_context_path); - std::string context; - 
while (getline(infile, context)) { - contexts.emplace_back(Trim(context)); - } - ContextConfig config; - config.context_score = FLAGS_context_score; - resource->context_graph = std::make_shared(config); - resource->context_graph->BuildContextGraph(contexts, - resource->symbol_table); - } - - PostProcessOptions post_process_opts; - post_process_opts.language_type = - FLAGS_language_type == 0 ? kMandarinEnglish : kIndoEuropean; - post_process_opts.lowercase = FLAGS_lowercase; - resource->post_processor = - std::make_shared(std::move(post_process_opts)); - return resource; -} - -} // namespace wenet - -#endif // DECODER_PARAMS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/search_interface.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/search_interface.h deleted file mode 100644 index 25bad26705f8be44561d2c686f50a63035b14bbf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/search_interface.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_SEARCH_INTERFACE_H_ -#define DECODER_SEARCH_INTERFACE_H_ - -namespace wenet { - -#include - -enum SearchType { - kPrefixBeamSearch = 0x00, - kWfstBeamSearch = 0x01, -}; - -class SearchInterface { - public: - virtual ~SearchInterface() {} - virtual void Search(const std::vector>& logp) = 0; - virtual void Reset() = 0; - virtual void FinalizeSearch() = 0; - - virtual SearchType Type() const = 0; - // N-best inputs id - virtual const std::vector>& Inputs() const = 0; - // N-best outputs id - virtual const std::vector>& Outputs() const = 0; - // N-best likelihood - virtual const std::vector& Likelihood() const = 0; - // N-best timestamp - virtual const std::vector>& Times() const = 0; -}; - -} // namespace wenet - -#endif // DECODER_SEARCH_INTERFACE_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/torch_asr_model.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/torch_asr_model.cc deleted file mode 100644 index 3abca283e12f5c173c9511707229ea82b31f26d8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/torch_asr_model.cc +++ /dev/null @@ -1,278 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "decoder/torch_asr_model.h" - -#include -#include -#include -#include - -#include "torch/script.h" -#ifndef IOS -#include "torch/torch.h" -#endif - -namespace wenet { - -#ifndef IOS -void TorchAsrModel::InitEngineThreads(int num_threads) { - // For multi-thread performance - at::set_num_threads(num_threads); - VLOG(1) << "Num intra-op threads: " << at::get_num_threads(); -} -#endif - -void TorchAsrModel::Read(const std::string& model_path) { - torch::DeviceType device = at::kCPU; -#ifdef USE_GPU - if (!torch::cuda::is_available()) { - VLOG(1) << "CUDA is not available! Please check your GPU settings"; - throw std::runtime_error("CUDA is not available!"); - } else { - VLOG(1) << "CUDA available! Running on GPU"; - device = at::kCUDA; - } -#endif - torch::jit::script::Module model = torch::jit::load(model_path, device); - model_ = std::make_shared(std::move(model)); - torch::NoGradGuard no_grad; - model_->eval(); - torch::jit::IValue o1 = model_->run_method("subsampling_rate"); - CHECK_EQ(o1.isInt(), true); - subsampling_rate_ = o1.toInt(); - torch::jit::IValue o2 = model_->run_method("right_context"); - CHECK_EQ(o2.isInt(), true); - right_context_ = o2.toInt(); - torch::jit::IValue o3 = model_->run_method("sos_symbol"); - CHECK_EQ(o3.isInt(), true); - sos_ = o3.toInt(); - torch::jit::IValue o4 = model_->run_method("eos_symbol"); - CHECK_EQ(o4.isInt(), true); - eos_ = o4.toInt(); - torch::jit::IValue o5 = model_->run_method("is_bidirectional_decoder"); - CHECK_EQ(o5.isBool(), true); - is_bidirectional_decoder_ = o5.toBool(); - - VLOG(1) << "Torch Model Info:"; - VLOG(1) << "\tsubsampling_rate " << subsampling_rate_; - VLOG(1) << "\tright context " << right_context_; - VLOG(1) << "\tsos " << sos_; - VLOG(1) << "\teos " << eos_; - VLOG(1) << "\tis bidirectional decoder " << is_bidirectional_decoder_; -} - -TorchAsrModel::TorchAsrModel(const TorchAsrModel& other) { - // 1. Init the model info - right_context_ = other.right_context_; - subsampling_rate_ = other.subsampling_rate_; - sos_ = other.sos_; - eos_ = other.eos_; - is_bidirectional_decoder_ = other.is_bidirectional_decoder_; - chunk_size_ = other.chunk_size_; - num_left_chunks_ = other.num_left_chunks_; - offset_ = other.offset_; - // 2. Model copy, just copy the model ptr since: - // PyTorch allows using multiple CPU threads during TorchScript model - // inference, please see https://pytorch.org/docs/stable/notes/cpu_ - // threading_torchscript_inference.html - model_ = other.model_; - - // NOTE(Binbin Zhang): - // inner states for forward are not copied here. -} - -std::shared_ptr TorchAsrModel::Copy() const { - auto asr_model = std::make_shared(*this); - // Reset the inner states for new decoding - asr_model->Reset(); - return asr_model; -} - -void TorchAsrModel::Reset() { - offset_ = 0; - att_cache_ = std::move(torch::zeros({0, 0, 0, 0})); - cnn_cache_ = std::move(torch::zeros({0, 0, 0, 0})); - encoder_outs_.clear(); - cached_feature_.clear(); -} - -void TorchAsrModel::ForwardEncoderFunc( - const std::vector>& chunk_feats, - std::vector>* out_prob) { - // 1. Prepare libtorch required data, splice cached_feature_ and chunk_feats - // The first dimension is for batchsize, which is 1. 
- int num_frames = cached_feature_.size() + chunk_feats.size(); - const int feature_dim = chunk_feats[0].size(); - torch::Tensor feats = - torch::zeros({1, num_frames, feature_dim}, torch::kFloat); - for (size_t i = 0; i < cached_feature_.size(); ++i) { - torch::Tensor row = - torch::from_blob(const_cast(cached_feature_[i].data()), - {feature_dim}, torch::kFloat) - .clone(); - feats[0][i] = std::move(row); - } - for (size_t i = 0; i < chunk_feats.size(); ++i) { - torch::Tensor row = - torch::from_blob(const_cast(chunk_feats[i].data()), - {feature_dim}, torch::kFloat) - .clone(); - feats[0][cached_feature_.size() + i] = std::move(row); - } - - // 2. Encoder chunk forward -#ifdef USE_GPU - feats = feats.to(at::kCUDA); - att_cache_ = att_cache_.to(at::kCUDA); - cnn_cache_ = cnn_cache_.to(at::kCUDA); -#endif - int required_cache_size = chunk_size_ * num_left_chunks_; - torch::NoGradGuard no_grad; - std::vector inputs = {feats, offset_, required_cache_size, - att_cache_, cnn_cache_}; - - // Refer interfaces in wenet/transformer/asr_model.py - auto outputs = - model_->get_method("forward_encoder_chunk")(inputs).toTuple()->elements(); - CHECK_EQ(outputs.size(), 3); -#ifdef USE_GPU - torch::Tensor chunk_out = outputs[0].toTensor().to(at::kCPU); - att_cache_ = outputs[1].toTensor().to(at::kCPU); - cnn_cache_ = outputs[2].toTensor().to(at::kCPU); -#else - torch::Tensor chunk_out = outputs[0].toTensor(); - att_cache_ = outputs[1].toTensor(); - cnn_cache_ = outputs[2].toTensor(); -#endif - offset_ += chunk_out.size(1); - - // The first dimension of returned value is for batchsize, which is 1 -#ifdef USE_GPU - chunk_out = chunk_out.to(at::kCUDA); - torch::Tensor ctc_log_probs = - model_->run_method("ctc_activation", chunk_out).toTensor(); - ctc_log_probs = ctc_log_probs.to(at::kCPU)[0]; - encoder_outs_.push_back(std::move(chunk_out.to(at::kCPU))); -#else - torch::Tensor ctc_log_probs = - model_->run_method("ctc_activation", chunk_out).toTensor()[0]; - encoder_outs_.push_back(std::move(chunk_out)); -#endif - - // Copy to output - int num_outputs = ctc_log_probs.size(0); - int output_dim = ctc_log_probs.size(1); - out_prob->resize(num_outputs); - for (int i = 0; i < num_outputs; i++) { - (*out_prob)[i].resize(output_dim); - memcpy((*out_prob)[i].data(), ctc_log_probs[i].data_ptr(), - sizeof(float) * output_dim); - } -} - -float TorchAsrModel::ComputeAttentionScore(const torch::Tensor& prob, - const std::vector& hyp, - int eos) { - float score = 0.0f; - auto accessor = prob.accessor(); - for (size_t j = 0; j < hyp.size(); ++j) { - score += accessor[j][hyp[j]]; - } - score += accessor[hyp.size()][eos]; - return score; -} - -void TorchAsrModel::AttentionRescoring( - const std::vector>& hyps, float reverse_weight, - std::vector* rescoring_score) { - CHECK(rescoring_score != nullptr); - int num_hyps = hyps.size(); - rescoring_score->resize(num_hyps, 0.0f); - - if (num_hyps == 0) { - return; - } - // No encoder output - if (encoder_outs_.size() == 0) { - return; - } - - torch::NoGradGuard no_grad; - // Step 1: Prepare input for libtorch - torch::Tensor hyps_length = torch::zeros({num_hyps}, torch::kLong); - int max_hyps_len = 0; - for (size_t i = 0; i < num_hyps; ++i) { - int length = hyps[i].size() + 1; - max_hyps_len = std::max(length, max_hyps_len); - hyps_length[i] = static_cast(length); - } - torch::Tensor hyps_tensor = - torch::zeros({num_hyps, max_hyps_len}, torch::kLong); - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - hyps_tensor[i][0] = sos_; - for (size_t j = 0; j < 
hyp.size(); ++j) { - hyps_tensor[i][j + 1] = hyp[j]; - } - } - - // Step 2: Forward attention decoder by hyps and corresponding encoder_outs_ - torch::Tensor encoder_out = torch::cat(encoder_outs_, 1); -#ifdef USE_GPU - hyps_tensor = hyps_tensor.to(at::kCUDA); - hyps_length = hyps_length.to(at::kCUDA); - encoder_out = encoder_out.to(at::kCUDA); -#endif - auto outputs = model_ - ->run_method("forward_attention_decoder", hyps_tensor, - hyps_length, encoder_out, reverse_weight) - .toTuple() - ->elements(); -#ifdef USE_GPU - auto probs = outputs[0].toTensor().to(at::kCPU); - auto r_probs = outputs[1].toTensor().to(at::kCPU); -#else - auto probs = outputs[0].toTensor(); - auto r_probs = outputs[1].toTensor(); -#endif - CHECK_EQ(probs.size(0), num_hyps); - CHECK_EQ(probs.size(1), max_hyps_len); - - // Step 3: Compute rescoring score - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - float score = 0.0f; - // left-to-right decoder score - score = ComputeAttentionScore(probs[i], hyp, eos_); - // Optional: Used for right to left score - float r_score = 0.0f; - if (is_bidirectional_decoder_ && reverse_weight > 0) { - // right-to-left score - CHECK_EQ(r_probs.size(0), num_hyps); - CHECK_EQ(r_probs.size(1), max_hyps_len); - std::vector r_hyp(hyp.size()); - std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin()); - // right to left decoder score - r_score = ComputeAttentionScore(r_probs[i], r_hyp, eos_); - } - - // combined left-to-right and right-to-left score - (*rescoring_score)[i] = - score * (1 - reverse_weight) + r_score * reverse_weight; - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/torch_asr_model.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/torch_asr_model.h deleted file mode 100644 index a3cebe08798f1cad60ca4cd73c7b2488173b6114..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/decoder/torch_asr_model.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
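The `torch_asr_model.cc` file deleted above loads a TorchScript export, queries scalar attributes such as `sos_symbol` via `run_method`, and runs inference under a `NoGradGuard`. A minimal, self-contained LibTorch sketch of that loading pattern; `model.pt` is a placeholder path and the sketch assumes the exported module has a plain `forward` taking one tensor:

```cpp
#include <iostream>
#include <vector>

#include "torch/script.h"

int main() {
  // Load a TorchScript module exported with torch.jit.script/trace.
  torch::jit::script::Module module = torch::jit::load("model.pt");
  module.eval();

  // Disable autograd bookkeeping for inference, as the deleted code does.
  torch::NoGradGuard no_grad;

  // Run an exported method; here a plain forward on a dummy [1, 80] input.
  std::vector<torch::jit::IValue> inputs;
  inputs.push_back(torch::randn({1, 80}));
  torch::jit::IValue out = module.forward(inputs);

  if (out.isTensor()) {
    std::cout << "output shape: " << out.toTensor().sizes() << std::endl;
  }
  return 0;
}
```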
- -#ifndef DECODER_TORCH_ASR_MODEL_H_ -#define DECODER_TORCH_ASR_MODEL_H_ - -#include -#include -#include - -#include "torch/script.h" -#ifndef IOS -#include "torch/torch.h" -#endif - -#include "decoder/asr_model.h" -#include "utils/utils.h" - -namespace wenet { - -class TorchAsrModel : public AsrModel { - public: -#ifndef IOS - static void InitEngineThreads(int num_threads = 1); -#endif - - public: - using TorchModule = torch::jit::script::Module; - TorchAsrModel() = default; - TorchAsrModel(const TorchAsrModel& other); - void Read(const std::string& model_path); - std::shared_ptr torch_model() const { return model_; } - void Reset() override; - void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) override; - std::shared_ptr Copy() const override; - - protected: - void ForwardEncoderFunc(const std::vector>& chunk_feats, - std::vector>* ctc_prob) override; - - float ComputeAttentionScore(const torch::Tensor& prob, - const std::vector& hyp, int eos); - - private: - std::shared_ptr model_ = nullptr; - std::vector encoder_outs_; - // transformer/conformer attention cache - torch::Tensor att_cache_ = torch::zeros({0, 0, 0, 0}); - // conformer-only conv_module cache - torch::Tensor cnn_cache_ = torch::zeros({0, 0, 0, 0}); -}; - -} // namespace wenet - -#endif // DECODER_TORCH_ASR_MODEL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/frontend/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/frontend/CMakeLists.txt deleted file mode 100644 index 78872257e43bb9a6ffcedaae977bf0173817ae50..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/frontend/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -add_library(frontend STATIC - feature_pipeline.cc - fft.cc -) -target_link_libraries(frontend PUBLIC utils) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/frontend/fbank.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/frontend/fbank.h deleted file mode 100644 index 5a650dc035b8e244388cc1f2e0b9512654de7fda..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/frontend/fbank.h +++ /dev/null @@ -1,218 +0,0 @@ -// Copyright (c) 2017 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
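The `fbank.h` header deleted just below builds Kaldi-style mel filterbanks: frequencies are mapped with MelScale(f) = 1127·ln(1 + f/700), triangular filters are spaced evenly on that scale, and each frame is weighted with a "povey" window, (0.5 − 0.5·cos(2πi/(N−1)))^0.85. A small standalone check of those formulas, using a 25 ms frame at 16 kHz as in the deleted code:

```cpp
#include <cmath>
#include <cstdio>

const double kPi = 3.14159265358979323846;

// Same mel mapping as the deleted Fbank class.
float MelScale(float freq) { return 1127.0f * std::log(1.0f + freq / 700.0f); }
float InverseMelScale(float mel) { return 700.0f * (std::exp(mel / 1127.0f) - 1.0f); }

int main() {
  // Round-trip a few frequencies through the mel scale.
  const float freqs[] = {100.0f, 1000.0f, 4000.0f, 8000.0f};
  for (float f : freqs) {
    float mel = MelScale(f);
    std::printf("%.0f Hz -> %.2f mel -> %.2f Hz\n", f, mel, InverseMelScale(mel));
  }

  // Povey window for a 25 ms frame at 16 kHz (400 samples), as in fbank.h.
  const int frame_length = 400;
  const double a = 2.0 * kPi / (frame_length - 1);
  double w_mid = std::pow(0.5 - 0.5 * std::cos(a * (frame_length / 2)), 0.85);
  std::printf("povey window value at frame centre: %.4f\n", w_mid);
  return 0;
}
```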
- -#ifndef FRONTEND_FBANK_H_ -#define FRONTEND_FBANK_H_ - -#include -#include -#include -#include -#include - -#include "frontend/fft.h" -#include "utils/log.h" - -namespace wenet { - -// This code is based on kaldi Fbank implementation, please see -// https://github.com/kaldi-asr/kaldi/blob/master/src/feat/feature-fbank.cc -class Fbank { - public: - Fbank(int num_bins, int sample_rate, int frame_length, int frame_shift) - : num_bins_(num_bins), - sample_rate_(sample_rate), - frame_length_(frame_length), - frame_shift_(frame_shift), - use_log_(true), - remove_dc_offset_(true), - generator_(0), - distribution_(0, 1.0), - dither_(0.0) { - fft_points_ = UpperPowerOfTwo(frame_length_); - // generate bit reversal table and trigonometric function table - const int fft_points_4 = fft_points_ / 4; - bitrev_.resize(fft_points_); - sintbl_.resize(fft_points_ + fft_points_4); - make_sintbl(fft_points_, sintbl_.data()); - make_bitrev(fft_points_, bitrev_.data()); - - int num_fft_bins = fft_points_ / 2; - float fft_bin_width = static_cast(sample_rate_) / fft_points_; - int low_freq = 20, high_freq = sample_rate_ / 2; - float mel_low_freq = MelScale(low_freq); - float mel_high_freq = MelScale(high_freq); - float mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1); - bins_.resize(num_bins_); - center_freqs_.resize(num_bins_); - for (int bin = 0; bin < num_bins; ++bin) { - float left_mel = mel_low_freq + bin * mel_freq_delta, - center_mel = mel_low_freq + (bin + 1) * mel_freq_delta, - right_mel = mel_low_freq + (bin + 2) * mel_freq_delta; - center_freqs_[bin] = InverseMelScale(center_mel); - std::vector this_bin(num_fft_bins); - int first_index = -1, last_index = -1; - for (int i = 0; i < num_fft_bins; ++i) { - float freq = (fft_bin_width * i); // Center frequency of this fft - // bin. 
- float mel = MelScale(freq); - if (mel > left_mel && mel < right_mel) { - float weight; - if (mel <= center_mel) - weight = (mel - left_mel) / (center_mel - left_mel); - else - weight = (right_mel - mel) / (right_mel - center_mel); - this_bin[i] = weight; - if (first_index == -1) first_index = i; - last_index = i; - } - } - CHECK(first_index != -1 && last_index >= first_index); - bins_[bin].first = first_index; - int size = last_index + 1 - first_index; - bins_[bin].second.resize(size); - for (int i = 0; i < size; ++i) { - bins_[bin].second[i] = this_bin[first_index + i]; - } - } - - // povey window - povey_window_.resize(frame_length_); - double a = M_2PI / (frame_length - 1); - for (int i = 0; i < frame_length; ++i) { - povey_window_[i] = pow(0.5 - 0.5 * cos(a * i), 0.85); - } - } - - void set_use_log(bool use_log) { use_log_ = use_log; } - - void set_remove_dc_offset(bool remove_dc_offset) { - remove_dc_offset_ = remove_dc_offset; - } - - void set_dither(float dither) { dither_ = dither; } - - int num_bins() const { return num_bins_; } - - static inline float InverseMelScale(float mel_freq) { - return 700.0f * (expf(mel_freq / 1127.0f) - 1.0f); - } - - static inline float MelScale(float freq) { - return 1127.0f * logf(1.0f + freq / 700.0f); - } - - static int UpperPowerOfTwo(int n) { - return static_cast(pow(2, ceil(log(n) / log(2)))); - } - - // pre emphasis - void PreEmphasis(float coeff, std::vector* data) const { - if (coeff == 0.0) return; - for (int i = data->size() - 1; i > 0; i--) - (*data)[i] -= coeff * (*data)[i - 1]; - (*data)[0] -= coeff * (*data)[0]; - } - - // Apply povey window on data in place - void Povey(std::vector* data) const { - CHECK_GE(data->size(), povey_window_.size()); - for (size_t i = 0; i < povey_window_.size(); ++i) { - (*data)[i] *= povey_window_[i]; - } - } - - // Compute fbank feat, return num frames - int Compute(const std::vector& wave, - std::vector>* feat) { - int num_samples = wave.size(); - if (num_samples < frame_length_) return 0; - int num_frames = 1 + ((num_samples - frame_length_) / frame_shift_); - feat->resize(num_frames); - std::vector fft_real(fft_points_, 0), fft_img(fft_points_, 0); - std::vector power(fft_points_ / 2); - for (int i = 0; i < num_frames; ++i) { - std::vector data(wave.data() + i * frame_shift_, - wave.data() + i * frame_shift_ + frame_length_); - // optional add noise - if (dither_ != 0.0) { - for (size_t j = 0; j < data.size(); ++j) - data[j] += dither_ * distribution_(generator_); - } - // optinal remove dc offset - if (remove_dc_offset_) { - float mean = 0.0; - for (size_t j = 0; j < data.size(); ++j) mean += data[j]; - mean /= data.size(); - for (size_t j = 0; j < data.size(); ++j) data[j] -= mean; - } - - PreEmphasis(0.97, &data); - Povey(&data); - // copy data to fft_real - memset(fft_img.data(), 0, sizeof(float) * fft_points_); - memset(fft_real.data() + frame_length_, 0, - sizeof(float) * (fft_points_ - frame_length_)); - memcpy(fft_real.data(), data.data(), sizeof(float) * frame_length_); - fft(bitrev_.data(), sintbl_.data(), fft_real.data(), fft_img.data(), - fft_points_); - // power - for (int j = 0; j < fft_points_ / 2; ++j) { - power[j] = fft_real[j] * fft_real[j] + fft_img[j] * fft_img[j]; - } - - (*feat)[i].resize(num_bins_); - // cepstral coefficients, triangle filter array - for (int j = 0; j < num_bins_; ++j) { - float mel_energy = 0.0; - int s = bins_[j].first; - for (size_t k = 0; k < bins_[j].second.size(); ++k) { - mel_energy += bins_[j].second[k] * power[s + k]; - } - // optional use log - if 
(use_log_) { - if (mel_energy < std::numeric_limits::epsilon()) - mel_energy = std::numeric_limits::epsilon(); - mel_energy = logf(mel_energy); - } - - (*feat)[i][j] = mel_energy; - } - } - return num_frames; - } - - private: - int num_bins_; - int sample_rate_; - int frame_length_, frame_shift_; - int fft_points_; - bool use_log_; - bool remove_dc_offset_; - std::vector center_freqs_; - std::vector>> bins_; - std::vector povey_window_; - std::default_random_engine generator_; - std::normal_distribution distribution_; - float dither_; - - // bit reversal table - std::vector bitrev_; - // trigonometric function table - std::vector sintbl_; -}; - -} // namespace wenet - -#endif // FRONTEND_FBANK_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/frontend/feature_pipeline.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/frontend/feature_pipeline.cc deleted file mode 100644 index ab450b15cd35ebd8101a3bcdec4f963a73bed10c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/frontend/feature_pipeline.cc +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2017 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "frontend/feature_pipeline.h" - -#include -#include - -namespace wenet { - -FeaturePipeline::FeaturePipeline(const FeaturePipelineConfig& config) - : config_(config), - feature_dim_(config.num_bins), - fbank_(config.num_bins, config.sample_rate, config.frame_length, - config.frame_shift), - num_frames_(0), - input_finished_(false) {} - -void FeaturePipeline::AcceptWaveform(const float* pcm, const int size) { - std::vector> feats; - std::vector waves; - waves.insert(waves.end(), remained_wav_.begin(), remained_wav_.end()); - waves.insert(waves.end(), pcm, pcm + size); - int num_frames = fbank_.Compute(waves, &feats); - feature_queue_.Push(std::move(feats)); - num_frames_ += num_frames; - - int left_samples = waves.size() - config_.frame_shift * num_frames; - remained_wav_.resize(left_samples); - std::copy(waves.begin() + config_.frame_shift * num_frames, waves.end(), - remained_wav_.begin()); - // We are still adding wave, notify input is not finished - finish_condition_.notify_one(); -} - -void FeaturePipeline::AcceptWaveform(const int16_t* pcm, const int size) { - auto* float_pcm = new float[size]; - for (size_t i = 0; i < size; i++) { - float_pcm[i] = static_cast(pcm[i]); - } - this->AcceptWaveform(float_pcm, size); - delete[] float_pcm; -} - -void FeaturePipeline::set_input_finished() { - CHECK(!input_finished_); - { - std::lock_guard lock(mutex_); - input_finished_ = true; - } - finish_condition_.notify_one(); -} - -bool FeaturePipeline::ReadOne(std::vector* feat) { - if (!feature_queue_.Empty()) { - *feat = std::move(feature_queue_.Pop()); - return true; - } else { - std::unique_lock lock(mutex_); - while (!input_finished_) { - // This will release the lock and wait for notify_one() - // from AcceptWaveform() or set_input_finished() - 
finish_condition_.wait(lock); - if (!feature_queue_.Empty()) { - *feat = std::move(feature_queue_.Pop()); - return true; - } - } - CHECK(input_finished_); - // Double check queue.empty, see issue#893 for detailed discussions. - if (!feature_queue_.Empty()) { - *feat = std::move(feature_queue_.Pop()); - return true; - } else { - return false; - } - } -} - -bool FeaturePipeline::Read(int num_frames, - std::vector>* feats) { - feats->clear(); - if (feature_queue_.Size() >= num_frames) { - *feats = std::move(feature_queue_.Pop(num_frames)); - return true; - } else { - std::unique_lock lock(mutex_); - while (!input_finished_) { - // This will release the lock and wait for notify_one() - // from AcceptWaveform() or set_input_finished() - finish_condition_.wait(lock); - if (feature_queue_.Size() >= num_frames) { - *feats = std::move(feature_queue_.Pop(num_frames)); - return true; - } - } - CHECK(input_finished_); - // Double check queue.empty, see issue#893 for detailed discussions. - if (feature_queue_.Size() >= num_frames) { - *feats = std::move(feature_queue_.Pop(num_frames)); - return true; - } else { - *feats = std::move(feature_queue_.Pop(feature_queue_.Size())); - return false; - } - } -} - -void FeaturePipeline::Reset() { - input_finished_ = false; - num_frames_ = 0; - remained_wav_.clear(); - feature_queue_.Clear(); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/frontend/feature_pipeline.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/frontend/feature_pipeline.h deleted file mode 100644 index 9918d6b573255795e0e665f0a9598c44be625c19..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/frontend/feature_pipeline.h +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) 2017 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef FRONTEND_FEATURE_PIPELINE_H_ -#define FRONTEND_FEATURE_PIPELINE_H_ - -#include -#include -#include -#include - -#include "frontend/fbank.h" -#include "utils/blocking_queue.h" -#include "utils/log.h" - -namespace wenet { - -struct FeaturePipelineConfig { - int num_bins; - int sample_rate; - int frame_length; - int frame_shift; - FeaturePipelineConfig(int num_bins, int sample_rate) - : num_bins(num_bins), // 80 dim fbank - sample_rate(sample_rate) { // 16k sample rate - frame_length = sample_rate / 1000 * 25; // frame length 25ms - frame_shift = sample_rate / 1000 * 10; // frame shift 10ms - } - - void Info() const { - LOG(INFO) << "feature pipeline config" - << " num_bins " << num_bins << " frame_length " << frame_length - << " frame_shift " << frame_shift; - } -}; - -// Typically, FeaturePipeline is used in two threads: one thread A calls -// AcceptWaveform() to add raw wav data and set_input_finished() to notice -// the end of input wav, another thread B (decoder thread) calls Read() to -// consume features.So a BlockingQueue is used to make this class thread safe. 
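The `FeaturePipelineConfig` and `AcceptWaveform()` code above fix the frame geometry at 25 ms frames with a 10 ms shift and carry any leftover samples into the next call. A tiny self-contained sketch of that arithmetic (synthetic sample count, no audio involved):

```cpp
#include <cstdio>

int main() {
  // Frame geometry from FeaturePipelineConfig: 25 ms frames, 10 ms shift at 16 kHz.
  const int sample_rate = 16000;
  const int frame_length = sample_rate / 1000 * 25;  // 400 samples
  const int frame_shift  = sample_rate / 1000 * 10;  // 160 samples

  // Same frame-count arithmetic as Fbank::Compute() and AcceptWaveform().
  int num_samples = 3200;  // 200 ms of synthetic audio
  int num_frames = num_samples < frame_length
                       ? 0
                       : 1 + (num_samples - frame_length) / frame_shift;
  int leftover = num_samples - frame_shift * num_frames;  // kept for the next call

  std::printf("frames: %d, leftover samples: %d\n", num_frames, leftover);
  return 0;
}
```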
- -// The Read() is designed as a blocking method when there is no feature -// in feature_queue_ and the input is not finished. - -// See bin/decoder_main.cc, websocket/websocket_server.cc and -// decoder/torch_asr_decoder.cc for usage - -class FeaturePipeline { - public: - explicit FeaturePipeline(const FeaturePipelineConfig& config); - - // The feature extraction is done in AcceptWaveform(). - void AcceptWaveform(const float* pcm, const int size); - void AcceptWaveform(const int16_t* pcm, const int size); - - // Current extracted frames number. - int num_frames() const { return num_frames_; } - int feature_dim() const { return feature_dim_; } - const FeaturePipelineConfig& config() const { return config_; } - - // The caller should call this method when speech input is end. - // Never call AcceptWaveform() after calling set_input_finished() ! - void set_input_finished(); - bool input_finished() const { return input_finished_; } - - // Return False if input is finished and no feature could be read. - // Return True if a feature is read. - // This function is a blocking method. It will block the thread when - // there is no feature in feature_queue_ and the input is not finished. - bool ReadOne(std::vector* feat); - - // Read #num_frames frame features. - // Return False if less than #num_frames features are read and the - // input is finished. - // Return True if #num_frames features are read. - // This function is a blocking method when there is no feature - // in feature_queue_ and the input is not finished. - bool Read(int num_frames, std::vector>* feats); - - void Reset(); - bool IsLastFrame(int frame) const { - return input_finished_ && (frame == num_frames_ - 1); - } - - int NumQueuedFrames() const { return feature_queue_.Size(); } - - private: - const FeaturePipelineConfig& config_; - int feature_dim_; - Fbank fbank_; - - BlockingQueue> feature_queue_; - int num_frames_; - bool input_finished_; - - // The feature extraction is done in AcceptWaveform(). - // This waveform sample points are consumed by frame size. - // The residual waveform sample points after framing are - // kept to be used in next AcceptWaveform() calling. - std::vector remained_wav_; - - // Used to block the Read when there is no feature in feature_queue_ - // and the input is not finished. - mutable std::mutex mutex_; - std::condition_variable finish_condition_; -}; - -} // namespace wenet - -#endif // FRONTEND_FEATURE_PIPELINE_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/frontend/fft.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/frontend/fft.cc deleted file mode 100644 index 9e05f854e79ea733d0411045385e924c2670b7f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/frontend/fft.cc +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright (c) 2016 Network -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
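The comment block at the top of the deleted `feature_pipeline.h` describes the intended two-thread usage: one thread feeds audio via `AcceptWaveform()` and `set_input_finished()`, while the decoder thread blocks in `Read()`/`ReadOne()` until features arrive or input ends. A generic condition-variable sketch of that producer/consumer shape, with a plain `std::queue<int>` standing in for the feature queue:

```cpp
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>

std::queue<int> queue_;  // stands in for the feature queue
std::mutex mutex_;
std::condition_variable cond_;
bool finished_ = false;

void Producer() {
  for (int i = 0; i < 5; ++i) {
    {
      std::lock_guard<std::mutex> lock(mutex_);
      queue_.push(i);  // "AcceptWaveform" adds features
    }
    cond_.notify_one();
  }
  {
    std::lock_guard<std::mutex> lock(mutex_);
    finished_ = true;  // "set_input_finished"
  }
  cond_.notify_one();
}

bool ReadOne(int* out) {
  std::unique_lock<std::mutex> lock(mutex_);
  // Block until a feature is available or the input is finished.
  cond_.wait(lock, [] { return !queue_.empty() || finished_; });
  if (queue_.empty()) return false;  // finished and fully drained
  *out = queue_.front();
  queue_.pop();
  return true;
}

int main() {
  std::thread producer(Producer);
  int value;
  while (ReadOne(&value)) {
    std::cout << "consumed feature frame " << value << std::endl;
  }
  producer.join();
  return 0;
}
```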
- - -#include -#include -#include - -#include "frontend/fft.h" - -namespace wenet { - -void make_sintbl(int n, float* sintbl) { - int i, n2, n4, n8; - float c, s, dc, ds, t; - - n2 = n / 2; - n4 = n / 4; - n8 = n / 8; - t = sin(M_PI / n); - dc = 2 * t * t; - ds = sqrt(dc * (2 - dc)); - t = 2 * dc; - c = sintbl[n4] = 1; - s = sintbl[0] = 0; - for (i = 1; i < n8; ++i) { - c -= dc; - dc += t * c; - s += ds; - ds -= t * s; - sintbl[i] = s; - sintbl[n4 - i] = c; - } - if (n8 != 0) sintbl[n8] = sqrt(0.5); - for (i = 0; i < n4; ++i) sintbl[n2 - i] = sintbl[i]; - for (i = 0; i < n2 + n4; ++i) sintbl[i + n2] = -sintbl[i]; -} - -void make_bitrev(int n, int* bitrev) { - int i, j, k, n2; - - n2 = n / 2; - i = j = 0; - for (;;) { - bitrev[i] = j; - if (++i >= n) break; - k = n2; - while (k <= j) { - j -= k; - k /= 2; - } - j += k; - } -} - -// bitrev: bit reversal table -// sintbl: trigonometric function table -// x:real part -// y:image part -// n: fft length -int fft(const int* bitrev, const float* sintbl, float* x, float* y, int n) { - int i, j, k, ik, h, d, k2, n4, inverse; - float t, s, c, dx, dy; - - /* preparation */ - if (n < 0) { - n = -n; - inverse = 1; /* inverse transform */ - } else { - inverse = 0; - } - n4 = n / 4; - if (n == 0) { - return 0; - } - - /* bit reversal */ - for (i = 0; i < n; ++i) { - j = bitrev[i]; - if (i < j) { - t = x[i]; - x[i] = x[j]; - x[j] = t; - t = y[i]; - y[i] = y[j]; - y[j] = t; - } - } - - /* transformation */ - for (k = 1; k < n; k = k2) { - h = 0; - k2 = k + k; - d = n / k2; - for (j = 0; j < k; ++j) { - c = sintbl[h + n4]; - if (inverse) - s = -sintbl[h]; - else - s = sintbl[h]; - for (i = j; i < n; i += k2) { - ik = i + k; - dx = s * y[ik] + c * x[ik]; - dy = c * y[ik] - s * x[ik]; - x[ik] = x[i] - dx; - x[i] += dx; - y[ik] = y[i] - dy; - y[i] += dy; - } - h += d; - } - } - if (inverse) { - /* divide by n in case of the inverse transformation */ - for (i = 0; i < n; ++i) { - x[i] /= n; - y[i] /= n; - } - } - return 0; /* finished successfully */ -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/frontend/fft.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/frontend/fft.h deleted file mode 100644 index 6b92e406c44b4768eaee6e734f55bb39cd9af28b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/frontend/fft.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2016 Network -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
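The deleted `fft.cc` above exposes a table-driven radix-2 FFT: `make_bitrev()` and `make_sintbl()` precompute the bit-reversal and sine tables (sized `n` and `n + n/4`, as `fbank.h` does), and `fft()` then transforms the real/imaginary buffers in place, with a negative `n` requesting the inverse transform. A short driver against those signatures; note it only builds inside the wenet runtime tree that this change removes:

```cpp
#include <cstdio>
#include <vector>

#include "frontend/fft.h"  // deleted by this change; shown for illustration only

int main() {
  const int n = 8;  // FFT length must be a power of two
  std::vector<int> bitrev(n);
  std::vector<float> sintbl(n + n / 4);
  wenet::make_bitrev(n, bitrev.data());
  wenet::make_sintbl(n, sintbl.data());

  // Transform a unit impulse: its spectrum is flat (all ones, zero phase).
  std::vector<float> real(n, 0.0f), imag(n, 0.0f);
  real[0] = 1.0f;
  wenet::fft(bitrev.data(), sintbl.data(), real.data(), imag.data(), n);

  for (int i = 0; i < n; ++i) {
    std::printf("bin %d: %.3f %+.3fi\n", i, real[i], imag[i]);
  }
  return 0;
}
```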
- - -#ifndef FRONTEND_FFT_H_ -#define FRONTEND_FFT_H_ - -#ifndef M_PI -#define M_PI 3.1415926535897932384626433832795 -#endif -#ifndef M_2PI -#define M_2PI 6.283185307179586476925286766559005 -#endif - -namespace wenet { - -// Fast Fourier Transform - -void make_sintbl(int n, float* sintbl); - -void make_bitrev(int n, int* bitrev); - -int fft(const int* bitrev, const float* sintbl, float* x, float* y, int n); - -} // namespace wenet - -#endif // FRONTEND_FFT_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/frontend/wav.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/frontend/wav.h deleted file mode 100644 index 688a049a940ebbdc83f24e59134fff22b7b09bfd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/frontend/wav.h +++ /dev/null @@ -1,241 +0,0 @@ -// Copyright (c) 2016 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef FRONTEND_WAV_H_ -#define FRONTEND_WAV_H_ - -#include -#include -#include -#include -#include - -#include - -#include "utils/log.h" - -namespace wenet { - -struct WavHeader { - char riff[4] = {'R', 'I', 'F', 'F'}; - unsigned int size = 0; - char wav[4] = {'W', 'A', 'V', 'E'}; - char fmt[4] = {'f', 'm', 't', ' '}; - unsigned int fmt_size = 16; - uint16_t format = 1; - uint16_t channels = 0; - unsigned int sample_rate = 0; - unsigned int bytes_per_second = 0; - uint16_t block_size = 0; - uint16_t bit = 0; - char data[4] = {'d', 'a', 't', 'a'}; - unsigned int data_size = 0; - - WavHeader() {} - - WavHeader(int num_samples, int num_channel, int sample_rate, - int bits_per_sample) { - data_size = num_samples * num_channel * (bits_per_sample / 8); - size = sizeof(WavHeader) - 8 + data_size; - channels = num_channel; - this->sample_rate = sample_rate; - bytes_per_second = sample_rate * num_channel * (bits_per_sample / 8); - block_size = num_channel * (bits_per_sample / 8); - bit = bits_per_sample; - } -}; - -class WavReader { - public: - WavReader() : data_(nullptr) {} - explicit WavReader(const std::string& filename) { Open(filename); } - - bool Open(const std::string& filename) { - FILE* fp = fopen(filename.c_str(), "rb"); - if (NULL == fp) { - LOG(WARNING) << "Error in read " << filename; - return false; - } - - WavHeader header; - fread(&header, 1, sizeof(header), fp); - if (header.fmt_size < 16) { - fprintf(stderr, - "WaveData: expect PCM format data " - "to have fmt chunk of at least size 16.\n"); - return false; - } else if (header.fmt_size > 16) { - int offset = 44 - 8 + header.fmt_size - 16; - fseek(fp, offset, SEEK_SET); - fread(header.data, 8, sizeof(char), fp); - } - // check "RIFF" "WAVE" "fmt " "data" - - // Skip any sub-chunks between "fmt" and "data". Usually there will - // be a single "fact" sub chunk, but on Windows there can also be a - // "list" sub chunk. - while (0 != strncmp(header.data, "data", 4)) { - // We will just ignore the data in these chunks. 
- fseek(fp, header.data_size, SEEK_CUR); - // read next sub chunk - fread(header.data, 8, sizeof(char), fp); - } - - num_channel_ = header.channels; - sample_rate_ = header.sample_rate; - bits_per_sample_ = header.bit; - int num_data = header.data_size / (bits_per_sample_ / 8); - data_ = new float[num_data]; - num_samples_ = num_data / num_channel_; - - for (int i = 0; i < num_data; ++i) { - switch (bits_per_sample_) { - case 8: { - char sample; - fread(&sample, 1, sizeof(char), fp); - data_[i] = static_cast(sample); - break; - } - case 16: { - int16_t sample; - fread(&sample, 1, sizeof(int16_t), fp); - data_[i] = static_cast(sample); - break; - } - case 32: { - int sample; - fread(&sample, 1, sizeof(int), fp); - data_[i] = static_cast(sample); - break; - } - default: - fprintf(stderr, "unsupported quantization bits"); - exit(1); - } - } - fclose(fp); - return true; - } - - int num_channel() const { return num_channel_; } - int sample_rate() const { return sample_rate_; } - int bits_per_sample() const { return bits_per_sample_; } - int num_samples() const { return num_samples_; } - - ~WavReader() { - delete[] data_; - } - - const float* data() const { return data_; } - - private: - int num_channel_; - int sample_rate_; - int bits_per_sample_; - int num_samples_; // sample points per channel - float* data_; -}; - -class WavWriter { - public: - WavWriter(const float* data, int num_samples, int num_channel, - int sample_rate, int bits_per_sample) - : data_(data), - num_samples_(num_samples), - num_channel_(num_channel), - sample_rate_(sample_rate), - bits_per_sample_(bits_per_sample) {} - - void Write(const std::string& filename) { - FILE* fp = fopen(filename.c_str(), "wb"); - WavHeader header(num_samples_, num_channel_, sample_rate_, - bits_per_sample_); - fwrite(&header, 1, sizeof(header), fp); - - for (int i = 0; i < num_samples_; ++i) { - for (int j = 0; j < num_channel_; ++j) { - switch (bits_per_sample_) { - case 8: { - char sample = static_cast(data_[i * num_channel_ + j]); - fwrite(&sample, 1, sizeof(sample), fp); - break; - } - case 16: { - int16_t sample = static_cast(data_[i * num_channel_ + j]); - fwrite(&sample, 1, sizeof(sample), fp); - break; - } - case 32: { - int sample = static_cast(data_[i * num_channel_ + j]); - fwrite(&sample, 1, sizeof(sample), fp); - break; - } - } - } - } - fclose(fp); - } - - private: - const float* data_; - int num_samples_; // total float points in data_ - int num_channel_; - int sample_rate_; - int bits_per_sample_; -}; - -class StreamWavWriter { - public: - StreamWavWriter(int num_channel, int sample_rate, int bits_per_sample) - : num_channel_(num_channel), - sample_rate_(sample_rate), - bits_per_sample_(bits_per_sample), - total_num_samples_(0) {} - - StreamWavWriter(const std::string& filename, int num_channel, - int sample_rate, int bits_per_sample) - : StreamWavWriter(num_channel, sample_rate, bits_per_sample) { - Open(filename); - } - - void Open(const std::string& filename) { - fp_ = fopen(filename.c_str(), "wb"); - fseek(fp_, sizeof(WavHeader), SEEK_SET); - } - - void Write(const int16_t* sample_data, size_t num_samples) { - fwrite(sample_data, sizeof(int16_t), num_samples, fp_); - total_num_samples_ += num_samples; - } - - void Close() { - WavHeader header(total_num_samples_, num_channel_, sample_rate_, - bits_per_sample_); - fseek(fp_, 0L, SEEK_SET); - fwrite(&header, 1, sizeof(header), fp_); - fclose(fp_); - } - - private: - FILE* fp_; - int num_channel_; - int sample_rate_; - int bits_per_sample_; - size_t total_num_samples_; -}; - -} 
// namespace wenet - -#endif // FRONTEND_WAV_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/CMakeLists.txt deleted file mode 100644 index b072309e44b90dcee44ea31e9bcbc1741e73f151..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/CMakeLists.txt +++ /dev/null @@ -1,54 +0,0 @@ -cmake_minimum_required(VERSION 3.10 FATAL_ERROR) - -project(kaldi) - -# include_directories() is called in the root CMakeLists.txt - -add_library(kaldi-util - base/kaldi-error.cc - base/kaldi-math.cc - util/kaldi-io.cc - util/parse-options.cc - util/simple-io-funcs.cc - util/text-utils.cc -) -target_link_libraries(kaldi-util PUBLIC utils) - -add_library(kaldi-decoder - lat/determinize-lattice-pruned.cc - lat/lattice-functions.cc - decoder/lattice-faster-decoder.cc - decoder/lattice-faster-online-decoder.cc -) -target_link_libraries(kaldi-decoder PUBLIC kaldi-util) - -if(GRAPH_TOOLS) - # Arpa binary - add_executable(arpa2fst - lm/arpa-file-parser.cc - lm/arpa-lm-compiler.cc - lmbin/arpa2fst.cc - ) - target_link_libraries(arpa2fst PUBLIC kaldi-util) - - # FST tools binary - set(FST_BINS - fstaddselfloops - fstdeterminizestar - fstisstochastic - fstminimizeencoded - fsttablecompose - ) - - if(NOT MSVC) - # dl is for dynamic linking, otherwise there is a linking error on linux - link_libraries(dl) - endif() - foreach(name IN LISTS FST_BINS) - add_executable(${name} - fstbin/${name}.cc - fstext/kaldi-fst-io.cc - ) - target_link_libraries(${name} PUBLIC kaldi-util) - endforeach() -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/README.md b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/README.md deleted file mode 100644 index 4eb9c9173b747686f00b658afc5e1e0dfdc17e68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/README.md +++ /dev/null @@ -1,21 +0,0 @@ -We use Kaldi decoder to implement TLG based language model integration, -so we copied related files to this directory. -The main changes are: - -1. To minimize the change, we use the same directories tree as Kaldi. - -2. We replace Kaldi log system with glog in the following way. - -``` c++ -#define KALDI_WARN \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_WARNING).stream() -#define KALDI_ERR \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_ERROR).stream() -#define KALDI_INFO \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_INFO).stream() -#define KALDI_VLOG(v) VLOG(v) - -#define KALDI_ASSERT(condition) CHECK(condition) -``` - -3. We lint all the files to satisfy the lint in WeNet. 
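With the glog-backed macro definitions from the Kaldi README above in place, the copied Kaldi sources log exactly like ordinary glog streams. A minimal hedged example of those macros in use (it assumes glog is available and linked; the `beam` value is arbitrary):

```cpp
#include "glog/logging.h"

// Same mapping as the README above.
#define KALDI_WARN \
  google::LogMessage(__FILE__, __LINE__, google::GLOG_WARNING).stream()
#define KALDI_ASSERT(condition) CHECK(condition)

int main(int argc, char* argv[]) {
  google::InitGoogleLogging(argv[0]);
  FLAGS_logtostderr = true;  // keep the demo output on stderr

  int beam = 16;
  KALDI_ASSERT(beam > 0);            // expands to glog's CHECK(...)
  KALDI_WARN << "beam is " << beam;  // expands to a glog WARNING stream
  return 0;
}
```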
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/io-funcs-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/io-funcs-inl.h deleted file mode 100644 index 9397400833676b323492321183c989cec2f41c3f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/io-funcs-inl.h +++ /dev/null @@ -1,329 +0,0 @@ -// base/io-funcs-inl.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University; -// Jan Silovsky; Yanmin Qian; -// Johns Hopkins University (Author: Daniel Povey) -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_IO_FUNCS_INL_H_ -#define KALDI_BASE_IO_FUNCS_INL_H_ 1 - -// Do not include this file directly. It is included by base/io-funcs.h - -#include -#include -#include - -namespace kaldi { - -// Template that covers integers. -template -void WriteBasicType(std::ostream &os, bool binary, T t) { - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - char len_c = (std::numeric_limits::is_signed ? 1 : -1) * - static_cast(sizeof(t)); - os.put(len_c); - os.write(reinterpret_cast(&t), sizeof(t)); - } else { - if (sizeof(t) == 1) - os << static_cast(t) << " "; - else - os << t << " "; - } - if (os.fail()) { - KALDI_ERR << "Write failure in WriteBasicType."; - } -} - -// Template that covers integers. -template -inline void ReadBasicType(std::istream &is, bool binary, T *t) { - KALDI_PARANOID_ASSERT(t != NULL); - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - int len_c_in = is.get(); - if (len_c_in == -1) - KALDI_ERR << "ReadBasicType: encountered end of stream."; - char len_c = static_cast(len_c_in), - len_c_expected = (std::numeric_limits::is_signed ? 1 : -1) * - static_cast(sizeof(*t)); - if (len_c != len_c_expected) { - KALDI_ERR << "ReadBasicType: did not get expected integer type, " - << static_cast(len_c) << " vs. " - << static_cast(len_c_expected) - << ". You can change this code to successfully" - << " read it later, if needed."; - // insert code here to read "wrong" type. Might have a switch statement. - } - is.read(reinterpret_cast(t), sizeof(*t)); - } else { - if (sizeof(*t) == 1) { - int16 i; - is >> i; - *t = i; - } else { - is >> *t; - } - } - if (is.fail()) { - KALDI_ERR << "Read failure in ReadBasicType, file position is " - << is.tellg() << ", next char is " << is.peek(); - } -} - -// Template that covers integers. -template -inline void WriteIntegerPairVector(std::ostream &os, bool binary, - const std::vector > &v) { - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - char sz = sizeof(T); // this is currently just a check. 
- os.write(&sz, 1); - int32 vecsz = static_cast(v.size()); - KALDI_ASSERT((size_t)vecsz == v.size()); - os.write(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (vecsz != 0) { - os.write(reinterpret_cast(&(v[0])), sizeof(T) * vecsz * 2); - } - } else { - // focus here is on prettiness of text form rather than - // efficiency of reading-in. - // reading-in is dominated by low-level operations anyway: - // for efficiency use binary. - os << "[ "; - typename std::vector >::const_iterator iter = v.begin(), - end = v.end(); - for (; iter != end; ++iter) { - if (sizeof(T) == 1) - os << static_cast(iter->first) << ',' - << static_cast(iter->second) << ' '; - else - os << iter->first << ',' << iter->second << ' '; - } - os << "]\n"; - } - if (os.fail()) { - KALDI_ERR << "Write failure in WriteIntegerPairVector."; - } -} - -// Template that covers integers. -template -inline void ReadIntegerPairVector(std::istream &is, bool binary, - std::vector > *v) { - KALDI_ASSERT_IS_INTEGER_TYPE(T); - KALDI_ASSERT(v != NULL); - if (binary) { - int sz = is.peek(); - if (sz == sizeof(T)) { - is.get(); - } else { // this is currently just a check. - KALDI_ERR << "ReadIntegerPairVector: expected to see type of size " - << sizeof(T) << ", saw instead " << sz << ", at file position " - << is.tellg(); - } - int32 vecsz; - is.read(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (is.fail() || vecsz < 0) goto bad; - v->resize(vecsz); - if (vecsz > 0) { - is.read(reinterpret_cast(&((*v)[0])), sizeof(T) * vecsz * 2); - } - } else { - std::vector > tmp_v; // use temporary so v doesn't use - // extra memory due to resizing. - is >> std::ws; - if (is.peek() != static_cast('[')) { - KALDI_ERR << "ReadIntegerPairVector: expected to see [, saw " << is.peek() - << ", at file position " << is.tellg(); - } - is.get(); // consume the '['. - is >> std::ws; // consume whitespace. - while (is.peek() != static_cast(']')) { - if (sizeof(T) == 1) { // read/write chars as numbers. - int16 next_t1, next_t2; - is >> next_t1; - if (is.fail()) goto bad; - if (is.peek() != static_cast(',')) - KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " - << is.peek() << ", at file position " << is.tellg(); - is.get(); // consume the ','. - is >> next_t2 >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back(std::make_pair((T)next_t1, (T)next_t2)); - } else { - T next_t1, next_t2; - is >> next_t1; - if (is.fail()) goto bad; - if (is.peek() != static_cast(',')) - KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " - << is.peek() << ", at file position " << is.tellg(); - is.get(); // consume the ','. - is >> next_t2 >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back(std::pair(next_t1, next_t2)); - } - } - is.get(); // get the final ']'. - *v = tmp_v; // could use std::swap to use less temporary memory, but this - // uses less permanent memory. - } - if (!is.fail()) return; -bad: - KALDI_ERR << "ReadIntegerPairVector: read failure at file position " - << is.tellg(); -} - -template -inline void WriteIntegerVector(std::ostream &os, bool binary, - const std::vector &v) { - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - char sz = sizeof(T); // this is currently just a check. 
- os.write(&sz, 1); - int32 vecsz = static_cast(v.size()); - KALDI_ASSERT((size_t)vecsz == v.size()); - os.write(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (vecsz != 0) { - os.write(reinterpret_cast(&(v[0])), sizeof(T) * vecsz); - } - } else { - // focus here is on prettiness of text form rather than - // efficiency of reading-in. - // reading-in is dominated by low-level operations anyway: - // for efficiency use binary. - os << "[ "; - typename std::vector::const_iterator iter = v.begin(), end = v.end(); - for (; iter != end; ++iter) { - if (sizeof(T) == 1) - os << static_cast(*iter) << " "; - else - os << *iter << " "; - } - os << "]\n"; - } - if (os.fail()) { - KALDI_ERR << "Write failure in WriteIntegerVector."; - } -} - -template -inline void ReadIntegerVector(std::istream &is, bool binary, - std::vector *v) { - KALDI_ASSERT_IS_INTEGER_TYPE(T); - KALDI_ASSERT(v != NULL); - if (binary) { - int sz = is.peek(); - if (sz == sizeof(T)) { - is.get(); - } else { // this is currently just a check. - KALDI_ERR << "ReadIntegerVector: expected to see type of size " - << sizeof(T) << ", saw instead " << sz << ", at file position " - << is.tellg(); - } - int32 vecsz; - is.read(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (is.fail() || vecsz < 0) goto bad; - v->resize(vecsz); - if (vecsz > 0) { - is.read(reinterpret_cast(&((*v)[0])), sizeof(T) * vecsz); - } - } else { - std::vector tmp_v; // use temporary so v doesn't use extra memory - // due to resizing. - is >> std::ws; - if (is.peek() != static_cast('[')) { - KALDI_ERR << "ReadIntegerVector: expected to see [, saw " << is.peek() - << ", at file position " << is.tellg(); - } - is.get(); // consume the '['. - is >> std::ws; // consume whitespace. - while (is.peek() != static_cast(']')) { - if (sizeof(T) == 1) { // read/write chars as numbers. - int16 next_t; - is >> next_t >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back((T)next_t); - } else { - T next_t; - is >> next_t >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back(next_t); - } - } - is.get(); // get the final ']'. - *v = tmp_v; // could use std::swap to use less temporary memory, but this - // uses less permanent memory. - } - if (!is.fail()) return; -bad: - KALDI_ERR << "ReadIntegerVector: read failure at file position " - << is.tellg(); -} - -// Initialize an opened stream for writing by writing an optional binary -// header and modifying the floating-point precision. -inline void InitKaldiOutputStream(std::ostream &os, bool binary) { - // This does not throw exceptions (does not check for errors). - if (binary) { - os.put('\0'); - os.put('B'); - } - // Note, in non-binary mode we may at some point want to mess with - // the precision a bit. - // 7 is a bit more than the precision of float.. - if (os.precision() < 7) os.precision(7); -} - -/// Initialize an opened stream for reading by detecting the binary header and -// setting the "binary" value appropriately. -inline bool InitKaldiInputStream(std::istream &is, bool *binary) { - // Sets the 'binary' variable. - // Throws exception in the very unusual situation that stream - // starts with '\0' but not then 'B'. - - if (is.peek() == '\0') { // seems to be binary - is.get(); - if (is.peek() != 'B') { - return false; - } - is.get(); - *binary = true; - return true; - } else { - *binary = false; - return true; - } -} - -} // end namespace kaldi. 
- -#endif // KALDI_BASE_IO_FUNCS_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/io-funcs.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/io-funcs.cc deleted file mode 100644 index bd6c350780d1096ff8c452fd00864aa07a30ac65..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/io-funcs.cc +++ /dev/null @@ -1,215 +0,0 @@ -// base/io-funcs.cc - -// Copyright 2009-2011 Microsoft Corporation; Saarland University - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/io-funcs.h" -#include "base/kaldi-math.h" - -namespace kaldi { - -template <> -void WriteBasicType(std::ostream &os, bool binary, bool b) { - os << (b ? "T" : "F"); - if (!binary) os << " "; - if (os.fail()) KALDI_ERR << "Write failure in WriteBasicType"; -} - -template <> -void ReadBasicType(std::istream &is, bool binary, bool *b) { - KALDI_PARANOID_ASSERT(b != NULL); - if (!binary) is >> std::ws; // eat up whitespace. - char c = is.peek(); - if (c == 'T') { - *b = true; - is.get(); - } else if (c == 'F') { - *b = false; - is.get(); - } else { - KALDI_ERR << "Read failure in ReadBasicType, file position is " - << is.tellg() << ", next char is " << CharToString(c); - } -} - -template <> -void WriteBasicType(std::ostream &os, bool binary, float f) { - if (binary) { - char c = sizeof(f); - os.put(c); - os.write(reinterpret_cast(&f), sizeof(f)); - } else { - os << f << " "; - } -} - -template <> -void WriteBasicType(std::ostream &os, bool binary, double f) { - if (binary) { - char c = sizeof(f); - os.put(c); - os.write(reinterpret_cast(&f), sizeof(f)); - } else { - os << f << " "; - } -} - -template <> -void ReadBasicType(std::istream &is, bool binary, float *f) { - KALDI_PARANOID_ASSERT(f != NULL); - if (binary) { - double d; - int c = is.peek(); - if (c == sizeof(*f)) { - is.get(); - is.read(reinterpret_cast(f), sizeof(*f)); - } else if (c == sizeof(d)) { - ReadBasicType(is, binary, &d); - *f = d; - } else { - KALDI_ERR << "ReadBasicType: expected float, saw " << is.peek() - << ", at file position " << is.tellg(); - } - } else { - is >> *f; - } - if (is.fail()) { - KALDI_ERR << "ReadBasicType: failed to read, at file position " - << is.tellg(); - } -} - -template <> -void ReadBasicType(std::istream &is, bool binary, double *d) { - KALDI_PARANOID_ASSERT(d != NULL); - if (binary) { - float f; - int c = is.peek(); - if (c == sizeof(*d)) { - is.get(); - is.read(reinterpret_cast(d), sizeof(*d)); - } else if (c == sizeof(f)) { - ReadBasicType(is, binary, &f); - *d = f; - } else { - KALDI_ERR << "ReadBasicType: expected float, saw " << is.peek() - << ", at file position " << is.tellg(); - } - } else { - is >> *d; - } - if (is.fail()) { - KALDI_ERR << "ReadBasicType: failed to read, at file position " - << is.tellg(); - } -} - -void CheckToken(const 
char *token) { - if (*token == '\0') KALDI_ERR << "Token is empty (not a valid token)"; - const char *orig_token = token; - while (*token != '\0') { - if (::isspace(*token)) - KALDI_ERR << "Token is not a valid token (contains space): '" - << orig_token << "'"; - token++; - } -} - -void WriteToken(std::ostream &os, bool binary, const char *token) { - // binary mode is ignored; - // we use space as termination character in either case. - KALDI_ASSERT(token != NULL); - CheckToken(token); // make sure it's valid (can be read back) - os << token << " "; - if (os.fail()) { - KALDI_ERR << "Write failure in WriteToken."; - } -} - -int Peek(std::istream &is, bool binary) { - if (!binary) is >> std::ws; // eat up whitespace. - return is.peek(); -} - -void WriteToken(std::ostream &os, bool binary, const std::string &token) { - WriteToken(os, binary, token.c_str()); -} - -void ReadToken(std::istream &is, bool binary, std::string *str) { - KALDI_ASSERT(str != NULL); - if (!binary) is >> std::ws; // consume whitespace. - is >> *str; - if (is.fail()) { - KALDI_ERR << "ReadToken, failed to read token at file position " - << is.tellg(); - } - if (!isspace(is.peek())) { - KALDI_ERR << "ReadToken, expected space after token, saw instead " - << CharToString(static_cast(is.peek())) - << ", at file position " << is.tellg(); - } - is.get(); // consume the space. -} - -int PeekToken(std::istream &is, bool binary) { - if (!binary) is >> std::ws; // consume whitespace. - bool read_bracket; - if (static_cast(is.peek()) == '<') { - read_bracket = true; - is.get(); - } else { - read_bracket = false; - } - int ans = is.peek(); - if (read_bracket) { - if (!is.unget()) { - // Clear the bad bit. This code can be (and is in fact) reached, since the - // C++ standard does not guarantee that a call to unget() must succeed. - is.clear(); - } - } - return ans; -} - -void ExpectToken(std::istream &is, bool binary, const char *token) { - int pos_at_start = is.tellg(); - KALDI_ASSERT(token != NULL); - CheckToken(token); // make sure it's valid (can be read back) - if (!binary) is >> std::ws; // consume whitespace. - std::string str; - is >> str; - is.get(); // consume the space. - if (is.fail()) { - KALDI_ERR << "Failed to read token [started at file position " - << pos_at_start << "], expected " << token; - } - // The second half of the '&&' expression below is so that if we're expecting - // "", we will accept "Foo>" instead. This is so that the model-reading - // code will tolerate errors in PeekToken where is.unget() failed; search for - // is.clear() in PeekToken() for an explanation. 
- if (strcmp(str.c_str(), token) != 0 && - !(token[0] == '<' && strcmp(str.c_str(), token + 1) == 0)) { - KALDI_ERR << "Expected token \"" << token << "\", got instead \"" << str - << "\"."; - } -} - -void ExpectToken(std::istream &is, bool binary, const std::string &token) { - ExpectToken(is, binary, token.c_str()); -} - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/io-funcs.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/io-funcs.h deleted file mode 100644 index 06ad1e3d2d8dc8385886a7c6653f620642c7c05a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/io-funcs.h +++ /dev/null @@ -1,246 +0,0 @@ -// base/io-funcs.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University; -// Jan Silovsky; Yanmin Qian -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_IO_FUNCS_H_ -#define KALDI_BASE_IO_FUNCS_H_ - -// This header only contains some relatively low-level I/O functions. -// The full Kaldi I/O declarations are in ../util/kaldi-io.h -// and ../util/kaldi-table.h -// They were put in util/ in order to avoid making the Matrix library -// dependent on them. - -#include -#include -#include -#include - -#include "base/io-funcs-inl.h" -#include "base/kaldi-common.h" - -namespace kaldi { - -/* - This comment describes the Kaldi approach to I/O. All objects can be written - and read in two modes: binary and text. In addition we want to make the I/O - work if we redefine the typedef "BaseFloat" between floats and doubles. - We also want to have control over whitespace in text mode without affecting - the meaning of the file, for pretty-printing purposes. - - Errors are handled by throwing a KaldiFatalError exception. - - For integer and floating-point types (and boolean values): - - WriteBasicType(std::ostream &, bool binary, const T&); - ReadBasicType(std::istream &, bool binary, T*); - - and we expect these functions to be defined in such a way that they work when - the type T changes between float and double, so you can read float into double - and vice versa]. Note that for efficiency and space-saving reasons, the - Vector and Matrix classes do not use these functions [but they preserve the - type interchangeability in their own way] - - For a class (or struct) C: - class C { - .. - Write(std::ostream &, bool binary, [possibly extra optional args for - specific classes]) const; Read(std::istream &, bool binary, [possibly extra - optional args for specific classes]); - .. - } - NOTE: The only actual optional args we used are the "add" arguments in - Vector/Matrix classes, which specify whether we should sum the data already - in the class with the data being read. 
- - For types which are typedef's involving stl classes, I/O is as follows: - typedef std::vector > MyTypedefName; - - The user should define something like: - - WriteMyTypedefName(std::ostream &, bool binary, const MyTypedefName &t); - ReadMyTypedefName(std::ostream &, bool binary, MyTypedefName *t); - - The user would have to write these functions. - - For a type std::vector: - - void WriteIntegerVector(std::ostream &os, bool binary, const std::vector - &v); void ReadIntegerVector(std::istream &is, bool binary, std::vector *v); - - For other types, e.g. vectors of pairs, the user should create a routine of - the type WriteMyTypedefName. This is to avoid introducing confusing templated - functions; we could easily create templated functions to handle most of these - cases but they would have to share the same name. - - It also often happens that the user needs to write/read special tokens as part - of a file. These might be class headers, or separators/identifiers in the - class. We provide special functions for manipulating these. These special - tokens must be nonempty and must not contain any whitespace. - - void WriteToken(std::ostream &os, bool binary, const char*); - void WriteToken(std::ostream &os, bool binary, const std::string & token); - int Peek(std::istream &is, bool binary); - void ReadToken(std::istream &is, bool binary, std::string *str); - void PeekToken(std::istream &is, bool binary, std::string *str); - - WriteToken writes the token and one space (whether in binary or text mode). - - Peek returns the first character of the next token, by consuming whitespace - (in text mode) and then returning the peek() character. It returns -1 at EOF; - it doesn't throw. It's useful if a class can have various forms based on - typedefs and virtual classes, and wants to know which version to read. - - ReadToken allows the caller to obtain the next token. PeekToken works just - like ReadToken, but seeks back to the beginning of the token. A subsequent - call to ReadToken will read the same token again. This is useful when - different object types are written to the same file; using PeekToken one can - decide which of the objects to read. - - There is currently no special functionality for writing/reading strings (where - the strings contain data rather than "special tokens" that are whitespace-free - and nonempty). This is because Kaldi is structured in such a way that strings - don't appear, except as OpenFst symbol table entries (and these have their own - format). - - - NOTE: you should not call ReadIntegerType and WriteIntegerType with types, - such as int and size_t, that are machine-independent -- at least not - if you want your file formats to port between machines. Use int32 and - int64 where necessary. There is no way to detect this using compile-time - assertions because C++ only keeps track of the internal representation of - the type. -*/ - -/// \addtogroup io_funcs_basic -/// @{ - -/// WriteBasicType is the name of the write function for bool, integer types, -/// and floating-point types. They all throw on error. -template -void WriteBasicType(std::ostream &os, bool binary, T t); - -/// ReadBasicType is the name of the read function for bool, integer types, -/// and floating-point types. They all throw on error. -template -void ReadBasicType(std::istream &is, bool binary, T *t); - -// Declare specialization for bool. 
-template <> -void WriteBasicType(std::ostream &os, bool binary, bool b); - -template <> -void ReadBasicType(std::istream &is, bool binary, bool *b); - -// Declare specializations for float and double. -template <> -void WriteBasicType(std::ostream &os, bool binary, float f); - -template <> -void WriteBasicType(std::ostream &os, bool binary, double f); - -template <> -void ReadBasicType(std::istream &is, bool binary, float *f); - -template <> -void ReadBasicType(std::istream &is, bool binary, double *f); - -// Define ReadBasicType that accepts an "add" parameter to add to -// the destination. Caution: if used in Read functions, be careful -// to initialize the parameters concerned to zero in the default -// constructor. -template -inline void ReadBasicType(std::istream &is, bool binary, T *t, bool add) { - if (!add) { - ReadBasicType(is, binary, t); - } else { - T tmp = T(0); - ReadBasicType(is, binary, &tmp); - *t += tmp; - } -} - -/// Function for writing STL vectors of integer types. -template -inline void WriteIntegerVector(std::ostream &os, bool binary, - const std::vector &v); - -/// Function for reading STL vector of integer types. -template -inline void ReadIntegerVector(std::istream &is, bool binary, std::vector *v); - -/// Function for writing STL vectors of pairs of integer types. -template -inline void WriteIntegerPairVector(std::ostream &os, bool binary, - const std::vector > &v); - -/// Function for reading STL vector of pairs of integer types. -template -inline void ReadIntegerPairVector(std::istream &is, bool binary, - std::vector > *v); - -/// The WriteToken functions are for writing nonempty sequences of non-space -/// characters. They are not for general strings. -void WriteToken(std::ostream &os, bool binary, const char *token); -void WriteToken(std::ostream &os, bool binary, const std::string &token); - -/// Peek consumes whitespace (if binary == false) and then returns the peek() -/// value of the stream. -int Peek(std::istream &is, bool binary); - -/// ReadToken gets the next token and puts it in str (exception on failure). If -/// PeekToken() had been previously called, it is possible that the stream had -/// failed to unget the starting '<' character. In this case ReadToken() returns -/// the token string without the leading '<'. You must be prepared to handle -/// this case. ExpectToken() handles this internally, and is not affected. -void ReadToken(std::istream &is, bool binary, std::string *token); - -/// PeekToken will return the first character of the next token, or -1 if end of -/// file. It's the same as Peek(), except if the first character is '<' it will -/// skip over it and will return the next character. It will attempt to unget -/// the '<' so the stream is where it was before you did PeekToken(), however, -/// this is not guaranteed (see ReadToken()). -int PeekToken(std::istream &is, bool binary); - -/// ExpectToken tries to read in the given token, and throws an exception -/// on failure. -void ExpectToken(std::istream &is, bool binary, const char *token); -void ExpectToken(std::istream &is, bool binary, const std::string &token); - -/// ExpectPretty attempts to read the text in "token", but only in non-binary -/// mode. Throws exception on failure. It expects an exact match except that -/// arbitrary whitespace matches arbitrary whitespace. 
-void ExpectPretty(std::istream &is, bool binary, const char *token); -void ExpectPretty(std::istream &is, bool binary, const std::string &token); - -/// @} end "addtogroup io_funcs_basic" - -/// InitKaldiOutputStream initializes an opened stream for writing by writing an -/// optional binary header and modifying the floating-point precision; it will -/// typically not be called by users directly. -inline void InitKaldiOutputStream(std::ostream &os, bool binary); - -/// InitKaldiInputStream initializes an opened stream for reading by detecting -/// the binary header and setting the "binary" value appropriately; -/// It will typically not be called by users directly. -inline bool InitKaldiInputStream(std::istream &is, bool *binary); - -} // end namespace kaldi. -#endif // KALDI_BASE_IO_FUNCS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/kaldi-common.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/kaldi-common.h deleted file mode 100644 index eee5f34d7234e7c029e6bb59584d3ee65ff5a875..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/kaldi-common.h +++ /dev/null @@ -1,41 +0,0 @@ -// base/kaldi-common.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_KALDI_COMMON_H_ -#define KALDI_BASE_KALDI_COMMON_H_ 1 - -#include -#include -#include // C string stuff like strcpy -#include -#include -#include -#include -#include -#include -#include - -#include "base/kaldi-utils.h" -#include "base/kaldi-error.h" -#include "base/kaldi-types.h" -// #include "base/io-funcs.h" -#include "base/kaldi-math.h" -// #include "base/timer.h" - -#endif // KALDI_BASE_KALDI_COMMON_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/kaldi-error.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/kaldi-error.cc deleted file mode 100644 index 77edc6af6e56bb8fa3431d519e58fda9ee0bac6a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/kaldi-error.cc +++ /dev/null @@ -1,42 +0,0 @@ -// base/kaldi-error.cc - -// Copyright 2019 LAIX (Yi Sun) -// Copyright 2019 SmartAction LLC (kkm) -// Copyright 2016 Brno University of Technology (author: Karel Vesely) -// Copyright 2009-2011 Microsoft Corporation; Lukas Burget; Ondrej Glembek - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-error.h" - -#include - -namespace kaldi { - -/***** GLOBAL VARIABLES FOR LOGGING *****/ - -int32 g_kaldi_verbose_level = 0; -static std::string program_name; // NOLINT - -void SetProgramName(const char *basename) { - // Using the 'static std::string' for the program name is mostly harmless, - // because (a) Kaldi logging is undefined before main(), and (b) no stdc++ - // string implementation has been found in the wild that would not be just - // an empty string when zero-initialized but not yet constructed. - program_name = basename; -} - -} // namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/kaldi-error.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/kaldi-error.h deleted file mode 100644 index 0f65db372b5f05a8017433eed7c95badc819a0a6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/kaldi-error.h +++ /dev/null @@ -1,57 +0,0 @@ -// base/kaldi-error.h - -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_KALDI_ERROR_H_ -#define KALDI_BASE_KALDI_ERROR_H_ 1 - -#include "utils/log.h" - -namespace kaldi { - -#define KALDI_WARN \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_WARNING).stream() -#define KALDI_ERR \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_ERROR).stream() -#define KALDI_LOG \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_INFO).stream() -#define KALDI_VLOG(v) VLOG(v) - -#define KALDI_ASSERT(condition) CHECK(condition) - - -/***** PROGRAM NAME AND VERBOSITY LEVEL *****/ - -/// Called by ParseOptions to set base name (no directory) of the executing -/// program. The name is printed in logging code along with every message, -/// because in our scripts, we often mix together the stderr of many programs. -/// This function is very thread-unsafe. -void SetProgramName(const char *basename); - -/// This is set by util/parse-options.{h,cc} if you set --verbose=? option. -/// Do not use directly, prefer {Get,Set}VerboseLevel(). -extern int32 g_kaldi_verbose_level; - -/// Get verbosity level, usually set via command line '--verbose=' switch. -inline int32 GetVerboseLevel() { return g_kaldi_verbose_level; } - -/// This should be rarely used, except by programs using Kaldi as library; -/// command-line programs set the verbose level automatically from ParseOptions. 
-inline void SetVerboseLevel(int32 i) { g_kaldi_verbose_level = i; } - -} // namespace kaldi - -#endif // KALDI_BASE_KALDI_ERROR_H_ - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/kaldi-math.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/kaldi-math.cc deleted file mode 100644 index 175d9f49b6c5216645e90e146f4e2eab5572c342..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/kaldi-math.cc +++ /dev/null @@ -1,164 +0,0 @@ -// base/kaldi-math.cc - -// Copyright 2009-2011 Microsoft Corporation; Yanmin Qian; -// Saarland University; Jan Silovsky - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-math.h" -#ifndef _MSC_VER -#include -#include -#endif -#include -#include - -namespace kaldi { -// These routines are tested in matrix/matrix-test.cc - -int32 RoundUpToNearestPowerOfTwo(int32 n) { - KALDI_ASSERT(n > 0); - n--; - n |= n >> 1; - n |= n >> 2; - n |= n >> 4; - n |= n >> 8; - n |= n >> 16; - return n+1; -} - -static std::mutex _RandMutex; - -int Rand(struct RandomState* state) { -#if !defined(_POSIX_THREAD_SAFE_FUNCTIONS) - // On Windows and Cygwin, just call Rand() - return rand(); -#else - if (state) { - return rand_r(&(state->seed)); - } else { - std::lock_guard lock(_RandMutex); - return rand(); - } -#endif -} - -RandomState::RandomState() { - // we initialize it as Rand() + 27437 instead of just Rand(), because on some - // systems, e.g. at the very least Mac OSX Yosemite and later, it seems to be - // the case that rand_r when initialized with rand() will give you the exact - // same sequence of numbers that rand() will give if you keep calling rand() - // after that initial call. This can cause problems with repeated sequences. - // For example if you initialize two RandomState structs one after the other - // without calling rand() in between, they would give you the same sequence - // offset by one (if we didn't have the "+ 27437" in the code). 27437 is just - // a randomly chosen prime number. - seed = unsigned(Rand()) + 27437; -} - -bool WithProb(BaseFloat prob, struct RandomState* state) { - KALDI_ASSERT(prob >= 0 && prob <= 1.1); // prob should be <= 1.0, - // but we allow slightly larger values that could arise from roundoff in - // previous calculations. - KALDI_COMPILE_TIME_ASSERT(RAND_MAX > 128 * 128); - if (prob == 0) { - return false; - } else if (prob == 1.0) { - return true; - } else if (prob * RAND_MAX < 128.0) { - // prob is very small but nonzero, and the "main algorithm" - // wouldn't work that well. So: with probability 1/128, we - // return WithProb (prob * 128), else return false. - if (Rand(state) < RAND_MAX / 128) { // with probability 128... - // Note: we know that prob * 128.0 < 1.0, because - // we asserted RAND_MAX > 128 * 128. 
- return WithProb(prob * 128.0); - } else { - return false; - } - } else { - return (Rand(state) < ((RAND_MAX + static_cast(1.0)) * prob)); - } -} - -int32 RandInt(int32 min_val, int32 max_val, struct RandomState* state) { - // This is not exact. - KALDI_ASSERT(max_val >= min_val); - if (max_val == min_val) return min_val; - -#ifdef _MSC_VER - // RAND_MAX is quite small on Windows -> may need to handle larger numbers. - if (RAND_MAX > (max_val-min_val)*8) { - // *8 to avoid large inaccuracies in probability, from the modulus... - return min_val + - ((unsigned int)Rand(state) % (unsigned int)(max_val+1-min_val)); - } else { - if ((unsigned int)(RAND_MAX*RAND_MAX) > - (unsigned int)((max_val+1-min_val)*8)) { - // *8 to avoid inaccuracies in probability, from the modulus... - return min_val + ( (unsigned int)( (Rand(state)+RAND_MAX*Rand(state))) - % (unsigned int)(max_val+1-min_val)); - } else { - KALDI_ERR << "rand_int failed because we do not support such large " - "random numbers. (Extend this function)."; - } - } -#else - return min_val + - (static_cast(Rand(state)) % static_cast(max_val+1-min_val)); -#endif -} - -// Returns poisson-distributed random number. -// Take care: this takes time proportional -// to lambda. Faster algorithms exist but are more complex. -int32 RandPoisson(float lambda, struct RandomState* state) { - // Knuth's algorithm. - KALDI_ASSERT(lambda >= 0); - float L = expf(-lambda), p = 1.0; - int32 k = 0; - do { - k++; - float u = RandUniform(state); - p *= u; - } while (p > L); - return k-1; -} - -void RandGauss2(float *a, float *b, RandomState *state) { - KALDI_ASSERT(a); - KALDI_ASSERT(b); - float u1 = RandUniform(state); - float u2 = RandUniform(state); - u1 = sqrtf(-2.0f * logf(u1)); - u2 = 2.0f * M_PI * u2; - *a = u1 * cosf(u2); - *b = u1 * sinf(u2); -} - -void RandGauss2(double *a, double *b, RandomState *state) { - KALDI_ASSERT(a); - KALDI_ASSERT(b); - float a_float, b_float; - // Just because we're using doubles doesn't mean we need super-high-quality - // random numbers, so we just use the floating-point version internally. - RandGauss2(&a_float, &b_float, state); - *a = a_float; - *b = b_float; -} - - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/kaldi-math.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/kaldi-math.h deleted file mode 100644 index 93c265ee96e704893da26b9083a44a9e60c6c192..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/kaldi-math.h +++ /dev/null @@ -1,363 +0,0 @@ -// base/kaldi-math.h - -// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; Yanmin Qian; -// Jan Silovsky; Saarland University -// -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef KALDI_BASE_KALDI_MATH_H_ -#define KALDI_BASE_KALDI_MATH_H_ 1 - -#ifdef _MSC_VER -#include -#endif - -#include -#include -#include - -#include "base/kaldi-types.h" -#include "base/kaldi-common.h" - - -#ifndef DBL_EPSILON -#define DBL_EPSILON 2.2204460492503131e-16 -#endif -#ifndef FLT_EPSILON -#define FLT_EPSILON 1.19209290e-7f -#endif - -#ifndef M_PI -#define M_PI 3.1415926535897932384626433832795 -#endif - -#ifndef M_SQRT2 -#define M_SQRT2 1.4142135623730950488016887 -#endif - -#ifndef M_2PI -#define M_2PI 6.283185307179586476925286766559005 -#endif - -#ifndef M_SQRT1_2 -#define M_SQRT1_2 0.7071067811865475244008443621048490 -#endif - -#ifndef M_LOG_2PI -#define M_LOG_2PI 1.8378770664093454835606594728112 -#endif - -#ifndef M_LN2 -#define M_LN2 0.693147180559945309417232121458 -#endif - -#ifndef M_LN10 -#define M_LN10 2.302585092994045684017991454684 -#endif - - -#define KALDI_ISNAN std::isnan -#define KALDI_ISINF std::isinf -#define KALDI_ISFINITE(x) std::isfinite(x) - -#if !defined(KALDI_SQR) -# define KALDI_SQR(x) ((x) * (x)) -#endif - -namespace kaldi { - -#if !defined(_MSC_VER) || (_MSC_VER >= 1900) -inline double Exp(double x) { return exp(x); } -#ifndef KALDI_NO_EXPF -inline float Exp(float x) { return expf(x); } -#else -inline float Exp(float x) { return exp(static_cast(x)); } -#endif // KALDI_NO_EXPF -#else -inline double Exp(double x) { return exp(x); } -#if !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64) -// Microsoft CL v18.0 buggy 64-bit implementation of -// expf() incorrectly returns -inf for exp(-inf). -inline float Exp(float x) { return exp(static_cast(x)); } -#else -inline float Exp(float x) { return expf(x); } -#endif // !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64) -#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900) - -inline double Log(double x) { return log(x); } -inline float Log(float x) { return logf(x); } - -#if !defined(_MSC_VER) || (_MSC_VER >= 1700) -inline double Log1p(double x) { return log1p(x); } -inline float Log1p(float x) { return log1pf(x); } -#else -inline double Log1p(double x) { - const double cutoff = 1.0e-08; - if (x < cutoff) - return x - 0.5 * x * x; - else - return Log(1.0 + x); -} - -inline float Log1p(float x) { - const float cutoff = 1.0e-07; - if (x < cutoff) - return x - 0.5 * x * x; - else - return Log(1.0 + x); -} -#endif - -static const double kMinLogDiffDouble = Log(DBL_EPSILON); // negative! -static const float kMinLogDiffFloat = Log(FLT_EPSILON); // negative! - -// -infinity -const float kLogZeroFloat = -std::numeric_limits::infinity(); -const double kLogZeroDouble = -std::numeric_limits::infinity(); -const BaseFloat kLogZeroBaseFloat = -std::numeric_limits::infinity(); - -// Returns a random integer between 0 and RAND_MAX, inclusive -int Rand(struct RandomState* state = NULL); - -// State for thread-safe random number generator -struct RandomState { - RandomState(); - unsigned seed; -}; - -// Returns a random integer between first and last inclusive. -int32 RandInt(int32 first, int32 last, struct RandomState* state = NULL); - -// Returns true with probability "prob", -bool WithProb(BaseFloat prob, struct RandomState* state = NULL); -// with 0 <= prob <= 1 [we check this]. -// Internally calls Rand(). This function is carefully implemented so -// that it should work even if prob is very small. - -/// Returns a random number strictly between 0 and 1. 
-inline float RandUniform(struct RandomState* state = NULL) { - return static_cast((Rand(state) + 1.0) / (RAND_MAX+2.0)); -} - -inline float RandGauss(struct RandomState* state = NULL) { - return static_cast(sqrtf (-2 * Log(RandUniform(state))) - * cosf(2*M_PI*RandUniform(state))); -} - -// Returns poisson-distributed random number. Uses Knuth's algorithm. -// Take care: this takes time proportional -// to lambda. Faster algorithms exist but are more complex. -int32 RandPoisson(float lambda, struct RandomState* state = NULL); - -// Returns a pair of gaussian random numbers. Uses Box-Muller transform -void RandGauss2(float *a, float *b, RandomState *state = NULL); -void RandGauss2(double *a, double *b, RandomState *state = NULL); - -// Also see Vector::RandCategorical(). - -// This is a randomized pruning mechanism that preserves expectations, -// that we typically use to prune posteriors. -template -inline Float RandPrune(Float post, BaseFloat prune_thresh, - struct RandomState* state = NULL) { - KALDI_ASSERT(prune_thresh >= 0.0); - if (post == 0.0 || std::abs(post) >= prune_thresh) - return post; - return (post >= 0 ? 1.0 : -1.0) * - (RandUniform(state) <= fabs(post)/prune_thresh ? prune_thresh : 0.0); -} - -// returns log(exp(x) + exp(y)). -inline double LogAdd(double x, double y) { - double diff; - - if (x < y) { - diff = x - y; - x = y; - } else { - diff = y - x; - } - // diff is negative. x is now the larger one. - - if (diff >= kMinLogDiffDouble) { - double res; - res = x + Log1p(Exp(diff)); - return res; - } else { - return x; // return the larger one. - } -} - - -// returns log(exp(x) + exp(y)). -inline float LogAdd(float x, float y) { - float diff; - - if (x < y) { - diff = x - y; - x = y; - } else { - diff = y - x; - } - // diff is negative. x is now the larger one. - - if (diff >= kMinLogDiffFloat) { - float res; - res = x + Log1p(Exp(diff)); - return res; - } else { - return x; // return the larger one. - } -} - - -// returns log(exp(x) - exp(y)). -inline double LogSub(double x, double y) { - if (y >= x) { // Throws exception if y>=x. - if (y == x) - return kLogZeroDouble; - else - KALDI_ERR << "Cannot subtract a larger from a smaller number."; - } - - double diff = y - x; // Will be negative. - double res = x + Log(1.0 - Exp(diff)); - - // res might be NAN if diff ~0.0, and 1.0-exp(diff) == 0 to machine precision - if (KALDI_ISNAN(res)) - return kLogZeroDouble; - return res; -} - - -// returns log(exp(x) - exp(y)). -inline float LogSub(float x, float y) { - if (y >= x) { // Throws exception if y>=x. - if (y == x) - return kLogZeroDouble; - else - KALDI_ERR << "Cannot subtract a larger from a smaller number."; - } - - float diff = y - x; // Will be negative. - float res = x + Log(1.0f - Exp(diff)); - - // res might be NAN if diff ~0.0, and 1.0-exp(diff) == 0 to machine precision - if (KALDI_ISNAN(res)) - return kLogZeroFloat; - return res; -} - -/// return abs(a - b) <= relative_tolerance * (abs(a)+abs(b)). -static inline bool ApproxEqual(float a, float b, - float relative_tolerance = 0.001) { - // a==b handles infinities. - if (a == b) return true; - float diff = std::abs(a-b); - if (diff == std::numeric_limits::infinity() - || diff != diff) return false; // diff is +inf or nan. - return (diff <= relative_tolerance*(std::abs(a)+std::abs(b))); -} - -/// assert abs(a - b) <= relative_tolerance * (abs(a)+abs(b)) -static inline void AssertEqual(float a, float b, - float relative_tolerance = 0.001) { - // a==b handles infinities. 
- KALDI_ASSERT(ApproxEqual(a, b, relative_tolerance)); -} - - -// RoundUpToNearestPowerOfTwo does the obvious thing. It crashes if n <= 0. -int32 RoundUpToNearestPowerOfTwo(int32 n); - -/// Returns a / b, rounding towards negative infinity in all cases. -static inline int32 DivideRoundingDown(int32 a, int32 b) { - KALDI_ASSERT(b != 0); - if (a * b >= 0) - return a / b; - else if (a < 0) - return (a - b + 1) / b; - else - return (a - b - 1) / b; -} - -template I Gcd(I m, I n) { - if (m == 0 || n == 0) { - if (m == 0 && n == 0) { // gcd not defined, as all integers are divisors. - KALDI_ERR << "Undefined GCD since m = 0, n = 0."; - } - return (m == 0 ? (n > 0 ? n : -n) : ( m > 0 ? m : -m)); - // return absolute value of whichever is nonzero - } - // could use compile-time assertion - // but involves messing with complex template stuff. - KALDI_ASSERT(std::numeric_limits::is_integer); - while (1) { - m %= n; - if (m == 0) return (n > 0 ? n : -n); - n %= m; - if (n == 0) return (m > 0 ? m : -m); - } -} - -/// Returns the least common multiple of two integers. Will -/// crash unless the inputs are positive. -template I Lcm(I m, I n) { - KALDI_ASSERT(m > 0 && n > 0); - I gcd = Gcd(m, n); - return gcd * (m/gcd) * (n/gcd); -} - - -template void Factorize(I m, std::vector *factors) { - // Splits a number into its prime factors, in sorted order from - // least to greatest, with duplication. A very inefficient - // algorithm, which is mainly intended for use in the - // mixed-radix FFT computation (where we assume most factors - // are small). - KALDI_ASSERT(factors != NULL); - KALDI_ASSERT(m >= 1); // Doesn't work for zero or negative numbers. - factors->clear(); - I small_factors[10] = { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29 }; - - // First try small factors. - for (I i = 0; i < 10; i++) { - if (m == 1) return; // We're done. - while (m % small_factors[i] == 0) { - m /= small_factors[i]; - factors->push_back(small_factors[i]); - } - } - // Next try all odd numbers starting from 31. - for (I j = 31;; j += 2) { - if (m == 1) return; - while (m % j == 0) { - m /= j; - factors->push_back(j); - } - } -} - -inline double Hypot(double x, double y) { return hypot(x, y); } -inline float Hypot(float x, float y) { return hypotf(x, y); } - - - - -} // namespace kaldi - - -#endif // KALDI_BASE_KALDI_MATH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/kaldi-types.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/kaldi-types.h deleted file mode 100644 index 7ebf4f85386192a65e176d8f0ecde9bb348af4a0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/kaldi-types.h +++ /dev/null @@ -1,75 +0,0 @@ -// base/kaldi-types.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University; -// Jan Silovsky; Yanmin Qian - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. 
-// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_KALDI_TYPES_H_ -#define KALDI_BASE_KALDI_TYPES_H_ 1 - -namespace kaldi { -// TYPEDEFS .................................................................. -#if (KALDI_DOUBLEPRECISION != 0) -typedef double BaseFloat; -#else -typedef float BaseFloat; -#endif -} - -#ifdef _MSC_VER -#include -#define ssize_t SSIZE_T -#endif - -// we can do this a different way if some platform -// we find in the future lacks stdint.h -#include - -// for discussion on what to do if you need compile kaldi -// without OpenFST, see the bottom of this this file -#include - -namespace kaldi { - using ::int16; - using ::int32; - using ::int64; - using ::uint16; - using ::uint32; - using ::uint64; - typedef float float32; - typedef double double64; -} // end namespace kaldi - -// In a theoretical case you decide compile Kaldi without the OpenFST -// comment the previous namespace statement and uncomment the following -/* -namespace kaldi { - typedef int8_t int8; - typedef int16_t int16; - typedef int32_t int32; - typedef int64_t int64; - - typedef uint8_t uint8; - typedef uint16_t uint16; - typedef uint32_t uint32; - typedef uint64_t uint64; - typedef float float32; - typedef double double64; -} // end namespace kaldi -*/ - -#endif // KALDI_BASE_KALDI_TYPES_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/kaldi-utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/kaldi-utils.h deleted file mode 100644 index bd434d09ed92ec94bc4208f53a4416f941edfdb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/base/kaldi-utils.h +++ /dev/null @@ -1,155 +0,0 @@ -// base/kaldi-utils.h - -// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; -// Saarland University; Karel Vesely; Yanmin Qian - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_KALDI_UTILS_H_ -#define KALDI_BASE_KALDI_UTILS_H_ 1 - -#if defined(_MSC_VER) -# define WIN32_LEAN_AND_MEAN -# define NOMINMAX -# include -#endif - -#ifdef _MSC_VER -#include -#define unlink _unlink -#else -#include -#endif - -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4056 4305 4800 4267 4996 4756 4661) -#if _MSC_VER < 1400 -#define __restrict__ -#else -#define __restrict__ __restrict -#endif -#endif - -#if defined(_MSC_VER) -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (*(pp_orig) = _aligned_malloc(size, align)) -# define KALDI_MEMALIGN_FREE(x) _aligned_free(x) -#elif defined(__CYGWIN__) -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (*(pp_orig) = aligned_alloc(align, size)) -# define KALDI_MEMALIGN_FREE(x) free(x) -#else -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (!posix_memalign(pp_orig, align, size) ? 
*(pp_orig) : NULL) -# define KALDI_MEMALIGN_FREE(x) free(x) -#endif - -#ifdef __ICC -#pragma warning(disable: 383) // ICPC remark we don't want. -#pragma warning(disable: 810) // ICPC remark we don't want. -#pragma warning(disable: 981) // ICPC remark we don't want. -#pragma warning(disable: 1418) // ICPC remark we don't want. -#pragma warning(disable: 444) // ICPC remark we don't want. -#pragma warning(disable: 869) // ICPC remark we don't want. -#pragma warning(disable: 1287) // ICPC remark we don't want. -#pragma warning(disable: 279) // ICPC remark we don't want. -#pragma warning(disable: 981) // ICPC remark we don't want. -#endif - - -namespace kaldi { - - -// CharToString prints the character in a human-readable form, for debugging. -std::string CharToString(const char &c); - - -inline int MachineIsLittleEndian() { - int check = 1; - return (*reinterpret_cast(&check) != 0); -} - -// This function kaldi::Sleep() provides a portable way -// to sleep for a possibly fractional -// number of seconds. On Windows it's only accurate to microseconds. -void Sleep(float seconds); -} // namespace kaldi - -#define KALDI_SWAP8(a) do { \ - int t = (reinterpret_cast(&a))[0];\ - (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[7];\ - (reinterpret_cast(&a))[7] = t;\ - t = (reinterpret_cast(&a))[1];\ - (reinterpret_cast(&a))[1]=(reinterpret_cast(&a))[6];\ - (reinterpret_cast(&a))[6] = t;\ - t = (reinterpret_cast(&a))[2];\ - (reinterpret_cast(&a))[2]=(reinterpret_cast(&a))[5];\ - (reinterpret_cast(&a))[5] = t;\ - t = (reinterpret_cast(&a))[3];\ - (reinterpret_cast(&a))[3]=(reinterpret_cast(&a))[4];\ - (reinterpret_cast(&a))[4] = t;} while (0) -#define KALDI_SWAP4(a) do { \ - int t = (reinterpret_cast(&a))[0];\ - (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[3];\ - (reinterpret_cast(&a))[3] = t;\ - t = (reinterpret_cast(&a))[1];\ - (reinterpret_cast(&a))[1]=(reinterpret_cast(&a))[2];\ - (reinterpret_cast(&a))[2]=t;} while (0) -#define KALDI_SWAP2(a) do { \ - int t = (reinterpret_cast(&a))[0];\ - (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[1];\ - (reinterpret_cast(&a))[1] = t;} while (0) - - -// Makes copy constructor and operator= private. 
-#define KALDI_DISALLOW_COPY_AND_ASSIGN(type) \ - type(const type&); \ - void operator = (const type&) - -template class KaldiCompileTimeAssert { }; -template<> class KaldiCompileTimeAssert { - public: - static inline void Check() { } -}; - -#define KALDI_COMPILE_TIME_ASSERT(b) KaldiCompileTimeAssert<(b)>::Check() - -#define KALDI_ASSERT_IS_INTEGER_TYPE(I) \ - KaldiCompileTimeAssert::is_specialized \ - && std::numeric_limits::is_integer>::Check() - -#define KALDI_ASSERT_IS_FLOATING_TYPE(F) \ - KaldiCompileTimeAssert::is_specialized \ - && !std::numeric_limits::is_integer>::Check() - -#if defined(_MSC_VER) -#define KALDI_STRCASECMP _stricmp -#elif defined(__CYGWIN__) -#include -#define KALDI_STRCASECMP strcasecmp -#else -#define KALDI_STRCASECMP strcasecmp -#endif -#ifdef _MSC_VER -# define KALDI_STRTOLL(cur_cstr, end_cstr) _strtoi64(cur_cstr, end_cstr, 10); -#else -# define KALDI_STRTOLL(cur_cstr, end_cstr) strtoll(cur_cstr, end_cstr, 10); -#endif - -#endif // KALDI_BASE_KALDI_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/decoder/lattice-faster-decoder.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/decoder/lattice-faster-decoder.cc deleted file mode 100644 index 06f77557fa49a23f6a44d07c327a1b3b081c6dec..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/decoder/lattice-faster-decoder.cc +++ /dev/null @@ -1,1101 +0,0 @@ -// decoder/lattice-faster-decoder.cc - -// Copyright 2009-2012 Microsoft Corporation Mirko Hannemann -// 2013-2018 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2018 Zhehuai Chen -// 2021 Binbin Zhang, Zhendong Peng - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "decoder/lattice-faster-decoder.h" -// #include "lat/lattice-functions.h" - -namespace kaldi { - -// instantiate this class once for each thing you have to decode. -template -LatticeFasterDecoderTpl::LatticeFasterDecoderTpl( - const FST &fst, const LatticeFasterDecoderConfig &config, - const std::shared_ptr &context_graph) - : fst_(&fst), - delete_fst_(false), - config_(config), - num_toks_(0), - context_graph_(context_graph) { - config.Check(); - toks_.SetSize( - 1000); // just so on the first frame we do something reasonable. -} - -template -LatticeFasterDecoderTpl::LatticeFasterDecoderTpl( - const LatticeFasterDecoderConfig &config, FST *fst) - : fst_(fst), delete_fst_(true), config_(config), num_toks_(0) { - config.Check(); - toks_.SetSize( - 1000); // just so on the first frame we do something reasonable. 
-} - -template -LatticeFasterDecoderTpl::~LatticeFasterDecoderTpl() { - DeleteElems(toks_.Clear()); - ClearActiveTokens(); - if (delete_fst_) delete fst_; -} - -template -void LatticeFasterDecoderTpl::InitDecoding() { - // clean up from last time: - DeleteElems(toks_.Clear()); - cost_offsets_.clear(); - ClearActiveTokens(); - warned_ = false; - num_toks_ = 0; - decoding_finalized_ = false; - final_costs_.clear(); - StateId start_state = fst_->Start(); - KALDI_ASSERT(start_state != fst::kNoStateId); - active_toks_.resize(1); - Token *start_tok = new Token(0.0, 0.0, NULL, NULL, NULL); - active_toks_[0].toks = start_tok; - toks_.Insert(start_state, start_tok); - num_toks_++; - ProcessNonemitting(config_.beam); -} - -// Returns true if any kind of traceback is available (not necessarily from -// a final state). It should only very rarely return false; this indicates -// an unusual search error. -template -bool LatticeFasterDecoderTpl::Decode( - DecodableInterface *decodable) { - InitDecoding(); - // We use 1-based indexing for frames in this decoder (if you view it in - // terms of features), but note that the decodable object uses zero-based - // numbering, which we have to correct for when we call it. - AdvanceDecoding(decodable); - FinalizeDecoding(); - - // Returns true if we have any kind of traceback available (not necessarily - // to the end state; query ReachedFinal() for that). - return !active_toks_.empty() && active_toks_.back().toks != NULL; -} - -// Outputs an FST corresponding to the single best path through the lattice. -template -bool LatticeFasterDecoderTpl::GetBestPath( - Lattice *olat, bool use_final_probs) const { - Lattice raw_lat; - GetRawLattice(&raw_lat, use_final_probs); - ShortestPath(raw_lat, olat); - return (olat->NumStates() != 0); -} - -// Outputs an FST corresponding to the raw, state-level lattice -template -bool LatticeFasterDecoderTpl::GetRawLattice( - Lattice *ofst, bool use_final_probs) const { - typedef LatticeArc Arc; - typedef Arc::StateId StateId; - typedef Arc::Weight Weight; - typedef Arc::Label Label; - - // Note: you can't use the old interface (Decode()) if you want to - // get the lattice with use_final_probs = false. You'd have to do - // InitDecoding() and then AdvanceDecoding(). - if (decoding_finalized_ && !use_final_probs) - KALDI_ERR << "You cannot call FinalizeDecoding() and then call " - << "GetRawLattice() with use_final_probs == false"; - - unordered_map final_costs_local; - - const unordered_map &final_costs = - (decoding_finalized_ ? final_costs_ : final_costs_local); - if (!decoding_finalized_ && use_final_probs) - ComputeFinalCosts(&final_costs_local, NULL, NULL); - - ofst->DeleteStates(); - // num-frames plus one (since frames are one-based, and we have - // an extra frame for the start-state). - int32 num_frames = active_toks_.size() - 1; - KALDI_ASSERT(num_frames > 0); - const int32 bucket_count = num_toks_ / 2 + 3; - unordered_map tok_map(bucket_count); - // First create all states. - std::vector token_list; - for (int32 f = 0; f <= num_frames; f++) { - if (active_toks_[f].toks == NULL) { - KALDI_WARN << "GetRawLattice: no tokens active on frame " << f - << ": not producing lattice.\n"; - return false; - } - TopSortTokens(active_toks_[f].toks, &token_list); - for (size_t i = 0; i < token_list.size(); i++) - if (token_list[i] != NULL) tok_map[token_list[i]] = ofst->AddState(); - } - // The next statement sets the start state of the output FST. Because we - // topologically sorted the tokens, state zero must be the start-state. 
- ofst->SetStart(0); - - KALDI_VLOG(4) << "init:" << num_toks_ / 2 + 3 - << " buckets:" << tok_map.bucket_count() - << " load:" << tok_map.load_factor() - << " max:" << tok_map.max_load_factor(); - // Now create all arcs. - for (int32 f = 0; f <= num_frames; f++) { - for (Token *tok = active_toks_[f].toks; tok != NULL; tok = tok->next) { - StateId cur_state = tok_map[tok]; - for (ForwardLinkT *l = tok->links; l != NULL; l = l->next) { - typename unordered_map::const_iterator iter = - tok_map.find(l->next_tok); - StateId nextstate = iter->second; - KALDI_ASSERT(iter != tok_map.end()); - BaseFloat cost_offset = 0.0; - if (l->ilabel != 0) { // emitting.. - KALDI_ASSERT(f >= 0 && f < cost_offsets_.size()); - cost_offset = cost_offsets_[f]; - } - - StateId state = cur_state; - if (l->is_start_boundary) { - StateId tmp = ofst->AddState(); - Arc arc(0, context_graph_->start_tag_id(), Weight(0, 0), tmp); - ofst->AddArc(state, arc); - state = tmp; - } - if (l->is_end_boundary) { - StateId tmp = ofst->AddState(); - Arc arc(0, context_graph_->end_tag_id(), Weight(0, 0), nextstate); - ofst->AddArc(tmp, arc); - nextstate = tmp; - } - - Arc arc(l->ilabel, l->olabel, - Weight(l->graph_cost, l->acoustic_cost - cost_offset), - nextstate); - ofst->AddArc(state, arc); - } - if (f == num_frames) { - if (use_final_probs && !final_costs.empty()) { - typename unordered_map::const_iterator iter = - final_costs.find(tok); - if (iter != final_costs.end()) - ofst->SetFinal(cur_state, LatticeWeight(iter->second, 0)); - } else { - ofst->SetFinal(cur_state, LatticeWeight::One()); - } - } - } - } - - fst::TopSort(ofst); - return (ofst->NumStates() > 0); -} - -// This function is now deprecated, since now we do determinization from outside -// the LatticeFasterDecoder class. Outputs an FST corresponding to the -// lattice-determinized lattice (one path per word sequence). -template -bool LatticeFasterDecoderTpl::GetLattice( - CompactLattice *ofst, bool use_final_probs) const { - Lattice raw_fst; - GetRawLattice(&raw_fst, use_final_probs); - Invert(&raw_fst); // make it so word labels are on the input. - // (in phase where we get backward-costs). - fst::ILabelCompare ilabel_comp; - ArcSort(&raw_fst, ilabel_comp); // sort on ilabel; makes - // lattice-determinization more efficient. - - fst::DeterminizeLatticePrunedOptions lat_opts; - lat_opts.max_mem = config_.det_opts.max_mem; - - DeterminizeLatticePruned(raw_fst, config_.lattice_beam, ofst, lat_opts); - raw_fst.DeleteStates(); // Free memory-- raw_fst no longer needed. - Connect(ofst); // Remove unreachable states... there might be - // a small number of these, in some cases. - // Note: if something went wrong and the raw lattice was empty, - // we should still get to this point in the code without warnings or failures. - return (ofst->NumStates() != 0); -} - -template -void LatticeFasterDecoderTpl::PossiblyResizeHash(size_t num_toks) { - size_t new_sz = static_cast(static_cast(num_toks) * - config_.hash_ratio); - if (new_sz > toks_.Size()) { - toks_.SetSize(new_sz); - } -} - -/* - A note on the definition of extra_cost. - - extra_cost is used in pruning tokens, to save memory. - - extra_cost can be thought of as a beta (backward) cost assuming - we had set the betas on currently-active tokens to all be the negative - of the alphas for those tokens. (So all currently active tokens would - be on (tied) best paths). - - We can use the extra_cost to accurately prune away tokens that we know will - never appear in the lattice. 
If the extra_cost is greater than the desired - lattice beam, the token would provably never appear in the lattice, so we can - prune away the token. - - (Note: we don't update all the extra_costs every time we update a frame; we - only do it every 'config_.prune_interval' frames). - */ - -// FindOrAddToken either locates a token in hash of toks_, -// or if necessary inserts a new, empty token (i.e. with no forward links) -// for the current frame. [note: it's inserted if necessary into hash toks_ -// and also into the singly linked list of tokens active on this frame -// (whose head is at active_toks_[frame]). -template -inline typename LatticeFasterDecoderTpl::Elem * -LatticeFasterDecoderTpl::FindOrAddToken(StateId state, - int32 frame_plus_one, - BaseFloat tot_cost, - Token *backpointer, - bool *changed) { - // Returns the Token pointer. Sets "changed" (if non-NULL) to true - // if the token was newly created or the cost changed. - KALDI_ASSERT(frame_plus_one < active_toks_.size()); - Token *&toks = active_toks_[frame_plus_one].toks; - Elem *e_found = toks_.Insert(state, NULL); - if (e_found->val == NULL) { // no such token presently. - const BaseFloat extra_cost = 0.0; - // tokens on the currently final frame have zero extra_cost - // as any of them could end up - // on the winning path. - Token *new_tok = new Token(tot_cost, extra_cost, NULL, toks, backpointer); - // NULL: no forward links yet - toks = new_tok; - num_toks_++; - e_found->val = new_tok; - if (changed) *changed = true; - return e_found; - } else { - Token *tok = e_found->val; // There is an existing Token for this state. - if (tok->tot_cost > tot_cost) { // replace old token - tok->tot_cost = tot_cost; - // SetBackpointer() just does tok->backpointer = backpointer in - // the case where Token == BackpointerToken, else nothing. - tok->SetBackpointer(backpointer); - // we don't allocate a new token, the old stays linked in active_toks_ - // we only replace the tot_cost - // in the current frame, there are no forward links (and no extra_cost) - // only in ProcessNonemitting we have to delete forward links - // in case we visit a state for the second time - // those forward links, that lead to this replaced token before: - // they remain and will hopefully be pruned later (PruneForwardLinks...) - if (changed) *changed = true; - } else { - if (changed) *changed = false; - } - return e_found; - } -} - -// prunes outgoing links for all tokens in active_toks_[frame] -// it's called by PruneActiveTokens -// all links, that have link_extra_cost > lattice_beam are pruned -template -void LatticeFasterDecoderTpl::PruneForwardLinks( - int32 frame_plus_one, bool *extra_costs_changed, bool *links_pruned, - BaseFloat delta) { - // delta is the amount by which the extra_costs must change - // If delta is larger, we'll tend to go back less far - // toward the beginning of the file. - // extra_costs_changed is set to true if extra_cost was changed for any token - // links_pruned is set to true if any link in any token was pruned - - *extra_costs_changed = false; - *links_pruned = false; - KALDI_ASSERT(frame_plus_one >= 0 && frame_plus_one < active_toks_.size()); - if (active_toks_[frame_plus_one].toks == - NULL) { // empty list; should not happen. - if (!warned_) { - KALDI_WARN << "No tokens alive [doing pruning].. warning first " - "time only for each utterance\n"; - warned_ = true; - } - } - - // We have to iterate until there is no more change, because the links - // are not guaranteed to be in topological order. 
- bool changed = true; // difference new minus old extra cost >= delta ? - while (changed) { - changed = false; - for (Token *tok = active_toks_[frame_plus_one].toks; tok != NULL; - tok = tok->next) { - ForwardLinkT *link, *prev_link = NULL; - // will recompute tok_extra_cost for tok. - BaseFloat tok_extra_cost = std::numeric_limits::infinity(); - // tok_extra_cost is the best (min) of link_extra_cost of outgoing links - for (link = tok->links; link != NULL;) { - // See if we need to excise this link... - Token *next_tok = link->next_tok; - BaseFloat link_extra_cost = - next_tok->extra_cost + - ((tok->tot_cost + link->acoustic_cost + link->graph_cost) - - next_tok->tot_cost); // difference in brackets is >= 0 - // link_exta_cost is the difference in score between the best paths - // through link source state and through link destination state - KALDI_ASSERT(link_extra_cost == link_extra_cost); // check for NaN - // the graph_cost contatins the context score - // if it's the score of the backoff arc, it should be removed. - if (link->context_score < 0) { - link_extra_cost += link->context_score; - } - if (link_extra_cost > config_.lattice_beam) { // excise link - ForwardLinkT *next_link = link->next; - if (prev_link != NULL) - prev_link->next = next_link; - else - tok->links = next_link; - delete link; - link = next_link; // advance link but leave prev_link the same. - *links_pruned = true; - } else { // keep the link and update the tok_extra_cost if needed. - if (link_extra_cost < 0.0) { // this is just a precaution. - // if (link_extra_cost < -0.01) - // KALDI_WARN << "Negative extra_cost: " << link_extra_cost; - link_extra_cost = 0.0; - } - if (link_extra_cost < tok_extra_cost) - tok_extra_cost = link_extra_cost; - prev_link = link; // move to next link - link = link->next; - } - } // for all outgoing links - if (fabs(tok_extra_cost - tok->extra_cost) > delta) - changed = true; // difference new minus old is bigger than delta - tok->extra_cost = tok_extra_cost; - // will be +infinity or <= lattice_beam_. - // infinity indicates, that no forward link survived pruning - } // for all Token on active_toks_[frame] - if (changed) *extra_costs_changed = true; - - // Note: it's theoretically possible that aggressive compiler - // optimizations could cause an infinite loop here for small delta and - // high-dynamic-range scores. - } // while changed -} - -// PruneForwardLinksFinal is a version of PruneForwardLinks that we call -// on the final frame. If there are final tokens active, it uses -// the final-probs for pruning, otherwise it treats all tokens as final. -template -void LatticeFasterDecoderTpl::PruneForwardLinksFinal() { - KALDI_ASSERT(!active_toks_.empty()); - int32 frame_plus_one = active_toks_.size() - 1; - - if (active_toks_[frame_plus_one].toks == - NULL) // empty list; should not happen. - KALDI_WARN << "No tokens alive at end of file"; - - typedef typename unordered_map::const_iterator IterType; - ComputeFinalCosts(&final_costs_, &final_relative_cost_, &final_best_cost_); - decoding_finalized_ = true; - // We call DeleteElems() as a nicety, not because it's really necessary; - // otherwise there would be a time, after calling PruneTokensForFrame() on the - // final frame, when toks_.GetList() or toks_.Clear() would contain pointers - // to nonexistent tokens. - DeleteElems(toks_.Clear()); - - // Now go through tokens on this frame, pruning forward links... may have to - // iterate a few times until there is no more change, because the list is not - // in topological order. 
This is a modified version of the code in - // PruneForwardLinks, but here we also take account of the final-probs. - bool changed = true; - BaseFloat delta = 1.0e-05; - while (changed) { - changed = false; - for (Token *tok = active_toks_[frame_plus_one].toks; tok != NULL; - tok = tok->next) { - ForwardLinkT *link, *prev_link = NULL; - // will recompute tok_extra_cost. It has a term in it that corresponds - // to the "final-prob", so instead of initializing tok_extra_cost to - // infinity below we set it to the difference between the - // (score+final_prob) of this token, and the best such (score+final_prob). - BaseFloat final_cost; - if (final_costs_.empty()) { - final_cost = 0.0; - } else { - IterType iter = final_costs_.find(tok); - if (iter != final_costs_.end()) - final_cost = iter->second; - else - final_cost = std::numeric_limits::infinity(); - } - BaseFloat tok_extra_cost = tok->tot_cost + final_cost - final_best_cost_; - // tok_extra_cost will be a "min" over either directly being final, or - // being indirectly final through other links, and the loop below may - // decrease its value: - for (link = tok->links; link != NULL;) { - // See if we need to excise this link... - Token *next_tok = link->next_tok; - BaseFloat link_extra_cost = - next_tok->extra_cost + - ((tok->tot_cost + link->acoustic_cost + link->graph_cost) - - next_tok->tot_cost); - if (link_extra_cost > config_.lattice_beam) { // excise link - ForwardLinkT *next_link = link->next; - if (prev_link != NULL) - prev_link->next = next_link; - else - tok->links = next_link; - delete link; - link = next_link; // advance link but leave prev_link the same. - } else { // keep the link and update the tok_extra_cost if needed. - if (link_extra_cost < 0.0) { // this is just a precaution. - // if (link_extra_cost < -0.01) - // KALDI_WARN << "Negative extra_cost: " << link_extra_cost; - link_extra_cost = 0.0; - } - if (link_extra_cost < tok_extra_cost) - tok_extra_cost = link_extra_cost; - prev_link = link; - link = link->next; - } - } - // prune away tokens worse than lattice_beam above best path. This step - // was not necessary in the non-final case because then, this case - // showed up as having no forward links. Here, the tok_extra_cost has - // an extra component relating to the final-prob. - if (tok_extra_cost > config_.lattice_beam) - tok_extra_cost = std::numeric_limits::infinity(); - // to be pruned in PruneTokensForFrame - - if (!ApproxEqual(tok->extra_cost, tok_extra_cost, delta)) changed = true; - tok->extra_cost = - tok_extra_cost; // will be +infinity or <= lattice_beam_. - } - } // while changed -} - -template -BaseFloat LatticeFasterDecoderTpl::FinalRelativeCost() const { - if (!decoding_finalized_) { - BaseFloat relative_cost; - ComputeFinalCosts(NULL, &relative_cost, NULL); - return relative_cost; - } else { - // we're not allowed to call that function if FinalizeDecoding() has - // been called; return a cached value. - return final_relative_cost_; - } -} - -// Prune away any tokens on this frame that have no forward links. -// [we don't do this in PruneForwardLinks because it would give us -// a problem with dangling pointers]. 
-// It's called by PruneActiveTokens if any forward links have been pruned
-template <typename FST, typename Token>
-void LatticeFasterDecoderTpl<FST, Token>::PruneTokensForFrame(
-    int32 frame_plus_one) {
-  KALDI_ASSERT(frame_plus_one >= 0 && frame_plus_one < active_toks_.size());
-  Token *&toks = active_toks_[frame_plus_one].toks;
-  if (toks == NULL) KALDI_WARN << "No tokens alive [doing pruning]";
-  Token *tok, *next_tok, *prev_tok = NULL;
-  for (tok = toks; tok != NULL; tok = next_tok) {
-    next_tok = tok->next;
-    if (tok->extra_cost == std::numeric_limits<BaseFloat>::infinity()) {
-      // token is unreachable from end of graph; (no forward links survived)
-      // excise tok from list and delete tok.
-      if (prev_tok != NULL)
-        prev_tok->next = tok->next;
-      else
-        toks = tok->next;
-      delete tok;
-      num_toks_--;
-    } else {  // fetch next Token
-      prev_tok = tok;
-    }
-  }
-}
-
-// Go backwards through still-alive tokens, pruning them, starting not from
-// the current frame (where we want to keep all tokens) but from the frame
-// before that. We go backwards through the frames and stop when we reach a
-// point where the delta-costs are not changing (and the delta controls when we
-// consider a cost to have "not changed").
-template <typename FST, typename Token>
-void LatticeFasterDecoderTpl<FST, Token>::PruneActiveTokens(BaseFloat delta) {
-  int32 cur_frame_plus_one = NumFramesDecoded();
-  int32 num_toks_begin = num_toks_;
-  // The index "f" below represents a "frame plus one", i.e. you'd have to
-  // subtract one to get the corresponding index for the decodable object.
-  for (int32 f = cur_frame_plus_one - 1; f >= 0; f--) {
-    // Reason why we need to prune forward links in this situation:
-    // (1) we have never pruned them (new TokenList)
-    // (2) we have not yet pruned the forward links to the next f,
-    // after any of those tokens have changed their extra_cost.
- if (active_toks_[f].must_prune_forward_links) { - bool extra_costs_changed = false, links_pruned = false; - PruneForwardLinks(f, &extra_costs_changed, &links_pruned, delta); - if (extra_costs_changed && f > 0) // any token has changed extra_cost - active_toks_[f - 1].must_prune_forward_links = true; - if (links_pruned) // any link was pruned - active_toks_[f].must_prune_tokens = true; - active_toks_[f].must_prune_forward_links = false; // job done - } - if (f + 1 < cur_frame_plus_one && // except for last f (no forward links) - active_toks_[f + 1].must_prune_tokens) { - PruneTokensForFrame(f + 1); - active_toks_[f + 1].must_prune_tokens = false; - } - } - KALDI_VLOG(4) << "PruneActiveTokens: pruned tokens from " << num_toks_begin - << " to " << num_toks_; -} - -template -void LatticeFasterDecoderTpl::ComputeFinalCosts( - unordered_map *final_costs, - BaseFloat *final_relative_cost, BaseFloat *final_best_cost) const { - KALDI_ASSERT(!decoding_finalized_); - if (final_costs != NULL) final_costs->clear(); - const Elem *final_toks = toks_.GetList(); - BaseFloat infinity = std::numeric_limits::infinity(); - BaseFloat best_cost = infinity, best_cost_with_final = infinity; - - while (final_toks != NULL) { - StateId state = final_toks->key; - Token *tok = final_toks->val; - const Elem *next = final_toks->tail; - BaseFloat final_cost = fst_->Final(state).Value(); - BaseFloat cost = tok->tot_cost, cost_with_final = cost + final_cost; - best_cost = std::min(cost, best_cost); - best_cost_with_final = std::min(cost_with_final, best_cost_with_final); - if (final_costs != NULL && final_cost != infinity) - (*final_costs)[tok] = final_cost; - final_toks = next; - } - if (final_relative_cost != NULL) { - if (best_cost == infinity && best_cost_with_final == infinity) { - // Likely this will only happen if there are no tokens surviving. - // This seems the least bad way to handle it. - *final_relative_cost = infinity; - } else { - *final_relative_cost = best_cost_with_final - best_cost; - } - } - if (final_best_cost != NULL) { - if (best_cost_with_final != infinity) { // final-state exists. - *final_best_cost = best_cost_with_final; - } else { // no final-state exists. - *final_best_cost = best_cost; - } - } -} - -template -void LatticeFasterDecoderTpl::AdvanceDecoding( - DecodableInterface *decodable, int32 max_num_frames) { - if (std::is_same >::value) { - // if the type 'FST' is the FST base-class, then see if the FST type of fst_ - // is actually VectorFst or ConstFst. If so, call the AdvanceDecoding() - // function after casting *this to the more specific type. - if (fst_->Type() == "const") { - LatticeFasterDecoderTpl, Token> *this_cast = - reinterpret_cast< - LatticeFasterDecoderTpl, Token> *>( - this); - this_cast->AdvanceDecoding(decodable, max_num_frames); - return; - } else if (fst_->Type() == "vector") { - LatticeFasterDecoderTpl, Token> *this_cast = - reinterpret_cast< - LatticeFasterDecoderTpl, Token> *>( - this); - this_cast->AdvanceDecoding(decodable, max_num_frames); - return; - } - } - - KALDI_ASSERT(!active_toks_.empty() && !decoding_finalized_ && - "You must call InitDecoding() before AdvanceDecoding"); - int32 num_frames_ready = decodable->NumFramesReady(); - // num_frames_ready must be >= num_frames_decoded, or else - // the number of frames ready must have decreased (which doesn't - // make sense) or the decodable object changed between calls - // (which isn't allowed). 
- KALDI_ASSERT(num_frames_ready >= NumFramesDecoded()); - int32 target_frames_decoded = num_frames_ready; - if (max_num_frames >= 0) - target_frames_decoded = - std::min(target_frames_decoded, NumFramesDecoded() + max_num_frames); - while (NumFramesDecoded() < target_frames_decoded) { - if (NumFramesDecoded() % config_.prune_interval == 0) { - PruneActiveTokens(config_.lattice_beam * config_.prune_scale); - } - BaseFloat cost_cutoff = ProcessEmitting(decodable); - ProcessNonemitting(cost_cutoff); - } -} - -// FinalizeDecoding() is a version of PruneActiveTokens that we call -// (optionally) on the final frame. Takes into account the final-prob of -// tokens. This function used to be called PruneActiveTokensFinal(). -template -void LatticeFasterDecoderTpl::FinalizeDecoding() { - int32 final_frame_plus_one = NumFramesDecoded(); - int32 num_toks_begin = num_toks_; - // PruneForwardLinksFinal() prunes final frame (with final-probs), and - // sets decoding_finalized_. - PruneForwardLinksFinal(); - for (int32 f = final_frame_plus_one - 1; f >= 0; f--) { - bool b1, b2; // values not used. - BaseFloat dontcare = 0.0; // delta of zero means we must always update - PruneForwardLinks(f, &b1, &b2, dontcare); - PruneTokensForFrame(f + 1); - } - PruneTokensForFrame(0); - KALDI_VLOG(4) << "pruned tokens from " << num_toks_begin << " to " - << num_toks_; -} - -/// Gets the weight cutoff. Also counts the active tokens. -template -BaseFloat LatticeFasterDecoderTpl::GetCutoff( - Elem *list_head, size_t *tok_count, BaseFloat *adaptive_beam, - Elem **best_elem) { - BaseFloat best_weight = std::numeric_limits::infinity(); - // positive == high cost == bad. - size_t count = 0; - if (config_.max_active == std::numeric_limits::max() && - config_.min_active == 0) { - for (Elem *e = list_head; e != NULL; e = e->tail, count++) { - BaseFloat w = static_cast(e->val->tot_cost); - if (w < best_weight) { - best_weight = w; - if (best_elem) *best_elem = e; - } - } - if (tok_count != NULL) *tok_count = count; - if (adaptive_beam != NULL) *adaptive_beam = config_.beam; - return best_weight + config_.beam; - } else { - tmp_array_.clear(); - for (Elem *e = list_head; e != NULL; e = e->tail, count++) { - BaseFloat w = e->val->tot_cost; - tmp_array_.push_back(w); - if (w < best_weight) { - best_weight = w; - if (best_elem) *best_elem = e; - } - } - if (tok_count != NULL) *tok_count = count; - - BaseFloat beam_cutoff = best_weight + config_.beam, - min_active_cutoff = std::numeric_limits::infinity(), - max_active_cutoff = std::numeric_limits::infinity(); - - KALDI_VLOG(6) << "Number of tokens active on frame " << NumFramesDecoded() - << " is " << tmp_array_.size(); - - if (tmp_array_.size() > static_cast(config_.max_active)) { - std::nth_element(tmp_array_.begin(), - tmp_array_.begin() + config_.max_active, - tmp_array_.end()); - max_active_cutoff = tmp_array_[config_.max_active]; - } - if (max_active_cutoff < beam_cutoff) { // max_active is tighter than beam. - if (adaptive_beam) - *adaptive_beam = max_active_cutoff - best_weight + config_.beam_delta; - return max_active_cutoff; - } - if (tmp_array_.size() > static_cast(config_.min_active)) { - if (config_.min_active == 0) { - min_active_cutoff = best_weight; - } else { - std::nth_element( - tmp_array_.begin(), tmp_array_.begin() + config_.min_active, - tmp_array_.size() > static_cast(config_.max_active) - ? 
tmp_array_.begin() + config_.max_active - : tmp_array_.end()); - min_active_cutoff = tmp_array_[config_.min_active]; - } - } - if (min_active_cutoff > beam_cutoff) { // min_active is looser than beam. - if (adaptive_beam) - *adaptive_beam = min_active_cutoff - best_weight + config_.beam_delta; - return min_active_cutoff; - } else { - *adaptive_beam = config_.beam; - return beam_cutoff; - } - } -} - -template -BaseFloat LatticeFasterDecoderTpl::ProcessEmitting( - DecodableInterface *decodable) { - KALDI_ASSERT(active_toks_.size() > 0); - int32 frame = - active_toks_.size() - 1; // frame is the frame-index - // (zero-based) used to get likelihoods - // from the decodable object. - active_toks_.resize(active_toks_.size() + 1); - - Elem *final_toks = - toks_.Clear(); // analogous to swapping prev_toks_ / cur_toks_ - // in simple-decoder.h. Removes the Elems from - // being indexed in the hash in toks_. - Elem *best_elem = NULL; - BaseFloat adaptive_beam; - size_t tok_cnt; - BaseFloat cur_cutoff = - GetCutoff(final_toks, &tok_cnt, &adaptive_beam, &best_elem); - KALDI_VLOG(6) << "Adaptive beam on frame " << NumFramesDecoded() << " is " - << adaptive_beam; - - PossiblyResizeHash( - tok_cnt); // This makes sure the hash is always big enough. - - BaseFloat next_cutoff = std::numeric_limits::infinity(); - // pruning "online" before having seen all tokens - - BaseFloat cost_offset = 0.0; // Used to keep probabilities in a good - // dynamic range. - - // First process the best token to get a hopefully - // reasonably tight bound on the next cutoff. The only - // products of the next block are "next_cutoff" and "cost_offset". - if (best_elem) { - StateId state = best_elem->key; - Token *tok = best_elem->val; - cost_offset = -tok->tot_cost; - for (fst::ArcIterator aiter(*fst_, state); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - if (arc.ilabel != 0) { // propagate.. - BaseFloat new_weight = arc.weight.Value() + cost_offset - - decodable->LogLikelihood(frame, arc.ilabel) + - tok->tot_cost; - if (state != arc.nextstate) { - new_weight += config_.length_penalty; - } - if (new_weight + adaptive_beam < next_cutoff) - next_cutoff = new_weight + adaptive_beam; - } - } - } - - // Store the offset on the acoustic likelihoods that we're applying. - // Could just do cost_offsets_.push_back(cost_offset), but we - // do it this way as it's more robust to future code changes. - cost_offsets_.resize(frame + 1, 0.0); - cost_offsets_[frame] = cost_offset; - - // the tokens are now owned here, in final_toks, and the hash is empty. - // 'owned' is a complex thing here; the point is we need to call DeleteElem - // on each elem 'e' to let toks_ know we're done with them. - for (Elem *e = final_toks, *e_tail; e != NULL; e = e_tail) { - // loop this way because we delete "e" as we go. - StateId state = e->key; - Token *tok = e->val; - if (tok->tot_cost <= cur_cutoff) { - for (fst::ArcIterator aiter(*fst_, state); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - if (arc.ilabel != 0) { // propagate.. 
- BaseFloat ac_cost = cost_offset - - decodable->LogLikelihood(frame, arc.ilabel), - graph_cost = arc.weight.Value(); - if (state != arc.nextstate) { - graph_cost += config_.length_penalty; - } - BaseFloat cur_cost = tok->tot_cost, - tot_cost = cur_cost + ac_cost + graph_cost; - if (tot_cost >= next_cutoff) - continue; - else if (tot_cost + adaptive_beam < next_cutoff) - next_cutoff = - tot_cost + adaptive_beam; // prune by best current token - // Note: the frame indexes into active_toks_ are one-based, - // hence the + 1. - Elem *e_next = - FindOrAddToken(arc.nextstate, frame + 1, tot_cost, tok, NULL); - // NULL: no change indicator needed - - bool is_start_boundary = false; - bool is_end_boundary = false; - float context_score = 0; - if (context_graph_) { - if (arc.olabel == 0) { - e_next->val->context_state = tok->context_state; - } else { - e_next->val->context_state = context_graph_->GetNextState( - tok->context_state, arc.olabel, &context_score, - &is_start_boundary, &is_end_boundary); - graph_cost -= context_score; - } - } - // Add ForwardLink from tok to next_tok (put on head of list - // tok->links) - tok->links = new ForwardLinkT(e_next->val, arc.ilabel, arc.olabel, - graph_cost, ac_cost, is_start_boundary, - is_end_boundary, tok->links); - tok->links->context_score = context_score; - } - } // for all arcs - } - e_tail = e->tail; - toks_.Delete(e); // delete Elem - } - return next_cutoff; -} - -// static inline -template -void LatticeFasterDecoderTpl::DeleteForwardLinks(Token *tok) { - ForwardLinkT *l = tok->links, *m; - while (l != NULL) { - m = l->next; - delete l; - l = m; - } - tok->links = NULL; -} - -template -void LatticeFasterDecoderTpl::ProcessNonemitting(BaseFloat cutoff) { - KALDI_ASSERT(!active_toks_.empty()); - int32 frame = static_cast(active_toks_.size()) - 2; - // Note: "frame" is the time-index we just processed, or -1 if - // we are processing the nonemitting transitions before the - // first frame (called from InitDecoding()). - - // Processes nonemitting arcs for one frame. Propagates within toks_. - // Note-- this queue structure is not very optimal as - // it may cause us to process states unnecessarily (e.g. more than once), - // but in the baseline code, turning this vector into a set to fix this - // problem did not improve overall speed. - - KALDI_ASSERT(queue_.empty()); - - if (toks_.GetList() == NULL) { - if (!warned_) { - KALDI_WARN << "Error, no surviving tokens: frame is " << frame; - warned_ = true; - } - } - - int before = 0, after = 0; - for (const Elem *e = toks_.GetList(); e != NULL; e = e->tail) { - StateId state = e->key; - if (fst_->NumInputEpsilons(state) != 0) queue_.push_back(e); - ++before; - } - - while (!queue_.empty()) { - ++after; - const Elem *e = queue_.back(); - queue_.pop_back(); - - StateId state = e->key; - Token *tok = - e->val; // would segfault if e is a NULL pointer but this can't happen. - BaseFloat cur_cost = tok->tot_cost; - if (cur_cost >= cutoff) // Don't bother processing successors. - continue; - // If "tok" has any existing forward links, delete them, - // because we're about to regenerate them. This is a kind - // of non-optimality (remember, this is the simple decoder), - // but since most states are emitting it's not a huge issue. - DeleteForwardLinks(tok); // necessary when re-visiting - tok->links = NULL; - for (fst::ArcIterator aiter(*fst_, state); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - if (arc.ilabel == 0) { // propagate nonemitting only... 
- BaseFloat graph_cost = arc.weight.Value(), - tot_cost = cur_cost + graph_cost; - if (tot_cost < cutoff) { - bool changed; - - Elem *e_new = - FindOrAddToken(arc.nextstate, frame + 1, tot_cost, tok, &changed); - - bool is_start_boundary = false; - bool is_end_boundary = false; - float context_score = 0; - if (context_graph_) { - if (arc.olabel == 0) { - e_new->val->context_state = tok->context_state; - } else { - e_new->val->context_state = context_graph_->GetNextState( - tok->context_state, arc.olabel, &context_score, - &is_start_boundary, &is_end_boundary); - graph_cost -= context_score; - } - } - - tok->links = - new ForwardLinkT(e_new->val, 0, arc.olabel, graph_cost, 0, - is_start_boundary, is_end_boundary, tok->links); - tok->links->context_score = context_score; - - // "changed" tells us whether the new token has a different - // cost from before, or is new [if so, add into queue]. - if (changed && fst_->NumInputEpsilons(arc.nextstate) != 0) - queue_.push_back(e_new); - } - } - } // for all arcs - } // while queue not empty - KALDI_VLOG(3) << "ProcessNonemitting " << before << " " << after; -} - -template -void LatticeFasterDecoderTpl::DeleteElems(Elem *list) { - for (Elem *e = list, *e_tail; e != NULL; e = e_tail) { - e_tail = e->tail; - toks_.Delete(e); - } -} - -template -void LatticeFasterDecoderTpl< - FST, Token>::ClearActiveTokens() { // a cleanup routine, at utt end/begin - for (size_t i = 0; i < active_toks_.size(); i++) { - // Delete all tokens alive on this frame, and any forward - // links they may have. - for (Token *tok = active_toks_[i].toks; tok != NULL;) { - DeleteForwardLinks(tok); - Token *next_tok = tok->next; - delete tok; - num_toks_--; - tok = next_tok; - } - } - active_toks_.clear(); - KALDI_ASSERT(num_toks_ == 0); -} - -// static -template -void LatticeFasterDecoderTpl::TopSortTokens( - Token *tok_list, std::vector *topsorted_list) { - unordered_map token2pos; - using std::unordered_set; - typedef typename unordered_map::iterator IterType; - int32 num_toks = 0; - for (Token *tok = tok_list; tok != NULL; tok = tok->next) num_toks++; - int32 cur_pos = 0; - // We assign the tokens numbers num_toks - 1, ... , 2, 1, 0. - // This is likely to be in closer to topological order than - // if we had given them ascending order, because of the way - // new tokens are put at the front of the list. - for (Token *tok = tok_list; tok != NULL; tok = tok->next) - token2pos[tok] = num_toks - ++cur_pos; - - unordered_set reprocess; - - for (IterType iter = token2pos.begin(); iter != token2pos.end(); ++iter) { - Token *tok = iter->first; - int32 pos = iter->second; - for (ForwardLinkT *link = tok->links; link != NULL; link = link->next) { - if (link->ilabel == 0) { - // We only need to consider epsilon links, since non-epsilon links - // transition between frames and this function only needs to sort a list - // of tokens from a single frame. - IterType following_iter = token2pos.find(link->next_tok); - if (following_iter != token2pos.end()) { // another token on this - // frame, so must consider it. - int32 next_pos = following_iter->second; - if (next_pos < pos) { // reassign the position of the next Token. - following_iter->second = cur_pos++; - reprocess.insert(link->next_tok); - } - } - } - } - // In case we had previously assigned this token to be reprocessed, we can - // erase it from that set because it's "happy now" (we just processed it). - reprocess.erase(tok); - } - - size_t max_loop = 1000000, - loop_count; // max_loop is to detect epsilon cycles. 
- for (loop_count = 0; !reprocess.empty() && loop_count < max_loop; - ++loop_count) { - std::vector reprocess_vec; - for (typename unordered_set::iterator iter = reprocess.begin(); - iter != reprocess.end(); ++iter) - reprocess_vec.push_back(*iter); - reprocess.clear(); - for (typename std::vector::iterator iter = reprocess_vec.begin(); - iter != reprocess_vec.end(); ++iter) { - Token *tok = *iter; - int32 pos = token2pos[tok]; - // Repeat the processing we did above (for comments, see above). - for (ForwardLinkT *link = tok->links; link != NULL; link = link->next) { - if (link->ilabel == 0) { - IterType following_iter = token2pos.find(link->next_tok); - if (following_iter != token2pos.end()) { - int32 next_pos = following_iter->second; - if (next_pos < pos) { - following_iter->second = cur_pos++; - reprocess.insert(link->next_tok); - } - } - } - } - } - } - KALDI_ASSERT(loop_count < max_loop && - "Epsilon loops exist in your decoding " - "graph (this is not allowed!)"); - - topsorted_list->clear(); - topsorted_list->resize(cur_pos, - NULL); // create a list with NULLs in between. - for (IterType iter = token2pos.begin(); iter != token2pos.end(); ++iter) - (*topsorted_list)[iter->second] = iter->first; -} - -// Instantiate the template for the combination of token types and FST types -// that we'll need. -template class LatticeFasterDecoderTpl, - decoder::StdToken>; -template class LatticeFasterDecoderTpl, - decoder::StdToken>; -template class LatticeFasterDecoderTpl, - decoder::StdToken>; - -// template class LatticeFasterDecoderTpl; template class -// LatticeFasterDecoderTpl; - -template class LatticeFasterDecoderTpl, - decoder::BackpointerToken>; -template class LatticeFasterDecoderTpl, - decoder::BackpointerToken>; -template class LatticeFasterDecoderTpl, - decoder::BackpointerToken>; -// template class LatticeFasterDecoderTpl; template class -// LatticeFasterDecoderTpl; - -} // end namespace kaldi. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/decoder/lattice-faster-decoder.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/decoder/lattice-faster-decoder.h deleted file mode 100644 index 0152b85447e354b770745b748d266b1ca2d57024..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/decoder/lattice-faster-decoder.h +++ /dev/null @@ -1,558 +0,0 @@ -// decoder/lattice-faster-decoder.h - -// Copyright 2009-2013 Microsoft Corporation; Mirko Hannemann; -// 2013-2014 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2018 Zhehuai Chen -// 2021 Binbin Zhang, Zhendong Peng - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef KALDI_DECODER_LATTICE_FASTER_DECODER_H_ -#define KALDI_DECODER_LATTICE_FASTER_DECODER_H_ - -#include -#include -#include -#include - -#include "base/kaldi-common.h" -#include "decoder/context_graph.h" -#include "fst/fstlib.h" -#include "fstext/fstext-lib.h" -#include "itf/decodable-itf.h" -#include "lat/determinize-lattice-pruned.h" -#include "lat/kaldi-lattice.h" -#include "util/hash-list.h" - -namespace kaldi { - -struct LatticeFasterDecoderConfig { - BaseFloat beam; - int32 max_active; - int32 min_active; - BaseFloat lattice_beam; - int32 prune_interval; - bool determinize_lattice; // not inspected by this class... used in - // command-line program. - BaseFloat beam_delta; - BaseFloat hash_ratio; - // Note: we don't make prune_scale configurable on the command line, it's not - // a very important parameter. It affects the algorithm that prunes the - // tokens as we go. - BaseFloat prune_scale; - BaseFloat length_penalty; // for balancing the del/ins ratio, suggested -3.0 - - // Most of the options inside det_opts are not actually queried by the - // LatticeFasterDecoder class itself, but by the code that calls it, for - // example in the function DecodeUtteranceLatticeFaster. - fst::DeterminizeLatticePhonePrunedOptions det_opts; - - LatticeFasterDecoderConfig() - : beam(16.0), - max_active(std::numeric_limits::max()), - min_active(200), - lattice_beam(10.0), - prune_interval(25), - determinize_lattice(true), - beam_delta(0.5), - hash_ratio(2.0), - prune_scale(0.1), - length_penalty(0.0) {} - void Register(OptionsItf *opts) { - det_opts.Register(opts); - opts->Register("beam", &beam, - "Decoding beam. Larger->slower, more accurate."); - opts->Register("max-active", &max_active, - "Decoder max active states. Larger->slower; " - "more accurate"); - opts->Register("min-active", &min_active, - "Decoder minimum #active states."); - opts->Register("lattice-beam", &lattice_beam, - "Lattice generation beam. Larger->slower, " - "and deeper lattices"); - opts->Register("prune-interval", &prune_interval, - "Interval (in frames) at " - "which to prune tokens"); - opts->Register( - "determinize-lattice", &determinize_lattice, - "If true, " - "determinize the lattice (lattice-determinization, keeping only " - "best pdf-sequence for each word-sequence)."); - opts->Register( - "beam-delta", &beam_delta, - "Increment used in decoding-- this " - "parameter is obscure and relates to a speedup in the way the " - "max-active constraint is applied. Larger is more accurate."); - opts->Register("hash-ratio", &hash_ratio, - "Setting used in decoder to " - "control hash behavior"); - } - void Check() const { - KALDI_ASSERT(beam > 0.0 && max_active > 1 && lattice_beam > 0.0 && - min_active <= max_active && prune_interval > 0 && - beam_delta > 0.0 && hash_ratio >= 1.0 && prune_scale > 0.0 && - prune_scale < 1.0); - } -}; - -namespace decoder { -// We will template the decoder on the token type as well as the FST type; this -// is a mechanism so that we can use the same underlying decoder code for -// versions of the decoder that support quickly getting the best path -// (LatticeFasterOnlineDecoder, see lattice-faster-online-decoder.h) and also -// those that do not (LatticeFasterDecoder). - -// ForwardLinks are the links from a token to a token on the next frame. -// or sometimes on the current frame (for input-epsilon links). 
-template -struct ForwardLink { - using Label = fst::StdArc::Label; - - Token *next_tok; // the next token [or NULL if represents final-state] - Label ilabel; // ilabel on arc - Label olabel; // olabel on arc - BaseFloat graph_cost; // graph cost of traversing arc (contains LM, etc.) - BaseFloat acoustic_cost; // acoustic cost (pre-scaled) of traversing arc - bool is_start_boundary; - bool is_end_boundary; - float context_score; - ForwardLink *next; // next in singly-linked list of forward arcs (arcs - // in the state-level lattice) from a token. - inline ForwardLink(Token *next_tok, Label ilabel, Label olabel, - BaseFloat graph_cost, BaseFloat acoustic_cost, - bool is_start_boundary, bool is_end_boundary, - ForwardLink *next) - : next_tok(next_tok), - ilabel(ilabel), - olabel(olabel), - graph_cost(graph_cost), - acoustic_cost(acoustic_cost), - is_start_boundary(is_start_boundary), - is_end_boundary(is_end_boundary), - context_score(0), - next(next) {} -}; - -struct StdToken { - using ForwardLinkT = ForwardLink; - using Token = StdToken; - - // Standard token type for LatticeFasterDecoder. Each active HCLG - // (decoding-graph) state on each frame has one token. - - // tot_cost is the total (LM + acoustic) cost from the beginning of the - // utterance up to this point. (but see cost_offset_, which is subtracted - // to keep it in a good numerical range). - BaseFloat tot_cost; - - // exta_cost is >= 0. After calling PruneForwardLinks, this equals the - // minimum difference between the cost of the best path that this link is a - // part of, and the cost of the absolute best path, under the assumption that - // any of the currently active states at the decoding front may eventually - // succeed (e.g. if you were to take the currently active states one by one - // and compute this difference, and then take the minimum). - BaseFloat extra_cost; - - int context_state = 0; - - // 'links' is the head of singly-linked list of ForwardLinks, which is what we - // use for lattice generation. - ForwardLinkT *links; - - // 'next' is the next in the singly-linked list of tokens for this frame. - Token *next; - - // This function does nothing and should be optimized out; it's needed - // so we can share the regular LatticeFasterDecoderTpl code and the code - // for LatticeFasterOnlineDecoder that supports fast traceback. - inline void SetBackpointer(Token *backpointer) {} - - // This constructor just ignores the 'backpointer' argument. That argument is - // needed so that we can use the same decoder code for LatticeFasterDecoderTpl - // and LatticeFasterOnlineDecoderTpl (which needs backpointers to support a - // fast way to obtain the best path). - inline StdToken(BaseFloat tot_cost, BaseFloat extra_cost, ForwardLinkT *links, - Token *next, Token *backpointer) - : tot_cost(tot_cost), - extra_cost(extra_cost), - links(links), - context_state(0), - next(next) {} -}; - -struct BackpointerToken { - using ForwardLinkT = ForwardLink; - using Token = BackpointerToken; - - // BackpointerToken is like Token but also - // Standard token type for LatticeFasterDecoder. Each active HCLG - // (decoding-graph) state on each frame has one token. - - // tot_cost is the total (LM + acoustic) cost from the beginning of the - // utterance up to this point. (but see cost_offset_, which is subtracted - // to keep it in a good numerical range). - BaseFloat tot_cost; - - // exta_cost is >= 0. 
After calling PruneForwardLinks, this equals - // the minimum difference between the cost of the best path, and the cost of - // this is on, and the cost of the absolute best path, under the assumption - // that any of the currently active states at the decoding front may - // eventually succeed (e.g. if you were to take the currently active states - // one by one and compute this difference, and then take the minimum). - BaseFloat extra_cost; - - int context_state = 0; - - // 'links' is the head of singly-linked list of ForwardLinks, which is what we - // use for lattice generation. - ForwardLinkT *links; - - // 'next' is the next in the singly-linked list of tokens for this frame. - BackpointerToken *next; - - // Best preceding BackpointerToken (could be a on this frame, connected to - // this via an epsilon transition, or on a previous frame). This is only - // required for an efficient GetBestPath function in - // LatticeFasterOnlineDecoderTpl; it plays no part in the lattice generation - // (the "links" list is what stores the forward links, for that). - Token *backpointer; - - inline void SetBackpointer(Token *backpointer) { - this->backpointer = backpointer; - } - - inline BackpointerToken(BaseFloat tot_cost, BaseFloat extra_cost, - ForwardLinkT *links, Token *next, Token *backpointer) - : tot_cost(tot_cost), - extra_cost(extra_cost), - links(links), - next(next), - backpointer(backpointer), - context_state(0) {} -}; - -} // namespace decoder - -/** This is the "normal" lattice-generating decoder. - See \ref lattices_generation \ref decoders_faster and \ref decoders_simple - for more information. - - The decoder is templated on the FST type and the token type. The token type - will normally be StdToken, but also may be BackpointerToken which is to - support quick lookup of the current best path (see - lattice-faster-online-decoder.h) - - The FST you invoke this decoder which is expected to equal - Fst::Fst, a.k.a. StdFst, or GrammarFst. If you invoke it with - FST == StdFst and it notices that the actual FST type is - fst::VectorFst or fst::ConstFst, the decoder object - will internally cast itself to one that is templated on those more specific - types; this is an optimization for speed. - */ -template -class LatticeFasterDecoderTpl { - public: - using Arc = typename FST::Arc; - using Label = typename Arc::Label; - using StateId = typename Arc::StateId; - using Weight = typename Arc::Weight; - using ForwardLinkT = decoder::ForwardLink; - - // Instantiate this class once for each thing you have to decode. - // This version of the constructor does not take ownership of - // 'fst'. - LatticeFasterDecoderTpl( - const FST &fst, const LatticeFasterDecoderConfig &config, - const std::shared_ptr &context_graph); - - // This version of the constructor takes ownership of the fst, and will delete - // it when this object is destroyed. - LatticeFasterDecoderTpl(const LatticeFasterDecoderConfig &config, FST *fst); - - void SetOptions(const LatticeFasterDecoderConfig &config) { - config_ = config; - } - - const LatticeFasterDecoderConfig &GetOptions() const { return config_; } - - ~LatticeFasterDecoderTpl(); - - /// Decodes until there are no more frames left in the "decodable" object.. - /// note, this may block waiting for input if the "decodable" object blocks. - /// Returns true if any kind of traceback is available (not necessarily from a - /// final state). - bool Decode(DecodableInterface *decodable); - - /// says whether a final-state was active on the last frame. 
If it was not, - /// the lattice (or traceback) will end with states that are not final-states. - bool ReachedFinal() const { - return FinalRelativeCost() != std::numeric_limits::infinity(); - } - - /// Outputs an FST corresponding to the single best path through the lattice. - /// Returns true if result is nonempty (using the return status is deprecated, - /// it will become void). If "use_final_probs" is true AND we reached the - /// final-state of the graph then it will include those as final-probs, else - /// it will treat all final-probs as one. Note: this just calls - /// GetRawLattice() and figures out the shortest path. - bool GetBestPath(Lattice *ofst, bool use_final_probs = true) const; - - /// Outputs an FST corresponding to the raw, state-level - /// tracebacks. Returns true if result is nonempty. - /// If "use_final_probs" is true AND we reached the final-state - /// of the graph then it will include those as final-probs, else - /// it will treat all final-probs as one. - /// The raw lattice will be topologically sorted. - /// - /// See also GetRawLatticePruned in lattice-faster-online-decoder.h, - /// which also supports a pruning beam, in case for some reason - /// you want it pruned tighter than the regular lattice beam. - /// We could put that here in future needed. - bool GetRawLattice(Lattice *ofst, bool use_final_probs = true) const; - - /// [Deprecated, users should now use GetRawLattice and determinize it - /// themselves, e.g. using DeterminizeLatticePhonePrunedWrapper]. - /// Outputs an FST corresponding to the lattice-determinized - /// lattice (one path per word sequence). Returns true if result is - /// nonempty. If "use_final_probs" is true AND we reached the final-state of - /// the graph then it will include those as final-probs, else it will treat - /// all final-probs as one. - bool GetLattice(CompactLattice *ofst, bool use_final_probs = true) const; - - /// InitDecoding initializes the decoding, and should only be used if you - /// intend to call AdvanceDecoding(). If you call Decode(), you don't need to - /// call this. You can also call InitDecoding if you have already decoded an - /// utterance and want to start with a new utterance. - void InitDecoding(); - - /// This will decode until there are no more frames ready in the decodable - /// object. You can keep calling it each time more frames become available. - /// If max_num_frames is specified, it specifies the maximum number of frames - /// the function will decode before returning. - void AdvanceDecoding(DecodableInterface *decodable, - int32 max_num_frames = -1); - - /// This function may be optionally called after AdvanceDecoding(), when you - /// do not plan to decode any further. It does an extra pruning step that - /// will help to prune the lattices output by GetLattice and (particularly) - /// GetRawLattice more completely, particularly toward the end of the - /// utterance. If you call this, you cannot call AdvanceDecoding again (it - /// will fail), and you cannot call GetLattice() and related functions with - /// use_final_probs = false. Used to be called PruneActiveTokensFinal(). - void FinalizeDecoding(); - - /// FinalRelativeCost() serves the same purpose as ReachedFinal(), but gives - /// more information. It returns the difference between the best (final-cost - /// plus cost) of any token on the final frame, and the best cost of any token - /// on the final frame. If it is infinity it means no final-states were - /// present on the final frame. It will usually be nonnegative. 
If it not - /// too positive (e.g. < 5 is my first guess, but this is not tested) you can - /// take it as a good indication that we reached the final-state with - /// reasonable likelihood. - BaseFloat FinalRelativeCost() const; - - // Returns the number of frames decoded so far. The value returned changes - // whenever we call ProcessEmitting(). - inline int32 NumFramesDecoded() const { return active_toks_.size() - 1; } - - protected: - // we make things protected instead of private, as code in - // LatticeFasterOnlineDecoderTpl, which inherits from this, also uses the - // internals. - - // Deletes the elements of the singly linked list tok->links. - inline static void DeleteForwardLinks(Token *tok); - - // head of per-frame list of Tokens (list is in topological order), - // and something saying whether we ever pruned it using PruneForwardLinks. - struct TokenList { - Token *toks; - bool must_prune_forward_links; - bool must_prune_tokens; - TokenList() - : toks(NULL), must_prune_forward_links(true), must_prune_tokens(true) {} - }; - - using Elem = typename HashList::Elem; - // Equivalent to: - // struct Elem { - // StateId key; - // Token *val; - // Elem *tail; - // }; - - void PossiblyResizeHash(size_t num_toks); - - // FindOrAddToken either locates a token in hash of toks_, or if necessary - // inserts a new, empty token (i.e. with no forward links) for the current - // frame. [note: it's inserted if necessary into hash toks_ and also into the - // singly linked list of tokens active on this frame (whose head is at - // active_toks_[frame]). The frame_plus_one argument is the acoustic frame - // index plus one, which is used to index into the active_toks_ array. - // Returns the Token pointer. Sets "changed" (if non-NULL) to true if the - // token was newly created or the cost changed. - // If Token == StdToken, the 'backpointer' argument has no purpose (and will - // hopefully be optimized out). - inline Elem *FindOrAddToken(StateId state, int32 frame_plus_one, - BaseFloat tot_cost, Token *backpointer, - bool *changed); - - // prunes outgoing links for all tokens in active_toks_[frame] - // it's called by PruneActiveTokens - // all links, that have link_extra_cost > lattice_beam are pruned - // delta is the amount by which the extra_costs must change - // before we set *extra_costs_changed = true. - // If delta is larger, we'll tend to go back less far - // toward the beginning of the file. - // extra_costs_changed is set to true if extra_cost was changed for any token - // links_pruned is set to true if any link in any token was pruned - void PruneForwardLinks(int32 frame_plus_one, bool *extra_costs_changed, - bool *links_pruned, BaseFloat delta); - - // This function computes the final-costs for tokens active on the final - // frame. It outputs to final-costs, if non-NULL, a map from the Token* - // pointer to the final-prob of the corresponding state, for all Tokens - // that correspond to states that have final-probs. This map will be - // empty if there were no final-probs. It outputs to - // final_relative_cost, if non-NULL, the difference between the best - // forward-cost including the final-prob cost, and the best forward-cost - // without including the final-prob cost (this will usually be positive), or - // infinity if there were no final-probs. [c.f. FinalRelativeCost(), which - // outputs this quanitity]. 
It outputs to final_best_cost, if - // non-NULL, the lowest for any token t active on the final frame, of - // forward-cost[t] + final-cost[t], where final-cost[t] is the final-cost in - // the graph of the state corresponding to token t, or the best of - // forward-cost[t] if there were no final-probs active on the final frame. - // You cannot call this after FinalizeDecoding() has been called; in that - // case you should get the answer from class-member variables. - void ComputeFinalCosts(unordered_map *final_costs, - BaseFloat *final_relative_cost, - BaseFloat *final_best_cost) const; - - // PruneForwardLinksFinal is a version of PruneForwardLinks that we call - // on the final frame. If there are final tokens active, it uses - // the final-probs for pruning, otherwise it treats all tokens as final. - void PruneForwardLinksFinal(); - - // Prune away any tokens on this frame that have no forward links. - // [we don't do this in PruneForwardLinks because it would give us - // a problem with dangling pointers]. - // It's called by PruneActiveTokens if any forward links have been pruned - void PruneTokensForFrame(int32 frame_plus_one); - - // Go backwards through still-alive tokens, pruning them if the - // forward+backward cost is more than lat_beam away from the best path. It's - // possible to prove that this is "correct" in the sense that we won't lose - // anything outside of lat_beam, regardless of what happens in the future. - // delta controls when it considers a cost to have changed enough to continue - // going backward and propagating the change. larger delta -> will recurse - // less far. - void PruneActiveTokens(BaseFloat delta); - - /// Gets the weight cutoff. Also counts the active tokens. - BaseFloat GetCutoff(Elem *list_head, size_t *tok_count, - BaseFloat *adaptive_beam, Elem **best_elem); - - /// Processes emitting arcs for one frame. Propagates from prev_toks_ to - /// cur_toks_. Returns the cost cutoff for subsequent ProcessNonemitting() to - /// use. - BaseFloat ProcessEmitting(DecodableInterface *decodable); - - /// Processes nonemitting (epsilon) arcs for one frame. Called after - /// ProcessEmitting() on each frame. The cost cutoff is computed by the - /// preceding ProcessEmitting(). - void ProcessNonemitting(BaseFloat cost_cutoff); - - // HashList defined in ../util/hash-list.h. It actually allows us to maintain - // more than one list (e.g. for current and previous frames), but only one of - // them at a time can be indexed by StateId. It is indexed by frame-index - // plus one, where the frame-index is zero-based, as used in decodable object. - // That is, the emitting probs of frame t are accounted for in tokens at - // toks_[t+1]. The zeroth frame is for nonemitting transition at the start of - // the graph. - HashList toks_; - - std::vector active_toks_; // Lists of tokens, indexed by - // frame (members of TokenList are toks, must_prune_forward_links, - // must_prune_tokens). - std::vector - queue_; // temp variable used in ProcessNonemitting, - std::vector tmp_array_; // used in GetCutoff. - - // fst_ is a pointer to the FST we are decoding from. - const FST *fst_; - // delete_fst_ is true if the pointer fst_ needs to be deleted when this - // object is destroyed. - bool delete_fst_; - - std::vector cost_offsets_; // This contains, for each - // frame, an offset that was added to the acoustic log-likelihoods on that - // frame in order to keep everything in a nice dynamic range i.e. close to - // zero, to reduce roundoff errors. 
- LatticeFasterDecoderConfig config_; - int32 num_toks_; // current total #toks allocated... - bool warned_; - - /// decoding_finalized_ is true if someone called FinalizeDecoding(). [note, - /// calling this is optional]. If true, it's forbidden to decode more. Also, - /// if this is set, then the output of ComputeFinalCosts() is in the next - /// three variables. The reason we need to do this is that after - /// FinalizeDecoding() calls PruneTokensForFrame() for the final frame, some - /// of the tokens on the last frame are freed, so we free the list from toks_ - /// to avoid having dangling pointers hanging around. - bool decoding_finalized_; - /// For the meaning of the next 3 variables, see the comment for - /// decoding_finalized_ above., and ComputeFinalCosts(). - unordered_map final_costs_; - BaseFloat final_relative_cost_; - BaseFloat final_best_cost_; - - std::shared_ptr context_graph_ = nullptr; - - // There are various cleanup tasks... the toks_ structure contains - // singly linked lists of Token pointers, where Elem is the list type. - // It also indexes them in a hash, indexed by state (this hash is only - // maintained for the most recent frame). toks_.Clear() - // deletes them from the hash and returns the list of Elems. The - // function DeleteElems calls toks_.Delete(elem) for each elem in - // the list, which returns ownership of the Elem to the toks_ structure - // for reuse, but does not delete the Token pointer. The Token pointers - // are reference-counted and are ultimately deleted in PruneTokensForFrame, - // but are also linked together on each frame by their own linked-list, - // using the "next" pointer. We delete them manually. - void DeleteElems(Elem *list); - - // This function takes a singly linked list of tokens for a single frame, and - // outputs a list of them in topological order (it will crash if no such order - // can be found, which will typically be due to decoding graphs with epsilon - // cycles, which are not allowed). Note: the output list may contain NULLs, - // which the caller should pass over; it just happens to be more efficient for - // the algorithm to output a list that contains NULLs. - static void TopSortTokens(Token *tok_list, - std::vector *topsorted_list); - - void ClearActiveTokens(); - - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeFasterDecoderTpl); -}; - -typedef LatticeFasterDecoderTpl - LatticeFasterDecoder; - -} // end namespace kaldi. - -#endif // KALDI_DECODER_LATTICE_FASTER_DECODER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/decoder/lattice-faster-online-decoder.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/decoder/lattice-faster-online-decoder.cc deleted file mode 100644 index 2345b4d129ff905784762e973bad279f2fb55d31..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/decoder/lattice-faster-online-decoder.cc +++ /dev/null @@ -1,278 +0,0 @@ -// decoder/lattice-faster-online-decoder.cc - -// Copyright 2009-2012 Microsoft Corporation Mirko Hannemann -// 2013-2014 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2014 IMSL, PKU-HKUST (author: Wei Shi) -// 2018 Zhehuai Chen - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -// see note at the top of lattice-faster-decoder.cc, about how to maintain this -// file in sync with lattice-faster-decoder.cc - -#include -#include -#include -#include - -#include "decoder/lattice-faster-online-decoder.h" - -namespace kaldi { - -template -bool LatticeFasterOnlineDecoderTpl::TestGetBestPath( - bool use_final_probs) const { - Lattice lat1; - { - Lattice raw_lat; - this->GetRawLattice(&raw_lat, use_final_probs); - ShortestPath(raw_lat, &lat1); - } - Lattice lat2; - GetBestPath(&lat2, use_final_probs); - BaseFloat delta = 0.1; - int32 num_paths = 1; - if (!fst::RandEquivalent(lat1, lat2, num_paths, delta, rand())) { - KALDI_WARN << "Best-path test failed"; - return false; - } else { - return true; - } -} - -// Outputs an FST corresponding to the single best path through the lattice. -template -bool LatticeFasterOnlineDecoderTpl::GetBestPath( - Lattice *olat, bool use_final_probs) const { - olat->DeleteStates(); - BaseFloat final_graph_cost; - BestPathIterator iter = BestPathEnd(use_final_probs, &final_graph_cost); - if (iter.Done()) return false; // would have printed warning. - StateId state = olat->AddState(); - olat->SetFinal(state, LatticeWeight(final_graph_cost, 0.0)); - while (!iter.Done()) { - LatticeArc arc; - iter = TraceBackBestPath(iter, &arc); - arc.nextstate = state; - StateId new_state = olat->AddState(); - olat->AddArc(new_state, arc); - state = new_state; - } - olat->SetStart(state); - return true; -} - -template -typename LatticeFasterOnlineDecoderTpl::BestPathIterator -LatticeFasterOnlineDecoderTpl::BestPathEnd( - bool use_final_probs, BaseFloat *final_cost_out) const { - if (this->decoding_finalized_ && !use_final_probs) - KALDI_ERR << "You cannot call FinalizeDecoding() and then call " - << "BestPathEnd() with use_final_probs == false"; - KALDI_ASSERT(this->NumFramesDecoded() > 0 && - "You cannot call BestPathEnd if no frames were decoded."); - - unordered_map final_costs_local; - - const unordered_map &final_costs = - (this->decoding_finalized_ ? this->final_costs_ : final_costs_local); - if (!this->decoding_finalized_ && use_final_probs) - this->ComputeFinalCosts(&final_costs_local, NULL, NULL); - - // Singly linked list of tokens on last frame (access list through "next" - // pointer). - BaseFloat best_cost = std::numeric_limits::infinity(); - BaseFloat best_final_cost = 0; - Token *best_tok = NULL; - for (Token *tok = this->active_toks_.back().toks; tok != NULL; - tok = tok->next) { - BaseFloat cost = tok->tot_cost, final_cost = 0.0; - if (use_final_probs && !final_costs.empty()) { - // if we are instructed to use final-probs, and any final tokens were - // active on final frame, include the final-prob in the cost of the token. 
- typename unordered_map::const_iterator iter = - final_costs.find(tok); - if (iter != final_costs.end()) { - final_cost = iter->second; - cost += final_cost; - } else { - cost = std::numeric_limits::infinity(); - } - } - if (cost < best_cost) { - best_cost = cost; - best_tok = tok; - best_final_cost = final_cost; - } - } - if (best_tok == - NULL) { // this should not happen, and is likely a code error or - // caused by infinities in likelihoods, but I'm not making - // it a fatal error for now. - KALDI_WARN << "No final token found."; - } - if (final_cost_out) *final_cost_out = best_final_cost; - return BestPathIterator(best_tok, this->NumFramesDecoded() - 1); -} - -template -typename LatticeFasterOnlineDecoderTpl::BestPathIterator -LatticeFasterOnlineDecoderTpl::TraceBackBestPath(BestPathIterator iter, - LatticeArc *oarc) const { - KALDI_ASSERT(!iter.Done() && oarc != NULL); - Token *tok = static_cast(iter.tok); - int32 cur_t = iter.frame, step_t = 0; - if (tok->backpointer != NULL) { - // retrieve the correct forward link(with the best link cost) - BaseFloat best_cost = std::numeric_limits::infinity(); - ForwardLinkT *link; - for (link = tok->backpointer->links; link != NULL; link = link->next) { - if (link->next_tok == tok) { // this is a link to "tok" - BaseFloat graph_cost = link->graph_cost, - acoustic_cost = link->acoustic_cost; - BaseFloat cost = graph_cost + acoustic_cost; - if (cost < best_cost) { - oarc->ilabel = link->ilabel; - oarc->olabel = link->olabel; - if (link->ilabel != 0) { - KALDI_ASSERT(static_cast(cur_t) < - this->cost_offsets_.size()); - acoustic_cost -= this->cost_offsets_[cur_t]; - step_t = -1; - } else { - step_t = 0; - } - oarc->weight = LatticeWeight(graph_cost, acoustic_cost); - best_cost = cost; - } - } - } - if (link == NULL && - best_cost == - std::numeric_limits::infinity()) { // Did not find - // correct link. - KALDI_ERR << "Error tracing best-path back (likely " - << "bug in token-pruning algorithm)"; - } - } else { - oarc->ilabel = 0; - oarc->olabel = 0; - oarc->weight = LatticeWeight::One(); // zero costs. - } - return BestPathIterator(tok->backpointer, cur_t + step_t); -} - -template -bool LatticeFasterOnlineDecoderTpl::GetRawLatticePruned( - Lattice *ofst, bool use_final_probs, BaseFloat beam) const { - typedef LatticeArc Arc; - typedef Arc::StateId StateId; - typedef Arc::Weight Weight; - typedef Arc::Label Label; - - // Note: you can't use the old interface (Decode()) if you want to - // get the lattice with use_final_probs = false. You'd have to do - // InitDecoding() and then AdvanceDecoding(). - if (this->decoding_finalized_ && !use_final_probs) - KALDI_ERR << "You cannot call FinalizeDecoding() and then call " - << "GetRawLattice() with use_final_probs == false"; - - unordered_map final_costs_local; - - const unordered_map &final_costs = - (this->decoding_finalized_ ? this->final_costs_ : final_costs_local); - if (!this->decoding_finalized_ && use_final_probs) - this->ComputeFinalCosts(&final_costs_local, NULL, NULL); - - ofst->DeleteStates(); - // num-frames plus one (since frames are one-based, and we have - // an extra frame for the start-state). - int32 num_frames = this->active_toks_.size() - 1; - KALDI_ASSERT(num_frames > 0); - for (int32 f = 0; f <= num_frames; f++) { - if (this->active_toks_[f].toks == NULL) { - KALDI_WARN << "No tokens active on frame " << f - << ": not producing lattice.\n"; - return false; - } - } - unordered_map tok_map; - std::queue > tok_queue; - // First initialize the queue and states. 
Put the initial state on the queue; - // this is the last token in the list active_toks_[0].toks. - for (Token *tok = this->active_toks_[0].toks; tok != NULL; tok = tok->next) { - if (tok->next == NULL) { - tok_map[tok] = ofst->AddState(); - ofst->SetStart(tok_map[tok]); - std::pair tok_pair(tok, 0); // #frame = 0 - tok_queue.push(tok_pair); - } - } - - // Next create states for "good" tokens - while (!tok_queue.empty()) { - std::pair cur_tok_pair = tok_queue.front(); - tok_queue.pop(); - Token *cur_tok = cur_tok_pair.first; - int32 cur_frame = cur_tok_pair.second; - KALDI_ASSERT(cur_frame >= 0 && cur_frame <= this->cost_offsets_.size()); - - typename unordered_map::const_iterator iter = - tok_map.find(cur_tok); - KALDI_ASSERT(iter != tok_map.end()); - StateId cur_state = iter->second; - - for (ForwardLinkT *l = cur_tok->links; l != NULL; l = l->next) { - Token *next_tok = l->next_tok; - if (next_tok->extra_cost < beam) { - // so both the current and the next token are good; create the arc - int32 next_frame = l->ilabel == 0 ? cur_frame : cur_frame + 1; - StateId nextstate; - if (tok_map.find(next_tok) == tok_map.end()) { - nextstate = tok_map[next_tok] = ofst->AddState(); - tok_queue.push(std::pair(next_tok, next_frame)); - } else { - nextstate = tok_map[next_tok]; - } - BaseFloat cost_offset = - (l->ilabel != 0 ? this->cost_offsets_[cur_frame] : 0); - Arc arc(l->ilabel, l->olabel, - Weight(l->graph_cost, l->acoustic_cost - cost_offset), - nextstate); - ofst->AddArc(cur_state, arc); - } - } - if (cur_frame == num_frames) { - if (use_final_probs && !final_costs.empty()) { - typename unordered_map::const_iterator iter = - final_costs.find(cur_tok); - if (iter != final_costs.end()) - ofst->SetFinal(cur_state, LatticeWeight(iter->second, 0)); - } else { - ofst->SetFinal(cur_state, LatticeWeight::One()); - } - } - } - return (ofst->NumStates() != 0); -} - -// Instantiate the template for the FST types that we'll need. -template class LatticeFasterOnlineDecoderTpl >; -template class LatticeFasterOnlineDecoderTpl >; -template class LatticeFasterOnlineDecoderTpl >; - -} // end namespace kaldi. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/decoder/lattice-faster-online-decoder.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/decoder/lattice-faster-online-decoder.h deleted file mode 100644 index dc50cfa73e6574e9625eda9045c47f674fcbc1e3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/decoder/lattice-faster-online-decoder.h +++ /dev/null @@ -1,131 +0,0 @@ -// decoder/lattice-faster-online-decoder.h - -// Copyright 2009-2013 Microsoft Corporation; Mirko Hannemann; -// 2013-2014 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2018 Zhehuai Chen - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -// see note at the top of lattice-faster-decoder.h, about how to maintain this -// file in sync with lattice-faster-decoder.h - -#ifndef KALDI_DECODER_LATTICE_FASTER_ONLINE_DECODER_H_ -#define KALDI_DECODER_LATTICE_FASTER_ONLINE_DECODER_H_ - -#include "decoder/lattice-faster-decoder.h" - -#include - -namespace kaldi { - -/** LatticeFasterOnlineDecoderTpl is as LatticeFasterDecoderTpl but also - supports an efficient way to get the best path (see the function - BestPathEnd()), which is useful in endpointing and in situations where you - might want to frequently access the best path. - - This is only templated on the FST type, since the Token type is required to - be BackpointerToken. Actually it only makes sense to instantiate - LatticeFasterDecoderTpl with Token == BackpointerToken if you do so - indirectly via this child class. - */ -template -class LatticeFasterOnlineDecoderTpl - : public LatticeFasterDecoderTpl { - public: - using Arc = typename FST::Arc; - using Label = typename Arc::Label; - using StateId = typename Arc::StateId; - using Weight = typename Arc::Weight; - using Token = decoder::BackpointerToken; - using ForwardLinkT = decoder::ForwardLink; - - // Instantiate this class once for each thing you have to decode. - // This version of the constructor does not take ownership of - // 'fst'. - LatticeFasterOnlineDecoderTpl( - const FST &fst, const LatticeFasterDecoderConfig &config, - const std::shared_ptr &context_graph) - : LatticeFasterDecoderTpl(fst, config, context_graph) {} - - // This version of the initializer takes ownership of 'fst', and will delete - // it when this object is destroyed. - LatticeFasterOnlineDecoderTpl(const LatticeFasterDecoderConfig &config, - FST *fst) - : LatticeFasterDecoderTpl(config, fst) {} - - struct BestPathIterator { - void *tok; - int32 frame; - // note, "frame" is the frame-index of the frame you'll get the - // transition-id for next time, if you call TraceBackBestPath on this - // iterator (assuming it's not an epsilon transition). Note that this - // is one less than you might reasonably expect, e.g. it's -1 for - // the nonemitting transitions before the first frame. - BestPathIterator(void *t, int32 f) : tok(t), frame(f) {} - bool Done() const { return tok == NULL; } - }; - - /// Outputs an FST corresponding to the single best path through the lattice. - /// This is quite efficient because it doesn't get the entire raw lattice and - /// find the best path through it; instead, it uses the BestPathEnd and - /// BestPathIterator so it basically traces it back through the lattice. - /// Returns true if result is nonempty (using the return status is deprecated, - /// it will become void). If "use_final_probs" is true AND we reached the - /// final-state of the graph then it will include those as final-probs, else - /// it will treat all final-probs as one. - bool GetBestPath(Lattice *ofst, bool use_final_probs = true) const; - - /// This function does a self-test of GetBestPath(). Returns true on - /// success; returns false and prints a warning on failure. - bool TestGetBestPath(bool use_final_probs = true) const; - - /// This function returns an iterator that can be used to trace back - /// the best path. If use_final_probs == true and at least one final state - /// survived till the end, it will use the final-probs in working out the best - /// final Token, and will output the final cost to *final_cost (if non-NULL), - /// else it will use only the forward likelihood, and will put zero in - /// *final_cost (if non-NULL). 
- /// Requires that NumFramesDecoded() > 0. - BestPathIterator BestPathEnd(bool use_final_probs, - BaseFloat *final_cost = NULL) const; - - /// This function can be used in conjunction with BestPathEnd() to trace back - /// the best path one link at a time (e.g. this can be useful in endpoint - /// detection). By "link" we mean a link in the graph; not all links cross - /// frame boundaries, but each time you see a nonzero ilabel you can interpret - /// that as a frame. The return value is the updated iterator. It outputs - /// the ilabel and olabel, and the (graph and acoustic) weight to the "arc" - /// pointer, while leaving its "nextstate" variable unchanged. - BestPathIterator TraceBackBestPath(BestPathIterator iter, - LatticeArc *arc) const; - - /// Behaves the same as GetRawLattice but only processes tokens whose - /// extra_cost is smaller than the best-cost plus the specified beam. - /// It is only worthwhile to call this function if beam is less than - /// the lattice_beam specified in the config; otherwise, it would - /// return essentially the same thing as GetRawLattice, but more slowly. - bool GetRawLatticePruned(Lattice *ofst, bool use_final_probs, - BaseFloat beam) const; - - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeFasterOnlineDecoderTpl); -}; - -typedef LatticeFasterOnlineDecoderTpl LatticeFasterOnlineDecoder; - -} // end namespace kaldi. - -#endif // KALDI_DECODER_LATTICE_FASTER_ONLINE_DECODER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/fstbin/fstaddselfloops.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/fstbin/fstaddselfloops.cc deleted file mode 100644 index 145bf006f2324136c5fea4a8d0012a7a4126c646..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/fstbin/fstaddselfloops.cc +++ /dev/null @@ -1,100 +0,0 @@ -// fstbin/fstaddselfloops.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/determinize-star.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/parse-options.h" -#include "util/simple-io-funcs.h" - -/* some test examples: - pushd ~/tmpdir - ( echo 3; echo 4) > in.list - ( echo 5; echo 6) > out.list - ( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstaddselfloops in.list out.list - | fstprint ( echo "0 1 0 1"; echo " 0 2 1 0"; echo "1 0"; echo "2 0"; ) | - fstcompile | fstaddselfloops in.list out.list | fstprint -*/ - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Adds self-loops to states of an FST to propagate disambiguation " - "symbols through it\n" - "They are added on each final state and each state with non-epsilon " - "output symbols\n" - "on at least one arc out of the state. Useful in conjunction with " - "predeterminize\n" - "\n" - "Usage: fstaddselfloops in-disambig-list out-disambig-list [in.fst " - "[out.fst] ]\n" - "E.g: fstaddselfloops in.list out.list < in.fst > withloops.fst\n" - "in.list and out.list are lists of integers, one per line, of the\n" - "same length.\n"; - - ParseOptions po(usage); - po.Read(argc, argv); - - if (po.NumArgs() < 2 || po.NumArgs() > 4) { - po.PrintUsage(); - exit(1); - } - - std::string disambig_in_rxfilename = po.GetArg(1), - disambig_out_rxfilename = po.GetArg(2), - fst_in_filename = po.GetOptArg(3), - fst_out_filename = po.GetOptArg(4); - - VectorFst *fst = ReadFstKaldi(fst_in_filename); - - std::vector disambig_in; - if (!ReadIntegerVectorSimple(disambig_in_rxfilename, &disambig_in)) - KALDI_ERR - << "fstaddselfloops: Could not read disambiguation symbols from " - << kaldi::PrintableRxfilename(disambig_in_rxfilename); - - std::vector disambig_out; - if (!ReadIntegerVectorSimple(disambig_out_rxfilename, &disambig_out)) - KALDI_ERR - << "fstaddselfloops: Could not read disambiguation symbols from " - << kaldi::PrintableRxfilename(disambig_out_rxfilename); - - if (disambig_in.size() != disambig_out.size()) - KALDI_ERR - << "fstaddselfloops: mismatch in size of disambiguation symbols"; - - AddSelfLoops(fst, disambig_in, disambig_out); - - WriteFstKaldi(*fst, fst_out_filename); - - delete fst; - - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/fstbin/fstdeterminizestar.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/fstbin/fstdeterminizestar.cc deleted file mode 100644 index e818143025c0fd5d389c28c77715d65711fe63f1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/fstbin/fstdeterminizestar.cc +++ /dev/null @@ -1,114 +0,0 @@ -// fstbin/fstdeterminizestar.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/determinize-star.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/parse-options.h" -#if !defined(_MSC_VER) && !defined(__APPLE__) -#include // Comment this line and the call to signal below if -// it causes compilation problems. It is only to enable a debugging procedure -// when determinization does not terminate. We are disabling this code if -// compiling on Windows because signal.h is not available there, and on -// MacOS due to a problem with in the initial release of Sierra. -#endif - -/* some test examples: - ( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint - ( echo "0 0 1 0"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint - ( echo "0 0 1 0"; echo "0 1 1 0"; echo "0 0" ) | fstcompile | - fstdeterminizestar | fstprint # this last one fails [correctly]: ( echo "0 0 0 - 1"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint - - cd ~/tmpdir - while true; do - fstrand > 1.fst - fstpredeterminize out.lst 1.fst | fstdeterminizestar | fstrmsymbols out.lst - > 2.fst fstequivalent --random=true 1.fst 2.fst || echo "Test failed" echo -n - "." done - - Test of debugging [with non-determinizable input]: - ( echo " 0 0 1 0 1.0"; echo "0 1 1 0"; echo "1 1 1 0 0"; echo "0 2 2 0"; echo - "2"; echo "1" ) | fstcompile | fstdeterminizestar kill -SIGUSR1 [the process-id - of fstdeterminizestar] # prints out a bunch of debugging output showing the - mess it got itself into. -*/ - -bool debug_location = false; -void signal_handler(int) { debug_location = true; } - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Removes epsilons and determinizes in one step\n" - "\n" - "Usage: fstdeterminizestar [in.fst [out.fst] ]\n" - "\n" - "See also: fstdeterminizelog, lattice-determinize\n"; - - float delta = kDelta; - int max_states = -1; - bool use_log = false; - ParseOptions po(usage); - po.Register("use-log", &use_log, "Determinize in log semiring."); - po.Register("delta", &delta, - "Delta value used to determine equivalence of weights."); - po.Register( - "max-states", &max_states, - "Maximum number of states in determinized FST before it will abort."); - po.Read(argc, argv); - - if (po.NumArgs() > 2) { - po.PrintUsage(); - exit(1); - } - - std::string fst_in_str = po.GetOptArg(1), fst_out_str = po.GetOptArg(2); - - // This enables us to get traceback info from determinization that is - // not seeming to terminate. -#if !defined(_MSC_VER) && !defined(__APPLE__) - signal(SIGUSR1, signal_handler); -#endif - // Normal case: just files. - VectorFst *fst = ReadFstKaldi(fst_in_str); - - ArcSort(fst, ILabelCompare()); // improves speed. 
- if (use_log) { - DeterminizeStarInLog(fst, delta, &debug_location, max_states); - } else { - VectorFst det_fst; - DeterminizeStar(*fst, &det_fst, delta, &debug_location, max_states); - *fst = det_fst; // will do shallow copy and then det_fst goes - // out of scope anyway. - } - WriteFstKaldi(*fst, fst_out_str); - delete fst; - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/fstbin/fstisstochastic.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/fstbin/fstisstochastic.cc deleted file mode 100644 index 468ed0daa7d37cb9a25cf25264f86e48e137b975..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/fstbin/fstisstochastic.cc +++ /dev/null @@ -1,91 +0,0 @@ -// fstbin/fstisstochastic.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/kaldi-io.h" -#include "util/parse-options.h" - -// e.g. of test: -// echo " 0 0" | fstcompile | fstisstochastic -// should return 0 and print "0 0" [meaning, min and -// max weight are one = exp(0)] -// echo " 0 1" | fstcompile | fstisstochastic -// should return 1, not stochastic, and print 1 1 -// (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo "1 0" ) | -// fstcompile | fstisstochastic should return 0, stochastic; it prints "0 -// -1.78e-07" for me (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo -// "1 0" ) | fstcompile | fstisstochastic --test-in-log=false should return 1, -// not stochastic in tropical; it prints "0 0.693147" for me (echo "0 0 0 0 0 "; -// echo "0 1 0 0 0 "; echo "1 0" ) | fstcompile | fstisstochastic -// --test-in-log=false should return 0, stochastic in tropical; it prints "0 0" -// for me (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo "1 0" ) | -// fstcompile | fstisstochastic --test-in-log=false --delta=1 returns 0 even -// though not stochastic because we gave it an absurdly large delta. 
- -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Checks whether an FST is stochastic and exits with success if so.\n" - "Prints out maximum error (in log units).\n" - "\n" - "Usage: fstisstochastic [ in.fst ]\n"; - - float delta = 0.01; - bool test_in_log = true; - - ParseOptions po(usage); - po.Register("delta", &delta, "Maximum error to accept."); - po.Register("test-in-log", &test_in_log, - "Test stochasticity in log semiring."); - po.Read(argc, argv); - - if (po.NumArgs() > 1) { - po.PrintUsage(); - exit(1); - } - - std::string fst_in_filename = po.GetOptArg(1); - - Fst *fst = ReadFstKaldiGeneric(fst_in_filename); - - bool ans; - StdArc::Weight min, max; - if (test_in_log) - ans = IsStochasticFstInLog(*fst, delta, &min, &max); - else - ans = IsStochasticFst(*fst, delta, &min, &max); - - std::cout << min.Value() << " " << max.Value() << '\n'; - delete fst; - if (ans) - return 0; // success; - else - return 1; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/fstbin/fstminimizeencoded.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/fstbin/fstminimizeencoded.cc deleted file mode 100644 index ae9ca6d75abe67d9a195572dd6d91ec3c7b44851..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/fstbin/fstminimizeencoded.cc +++ /dev/null @@ -1,74 +0,0 @@ -// fstbin/fstminimizeencoded.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/determinize-star.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/kaldi-io.h" -#include "util/parse-options.h" -#include "util/text-utils.h" - -/* some test examples: - ( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstminimizeencoded | fstprint - ( echo "0 1 0 0"; echo " 0 2 0 0"; echo "1 0"; echo "2 0"; ) | fstcompile | - fstminimizeencoded | fstprint -*/ - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Minimizes FST after encoding [similar to fstminimize, but no " - "weight-pushing]\n" - "\n" - "Usage: fstminimizeencoded [in.fst [out.fst] ]\n"; - - float delta = kDelta; - ParseOptions po(usage); - po.Register("delta", &delta, - "Delta likelihood used for quantization of weights"); - po.Read(argc, argv); - - if (po.NumArgs() > 2) { - po.PrintUsage(); - exit(1); - } - - std::string fst_in_filename = po.GetOptArg(1), - fst_out_filename = po.GetOptArg(2); - - VectorFst *fst = ReadFstKaldi(fst_in_filename); - - MinimizeEncoded(fst, delta); - - WriteFstKaldi(*fst, fst_out_filename); - - delete fst; - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/fstbin/fsttablecompose.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/fstbin/fsttablecompose.cc deleted file mode 100644 index bdd476da78b8cb8823c60abf33b5278e05bfd92c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/fstbin/fsttablecompose.cc +++ /dev/null @@ -1,133 +0,0 @@ -// fstbin/fsttablecompose.cc - -// Copyright 2009-2011 Microsoft Corporation -// 2013 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "fstext/table-matcher.h" -#include "util/parse-options.h" - -/* - cd ~/tmpdir - while true; do - fstrand | fstarcsort --sort_type=olabel > 1.fst; fstrand | fstarcsort - > 2.fst fstcompose 1.fst 2.fst > 3a.fst fsttablecompose 1.fst 2.fst > 3b.fst - fstequivalent --random=true 3a.fst 3b.fst || echo "Test failed" - echo -n "." - done - -*/ - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - /* - fsttablecompose should always give equivalent results to compose, - but it is more efficient for certain kinds of inputs. 
- In particular, it is useful when, say, the left FST has states - that typically either have epsilon olabels, or - one transition out for each of the possible symbols (as the - olabel). The same with the input symbols of the right-hand FST - is possible. - */ - - const char *usage = - "Composition algorithm [between two FSTs of standard type, in " - "tropical\n" - "semiring] that is more efficient for certain cases-- in particular,\n" - "where one of the FSTs (the left one, if --match-side=left) has large\n" - "out-degree\n" - "\n" - "Usage: fsttablecompose (fst1-rxfilename|fst1-rspecifier) " - "(fst2-rxfilename|fst2-rspecifier) [(out-rxfilename|out-rspecifier)]\n"; - - ParseOptions po(usage); - - TableComposeOptions opts; - std::string match_side = "left"; - std::string compose_filter = "sequence"; - - po.Register("connect", &opts.connect, "If true, trim FST before output."); - po.Register("match-side", &match_side, - "Side of composition to do table " - "match, one of: \"left\" or \"right\"."); - po.Register("compose-filter", &compose_filter, - "Composition filter to use, " - "one of: \"alt_sequence\", \"auto\", \"match\", \"sequence\""); - - po.Read(argc, argv); - - if (match_side == "left") { - opts.table_match_type = MATCH_OUTPUT; - } else if (match_side == "right") { - opts.table_match_type = MATCH_INPUT; - } else { - KALDI_ERR << "Invalid match-side option: " << match_side; - } - - if (compose_filter == "alt_sequence") { - opts.filter_type = ALT_SEQUENCE_FILTER; - } else if (compose_filter == "auto") { - opts.filter_type = AUTO_FILTER; - } else if (compose_filter == "match") { - opts.filter_type = MATCH_FILTER; - } else if (compose_filter == "sequence") { - opts.filter_type = SEQUENCE_FILTER; - } else { - KALDI_ERR << "Invalid compose-filter option: " << compose_filter; - } - - if (po.NumArgs() < 2 || po.NumArgs() > 3) { - po.PrintUsage(); - exit(1); - } - - std::string fst1_in_str = po.GetArg(1), fst2_in_str = po.GetArg(2), - fst_out_str = po.GetOptArg(3); - - VectorFst *fst1 = ReadFstKaldi(fst1_in_str); - - VectorFst *fst2 = ReadFstKaldi(fst2_in_str); - - // Checks if is olabel sorted and is ilabel sorted. - if (fst1->Properties(fst::kOLabelSorted, true) == 0) { - KALDI_WARN << "The first FST is not olabel sorted."; - } - if (fst2->Properties(fst::kILabelSorted, true) == 0) { - KALDI_WARN << "The second FST is not ilabel sorted."; - } - - VectorFst composed_fst; - - TableCompose(*fst1, *fst2, &composed_fst, opts); - - delete fst1; - delete fst2; - - WriteFstKaldi(composed_fst, fst_out_str); - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/fstext/determinize-lattice-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/fstext/determinize-lattice-inl.h deleted file mode 100644 index 0bfbc8f41c7e439b1fac037f60490e04fdcbdd8b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/fstext/determinize-lattice-inl.h +++ /dev/null @@ -1,1357 +0,0 @@ -// fstext/determinize-lattice-inl.h - -// Copyright 2009-2012 Microsoft Corporation -// 2012-2013 Johns Hopkins University (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_FSTEXT_DETERMINIZE_LATTICE_INL_H_ -#define KALDI_FSTEXT_DETERMINIZE_LATTICE_INL_H_ -// Do not include this file directly. It is included by determinize-lattice.h - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace fst { - -// This class maps back and forth from/to integer id's to sequences of strings. -// used in determinization algorithm. It is constructed in such a way that -// finding the string-id of the successor of (string, next-label) has constant -// time. - -// Note: class IntType, typically int32, is the type of the element in the -// string (typically a template argument of the CompactLatticeWeightTpl). - -template -class LatticeStringRepository { - public: - struct Entry { - const Entry *parent; // NULL for empty string. - IntType i; - inline bool operator==(const Entry &other) const { - return (parent == other.parent && i == other.i); - } - Entry() {} - Entry(const Entry &e) : parent(e.parent), i(e.i) {} - }; - // Note: all Entry* pointers returned in function calls are - // owned by the repository itself, not by the caller! - - // Interface guarantees empty string is NULL. - inline const Entry *EmptyString() { return NULL; } - - // Returns string of "parent" with i appended. Pointer - // owned by repository - const Entry *Successor(const Entry *parent, IntType i) { - new_entry_->parent = parent; - new_entry_->i = i; - - std::pair pr = set_.insert(new_entry_); - if (pr.second) { // Was successfully inserted (was not there). We need to - // replace the element we inserted, which resides on the - // stack, with one from the heap. - const Entry *ans = new_entry_; - new_entry_ = new Entry(); - return ans; - } else { // Was not inserted because an equivalent Entry already - // existed. - return *pr.first; - } - } - - const Entry *Concatenate(const Entry *a, const Entry *b) { - if (a == NULL) - return b; - else if (b == NULL) - return a; - std::vector v; - ConvertToVector(b, &v); - const Entry *ans = a; - for (size_t i = 0; i < v.size(); i++) ans = Successor(ans, v[i]); - return ans; - } - const Entry *CommonPrefix(const Entry *a, const Entry *b) { - std::vector a_vec, b_vec; - ConvertToVector(a, &a_vec); - ConvertToVector(b, &b_vec); - const Entry *ans = NULL; - for (size_t i = 0; - i < a_vec.size() && i < b_vec.size() && a_vec[i] == b_vec[i]; i++) - ans = Successor(ans, a_vec[i]); - return ans; - } - - // removes any elements from b that are not part of - // a common prefix with a. - void ReduceToCommonPrefix(const Entry *a, std::vector *b) { - size_t a_size = Size(a), b_size = b->size(); - while (a_size > b_size) { - a = a->parent; - a_size--; - } - if (b_size > a_size) b_size = a_size; - typename std::vector::iterator b_begin = b->begin(); - while (a_size != 0) { - if (a->i != *(b_begin + a_size - 1)) b_size = a_size - 1; - a = a->parent; - a_size--; - } - if (b_size != b->size()) b->resize(b_size); - } - - // removes the first n elements of a. 
- const Entry *RemovePrefix(const Entry *a, size_t n) { - if (n == 0) return a; - std::vector a_vec; - ConvertToVector(a, &a_vec); - assert(a_vec.size() >= n); - const Entry *ans = NULL; - for (size_t i = n; i < a_vec.size(); i++) ans = Successor(ans, a_vec[i]); - return ans; - } - - // Returns true if a is a prefix of b. If a is prefix of b, - // time taken is |b| - |a|. Else, time taken is |b|. - bool IsPrefixOf(const Entry *a, const Entry *b) const { - if (a == NULL) return true; // empty string prefix of all. - if (a == b) return true; - if (b == NULL) return false; - return IsPrefixOf(a, b->parent); - } - - inline size_t Size(const Entry *entry) const { - size_t ans = 0; - while (entry != NULL) { - ans++; - entry = entry->parent; - } - return ans; - } - - void ConvertToVector(const Entry *entry, std::vector *out) const { - size_t length = Size(entry); - out->resize(length); - if (entry != NULL) { - typename std::vector::reverse_iterator iter = out->rbegin(); - while (entry != NULL) { - *iter = entry->i; - entry = entry->parent; - ++iter; - } - } - } - - const Entry *ConvertFromVector(const std::vector &vec) { - const Entry *e = NULL; - for (size_t i = 0; i < vec.size(); i++) e = Successor(e, vec[i]); - return e; - } - - LatticeStringRepository() { new_entry_ = new Entry; } - - void Destroy() { - for (typename SetType::iterator iter = set_.begin(); iter != set_.end(); - ++iter) - delete *iter; - SetType tmp; - tmp.swap(set_); - if (new_entry_) { - delete new_entry_; - new_entry_ = NULL; - } - } - - // Rebuild will rebuild this object, guaranteeing only - // to preserve the Entry values that are in the vector pointed - // to (this list does not have to be unique). The point of - // this is to save memory. - void Rebuild(const std::vector &to_keep) { - SetType tmp_set; - for (typename std::vector::const_iterator iter = - to_keep.begin(); - iter != to_keep.end(); ++iter) - RebuildHelper(*iter, &tmp_set); - // Now delete all elems not in tmp_set. - for (typename SetType::iterator iter = set_.begin(); iter != set_.end(); - ++iter) { - if (tmp_set.count(*iter) == 0) - delete (*iter); // delete the Entry; not needed. - } - set_.swap(tmp_set); - } - - ~LatticeStringRepository() { Destroy(); } - int32 MemSize() const { - return set_.size() * sizeof(Entry) * 2; // this is a lower bound - // on the size this structure might take. - } - - private: - class EntryKey { // Hash function object. - public: - inline size_t operator()(const Entry *entry) const { - size_t prime = 49109; - return static_cast(entry->i) + - prime * reinterpret_cast(entry->parent); - } - }; - class EntryEqual { - public: - inline bool operator()(const Entry *e1, const Entry *e2) const { - return (*e1 == *e2); - } - }; - typedef std::unordered_set SetType; - - void RebuildHelper(const Entry *to_add, SetType *tmp_set) { - while (true) { - if (to_add == NULL) return; - typename SetType::iterator iter = tmp_set->find(to_add); - if (iter == tmp_set->end()) { // not in tmp_set. - tmp_set->insert(to_add); - to_add = to_add->parent; // and loop. - } else { - return; - } - } - } - - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeStringRepository); - Entry *new_entry_; // We always have a pre-allocated Entry ready to use, - // to avoid unnecessary news and deletes. - SetType set_; -}; - -// class LatticeDeterminizer is templated on the same types that -// CompactLatticeWeight is templated on: the base weight (Weight), typically -// LatticeWeightTpl etc. but could also be e.g. 
TropicalWeight, and the -// IntType, typically int32, used for the output symbols in the compact -// representation of strings [note: the output symbols would usually be -// p.d.f. id's in the anticipated use of this code] It has a special requirement -// on the Weight type: that there should be a Compare function on the weights -// such that Compare(w1, w2) returns -1 if w1 < w2, 0 if w1 == w2, and +1 if w1 -// > w2. This requires that there be a total order on the weights. - -template -class LatticeDeterminizer { - public: - // Output to Gallic acceptor (so the strings go on weights, and there is a 1-1 - // correspondence between our states and the states in ofst. If destroy == - // true, release memory as we go (but we cannot output again). - - typedef CompactLatticeWeightTpl CompactWeight; - typedef ArcTpl - CompactArc; // arc in compact, acceptor form of lattice - typedef ArcTpl Arc; // arc in non-compact version of lattice - - // Output to standard FST with CompactWeightTpl as its weight type - // (the weight stores the original output-symbol strings). If destroy == - // true, release memory as we go (but we cannot output again). - void Output(MutableFst *ofst, bool destroy = true) { - assert(determinized_); - typedef typename Arc::StateId StateId; - StateId nStates = static_cast(output_arcs_.size()); - if (destroy) FreeMostMemory(); - ofst->DeleteStates(); - ofst->SetStart(kNoStateId); - if (nStates == 0) { - return; - } - for (StateId s = 0; s < nStates; s++) { - OutputStateId news = ofst->AddState(); - assert(news == s); - } - ofst->SetStart(0); - // now process transitions. - for (StateId this_state = 0; this_state < nStates; this_state++) { - std::vector &this_vec(output_arcs_[this_state]); - typename std::vector::const_iterator iter = this_vec.begin(), - end = this_vec.end(); - - for (; iter != end; ++iter) { - const TempArc &temp_arc(*iter); - CompactArc new_arc; - std::vector is not treated as epsilon, create a common end state for - // all transitions accepting the , since they do not back off. This small - // optimization saves about 2% states in an average grammar. - if (sub_eps_ == 0) { - eos_state_ = fst_->AddState(); - fst_->SetFinal(eos_state_, 0); - } -} - -template -void ArpaLmCompilerImpl::ConsumeNGram(const NGram& ngram, - bool is_highest) { - // Generally, we do the following. Suppose we are adding an n-gram "A B - // C". Then find the node for "A B", add a new node for "A B C", and connect - // them with the arc accepting "C" with the specified weight. Also, add a - // backoff arc from the new "A B C" node to its backoff state "B C". - // - // Two notable exceptions are the highest order n-grams, and final n-grams. - // - // When adding a highest order n-gram (e. g., our "A B C" is in a 3-gram LM), - // the following optimization is performed. There is no point adding a node - // for "A B C" with a "C" arc from "A B", since there will be no other - // arcs ingoing to this node, and an epsilon backoff arc into the backoff - // model "B C", with the weight of \bar{1}. To save a node, create an arc - // accepting "C" directly from "A B" to "B C". This saves as many nodes - // as there are the highest order n-grams, which is typically about half - // the size of a large 3-gram model. - // - // Indeed, this does not apply to n-grams ending in EOS, since they do not - // back off. These are special, as they do not have a back-off state, and - // the node for "(..anything..) " is always final. 
These are handled - // in one of the two possible ways, If symbols and are being - // replaced by epsilons, neither node nor arc is created, and the logprob - // of the n-gram is applied to its source node as final weight. If and - // are preserved, then a special final node for is allocated and - // used as the destination of the "" acceptor arc. - HistKey heads(ngram.words.begin(), ngram.words.end() - 1); - typename HistoryMap::iterator source_it = history_.find(heads); - if (source_it == history_.end()) { - // There was no "A B", therefore the probability of "A B C" is zero. - // Print a warning and discard current n-gram. - if (parent_->ShouldWarn()) - KALDI_WARN << parent_->LineReference() - << " skipped: no parent (n-1)-gram exists"; - return; - } - - StateId source = source_it->second; - StateId dest; - Symbol sym = ngram.words.back(); - float weight = -ngram.logprob; - if (sym == sub_eps_ || sym == 0) { - KALDI_ERR << " or disambiguation symbol " << sym - << "found in the ARPA file. "; - } - if (sym == eos_symbol_) { - if (sub_eps_ == 0) { - // Keep as a real symbol when not substituting. - dest = eos_state_; - } else { - // Treat as if it was epsilon: mark source final, with the weight - // of the n-gram. - fst_->SetFinal(source, weight); - return; - } - } else { - // For the highest order n-gram, this may find an existing state, for - // non-highest, will create one (unless there are duplicate n-grams - // in the grammar, which cannot be reliably detected if highest order, - // so we better do not do that at all). - dest = AddStateWithBackoff( - HistKey(ngram.words.begin() + (is_highest ? 1 : 0), ngram.words.end()), - -ngram.backoff); - } - - if (sym == bos_symbol_) { - weight = 0; // Accepting is always free. - if (sub_eps_ == 0) { - // is as a real symbol, only accepted in the start state. - source = fst_->AddState(); - fst_->SetStart(source); - } else { - // The new state for unigram history *is* the start state. - fst_->SetStart(dest); - return; - } - } - - // Add arc from source to dest, whichever way it was found. - fst_->AddArc(source, fst::StdArc(sym, sym, weight, dest)); - return; -} - -// Find or create a new state for n-gram defined by key, and ensure it has a -// backoff transition. The key is either the current n-gram for all but -// highest orders, or the tails of the n-gram for the highest order. The -// latter arises from the chain-collapsing optimization described above. -template -StateId ArpaLmCompilerImpl::AddStateWithBackoff(HistKey key, - float backoff) { - typename HistoryMap::iterator dest_it = history_.find(key); - if (dest_it != history_.end()) { - // Found an existing state in the history map. Invariant: if the state in - // the map, then its backoff arc is in the FST. We are done. - return dest_it->second; - } - // Otherwise create a new state and its backoff arc, and register in the map. - StateId dest = fst_->AddState(); - history_[key] = dest; - CreateBackoff(key.Tails(), dest, backoff); - return dest; -} - -// Create a backoff arc for a state. Key is a backoff destination that may or -// may not exist. When the destination is not found, naturally fall back to -// the lower order model, and all the way down until one is found (since the -// 0-gram model is always present, the search is guaranteed to terminate). 
-template -inline void ArpaLmCompilerImpl::CreateBackoff(HistKey key, - StateId state, - float weight) { - typename HistoryMap::iterator dest_it = history_.find(key); - while (dest_it == history_.end()) { - key = key.Tails(); - dest_it = history_.find(key); - } - - // The arc should transduce either or #0 to , depending on the - // epsilon substitution mode. This is the only case when input and output - // label may differ. - fst_->AddArc(state, fst::StdArc(sub_eps_, 0, weight, dest_it->second)); -} - -ArpaLmCompiler::~ArpaLmCompiler() { - if (impl_ != NULL) delete impl_; -} - -void ArpaLmCompiler::HeaderAvailable() { - KALDI_ASSERT(impl_ == NULL); - // Use optimized implementation if the grammar is 4-gram or less, and the - // maximum attained symbol id will fit into the optimized range. - int64 max_symbol = 0; - if (Symbols() != NULL) max_symbol = Symbols()->AvailableKey() - 1; - // If augmenting the symbol table, assume the worst case when all words in - // the model being read are novel. - if (Options().oov_handling == ArpaParseOptions::kAddToSymbols) - max_symbol += NgramCounts()[0]; - - if (NgramCounts().size() <= 4 && max_symbol < OptimizedHistKey::kMaxData) { - impl_ = new ArpaLmCompilerImpl(this, &fst_, sub_eps_); - } else { - impl_ = new ArpaLmCompilerImpl(this, &fst_, sub_eps_); - KALDI_LOG << "Reverting to slower state tracking because model is large: " - << NgramCounts().size() << "-gram with symbols up to " - << max_symbol; - } -} - -void ArpaLmCompiler::ConsumeNGram(const NGram& ngram) { - // is invalid in tails, in heads of an n-gram. - for (int i = 0; i < ngram.words.size(); ++i) { - if ((i > 0 && ngram.words[i] == Options().bos_symbol) || - (i + 1 < ngram.words.size() && - ngram.words[i] == Options().eos_symbol)) { - if (ShouldWarn()) - KALDI_WARN << LineReference() - << " skipped: n-gram has invalid BOS/EOS placement"; - return; - } - } - - bool is_highest = ngram.words.size() == NgramCounts().size(); - impl_->ConsumeNGram(ngram, is_highest); -} - -void ArpaLmCompiler::RemoveRedundantStates() { - fst::StdArc::Label backoff_symbol = sub_eps_; - if (backoff_symbol == 0) { - // The method of removing redundant states implemented in this function - // leads to slow determinization of L o G when people use the older style of - // usage of arpa2fst where the --disambig-symbol option was not specified. - // The issue seems to be that it creates a non-deterministic FST, while G is - // supposed to be deterministic. By 'return'ing below, we just disable this - // method if people were using an older script. This method isn't really - // that consequential anyway, and people will move to the newer-style - // scripts (see current utils/format_lm.sh), so this isn't much of a - // problem. - return; - } - - fst::StdArc::StateId num_states = fst_.NumStates(); - - // replace the #0 symbols on the input of arcs out of redundant states (states - // that are not final and have only a backoff arc leaving them), with . 
- for (fst::StdArc::StateId state = 0; state < num_states; state++) { - if (fst_.NumArcs(state) == 1 && - fst_.Final(state) == fst::TropicalWeight::Zero()) { - fst::MutableArcIterator iter(&fst_, state); - fst::StdArc arc = iter.Value(); - if (arc.ilabel == backoff_symbol) { - arc.ilabel = 0; - iter.SetValue(arc); - } - } - } - - // we could call fst::RemoveEps, and it would have the same effect in normal - // cases, where backoff_symbol != 0 and there are no epsilons in unexpected - // places, but RemoveEpsLocal is a bit safer in case something weird is going - // on; it guarantees not to blow up the FST. - fst::RemoveEpsLocal(&fst_); - KALDI_LOG << "Reduced num-states from " << num_states << " to " - << fst_.NumStates(); -} - -void ArpaLmCompiler::Check() const { - if (fst_.Start() == fst::kNoStateId) { - KALDI_ERR << "Arpa file did not contain the beginning-of-sentence symbol " - << Symbols()->Find(Options().bos_symbol) << "."; - } -} - -void ArpaLmCompiler::ReadComplete() { - fst_.SetInputSymbols(Symbols()); - fst_.SetOutputSymbols(Symbols()); - RemoveRedundantStates(); - Check(); -} - -} // namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/lm/arpa-lm-compiler.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/lm/arpa-lm-compiler.h deleted file mode 100644 index 069c71bd0e6f5acf0b9521ec1ef46796eb31fe4d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/lm/arpa-lm-compiler.h +++ /dev/null @@ -1,63 +0,0 @@ -// lm/arpa-lm-compiler.h - -// Copyright 2009-2011 Gilles Boulianne -// Copyright 2016 Smart Action LLC (kkm) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_LM_ARPA_LM_COMPILER_H_ -#define KALDI_LM_ARPA_LM_COMPILER_H_ - -#include - -#include "lm/arpa-file-parser.h" - -namespace kaldi { - -class ArpaLmCompilerImplInterface; - -class ArpaLmCompiler : public ArpaFileParser { - public: - ArpaLmCompiler(const ArpaParseOptions& options, int sub_eps, - fst::SymbolTable* symbols) - : ArpaFileParser(options, symbols), sub_eps_(sub_eps), impl_(NULL) {} - ~ArpaLmCompiler(); - - const fst::StdVectorFst& Fst() const { return fst_; } - fst::StdVectorFst* MutableFst() { return &fst_; } - - protected: - // ArpaFileParser overrides. - virtual void HeaderAvailable(); - virtual void ConsumeNGram(const NGram& ngram); - virtual void ReadComplete(); - - private: - // this function removes states that only have a backoff arc coming - // out of them. - void RemoveRedundantStates(); - void Check() const; - - int sub_eps_; - ArpaLmCompilerImplInterface* impl_; // Owned. 
- fst::StdVectorFst fst_; - template - friend class ArpaLmCompilerImpl; -}; - -} // namespace kaldi - -#endif // KALDI_LM_ARPA_LM_COMPILER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/lmbin/arpa2fst.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/lmbin/arpa2fst.cc deleted file mode 100644 index 881a45c5b37810247ea38dae56237f59b5554a9c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/lmbin/arpa2fst.cc +++ /dev/null @@ -1,145 +0,0 @@ -// bin/arpa2fst.cc -// -// Copyright 2009-2011 Gilles Boulianne. -// -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABILITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "lm/arpa-lm-compiler.h" -#include "util/kaldi-io.h" -#include "util/parse-options.h" - -int main(int argc, char *argv[]) { - using namespace kaldi; // NOLINT - try { - const char *usage = - "Convert an ARPA format language model into an FST\n" - "Usage: arpa2fst [opts] \n" - " e.g.: arpa2fst --disambig-symbol=#0 --read-symbol-table=" - "data/lang/words.txt lm/input.arpa G.fst\n\n" - "Note: When called without switches, the output G.fst will contain\n" - "an embedded symbol table. This is compatible with the way a previous\n" - "version of arpa2fst worked.\n"; - - ParseOptions po(usage); - - ArpaParseOptions options; - options.Register(&po); - - // Option flags. - std::string bos_symbol = ""; - std::string eos_symbol = ""; - std::string disambig_symbol; - std::string read_syms_filename; - std::string write_syms_filename; - bool keep_symbols = false; - bool ilabel_sort = true; - - po.Register("bos-symbol", &bos_symbol, "Beginning of sentence symbol"); - po.Register("eos-symbol", &eos_symbol, "End of sentence symbol"); - po.Register("disambig-symbol", &disambig_symbol, - "Disambiguator. If provided (e. g. #0), used on input side of " - "backoff links, and and are replaced with epsilons"); - po.Register("read-symbol-table", &read_syms_filename, - "Use existing symbol table"); - po.Register("write-symbol-table", &write_syms_filename, - "Write generated symbol table to a file"); - po.Register("keep-symbols", &keep_symbols, - "Store symbol table with FST. Symbols always saved to FST if " - "symbol tables are neither read or written (otherwise symbols " - "would be lost entirely)"); - po.Register("ilabel-sort", &ilabel_sort, "Ilabel-sort the output FST"); - - po.Read(argc, argv); - - if (po.NumArgs() != 1 && po.NumArgs() != 2) { - po.PrintUsage(); - exit(1); - } - std::string arpa_rxfilename = po.GetArg(1), - fst_wxfilename = po.GetOptArg(2); - - int64 disambig_symbol_id = 0; - - fst::SymbolTable *symbols; - if (!read_syms_filename.empty()) { - // Use existing symbols. Required symbols must be in the table. 
- kaldi::Input kisym(read_syms_filename); - symbols = fst::SymbolTable::ReadText( - kisym.Stream(), PrintableWxfilename(read_syms_filename)); - if (symbols == NULL) - KALDI_ERR << "Could not read symbol table from file " - << read_syms_filename; - - options.oov_handling = ArpaParseOptions::kSkipNGram; - if (!disambig_symbol.empty()) { - disambig_symbol_id = symbols->Find(disambig_symbol); - if (disambig_symbol_id == -1) // fst::kNoSymbol - KALDI_ERR << "Symbol table " << read_syms_filename - << " has no symbol for " << disambig_symbol; - } - } else { - // Create a new symbol table and populate it from ARPA file. - symbols = new fst::SymbolTable(PrintableWxfilename(fst_wxfilename)); - options.oov_handling = ArpaParseOptions::kAddToSymbols; - symbols->AddSymbol("", 0); - if (!disambig_symbol.empty()) { - disambig_symbol_id = symbols->AddSymbol(disambig_symbol); - } - } - - // Add or use existing BOS and EOS. - options.bos_symbol = symbols->AddSymbol(bos_symbol); - options.eos_symbol = symbols->AddSymbol(eos_symbol); - - // If producing new (not reading existing) symbols and not saving them, - // need to keep symbols with FST, otherwise they would be lost. - if (read_syms_filename.empty() && write_syms_filename.empty()) - keep_symbols = true; - - // Actually compile LM. - KALDI_ASSERT(symbols != NULL); - ArpaLmCompiler lm_compiler(options, disambig_symbol_id, symbols); - { - Input ki(arpa_rxfilename); - lm_compiler.Read(ki.Stream()); - } - - // Sort the FST in-place if requested by options. - if (ilabel_sort) { - fst::ArcSort(lm_compiler.MutableFst(), fst::StdILabelCompare()); - } - - // Write symbols if requested. - if (!write_syms_filename.empty()) { - kaldi::Output kosym(write_syms_filename, false); - symbols->WriteText(kosym.Stream()); - } - - // Write LM FST. - bool write_binary = true, write_header = false; - kaldi::Output kofst(fst_wxfilename, write_binary, write_header); - fst::FstWriteOptions wopts(PrintableWxfilename(fst_wxfilename)); - wopts.write_isymbols = wopts.write_osymbols = keep_symbols; - lm_compiler.Fst().Write(kofst.Stream(), wopts); - - delete symbols; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/basic-filebuf.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/basic-filebuf.h deleted file mode 100644 index 22ec891064d5955c8b1d255e0d34781a9f505a38..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/basic-filebuf.h +++ /dev/null @@ -1,952 +0,0 @@ -/////////////////////////////////////////////////////////////////////////////// -// This is a modified version of the std::basic_filebuf from libc++ -// Copyright 20XX LLVM -// (http://libcxx.llvm.org/). -// It allows one to create basic_filebuf from an existing FILE* handle or file -// descriptor. -// -// This file is dual licensed under the MIT and the University of Illinois Open -// Source License licenses. See LICENSE.TXT for details (included at the -// bottom). 
-/////////////////////////////////////////////////////////////////////////////// -#ifndef KALDI_UTIL_BASIC_FILEBUF_H_ -#define KALDI_UTIL_BASIC_FILEBUF_H_ - -/////////////////////////////////////////////////////////////////////////////// -#include -#include -#include -#include -#include -#include -#include - -/////////////////////////////////////////////////////////////////////////////// -namespace kaldi { -/////////////////////////////////////////////////////////////////////////////// -template > -class basic_filebuf : public std::basic_streambuf { - public: - typedef CharT char_type; - typedef Traits traits_type; - typedef typename traits_type::int_type int_type; - typedef typename traits_type::pos_type pos_type; - typedef typename traits_type::off_type off_type; - typedef typename traits_type::state_type state_type; - - basic_filebuf(); - basic_filebuf(basic_filebuf&& rhs); - virtual ~basic_filebuf(); - - basic_filebuf& operator=(basic_filebuf&& rhs); - void swap(basic_filebuf& rhs); - - bool is_open() const; - basic_filebuf* open(const char* s, std::ios_base::openmode mode); - basic_filebuf* open(const std::string& s, std::ios_base::openmode mode); - basic_filebuf* open(int fd, std::ios_base::openmode mode); - basic_filebuf* open(FILE* f, std::ios_base::openmode mode); - basic_filebuf* close(); - - FILE* file() { return this->_M_file; } - int fd() { return fileno(this->_M_file); } - - protected: - int_type underflow() override; - int_type pbackfail(int_type c = traits_type::eof()) override; - int_type overflow(int_type c = traits_type::eof()) override; - std::basic_streambuf* setbuf( - char_type* s, std::streamsize n) override; - pos_type seekoff(off_type off, std::ios_base::seekdir way, - std::ios_base::openmode wch = std::ios_base::in | - std::ios_base::out) override; - pos_type seekpos(pos_type sp, - std::ios_base::openmode wch = std::ios_base::in | - std::ios_base::out) override; - int sync() override; - void imbue(const std::locale& loc) override; - - protected: - char* _M_extbuf; - const char* _M_extbufnext; - const char* _M_extbufend; - char _M_extbuf_min[8]; - size_t _M_ebs; - char_type* _M_intbuf; - size_t _M_ibs; - FILE* _M_file; - const std::codecvt* _M_cv; - state_type _M_st; - state_type _M_st_last; - std::ios_base::openmode _M_om; - std::ios_base::openmode _M_cm; - bool _M_owns_eb; - bool _M_owns_ib; - bool _M_always_noconv; - - const char* _M_get_mode(std::ios_base::openmode mode); - bool _M_read_mode(); - void _M_write_mode(); -}; - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf::basic_filebuf() - : _M_extbuf(nullptr), - _M_extbufnext(nullptr), - _M_extbufend(nullptr), - _M_ebs(0), - _M_intbuf(nullptr), - _M_ibs(0), - _M_file(nullptr), - _M_cv(nullptr), - _M_st(), - _M_st_last(), - _M_om(std::ios_base::openmode(0)), - _M_cm(std::ios_base::openmode(0)), - _M_owns_eb(false), - _M_owns_ib(false), - _M_always_noconv(false) { - if (std::has_facet >( - this->getloc())) { - _M_cv = &std::use_facet >( - this->getloc()); - _M_always_noconv = _M_cv->always_noconv(); - } - setbuf(0, 4096); -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf::basic_filebuf(basic_filebuf&& rhs) - : std::basic_streambuf(rhs) { - if (rhs._M_extbuf == rhs._M_extbuf_min) { - _M_extbuf = _M_extbuf_min; - _M_extbufnext = _M_extbuf + (rhs._M_extbufnext - rhs._M_extbuf); - _M_extbufend = _M_extbuf + (rhs._M_extbufend - rhs._M_extbuf); - } else { - _M_extbuf = rhs._M_extbuf; - _M_extbufnext = 
rhs._M_extbufnext; - _M_extbufend = rhs._M_extbufend; - } - _M_ebs = rhs._M_ebs; - _M_intbuf = rhs._M_intbuf; - _M_ibs = rhs._M_ibs; - _M_file = rhs._M_file; - _M_cv = rhs._M_cv; - _M_st = rhs._M_st; - _M_st_last = rhs._M_st_last; - _M_om = rhs._M_om; - _M_cm = rhs._M_cm; - _M_owns_eb = rhs._M_owns_eb; - _M_owns_ib = rhs._M_owns_ib; - _M_always_noconv = rhs._M_always_noconv; - if (rhs.pbase()) { - if (rhs.pbase() == rhs._M_intbuf) - this->setp(_M_intbuf, _M_intbuf + (rhs.epptr() - rhs.pbase())); - else - this->setp(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + - (rhs.epptr() - rhs.pbase())); - this->pbump(rhs.pptr() - rhs.pbase()); - } else if (rhs.eback()) { - if (rhs.eback() == rhs._M_intbuf) - this->setg(_M_intbuf, _M_intbuf + (rhs.gptr() - rhs.eback()), - _M_intbuf + (rhs.egptr() - rhs.eback())); - else - this->setg( - reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + (rhs.gptr() - rhs.eback()), - reinterpret_cast(_M_extbuf) + - (rhs.egptr() - rhs.eback())); - } - rhs._M_extbuf = nullptr; - rhs._M_extbufnext = nullptr; - rhs._M_extbufend = nullptr; - rhs._M_ebs = 0; - rhs._M_intbuf = nullptr; - rhs._M_ibs = 0; - rhs._M_file = nullptr; - rhs._M_st = state_type(); - rhs._M_st_last = state_type(); - rhs._M_om = std::ios_base::openmode(0); - rhs._M_cm = std::ios_base::openmode(0); - rhs._M_owns_eb = false; - rhs._M_owns_ib = false; - rhs.setg(0, 0, 0); - rhs.setp(0, 0); -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline basic_filebuf& basic_filebuf::operator=( - basic_filebuf&& rhs) { - close(); - swap(rhs); - return *this; -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf::~basic_filebuf() { - // try - // { - // close(); - // } - // catch (...) 
- // { - // } - if (_M_owns_eb) delete[] _M_extbuf; - if (_M_owns_ib) delete[] _M_intbuf; -} - -/////////////////////////////////////////////////////////////////////////////// -template -void basic_filebuf::swap(basic_filebuf& rhs) { - std::basic_streambuf::swap(rhs); - if (_M_extbuf != _M_extbuf_min && rhs._M_extbuf != rhs._M_extbuf_min) { - std::swap(_M_extbuf, rhs._M_extbuf); - std::swap(_M_extbufnext, rhs._M_extbufnext); - std::swap(_M_extbufend, rhs._M_extbufend); - } else { - ptrdiff_t ln = _M_extbufnext - _M_extbuf; - ptrdiff_t le = _M_extbufend - _M_extbuf; - ptrdiff_t rn = rhs._M_extbufnext - rhs._M_extbuf; - ptrdiff_t re = rhs._M_extbufend - rhs._M_extbuf; - if (_M_extbuf == _M_extbuf_min && rhs._M_extbuf != rhs._M_extbuf_min) { - _M_extbuf = rhs._M_extbuf; - rhs._M_extbuf = rhs._M_extbuf_min; - } else if (_M_extbuf != _M_extbuf_min && - rhs._M_extbuf == rhs._M_extbuf_min) { - rhs._M_extbuf = _M_extbuf; - _M_extbuf = _M_extbuf_min; - } - _M_extbufnext = _M_extbuf + rn; - _M_extbufend = _M_extbuf + re; - rhs._M_extbufnext = rhs._M_extbuf + ln; - rhs._M_extbufend = rhs._M_extbuf + le; - } - std::swap(_M_ebs, rhs._M_ebs); - std::swap(_M_intbuf, rhs._M_intbuf); - std::swap(_M_ibs, rhs._M_ibs); - std::swap(_M_file, rhs._M_file); - std::swap(_M_cv, rhs._M_cv); - std::swap(_M_st, rhs._M_st); - std::swap(_M_st_last, rhs._M_st_last); - std::swap(_M_om, rhs._M_om); - std::swap(_M_cm, rhs._M_cm); - std::swap(_M_owns_eb, rhs._M_owns_eb); - std::swap(_M_owns_ib, rhs._M_owns_ib); - std::swap(_M_always_noconv, rhs._M_always_noconv); - if (this->eback() == reinterpret_cast(rhs._M_extbuf_min)) { - ptrdiff_t n = this->gptr() - this->eback(); - ptrdiff_t e = this->egptr() - this->eback(); - this->setg(reinterpret_cast(_M_extbuf_min), - reinterpret_cast(_M_extbuf_min) + n, - reinterpret_cast(_M_extbuf_min) + e); - } else if (this->pbase() == reinterpret_cast(rhs._M_extbuf_min)) { - ptrdiff_t n = this->pptr() - this->pbase(); - ptrdiff_t e = this->epptr() - this->pbase(); - this->setp(reinterpret_cast(_M_extbuf_min), - reinterpret_cast(_M_extbuf_min) + e); - this->pbump(n); - } - if (rhs.eback() == reinterpret_cast(_M_extbuf_min)) { - ptrdiff_t n = rhs.gptr() - rhs.eback(); - ptrdiff_t e = rhs.egptr() - rhs.eback(); - rhs.setg(reinterpret_cast(rhs._M_extbuf_min), - reinterpret_cast(rhs._M_extbuf_min) + n, - reinterpret_cast(rhs._M_extbuf_min) + e); - } else if (rhs.pbase() == reinterpret_cast(_M_extbuf_min)) { - ptrdiff_t n = rhs.pptr() - rhs.pbase(); - ptrdiff_t e = rhs.epptr() - rhs.pbase(); - rhs.setp(reinterpret_cast(rhs._M_extbuf_min), - reinterpret_cast(rhs._M_extbuf_min) + e); - rhs.pbump(n); - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline void swap(basic_filebuf& x, - basic_filebuf& y) { - x.swap(y); -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline bool basic_filebuf::is_open() const { - return _M_file != nullptr; -} - -/////////////////////////////////////////////////////////////////////////////// -template -const char* basic_filebuf::_M_get_mode( - std::ios_base::openmode mode) { - switch ((mode & ~std::ios_base::ate) | 0) { - case std::ios_base::out: - case std::ios_base::out | std::ios_base::trunc: - return "w"; - case std::ios_base::out | std::ios_base::app: - case std::ios_base::app: - return "a"; - break; - case std::ios_base::in: - return "r"; - case std::ios_base::in | std::ios_base::out: - return "r+"; - case std::ios_base::in | std::ios_base::out | 
std::ios_base::trunc: - return "w+"; - case std::ios_base::in | std::ios_base::out | std::ios_base::app: - case std::ios_base::in | std::ios_base::app: - return "a+"; - case std::ios_base::out | std::ios_base::binary: - case std::ios_base::out | std::ios_base::trunc | std::ios_base::binary: - return "wb"; - case std::ios_base::out | std::ios_base::app | std::ios_base::binary: - case std::ios_base::app | std::ios_base::binary: - return "ab"; - case std::ios_base::in | std::ios_base::binary: - return "rb"; - case std::ios_base::in | std::ios_base::out | std::ios_base::binary: - return "r+b"; - case std::ios_base::in | std::ios_base::out | std::ios_base::trunc | - std::ios_base::binary: - return "w+b"; - case std::ios_base::in | std::ios_base::out | std::ios_base::app | - std::ios_base::binary: - case std::ios_base::in | std::ios_base::app | std::ios_base::binary: - return "a+b"; - default: - return nullptr; - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::open( - const char* s, std::ios_base::openmode mode) { - basic_filebuf* rt = nullptr; - if (_M_file == nullptr) { - const char* md = _M_get_mode(mode); - if (md) { - _M_file = fopen(s, md); - if (_M_file) { - rt = this; - _M_om = mode; - if (mode & std::ios_base::ate) { - if (fseek(_M_file, 0, SEEK_END)) { - fclose(_M_file); - _M_file = nullptr; - rt = nullptr; - } - } - } - } - } - return rt; -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline basic_filebuf* basic_filebuf::open( - const std::string& s, std::ios_base::openmode mode) { - return open(s.c_str(), mode); -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::open( - int fd, std::ios_base::openmode mode) { - const char* md = this->_M_get_mode(mode); - if (md) { - this->_M_file = fdopen(fd, md); - this->_M_om = mode; - return this; - } else { - return nullptr; - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::open( - FILE* f, std::ios_base::openmode mode) { - this->_M_file = f; - this->_M_om = mode; - return this; -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::close() { - basic_filebuf* rt = nullptr; - if (_M_file) { - rt = this; - std::unique_ptr h(_M_file, fclose); - if (sync()) rt = nullptr; - if (fclose(h.release()) == 0) - _M_file = nullptr; - else - rt = nullptr; - } - return rt; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::int_type -basic_filebuf::underflow() { - if (_M_file == nullptr) return traits_type::eof(); - bool initial = _M_read_mode(); - char_type buf; - if (this->gptr() == nullptr) this->setg(&buf, &buf + 1, &buf + 1); - const size_t unget_sz = - initial ? 
0 : std::min((this->egptr() - this->eback()) / 2, 4); - int_type c = traits_type::eof(); - if (this->gptr() == this->egptr()) { - memmove(this->eback(), this->egptr() - unget_sz, - unget_sz * sizeof(char_type)); - if (_M_always_noconv) { - size_t nmemb = - static_cast(this->egptr() - this->eback() - unget_sz); - nmemb = fread(this->eback() + unget_sz, 1, nmemb, _M_file); - if (nmemb != 0) { - this->setg(this->eback(), this->eback() + unget_sz, - this->eback() + unget_sz + nmemb); - c = traits_type::to_int_type(*this->gptr()); - } - } else { - memmove(_M_extbuf, _M_extbufnext, _M_extbufend - _M_extbufnext); - _M_extbufnext = _M_extbuf + (_M_extbufend - _M_extbufnext); - _M_extbufend = - _M_extbuf + - (_M_extbuf == _M_extbuf_min ? sizeof(_M_extbuf_min) : _M_ebs); - size_t nmemb = - std::min(static_cast(_M_ibs - unget_sz), - static_cast(_M_extbufend - _M_extbufnext)); - std::codecvt_base::result r; - _M_st_last = _M_st; - size_t nr = - fread(reinterpret_cast(const_cast(_M_extbufnext)), - 1, nmemb, _M_file); - if (nr != 0) { - if (!_M_cv) throw std::bad_cast(); - _M_extbufend = _M_extbufnext + nr; - char_type* inext; - r = _M_cv->in(_M_st, _M_extbuf, _M_extbufend, _M_extbufnext, - this->eback() + unget_sz, this->eback() + _M_ibs, inext); - if (r == std::codecvt_base::noconv) { - this->setg(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf), - const_cast(_M_extbufend)); - c = traits_type::to_int_type(*this->gptr()); - } else if (inext != this->eback() + unget_sz) { - this->setg(this->eback(), this->eback() + unget_sz, inext); - c = traits_type::to_int_type(*this->gptr()); - } - } - } - } else { - c = traits_type::to_int_type(*this->gptr()); - } - if (this->eback() == &buf) this->setg(0, 0, 0); - return c; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::int_type -basic_filebuf::pbackfail(int_type c) { - if (_M_file && this->eback() < this->gptr()) { - if (traits_type::eq_int_type(c, traits_type::eof())) { - this->gbump(-1); - return traits_type::not_eof(c); - } - if ((_M_om & std::ios_base::out) || - traits_type::eq(traits_type::to_char_type(c), this->gptr()[-1])) { - this->gbump(-1); - *this->gptr() = traits_type::to_char_type(c); - return c; - } - } - return traits_type::eof(); -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::int_type -basic_filebuf::overflow(int_type c) { - if (_M_file == nullptr) return traits_type::eof(); - _M_write_mode(); - char_type buf; - char_type* pb_save = this->pbase(); - char_type* epb_save = this->epptr(); - if (!traits_type::eq_int_type(c, traits_type::eof())) { - if (this->pptr() == nullptr) this->setp(&buf, &buf + 1); - *this->pptr() = traits_type::to_char_type(c); - this->pbump(1); - } - if (this->pptr() != this->pbase()) { - if (_M_always_noconv) { - size_t nmemb = static_cast(this->pptr() - this->pbase()); - if (fwrite(this->pbase(), sizeof(char_type), nmemb, _M_file) != nmemb) - return traits_type::eof(); - } else { - char* extbe = _M_extbuf; - std::codecvt_base::result r; - do { - if (!_M_cv) throw std::bad_cast(); - const char_type* e; - r = _M_cv->out(_M_st, this->pbase(), this->pptr(), e, _M_extbuf, - _M_extbuf + _M_ebs, extbe); - if (e == this->pbase()) return traits_type::eof(); - if (r == std::codecvt_base::noconv) { - size_t nmemb = static_cast(this->pptr() - this->pbase()); - if (fwrite(this->pbase(), 1, nmemb, _M_file) != nmemb) - return traits_type::eof(); - } else if (r == std::codecvt_base::ok 
|| - r == std::codecvt_base::partial) { - size_t nmemb = static_cast(extbe - _M_extbuf); - if (fwrite(_M_extbuf, 1, nmemb, _M_file) != nmemb) - return traits_type::eof(); - if (r == std::codecvt_base::partial) { - this->setp(const_cast(e), this->pptr()); - this->pbump(this->epptr() - this->pbase()); - } - } else { - return traits_type::eof(); - } - } while (r == std::codecvt_base::partial); - } - this->setp(pb_save, epb_save); - } - return traits_type::not_eof(c); -} - -/////////////////////////////////////////////////////////////////////////////// -template -std::basic_streambuf* basic_filebuf::setbuf( - char_type* s, std::streamsize n) { - this->setg(0, 0, 0); - this->setp(0, 0); - if (_M_owns_eb) delete[] _M_extbuf; - if (_M_owns_ib) delete[] _M_intbuf; - _M_ebs = n; - if (_M_ebs > sizeof(_M_extbuf_min)) { - if (_M_always_noconv && s) { - _M_extbuf = reinterpret_cast(s); - _M_owns_eb = false; - } else { - _M_extbuf = new char[_M_ebs]; - _M_owns_eb = true; - } - } else { - _M_extbuf = _M_extbuf_min; - _M_ebs = sizeof(_M_extbuf_min); - _M_owns_eb = false; - } - if (!_M_always_noconv) { - _M_ibs = std::max(n, sizeof(_M_extbuf_min)); - if (s && _M_ibs >= sizeof(_M_extbuf_min)) { - _M_intbuf = s; - _M_owns_ib = false; - } else { - _M_intbuf = new char_type[_M_ibs]; - _M_owns_ib = true; - } - } else { - _M_ibs = 0; - _M_intbuf = 0; - _M_owns_ib = false; - } - return this; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::pos_type -basic_filebuf::seekoff(off_type off, std::ios_base::seekdir way, - std::ios_base::openmode) { - if (!_M_cv) throw std::bad_cast(); - int width = _M_cv->encoding(); - if (_M_file == nullptr || (width <= 0 && off != 0) || sync()) - return pos_type(off_type(-1)); - // width > 0 || off == 0 - int whence; - switch (way) { - case std::ios_base::beg: - whence = SEEK_SET; - break; - case std::ios_base::cur: - whence = SEEK_CUR; - break; - case std::ios_base::end: - whence = SEEK_END; - break; - default: - return pos_type(off_type(-1)); - } -#if _WIN32 - if (fseek(_M_file, width > 0 ? width * off : 0, whence)) - return pos_type(off_type(-1)); - pos_type r = ftell(_M_file); -#else - if (fseeko(_M_file, width > 0 ? 
width * off : 0, whence)) - return pos_type(off_type(-1)); - pos_type r = ftello(_M_file); -#endif - r.state(_M_st); - return r; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::pos_type -basic_filebuf::seekpos(pos_type sp, std::ios_base::openmode) { - if (_M_file == nullptr || sync()) return pos_type(off_type(-1)); -#if _WIN32 - if (fseek(_M_file, sp, SEEK_SET)) return pos_type(off_type(-1)); -#else - if (fseeko(_M_file, sp, SEEK_SET)) return pos_type(off_type(-1)); -#endif - _M_st = sp.state(); - return sp; -} - -/////////////////////////////////////////////////////////////////////////////// -template -int basic_filebuf::sync() { - if (_M_file == nullptr) return 0; - if (!_M_cv) throw std::bad_cast(); - if (_M_cm & std::ios_base::out) { - if (this->pptr() != this->pbase()) - if (overflow() == traits_type::eof()) return -1; - std::codecvt_base::result r; - do { - char* extbe; - r = _M_cv->unshift(_M_st, _M_extbuf, _M_extbuf + _M_ebs, extbe); - size_t nmemb = static_cast(extbe - _M_extbuf); - if (fwrite(_M_extbuf, 1, nmemb, _M_file) != nmemb) return -1; - } while (r == std::codecvt_base::partial); - if (r == std::codecvt_base::error) return -1; - if (fflush(_M_file)) return -1; - } else if (_M_cm & std::ios_base::in) { - off_type c; - state_type state = _M_st_last; - bool update_st = false; - if (_M_always_noconv) { - c = this->egptr() - this->gptr(); - } else { - int width = _M_cv->encoding(); - c = _M_extbufend - _M_extbufnext; - if (width > 0) { - c += width * (this->egptr() - this->gptr()); - } else { - if (this->gptr() != this->egptr()) { - const int off = _M_cv->length(state, _M_extbuf, _M_extbufnext, - this->gptr() - this->eback()); - c += _M_extbufnext - _M_extbuf - off; - update_st = true; - } - } - } -#if _WIN32 - if (fseek(_M_file_, -c, SEEK_CUR)) return -1; -#else - if (fseeko(_M_file, -c, SEEK_CUR)) return -1; -#endif - if (update_st) _M_st = state; - _M_extbufnext = _M_extbufend = _M_extbuf; - this->setg(0, 0, 0); - _M_cm = std::ios_base::openmode(0); - } - return 0; -} - -/////////////////////////////////////////////////////////////////////////////// -template -void basic_filebuf::imbue(const std::locale& loc) { - sync(); - _M_cv = &std::use_facet >(loc); - bool old_anc = _M_always_noconv; - _M_always_noconv = _M_cv->always_noconv(); - if (old_anc != _M_always_noconv) { - this->setg(0, 0, 0); - this->setp(0, 0); - // invariant, char_type is char, else we couldn't get here - // need to dump _M_intbuf - if (_M_always_noconv) { - if (_M_owns_eb) delete[] _M_extbuf; - _M_owns_eb = _M_owns_ib; - _M_ebs = _M_ibs; - _M_extbuf = reinterpret_cast(_M_intbuf); - _M_ibs = 0; - _M_intbuf = nullptr; - _M_owns_ib = false; - } else { // need to obtain an _M_intbuf. 
- // If _M_extbuf is user-supplied, use it, else new _M_intbuf - if (!_M_owns_eb && _M_extbuf != _M_extbuf_min) { - _M_ibs = _M_ebs; - _M_intbuf = reinterpret_cast(_M_extbuf); - _M_owns_ib = false; - _M_extbuf = new char[_M_ebs]; - _M_owns_eb = true; - } else { - _M_ibs = _M_ebs; - _M_intbuf = new char_type[_M_ibs]; - _M_owns_ib = true; - } - } - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -bool basic_filebuf::_M_read_mode() { - if (!(_M_cm & std::ios_base::in)) { - this->setp(0, 0); - if (_M_always_noconv) - this->setg(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + _M_ebs, - reinterpret_cast(_M_extbuf) + _M_ebs); - else - this->setg(_M_intbuf, _M_intbuf + _M_ibs, _M_intbuf + _M_ibs); - _M_cm = std::ios_base::in; - return true; - } - return false; -} - -/////////////////////////////////////////////////////////////////////////////// -template -void basic_filebuf::_M_write_mode() { - if (!(_M_cm & std::ios_base::out)) { - this->setg(0, 0, 0); - if (_M_ebs > sizeof(_M_extbuf_min)) { - if (_M_always_noconv) - this->setp(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + (_M_ebs - 1)); - else - this->setp(_M_intbuf, _M_intbuf + (_M_ibs - 1)); - } else { - this->setp(0, 0); - } - _M_cm = std::ios_base::out; - } -} - -/////////////////////////////////////////////////////////////////////////////// -} // namespace kaldi - -/////////////////////////////////////////////////////////////////////////////// -#endif // KALDI_UTIL_BASIC_FILEBUF_H_ - -/////////////////////////////////////////////////////////////////////////////// - -/* - * ============================================================================ - * libc++ License - * ============================================================================ - * - * The libc++ library is dual licensed under both the University of Illinois - * "BSD-Like" license and the MIT license. As a user of this code you may - * choose to use it under either license. As a contributor, you agree to allow - * your code to be used under both. - * - * Full text of the relevant licenses is included below. - * - * ============================================================================ - * - * University of Illinois/NCSA - * Open Source License - * - * Copyright (c) 2009-2014 by the contributors listed in CREDITS.TXT (included - * below) - * - * All rights reserved. - * - * Developed by: - * - * LLVM Team - * - * University of Illinois at Urbana-Champaign - * - * http://llvm.org - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * with the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in the - * documentation and/or other materials provided with the distribution. 
- * - * * Neither the names of the LLVM Team, University of Illinois at - * Urbana-Champaign, nor the names of its contributors may be used to - * endorse or promote products derived from this Software without specific - * prior written permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH - * THE SOFTWARE. - * - * ============================================================================== - * - * Copyright (c) 2009-2014 by the contributors listed in CREDITS.TXT (included - * below) - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - * ============================================================================== - * - * This file is a partial list of people who have contributed to the LLVM/libc++ - * project. If you have contributed a patch or made some other contribution to - * LLVM/libc++, please submit a patch to this file to add yourself, and it will - * be done! - * - * The list is sorted by surname and formatted to allow easy grepping and - * beautification by scripts. The fields are: name (N), email (E), web-address - * (W), PGP key ID and fingerprint (P), description (D), and snail-mail address - * (S). - * - * N: Saleem Abdulrasool - * E: compnerd@compnerd.org - * D: Minor patches and Linux fixes. - * - * N: Dimitry Andric - * E: dimitry@andric.com - * D: Visibility fixes, minor FreeBSD portability patches. - * - * N: Holger Arnold - * E: holgerar@gmail.com - * D: Minor fix. - * - * N: Ruben Van Boxem - * E: vanboxem dot ruben at gmail dot com - * D: Initial Windows patches. - * - * N: David Chisnall - * E: theraven at theravensnest dot org - * D: FreeBSD and Solaris ports, libcxxrt support, some atomics work. - * - * N: Marshall Clow - * E: mclow.lists@gmail.com - * E: marshall@idio.com - * D: C++14 support, patches and bug fixes. - * - * N: Bill Fisher - * E: william.w.fisher@gmail.com - * D: Regex bug fixes. - * - * N: Matthew Dempsky - * E: matthew@dempsky.org - * D: Minor patches and bug fixes. - * - * N: Google Inc. 
- * D: Copyright owner and contributor of the CityHash algorithm - * - * N: Howard Hinnant - * E: hhinnant@apple.com - * D: Architect and primary author of libc++ - * - * N: Hyeon-bin Jeong - * E: tuhertz@gmail.com - * D: Minor patches and bug fixes. - * - * N: Argyrios Kyrtzidis - * E: kyrtzidis@apple.com - * D: Bug fixes. - * - * N: Bruce Mitchener, Jr. - * E: bruce.mitchener@gmail.com - * D: Emscripten-related changes. - * - * N: Michel Morin - * E: mimomorin@gmail.com - * D: Minor patches to is_convertible. - * - * N: Andrew Morrow - * E: andrew.c.morrow@gmail.com - * D: Minor patches and Linux fixes. - * - * N: Arvid Picciani - * E: aep at exys dot org - * D: Minor patches and musl port. - * - * N: Bjorn Reese - * E: breese@users.sourceforge.net - * D: Initial regex prototype - * - * N: Nico Rieck - * E: nico.rieck@gmail.com - * D: Windows fixes - * - * N: Jonathan Sauer - * D: Minor patches, mostly related to constexpr - * - * N: Craig Silverstein - * E: csilvers@google.com - * D: Implemented Cityhash as the string hash function on 64-bit machines - * - * N: Richard Smith - * D: Minor patches. - * - * N: Joerg Sonnenberger - * E: joerg@NetBSD.org - * D: NetBSD port. - * - * N: Stephan Tolksdorf - * E: st@quanttec.com - * D: Minor fix - * - * N: Michael van der Westhuizen - * E: r1mikey at gmail dot com - * - * N: Klaas de Vries - * E: klaas at klaasgaaf dot nl - * D: Minor bug fix. - * - * N: Zhang Xiongpang - * E: zhangxiongpang@gmail.com - * D: Minor patches and bug fixes. - * - * N: Xing Xue - * E: xingxue@ca.ibm.com - * D: AIX port - * - * N: Zhihao Yuan - * E: lichray@gmail.com - * D: Standard compatibility fixes. - * - * N: Jeffrey Yasskin - * E: jyasskin@gmail.com - * E: jyasskin@google.com - * D: Linux fixes. - */ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/const-integer-set-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/const-integer-set-inl.h deleted file mode 100644 index b93846148a3e4595774507f638396ce13393ac0e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/const-integer-set-inl.h +++ /dev/null @@ -1,87 +0,0 @@ -// util/const-integer-set-inl.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_CONST_INTEGER_SET_INL_H_ -#define KALDI_UTIL_CONST_INTEGER_SET_INL_H_ - -// Do not include this file directly. It is included by const-integer-set.h - -namespace kaldi { - -template -void ConstIntegerSet::InitInternal() { - KALDI_ASSERT_IS_INTEGER_TYPE(I); - quick_set_.clear(); // just in case we previously had data. 
- if (slow_set_.size() == 0) { - lowest_member_ = (I)1; - highest_member_ = (I)0; - contiguous_ = false; - quick_ = false; - } else { - lowest_member_ = slow_set_.front(); - highest_member_ = slow_set_.back(); - size_t range = highest_member_ + 1 - lowest_member_; - if (range == slow_set_.size()) { - contiguous_ = true; - quick_ = false; - } else { - contiguous_ = false; - // If it would be more compact to store as bool - if (range < slow_set_.size() * 8 * sizeof(I)) { - // (assuming 1 bit per element)... - quick_set_.resize(range, false); - for (size_t i = 0; i < slow_set_.size(); i++) - quick_set_[slow_set_[i] - lowest_member_] = true; - quick_ = true; - } else { - quick_ = false; - } - } - } -} - -template -int ConstIntegerSet::count(I i) const { - if (i < lowest_member_ || i > highest_member_) { - return 0; - } else { - if (contiguous_) return true; - if (quick_) { - return (quick_set_[i - lowest_member_] ? 1 : 0); - } else { - bool ans = std::binary_search(slow_set_.begin(), slow_set_.end(), i); - return (ans ? 1 : 0); - } - } -} - -template -void ConstIntegerSet::Write(std::ostream &os, bool binary) const { - WriteIntegerVector(os, binary, slow_set_); -} - -template -void ConstIntegerSet::Read(std::istream &is, bool binary) { - ReadIntegerVector(is, binary, &slow_set_); - InitInternal(); -} - -} // end namespace kaldi - -#endif // KALDI_UTIL_CONST_INTEGER_SET_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/const-integer-set.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/const-integer-set.h deleted file mode 100644 index 809a56a7c83804bfaa4badb5e28059734bfcad1e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/const-integer-set.h +++ /dev/null @@ -1,96 +0,0 @@ -// util/const-integer-set.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_CONST_INTEGER_SET_H_ -#define KALDI_UTIL_CONST_INTEGER_SET_H_ -#include -#include -#include -#include -#include -#include "util/stl-utils.h" - -/* ConstIntegerSet is a way to efficiently test whether something is in a - supplied set of integers. It can be initialized from a vector or set, but - never changed after that. It either uses a sorted vector or an array of - bool, depending on the input. It behaves like a const version of an STL set, - with only a subset of the functionality, except all the member functions are - upper-case. - - Note that we could get rid of the member slow_set_, but we'd have to - do more work to implement an iterator type. This would save memory. 
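// Editor's aside: an illustrative usage sketch (not part of the deleted header) for
// the ConstIntegerSet described above. It uses only the documented interface:
// construct once from a std::vector, then query membership with count().
#include <iostream>
#include <vector>
#include "util/const-integer-set.h"

static void ConstIntegerSetDemo() {
  std::vector<int> members = {3, 7, 7, 42};      // duplicates are removed internally
  kaldi::ConstIntegerSet<int> frequent(members); // immutable after construction
  std::cout << frequent.count(7) << " " << frequent.count(8) << "\n";  // prints "1 0"
  for (kaldi::ConstIntegerSet<int>::iterator it = frequent.begin();
       it != frequent.end(); ++it)
    std::cout << *it << " ";                     // iterates the sorted, unique members
  std::cout << "\n";
}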
-*/ - -namespace kaldi { - -template -class ConstIntegerSet { - public: - ConstIntegerSet() : lowest_member_(1), highest_member_(0) {} - - void Init(const std::vector &input) { - slow_set_ = input; - SortAndUniq(&slow_set_); - InitInternal(); - } - - void Init(const std::set &input) { - CopySetToVector(input, &slow_set_); - InitInternal(); - } - - explicit ConstIntegerSet(const std::vector &input) : slow_set_(input) { - SortAndUniq(&slow_set_); - InitInternal(); - } - explicit ConstIntegerSet(const std::set &input) { - CopySetToVector(input, &slow_set_); - InitInternal(); - } - explicit ConstIntegerSet(const ConstIntegerSet &other) - : slow_set_(other.slow_set_) { - InitInternal(); - } - - int count(I i) const; // returns 1 or 0. - - typedef typename std::vector::const_iterator iterator; - iterator begin() const { return slow_set_.begin(); } - iterator end() const { return slow_set_.end(); } - size_t size() const { return slow_set_.size(); } - bool empty() const { return slow_set_.empty(); } - - void Write(std::ostream &os, bool binary) const; - void Read(std::istream &is, bool binary); - - private: - I lowest_member_; - I highest_member_; - bool contiguous_; - bool quick_; - std::vector quick_set_; - std::vector slow_set_; - void InitInternal(); -}; - -} // end namespace kaldi - -#include "util/const-integer-set-inl.h" - -#endif // KALDI_UTIL_CONST_INTEGER_SET_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/hash-list-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/hash-list-inl.h deleted file mode 100644 index 063fa7131ec618f0aae9dc30f4edd26c9dcce7fe..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/hash-list-inl.h +++ /dev/null @@ -1,193 +0,0 @@ -// util/hash-list-inl.h - -// Copyright 2009-2011 Microsoft Corporation -// 2013 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_HASH_LIST_INL_H_ -#define KALDI_UTIL_HASH_LIST_INL_H_ - -// Do not include this file directly. It is included by fast-hash.h - -namespace kaldi { - -template -HashList::HashList() { - list_head_ = NULL; - bucket_list_tail_ = static_cast(-1); // invalid. - hash_size_ = 0; - freed_head_ = NULL; -} - -template -void HashList::SetSize(size_t size) { - hash_size_ = size; - KALDI_ASSERT(list_head_ == NULL && - bucket_list_tail_ == - static_cast(-1)); // make sure empty. - if (size > buckets_.size()) buckets_.resize(size, HashBucket(0, NULL)); -} - -template -typename HashList::Elem *HashList::Clear() { - // Clears the hashtable and gives ownership of the currently contained list - // to the user. 
- for (size_t cur_bucket = bucket_list_tail_; - cur_bucket != static_cast(-1); - cur_bucket = buckets_[cur_bucket].prev_bucket) { - buckets_[cur_bucket].last_elem = NULL; // this is how we indicate "empty". - } - bucket_list_tail_ = static_cast(-1); - Elem *ans = list_head_; - list_head_ = NULL; - return ans; -} - -template -const typename HashList::Elem *HashList::GetList() const { - return list_head_; -} - -template -inline void HashList::Delete(Elem *e) { - e->tail = freed_head_; - freed_head_ = e; -} - -template -inline typename HashList::Elem *HashList::Find(I key) { - size_t index = (static_cast(key) % hash_size_); - HashBucket &bucket = buckets_[index]; - if (bucket.last_elem == NULL) { - return NULL; // empty bucket. - } else { - Elem *head = (bucket.prev_bucket == static_cast(-1) - ? list_head_ - : buckets_[bucket.prev_bucket].last_elem->tail), - *tail = bucket.last_elem->tail; - for (Elem *e = head; e != tail; e = e->tail) - if (e->key == key) return e; - return NULL; // Not found. - } -} - -template -inline typename HashList::Elem *HashList::New() { - if (freed_head_) { - Elem *ans = freed_head_; - freed_head_ = freed_head_->tail; - return ans; - } else { - Elem *tmp = new Elem[allocate_block_size_]; - for (size_t i = 0; i + 1 < allocate_block_size_; i++) - tmp[i].tail = tmp + i + 1; - tmp[allocate_block_size_ - 1].tail = NULL; - freed_head_ = tmp; - allocated_.push_back(tmp); - return this->New(); - } -} - -template -HashList::~HashList() { - // First test whether we had any memory leak within the - // HashList, i.e. things for which the user did not call Delete(). - size_t num_in_list = 0, num_allocated = 0; - for (Elem *e = freed_head_; e != NULL; e = e->tail) num_in_list++; - for (size_t i = 0; i < allocated_.size(); i++) { - num_allocated += allocate_block_size_; - delete[] allocated_[i]; - } - if (num_in_list != num_allocated) { - KALDI_WARN << "Possible memory leak: " << num_in_list - << " != " << num_allocated - << ": you might have forgotten to call Delete on " - << "some Elems"; - } -} - -template -inline typename HashList::Elem *HashList::Insert(I key, T val) { - size_t index = (static_cast(key) % hash_size_); - HashBucket &bucket = buckets_[index]; - // Check the element is existing or not. - if (bucket.last_elem != NULL) { - Elem *head = (bucket.prev_bucket == static_cast(-1) - ? list_head_ - : buckets_[bucket.prev_bucket].last_elem->tail), - *tail = bucket.last_elem->tail; - for (Elem *e = head; e != tail; e = e->tail) - if (e->key == key) return e; - } - - // This is a new element. Insert it. - Elem *elem = New(); - elem->key = key; - elem->val = val; - if (bucket.last_elem == NULL) { // Unoccupied bucket. Insert at - // head of bucket list (which is tail of regular list, they go in - // opposite directions). - if (bucket_list_tail_ == static_cast(-1)) { - // list was empty so this is the first elem. - KALDI_ASSERT(list_head_ == NULL); - list_head_ = elem; - } else { - // link in to the chain of Elems - buckets_[bucket_list_tail_].last_elem->tail = elem; - } - elem->tail = NULL; - bucket.last_elem = elem; - bucket.prev_bucket = bucket_list_tail_; - bucket_list_tail_ = index; - } else { - // Already-occupied bucket. Insert at tail of list of elements within - // the bucket. 
- elem->tail = bucket.last_elem->tail; - bucket.last_elem->tail = elem; - bucket.last_elem = elem; - } - return elem; -} - -template -void HashList::InsertMore(I key, T val) { - size_t index = (static_cast(key) % hash_size_); - HashBucket &bucket = buckets_[index]; - Elem *elem = New(); - elem->key = key; - elem->val = val; - - KALDI_ASSERT(bucket.last_elem != NULL); // assume one element is already here - if (bucket.last_elem->key == key) { // standard behavior: add as last element - elem->tail = bucket.last_elem->tail; - bucket.last_elem->tail = elem; - bucket.last_elem = elem; - return; - } - Elem *e = (bucket.prev_bucket == static_cast(-1) - ? list_head_ - : buckets_[bucket.prev_bucket].last_elem->tail); - // find place to insert in linked list - while (e != bucket.last_elem->tail && e->key != key) e = e->tail; - KALDI_ASSERT(e->key == key); // not found? - should not happen - elem->tail = e->tail; - e->tail = elem; -} - -} // end namespace kaldi - -#endif // KALDI_UTIL_HASH_LIST_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/hash-list.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/hash-list.h deleted file mode 100644 index 31cc9bdc4870773475f8c5139539e320746bf5fe..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/hash-list.h +++ /dev/null @@ -1,146 +0,0 @@ -// util/hash-list.h - -// Copyright 2009-2011 Microsoft Corporation -// 2013 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_HASH_LIST_H_ -#define KALDI_UTIL_HASH_LIST_H_ - -#include -#include -#include -#include -#include - -#include "base/kaldi-error.h" - -/* This header provides utilities for a structure that's used in a decoder (but - is quite generic in nature so we implement and test it separately). - Basically it's a singly-linked list, but implemented in such a way that we - can quickly search for elements in the list. We give it a slightly richer - interface than just a hash and a list. The idea is that we want to separate - the hash part and the list part: basically, in the decoder, we want to have a - single hash for the current frame and the next frame, because by the time we - need to access the hash for the next frame we no longer need the hash for the - previous frame. So we have an operation that clears the hash but leaves the - list structure intact. We also control memory management inside this object, - to avoid repeated new's/deletes. - - See hash-list-test.cc for an example of how to use this object. -*/ - -namespace kaldi { - -template -class HashList { - public: - struct Elem { - I key; - T val; - Elem *tail; - }; - - /// Constructor takes no arguments. - /// Call SetSize to inform it of the likely size. 
- HashList(); - - /// Clears the hash and gives the head of the current list to the user; - /// ownership is transferred to the user (the user must call Delete() - /// for each element in the list, at his/her leisure). - Elem *Clear(); - - /// Gives the head of the current list to the user. Ownership retained in the - /// class. Caution: in December 2013 the return type was changed to const - /// Elem* and this function was made const. You may need to change some types - /// of local Elem* variables to const if this produces compilation errors. - const Elem *GetList() const; - - /// Think of this like delete(). It is to be called for each Elem in turn - /// after you "obtained ownership" by doing Clear(). This is not the opposite - /// of. Insert, it is the opposite of New. It's really a memory operation. - inline void Delete(Elem *e); - - /// This should probably not be needed to be called directly by the user. - /// Think of it as opposite - /// to Delete(); - inline Elem *New(); - - /// Find tries to find this element in the current list using the hashtable. - /// It returns NULL if not present. The Elem it returns is not owned by the - /// user, it is part of the internal list owned by this object, but the user - /// is free to modify the "val" element. - inline Elem *Find(I key); - - /// Insert inserts a new element into the hashtable/stored list. - /// Because element keys in a hashtable are unique, this operation checks - /// whether each inserted element has a key equivalent to the one of an - /// element already in the hashtable. If so, the element is not inserted, - /// returning an pointer to this existing element. - inline Elem *Insert(I key, T val); - - /// Insert inserts another element with same key into the hashtable/ - /// stored list. - /// By calling this, the user asserts that one element with that key is - /// already present. - /// We insert it that way, that all elements with the same key - /// follow each other. - /// Find() will return the first one of the elements with the same key. - inline void InsertMore(I key, T val); - - /// SetSize tells the object how many hash buckets to allocate (should - /// typically be at least twice the number of objects we expect to go in the - /// structure, for fastest performance). It must be called while the hash - /// is empty (e.g. after Clear() or after initializing the object, but before - /// adding anything to the hash. - void SetSize(size_t sz); - - /// Returns current number of hash buckets. - inline size_t Size() { return hash_size_; } - - ~HashList(); - - private: - struct HashBucket { - size_t prev_bucket; // index to next bucket (-1 if list tail). Note: - // list of buckets goes in opposite direction to list of Elems. - Elem *last_elem; // pointer to last element in this bucket (NULL if empty) - inline HashBucket(size_t i, Elem *e) : prev_bucket(i), last_elem(e) {} - }; - - Elem *list_head_; // head of currently stored list. - size_t bucket_list_tail_; // tail of list of active hash buckets. - - size_t hash_size_; // number of hash buckets. - - std::vector buckets_; - - Elem *freed_head_; // head of list of currently freed elements. [ready for - // allocation] - - std::vector allocated_; // list of allocated blocks. - - static const size_t allocate_block_size_ = 1024; // Number of Elements to - // allocate in one block. Must be largish so storing allocated_ doesn't - // become a problem. 
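// Editor's aside: an illustrative usage sketch (not part of the deleted header)
// showing the intended lifecycle of HashList as documented above: SetSize() while
// empty, Insert()/Find() during a frame, then Clear() and Delete() every element
// of the returned list before the object is reused.
#include "util/hash-list.h"

static void HashListDemo() {
  kaldi::HashList<int, double> hash;
  hash.SetSize(128);                        // must be called while the hash is empty

  hash.Insert(5, 1.0);                      // new key: an element is created
  kaldi::HashList<int, double>::Elem *e = hash.Find(5);
  if (e != NULL) e->val += 0.5;             // "val" may be modified in place

  // Insert() does not overwrite: for an existing key it returns the old element,
  // so again->val is still 1.5 at this point.
  kaldi::HashList<int, double>::Elem *again = hash.Insert(5, 99.0);
  (void)again;

  // Take ownership of the stored list and release each element in turn.
  for (kaldi::HashList<int, double>::Elem *p = hash.Clear(); p != NULL; ) {
    kaldi::HashList<int, double>::Elem *next = p->tail;
    hash.Delete(p);
    p = next;
  }
}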
-}; - -} // end namespace kaldi - -#include "util/hash-list-inl.h" - -#endif // KALDI_UTIL_HASH_LIST_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/kaldi-io-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/kaldi-io-inl.h deleted file mode 100644 index 8b0c92131c4af2113eb33da6f3cfa9dc4dee83e1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/kaldi-io-inl.h +++ /dev/null @@ -1,40 +0,0 @@ -// util/kaldi-io-inl.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. -#ifndef KALDI_UTIL_KALDI_IO_INL_H_ -#define KALDI_UTIL_KALDI_IO_INL_H_ - -#include - -namespace kaldi { - -bool Input::Open(const std::string &rxfilename, bool *binary) { - return OpenInternal(rxfilename, true, binary); -} - -bool Input::OpenTextMode(const std::string &rxfilename) { - return OpenInternal(rxfilename, false, NULL); -} - -bool Input::IsOpen() { return impl_ != NULL; } - -bool Output::IsOpen() { return impl_ != NULL; } - -} // end namespace kaldi. - -#endif // KALDI_UTIL_KALDI_IO_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/kaldi-io.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/kaldi-io.cc deleted file mode 100644 index 5f8ec4870138df32f6aca9c12383cf3885411741..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/kaldi-io.cc +++ /dev/null @@ -1,898 +0,0 @@ -// util/kaldi-io.cc - -// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
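// Editor's aside: an illustrative sketch (not part of the deleted sources) of the
// Input/Output pattern that kaldi-io-inl.h above and kaldi-io.cc below support.
// The file name "foo.txt" is a hypothetical placeholder; extended filenames such
// as "-", "gunzip -c foo.gz|" or "|gzip -c > bar.gz" are dispatched by the
// Classify*xfilename() routines defined below.
#include <string>
#include "util/kaldi-io.h"

static void KaldiIoDemo() {
  // Write: a plain path is classified as kFileOutput; "|cmd" would be a pipe.
  kaldi::Output ko("foo.txt", /*binary=*/false);
  ko.Stream() << "hello\n";
  ko.Close();

  // Read: "cmd |" is classified as kPipeInput, "-" as standard input, and
  // "file.ark:12345" as an offset into a file.
  bool binary_in;
  kaldi::Input ki("foo.txt", &binary_in);
  std::string word;
  ki.Stream() >> word;
  ki.Close();
}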
-#include "util/kaldi-io.h" - -#include -#include -#include - -#include - -#include "base/io-funcs.h" -#include "base/kaldi-math.h" -#include "util/kaldi-pipebuf.h" -#include "util/parse-options.h" -#include "util/text-utils.h" - -#ifdef KALDI_CYGWIN_COMPAT -#include "util/kaldi-cygwin-io-inl.h" -#define MapOsPath(x) MapCygwinPath(x) -#else // KALDI_CYGWIN_COMPAT -#define MapOsPath(x) x -#endif // KALDI_CYGWIN_COMPAT - -#if defined(_MSC_VER) -static FILE *popen(const char *command, const char *mode) { -#ifdef KALDI_CYGWIN_COMPAT - return kaldi::CygwinCompatPopen(command, mode); -#else // KALDI_CYGWIN_COMPAT - return _popen(command, mode); -#endif // KALDI_CYGWIN_COMPAT -} -#endif // _MSC_VER - -namespace kaldi { - -#ifndef _MSC_VER // on VS, we don't need this type. -// could replace basic_pipebuf with stdio_filebuf on some platforms. -// Would mean we could use less of our own code. -typedef basic_pipebuf PipebufType; -#endif -} // namespace kaldi - -namespace kaldi { - -std::string PrintableRxfilename(const std::string &rxfilename) { - if (rxfilename == "" || rxfilename == "-") { - return "standard input"; - } else { - // If this call to Escape later causes compilation issues, - // just replace it with "return rxfilename"; it's only a - // pretty-printing issue. - return ParseOptions::Escape(rxfilename); - } -} - -std::string PrintableWxfilename(const std::string &wxfilename) { - if (wxfilename == "" || wxfilename == "-") { - return "standard output"; - } else { - // If this call to Escape later causes compilation issues, - // just replace it with "return wxfilename"; it's only a - // pretty-printing issue. - return ParseOptions::Escape(wxfilename); - } -} - -OutputType ClassifyWxfilename(const std::string &filename) { - const char *c = filename.c_str(); - size_t length = filename.length(); - char first_char = c[0], - last_char = (length == 0 ? '\0' : c[filename.length() - 1]); - - // if 'filename' is "" or "-", return kStandardOutput. - if (length == 0 || (length == 1 && first_char == '-')) { - return kStandardOutput; - } else if (first_char == '|') { - return kPipeOutput; // An output pipe like "|blah". - } else if (isspace(first_char) || isspace(last_char) || last_char == '|') { - return kNoOutput; // Leading or trailing space: can't interpret this. - // Final '|' would represent an input pipe, not an - // output pipe. - // } else if ((first_char == 'a' || first_char == 's') && - // strchr(c, ':') != NULL && - // (ClassifyWspecifier(filename, NULL, NULL, NULL) != - // kNoWspecifier || - // ClassifyRspecifier(filename, NULL, NULL) != kNoRspecifier)) { - // // e.g. ark:something or scp:something... this is almost certainly a - // // scripting error, so call it an error rather than treating it as a - // file. - // // In practice in modern kaldi scripts all (r,w)filenames begin with - // "ark" - // // or "scp", even though technically speaking options like "b", "t", - // "s" or - // // "cs" can appear before the ark or scp, like "b,ark". For - // efficiency, - // // and because this code is really just a nicety to catch errors - // earlier - // // than they would otherwise be caught, we only call those extra - // functions - // // for filenames beginning with 'a' or 's'. - // return kNoOutput; - } else if (isdigit(last_char)) { - // This could be a file, but we have to see if it's an offset into a file - // (like foo.ark:4314328), which is not allowed for writing (but is - // allowed for reaching). 
This eliminates some things which would be - // valid UNIX filenames but are not allowed by Kaldi. (Even if we allowed - // such filenames for writing, we woudln't be able to correctly read them). - const char *d = c + length - 1; - while (isdigit(*d) && d > c) d--; - if (*d == ':') return kNoOutput; - // else it could still be a filename; continue to the next check. - } - - // At this point it matched no other pattern so we assume a filename, but we - // check for internal '|' as it's a common source of errors to have pipe - // commands without the pipe in the right place. Say that it can't be - // classified. - if (strchr(c, '|') != NULL) { - KALDI_WARN << "Trying to classify wxfilename with pipe symbol in the" - " wrong place (pipe without | at the beginning?): " - << filename; - return kNoOutput; - } - return kFileOutput; // It matched no other pattern: assume it's a filename. -} - -InputType ClassifyRxfilename(const std::string &filename) { - const char *c = filename.c_str(); - size_t length = filename.length(); - char first_char = c[0], - last_char = (length == 0 ? '\0' : c[filename.length() - 1]); - - // if 'filename' is "" or "-", return kStandardInput. - if (length == 0 || (length == 1 && first_char == '-')) { - return kStandardInput; - } else if (first_char == '|') { - return kNoInput; // An output pipe like "|blah": not - // valid for input. - } else if (last_char == '|') { - return kPipeInput; - } else if (isspace(first_char) || isspace(last_char)) { - return kNoInput; // We don't allow leading or trailing space in a filename. - // } else if ((first_char == 'a' || first_char == 's') && - // strchr(c, ':') != NULL && - // (ClassifyWspecifier(filename, NULL, NULL, NULL) != - // kNoWspecifier || - // ClassifyRspecifier(filename, NULL, NULL) != kNoRspecifier)) { - // // e.g. ark:something or scp:something... this is almost certainly a - // // scripting error, so call it an error rather than treating it as a - // file. - // // In practice in modern kaldi scripts all (r,w)filenames begin with - // "ark" - // // or "scp", even though technically speaking options like "b", "t", - // "s" or - // // "cs" can appear before the ark or scp, like "b,ark". For - // efficiency, - // // and because this code is really just a nicety to catch errors - // earlier - // // than they would otherwise be caught, we only call those extra - // functions - // // for filenames beginning with 'a' or 's'. - // return kNoInput; - } else if (isdigit(last_char)) { - const char *d = c + length - 1; - while (isdigit(*d) && d > c) d--; - if (*d == ':') - return kOffsetFileInput; // Filename is like - // some_file:12345 - // otherwise it could still be a filename; continue to the next check. - } - - // At this point it matched no other pattern so we assume a filename, but - // we check for '|' as it's a common source of errors to have pipe - // commands without the pipe in the right place. Say that it can't be - // classified in this case. - if (strchr(c, '|') != NULL) { - KALDI_WARN << "Trying to classify rxfilename with pipe symbol in the" - " wrong place (pipe without | at the end?): " - << filename; - return kNoInput; - } - return kFileInput; // It matched no other pattern: assume it's a filename. -} - -class OutputImplBase { - public: - // Open will open it as a file (no header), and return true - // on success. It cannot be called on an already open stream. 
- virtual bool Open(const std::string &filename, bool binary) = 0; - virtual std::ostream &Stream() = 0; - virtual bool Close() = 0; - virtual ~OutputImplBase() {} -}; - -class FileOutputImpl : public OutputImplBase { - public: - virtual bool Open(const std::string &filename, bool binary) { - if (os_.is_open()) - KALDI_ERR << "FileOutputImpl::Open(), " - << "open called on already open file."; - filename_ = filename; - os_.open(MapOsPath(filename_).c_str(), - binary ? std::ios_base::out | std::ios_base::binary - : std::ios_base::out); - return os_.is_open(); - } - - virtual std::ostream &Stream() { - if (!os_.is_open()) - KALDI_ERR << "FileOutputImpl::Stream(), file is not open."; - // I believe this error can only arise from coding error. - return os_; - } - - virtual bool Close() { - if (!os_.is_open()) - KALDI_ERR << "FileOutputImpl::Close(), file is not open."; - // I believe this error can only arise from coding error. - os_.close(); - return !(os_.fail()); - } - virtual ~FileOutputImpl() { - if (os_.is_open()) { - os_.close(); - if (os_.fail()) KALDI_ERR << "Error closing output file " << filename_; - } - } - - private: - std::string filename_; - std::ofstream os_; -}; - -class StandardOutputImpl : public OutputImplBase { - public: - StandardOutputImpl() : is_open_(false) {} - - virtual bool Open(const std::string &filename, bool binary) { - if (is_open_) - KALDI_ERR << "StandardOutputImpl::Open(), " - "open called on already open file."; -#ifdef _MSC_VER - _setmode(_fileno(stdout), binary ? _O_BINARY : _O_TEXT); -#endif - is_open_ = std::cout.good(); - return is_open_; - } - - virtual std::ostream &Stream() { - if (!is_open_) - KALDI_ERR << "StandardOutputImpl::Stream(), object not initialized."; - // I believe this error can only arise from coding error. - return std::cout; - } - - virtual bool Close() { - if (!is_open_) - KALDI_ERR << "StandardOutputImpl::Close(), file is not open."; - is_open_ = false; - std::cout << std::flush; - return !(std::cout.fail()); - } - virtual ~StandardOutputImpl() { - if (is_open_) { - std::cout << std::flush; - if (std::cout.fail()) KALDI_ERR << "Error writing to standard output"; - } - } - - private: - bool is_open_; -}; - -class PipeOutputImpl : public OutputImplBase { - public: - PipeOutputImpl() : f_(NULL), os_(NULL) {} - - virtual bool Open(const std::string &wxfilename, bool binary) { - filename_ = wxfilename; - KALDI_ASSERT(f_ == NULL); // Make sure closed. - KALDI_ASSERT(wxfilename.length() != 0 && wxfilename[0] == '|'); // should - // start with '|' - std::string cmd_name(wxfilename, 1); -#if defined(_MSC_VER) || defined(__CYGWIN__) - f_ = popen(cmd_name.c_str(), (binary ? "wb" : "w")); -#else - f_ = popen(cmd_name.c_str(), "w"); -#endif - if (!f_) { // Failure. - KALDI_WARN << "Failed opening pipe for writing, command is: " << cmd_name - << ", errno is " << strerror(errno); - return false; - } else { -#ifndef _MSC_VER - fb_ = new PipebufType(f_, // Using this constructor won't make the - // destructor try to close the stream when - // we're done. - (binary ? std::ios_base::out | std::ios_base::binary - : std::ios_base::out)); - KALDI_ASSERT(fb_ != NULL); // or would be alloc error. - os_ = new std::ostream(fb_); -#else - os_ = new std::ofstream(f_); -#endif - return os_->good(); - } - } - - virtual std::ostream &Stream() { - if (os_ == NULL) - KALDI_ERR << "PipeOutputImpl::Stream()," - " object not initialized."; - // I believe this error can only arise from coding error. 
- return *os_; - } - - virtual bool Close() { - if (os_ == NULL) KALDI_ERR << "PipeOutputImpl::Close(), file is not open."; - bool ok = true; - os_->flush(); - if (os_->fail()) ok = false; - delete os_; - os_ = NULL; - int status; -#ifdef _MSC_VER - status = _pclose(f_); -#else - status = pclose(f_); -#endif - if (status) - KALDI_WARN << "Pipe " << filename_ << " had nonzero return status " - << status; - f_ = NULL; -#ifndef _MSC_VER - delete fb_; - fb_ = NULL; -#endif - return ok; - } - virtual ~PipeOutputImpl() { - if (os_) { - if (!Close()) - KALDI_ERR << "Error writing to pipe " << PrintableWxfilename(filename_); - } - } - - private: - std::string filename_; - FILE *f_; -#ifndef _MSC_VER - PipebufType *fb_; -#endif - std::ostream *os_; -}; - -class InputImplBase { - public: - // Open will open it as a file, and return true on success. - // May be called twice only for kOffsetFileInput (otherwise, - // if called twice, we just create a new Input object, to avoid - // having to deal with the extra hassle of reopening with the - // same object. - // Note that we will to call Open with true (binary) for - // for text-mode Kaldi files; the only actual text-mode input - // is for non-Kaldi files. - virtual bool Open(const std::string &filename, bool binary) = 0; - virtual std::istream &Stream() = 0; - virtual int32 Close() = 0; // We only need to check failure in the case of - // kPipeInput. - // on close for input streams. - virtual InputType MyType() = 0; // Because if it's kOffsetFileInput, we may - // call Open twice - // (has efficiency benefits). - - virtual ~InputImplBase() {} -}; - -class FileInputImpl : public InputImplBase { - public: - virtual bool Open(const std::string &filename, bool binary) { - if (is_.is_open()) - KALDI_ERR << "FileInputImpl::Open(), " - << "open called on already open file."; - is_.open( - MapOsPath(filename).c_str(), - binary ? std::ios_base::in | std::ios_base::binary : std::ios_base::in); - return is_.is_open(); - } - - virtual std::istream &Stream() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Stream(), file is not open."; - // I believe this error can only arise from coding error. - return is_; - } - - virtual int32 Close() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Close(), file is not open."; - // I believe this error can only arise from coding error. - is_.close(); - // Don't check status. - return 0; - } - - virtual InputType MyType() { return kFileInput; } - - virtual ~FileInputImpl() { - // Stream will automatically be closed, and we don't care about - // whether it fails. - } - - private: - std::ifstream is_; -}; - -class StandardInputImpl : public InputImplBase { - public: - StandardInputImpl() : is_open_(false) {} - - virtual bool Open(const std::string &filename, bool binary) { - if (is_open_) - KALDI_ERR << "StandardInputImpl::Open(), " - "open called on already open file."; - is_open_ = true; -#ifdef _MSC_VER - _setmode(_fileno(stdin), binary ? _O_BINARY : _O_TEXT); -#endif - return true; // Don't check good() because would be false if - // eof, which may be valid input. - } - - virtual std::istream &Stream() { - if (!is_open_) - KALDI_ERR << "StandardInputImpl::Stream(), object not initialized."; - // I believe this error can only arise from coding error. 
- return std::cin; - } - - virtual InputType MyType() { return kStandardInput; } - - virtual int32 Close() { - if (!is_open_) KALDI_ERR << "StandardInputImpl::Close(), file is not open."; - is_open_ = false; - return 0; - } - virtual ~StandardInputImpl() {} - - private: - bool is_open_; -}; - -class PipeInputImpl : public InputImplBase { - public: - PipeInputImpl() : f_(NULL), is_(NULL) {} - - virtual bool Open(const std::string &rxfilename, bool binary) { - filename_ = rxfilename; - KALDI_ASSERT(f_ == NULL); // Make sure closed. - KALDI_ASSERT(rxfilename.length() != 0 && - rxfilename[rxfilename.length() - 1] == - '|'); // should end with '|' - std::string cmd_name(rxfilename, 0, rxfilename.length() - 1); -#if defined(_MSC_VER) || defined(__CYGWIN__) - f_ = popen(cmd_name.c_str(), (binary ? "rb" : "r")); -#else - f_ = popen(cmd_name.c_str(), "r"); -#endif - - if (!f_) { // Failure. - KALDI_WARN << "Failed opening pipe for reading, command is: " << cmd_name - << ", errno is " << strerror(errno); - return false; - } else { -#ifndef _MSC_VER - fb_ = new PipebufType(f_, // Using this constructor won't lead the - // destructor to close the stream. - (binary ? std::ios_base::in | std::ios_base::binary - : std::ios_base::in)); - KALDI_ASSERT(fb_ != NULL); // or would be alloc error. - is_ = new std::istream(fb_); -#else - is_ = new std::ifstream(f_); -#endif - if (is_->fail() || is_->bad()) return false; - if (is_->eof()) { - KALDI_WARN << "Pipe opened with command " - << PrintableRxfilename(rxfilename) << " is empty."; - // don't return false: empty may be valid. - } - return true; - } - } - - virtual std::istream &Stream() { - if (is_ == NULL) - KALDI_ERR << "PipeInputImpl::Stream(), object not initialized."; - // I believe this error can only arise from coding error. - return *is_; - } - - virtual int32 Close() { - if (is_ == NULL) KALDI_ERR << "PipeInputImpl::Close(), file is not open."; - delete is_; - is_ = NULL; - int32 status; -#ifdef _MSC_VER - status = _pclose(f_); -#else - status = pclose(f_); -#endif - if (status) - KALDI_WARN << "Pipe " << filename_ << " had nonzero return status " - << status; - f_ = NULL; -#ifndef _MSC_VER - delete fb_; - fb_ = NULL; -#endif - return status; - } - virtual ~PipeInputImpl() { - if (is_) Close(); - } - virtual InputType MyType() { return kPipeInput; } - - private: - std::string filename_; - FILE *f_; -#ifndef _MSC_VER - PipebufType *fb_; -#endif - std::istream *is_; -}; - -/* -#else - -// Just have an empty implementation of the pipe input that crashes if -// called. -class PipeInputImpl: public InputImplBase { - public: - PipeInputImpl() { KALDI_ASSERT(0 && "Pipe input not yet supported on this - platform."); } - virtual bool Open(const std::string, bool) { return 0; } - virtual std::istream &Stream() const { return NULL; } - virtual void Close() {} - virtual InputType MyType() { return kPipeInput; } -}; - -#endif -*/ - -class OffsetFileInputImpl : public InputImplBase { - // This class is a bit more complicated than the - - public: - // splits a filename like /my/file:123 into /my/file and the - // number 123. Crashes if not this format. - static void SplitFilename(const std::string &rxfilename, - std::string *filename, size_t *offset) { - size_t pos = rxfilename.find_last_of(':'); - KALDI_ASSERT(pos != std::string::npos); // would indicate error in calling - // code, as the filename is supposed to be of the correct form at this - // point. 
- *filename = std::string(rxfilename, 0, pos); - std::string number(rxfilename, pos + 1); - bool ans = ConvertStringToInteger(number, offset); - if (!ans) - KALDI_ERR << "Cannot get offset from filename " << rxfilename - << " (possibly you compiled in 32-bit and have a >32-bit" - << " byte offset into a file; you'll have to compile 64-bit."; - } - - bool Seek(size_t offset) { - size_t cur_pos = is_.tellg(); - if (cur_pos == offset) { - return true; - } else if (cur_pos < offset && cur_pos + 100 > offset) { - // We're close enough that it may be faster to just - // read that data, rather than seek. - for (size_t i = cur_pos; i < offset; i++) is_.get(); - return (is_.tellg() == std::streampos(offset)); - } - // Try to actually seek. - is_.seekg(offset, std::ios_base::beg); - if (is_.fail()) { // failbit or badbit is set [error happened] - is_.close(); - return false; // failure. - } else { - is_.clear(); // Clear any failure bits (e.g. eof). - return true; // success. - } - } - - // This Open routine is unusual in that it is designed to work even - // if it was already open. This for efficiency when seeking multiple - // times. - virtual bool Open(const std::string &rxfilename, bool binary) { - if (is_.is_open()) { - // We are opening when we have an already-open file. - // We may have to seek within this file, or else close it and - // open a different one. - std::string tmp_filename; - size_t offset; - SplitFilename(rxfilename, &tmp_filename, &offset); - if (tmp_filename == filename_ && binary == binary_) { // Just seek - is_.clear(); // clear fail bit, etc. - return Seek(offset); - } else { - is_.close(); // don't bother checking error status of is_. - filename_ = tmp_filename; - is_.open(MapOsPath(filename_).c_str(), - binary ? std::ios_base::in | std::ios_base::binary - : std::ios_base::in); - if (!is_.is_open()) - return false; - else - return Seek(offset); - } - } else { - size_t offset; - SplitFilename(rxfilename, &filename_, &offset); - binary_ = binary; - is_.open(MapOsPath(filename_).c_str(), - binary ? std::ios_base::in | std::ios_base::binary - : std::ios_base::in); - if (!is_.is_open()) - return false; - else - return Seek(offset); - } - } - - virtual std::istream &Stream() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Stream(), file is not open."; - // I believe this error can only arise from coding error. - return is_; - } - - virtual int32 Close() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Close(), file is not open."; - // I believe this error can only arise from coding error. - is_.close(); - // Don't check status. - return 0; - } - - virtual InputType MyType() { return kOffsetFileInput; } - - virtual ~OffsetFileInputImpl() { - // Stream will automatically be closed, and we don't care about - // whether it fails. - } - - private: - std::string filename_; // the actual filename - bool binary_; // true if was opened in binary mode. - std::ifstream is_; -}; - -Output::Output(const std::string &wxfilename, bool binary, bool write_header) - : impl_(NULL) { - if (!Open(wxfilename, binary, write_header)) { - if (impl_) { - delete impl_; - impl_ = NULL; - } - KALDI_ERR << "Error opening output stream " - << PrintableWxfilename(wxfilename); - } -} - -bool Output::Close() { - if (!impl_) { - return false; // error to call Close if not open. 
- } else { - bool ans = impl_->Close(); - delete impl_; - impl_ = NULL; - return ans; - } -} - -Output::~Output() { - if (impl_) { - bool ok = impl_->Close(); - delete impl_; - impl_ = NULL; - if (!ok) - KALDI_ERR << "Error closing output file " - << PrintableWxfilename(filename_) - << (ClassifyWxfilename(filename_) == kFileOutput - ? " (disk full?)" - : ""); - } -} - -std::ostream &Output::Stream() { // will throw if not open; else returns - // stream. - if (!impl_) KALDI_ERR << "Output::Stream() called but not open."; - return impl_->Stream(); -} - -bool Output::Open(const std::string &wxfn, bool binary, bool header) { - if (IsOpen()) { - if (!Close()) { // Throw here rather than return status, as it's an error - // about something else: if the user wanted to avoid the exception he/she - // could have called Close(). - KALDI_ERR << "Output::Open(), failed to close output stream: " - << PrintableWxfilename(filename_); - } - } - - filename_ = wxfn; - - OutputType type = ClassifyWxfilename(wxfn); - KALDI_ASSERT(impl_ == NULL); - - if (type == kFileOutput) { - impl_ = new FileOutputImpl(); - } else if (type == kStandardOutput) { - impl_ = new StandardOutputImpl(); - } else if (type == kPipeOutput) { - impl_ = new PipeOutputImpl(); - } else { // type == kNoOutput - KALDI_WARN << "Invalid output filename format " - << PrintableWxfilename(wxfn); - return false; - } - if (!impl_->Open(wxfn, binary)) { - delete impl_; - impl_ = NULL; - return false; // failed to open. - } else { // successfully opened it. - if (header) { - InitKaldiOutputStream(impl_->Stream(), binary); - bool ok = impl_->Stream().good(); // still OK? - if (!ok) { - delete impl_; - impl_ = NULL; - return false; - } - return true; - } else { - return true; - } - } -} - -Input::Input(const std::string &rxfilename, bool *binary) : impl_(NULL) { - if (!Open(rxfilename, binary)) { - KALDI_ERR << "Error opening input stream " - << PrintableRxfilename(rxfilename); - } -} - -int32 Input::Close() { - if (impl_) { - int32 ans = impl_->Close(); - delete impl_; - impl_ = NULL; - return ans; - } else { - return 0; - } -} - -bool Input::OpenInternal(const std::string &rxfilename, bool file_binary, - bool *contents_binary) { - InputType type = ClassifyRxfilename(rxfilename); - if (IsOpen()) { - // May have to close the stream first. - if (type == kOffsetFileInput && impl_->MyType() == kOffsetFileInput) { - // We want to use the same object to Open... this is in case - // the files are the same, so we can just seek. - if (!impl_->Open(rxfilename, file_binary)) { // true is binary mode-- - // always open in binary. - delete impl_; - impl_ = NULL; - return false; - } - // read the binary header, if requested. - if (contents_binary != NULL) - return InitKaldiInputStream(impl_->Stream(), contents_binary); - else - return true; - } else { - Close(); - // and fall through to code below which actually opens the file. - } - } - if (type == kFileInput) { - impl_ = new FileInputImpl(); - } else if (type == kStandardInput) { - impl_ = new StandardInputImpl(); - } else if (type == kPipeInput) { - impl_ = new PipeInputImpl(); - } else if (type == kOffsetFileInput) { - impl_ = new OffsetFileInputImpl(); - } else { // type == kNoInput - KALDI_WARN << "Invalid input filename format " - << PrintableRxfilename(rxfilename); - return false; - } - if (!impl_->Open(rxfilename, file_binary)) { // true is binary mode-- - // always read in binary. 
- delete impl_; - impl_ = NULL; - return false; - } - if (contents_binary != NULL) - return InitKaldiInputStream(impl_->Stream(), contents_binary); - else - return true; -} - -Input::~Input() { - if (impl_) Close(); -} - -std::istream &Input::Stream() { - if (!IsOpen()) KALDI_ERR << "Input::Stream(), not open."; - return impl_->Stream(); -} - -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m) { -// if (!filename.empty() && filename[filename.size() - 1] == ']') { -// // This filename seems to have a 'range'... like foo.ark:4312423[20:30]. -// // (the bit in square brackets is the range). -// std::string rxfilename, range; -// if (!ExtractRangeSpecifier(filename, &rxfilename, &range)) { -// KALDI_ERR << "Could not make sense of possible range specifier in -// filename " -// << "while reading matrix: " << filename; -// } -// Matrix temp; -// bool binary_in; -// Input ki(rxfilename, &binary_in); -// temp.Read(ki.Stream(), binary_in); -// if (!ExtractObjectRange(temp, range, m)) { -// KALDI_ERR << "Error extracting range of object: " << filename; -// } -// } else { -// // The normal case, there is no range. -// bool binary_in; -// Input ki(filename, &binary_in); -// m->Read(ki.Stream(), binary_in); -// } -// } -// -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m) { -// if (!filename.empty() && filename[filename.size() - 1] == ']') { -// // This filename seems to have a 'range'... like foo.ark:4312423[20:30]. -// // (the bit in square brackets is the range). -// std::string rxfilename, range; -// if (!ExtractRangeSpecifier(filename, &rxfilename, &range)) { -// KALDI_ERR << "Could not make sense of possible range specifier in -// filename " -// << "while reading matrix: " << filename; -// } -// Matrix temp; -// bool binary_in; -// Input ki(rxfilename, &binary_in); -// temp.Read(ki.Stream(), binary_in); -// if (!ExtractObjectRange(temp, range, m)) { -// KALDI_ERR << "Error extracting range of object: " << filename; -// } -// } else { -// // The normal case, there is no range. -// bool binary_in; -// Input ki(filename, &binary_in); -// m->Read(ki.Stream(), binary_in); -// } -// } - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/kaldi-io.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/kaldi-io.h deleted file mode 100644 index 2175ca8f89ed5f3e3bade26528e924208df692c6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/kaldi-io.h +++ /dev/null @@ -1,266 +0,0 @@ -// util/kaldi-io.h - -// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
-#ifndef KALDI_UTIL_KALDI_IO_H_ -#define KALDI_UTIL_KALDI_IO_H_ - -#ifdef _MSC_VER -#include -#include -#endif -#include // For isspace. -#include -#include -#include "base/kaldi-common.h" -// #include "matrix/kaldi-matrix.h" - -namespace kaldi { - -class OutputImplBase; // Forward decl; defined in a .cc file -class InputImplBase; // Forward decl; defined in a .cc file - -/// \addtogroup io_group -/// @{ - -// The Output and Input classes handle stream-opening for "extended" filenames -// that include actual files, standard-input/standard-output, pipes, and -// offsets into actual files. They also handle reading and writing the -// binary-mode headers for Kaldi files, where applicable. The classes have -// versions of the Open routines that throw and do not throw, depending whether -// the calling code wants to catch the errors or not; there are also versions -// that write (or do not write) the Kaldi binary-mode header that says if it's -// binary mode. Generally files that contain Kaldi objects will have the header -// on, so we know upon reading them whether they have the header. So you would -// use the OpenWithHeader routines for these (or the constructor); but other -// types of objects (e.g. FSTs) would have files without a header so you would -// use OpenNoHeader. - -// We now document the types of extended filenames that we use. -// -// A "wxfilename" is an extended filename for writing. It can take three forms: -// (1) Filename: e.g. "/some/filename", "./a/b/c", "c:\Users\dpovey\My -// Documents\\boo" -// (whatever the actual file-system interprets) -// (2) Standard output: "" or "-" -// (3) A pipe: e.g. "| gzip -c > /tmp/abc.gz" -// -// -// A "rxfilename" is an extended filename for reading. It can take four forms: -// (1) An actual filename, whatever the file-system can read, e.g. "/my/file". -// (2) Standard input: "" or "-" -// (3) A pipe: e.g. "gunzip -c /tmp/abc.gz |" -// (4) An offset into a file, e.g.: "/mnt/blah/data/1.ark:24871" -// [these are created by the Table and TableWriter classes; I may also write -// a program that creates them for arbitrary files] -// - -// Typical usage: -// ... -// bool binary; -// MyObject.Write(Output(some_filename, binary).Stream(), binary); -// -// ... more extensive example: -// { -// Output ko(some_filename, binary); -// MyObject1.Write(ko.Stream(), binary); -// MyObject2.Write(ko.Stream(), binary); -// } - -enum OutputType { kNoOutput, kFileOutput, kStandardOutput, kPipeOutput }; - -/// ClassifyWxfilename interprets filenames as follows: -/// - kNoOutput: invalid filenames (leading or trailing space, things that look -/// like wspecifiers and rspecifiers or like pipes to read from with leading -/// |. -/// - kFileOutput: Normal filenames -/// - kStandardOutput: The empty string or "-", interpreted as standard output -/// - kPipeOutput: pipes, e.g. "| gzip -c > /tmp/abc.gz" -OutputType ClassifyWxfilename(const std::string &wxfilename); - -enum InputType { - kNoInput, - kFileInput, - kStandardInput, - kOffsetFileInput, - kPipeInput -}; - -/// ClassifyRxfilenames interprets filenames for reading as follows: -/// - kNoInput: invalid filenames (leading or trailing space, things that -/// look like wspecifiers and rspecifiers or pipes to write to -/// with trailing |. -/// - kFileInput: normal filenames -/// - kStandardInput: the empty string or "-" -/// - kPipeInput: e.g. "gunzip -c /tmp/abc.gz |" -/// - kOffsetFileInput: offsets into files, e.g. 
/some/filename:12970 -InputType ClassifyRxfilename(const std::string &rxfilename); - -class Output { - public: - // The normal constructor, provided for convenience. - // Equivalent to calling with default constructor then Open() - // with these arguments. - Output(const std::string &filename, bool binary, bool write_header = true); - - Output() : impl_(NULL) {} - - /// This opens the stream, with the given mode (binary or text). It returns - /// true on success and false on failure. However, it will throw if something - /// was already open and could not be closed (to avoid this, call Close() - /// first. if write_header == true and binary == true, it writes the Kaldi - /// binary-mode header ('\0' then 'B'). You may call Open even if it is - /// already open; it will close the existing stream and reopen (however if - /// closing the old stream failed it will throw). - bool Open(const std::string &wxfilename, bool binary, bool write_header); - - inline bool IsOpen(); // return true if we have an open stream. Does not - // imply stream is good for writing. - - std::ostream &Stream(); // will throw if not open; else returns stream. - - // Close closes the stream. Calling Close is never necessary unless you - // want to avoid exceptions being thrown. There are times when calling - // Close will hurt efficiency (basically, when using offsets into files, - // and using the same Input object), - // but most of the time the user won't be doing this directly, it will - // be done in kaldi-table.{h, cc}, so you don't have to worry about it. - bool Close(); - - // This will throw if stream could not be closed (to check error status, - // call Close()). - ~Output(); - - private: - OutputImplBase *impl_; // non-NULL if open. - std::string filename_; - KALDI_DISALLOW_COPY_AND_ASSIGN(Output); -}; - -// bool binary_in; -// Input ki(some_filename, &binary_in); -// MyObject.Read(ki.Stream(), binary_in); -// -// ... more extensive example: -// -// { -// bool binary_in; -// Input ki(some_filename, &binary_in); -// MyObject1.Read(ki.Stream(), &binary_in); -// MyObject2.Write(ki.Stream(), &binary_in); -// } -// Note that to catch errors you need to use try.. catch. -// Input communicates errors by throwing exceptions. - -// Input interprets four kinds of filenames: -// (1) Normal filenames -// (2) The empty string or "-", interpreted as standard output -// (3) A pipe: e.g. "gunzip -c /tmp/abc.gz |" -// (4) Offsets into [real] files, e.g. "/my/filename:12049" -// The last one has no correspondence in Output. - -class Input { - public: - /// The normal constructor. Opens the stream in binary mode. - /// Equivalent to calling the default constructor followed by Open(); then, if - /// binary != NULL, it calls ReadHeader(), putting the output in "binary"; it - /// throws on error. - explicit Input(const std::string &rxfilename, bool *contents_binary = NULL); - - Input() : impl_(NULL) {} - - // Open opens the stream for reading (the mode, where relevant, is binary; use - // OpenTextMode for text-mode, we made this a separate function rather than a - // boolean argument, to avoid confusion with Kaldi's text/binary distinction, - // since reading in the file system's text mode is unusual.) If - // contents_binary != NULL, it reads the binary-mode header and puts it in the - // "binary" variable. Returns true on success. If it returns false it will - // not be open. 
You may call Open even if it is already open; it will close - // the existing stream and reopen (however if closing the old stream failed it - // will throw). - inline bool Open(const std::string &rxfilename, bool *contents_binary = NULL); - - // As Open but (if the file system has text/binary modes) opens in text mode; - // you shouldn't ever have to use this as in Kaldi we read even text files in - // binary mode (and ignore the \r). - inline bool OpenTextMode(const std::string &rxfilename); - - // Return true if currently open for reading and Stream() will - // succeed. Does not guarantee that the stream is good. - inline bool IsOpen(); - - // It is never necessary or helpful to call Close, except if - // you are concerned about to many filehandles being open. - // Close does not throw. It returns the exit code as int32 - // in the case of a pipe [kPipeInput], and always zero otherwise. - int32 Close(); - - // Returns the underlying stream. Throws if !IsOpen() - std::istream &Stream(); - - // Destructor does not throw: input streams may legitimately fail so we - // don't worry about the status when we close them. - ~Input(); - - private: - bool OpenInternal(const std::string &rxfilename, bool file_binary, - bool *contents_binary); - InputImplBase *impl_; - KALDI_DISALLOW_COPY_AND_ASSIGN(Input); -}; - -template -void ReadKaldiObject(const std::string &filename, C *c) { - bool binary_in; - Input ki(filename, &binary_in); - c->Read(ki.Stream(), binary_in); -} - -// Specialize the template for reading matrices, because we want to be able to -// support reading 'ranges' (row and column ranges), like foo.mat[10:20]. -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m); -// -// -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m); - -template -inline void WriteKaldiObject(const C &c, const std::string &filename, - bool binary) { - Output ko(filename, binary); - c.Write(ko.Stream(), binary); -} - -/// PrintableRxfilename turns the rxfilename into a more human-readable -/// form for error reporting, i.e. it does quoting and escaping and -/// replaces "" or "-" with "standard input". -std::string PrintableRxfilename(const std::string &rxfilename); - -/// PrintableWxfilename turns the wxfilename into a more human-readable -/// form for error reporting, i.e. it does quoting and escaping and -/// replaces "" or "-" with "standard output". -std::string PrintableWxfilename(const std::string &wxfilename); - -/// @} - -} // end namespace kaldi. - -#include "util/kaldi-io-inl.h" - -#endif // KALDI_UTIL_KALDI_IO_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/kaldi-pipebuf.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/kaldi-pipebuf.h deleted file mode 100644 index bcee80ccb1a6fa8ce3195483ac144c5ff66d2f89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/kaldi-pipebuf.h +++ /dev/null @@ -1,86 +0,0 @@ -// util/kaldi-pipebuf.h - -// Copyright 2009-2011 Ondrej Glembek - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -/** @file kaldi-pipebuf.h - * This is an Kaldi C++ Library header. - */ - -#ifndef KALDI_UTIL_KALDI_PIPEBUF_H_ -#define KALDI_UTIL_KALDI_PIPEBUF_H_ - -#include -#if !defined(_LIBCPP_VERSION) // libc++ -#include -#else -#include "util/basic-filebuf.h" -#endif - -namespace kaldi { -// This class provides a way to initialize a filebuf with a FILE* pointer -// directly; it will not close the file pointer when it is deleted. -// The C++ standard does not allow implementations of C++ to provide -// this constructor within basic_filebuf, which makes it hard to deal -// with pipes using completely native C++. This is a workaround - -#ifdef _MSC_VER -#elif defined(_LIBCPP_VERSION) // libc++ -template > -class basic_pipebuf : public basic_filebuf { - public: - typedef basic_pipebuf ThisType; - - public: - basic_pipebuf(FILE *fptr, std::ios_base::openmode mode) - : basic_filebuf() { - this->open(fptr, mode); - if (!this->is_open()) { - KALDI_WARN << "Error initializing pipebuf"; // probably indicates - // code error, if the fptr was good. - return; - } - } -}; // class basic_pipebuf -#else -template > -class basic_pipebuf : public std::basic_filebuf { - public: - typedef basic_pipebuf ThisType; - - public: - basic_pipebuf(FILE *fptr, std::ios_base::openmode mode) - : std::basic_filebuf() { - this->_M_file.sys_open(fptr, mode); - if (!this->is_open()) { - KALDI_WARN << "Error initializing pipebuf"; // probably indicates - // code error, if the fptr was good. - return; - } - this->_M_mode = mode; - this->_M_buf_size = BUFSIZ; - this->_M_allocate_internal_buffer(); - this->_M_reading = false; - this->_M_writing = false; - this->_M_set_buffer(-1); - } -}; // class basic_pipebuf -#endif // _MSC_VER - -} // namespace kaldi - -#endif // KALDI_UTIL_KALDI_PIPEBUF_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/parse-options.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/parse-options.cc deleted file mode 100644 index 1f2ef844d28d67ed58d2e0c9d7c7b674e8209df8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/parse-options.cc +++ /dev/null @@ -1,636 +0,0 @@ -// util/parse-options.cc - -// Copyright 2009-2011 Karel Vesely; Microsoft Corporation; -// Saarland University (Author: Arnab Ghoshal); -// Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey); -// Frantisek Skala; Arnab Ghoshal -// Copyright 2013 Tanel Alumae -// -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include - -#include "base/kaldi-common.h" -#include "util/parse-options.h" -#include "util/text-utils.h" - -namespace kaldi { - -ParseOptions::ParseOptions(const std::string &prefix, OptionsItf *other) - : print_args_(false), help_(false), usage_(""), argc_(0), argv_(NULL) { - ParseOptions *po = dynamic_cast(other); - if (po != NULL && po->other_parser_ != NULL) { - // we get here if this constructor is used twice, recursively. - other_parser_ = po->other_parser_; - } else { - other_parser_ = other; - } - if (po != NULL && po->prefix_ != "") { - prefix_ = po->prefix_ + std::string(".") + prefix; - } else { - prefix_ = prefix; - } -} - -void ParseOptions::Register(const std::string &name, bool *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, int32 *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, uint32 *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, float *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, double *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, std::string *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -// old-style, used for registering application-specific parameters -template -void ParseOptions::RegisterTmpl(const std::string &name, T *ptr, - const std::string &doc) { - if (other_parser_ == NULL) { - this->RegisterCommon(name, ptr, doc, false); - } else { - KALDI_ASSERT(prefix_ != "" && - "Cannot use empty prefix when registering with prefix."); - std::string new_name = prefix_ + '.' + name; // name becomes prefix.name - other_parser_->Register(new_name, ptr, doc); - } -} - -// does the common part of the job of registering a parameter -template -void ParseOptions::RegisterCommon(const std::string &name, T *ptr, - const std::string &doc, bool is_standard) { - KALDI_ASSERT(ptr != NULL); - std::string idx = name; - NormalizeArgName(&idx); - if (doc_map_.find(idx) != doc_map_.end()) - KALDI_WARN << "Registering option twice, ignoring second time: " << name; - this->RegisterSpecific(name, idx, ptr, doc, is_standard); -} - -// used to register standard parameters (those that are present in all of the -// applications) -template -void ParseOptions::RegisterStandard(const std::string &name, T *ptr, - const std::string &doc) { - this->RegisterCommon(name, ptr, doc, true); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, bool *b, - const std::string &doc, bool is_standard) { - bool_map_[idx] = b; - doc_map_[idx] = - DocInfo(name, doc + " (bool, default = " + ((*b) ? 
"true)" : "false)"), - is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, int32 *i, - const std::string &doc, bool is_standard) { - int_map_[idx] = i; - std::ostringstream ss; - ss << doc << " (int, default = " << *i << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, uint32 *u, - const std::string &doc, bool is_standard) { - uint_map_[idx] = u; - std::ostringstream ss; - ss << doc << " (uint, default = " << *u << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, float *f, - const std::string &doc, bool is_standard) { - float_map_[idx] = f; - std::ostringstream ss; - ss << doc << " (float, default = " << *f << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, double *f, - const std::string &doc, bool is_standard) { - double_map_[idx] = f; - std::ostringstream ss; - ss << doc << " (double, default = " << *f << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, std::string *s, - const std::string &doc, bool is_standard) { - string_map_[idx] = s; - doc_map_[idx] = - DocInfo(name, doc + " (string, default = \"" + *s + "\")", is_standard); -} -void ParseOptions::DisableOption(const std::string &name) { - if (argv_ != NULL) - KALDI_ERR << "DisableOption must not be called after calling Read()."; - if (doc_map_.erase(name) == 0) - KALDI_ERR << "Option " << name - << " was not registered so cannot be disabled: "; - bool_map_.erase(name); - int_map_.erase(name); - uint_map_.erase(name); - float_map_.erase(name); - double_map_.erase(name); - string_map_.erase(name); -} - -int ParseOptions::NumArgs() const { return positional_args_.size(); } - -std::string ParseOptions::GetArg(int i) const { - // use KALDI_ERR if code error - if (i < 1 || i > static_cast(positional_args_.size())) - KALDI_ERR << "ParseOptions::GetArg, invalid index " << i; - return positional_args_[i - 1]; -} - -// We currently do not support any other options. -enum ShellType { kBash = 0 }; - -// This can be changed in the code if it ever does need to be changed (as it's -// unlikely that one compilation of this tool-set would use both shells). -static ShellType kShellType = kBash; - -// Returns true if we need to escape a string before putting it into -// a shell (mainly thinking of bash shell, but should work for others) -// This is for the convenience of the user so command-lines that are -// printed out by ParseOptions::Read (with --print-args=true) are -// paste-able into the shell and will run. If you use a different type of -// shell, it might be necessary to change this function. -// But it's mostly a cosmetic issue as it basically affects how -// the program echoes its command-line arguments to the screen. -static bool MustBeQuoted(const std::string &str, ShellType st) { - // Only Bash is supported (for the moment). - KALDI_ASSERT(st == kBash && "Invalid shell type."); - - const char *c = str.c_str(); - if (*c == '\0') { - return true; // Must quote empty string - } else { - const char *ok_chars[2]; - - // These seem not to be interpreted as long as there are no other "bad" - // characters involved (e.g. 
"," would be interpreted as part of something - // like a{b,c}, but not on its own. - ok_chars[kBash] = "[]~#^_-+=:.,/"; - - // Just want to make sure that a space character doesn't get automatically - // inserted here via an automated style-checking script, like it did before. - KALDI_ASSERT(!strchr(ok_chars[kBash], ' ')); - - for (; *c != '\0'; c++) { - // For non-alphanumeric characters we have a list of characters which - // are OK. All others are forbidden (this is easier since the shell - // interprets most non-alphanumeric characters). - if (!isalnum(*c)) { - const char *d; - for (d = ok_chars[st]; *d != '\0'; d++) - if (*c == *d) break; - // If not alphanumeric or one of the "ok_chars", it must be escaped. - if (*d == '\0') return true; - } - } - return false; // The string was OK. No quoting or escaping. - } -} - -// Returns a quoted and escaped version of "str" -// which has previously been determined to need escaping. -// Our aim is to print out the command line in such a way that if it's -// pasted into a shell of ShellType "st" (only bash for now), it -// will get passed to the program in the same way. -static std::string QuoteAndEscape(const std::string &str, ShellType st) { - // Only Bash is supported (for the moment). - KALDI_ASSERT(st == kBash && "Invalid shell type."); - - // For now we use the following rules: - // In the normal case, we quote with single-quote "'", and to escape - // a single-quote we use the string: '\'' (interpreted as closing the - // single-quote, putting an escaped single-quote from the shell, and - // then reopening the single quote). - char quote_char = '\''; - const char *escape_str = "'\\''"; // e.g. echo 'a'\''b' returns a'b - - // If the string contains single-quotes that would need escaping this - // way, and we determine that the string could be safely double-quoted - // without requiring any escaping, then we double-quote the string. - // This is the case if the characters "`$\ do not appear in the string. - // e.g. see http://www.redhat.com/mirrors/LDP/LDP/abs/html/quotingvar.html - const char *c_str = str.c_str(); - if (strchr(c_str, '\'') && !strpbrk(c_str, "\"`$\\")) { - quote_char = '"'; - escape_str = "\\\""; // should never be accessed. - } - - char buf[2]; - buf[1] = '\0'; - - buf[0] = quote_char; - std::string ans = buf; - const char *c = str.c_str(); - for (; *c != '\0'; c++) { - if (*c == quote_char) { - ans += escape_str; - } else { - buf[0] = *c; - ans += buf; - } - } - buf[0] = quote_char; - ans += buf; - return ans; -} - -// static function -std::string ParseOptions::Escape(const std::string &str) { - return MustBeQuoted(str, kShellType) ? QuoteAndEscape(str, kShellType) : str; -} - -int ParseOptions::Read(int argc, const char *const argv[]) { - argc_ = argc; - argv_ = argv; - std::string key, value; - int i; - if (argc > 0) { - // set global "const char*" g_program_name (name of the program) - // so it can be printed out in error messages; - // it's useful because often the stderr of different programs will - // be mixed together in the same log file. -#ifdef _MSC_VER - const char *c = strrchr(argv[0], '\\'); -#else - const char *c = strrchr(argv[0], '/'); -#endif - SetProgramName(c == NULL ? 
argv[0] : c + 1); - } - // first pass: look for config parameter, look for priority - for (i = 1; i < argc; i++) { - if (std::strncmp(argv[i], "--", 2) == 0) { - if (std::strcmp(argv[i], "--") == 0) { - // a lone "--" marks the end of named options - break; - } - bool has_equal_sign; - SplitLongArg(argv[i], &key, &value, &has_equal_sign); - NormalizeArgName(&key); - Trim(&value); - if (key.compare("config") == 0) { - ReadConfigFile(value); - } - if (key.compare("help") == 0) { - PrintUsage(); - exit(0); - } - } - } - bool double_dash_seen = false; - // second pass: add the command line options - for (i = 1; i < argc; i++) { - if (std::strncmp(argv[i], "--", 2) == 0) { - if (std::strcmp(argv[i], "--") == 0) { - // A lone "--" marks the end of named options. - // Skip that option and break the processing of named options - i += 1; - double_dash_seen = true; - break; - } - bool has_equal_sign; - SplitLongArg(argv[i], &key, &value, &has_equal_sign); - NormalizeArgName(&key); - Trim(&value); - if (!SetOption(key, value, has_equal_sign)) { - PrintUsage(true); - KALDI_ERR << "Invalid option " << argv[i]; - } - } else { - break; - } - } - - // process remaining arguments as positional - for (; i < argc; i++) { - if ((std::strcmp(argv[i], "--") == 0) && !double_dash_seen) { - double_dash_seen = true; - } else { - positional_args_.push_back(std::string(argv[i])); - } - } - - // if the user did not suppress this with --print-args = false.... - if (print_args_) { - std::ostringstream strm; - for (int j = 0; j < argc; j++) strm << Escape(argv[j]) << " "; - strm << '\n'; - std::cerr << strm.str() << std::flush; - } - return i; -} - -void ParseOptions::PrintUsage(bool print_command_line) { - std::cerr << '\n' << usage_ << '\n'; - DocMapType::iterator it; - // first we print application-specific options - bool app_specific_header_printed = false; - for (it = doc_map_.begin(); it != doc_map_.end(); ++it) { - if (it->second.is_standard_ == false) { // application-specific option - if (app_specific_header_printed == false) { // header was not yet printed - std::cerr << "Options:" << '\n'; - app_specific_header_printed = true; - } - std::cerr << " --" << std::setw(25) << std::left << it->second.name_ - << " : " << it->second.use_msg_ << '\n'; - } - } - if (app_specific_header_printed == true) { - std::cerr << '\n'; - } - - // then the standard options - std::cerr << "Standard options:" << '\n'; - for (it = doc_map_.begin(); it != doc_map_.end(); ++it) { - if (it->second.is_standard_ == true) { // we have standard option - std::cerr << " --" << std::setw(25) << std::left << it->second.name_ - << " : " << it->second.use_msg_ << '\n'; - } - } - std::cerr << '\n'; - if (print_command_line) { - std::ostringstream strm; - strm << "Command line was: "; - for (int j = 0; j < argc_; j++) strm << Escape(argv_[j]) << " "; - strm << '\n'; - std::cerr << strm.str() << std::flush; - } -} - -void ParseOptions::PrintConfig(std::ostream &os) { - os << '\n' << "[[ Configuration of UI-Registered options ]]" << '\n'; - std::string key; - DocMapType::iterator it; - for (it = doc_map_.begin(); it != doc_map_.end(); ++it) { - key = it->first; - os << it->second.name_ << " = "; - if (bool_map_.end() != bool_map_.find(key)) { - os << (*bool_map_[key] ? 
"true" : "false"); - } else if (int_map_.end() != int_map_.find(key)) { - os << (*int_map_[key]); - } else if (uint_map_.end() != uint_map_.find(key)) { - os << (*uint_map_[key]); - } else if (float_map_.end() != float_map_.find(key)) { - os << (*float_map_[key]); - } else if (double_map_.end() != double_map_.find(key)) { - os << (*double_map_[key]); - } else if (string_map_.end() != string_map_.find(key)) { - os << "'" << *string_map_[key] << "'"; - } else { - KALDI_ERR << "PrintConfig: unrecognized option " << key << "[code error]"; - } - os << '\n'; - } - os << '\n'; -} - -void ParseOptions::ReadConfigFile(const std::string &filename) { - std::ifstream is(filename.c_str(), std::ifstream::in); - if (!is.good()) { - KALDI_ERR << "Cannot open config file: " << filename; - } - - std::string line, key, value; - int32 line_number = 0; - while (std::getline(is, line)) { - line_number++; - // trim out the comments - size_t pos; - if ((pos = line.find_first_of('#')) != std::string::npos) { - line.erase(pos); - } - // skip empty lines - Trim(&line); - if (line.length() == 0) continue; - - if (line.substr(0, 2) != "--") { - KALDI_ERR << "Reading config file " << filename << ": line " - << line_number << " does not look like a line " - << "from a Kaldi command-line program's config file: should " - << "be of the form --x=y. Note: config files intended to " - << "be sourced by shell scripts lack the '--'."; - } - - // parse option - bool has_equal_sign; - SplitLongArg(line, &key, &value, &has_equal_sign); - NormalizeArgName(&key); - Trim(&value); - if (!SetOption(key, value, has_equal_sign)) { - PrintUsage(true); - KALDI_ERR << "Invalid option " << line << " in config file " << filename; - } - } -} - -void ParseOptions::SplitLongArg(const std::string &in, std::string *key, - std::string *value, bool *has_equal_sign) { - KALDI_ASSERT(in.substr(0, 2) == "--"); // precondition. - size_t pos = in.find_first_of('=', 0); - if (pos == std::string::npos) { // we allow --option for bools - // defaults to empty. We handle this differently in different cases. - *key = in.substr(2, in.size() - 2); // 2 because starts with --. - *value = ""; - *has_equal_sign = false; - } else if (pos == 2) { // we also don't allow empty keys: --=value - PrintUsage(true); - KALDI_ERR << "Invalid option (no key): " << in; - } else { // normal case: --option=value - *key = in.substr(2, pos - 2); // 2 because starts with --. 
- *value = in.substr(pos + 1); - *has_equal_sign = true; - } -} - -void ParseOptions::NormalizeArgName(std::string *str) { - std::string out; - std::string::iterator it; - - for (it = str->begin(); it != str->end(); ++it) { - if (*it == '_') - out += '-'; // convert _ to - - else - out += std::tolower(*it); - } - *str = out; - - KALDI_ASSERT(str->length() > 0); -} - -bool ParseOptions::SetOption(const std::string &key, const std::string &value, - bool has_equal_sign) { - if (bool_map_.end() != bool_map_.find(key)) { - if (has_equal_sign && value == "") - KALDI_ERR << "Invalid option --" << key << "="; - *(bool_map_[key]) = ToBool(value); - } else if (int_map_.end() != int_map_.find(key)) { - *(int_map_[key]) = ToInt(value); - } else if (uint_map_.end() != uint_map_.find(key)) { - *(uint_map_[key]) = ToUint(value); - } else if (float_map_.end() != float_map_.find(key)) { - *(float_map_[key]) = ToFloat(value); - } else if (double_map_.end() != double_map_.find(key)) { - *(double_map_[key]) = ToDouble(value); - } else if (string_map_.end() != string_map_.find(key)) { - if (!has_equal_sign) - KALDI_ERR << "Invalid option --" << key << " (option format is --x=y)."; - *(string_map_[key]) = value; - } else { - return false; - } - return true; -} - -bool ParseOptions::ToBool(std::string str) { - std::transform(str.begin(), str.end(), str.begin(), ::tolower); - - // allow "" as a valid option for "true", so that --x is the same as --x=true - if ((str.compare("true") == 0) || (str.compare("t") == 0) || - (str.compare("1") == 0) || (str.compare("") == 0)) { - return true; - } - if ((str.compare("false") == 0) || (str.compare("f") == 0) || - (str.compare("0") == 0)) { - return false; - } - // if it is neither true nor false: - PrintUsage(true); - KALDI_ERR << "Invalid format for boolean argument [expected true or false]: " - << str; - return false; // never reached -} - -int32 ParseOptions::ToInt(const std::string &str) { - int32 ret; - if (!ConvertStringToInteger(str, &ret)) - KALDI_ERR << "Invalid integer option \"" << str << "\""; - return ret; -} - -uint32 ParseOptions::ToUint(const std::string &str) { - uint32 ret; - if (!ConvertStringToInteger(str, &ret)) - KALDI_ERR << "Invalid integer option \"" << str << "\""; - return ret; -} - -float ParseOptions::ToFloat(const std::string &str) { - float ret; - if (!ConvertStringToReal(str, &ret)) - KALDI_ERR << "Invalid floating-point option \"" << str << "\""; - return ret; -} - -double ParseOptions::ToDouble(const std::string &str) { - double ret; - if (!ConvertStringToReal(str, &ret)) - KALDI_ERR << "Invalid floating-point option \"" << str << "\""; - return ret; -} - -// instantiate templates -template void ParseOptions::RegisterTmpl(const std::string &name, bool *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, int32 *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, uint32 *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, float *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, double *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, - std::string *ptr, - const std::string &doc); - -template void ParseOptions::RegisterStandard(const std::string &name, bool *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - int32 *ptr, - const std::string &doc); 
-template void ParseOptions::RegisterStandard(const std::string &name, - uint32 *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - float *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - double *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - std::string *ptr, - const std::string &doc); - -template void ParseOptions::RegisterCommon(const std::string &name, bool *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, int32 *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, uint32 *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, float *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, double *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, - std::string *ptr, - const std::string &doc, - bool is_standard); - -} // namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/parse-options.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/parse-options.h deleted file mode 100644 index 93a060f4a411dfd63298a91bb313e0b66d337a75..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/parse-options.h +++ /dev/null @@ -1,265 +0,0 @@ -// util/parse-options.h - -// Copyright 2009-2011 Karel Vesely; Microsoft Corporation; -// Saarland University (Author: Arnab Ghoshal); -// Copyright 2012-2013 Frantisek Skala; Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_PARSE_OPTIONS_H_ -#define KALDI_UTIL_PARSE_OPTIONS_H_ - -#include -#include -#include - -#include "base/kaldi-common.h" -#include "itf/options-itf.h" - -namespace kaldi { - -/// The class ParseOptions is for parsing command-line options; see -/// \ref parse_options for more documentation. -class ParseOptions : public OptionsItf { - public: - explicit ParseOptions(const char *usage) - : print_args_(true), - help_(false), - usage_(usage), - argc_(0), - argv_(NULL), - prefix_(""), - other_parser_(NULL) { -#if !defined(_MSC_VER) && \ - !defined(__CYGWIN__) // This is just a convenient place to set the stderr - // to line - setlinebuf(stderr); // buffering mode, since it's called at program start. -#endif // This helps ensure different programs' output is not mixed up. 
- RegisterStandard("config", &config_, - "Configuration file to read (this " - "option may be repeated)"); - RegisterStandard("print-args", &print_args_, - "Print the command line arguments (to stderr)"); - RegisterStandard("help", &help_, "Print out usage message"); - RegisterStandard("verbose", &g_kaldi_verbose_level, - "Verbose level (higher->more logging)"); - } - - /** - This is a constructor for the special case where some options are - registered with a prefix to avoid conflicts. The object thus created will - only be used temporarily to register an options class with the original - options parser (which is passed as the *other pointer) using the given - prefix. It should not be used for any other purpose, and the prefix must - not be the empty string. It seems to be the least bad way of implementing - options with prefixes at this point. - Example of usage is: - ParseOptions po; // original ParseOptions object - ParseOptions po_mfcc("mfcc", &po); // object with prefix. - MfccOptions mfcc_opts; - mfcc_opts.Register(&po_mfcc); - The options will now get registered as, e.g., --mfcc.frame-shift=10.0 - instead of just --frame-shift=10.0 - */ - ParseOptions(const std::string &prefix, OptionsItf *other); - - ~ParseOptions() {} - - // Methods from the interface - void Register(const std::string &name, bool *ptr, const std::string &doc); - void Register(const std::string &name, int32 *ptr, const std::string &doc); - void Register(const std::string &name, uint32 *ptr, const std::string &doc); - void Register(const std::string &name, float *ptr, const std::string &doc); - void Register(const std::string &name, double *ptr, const std::string &doc); - void Register(const std::string &name, std::string *ptr, - const std::string &doc); - - /// If called after registering an option and before calling - /// Read(), disables that option from being used. Will crash - /// at runtime if that option had not been registered. - void DisableOption(const std::string &name); - - /// This one is used for registering standard parameters of all the programs - template - void RegisterStandard(const std::string &name, T *ptr, - const std::string &doc); - - /** - Parses the command line options and fills the ParseOptions-registered - variables. This must be called after all the variables were registered!!! - - Initially the variables have implicit values, - then the config file values are set-up, - finally the command line values given. - Returns the first position in argv that was not used. - [typically not useful: use NumParams() and GetParam(). ] - */ - int Read(int argc, const char *const *argv); - - /// Prints the usage documentation [provided in the constructor]. - void PrintUsage(bool print_command_line = false); - /// Prints the actual configuration of all the registered variables - void PrintConfig(std::ostream &os); - - /// Reads the options values from a config file. Must be called after - /// registering all options. This is usually used internally after the - /// standard --config option is used, but it may also be called from a - /// program. - void ReadConfigFile(const std::string &filename); - - /// Number of positional parameters (c.f. argc-1). - int NumArgs() const; - - /// Returns one of the positional parameters; 1-based indexing for argc/argv - /// compatibility. Will crash if param is not >=1 and <=NumArgs(). - std::string GetArg(int param) const; - - std::string GetOptArg(int param) const { - return (param <= NumArgs() ? 
GetArg(param) : ""); - } - - /// The following function will return a possibly quoted and escaped - /// version of "str", according to the current shell. Currently - /// this is just hardwired to bash. It's useful for debug output. - static std::string Escape(const std::string &str); - - private: - /// Template to register various variable types, - /// used for program-specific parameters - template - void RegisterTmpl(const std::string &name, T *ptr, const std::string &doc); - - // Following functions do just the datatype-specific part of the job - /// Register boolean variable - void RegisterSpecific(const std::string &name, const std::string &idx, - bool *b, const std::string &doc, bool is_standard); - /// Register int32 variable - void RegisterSpecific(const std::string &name, const std::string &idx, - int32 *i, const std::string &doc, bool is_standard); - /// Register unsinged int32 variable - void RegisterSpecific(const std::string &name, const std::string &idx, - uint32 *u, const std::string &doc, bool is_standard); - /// Register float variable - void RegisterSpecific(const std::string &name, const std::string &idx, - float *f, const std::string &doc, bool is_standard); - /// Register double variable [useful as we change BaseFloat type]. - void RegisterSpecific(const std::string &name, const std::string &idx, - double *f, const std::string &doc, bool is_standard); - /// Register string variable - void RegisterSpecific(const std::string &name, const std::string &idx, - std::string *s, const std::string &doc, - bool is_standard); - - /// Does the actual job for both kinds of parameters - /// Does the common part of the job for all datatypes, - /// then calls RegisterSpecific - template - void RegisterCommon(const std::string &name, T *ptr, const std::string &doc, - bool is_standard); - - /// Set option with name "key" to "value"; will crash if can't do it. - /// "has_equal_sign" is used to allow --x for a boolean option x, - /// and --y=, for a string option y. - bool SetOption(const std::string &key, const std::string &value, - bool has_equal_sign); - - bool ToBool(std::string str); - int32 ToInt(const std::string &str); - uint32 ToUint(const std::string &str); - float ToFloat(const std::string &str); - double ToDouble(const std::string &str); - - // maps for option variables - std::map bool_map_; - std::map int_map_; - std::map uint_map_; - std::map float_map_; - std::map double_map_; - std::map string_map_; - - /** - Structure for options' documentation - */ - struct DocInfo { - DocInfo() {} - DocInfo(const std::string &name, const std::string &usemsg) - : name_(name), use_msg_(usemsg), is_standard_(false) {} - DocInfo(const std::string &name, const std::string &usemsg, - bool is_standard) - : name_(name), use_msg_(usemsg), is_standard_(is_standard) {} - - std::string name_; - std::string use_msg_; - bool is_standard_; - }; - typedef std::map DocMapType; - DocMapType doc_map_; ///< map for the documentation - - bool print_args_; ///< variable for the implicit --print-args parameter - bool help_; ///< variable for the implicit --help parameter - std::string config_; ///< variable for the implicit --config parameter - std::vector positional_args_; - const char *usage_; - int argc_; - const char *const *argv_; - - /// These members are not normally used. 
They are only used when the object - /// is constructed with a prefix - std::string prefix_; - OptionsItf *other_parser_; - - protected: - /// SplitLongArg parses an argument of the form --a=b, --a=, or --a, - /// and sets "has_equal_sign" to true if an equals-sign was parsed.. - /// this is needed in order to correctly allow --x for a boolean option - /// x, and --y= for a string option y, and to disallow --x= and --y. - void SplitLongArg(const std::string &in, std::string *key, std::string *value, - bool *has_equal_sign); - - void NormalizeArgName(std::string *str); -}; - -/// This template is provided for convenience in reading config classes from -/// files; this is not the standard way to read configuration options, but may -/// occasionally be needed. This function assumes the config has a function -/// "void Register(OptionsItf *opts)" which it can call to register the -/// ParseOptions object. -template -void ReadConfigFromFile(const std::string &config_filename, C *c) { - std::ostringstream usage_str; - usage_str << "Parsing config from " - << "from '" << config_filename << "'"; - ParseOptions po(usage_str.str().c_str()); - c->Register(&po); - po.ReadConfigFile(config_filename); -} - -/// This variant of the template ReadConfigFromFile is for if you need to read -/// two config classes from the same file. -template -void ReadConfigsFromFile(const std::string &conf, C1 *c1, C2 *c2) { - std::ostringstream usage_str; - usage_str << "Parsing config from " - << "from '" << conf << "'"; - ParseOptions po(usage_str.str().c_str()); - c1->Register(&po); - c2->Register(&po); - po.ReadConfigFile(conf); -} - -} // namespace kaldi - -#endif // KALDI_UTIL_PARSE_OPTIONS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/simple-io-funcs.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/simple-io-funcs.cc deleted file mode 100644 index 5ace601b6a2bb186dec78b0b25cb5a3227c48bc9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/simple-io-funcs.cc +++ /dev/null @@ -1,80 +0,0 @@ -// util/simple-io-funcs.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. -#include "util/simple-io-funcs.h" -#include "util/text-utils.h" - -namespace kaldi { - -bool WriteIntegerVectorSimple(const std::string &wxfilename, - const std::vector &list) { - kaldi::Output ko; - // false, false is: text-mode, no Kaldi header. 
- if (!ko.Open(wxfilename, false, false)) return false; - for (size_t i = 0; i < list.size(); i++) ko.Stream() << list[i] << '\n'; - return ko.Close(); -} - -bool ReadIntegerVectorSimple(const std::string &rxfilename, - std::vector *list) { - kaldi::Input ki; - if (!ki.OpenTextMode(rxfilename)) return false; - std::istream &is = ki.Stream(); - int32 i; - list->clear(); - while (!(is >> i).fail()) list->push_back(i); - is >> std::ws; - return is.eof(); // should be eof, or junk at end of file. -} - -bool WriteIntegerVectorVectorSimple( - const std::string &wxfilename, - const std::vector > &list) { - kaldi::Output ko; - // false, false is: text-mode, no Kaldi header. - if (!ko.Open(wxfilename, false, false)) return false; - std::ostream &os = ko.Stream(); - for (size_t i = 0; i < list.size(); i++) { - for (size_t j = 0; j < list[i].size(); j++) { - os << list[i][j]; - if (j + 1 < list[i].size()) os << ' '; - } - os << '\n'; - } - return ko.Close(); -} - -bool ReadIntegerVectorVectorSimple(const std::string &rxfilename, - std::vector > *list) { - kaldi::Input ki; - if (!ki.OpenTextMode(rxfilename)) return false; - std::istream &is = ki.Stream(); - list->clear(); - std::string line; - while (std::getline(is, line)) { - std::vector v; - if (!SplitStringToIntegers(line, " \t\r", true, &v)) { - list->clear(); - return false; - } - list->push_back(v); - } - return is.eof(); // if we're not at EOF, something weird happened. -} - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/simple-io-funcs.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/simple-io-funcs.h deleted file mode 100644 index 1ead12790ba9bd6a44ccdff855918270191b8ebd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/simple-io-funcs.h +++ /dev/null @@ -1,61 +0,0 @@ -// util/simple-io-funcs.h - -// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. -#ifndef KALDI_UTIL_SIMPLE_IO_FUNCS_H_ -#define KALDI_UTIL_SIMPLE_IO_FUNCS_H_ - -#include -#include -#include "util/kaldi-io.h" - -// This header contains some utilities for reading some common, simple text -// formats:integers in files, one per line, and integers in files, possibly -// multiple per line. these are not really fully native Kaldi formats; they are -// mostly for small files that might be generated by scripts, and can be read -// all at one time. for longer files of this type, we would probably use the -// Table code. - -namespace kaldi { - -/// WriteToList attempts to write this list of integers, one per line, -/// to the given file, in text format. -/// returns true if succeeded. 
-bool WriteIntegerVectorSimple(const std::string &wxfilename, - const std::vector &v); - -/// ReadFromList attempts to read this list of integers, one per line, -/// from the given file, in text format. -/// returns true if succeeded. -bool ReadIntegerVectorSimple(const std::string &rxfilename, - std::vector *v); - -// This is a file format like: -// 1 2 -// 3 -// -// 4 5 6 -// etc. -bool WriteIntegerVectorVectorSimple(const std::string &wxfilename, - const std::vector > &v); - -bool ReadIntegerVectorVectorSimple(const std::string &rxfilename, - std::vector > *v); - -} // end namespace kaldi. - -#endif // KALDI_UTIL_SIMPLE_IO_FUNCS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/stl-utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/stl-utils.h deleted file mode 100644 index 8a29cd582c77b3078277aa9713b8676032bbc5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/stl-utils.h +++ /dev/null @@ -1,310 +0,0 @@ -// util/stl-utils.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_STL_UTILS_H_ -#define KALDI_UTIL_STL_UTILS_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -using std::unordered_map; -using std::unordered_set; - -#include "base/kaldi-common.h" - -namespace kaldi { - -/// Sorts and uniq's (removes duplicates) from a vector. -template -inline void SortAndUniq(std::vector *vec) { - std::sort(vec->begin(), vec->end()); - vec->erase(std::unique(vec->begin(), vec->end()), vec->end()); -} - -/// Returns true if the vector is sorted. -template -inline bool IsSorted(const std::vector &vec) { - typename std::vector::const_iterator iter = vec.begin(), end = vec.end(); - if (iter == end) return true; - while (1) { - typename std::vector::const_iterator next_iter = iter; - ++next_iter; - if (next_iter == end) return true; // end of loop and nothing out of order - if (*next_iter < *iter) return false; - iter = next_iter; - } -} - -/// Returns true if the vector is sorted and contains each element -/// only once. -template -inline bool IsSortedAndUniq(const std::vector &vec) { - typename std::vector::const_iterator iter = vec.begin(), end = vec.end(); - if (iter == end) return true; - while (1) { - typename std::vector::const_iterator next_iter = iter; - ++next_iter; - if (next_iter == end) return true; // end of loop and nothing out of order - if (*next_iter <= *iter) return false; - iter = next_iter; - } -} - -/// Removes duplicate elements from a sorted list. -template -inline void Uniq(std::vector *vec) { // must be already sorted. 
- KALDI_PARANOID_ASSERT(IsSorted(*vec)); - KALDI_ASSERT(vec); - vec->erase(std::unique(vec->begin(), vec->end()), vec->end()); -} - -/// Copies the elements of a set to a vector. -template -void CopySetToVector(const std::set &s, std::vector *v) { - // copies members of s into v, in sorted order from lowest to highest - // (because the set was in sorted order). - KALDI_ASSERT(v != NULL); - v->resize(s.size()); - typename std::set::const_iterator siter = s.begin(), send = s.end(); - typename std::vector::iterator viter = v->begin(); - for (; siter != send; ++siter, ++viter) { - *viter = *siter; - } -} - -template -void CopySetToVector(const unordered_set &s, std::vector *v) { - KALDI_ASSERT(v != NULL); - v->resize(s.size()); - typename unordered_set::const_iterator siter = s.begin(), send = s.end(); - typename std::vector::iterator viter = v->begin(); - for (; siter != send; ++siter, ++viter) { - *viter = *siter; - } -} - -/// Copies the (key, value) pairs in a map to a vector of pairs. -template -void CopyMapToVector(const std::map &m, - std::vector > *v) { - KALDI_ASSERT(v != NULL); - v->resize(m.size()); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - typename std::vector >::iterator viter = v->begin(); - for (; miter != mend; ++miter, ++viter) { - *viter = std::make_pair(miter->first, miter->second); - // do it like this because of const casting. - } -} - -/// Copies the keys in a map to a vector. -template -void CopyMapKeysToVector(const std::map &m, std::vector *v) { - KALDI_ASSERT(v != NULL); - v->resize(m.size()); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - typename std::vector::iterator viter = v->begin(); - for (; miter != mend; ++miter, ++viter) { - *viter = miter->first; - } -} - -/// Copies the values in a map to a vector. -template -void CopyMapValuesToVector(const std::map &m, std::vector *v) { - KALDI_ASSERT(v != NULL); - v->resize(m.size()); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - typename std::vector::iterator viter = v->begin(); - for (; miter != mend; ++miter, ++viter) { - *viter = miter->second; - } -} - -/// Copies the keys in a map to a set. -template -void CopyMapKeysToSet(const std::map &m, std::set *s) { - KALDI_ASSERT(s != NULL); - s->clear(); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - for (; miter != mend; ++miter) { - s->insert(s->end(), miter->first); - } -} - -/// Copies the values in a map to a set. -template -void CopyMapValuesToSet(const std::map &m, std::set *s) { - KALDI_ASSERT(s != NULL); - s->clear(); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - for (; miter != mend; ++miter) s->insert(s->end(), miter->second); -} - -/// Copies the contents of a vector to a set. -template -void CopyVectorToSet(const std::vector &v, std::set *s) { - KALDI_ASSERT(s != NULL); - s->clear(); - typename std::vector::const_iterator iter = v.begin(), end = v.end(); - for (; iter != end; ++iter) s->insert(s->end(), *iter); - // s->end() is a hint in case v was sorted. will work regardless. -} - -/// Deletes any non-NULL pointers in the vector v, and sets -/// the corresponding entries of v to NULL -template -void DeletePointers(std::vector *v) { - KALDI_ASSERT(v != NULL); - typename std::vector::iterator iter = v->begin(), end = v->end(); - for (; iter != end; ++iter) { - if (*iter != NULL) { - delete *iter; - *iter = NULL; // set to NULL for extra safety. - } - } -} - -/// Returns true if the vector of pointers contains NULL pointers. 
-template -bool ContainsNullPointers(const std::vector &v) { - typename std::vector::const_iterator iter = v.begin(), end = v.end(); - for (; iter != end; ++iter) - if (*iter == static_cast(NULL)) return true; - return false; -} - -/// Copies the contents a vector of one type to a vector -/// of another type. -template -void CopyVectorToVector(const std::vector &vec_in, std::vector *vec_out) { - KALDI_ASSERT(vec_out != NULL); - vec_out->resize(vec_in.size()); - for (size_t i = 0; i < vec_in.size(); i++) - (*vec_out)[i] = static_cast(vec_in[i]); -} - -/// A hashing function-object for vectors. -template -struct VectorHasher { // hashing function for vector. - size_t operator()(const std::vector &x) const noexcept { - size_t ans = 0; - typename std::vector::const_iterator iter = x.begin(), end = x.end(); - for (; iter != end; ++iter) { - ans *= kPrime; - ans += *iter; - } - return ans; - } - VectorHasher() { // Check we're instantiated with an integer type. - KALDI_ASSERT_IS_INTEGER_TYPE(Int); - } - - private: - static const int kPrime = 7853; -}; - -/// A hashing function-object for pairs of ints -template -struct PairHasher { // hashing function for pair - size_t operator()(const std::pair &x) const noexcept { - // 7853 was chosen at random from a list of primes. - return x.first + x.second * 7853; - } - PairHasher() { // Check we're instantiated with an integer type. - KALDI_ASSERT_IS_INTEGER_TYPE(Int1); - KALDI_ASSERT_IS_INTEGER_TYPE(Int2); - } -}; - -/// A hashing function object for strings. -struct StringHasher { // hashing function for std::string - size_t operator()(const std::string &str) const noexcept { - size_t ans = 0, len = str.length(); - const char *c = str.c_str(), *end = c + len; - for (; c != end; c++) { - ans *= kPrime; - ans += *c; - } - return ans; - } - - private: - static const int kPrime = 7853; -}; - -/// Reverses the contents of a vector. -template -inline void ReverseVector(std::vector *vec) { - KALDI_ASSERT(vec != NULL); - size_t sz = vec->size(); - for (size_t i = 0; i < sz / 2; i++) std::swap((*vec)[i], (*vec)[sz - 1 - i]); -} - -/// Comparator object for pairs that compares only the first pair. -template -struct CompareFirstMemberOfPair { - inline bool operator()(const std::pair &p1, const std::pair &p2) { - return p1.first < p2.first; - } -}; - -/// For a vector of pair where I is an integer and F a floating-point or -/// integer type, this function sorts a vector of type vector > on -/// the I value and then merges elements with equal I values, summing these over -/// the F component and then removing any F component with zero value. This -/// is for where the vector of pairs represents a map from the integer to float -/// component, with an "adding" type of semantics for combining the elements. -template -inline void MergePairVectorSumming(std::vector > *vec) { - KALDI_ASSERT_IS_INTEGER_TYPE(I); - CompareFirstMemberOfPair c; - std::sort(vec->begin(), vec->end(), c); // sort on 1st element. - typename std::vector >::iterator out = vec->begin(), - in = vec->begin(), - end = vec->end(); - // special case: while there is nothing to be changed, skip over - // initial input (avoids unnecessary copying). - while (in + 1 < end && in[0].first != in[1].first && in[0].second != 0.0) { - in++; - out++; - } - while (in < end) { - // We reach this point only at the first element of - // each stretch of identical .first elements. - *out = *in; - ++in; - while (in < end && in->first == out->first) { - out->second += in->second; // this is the merge operation. 
- ++in; - } - if (out->second != static_cast(0)) // Don't keep zero elements. - out++; - } - vec->erase(out, end); -} - -} // namespace kaldi - -#endif // KALDI_UTIL_STL_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/text-utils.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/text-utils.cc deleted file mode 100644 index fd70889644f6b4e14793ddd4f5b0d71a66768699..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/text-utils.cc +++ /dev/null @@ -1,580 +0,0 @@ -// util/text-utils.cc - -// Copyright 2009-2011 Saarland University; Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "util/text-utils.h" - -#include -#include -#include -#include - -#include "base/kaldi-common.h" - -namespace kaldi { - -template -bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, // typically false - std::vector *out) { - KALDI_ASSERT(out != NULL); - if (*(full.c_str()) == '\0') { - out->clear(); - return true; - } - std::vector split; - SplitStringToVector(full, delim, omit_empty_strings, &split); - out->resize(split.size()); - for (size_t i = 0; i < split.size(); i++) { - F f = 0; - if (!ConvertStringToReal(split[i], &f)) return false; - (*out)[i] = f; - } - return true; -} - -// Instantiate the template above for float and double. 
-template bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out); -template bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out); - -void SplitStringToVector(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out) { - size_t start = 0, found = 0, end = full.size(); - out->clear(); - while (found != std::string::npos) { - found = full.find_first_of(delim, start); - // start != end condition is for when the delimiter is at the end - if (!omit_empty_strings || (found != start && start != end)) - out->push_back(full.substr(start, found - start)); - start = found + 1; - } -} - -void JoinVectorToString(const std::vector &vec_in, - const char *delim, bool omit_empty_strings, - std::string *str_out) { - std::string tmp_str; - for (size_t i = 0; i < vec_in.size(); i++) { - if (!omit_empty_strings || !vec_in[i].empty()) { - tmp_str.append(vec_in[i]); - if (i < vec_in.size() - 1) - if (!omit_empty_strings || !vec_in[i + 1].empty()) - tmp_str.append(delim); - } - } - str_out->swap(tmp_str); -} - -void Trim(std::string *str) { - const char *white_chars = " \t\n\r\f\v"; - - std::string::size_type pos = str->find_last_not_of(white_chars); - if (pos != std::string::npos) { - str->erase(pos + 1); - pos = str->find_first_not_of(white_chars); - if (pos != std::string::npos) str->erase(0, pos); - } else { - str->erase(str->begin(), str->end()); - } -} - -bool IsToken(const std::string &token) { - size_t l = token.length(); - if (l == 0) return false; - for (size_t i = 0; i < l; i++) { - unsigned char c = token[i]; - if ((!isprint(c) || isspace(c)) && (isascii(c) || c == (unsigned char)255)) - return false; - // The "&& (isascii(c) || c == 255)" was added so that we won't reject - // non-ASCII characters such as French characters with accents [except for - // 255 which is "nbsp", a form of space]. - } - return true; -} - -void SplitStringOnFirstSpace(const std::string &str, std::string *first, - std::string *rest) { - const char *white_chars = " \t\n\r\f\v"; - typedef std::string::size_type I; - const I npos = std::string::npos; - I first_nonwhite = str.find_first_not_of(white_chars); - if (first_nonwhite == npos) { - first->clear(); - rest->clear(); - return; - } - // next_white is first whitespace after first nonwhitespace. - I next_white = str.find_first_of(white_chars, first_nonwhite); - - if (next_white == npos) { // no more whitespace... - *first = std::string(str, first_nonwhite); - rest->clear(); - return; - } - I next_nonwhite = str.find_first_not_of(white_chars, next_white); - if (next_nonwhite == npos) { - *first = std::string(str, first_nonwhite, next_white - first_nonwhite); - rest->clear(); - return; - } - - I last_nonwhite = str.find_last_not_of(white_chars); - KALDI_ASSERT(last_nonwhite != npos); // or coding error. 
- - *first = std::string(str, first_nonwhite, next_white - first_nonwhite); - *rest = std::string(str, next_nonwhite, last_nonwhite + 1 - next_nonwhite); -} - -bool IsLine(const std::string &line) { - if (line.find('\n') != std::string::npos) return false; - if (line.empty()) return true; - if (isspace(*(line.begin()))) return false; - if (isspace(*(line.rbegin()))) return false; - std::string::const_iterator iter = line.begin(), end = line.end(); - for (; iter != end; iter++) - if (!isprint(*iter)) return false; - return true; -} - -template -class NumberIstream { - public: - explicit NumberIstream(std::istream &i) : in_(i) {} - - NumberIstream &operator>>(T &x) { - if (!in_.good()) return *this; - in_ >> x; - if (!in_.fail() && RemainderIsOnlySpaces()) return *this; - return ParseOnFail(&x); - } - - private: - std::istream &in_; - - bool RemainderIsOnlySpaces() { - if (in_.tellg() != std::istream::pos_type(-1)) { - std::string rem; - in_ >> rem; - - if (rem.find_first_not_of(' ') != std::string::npos) { - // there is not only spaces - return false; - } - } - - in_.clear(); - return true; - } - - NumberIstream &ParseOnFail(T *x) { - std::string str; - in_.clear(); - in_.seekg(0); - // If the stream is broken even before trying - // to read from it or if there are many tokens, - // it's pointless to try. - if (!(in_ >> str) || !RemainderIsOnlySpaces()) { - in_.setstate(std::ios_base::failbit); - return *this; - } - - std::map inf_nan_map; - // we'll keep just uppercase values. - inf_nan_map["INF"] = std::numeric_limits::infinity(); - inf_nan_map["+INF"] = std::numeric_limits::infinity(); - inf_nan_map["-INF"] = -std::numeric_limits::infinity(); - inf_nan_map["INFINITY"] = std::numeric_limits::infinity(); - inf_nan_map["+INFINITY"] = std::numeric_limits::infinity(); - inf_nan_map["-INFINITY"] = -std::numeric_limits::infinity(); - inf_nan_map["NAN"] = std::numeric_limits::quiet_NaN(); - inf_nan_map["+NAN"] = std::numeric_limits::quiet_NaN(); - inf_nan_map["-NAN"] = -std::numeric_limits::quiet_NaN(); - // MSVC - inf_nan_map["1.#INF"] = std::numeric_limits::infinity(); - inf_nan_map["-1.#INF"] = -std::numeric_limits::infinity(); - inf_nan_map["1.#QNAN"] = std::numeric_limits::quiet_NaN(); - inf_nan_map["-1.#QNAN"] = -std::numeric_limits::quiet_NaN(); - - std::transform(str.begin(), str.end(), str.begin(), ::toupper); - - if (inf_nan_map.find(str) != inf_nan_map.end()) { - *x = inf_nan_map[str]; - } else { - in_.setstate(std::ios_base::failbit); - } - - return *this; - } -}; - -template -bool ConvertStringToReal(const std::string &str, T *out) { - std::istringstream iss(str); - - NumberIstream i(iss); - - i >> *out; - - if (iss.fail()) { - // Number conversion failed. - return false; - } - - return true; -} - -template bool ConvertStringToReal(const std::string &str, float *out); -template bool ConvertStringToReal(const std::string &str, double *out); - -/* - This function is a helper function of StringsApproxEqual. It should be - thought of as a recursive function-- it was designed that way-- but rather - than actually recursing (which would cause problems with stack overflow), we - just set the args and return to the start. - - The 'decimal_places_tolerance' argument is just passed in from outside, - see the documentation for StringsApproxEqual in text-utils.h to see an - explanation. The argument 'places_into_number' provides some information - about the strings 'a' and 'b' that precedes the current pointers. 
- For purposes of this comment, let's define the 'decimal' of a number - as the part that comes after the decimal point, e.g. in '99.123', - '123' would be the decimal. If 'places_into_number' is -1, it means - we're not currently inside some place like that (i.e. it's not the - case that we're pointing to the '1' or the '2' or the '3'). - If it's 0, then we'd be pointing to the first place after the decimal, - '1' in this case. Note if one of the numbers is shorter than the - other, like '99.123' versus '99.1234' and 'a' points to the first '3' - while 'b' points to the second '4', 'places_into_number' referes to the - shorter of the two, i.e. it would be 2 in this example. - - - */ -bool StringsApproxEqualInternal(const char *a, const char *b, - int32 decimal_places_tolerance, - int32 places_into_number) { -start: - char ca = *a, cb = *b; - if (ca == cb) { - if (ca == '\0') { - return true; - } else { - if (places_into_number >= 0) { - if (isdigit(ca)) { - places_into_number++; - } else { - places_into_number = -1; - } - } else { - if (ca == '.') { - places_into_number = 0; - } - } - a++; - b++; - goto start; - } - } else { - if (places_into_number >= decimal_places_tolerance && - (isdigit(ca) || isdigit(cb))) { - // we're potentially willing to accept this difference between the - // strings. - if (isdigit(ca)) a++; - if (isdigit(cb)) b++; - // we'll have advanced at least one of the two strings. - goto start; - } else if (places_into_number >= 0 && - ((ca == '0' && !isdigit(cb)) || (cb == '0' && !isdigit(ca)))) { - // this clause is designed to ensure that, for example, - // "0.1" would count the same as "0.100001". - if (ca == '0') - a++; - else - b++; - places_into_number++; - goto start; - } else { - return false; - } - } -} - -bool StringsApproxEqual(const std::string &a, const std::string &b, - int32 decimal_places_tolerance) { - return StringsApproxEqualInternal(a.c_str(), b.c_str(), - decimal_places_tolerance, -1); -} - -bool ConfigLine::ParseLine(const std::string &line) { - data_.clear(); - whole_line_ = line; - if (line.size() == 0) return false; // Empty line - size_t pos = 0, size = line.size(); - while (isspace(line[pos]) && pos < size) pos++; - if (pos == size) return false; // whitespace-only line - size_t first_token_start_pos = pos; - // first get first_token_. - while (!isspace(line[pos]) && pos < size) { - if (line[pos] == '=') { - // If the first block of non-whitespace looks like "foo-bar=...", - // then we ignore it: there is no initial token, and FirstToken() - // is empty. - pos = first_token_start_pos; - break; - } - pos++; - } - first_token_ = - std::string(line, first_token_start_pos, pos - first_token_start_pos); - // first_token_ is expected to be either empty or something like - // "component-node", which actually is a slightly more restrictive set of - // strings than IsValidName() checks for this is a convenient way to check it. - if (!first_token_.empty() && !IsValidName(first_token_)) return false; - - while (pos < size) { - if (isspace(line[pos])) { - pos++; - continue; - } - - // OK, at this point we know that we are pointing at nonspace. - size_t next_equals_sign = line.find_first_of("=", pos); - if (next_equals_sign == pos || next_equals_sign == std::string::npos) { - // we're looking for something like 'key=value'. If there is no equals - // sign, or it's not preceded by something, it's a parsing failure. - return false; - } - std::string key(line, pos, next_equals_sign - pos); - if (!IsValidName(key)) return false; - - // handle any quotes. 
we support key='blah blah' or key="foo bar". - // no escaping is supported. - if (line[next_equals_sign + 1] == '\'' || - line[next_equals_sign + 1] == '"') { - char my_quote = line[next_equals_sign + 1]; - size_t next_quote = line.find_first_of(my_quote, next_equals_sign + 2); - if (next_quote == std::string::npos) { // no matching quote was found. - KALDI_WARN << "No matching quote for " << my_quote - << " in config line '" << line << "'"; - return false; - } else { - std::string value(line, next_equals_sign + 2, - next_quote - next_equals_sign - 2); - data_.insert(std::make_pair(key, std::make_pair(value, false))); - pos = next_quote + 1; - continue; - } - } else { - // we want to be able to parse something like "... input=Offset(a, -1) - // foo=bar": in general, config values with spaces in them, even without - // quoting. - - size_t next_next_equals_sign = - line.find_first_of("=", next_equals_sign + 1), - terminating_space = size; - - if (next_next_equals_sign != - std::string::npos) { // found a later equals sign. - size_t preceding_space = - line.find_last_of(" \t", next_next_equals_sign); - if (preceding_space != std::string::npos && - preceding_space > next_equals_sign) - terminating_space = preceding_space; - } - while (isspace(line[terminating_space - 1]) && terminating_space > 0) - terminating_space--; - - std::string value(line, next_equals_sign + 1, - terminating_space - (next_equals_sign + 1)); - data_.insert(std::make_pair(key, std::make_pair(value, false))); - pos = terminating_space; - } - } - return true; -} - -bool ConfigLine::GetValue(const std::string &key, std::string *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - *value = (it->second).first; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, BaseFloat *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!ConvertStringToReal((it->second).first, value)) return false; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, int32 *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!ConvertStringToInteger((it->second).first, value)) return false; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, std::vector *value) { - KALDI_ASSERT(value != NULL); - value->clear(); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!SplitStringToIntegers((it->second).first, ":,", true, value)) { - // KALDI_WARN << "Bad option " << (it->second).first; - return false; - } - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, bool *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if ((it->second).first.size() == 0) return false; - switch (((it->second).first)[0]) { - case 'F': - case 'f': - *value = false; - break; - case 'T': - case 't': - *value = true; - break; - default: - return false; - } - (it->second).second = true; - return true; - } - } - return false; -} - -bool 
ConfigLine::HasUnusedValues() const { - std::map >::const_iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (!(it->second).second) return true; - } - return false; -} - -std::string ConfigLine::UnusedValues() const { - std::string unused_str; - std::map >::const_iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (!(it->second).second) { - if (unused_str == "") - unused_str = it->first + "=" + (it->second).first; - else - unused_str += " " + it->first + "=" + (it->second).first; - } - } - return unused_str; -} - -// This is like ExpectToken but for two tokens, and it -// will either accept token1 and then token2, or just token2. -// This is useful in Read functions where the first token -// may already have been consumed. -// void ExpectOneOrTwoTokens(std::istream &is, bool binary, -// const std::string &token1, -// const std::string &token2) { -// KALDI_ASSERT(token1 != token2); -// std::string temp; -// ReadToken(is, binary, &temp); -// if (temp == token1) { -// ExpectToken(is, binary, token2); -// } else { -// if (temp != token2) { -// KALDI_ERR << "Expecting token " << token1 << " or " << token2 -// << " but got " << temp; -// } -// } -// } - -bool IsValidName(const std::string &name) { - if (name.size() == 0) return false; - for (size_t i = 0; i < name.size(); i++) { - if (i == 0 && !isalpha(name[i]) && name[i] != '_') return false; - if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-' && name[i] != '.') - return false; - } - return true; -} - -void ReadConfigLines(std::istream &is, std::vector *lines) { - KALDI_ASSERT(lines != NULL); - std::string line; - while (std::getline(is, line)) { - if (line.size() == 0) continue; - size_t start = line.find_first_not_of(" \t"); - size_t end = line.find_first_of('#'); - if (start == std::string::npos || start == end) continue; - end = line.find_last_not_of(" \t", end - 1); - KALDI_ASSERT(end >= start); - lines->push_back(line.substr(start, end - start + 1)); - } -} - -void ParseConfigLines(const std::vector &lines, - std::vector *config_lines) { - config_lines->resize(lines.size()); - for (size_t i = 0; i < lines.size(); i++) { - bool ret = (*config_lines)[i].ParseLine(lines[i]); - if (!ret) { - KALDI_ERR << "Error parsing config line: " << lines[i]; - } - } -} - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/text-utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/text-utils.h deleted file mode 100644 index bc7763c4aff38214d97cbeda3b29c8717dd65318..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/kaldi/util/text-utils.h +++ /dev/null @@ -1,264 +0,0 @@ -// util/text-utils.h - -// Copyright 2009-2011 Saarland University; Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef KALDI_UTIL_TEXT_UTILS_H_ -#define KALDI_UTIL_TEXT_UTILS_H_ - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "base/kaldi-common.h" - -namespace kaldi { - -/// Split a string using any of the single character delimiters. -/// If omit_empty_strings == true, the output will contain any -/// nonempty strings after splitting on any of the -/// characters in the delimiter. If omit_empty_strings == false, -/// the output will contain n+1 strings if there are n characters -/// in the set "delim" within the input string. In this case -/// the empty string is split to a single empty string. -void SplitStringToVector(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out); - -/// Joins the elements of a vector of strings into a single string using -/// "delim" as the delimiter. If omit_empty_strings == true, any empty strings -/// in the vector are skipped. A vector of empty strings results in an empty -/// string on the output. -void JoinVectorToString(const std::vector &vec_in, - const char *delim, bool omit_empty_strings, - std::string *str_out); - -/** - \brief Split a string (e.g. 1:2:3) into a vector of integers. - - \param [in] delim String containing a list of characters, any of which - is allowed as a delimiter. - \param [in] omit_empty_strings If true, empty strings between delimiters are - allowed and will not produce an output integer; if false, - instances of characters in 'delim' that are consecutive or - at the start or end of the string would be an error. - You'll normally want this to be true if 'delim' consists - of spaces, and false otherwise. - \param [out] out The output list of integers. -*/ -template -bool SplitStringToIntegers(const std::string &full, const char *delim, - bool omit_empty_strings, // typically false [but - // should probably be true - // if "delim" is spaces]. - std::vector *out) { - KALDI_ASSERT(out != NULL); - KALDI_ASSERT_IS_INTEGER_TYPE(I); - if (*(full.c_str()) == '\0') { - out->clear(); - return true; - } - std::vector split; - SplitStringToVector(full, delim, omit_empty_strings, &split); - out->resize(split.size()); - for (size_t i = 0; i < split.size(); i++) { - const char *this_str = split[i].c_str(); - char *end = NULL; - int64 j = 0; - j = KALDI_STRTOLL(this_str, &end); - if (end == this_str || *end != '\0') { - out->clear(); - return false; - } else { - I jI = static_cast(j); - if (static_cast(jI) != j) { - // output type cannot fit this integer. - out->clear(); - return false; - } - (*out)[i] = jI; - } - } - return true; -} - -// This is defined for F = float and double. -template -bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, // typically false - std::vector *out); - -/// Converts a string into an integer via strtoll and returns false if there was -/// any kind of problem (i.e. the string was not an integer or contained extra -/// non-whitespace junk, or the integer was too large to fit into the type it is -/// being converted into). Only sets *out if everything was OK and it returns -/// true. 
-template -bool ConvertStringToInteger(const std::string &str, Int *out) { - KALDI_ASSERT_IS_INTEGER_TYPE(Int); - const char *this_str = str.c_str(); - char *end = NULL; - errno = 0; - int64 i = KALDI_STRTOLL(this_str, &end); - if (end != this_str) - while (isspace(*end)) end++; - if (end == this_str || *end != '\0' || errno != 0) return false; - Int iInt = static_cast(i); - if (static_cast(iInt) != i || - (i < 0 && !std::numeric_limits::is_signed)) { - return false; - } - *out = iInt; - return true; -} - -/// ConvertStringToReal converts a string into either float or double -/// and returns false if there was any kind of problem (i.e. the string -/// was not a floating point number or contained extra non-whitespace junk). -/// Be careful- this function will successfully read inf's or nan's. -template -bool ConvertStringToReal(const std::string &str, T *out); - -/// Removes the beginning and trailing whitespaces from a string -void Trim(std::string *str); - -/// Removes leading and trailing white space from the string, then splits on the -/// first section of whitespace found (if present), putting the part before the -/// whitespace in "first" and the rest in "rest". If there is no such space, -/// everything that remains after removing leading and trailing whitespace goes -/// in "first". -void SplitStringOnFirstSpace(const std::string &line, std::string *first, - std::string *rest); - -/// Returns true if "token" is nonempty, and all characters are -/// printable and whitespace-free. -bool IsToken(const std::string &token); - -/// Returns true if "line" is free of \n characters and unprintable -/// characters, and does not contain leading or trailing whitespace. -bool IsLine(const std::string &line); - -/** - This function returns true when two text strings are approximately equal, and - false when they are not. The definition of 'equal' is normal string - equality, except that two substrings like "0.31134" and "0.311341" would be - considered equal. 'decimal_places_tolerance' controls how many digits after - the '.' have to match up. - E.g. StringsApproxEqual("hello 0.23 there", "hello 0.24 there", 2) would - return false because there is a difference in the 2nd decimal, but with - an argument of 1 it would return true. - */ -bool StringsApproxEqual(const std::string &a, const std::string &b, - int32 decimal_places_check = 2); - -/** - This class is responsible for parsing input like - hi-there xx=yyy a=b c empty= f-oo=Append(bar, sss) ba_z=123 bing='a b c' - baz="a b c d='a b' e" and giving you access to the fields, in this case - - FirstToken() == "hi-there", and key->value pairs: - - xx->yyy, a->"b c", empty->"", f-oo->"Append(bar, sss)", ba_z->"123", - bing->"a b c", baz->"a b c d='a b' e" - - The first token is optional, if the line started with a key-value pair then - FirstValue() will be empty. - - Note: it can parse value fields with space inside them only if they are free - of the '=' character. If values are going to contain the '=' character, you - need to quote them with either single or double quotes. - - Key values may contain -_a-zA-Z0-9, but must begin with a-zA-Z_. - */ -class ConfigLine { - public: - // Tries to parse the line as a config-file line. Returns false - // if it could not for some reason, e.g. parsing failure. In most cases - // prints no warnings; the user should do this. Does not expect comments. - bool ParseLine(const std::string &line); - - // the GetValue functions are overloaded for various types. 
They return true - // if the key exists with value that can be converted to that type, and false - // otherwise. They also mark the key-value pair as having been read. It is - // not an error to read values twice. - bool GetValue(const std::string &key, std::string *value); - bool GetValue(const std::string &key, BaseFloat *value); - bool GetValue(const std::string &key, int32 *value); - // Values may be separated by ":" or by ",". - bool GetValue(const std::string &key, std::vector *value); - bool GetValue(const std::string &key, bool *value); - - bool HasUnusedValues() const; - /// returns e.g. foo=bar xxx=yyy if foo and xxx were not consumed by one - /// of the GetValue() functions. - std::string UnusedValues() const; - - const std::string &FirstToken() const { return first_token_; } - - const std::string WholeLine() { return whole_line_; } - // use default assignment operator and copy constructor. - private: - std::string whole_line_; - // the first token of the line, e.g. if line is - // foo-bar baz=bing - // then first_token_ would be "foo-bar". - std::string first_token_; - - // data_ maps from key to (value, is-this-value-consumed?). - std::map > data_; -}; - -/// This function is like ExpectToken but for two tokens, and it will either -/// accept token1 and then token2, or just token2. This is useful in Read -/// functions where the first token may already have been consumed. -void ExpectOneOrTwoTokens(std::istream &is, bool binary, - const std::string &token1, const std::string &token2); - -/** - This function reads in a config file and *appends* its contents to a vector - of lines; it is responsible for removing comments (anything after '#') and - stripping out any lines that contain only whitespace after comment removal. - */ -void ReadConfigLines(std::istream &is, std::vector *lines); - -/** - This function converts config-lines from a simple sequence of strings - as output by ReadConfigLines(), into a sequence of first-tokens and - name-value pairs. The general format is: - "command-type bar=baz xx=yyy" - etc., although there are subtleties as to what exactly is allowed, see - documentation for class ConfigLine for details. - This function will die if there was a parsing failure. - */ -void ParseConfigLines(const std::vector &lines, - std::vector *config_lines); - -/// Returns true if 'name' would be a valid name for a component or node in a -/// nnet3Nnet. This is a nonempty string beginning with A-Za-z_, and containing -/// only -/// '-', '_', '.', A-Z, a-z, or 0-9. 
-bool IsValidName(const std::string &name); - -} // namespace kaldi - -#endif // KALDI_UTIL_TEXT_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/patch/CPPLINT.cfg b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/patch/CPPLINT.cfg deleted file mode 100644 index 51ff339c18435a6c3a3be03131080d7b8ab8de86..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/patch/CPPLINT.cfg +++ /dev/null @@ -1 +0,0 @@ -exclude_files=.* diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/patch/openfst/src/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/patch/openfst/src/CMakeLists.txt deleted file mode 100644 index 04051ef5ae46c04a40c1ffccc98c37fa594ad13e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/patch/openfst/src/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ - -#-DHAVE_CONFIG_H -I./../include -fno-exceptions -funsigned-char -std=c++11 -MT symbol-table.lo -MD -MP -MF .deps/symbol-table.Tpo -c symbol-table.cc -fno-common -DPIC -o .libs/symbol-table.o - -include_directories(./include/) -install(DIRECTORY include/ DESTINATION include/ - FILES_MATCHING PATTERN "*.h") - -add_subdirectory(lib) - -if(HAVE_SCRIPT) - add_subdirectory(script) -endif(HAVE_SCRIPT) - -if(HAVE_BIN) - add_subdirectory(bin) -endif(HAVE_BIN) - -add_subdirectory(extensions) - -if(BUILD_TESTING) - enable_testing() - add_subdirectory(test) -endif(BUILD_TESTING) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/patch/openfst/src/extensions/special/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/patch/openfst/src/extensions/special/CMakeLists.txt deleted file mode 100644 index 9c71b750a72ffe3c2dafde657273361c3dbae409..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/patch/openfst/src/extensions/special/CMakeLists.txt +++ /dev/null @@ -1,67 +0,0 @@ -file(GLOB HEADER_FILES ../../include/fst/extensions/special/*.h) -message(STATUS "${HEADER_FILES}") - -if(HAVE_BIN) - add_executable(fstspecial-bin - ../../bin/fstconvert.cc - ../../bin/fstconvert-main.cc - phi-fst.cc - rho-fst.cc - sigma-fst.cc - ) - - set_target_properties(fstspecial-bin PROPERTIES - FOLDER special/bin - OUTPUT_NAME fstspecial - ) - - target_link_libraries(fstspecial-bin - fstscript - fst - ${CMAKE_DL_LIBS} - ) -endif(HAVE_BIN) - - -add_library(fstspecial - phi-fst.cc - rho-fst.cc - sigma-fst.cc - ${HEADER_FILES} -) - -set_target_properties(fstspecial PROPERTIES - SOVERSION "${SOVERSION}" - FOLDER special -) -target_link_libraries(fstspecial - fst -) - -set(FST_SPECIAL_INSTALL_TARGETS fstspecial) -if(HAVE_BIN) - list(APPEND FST_SPECIAL_INSTALL_TARGETS fstspecial-bin) -endif() - -install(TARGETS ${FST_SPECIAL_INSTALL_TARGETS} - LIBRARY DESTINATION lib - RUNTIME DESTINATION bin - ARCHIVE DESTINATION lib -) - -function (add_module _name) - add_library(${ARGV}) - if (TARGET ${_name}) - target_link_libraries(${_name} fst) - set_target_properties(${_name} - PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true - FOLDER special/modules - ) - endif() - - install(TARGETS ${_name} LIBRARY DESTINATION lib/fst) -endfunction() - -add_module(phi-fst MODULE phi-fst.cc) -add_module(rho-fst MODULE rho-fst.cc) -add_module(sigma-fst MODULE sigma-fst.cc) diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/patch/openfst/src/include/fst/flags.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/patch/openfst/src/include/fst/flags.h deleted file mode 100644 index b5ec8ff7416774a0612ae0fe7e008a630b289dd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/patch/openfst/src/include/fst/flags.h +++ /dev/null @@ -1,228 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// See www.openfst.org for extensive documentation on this weighted -// finite-state transducer library. -// -// Google-style flag handling declarations and inline definitions. - -#ifndef FST_LIB_FLAGS_H_ -#define FST_LIB_FLAGS_H_ - -#include - -#include -#include -#include -#include -#include - -#include -#include - -#include "gflags/gflags.h" -#include "glog/logging.h" - -using std::string; - -// FLAGS USAGE: -// -// Definition example: -// -// DEFINE_int32(length, 0, "length"); -// -// This defines variable FLAGS_length, initialized to 0. -// -// Declaration example: -// -// DECLARE_int32(length); -// -// SET_FLAGS() can be used to set flags from the command line -// using, for example, '--length=2'. -// -// ShowUsage() can be used to print out command and flag usage. - -// #define DECLARE_bool(name) extern bool FLAGS_ ## name -// #define DECLARE_string(name) extern string FLAGS_ ## name -// #define DECLARE_int32(name) extern int32 FLAGS_ ## name -// #define DECLARE_int64(name) extern int64 FLAGS_ ## name -// #define DECLARE_double(name) extern double FLAGS_ ## name - -template -struct FlagDescription { - FlagDescription(T *addr, const char *doc, const char *type, - const char *file, const T val) - : address(addr), - doc_string(doc), - type_name(type), - file_name(file), - default_value(val) {} - - T *address; - const char *doc_string; - const char *type_name; - const char *file_name; - const T default_value; -}; - -template -class FlagRegister { - public: - static FlagRegister *GetRegister() { - static auto reg = new FlagRegister; - return reg; - } - - const FlagDescription &GetFlagDescription(const string &name) const { - fst::MutexLock l(&flag_lock_); - auto it = flag_table_.find(name); - return it != flag_table_.end() ? 
it->second : 0; - } - - void SetDescription(const string &name, - const FlagDescription &desc) { - fst::MutexLock l(&flag_lock_); - flag_table_.insert(make_pair(name, desc)); - } - - bool SetFlag(const string &val, bool *address) const { - if (val == "true" || val == "1" || val.empty()) { - *address = true; - return true; - } else if (val == "false" || val == "0") { - *address = false; - return true; - } - else { - return false; - } - } - - bool SetFlag(const string &val, string *address) const { - *address = val; - return true; - } - - bool SetFlag(const string &val, int32 *address) const { - char *p = 0; - *address = strtol(val.c_str(), &p, 0); - return !val.empty() && *p == '\0'; - } - - bool SetFlag(const string &val, int64 *address) const { - char *p = 0; - *address = strtoll(val.c_str(), &p, 0); - return !val.empty() && *p == '\0'; - } - - bool SetFlag(const string &val, double *address) const { - char *p = 0; - *address = strtod(val.c_str(), &p); - return !val.empty() && *p == '\0'; - } - - bool SetFlag(const string &arg, const string &val) const { - for (typename std::map< string, FlagDescription >::const_iterator it = - flag_table_.begin(); - it != flag_table_.end(); - ++it) { - const string &name = it->first; - const FlagDescription &desc = it->second; - if (arg == name) - return SetFlag(val, desc.address); - } - return false; - } - - void GetUsage(std::set> *usage_set) const { - for (auto it = flag_table_.begin(); it != flag_table_.end(); ++it) { - const string &name = it->first; - const FlagDescription &desc = it->second; - string usage = " --" + name; - usage += ": type = "; - usage += desc.type_name; - usage += ", default = "; - usage += GetDefault(desc.default_value) + "\n "; - usage += desc.doc_string; - usage_set->insert(make_pair(desc.file_name, usage)); - } - } - - private: - string GetDefault(bool default_value) const { - return default_value ? "true" : "false"; - } - - string GetDefault(const string &default_value) const { - return "\"" + default_value + "\""; - } - - template - string GetDefault(const V &default_value) const { - std::ostringstream strm; - strm << default_value; - return strm.str(); - } - - mutable fst::Mutex flag_lock_; // Multithreading lock. - std::map> flag_table_; -}; - -template -class FlagRegisterer { - public: - FlagRegisterer(const string &name, const FlagDescription &desc) { - auto registr = FlagRegister::GetRegister(); - registr->SetDescription(name, desc); - } - - private: - FlagRegisterer(const FlagRegisterer &) = delete; - FlagRegisterer &operator=(const FlagRegisterer &) = delete; -}; - - -#define DEFINE_VAR(type, name, value, doc) \ - type FLAGS_ ## name = value; \ - static FlagRegisterer \ - name ## _flags_registerer(#name, FlagDescription(&FLAGS_ ## name, \ - doc, \ - #type, \ - __FILE__, \ - value)) - -// #define DEFINE_bool(name, value, doc) DEFINE_VAR(bool, name, value, doc) -// #define DEFINE_string(name, value, doc) \ -// DEFINE_VAR(string, name, value, doc) -// #define DEFINE_int32(name, value, doc) DEFINE_VAR(int32, name, value, doc) -// #define DEFINE_int64(name, value, doc) DEFINE_VAR(int64, name, value, doc) -// #define DEFINE_double(name, value, doc) DEFINE_VAR(double, name, value, doc) - - -// Temporary directory. 
-DECLARE_string(tmpdir); - -void SetFlags(const char *usage, int *argc, char ***argv, bool remove_flags, - const char *src = ""); - -#define SET_FLAGS(usage, argc, argv, rmflags) \ -gflags::ParseCommandLineFlags(argc, argv, true) -// SetFlags(usage, argc, argv, rmflags, __FILE__) - -// Deprecated; for backward compatibility. -inline void InitFst(const char *usage, int *argc, char ***argv, bool rmflags) { - return SetFlags(usage, argc, argv, rmflags); -} - -void ShowUsage(bool long_usage = true); - -#endif // FST_LIB_FLAGS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/patch/openfst/src/include/fst/log.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/patch/openfst/src/include/fst/log.h deleted file mode 100644 index bf041c58ebfab73d03bb14adf28c7c7916a2217d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/patch/openfst/src/include/fst/log.h +++ /dev/null @@ -1,82 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// See www.openfst.org for extensive documentation on this weighted -// finite-state transducer library. -// -// Google-style logging declarations and inline definitions. - -#ifndef FST_LIB_LOG_H_ -#define FST_LIB_LOG_H_ - -#include -#include -#include - -#include -#include - -using std::string; - -DECLARE_int32(v); - -class LogMessage { - public: - LogMessage(const string &type) : fatal_(type == "FATAL") { - std::cerr << type << ": "; - } - ~LogMessage() { - std::cerr << std::endl; - if(fatal_) - exit(1); - } - std::ostream &stream() { return std::cerr; } - - private: - bool fatal_; -}; - -// #define LOG(type) LogMessage(#type).stream() -// #define VLOG(level) if ((level) <= FLAGS_v) LOG(INFO) - -// Checks -inline void FstCheck(bool x, const char* expr, - const char *file, int line) { - if (!x) { - LOG(FATAL) << "Check failed: \"" << expr - << "\" file: " << file - << " line: " << line; - } -} - -// #define CHECK(x) FstCheck(static_cast(x), #x, __FILE__, __LINE__) -// #define CHECK_EQ(x, y) CHECK((x) == (y)) -// #define CHECK_LT(x, y) CHECK((x) < (y)) -// #define CHECK_GT(x, y) CHECK((x) > (y)) -// #define CHECK_LE(x, y) CHECK((x) <= (y)) -// #define CHECK_GE(x, y) CHECK((x) >= (y)) -// #define CHECK_NE(x, y) CHECK((x) != (y)) - -// Debug checks -// #define DCHECK(x) assert(x) -// #define DCHECK_EQ(x, y) DCHECK((x) == (y)) -// #define DCHECK_LT(x, y) DCHECK((x) < (y)) -// #define DCHECK_GT(x, y) DCHECK((x) > (y)) -// #define DCHECK_LE(x, y) DCHECK((x) <= (y)) -// #define DCHECK_GE(x, y) DCHECK((x) >= (y)) -// #define DCHECK_NE(x, y) DCHECK((x) != (y)) - - -// Ports -#define ATTRIBUTE_DEPRECATED __attribute__((deprecated)) - -#endif // FST_LIB_LOG_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/post_processor/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/post_processor/CMakeLists.txt deleted file mode 100644 index 
6113bbc26eb8fe35e4e17ffd1cab382f0fb0f1f8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/post_processor/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -add_library(post_processor STATIC - post_processor.cc -) -target_link_libraries(post_processor PUBLIC utils) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/post_processor/post_processor.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/post_processor/post_processor.cc deleted file mode 100644 index 315f62d34cbc441ecbaf7c07667eb35ee61c2c8d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/post_processor/post_processor.cc +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License - -#include "post_processor/post_processor.h" - -#include -#include - -#include "utils/string.h" - -namespace wenet { - -std::string PostProcessor::ProcessSpace(const std::string& str) { - std::string result = str; - // 1. remove ' ' if needed - // only spaces between mandarin words need to be removed, please note that - // if str contains '_', we assume that the decoding type must be - // `CtcPrefixBeamSearch` and this branch will do nothing since str must be - // obtained via "".join() (in function `AsrDecoder::UpdateResult()`) - if (opts_.language_type == kMandarinEnglish && !str.empty()) { - result.clear(); - // split str by ' ' - std::vector words; - std::stringstream ss(str); - std::string tmp; - while (ss >> tmp) { - words.push_back(tmp); - } - // check english word - bool is_englishword_prev = false; - bool is_englishword_now = false; - for (std::string& w : words) { - is_englishword_now = CheckEnglishWord(w); - if (is_englishword_prev && is_englishword_now) { - result += (' ' + w); - } else { - result += (w); - } - is_englishword_prev = is_englishword_now; - } - } - // 2. 
replace '_' with ' ' - // this should be done for all cases (both kMandarinEnglish and kIndoEuropean) - result = ProcessBlank(result, opts_.lowercase); - return result; -} - -std::string PostProcessor::Process(const std::string& str, bool finish) { - std::string result; - result = ProcessSpace(str); - // TODO(xcsong): do itn/punctuation if finish == true - return result; -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/post_processor/post_processor.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/post_processor/post_processor.h deleted file mode 100644 index 54597845ebc88ad22e1244d2e693e2088cff6d21..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/post_processor/post_processor.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License - -#ifndef POST_PROCESSOR_POST_PROCESSOR_H_ -#define POST_PROCESSOR_POST_PROCESSOR_H_ - -#include -#include -#include - -#include "utils/utils.h" - -namespace wenet { - -enum LanguageType { - // spaces between **mandarin words** should be removed. - // cases of processing spaces with mandarin-only, english-only - // and mandarin-english code-switch can be found in post_processor_test.cc - kMandarinEnglish = 0x00, - // spaces should be kept for most of the - // Indo-European languages (i.e., deutsch or english-deutsch code-switch). - // cases of those languages can be found in post_processor_test.cc - kIndoEuropean = 0x01 -}; - -struct PostProcessOptions { - // space options - // The decoded result may contain spaces (' ' or '_'), - // we will process those spaces according to language_type. 
More details can - // be found in - // https://github.com/wenet-e2e/wenet/issues/583#issuecomment-907994058 - LanguageType language_type = kMandarinEnglish; - // whether lowercase letters are required - bool lowercase = true; -}; - -// TODO(xcsong): add itn/punctuation related resource -struct PostProcessResource {}; - -// Post Processor -class PostProcessor { - public: - explicit PostProcessor(PostProcessOptions&& opts) : opts_(std::move(opts)) {} - explicit PostProcessor(const PostProcessOptions& opts) : opts_(opts) {} - // call other functions to do post processing - std::string Process(const std::string& str, bool finish); - // process spaces according to configurations - std::string ProcessSpace(const std::string& str); - // TODO(xcsong): add itn/punctuation - // void InverseTN(const std::string& str); - // void Punctuate(const std::string& str); - - private: - const PostProcessOptions opts_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(PostProcessor); -}; - -} // namespace wenet - -#endif // POST_PROCESSOR_POST_PROCESSOR_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/test/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/test/CMakeLists.txt deleted file mode 100644 index 145654105350e91a5f9121b47197f5fc60663f5c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/test/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -link_libraries(gtest_main gmock) - -add_executable(utils_test utils_test.cc) -target_link_libraries(utils_test PUBLIC utils) -add_test(UTILS_TEST utils_test) - -add_executable(ctc_prefix_beam_search_test ctc_prefix_beam_search_test.cc) -target_link_libraries(ctc_prefix_beam_search_test PUBLIC decoder) -add_test(CTC_PREFIX_BEAM_SEARCH_TEST ctc_prefix_beam_search_test) - -add_executable(post_processor_test post_processor_test.cc) -target_link_libraries(post_processor_test PUBLIC post_processor) -add_test(POST_PROCESSOR_TEST post_processor_test) - - -add_executable(feature_pipeline_test feature_pipeline_test.cc) -target_link_libraries(feature_pipeline_test PUBLIC frontend) -add_test(FEATURE_PIPELINE_TEST feature_pipeline_test) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/test/ctc_prefix_beam_search_test.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/test/ctc_prefix_beam_search_test.cc deleted file mode 100644 index d8f3b65693b934beb33f3a770795f0b6e7ce3456..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/test/ctc_prefix_beam_search_test.cc +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include "decoder/ctc_prefix_beam_search.h" - -#include -#include - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -#include "utils/utils.h" - -TEST(CtcPrefixBeamSearchTest, CtcPrefixBeamSearchLogicTest) { - using ::testing::ElementsAre; - // See https://robin1001.github.io/2020/12/11/ctc-search for the - // graph demonstration of the data - std::vector> data = { - {0.25, 0.40, 0.35}, {0.40, 0.35, 0.25}, {0.10, 0.50, 0.40}}; - // Apply log - for (int i = 0; i < data.size(); i++) { - for (int j = 0; j < data[i].size(); j++) { - data[i][j] = std::log(data[i][j]); - } - } - wenet::CtcPrefixBeamSearchOptions option; - option.first_beam_size = 3; - option.second_beam_size = 3; - wenet::CtcPrefixBeamSearch prefix_beam_search(option); - prefix_beam_search.Search(data); - /* Test case info - | top k | result index | prefix score | viterbi score | timestamp | - |-------|--------------|--------------|---------------|-----------| - | top 1 | [2, 1] | 0.2185 | 0.07 | [0, 2] | - | top 2 | [1, 2] | 0.1550 | 0.064 | [0, 2] | - | top 3 | [1] | 0.1525 | 0.07 | [2] | - */ - const std::vector>& result = prefix_beam_search.Outputs(); - EXPECT_EQ(result.size(), 3); - ASSERT_THAT(result[0], ElementsAre(2, 1)); - ASSERT_THAT(result[1], ElementsAre(1, 2)); - ASSERT_THAT(result[2], ElementsAre(1)); - - const std::vector& likelihood = prefix_beam_search.Likelihood(); - EXPECT_EQ(likelihood.size(), 3); - EXPECT_FLOAT_EQ(std::exp(likelihood[0]), 0.2185); - EXPECT_FLOAT_EQ(std::exp(likelihood[1]), 0.1550); - EXPECT_FLOAT_EQ(std::exp(likelihood[2]), 0.1525); - - const std::vector& viterbi_likelihood = - prefix_beam_search.viterbi_likelihood(); - EXPECT_EQ(viterbi_likelihood.size(), 3); - EXPECT_FLOAT_EQ(std::exp(viterbi_likelihood[0]), 0.07); - EXPECT_FLOAT_EQ(std::exp(viterbi_likelihood[1]), 0.064); - EXPECT_FLOAT_EQ(std::exp(viterbi_likelihood[2]), 0.07); - - const std::vector>& times = prefix_beam_search.Times(); - EXPECT_EQ(times.size(), 3); - ASSERT_THAT(times[0], ElementsAre(0, 2)); - ASSERT_THAT(times[1], ElementsAre(0, 2)); - ASSERT_THAT(times[2], ElementsAre(2)); -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/test/feature_pipeline_test.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/test/feature_pipeline_test.cc deleted file mode 100644 index 244ec0735b6086211b476e8d97569e1ee5959bc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/test/feature_pipeline_test.cc +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) 2022 Roney -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include - -#include "frontend/feature_pipeline.h" -#include "utils/blocking_queue.h" - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -void pushQueue(const std::shared_ptr>& que, - std::vector vec) { - que->Push(vec); -} - -void popQueue(const std::shared_ptr>& que, int num, - int back_data) { - auto pop_data = que->Pop(num); - ASSERT_EQ(pop_data[num - 1], back_data); -} - -TEST(FeaturePipelineTest, BlockingQueueTest) { - auto capacity_queue = std::make_shared>(2); - std::vector test_data{1, 2, 3, 4, 5}; - std::thread push_thread(&pushQueue, capacity_queue, test_data); - ASSERT_EQ(capacity_queue->Pop(), 1); - ASSERT_LE(capacity_queue->Size(), 2); // capacity_queue: 2 or 2,3 - auto pop_data = capacity_queue->Pop(3); // 2,3,4 num > capacity - ASSERT_EQ(pop_data.size(), 3); - ASSERT_EQ(pop_data[2], 4); - push_thread.join(); - ASSERT_EQ(capacity_queue->Size(), 1); // capacity_queue:5 - - std::thread pop_thread(&popQueue, capacity_queue, 3, 0); // num > capacity - capacity_queue->Push(9); // capacity_queue:5,9 - capacity_queue->Push(0); // capacity_queue:5,9,0 - pop_thread.join(); // capacity_queue: - ASSERT_EQ(capacity_queue->Size(), 0); - - pop_data = capacity_queue->Pop(0); - ASSERT_TRUE(pop_data.empty()); -} - -TEST(FeaturePipelineTest, PipelineTest) { - wenet::FeaturePipelineConfig config(80, 8000); - wenet::FeaturePipeline feature_pipeline(config); - int audio_len = 8 * 55; // audio len 55ms,4 frames - std::vector pcm(audio_len, 0); - feature_pipeline.AcceptWaveform(pcm.data(), audio_len); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 4); - - std::vector> out_feats; - auto b = feature_pipeline.Read(2, &out_feats); - ASSERT_TRUE(b); - ASSERT_EQ(out_feats.size(), 2); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 2); - - std::vector out_feat; - b = feature_pipeline.ReadOne(&out_feat); - ASSERT_TRUE(b); - ASSERT_FALSE(out_feat.empty()); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 1); - - feature_pipeline.set_input_finished(); - b = feature_pipeline.Read(2, &out_feats); - ASSERT_FALSE(b); - ASSERT_EQ(out_feats.size(), 1); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 0); - - feature_pipeline.AcceptWaveform(pcm.data(), audio_len); - feature_pipeline.Read(2, &out_feats); - feature_pipeline.Reset(); - feature_pipeline.set_input_finished(); - b = feature_pipeline.Read(2, &out_feats); - ASSERT_FALSE(b); - ASSERT_EQ(out_feats.size(), 0); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 0); -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/test/post_processor_test.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/test/post_processor_test.cc deleted file mode 100644 index fa11fa29231032d62389a93fd00b0ec782bf8a3b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/test/post_processor_test.cc +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License - -#include "post_processor/post_processor.h" - -#include -#include - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -#include "utils/utils.h" - -TEST(PostProcessorTest, ProcessSpacekMandarinEnglishTest) { - wenet::PostProcessOptions opts_lowercase; - wenet::PostProcessor post_processor_lowercase(opts_lowercase); - - wenet::PostProcessOptions opts_uppercase; - opts_uppercase.lowercase = false; - wenet::PostProcessor post_processor_uppercase(opts_uppercase); - - std::vector input = { - // modeling unit: mandarin character - // decode type: CtcPrefixBeamSearch, "".join() - "震东好帅", - // modeling unit: mandarin word - // decode type: CtcWfstBeamSearch, " ".join() - " 吴迪 也 好帅", - // modeling unit: english wordpiece - // decode type: CtcPrefixBeamSearch, "".join() - "▁binbin▁is▁also▁handsome", - // modeling unit: english word - // decode type: CtcWfstBeamSearch, " ".join() - " life is short i use wenet", - // modeling unit: mandarin character + english wordpiece - // decode type: CtcPrefixBeamSearch, "".join() - "超哥▁is▁the▁most▁handsome", - // modeling unit: mandarin word + english word - // decode type: CtcWfstBeamSearch, " ".join() - " 人生 苦短 i use wenet", - }; - - std::vector result_lowercase = { - "震东好帅", - "吴迪也好帅", - "binbin is also handsome", - "life is short i use wenet", - "超哥 is the most handsome", - "人生苦短i use wenet", - }; - - std::vector result_uppercase = { - "震东好帅", - "吴迪也好帅", - "BINBIN IS ALSO HANDSOME", - "LIFE IS SHORT I USE WENET", - "超哥 IS THE MOST HANDSOME", - "人生苦短I USE WENET", - }; - - for (size_t i = 0; i < input.size(); ++i) { - EXPECT_EQ(post_processor_lowercase.ProcessSpace(input[i]), - result_lowercase[i]); - EXPECT_EQ(post_processor_uppercase.ProcessSpace(input[i]), - result_uppercase[i]); - } -} - -TEST(PostProcessorTest, ProcessSpacekIndoEuropeanTest) { - wenet::PostProcessOptions opts_lowercase; - opts_lowercase.language_type = wenet::kIndoEuropean; - wenet::PostProcessor post_processor_lowercase(opts_lowercase); - - wenet::PostProcessOptions opts_uppercase; - opts_uppercase.language_type = wenet::kIndoEuropean; - opts_uppercase.lowercase = false; - wenet::PostProcessor post_processor_uppercase(opts_uppercase); - - std::vector input = { - // modeling unit: wordpiece - // decode type: CtcPrefixBeamSearch, "".join() - "▁zhendong▁ist▁so▁schön", - // modeling unit: word - // decode type: CtcWfstBeamSearch, " ".join() - " zhendong ist so schön"}; - - std::vector result_lowercase = {"zhendong ist so schön", - "zhendong ist so schön"}; - - std::vector result_uppercase = {"ZHENDONG IST SO SCHÖN", - "ZHENDONG IST SO SCHÖN"}; - - for (size_t i = 0; i < input.size(); ++i) { - EXPECT_EQ(post_processor_lowercase.ProcessSpace(input[i]), - result_lowercase[i]); - EXPECT_EQ(post_processor_uppercase.ProcessSpace(input[i]), - result_uppercase[i]); - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/test/utils_test.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/test/utils_test.cc deleted file mode 100644 index 6b2bbac25e000ce854d5e55a50cb51109d62d758..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/test/utils_test.cc +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "utils/utils.h" - -#include - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -TEST(UtilsTest, TopKTest) { - using ::testing::ElementsAre; - using ::testing::FloatNear; - using ::testing::Pointwise; - std::vector data = {1, 3, 5, 7, 9, 2, 4, 6, 8, 10}; - std::vector values; - std::vector indices; - wenet::TopK(data, 3, &values, &indices); - EXPECT_THAT(values, Pointwise(FloatNear(1e-8), {10, 9, 8})); - ASSERT_THAT(indices, ElementsAre(9, 4, 8)); -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/toolchains/aarch64-linux-gnu.toolchain.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/toolchains/aarch64-linux-gnu.toolchain.cmake deleted file mode 100644 index 9ad37cba9eb6fa58aa194ece96cf9a5da472a76d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/toolchains/aarch64-linux-gnu.toolchain.cmake +++ /dev/null @@ -1,5 +0,0 @@ -set(CMAKE_SYSTEM_NAME Linux) -SET (CMAKE_SYSTEM_PROCESSOR aarch64) - -set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc) -set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/toolchains/ios.toolchain.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/toolchains/ios.toolchain.cmake deleted file mode 100644 index 2bcb0adf7b07c0c5fd5bf16d1b687050579ba673..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/toolchains/ios.toolchain.cmake +++ /dev/null @@ -1,1014 +0,0 @@ -# This file is part of the ios-cmake project. It was retrieved from -# https://github.com/leetal/ios-cmake.git, which is a fork of -# https://github.com/gerstrong/ios-cmake.git, which is a fork of -# https://github.com/cristeab/ios-cmake.git, which is a fork of -# https://code.google.com/p/ios-cmake/. Which in turn is based off of -# the Platform/Darwin.cmake and Platform/UnixPaths.cmake files which -# are included with CMake 2.8.4 -# -# The ios-cmake project is licensed under the new BSD license. -# -# Copyright (c) 2014, Bogdan Cristea and LTE Engineering Software, -# Kitware, Inc., Insight Software Consortium. All rights reserved. -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# 1. Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -# -# This file is based off of the Platform/Darwin.cmake and -# Platform/UnixPaths.cmake files which are included with CMake 2.8.4 -# It has been altered for iOS development. -# -# Updated by Alex Stewart (alexs.mac@gmail.com) -# -# ***************************************************************************** -# Now maintained by Alexander Widerberg (widerbergaren [at] gmail.com) -# under the BSD-3-Clause license -# https://github.com/leetal/ios-cmake -# ***************************************************************************** -# -# INFORMATION / HELP -# -############################################################################### -# OPTIONS # -############################################################################### -# -# PLATFORM: (default "OS64") -# OS = Build for iPhoneOS. -# OS64 = Build for arm64 iphoneOS. -# OS64COMBINED = Build for arm64 x86_64 iphoneOS + iphoneOS Simulator. Combined into FAT STATIC lib (only supported on 3.14+ of CMake with "-G Xcode" argument in combination with the "cmake --install" CMake build step) -# SIMULATOR = Build for x86 i386 iphoneOS Simulator. -# SIMULATOR64 = Build for x86_64 iphoneOS Simulator. -# SIMULATORARM64 = Build for arm64 iphoneOS Simulator. -# TVOS = Build for arm64 tvOS. -# TVOSCOMBINED = Build for arm64 x86_64 tvOS + tvOS Simulator. Combined into FAT STATIC lib (only supported on 3.14+ of CMake with "-G Xcode" argument in combination with the "cmake --install" CMake build step) -# SIMULATOR_TVOS = Build for x86_64 tvOS Simulator. -# WATCHOS = Build for armv7k arm64_32 for watchOS. -# WATCHOSCOMBINED = Build for armv7k arm64_32 x86_64 watchOS + watchOS Simulator. Combined into FAT STATIC lib (only supported on 3.14+ of CMake with "-G Xcode" argument in combination with the "cmake --install" CMake build step) -# SIMULATOR_WATCHOS = Build for x86_64 for watchOS Simulator. -# MAC = Build for x86_64 macOS. -# MAC_ARM64 = Build for Apple Silicon macOS. -# MAC_CATALYST = Build for x86_64 macOS with Catalyst support (iOS toolchain on macOS). -# Note: The build argument "MACOSX_DEPLOYMENT_TARGET" can be used to control min-version of macOS -# MAC_CATALYST_ARM64 = Build for Apple Silicon macOS with Catalyst support (iOS toolchain on macOS). -# Note: The build argument "MACOSX_DEPLOYMENT_TARGET" can be used to control min-version of macOS -# -# CMAKE_OSX_SYSROOT: Path to the SDK to use. By default this is -# automatically determined from PLATFORM and xcodebuild, but -# can also be manually specified (although this should not be required). -# -# CMAKE_DEVELOPER_ROOT: Path to the Developer directory for the platform -# being compiled for. 
By default this is automatically determined from -# CMAKE_OSX_SYSROOT, but can also be manually specified (although this should -# not be required). -# -# DEPLOYMENT_TARGET: Minimum SDK version to target. Default 2.0 on watchOS and 9.0 on tvOS+iOS -# -# NAMED_LANGUAGE_SUPPORT: -# ON (default) = Will require "enable_language(OBJC) and/or enable_language(OBJCXX)" for full OBJC|OBJCXX support -# OFF = Will embed the OBJC and OBJCXX flags into the CMAKE_C_FLAGS and CMAKE_CXX_FLAGS (legacy behaviour, CMake version < 3.16) -# -# ENABLE_BITCODE: (ON|OFF) Enables or disables bitcode support. Default ON -# -# ENABLE_ARC: (ON|OFF) Enables or disables ARC support. Default ON (ARC enabled by default) -# -# ENABLE_VISIBILITY: (ON|OFF) Enables or disables symbol visibility support. Default OFF (visibility hidden by default) -# -# ENABLE_STRICT_TRY_COMPILE: (ON|OFF) Enables or disables strict try_compile() on all Check* directives (will run linker -# to actually check if linking is possible). Default OFF (will set CMAKE_TRY_COMPILE_TARGET_TYPE to STATIC_LIBRARY) -# -# ARCHS: (armv7 armv7s armv7k arm64 arm64_32 i386 x86_64) If specified, will override the default architectures for the given PLATFORM -# OS = armv7 armv7s arm64 (if applicable) -# OS64 = arm64 (if applicable) -# SIMULATOR = i386 -# SIMULATOR64 = x86_64 -# SIMULATORARM64 = arm64 -# TVOS = arm64 -# SIMULATOR_TVOS = x86_64 (i386 has since long been deprecated) -# WATCHOS = armv7k arm64_32 (if applicable) -# SIMULATOR_WATCHOS = x86_64 (i386 has since long been deprecated) -# MAC = x86_64 -# MAC_ARM64 = arm64 -# MAC_CATALYST = x86_64 -# MAC_CATALYST_ARM64 = arm64 -# -# NOTE: When manually specifying ARCHS, put a semi-colon between the entries. E.g., -DARCHS="armv7;arm64" -# -############################################################################### -# END OPTIONS # -############################################################################### -# -# This toolchain defines the following properties (available via get_property()) for use externally: -# -# PLATFORM: The currently targeted platform. -# XCODE_VERSION: Version number (not including Build version) of Xcode detected. -# SDK_VERSION: Version of SDK being used. -# OSX_ARCHITECTURES: Architectures being compiled for (generated from PLATFORM). -# APPLE_TARGET_TRIPLE: Used by autoconf build systems. NOTE: If "ARCHS" are overridden, this will *NOT* be set! -# -# This toolchain defines the following macros for use externally: -# -# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE XCODE_VARIANT) -# A convenience macro for setting xcode specific properties on targets. -# Available variants are: All, Release, RelWithDebInfo, Debug, MinSizeRel -# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1" "all"). -# -# find_host_package (PROGRAM ARGS) -# A macro used to find executable programs on the host system, not within the -# environment. Thanks to the android-cmake project for providing the -# command. -# - -cmake_minimum_required(VERSION 3.8.0) - -# CMake invokes the toolchain file twice during the first build, but only once during subsequent rebuilds. 
-if(DEFINED ENV{_IOS_TOOLCHAIN_HAS_RUN}) - return() -endif() -set(ENV{_IOS_TOOLCHAIN_HAS_RUN} true) - -# List of supported platform values -list(APPEND _supported_platforms - "OS" "OS64" "OS64COMBINED" "SIMULATOR" "SIMULATOR64" "SIMULATORARM64" - "TVOS" "TVOSCOMBINED" "SIMULATOR_TVOS" - "WATCHOS" "WATCHOSCOMBINED" "SIMULATOR_WATCHOS" - "MAC" "MAC_ARM64" - "MAC_CATALYST" "MAC_CATALYST_ARM64") - -# Cache what generator is used -set(USED_CMAKE_GENERATOR "${CMAKE_GENERATOR}") - -# Check if using a CMake version capable of building combined FAT builds (simulator and target slices combined in one static lib) -if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.14") - set(MODERN_CMAKE YES) -endif() - -# Get the Xcode version being used. -# Problem: CMake runs toolchain files multiple times, but can't read cache variables on some runs. -# Workaround: On first run (in which cache variables are always accessible), set an intermediary environment variable. -# -# NOTE: This pattern is used i many places in this toolchain to speed up checks of all sorts -if(DEFINED XCODE_VERSION_INT) - # Environment variables are always preserved. - set(ENV{_XCODE_VERSION_INT} "${XCODE_VERSION_INT}") -elseif(DEFINED ENV{_XCODE_VERSION_INT}) - set(XCODE_VERSION_INT "$ENV{_XCODE_VERSION_INT}") -elseif(NOT DEFINED XCODE_VERSION_INT) - find_program(XCODEBUILD_EXECUTABLE xcodebuild) - if(NOT XCODEBUILD_EXECUTABLE) - message(FATAL_ERROR "xcodebuild not found. Please install either the standalone commandline tools or Xcode.") - endif() - execute_process(COMMAND ${XCODEBUILD_EXECUTABLE} -version - OUTPUT_VARIABLE XCODE_VERSION_INT - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) - string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION_INT "${XCODE_VERSION_INT}") - string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION_INT "${XCODE_VERSION_INT}") - set(XCODE_VERSION_INT "${XCODE_VERSION_INT}" CACHE INTERNAL "") -endif() - -# Assuming that xcode 12.0 is installed you most probably have ios sdk 14.0 or later installed (tested on Big Sur) -# if you don't set a deployment target it will be set the way you only get 64-bit builds -if(NOT DEFINED DEPLOYMENT_TARGET AND XCODE_VERSION_INT VERSION_GREATER 12.0) - # Temporarily fix the arm64 issues in CMake install-combined by excluding arm64 for simulator builds (needed for Apple Silicon...) - set(CMAKE_XCODE_ATTRIBUTE_EXCLUDED_ARCHS[sdk=iphonesimulator*] "arm64") -endif() - -# Check if the platform variable is set -if(DEFINED PLATFORM) - # Environment variables are always preserved. - set(ENV{_PLATFORM} "${PLATFORM}") -elseif(DEFINED ENV{_PLATFORM}) - set(PLATFORM "$ENV{_PLATFORM}") -elseif(NOT DEFINED PLATFORM) - message(FATAL_ERROR "PLATFORM argument not set. Bailing configure since I don't know what target you want to build for!") -endif () - -if(PLATFORM MATCHES ".*COMBINED" AND NOT CMAKE_GENERATOR MATCHES "Xcode") - message(FATAL_ERROR "The combined builds support requires Xcode to be used as generator via '-G Xcode' command-line argument in CMake") -endif() - -# Safeguard that the platform value is set and is one of the supported values -list(FIND _supported_platforms ${PLATFORM} contains_PLATFORM) -if("${contains_PLATFORM}" EQUAL "-1") - string(REPLACE ";" "\n * " _supported_platforms_formatted "${_supported_platforms}") - message(FATAL_ERROR " Invalid PLATFORM specified! 
Current value: ${PLATFORM}.\n" - " Supported PLATFORM values: \n * ${_supported_platforms_formatted}") -endif() - -# Check if Apple Silicon is supported -if(PLATFORM MATCHES "^(MAC_ARM64)$|^(MAC_CATALYST_ARM64)$" AND ${CMAKE_VERSION} VERSION_LESS "3.19.5") - message(FATAL_ERROR "Apple Silicon builds requires a minimum of CMake 3.19.5") -endif() - -# Touch toolchain variable to suppress "unused variable" warning. -# This happens if CMake is invoked with the same command line the second time. -if(CMAKE_TOOLCHAIN_FILE) -endif() - -# Fix for PThread library not in path -set(CMAKE_THREAD_LIBS_INIT "-lpthread") -set(CMAKE_HAVE_THREADS_LIBRARY 1) -set(CMAKE_USE_WIN32_THREADS_INIT 0) -set(CMAKE_USE_PTHREADS_INIT 1) - -# Specify named language support defaults. -if(NOT DEFINED NAMED_LANGUAGE_SUPPORT AND ${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.16") - set(NAMED_LANGUAGE_SUPPORT ON) - message(STATUS "[DEFAULTS] Using explicit named language support! E.g., enable_language(CXX) is needed in the project files.") -elseif(NOT DEFINED NAMED_LANGUAGE_SUPPORT AND ${CMAKE_VERSION} VERSION_LESS "3.16") - set(NAMED_LANGUAGE_SUPPORT OFF) - message(STATUS "[DEFAULTS] Disabling explicit named language support. Falling back to legacy behaviour.") -elseif(DEFINED NAMED_LANGUAGE_SUPPORT AND ${CMAKE_VERSION} VERSION_LESS "3.16") - message(FATAL_ERROR "CMake named language support for OBJC and OBJCXX was added in CMake 3.16.") -endif() -set(NAMED_LANGUAGE_SUPPORT_INT ${NAMED_LANGUAGE_SUPPORT} CACHE BOOL - "Whether or not to enable explicit named language support" FORCE) - -# Specify minimum version of deployment target. -if(NOT DEFINED DEPLOYMENT_TARGET) - if (PLATFORM MATCHES "WATCHOS") - # Unless specified, SDK version 4.0 is used by default as minimum target version (watchOS). - set(DEPLOYMENT_TARGET "4.0") - elseif(PLATFORM STREQUAL "MAC") - # Unless specified, SDK version 10.13 (High sierra) is used by default as minimum target version (macos). - set(DEPLOYMENT_TARGET "10.13") - elseif(PLATFORM STREQUAL "MAC_ARM64") - # Unless specified, SDK version 11.0 (Big Sur) is used by default as minimum target version (macos on arm). - set(DEPLOYMENT_TARGET "11.0") - elseif(PLATFORM STREQUAL "MAC_CATALYST" OR PLATFORM STREQUAL "MAC_CATALYST_ARM64") - # Unless specified, SDK version 13.0 is used by default as minimum target version (mac catalyst minimum requirement). - set(DEPLOYMENT_TARGET "13.1") - else() - # Unless specified, SDK version 11.0 is used by default as minimum target version (iOS, tvOS). - set(DEPLOYMENT_TARGET "11.0") - endif() - message(STATUS "[DEFAULTS] Using the default min-version since DEPLOYMENT_TARGET not provided!") -elseif(DEFINED DEPLOYMENT_TARGET AND PLATFORM MATCHES "^MAC_CATALYST" AND ${DEPLOYMENT_TARGET} VERSION_LESS "13.1") - message(FATAL_ERROR "Mac Catalyst builds requires a minimum deployment target of 13.1!") -endif() - -# Store the DEPLOYMENT_TARGET in the cache -set(DEPLOYMENT_TARGET "${DEPLOYMENT_TARGET}" CACHE INTERNAL "") - -# Handle the case where we are targeting iOS and a version above 10.3.4 (32-bit support dropped officially) -if(PLATFORM STREQUAL "OS" AND DEPLOYMENT_TARGET VERSION_GREATER_EQUAL 10.3.4) - set(PLATFORM "OS64") - message(STATUS "Targeting minimum SDK version ${DEPLOYMENT_TARGET}. Dropping 32-bit support.") -elseif(PLATFORM STREQUAL "SIMULATOR" AND DEPLOYMENT_TARGET VERSION_GREATER_EQUAL 10.3.4) - set(PLATFORM "SIMULATOR64") - message(STATUS "Targeting minimum SDK version ${DEPLOYMENT_TARGET}. 
Dropping 32-bit support.") -endif() - -set(PLATFORM_INT "${PLATFORM}") - -if(DEFINED ARCHS) - string(REPLACE ";" "-" ARCHS_SPLIT "${ARCHS}") -endif() - -# Determine the platform name and architectures for use in xcodebuild commands -# from the specified PLATFORM_INT name. -if(PLATFORM_INT STREQUAL "OS") - set(SDK_NAME iphoneos) - if(NOT ARCHS) - set(ARCHS armv7 armv7s arm64) - set(APPLE_TARGET_TRIPLE_INT arm-apple-ios${DEPLOYMENT_TARGET}) - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}) - endif() -elseif(PLATFORM_INT STREQUAL "OS64") - set(SDK_NAME iphoneos) - if(NOT ARCHS) - if (XCODE_VERSION_INT VERSION_GREATER 10.0) - set(ARCHS arm64) # FIXME: Add arm64e when Apple have fixed the integration issues with it, libarclite_iphoneos.a is currently missung bitcode markers for example - else() - set(ARCHS arm64) - endif() - set(APPLE_TARGET_TRIPLE_INT aarch64-apple-ios${DEPLOYMENT_TARGET}) - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}) - endif() -elseif(PLATFORM_INT STREQUAL "OS64COMBINED") - set(SDK_NAME iphoneos) - if(MODERN_CMAKE) - if(NOT ARCHS) - if (XCODE_VERSION_INT VERSION_GREATER 10.0) - set(ARCHS arm64 x86_64) # FIXME: Add arm64e when Apple have fixed the integration issues with it, libarclite_iphoneos.a is currently missung bitcode markers for example - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphoneos*] "arm64") - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphonesimulator*] "x86_64") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphoneos*] "arm64") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphonesimulator*] "x86_64") - else() - set(ARCHS arm64 x86_64) - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphoneos*] "arm64") - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphonesimulator*] "x86_64") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphoneos*] "arm64") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphonesimulator*] "x86_64") - endif() - set(APPLE_TARGET_TRIPLE_INT aarch64-x86_64-apple-ios${DEPLOYMENT_TARGET}) - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}) - endif() - else() - message(FATAL_ERROR "Please make sure that you are running CMake 3.14+ to make the OS64COMBINED setting work") - endif() -elseif(PLATFORM_INT STREQUAL "SIMULATOR") - set(SDK_NAME iphonesimulator) - if(NOT ARCHS) - set(ARCHS i386) - set(APPLE_TARGET_TRIPLE_INT i386-apple-ios${DEPLOYMENT_TARGET}-simulator) - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-simulator) - endif() - message(DEPRECATION "SIMULATOR IS DEPRECATED. 
Consider using SIMULATOR64 instead.") -elseif(PLATFORM_INT STREQUAL "SIMULATOR64") - set(SDK_NAME iphonesimulator) - if(NOT ARCHS) - set(ARCHS x86_64) - set(APPLE_TARGET_TRIPLE_INT x86_64-apple-ios${DEPLOYMENT_TARGET}-simulator) - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-simulator) - endif() -elseif(PLATFORM_INT STREQUAL "SIMULATORARM64") - set(SDK_NAME iphonesimulator) - if(NOT ARCHS) - set(ARCHS arm64) - set(APPLE_TARGET_TRIPLE_INT aarch64-apple-ios${DEPLOYMENT_TARGET}-simulator) - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-simulator) - endif() -elseif(PLATFORM_INT STREQUAL "TVOS") - set(SDK_NAME appletvos) - if(NOT ARCHS) - set(ARCHS arm64) - set(APPLE_TARGET_TRIPLE_INT aarch64-apple-tvos${DEPLOYMENT_TARGET}) - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-tvos${DEPLOYMENT_TARGET}) - endif() -elseif (PLATFORM_INT STREQUAL "TVOSCOMBINED") - set(SDK_NAME appletvos) - if(MODERN_CMAKE) - if(NOT ARCHS) - set(ARCHS arm64 x86_64) - set(APPLE_TARGET_TRIPLE_INT aarch64-x86_64-apple-tvos${DEPLOYMENT_TARGET}) - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=appletvos*] "arm64") - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=appletvsimulator*] "x86_64") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=appletvos*] "arm64") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=appletvsimulator*] "x86_64") - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-tvos${DEPLOYMENT_TARGET}) - endif() - else() - message(FATAL_ERROR "Please make sure that you are running CMake 3.14+ to make the TVOSCOMBINED setting work") - endif() -elseif(PLATFORM_INT STREQUAL "SIMULATOR_TVOS") - set(SDK_NAME appletvsimulator) - if(NOT ARCHS) - set(ARCHS x86_64) - set(APPLE_TARGET_TRIPLE_INT x86_64-apple-tvos${DEPLOYMENT_TARGET}-simulator) - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-tvos${DEPLOYMENT_TARGET}-simulator) - endif() -elseif(PLATFORM_INT STREQUAL "WATCHOS") - set(SDK_NAME watchos) - if(NOT ARCHS) - if (XCODE_VERSION_INT VERSION_GREATER 10.0) - set(ARCHS armv7k arm64_32) - set(APPLE_TARGET_TRIPLE_INT aarch64_32-apple-watchos${DEPLOYMENT_TARGET}) - else() - set(ARCHS armv7k) - set(APPLE_TARGET_TRIPLE_INT arm-apple-watchos${DEPLOYMENT_TARGET}) - endif() - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-watchos${DEPLOYMENT_TARGET}) - endif() -elseif(PLATFORM_INT STREQUAL "WATCHOSCOMBINED") - set(SDK_NAME watchos) - if(MODERN_CMAKE) - if(NOT ARCHS) - if (XCODE_VERSION_INT VERSION_GREATER 10.0) - set(ARCHS armv7k arm64_32 i386) - set(APPLE_TARGET_TRIPLE_INT aarch64_32-i386-apple-watchos${DEPLOYMENT_TARGET}) - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=watchos*] "armv7k arm64_32") - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=watchsimulator*] "i386") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=watchos*] "armv7k arm64_32") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=watchsimulator*] "i386") - else() - set(ARCHS armv7k i386) - set(APPLE_TARGET_TRIPLE_INT arm-i386-apple-watchos${DEPLOYMENT_TARGET}) - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=watchos*] "armv7k") - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=watchsimulator*] "i386") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=watchos*] "armv7k") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=watchsimulator*] "i386") - endif() - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-watchos${DEPLOYMENT_TARGET}) - endif() - else() - message(FATAL_ERROR "Please make sure that you are running CMake 3.14+ to make the WATCHOSCOMBINED setting work") - endif() -elseif(PLATFORM_INT STREQUAL "SIMULATOR_WATCHOS") - set(SDK_NAME 
watchsimulator) - if(NOT ARCHS) - set(ARCHS i386) - set(APPLE_TARGET_TRIPLE_INT i386-apple-watchos${DEPLOYMENT_TARGET}-simulator) - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-watchos${DEPLOYMENT_TARGET}-simulator) - endif() -elseif(PLATFORM_INT STREQUAL "MAC" OR PLATFORM_INT STREQUAL "MAC_CATALYST") - set(SDK_NAME macosx) - if(NOT ARCHS) - set(ARCHS x86_64) - endif() - string(REPLACE ";" "-" ARCHS_SPLIT "${ARCHS}") - if(PLATFORM_INT STREQUAL "MAC") - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-macosx${DEPLOYMENT_TARGET}) - elseif(PLATFORM_INT STREQUAL "MAC_CATALYST") - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-macabi) - endif() -elseif(PLATFORM_INT MATCHES "^(MAC_ARM64)$|^(MAC_CATALYST_ARM64)$") - set(SDK_NAME macosx) - if(NOT ARCHS) - set(ARCHS arm64) - endif() - string(REPLACE ";" "-" ARCHS_SPLIT "${ARCHS}") - if(PLATFORM_INT STREQUAL "MAC_ARM64") - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-macosx${DEPLOYMENT_TARGET}) - elseif(PLATFORM_INT STREQUAL "MAC_CATALYST_ARM64") - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-macabi) - endif() -else() - message(FATAL_ERROR "Invalid PLATFORM: ${PLATFORM_INT}") -endif() - -string(REPLACE ";" " " ARCHS_SPACED "${ARCHS}") - -if(MODERN_CMAKE AND PLATFORM_INT MATCHES ".*COMBINED" AND NOT CMAKE_GENERATOR MATCHES "Xcode") - message(FATAL_ERROR "The COMBINED options only work with Xcode generator, -G Xcode") -endif() - -if(CMAKE_GENERATOR MATCHES "Xcode" AND PLATFORM_INT MATCHES "^MAC_CATALYST") - set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++") - set(CMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS "macosx") - set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-maccatalyst") - if(NOT DEFINED MACOSX_DEPLOYMENT_TARGET) - set(CMAKE_XCODE_ATTRIBUTE_MACOSX_DEPLOYMENT_TARGET "10.15") - else() - set(CMAKE_XCODE_ATTRIBUTE_MACOSX_DEPLOYMENT_TARGET "${MACOSX_DEPLOYMENT_TARGET}") - endif() -elseif(CMAKE_GENERATOR MATCHES "Xcode") - set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++") - set(CMAKE_XCODE_ATTRIBUTE_IPHONEOS_DEPLOYMENT_TARGET "${DEPLOYMENT_TARGET}") - if(NOT PLATFORM_INT MATCHES ".*COMBINED") - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=${SDK_NAME}*] "${ARCHS_SPACED}") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=${SDK_NAME}*] "${ARCHS_SPACED}") - endif() -endif() - -# If user did not specify the SDK root to use, then query xcodebuild for it. -if(DEFINED CMAKE_OSX_SYSROOT_INT) - # Environment variables are always preserved. - set(ENV{_CMAKE_OSX_SYSROOT_INT} "${CMAKE_OSX_SYSROOT_INT}") -elseif(DEFINED ENV{_CMAKE_OSX_SYSROOT_INT}) - set(CMAKE_OSX_SYSROOT_INT "$ENV{_CMAKE_OSX_SYSROOT_INT}") -elseif(NOT DEFINED CMAKE_OSX_SYSROOT_INT) - execute_process(COMMAND ${XCODEBUILD_EXECUTABLE} -version -sdk ${SDK_NAME} Path - OUTPUT_VARIABLE CMAKE_OSX_SYSROOT_INT - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) -endif() - -if (NOT DEFINED CMAKE_OSX_SYSROOT_INT AND NOT DEFINED CMAKE_OSX_SYSROOT) - message(SEND_ERROR "Please make sure that Xcode is installed and that the toolchain" - "is pointing to the correct path. Please run:" - "sudo xcode-select -s /Applications/Xcode.app/Contents/Developer" - "and see if that fixes the problem for you.") - message(FATAL_ERROR "Invalid CMAKE_OSX_SYSROOT: ${CMAKE_OSX_SYSROOT} " - "does not exist.") -elseif(DEFINED CMAKE_OSX_SYSROOT_INT) - set(CMAKE_OSX_SYSROOT_INT "${CMAKE_OSX_SYSROOT_INT}" CACHE INTERNAL "") - # Specify the location or name of the platform SDK to be used in CMAKE_OSX_SYSROOT. 
- set(CMAKE_OSX_SYSROOT "${CMAKE_OSX_SYSROOT_INT}" CACHE INTERNAL "") -endif() - -# Use bitcode or not -if(NOT DEFINED ENABLE_BITCODE AND NOT ARCHS MATCHES "((^|;|, )(i386|x86_64))+") - # Unless specified, enable bitcode support by default - message(STATUS "[DEFAULTS] Enabling bitcode support by default. ENABLE_BITCODE not provided!") - set(ENABLE_BITCODE ON) -elseif(NOT DEFINED ENABLE_BITCODE) - message(STATUS "[DEFAULTS] Disabling bitcode support by default on simulators. ENABLE_BITCODE not provided for override!") - set(ENABLE_BITCODE OFF) -endif() -set(ENABLE_BITCODE_INT ${ENABLE_BITCODE} CACHE BOOL - "Whether or not to enable bitcode" FORCE) -# Use ARC or not -if(NOT DEFINED ENABLE_ARC) - # Unless specified, enable ARC support by default - set(ENABLE_ARC ON) - message(STATUS "[DEFAULTS] Enabling ARC support by default. ENABLE_ARC not provided!") -endif() -set(ENABLE_ARC_INT ${ENABLE_ARC} CACHE BOOL "Whether or not to enable ARC" FORCE) -# Use hidden visibility or not -if(NOT DEFINED ENABLE_VISIBILITY) - # Unless specified, disable symbols visibility by default - set(ENABLE_VISIBILITY OFF) - message(STATUS "[DEFAULTS] Hiding symbols visibility by default. ENABLE_VISIBILITY not provided!") -endif() -set(ENABLE_VISIBILITY_INT ${ENABLE_VISIBILITY} CACHE BOOL "Whether or not to hide symbols from the dynamic linker (-fvisibility=hidden)" FORCE) -# Set strict compiler checks or not -if(NOT DEFINED ENABLE_STRICT_TRY_COMPILE) - # Unless specified, disable strict try_compile() - set(ENABLE_STRICT_TRY_COMPILE OFF) - message(STATUS "[DEFAULTS] Using NON-strict compiler checks by default. ENABLE_STRICT_TRY_COMPILE not provided!") -endif() -set(ENABLE_STRICT_TRY_COMPILE_INT ${ENABLE_STRICT_TRY_COMPILE} CACHE BOOL - "Whether or not to use strict compiler checks" FORCE) - -# Get the SDK version information. -if(DEFINED SDK_VERSION) - # Environment variables are always preserved. - set(ENV{_SDK_VERSION} "${SDK_VERSION}") -elseif(DEFINED ENV{_SDK_VERSION}) - set(SDK_VERSION "$ENV{_SDK_VERSION}") -elseif(NOT DEFINED SDK_VERSION) - execute_process(COMMAND ${XCODEBUILD_EXECUTABLE} -sdk ${CMAKE_OSX_SYSROOT_INT} -version SDKVersion - OUTPUT_VARIABLE SDK_VERSION - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) -endif() - -# Find the Developer root for the specific iOS platform being compiled for -# from CMAKE_OSX_SYSROOT. Should be ../../ from SDK specified in -# CMAKE_OSX_SYSROOT. There does not appear to be a direct way to obtain -# this information from xcrun or xcodebuild. -if (NOT DEFINED CMAKE_DEVELOPER_ROOT AND NOT CMAKE_GENERATOR MATCHES "Xcode") - get_filename_component(PLATFORM_SDK_DIR ${CMAKE_OSX_SYSROOT_INT} PATH) - get_filename_component(CMAKE_DEVELOPER_ROOT ${PLATFORM_SDK_DIR} PATH) - if (NOT EXISTS "${CMAKE_DEVELOPER_ROOT}") - message(FATAL_ERROR "Invalid CMAKE_DEVELOPER_ROOT: ${CMAKE_DEVELOPER_ROOT} does not exist.") - endif() -endif() - -# Find the C & C++ compilers for the specified SDK. -if(DEFINED CMAKE_C_COMPILER) - # Environment variables are always preserved. - set(ENV{_CMAKE_C_COMPILER} "${CMAKE_C_COMPILER}") -elseif(DEFINED ENV{_CMAKE_C_COMPILER}) - set(CMAKE_C_COMPILER "$ENV{_CMAKE_C_COMPILER}") - set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER}) -elseif(NOT DEFINED CMAKE_C_COMPILER) - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT_INT} -find clang - OUTPUT_VARIABLE CMAKE_C_COMPILER - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) - set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER}) -endif() -if(DEFINED CMAKE_CXX_COMPILER) - # Environment variables are always preserved. 
- set(ENV{_CMAKE_CXX_COMPILER} "${CMAKE_CXX_COMPILER}") -elseif(DEFINED ENV{_CMAKE_CXX_COMPILER}) - set(CMAKE_CXX_COMPILER "$ENV{_CMAKE_CXX_COMPILER}") -elseif(NOT DEFINED CMAKE_CXX_COMPILER) - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT_INT} -find clang++ - OUTPUT_VARIABLE CMAKE_CXX_COMPILER - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) -endif() -# Find (Apple's) libtool. -if(DEFINED BUILD_LIBTOOL) - # Environment variables are always preserved. - set(ENV{_BUILD_LIBTOOL} "${BUILD_LIBTOOL}") -elseif(DEFINED ENV{_BUILD_LIBTOOL}) - set(BUILD_LIBTOOL "$ENV{_BUILD_LIBTOOL}") -elseif(NOT DEFINED BUILD_LIBTOOL) - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT_INT} -find libtool - OUTPUT_VARIABLE BUILD_LIBTOOL - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) -endif() -# Find the toolchain's provided install_name_tool if none is found on the host -if(DEFINED CMAKE_INSTALL_NAME_TOOL) - # Environment variables are always preserved. - set(ENV{_CMAKE_INSTALL_NAME_TOOL} "${CMAKE_INSTALL_NAME_TOOL}") -elseif(DEFINED ENV{_CMAKE_INSTALL_NAME_TOOL}) - set(CMAKE_INSTALL_NAME_TOOL "$ENV{_CMAKE_INSTALL_NAME_TOOL}") -elseif(NOT DEFINED CMAKE_INSTALL_NAME_TOOL) - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT_INT} -find install_name_tool - OUTPUT_VARIABLE CMAKE_INSTALL_NAME_TOOL_INT - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) - set(CMAKE_INSTALL_NAME_TOOL ${CMAKE_INSTALL_NAME_TOOL_INT} CACHE INTERNAL "") -endif() - -# Configure libtool to be used instead of ar + ranlib to build static libraries. -# This is required on Xcode 7+, but should also work on previous versions of -# Xcode. -get_property(languages GLOBAL PROPERTY ENABLED_LANGUAGES) -foreach(lang ${languages}) - set(CMAKE_${lang}_CREATE_STATIC_LIBRARY "${BUILD_LIBTOOL} -static -o " CACHE INTERNAL "") -endforeach() - -# CMake 3.14+ support building for iOS, watchOS and tvOS out of the box. -if(MODERN_CMAKE) - if(SDK_NAME MATCHES "iphone") - set(CMAKE_SYSTEM_NAME iOS) - elseif(SDK_NAME MATCHES "macosx") - set(CMAKE_SYSTEM_NAME Darwin) - elseif(SDK_NAME MATCHES "appletv") - set(CMAKE_SYSTEM_NAME tvOS) - elseif(SDK_NAME MATCHES "watch") - set(CMAKE_SYSTEM_NAME watchOS) - endif() - # Provide flags for a combined FAT library build on newer CMake versions - if(PLATFORM_INT MATCHES ".*COMBINED") - set(CMAKE_XCODE_ATTRIBUTE_ONLY_ACTIVE_ARCH "NO") - set(CMAKE_IOS_INSTALL_COMBINED YES) - endif() -elseif(NOT DEFINED CMAKE_SYSTEM_NAME AND ${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.10") - # Legacy code path prior to CMake 3.14 or fallback if no CMAKE_SYSTEM_NAME specified - set(CMAKE_SYSTEM_NAME iOS) -elseif(NOT DEFINED CMAKE_SYSTEM_NAME) - # Legacy code path prior to CMake 3.14 or fallback if no CMAKE_SYSTEM_NAME specified - set(CMAKE_SYSTEM_NAME Darwin) -endif() -# Standard settings. -set(CMAKE_SYSTEM_VERSION ${SDK_VERSION} CACHE INTERNAL "") -set(UNIX ON CACHE BOOL "") -set(APPLE ON CACHE BOOL "") -if(PLATFORM STREQUAL "MAC" OR PLATFORM STREQUAL "MAC_ARM64") - set(IOS OFF CACHE BOOL "") - set(MACOS ON CACHE BOOL "") -elseif(PLATFORM STREQUAL "MAC_CATALYST" OR PLATFORM STREQUAL "MAC_CATALYST_ARM64") - set(IOS ON CACHE BOOL "") - set(MACOS ON CACHE BOOL "") -else() - set(IOS ON CACHE BOOL "") -endif() -set(CMAKE_AR ar CACHE FILEPATH "" FORCE) -set(CMAKE_RANLIB ranlib CACHE FILEPATH "" FORCE) -set(CMAKE_STRIP strip CACHE FILEPATH "" FORCE) -# Set the architectures for which to build. 
-set(CMAKE_OSX_ARCHITECTURES ${ARCHS} CACHE INTERNAL "") -# Change the type of target generated for try_compile() so it'll work when cross-compiling, weak compiler checks -if(NOT ENABLE_STRICT_TRY_COMPILE_INT) - set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) -endif() -# All iOS/Darwin specific settings - some may be redundant. -set(CMAKE_MACOSX_BUNDLE YES) -set(CMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED "NO") -set(CMAKE_SHARED_LIBRARY_PREFIX "lib") -set(CMAKE_SHARED_LIBRARY_SUFFIX ".dylib") -set(CMAKE_SHARED_MODULE_PREFIX "lib") -set(CMAKE_SHARED_MODULE_SUFFIX ".so") -set(CMAKE_C_COMPILER_ABI ELF) -set(CMAKE_CXX_COMPILER_ABI ELF) -set(CMAKE_C_HAS_ISYSROOT 1) -set(CMAKE_CXX_HAS_ISYSROOT 1) -set(CMAKE_MODULE_EXISTS 1) -set(CMAKE_DL_LIBS "") -set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ") -set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ") -set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}") -set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}") - -if(ARCHS MATCHES "((^|;|, )(arm64|arm64e|x86_64))+") - set(CMAKE_C_SIZEOF_DATA_PTR 8) - set(CMAKE_CXX_SIZEOF_DATA_PTR 8) - if(ARCHS MATCHES "((^|;|, )(arm64|arm64e))+") - set(CMAKE_SYSTEM_PROCESSOR "aarch64") - else() - set(CMAKE_SYSTEM_PROCESSOR "x86_64") - endif() -else() - set(CMAKE_C_SIZEOF_DATA_PTR 4) - set(CMAKE_CXX_SIZEOF_DATA_PTR 4) - set(CMAKE_SYSTEM_PROCESSOR "arm") -endif() - -# Note that only Xcode 7+ supports the newer more specific: -# -m${SDK_NAME}-version-min flags, older versions of Xcode use: -# -m(ios/ios-simulator)-version-min instead. -if(${CMAKE_VERSION} VERSION_LESS "3.11") - if(PLATFORM_INT STREQUAL "OS" OR PLATFORM_INT STREQUAL "OS64") - if(XCODE_VERSION_INT VERSION_LESS 7.0) - set(SDK_NAME_VERSION_FLAGS - "-mios-version-min=${DEPLOYMENT_TARGET}") - else() - # Xcode 7.0+ uses flags we can build directly from SDK_NAME. - set(SDK_NAME_VERSION_FLAGS - "-m${SDK_NAME}-version-min=${DEPLOYMENT_TARGET}") - endif() - elseif(PLATFORM_INT STREQUAL "TVOS") - set(SDK_NAME_VERSION_FLAGS - "-mtvos-version-min=${DEPLOYMENT_TARGET}") - elseif(PLATFORM_INT STREQUAL "SIMULATOR_TVOS") - set(SDK_NAME_VERSION_FLAGS - "-mtvos-simulator-version-min=${DEPLOYMENT_TARGET}") - elseif(PLATFORM_INT STREQUAL "WATCHOS") - set(SDK_NAME_VERSION_FLAGS - "-mwatchos-version-min=${DEPLOYMENT_TARGET}") - elseif(PLATFORM_INT STREQUAL "SIMULATOR_WATCHOS") - set(SDK_NAME_VERSION_FLAGS - "-mwatchos-simulator-version-min=${DEPLOYMENT_TARGET}") - elseif(PLATFORM_INT STREQUAL "MAC") - set(SDK_NAME_VERSION_FLAGS - "-mmacosx-version-min=${DEPLOYMENT_TARGET}") - else() - # SIMULATOR or SIMULATOR64 both use -mios-simulator-version-min. 
- set(SDK_NAME_VERSION_FLAGS - "-mios-simulator-version-min=${DEPLOYMENT_TARGET}") - endif() -elseif(NOT PLATFORM_INT MATCHES "^MAC_CATALYST") - # Newer versions of CMake sets the version min flags correctly, skip this for Mac Catalyst targets - set(CMAKE_OSX_DEPLOYMENT_TARGET ${DEPLOYMENT_TARGET}) -endif() - -if(DEFINED APPLE_TARGET_TRIPLE_INT) - set(APPLE_TARGET_TRIPLE ${APPLE_TARGET_TRIPLE_INT} CACHE INTERNAL "") - set(CMAKE_C_COMPILER_TARGET ${APPLE_TARGET_TRIPLE}) - set(CMAKE_CXX_COMPILER_TARGET ${APPLE_TARGET_TRIPLE}) - set(CMAKE_ASM_COMPILER_TARGET ${APPLE_TARGET_TRIPLE}) -endif() - -if(PLATFORM_INT MATCHES "^MAC_CATALYST") - set(C_TARGET_FLAGS "-isystem ${CMAKE_OSX_SYSROOT_INT}/System/iOSSupport/usr/include -iframework ${CMAKE_OSX_SYSROOT_INT}/System/iOSSupport/System/Library/Frameworks") -endif() - -if(ENABLE_BITCODE_INT) - set(BITCODE "-fembed-bitcode") - set(CMAKE_XCODE_ATTRIBUTE_BITCODE_GENERATION_MODE "bitcode") - set(CMAKE_XCODE_ATTRIBUTE_ENABLE_BITCODE "YES") -else() - set(BITCODE "") - set(CMAKE_XCODE_ATTRIBUTE_ENABLE_BITCODE "NO") -endif() - -if(ENABLE_ARC_INT) - set(FOBJC_ARC "-fobjc-arc") - set(CMAKE_XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC "YES") -else() - set(FOBJC_ARC "-fno-objc-arc") - set(CMAKE_XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC "NO") -endif() - -if(NAMED_LANGUAGE_SUPPORT_INT) - set(OBJC_VARS "-fobjc-abi-version=2 -DOBJC_OLD_DISPATCH_PROTOTYPES=0") - set(OBJC_LEGACY_VARS "") -else() - set(OBJC_VARS "") - set(OBJC_LEGACY_VARS "-fobjc-abi-version=2 -DOBJC_OLD_DISPATCH_PROTOTYPES=0") -endif() - -if(NOT ENABLE_VISIBILITY_INT) - foreach(lang ${languages}) - set(CMAKE_${lang}_VISIBILITY_PRESET "hidden" CACHE INTERNAL "") - endforeach() - set(CMAKE_XCODE_ATTRIBUTE_GCC_SYMBOLS_PRIVATE_EXTERN "YES") - set(VISIBILITY "-fvisibility=hidden -fvisibility-inlines-hidden") -else() - foreach(lang ${languages}) - set(CMAKE_${lang}_VISIBILITY_PRESET "default" CACHE INTERNAL "") - endforeach() - set(CMAKE_XCODE_ATTRIBUTE_GCC_SYMBOLS_PRIVATE_EXTERN "NO") - set(VISIBILITY "-fvisibility=default") -endif() - -if(DEFINED APPLE_TARGET_TRIPLE) - set(APPLE_TARGET_TRIPLE_FLAG "-target ${APPLE_TARGET_TRIPLE}") -endif() - -#Check if Xcode generator is used, since that will handle these flags automagically -if(CMAKE_GENERATOR MATCHES "Xcode") - message(STATUS "Not setting any manual command-line buildflags, since Xcode is selected as generator. 
Modifying the Xcode build-settings directly instead.") -else() - set(CMAKE_C_FLAGS "${C_TARGET_FLAGS} ${APPLE_TARGET_TRIPLE_FLAG} ${SDK_NAME_VERSION_FLAGS} ${OBJC_LEGACY_VARS} ${BITCODE} ${VISIBILITY} ${CMAKE_C_FLAGS}") - set(CMAKE_C_FLAGS_DEBUG "-O0 -g ${CMAKE_C_FLAGS_DEBUG}") - set(CMAKE_C_FLAGS_MINSIZEREL "-DNDEBUG -Os ${CMAKE_C_FLAGS_MINSIZEREL}") - set(CMAKE_C_FLAGS_RELWITHDEBINFO "-DNDEBUG -O2 -g ${CMAKE_C_FLAGS_RELWITHDEBINFO}") - set(CMAKE_C_FLAGS_RELEASE "-DNDEBUG -O3 ${CMAKE_C_FLAGS_RELEASE}") - set(CMAKE_CXX_FLAGS "${C_TARGET_FLAGS} ${APPLE_TARGET_TRIPLE_FLAG} ${SDK_NAME_VERSION_FLAGS} ${OBJC_LEGACY_VARS} ${BITCODE} ${VISIBILITY} ${CMAKE_CXX_FLAGS}") - set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g ${CMAKE_CXX_FLAGS_DEBUG}") - set(CMAKE_CXX_FLAGS_MINSIZEREL "-DNDEBUG -Os ${CMAKE_CXX_FLAGS_MINSIZEREL}") - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-DNDEBUG -O2 -g ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") - set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG -O3 ${CMAKE_CXX_FLAGS_RELEASE}") - if(NAMED_LANGUAGE_SUPPORT_INT) - set(CMAKE_OBJC_FLAGS "${C_TARGET_FLAGS} ${APPLE_TARGET_TRIPLE_FLAG} ${SDK_NAME_VERSION_FLAGS} ${BITCODE} ${VISIBILITY} ${FOBJC_ARC} ${OBJC_VARS} ${CMAKE_OBJC_FLAGS}") - set(CMAKE_OBJC_FLAGS_DEBUG "-O0 -g ${CMAKE_OBJC_FLAGS_DEBUG}") - set(CMAKE_OBJC_FLAGS_MINSIZEREL "-DNDEBUG -Os ${CMAKE_OBJC_FLAGS_MINSIZEREL}") - set(CMAKE_OBJC_FLAGS_RELWITHDEBINFO "-DNDEBUG -O2 -g ${CMAKE_OBJC_FLAGS_RELWITHDEBINFO}") - set(CMAKE_OBJC_FLAGS_RELEASE "-DNDEBUG -O3 ${CMAKE_OBJC_FLAGS_RELEASE}") - set(CMAKE_OBJCXX_FLAGS "${C_TARGET_FLAGS} ${APPLE_TARGET_TRIPLE_FLAG} ${SDK_NAME_VERSION_FLAGS} ${BITCODE} ${VISIBILITY} ${FOBJC_ARC} ${OBJC_VARS} ${CMAKE_OBJCXX_FLAGS}") - set(CMAKE_OBJCXX_FLAGS_DEBUG "-O0 -g ${CMAKE_OBJCXX_FLAGS_DEBUG}") - set(CMAKE_OBJCXX_FLAGS_MINSIZEREL "-DNDEBUG -Os ${CMAKE_OBJCXX_FLAGS_MINSIZEREL}") - set(CMAKE_OBJCXX_FLAGS_RELWITHDEBINFO "-DNDEBUG -O2 -g ${CMAKE_OBJCXX_FLAGS_RELWITHDEBINFO}") - set(CMAKE_OBJCXX_FLAGS_RELEASE "-DNDEBUG -O3 ${CMAKE_OBJCXX_FLAGS_RELEASE}") - endif() - set(CMAKE_C_LINK_FLAGS "${C_TARGET_FLAGS} ${SDK_NAME_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}") - set(CMAKE_CXX_LINK_FLAGS "${C_TARGET_FLAGS} ${SDK_NAME_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}") - if(NAMED_LANGUAGE_SUPPORT_INT) - set(CMAKE_OBJC_LINK_FLAGS "${C_TARGET_FLAGS} ${SDK_NAME_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_OBJC_LINK_FLAGS}") - set(CMAKE_OBJCXX_LINK_FLAGS "${C_TARGET_FLAGS} ${SDK_NAME_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_OBJCXX_LINK_FLAGS}") - endif() - set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -x assembler-with-cpp -arch ${CMAKE_OSX_ARCHITECTURES} ${APPLE_TARGET_TRIPLE_FLAG}") -endif() - -## Print status messages to inform of the current state -message(STATUS "Configuring ${SDK_NAME} build for platform: ${PLATFORM_INT}, architecture(s): ${ARCHS}") -message(STATUS "Using SDK: ${CMAKE_OSX_SYSROOT_INT}") -message(STATUS "Using C compiler: ${CMAKE_C_COMPILER}") -message(STATUS "Using CXX compiler: ${CMAKE_CXX_COMPILER}") -message(STATUS "Using libtool: ${BUILD_LIBTOOL}") -message(STATUS "Using install name tool: ${CMAKE_INSTALL_NAME_TOOL}") -if(DEFINED APPLE_TARGET_TRIPLE) - message(STATUS "Autoconf target triple: ${APPLE_TARGET_TRIPLE}") -endif() -message(STATUS "Using minimum deployment version: ${DEPLOYMENT_TARGET}" - " (SDK version: ${SDK_VERSION})") -if(MODERN_CMAKE) - message(STATUS "Merging integrated CMake 3.14+ iOS,tvOS,watchOS,macOS toolchain(s) with this toolchain!") - if(PLATFORM_INT MATCHES ".*COMBINED") - message(STATUS "Will combine built 
(static) artifacts into FAT lib...") - endif() -endif() -if(CMAKE_GENERATOR MATCHES "Xcode") - message(STATUS "Using Xcode version: ${XCODE_VERSION_INT}") -endif() -message(STATUS "CMake version: ${CMAKE_VERSION}") -if(DEFINED SDK_NAME_VERSION_FLAGS) - message(STATUS "Using version flags: ${SDK_NAME_VERSION_FLAGS}") -endif() -message(STATUS "Using a data_ptr size of: ${CMAKE_CXX_SIZEOF_DATA_PTR}") -if(ENABLE_BITCODE_INT) - message(STATUS "Bitcode: Enabled") -else() - message(STATUS "Bitcode: Disabled") -endif() - -if(ENABLE_ARC_INT) - message(STATUS "ARC: Enabled") -else() - message(STATUS "ARC: Disabled") -endif() - -if(ENABLE_VISIBILITY_INT) - message(STATUS "Hiding symbols: Disabled") -else() - message(STATUS "Hiding symbols: Enabled") -endif() - -# Set global properties -set_property(GLOBAL PROPERTY PLATFORM "${PLATFORM}") -set_property(GLOBAL PROPERTY APPLE_TARGET_TRIPLE "${APPLE_TARGET_TRIPLE_INT}") -set_property(GLOBAL PROPERTY SDK_VERSION "${SDK_VERSION}") -set_property(GLOBAL PROPERTY XCODE_VERSION "${XCODE_VERSION_INT}") -set_property(GLOBAL PROPERTY OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}") - -# Export configurable variables for the try_compile() command. -set(CMAKE_TRY_COMPILE_PLATFORM_VARIABLES - PLATFORM - XCODE_VERSION_INT - SDK_VERSION - NAMED_LANGUAGE_SUPPORT - DEPLOYMENT_TARGET - CMAKE_DEVELOPER_ROOT - CMAKE_OSX_SYSROOT_INT - ENABLE_BITCODE - ENABLE_ARC - CMAKE_ASM_COMPILER - CMAKE_C_COMPILER - CMAKE_C_COMPILER_TARGET - CMAKE_CXX_COMPILER - CMAKE_CXX_COMPILER_TARGET - BUILD_LIBTOOL - CMAKE_INSTALL_NAME_TOOL - CMAKE_C_FLAGS - CMAKE_C_DEBUG - CMAKE_C_MINSIZEREL - CMAKE_C_RELWITHDEBINFO - CMAKE_C_RELEASE - CMAKE_CXX_FLAGS - CMAKE_CXX_FLAGS_DEBUG - CMAKE_CXX_FLAGS_MINSIZEREL - CMAKE_CXX_FLAGS_RELWITHDEBINFO - CMAKE_CXX_FLAGS_RELEASE - CMAKE_C_LINK_FLAGS - CMAKE_CXX_LINK_FLAGS - CMAKE_ASM_FLAGS -) - -if(NAMED_LANGUAGE_SUPPORT_INT) - list(APPEND CMAKE_TRY_COMPILE_PLATFORM_VARIABLES - CMAKE_OBJC_FLAGS - CMAKE_OBJC_DEBUG - CMAKE_OBJC_MINSIZEREL - CMAKE_OBJC_RELWITHDEBINFO - CMAKE_OBJC_RELEASE - CMAKE_OBJCXX_FLAGS - CMAKE_OBJCXX_DEBUG - CMAKE_OBJCXX_MINSIZEREL - CMAKE_OBJCXX_RELWITHDEBINFO - CMAKE_OBJCXX_RELEASE - CMAKE_OBJC_LINK_FLAGS - CMAKE_OBJCXX_LINK_FLAGS - ) -endif() - -set(CMAKE_PLATFORM_HAS_INSTALLNAME 1) -set(CMAKE_SHARED_LINKER_FLAGS "-rpath @executable_path/Frameworks -rpath @loader_path/Frameworks") -set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -Wl,-headerpad_max_install_names") -set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -Wl,-headerpad_max_install_names") -set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,") -set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,") -set(CMAKE_FIND_LIBRARY_SUFFIXES ".tbd" ".dylib" ".so" ".a") -set(CMAKE_SHARED_LIBRARY_SONAME_C_FLAG "-install_name") - -# Set the find root to the SDK developer roots. -# Note: CMAKE_FIND_ROOT_PATH is only useful when cross-compiling. Thus, do not set on macOS builds. -if(NOT PLATFORM_INT MATCHES "^MAC.*$") - list(APPEND CMAKE_FIND_ROOT_PATH "${CMAKE_OSX_SYSROOT_INT}" CACHE INTERNAL "") - set(CMAKE_IGNORE_PATH "/System/Library/Frameworks;/usr/local/lib" CACHE INTERNAL "") -endif() - -# Default to searching for frameworks first. -set(CMAKE_FIND_FRAMEWORK FIRST) - -# Set up the default search directories for frameworks. 
-if(PLATFORM_INT MATCHES "^MAC_CATALYST") - set(CMAKE_FRAMEWORK_PATH - ${CMAKE_DEVELOPER_ROOT}/Library/PrivateFrameworks - ${CMAKE_OSX_SYSROOT_INT}/System/Library/Frameworks - ${CMAKE_OSX_SYSROOT_INT}/System/iOSSupport/System/Library/Frameworks - ${CMAKE_FRAMEWORK_PATH} CACHE INTERNAL "") -else() - set(CMAKE_FRAMEWORK_PATH - ${CMAKE_DEVELOPER_ROOT}/Library/PrivateFrameworks - ${CMAKE_OSX_SYSROOT_INT}/System/Library/Frameworks - ${CMAKE_FRAMEWORK_PATH} CACHE INTERNAL "") -endif() - -# By default, search both the specified iOS SDK and the remainder of the host filesystem. -if(NOT CMAKE_FIND_ROOT_PATH_MODE_PROGRAM) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM BOTH CACHE INTERNAL "") -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_LIBRARY) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH CACHE INTERNAL "") -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_INCLUDE) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH CACHE INTERNAL "") -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_PACKAGE) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH CACHE INTERNAL "") -endif() - -# -# Some helper-macros below to simplify and beautify the CMakeFile -# - -# This little macro lets you set any Xcode specific property. -macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE XCODE_RELVERSION) - set(XCODE_RELVERSION_I "${XCODE_RELVERSION}") - if(XCODE_RELVERSION_I STREQUAL "All") - set_property(TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} "${XCODE_VALUE}") - else() - set_property(TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY}[variant=${XCODE_RELVERSION_I}] "${XCODE_VALUE}") - endif() -endmacro(set_xcode_property) - -# This macro lets you find executable programs on the host system. -macro(find_host_package) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE NEVER) - set(_TOOLCHAIN_IOS ${IOS}) - set(IOS OFF) - find_package(${ARGN}) - set(IOS ${_TOOLCHAIN_IOS}) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM BOTH) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH) -endmacro(find_host_package) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/CMakeLists.txt deleted file mode 100644 index 686362688c050d48224ca0a01e0d24b03d94758a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -add_library(utils STATIC - string.cc - utils.cc -) - -if(NOT ANDROID) - if(MSVC) - target_link_libraries(utils PUBLIC fst) - else() - target_link_libraries(utils PUBLIC fst dl) - endif() -endif() \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/blocking_queue.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/blocking_queue.h deleted file mode 100644 index 9bf0127d9298fbfae2eeebb9431c680fc5dd7647..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/blocking_queue.h +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_BLOCKING_QUEUE_H_ -#define UTILS_BLOCKING_QUEUE_H_ - -#include -#include -#include -#include -#include -#include - -#include "utils/utils.h" - -namespace wenet { - -template -class BlockingQueue { - public: - explicit BlockingQueue(size_t capacity = std::numeric_limits::max()) - : capacity_(capacity) {} - - void Push(const T& value) { - { - std::unique_lock lock(mutex_); - while (queue_.size() >= capacity_) { - not_full_condition_.wait(lock); - } - queue_.push(value); - } - not_empty_condition_.notify_one(); - } - - void Push(T&& value) { - { - std::unique_lock lock(mutex_); - while (queue_.size() >= capacity_) { - not_full_condition_.wait(lock); - } - queue_.push(std::move(value)); - } - not_empty_condition_.notify_one(); - } - - void Push(const std::vector& values) { - { - std::unique_lock lock(mutex_); - for (auto& value : values) { - while (queue_.size() >= capacity_) { - not_empty_condition_.notify_one(); - not_full_condition_.wait(lock); - } - queue_.push(value); - } - } - not_empty_condition_.notify_one(); - } - - void Push(std::vector&& values) { - std::unique_lock lock(mutex_); - for (auto& value : values) { - while (queue_.size() >= capacity_) { - not_empty_condition_.notify_one(); - not_full_condition_.wait(lock); - } - queue_.push(std::move(value)); - } - not_empty_condition_.notify_one(); - } - - T Pop() { - std::unique_lock lock(mutex_); - while (queue_.empty()) { - not_empty_condition_.wait(lock); - } - T t(std::move(queue_.front())); - queue_.pop(); - not_full_condition_.notify_one(); - return t; - } - - // num can be greater than capacity,but it needs to be used with care - std::vector Pop(size_t num) { - std::unique_lock lock(mutex_); - std::vector block_data; - while (block_data.size() < num) { - while (queue_.empty()) { - not_full_condition_.notify_one(); - not_empty_condition_.wait(lock); - } - block_data.push_back(std::move(queue_.front())); - queue_.pop(); - } - not_full_condition_.notify_one(); - return block_data; - } - - bool Empty() const { - std::lock_guard lock(mutex_); - return queue_.empty(); - } - - size_t Size() const { - std::lock_guard lock(mutex_); - return queue_.size(); - } - - void Clear() { - while (!Empty()) { - Pop(); - } - } - - private: - size_t capacity_; - mutable std::mutex mutex_; - std::condition_variable not_full_condition_; - std::condition_variable not_empty_condition_; - std::queue queue_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(BlockingQueue); -}; - -} // namespace wenet - -#endif // UTILS_BLOCKING_QUEUE_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/file.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/file.h deleted file mode 100644 index 83ad9c8c52fecd334b3549285bf39cd4f59b9f2b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/file.h +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_FILE_H_ -#define UTILS_FILE_H_ - -#include -#include - -namespace wenet { - -inline bool FileExists(const std::string& path) { - std::ifstream f(path.c_str()); - return f.good(); -} - -} // namespace wenet - -#endif // UTILS_FILE_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/flags.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/flags.h deleted file mode 100644 index 3432aa78847322edec8d6d2aec59ed7ca5352fcd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/flags.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_FLAGS_H_ -#define UTILS_FLAGS_H_ - -// Because openfst is a dynamic library compiled with gflags/glog, we must use -// the gflags/glog from openfst to avoid them linked both statically and -// dynamically into the executable. -#include "fst/flags.h" - -#endif // UTILS_FLAGS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/json.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/json.h deleted file mode 100644 index bf8d94a3e42504139b10daa39b8f8e7a8b2d93cc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/json.h +++ /dev/null @@ -1,754 +0,0 @@ -// Copyright (c) From https://github.com/nbsdx/SimpleJSON -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
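For reference, the `wenet::BlockingQueue` removed in the hunk above is a bounded producer/consumer queue: `Push` blocks once the queue holds `capacity_` items and `Pop` blocks while it is empty, with the capacity defaulting to an effectively unbounded value. A minimal usage sketch, illustrative only, assuming a checkout where the deleted header is still reachable as `utils/blocking_queue.h` (this code is not part of the patch):

```cpp
// Producer/consumer round-trip through wenet::BlockingQueue.
#include <iostream>
#include <thread>

#include "utils/blocking_queue.h"

int main() {
  wenet::BlockingQueue<int> queue(/*capacity=*/4);  // Push blocks once 4 items are pending.

  std::thread producer([&queue] {
    for (int i = 0; i < 8; ++i) {
      queue.Push(i);  // Blocks while the queue is at capacity.
    }
  });

  int sum = 0;
  for (int i = 0; i < 8; ++i) {
    sum += queue.Pop();  // Blocks while the queue is empty.
  }
  producer.join();
  std::cout << "sum = " << sum << std::endl;  // 0 + 1 + ... + 7 = 28
  return 0;
}
```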
- -#ifndef UTILS_JSON_H_ -#define UTILS_JSON_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace json { - -using std::deque; -using std::enable_if; -using std::initializer_list; -using std::is_convertible; -using std::is_floating_point; -using std::is_integral; -using std::is_same; -using std::map; -using std::string; - -namespace { // NOLINT -string json_escape(const string& str) { - string output; - for (unsigned i = 0; i < str.length(); ++i) switch (str[i]) { - case '\"': - output += "\\\""; - break; - case '\\': - output += "\\\\"; - break; - case '\b': - output += "\\b"; - break; - case '\f': - output += "\\f"; - break; - case '\n': - output += "\\n"; - break; - case '\r': - output += "\\r"; - break; - case '\t': - output += "\\t"; - break; - default: - output += str[i]; - break; - } - return std::move(output); -} -} // namespace - -class JSON { - union BackingData { - BackingData(double d) : Float(d) {} - BackingData(int l) : Int(l) {} - BackingData(bool b) : Bool(b) {} - BackingData(string s) : String(new string(s)) {} - BackingData() : Int(0) {} - - deque* List; - map* Map; - string* String; - double Float; - int Int; - bool Bool; - } Internal; - - public: - enum class Class { Null, Object, Array, String, Floating, Integral, Boolean }; - - template - class JSONWrapper { - Container* object; - - public: - explicit JSONWrapper(Container* val) : object(val) {} - explicit JSONWrapper(std::nullptr_t) : object(nullptr) {} - - typename Container::iterator begin() { - return object ? object->begin() : typename Container::iterator(); - } - typename Container::iterator end() { - return object ? object->end() : typename Container::iterator(); - } - typename Container::const_iterator begin() const { - return object ? object->begin() : typename Container::iterator(); - } - typename Container::const_iterator end() const { - return object ? object->end() : typename Container::iterator(); - } - }; - - template - class JSONConstWrapper { - const Container* object; - - public: - explicit JSONConstWrapper(const Container* val) : object(val) {} - explicit JSONConstWrapper(std::nullptr_t) : object(nullptr) {} - - typename Container::const_iterator begin() const { - return object ? object->begin() : typename Container::const_iterator(); - } - typename Container::const_iterator end() const { - return object ? 
object->end() : typename Container::const_iterator(); - } - }; - - JSON() : Internal(), Type(Class::Null) {} - - explicit JSON(initializer_list list) : JSON() { - SetType(Class::Object); - for (auto i = list.begin(), e = list.end(); i != e; ++i, ++i) - operator[](i->ToString()) = *std::next(i); - } - - JSON(JSON&& other) : Internal(other.Internal), Type(other.Type) { - other.Type = Class::Null; - other.Internal.Map = nullptr; - } - - JSON& operator=(JSON&& other) { - ClearInternal(); - Internal = other.Internal; - Type = other.Type; - other.Internal.Map = nullptr; - other.Type = Class::Null; - return *this; - } - - JSON(const JSON& other) { - switch (other.Type) { - case Class::Object: - Internal.Map = new map(other.Internal.Map->begin(), - other.Internal.Map->end()); - break; - case Class::Array: - Internal.List = new deque(other.Internal.List->begin(), - other.Internal.List->end()); - break; - case Class::String: - Internal.String = new string(*other.Internal.String); - break; - default: - Internal = other.Internal; - } - Type = other.Type; - } - - JSON& operator=(const JSON& other) { - ClearInternal(); - switch (other.Type) { - case Class::Object: - Internal.Map = new map(other.Internal.Map->begin(), - other.Internal.Map->end()); - break; - case Class::Array: - Internal.List = new deque(other.Internal.List->begin(), - other.Internal.List->end()); - break; - case Class::String: - Internal.String = new string(*other.Internal.String); - break; - default: - Internal = other.Internal; - } - Type = other.Type; - return *this; - } - - ~JSON() { - switch (Type) { - case Class::Array: - delete Internal.List; - break; - case Class::Object: - delete Internal.Map; - break; - case Class::String: - delete Internal.String; - break; - default: { - }; - } - } - - template - explicit JSON(T b, typename enable_if::value>::type* = 0) - : Internal(b), Type(Class::Boolean) {} - - template - explicit JSON(T i, typename enable_if::value && - !is_same::value>::type* = 0) - : Internal(static_cast(i)), Type(Class::Integral) {} - - template - explicit JSON(T f, typename enable_if::value>::type* = 0) - : Internal(static_cast(f)), Type(Class::Floating) {} - - template - explicit JSON(T s, - typename enable_if::value>::type* = 0) - : Internal(string(s)), Type(Class::String) {} - - explicit JSON(std::nullptr_t) : Internal(), Type(Class::Null) {} - - static JSON Make(Class type) { - JSON ret; - ret.SetType(type); - return ret; - } - - static JSON Load(const string&); - - template - void append(T arg) { - SetType(Class::Array); - Internal.List->emplace_back(arg); - } - - template - void append(T arg, U... 
args) { - append(arg); - append(args...); - } - - template - typename enable_if::value, JSON&>::type operator=(T b) { - SetType(Class::Boolean); - Internal.Bool = b; - return *this; - } - - template - typename enable_if::value && !is_same::value, - JSON&>::type - operator=(T i) { - SetType(Class::Integral); - Internal.Int = i; - return *this; - } - - template - typename enable_if::value, JSON&>::type operator=(T f) { - SetType(Class::Floating); - Internal.Float = f; - return *this; - } - - template - typename enable_if::value, JSON&>::type operator=( - T s) { - SetType(Class::String); - *Internal.String = string(s); - return *this; - } - - JSON& operator[](const string& key) { - SetType(Class::Object); - return Internal.Map->operator[](key); - } - - JSON& operator[](unsigned index) { - SetType(Class::Array); - if (index >= Internal.List->size()) Internal.List->resize(index + 1); - return Internal.List->operator[](index); - } - - JSON& at(const string& key) { return operator[](key); } - - const JSON& at(const string& key) const { return Internal.Map->at(key); } - - JSON& at(unsigned index) { return operator[](index); } - - const JSON& at(unsigned index) const { return Internal.List->at(index); } - - int length() const { - if (Type == Class::Array) - return Internal.List->size(); - else - return -1; - } - - bool hasKey(const string& key) const { - if (Type == Class::Object) - return Internal.Map->find(key) != Internal.Map->end(); - return false; - } - - int size() const { - if (Type == Class::Object) - return Internal.Map->size(); - else if (Type == Class::Array) - return Internal.List->size(); - else - return -1; - } - - Class JSONType() const { return Type; } - - /// Functions for getting primitives from the JSON object. - bool IsNull() const { return Type == Class::Null; } - - string ToString() const { - bool b; - return std::move(ToString(&b)); - } - string ToString(bool* ok) const { - *ok = (Type == Class::String); - return *ok ? std::move(json_escape(*Internal.String)) : string(""); - } - - double ToFloat() const { - bool b; - return ToFloat(&b); - } - double ToFloat(bool* ok) const { - *ok = (Type == Class::Floating); - return *ok ? Internal.Float : 0.0; - } - - int ToInt() const { - bool b; - return ToInt(&b); - } - int ToInt(bool* ok) const { - *ok = (Type == Class::Integral); - return *ok ? Internal.Int : 0; - } - - bool ToBool() const { - bool b; - return ToBool(&b); - } - bool ToBool(bool* ok) const { - *ok = (Type == Class::Boolean); - return *ok ? 
Internal.Bool : false; - } - - JSONWrapper> ObjectRange() { - if (Type == Class::Object) - return JSONWrapper>(Internal.Map); - return JSONWrapper>(nullptr); - } - - JSONWrapper> ArrayRange() { - if (Type == Class::Array) return JSONWrapper>(Internal.List); - return JSONWrapper>(nullptr); - } - - JSONConstWrapper> ObjectRange() const { - if (Type == Class::Object) - return JSONConstWrapper>(Internal.Map); - return JSONConstWrapper>(nullptr); - } - - JSONConstWrapper> ArrayRange() const { - if (Type == Class::Array) - return JSONConstWrapper>(Internal.List); - return JSONConstWrapper>(nullptr); - } - - string dump(int depth = 1, string tab = " ") const { - string pad = ""; - for (int i = 0; i < depth; ++i, pad += tab) { - } - - switch (Type) { - case Class::Null: - return "null"; - case Class::Object: { - string s = "{\n"; - bool skip = true; - for (auto& p : *Internal.Map) { - if (!skip) s += ",\n"; - s += (pad + "\"" + p.first + "\" : " + p.second.dump(depth + 1, tab)); - skip = false; - } - s += ("\n" + pad.erase(0, 2) + "}"); - return s; - } - case Class::Array: { - string s = "["; - bool skip = true; - for (auto& p : *Internal.List) { - if (!skip) s += ", "; - s += p.dump(depth + 1, tab); - skip = false; - } - s += "]"; - return s; - } - case Class::String: - return "\"" + json_escape(*Internal.String) + "\""; - case Class::Floating: - return std::to_string(Internal.Float); - case Class::Integral: - return std::to_string(Internal.Int); - case Class::Boolean: - return Internal.Bool ? "true" : "false"; - default: - return ""; - } - return ""; - } - - friend std::ostream& operator<<(std::ostream&, const JSON&); - - private: - void SetType(Class type) { - if (type == Type) return; - - ClearInternal(); - - switch (type) { - case Class::Null: - Internal.Map = nullptr; - break; - case Class::Object: - Internal.Map = new map(); - break; - case Class::Array: - Internal.List = new deque(); - break; - case Class::String: - Internal.String = new string(); - break; - case Class::Floating: - Internal.Float = 0.0; - break; - case Class::Integral: - Internal.Int = 0; - break; - case Class::Boolean: - Internal.Bool = false; - break; - } - - Type = type; - } - - private: - /* beware: only call if YOU know that Internal is allocated. No checks - performed here. This function should be called in a constructed JSON just - before you are going to overwrite Internal... -*/ - void ClearInternal() { - switch (Type) { - case Class::Object: - delete Internal.Map; - break; - case Class::Array: - delete Internal.List; - break; - case Class::String: - delete Internal.String; - break; - default: { - }; - } - } - - private: - Class Type = Class::Null; -}; - -JSON Array() { return std::move(JSON::Make(JSON::Class::Array)); } - -template -JSON Array(T... 
args) { - JSON arr = JSON::Make(JSON::Class::Array); - arr.append(args...); - return std::move(arr); -} - -JSON Object() { return std::move(JSON::Make(JSON::Class::Object)); } - -std::ostream& operator<<(std::ostream& os, const JSON& json) { - os << json.dump(); - return os; -} - -namespace { // NOLINT -JSON parse_next(const string&, size_t&); - -void consume_ws(const string& str, size_t& offset) { // NOLINT - while (isspace(str[offset])) ++offset; -} - -JSON parse_object(const string& str, size_t& offset) { // NOLINT - JSON Object = JSON::Make(JSON::Class::Object); - - ++offset; - consume_ws(str, offset); - if (str[offset] == '}') { - ++offset; - return std::move(Object); - } - - while (true) { - JSON Key = parse_next(str, offset); - consume_ws(str, offset); - if (str[offset] != ':') { - std::cerr << "Error: Object: Expected colon, found '" << str[offset] - << "'\n"; - break; - } - consume_ws(str, ++offset); - JSON Value = parse_next(str, offset); - Object[Key.ToString()] = Value; - - consume_ws(str, offset); - if (str[offset] == ',') { - ++offset; - continue; - } else if (str[offset] == '}') { - ++offset; - break; - } else { - std::cerr << "ERROR: Object: Expected comma, found '" << str[offset] - << "'\n"; - break; - } - } - - return std::move(Object); -} - -JSON parse_array(const string& str, size_t& offset) { // NOLINT - JSON Array = JSON::Make(JSON::Class::Array); - unsigned index = 0; - - ++offset; - consume_ws(str, offset); - if (str[offset] == ']') { - ++offset; - return std::move(Array); - } - - while (true) { - Array[index++] = parse_next(str, offset); - consume_ws(str, offset); - - if (str[offset] == ',') { - ++offset; - continue; - } else if (str[offset] == ']') { - ++offset; - break; - } else { - std::cerr << "ERROR: Array: Expected ',' or ']', found '" << str[offset] - << "'\n"; - return std::move(JSON::Make(JSON::Class::Array)); - } - } - - return std::move(Array); -} - -JSON parse_string(const string& str, size_t& offset) { // NOLINT - JSON String; - string val; - for (char c = str[++offset]; c != '\"'; c = str[++offset]) { - if (c == '\\') { - switch (str[++offset]) { - case '\"': - val += '\"'; - break; - case '\\': - val += '\\'; - break; - case '/': - val += '/'; - break; - case 'b': - val += '\b'; - break; - case 'f': - val += '\f'; - break; - case 'n': - val += '\n'; - break; - case 'r': - val += '\r'; - break; - case 't': - val += '\t'; - break; - case 'u': { - val += "\\u"; - for (unsigned i = 1; i <= 4; ++i) { - c = str[offset + i]; - if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || - (c >= 'A' && c <= 'F')) { - val += c; - } else { - std::cerr << "ERROR: String: Expected hex character in unicode " - "escape, found '" - << c << "'\n"; - return std::move(JSON::Make(JSON::Class::String)); - } - } - offset += 4; - } break; - default: - val += '\\'; - break; - } - } else { - val += c; - } - } - ++offset; - String = val; - return std::move(String); -} - -JSON parse_number(const string& str, size_t& offset) { // NOLINT - JSON Number; - string val, exp_str; - char c; - bool isDouble = false; - int exp = 0; - while (true) { - c = str[offset++]; - if ((c == '-') || (c >= '0' && c <= '9')) { - val += c; - } else if (c == '.') { - val += c; - isDouble = true; - } else { - break; - } - } - if (c == 'E' || c == 'e') { - c = str[offset++]; - if (c == '-') { - ++offset; - exp_str += '-'; - } - while (true) { - c = str[offset++]; - if (c >= '0' && c <= '9') { - exp_str += c; - } else if (!isspace(c) && c != ',' && c != ']' && c != '}') { - std::cerr << "ERROR: Number: 
Expected a number for exponent, found '" - << c << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } else { - break; - } - } - exp = std::stol(exp_str); - } else if (!isspace(c) && c != ',' && c != ']' && c != '}') { - std::cerr << "ERROR: Number: unexpected character '" << c << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } - --offset; - - if (isDouble) { - Number = std::stod(val) * std::pow(10, exp); - } else { - if (!exp_str.empty()) - Number = std::stol(val) * std::pow(10, exp); - else - Number = std::stol(val); - } - return std::move(Number); -} - -JSON parse_bool(const string& str, size_t& offset) { // NOLINT - JSON Bool; - if (str.substr(offset, 4) == "true") { - Bool = true; - } else if (str.substr(offset, 5) == "false") { - Bool = false; - } else { - std::cerr << "ERROR: Bool: Expected 'true' or 'false', found '" - << str.substr(offset, 5) << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } - offset += (Bool.ToBool() ? 4 : 5); - return std::move(Bool); -} - -JSON parse_null(const string& str, size_t& offset) { // NOLINT - JSON Null; - if (str.substr(offset, 4) != "null") { - std::cerr << "ERROR: Null: Expected 'null', found '" - << str.substr(offset, 4) << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } - offset += 4; - return std::move(Null); -} - -JSON parse_next(const string& str, size_t& offset) { // NOLINT - char value; - consume_ws(str, offset); - value = str[offset]; - switch (value) { - case '[': - return std::move(parse_array(str, offset)); - case '{': - return std::move(parse_object(str, offset)); - case '\"': - return std::move(parse_string(str, offset)); - case 't': - case 'f': - return std::move(parse_bool(str, offset)); - case 'n': - return std::move(parse_null(str, offset)); - default: - if ((value <= '9' && value >= '0') || value == '-') - return std::move(parse_number(str, offset)); - } - std::cerr << "ERROR: Parse: Unknown starting character '" << value << "'\n"; - return JSON(); -} -} // namespace - -JSON JSON::Load(const string& str) { - size_t offset = 0; - return std::move(parse_next(str, offset)); -} - -} // namespace json - -#endif // UTILS_JSON_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/log.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/log.h deleted file mode 100644 index c2bf03f261a8711f74da819d80d68e8eb9fb124a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/log.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_LOG_H_ -#define UTILS_LOG_H_ - -// Because openfst is a dynamic library compiled with gflags/glog, we must use -// the gflags/glog from openfst to avoid them linked both statically and -// dynamically into the executable. 
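For reference, the `json.h` removed above vendors the SimpleJSON parser (github.com/nbsdx/SimpleJSON) into the runtime; parse errors are reported on `std::cerr` and yield a null or empty node rather than throwing, as the `parse_*` helpers show. A minimal round-trip sketch, illustrative only, assuming the deleted header were still available as `utils/json.h`:

```cpp
// Load, read, modify, and re-serialize a small JSON document.
#include <iostream>

#include "utils/json.h"

int main() {
  json::JSON obj = json::JSON::Load("{\"nbest\": 3, \"text\": \"hello\"}");
  std::cout << obj["text"].ToString() << " / nbest=" << obj["nbest"].ToInt() << std::endl;

  obj["rescored"] = true;              // assignment converts the node to Boolean
  std::cout << obj.dump() << std::endl;  // serializes the object back to JSON text
  return 0;
}
```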
-#include "fst/log.h" - -#endif // UTILS_LOG_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/string.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/string.cc deleted file mode 100644 index 1ab93adf3cac1bc5a42c0b8c6cadbde399678fef..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/string.cc +++ /dev/null @@ -1,195 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "utils/string.h" - -#include -#include -#include - -#include "utils/log.h" -#include "utils/utils.h" - -namespace wenet { - -void SplitString(const std::string& str, std::vector* strs) { - SplitStringToVector(Trim(str), " \t", true, strs); -} - -void SplitStringToVector(const std::string& full, const char* delim, - bool omit_empty_strings, - std::vector* out) { - size_t start = 0, found = 0, end = full.size(); - out->clear(); - while (found != std::string::npos) { - found = full.find_first_of(delim, start); - // start != end condition is for when the delimiter is at the end - if (!omit_empty_strings || (found != start && start != end)) - out->push_back(full.substr(start, found - start)); - start = found + 1; - } -} - -void SplitUTF8StringToChars(const std::string& str, - std::vector* chars) { - chars->clear(); - int bytes = 1; - for (size_t i = 0; i < str.length(); i += bytes) { - assert((str[i] & 0xF8) <= 0xF0); - if ((str[i] & 0x80) == 0x00) { - // The first 128 characters (US-ASCII) in UTF-8 format only need one byte. - bytes = 1; - } else if ((str[i] & 0xE0) == 0xC0) { - // The next 1,920 characters need two bytes to encode, - // which covers the remainder of almost all Latin-script alphabets. - bytes = 2; - } else if ((str[i] & 0xF0) == 0xE0) { - // Three bytes are needed for characters in the rest of - // the Basic Multilingual Plane, which contains virtually all characters - // in common use, including most Chinese, Japanese and Korean characters. - bytes = 3; - } else if ((str[i] & 0xF8) == 0xF0) { - // Four bytes are needed for characters in the other planes of Unicode, - // which include less common CJK characters, various historic scripts, - // mathematical symbols, and emoji (pictographic symbols). 
- bytes = 4; - } - chars->push_back(str.substr(i, bytes)); - } -} - -int UTF8StringLength(const std::string& str) { - int len = 0; - int bytes = 1; - for (size_t i = 0; i < str.length(); i += bytes) { - if ((str[i] & 0x80) == 0x00) { - bytes = 1; - } else if ((str[i] & 0xE0) == 0xC0) { - bytes = 2; - } else if ((str[i] & 0xF0) == 0xE0) { - bytes = 3; - } else if ((str[i] & 0xF8) == 0xF0) { - bytes = 4; - } - ++len; - } - return len; -} - -bool CheckEnglishChar(const std::string& ch) { - // all english characters should be encoded in one byte - if (ch.size() != 1) return false; - // english words may contain apostrophe, i.e., "He's" - return isalpha(ch[0]) || ch[0] == '\''; -} - -bool CheckEnglishWord(const std::string& word) { - std::vector chars; - SplitUTF8StringToChars(word, &chars); - for (size_t k = 0; k < chars.size(); k++) { - if (!CheckEnglishChar(chars[k])) { - return false; - } - } - return true; -} - -std::string JoinString(const std::string& c, - const std::vector& strs) { - std::string result; - if (strs.size() > 0) { - for (int i = 0; i < strs.size() - 1; i++) { - result += (strs[i] + c); - } - result += strs.back(); - } - return result; -} - -bool IsAlpha(const std::string& str) { - for (size_t i = 0; i < str.size(); i++) { - if (!isalpha(str[i])) { - return false; - } - } - return true; -} - -std::string ProcessBlank(const std::string& str, bool lowercase) { - std::string result; - if (!str.empty()) { - std::vector chars; - SplitUTF8StringToChars(Trim(str), &chars); - - for (std::string& ch : chars) { - if (ch != kSpaceSymbol) { - result.append(ch); - } else { - // Ignore consecutive space or located in head - if (!result.empty() && result.back() != ' ') { - result.push_back(' '); - } - } - } - // Ignore tailing space - if (!result.empty() && result.back() == ' ') { - result.pop_back(); - } - // NOTE: convert string to wstring - // see issue 745: https://github.com/wenet-e2e/wenet/issues/745 - std::locale loc(""); - std::wstring_convert, wchar_t> converter; - std::wstring wsresult = converter.from_bytes(result); - for (auto& c : wsresult) { - c = lowercase ? tolower(c, loc) : toupper(c, loc); - } - result = converter.to_bytes(wsresult); - } - return result; -} - -std::string Ltrim(const std::string& str) { - size_t start = str.find_first_not_of(WHITESPACE); - return (start == std::string::npos) ? "" : str.substr(start); -} - -std::string Rtrim(const std::string& str) { - size_t end = str.find_last_not_of(WHITESPACE); - return (end == std::string::npos) ? 
"" : str.substr(0, end + 1); -} - -std::string Trim(const std::string& str) { return Rtrim(Ltrim(str)); } - -std::string JoinPath(const std::string& left, const std::string& right) { - std::string path(left); - if (path.size() && path.back() != '/') { - path.push_back('/'); - } - path.append(right); - return path; -} - -#ifdef _MSC_VER -std::wstring ToWString(const std::string& str) { - unsigned len = str.size() * 2; - setlocale(LC_CTYPE, ""); - wchar_t* p = new wchar_t[len]; - mbstowcs(p, str.c_str(), len); - std::wstring wstr(p); - delete[] p; - return wstr; -} -#endif - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/string.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/string.h deleted file mode 100644 index bf7a52ae09bce45ab7e34a5277652d7ae91bae1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/string.h +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_STRING_H_ -#define UTILS_STRING_H_ - -#include -#include -#include -#include -#include - -#include "fst/symbol-table.h" - -namespace wenet { - -const char WHITESPACE[] = " \n\r\t\f\v"; - -// Split the string with space or tab. -void SplitString(const std::string& str, std::vector* strs); - -void SplitStringToVector(const std::string& full, const char* delim, - bool omit_empty_strings, - std::vector* out); - -// NOTE(Xingchen Song): we add this function to make it possible to -// support multilingual recipe in the future, in which characters of -// different languages are all encoded in UTF-8 format. -// UTF-8 REF: https://en.wikipedia.org/wiki/UTF-8#Encoding -// Split the UTF-8 string into chars. -void SplitUTF8StringToChars(const std::string& str, - std::vector* chars); - -int UTF8StringLength(const std::string& str); - -// Check whether the UTF-8 char is alphabet or '. -bool CheckEnglishChar(const std::string& ch); - -// Check whether the UTF-8 word is only contains alphabet or '. -bool CheckEnglishWord(const std::string& word); - -std::string JoinString(const std::string& c, - const std::vector& strs); - -bool IsAlpha(const std::string& str); - -// Split the UTF-8 string into words by symbol table. -// Return whether not contains oov. -bool SplitUTF8StringToWords( - const std::string& str, - const std::shared_ptr& symbol_table, - std::vector* words); - -// Replace ▁ with space, then remove head, tail and consecutive space. 
-std::string ProcessBlank(const std::string& str, bool lowercase); - -std::string Ltrim(const std::string& str); - -std::string Rtrim(const std::string& str); - -std::string Trim(const std::string& str); - -std::string JoinPath(const std::string& left, const std::string& right); - -#ifdef _MSC_VER -std::wstring ToWString(const std::string& str); -#endif - -} // namespace wenet - -#endif // UTILS_STRING_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/thread_pool.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/thread_pool.h deleted file mode 100644 index a78162995d90bf079ad091cf14cb9f2cd4476d05..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/thread_pool.h +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright (c) 2012 Jakob Progsch, Václav Zeman - -// This software is provided 'as-is', without any express or implied -// warranty. In no event will the authors be held liable for any damages -// arising from the use of this software. - -// Permission is granted to anyone to use this software for any purpose, -// including commercial applications, and to alter it and redistribute it -// freely, subject to the following restrictions: - -// 1. The origin of this software must not be misrepresented; you must not -// claim that you wrote the original software. If you use this software -// in a product, an acknowledgment in the product documentation would be -// appreciated but is not required. - -// 2. Altered source versions must be plainly marked as such, and must not be -// misrepresented as being the original software. - -// 3. This notice may not be removed or altered from any source -// distribution. - -#ifndef UTILS_THREAD_POOL_H_ -#define UTILS_THREAD_POOL_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -class ThreadPool { - public: - explicit ThreadPool(size_t); - template - auto enqueue(F&& f, Args&&... args) - -> std::future::type>; - ~ThreadPool(); - - private: - // need to keep track of threads so we can join them - std::vector workers; - // the task queue - std::queue > tasks; - - // synchronization - std::mutex queue_mutex; - std::condition_variable condition; - bool stop; -}; - -// the constructor just launches some amount of workers -inline ThreadPool::ThreadPool(size_t threads) : stop(false) { - for (size_t i = 0; i < threads; ++i) - workers.emplace_back([this] { - for (;;) { - std::function task; - - { - std::unique_lock lock(this->queue_mutex); - this->condition.wait( - lock, [this] { return this->stop || !this->tasks.empty(); }); - if (this->stop && this->tasks.empty()) return; - task = std::move(this->tasks.front()); - this->tasks.pop(); - } - - task(); - } - }); -} - -// add new work item to the pool -template -auto ThreadPool::enqueue(F&& f, Args&&... 
args) - -> std::future::type> { - using return_type = typename std::result_of::type; - - auto task = std::make_shared >( - std::bind(std::forward(f), std::forward(args)...)); - - std::future res = task->get_future(); - { - std::unique_lock lock(queue_mutex); - - // don't allow enqueueing after stopping the pool - if (stop) { - throw std::runtime_error("enqueue on stopped ThreadPool"); - } - - tasks.emplace([task]() { (*task)(); }); - } - condition.notify_one(); - return res; -} - -// the destructor joins all threads -inline ThreadPool::~ThreadPool() { - { - std::unique_lock lock(queue_mutex); - stop = true; - } - condition.notify_all(); - for (std::thread& worker : workers) { - worker.join(); - } -} - -#endif // UTILS_THREAD_POOL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/timer.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/timer.h deleted file mode 100644 index 068519f98d140ba0eef68babe2ad2fdcb798c074..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/timer.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_TIMER_H_ -#define UTILS_TIMER_H_ - -#include - -namespace wenet { - -class Timer { - public: - Timer() : time_start_(std::chrono::steady_clock::now()) {} - void Reset() { time_start_ = std::chrono::steady_clock::now(); } - // return int in milliseconds - int Elapsed() const { - auto time_now = std::chrono::steady_clock::now(); - return std::chrono::duration_cast(time_now - - time_start_) - .count(); - } - - private: - std::chrono::time_point time_start_; -}; -} // namespace wenet - -#endif // UTILS_TIMER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/utils.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/utils.cc deleted file mode 100644 index c37e36c6e9f629e0a4b11cf21a791aefd58b659f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/utils.cc +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
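For reference, the `ThreadPool` removed above is the well-known Progsch/Zeman pool: `enqueue` wraps a callable in a `std::packaged_task`, returns its `std::future`, and a fixed set of worker threads drains the task queue until the destructor sets `stop` and joins them. Combined with `wenet::Timer` from the deleted `timer.h`, a minimal sketch (illustrative only, assuming both headers were still present):

```cpp
// Run a batch of tasks on the pool and time them.
#include <future>
#include <iostream>
#include <vector>

#include "utils/thread_pool.h"
#include "utils/timer.h"

int main() {
  ThreadPool pool(4);   // four worker threads
  wenet::Timer timer;   // starts at construction

  std::vector<std::future<int>> results;
  for (int i = 0; i < 8; ++i) {
    results.emplace_back(pool.enqueue([i] { return i * i; }));
  }

  int sum = 0;
  for (auto& r : results) {
    sum += r.get();     // blocks until the corresponding task has run
  }
  std::cout << "sum=" << sum << ", elapsed=" << timer.Elapsed() << "ms" << std::endl;
  return 0;
}
```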
- -#include "utils/utils.h" - -#include -#include -#include -#include -#include -#include - -#include "utils/log.h" - -namespace wenet { - -float LogAdd(float x, float y) { - static float num_min = -std::numeric_limits::max(); - if (x <= num_min) return y; - if (y <= num_min) return x; - float xmax = std::max(x, y); - return std::log(std::exp(x - xmax) + std::exp(y - xmax)) + xmax; -} - -template -struct ValueComp { - bool operator()(const std::pair& lhs, - const std::pair& rhs) const { - return lhs.first > rhs.first || - (lhs.first == rhs.first && lhs.second < rhs.second); - } -}; - -// We refer the pytorch topk implementation -// https://github.com/pytorch/pytorch/blob/master/caffe2/operators/top_k.cc -template -void TopK(const std::vector& data, int32_t k, std::vector* values, - std::vector* indices) { - std::vector> heap_data; - int n = data.size(); - for (int32_t i = 0; i < k && i < n; ++i) { - heap_data.emplace_back(data[i], i); - } - std::priority_queue, std::vector>, - ValueComp> - pq(ValueComp(), std::move(heap_data)); - for (int32_t i = k; i < n; ++i) { - if (pq.top().first < data[i]) { - pq.pop(); - pq.emplace(data[i], i); - } - } - - values->resize(std::min(k, n)); - indices->resize(std::min(k, n)); - int32_t cur = values->size() - 1; - while (!pq.empty()) { - const auto& item = pq.top(); - (*values)[cur] = item.first; - (*indices)[cur] = item.second; - pq.pop(); - cur -= 1; - } -} - -template void TopK(const std::vector& data, int32_t k, - std::vector* values, - std::vector* indices); - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/utils.h deleted file mode 100644 index f9957c0b6e8ae27d9260e75cf55e786055827801..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/ios/utils/utils.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef UTILS_UTILS_H_ -#define UTILS_UTILS_H_ - -#include -#include -#include - -namespace wenet { - -#define WENET_DISALLOW_COPY_AND_ASSIGN(Type) \ - Type(const Type&) = delete; \ - Type& operator=(const Type&) = delete; - -const float kFloatMax = std::numeric_limits::max(); -// kSpaceSymbol in UTF-8 is: ▁ -const char kSpaceSymbol[] = "\xe2\x96\x81"; - -// Return the sum of two probabilities in log scale -float LogAdd(float x, float y); - -template -void TopK(const std::vector& data, int32_t k, std::vector* values, - std::vector* indices); - -} // namespace wenet - -#endif // UTILS_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/CMakeLists.txt deleted file mode 100644 index 71628eb7f5e12b00f6c52b390f140e37465de43b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/CMakeLists.txt +++ /dev/null @@ -1,66 +0,0 @@ -cmake_minimum_required(VERSION 3.14 FATAL_ERROR) - -project(wenet VERSION 0.1) - -option(CXX11_ABI "whether to use CXX11_ABI libtorch" OFF) -option(GRAPH_TOOLS "whether to build TLG graph tools" OFF) -option(BUILD_TESTING "whether to build unit test" OFF) - -option(GRPC "whether to build with gRPC" OFF) -# TODO(Binbin Zhang): Change websocket to OFF since it depends on boost -# which is a very big library -option(WEBSOCKET "whether to build with websocket" OFF) -option(XPU "whether to build with XPU" ON) - -set(CMAKE_VERBOSE_MAKEFILE OFF) - -include(FetchContent) -set(FETCHCONTENT_QUIET OFF) -get_filename_component(fc_base "fc_base" REALPATH BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") -set(FETCHCONTENT_BASE_DIR ${fc_base}) - -list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -pthread -fPIC") - -# Include all dependency -include(openfst) -# This CMakeLists.txt is only used for kunlun xpu, so remove the contents -# about onnx, libtorch, gpu and windows. 
-include(xpu) -# Compile xpu_conformer.a and conformer_test -add_subdirectory(xpu) - -include_directories( - ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/kaldi -) - -# Build all libraries -add_subdirectory(utils) -add_subdirectory(frontend) -add_subdirectory(post_processor) -add_subdirectory(kaldi) # kaldi: wfst based decoder -add_subdirectory(decoder) -add_subdirectory(api) - -# Optionally, you can build with websocket -if(WEBSOCKET) - include(boost) - add_subdirectory(websocket) -endif() - -# Optionally, you can build with gRPC -if(GRPC) - include(grpc) - add_subdirectory(grpc) -endif() - -# Build all bins -add_subdirectory(bin) - -# Unit Test -if(BUILD_TESTING) - include(gtest) - add_subdirectory(test) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/README.md b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/README.md deleted file mode 100644 index 2e096b796a603571b79372ecc8955a2981f2913e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/README.md +++ /dev/null @@ -1,83 +0,0 @@ -# 在昆仑芯片上运行Wenet -## 介绍 -下面的示例展示了如何在XPU上部署WeNet离线或在线的ASR模型。XPU是一种由昆仑芯100%自主研发的通用人工智能计算核心架构。 - -## 准备XPU运行环境 - -在开始之前,请确认您获得以下必须的环境。 - - XRE(XPU Runtime Environment):昆仑芯片的基础运行环境,包括芯片驱动程序、runtime api库、固件FW工具等功能模块。 - XDNN(XPU Deep Neural Network Library):加速深度神经网络的昆仑芯片库,提供应用程序中使用的高性能DNN功能库。 - -如果您需要任何帮助,或是想要进一步了解昆仑芯片,请通过官方网址联系我们: -https://www.kunlunxin.com.cn/ - -## 操作步骤 -- 第一步:构建,需要cmake 3.14及以上版本 - -``` sh -export CXX=${your_g++_path} -export CC=${your_gcc_path} -export XPU_API_PATH=${your_api_path} - -# -r : release version; -d : debug version -bash ./compile.sh -r -``` - -- 第二步:测试,测试结果将在控制台输出 - -``` sh -## set KUNLUN XPU visible device -export XPU_VISIBLE_DEVICES=0 -export XPUSIM_DEVICE_MODEL=KUNLUN2 -## set logging level -export GLOG_logtostderr=1 -export GLOG_v=3 -## set speech wav and model/weight path -wav_path=${your_test_wav_path} -xpu_model_dir=${your_xpu_weight_dir} -units=${your_units.txt} -## executive command -./build/bin/decoder_main \ - --chunk_size -1 \ - --wav_path ${wav_path} \ - --xpu_model_dir ${xpu_model_di} \ - --unit_path ${units} \ - --device_id 0 \ - --nbest 3 2>&1 | tee log.txt -``` - -单条语音执行结果如下所示: - -``` sh -XPURT /docker_workspace/icode-api/baidu/xpu/api/../runtime/output/so/libxpurt.so loaded -I1027 06:06:21.933722 111767 params.h:152] Reading XPU WeNet model weight from /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/ -I1027 06:06:21.934103 111767 xpu_asr_model.cc:46] XPU weight_dir is: /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data//model_weights/ -I1027 06:06:23.832731 111767 xpu_asr_model.cc:65] ======= XPU Kunlun Model Info: ======= -I1027 06:06:23.832749 111767 xpu_asr_model.cc:66] subsampling_rate 4 -I1027 06:06:23.832777 111767 xpu_asr_model.cc:67] right_context 6 -I1027 06:06:23.832789 111767 xpu_asr_model.cc:68] sos 5538 -I1027 06:06:23.832795 111767 xpu_asr_model.cc:69] eos 5538 -I1027 06:06:23.832799 111767 xpu_asr_model.cc:70] is bidirectional decoder 1 -I1027 06:06:23.832804 111767 params.h:165] Reading unit table /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/dict -I1027 06:06:23.843475 111776 decoder_main.cc:54] num frames 418 -I1027 06:06:23.843521 111776 asr_decoder.cc:104] Required 2147483647 get 418 -I1027 06:06:23.843528 111776 xpu_asr_model.cc:116] Now Use XPU:0! 
-I1027 06:06:23.843616 111776 xpu_asr_model.cc:173] max_seqlen is 418 -I1027 06:06:23.843619 111776 xpu_asr_model.cc:174] q_seqlen is 103 -I1027 06:06:23.843623 111776 xpu_asr_model.cc:175] att_dim is 512 -I1027 06:06:23.843626 111776 xpu_asr_model.cc:176] ctc_dim is 5538 -I1027 06:06:23.852284 111776 asr_decoder.cc:113] forward takes 7 ms, search takes 1 ms -I1027 06:06:23.852383 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况 -I1027 06:06:23.852530 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况 -I1027 06:06:23.852537 111776 xpu_asr_model.cc:248] num_hyps is 3 -I1027 06:06:23.852541 111776 xpu_asr_model.cc:249] beam_size is 3 -I1027 06:06:23.852545 111776 xpu_asr_model.cc:250] new_bs is 3 -I1027 06:06:23.852545 111776 xpu_asr_model.cc:251] max_hyps_len is 14 -I1027 06:06:23.853902 111776 asr_decoder.cc:84] Rescoring cost latency: 1ms. -I1027 06:06:23.853911 111776 decoder_main.cc:72] Partial result: 甚至出现交易几乎停滞的情况 -I1027 06:06:23.853914 111776 decoder_main.cc:104] test Final result: 甚至出现交易几乎停滞的情况 -I1027 06:06:23.853924 111776 decoder_main.cc:105] Decoded 4203ms audio taken 10ms. -test 甚至出现交易几乎停滞的情况 -I1027 06:06:23.853984 111767 decoder_main.cc:180] Total: decoded 4203ms audio taken 10ms. -``` diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/README_EN.md b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/README_EN.md deleted file mode 100644 index ff78792f995e8c7074339c1b4b9fb9439fa18de7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/README_EN.md +++ /dev/null @@ -1,87 +0,0 @@ -# WeNet running on KUNLUNXIN XPU device -## Introduction -The below example shows how to deploy WeNet offline and online ASR models on XPUs. -XPU is a core architecture 100% independently developed by KUNLUNXIN for general artificial intelligence computing. - -## Setup environment for XPU device - -Before the start, makesure you have these necessary environment - - XRE(XPU Runtime Environment):The basic operating environment of the XPUs - includes functional modules such as chip drivers, runtime api library, and firmware tools. - - XDNN(XPU Deep Neural Network Library): XPU library for accelerating deep neural networks, providing high-performance DNN function library used in applications. - -If you would like to know more about XPUs or need any help, please contact us through the official website: - -https://www.kunlunxin.com.cn/ - -## Instruction -- Step 1. Build, the build requires cmake 3.14 or above. - -``` sh -export CXX=${your_g++_path} -export CC=${your_gcc_path} -export XPU_API_PATH=${your_api_path} - -# -r : release version; -d : debug version -bash ./compile.sh -r -``` - -- Step 2. Testing, the result is shown in the console. 
- -``` sh -## set KUNLUN XPU visible device -export XPU_VISIBLE_DEVICES=0 -export XPUSIM_DEVICE_MODEL=KUNLUN2 -## set logging level -export GLOG_logtostderr=1 -export GLOG_v=3 -## set speech wav and model/weight/units path -wav_path=${your_test_wav_path} -xpu_model_dir=${your_xpu_weight_dir} -units=${your_units.txt} -## executive command -./build/bin/decoder_main \ - --chunk_size -1 \ - --wav_path $wav_path \ - --xpu_model_dir $xpu_model_dir \ - --unit_path $units \ - --device_id 0 \ - --nbest 3 2>&1 | tee log.txt -``` - -A typical output result is as following: - -``` sh -XPURT /docker_workspace/icode-api/baidu/xpu/api/../runtime/output/so/libxpurt.so loaded -I1027 06:06:21.933722 111767 params.h:152] Reading XPU WeNet model weight from /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/ -I1027 06:06:21.934103 111767 xpu_asr_model.cc:46] XPU weight_dir is: /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data//model_weights/ -I1027 06:06:23.832731 111767 xpu_asr_model.cc:65] ======= XPU Kunlun Model Info: ======= -I1027 06:06:23.832749 111767 xpu_asr_model.cc:66] subsampling_rate 4 -I1027 06:06:23.832777 111767 xpu_asr_model.cc:67] right_context 6 -I1027 06:06:23.832789 111767 xpu_asr_model.cc:68] sos 5538 -I1027 06:06:23.832795 111767 xpu_asr_model.cc:69] eos 5538 -I1027 06:06:23.832799 111767 xpu_asr_model.cc:70] is bidirectional decoder 1 -I1027 06:06:23.832804 111767 params.h:165] Reading unit table /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/dict -I1027 06:06:23.843475 111776 decoder_main.cc:54] num frames 418 -I1027 06:06:23.843521 111776 asr_decoder.cc:104] Required 2147483647 get 418 -I1027 06:06:23.843528 111776 xpu_asr_model.cc:116] Now Use XPU:0! -I1027 06:06:23.843616 111776 xpu_asr_model.cc:173] max_seqlen is 418 -I1027 06:06:23.843619 111776 xpu_asr_model.cc:174] q_seqlen is 103 -I1027 06:06:23.843623 111776 xpu_asr_model.cc:175] att_dim is 512 -I1027 06:06:23.843626 111776 xpu_asr_model.cc:176] ctc_dim is 5538 -I1027 06:06:23.852284 111776 asr_decoder.cc:113] forward takes 7 ms, search takes 1 ms -I1027 06:06:23.852383 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况 -I1027 06:06:23.852530 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况 -I1027 06:06:23.852537 111776 xpu_asr_model.cc:248] num_hyps is 3 -I1027 06:06:23.852541 111776 xpu_asr_model.cc:249] beam_size is 3 -I1027 06:06:23.852545 111776 xpu_asr_model.cc:250] new_bs is 3 -I1027 06:06:23.852545 111776 xpu_asr_model.cc:251] max_hyps_len is 14 -I1027 06:06:23.853902 111776 asr_decoder.cc:84] Rescoring cost latency: 1ms. -I1027 06:06:23.853911 111776 decoder_main.cc:72] Partial result: 甚至出现交易几乎停滞的情况 -I1027 06:06:23.853914 111776 decoder_main.cc:104] test Final result: 甚至出现交易几乎停滞的情况 -I1027 06:06:23.853924 111776 decoder_main.cc:105] Decoded 4203ms audio taken 10ms. -test 甚至出现交易几乎停滞的情况 -I1027 06:06:23.853984 111767 decoder_main.cc:180] Total: decoded 4203ms audio taken 10ms. 
-``` diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/api/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/api/CMakeLists.txt deleted file mode 100644 index 8d61ca8477f0f0b6128f1effe0a2738494b2620f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/api/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -if(TORCH) - add_library(wenet_api SHARED wenet_api.cc) - target_link_libraries(wenet_api PUBLIC decoder) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/api/README.md b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/api/README.md deleted file mode 100644 index 5eaa13b977eb4836eb930452f4434dc9f2ea4139..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/api/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# WeNet API - -We refer [vosk](https://github.com/alphacep/vosk-api/blob/master/src/vosk_api.h) -for the interface design. - - -We are going to implement the following interfaces: - -- [x] non-streaming recognition -- [] streaming recognition -- [] nbest -- [] contextual biasing word -- [] alignment -- [] language support(post processor) -- [] label check diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/api/wenet_api.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/api/wenet_api.cc deleted file mode 100644 index cb1e0c8552e0126e2db274a29075578fe351a25f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/api/wenet_api.cc +++ /dev/null @@ -1,245 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "api/wenet_api.h" - -#include -#include -#include - -#include "decoder/asr_decoder.h" -#include "decoder/torch_asr_model.h" -#include "post_processor/post_processor.h" -#include "utils/file.h" -#include "utils/json.h" -#include "utils/string.h" - -class Recognizer { - public: - explicit Recognizer(const std::string& model_dir) { - // FeaturePipeline init - feature_config_ = std::make_shared(80, 16000); - feature_pipeline_ = - std::make_shared(*feature_config_); - // Resource init - resource_ = std::make_shared(); - wenet::TorchAsrModel::InitEngineThreads(); - std::string model_path = wenet::JoinPath(model_dir, "final.zip"); - CHECK(wenet::FileExists(model_path)); - - auto model = std::make_shared(); - model->Read(model_path); - resource_->model = model; - - // units.txt: E2E model unit - std::string unit_path = wenet::JoinPath(model_dir, "units.txt"); - CHECK(wenet::FileExists(unit_path)); - resource_->unit_table = std::shared_ptr( - fst::SymbolTable::ReadText(unit_path)); - - std::string fst_path = wenet::JoinPath(model_dir, "TLG.fst"); - if (wenet::FileExists(fst_path)) { // With LM - resource_->fst = std::shared_ptr>( - fst::Fst::Read(fst_path)); - - std::string symbol_path = wenet::JoinPath(model_dir, "words.txt"); - CHECK(wenet::FileExists(symbol_path)); - resource_->symbol_table = std::shared_ptr( - fst::SymbolTable::ReadText(symbol_path)); - } else { // Without LM, symbol_table is the same as unit_table - resource_->symbol_table = resource_->unit_table; - } - - // Context config init - context_config_ = std::make_shared(); - decode_options_ = std::make_shared(); - post_process_opts_ = std::make_shared(); - } - - void Reset() { - if (feature_pipeline_ != nullptr) { - feature_pipeline_->Reset(); - } - if (decoder_ != nullptr) { - decoder_->Reset(); - } - result_.clear(); - } - - void InitDecoder() { - CHECK(decoder_ == nullptr); - // Optional init context graph - if (context_.size() > 0) { - context_config_->context_score = context_score_; - auto context_graph = - std::make_shared(*context_config_); - context_graph->BuildContextGraph(context_, resource_->symbol_table); - resource_->context_graph = context_graph; - } - // PostProcessor - if (language_ == "chs") { // TODO(Binbin Zhang): CJK(chs, jp, kr) - post_process_opts_->language_type = wenet::kMandarinEnglish; - } else { - post_process_opts_->language_type = wenet::kIndoEuropean; - } - resource_->post_processor = - std::make_shared(*post_process_opts_); - // Init decoder - decoder_ = std::make_shared(feature_pipeline_, resource_, - *decode_options_); - } - - void Decode(const char* data, int len, int last) { - using wenet::DecodeState; - // Init decoder when it is called first time - if (decoder_ == nullptr) { - InitDecoder(); - } - // Convert to 16 bits PCM data to float - CHECK_EQ(len % 2, 0); - feature_pipeline_->AcceptWaveform(reinterpret_cast(data), - len / 2); - if (last > 0) { - feature_pipeline_->set_input_finished(); - } - - while (true) { - DecodeState state = decoder_->Decode(false); - if (state == DecodeState::kWaitFeats) { - break; - } else if (state == DecodeState::kEndFeats) { - decoder_->Rescoring(); - UpdateResult(true); - break; - } else if (state == DecodeState::kEndpoint && continuous_decoding_) { - decoder_->Rescoring(); - UpdateResult(true); - decoder_->ResetContinuousDecoding(); - } else { // kEndBatch - UpdateResult(false); - } - } - } - - void UpdateResult(bool final_result) { - json::JSON obj; - obj["type"] = final_result ? "final_result" : "partial_result"; - int nbest = final_result ? 
-    obj["nbest"] = json::Array();
-    for (int i = 0; i < nbest && i < decoder_->result().size(); i++) {
-      json::JSON one;
-      one["sentence"] = decoder_->result()[i].sentence;
-      if (final_result && enable_timestamp_) {
-        one["word_pieces"] = json::Array();
-        for (const auto& word_piece : decoder_->result()[i].word_pieces) {
-          json::JSON piece;
-          piece["word"] = word_piece.word;
-          piece["start"] = word_piece.start;
-          piece["end"] = word_piece.end;
-          one["word_pieces"].append(piece);
-        }
-      }
-      one["sentence"] = decoder_->result()[i].sentence;
-      obj["nbest"].append(one);
-    }
-    result_ = obj.dump();
-  }
-
-  const char* GetResult() { return result_.c_str(); }
-
-  void set_nbest(int n) { nbest_ = n; }
-  void set_enable_timestamp(bool flag) { enable_timestamp_ = flag; }
-  void AddContext(const char* word) { context_.emplace_back(word); }
-  void set_context_score(float score) { context_score_ = score; }
-  void set_language(const char* lang) { language_ = lang; }
-  void set_continuous_decoding(bool flag) { continuous_decoding_ = flag; }
-
- private:
-  // NOTE(Binbin Zhang): All use shared_ptr for clone in the future
-  std::shared_ptr<wenet::FeaturePipelineConfig> feature_config_ = nullptr;
-  std::shared_ptr<wenet::FeaturePipeline> feature_pipeline_ = nullptr;
-  std::shared_ptr<wenet::DecodeResource> resource_ = nullptr;
-  std::shared_ptr<wenet::DecodeOptions> decode_options_ = nullptr;
-  std::shared_ptr<wenet::AsrDecoder> decoder_ = nullptr;
-  std::shared_ptr<wenet::ContextConfig> context_config_ = nullptr;
-  std::shared_ptr<wenet::PostProcessOptions> post_process_opts_ = nullptr;
-
-  int nbest_ = 1;
-  std::string result_;
-  bool enable_timestamp_ = false;
-  std::vector<std::string> context_;
-  float context_score_;
-  std::string language_ = "chs";
-  bool continuous_decoding_ = false;
-};
-
-void* wenet_init(const char* model_dir) {
-  Recognizer* decoder = new Recognizer(model_dir);
-  return reinterpret_cast<void*>(decoder);
-}
-
-void wenet_free(void* decoder) {
-  delete reinterpret_cast<Recognizer*>(decoder);
-}
-
-void wenet_reset(void* decoder) {
-  Recognizer* recognizer = reinterpret_cast<Recognizer*>(decoder);
-  recognizer->Reset();
-}
-
-void wenet_decode(void* decoder, const char* data, int len, int last) {
-  Recognizer* recognizer = reinterpret_cast<Recognizer*>(decoder);
-  recognizer->Decode(data, len, last);
-}
-
-const char* wenet_get_result(void* decoder) {
-  Recognizer* recognizer = reinterpret_cast<Recognizer*>(decoder);
-  return recognizer->GetResult();
-}
-
-void wenet_set_log_level(int level) {
-  FLAGS_logtostderr = true;
-  FLAGS_v = level;
-}
-
-void wenet_set_nbest(void* decoder, int n) {
-  Recognizer* recognizer = reinterpret_cast<Recognizer*>(decoder);
-  recognizer->set_nbest(n);
-}
-
-void wenet_set_timestamp(void* decoder, int flag) {
-  Recognizer* recognizer = reinterpret_cast<Recognizer*>(decoder);
-  bool enable = flag > 0 ?
true : false; - recognizer->set_enable_timestamp(enable); -} - -void wenet_add_context(void* decoder, const char* word) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->AddContext(word); -} - -void wenet_set_context_score(void* decoder, float score) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->set_context_score(score); -} - -void wenet_set_language(void* decoder, const char* lang) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->set_language(lang); -} - -void wenet_set_continuous_decoding(void* decoder, int flag) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->set_continuous_decoding(flag > 0); -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/api/wenet_api.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/api/wenet_api.h deleted file mode 100644 index e839aaa40166a6e50d9aa2ac0e697356bd25b941..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/api/wenet_api.h +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef API_WENET_API_H_ -#define API_WENET_API_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -/** Init decoder from the file and returns the object - * - * @param model_dir: the model dir - * @returns model object or NULL if problem occured - */ -void* wenet_init(const char* model_dir); - -/** Free wenet decoder and corresponding resource - */ -void wenet_free(void* decoder); - -/** Reset decoder for next decoding - */ -void wenet_reset(void* decoder); - -/** Decode the input wav data - * @param data: pcm data, encoded as int16_t(16 bits) - * @param len: data length - * @param last: if it is the last package - */ -void wenet_decode(void* decoder, const char* data, int len, int last); - -/** Get decode result in json format - * It returns partial result when last is 0 - * It returns final result when last is 1 - - { - "nbest" : [{ - "sentence" : "are you okay" - "word_pieces" : [{ - "end" : 960, - "start" : 0, - "word" : "are" - }, { - "end" : 1200, - "start" : 960, - "word" : "you" - }, { - ...}] - }, { - "sentence" : "are you ok" - }], - "type" : "final_result" - } - - "type": final_result/partial_result - "nbest": nbest is enabled when n > 1 in final_result - "sentence": the ASR result - "word_pieces": optional, output timestamp when enabled - */ -const char* wenet_get_result(void* decoder); - -/** Set n-best, range 1~10 - * wenet_get_result will return top-n best results - */ -void wenet_set_nbest(void* decoder, int n); - -/** Whether to enable word level timestamp in results - disable it when flag = 0, otherwise enable - */ -void wenet_set_timestamp(void* decoder, int flag); - -/** Add one contextual biasing - */ -void wenet_add_context(void* decoder, const char* word); - -/** Set contextual biasing bonus score - */ -void wenet_set_context_score(void* decoder, float score); - -/** 
Set language, has effect on the postpocessing - * @param: lang, could be chs/en now - */ -void wenet_set_language(void* decoder, const char* lang); - -/** Set log level - * We use glog in wenet, so the level is the glog level - */ -void wenet_set_log_level(int level); - -/** Enable continous decoding or not - * flag > 0: enable, otherwise disable - */ -void wenet_set_continuous_decoding(void* decoder, int flag); - -#ifdef __cplusplus -} -#endif - -#endif // API_WENET_API_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/CMakeLists.txt deleted file mode 100644 index a117b8bcb580c8738a7ce72f88bc10ff0a450e98..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -add_executable(decoder_main decoder_main.cc) -target_link_libraries(decoder_main PUBLIC decoder) - -add_executable(label_checker_main label_checker_main.cc) -target_link_libraries(label_checker_main PUBLIC decoder) - -# if(TORCH) -# add_executable(api_main api_main.cc) -# target_link_libraries(api_main PUBLIC wenet_api) -# endif() - -if(WEBSOCKET) - add_executable(websocket_client_main websocket_client_main.cc) - target_link_libraries(websocket_client_main PUBLIC websocket) - add_executable(websocket_server_main websocket_server_main.cc) - target_link_libraries(websocket_server_main PUBLIC websocket) -endif() - -if(GRPC) - add_executable(grpc_server_main grpc_server_main.cc) - target_link_libraries(grpc_server_main PUBLIC wenet_grpc) - add_executable(grpc_client_main grpc_client_main.cc) - target_link_libraries(grpc_client_main PUBLIC wenet_grpc) -endif() - -if(HTTP) - add_executable(http_client_main http_client_main.cc) - target_link_libraries(http_client_main PUBLIC http) - add_executable(http_server_main http_server_main.cc) - target_link_libraries(http_server_main PUBLIC http) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/api_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/api_main.cc deleted file mode 100644 index 94b20d52a7b8eee5c39a12af4e1e25324d7d880f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/api_main.cc +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "api/wenet_api.h" -#include "frontend/wav.h" -#include "utils/flags.h" - -DEFINE_string(model_dir, "", "model dir path"); -DEFINE_string(wav_path, "", "single wave path"); -DEFINE_bool(enable_timestamp, false, "enable timestamps"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - wenet_set_log_level(2); - - void* decoder = wenet_init(FLAGS_model_dir.c_str()); - wenet_set_timestamp(decoder, FLAGS_enable_timestamp == true ? 1 : 0); - wenet::WavReader wav_reader(FLAGS_wav_path); - std::vector data(wav_reader.num_samples()); - for (int i = 0; i < wav_reader.num_samples(); i++) { - data[i] = static_cast(*(wav_reader.data() + i)); - } - - for (int i = 0; i < 10; i++) { - // Return the final result when last is 1 - wenet_decode(decoder, reinterpret_cast(data.data()), - data.size() * 2, 1); - const char* result = wenet_get_result(decoder); - LOG(INFO) << i << " " << result; - wenet_reset(decoder); - } - wenet_free(decoder); - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/decoder_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/decoder_main.cc deleted file mode 100644 index b8f1dbae6b88390504cc9ce63f33dc9bd54a2d6a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/decoder_main.cc +++ /dev/null @@ -1,185 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include - -#include "decoder/params.h" -#include "frontend/wav.h" -#include "utils/flags.h" -#include "utils/string.h" -#include "utils/thread_pool.h" -#include "utils/timer.h" -#include "utils/utils.h" - -DEFINE_bool(simulate_streaming, false, "simulate streaming input"); -DEFINE_bool(output_nbest, false, "output n-best of decode result"); -DEFINE_string(wav_path, "", "single wave path"); -DEFINE_string(wav_scp, "", "input wav scp"); -DEFINE_string(result, "", "result output file"); -DEFINE_bool(continuous_decoding, false, "continuous decoding mode"); -DEFINE_int32(thread_num, 1, "num of decode thread"); -DEFINE_int32(warmup, 0, "num of warmup decode, 0 means no warmup"); - -std::shared_ptr g_decode_config; -std::shared_ptr g_feature_config; -std::shared_ptr g_decode_resource; - -std::ofstream g_result; -std::mutex g_mutex; -int g_total_waves_dur = 0; -int g_total_decode_time = 0; - -void decode(std::pair wav, bool warmup = false) { - wenet::WavReader wav_reader(wav.second); - int num_samples = wav_reader.num_samples(); - CHECK_EQ(wav_reader.sample_rate(), FLAGS_sample_rate); - - auto feature_pipeline = - std::make_shared(*g_feature_config); - feature_pipeline->AcceptWaveform(wav_reader.data(), num_samples); - feature_pipeline->set_input_finished(); - LOG(INFO) << "num frames " << feature_pipeline->num_frames(); - - wenet::AsrDecoder decoder(feature_pipeline, g_decode_resource, - *g_decode_config); - - int wave_dur = static_cast(static_cast(num_samples) / - wav_reader.sample_rate() * 1000); - int decode_time = 0; - std::string final_result; - while (true) { - wenet::Timer timer; - wenet::DecodeState state = decoder.Decode(); - if (state == wenet::DecodeState::kEndFeats) { - decoder.Rescoring(); - } - int chunk_decode_time = timer.Elapsed(); - decode_time += chunk_decode_time; - if (decoder.DecodedSomething()) { - LOG(INFO) << "Partial result: " << decoder.result()[0].sentence; - } - - if (FLAGS_continuous_decoding && state == wenet::DecodeState::kEndpoint) { - if (decoder.DecodedSomething()) { - decoder.Rescoring(); - LOG(INFO) << "Final result (continuous decoding): " - << decoder.result()[0].sentence; - final_result.append(decoder.result()[0].sentence); - } - decoder.ResetContinuousDecoding(); - } - - if (state == wenet::DecodeState::kEndFeats) { - break; - } else if (FLAGS_chunk_size > 0 && FLAGS_simulate_streaming) { - float frame_shift_in_ms = - static_cast(g_feature_config->frame_shift) / - wav_reader.sample_rate() * 1000; - auto wait_time = - decoder.num_frames_in_current_chunk() * frame_shift_in_ms - - chunk_decode_time; - if (wait_time > 0) { - LOG(INFO) << "Simulate streaming, waiting for " << wait_time << "ms"; - std::this_thread::sleep_for( - std::chrono::milliseconds(static_cast(wait_time))); - } - } - } - if (decoder.DecodedSomething()) { - final_result.append(decoder.result()[0].sentence); - } - LOG(INFO) << wav.first << " Final result: " << final_result << std::endl; - LOG(INFO) << "Decoded " << wave_dur << "ms audio taken " << decode_time - << "ms."; - - if (!warmup) { - g_mutex.lock(); - std::ostream& buffer = FLAGS_result.empty() ? 
std::cout : g_result; - if (!FLAGS_output_nbest) { - buffer << wav.first << " " << final_result << std::endl; - } else { - buffer << "wav " << wav.first << std::endl; - auto& results = decoder.result(); - for (auto& r : results) { - if (r.sentence.empty()) continue; - buffer << "candidate " << r.score << " " << r.sentence << std::endl; - } - } - g_total_waves_dur += wave_dur; - g_total_decode_time += decode_time; - g_mutex.unlock(); - } -} - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - g_decode_config = wenet::InitDecodeOptionsFromFlags(); - g_feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - g_decode_resource = wenet::InitDecodeResourceFromFlags(); - - if (FLAGS_wav_path.empty() && FLAGS_wav_scp.empty()) { - LOG(FATAL) << "Please provide the wave path or the wav scp."; - } - std::vector> waves; - if (!FLAGS_wav_path.empty()) { - waves.emplace_back(make_pair("test", FLAGS_wav_path)); - } else { - std::ifstream wav_scp(FLAGS_wav_scp); - std::string line; - while (getline(wav_scp, line)) { - std::vector strs; - wenet::SplitString(line, &strs); - CHECK_GE(strs.size(), 2); - waves.emplace_back(make_pair(strs[0], strs[1])); - } - - if (waves.empty()) { - LOG(FATAL) << "Please provide non-empty wav scp."; - } - } - - if (!FLAGS_result.empty()) { - g_result.open(FLAGS_result, std::ios::out); - } - - // Warmup - if (FLAGS_warmup > 0) { - LOG(INFO) << "Warming up..."; - { - ThreadPool pool(FLAGS_thread_num); - auto wav = waves[0]; - for (int i = 0; i < FLAGS_warmup; i++) { - pool.enqueue(decode, wav, true); - } - } - LOG(INFO) << "Warmup done."; - } - - { - ThreadPool pool(FLAGS_thread_num); - for (auto& wav : waves) { - pool.enqueue(decode, wav, false); - } - } - - LOG(INFO) << "Total: decoded " << g_total_waves_dur << "ms audio taken " - << g_total_decode_time << "ms."; - LOG(INFO) << "RTF: " << std::setprecision(4) - << static_cast(g_total_decode_time) / g_total_waves_dur; - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/grpc_client_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/grpc_client_main.cc deleted file mode 100644 index f2d226d48d3757c5f095335eff3288f5d227282b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/grpc_client_main.cc +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "frontend/wav.h" -#include "grpc/grpc_client.h" -#include "utils/flags.h" -#include "utils/timer.h" - -DEFINE_string(hostname, "127.0.0.1", "hostname of websocket server"); -DEFINE_int32(port, 10086, "port of websocket server"); -DEFINE_int32(nbest, 1, "n-best of decode result"); -DEFINE_string(wav_path, "", "test wav file path"); -DEFINE_bool(continuous_decoding, false, "continuous decoding mode"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - wenet::GrpcClient client(FLAGS_hostname, FLAGS_port, FLAGS_nbest, - FLAGS_continuous_decoding); - - wenet::WavReader wav_reader(FLAGS_wav_path); - const int sample_rate = 16000; - // Only support 16K - CHECK_EQ(wav_reader.sample_rate(), sample_rate); - const int num_samples = wav_reader.num_samples(); - std::vector pcm_data(wav_reader.data(), - wav_reader.data() + num_samples); - // Send data every 0.5 second - const float interval = 0.5; - const int sample_interval = interval * sample_rate; - for (int start = 0; start < num_samples; start += sample_interval) { - if (client.done()) { - break; - } - int end = std::min(start + sample_interval, num_samples); - // Convert to short - std::vector data; - data.reserve(end - start); - for (int j = start; j < end; j++) { - data.push_back(static_cast(pcm_data[j])); - } - // Send PCM data - client.SendBinaryData(data.data(), data.size() * sizeof(int16_t)); - VLOG(2) << "Send " << data.size() << " samples"; - std::this_thread::sleep_for( - std::chrono::milliseconds(static_cast(interval * 1000))); - } - wenet::Timer timer; - - client.Join(); - VLOG(2) << "Total latency: " << timer.Elapsed() << "ms."; - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/grpc_server_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/grpc_server_main.cc deleted file mode 100644 index b00f3cbade1ee70dadfb49829e9ca73fd50c2be2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/grpc_server_main.cc +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include - -#include "decoder/params.h" -#include "grpc/grpc_server.h" -#include "utils/log.h" - -DEFINE_int32(port, 10086, "grpc listening port"); -DEFINE_int32(workers, 4, "grpc num workers"); - -using grpc::Server; -using grpc::ServerBuilder; - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - auto decode_config = wenet::InitDecodeOptionsFromFlags(); - auto feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - auto decode_resource = wenet::InitDecodeResourceFromFlags(); - - wenet::GrpcServer service(feature_config, decode_config, decode_resource); - grpc::EnableDefaultHealthCheckService(true); - grpc::reflection::InitProtoReflectionServerBuilderPlugin(); - ServerBuilder builder; - std::string address("0.0.0.0:" + std::to_string(FLAGS_port)); - builder.AddListeningPort(address, grpc::InsecureServerCredentials()); - builder.RegisterService(&service); - builder.SetSyncServerOption(ServerBuilder::SyncServerOption::NUM_CQS, - FLAGS_workers); - std::unique_ptr server(builder.BuildAndStart()); - LOG(INFO) << "Listening at port " << FLAGS_port; - server->Wait(); - google::ShutdownGoogleLogging(); - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/http_client_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/http_client_main.cc deleted file mode 100644 index b59ee3f5f32bf08552416b183802029ac5d5afa5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/http_client_main.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2023 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "frontend/wav.h" -#include "utils/flags.h" -#include "utils/timer.h" -#include "http/http_client.h" - -DEFINE_string(hostname, "127.0.0.1", "hostname of http server"); -DEFINE_int32(port, 10086, "port of http server"); -DEFINE_int32(nbest, 1, "n-best of decode result"); -DEFINE_string(wav_path, "", "test wav file path"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - wenet::WavReader wav_reader(FLAGS_wav_path); - const int sample_rate = 16000; - // Only support 16K - CHECK_EQ(wav_reader.sample_rate(), sample_rate); - const int num_samples = wav_reader.num_samples(); - // Convert to short - std::vector data; - data.reserve(num_samples); - for (int j = 0; j < num_samples; j++) { - data.push_back(static_cast(wav_reader.data()[j])); - } - // Send data - wenet::HttpClient client(FLAGS_hostname, FLAGS_port); - client.set_nbest(FLAGS_nbest); - wenet::Timer timer; - VLOG(2) << "Send " << data.size() << " samples"; - client.SendBinaryData(data.data(), data.size() * sizeof(int16_t)); - VLOG(2) << "Total latency: " << timer.Elapsed() << "ms."; - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/http_server_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/http_server_main.cc deleted file mode 100644 index e30cf2bcdf746c2072f023e90f470ccba5467c2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/http_server_main.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2023 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "decoder/params.h" -#include "utils/log.h" -#include "http/http_server.h" - -DEFINE_int32(port, 10086, "http listening port"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - auto decode_config = wenet::InitDecodeOptionsFromFlags(); - auto feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - auto decode_resource = wenet::InitDecodeResourceFromFlags(); - - wenet::HttpServer server(FLAGS_port, feature_config, decode_config, - decode_resource); - LOG(INFO) << "Listening at port " << FLAGS_port; - server.Start(); - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/label_checker_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/label_checker_main.cc deleted file mode 100644 index e36e3d5c29a38a7ebee80606ebd8e69ae8b1eb96..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/label_checker_main.cc +++ /dev/null @@ -1,237 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include -#include -#include -#include - -#include "decoder/params.h" -#include "frontend/wav.h" -#include "utils/flags.h" -#include "utils/string.h" - -DEFINE_string(text, "", "kaldi style text input file"); -DEFINE_string(wav_scp, "", "kaldi style wav scp"); -DEFINE_double(is_penalty, 1.0, - "insertion/substitution penalty for align insertion"); -DEFINE_double(del_penalty, 1.0, "deletion penalty for align insertion"); -DEFINE_string(result, "", "result output file"); -DEFINE_string(timestamp, "", "timestamp output file"); - -namespace wenet { - -const char* kDeletion = ""; -// Is: Insertion and substitution -const char* kIsStart = ""; -const char* kIsEnd = ""; - -bool MapToLabel(const std::string& text, - std::shared_ptr symbol_table, - std::vector* labels) { - labels->clear(); - // Split label to char sequence - std::vector chars; - SplitUTF8StringToChars(text, &chars); - for (size_t i = 0; i < chars.size(); i++) { - // ▁ is special symbol for white space - std::string label = chars[i] != " " ? chars[i] : "▁"; - int id = symbol_table->Find(label); - if (id != -1) { // fst::kNoSymbol - // LOG(INFO) << label << " " << id; - labels->push_back(id); - } - } - return true; -} - -std::shared_ptr MakeSymbolTableForFst( - std::shared_ptr isymbol_table) { - LOG(INFO) << isymbol_table; - CHECK(isymbol_table != nullptr); - auto osymbol_table = std::make_shared(); - osymbol_table->AddSymbol("", 0); - CHECK_EQ(isymbol_table->Find(""), 0); - osymbol_table->AddSymbol("", 1); - for (int i = 1; i < isymbol_table->NumSymbols(); i++) { - std::string symbol = isymbol_table->Find(i); - osymbol_table->AddSymbol(symbol, i + 1); - } - osymbol_table->AddSymbol(kDeletion, isymbol_table->NumSymbols() + 1); - osymbol_table->AddSymbol(kIsStart, isymbol_table->NumSymbols() + 2); - osymbol_table->AddSymbol(kIsEnd, isymbol_table->NumSymbols() + 3); - return osymbol_table; -} - -void CompileCtcFst(std::shared_ptr symbol_table, - fst::StdVectorFst* ofst) { - ofst->DeleteStates(); - int start = ofst->AddState(); - ofst->SetStart(start); - CHECK_EQ(symbol_table->Find(""), 0); - CHECK_EQ(symbol_table->Find(""), 1); - ofst->AddArc(start, fst::StdArc(1, 0, 0.0, start)); - // Exclude kDeletion and kInsertion - for (int i = 2; i < symbol_table->NumSymbols() - 3; i++) { - int s = ofst->AddState(); - ofst->AddArc(start, fst::StdArc(i, i, 0.0, s)); - ofst->AddArc(s, fst::StdArc(i, 0, 0.0, s)); - ofst->AddArc(s, fst::StdArc(0, 0, 0.0, start)); - } - ofst->SetFinal(start, fst::StdArc::Weight::One()); - fst::ArcSort(ofst, fst::StdOLabelCompare()); -} - -void CompileAlignFst(std::vector labels, - std::shared_ptr symbol_table, - fst::StdVectorFst* ofst) { - ofst->DeleteStates(); - int deletion = symbol_table->Find(kDeletion); - int insertion_start = symbol_table->Find(kIsStart); - int insertion_end = symbol_table->Find(kIsEnd); - - int start = ofst->AddState(); - ofst->SetStart(start); - // Filler State - int filler_start = ofst->AddState(); - int filler_end = ofst->AddState(); - for (int i = 2; i < symbol_table->NumSymbols() - 3; i++) { - ofst->AddArc(filler_start, fst::StdArc(i, i, FLAGS_is_penalty, 
filler_end)); - } - ofst->AddArc(filler_end, fst::StdArc(0, 0, 0.0, filler_start)); - - int prev = start; - // Alignment path and optional filler - for (size_t i = 0; i < labels.size(); i++) { - int cur = ofst->AddState(); - // 1. Insertion or Substitution - ofst->AddArc(prev, fst::StdArc(0, insertion_start, 0.0, filler_start)); - ofst->AddArc(filler_end, fst::StdArc(0, insertion_end, 0.0, prev)); - // 2. Correct - ofst->AddArc(prev, fst::StdArc(labels[i], labels[i], 0.0, cur)); - // 3. Deletion - ofst->AddArc(prev, fst::StdArc(0, deletion, FLAGS_del_penalty, cur)); - - prev = cur; - } - // Optional add endding filler - ofst->AddArc(prev, fst::StdArc(0, insertion_start, 0.0, filler_start)); - ofst->AddArc(filler_end, fst::StdArc(0, insertion_end, 0.0, prev)); - ofst->SetFinal(prev, fst::StdArc::Weight::One()); - fst::ArcSort(ofst, fst::StdILabelCompare()); -} - -} // namespace wenet - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - auto decode_config = wenet::InitDecodeOptionsFromFlags(); - auto feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - auto decode_resource = wenet::InitDecodeResourceFromFlags(); - CHECK(decode_resource->unit_table != nullptr); - - auto wfst_symbol_table = - wenet::MakeSymbolTableForFst(decode_resource->unit_table); - // wfst_symbol_table->WriteText("fst.txt"); - // Reset symbol_table to on-the-fly generated wfst_symbol_table - decode_resource->symbol_table = wfst_symbol_table; - - // Compile ctc FST - fst::StdVectorFst ctc_fst; - wenet::CompileCtcFst(wfst_symbol_table, &ctc_fst); - // ctc_fst.Write("ctc.fst"); - - std::unordered_map wav_table; - std::ifstream wav_is(FLAGS_wav_scp); - std::string line; - while (std::getline(wav_is, line)) { - std::vector strs; - wenet::SplitString(line, &strs); - CHECK_EQ(strs.size(), 2); - wav_table[strs[0]] = strs[1]; - } - - std::ifstream text_is(FLAGS_text); - std::ofstream result_os(FLAGS_result, std::ios::out); - std::ofstream timestamp_out; - if (!FLAGS_timestamp.empty()) { - timestamp_out.open(FLAGS_timestamp, std::ios::out); - } - std::ostream& timestamp_os = - FLAGS_timestamp.empty() ? 
std::cout : timestamp_out; - - while (std::getline(text_is, line)) { - std::vector strs; - wenet::SplitString(line, &strs); - if (strs.size() < 2) continue; - std::string key = strs[0]; - LOG(INFO) << "Processing " << key; - if (wav_table.find(key) != wav_table.end()) { - strs.erase(strs.begin()); - std::string text = wenet::JoinString(" ", strs); - std::vector labels; - wenet::MapToLabel(text, wfst_symbol_table, &labels); - // Prepare FST for alignment decoding - fst::StdVectorFst align_fst; - wenet::CompileAlignFst(labels, wfst_symbol_table, &align_fst); - // align_fst.Write("align.fst"); - auto decoding_fst = std::make_shared(); - fst::Compose(ctc_fst, align_fst, decoding_fst.get()); - // decoding_fst->Write("decoding.fst"); - // Preapre feature pipeline - wenet::WavReader wav_reader; - if (!wav_reader.Open(wav_table[key])) { - LOG(WARNING) << "Error in reading " << wav_table[key]; - continue; - } - int num_samples = wav_reader.num_samples(); - CHECK_EQ(wav_reader.sample_rate(), FLAGS_sample_rate); - auto feature_pipeline = - std::make_shared(*feature_config); - feature_pipeline->AcceptWaveform(wav_reader.data(), num_samples); - feature_pipeline->set_input_finished(); - decode_resource->fst = decoding_fst; - LOG(INFO) << "num frames " << feature_pipeline->num_frames(); - wenet::AsrDecoder decoder(feature_pipeline, decode_resource, - *decode_config); - while (true) { - wenet::DecodeState state = decoder.Decode(); - if (state == wenet::DecodeState::kEndFeats) { - decoder.Rescoring(); - break; - } - } - std::string final_result; - std::string timestamp_str; - if (decoder.DecodedSomething()) { - const wenet::DecodeResult& result = decoder.result()[0]; - final_result = result.sentence; - std::stringstream ss; - for (const auto& w : result.word_pieces) { - ss << " " << w.word << " " << w.start << " " << w.end; - } - timestamp_str = ss.str(); - } - result_os << key << " " << final_result << std::endl; - timestamp_os << key << " " << timestamp_str << std::endl; - LOG(INFO) << key << " " << final_result; - } else { - LOG(WARNING) << "No wav file for " << key; - } - } - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/websocket_client_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/websocket_client_main.cc deleted file mode 100644 index 3eaa96069dc5f57673fbb2819bf7d4883e0d5ffa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/websocket_client_main.cc +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "frontend/wav.h" -#include "utils/flags.h" -#include "utils/timer.h" -#include "websocket/websocket_client.h" - -DEFINE_string(hostname, "127.0.0.1", "hostname of websocket server"); -DEFINE_int32(port, 10086, "port of websocket server"); -DEFINE_int32(nbest, 1, "n-best of decode result"); -DEFINE_string(wav_path, "", "test wav file path"); -DEFINE_bool(continuous_decoding, false, "continuous decoding mode"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - wenet::WebSocketClient client(FLAGS_hostname, FLAGS_port); - client.set_nbest(FLAGS_nbest); - client.set_continuous_decoding(FLAGS_continuous_decoding); - client.SendStartSignal(); - - wenet::WavReader wav_reader(FLAGS_wav_path); - const int sample_rate = 16000; - // Only support 16K - CHECK_EQ(wav_reader.sample_rate(), sample_rate); - const int num_samples = wav_reader.num_samples(); - // Send data every 0.5 second - const float interval = 0.5; - const int sample_interval = interval * sample_rate; - for (int start = 0; start < num_samples; start += sample_interval) { - if (client.done()) { - break; - } - int end = std::min(start + sample_interval, num_samples); - // Convert to short - std::vector data; - data.reserve(end - start); - for (int j = start; j < end; j++) { - data.push_back(static_cast(wav_reader.data()[j])); - } - // TODO(Binbin Zhang): Network order? - // Send PCM data - client.SendBinaryData(data.data(), data.size() * sizeof(int16_t)); - VLOG(2) << "Send " << data.size() << " samples"; - std::this_thread::sleep_for( - std::chrono::milliseconds(static_cast(interval * 1000))); - } - wenet::Timer timer; - client.SendEndSignal(); - client.Join(); - VLOG(2) << "Total latency: " << timer.Elapsed() << "ms."; - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/websocket_server_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/websocket_server_main.cc deleted file mode 100644 index 796d9d2e6d151f7c08b43d66b7245c58ee086cc2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/bin/websocket_server_main.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "decoder/params.h" -#include "utils/log.h" -#include "websocket/websocket_server.h" - -DEFINE_int32(port, 10086, "websocket listening port"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - auto decode_config = wenet::InitDecodeOptionsFromFlags(); - auto feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - auto decode_resource = wenet::InitDecodeResourceFromFlags(); - - wenet::WebSocketServer server(FLAGS_port, feature_config, decode_config, - decode_resource); - LOG(INFO) << "Listening at port " << FLAGS_port; - server.Start(); - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/boost.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/boost.cmake deleted file mode 100644 index 8684c0ec43960da213da923dc57416f04301ea2b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/boost.cmake +++ /dev/null @@ -1,10 +0,0 @@ -FetchContent_Declare(boost - URL https://boostorg.jfrog.io/artifactory/main/release/1.75.0/source/boost_1_75_0.tar.gz - URL_HASH SHA256=aeb26f80e80945e82ee93e5939baebdca47b9dee80a07d3144be1e1a6a66dd6a -) -FetchContent_MakeAvailable(boost) -include_directories(${boost_SOURCE_DIR}) - -if(MSVC) - add_definitions(-DBOOST_ALL_DYN_LINK -DBOOST_ALL_NO_LIB) -endif() \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/bpu.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/bpu.cmake deleted file mode 100644 index 350d76c19d6f656fb130de09877d649cf49972a4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/bpu.cmake +++ /dev/null @@ -1,30 +0,0 @@ -if(BPU) - if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") - set(EASY_DNN_URL "https://github.com/xingchensong/toolchain_pkg/releases/download/easy_dnn/easy_dnn.0.4.11.tar.gz") - set(URL_HASH "SHA256=a1a6f77d1baae7181d75ec5d37a2ee529ac4e1c4400babd6ceb1c007392a4904") - else() - message(FATAL_ERROR "Unsupported CMake System Processor '${CMAKE_SYSTEM_PROCESSOR}' (expected 'aarch64')") - endif() - else() - message(FATAL_ERROR "Unsupported CMake System Name '${CMAKE_SYSTEM_NAME}' (expected 'Linux')") - endif() - - FetchContent_Declare(easy_dnn - URL ${EASY_DNN_URL} - URL_HASH ${URL_HASH} - ) - FetchContent_MakeAvailable(easy_dnn) - include_directories(${easy_dnn_SOURCE_DIR}/easy_dnn/0.4.11_linux_aarch64-j3_hobot_gcc6.5.0/files/easy_dnn/include) - include_directories(${easy_dnn_SOURCE_DIR}/dnn/1.7.0_linux_aarch64-j3_hobot_gcc6.5.0/files/dnn/include) - include_directories(${easy_dnn_SOURCE_DIR}/hlog/0.4.7_linux_aarch64-j3_hobot_gcc6.5.0/files/hlog/include) - link_directories(${easy_dnn_SOURCE_DIR}/easy_dnn/0.4.11_linux_aarch64-j3_hobot_gcc6.5.0/files/easy_dnn/lib) - link_directories(${easy_dnn_SOURCE_DIR}/dnn/1.7.0_linux_aarch64-j3_hobot_gcc6.5.0/files/dnn/lib) - link_directories(${easy_dnn_SOURCE_DIR}/hlog/0.4.7_linux_aarch64-j3_hobot_gcc6.5.0/files/hlog/lib) - - add_definitions(-DUSE_BPU) - # NOTE(xcsong): Reasons for adding flag `-fuse-ld=gold`: - # https://stackoverflow.com/questions/59915966/unknown-gcc-linker-error-but-builds-sucessfully/59916438#59916438 - # https://github.com/tensorflow/tensorflow/issues/47849 - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fuse-ld=gold") -endif() diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/gflags.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/gflags.cmake deleted file mode 100644 index 53ae5763b5a8c860b7e64d35b380eee5429f539d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/gflags.cmake +++ /dev/null @@ -1,6 +0,0 @@ -FetchContent_Declare(gflags - URL https://github.com/gflags/gflags/archive/v2.2.2.zip - URL_HASH SHA256=19713a36c9f32b33df59d1c79b4958434cb005b5b47dc5400a7a4b078111d9b5 -) -FetchContent_MakeAvailable(gflags) -include_directories(${gflags_BINARY_DIR}/include) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/glog.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/glog.cmake deleted file mode 100644 index 447ab4132f669ee2c3a52c37959dd684a39ff21b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/glog.cmake +++ /dev/null @@ -1,6 +0,0 @@ -FetchContent_Declare(glog - URL https://github.com/google/glog/archive/v0.4.0.zip - URL_HASH SHA256=9e1b54eb2782f53cd8af107ecf08d2ab64b8d0dc2b7f5594472f3bd63ca85cdc -) -FetchContent_MakeAvailable(glog) -include_directories(${glog_SOURCE_DIR}/src ${glog_BINARY_DIR}) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/grpc.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/grpc.cmake deleted file mode 100644 index 644093a4bf8191f3a45b0df0a72c000981c48f58..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/grpc.cmake +++ /dev/null @@ -1,9 +0,0 @@ -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/grpc) -# third_party: grpc -# On how to build grpc, you may refer to https://github.com/grpc/grpc -# We recommend manually recursive clone the repo to avoid internet connection problem -FetchContent_Declare(gRPC - GIT_REPOSITORY https://github.com/grpc/grpc - GIT_TAG v1.37.1 -) -FetchContent_MakeAvailable(gRPC) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/gtest.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/gtest.cmake deleted file mode 100644 index 30dc7c1a31d8b83991841a4dc33f61ed078b532a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/gtest.cmake +++ /dev/null @@ -1,8 +0,0 @@ -FetchContent_Declare(googletest - URL https://github.com/google/googletest/archive/release-1.11.0.zip - URL_HASH SHA256=353571c2440176ded91c2de6d6cd88ddd41401d14692ec1f99e35d013feda55a -) -if(MSVC) - set(gtest_force_shared_crt ON CACHE BOOL "Always use msvcrt.dll" FORCE) -endif() -FetchContent_MakeAvailable(googletest) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/libtorch.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/libtorch.cmake deleted file mode 100644 index 3cd9245b2da52f8be206d27164de5f411bff171b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/libtorch.cmake +++ /dev/null @@ -1,79 +0,0 @@ -if(TORCH) - add_definitions(-DUSE_TORCH) - if(NOT ANDROID) - if(GPU) - if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - 
message(FATAL_ERROR "GPU is supported only Linux, you can use CPU version") - else() - add_definitions(-DUSE_GPU) - endif() - endif() - - if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") - if(${CMAKE_BUILD_TYPE} MATCHES "Release") - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-win-shared-with-deps-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=bece54d36377990257e9d028c687c5b6759c5cfec0a0153da83cf6f0f71f648f") - else() - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-win-shared-with-deps-debug-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=3cc7ba3c3865d86f03d78c2f0878fdbed8b764359476397a5c95cf3bba0d665a") - endif() - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - if(CXX11_ABI) - if(NOT GPU) - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=d52f63577a07adb0bfd6d77c90f7da21896e94f71eb7dcd55ed7835ccb3b2b59") - else() - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu113/libtorch-cxx11-abi-shared-with-deps-1.12.0%2Bcu113.zip") - set(URL_HASH "SHA256=80f089939de20e68e3fcad4dfa72a26c8bf91b5e77b11042f671f39ebac35865") - endif() - else() - if(NOT GPU) - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=bee1b7be308792aa60fc95a4f5274d9658cb7248002d0e333d49eb81ec88430c") - else() - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.11.0%2Bcu113.zip") - set(URL_HASH "SHA256=90159ecce3ff451f3ef3f657493b6c7c96759c3b74bbd70c1695f2ea2f81e1ad") - endif() - endif() - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-macos-1.13.0.zip") - set(URL_HASH "SHA256=a8f80050b95489b4e002547910410c2c230e9f590ffab2482e19e809afe4f7aa") - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "iOS") - add_definitions(-DIOS) - else() - message(FATAL_ERROR "Unsupported System '${CMAKE_SYSTEM_NAME}' (expected 'Windows', 'Linux', 'Darwin' or 'iOS')") - endif() - - # iOS use LibTorch from pod install - if(NOT IOS) - FetchContent_Declare(libtorch - URL ${LIBTORCH_URL} - URL_HASH ${URL_HASH} - ) - FetchContent_MakeAvailable(libtorch) - find_package(Torch REQUIRED PATHS ${libtorch_SOURCE_DIR} NO_DEFAULT_PATH) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS} -DC10_USE_GLOG") - endif() - - if(MSVC) - file(GLOB TORCH_DLLS "${TORCH_INSTALL_PREFIX}/lib/*.dll") - file(COPY ${TORCH_DLLS} DESTINATION ${CMAKE_BINARY_DIR}) - endif() - else() - # Change version in runtime/android/app/build.gradle. 
- file(GLOB PYTORCH_INCLUDE_DIRS "${build_DIR}/pytorch_android*.aar/headers") - file(GLOB PYTORCH_LINK_DIRS "${build_DIR}/pytorch_android*.aar/jni/${ANDROID_ABI}") - find_library(PYTORCH_LIBRARY pytorch_jni - PATHS ${PYTORCH_LINK_DIRS} - NO_CMAKE_FIND_ROOT_PATH - ) - find_library(FBJNI_LIBRARY fbjni - PATHS ${PYTORCH_LINK_DIRS} - NO_CMAKE_FIND_ROOT_PATH - ) - include_directories( - ${PYTORCH_INCLUDE_DIRS} - ${PYTORCH_INCLUDE_DIRS}/torch/csrc/api/include - ) - endif() -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/onnx.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/onnx.cmake deleted file mode 100644 index bd55402cb2a6024620fa6ff8b5c413207041adfa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/onnx.cmake +++ /dev/null @@ -1,35 +0,0 @@ -if(ONNX) - set(ONNX_VERSION "1.12.0") - if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-win-x64-${ONNX_VERSION}.zip") - set(URL_HASH "SHA256=8b5d61204989350b7904ac277f5fbccd3e6736ddbb6ec001e412723d71c9c176") - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-linux-aarch64-${ONNX_VERSION}.tgz") - set(URL_HASH "SHA256=5820d9f343df73c63b6b2b174a1ff62575032e171c9564bcf92060f46827d0ac") - else() - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-linux-x64-${ONNX_VERSION}.tgz") - set(URL_HASH "SHA256=5d503ce8540358b59be26c675e42081be14a3e833a5301926f555451046929c5") - endif() - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-osx-x86_64-${ONNX_VERSION}.tgz") - set(URL_HASH "SHA256=09b17f712f8c6f19bb63da35d508815b443cbb473e16c6192abfaa297c02f600") - else() - message(FATAL_ERROR "Unsupported CMake System Name '${CMAKE_SYSTEM_NAME}' (expected 'Windows', 'Linux' or 'Darwin')") - endif() - - FetchContent_Declare(onnxruntime - URL ${ONNX_URL} - URL_HASH ${URL_HASH} - ) - FetchContent_MakeAvailable(onnxruntime) - include_directories(${onnxruntime_SOURCE_DIR}/include) - link_directories(${onnxruntime_SOURCE_DIR}/lib) - - if(MSVC) - file(GLOB ONNX_DLLS "${onnxruntime_SOURCE_DIR}/lib/*.dll") - file(COPY ${ONNX_DLLS} DESTINATION ${CMAKE_BINARY_DIR}/bin/${CMAKE_BUILD_TYPE}) - endif() - - add_definitions(-DUSE_ONNX) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/openfst.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/openfst.cmake deleted file mode 100644 index 490a3da6b571ec228114167fb9c0d9e9b4043bd2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/openfst.cmake +++ /dev/null @@ -1,45 +0,0 @@ -if(NOT ANDROID) - include(gflags) - # We can't build glog with gflags, unless gflags is pre-installed. - # If build glog with pre-installed gflags, there will be conflict. 
- set(WITH_GFLAGS OFF CACHE BOOL "whether build glog with gflags" FORCE) - include(glog) - - if(NOT GRAPH_TOOLS) - set(HAVE_BIN OFF CACHE BOOL "Build the fst binaries" FORCE) - set(HAVE_SCRIPT OFF CACHE BOOL "Build the fstscript" FORCE) - endif() - set(HAVE_COMPACT OFF CACHE BOOL "Build compact" FORCE) - set(HAVE_CONST OFF CACHE BOOL "Build const" FORCE) - set(HAVE_GRM OFF CACHE BOOL "Build grm" FORCE) - set(HAVE_FAR OFF CACHE BOOL "Build far" FORCE) - set(HAVE_PDT OFF CACHE BOOL "Build pdt" FORCE) - set(HAVE_MPDT OFF CACHE BOOL "Build mpdt" FORCE) - set(HAVE_LINEAR OFF CACHE BOOL "Build linear" FORCE) - set(HAVE_LOOKAHEAD OFF CACHE BOOL "Build lookahead" FORCE) - set(HAVE_NGRAM OFF CACHE BOOL "Build ngram" FORCE) - set(HAVE_SPECIAL OFF CACHE BOOL "Build special" FORCE) - - if(MSVC) - add_compile_options(/W0 /wd4244 /wd4267) - endif() - - # "OpenFST port for Windows" builds openfst with cmake for multiple platforms. - # Openfst is compiled with glog/gflags to avoid log and flag conflicts with log and flags in wenet/libtorch. - # To build openfst with gflags and glog, we comment out some vars of {flags, log}.h and flags.cc. - set(openfst_SOURCE_DIR ${fc_base}/openfst-src CACHE PATH "OpenFST source directory") - FetchContent_Declare(openfst - URL https://github.com/kkm000/openfst/archive/refs/tags/win/1.6.5.1.tar.gz - URL_HASH SHA256=02c49b559c3976a536876063369efc0e41ab374be1035918036474343877046e - PATCH_COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/patch/openfst ${openfst_SOURCE_DIR} - ) - FetchContent_MakeAvailable(openfst) - add_dependencies(fst gflags glog) - target_link_libraries(fst PUBLIC gflags_nothreads_static glog) - include_directories(${openfst_SOURCE_DIR}/src/include) -else() - set(openfst_BINARY_DIR ${build_DIR}/wenet-openfst-android-1.0.2.aar/jni) - include_directories(${openfst_BINARY_DIR}/include) - link_directories(${openfst_BINARY_DIR}/${ANDROID_ABI}) - link_libraries(log gflags_nothreads glog fst) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/pybind11.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/pybind11.cmake deleted file mode 100644 index 6bdae202c1c4d94228e5f92dab051c118dba7d3b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/pybind11.cmake +++ /dev/null @@ -1,7 +0,0 @@ -FetchContent_Declare(pybind11 - URL https://github.com/pybind/pybind11/archive/refs/tags/v2.9.2.zip - URL_HASH SHA256=d1646e6f70d8a3acb2ddd85ce1ed543b5dd579c68b8fb8e9638282af20edead8 -) -FetchContent_MakeAvailable(pybind11) - -add_subdirectory(${pybind11_SOURCE_DIR}) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/xpu.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/xpu.cmake deleted file mode 100644 index 38418671b0237550cd01d4d95e8743067e113e56..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/cmake/xpu.cmake +++ /dev/null @@ -1,37 +0,0 @@ -if(NOT WIN32) - string(ASCII 27 Esc) - set(ColourReset "${Esc}[m") - set(ColourBold "${Esc}[1m") - set(Red "${Esc}[31m") - set(Green "${Esc}[32m") - set(Yellow "${Esc}[33m") - set(Blue "${Esc}[34m") - set(Magenta "${Esc}[35m") - set(Cyan "${Esc}[36m") - set(White "${Esc}[37m") - set(BoldRed "${Esc}[1;31m") - set(BoldGreen "${Esc}[1;32m") - set(BoldYellow "${Esc}[1;33m") - set(BoldBlue "${Esc}[1;34m") - set(BoldMagenta 
"${Esc}[1;35m") - set(BoldCyan "${Esc}[1;36m") - set(BoldWhite "${Esc}[1;37m") -endif() - -if(XPU) - set(RUNTIME_KUNLUN_PATH ${CMAKE_CURRENT_SOURCE_DIR}) - message(STATUS "RUNTIME_KUNLUN_PATH is ${RUNTIME_KUNLUN_PATH} .\n") - set(KUNLUN_XPU_PATH ${RUNTIME_KUNLUN_PATH}/xpu) - if(NOT DEFINED ENV{XPU_API_PATH}) - message(FATAL_ERROR "${BoldRed}NO ENV{XPU_API_PATH} in your env. Please set XPU_API_PATH.${ColourReset}\n") - else() - set(XPU_API_PATH $ENV{XPU_API_PATH}) - message("set XPU_API_PATH from env_var. Val is $ENV{XPU_API_PATH}.") - endif() - - include_directories(${RUNTIME_KUNLUN_PATH} ${KUNLUN_XPU_PATH}/ - ${XPU_API_PATH}/output/include ${XPU_API_PATH}/../runtime/include) - link_directories(${XPU_API_PATH}/output/so/ ${XPU_API_PATH}/../runtime/output/so/) - - add_definitions(-DUSE_XPU) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/compile.sh b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/compile.sh deleted file mode 100644 index d64a6f050184e6fa3c4fdd71177b26a24649c7b9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/compile.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash -set -e - -usage() { - echo "Usage:" - echo "bash compile.sh [-r] [-d] [-c]" - echo "Description:" - echo "-r, build release." - echo "-d, build debug." - echo "-c, remove cmakecache or build dir, then build." - echo "Example 1:" - echo " ./compile.sh -r " - echo " means: remove cache files in build dir, then build release." - echo "Example 2:" - echo " ./compile.sh -d -c all " - echo " means: remove all files in build dir, then build debug." - exit -1 -} - -if [ -z $CXX ]; then - echo -e "\033[31m [WARNING]: NO CXX in your env. Suggest setting CXX variable to support C++14. \033[0m" - sleep 2 -fi - -build_type='Release' -clean_type='cache' - -while getopts 'rdc:h' OPT; do - case $OPT in - r) build_type="Release";; - d) build_type="Debug";; - c) clean_type="$OPTARG";; - h) usage;; - ?) usage;; - esac -done - -if [ ! -d ./build ];then - mkdir build -fi - -if [ "$clean_type" = "all" ];then - pushd build - rm -rf ./* - popd -else - pushd build - rm -rf CMakeFiles/ cmake_install.cmake CMakeCache.txt CPackSourceConfig.cmake - popd -fi - -build_cmd="cd build && cmake -DINTTYPES_FORMAT:STRING=C99 " - -if [ "$build_type" = "Release" ];then - build_cmd="${build_cmd} -DCMAKE_BUILD_TYPE=Release .. && cmake --build ./ " -else - build_cmd="${build_cmd} -DCMAKE_BUILD_TYPE=Debug .. 
&& cmake --build ./ " -fi - -echo "build command is ${build_cmd}" - -eval ${build_cmd} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/CMakeLists.txt deleted file mode 100644 index fe03efb288eb1c7ae3d05e896e95855e5865472f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/CMakeLists.txt +++ /dev/null @@ -1,39 +0,0 @@ -set(decoder_srcs - asr_decoder.cc - asr_model.cc - context_graph.cc - ctc_prefix_beam_search.cc - ctc_wfst_beam_search.cc - ctc_endpoint.cc -) - -if(NOT TORCH AND NOT ONNX AND NOT XPU AND NOT IOS AND NOT BPU) - message(FATAL_ERROR "Please build with TORCH or ONNX or XPU or IOS or BPU!!!") -endif() -if(TORCH OR IOS) - list(APPEND decoder_srcs torch_asr_model.cc) -endif() -if(ONNX) - list(APPEND decoder_srcs onnx_asr_model.cc) -endif() - -add_library(decoder STATIC ${decoder_srcs}) -target_link_libraries(decoder PUBLIC kaldi-decoder frontend - post_processor utils) - -if(ANDROID) - target_link_libraries(decoder PUBLIC ${PYTORCH_LIBRARY} ${FBJNI_LIBRARY}) -else() - if(TORCH) - target_link_libraries(decoder PUBLIC ${TORCH_LIBRARIES}) - endif() - if(ONNX) - target_link_libraries(decoder PUBLIC onnxruntime) - endif() - if(BPU) - target_link_libraries(decoder PUBLIC bpu_asr_model) - endif() - if(XPU) - target_link_libraries(decoder PUBLIC xpu_conformer) - endif() -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/asr_decoder.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/asr_decoder.cc deleted file mode 100644 index 34de7550ea287b37d2cb707e148f5d6853b3d804..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/asr_decoder.cc +++ /dev/null @@ -1,231 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include "decoder/asr_decoder.h" - -#include - -#include -#include -#include - -#include "utils/timer.h" - -namespace wenet { - -AsrDecoder::AsrDecoder(std::shared_ptr feature_pipeline, - std::shared_ptr resource, - const DecodeOptions& opts) - : feature_pipeline_(std::move(feature_pipeline)), - // Make a copy of the model ASR model since we will change the inner - // status of the model - model_(resource->model->Copy()), - post_processor_(resource->post_processor), - symbol_table_(resource->symbol_table), - fst_(resource->fst), - unit_table_(resource->unit_table), - opts_(opts), - ctc_endpointer_(new CtcEndpoint(opts.ctc_endpoint_config)) { - if (opts_.reverse_weight > 0) { - // Check if model has a right to left decoder - CHECK(model_->is_bidirectional_decoder()); - } - if (nullptr == fst_) { - searcher_.reset(new CtcPrefixBeamSearch(opts.ctc_prefix_search_opts, - resource->context_graph)); - } else { - searcher_.reset(new CtcWfstBeamSearch(*fst_, opts.ctc_wfst_search_opts, - resource->context_graph)); - } - ctc_endpointer_->frame_shift_in_ms(frame_shift_in_ms()); -} - -void AsrDecoder::Reset() { - start_ = false; - result_.clear(); - num_frames_ = 0; - global_frame_offset_ = 0; - model_->Reset(); - searcher_->Reset(); - feature_pipeline_->Reset(); - ctc_endpointer_->Reset(); -} - -void AsrDecoder::ResetContinuousDecoding() { - global_frame_offset_ = num_frames_; - start_ = false; - result_.clear(); - model_->Reset(); - searcher_->Reset(); - ctc_endpointer_->Reset(); -} - -DecodeState AsrDecoder::Decode(bool block) { - return this->AdvanceDecoding(block); -} - -void AsrDecoder::Rescoring() { - // Do attention rescoring - Timer timer; - AttentionRescoring(); - VLOG(2) << "Rescoring cost latency: " << timer.Elapsed() << "ms."; -} - -DecodeState AsrDecoder::AdvanceDecoding(bool block) { - DecodeState state = DecodeState::kEndBatch; - model_->set_chunk_size(opts_.chunk_size); - model_->set_num_left_chunks(opts_.num_left_chunks); - int num_required_frames = model_->num_frames_for_chunk(start_); - std::vector> chunk_feats; - // Return immediately if we do not want to block - if (!block && !feature_pipeline_->input_finished() && - feature_pipeline_->NumQueuedFrames() < num_required_frames) { - return DecodeState::kWaitFeats; - } - // If not okay, that means we reach the end of the input - if (!feature_pipeline_->Read(num_required_frames, &chunk_feats)) { - state = DecodeState::kEndFeats; - } - - num_frames_ += chunk_feats.size(); - VLOG(2) << "Required " << num_required_frames << " get " - << chunk_feats.size(); - Timer timer; - std::vector> ctc_log_probs; - model_->ForwardEncoder(chunk_feats, &ctc_log_probs); - int forward_time = timer.Elapsed(); - if (opts_.ctc_wfst_search_opts.blank_scale != 1.0) { - for (int i = 0; i < ctc_log_probs.size(); i++) { - ctc_log_probs[i][0] = ctc_log_probs[i][0] - + std::log(opts_.ctc_wfst_search_opts.blank_scale); - } - } - timer.Reset(); - searcher_->Search(ctc_log_probs); - int search_time = timer.Elapsed(); - VLOG(3) << "forward takes " << forward_time << " ms, search takes " - << search_time << " ms"; - UpdateResult(); - - if (state != DecodeState::kEndFeats) { - if (ctc_endpointer_->IsEndpoint(ctc_log_probs, DecodedSomething())) { - VLOG(1) << "Endpoint is detected at " << num_frames_; - state = DecodeState::kEndpoint; - } - } - - start_ = true; - return state; -} - -void AsrDecoder::UpdateResult(bool finish) { - const auto& hypotheses = searcher_->Outputs(); - const auto& inputs = searcher_->Inputs(); - const auto& likelihood = 
searcher_->Likelihood(); - const auto& times = searcher_->Times(); - result_.clear(); - - CHECK_EQ(hypotheses.size(), likelihood.size()); - for (size_t i = 0; i < hypotheses.size(); i++) { - const std::vector& hypothesis = hypotheses[i]; - - DecodeResult path; - path.score = likelihood[i]; - int offset = global_frame_offset_ * feature_frame_shift_in_ms(); - for (size_t j = 0; j < hypothesis.size(); j++) { - std::string word = symbol_table_->Find(hypothesis[j]); - // A detailed explanation of this if-else branch can be found in - // https://github.com/wenet-e2e/wenet/issues/583#issuecomment-907994058 - if (searcher_->Type() == kWfstBeamSearch) { - path.sentence += (' ' + word); - } else { - path.sentence += (word); - } - } - - // TimeStamp is only supported in final result - // TimeStamp of the output of CtcWfstBeamSearch may be inaccurate due to - // various FST operations when building the decoding graph. So here we use - // time stamp of the input(e2e model unit), which is more accurate, and it - // requires the symbol table of the e2e model used in training. - if (unit_table_ != nullptr && finish) { - const std::vector& input = inputs[i]; - const std::vector& time_stamp = times[i]; - CHECK_EQ(input.size(), time_stamp.size()); - for (size_t j = 0; j < input.size(); j++) { - std::string word = unit_table_->Find(input[j]); - int start = time_stamp[j] * frame_shift_in_ms() - time_stamp_gap_ > 0 - ? time_stamp[j] * frame_shift_in_ms() - time_stamp_gap_ - : 0; - if (j > 0) { - start = (time_stamp[j] - time_stamp[j - 1]) * frame_shift_in_ms() < - time_stamp_gap_ - ? (time_stamp[j - 1] + time_stamp[j]) / 2 * - frame_shift_in_ms() - : start; - } - int end = time_stamp[j] * frame_shift_in_ms(); - if (j < input.size() - 1) { - end = (time_stamp[j + 1] - time_stamp[j]) * frame_shift_in_ms() < - time_stamp_gap_ - ? 
(time_stamp[j + 1] + time_stamp[j]) / 2 * - frame_shift_in_ms() - : end; - } - WordPiece word_piece(word, offset + start, offset + end); - path.word_pieces.emplace_back(word_piece); - } - } - - if (post_processor_ != nullptr) { - path.sentence = post_processor_->Process(path.sentence, finish); - } - result_.emplace_back(path); - } - - if (DecodedSomething()) { - VLOG(1) << "Partial CTC result " << result_[0].sentence; - } -} - -void AsrDecoder::AttentionRescoring() { - searcher_->FinalizeSearch(); - UpdateResult(true); - // No need to do rescoring - if (0.0 == opts_.rescoring_weight) { - return; - } - // Inputs() returns N-best input ids, which is the basic unit for rescoring - // In CtcPrefixBeamSearch, inputs are the same to outputs - const auto& hypotheses = searcher_->Inputs(); - int num_hyps = hypotheses.size(); - if (num_hyps <= 0) { - return; - } - - std::vector rescoring_score; - model_->AttentionRescoring(hypotheses, opts_.reverse_weight, - &rescoring_score); - - // Combine ctc score and rescoring score - for (size_t i = 0; i < num_hyps; ++i) { - result_[i].score = opts_.rescoring_weight * rescoring_score[i] + - opts_.ctc_weight * result_[i].score; - } - std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/asr_decoder.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/asr_decoder.h deleted file mode 100644 index df71f5b7bad7b2ffdc69bbd7ab11f576bed464d2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/asr_decoder.h +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_ASR_DECODER_H_ -#define DECODER_ASR_DECODER_H_ - -#include -#include -#include -#include - -#include "fst/fstlib.h" -#include "fst/symbol-table.h" - -#include "decoder/asr_model.h" -#include "decoder/context_graph.h" -#include "decoder/ctc_endpoint.h" -#include "decoder/ctc_prefix_beam_search.h" -#include "decoder/ctc_wfst_beam_search.h" -#include "decoder/search_interface.h" -#include "frontend/feature_pipeline.h" -#include "post_processor/post_processor.h" -#include "utils/utils.h" - -namespace wenet { - -struct DecodeOptions { - // chunk_size is the frame number of one chunk after subsampling. - // e.g. if subsample rate is 4 and chunk_size = 16, the frames in - // one chunk are 64 = 16*4 - int chunk_size = 16; - int num_left_chunks = -1; - - // final_score = rescoring_weight * rescoring_score + ctc_weight * ctc_score; - // rescoring_score = left_to_right_score * (1 - reverse_weight) + - // right_to_left_score * reverse_weight - // Please note the concept of ctc_scores in the following two search - // methods are different. 
- // For CtcPrefixBeamSearch, it's a sum(prefix) score + context score - // For CtcWfstBeamSearch, it's a max(viterbi) path score + context score - // So we should carefully set ctc_weight according to the search methods. - float ctc_weight = 0.5; - float rescoring_weight = 1.0; - float reverse_weight = 0.0; - CtcEndpointConfig ctc_endpoint_config; - CtcPrefixBeamSearchOptions ctc_prefix_search_opts; - CtcWfstBeamSearchOptions ctc_wfst_search_opts; -}; - -struct WordPiece { - std::string word; - int start = -1; - int end = -1; - - WordPiece(std::string word, int start, int end) - : word(std::move(word)), start(start), end(end) {} -}; - -struct DecodeResult { - float score = -kFloatMax; - std::string sentence; - std::vector word_pieces; - - static bool CompareFunc(const DecodeResult& a, const DecodeResult& b) { - return a.score > b.score; - } -}; - -enum DecodeState { - kEndBatch = 0x00, // End of current decoding batch, normal case - kEndpoint = 0x01, // Endpoint is detected - kEndFeats = 0x02, // All feature is decoded - kWaitFeats = 0x03 // Feat is not enough for one chunk inference, wait -}; - -// DecodeResource is thread safe, which can be shared for multiple -// decoding threads -struct DecodeResource { - std::shared_ptr model = nullptr; - std::shared_ptr symbol_table = nullptr; - std::shared_ptr> fst = nullptr; - std::shared_ptr unit_table = nullptr; - std::shared_ptr context_graph = nullptr; - std::shared_ptr post_processor = nullptr; -}; - -// Torch ASR decoder -class AsrDecoder { - public: - AsrDecoder(std::shared_ptr feature_pipeline, - std::shared_ptr resource, - const DecodeOptions& opts); - // @param block: if true, block when feature is not enough for one chunk - // inference. Otherwise, return kWaitFeats. - DecodeState Decode(bool block = true); - void Rescoring(); - void Reset(); - void ResetContinuousDecoding(); - bool DecodedSomething() const { - return !result_.empty() && !result_[0].sentence.empty(); - } - - // This method is used for time benchmark - int num_frames_in_current_chunk() const { - return num_frames_in_current_chunk_; - } - int frame_shift_in_ms() const { - return model_->subsampling_rate() * - feature_pipeline_->config().frame_shift * 1000 / - feature_pipeline_->config().sample_rate; - } - int feature_frame_shift_in_ms() const { - return feature_pipeline_->config().frame_shift * 1000 / - feature_pipeline_->config().sample_rate; - } - const std::vector& result() const { return result_; } - - private: - DecodeState AdvanceDecoding(bool block = true); - void AttentionRescoring(); - - void UpdateResult(bool finish = false); - - std::shared_ptr feature_pipeline_; - std::shared_ptr model_; - std::shared_ptr post_processor_; - - std::shared_ptr> fst_ = nullptr; - // output symbol table - std::shared_ptr symbol_table_; - // e2e unit symbol table - std::shared_ptr unit_table_ = nullptr; - const DecodeOptions& opts_; - // cache feature - bool start_ = false; - // For continuous decoding - int num_frames_ = 0; - int global_frame_offset_ = 0; - const int time_stamp_gap_ = 100; // timestamp gap between words in a sentence - - std::unique_ptr searcher_; - std::unique_ptr ctc_endpointer_; - - int num_frames_in_current_chunk_ = 0; - std::vector result_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(AsrDecoder); -}; - -} // namespace wenet - -#endif // DECODER_ASR_DECODER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/asr_model.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/asr_model.cc deleted 
file mode 100644 index 8c7b0fb1195cf07bac6c3ff1bb8cb0e187e977da..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/asr_model.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2022 Horizon Robotics. All Rights Reserved. -// Author: binbin.zhang@horizon.ai (Binbin Zhang) - -#include "decoder/asr_model.h" - -#include -#include - -namespace wenet { - -int AsrModel::num_frames_for_chunk(bool start) const { - int num_required_frames = 0; - if (chunk_size_ > 0) { - if (!start) { // First batch - int context = right_context_ + 1; // Add current frame - num_required_frames = (chunk_size_ - 1) * subsampling_rate_ + context; - } else { - num_required_frames = chunk_size_ * subsampling_rate_; - } - } else { - num_required_frames = std::numeric_limits::max(); - } - return num_required_frames; -} - -void AsrModel::CacheFeature( - const std::vector>& chunk_feats) { - // Cache feature for next chunk - const int cached_feature_size = 1 + right_context_ - subsampling_rate_; - if (chunk_feats.size() >= cached_feature_size) { - // TODO(Binbin Zhang): Only deal the case when - // chunk_feats.size() > cached_feature_size here, and it's consistent - // with our current model, refine it later if we have new model or - // new requirements - cached_feature_.resize(cached_feature_size); - for (int i = 0; i < cached_feature_size; ++i) { - cached_feature_[i] = - chunk_feats[chunk_feats.size() - cached_feature_size + i]; - } - } -} - -void AsrModel::ForwardEncoder( - const std::vector>& chunk_feats, - std::vector>* ctc_prob) { - ctc_prob->clear(); - int num_frames = cached_feature_.size() + chunk_feats.size(); - if (num_frames >= right_context_ + 1) { - this->ForwardEncoderFunc(chunk_feats, ctc_prob); - this->CacheFeature(chunk_feats); - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/asr_model.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/asr_model.h deleted file mode 100644 index d100dd818551014fa4769c1766bc3b1b626e8453..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/asr_model.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2022 Horizon Robotics. All Rights Reserved. -// Author: binbin.zhang@horizon.ai (Binbin Zhang) - -#ifndef DECODER_ASR_MODEL_H_ -#define DECODER_ASR_MODEL_H_ - -#include -#include -#include -#include - -#include "utils/timer.h" -#include "utils/utils.h" - -namespace wenet { - -class AsrModel { - public: - virtual int right_context() const { return right_context_; } - virtual int subsampling_rate() const { return subsampling_rate_; } - virtual int sos() const { return sos_; } - virtual int eos() const { return eos_; } - virtual bool is_bidirectional_decoder() const { - return is_bidirectional_decoder_; - } - virtual int offset() const { return offset_; } - - // If chunk_size > 0, streaming case. 
Otherwise, none streaming case - virtual void set_chunk_size(int chunk_size) { chunk_size_ = chunk_size; } - virtual void set_num_left_chunks(int num_left_chunks) { - num_left_chunks_ = num_left_chunks; - } - // start: if it is the start chunk of one sentence - virtual int num_frames_for_chunk(bool start) const; - - virtual void Reset() = 0; - - virtual void ForwardEncoder( - const std::vector>& chunk_feats, - std::vector>* ctc_prob); - - virtual void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) = 0; - - virtual std::shared_ptr Copy() const = 0; - - protected: - virtual void ForwardEncoderFunc( - const std::vector>& chunk_feats, - std::vector>* ctc_prob) = 0; - virtual void CacheFeature(const std::vector>& chunk_feats); - - int right_context_ = 1; - int subsampling_rate_ = 1; - int sos_ = 0; - int eos_ = 0; - bool is_bidirectional_decoder_ = false; - int chunk_size_ = 16; - int num_left_chunks_ = -1; // -1 means all left chunks - int offset_ = 0; - - std::vector> cached_feature_; -}; - -} // namespace wenet - -#endif // DECODER_ASR_MODEL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/context_graph.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/context_graph.cc deleted file mode 100644 index adc59c506de2afa7087815887295e4d8735d2a35..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/context_graph.cc +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "decoder/context_graph.h" - -#include - -#include "fst/determinize.h" - -#include "utils/string.h" -#include "utils/utils.h" - -namespace wenet { - -ContextGraph::ContextGraph(ContextConfig config) : config_(config) {} - -void ContextGraph::BuildContextGraph( - const std::vector& query_contexts, - const std::shared_ptr& symbol_table) { - CHECK(symbol_table != nullptr) << "Symbols table should not be nullptr!"; - start_tag_id_ = symbol_table->AddSymbol(""); - end_tag_id_ = symbol_table->AddSymbol(""); - symbol_table_ = symbol_table; - if (query_contexts.empty()) { - if (graph_ != nullptr) graph_.reset(); - return; - } - - std::unique_ptr ofst(new fst::StdVectorFst()); - // State 0 is the start state and the final state. - int start_state = ofst->AddState(); - ofst->SetStart(start_state); - ofst->SetFinal(start_state, fst::StdArc::Weight::One()); - - LOG(INFO) << "Contexts count size: " << query_contexts.size(); - int count = 0; - for (const auto& context : query_contexts) { - if (context.size() > config_.max_context_length) { - LOG(INFO) << "Skip long context: " << context; - continue; - } - if (++count > config_.max_contexts) break; - - std::vector words; - // Split context to words by symbol table, and build the context graph. 
- bool no_oov = SplitUTF8StringToWords(Trim(context), symbol_table, &words); - if (!no_oov) { - LOG(WARNING) << "Ignore unknown word found during compilation."; - continue; - } - - int prev_state = start_state; - int next_state = start_state; - float escape_score = 0; - for (size_t i = 0; i < words.size(); ++i) { - int word_id = symbol_table_->Find(words[i]); - float score = (i * config_.incremental_context_score - + config_.context_score) * UTF8StringLength(words[i]); - next_state = (i < words.size() - 1) ? ofst->AddState() : start_state; - ofst->AddArc(prev_state, - fst::StdArc(word_id, word_id, score, next_state)); - // Add escape arc to clean the previous context score. - if (i > 0) { - // ilabel and olabel of the escape arc is 0 (). - ofst->AddArc(prev_state, fst::StdArc(0, 0, -escape_score, start_state)); - } - prev_state = next_state; - escape_score += score; - } - } - std::unique_ptr det_fst(new fst::StdVectorFst()); - fst::Determinize(*ofst, det_fst.get()); - graph_ = std::move(det_fst); -} - -int ContextGraph::GetNextState(int cur_state, int word_id, float* score, - bool* is_start_boundary, bool* is_end_boundary) { - int next_state = 0; - for (fst::ArcIterator aiter(*graph_, cur_state); !aiter.Done(); - aiter.Next()) { - const fst::StdArc& arc = aiter.Value(); - if (arc.ilabel == 0) { - // escape score, will be overwritten when ilabel equals to word id. - *score = arc.weight.Value(); - } else if (arc.ilabel == word_id) { - next_state = arc.nextstate; - *score = arc.weight.Value(); - if (cur_state == 0) { - *is_start_boundary = true; - } - if (graph_->Final(arc.nextstate) == fst::StdArc::Weight::One()) { - *is_end_boundary = true; - } - break; - } - } - return next_state; -} - -bool ContextGraph::SplitUTF8StringToWords( - const std::string& str, - const std::shared_ptr& symbol_table, - std::vector* words) { - std::vector chars; - SplitUTF8StringToChars(Trim(str), &chars); - - bool no_oov = true; - for (size_t start = 0; start < chars.size();) { - for (size_t end = chars.size(); end > start; --end) { - std::string word; - for (size_t i = start; i < end; i++) { - word += chars[i]; - } - // Skip space. - if (word == " ") { - start = end; - continue; - } - // Add '▁' at the beginning of English word. - if (IsAlpha(word)) { - word = kSpaceSymbol + word; - } - - if (symbol_table->Find(word) != -1) { - words->emplace_back(word); - start = end; - continue; - } - if (end == start + 1) { - ++start; - no_oov = false; - LOG(WARNING) << word << " is oov."; - } - } - } - return no_oov; -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/context_graph.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/context_graph.h deleted file mode 100644 index 41b59206987cfe22d421f40506057830b6311f8e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/context_graph.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_CONTEXT_GRAPH_H_ -#define DECODER_CONTEXT_GRAPH_H_ - -#include -#include -#include - -#include "fst/compose.h" -#include "fst/fst.h" -#include "fst/vector-fst.h" - -namespace wenet { - -using StateId = fst::StdArc::StateId; - -struct ContextConfig { - int max_contexts = 5000; - int max_context_length = 100; - float context_score = 3.0; - float incremental_context_score = 0.0; -}; - -class ContextGraph { - public: - explicit ContextGraph(ContextConfig config); - void BuildContextGraph(const std::vector& query_context, - const std::shared_ptr& symbol_table); - int GetNextState(int cur_state, int word_id, float* score, - bool* is_start_boundary, bool* is_end_boundary); - - int start_tag_id() { return start_tag_id_; } - int end_tag_id() { return end_tag_id_; } - - private: - bool SplitUTF8StringToWords( - const std::string& str, - const std::shared_ptr& symbol_table, - std::vector* words); - - int start_tag_id_ = -1; - int end_tag_id_ = -1; - ContextConfig config_; - std::shared_ptr symbol_table_ = nullptr; - std::unique_ptr graph_ = nullptr; - DISALLOW_COPY_AND_ASSIGN(ContextGraph); -}; - -} // namespace wenet - -#endif // DECODER_CONTEXT_GRAPH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/ctc_endpoint.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/ctc_endpoint.cc deleted file mode 100644 index 4a64dd048f32401ab0dca468836cfac8be943d26..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/ctc_endpoint.cc +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "decoder/ctc_endpoint.h" - -#include - -#include -#include - -#include "utils/log.h" - -namespace wenet { - -CtcEndpoint::CtcEndpoint(const CtcEndpointConfig& config) : config_(config) { - Reset(); -} - -void CtcEndpoint::Reset() { - num_frames_decoded_ = 0; - num_frames_trailing_blank_ = 0; -} - -static bool RuleActivated(const CtcEndpointRule& rule, - const std::string& rule_name, bool decoded_sth, - int trailing_silence, int utterance_length) { - bool ans = (decoded_sth || !rule.must_decoded_sth) && - trailing_silence >= rule.min_trailing_silence && - utterance_length >= rule.min_utterance_length; - if (ans) { - VLOG(2) << "Endpointing rule " << rule_name - << " activated: " << (decoded_sth ? 
"true" : "false") << ',' - << trailing_silence << ',' << utterance_length; - } - return ans; -} - -bool CtcEndpoint::IsEndpoint( - const std::vector>& ctc_log_probs, - bool decoded_something) { - for (int t = 0; t < ctc_log_probs.size(); ++t) { - const auto& logp_t = ctc_log_probs[t]; - float blank_prob = expf(logp_t[config_.blank]); - - num_frames_decoded_++; - if (blank_prob > config_.blank_threshold) { - num_frames_trailing_blank_++; - } else { - num_frames_trailing_blank_ = 0; - } - } - CHECK_GE(num_frames_decoded_, num_frames_trailing_blank_); - CHECK_GT(frame_shift_in_ms_, 0); - int utterance_length = num_frames_decoded_ * frame_shift_in_ms_; - int trailing_silence = num_frames_trailing_blank_ * frame_shift_in_ms_; - if (RuleActivated(config_.rule1, "rule1", decoded_something, trailing_silence, - utterance_length)) - return true; - if (RuleActivated(config_.rule2, "rule2", decoded_something, trailing_silence, - utterance_length)) - return true; - if (RuleActivated(config_.rule3, "rule3", decoded_something, trailing_silence, - utterance_length)) - return true; - return false; -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/ctc_endpoint.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/ctc_endpoint.h deleted file mode 100644 index 56d9e08e7d3fab5562028e956f7b1d6ebac7b9e4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/ctc_endpoint.h +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_CTC_ENDPOINT_H_ -#define DECODER_CTC_ENDPOINT_H_ - -#include - -namespace wenet { - -struct CtcEndpointRule { - bool must_decoded_sth; - int min_trailing_silence; - int min_utterance_length; - - CtcEndpointRule(bool must_decoded_sth = true, int min_trailing_silence = 1000, - int min_utterance_length = 0) - : must_decoded_sth(must_decoded_sth), - min_trailing_silence(min_trailing_silence), - min_utterance_length(min_utterance_length) {} -}; - -struct CtcEndpointConfig { - /// We consider blank as silence for purposes of endpointing. - int blank = 0; // blank id - float blank_threshold = 0.8; // blank threshold to be silence - /// We support three rules. We terminate decoding if ANY of these rules - /// evaluates to "true". If you want to add more rules, do it by changing this - /// code. If you want to disable a rule, you can set the silence-timeout for - /// that rule to a very large number. - - /// rule1 times out after 5000 ms of silence, even if we decoded nothing. - CtcEndpointRule rule1; - /// rule2 times out after 1000 ms of silence after decoding something. - CtcEndpointRule rule2; - /// rule3 times out after the utterance is 20000 ms long, regardless of - /// anything else. 
- CtcEndpointRule rule3; - - CtcEndpointConfig() - : rule1(false, 5000, 0), rule2(true, 1000, 0), rule3(false, 0, 20000) {} -}; - -class CtcEndpoint { - public: - explicit CtcEndpoint(const CtcEndpointConfig& config); - - void Reset(); - /// This function returns true if this set of endpointing rules thinks we - /// should terminate decoding. - bool IsEndpoint(const std::vector>& ctc_log_probs, - bool decoded_something); - - void frame_shift_in_ms(int frame_shift_in_ms) { - frame_shift_in_ms_ = frame_shift_in_ms; - } - - private: - CtcEndpointConfig config_; - int frame_shift_in_ms_ = -1; - int num_frames_decoded_ = 0; - int num_frames_trailing_blank_ = 0; -}; - -} // namespace wenet - -#endif // DECODER_CTC_ENDPOINT_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/ctc_prefix_beam_search.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/ctc_prefix_beam_search.cc deleted file mode 100644 index 154c8864ba98255528a33a80a35b18eee8fa5dc9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/ctc_prefix_beam_search.cc +++ /dev/null @@ -1,235 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include "decoder/ctc_prefix_beam_search.h" - -#include -#include -#include -#include - -#include "utils/log.h" -#include "utils/utils.h" - -namespace wenet { - -CtcPrefixBeamSearch::CtcPrefixBeamSearch( - const CtcPrefixBeamSearchOptions& opts, - const std::shared_ptr& context_graph) - : opts_(opts), context_graph_(context_graph) { - Reset(); -} - -void CtcPrefixBeamSearch::Reset() { - hypotheses_.clear(); - likelihood_.clear(); - cur_hyps_.clear(); - viterbi_likelihood_.clear(); - times_.clear(); - outputs_.clear(); - abs_time_step_ = 0; - PrefixScore prefix_score; - prefix_score.s = 0.0; - prefix_score.ns = -kFloatMax; - prefix_score.v_s = 0.0; - prefix_score.v_ns = 0.0; - std::vector empty; - cur_hyps_[empty] = prefix_score; - outputs_.emplace_back(empty); - hypotheses_.emplace_back(empty); - likelihood_.emplace_back(prefix_score.total_score()); - times_.emplace_back(empty); -} - -static bool PrefixScoreCompare( - const std::pair, PrefixScore>& a, - const std::pair, PrefixScore>& b) { - return a.second.total_score() > b.second.total_score(); -} - -void CtcPrefixBeamSearch::UpdateOutputs( - const std::pair, PrefixScore>& prefix) { - const std::vector& input = prefix.first; - const std::vector& start_boundaries = prefix.second.start_boundaries; - const std::vector& end_boundaries = prefix.second.end_boundaries; - - std::vector output; - int s = 0; - int e = 0; - for (int i = 0; i < input.size(); ++i) { - if (s < start_boundaries.size() && i == start_boundaries[s]) { - output.emplace_back(context_graph_->start_tag_id()); - ++s; - } - output.emplace_back(input[i]); - if (e < end_boundaries.size() && i == end_boundaries[e]) { - output.emplace_back(context_graph_->end_tag_id()); - ++e; - } - } - outputs_.emplace_back(output); -} - -void CtcPrefixBeamSearch::UpdateHypotheses( - const std::vector, PrefixScore>>& hpys) { - cur_hyps_.clear(); - outputs_.clear(); - hypotheses_.clear(); - likelihood_.clear(); - viterbi_likelihood_.clear(); - times_.clear(); - for (auto& item : hpys) { - cur_hyps_[item.first] = item.second; - UpdateOutputs(item); - hypotheses_.emplace_back(std::move(item.first)); - likelihood_.emplace_back(item.second.total_score()); - viterbi_likelihood_.emplace_back(item.second.viterbi_score()); - times_.emplace_back(item.second.times()); - } -} - -// Please refer https://robin1001.github.io/2020/12/11/ctc-search -// for how CTC prefix beam search works, and there is a simple graph demo in -// it. -void CtcPrefixBeamSearch::Search(const std::vector>& logp) { - if (logp.size() == 0) return; - int first_beam_size = - std::min(static_cast(logp[0].size()), opts_.first_beam_size); - for (int t = 0; t < logp.size(); ++t, ++abs_time_step_) { - const std::vector& logp_t = logp[t]; - std::unordered_map, PrefixScore, PrefixHash> next_hyps; - // 1. First beam prune, only select topk candidates - std::vector topk_score; - std::vector topk_index; - TopK(logp_t, first_beam_size, &topk_score, &topk_index); - - // 2. Token passing - for (int i = 0; i < topk_index.size(); ++i) { - int id = topk_index[i]; - auto prob = topk_score[i]; - for (const auto& it : cur_hyps_) { - const std::vector& prefix = it.first; - const PrefixScore& prefix_score = it.second; - // If prefix doesn't exist in next_hyps, next_hyps[prefix] will insert - // PrefixScore(-inf, -inf) by default, since the default constructor - // of PrefixScore will set fields s(blank ending score) and - // ns(none blank ending score) to -inf, respectively. 
- if (id == opts_.blank) { - // Case 0: *a + ε => *a - PrefixScore& next_score = next_hyps[prefix]; - next_score.s = LogAdd(next_score.s, prefix_score.score() + prob); - next_score.v_s = prefix_score.viterbi_score() + prob; - next_score.times_s = prefix_score.times(); - // Prefix not changed, copy the context from prefix. - if (context_graph_ && !next_score.has_context) { - next_score.CopyContext(prefix_score); - next_score.has_context = true; - } - } else if (!prefix.empty() && id == prefix.back()) { - // Case 1: *a + a => *a - PrefixScore& next_score1 = next_hyps[prefix]; - next_score1.ns = LogAdd(next_score1.ns, prefix_score.ns + prob); - if (next_score1.v_ns < prefix_score.v_ns + prob) { - next_score1.v_ns = prefix_score.v_ns + prob; - if (next_score1.cur_token_prob < prob) { - next_score1.cur_token_prob = prob; - next_score1.times_ns = prefix_score.times_ns; - CHECK_GT(next_score1.times_ns.size(), 0); - next_score1.times_ns.back() = abs_time_step_; - } - } - if (context_graph_ && !next_score1.has_context) { - next_score1.CopyContext(prefix_score); - next_score1.has_context = true; - } - - // Case 2: *aε + a => *aa - std::vector new_prefix(prefix); - new_prefix.emplace_back(id); - PrefixScore& next_score2 = next_hyps[new_prefix]; - next_score2.ns = LogAdd(next_score2.ns, prefix_score.s + prob); - if (next_score2.v_ns < prefix_score.v_s + prob) { - next_score2.v_ns = prefix_score.v_s + prob; - next_score2.cur_token_prob = prob; - next_score2.times_ns = prefix_score.times_s; - next_score2.times_ns.emplace_back(abs_time_step_); - } - if (context_graph_ && !next_score2.has_context) { - // Prefix changed, calculate the context score. - next_score2.UpdateContext(context_graph_, prefix_score, id, - prefix.size()); - next_score2.has_context = true; - } - } else { - // Case 3: *a + b => *ab, *aε + b => *ab - std::vector new_prefix(prefix); - new_prefix.emplace_back(id); - PrefixScore& next_score = next_hyps[new_prefix]; - next_score.ns = LogAdd(next_score.ns, prefix_score.score() + prob); - if (next_score.v_ns < prefix_score.viterbi_score() + prob) { - next_score.v_ns = prefix_score.viterbi_score() + prob; - next_score.cur_token_prob = prob; - next_score.times_ns = prefix_score.times(); - next_score.times_ns.emplace_back(abs_time_step_); - } - if (context_graph_ && !next_score.has_context) { - // Calculate the context score. - next_score.UpdateContext(context_graph_, prefix_score, id, - prefix.size()); - next_score.has_context = true; - } - } - } - } - - // 3. Second beam prune, only keep top n best paths - std::vector, PrefixScore>> arr(next_hyps.begin(), - next_hyps.end()); - int second_beam_size = - std::min(static_cast(arr.size()), opts_.second_beam_size); - std::nth_element(arr.begin(), arr.begin() + second_beam_size, arr.end(), - PrefixScoreCompare); - arr.resize(second_beam_size); - std::sort(arr.begin(), arr.end(), PrefixScoreCompare); - - // 4. Update cur_hyps_ and get new result - UpdateHypotheses(arr); - } -} - -void CtcPrefixBeamSearch::FinalizeSearch() { UpdateFinalContext(); } - -void CtcPrefixBeamSearch::UpdateFinalContext() { - if (context_graph_ == nullptr) return; - CHECK_EQ(hypotheses_.size(), cur_hyps_.size()); - CHECK_EQ(hypotheses_.size(), likelihood_.size()); - // We should backoff the context score/state when the context is - // not fully matched at the last time. 
- for (const auto& prefix : hypotheses_) { - PrefixScore& prefix_score = cur_hyps_[prefix]; - if (prefix_score.context_state != 0) { - prefix_score.UpdateContext(context_graph_, prefix_score, 0, - prefix.size()); - } - } - std::vector, PrefixScore>> arr(cur_hyps_.begin(), - cur_hyps_.end()); - std::sort(arr.begin(), arr.end(), PrefixScoreCompare); - - // Update cur_hyps_ and get new result - UpdateHypotheses(arr); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/ctc_prefix_beam_search.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/ctc_prefix_beam_search.h deleted file mode 100644 index f44ec23c37af517c9e45140f89ef7346768f5d35..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/ctc_prefix_beam_search.h +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_CTC_PREFIX_BEAM_SEARCH_H_ -#define DECODER_CTC_PREFIX_BEAM_SEARCH_H_ - -#include -#include -#include -#include - -#include "decoder/context_graph.h" -#include "decoder/search_interface.h" -#include "utils/utils.h" - -namespace wenet { - -struct CtcPrefixBeamSearchOptions { - int blank = 0; // blank id - int first_beam_size = 10; - int second_beam_size = 10; -}; - -struct PrefixScore { - float s = -kFloatMax; // blank ending score - float ns = -kFloatMax; // none blank ending score - float v_s = -kFloatMax; // viterbi blank ending score - float v_ns = -kFloatMax; // viterbi none blank ending score - float cur_token_prob = -kFloatMax; // prob of current token - std::vector times_s; // times of viterbi blank path - std::vector times_ns; // times of viterbi none blank path - - float score() const { return LogAdd(s, ns); } - float viterbi_score() const { return v_s > v_ns ? v_s : v_ns; } - const std::vector& times() const { - return v_s > v_ns ? 
times_s : times_ns; - } - - bool has_context = false; - int context_state = 0; - float context_score = 0; - std::vector start_boundaries; - std::vector end_boundaries; - - void CopyContext(const PrefixScore& prefix_score) { - context_state = prefix_score.context_state; - context_score = prefix_score.context_score; - start_boundaries = prefix_score.start_boundaries; - end_boundaries = prefix_score.end_boundaries; - } - - void UpdateContext(const std::shared_ptr& context_graph, - const PrefixScore& prefix_score, int word_id, - int prefix_len) { - this->CopyContext(prefix_score); - - float score = 0; - bool is_start_boundary = false; - bool is_end_boundary = false; - - context_state = - context_graph->GetNextState(prefix_score.context_state, word_id, &score, - &is_start_boundary, &is_end_boundary); - context_score += score; - if (is_start_boundary) start_boundaries.emplace_back(prefix_len); - if (is_end_boundary) end_boundaries.emplace_back(prefix_len); - } - - float total_score() const { return score() + context_score; } -}; - -struct PrefixHash { - size_t operator()(const std::vector& prefix) const { - size_t hash_code = 0; - // here we use KB&DR hash code - for (int id : prefix) { - hash_code = id + 31 * hash_code; - } - return hash_code; - } -}; - -class CtcPrefixBeamSearch : public SearchInterface { - public: - explicit CtcPrefixBeamSearch( - const CtcPrefixBeamSearchOptions& opts, - const std::shared_ptr& context_graph = nullptr); - - void Search(const std::vector>& logp) override; - void Reset() override; - void FinalizeSearch() override; - SearchType Type() const override { return SearchType::kPrefixBeamSearch; } - void UpdateOutputs(const std::pair, PrefixScore>& prefix); - void UpdateHypotheses( - const std::vector, PrefixScore>>& hpys); - void UpdateFinalContext(); - - const std::vector& viterbi_likelihood() const { - return viterbi_likelihood_; - } - const std::vector>& Inputs() const override { - return hypotheses_; - } - const std::vector>& Outputs() const override { - return outputs_; - } - const std::vector& Likelihood() const override { return likelihood_; } - const std::vector>& Times() const override { return times_; } - - private: - int abs_time_step_ = 0; - - // N-best list and corresponding likelihood_, in sorted order - std::vector> hypotheses_; - std::vector likelihood_; - std::vector viterbi_likelihood_; - std::vector> times_; - - std::unordered_map, PrefixScore, PrefixHash> cur_hyps_; - std::shared_ptr context_graph_ = nullptr; - // Outputs contain the hypotheses_ and tags like: and - std::vector> outputs_; - const CtcPrefixBeamSearchOptions& opts_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(CtcPrefixBeamSearch); -}; - -} // namespace wenet - -#endif // DECODER_CTC_PREFIX_BEAM_SEARCH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/ctc_wfst_beam_search.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/ctc_wfst_beam_search.cc deleted file mode 100644 index 10e93f387e87b5f16fb7784d7060c50f227bf58e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/ctc_wfst_beam_search.cc +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "decoder/ctc_wfst_beam_search.h" - -#include - -namespace wenet { - -void DecodableTensorScaled::Reset() { - num_frames_ready_ = 0; - done_ = false; - // Give an empty initialization, will throw error when - // AcceptLoglikes is not called - logp_.clear(); -} - -void DecodableTensorScaled::AcceptLoglikes(const std::vector& logp) { - ++num_frames_ready_; - // TODO(Binbin Zhang): Avoid copy here - logp_ = logp; -} - -float DecodableTensorScaled::LogLikelihood(int32 frame, int32 index) { - CHECK_GT(index, 0); - CHECK_LT(frame, num_frames_ready_); - return scale_ * logp_[index - 1]; -} - -bool DecodableTensorScaled::IsLastFrame(int32 frame) const { - CHECK_LT(frame, num_frames_ready_); - return done_ && (frame == num_frames_ready_ - 1); -} - -int32 DecodableTensorScaled::NumIndices() const { - LOG(FATAL) << "Not implement"; - return 0; -} - -CtcWfstBeamSearch::CtcWfstBeamSearch( - const fst::Fst& fst, const CtcWfstBeamSearchOptions& opts, - const std::shared_ptr& context_graph) - : decodable_(opts.acoustic_scale), - decoder_(fst, opts, context_graph), - context_graph_(context_graph), - opts_(opts) { - Reset(); -} - -void CtcWfstBeamSearch::Reset() { - num_frames_ = 0; - decoded_frames_mapping_.clear(); - is_last_frame_blank_ = false; - last_best_ = 0; - inputs_.clear(); - outputs_.clear(); - likelihood_.clear(); - times_.clear(); - decodable_.Reset(); - decoder_.InitDecoding(); -} - -void CtcWfstBeamSearch::Search(const std::vector>& logp) { - if (0 == logp.size()) { - return; - } - // Every time we get the log posterior, we decode it all before return - for (int i = 0; i < logp.size(); i++) { - float blank_score = std::exp(logp[i][0]); - if (blank_score > opts_.blank_skip_thresh * opts_.blank_scale) { - VLOG(3) << "skipping frame " << num_frames_ << " score " << blank_score; - is_last_frame_blank_ = true; - last_frame_prob_ = logp[i]; - } else { - // Get the best symbol - int cur_best = - std::max_element(logp[i].begin(), logp[i].end()) - logp[i].begin(); - // Optional, adding one blank frame if we has skipped it in two same - // symbols - if (cur_best != 0 && is_last_frame_blank_ && cur_best == last_best_) { - decodable_.AcceptLoglikes(last_frame_prob_); - decoder_.AdvanceDecoding(&decodable_, 1); - decoded_frames_mapping_.push_back(num_frames_ - 1); - VLOG(2) << "Adding blank frame at symbol " << cur_best; - } - last_best_ = cur_best; - - decodable_.AcceptLoglikes(logp[i]); - decoder_.AdvanceDecoding(&decodable_, 1); - decoded_frames_mapping_.push_back(num_frames_); - is_last_frame_blank_ = false; - } - num_frames_++; - } - // Get the best path - inputs_.clear(); - outputs_.clear(); - likelihood_.clear(); - if (decoded_frames_mapping_.size() > 0) { - inputs_.resize(1); - outputs_.resize(1); - likelihood_.resize(1); - kaldi::Lattice lat; - decoder_.GetBestPath(&lat, false); - std::vector alignment; - kaldi::LatticeWeight weight; - fst::GetLinearSymbolSequence(lat, &alignment, &outputs_[0], &weight); - ConvertToInputs(alignment, &inputs_[0]); - RemoveContinuousTags(&outputs_[0]); - VLOG(3) << weight.Value1() << " " << weight.Value2(); - likelihood_[0] = 
-(weight.Value1() + weight.Value2()); - } -} - -void CtcWfstBeamSearch::FinalizeSearch() { - decodable_.SetFinish(); - decoder_.FinalizeDecoding(); - inputs_.clear(); - outputs_.clear(); - likelihood_.clear(); - times_.clear(); - if (decoded_frames_mapping_.size() > 0) { - std::vector nbest_lats; - if (opts_.nbest == 1) { - kaldi::Lattice lat; - decoder_.GetBestPath(&lat, true); - nbest_lats.push_back(std::move(lat)); - } else { - // Get N-best path by lattice(CompactLattice) - kaldi::CompactLattice clat; - decoder_.GetLattice(&clat, true); - kaldi::Lattice lat, nbest_lat; - fst::ConvertLattice(clat, &lat); - // TODO(Binbin Zhang): it's n-best word lists here, not character n-best - fst::ShortestPath(lat, &nbest_lat, opts_.nbest); - fst::ConvertNbestToVector(nbest_lat, &nbest_lats); - } - int nbest = nbest_lats.size(); - inputs_.resize(nbest); - outputs_.resize(nbest); - likelihood_.resize(nbest); - times_.resize(nbest); - for (int i = 0; i < nbest; i++) { - kaldi::LatticeWeight weight; - std::vector alignment; - fst::GetLinearSymbolSequence(nbest_lats[i], &alignment, &outputs_[i], - &weight); - ConvertToInputs(alignment, &inputs_[i], ×_[i]); - RemoveContinuousTags(&outputs_[i]); - likelihood_[i] = -(weight.Value1() + weight.Value2()); - } - } -} - -void CtcWfstBeamSearch::ConvertToInputs(const std::vector& alignment, - std::vector* input, - std::vector* time) { - input->clear(); - if (time != nullptr) time->clear(); - for (int cur = 0; cur < alignment.size(); ++cur) { - // ignore blank - if (alignment[cur] - 1 == 0) continue; - // merge continuous same label - if (cur > 0 && alignment[cur] == alignment[cur - 1]) continue; - - input->push_back(alignment[cur] - 1); - if (time != nullptr) { - time->push_back(decoded_frames_mapping_[cur]); - } - } -} - -void CtcWfstBeamSearch::RemoveContinuousTags(std::vector* output) { - if (context_graph_) { - for (auto it = output->begin(); it != output->end();) { - if (*it == context_graph_->start_tag_id() || - *it == context_graph_->end_tag_id()) { - if (it + 1 != output->end() && *it == *(it + 1)) { - it = output->erase(it); - continue; - } - } - ++it; - } - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/ctc_wfst_beam_search.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/ctc_wfst_beam_search.h deleted file mode 100644 index 204a0c8db1254035b7e3bd4a6e02b65d66b756f3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/ctc_wfst_beam_search.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#ifndef DECODER_CTC_WFST_BEAM_SEARCH_H_ -#define DECODER_CTC_WFST_BEAM_SEARCH_H_ - -#include -#include - -#include "decoder/context_graph.h" -#include "decoder/search_interface.h" -#include "kaldi/decoder/lattice-faster-online-decoder.h" -#include "utils/utils.h" - -namespace wenet { - -class DecodableTensorScaled : public kaldi::DecodableInterface { - public: - explicit DecodableTensorScaled(float scale = 1.0) : scale_(scale) { Reset(); } - - void Reset(); - int32 NumFramesReady() const override { return num_frames_ready_; } - bool IsLastFrame(int32 frame) const override; - float LogLikelihood(int32 frame, int32 index) override; - int32 NumIndices() const override; - void AcceptLoglikes(const std::vector& logp); - void SetFinish() { done_ = true; } - - private: - int num_frames_ready_ = 0; - float scale_ = 1.0; - bool done_ = false; - std::vector logp_; -}; - -// LatticeFasterDecoderConfig has the following key members -// beam: decoding beam -// max_active: Decoder max active states -// lattice_beam: Lattice generation beam -struct CtcWfstBeamSearchOptions : public kaldi::LatticeFasterDecoderConfig { - float acoustic_scale = 1.0; - float nbest = 10; - // When blank score is greater than this thresh, skip the frame in viterbi - // search - float blank_skip_thresh = 0.98; - float blank_scale = 1.0; -}; - -class CtcWfstBeamSearch : public SearchInterface { - public: - explicit CtcWfstBeamSearch( - const fst::Fst& fst, const CtcWfstBeamSearchOptions& opts, - const std::shared_ptr& context_graph); - void Search(const std::vector>& logp) override; - void Reset() override; - void FinalizeSearch() override; - SearchType Type() const override { return SearchType::kWfstBeamSearch; } - // For CTC prefix beam search, both inputs and outputs are hypotheses_ - const std::vector>& Inputs() const override { - return inputs_; - } - const std::vector>& Outputs() const override { - return outputs_; - } - const std::vector& Likelihood() const override { return likelihood_; } - const std::vector>& Times() const override { return times_; } - - private: - // Sub one and remove - void ConvertToInputs(const std::vector& alignment, - std::vector* input, - std::vector* time = nullptr); - void RemoveContinuousTags(std::vector* output); - - int num_frames_ = 0; - std::vector decoded_frames_mapping_; - - int last_best_ = 0; // last none blank best id - std::vector last_frame_prob_; - bool is_last_frame_blank_ = false; - std::vector> inputs_, outputs_; - std::vector likelihood_; - std::vector> times_; - DecodableTensorScaled decodable_; - kaldi::LatticeFasterOnlineDecoder decoder_; - std::shared_ptr context_graph_; - const CtcWfstBeamSearchOptions& opts_; -}; - -} // namespace wenet - -#endif // DECODER_CTC_WFST_BEAM_SEARCH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/onnx_asr_model.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/onnx_asr_model.cc deleted file mode 100644 index fc7afc704febbde3b7e350e392dc46763c453e74..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/onnx_asr_model.cc +++ /dev/null @@ -1,430 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 ZeXuan Li (lizexuan@huya.com) -// Xingchen Song(sxc19@mails.tsinghua.edu.cn) -// hamddct@gmail.com (Mddct) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "decoder/onnx_asr_model.h" - -#include -#include -#include - -#include "utils/string.h" - -namespace wenet { - -Ort::Env OnnxAsrModel::env_ = Ort::Env(ORT_LOGGING_LEVEL_WARNING, ""); -Ort::SessionOptions OnnxAsrModel::session_options_ = Ort::SessionOptions(); - -void OnnxAsrModel::InitEngineThreads(int num_threads) { - session_options_.SetIntraOpNumThreads(num_threads); -} - -void OnnxAsrModel::GetInputOutputInfo( - const std::shared_ptr& session, - std::vector* in_names, std::vector* out_names) { - Ort::AllocatorWithDefaultOptions allocator; - // Input info - int num_nodes = session->GetInputCount(); - in_names->resize(num_nodes); - for (int i = 0; i < num_nodes; ++i) { - char* name = session->GetInputName(i, allocator); - Ort::TypeInfo type_info = session->GetInputTypeInfo(i); - auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); - ONNXTensorElementDataType type = tensor_info.GetElementType(); - std::vector node_dims = tensor_info.GetShape(); - std::stringstream shape; - for (auto j : node_dims) { - shape << j; - shape << " "; - } - LOG(INFO) << "\tInput " << i << " : name=" << name << " type=" << type - << " dims=" << shape.str(); - (*in_names)[i] = name; - } - // Output info - num_nodes = session->GetOutputCount(); - out_names->resize(num_nodes); - for (int i = 0; i < num_nodes; ++i) { - char* name = session->GetOutputName(i, allocator); - Ort::TypeInfo type_info = session->GetOutputTypeInfo(i); - auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); - ONNXTensorElementDataType type = tensor_info.GetElementType(); - std::vector node_dims = tensor_info.GetShape(); - std::stringstream shape; - for (auto j : node_dims) { - shape << j; - shape << " "; - } - LOG(INFO) << "\tOutput " << i << " : name=" << name << " type=" << type - << " dims=" << shape.str(); - (*out_names)[i] = name; - } -} - -void OnnxAsrModel::Read(const std::string& model_dir) { - std::string encoder_onnx_path = model_dir + "/encoder.onnx"; - std::string rescore_onnx_path = model_dir + "/decoder.onnx"; - std::string ctc_onnx_path = model_dir + "/ctc.onnx"; - - // 1. Load sessions - try { -#ifdef _MSC_VER - encoder_session_ = std::make_shared( - env_, ToWString(encoder_onnx_path).c_str(), session_options_); - rescore_session_ = std::make_shared( - env_, ToWString(rescore_onnx_path).c_str(), session_options_); - ctc_session_ = std::make_shared( - env_, ToWString(ctc_onnx_path).c_str(), session_options_); -#else - encoder_session_ = std::make_shared( - env_, encoder_onnx_path.c_str(), session_options_); - rescore_session_ = std::make_shared( - env_, rescore_onnx_path.c_str(), session_options_); - ctc_session_ = std::make_shared(env_, ctc_onnx_path.c_str(), - session_options_); -#endif - } catch (std::exception const& e) { - LOG(ERROR) << "error when load onnx model: " << e.what(); - exit(0); - } - - // 2. 
Read metadata - auto model_metadata = encoder_session_->GetModelMetadata(); - - Ort::AllocatorWithDefaultOptions allocator; - encoder_output_size_ = - atoi(model_metadata.LookupCustomMetadataMap("output_size", allocator)); - num_blocks_ = - atoi(model_metadata.LookupCustomMetadataMap("num_blocks", allocator)); - head_ = atoi(model_metadata.LookupCustomMetadataMap("head", allocator)); - cnn_module_kernel_ = atoi( - model_metadata.LookupCustomMetadataMap("cnn_module_kernel", allocator)); - subsampling_rate_ = atoi( - model_metadata.LookupCustomMetadataMap("subsampling_rate", allocator)); - right_context_ = - atoi(model_metadata.LookupCustomMetadataMap("right_context", allocator)); - sos_ = atoi(model_metadata.LookupCustomMetadataMap("sos_symbol", allocator)); - eos_ = atoi(model_metadata.LookupCustomMetadataMap("eos_symbol", allocator)); - is_bidirectional_decoder_ = atoi(model_metadata.LookupCustomMetadataMap( - "is_bidirectional_decoder", allocator)); - chunk_size_ = - atoi(model_metadata.LookupCustomMetadataMap("chunk_size", allocator)); - num_left_chunks_ = - atoi(model_metadata.LookupCustomMetadataMap("left_chunks", allocator)); - - LOG(INFO) << "Onnx Model Info:"; - LOG(INFO) << "\tencoder_output_size " << encoder_output_size_; - LOG(INFO) << "\tnum_blocks " << num_blocks_; - LOG(INFO) << "\thead " << head_; - LOG(INFO) << "\tcnn_module_kernel " << cnn_module_kernel_; - LOG(INFO) << "\tsubsampling_rate " << subsampling_rate_; - LOG(INFO) << "\tright_context " << right_context_; - LOG(INFO) << "\tsos " << sos_; - LOG(INFO) << "\teos " << eos_; - LOG(INFO) << "\tis bidirectional decoder " << is_bidirectional_decoder_; - LOG(INFO) << "\tchunk_size " << chunk_size_; - LOG(INFO) << "\tnum_left_chunks " << num_left_chunks_; - - // 3. Read model nodes - LOG(INFO) << "Onnx Encoder:"; - GetInputOutputInfo(encoder_session_, &encoder_in_names_, &encoder_out_names_); - LOG(INFO) << "Onnx CTC:"; - GetInputOutputInfo(ctc_session_, &ctc_in_names_, &ctc_out_names_); - LOG(INFO) << "Onnx Rescore:"; - GetInputOutputInfo(rescore_session_, &rescore_in_names_, &rescore_out_names_); -} - -OnnxAsrModel::OnnxAsrModel(const OnnxAsrModel& other) { - // metadatas - encoder_output_size_ = other.encoder_output_size_; - num_blocks_ = other.num_blocks_; - head_ = other.head_; - cnn_module_kernel_ = other.cnn_module_kernel_; - right_context_ = other.right_context_; - subsampling_rate_ = other.subsampling_rate_; - sos_ = other.sos_; - eos_ = other.eos_; - is_bidirectional_decoder_ = other.is_bidirectional_decoder_; - chunk_size_ = other.chunk_size_; - num_left_chunks_ = other.num_left_chunks_; - offset_ = other.offset_; - - // sessions - encoder_session_ = other.encoder_session_; - ctc_session_ = other.ctc_session_; - rescore_session_ = other.rescore_session_; - - // node names - encoder_in_names_ = other.encoder_in_names_; - encoder_out_names_ = other.encoder_out_names_; - ctc_in_names_ = other.ctc_in_names_; - ctc_out_names_ = other.ctc_out_names_; - rescore_in_names_ = other.rescore_in_names_; - rescore_out_names_ = other.rescore_out_names_; -} - -std::shared_ptr OnnxAsrModel::Copy() const { - auto asr_model = std::make_shared(*this); - // Reset the inner states for new decoding - asr_model->Reset(); - return asr_model; -} - -void OnnxAsrModel::Reset() { - offset_ = 0; - encoder_outs_.clear(); - cached_feature_.clear(); - // Reset att_cache - Ort::MemoryInfo memory_info = - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); - if (num_left_chunks_ > 0) { - int required_cache_size = chunk_size_ * 
num_left_chunks_; - offset_ = required_cache_size; - att_cache_.resize(num_blocks_ * head_ * required_cache_size * - encoder_output_size_ / head_ * 2, - 0.0); - const int64_t att_cache_shape[] = {num_blocks_, head_, required_cache_size, - encoder_output_size_ / head_ * 2}; - att_cache_ort_ = Ort::Value::CreateTensor( - memory_info, att_cache_.data(), att_cache_.size(), att_cache_shape, 4); - } else { - att_cache_.resize(0, 0.0); - const int64_t att_cache_shape[] = {num_blocks_, head_, 0, - encoder_output_size_ / head_ * 2}; - att_cache_ort_ = Ort::Value::CreateTensor( - memory_info, att_cache_.data(), att_cache_.size(), att_cache_shape, 4); - } - - // Reset cnn_cache - cnn_cache_.resize( - num_blocks_ * encoder_output_size_ * (cnn_module_kernel_ - 1), 0.0); - const int64_t cnn_cache_shape[] = {num_blocks_, 1, encoder_output_size_, - cnn_module_kernel_ - 1}; - cnn_cache_ort_ = Ort::Value::CreateTensor( - memory_info, cnn_cache_.data(), cnn_cache_.size(), cnn_cache_shape, 4); -} - -void OnnxAsrModel::ForwardEncoderFunc( - const std::vector>& chunk_feats, - std::vector>* out_prob) { - Ort::MemoryInfo memory_info = - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); - // 1. Prepare onnx required data, splice cached_feature_ and chunk_feats - // chunk - int num_frames = cached_feature_.size() + chunk_feats.size(); - const int feature_dim = chunk_feats[0].size(); - std::vector feats; - for (size_t i = 0; i < cached_feature_.size(); ++i) { - feats.insert(feats.end(), cached_feature_[i].begin(), - cached_feature_[i].end()); - } - for (size_t i = 0; i < chunk_feats.size(); ++i) { - feats.insert(feats.end(), chunk_feats[i].begin(), chunk_feats[i].end()); - } - const int64_t feats_shape[3] = {1, num_frames, feature_dim}; - Ort::Value feats_ort = Ort::Value::CreateTensor( - memory_info, feats.data(), feats.size(), feats_shape, 3); - // offset - int64_t offset_int64 = static_cast(offset_); - Ort::Value offset_ort = Ort::Value::CreateTensor( - memory_info, &offset_int64, 1, std::vector{}.data(), 0); - // required_cache_size - int64_t required_cache_size = chunk_size_ * num_left_chunks_; - Ort::Value required_cache_size_ort = Ort::Value::CreateTensor( - memory_info, &required_cache_size, 1, std::vector{}.data(), 0); - // att_mask - Ort::Value att_mask_ort{nullptr}; - std::vector att_mask(required_cache_size + chunk_size_, 1); - if (num_left_chunks_ > 0) { - int chunk_idx = offset_ / chunk_size_ - num_left_chunks_; - if (chunk_idx < num_left_chunks_) { - for (int i = 0; i < (num_left_chunks_ - chunk_idx) * chunk_size_; ++i) { - att_mask[i] = 0; - } - } - const int64_t att_mask_shape[] = {1, 1, required_cache_size + chunk_size_}; - att_mask_ort = Ort::Value::CreateTensor( - memory_info, reinterpret_cast(att_mask.data()), att_mask.size(), - att_mask_shape, 3); - } - - // 2. 
Encoder chunk forward - std::vector inputs; - for (auto name : encoder_in_names_) { - if (!strcmp(name, "chunk")) { - inputs.emplace_back(std::move(feats_ort)); - } else if (!strcmp(name, "offset")) { - inputs.emplace_back(std::move(offset_ort)); - } else if (!strcmp(name, "required_cache_size")) { - inputs.emplace_back(std::move(required_cache_size_ort)); - } else if (!strcmp(name, "att_cache")) { - inputs.emplace_back(std::move(att_cache_ort_)); - } else if (!strcmp(name, "cnn_cache")) { - inputs.emplace_back(std::move(cnn_cache_ort_)); - } else if (!strcmp(name, "att_mask")) { - inputs.emplace_back(std::move(att_mask_ort)); - } - } - - std::vector ort_outputs = encoder_session_->Run( - Ort::RunOptions{nullptr}, encoder_in_names_.data(), inputs.data(), - inputs.size(), encoder_out_names_.data(), encoder_out_names_.size()); - - offset_ += static_cast( - ort_outputs[0].GetTensorTypeAndShapeInfo().GetShape()[1]); - att_cache_ort_ = std::move(ort_outputs[1]); - cnn_cache_ort_ = std::move(ort_outputs[2]); - - std::vector ctc_inputs; - ctc_inputs.emplace_back(std::move(ort_outputs[0])); - - std::vector ctc_ort_outputs = ctc_session_->Run( - Ort::RunOptions{nullptr}, ctc_in_names_.data(), ctc_inputs.data(), - ctc_inputs.size(), ctc_out_names_.data(), ctc_out_names_.size()); - encoder_outs_.push_back(std::move(ctc_inputs[0])); - - float* logp_data = ctc_ort_outputs[0].GetTensorMutableData(); - auto type_info = ctc_ort_outputs[0].GetTensorTypeAndShapeInfo(); - - int num_outputs = type_info.GetShape()[1]; - int output_dim = type_info.GetShape()[2]; - out_prob->resize(num_outputs); - for (int i = 0; i < num_outputs; i++) { - (*out_prob)[i].resize(output_dim); - memcpy((*out_prob)[i].data(), logp_data + i * output_dim, - sizeof(float) * output_dim); - } -} - -float OnnxAsrModel::ComputeAttentionScore(const float* prob, - const std::vector& hyp, int eos, - int decode_out_len) { - float score = 0.0f; - for (size_t j = 0; j < hyp.size(); ++j) { - score += *(prob + j * decode_out_len + hyp[j]); - } - score += *(prob + hyp.size() * decode_out_len + eos); - return score; -} - -void OnnxAsrModel::AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) { - Ort::MemoryInfo memory_info = - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); - CHECK(rescoring_score != nullptr); - int num_hyps = hyps.size(); - rescoring_score->resize(num_hyps, 0.0f); - - if (num_hyps == 0) { - return; - } - // No encoder output - if (encoder_outs_.size() == 0) { - return; - } - - std::vector hyps_lens; - int max_hyps_len = 0; - for (size_t i = 0; i < num_hyps; ++i) { - int length = hyps[i].size() + 1; - max_hyps_len = std::max(length, max_hyps_len); - hyps_lens.emplace_back(static_cast(length)); - } - - std::vector rescore_input; - int encoder_len = 0; - for (int i = 0; i < encoder_outs_.size(); i++) { - float* encoder_outs_data = encoder_outs_[i].GetTensorMutableData(); - auto type_info = encoder_outs_[i].GetTensorTypeAndShapeInfo(); - for (int j = 0; j < type_info.GetElementCount(); j++) { - rescore_input.emplace_back(encoder_outs_data[j]); - } - encoder_len += type_info.GetShape()[1]; - } - - const int64_t decode_input_shape[] = {1, encoder_len, encoder_output_size_}; - - std::vector hyps_pad; - - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - hyps_pad.emplace_back(sos_); - size_t j = 0; - for (; j < hyp.size(); ++j) { - hyps_pad.emplace_back(hyp[j]); - } - if (j == max_hyps_len - 1) { - continue; - } - for (; j < max_hyps_len - 1; ++j) { 
- hyps_pad.emplace_back(0); - } - } - - const int64_t hyps_pad_shape[] = {num_hyps, max_hyps_len}; - - const int64_t hyps_lens_shape[] = {num_hyps}; - - Ort::Value decode_input_tensor_ = Ort::Value::CreateTensor( - memory_info, rescore_input.data(), rescore_input.size(), - decode_input_shape, 3); - Ort::Value hyps_pad_tensor_ = Ort::Value::CreateTensor( - memory_info, hyps_pad.data(), hyps_pad.size(), hyps_pad_shape, 2); - Ort::Value hyps_lens_tensor_ = Ort::Value::CreateTensor( - memory_info, hyps_lens.data(), hyps_lens.size(), hyps_lens_shape, 1); - - std::vector rescore_inputs; - - rescore_inputs.emplace_back(std::move(hyps_pad_tensor_)); - rescore_inputs.emplace_back(std::move(hyps_lens_tensor_)); - rescore_inputs.emplace_back(std::move(decode_input_tensor_)); - - std::vector rescore_outputs = rescore_session_->Run( - Ort::RunOptions{nullptr}, rescore_in_names_.data(), rescore_inputs.data(), - rescore_inputs.size(), rescore_out_names_.data(), - rescore_out_names_.size()); - - float* decoder_outs_data = rescore_outputs[0].GetTensorMutableData(); - float* r_decoder_outs_data = rescore_outputs[1].GetTensorMutableData(); - - auto type_info = rescore_outputs[0].GetTensorTypeAndShapeInfo(); - int decode_out_len = type_info.GetShape()[2]; - - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - float score = 0.0f; - // left to right decoder score - score = ComputeAttentionScore( - decoder_outs_data + max_hyps_len * decode_out_len * i, hyp, eos_, - decode_out_len); - // Optional: Used for right to left score - float r_score = 0.0f; - if (is_bidirectional_decoder_ && reverse_weight > 0) { - std::vector r_hyp(hyp.size()); - std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin()); - // right to left decoder score - r_score = ComputeAttentionScore( - r_decoder_outs_data + max_hyps_len * decode_out_len * i, r_hyp, eos_, - decode_out_len); - } - // combined left-to-right and right-to-left score - (*rescoring_score)[i] = - score * (1 - reverse_weight) + r_score * reverse_weight; - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/onnx_asr_model.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/onnx_asr_model.h deleted file mode 100644 index f5d9e9a0c61d728f2fb6d45d1428234abae98c90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/onnx_asr_model.h +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 ZeXuan Li (lizexuan@huya.com) -// Xingchen Song(sxc19@mails.tsinghua.edu.cn) -// hamddct@gmail.com (Mddct) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef DECODER_ONNX_ASR_MODEL_H_ -#define DECODER_ONNX_ASR_MODEL_H_ - -#include -#include -#include - -#include "onnxruntime_cxx_api.h" // NOLINT - -#include "decoder/asr_model.h" -#include "utils/log.h" -#include "utils/utils.h" - -namespace wenet { - -class OnnxAsrModel : public AsrModel { - public: - static void InitEngineThreads(int num_threads = 1); - - public: - OnnxAsrModel() = default; - OnnxAsrModel(const OnnxAsrModel& other); - void Read(const std::string& model_dir); - void Reset() override; - void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) override; - std::shared_ptr Copy() const override; - void GetInputOutputInfo(const std::shared_ptr& session, - std::vector* in_names, - std::vector* out_names); - - protected: - void ForwardEncoderFunc(const std::vector>& chunk_feats, - std::vector>* ctc_prob) override; - - float ComputeAttentionScore(const float* prob, const std::vector& hyp, - int eos, int decode_out_len); - - private: - int encoder_output_size_ = 0; - int num_blocks_ = 0; - int cnn_module_kernel_ = 0; - int head_ = 0; - - // sessions - // NOTE(Mddct): The Env holds the logging state used by all other objects. - // One Env must be created before using any other Onnxruntime functionality. - static Ort::Env env_; // shared environment across threads. - static Ort::SessionOptions session_options_; - std::shared_ptr encoder_session_ = nullptr; - std::shared_ptr rescore_session_ = nullptr; - std::shared_ptr ctc_session_ = nullptr; - - // node names - std::vector encoder_in_names_, encoder_out_names_; - std::vector ctc_in_names_, ctc_out_names_; - std::vector rescore_in_names_, rescore_out_names_; - - // caches - Ort::Value att_cache_ort_{nullptr}; - Ort::Value cnn_cache_ort_{nullptr}; - std::vector encoder_outs_; - // NOTE: Instead of making a copy of the xx_cache, ONNX only maintains - // its data pointer when initializing xx_cache_ort (see https://github.com/ - // microsoft/onnxruntime/blob/master/onnxruntime/core/framework - // /tensor.cc#L102-L129), so we need the following variables to keep - // our data "alive" during the lifetime of decoder. - std::vector att_cache_; - std::vector cnn_cache_; -}; - -} // namespace wenet - -#endif // DECODER_ONNX_ASR_MODEL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/params.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/params.h deleted file mode 100644 index 3edc877f1bb6d876ca087cab8e4ed00d42e97e63..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/params.h +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef DECODER_PARAMS_H_ -#define DECODER_PARAMS_H_ - -#include -#include -#include -#include - -#include "decoder/asr_decoder.h" -#ifdef USE_ONNX -#include "decoder/onnx_asr_model.h" -#endif -#ifdef USE_TORCH -#include "decoder/torch_asr_model.h" -#endif -#ifdef USE_XPU -#include "xpu/xpu_asr_model.h" -#endif -#ifdef USE_BPU -#include "bpu/bpu_asr_model.h" -#endif -#include "frontend/feature_pipeline.h" -#include "post_processor/post_processor.h" -#include "utils/flags.h" -#include "utils/string.h" - -DEFINE_int32(device_id, 0, "set XPU DeviceID for ASR model"); - -// TorchAsrModel flags -DEFINE_string(model_path, "", "pytorch exported model path"); -// OnnxAsrModel flags -DEFINE_string(onnx_dir, "", "directory where the onnx model is saved"); -// XPUAsrModel flags -DEFINE_string(xpu_model_dir, "", - "directory where the XPU model and weights is saved"); -// BPUAsrModel flags -DEFINE_string(bpu_model_dir, "", - "directory where the HORIZON BPU model is saved"); - -// FeaturePipelineConfig flags -DEFINE_int32(num_bins, 80, "num mel bins for fbank feature"); -DEFINE_int32(sample_rate, 16000, "sample rate for audio"); - -// TLG fst -DEFINE_string(fst_path, "", "TLG fst path"); - -// DecodeOptions flags -DEFINE_int32(chunk_size, 16, "decoding chunk size"); -DEFINE_int32(num_left_chunks, -1, "left chunks in decoding"); -DEFINE_double(ctc_weight, 0.5, - "ctc weight when combining ctc score and rescoring score"); -DEFINE_double(rescoring_weight, 1.0, - "rescoring weight when combining ctc score and rescoring score"); -DEFINE_double(reverse_weight, 0.0, - "used for bitransformer rescoring. it must be 0.0 if decoder is" - "conventional transformer decoder, and only reverse_weight > 0.0" - "dose the right to left decoder will be calculated and used"); -DEFINE_int32(max_active, 7000, "max active states in ctc wfst search"); -DEFINE_int32(min_active, 200, "min active states in ctc wfst search"); -DEFINE_double(beam, 16.0, "beam in ctc wfst search"); -DEFINE_double(lattice_beam, 10.0, "lattice beam in ctc wfst search"); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale for ctc wfst search"); -DEFINE_double(blank_skip_thresh, 1.0, - "blank skip thresh for ctc wfst search, 1.0 means no skip"); -DEFINE_double(blank_scale, 1.0, "blank scale for ctc wfst search"); -DEFINE_double(length_penalty, 0.0, - "length penalty ctc wfst search, will not" - "apply on self-loop arc, for balancing the del/ins ratio, " - "suggest set to -3.0"); -DEFINE_int32(nbest, 10, "nbest for ctc wfst or prefix search"); - -// SymbolTable flags -DEFINE_string(dict_path, "", - "dict symbol table path, required when LM is enabled"); -DEFINE_string(unit_path, "", - "e2e model unit symbol table, it is used in both " - "with/without LM scenarios for context/timestamp"); - -// Context flags -DEFINE_string(context_path, "", "context path, is used to build context graph"); -DEFINE_double(context_score, 3.0, "is used to rescore the decoded result"); - -// PostProcessOptions flags -DEFINE_int32(language_type, 0, - "remove spaces according to language type" - "0x00 = kMandarinEnglish, " - "0x01 = kIndoEuropean"); -DEFINE_bool(lowercase, true, "lowercase final result if needed"); - -namespace wenet { -std::shared_ptr InitFeaturePipelineConfigFromFlags() { - auto feature_config = std::make_shared( - FLAGS_num_bins, FLAGS_sample_rate); - return feature_config; -} - -std::shared_ptr InitDecodeOptionsFromFlags() { - auto decode_config = std::make_shared(); - decode_config->chunk_size = FLAGS_chunk_size; - decode_config->num_left_chunks = 
FLAGS_num_left_chunks; - decode_config->ctc_weight = FLAGS_ctc_weight; - decode_config->reverse_weight = FLAGS_reverse_weight; - decode_config->rescoring_weight = FLAGS_rescoring_weight; - decode_config->ctc_wfst_search_opts.max_active = FLAGS_max_active; - decode_config->ctc_wfst_search_opts.min_active = FLAGS_min_active; - decode_config->ctc_wfst_search_opts.beam = FLAGS_beam; - decode_config->ctc_wfst_search_opts.lattice_beam = FLAGS_lattice_beam; - decode_config->ctc_wfst_search_opts.acoustic_scale = FLAGS_acoustic_scale; - decode_config->ctc_wfst_search_opts.blank_skip_thresh = - FLAGS_blank_skip_thresh; - decode_config->ctc_wfst_search_opts.blank_scale = FLAGS_blank_scale; - decode_config->ctc_wfst_search_opts.length_penalty = FLAGS_length_penalty; - decode_config->ctc_wfst_search_opts.nbest = FLAGS_nbest; - decode_config->ctc_prefix_search_opts.first_beam_size = FLAGS_nbest; - decode_config->ctc_prefix_search_opts.second_beam_size = FLAGS_nbest; - return decode_config; -} - -std::shared_ptr InitDecodeResourceFromFlags() { - auto resource = std::make_shared(); - const int kNumGemmThreads = 1; - if (!FLAGS_onnx_dir.empty()) { -#ifdef USE_ONNX - LOG(INFO) << "Reading onnx model "; - OnnxAsrModel::InitEngineThreads(kNumGemmThreads); - auto model = std::make_shared(); - model->Read(FLAGS_onnx_dir); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DONNX=ON'."; -#endif - } else if (!FLAGS_model_path.empty()) { -#ifdef USE_TORCH - LOG(INFO) << "Reading torch model " << FLAGS_model_path; - TorchAsrModel::InitEngineThreads(kNumGemmThreads); - auto model = std::make_shared(); - model->Read(FLAGS_model_path); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DTORCH=ON'."; -#endif - } else if (!FLAGS_xpu_model_dir.empty()) { -#ifdef USE_XPU - LOG(INFO) << "Reading XPU WeNet model weight from " << FLAGS_xpu_model_dir; - auto model = std::make_shared(); - model->SetEngineThreads(kNumGemmThreads); - model->SetDeviceId(FLAGS_device_id); - model->Read(FLAGS_xpu_model_dir); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DXPU=ON'."; -#endif - } else if (!FLAGS_bpu_model_dir.empty()) { -#ifdef USE_BPU - LOG(INFO) << "Reading Horizon BPU model from " << FLAGS_bpu_model_dir; - auto model = std::make_shared(); - model->Read(FLAGS_bpu_model_dir); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DBPU=ON'."; -#endif - } else { - LOG(FATAL) << "Please set ONNX, TORCH, XPU or BPU model path!!!"; - } - - LOG(INFO) << "Reading unit table " << FLAGS_unit_path; - auto unit_table = std::shared_ptr( - fst::SymbolTable::ReadText(FLAGS_unit_path)); - CHECK(unit_table != nullptr); - resource->unit_table = unit_table; - - if (!FLAGS_fst_path.empty()) { // With LM - CHECK(!FLAGS_dict_path.empty()); - LOG(INFO) << "Reading fst " << FLAGS_fst_path; - auto fst = std::shared_ptr>( - fst::Fst::Read(FLAGS_fst_path)); - CHECK(fst != nullptr); - resource->fst = fst; - - LOG(INFO) << "Reading symbol table " << FLAGS_dict_path; - auto symbol_table = std::shared_ptr( - fst::SymbolTable::ReadText(FLAGS_dict_path)); - CHECK(symbol_table != nullptr); - resource->symbol_table = symbol_table; - } else { // Without LM, symbol_table is the same as unit_table - resource->symbol_table = unit_table; - } - - if (!FLAGS_context_path.empty()) { - LOG(INFO) << "Reading context " << FLAGS_context_path; - std::vector contexts; - std::ifstream infile(FLAGS_context_path); - std::string context; - 
-    while (getline(infile, context)) {
-      contexts.emplace_back(Trim(context));
-    }
-    ContextConfig config;
-    config.context_score = FLAGS_context_score;
-    resource->context_graph = std::make_shared<ContextGraph>(config);
-    resource->context_graph->BuildContextGraph(contexts,
-                                               resource->symbol_table);
-  }
-
-  PostProcessOptions post_process_opts;
-  post_process_opts.language_type =
-      FLAGS_language_type == 0 ? kMandarinEnglish : kIndoEuropean;
-  post_process_opts.lowercase = FLAGS_lowercase;
-  resource->post_processor =
-      std::make_shared<PostProcessor>(std::move(post_process_opts));
-  return resource;
-}
-
-}  // namespace wenet
-
-#endif  // DECODER_PARAMS_H_
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/search_interface.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/search_interface.h
deleted file mode 100644
index 25bad26705f8be44561d2c686f50a63035b14bbf..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/search_interface.h
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef DECODER_SEARCH_INTERFACE_H_
-#define DECODER_SEARCH_INTERFACE_H_
-
-namespace wenet {
-
-#include <vector>
-
-enum SearchType {
-  kPrefixBeamSearch = 0x00,
-  kWfstBeamSearch = 0x01,
-};
-
-class SearchInterface {
- public:
-  virtual ~SearchInterface() {}
-  virtual void Search(const std::vector<std::vector<float>>& logp) = 0;
-  virtual void Reset() = 0;
-  virtual void FinalizeSearch() = 0;
-
-  virtual SearchType Type() const = 0;
-  // N-best inputs id
-  virtual const std::vector<std::vector<int>>& Inputs() const = 0;
-  // N-best outputs id
-  virtual const std::vector<std::vector<int>>& Outputs() const = 0;
-  // N-best likelihood
-  virtual const std::vector<float>& Likelihood() const = 0;
-  // N-best timestamp
-  virtual const std::vector<std::vector<int>>& Times() const = 0;
-};
-
-}  // namespace wenet
-
-#endif  // DECODER_SEARCH_INTERFACE_H_
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/torch_asr_model.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/torch_asr_model.cc
deleted file mode 100644
index 3abca283e12f5c173c9511707229ea82b31f26d8..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/torch_asr_model.cc
+++ /dev/null
@@ -1,278 +0,0 @@
-// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu)
-//               2022 Binbin Zhang (binbzha@qq.com)
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "decoder/torch_asr_model.h" - -#include -#include -#include -#include - -#include "torch/script.h" -#ifndef IOS -#include "torch/torch.h" -#endif - -namespace wenet { - -#ifndef IOS -void TorchAsrModel::InitEngineThreads(int num_threads) { - // For multi-thread performance - at::set_num_threads(num_threads); - VLOG(1) << "Num intra-op threads: " << at::get_num_threads(); -} -#endif - -void TorchAsrModel::Read(const std::string& model_path) { - torch::DeviceType device = at::kCPU; -#ifdef USE_GPU - if (!torch::cuda::is_available()) { - VLOG(1) << "CUDA is not available! Please check your GPU settings"; - throw std::runtime_error("CUDA is not available!"); - } else { - VLOG(1) << "CUDA available! Running on GPU"; - device = at::kCUDA; - } -#endif - torch::jit::script::Module model = torch::jit::load(model_path, device); - model_ = std::make_shared(std::move(model)); - torch::NoGradGuard no_grad; - model_->eval(); - torch::jit::IValue o1 = model_->run_method("subsampling_rate"); - CHECK_EQ(o1.isInt(), true); - subsampling_rate_ = o1.toInt(); - torch::jit::IValue o2 = model_->run_method("right_context"); - CHECK_EQ(o2.isInt(), true); - right_context_ = o2.toInt(); - torch::jit::IValue o3 = model_->run_method("sos_symbol"); - CHECK_EQ(o3.isInt(), true); - sos_ = o3.toInt(); - torch::jit::IValue o4 = model_->run_method("eos_symbol"); - CHECK_EQ(o4.isInt(), true); - eos_ = o4.toInt(); - torch::jit::IValue o5 = model_->run_method("is_bidirectional_decoder"); - CHECK_EQ(o5.isBool(), true); - is_bidirectional_decoder_ = o5.toBool(); - - VLOG(1) << "Torch Model Info:"; - VLOG(1) << "\tsubsampling_rate " << subsampling_rate_; - VLOG(1) << "\tright context " << right_context_; - VLOG(1) << "\tsos " << sos_; - VLOG(1) << "\teos " << eos_; - VLOG(1) << "\tis bidirectional decoder " << is_bidirectional_decoder_; -} - -TorchAsrModel::TorchAsrModel(const TorchAsrModel& other) { - // 1. Init the model info - right_context_ = other.right_context_; - subsampling_rate_ = other.subsampling_rate_; - sos_ = other.sos_; - eos_ = other.eos_; - is_bidirectional_decoder_ = other.is_bidirectional_decoder_; - chunk_size_ = other.chunk_size_; - num_left_chunks_ = other.num_left_chunks_; - offset_ = other.offset_; - // 2. Model copy, just copy the model ptr since: - // PyTorch allows using multiple CPU threads during TorchScript model - // inference, please see https://pytorch.org/docs/stable/notes/cpu_ - // threading_torchscript_inference.html - model_ = other.model_; - - // NOTE(Binbin Zhang): - // inner states for forward are not copied here. -} - -std::shared_ptr TorchAsrModel::Copy() const { - auto asr_model = std::make_shared(*this); - // Reset the inner states for new decoding - asr_model->Reset(); - return asr_model; -} - -void TorchAsrModel::Reset() { - offset_ = 0; - att_cache_ = std::move(torch::zeros({0, 0, 0, 0})); - cnn_cache_ = std::move(torch::zeros({0, 0, 0, 0})); - encoder_outs_.clear(); - cached_feature_.clear(); -} - -void TorchAsrModel::ForwardEncoderFunc( - const std::vector>& chunk_feats, - std::vector>* out_prob) { - // 1. Prepare libtorch required data, splice cached_feature_ and chunk_feats - // The first dimension is for batchsize, which is 1. 
- int num_frames = cached_feature_.size() + chunk_feats.size(); - const int feature_dim = chunk_feats[0].size(); - torch::Tensor feats = - torch::zeros({1, num_frames, feature_dim}, torch::kFloat); - for (size_t i = 0; i < cached_feature_.size(); ++i) { - torch::Tensor row = - torch::from_blob(const_cast(cached_feature_[i].data()), - {feature_dim}, torch::kFloat) - .clone(); - feats[0][i] = std::move(row); - } - for (size_t i = 0; i < chunk_feats.size(); ++i) { - torch::Tensor row = - torch::from_blob(const_cast(chunk_feats[i].data()), - {feature_dim}, torch::kFloat) - .clone(); - feats[0][cached_feature_.size() + i] = std::move(row); - } - - // 2. Encoder chunk forward -#ifdef USE_GPU - feats = feats.to(at::kCUDA); - att_cache_ = att_cache_.to(at::kCUDA); - cnn_cache_ = cnn_cache_.to(at::kCUDA); -#endif - int required_cache_size = chunk_size_ * num_left_chunks_; - torch::NoGradGuard no_grad; - std::vector inputs = {feats, offset_, required_cache_size, - att_cache_, cnn_cache_}; - - // Refer interfaces in wenet/transformer/asr_model.py - auto outputs = - model_->get_method("forward_encoder_chunk")(inputs).toTuple()->elements(); - CHECK_EQ(outputs.size(), 3); -#ifdef USE_GPU - torch::Tensor chunk_out = outputs[0].toTensor().to(at::kCPU); - att_cache_ = outputs[1].toTensor().to(at::kCPU); - cnn_cache_ = outputs[2].toTensor().to(at::kCPU); -#else - torch::Tensor chunk_out = outputs[0].toTensor(); - att_cache_ = outputs[1].toTensor(); - cnn_cache_ = outputs[2].toTensor(); -#endif - offset_ += chunk_out.size(1); - - // The first dimension of returned value is for batchsize, which is 1 -#ifdef USE_GPU - chunk_out = chunk_out.to(at::kCUDA); - torch::Tensor ctc_log_probs = - model_->run_method("ctc_activation", chunk_out).toTensor(); - ctc_log_probs = ctc_log_probs.to(at::kCPU)[0]; - encoder_outs_.push_back(std::move(chunk_out.to(at::kCPU))); -#else - torch::Tensor ctc_log_probs = - model_->run_method("ctc_activation", chunk_out).toTensor()[0]; - encoder_outs_.push_back(std::move(chunk_out)); -#endif - - // Copy to output - int num_outputs = ctc_log_probs.size(0); - int output_dim = ctc_log_probs.size(1); - out_prob->resize(num_outputs); - for (int i = 0; i < num_outputs; i++) { - (*out_prob)[i].resize(output_dim); - memcpy((*out_prob)[i].data(), ctc_log_probs[i].data_ptr(), - sizeof(float) * output_dim); - } -} - -float TorchAsrModel::ComputeAttentionScore(const torch::Tensor& prob, - const std::vector& hyp, - int eos) { - float score = 0.0f; - auto accessor = prob.accessor(); - for (size_t j = 0; j < hyp.size(); ++j) { - score += accessor[j][hyp[j]]; - } - score += accessor[hyp.size()][eos]; - return score; -} - -void TorchAsrModel::AttentionRescoring( - const std::vector>& hyps, float reverse_weight, - std::vector* rescoring_score) { - CHECK(rescoring_score != nullptr); - int num_hyps = hyps.size(); - rescoring_score->resize(num_hyps, 0.0f); - - if (num_hyps == 0) { - return; - } - // No encoder output - if (encoder_outs_.size() == 0) { - return; - } - - torch::NoGradGuard no_grad; - // Step 1: Prepare input for libtorch - torch::Tensor hyps_length = torch::zeros({num_hyps}, torch::kLong); - int max_hyps_len = 0; - for (size_t i = 0; i < num_hyps; ++i) { - int length = hyps[i].size() + 1; - max_hyps_len = std::max(length, max_hyps_len); - hyps_length[i] = static_cast(length); - } - torch::Tensor hyps_tensor = - torch::zeros({num_hyps, max_hyps_len}, torch::kLong); - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - hyps_tensor[i][0] = sos_; - for (size_t j = 0; j < 
hyp.size(); ++j) { - hyps_tensor[i][j + 1] = hyp[j]; - } - } - - // Step 2: Forward attention decoder by hyps and corresponding encoder_outs_ - torch::Tensor encoder_out = torch::cat(encoder_outs_, 1); -#ifdef USE_GPU - hyps_tensor = hyps_tensor.to(at::kCUDA); - hyps_length = hyps_length.to(at::kCUDA); - encoder_out = encoder_out.to(at::kCUDA); -#endif - auto outputs = model_ - ->run_method("forward_attention_decoder", hyps_tensor, - hyps_length, encoder_out, reverse_weight) - .toTuple() - ->elements(); -#ifdef USE_GPU - auto probs = outputs[0].toTensor().to(at::kCPU); - auto r_probs = outputs[1].toTensor().to(at::kCPU); -#else - auto probs = outputs[0].toTensor(); - auto r_probs = outputs[1].toTensor(); -#endif - CHECK_EQ(probs.size(0), num_hyps); - CHECK_EQ(probs.size(1), max_hyps_len); - - // Step 3: Compute rescoring score - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - float score = 0.0f; - // left-to-right decoder score - score = ComputeAttentionScore(probs[i], hyp, eos_); - // Optional: Used for right to left score - float r_score = 0.0f; - if (is_bidirectional_decoder_ && reverse_weight > 0) { - // right-to-left score - CHECK_EQ(r_probs.size(0), num_hyps); - CHECK_EQ(r_probs.size(1), max_hyps_len); - std::vector r_hyp(hyp.size()); - std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin()); - // right to left decoder score - r_score = ComputeAttentionScore(r_probs[i], r_hyp, eos_); - } - - // combined left-to-right and right-to-left score - (*rescoring_score)[i] = - score * (1 - reverse_weight) + r_score * reverse_weight; - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/torch_asr_model.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/torch_asr_model.h deleted file mode 100644 index a3cebe08798f1cad60ca4cd73c7b2488173b6114..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/decoder/torch_asr_model.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef DECODER_TORCH_ASR_MODEL_H_ -#define DECODER_TORCH_ASR_MODEL_H_ - -#include -#include -#include - -#include "torch/script.h" -#ifndef IOS -#include "torch/torch.h" -#endif - -#include "decoder/asr_model.h" -#include "utils/utils.h" - -namespace wenet { - -class TorchAsrModel : public AsrModel { - public: -#ifndef IOS - static void InitEngineThreads(int num_threads = 1); -#endif - - public: - using TorchModule = torch::jit::script::Module; - TorchAsrModel() = default; - TorchAsrModel(const TorchAsrModel& other); - void Read(const std::string& model_path); - std::shared_ptr torch_model() const { return model_; } - void Reset() override; - void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) override; - std::shared_ptr Copy() const override; - - protected: - void ForwardEncoderFunc(const std::vector>& chunk_feats, - std::vector>* ctc_prob) override; - - float ComputeAttentionScore(const torch::Tensor& prob, - const std::vector& hyp, int eos); - - private: - std::shared_ptr model_ = nullptr; - std::vector encoder_outs_; - // transformer/conformer attention cache - torch::Tensor att_cache_ = torch::zeros({0, 0, 0, 0}); - // conformer-only conv_module cache - torch::Tensor cnn_cache_ = torch::zeros({0, 0, 0, 0}); -}; - -} // namespace wenet - -#endif // DECODER_TORCH_ASR_MODEL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/frontend/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/frontend/CMakeLists.txt deleted file mode 100644 index 78872257e43bb9a6ffcedaae977bf0173817ae50..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/frontend/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -add_library(frontend STATIC - feature_pipeline.cc - fft.cc -) -target_link_libraries(frontend PUBLIC utils) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/frontend/fbank.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/frontend/fbank.h deleted file mode 100644 index 5a650dc035b8e244388cc1f2e0b9512654de7fda..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/frontend/fbank.h +++ /dev/null @@ -1,218 +0,0 @@ -// Copyright (c) 2017 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef FRONTEND_FBANK_H_ -#define FRONTEND_FBANK_H_ - -#include -#include -#include -#include -#include - -#include "frontend/fft.h" -#include "utils/log.h" - -namespace wenet { - -// This code is based on kaldi Fbank implementation, please see -// https://github.com/kaldi-asr/kaldi/blob/master/src/feat/feature-fbank.cc -class Fbank { - public: - Fbank(int num_bins, int sample_rate, int frame_length, int frame_shift) - : num_bins_(num_bins), - sample_rate_(sample_rate), - frame_length_(frame_length), - frame_shift_(frame_shift), - use_log_(true), - remove_dc_offset_(true), - generator_(0), - distribution_(0, 1.0), - dither_(0.0) { - fft_points_ = UpperPowerOfTwo(frame_length_); - // generate bit reversal table and trigonometric function table - const int fft_points_4 = fft_points_ / 4; - bitrev_.resize(fft_points_); - sintbl_.resize(fft_points_ + fft_points_4); - make_sintbl(fft_points_, sintbl_.data()); - make_bitrev(fft_points_, bitrev_.data()); - - int num_fft_bins = fft_points_ / 2; - float fft_bin_width = static_cast(sample_rate_) / fft_points_; - int low_freq = 20, high_freq = sample_rate_ / 2; - float mel_low_freq = MelScale(low_freq); - float mel_high_freq = MelScale(high_freq); - float mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1); - bins_.resize(num_bins_); - center_freqs_.resize(num_bins_); - for (int bin = 0; bin < num_bins; ++bin) { - float left_mel = mel_low_freq + bin * mel_freq_delta, - center_mel = mel_low_freq + (bin + 1) * mel_freq_delta, - right_mel = mel_low_freq + (bin + 2) * mel_freq_delta; - center_freqs_[bin] = InverseMelScale(center_mel); - std::vector this_bin(num_fft_bins); - int first_index = -1, last_index = -1; - for (int i = 0; i < num_fft_bins; ++i) { - float freq = (fft_bin_width * i); // Center frequency of this fft - // bin. 
- float mel = MelScale(freq); - if (mel > left_mel && mel < right_mel) { - float weight; - if (mel <= center_mel) - weight = (mel - left_mel) / (center_mel - left_mel); - else - weight = (right_mel - mel) / (right_mel - center_mel); - this_bin[i] = weight; - if (first_index == -1) first_index = i; - last_index = i; - } - } - CHECK(first_index != -1 && last_index >= first_index); - bins_[bin].first = first_index; - int size = last_index + 1 - first_index; - bins_[bin].second.resize(size); - for (int i = 0; i < size; ++i) { - bins_[bin].second[i] = this_bin[first_index + i]; - } - } - - // povey window - povey_window_.resize(frame_length_); - double a = M_2PI / (frame_length - 1); - for (int i = 0; i < frame_length; ++i) { - povey_window_[i] = pow(0.5 - 0.5 * cos(a * i), 0.85); - } - } - - void set_use_log(bool use_log) { use_log_ = use_log; } - - void set_remove_dc_offset(bool remove_dc_offset) { - remove_dc_offset_ = remove_dc_offset; - } - - void set_dither(float dither) { dither_ = dither; } - - int num_bins() const { return num_bins_; } - - static inline float InverseMelScale(float mel_freq) { - return 700.0f * (expf(mel_freq / 1127.0f) - 1.0f); - } - - static inline float MelScale(float freq) { - return 1127.0f * logf(1.0f + freq / 700.0f); - } - - static int UpperPowerOfTwo(int n) { - return static_cast(pow(2, ceil(log(n) / log(2)))); - } - - // pre emphasis - void PreEmphasis(float coeff, std::vector* data) const { - if (coeff == 0.0) return; - for (int i = data->size() - 1; i > 0; i--) - (*data)[i] -= coeff * (*data)[i - 1]; - (*data)[0] -= coeff * (*data)[0]; - } - - // Apply povey window on data in place - void Povey(std::vector* data) const { - CHECK_GE(data->size(), povey_window_.size()); - for (size_t i = 0; i < povey_window_.size(); ++i) { - (*data)[i] *= povey_window_[i]; - } - } - - // Compute fbank feat, return num frames - int Compute(const std::vector& wave, - std::vector>* feat) { - int num_samples = wave.size(); - if (num_samples < frame_length_) return 0; - int num_frames = 1 + ((num_samples - frame_length_) / frame_shift_); - feat->resize(num_frames); - std::vector fft_real(fft_points_, 0), fft_img(fft_points_, 0); - std::vector power(fft_points_ / 2); - for (int i = 0; i < num_frames; ++i) { - std::vector data(wave.data() + i * frame_shift_, - wave.data() + i * frame_shift_ + frame_length_); - // optional add noise - if (dither_ != 0.0) { - for (size_t j = 0; j < data.size(); ++j) - data[j] += dither_ * distribution_(generator_); - } - // optinal remove dc offset - if (remove_dc_offset_) { - float mean = 0.0; - for (size_t j = 0; j < data.size(); ++j) mean += data[j]; - mean /= data.size(); - for (size_t j = 0; j < data.size(); ++j) data[j] -= mean; - } - - PreEmphasis(0.97, &data); - Povey(&data); - // copy data to fft_real - memset(fft_img.data(), 0, sizeof(float) * fft_points_); - memset(fft_real.data() + frame_length_, 0, - sizeof(float) * (fft_points_ - frame_length_)); - memcpy(fft_real.data(), data.data(), sizeof(float) * frame_length_); - fft(bitrev_.data(), sintbl_.data(), fft_real.data(), fft_img.data(), - fft_points_); - // power - for (int j = 0; j < fft_points_ / 2; ++j) { - power[j] = fft_real[j] * fft_real[j] + fft_img[j] * fft_img[j]; - } - - (*feat)[i].resize(num_bins_); - // cepstral coefficients, triangle filter array - for (int j = 0; j < num_bins_; ++j) { - float mel_energy = 0.0; - int s = bins_[j].first; - for (size_t k = 0; k < bins_[j].second.size(); ++k) { - mel_energy += bins_[j].second[k] * power[s + k]; - } - // optional use log - if 
(use_log_) { - if (mel_energy < std::numeric_limits::epsilon()) - mel_energy = std::numeric_limits::epsilon(); - mel_energy = logf(mel_energy); - } - - (*feat)[i][j] = mel_energy; - } - } - return num_frames; - } - - private: - int num_bins_; - int sample_rate_; - int frame_length_, frame_shift_; - int fft_points_; - bool use_log_; - bool remove_dc_offset_; - std::vector center_freqs_; - std::vector>> bins_; - std::vector povey_window_; - std::default_random_engine generator_; - std::normal_distribution distribution_; - float dither_; - - // bit reversal table - std::vector bitrev_; - // trigonometric function table - std::vector sintbl_; -}; - -} // namespace wenet - -#endif // FRONTEND_FBANK_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/frontend/feature_pipeline.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/frontend/feature_pipeline.cc deleted file mode 100644 index ab450b15cd35ebd8101a3bcdec4f963a73bed10c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/frontend/feature_pipeline.cc +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2017 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "frontend/feature_pipeline.h" - -#include -#include - -namespace wenet { - -FeaturePipeline::FeaturePipeline(const FeaturePipelineConfig& config) - : config_(config), - feature_dim_(config.num_bins), - fbank_(config.num_bins, config.sample_rate, config.frame_length, - config.frame_shift), - num_frames_(0), - input_finished_(false) {} - -void FeaturePipeline::AcceptWaveform(const float* pcm, const int size) { - std::vector> feats; - std::vector waves; - waves.insert(waves.end(), remained_wav_.begin(), remained_wav_.end()); - waves.insert(waves.end(), pcm, pcm + size); - int num_frames = fbank_.Compute(waves, &feats); - feature_queue_.Push(std::move(feats)); - num_frames_ += num_frames; - - int left_samples = waves.size() - config_.frame_shift * num_frames; - remained_wav_.resize(left_samples); - std::copy(waves.begin() + config_.frame_shift * num_frames, waves.end(), - remained_wav_.begin()); - // We are still adding wave, notify input is not finished - finish_condition_.notify_one(); -} - -void FeaturePipeline::AcceptWaveform(const int16_t* pcm, const int size) { - auto* float_pcm = new float[size]; - for (size_t i = 0; i < size; i++) { - float_pcm[i] = static_cast(pcm[i]); - } - this->AcceptWaveform(float_pcm, size); - delete[] float_pcm; -} - -void FeaturePipeline::set_input_finished() { - CHECK(!input_finished_); - { - std::lock_guard lock(mutex_); - input_finished_ = true; - } - finish_condition_.notify_one(); -} - -bool FeaturePipeline::ReadOne(std::vector* feat) { - if (!feature_queue_.Empty()) { - *feat = std::move(feature_queue_.Pop()); - return true; - } else { - std::unique_lock lock(mutex_); - while (!input_finished_) { - // This will release the lock and wait for notify_one() - // from AcceptWaveform() or 
set_input_finished() - finish_condition_.wait(lock); - if (!feature_queue_.Empty()) { - *feat = std::move(feature_queue_.Pop()); - return true; - } - } - CHECK(input_finished_); - // Double check queue.empty, see issue#893 for detailed discussions. - if (!feature_queue_.Empty()) { - *feat = std::move(feature_queue_.Pop()); - return true; - } else { - return false; - } - } -} - -bool FeaturePipeline::Read(int num_frames, - std::vector>* feats) { - feats->clear(); - if (feature_queue_.Size() >= num_frames) { - *feats = std::move(feature_queue_.Pop(num_frames)); - return true; - } else { - std::unique_lock lock(mutex_); - while (!input_finished_) { - // This will release the lock and wait for notify_one() - // from AcceptWaveform() or set_input_finished() - finish_condition_.wait(lock); - if (feature_queue_.Size() >= num_frames) { - *feats = std::move(feature_queue_.Pop(num_frames)); - return true; - } - } - CHECK(input_finished_); - // Double check queue.empty, see issue#893 for detailed discussions. - if (feature_queue_.Size() >= num_frames) { - *feats = std::move(feature_queue_.Pop(num_frames)); - return true; - } else { - *feats = std::move(feature_queue_.Pop(feature_queue_.Size())); - return false; - } - } -} - -void FeaturePipeline::Reset() { - input_finished_ = false; - num_frames_ = 0; - remained_wav_.clear(); - feature_queue_.Clear(); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/frontend/feature_pipeline.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/frontend/feature_pipeline.h deleted file mode 100644 index 9918d6b573255795e0e665f0a9598c44be625c19..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/frontend/feature_pipeline.h +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) 2017 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef FRONTEND_FEATURE_PIPELINE_H_ -#define FRONTEND_FEATURE_PIPELINE_H_ - -#include -#include -#include -#include - -#include "frontend/fbank.h" -#include "utils/blocking_queue.h" -#include "utils/log.h" - -namespace wenet { - -struct FeaturePipelineConfig { - int num_bins; - int sample_rate; - int frame_length; - int frame_shift; - FeaturePipelineConfig(int num_bins, int sample_rate) - : num_bins(num_bins), // 80 dim fbank - sample_rate(sample_rate) { // 16k sample rate - frame_length = sample_rate / 1000 * 25; // frame length 25ms - frame_shift = sample_rate / 1000 * 10; // frame shift 10ms - } - - void Info() const { - LOG(INFO) << "feature pipeline config" - << " num_bins " << num_bins << " frame_length " << frame_length - << " frame_shift " << frame_shift; - } -}; - -// Typically, FeaturePipeline is used in two threads: one thread A calls -// AcceptWaveform() to add raw wav data and set_input_finished() to notice -// the end of input wav, another thread B (decoder thread) calls Read() to -// consume features.So a BlockingQueue is used to make this class thread safe. 
- -// The Read() is designed as a blocking method when there is no feature -// in feature_queue_ and the input is not finished. - -// See bin/decoder_main.cc, websocket/websocket_server.cc and -// decoder/torch_asr_decoder.cc for usage - -class FeaturePipeline { - public: - explicit FeaturePipeline(const FeaturePipelineConfig& config); - - // The feature extraction is done in AcceptWaveform(). - void AcceptWaveform(const float* pcm, const int size); - void AcceptWaveform(const int16_t* pcm, const int size); - - // Current extracted frames number. - int num_frames() const { return num_frames_; } - int feature_dim() const { return feature_dim_; } - const FeaturePipelineConfig& config() const { return config_; } - - // The caller should call this method when speech input is end. - // Never call AcceptWaveform() after calling set_input_finished() ! - void set_input_finished(); - bool input_finished() const { return input_finished_; } - - // Return False if input is finished and no feature could be read. - // Return True if a feature is read. - // This function is a blocking method. It will block the thread when - // there is no feature in feature_queue_ and the input is not finished. - bool ReadOne(std::vector* feat); - - // Read #num_frames frame features. - // Return False if less than #num_frames features are read and the - // input is finished. - // Return True if #num_frames features are read. - // This function is a blocking method when there is no feature - // in feature_queue_ and the input is not finished. - bool Read(int num_frames, std::vector>* feats); - - void Reset(); - bool IsLastFrame(int frame) const { - return input_finished_ && (frame == num_frames_ - 1); - } - - int NumQueuedFrames() const { return feature_queue_.Size(); } - - private: - const FeaturePipelineConfig& config_; - int feature_dim_; - Fbank fbank_; - - BlockingQueue> feature_queue_; - int num_frames_; - bool input_finished_; - - // The feature extraction is done in AcceptWaveform(). - // This waveform sample points are consumed by frame size. - // The residual waveform sample points after framing are - // kept to be used in next AcceptWaveform() calling. - std::vector remained_wav_; - - // Used to block the Read when there is no feature in feature_queue_ - // and the input is not finished. - mutable std::mutex mutex_; - std::condition_variable finish_condition_; -}; - -} // namespace wenet - -#endif // FRONTEND_FEATURE_PIPELINE_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/frontend/fft.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/frontend/fft.cc deleted file mode 100644 index 9e05f854e79ea733d0411045385e924c2670b7f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/frontend/fft.cc +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright (c) 2016 Network -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include -#include -#include - -#include "frontend/fft.h" - -namespace wenet { - -void make_sintbl(int n, float* sintbl) { - int i, n2, n4, n8; - float c, s, dc, ds, t; - - n2 = n / 2; - n4 = n / 4; - n8 = n / 8; - t = sin(M_PI / n); - dc = 2 * t * t; - ds = sqrt(dc * (2 - dc)); - t = 2 * dc; - c = sintbl[n4] = 1; - s = sintbl[0] = 0; - for (i = 1; i < n8; ++i) { - c -= dc; - dc += t * c; - s += ds; - ds -= t * s; - sintbl[i] = s; - sintbl[n4 - i] = c; - } - if (n8 != 0) sintbl[n8] = sqrt(0.5); - for (i = 0; i < n4; ++i) sintbl[n2 - i] = sintbl[i]; - for (i = 0; i < n2 + n4; ++i) sintbl[i + n2] = -sintbl[i]; -} - -void make_bitrev(int n, int* bitrev) { - int i, j, k, n2; - - n2 = n / 2; - i = j = 0; - for (;;) { - bitrev[i] = j; - if (++i >= n) break; - k = n2; - while (k <= j) { - j -= k; - k /= 2; - } - j += k; - } -} - -// bitrev: bit reversal table -// sintbl: trigonometric function table -// x:real part -// y:image part -// n: fft length -int fft(const int* bitrev, const float* sintbl, float* x, float* y, int n) { - int i, j, k, ik, h, d, k2, n4, inverse; - float t, s, c, dx, dy; - - /* preparation */ - if (n < 0) { - n = -n; - inverse = 1; /* inverse transform */ - } else { - inverse = 0; - } - n4 = n / 4; - if (n == 0) { - return 0; - } - - /* bit reversal */ - for (i = 0; i < n; ++i) { - j = bitrev[i]; - if (i < j) { - t = x[i]; - x[i] = x[j]; - x[j] = t; - t = y[i]; - y[i] = y[j]; - y[j] = t; - } - } - - /* transformation */ - for (k = 1; k < n; k = k2) { - h = 0; - k2 = k + k; - d = n / k2; - for (j = 0; j < k; ++j) { - c = sintbl[h + n4]; - if (inverse) - s = -sintbl[h]; - else - s = sintbl[h]; - for (i = j; i < n; i += k2) { - ik = i + k; - dx = s * y[ik] + c * x[ik]; - dy = c * y[ik] - s * x[ik]; - x[ik] = x[i] - dx; - x[i] += dx; - y[ik] = y[i] - dy; - y[i] += dy; - } - h += d; - } - } - if (inverse) { - /* divide by n in case of the inverse transformation */ - for (i = 0; i < n; ++i) { - x[i] /= n; - y[i] /= n; - } - } - return 0; /* finished successfully */ -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/frontend/fft.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/frontend/fft.h deleted file mode 100644 index 6b92e406c44b4768eaee6e734f55bb39cd9af28b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/frontend/fft.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2016 Network -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#ifndef FRONTEND_FFT_H_ -#define FRONTEND_FFT_H_ - -#ifndef M_PI -#define M_PI 3.1415926535897932384626433832795 -#endif -#ifndef M_2PI -#define M_2PI 6.283185307179586476925286766559005 -#endif - -namespace wenet { - -// Fast Fourier Transform - -void make_sintbl(int n, float* sintbl); - -void make_bitrev(int n, int* bitrev); - -int fft(const int* bitrev, const float* sintbl, float* x, float* y, int n); - -} // namespace wenet - -#endif // FRONTEND_FFT_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/frontend/wav.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/frontend/wav.h deleted file mode 100644 index 688a049a940ebbdc83f24e59134fff22b7b09bfd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/frontend/wav.h +++ /dev/null @@ -1,241 +0,0 @@ -// Copyright (c) 2016 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef FRONTEND_WAV_H_ -#define FRONTEND_WAV_H_ - -#include -#include -#include -#include -#include - -#include - -#include "utils/log.h" - -namespace wenet { - -struct WavHeader { - char riff[4] = {'R', 'I', 'F', 'F'}; - unsigned int size = 0; - char wav[4] = {'W', 'A', 'V', 'E'}; - char fmt[4] = {'f', 'm', 't', ' '}; - unsigned int fmt_size = 16; - uint16_t format = 1; - uint16_t channels = 0; - unsigned int sample_rate = 0; - unsigned int bytes_per_second = 0; - uint16_t block_size = 0; - uint16_t bit = 0; - char data[4] = {'d', 'a', 't', 'a'}; - unsigned int data_size = 0; - - WavHeader() {} - - WavHeader(int num_samples, int num_channel, int sample_rate, - int bits_per_sample) { - data_size = num_samples * num_channel * (bits_per_sample / 8); - size = sizeof(WavHeader) - 8 + data_size; - channels = num_channel; - this->sample_rate = sample_rate; - bytes_per_second = sample_rate * num_channel * (bits_per_sample / 8); - block_size = num_channel * (bits_per_sample / 8); - bit = bits_per_sample; - } -}; - -class WavReader { - public: - WavReader() : data_(nullptr) {} - explicit WavReader(const std::string& filename) { Open(filename); } - - bool Open(const std::string& filename) { - FILE* fp = fopen(filename.c_str(), "rb"); - if (NULL == fp) { - LOG(WARNING) << "Error in read " << filename; - return false; - } - - WavHeader header; - fread(&header, 1, sizeof(header), fp); - if (header.fmt_size < 16) { - fprintf(stderr, - "WaveData: expect PCM format data " - "to have fmt chunk of at least size 16.\n"); - return false; - } else if (header.fmt_size > 16) { - int offset = 44 - 8 + header.fmt_size - 16; - fseek(fp, offset, SEEK_SET); - fread(header.data, 8, sizeof(char), fp); - } - // check "RIFF" "WAVE" "fmt " "data" - - // Skip any sub-chunks between "fmt" and "data". Usually there will - // be a single "fact" sub chunk, but on Windows there can also be a - // "list" sub chunk. - while (0 != strncmp(header.data, "data", 4)) { - // We will just ignore the data in these chunks. 
- fseek(fp, header.data_size, SEEK_CUR); - // read next sub chunk - fread(header.data, 8, sizeof(char), fp); - } - - num_channel_ = header.channels; - sample_rate_ = header.sample_rate; - bits_per_sample_ = header.bit; - int num_data = header.data_size / (bits_per_sample_ / 8); - data_ = new float[num_data]; - num_samples_ = num_data / num_channel_; - - for (int i = 0; i < num_data; ++i) { - switch (bits_per_sample_) { - case 8: { - char sample; - fread(&sample, 1, sizeof(char), fp); - data_[i] = static_cast(sample); - break; - } - case 16: { - int16_t sample; - fread(&sample, 1, sizeof(int16_t), fp); - data_[i] = static_cast(sample); - break; - } - case 32: { - int sample; - fread(&sample, 1, sizeof(int), fp); - data_[i] = static_cast(sample); - break; - } - default: - fprintf(stderr, "unsupported quantization bits"); - exit(1); - } - } - fclose(fp); - return true; - } - - int num_channel() const { return num_channel_; } - int sample_rate() const { return sample_rate_; } - int bits_per_sample() const { return bits_per_sample_; } - int num_samples() const { return num_samples_; } - - ~WavReader() { - delete[] data_; - } - - const float* data() const { return data_; } - - private: - int num_channel_; - int sample_rate_; - int bits_per_sample_; - int num_samples_; // sample points per channel - float* data_; -}; - -class WavWriter { - public: - WavWriter(const float* data, int num_samples, int num_channel, - int sample_rate, int bits_per_sample) - : data_(data), - num_samples_(num_samples), - num_channel_(num_channel), - sample_rate_(sample_rate), - bits_per_sample_(bits_per_sample) {} - - void Write(const std::string& filename) { - FILE* fp = fopen(filename.c_str(), "wb"); - WavHeader header(num_samples_, num_channel_, sample_rate_, - bits_per_sample_); - fwrite(&header, 1, sizeof(header), fp); - - for (int i = 0; i < num_samples_; ++i) { - for (int j = 0; j < num_channel_; ++j) { - switch (bits_per_sample_) { - case 8: { - char sample = static_cast(data_[i * num_channel_ + j]); - fwrite(&sample, 1, sizeof(sample), fp); - break; - } - case 16: { - int16_t sample = static_cast(data_[i * num_channel_ + j]); - fwrite(&sample, 1, sizeof(sample), fp); - break; - } - case 32: { - int sample = static_cast(data_[i * num_channel_ + j]); - fwrite(&sample, 1, sizeof(sample), fp); - break; - } - } - } - } - fclose(fp); - } - - private: - const float* data_; - int num_samples_; // total float points in data_ - int num_channel_; - int sample_rate_; - int bits_per_sample_; -}; - -class StreamWavWriter { - public: - StreamWavWriter(int num_channel, int sample_rate, int bits_per_sample) - : num_channel_(num_channel), - sample_rate_(sample_rate), - bits_per_sample_(bits_per_sample), - total_num_samples_(0) {} - - StreamWavWriter(const std::string& filename, int num_channel, - int sample_rate, int bits_per_sample) - : StreamWavWriter(num_channel, sample_rate, bits_per_sample) { - Open(filename); - } - - void Open(const std::string& filename) { - fp_ = fopen(filename.c_str(), "wb"); - fseek(fp_, sizeof(WavHeader), SEEK_SET); - } - - void Write(const int16_t* sample_data, size_t num_samples) { - fwrite(sample_data, sizeof(int16_t), num_samples, fp_); - total_num_samples_ += num_samples; - } - - void Close() { - WavHeader header(total_num_samples_, num_channel_, sample_rate_, - bits_per_sample_); - fseek(fp_, 0L, SEEK_SET); - fwrite(&header, 1, sizeof(header), fp_); - fclose(fp_); - } - - private: - FILE* fp_; - int num_channel_; - int sample_rate_; - int bits_per_sample_; - size_t total_num_samples_; -}; - -} 
// namespace wenet - -#endif // FRONTEND_WAV_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/grpc/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/grpc/CMakeLists.txt deleted file mode 100644 index 2a152dd0d38cdc17d2758d7dbd542cd974d5f0c6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/grpc/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -# compile wenet.proto -set(PROTO_DIR "${CMAKE_CURRENT_SOURCE_DIR}") -add_custom_command( - OUTPUT ${PROTO_DIR}/wenet.pb.cc - ${PROTO_DIR}/wenet.pb.h - ${PROTO_DIR}/wenet.grpc.pb.cc - ${PROTO_DIR}/wenet.grpc.pb.h - COMMAND ${protobuf_BINARY_DIR}/protoc - ARGS --grpc_out "${PROTO_DIR}" - --cpp_out "${PROTO_DIR}" - -I "${PROTO_DIR}" - --plugin=protoc-gen-grpc=${grpc_BINARY_DIR}/grpc_cpp_plugin - wenet.proto) - -# grpc_server/client -link_directories(${protobuf_BINARY_DIR}/lib) -add_library(wenet_grpc STATIC - grpc_client.cc - grpc_server.cc - wenet.pb.cc - wenet.grpc.pb.cc -) -target_link_libraries(wenet_grpc PUBLIC grpc++ grpc++_reflection decoder) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/grpc/grpc_client.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/grpc/grpc_client.cc deleted file mode 100644 index 7a2e3f6f384980b6566468213d3eead43a404070..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/grpc/grpc_client.cc +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "grpc/grpc_client.h" - -#include "utils/log.h" - -namespace wenet { -using grpc::Channel; -using grpc::ClientContext; -using grpc::ClientReaderWriter; -using grpc::Status; -using wenet::Request; -using wenet::Response; - -GrpcClient::GrpcClient(const std::string& host, int port, int nbest, - bool continuous_decoding) - : host_(host), - port_(port), - nbest_(nbest), - continuous_decoding_(continuous_decoding) { - Connect(); - t_.reset(new std::thread(&GrpcClient::ReadLoopFunc, this)); -} - -void GrpcClient::Connect() { - channel_ = grpc::CreateChannel(host_ + ":" + std::to_string(port_), - grpc::InsecureChannelCredentials()); - stub_ = ASR::NewStub(channel_); - context_ = std::make_shared(); - stream_ = stub_->Recognize(context_.get()); - request_ = std::make_shared(); - response_ = std::make_shared(); - request_->mutable_decode_config()->set_nbest_config(nbest_); - request_->mutable_decode_config()->set_continuous_decoding_config( - continuous_decoding_); - stream_->Write(*request_); -} - -void GrpcClient::SendBinaryData(const void* data, size_t size) { - const int16_t* pdata = reinterpret_cast(data); - request_->set_audio_data(pdata, size); - stream_->Write(*request_); -} - -void GrpcClient::ReadLoopFunc() { - try { - while (stream_->Read(response_.get())) { - for (int i = 0; i < response_->nbest_size(); i++) { - // you can also traverse wordpieces like demonstrated above - LOG(INFO) << i + 1 << "best " << response_->nbest(i).sentence(); - } - if (response_->status() != Response_Status_ok) { - break; - } - if (response_->type() == Response_Type_speech_end) { - done_ = true; - break; - } - } - } catch (std::exception const& e) { - LOG(ERROR) << e.what(); - } -} - -void GrpcClient::Join() { - stream_->WritesDone(); - t_->join(); - Status status = stream_->Finish(); - if (!status.ok()) { - LOG(INFO) << "Recognize rpc failed."; - } -} -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/grpc/grpc_client.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/grpc/grpc_client.h deleted file mode 100644 index 36e36a0f5f5ec5bbb818009fe931e863eaa7fd60..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/grpc/grpc_client.h +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef GRPC_GRPC_CLIENT_H_ -#define GRPC_GRPC_CLIENT_H_ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "grpc/wenet.grpc.pb.h" -#include "utils/utils.h" - -namespace wenet { - -using grpc::Channel; -using grpc::ClientContext; -using grpc::ClientReaderWriter; -using wenet::ASR; -using wenet::Request; -using wenet::Response; - -class GrpcClient { - public: - GrpcClient(const std::string& host, int port, int nbest, - bool continuous_decoding); - - void SendBinaryData(const void* data, size_t size); - void ReadLoopFunc(); - void Join(); - bool done() const { return done_; } - - private: - void Connect(); - std::string host_; - int port_; - std::shared_ptr channel_{nullptr}; - std::unique_ptr stub_{nullptr}; - std::shared_ptr context_{nullptr}; - std::unique_ptr> stream_{nullptr}; - std::shared_ptr request_{nullptr}; - std::shared_ptr response_{nullptr}; - int nbest_ = 1; - bool continuous_decoding_ = false; - bool done_ = false; - std::unique_ptr t_{nullptr}; - - WENET_DISALLOW_COPY_AND_ASSIGN(GrpcClient); -}; - -} // namespace wenet - -#endif // GRPC_GRPC_CLIENT_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/grpc/grpc_server.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/grpc/grpc_server.cc deleted file mode 100644 index 26268bc02a2f2ea56bb24a1eb379a565f693429a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/grpc/grpc_server.cc +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "grpc/grpc_server.h" - -namespace wenet { - -using grpc::ServerReaderWriter; -using wenet::Request; -using wenet::Response; - -GrpcConnectionHandler::GrpcConnectionHandler( - ServerReaderWriter* stream, - std::shared_ptr request, std::shared_ptr response, - std::shared_ptr feature_config, - std::shared_ptr decode_config, - std::shared_ptr decode_resource) - : stream_(std::move(stream)), - request_(std::move(request)), - response_(std::move(response)), - feature_config_(std::move(feature_config)), - decode_config_(std::move(decode_config)), - decode_resource_(std::move(decode_resource)) {} - -void GrpcConnectionHandler::OnSpeechStart() { - LOG(INFO) << "Received speech start signal, start reading speech"; - got_start_tag_ = true; - response_->set_status(Response::ok); - response_->set_type(Response::server_ready); - stream_->Write(*response_); - feature_pipeline_ = std::make_shared(*feature_config_); - decoder_ = std::make_shared(feature_pipeline_, decode_resource_, - *decode_config_); - // Start decoder thread - decode_thread_ = std::make_shared( - &GrpcConnectionHandler::DecodeThreadFunc, this); -} - -void GrpcConnectionHandler::OnSpeechEnd() { - LOG(INFO) << "Received speech end signal"; - CHECK(feature_pipeline_ != nullptr); - feature_pipeline_->set_input_finished(); - got_end_tag_ = true; -} - -void GrpcConnectionHandler::OnPartialResult() { - LOG(INFO) << "Partial result"; - response_->set_status(Response::ok); - response_->set_type(Response::partial_result); - stream_->Write(*response_); -} - -void GrpcConnectionHandler::OnFinalResult() { - LOG(INFO) << "Final result"; - response_->set_status(Response::ok); - response_->set_type(Response::final_result); - stream_->Write(*response_); -} - -void GrpcConnectionHandler::OnFinish() { - // Send finish tag - response_->set_status(Response::ok); - response_->set_type(Response::speech_end); - stream_->Write(*response_); -} - -void GrpcConnectionHandler::OnSpeechData() { - // Read binary PCM data - const int16_t* pcm_data = - reinterpret_cast(request_->audio_data().c_str()); - int num_samples = request_->audio_data().length() / sizeof(int16_t); - VLOG(2) << "Received " << num_samples << " samples"; - CHECK(feature_pipeline_ != nullptr); - CHECK(decoder_ != nullptr); - feature_pipeline_->AcceptWaveform(pcm_data, num_samples); -} - -void GrpcConnectionHandler::SerializeResult(bool finish) { - for (const DecodeResult& path : decoder_->result()) { - Response_OneBest* one_best_ = response_->add_nbest(); - one_best_->set_sentence(path.sentence); - if (finish) { - for (const WordPiece& word_piece : path.word_pieces) { - Response_OnePiece* one_piece_ = one_best_->add_wordpieces(); - one_piece_->set_word(word_piece.word); - one_piece_->set_start(word_piece.start); - one_piece_->set_end(word_piece.end); - } - } - if (response_->nbest_size() == nbest_) { - break; - } - } - return; -} - -void GrpcConnectionHandler::DecodeThreadFunc() { - while (true) { - DecodeState state = decoder_->Decode(); - response_->clear_status(); - response_->clear_type(); - response_->clear_nbest(); - if (state == DecodeState::kEndFeats) { - decoder_->Rescoring(); - SerializeResult(true); - OnFinalResult(); - OnFinish(); - stop_recognition_ = true; - break; - } else if (state == DecodeState::kEndpoint) { - decoder_->Rescoring(); - SerializeResult(true); - OnFinalResult(); - // If it's not continuous decoding, continue to do next recognition - // otherwise stop the recognition - if (continuous_decoding_) { - decoder_->ResetContinuousDecoding(); - } else { - 
OnFinish(); - stop_recognition_ = true; - break; - } - } else { - if (decoder_->DecodedSomething()) { - SerializeResult(false); - OnPartialResult(); - } - } - } -} - -void GrpcConnectionHandler::operator()() { - try { - while (stream_->Read(request_.get())) { - if (!got_start_tag_) { - nbest_ = request_->decode_config().nbest_config(); - continuous_decoding_ = - request_->decode_config().continuous_decoding_config(); - OnSpeechStart(); - } else { - OnSpeechData(); - } - } - OnSpeechEnd(); - LOG(INFO) << "Read all pcm data, wait for decoding thread"; - if (decode_thread_ != nullptr) { - decode_thread_->join(); - } - } catch (std::exception const& e) { - LOG(ERROR) << e.what(); - } -} - -Status GrpcServer::Recognize(ServerContext* context, - ServerReaderWriter* stream) { - LOG(INFO) << "Get Recognize request" << std::endl; - auto request = std::make_shared(); - auto response = std::make_shared(); - GrpcConnectionHandler handler(stream, request, response, feature_config_, - decode_config_, decode_resource_); - std::thread t(std::move(handler)); - t.join(); - return Status::OK; -} -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/grpc/grpc_server.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/grpc/grpc_server.h deleted file mode 100644 index 3ab47ce5b15897c2a596d8ef27f2e7c4f8d26a3f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/grpc/grpc_server.h +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef GRPC_GRPC_SERVER_H_ -#define GRPC_GRPC_SERVER_H_ - -#include -#include -#include -#include -#include -#include - -#include "decoder/asr_decoder.h" -#include "frontend/feature_pipeline.h" -#include "utils/log.h" - -#include "grpc/wenet.grpc.pb.h" - -namespace wenet { - -using grpc::ServerContext; -using grpc::ServerReaderWriter; -using grpc::Status; -using wenet::ASR; -using wenet::Request; -using wenet::Response; - -class GrpcConnectionHandler { - public: - GrpcConnectionHandler(ServerReaderWriter* stream, - std::shared_ptr request, - std::shared_ptr response, - std::shared_ptr feature_config, - std::shared_ptr decode_config, - std::shared_ptr decode_resource); - void operator()(); - - private: - void OnSpeechStart(); - void OnSpeechEnd(); - void OnFinish(); - void OnSpeechData(); - void OnPartialResult(); - void OnFinalResult(); - void DecodeThreadFunc(); - void SerializeResult(bool finish); - - bool continuous_decoding_ = false; - int nbest_ = 1; - ServerReaderWriter* stream_; - std::shared_ptr request_; - std::shared_ptr response_; - std::shared_ptr feature_config_; - std::shared_ptr decode_config_; - std::shared_ptr decode_resource_; - - bool got_start_tag_ = false; - bool got_end_tag_ = false; - // When endpoint is detected, stop recognition, and stop receiving data. 
- bool stop_recognition_ = false; - std::shared_ptr feature_pipeline_ = nullptr; - std::shared_ptr decoder_ = nullptr; - std::shared_ptr decode_thread_ = nullptr; -}; - -class GrpcServer final : public ASR::Service { - public: - GrpcServer(std::shared_ptr feature_config, - std::shared_ptr decode_config, - std::shared_ptr decode_resource) - : feature_config_(std::move(feature_config)), - decode_config_(std::move(decode_config)), - decode_resource_(std::move(decode_resource)) {} - Status Recognize(ServerContext* context, - ServerReaderWriter* reader) override; - - private: - std::shared_ptr feature_config_; - std::shared_ptr decode_config_; - std::shared_ptr decode_resource_; - DISALLOW_COPY_AND_ASSIGN(GrpcServer); -}; - -} // namespace wenet - -#endif // GRPC_GRPC_SERVER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/grpc/wenet.proto b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/grpc/wenet.proto deleted file mode 100644 index 4c3033c034c513611c9159ff9db42b225be2cc98..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/grpc/wenet.proto +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-syntax = "proto3"; - -option java_package = "ex.grpc"; -option objc_class_prefix = "wenet"; - -package wenet; - -service ASR { - rpc Recognize (stream Request) returns (stream Response) {} -} - -message Request { - - message DecodeConfig { - int32 nbest_config = 1; - bool continuous_decoding_config = 2; - } - - oneof RequestPayload { - DecodeConfig decode_config = 1; - bytes audio_data = 2; - } -} - -message Response { - - message OneBest { - string sentence = 1; - repeated OnePiece wordpieces = 2; - } - - message OnePiece { - string word = 1; - int32 start = 2; - int32 end = 3; - } - - enum Status { - ok = 0; - failed = 1; - } - - enum Type { - server_ready = 0; - partial_result = 1; - final_result = 2; - speech_end = 3; - } - - Status status = 1; - Type type = 2; - repeated OneBest nbest = 3; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/CMakeLists.txt deleted file mode 100644 index b072309e44b90dcee44ea31e9bcbc1741e73f151..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/CMakeLists.txt +++ /dev/null @@ -1,54 +0,0 @@ -cmake_minimum_required(VERSION 3.10 FATAL_ERROR) - -project(kaldi) - -# include_directories() is called in the root CMakeLists.txt - -add_library(kaldi-util - base/kaldi-error.cc - base/kaldi-math.cc - util/kaldi-io.cc - util/parse-options.cc - util/simple-io-funcs.cc - util/text-utils.cc -) -target_link_libraries(kaldi-util PUBLIC utils) - -add_library(kaldi-decoder - lat/determinize-lattice-pruned.cc - lat/lattice-functions.cc - decoder/lattice-faster-decoder.cc - decoder/lattice-faster-online-decoder.cc -) -target_link_libraries(kaldi-decoder PUBLIC kaldi-util) - -if(GRAPH_TOOLS) - # Arpa binary - add_executable(arpa2fst - lm/arpa-file-parser.cc - lm/arpa-lm-compiler.cc - lmbin/arpa2fst.cc - ) - target_link_libraries(arpa2fst PUBLIC kaldi-util) - - # FST tools binary - set(FST_BINS - fstaddselfloops - fstdeterminizestar - fstisstochastic - fstminimizeencoded - fsttablecompose - ) - - if(NOT MSVC) - # dl is for dynamic linking, otherwise there is a linking error on linux - link_libraries(dl) - endif() - foreach(name IN LISTS FST_BINS) - add_executable(${name} - fstbin/${name}.cc - fstext/kaldi-fst-io.cc - ) - target_link_libraries(${name} PUBLIC kaldi-util) - endforeach() -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/README.md b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/README.md deleted file mode 100644 index 4eb9c9173b747686f00b658afc5e1e0dfdc17e68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/README.md +++ /dev/null @@ -1,21 +0,0 @@ -We use Kaldi decoder to implement TLG based language model integration, -so we copied related files to this directory. -The main changes are: - -1. To minimize the change, we use the same directories tree as Kaldi. - -2. We replace Kaldi log system with glog in the following way. - -``` c++ -#define KALDI_WARN \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_WARNING).stream() -#define KALDI_ERR \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_ERROR).stream() -#define KALDI_INFO \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_INFO).stream() -#define KALDI_VLOG(v) VLOG(v) - -#define KALDI_ASSERT(condition) CHECK(condition) -``` - -3. 
We lint all the files to satisfy the lint in WeNet. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/io-funcs-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/io-funcs-inl.h deleted file mode 100644 index 9397400833676b323492321183c989cec2f41c3f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/io-funcs-inl.h +++ /dev/null @@ -1,329 +0,0 @@ -// base/io-funcs-inl.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University; -// Jan Silovsky; Yanmin Qian; -// Johns Hopkins University (Author: Daniel Povey) -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_IO_FUNCS_INL_H_ -#define KALDI_BASE_IO_FUNCS_INL_H_ 1 - -// Do not include this file directly. It is included by base/io-funcs.h - -#include -#include -#include - -namespace kaldi { - -// Template that covers integers. -template -void WriteBasicType(std::ostream &os, bool binary, T t) { - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - char len_c = (std::numeric_limits::is_signed ? 1 : -1) * - static_cast(sizeof(t)); - os.put(len_c); - os.write(reinterpret_cast(&t), sizeof(t)); - } else { - if (sizeof(t) == 1) - os << static_cast(t) << " "; - else - os << t << " "; - } - if (os.fail()) { - KALDI_ERR << "Write failure in WriteBasicType."; - } -} - -// Template that covers integers. -template -inline void ReadBasicType(std::istream &is, bool binary, T *t) { - KALDI_PARANOID_ASSERT(t != NULL); - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - int len_c_in = is.get(); - if (len_c_in == -1) - KALDI_ERR << "ReadBasicType: encountered end of stream."; - char len_c = static_cast(len_c_in), - len_c_expected = (std::numeric_limits::is_signed ? 1 : -1) * - static_cast(sizeof(*t)); - if (len_c != len_c_expected) { - KALDI_ERR << "ReadBasicType: did not get expected integer type, " - << static_cast(len_c) << " vs. " - << static_cast(len_c_expected) - << ". You can change this code to successfully" - << " read it later, if needed."; - // insert code here to read "wrong" type. Might have a switch statement. - } - is.read(reinterpret_cast(t), sizeof(*t)); - } else { - if (sizeof(*t) == 1) { - int16 i; - is >> i; - *t = i; - } else { - is >> *t; - } - } - if (is.fail()) { - KALDI_ERR << "Read failure in ReadBasicType, file position is " - << is.tellg() << ", next char is " << is.peek(); - } -} - -// Template that covers integers. -template -inline void WriteIntegerPairVector(std::ostream &os, bool binary, - const std::vector > &v) { - // Compile time assertion that this is not called with a wrong type. 
- KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - char sz = sizeof(T); // this is currently just a check. - os.write(&sz, 1); - int32 vecsz = static_cast(v.size()); - KALDI_ASSERT((size_t)vecsz == v.size()); - os.write(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (vecsz != 0) { - os.write(reinterpret_cast(&(v[0])), sizeof(T) * vecsz * 2); - } - } else { - // focus here is on prettiness of text form rather than - // efficiency of reading-in. - // reading-in is dominated by low-level operations anyway: - // for efficiency use binary. - os << "[ "; - typename std::vector >::const_iterator iter = v.begin(), - end = v.end(); - for (; iter != end; ++iter) { - if (sizeof(T) == 1) - os << static_cast(iter->first) << ',' - << static_cast(iter->second) << ' '; - else - os << iter->first << ',' << iter->second << ' '; - } - os << "]\n"; - } - if (os.fail()) { - KALDI_ERR << "Write failure in WriteIntegerPairVector."; - } -} - -// Template that covers integers. -template -inline void ReadIntegerPairVector(std::istream &is, bool binary, - std::vector > *v) { - KALDI_ASSERT_IS_INTEGER_TYPE(T); - KALDI_ASSERT(v != NULL); - if (binary) { - int sz = is.peek(); - if (sz == sizeof(T)) { - is.get(); - } else { // this is currently just a check. - KALDI_ERR << "ReadIntegerPairVector: expected to see type of size " - << sizeof(T) << ", saw instead " << sz << ", at file position " - << is.tellg(); - } - int32 vecsz; - is.read(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (is.fail() || vecsz < 0) goto bad; - v->resize(vecsz); - if (vecsz > 0) { - is.read(reinterpret_cast(&((*v)[0])), sizeof(T) * vecsz * 2); - } - } else { - std::vector > tmp_v; // use temporary so v doesn't use - // extra memory due to resizing. - is >> std::ws; - if (is.peek() != static_cast('[')) { - KALDI_ERR << "ReadIntegerPairVector: expected to see [, saw " << is.peek() - << ", at file position " << is.tellg(); - } - is.get(); // consume the '['. - is >> std::ws; // consume whitespace. - while (is.peek() != static_cast(']')) { - if (sizeof(T) == 1) { // read/write chars as numbers. - int16 next_t1, next_t2; - is >> next_t1; - if (is.fail()) goto bad; - if (is.peek() != static_cast(',')) - KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " - << is.peek() << ", at file position " << is.tellg(); - is.get(); // consume the ','. - is >> next_t2 >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back(std::make_pair((T)next_t1, (T)next_t2)); - } else { - T next_t1, next_t2; - is >> next_t1; - if (is.fail()) goto bad; - if (is.peek() != static_cast(',')) - KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " - << is.peek() << ", at file position " << is.tellg(); - is.get(); // consume the ','. - is >> next_t2 >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back(std::pair(next_t1, next_t2)); - } - } - is.get(); // get the final ']'. - *v = tmp_v; // could use std::swap to use less temporary memory, but this - // uses less permanent memory. - } - if (!is.fail()) return; -bad: - KALDI_ERR << "ReadIntegerPairVector: read failure at file position " - << is.tellg(); -} - -template -inline void WriteIntegerVector(std::ostream &os, bool binary, - const std::vector &v) { - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - char sz = sizeof(T); // this is currently just a check. 
- os.write(&sz, 1); - int32 vecsz = static_cast(v.size()); - KALDI_ASSERT((size_t)vecsz == v.size()); - os.write(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (vecsz != 0) { - os.write(reinterpret_cast(&(v[0])), sizeof(T) * vecsz); - } - } else { - // focus here is on prettiness of text form rather than - // efficiency of reading-in. - // reading-in is dominated by low-level operations anyway: - // for efficiency use binary. - os << "[ "; - typename std::vector::const_iterator iter = v.begin(), end = v.end(); - for (; iter != end; ++iter) { - if (sizeof(T) == 1) - os << static_cast(*iter) << " "; - else - os << *iter << " "; - } - os << "]\n"; - } - if (os.fail()) { - KALDI_ERR << "Write failure in WriteIntegerVector."; - } -} - -template -inline void ReadIntegerVector(std::istream &is, bool binary, - std::vector *v) { - KALDI_ASSERT_IS_INTEGER_TYPE(T); - KALDI_ASSERT(v != NULL); - if (binary) { - int sz = is.peek(); - if (sz == sizeof(T)) { - is.get(); - } else { // this is currently just a check. - KALDI_ERR << "ReadIntegerVector: expected to see type of size " - << sizeof(T) << ", saw instead " << sz << ", at file position " - << is.tellg(); - } - int32 vecsz; - is.read(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (is.fail() || vecsz < 0) goto bad; - v->resize(vecsz); - if (vecsz > 0) { - is.read(reinterpret_cast(&((*v)[0])), sizeof(T) * vecsz); - } - } else { - std::vector tmp_v; // use temporary so v doesn't use extra memory - // due to resizing. - is >> std::ws; - if (is.peek() != static_cast('[')) { - KALDI_ERR << "ReadIntegerVector: expected to see [, saw " << is.peek() - << ", at file position " << is.tellg(); - } - is.get(); // consume the '['. - is >> std::ws; // consume whitespace. - while (is.peek() != static_cast(']')) { - if (sizeof(T) == 1) { // read/write chars as numbers. - int16 next_t; - is >> next_t >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back((T)next_t); - } else { - T next_t; - is >> next_t >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back(next_t); - } - } - is.get(); // get the final ']'. - *v = tmp_v; // could use std::swap to use less temporary memory, but this - // uses less permanent memory. - } - if (!is.fail()) return; -bad: - KALDI_ERR << "ReadIntegerVector: read failure at file position " - << is.tellg(); -} - -// Initialize an opened stream for writing by writing an optional binary -// header and modifying the floating-point precision. -inline void InitKaldiOutputStream(std::ostream &os, bool binary) { - // This does not throw exceptions (does not check for errors). - if (binary) { - os.put('\0'); - os.put('B'); - } - // Note, in non-binary mode we may at some point want to mess with - // the precision a bit. - // 7 is a bit more than the precision of float.. - if (os.precision() < 7) os.precision(7); -} - -/// Initialize an opened stream for reading by detecting the binary header and -// setting the "binary" value appropriately. -inline bool InitKaldiInputStream(std::istream &is, bool *binary) { - // Sets the 'binary' variable. - // Throws exception in the very unusual situation that stream - // starts with '\0' but not then 'B'. - - if (is.peek() == '\0') { // seems to be binary - is.get(); - if (is.peek() != 'B') { - return false; - } - is.get(); - *binary = true; - return true; - } else { - *binary = false; - return true; - } -} - -} // end namespace kaldi. 
- -#endif // KALDI_BASE_IO_FUNCS_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/io-funcs.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/io-funcs.cc deleted file mode 100644 index bd6c350780d1096ff8c452fd00864aa07a30ac65..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/io-funcs.cc +++ /dev/null @@ -1,215 +0,0 @@ -// base/io-funcs.cc - -// Copyright 2009-2011 Microsoft Corporation; Saarland University - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/io-funcs.h" -#include "base/kaldi-math.h" - -namespace kaldi { - -template <> -void WriteBasicType(std::ostream &os, bool binary, bool b) { - os << (b ? "T" : "F"); - if (!binary) os << " "; - if (os.fail()) KALDI_ERR << "Write failure in WriteBasicType"; -} - -template <> -void ReadBasicType(std::istream &is, bool binary, bool *b) { - KALDI_PARANOID_ASSERT(b != NULL); - if (!binary) is >> std::ws; // eat up whitespace. - char c = is.peek(); - if (c == 'T') { - *b = true; - is.get(); - } else if (c == 'F') { - *b = false; - is.get(); - } else { - KALDI_ERR << "Read failure in ReadBasicType, file position is " - << is.tellg() << ", next char is " << CharToString(c); - } -} - -template <> -void WriteBasicType(std::ostream &os, bool binary, float f) { - if (binary) { - char c = sizeof(f); - os.put(c); - os.write(reinterpret_cast(&f), sizeof(f)); - } else { - os << f << " "; - } -} - -template <> -void WriteBasicType(std::ostream &os, bool binary, double f) { - if (binary) { - char c = sizeof(f); - os.put(c); - os.write(reinterpret_cast(&f), sizeof(f)); - } else { - os << f << " "; - } -} - -template <> -void ReadBasicType(std::istream &is, bool binary, float *f) { - KALDI_PARANOID_ASSERT(f != NULL); - if (binary) { - double d; - int c = is.peek(); - if (c == sizeof(*f)) { - is.get(); - is.read(reinterpret_cast(f), sizeof(*f)); - } else if (c == sizeof(d)) { - ReadBasicType(is, binary, &d); - *f = d; - } else { - KALDI_ERR << "ReadBasicType: expected float, saw " << is.peek() - << ", at file position " << is.tellg(); - } - } else { - is >> *f; - } - if (is.fail()) { - KALDI_ERR << "ReadBasicType: failed to read, at file position " - << is.tellg(); - } -} - -template <> -void ReadBasicType(std::istream &is, bool binary, double *d) { - KALDI_PARANOID_ASSERT(d != NULL); - if (binary) { - float f; - int c = is.peek(); - if (c == sizeof(*d)) { - is.get(); - is.read(reinterpret_cast(d), sizeof(*d)); - } else if (c == sizeof(f)) { - ReadBasicType(is, binary, &f); - *d = f; - } else { - KALDI_ERR << "ReadBasicType: expected float, saw " << is.peek() - << ", at file position " << is.tellg(); - } - } else { - is >> *d; - } - if (is.fail()) { - KALDI_ERR << "ReadBasicType: failed to read, at file position " - << is.tellg(); - } -} - -void 
CheckToken(const char *token) { - if (*token == '\0') KALDI_ERR << "Token is empty (not a valid token)"; - const char *orig_token = token; - while (*token != '\0') { - if (::isspace(*token)) - KALDI_ERR << "Token is not a valid token (contains space): '" - << orig_token << "'"; - token++; - } -} - -void WriteToken(std::ostream &os, bool binary, const char *token) { - // binary mode is ignored; - // we use space as termination character in either case. - KALDI_ASSERT(token != NULL); - CheckToken(token); // make sure it's valid (can be read back) - os << token << " "; - if (os.fail()) { - KALDI_ERR << "Write failure in WriteToken."; - } -} - -int Peek(std::istream &is, bool binary) { - if (!binary) is >> std::ws; // eat up whitespace. - return is.peek(); -} - -void WriteToken(std::ostream &os, bool binary, const std::string &token) { - WriteToken(os, binary, token.c_str()); -} - -void ReadToken(std::istream &is, bool binary, std::string *str) { - KALDI_ASSERT(str != NULL); - if (!binary) is >> std::ws; // consume whitespace. - is >> *str; - if (is.fail()) { - KALDI_ERR << "ReadToken, failed to read token at file position " - << is.tellg(); - } - if (!isspace(is.peek())) { - KALDI_ERR << "ReadToken, expected space after token, saw instead " - << CharToString(static_cast(is.peek())) - << ", at file position " << is.tellg(); - } - is.get(); // consume the space. -} - -int PeekToken(std::istream &is, bool binary) { - if (!binary) is >> std::ws; // consume whitespace. - bool read_bracket; - if (static_cast(is.peek()) == '<') { - read_bracket = true; - is.get(); - } else { - read_bracket = false; - } - int ans = is.peek(); - if (read_bracket) { - if (!is.unget()) { - // Clear the bad bit. This code can be (and is in fact) reached, since the - // C++ standard does not guarantee that a call to unget() must succeed. - is.clear(); - } - } - return ans; -} - -void ExpectToken(std::istream &is, bool binary, const char *token) { - int pos_at_start = is.tellg(); - KALDI_ASSERT(token != NULL); - CheckToken(token); // make sure it's valid (can be read back) - if (!binary) is >> std::ws; // consume whitespace. - std::string str; - is >> str; - is.get(); // consume the space. - if (is.fail()) { - KALDI_ERR << "Failed to read token [started at file position " - << pos_at_start << "], expected " << token; - } - // The second half of the '&&' expression below is so that if we're expecting - // "", we will accept "Foo>" instead. This is so that the model-reading - // code will tolerate errors in PeekToken where is.unget() failed; search for - // is.clear() in PeekToken() for an explanation. 
- if (strcmp(str.c_str(), token) != 0 && - !(token[0] == '<' && strcmp(str.c_str(), token + 1) == 0)) { - KALDI_ERR << "Expected token \"" << token << "\", got instead \"" << str - << "\"."; - } -} - -void ExpectToken(std::istream &is, bool binary, const std::string &token) { - ExpectToken(is, binary, token.c_str()); -} - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/io-funcs.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/io-funcs.h deleted file mode 100644 index 06ad1e3d2d8dc8385886a7c6653f620642c7c05a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/io-funcs.h +++ /dev/null @@ -1,246 +0,0 @@ -// base/io-funcs.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University; -// Jan Silovsky; Yanmin Qian -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_IO_FUNCS_H_ -#define KALDI_BASE_IO_FUNCS_H_ - -// This header only contains some relatively low-level I/O functions. -// The full Kaldi I/O declarations are in ../util/kaldi-io.h -// and ../util/kaldi-table.h -// They were put in util/ in order to avoid making the Matrix library -// dependent on them. - -#include -#include -#include -#include - -#include "base/io-funcs-inl.h" -#include "base/kaldi-common.h" - -namespace kaldi { - -/* - This comment describes the Kaldi approach to I/O. All objects can be written - and read in two modes: binary and text. In addition we want to make the I/O - work if we redefine the typedef "BaseFloat" between floats and doubles. - We also want to have control over whitespace in text mode without affecting - the meaning of the file, for pretty-printing purposes. - - Errors are handled by throwing a KaldiFatalError exception. - - For integer and floating-point types (and boolean values): - - WriteBasicType(std::ostream &, bool binary, const T&); - ReadBasicType(std::istream &, bool binary, T*); - - and we expect these functions to be defined in such a way that they work when - the type T changes between float and double, so you can read float into double - and vice versa]. Note that for efficiency and space-saving reasons, the - Vector and Matrix classes do not use these functions [but they preserve the - type interchangeability in their own way] - - For a class (or struct) C: - class C { - .. - Write(std::ostream &, bool binary, [possibly extra optional args for - specific classes]) const; Read(std::istream &, bool binary, [possibly extra - optional args for specific classes]); - .. - } - NOTE: The only actual optional args we used are the "add" arguments in - Vector/Matrix classes, which specify whether we should sum the data already - in the class with the data being read. 
- - For types which are typedef's involving stl classes, I/O is as follows: - typedef std::vector > MyTypedefName; - - The user should define something like: - - WriteMyTypedefName(std::ostream &, bool binary, const MyTypedefName &t); - ReadMyTypedefName(std::ostream &, bool binary, MyTypedefName *t); - - The user would have to write these functions. - - For a type std::vector: - - void WriteIntegerVector(std::ostream &os, bool binary, const std::vector - &v); void ReadIntegerVector(std::istream &is, bool binary, std::vector *v); - - For other types, e.g. vectors of pairs, the user should create a routine of - the type WriteMyTypedefName. This is to avoid introducing confusing templated - functions; we could easily create templated functions to handle most of these - cases but they would have to share the same name. - - It also often happens that the user needs to write/read special tokens as part - of a file. These might be class headers, or separators/identifiers in the - class. We provide special functions for manipulating these. These special - tokens must be nonempty and must not contain any whitespace. - - void WriteToken(std::ostream &os, bool binary, const char*); - void WriteToken(std::ostream &os, bool binary, const std::string & token); - int Peek(std::istream &is, bool binary); - void ReadToken(std::istream &is, bool binary, std::string *str); - void PeekToken(std::istream &is, bool binary, std::string *str); - - WriteToken writes the token and one space (whether in binary or text mode). - - Peek returns the first character of the next token, by consuming whitespace - (in text mode) and then returning the peek() character. It returns -1 at EOF; - it doesn't throw. It's useful if a class can have various forms based on - typedefs and virtual classes, and wants to know which version to read. - - ReadToken allows the caller to obtain the next token. PeekToken works just - like ReadToken, but seeks back to the beginning of the token. A subsequent - call to ReadToken will read the same token again. This is useful when - different object types are written to the same file; using PeekToken one can - decide which of the objects to read. - - There is currently no special functionality for writing/reading strings (where - the strings contain data rather than "special tokens" that are whitespace-free - and nonempty). This is because Kaldi is structured in such a way that strings - don't appear, except as OpenFst symbol table entries (and these have their own - format). - - - NOTE: you should not call ReadIntegerType and WriteIntegerType with types, - such as int and size_t, that are machine-independent -- at least not - if you want your file formats to port between machines. Use int32 and - int64 where necessary. There is no way to detect this using compile-time - assertions because C++ only keeps track of the internal representation of - the type. -*/ - -/// \addtogroup io_funcs_basic -/// @{ - -/// WriteBasicType is the name of the write function for bool, integer types, -/// and floating-point types. They all throw on error. -template -void WriteBasicType(std::ostream &os, bool binary, T t); - -/// ReadBasicType is the name of the read function for bool, integer types, -/// and floating-point types. They all throw on error. -template -void ReadBasicType(std::istream &is, bool binary, T *t); - -// Declare specialization for bool. 
-template <> -void WriteBasicType(std::ostream &os, bool binary, bool b); - -template <> -void ReadBasicType(std::istream &is, bool binary, bool *b); - -// Declare specializations for float and double. -template <> -void WriteBasicType(std::ostream &os, bool binary, float f); - -template <> -void WriteBasicType(std::ostream &os, bool binary, double f); - -template <> -void ReadBasicType(std::istream &is, bool binary, float *f); - -template <> -void ReadBasicType(std::istream &is, bool binary, double *f); - -// Define ReadBasicType that accepts an "add" parameter to add to -// the destination. Caution: if used in Read functions, be careful -// to initialize the parameters concerned to zero in the default -// constructor. -template -inline void ReadBasicType(std::istream &is, bool binary, T *t, bool add) { - if (!add) { - ReadBasicType(is, binary, t); - } else { - T tmp = T(0); - ReadBasicType(is, binary, &tmp); - *t += tmp; - } -} - -/// Function for writing STL vectors of integer types. -template -inline void WriteIntegerVector(std::ostream &os, bool binary, - const std::vector &v); - -/// Function for reading STL vector of integer types. -template -inline void ReadIntegerVector(std::istream &is, bool binary, std::vector *v); - -/// Function for writing STL vectors of pairs of integer types. -template -inline void WriteIntegerPairVector(std::ostream &os, bool binary, - const std::vector > &v); - -/// Function for reading STL vector of pairs of integer types. -template -inline void ReadIntegerPairVector(std::istream &is, bool binary, - std::vector > *v); - -/// The WriteToken functions are for writing nonempty sequences of non-space -/// characters. They are not for general strings. -void WriteToken(std::ostream &os, bool binary, const char *token); -void WriteToken(std::ostream &os, bool binary, const std::string &token); - -/// Peek consumes whitespace (if binary == false) and then returns the peek() -/// value of the stream. -int Peek(std::istream &is, bool binary); - -/// ReadToken gets the next token and puts it in str (exception on failure). If -/// PeekToken() had been previously called, it is possible that the stream had -/// failed to unget the starting '<' character. In this case ReadToken() returns -/// the token string without the leading '<'. You must be prepared to handle -/// this case. ExpectToken() handles this internally, and is not affected. -void ReadToken(std::istream &is, bool binary, std::string *token); - -/// PeekToken will return the first character of the next token, or -1 if end of -/// file. It's the same as Peek(), except if the first character is '<' it will -/// skip over it and will return the next character. It will attempt to unget -/// the '<' so the stream is where it was before you did PeekToken(), however, -/// this is not guaranteed (see ReadToken()). -int PeekToken(std::istream &is, bool binary); - -/// ExpectToken tries to read in the given token, and throws an exception -/// on failure. -void ExpectToken(std::istream &is, bool binary, const char *token); -void ExpectToken(std::istream &is, bool binary, const std::string &token); - -/// ExpectPretty attempts to read the text in "token", but only in non-binary -/// mode. Throws exception on failure. It expects an exact match except that -/// arbitrary whitespace matches arbitrary whitespace. 
-void ExpectPretty(std::istream &is, bool binary, const char *token); -void ExpectPretty(std::istream &is, bool binary, const std::string &token); - -/// @} end "addtogroup io_funcs_basic" - -/// InitKaldiOutputStream initializes an opened stream for writing by writing an -/// optional binary header and modifying the floating-point precision; it will -/// typically not be called by users directly. -inline void InitKaldiOutputStream(std::ostream &os, bool binary); - -/// InitKaldiInputStream initializes an opened stream for reading by detecting -/// the binary header and setting the "binary" value appropriately; -/// It will typically not be called by users directly. -inline bool InitKaldiInputStream(std::istream &is, bool *binary); - -} // end namespace kaldi. -#endif // KALDI_BASE_IO_FUNCS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/kaldi-common.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/kaldi-common.h deleted file mode 100644 index eee5f34d7234e7c029e6bb59584d3ee65ff5a875..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/kaldi-common.h +++ /dev/null @@ -1,41 +0,0 @@ -// base/kaldi-common.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_KALDI_COMMON_H_ -#define KALDI_BASE_KALDI_COMMON_H_ 1 - -#include -#include -#include // C string stuff like strcpy -#include -#include -#include -#include -#include -#include -#include - -#include "base/kaldi-utils.h" -#include "base/kaldi-error.h" -#include "base/kaldi-types.h" -// #include "base/io-funcs.h" -#include "base/kaldi-math.h" -// #include "base/timer.h" - -#endif // KALDI_BASE_KALDI_COMMON_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/kaldi-error.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/kaldi-error.cc deleted file mode 100644 index 77edc6af6e56bb8fa3431d519e58fda9ee0bac6a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/kaldi-error.cc +++ /dev/null @@ -1,42 +0,0 @@ -// base/kaldi-error.cc - -// Copyright 2019 LAIX (Yi Sun) -// Copyright 2019 SmartAction LLC (kkm) -// Copyright 2016 Brno University of Technology (author: Karel Vesely) -// Copyright 2009-2011 Microsoft Corporation; Lukas Burget; Ondrej Glembek - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-error.h" - -#include - -namespace kaldi { - -/***** GLOBAL VARIABLES FOR LOGGING *****/ - -int32 g_kaldi_verbose_level = 0; -static std::string program_name; // NOLINT - -void SetProgramName(const char *basename) { - // Using the 'static std::string' for the program name is mostly harmless, - // because (a) Kaldi logging is undefined before main(), and (b) no stdc++ - // string implementation has been found in the wild that would not be just - // an empty string when zero-initialized but not yet constructed. - program_name = basename; -} - -} // namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/kaldi-error.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/kaldi-error.h deleted file mode 100644 index 0f65db372b5f05a8017433eed7c95badc819a0a6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/kaldi-error.h +++ /dev/null @@ -1,57 +0,0 @@ -// base/kaldi-error.h - -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_KALDI_ERROR_H_ -#define KALDI_BASE_KALDI_ERROR_H_ 1 - -#include "utils/log.h" - -namespace kaldi { - -#define KALDI_WARN \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_WARNING).stream() -#define KALDI_ERR \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_ERROR).stream() -#define KALDI_LOG \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_INFO).stream() -#define KALDI_VLOG(v) VLOG(v) - -#define KALDI_ASSERT(condition) CHECK(condition) - - -/***** PROGRAM NAME AND VERBOSITY LEVEL *****/ - -/// Called by ParseOptions to set base name (no directory) of the executing -/// program. The name is printed in logging code along with every message, -/// because in our scripts, we often mix together the stderr of many programs. -/// This function is very thread-unsafe. -void SetProgramName(const char *basename); - -/// This is set by util/parse-options.{h,cc} if you set --verbose=? option. -/// Do not use directly, prefer {Get,Set}VerboseLevel(). -extern int32 g_kaldi_verbose_level; - -/// Get verbosity level, usually set via command line '--verbose=' switch. -inline int32 GetVerboseLevel() { return g_kaldi_verbose_level; } - -/// This should be rarely used, except by programs using Kaldi as library; -/// command-line programs set the verbose level automatically from ParseOptions. 
-inline void SetVerboseLevel(int32 i) { g_kaldi_verbose_level = i; } - -} // namespace kaldi - -#endif // KALDI_BASE_KALDI_ERROR_H_ - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/kaldi-math.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/kaldi-math.cc deleted file mode 100644 index 175d9f49b6c5216645e90e146f4e2eab5572c342..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/kaldi-math.cc +++ /dev/null @@ -1,164 +0,0 @@ -// base/kaldi-math.cc - -// Copyright 2009-2011 Microsoft Corporation; Yanmin Qian; -// Saarland University; Jan Silovsky - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-math.h" -#ifndef _MSC_VER -#include -#include -#endif -#include -#include - -namespace kaldi { -// These routines are tested in matrix/matrix-test.cc - -int32 RoundUpToNearestPowerOfTwo(int32 n) { - KALDI_ASSERT(n > 0); - n--; - n |= n >> 1; - n |= n >> 2; - n |= n >> 4; - n |= n >> 8; - n |= n >> 16; - return n+1; -} - -static std::mutex _RandMutex; - -int Rand(struct RandomState* state) { -#if !defined(_POSIX_THREAD_SAFE_FUNCTIONS) - // On Windows and Cygwin, just call Rand() - return rand(); -#else - if (state) { - return rand_r(&(state->seed)); - } else { - std::lock_guard lock(_RandMutex); - return rand(); - } -#endif -} - -RandomState::RandomState() { - // we initialize it as Rand() + 27437 instead of just Rand(), because on some - // systems, e.g. at the very least Mac OSX Yosemite and later, it seems to be - // the case that rand_r when initialized with rand() will give you the exact - // same sequence of numbers that rand() will give if you keep calling rand() - // after that initial call. This can cause problems with repeated sequences. - // For example if you initialize two RandomState structs one after the other - // without calling rand() in between, they would give you the same sequence - // offset by one (if we didn't have the "+ 27437" in the code). 27437 is just - // a randomly chosen prime number. - seed = unsigned(Rand()) + 27437; -} - -bool WithProb(BaseFloat prob, struct RandomState* state) { - KALDI_ASSERT(prob >= 0 && prob <= 1.1); // prob should be <= 1.0, - // but we allow slightly larger values that could arise from roundoff in - // previous calculations. - KALDI_COMPILE_TIME_ASSERT(RAND_MAX > 128 * 128); - if (prob == 0) { - return false; - } else if (prob == 1.0) { - return true; - } else if (prob * RAND_MAX < 128.0) { - // prob is very small but nonzero, and the "main algorithm" - // wouldn't work that well. So: with probability 1/128, we - // return WithProb (prob * 128), else return false. - if (Rand(state) < RAND_MAX / 128) { // with probability 128... - // Note: we know that prob * 128.0 < 1.0, because - // we asserted RAND_MAX > 128 * 128. 
- return WithProb(prob * 128.0); - } else { - return false; - } - } else { - return (Rand(state) < ((RAND_MAX + static_cast(1.0)) * prob)); - } -} - -int32 RandInt(int32 min_val, int32 max_val, struct RandomState* state) { - // This is not exact. - KALDI_ASSERT(max_val >= min_val); - if (max_val == min_val) return min_val; - -#ifdef _MSC_VER - // RAND_MAX is quite small on Windows -> may need to handle larger numbers. - if (RAND_MAX > (max_val-min_val)*8) { - // *8 to avoid large inaccuracies in probability, from the modulus... - return min_val + - ((unsigned int)Rand(state) % (unsigned int)(max_val+1-min_val)); - } else { - if ((unsigned int)(RAND_MAX*RAND_MAX) > - (unsigned int)((max_val+1-min_val)*8)) { - // *8 to avoid inaccuracies in probability, from the modulus... - return min_val + ( (unsigned int)( (Rand(state)+RAND_MAX*Rand(state))) - % (unsigned int)(max_val+1-min_val)); - } else { - KALDI_ERR << "rand_int failed because we do not support such large " - "random numbers. (Extend this function)."; - } - } -#else - return min_val + - (static_cast(Rand(state)) % static_cast(max_val+1-min_val)); -#endif -} - -// Returns poisson-distributed random number. -// Take care: this takes time proportional -// to lambda. Faster algorithms exist but are more complex. -int32 RandPoisson(float lambda, struct RandomState* state) { - // Knuth's algorithm. - KALDI_ASSERT(lambda >= 0); - float L = expf(-lambda), p = 1.0; - int32 k = 0; - do { - k++; - float u = RandUniform(state); - p *= u; - } while (p > L); - return k-1; -} - -void RandGauss2(float *a, float *b, RandomState *state) { - KALDI_ASSERT(a); - KALDI_ASSERT(b); - float u1 = RandUniform(state); - float u2 = RandUniform(state); - u1 = sqrtf(-2.0f * logf(u1)); - u2 = 2.0f * M_PI * u2; - *a = u1 * cosf(u2); - *b = u1 * sinf(u2); -} - -void RandGauss2(double *a, double *b, RandomState *state) { - KALDI_ASSERT(a); - KALDI_ASSERT(b); - float a_float, b_float; - // Just because we're using doubles doesn't mean we need super-high-quality - // random numbers, so we just use the floating-point version internally. - RandGauss2(&a_float, &b_float, state); - *a = a_float; - *b = b_float; -} - - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/kaldi-math.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/kaldi-math.h deleted file mode 100644 index 93c265ee96e704893da26b9083a44a9e60c6c192..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/kaldi-math.h +++ /dev/null @@ -1,363 +0,0 @@ -// base/kaldi-math.h - -// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; Yanmin Qian; -// Jan Silovsky; Saarland University -// -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
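As a usage note for the thread-safety scheme described above (a global mutex behind the state-less `Rand()`, `rand_r` when a `RandomState` is supplied), here is a small hypothetical sketch of the intended per-thread pattern; it assumes only the `RandomState` and `RandInt` declarations from `kaldi-math.h`.

```cpp
// Sketch of the per-thread usage pattern the comments above describe
// (illustrative only). Passing a RandomState avoids the global mutex that
// the state-less Rand() call has to take.
#include <thread>
#include <vector>

#include "base/kaldi-math.h"

int main() {
  std::vector<std::thread> workers;
  for (int t = 0; t < 4; ++t) {
    workers.emplace_back([]() {
      kaldi::RandomState state;  // independently seeded (Rand() + 27437)
      int local_sum = 0;
      for (int i = 0; i < 1000; ++i)
        local_sum += kaldi::RandInt(0, 9, &state);  // thread-local, no locking
      (void)local_sum;
    });
  }
  for (auto &w : workers) w.join();
  return 0;
}
```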
- -#ifndef KALDI_BASE_KALDI_MATH_H_ -#define KALDI_BASE_KALDI_MATH_H_ 1 - -#ifdef _MSC_VER -#include -#endif - -#include -#include -#include - -#include "base/kaldi-types.h" -#include "base/kaldi-common.h" - - -#ifndef DBL_EPSILON -#define DBL_EPSILON 2.2204460492503131e-16 -#endif -#ifndef FLT_EPSILON -#define FLT_EPSILON 1.19209290e-7f -#endif - -#ifndef M_PI -#define M_PI 3.1415926535897932384626433832795 -#endif - -#ifndef M_SQRT2 -#define M_SQRT2 1.4142135623730950488016887 -#endif - -#ifndef M_2PI -#define M_2PI 6.283185307179586476925286766559005 -#endif - -#ifndef M_SQRT1_2 -#define M_SQRT1_2 0.7071067811865475244008443621048490 -#endif - -#ifndef M_LOG_2PI -#define M_LOG_2PI 1.8378770664093454835606594728112 -#endif - -#ifndef M_LN2 -#define M_LN2 0.693147180559945309417232121458 -#endif - -#ifndef M_LN10 -#define M_LN10 2.302585092994045684017991454684 -#endif - - -#define KALDI_ISNAN std::isnan -#define KALDI_ISINF std::isinf -#define KALDI_ISFINITE(x) std::isfinite(x) - -#if !defined(KALDI_SQR) -# define KALDI_SQR(x) ((x) * (x)) -#endif - -namespace kaldi { - -#if !defined(_MSC_VER) || (_MSC_VER >= 1900) -inline double Exp(double x) { return exp(x); } -#ifndef KALDI_NO_EXPF -inline float Exp(float x) { return expf(x); } -#else -inline float Exp(float x) { return exp(static_cast(x)); } -#endif // KALDI_NO_EXPF -#else -inline double Exp(double x) { return exp(x); } -#if !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64) -// Microsoft CL v18.0 buggy 64-bit implementation of -// expf() incorrectly returns -inf for exp(-inf). -inline float Exp(float x) { return exp(static_cast(x)); } -#else -inline float Exp(float x) { return expf(x); } -#endif // !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64) -#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900) - -inline double Log(double x) { return log(x); } -inline float Log(float x) { return logf(x); } - -#if !defined(_MSC_VER) || (_MSC_VER >= 1700) -inline double Log1p(double x) { return log1p(x); } -inline float Log1p(float x) { return log1pf(x); } -#else -inline double Log1p(double x) { - const double cutoff = 1.0e-08; - if (x < cutoff) - return x - 0.5 * x * x; - else - return Log(1.0 + x); -} - -inline float Log1p(float x) { - const float cutoff = 1.0e-07; - if (x < cutoff) - return x - 0.5 * x * x; - else - return Log(1.0 + x); -} -#endif - -static const double kMinLogDiffDouble = Log(DBL_EPSILON); // negative! -static const float kMinLogDiffFloat = Log(FLT_EPSILON); // negative! - -// -infinity -const float kLogZeroFloat = -std::numeric_limits::infinity(); -const double kLogZeroDouble = -std::numeric_limits::infinity(); -const BaseFloat kLogZeroBaseFloat = -std::numeric_limits::infinity(); - -// Returns a random integer between 0 and RAND_MAX, inclusive -int Rand(struct RandomState* state = NULL); - -// State for thread-safe random number generator -struct RandomState { - RandomState(); - unsigned seed; -}; - -// Returns a random integer between first and last inclusive. -int32 RandInt(int32 first, int32 last, struct RandomState* state = NULL); - -// Returns true with probability "prob", -bool WithProb(BaseFloat prob, struct RandomState* state = NULL); -// with 0 <= prob <= 1 [we check this]. -// Internally calls Rand(). This function is carefully implemented so -// that it should work even if prob is very small. - -/// Returns a random number strictly between 0 and 1. 
-inline float RandUniform(struct RandomState* state = NULL) { - return static_cast((Rand(state) + 1.0) / (RAND_MAX+2.0)); -} - -inline float RandGauss(struct RandomState* state = NULL) { - return static_cast(sqrtf (-2 * Log(RandUniform(state))) - * cosf(2*M_PI*RandUniform(state))); -} - -// Returns poisson-distributed random number. Uses Knuth's algorithm. -// Take care: this takes time proportional -// to lambda. Faster algorithms exist but are more complex. -int32 RandPoisson(float lambda, struct RandomState* state = NULL); - -// Returns a pair of gaussian random numbers. Uses Box-Muller transform -void RandGauss2(float *a, float *b, RandomState *state = NULL); -void RandGauss2(double *a, double *b, RandomState *state = NULL); - -// Also see Vector::RandCategorical(). - -// This is a randomized pruning mechanism that preserves expectations, -// that we typically use to prune posteriors. -template -inline Float RandPrune(Float post, BaseFloat prune_thresh, - struct RandomState* state = NULL) { - KALDI_ASSERT(prune_thresh >= 0.0); - if (post == 0.0 || std::abs(post) >= prune_thresh) - return post; - return (post >= 0 ? 1.0 : -1.0) * - (RandUniform(state) <= fabs(post)/prune_thresh ? prune_thresh : 0.0); -} - -// returns log(exp(x) + exp(y)). -inline double LogAdd(double x, double y) { - double diff; - - if (x < y) { - diff = x - y; - x = y; - } else { - diff = y - x; - } - // diff is negative. x is now the larger one. - - if (diff >= kMinLogDiffDouble) { - double res; - res = x + Log1p(Exp(diff)); - return res; - } else { - return x; // return the larger one. - } -} - - -// returns log(exp(x) + exp(y)). -inline float LogAdd(float x, float y) { - float diff; - - if (x < y) { - diff = x - y; - x = y; - } else { - diff = y - x; - } - // diff is negative. x is now the larger one. - - if (diff >= kMinLogDiffFloat) { - float res; - res = x + Log1p(Exp(diff)); - return res; - } else { - return x; // return the larger one. - } -} - - -// returns log(exp(x) - exp(y)). -inline double LogSub(double x, double y) { - if (y >= x) { // Throws exception if y>=x. - if (y == x) - return kLogZeroDouble; - else - KALDI_ERR << "Cannot subtract a larger from a smaller number."; - } - - double diff = y - x; // Will be negative. - double res = x + Log(1.0 - Exp(diff)); - - // res might be NAN if diff ~0.0, and 1.0-exp(diff) == 0 to machine precision - if (KALDI_ISNAN(res)) - return kLogZeroDouble; - return res; -} - - -// returns log(exp(x) - exp(y)). -inline float LogSub(float x, float y) { - if (y >= x) { // Throws exception if y>=x. - if (y == x) - return kLogZeroDouble; - else - KALDI_ERR << "Cannot subtract a larger from a smaller number."; - } - - float diff = y - x; // Will be negative. - float res = x + Log(1.0f - Exp(diff)); - - // res might be NAN if diff ~0.0, and 1.0-exp(diff) == 0 to machine precision - if (KALDI_ISNAN(res)) - return kLogZeroFloat; - return res; -} - -/// return abs(a - b) <= relative_tolerance * (abs(a)+abs(b)). -static inline bool ApproxEqual(float a, float b, - float relative_tolerance = 0.001) { - // a==b handles infinities. - if (a == b) return true; - float diff = std::abs(a-b); - if (diff == std::numeric_limits::infinity() - || diff != diff) return false; // diff is +inf or nan. - return (diff <= relative_tolerance*(std::abs(a)+std::abs(b))); -} - -/// assert abs(a - b) <= relative_tolerance * (abs(a)+abs(b)) -static inline void AssertEqual(float a, float b, - float relative_tolerance = 0.001) { - // a==b handles infinities. 
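The `LogAdd` overloads above implement the standard log-sum-exp trick. The following stand-alone sketch (independent of these headers, with an invented reference function) restates the identity and shows why it stays finite where the naive formula overflows.

```cpp
// Self-contained sketch of the log-sum-exp identity that LogAdd relies on:
//   log(exp(x) + exp(y)) = x + log1p(exp(y - x))   for x >= y,
// which never exponentiates a large positive number and so avoids overflow.
#include <algorithm>
#include <cmath>
#include <cstdio>

double LogAddRef(double x, double y) {
  double hi = std::max(x, y), lo = std::min(x, y);
  if (std::isinf(hi) && hi < 0) return hi;       // both inputs are log(0)
  return hi + std::log1p(std::exp(lo - hi));     // lo - hi <= 0, exp() is safe
}

int main() {
  // Small values: agrees with the naive formula.
  std::printf("%f vs %f\n", LogAddRef(0.5, 1.5),
              std::log(std::exp(0.5) + std::exp(1.5)));
  // Large values: the naive formula overflows, the stable one does not.
  std::printf("%f\n", LogAddRef(1000.0, 1001.0));  // ~1001.313262
  return 0;
}
```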
- KALDI_ASSERT(ApproxEqual(a, b, relative_tolerance)); -} - - -// RoundUpToNearestPowerOfTwo does the obvious thing. It crashes if n <= 0. -int32 RoundUpToNearestPowerOfTwo(int32 n); - -/// Returns a / b, rounding towards negative infinity in all cases. -static inline int32 DivideRoundingDown(int32 a, int32 b) { - KALDI_ASSERT(b != 0); - if (a * b >= 0) - return a / b; - else if (a < 0) - return (a - b + 1) / b; - else - return (a - b - 1) / b; -} - -template I Gcd(I m, I n) { - if (m == 0 || n == 0) { - if (m == 0 && n == 0) { // gcd not defined, as all integers are divisors. - KALDI_ERR << "Undefined GCD since m = 0, n = 0."; - } - return (m == 0 ? (n > 0 ? n : -n) : ( m > 0 ? m : -m)); - // return absolute value of whichever is nonzero - } - // could use compile-time assertion - // but involves messing with complex template stuff. - KALDI_ASSERT(std::numeric_limits::is_integer); - while (1) { - m %= n; - if (m == 0) return (n > 0 ? n : -n); - n %= m; - if (n == 0) return (m > 0 ? m : -m); - } -} - -/// Returns the least common multiple of two integers. Will -/// crash unless the inputs are positive. -template I Lcm(I m, I n) { - KALDI_ASSERT(m > 0 && n > 0); - I gcd = Gcd(m, n); - return gcd * (m/gcd) * (n/gcd); -} - - -template void Factorize(I m, std::vector *factors) { - // Splits a number into its prime factors, in sorted order from - // least to greatest, with duplication. A very inefficient - // algorithm, which is mainly intended for use in the - // mixed-radix FFT computation (where we assume most factors - // are small). - KALDI_ASSERT(factors != NULL); - KALDI_ASSERT(m >= 1); // Doesn't work for zero or negative numbers. - factors->clear(); - I small_factors[10] = { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29 }; - - // First try small factors. - for (I i = 0; i < 10; i++) { - if (m == 1) return; // We're done. - while (m % small_factors[i] == 0) { - m /= small_factors[i]; - factors->push_back(small_factors[i]); - } - } - // Next try all odd numbers starting from 31. - for (I j = 31;; j += 2) { - if (m == 1) return; - while (m % j == 0) { - m /= j; - factors->push_back(j); - } - } -} - -inline double Hypot(double x, double y) { return hypot(x, y); } -inline float Hypot(float x, float y) { return hypotf(x, y); } - - - - -} // namespace kaldi - - -#endif // KALDI_BASE_KALDI_MATH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/kaldi-types.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/kaldi-types.h deleted file mode 100644 index 7ebf4f85386192a65e176d8f0ecde9bb348af4a0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/kaldi-types.h +++ /dev/null @@ -1,75 +0,0 @@ -// base/kaldi-types.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University; -// Jan Silovsky; Yanmin Qian - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. 
-// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_KALDI_TYPES_H_ -#define KALDI_BASE_KALDI_TYPES_H_ 1 - -namespace kaldi { -// TYPEDEFS .................................................................. -#if (KALDI_DOUBLEPRECISION != 0) -typedef double BaseFloat; -#else -typedef float BaseFloat; -#endif -} - -#ifdef _MSC_VER -#include -#define ssize_t SSIZE_T -#endif - -// we can do this a different way if some platform -// we find in the future lacks stdint.h -#include - -// for discussion on what to do if you need compile kaldi -// without OpenFST, see the bottom of this this file -#include - -namespace kaldi { - using ::int16; - using ::int32; - using ::int64; - using ::uint16; - using ::uint32; - using ::uint64; - typedef float float32; - typedef double double64; -} // end namespace kaldi - -// In a theoretical case you decide compile Kaldi without the OpenFST -// comment the previous namespace statement and uncomment the following -/* -namespace kaldi { - typedef int8_t int8; - typedef int16_t int16; - typedef int32_t int32; - typedef int64_t int64; - - typedef uint8_t uint8; - typedef uint16_t uint16; - typedef uint32_t uint32; - typedef uint64_t uint64; - typedef float float32; - typedef double double64; -} // end namespace kaldi -*/ - -#endif // KALDI_BASE_KALDI_TYPES_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/kaldi-utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/kaldi-utils.h deleted file mode 100644 index bd434d09ed92ec94bc4208f53a4416f941edfdb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/base/kaldi-utils.h +++ /dev/null @@ -1,155 +0,0 @@ -// base/kaldi-utils.h - -// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; -// Saarland University; Karel Vesely; Yanmin Qian - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef KALDI_BASE_KALDI_UTILS_H_ -#define KALDI_BASE_KALDI_UTILS_H_ 1 - -#if defined(_MSC_VER) -# define WIN32_LEAN_AND_MEAN -# define NOMINMAX -# include -#endif - -#ifdef _MSC_VER -#include -#define unlink _unlink -#else -#include -#endif - -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4056 4305 4800 4267 4996 4756 4661) -#if _MSC_VER < 1400 -#define __restrict__ -#else -#define __restrict__ __restrict -#endif -#endif - -#if defined(_MSC_VER) -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (*(pp_orig) = _aligned_malloc(size, align)) -# define KALDI_MEMALIGN_FREE(x) _aligned_free(x) -#elif defined(__CYGWIN__) -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (*(pp_orig) = aligned_alloc(align, size)) -# define KALDI_MEMALIGN_FREE(x) free(x) -#else -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (!posix_memalign(pp_orig, align, size) ? *(pp_orig) : NULL) -# define KALDI_MEMALIGN_FREE(x) free(x) -#endif - -#ifdef __ICC -#pragma warning(disable: 383) // ICPC remark we don't want. -#pragma warning(disable: 810) // ICPC remark we don't want. -#pragma warning(disable: 981) // ICPC remark we don't want. -#pragma warning(disable: 1418) // ICPC remark we don't want. -#pragma warning(disable: 444) // ICPC remark we don't want. -#pragma warning(disable: 869) // ICPC remark we don't want. -#pragma warning(disable: 1287) // ICPC remark we don't want. -#pragma warning(disable: 279) // ICPC remark we don't want. -#pragma warning(disable: 981) // ICPC remark we don't want. -#endif - - -namespace kaldi { - - -// CharToString prints the character in a human-readable form, for debugging. -std::string CharToString(const char &c); - - -inline int MachineIsLittleEndian() { - int check = 1; - return (*reinterpret_cast(&check) != 0); -} - -// This function kaldi::Sleep() provides a portable way -// to sleep for a possibly fractional -// number of seconds. On Windows it's only accurate to microseconds. -void Sleep(float seconds); -} // namespace kaldi - -#define KALDI_SWAP8(a) do { \ - int t = (reinterpret_cast(&a))[0];\ - (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[7];\ - (reinterpret_cast(&a))[7] = t;\ - t = (reinterpret_cast(&a))[1];\ - (reinterpret_cast(&a))[1]=(reinterpret_cast(&a))[6];\ - (reinterpret_cast(&a))[6] = t;\ - t = (reinterpret_cast(&a))[2];\ - (reinterpret_cast(&a))[2]=(reinterpret_cast(&a))[5];\ - (reinterpret_cast(&a))[5] = t;\ - t = (reinterpret_cast(&a))[3];\ - (reinterpret_cast(&a))[3]=(reinterpret_cast(&a))[4];\ - (reinterpret_cast(&a))[4] = t;} while (0) -#define KALDI_SWAP4(a) do { \ - int t = (reinterpret_cast(&a))[0];\ - (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[3];\ - (reinterpret_cast(&a))[3] = t;\ - t = (reinterpret_cast(&a))[1];\ - (reinterpret_cast(&a))[1]=(reinterpret_cast(&a))[2];\ - (reinterpret_cast(&a))[2]=t;} while (0) -#define KALDI_SWAP2(a) do { \ - int t = (reinterpret_cast(&a))[0];\ - (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[1];\ - (reinterpret_cast(&a))[1] = t;} while (0) - - -// Makes copy constructor and operator= private. 
-#define KALDI_DISALLOW_COPY_AND_ASSIGN(type) \ - type(const type&); \ - void operator = (const type&) - -template class KaldiCompileTimeAssert { }; -template<> class KaldiCompileTimeAssert { - public: - static inline void Check() { } -}; - -#define KALDI_COMPILE_TIME_ASSERT(b) KaldiCompileTimeAssert<(b)>::Check() - -#define KALDI_ASSERT_IS_INTEGER_TYPE(I) \ - KaldiCompileTimeAssert::is_specialized \ - && std::numeric_limits::is_integer>::Check() - -#define KALDI_ASSERT_IS_FLOATING_TYPE(F) \ - KaldiCompileTimeAssert::is_specialized \ - && !std::numeric_limits::is_integer>::Check() - -#if defined(_MSC_VER) -#define KALDI_STRCASECMP _stricmp -#elif defined(__CYGWIN__) -#include -#define KALDI_STRCASECMP strcasecmp -#else -#define KALDI_STRCASECMP strcasecmp -#endif -#ifdef _MSC_VER -# define KALDI_STRTOLL(cur_cstr, end_cstr) _strtoi64(cur_cstr, end_cstr, 10); -#else -# define KALDI_STRTOLL(cur_cstr, end_cstr) strtoll(cur_cstr, end_cstr, 10); -#endif - -#endif // KALDI_BASE_KALDI_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/decoder/lattice-faster-decoder.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/decoder/lattice-faster-decoder.cc deleted file mode 100644 index 06f77557fa49a23f6a44d07c327a1b3b081c6dec..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/decoder/lattice-faster-decoder.cc +++ /dev/null @@ -1,1101 +0,0 @@ -// decoder/lattice-faster-decoder.cc - -// Copyright 2009-2012 Microsoft Corporation Mirko Hannemann -// 2013-2018 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2018 Zhehuai Chen -// 2021 Binbin Zhang, Zhendong Peng - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "decoder/lattice-faster-decoder.h" -// #include "lat/lattice-functions.h" - -namespace kaldi { - -// instantiate this class once for each thing you have to decode. -template -LatticeFasterDecoderTpl::LatticeFasterDecoderTpl( - const FST &fst, const LatticeFasterDecoderConfig &config, - const std::shared_ptr &context_graph) - : fst_(&fst), - delete_fst_(false), - config_(config), - num_toks_(0), - context_graph_(context_graph) { - config.Check(); - toks_.SetSize( - 1000); // just so on the first frame we do something reasonable. -} - -template -LatticeFasterDecoderTpl::LatticeFasterDecoderTpl( - const LatticeFasterDecoderConfig &config, FST *fst) - : fst_(fst), delete_fst_(true), config_(config), num_toks_(0) { - config.Check(); - toks_.SetSize( - 1000); // just so on the first frame we do something reasonable. 
-} - -template -LatticeFasterDecoderTpl::~LatticeFasterDecoderTpl() { - DeleteElems(toks_.Clear()); - ClearActiveTokens(); - if (delete_fst_) delete fst_; -} - -template -void LatticeFasterDecoderTpl::InitDecoding() { - // clean up from last time: - DeleteElems(toks_.Clear()); - cost_offsets_.clear(); - ClearActiveTokens(); - warned_ = false; - num_toks_ = 0; - decoding_finalized_ = false; - final_costs_.clear(); - StateId start_state = fst_->Start(); - KALDI_ASSERT(start_state != fst::kNoStateId); - active_toks_.resize(1); - Token *start_tok = new Token(0.0, 0.0, NULL, NULL, NULL); - active_toks_[0].toks = start_tok; - toks_.Insert(start_state, start_tok); - num_toks_++; - ProcessNonemitting(config_.beam); -} - -// Returns true if any kind of traceback is available (not necessarily from -// a final state). It should only very rarely return false; this indicates -// an unusual search error. -template -bool LatticeFasterDecoderTpl::Decode( - DecodableInterface *decodable) { - InitDecoding(); - // We use 1-based indexing for frames in this decoder (if you view it in - // terms of features), but note that the decodable object uses zero-based - // numbering, which we have to correct for when we call it. - AdvanceDecoding(decodable); - FinalizeDecoding(); - - // Returns true if we have any kind of traceback available (not necessarily - // to the end state; query ReachedFinal() for that). - return !active_toks_.empty() && active_toks_.back().toks != NULL; -} - -// Outputs an FST corresponding to the single best path through the lattice. -template -bool LatticeFasterDecoderTpl::GetBestPath( - Lattice *olat, bool use_final_probs) const { - Lattice raw_lat; - GetRawLattice(&raw_lat, use_final_probs); - ShortestPath(raw_lat, olat); - return (olat->NumStates() != 0); -} - -// Outputs an FST corresponding to the raw, state-level lattice -template -bool LatticeFasterDecoderTpl::GetRawLattice( - Lattice *ofst, bool use_final_probs) const { - typedef LatticeArc Arc; - typedef Arc::StateId StateId; - typedef Arc::Weight Weight; - typedef Arc::Label Label; - - // Note: you can't use the old interface (Decode()) if you want to - // get the lattice with use_final_probs = false. You'd have to do - // InitDecoding() and then AdvanceDecoding(). - if (decoding_finalized_ && !use_final_probs) - KALDI_ERR << "You cannot call FinalizeDecoding() and then call " - << "GetRawLattice() with use_final_probs == false"; - - unordered_map final_costs_local; - - const unordered_map &final_costs = - (decoding_finalized_ ? final_costs_ : final_costs_local); - if (!decoding_finalized_ && use_final_probs) - ComputeFinalCosts(&final_costs_local, NULL, NULL); - - ofst->DeleteStates(); - // num-frames plus one (since frames are one-based, and we have - // an extra frame for the start-state). - int32 num_frames = active_toks_.size() - 1; - KALDI_ASSERT(num_frames > 0); - const int32 bucket_count = num_toks_ / 2 + 3; - unordered_map tok_map(bucket_count); - // First create all states. - std::vector token_list; - for (int32 f = 0; f <= num_frames; f++) { - if (active_toks_[f].toks == NULL) { - KALDI_WARN << "GetRawLattice: no tokens active on frame " << f - << ": not producing lattice.\n"; - return false; - } - TopSortTokens(active_toks_[f].toks, &token_list); - for (size_t i = 0; i < token_list.size(); i++) - if (token_list[i] != NULL) tok_map[token_list[i]] = ofst->AddState(); - } - // The next statement sets the start state of the output FST. Because we - // topologically sorted the tokens, state zero must be the start-state. 
- ofst->SetStart(0); - - KALDI_VLOG(4) << "init:" << num_toks_ / 2 + 3 - << " buckets:" << tok_map.bucket_count() - << " load:" << tok_map.load_factor() - << " max:" << tok_map.max_load_factor(); - // Now create all arcs. - for (int32 f = 0; f <= num_frames; f++) { - for (Token *tok = active_toks_[f].toks; tok != NULL; tok = tok->next) { - StateId cur_state = tok_map[tok]; - for (ForwardLinkT *l = tok->links; l != NULL; l = l->next) { - typename unordered_map::const_iterator iter = - tok_map.find(l->next_tok); - StateId nextstate = iter->second; - KALDI_ASSERT(iter != tok_map.end()); - BaseFloat cost_offset = 0.0; - if (l->ilabel != 0) { // emitting.. - KALDI_ASSERT(f >= 0 && f < cost_offsets_.size()); - cost_offset = cost_offsets_[f]; - } - - StateId state = cur_state; - if (l->is_start_boundary) { - StateId tmp = ofst->AddState(); - Arc arc(0, context_graph_->start_tag_id(), Weight(0, 0), tmp); - ofst->AddArc(state, arc); - state = tmp; - } - if (l->is_end_boundary) { - StateId tmp = ofst->AddState(); - Arc arc(0, context_graph_->end_tag_id(), Weight(0, 0), nextstate); - ofst->AddArc(tmp, arc); - nextstate = tmp; - } - - Arc arc(l->ilabel, l->olabel, - Weight(l->graph_cost, l->acoustic_cost - cost_offset), - nextstate); - ofst->AddArc(state, arc); - } - if (f == num_frames) { - if (use_final_probs && !final_costs.empty()) { - typename unordered_map::const_iterator iter = - final_costs.find(tok); - if (iter != final_costs.end()) - ofst->SetFinal(cur_state, LatticeWeight(iter->second, 0)); - } else { - ofst->SetFinal(cur_state, LatticeWeight::One()); - } - } - } - } - - fst::TopSort(ofst); - return (ofst->NumStates() > 0); -} - -// This function is now deprecated, since now we do determinization from outside -// the LatticeFasterDecoder class. Outputs an FST corresponding to the -// lattice-determinized lattice (one path per word sequence). -template -bool LatticeFasterDecoderTpl::GetLattice( - CompactLattice *ofst, bool use_final_probs) const { - Lattice raw_fst; - GetRawLattice(&raw_fst, use_final_probs); - Invert(&raw_fst); // make it so word labels are on the input. - // (in phase where we get backward-costs). - fst::ILabelCompare ilabel_comp; - ArcSort(&raw_fst, ilabel_comp); // sort on ilabel; makes - // lattice-determinization more efficient. - - fst::DeterminizeLatticePrunedOptions lat_opts; - lat_opts.max_mem = config_.det_opts.max_mem; - - DeterminizeLatticePruned(raw_fst, config_.lattice_beam, ofst, lat_opts); - raw_fst.DeleteStates(); // Free memory-- raw_fst no longer needed. - Connect(ofst); // Remove unreachable states... there might be - // a small number of these, in some cases. - // Note: if something went wrong and the raw lattice was empty, - // we should still get to this point in the code without warnings or failures. - return (ofst->NumStates() != 0); -} - -template -void LatticeFasterDecoderTpl::PossiblyResizeHash(size_t num_toks) { - size_t new_sz = static_cast(static_cast(num_toks) * - config_.hash_ratio); - if (new_sz > toks_.Size()) { - toks_.SetSize(new_sz); - } -} - -/* - A note on the definition of extra_cost. - - extra_cost is used in pruning tokens, to save memory. - - extra_cost can be thought of as a beta (backward) cost assuming - we had set the betas on currently-active tokens to all be the negative - of the alphas for those tokens. (So all currently active tokens would - be on (tied) best paths). - - We can use the extra_cost to accurately prune away tokens that we know will - never appear in the lattice. 
If the extra_cost is greater than the desired - lattice beam, the token would provably never appear in the lattice, so we can - prune away the token. - - (Note: we don't update all the extra_costs every time we update a frame; we - only do it every 'config_.prune_interval' frames). - */ - -// FindOrAddToken either locates a token in hash of toks_, -// or if necessary inserts a new, empty token (i.e. with no forward links) -// for the current frame. [note: it's inserted if necessary into hash toks_ -// and also into the singly linked list of tokens active on this frame -// (whose head is at active_toks_[frame]). -template -inline typename LatticeFasterDecoderTpl::Elem * -LatticeFasterDecoderTpl::FindOrAddToken(StateId state, - int32 frame_plus_one, - BaseFloat tot_cost, - Token *backpointer, - bool *changed) { - // Returns the Token pointer. Sets "changed" (if non-NULL) to true - // if the token was newly created or the cost changed. - KALDI_ASSERT(frame_plus_one < active_toks_.size()); - Token *&toks = active_toks_[frame_plus_one].toks; - Elem *e_found = toks_.Insert(state, NULL); - if (e_found->val == NULL) { // no such token presently. - const BaseFloat extra_cost = 0.0; - // tokens on the currently final frame have zero extra_cost - // as any of them could end up - // on the winning path. - Token *new_tok = new Token(tot_cost, extra_cost, NULL, toks, backpointer); - // NULL: no forward links yet - toks = new_tok; - num_toks_++; - e_found->val = new_tok; - if (changed) *changed = true; - return e_found; - } else { - Token *tok = e_found->val; // There is an existing Token for this state. - if (tok->tot_cost > tot_cost) { // replace old token - tok->tot_cost = tot_cost; - // SetBackpointer() just does tok->backpointer = backpointer in - // the case where Token == BackpointerToken, else nothing. - tok->SetBackpointer(backpointer); - // we don't allocate a new token, the old stays linked in active_toks_ - // we only replace the tot_cost - // in the current frame, there are no forward links (and no extra_cost) - // only in ProcessNonemitting we have to delete forward links - // in case we visit a state for the second time - // those forward links, that lead to this replaced token before: - // they remain and will hopefully be pruned later (PruneForwardLinks...) - if (changed) *changed = true; - } else { - if (changed) *changed = false; - } - return e_found; - } -} - -// prunes outgoing links for all tokens in active_toks_[frame] -// it's called by PruneActiveTokens -// all links, that have link_extra_cost > lattice_beam are pruned -template -void LatticeFasterDecoderTpl::PruneForwardLinks( - int32 frame_plus_one, bool *extra_costs_changed, bool *links_pruned, - BaseFloat delta) { - // delta is the amount by which the extra_costs must change - // If delta is larger, we'll tend to go back less far - // toward the beginning of the file. - // extra_costs_changed is set to true if extra_cost was changed for any token - // links_pruned is set to true if any link in any token was pruned - - *extra_costs_changed = false; - *links_pruned = false; - KALDI_ASSERT(frame_plus_one >= 0 && frame_plus_one < active_toks_.size()); - if (active_toks_[frame_plus_one].toks == - NULL) { // empty list; should not happen. - if (!warned_) { - KALDI_WARN << "No tokens alive [doing pruning].. warning first " - "time only for each utterance\n"; - warned_ = true; - } - } - - // We have to iterate until there is no more change, because the links - // are not guaranteed to be in topological order. 
- bool changed = true; // difference new minus old extra cost >= delta ? - while (changed) { - changed = false; - for (Token *tok = active_toks_[frame_plus_one].toks; tok != NULL; - tok = tok->next) { - ForwardLinkT *link, *prev_link = NULL; - // will recompute tok_extra_cost for tok. - BaseFloat tok_extra_cost = std::numeric_limits::infinity(); - // tok_extra_cost is the best (min) of link_extra_cost of outgoing links - for (link = tok->links; link != NULL;) { - // See if we need to excise this link... - Token *next_tok = link->next_tok; - BaseFloat link_extra_cost = - next_tok->extra_cost + - ((tok->tot_cost + link->acoustic_cost + link->graph_cost) - - next_tok->tot_cost); // difference in brackets is >= 0 - // link_exta_cost is the difference in score between the best paths - // through link source state and through link destination state - KALDI_ASSERT(link_extra_cost == link_extra_cost); // check for NaN - // the graph_cost contatins the context score - // if it's the score of the backoff arc, it should be removed. - if (link->context_score < 0) { - link_extra_cost += link->context_score; - } - if (link_extra_cost > config_.lattice_beam) { // excise link - ForwardLinkT *next_link = link->next; - if (prev_link != NULL) - prev_link->next = next_link; - else - tok->links = next_link; - delete link; - link = next_link; // advance link but leave prev_link the same. - *links_pruned = true; - } else { // keep the link and update the tok_extra_cost if needed. - if (link_extra_cost < 0.0) { // this is just a precaution. - // if (link_extra_cost < -0.01) - // KALDI_WARN << "Negative extra_cost: " << link_extra_cost; - link_extra_cost = 0.0; - } - if (link_extra_cost < tok_extra_cost) - tok_extra_cost = link_extra_cost; - prev_link = link; // move to next link - link = link->next; - } - } // for all outgoing links - if (fabs(tok_extra_cost - tok->extra_cost) > delta) - changed = true; // difference new minus old is bigger than delta - tok->extra_cost = tok_extra_cost; - // will be +infinity or <= lattice_beam_. - // infinity indicates, that no forward link survived pruning - } // for all Token on active_toks_[frame] - if (changed) *extra_costs_changed = true; - - // Note: it's theoretically possible that aggressive compiler - // optimizations could cause an infinite loop here for small delta and - // high-dynamic-range scores. - } // while changed -} - -// PruneForwardLinksFinal is a version of PruneForwardLinks that we call -// on the final frame. If there are final tokens active, it uses -// the final-probs for pruning, otherwise it treats all tokens as final. -template -void LatticeFasterDecoderTpl::PruneForwardLinksFinal() { - KALDI_ASSERT(!active_toks_.empty()); - int32 frame_plus_one = active_toks_.size() - 1; - - if (active_toks_[frame_plus_one].toks == - NULL) // empty list; should not happen. - KALDI_WARN << "No tokens alive at end of file"; - - typedef typename unordered_map::const_iterator IterType; - ComputeFinalCosts(&final_costs_, &final_relative_cost_, &final_best_cost_); - decoding_finalized_ = true; - // We call DeleteElems() as a nicety, not because it's really necessary; - // otherwise there would be a time, after calling PruneTokensForFrame() on the - // final frame, when toks_.GetList() or toks_.Clear() would contain pointers - // to nonexistent tokens. - DeleteElems(toks_.Clear()); - - // Now go through tokens on this frame, pruning forward links... may have to - // iterate a few times until there is no more change, because the list is not - // in topological order. 
This is a modified version of the code in - // PruneForwardLinks, but here we also take account of the final-probs. - bool changed = true; - BaseFloat delta = 1.0e-05; - while (changed) { - changed = false; - for (Token *tok = active_toks_[frame_plus_one].toks; tok != NULL; - tok = tok->next) { - ForwardLinkT *link, *prev_link = NULL; - // will recompute tok_extra_cost. It has a term in it that corresponds - // to the "final-prob", so instead of initializing tok_extra_cost to - // infinity below we set it to the difference between the - // (score+final_prob) of this token, and the best such (score+final_prob). - BaseFloat final_cost; - if (final_costs_.empty()) { - final_cost = 0.0; - } else { - IterType iter = final_costs_.find(tok); - if (iter != final_costs_.end()) - final_cost = iter->second; - else - final_cost = std::numeric_limits::infinity(); - } - BaseFloat tok_extra_cost = tok->tot_cost + final_cost - final_best_cost_; - // tok_extra_cost will be a "min" over either directly being final, or - // being indirectly final through other links, and the loop below may - // decrease its value: - for (link = tok->links; link != NULL;) { - // See if we need to excise this link... - Token *next_tok = link->next_tok; - BaseFloat link_extra_cost = - next_tok->extra_cost + - ((tok->tot_cost + link->acoustic_cost + link->graph_cost) - - next_tok->tot_cost); - if (link_extra_cost > config_.lattice_beam) { // excise link - ForwardLinkT *next_link = link->next; - if (prev_link != NULL) - prev_link->next = next_link; - else - tok->links = next_link; - delete link; - link = next_link; // advance link but leave prev_link the same. - } else { // keep the link and update the tok_extra_cost if needed. - if (link_extra_cost < 0.0) { // this is just a precaution. - // if (link_extra_cost < -0.01) - // KALDI_WARN << "Negative extra_cost: " << link_extra_cost; - link_extra_cost = 0.0; - } - if (link_extra_cost < tok_extra_cost) - tok_extra_cost = link_extra_cost; - prev_link = link; - link = link->next; - } - } - // prune away tokens worse than lattice_beam above best path. This step - // was not necessary in the non-final case because then, this case - // showed up as having no forward links. Here, the tok_extra_cost has - // an extra component relating to the final-prob. - if (tok_extra_cost > config_.lattice_beam) - tok_extra_cost = std::numeric_limits::infinity(); - // to be pruned in PruneTokensForFrame - - if (!ApproxEqual(tok->extra_cost, tok_extra_cost, delta)) changed = true; - tok->extra_cost = - tok_extra_cost; // will be +infinity or <= lattice_beam_. - } - } // while changed -} - -template -BaseFloat LatticeFasterDecoderTpl::FinalRelativeCost() const { - if (!decoding_finalized_) { - BaseFloat relative_cost; - ComputeFinalCosts(NULL, &relative_cost, NULL); - return relative_cost; - } else { - // we're not allowed to call that function if FinalizeDecoding() has - // been called; return a cached value. - return final_relative_cost_; - } -} - -// Prune away any tokens on this frame that have no forward links. -// [we don't do this in PruneForwardLinks because it would give us -// a problem with dangling pointers]. 
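To make the pruning test in `PruneForwardLinks` / `PruneForwardLinksFinal` above concrete, here is a tiny stand-alone sketch with invented numbers (the helper function and all values are hypothetical); it simply re-evaluates the `link_extra_cost` expression and compares it to the lattice beam.

```cpp
// Illustrative sketch of the pruning criterion used above: a forward link
// survives only if the best path through it is within lattice_beam of the
// best path overall.
#include <cstdio>

// Mirrors the expression computed inside PruneForwardLinks:
//   link_extra_cost = next_extra_cost
//                     + (tok_cost + acoustic_cost + graph_cost - next_cost)
double LinkExtraCost(double tok_cost, double acoustic_cost, double graph_cost,
                     double next_cost, double next_extra_cost) {
  return next_extra_cost + (tok_cost + acoustic_cost + graph_cost - next_cost);
}

int main() {
  const double lattice_beam = 8.0;  // hypothetical config_.lattice_beam
  // The destination token is on a path 2.0 worse than the best (extra_cost),
  // and taking this link costs 1.5 more than the best way to reach it.
  double cost = LinkExtraCost(/*tok_cost=*/10.0, /*acoustic_cost=*/3.0,
                              /*graph_cost=*/0.5, /*next_cost=*/12.0,
                              /*next_extra_cost=*/2.0);
  std::printf("link_extra_cost = %.1f -> %s\n", cost,
              cost > lattice_beam ? "prune" : "keep");  // 3.5 -> keep
  return 0;
}
```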
-// It's called by PruneActiveTokens if any forward links have been pruned -template -void LatticeFasterDecoderTpl::PruneTokensForFrame( - int32 frame_plus_one) { - KALDI_ASSERT(frame_plus_one >= 0 && frame_plus_one < active_toks_.size()); - Token *&toks = active_toks_[frame_plus_one].toks; - if (toks == NULL) KALDI_WARN << "No tokens alive [doing pruning]"; - Token *tok, *next_tok, *prev_tok = NULL; - for (tok = toks; tok != NULL; tok = next_tok) { - next_tok = tok->next; - if (tok->extra_cost == std::numeric_limits::infinity()) { - // token is unreachable from end of graph; (no forward links survived) - // excise tok from list and delete tok. - if (prev_tok != NULL) - prev_tok->next = tok->next; - else - toks = tok->next; - delete tok; - num_toks_--; - } else { // fetch next Token - prev_tok = tok; - } - } -} - -// Go backwards through still-alive tokens, pruning them, starting not from -// the current frame (where we want to keep all tokens) but from the frame -// before that. We go backwards through the frames and stop when we reach a -// point where the delta-costs are not changing (and the delta controls when we -// consider a cost to have "not changed"). -template -void LatticeFasterDecoderTpl::PruneActiveTokens(BaseFloat delta) { - int32 cur_frame_plus_one = NumFramesDecoded(); - int32 num_toks_begin = num_toks_; - // The index "f" below represents a "frame plus one", i.e. you'd have to - // subtract one to get the corresponding index for the decodable object. - for (int32 f = cur_frame_plus_one - 1; f >= 0; f--) { - // Reason why we need to prune forward links in this situation: - // (1) we have never pruned them (new TokenList) - // (2) we have not yet pruned the forward links to the next f, - // after any of those tokens have changed their extra_cost. 
- if (active_toks_[f].must_prune_forward_links) { - bool extra_costs_changed = false, links_pruned = false; - PruneForwardLinks(f, &extra_costs_changed, &links_pruned, delta); - if (extra_costs_changed && f > 0) // any token has changed extra_cost - active_toks_[f - 1].must_prune_forward_links = true; - if (links_pruned) // any link was pruned - active_toks_[f].must_prune_tokens = true; - active_toks_[f].must_prune_forward_links = false; // job done - } - if (f + 1 < cur_frame_plus_one && // except for last f (no forward links) - active_toks_[f + 1].must_prune_tokens) { - PruneTokensForFrame(f + 1); - active_toks_[f + 1].must_prune_tokens = false; - } - } - KALDI_VLOG(4) << "PruneActiveTokens: pruned tokens from " << num_toks_begin - << " to " << num_toks_; -} - -template -void LatticeFasterDecoderTpl::ComputeFinalCosts( - unordered_map *final_costs, - BaseFloat *final_relative_cost, BaseFloat *final_best_cost) const { - KALDI_ASSERT(!decoding_finalized_); - if (final_costs != NULL) final_costs->clear(); - const Elem *final_toks = toks_.GetList(); - BaseFloat infinity = std::numeric_limits::infinity(); - BaseFloat best_cost = infinity, best_cost_with_final = infinity; - - while (final_toks != NULL) { - StateId state = final_toks->key; - Token *tok = final_toks->val; - const Elem *next = final_toks->tail; - BaseFloat final_cost = fst_->Final(state).Value(); - BaseFloat cost = tok->tot_cost, cost_with_final = cost + final_cost; - best_cost = std::min(cost, best_cost); - best_cost_with_final = std::min(cost_with_final, best_cost_with_final); - if (final_costs != NULL && final_cost != infinity) - (*final_costs)[tok] = final_cost; - final_toks = next; - } - if (final_relative_cost != NULL) { - if (best_cost == infinity && best_cost_with_final == infinity) { - // Likely this will only happen if there are no tokens surviving. - // This seems the least bad way to handle it. - *final_relative_cost = infinity; - } else { - *final_relative_cost = best_cost_with_final - best_cost; - } - } - if (final_best_cost != NULL) { - if (best_cost_with_final != infinity) { // final-state exists. - *final_best_cost = best_cost_with_final; - } else { // no final-state exists. - *final_best_cost = best_cost; - } - } -} - -template -void LatticeFasterDecoderTpl::AdvanceDecoding( - DecodableInterface *decodable, int32 max_num_frames) { - if (std::is_same >::value) { - // if the type 'FST' is the FST base-class, then see if the FST type of fst_ - // is actually VectorFst or ConstFst. If so, call the AdvanceDecoding() - // function after casting *this to the more specific type. - if (fst_->Type() == "const") { - LatticeFasterDecoderTpl, Token> *this_cast = - reinterpret_cast< - LatticeFasterDecoderTpl, Token> *>( - this); - this_cast->AdvanceDecoding(decodable, max_num_frames); - return; - } else if (fst_->Type() == "vector") { - LatticeFasterDecoderTpl, Token> *this_cast = - reinterpret_cast< - LatticeFasterDecoderTpl, Token> *>( - this); - this_cast->AdvanceDecoding(decodable, max_num_frames); - return; - } - } - - KALDI_ASSERT(!active_toks_.empty() && !decoding_finalized_ && - "You must call InitDecoding() before AdvanceDecoding"); - int32 num_frames_ready = decodable->NumFramesReady(); - // num_frames_ready must be >= num_frames_decoded, or else - // the number of frames ready must have decreased (which doesn't - // make sense) or the decodable object changed between calls - // (which isn't allowed). 
- KALDI_ASSERT(num_frames_ready >= NumFramesDecoded()); - int32 target_frames_decoded = num_frames_ready; - if (max_num_frames >= 0) - target_frames_decoded = - std::min(target_frames_decoded, NumFramesDecoded() + max_num_frames); - while (NumFramesDecoded() < target_frames_decoded) { - if (NumFramesDecoded() % config_.prune_interval == 0) { - PruneActiveTokens(config_.lattice_beam * config_.prune_scale); - } - BaseFloat cost_cutoff = ProcessEmitting(decodable); - ProcessNonemitting(cost_cutoff); - } -} - -// FinalizeDecoding() is a version of PruneActiveTokens that we call -// (optionally) on the final frame. Takes into account the final-prob of -// tokens. This function used to be called PruneActiveTokensFinal(). -template -void LatticeFasterDecoderTpl::FinalizeDecoding() { - int32 final_frame_plus_one = NumFramesDecoded(); - int32 num_toks_begin = num_toks_; - // PruneForwardLinksFinal() prunes final frame (with final-probs), and - // sets decoding_finalized_. - PruneForwardLinksFinal(); - for (int32 f = final_frame_plus_one - 1; f >= 0; f--) { - bool b1, b2; // values not used. - BaseFloat dontcare = 0.0; // delta of zero means we must always update - PruneForwardLinks(f, &b1, &b2, dontcare); - PruneTokensForFrame(f + 1); - } - PruneTokensForFrame(0); - KALDI_VLOG(4) << "pruned tokens from " << num_toks_begin << " to " - << num_toks_; -} - -/// Gets the weight cutoff. Also counts the active tokens. -template -BaseFloat LatticeFasterDecoderTpl::GetCutoff( - Elem *list_head, size_t *tok_count, BaseFloat *adaptive_beam, - Elem **best_elem) { - BaseFloat best_weight = std::numeric_limits::infinity(); - // positive == high cost == bad. - size_t count = 0; - if (config_.max_active == std::numeric_limits::max() && - config_.min_active == 0) { - for (Elem *e = list_head; e != NULL; e = e->tail, count++) { - BaseFloat w = static_cast(e->val->tot_cost); - if (w < best_weight) { - best_weight = w; - if (best_elem) *best_elem = e; - } - } - if (tok_count != NULL) *tok_count = count; - if (adaptive_beam != NULL) *adaptive_beam = config_.beam; - return best_weight + config_.beam; - } else { - tmp_array_.clear(); - for (Elem *e = list_head; e != NULL; e = e->tail, count++) { - BaseFloat w = e->val->tot_cost; - tmp_array_.push_back(w); - if (w < best_weight) { - best_weight = w; - if (best_elem) *best_elem = e; - } - } - if (tok_count != NULL) *tok_count = count; - - BaseFloat beam_cutoff = best_weight + config_.beam, - min_active_cutoff = std::numeric_limits::infinity(), - max_active_cutoff = std::numeric_limits::infinity(); - - KALDI_VLOG(6) << "Number of tokens active on frame " << NumFramesDecoded() - << " is " << tmp_array_.size(); - - if (tmp_array_.size() > static_cast(config_.max_active)) { - std::nth_element(tmp_array_.begin(), - tmp_array_.begin() + config_.max_active, - tmp_array_.end()); - max_active_cutoff = tmp_array_[config_.max_active]; - } - if (max_active_cutoff < beam_cutoff) { // max_active is tighter than beam. - if (adaptive_beam) - *adaptive_beam = max_active_cutoff - best_weight + config_.beam_delta; - return max_active_cutoff; - } - if (tmp_array_.size() > static_cast(config_.min_active)) { - if (config_.min_active == 0) { - min_active_cutoff = best_weight; - } else { - std::nth_element( - tmp_array_.begin(), tmp_array_.begin() + config_.min_active, - tmp_array_.size() > static_cast(config_.max_active) - ? 
tmp_array_.begin() + config_.max_active - : tmp_array_.end()); - min_active_cutoff = tmp_array_[config_.min_active]; - } - } - if (min_active_cutoff > beam_cutoff) { // min_active is looser than beam. - if (adaptive_beam) - *adaptive_beam = min_active_cutoff - best_weight + config_.beam_delta; - return min_active_cutoff; - } else { - *adaptive_beam = config_.beam; - return beam_cutoff; - } - } -} - -template -BaseFloat LatticeFasterDecoderTpl::ProcessEmitting( - DecodableInterface *decodable) { - KALDI_ASSERT(active_toks_.size() > 0); - int32 frame = - active_toks_.size() - 1; // frame is the frame-index - // (zero-based) used to get likelihoods - // from the decodable object. - active_toks_.resize(active_toks_.size() + 1); - - Elem *final_toks = - toks_.Clear(); // analogous to swapping prev_toks_ / cur_toks_ - // in simple-decoder.h. Removes the Elems from - // being indexed in the hash in toks_. - Elem *best_elem = NULL; - BaseFloat adaptive_beam; - size_t tok_cnt; - BaseFloat cur_cutoff = - GetCutoff(final_toks, &tok_cnt, &adaptive_beam, &best_elem); - KALDI_VLOG(6) << "Adaptive beam on frame " << NumFramesDecoded() << " is " - << adaptive_beam; - - PossiblyResizeHash( - tok_cnt); // This makes sure the hash is always big enough. - - BaseFloat next_cutoff = std::numeric_limits::infinity(); - // pruning "online" before having seen all tokens - - BaseFloat cost_offset = 0.0; // Used to keep probabilities in a good - // dynamic range. - - // First process the best token to get a hopefully - // reasonably tight bound on the next cutoff. The only - // products of the next block are "next_cutoff" and "cost_offset". - if (best_elem) { - StateId state = best_elem->key; - Token *tok = best_elem->val; - cost_offset = -tok->tot_cost; - for (fst::ArcIterator aiter(*fst_, state); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - if (arc.ilabel != 0) { // propagate.. - BaseFloat new_weight = arc.weight.Value() + cost_offset - - decodable->LogLikelihood(frame, arc.ilabel) + - tok->tot_cost; - if (state != arc.nextstate) { - new_weight += config_.length_penalty; - } - if (new_weight + adaptive_beam < next_cutoff) - next_cutoff = new_weight + adaptive_beam; - } - } - } - - // Store the offset on the acoustic likelihoods that we're applying. - // Could just do cost_offsets_.push_back(cost_offset), but we - // do it this way as it's more robust to future code changes. - cost_offsets_.resize(frame + 1, 0.0); - cost_offsets_[frame] = cost_offset; - - // the tokens are now owned here, in final_toks, and the hash is empty. - // 'owned' is a complex thing here; the point is we need to call DeleteElem - // on each elem 'e' to let toks_ know we're done with them. - for (Elem *e = final_toks, *e_tail; e != NULL; e = e_tail) { - // loop this way because we delete "e" as we go. - StateId state = e->key; - Token *tok = e->val; - if (tok->tot_cost <= cur_cutoff) { - for (fst::ArcIterator aiter(*fst_, state); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - if (arc.ilabel != 0) { // propagate.. 
- BaseFloat ac_cost = cost_offset - - decodable->LogLikelihood(frame, arc.ilabel), - graph_cost = arc.weight.Value(); - if (state != arc.nextstate) { - graph_cost += config_.length_penalty; - } - BaseFloat cur_cost = tok->tot_cost, - tot_cost = cur_cost + ac_cost + graph_cost; - if (tot_cost >= next_cutoff) - continue; - else if (tot_cost + adaptive_beam < next_cutoff) - next_cutoff = - tot_cost + adaptive_beam; // prune by best current token - // Note: the frame indexes into active_toks_ are one-based, - // hence the + 1. - Elem *e_next = - FindOrAddToken(arc.nextstate, frame + 1, tot_cost, tok, NULL); - // NULL: no change indicator needed - - bool is_start_boundary = false; - bool is_end_boundary = false; - float context_score = 0; - if (context_graph_) { - if (arc.olabel == 0) { - e_next->val->context_state = tok->context_state; - } else { - e_next->val->context_state = context_graph_->GetNextState( - tok->context_state, arc.olabel, &context_score, - &is_start_boundary, &is_end_boundary); - graph_cost -= context_score; - } - } - // Add ForwardLink from tok to next_tok (put on head of list - // tok->links) - tok->links = new ForwardLinkT(e_next->val, arc.ilabel, arc.olabel, - graph_cost, ac_cost, is_start_boundary, - is_end_boundary, tok->links); - tok->links->context_score = context_score; - } - } // for all arcs - } - e_tail = e->tail; - toks_.Delete(e); // delete Elem - } - return next_cutoff; -} - -// static inline -template -void LatticeFasterDecoderTpl::DeleteForwardLinks(Token *tok) { - ForwardLinkT *l = tok->links, *m; - while (l != NULL) { - m = l->next; - delete l; - l = m; - } - tok->links = NULL; -} - -template -void LatticeFasterDecoderTpl::ProcessNonemitting(BaseFloat cutoff) { - KALDI_ASSERT(!active_toks_.empty()); - int32 frame = static_cast(active_toks_.size()) - 2; - // Note: "frame" is the time-index we just processed, or -1 if - // we are processing the nonemitting transitions before the - // first frame (called from InitDecoding()). - - // Processes nonemitting arcs for one frame. Propagates within toks_. - // Note-- this queue structure is not very optimal as - // it may cause us to process states unnecessarily (e.g. more than once), - // but in the baseline code, turning this vector into a set to fix this - // problem did not improve overall speed. - - KALDI_ASSERT(queue_.empty()); - - if (toks_.GetList() == NULL) { - if (!warned_) { - KALDI_WARN << "Error, no surviving tokens: frame is " << frame; - warned_ = true; - } - } - - int before = 0, after = 0; - for (const Elem *e = toks_.GetList(); e != NULL; e = e->tail) { - StateId state = e->key; - if (fst_->NumInputEpsilons(state) != 0) queue_.push_back(e); - ++before; - } - - while (!queue_.empty()) { - ++after; - const Elem *e = queue_.back(); - queue_.pop_back(); - - StateId state = e->key; - Token *tok = - e->val; // would segfault if e is a NULL pointer but this can't happen. - BaseFloat cur_cost = tok->tot_cost; - if (cur_cost >= cutoff) // Don't bother processing successors. - continue; - // If "tok" has any existing forward links, delete them, - // because we're about to regenerate them. This is a kind - // of non-optimality (remember, this is the simple decoder), - // but since most states are emitting it's not a huge issue. - DeleteForwardLinks(tok); // necessary when re-visiting - tok->links = NULL; - for (fst::ArcIterator aiter(*fst_, state); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - if (arc.ilabel == 0) { // propagate nonemitting only... 
- BaseFloat graph_cost = arc.weight.Value(), - tot_cost = cur_cost + graph_cost; - if (tot_cost < cutoff) { - bool changed; - - Elem *e_new = - FindOrAddToken(arc.nextstate, frame + 1, tot_cost, tok, &changed); - - bool is_start_boundary = false; - bool is_end_boundary = false; - float context_score = 0; - if (context_graph_) { - if (arc.olabel == 0) { - e_new->val->context_state = tok->context_state; - } else { - e_new->val->context_state = context_graph_->GetNextState( - tok->context_state, arc.olabel, &context_score, - &is_start_boundary, &is_end_boundary); - graph_cost -= context_score; - } - } - - tok->links = - new ForwardLinkT(e_new->val, 0, arc.olabel, graph_cost, 0, - is_start_boundary, is_end_boundary, tok->links); - tok->links->context_score = context_score; - - // "changed" tells us whether the new token has a different - // cost from before, or is new [if so, add into queue]. - if (changed && fst_->NumInputEpsilons(arc.nextstate) != 0) - queue_.push_back(e_new); - } - } - } // for all arcs - } // while queue not empty - KALDI_VLOG(3) << "ProcessNonemitting " << before << " " << after; -} - -template -void LatticeFasterDecoderTpl::DeleteElems(Elem *list) { - for (Elem *e = list, *e_tail; e != NULL; e = e_tail) { - e_tail = e->tail; - toks_.Delete(e); - } -} - -template -void LatticeFasterDecoderTpl< - FST, Token>::ClearActiveTokens() { // a cleanup routine, at utt end/begin - for (size_t i = 0; i < active_toks_.size(); i++) { - // Delete all tokens alive on this frame, and any forward - // links they may have. - for (Token *tok = active_toks_[i].toks; tok != NULL;) { - DeleteForwardLinks(tok); - Token *next_tok = tok->next; - delete tok; - num_toks_--; - tok = next_tok; - } - } - active_toks_.clear(); - KALDI_ASSERT(num_toks_ == 0); -} - -// static -template -void LatticeFasterDecoderTpl::TopSortTokens( - Token *tok_list, std::vector *topsorted_list) { - unordered_map token2pos; - using std::unordered_set; - typedef typename unordered_map::iterator IterType; - int32 num_toks = 0; - for (Token *tok = tok_list; tok != NULL; tok = tok->next) num_toks++; - int32 cur_pos = 0; - // We assign the tokens numbers num_toks - 1, ... , 2, 1, 0. - // This is likely to be in closer to topological order than - // if we had given them ascending order, because of the way - // new tokens are put at the front of the list. - for (Token *tok = tok_list; tok != NULL; tok = tok->next) - token2pos[tok] = num_toks - ++cur_pos; - - unordered_set reprocess; - - for (IterType iter = token2pos.begin(); iter != token2pos.end(); ++iter) { - Token *tok = iter->first; - int32 pos = iter->second; - for (ForwardLinkT *link = tok->links; link != NULL; link = link->next) { - if (link->ilabel == 0) { - // We only need to consider epsilon links, since non-epsilon links - // transition between frames and this function only needs to sort a list - // of tokens from a single frame. - IterType following_iter = token2pos.find(link->next_tok); - if (following_iter != token2pos.end()) { // another token on this - // frame, so must consider it. - int32 next_pos = following_iter->second; - if (next_pos < pos) { // reassign the position of the next Token. - following_iter->second = cur_pos++; - reprocess.insert(link->next_tok); - } - } - } - } - // In case we had previously assigned this token to be reprocessed, we can - // erase it from that set because it's "happy now" (we just processed it). - reprocess.erase(tok); - } - - size_t max_loop = 1000000, - loop_count; // max_loop is to detect epsilon cycles. 
- for (loop_count = 0; !reprocess.empty() && loop_count < max_loop; - ++loop_count) { - std::vector reprocess_vec; - for (typename unordered_set::iterator iter = reprocess.begin(); - iter != reprocess.end(); ++iter) - reprocess_vec.push_back(*iter); - reprocess.clear(); - for (typename std::vector::iterator iter = reprocess_vec.begin(); - iter != reprocess_vec.end(); ++iter) { - Token *tok = *iter; - int32 pos = token2pos[tok]; - // Repeat the processing we did above (for comments, see above). - for (ForwardLinkT *link = tok->links; link != NULL; link = link->next) { - if (link->ilabel == 0) { - IterType following_iter = token2pos.find(link->next_tok); - if (following_iter != token2pos.end()) { - int32 next_pos = following_iter->second; - if (next_pos < pos) { - following_iter->second = cur_pos++; - reprocess.insert(link->next_tok); - } - } - } - } - } - } - KALDI_ASSERT(loop_count < max_loop && - "Epsilon loops exist in your decoding " - "graph (this is not allowed!)"); - - topsorted_list->clear(); - topsorted_list->resize(cur_pos, - NULL); // create a list with NULLs in between. - for (IterType iter = token2pos.begin(); iter != token2pos.end(); ++iter) - (*topsorted_list)[iter->second] = iter->first; -} - -// Instantiate the template for the combination of token types and FST types -// that we'll need. -template class LatticeFasterDecoderTpl, - decoder::StdToken>; -template class LatticeFasterDecoderTpl, - decoder::StdToken>; -template class LatticeFasterDecoderTpl, - decoder::StdToken>; - -// template class LatticeFasterDecoderTpl; template class -// LatticeFasterDecoderTpl; - -template class LatticeFasterDecoderTpl, - decoder::BackpointerToken>; -template class LatticeFasterDecoderTpl, - decoder::BackpointerToken>; -template class LatticeFasterDecoderTpl, - decoder::BackpointerToken>; -// template class LatticeFasterDecoderTpl; template class -// LatticeFasterDecoderTpl; - -} // end namespace kaldi. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/decoder/lattice-faster-decoder.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/decoder/lattice-faster-decoder.h deleted file mode 100644 index 0152b85447e354b770745b748d266b1ca2d57024..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/decoder/lattice-faster-decoder.h +++ /dev/null @@ -1,558 +0,0 @@ -// decoder/lattice-faster-decoder.h - -// Copyright 2009-2013 Microsoft Corporation; Mirko Hannemann; -// 2013-2014 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2018 Zhehuai Chen -// 2021 Binbin Zhang, Zhendong Peng - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
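PruneTokensForFrame in the deleted lattice-faster-decoder.cc above walks a singly-linked per-frame token list and excises every token whose extra_cost has become infinite (no surviving forward links). A minimal standalone sketch of that list-surgery pattern follows; the `Tok` type and `PruneUnreachable` name are illustrative stand-ins, not Kaldi's actual Token/ForwardLink machinery.

```cpp
#include <cstdio>
#include <limits>

// Simplified stand-in for a per-frame token: only the fields the pruning
// pattern needs (extra_cost and the intrusive "next" pointer).
struct Tok {
  double extra_cost;
  Tok *next;
};

// Remove every token whose extra_cost is +infinity from the singly-linked
// list rooted at *head, deleting the removed nodes. Mirrors the
// prev_tok/next_tok walk in PruneTokensForFrame.
int PruneUnreachable(Tok **head) {
  int num_deleted = 0;
  Tok *prev = nullptr;
  for (Tok *tok = *head, *next = nullptr; tok != nullptr; tok = next) {
    next = tok->next;
    if (tok->extra_cost == std::numeric_limits<double>::infinity()) {
      if (prev != nullptr) prev->next = next;   // excise from middle/end
      else *head = next;                        // excise the list head
      delete tok;
      ++num_deleted;
    } else {
      prev = tok;                               // keep; advance prev
    }
  }
  return num_deleted;
}

int main() {
  const double inf = std::numeric_limits<double>::infinity();
  // Build a small list by pushing to the front: 0.5 -> inf -> 1.2 -> inf.
  Tok *head = nullptr;
  for (double c : {inf, 1.2, inf, 0.5}) head = new Tok{c, head};
  int removed = PruneUnreachable(&head);
  std::printf("removed %d tokens\n", removed);   // expect 2
  for (Tok *t = head; t != nullptr; t = t->next)
    std::printf("surviving extra_cost = %.1f\n", t->extra_cost);
  // Free the survivors.
  for (Tok *t = head, *n = nullptr; t != nullptr; t = n) { n = t->next; delete t; }
  return 0;
}
```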
- -#ifndef KALDI_DECODER_LATTICE_FASTER_DECODER_H_ -#define KALDI_DECODER_LATTICE_FASTER_DECODER_H_ - -#include -#include -#include -#include - -#include "base/kaldi-common.h" -#include "decoder/context_graph.h" -#include "fst/fstlib.h" -#include "fstext/fstext-lib.h" -#include "itf/decodable-itf.h" -#include "lat/determinize-lattice-pruned.h" -#include "lat/kaldi-lattice.h" -#include "util/hash-list.h" - -namespace kaldi { - -struct LatticeFasterDecoderConfig { - BaseFloat beam; - int32 max_active; - int32 min_active; - BaseFloat lattice_beam; - int32 prune_interval; - bool determinize_lattice; // not inspected by this class... used in - // command-line program. - BaseFloat beam_delta; - BaseFloat hash_ratio; - // Note: we don't make prune_scale configurable on the command line, it's not - // a very important parameter. It affects the algorithm that prunes the - // tokens as we go. - BaseFloat prune_scale; - BaseFloat length_penalty; // for balancing the del/ins ratio, suggested -3.0 - - // Most of the options inside det_opts are not actually queried by the - // LatticeFasterDecoder class itself, but by the code that calls it, for - // example in the function DecodeUtteranceLatticeFaster. - fst::DeterminizeLatticePhonePrunedOptions det_opts; - - LatticeFasterDecoderConfig() - : beam(16.0), - max_active(std::numeric_limits::max()), - min_active(200), - lattice_beam(10.0), - prune_interval(25), - determinize_lattice(true), - beam_delta(0.5), - hash_ratio(2.0), - prune_scale(0.1), - length_penalty(0.0) {} - void Register(OptionsItf *opts) { - det_opts.Register(opts); - opts->Register("beam", &beam, - "Decoding beam. Larger->slower, more accurate."); - opts->Register("max-active", &max_active, - "Decoder max active states. Larger->slower; " - "more accurate"); - opts->Register("min-active", &min_active, - "Decoder minimum #active states."); - opts->Register("lattice-beam", &lattice_beam, - "Lattice generation beam. Larger->slower, " - "and deeper lattices"); - opts->Register("prune-interval", &prune_interval, - "Interval (in frames) at " - "which to prune tokens"); - opts->Register( - "determinize-lattice", &determinize_lattice, - "If true, " - "determinize the lattice (lattice-determinization, keeping only " - "best pdf-sequence for each word-sequence)."); - opts->Register( - "beam-delta", &beam_delta, - "Increment used in decoding-- this " - "parameter is obscure and relates to a speedup in the way the " - "max-active constraint is applied. Larger is more accurate."); - opts->Register("hash-ratio", &hash_ratio, - "Setting used in decoder to " - "control hash behavior"); - } - void Check() const { - KALDI_ASSERT(beam > 0.0 && max_active > 1 && lattice_beam > 0.0 && - min_active <= max_active && prune_interval > 0 && - beam_delta > 0.0 && hash_ratio >= 1.0 && prune_scale > 0.0 && - prune_scale < 1.0); - } -}; - -namespace decoder { -// We will template the decoder on the token type as well as the FST type; this -// is a mechanism so that we can use the same underlying decoder code for -// versions of the decoder that support quickly getting the best path -// (LatticeFasterOnlineDecoder, see lattice-faster-online-decoder.h) and also -// those that do not (LatticeFasterDecoder). - -// ForwardLinks are the links from a token to a token on the next frame. -// or sometimes on the current frame (for input-epsilon links). 
-template -struct ForwardLink { - using Label = fst::StdArc::Label; - - Token *next_tok; // the next token [or NULL if represents final-state] - Label ilabel; // ilabel on arc - Label olabel; // olabel on arc - BaseFloat graph_cost; // graph cost of traversing arc (contains LM, etc.) - BaseFloat acoustic_cost; // acoustic cost (pre-scaled) of traversing arc - bool is_start_boundary; - bool is_end_boundary; - float context_score; - ForwardLink *next; // next in singly-linked list of forward arcs (arcs - // in the state-level lattice) from a token. - inline ForwardLink(Token *next_tok, Label ilabel, Label olabel, - BaseFloat graph_cost, BaseFloat acoustic_cost, - bool is_start_boundary, bool is_end_boundary, - ForwardLink *next) - : next_tok(next_tok), - ilabel(ilabel), - olabel(olabel), - graph_cost(graph_cost), - acoustic_cost(acoustic_cost), - is_start_boundary(is_start_boundary), - is_end_boundary(is_end_boundary), - context_score(0), - next(next) {} -}; - -struct StdToken { - using ForwardLinkT = ForwardLink; - using Token = StdToken; - - // Standard token type for LatticeFasterDecoder. Each active HCLG - // (decoding-graph) state on each frame has one token. - - // tot_cost is the total (LM + acoustic) cost from the beginning of the - // utterance up to this point. (but see cost_offset_, which is subtracted - // to keep it in a good numerical range). - BaseFloat tot_cost; - - // exta_cost is >= 0. After calling PruneForwardLinks, this equals the - // minimum difference between the cost of the best path that this link is a - // part of, and the cost of the absolute best path, under the assumption that - // any of the currently active states at the decoding front may eventually - // succeed (e.g. if you were to take the currently active states one by one - // and compute this difference, and then take the minimum). - BaseFloat extra_cost; - - int context_state = 0; - - // 'links' is the head of singly-linked list of ForwardLinks, which is what we - // use for lattice generation. - ForwardLinkT *links; - - // 'next' is the next in the singly-linked list of tokens for this frame. - Token *next; - - // This function does nothing and should be optimized out; it's needed - // so we can share the regular LatticeFasterDecoderTpl code and the code - // for LatticeFasterOnlineDecoder that supports fast traceback. - inline void SetBackpointer(Token *backpointer) {} - - // This constructor just ignores the 'backpointer' argument. That argument is - // needed so that we can use the same decoder code for LatticeFasterDecoderTpl - // and LatticeFasterOnlineDecoderTpl (which needs backpointers to support a - // fast way to obtain the best path). - inline StdToken(BaseFloat tot_cost, BaseFloat extra_cost, ForwardLinkT *links, - Token *next, Token *backpointer) - : tot_cost(tot_cost), - extra_cost(extra_cost), - links(links), - context_state(0), - next(next) {} -}; - -struct BackpointerToken { - using ForwardLinkT = ForwardLink; - using Token = BackpointerToken; - - // BackpointerToken is like Token but also - // Standard token type for LatticeFasterDecoder. Each active HCLG - // (decoding-graph) state on each frame has one token. - - // tot_cost is the total (LM + acoustic) cost from the beginning of the - // utterance up to this point. (but see cost_offset_, which is subtracted - // to keep it in a good numerical range). - BaseFloat tot_cost; - - // exta_cost is >= 0. 
After calling PruneForwardLinks, this equals - // the minimum difference between the cost of the best path, and the cost of - // this is on, and the cost of the absolute best path, under the assumption - // that any of the currently active states at the decoding front may - // eventually succeed (e.g. if you were to take the currently active states - // one by one and compute this difference, and then take the minimum). - BaseFloat extra_cost; - - int context_state = 0; - - // 'links' is the head of singly-linked list of ForwardLinks, which is what we - // use for lattice generation. - ForwardLinkT *links; - - // 'next' is the next in the singly-linked list of tokens for this frame. - BackpointerToken *next; - - // Best preceding BackpointerToken (could be a on this frame, connected to - // this via an epsilon transition, or on a previous frame). This is only - // required for an efficient GetBestPath function in - // LatticeFasterOnlineDecoderTpl; it plays no part in the lattice generation - // (the "links" list is what stores the forward links, for that). - Token *backpointer; - - inline void SetBackpointer(Token *backpointer) { - this->backpointer = backpointer; - } - - inline BackpointerToken(BaseFloat tot_cost, BaseFloat extra_cost, - ForwardLinkT *links, Token *next, Token *backpointer) - : tot_cost(tot_cost), - extra_cost(extra_cost), - links(links), - next(next), - backpointer(backpointer), - context_state(0) {} -}; - -} // namespace decoder - -/** This is the "normal" lattice-generating decoder. - See \ref lattices_generation \ref decoders_faster and \ref decoders_simple - for more information. - - The decoder is templated on the FST type and the token type. The token type - will normally be StdToken, but also may be BackpointerToken which is to - support quick lookup of the current best path (see - lattice-faster-online-decoder.h) - - The FST you invoke this decoder which is expected to equal - Fst::Fst, a.k.a. StdFst, or GrammarFst. If you invoke it with - FST == StdFst and it notices that the actual FST type is - fst::VectorFst or fst::ConstFst, the decoder object - will internally cast itself to one that is templated on those more specific - types; this is an optimization for speed. - */ -template -class LatticeFasterDecoderTpl { - public: - using Arc = typename FST::Arc; - using Label = typename Arc::Label; - using StateId = typename Arc::StateId; - using Weight = typename Arc::Weight; - using ForwardLinkT = decoder::ForwardLink; - - // Instantiate this class once for each thing you have to decode. - // This version of the constructor does not take ownership of - // 'fst'. - LatticeFasterDecoderTpl( - const FST &fst, const LatticeFasterDecoderConfig &config, - const std::shared_ptr &context_graph); - - // This version of the constructor takes ownership of the fst, and will delete - // it when this object is destroyed. - LatticeFasterDecoderTpl(const LatticeFasterDecoderConfig &config, FST *fst); - - void SetOptions(const LatticeFasterDecoderConfig &config) { - config_ = config; - } - - const LatticeFasterDecoderConfig &GetOptions() const { return config_; } - - ~LatticeFasterDecoderTpl(); - - /// Decodes until there are no more frames left in the "decodable" object.. - /// note, this may block waiting for input if the "decodable" object blocks. - /// Returns true if any kind of traceback is available (not necessarily from a - /// final state). - bool Decode(DecodableInterface *decodable); - - /// says whether a final-state was active on the last frame. 
If it was not, - /// the lattice (or traceback) will end with states that are not final-states. - bool ReachedFinal() const { - return FinalRelativeCost() != std::numeric_limits::infinity(); - } - - /// Outputs an FST corresponding to the single best path through the lattice. - /// Returns true if result is nonempty (using the return status is deprecated, - /// it will become void). If "use_final_probs" is true AND we reached the - /// final-state of the graph then it will include those as final-probs, else - /// it will treat all final-probs as one. Note: this just calls - /// GetRawLattice() and figures out the shortest path. - bool GetBestPath(Lattice *ofst, bool use_final_probs = true) const; - - /// Outputs an FST corresponding to the raw, state-level - /// tracebacks. Returns true if result is nonempty. - /// If "use_final_probs" is true AND we reached the final-state - /// of the graph then it will include those as final-probs, else - /// it will treat all final-probs as one. - /// The raw lattice will be topologically sorted. - /// - /// See also GetRawLatticePruned in lattice-faster-online-decoder.h, - /// which also supports a pruning beam, in case for some reason - /// you want it pruned tighter than the regular lattice beam. - /// We could put that here in future needed. - bool GetRawLattice(Lattice *ofst, bool use_final_probs = true) const; - - /// [Deprecated, users should now use GetRawLattice and determinize it - /// themselves, e.g. using DeterminizeLatticePhonePrunedWrapper]. - /// Outputs an FST corresponding to the lattice-determinized - /// lattice (one path per word sequence). Returns true if result is - /// nonempty. If "use_final_probs" is true AND we reached the final-state of - /// the graph then it will include those as final-probs, else it will treat - /// all final-probs as one. - bool GetLattice(CompactLattice *ofst, bool use_final_probs = true) const; - - /// InitDecoding initializes the decoding, and should only be used if you - /// intend to call AdvanceDecoding(). If you call Decode(), you don't need to - /// call this. You can also call InitDecoding if you have already decoded an - /// utterance and want to start with a new utterance. - void InitDecoding(); - - /// This will decode until there are no more frames ready in the decodable - /// object. You can keep calling it each time more frames become available. - /// If max_num_frames is specified, it specifies the maximum number of frames - /// the function will decode before returning. - void AdvanceDecoding(DecodableInterface *decodable, - int32 max_num_frames = -1); - - /// This function may be optionally called after AdvanceDecoding(), when you - /// do not plan to decode any further. It does an extra pruning step that - /// will help to prune the lattices output by GetLattice and (particularly) - /// GetRawLattice more completely, particularly toward the end of the - /// utterance. If you call this, you cannot call AdvanceDecoding again (it - /// will fail), and you cannot call GetLattice() and related functions with - /// use_final_probs = false. Used to be called PruneActiveTokensFinal(). - void FinalizeDecoding(); - - /// FinalRelativeCost() serves the same purpose as ReachedFinal(), but gives - /// more information. It returns the difference between the best (final-cost - /// plus cost) of any token on the final frame, and the best cost of any token - /// on the final frame. If it is infinity it means no final-states were - /// present on the final frame. It will usually be nonnegative. 
If it not - /// too positive (e.g. < 5 is my first guess, but this is not tested) you can - /// take it as a good indication that we reached the final-state with - /// reasonable likelihood. - BaseFloat FinalRelativeCost() const; - - // Returns the number of frames decoded so far. The value returned changes - // whenever we call ProcessEmitting(). - inline int32 NumFramesDecoded() const { return active_toks_.size() - 1; } - - protected: - // we make things protected instead of private, as code in - // LatticeFasterOnlineDecoderTpl, which inherits from this, also uses the - // internals. - - // Deletes the elements of the singly linked list tok->links. - inline static void DeleteForwardLinks(Token *tok); - - // head of per-frame list of Tokens (list is in topological order), - // and something saying whether we ever pruned it using PruneForwardLinks. - struct TokenList { - Token *toks; - bool must_prune_forward_links; - bool must_prune_tokens; - TokenList() - : toks(NULL), must_prune_forward_links(true), must_prune_tokens(true) {} - }; - - using Elem = typename HashList::Elem; - // Equivalent to: - // struct Elem { - // StateId key; - // Token *val; - // Elem *tail; - // }; - - void PossiblyResizeHash(size_t num_toks); - - // FindOrAddToken either locates a token in hash of toks_, or if necessary - // inserts a new, empty token (i.e. with no forward links) for the current - // frame. [note: it's inserted if necessary into hash toks_ and also into the - // singly linked list of tokens active on this frame (whose head is at - // active_toks_[frame]). The frame_plus_one argument is the acoustic frame - // index plus one, which is used to index into the active_toks_ array. - // Returns the Token pointer. Sets "changed" (if non-NULL) to true if the - // token was newly created or the cost changed. - // If Token == StdToken, the 'backpointer' argument has no purpose (and will - // hopefully be optimized out). - inline Elem *FindOrAddToken(StateId state, int32 frame_plus_one, - BaseFloat tot_cost, Token *backpointer, - bool *changed); - - // prunes outgoing links for all tokens in active_toks_[frame] - // it's called by PruneActiveTokens - // all links, that have link_extra_cost > lattice_beam are pruned - // delta is the amount by which the extra_costs must change - // before we set *extra_costs_changed = true. - // If delta is larger, we'll tend to go back less far - // toward the beginning of the file. - // extra_costs_changed is set to true if extra_cost was changed for any token - // links_pruned is set to true if any link in any token was pruned - void PruneForwardLinks(int32 frame_plus_one, bool *extra_costs_changed, - bool *links_pruned, BaseFloat delta); - - // This function computes the final-costs for tokens active on the final - // frame. It outputs to final-costs, if non-NULL, a map from the Token* - // pointer to the final-prob of the corresponding state, for all Tokens - // that correspond to states that have final-probs. This map will be - // empty if there were no final-probs. It outputs to - // final_relative_cost, if non-NULL, the difference between the best - // forward-cost including the final-prob cost, and the best forward-cost - // without including the final-prob cost (this will usually be positive), or - // infinity if there were no final-probs. [c.f. FinalRelativeCost(), which - // outputs this quanitity]. 
It outputs to final_best_cost, if - // non-NULL, the lowest for any token t active on the final frame, of - // forward-cost[t] + final-cost[t], where final-cost[t] is the final-cost in - // the graph of the state corresponding to token t, or the best of - // forward-cost[t] if there were no final-probs active on the final frame. - // You cannot call this after FinalizeDecoding() has been called; in that - // case you should get the answer from class-member variables. - void ComputeFinalCosts(unordered_map *final_costs, - BaseFloat *final_relative_cost, - BaseFloat *final_best_cost) const; - - // PruneForwardLinksFinal is a version of PruneForwardLinks that we call - // on the final frame. If there are final tokens active, it uses - // the final-probs for pruning, otherwise it treats all tokens as final. - void PruneForwardLinksFinal(); - - // Prune away any tokens on this frame that have no forward links. - // [we don't do this in PruneForwardLinks because it would give us - // a problem with dangling pointers]. - // It's called by PruneActiveTokens if any forward links have been pruned - void PruneTokensForFrame(int32 frame_plus_one); - - // Go backwards through still-alive tokens, pruning them if the - // forward+backward cost is more than lat_beam away from the best path. It's - // possible to prove that this is "correct" in the sense that we won't lose - // anything outside of lat_beam, regardless of what happens in the future. - // delta controls when it considers a cost to have changed enough to continue - // going backward and propagating the change. larger delta -> will recurse - // less far. - void PruneActiveTokens(BaseFloat delta); - - /// Gets the weight cutoff. Also counts the active tokens. - BaseFloat GetCutoff(Elem *list_head, size_t *tok_count, - BaseFloat *adaptive_beam, Elem **best_elem); - - /// Processes emitting arcs for one frame. Propagates from prev_toks_ to - /// cur_toks_. Returns the cost cutoff for subsequent ProcessNonemitting() to - /// use. - BaseFloat ProcessEmitting(DecodableInterface *decodable); - - /// Processes nonemitting (epsilon) arcs for one frame. Called after - /// ProcessEmitting() on each frame. The cost cutoff is computed by the - /// preceding ProcessEmitting(). - void ProcessNonemitting(BaseFloat cost_cutoff); - - // HashList defined in ../util/hash-list.h. It actually allows us to maintain - // more than one list (e.g. for current and previous frames), but only one of - // them at a time can be indexed by StateId. It is indexed by frame-index - // plus one, where the frame-index is zero-based, as used in decodable object. - // That is, the emitting probs of frame t are accounted for in tokens at - // toks_[t+1]. The zeroth frame is for nonemitting transition at the start of - // the graph. - HashList toks_; - - std::vector active_toks_; // Lists of tokens, indexed by - // frame (members of TokenList are toks, must_prune_forward_links, - // must_prune_tokens). - std::vector - queue_; // temp variable used in ProcessNonemitting, - std::vector tmp_array_; // used in GetCutoff. - - // fst_ is a pointer to the FST we are decoding from. - const FST *fst_; - // delete_fst_ is true if the pointer fst_ needs to be deleted when this - // object is destroyed. - bool delete_fst_; - - std::vector cost_offsets_; // This contains, for each - // frame, an offset that was added to the acoustic log-likelihoods on that - // frame in order to keep everything in a nice dynamic range i.e. close to - // zero, to reduce roundoff errors. 
- LatticeFasterDecoderConfig config_; - int32 num_toks_; // current total #toks allocated... - bool warned_; - - /// decoding_finalized_ is true if someone called FinalizeDecoding(). [note, - /// calling this is optional]. If true, it's forbidden to decode more. Also, - /// if this is set, then the output of ComputeFinalCosts() is in the next - /// three variables. The reason we need to do this is that after - /// FinalizeDecoding() calls PruneTokensForFrame() for the final frame, some - /// of the tokens on the last frame are freed, so we free the list from toks_ - /// to avoid having dangling pointers hanging around. - bool decoding_finalized_; - /// For the meaning of the next 3 variables, see the comment for - /// decoding_finalized_ above., and ComputeFinalCosts(). - unordered_map final_costs_; - BaseFloat final_relative_cost_; - BaseFloat final_best_cost_; - - std::shared_ptr context_graph_ = nullptr; - - // There are various cleanup tasks... the toks_ structure contains - // singly linked lists of Token pointers, where Elem is the list type. - // It also indexes them in a hash, indexed by state (this hash is only - // maintained for the most recent frame). toks_.Clear() - // deletes them from the hash and returns the list of Elems. The - // function DeleteElems calls toks_.Delete(elem) for each elem in - // the list, which returns ownership of the Elem to the toks_ structure - // for reuse, but does not delete the Token pointer. The Token pointers - // are reference-counted and are ultimately deleted in PruneTokensForFrame, - // but are also linked together on each frame by their own linked-list, - // using the "next" pointer. We delete them manually. - void DeleteElems(Elem *list); - - // This function takes a singly linked list of tokens for a single frame, and - // outputs a list of them in topological order (it will crash if no such order - // can be found, which will typically be due to decoding graphs with epsilon - // cycles, which are not allowed). Note: the output list may contain NULLs, - // which the caller should pass over; it just happens to be more efficient for - // the algorithm to output a list that contains NULLs. - static void TopSortTokens(Token *tok_list, - std::vector *topsorted_list); - - void ClearActiveTokens(); - - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeFasterDecoderTpl); -}; - -typedef LatticeFasterDecoderTpl - LatticeFasterDecoder; - -} // end namespace kaldi. - -#endif // KALDI_DECODER_LATTICE_FASTER_DECODER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/decoder/lattice-faster-online-decoder.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/decoder/lattice-faster-online-decoder.cc deleted file mode 100644 index 2345b4d129ff905784762e973bad279f2fb55d31..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/decoder/lattice-faster-online-decoder.cc +++ /dev/null @@ -1,278 +0,0 @@ -// decoder/lattice-faster-online-decoder.cc - -// Copyright 2009-2012 Microsoft Corporation Mirko Hannemann -// 2013-2014 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2014 IMSL, PKU-HKUST (author: Wei Shi) -// 2018 Zhehuai Chen - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
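The GetCutoff member described in the header above tightens the pruning beam whenever more than max_active tokens are alive, using std::nth_element to take the max_active-th smallest cost as the cutoff. A rough self-contained sketch of that idea, with the min_active branch omitted and all names (`GetCutoffSketch`, its parameters) chosen here purely for illustration:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Compute a pruning cutoff from per-token costs: best_cost + beam, tightened
// to the max_active-th smallest cost when too many tokens are active.
// adaptive_beam is reported back so the emitting pass can prune consistently.
double GetCutoffSketch(std::vector<double> costs, int max_active,
                       double beam, double beam_delta, double *adaptive_beam) {
  double best = *std::min_element(costs.begin(), costs.end());
  double beam_cutoff = best + beam;
  if (static_cast<int>(costs.size()) > max_active) {
    // Partially sort so costs[max_active] is the (max_active+1)-th smallest.
    std::nth_element(costs.begin(), costs.begin() + max_active, costs.end());
    double max_active_cutoff = costs[max_active];
    if (max_active_cutoff < beam_cutoff) {  // max_active is tighter than beam
      *adaptive_beam = max_active_cutoff - best + beam_delta;
      return max_active_cutoff;
    }
  }
  *adaptive_beam = beam;
  return beam_cutoff;
}

int main() {
  std::vector<double> costs = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
  double adaptive_beam = 0.0;
  double cutoff = GetCutoffSketch(costs, /*max_active=*/3, /*beam=*/16.0,
                                  /*beam_delta=*/0.5, &adaptive_beam);
  std::printf("cutoff=%.1f adaptive_beam=%.1f\n", cutoff, adaptive_beam);
  // With these numbers the max_active constraint wins: cutoff 4.0, beam 3.5.
  return 0;
}
```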
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -// see note at the top of lattice-faster-decoder.cc, about how to maintain this -// file in sync with lattice-faster-decoder.cc - -#include -#include -#include -#include - -#include "decoder/lattice-faster-online-decoder.h" - -namespace kaldi { - -template -bool LatticeFasterOnlineDecoderTpl::TestGetBestPath( - bool use_final_probs) const { - Lattice lat1; - { - Lattice raw_lat; - this->GetRawLattice(&raw_lat, use_final_probs); - ShortestPath(raw_lat, &lat1); - } - Lattice lat2; - GetBestPath(&lat2, use_final_probs); - BaseFloat delta = 0.1; - int32 num_paths = 1; - if (!fst::RandEquivalent(lat1, lat2, num_paths, delta, rand())) { - KALDI_WARN << "Best-path test failed"; - return false; - } else { - return true; - } -} - -// Outputs an FST corresponding to the single best path through the lattice. -template -bool LatticeFasterOnlineDecoderTpl::GetBestPath( - Lattice *olat, bool use_final_probs) const { - olat->DeleteStates(); - BaseFloat final_graph_cost; - BestPathIterator iter = BestPathEnd(use_final_probs, &final_graph_cost); - if (iter.Done()) return false; // would have printed warning. - StateId state = olat->AddState(); - olat->SetFinal(state, LatticeWeight(final_graph_cost, 0.0)); - while (!iter.Done()) { - LatticeArc arc; - iter = TraceBackBestPath(iter, &arc); - arc.nextstate = state; - StateId new_state = olat->AddState(); - olat->AddArc(new_state, arc); - state = new_state; - } - olat->SetStart(state); - return true; -} - -template -typename LatticeFasterOnlineDecoderTpl::BestPathIterator -LatticeFasterOnlineDecoderTpl::BestPathEnd( - bool use_final_probs, BaseFloat *final_cost_out) const { - if (this->decoding_finalized_ && !use_final_probs) - KALDI_ERR << "You cannot call FinalizeDecoding() and then call " - << "BestPathEnd() with use_final_probs == false"; - KALDI_ASSERT(this->NumFramesDecoded() > 0 && - "You cannot call BestPathEnd if no frames were decoded."); - - unordered_map final_costs_local; - - const unordered_map &final_costs = - (this->decoding_finalized_ ? this->final_costs_ : final_costs_local); - if (!this->decoding_finalized_ && use_final_probs) - this->ComputeFinalCosts(&final_costs_local, NULL, NULL); - - // Singly linked list of tokens on last frame (access list through "next" - // pointer). - BaseFloat best_cost = std::numeric_limits::infinity(); - BaseFloat best_final_cost = 0; - Token *best_tok = NULL; - for (Token *tok = this->active_toks_.back().toks; tok != NULL; - tok = tok->next) { - BaseFloat cost = tok->tot_cost, final_cost = 0.0; - if (use_final_probs && !final_costs.empty()) { - // if we are instructed to use final-probs, and any final tokens were - // active on final frame, include the final-prob in the cost of the token. 
- typename unordered_map::const_iterator iter = - final_costs.find(tok); - if (iter != final_costs.end()) { - final_cost = iter->second; - cost += final_cost; - } else { - cost = std::numeric_limits::infinity(); - } - } - if (cost < best_cost) { - best_cost = cost; - best_tok = tok; - best_final_cost = final_cost; - } - } - if (best_tok == - NULL) { // this should not happen, and is likely a code error or - // caused by infinities in likelihoods, but I'm not making - // it a fatal error for now. - KALDI_WARN << "No final token found."; - } - if (final_cost_out) *final_cost_out = best_final_cost; - return BestPathIterator(best_tok, this->NumFramesDecoded() - 1); -} - -template -typename LatticeFasterOnlineDecoderTpl::BestPathIterator -LatticeFasterOnlineDecoderTpl::TraceBackBestPath(BestPathIterator iter, - LatticeArc *oarc) const { - KALDI_ASSERT(!iter.Done() && oarc != NULL); - Token *tok = static_cast(iter.tok); - int32 cur_t = iter.frame, step_t = 0; - if (tok->backpointer != NULL) { - // retrieve the correct forward link(with the best link cost) - BaseFloat best_cost = std::numeric_limits::infinity(); - ForwardLinkT *link; - for (link = tok->backpointer->links; link != NULL; link = link->next) { - if (link->next_tok == tok) { // this is a link to "tok" - BaseFloat graph_cost = link->graph_cost, - acoustic_cost = link->acoustic_cost; - BaseFloat cost = graph_cost + acoustic_cost; - if (cost < best_cost) { - oarc->ilabel = link->ilabel; - oarc->olabel = link->olabel; - if (link->ilabel != 0) { - KALDI_ASSERT(static_cast(cur_t) < - this->cost_offsets_.size()); - acoustic_cost -= this->cost_offsets_[cur_t]; - step_t = -1; - } else { - step_t = 0; - } - oarc->weight = LatticeWeight(graph_cost, acoustic_cost); - best_cost = cost; - } - } - } - if (link == NULL && - best_cost == - std::numeric_limits::infinity()) { // Did not find - // correct link. - KALDI_ERR << "Error tracing best-path back (likely " - << "bug in token-pruning algorithm)"; - } - } else { - oarc->ilabel = 0; - oarc->olabel = 0; - oarc->weight = LatticeWeight::One(); // zero costs. - } - return BestPathIterator(tok->backpointer, cur_t + step_t); -} - -template -bool LatticeFasterOnlineDecoderTpl::GetRawLatticePruned( - Lattice *ofst, bool use_final_probs, BaseFloat beam) const { - typedef LatticeArc Arc; - typedef Arc::StateId StateId; - typedef Arc::Weight Weight; - typedef Arc::Label Label; - - // Note: you can't use the old interface (Decode()) if you want to - // get the lattice with use_final_probs = false. You'd have to do - // InitDecoding() and then AdvanceDecoding(). - if (this->decoding_finalized_ && !use_final_probs) - KALDI_ERR << "You cannot call FinalizeDecoding() and then call " - << "GetRawLattice() with use_final_probs == false"; - - unordered_map final_costs_local; - - const unordered_map &final_costs = - (this->decoding_finalized_ ? this->final_costs_ : final_costs_local); - if (!this->decoding_finalized_ && use_final_probs) - this->ComputeFinalCosts(&final_costs_local, NULL, NULL); - - ofst->DeleteStates(); - // num-frames plus one (since frames are one-based, and we have - // an extra frame for the start-state). - int32 num_frames = this->active_toks_.size() - 1; - KALDI_ASSERT(num_frames > 0); - for (int32 f = 0; f <= num_frames; f++) { - if (this->active_toks_[f].toks == NULL) { - KALDI_WARN << "No tokens active on frame " << f - << ": not producing lattice.\n"; - return false; - } - } - unordered_map tok_map; - std::queue > tok_queue; - // First initialize the queue and states. 
Put the initial state on the queue; - // this is the last token in the list active_toks_[0].toks. - for (Token *tok = this->active_toks_[0].toks; tok != NULL; tok = tok->next) { - if (tok->next == NULL) { - tok_map[tok] = ofst->AddState(); - ofst->SetStart(tok_map[tok]); - std::pair tok_pair(tok, 0); // #frame = 0 - tok_queue.push(tok_pair); - } - } - - // Next create states for "good" tokens - while (!tok_queue.empty()) { - std::pair cur_tok_pair = tok_queue.front(); - tok_queue.pop(); - Token *cur_tok = cur_tok_pair.first; - int32 cur_frame = cur_tok_pair.second; - KALDI_ASSERT(cur_frame >= 0 && cur_frame <= this->cost_offsets_.size()); - - typename unordered_map::const_iterator iter = - tok_map.find(cur_tok); - KALDI_ASSERT(iter != tok_map.end()); - StateId cur_state = iter->second; - - for (ForwardLinkT *l = cur_tok->links; l != NULL; l = l->next) { - Token *next_tok = l->next_tok; - if (next_tok->extra_cost < beam) { - // so both the current and the next token are good; create the arc - int32 next_frame = l->ilabel == 0 ? cur_frame : cur_frame + 1; - StateId nextstate; - if (tok_map.find(next_tok) == tok_map.end()) { - nextstate = tok_map[next_tok] = ofst->AddState(); - tok_queue.push(std::pair(next_tok, next_frame)); - } else { - nextstate = tok_map[next_tok]; - } - BaseFloat cost_offset = - (l->ilabel != 0 ? this->cost_offsets_[cur_frame] : 0); - Arc arc(l->ilabel, l->olabel, - Weight(l->graph_cost, l->acoustic_cost - cost_offset), - nextstate); - ofst->AddArc(cur_state, arc); - } - } - if (cur_frame == num_frames) { - if (use_final_probs && !final_costs.empty()) { - typename unordered_map::const_iterator iter = - final_costs.find(cur_tok); - if (iter != final_costs.end()) - ofst->SetFinal(cur_state, LatticeWeight(iter->second, 0)); - } else { - ofst->SetFinal(cur_state, LatticeWeight::One()); - } - } - } - return (ofst->NumStates() != 0); -} - -// Instantiate the template for the FST types that we'll need. -template class LatticeFasterOnlineDecoderTpl >; -template class LatticeFasterOnlineDecoderTpl >; -template class LatticeFasterOnlineDecoderTpl >; - -} // end namespace kaldi. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/decoder/lattice-faster-online-decoder.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/decoder/lattice-faster-online-decoder.h deleted file mode 100644 index dc50cfa73e6574e9625eda9045c47f674fcbc1e3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/decoder/lattice-faster-online-decoder.h +++ /dev/null @@ -1,131 +0,0 @@ -// decoder/lattice-faster-online-decoder.h - -// Copyright 2009-2013 Microsoft Corporation; Mirko Hannemann; -// 2013-2014 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2018 Zhehuai Chen - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
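The online-decoder code removed above recovers the best path by picking the cheapest token on the final frame (adding its final-prob when one exists) and then walking backpointers. A simplified standalone sketch of that backward walk; `BPTok` and `BestPathLabels` are hypothetical stand-ins for BackpointerToken and the BestPathEnd/TraceBackBestPath pair, and final-probs are folded into tot_cost for brevity.

```cpp
#include <algorithm>
#include <cstdio>
#include <limits>
#include <vector>

// Minimal stand-in for a backpointer token: total cost, the output label on
// the arc that reached it, and the best predecessor token.
struct BPTok {
  double tot_cost;
  int olabel;          // 0 means epsilon (no word emitted on the way in)
  BPTok *backpointer;  // NULL for the start token
};

// Pick the cheapest token on the final frame, then follow backpointers,
// collecting non-epsilon output labels; reverse to get utterance order.
std::vector<int> BestPathLabels(const std::vector<BPTok *> &final_frame) {
  BPTok *best = nullptr;
  double best_cost = std::numeric_limits<double>::infinity();
  for (BPTok *t : final_frame)
    if (t->tot_cost < best_cost) { best_cost = t->tot_cost; best = t; }
  std::vector<int> labels;
  for (BPTok *t = best; t != nullptr; t = t->backpointer)
    if (t->olabel != 0) labels.push_back(t->olabel);
  std::reverse(labels.begin(), labels.end());
  return labels;
}

int main() {
  // Chain start -> "7" -> eps -> "3", with two candidate end tokens.
  BPTok start{0.0, 0, nullptr}, a{1.0, 7, &start}, b{1.5, 0, &a};
  BPTok end1{2.0, 3, &b}, end2{9.0, 5, &b};
  std::vector<int> labels = BestPathLabels({&end1, &end2});
  for (int l : labels) std::printf("%d ", l);  // prints: 7 3
  std::printf("\n");
  return 0;
}
```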
- -// see note at the top of lattice-faster-decoder.h, about how to maintain this -// file in sync with lattice-faster-decoder.h - -#ifndef KALDI_DECODER_LATTICE_FASTER_ONLINE_DECODER_H_ -#define KALDI_DECODER_LATTICE_FASTER_ONLINE_DECODER_H_ - -#include "decoder/lattice-faster-decoder.h" - -#include - -namespace kaldi { - -/** LatticeFasterOnlineDecoderTpl is as LatticeFasterDecoderTpl but also - supports an efficient way to get the best path (see the function - BestPathEnd()), which is useful in endpointing and in situations where you - might want to frequently access the best path. - - This is only templated on the FST type, since the Token type is required to - be BackpointerToken. Actually it only makes sense to instantiate - LatticeFasterDecoderTpl with Token == BackpointerToken if you do so - indirectly via this child class. - */ -template -class LatticeFasterOnlineDecoderTpl - : public LatticeFasterDecoderTpl { - public: - using Arc = typename FST::Arc; - using Label = typename Arc::Label; - using StateId = typename Arc::StateId; - using Weight = typename Arc::Weight; - using Token = decoder::BackpointerToken; - using ForwardLinkT = decoder::ForwardLink; - - // Instantiate this class once for each thing you have to decode. - // This version of the constructor does not take ownership of - // 'fst'. - LatticeFasterOnlineDecoderTpl( - const FST &fst, const LatticeFasterDecoderConfig &config, - const std::shared_ptr &context_graph) - : LatticeFasterDecoderTpl(fst, config, context_graph) {} - - // This version of the initializer takes ownership of 'fst', and will delete - // it when this object is destroyed. - LatticeFasterOnlineDecoderTpl(const LatticeFasterDecoderConfig &config, - FST *fst) - : LatticeFasterDecoderTpl(config, fst) {} - - struct BestPathIterator { - void *tok; - int32 frame; - // note, "frame" is the frame-index of the frame you'll get the - // transition-id for next time, if you call TraceBackBestPath on this - // iterator (assuming it's not an epsilon transition). Note that this - // is one less than you might reasonably expect, e.g. it's -1 for - // the nonemitting transitions before the first frame. - BestPathIterator(void *t, int32 f) : tok(t), frame(f) {} - bool Done() const { return tok == NULL; } - }; - - /// Outputs an FST corresponding to the single best path through the lattice. - /// This is quite efficient because it doesn't get the entire raw lattice and - /// find the best path through it; instead, it uses the BestPathEnd and - /// BestPathIterator so it basically traces it back through the lattice. - /// Returns true if result is nonempty (using the return status is deprecated, - /// it will become void). If "use_final_probs" is true AND we reached the - /// final-state of the graph then it will include those as final-probs, else - /// it will treat all final-probs as one. - bool GetBestPath(Lattice *ofst, bool use_final_probs = true) const; - - /// This function does a self-test of GetBestPath(). Returns true on - /// success; returns false and prints a warning on failure. - bool TestGetBestPath(bool use_final_probs = true) const; - - /// This function returns an iterator that can be used to trace back - /// the best path. If use_final_probs == true and at least one final state - /// survived till the end, it will use the final-probs in working out the best - /// final Token, and will output the final cost to *final_cost (if non-NULL), - /// else it will use only the forward likelihood, and will put zero in - /// *final_cost (if non-NULL). 
- /// Requires that NumFramesDecoded() > 0. - BestPathIterator BestPathEnd(bool use_final_probs, - BaseFloat *final_cost = NULL) const; - - /// This function can be used in conjunction with BestPathEnd() to trace back - /// the best path one link at a time (e.g. this can be useful in endpoint - /// detection). By "link" we mean a link in the graph; not all links cross - /// frame boundaries, but each time you see a nonzero ilabel you can interpret - /// that as a frame. The return value is the updated iterator. It outputs - /// the ilabel and olabel, and the (graph and acoustic) weight to the "arc" - /// pointer, while leaving its "nextstate" variable unchanged. - BestPathIterator TraceBackBestPath(BestPathIterator iter, - LatticeArc *arc) const; - - /// Behaves the same as GetRawLattice but only processes tokens whose - /// extra_cost is smaller than the best-cost plus the specified beam. - /// It is only worthwhile to call this function if beam is less than - /// the lattice_beam specified in the config; otherwise, it would - /// return essentially the same thing as GetRawLattice, but more slowly. - bool GetRawLatticePruned(Lattice *ofst, bool use_final_probs, - BaseFloat beam) const; - - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeFasterOnlineDecoderTpl); -}; - -typedef LatticeFasterOnlineDecoderTpl LatticeFasterOnlineDecoder; - -} // end namespace kaldi. - -#endif // KALDI_DECODER_LATTICE_FASTER_ONLINE_DECODER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/fstbin/fstaddselfloops.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/fstbin/fstaddselfloops.cc deleted file mode 100644 index 145bf006f2324136c5fea4a8d0012a7a4126c646..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/fstbin/fstaddselfloops.cc +++ /dev/null @@ -1,100 +0,0 @@ -// fstbin/fstaddselfloops.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
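GetRawLatticePruned, declared just above, only expands tokens whose extra_cost beats the supplied beam, growing the output lattice breadth-first from the start token. A simplified sketch of that expansion pattern on toy types (`PTok`/`PLink` and `CountPrunedStates` are illustrative only; the sketch merely counts the lattice states such an expansion would create, rather than building an FST):

```cpp
#include <cstdio>
#include <queue>
#include <unordered_map>
#include <vector>

// Toy stand-ins for a token and its forward links.
struct PTok;
struct PLink { PTok *next_tok; double cost; };
struct PTok  { double extra_cost; std::vector<PLink> links; };

// Start from the initial token and only follow links into tokens whose
// extra_cost is below the beam, visiting each "good" token once (BFS).
int CountPrunedStates(PTok *start, double beam) {
  std::unordered_map<PTok *, int> state_of;  // token -> assigned state id
  std::queue<PTok *> q;
  state_of[start] = 0;
  q.push(start);
  while (!q.empty()) {
    PTok *tok = q.front(); q.pop();
    for (const PLink &l : tok->links) {
      PTok *next = l.next_tok;
      if (next->extra_cost < beam && !state_of.count(next)) {
        int id = static_cast<int>(state_of.size());
        state_of[next] = id;
        q.push(next);
      }
    }
  }
  return static_cast<int>(state_of.size());
}

int main() {
  PTok a{0.0, {}}, b{0.2, {}}, c{5.0, {}}, d{0.1, {}};
  a.links = {{&b, 1.0}, {&c, 2.0}};  // c falls outside a beam of 1.0
  b.links = {{&d, 1.0}};
  std::printf("states kept with beam 1.0: %d\n", CountPrunedStates(&a, 1.0));
  // expect 3 (a, b, d); c's extra_cost of 5.0 exceeds the beam
  return 0;
}
```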
- -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/determinize-star.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/parse-options.h" -#include "util/simple-io-funcs.h" - -/* some test examples: - pushd ~/tmpdir - ( echo 3; echo 4) > in.list - ( echo 5; echo 6) > out.list - ( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstaddselfloops in.list out.list - | fstprint ( echo "0 1 0 1"; echo " 0 2 1 0"; echo "1 0"; echo "2 0"; ) | - fstcompile | fstaddselfloops in.list out.list | fstprint -*/ - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Adds self-loops to states of an FST to propagate disambiguation " - "symbols through it\n" - "They are added on each final state and each state with non-epsilon " - "output symbols\n" - "on at least one arc out of the state. Useful in conjunction with " - "predeterminize\n" - "\n" - "Usage: fstaddselfloops in-disambig-list out-disambig-list [in.fst " - "[out.fst] ]\n" - "E.g: fstaddselfloops in.list out.list < in.fst > withloops.fst\n" - "in.list and out.list are lists of integers, one per line, of the\n" - "same length.\n"; - - ParseOptions po(usage); - po.Read(argc, argv); - - if (po.NumArgs() < 2 || po.NumArgs() > 4) { - po.PrintUsage(); - exit(1); - } - - std::string disambig_in_rxfilename = po.GetArg(1), - disambig_out_rxfilename = po.GetArg(2), - fst_in_filename = po.GetOptArg(3), - fst_out_filename = po.GetOptArg(4); - - VectorFst *fst = ReadFstKaldi(fst_in_filename); - - std::vector disambig_in; - if (!ReadIntegerVectorSimple(disambig_in_rxfilename, &disambig_in)) - KALDI_ERR - << "fstaddselfloops: Could not read disambiguation symbols from " - << kaldi::PrintableRxfilename(disambig_in_rxfilename); - - std::vector disambig_out; - if (!ReadIntegerVectorSimple(disambig_out_rxfilename, &disambig_out)) - KALDI_ERR - << "fstaddselfloops: Could not read disambiguation symbols from " - << kaldi::PrintableRxfilename(disambig_out_rxfilename); - - if (disambig_in.size() != disambig_out.size()) - KALDI_ERR - << "fstaddselfloops: mismatch in size of disambiguation symbols"; - - AddSelfLoops(fst, disambig_in, disambig_out); - - WriteFstKaldi(*fst, fst_out_filename); - - delete fst; - - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/fstbin/fstdeterminizestar.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/fstbin/fstdeterminizestar.cc deleted file mode 100644 index e818143025c0fd5d389c28c77715d65711fe63f1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/fstbin/fstdeterminizestar.cc +++ /dev/null @@ -1,114 +0,0 @@ -// fstbin/fstdeterminizestar.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/determinize-star.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/parse-options.h" -#if !defined(_MSC_VER) && !defined(__APPLE__) -#include // Comment this line and the call to signal below if -// it causes compilation problems. It is only to enable a debugging procedure -// when determinization does not terminate. We are disabling this code if -// compiling on Windows because signal.h is not available there, and on -// MacOS due to a problem with in the initial release of Sierra. -#endif - -/* some test examples: - ( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint - ( echo "0 0 1 0"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint - ( echo "0 0 1 0"; echo "0 1 1 0"; echo "0 0" ) | fstcompile | - fstdeterminizestar | fstprint # this last one fails [correctly]: ( echo "0 0 0 - 1"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint - - cd ~/tmpdir - while true; do - fstrand > 1.fst - fstpredeterminize out.lst 1.fst | fstdeterminizestar | fstrmsymbols out.lst - > 2.fst fstequivalent --random=true 1.fst 2.fst || echo "Test failed" echo -n - "." done - - Test of debugging [with non-determinizable input]: - ( echo " 0 0 1 0 1.0"; echo "0 1 1 0"; echo "1 1 1 0 0"; echo "0 2 2 0"; echo - "2"; echo "1" ) | fstcompile | fstdeterminizestar kill -SIGUSR1 [the process-id - of fstdeterminizestar] # prints out a bunch of debugging output showing the - mess it got itself into. -*/ - -bool debug_location = false; -void signal_handler(int) { debug_location = true; } - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Removes epsilons and determinizes in one step\n" - "\n" - "Usage: fstdeterminizestar [in.fst [out.fst] ]\n" - "\n" - "See also: fstdeterminizelog, lattice-determinize\n"; - - float delta = kDelta; - int max_states = -1; - bool use_log = false; - ParseOptions po(usage); - po.Register("use-log", &use_log, "Determinize in log semiring."); - po.Register("delta", &delta, - "Delta value used to determine equivalence of weights."); - po.Register( - "max-states", &max_states, - "Maximum number of states in determinized FST before it will abort."); - po.Read(argc, argv); - - if (po.NumArgs() > 2) { - po.PrintUsage(); - exit(1); - } - - std::string fst_in_str = po.GetOptArg(1), fst_out_str = po.GetOptArg(2); - - // This enables us to get traceback info from determinization that is - // not seeming to terminate. -#if !defined(_MSC_VER) && !defined(__APPLE__) - signal(SIGUSR1, signal_handler); -#endif - // Normal case: just files. - VectorFst *fst = ReadFstKaldi(fst_in_str); - - ArcSort(fst, ILabelCompare()); // improves speed. 
- if (use_log) { - DeterminizeStarInLog(fst, delta, &debug_location, max_states); - } else { - VectorFst det_fst; - DeterminizeStar(*fst, &det_fst, delta, &debug_location, max_states); - *fst = det_fst; // will do shallow copy and then det_fst goes - // out of scope anyway. - } - WriteFstKaldi(*fst, fst_out_str); - delete fst; - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/fstbin/fstisstochastic.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/fstbin/fstisstochastic.cc deleted file mode 100644 index 468ed0daa7d37cb9a25cf25264f86e48e137b975..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/fstbin/fstisstochastic.cc +++ /dev/null @@ -1,91 +0,0 @@ -// fstbin/fstisstochastic.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/kaldi-io.h" -#include "util/parse-options.h" - -// e.g. of test: -// echo " 0 0" | fstcompile | fstisstochastic -// should return 0 and print "0 0" [meaning, min and -// max weight are one = exp(0)] -// echo " 0 1" | fstcompile | fstisstochastic -// should return 1, not stochastic, and print 1 1 -// (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo "1 0" ) | -// fstcompile | fstisstochastic should return 0, stochastic; it prints "0 -// -1.78e-07" for me (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo -// "1 0" ) | fstcompile | fstisstochastic --test-in-log=false should return 1, -// not stochastic in tropical; it prints "0 0.693147" for me (echo "0 0 0 0 0 "; -// echo "0 1 0 0 0 "; echo "1 0" ) | fstcompile | fstisstochastic -// --test-in-log=false should return 0, stochastic in tropical; it prints "0 0" -// for me (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo "1 0" ) | -// fstcompile | fstisstochastic --test-in-log=false --delta=1 returns 0 even -// though not stochastic because we gave it an absurdly large delta. 
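The shell examples above exercise the same check that this binary performs; programmatically it reduces to one call into Kaldi's fstext utilities. A rough sketch follows (the FST path is hypothetical, and the call simply mirrors the code in `main()` below rather than documenting a separate API).

```cpp
#include <iostream>
#include "fst/fstlib.h"
#include "fstext/fstext-utils.h"
#include "fstext/kaldi-fst-io.h"

int main() {
  // Test stochasticity in the log semiring: per state, the log-sum of arc
  // weights plus the final weight should be ~0 (within delta).
  fst::Fst<fst::StdArc> *g = fst::ReadFstKaldiGeneric("G.fst");  // hypothetical path
  fst::StdArc::Weight min_w, max_w;
  bool stochastic = fst::IsStochasticFstInLog(*g, /*delta=*/0.01, &min_w, &max_w);
  std::cout << min_w.Value() << " " << max_w.Value() << "\n";
  delete g;
  return stochastic ? 0 : 1;  // same exit-code convention as the tool
}
```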
- -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Checks whether an FST is stochastic and exits with success if so.\n" - "Prints out maximum error (in log units).\n" - "\n" - "Usage: fstisstochastic [ in.fst ]\n"; - - float delta = 0.01; - bool test_in_log = true; - - ParseOptions po(usage); - po.Register("delta", &delta, "Maximum error to accept."); - po.Register("test-in-log", &test_in_log, - "Test stochasticity in log semiring."); - po.Read(argc, argv); - - if (po.NumArgs() > 1) { - po.PrintUsage(); - exit(1); - } - - std::string fst_in_filename = po.GetOptArg(1); - - Fst *fst = ReadFstKaldiGeneric(fst_in_filename); - - bool ans; - StdArc::Weight min, max; - if (test_in_log) - ans = IsStochasticFstInLog(*fst, delta, &min, &max); - else - ans = IsStochasticFst(*fst, delta, &min, &max); - - std::cout << min.Value() << " " << max.Value() << '\n'; - delete fst; - if (ans) - return 0; // success; - else - return 1; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/fstbin/fstminimizeencoded.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/fstbin/fstminimizeencoded.cc deleted file mode 100644 index ae9ca6d75abe67d9a195572dd6d91ec3c7b44851..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/fstbin/fstminimizeencoded.cc +++ /dev/null @@ -1,74 +0,0 @@ -// fstbin/fstminimizeencoded.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/determinize-star.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/kaldi-io.h" -#include "util/parse-options.h" -#include "util/text-utils.h" - -/* some test examples: - ( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstminimizeencoded | fstprint - ( echo "0 1 0 0"; echo " 0 2 0 0"; echo "1 0"; echo "2 0"; ) | fstcompile | - fstminimizeencoded | fstprint -*/ - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Minimizes FST after encoding [similar to fstminimize, but no " - "weight-pushing]\n" - "\n" - "Usage: fstminimizeencoded [in.fst [out.fst] ]\n"; - - float delta = kDelta; - ParseOptions po(usage); - po.Register("delta", &delta, - "Delta likelihood used for quantization of weights"); - po.Read(argc, argv); - - if (po.NumArgs() > 2) { - po.PrintUsage(); - exit(1); - } - - std::string fst_in_filename = po.GetOptArg(1), - fst_out_filename = po.GetOptArg(2); - - VectorFst *fst = ReadFstKaldi(fst_in_filename); - - MinimizeEncoded(fst, delta); - - WriteFstKaldi(*fst, fst_out_filename); - - delete fst; - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/fstbin/fsttablecompose.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/fstbin/fsttablecompose.cc deleted file mode 100644 index bdd476da78b8cb8823c60abf33b5278e05bfd92c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/fstbin/fsttablecompose.cc +++ /dev/null @@ -1,133 +0,0 @@ -// fstbin/fsttablecompose.cc - -// Copyright 2009-2011 Microsoft Corporation -// 2013 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "fstext/table-matcher.h" -#include "util/parse-options.h" - -/* - cd ~/tmpdir - while true; do - fstrand | fstarcsort --sort_type=olabel > 1.fst; fstrand | fstarcsort - > 2.fst fstcompose 1.fst 2.fst > 3a.fst fsttablecompose 1.fst 2.fst > 3b.fst - fstequivalent --random=true 3a.fst 3b.fst || echo "Test failed" - echo -n "." - done - -*/ - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - /* - fsttablecompose should always give equivalent results to compose, - but it is more efficient for certain kinds of inputs. 
- In particular, it is useful when, say, the left FST has states - that typically either have epsilon olabels, or - one transition out for each of the possible symbols (as the - olabel). The same with the input symbols of the right-hand FST - is possible. - */ - - const char *usage = - "Composition algorithm [between two FSTs of standard type, in " - "tropical\n" - "semiring] that is more efficient for certain cases-- in particular,\n" - "where one of the FSTs (the left one, if --match-side=left) has large\n" - "out-degree\n" - "\n" - "Usage: fsttablecompose (fst1-rxfilename|fst1-rspecifier) " - "(fst2-rxfilename|fst2-rspecifier) [(out-rxfilename|out-rspecifier)]\n"; - - ParseOptions po(usage); - - TableComposeOptions opts; - std::string match_side = "left"; - std::string compose_filter = "sequence"; - - po.Register("connect", &opts.connect, "If true, trim FST before output."); - po.Register("match-side", &match_side, - "Side of composition to do table " - "match, one of: \"left\" or \"right\"."); - po.Register("compose-filter", &compose_filter, - "Composition filter to use, " - "one of: \"alt_sequence\", \"auto\", \"match\", \"sequence\""); - - po.Read(argc, argv); - - if (match_side == "left") { - opts.table_match_type = MATCH_OUTPUT; - } else if (match_side == "right") { - opts.table_match_type = MATCH_INPUT; - } else { - KALDI_ERR << "Invalid match-side option: " << match_side; - } - - if (compose_filter == "alt_sequence") { - opts.filter_type = ALT_SEQUENCE_FILTER; - } else if (compose_filter == "auto") { - opts.filter_type = AUTO_FILTER; - } else if (compose_filter == "match") { - opts.filter_type = MATCH_FILTER; - } else if (compose_filter == "sequence") { - opts.filter_type = SEQUENCE_FILTER; - } else { - KALDI_ERR << "Invalid compose-filter option: " << compose_filter; - } - - if (po.NumArgs() < 2 || po.NumArgs() > 3) { - po.PrintUsage(); - exit(1); - } - - std::string fst1_in_str = po.GetArg(1), fst2_in_str = po.GetArg(2), - fst_out_str = po.GetOptArg(3); - - VectorFst *fst1 = ReadFstKaldi(fst1_in_str); - - VectorFst *fst2 = ReadFstKaldi(fst2_in_str); - - // Checks if is olabel sorted and is ilabel sorted. - if (fst1->Properties(fst::kOLabelSorted, true) == 0) { - KALDI_WARN << "The first FST is not olabel sorted."; - } - if (fst2->Properties(fst::kILabelSorted, true) == 0) { - KALDI_WARN << "The second FST is not ilabel sorted."; - } - - VectorFst composed_fst; - - TableCompose(*fst1, *fst2, &composed_fst, opts); - - delete fst1; - delete fst2; - - WriteFstKaldi(composed_fst, fst_out_str); - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/fstext/determinize-lattice-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/fstext/determinize-lattice-inl.h deleted file mode 100644 index 0bfbc8f41c7e439b1fac037f60490e04fdcbdd8b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/fstext/determinize-lattice-inl.h +++ /dev/null @@ -1,1357 +0,0 @@ -// fstext/determinize-lattice-inl.h - -// Copyright 2009-2012 Microsoft Corporation -// 2012-2013 Johns Hopkins University (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
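To make the efficiency note above concrete, here is roughly how the table-driven composition is invoked from C++. This is a sketch only: the file names are hypothetical, and it mirrors the tool's default `--match-side=left` behaviour (table match on the left FST's output labels).

```cpp
#include "fst/fstlib.h"
#include "fstext/kaldi-fst-io.h"
#include "fstext/table-matcher.h"

int main() {
  // Left FST must be olabel-sorted, right FST ilabel-sorted (as the tool warns).
  fst::VectorFst<fst::StdArc> *fst1 = fst::ReadFstKaldi("left.fst");   // hypothetical
  fst::VectorFst<fst::StdArc> *fst2 = fst::ReadFstKaldi("right.fst");  // hypothetical
  fst::ArcSort(fst1, fst::OLabelCompare<fst::StdArc>());
  fst::ArcSort(fst2, fst::ILabelCompare<fst::StdArc>());

  fst::TableComposeOptions opts;
  opts.table_match_type = fst::MATCH_OUTPUT;  // table-match on the left side
  fst::VectorFst<fst::StdArc> composed;
  fst::TableCompose(*fst1, *fst2, &composed, opts);

  fst::WriteFstKaldi(composed, "composed.fst");
  delete fst1;
  delete fst2;
  return 0;
}
```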
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_FSTEXT_DETERMINIZE_LATTICE_INL_H_ -#define KALDI_FSTEXT_DETERMINIZE_LATTICE_INL_H_ -// Do not include this file directly. It is included by determinize-lattice.h - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace fst { - -// This class maps back and forth from/to integer id's to sequences of strings. -// used in determinization algorithm. It is constructed in such a way that -// finding the string-id of the successor of (string, next-label) has constant -// time. - -// Note: class IntType, typically int32, is the type of the element in the -// string (typically a template argument of the CompactLatticeWeightTpl). - -template -class LatticeStringRepository { - public: - struct Entry { - const Entry *parent; // NULL for empty string. - IntType i; - inline bool operator==(const Entry &other) const { - return (parent == other.parent && i == other.i); - } - Entry() {} - Entry(const Entry &e) : parent(e.parent), i(e.i) {} - }; - // Note: all Entry* pointers returned in function calls are - // owned by the repository itself, not by the caller! - - // Interface guarantees empty string is NULL. - inline const Entry *EmptyString() { return NULL; } - - // Returns string of "parent" with i appended. Pointer - // owned by repository - const Entry *Successor(const Entry *parent, IntType i) { - new_entry_->parent = parent; - new_entry_->i = i; - - std::pair pr = set_.insert(new_entry_); - if (pr.second) { // Was successfully inserted (was not there). We need to - // replace the element we inserted, which resides on the - // stack, with one from the heap. - const Entry *ans = new_entry_; - new_entry_ = new Entry(); - return ans; - } else { // Was not inserted because an equivalent Entry already - // existed. - return *pr.first; - } - } - - const Entry *Concatenate(const Entry *a, const Entry *b) { - if (a == NULL) - return b; - else if (b == NULL) - return a; - std::vector v; - ConvertToVector(b, &v); - const Entry *ans = a; - for (size_t i = 0; i < v.size(); i++) ans = Successor(ans, v[i]); - return ans; - } - const Entry *CommonPrefix(const Entry *a, const Entry *b) { - std::vector a_vec, b_vec; - ConvertToVector(a, &a_vec); - ConvertToVector(b, &b_vec); - const Entry *ans = NULL; - for (size_t i = 0; - i < a_vec.size() && i < b_vec.size() && a_vec[i] == b_vec[i]; i++) - ans = Successor(ans, a_vec[i]); - return ans; - } - - // removes any elements from b that are not part of - // a common prefix with a. - void ReduceToCommonPrefix(const Entry *a, std::vector *b) { - size_t a_size = Size(a), b_size = b->size(); - while (a_size > b_size) { - a = a->parent; - a_size--; - } - if (b_size > a_size) b_size = a_size; - typename std::vector::iterator b_begin = b->begin(); - while (a_size != 0) { - if (a->i != *(b_begin + a_size - 1)) b_size = a_size - 1; - a = a->parent; - a_size--; - } - if (b_size != b->size()) b->resize(b_size); - } - - // removes the first n elements of a. 
- const Entry *RemovePrefix(const Entry *a, size_t n) { - if (n == 0) return a; - std::vector a_vec; - ConvertToVector(a, &a_vec); - assert(a_vec.size() >= n); - const Entry *ans = NULL; - for (size_t i = n; i < a_vec.size(); i++) ans = Successor(ans, a_vec[i]); - return ans; - } - - // Returns true if a is a prefix of b. If a is prefix of b, - // time taken is |b| - |a|. Else, time taken is |b|. - bool IsPrefixOf(const Entry *a, const Entry *b) const { - if (a == NULL) return true; // empty string prefix of all. - if (a == b) return true; - if (b == NULL) return false; - return IsPrefixOf(a, b->parent); - } - - inline size_t Size(const Entry *entry) const { - size_t ans = 0; - while (entry != NULL) { - ans++; - entry = entry->parent; - } - return ans; - } - - void ConvertToVector(const Entry *entry, std::vector *out) const { - size_t length = Size(entry); - out->resize(length); - if (entry != NULL) { - typename std::vector::reverse_iterator iter = out->rbegin(); - while (entry != NULL) { - *iter = entry->i; - entry = entry->parent; - ++iter; - } - } - } - - const Entry *ConvertFromVector(const std::vector &vec) { - const Entry *e = NULL; - for (size_t i = 0; i < vec.size(); i++) e = Successor(e, vec[i]); - return e; - } - - LatticeStringRepository() { new_entry_ = new Entry; } - - void Destroy() { - for (typename SetType::iterator iter = set_.begin(); iter != set_.end(); - ++iter) - delete *iter; - SetType tmp; - tmp.swap(set_); - if (new_entry_) { - delete new_entry_; - new_entry_ = NULL; - } - } - - // Rebuild will rebuild this object, guaranteeing only - // to preserve the Entry values that are in the vector pointed - // to (this list does not have to be unique). The point of - // this is to save memory. - void Rebuild(const std::vector &to_keep) { - SetType tmp_set; - for (typename std::vector::const_iterator iter = - to_keep.begin(); - iter != to_keep.end(); ++iter) - RebuildHelper(*iter, &tmp_set); - // Now delete all elems not in tmp_set. - for (typename SetType::iterator iter = set_.begin(); iter != set_.end(); - ++iter) { - if (tmp_set.count(*iter) == 0) - delete (*iter); // delete the Entry; not needed. - } - set_.swap(tmp_set); - } - - ~LatticeStringRepository() { Destroy(); } - int32 MemSize() const { - return set_.size() * sizeof(Entry) * 2; // this is a lower bound - // on the size this structure might take. - } - - private: - class EntryKey { // Hash function object. - public: - inline size_t operator()(const Entry *entry) const { - size_t prime = 49109; - return static_cast(entry->i) + - prime * reinterpret_cast(entry->parent); - } - }; - class EntryEqual { - public: - inline bool operator()(const Entry *e1, const Entry *e2) const { - return (*e1 == *e2); - } - }; - typedef std::unordered_set SetType; - - void RebuildHelper(const Entry *to_add, SetType *tmp_set) { - while (true) { - if (to_add == NULL) return; - typename SetType::iterator iter = tmp_set->find(to_add); - if (iter == tmp_set->end()) { // not in tmp_set. - tmp_set->insert(to_add); - to_add = to_add->parent; // and loop. - } else { - return; - } - } - } - - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeStringRepository); - Entry *new_entry_; // We always have a pre-allocated Entry ready to use, - // to avoid unnecessary news and deletes. - SetType set_; -}; - -// class LatticeDeterminizer is templated on the same types that -// CompactLatticeWeight is templated on: the base weight (Weight), typically -// LatticeWeightTpl etc. but could also be e.g. 
TropicalWeight, and the -// IntType, typically int32, used for the output symbols in the compact -// representation of strings [note: the output symbols would usually be -// p.d.f. id's in the anticipated use of this code] It has a special requirement -// on the Weight type: that there should be a Compare function on the weights -// such that Compare(w1, w2) returns -1 if w1 < w2, 0 if w1 == w2, and +1 if w1 -// > w2. This requires that there be a total order on the weights. - -template -class LatticeDeterminizer { - public: - // Output to Gallic acceptor (so the strings go on weights, and there is a 1-1 - // correspondence between our states and the states in ofst. If destroy == - // true, release memory as we go (but we cannot output again). - - typedef CompactLatticeWeightTpl CompactWeight; - typedef ArcTpl - CompactArc; // arc in compact, acceptor form of lattice - typedef ArcTpl Arc; // arc in non-compact version of lattice - - // Output to standard FST with CompactWeightTpl as its weight type - // (the weight stores the original output-symbol strings). If destroy == - // true, release memory as we go (but we cannot output again). - void Output(MutableFst *ofst, bool destroy = true) { - assert(determinized_); - typedef typename Arc::StateId StateId; - StateId nStates = static_cast(output_arcs_.size()); - if (destroy) FreeMostMemory(); - ofst->DeleteStates(); - ofst->SetStart(kNoStateId); - if (nStates == 0) { - return; - } - for (StateId s = 0; s < nStates; s++) { - OutputStateId news = ofst->AddState(); - assert(news == s); - } - ofst->SetStart(0); - // now process transitions. - for (StateId this_state = 0; this_state < nStates; this_state++) { - std::vector &this_vec(output_arcs_[this_state]); - typename std::vector::const_iterator iter = this_vec.begin(), - end = this_vec.end(); - - for (; iter != end; ++iter) { - const TempArc &temp_arc(*iter); - CompactArc new_arc; - std::vector is not treated as epsilon, create a common end state for - // all transitions accepting the , since they do not back off. This small - // optimization saves about 2% states in an average grammar. - if (sub_eps_ == 0) { - eos_state_ = fst_->AddState(); - fst_->SetFinal(eos_state_, 0); - } -} - -template -void ArpaLmCompilerImpl::ConsumeNGram(const NGram& ngram, - bool is_highest) { - // Generally, we do the following. Suppose we are adding an n-gram "A B - // C". Then find the node for "A B", add a new node for "A B C", and connect - // them with the arc accepting "C" with the specified weight. Also, add a - // backoff arc from the new "A B C" node to its backoff state "B C". - // - // Two notable exceptions are the highest order n-grams, and final n-grams. - // - // When adding a highest order n-gram (e. g., our "A B C" is in a 3-gram LM), - // the following optimization is performed. There is no point adding a node - // for "A B C" with a "C" arc from "A B", since there will be no other - // arcs ingoing to this node, and an epsilon backoff arc into the backoff - // model "B C", with the weight of \bar{1}. To save a node, create an arc - // accepting "C" directly from "A B" to "B C". This saves as many nodes - // as there are the highest order n-grams, which is typically about half - // the size of a large 3-gram model. - // - // Indeed, this does not apply to n-grams ending in EOS, since they do not - // back off. These are special, as they do not have a back-off state, and - // the node for "(..anything..) " is always final. 
These are handled - // in one of the two possible ways, If symbols and are being - // replaced by epsilons, neither node nor arc is created, and the logprob - // of the n-gram is applied to its source node as final weight. If and - // are preserved, then a special final node for is allocated and - // used as the destination of the "" acceptor arc. - HistKey heads(ngram.words.begin(), ngram.words.end() - 1); - typename HistoryMap::iterator source_it = history_.find(heads); - if (source_it == history_.end()) { - // There was no "A B", therefore the probability of "A B C" is zero. - // Print a warning and discard current n-gram. - if (parent_->ShouldWarn()) - KALDI_WARN << parent_->LineReference() - << " skipped: no parent (n-1)-gram exists"; - return; - } - - StateId source = source_it->second; - StateId dest; - Symbol sym = ngram.words.back(); - float weight = -ngram.logprob; - if (sym == sub_eps_ || sym == 0) { - KALDI_ERR << " or disambiguation symbol " << sym - << "found in the ARPA file. "; - } - if (sym == eos_symbol_) { - if (sub_eps_ == 0) { - // Keep as a real symbol when not substituting. - dest = eos_state_; - } else { - // Treat as if it was epsilon: mark source final, with the weight - // of the n-gram. - fst_->SetFinal(source, weight); - return; - } - } else { - // For the highest order n-gram, this may find an existing state, for - // non-highest, will create one (unless there are duplicate n-grams - // in the grammar, which cannot be reliably detected if highest order, - // so we better do not do that at all). - dest = AddStateWithBackoff( - HistKey(ngram.words.begin() + (is_highest ? 1 : 0), ngram.words.end()), - -ngram.backoff); - } - - if (sym == bos_symbol_) { - weight = 0; // Accepting is always free. - if (sub_eps_ == 0) { - // is as a real symbol, only accepted in the start state. - source = fst_->AddState(); - fst_->SetStart(source); - } else { - // The new state for unigram history *is* the start state. - fst_->SetStart(dest); - return; - } - } - - // Add arc from source to dest, whichever way it was found. - fst_->AddArc(source, fst::StdArc(sym, sym, weight, dest)); - return; -} - -// Find or create a new state for n-gram defined by key, and ensure it has a -// backoff transition. The key is either the current n-gram for all but -// highest orders, or the tails of the n-gram for the highest order. The -// latter arises from the chain-collapsing optimization described above. -template -StateId ArpaLmCompilerImpl::AddStateWithBackoff(HistKey key, - float backoff) { - typename HistoryMap::iterator dest_it = history_.find(key); - if (dest_it != history_.end()) { - // Found an existing state in the history map. Invariant: if the state in - // the map, then its backoff arc is in the FST. We are done. - return dest_it->second; - } - // Otherwise create a new state and its backoff arc, and register in the map. - StateId dest = fst_->AddState(); - history_[key] = dest; - CreateBackoff(key.Tails(), dest, backoff); - return dest; -} - -// Create a backoff arc for a state. Key is a backoff destination that may or -// may not exist. When the destination is not found, naturally fall back to -// the lower order model, and all the way down until one is found (since the -// 0-gram model is always present, the search is guaranteed to terminate). 
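As a concrete illustration of the construction described in the comments above (the highest-order n-gram collapsed onto its back-off state, plus an epsilon/#0 back-off arc), a single hypothetical 3-gram entry "A B C" would yield something like the fragment below. All state ids, label ids and costs here are invented for illustration; the real compiler derives them from `history_` and the ARPA log-probabilities.

```cpp
#include "fst/fstlib.h"

int main() {
  fst::StdVectorFst g;
  // Hypothetical word id for "C" and costs (-logprob / -backoff from the ARPA file).
  const fst::StdArc::Label kC = 3;
  const float kCostABC = 1.7f, kBackoffBC = 0.4f;

  fst::StdArc::StateId ab = g.AddState();  // history state for "A B"
  fst::StdArc::StateId bc = g.AddState();  // back-off state for "B C"
  fst::StdArc::StateId b  = g.AddState();  // lower-order history "B"

  // Highest-order optimization: accept "C" directly from "A B" into "B C";
  // no dedicated "A B C" state is created.
  g.AddArc(ab, fst::StdArc(kC, kC, kCostABC, bc));
  // Back-off arc (epsilon, or #0 on the input side when --disambig-symbol is used).
  g.AddArc(bc, fst::StdArc(0, 0, kBackoffBC, b));
  return 0;
}
```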
-template -inline void ArpaLmCompilerImpl::CreateBackoff(HistKey key, - StateId state, - float weight) { - typename HistoryMap::iterator dest_it = history_.find(key); - while (dest_it == history_.end()) { - key = key.Tails(); - dest_it = history_.find(key); - } - - // The arc should transduce either or #0 to , depending on the - // epsilon substitution mode. This is the only case when input and output - // label may differ. - fst_->AddArc(state, fst::StdArc(sub_eps_, 0, weight, dest_it->second)); -} - -ArpaLmCompiler::~ArpaLmCompiler() { - if (impl_ != NULL) delete impl_; -} - -void ArpaLmCompiler::HeaderAvailable() { - KALDI_ASSERT(impl_ == NULL); - // Use optimized implementation if the grammar is 4-gram or less, and the - // maximum attained symbol id will fit into the optimized range. - int64 max_symbol = 0; - if (Symbols() != NULL) max_symbol = Symbols()->AvailableKey() - 1; - // If augmenting the symbol table, assume the worst case when all words in - // the model being read are novel. - if (Options().oov_handling == ArpaParseOptions::kAddToSymbols) - max_symbol += NgramCounts()[0]; - - if (NgramCounts().size() <= 4 && max_symbol < OptimizedHistKey::kMaxData) { - impl_ = new ArpaLmCompilerImpl(this, &fst_, sub_eps_); - } else { - impl_ = new ArpaLmCompilerImpl(this, &fst_, sub_eps_); - KALDI_LOG << "Reverting to slower state tracking because model is large: " - << NgramCounts().size() << "-gram with symbols up to " - << max_symbol; - } -} - -void ArpaLmCompiler::ConsumeNGram(const NGram& ngram) { - // is invalid in tails, in heads of an n-gram. - for (int i = 0; i < ngram.words.size(); ++i) { - if ((i > 0 && ngram.words[i] == Options().bos_symbol) || - (i + 1 < ngram.words.size() && - ngram.words[i] == Options().eos_symbol)) { - if (ShouldWarn()) - KALDI_WARN << LineReference() - << " skipped: n-gram has invalid BOS/EOS placement"; - return; - } - } - - bool is_highest = ngram.words.size() == NgramCounts().size(); - impl_->ConsumeNGram(ngram, is_highest); -} - -void ArpaLmCompiler::RemoveRedundantStates() { - fst::StdArc::Label backoff_symbol = sub_eps_; - if (backoff_symbol == 0) { - // The method of removing redundant states implemented in this function - // leads to slow determinization of L o G when people use the older style of - // usage of arpa2fst where the --disambig-symbol option was not specified. - // The issue seems to be that it creates a non-deterministic FST, while G is - // supposed to be deterministic. By 'return'ing below, we just disable this - // method if people were using an older script. This method isn't really - // that consequential anyway, and people will move to the newer-style - // scripts (see current utils/format_lm.sh), so this isn't much of a - // problem. - return; - } - - fst::StdArc::StateId num_states = fst_.NumStates(); - - // replace the #0 symbols on the input of arcs out of redundant states (states - // that are not final and have only a backoff arc leaving them), with . 
- for (fst::StdArc::StateId state = 0; state < num_states; state++) { - if (fst_.NumArcs(state) == 1 && - fst_.Final(state) == fst::TropicalWeight::Zero()) { - fst::MutableArcIterator iter(&fst_, state); - fst::StdArc arc = iter.Value(); - if (arc.ilabel == backoff_symbol) { - arc.ilabel = 0; - iter.SetValue(arc); - } - } - } - - // we could call fst::RemoveEps, and it would have the same effect in normal - // cases, where backoff_symbol != 0 and there are no epsilons in unexpected - // places, but RemoveEpsLocal is a bit safer in case something weird is going - // on; it guarantees not to blow up the FST. - fst::RemoveEpsLocal(&fst_); - KALDI_LOG << "Reduced num-states from " << num_states << " to " - << fst_.NumStates(); -} - -void ArpaLmCompiler::Check() const { - if (fst_.Start() == fst::kNoStateId) { - KALDI_ERR << "Arpa file did not contain the beginning-of-sentence symbol " - << Symbols()->Find(Options().bos_symbol) << "."; - } -} - -void ArpaLmCompiler::ReadComplete() { - fst_.SetInputSymbols(Symbols()); - fst_.SetOutputSymbols(Symbols()); - RemoveRedundantStates(); - Check(); -} - -} // namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/lm/arpa-lm-compiler.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/lm/arpa-lm-compiler.h deleted file mode 100644 index 069c71bd0e6f5acf0b9521ec1ef46796eb31fe4d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/lm/arpa-lm-compiler.h +++ /dev/null @@ -1,63 +0,0 @@ -// lm/arpa-lm-compiler.h - -// Copyright 2009-2011 Gilles Boulianne -// Copyright 2016 Smart Action LLC (kkm) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_LM_ARPA_LM_COMPILER_H_ -#define KALDI_LM_ARPA_LM_COMPILER_H_ - -#include - -#include "lm/arpa-file-parser.h" - -namespace kaldi { - -class ArpaLmCompilerImplInterface; - -class ArpaLmCompiler : public ArpaFileParser { - public: - ArpaLmCompiler(const ArpaParseOptions& options, int sub_eps, - fst::SymbolTable* symbols) - : ArpaFileParser(options, symbols), sub_eps_(sub_eps), impl_(NULL) {} - ~ArpaLmCompiler(); - - const fst::StdVectorFst& Fst() const { return fst_; } - fst::StdVectorFst* MutableFst() { return &fst_; } - - protected: - // ArpaFileParser overrides. - virtual void HeaderAvailable(); - virtual void ConsumeNGram(const NGram& ngram); - virtual void ReadComplete(); - - private: - // this function removes states that only have a backoff arc coming - // out of them. - void RemoveRedundantStates(); - void Check() const; - - int sub_eps_; - ArpaLmCompilerImplInterface* impl_; // Owned. 
- fst::StdVectorFst fst_; - template - friend class ArpaLmCompilerImpl; -}; - -} // namespace kaldi - -#endif // KALDI_LM_ARPA_LM_COMPILER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/lmbin/arpa2fst.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/lmbin/arpa2fst.cc deleted file mode 100644 index 881a45c5b37810247ea38dae56237f59b5554a9c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/lmbin/arpa2fst.cc +++ /dev/null @@ -1,145 +0,0 @@ -// bin/arpa2fst.cc -// -// Copyright 2009-2011 Gilles Boulianne. -// -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABILITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "lm/arpa-lm-compiler.h" -#include "util/kaldi-io.h" -#include "util/parse-options.h" - -int main(int argc, char *argv[]) { - using namespace kaldi; // NOLINT - try { - const char *usage = - "Convert an ARPA format language model into an FST\n" - "Usage: arpa2fst [opts] \n" - " e.g.: arpa2fst --disambig-symbol=#0 --read-symbol-table=" - "data/lang/words.txt lm/input.arpa G.fst\n\n" - "Note: When called without switches, the output G.fst will contain\n" - "an embedded symbol table. This is compatible with the way a previous\n" - "version of arpa2fst worked.\n"; - - ParseOptions po(usage); - - ArpaParseOptions options; - options.Register(&po); - - // Option flags. - std::string bos_symbol = ""; - std::string eos_symbol = ""; - std::string disambig_symbol; - std::string read_syms_filename; - std::string write_syms_filename; - bool keep_symbols = false; - bool ilabel_sort = true; - - po.Register("bos-symbol", &bos_symbol, "Beginning of sentence symbol"); - po.Register("eos-symbol", &eos_symbol, "End of sentence symbol"); - po.Register("disambig-symbol", &disambig_symbol, - "Disambiguator. If provided (e. g. #0), used on input side of " - "backoff links, and and are replaced with epsilons"); - po.Register("read-symbol-table", &read_syms_filename, - "Use existing symbol table"); - po.Register("write-symbol-table", &write_syms_filename, - "Write generated symbol table to a file"); - po.Register("keep-symbols", &keep_symbols, - "Store symbol table with FST. Symbols always saved to FST if " - "symbol tables are neither read or written (otherwise symbols " - "would be lost entirely)"); - po.Register("ilabel-sort", &ilabel_sort, "Ilabel-sort the output FST"); - - po.Read(argc, argv); - - if (po.NumArgs() != 1 && po.NumArgs() != 2) { - po.PrintUsage(); - exit(1); - } - std::string arpa_rxfilename = po.GetArg(1), - fst_wxfilename = po.GetOptArg(2); - - int64 disambig_symbol_id = 0; - - fst::SymbolTable *symbols; - if (!read_syms_filename.empty()) { - // Use existing symbols. Required symbols must be in the table. 
- kaldi::Input kisym(read_syms_filename); - symbols = fst::SymbolTable::ReadText( - kisym.Stream(), PrintableWxfilename(read_syms_filename)); - if (symbols == NULL) - KALDI_ERR << "Could not read symbol table from file " - << read_syms_filename; - - options.oov_handling = ArpaParseOptions::kSkipNGram; - if (!disambig_symbol.empty()) { - disambig_symbol_id = symbols->Find(disambig_symbol); - if (disambig_symbol_id == -1) // fst::kNoSymbol - KALDI_ERR << "Symbol table " << read_syms_filename - << " has no symbol for " << disambig_symbol; - } - } else { - // Create a new symbol table and populate it from ARPA file. - symbols = new fst::SymbolTable(PrintableWxfilename(fst_wxfilename)); - options.oov_handling = ArpaParseOptions::kAddToSymbols; - symbols->AddSymbol("", 0); - if (!disambig_symbol.empty()) { - disambig_symbol_id = symbols->AddSymbol(disambig_symbol); - } - } - - // Add or use existing BOS and EOS. - options.bos_symbol = symbols->AddSymbol(bos_symbol); - options.eos_symbol = symbols->AddSymbol(eos_symbol); - - // If producing new (not reading existing) symbols and not saving them, - // need to keep symbols with FST, otherwise they would be lost. - if (read_syms_filename.empty() && write_syms_filename.empty()) - keep_symbols = true; - - // Actually compile LM. - KALDI_ASSERT(symbols != NULL); - ArpaLmCompiler lm_compiler(options, disambig_symbol_id, symbols); - { - Input ki(arpa_rxfilename); - lm_compiler.Read(ki.Stream()); - } - - // Sort the FST in-place if requested by options. - if (ilabel_sort) { - fst::ArcSort(lm_compiler.MutableFst(), fst::StdILabelCompare()); - } - - // Write symbols if requested. - if (!write_syms_filename.empty()) { - kaldi::Output kosym(write_syms_filename, false); - symbols->WriteText(kosym.Stream()); - } - - // Write LM FST. - bool write_binary = true, write_header = false; - kaldi::Output kofst(fst_wxfilename, write_binary, write_header); - fst::FstWriteOptions wopts(PrintableWxfilename(fst_wxfilename)); - wopts.write_isymbols = wopts.write_osymbols = keep_symbols; - lm_compiler.Fst().Write(kofst.Stream(), wopts); - - delete symbols; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/basic-filebuf.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/basic-filebuf.h deleted file mode 100644 index 22ec891064d5955c8b1d255e0d34781a9f505a38..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/basic-filebuf.h +++ /dev/null @@ -1,952 +0,0 @@ -/////////////////////////////////////////////////////////////////////////////// -// This is a modified version of the std::basic_filebuf from libc++ -// Copyright 20XX LLVM -// (http://libcxx.llvm.org/). -// It allows one to create basic_filebuf from an existing FILE* handle or file -// descriptor. -// -// This file is dual licensed under the MIT and the University of Illinois Open -// Source License licenses. See LICENSE.TXT for details (included at the -// bottom). 
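The point of this modified filebuf, per the header comment above, is that it can adopt an already-open `FILE*` (or file descriptor). A minimal usage sketch, assuming the header's path within this tree and a hypothetical input file; note that `close()` syncs and then `fclose`s the adopted handle, as in the implementation below.

```cpp
#include <cstdio>
#include <istream>
#include <string>
#include "util/basic-filebuf.h"

int main() {
  FILE *fp = std::fopen("data.txt", "rb");  // hypothetical file
  if (fp == nullptr) return 1;

  kaldi::basic_filebuf<char> buf;
  buf.open(fp, std::ios_base::in | std::ios_base::binary);  // adopt the handle

  std::istream is(&buf);  // read it through an ordinary istream
  std::string line;
  while (std::getline(is, line)) {
    // ... process line ...
  }
  buf.close();  // flushes and fcloses the adopted FILE*
  return 0;
}
```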
-/////////////////////////////////////////////////////////////////////////////// -#ifndef KALDI_UTIL_BASIC_FILEBUF_H_ -#define KALDI_UTIL_BASIC_FILEBUF_H_ - -/////////////////////////////////////////////////////////////////////////////// -#include -#include -#include -#include -#include -#include -#include - -/////////////////////////////////////////////////////////////////////////////// -namespace kaldi { -/////////////////////////////////////////////////////////////////////////////// -template > -class basic_filebuf : public std::basic_streambuf { - public: - typedef CharT char_type; - typedef Traits traits_type; - typedef typename traits_type::int_type int_type; - typedef typename traits_type::pos_type pos_type; - typedef typename traits_type::off_type off_type; - typedef typename traits_type::state_type state_type; - - basic_filebuf(); - basic_filebuf(basic_filebuf&& rhs); - virtual ~basic_filebuf(); - - basic_filebuf& operator=(basic_filebuf&& rhs); - void swap(basic_filebuf& rhs); - - bool is_open() const; - basic_filebuf* open(const char* s, std::ios_base::openmode mode); - basic_filebuf* open(const std::string& s, std::ios_base::openmode mode); - basic_filebuf* open(int fd, std::ios_base::openmode mode); - basic_filebuf* open(FILE* f, std::ios_base::openmode mode); - basic_filebuf* close(); - - FILE* file() { return this->_M_file; } - int fd() { return fileno(this->_M_file); } - - protected: - int_type underflow() override; - int_type pbackfail(int_type c = traits_type::eof()) override; - int_type overflow(int_type c = traits_type::eof()) override; - std::basic_streambuf* setbuf( - char_type* s, std::streamsize n) override; - pos_type seekoff(off_type off, std::ios_base::seekdir way, - std::ios_base::openmode wch = std::ios_base::in | - std::ios_base::out) override; - pos_type seekpos(pos_type sp, - std::ios_base::openmode wch = std::ios_base::in | - std::ios_base::out) override; - int sync() override; - void imbue(const std::locale& loc) override; - - protected: - char* _M_extbuf; - const char* _M_extbufnext; - const char* _M_extbufend; - char _M_extbuf_min[8]; - size_t _M_ebs; - char_type* _M_intbuf; - size_t _M_ibs; - FILE* _M_file; - const std::codecvt* _M_cv; - state_type _M_st; - state_type _M_st_last; - std::ios_base::openmode _M_om; - std::ios_base::openmode _M_cm; - bool _M_owns_eb; - bool _M_owns_ib; - bool _M_always_noconv; - - const char* _M_get_mode(std::ios_base::openmode mode); - bool _M_read_mode(); - void _M_write_mode(); -}; - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf::basic_filebuf() - : _M_extbuf(nullptr), - _M_extbufnext(nullptr), - _M_extbufend(nullptr), - _M_ebs(0), - _M_intbuf(nullptr), - _M_ibs(0), - _M_file(nullptr), - _M_cv(nullptr), - _M_st(), - _M_st_last(), - _M_om(std::ios_base::openmode(0)), - _M_cm(std::ios_base::openmode(0)), - _M_owns_eb(false), - _M_owns_ib(false), - _M_always_noconv(false) { - if (std::has_facet >( - this->getloc())) { - _M_cv = &std::use_facet >( - this->getloc()); - _M_always_noconv = _M_cv->always_noconv(); - } - setbuf(0, 4096); -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf::basic_filebuf(basic_filebuf&& rhs) - : std::basic_streambuf(rhs) { - if (rhs._M_extbuf == rhs._M_extbuf_min) { - _M_extbuf = _M_extbuf_min; - _M_extbufnext = _M_extbuf + (rhs._M_extbufnext - rhs._M_extbuf); - _M_extbufend = _M_extbuf + (rhs._M_extbufend - rhs._M_extbuf); - } else { - _M_extbuf = rhs._M_extbuf; - _M_extbufnext = 
rhs._M_extbufnext; - _M_extbufend = rhs._M_extbufend; - } - _M_ebs = rhs._M_ebs; - _M_intbuf = rhs._M_intbuf; - _M_ibs = rhs._M_ibs; - _M_file = rhs._M_file; - _M_cv = rhs._M_cv; - _M_st = rhs._M_st; - _M_st_last = rhs._M_st_last; - _M_om = rhs._M_om; - _M_cm = rhs._M_cm; - _M_owns_eb = rhs._M_owns_eb; - _M_owns_ib = rhs._M_owns_ib; - _M_always_noconv = rhs._M_always_noconv; - if (rhs.pbase()) { - if (rhs.pbase() == rhs._M_intbuf) - this->setp(_M_intbuf, _M_intbuf + (rhs.epptr() - rhs.pbase())); - else - this->setp(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + - (rhs.epptr() - rhs.pbase())); - this->pbump(rhs.pptr() - rhs.pbase()); - } else if (rhs.eback()) { - if (rhs.eback() == rhs._M_intbuf) - this->setg(_M_intbuf, _M_intbuf + (rhs.gptr() - rhs.eback()), - _M_intbuf + (rhs.egptr() - rhs.eback())); - else - this->setg( - reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + (rhs.gptr() - rhs.eback()), - reinterpret_cast(_M_extbuf) + - (rhs.egptr() - rhs.eback())); - } - rhs._M_extbuf = nullptr; - rhs._M_extbufnext = nullptr; - rhs._M_extbufend = nullptr; - rhs._M_ebs = 0; - rhs._M_intbuf = nullptr; - rhs._M_ibs = 0; - rhs._M_file = nullptr; - rhs._M_st = state_type(); - rhs._M_st_last = state_type(); - rhs._M_om = std::ios_base::openmode(0); - rhs._M_cm = std::ios_base::openmode(0); - rhs._M_owns_eb = false; - rhs._M_owns_ib = false; - rhs.setg(0, 0, 0); - rhs.setp(0, 0); -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline basic_filebuf& basic_filebuf::operator=( - basic_filebuf&& rhs) { - close(); - swap(rhs); - return *this; -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf::~basic_filebuf() { - // try - // { - // close(); - // } - // catch (...) 
- // { - // } - if (_M_owns_eb) delete[] _M_extbuf; - if (_M_owns_ib) delete[] _M_intbuf; -} - -/////////////////////////////////////////////////////////////////////////////// -template -void basic_filebuf::swap(basic_filebuf& rhs) { - std::basic_streambuf::swap(rhs); - if (_M_extbuf != _M_extbuf_min && rhs._M_extbuf != rhs._M_extbuf_min) { - std::swap(_M_extbuf, rhs._M_extbuf); - std::swap(_M_extbufnext, rhs._M_extbufnext); - std::swap(_M_extbufend, rhs._M_extbufend); - } else { - ptrdiff_t ln = _M_extbufnext - _M_extbuf; - ptrdiff_t le = _M_extbufend - _M_extbuf; - ptrdiff_t rn = rhs._M_extbufnext - rhs._M_extbuf; - ptrdiff_t re = rhs._M_extbufend - rhs._M_extbuf; - if (_M_extbuf == _M_extbuf_min && rhs._M_extbuf != rhs._M_extbuf_min) { - _M_extbuf = rhs._M_extbuf; - rhs._M_extbuf = rhs._M_extbuf_min; - } else if (_M_extbuf != _M_extbuf_min && - rhs._M_extbuf == rhs._M_extbuf_min) { - rhs._M_extbuf = _M_extbuf; - _M_extbuf = _M_extbuf_min; - } - _M_extbufnext = _M_extbuf + rn; - _M_extbufend = _M_extbuf + re; - rhs._M_extbufnext = rhs._M_extbuf + ln; - rhs._M_extbufend = rhs._M_extbuf + le; - } - std::swap(_M_ebs, rhs._M_ebs); - std::swap(_M_intbuf, rhs._M_intbuf); - std::swap(_M_ibs, rhs._M_ibs); - std::swap(_M_file, rhs._M_file); - std::swap(_M_cv, rhs._M_cv); - std::swap(_M_st, rhs._M_st); - std::swap(_M_st_last, rhs._M_st_last); - std::swap(_M_om, rhs._M_om); - std::swap(_M_cm, rhs._M_cm); - std::swap(_M_owns_eb, rhs._M_owns_eb); - std::swap(_M_owns_ib, rhs._M_owns_ib); - std::swap(_M_always_noconv, rhs._M_always_noconv); - if (this->eback() == reinterpret_cast(rhs._M_extbuf_min)) { - ptrdiff_t n = this->gptr() - this->eback(); - ptrdiff_t e = this->egptr() - this->eback(); - this->setg(reinterpret_cast(_M_extbuf_min), - reinterpret_cast(_M_extbuf_min) + n, - reinterpret_cast(_M_extbuf_min) + e); - } else if (this->pbase() == reinterpret_cast(rhs._M_extbuf_min)) { - ptrdiff_t n = this->pptr() - this->pbase(); - ptrdiff_t e = this->epptr() - this->pbase(); - this->setp(reinterpret_cast(_M_extbuf_min), - reinterpret_cast(_M_extbuf_min) + e); - this->pbump(n); - } - if (rhs.eback() == reinterpret_cast(_M_extbuf_min)) { - ptrdiff_t n = rhs.gptr() - rhs.eback(); - ptrdiff_t e = rhs.egptr() - rhs.eback(); - rhs.setg(reinterpret_cast(rhs._M_extbuf_min), - reinterpret_cast(rhs._M_extbuf_min) + n, - reinterpret_cast(rhs._M_extbuf_min) + e); - } else if (rhs.pbase() == reinterpret_cast(_M_extbuf_min)) { - ptrdiff_t n = rhs.pptr() - rhs.pbase(); - ptrdiff_t e = rhs.epptr() - rhs.pbase(); - rhs.setp(reinterpret_cast(rhs._M_extbuf_min), - reinterpret_cast(rhs._M_extbuf_min) + e); - rhs.pbump(n); - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline void swap(basic_filebuf& x, - basic_filebuf& y) { - x.swap(y); -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline bool basic_filebuf::is_open() const { - return _M_file != nullptr; -} - -/////////////////////////////////////////////////////////////////////////////// -template -const char* basic_filebuf::_M_get_mode( - std::ios_base::openmode mode) { - switch ((mode & ~std::ios_base::ate) | 0) { - case std::ios_base::out: - case std::ios_base::out | std::ios_base::trunc: - return "w"; - case std::ios_base::out | std::ios_base::app: - case std::ios_base::app: - return "a"; - break; - case std::ios_base::in: - return "r"; - case std::ios_base::in | std::ios_base::out: - return "r+"; - case std::ios_base::in | std::ios_base::out | 
std::ios_base::trunc: - return "w+"; - case std::ios_base::in | std::ios_base::out | std::ios_base::app: - case std::ios_base::in | std::ios_base::app: - return "a+"; - case std::ios_base::out | std::ios_base::binary: - case std::ios_base::out | std::ios_base::trunc | std::ios_base::binary: - return "wb"; - case std::ios_base::out | std::ios_base::app | std::ios_base::binary: - case std::ios_base::app | std::ios_base::binary: - return "ab"; - case std::ios_base::in | std::ios_base::binary: - return "rb"; - case std::ios_base::in | std::ios_base::out | std::ios_base::binary: - return "r+b"; - case std::ios_base::in | std::ios_base::out | std::ios_base::trunc | - std::ios_base::binary: - return "w+b"; - case std::ios_base::in | std::ios_base::out | std::ios_base::app | - std::ios_base::binary: - case std::ios_base::in | std::ios_base::app | std::ios_base::binary: - return "a+b"; - default: - return nullptr; - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::open( - const char* s, std::ios_base::openmode mode) { - basic_filebuf* rt = nullptr; - if (_M_file == nullptr) { - const char* md = _M_get_mode(mode); - if (md) { - _M_file = fopen(s, md); - if (_M_file) { - rt = this; - _M_om = mode; - if (mode & std::ios_base::ate) { - if (fseek(_M_file, 0, SEEK_END)) { - fclose(_M_file); - _M_file = nullptr; - rt = nullptr; - } - } - } - } - } - return rt; -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline basic_filebuf* basic_filebuf::open( - const std::string& s, std::ios_base::openmode mode) { - return open(s.c_str(), mode); -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::open( - int fd, std::ios_base::openmode mode) { - const char* md = this->_M_get_mode(mode); - if (md) { - this->_M_file = fdopen(fd, md); - this->_M_om = mode; - return this; - } else { - return nullptr; - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::open( - FILE* f, std::ios_base::openmode mode) { - this->_M_file = f; - this->_M_om = mode; - return this; -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::close() { - basic_filebuf* rt = nullptr; - if (_M_file) { - rt = this; - std::unique_ptr h(_M_file, fclose); - if (sync()) rt = nullptr; - if (fclose(h.release()) == 0) - _M_file = nullptr; - else - rt = nullptr; - } - return rt; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::int_type -basic_filebuf::underflow() { - if (_M_file == nullptr) return traits_type::eof(); - bool initial = _M_read_mode(); - char_type buf; - if (this->gptr() == nullptr) this->setg(&buf, &buf + 1, &buf + 1); - const size_t unget_sz = - initial ? 
0 : std::min((this->egptr() - this->eback()) / 2, 4); - int_type c = traits_type::eof(); - if (this->gptr() == this->egptr()) { - memmove(this->eback(), this->egptr() - unget_sz, - unget_sz * sizeof(char_type)); - if (_M_always_noconv) { - size_t nmemb = - static_cast(this->egptr() - this->eback() - unget_sz); - nmemb = fread(this->eback() + unget_sz, 1, nmemb, _M_file); - if (nmemb != 0) { - this->setg(this->eback(), this->eback() + unget_sz, - this->eback() + unget_sz + nmemb); - c = traits_type::to_int_type(*this->gptr()); - } - } else { - memmove(_M_extbuf, _M_extbufnext, _M_extbufend - _M_extbufnext); - _M_extbufnext = _M_extbuf + (_M_extbufend - _M_extbufnext); - _M_extbufend = - _M_extbuf + - (_M_extbuf == _M_extbuf_min ? sizeof(_M_extbuf_min) : _M_ebs); - size_t nmemb = - std::min(static_cast(_M_ibs - unget_sz), - static_cast(_M_extbufend - _M_extbufnext)); - std::codecvt_base::result r; - _M_st_last = _M_st; - size_t nr = - fread(reinterpret_cast(const_cast(_M_extbufnext)), - 1, nmemb, _M_file); - if (nr != 0) { - if (!_M_cv) throw std::bad_cast(); - _M_extbufend = _M_extbufnext + nr; - char_type* inext; - r = _M_cv->in(_M_st, _M_extbuf, _M_extbufend, _M_extbufnext, - this->eback() + unget_sz, this->eback() + _M_ibs, inext); - if (r == std::codecvt_base::noconv) { - this->setg(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf), - const_cast(_M_extbufend)); - c = traits_type::to_int_type(*this->gptr()); - } else if (inext != this->eback() + unget_sz) { - this->setg(this->eback(), this->eback() + unget_sz, inext); - c = traits_type::to_int_type(*this->gptr()); - } - } - } - } else { - c = traits_type::to_int_type(*this->gptr()); - } - if (this->eback() == &buf) this->setg(0, 0, 0); - return c; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::int_type -basic_filebuf::pbackfail(int_type c) { - if (_M_file && this->eback() < this->gptr()) { - if (traits_type::eq_int_type(c, traits_type::eof())) { - this->gbump(-1); - return traits_type::not_eof(c); - } - if ((_M_om & std::ios_base::out) || - traits_type::eq(traits_type::to_char_type(c), this->gptr()[-1])) { - this->gbump(-1); - *this->gptr() = traits_type::to_char_type(c); - return c; - } - } - return traits_type::eof(); -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::int_type -basic_filebuf::overflow(int_type c) { - if (_M_file == nullptr) return traits_type::eof(); - _M_write_mode(); - char_type buf; - char_type* pb_save = this->pbase(); - char_type* epb_save = this->epptr(); - if (!traits_type::eq_int_type(c, traits_type::eof())) { - if (this->pptr() == nullptr) this->setp(&buf, &buf + 1); - *this->pptr() = traits_type::to_char_type(c); - this->pbump(1); - } - if (this->pptr() != this->pbase()) { - if (_M_always_noconv) { - size_t nmemb = static_cast(this->pptr() - this->pbase()); - if (fwrite(this->pbase(), sizeof(char_type), nmemb, _M_file) != nmemb) - return traits_type::eof(); - } else { - char* extbe = _M_extbuf; - std::codecvt_base::result r; - do { - if (!_M_cv) throw std::bad_cast(); - const char_type* e; - r = _M_cv->out(_M_st, this->pbase(), this->pptr(), e, _M_extbuf, - _M_extbuf + _M_ebs, extbe); - if (e == this->pbase()) return traits_type::eof(); - if (r == std::codecvt_base::noconv) { - size_t nmemb = static_cast(this->pptr() - this->pbase()); - if (fwrite(this->pbase(), 1, nmemb, _M_file) != nmemb) - return traits_type::eof(); - } else if (r == std::codecvt_base::ok 
|| - r == std::codecvt_base::partial) { - size_t nmemb = static_cast(extbe - _M_extbuf); - if (fwrite(_M_extbuf, 1, nmemb, _M_file) != nmemb) - return traits_type::eof(); - if (r == std::codecvt_base::partial) { - this->setp(const_cast(e), this->pptr()); - this->pbump(this->epptr() - this->pbase()); - } - } else { - return traits_type::eof(); - } - } while (r == std::codecvt_base::partial); - } - this->setp(pb_save, epb_save); - } - return traits_type::not_eof(c); -} - -/////////////////////////////////////////////////////////////////////////////// -template -std::basic_streambuf* basic_filebuf::setbuf( - char_type* s, std::streamsize n) { - this->setg(0, 0, 0); - this->setp(0, 0); - if (_M_owns_eb) delete[] _M_extbuf; - if (_M_owns_ib) delete[] _M_intbuf; - _M_ebs = n; - if (_M_ebs > sizeof(_M_extbuf_min)) { - if (_M_always_noconv && s) { - _M_extbuf = reinterpret_cast(s); - _M_owns_eb = false; - } else { - _M_extbuf = new char[_M_ebs]; - _M_owns_eb = true; - } - } else { - _M_extbuf = _M_extbuf_min; - _M_ebs = sizeof(_M_extbuf_min); - _M_owns_eb = false; - } - if (!_M_always_noconv) { - _M_ibs = std::max(n, sizeof(_M_extbuf_min)); - if (s && _M_ibs >= sizeof(_M_extbuf_min)) { - _M_intbuf = s; - _M_owns_ib = false; - } else { - _M_intbuf = new char_type[_M_ibs]; - _M_owns_ib = true; - } - } else { - _M_ibs = 0; - _M_intbuf = 0; - _M_owns_ib = false; - } - return this; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::pos_type -basic_filebuf::seekoff(off_type off, std::ios_base::seekdir way, - std::ios_base::openmode) { - if (!_M_cv) throw std::bad_cast(); - int width = _M_cv->encoding(); - if (_M_file == nullptr || (width <= 0 && off != 0) || sync()) - return pos_type(off_type(-1)); - // width > 0 || off == 0 - int whence; - switch (way) { - case std::ios_base::beg: - whence = SEEK_SET; - break; - case std::ios_base::cur: - whence = SEEK_CUR; - break; - case std::ios_base::end: - whence = SEEK_END; - break; - default: - return pos_type(off_type(-1)); - } -#if _WIN32 - if (fseek(_M_file, width > 0 ? width * off : 0, whence)) - return pos_type(off_type(-1)); - pos_type r = ftell(_M_file); -#else - if (fseeko(_M_file, width > 0 ? 
width * off : 0, whence)) - return pos_type(off_type(-1)); - pos_type r = ftello(_M_file); -#endif - r.state(_M_st); - return r; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::pos_type -basic_filebuf::seekpos(pos_type sp, std::ios_base::openmode) { - if (_M_file == nullptr || sync()) return pos_type(off_type(-1)); -#if _WIN32 - if (fseek(_M_file, sp, SEEK_SET)) return pos_type(off_type(-1)); -#else - if (fseeko(_M_file, sp, SEEK_SET)) return pos_type(off_type(-1)); -#endif - _M_st = sp.state(); - return sp; -} - -/////////////////////////////////////////////////////////////////////////////// -template -int basic_filebuf::sync() { - if (_M_file == nullptr) return 0; - if (!_M_cv) throw std::bad_cast(); - if (_M_cm & std::ios_base::out) { - if (this->pptr() != this->pbase()) - if (overflow() == traits_type::eof()) return -1; - std::codecvt_base::result r; - do { - char* extbe; - r = _M_cv->unshift(_M_st, _M_extbuf, _M_extbuf + _M_ebs, extbe); - size_t nmemb = static_cast(extbe - _M_extbuf); - if (fwrite(_M_extbuf, 1, nmemb, _M_file) != nmemb) return -1; - } while (r == std::codecvt_base::partial); - if (r == std::codecvt_base::error) return -1; - if (fflush(_M_file)) return -1; - } else if (_M_cm & std::ios_base::in) { - off_type c; - state_type state = _M_st_last; - bool update_st = false; - if (_M_always_noconv) { - c = this->egptr() - this->gptr(); - } else { - int width = _M_cv->encoding(); - c = _M_extbufend - _M_extbufnext; - if (width > 0) { - c += width * (this->egptr() - this->gptr()); - } else { - if (this->gptr() != this->egptr()) { - const int off = _M_cv->length(state, _M_extbuf, _M_extbufnext, - this->gptr() - this->eback()); - c += _M_extbufnext - _M_extbuf - off; - update_st = true; - } - } - } -#if _WIN32 - if (fseek(_M_file_, -c, SEEK_CUR)) return -1; -#else - if (fseeko(_M_file, -c, SEEK_CUR)) return -1; -#endif - if (update_st) _M_st = state; - _M_extbufnext = _M_extbufend = _M_extbuf; - this->setg(0, 0, 0); - _M_cm = std::ios_base::openmode(0); - } - return 0; -} - -/////////////////////////////////////////////////////////////////////////////// -template -void basic_filebuf::imbue(const std::locale& loc) { - sync(); - _M_cv = &std::use_facet >(loc); - bool old_anc = _M_always_noconv; - _M_always_noconv = _M_cv->always_noconv(); - if (old_anc != _M_always_noconv) { - this->setg(0, 0, 0); - this->setp(0, 0); - // invariant, char_type is char, else we couldn't get here - // need to dump _M_intbuf - if (_M_always_noconv) { - if (_M_owns_eb) delete[] _M_extbuf; - _M_owns_eb = _M_owns_ib; - _M_ebs = _M_ibs; - _M_extbuf = reinterpret_cast(_M_intbuf); - _M_ibs = 0; - _M_intbuf = nullptr; - _M_owns_ib = false; - } else { // need to obtain an _M_intbuf. 
- // If _M_extbuf is user-supplied, use it, else new _M_intbuf - if (!_M_owns_eb && _M_extbuf != _M_extbuf_min) { - _M_ibs = _M_ebs; - _M_intbuf = reinterpret_cast(_M_extbuf); - _M_owns_ib = false; - _M_extbuf = new char[_M_ebs]; - _M_owns_eb = true; - } else { - _M_ibs = _M_ebs; - _M_intbuf = new char_type[_M_ibs]; - _M_owns_ib = true; - } - } - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -bool basic_filebuf::_M_read_mode() { - if (!(_M_cm & std::ios_base::in)) { - this->setp(0, 0); - if (_M_always_noconv) - this->setg(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + _M_ebs, - reinterpret_cast(_M_extbuf) + _M_ebs); - else - this->setg(_M_intbuf, _M_intbuf + _M_ibs, _M_intbuf + _M_ibs); - _M_cm = std::ios_base::in; - return true; - } - return false; -} - -/////////////////////////////////////////////////////////////////////////////// -template -void basic_filebuf::_M_write_mode() { - if (!(_M_cm & std::ios_base::out)) { - this->setg(0, 0, 0); - if (_M_ebs > sizeof(_M_extbuf_min)) { - if (_M_always_noconv) - this->setp(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + (_M_ebs - 1)); - else - this->setp(_M_intbuf, _M_intbuf + (_M_ibs - 1)); - } else { - this->setp(0, 0); - } - _M_cm = std::ios_base::out; - } -} - -/////////////////////////////////////////////////////////////////////////////// -} // namespace kaldi - -/////////////////////////////////////////////////////////////////////////////// -#endif // KALDI_UTIL_BASIC_FILEBUF_H_ - -/////////////////////////////////////////////////////////////////////////////// - -/* - * ============================================================================ - * libc++ License - * ============================================================================ - * - * The libc++ library is dual licensed under both the University of Illinois - * "BSD-Like" license and the MIT license. As a user of this code you may - * choose to use it under either license. As a contributor, you agree to allow - * your code to be used under both. - * - * Full text of the relevant licenses is included below. - * - * ============================================================================ - * - * University of Illinois/NCSA - * Open Source License - * - * Copyright (c) 2009-2014 by the contributors listed in CREDITS.TXT (included - * below) - * - * All rights reserved. - * - * Developed by: - * - * LLVM Team - * - * University of Illinois at Urbana-Champaign - * - * http://llvm.org - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * with the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in the - * documentation and/or other materials provided with the distribution. 
- * - * * Neither the names of the LLVM Team, University of Illinois at - * Urbana-Champaign, nor the names of its contributors may be used to - * endorse or promote products derived from this Software without specific - * prior written permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH - * THE SOFTWARE. - * - * ============================================================================== - * - * Copyright (c) 2009-2014 by the contributors listed in CREDITS.TXT (included - * below) - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - * ============================================================================== - * - * This file is a partial list of people who have contributed to the LLVM/libc++ - * project. If you have contributed a patch or made some other contribution to - * LLVM/libc++, please submit a patch to this file to add yourself, and it will - * be done! - * - * The list is sorted by surname and formatted to allow easy grepping and - * beautification by scripts. The fields are: name (N), email (E), web-address - * (W), PGP key ID and fingerprint (P), description (D), and snail-mail address - * (S). - * - * N: Saleem Abdulrasool - * E: compnerd@compnerd.org - * D: Minor patches and Linux fixes. - * - * N: Dimitry Andric - * E: dimitry@andric.com - * D: Visibility fixes, minor FreeBSD portability patches. - * - * N: Holger Arnold - * E: holgerar@gmail.com - * D: Minor fix. - * - * N: Ruben Van Boxem - * E: vanboxem dot ruben at gmail dot com - * D: Initial Windows patches. - * - * N: David Chisnall - * E: theraven at theravensnest dot org - * D: FreeBSD and Solaris ports, libcxxrt support, some atomics work. - * - * N: Marshall Clow - * E: mclow.lists@gmail.com - * E: marshall@idio.com - * D: C++14 support, patches and bug fixes. - * - * N: Bill Fisher - * E: william.w.fisher@gmail.com - * D: Regex bug fixes. - * - * N: Matthew Dempsky - * E: matthew@dempsky.org - * D: Minor patches and bug fixes. - * - * N: Google Inc. 
- * D: Copyright owner and contributor of the CityHash algorithm - * - * N: Howard Hinnant - * E: hhinnant@apple.com - * D: Architect and primary author of libc++ - * - * N: Hyeon-bin Jeong - * E: tuhertz@gmail.com - * D: Minor patches and bug fixes. - * - * N: Argyrios Kyrtzidis - * E: kyrtzidis@apple.com - * D: Bug fixes. - * - * N: Bruce Mitchener, Jr. - * E: bruce.mitchener@gmail.com - * D: Emscripten-related changes. - * - * N: Michel Morin - * E: mimomorin@gmail.com - * D: Minor patches to is_convertible. - * - * N: Andrew Morrow - * E: andrew.c.morrow@gmail.com - * D: Minor patches and Linux fixes. - * - * N: Arvid Picciani - * E: aep at exys dot org - * D: Minor patches and musl port. - * - * N: Bjorn Reese - * E: breese@users.sourceforge.net - * D: Initial regex prototype - * - * N: Nico Rieck - * E: nico.rieck@gmail.com - * D: Windows fixes - * - * N: Jonathan Sauer - * D: Minor patches, mostly related to constexpr - * - * N: Craig Silverstein - * E: csilvers@google.com - * D: Implemented Cityhash as the string hash function on 64-bit machines - * - * N: Richard Smith - * D: Minor patches. - * - * N: Joerg Sonnenberger - * E: joerg@NetBSD.org - * D: NetBSD port. - * - * N: Stephan Tolksdorf - * E: st@quanttec.com - * D: Minor fix - * - * N: Michael van der Westhuizen - * E: r1mikey at gmail dot com - * - * N: Klaas de Vries - * E: klaas at klaasgaaf dot nl - * D: Minor bug fix. - * - * N: Zhang Xiongpang - * E: zhangxiongpang@gmail.com - * D: Minor patches and bug fixes. - * - * N: Xing Xue - * E: xingxue@ca.ibm.com - * D: AIX port - * - * N: Zhihao Yuan - * E: lichray@gmail.com - * D: Standard compatibility fixes. - * - * N: Jeffrey Yasskin - * E: jyasskin@gmail.com - * E: jyasskin@google.com - * D: Linux fixes. - */ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/const-integer-set-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/const-integer-set-inl.h deleted file mode 100644 index b93846148a3e4595774507f638396ce13393ac0e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/const-integer-set-inl.h +++ /dev/null @@ -1,87 +0,0 @@ -// util/const-integer-set-inl.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_CONST_INTEGER_SET_INL_H_ -#define KALDI_UTIL_CONST_INTEGER_SET_INL_H_ - -// Do not include this file directly. It is included by const-integer-set.h - -namespace kaldi { - -template -void ConstIntegerSet::InitInternal() { - KALDI_ASSERT_IS_INTEGER_TYPE(I); - quick_set_.clear(); // just in case we previously had data. 
- if (slow_set_.size() == 0) { - lowest_member_ = (I)1; - highest_member_ = (I)0; - contiguous_ = false; - quick_ = false; - } else { - lowest_member_ = slow_set_.front(); - highest_member_ = slow_set_.back(); - size_t range = highest_member_ + 1 - lowest_member_; - if (range == slow_set_.size()) { - contiguous_ = true; - quick_ = false; - } else { - contiguous_ = false; - // If it would be more compact to store as bool - if (range < slow_set_.size() * 8 * sizeof(I)) { - // (assuming 1 bit per element)... - quick_set_.resize(range, false); - for (size_t i = 0; i < slow_set_.size(); i++) - quick_set_[slow_set_[i] - lowest_member_] = true; - quick_ = true; - } else { - quick_ = false; - } - } - } -} - -template -int ConstIntegerSet::count(I i) const { - if (i < lowest_member_ || i > highest_member_) { - return 0; - } else { - if (contiguous_) return true; - if (quick_) { - return (quick_set_[i - lowest_member_] ? 1 : 0); - } else { - bool ans = std::binary_search(slow_set_.begin(), slow_set_.end(), i); - return (ans ? 1 : 0); - } - } -} - -template -void ConstIntegerSet::Write(std::ostream &os, bool binary) const { - WriteIntegerVector(os, binary, slow_set_); -} - -template -void ConstIntegerSet::Read(std::istream &is, bool binary) { - ReadIntegerVector(is, binary, &slow_set_); - InitInternal(); -} - -} // end namespace kaldi - -#endif // KALDI_UTIL_CONST_INTEGER_SET_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/const-integer-set.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/const-integer-set.h deleted file mode 100644 index 809a56a7c83804bfaa4badb5e28059734bfcad1e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/const-integer-set.h +++ /dev/null @@ -1,96 +0,0 @@ -// util/const-integer-set.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_CONST_INTEGER_SET_H_ -#define KALDI_UTIL_CONST_INTEGER_SET_H_ -#include -#include -#include -#include -#include -#include "util/stl-utils.h" - -/* ConstIntegerSet is a way to efficiently test whether something is in a - supplied set of integers. It can be initialized from a vector or set, but - never changed after that. It either uses a sorted vector or an array of - bool, depending on the input. It behaves like a const version of an STL set, - with only a subset of the functionality, except all the member functions are - upper-case. - - Note that we could get rid of the member slow_set_, but we'd have to - do more work to implement an iterator type. This would save memory. 
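(Editor's aside, not part of the patch: the comment above describes the removed `kaldi::ConstIntegerSet` container. Below is a minimal usage sketch based only on the interface declared in the deleted `util/const-integer-set.h`; the integer values are illustrative.)

```cpp
#include <vector>
#include "util/const-integer-set.h"  // the header being removed by this patch

void Example() {
  // Input need not be sorted or unique; the constructor sorts and de-duplicates,
  // then chooses between a sorted vector and a bit-array representation.
  std::vector<int> phones = {7, 1, 3, 104, 3};
  kaldi::ConstIntegerSet<int> silence_set(phones);

  if (silence_set.count(3)) {  // count() returns 1 or 0
    // 3 is a member
  }
  for (kaldi::ConstIntegerSet<int>::iterator it = silence_set.begin();
       it != silence_set.end(); ++it) {
    // iterates the sorted, de-duplicated members: 1, 3, 7, 104
  }
}
```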
-*/ - -namespace kaldi { - -template -class ConstIntegerSet { - public: - ConstIntegerSet() : lowest_member_(1), highest_member_(0) {} - - void Init(const std::vector &input) { - slow_set_ = input; - SortAndUniq(&slow_set_); - InitInternal(); - } - - void Init(const std::set &input) { - CopySetToVector(input, &slow_set_); - InitInternal(); - } - - explicit ConstIntegerSet(const std::vector &input) : slow_set_(input) { - SortAndUniq(&slow_set_); - InitInternal(); - } - explicit ConstIntegerSet(const std::set &input) { - CopySetToVector(input, &slow_set_); - InitInternal(); - } - explicit ConstIntegerSet(const ConstIntegerSet &other) - : slow_set_(other.slow_set_) { - InitInternal(); - } - - int count(I i) const; // returns 1 or 0. - - typedef typename std::vector::const_iterator iterator; - iterator begin() const { return slow_set_.begin(); } - iterator end() const { return slow_set_.end(); } - size_t size() const { return slow_set_.size(); } - bool empty() const { return slow_set_.empty(); } - - void Write(std::ostream &os, bool binary) const; - void Read(std::istream &is, bool binary); - - private: - I lowest_member_; - I highest_member_; - bool contiguous_; - bool quick_; - std::vector quick_set_; - std::vector slow_set_; - void InitInternal(); -}; - -} // end namespace kaldi - -#include "util/const-integer-set-inl.h" - -#endif // KALDI_UTIL_CONST_INTEGER_SET_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/hash-list-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/hash-list-inl.h deleted file mode 100644 index 063fa7131ec618f0aae9dc30f4edd26c9dcce7fe..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/hash-list-inl.h +++ /dev/null @@ -1,193 +0,0 @@ -// util/hash-list-inl.h - -// Copyright 2009-2011 Microsoft Corporation -// 2013 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_HASH_LIST_INL_H_ -#define KALDI_UTIL_HASH_LIST_INL_H_ - -// Do not include this file directly. It is included by fast-hash.h - -namespace kaldi { - -template -HashList::HashList() { - list_head_ = NULL; - bucket_list_tail_ = static_cast(-1); // invalid. - hash_size_ = 0; - freed_head_ = NULL; -} - -template -void HashList::SetSize(size_t size) { - hash_size_ = size; - KALDI_ASSERT(list_head_ == NULL && - bucket_list_tail_ == - static_cast(-1)); // make sure empty. - if (size > buckets_.size()) buckets_.resize(size, HashBucket(0, NULL)); -} - -template -typename HashList::Elem *HashList::Clear() { - // Clears the hashtable and gives ownership of the currently contained list - // to the user. 
- for (size_t cur_bucket = bucket_list_tail_; - cur_bucket != static_cast(-1); - cur_bucket = buckets_[cur_bucket].prev_bucket) { - buckets_[cur_bucket].last_elem = NULL; // this is how we indicate "empty". - } - bucket_list_tail_ = static_cast(-1); - Elem *ans = list_head_; - list_head_ = NULL; - return ans; -} - -template -const typename HashList::Elem *HashList::GetList() const { - return list_head_; -} - -template -inline void HashList::Delete(Elem *e) { - e->tail = freed_head_; - freed_head_ = e; -} - -template -inline typename HashList::Elem *HashList::Find(I key) { - size_t index = (static_cast(key) % hash_size_); - HashBucket &bucket = buckets_[index]; - if (bucket.last_elem == NULL) { - return NULL; // empty bucket. - } else { - Elem *head = (bucket.prev_bucket == static_cast(-1) - ? list_head_ - : buckets_[bucket.prev_bucket].last_elem->tail), - *tail = bucket.last_elem->tail; - for (Elem *e = head; e != tail; e = e->tail) - if (e->key == key) return e; - return NULL; // Not found. - } -} - -template -inline typename HashList::Elem *HashList::New() { - if (freed_head_) { - Elem *ans = freed_head_; - freed_head_ = freed_head_->tail; - return ans; - } else { - Elem *tmp = new Elem[allocate_block_size_]; - for (size_t i = 0; i + 1 < allocate_block_size_; i++) - tmp[i].tail = tmp + i + 1; - tmp[allocate_block_size_ - 1].tail = NULL; - freed_head_ = tmp; - allocated_.push_back(tmp); - return this->New(); - } -} - -template -HashList::~HashList() { - // First test whether we had any memory leak within the - // HashList, i.e. things for which the user did not call Delete(). - size_t num_in_list = 0, num_allocated = 0; - for (Elem *e = freed_head_; e != NULL; e = e->tail) num_in_list++; - for (size_t i = 0; i < allocated_.size(); i++) { - num_allocated += allocate_block_size_; - delete[] allocated_[i]; - } - if (num_in_list != num_allocated) { - KALDI_WARN << "Possible memory leak: " << num_in_list - << " != " << num_allocated - << ": you might have forgotten to call Delete on " - << "some Elems"; - } -} - -template -inline typename HashList::Elem *HashList::Insert(I key, T val) { - size_t index = (static_cast(key) % hash_size_); - HashBucket &bucket = buckets_[index]; - // Check the element is existing or not. - if (bucket.last_elem != NULL) { - Elem *head = (bucket.prev_bucket == static_cast(-1) - ? list_head_ - : buckets_[bucket.prev_bucket].last_elem->tail), - *tail = bucket.last_elem->tail; - for (Elem *e = head; e != tail; e = e->tail) - if (e->key == key) return e; - } - - // This is a new element. Insert it. - Elem *elem = New(); - elem->key = key; - elem->val = val; - if (bucket.last_elem == NULL) { // Unoccupied bucket. Insert at - // head of bucket list (which is tail of regular list, they go in - // opposite directions). - if (bucket_list_tail_ == static_cast(-1)) { - // list was empty so this is the first elem. - KALDI_ASSERT(list_head_ == NULL); - list_head_ = elem; - } else { - // link in to the chain of Elems - buckets_[bucket_list_tail_].last_elem->tail = elem; - } - elem->tail = NULL; - bucket.last_elem = elem; - bucket.prev_bucket = bucket_list_tail_; - bucket_list_tail_ = index; - } else { - // Already-occupied bucket. Insert at tail of list of elements within - // the bucket. 
- elem->tail = bucket.last_elem->tail; - bucket.last_elem->tail = elem; - bucket.last_elem = elem; - } - return elem; -} - -template -void HashList::InsertMore(I key, T val) { - size_t index = (static_cast(key) % hash_size_); - HashBucket &bucket = buckets_[index]; - Elem *elem = New(); - elem->key = key; - elem->val = val; - - KALDI_ASSERT(bucket.last_elem != NULL); // assume one element is already here - if (bucket.last_elem->key == key) { // standard behavior: add as last element - elem->tail = bucket.last_elem->tail; - bucket.last_elem->tail = elem; - bucket.last_elem = elem; - return; - } - Elem *e = (bucket.prev_bucket == static_cast(-1) - ? list_head_ - : buckets_[bucket.prev_bucket].last_elem->tail); - // find place to insert in linked list - while (e != bucket.last_elem->tail && e->key != key) e = e->tail; - KALDI_ASSERT(e->key == key); // not found? - should not happen - elem->tail = e->tail; - e->tail = elem; -} - -} // end namespace kaldi - -#endif // KALDI_UTIL_HASH_LIST_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/hash-list.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/hash-list.h deleted file mode 100644 index 31cc9bdc4870773475f8c5139539e320746bf5fe..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/hash-list.h +++ /dev/null @@ -1,146 +0,0 @@ -// util/hash-list.h - -// Copyright 2009-2011 Microsoft Corporation -// 2013 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_HASH_LIST_H_ -#define KALDI_UTIL_HASH_LIST_H_ - -#include -#include -#include -#include -#include - -#include "base/kaldi-error.h" - -/* This header provides utilities for a structure that's used in a decoder (but - is quite generic in nature so we implement and test it separately). - Basically it's a singly-linked list, but implemented in such a way that we - can quickly search for elements in the list. We give it a slightly richer - interface than just a hash and a list. The idea is that we want to separate - the hash part and the list part: basically, in the decoder, we want to have a - single hash for the current frame and the next frame, because by the time we - need to access the hash for the next frame we no longer need the hash for the - previous frame. So we have an operation that clears the hash but leaves the - list structure intact. We also control memory management inside this object, - to avoid repeated new's/deletes. - - See hash-list-test.cc for an example of how to use this object. -*/ - -namespace kaldi { - -template -class HashList { - public: - struct Elem { - I key; - T val; - Elem *tail; - }; - - /// Constructor takes no arguments. - /// Call SetSize to inform it of the likely size. 
- HashList(); - - /// Clears the hash and gives the head of the current list to the user; - /// ownership is transferred to the user (the user must call Delete() - /// for each element in the list, at his/her leisure). - Elem *Clear(); - - /// Gives the head of the current list to the user. Ownership retained in the - /// class. Caution: in December 2013 the return type was changed to const - /// Elem* and this function was made const. You may need to change some types - /// of local Elem* variables to const if this produces compilation errors. - const Elem *GetList() const; - - /// Think of this like delete(). It is to be called for each Elem in turn - /// after you "obtained ownership" by doing Clear(). This is not the opposite - /// of. Insert, it is the opposite of New. It's really a memory operation. - inline void Delete(Elem *e); - - /// This should probably not be needed to be called directly by the user. - /// Think of it as opposite - /// to Delete(); - inline Elem *New(); - - /// Find tries to find this element in the current list using the hashtable. - /// It returns NULL if not present. The Elem it returns is not owned by the - /// user, it is part of the internal list owned by this object, but the user - /// is free to modify the "val" element. - inline Elem *Find(I key); - - /// Insert inserts a new element into the hashtable/stored list. - /// Because element keys in a hashtable are unique, this operation checks - /// whether each inserted element has a key equivalent to the one of an - /// element already in the hashtable. If so, the element is not inserted, - /// returning an pointer to this existing element. - inline Elem *Insert(I key, T val); - - /// Insert inserts another element with same key into the hashtable/ - /// stored list. - /// By calling this, the user asserts that one element with that key is - /// already present. - /// We insert it that way, that all elements with the same key - /// follow each other. - /// Find() will return the first one of the elements with the same key. - inline void InsertMore(I key, T val); - - /// SetSize tells the object how many hash buckets to allocate (should - /// typically be at least twice the number of objects we expect to go in the - /// structure, for fastest performance). It must be called while the hash - /// is empty (e.g. after Clear() or after initializing the object, but before - /// adding anything to the hash. - void SetSize(size_t sz); - - /// Returns current number of hash buckets. - inline size_t Size() { return hash_size_; } - - ~HashList(); - - private: - struct HashBucket { - size_t prev_bucket; // index to next bucket (-1 if list tail). Note: - // list of buckets goes in opposite direction to list of Elems. - Elem *last_elem; // pointer to last element in this bucket (NULL if empty) - inline HashBucket(size_t i, Elem *e) : prev_bucket(i), last_elem(e) {} - }; - - Elem *list_head_; // head of currently stored list. - size_t bucket_list_tail_; // tail of list of active hash buckets. - - size_t hash_size_; // number of hash buckets. - - std::vector buckets_; - - Elem *freed_head_; // head of list of currently freed elements. [ready for - // allocation] - - std::vector allocated_; // list of allocated blocks. - - static const size_t allocate_block_size_ = 1024; // Number of Elements to - // allocate in one block. Must be largish so storing allocated_ doesn't - // become a problem. 
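(Editor's aside, not part of the patch: the doc comments above spell out the intended Insert/Find/Clear/Delete life cycle of the removed `kaldi::HashList`. A minimal sketch of that pattern follows; `StateId` and `Token` are illustrative stand-ins for the decoder types that normally instantiate it.)

```cpp
#include <algorithm>
#include "util/hash-list.h"  // the header being removed by this patch

typedef int StateId;
typedef double Token;
typedef kaldi::HashList<StateId, Token> TokenHash;

void ProcessFrame(TokenHash *toks) {
  toks->SetSize(2048);               // only legal while the hash is empty
  toks->Insert(42, 1.5);             // new key: allocates (or reuses) an Elem
  if (TokenHash::Elem *e = toks->Find(42))
    e->val = std::min(e->val, 0.7);  // vals may be updated in place

  // End of frame: Clear() hands the whole list back to the caller, who must
  // Delete() every Elem (this just returns the storage to a freelist).
  for (TokenHash::Elem *head = toks->Clear(); head != nullptr; ) {
    TokenHash::Elem *next = head->tail;
    toks->Delete(head);
    head = next;
  }
}
```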
-}; - -} // end namespace kaldi - -#include "util/hash-list-inl.h" - -#endif // KALDI_UTIL_HASH_LIST_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/kaldi-io-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/kaldi-io-inl.h deleted file mode 100644 index 8b0c92131c4af2113eb33da6f3cfa9dc4dee83e1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/kaldi-io-inl.h +++ /dev/null @@ -1,40 +0,0 @@ -// util/kaldi-io-inl.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. -#ifndef KALDI_UTIL_KALDI_IO_INL_H_ -#define KALDI_UTIL_KALDI_IO_INL_H_ - -#include - -namespace kaldi { - -bool Input::Open(const std::string &rxfilename, bool *binary) { - return OpenInternal(rxfilename, true, binary); -} - -bool Input::OpenTextMode(const std::string &rxfilename) { - return OpenInternal(rxfilename, false, NULL); -} - -bool Input::IsOpen() { return impl_ != NULL; } - -bool Output::IsOpen() { return impl_ != NULL; } - -} // end namespace kaldi. - -#endif // KALDI_UTIL_KALDI_IO_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/kaldi-io.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/kaldi-io.cc deleted file mode 100644 index 5f8ec4870138df32f6aca9c12383cf3885411741..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/kaldi-io.cc +++ /dev/null @@ -1,898 +0,0 @@ -// util/kaldi-io.cc - -// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
-#include "util/kaldi-io.h" - -#include -#include -#include - -#include - -#include "base/io-funcs.h" -#include "base/kaldi-math.h" -#include "util/kaldi-pipebuf.h" -#include "util/parse-options.h" -#include "util/text-utils.h" - -#ifdef KALDI_CYGWIN_COMPAT -#include "util/kaldi-cygwin-io-inl.h" -#define MapOsPath(x) MapCygwinPath(x) -#else // KALDI_CYGWIN_COMPAT -#define MapOsPath(x) x -#endif // KALDI_CYGWIN_COMPAT - -#if defined(_MSC_VER) -static FILE *popen(const char *command, const char *mode) { -#ifdef KALDI_CYGWIN_COMPAT - return kaldi::CygwinCompatPopen(command, mode); -#else // KALDI_CYGWIN_COMPAT - return _popen(command, mode); -#endif // KALDI_CYGWIN_COMPAT -} -#endif // _MSC_VER - -namespace kaldi { - -#ifndef _MSC_VER // on VS, we don't need this type. -// could replace basic_pipebuf with stdio_filebuf on some platforms. -// Would mean we could use less of our own code. -typedef basic_pipebuf PipebufType; -#endif -} // namespace kaldi - -namespace kaldi { - -std::string PrintableRxfilename(const std::string &rxfilename) { - if (rxfilename == "" || rxfilename == "-") { - return "standard input"; - } else { - // If this call to Escape later causes compilation issues, - // just replace it with "return rxfilename"; it's only a - // pretty-printing issue. - return ParseOptions::Escape(rxfilename); - } -} - -std::string PrintableWxfilename(const std::string &wxfilename) { - if (wxfilename == "" || wxfilename == "-") { - return "standard output"; - } else { - // If this call to Escape later causes compilation issues, - // just replace it with "return wxfilename"; it's only a - // pretty-printing issue. - return ParseOptions::Escape(wxfilename); - } -} - -OutputType ClassifyWxfilename(const std::string &filename) { - const char *c = filename.c_str(); - size_t length = filename.length(); - char first_char = c[0], - last_char = (length == 0 ? '\0' : c[filename.length() - 1]); - - // if 'filename' is "" or "-", return kStandardOutput. - if (length == 0 || (length == 1 && first_char == '-')) { - return kStandardOutput; - } else if (first_char == '|') { - return kPipeOutput; // An output pipe like "|blah". - } else if (isspace(first_char) || isspace(last_char) || last_char == '|') { - return kNoOutput; // Leading or trailing space: can't interpret this. - // Final '|' would represent an input pipe, not an - // output pipe. - // } else if ((first_char == 'a' || first_char == 's') && - // strchr(c, ':') != NULL && - // (ClassifyWspecifier(filename, NULL, NULL, NULL) != - // kNoWspecifier || - // ClassifyRspecifier(filename, NULL, NULL) != kNoRspecifier)) { - // // e.g. ark:something or scp:something... this is almost certainly a - // // scripting error, so call it an error rather than treating it as a - // file. - // // In practice in modern kaldi scripts all (r,w)filenames begin with - // "ark" - // // or "scp", even though technically speaking options like "b", "t", - // "s" or - // // "cs" can appear before the ark or scp, like "b,ark". For - // efficiency, - // // and because this code is really just a nicety to catch errors - // earlier - // // than they would otherwise be caught, we only call those extra - // functions - // // for filenames beginning with 'a' or 's'. - // return kNoOutput; - } else if (isdigit(last_char)) { - // This could be a file, but we have to see if it's an offset into a file - // (like foo.ark:4314328), which is not allowed for writing (but is - // allowed for reaching). 
This eliminates some things which would be - // valid UNIX filenames but are not allowed by Kaldi. (Even if we allowed - // such filenames for writing, we woudln't be able to correctly read them). - const char *d = c + length - 1; - while (isdigit(*d) && d > c) d--; - if (*d == ':') return kNoOutput; - // else it could still be a filename; continue to the next check. - } - - // At this point it matched no other pattern so we assume a filename, but we - // check for internal '|' as it's a common source of errors to have pipe - // commands without the pipe in the right place. Say that it can't be - // classified. - if (strchr(c, '|') != NULL) { - KALDI_WARN << "Trying to classify wxfilename with pipe symbol in the" - " wrong place (pipe without | at the beginning?): " - << filename; - return kNoOutput; - } - return kFileOutput; // It matched no other pattern: assume it's a filename. -} - -InputType ClassifyRxfilename(const std::string &filename) { - const char *c = filename.c_str(); - size_t length = filename.length(); - char first_char = c[0], - last_char = (length == 0 ? '\0' : c[filename.length() - 1]); - - // if 'filename' is "" or "-", return kStandardInput. - if (length == 0 || (length == 1 && first_char == '-')) { - return kStandardInput; - } else if (first_char == '|') { - return kNoInput; // An output pipe like "|blah": not - // valid for input. - } else if (last_char == '|') { - return kPipeInput; - } else if (isspace(first_char) || isspace(last_char)) { - return kNoInput; // We don't allow leading or trailing space in a filename. - // } else if ((first_char == 'a' || first_char == 's') && - // strchr(c, ':') != NULL && - // (ClassifyWspecifier(filename, NULL, NULL, NULL) != - // kNoWspecifier || - // ClassifyRspecifier(filename, NULL, NULL) != kNoRspecifier)) { - // // e.g. ark:something or scp:something... this is almost certainly a - // // scripting error, so call it an error rather than treating it as a - // file. - // // In practice in modern kaldi scripts all (r,w)filenames begin with - // "ark" - // // or "scp", even though technically speaking options like "b", "t", - // "s" or - // // "cs" can appear before the ark or scp, like "b,ark". For - // efficiency, - // // and because this code is really just a nicety to catch errors - // earlier - // // than they would otherwise be caught, we only call those extra - // functions - // // for filenames beginning with 'a' or 's'. - // return kNoInput; - } else if (isdigit(last_char)) { - const char *d = c + length - 1; - while (isdigit(*d) && d > c) d--; - if (*d == ':') - return kOffsetFileInput; // Filename is like - // some_file:12345 - // otherwise it could still be a filename; continue to the next check. - } - - // At this point it matched no other pattern so we assume a filename, but - // we check for '|' as it's a common source of errors to have pipe - // commands without the pipe in the right place. Say that it can't be - // classified in this case. - if (strchr(c, '|') != NULL) { - KALDI_WARN << "Trying to classify rxfilename with pipe symbol in the" - " wrong place (pipe without | at the end?): " - << filename; - return kNoInput; - } - return kFileInput; // It matched no other pattern: assume it's a filename. -} - -class OutputImplBase { - public: - // Open will open it as a file (no header), and return true - // on success. It cannot be called on an already open stream. 
- virtual bool Open(const std::string &filename, bool binary) = 0; - virtual std::ostream &Stream() = 0; - virtual bool Close() = 0; - virtual ~OutputImplBase() {} -}; - -class FileOutputImpl : public OutputImplBase { - public: - virtual bool Open(const std::string &filename, bool binary) { - if (os_.is_open()) - KALDI_ERR << "FileOutputImpl::Open(), " - << "open called on already open file."; - filename_ = filename; - os_.open(MapOsPath(filename_).c_str(), - binary ? std::ios_base::out | std::ios_base::binary - : std::ios_base::out); - return os_.is_open(); - } - - virtual std::ostream &Stream() { - if (!os_.is_open()) - KALDI_ERR << "FileOutputImpl::Stream(), file is not open."; - // I believe this error can only arise from coding error. - return os_; - } - - virtual bool Close() { - if (!os_.is_open()) - KALDI_ERR << "FileOutputImpl::Close(), file is not open."; - // I believe this error can only arise from coding error. - os_.close(); - return !(os_.fail()); - } - virtual ~FileOutputImpl() { - if (os_.is_open()) { - os_.close(); - if (os_.fail()) KALDI_ERR << "Error closing output file " << filename_; - } - } - - private: - std::string filename_; - std::ofstream os_; -}; - -class StandardOutputImpl : public OutputImplBase { - public: - StandardOutputImpl() : is_open_(false) {} - - virtual bool Open(const std::string &filename, bool binary) { - if (is_open_) - KALDI_ERR << "StandardOutputImpl::Open(), " - "open called on already open file."; -#ifdef _MSC_VER - _setmode(_fileno(stdout), binary ? _O_BINARY : _O_TEXT); -#endif - is_open_ = std::cout.good(); - return is_open_; - } - - virtual std::ostream &Stream() { - if (!is_open_) - KALDI_ERR << "StandardOutputImpl::Stream(), object not initialized."; - // I believe this error can only arise from coding error. - return std::cout; - } - - virtual bool Close() { - if (!is_open_) - KALDI_ERR << "StandardOutputImpl::Close(), file is not open."; - is_open_ = false; - std::cout << std::flush; - return !(std::cout.fail()); - } - virtual ~StandardOutputImpl() { - if (is_open_) { - std::cout << std::flush; - if (std::cout.fail()) KALDI_ERR << "Error writing to standard output"; - } - } - - private: - bool is_open_; -}; - -class PipeOutputImpl : public OutputImplBase { - public: - PipeOutputImpl() : f_(NULL), os_(NULL) {} - - virtual bool Open(const std::string &wxfilename, bool binary) { - filename_ = wxfilename; - KALDI_ASSERT(f_ == NULL); // Make sure closed. - KALDI_ASSERT(wxfilename.length() != 0 && wxfilename[0] == '|'); // should - // start with '|' - std::string cmd_name(wxfilename, 1); -#if defined(_MSC_VER) || defined(__CYGWIN__) - f_ = popen(cmd_name.c_str(), (binary ? "wb" : "w")); -#else - f_ = popen(cmd_name.c_str(), "w"); -#endif - if (!f_) { // Failure. - KALDI_WARN << "Failed opening pipe for writing, command is: " << cmd_name - << ", errno is " << strerror(errno); - return false; - } else { -#ifndef _MSC_VER - fb_ = new PipebufType(f_, // Using this constructor won't make the - // destructor try to close the stream when - // we're done. - (binary ? std::ios_base::out | std::ios_base::binary - : std::ios_base::out)); - KALDI_ASSERT(fb_ != NULL); // or would be alloc error. - os_ = new std::ostream(fb_); -#else - os_ = new std::ofstream(f_); -#endif - return os_->good(); - } - } - - virtual std::ostream &Stream() { - if (os_ == NULL) - KALDI_ERR << "PipeOutputImpl::Stream()," - " object not initialized."; - // I believe this error can only arise from coding error. 
- return *os_; - } - - virtual bool Close() { - if (os_ == NULL) KALDI_ERR << "PipeOutputImpl::Close(), file is not open."; - bool ok = true; - os_->flush(); - if (os_->fail()) ok = false; - delete os_; - os_ = NULL; - int status; -#ifdef _MSC_VER - status = _pclose(f_); -#else - status = pclose(f_); -#endif - if (status) - KALDI_WARN << "Pipe " << filename_ << " had nonzero return status " - << status; - f_ = NULL; -#ifndef _MSC_VER - delete fb_; - fb_ = NULL; -#endif - return ok; - } - virtual ~PipeOutputImpl() { - if (os_) { - if (!Close()) - KALDI_ERR << "Error writing to pipe " << PrintableWxfilename(filename_); - } - } - - private: - std::string filename_; - FILE *f_; -#ifndef _MSC_VER - PipebufType *fb_; -#endif - std::ostream *os_; -}; - -class InputImplBase { - public: - // Open will open it as a file, and return true on success. - // May be called twice only for kOffsetFileInput (otherwise, - // if called twice, we just create a new Input object, to avoid - // having to deal with the extra hassle of reopening with the - // same object. - // Note that we will to call Open with true (binary) for - // for text-mode Kaldi files; the only actual text-mode input - // is for non-Kaldi files. - virtual bool Open(const std::string &filename, bool binary) = 0; - virtual std::istream &Stream() = 0; - virtual int32 Close() = 0; // We only need to check failure in the case of - // kPipeInput. - // on close for input streams. - virtual InputType MyType() = 0; // Because if it's kOffsetFileInput, we may - // call Open twice - // (has efficiency benefits). - - virtual ~InputImplBase() {} -}; - -class FileInputImpl : public InputImplBase { - public: - virtual bool Open(const std::string &filename, bool binary) { - if (is_.is_open()) - KALDI_ERR << "FileInputImpl::Open(), " - << "open called on already open file."; - is_.open( - MapOsPath(filename).c_str(), - binary ? std::ios_base::in | std::ios_base::binary : std::ios_base::in); - return is_.is_open(); - } - - virtual std::istream &Stream() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Stream(), file is not open."; - // I believe this error can only arise from coding error. - return is_; - } - - virtual int32 Close() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Close(), file is not open."; - // I believe this error can only arise from coding error. - is_.close(); - // Don't check status. - return 0; - } - - virtual InputType MyType() { return kFileInput; } - - virtual ~FileInputImpl() { - // Stream will automatically be closed, and we don't care about - // whether it fails. - } - - private: - std::ifstream is_; -}; - -class StandardInputImpl : public InputImplBase { - public: - StandardInputImpl() : is_open_(false) {} - - virtual bool Open(const std::string &filename, bool binary) { - if (is_open_) - KALDI_ERR << "StandardInputImpl::Open(), " - "open called on already open file."; - is_open_ = true; -#ifdef _MSC_VER - _setmode(_fileno(stdin), binary ? _O_BINARY : _O_TEXT); -#endif - return true; // Don't check good() because would be false if - // eof, which may be valid input. - } - - virtual std::istream &Stream() { - if (!is_open_) - KALDI_ERR << "StandardInputImpl::Stream(), object not initialized."; - // I believe this error can only arise from coding error. 
- return std::cin; - } - - virtual InputType MyType() { return kStandardInput; } - - virtual int32 Close() { - if (!is_open_) KALDI_ERR << "StandardInputImpl::Close(), file is not open."; - is_open_ = false; - return 0; - } - virtual ~StandardInputImpl() {} - - private: - bool is_open_; -}; - -class PipeInputImpl : public InputImplBase { - public: - PipeInputImpl() : f_(NULL), is_(NULL) {} - - virtual bool Open(const std::string &rxfilename, bool binary) { - filename_ = rxfilename; - KALDI_ASSERT(f_ == NULL); // Make sure closed. - KALDI_ASSERT(rxfilename.length() != 0 && - rxfilename[rxfilename.length() - 1] == - '|'); // should end with '|' - std::string cmd_name(rxfilename, 0, rxfilename.length() - 1); -#if defined(_MSC_VER) || defined(__CYGWIN__) - f_ = popen(cmd_name.c_str(), (binary ? "rb" : "r")); -#else - f_ = popen(cmd_name.c_str(), "r"); -#endif - - if (!f_) { // Failure. - KALDI_WARN << "Failed opening pipe for reading, command is: " << cmd_name - << ", errno is " << strerror(errno); - return false; - } else { -#ifndef _MSC_VER - fb_ = new PipebufType(f_, // Using this constructor won't lead the - // destructor to close the stream. - (binary ? std::ios_base::in | std::ios_base::binary - : std::ios_base::in)); - KALDI_ASSERT(fb_ != NULL); // or would be alloc error. - is_ = new std::istream(fb_); -#else - is_ = new std::ifstream(f_); -#endif - if (is_->fail() || is_->bad()) return false; - if (is_->eof()) { - KALDI_WARN << "Pipe opened with command " - << PrintableRxfilename(rxfilename) << " is empty."; - // don't return false: empty may be valid. - } - return true; - } - } - - virtual std::istream &Stream() { - if (is_ == NULL) - KALDI_ERR << "PipeInputImpl::Stream(), object not initialized."; - // I believe this error can only arise from coding error. - return *is_; - } - - virtual int32 Close() { - if (is_ == NULL) KALDI_ERR << "PipeInputImpl::Close(), file is not open."; - delete is_; - is_ = NULL; - int32 status; -#ifdef _MSC_VER - status = _pclose(f_); -#else - status = pclose(f_); -#endif - if (status) - KALDI_WARN << "Pipe " << filename_ << " had nonzero return status " - << status; - f_ = NULL; -#ifndef _MSC_VER - delete fb_; - fb_ = NULL; -#endif - return status; - } - virtual ~PipeInputImpl() { - if (is_) Close(); - } - virtual InputType MyType() { return kPipeInput; } - - private: - std::string filename_; - FILE *f_; -#ifndef _MSC_VER - PipebufType *fb_; -#endif - std::istream *is_; -}; - -/* -#else - -// Just have an empty implementation of the pipe input that crashes if -// called. -class PipeInputImpl: public InputImplBase { - public: - PipeInputImpl() { KALDI_ASSERT(0 && "Pipe input not yet supported on this - platform."); } - virtual bool Open(const std::string, bool) { return 0; } - virtual std::istream &Stream() const { return NULL; } - virtual void Close() {} - virtual InputType MyType() { return kPipeInput; } -}; - -#endif -*/ - -class OffsetFileInputImpl : public InputImplBase { - // This class is a bit more complicated than the - - public: - // splits a filename like /my/file:123 into /my/file and the - // number 123. Crashes if not this format. - static void SplitFilename(const std::string &rxfilename, - std::string *filename, size_t *offset) { - size_t pos = rxfilename.find_last_of(':'); - KALDI_ASSERT(pos != std::string::npos); // would indicate error in calling - // code, as the filename is supposed to be of the correct form at this - // point. 
- *filename = std::string(rxfilename, 0, pos); - std::string number(rxfilename, pos + 1); - bool ans = ConvertStringToInteger(number, offset); - if (!ans) - KALDI_ERR << "Cannot get offset from filename " << rxfilename - << " (possibly you compiled in 32-bit and have a >32-bit" - << " byte offset into a file; you'll have to compile 64-bit."; - } - - bool Seek(size_t offset) { - size_t cur_pos = is_.tellg(); - if (cur_pos == offset) { - return true; - } else if (cur_pos < offset && cur_pos + 100 > offset) { - // We're close enough that it may be faster to just - // read that data, rather than seek. - for (size_t i = cur_pos; i < offset; i++) is_.get(); - return (is_.tellg() == std::streampos(offset)); - } - // Try to actually seek. - is_.seekg(offset, std::ios_base::beg); - if (is_.fail()) { // failbit or badbit is set [error happened] - is_.close(); - return false; // failure. - } else { - is_.clear(); // Clear any failure bits (e.g. eof). - return true; // success. - } - } - - // This Open routine is unusual in that it is designed to work even - // if it was already open. This for efficiency when seeking multiple - // times. - virtual bool Open(const std::string &rxfilename, bool binary) { - if (is_.is_open()) { - // We are opening when we have an already-open file. - // We may have to seek within this file, or else close it and - // open a different one. - std::string tmp_filename; - size_t offset; - SplitFilename(rxfilename, &tmp_filename, &offset); - if (tmp_filename == filename_ && binary == binary_) { // Just seek - is_.clear(); // clear fail bit, etc. - return Seek(offset); - } else { - is_.close(); // don't bother checking error status of is_. - filename_ = tmp_filename; - is_.open(MapOsPath(filename_).c_str(), - binary ? std::ios_base::in | std::ios_base::binary - : std::ios_base::in); - if (!is_.is_open()) - return false; - else - return Seek(offset); - } - } else { - size_t offset; - SplitFilename(rxfilename, &filename_, &offset); - binary_ = binary; - is_.open(MapOsPath(filename_).c_str(), - binary ? std::ios_base::in | std::ios_base::binary - : std::ios_base::in); - if (!is_.is_open()) - return false; - else - return Seek(offset); - } - } - - virtual std::istream &Stream() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Stream(), file is not open."; - // I believe this error can only arise from coding error. - return is_; - } - - virtual int32 Close() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Close(), file is not open."; - // I believe this error can only arise from coding error. - is_.close(); - // Don't check status. - return 0; - } - - virtual InputType MyType() { return kOffsetFileInput; } - - virtual ~OffsetFileInputImpl() { - // Stream will automatically be closed, and we don't care about - // whether it fails. - } - - private: - std::string filename_; // the actual filename - bool binary_; // true if was opened in binary mode. - std::ifstream is_; -}; - -Output::Output(const std::string &wxfilename, bool binary, bool write_header) - : impl_(NULL) { - if (!Open(wxfilename, binary, write_header)) { - if (impl_) { - delete impl_; - impl_ = NULL; - } - KALDI_ERR << "Error opening output stream " - << PrintableWxfilename(wxfilename); - } -} - -bool Output::Close() { - if (!impl_) { - return false; // error to call Close if not open. 
- } else { - bool ans = impl_->Close(); - delete impl_; - impl_ = NULL; - return ans; - } -} - -Output::~Output() { - if (impl_) { - bool ok = impl_->Close(); - delete impl_; - impl_ = NULL; - if (!ok) - KALDI_ERR << "Error closing output file " - << PrintableWxfilename(filename_) - << (ClassifyWxfilename(filename_) == kFileOutput - ? " (disk full?)" - : ""); - } -} - -std::ostream &Output::Stream() { // will throw if not open; else returns - // stream. - if (!impl_) KALDI_ERR << "Output::Stream() called but not open."; - return impl_->Stream(); -} - -bool Output::Open(const std::string &wxfn, bool binary, bool header) { - if (IsOpen()) { - if (!Close()) { // Throw here rather than return status, as it's an error - // about something else: if the user wanted to avoid the exception he/she - // could have called Close(). - KALDI_ERR << "Output::Open(), failed to close output stream: " - << PrintableWxfilename(filename_); - } - } - - filename_ = wxfn; - - OutputType type = ClassifyWxfilename(wxfn); - KALDI_ASSERT(impl_ == NULL); - - if (type == kFileOutput) { - impl_ = new FileOutputImpl(); - } else if (type == kStandardOutput) { - impl_ = new StandardOutputImpl(); - } else if (type == kPipeOutput) { - impl_ = new PipeOutputImpl(); - } else { // type == kNoOutput - KALDI_WARN << "Invalid output filename format " - << PrintableWxfilename(wxfn); - return false; - } - if (!impl_->Open(wxfn, binary)) { - delete impl_; - impl_ = NULL; - return false; // failed to open. - } else { // successfully opened it. - if (header) { - InitKaldiOutputStream(impl_->Stream(), binary); - bool ok = impl_->Stream().good(); // still OK? - if (!ok) { - delete impl_; - impl_ = NULL; - return false; - } - return true; - } else { - return true; - } - } -} - -Input::Input(const std::string &rxfilename, bool *binary) : impl_(NULL) { - if (!Open(rxfilename, binary)) { - KALDI_ERR << "Error opening input stream " - << PrintableRxfilename(rxfilename); - } -} - -int32 Input::Close() { - if (impl_) { - int32 ans = impl_->Close(); - delete impl_; - impl_ = NULL; - return ans; - } else { - return 0; - } -} - -bool Input::OpenInternal(const std::string &rxfilename, bool file_binary, - bool *contents_binary) { - InputType type = ClassifyRxfilename(rxfilename); - if (IsOpen()) { - // May have to close the stream first. - if (type == kOffsetFileInput && impl_->MyType() == kOffsetFileInput) { - // We want to use the same object to Open... this is in case - // the files are the same, so we can just seek. - if (!impl_->Open(rxfilename, file_binary)) { // true is binary mode-- - // always open in binary. - delete impl_; - impl_ = NULL; - return false; - } - // read the binary header, if requested. - if (contents_binary != NULL) - return InitKaldiInputStream(impl_->Stream(), contents_binary); - else - return true; - } else { - Close(); - // and fall through to code below which actually opens the file. - } - } - if (type == kFileInput) { - impl_ = new FileInputImpl(); - } else if (type == kStandardInput) { - impl_ = new StandardInputImpl(); - } else if (type == kPipeInput) { - impl_ = new PipeInputImpl(); - } else if (type == kOffsetFileInput) { - impl_ = new OffsetFileInputImpl(); - } else { // type == kNoInput - KALDI_WARN << "Invalid input filename format " - << PrintableRxfilename(rxfilename); - return false; - } - if (!impl_->Open(rxfilename, file_binary)) { // true is binary mode-- - // always read in binary. 
- delete impl_; - impl_ = NULL; - return false; - } - if (contents_binary != NULL) - return InitKaldiInputStream(impl_->Stream(), contents_binary); - else - return true; -} - -Input::~Input() { - if (impl_) Close(); -} - -std::istream &Input::Stream() { - if (!IsOpen()) KALDI_ERR << "Input::Stream(), not open."; - return impl_->Stream(); -} - -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m) { -// if (!filename.empty() && filename[filename.size() - 1] == ']') { -// // This filename seems to have a 'range'... like foo.ark:4312423[20:30]. -// // (the bit in square brackets is the range). -// std::string rxfilename, range; -// if (!ExtractRangeSpecifier(filename, &rxfilename, &range)) { -// KALDI_ERR << "Could not make sense of possible range specifier in -// filename " -// << "while reading matrix: " << filename; -// } -// Matrix temp; -// bool binary_in; -// Input ki(rxfilename, &binary_in); -// temp.Read(ki.Stream(), binary_in); -// if (!ExtractObjectRange(temp, range, m)) { -// KALDI_ERR << "Error extracting range of object: " << filename; -// } -// } else { -// // The normal case, there is no range. -// bool binary_in; -// Input ki(filename, &binary_in); -// m->Read(ki.Stream(), binary_in); -// } -// } -// -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m) { -// if (!filename.empty() && filename[filename.size() - 1] == ']') { -// // This filename seems to have a 'range'... like foo.ark:4312423[20:30]. -// // (the bit in square brackets is the range). -// std::string rxfilename, range; -// if (!ExtractRangeSpecifier(filename, &rxfilename, &range)) { -// KALDI_ERR << "Could not make sense of possible range specifier in -// filename " -// << "while reading matrix: " << filename; -// } -// Matrix temp; -// bool binary_in; -// Input ki(rxfilename, &binary_in); -// temp.Read(ki.Stream(), binary_in); -// if (!ExtractObjectRange(temp, range, m)) { -// KALDI_ERR << "Error extracting range of object: " << filename; -// } -// } else { -// // The normal case, there is no range. -// bool binary_in; -// Input ki(filename, &binary_in); -// m->Read(ki.Stream(), binary_in); -// } -// } - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/kaldi-io.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/kaldi-io.h deleted file mode 100644 index 2175ca8f89ed5f3e3bade26528e924208df692c6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/kaldi-io.h +++ /dev/null @@ -1,266 +0,0 @@ -// util/kaldi-io.h - -// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
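The Output and Input implementations above (and the kaldi-io.h header that follows) are driven through "extended filenames": plain paths, "-" for standard input/output, a leading or trailing '|' for pipes, and "path:offset" forms. A minimal usage sketch along the lines of the "Typical usage" comments in the header; the gzip/gunzip pipe commands, the /tmp path, and the payload text are illustrative, and it assumes the deleted util/ sources are still built and on the include path:

```cpp
// Minimal sketch: write a line through a pipe wxfilename, then read it back
// through a pipe rxfilename. Paths, commands, and payload are illustrative.
#include <iostream>
#include <string>

#include "util/kaldi-io.h"

int main() {
  {
    // "| command" is a pipe wxfilename; text mode, no Kaldi binary header.
    kaldi::Output ko("| gzip -c > /tmp/kio_demo.gz", /*binary=*/false,
                     /*write_header=*/false);
    ko.Stream() << "hello extended filenames" << '\n';
    if (!ko.Close()) std::cerr << "write failed\n";
  }
  {
    // "command |" is a pipe rxfilename.
    kaldi::Input ki("gunzip -c /tmp/kio_demo.gz |");
    std::string line;
    std::getline(ki.Stream(), line);
    std::cout << line << '\n';
    ki.Close();  // for pipes, returns the command's exit status
  }
  return 0;
}
```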
-#ifndef KALDI_UTIL_KALDI_IO_H_ -#define KALDI_UTIL_KALDI_IO_H_ - -#ifdef _MSC_VER -#include -#include -#endif -#include // For isspace. -#include -#include -#include "base/kaldi-common.h" -// #include "matrix/kaldi-matrix.h" - -namespace kaldi { - -class OutputImplBase; // Forward decl; defined in a .cc file -class InputImplBase; // Forward decl; defined in a .cc file - -/// \addtogroup io_group -/// @{ - -// The Output and Input classes handle stream-opening for "extended" filenames -// that include actual files, standard-input/standard-output, pipes, and -// offsets into actual files. They also handle reading and writing the -// binary-mode headers for Kaldi files, where applicable. The classes have -// versions of the Open routines that throw and do not throw, depending whether -// the calling code wants to catch the errors or not; there are also versions -// that write (or do not write) the Kaldi binary-mode header that says if it's -// binary mode. Generally files that contain Kaldi objects will have the header -// on, so we know upon reading them whether they have the header. So you would -// use the OpenWithHeader routines for these (or the constructor); but other -// types of objects (e.g. FSTs) would have files without a header so you would -// use OpenNoHeader. - -// We now document the types of extended filenames that we use. -// -// A "wxfilename" is an extended filename for writing. It can take three forms: -// (1) Filename: e.g. "/some/filename", "./a/b/c", "c:\Users\dpovey\My -// Documents\\boo" -// (whatever the actual file-system interprets) -// (2) Standard output: "" or "-" -// (3) A pipe: e.g. "| gzip -c > /tmp/abc.gz" -// -// -// A "rxfilename" is an extended filename for reading. It can take four forms: -// (1) An actual filename, whatever the file-system can read, e.g. "/my/file". -// (2) Standard input: "" or "-" -// (3) A pipe: e.g. "gunzip -c /tmp/abc.gz |" -// (4) An offset into a file, e.g.: "/mnt/blah/data/1.ark:24871" -// [these are created by the Table and TableWriter classes; I may also write -// a program that creates them for arbitrary files] -// - -// Typical usage: -// ... -// bool binary; -// MyObject.Write(Output(some_filename, binary).Stream(), binary); -// -// ... more extensive example: -// { -// Output ko(some_filename, binary); -// MyObject1.Write(ko.Stream(), binary); -// MyObject2.Write(ko.Stream(), binary); -// } - -enum OutputType { kNoOutput, kFileOutput, kStandardOutput, kPipeOutput }; - -/// ClassifyWxfilename interprets filenames as follows: -/// - kNoOutput: invalid filenames (leading or trailing space, things that look -/// like wspecifiers and rspecifiers or like pipes to read from with leading -/// |. -/// - kFileOutput: Normal filenames -/// - kStandardOutput: The empty string or "-", interpreted as standard output -/// - kPipeOutput: pipes, e.g. "| gzip -c > /tmp/abc.gz" -OutputType ClassifyWxfilename(const std::string &wxfilename); - -enum InputType { - kNoInput, - kFileInput, - kStandardInput, - kOffsetFileInput, - kPipeInput -}; - -/// ClassifyRxfilenames interprets filenames for reading as follows: -/// - kNoInput: invalid filenames (leading or trailing space, things that -/// look like wspecifiers and rspecifiers or pipes to write to -/// with trailing |. -/// - kFileInput: normal filenames -/// - kStandardInput: the empty string or "-" -/// - kPipeInput: e.g. "gunzip -c /tmp/abc.gz |" -/// - kOffsetFileInput: offsets into files, e.g. 
/some/filename:12970 -InputType ClassifyRxfilename(const std::string &rxfilename); - -class Output { - public: - // The normal constructor, provided for convenience. - // Equivalent to calling with default constructor then Open() - // with these arguments. - Output(const std::string &filename, bool binary, bool write_header = true); - - Output() : impl_(NULL) {} - - /// This opens the stream, with the given mode (binary or text). It returns - /// true on success and false on failure. However, it will throw if something - /// was already open and could not be closed (to avoid this, call Close() - /// first. if write_header == true and binary == true, it writes the Kaldi - /// binary-mode header ('\0' then 'B'). You may call Open even if it is - /// already open; it will close the existing stream and reopen (however if - /// closing the old stream failed it will throw). - bool Open(const std::string &wxfilename, bool binary, bool write_header); - - inline bool IsOpen(); // return true if we have an open stream. Does not - // imply stream is good for writing. - - std::ostream &Stream(); // will throw if not open; else returns stream. - - // Close closes the stream. Calling Close is never necessary unless you - // want to avoid exceptions being thrown. There are times when calling - // Close will hurt efficiency (basically, when using offsets into files, - // and using the same Input object), - // but most of the time the user won't be doing this directly, it will - // be done in kaldi-table.{h, cc}, so you don't have to worry about it. - bool Close(); - - // This will throw if stream could not be closed (to check error status, - // call Close()). - ~Output(); - - private: - OutputImplBase *impl_; // non-NULL if open. - std::string filename_; - KALDI_DISALLOW_COPY_AND_ASSIGN(Output); -}; - -// bool binary_in; -// Input ki(some_filename, &binary_in); -// MyObject.Read(ki.Stream(), binary_in); -// -// ... more extensive example: -// -// { -// bool binary_in; -// Input ki(some_filename, &binary_in); -// MyObject1.Read(ki.Stream(), &binary_in); -// MyObject2.Write(ki.Stream(), &binary_in); -// } -// Note that to catch errors you need to use try.. catch. -// Input communicates errors by throwing exceptions. - -// Input interprets four kinds of filenames: -// (1) Normal filenames -// (2) The empty string or "-", interpreted as standard output -// (3) A pipe: e.g. "gunzip -c /tmp/abc.gz |" -// (4) Offsets into [real] files, e.g. "/my/filename:12049" -// The last one has no correspondence in Output. - -class Input { - public: - /// The normal constructor. Opens the stream in binary mode. - /// Equivalent to calling the default constructor followed by Open(); then, if - /// binary != NULL, it calls ReadHeader(), putting the output in "binary"; it - /// throws on error. - explicit Input(const std::string &rxfilename, bool *contents_binary = NULL); - - Input() : impl_(NULL) {} - - // Open opens the stream for reading (the mode, where relevant, is binary; use - // OpenTextMode for text-mode, we made this a separate function rather than a - // boolean argument, to avoid confusion with Kaldi's text/binary distinction, - // since reading in the file system's text mode is unusual.) If - // contents_binary != NULL, it reads the binary-mode header and puts it in the - // "binary" variable. Returns true on success. If it returns false it will - // not be open. 
You may call Open even if it is already open; it will close - // the existing stream and reopen (however if closing the old stream failed it - // will throw). - inline bool Open(const std::string &rxfilename, bool *contents_binary = NULL); - - // As Open but (if the file system has text/binary modes) opens in text mode; - // you shouldn't ever have to use this as in Kaldi we read even text files in - // binary mode (and ignore the \r). - inline bool OpenTextMode(const std::string &rxfilename); - - // Return true if currently open for reading and Stream() will - // succeed. Does not guarantee that the stream is good. - inline bool IsOpen(); - - // It is never necessary or helpful to call Close, except if - // you are concerned about to many filehandles being open. - // Close does not throw. It returns the exit code as int32 - // in the case of a pipe [kPipeInput], and always zero otherwise. - int32 Close(); - - // Returns the underlying stream. Throws if !IsOpen() - std::istream &Stream(); - - // Destructor does not throw: input streams may legitimately fail so we - // don't worry about the status when we close them. - ~Input(); - - private: - bool OpenInternal(const std::string &rxfilename, bool file_binary, - bool *contents_binary); - InputImplBase *impl_; - KALDI_DISALLOW_COPY_AND_ASSIGN(Input); -}; - -template -void ReadKaldiObject(const std::string &filename, C *c) { - bool binary_in; - Input ki(filename, &binary_in); - c->Read(ki.Stream(), binary_in); -} - -// Specialize the template for reading matrices, because we want to be able to -// support reading 'ranges' (row and column ranges), like foo.mat[10:20]. -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m); -// -// -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m); - -template -inline void WriteKaldiObject(const C &c, const std::string &filename, - bool binary) { - Output ko(filename, binary); - c.Write(ko.Stream(), binary); -} - -/// PrintableRxfilename turns the rxfilename into a more human-readable -/// form for error reporting, i.e. it does quoting and escaping and -/// replaces "" or "-" with "standard input". -std::string PrintableRxfilename(const std::string &rxfilename); - -/// PrintableWxfilename turns the wxfilename into a more human-readable -/// form for error reporting, i.e. it does quoting and escaping and -/// replaces "" or "-" with "standard output". -std::string PrintableWxfilename(const std::string &wxfilename); - -/// @} - -} // end namespace kaldi. - -#include "util/kaldi-io-inl.h" - -#endif // KALDI_UTIL_KALDI_IO_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/kaldi-pipebuf.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/kaldi-pipebuf.h deleted file mode 100644 index bcee80ccb1a6fa8ce3195483ac144c5ff66d2f89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/kaldi-pipebuf.h +++ /dev/null @@ -1,86 +0,0 @@ -// util/kaldi-pipebuf.h - -// Copyright 2009-2011 Ondrej Glembek - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -/** @file kaldi-pipebuf.h - * This is an Kaldi C++ Library header. - */ - -#ifndef KALDI_UTIL_KALDI_PIPEBUF_H_ -#define KALDI_UTIL_KALDI_PIPEBUF_H_ - -#include -#if !defined(_LIBCPP_VERSION) // libc++ -#include -#else -#include "util/basic-filebuf.h" -#endif - -namespace kaldi { -// This class provides a way to initialize a filebuf with a FILE* pointer -// directly; it will not close the file pointer when it is deleted. -// The C++ standard does not allow implementations of C++ to provide -// this constructor within basic_filebuf, which makes it hard to deal -// with pipes using completely native C++. This is a workaround - -#ifdef _MSC_VER -#elif defined(_LIBCPP_VERSION) // libc++ -template > -class basic_pipebuf : public basic_filebuf { - public: - typedef basic_pipebuf ThisType; - - public: - basic_pipebuf(FILE *fptr, std::ios_base::openmode mode) - : basic_filebuf() { - this->open(fptr, mode); - if (!this->is_open()) { - KALDI_WARN << "Error initializing pipebuf"; // probably indicates - // code error, if the fptr was good. - return; - } - } -}; // class basic_pipebuf -#else -template > -class basic_pipebuf : public std::basic_filebuf { - public: - typedef basic_pipebuf ThisType; - - public: - basic_pipebuf(FILE *fptr, std::ios_base::openmode mode) - : std::basic_filebuf() { - this->_M_file.sys_open(fptr, mode); - if (!this->is_open()) { - KALDI_WARN << "Error initializing pipebuf"; // probably indicates - // code error, if the fptr was good. - return; - } - this->_M_mode = mode; - this->_M_buf_size = BUFSIZ; - this->_M_allocate_internal_buffer(); - this->_M_reading = false; - this->_M_writing = false; - this->_M_set_buffer(-1); - } -}; // class basic_pipebuf -#endif // _MSC_VER - -} // namespace kaldi - -#endif // KALDI_UTIL_KALDI_PIPEBUF_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/parse-options.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/parse-options.cc deleted file mode 100644 index 1f2ef844d28d67ed58d2e0c9d7c7b674e8209df8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/parse-options.cc +++ /dev/null @@ -1,636 +0,0 @@ -// util/parse-options.cc - -// Copyright 2009-2011 Karel Vesely; Microsoft Corporation; -// Saarland University (Author: Arnab Ghoshal); -// Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey); -// Frantisek Skala; Arnab Ghoshal -// Copyright 2013 Tanel Alumae -// -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
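basic_pipebuf exists so that a FILE* returned by popen() can be wrapped in a std::istream without the buffer's destructor closing the underlying pipe; that is exactly how PipeInputImpl in the deleted kaldi-io.cc uses it. A POSIX-only sketch of the same pattern; the gunzip command and path are illustrative, and util/kaldi-pipebuf.h is assumed to be on the include path:

```cpp
// POSIX-only sketch: wrap a popen()'d FILE* in a std::istream via
// basic_pipebuf, mirroring PipeInputImpl. Command and path are illustrative.
#include <cstdio>
#include <iostream>
#include <string>

#include "util/kaldi-pipebuf.h"

int main() {
  FILE *f = popen("gunzip -c /tmp/abc.gz", "r");
  if (f == NULL) return 1;

  {
    // The constructor does not take ownership of f, so the buffer's
    // destructor leaves the pipe open for pclose() below.
    kaldi::basic_pipebuf<char> pb(f, std::ios_base::in);
    std::istream is(&pb);
    std::string line;
    while (std::getline(is, line)) std::cout << line << '\n';
  }

  return pclose(f) == 0 ? 0 : 1;
}
```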
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include - -#include "base/kaldi-common.h" -#include "util/parse-options.h" -#include "util/text-utils.h" - -namespace kaldi { - -ParseOptions::ParseOptions(const std::string &prefix, OptionsItf *other) - : print_args_(false), help_(false), usage_(""), argc_(0), argv_(NULL) { - ParseOptions *po = dynamic_cast(other); - if (po != NULL && po->other_parser_ != NULL) { - // we get here if this constructor is used twice, recursively. - other_parser_ = po->other_parser_; - } else { - other_parser_ = other; - } - if (po != NULL && po->prefix_ != "") { - prefix_ = po->prefix_ + std::string(".") + prefix; - } else { - prefix_ = prefix; - } -} - -void ParseOptions::Register(const std::string &name, bool *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, int32 *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, uint32 *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, float *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, double *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, std::string *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -// old-style, used for registering application-specific parameters -template -void ParseOptions::RegisterTmpl(const std::string &name, T *ptr, - const std::string &doc) { - if (other_parser_ == NULL) { - this->RegisterCommon(name, ptr, doc, false); - } else { - KALDI_ASSERT(prefix_ != "" && - "Cannot use empty prefix when registering with prefix."); - std::string new_name = prefix_ + '.' + name; // name becomes prefix.name - other_parser_->Register(new_name, ptr, doc); - } -} - -// does the common part of the job of registering a parameter -template -void ParseOptions::RegisterCommon(const std::string &name, T *ptr, - const std::string &doc, bool is_standard) { - KALDI_ASSERT(ptr != NULL); - std::string idx = name; - NormalizeArgName(&idx); - if (doc_map_.find(idx) != doc_map_.end()) - KALDI_WARN << "Registering option twice, ignoring second time: " << name; - this->RegisterSpecific(name, idx, ptr, doc, is_standard); -} - -// used to register standard parameters (those that are present in all of the -// applications) -template -void ParseOptions::RegisterStandard(const std::string &name, T *ptr, - const std::string &doc) { - this->RegisterCommon(name, ptr, doc, true); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, bool *b, - const std::string &doc, bool is_standard) { - bool_map_[idx] = b; - doc_map_[idx] = - DocInfo(name, doc + " (bool, default = " + ((*b) ? 
"true)" : "false)"), - is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, int32 *i, - const std::string &doc, bool is_standard) { - int_map_[idx] = i; - std::ostringstream ss; - ss << doc << " (int, default = " << *i << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, uint32 *u, - const std::string &doc, bool is_standard) { - uint_map_[idx] = u; - std::ostringstream ss; - ss << doc << " (uint, default = " << *u << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, float *f, - const std::string &doc, bool is_standard) { - float_map_[idx] = f; - std::ostringstream ss; - ss << doc << " (float, default = " << *f << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, double *f, - const std::string &doc, bool is_standard) { - double_map_[idx] = f; - std::ostringstream ss; - ss << doc << " (double, default = " << *f << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, std::string *s, - const std::string &doc, bool is_standard) { - string_map_[idx] = s; - doc_map_[idx] = - DocInfo(name, doc + " (string, default = \"" + *s + "\")", is_standard); -} -void ParseOptions::DisableOption(const std::string &name) { - if (argv_ != NULL) - KALDI_ERR << "DisableOption must not be called after calling Read()."; - if (doc_map_.erase(name) == 0) - KALDI_ERR << "Option " << name - << " was not registered so cannot be disabled: "; - bool_map_.erase(name); - int_map_.erase(name); - uint_map_.erase(name); - float_map_.erase(name); - double_map_.erase(name); - string_map_.erase(name); -} - -int ParseOptions::NumArgs() const { return positional_args_.size(); } - -std::string ParseOptions::GetArg(int i) const { - // use KALDI_ERR if code error - if (i < 1 || i > static_cast(positional_args_.size())) - KALDI_ERR << "ParseOptions::GetArg, invalid index " << i; - return positional_args_[i - 1]; -} - -// We currently do not support any other options. -enum ShellType { kBash = 0 }; - -// This can be changed in the code if it ever does need to be changed (as it's -// unlikely that one compilation of this tool-set would use both shells). -static ShellType kShellType = kBash; - -// Returns true if we need to escape a string before putting it into -// a shell (mainly thinking of bash shell, but should work for others) -// This is for the convenience of the user so command-lines that are -// printed out by ParseOptions::Read (with --print-args=true) are -// paste-able into the shell and will run. If you use a different type of -// shell, it might be necessary to change this function. -// But it's mostly a cosmetic issue as it basically affects how -// the program echoes its command-line arguments to the screen. -static bool MustBeQuoted(const std::string &str, ShellType st) { - // Only Bash is supported (for the moment). - KALDI_ASSERT(st == kBash && "Invalid shell type."); - - const char *c = str.c_str(); - if (*c == '\0') { - return true; // Must quote empty string - } else { - const char *ok_chars[2]; - - // These seem not to be interpreted as long as there are no other "bad" - // characters involved (e.g. 
"," would be interpreted as part of something - // like a{b,c}, but not on its own. - ok_chars[kBash] = "[]~#^_-+=:.,/"; - - // Just want to make sure that a space character doesn't get automatically - // inserted here via an automated style-checking script, like it did before. - KALDI_ASSERT(!strchr(ok_chars[kBash], ' ')); - - for (; *c != '\0'; c++) { - // For non-alphanumeric characters we have a list of characters which - // are OK. All others are forbidden (this is easier since the shell - // interprets most non-alphanumeric characters). - if (!isalnum(*c)) { - const char *d; - for (d = ok_chars[st]; *d != '\0'; d++) - if (*c == *d) break; - // If not alphanumeric or one of the "ok_chars", it must be escaped. - if (*d == '\0') return true; - } - } - return false; // The string was OK. No quoting or escaping. - } -} - -// Returns a quoted and escaped version of "str" -// which has previously been determined to need escaping. -// Our aim is to print out the command line in such a way that if it's -// pasted into a shell of ShellType "st" (only bash for now), it -// will get passed to the program in the same way. -static std::string QuoteAndEscape(const std::string &str, ShellType st) { - // Only Bash is supported (for the moment). - KALDI_ASSERT(st == kBash && "Invalid shell type."); - - // For now we use the following rules: - // In the normal case, we quote with single-quote "'", and to escape - // a single-quote we use the string: '\'' (interpreted as closing the - // single-quote, putting an escaped single-quote from the shell, and - // then reopening the single quote). - char quote_char = '\''; - const char *escape_str = "'\\''"; // e.g. echo 'a'\''b' returns a'b - - // If the string contains single-quotes that would need escaping this - // way, and we determine that the string could be safely double-quoted - // without requiring any escaping, then we double-quote the string. - // This is the case if the characters "`$\ do not appear in the string. - // e.g. see http://www.redhat.com/mirrors/LDP/LDP/abs/html/quotingvar.html - const char *c_str = str.c_str(); - if (strchr(c_str, '\'') && !strpbrk(c_str, "\"`$\\")) { - quote_char = '"'; - escape_str = "\\\""; // should never be accessed. - } - - char buf[2]; - buf[1] = '\0'; - - buf[0] = quote_char; - std::string ans = buf; - const char *c = str.c_str(); - for (; *c != '\0'; c++) { - if (*c == quote_char) { - ans += escape_str; - } else { - buf[0] = *c; - ans += buf; - } - } - buf[0] = quote_char; - ans += buf; - return ans; -} - -// static function -std::string ParseOptions::Escape(const std::string &str) { - return MustBeQuoted(str, kShellType) ? QuoteAndEscape(str, kShellType) : str; -} - -int ParseOptions::Read(int argc, const char *const argv[]) { - argc_ = argc; - argv_ = argv; - std::string key, value; - int i; - if (argc > 0) { - // set global "const char*" g_program_name (name of the program) - // so it can be printed out in error messages; - // it's useful because often the stderr of different programs will - // be mixed together in the same log file. -#ifdef _MSC_VER - const char *c = strrchr(argv[0], '\\'); -#else - const char *c = strrchr(argv[0], '/'); -#endif - SetProgramName(c == NULL ? 
argv[0] : c + 1); - } - // first pass: look for config parameter, look for priority - for (i = 1; i < argc; i++) { - if (std::strncmp(argv[i], "--", 2) == 0) { - if (std::strcmp(argv[i], "--") == 0) { - // a lone "--" marks the end of named options - break; - } - bool has_equal_sign; - SplitLongArg(argv[i], &key, &value, &has_equal_sign); - NormalizeArgName(&key); - Trim(&value); - if (key.compare("config") == 0) { - ReadConfigFile(value); - } - if (key.compare("help") == 0) { - PrintUsage(); - exit(0); - } - } - } - bool double_dash_seen = false; - // second pass: add the command line options - for (i = 1; i < argc; i++) { - if (std::strncmp(argv[i], "--", 2) == 0) { - if (std::strcmp(argv[i], "--") == 0) { - // A lone "--" marks the end of named options. - // Skip that option and break the processing of named options - i += 1; - double_dash_seen = true; - break; - } - bool has_equal_sign; - SplitLongArg(argv[i], &key, &value, &has_equal_sign); - NormalizeArgName(&key); - Trim(&value); - if (!SetOption(key, value, has_equal_sign)) { - PrintUsage(true); - KALDI_ERR << "Invalid option " << argv[i]; - } - } else { - break; - } - } - - // process remaining arguments as positional - for (; i < argc; i++) { - if ((std::strcmp(argv[i], "--") == 0) && !double_dash_seen) { - double_dash_seen = true; - } else { - positional_args_.push_back(std::string(argv[i])); - } - } - - // if the user did not suppress this with --print-args = false.... - if (print_args_) { - std::ostringstream strm; - for (int j = 0; j < argc; j++) strm << Escape(argv[j]) << " "; - strm << '\n'; - std::cerr << strm.str() << std::flush; - } - return i; -} - -void ParseOptions::PrintUsage(bool print_command_line) { - std::cerr << '\n' << usage_ << '\n'; - DocMapType::iterator it; - // first we print application-specific options - bool app_specific_header_printed = false; - for (it = doc_map_.begin(); it != doc_map_.end(); ++it) { - if (it->second.is_standard_ == false) { // application-specific option - if (app_specific_header_printed == false) { // header was not yet printed - std::cerr << "Options:" << '\n'; - app_specific_header_printed = true; - } - std::cerr << " --" << std::setw(25) << std::left << it->second.name_ - << " : " << it->second.use_msg_ << '\n'; - } - } - if (app_specific_header_printed == true) { - std::cerr << '\n'; - } - - // then the standard options - std::cerr << "Standard options:" << '\n'; - for (it = doc_map_.begin(); it != doc_map_.end(); ++it) { - if (it->second.is_standard_ == true) { // we have standard option - std::cerr << " --" << std::setw(25) << std::left << it->second.name_ - << " : " << it->second.use_msg_ << '\n'; - } - } - std::cerr << '\n'; - if (print_command_line) { - std::ostringstream strm; - strm << "Command line was: "; - for (int j = 0; j < argc_; j++) strm << Escape(argv_[j]) << " "; - strm << '\n'; - std::cerr << strm.str() << std::flush; - } -} - -void ParseOptions::PrintConfig(std::ostream &os) { - os << '\n' << "[[ Configuration of UI-Registered options ]]" << '\n'; - std::string key; - DocMapType::iterator it; - for (it = doc_map_.begin(); it != doc_map_.end(); ++it) { - key = it->first; - os << it->second.name_ << " = "; - if (bool_map_.end() != bool_map_.find(key)) { - os << (*bool_map_[key] ? 
"true" : "false"); - } else if (int_map_.end() != int_map_.find(key)) { - os << (*int_map_[key]); - } else if (uint_map_.end() != uint_map_.find(key)) { - os << (*uint_map_[key]); - } else if (float_map_.end() != float_map_.find(key)) { - os << (*float_map_[key]); - } else if (double_map_.end() != double_map_.find(key)) { - os << (*double_map_[key]); - } else if (string_map_.end() != string_map_.find(key)) { - os << "'" << *string_map_[key] << "'"; - } else { - KALDI_ERR << "PrintConfig: unrecognized option " << key << "[code error]"; - } - os << '\n'; - } - os << '\n'; -} - -void ParseOptions::ReadConfigFile(const std::string &filename) { - std::ifstream is(filename.c_str(), std::ifstream::in); - if (!is.good()) { - KALDI_ERR << "Cannot open config file: " << filename; - } - - std::string line, key, value; - int32 line_number = 0; - while (std::getline(is, line)) { - line_number++; - // trim out the comments - size_t pos; - if ((pos = line.find_first_of('#')) != std::string::npos) { - line.erase(pos); - } - // skip empty lines - Trim(&line); - if (line.length() == 0) continue; - - if (line.substr(0, 2) != "--") { - KALDI_ERR << "Reading config file " << filename << ": line " - << line_number << " does not look like a line " - << "from a Kaldi command-line program's config file: should " - << "be of the form --x=y. Note: config files intended to " - << "be sourced by shell scripts lack the '--'."; - } - - // parse option - bool has_equal_sign; - SplitLongArg(line, &key, &value, &has_equal_sign); - NormalizeArgName(&key); - Trim(&value); - if (!SetOption(key, value, has_equal_sign)) { - PrintUsage(true); - KALDI_ERR << "Invalid option " << line << " in config file " << filename; - } - } -} - -void ParseOptions::SplitLongArg(const std::string &in, std::string *key, - std::string *value, bool *has_equal_sign) { - KALDI_ASSERT(in.substr(0, 2) == "--"); // precondition. - size_t pos = in.find_first_of('=', 0); - if (pos == std::string::npos) { // we allow --option for bools - // defaults to empty. We handle this differently in different cases. - *key = in.substr(2, in.size() - 2); // 2 because starts with --. - *value = ""; - *has_equal_sign = false; - } else if (pos == 2) { // we also don't allow empty keys: --=value - PrintUsage(true); - KALDI_ERR << "Invalid option (no key): " << in; - } else { // normal case: --option=value - *key = in.substr(2, pos - 2); // 2 because starts with --. 
- *value = in.substr(pos + 1); - *has_equal_sign = true; - } -} - -void ParseOptions::NormalizeArgName(std::string *str) { - std::string out; - std::string::iterator it; - - for (it = str->begin(); it != str->end(); ++it) { - if (*it == '_') - out += '-'; // convert _ to - - else - out += std::tolower(*it); - } - *str = out; - - KALDI_ASSERT(str->length() > 0); -} - -bool ParseOptions::SetOption(const std::string &key, const std::string &value, - bool has_equal_sign) { - if (bool_map_.end() != bool_map_.find(key)) { - if (has_equal_sign && value == "") - KALDI_ERR << "Invalid option --" << key << "="; - *(bool_map_[key]) = ToBool(value); - } else if (int_map_.end() != int_map_.find(key)) { - *(int_map_[key]) = ToInt(value); - } else if (uint_map_.end() != uint_map_.find(key)) { - *(uint_map_[key]) = ToUint(value); - } else if (float_map_.end() != float_map_.find(key)) { - *(float_map_[key]) = ToFloat(value); - } else if (double_map_.end() != double_map_.find(key)) { - *(double_map_[key]) = ToDouble(value); - } else if (string_map_.end() != string_map_.find(key)) { - if (!has_equal_sign) - KALDI_ERR << "Invalid option --" << key << " (option format is --x=y)."; - *(string_map_[key]) = value; - } else { - return false; - } - return true; -} - -bool ParseOptions::ToBool(std::string str) { - std::transform(str.begin(), str.end(), str.begin(), ::tolower); - - // allow "" as a valid option for "true", so that --x is the same as --x=true - if ((str.compare("true") == 0) || (str.compare("t") == 0) || - (str.compare("1") == 0) || (str.compare("") == 0)) { - return true; - } - if ((str.compare("false") == 0) || (str.compare("f") == 0) || - (str.compare("0") == 0)) { - return false; - } - // if it is neither true nor false: - PrintUsage(true); - KALDI_ERR << "Invalid format for boolean argument [expected true or false]: " - << str; - return false; // never reached -} - -int32 ParseOptions::ToInt(const std::string &str) { - int32 ret; - if (!ConvertStringToInteger(str, &ret)) - KALDI_ERR << "Invalid integer option \"" << str << "\""; - return ret; -} - -uint32 ParseOptions::ToUint(const std::string &str) { - uint32 ret; - if (!ConvertStringToInteger(str, &ret)) - KALDI_ERR << "Invalid integer option \"" << str << "\""; - return ret; -} - -float ParseOptions::ToFloat(const std::string &str) { - float ret; - if (!ConvertStringToReal(str, &ret)) - KALDI_ERR << "Invalid floating-point option \"" << str << "\""; - return ret; -} - -double ParseOptions::ToDouble(const std::string &str) { - double ret; - if (!ConvertStringToReal(str, &ret)) - KALDI_ERR << "Invalid floating-point option \"" << str << "\""; - return ret; -} - -// instantiate templates -template void ParseOptions::RegisterTmpl(const std::string &name, bool *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, int32 *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, uint32 *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, float *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, double *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, - std::string *ptr, - const std::string &doc); - -template void ParseOptions::RegisterStandard(const std::string &name, bool *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - int32 *ptr, - const std::string &doc); 
-template void ParseOptions::RegisterStandard(const std::string &name, - uint32 *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - float *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - double *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - std::string *ptr, - const std::string &doc); - -template void ParseOptions::RegisterCommon(const std::string &name, bool *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, int32 *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, uint32 *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, float *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, double *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, - std::string *ptr, - const std::string &doc, - bool is_standard); - -} // namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/parse-options.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/parse-options.h deleted file mode 100644 index 93a060f4a411dfd63298a91bb313e0b66d337a75..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/parse-options.h +++ /dev/null @@ -1,265 +0,0 @@ -// util/parse-options.h - -// Copyright 2009-2011 Karel Vesely; Microsoft Corporation; -// Saarland University (Author: Arnab Ghoshal); -// Copyright 2012-2013 Frantisek Skala; Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_PARSE_OPTIONS_H_ -#define KALDI_UTIL_PARSE_OPTIONS_H_ - -#include -#include -#include - -#include "base/kaldi-common.h" -#include "itf/options-itf.h" - -namespace kaldi { - -/// The class ParseOptions is for parsing command-line options; see -/// \ref parse_options for more documentation. -class ParseOptions : public OptionsItf { - public: - explicit ParseOptions(const char *usage) - : print_args_(true), - help_(false), - usage_(usage), - argc_(0), - argv_(NULL), - prefix_(""), - other_parser_(NULL) { -#if !defined(_MSC_VER) && \ - !defined(__CYGWIN__) // This is just a convenient place to set the stderr - // to line - setlinebuf(stderr); // buffering mode, since it's called at program start. -#endif // This helps ensure different programs' output is not mixed up. 
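The parse-options.cc just removed, together with the parse-options.h header that follows, implements Kaldi's standard command-line handling: register typed options, then Read() parses --key=value pairs (plus the built-in --config, --help and --print-args) and leaves the rest as positional arguments. A minimal driver sketch; the program name, option names, defaults and usage string are illustrative only:

```cpp
// Illustrative ParseOptions driver; option names and defaults are made up.
#include <string>

#include "util/parse-options.h"

int main(int argc, char *argv[]) {
  const char *usage =
      "Demo of ParseOptions.\n"
      "Usage: demo [options] <input-rxfilename> <output-wxfilename>\n";
  kaldi::ParseOptions po(usage);

  bool binary = true;
  kaldi::int32 beam = 10;
  std::string note;
  po.Register("binary", &binary, "Write output in binary mode");
  po.Register("beam", &beam, "Beam width (illustrative option)");
  po.Register("note", &note, "Free-form note (illustrative option)");

  po.Read(argc, argv);  // also handles --config=<file>, --help, --print-args

  if (po.NumArgs() != 2) {
    po.PrintUsage();
    return 1;
  }
  const std::string input_rxfilename = po.GetArg(1);
  const std::string output_wxfilename = po.GetArg(2);
  // ... use the options and positional arguments here ...
  return 0;
}
```

Invoked as, for example, `demo --beam=8 --binary=false in.ark out.ark`, which matches the --x=y form that SplitLongArg() and SetOption() above expect.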
- RegisterStandard("config", &config_, - "Configuration file to read (this " - "option may be repeated)"); - RegisterStandard("print-args", &print_args_, - "Print the command line arguments (to stderr)"); - RegisterStandard("help", &help_, "Print out usage message"); - RegisterStandard("verbose", &g_kaldi_verbose_level, - "Verbose level (higher->more logging)"); - } - - /** - This is a constructor for the special case where some options are - registered with a prefix to avoid conflicts. The object thus created will - only be used temporarily to register an options class with the original - options parser (which is passed as the *other pointer) using the given - prefix. It should not be used for any other purpose, and the prefix must - not be the empty string. It seems to be the least bad way of implementing - options with prefixes at this point. - Example of usage is: - ParseOptions po; // original ParseOptions object - ParseOptions po_mfcc("mfcc", &po); // object with prefix. - MfccOptions mfcc_opts; - mfcc_opts.Register(&po_mfcc); - The options will now get registered as, e.g., --mfcc.frame-shift=10.0 - instead of just --frame-shift=10.0 - */ - ParseOptions(const std::string &prefix, OptionsItf *other); - - ~ParseOptions() {} - - // Methods from the interface - void Register(const std::string &name, bool *ptr, const std::string &doc); - void Register(const std::string &name, int32 *ptr, const std::string &doc); - void Register(const std::string &name, uint32 *ptr, const std::string &doc); - void Register(const std::string &name, float *ptr, const std::string &doc); - void Register(const std::string &name, double *ptr, const std::string &doc); - void Register(const std::string &name, std::string *ptr, - const std::string &doc); - - /// If called after registering an option and before calling - /// Read(), disables that option from being used. Will crash - /// at runtime if that option had not been registered. - void DisableOption(const std::string &name); - - /// This one is used for registering standard parameters of all the programs - template - void RegisterStandard(const std::string &name, T *ptr, - const std::string &doc); - - /** - Parses the command line options and fills the ParseOptions-registered - variables. This must be called after all the variables were registered!!! - - Initially the variables have implicit values, - then the config file values are set-up, - finally the command line values given. - Returns the first position in argv that was not used. - [typically not useful: use NumParams() and GetParam(). ] - */ - int Read(int argc, const char *const *argv); - - /// Prints the usage documentation [provided in the constructor]. - void PrintUsage(bool print_command_line = false); - /// Prints the actual configuration of all the registered variables - void PrintConfig(std::ostream &os); - - /// Reads the options values from a config file. Must be called after - /// registering all options. This is usually used internally after the - /// standard --config option is used, but it may also be called from a - /// program. - void ReadConfigFile(const std::string &filename); - - /// Number of positional parameters (c.f. argc-1). - int NumArgs() const; - - /// Returns one of the positional parameters; 1-based indexing for argc/argv - /// compatibility. Will crash if param is not >=1 and <=NumArgs(). - std::string GetArg(int param) const; - - std::string GetOptArg(int param) const { - return (param <= NumArgs() ? 
GetArg(param) : ""); - } - - /// The following function will return a possibly quoted and escaped - /// version of "str", according to the current shell. Currently - /// this is just hardwired to bash. It's useful for debug output. - static std::string Escape(const std::string &str); - - private: - /// Template to register various variable types, - /// used for program-specific parameters - template - void RegisterTmpl(const std::string &name, T *ptr, const std::string &doc); - - // Following functions do just the datatype-specific part of the job - /// Register boolean variable - void RegisterSpecific(const std::string &name, const std::string &idx, - bool *b, const std::string &doc, bool is_standard); - /// Register int32 variable - void RegisterSpecific(const std::string &name, const std::string &idx, - int32 *i, const std::string &doc, bool is_standard); - /// Register unsinged int32 variable - void RegisterSpecific(const std::string &name, const std::string &idx, - uint32 *u, const std::string &doc, bool is_standard); - /// Register float variable - void RegisterSpecific(const std::string &name, const std::string &idx, - float *f, const std::string &doc, bool is_standard); - /// Register double variable [useful as we change BaseFloat type]. - void RegisterSpecific(const std::string &name, const std::string &idx, - double *f, const std::string &doc, bool is_standard); - /// Register string variable - void RegisterSpecific(const std::string &name, const std::string &idx, - std::string *s, const std::string &doc, - bool is_standard); - - /// Does the actual job for both kinds of parameters - /// Does the common part of the job for all datatypes, - /// then calls RegisterSpecific - template - void RegisterCommon(const std::string &name, T *ptr, const std::string &doc, - bool is_standard); - - /// Set option with name "key" to "value"; will crash if can't do it. - /// "has_equal_sign" is used to allow --x for a boolean option x, - /// and --y=, for a string option y. - bool SetOption(const std::string &key, const std::string &value, - bool has_equal_sign); - - bool ToBool(std::string str); - int32 ToInt(const std::string &str); - uint32 ToUint(const std::string &str); - float ToFloat(const std::string &str); - double ToDouble(const std::string &str); - - // maps for option variables - std::map bool_map_; - std::map int_map_; - std::map uint_map_; - std::map float_map_; - std::map double_map_; - std::map string_map_; - - /** - Structure for options' documentation - */ - struct DocInfo { - DocInfo() {} - DocInfo(const std::string &name, const std::string &usemsg) - : name_(name), use_msg_(usemsg), is_standard_(false) {} - DocInfo(const std::string &name, const std::string &usemsg, - bool is_standard) - : name_(name), use_msg_(usemsg), is_standard_(is_standard) {} - - std::string name_; - std::string use_msg_; - bool is_standard_; - }; - typedef std::map DocMapType; - DocMapType doc_map_; ///< map for the documentation - - bool print_args_; ///< variable for the implicit --print-args parameter - bool help_; ///< variable for the implicit --help parameter - std::string config_; ///< variable for the implicit --config parameter - std::vector positional_args_; - const char *usage_; - int argc_; - const char *const *argv_; - - /// These members are not normally used. 
They are only used when the object - /// is constructed with a prefix - std::string prefix_; - OptionsItf *other_parser_; - - protected: - /// SplitLongArg parses an argument of the form --a=b, --a=, or --a, - /// and sets "has_equal_sign" to true if an equals-sign was parsed.. - /// this is needed in order to correctly allow --x for a boolean option - /// x, and --y= for a string option y, and to disallow --x= and --y. - void SplitLongArg(const std::string &in, std::string *key, std::string *value, - bool *has_equal_sign); - - void NormalizeArgName(std::string *str); -}; - -/// This template is provided for convenience in reading config classes from -/// files; this is not the standard way to read configuration options, but may -/// occasionally be needed. This function assumes the config has a function -/// "void Register(OptionsItf *opts)" which it can call to register the -/// ParseOptions object. -template -void ReadConfigFromFile(const std::string &config_filename, C *c) { - std::ostringstream usage_str; - usage_str << "Parsing config from " - << "from '" << config_filename << "'"; - ParseOptions po(usage_str.str().c_str()); - c->Register(&po); - po.ReadConfigFile(config_filename); -} - -/// This variant of the template ReadConfigFromFile is for if you need to read -/// two config classes from the same file. -template -void ReadConfigsFromFile(const std::string &conf, C1 *c1, C2 *c2) { - std::ostringstream usage_str; - usage_str << "Parsing config from " - << "from '" << conf << "'"; - ParseOptions po(usage_str.str().c_str()); - c1->Register(&po); - c2->Register(&po); - po.ReadConfigFile(conf); -} - -} // namespace kaldi - -#endif // KALDI_UTIL_PARSE_OPTIONS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/simple-io-funcs.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/simple-io-funcs.cc deleted file mode 100644 index 5ace601b6a2bb186dec78b0b25cb5a3227c48bc9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/simple-io-funcs.cc +++ /dev/null @@ -1,80 +0,0 @@ -// util/simple-io-funcs.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. -#include "util/simple-io-funcs.h" -#include "util/text-utils.h" - -namespace kaldi { - -bool WriteIntegerVectorSimple(const std::string &wxfilename, - const std::vector &list) { - kaldi::Output ko; - // false, false is: text-mode, no Kaldi header. 
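ReadConfigFile() accepts one --key=value option per line, with '#' comments and blank lines ignored, and the ReadConfigFromFile() template at the end of this header wires that into any options struct exposing Register(OptionsItf*). A sketch under those assumptions; the struct, its option names, and demo.conf are all illustrative:

```cpp
// Illustrative options struct populated from a --key=value config file.
//
// demo.conf might contain:
//   # frame options
//   --frame-shift=15
//   --apply-cmvn=false
#include <string>

#include "util/parse-options.h"

struct DemoOptions {
  kaldi::int32 frame_shift = 10;
  bool apply_cmvn = true;

  void Register(kaldi::OptionsItf *opts) {
    opts->Register("frame-shift", &frame_shift, "Frame shift in ms (illustrative)");
    opts->Register("apply-cmvn", &apply_cmvn, "Apply CMVN (illustrative)");
  }
};

int main() {
  DemoOptions opts;
  // Throws (via KALDI_ERR) if a line is not of the --x=y form.
  kaldi::ReadConfigFromFile("demo.conf", &opts);
  return opts.apply_cmvn ? 0 : 1;
}
```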
- if (!ko.Open(wxfilename, false, false)) return false; - for (size_t i = 0; i < list.size(); i++) ko.Stream() << list[i] << '\n'; - return ko.Close(); -} - -bool ReadIntegerVectorSimple(const std::string &rxfilename, - std::vector *list) { - kaldi::Input ki; - if (!ki.OpenTextMode(rxfilename)) return false; - std::istream &is = ki.Stream(); - int32 i; - list->clear(); - while (!(is >> i).fail()) list->push_back(i); - is >> std::ws; - return is.eof(); // should be eof, or junk at end of file. -} - -bool WriteIntegerVectorVectorSimple( - const std::string &wxfilename, - const std::vector > &list) { - kaldi::Output ko; - // false, false is: text-mode, no Kaldi header. - if (!ko.Open(wxfilename, false, false)) return false; - std::ostream &os = ko.Stream(); - for (size_t i = 0; i < list.size(); i++) { - for (size_t j = 0; j < list[i].size(); j++) { - os << list[i][j]; - if (j + 1 < list[i].size()) os << ' '; - } - os << '\n'; - } - return ko.Close(); -} - -bool ReadIntegerVectorVectorSimple(const std::string &rxfilename, - std::vector > *list) { - kaldi::Input ki; - if (!ki.OpenTextMode(rxfilename)) return false; - std::istream &is = ki.Stream(); - list->clear(); - std::string line; - while (std::getline(is, line)) { - std::vector v; - if (!SplitStringToIntegers(line, " \t\r", true, &v)) { - list->clear(); - return false; - } - list->push_back(v); - } - return is.eof(); // if we're not at EOF, something weird happened. -} - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/simple-io-funcs.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/simple-io-funcs.h deleted file mode 100644 index 1ead12790ba9bd6a44ccdff855918270191b8ebd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/simple-io-funcs.h +++ /dev/null @@ -1,61 +0,0 @@ -// util/simple-io-funcs.h - -// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. -#ifndef KALDI_UTIL_SIMPLE_IO_FUNCS_H_ -#define KALDI_UTIL_SIMPLE_IO_FUNCS_H_ - -#include -#include -#include "util/kaldi-io.h" - -// This header contains some utilities for reading some common, simple text -// formats:integers in files, one per line, and integers in files, possibly -// multiple per line. these are not really fully native Kaldi formats; they are -// mostly for small files that might be generated by scripts, and can be read -// all at one time. for longer files of this type, we would probably use the -// Table code. - -namespace kaldi { - -/// WriteToList attempts to write this list of integers, one per line, -/// to the given file, in text format. -/// returns true if succeeded. 
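The four helpers in simple-io-funcs are thin wrappers over Output/Input for small text files of integers, one value (or one whitespace-separated list) per line. A round-trip sketch; the /tmp path and the sample values are illustrative:

```cpp
// Write a vector of int32 one-per-line, then read it back.
#include <iostream>
#include <vector>

#include "util/simple-io-funcs.h"

int main() {
  const std::vector<kaldi::int32> ids = {3, 1, 4, 1, 5};
  if (!kaldi::WriteIntegerVectorSimple("/tmp/ids.txt", ids)) return 1;

  std::vector<kaldi::int32> read_back;
  if (!kaldi::ReadIntegerVectorSimple("/tmp/ids.txt", &read_back)) return 1;

  std::cout << "read " << read_back.size() << " integers\n";
  return 0;
}
```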
-bool WriteIntegerVectorSimple(const std::string &wxfilename, - const std::vector &v); - -/// ReadFromList attempts to read this list of integers, one per line, -/// from the given file, in text format. -/// returns true if succeeded. -bool ReadIntegerVectorSimple(const std::string &rxfilename, - std::vector *v); - -// This is a file format like: -// 1 2 -// 3 -// -// 4 5 6 -// etc. -bool WriteIntegerVectorVectorSimple(const std::string &wxfilename, - const std::vector > &v); - -bool ReadIntegerVectorVectorSimple(const std::string &rxfilename, - std::vector > *v); - -} // end namespace kaldi. - -#endif // KALDI_UTIL_SIMPLE_IO_FUNCS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/stl-utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/stl-utils.h deleted file mode 100644 index 8a29cd582c77b3078277aa9713b8676032bbc5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/stl-utils.h +++ /dev/null @@ -1,310 +0,0 @@ -// util/stl-utils.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_STL_UTILS_H_ -#define KALDI_UTIL_STL_UTILS_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -using std::unordered_map; -using std::unordered_set; - -#include "base/kaldi-common.h" - -namespace kaldi { - -/// Sorts and uniq's (removes duplicates) from a vector. -template -inline void SortAndUniq(std::vector *vec) { - std::sort(vec->begin(), vec->end()); - vec->erase(std::unique(vec->begin(), vec->end()), vec->end()); -} - -/// Returns true if the vector is sorted. -template -inline bool IsSorted(const std::vector &vec) { - typename std::vector::const_iterator iter = vec.begin(), end = vec.end(); - if (iter == end) return true; - while (1) { - typename std::vector::const_iterator next_iter = iter; - ++next_iter; - if (next_iter == end) return true; // end of loop and nothing out of order - if (*next_iter < *iter) return false; - iter = next_iter; - } -} - -/// Returns true if the vector is sorted and contains each element -/// only once. -template -inline bool IsSortedAndUniq(const std::vector &vec) { - typename std::vector::const_iterator iter = vec.begin(), end = vec.end(); - if (iter == end) return true; - while (1) { - typename std::vector::const_iterator next_iter = iter; - ++next_iter; - if (next_iter == end) return true; // end of loop and nothing out of order - if (*next_iter <= *iter) return false; - iter = next_iter; - } -} - -/// Removes duplicate elements from a sorted list. -template -inline void Uniq(std::vector *vec) { // must be already sorted. 
- KALDI_PARANOID_ASSERT(IsSorted(*vec)); - KALDI_ASSERT(vec); - vec->erase(std::unique(vec->begin(), vec->end()), vec->end()); -} - -/// Copies the elements of a set to a vector. -template -void CopySetToVector(const std::set &s, std::vector *v) { - // copies members of s into v, in sorted order from lowest to highest - // (because the set was in sorted order). - KALDI_ASSERT(v != NULL); - v->resize(s.size()); - typename std::set::const_iterator siter = s.begin(), send = s.end(); - typename std::vector::iterator viter = v->begin(); - for (; siter != send; ++siter, ++viter) { - *viter = *siter; - } -} - -template -void CopySetToVector(const unordered_set &s, std::vector *v) { - KALDI_ASSERT(v != NULL); - v->resize(s.size()); - typename unordered_set::const_iterator siter = s.begin(), send = s.end(); - typename std::vector::iterator viter = v->begin(); - for (; siter != send; ++siter, ++viter) { - *viter = *siter; - } -} - -/// Copies the (key, value) pairs in a map to a vector of pairs. -template -void CopyMapToVector(const std::map &m, - std::vector > *v) { - KALDI_ASSERT(v != NULL); - v->resize(m.size()); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - typename std::vector >::iterator viter = v->begin(); - for (; miter != mend; ++miter, ++viter) { - *viter = std::make_pair(miter->first, miter->second); - // do it like this because of const casting. - } -} - -/// Copies the keys in a map to a vector. -template -void CopyMapKeysToVector(const std::map &m, std::vector *v) { - KALDI_ASSERT(v != NULL); - v->resize(m.size()); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - typename std::vector::iterator viter = v->begin(); - for (; miter != mend; ++miter, ++viter) { - *viter = miter->first; - } -} - -/// Copies the values in a map to a vector. -template -void CopyMapValuesToVector(const std::map &m, std::vector *v) { - KALDI_ASSERT(v != NULL); - v->resize(m.size()); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - typename std::vector::iterator viter = v->begin(); - for (; miter != mend; ++miter, ++viter) { - *viter = miter->second; - } -} - -/// Copies the keys in a map to a set. -template -void CopyMapKeysToSet(const std::map &m, std::set *s) { - KALDI_ASSERT(s != NULL); - s->clear(); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - for (; miter != mend; ++miter) { - s->insert(s->end(), miter->first); - } -} - -/// Copies the values in a map to a set. -template -void CopyMapValuesToSet(const std::map &m, std::set *s) { - KALDI_ASSERT(s != NULL); - s->clear(); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - for (; miter != mend; ++miter) s->insert(s->end(), miter->second); -} - -/// Copies the contents of a vector to a set. -template -void CopyVectorToSet(const std::vector &v, std::set *s) { - KALDI_ASSERT(s != NULL); - s->clear(); - typename std::vector::const_iterator iter = v.begin(), end = v.end(); - for (; iter != end; ++iter) s->insert(s->end(), *iter); - // s->end() is a hint in case v was sorted. will work regardless. -} - -/// Deletes any non-NULL pointers in the vector v, and sets -/// the corresponding entries of v to NULL -template -void DeletePointers(std::vector *v) { - KALDI_ASSERT(v != NULL); - typename std::vector::iterator iter = v->begin(), end = v->end(); - for (; iter != end; ++iter) { - if (*iter != NULL) { - delete *iter; - *iter = NULL; // set to NULL for extra safety. - } - } -} - -/// Returns true if the vector of pointers contains NULL pointers. 
-template -bool ContainsNullPointers(const std::vector &v) { - typename std::vector::const_iterator iter = v.begin(), end = v.end(); - for (; iter != end; ++iter) - if (*iter == static_cast(NULL)) return true; - return false; -} - -/// Copies the contents a vector of one type to a vector -/// of another type. -template -void CopyVectorToVector(const std::vector &vec_in, std::vector *vec_out) { - KALDI_ASSERT(vec_out != NULL); - vec_out->resize(vec_in.size()); - for (size_t i = 0; i < vec_in.size(); i++) - (*vec_out)[i] = static_cast(vec_in[i]); -} - -/// A hashing function-object for vectors. -template -struct VectorHasher { // hashing function for vector. - size_t operator()(const std::vector &x) const noexcept { - size_t ans = 0; - typename std::vector::const_iterator iter = x.begin(), end = x.end(); - for (; iter != end; ++iter) { - ans *= kPrime; - ans += *iter; - } - return ans; - } - VectorHasher() { // Check we're instantiated with an integer type. - KALDI_ASSERT_IS_INTEGER_TYPE(Int); - } - - private: - static const int kPrime = 7853; -}; - -/// A hashing function-object for pairs of ints -template -struct PairHasher { // hashing function for pair - size_t operator()(const std::pair &x) const noexcept { - // 7853 was chosen at random from a list of primes. - return x.first + x.second * 7853; - } - PairHasher() { // Check we're instantiated with an integer type. - KALDI_ASSERT_IS_INTEGER_TYPE(Int1); - KALDI_ASSERT_IS_INTEGER_TYPE(Int2); - } -}; - -/// A hashing function object for strings. -struct StringHasher { // hashing function for std::string - size_t operator()(const std::string &str) const noexcept { - size_t ans = 0, len = str.length(); - const char *c = str.c_str(), *end = c + len; - for (; c != end; c++) { - ans *= kPrime; - ans += *c; - } - return ans; - } - - private: - static const int kPrime = 7853; -}; - -/// Reverses the contents of a vector. -template -inline void ReverseVector(std::vector *vec) { - KALDI_ASSERT(vec != NULL); - size_t sz = vec->size(); - for (size_t i = 0; i < sz / 2; i++) std::swap((*vec)[i], (*vec)[sz - 1 - i]); -} - -/// Comparator object for pairs that compares only the first pair. -template -struct CompareFirstMemberOfPair { - inline bool operator()(const std::pair &p1, const std::pair &p2) { - return p1.first < p2.first; - } -}; - -/// For a vector of pair where I is an integer and F a floating-point or -/// integer type, this function sorts a vector of type vector > on -/// the I value and then merges elements with equal I values, summing these over -/// the F component and then removing any F component with zero value. This -/// is for where the vector of pairs represents a map from the integer to float -/// component, with an "adding" type of semantics for combining the elements. -template -inline void MergePairVectorSumming(std::vector > *vec) { - KALDI_ASSERT_IS_INTEGER_TYPE(I); - CompareFirstMemberOfPair c; - std::sort(vec->begin(), vec->end(), c); // sort on 1st element. - typename std::vector >::iterator out = vec->begin(), - in = vec->begin(), - end = vec->end(); - // special case: while there is nothing to be changed, skip over - // initial input (avoids unnecessary copying). - while (in + 1 < end && in[0].first != in[1].first && in[0].second != 0.0) { - in++; - out++; - } - while (in < end) { - // We reach this point only at the first element of - // each stretch of identical .first elements. - *out = *in; - ++in; - while (in < end && in->first == out->first) { - out->second += in->second; // this is the merge operation. 
- ++in; - } - if (out->second != static_cast(0)) // Don't keep zero elements. - out++; - } - vec->erase(out, end); -} - -} // namespace kaldi - -#endif // KALDI_UTIL_STL_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/text-utils.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/text-utils.cc deleted file mode 100644 index fd70889644f6b4e14793ddd4f5b0d71a66768699..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/text-utils.cc +++ /dev/null @@ -1,580 +0,0 @@ -// util/text-utils.cc - -// Copyright 2009-2011 Saarland University; Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "util/text-utils.h" - -#include -#include -#include -#include - -#include "base/kaldi-common.h" - -namespace kaldi { - -template -bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, // typically false - std::vector *out) { - KALDI_ASSERT(out != NULL); - if (*(full.c_str()) == '\0') { - out->clear(); - return true; - } - std::vector split; - SplitStringToVector(full, delim, omit_empty_strings, &split); - out->resize(split.size()); - for (size_t i = 0; i < split.size(); i++) { - F f = 0; - if (!ConvertStringToReal(split[i], &f)) return false; - (*out)[i] = f; - } - return true; -} - -// Instantiate the template above for float and double. 
-template bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out); -template bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out); - -void SplitStringToVector(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out) { - size_t start = 0, found = 0, end = full.size(); - out->clear(); - while (found != std::string::npos) { - found = full.find_first_of(delim, start); - // start != end condition is for when the delimiter is at the end - if (!omit_empty_strings || (found != start && start != end)) - out->push_back(full.substr(start, found - start)); - start = found + 1; - } -} - -void JoinVectorToString(const std::vector &vec_in, - const char *delim, bool omit_empty_strings, - std::string *str_out) { - std::string tmp_str; - for (size_t i = 0; i < vec_in.size(); i++) { - if (!omit_empty_strings || !vec_in[i].empty()) { - tmp_str.append(vec_in[i]); - if (i < vec_in.size() - 1) - if (!omit_empty_strings || !vec_in[i + 1].empty()) - tmp_str.append(delim); - } - } - str_out->swap(tmp_str); -} - -void Trim(std::string *str) { - const char *white_chars = " \t\n\r\f\v"; - - std::string::size_type pos = str->find_last_not_of(white_chars); - if (pos != std::string::npos) { - str->erase(pos + 1); - pos = str->find_first_not_of(white_chars); - if (pos != std::string::npos) str->erase(0, pos); - } else { - str->erase(str->begin(), str->end()); - } -} - -bool IsToken(const std::string &token) { - size_t l = token.length(); - if (l == 0) return false; - for (size_t i = 0; i < l; i++) { - unsigned char c = token[i]; - if ((!isprint(c) || isspace(c)) && (isascii(c) || c == (unsigned char)255)) - return false; - // The "&& (isascii(c) || c == 255)" was added so that we won't reject - // non-ASCII characters such as French characters with accents [except for - // 255 which is "nbsp", a form of space]. - } - return true; -} - -void SplitStringOnFirstSpace(const std::string &str, std::string *first, - std::string *rest) { - const char *white_chars = " \t\n\r\f\v"; - typedef std::string::size_type I; - const I npos = std::string::npos; - I first_nonwhite = str.find_first_not_of(white_chars); - if (first_nonwhite == npos) { - first->clear(); - rest->clear(); - return; - } - // next_white is first whitespace after first nonwhitespace. - I next_white = str.find_first_of(white_chars, first_nonwhite); - - if (next_white == npos) { // no more whitespace... - *first = std::string(str, first_nonwhite); - rest->clear(); - return; - } - I next_nonwhite = str.find_first_not_of(white_chars, next_white); - if (next_nonwhite == npos) { - *first = std::string(str, first_nonwhite, next_white - first_nonwhite); - rest->clear(); - return; - } - - I last_nonwhite = str.find_last_not_of(white_chars); - KALDI_ASSERT(last_nonwhite != npos); // or coding error. 
- - *first = std::string(str, first_nonwhite, next_white - first_nonwhite); - *rest = std::string(str, next_nonwhite, last_nonwhite + 1 - next_nonwhite); -} - -bool IsLine(const std::string &line) { - if (line.find('\n') != std::string::npos) return false; - if (line.empty()) return true; - if (isspace(*(line.begin()))) return false; - if (isspace(*(line.rbegin()))) return false; - std::string::const_iterator iter = line.begin(), end = line.end(); - for (; iter != end; iter++) - if (!isprint(*iter)) return false; - return true; -} - -template -class NumberIstream { - public: - explicit NumberIstream(std::istream &i) : in_(i) {} - - NumberIstream &operator>>(T &x) { - if (!in_.good()) return *this; - in_ >> x; - if (!in_.fail() && RemainderIsOnlySpaces()) return *this; - return ParseOnFail(&x); - } - - private: - std::istream &in_; - - bool RemainderIsOnlySpaces() { - if (in_.tellg() != std::istream::pos_type(-1)) { - std::string rem; - in_ >> rem; - - if (rem.find_first_not_of(' ') != std::string::npos) { - // there is not only spaces - return false; - } - } - - in_.clear(); - return true; - } - - NumberIstream &ParseOnFail(T *x) { - std::string str; - in_.clear(); - in_.seekg(0); - // If the stream is broken even before trying - // to read from it or if there are many tokens, - // it's pointless to try. - if (!(in_ >> str) || !RemainderIsOnlySpaces()) { - in_.setstate(std::ios_base::failbit); - return *this; - } - - std::map inf_nan_map; - // we'll keep just uppercase values. - inf_nan_map["INF"] = std::numeric_limits::infinity(); - inf_nan_map["+INF"] = std::numeric_limits::infinity(); - inf_nan_map["-INF"] = -std::numeric_limits::infinity(); - inf_nan_map["INFINITY"] = std::numeric_limits::infinity(); - inf_nan_map["+INFINITY"] = std::numeric_limits::infinity(); - inf_nan_map["-INFINITY"] = -std::numeric_limits::infinity(); - inf_nan_map["NAN"] = std::numeric_limits::quiet_NaN(); - inf_nan_map["+NAN"] = std::numeric_limits::quiet_NaN(); - inf_nan_map["-NAN"] = -std::numeric_limits::quiet_NaN(); - // MSVC - inf_nan_map["1.#INF"] = std::numeric_limits::infinity(); - inf_nan_map["-1.#INF"] = -std::numeric_limits::infinity(); - inf_nan_map["1.#QNAN"] = std::numeric_limits::quiet_NaN(); - inf_nan_map["-1.#QNAN"] = -std::numeric_limits::quiet_NaN(); - - std::transform(str.begin(), str.end(), str.begin(), ::toupper); - - if (inf_nan_map.find(str) != inf_nan_map.end()) { - *x = inf_nan_map[str]; - } else { - in_.setstate(std::ios_base::failbit); - } - - return *this; - } -}; - -template -bool ConvertStringToReal(const std::string &str, T *out) { - std::istringstream iss(str); - - NumberIstream i(iss); - - i >> *out; - - if (iss.fail()) { - // Number conversion failed. - return false; - } - - return true; -} - -template bool ConvertStringToReal(const std::string &str, float *out); -template bool ConvertStringToReal(const std::string &str, double *out); - -/* - This function is a helper function of StringsApproxEqual. It should be - thought of as a recursive function-- it was designed that way-- but rather - than actually recursing (which would cause problems with stack overflow), we - just set the args and return to the start. - - The 'decimal_places_tolerance' argument is just passed in from outside, - see the documentation for StringsApproxEqual in text-utils.h to see an - explanation. The argument 'places_into_number' provides some information - about the strings 'a' and 'b' that precedes the current pointers. 
- For purposes of this comment, let's define the 'decimal' of a number - as the part that comes after the decimal point, e.g. in '99.123', - '123' would be the decimal. If 'places_into_number' is -1, it means - we're not currently inside some place like that (i.e. it's not the - case that we're pointing to the '1' or the '2' or the '3'). - If it's 0, then we'd be pointing to the first place after the decimal, - '1' in this case. Note if one of the numbers is shorter than the - other, like '99.123' versus '99.1234' and 'a' points to the first '3' - while 'b' points to the second '4', 'places_into_number' referes to the - shorter of the two, i.e. it would be 2 in this example. - - - */ -bool StringsApproxEqualInternal(const char *a, const char *b, - int32 decimal_places_tolerance, - int32 places_into_number) { -start: - char ca = *a, cb = *b; - if (ca == cb) { - if (ca == '\0') { - return true; - } else { - if (places_into_number >= 0) { - if (isdigit(ca)) { - places_into_number++; - } else { - places_into_number = -1; - } - } else { - if (ca == '.') { - places_into_number = 0; - } - } - a++; - b++; - goto start; - } - } else { - if (places_into_number >= decimal_places_tolerance && - (isdigit(ca) || isdigit(cb))) { - // we're potentially willing to accept this difference between the - // strings. - if (isdigit(ca)) a++; - if (isdigit(cb)) b++; - // we'll have advanced at least one of the two strings. - goto start; - } else if (places_into_number >= 0 && - ((ca == '0' && !isdigit(cb)) || (cb == '0' && !isdigit(ca)))) { - // this clause is designed to ensure that, for example, - // "0.1" would count the same as "0.100001". - if (ca == '0') - a++; - else - b++; - places_into_number++; - goto start; - } else { - return false; - } - } -} - -bool StringsApproxEqual(const std::string &a, const std::string &b, - int32 decimal_places_tolerance) { - return StringsApproxEqualInternal(a.c_str(), b.c_str(), - decimal_places_tolerance, -1); -} - -bool ConfigLine::ParseLine(const std::string &line) { - data_.clear(); - whole_line_ = line; - if (line.size() == 0) return false; // Empty line - size_t pos = 0, size = line.size(); - while (isspace(line[pos]) && pos < size) pos++; - if (pos == size) return false; // whitespace-only line - size_t first_token_start_pos = pos; - // first get first_token_. - while (!isspace(line[pos]) && pos < size) { - if (line[pos] == '=') { - // If the first block of non-whitespace looks like "foo-bar=...", - // then we ignore it: there is no initial token, and FirstToken() - // is empty. - pos = first_token_start_pos; - break; - } - pos++; - } - first_token_ = - std::string(line, first_token_start_pos, pos - first_token_start_pos); - // first_token_ is expected to be either empty or something like - // "component-node", which actually is a slightly more restrictive set of - // strings than IsValidName() checks for this is a convenient way to check it. - if (!first_token_.empty() && !IsValidName(first_token_)) return false; - - while (pos < size) { - if (isspace(line[pos])) { - pos++; - continue; - } - - // OK, at this point we know that we are pointing at nonspace. - size_t next_equals_sign = line.find_first_of("=", pos); - if (next_equals_sign == pos || next_equals_sign == std::string::npos) { - // we're looking for something like 'key=value'. If there is no equals - // sign, or it's not preceded by something, it's a parsing failure. - return false; - } - std::string key(line, pos, next_equals_sign - pos); - if (!IsValidName(key)) return false; - - // handle any quotes. 
we support key='blah blah' or key="foo bar". - // no escaping is supported. - if (line[next_equals_sign + 1] == '\'' || - line[next_equals_sign + 1] == '"') { - char my_quote = line[next_equals_sign + 1]; - size_t next_quote = line.find_first_of(my_quote, next_equals_sign + 2); - if (next_quote == std::string::npos) { // no matching quote was found. - KALDI_WARN << "No matching quote for " << my_quote - << " in config line '" << line << "'"; - return false; - } else { - std::string value(line, next_equals_sign + 2, - next_quote - next_equals_sign - 2); - data_.insert(std::make_pair(key, std::make_pair(value, false))); - pos = next_quote + 1; - continue; - } - } else { - // we want to be able to parse something like "... input=Offset(a, -1) - // foo=bar": in general, config values with spaces in them, even without - // quoting. - - size_t next_next_equals_sign = - line.find_first_of("=", next_equals_sign + 1), - terminating_space = size; - - if (next_next_equals_sign != - std::string::npos) { // found a later equals sign. - size_t preceding_space = - line.find_last_of(" \t", next_next_equals_sign); - if (preceding_space != std::string::npos && - preceding_space > next_equals_sign) - terminating_space = preceding_space; - } - while (isspace(line[terminating_space - 1]) && terminating_space > 0) - terminating_space--; - - std::string value(line, next_equals_sign + 1, - terminating_space - (next_equals_sign + 1)); - data_.insert(std::make_pair(key, std::make_pair(value, false))); - pos = terminating_space; - } - } - return true; -} - -bool ConfigLine::GetValue(const std::string &key, std::string *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - *value = (it->second).first; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, BaseFloat *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!ConvertStringToReal((it->second).first, value)) return false; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, int32 *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!ConvertStringToInteger((it->second).first, value)) return false; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, std::vector *value) { - KALDI_ASSERT(value != NULL); - value->clear(); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!SplitStringToIntegers((it->second).first, ":,", true, value)) { - // KALDI_WARN << "Bad option " << (it->second).first; - return false; - } - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, bool *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if ((it->second).first.size() == 0) return false; - switch (((it->second).first)[0]) { - case 'F': - case 'f': - *value = false; - break; - case 'T': - case 't': - *value = true; - break; - default: - return false; - } - (it->second).second = true; - return true; - } - } - return false; -} - -bool 
ConfigLine::HasUnusedValues() const { - std::map >::const_iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (!(it->second).second) return true; - } - return false; -} - -std::string ConfigLine::UnusedValues() const { - std::string unused_str; - std::map >::const_iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (!(it->second).second) { - if (unused_str == "") - unused_str = it->first + "=" + (it->second).first; - else - unused_str += " " + it->first + "=" + (it->second).first; - } - } - return unused_str; -} - -// This is like ExpectToken but for two tokens, and it -// will either accept token1 and then token2, or just token2. -// This is useful in Read functions where the first token -// may already have been consumed. -// void ExpectOneOrTwoTokens(std::istream &is, bool binary, -// const std::string &token1, -// const std::string &token2) { -// KALDI_ASSERT(token1 != token2); -// std::string temp; -// ReadToken(is, binary, &temp); -// if (temp == token1) { -// ExpectToken(is, binary, token2); -// } else { -// if (temp != token2) { -// KALDI_ERR << "Expecting token " << token1 << " or " << token2 -// << " but got " << temp; -// } -// } -// } - -bool IsValidName(const std::string &name) { - if (name.size() == 0) return false; - for (size_t i = 0; i < name.size(); i++) { - if (i == 0 && !isalpha(name[i]) && name[i] != '_') return false; - if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-' && name[i] != '.') - return false; - } - return true; -} - -void ReadConfigLines(std::istream &is, std::vector *lines) { - KALDI_ASSERT(lines != NULL); - std::string line; - while (std::getline(is, line)) { - if (line.size() == 0) continue; - size_t start = line.find_first_not_of(" \t"); - size_t end = line.find_first_of('#'); - if (start == std::string::npos || start == end) continue; - end = line.find_last_not_of(" \t", end - 1); - KALDI_ASSERT(end >= start); - lines->push_back(line.substr(start, end - start + 1)); - } -} - -void ParseConfigLines(const std::vector &lines, - std::vector *config_lines) { - config_lines->resize(lines.size()); - for (size_t i = 0; i < lines.size(); i++) { - bool ret = (*config_lines)[i].ParseLine(lines[i]); - if (!ret) { - KALDI_ERR << "Error parsing config line: " << lines[i]; - } - } -} - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/text-utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/text-utils.h deleted file mode 100644 index bc7763c4aff38214d97cbeda3b29c8717dd65318..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/kaldi/util/text-utils.h +++ /dev/null @@ -1,264 +0,0 @@ -// util/text-utils.h - -// Copyright 2009-2011 Saarland University; Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef KALDI_UTIL_TEXT_UTILS_H_ -#define KALDI_UTIL_TEXT_UTILS_H_ - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "base/kaldi-common.h" - -namespace kaldi { - -/// Split a string using any of the single character delimiters. -/// If omit_empty_strings == true, the output will contain any -/// nonempty strings after splitting on any of the -/// characters in the delimiter. If omit_empty_strings == false, -/// the output will contain n+1 strings if there are n characters -/// in the set "delim" within the input string. In this case -/// the empty string is split to a single empty string. -void SplitStringToVector(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out); - -/// Joins the elements of a vector of strings into a single string using -/// "delim" as the delimiter. If omit_empty_strings == true, any empty strings -/// in the vector are skipped. A vector of empty strings results in an empty -/// string on the output. -void JoinVectorToString(const std::vector &vec_in, - const char *delim, bool omit_empty_strings, - std::string *str_out); - -/** - \brief Split a string (e.g. 1:2:3) into a vector of integers. - - \param [in] delim String containing a list of characters, any of which - is allowed as a delimiter. - \param [in] omit_empty_strings If true, empty strings between delimiters are - allowed and will not produce an output integer; if false, - instances of characters in 'delim' that are consecutive or - at the start or end of the string would be an error. - You'll normally want this to be true if 'delim' consists - of spaces, and false otherwise. - \param [out] out The output list of integers. -*/ -template -bool SplitStringToIntegers(const std::string &full, const char *delim, - bool omit_empty_strings, // typically false [but - // should probably be true - // if "delim" is spaces]. - std::vector *out) { - KALDI_ASSERT(out != NULL); - KALDI_ASSERT_IS_INTEGER_TYPE(I); - if (*(full.c_str()) == '\0') { - out->clear(); - return true; - } - std::vector split; - SplitStringToVector(full, delim, omit_empty_strings, &split); - out->resize(split.size()); - for (size_t i = 0; i < split.size(); i++) { - const char *this_str = split[i].c_str(); - char *end = NULL; - int64 j = 0; - j = KALDI_STRTOLL(this_str, &end); - if (end == this_str || *end != '\0') { - out->clear(); - return false; - } else { - I jI = static_cast(j); - if (static_cast(jI) != j) { - // output type cannot fit this integer. - out->clear(); - return false; - } - (*out)[i] = jI; - } - } - return true; -} - -// This is defined for F = float and double. -template -bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, // typically false - std::vector *out); - -/// Converts a string into an integer via strtoll and returns false if there was -/// any kind of problem (i.e. the string was not an integer or contained extra -/// non-whitespace junk, or the integer was too large to fit into the type it is -/// being converted into). Only sets *out if everything was OK and it returns -/// true. 
-template -bool ConvertStringToInteger(const std::string &str, Int *out) { - KALDI_ASSERT_IS_INTEGER_TYPE(Int); - const char *this_str = str.c_str(); - char *end = NULL; - errno = 0; - int64 i = KALDI_STRTOLL(this_str, &end); - if (end != this_str) - while (isspace(*end)) end++; - if (end == this_str || *end != '\0' || errno != 0) return false; - Int iInt = static_cast(i); - if (static_cast(iInt) != i || - (i < 0 && !std::numeric_limits::is_signed)) { - return false; - } - *out = iInt; - return true; -} - -/// ConvertStringToReal converts a string into either float or double -/// and returns false if there was any kind of problem (i.e. the string -/// was not a floating point number or contained extra non-whitespace junk). -/// Be careful- this function will successfully read inf's or nan's. -template -bool ConvertStringToReal(const std::string &str, T *out); - -/// Removes the beginning and trailing whitespaces from a string -void Trim(std::string *str); - -/// Removes leading and trailing white space from the string, then splits on the -/// first section of whitespace found (if present), putting the part before the -/// whitespace in "first" and the rest in "rest". If there is no such space, -/// everything that remains after removing leading and trailing whitespace goes -/// in "first". -void SplitStringOnFirstSpace(const std::string &line, std::string *first, - std::string *rest); - -/// Returns true if "token" is nonempty, and all characters are -/// printable and whitespace-free. -bool IsToken(const std::string &token); - -/// Returns true if "line" is free of \n characters and unprintable -/// characters, and does not contain leading or trailing whitespace. -bool IsLine(const std::string &line); - -/** - This function returns true when two text strings are approximately equal, and - false when they are not. The definition of 'equal' is normal string - equality, except that two substrings like "0.31134" and "0.311341" would be - considered equal. 'decimal_places_tolerance' controls how many digits after - the '.' have to match up. - E.g. StringsApproxEqual("hello 0.23 there", "hello 0.24 there", 2) would - return false because there is a difference in the 2nd decimal, but with - an argument of 1 it would return true. - */ -bool StringsApproxEqual(const std::string &a, const std::string &b, - int32 decimal_places_check = 2); - -/** - This class is responsible for parsing input like - hi-there xx=yyy a=b c empty= f-oo=Append(bar, sss) ba_z=123 bing='a b c' - baz="a b c d='a b' e" and giving you access to the fields, in this case - - FirstToken() == "hi-there", and key->value pairs: - - xx->yyy, a->"b c", empty->"", f-oo->"Append(bar, sss)", ba_z->"123", - bing->"a b c", baz->"a b c d='a b' e" - - The first token is optional, if the line started with a key-value pair then - FirstValue() will be empty. - - Note: it can parse value fields with space inside them only if they are free - of the '=' character. If values are going to contain the '=' character, you - need to quote them with either single or double quotes. - - Key values may contain -_a-zA-Z0-9, but must begin with a-zA-Z_. - */ -class ConfigLine { - public: - // Tries to parse the line as a config-file line. Returns false - // if it could not for some reason, e.g. parsing failure. In most cases - // prints no warnings; the user should do this. Does not expect comments. - bool ParseLine(const std::string &line); - - // the GetValue functions are overloaded for various types. 
They return true - // if the key exists with value that can be converted to that type, and false - // otherwise. They also mark the key-value pair as having been read. It is - // not an error to read values twice. - bool GetValue(const std::string &key, std::string *value); - bool GetValue(const std::string &key, BaseFloat *value); - bool GetValue(const std::string &key, int32 *value); - // Values may be separated by ":" or by ",". - bool GetValue(const std::string &key, std::vector *value); - bool GetValue(const std::string &key, bool *value); - - bool HasUnusedValues() const; - /// returns e.g. foo=bar xxx=yyy if foo and xxx were not consumed by one - /// of the GetValue() functions. - std::string UnusedValues() const; - - const std::string &FirstToken() const { return first_token_; } - - const std::string WholeLine() { return whole_line_; } - // use default assignment operator and copy constructor. - private: - std::string whole_line_; - // the first token of the line, e.g. if line is - // foo-bar baz=bing - // then first_token_ would be "foo-bar". - std::string first_token_; - - // data_ maps from key to (value, is-this-value-consumed?). - std::map > data_; -}; - -/// This function is like ExpectToken but for two tokens, and it will either -/// accept token1 and then token2, or just token2. This is useful in Read -/// functions where the first token may already have been consumed. -void ExpectOneOrTwoTokens(std::istream &is, bool binary, - const std::string &token1, const std::string &token2); - -/** - This function reads in a config file and *appends* its contents to a vector - of lines; it is responsible for removing comments (anything after '#') and - stripping out any lines that contain only whitespace after comment removal. - */ -void ReadConfigLines(std::istream &is, std::vector *lines); - -/** - This function converts config-lines from a simple sequence of strings - as output by ReadConfigLines(), into a sequence of first-tokens and - name-value pairs. The general format is: - "command-type bar=baz xx=yyy" - etc., although there are subtleties as to what exactly is allowed, see - documentation for class ConfigLine for details. - This function will die if there was a parsing failure. - */ -void ParseConfigLines(const std::vector &lines, - std::vector *config_lines); - -/// Returns true if 'name' would be a valid name for a component or node in a -/// nnet3Nnet. This is a nonempty string beginning with A-Za-z_, and containing -/// only -/// '-', '_', '.', A-Z, a-z, or 0-9. 
-bool IsValidName(const std::string &name); - -} // namespace kaldi - -#endif // KALDI_UTIL_TEXT_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/patch/CPPLINT.cfg b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/patch/CPPLINT.cfg deleted file mode 100644 index 51ff339c18435a6c3a3be03131080d7b8ab8de86..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/patch/CPPLINT.cfg +++ /dev/null @@ -1 +0,0 @@ -exclude_files=.* diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/patch/openfst/src/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/patch/openfst/src/CMakeLists.txt deleted file mode 100644 index 04051ef5ae46c04a40c1ffccc98c37fa594ad13e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/patch/openfst/src/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ - -#-DHAVE_CONFIG_H -I./../include -fno-exceptions -funsigned-char -std=c++11 -MT symbol-table.lo -MD -MP -MF .deps/symbol-table.Tpo -c symbol-table.cc -fno-common -DPIC -o .libs/symbol-table.o - -include_directories(./include/) -install(DIRECTORY include/ DESTINATION include/ - FILES_MATCHING PATTERN "*.h") - -add_subdirectory(lib) - -if(HAVE_SCRIPT) - add_subdirectory(script) -endif(HAVE_SCRIPT) - -if(HAVE_BIN) - add_subdirectory(bin) -endif(HAVE_BIN) - -add_subdirectory(extensions) - -if(BUILD_TESTING) - enable_testing() - add_subdirectory(test) -endif(BUILD_TESTING) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/patch/openfst/src/extensions/special/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/patch/openfst/src/extensions/special/CMakeLists.txt deleted file mode 100644 index 9c71b750a72ffe3c2dafde657273361c3dbae409..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/patch/openfst/src/extensions/special/CMakeLists.txt +++ /dev/null @@ -1,67 +0,0 @@ -file(GLOB HEADER_FILES ../../include/fst/extensions/special/*.h) -message(STATUS "${HEADER_FILES}") - -if(HAVE_BIN) - add_executable(fstspecial-bin - ../../bin/fstconvert.cc - ../../bin/fstconvert-main.cc - phi-fst.cc - rho-fst.cc - sigma-fst.cc - ) - - set_target_properties(fstspecial-bin PROPERTIES - FOLDER special/bin - OUTPUT_NAME fstspecial - ) - - target_link_libraries(fstspecial-bin - fstscript - fst - ${CMAKE_DL_LIBS} - ) -endif(HAVE_BIN) - - -add_library(fstspecial - phi-fst.cc - rho-fst.cc - sigma-fst.cc - ${HEADER_FILES} -) - -set_target_properties(fstspecial PROPERTIES - SOVERSION "${SOVERSION}" - FOLDER special -) -target_link_libraries(fstspecial - fst -) - -set(FST_SPECIAL_INSTALL_TARGETS fstspecial) -if(HAVE_BIN) - list(APPEND FST_SPECIAL_INSTALL_TARGETS fstspecial-bin) -endif() - -install(TARGETS ${FST_SPECIAL_INSTALL_TARGETS} - LIBRARY DESTINATION lib - RUNTIME DESTINATION bin - ARCHIVE DESTINATION lib -) - -function (add_module _name) - add_library(${ARGV}) - if (TARGET ${_name}) - target_link_libraries(${_name} fst) - set_target_properties(${_name} - PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true - FOLDER special/modules - ) - endif() - - install(TARGETS ${_name} LIBRARY DESTINATION lib/fst) -endfunction() - -add_module(phi-fst MODULE phi-fst.cc) -add_module(rho-fst MODULE rho-fst.cc) -add_module(sigma-fst MODULE sigma-fst.cc) diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/patch/openfst/src/include/fst/flags.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/patch/openfst/src/include/fst/flags.h deleted file mode 100644 index b5ec8ff7416774a0612ae0fe7e008a630b289dd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/patch/openfst/src/include/fst/flags.h +++ /dev/null @@ -1,228 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// See www.openfst.org for extensive documentation on this weighted -// finite-state transducer library. -// -// Google-style flag handling declarations and inline definitions. - -#ifndef FST_LIB_FLAGS_H_ -#define FST_LIB_FLAGS_H_ - -#include - -#include -#include -#include -#include -#include - -#include -#include - -#include "gflags/gflags.h" -#include "glog/logging.h" - -using std::string; - -// FLAGS USAGE: -// -// Definition example: -// -// DEFINE_int32(length, 0, "length"); -// -// This defines variable FLAGS_length, initialized to 0. -// -// Declaration example: -// -// DECLARE_int32(length); -// -// SET_FLAGS() can be used to set flags from the command line -// using, for example, '--length=2'. -// -// ShowUsage() can be used to print out command and flag usage. - -// #define DECLARE_bool(name) extern bool FLAGS_ ## name -// #define DECLARE_string(name) extern string FLAGS_ ## name -// #define DECLARE_int32(name) extern int32 FLAGS_ ## name -// #define DECLARE_int64(name) extern int64 FLAGS_ ## name -// #define DECLARE_double(name) extern double FLAGS_ ## name - -template -struct FlagDescription { - FlagDescription(T *addr, const char *doc, const char *type, - const char *file, const T val) - : address(addr), - doc_string(doc), - type_name(type), - file_name(file), - default_value(val) {} - - T *address; - const char *doc_string; - const char *type_name; - const char *file_name; - const T default_value; -}; - -template -class FlagRegister { - public: - static FlagRegister *GetRegister() { - static auto reg = new FlagRegister; - return reg; - } - - const FlagDescription &GetFlagDescription(const string &name) const { - fst::MutexLock l(&flag_lock_); - auto it = flag_table_.find(name); - return it != flag_table_.end() ? 
it->second : 0; - } - - void SetDescription(const string &name, - const FlagDescription &desc) { - fst::MutexLock l(&flag_lock_); - flag_table_.insert(make_pair(name, desc)); - } - - bool SetFlag(const string &val, bool *address) const { - if (val == "true" || val == "1" || val.empty()) { - *address = true; - return true; - } else if (val == "false" || val == "0") { - *address = false; - return true; - } - else { - return false; - } - } - - bool SetFlag(const string &val, string *address) const { - *address = val; - return true; - } - - bool SetFlag(const string &val, int32 *address) const { - char *p = 0; - *address = strtol(val.c_str(), &p, 0); - return !val.empty() && *p == '\0'; - } - - bool SetFlag(const string &val, int64 *address) const { - char *p = 0; - *address = strtoll(val.c_str(), &p, 0); - return !val.empty() && *p == '\0'; - } - - bool SetFlag(const string &val, double *address) const { - char *p = 0; - *address = strtod(val.c_str(), &p); - return !val.empty() && *p == '\0'; - } - - bool SetFlag(const string &arg, const string &val) const { - for (typename std::map< string, FlagDescription >::const_iterator it = - flag_table_.begin(); - it != flag_table_.end(); - ++it) { - const string &name = it->first; - const FlagDescription &desc = it->second; - if (arg == name) - return SetFlag(val, desc.address); - } - return false; - } - - void GetUsage(std::set> *usage_set) const { - for (auto it = flag_table_.begin(); it != flag_table_.end(); ++it) { - const string &name = it->first; - const FlagDescription &desc = it->second; - string usage = " --" + name; - usage += ": type = "; - usage += desc.type_name; - usage += ", default = "; - usage += GetDefault(desc.default_value) + "\n "; - usage += desc.doc_string; - usage_set->insert(make_pair(desc.file_name, usage)); - } - } - - private: - string GetDefault(bool default_value) const { - return default_value ? "true" : "false"; - } - - string GetDefault(const string &default_value) const { - return "\"" + default_value + "\""; - } - - template - string GetDefault(const V &default_value) const { - std::ostringstream strm; - strm << default_value; - return strm.str(); - } - - mutable fst::Mutex flag_lock_; // Multithreading lock. - std::map> flag_table_; -}; - -template -class FlagRegisterer { - public: - FlagRegisterer(const string &name, const FlagDescription &desc) { - auto registr = FlagRegister::GetRegister(); - registr->SetDescription(name, desc); - } - - private: - FlagRegisterer(const FlagRegisterer &) = delete; - FlagRegisterer &operator=(const FlagRegisterer &) = delete; -}; - - -#define DEFINE_VAR(type, name, value, doc) \ - type FLAGS_ ## name = value; \ - static FlagRegisterer \ - name ## _flags_registerer(#name, FlagDescription(&FLAGS_ ## name, \ - doc, \ - #type, \ - __FILE__, \ - value)) - -// #define DEFINE_bool(name, value, doc) DEFINE_VAR(bool, name, value, doc) -// #define DEFINE_string(name, value, doc) \ -// DEFINE_VAR(string, name, value, doc) -// #define DEFINE_int32(name, value, doc) DEFINE_VAR(int32, name, value, doc) -// #define DEFINE_int64(name, value, doc) DEFINE_VAR(int64, name, value, doc) -// #define DEFINE_double(name, value, doc) DEFINE_VAR(double, name, value, doc) - - -// Temporary directory. 
-DECLARE_string(tmpdir); - -void SetFlags(const char *usage, int *argc, char ***argv, bool remove_flags, - const char *src = ""); - -#define SET_FLAGS(usage, argc, argv, rmflags) \ -gflags::ParseCommandLineFlags(argc, argv, true) -// SetFlags(usage, argc, argv, rmflags, __FILE__) - -// Deprecated; for backward compatibility. -inline void InitFst(const char *usage, int *argc, char ***argv, bool rmflags) { - return SetFlags(usage, argc, argv, rmflags); -} - -void ShowUsage(bool long_usage = true); - -#endif // FST_LIB_FLAGS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/patch/openfst/src/include/fst/log.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/patch/openfst/src/include/fst/log.h deleted file mode 100644 index bf041c58ebfab73d03bb14adf28c7c7916a2217d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/patch/openfst/src/include/fst/log.h +++ /dev/null @@ -1,82 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// See www.openfst.org for extensive documentation on this weighted -// finite-state transducer library. -// -// Google-style logging declarations and inline definitions. - -#ifndef FST_LIB_LOG_H_ -#define FST_LIB_LOG_H_ - -#include -#include -#include - -#include -#include - -using std::string; - -DECLARE_int32(v); - -class LogMessage { - public: - LogMessage(const string &type) : fatal_(type == "FATAL") { - std::cerr << type << ": "; - } - ~LogMessage() { - std::cerr << std::endl; - if(fatal_) - exit(1); - } - std::ostream &stream() { return std::cerr; } - - private: - bool fatal_; -}; - -// #define LOG(type) LogMessage(#type).stream() -// #define VLOG(level) if ((level) <= FLAGS_v) LOG(INFO) - -// Checks -inline void FstCheck(bool x, const char* expr, - const char *file, int line) { - if (!x) { - LOG(FATAL) << "Check failed: \"" << expr - << "\" file: " << file - << " line: " << line; - } -} - -// #define CHECK(x) FstCheck(static_cast(x), #x, __FILE__, __LINE__) -// #define CHECK_EQ(x, y) CHECK((x) == (y)) -// #define CHECK_LT(x, y) CHECK((x) < (y)) -// #define CHECK_GT(x, y) CHECK((x) > (y)) -// #define CHECK_LE(x, y) CHECK((x) <= (y)) -// #define CHECK_GE(x, y) CHECK((x) >= (y)) -// #define CHECK_NE(x, y) CHECK((x) != (y)) - -// Debug checks -// #define DCHECK(x) assert(x) -// #define DCHECK_EQ(x, y) DCHECK((x) == (y)) -// #define DCHECK_LT(x, y) DCHECK((x) < (y)) -// #define DCHECK_GT(x, y) DCHECK((x) > (y)) -// #define DCHECK_LE(x, y) DCHECK((x) <= (y)) -// #define DCHECK_GE(x, y) DCHECK((x) >= (y)) -// #define DCHECK_NE(x, y) DCHECK((x) != (y)) - - -// Ports -#define ATTRIBUTE_DEPRECATED __attribute__((deprecated)) - -#endif // FST_LIB_LOG_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/post_processor/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/post_processor/CMakeLists.txt deleted file mode 100644 index 
6113bbc26eb8fe35e4e17ffd1cab382f0fb0f1f8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/post_processor/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -add_library(post_processor STATIC - post_processor.cc -) -target_link_libraries(post_processor PUBLIC utils) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/post_processor/post_processor.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/post_processor/post_processor.cc deleted file mode 100644 index 315f62d34cbc441ecbaf7c07667eb35ee61c2c8d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/post_processor/post_processor.cc +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License - -#include "post_processor/post_processor.h" - -#include -#include - -#include "utils/string.h" - -namespace wenet { - -std::string PostProcessor::ProcessSpace(const std::string& str) { - std::string result = str; - // 1. remove ' ' if needed - // only spaces between mandarin words need to be removed, please note that - // if str contains '_', we assume that the decoding type must be - // `CtcPrefixBeamSearch` and this branch will do nothing since str must be - // obtained via "".join() (in function `AsrDecoder::UpdateResult()`) - if (opts_.language_type == kMandarinEnglish && !str.empty()) { - result.clear(); - // split str by ' ' - std::vector words; - std::stringstream ss(str); - std::string tmp; - while (ss >> tmp) { - words.push_back(tmp); - } - // check english word - bool is_englishword_prev = false; - bool is_englishword_now = false; - for (std::string& w : words) { - is_englishword_now = CheckEnglishWord(w); - if (is_englishword_prev && is_englishword_now) { - result += (' ' + w); - } else { - result += (w); - } - is_englishword_prev = is_englishword_now; - } - } - // 2. 
replace '_' with ' ' - // this should be done for all cases (both kMandarinEnglish and kIndoEuropean) - result = ProcessBlank(result, opts_.lowercase); - return result; -} - -std::string PostProcessor::Process(const std::string& str, bool finish) { - std::string result; - result = ProcessSpace(str); - // TODO(xcsong): do itn/punctuation if finish == true - return result; -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/post_processor/post_processor.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/post_processor/post_processor.h deleted file mode 100644 index 54597845ebc88ad22e1244d2e693e2088cff6d21..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/post_processor/post_processor.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License - -#ifndef POST_PROCESSOR_POST_PROCESSOR_H_ -#define POST_PROCESSOR_POST_PROCESSOR_H_ - -#include -#include -#include - -#include "utils/utils.h" - -namespace wenet { - -enum LanguageType { - // spaces between **mandarin words** should be removed. - // cases of processing spaces with mandarin-only, english-only - // and mandarin-english code-switch can be found in post_processor_test.cc - kMandarinEnglish = 0x00, - // spaces should be kept for most of the - // Indo-European languages (i.e., deutsch or english-deutsch code-switch). - // cases of those languages can be found in post_processor_test.cc - kIndoEuropean = 0x01 -}; - -struct PostProcessOptions { - // space options - // The decoded result may contain spaces (' ' or '_'), - // we will process those spaces according to language_type. 
More details can - // be found in - // https://github.com/wenet-e2e/wenet/issues/583#issuecomment-907994058 - LanguageType language_type = kMandarinEnglish; - // whether lowercase letters are required - bool lowercase = true; -}; - -// TODO(xcsong): add itn/punctuation related resource -struct PostProcessResource {}; - -// Post Processor -class PostProcessor { - public: - explicit PostProcessor(PostProcessOptions&& opts) : opts_(std::move(opts)) {} - explicit PostProcessor(const PostProcessOptions& opts) : opts_(opts) {} - // call other functions to do post processing - std::string Process(const std::string& str, bool finish); - // process spaces according to configurations - std::string ProcessSpace(const std::string& str); - // TODO(xcsong): add itn/punctuation - // void InverseTN(const std::string& str); - // void Punctuate(const std::string& str); - - private: - const PostProcessOptions opts_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(PostProcessor); -}; - -} // namespace wenet - -#endif // POST_PROCESSOR_POST_PROCESSOR_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/test/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/test/CMakeLists.txt deleted file mode 100644 index 145654105350e91a5f9121b47197f5fc60663f5c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/test/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -link_libraries(gtest_main gmock) - -add_executable(utils_test utils_test.cc) -target_link_libraries(utils_test PUBLIC utils) -add_test(UTILS_TEST utils_test) - -add_executable(ctc_prefix_beam_search_test ctc_prefix_beam_search_test.cc) -target_link_libraries(ctc_prefix_beam_search_test PUBLIC decoder) -add_test(CTC_PREFIX_BEAM_SEARCH_TEST ctc_prefix_beam_search_test) - -add_executable(post_processor_test post_processor_test.cc) -target_link_libraries(post_processor_test PUBLIC post_processor) -add_test(POST_PROCESSOR_TEST post_processor_test) - - -add_executable(feature_pipeline_test feature_pipeline_test.cc) -target_link_libraries(feature_pipeline_test PUBLIC frontend) -add_test(FEATURE_PIPELINE_TEST feature_pipeline_test) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/test/ctc_prefix_beam_search_test.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/test/ctc_prefix_beam_search_test.cc deleted file mode 100644 index d8f3b65693b934beb33f3a770795f0b6e7ce3456..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/test/ctc_prefix_beam_search_test.cc +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include "decoder/ctc_prefix_beam_search.h" - -#include -#include - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -#include "utils/utils.h" - -TEST(CtcPrefixBeamSearchTest, CtcPrefixBeamSearchLogicTest) { - using ::testing::ElementsAre; - // See https://robin1001.github.io/2020/12/11/ctc-search for the - // graph demonstration of the data - std::vector> data = { - {0.25, 0.40, 0.35}, {0.40, 0.35, 0.25}, {0.10, 0.50, 0.40}}; - // Apply log - for (int i = 0; i < data.size(); i++) { - for (int j = 0; j < data[i].size(); j++) { - data[i][j] = std::log(data[i][j]); - } - } - wenet::CtcPrefixBeamSearchOptions option; - option.first_beam_size = 3; - option.second_beam_size = 3; - wenet::CtcPrefixBeamSearch prefix_beam_search(option); - prefix_beam_search.Search(data); - /* Test case info - | top k | result index | prefix score | viterbi score | timestamp | - |-------|--------------|--------------|---------------|-----------| - | top 1 | [2, 1] | 0.2185 | 0.07 | [0, 2] | - | top 2 | [1, 2] | 0.1550 | 0.064 | [0, 2] | - | top 3 | [1] | 0.1525 | 0.07 | [2] | - */ - const std::vector>& result = prefix_beam_search.Outputs(); - EXPECT_EQ(result.size(), 3); - ASSERT_THAT(result[0], ElementsAre(2, 1)); - ASSERT_THAT(result[1], ElementsAre(1, 2)); - ASSERT_THAT(result[2], ElementsAre(1)); - - const std::vector& likelihood = prefix_beam_search.Likelihood(); - EXPECT_EQ(likelihood.size(), 3); - EXPECT_FLOAT_EQ(std::exp(likelihood[0]), 0.2185); - EXPECT_FLOAT_EQ(std::exp(likelihood[1]), 0.1550); - EXPECT_FLOAT_EQ(std::exp(likelihood[2]), 0.1525); - - const std::vector& viterbi_likelihood = - prefix_beam_search.viterbi_likelihood(); - EXPECT_EQ(viterbi_likelihood.size(), 3); - EXPECT_FLOAT_EQ(std::exp(viterbi_likelihood[0]), 0.07); - EXPECT_FLOAT_EQ(std::exp(viterbi_likelihood[1]), 0.064); - EXPECT_FLOAT_EQ(std::exp(viterbi_likelihood[2]), 0.07); - - const std::vector>& times = prefix_beam_search.Times(); - EXPECT_EQ(times.size(), 3); - ASSERT_THAT(times[0], ElementsAre(0, 2)); - ASSERT_THAT(times[1], ElementsAre(0, 2)); - ASSERT_THAT(times[2], ElementsAre(2)); -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/test/feature_pipeline_test.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/test/feature_pipeline_test.cc deleted file mode 100644 index 244ec0735b6086211b476e8d97569e1ee5959bc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/test/feature_pipeline_test.cc +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) 2022 Roney -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include - -#include "frontend/feature_pipeline.h" -#include "utils/blocking_queue.h" - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -void pushQueue(const std::shared_ptr>& que, - std::vector vec) { - que->Push(vec); -} - -void popQueue(const std::shared_ptr>& que, int num, - int back_data) { - auto pop_data = que->Pop(num); - ASSERT_EQ(pop_data[num - 1], back_data); -} - -TEST(FeaturePipelineTest, BlockingQueueTest) { - auto capacity_queue = std::make_shared>(2); - std::vector test_data{1, 2, 3, 4, 5}; - std::thread push_thread(&pushQueue, capacity_queue, test_data); - ASSERT_EQ(capacity_queue->Pop(), 1); - ASSERT_LE(capacity_queue->Size(), 2); // capacity_queue: 2 or 2,3 - auto pop_data = capacity_queue->Pop(3); // 2,3,4 num > capacity - ASSERT_EQ(pop_data.size(), 3); - ASSERT_EQ(pop_data[2], 4); - push_thread.join(); - ASSERT_EQ(capacity_queue->Size(), 1); // capacity_queue:5 - - std::thread pop_thread(&popQueue, capacity_queue, 3, 0); // num > capacity - capacity_queue->Push(9); // capacity_queue:5,9 - capacity_queue->Push(0); // capacity_queue:5,9,0 - pop_thread.join(); // capacity_queue: - ASSERT_EQ(capacity_queue->Size(), 0); - - pop_data = capacity_queue->Pop(0); - ASSERT_TRUE(pop_data.empty()); -} - -TEST(FeaturePipelineTest, PipelineTest) { - wenet::FeaturePipelineConfig config(80, 8000); - wenet::FeaturePipeline feature_pipeline(config); - int audio_len = 8 * 55; // audio len 55ms,4 frames - std::vector pcm(audio_len, 0); - feature_pipeline.AcceptWaveform(pcm.data(), audio_len); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 4); - - std::vector> out_feats; - auto b = feature_pipeline.Read(2, &out_feats); - ASSERT_TRUE(b); - ASSERT_EQ(out_feats.size(), 2); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 2); - - std::vector out_feat; - b = feature_pipeline.ReadOne(&out_feat); - ASSERT_TRUE(b); - ASSERT_FALSE(out_feat.empty()); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 1); - - feature_pipeline.set_input_finished(); - b = feature_pipeline.Read(2, &out_feats); - ASSERT_FALSE(b); - ASSERT_EQ(out_feats.size(), 1); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 0); - - feature_pipeline.AcceptWaveform(pcm.data(), audio_len); - feature_pipeline.Read(2, &out_feats); - feature_pipeline.Reset(); - feature_pipeline.set_input_finished(); - b = feature_pipeline.Read(2, &out_feats); - ASSERT_FALSE(b); - ASSERT_EQ(out_feats.size(), 0); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 0); -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/test/post_processor_test.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/test/post_processor_test.cc deleted file mode 100644 index fa11fa29231032d62389a93fd00b0ec782bf8a3b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/test/post_processor_test.cc +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
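A quick sanity check on the "55 ms of 8 kHz audio gives 4 frames" expectation in the deleted feature_pipeline_test.cc above, assuming the conventional 25 ms analysis window and 10 ms frame shift (the usual fbank defaults; the test itself does not spell them out):

```cpp
#include <cstdio>

// 8 kHz * 55 ms = 440 samples; with a 200-sample window and 80-sample shift
// the pipeline can cut exactly four frames, matching NumQueuedFrames() == 4.
int main() {
  const int sample_rate = 8000;                              // Hz
  const int audio_ms = 55;                                   // matches `8 * 55` in the test
  const int num_samples = sample_rate / 1000 * audio_ms;     // 440
  const int frame_len = sample_rate / 1000 * 25;             // 200 samples (assumed 25 ms window)
  const int frame_shift = sample_rate / 1000 * 10;           // 80 samples (assumed 10 ms shift)
  const int num_frames = (num_samples - frame_len) / frame_shift + 1;
  printf("%d samples -> %d frames\n", num_samples, num_frames);  // 440 samples -> 4 frames
}
```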
-// See the License for the specific language governing permissions and -// limitations under the License - -#include "post_processor/post_processor.h" - -#include -#include - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -#include "utils/utils.h" - -TEST(PostProcessorTest, ProcessSpacekMandarinEnglishTest) { - wenet::PostProcessOptions opts_lowercase; - wenet::PostProcessor post_processor_lowercase(opts_lowercase); - - wenet::PostProcessOptions opts_uppercase; - opts_uppercase.lowercase = false; - wenet::PostProcessor post_processor_uppercase(opts_uppercase); - - std::vector input = { - // modeling unit: mandarin character - // decode type: CtcPrefixBeamSearch, "".join() - "震东好帅", - // modeling unit: mandarin word - // decode type: CtcWfstBeamSearch, " ".join() - " 吴迪 也 好帅", - // modeling unit: english wordpiece - // decode type: CtcPrefixBeamSearch, "".join() - "▁binbin▁is▁also▁handsome", - // modeling unit: english word - // decode type: CtcWfstBeamSearch, " ".join() - " life is short i use wenet", - // modeling unit: mandarin character + english wordpiece - // decode type: CtcPrefixBeamSearch, "".join() - "超哥▁is▁the▁most▁handsome", - // modeling unit: mandarin word + english word - // decode type: CtcWfstBeamSearch, " ".join() - " 人生 苦短 i use wenet", - }; - - std::vector result_lowercase = { - "震东好帅", - "吴迪也好帅", - "binbin is also handsome", - "life is short i use wenet", - "超哥 is the most handsome", - "人生苦短i use wenet", - }; - - std::vector result_uppercase = { - "震东好帅", - "吴迪也好帅", - "BINBIN IS ALSO HANDSOME", - "LIFE IS SHORT I USE WENET", - "超哥 IS THE MOST HANDSOME", - "人生苦短I USE WENET", - }; - - for (size_t i = 0; i < input.size(); ++i) { - EXPECT_EQ(post_processor_lowercase.ProcessSpace(input[i]), - result_lowercase[i]); - EXPECT_EQ(post_processor_uppercase.ProcessSpace(input[i]), - result_uppercase[i]); - } -} - -TEST(PostProcessorTest, ProcessSpacekIndoEuropeanTest) { - wenet::PostProcessOptions opts_lowercase; - opts_lowercase.language_type = wenet::kIndoEuropean; - wenet::PostProcessor post_processor_lowercase(opts_lowercase); - - wenet::PostProcessOptions opts_uppercase; - opts_uppercase.language_type = wenet::kIndoEuropean; - opts_uppercase.lowercase = false; - wenet::PostProcessor post_processor_uppercase(opts_uppercase); - - std::vector input = { - // modeling unit: wordpiece - // decode type: CtcPrefixBeamSearch, "".join() - "▁zhendong▁ist▁so▁schön", - // modeling unit: word - // decode type: CtcWfstBeamSearch, " ".join() - " zhendong ist so schön"}; - - std::vector result_lowercase = {"zhendong ist so schön", - "zhendong ist so schön"}; - - std::vector result_uppercase = {"ZHENDONG IST SO SCHÖN", - "ZHENDONG IST SO SCHÖN"}; - - for (size_t i = 0; i < input.size(); ++i) { - EXPECT_EQ(post_processor_lowercase.ProcessSpace(input[i]), - result_lowercase[i]); - EXPECT_EQ(post_processor_uppercase.ProcessSpace(input[i]), - result_uppercase[i]); - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/test/utils_test.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/test/utils_test.cc deleted file mode 100644 index 6b2bbac25e000ce854d5e55a50cb51109d62d758..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/test/utils_test.cc +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
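The expectations in the deleted post_processor_test.cc above boil down to: wordpiece markers (▁) become spaces, leading/trailing/doubled spaces are squeezed, and the result is case-folded; the language-dependent removal of spaces around CJK characters is separate logic that is not shown in this hunk. A standalone sketch of just the marker/space/case step follows (ASCII-only case folding, illustrative name, not the wenet ProcessBlank):

```cpp
#include <cctype>
#include <cstdio>
#include <string>

// Replace every "▁" wordpiece marker with a space, trim and squeeze spaces,
// then case-fold ASCII letters.  Sketch only; the real code also converts to
// a wide string so case folding works beyond ASCII.
std::string ProcessBlankSketch(std::string s, bool lowercase) {
  const std::string kSpaceSymbol = "\xe2\x96\x81";  // UTF-8 bytes of ▁
  for (size_t pos = s.find(kSpaceSymbol); pos != std::string::npos;
       pos = s.find(kSpaceSymbol, pos)) {
    s.replace(pos, kSpaceSymbol.size(), " ");
  }
  std::string out;
  for (char c : s) {
    if (c == ' ') {
      if (!out.empty() && out.back() != ' ') out.push_back(' ');  // squeeze
    } else {
      out.push_back(lowercase ? std::tolower(static_cast<unsigned char>(c))
                              : std::toupper(static_cast<unsigned char>(c)));
    }
  }
  if (!out.empty() && out.back() == ' ') out.pop_back();  // drop trailing space
  return out;
}

int main() {
  // Same shapes as two of the cases in the deleted test:
  printf("[%s]\n", ProcessBlankSketch("▁binbin▁is▁also▁handsome", true).c_str());
  // -> [binbin is also handsome]
  printf("[%s]\n", ProcessBlankSketch(" life is short i use wenet", false).c_str());
  // -> [LIFE IS SHORT I USE WENET]
}
```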
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "utils/utils.h" - -#include - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -TEST(UtilsTest, TopKTest) { - using ::testing::ElementsAre; - using ::testing::FloatNear; - using ::testing::Pointwise; - std::vector data = {1, 3, 5, 7, 9, 2, 4, 6, 8, 10}; - std::vector values; - std::vector indices; - wenet::TopK(data, 3, &values, &indices); - EXPECT_THAT(values, Pointwise(FloatNear(1e-8), {10, 9, 8})); - ASSERT_THAT(indices, ElementsAre(9, 4, 8)); -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/CMakeLists.txt deleted file mode 100644 index 686362688c050d48224ca0a01e0d24b03d94758a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -add_library(utils STATIC - string.cc - utils.cc -) - -if(NOT ANDROID) - if(MSVC) - target_link_libraries(utils PUBLIC fst) - else() - target_link_libraries(utils PUBLIC fst dl) - endif() -endif() \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/blocking_queue.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/blocking_queue.h deleted file mode 100644 index 9bf0127d9298fbfae2eeebb9431c680fc5dd7647..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/blocking_queue.h +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
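The TopK expectation in the deleted utils_test.cc above (values 10, 9, 8 at indices 9, 4, 8) can be reproduced with a plain sort over indices. This is a simplified stand-in for the heap-based TopK defined later in the deleted utils.cc, keeping the same ordering: largest value first, ties broken toward the smaller index.

```cpp
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

// Return the k largest values of `data` together with their original indices.
void TopK(const std::vector<float>& data, int k,
          std::vector<float>* values, std::vector<int>* indices) {
  std::vector<int> order(data.size());
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(), [&](int a, int b) {
    return data[a] > data[b] || (data[a] == data[b] && a < b);
  });
  const int n = std::min<int>(k, static_cast<int>(data.size()));
  values->assign(n, 0.0f);
  indices->assign(n, 0);
  for (int i = 0; i < n; ++i) {
    (*values)[i] = data[order[i]];
    (*indices)[i] = order[i];
  }
}

int main() {
  std::vector<float> data = {1, 3, 5, 7, 9, 2, 4, 6, 8, 10};
  std::vector<float> values;
  std::vector<int> indices;
  TopK(data, 3, &values, &indices);
  for (size_t i = 0; i < values.size(); ++i)
    printf("value %.0f at index %d\n", values[i], indices[i]);
  // As in the deleted test: 10 at 9, 9 at 4, 8 at 8
}
```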
- -#ifndef UTILS_BLOCKING_QUEUE_H_ -#define UTILS_BLOCKING_QUEUE_H_ - -#include -#include -#include -#include -#include -#include - -#include "utils/utils.h" - -namespace wenet { - -template -class BlockingQueue { - public: - explicit BlockingQueue(size_t capacity = std::numeric_limits::max()) - : capacity_(capacity) {} - - void Push(const T& value) { - { - std::unique_lock lock(mutex_); - while (queue_.size() >= capacity_) { - not_full_condition_.wait(lock); - } - queue_.push(value); - } - not_empty_condition_.notify_one(); - } - - void Push(T&& value) { - { - std::unique_lock lock(mutex_); - while (queue_.size() >= capacity_) { - not_full_condition_.wait(lock); - } - queue_.push(std::move(value)); - } - not_empty_condition_.notify_one(); - } - - void Push(const std::vector& values) { - { - std::unique_lock lock(mutex_); - for (auto& value : values) { - while (queue_.size() >= capacity_) { - not_empty_condition_.notify_one(); - not_full_condition_.wait(lock); - } - queue_.push(value); - } - } - not_empty_condition_.notify_one(); - } - - void Push(std::vector&& values) { - std::unique_lock lock(mutex_); - for (auto& value : values) { - while (queue_.size() >= capacity_) { - not_empty_condition_.notify_one(); - not_full_condition_.wait(lock); - } - queue_.push(std::move(value)); - } - not_empty_condition_.notify_one(); - } - - T Pop() { - std::unique_lock lock(mutex_); - while (queue_.empty()) { - not_empty_condition_.wait(lock); - } - T t(std::move(queue_.front())); - queue_.pop(); - not_full_condition_.notify_one(); - return t; - } - - // num can be greater than capacity,but it needs to be used with care - std::vector Pop(size_t num) { - std::unique_lock lock(mutex_); - std::vector block_data; - while (block_data.size() < num) { - while (queue_.empty()) { - not_full_condition_.notify_one(); - not_empty_condition_.wait(lock); - } - block_data.push_back(std::move(queue_.front())); - queue_.pop(); - } - not_full_condition_.notify_one(); - return block_data; - } - - bool Empty() const { - std::lock_guard lock(mutex_); - return queue_.empty(); - } - - size_t Size() const { - std::lock_guard lock(mutex_); - return queue_.size(); - } - - void Clear() { - while (!Empty()) { - Pop(); - } - } - - private: - size_t capacity_; - mutable std::mutex mutex_; - std::condition_variable not_full_condition_; - std::condition_variable not_empty_condition_; - std::queue queue_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(BlockingQueue); -}; - -} // namespace wenet - -#endif // UTILS_BLOCKING_QUEUE_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/file.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/file.h deleted file mode 100644 index 83ad9c8c52fecd334b3549285bf39cd4f59b9f2b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/file.h +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
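The deleted blocking_queue.h pairs a capacity bound with two condition variables (not_full_ for producers, not_empty_ for consumers); its bulk Push deliberately wakes waiting consumers while it is itself blocked on space, which is what makes Pop(num) with num greater than the capacity workable. Below is a pared-down sketch of the same two-condition-variable pattern (single-element Push/Pop only, illustrative class name, not the wenet template):

```cpp
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <queue>
#include <thread>

// Minimal bounded blocking queue: producers wait while full, consumers wait
// while empty, each side notifies the other after changing the queue.
class BoundedQueue {
 public:
  explicit BoundedQueue(size_t capacity) : capacity_(capacity) {}

  void Push(int v) {
    std::unique_lock<std::mutex> lock(mutex_);
    not_full_.wait(lock, [&] { return queue_.size() < capacity_; });
    queue_.push(v);
    not_empty_.notify_one();
  }

  int Pop() {
    std::unique_lock<std::mutex> lock(mutex_);
    not_empty_.wait(lock, [&] { return !queue_.empty(); });
    int v = queue_.front();
    queue_.pop();
    not_full_.notify_one();
    return v;
  }

 private:
  size_t capacity_;
  std::mutex mutex_;
  std::condition_variable not_full_, not_empty_;
  std::queue<int> queue_;
};

int main() {
  BoundedQueue q(2);  // same capacity as the deleted BlockingQueueTest uses
  std::thread producer([&] { for (int i = 1; i <= 5; ++i) q.Push(i); });
  for (int i = 1; i <= 5; ++i) printf("popped %d\n", q.Pop());
  producer.join();
}
```

The vendored class additionally offers bulk Push/Pop, Size and Clear on top of this handshake.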
- -#ifndef UTILS_FILE_H_ -#define UTILS_FILE_H_ - -#include -#include - -namespace wenet { - -inline bool FileExists(const std::string& path) { - std::ifstream f(path.c_str()); - return f.good(); -} - -} // namespace wenet - -#endif // UTILS_FILE_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/flags.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/flags.h deleted file mode 100644 index 3432aa78847322edec8d6d2aec59ed7ca5352fcd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/flags.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_FLAGS_H_ -#define UTILS_FLAGS_H_ - -// Because openfst is a dynamic library compiled with gflags/glog, we must use -// the gflags/glog from openfst to avoid them linked both statically and -// dynamically into the executable. -#include "fst/flags.h" - -#endif // UTILS_FLAGS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/json.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/json.h deleted file mode 100644 index bf8d94a3e42504139b10daa39b8f8e7a8b2d93cc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/json.h +++ /dev/null @@ -1,754 +0,0 @@ -// Copyright (c) From https://github.com/nbsdx/SimpleJSON -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef UTILS_JSON_H_ -#define UTILS_JSON_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace json { - -using std::deque; -using std::enable_if; -using std::initializer_list; -using std::is_convertible; -using std::is_floating_point; -using std::is_integral; -using std::is_same; -using std::map; -using std::string; - -namespace { // NOLINT -string json_escape(const string& str) { - string output; - for (unsigned i = 0; i < str.length(); ++i) switch (str[i]) { - case '\"': - output += "\\\""; - break; - case '\\': - output += "\\\\"; - break; - case '\b': - output += "\\b"; - break; - case '\f': - output += "\\f"; - break; - case '\n': - output += "\\n"; - break; - case '\r': - output += "\\r"; - break; - case '\t': - output += "\\t"; - break; - default: - output += str[i]; - break; - } - return std::move(output); -} -} // namespace - -class JSON { - union BackingData { - BackingData(double d) : Float(d) {} - BackingData(int l) : Int(l) {} - BackingData(bool b) : Bool(b) {} - BackingData(string s) : String(new string(s)) {} - BackingData() : Int(0) {} - - deque* List; - map* Map; - string* String; - double Float; - int Int; - bool Bool; - } Internal; - - public: - enum class Class { Null, Object, Array, String, Floating, Integral, Boolean }; - - template - class JSONWrapper { - Container* object; - - public: - explicit JSONWrapper(Container* val) : object(val) {} - explicit JSONWrapper(std::nullptr_t) : object(nullptr) {} - - typename Container::iterator begin() { - return object ? object->begin() : typename Container::iterator(); - } - typename Container::iterator end() { - return object ? object->end() : typename Container::iterator(); - } - typename Container::const_iterator begin() const { - return object ? object->begin() : typename Container::iterator(); - } - typename Container::const_iterator end() const { - return object ? object->end() : typename Container::iterator(); - } - }; - - template - class JSONConstWrapper { - const Container* object; - - public: - explicit JSONConstWrapper(const Container* val) : object(val) {} - explicit JSONConstWrapper(std::nullptr_t) : object(nullptr) {} - - typename Container::const_iterator begin() const { - return object ? object->begin() : typename Container::const_iterator(); - } - typename Container::const_iterator end() const { - return object ? 
object->end() : typename Container::const_iterator(); - } - }; - - JSON() : Internal(), Type(Class::Null) {} - - explicit JSON(initializer_list list) : JSON() { - SetType(Class::Object); - for (auto i = list.begin(), e = list.end(); i != e; ++i, ++i) - operator[](i->ToString()) = *std::next(i); - } - - JSON(JSON&& other) : Internal(other.Internal), Type(other.Type) { - other.Type = Class::Null; - other.Internal.Map = nullptr; - } - - JSON& operator=(JSON&& other) { - ClearInternal(); - Internal = other.Internal; - Type = other.Type; - other.Internal.Map = nullptr; - other.Type = Class::Null; - return *this; - } - - JSON(const JSON& other) { - switch (other.Type) { - case Class::Object: - Internal.Map = new map(other.Internal.Map->begin(), - other.Internal.Map->end()); - break; - case Class::Array: - Internal.List = new deque(other.Internal.List->begin(), - other.Internal.List->end()); - break; - case Class::String: - Internal.String = new string(*other.Internal.String); - break; - default: - Internal = other.Internal; - } - Type = other.Type; - } - - JSON& operator=(const JSON& other) { - ClearInternal(); - switch (other.Type) { - case Class::Object: - Internal.Map = new map(other.Internal.Map->begin(), - other.Internal.Map->end()); - break; - case Class::Array: - Internal.List = new deque(other.Internal.List->begin(), - other.Internal.List->end()); - break; - case Class::String: - Internal.String = new string(*other.Internal.String); - break; - default: - Internal = other.Internal; - } - Type = other.Type; - return *this; - } - - ~JSON() { - switch (Type) { - case Class::Array: - delete Internal.List; - break; - case Class::Object: - delete Internal.Map; - break; - case Class::String: - delete Internal.String; - break; - default: { - }; - } - } - - template - explicit JSON(T b, typename enable_if::value>::type* = 0) - : Internal(b), Type(Class::Boolean) {} - - template - explicit JSON(T i, typename enable_if::value && - !is_same::value>::type* = 0) - : Internal(static_cast(i)), Type(Class::Integral) {} - - template - explicit JSON(T f, typename enable_if::value>::type* = 0) - : Internal(static_cast(f)), Type(Class::Floating) {} - - template - explicit JSON(T s, - typename enable_if::value>::type* = 0) - : Internal(string(s)), Type(Class::String) {} - - explicit JSON(std::nullptr_t) : Internal(), Type(Class::Null) {} - - static JSON Make(Class type) { - JSON ret; - ret.SetType(type); - return ret; - } - - static JSON Load(const string&); - - template - void append(T arg) { - SetType(Class::Array); - Internal.List->emplace_back(arg); - } - - template - void append(T arg, U... 
args) { - append(arg); - append(args...); - } - - template - typename enable_if::value, JSON&>::type operator=(T b) { - SetType(Class::Boolean); - Internal.Bool = b; - return *this; - } - - template - typename enable_if::value && !is_same::value, - JSON&>::type - operator=(T i) { - SetType(Class::Integral); - Internal.Int = i; - return *this; - } - - template - typename enable_if::value, JSON&>::type operator=(T f) { - SetType(Class::Floating); - Internal.Float = f; - return *this; - } - - template - typename enable_if::value, JSON&>::type operator=( - T s) { - SetType(Class::String); - *Internal.String = string(s); - return *this; - } - - JSON& operator[](const string& key) { - SetType(Class::Object); - return Internal.Map->operator[](key); - } - - JSON& operator[](unsigned index) { - SetType(Class::Array); - if (index >= Internal.List->size()) Internal.List->resize(index + 1); - return Internal.List->operator[](index); - } - - JSON& at(const string& key) { return operator[](key); } - - const JSON& at(const string& key) const { return Internal.Map->at(key); } - - JSON& at(unsigned index) { return operator[](index); } - - const JSON& at(unsigned index) const { return Internal.List->at(index); } - - int length() const { - if (Type == Class::Array) - return Internal.List->size(); - else - return -1; - } - - bool hasKey(const string& key) const { - if (Type == Class::Object) - return Internal.Map->find(key) != Internal.Map->end(); - return false; - } - - int size() const { - if (Type == Class::Object) - return Internal.Map->size(); - else if (Type == Class::Array) - return Internal.List->size(); - else - return -1; - } - - Class JSONType() const { return Type; } - - /// Functions for getting primitives from the JSON object. - bool IsNull() const { return Type == Class::Null; } - - string ToString() const { - bool b; - return std::move(ToString(&b)); - } - string ToString(bool* ok) const { - *ok = (Type == Class::String); - return *ok ? std::move(json_escape(*Internal.String)) : string(""); - } - - double ToFloat() const { - bool b; - return ToFloat(&b); - } - double ToFloat(bool* ok) const { - *ok = (Type == Class::Floating); - return *ok ? Internal.Float : 0.0; - } - - int ToInt() const { - bool b; - return ToInt(&b); - } - int ToInt(bool* ok) const { - *ok = (Type == Class::Integral); - return *ok ? Internal.Int : 0; - } - - bool ToBool() const { - bool b; - return ToBool(&b); - } - bool ToBool(bool* ok) const { - *ok = (Type == Class::Boolean); - return *ok ? 
Internal.Bool : false; - } - - JSONWrapper> ObjectRange() { - if (Type == Class::Object) - return JSONWrapper>(Internal.Map); - return JSONWrapper>(nullptr); - } - - JSONWrapper> ArrayRange() { - if (Type == Class::Array) return JSONWrapper>(Internal.List); - return JSONWrapper>(nullptr); - } - - JSONConstWrapper> ObjectRange() const { - if (Type == Class::Object) - return JSONConstWrapper>(Internal.Map); - return JSONConstWrapper>(nullptr); - } - - JSONConstWrapper> ArrayRange() const { - if (Type == Class::Array) - return JSONConstWrapper>(Internal.List); - return JSONConstWrapper>(nullptr); - } - - string dump(int depth = 1, string tab = " ") const { - string pad = ""; - for (int i = 0; i < depth; ++i, pad += tab) { - } - - switch (Type) { - case Class::Null: - return "null"; - case Class::Object: { - string s = "{\n"; - bool skip = true; - for (auto& p : *Internal.Map) { - if (!skip) s += ",\n"; - s += (pad + "\"" + p.first + "\" : " + p.second.dump(depth + 1, tab)); - skip = false; - } - s += ("\n" + pad.erase(0, 2) + "}"); - return s; - } - case Class::Array: { - string s = "["; - bool skip = true; - for (auto& p : *Internal.List) { - if (!skip) s += ", "; - s += p.dump(depth + 1, tab); - skip = false; - } - s += "]"; - return s; - } - case Class::String: - return "\"" + json_escape(*Internal.String) + "\""; - case Class::Floating: - return std::to_string(Internal.Float); - case Class::Integral: - return std::to_string(Internal.Int); - case Class::Boolean: - return Internal.Bool ? "true" : "false"; - default: - return ""; - } - return ""; - } - - friend std::ostream& operator<<(std::ostream&, const JSON&); - - private: - void SetType(Class type) { - if (type == Type) return; - - ClearInternal(); - - switch (type) { - case Class::Null: - Internal.Map = nullptr; - break; - case Class::Object: - Internal.Map = new map(); - break; - case Class::Array: - Internal.List = new deque(); - break; - case Class::String: - Internal.String = new string(); - break; - case Class::Floating: - Internal.Float = 0.0; - break; - case Class::Integral: - Internal.Int = 0; - break; - case Class::Boolean: - Internal.Bool = false; - break; - } - - Type = type; - } - - private: - /* beware: only call if YOU know that Internal is allocated. No checks - performed here. This function should be called in a constructed JSON just - before you are going to overwrite Internal... -*/ - void ClearInternal() { - switch (Type) { - case Class::Object: - delete Internal.Map; - break; - case Class::Array: - delete Internal.List; - break; - case Class::String: - delete Internal.String; - break; - default: { - }; - } - } - - private: - Class Type = Class::Null; -}; - -JSON Array() { return std::move(JSON::Make(JSON::Class::Array)); } - -template -JSON Array(T... 
args) { - JSON arr = JSON::Make(JSON::Class::Array); - arr.append(args...); - return std::move(arr); -} - -JSON Object() { return std::move(JSON::Make(JSON::Class::Object)); } - -std::ostream& operator<<(std::ostream& os, const JSON& json) { - os << json.dump(); - return os; -} - -namespace { // NOLINT -JSON parse_next(const string&, size_t&); - -void consume_ws(const string& str, size_t& offset) { // NOLINT - while (isspace(str[offset])) ++offset; -} - -JSON parse_object(const string& str, size_t& offset) { // NOLINT - JSON Object = JSON::Make(JSON::Class::Object); - - ++offset; - consume_ws(str, offset); - if (str[offset] == '}') { - ++offset; - return std::move(Object); - } - - while (true) { - JSON Key = parse_next(str, offset); - consume_ws(str, offset); - if (str[offset] != ':') { - std::cerr << "Error: Object: Expected colon, found '" << str[offset] - << "'\n"; - break; - } - consume_ws(str, ++offset); - JSON Value = parse_next(str, offset); - Object[Key.ToString()] = Value; - - consume_ws(str, offset); - if (str[offset] == ',') { - ++offset; - continue; - } else if (str[offset] == '}') { - ++offset; - break; - } else { - std::cerr << "ERROR: Object: Expected comma, found '" << str[offset] - << "'\n"; - break; - } - } - - return std::move(Object); -} - -JSON parse_array(const string& str, size_t& offset) { // NOLINT - JSON Array = JSON::Make(JSON::Class::Array); - unsigned index = 0; - - ++offset; - consume_ws(str, offset); - if (str[offset] == ']') { - ++offset; - return std::move(Array); - } - - while (true) { - Array[index++] = parse_next(str, offset); - consume_ws(str, offset); - - if (str[offset] == ',') { - ++offset; - continue; - } else if (str[offset] == ']') { - ++offset; - break; - } else { - std::cerr << "ERROR: Array: Expected ',' or ']', found '" << str[offset] - << "'\n"; - return std::move(JSON::Make(JSON::Class::Array)); - } - } - - return std::move(Array); -} - -JSON parse_string(const string& str, size_t& offset) { // NOLINT - JSON String; - string val; - for (char c = str[++offset]; c != '\"'; c = str[++offset]) { - if (c == '\\') { - switch (str[++offset]) { - case '\"': - val += '\"'; - break; - case '\\': - val += '\\'; - break; - case '/': - val += '/'; - break; - case 'b': - val += '\b'; - break; - case 'f': - val += '\f'; - break; - case 'n': - val += '\n'; - break; - case 'r': - val += '\r'; - break; - case 't': - val += '\t'; - break; - case 'u': { - val += "\\u"; - for (unsigned i = 1; i <= 4; ++i) { - c = str[offset + i]; - if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || - (c >= 'A' && c <= 'F')) { - val += c; - } else { - std::cerr << "ERROR: String: Expected hex character in unicode " - "escape, found '" - << c << "'\n"; - return std::move(JSON::Make(JSON::Class::String)); - } - } - offset += 4; - } break; - default: - val += '\\'; - break; - } - } else { - val += c; - } - } - ++offset; - String = val; - return std::move(String); -} - -JSON parse_number(const string& str, size_t& offset) { // NOLINT - JSON Number; - string val, exp_str; - char c; - bool isDouble = false; - int exp = 0; - while (true) { - c = str[offset++]; - if ((c == '-') || (c >= '0' && c <= '9')) { - val += c; - } else if (c == '.') { - val += c; - isDouble = true; - } else { - break; - } - } - if (c == 'E' || c == 'e') { - c = str[offset++]; - if (c == '-') { - ++offset; - exp_str += '-'; - } - while (true) { - c = str[offset++]; - if (c >= '0' && c <= '9') { - exp_str += c; - } else if (!isspace(c) && c != ',' && c != ']' && c != '}') { - std::cerr << "ERROR: Number: 
Expected a number for exponent, found '" - << c << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } else { - break; - } - } - exp = std::stol(exp_str); - } else if (!isspace(c) && c != ',' && c != ']' && c != '}') { - std::cerr << "ERROR: Number: unexpected character '" << c << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } - --offset; - - if (isDouble) { - Number = std::stod(val) * std::pow(10, exp); - } else { - if (!exp_str.empty()) - Number = std::stol(val) * std::pow(10, exp); - else - Number = std::stol(val); - } - return std::move(Number); -} - -JSON parse_bool(const string& str, size_t& offset) { // NOLINT - JSON Bool; - if (str.substr(offset, 4) == "true") { - Bool = true; - } else if (str.substr(offset, 5) == "false") { - Bool = false; - } else { - std::cerr << "ERROR: Bool: Expected 'true' or 'false', found '" - << str.substr(offset, 5) << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } - offset += (Bool.ToBool() ? 4 : 5); - return std::move(Bool); -} - -JSON parse_null(const string& str, size_t& offset) { // NOLINT - JSON Null; - if (str.substr(offset, 4) != "null") { - std::cerr << "ERROR: Null: Expected 'null', found '" - << str.substr(offset, 4) << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } - offset += 4; - return std::move(Null); -} - -JSON parse_next(const string& str, size_t& offset) { // NOLINT - char value; - consume_ws(str, offset); - value = str[offset]; - switch (value) { - case '[': - return std::move(parse_array(str, offset)); - case '{': - return std::move(parse_object(str, offset)); - case '\"': - return std::move(parse_string(str, offset)); - case 't': - case 'f': - return std::move(parse_bool(str, offset)); - case 'n': - return std::move(parse_null(str, offset)); - default: - if ((value <= '9' && value >= '0') || value == '-') - return std::move(parse_number(str, offset)); - } - std::cerr << "ERROR: Parse: Unknown starting character '" << value << "'\n"; - return JSON(); -} -} // namespace - -JSON JSON::Load(const string& str) { - size_t offset = 0; - return std::move(parse_next(str, offset)); -} - -} // namespace json - -#endif // UTILS_JSON_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/log.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/log.h deleted file mode 100644 index c2bf03f261a8711f74da819d80d68e8eb9fb124a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/log.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_LOG_H_ -#define UTILS_LOG_H_ - -// Because openfst is a dynamic library compiled with gflags/glog, we must use -// the gflags/glog from openfst to avoid them linked both statically and -// dynamically into the executable. 
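The vendored json.h above stores every value in a hand-rolled union (BackingData) discriminated by a Class enum, doing its own new/delete in SetType/ClearInternal whenever the active member changes. Purely as a point of comparison (not a drop-in replacement), the same tagged-union idea for the scalar alternatives can be written with std::variant, which carries the tag and manages the payload lifetime itself:

```cpp
#include <cstdio>
#include <string>
#include <variant>

// One value, several possible payloads, plus a tag saying which one is live.
using Scalar = std::variant<std::nullptr_t, bool, long, double, std::string>;

const char* TypeName(const Scalar& v) {
  switch (v.index()) {
    case 0: return "Null";
    case 1: return "Boolean";
    case 2: return "Integral";
    case 3: return "Floating";
    default: return "String";
  }
}

int main() {
  Scalar v;                           // starts as Null, like a default JSON()
  printf("%s\n", TypeName(v));
  v = std::string("server_ready");    // SetType(Class::String) + assign in json.h
  printf("%s: %s\n", TypeName(v), std::get<std::string>(v).c_str());
  v = 3.14;                           // switching the alternative destroys the string
  printf("%s\n", TypeName(v));
}
```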
-#include "fst/log.h" - -#endif // UTILS_LOG_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/string.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/string.cc deleted file mode 100644 index 1ab93adf3cac1bc5a42c0b8c6cadbde399678fef..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/string.cc +++ /dev/null @@ -1,195 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "utils/string.h" - -#include -#include -#include - -#include "utils/log.h" -#include "utils/utils.h" - -namespace wenet { - -void SplitString(const std::string& str, std::vector* strs) { - SplitStringToVector(Trim(str), " \t", true, strs); -} - -void SplitStringToVector(const std::string& full, const char* delim, - bool omit_empty_strings, - std::vector* out) { - size_t start = 0, found = 0, end = full.size(); - out->clear(); - while (found != std::string::npos) { - found = full.find_first_of(delim, start); - // start != end condition is for when the delimiter is at the end - if (!omit_empty_strings || (found != start && start != end)) - out->push_back(full.substr(start, found - start)); - start = found + 1; - } -} - -void SplitUTF8StringToChars(const std::string& str, - std::vector* chars) { - chars->clear(); - int bytes = 1; - for (size_t i = 0; i < str.length(); i += bytes) { - assert((str[i] & 0xF8) <= 0xF0); - if ((str[i] & 0x80) == 0x00) { - // The first 128 characters (US-ASCII) in UTF-8 format only need one byte. - bytes = 1; - } else if ((str[i] & 0xE0) == 0xC0) { - // The next 1,920 characters need two bytes to encode, - // which covers the remainder of almost all Latin-script alphabets. - bytes = 2; - } else if ((str[i] & 0xF0) == 0xE0) { - // Three bytes are needed for characters in the rest of - // the Basic Multilingual Plane, which contains virtually all characters - // in common use, including most Chinese, Japanese and Korean characters. - bytes = 3; - } else if ((str[i] & 0xF8) == 0xF0) { - // Four bytes are needed for characters in the other planes of Unicode, - // which include less common CJK characters, various historic scripts, - // mathematical symbols, and emoji (pictographic symbols). 
- bytes = 4; - } - chars->push_back(str.substr(i, bytes)); - } -} - -int UTF8StringLength(const std::string& str) { - int len = 0; - int bytes = 1; - for (size_t i = 0; i < str.length(); i += bytes) { - if ((str[i] & 0x80) == 0x00) { - bytes = 1; - } else if ((str[i] & 0xE0) == 0xC0) { - bytes = 2; - } else if ((str[i] & 0xF0) == 0xE0) { - bytes = 3; - } else if ((str[i] & 0xF8) == 0xF0) { - bytes = 4; - } - ++len; - } - return len; -} - -bool CheckEnglishChar(const std::string& ch) { - // all english characters should be encoded in one byte - if (ch.size() != 1) return false; - // english words may contain apostrophe, i.e., "He's" - return isalpha(ch[0]) || ch[0] == '\''; -} - -bool CheckEnglishWord(const std::string& word) { - std::vector chars; - SplitUTF8StringToChars(word, &chars); - for (size_t k = 0; k < chars.size(); k++) { - if (!CheckEnglishChar(chars[k])) { - return false; - } - } - return true; -} - -std::string JoinString(const std::string& c, - const std::vector& strs) { - std::string result; - if (strs.size() > 0) { - for (int i = 0; i < strs.size() - 1; i++) { - result += (strs[i] + c); - } - result += strs.back(); - } - return result; -} - -bool IsAlpha(const std::string& str) { - for (size_t i = 0; i < str.size(); i++) { - if (!isalpha(str[i])) { - return false; - } - } - return true; -} - -std::string ProcessBlank(const std::string& str, bool lowercase) { - std::string result; - if (!str.empty()) { - std::vector chars; - SplitUTF8StringToChars(Trim(str), &chars); - - for (std::string& ch : chars) { - if (ch != kSpaceSymbol) { - result.append(ch); - } else { - // Ignore consecutive space or located in head - if (!result.empty() && result.back() != ' ') { - result.push_back(' '); - } - } - } - // Ignore tailing space - if (!result.empty() && result.back() == ' ') { - result.pop_back(); - } - // NOTE: convert string to wstring - // see issue 745: https://github.com/wenet-e2e/wenet/issues/745 - std::locale loc(""); - std::wstring_convert, wchar_t> converter; - std::wstring wsresult = converter.from_bytes(result); - for (auto& c : wsresult) { - c = lowercase ? tolower(c, loc) : toupper(c, loc); - } - result = converter.to_bytes(wsresult); - } - return result; -} - -std::string Ltrim(const std::string& str) { - size_t start = str.find_first_not_of(WHITESPACE); - return (start == std::string::npos) ? "" : str.substr(start); -} - -std::string Rtrim(const std::string& str) { - size_t end = str.find_last_not_of(WHITESPACE); - return (end == std::string::npos) ? 
"" : str.substr(0, end + 1); -} - -std::string Trim(const std::string& str) { return Rtrim(Ltrim(str)); } - -std::string JoinPath(const std::string& left, const std::string& right) { - std::string path(left); - if (path.size() && path.back() != '/') { - path.push_back('/'); - } - path.append(right); - return path; -} - -#ifdef _MSC_VER -std::wstring ToWString(const std::string& str) { - unsigned len = str.size() * 2; - setlocale(LC_CTYPE, ""); - wchar_t* p = new wchar_t[len]; - mbstowcs(p, str.c_str(), len); - std::wstring wstr(p); - delete[] p; - return wstr; -} -#endif - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/string.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/string.h deleted file mode 100644 index bf7a52ae09bce45ab7e34a5277652d7ae91bae1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/string.h +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_STRING_H_ -#define UTILS_STRING_H_ - -#include -#include -#include -#include -#include - -#include "fst/symbol-table.h" - -namespace wenet { - -const char WHITESPACE[] = " \n\r\t\f\v"; - -// Split the string with space or tab. -void SplitString(const std::string& str, std::vector* strs); - -void SplitStringToVector(const std::string& full, const char* delim, - bool omit_empty_strings, - std::vector* out); - -// NOTE(Xingchen Song): we add this function to make it possible to -// support multilingual recipe in the future, in which characters of -// different languages are all encoded in UTF-8 format. -// UTF-8 REF: https://en.wikipedia.org/wiki/UTF-8#Encoding -// Split the UTF-8 string into chars. -void SplitUTF8StringToChars(const std::string& str, - std::vector* chars); - -int UTF8StringLength(const std::string& str); - -// Check whether the UTF-8 char is alphabet or '. -bool CheckEnglishChar(const std::string& ch); - -// Check whether the UTF-8 word is only contains alphabet or '. -bool CheckEnglishWord(const std::string& word); - -std::string JoinString(const std::string& c, - const std::vector& strs); - -bool IsAlpha(const std::string& str); - -// Split the UTF-8 string into words by symbol table. -// Return whether not contains oov. -bool SplitUTF8StringToWords( - const std::string& str, - const std::shared_ptr& symbol_table, - std::vector* words); - -// Replace ▁ with space, then remove head, tail and consecutive space. 
-std::string ProcessBlank(const std::string& str, bool lowercase); - -std::string Ltrim(const std::string& str); - -std::string Rtrim(const std::string& str); - -std::string Trim(const std::string& str); - -std::string JoinPath(const std::string& left, const std::string& right); - -#ifdef _MSC_VER -std::wstring ToWString(const std::string& str); -#endif - -} // namespace wenet - -#endif // UTILS_STRING_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/thread_pool.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/thread_pool.h deleted file mode 100644 index a78162995d90bf079ad091cf14cb9f2cd4476d05..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/thread_pool.h +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright (c) 2012 Jakob Progsch, Václav Zeman - -// This software is provided 'as-is', without any express or implied -// warranty. In no event will the authors be held liable for any damages -// arising from the use of this software. - -// Permission is granted to anyone to use this software for any purpose, -// including commercial applications, and to alter it and redistribute it -// freely, subject to the following restrictions: - -// 1. The origin of this software must not be misrepresented; you must not -// claim that you wrote the original software. If you use this software -// in a product, an acknowledgment in the product documentation would be -// appreciated but is not required. - -// 2. Altered source versions must be plainly marked as such, and must not be -// misrepresented as being the original software. - -// 3. This notice may not be removed or altered from any source -// distribution. - -#ifndef UTILS_THREAD_POOL_H_ -#define UTILS_THREAD_POOL_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -class ThreadPool { - public: - explicit ThreadPool(size_t); - template - auto enqueue(F&& f, Args&&... args) - -> std::future::type>; - ~ThreadPool(); - - private: - // need to keep track of threads so we can join them - std::vector workers; - // the task queue - std::queue > tasks; - - // synchronization - std::mutex queue_mutex; - std::condition_variable condition; - bool stop; -}; - -// the constructor just launches some amount of workers -inline ThreadPool::ThreadPool(size_t threads) : stop(false) { - for (size_t i = 0; i < threads; ++i) - workers.emplace_back([this] { - for (;;) { - std::function task; - - { - std::unique_lock lock(this->queue_mutex); - this->condition.wait( - lock, [this] { return this->stop || !this->tasks.empty(); }); - if (this->stop && this->tasks.empty()) return; - task = std::move(this->tasks.front()); - this->tasks.pop(); - } - - task(); - } - }); -} - -// add new work item to the pool -template -auto ThreadPool::enqueue(F&& f, Args&&... 
args) - -> std::future::type> { - using return_type = typename std::result_of::type; - - auto task = std::make_shared >( - std::bind(std::forward(f), std::forward(args)...)); - - std::future res = task->get_future(); - { - std::unique_lock lock(queue_mutex); - - // don't allow enqueueing after stopping the pool - if (stop) { - throw std::runtime_error("enqueue on stopped ThreadPool"); - } - - tasks.emplace([task]() { (*task)(); }); - } - condition.notify_one(); - return res; -} - -// the destructor joins all threads -inline ThreadPool::~ThreadPool() { - { - std::unique_lock lock(queue_mutex); - stop = true; - } - condition.notify_all(); - for (std::thread& worker : workers) { - worker.join(); - } -} - -#endif // UTILS_THREAD_POOL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/timer.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/timer.h deleted file mode 100644 index 068519f98d140ba0eef68babe2ad2fdcb798c074..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/timer.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_TIMER_H_ -#define UTILS_TIMER_H_ - -#include - -namespace wenet { - -class Timer { - public: - Timer() : time_start_(std::chrono::steady_clock::now()) {} - void Reset() { time_start_ = std::chrono::steady_clock::now(); } - // return int in milliseconds - int Elapsed() const { - auto time_now = std::chrono::steady_clock::now(); - return std::chrono::duration_cast(time_now - - time_start_) - .count(); - } - - private: - std::chrono::time_point time_start_; -}; -} // namespace wenet - -#endif // UTILS_TIMER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/utils.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/utils.cc deleted file mode 100644 index c37e36c6e9f629e0a4b11cf21a791aefd58b659f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/utils.cc +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
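The vendored thread_pool.h (the Progsch/Zeman pool) hands work to its workers by wrapping each submitted callable in a std::packaged_task, storing a type-erased std::function<void()> in the queue, and returning the task's std::future to the caller. A reduced single-worker sketch of just that enqueue mechanism follows (illustrative names, no argument binding, not the vendored header):

```cpp
#include <condition_variable>
#include <cstdio>
#include <functional>
#include <future>
#include <mutex>
#include <queue>
#include <thread>

class MiniPool {
 public:
  MiniPool() : worker_([this] { Run(); }) {}
  ~MiniPool() {
    {
      std::lock_guard<std::mutex> lock(mutex_);
      stop_ = true;
    }
    cv_.notify_one();
    worker_.join();
  }

  // Wrap the callable in a packaged_task, queue a type-erased invoker,
  // and give the caller a future for the result.
  template <class F>
  auto Enqueue(F&& f) -> std::future<decltype(f())> {
    auto task = std::make_shared<std::packaged_task<decltype(f())()>>(std::forward<F>(f));
    std::future<decltype(f())> result = task->get_future();
    {
      std::lock_guard<std::mutex> lock(mutex_);
      tasks_.emplace([task] { (*task)(); });
    }
    cv_.notify_one();
    return result;
  }

 private:
  void Run() {
    for (;;) {
      std::function<void()> task;
      {
        std::unique_lock<std::mutex> lock(mutex_);
        cv_.wait(lock, [this] { return stop_ || !tasks_.empty(); });
        if (stop_ && tasks_.empty()) return;
        task = std::move(tasks_.front());
        tasks_.pop();
      }
      task();
    }
  }

  std::mutex mutex_;
  std::condition_variable cv_;
  std::queue<std::function<void()>> tasks_;
  bool stop_ = false;
  std::thread worker_;  // constructed last so the other members are ready
};

int main() {
  MiniPool pool;
  auto fut = pool.Enqueue([] { return 6 * 7; });
  printf("result: %d\n", fut.get());  // 42
}
```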
- -#include "utils/utils.h" - -#include -#include -#include -#include -#include -#include - -#include "utils/log.h" - -namespace wenet { - -float LogAdd(float x, float y) { - static float num_min = -std::numeric_limits::max(); - if (x <= num_min) return y; - if (y <= num_min) return x; - float xmax = std::max(x, y); - return std::log(std::exp(x - xmax) + std::exp(y - xmax)) + xmax; -} - -template -struct ValueComp { - bool operator()(const std::pair& lhs, - const std::pair& rhs) const { - return lhs.first > rhs.first || - (lhs.first == rhs.first && lhs.second < rhs.second); - } -}; - -// We refer the pytorch topk implementation -// https://github.com/pytorch/pytorch/blob/master/caffe2/operators/top_k.cc -template -void TopK(const std::vector& data, int32_t k, std::vector* values, - std::vector* indices) { - std::vector> heap_data; - int n = data.size(); - for (int32_t i = 0; i < k && i < n; ++i) { - heap_data.emplace_back(data[i], i); - } - std::priority_queue, std::vector>, - ValueComp> - pq(ValueComp(), std::move(heap_data)); - for (int32_t i = k; i < n; ++i) { - if (pq.top().first < data[i]) { - pq.pop(); - pq.emplace(data[i], i); - } - } - - values->resize(std::min(k, n)); - indices->resize(std::min(k, n)); - int32_t cur = values->size() - 1; - while (!pq.empty()) { - const auto& item = pq.top(); - (*values)[cur] = item.first; - (*indices)[cur] = item.second; - pq.pop(); - cur -= 1; - } -} - -template void TopK(const std::vector& data, int32_t k, - std::vector* values, - std::vector* indices); - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/utils.h deleted file mode 100644 index f9957c0b6e8ae27d9260e75cf55e786055827801..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/utils/utils.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
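LogAdd above is the standard log-sum-exp trick: log(exp(x) + exp(y)) is computed as m + log(exp(x - m) + exp(y - m)) with m = max(x, y), so the exponentials stay in [0, 1] and cannot overflow. A compilable restatement with a quick numeric check:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>

// Sum of two probabilities given in log scale, stabilized by factoring out the max.
float LogAdd(float x, float y) {
  static const float kLogZero = -std::numeric_limits<float>::max();
  if (x <= kLogZero) return y;
  if (y <= kLogZero) return x;
  float m = std::max(x, y);
  return std::log(std::exp(x - m) + std::exp(y - m)) + m;
}

int main() {
  // Adding two equal log-probabilities doubles the probability: result is log(0.5).
  printf("%f\n", LogAdd(std::log(0.25f), std::log(0.25f)));  // ~ -0.693147
  // A naive exp(1000) would overflow float; the stabilized form stays finite.
  printf("%f\n", LogAdd(1000.0f, 1000.0f));                  // 1000 + log(2)
}
```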
- -#ifndef UTILS_UTILS_H_ -#define UTILS_UTILS_H_ - -#include -#include -#include - -namespace wenet { - -#define WENET_DISALLOW_COPY_AND_ASSIGN(Type) \ - Type(const Type&) = delete; \ - Type& operator=(const Type&) = delete; - -const float kFloatMax = std::numeric_limits::max(); -// kSpaceSymbol in UTF-8 is: ▁ -const char kSpaceSymbol[] = "\xe2\x96\x81"; - -// Return the sum of two probabilities in log scale -float LogAdd(float x, float y); - -template -void TopK(const std::vector& data, int32_t k, std::vector* values, - std::vector* indices); - -} // namespace wenet - -#endif // UTILS_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/websocket/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/websocket/CMakeLists.txt deleted file mode 100644 index 67447c42d977f120fc39cdab0d052b011edd3efe..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/websocket/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -add_library(websocket STATIC - websocket_client.cc - websocket_server.cc -) -target_link_libraries(websocket PUBLIC decoder) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/websocket/websocket_client.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/websocket/websocket_client.cc deleted file mode 100644 index c0394e6250153e2d59636c9eab62badc4a737d16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/websocket/websocket_client.cc +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "websocket/websocket_client.h" - -#include "boost/json/src.hpp" - -#include "utils/log.h" - -namespace wenet { - -namespace beast = boost::beast; // from -namespace http = beast::http; // from -namespace websocket = beast::websocket; // from -namespace asio = boost::asio; // from -using tcp = boost::asio::ip::tcp; // from -namespace json = boost::json; - -WebSocketClient::WebSocketClient(const std::string& hostname, int port) - : hostname_(hostname), port_(port) { - Connect(); - t_.reset(new std::thread(&WebSocketClient::ReadLoopFunc, this)); -} - -void WebSocketClient::Connect() { - tcp::resolver resolver{ioc_}; - // Look up the domain name - auto const results = resolver.resolve(hostname_, std::to_string(port_)); - // Make the connection on the IP address we get from a lookup - auto ep = asio::connect(ws_.next_layer(), results); - // Provide the value of the Host HTTP header during the WebSocket handshake. 
- // See https://tools.ietf.org/html/rfc7230#section-5.4 - std::string host = hostname_ + ":" + std::to_string(ep.port()); - // Perform the websocket handshake - ws_.handshake(host, "/"); -} - -void WebSocketClient::SendTextData(const std::string& data) { - ws_.text(true); - ws_.write(asio::buffer(data)); -} - -void WebSocketClient::SendBinaryData(const void* data, size_t size) { - ws_.binary(true); - ws_.write(asio::buffer(data, size)); -} - -void WebSocketClient::Close() { ws_.close(websocket::close_code::normal); } - -void WebSocketClient::ReadLoopFunc() { - try { - while (true) { - beast::flat_buffer buffer; - ws_.read(buffer); - std::string message = beast::buffers_to_string(buffer.data()); - LOG(INFO) << message; - CHECK(ws_.got_text()); - json::object obj = json::parse(message).as_object(); - if (obj["status"] != "ok") { - break; - } - if (obj["type"] == "speech_end") { - done_ = true; - break; - } - } - } catch (beast::system_error const& se) { - // This indicates that the session was closed - if (se.code() != websocket::error::closed) { - LOG(ERROR) << se.code().message(); - } - } catch (std::exception const& e) { - LOG(ERROR) << e.what(); - } -} - -void WebSocketClient::Join() { t_->join(); } - -void WebSocketClient::SendStartSignal() { - // TODO(Binbin Zhang): Add sample rate and other setting support - json::value start_tag = {{"signal", "start"}, - {"nbest", nbest_}, - {"continuous_decoding", continuous_decoding_}}; - std::string start_message = json::serialize(start_tag); - this->SendTextData(start_message); -} - -void WebSocketClient::SendEndSignal() { - json::value end_tag = {{"signal", "end"}}; - std::string end_message = json::serialize(end_tag); - this->SendTextData(end_message); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/websocket/websocket_client.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/websocket/websocket_client.h deleted file mode 100644 index 76ec3aa451d31c7ee6b158ce21c8acdc10575eb3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/websocket/websocket_client.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef WEBSOCKET_WEBSOCKET_CLIENT_H_ -#define WEBSOCKET_WEBSOCKET_CLIENT_H_ - -#include -#include -#include -#include - -#include "boost/asio/connect.hpp" -#include "boost/asio/ip/tcp.hpp" -#include "boost/beast/core.hpp" -#include "boost/beast/websocket.hpp" - -#include "utils/utils.h" - -namespace wenet { - -namespace beast = boost::beast; // from -namespace http = beast::http; // from -namespace websocket = beast::websocket; // from -namespace asio = boost::asio; // from -using tcp = boost::asio::ip::tcp; // from - -class WebSocketClient { - public: - WebSocketClient(const std::string& host, int port); - - void SendTextData(const std::string& data); - void SendBinaryData(const void* data, size_t size); - void ReadLoopFunc(); - void Close(); - void Join(); - void SendStartSignal(); - void SendEndSignal(); - void set_nbest(int nbest) { nbest_ = nbest; } - void set_continuous_decoding(bool continuous_decoding) { - continuous_decoding_ = continuous_decoding; - } - bool done() const { return done_; } - - private: - void Connect(); - std::string hostname_; - int port_; - int nbest_ = 1; - bool continuous_decoding_ = false; - bool done_ = false; - asio::io_context ioc_; - websocket::stream ws_{ioc_}; - std::unique_ptr t_{nullptr}; - - WENET_DISALLOW_COPY_AND_ASSIGN(WebSocketClient); -}; - -} // namespace wenet - -#endif // WEBSOCKET_WEBSOCKET_CLIENT_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/websocket/websocket_server.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/websocket/websocket_server.cc deleted file mode 100644 index 52ab088f46d59b9f3f1add1e34d3aceae290f5da..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/websocket/websocket_server.cc +++ /dev/null @@ -1,267 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
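The signalling between the deleted websocket client and server is carried in small JSON text messages: the client opens with a "start" signal (carrying nbest and continuous_decoding), streams binary PCM frames, then sends an "end" signal; the server acknowledges with server_ready, pushes partial_result/final_result messages from its decoding thread, and closes the exchange with speech_end. The sketch below is only a paper model of that message flow, with the JSON assembled by hand instead of boost::json and the decoding thread's asynchronous replies collapsed into direct responses for brevity:

```cpp
#include <cstdio>
#include <string>

// Client-side text messages, mirroring SendStartSignal / SendEndSignal.
std::string StartSignal(int nbest, bool continuous_decoding) {
  return std::string("{\"signal\":\"start\",\"nbest\":") + std::to_string(nbest) +
         ",\"continuous_decoding\":" + (continuous_decoding ? "true" : "false") + "}";
}

std::string EndSignal() { return "{\"signal\":\"end\"}"; }

// Simplified server-side reaction to each client signal.  In the real handler
// the speech_end message is emitted later, by the decode thread, after the
// final_result has been sent.
std::string HandleSignal(const std::string& signal) {
  if (signal == "start") return "{\"status\":\"ok\",\"type\":\"server_ready\"}";
  if (signal == "end") return "{\"status\":\"ok\",\"type\":\"speech_end\"}";
  return "{\"status\":\"failed\",\"message\":\"Unexpected signal type\"}";
}

int main() {
  printf("client: %s\n", StartSignal(1, false).c_str());
  printf("server: %s\n", HandleSignal("start").c_str());
  printf("client: <binary pcm frames>\n");
  printf("client: %s\n", EndSignal().c_str());
  printf("server: %s\n", HandleSignal("end").c_str());
}
```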
- -#include "websocket/websocket_server.h" - -#include -#include -#include - -#include "boost/json/src.hpp" -#include "utils/log.h" - -namespace wenet { - -namespace beast = boost::beast; // from -namespace http = beast::http; // from -namespace websocket = beast::websocket; // from -namespace asio = boost::asio; // from -using tcp = boost::asio::ip::tcp; // from -namespace json = boost::json; - -ConnectionHandler::ConnectionHandler( - tcp::socket&& socket, std::shared_ptr feature_config, - std::shared_ptr decode_config, - std::shared_ptr decode_resource) - : ws_(std::move(socket)), - feature_config_(std::move(feature_config)), - decode_config_(std::move(decode_config)), - decode_resource_(std::move(decode_resource)) {} - -void ConnectionHandler::OnSpeechStart() { - LOG(INFO) << "Received speech start signal, start reading speech"; - got_start_tag_ = true; - json::value rv = {{"status", "ok"}, {"type", "server_ready"}}; - ws_.text(true); - ws_.write(asio::buffer(json::serialize(rv))); - feature_pipeline_ = std::make_shared(*feature_config_); - decoder_ = std::make_shared(feature_pipeline_, decode_resource_, - *decode_config_); - // Start decoder thread - decode_thread_ = - std::make_shared(&ConnectionHandler::DecodeThreadFunc, this); -} - -void ConnectionHandler::OnSpeechEnd() { - LOG(INFO) << "Received speech end signal"; - if (feature_pipeline_ != nullptr) { - feature_pipeline_->set_input_finished(); - } - got_end_tag_ = true; -} - -void ConnectionHandler::OnPartialResult(const std::string& result) { - LOG(INFO) << "Partial result: " << result; - json::value rv = { - {"status", "ok"}, {"type", "partial_result"}, {"nbest", result}}; - ws_.text(true); - ws_.write(asio::buffer(json::serialize(rv))); -} - -void ConnectionHandler::OnFinalResult(const std::string& result) { - LOG(INFO) << "Final result: " << result; - json::value rv = { - {"status", "ok"}, {"type", "final_result"}, {"nbest", result}}; - ws_.text(true); - ws_.write(asio::buffer(json::serialize(rv))); -} - -void ConnectionHandler::OnFinish() { - // Send finish tag - json::value rv = {{"status", "ok"}, {"type", "speech_end"}}; - ws_.text(true); - ws_.write(asio::buffer(json::serialize(rv))); -} - -void ConnectionHandler::OnSpeechData(const beast::flat_buffer& buffer) { - // Read binary PCM data - int num_samples = buffer.size() / sizeof(int16_t); - VLOG(2) << "Received " << num_samples << " samples"; - CHECK(feature_pipeline_ != nullptr); - CHECK(decoder_ != nullptr); - const auto* pcm_data = static_cast(buffer.data().data()); - feature_pipeline_->AcceptWaveform(pcm_data, num_samples); -} - -std::string ConnectionHandler::SerializeResult(bool finish) { - json::array nbest; - for (const DecodeResult& path : decoder_->result()) { - json::object jpath({{"sentence", path.sentence}}); - if (finish) { - json::array word_pieces; - for (const WordPiece& word_piece : path.word_pieces) { - json::object jword_piece({{"word", word_piece.word}, - {"start", word_piece.start}, - {"end", word_piece.end}}); - word_pieces.emplace_back(jword_piece); - } - jpath.emplace("word_pieces", word_pieces); - } - nbest.emplace_back(jpath); - - if (nbest.size() == nbest_) { - break; - } - } - return json::serialize(nbest); -} - -void ConnectionHandler::DecodeThreadFunc() { - try { - while (true) { - DecodeState state = decoder_->Decode(); - if (state == DecodeState::kEndFeats) { - decoder_->Rescoring(); - std::string result = SerializeResult(true); - OnFinalResult(result); - OnFinish(); - stop_recognition_ = true; - break; - } else if (state == 
DecodeState::kEndpoint) { - decoder_->Rescoring(); - std::string result = SerializeResult(true); - OnFinalResult(result); - // If it's not continuous decoding, continue to do next recognition - // otherwise stop the recognition - if (continuous_decoding_) { - decoder_->ResetContinuousDecoding(); - } else { - OnFinish(); - stop_recognition_ = true; - break; - } - } else { - if (decoder_->DecodedSomething()) { - std::string result = SerializeResult(false); - OnPartialResult(result); - } - } - } - } catch (std::exception const& e) { - LOG(ERROR) << e.what(); - } -} - -void ConnectionHandler::OnError(const std::string& message) { - json::value rv = {{"status", "failed"}, {"message", message}}; - ws_.text(true); - ws_.write(asio::buffer(json::serialize(rv))); - // Close websocket - ws_.close(websocket::close_code::normal); -} - -void ConnectionHandler::OnText(const std::string& message) { - json::value v = json::parse(message); - if (v.is_object()) { - json::object obj = v.get_object(); - if (obj.find("signal") != obj.end()) { - json::string signal = obj["signal"].as_string(); - if (signal == "start") { - if (obj.find("nbest") != obj.end()) { - if (obj["nbest"].is_int64()) { - nbest_ = obj["nbest"].as_int64(); - } else { - OnError("integer is expected for nbest option"); - } - } - if (obj.find("continuous_decoding") != obj.end()) { - if (obj["continuous_decoding"].is_bool()) { - continuous_decoding_ = obj["continuous_decoding"].as_bool(); - } else { - OnError( - "boolean true or false is expected for " - "continuous_decoding option"); - } - } - OnSpeechStart(); - } else if (signal == "end") { - OnSpeechEnd(); - } else { - OnError("Unexpected signal type"); - } - } else { - OnError("Wrong message header"); - } - } else { - OnError("Wrong protocol"); - } -} - -void ConnectionHandler::operator()() { - try { - // Accept the websocket handshake - ws_.accept(); - for (;;) { - // This buffer will hold the incoming message - beast::flat_buffer buffer; - // Read a message - ws_.read(buffer); - if (ws_.got_text()) { - std::string message = beast::buffers_to_string(buffer.data()); - LOG(INFO) << message; - OnText(message); - if (got_end_tag_) { - break; - } - } else { - if (!got_start_tag_) { - OnError("Start signal is expected before binary data"); - } else { - if (stop_recognition_) { - break; - } - OnSpeechData(buffer); - } - } - } - - LOG(INFO) << "Read all pcm data, wait for decoding thread"; - if (decode_thread_ != nullptr) { - decode_thread_->join(); - } - } catch (beast::system_error const& se) { - LOG(INFO) << se.code().message(); - // This indicates that the session was closed - if (se.code() == websocket::error::closed) { - OnSpeechEnd(); - } - if (decode_thread_ != nullptr) { - decode_thread_->join(); - } - } catch (std::exception const& e) { - LOG(ERROR) << e.what(); - } -} - -void WebSocketServer::Start() { - try { - auto const address = asio::ip::make_address("0.0.0.0"); - tcp::acceptor acceptor{ioc_, {address, static_cast(port_)}}; - for (;;) { - // This will receive the new connection - tcp::socket socket{ioc_}; - // Block until we get a connection - acceptor.accept(socket); - // Launch the session, transferring ownership of the socket - ConnectionHandler handler(std::move(socket), feature_config_, - decode_config_, decode_resource_); - std::thread t(std::move(handler)); - t.detach(); - } - } catch (const std::exception& e) { - LOG(FATAL) << e.what(); - } -} - -} // namespace wenet diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/websocket/websocket_server.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/websocket/websocket_server.h deleted file mode 100644 index a1241834221dcf93c34d6414bd9b5ae40ef1cf38..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/websocket/websocket_server.h +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef WEBSOCKET_WEBSOCKET_SERVER_H_ -#define WEBSOCKET_WEBSOCKET_SERVER_H_ - -#include -#include -#include -#include -#include - -#include "boost/asio/connect.hpp" -#include "boost/asio/ip/tcp.hpp" -#include "boost/beast/core.hpp" -#include "boost/beast/websocket.hpp" - -#include "decoder/asr_decoder.h" -#include "frontend/feature_pipeline.h" -#include "utils/log.h" - -namespace wenet { - -namespace beast = boost::beast; // from -namespace http = beast::http; // from -namespace websocket = beast::websocket; // from -namespace asio = boost::asio; // from -using tcp = boost::asio::ip::tcp; // from - -class ConnectionHandler { - public: - ConnectionHandler(tcp::socket&& socket, - std::shared_ptr feature_config, - std::shared_ptr decode_config, - std::shared_ptr decode_resource_); - void operator()(); - - private: - void OnSpeechStart(); - void OnSpeechEnd(); - void OnText(const std::string& message); - void OnFinish(); - void OnSpeechData(const beast::flat_buffer& buffer); - void OnError(const std::string& message); - void OnPartialResult(const std::string& result); - void OnFinalResult(const std::string& result); - void DecodeThreadFunc(); - std::string SerializeResult(bool finish); - - bool continuous_decoding_ = false; - int nbest_ = 1; - websocket::stream ws_; - std::shared_ptr feature_config_; - std::shared_ptr decode_config_; - std::shared_ptr decode_resource_; - - bool got_start_tag_ = false; - bool got_end_tag_ = false; - // When endpoint is detected, stop recognition, and stop receiving data. 
- bool stop_recognition_ = false; - std::shared_ptr feature_pipeline_ = nullptr; - std::shared_ptr decoder_ = nullptr; - std::shared_ptr decode_thread_ = nullptr; -}; - -class WebSocketServer { - public: - WebSocketServer(int port, - std::shared_ptr feature_config, - std::shared_ptr decode_config, - std::shared_ptr decode_resource) - : port_(port), - feature_config_(std::move(feature_config)), - decode_config_(std::move(decode_config)), - decode_resource_(std::move(decode_resource)) {} - - void Start(); - - private: - int port_; - // The io_context is required for all I/O - asio::io_context ioc_{1}; - std::shared_ptr feature_config_; - std::shared_ptr decode_config_; - std::shared_ptr decode_resource_; - WENET_DISALLOW_COPY_AND_ASSIGN(WebSocketServer); -}; - -} // namespace wenet - -#endif // WEBSOCKET_WEBSOCKET_SERVER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/CMakeLists.txt deleted file mode 100644 index 380e23204b32dad2ee66999430a66450066ea6a7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -message("cmake build type is ${CMAKE_BUILD_TYPE} .") - -if(XPU) - list(APPEND xpu_conformer_srcs ./xpu_asr_model.cc) - list(APPEND xpu_conformer_srcs ./xpu_conformer.cpp) - list(APPEND xpu_conformer_srcs ./xpu_util.cpp) - message(STATUS "Use src_files: [ ${xpu_conformer_srcs} ] to compile xpu_conformer.a .") - - # compile xpu_conformer.a - add_library(xpu_conformer STATIC ${xpu_conformer_srcs}) - target_link_libraries(xpu_conformer PUBLIC xpuapi xpurt) -endif() - -set(CMAKE_VERBOSE_MAKEFILE OFF) - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fpermissive") -set(CMAKE_EXE_LINKER_FLAGS "-lpthread -lrt -lm -ldl") - -set(SRC_FILES ./conformer_test.cpp ./xpu_conformer.cpp ./xpu_util.cpp) -message(STATUS "Use src_files: [ ${SRC_FILES} ] to compile xpu_conformer_test.") - -add_executable(xpu_conformer_test ${SRC_FILES}) -target_link_libraries(xpu_conformer_test -lxpuapi -lxpurt) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/conformer_test.cpp b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/conformer_test.cpp deleted file mode 100644 index 1d9fd672a31eb5d7a787368f274df516d19dc7a8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/conformer_test.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// Copyright (c) 2022 KUNLUNXIN Inc. -// 2022 Han Qi (qihan@baidu.com) -// Hehe Pan (panhehe@baidu.com) -// Zikui Yan (yanzikui@baidu.com) -// Chaolin Li (lichaolin@baidu.com) -// All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include -#include -#include "xpu_conformer.h" // NOLINT -#include "xpu_util.h" // NOLINT -namespace api = baidu::xpu::api; -namespace wenet = xpu::wenet; - -template -static void conformer_test(const std::string& data_dir, - const std::string& params_dir, int threads_number, - int dev_id) { - typedef std::vector< - std::tuple>, - std::tuple, std::vector>>> - Dtype; - ConformerEncoderParam encoder_param; - init_encoder_params(params_dir, encoder_param); - ConformerDecoderParam decoder_param; - init_decoder_params(params_dir, decoder_param); - int real_threads_number = threads_number <= 0 ? 1 : threads_number; - std::cout << "Encoder + Decoder MultiStreamTest threads:" - << real_threads_number << std::endl; - // init test data - std::vector ids = get_all_ids(data_dir); - Dtype data_list; - for (auto index_id : ids) { - std::string input_lenghts_prefix = - data_dir + std::to_string(index_id) + "_len"; - std::string input_prefix = data_dir + std::to_string(index_id); - auto input_lenghts_cpu_info = - read_cpu_data_from_file(input_lenghts_prefix, 1); - auto input_xpu_info = read_xpu_data_from_file(input_prefix, 3); - data_list.push_back( - std::make_tuple(input_xpu_info, input_lenghts_cpu_info)); - } - bool write_res = true; - // init mem - int ret = 0; - std::vector ctx_xpu_ptrs(real_threads_number); - std::vector streams(real_threads_number); - - int nsdnn = real_threads_number > 1 ? 2 : 6; - int ncluster = real_threads_number > 1 ? 2 : 8; - for (int i = 0; i < real_threads_number; i++) { - ret = xpu_stream_create(&streams[i]); - ctx_xpu_ptrs[i] = new api::Context(api::kXPU2); - ctx_xpu_ptrs[i]->xpu_stream = streams[i]; - ctx_xpu_ptrs[i]->set_nsdnn(nsdnn); - ctx_xpu_ptrs[i]->set_ncluster(ncluster); - } - // threads - std::vector thread_times(real_threads_number); - std::vector threads; - int data_counter = 0; - std::mutex data_mutex; - std::vector time_info(real_threads_number, 0.0f); - auto f = [&](int thread_id) { - xpu_set_device(dev_id); - api::Context* ctx_xpu = ctx_xpu_ptrs[thread_id]; - api::ctx_guard RAII_GUARD(ctx_xpu); - while (true) { - int data_index = -1; - data_mutex.lock(); - if (data_counter >= data_list.size()) { - data_mutex.unlock(); - break; - } - data_index = data_counter++; - data_mutex.unlock(); - if (data_index < 0) { - continue; - } - auto start_time = std::chrono::system_clock::now(); - // get input data - auto& input_xpu_info = std::get<0>(data_list[data_index]); - auto& input_lenghts_info = std::get<1>(data_list[data_index]); - auto& input_xpu_data = std::get<0>(input_xpu_info); - auto& speech_shape = std::get<1>(input_xpu_info); - int batch = speech_shape[0]; - int max_seqlen = speech_shape[1]; - auto xpu_mask_info_float = create_mask_according_speech_length( - std::get<0>(input_lenghts_info), max_seqlen, ctx_xpu->xpu_stream); - ret = xpu_wait(ctx_xpu->xpu_stream); - CHECK_RET(ret); - int q_seqlen = ((max_seqlen - 1) / 2 - 1) / 2; - // encoder run - int att_dim = encoder_param.head_num * encoder_param.head_dim; - int ctc_dim = encoder_param.ctc_dim; - T* encoder_out = RAII_GUARD.alloc(batch * q_seqlen * att_dim); - T* ctc_probs = RAII_GUARD.alloc(batch * q_seqlen * ctc_dim); - // get encoder_out & ctc_probs - ret = wenet::conformer_encoder_wenet( - ctx_xpu, input_xpu_data, speech_shape, encoder_out, ctc_probs, - encoder_param, xpu_mask_info_float); - CHECK_RET(ret); - ret = xpu_wait(ctx_xpu->xpu_stream); - CHECK_RET(ret); - // ctc_prefix_beamsearch implement in cpu - int beam_size = encoder_param.beam_size; - int new_bs = batch * beam_size; 
- std::vector hyps_len(new_bs); - std::vector ctc_scores(new_bs); - std::vector hyps_cpu; - int* hyps = RAII_GUARD.alloc(new_bs * q_seqlen); - ret = wenet::ctc_prefix_beamsearch(ctx_xpu, ctc_probs, hyps_cpu, - hyps_len, ctc_scores, batch, - beam_size, q_seqlen, ctc_dim); - CHECK_RET(ret); - ret = xpu_wait(ctx_xpu->xpu_stream); - CHECK_RET(ret); - int max_target_len = - padding_target(hyps_cpu, hyps_len, beam_size, decoder_param.eos_id); - ret = xpu_memcpy(hyps, reinterpret_cast(&hyps_cpu[0]), - max_target_len * new_bs * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); - ret = xpu_wait(ctx_xpu->xpu_stream); - CHECK_RET(ret); - // decoder - int pad_target_len = decoder_param.add_sos_num + max_target_len; - float* character_scores = - RAII_GUARD.alloc(new_bs * pad_target_len * ctc_dim); - ret = wenet::conformer_decoder_wenet( - ctx_xpu, encoder_out, {batch, q_seqlen, att_dim}, - std::get<0>(xpu_mask_info_float), hyps, {new_bs, max_target_len}, - character_scores, decoder_param); - CHECK_RET(ret); - ret = xpu_wait(ctx_xpu->xpu_stream); - CHECK_RET(ret); - // Only use decoder score for rescoring - std::vector best_score(batch, -std::numeric_limits::max()); - std::vector best_index(batch, 0); - float ctc_weight = 0.5; - std::vector decoder_out(new_bs * pad_target_len * ctc_dim); - ret = xpu_memcpy(&decoder_out[0], character_scores, - new_bs * max_target_len * ctc_dim * sizeof(float), - XPUMemcpyKind::XPU_DEVICE_TO_HOST); - xpu_wait(ctx_xpu->xpu_stream); - CHECK_RET(ret); - // cal score && output - std::string wav_prefix = - data_dir + std::to_string(data_index) + "_wav.txt"; - std::string res_prefix = "./token_id.txt"; - std::ofstream res; - std::string wav_name; - std::vector wav_info; - if (write_res) { - std::ifstream wav(wav_prefix.c_str()); - if (!wav.is_open()) { - std::cout << "wav file open fail" << std::endl; - exit(0); - } - while (getline(wav, wav_name)) { - wav_info.push_back(wav_name); - } - wav.close(); - } - for (int i = 0; i < batch; i++) { - for (int j = 0; j < beam_size; j++) { - T score = 0.0; - for (int k = 0; k < hyps_len[i * beam_size + j]; k++) { - int index = i * beam_size * max_target_len * ctc_dim + - j * max_target_len * ctc_dim + k * ctc_dim + - hyps_cpu[k]; - score += decoder_out[index]; - } - score += decoder_out[i * beam_size * max_target_len * ctc_dim + - j * max_target_len * ctc_dim + - hyps_len[i * batch + j] * ctc_dim + ctc_dim - 1]; - // add ctc score - score += ctc_weight * ctc_scores[i * beam_size + j]; - if (score > best_score[i]) { - best_score[i] = score; - best_index[i] = j; - } - } - int token_index = best_index[i] + i * beam_size; - if (write_res) { - data_mutex.lock(); - res.open(res_prefix, std::ios::app); - if (!res.is_open()) { - std::cout << "res file open fail" << std::endl; - exit(0); - } - res << wav_info[i] << ":"; - for (int k = 0; k < hyps_len[token_index]; k++) - res << hyps_cpu[k] << " "; - res << std::endl; - res.close(); - data_mutex.unlock(); - } - } - auto end_time = std::chrono::system_clock::now(); - auto duration = std::chrono::duration_cast( - end_time - start_time); - time_info[thread_id] += static_cast(duration.count()) / 1000; - ret = xpu_free(std::get<0>(input_xpu_info)); - CHECK_RET(ret); - ret = xpu_free(std::get<0>(xpu_mask_info_float)); - CHECK_RET(ret); - } - }; - auto all_start = std::chrono::system_clock::now(); - for (auto i = 0; i < real_threads_number; i++) { - std::thread t(f, i); - threads.push_back(std::move(t)); - } - for (auto& t : threads) { - t.join(); - } - auto all_end = std::chrono::system_clock::now(); - 
auto duration = std::chrono::duration_cast( - all_end - all_start); - float total_time = static_cast(duration.count()) / 1000; - std::cout << "Total time cost:" << total_time << std::endl; - for (int i = 0; i < real_threads_number; i++) { - if (ctx_xpu_ptrs[i]) delete ctx_xpu_ptrs[i]; - } -} - -int main(int argc, char* argv[]) { - if (argc != 6) { - std::cout << "Only support the following three params:" << std::endl; - std::cout - << "\t1. " << argv[0] - << " encoder_test [params_dir] [data_dir] [dev_id] [threads_number]" - << std::endl; - std::cout - << "\t2. " << argv[0] - << " decoder_test [params_dir] [data_dir] [dev_id] [threads_number]" - << std::endl; - std::cout << "\t3. " << argv[0] - << " all [params_dir] [data_dir] [dev_id] [threads_number]" - << std::endl; - return 0; - } - std::string mode = argv[1]; - std::string params_dir = argv[2]; - std::string data_dir = argv[3]; - int dev_id = std::stoi(argv[4]); - int threads_number = std::stoi(argv[5]); - add_separator_when_necessary(params_dir); - add_separator_when_necessary(data_dir); - xpu_set_device(dev_id); - - typedef float16 T; - typedef int16_t TW; - typedef int16_t TGEMM; - - if (mode == "all") { - conformer_test(data_dir, params_dir, threads_number, dev_id); - } else { - std::cout << "Unkown test mode: " << mode << std::endl; - std::exit(1); - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/xpu_asr_model.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/xpu_asr_model.cc deleted file mode 100644 index 71b60bd156e5b1812dec903d1ba4a3d3f54625ea..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/xpu_asr_model.cc +++ /dev/null @@ -1,318 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Han Qi (qihan@baidu.com, Kunlunxin Inc) -// Hehe Pan (panhehe@baidu.com, Kunlunxin Inc) -// Zikui Yan (yanzikui@baidu.com, Kunlunxin Inc) -// All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "xpu_asr_model.h" // NOLINT - -#include -#include -#include -#include - -#include "utils/string.h" - -namespace wenet { - -void XPUAsrModel::SetEngineThreads(int num_threads) { - real_threads_number = num_threads; -} - -void XPUAsrModel::SetDeviceId(int dev_id) { device_id_ = dev_id; } - -void XPUAsrModel::Read(const std::string& model_dir) { - // init xpu runtime params - ctx_xpu_ptr = std::make_shared(api::kXPU2); - RAII_GUARD.reset(new api::ctx_guard(ctx_xpu_ptr.get())); - - // For XPU, model_dir is params_dir, which is used to store weights for every - // layer. - std::string weight_dir = model_dir + "/model_weights/"; - std::string weight_info_txt_path = weight_dir + "/weights_info.txt"; - - LOG(INFO) << "\e[1;34mXPU weight_dir is: " << weight_dir << "\e[0m\n"; - if (!std::ifstream(weight_info_txt_path.c_str()).good()) { - LOG(FATAL) << "weight_info_txt: " << weight_info_txt_path - << " NOT exist !!!\n"; - } - - // 1. 
Load weight for every layer - init_encoder_params(weight_dir, encoder_param); - init_decoder_params(weight_dir, decoder_param); - - // 2. Read metadata - // TODO(panhehe): Load following parameters from config file or - // encoder/decoder params. - subsampling_rate_ = 4; - right_context_ = 6; - sos_ = 5538; - eos_ = 5538; - is_bidirectional_decoder_ = 1; - - LOG(INFO) << "======= XPU Kunlun Model Info: ======="; - LOG(INFO) << "\tsubsampling_rate " << subsampling_rate_; - LOG(INFO) << "\tright_context " << right_context_; - LOG(INFO) << "\tsos " << sos_; - LOG(INFO) << "\teos " << eos_; - LOG(INFO) << "\tis bidirectional decoder " << is_bidirectional_decoder_; -} - -XPUAsrModel::XPUAsrModel(const XPUAsrModel& other) { - // 1. Init the model info - right_context_ = other.right_context_; - subsampling_rate_ = other.subsampling_rate_; - sos_ = other.sos_; - eos_ = other.eos_; - is_bidirectional_decoder_ = other.is_bidirectional_decoder_; - chunk_size_ = other.chunk_size_; - num_left_chunks_ = other.num_left_chunks_; - offset_ = other.offset_; - - l3ptr = other.l3ptr; - real_threads_number = other.real_threads_number; - device_id_ = other.device_id_; - ctx_xpu_ptr = other.ctx_xpu_ptr; - RAII_GUARD = other.RAII_GUARD; - encoder_param = other.encoder_param; - decoder_param = other.decoder_param; - stream = other.stream; - // other member variables may not need to copy here -} - -std::shared_ptr XPUAsrModel::Copy() const { - auto asr_model = std::make_shared(*this); - // Reset the inner states for new decoding - asr_model->Reset(); - return asr_model; -} - -void XPUAsrModel::Reset() { - offset_ = 0; - encoder_out = nullptr; - ctc_probs = nullptr; - cached_feature_.clear(); - // Reset att_cache - att_cache_.resize(0, 0.0); - cnn_cache_.resize(0, 0.0); -} - -void XPUAsrModel::ForwardEncoderFunc( - const std::vector>& chunk_feats, - std::vector>* out_prob) { - // Set Device Id - LOG(INFO) << "Now Use XPU:" << device_id_ << "!\n"; - xpu_set_device(device_id_); - - // 1. Prepare XPU required data, splice cached_feature_ and chunk_feats - // The first dimension is for batchsize, which is 1. - // chunk - - int num_frames = cached_feature_.size() + chunk_feats.size(); - const int feature_dim = chunk_feats[0].size(); - - std::vector feats_length_shape = {1}; - std::vector feats_length_data = {num_frames}; - input_lenghts_cpu_info = - std::make_tuple(feats_length_data, feats_length_shape); - - std::vector feats_data_shape = {1, num_frames, feature_dim}; - std::vector feats_data_cpu; - feats_data_cpu.reserve(1 * num_frames * feature_dim); - // convert 2d-vector to 1d-vector - for (auto& row : chunk_feats) { - auto end_iter = feats_data_cpu.end(); - feats_data_cpu.insert(end_iter, row.cbegin(), row.cend()); - } - - float* input_xpu_data = get_xpu_data("wav_test", feats_data_cpu); - input_xpu_info = std::make_tuple(input_xpu_data, feats_data_shape); - - // init L3 Memory - int ret = 0; - real_threads_number = 1; - int nsdnn = real_threads_number > 1 ? 2 : 6; - int ncluster = real_threads_number > 1 ? 
2 : 8; - for (int i = 0; i < real_threads_number; i++) { - ret = xpu_stream_create(&stream); - ctx_xpu_ptr->xpu_stream = stream; - ctx_xpu_ptr->set_nsdnn(nsdnn); - ctx_xpu_ptr->set_ncluster(ncluster); - } - - std::shared_ptr ctx_xpu = ctx_xpu_ptr; - - // get input speech info and data - batch = feats_data_shape.at(0); // batch = 1 - max_seqlen = feats_data_shape.at(1); - - xpu_mask_info_float = create_mask_according_speech_length( - feats_length_data, max_seqlen, ctx_xpu->xpu_stream); - - ret = xpu_wait(ctx_xpu->xpu_stream); - CHECK_RET(ret); - - q_seqlen = ((max_seqlen - 1) / 2 - 1) / 2; - - // Encoder run - int att_dim = encoder_param.head_num * encoder_param.head_dim; - int ctc_dim = encoder_param.ctc_dim; - - LOG(INFO) << "\t max_seqlen is " << max_seqlen << "\n"; - LOG(INFO) << "\t q_seqlen is " << q_seqlen << "\n"; - LOG(INFO) << "\t att_dim is " << att_dim << "\n"; - LOG(INFO) << "\t ctc_dim is " << ctc_dim << "\n"; - - // T is float16 - encoder_out = RAII_GUARD->alloc(batch * q_seqlen * att_dim); - ctc_probs = RAII_GUARD->alloc(batch * q_seqlen * ctc_dim); - - // 2. Encoder chunk forward, including ctc_activation - // get encoder_out & ctc_probs - ret = xpu::wenet::conformer_encoder_wenet( - ctx_xpu.get(), input_xpu_data, feats_data_shape, encoder_out, ctc_probs, - encoder_param, xpu_mask_info_float); - CHECK_RET(ret); - - // Copy to output(cpu) - int num_outputs = q_seqlen; - int output_dim = ctc_dim; - out_prob->resize(num_outputs); - - float* logp = RAII_GUARD->alloc(batch * q_seqlen * ctc_dim); - // cast T to float32 - ret = api::cast_v2(ctx_xpu.get(), ctc_probs, logp, - batch * q_seqlen * ctc_dim); - CHECK_RET(ret); - ret = xpu_wait(ctx_xpu->xpu_stream); - CHECK_RET(ret); - - // xpu_memcpy logp from device to host - for (int i = 0; i < num_outputs; i++) { - (*out_prob)[i].resize(output_dim); - ret = xpu_memcpy(reinterpret_cast((*out_prob)[i].data()), - logp + output_dim * i, output_dim * sizeof(float), - XPUMemcpyKind::XPU_DEVICE_TO_HOST); - CHECK_RET(ret); - } -} - -float XPUAsrModel::ComputeAttentionScore(const float* prob, - const std::vector& hyp, int eos, - int decode_out_len) { - float score = 0.0f; - for (size_t j = 0; j < hyp.size(); ++j) { - score += *(prob + j * decode_out_len + hyp[j]); - } - score += *(prob + hyp.size() * decode_out_len + eos); - return score; -} - -void XPUAsrModel::AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) { - CHECK(rescoring_score != nullptr); - int num_hyps = hyps.size(); - rescoring_score->resize(num_hyps, 0.0f); - - if (num_hyps == 0) { - return; - } - - if (encoder_out == nullptr) { - return; - } - - int beam_size = encoder_param.beam_size; - int new_bs = batch * beam_size; - - std::vector hyps_lens; - int max_hyps_len = 0; - for (size_t i = 0; i < num_hyps; ++i) { - int length = hyps[i].size() + 1; - max_hyps_len = std::max(length, max_hyps_len); - hyps_lens.emplace_back(static_cast(length)); - } - LOG(INFO) << "\t num_hyps is " << num_hyps << "\n"; - LOG(INFO) << "\t beam_size is " << beam_size << "\n"; - LOG(INFO) << "\t new_bs is " << new_bs << "\n"; - LOG(INFO) << "\t max_hyps_len is " << max_hyps_len << "\n"; - - // pad hyps - std::vector hyps_pad_cpu(max_hyps_len * beam_size); - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - hyps_pad_cpu.emplace_back(sos_); - size_t j = 0; - for (; j < hyp.size(); ++j) { - hyps_pad_cpu.emplace_back(hyp[j]); - } - if (j == max_hyps_len - 1) { - continue; - } - for (; j < max_hyps_len - 1; ++j) { - 
hyps_pad_cpu.emplace_back(0); - } - } - int* hyps_xpu = RAII_GUARD->alloc(new_bs * q_seqlen); - int max_target_len = max_hyps_len; - // xpu_memcpy hyps_pad_cup to device - int ret = xpu_memcpy(hyps_xpu, reinterpret_cast(hyps_pad_cpu.data()), - max_target_len * new_bs * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); - CHECK_RET(ret); - - // Decoder - int att_dim = encoder_param.head_num * encoder_param.head_dim; - int ctc_dim = encoder_param.ctc_dim; - int pad_target_len = decoder_param.add_sos_num + max_target_len; - float* character_scores = - RAII_GUARD->alloc(new_bs * pad_target_len * ctc_dim); - ret = xpu::wenet::conformer_decoder_wenet( - ctx_xpu_ptr.get(), encoder_out, {batch, q_seqlen, att_dim}, - std::get<0>(xpu_mask_info_float), hyps_xpu, {new_bs, max_target_len}, - character_scores, decoder_param); - CHECK_RET(ret); - ret = xpu_wait(ctx_xpu_ptr->xpu_stream); - CHECK_RET(ret); - - // xpu_memcpy from xpu device to host - std::vector decoder_out(new_bs * pad_target_len * ctc_dim); - ret = xpu_memcpy(&decoder_out[0], character_scores, - new_bs * max_target_len * ctc_dim * sizeof(float), - XPUMemcpyKind::XPU_DEVICE_TO_HOST); - CHECK_RET(ret); - ret = xpu_wait(ctx_xpu_ptr->xpu_stream); - CHECK_RET(ret); - - // cal score - float* decoder_outs_data = decoder_out.data(); - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - float score = 0.0f; - // left to right decoder score - // ctc_dim maybe equal to decode_out_len - score = ComputeAttentionScore( - decoder_outs_data + max_target_len * ctc_dim * i, hyp, eos_, ctc_dim); - // Optional: Used for right to left score - float r_score = 0.0f; - // reverse_weight is 0 ; so the codes in if-condition is be ignored. - // combined left-to-right and right-to-left score - (*rescoring_score)[i] = - score * (1 - reverse_weight) + r_score * reverse_weight; - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/xpu_asr_model.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/xpu_asr_model.h deleted file mode 100644 index 500081ad9d6b3cb54c996e127117627863b9372c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/xpu_asr_model.h +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Han Qi (qihan@baidu.com, Kunlunxin Inc) -// Hehe Pan (panhehe@baidu.com, Kunlunxin Inc) -// Zikui Yan (yanzikui@baidu.com, Kunlunxin Inc) -// All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef RUNTIME_KUNLUN_XPU_XPU_ASR_MODEL_H_ -#define RUNTIME_KUNLUN_XPU_XPU_ASR_MODEL_H_ - -#include -#include -#include -#include - -#include "decoder/asr_model.h" -#include "utils/log.h" -#include "utils/utils.h" - -#include "xpu_conformer.h" // NOLINT - -namespace wenet { - -class XPUAsrModel : public AsrModel { - typedef float16 T; - typedef int16_t TW; - - public: - // Note: Do not call the InitEngineThreads function more than once. - void SetEngineThreads(int num_threads = 1); - - public: - XPUAsrModel() = default; - XPUAsrModel(const XPUAsrModel& other); - void SetDeviceId(int dev_id); - void Read(const std::string& model_dir); - void Reset() override; - void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) override; - std::shared_ptr Copy() const override; - - protected: - void ForwardEncoderFunc(const std::vector>& chunk_feats, - std::vector>* ctc_prob) override; - - float ComputeAttentionScore(const float* prob, const std::vector& hyp, - int eos, int decode_out_len); - - private: - int encoder_output_size_ = 0; - int num_blocks_ = 0; - int cnn_module_kernel_ = 0; - int head_ = 0; - - // XPU device id - int device_id_ = 0; - int real_threads_number = 1; - - // XPU Conformer EncoderParam and DecoderParam - ConformerEncoderParam encoder_param; - ConformerDecoderParam decoder_param; - - // XPU input and weights params - using INPUT_LENGTH_CPU_TUPLE = std::tuple, std::vector>; - using INPUT_XPU_INFO_TUPLE = std::tuple>; - INPUT_LENGTH_CPU_TUPLE input_lenghts_cpu_info; - INPUT_XPU_INFO_TUPLE input_xpu_info; - INPUT_XPU_INFO_TUPLE xpu_mask_info_float; - - // XPU encoder and decoder outputs - T* encoder_out = nullptr; - T* ctc_probs = nullptr; - - // XPU runtime params - void* l3ptr = nullptr; - XPUStream stream; - std::shared_ptr ctx_xpu_ptr; - std::shared_ptr RAII_GUARD; - - int batch, max_seqlen, q_seqlen; - - // caches - std::vector att_cache_; - std::vector cnn_cache_; -}; - -} // namespace wenet - -#endif // RUNTIME_KUNLUN_XPU_XPU_ASR_MODEL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/xpu_conformer.cpp b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/xpu_conformer.cpp deleted file mode 100644 index f5fd562a624f04cd196a7d2084cc35f52d5a7bbb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/xpu_conformer.cpp +++ /dev/null @@ -1,971 +0,0 @@ -// Copyright (c) 2022 KUNLUNXIN Inc. -// 2022 Han Qi (qihan@baidu.com) -// Hehe Pan (panhehe@baidu.com) -// Zikui Yan (yanzikui@baidu.com) -// Chaolin Li (lichaolin@baidu.com) -// All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "xpu_conformer.h" // NOLINT -#include -#include -#include -#include - -namespace xpu { -namespace wenet { -const int X4_BEGIN = 8; -template -static int encoder_embed(api::Context* ctx_xpu, const float* x, T* y, int batch, - int max_seqlen, int seq_dim, int att_dim, - const ConformerEncoderParam& param) { - api::ctx_guard RAII_GUARD(ctx_xpu); - int ret = 0; - int h_seqlen = (max_seqlen - 1) / 2; - int q_seqlen = (h_seqlen - 1) / 2; - int out_channels = att_dim; - int h_dim = (seq_dim - 1) / 2; - int q_dim = (h_dim - 1) / 2; - - float xscale = std::sqrt(att_dim); - std::vector sizes = {std::max(batch * max_seqlen * seq_dim, - batch * out_channels * q_seqlen * q_dim), - batch * out_channels * h_seqlen * h_dim}; - std::vector ptrs; - for (auto size_ind : sizes) { - ptrs.push_back(RAII_GUARD.alloc(size_ind)); - } - - auto& emb_conv_w_list = param.emb_conv_w_list; - auto& emb_conv_maxw_list = param.emb_conv_maxw_list; - auto& emb_conv_bias_list = param.emb_conv_bias_list; - auto& emb_fc_w = param.emb_fc_w_list; - auto& emb_fc_maxw = param.emb_fc_maxw_list; - auto& emb_fc_bias = param.emb_fc_bias_list; - - ret = - api::cast_v2(ctx_xpu, x, ptrs[0], batch * max_seqlen * seq_dim); - WRAPPER_ASSERT_SUCCESS(ctx_xpu, ret); - ret = api::conv2d_fusion( - ctx_xpu, ptrs[0], emb_conv_w_list[0], ptrs[1], batch, 1, max_seqlen, - seq_dim, out_channels, {3, 3}, {2, 2}, {0, 0}, {1, 1}, 1, nullptr, - emb_conv_maxw_list[0], nullptr, true, emb_conv_bias_list[0], nullptr, - api::Activation_t::RELU, nullptr); - WRAPPER_ASSERT_SUCCESS(ctx_xpu, ret); - ret = api::conv2d_fusion( - ctx_xpu, ptrs[1], emb_conv_w_list[1], ptrs[0], batch, out_channels, - h_seqlen, h_dim, out_channels, {3, 3}, {2, 2}, {0, 0}, {1, 1}, 1, nullptr, - emb_conv_maxw_list[1], nullptr, true, emb_conv_bias_list[1], nullptr, - api::Activation_t::RELU, nullptr); - WRAPPER_ASSERT_SUCCESS(ctx_xpu, ret); - ret = api::transpose(ctx_xpu, ptrs[0], ptrs[1], - {batch, out_channels, q_seqlen, q_dim}, {0, 2, 1, 3}); - WRAPPER_ASSERT_SUCCESS(ctx_xpu, ret); - ret = api::fc_fusion( - ctx_xpu, ptrs[1], emb_fc_w[0], ptrs[0], batch * q_seqlen, att_dim, - out_channels * q_dim, false, true, nullptr, emb_fc_maxw[0], nullptr, - out_channels * q_dim, out_channels * q_dim, att_dim, 1.0f, 0.0f, - emb_fc_bias[0], api::Activation_t::LINEAR); - WRAPPER_ASSERT_SUCCESS(ctx_xpu, ret); - ret = api::scale(ctx_xpu, ptrs[0], y, batch * q_seqlen * out_channels, - false, xscale, 0); - WRAPPER_ASSERT_SUCCESS(ctx_xpu, ret); - ret = xpu_wait(ctx_xpu->xpu_stream); - WRAPPER_ASSERT_SUCCESS(ctx_xpu, ret); - return api::SUCCESS; -} - -template -static int ffn(api::Context* ctx, int batch, int q_seqlen, int hidden_dim, - bool with_endln, const T* x, T* y, int ln_begin, int fc_begin, - std::vector ln_scale_list, - std::vector ln_bias_list, - std::vector fc_w_list, - std::vector fc_maxw_list, - std::vector fc_bias_list, - std::vector mem_single, int ffn_factor) { - api::ctx_guard RAII_GUARD(ctx); - int ret = api::SUCCESS; - std::unordered_map buf_mapping = { - {"ffn_ln", mem_single[1]}, {"ffn_fc0", mem_single[X4_BEGIN]}, - {"tmp0", mem_single[X4_BEGIN + 1]}, {"tmp1", mem_single[X4_BEGIN]}, - {"ffn_fc1", mem_single[1]}, - }; - int ffn1_out_dim = hidden_dim * ffn_factor; - int ffn2_input_dim = ffn1_out_dim; - ret = api::layer_norm(ctx, x, buf_mapping["ffn_ln"], batch * q_seqlen, - hidden_dim, 1e-5, ln_scale_list[ln_begin], - ln_bias_list[ln_begin], nullptr, nullptr); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::fc_fusion( - ctx, buf_mapping["ffn_ln"], fc_w_list[fc_begin], 
buf_mapping["ffn_fc0"], - batch * q_seqlen, ffn1_out_dim, hidden_dim, false, true, nullptr, - fc_maxw_list[fc_begin], nullptr, hidden_dim, hidden_dim, ffn1_out_dim, - 1.0f, 0.0f, fc_bias_list[fc_begin], api::Activation_t::LINEAR); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::sigmoid(ctx, buf_mapping["ffn_fc0"], buf_mapping["tmp0"], - batch * q_seqlen * hidden_dim * ffn_factor); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::mul(ctx, buf_mapping["ffn_fc0"], buf_mapping["tmp0"], - buf_mapping["tmp1"], - batch * q_seqlen * hidden_dim * ffn_factor); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::fc_fusion( - ctx, buf_mapping["tmp1"], fc_w_list[fc_begin + 1], buf_mapping["ffn_fc1"], - batch * q_seqlen, hidden_dim, ffn2_input_dim, false, true, nullptr, - fc_maxw_list[fc_begin + 1], nullptr, ffn2_input_dim, ffn2_input_dim, - hidden_dim, 0.5f, 0.0f, fc_bias_list[fc_begin + 1], - api::Activation_t::LINEAR); - if (with_endln) { - ret = api::add_layer_norm_fusion( - ctx, x, buf_mapping["ffn_fc1"], y, batch * q_seqlen, hidden_dim, 1e-5, - ln_scale_list[ln_begin + 1], ln_bias_list[ln_begin + 1]); - } else { - ret = api::add(ctx, x, buf_mapping["ffn_fc1"], y, - batch * q_seqlen * hidden_dim); - } - WRAPPER_ASSERT_SUCCESS(ctx, ret); - return api::SUCCESS; -} - -template -int wenet_encoder_layer(api::Context* ctx, - api::ctx_guard& RAII_GUARD, // NOLINT - int batch, int q_seqlen, int hidden_dim, int ln_begin, - int fc_begin, int attn_pos_begin, int conv_begin, - const T* x, T* y, - ConformerEncoderParam& param, // NOLINT - std::vector& mem_single, // NOLINT - std::vector& mem_double, // NOLINT - float* mem_float, float* mask_score) { - WRAPPER_CHECK_CTX(ctx); - int max_size = ctx->max_ptr_size(); - int ret = api::SUCCESS; - std::unordered_map buf_mapping = { - {"ffn0_out", mem_single[1]}, - {"swp0", mem_single[2]}, - {"swp1", mem_single[3]}, - {"matrix_bd_pre", mem_double[0]}, - {"soft_scores", mem_double[0]}, - {"qkv", mem_single[2]}, - {"qkv_add", mem_single[1]}, - {"conv_p1", mem_single[X4_BEGIN + 2]}, - {"conv_glu0", mem_single[X4_BEGIN + 3]}, - {"conv_glu1", mem_single[X4_BEGIN + 4]}, - {"conv_d1", mem_single[X4_BEGIN + 3]}, - {"conv_p2", mem_single[X4_BEGIN + 2]}, - {"conv_after", mem_single[0]}, - }; - - auto ln_scale_list = param.ln_scale_list; - auto ln_bias_list = param.ln_bias_list; - - auto fc_w_list = param.fc_w_list; - auto fc_maxw_list = param.fc_maxw_list; - auto fc_bias_list = param.fc_bias_list; - - auto attn_pos_w_list = param.attn_pos_w_list; - auto attn_pos_maxw_list = param.attn_pos_maxw_list; - auto attn_pos_uv_bias_list = param.attn_pos_uv_bias_list; - - auto conv_w_list = param.conv_w_list; - auto conv_maxw_list = param.conv_maxw_list; - auto conv_bias_list = param.conv_bias_list; - - auto kernel_size = param.conv_param.kernel_size; - auto lorder = param.conv_param.lorder; - auto padding = param.conv_param.padding; - auto head_num = param.head_num; - auto head_dim = param.head_dim; - /* - ** feed forward macaron-style module - ** x = residual + 0.5*ff(x) - */ - ret = ffn(ctx, batch, q_seqlen, hidden_dim, false, x, - buf_mapping["ffn0_out"], ln_begin, fc_begin, - ln_scale_list, ln_bias_list, fc_w_list, fc_maxw_list, - fc_bias_list, mem_single, param.ffn_factor); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - /* - ** multi-headed self-attention module - ** qkv_list[0-4]: q,k,v,qu,qv mapping single[2-6] - ** attn_pos_uv_bias_list : float -> float16 - ** q_pos_attention : get pos_emb before cal - ** q_pos_attention : cal matrix_bd to qk_attention's mask ,when cal - *qk_attention, mask 
will be added - **/ - T* qkv_list[5] = {mem_single[6], mem_single[3], mem_single[4], mem_single[5], - mem_single[2]}; - ret = api::layer_norm(ctx, buf_mapping["ffn0_out"], buf_mapping["swp0"], - batch * q_seqlen, hidden_dim, 1e-5, - ln_scale_list[ln_begin + 1], - ln_bias_list[ln_begin + 1], nullptr, nullptr); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::fc_fusion_3c( - ctx, buf_mapping["swp0"], fc_w_list[fc_begin + 2], qkv_list[0], - qkv_list[1], qkv_list[2], batch * q_seqlen, hidden_dim * 3, hidden_dim, - false, true, nullptr, fc_maxw_list[fc_begin + 2], nullptr, hidden_dim, - hidden_dim, hidden_dim * 3, 1.0f, 0.0f, fc_bias_list[fc_begin + 2], - api::Activation_t::LINEAR); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - for (int i = 0; i < 2; i++) { - ret = api::broadcast_add( - ctx, qkv_list[0], attn_pos_uv_bias_list[attn_pos_begin * 2 + i], - qkv_list[i + 3], {batch, q_seqlen, hidden_dim}, {1, 1, hidden_dim}); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - } - int pos_emb_dim = 2 * q_seqlen - 1; - T* pos_emb_sliced = RAII_GUARD.alloc(pos_emb_dim * hidden_dim); - ret = api::slice(ctx, param.pos_emb[attn_pos_begin], pos_emb_sliced, - {5000, head_num, head_dim}, {0, 0, 0}, - {pos_emb_dim, head_num, head_dim}); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - int tmp_sliced_len = batch * head_num * q_seqlen * q_seqlen; - float* tmp_mask = RAII_GUARD.alloc(tmp_sliced_len); - ret = api::q_pos_attention( - ctx, qkv_list[4], pos_emb_sliced, buf_mapping["matrix_bd_pre"], batch, - q_seqlen, head_num, head_dim, 1.0f / std::sqrt(head_dim), nullptr, - nullptr, nullptr); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::slice(ctx, buf_mapping["matrix_bd_pre"], - reinterpret_cast(mem_float), - {batch, head_num, q_seqlen, pos_emb_dim}, {0, 0, 0, 0}, - {batch, head_num, q_seqlen, q_seqlen}); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::cast_v2(ctx, reinterpret_cast(mem_float), tmp_mask, - batch * head_num * q_seqlen * q_seqlen); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::broadcast_add(ctx, tmp_mask, mask_score, mem_float, - {batch, head_num, q_seqlen, q_seqlen}, - {batch, q_seqlen}); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - api::QKVAttnParam loop_p(batch, q_seqlen, head_num, head_dim, - {batch, head_num, q_seqlen, q_seqlen}, - api::Activation_t::LINEAR, -1, false, hidden_dim); - float* qk_maxptr = RAII_GUARD.alloc(max_size); - ret = api::qk_attention( - ctx, qkv_list[3], qkv_list[1], buf_mapping["soft_scores"], nullptr, - nullptr, qk_maxptr, loop_p, mem_float); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - float* qkv_maxptr = RAII_GUARD.alloc(max_size); - ret = api::qk_v_attention( - ctx, buf_mapping["soft_scores"], qkv_list[2], buf_mapping["qkv"], - qk_maxptr, nullptr, qkv_maxptr, loop_p); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::fc_fusion( - ctx, buf_mapping["qkv"], fc_w_list[fc_begin + 3], buf_mapping["swp1"], - batch * q_seqlen, hidden_dim, hidden_dim, false, true, qkv_maxptr, - fc_maxw_list[fc_begin + 3], nullptr, hidden_dim, hidden_dim, hidden_dim, - 1.0f, 0.0f, fc_bias_list[fc_begin + 3], api::Activation_t::LINEAR); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::add(ctx, buf_mapping["ffn0_out"], buf_mapping["swp1"], - buf_mapping["qkv_add"], batch * q_seqlen * hidden_dim); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - /* - ** Conv conv_p1-conv_d1-conv_p2 - */ - ret = api::layer_norm(ctx, buf_mapping["qkv_add"], buf_mapping["swp1"], - batch * q_seqlen, hidden_dim, 1e-5, - ln_scale_list[ln_begin + 2], - ln_bias_list[ln_begin + 2], nullptr, nullptr); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::transpose(ctx, 
buf_mapping["swp1"], buf_mapping["swp0"], - {batch, q_seqlen, hidden_dim}, {0, 2, 1}); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - int pad_seqlen = q_seqlen; - if (lorder > 0) { - ret = api::pad(ctx, buf_mapping["swp0"], buf_mapping["swp1"], - {batch, hidden_dim, q_seqlen}, {0, 0, lorder}, {0, 0, 0}, - padding); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - pad_seqlen += lorder; - } - ret = api::conv2d_fusion( - ctx, buf_mapping["swp1"], conv_w_list[conv_begin], buf_mapping["swp0"], - batch, hidden_dim, 1, pad_seqlen, hidden_dim * 2, {1, 1}, {1, 1}, - {0, 0, 0, 0}, {1, 1}, 1, nullptr, conv_maxw_list[conv_begin], nullptr, - true, conv_bias_list[conv_begin], nullptr, api::Activation_t::LINEAR, - nullptr); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::split(ctx, buf_mapping["swp0"], - {buf_mapping["conv_glu0"], buf_mapping["conv_glu1"]}, - {batch, hidden_dim * 2, pad_seqlen}, - {hidden_dim, hidden_dim}, 1); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::sigmoid(ctx, buf_mapping["conv_glu1"], buf_mapping["conv_glu1"], - batch * pad_seqlen * hidden_dim); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::mul(ctx, buf_mapping["conv_glu0"], buf_mapping["conv_glu1"], - buf_mapping["conv_p1"], batch * pad_seqlen * hidden_dim); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::conv1d_fusion( - ctx, buf_mapping["conv_p1"], conv_w_list[conv_begin + 1], - buf_mapping["conv_d1"], batch, hidden_dim, pad_seqlen, hidden_dim, - kernel_size, 1, {0}, 1, hidden_dim, nullptr, - conv_maxw_list[conv_begin + 1], nullptr, true, - conv_bias_list[conv_begin + 1], nullptr, api::Activation_t::LINEAR, - nullptr); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - - ret = api::transpose(ctx, buf_mapping["conv_d1"], buf_mapping["swp0"], - {batch, hidden_dim, q_seqlen}, {0, 2, 1}); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::layer_norm(ctx, buf_mapping["swp0"], buf_mapping["swp1"], - batch * q_seqlen, hidden_dim, 1e-5, - ln_scale_list[ln_begin + 3], - ln_bias_list[ln_begin + 3], nullptr, nullptr); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::sigmoid(ctx, buf_mapping["swp1"], buf_mapping["swp0"], - batch * q_seqlen * hidden_dim); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::mul(ctx, buf_mapping["swp0"], buf_mapping["swp1"], - buf_mapping["conv_p1"], batch * q_seqlen * hidden_dim); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::transpose(ctx, buf_mapping["conv_p1"], buf_mapping["conv_d1"], - {batch, q_seqlen, hidden_dim}, {0, 2, 1}); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::conv2d_fusion( - ctx, buf_mapping["conv_d1"], conv_w_list[conv_begin + 2], - buf_mapping["conv_p2"], batch, hidden_dim, 1, q_seqlen, hidden_dim, - {1, 1}, {1, 1}, {0, 0, 0, 0}, {1, 1}, 1, nullptr, - conv_maxw_list[conv_begin + 2], nullptr, true, - conv_bias_list[conv_begin + 2], nullptr, api::Activation_t::LINEAR, - nullptr); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::transpose(ctx, buf_mapping["conv_p2"], buf_mapping["swp0"], - {batch, hidden_dim, q_seqlen}, {0, 2, 1}); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::add(ctx, buf_mapping["swp0"], buf_mapping["qkv_add"], - buf_mapping["conv_after"], batch * q_seqlen * hidden_dim); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - /* - ** feed forward module - ** x = residual + 0.5*ff(x) - */ - ret = ffn( - ctx, batch, q_seqlen, hidden_dim, true, buf_mapping["conv_after"], y, - ln_begin + 4, fc_begin + 4, ln_scale_list, ln_bias_list, fc_w_list, - fc_maxw_list, fc_bias_list, mem_single, param.ffn_factor); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - return api::SUCCESS; -} - -template -int conformer_encoder_wenet( - 
api::Context* ctx, float* x, const std::vector& data_shape, - T* encoder_out, T* ctc_probs, - ConformerEncoderParam& param, // NOLINT - const std::tuple>& xpu_mask_info) { - // Embedding -> Encoder_layer * N -> Layernorm -> Ctc_loss - int ret = 0; - int fc_num_per_layer = param.fc_num_per_layer; - int conv_num_per_layer = param.conv_num_per_layer; - int ln_num_per_layer = param.ln_num_per_layer; - int ffn_factor = param.ffn_factor; - int head_num = param.head_num; - int head_dim = param.head_dim; - int att_dim = head_num * head_dim; - int ctc_dim = param.ctc_dim; - int batch = data_shape[0]; - int max_seqlen = data_shape[1]; - int seq_dim = data_shape[2]; - int h_seqlen = (max_seqlen - 1) / 2; - int q_seqlen = (h_seqlen - 1) / 2; - - WRAPPER_ASSERT_GT(ctx, param.layer_num, 0); - WRAPPER_ASSERT_GT(ctx, batch, 0); - WRAPPER_ASSERT_GT(ctx, head_num, 0); - WRAPPER_ASSERT_GT(ctx, ctc_dim, 0); - WRAPPER_ASSERT_GT(ctx, head_dim, 0); - // Inital GM - api::ctx_guard RAII_GUARD(ctx); - std::vector mem_double; - std::vector mem_single; - int base_len = batch * (q_seqlen + 14) * (att_dim + 14); - for (int i = 0; i < 8; i++) { - mem_single.push_back(RAII_GUARD.alloc(base_len)); - } - mem_single.push_back(RAII_GUARD.alloc(base_len * ffn_factor)); - mem_single.push_back(RAII_GUARD.alloc(base_len * ffn_factor)); - mem_single.push_back(RAII_GUARD.alloc(base_len * 4)); - mem_single.push_back(RAII_GUARD.alloc(base_len * 4)); - mem_single.push_back(RAII_GUARD.alloc(base_len * 2)); - mem_double.push_back( - RAII_GUARD.alloc(batch * head_num * q_seqlen * q_seqlen * 3)); - mem_double.push_back( - RAII_GUARD.alloc(batch * head_num * q_seqlen * q_seqlen)); - int ind_len = base_len * 6 + batch * param.head_num * q_seqlen * q_seqlen * 2; - int lens = - batch * param.head_num * q_seqlen * q_seqlen * sizeof(float) / sizeof(T); - float* mem_float = RAII_GUARD.alloc(lens); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - T* calx = mem_single[0]; - T* caly = mem_single[0]; - - // embedding + mask - float* emb = RAII_GUARD.alloc(batch * max_seqlen * seq_dim); - float* emb_nm = RAII_GUARD.alloc(batch * max_seqlen * seq_dim); - T* emb_fc = RAII_GUARD.alloc(batch * q_seqlen * att_dim); - ret = api::broadcast_sub(ctx, x, param.cmvn_mean, emb, data_shape, - {1, 1, 80}); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::broadcast_mul(ctx, emb, param.cmvn_istd, emb_nm, data_shape, - {1, 1, 80}); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = encoder_embed(ctx, emb_nm, calx, batch, max_seqlen, seq_dim, - att_dim, param); - float* mask_scores = RAII_GUARD.alloc(batch * q_seqlen); - ret = api::scale(ctx, std::get<0>(xpu_mask_info), mask_scores, - batch * q_seqlen, false, 1e4, -1); - CHECK_RET(ret); - // encoder * N - for (int i = 0; i < param.layer_num; i++) { - int ln_begin = i * ln_num_per_layer; - int fc_begin = i * fc_num_per_layer; - int attn_pos_begin = i; - int conv_begin = i * conv_num_per_layer; - ret = wenet_encoder_layer( - ctx, RAII_GUARD, batch, q_seqlen, att_dim, ln_begin, fc_begin, - attn_pos_begin, conv_begin, calx, caly, param, mem_single, mem_double, - mem_float, mask_scores); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - calx = caly; - } - // Final Layer_Norm - int ln_begin = param.layer_num * param.ln_num_per_layer; - int fc_begin = param.layer_num * param.fc_num_per_layer; - auto final_ln_scale = param.ln_scale_list[ln_begin]; - auto final_ln_bias = param.ln_bias_list[ln_begin]; - ret = api::layer_norm(ctx, caly, encoder_out, batch * q_seqlen, att_dim, 1e-5, - final_ln_scale, final_ln_bias, nullptr, nullptr); - 
WRAPPER_ASSERT_SUCCESS(ctx, ret); - // Ctc_Loss + log_sofmax - auto ctc_fc_w = param.fc_w_list[fc_begin]; - auto ctc_fc_maxw = param.fc_maxw_list[fc_begin]; - auto ctc_fc_bias = param.fc_bias_list[fc_begin]; - float* ctc_buffer = RAII_GUARD.alloc(batch * q_seqlen * ctc_dim); - ret = api::fc_fusion( - ctx, encoder_out, ctc_fc_w, ctc_buffer, batch * q_seqlen, ctc_dim, - att_dim, false, true, nullptr, ctc_fc_maxw, nullptr, att_dim, att_dim, - ctc_dim, 1.0f, 0.0f, ctc_fc_bias, api::Activation_t::LINEAR); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - float* softmax_out = RAII_GUARD.alloc(batch * q_seqlen * ctc_dim); - ret = api::softmax(ctx, ctc_buffer, softmax_out, - {batch, q_seqlen, ctc_dim}, 2); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - float* log_out = RAII_GUARD.alloc(batch * q_seqlen * ctc_dim); - ret = api::log(ctx, softmax_out, log_out, batch * q_seqlen * ctc_dim); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::cast_v2(ctx, log_out, ctc_probs, - batch * q_seqlen * ctc_dim); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - return api::SUCCESS; -} - -#define INSTANTIATION_CONSFORMER_WENET(T, TW, TGEMM) \ - template int conformer_encoder_wenet( \ - api::Context*, float*, const std::vector&, T*, T*, \ - ConformerEncoderParam&, \ - const std::tuple>&); -INSTANTIATION_CONSFORMER_WENET(float16, int16_t, int16_t); - -const float kFloatMax = std::numeric_limits::max(); -float logadd(std::vector const& x) { - float xmax = *max_element(x.begin(), x.end()); - if (xmax <= -kFloatMax) { - return -kFloatMax; - } - float sum = 0.0; - for (auto& it : x) { - sum += std::exp(it - xmax); - } - return std::log(sum) + xmax; -} - -struct PrefixScore { - float s = -kFloatMax; - float ns = -kFloatMax; - float score() const { return logadd({s, ns}); } - void check() const { - std::cout << "score " << s << std::endl; - std::cout << "nscore " << ns << std::endl; - } -}; - -struct PrefixHash { - size_t operator()(const std::vector& prefix) const { - size_t hash_code = 0; - // here we use KB&DR hash code - for (int id : prefix) { - hash_code = id + 31 * hash_code; - } - return hash_code; - } -}; - -static bool PrefixScoreCompare( - const std::pair, PrefixScore>& a, - const std::pair, PrefixScore>& b) { - return a.second.score() > b.second.score(); -} - -template -int ctc_prefix_beamsearch(api::Context* ctx, T* ctc_probs, - std::vector& hyps, // NOLINT - std::vector& hyps_len, // NOLINT - std::vector& ctc_scores, int batch, // NOLINT - int beam_size, int max_len, int ctc_dim) { - // 0. 
get topk - api::ctx_guard RAII_GUARD(ctx); - int data_len = batch * max_len * beam_size; - int* topk_index_buf = RAII_GUARD.alloc(data_len); - float* topk_score_buf = RAII_GUARD.alloc(data_len); - float* logp = RAII_GUARD.alloc(batch * max_len * ctc_dim); - int ret = - api::cast_v2(ctx, ctc_probs, logp, batch * max_len * ctc_dim); - ret = api::sorted_topk(ctx, logp, topk_score_buf, topk_index_buf, - max_len, ctc_dim, beam_size, true); - xpu_wait(ctx->xpu_stream); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - std::vector topk_index(data_len); - std::vector topk_score(data_len); - ret = xpu_memcpy(reinterpret_cast(&topk_index[0]), topk_index_buf, - data_len * sizeof(int), XPUMemcpyKind::XPU_DEVICE_TO_HOST); - CHECK_RET(ret); - ret = xpu_memcpy(reinterpret_cast(&topk_score[0]), topk_score_buf, - data_len * sizeof(float), XPUMemcpyKind::XPU_DEVICE_TO_HOST); - CHECK_RET(ret); - std::unordered_map, PrefixScore, PrefixHash> cur_hyps; - PrefixScore prefix_score; - prefix_score.s = 0.0; - prefix_score.ns = -kFloatMax; - std::vector empty; - cur_hyps[empty] = prefix_score; - for (int t = 0; t < max_len; ++t) { - int offset = beam_size * t; - std::unordered_map, PrefixScore, PrefixHash> next_hyps; - // 1. Token passing - for (int i = 0; i < beam_size; ++i) { - int id = topk_index[i + offset]; - float prob = topk_score[i + offset]; - for (const auto& it : cur_hyps) { - const std::vector& prefix = it.first; - const PrefixScore& prefix_score = it.second; - if (id == 0) { - // Case 0: *a + ε => *a - PrefixScore& next_score = next_hyps[prefix]; - next_score.s = logadd( - {next_score.s, prefix_score.s + prob, prefix_score.ns + prob}); - // Prefix not changed, copy the context from prefix. - next_hyps[prefix] = next_score; - } else if (!prefix.empty() && id == prefix.back()) { - // Case 1: *a + a => *a - PrefixScore& next_score = next_hyps[prefix]; - next_score.ns = logadd({next_score.ns, prefix_score.ns + prob}); - next_hyps[prefix] = next_score; - // Case 2: *aε + a => *aa - std::vector new_prefix(prefix); - new_prefix.emplace_back(id); - PrefixScore& next_score1 = next_hyps[new_prefix]; - next_score1.ns = logadd({next_score1.ns, prefix_score.s + prob}); - next_hyps[new_prefix] = next_score1; - } else { - // Case 3: *a + b => *ab, *aε + b => *ab - std::vector new_prefix(prefix); - new_prefix.emplace_back(id); - PrefixScore& next_score = next_hyps[new_prefix]; - next_score.ns = logadd( - {next_score.ns, prefix_score.s + prob, prefix_score.ns + prob}); - next_hyps[new_prefix] = next_score; - } - } - } - // 2. Second beam prune, only keep top n best paths - std::vector, PrefixScore>> arr(next_hyps.begin(), - next_hyps.end()); - std::nth_element(arr.begin(), arr.begin() + beam_size, arr.end(), - PrefixScoreCompare); - arr.resize(beam_size); - std::sort(arr.begin(), arr.end(), PrefixScoreCompare); - // 3. 
Update cur_hyps and get new result - cur_hyps.clear(); - for (int k = 0; k < beam_size; k++) { - cur_hyps[arr[k].first] = arr[k].second; - } - } - std::vector, PrefixScore>> arr(cur_hyps.begin(), - cur_hyps.end()); - std::sort(arr.begin(), arr.end(), PrefixScoreCompare); - int beam = 0; - for (auto it : arr) { - auto vec = it.first; - hyps_len[beam] = vec.size(); - ctc_scores[beam] = it.second.score(); - hyps.insert(hyps.end(), vec.begin(), vec.end()); - beam++; - } - return api::SUCCESS; -} - -template int ctc_prefix_beamsearch( - api::Context* ctx, float16* logp, - std::vector& hyps, // NOLINT - std::vector& hyps_len, // NOLINT - std::vector& ctc_scores, // NOLINT - int batch, int beam_size, int max_len, int ctc_dim); - -static int clip_cpu(int x, int min, int max) { - if (x <= min) return min; - if (x >= max) return max; - return x; -} - -static int add_sos_and_pad_ignored_id( - api::Context* ctx, const int* target, - std::vector& pad_target, // NOLINT - std::vector& pad_target_lod, // NOLINT - int batch_size, int target_seq_len, int max_target_seq_len, int eos_id, - int ignored_id, int add_sos_num, int vocab_size) { - int ret = -1; - int target_data_len = batch_size * target_seq_len; - std::vector target_cpu(target_data_len); - ret = xpu_wait(ctx->xpu_stream); - ret = xpu_memcpy(reinterpret_cast(target_cpu.data()), target, - target_data_len * sizeof(int), - XPUMemcpyKind::XPU_DEVICE_TO_HOST); - for (int i = 0; i < batch_size; i++) { - int valid_target_len = add_sos_num; - for (int j = 0; j < target_seq_len; j++) { - if (target_cpu[i * target_seq_len + j] == eos_id) { - pad_target[i * max_target_seq_len + j + add_sos_num] = ignored_id; - } else { - pad_target[i * max_target_seq_len + j + add_sos_num] = - clip_cpu(target_cpu[i * target_seq_len + j], 0, vocab_size); - valid_target_len++; - } - } - pad_target_lod[i + 1] = pad_target_lod[i] + valid_target_len; - } - return api::SUCCESS; -} - -template -int conformer_decoder_wenet(api::Context* ctx, const T* x, - const std::vector& x_shape, - const float* x_mask, const int* padded_target, - const std::vector& target_shape, - float* character_scores, - const ConformerDecoderParam& param) { - int layer_num = param.layer_num; - int batch_size = x_shape[0]; - int beam_size = param.beam_size; - int head_num = param.head_num; - int head_dim = param.head_dim; - int vocab_size = param.vocab_size; - int dim = head_num * head_dim; - int add_sos_num = param.add_sos_num; - int new_bs = batch_size * beam_size; - int sos_id = param.sos_id; - int eos_id = param.eos_id; - int ignored_id = param.ignored_id; - WRAPPER_CHECK_CTX(ctx); - WRAPPER_ASSERT_GT(ctx, layer_num, 0); - WRAPPER_ASSERT_GT(ctx, batch_size, 0); - WRAPPER_ASSERT_GT(ctx, head_num, 0); - WRAPPER_ASSERT_GT(ctx, vocab_size, 0); - WRAPPER_ASSERT_GT(ctx, dim, 0); - - api::ctx_guard RAII_GUARD(ctx); - const int max_seq_len = x_shape[1]; - WRAPPER_ASSERT_GT(ctx, max_seq_len, 0); - const int ffn1_out_dim = param.ffn_dim; - // if ffn_act is glu - const int ffn2_input_dim = ffn1_out_dim; - const int d_k = dim / head_num; - WRAPPER_ASSERT_GT(ctx, d_k, 0); - int target_seq_len = target_shape[1]; - WRAPPER_ASSERT_GT(ctx, target_seq_len, 1); - int max_target_seq_len = target_seq_len + add_sos_num; // add sos - WRAPPER_ASSERT_GT(ctx, max_seq_len, max_target_seq_len); - - int seqlen_sum = new_bs * max_seq_len; - T* new_x = const_cast(x); - int ret = -1; - // get src_attn vsl input - std::vector cpu_mask_data(new_bs * max_seq_len, 0); - std::vector src_lod_vec(new_bs + 1, 0); - ret = 
xpu_wait(ctx->xpu_stream); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = xpu_memcpy(reinterpret_cast(&cpu_mask_data.front()), x_mask, - new_bs * max_seq_len * sizeof(float), - XPUMemcpyKind::XPU_DEVICE_TO_HOST); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - for (int b = 1; b < src_lod_vec.size(); b++) { - int curr_seqlen = 0; - for (int idx = 0; idx < max_seq_len; idx++) { - if (static_cast(cpu_mask_data[idx]) == 1) { - curr_seqlen++; - } - } - src_lod_vec[b] = src_lod_vec[b - 1] + curr_seqlen; - } - api::VectorParam src_qk_lods = { - src_lod_vec.data(), static_cast(src_lod_vec.size()), nullptr}; - src_qk_lods = src_qk_lods.to_xpu(RAII_GUARD); - seqlen_sum = src_qk_lods.cpu[new_bs]; - - T* broadcast_x = RAII_GUARD.alloc(new_bs * max_seq_len * dim); - ret = api::broadcast(ctx, x, broadcast_x, {batch_size, max_seq_len, dim}, - {new_bs, max_seq_len, dim}); - - WRAPPER_ASSERT_SUCCESS(ctx, ret); - // add sos_id and pad ignored_id - std::vector real_target_cpu(max_target_seq_len * new_bs, sos_id); - std::vector real_target_lod(new_bs + 1, 0); - - ret = add_sos_and_pad_ignored_id(ctx, padded_target, real_target_cpu, - real_target_lod, batch_size * beam_size, - target_seq_len, max_target_seq_len, eos_id, - ignored_id, add_sos_num, vocab_size); - - // get self/src QKVParam - int target_seq_sum = real_target_lod[new_bs]; - api::VectorParam self_qk_lods = { - real_target_lod.data(), static_cast(real_target_lod.size()), - nullptr}; - self_qk_lods = self_qk_lods.to_xpu(RAII_GUARD); - api::QKVAttnParam self_qkv_param(self_qk_lods, head_num, d_k, - api::Activation_t::LINEAR); - api::ConformerQKVParam src_qkv_param(self_qk_lods, src_qk_lods, head_num, d_k, - false, -1); - - seqlen_sum = seqlen_sum > target_seq_sum ? seqlen_sum : target_seq_sum; - std::vector buf_sizes = { - new_bs * max_target_seq_len * - static_cast(sizeof(int) / sizeof(T)), // padded_target - new_bs * max_target_seq_len * dim, // embedding_out - new_bs * max_target_seq_len * dim, // mid_a - new_bs * max_target_seq_len * dim, // mid_b - new_bs * max_target_seq_len * - dim, // attention_out, src_attention qk_v的结果 - new_bs * max_target_seq_len * dim, // residual - // ffn buffer - new_bs * max_target_seq_len * ffn1_out_dim, // ffn1_out - new_bs * max_target_seq_len * ffn2_input_dim, // ffn_glu_out - new_bs * max_target_seq_len * ffn2_input_dim, // ffn_glu_a - new_bs * max_target_seq_len * ffn2_input_dim, // ffn_glu_b - new_bs * max_target_seq_len * ffn2_input_dim, // ffn_glu_sigmoid - // feature buffer - new_bs * max_target_seq_len * dim * 3, // feature_in buffer - new_bs * max_target_seq_len * dim * 2, // feature_out buffer - new_bs * max_target_seq_len * 2, // final_out - seqlen_sum * dim, // q - seqlen_sum * dim, // k - seqlen_sum * dim, // v - new_bs * max_seq_len * dim, // src_x - // attention buffer - new_bs * max_seq_len * max_seq_len * dim, // src_qk - }; - std::vector buffer_ptrs(buf_sizes.size()); - for (int i = 0; i < buf_sizes.size(); i++) { - buffer_ptrs[i] = RAII_GUARD.alloc(buf_sizes[i]); - } - int b_id = 0; - std::unordered_map buffer_map = { - {"padded_target", buffer_ptrs[b_id++]}, - {"embedding_out", buffer_ptrs[b_id++]}, - {"mid_a", buffer_ptrs[b_id++]}, - {"mid_b", buffer_ptrs[b_id++]}, - {"attention_out", buffer_ptrs[b_id++]}, - {"residual", buffer_ptrs[b_id++]}, - {"ffn1_out", buffer_ptrs[b_id++]}, - {"ffn_glu_out", buffer_ptrs[b_id++]}, - {"ffn_glu_a", buffer_ptrs[b_id++]}, - {"ffn_glu_b", buffer_ptrs[b_id++]}, - {"ffn_glu_sigmoid", buffer_ptrs[b_id++]}, - {"feature_in", buffer_ptrs[b_id++]}, - {"feature_out", 
buffer_ptrs[b_id++]}, - {"final_out", buffer_ptrs[b_id++]}, - {"q", buffer_ptrs[b_id++]}, - {"k", buffer_ptrs[b_id++]}, - {"v", buffer_ptrs[b_id++]}, - {"src_x", buffer_ptrs[b_id++]}, - {"src_qk", buffer_ptrs[b_id++]}, - }; - // maxptr buffer - int max_size = ctx->max_ptr_size(); - float* max_buffer = RAII_GUARD.alloc(6 * max_size); - float* max_x = max_buffer; - float* max_q = max_buffer + max_size; - float* max_k = max_buffer + 2 * max_size; - float* max_v = max_buffer + 3 * max_size; - float* max_qk = max_buffer + 4 * max_size; - float* max_qkv = max_buffer + 5 * max_size; - // copy pad_sos target to xpu - int* new_paded_target = reinterpret_cast(buffer_map["padded_target"]); - ret = api::do_host2device(ctx, real_target_cpu.data(), new_paded_target, - max_target_seq_len * new_bs * sizeof(int)); - T* embedding_out = buffer_map["embedding_out"]; - T* attention_out = buffer_map["attention_out"]; - T* mid_a = buffer_map["mid_a"]; - T* mid_b = buffer_map["mid_b"]; - T* q = buffer_map["q"]; - T* k = buffer_map["k"]; - T* v = buffer_map["v"]; - T* src_qk = buffer_map["src_qk"]; - T* residual = buffer_map["residual"]; - T* ffn1_out = buffer_map["ffn1_out"]; - T* ffn_glu_a = buffer_map["ffn_glu_a"]; - T* ffn_glu_b = buffer_map["ffn_glu_b"]; - T* ffn_glu_sigmoid = buffer_map["ffn_glu_sigmoid"]; - T* ffn_glu_out = buffer_map["ffn_glu_out"]; - // 1.1 embedding input: target{3,14} out:{3,14,512} - ret = - api::embedding(ctx, param.embed_table, new_paded_target, residual, - vocab_size, dim, new_bs * max_target_seq_len, -1); - float logit_scale = 1.0f; - ret = - api::scale(ctx, residual, embedding_out, - new_bs * max_target_seq_len * dim, true, logit_scale, 0.0f); - // 1.2 pos_embed, pos=[1, 5000, dim] - ret = api::broadcast_add(ctx, embedding_out, param.pe, residual, - {new_bs, max_target_seq_len, dim}, - {1, max_target_seq_len, dim}); - // 2. 
decoder - auto fc_weight_itr = param.fc_w_list.begin(); - auto fc_bias_itr = param.fc_bias_list.begin(); - auto fc_w_maxptr_itr = param.fc_maxw_list.begin(); - auto ln_scale_itr = param.ln_scale_list.begin(); - auto ln_bias_itr = param.ln_bias_list.begin(); - const float eps = 1e-5f; - - std::vector mask_cpu(max_target_seq_len * max_target_seq_len, 0.0); - const float kFloatMax = std::numeric_limits::max(); - for (int j = 0; j < max_target_seq_len; j++) { - for (int k = j + 1; k < max_target_seq_len; k++) - mask_cpu[j * max_target_seq_len + k] = -kFloatMax; - } - float* mask_xpu; - mask_xpu = reinterpret_cast( - RAII_GUARD.alloc(max_target_seq_len * max_target_seq_len)); - float* tg_mask; - tg_mask = reinterpret_cast(RAII_GUARD.alloc( - new_bs * head_num * max_target_seq_len * max_target_seq_len)); - ret = xpu_memcpy(mask_xpu, reinterpret_cast(&mask_cpu[0]), - max_target_seq_len * max_target_seq_len * sizeof(float), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); - ret = api::broadcast( - ctx, mask_xpu, tg_mask, {1, 1, max_target_seq_len, max_target_seq_len}, - {new_bs, head_num, max_target_seq_len, max_target_seq_len}); - for (int j = 0; j < layer_num; j++) { - // 2.1 self attention - ret = api::layer_norm(ctx, residual, mid_b, new_bs * max_target_seq_len, - dim, eps, *ln_scale_itr++, *ln_bias_itr++, nullptr, - nullptr); - ret = api::fc_fusion_3c( - ctx, mid_b, *fc_weight_itr++, q, k, v, target_seq_sum, dim * 3, dim, - false, true, nullptr, *fc_w_maxptr_itr++, max_q, dim, dim, dim * 3, - 1.0f, 0.0f, *fc_bias_itr++, api::Activation_t::LINEAR); - - api::QKVAttnParam loop_p( - new_bs, max_target_seq_len, head_num, d_k, - {new_bs, head_num, max_target_seq_len, max_target_seq_len}, - api::Activation_t::LINEAR, -1, false, dim); - - ret = api::qk_attention(ctx, q, k, src_qk, nullptr, nullptr, - max_qk, loop_p, tg_mask); - ret = api::qk_v_attention(ctx, src_qk, v, mid_a, max_qk, - nullptr, max_qkv, loop_p); - // x + residual fused with fc - ret = api::fc_fusion( - ctx, mid_a, *fc_weight_itr++, residual, new_bs * max_target_seq_len, - dim, dim, false, true, nullptr, *fc_w_maxptr_itr++, nullptr, dim, dim, - dim, 1.0f, 1.0f, *fc_bias_itr++, api::Activation_t::LINEAR); - // 2.2 src attention - ret = api::layer_norm(ctx, residual, mid_a, new_bs * max_target_seq_len, - dim, eps, *ln_scale_itr++, *ln_bias_itr++, nullptr, - nullptr); - ret = api::fc_fusion( - ctx, mid_a, *fc_weight_itr++, mid_b, new_bs * max_target_seq_len, dim, - dim, false, true, nullptr, *fc_w_maxptr_itr++, max_q, dim, dim, dim, - 1.0f, 0.0f, *fc_bias_itr++, api::Activation_t::LINEAR); - // get k,v use encoder_out - ret = api::fc_fusion( - ctx, broadcast_x, *fc_weight_itr++, k, new_bs * max_seq_len, dim, dim, - false, true, nullptr, *fc_w_maxptr_itr++, nullptr, dim, dim, dim, 1.0f, - 0.0f, *fc_bias_itr++, api::Activation_t::LINEAR); - ret = api::fc_fusion( - ctx, broadcast_x, *fc_weight_itr++, v, new_bs * max_seq_len, dim, dim, - false, true, nullptr, *fc_w_maxptr_itr++, nullptr, dim, dim, dim, 1.0f, - 0.0f, *fc_bias_itr++, api::Activation_t::LINEAR); - ret = api::qk_attention(ctx, mid_b, k, src_qk, nullptr, - nullptr, max_qk, src_qkv_param); - - ret = api::qk_v_attention(ctx, src_qk, v, mid_a, max_qk, - nullptr, max_qkv, src_qkv_param); - // x = x + residual fused with fc - ret = api::fc_fusion( - ctx, mid_a, *fc_weight_itr++, residual, new_bs * max_target_seq_len, - dim, dim, false, true, max_qkv, *fc_w_maxptr_itr++, nullptr, dim, dim, - dim, 1.0f, 1.0f, *fc_bias_itr++, api::Activation_t::LINEAR); - // normalize before - ret = 
api::layer_norm(ctx, residual, mid_a, new_bs * max_target_seq_len, - dim, eps, *ln_scale_itr++, *ln_bias_itr++, nullptr, - nullptr); - // ffn1 - ret = api::fc_fusion( - ctx, mid_a, *fc_weight_itr++, ffn1_out, new_bs * max_target_seq_len, - ffn1_out_dim, dim, false, true, nullptr, *fc_w_maxptr_itr++, nullptr, - dim, dim, ffn1_out_dim, 1.0, 0.0, *fc_bias_itr++, - api::Activation_t::RELU); - // ffn2 - ret = api::fc_fusion( - ctx, ffn1_out, *fc_weight_itr++, residual, new_bs * max_target_seq_len, - dim, ffn2_input_dim, false, true, nullptr, *fc_w_maxptr_itr++, nullptr, - ffn2_input_dim, ffn2_input_dim, dim, 1.0, 1.0, *fc_bias_itr++, - api::Activation_t::LINEAR); - } - - ret = - api::layer_norm(ctx, residual, mid_a, new_bs * max_target_seq_len, dim, - 1e-5, *ln_scale_itr++, *ln_bias_itr++, nullptr, nullptr); - int ctc_dim = param.vocab_size; - ret = api::fc_fusion( - ctx, mid_a, *fc_weight_itr++, mid_b, new_bs * max_target_seq_len, ctc_dim, - dim, false, true, nullptr, *fc_w_maxptr_itr++, nullptr, dim, dim, ctc_dim, - 1.0f, 0.0f, *fc_bias_itr++, api::Activation_t::LINEAR); - // log_softmax - int data_len = new_bs * max_target_seq_len * ctc_dim; - float* softmax_in = RAII_GUARD.alloc(data_len); - float* softmax_out = RAII_GUARD.alloc(data_len); - float* log_out = RAII_GUARD.alloc(data_len); - ret = api::cast_v2(ctx, mid_b, softmax_in, data_len); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::softmax(ctx, softmax_in, softmax_out, - {new_bs, max_target_seq_len, ctc_dim}, 2); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - ret = api::log(ctx, softmax_out, character_scores, data_len); - WRAPPER_ASSERT_SUCCESS(ctx, ret); - - return api::SUCCESS; -} - -template int conformer_decoder_wenet( - api::Context* ctx, const float16* x, const std::vector& x_shape, - const float* x_mask, const int* padded_target, - const std::vector& target_shape, float* character_scores, - const ConformerDecoderParam& param); - -} // namespace wenet -} // namespace xpu diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/xpu_conformer.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/xpu_conformer.h deleted file mode 100644 index c20af03e11a4e1807ebd3b7d453292d9373d2f80..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/xpu_conformer.h +++ /dev/null @@ -1,781 +0,0 @@ -// Copyright (c) 2022 KUNLUNXIN Inc. -// 2022 Han Qi (qihan@baidu.com) -// Hehe Pan (panhehe@baidu.com) -// Zikui Yan (yanzikui@baidu.com) -// Chaolin Li (lichaolin@baidu.com) -// All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
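// [Editorial note — not part of the original KUNLUNXIN/WeNet sources removed above.]
// The ctc_prefix_beamsearch() routine deleted in xpu_conformer.cpp keeps two
// log-probabilities per prefix, PrefixScore::s (prefix currently ending in the
// CTC blank) and PrefixScore::ns (ending in a non-blank), and merges competing
// paths with logadd(), i.e. a max-shifted log-sum-exp. Below is a minimal,
// self-contained sketch of that merge step, given purely for illustration: the
// helper name log_sum_exp and the isfinite() guard are assumptions of this
// sketch (the deleted code uses -FLT_MAX as its "impossible" score instead).

#include <algorithm>
#include <cmath>
#include <vector>

static float log_sum_exp(const std::vector<float>& xs) {
  // Shift by the maximum element so std::exp never overflows.
  float xmax = *std::max_element(xs.begin(), xs.end());
  if (!std::isfinite(xmax)) {
    return xmax;  // every candidate path is impossible; keep it that way
  }
  float sum = 0.0f;
  for (float x : xs) {
    sum += std::exp(x - xmax);
  }
  return std::log(sum) + xmax;
}

// Example: "Case 0" in the token-passing loop (*a + blank -> *a) combines the
// blank-ending and non-blank-ending scores of the old prefix with the blank
// emission log-probability:
//   next.s = log_sum_exp({next.s, prev.s + p_blank, prev.ns + p_blank});
// which matches the pattern of the logadd() calls in the deleted implementation.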
- -#include -#include -#include -#include -#include -#include -#include - -#include "xpu/runtime.h" -#include "xpu/xdnn.h" -#include "xpu_util.h" // NOLINT -#pragma once - -namespace api = baidu::xpu::api; -template -class ConformerEncoderParam { - public: - int layer_num; - int fc_num_per_layer; - int conv_num_per_layer; - int ln_num_per_layer; - int head_num; - int head_dim; - int ctc_dim; - int ffn_factor; - int beam_size; - struct Embedding { - int conv_num; - int fc_num; - int embed_dim; - } emb_param; - struct ConvBlock { - bool is_casual; - int kernel_size; - int lorder; - T padding; - } conv_param; - - std::vector pos_emb; - std::vector emb_conv_w_list; - std::vector emb_conv_maxw_list; - std::vector emb_conv_bias_list; - std::vector emb_fc_w_list; - std::vector emb_fc_maxw_list; - std::vector emb_fc_bias_list; - - std::vector conv_w_list; - std::vector conv_maxw_list; - std::vector conv_bias_list; - - std::vector ln_scale_list; - std::vector ln_bias_list; - - std::vector fc_w_list; - std::vector fc_maxw_list; - std::vector fc_bias_list; - - std::vector attn_pos_w_list_; - std::vector attn_pos_w_list; - std::vector attn_pos_maxw_list; - std::vector attn_pos_uv_bias_list; - - const float* cmvn_istd{nullptr}; - const float* cmvn_mean{nullptr}; - const float* pe{nullptr}; - float* mask{nullptr}; -}; - -template -class ConformerDecoderParam { - public: - int layer_num; - int fc_num_per_layer; - int ln_num_per_layer; - - int head_num; - int head_dim; - int vocab_size; - int sos_id; - int eos_id; - int ignored_id; - int beam_size; - int max_token_num; - int add_sos_num; - int ffn_dim; - - const T* embed_table{nullptr}; - const T* pe{nullptr}; - std::vector fc_w_list; - std::vector fc_maxw_list; - std::vector fc_bias_list; - std::vector ln_scale_list; - std::vector ln_bias_list; -}; - -template -static int64_t vec_prod(const std::vector& data) { - int len = data.size(); - if (len < 1) { - return 0; - } - int64_t prod = data[0]; - for (int i = 1; i < len; ++i) { - prod *= data[i]; - } - return prod; -} - -template -static std::vector get_w_list_from( - const std::vector>& quant_data_list) { - int len = quant_data_list.size(); - std::vector w_list(len, nullptr); - for (int i = 0; i < len; ++i) { - w_list[i] = quant_data_list[i].data_; - } - return w_list; -} - -template -static std::vector get_w_maxptr_list_from( - const std::vector>& quant_data_list) { - int len = quant_data_list.size(); - std::vector w_maxptr_list(len, nullptr); - for (int i = 0; i < len; ++i) { - w_maxptr_list[i] = quant_data_list[i].max_ptr_; - } - return w_maxptr_list; -} - -template -void get_fc_param(const std::unordered_map& weights_len_info, - const std::string& params_dir, - const std::string& fc_name_prefix, - XPUQunatData& fc_w, // NOLINT - const float*& fc_bias, bool has_bias = true) { // NOLINT - const std::string fc_file_prefix = params_dir + fc_name_prefix; - int wlen = weights_len_info.at(fc_name_prefix + "weight"); - fc_w = get_xpu_quant_data(fc_file_prefix + "weight", wlen); - if (has_bias) { - int blen = weights_len_info.at(fc_name_prefix + "bias"); - fc_bias = get_xpu_data(fc_file_prefix + "bias", blen); - } else { - fc_bias = nullptr; - } -} - -template -void get_conv_param( - const std::unordered_map& weights_len_info, - const std::string& params_dir, const std::string& conv_name_prefix, - XPUQunatData& conv_w, const float*& conv_b, // NOLINT - bool has_bias = true) { // NOLINT - std::string conv_file_prefix = params_dir + conv_name_prefix; - int wlen = weights_len_info.at(conv_name_prefix + 
"weight"); - conv_w = get_xpu_quant_data(conv_file_prefix + "weight", wlen); - if (has_bias) { - int blen = weights_len_info.at(conv_name_prefix + "bias"); - conv_b = get_xpu_data(conv_file_prefix + "bias", blen); - } else { - conv_b = nullptr; - } -} - -template -void get_fc_fused_param( - const std::unordered_map& weights_len_info, - const std::string& params_dir, - const std::vector fc_name_prefixs, - XPUQunatData& _fc_w, // NOLINT - const float*& _fc_b, bool has_bias = true) { // NOLINT - // get cpu fc params - std::vector fc_ws; - std::vector fc_bs; - for (int ids = 0; ids < fc_name_prefixs.size(); ids++) { - std::string fc_file_prefix = params_dir + fc_name_prefixs[ids]; - int wlen = weights_len_info.at(fc_name_prefixs[ids] + "weight"); - std::vector fc_w = - get_cpu_data(fc_file_prefix + "weight", wlen); - std::vector fc_b; - if (has_bias) { - int blen = weights_len_info.at(fc_name_prefixs[ids] + "bias"); - fc_b = get_cpu_data(fc_file_prefix + "bias", blen); - } - fc_ws.insert(fc_ws.end(), fc_w.begin(), fc_w.end()); - fc_bs.insert(fc_bs.end(), fc_b.begin(), fc_b.end()); - } - _fc_w = get_xpu_quant_data("fused_fc_weight", fc_ws); - _fc_b = get_xpu_data("fused_fc_bias", fc_bs); -} - -template -void get_fc_ln_fused_param( - const std::unordered_map& weights_len_info, - const std::string& params_dir, - const std::vector fc_name_prefixs, - std::vector ln_name_prefixs, - XPUQunatData& _fc_w, // NOLINT - const float*& _fc_b, bool has_bias = true) { // NOLINT - // get cpu fc params - std::vector fc_ws; - std::vector fc_bs; - for (int ids = 0; ids < fc_name_prefixs.size(); ids++) { - std::string fc_file_prefix = params_dir + fc_name_prefixs[ids]; - int wlen = weights_len_info.at(fc_name_prefixs[ids] + "weight"); - std::vector fc_w = - get_cpu_data(fc_file_prefix + "weight", wlen); - std::vector fc_b; - if (has_bias) { - int blen = weights_len_info.at(fc_name_prefixs[ids] + "bias"); - fc_b = get_cpu_data(fc_file_prefix + "bias", blen); - } - // get cpu ln params - std::string ln_file_prefix = params_dir + ln_name_prefixs[ids]; - wlen = weights_len_info.at(ln_name_prefixs[ids] + "weight"); - int blen = weights_len_info.at(ln_name_prefixs[ids] + "bias"); - std::vector ln_scale = - get_cpu_data(ln_file_prefix + "weight", wlen); - std::vector ln_bias = - get_cpu_data(ln_file_prefix + "bias", blen); - int col = ln_scale.size(); - int row = static_cast(fc_w.size()) / col; - if (!has_bias) { - fc_b.resize(row); - } - // get new fc_bias - for (int i = 0; i < row; i++) { - float b = has_bias ? 
fc_b[i] : 0.f; - for (int j = 0; j < col; j++) { - b += fc_w[i * col + j] * ln_bias[j]; - } - fc_b[i] = b; - } - // get new fc_weight - for (int i = 0; i < row; i++) { - for (int j = 0; j < col; j++) { - fc_w[i * col + j] = fc_w[i * col + j] * ln_scale[j]; - } - } - fc_ws.insert(fc_ws.end(), fc_w.begin(), fc_w.end()); - fc_bs.insert(fc_bs.end(), fc_b.begin(), fc_b.end()); - } - _fc_w = get_xpu_quant_data("fused_fc_weight", fc_ws); - _fc_b = get_xpu_data("fused_fc_bias", fc_bs); -} - -template -void get_conv_bn_fused_param( - const std::unordered_map& weights_len_info, - const std::string& params_dir, const std::string& conv_name_prefix, - const std::string& bn_name_prefix, XPUQunatData& _conv_w, // NOLINT - const float*& _conv_b, bool has_bias = true) { // NOLINT - // get cpu conv params - std::string conv_file_prefix = params_dir + conv_name_prefix; - int wlen = weights_len_info.at(conv_name_prefix + "weight"); - std::vector conv_w = - get_cpu_data(conv_file_prefix + "weight", wlen); - std::vector conv_b; - if (has_bias) { - int blen = weights_len_info.at(conv_name_prefix + "bias"); - conv_b = get_cpu_data(conv_file_prefix + "bias", blen); - } - // get cpu bn params - std::string bn_file_prefix = params_dir + bn_name_prefix; - wlen = weights_len_info.at(bn_name_prefix + "weight"); - int blen = weights_len_info.at(bn_name_prefix + "bias"); - int mlen = weights_len_info.at(bn_name_prefix + "running_mean"); - int vlen = weights_len_info.at(bn_name_prefix + "running_var"); - std::vector bn_scale = - get_cpu_data(bn_file_prefix + "weight", wlen); - std::vector bn_bias = - get_cpu_data(bn_file_prefix + "bias", blen); - std::vector bn_mean = - get_cpu_data(bn_file_prefix + "running_mean", mlen); - std::vector bn_var = - get_cpu_data(bn_file_prefix + "running_var", vlen); - // fuse conv, bn, new weight is conv_w, new bias is bn_bias - int h = bn_scale.size(); - int w = static_cast(conv_w.size()) / h; - float eps = 1e-5f; // assume eps is 1e-5; - for (int i = 0; i < h; ++i) { - bn_scale[i] = bn_scale[i] / std::sqrt(bn_var[i] + eps); - } - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - conv_w[i * w + j] *= bn_scale[i]; - } - } - for (int i = 0; i < h; ++i) { - float b = has_bias ? conv_b[i] : 0.f; - bn_bias[i] += ((b - bn_mean[i]) * bn_scale[i]); - } - _conv_w = get_xpu_quant_data("fused_conv_weight", conv_w); - _conv_b = get_xpu_data("fused_conv_bias", bn_bias); -} - -template -static std::tuple, std::vector> read_cpu_data_from_file( - const std::string& data_file_prefix, int shape_ndim) { - std::vector res_data; - std::string data_file = data_file_prefix + ".dat"; - std::string shape_file = data_file_prefix + "_shape.txt"; - std::ifstream inF(shape_file); - if (!inF) { - std::cout << "ERR: open file failed! 
" << shape_file << std::endl; - std::exit(1); - } - char useless; // (16, 523, 80) or (160, 1) - std::vector inshape(shape_ndim, 0); - if (shape_ndim == 3) { - inF >> useless >> inshape[0] >> useless >> inshape[1] >> useless >> - inshape[2] >> useless; - } else if (shape_ndim == 2) { - inF >> useless >> inshape[0] >> useless >> inshape[1] >> useless; - } else if (shape_ndim == 1) { - inF >> useless >> inshape[0] >> useless >> useless; - } else { - std::cout << "ERR: only support shape ndim == 1, 2 or 3, but got " - << shape_ndim << std::endl; - std::exit(1); - } - - int data_len = vec_prod(inshape); - res_data = get_cpu_data(data_file, data_len); - return std::make_tuple(res_data, inshape); -} - -template -static std::tuple> read_xpu_data_from_file( - const std::string& data_file_prefix, int shape_ndim) { - auto cpu_data_info = read_cpu_data_from_file(data_file_prefix, shape_ndim); - T* xpu_data = get_xpu_data(data_file_prefix, std::get<0>(cpu_data_info)); - return std::make_tuple(xpu_data, std::get<1>(cpu_data_info)); -} - -template -static std::tuple> create_mask_according_speech_length( - const std::vector& speech_length, int max_seqlen, - void* xpu_stream = nullptr) { - int batch = speech_length.size(); - int mask_len = batch * max_seqlen; - int subsample_mask_len = batch * (((max_seqlen - 1) / 2 - 1) / 2); - std::vector mask_cpu(mask_len, 0); - std::vector subsample_mask_cpu(subsample_mask_len, 0); - // create mask, equal to 'masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)' - for (int b = 0; b < batch; ++b) { - int curr_seqlen = speech_length[b]; - for (int idx = 0; idx < curr_seqlen; ++idx) { - mask_cpu.at(b * max_seqlen + idx) = 1; - } - } - // create subsample_mask, equal to 'x_mask[:, :, :-2:2][:, :, :-2:2]' - int sub_seqlen = subsample_mask_len / batch; - for (int b = 0; b < batch; ++b) { - for (int idx = 0; idx < sub_seqlen; ++idx) { - subsample_mask_cpu.at(b * sub_seqlen + idx) = - mask_cpu.at(b * max_seqlen + idx * 4); - } - } - // copy to xpu - T* subsample_mask_xpu = nullptr; - int r = xpu_malloc(reinterpret_cast(&subsample_mask_xpu), - subsample_mask_len * sizeof(T)); - if (r != 0) { - std::cout << "ERR: xpu_malloc failed!" << std::endl; - std::exit(1); - } - r = xpu_wait(xpu_stream); - if (r != 0) { - std::cout << "ERR: xpu_wait failed!" << std::endl; - std::exit(1); - } - r = xpu_memcpy(subsample_mask_xpu, subsample_mask_cpu.data(), - subsample_mask_len * sizeof(T), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); - if (r != 0) { - std::cout << "ERR: xpu_memcpy failed!" 
<< std::endl; - std::exit(1); - } - - std::vector subsample_mask_shape{batch, 1, sub_seqlen}; - return std::make_tuple(subsample_mask_xpu, subsample_mask_shape); -} - -template -int init_encoder_params( - const std::string& params_dir, - ConformerEncoderParam& encoder_param) { // NOLINT - std::unordered_map weights_len_info = - get_weights_lens(params_dir + "weights_info.txt"); - std::unordered_map> weights_shape_info = - get_weights_shape(params_dir + "weights_info.txt"); - - // model struct param - auto& head_num = encoder_param.head_num; - auto& head_dim = encoder_param.head_dim; - auto& ffn_factor = encoder_param.ffn_factor; - auto& conv_param = encoder_param.conv_param; - auto& emb_param = encoder_param.emb_param; - auto& ctc_dim = encoder_param.ctc_dim; - auto& encoder_layer_num = encoder_param.layer_num; - auto& fc_num_per_layer = encoder_param.fc_num_per_layer; - auto& conv_num_per_layer = encoder_param.conv_num_per_layer; - auto& ln_num_per_layer = encoder_param.ln_num_per_layer; - encoder_layer_num = 12; - fc_num_per_layer = 6; - conv_num_per_layer = 3; - ln_num_per_layer = 6; - emb_param.conv_num = 2; - emb_param.fc_num = 1; - emb_param.embed_dim = 512; - ffn_factor = - weights_shape_info.at("encoder.encoders.0.feed_forward.w_1.weight")[0] / - weights_shape_info.at("encoder.encoders.0.feed_forward.w_1.weight")[1]; - head_dim = - weights_shape_info.at("encoder.encoders.0.self_attn.pos_bias_u")[1]; - head_num = - weights_shape_info.at("encoder.encoders.0.self_attn.pos_bias_u")[0]; - conv_param.kernel_size = weights_shape_info.at( - "encoder.encoders.0.conv_module.depthwise_conv.weight")[2]; - conv_param.lorder = conv_param.kernel_size - 1; - conv_param.padding = 0.0; - conv_param.is_casual = true; - ctc_dim = weights_len_info.at("ctc.ctc_lo.bias"); - encoder_param.beam_size = 3; - - // init encoder cmvn - auto& pe = encoder_param.pe; - auto& cmvn_istd = encoder_param.cmvn_istd; - auto& cmvn_mean = encoder_param.cmvn_mean; - int pe_len = weights_len_info.at("encoder.pe"); - int mlen = weights_len_info.at("encoder.global_cmvn.mean"); - int ilen = weights_len_info.at("encoder.global_cmvn.istd"); - pe = get_xpu_data(params_dir + "encoder.pe", pe_len); - cmvn_mean = - get_xpu_data(params_dir + "encoder.global_cmvn.mean", mlen); - cmvn_istd = - get_xpu_data(params_dir + "encoder.global_cmvn.istd", ilen); - - // init encoder embedding param - std::vector> emb_conv_w_list; - auto& emb_conv_bias_list = encoder_param.emb_conv_bias_list; - std::vector> emb_fc_w_list; - auto& emb_fc_bias_list = encoder_param.emb_fc_bias_list; - emb_conv_w_list.resize(emb_param.conv_num); - emb_conv_bias_list.resize(emb_param.conv_num); - emb_fc_w_list.resize(emb_param.fc_num); - emb_fc_bias_list.resize(emb_param.fc_num); - for (int i = 0; i < emb_param.conv_num; ++i) { - std::string conv_name_prefix = - "encoder.embed.conv." 
+ std::to_string(i * 2) + "."; - get_conv_param(weights_len_info, params_dir, conv_name_prefix, - emb_conv_w_list[i], emb_conv_bias_list[i]); - } - get_fc_param(weights_len_info, params_dir, "encoder.embed.out.0.", - emb_fc_w_list[0], emb_fc_bias_list[0]); - - // encoder_param_layer - int enc_fc_num = encoder_layer_num * fc_num_per_layer + 1; - int enc_conv_num = encoder_layer_num * conv_num_per_layer; - int enc_ln_num = encoder_layer_num * ln_num_per_layer + 1; - - std::vector> fc_w_list; - auto& fc_bias_list = encoder_param.fc_bias_list; - - std::vector> conv_w_list; - auto& conv_bias_list = encoder_param.conv_bias_list; - - auto& ln_scale_list = encoder_param.ln_scale_list; - auto& ln_bias_list = encoder_param.ln_bias_list; - - std::vector> attn_pos_w_list; - std::vector attn_pos_uv_bias_list; - // w_param need to be quanted & get maxw - fc_w_list.resize(enc_fc_num); - fc_bias_list.resize(enc_fc_num); - conv_w_list.resize(enc_conv_num); - conv_bias_list.resize(enc_conv_num); - ln_scale_list.resize(enc_ln_num); - ln_bias_list.resize(enc_ln_num); - attn_pos_w_list.resize(encoder_layer_num); - attn_pos_uv_bias_list.resize(encoder_layer_num * - 2); // pos_bias_u, pos_bias_v - for (int i = 0; i < encoder_layer_num; ++i) { - std::string enc_prefix = "encoder.encoders." + std::to_string(i) + "."; - int fc_offset = i * fc_num_per_layer; - int conv_offset = i * conv_num_per_layer; - int ln_offset = i * ln_num_per_layer; - // init FeedForwardParam macaron - get_fc_param(weights_len_info, params_dir, - enc_prefix + "feed_forward_macaron.w_1.", - fc_w_list[fc_offset], fc_bias_list[fc_offset]); - get_fc_param(weights_len_info, params_dir, - enc_prefix + "feed_forward_macaron.w_2.", - fc_w_list[fc_offset + 1], fc_bias_list[fc_offset + 1]); - get_fc_fused_param( - weights_len_info, params_dir, - {enc_prefix + "self_attn.linear_q.", enc_prefix + "self_attn.linear_k.", - enc_prefix + "self_attn.linear_v."}, - fc_w_list[fc_offset + 2], fc_bias_list[fc_offset + 2]); - get_fc_param( - weights_len_info, params_dir, enc_prefix + "self_attn.linear_out.", - fc_w_list[fc_offset + 3], fc_bias_list[fc_offset + 3], true); - // get pos w, pos u bias, pos v bias - std::string pos_w_name = enc_prefix + "self_attn.linear_pos.weight"; - std::string pos_ubias_name = enc_prefix + "self_attn.pos_bias_u"; - std::string pos_vbias_name = enc_prefix + "self_attn.pos_bias_v"; - int pos_wlen = weights_len_info.at(pos_w_name); - int pos_ublen = weights_len_info.at(pos_ubias_name); - int pos_vblen = weights_len_info.at(pos_vbias_name); - attn_pos_w_list[i] = - get_xpu_quant_data(params_dir + pos_w_name, pos_wlen); - attn_pos_uv_bias_list[i * 2] = - get_xpu_data(params_dir + pos_ubias_name, pos_ublen); - attn_pos_uv_bias_list[i * 2 + 1] = - get_xpu_data(params_dir + pos_vbias_name, pos_vblen); - // init ConvModuleParam - get_conv_param(weights_len_info, params_dir, - enc_prefix + "conv_module.pointwise_conv1.", - conv_w_list[conv_offset], conv_bias_list[conv_offset], - true); - get_conv_param(weights_len_info, params_dir, - enc_prefix + "conv_module.depthwise_conv.", - conv_w_list[conv_offset + 1], - conv_bias_list[conv_offset + 1], true); - get_conv_param(weights_len_info, params_dir, - enc_prefix + "conv_module.pointwise_conv2.", - conv_w_list[conv_offset + 2], - conv_bias_list[conv_offset + 2], true); - // init FeedForwardParam - get_fc_param(weights_len_info, params_dir, - enc_prefix + "feed_forward.w_1.", fc_w_list[fc_offset + 4], - fc_bias_list[fc_offset + 4]); - get_fc_param(weights_len_info, params_dir, - enc_prefix + 
"feed_forward.w_2.", fc_w_list[fc_offset + 5], - fc_bias_list[fc_offset + 5]); - // init LayerNormParam - get_ln_param(weights_len_info, params_dir, enc_prefix + "norm_ff_macaron.", - ln_scale_list[ln_offset], ln_bias_list[ln_offset]); - get_ln_param(weights_len_info, params_dir, enc_prefix + "norm_mha.", - ln_scale_list[ln_offset + 1], ln_bias_list[ln_offset + 1]); - get_ln_param(weights_len_info, params_dir, enc_prefix + "norm_conv.", - ln_scale_list[ln_offset + 2], ln_bias_list[ln_offset + 2]); - get_ln_param(weights_len_info, params_dir, enc_prefix + "conv_module.norm.", - ln_scale_list[ln_offset + 3], ln_bias_list[ln_offset + 3]); - get_ln_param(weights_len_info, params_dir, enc_prefix + "norm_ff.", - ln_scale_list[ln_offset + 4], ln_bias_list[ln_offset + 4]); - get_ln_param(weights_len_info, params_dir, enc_prefix + "norm_final.", - ln_scale_list[ln_offset + 5], ln_bias_list[ln_offset + 5]); - } - get_ln_param(weights_len_info, params_dir, "encoder.after_norm.", - ln_scale_list[enc_ln_num - 1], ln_bias_list[enc_ln_num - 1]); - get_fc_param(weights_len_info, params_dir, "ctc.ctc_lo.", - fc_w_list[enc_fc_num - 1], fc_bias_list[enc_fc_num - 1]); - /* get maxw && w */ - encoder_param.emb_conv_w_list = get_w_list_from(emb_conv_w_list); - encoder_param.emb_conv_maxw_list = - get_w_maxptr_list_from(emb_conv_w_list); - encoder_param.emb_fc_w_list = get_w_list_from(emb_fc_w_list); - encoder_param.emb_fc_maxw_list = get_w_maxptr_list_from(emb_fc_w_list); - - encoder_param.conv_w_list = get_w_list_from(conv_w_list); - encoder_param.conv_maxw_list = get_w_maxptr_list_from(conv_w_list); - - encoder_param.fc_w_list = get_w_list_from(fc_w_list); - encoder_param.fc_maxw_list = get_w_maxptr_list_from(fc_w_list); - - encoder_param.attn_pos_w_list_ = get_w_list_from(attn_pos_w_list); - encoder_param.attn_pos_maxw_list = - get_w_maxptr_list_from(attn_pos_w_list); - /* prepare params */ - api::Context ctx_xpu(api::kXPU2); - api::ctx_guard RAII_GUARD(&ctx_xpu); - int ret = 0; - int hidden_dim = head_num * head_dim; - encoder_param.pos_emb.resize(encoder_layer_num); - for (int i = 0; i < encoder_layer_num; i++) { - ret = xpu_malloc((void**)&(encoder_param.pos_emb[i]), // NOLINT - 5000 * hidden_dim * sizeof(T)); - ret = api::fc_fusion( - &ctx_xpu, encoder_param.pe, encoder_param.attn_pos_w_list_[i], - const_cast(encoder_param.pos_emb[i]), 5000, hidden_dim, hidden_dim, - false, true, nullptr, encoder_param.attn_pos_maxw_list[i], nullptr, - hidden_dim, hidden_dim, hidden_dim, 1.0f, 0.0f, nullptr, - api::Activation_t::LINEAR); - } - for (int i = 0; i < encoder_layer_num; i++) { - ret = api::scale( - &ctx_xpu, encoder_param.fc_bias_list[i * fc_num_per_layer + 1], - const_cast( - encoder_param.fc_bias_list[i * fc_num_per_layer + 1]), - hidden_dim, true, 0.5f, 0.0f); - ret = api::scale( - &ctx_xpu, encoder_param.fc_bias_list[i * fc_num_per_layer + 5], - const_cast( - encoder_param.fc_bias_list[i * fc_num_per_layer + 5]), - hidden_dim, true, 0.5f, 0.0f); - } - for (int i = 0; i < attn_pos_uv_bias_list.size(); i++) { - T* tmppos = nullptr; - ret = xpu_malloc(reinterpret_cast(&tmppos), hidden_dim * sizeof(T)); - ret = api::cast_v2(&ctx_xpu, attn_pos_uv_bias_list[i], tmppos, - hidden_dim); - encoder_param.attn_pos_uv_bias_list.push_back(tmppos); - } - return 0; -} - -template -int init_decoder_params( - const std::string& params_dir, - ConformerDecoderParam& decoder_param) { // NOLINT - std::unordered_map weights_len_info = - get_weights_lens(params_dir + "weights_info.txt"); - - // init DecoderLayerParam - auto& 
decoder_layer_num = decoder_param.layer_num; - auto& fc_num_per_layer = decoder_param.fc_num_per_layer; - auto& ln_num_per_layer = decoder_param.ln_num_per_layer; - std::vector> fc_w_list; - auto& fc_bias_list = decoder_param.fc_bias_list; - auto& ln_scale_list = decoder_param.ln_scale_list; - auto& ln_bias_list = decoder_param.ln_bias_list; - decoder_layer_num = 3; - fc_num_per_layer = 8; - ln_num_per_layer = 3; - int dec_fc_num = decoder_layer_num * fc_num_per_layer + 1; - int dec_ln_num = decoder_layer_num * ln_num_per_layer + 1; - fc_w_list.resize(dec_fc_num); - fc_bias_list.resize(dec_fc_num); - ln_scale_list.resize(dec_ln_num); - ln_bias_list.resize(dec_ln_num); - decoder_param.head_num = 8; - decoder_param.head_dim = 64; - decoder_param.vocab_size = 5538; - decoder_param.sos_id = 5537; - decoder_param.eos_id = 5537; - decoder_param.ignored_id = 2; - decoder_param.beam_size = 3; - decoder_param.max_token_num = 200; - decoder_param.add_sos_num = 1; - decoder_param.ffn_dim = 2048; - auto att_dim = decoder_param.head_num * decoder_param.head_dim; - - // init EmbeddingParam - std::string embed_table_name = "decoder.left_decoder.embed.0.weight"; - std::vector embed_table_cpu = get_cpu_data( - params_dir + embed_table_name, weights_len_info.at(embed_table_name)); - std::vector embed_table_cpu_t(embed_table_cpu.size(), 0); - for (int i = 0; i < static_cast(embed_table_cpu.size()); ++i) { - embed_table_cpu_t[i] = - static_cast(embed_table_cpu[i] * std::sqrt(att_dim)); - } - decoder_param.embed_table = - get_xpu_data(embed_table_name, embed_table_cpu_t); - - // init pe - std::string pe_name = "encoder.pe"; - std::vector pe_cpu = - get_cpu_data(params_dir + pe_name, weights_len_info.at(pe_name)); - std::vector pe_cpu_t(pe_cpu.size(), 0); - for (int i = 0; i < static_cast(pe_cpu.size()); ++i) { - pe_cpu_t[i] = static_cast(pe_cpu[i]); - } - decoder_param.pe = get_xpu_data(pe_name, pe_cpu_t); - for (int i = 0; i < decoder_layer_num; ++i) { - std::string dec_prefix = - "decoder.left_decoder.decoders." 
+ std::to_string(i) + "."; - int offset = i * fc_num_per_layer; - // init fc param - // self attention qkv fc - get_fc_fused_param(weights_len_info, params_dir, - { - dec_prefix + "self_attn.linear_q.", - dec_prefix + "self_attn.linear_k.", - dec_prefix + "self_attn.linear_v.", - }, - fc_w_list[offset], fc_bias_list[offset], true); - get_fc_param(weights_len_info, params_dir, - dec_prefix + "self_attn.linear_out.", - fc_w_list[offset + 1], fc_bias_list[offset + 1], true); - get_fc_param(weights_len_info, params_dir, - dec_prefix + "src_attn.linear_q.", fc_w_list[offset + 2], - fc_bias_list[offset + 2], true); - get_fc_param(weights_len_info, params_dir, - dec_prefix + "src_attn.linear_k.", fc_w_list[offset + 3], - fc_bias_list[offset + 3], true); - get_fc_param(weights_len_info, params_dir, - dec_prefix + "src_attn.linear_v.", fc_w_list[offset + 4], - fc_bias_list[offset + 4], true); - get_fc_param(weights_len_info, params_dir, - dec_prefix + "src_attn.linear_out.", fc_w_list[offset + 5], - fc_bias_list[offset + 5], true); - get_fc_param(weights_len_info, params_dir, - dec_prefix + "feed_forward.w_1.", fc_w_list[offset + 6], - fc_bias_list[offset + 6]); - get_fc_param(weights_len_info, params_dir, - dec_prefix + "feed_forward.w_2.", fc_w_list[offset + 7], - fc_bias_list[offset + 7]); - // init ln param - offset = i * ln_num_per_layer; - get_ln_param(weights_len_info, params_dir, dec_prefix + "norm1.", - ln_scale_list[offset], ln_bias_list[offset]); - get_ln_param(weights_len_info, params_dir, dec_prefix + "norm2.", - ln_scale_list[offset + 1], ln_bias_list[offset + 1]); - get_ln_param(weights_len_info, params_dir, dec_prefix + "norm3.", - ln_scale_list[offset + 2], ln_bias_list[offset + 2]); - } - // init after ln - get_ln_param(weights_len_info, params_dir, "decoder.left_decoder.after_norm.", - ln_scale_list[dec_ln_num - 1], ln_bias_list[dec_ln_num - 1]); - // init output layer fc - get_fc_param( - weights_len_info, params_dir, "decoder.left_decoder.output_layer.", - fc_w_list[dec_fc_num - 1], fc_bias_list[dec_fc_num - 1], true); - decoder_param.fc_w_list = get_w_list_from(fc_w_list); - decoder_param.fc_maxw_list = get_w_maxptr_list_from(fc_w_list); - return 0; -} - -static int padding_target(std::vector& hyps, // NOLINT - std::vector& hyps_len, // NOLINT - int beam_size, int eos_id) { - int max_target_len = *max_element(hyps_len.begin(), hyps_len.end()); - std::vector pad(max_target_len * beam_size); - int offset = 0; - for (int i = 0; i < beam_size; i++) { - for (int j = 0; j < max_target_len; j++) { - pad[i * max_target_len + j] = j < hyps_len[i] ? 
hyps[j + offset] : eos_id; - } - offset += hyps_len[i]; - } - hyps.swap(pad); - return max_target_len; -} - -namespace xpu { -namespace wenet { - -template -int conformer_encoder_wenet( - api::Context* ctx, float* x, const std::vector& data_shape, - T* encoder_out, T* ctc_probs, - ConformerEncoderParam& param, // NOLINT - const std::tuple>& xpu_mask_info); -template -int ctc_prefix_beamsearch(api::Context* ctx, T* ctc_probs, - std::vector& hyps, // NOLINT - std::vector& hyps_len, // NOLINT - std::vector& ctc_scores, // NOLINT - int batch_size, int beam_size, int max_len, - int ctc_dim); - -template -int conformer_decoder_wenet(api::Context* ctx, const T* x, - const std::vector& x_shape, - const float* x_mask, const int* padded_target, - const std::vector& target_shape, - float* character_scores, - const ConformerDecoderParam& param); -} // namespace wenet -} // namespace xpu diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/xpu_util.cpp b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/xpu_util.cpp deleted file mode 100644 index b18cd12b7e2d46131076ed90b4df0aebf5f8039b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/xpu_util.cpp +++ /dev/null @@ -1,491 +0,0 @@ -// Copyright (c) 2022 KUNLUNXIN Inc. -// 2022 Han Qi (qihan@baidu.com) -// Hehe Pan (panhehe@baidu.com) -// Zikui Yan (yanzikui@baidu.com) -// Chaolin Li (lichaolin@baidu.com) -// All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "xpu_util.h" // NOLINT - -template -static double vec_sum(const std::vector& data) { - double res = 0; - for (int i = 0; i < static_cast(data.size()); ++i) { - res += static_cast(data[i]); - } - return res; -} - -int vector_prod(std::vector shape) { - int accumlate = 1; - for (auto a : shape) { - accumlate *= a; - } - return accumlate; -} -void add_separator_when_necessary(std::string& str) { // NOLINT - int len = str.size(); - char ch = '/'; - if (str[len - 1] != ch) { - str.append(1, ch); - } -} - -template -static std::string print_vec(const std::vector& data) { - std::stringstream ss; - const int dump_len = data.size() > 8 ? 8 : data.size(); - std::vector dump_data(dump_len, 0); - int half_dump_len = dump_len / 2; - std::copy(data.cbegin(), data.cbegin() + half_dump_len, dump_data.begin()); - std::copy(data.cend() - (dump_len - half_dump_len), data.cend(), - dump_data.begin() + half_dump_len); - for (int i = 0; i < dump_len - 1; ++i) { - ss << dump_data[i] << ", "; - if ((i + 1) == dump_len / 2) { - ss << " ... 
"; - } - } - ss << dump_data[dump_len - 1]; - return ss.str(); -} - -template -static T parse_string(const std::string& str) { - return str; -} - -template <> -float parse_string(const std::string& str) { - return std::stof(str); -} -template <> -double parse_string(const std::string& str) { - return std::stod(str); -} -template <> -int parse_string(const std::string& str) { - return std::stoi(str); -} -template <> -int64_t parse_string(const std::string& str) { - return std::stoll(str); -} - -template -std::vector Split(const std::string& str, const std::string& separator) { - std::vector res; - std::string::size_type pos1, pos2; - pos1 = str.find_first_not_of(separator); - pos2 = str.find(separator, pos1); - while (std::string::npos != pos1 && std::string::npos != pos2) { - res.emplace_back(parse_string(str.substr(pos1, pos2 - pos1))); - pos1 = str.find_first_not_of(separator, pos2); - pos2 = str.find(separator, pos1); - } - if (std::string::npos != pos1 && pos1 < str.length()) { - res.emplace_back(parse_string(str.substr(pos1))); - } - return res; -} - -std::unordered_map get_weights_lens( - const std::string& file_path) { - std::unordered_map res; - std::ifstream inF(file_path, std::ifstream::in); - if (inF) { - // std::cout << "read success from " << file_path << std::endl; - std::string buffer; - while (std::getline(inF, buffer)) { - std::vector weight_info = Split(buffer, ":"); - std::string w_name = weight_info[0]; - int w_len = std::stoi(weight_info[3]); - res.insert(std::make_pair(w_name, w_len)); - } - } else { - std::cout << "ERR: read failed, " << file_path << std::endl; - std::exit(1); - } - - return res; -} - -std::unordered_map> get_weights_shape( - const std::string& file_path) { - std::unordered_map> res; - std::ifstream inF(file_path, std::ifstream::in); - if (inF) { - // std::cout << "read success from " << file_path << std::endl; - std::string buffer; - while (std::getline(inF, buffer)) { - std::vector weight_info = Split(buffer, ":"); - std::string w_name = weight_info[0]; - std::string w_shape_str = weight_info[2]; // example: (512, 1, 3, 3) - std::string w_shape_str_without_bracket( - w_shape_str.begin() + 1, - w_shape_str.end() - 1); // example: 512, 1, 3, 3 - std::vector w_shape = Split(w_shape_str_without_bracket, ","); - res.insert(std::make_pair(w_name, w_shape)); - } - } else { - std::cout << "ERR: read failed, " << file_path << std::endl; - std::exit(1); - } - - return res; -} - -template -std::vector get_cpu_data(const std::string& file_path, int len) { - std::vector result(len, 0); - std::ifstream inF(file_path, std::ifstream::binary); - if (!inF) { - std::cout << "ERR: std::ifstream init failed! 
" << file_path << std::endl; - std::exit(1); - } - if (inF.read(reinterpret_cast(result.data()), len * sizeof(T))) { - // std::cout << "read success from " << file_path << std::endl; - } else { - std::cout << "ERR: something wrong: " << file_path << ", len=" << len - << std::endl; - std::exit(1); - } - return result; -} - -template std::vector get_cpu_data(const std::string&, int len); -template std::vector get_cpu_data(const std::string&, - int len); -template std::vector get_cpu_data(const std::string&, - int len); -template std::vector get_cpu_data(const std::string&, int len); - -template -T* get_xpu_data(const std::string& data_name, const std::vector& cpu_data) { - int len = cpu_data.size(); -#ifdef TEST_DEBUG - std::cout << "DEBUG: file_path=" << data_name << ", len=" << len - << ", vec_sum=" << vec_sum(cpu_data) - << ", details: " << print_vec(cpu_data) << std::endl; -#endif - - T* xpu_data = nullptr; - int r = xpu_malloc(reinterpret_cast(&xpu_data), len * sizeof(T)); - if (r != 0) { - std::cout << "ERR: xpu_malloc failed! " << data_name << std::endl; - std::exit(1); - } - - r = xpu_wait(); - if (r != 0) { - std::cout << "ERR: xpu_wait failed!" << std::endl; - std::exit(1); - } - r = xpu_memcpy(xpu_data, cpu_data.data(), len * sizeof(T), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); - if (r != 0) { - std::cout << "ERR: xpu_memcpy failed! " << data_name << std::endl; - std::exit(1); - } - -#ifdef TEST_DEBUG - std::cout << "DEBUG: xpu_data=" << xpu_data << std::endl; -#endif - - return xpu_data; -} - -template float* get_xpu_data(const std::string&, const std::vector&); -template float16* get_xpu_data(const std::string&, const std::vector&); -template int64_t* get_xpu_data(const std::string&, const std::vector&); -template int* get_xpu_data(const std::string&, const std::vector&); - -template -T* get_xpu_data(const std::string& file_path, int len) { - std::vector cpu_data = get_cpu_data(file_path, len); - return get_xpu_data(file_path, cpu_data); -} - -template float* get_xpu_data(const std::string&, int); -template float16* get_xpu_data(const std::string&, int); -template int64_t* get_xpu_data(const std::string&, int); -template int* get_xpu_data(const std::string&, int); - -template -std::vector quant_cpu(const std::vector& cpu_data) { - int len = cpu_data.size(); - std::vector cpu_quant_data(len, 0); - api::Context ctx(api::kCPU); - int r = api::quantization(&ctx, cpu_data.data(), - cpu_quant_data.data(), len, nullptr); - if (r != 0) { - std::cout << "ERR: quantization failed!" << std::endl; - std::exit(1); - } - return cpu_quant_data; -} - -template <> -std::vector quant_cpu(const std::vector& cpu_data) { - return cpu_data; -} - -template -XPUQunatData get_xpu_quant_data(const std::string& data_name, - const std::vector& cpu_data) { - XPUQunatData xpu_quant_data; - - int len = cpu_data.size(); - // quant - std::vector cpu_quant_data = quant_cpu(cpu_data); - // findmax - float abs_max = 1e-30f; - if (std::is_same::value || std::is_same::value) { - for (int i = 0; i < len; ++i) { - float abs_val = std::fabs(static_cast(cpu_data[i])); - abs_max = std::max(abs_max, abs_val); - } - } - - constexpr int max_ptr_len = 6; // for xpu2 - std::vector cpu_max(max_ptr_len, abs_max); - // xpu malloc - TY* xpu_data = nullptr; - float* xpu_max_ptr = nullptr; - int r = xpu_malloc(reinterpret_cast(&xpu_data), len * sizeof(TY)); - if (r != 0) { - std::cout << "ERR: xpu_malloc failed! 
" << data_name << std::endl; - std::exit(1); - } - r = xpu_malloc(reinterpret_cast(&xpu_max_ptr), - max_ptr_len * sizeof(float)); - if (r != 0) { - std::cout << "ERR: xpu_malloc failed! " << data_name << std::endl; - std::exit(1); - } - -#ifdef TEST_DEBUG - std::cout << "DEBUG: file_path=" << data_name << ", len=" << len - << ", data vec_sum=" << vec_sum(cpu_data) - << ", quant_data vec_sum=" << vec_sum(cpu_quant_data) - << ", details: " << print_vec(cpu_quant_data) << std::endl; -#endif - r = xpu_wait(); - if (r != 0) { - std::cout << "ERR: xpu_wait failed!" << std::endl; - std::exit(1); - } - // xpu memcpy - r = xpu_memcpy(xpu_data, cpu_quant_data.data(), len * sizeof(TY), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); - if (r != 0) { - std::cout << "ERR: xpu_memcpy failed!" << std::endl; - std::exit(1); - } -#ifdef TEST_DEBUG - std::cout << "DEBUG: max is " << print_vec(cpu_max) << std::endl; -#endif - r = xpu_memcpy(xpu_max_ptr, cpu_max.data(), max_ptr_len * sizeof(float), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); - if (r != 0) { - std::cout << "ERR: xpu_malloc failed!" << std::endl; - std::exit(1); - } - -#ifdef TEST_DEBUG - std::cout << "DEBUG: xpu_data=" << xpu_data << ", xpu_max_ptr=" << xpu_max_ptr - << std::endl; -#endif - xpu_quant_data.data_ = xpu_data; - xpu_quant_data.max_ptr_ = xpu_max_ptr; - return xpu_quant_data; -} - -template XPUQunatData get_xpu_quant_data( - const std::string&, const std::vector&); -template XPUQunatData get_xpu_quant_data( - const std::string&, const std::vector&); - -template -XPUQunatData get_xpu_quant_data(const std::string& file_path, int len) { - std::vector cpu_data = get_cpu_data(file_path, len); - return get_xpu_quant_data(file_path, cpu_data); -} - -template XPUQunatData get_xpu_quant_data( - const std::string&, int); -template XPUQunatData get_xpu_quant_data( - const std::string&, int); - -std::vector get_all_ids(const std::string& dir_in) { - std::vector ids; - std::set ids_set; - struct stat s; - stat(dir_in.c_str(), &s); - if (!S_ISDIR(s.st_mode)) { - return ids; - } - DIR* open_dir = opendir(dir_in.c_str()); - if (nullptr == open_dir) { - return ids; - } - dirent* p = nullptr; - while ((p = readdir(open_dir)) != nullptr) { - if (p->d_name[0] != '.') { - std::string filename = std::string(p->d_name); - int end_pos = filename.find('_'); - - int qid = std::stoi(filename.substr(0, end_pos)); - ids_set.insert(qid); - } - } - closedir(open_dir); - ids.resize(ids_set.size()); - ids.assign(ids_set.begin(), ids_set.end()); - return ids; -} - -void get_ln_param(const std::unordered_map& weights_len_info, - const std::string& params_dir, - const std::string& ln_name_prefix, - const float*& ln_scale, // NOLINT - const float*& ln_bias) { // NOLINT - std::string ln_file_prefix = params_dir + ln_name_prefix; - int wlen = weights_len_info.at(ln_name_prefix + "weight"); - int blen = weights_len_info.at(ln_name_prefix + "bias"); - ln_scale = get_xpu_data(ln_file_prefix + "weight", wlen); - ln_bias = get_xpu_data(ln_file_prefix + "bias", blen); -} - -template -void print_xpu_data_all(api::Context* ctx, const T* data, - std::vector shape, std::string name) { - int data_len = vector_prod(shape); - std::vector cpu_data(data_len); - xpu_wait(ctx->xpu_stream); - xpu_memcpy(reinterpret_cast(&cpu_data.front()), data, - data_len * sizeof(T), XPUMemcpyKind::XPU_DEVICE_TO_HOST); - std::cout << name; - std::cout << " shape:"; - for (auto i : shape) { - std::cout << i << " "; - } - std::cout << std::endl; - int row = 1; - int col = shape.back(); - if (shape.size() >= 2) { - row = 
data_len / col; - } - T* cpu_data_ptr = &cpu_data.front(); - for (int i = 0; i < row; i++) { - for (int j = 0; j < col; j++) { - std::cout << *(cpu_data_ptr + i * col + j) << " "; - } - std::cout << std::endl; - } -} -template -void print_xpu_data(api::Context* ctx, const T* data, std::vector shape, - std::string name) { - int data_len = vector_prod(shape); - - std::vector cpu_data(data_len); - xpu_memcpy(reinterpret_cast(&cpu_data.front()), data, - data_len * sizeof(T), XPUMemcpyKind::XPU_DEVICE_TO_HOST); - std::cout << name; - std::cout << " shape:"; - for (auto i : shape) { - std::cout << i << " "; - } - std::cout << std::endl; - if (data_len > 1000) { - double mean = 0; - for (auto val : cpu_data) { - mean += static_cast(val); - } - mean /= data_len; - std::cout << "mean=" << mean << std::endl; - std::cout << "details: "; - for (int i = 0; i < 8; ++i) { - std::cout << cpu_data[i] << " "; - } - std::cout << "..."; - for (int i = data_len - 8; i < data_len; ++i) { - std::cout << cpu_data[i] << " "; - } - std::cout << std::endl; - return; - } - int row = 1; - int col = shape.back(); - if (shape.size() >= 2) { - row = data_len / col; - } - T* cpu_data_ptr = &cpu_data.front(); - for (int i = 0; i < row; i++) { - for (int j = 0; j < col; j++) { - std::cout << *(cpu_data_ptr + i * col + j) << " "; - } - std::cout << std::endl; - } -} -template -void print_cpu_data(const T* data, std::vector shape, std::string name) { - int data_len = vector_prod(shape); - std::cout << name; - std::cout << " shape:"; - for (auto i : shape) { - std::cout << i << " "; - } - std::cout << std::endl; - int row = 1; - int col = shape.back(); - if (shape.size() >= 2) { - row = data_len / col; - } - for (int i = 0; i < row; i++) { - for (int j = 0; j < col; j++) { - std::cout << *(data + i * col + j) << " "; - } - std::cout << std::endl; - } -} - -template -void print_vec(const std::vector& data, const std::string& data_name) { - int len = static_cast(data.size()); - T sum = std::accumulate(data.begin(), data.end(), 0); - std::cout << "DEBUG: data_name is " << data_name << ", len=" << len - << ", sum=" << sum << ", "; - for (int i = 0; i < len - 1; ++i) { - std::cout << data[i] << ", "; - } - std::cout << data[len - 1] << std::endl; -} - -#define INSTANTIATION_PRINT(T) \ - template void print_vec(const std::vector&, const std::string&); \ - template void print_cpu_data(const T*, std::vector, \ - std::string name); \ - template void print_xpu_data(api::Context * ctx, const T*, \ - std::vector, std::string); \ - template void print_xpu_data_all(api::Context * ctx, const T*, \ - std::vector shape, std::string); - -INSTANTIATION_PRINT(int); -INSTANTIATION_PRINT(int16_t); -INSTANTIATION_PRINT(int8_t); -INSTANTIATION_PRINT(float); -INSTANTIATION_PRINT(float16); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/xpu_util.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/xpu_util.h deleted file mode 100644 index e0b02dc6004f9a17a789fccc101a0ac16fd3cedb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/kunlun/xpu/xpu_util.h +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (c) 2022 KUNLUNXIN Inc. -// 2022 Han Qi (qihan@baidu.com) -// Hehe Pan (panhehe@baidu.com) -// Zikui Yan (yanzikui@baidu.com) -// Chaolin Li (lichaolin@baidu.com) -// All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "xpu/runtime.h" -#include "xpu/xdnn.h" - -#pragma once -namespace api = baidu::xpu::api; -template -class XPUQunatData { - public: - XPUQunatData() : data_(nullptr), max_ptr_(nullptr) {} - XPUQunatData(T* data, float* max_ptr) : data_(data), max_ptr_(max_ptr) {} - T* data_{nullptr}; - float* max_ptr_{nullptr}; -}; - -int vector_prod(std::vector shape); -void add_separator_when_necessary(std::string& str); // NOLINT - -template -void conformer_test(const std::string& data_dir, const std::string& params_dir, - int threads_number, int dev_id); - -template -std::vector Split(const std::string& str, const std::string& separator); - -std::unordered_map get_weights_lens( - const std::string& file_path); -std::unordered_map> get_weights_shape( - const std::string& file_path); - -template -std::vector get_cpu_data(const std::string& file_path, int len); - -template -T* get_xpu_data(const std::string& file_path, int len); - -template -T* get_xpu_data(const std::string& data_name, const std::vector& cpu_data); - -template -XPUQunatData get_xpu_quant_data(const std::string& file_path, int len); - -template -XPUQunatData get_xpu_quant_data(const std::string& data_name, - const std::vector& cpu_data); - -std::vector get_all_ids(const std::string& dir_in); - -void get_ln_param(const std::unordered_map& weights_len_info, - const std::string& params_dir, - const std::string& ln_name_prefix, - const float*& ln_scale, // NOLINT - const float*& ln_bias); // NOLINT - -template -void print_vec(const std::vector& data, const std::string& data_name); -template -void print_cpu_data(const T* data, std::vector shape, std::string name); -template -void print_xpu_data(api::Context* ctx, const T* data, std::vector shape, - std::string name); -template -void print_xpu_data_all(api::Context* ctx, const T* data, - std::vector shape, std::string name); - -#define CHECK_RET(ret) \ - if ((ret) != 0) { \ - std::cout << "ERR" << __FILE__ << ":" << __LINE__ \ - << ", check failed, ret != 0" << std::endl; \ - std::exit(1); \ - } -#define WRAPPER_CHECK_CTX(ctx) \ - if (ctx == nullptr) { \ - return api::INVALID_PARAM; \ - } -#define WRAPPER_ASSERT_GT(ctx, expra, exprb) \ - if (!((expra) > (exprb))) { \ - return api::INVALID_PARAM; \ - } -#define WRAPPER_ASSERT_SUCCESS(ctx, ret) \ - if (!((ret) == api::SUCCESS)) { \ - return ret; \ - } diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/CMakeLists.txt deleted file mode 100644 index 6223e1481e7e98846d9de3535ec510b41c237d48..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/CMakeLists.txt +++ /dev/null @@ -1,81 +0,0 @@ -cmake_minimum_required(VERSION 3.14 FATAL_ERROR) - -project(wenet VERSION 0.1) - -option(CXX11_ABI "whether to use CXX11_ABI libtorch" OFF) -option(GRAPH_TOOLS "whether to build TLG graph tools" OFF) 
-option(BUILD_TESTING "whether to build unit test" ON) - -option(GRPC "whether to build with gRPC" OFF) -# TODO(Binbin Zhang): Change websocket to OFF since it depends on boost -# which is a very big library -option(WEBSOCKET "whether to build with websocket" ON) -option(HTTP "whether to build with http" OFF) -option(TORCH "whether to build with Torch" ON) -option(ONNX "whether to build with ONNX" OFF) -option(GPU "whether to build with GPU" OFF) - -set(CMAKE_VERBOSE_MAKEFILE OFF) - -include(FetchContent) -set(FETCHCONTENT_QUIET OFF) -get_filename_component(fc_base "fc_base" REALPATH BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") -set(FETCHCONTENT_BASE_DIR ${fc_base}) - -list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) - -if(NOT MSVC) - # Keep the same with openfst, -fPIC or -fpic - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -pthread -fPIC") -else() - set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) - add_compile_options("$<$:/utf-8>") -endif() - -# Include all dependency -if(TORCH) - include(libtorch) -endif() -if(ONNX) - include(onnx) -endif() -include(openfst) -include_directories( - ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/kaldi -) - -# Build all libraries -add_subdirectory(utils) -add_subdirectory(frontend) -add_subdirectory(post_processor) -add_subdirectory(kaldi) # kaldi: wfst based decoder -add_subdirectory(decoder) -add_subdirectory(api) - -# Optionally, you can build with websocket -if(WEBSOCKET) - include(boost) - add_subdirectory(websocket) -endif() - -# Optionally, you can build with gRPC -if(GRPC) - include(grpc) - add_subdirectory(grpc) -endif() - -# Optionally, you can build with http -if(HTTP) - include(boost) - add_subdirectory(http) -endif() - -# Build all bins -add_subdirectory(bin) - -# Unit Test -if(BUILD_TESTING) - include(gtest) - add_subdirectory(test) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/README.md b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/README.md deleted file mode 100644 index 92f1727f6e4f336ded62de398ae1907e07c27067..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/README.md +++ /dev/null @@ -1,179 +0,0 @@ -# WeNet Server (x86) ASR Demo - -**[中文版:x86 平台上使用 WeNet 进行语音识别](./README_CN.md)** - -## Run with Prebuilt Docker - -* Step 1. Download pretrained model(see the following link) or prepare your trained model. - -[中文(WenetSpeech)](https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/wenetspeech/wenetspeech_u2pp_conformer_libtorch.tar.gz) -| [English(GigaSpeech)](https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/gigaspeech/gigaspeech_u2pp_conformer_libtorch.tar.gz) - - -* Step 2. Start docker websocket server. Here is a demo. - -``` sh -model_dir=$PWD/20210602_u2++_conformer_libtorch # absolute path -docker run --rm -it -p 10086:10086 -v $model_dir:/home/wenet/model wenetorg/wenet-mini:latest bash /home/run.sh -``` - -* Step 3. Test with web browser. Open runtime/libtorch/web/templates/index.html in the browser directly, input your `WebSocket URL`, it will request some permissions, and start to record to test, as the following graph shows. - -![Runtime web](../../docs/images/runtime_web.png) - -## Run in Docker Build - -We recommend using the docker environment to build the c++ binary to avoid -system and environment problems. - -* Step 1. Build your docker image. - -``` sh -cd docker -docker build --no-cache -t wenet:latest . -``` - -* Step 2. 
Put all the resources, like model, test wavs into a docker resource dir. - -``` sh -mkdir -p docker_resource -cp -r docker_resource/model -cp docker_resource/test.wav -``` - -* Step 3. Start docker container. -``` sh -docker run --rm -v $PWD/docker_resource:/home/wenet/runtime/libtorch/docker_resource -it wenet bash -``` - -* Step 4. Testing in docker container -``` -cd /home/wenet/runtime/libtorch -export GLOG_logtostderr=1 -export GLOG_v=2 -wav_path=docker_resource/test.wav -model_dir=docker_resource/model -./build/bin/decoder_main \ - --chunk_size -1 \ - --wav_path $wav_path \ - --model_path $model_dir/final.zip \ - --unit_path $model_dir/units.txt 2>&1 | tee log.txt -``` - -Or you can do the WebSocket server/client testing as described in the `WebSocket` section. - -## Run with Local Build - -* Step 1. Download or prepare your pretrained model. - -* Step 2. Build. The build requires cmake 3.14 or above. For building, please first change to `wenet/runtime/libtorch` as your build directory, then type: - -``` sh -mkdir build && cd build && cmake .. && cmake --build . -``` - -For building with GPU, you should turn on `GPU`: - -``` sh -mkdir build && cd build && cmake -DGPU=ON .. && cmake --build . -``` - -* Step 3. Testing, the RTF(real time factor) is shown in the console. - -``` sh -export GLOG_logtostderr=1 -export GLOG_v=2 -wav_path=your_test_wav_path -model_dir=your_model_dir -./build/bin/decoder_main \ - --chunk_size -1 \ - --wav_path $wav_path \ - --model_path $model_dir/final.zip \ - --unit_path $model_dir/units.txt 2>&1 | tee log.txt -``` - - -## Advanced Usage - -### WebSocket - -* Step 1. Download or prepare your pretrained model. -* Step 2. Build as in `Run with Local Build` -* Step 3. Start WebSocket server. - -``` sh -export GLOG_logtostderr=1 -export GLOG_v=2 -model_dir=your_model_dir -./build/bin/websocket_server_main \ - --port 10086 \ - --chunk_size 16 \ - --model_path $model_dir/final.zip \ - --unit_path $model_dir/units.txt 2>&1 | tee server.log -``` -* Step 4. Start WebSocket client. - -```sh -export GLOG_logtostderr=1 -export GLOG_v=2 -wav_path=your_test_wav_path -./build/websocket_client_main \ - --hostname 127.0.0.1 --port 10086 \ - --wav_path $wav_path 2>&1 | tee client.log -``` - -You can also start WebSocket client by web browser as described before. - -Here is a demo for command line based websocket server/client interaction. - -![Runtime server demo](../../../docs/images/runtime_server.gif) - -### gRPC - -Why grpc? You may find your answer in https://grpc.io/. -Please follow the following steps to try gRPC. - -* Step 1. Download or prepare your pretrained model. -* Step 2. Build -``` sh -mkdir build && cd build && cmake -DGRPC=ON .. && cmake --build . -``` -* Step 3. Start gRPC server - -``` sh -export GLOG_logtostderr=1 -export GLOG_v=2 -model_dir=your_model_dir -./build/bin/grpc_server_main \ - --port 10086 \ - --workers 4 \ - --chunk_size 16 \ - --model_path $model_dir/final.zip \ - --unit_path $model_dir/units.txt 2>&1 | tee server.log -``` - -* Step 4. Start gRPC client. - -```sh -export GLOG_logtostderr=1 -export GLOG_v=2 -wav_path=your_test_wav_path -./build/bin/grpc_client_main \ - --hostname 127.0.0.1 --port 10086 \ - --wav_path $wav_path 2>&1 | tee client.log -``` - -### http - -* Step 1. Download or prepare your pretrained model. -* Step 2. Build -``` sh -mkdir build && cd build && cmake -DHTTP=ON .. && cmake --build . -``` -* Step 3. Start http server - -simply replace grpc_server_main with http_server_main of Step 3 in gRPC - -* Step 4. 
Start http client. - -simply replace grpc_client_main with http_client_main of Step 4 in gRPC diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/README_CN.md b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/README_CN.md deleted file mode 100644 index ee74968bd3693357fe3d29a8ebda495b6ccca11c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/README_CN.md +++ /dev/null @@ -1,172 +0,0 @@ -# x86 平台上使用 WeNet 进行语音识别 - -Wenet 基于 pytorch 框架进行语音识别模型训练,而在使用训练好的 Wenet 模型进行真实场景的语音识别任务时,需要更高效的执行效率和一些外围组件。因此我们提供了一套基于 C++ 实现的 Wenet 的语音识别工具和在线服务。 - - -## 使用docker启动语音识别服务 - -最简单的使用 Wenet 的方式是通过官方提供的 docker 镜像 `wenetorg/wenet:mini` 来启动服务。 - -下面的命令先下载官方提供的预训练模型,并启动 docker 服务,加载模型,提供 websocket 协议的语音识别服务。 -``` sh -cd wenet/runtime/libtorch -wget https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/aishell/20210601_u2%2B%2B_conformer_libtorch.tar.gz -tar -xf 20210601_u2++_conformer_libtorch.tar.gz -model_dir=$PWD/20210601_u2++_conformer_libtorch -docker run --rm -it -p 10086:10086 -v $model_dir:/home/wenet/model wenetorg/wenet-mini:latest bash /home/run.sh -``` - -`$model_dir` 是模型在本机的目录,将被映射到容器的 `/home/wenet/model` 目录,然后启动 web 服务。 - -**实时识别** - -使用浏览器打开文件`web/templates/index.html`,在 `WebSocket URL:`填入 `ws://127.0.0.1:10086`, 允许浏览器弹出的请求使用麦克风,即可通过麦克风进行实时语音识别。 - -![Runtime web](../../../docs/images/runtime_web.png) - -## 自行编译运行时程序 - -如果想使用非 docker 方式,需要自行编译。Wenet 支持 linux/macos/windows 三种平台上的编译。需要安装 cmake 3.14 或者更高版本。 - -运行如下命令,完成编译。 - -``` sh -# 当前目录为 wenet/runtime/libtorch -mkdir build && cd build && cmake .. && cmake --build . -``` -或者使用命令编译以支持 gRPC。 - -``` sh -mkdir build && cd build && cmake -DGRPC=ON .. && cmake --build . -``` -或者使用命令编译以支持 GPU。 -``` sh -mkdir build && cd build && cmake -DGPU=ON .. && cmake --build . -``` - -编译好的可执行程序在 `wenet/runtime/libtorch/build` 下: - -* decoder_main 本地文件识别工具 -* websocket_server_main 基于websocket协议的识别服务端 -* websocket_client_main 基于websocket协议的识别客户端 - - -下载预训练模型 - -``` sh -# 当前目录为 wenet/runtime/libtorch -wget https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/aishell/20210601_u2%2B%2B_conformer_libtorch.tar.gz -tar -xf 20210601_u2++_conformer_libtorch.tar.gz -``` - -## 本地wav文件识别 - -本地文件识别,即程序每次运行时,给定一个语音文件或者一组语音文件列表,输出识别结果,然后结束程序。 - -下载好模型后,执行如下的命令进行本地wav文件识别,将 `wav_path` 设为你想测试的 wav 文件地址,将 `model_dir` 设为你的模型目录地址。 - -``` sh -# 当前目录为 wenet/runtime/libtorch -# 已经下载并解压20210602_unified_transformer_server.tar.gz到当前目录 -# 准备好一个16k采样率,单通道,16bits的音频文件test.wav - -export GLOG_logtostderr=1 -export GLOG_v=2 -wav_path=test.wav -model_dir=./20210602_unified_transformer_server -./build/bin/decoder_main \ - --chunk_size -1 \ - --wav_path $wav_path \ - --model_path $model_dir/final.zip \ - --unit_path $model_dir/units.txt 2>&1 | tee log.txt -``` - -`decoder_main`工具支持两种wav文件模式: - * 使用`--wav_path`指定单个文件,一次识别单个wav文件。 - * 使用`--wav_scp`指定一个.scp格式的wav列表,一次识别多个wav文件。 - -执行 `./build/bin/decoder_main --help` 可以了解更多的参数意义。 - -## 基于websocket的在线识别服务 - -在线识别服务,即程序运行后会常驻在内存中,等待客户端的请求,对于客户端发来的语音数据进行识别,将识别文本返回给客户端。 - -在这个示例中,需要先启动服务端程序,然后再启动客户端发送请求。 - -### 启动websocket识别服务端 - -执行如下指令,将 `model_dir` 设置为你的模型目录地址。 - -``` sh -export GLOG_logtostderr=1 -export GLOG_v=2 -model_dir=./20210602_unified_transformer_server -./build/bin/websocket_server_main \ - --port 10086 \ - --chunk_size 16 \ - --model_path $model_dir/final.zip \ - --unit_path $model_dir/units.txt 2>&1 | tee server.log -``` - -上述服务启动后,会监听 10086 端口。若想使用其他端口,请修改 `--port` 对应的参数. 
- -### websocket 识别客户端 - -客户端按 websocket 协议去请求服务,可以用不同语言来实现客户端。我们提供了两种客户端,一种是基于 C++ 的命令行工具。一种是基于网页形式的可视化客户端。 - -**命令行 websocket 客户端** - -打开一个新的命令行窗口,运行如下指令,启动客户端。可将 `wav_path` 设为你想测试的 wav 文件地址。 - -```sh -export GLOG_logtostderr=1 -export GLOG_v=2 -wav_path=test.wav -./build/bin/websocket_client_main \ - --hostname 127.0.0.1 --port 10086 \ - --wav_path $wav_path 2>&1 | tee client.log -``` - -该程序会模拟语音数据的真实时间进行流式请求,即 10 秒的语音会按 10 秒时间发送完。可以在客户端和服务器端看到流式识别过程输出的信息。 - -![Runtime server demo](../../../docs/images/runtime_server.gif) - -注意 `--port` 需要设置为服务端使用的端口号。 - -如果有两台机器,也可以在一台机器上运行服务端,在另一台机器运行客户端,此时 `--hostname` 要指定为服务端所在机器的可访问 ip。 - -**网页版 websocket 客户端** - -网页版客户端支持麦克风的语音输入。 - -使用浏览器打开文件 `web/templates/index.html`, 在 `Websoket URL` 里设置 websoket 识别服务的地址,比如 `ws://localhost:10086`, 点击开始识别。 - -**时延信息计算** - -`server.log` 文件中记录了每次请求的时延,可以通过如下命令得到所有请求的平均时延。 - -``` sh -grep "Rescoring cost latency" server.log | awk '{sum += $NF}; END {print sum/NR}' -``` - -## 在 Docker 环境中使用 - -如果遇到问题比如无法编译,我们提供了 docker 镜像用于直接执行示例。需要先安装好 docker,运行如下命令,进入 docker 容器环境。 - -``` sh -docker run --rm -it mobvoiwenet/wenet:latest bash -``` - -该镜像包含了编译过程中所依赖的所有第三方库、编译好的文件和预训练模型。 - -预训练模型在 `/home/model` 目录, 可执行程序在 `/home/wenet/runtime/libtorch/build` 目录。 - -### 构建 Docker 镜像 - -我们也提供了 Dockerfile,可以自己构建 docker 镜像,参考 `docker/Dockerfile` 文件。 - -``` sh -cd docker -docker build --no-cache -t wenet:latest . -docker run --rm -it wenet bash -``` diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/api/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/api/CMakeLists.txt deleted file mode 100644 index 8d61ca8477f0f0b6128f1effe0a2738494b2620f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/api/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -if(TORCH) - add_library(wenet_api SHARED wenet_api.cc) - target_link_libraries(wenet_api PUBLIC decoder) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/api/README.md b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/api/README.md deleted file mode 100644 index 5eaa13b977eb4836eb930452f4434dc9f2ea4139..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/api/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# WeNet API - -We refer [vosk](https://github.com/alphacep/vosk-api/blob/master/src/vosk_api.h) -for the interface design. - - -We are going to implement the following interfaces: - -- [x] non-streaming recognition -- [] streaming recognition -- [] nbest -- [] contextual biasing word -- [] alignment -- [] language support(post processor) -- [] label check diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/api/wenet_api.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/api/wenet_api.cc deleted file mode 100644 index cb1e0c8552e0126e2db274a29075578fe351a25f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/api/wenet_api.cc +++ /dev/null @@ -1,245 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "api/wenet_api.h" - -#include -#include -#include - -#include "decoder/asr_decoder.h" -#include "decoder/torch_asr_model.h" -#include "post_processor/post_processor.h" -#include "utils/file.h" -#include "utils/json.h" -#include "utils/string.h" - -class Recognizer { - public: - explicit Recognizer(const std::string& model_dir) { - // FeaturePipeline init - feature_config_ = std::make_shared(80, 16000); - feature_pipeline_ = - std::make_shared(*feature_config_); - // Resource init - resource_ = std::make_shared(); - wenet::TorchAsrModel::InitEngineThreads(); - std::string model_path = wenet::JoinPath(model_dir, "final.zip"); - CHECK(wenet::FileExists(model_path)); - - auto model = std::make_shared(); - model->Read(model_path); - resource_->model = model; - - // units.txt: E2E model unit - std::string unit_path = wenet::JoinPath(model_dir, "units.txt"); - CHECK(wenet::FileExists(unit_path)); - resource_->unit_table = std::shared_ptr( - fst::SymbolTable::ReadText(unit_path)); - - std::string fst_path = wenet::JoinPath(model_dir, "TLG.fst"); - if (wenet::FileExists(fst_path)) { // With LM - resource_->fst = std::shared_ptr>( - fst::Fst::Read(fst_path)); - - std::string symbol_path = wenet::JoinPath(model_dir, "words.txt"); - CHECK(wenet::FileExists(symbol_path)); - resource_->symbol_table = std::shared_ptr( - fst::SymbolTable::ReadText(symbol_path)); - } else { // Without LM, symbol_table is the same as unit_table - resource_->symbol_table = resource_->unit_table; - } - - // Context config init - context_config_ = std::make_shared(); - decode_options_ = std::make_shared(); - post_process_opts_ = std::make_shared(); - } - - void Reset() { - if (feature_pipeline_ != nullptr) { - feature_pipeline_->Reset(); - } - if (decoder_ != nullptr) { - decoder_->Reset(); - } - result_.clear(); - } - - void InitDecoder() { - CHECK(decoder_ == nullptr); - // Optional init context graph - if (context_.size() > 0) { - context_config_->context_score = context_score_; - auto context_graph = - std::make_shared(*context_config_); - context_graph->BuildContextGraph(context_, resource_->symbol_table); - resource_->context_graph = context_graph; - } - // PostProcessor - if (language_ == "chs") { // TODO(Binbin Zhang): CJK(chs, jp, kr) - post_process_opts_->language_type = wenet::kMandarinEnglish; - } else { - post_process_opts_->language_type = wenet::kIndoEuropean; - } - resource_->post_processor = - std::make_shared(*post_process_opts_); - // Init decoder - decoder_ = std::make_shared(feature_pipeline_, resource_, - *decode_options_); - } - - void Decode(const char* data, int len, int last) { - using wenet::DecodeState; - // Init decoder when it is called first time - if (decoder_ == nullptr) { - InitDecoder(); - } - // Convert to 16 bits PCM data to float - CHECK_EQ(len % 2, 0); - feature_pipeline_->AcceptWaveform(reinterpret_cast(data), - len / 2); - if (last > 0) { - feature_pipeline_->set_input_finished(); - } - - while (true) { - DecodeState state = decoder_->Decode(false); - if (state == DecodeState::kWaitFeats) { - break; - } else if (state == DecodeState::kEndFeats) { - 
decoder_->Rescoring(); - UpdateResult(true); - break; - } else if (state == DecodeState::kEndpoint && continuous_decoding_) { - decoder_->Rescoring(); - UpdateResult(true); - decoder_->ResetContinuousDecoding(); - } else { // kEndBatch - UpdateResult(false); - } - } - } - - void UpdateResult(bool final_result) { - json::JSON obj; - obj["type"] = final_result ? "final_result" : "partial_result"; - int nbest = final_result ? nbest_ : 1; - obj["nbest"] = json::Array(); - for (int i = 0; i < nbest && i < decoder_->result().size(); i++) { - json::JSON one; - one["sentence"] = decoder_->result()[i].sentence; - if (final_result && enable_timestamp_) { - one["word_pieces"] = json::Array(); - for (const auto& word_piece : decoder_->result()[i].word_pieces) { - json::JSON piece; - piece["word"] = word_piece.word; - piece["start"] = word_piece.start; - piece["end"] = word_piece.end; - one["word_pieces"].append(piece); - } - } - one["sentence"] = decoder_->result()[i].sentence; - obj["nbest"].append(one); - } - result_ = obj.dump(); - } - - const char* GetResult() { return result_.c_str(); } - - void set_nbest(int n) { nbest_ = n; } - void set_enable_timestamp(bool flag) { enable_timestamp_ = flag; } - void AddContext(const char* word) { context_.emplace_back(word); } - void set_context_score(float score) { context_score_ = score; } - void set_language(const char* lang) { language_ = lang; } - void set_continuous_decoding(bool flag) { continuous_decoding_ = flag; } - - private: - // NOTE(Binbin Zhang): All use shared_ptr for clone in the future - std::shared_ptr feature_config_ = nullptr; - std::shared_ptr feature_pipeline_ = nullptr; - std::shared_ptr resource_ = nullptr; - std::shared_ptr decode_options_ = nullptr; - std::shared_ptr decoder_ = nullptr; - std::shared_ptr context_config_ = nullptr; - std::shared_ptr post_process_opts_ = nullptr; - - int nbest_ = 1; - std::string result_; - bool enable_timestamp_ = false; - std::vector context_; - float context_score_; - std::string language_ = "chs"; - bool continuous_decoding_ = false; -}; - -void* wenet_init(const char* model_dir) { - Recognizer* decoder = new Recognizer(model_dir); - return reinterpret_cast(decoder); -} - -void wenet_free(void* decoder) { - delete reinterpret_cast(decoder); -} - -void wenet_reset(void* decoder) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->Reset(); -} - -void wenet_decode(void* decoder, const char* data, int len, int last) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->Decode(data, len, last); -} - -const char* wenet_get_result(void* decoder) { - Recognizer* recognizer = reinterpret_cast(decoder); - return recognizer->GetResult(); -} - -void wenet_set_log_level(int level) { - FLAGS_logtostderr = true; - FLAGS_v = level; -} - -void wenet_set_nbest(void* decoder, int n) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->set_nbest(n); -} - -void wenet_set_timestamp(void* decoder, int flag) { - Recognizer* recognizer = reinterpret_cast(decoder); - bool enable = flag > 0 ? 
true : false; - recognizer->set_enable_timestamp(enable); -} - -void wenet_add_context(void* decoder, const char* word) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->AddContext(word); -} - -void wenet_set_context_score(void* decoder, float score) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->set_context_score(score); -} - -void wenet_set_language(void* decoder, const char* lang) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->set_language(lang); -} - -void wenet_set_continuous_decoding(void* decoder, int flag) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->set_continuous_decoding(flag > 0); -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/api/wenet_api.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/api/wenet_api.h deleted file mode 100644 index e839aaa40166a6e50d9aa2ac0e697356bd25b941..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/api/wenet_api.h +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef API_WENET_API_H_ -#define API_WENET_API_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -/** Init decoder from the file and returns the object - * - * @param model_dir: the model dir - * @returns model object or NULL if problem occured - */ -void* wenet_init(const char* model_dir); - -/** Free wenet decoder and corresponding resource - */ -void wenet_free(void* decoder); - -/** Reset decoder for next decoding - */ -void wenet_reset(void* decoder); - -/** Decode the input wav data - * @param data: pcm data, encoded as int16_t(16 bits) - * @param len: data length - * @param last: if it is the last package - */ -void wenet_decode(void* decoder, const char* data, int len, int last); - -/** Get decode result in json format - * It returns partial result when last is 0 - * It returns final result when last is 1 - - { - "nbest" : [{ - "sentence" : "are you okay" - "word_pieces" : [{ - "end" : 960, - "start" : 0, - "word" : "are" - }, { - "end" : 1200, - "start" : 960, - "word" : "you" - }, { - ...}] - }, { - "sentence" : "are you ok" - }], - "type" : "final_result" - } - - "type": final_result/partial_result - "nbest": nbest is enabled when n > 1 in final_result - "sentence": the ASR result - "word_pieces": optional, output timestamp when enabled - */ -const char* wenet_get_result(void* decoder); - -/** Set n-best, range 1~10 - * wenet_get_result will return top-n best results - */ -void wenet_set_nbest(void* decoder, int n); - -/** Whether to enable word level timestamp in results - disable it when flag = 0, otherwise enable - */ -void wenet_set_timestamp(void* decoder, int flag); - -/** Add one contextual biasing - */ -void wenet_add_context(void* decoder, const char* word); - -/** Set contextual biasing bonus score - */ -void wenet_set_context_score(void* decoder, float score); 
- -/** Set language, has effect on the postpocessing - * @param: lang, could be chs/en now - */ -void wenet_set_language(void* decoder, const char* lang); - -/** Set log level - * We use glog in wenet, so the level is the glog level - */ -void wenet_set_log_level(int level); - -/** Enable continous decoding or not - * flag > 0: enable, otherwise disable - */ -void wenet_set_continuous_decoding(void* decoder, int flag); - -#ifdef __cplusplus -} -#endif - -#endif // API_WENET_API_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/CMakeLists.txt deleted file mode 100644 index a117b8bcb580c8738a7ce72f88bc10ff0a450e98..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -add_executable(decoder_main decoder_main.cc) -target_link_libraries(decoder_main PUBLIC decoder) - -add_executable(label_checker_main label_checker_main.cc) -target_link_libraries(label_checker_main PUBLIC decoder) - -# if(TORCH) -# add_executable(api_main api_main.cc) -# target_link_libraries(api_main PUBLIC wenet_api) -# endif() - -if(WEBSOCKET) - add_executable(websocket_client_main websocket_client_main.cc) - target_link_libraries(websocket_client_main PUBLIC websocket) - add_executable(websocket_server_main websocket_server_main.cc) - target_link_libraries(websocket_server_main PUBLIC websocket) -endif() - -if(GRPC) - add_executable(grpc_server_main grpc_server_main.cc) - target_link_libraries(grpc_server_main PUBLIC wenet_grpc) - add_executable(grpc_client_main grpc_client_main.cc) - target_link_libraries(grpc_client_main PUBLIC wenet_grpc) -endif() - -if(HTTP) - add_executable(http_client_main http_client_main.cc) - target_link_libraries(http_client_main PUBLIC http) - add_executable(http_server_main http_server_main.cc) - target_link_libraries(http_server_main PUBLIC http) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/api_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/api_main.cc deleted file mode 100644 index 94b20d52a7b8eee5c39a12af4e1e25324d7d880f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/api_main.cc +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
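The `api_main.cc` tool whose body follows feeds a whole wav file to `wenet_decode` in one call with `last = 1`. For streaming input the same C API is meant to be called chunk by chunk, with contextual biasing configured before the first decode call (the internal decoder is created lazily on first use, as `Recognizer::InitDecoder` above shows). The sketch below is illustrative only; the model path, chunk size, and audio source are assumptions, not part of these sources.

```cpp
// Hypothetical streaming client for the C API declared in wenet_api.h.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

#include "api/wenet_api.h"

int main() {
  void* decoder = wenet_init("/path/to/model_dir");   // assumed model dir
  wenet_set_nbest(decoder, 3);
  // Biasing words must be registered before the first wenet_decode() call.
  wenet_add_context(decoder, "wenet");
  wenet_set_context_score(decoder, 3.0f);

  // Pretend we receive 16 kHz, 16-bit PCM in chunks from some audio source.
  std::vector<std::vector<int16_t>> chunks = /* audio source */ {};
  for (std::size_t i = 0; i < chunks.size(); ++i) {
    int last = (i + 1 == chunks.size()) ? 1 : 0;
    wenet_decode(decoder,
                 reinterpret_cast<const char*>(chunks[i].data()),
                 static_cast<int>(chunks[i].size() * sizeof(int16_t)),  // bytes
                 last);
    // Partial JSON while last == 0, final JSON once last == 1.
    std::cout << wenet_get_result(decoder) << std::endl;
  }
  wenet_reset(decoder);  // ready for the next utterance
  wenet_free(decoder);
  return 0;
}
```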
- -#include "api/wenet_api.h" -#include "frontend/wav.h" -#include "utils/flags.h" - -DEFINE_string(model_dir, "", "model dir path"); -DEFINE_string(wav_path, "", "single wave path"); -DEFINE_bool(enable_timestamp, false, "enable timestamps"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - wenet_set_log_level(2); - - void* decoder = wenet_init(FLAGS_model_dir.c_str()); - wenet_set_timestamp(decoder, FLAGS_enable_timestamp == true ? 1 : 0); - wenet::WavReader wav_reader(FLAGS_wav_path); - std::vector data(wav_reader.num_samples()); - for (int i = 0; i < wav_reader.num_samples(); i++) { - data[i] = static_cast(*(wav_reader.data() + i)); - } - - for (int i = 0; i < 10; i++) { - // Return the final result when last is 1 - wenet_decode(decoder, reinterpret_cast(data.data()), - data.size() * 2, 1); - const char* result = wenet_get_result(decoder); - LOG(INFO) << i << " " << result; - wenet_reset(decoder); - } - wenet_free(decoder); - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/decoder_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/decoder_main.cc deleted file mode 100644 index b8f1dbae6b88390504cc9ce63f33dc9bd54a2d6a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/decoder_main.cc +++ /dev/null @@ -1,185 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include - -#include "decoder/params.h" -#include "frontend/wav.h" -#include "utils/flags.h" -#include "utils/string.h" -#include "utils/thread_pool.h" -#include "utils/timer.h" -#include "utils/utils.h" - -DEFINE_bool(simulate_streaming, false, "simulate streaming input"); -DEFINE_bool(output_nbest, false, "output n-best of decode result"); -DEFINE_string(wav_path, "", "single wave path"); -DEFINE_string(wav_scp, "", "input wav scp"); -DEFINE_string(result, "", "result output file"); -DEFINE_bool(continuous_decoding, false, "continuous decoding mode"); -DEFINE_int32(thread_num, 1, "num of decode thread"); -DEFINE_int32(warmup, 0, "num of warmup decode, 0 means no warmup"); - -std::shared_ptr g_decode_config; -std::shared_ptr g_feature_config; -std::shared_ptr g_decode_resource; - -std::ofstream g_result; -std::mutex g_mutex; -int g_total_waves_dur = 0; -int g_total_decode_time = 0; - -void decode(std::pair wav, bool warmup = false) { - wenet::WavReader wav_reader(wav.second); - int num_samples = wav_reader.num_samples(); - CHECK_EQ(wav_reader.sample_rate(), FLAGS_sample_rate); - - auto feature_pipeline = - std::make_shared(*g_feature_config); - feature_pipeline->AcceptWaveform(wav_reader.data(), num_samples); - feature_pipeline->set_input_finished(); - LOG(INFO) << "num frames " << feature_pipeline->num_frames(); - - wenet::AsrDecoder decoder(feature_pipeline, g_decode_resource, - *g_decode_config); - - int wave_dur = static_cast(static_cast(num_samples) / - wav_reader.sample_rate() * 1000); - int decode_time = 0; - std::string final_result; - while (true) { - wenet::Timer timer; - wenet::DecodeState state = decoder.Decode(); - if (state == wenet::DecodeState::kEndFeats) { - decoder.Rescoring(); - } - int chunk_decode_time = timer.Elapsed(); - decode_time += chunk_decode_time; - if (decoder.DecodedSomething()) { - LOG(INFO) << "Partial result: " << decoder.result()[0].sentence; - } - - if (FLAGS_continuous_decoding && state == wenet::DecodeState::kEndpoint) { - if (decoder.DecodedSomething()) { - decoder.Rescoring(); - LOG(INFO) << "Final result (continuous decoding): " - << decoder.result()[0].sentence; - final_result.append(decoder.result()[0].sentence); - } - decoder.ResetContinuousDecoding(); - } - - if (state == wenet::DecodeState::kEndFeats) { - break; - } else if (FLAGS_chunk_size > 0 && FLAGS_simulate_streaming) { - float frame_shift_in_ms = - static_cast(g_feature_config->frame_shift) / - wav_reader.sample_rate() * 1000; - auto wait_time = - decoder.num_frames_in_current_chunk() * frame_shift_in_ms - - chunk_decode_time; - if (wait_time > 0) { - LOG(INFO) << "Simulate streaming, waiting for " << wait_time << "ms"; - std::this_thread::sleep_for( - std::chrono::milliseconds(static_cast(wait_time))); - } - } - } - if (decoder.DecodedSomething()) { - final_result.append(decoder.result()[0].sentence); - } - LOG(INFO) << wav.first << " Final result: " << final_result << std::endl; - LOG(INFO) << "Decoded " << wave_dur << "ms audio taken " << decode_time - << "ms."; - - if (!warmup) { - g_mutex.lock(); - std::ostream& buffer = FLAGS_result.empty() ? 
std::cout : g_result; - if (!FLAGS_output_nbest) { - buffer << wav.first << " " << final_result << std::endl; - } else { - buffer << "wav " << wav.first << std::endl; - auto& results = decoder.result(); - for (auto& r : results) { - if (r.sentence.empty()) continue; - buffer << "candidate " << r.score << " " << r.sentence << std::endl; - } - } - g_total_waves_dur += wave_dur; - g_total_decode_time += decode_time; - g_mutex.unlock(); - } -} - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - g_decode_config = wenet::InitDecodeOptionsFromFlags(); - g_feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - g_decode_resource = wenet::InitDecodeResourceFromFlags(); - - if (FLAGS_wav_path.empty() && FLAGS_wav_scp.empty()) { - LOG(FATAL) << "Please provide the wave path or the wav scp."; - } - std::vector> waves; - if (!FLAGS_wav_path.empty()) { - waves.emplace_back(make_pair("test", FLAGS_wav_path)); - } else { - std::ifstream wav_scp(FLAGS_wav_scp); - std::string line; - while (getline(wav_scp, line)) { - std::vector strs; - wenet::SplitString(line, &strs); - CHECK_GE(strs.size(), 2); - waves.emplace_back(make_pair(strs[0], strs[1])); - } - - if (waves.empty()) { - LOG(FATAL) << "Please provide non-empty wav scp."; - } - } - - if (!FLAGS_result.empty()) { - g_result.open(FLAGS_result, std::ios::out); - } - - // Warmup - if (FLAGS_warmup > 0) { - LOG(INFO) << "Warming up..."; - { - ThreadPool pool(FLAGS_thread_num); - auto wav = waves[0]; - for (int i = 0; i < FLAGS_warmup; i++) { - pool.enqueue(decode, wav, true); - } - } - LOG(INFO) << "Warmup done."; - } - - { - ThreadPool pool(FLAGS_thread_num); - for (auto& wav : waves) { - pool.enqueue(decode, wav, false); - } - } - - LOG(INFO) << "Total: decoded " << g_total_waves_dur << "ms audio taken " - << g_total_decode_time << "ms."; - LOG(INFO) << "RTF: " << std::setprecision(4) - << static_cast(g_total_decode_time) / g_total_waves_dur; - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/grpc_client_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/grpc_client_main.cc deleted file mode 100644 index f2d226d48d3757c5f095335eff3288f5d227282b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/grpc_client_main.cc +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
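`decoder_main.cc` above reports a real-time factor (RTF) at the end of the run: accumulated decoding time divided by accumulated audio duration, both in milliseconds. A minimal restatement of that bookkeeping, with made-up figures:

```cpp
#include <iostream>

// Mirrors the accounting in decoder_main.cc: each utterance adds its audio
// duration and its decode time to two global counters; RTF is their ratio.
int main() {
  // Hypothetical figures: 60 s of audio decoded in 4.8 s of wall-clock time.
  int total_waves_dur_ms = 60000;
  int total_decode_time_ms = 4800;
  double rtf = static_cast<double>(total_decode_time_ms) / total_waves_dur_ms;
  std::cout << "RTF: " << rtf << std::endl;  // 0.08, i.e. 12.5x faster than real time
  return 0;
}
```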
- -#include "frontend/wav.h" -#include "grpc/grpc_client.h" -#include "utils/flags.h" -#include "utils/timer.h" - -DEFINE_string(hostname, "127.0.0.1", "hostname of websocket server"); -DEFINE_int32(port, 10086, "port of websocket server"); -DEFINE_int32(nbest, 1, "n-best of decode result"); -DEFINE_string(wav_path, "", "test wav file path"); -DEFINE_bool(continuous_decoding, false, "continuous decoding mode"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - wenet::GrpcClient client(FLAGS_hostname, FLAGS_port, FLAGS_nbest, - FLAGS_continuous_decoding); - - wenet::WavReader wav_reader(FLAGS_wav_path); - const int sample_rate = 16000; - // Only support 16K - CHECK_EQ(wav_reader.sample_rate(), sample_rate); - const int num_samples = wav_reader.num_samples(); - std::vector pcm_data(wav_reader.data(), - wav_reader.data() + num_samples); - // Send data every 0.5 second - const float interval = 0.5; - const int sample_interval = interval * sample_rate; - for (int start = 0; start < num_samples; start += sample_interval) { - if (client.done()) { - break; - } - int end = std::min(start + sample_interval, num_samples); - // Convert to short - std::vector data; - data.reserve(end - start); - for (int j = start; j < end; j++) { - data.push_back(static_cast(pcm_data[j])); - } - // Send PCM data - client.SendBinaryData(data.data(), data.size() * sizeof(int16_t)); - VLOG(2) << "Send " << data.size() << " samples"; - std::this_thread::sleep_for( - std::chrono::milliseconds(static_cast(interval * 1000))); - } - wenet::Timer timer; - - client.Join(); - VLOG(2) << "Total latency: " << timer.Elapsed() << "ms."; - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/grpc_server_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/grpc_server_main.cc deleted file mode 100644 index b00f3cbade1ee70dadfb49829e9ca73fd50c2be2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/grpc_server_main.cc +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include - -#include "decoder/params.h" -#include "grpc/grpc_server.h" -#include "utils/log.h" - -DEFINE_int32(port, 10086, "grpc listening port"); -DEFINE_int32(workers, 4, "grpc num workers"); - -using grpc::Server; -using grpc::ServerBuilder; - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - auto decode_config = wenet::InitDecodeOptionsFromFlags(); - auto feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - auto decode_resource = wenet::InitDecodeResourceFromFlags(); - - wenet::GrpcServer service(feature_config, decode_config, decode_resource); - grpc::EnableDefaultHealthCheckService(true); - grpc::reflection::InitProtoReflectionServerBuilderPlugin(); - ServerBuilder builder; - std::string address("0.0.0.0:" + std::to_string(FLAGS_port)); - builder.AddListeningPort(address, grpc::InsecureServerCredentials()); - builder.RegisterService(&service); - builder.SetSyncServerOption(ServerBuilder::SyncServerOption::NUM_CQS, - FLAGS_workers); - std::unique_ptr server(builder.BuildAndStart()); - LOG(INFO) << "Listening at port " << FLAGS_port; - server->Wait(); - google::ShutdownGoogleLogging(); - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/http_client_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/http_client_main.cc deleted file mode 100644 index b59ee3f5f32bf08552416b183802029ac5d5afa5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/http_client_main.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2023 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "frontend/wav.h" -#include "utils/flags.h" -#include "utils/timer.h" -#include "http/http_client.h" - -DEFINE_string(hostname, "127.0.0.1", "hostname of http server"); -DEFINE_int32(port, 10086, "port of http server"); -DEFINE_int32(nbest, 1, "n-best of decode result"); -DEFINE_string(wav_path, "", "test wav file path"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - wenet::WavReader wav_reader(FLAGS_wav_path); - const int sample_rate = 16000; - // Only support 16K - CHECK_EQ(wav_reader.sample_rate(), sample_rate); - const int num_samples = wav_reader.num_samples(); - // Convert to short - std::vector data; - data.reserve(num_samples); - for (int j = 0; j < num_samples; j++) { - data.push_back(static_cast(wav_reader.data()[j])); - } - // Send data - wenet::HttpClient client(FLAGS_hostname, FLAGS_port); - client.set_nbest(FLAGS_nbest); - wenet::Timer timer; - VLOG(2) << "Send " << data.size() << " samples"; - client.SendBinaryData(data.data(), data.size() * sizeof(int16_t)); - VLOG(2) << "Total latency: " << timer.Elapsed() << "ms."; - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/http_server_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/http_server_main.cc deleted file mode 100644 index e30cf2bcdf746c2072f023e90f470ccba5467c2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/http_server_main.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2023 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "decoder/params.h" -#include "utils/log.h" -#include "http/http_server.h" - -DEFINE_int32(port, 10086, "http listening port"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - auto decode_config = wenet::InitDecodeOptionsFromFlags(); - auto feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - auto decode_resource = wenet::InitDecodeResourceFromFlags(); - - wenet::HttpServer server(FLAGS_port, feature_config, decode_config, - decode_resource); - LOG(INFO) << "Listening at port " << FLAGS_port; - server.Start(); - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/label_checker_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/label_checker_main.cc deleted file mode 100644 index e36e3d5c29a38a7ebee80606ebd8e69ae8b1eb96..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/label_checker_main.cc +++ /dev/null @@ -1,237 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include -#include -#include -#include - -#include "decoder/params.h" -#include "frontend/wav.h" -#include "utils/flags.h" -#include "utils/string.h" - -DEFINE_string(text, "", "kaldi style text input file"); -DEFINE_string(wav_scp, "", "kaldi style wav scp"); -DEFINE_double(is_penalty, 1.0, - "insertion/substitution penalty for align insertion"); -DEFINE_double(del_penalty, 1.0, "deletion penalty for align insertion"); -DEFINE_string(result, "", "result output file"); -DEFINE_string(timestamp, "", "timestamp output file"); - -namespace wenet { - -const char* kDeletion = ""; -// Is: Insertion and substitution -const char* kIsStart = ""; -const char* kIsEnd = ""; - -bool MapToLabel(const std::string& text, - std::shared_ptr symbol_table, - std::vector* labels) { - labels->clear(); - // Split label to char sequence - std::vector chars; - SplitUTF8StringToChars(text, &chars); - for (size_t i = 0; i < chars.size(); i++) { - // ▁ is special symbol for white space - std::string label = chars[i] != " " ? chars[i] : "▁"; - int id = symbol_table->Find(label); - if (id != -1) { // fst::kNoSymbol - // LOG(INFO) << label << " " << id; - labels->push_back(id); - } - } - return true; -} - -std::shared_ptr MakeSymbolTableForFst( - std::shared_ptr isymbol_table) { - LOG(INFO) << isymbol_table; - CHECK(isymbol_table != nullptr); - auto osymbol_table = std::make_shared(); - osymbol_table->AddSymbol("", 0); - CHECK_EQ(isymbol_table->Find(""), 0); - osymbol_table->AddSymbol("", 1); - for (int i = 1; i < isymbol_table->NumSymbols(); i++) { - std::string symbol = isymbol_table->Find(i); - osymbol_table->AddSymbol(symbol, i + 1); - } - osymbol_table->AddSymbol(kDeletion, isymbol_table->NumSymbols() + 1); - osymbol_table->AddSymbol(kIsStart, isymbol_table->NumSymbols() + 2); - osymbol_table->AddSymbol(kIsEnd, isymbol_table->NumSymbols() + 3); - return osymbol_table; -} - -void CompileCtcFst(std::shared_ptr symbol_table, - fst::StdVectorFst* ofst) { - ofst->DeleteStates(); - int start = ofst->AddState(); - ofst->SetStart(start); - CHECK_EQ(symbol_table->Find(""), 0); - CHECK_EQ(symbol_table->Find(""), 1); - ofst->AddArc(start, fst::StdArc(1, 0, 0.0, start)); - // Exclude kDeletion and kInsertion - for (int i = 2; i < symbol_table->NumSymbols() - 3; i++) { - int s = ofst->AddState(); - ofst->AddArc(start, fst::StdArc(i, i, 0.0, s)); - ofst->AddArc(s, fst::StdArc(i, 0, 0.0, s)); - ofst->AddArc(s, fst::StdArc(0, 0, 0.0, start)); - } - ofst->SetFinal(start, fst::StdArc::Weight::One()); - fst::ArcSort(ofst, fst::StdOLabelCompare()); -} - -void CompileAlignFst(std::vector labels, - std::shared_ptr symbol_table, - fst::StdVectorFst* ofst) { - ofst->DeleteStates(); - int deletion = symbol_table->Find(kDeletion); - int insertion_start = symbol_table->Find(kIsStart); - int insertion_end = symbol_table->Find(kIsEnd); - - int start = ofst->AddState(); - ofst->SetStart(start); - // Filler State - int filler_start = ofst->AddState(); - int filler_end = ofst->AddState(); - for (int i = 2; i < symbol_table->NumSymbols() - 3; i++) { - ofst->AddArc(filler_start, fst::StdArc(i, i, FLAGS_is_penalty, 
filler_end)); - } - ofst->AddArc(filler_end, fst::StdArc(0, 0, 0.0, filler_start)); - - int prev = start; - // Alignment path and optional filler - for (size_t i = 0; i < labels.size(); i++) { - int cur = ofst->AddState(); - // 1. Insertion or Substitution - ofst->AddArc(prev, fst::StdArc(0, insertion_start, 0.0, filler_start)); - ofst->AddArc(filler_end, fst::StdArc(0, insertion_end, 0.0, prev)); - // 2. Correct - ofst->AddArc(prev, fst::StdArc(labels[i], labels[i], 0.0, cur)); - // 3. Deletion - ofst->AddArc(prev, fst::StdArc(0, deletion, FLAGS_del_penalty, cur)); - - prev = cur; - } - // Optional add endding filler - ofst->AddArc(prev, fst::StdArc(0, insertion_start, 0.0, filler_start)); - ofst->AddArc(filler_end, fst::StdArc(0, insertion_end, 0.0, prev)); - ofst->SetFinal(prev, fst::StdArc::Weight::One()); - fst::ArcSort(ofst, fst::StdILabelCompare()); -} - -} // namespace wenet - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - auto decode_config = wenet::InitDecodeOptionsFromFlags(); - auto feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - auto decode_resource = wenet::InitDecodeResourceFromFlags(); - CHECK(decode_resource->unit_table != nullptr); - - auto wfst_symbol_table = - wenet::MakeSymbolTableForFst(decode_resource->unit_table); - // wfst_symbol_table->WriteText("fst.txt"); - // Reset symbol_table to on-the-fly generated wfst_symbol_table - decode_resource->symbol_table = wfst_symbol_table; - - // Compile ctc FST - fst::StdVectorFst ctc_fst; - wenet::CompileCtcFst(wfst_symbol_table, &ctc_fst); - // ctc_fst.Write("ctc.fst"); - - std::unordered_map wav_table; - std::ifstream wav_is(FLAGS_wav_scp); - std::string line; - while (std::getline(wav_is, line)) { - std::vector strs; - wenet::SplitString(line, &strs); - CHECK_EQ(strs.size(), 2); - wav_table[strs[0]] = strs[1]; - } - - std::ifstream text_is(FLAGS_text); - std::ofstream result_os(FLAGS_result, std::ios::out); - std::ofstream timestamp_out; - if (!FLAGS_timestamp.empty()) { - timestamp_out.open(FLAGS_timestamp, std::ios::out); - } - std::ostream& timestamp_os = - FLAGS_timestamp.empty() ? 
std::cout : timestamp_out; - - while (std::getline(text_is, line)) { - std::vector strs; - wenet::SplitString(line, &strs); - if (strs.size() < 2) continue; - std::string key = strs[0]; - LOG(INFO) << "Processing " << key; - if (wav_table.find(key) != wav_table.end()) { - strs.erase(strs.begin()); - std::string text = wenet::JoinString(" ", strs); - std::vector labels; - wenet::MapToLabel(text, wfst_symbol_table, &labels); - // Prepare FST for alignment decoding - fst::StdVectorFst align_fst; - wenet::CompileAlignFst(labels, wfst_symbol_table, &align_fst); - // align_fst.Write("align.fst"); - auto decoding_fst = std::make_shared(); - fst::Compose(ctc_fst, align_fst, decoding_fst.get()); - // decoding_fst->Write("decoding.fst"); - // Preapre feature pipeline - wenet::WavReader wav_reader; - if (!wav_reader.Open(wav_table[key])) { - LOG(WARNING) << "Error in reading " << wav_table[key]; - continue; - } - int num_samples = wav_reader.num_samples(); - CHECK_EQ(wav_reader.sample_rate(), FLAGS_sample_rate); - auto feature_pipeline = - std::make_shared(*feature_config); - feature_pipeline->AcceptWaveform(wav_reader.data(), num_samples); - feature_pipeline->set_input_finished(); - decode_resource->fst = decoding_fst; - LOG(INFO) << "num frames " << feature_pipeline->num_frames(); - wenet::AsrDecoder decoder(feature_pipeline, decode_resource, - *decode_config); - while (true) { - wenet::DecodeState state = decoder.Decode(); - if (state == wenet::DecodeState::kEndFeats) { - decoder.Rescoring(); - break; - } - } - std::string final_result; - std::string timestamp_str; - if (decoder.DecodedSomething()) { - const wenet::DecodeResult& result = decoder.result()[0]; - final_result = result.sentence; - std::stringstream ss; - for (const auto& w : result.word_pieces) { - ss << " " << w.word << " " << w.start << " " << w.end; - } - timestamp_str = ss.str(); - } - result_os << key << " " << final_result << std::endl; - timestamp_os << key << " " << timestamp_str << std::endl; - LOG(INFO) << key << " " << final_result; - } else { - LOG(WARNING) << "No wav file for " << key; - } - } - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/websocket_client_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/websocket_client_main.cc deleted file mode 100644 index 3eaa96069dc5f57673fbb2819bf7d4883e0d5ffa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/websocket_client_main.cc +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
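`label_checker_main.cc` above decodes each utterance against the composition of a CTC topology FST and a per-utterance alignment FST, so insertions, deletions, and substitutions in the reference text show up as penalized detour arcs. As a reading aid, here is a stripped-down variant of `CompileAlignFst` that keeps only the correct-label and deletion arcs; the label ids, the `<del>` symbol id, and the penalty value are hypothetical, and the insertion/filler machinery is omitted.

```cpp
#include <vector>
#include <fst/fstlib.h>

// For each reference label, allow either the correct symbol (weight 0) or a
// skip emitting a <del> symbol (weight = deletion penalty).
void CompileLinearAlignFst(const std::vector<int>& labels, int deletion_label,
                           float del_penalty, fst::StdVectorFst* ofst) {
  ofst->DeleteStates();
  int prev = ofst->AddState();
  ofst->SetStart(prev);
  for (int label : labels) {
    int cur = ofst->AddState();
    ofst->AddArc(prev, fst::StdArc(label, label, 0.0, cur));                 // correct
    ofst->AddArc(prev, fst::StdArc(0, deletion_label, del_penalty, cur));    // deletion
    prev = cur;
  }
  ofst->SetFinal(prev, fst::StdArc::Weight::One());
  fst::ArcSort(ofst, fst::StdILabelCompare());
}
```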
- -#include "frontend/wav.h" -#include "utils/flags.h" -#include "utils/timer.h" -#include "websocket/websocket_client.h" - -DEFINE_string(hostname, "127.0.0.1", "hostname of websocket server"); -DEFINE_int32(port, 10086, "port of websocket server"); -DEFINE_int32(nbest, 1, "n-best of decode result"); -DEFINE_string(wav_path, "", "test wav file path"); -DEFINE_bool(continuous_decoding, false, "continuous decoding mode"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - wenet::WebSocketClient client(FLAGS_hostname, FLAGS_port); - client.set_nbest(FLAGS_nbest); - client.set_continuous_decoding(FLAGS_continuous_decoding); - client.SendStartSignal(); - - wenet::WavReader wav_reader(FLAGS_wav_path); - const int sample_rate = 16000; - // Only support 16K - CHECK_EQ(wav_reader.sample_rate(), sample_rate); - const int num_samples = wav_reader.num_samples(); - // Send data every 0.5 second - const float interval = 0.5; - const int sample_interval = interval * sample_rate; - for (int start = 0; start < num_samples; start += sample_interval) { - if (client.done()) { - break; - } - int end = std::min(start + sample_interval, num_samples); - // Convert to short - std::vector data; - data.reserve(end - start); - for (int j = start; j < end; j++) { - data.push_back(static_cast(wav_reader.data()[j])); - } - // TODO(Binbin Zhang): Network order? - // Send PCM data - client.SendBinaryData(data.data(), data.size() * sizeof(int16_t)); - VLOG(2) << "Send " << data.size() << " samples"; - std::this_thread::sleep_for( - std::chrono::milliseconds(static_cast(interval * 1000))); - } - wenet::Timer timer; - client.SendEndSignal(); - client.Join(); - VLOG(2) << "Total latency: " << timer.Elapsed() << "ms."; - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/websocket_server_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/websocket_server_main.cc deleted file mode 100644 index 796d9d2e6d151f7c08b43d66b7245c58ee086cc2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/bin/websocket_server_main.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "decoder/params.h" -#include "utils/log.h" -#include "websocket/websocket_server.h" - -DEFINE_int32(port, 10086, "websocket listening port"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - auto decode_config = wenet::InitDecodeOptionsFromFlags(); - auto feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - auto decode_resource = wenet::InitDecodeResourceFromFlags(); - - wenet::WebSocketServer server(FLAGS_port, feature_config, decode_config, - decode_resource); - LOG(INFO) << "Listening at port " << FLAGS_port; - server.Start(); - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/boost.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/boost.cmake deleted file mode 100644 index 8684c0ec43960da213da923dc57416f04301ea2b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/boost.cmake +++ /dev/null @@ -1,10 +0,0 @@ -FetchContent_Declare(boost - URL https://boostorg.jfrog.io/artifactory/main/release/1.75.0/source/boost_1_75_0.tar.gz - URL_HASH SHA256=aeb26f80e80945e82ee93e5939baebdca47b9dee80a07d3144be1e1a6a66dd6a -) -FetchContent_MakeAvailable(boost) -include_directories(${boost_SOURCE_DIR}) - -if(MSVC) - add_definitions(-DBOOST_ALL_DYN_LINK -DBOOST_ALL_NO_LIB) -endif() \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/bpu.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/bpu.cmake deleted file mode 100644 index 350d76c19d6f656fb130de09877d649cf49972a4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/bpu.cmake +++ /dev/null @@ -1,30 +0,0 @@ -if(BPU) - if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") - set(EASY_DNN_URL "https://github.com/xingchensong/toolchain_pkg/releases/download/easy_dnn/easy_dnn.0.4.11.tar.gz") - set(URL_HASH "SHA256=a1a6f77d1baae7181d75ec5d37a2ee529ac4e1c4400babd6ceb1c007392a4904") - else() - message(FATAL_ERROR "Unsupported CMake System Processor '${CMAKE_SYSTEM_PROCESSOR}' (expected 'aarch64')") - endif() - else() - message(FATAL_ERROR "Unsupported CMake System Name '${CMAKE_SYSTEM_NAME}' (expected 'Linux')") - endif() - - FetchContent_Declare(easy_dnn - URL ${EASY_DNN_URL} - URL_HASH ${URL_HASH} - ) - FetchContent_MakeAvailable(easy_dnn) - include_directories(${easy_dnn_SOURCE_DIR}/easy_dnn/0.4.11_linux_aarch64-j3_hobot_gcc6.5.0/files/easy_dnn/include) - include_directories(${easy_dnn_SOURCE_DIR}/dnn/1.7.0_linux_aarch64-j3_hobot_gcc6.5.0/files/dnn/include) - include_directories(${easy_dnn_SOURCE_DIR}/hlog/0.4.7_linux_aarch64-j3_hobot_gcc6.5.0/files/hlog/include) - link_directories(${easy_dnn_SOURCE_DIR}/easy_dnn/0.4.11_linux_aarch64-j3_hobot_gcc6.5.0/files/easy_dnn/lib) - link_directories(${easy_dnn_SOURCE_DIR}/dnn/1.7.0_linux_aarch64-j3_hobot_gcc6.5.0/files/dnn/lib) - link_directories(${easy_dnn_SOURCE_DIR}/hlog/0.4.7_linux_aarch64-j3_hobot_gcc6.5.0/files/hlog/lib) - - add_definitions(-DUSE_BPU) - # NOTE(xcsong): Reasons for adding flag `-fuse-ld=gold`: - # https://stackoverflow.com/questions/59915966/unknown-gcc-linker-error-but-builds-sucessfully/59916438#59916438 - # https://github.com/tensorflow/tensorflow/issues/47849 - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fuse-ld=gold") -endif() diff 
--git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/gflags.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/gflags.cmake deleted file mode 100644 index 53ae5763b5a8c860b7e64d35b380eee5429f539d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/gflags.cmake +++ /dev/null @@ -1,6 +0,0 @@ -FetchContent_Declare(gflags - URL https://github.com/gflags/gflags/archive/v2.2.2.zip - URL_HASH SHA256=19713a36c9f32b33df59d1c79b4958434cb005b5b47dc5400a7a4b078111d9b5 -) -FetchContent_MakeAvailable(gflags) -include_directories(${gflags_BINARY_DIR}/include) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/glog.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/glog.cmake deleted file mode 100644 index 447ab4132f669ee2c3a52c37959dd684a39ff21b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/glog.cmake +++ /dev/null @@ -1,6 +0,0 @@ -FetchContent_Declare(glog - URL https://github.com/google/glog/archive/v0.4.0.zip - URL_HASH SHA256=9e1b54eb2782f53cd8af107ecf08d2ab64b8d0dc2b7f5594472f3bd63ca85cdc -) -FetchContent_MakeAvailable(glog) -include_directories(${glog_SOURCE_DIR}/src ${glog_BINARY_DIR}) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/grpc.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/grpc.cmake deleted file mode 100644 index 644093a4bf8191f3a45b0df0a72c000981c48f58..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/grpc.cmake +++ /dev/null @@ -1,9 +0,0 @@ -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/grpc) -# third_party: grpc -# On how to build grpc, you may refer to https://github.com/grpc/grpc -# We recommend manually recursive clone the repo to avoid internet connection problem -FetchContent_Declare(gRPC - GIT_REPOSITORY https://github.com/grpc/grpc - GIT_TAG v1.37.1 -) -FetchContent_MakeAvailable(gRPC) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/gtest.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/gtest.cmake deleted file mode 100644 index 30dc7c1a31d8b83991841a4dc33f61ed078b532a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/gtest.cmake +++ /dev/null @@ -1,8 +0,0 @@ -FetchContent_Declare(googletest - URL https://github.com/google/googletest/archive/release-1.11.0.zip - URL_HASH SHA256=353571c2440176ded91c2de6d6cd88ddd41401d14692ec1f99e35d013feda55a -) -if(MSVC) - set(gtest_force_shared_crt ON CACHE BOOL "Always use msvcrt.dll" FORCE) -endif() -FetchContent_MakeAvailable(googletest) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/libtorch.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/libtorch.cmake deleted file mode 100644 index 3cd9245b2da52f8be206d27164de5f411bff171b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/libtorch.cmake +++ /dev/null @@ -1,79 +0,0 @@ -if(TORCH) - add_definitions(-DUSE_TORCH) - if(NOT ANDROID) - if(GPU) - if (NOT 
${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - message(FATAL_ERROR "GPU is supported only Linux, you can use CPU version") - else() - add_definitions(-DUSE_GPU) - endif() - endif() - - if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") - if(${CMAKE_BUILD_TYPE} MATCHES "Release") - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-win-shared-with-deps-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=bece54d36377990257e9d028c687c5b6759c5cfec0a0153da83cf6f0f71f648f") - else() - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-win-shared-with-deps-debug-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=3cc7ba3c3865d86f03d78c2f0878fdbed8b764359476397a5c95cf3bba0d665a") - endif() - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - if(CXX11_ABI) - if(NOT GPU) - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=d52f63577a07adb0bfd6d77c90f7da21896e94f71eb7dcd55ed7835ccb3b2b59") - else() - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu113/libtorch-cxx11-abi-shared-with-deps-1.12.0%2Bcu113.zip") - set(URL_HASH "SHA256=80f089939de20e68e3fcad4dfa72a26c8bf91b5e77b11042f671f39ebac35865") - endif() - else() - if(NOT GPU) - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=bee1b7be308792aa60fc95a4f5274d9658cb7248002d0e333d49eb81ec88430c") - else() - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.11.0%2Bcu113.zip") - set(URL_HASH "SHA256=90159ecce3ff451f3ef3f657493b6c7c96759c3b74bbd70c1695f2ea2f81e1ad") - endif() - endif() - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-macos-1.13.0.zip") - set(URL_HASH "SHA256=a8f80050b95489b4e002547910410c2c230e9f590ffab2482e19e809afe4f7aa") - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "iOS") - add_definitions(-DIOS) - else() - message(FATAL_ERROR "Unsupported System '${CMAKE_SYSTEM_NAME}' (expected 'Windows', 'Linux', 'Darwin' or 'iOS')") - endif() - - # iOS use LibTorch from pod install - if(NOT IOS) - FetchContent_Declare(libtorch - URL ${LIBTORCH_URL} - URL_HASH ${URL_HASH} - ) - FetchContent_MakeAvailable(libtorch) - find_package(Torch REQUIRED PATHS ${libtorch_SOURCE_DIR} NO_DEFAULT_PATH) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS} -DC10_USE_GLOG") - endif() - - if(MSVC) - file(GLOB TORCH_DLLS "${TORCH_INSTALL_PREFIX}/lib/*.dll") - file(COPY ${TORCH_DLLS} DESTINATION ${CMAKE_BINARY_DIR}) - endif() - else() - # Change version in runtime/android/app/build.gradle. 
- file(GLOB PYTORCH_INCLUDE_DIRS "${build_DIR}/pytorch_android*.aar/headers") - file(GLOB PYTORCH_LINK_DIRS "${build_DIR}/pytorch_android*.aar/jni/${ANDROID_ABI}") - find_library(PYTORCH_LIBRARY pytorch_jni - PATHS ${PYTORCH_LINK_DIRS} - NO_CMAKE_FIND_ROOT_PATH - ) - find_library(FBJNI_LIBRARY fbjni - PATHS ${PYTORCH_LINK_DIRS} - NO_CMAKE_FIND_ROOT_PATH - ) - include_directories( - ${PYTORCH_INCLUDE_DIRS} - ${PYTORCH_INCLUDE_DIRS}/torch/csrc/api/include - ) - endif() -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/onnx.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/onnx.cmake deleted file mode 100644 index bd55402cb2a6024620fa6ff8b5c413207041adfa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/onnx.cmake +++ /dev/null @@ -1,35 +0,0 @@ -if(ONNX) - set(ONNX_VERSION "1.12.0") - if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-win-x64-${ONNX_VERSION}.zip") - set(URL_HASH "SHA256=8b5d61204989350b7904ac277f5fbccd3e6736ddbb6ec001e412723d71c9c176") - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-linux-aarch64-${ONNX_VERSION}.tgz") - set(URL_HASH "SHA256=5820d9f343df73c63b6b2b174a1ff62575032e171c9564bcf92060f46827d0ac") - else() - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-linux-x64-${ONNX_VERSION}.tgz") - set(URL_HASH "SHA256=5d503ce8540358b59be26c675e42081be14a3e833a5301926f555451046929c5") - endif() - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-osx-x86_64-${ONNX_VERSION}.tgz") - set(URL_HASH "SHA256=09b17f712f8c6f19bb63da35d508815b443cbb473e16c6192abfaa297c02f600") - else() - message(FATAL_ERROR "Unsupported CMake System Name '${CMAKE_SYSTEM_NAME}' (expected 'Windows', 'Linux' or 'Darwin')") - endif() - - FetchContent_Declare(onnxruntime - URL ${ONNX_URL} - URL_HASH ${URL_HASH} - ) - FetchContent_MakeAvailable(onnxruntime) - include_directories(${onnxruntime_SOURCE_DIR}/include) - link_directories(${onnxruntime_SOURCE_DIR}/lib) - - if(MSVC) - file(GLOB ONNX_DLLS "${onnxruntime_SOURCE_DIR}/lib/*.dll") - file(COPY ${ONNX_DLLS} DESTINATION ${CMAKE_BINARY_DIR}/bin/${CMAKE_BUILD_TYPE}) - endif() - - add_definitions(-DUSE_ONNX) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/openfst.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/openfst.cmake deleted file mode 100644 index 490a3da6b571ec228114167fb9c0d9e9b4043bd2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/openfst.cmake +++ /dev/null @@ -1,45 +0,0 @@ -if(NOT ANDROID) - include(gflags) - # We can't build glog with gflags, unless gflags is pre-installed. - # If build glog with pre-installed gflags, there will be conflict. 
- set(WITH_GFLAGS OFF CACHE BOOL "whether build glog with gflags" FORCE) - include(glog) - - if(NOT GRAPH_TOOLS) - set(HAVE_BIN OFF CACHE BOOL "Build the fst binaries" FORCE) - set(HAVE_SCRIPT OFF CACHE BOOL "Build the fstscript" FORCE) - endif() - set(HAVE_COMPACT OFF CACHE BOOL "Build compact" FORCE) - set(HAVE_CONST OFF CACHE BOOL "Build const" FORCE) - set(HAVE_GRM OFF CACHE BOOL "Build grm" FORCE) - set(HAVE_FAR OFF CACHE BOOL "Build far" FORCE) - set(HAVE_PDT OFF CACHE BOOL "Build pdt" FORCE) - set(HAVE_MPDT OFF CACHE BOOL "Build mpdt" FORCE) - set(HAVE_LINEAR OFF CACHE BOOL "Build linear" FORCE) - set(HAVE_LOOKAHEAD OFF CACHE BOOL "Build lookahead" FORCE) - set(HAVE_NGRAM OFF CACHE BOOL "Build ngram" FORCE) - set(HAVE_SPECIAL OFF CACHE BOOL "Build special" FORCE) - - if(MSVC) - add_compile_options(/W0 /wd4244 /wd4267) - endif() - - # "OpenFST port for Windows" builds openfst with cmake for multiple platforms. - # Openfst is compiled with glog/gflags to avoid log and flag conflicts with log and flags in wenet/libtorch. - # To build openfst with gflags and glog, we comment out some vars of {flags, log}.h and flags.cc. - set(openfst_SOURCE_DIR ${fc_base}/openfst-src CACHE PATH "OpenFST source directory") - FetchContent_Declare(openfst - URL https://github.com/kkm000/openfst/archive/refs/tags/win/1.6.5.1.tar.gz - URL_HASH SHA256=02c49b559c3976a536876063369efc0e41ab374be1035918036474343877046e - PATCH_COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/patch/openfst ${openfst_SOURCE_DIR} - ) - FetchContent_MakeAvailable(openfst) - add_dependencies(fst gflags glog) - target_link_libraries(fst PUBLIC gflags_nothreads_static glog) - include_directories(${openfst_SOURCE_DIR}/src/include) -else() - set(openfst_BINARY_DIR ${build_DIR}/wenet-openfst-android-1.0.2.aar/jni) - include_directories(${openfst_BINARY_DIR}/include) - link_directories(${openfst_BINARY_DIR}/${ANDROID_ABI}) - link_libraries(log gflags_nothreads glog fst) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/pybind11.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/pybind11.cmake deleted file mode 100644 index 6bdae202c1c4d94228e5f92dab051c118dba7d3b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/pybind11.cmake +++ /dev/null @@ -1,7 +0,0 @@ -FetchContent_Declare(pybind11 - URL https://github.com/pybind/pybind11/archive/refs/tags/v2.9.2.zip - URL_HASH SHA256=d1646e6f70d8a3acb2ddd85ce1ed543b5dd579c68b8fb8e9638282af20edead8 -) -FetchContent_MakeAvailable(pybind11) - -add_subdirectory(${pybind11_SOURCE_DIR}) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/xpu.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/xpu.cmake deleted file mode 100644 index 38418671b0237550cd01d4d95e8743067e113e56..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/cmake/xpu.cmake +++ /dev/null @@ -1,37 +0,0 @@ -if(NOT WIN32) - string(ASCII 27 Esc) - set(ColourReset "${Esc}[m") - set(ColourBold "${Esc}[1m") - set(Red "${Esc}[31m") - set(Green "${Esc}[32m") - set(Yellow "${Esc}[33m") - set(Blue "${Esc}[34m") - set(Magenta "${Esc}[35m") - set(Cyan "${Esc}[36m") - set(White "${Esc}[37m") - set(BoldRed "${Esc}[1;31m") - set(BoldGreen "${Esc}[1;32m") - set(BoldYellow "${Esc}[1;33m") - set(BoldBlue "${Esc}[1;34m") - 
set(BoldMagenta "${Esc}[1;35m") - set(BoldCyan "${Esc}[1;36m") - set(BoldWhite "${Esc}[1;37m") -endif() - -if(XPU) - set(RUNTIME_KUNLUN_PATH ${CMAKE_CURRENT_SOURCE_DIR}) - message(STATUS "RUNTIME_KUNLUN_PATH is ${RUNTIME_KUNLUN_PATH} .\n") - set(KUNLUN_XPU_PATH ${RUNTIME_KUNLUN_PATH}/xpu) - if(NOT DEFINED ENV{XPU_API_PATH}) - message(FATAL_ERROR "${BoldRed}NO ENV{XPU_API_PATH} in your env. Please set XPU_API_PATH.${ColourReset}\n") - else() - set(XPU_API_PATH $ENV{XPU_API_PATH}) - message("set XPU_API_PATH from env_var. Val is $ENV{XPU_API_PATH}.") - endif() - - include_directories(${RUNTIME_KUNLUN_PATH} ${KUNLUN_XPU_PATH}/ - ${XPU_API_PATH}/output/include ${XPU_API_PATH}/../runtime/include) - link_directories(${XPU_API_PATH}/output/so/ ${XPU_API_PATH}/../runtime/output/so/) - - add_definitions(-DUSE_XPU) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/CMakeLists.txt deleted file mode 100644 index fe03efb288eb1c7ae3d05e896e95855e5865472f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/CMakeLists.txt +++ /dev/null @@ -1,39 +0,0 @@ -set(decoder_srcs - asr_decoder.cc - asr_model.cc - context_graph.cc - ctc_prefix_beam_search.cc - ctc_wfst_beam_search.cc - ctc_endpoint.cc -) - -if(NOT TORCH AND NOT ONNX AND NOT XPU AND NOT IOS AND NOT BPU) - message(FATAL_ERROR "Please build with TORCH or ONNX or XPU or IOS or BPU!!!") -endif() -if(TORCH OR IOS) - list(APPEND decoder_srcs torch_asr_model.cc) -endif() -if(ONNX) - list(APPEND decoder_srcs onnx_asr_model.cc) -endif() - -add_library(decoder STATIC ${decoder_srcs}) -target_link_libraries(decoder PUBLIC kaldi-decoder frontend - post_processor utils) - -if(ANDROID) - target_link_libraries(decoder PUBLIC ${PYTORCH_LIBRARY} ${FBJNI_LIBRARY}) -else() - if(TORCH) - target_link_libraries(decoder PUBLIC ${TORCH_LIBRARIES}) - endif() - if(ONNX) - target_link_libraries(decoder PUBLIC onnxruntime) - endif() - if(BPU) - target_link_libraries(decoder PUBLIC bpu_asr_model) - endif() - if(XPU) - target_link_libraries(decoder PUBLIC xpu_conformer) - endif() -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/asr_decoder.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/asr_decoder.cc deleted file mode 100644 index 34de7550ea287b37d2cb707e148f5d6853b3d804..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/asr_decoder.cc +++ /dev/null @@ -1,231 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
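The deleted `asr_decoder.cc` below implements the chunk-by-chunk decoding loop that the runtime binaries in this diff share. As a reading aid, here is a minimal, hedged sketch of how the pieces fit together; it mirrors the pattern in the decoder mains above and assumes the `decoder/params.h` flag helpers (`InitDecodeOptionsFromFlags`, `InitFeaturePipelineConfigFromFlags`, `InitDecodeResourceFromFlags`) shown elsewhere in this diff. It is not code from the repository.

```cpp
#include <memory>
#include <string>

#include "decoder/asr_decoder.h"
#include "decoder/params.h"
#include "frontend/wav.h"
#include "utils/log.h"

// Sketch only: wire FeaturePipeline -> AsrDecoder for one finished wav file,
// following the API declared in asr_decoder.h below. Flags and error handling
// are omitted.
int DecodeOneWav(const std::string& wav_path) {
  auto decode_config = wenet::InitDecodeOptionsFromFlags();
  auto feature_config = wenet::InitFeaturePipelineConfigFromFlags();
  auto decode_resource = wenet::InitDecodeResourceFromFlags();

  wenet::WavReader wav_reader(wav_path);
  auto feature_pipeline =
      std::make_shared<wenet::FeaturePipeline>(*feature_config);
  feature_pipeline->AcceptWaveform(wav_reader.data(), wav_reader.num_samples());
  feature_pipeline->set_input_finished();

  wenet::AsrDecoder decoder(feature_pipeline, decode_resource, *decode_config);
  while (decoder.Decode() != wenet::DecodeState::kEndFeats) {
    // Keep consuming chunks until all features are decoded.
  }
  decoder.Rescoring();  // attention rescoring of the n-best CTC hypotheses
  if (decoder.DecodedSomething()) {
    LOG(INFO) << decoder.result()[0].sentence;
  }
  return 0;
}
```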
- - -#include "decoder/asr_decoder.h" - -#include - -#include -#include -#include - -#include "utils/timer.h" - -namespace wenet { - -AsrDecoder::AsrDecoder(std::shared_ptr feature_pipeline, - std::shared_ptr resource, - const DecodeOptions& opts) - : feature_pipeline_(std::move(feature_pipeline)), - // Make a copy of the model ASR model since we will change the inner - // status of the model - model_(resource->model->Copy()), - post_processor_(resource->post_processor), - symbol_table_(resource->symbol_table), - fst_(resource->fst), - unit_table_(resource->unit_table), - opts_(opts), - ctc_endpointer_(new CtcEndpoint(opts.ctc_endpoint_config)) { - if (opts_.reverse_weight > 0) { - // Check if model has a right to left decoder - CHECK(model_->is_bidirectional_decoder()); - } - if (nullptr == fst_) { - searcher_.reset(new CtcPrefixBeamSearch(opts.ctc_prefix_search_opts, - resource->context_graph)); - } else { - searcher_.reset(new CtcWfstBeamSearch(*fst_, opts.ctc_wfst_search_opts, - resource->context_graph)); - } - ctc_endpointer_->frame_shift_in_ms(frame_shift_in_ms()); -} - -void AsrDecoder::Reset() { - start_ = false; - result_.clear(); - num_frames_ = 0; - global_frame_offset_ = 0; - model_->Reset(); - searcher_->Reset(); - feature_pipeline_->Reset(); - ctc_endpointer_->Reset(); -} - -void AsrDecoder::ResetContinuousDecoding() { - global_frame_offset_ = num_frames_; - start_ = false; - result_.clear(); - model_->Reset(); - searcher_->Reset(); - ctc_endpointer_->Reset(); -} - -DecodeState AsrDecoder::Decode(bool block) { - return this->AdvanceDecoding(block); -} - -void AsrDecoder::Rescoring() { - // Do attention rescoring - Timer timer; - AttentionRescoring(); - VLOG(2) << "Rescoring cost latency: " << timer.Elapsed() << "ms."; -} - -DecodeState AsrDecoder::AdvanceDecoding(bool block) { - DecodeState state = DecodeState::kEndBatch; - model_->set_chunk_size(opts_.chunk_size); - model_->set_num_left_chunks(opts_.num_left_chunks); - int num_required_frames = model_->num_frames_for_chunk(start_); - std::vector> chunk_feats; - // Return immediately if we do not want to block - if (!block && !feature_pipeline_->input_finished() && - feature_pipeline_->NumQueuedFrames() < num_required_frames) { - return DecodeState::kWaitFeats; - } - // If not okay, that means we reach the end of the input - if (!feature_pipeline_->Read(num_required_frames, &chunk_feats)) { - state = DecodeState::kEndFeats; - } - - num_frames_ += chunk_feats.size(); - VLOG(2) << "Required " << num_required_frames << " get " - << chunk_feats.size(); - Timer timer; - std::vector> ctc_log_probs; - model_->ForwardEncoder(chunk_feats, &ctc_log_probs); - int forward_time = timer.Elapsed(); - if (opts_.ctc_wfst_search_opts.blank_scale != 1.0) { - for (int i = 0; i < ctc_log_probs.size(); i++) { - ctc_log_probs[i][0] = ctc_log_probs[i][0] - + std::log(opts_.ctc_wfst_search_opts.blank_scale); - } - } - timer.Reset(); - searcher_->Search(ctc_log_probs); - int search_time = timer.Elapsed(); - VLOG(3) << "forward takes " << forward_time << " ms, search takes " - << search_time << " ms"; - UpdateResult(); - - if (state != DecodeState::kEndFeats) { - if (ctc_endpointer_->IsEndpoint(ctc_log_probs, DecodedSomething())) { - VLOG(1) << "Endpoint is detected at " << num_frames_; - state = DecodeState::kEndpoint; - } - } - - start_ = true; - return state; -} - -void AsrDecoder::UpdateResult(bool finish) { - const auto& hypotheses = searcher_->Outputs(); - const auto& inputs = searcher_->Inputs(); - const auto& likelihood = 
searcher_->Likelihood(); - const auto& times = searcher_->Times(); - result_.clear(); - - CHECK_EQ(hypotheses.size(), likelihood.size()); - for (size_t i = 0; i < hypotheses.size(); i++) { - const std::vector& hypothesis = hypotheses[i]; - - DecodeResult path; - path.score = likelihood[i]; - int offset = global_frame_offset_ * feature_frame_shift_in_ms(); - for (size_t j = 0; j < hypothesis.size(); j++) { - std::string word = symbol_table_->Find(hypothesis[j]); - // A detailed explanation of this if-else branch can be found in - // https://github.com/wenet-e2e/wenet/issues/583#issuecomment-907994058 - if (searcher_->Type() == kWfstBeamSearch) { - path.sentence += (' ' + word); - } else { - path.sentence += (word); - } - } - - // TimeStamp is only supported in final result - // TimeStamp of the output of CtcWfstBeamSearch may be inaccurate due to - // various FST operations when building the decoding graph. So here we use - // time stamp of the input(e2e model unit), which is more accurate, and it - // requires the symbol table of the e2e model used in training. - if (unit_table_ != nullptr && finish) { - const std::vector& input = inputs[i]; - const std::vector& time_stamp = times[i]; - CHECK_EQ(input.size(), time_stamp.size()); - for (size_t j = 0; j < input.size(); j++) { - std::string word = unit_table_->Find(input[j]); - int start = time_stamp[j] * frame_shift_in_ms() - time_stamp_gap_ > 0 - ? time_stamp[j] * frame_shift_in_ms() - time_stamp_gap_ - : 0; - if (j > 0) { - start = (time_stamp[j] - time_stamp[j - 1]) * frame_shift_in_ms() < - time_stamp_gap_ - ? (time_stamp[j - 1] + time_stamp[j]) / 2 * - frame_shift_in_ms() - : start; - } - int end = time_stamp[j] * frame_shift_in_ms(); - if (j < input.size() - 1) { - end = (time_stamp[j + 1] - time_stamp[j]) * frame_shift_in_ms() < - time_stamp_gap_ - ? 
(time_stamp[j + 1] + time_stamp[j]) / 2 * - frame_shift_in_ms() - : end; - } - WordPiece word_piece(word, offset + start, offset + end); - path.word_pieces.emplace_back(word_piece); - } - } - - if (post_processor_ != nullptr) { - path.sentence = post_processor_->Process(path.sentence, finish); - } - result_.emplace_back(path); - } - - if (DecodedSomething()) { - VLOG(1) << "Partial CTC result " << result_[0].sentence; - } -} - -void AsrDecoder::AttentionRescoring() { - searcher_->FinalizeSearch(); - UpdateResult(true); - // No need to do rescoring - if (0.0 == opts_.rescoring_weight) { - return; - } - // Inputs() returns N-best input ids, which is the basic unit for rescoring - // In CtcPrefixBeamSearch, inputs are the same to outputs - const auto& hypotheses = searcher_->Inputs(); - int num_hyps = hypotheses.size(); - if (num_hyps <= 0) { - return; - } - - std::vector rescoring_score; - model_->AttentionRescoring(hypotheses, opts_.reverse_weight, - &rescoring_score); - - // Combine ctc score and rescoring score - for (size_t i = 0; i < num_hyps; ++i) { - result_[i].score = opts_.rescoring_weight * rescoring_score[i] + - opts_.ctc_weight * result_[i].score; - } - std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/asr_decoder.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/asr_decoder.h deleted file mode 100644 index df71f5b7bad7b2ffdc69bbd7ab11f576bed464d2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/asr_decoder.h +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_ASR_DECODER_H_ -#define DECODER_ASR_DECODER_H_ - -#include -#include -#include -#include - -#include "fst/fstlib.h" -#include "fst/symbol-table.h" - -#include "decoder/asr_model.h" -#include "decoder/context_graph.h" -#include "decoder/ctc_endpoint.h" -#include "decoder/ctc_prefix_beam_search.h" -#include "decoder/ctc_wfst_beam_search.h" -#include "decoder/search_interface.h" -#include "frontend/feature_pipeline.h" -#include "post_processor/post_processor.h" -#include "utils/utils.h" - -namespace wenet { - -struct DecodeOptions { - // chunk_size is the frame number of one chunk after subsampling. - // e.g. if subsample rate is 4 and chunk_size = 16, the frames in - // one chunk are 64 = 16*4 - int chunk_size = 16; - int num_left_chunks = -1; - - // final_score = rescoring_weight * rescoring_score + ctc_weight * ctc_score; - // rescoring_score = left_to_right_score * (1 - reverse_weight) + - // right_to_left_score * reverse_weight - // Please note the concept of ctc_scores in the following two search - // methods are different. 
- // For CtcPrefixBeamSearch, it's a sum(prefix) score + context score - // For CtcWfstBeamSearch, it's a max(viterbi) path score + context score - // So we should carefully set ctc_weight according to the search methods. - float ctc_weight = 0.5; - float rescoring_weight = 1.0; - float reverse_weight = 0.0; - CtcEndpointConfig ctc_endpoint_config; - CtcPrefixBeamSearchOptions ctc_prefix_search_opts; - CtcWfstBeamSearchOptions ctc_wfst_search_opts; -}; - -struct WordPiece { - std::string word; - int start = -1; - int end = -1; - - WordPiece(std::string word, int start, int end) - : word(std::move(word)), start(start), end(end) {} -}; - -struct DecodeResult { - float score = -kFloatMax; - std::string sentence; - std::vector word_pieces; - - static bool CompareFunc(const DecodeResult& a, const DecodeResult& b) { - return a.score > b.score; - } -}; - -enum DecodeState { - kEndBatch = 0x00, // End of current decoding batch, normal case - kEndpoint = 0x01, // Endpoint is detected - kEndFeats = 0x02, // All feature is decoded - kWaitFeats = 0x03 // Feat is not enough for one chunk inference, wait -}; - -// DecodeResource is thread safe, which can be shared for multiple -// decoding threads -struct DecodeResource { - std::shared_ptr model = nullptr; - std::shared_ptr symbol_table = nullptr; - std::shared_ptr> fst = nullptr; - std::shared_ptr unit_table = nullptr; - std::shared_ptr context_graph = nullptr; - std::shared_ptr post_processor = nullptr; -}; - -// Torch ASR decoder -class AsrDecoder { - public: - AsrDecoder(std::shared_ptr feature_pipeline, - std::shared_ptr resource, - const DecodeOptions& opts); - // @param block: if true, block when feature is not enough for one chunk - // inference. Otherwise, return kWaitFeats. - DecodeState Decode(bool block = true); - void Rescoring(); - void Reset(); - void ResetContinuousDecoding(); - bool DecodedSomething() const { - return !result_.empty() && !result_[0].sentence.empty(); - } - - // This method is used for time benchmark - int num_frames_in_current_chunk() const { - return num_frames_in_current_chunk_; - } - int frame_shift_in_ms() const { - return model_->subsampling_rate() * - feature_pipeline_->config().frame_shift * 1000 / - feature_pipeline_->config().sample_rate; - } - int feature_frame_shift_in_ms() const { - return feature_pipeline_->config().frame_shift * 1000 / - feature_pipeline_->config().sample_rate; - } - const std::vector& result() const { return result_; } - - private: - DecodeState AdvanceDecoding(bool block = true); - void AttentionRescoring(); - - void UpdateResult(bool finish = false); - - std::shared_ptr feature_pipeline_; - std::shared_ptr model_; - std::shared_ptr post_processor_; - - std::shared_ptr> fst_ = nullptr; - // output symbol table - std::shared_ptr symbol_table_; - // e2e unit symbol table - std::shared_ptr unit_table_ = nullptr; - const DecodeOptions& opts_; - // cache feature - bool start_ = false; - // For continuous decoding - int num_frames_ = 0; - int global_frame_offset_ = 0; - const int time_stamp_gap_ = 100; // timestamp gap between words in a sentence - - std::unique_ptr searcher_; - std::unique_ptr ctc_endpointer_; - - int num_frames_in_current_chunk_ = 0; - std::vector result_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(AsrDecoder); -}; - -} // namespace wenet - -#endif // DECODER_ASR_DECODER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/asr_model.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/asr_model.cc 
deleted file mode 100644 index 8c7b0fb1195cf07bac6c3ff1bb8cb0e187e977da..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/asr_model.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2022 Horizon Robotics. All Rights Reserved. -// Author: binbin.zhang@horizon.ai (Binbin Zhang) - -#include "decoder/asr_model.h" - -#include -#include - -namespace wenet { - -int AsrModel::num_frames_for_chunk(bool start) const { - int num_required_frames = 0; - if (chunk_size_ > 0) { - if (!start) { // First batch - int context = right_context_ + 1; // Add current frame - num_required_frames = (chunk_size_ - 1) * subsampling_rate_ + context; - } else { - num_required_frames = chunk_size_ * subsampling_rate_; - } - } else { - num_required_frames = std::numeric_limits::max(); - } - return num_required_frames; -} - -void AsrModel::CacheFeature( - const std::vector>& chunk_feats) { - // Cache feature for next chunk - const int cached_feature_size = 1 + right_context_ - subsampling_rate_; - if (chunk_feats.size() >= cached_feature_size) { - // TODO(Binbin Zhang): Only deal the case when - // chunk_feats.size() > cached_feature_size here, and it's consistent - // with our current model, refine it later if we have new model or - // new requirements - cached_feature_.resize(cached_feature_size); - for (int i = 0; i < cached_feature_size; ++i) { - cached_feature_[i] = - chunk_feats[chunk_feats.size() - cached_feature_size + i]; - } - } -} - -void AsrModel::ForwardEncoder( - const std::vector>& chunk_feats, - std::vector>* ctc_prob) { - ctc_prob->clear(); - int num_frames = cached_feature_.size() + chunk_feats.size(); - if (num_frames >= right_context_ + 1) { - this->ForwardEncoderFunc(chunk_feats, ctc_prob); - this->CacheFeature(chunk_feats); - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/asr_model.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/asr_model.h deleted file mode 100644 index d100dd818551014fa4769c1766bc3b1b626e8453..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/asr_model.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2022 Horizon Robotics. All Rights Reserved. -// Author: binbin.zhang@horizon.ai (Binbin Zhang) - -#ifndef DECODER_ASR_MODEL_H_ -#define DECODER_ASR_MODEL_H_ - -#include -#include -#include -#include - -#include "utils/timer.h" -#include "utils/utils.h" - -namespace wenet { - -class AsrModel { - public: - virtual int right_context() const { return right_context_; } - virtual int subsampling_rate() const { return subsampling_rate_; } - virtual int sos() const { return sos_; } - virtual int eos() const { return eos_; } - virtual bool is_bidirectional_decoder() const { - return is_bidirectional_decoder_; - } - virtual int offset() const { return offset_; } - - // If chunk_size > 0, streaming case. 
Otherwise, none streaming case - virtual void set_chunk_size(int chunk_size) { chunk_size_ = chunk_size; } - virtual void set_num_left_chunks(int num_left_chunks) { - num_left_chunks_ = num_left_chunks; - } - // start: if it is the start chunk of one sentence - virtual int num_frames_for_chunk(bool start) const; - - virtual void Reset() = 0; - - virtual void ForwardEncoder( - const std::vector>& chunk_feats, - std::vector>* ctc_prob); - - virtual void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) = 0; - - virtual std::shared_ptr Copy() const = 0; - - protected: - virtual void ForwardEncoderFunc( - const std::vector>& chunk_feats, - std::vector>* ctc_prob) = 0; - virtual void CacheFeature(const std::vector>& chunk_feats); - - int right_context_ = 1; - int subsampling_rate_ = 1; - int sos_ = 0; - int eos_ = 0; - bool is_bidirectional_decoder_ = false; - int chunk_size_ = 16; - int num_left_chunks_ = -1; // -1 means all left chunks - int offset_ = 0; - - std::vector> cached_feature_; -}; - -} // namespace wenet - -#endif // DECODER_ASR_MODEL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/context_graph.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/context_graph.cc deleted file mode 100644 index adc59c506de2afa7087815887295e4d8735d2a35..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/context_graph.cc +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "decoder/context_graph.h" - -#include - -#include "fst/determinize.h" - -#include "utils/string.h" -#include "utils/utils.h" - -namespace wenet { - -ContextGraph::ContextGraph(ContextConfig config) : config_(config) {} - -void ContextGraph::BuildContextGraph( - const std::vector& query_contexts, - const std::shared_ptr& symbol_table) { - CHECK(symbol_table != nullptr) << "Symbols table should not be nullptr!"; - start_tag_id_ = symbol_table->AddSymbol(""); - end_tag_id_ = symbol_table->AddSymbol(""); - symbol_table_ = symbol_table; - if (query_contexts.empty()) { - if (graph_ != nullptr) graph_.reset(); - return; - } - - std::unique_ptr ofst(new fst::StdVectorFst()); - // State 0 is the start state and the final state. - int start_state = ofst->AddState(); - ofst->SetStart(start_state); - ofst->SetFinal(start_state, fst::StdArc::Weight::One()); - - LOG(INFO) << "Contexts count size: " << query_contexts.size(); - int count = 0; - for (const auto& context : query_contexts) { - if (context.size() > config_.max_context_length) { - LOG(INFO) << "Skip long context: " << context; - continue; - } - if (++count > config_.max_contexts) break; - - std::vector words; - // Split context to words by symbol table, and build the context graph. 
- bool no_oov = SplitUTF8StringToWords(Trim(context), symbol_table, &words); - if (!no_oov) { - LOG(WARNING) << "Ignore unknown word found during compilation."; - continue; - } - - int prev_state = start_state; - int next_state = start_state; - float escape_score = 0; - for (size_t i = 0; i < words.size(); ++i) { - int word_id = symbol_table_->Find(words[i]); - float score = (i * config_.incremental_context_score - + config_.context_score) * UTF8StringLength(words[i]); - next_state = (i < words.size() - 1) ? ofst->AddState() : start_state; - ofst->AddArc(prev_state, - fst::StdArc(word_id, word_id, score, next_state)); - // Add escape arc to clean the previous context score. - if (i > 0) { - // ilabel and olabel of the escape arc is 0 (). - ofst->AddArc(prev_state, fst::StdArc(0, 0, -escape_score, start_state)); - } - prev_state = next_state; - escape_score += score; - } - } - std::unique_ptr det_fst(new fst::StdVectorFst()); - fst::Determinize(*ofst, det_fst.get()); - graph_ = std::move(det_fst); -} - -int ContextGraph::GetNextState(int cur_state, int word_id, float* score, - bool* is_start_boundary, bool* is_end_boundary) { - int next_state = 0; - for (fst::ArcIterator aiter(*graph_, cur_state); !aiter.Done(); - aiter.Next()) { - const fst::StdArc& arc = aiter.Value(); - if (arc.ilabel == 0) { - // escape score, will be overwritten when ilabel equals to word id. - *score = arc.weight.Value(); - } else if (arc.ilabel == word_id) { - next_state = arc.nextstate; - *score = arc.weight.Value(); - if (cur_state == 0) { - *is_start_boundary = true; - } - if (graph_->Final(arc.nextstate) == fst::StdArc::Weight::One()) { - *is_end_boundary = true; - } - break; - } - } - return next_state; -} - -bool ContextGraph::SplitUTF8StringToWords( - const std::string& str, - const std::shared_ptr& symbol_table, - std::vector* words) { - std::vector chars; - SplitUTF8StringToChars(Trim(str), &chars); - - bool no_oov = true; - for (size_t start = 0; start < chars.size();) { - for (size_t end = chars.size(); end > start; --end) { - std::string word; - for (size_t i = start; i < end; i++) { - word += chars[i]; - } - // Skip space. - if (word == " ") { - start = end; - continue; - } - // Add '▁' at the beginning of English word. - if (IsAlpha(word)) { - word = kSpaceSymbol + word; - } - - if (symbol_table->Find(word) != -1) { - words->emplace_back(word); - start = end; - continue; - } - if (end == start + 1) { - ++start; - no_oov = false; - LOG(WARNING) << word << " is oov."; - } - } - } - return no_oov; -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/context_graph.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/context_graph.h deleted file mode 100644 index 41b59206987cfe22d421f40506057830b6311f8e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/context_graph.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_CONTEXT_GRAPH_H_ -#define DECODER_CONTEXT_GRAPH_H_ - -#include -#include -#include - -#include "fst/compose.h" -#include "fst/fst.h" -#include "fst/vector-fst.h" - -namespace wenet { - -using StateId = fst::StdArc::StateId; - -struct ContextConfig { - int max_contexts = 5000; - int max_context_length = 100; - float context_score = 3.0; - float incremental_context_score = 0.0; -}; - -class ContextGraph { - public: - explicit ContextGraph(ContextConfig config); - void BuildContextGraph(const std::vector& query_context, - const std::shared_ptr& symbol_table); - int GetNextState(int cur_state, int word_id, float* score, - bool* is_start_boundary, bool* is_end_boundary); - - int start_tag_id() { return start_tag_id_; } - int end_tag_id() { return end_tag_id_; } - - private: - bool SplitUTF8StringToWords( - const std::string& str, - const std::shared_ptr& symbol_table, - std::vector* words); - - int start_tag_id_ = -1; - int end_tag_id_ = -1; - ContextConfig config_; - std::shared_ptr symbol_table_ = nullptr; - std::unique_ptr graph_ = nullptr; - DISALLOW_COPY_AND_ASSIGN(ContextGraph); -}; - -} // namespace wenet - -#endif // DECODER_CONTEXT_GRAPH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/ctc_endpoint.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/ctc_endpoint.cc deleted file mode 100644 index 4a64dd048f32401ab0dca468836cfac8be943d26..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/ctc_endpoint.cc +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "decoder/ctc_endpoint.h" - -#include - -#include -#include - -#include "utils/log.h" - -namespace wenet { - -CtcEndpoint::CtcEndpoint(const CtcEndpointConfig& config) : config_(config) { - Reset(); -} - -void CtcEndpoint::Reset() { - num_frames_decoded_ = 0; - num_frames_trailing_blank_ = 0; -} - -static bool RuleActivated(const CtcEndpointRule& rule, - const std::string& rule_name, bool decoded_sth, - int trailing_silence, int utterance_length) { - bool ans = (decoded_sth || !rule.must_decoded_sth) && - trailing_silence >= rule.min_trailing_silence && - utterance_length >= rule.min_utterance_length; - if (ans) { - VLOG(2) << "Endpointing rule " << rule_name - << " activated: " << (decoded_sth ? 
"true" : "false") << ',' - << trailing_silence << ',' << utterance_length; - } - return ans; -} - -bool CtcEndpoint::IsEndpoint( - const std::vector>& ctc_log_probs, - bool decoded_something) { - for (int t = 0; t < ctc_log_probs.size(); ++t) { - const auto& logp_t = ctc_log_probs[t]; - float blank_prob = expf(logp_t[config_.blank]); - - num_frames_decoded_++; - if (blank_prob > config_.blank_threshold) { - num_frames_trailing_blank_++; - } else { - num_frames_trailing_blank_ = 0; - } - } - CHECK_GE(num_frames_decoded_, num_frames_trailing_blank_); - CHECK_GT(frame_shift_in_ms_, 0); - int utterance_length = num_frames_decoded_ * frame_shift_in_ms_; - int trailing_silence = num_frames_trailing_blank_ * frame_shift_in_ms_; - if (RuleActivated(config_.rule1, "rule1", decoded_something, trailing_silence, - utterance_length)) - return true; - if (RuleActivated(config_.rule2, "rule2", decoded_something, trailing_silence, - utterance_length)) - return true; - if (RuleActivated(config_.rule3, "rule3", decoded_something, trailing_silence, - utterance_length)) - return true; - return false; -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/ctc_endpoint.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/ctc_endpoint.h deleted file mode 100644 index 56d9e08e7d3fab5562028e956f7b1d6ebac7b9e4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/ctc_endpoint.h +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_CTC_ENDPOINT_H_ -#define DECODER_CTC_ENDPOINT_H_ - -#include - -namespace wenet { - -struct CtcEndpointRule { - bool must_decoded_sth; - int min_trailing_silence; - int min_utterance_length; - - CtcEndpointRule(bool must_decoded_sth = true, int min_trailing_silence = 1000, - int min_utterance_length = 0) - : must_decoded_sth(must_decoded_sth), - min_trailing_silence(min_trailing_silence), - min_utterance_length(min_utterance_length) {} -}; - -struct CtcEndpointConfig { - /// We consider blank as silence for purposes of endpointing. - int blank = 0; // blank id - float blank_threshold = 0.8; // blank threshold to be silence - /// We support three rules. We terminate decoding if ANY of these rules - /// evaluates to "true". If you want to add more rules, do it by changing this - /// code. If you want to disable a rule, you can set the silence-timeout for - /// that rule to a very large number. - - /// rule1 times out after 5000 ms of silence, even if we decoded nothing. - CtcEndpointRule rule1; - /// rule2 times out after 1000 ms of silence after decoding something. - CtcEndpointRule rule2; - /// rule3 times out after the utterance is 20000 ms long, regardless of - /// anything else. 
-  CtcEndpointRule rule3;
-
-  CtcEndpointConfig()
-      : rule1(false, 5000, 0), rule2(true, 1000, 0), rule3(false, 0, 20000) {}
-};
-
-class CtcEndpoint {
- public:
-  explicit CtcEndpoint(const CtcEndpointConfig& config);
-
-  void Reset();
-  /// This function returns true if this set of endpointing rules thinks we
-  /// should terminate decoding.
-  bool IsEndpoint(const std::vector<std::vector<float>>& ctc_log_probs,
-                  bool decoded_something);
-
-  void frame_shift_in_ms(int frame_shift_in_ms) {
-    frame_shift_in_ms_ = frame_shift_in_ms;
-  }
-
- private:
-  CtcEndpointConfig config_;
-  int frame_shift_in_ms_ = -1;
-  int num_frames_decoded_ = 0;
-  int num_frames_trailing_blank_ = 0;
-};
-
-}  // namespace wenet
-
-#endif  // DECODER_CTC_ENDPOINT_H_
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/ctc_prefix_beam_search.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/ctc_prefix_beam_search.cc
deleted file mode 100644
index 154c8864ba98255528a33a80a35b18eee8fa5dc9..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/ctc_prefix_beam_search.cc
+++ /dev/null
@@ -1,235 +0,0 @@
-// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
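The three endpoint rules documented in `ctc_endpoint.h` above map directly onto the `CtcEndpointConfig` constructor defaults: rule1 fires after 5000 ms of silence even if nothing was decoded, rule2 after 1000 ms of trailing silence once something was decoded, and rule3 once the utterance reaches 20000 ms. The sketch below shows how a caller might tighten rule2; the 600 ms and 0.9 values are illustrative assumptions, not values from this repository.

```cpp
#include "decoder/ctc_endpoint.h"

// Sketch only: customizing the endpoint rules declared in ctc_endpoint.h above.
wenet::CtcEndpointConfig MakeAggressiveEndpointConfig() {
  wenet::CtcEndpointConfig config;          // rule1/rule2/rule3 keep the defaults
  config.blank_threshold = 0.9;             // require more confident blank frames
  config.rule2.min_trailing_silence = 600;  // endpoint 600 ms after speech ends
  return config;
}

// Usage: the decoder feeds each chunk's CTC log-probs to IsEndpoint(), e.g.
//   wenet::CtcEndpoint endpointer(MakeAggressiveEndpointConfig());
//   endpointer.frame_shift_in_ms(40);  // subsampled frame shift, e.g. 4 * 10 ms
//   bool stop = endpointer.IsEndpoint(ctc_log_probs, /*decoded_something=*/true);
```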
- - -#include "decoder/ctc_prefix_beam_search.h" - -#include -#include -#include -#include - -#include "utils/log.h" -#include "utils/utils.h" - -namespace wenet { - -CtcPrefixBeamSearch::CtcPrefixBeamSearch( - const CtcPrefixBeamSearchOptions& opts, - const std::shared_ptr& context_graph) - : opts_(opts), context_graph_(context_graph) { - Reset(); -} - -void CtcPrefixBeamSearch::Reset() { - hypotheses_.clear(); - likelihood_.clear(); - cur_hyps_.clear(); - viterbi_likelihood_.clear(); - times_.clear(); - outputs_.clear(); - abs_time_step_ = 0; - PrefixScore prefix_score; - prefix_score.s = 0.0; - prefix_score.ns = -kFloatMax; - prefix_score.v_s = 0.0; - prefix_score.v_ns = 0.0; - std::vector empty; - cur_hyps_[empty] = prefix_score; - outputs_.emplace_back(empty); - hypotheses_.emplace_back(empty); - likelihood_.emplace_back(prefix_score.total_score()); - times_.emplace_back(empty); -} - -static bool PrefixScoreCompare( - const std::pair, PrefixScore>& a, - const std::pair, PrefixScore>& b) { - return a.second.total_score() > b.second.total_score(); -} - -void CtcPrefixBeamSearch::UpdateOutputs( - const std::pair, PrefixScore>& prefix) { - const std::vector& input = prefix.first; - const std::vector& start_boundaries = prefix.second.start_boundaries; - const std::vector& end_boundaries = prefix.second.end_boundaries; - - std::vector output; - int s = 0; - int e = 0; - for (int i = 0; i < input.size(); ++i) { - if (s < start_boundaries.size() && i == start_boundaries[s]) { - output.emplace_back(context_graph_->start_tag_id()); - ++s; - } - output.emplace_back(input[i]); - if (e < end_boundaries.size() && i == end_boundaries[e]) { - output.emplace_back(context_graph_->end_tag_id()); - ++e; - } - } - outputs_.emplace_back(output); -} - -void CtcPrefixBeamSearch::UpdateHypotheses( - const std::vector, PrefixScore>>& hpys) { - cur_hyps_.clear(); - outputs_.clear(); - hypotheses_.clear(); - likelihood_.clear(); - viterbi_likelihood_.clear(); - times_.clear(); - for (auto& item : hpys) { - cur_hyps_[item.first] = item.second; - UpdateOutputs(item); - hypotheses_.emplace_back(std::move(item.first)); - likelihood_.emplace_back(item.second.total_score()); - viterbi_likelihood_.emplace_back(item.second.viterbi_score()); - times_.emplace_back(item.second.times()); - } -} - -// Please refer https://robin1001.github.io/2020/12/11/ctc-search -// for how CTC prefix beam search works, and there is a simple graph demo in -// it. -void CtcPrefixBeamSearch::Search(const std::vector>& logp) { - if (logp.size() == 0) return; - int first_beam_size = - std::min(static_cast(logp[0].size()), opts_.first_beam_size); - for (int t = 0; t < logp.size(); ++t, ++abs_time_step_) { - const std::vector& logp_t = logp[t]; - std::unordered_map, PrefixScore, PrefixHash> next_hyps; - // 1. First beam prune, only select topk candidates - std::vector topk_score; - std::vector topk_index; - TopK(logp_t, first_beam_size, &topk_score, &topk_index); - - // 2. Token passing - for (int i = 0; i < topk_index.size(); ++i) { - int id = topk_index[i]; - auto prob = topk_score[i]; - for (const auto& it : cur_hyps_) { - const std::vector& prefix = it.first; - const PrefixScore& prefix_score = it.second; - // If prefix doesn't exist in next_hyps, next_hyps[prefix] will insert - // PrefixScore(-inf, -inf) by default, since the default constructor - // of PrefixScore will set fields s(blank ending score) and - // ns(none blank ending score) to -inf, respectively. 
- if (id == opts_.blank) { - // Case 0: *a + ε => *a - PrefixScore& next_score = next_hyps[prefix]; - next_score.s = LogAdd(next_score.s, prefix_score.score() + prob); - next_score.v_s = prefix_score.viterbi_score() + prob; - next_score.times_s = prefix_score.times(); - // Prefix not changed, copy the context from prefix. - if (context_graph_ && !next_score.has_context) { - next_score.CopyContext(prefix_score); - next_score.has_context = true; - } - } else if (!prefix.empty() && id == prefix.back()) { - // Case 1: *a + a => *a - PrefixScore& next_score1 = next_hyps[prefix]; - next_score1.ns = LogAdd(next_score1.ns, prefix_score.ns + prob); - if (next_score1.v_ns < prefix_score.v_ns + prob) { - next_score1.v_ns = prefix_score.v_ns + prob; - if (next_score1.cur_token_prob < prob) { - next_score1.cur_token_prob = prob; - next_score1.times_ns = prefix_score.times_ns; - CHECK_GT(next_score1.times_ns.size(), 0); - next_score1.times_ns.back() = abs_time_step_; - } - } - if (context_graph_ && !next_score1.has_context) { - next_score1.CopyContext(prefix_score); - next_score1.has_context = true; - } - - // Case 2: *aε + a => *aa - std::vector new_prefix(prefix); - new_prefix.emplace_back(id); - PrefixScore& next_score2 = next_hyps[new_prefix]; - next_score2.ns = LogAdd(next_score2.ns, prefix_score.s + prob); - if (next_score2.v_ns < prefix_score.v_s + prob) { - next_score2.v_ns = prefix_score.v_s + prob; - next_score2.cur_token_prob = prob; - next_score2.times_ns = prefix_score.times_s; - next_score2.times_ns.emplace_back(abs_time_step_); - } - if (context_graph_ && !next_score2.has_context) { - // Prefix changed, calculate the context score. - next_score2.UpdateContext(context_graph_, prefix_score, id, - prefix.size()); - next_score2.has_context = true; - } - } else { - // Case 3: *a + b => *ab, *aε + b => *ab - std::vector new_prefix(prefix); - new_prefix.emplace_back(id); - PrefixScore& next_score = next_hyps[new_prefix]; - next_score.ns = LogAdd(next_score.ns, prefix_score.score() + prob); - if (next_score.v_ns < prefix_score.viterbi_score() + prob) { - next_score.v_ns = prefix_score.viterbi_score() + prob; - next_score.cur_token_prob = prob; - next_score.times_ns = prefix_score.times(); - next_score.times_ns.emplace_back(abs_time_step_); - } - if (context_graph_ && !next_score.has_context) { - // Calculate the context score. - next_score.UpdateContext(context_graph_, prefix_score, id, - prefix.size()); - next_score.has_context = true; - } - } - } - } - - // 3. Second beam prune, only keep top n best paths - std::vector, PrefixScore>> arr(next_hyps.begin(), - next_hyps.end()); - int second_beam_size = - std::min(static_cast(arr.size()), opts_.second_beam_size); - std::nth_element(arr.begin(), arr.begin() + second_beam_size, arr.end(), - PrefixScoreCompare); - arr.resize(second_beam_size); - std::sort(arr.begin(), arr.end(), PrefixScoreCompare); - - // 4. Update cur_hyps_ and get new result - UpdateHypotheses(arr); - } -} - -void CtcPrefixBeamSearch::FinalizeSearch() { UpdateFinalContext(); } - -void CtcPrefixBeamSearch::UpdateFinalContext() { - if (context_graph_ == nullptr) return; - CHECK_EQ(hypotheses_.size(), cur_hyps_.size()); - CHECK_EQ(hypotheses_.size(), likelihood_.size()); - // We should backoff the context score/state when the context is - // not fully matched at the last time. 
- for (const auto& prefix : hypotheses_) { - PrefixScore& prefix_score = cur_hyps_[prefix]; - if (prefix_score.context_state != 0) { - prefix_score.UpdateContext(context_graph_, prefix_score, 0, - prefix.size()); - } - } - std::vector, PrefixScore>> arr(cur_hyps_.begin(), - cur_hyps_.end()); - std::sort(arr.begin(), arr.end(), PrefixScoreCompare); - - // Update cur_hyps_ and get new result - UpdateHypotheses(arr); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/ctc_prefix_beam_search.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/ctc_prefix_beam_search.h deleted file mode 100644 index f44ec23c37af517c9e45140f89ef7346768f5d35..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/ctc_prefix_beam_search.h +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_CTC_PREFIX_BEAM_SEARCH_H_ -#define DECODER_CTC_PREFIX_BEAM_SEARCH_H_ - -#include -#include -#include -#include - -#include "decoder/context_graph.h" -#include "decoder/search_interface.h" -#include "utils/utils.h" - -namespace wenet { - -struct CtcPrefixBeamSearchOptions { - int blank = 0; // blank id - int first_beam_size = 10; - int second_beam_size = 10; -}; - -struct PrefixScore { - float s = -kFloatMax; // blank ending score - float ns = -kFloatMax; // none blank ending score - float v_s = -kFloatMax; // viterbi blank ending score - float v_ns = -kFloatMax; // viterbi none blank ending score - float cur_token_prob = -kFloatMax; // prob of current token - std::vector times_s; // times of viterbi blank path - std::vector times_ns; // times of viterbi none blank path - - float score() const { return LogAdd(s, ns); } - float viterbi_score() const { return v_s > v_ns ? v_s : v_ns; } - const std::vector& times() const { - return v_s > v_ns ? 
times_s : times_ns; - } - - bool has_context = false; - int context_state = 0; - float context_score = 0; - std::vector start_boundaries; - std::vector end_boundaries; - - void CopyContext(const PrefixScore& prefix_score) { - context_state = prefix_score.context_state; - context_score = prefix_score.context_score; - start_boundaries = prefix_score.start_boundaries; - end_boundaries = prefix_score.end_boundaries; - } - - void UpdateContext(const std::shared_ptr& context_graph, - const PrefixScore& prefix_score, int word_id, - int prefix_len) { - this->CopyContext(prefix_score); - - float score = 0; - bool is_start_boundary = false; - bool is_end_boundary = false; - - context_state = - context_graph->GetNextState(prefix_score.context_state, word_id, &score, - &is_start_boundary, &is_end_boundary); - context_score += score; - if (is_start_boundary) start_boundaries.emplace_back(prefix_len); - if (is_end_boundary) end_boundaries.emplace_back(prefix_len); - } - - float total_score() const { return score() + context_score; } -}; - -struct PrefixHash { - size_t operator()(const std::vector& prefix) const { - size_t hash_code = 0; - // here we use KB&DR hash code - for (int id : prefix) { - hash_code = id + 31 * hash_code; - } - return hash_code; - } -}; - -class CtcPrefixBeamSearch : public SearchInterface { - public: - explicit CtcPrefixBeamSearch( - const CtcPrefixBeamSearchOptions& opts, - const std::shared_ptr& context_graph = nullptr); - - void Search(const std::vector>& logp) override; - void Reset() override; - void FinalizeSearch() override; - SearchType Type() const override { return SearchType::kPrefixBeamSearch; } - void UpdateOutputs(const std::pair, PrefixScore>& prefix); - void UpdateHypotheses( - const std::vector, PrefixScore>>& hpys); - void UpdateFinalContext(); - - const std::vector& viterbi_likelihood() const { - return viterbi_likelihood_; - } - const std::vector>& Inputs() const override { - return hypotheses_; - } - const std::vector>& Outputs() const override { - return outputs_; - } - const std::vector& Likelihood() const override { return likelihood_; } - const std::vector>& Times() const override { return times_; } - - private: - int abs_time_step_ = 0; - - // N-best list and corresponding likelihood_, in sorted order - std::vector> hypotheses_; - std::vector likelihood_; - std::vector viterbi_likelihood_; - std::vector> times_; - - std::unordered_map, PrefixScore, PrefixHash> cur_hyps_; - std::shared_ptr context_graph_ = nullptr; - // Outputs contain the hypotheses_ and tags like: and - std::vector> outputs_; - const CtcPrefixBeamSearchOptions& opts_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(CtcPrefixBeamSearch); -}; - -} // namespace wenet - -#endif // DECODER_CTC_PREFIX_BEAM_SEARCH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/ctc_wfst_beam_search.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/ctc_wfst_beam_search.cc deleted file mode 100644 index 10e93f387e87b5f16fb7784d7060c50f227bf58e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/ctc_wfst_beam_search.cc +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "decoder/ctc_wfst_beam_search.h" - -#include - -namespace wenet { - -void DecodableTensorScaled::Reset() { - num_frames_ready_ = 0; - done_ = false; - // Give an empty initialization, will throw error when - // AcceptLoglikes is not called - logp_.clear(); -} - -void DecodableTensorScaled::AcceptLoglikes(const std::vector& logp) { - ++num_frames_ready_; - // TODO(Binbin Zhang): Avoid copy here - logp_ = logp; -} - -float DecodableTensorScaled::LogLikelihood(int32 frame, int32 index) { - CHECK_GT(index, 0); - CHECK_LT(frame, num_frames_ready_); - return scale_ * logp_[index - 1]; -} - -bool DecodableTensorScaled::IsLastFrame(int32 frame) const { - CHECK_LT(frame, num_frames_ready_); - return done_ && (frame == num_frames_ready_ - 1); -} - -int32 DecodableTensorScaled::NumIndices() const { - LOG(FATAL) << "Not implement"; - return 0; -} - -CtcWfstBeamSearch::CtcWfstBeamSearch( - const fst::Fst& fst, const CtcWfstBeamSearchOptions& opts, - const std::shared_ptr& context_graph) - : decodable_(opts.acoustic_scale), - decoder_(fst, opts, context_graph), - context_graph_(context_graph), - opts_(opts) { - Reset(); -} - -void CtcWfstBeamSearch::Reset() { - num_frames_ = 0; - decoded_frames_mapping_.clear(); - is_last_frame_blank_ = false; - last_best_ = 0; - inputs_.clear(); - outputs_.clear(); - likelihood_.clear(); - times_.clear(); - decodable_.Reset(); - decoder_.InitDecoding(); -} - -void CtcWfstBeamSearch::Search(const std::vector>& logp) { - if (0 == logp.size()) { - return; - } - // Every time we get the log posterior, we decode it all before return - for (int i = 0; i < logp.size(); i++) { - float blank_score = std::exp(logp[i][0]); - if (blank_score > opts_.blank_skip_thresh * opts_.blank_scale) { - VLOG(3) << "skipping frame " << num_frames_ << " score " << blank_score; - is_last_frame_blank_ = true; - last_frame_prob_ = logp[i]; - } else { - // Get the best symbol - int cur_best = - std::max_element(logp[i].begin(), logp[i].end()) - logp[i].begin(); - // Optional, adding one blank frame if we has skipped it in two same - // symbols - if (cur_best != 0 && is_last_frame_blank_ && cur_best == last_best_) { - decodable_.AcceptLoglikes(last_frame_prob_); - decoder_.AdvanceDecoding(&decodable_, 1); - decoded_frames_mapping_.push_back(num_frames_ - 1); - VLOG(2) << "Adding blank frame at symbol " << cur_best; - } - last_best_ = cur_best; - - decodable_.AcceptLoglikes(logp[i]); - decoder_.AdvanceDecoding(&decodable_, 1); - decoded_frames_mapping_.push_back(num_frames_); - is_last_frame_blank_ = false; - } - num_frames_++; - } - // Get the best path - inputs_.clear(); - outputs_.clear(); - likelihood_.clear(); - if (decoded_frames_mapping_.size() > 0) { - inputs_.resize(1); - outputs_.resize(1); - likelihood_.resize(1); - kaldi::Lattice lat; - decoder_.GetBestPath(&lat, false); - std::vector alignment; - kaldi::LatticeWeight weight; - fst::GetLinearSymbolSequence(lat, &alignment, &outputs_[0], &weight); - ConvertToInputs(alignment, &inputs_[0]); - RemoveContinuousTags(&outputs_[0]); - VLOG(3) << weight.Value1() << " " << weight.Value2(); - likelihood_[0] = 
-(weight.Value1() + weight.Value2()); - } -} - -void CtcWfstBeamSearch::FinalizeSearch() { - decodable_.SetFinish(); - decoder_.FinalizeDecoding(); - inputs_.clear(); - outputs_.clear(); - likelihood_.clear(); - times_.clear(); - if (decoded_frames_mapping_.size() > 0) { - std::vector nbest_lats; - if (opts_.nbest == 1) { - kaldi::Lattice lat; - decoder_.GetBestPath(&lat, true); - nbest_lats.push_back(std::move(lat)); - } else { - // Get N-best path by lattice(CompactLattice) - kaldi::CompactLattice clat; - decoder_.GetLattice(&clat, true); - kaldi::Lattice lat, nbest_lat; - fst::ConvertLattice(clat, &lat); - // TODO(Binbin Zhang): it's n-best word lists here, not character n-best - fst::ShortestPath(lat, &nbest_lat, opts_.nbest); - fst::ConvertNbestToVector(nbest_lat, &nbest_lats); - } - int nbest = nbest_lats.size(); - inputs_.resize(nbest); - outputs_.resize(nbest); - likelihood_.resize(nbest); - times_.resize(nbest); - for (int i = 0; i < nbest; i++) { - kaldi::LatticeWeight weight; - std::vector alignment; - fst::GetLinearSymbolSequence(nbest_lats[i], &alignment, &outputs_[i], - &weight); - ConvertToInputs(alignment, &inputs_[i], ×_[i]); - RemoveContinuousTags(&outputs_[i]); - likelihood_[i] = -(weight.Value1() + weight.Value2()); - } - } -} - -void CtcWfstBeamSearch::ConvertToInputs(const std::vector& alignment, - std::vector* input, - std::vector* time) { - input->clear(); - if (time != nullptr) time->clear(); - for (int cur = 0; cur < alignment.size(); ++cur) { - // ignore blank - if (alignment[cur] - 1 == 0) continue; - // merge continuous same label - if (cur > 0 && alignment[cur] == alignment[cur - 1]) continue; - - input->push_back(alignment[cur] - 1); - if (time != nullptr) { - time->push_back(decoded_frames_mapping_[cur]); - } - } -} - -void CtcWfstBeamSearch::RemoveContinuousTags(std::vector* output) { - if (context_graph_) { - for (auto it = output->begin(); it != output->end();) { - if (*it == context_graph_->start_tag_id() || - *it == context_graph_->end_tag_id()) { - if (it + 1 != output->end() && *it == *(it + 1)) { - it = output->erase(it); - continue; - } - } - ++it; - } - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/ctc_wfst_beam_search.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/ctc_wfst_beam_search.h deleted file mode 100644 index 204a0c8db1254035b7e3bd4a6e02b65d66b756f3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/ctc_wfst_beam_search.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
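// Search() above feeds the Kaldi decoder only those frames whose blank posterior
// does not dominate, and reports each path's likelihood as the negated lattice
// cost, i.e. -(graph cost Value1 + acoustic cost Value2). A minimal sketch of the
// frame-skipping decision, assuming frame_logp holds one frame of log-posteriors
// with blank at index 0; the helper name is illustrative, not from the diff.
#include <cmath>
#include <vector>

static bool ShouldSkipBlankFrame(const std::vector<float>& frame_logp,
                                 float blank_skip_thresh, float blank_scale) {
  float blank_posterior = std::exp(frame_logp[0]);  // blank is id 0
  return blank_posterior > blank_skip_thresh * blank_scale;
}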
- - -#ifndef DECODER_CTC_WFST_BEAM_SEARCH_H_ -#define DECODER_CTC_WFST_BEAM_SEARCH_H_ - -#include -#include - -#include "decoder/context_graph.h" -#include "decoder/search_interface.h" -#include "kaldi/decoder/lattice-faster-online-decoder.h" -#include "utils/utils.h" - -namespace wenet { - -class DecodableTensorScaled : public kaldi::DecodableInterface { - public: - explicit DecodableTensorScaled(float scale = 1.0) : scale_(scale) { Reset(); } - - void Reset(); - int32 NumFramesReady() const override { return num_frames_ready_; } - bool IsLastFrame(int32 frame) const override; - float LogLikelihood(int32 frame, int32 index) override; - int32 NumIndices() const override; - void AcceptLoglikes(const std::vector& logp); - void SetFinish() { done_ = true; } - - private: - int num_frames_ready_ = 0; - float scale_ = 1.0; - bool done_ = false; - std::vector logp_; -}; - -// LatticeFasterDecoderConfig has the following key members -// beam: decoding beam -// max_active: Decoder max active states -// lattice_beam: Lattice generation beam -struct CtcWfstBeamSearchOptions : public kaldi::LatticeFasterDecoderConfig { - float acoustic_scale = 1.0; - float nbest = 10; - // When blank score is greater than this thresh, skip the frame in viterbi - // search - float blank_skip_thresh = 0.98; - float blank_scale = 1.0; -}; - -class CtcWfstBeamSearch : public SearchInterface { - public: - explicit CtcWfstBeamSearch( - const fst::Fst& fst, const CtcWfstBeamSearchOptions& opts, - const std::shared_ptr& context_graph); - void Search(const std::vector>& logp) override; - void Reset() override; - void FinalizeSearch() override; - SearchType Type() const override { return SearchType::kWfstBeamSearch; } - // For CTC prefix beam search, both inputs and outputs are hypotheses_ - const std::vector>& Inputs() const override { - return inputs_; - } - const std::vector>& Outputs() const override { - return outputs_; - } - const std::vector& Likelihood() const override { return likelihood_; } - const std::vector>& Times() const override { return times_; } - - private: - // Sub one and remove - void ConvertToInputs(const std::vector& alignment, - std::vector* input, - std::vector* time = nullptr); - void RemoveContinuousTags(std::vector* output); - - int num_frames_ = 0; - std::vector decoded_frames_mapping_; - - int last_best_ = 0; // last none blank best id - std::vector last_frame_prob_; - bool is_last_frame_blank_ = false; - std::vector> inputs_, outputs_; - std::vector likelihood_; - std::vector> times_; - DecodableTensorScaled decodable_; - kaldi::LatticeFasterOnlineDecoder decoder_; - std::shared_ptr context_graph_; - const CtcWfstBeamSearchOptions& opts_; -}; - -} // namespace wenet - -#endif // DECODER_CTC_WFST_BEAM_SEARCH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/onnx_asr_model.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/onnx_asr_model.cc deleted file mode 100644 index fc7afc704febbde3b7e350e392dc46763c453e74..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/onnx_asr_model.cc +++ /dev/null @@ -1,430 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 ZeXuan Li (lizexuan@huya.com) -// Xingchen Song(sxc19@mails.tsinghua.edu.cn) -// hamddct@gmail.com (Mddct) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "decoder/onnx_asr_model.h" - -#include -#include -#include - -#include "utils/string.h" - -namespace wenet { - -Ort::Env OnnxAsrModel::env_ = Ort::Env(ORT_LOGGING_LEVEL_WARNING, ""); -Ort::SessionOptions OnnxAsrModel::session_options_ = Ort::SessionOptions(); - -void OnnxAsrModel::InitEngineThreads(int num_threads) { - session_options_.SetIntraOpNumThreads(num_threads); -} - -void OnnxAsrModel::GetInputOutputInfo( - const std::shared_ptr& session, - std::vector* in_names, std::vector* out_names) { - Ort::AllocatorWithDefaultOptions allocator; - // Input info - int num_nodes = session->GetInputCount(); - in_names->resize(num_nodes); - for (int i = 0; i < num_nodes; ++i) { - char* name = session->GetInputName(i, allocator); - Ort::TypeInfo type_info = session->GetInputTypeInfo(i); - auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); - ONNXTensorElementDataType type = tensor_info.GetElementType(); - std::vector node_dims = tensor_info.GetShape(); - std::stringstream shape; - for (auto j : node_dims) { - shape << j; - shape << " "; - } - LOG(INFO) << "\tInput " << i << " : name=" << name << " type=" << type - << " dims=" << shape.str(); - (*in_names)[i] = name; - } - // Output info - num_nodes = session->GetOutputCount(); - out_names->resize(num_nodes); - for (int i = 0; i < num_nodes; ++i) { - char* name = session->GetOutputName(i, allocator); - Ort::TypeInfo type_info = session->GetOutputTypeInfo(i); - auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); - ONNXTensorElementDataType type = tensor_info.GetElementType(); - std::vector node_dims = tensor_info.GetShape(); - std::stringstream shape; - for (auto j : node_dims) { - shape << j; - shape << " "; - } - LOG(INFO) << "\tOutput " << i << " : name=" << name << " type=" << type - << " dims=" << shape.str(); - (*out_names)[i] = name; - } -} - -void OnnxAsrModel::Read(const std::string& model_dir) { - std::string encoder_onnx_path = model_dir + "/encoder.onnx"; - std::string rescore_onnx_path = model_dir + "/decoder.onnx"; - std::string ctc_onnx_path = model_dir + "/ctc.onnx"; - - // 1. Load sessions - try { -#ifdef _MSC_VER - encoder_session_ = std::make_shared( - env_, ToWString(encoder_onnx_path).c_str(), session_options_); - rescore_session_ = std::make_shared( - env_, ToWString(rescore_onnx_path).c_str(), session_options_); - ctc_session_ = std::make_shared( - env_, ToWString(ctc_onnx_path).c_str(), session_options_); -#else - encoder_session_ = std::make_shared( - env_, encoder_onnx_path.c_str(), session_options_); - rescore_session_ = std::make_shared( - env_, rescore_onnx_path.c_str(), session_options_); - ctc_session_ = std::make_shared(env_, ctc_onnx_path.c_str(), - session_options_); -#endif - } catch (std::exception const& e) { - LOG(ERROR) << "error when load onnx model: " << e.what(); - exit(0); - } - - // 2. 
Read metadata - auto model_metadata = encoder_session_->GetModelMetadata(); - - Ort::AllocatorWithDefaultOptions allocator; - encoder_output_size_ = - atoi(model_metadata.LookupCustomMetadataMap("output_size", allocator)); - num_blocks_ = - atoi(model_metadata.LookupCustomMetadataMap("num_blocks", allocator)); - head_ = atoi(model_metadata.LookupCustomMetadataMap("head", allocator)); - cnn_module_kernel_ = atoi( - model_metadata.LookupCustomMetadataMap("cnn_module_kernel", allocator)); - subsampling_rate_ = atoi( - model_metadata.LookupCustomMetadataMap("subsampling_rate", allocator)); - right_context_ = - atoi(model_metadata.LookupCustomMetadataMap("right_context", allocator)); - sos_ = atoi(model_metadata.LookupCustomMetadataMap("sos_symbol", allocator)); - eos_ = atoi(model_metadata.LookupCustomMetadataMap("eos_symbol", allocator)); - is_bidirectional_decoder_ = atoi(model_metadata.LookupCustomMetadataMap( - "is_bidirectional_decoder", allocator)); - chunk_size_ = - atoi(model_metadata.LookupCustomMetadataMap("chunk_size", allocator)); - num_left_chunks_ = - atoi(model_metadata.LookupCustomMetadataMap("left_chunks", allocator)); - - LOG(INFO) << "Onnx Model Info:"; - LOG(INFO) << "\tencoder_output_size " << encoder_output_size_; - LOG(INFO) << "\tnum_blocks " << num_blocks_; - LOG(INFO) << "\thead " << head_; - LOG(INFO) << "\tcnn_module_kernel " << cnn_module_kernel_; - LOG(INFO) << "\tsubsampling_rate " << subsampling_rate_; - LOG(INFO) << "\tright_context " << right_context_; - LOG(INFO) << "\tsos " << sos_; - LOG(INFO) << "\teos " << eos_; - LOG(INFO) << "\tis bidirectional decoder " << is_bidirectional_decoder_; - LOG(INFO) << "\tchunk_size " << chunk_size_; - LOG(INFO) << "\tnum_left_chunks " << num_left_chunks_; - - // 3. Read model nodes - LOG(INFO) << "Onnx Encoder:"; - GetInputOutputInfo(encoder_session_, &encoder_in_names_, &encoder_out_names_); - LOG(INFO) << "Onnx CTC:"; - GetInputOutputInfo(ctc_session_, &ctc_in_names_, &ctc_out_names_); - LOG(INFO) << "Onnx Rescore:"; - GetInputOutputInfo(rescore_session_, &rescore_in_names_, &rescore_out_names_); -} - -OnnxAsrModel::OnnxAsrModel(const OnnxAsrModel& other) { - // metadatas - encoder_output_size_ = other.encoder_output_size_; - num_blocks_ = other.num_blocks_; - head_ = other.head_; - cnn_module_kernel_ = other.cnn_module_kernel_; - right_context_ = other.right_context_; - subsampling_rate_ = other.subsampling_rate_; - sos_ = other.sos_; - eos_ = other.eos_; - is_bidirectional_decoder_ = other.is_bidirectional_decoder_; - chunk_size_ = other.chunk_size_; - num_left_chunks_ = other.num_left_chunks_; - offset_ = other.offset_; - - // sessions - encoder_session_ = other.encoder_session_; - ctc_session_ = other.ctc_session_; - rescore_session_ = other.rescore_session_; - - // node names - encoder_in_names_ = other.encoder_in_names_; - encoder_out_names_ = other.encoder_out_names_; - ctc_in_names_ = other.ctc_in_names_; - ctc_out_names_ = other.ctc_out_names_; - rescore_in_names_ = other.rescore_in_names_; - rescore_out_names_ = other.rescore_out_names_; -} - -std::shared_ptr OnnxAsrModel::Copy() const { - auto asr_model = std::make_shared(*this); - // Reset the inner states for new decoding - asr_model->Reset(); - return asr_model; -} - -void OnnxAsrModel::Reset() { - offset_ = 0; - encoder_outs_.clear(); - cached_feature_.clear(); - // Reset att_cache - Ort::MemoryInfo memory_info = - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); - if (num_left_chunks_ > 0) { - int required_cache_size = chunk_size_ * 
num_left_chunks_; - offset_ = required_cache_size; - att_cache_.resize(num_blocks_ * head_ * required_cache_size * - encoder_output_size_ / head_ * 2, - 0.0); - const int64_t att_cache_shape[] = {num_blocks_, head_, required_cache_size, - encoder_output_size_ / head_ * 2}; - att_cache_ort_ = Ort::Value::CreateTensor( - memory_info, att_cache_.data(), att_cache_.size(), att_cache_shape, 4); - } else { - att_cache_.resize(0, 0.0); - const int64_t att_cache_shape[] = {num_blocks_, head_, 0, - encoder_output_size_ / head_ * 2}; - att_cache_ort_ = Ort::Value::CreateTensor( - memory_info, att_cache_.data(), att_cache_.size(), att_cache_shape, 4); - } - - // Reset cnn_cache - cnn_cache_.resize( - num_blocks_ * encoder_output_size_ * (cnn_module_kernel_ - 1), 0.0); - const int64_t cnn_cache_shape[] = {num_blocks_, 1, encoder_output_size_, - cnn_module_kernel_ - 1}; - cnn_cache_ort_ = Ort::Value::CreateTensor( - memory_info, cnn_cache_.data(), cnn_cache_.size(), cnn_cache_shape, 4); -} - -void OnnxAsrModel::ForwardEncoderFunc( - const std::vector>& chunk_feats, - std::vector>* out_prob) { - Ort::MemoryInfo memory_info = - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); - // 1. Prepare onnx required data, splice cached_feature_ and chunk_feats - // chunk - int num_frames = cached_feature_.size() + chunk_feats.size(); - const int feature_dim = chunk_feats[0].size(); - std::vector feats; - for (size_t i = 0; i < cached_feature_.size(); ++i) { - feats.insert(feats.end(), cached_feature_[i].begin(), - cached_feature_[i].end()); - } - for (size_t i = 0; i < chunk_feats.size(); ++i) { - feats.insert(feats.end(), chunk_feats[i].begin(), chunk_feats[i].end()); - } - const int64_t feats_shape[3] = {1, num_frames, feature_dim}; - Ort::Value feats_ort = Ort::Value::CreateTensor( - memory_info, feats.data(), feats.size(), feats_shape, 3); - // offset - int64_t offset_int64 = static_cast(offset_); - Ort::Value offset_ort = Ort::Value::CreateTensor( - memory_info, &offset_int64, 1, std::vector{}.data(), 0); - // required_cache_size - int64_t required_cache_size = chunk_size_ * num_left_chunks_; - Ort::Value required_cache_size_ort = Ort::Value::CreateTensor( - memory_info, &required_cache_size, 1, std::vector{}.data(), 0); - // att_mask - Ort::Value att_mask_ort{nullptr}; - std::vector att_mask(required_cache_size + chunk_size_, 1); - if (num_left_chunks_ > 0) { - int chunk_idx = offset_ / chunk_size_ - num_left_chunks_; - if (chunk_idx < num_left_chunks_) { - for (int i = 0; i < (num_left_chunks_ - chunk_idx) * chunk_size_; ++i) { - att_mask[i] = 0; - } - } - const int64_t att_mask_shape[] = {1, 1, required_cache_size + chunk_size_}; - att_mask_ort = Ort::Value::CreateTensor( - memory_info, reinterpret_cast(att_mask.data()), att_mask.size(), - att_mask_shape, 3); - } - - // 2. 
Encoder chunk forward - std::vector inputs; - for (auto name : encoder_in_names_) { - if (!strcmp(name, "chunk")) { - inputs.emplace_back(std::move(feats_ort)); - } else if (!strcmp(name, "offset")) { - inputs.emplace_back(std::move(offset_ort)); - } else if (!strcmp(name, "required_cache_size")) { - inputs.emplace_back(std::move(required_cache_size_ort)); - } else if (!strcmp(name, "att_cache")) { - inputs.emplace_back(std::move(att_cache_ort_)); - } else if (!strcmp(name, "cnn_cache")) { - inputs.emplace_back(std::move(cnn_cache_ort_)); - } else if (!strcmp(name, "att_mask")) { - inputs.emplace_back(std::move(att_mask_ort)); - } - } - - std::vector ort_outputs = encoder_session_->Run( - Ort::RunOptions{nullptr}, encoder_in_names_.data(), inputs.data(), - inputs.size(), encoder_out_names_.data(), encoder_out_names_.size()); - - offset_ += static_cast( - ort_outputs[0].GetTensorTypeAndShapeInfo().GetShape()[1]); - att_cache_ort_ = std::move(ort_outputs[1]); - cnn_cache_ort_ = std::move(ort_outputs[2]); - - std::vector ctc_inputs; - ctc_inputs.emplace_back(std::move(ort_outputs[0])); - - std::vector ctc_ort_outputs = ctc_session_->Run( - Ort::RunOptions{nullptr}, ctc_in_names_.data(), ctc_inputs.data(), - ctc_inputs.size(), ctc_out_names_.data(), ctc_out_names_.size()); - encoder_outs_.push_back(std::move(ctc_inputs[0])); - - float* logp_data = ctc_ort_outputs[0].GetTensorMutableData(); - auto type_info = ctc_ort_outputs[0].GetTensorTypeAndShapeInfo(); - - int num_outputs = type_info.GetShape()[1]; - int output_dim = type_info.GetShape()[2]; - out_prob->resize(num_outputs); - for (int i = 0; i < num_outputs; i++) { - (*out_prob)[i].resize(output_dim); - memcpy((*out_prob)[i].data(), logp_data + i * output_dim, - sizeof(float) * output_dim); - } -} - -float OnnxAsrModel::ComputeAttentionScore(const float* prob, - const std::vector& hyp, int eos, - int decode_out_len) { - float score = 0.0f; - for (size_t j = 0; j < hyp.size(); ++j) { - score += *(prob + j * decode_out_len + hyp[j]); - } - score += *(prob + hyp.size() * decode_out_len + eos); - return score; -} - -void OnnxAsrModel::AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) { - Ort::MemoryInfo memory_info = - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); - CHECK(rescoring_score != nullptr); - int num_hyps = hyps.size(); - rescoring_score->resize(num_hyps, 0.0f); - - if (num_hyps == 0) { - return; - } - // No encoder output - if (encoder_outs_.size() == 0) { - return; - } - - std::vector hyps_lens; - int max_hyps_len = 0; - for (size_t i = 0; i < num_hyps; ++i) { - int length = hyps[i].size() + 1; - max_hyps_len = std::max(length, max_hyps_len); - hyps_lens.emplace_back(static_cast(length)); - } - - std::vector rescore_input; - int encoder_len = 0; - for (int i = 0; i < encoder_outs_.size(); i++) { - float* encoder_outs_data = encoder_outs_[i].GetTensorMutableData(); - auto type_info = encoder_outs_[i].GetTensorTypeAndShapeInfo(); - for (int j = 0; j < type_info.GetElementCount(); j++) { - rescore_input.emplace_back(encoder_outs_data[j]); - } - encoder_len += type_info.GetShape()[1]; - } - - const int64_t decode_input_shape[] = {1, encoder_len, encoder_output_size_}; - - std::vector hyps_pad; - - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - hyps_pad.emplace_back(sos_); - size_t j = 0; - for (; j < hyp.size(); ++j) { - hyps_pad.emplace_back(hyp[j]); - } - if (j == max_hyps_len - 1) { - continue; - } - for (; j < max_hyps_len - 1; ++j) { 
- hyps_pad.emplace_back(0); - } - } - - const int64_t hyps_pad_shape[] = {num_hyps, max_hyps_len}; - - const int64_t hyps_lens_shape[] = {num_hyps}; - - Ort::Value decode_input_tensor_ = Ort::Value::CreateTensor( - memory_info, rescore_input.data(), rescore_input.size(), - decode_input_shape, 3); - Ort::Value hyps_pad_tensor_ = Ort::Value::CreateTensor( - memory_info, hyps_pad.data(), hyps_pad.size(), hyps_pad_shape, 2); - Ort::Value hyps_lens_tensor_ = Ort::Value::CreateTensor( - memory_info, hyps_lens.data(), hyps_lens.size(), hyps_lens_shape, 1); - - std::vector rescore_inputs; - - rescore_inputs.emplace_back(std::move(hyps_pad_tensor_)); - rescore_inputs.emplace_back(std::move(hyps_lens_tensor_)); - rescore_inputs.emplace_back(std::move(decode_input_tensor_)); - - std::vector rescore_outputs = rescore_session_->Run( - Ort::RunOptions{nullptr}, rescore_in_names_.data(), rescore_inputs.data(), - rescore_inputs.size(), rescore_out_names_.data(), - rescore_out_names_.size()); - - float* decoder_outs_data = rescore_outputs[0].GetTensorMutableData(); - float* r_decoder_outs_data = rescore_outputs[1].GetTensorMutableData(); - - auto type_info = rescore_outputs[0].GetTensorTypeAndShapeInfo(); - int decode_out_len = type_info.GetShape()[2]; - - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - float score = 0.0f; - // left to right decoder score - score = ComputeAttentionScore( - decoder_outs_data + max_hyps_len * decode_out_len * i, hyp, eos_, - decode_out_len); - // Optional: Used for right to left score - float r_score = 0.0f; - if (is_bidirectional_decoder_ && reverse_weight > 0) { - std::vector r_hyp(hyp.size()); - std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin()); - // right to left decoder score - r_score = ComputeAttentionScore( - r_decoder_outs_data + max_hyps_len * decode_out_len * i, r_hyp, eos_, - decode_out_len); - } - // combined left-to-right and right-to-left score - (*rescoring_score)[i] = - score * (1 - reverse_weight) + r_score * reverse_weight; - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/onnx_asr_model.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/onnx_asr_model.h deleted file mode 100644 index f5d9e9a0c61d728f2fb6d45d1428234abae98c90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/onnx_asr_model.h +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 ZeXuan Li (lizexuan@huya.com) -// Xingchen Song(sxc19@mails.tsinghua.edu.cn) -// hamddct@gmail.com (Mddct) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef DECODER_ONNX_ASR_MODEL_H_ -#define DECODER_ONNX_ASR_MODEL_H_ - -#include -#include -#include - -#include "onnxruntime_cxx_api.h" // NOLINT - -#include "decoder/asr_model.h" -#include "utils/log.h" -#include "utils/utils.h" - -namespace wenet { - -class OnnxAsrModel : public AsrModel { - public: - static void InitEngineThreads(int num_threads = 1); - - public: - OnnxAsrModel() = default; - OnnxAsrModel(const OnnxAsrModel& other); - void Read(const std::string& model_dir); - void Reset() override; - void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) override; - std::shared_ptr Copy() const override; - void GetInputOutputInfo(const std::shared_ptr& session, - std::vector* in_names, - std::vector* out_names); - - protected: - void ForwardEncoderFunc(const std::vector>& chunk_feats, - std::vector>* ctc_prob) override; - - float ComputeAttentionScore(const float* prob, const std::vector& hyp, - int eos, int decode_out_len); - - private: - int encoder_output_size_ = 0; - int num_blocks_ = 0; - int cnn_module_kernel_ = 0; - int head_ = 0; - - // sessions - // NOTE(Mddct): The Env holds the logging state used by all other objects. - // One Env must be created before using any other Onnxruntime functionality. - static Ort::Env env_; // shared environment across threads. - static Ort::SessionOptions session_options_; - std::shared_ptr encoder_session_ = nullptr; - std::shared_ptr rescore_session_ = nullptr; - std::shared_ptr ctc_session_ = nullptr; - - // node names - std::vector encoder_in_names_, encoder_out_names_; - std::vector ctc_in_names_, ctc_out_names_; - std::vector rescore_in_names_, rescore_out_names_; - - // caches - Ort::Value att_cache_ort_{nullptr}; - Ort::Value cnn_cache_ort_{nullptr}; - std::vector encoder_outs_; - // NOTE: Instead of making a copy of the xx_cache, ONNX only maintains - // its data pointer when initializing xx_cache_ort (see https://github.com/ - // microsoft/onnxruntime/blob/master/onnxruntime/core/framework - // /tensor.cc#L102-L129), so we need the following variables to keep - // our data "alive" during the lifetime of decoder. - std::vector att_cache_; - std::vector cnn_cache_; -}; - -} // namespace wenet - -#endif // DECODER_ONNX_ASR_MODEL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/params.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/params.h deleted file mode 100644 index 3edc877f1bb6d876ca087cab8e4ed00d42e97e63..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/params.h +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
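// The NOTE at the end of onnx_asr_model.h above is the reason att_cache_ and
// cnn_cache_ are kept as plain std::vector<float> members: Ort::Value::CreateTensor
// only wraps the caller's buffer, it does not copy it, so the backing vector must
// outlive the tensor. A minimal sketch of that keep-alive pattern (an illustration,
// not code from this diff).
#include <cstdint>
#include <vector>

#include "onnxruntime_cxx_api.h"  // NOLINT

// `backing` must stay alive for as long as the returned Ort::Value is used,
// because CreateTensor stores a pointer to its data instead of copying it.
static Ort::Value WrapAsTensor(std::vector<float>& backing,
                               const std::vector<int64_t>& shape) {
  Ort::MemoryInfo memory_info =
      Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
  return Ort::Value::CreateTensor<float>(memory_info, backing.data(),
                                         backing.size(), shape.data(),
                                         shape.size());
}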
- -#ifndef DECODER_PARAMS_H_ -#define DECODER_PARAMS_H_ - -#include -#include -#include -#include - -#include "decoder/asr_decoder.h" -#ifdef USE_ONNX -#include "decoder/onnx_asr_model.h" -#endif -#ifdef USE_TORCH -#include "decoder/torch_asr_model.h" -#endif -#ifdef USE_XPU -#include "xpu/xpu_asr_model.h" -#endif -#ifdef USE_BPU -#include "bpu/bpu_asr_model.h" -#endif -#include "frontend/feature_pipeline.h" -#include "post_processor/post_processor.h" -#include "utils/flags.h" -#include "utils/string.h" - -DEFINE_int32(device_id, 0, "set XPU DeviceID for ASR model"); - -// TorchAsrModel flags -DEFINE_string(model_path, "", "pytorch exported model path"); -// OnnxAsrModel flags -DEFINE_string(onnx_dir, "", "directory where the onnx model is saved"); -// XPUAsrModel flags -DEFINE_string(xpu_model_dir, "", - "directory where the XPU model and weights is saved"); -// BPUAsrModel flags -DEFINE_string(bpu_model_dir, "", - "directory where the HORIZON BPU model is saved"); - -// FeaturePipelineConfig flags -DEFINE_int32(num_bins, 80, "num mel bins for fbank feature"); -DEFINE_int32(sample_rate, 16000, "sample rate for audio"); - -// TLG fst -DEFINE_string(fst_path, "", "TLG fst path"); - -// DecodeOptions flags -DEFINE_int32(chunk_size, 16, "decoding chunk size"); -DEFINE_int32(num_left_chunks, -1, "left chunks in decoding"); -DEFINE_double(ctc_weight, 0.5, - "ctc weight when combining ctc score and rescoring score"); -DEFINE_double(rescoring_weight, 1.0, - "rescoring weight when combining ctc score and rescoring score"); -DEFINE_double(reverse_weight, 0.0, - "used for bitransformer rescoring. it must be 0.0 if decoder is" - "conventional transformer decoder, and only reverse_weight > 0.0" - "dose the right to left decoder will be calculated and used"); -DEFINE_int32(max_active, 7000, "max active states in ctc wfst search"); -DEFINE_int32(min_active, 200, "min active states in ctc wfst search"); -DEFINE_double(beam, 16.0, "beam in ctc wfst search"); -DEFINE_double(lattice_beam, 10.0, "lattice beam in ctc wfst search"); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale for ctc wfst search"); -DEFINE_double(blank_skip_thresh, 1.0, - "blank skip thresh for ctc wfst search, 1.0 means no skip"); -DEFINE_double(blank_scale, 1.0, "blank scale for ctc wfst search"); -DEFINE_double(length_penalty, 0.0, - "length penalty ctc wfst search, will not" - "apply on self-loop arc, for balancing the del/ins ratio, " - "suggest set to -3.0"); -DEFINE_int32(nbest, 10, "nbest for ctc wfst or prefix search"); - -// SymbolTable flags -DEFINE_string(dict_path, "", - "dict symbol table path, required when LM is enabled"); -DEFINE_string(unit_path, "", - "e2e model unit symbol table, it is used in both " - "with/without LM scenarios for context/timestamp"); - -// Context flags -DEFINE_string(context_path, "", "context path, is used to build context graph"); -DEFINE_double(context_score, 3.0, "is used to rescore the decoded result"); - -// PostProcessOptions flags -DEFINE_int32(language_type, 0, - "remove spaces according to language type" - "0x00 = kMandarinEnglish, " - "0x01 = kIndoEuropean"); -DEFINE_bool(lowercase, true, "lowercase final result if needed"); - -namespace wenet { -std::shared_ptr InitFeaturePipelineConfigFromFlags() { - auto feature_config = std::make_shared( - FLAGS_num_bins, FLAGS_sample_rate); - return feature_config; -} - -std::shared_ptr InitDecodeOptionsFromFlags() { - auto decode_config = std::make_shared(); - decode_config->chunk_size = FLAGS_chunk_size; - decode_config->num_left_chunks = 
FLAGS_num_left_chunks; - decode_config->ctc_weight = FLAGS_ctc_weight; - decode_config->reverse_weight = FLAGS_reverse_weight; - decode_config->rescoring_weight = FLAGS_rescoring_weight; - decode_config->ctc_wfst_search_opts.max_active = FLAGS_max_active; - decode_config->ctc_wfst_search_opts.min_active = FLAGS_min_active; - decode_config->ctc_wfst_search_opts.beam = FLAGS_beam; - decode_config->ctc_wfst_search_opts.lattice_beam = FLAGS_lattice_beam; - decode_config->ctc_wfst_search_opts.acoustic_scale = FLAGS_acoustic_scale; - decode_config->ctc_wfst_search_opts.blank_skip_thresh = - FLAGS_blank_skip_thresh; - decode_config->ctc_wfst_search_opts.blank_scale = FLAGS_blank_scale; - decode_config->ctc_wfst_search_opts.length_penalty = FLAGS_length_penalty; - decode_config->ctc_wfst_search_opts.nbest = FLAGS_nbest; - decode_config->ctc_prefix_search_opts.first_beam_size = FLAGS_nbest; - decode_config->ctc_prefix_search_opts.second_beam_size = FLAGS_nbest; - return decode_config; -} - -std::shared_ptr InitDecodeResourceFromFlags() { - auto resource = std::make_shared(); - const int kNumGemmThreads = 1; - if (!FLAGS_onnx_dir.empty()) { -#ifdef USE_ONNX - LOG(INFO) << "Reading onnx model "; - OnnxAsrModel::InitEngineThreads(kNumGemmThreads); - auto model = std::make_shared(); - model->Read(FLAGS_onnx_dir); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DONNX=ON'."; -#endif - } else if (!FLAGS_model_path.empty()) { -#ifdef USE_TORCH - LOG(INFO) << "Reading torch model " << FLAGS_model_path; - TorchAsrModel::InitEngineThreads(kNumGemmThreads); - auto model = std::make_shared(); - model->Read(FLAGS_model_path); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DTORCH=ON'."; -#endif - } else if (!FLAGS_xpu_model_dir.empty()) { -#ifdef USE_XPU - LOG(INFO) << "Reading XPU WeNet model weight from " << FLAGS_xpu_model_dir; - auto model = std::make_shared(); - model->SetEngineThreads(kNumGemmThreads); - model->SetDeviceId(FLAGS_device_id); - model->Read(FLAGS_xpu_model_dir); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DXPU=ON'."; -#endif - } else if (!FLAGS_bpu_model_dir.empty()) { -#ifdef USE_BPU - LOG(INFO) << "Reading Horizon BPU model from " << FLAGS_bpu_model_dir; - auto model = std::make_shared(); - model->Read(FLAGS_bpu_model_dir); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DBPU=ON'."; -#endif - } else { - LOG(FATAL) << "Please set ONNX, TORCH, XPU or BPU model path!!!"; - } - - LOG(INFO) << "Reading unit table " << FLAGS_unit_path; - auto unit_table = std::shared_ptr( - fst::SymbolTable::ReadText(FLAGS_unit_path)); - CHECK(unit_table != nullptr); - resource->unit_table = unit_table; - - if (!FLAGS_fst_path.empty()) { // With LM - CHECK(!FLAGS_dict_path.empty()); - LOG(INFO) << "Reading fst " << FLAGS_fst_path; - auto fst = std::shared_ptr>( - fst::Fst::Read(FLAGS_fst_path)); - CHECK(fst != nullptr); - resource->fst = fst; - - LOG(INFO) << "Reading symbol table " << FLAGS_dict_path; - auto symbol_table = std::shared_ptr( - fst::SymbolTable::ReadText(FLAGS_dict_path)); - CHECK(symbol_table != nullptr); - resource->symbol_table = symbol_table; - } else { // Without LM, symbol_table is the same as unit_table - resource->symbol_table = unit_table; - } - - if (!FLAGS_context_path.empty()) { - LOG(INFO) << "Reading context " << FLAGS_context_path; - std::vector contexts; - std::ifstream infile(FLAGS_context_path); - std::string context; - 
while (getline(infile, context)) { - contexts.emplace_back(Trim(context)); - } - ContextConfig config; - config.context_score = FLAGS_context_score; - resource->context_graph = std::make_shared(config); - resource->context_graph->BuildContextGraph(contexts, - resource->symbol_table); - } - - PostProcessOptions post_process_opts; - post_process_opts.language_type = - FLAGS_language_type == 0 ? kMandarinEnglish : kIndoEuropean; - post_process_opts.lowercase = FLAGS_lowercase; - resource->post_processor = - std::make_shared(std::move(post_process_opts)); - return resource; -} - -} // namespace wenet - -#endif // DECODER_PARAMS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/search_interface.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/search_interface.h deleted file mode 100644 index 25bad26705f8be44561d2c686f50a63035b14bbf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/search_interface.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_SEARCH_INTERFACE_H_ -#define DECODER_SEARCH_INTERFACE_H_ - -namespace wenet { - -#include - -enum SearchType { - kPrefixBeamSearch = 0x00, - kWfstBeamSearch = 0x01, -}; - -class SearchInterface { - public: - virtual ~SearchInterface() {} - virtual void Search(const std::vector>& logp) = 0; - virtual void Reset() = 0; - virtual void FinalizeSearch() = 0; - - virtual SearchType Type() const = 0; - // N-best inputs id - virtual const std::vector>& Inputs() const = 0; - // N-best outputs id - virtual const std::vector>& Outputs() const = 0; - // N-best likelihood - virtual const std::vector& Likelihood() const = 0; - // N-best timestamp - virtual const std::vector>& Times() const = 0; -}; - -} // namespace wenet - -#endif // DECODER_SEARCH_INTERFACE_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/torch_asr_model.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/torch_asr_model.cc deleted file mode 100644 index 3abca283e12f5c173c9511707229ea82b31f26d8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/torch_asr_model.cc +++ /dev/null @@ -1,278 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
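// search_interface.h above is the contract both CtcPrefixBeamSearch and
// CtcWfstBeamSearch implement: feed Search() chunks of per-frame CTC
// log-posteriors as they arrive, call FinalizeSearch() once the stream ends, then
// read the n-best Outputs(), Likelihood() and Times(). A minimal driver sketch
// under the assumption of some posterior source; GetNextChunkLogp is a
// hypothetical stand-in, not part of the diff.
#include <vector>

#include "decoder/search_interface.h"

bool GetNextChunkLogp(std::vector<std::vector<float>>* chunk_logp);  // hypothetical

void DecodeUtterance(wenet::SearchInterface* search) {
  search->Reset();
  std::vector<std::vector<float>> chunk_logp;
  while (GetNextChunkLogp(&chunk_logp)) {  // one chunk of log-posteriors
    search->Search(chunk_logp);            // incremental beam update
  }
  search->FinalizeSearch();                // settle n-best and timestamps
  if (!search->Outputs().empty()) {
    const std::vector<int>& best = search->Outputs()[0];  // top hypothesis ids
    float best_score = search->Likelihood()[0];
    (void)best;
    (void)best_score;
  }
}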
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "decoder/torch_asr_model.h" - -#include -#include -#include -#include - -#include "torch/script.h" -#ifndef IOS -#include "torch/torch.h" -#endif - -namespace wenet { - -#ifndef IOS -void TorchAsrModel::InitEngineThreads(int num_threads) { - // For multi-thread performance - at::set_num_threads(num_threads); - VLOG(1) << "Num intra-op threads: " << at::get_num_threads(); -} -#endif - -void TorchAsrModel::Read(const std::string& model_path) { - torch::DeviceType device = at::kCPU; -#ifdef USE_GPU - if (!torch::cuda::is_available()) { - VLOG(1) << "CUDA is not available! Please check your GPU settings"; - throw std::runtime_error("CUDA is not available!"); - } else { - VLOG(1) << "CUDA available! Running on GPU"; - device = at::kCUDA; - } -#endif - torch::jit::script::Module model = torch::jit::load(model_path, device); - model_ = std::make_shared(std::move(model)); - torch::NoGradGuard no_grad; - model_->eval(); - torch::jit::IValue o1 = model_->run_method("subsampling_rate"); - CHECK_EQ(o1.isInt(), true); - subsampling_rate_ = o1.toInt(); - torch::jit::IValue o2 = model_->run_method("right_context"); - CHECK_EQ(o2.isInt(), true); - right_context_ = o2.toInt(); - torch::jit::IValue o3 = model_->run_method("sos_symbol"); - CHECK_EQ(o3.isInt(), true); - sos_ = o3.toInt(); - torch::jit::IValue o4 = model_->run_method("eos_symbol"); - CHECK_EQ(o4.isInt(), true); - eos_ = o4.toInt(); - torch::jit::IValue o5 = model_->run_method("is_bidirectional_decoder"); - CHECK_EQ(o5.isBool(), true); - is_bidirectional_decoder_ = o5.toBool(); - - VLOG(1) << "Torch Model Info:"; - VLOG(1) << "\tsubsampling_rate " << subsampling_rate_; - VLOG(1) << "\tright context " << right_context_; - VLOG(1) << "\tsos " << sos_; - VLOG(1) << "\teos " << eos_; - VLOG(1) << "\tis bidirectional decoder " << is_bidirectional_decoder_; -} - -TorchAsrModel::TorchAsrModel(const TorchAsrModel& other) { - // 1. Init the model info - right_context_ = other.right_context_; - subsampling_rate_ = other.subsampling_rate_; - sos_ = other.sos_; - eos_ = other.eos_; - is_bidirectional_decoder_ = other.is_bidirectional_decoder_; - chunk_size_ = other.chunk_size_; - num_left_chunks_ = other.num_left_chunks_; - offset_ = other.offset_; - // 2. Model copy, just copy the model ptr since: - // PyTorch allows using multiple CPU threads during TorchScript model - // inference, please see https://pytorch.org/docs/stable/notes/cpu_ - // threading_torchscript_inference.html - model_ = other.model_; - - // NOTE(Binbin Zhang): - // inner states for forward are not copied here. -} - -std::shared_ptr TorchAsrModel::Copy() const { - auto asr_model = std::make_shared(*this); - // Reset the inner states for new decoding - asr_model->Reset(); - return asr_model; -} - -void TorchAsrModel::Reset() { - offset_ = 0; - att_cache_ = std::move(torch::zeros({0, 0, 0, 0})); - cnn_cache_ = std::move(torch::zeros({0, 0, 0, 0})); - encoder_outs_.clear(); - cached_feature_.clear(); -} - -void TorchAsrModel::ForwardEncoderFunc( - const std::vector>& chunk_feats, - std::vector>* out_prob) { - // 1. Prepare libtorch required data, splice cached_feature_ and chunk_feats - // The first dimension is for batchsize, which is 1. 
- int num_frames = cached_feature_.size() + chunk_feats.size(); - const int feature_dim = chunk_feats[0].size(); - torch::Tensor feats = - torch::zeros({1, num_frames, feature_dim}, torch::kFloat); - for (size_t i = 0; i < cached_feature_.size(); ++i) { - torch::Tensor row = - torch::from_blob(const_cast(cached_feature_[i].data()), - {feature_dim}, torch::kFloat) - .clone(); - feats[0][i] = std::move(row); - } - for (size_t i = 0; i < chunk_feats.size(); ++i) { - torch::Tensor row = - torch::from_blob(const_cast(chunk_feats[i].data()), - {feature_dim}, torch::kFloat) - .clone(); - feats[0][cached_feature_.size() + i] = std::move(row); - } - - // 2. Encoder chunk forward -#ifdef USE_GPU - feats = feats.to(at::kCUDA); - att_cache_ = att_cache_.to(at::kCUDA); - cnn_cache_ = cnn_cache_.to(at::kCUDA); -#endif - int required_cache_size = chunk_size_ * num_left_chunks_; - torch::NoGradGuard no_grad; - std::vector inputs = {feats, offset_, required_cache_size, - att_cache_, cnn_cache_}; - - // Refer interfaces in wenet/transformer/asr_model.py - auto outputs = - model_->get_method("forward_encoder_chunk")(inputs).toTuple()->elements(); - CHECK_EQ(outputs.size(), 3); -#ifdef USE_GPU - torch::Tensor chunk_out = outputs[0].toTensor().to(at::kCPU); - att_cache_ = outputs[1].toTensor().to(at::kCPU); - cnn_cache_ = outputs[2].toTensor().to(at::kCPU); -#else - torch::Tensor chunk_out = outputs[0].toTensor(); - att_cache_ = outputs[1].toTensor(); - cnn_cache_ = outputs[2].toTensor(); -#endif - offset_ += chunk_out.size(1); - - // The first dimension of returned value is for batchsize, which is 1 -#ifdef USE_GPU - chunk_out = chunk_out.to(at::kCUDA); - torch::Tensor ctc_log_probs = - model_->run_method("ctc_activation", chunk_out).toTensor(); - ctc_log_probs = ctc_log_probs.to(at::kCPU)[0]; - encoder_outs_.push_back(std::move(chunk_out.to(at::kCPU))); -#else - torch::Tensor ctc_log_probs = - model_->run_method("ctc_activation", chunk_out).toTensor()[0]; - encoder_outs_.push_back(std::move(chunk_out)); -#endif - - // Copy to output - int num_outputs = ctc_log_probs.size(0); - int output_dim = ctc_log_probs.size(1); - out_prob->resize(num_outputs); - for (int i = 0; i < num_outputs; i++) { - (*out_prob)[i].resize(output_dim); - memcpy((*out_prob)[i].data(), ctc_log_probs[i].data_ptr(), - sizeof(float) * output_dim); - } -} - -float TorchAsrModel::ComputeAttentionScore(const torch::Tensor& prob, - const std::vector& hyp, - int eos) { - float score = 0.0f; - auto accessor = prob.accessor(); - for (size_t j = 0; j < hyp.size(); ++j) { - score += accessor[j][hyp[j]]; - } - score += accessor[hyp.size()][eos]; - return score; -} - -void TorchAsrModel::AttentionRescoring( - const std::vector>& hyps, float reverse_weight, - std::vector* rescoring_score) { - CHECK(rescoring_score != nullptr); - int num_hyps = hyps.size(); - rescoring_score->resize(num_hyps, 0.0f); - - if (num_hyps == 0) { - return; - } - // No encoder output - if (encoder_outs_.size() == 0) { - return; - } - - torch::NoGradGuard no_grad; - // Step 1: Prepare input for libtorch - torch::Tensor hyps_length = torch::zeros({num_hyps}, torch::kLong); - int max_hyps_len = 0; - for (size_t i = 0; i < num_hyps; ++i) { - int length = hyps[i].size() + 1; - max_hyps_len = std::max(length, max_hyps_len); - hyps_length[i] = static_cast(length); - } - torch::Tensor hyps_tensor = - torch::zeros({num_hyps, max_hyps_len}, torch::kLong); - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - hyps_tensor[i][0] = sos_; - for (size_t j = 0; j < 
hyp.size(); ++j) { - hyps_tensor[i][j + 1] = hyp[j]; - } - } - - // Step 2: Forward attention decoder by hyps and corresponding encoder_outs_ - torch::Tensor encoder_out = torch::cat(encoder_outs_, 1); -#ifdef USE_GPU - hyps_tensor = hyps_tensor.to(at::kCUDA); - hyps_length = hyps_length.to(at::kCUDA); - encoder_out = encoder_out.to(at::kCUDA); -#endif - auto outputs = model_ - ->run_method("forward_attention_decoder", hyps_tensor, - hyps_length, encoder_out, reverse_weight) - .toTuple() - ->elements(); -#ifdef USE_GPU - auto probs = outputs[0].toTensor().to(at::kCPU); - auto r_probs = outputs[1].toTensor().to(at::kCPU); -#else - auto probs = outputs[0].toTensor(); - auto r_probs = outputs[1].toTensor(); -#endif - CHECK_EQ(probs.size(0), num_hyps); - CHECK_EQ(probs.size(1), max_hyps_len); - - // Step 3: Compute rescoring score - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - float score = 0.0f; - // left-to-right decoder score - score = ComputeAttentionScore(probs[i], hyp, eos_); - // Optional: Used for right to left score - float r_score = 0.0f; - if (is_bidirectional_decoder_ && reverse_weight > 0) { - // right-to-left score - CHECK_EQ(r_probs.size(0), num_hyps); - CHECK_EQ(r_probs.size(1), max_hyps_len); - std::vector r_hyp(hyp.size()); - std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin()); - // right to left decoder score - r_score = ComputeAttentionScore(r_probs[i], r_hyp, eos_); - } - - // combined left-to-right and right-to-left score - (*rescoring_score)[i] = - score * (1 - reverse_weight) + r_score * reverse_weight; - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/torch_asr_model.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/torch_asr_model.h deleted file mode 100644 index a3cebe08798f1cad60ca4cd73c7b2488173b6114..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/decoder/torch_asr_model.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
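// AttentionRescoring() above scores a hypothesis by summing the decoder's
// per-step log-probabilities along its tokens plus the end-of-sentence step, then
// mixes the left-to-right and optional right-to-left passes. As a small made-up
// example: for hyp = {5, 9}, ComputeAttentionScore adds
// probs[0][5] + probs[1][9] + probs[2][eos_]. The mixing itself is the weighted
// sum below (illustrative helper, not from the diff).

// reverse_weight = 0 keeps only the left-to-right decoder score, which is the
// required setting for a conventional (non-bidirectional) transformer decoder.
inline float CombineRescoringScores(float l2r_score, float r2l_score,
                                    float reverse_weight) {
  return l2r_score * (1.0f - reverse_weight) + r2l_score * reverse_weight;
}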
- -#ifndef DECODER_TORCH_ASR_MODEL_H_ -#define DECODER_TORCH_ASR_MODEL_H_ - -#include -#include -#include - -#include "torch/script.h" -#ifndef IOS -#include "torch/torch.h" -#endif - -#include "decoder/asr_model.h" -#include "utils/utils.h" - -namespace wenet { - -class TorchAsrModel : public AsrModel { - public: -#ifndef IOS - static void InitEngineThreads(int num_threads = 1); -#endif - - public: - using TorchModule = torch::jit::script::Module; - TorchAsrModel() = default; - TorchAsrModel(const TorchAsrModel& other); - void Read(const std::string& model_path); - std::shared_ptr torch_model() const { return model_; } - void Reset() override; - void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) override; - std::shared_ptr Copy() const override; - - protected: - void ForwardEncoderFunc(const std::vector>& chunk_feats, - std::vector>* ctc_prob) override; - - float ComputeAttentionScore(const torch::Tensor& prob, - const std::vector& hyp, int eos); - - private: - std::shared_ptr model_ = nullptr; - std::vector encoder_outs_; - // transformer/conformer attention cache - torch::Tensor att_cache_ = torch::zeros({0, 0, 0, 0}); - // conformer-only conv_module cache - torch::Tensor cnn_cache_ = torch::zeros({0, 0, 0, 0}); -}; - -} // namespace wenet - -#endif // DECODER_TORCH_ASR_MODEL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/docker/Dockerfile b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/docker/Dockerfile deleted file mode 100644 index 5c6865faa1a07719d913e9b36abe41a9ba1041d1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/docker/Dockerfile +++ /dev/null @@ -1,11 +0,0 @@ -FROM ubuntu:latest -MAINTAINER -ENV DEBIAN_FRONTEND=noninteractive -RUN sed -i s@/archive.ubuntu.com/@/mirrors.tuna.tsinghua.edu.cn/@g /etc/apt/sources.list -RUN apt-get update && apt-get install -y git cmake wget build-essential -RUN git clone https://github.com/wenet-e2e/wenet.git /home/wenet -ARG model=20210618_u2pp_conformer_libtorch.tar.gz -RUN wget -P /home https://wenet-1256283475.cos.ap-shanghai.myqcloud.com/models/aishell2/$model -RUN tar -xzf /home/$model -C /home -ARG build=/home/wenet/runtime/libtorch/build -RUN cmake -B $build -DCMAKE_BUILD_TYPE=Release -DGRAPH_TOOLS=ON && cmake --build $build diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/frontend/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/frontend/CMakeLists.txt deleted file mode 100644 index 78872257e43bb9a6ffcedaae977bf0173817ae50..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/frontend/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -add_library(frontend STATIC - feature_pipeline.cc - fft.cc -) -target_link_libraries(frontend PUBLIC utils) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/frontend/fbank.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/frontend/fbank.h deleted file mode 100644 index 5a650dc035b8e244388cc1f2e0b9512654de7fda..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/frontend/fbank.h +++ /dev/null @@ -1,218 +0,0 @@ -// Copyright (c) 2017 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use 
this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef FRONTEND_FBANK_H_ -#define FRONTEND_FBANK_H_ - -#include -#include -#include -#include -#include - -#include "frontend/fft.h" -#include "utils/log.h" - -namespace wenet { - -// This code is based on kaldi Fbank implementation, please see -// https://github.com/kaldi-asr/kaldi/blob/master/src/feat/feature-fbank.cc -class Fbank { - public: - Fbank(int num_bins, int sample_rate, int frame_length, int frame_shift) - : num_bins_(num_bins), - sample_rate_(sample_rate), - frame_length_(frame_length), - frame_shift_(frame_shift), - use_log_(true), - remove_dc_offset_(true), - generator_(0), - distribution_(0, 1.0), - dither_(0.0) { - fft_points_ = UpperPowerOfTwo(frame_length_); - // generate bit reversal table and trigonometric function table - const int fft_points_4 = fft_points_ / 4; - bitrev_.resize(fft_points_); - sintbl_.resize(fft_points_ + fft_points_4); - make_sintbl(fft_points_, sintbl_.data()); - make_bitrev(fft_points_, bitrev_.data()); - - int num_fft_bins = fft_points_ / 2; - float fft_bin_width = static_cast(sample_rate_) / fft_points_; - int low_freq = 20, high_freq = sample_rate_ / 2; - float mel_low_freq = MelScale(low_freq); - float mel_high_freq = MelScale(high_freq); - float mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1); - bins_.resize(num_bins_); - center_freqs_.resize(num_bins_); - for (int bin = 0; bin < num_bins; ++bin) { - float left_mel = mel_low_freq + bin * mel_freq_delta, - center_mel = mel_low_freq + (bin + 1) * mel_freq_delta, - right_mel = mel_low_freq + (bin + 2) * mel_freq_delta; - center_freqs_[bin] = InverseMelScale(center_mel); - std::vector this_bin(num_fft_bins); - int first_index = -1, last_index = -1; - for (int i = 0; i < num_fft_bins; ++i) { - float freq = (fft_bin_width * i); // Center frequency of this fft - // bin. 
- float mel = MelScale(freq); - if (mel > left_mel && mel < right_mel) { - float weight; - if (mel <= center_mel) - weight = (mel - left_mel) / (center_mel - left_mel); - else - weight = (right_mel - mel) / (right_mel - center_mel); - this_bin[i] = weight; - if (first_index == -1) first_index = i; - last_index = i; - } - } - CHECK(first_index != -1 && last_index >= first_index); - bins_[bin].first = first_index; - int size = last_index + 1 - first_index; - bins_[bin].second.resize(size); - for (int i = 0; i < size; ++i) { - bins_[bin].second[i] = this_bin[first_index + i]; - } - } - - // povey window - povey_window_.resize(frame_length_); - double a = M_2PI / (frame_length - 1); - for (int i = 0; i < frame_length; ++i) { - povey_window_[i] = pow(0.5 - 0.5 * cos(a * i), 0.85); - } - } - - void set_use_log(bool use_log) { use_log_ = use_log; } - - void set_remove_dc_offset(bool remove_dc_offset) { - remove_dc_offset_ = remove_dc_offset; - } - - void set_dither(float dither) { dither_ = dither; } - - int num_bins() const { return num_bins_; } - - static inline float InverseMelScale(float mel_freq) { - return 700.0f * (expf(mel_freq / 1127.0f) - 1.0f); - } - - static inline float MelScale(float freq) { - return 1127.0f * logf(1.0f + freq / 700.0f); - } - - static int UpperPowerOfTwo(int n) { - return static_cast(pow(2, ceil(log(n) / log(2)))); - } - - // pre emphasis - void PreEmphasis(float coeff, std::vector* data) const { - if (coeff == 0.0) return; - for (int i = data->size() - 1; i > 0; i--) - (*data)[i] -= coeff * (*data)[i - 1]; - (*data)[0] -= coeff * (*data)[0]; - } - - // Apply povey window on data in place - void Povey(std::vector* data) const { - CHECK_GE(data->size(), povey_window_.size()); - for (size_t i = 0; i < povey_window_.size(); ++i) { - (*data)[i] *= povey_window_[i]; - } - } - - // Compute fbank feat, return num frames - int Compute(const std::vector& wave, - std::vector>* feat) { - int num_samples = wave.size(); - if (num_samples < frame_length_) return 0; - int num_frames = 1 + ((num_samples - frame_length_) / frame_shift_); - feat->resize(num_frames); - std::vector fft_real(fft_points_, 0), fft_img(fft_points_, 0); - std::vector power(fft_points_ / 2); - for (int i = 0; i < num_frames; ++i) { - std::vector data(wave.data() + i * frame_shift_, - wave.data() + i * frame_shift_ + frame_length_); - // optional add noise - if (dither_ != 0.0) { - for (size_t j = 0; j < data.size(); ++j) - data[j] += dither_ * distribution_(generator_); - } - // optinal remove dc offset - if (remove_dc_offset_) { - float mean = 0.0; - for (size_t j = 0; j < data.size(); ++j) mean += data[j]; - mean /= data.size(); - for (size_t j = 0; j < data.size(); ++j) data[j] -= mean; - } - - PreEmphasis(0.97, &data); - Povey(&data); - // copy data to fft_real - memset(fft_img.data(), 0, sizeof(float) * fft_points_); - memset(fft_real.data() + frame_length_, 0, - sizeof(float) * (fft_points_ - frame_length_)); - memcpy(fft_real.data(), data.data(), sizeof(float) * frame_length_); - fft(bitrev_.data(), sintbl_.data(), fft_real.data(), fft_img.data(), - fft_points_); - // power - for (int j = 0; j < fft_points_ / 2; ++j) { - power[j] = fft_real[j] * fft_real[j] + fft_img[j] * fft_img[j]; - } - - (*feat)[i].resize(num_bins_); - // cepstral coefficients, triangle filter array - for (int j = 0; j < num_bins_; ++j) { - float mel_energy = 0.0; - int s = bins_[j].first; - for (size_t k = 0; k < bins_[j].second.size(); ++k) { - mel_energy += bins_[j].second[k] * power[s + k]; - } - // optional use log - if 
(use_log_) { - if (mel_energy < std::numeric_limits::epsilon()) - mel_energy = std::numeric_limits::epsilon(); - mel_energy = logf(mel_energy); - } - - (*feat)[i][j] = mel_energy; - } - } - return num_frames; - } - - private: - int num_bins_; - int sample_rate_; - int frame_length_, frame_shift_; - int fft_points_; - bool use_log_; - bool remove_dc_offset_; - std::vector center_freqs_; - std::vector>> bins_; - std::vector povey_window_; - std::default_random_engine generator_; - std::normal_distribution distribution_; - float dither_; - - // bit reversal table - std::vector bitrev_; - // trigonometric function table - std::vector sintbl_; -}; - -} // namespace wenet - -#endif // FRONTEND_FBANK_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/frontend/feature_pipeline.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/frontend/feature_pipeline.cc deleted file mode 100644 index ab450b15cd35ebd8101a3bcdec4f963a73bed10c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/frontend/feature_pipeline.cc +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2017 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
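
The `Fbank` constructor deleted above precomputes, for every mel bin, a triangular weight vector over the FFT magnitude bins: each triangle spans two mel-spaced intervals and peaks at the bin's center frequency. A minimal standalone sketch of that construction follows; it is not taken from these sources, and the 80-bin / 16 kHz / 512-point values simply follow the defaults noted in `feature_pipeline.h` (25 ms frames rounded up to the next power of two).

```cpp
// Standalone sketch of the triangular mel filter-bank construction used by
// the deleted Fbank class. Values are illustrative defaults, not repo code.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static float MelScale(float freq) { return 1127.0f * std::log(1.0f + freq / 700.0f); }

int main() {
  const int sample_rate = 16000, fft_points = 512, num_bins = 80;
  const int num_fft_bins = fft_points / 2;
  const float fft_bin_width = static_cast<float>(sample_rate) / fft_points;
  const float mel_low = MelScale(20.0f);                // low cutoff at 20 Hz
  const float mel_high = MelScale(sample_rate / 2.0f);  // Nyquist frequency
  const float mel_delta = (mel_high - mel_low) / (num_bins + 1);

  std::vector<std::vector<float>> bins(num_bins, std::vector<float>(num_fft_bins, 0.0f));
  for (int bin = 0; bin < num_bins; ++bin) {
    const float left = mel_low + bin * mel_delta;
    const float center = left + mel_delta;
    const float right = center + mel_delta;
    for (int i = 0; i < num_fft_bins; ++i) {
      const float mel = MelScale(fft_bin_width * i);
      if (mel > left && mel < right) {
        // Rise linearly up to the center frequency, then fall linearly.
        bins[bin][i] = mel <= center ? (mel - left) / (center - left)
                                     : (right - mel) / (right - center);
      }
    }
  }
  std::printf("peak weight of bin 0: %.3f\n",
              *std::max_element(bins[0].begin(), bins[0].end()));
  return 0;
}
```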
- -#include "frontend/feature_pipeline.h" - -#include -#include - -namespace wenet { - -FeaturePipeline::FeaturePipeline(const FeaturePipelineConfig& config) - : config_(config), - feature_dim_(config.num_bins), - fbank_(config.num_bins, config.sample_rate, config.frame_length, - config.frame_shift), - num_frames_(0), - input_finished_(false) {} - -void FeaturePipeline::AcceptWaveform(const float* pcm, const int size) { - std::vector> feats; - std::vector waves; - waves.insert(waves.end(), remained_wav_.begin(), remained_wav_.end()); - waves.insert(waves.end(), pcm, pcm + size); - int num_frames = fbank_.Compute(waves, &feats); - feature_queue_.Push(std::move(feats)); - num_frames_ += num_frames; - - int left_samples = waves.size() - config_.frame_shift * num_frames; - remained_wav_.resize(left_samples); - std::copy(waves.begin() + config_.frame_shift * num_frames, waves.end(), - remained_wav_.begin()); - // We are still adding wave, notify input is not finished - finish_condition_.notify_one(); -} - -void FeaturePipeline::AcceptWaveform(const int16_t* pcm, const int size) { - auto* float_pcm = new float[size]; - for (size_t i = 0; i < size; i++) { - float_pcm[i] = static_cast(pcm[i]); - } - this->AcceptWaveform(float_pcm, size); - delete[] float_pcm; -} - -void FeaturePipeline::set_input_finished() { - CHECK(!input_finished_); - { - std::lock_guard lock(mutex_); - input_finished_ = true; - } - finish_condition_.notify_one(); -} - -bool FeaturePipeline::ReadOne(std::vector* feat) { - if (!feature_queue_.Empty()) { - *feat = std::move(feature_queue_.Pop()); - return true; - } else { - std::unique_lock lock(mutex_); - while (!input_finished_) { - // This will release the lock and wait for notify_one() - // from AcceptWaveform() or set_input_finished() - finish_condition_.wait(lock); - if (!feature_queue_.Empty()) { - *feat = std::move(feature_queue_.Pop()); - return true; - } - } - CHECK(input_finished_); - // Double check queue.empty, see issue#893 for detailed discussions. - if (!feature_queue_.Empty()) { - *feat = std::move(feature_queue_.Pop()); - return true; - } else { - return false; - } - } -} - -bool FeaturePipeline::Read(int num_frames, - std::vector>* feats) { - feats->clear(); - if (feature_queue_.Size() >= num_frames) { - *feats = std::move(feature_queue_.Pop(num_frames)); - return true; - } else { - std::unique_lock lock(mutex_); - while (!input_finished_) { - // This will release the lock and wait for notify_one() - // from AcceptWaveform() or set_input_finished() - finish_condition_.wait(lock); - if (feature_queue_.Size() >= num_frames) { - *feats = std::move(feature_queue_.Pop(num_frames)); - return true; - } - } - CHECK(input_finished_); - // Double check queue.empty, see issue#893 for detailed discussions. 
- if (feature_queue_.Size() >= num_frames) { - *feats = std::move(feature_queue_.Pop(num_frames)); - return true; - } else { - *feats = std::move(feature_queue_.Pop(feature_queue_.Size())); - return false; - } - } -} - -void FeaturePipeline::Reset() { - input_finished_ = false; - num_frames_ = 0; - remained_wav_.clear(); - feature_queue_.Clear(); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/frontend/feature_pipeline.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/frontend/feature_pipeline.h deleted file mode 100644 index 9918d6b573255795e0e665f0a9598c44be625c19..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/frontend/feature_pipeline.h +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) 2017 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef FRONTEND_FEATURE_PIPELINE_H_ -#define FRONTEND_FEATURE_PIPELINE_H_ - -#include -#include -#include -#include - -#include "frontend/fbank.h" -#include "utils/blocking_queue.h" -#include "utils/log.h" - -namespace wenet { - -struct FeaturePipelineConfig { - int num_bins; - int sample_rate; - int frame_length; - int frame_shift; - FeaturePipelineConfig(int num_bins, int sample_rate) - : num_bins(num_bins), // 80 dim fbank - sample_rate(sample_rate) { // 16k sample rate - frame_length = sample_rate / 1000 * 25; // frame length 25ms - frame_shift = sample_rate / 1000 * 10; // frame shift 10ms - } - - void Info() const { - LOG(INFO) << "feature pipeline config" - << " num_bins " << num_bins << " frame_length " << frame_length - << " frame_shift " << frame_shift; - } -}; - -// Typically, FeaturePipeline is used in two threads: one thread A calls -// AcceptWaveform() to add raw wav data and set_input_finished() to notice -// the end of input wav, another thread B (decoder thread) calls Read() to -// consume features.So a BlockingQueue is used to make this class thread safe. - -// The Read() is designed as a blocking method when there is no feature -// in feature_queue_ and the input is not finished. - -// See bin/decoder_main.cc, websocket/websocket_server.cc and -// decoder/torch_asr_decoder.cc for usage - -class FeaturePipeline { - public: - explicit FeaturePipeline(const FeaturePipelineConfig& config); - - // The feature extraction is done in AcceptWaveform(). - void AcceptWaveform(const float* pcm, const int size); - void AcceptWaveform(const int16_t* pcm, const int size); - - // Current extracted frames number. - int num_frames() const { return num_frames_; } - int feature_dim() const { return feature_dim_; } - const FeaturePipelineConfig& config() const { return config_; } - - // The caller should call this method when speech input is end. - // Never call AcceptWaveform() after calling set_input_finished() ! 
- void set_input_finished(); - bool input_finished() const { return input_finished_; } - - // Return False if input is finished and no feature could be read. - // Return True if a feature is read. - // This function is a blocking method. It will block the thread when - // there is no feature in feature_queue_ and the input is not finished. - bool ReadOne(std::vector* feat); - - // Read #num_frames frame features. - // Return False if less than #num_frames features are read and the - // input is finished. - // Return True if #num_frames features are read. - // This function is a blocking method when there is no feature - // in feature_queue_ and the input is not finished. - bool Read(int num_frames, std::vector>* feats); - - void Reset(); - bool IsLastFrame(int frame) const { - return input_finished_ && (frame == num_frames_ - 1); - } - - int NumQueuedFrames() const { return feature_queue_.Size(); } - - private: - const FeaturePipelineConfig& config_; - int feature_dim_; - Fbank fbank_; - - BlockingQueue> feature_queue_; - int num_frames_; - bool input_finished_; - - // The feature extraction is done in AcceptWaveform(). - // This waveform sample points are consumed by frame size. - // The residual waveform sample points after framing are - // kept to be used in next AcceptWaveform() calling. - std::vector remained_wav_; - - // Used to block the Read when there is no feature in feature_queue_ - // and the input is not finished. - mutable std::mutex mutex_; - std::condition_variable finish_condition_; -}; - -} // namespace wenet - -#endif // FRONTEND_FEATURE_PIPELINE_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/frontend/fft.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/frontend/fft.cc deleted file mode 100644 index 9e05f854e79ea733d0411045385e924c2670b7f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/frontend/fft.cc +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright (c) 2016 Network -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
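
The usage comment in the deleted `feature_pipeline.h` describes a producer/consumer split: one thread feeds waveform chunks and finally calls `set_input_finished()`, while the decoder thread blocks in `Read()`/`ReadOne()` until features arrive or the input is declared finished. The sketch below reproduces just that pattern with a small blocking queue; it is illustrative only and does not use the repository's classes.

```cpp
// Standalone sketch of the two-thread usage pattern documented in the
// deleted feature_pipeline.h: thread A feeds data and signals end-of-input,
// thread B blocks on reads until data arrives or the input is finished.
#include <condition_variable>
#include <cstdio>
#include <deque>
#include <mutex>
#include <thread>
#include <vector>

class BlockingFeed {
 public:
  void Push(std::vector<float> frame) {
    { std::lock_guard<std::mutex> lk(mu_); q_.push_back(std::move(frame)); }
    cv_.notify_one();
  }
  void SetInputFinished() {
    { std::lock_guard<std::mutex> lk(mu_); finished_ = true; }
    cv_.notify_one();
  }
  // Returns false only when input is finished and the queue is drained.
  bool ReadOne(std::vector<float>* frame) {
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [this] { return !q_.empty() || finished_; });
    if (q_.empty()) return false;
    *frame = std::move(q_.front());
    q_.pop_front();
    return true;
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::deque<std::vector<float>> q_;
  bool finished_ = false;
};

int main() {
  BlockingFeed feed;
  std::thread producer([&] {
    for (int i = 0; i < 5; ++i) feed.Push(std::vector<float>(80, 0.0f));
    feed.SetInputFinished();  // never push after this, as the header warns
  });
  std::vector<float> frame;
  int n = 0;
  while (feed.ReadOne(&frame)) ++n;  // decoder-thread side
  producer.join();
  std::printf("consumed %d frames\n", n);
  return 0;
}
```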
- - -#include -#include -#include - -#include "frontend/fft.h" - -namespace wenet { - -void make_sintbl(int n, float* sintbl) { - int i, n2, n4, n8; - float c, s, dc, ds, t; - - n2 = n / 2; - n4 = n / 4; - n8 = n / 8; - t = sin(M_PI / n); - dc = 2 * t * t; - ds = sqrt(dc * (2 - dc)); - t = 2 * dc; - c = sintbl[n4] = 1; - s = sintbl[0] = 0; - for (i = 1; i < n8; ++i) { - c -= dc; - dc += t * c; - s += ds; - ds -= t * s; - sintbl[i] = s; - sintbl[n4 - i] = c; - } - if (n8 != 0) sintbl[n8] = sqrt(0.5); - for (i = 0; i < n4; ++i) sintbl[n2 - i] = sintbl[i]; - for (i = 0; i < n2 + n4; ++i) sintbl[i + n2] = -sintbl[i]; -} - -void make_bitrev(int n, int* bitrev) { - int i, j, k, n2; - - n2 = n / 2; - i = j = 0; - for (;;) { - bitrev[i] = j; - if (++i >= n) break; - k = n2; - while (k <= j) { - j -= k; - k /= 2; - } - j += k; - } -} - -// bitrev: bit reversal table -// sintbl: trigonometric function table -// x:real part -// y:image part -// n: fft length -int fft(const int* bitrev, const float* sintbl, float* x, float* y, int n) { - int i, j, k, ik, h, d, k2, n4, inverse; - float t, s, c, dx, dy; - - /* preparation */ - if (n < 0) { - n = -n; - inverse = 1; /* inverse transform */ - } else { - inverse = 0; - } - n4 = n / 4; - if (n == 0) { - return 0; - } - - /* bit reversal */ - for (i = 0; i < n; ++i) { - j = bitrev[i]; - if (i < j) { - t = x[i]; - x[i] = x[j]; - x[j] = t; - t = y[i]; - y[i] = y[j]; - y[j] = t; - } - } - - /* transformation */ - for (k = 1; k < n; k = k2) { - h = 0; - k2 = k + k; - d = n / k2; - for (j = 0; j < k; ++j) { - c = sintbl[h + n4]; - if (inverse) - s = -sintbl[h]; - else - s = sintbl[h]; - for (i = j; i < n; i += k2) { - ik = i + k; - dx = s * y[ik] + c * x[ik]; - dy = c * y[ik] - s * x[ik]; - x[ik] = x[i] - dx; - x[i] += dx; - y[ik] = y[i] - dy; - y[i] += dy; - } - h += d; - } - } - if (inverse) { - /* divide by n in case of the inverse transformation */ - for (i = 0; i < n; ++i) { - x[i] /= n; - y[i] /= n; - } - } - return 0; /* finished successfully */ -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/frontend/fft.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/frontend/fft.h deleted file mode 100644 index 6b92e406c44b4768eaee6e734f55bb39cd9af28b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/frontend/fft.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2016 Network -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
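
The deleted `fft.cc` implements an in-place radix-2 FFT: `make_bitrev()` builds the index permutation applied before the butterfly passes, and `make_sintbl()` caches the sine/cosine values those passes reuse. A small standalone sketch of the bit-reversal table (same algorithm at a trivial length; not repository code):

```cpp
// Sketch of the bit-reversal permutation built by the deleted make_bitrev():
// the in-place radix-2 FFT first reorders the input by this table, then runs
// log2(n) butterfly passes over it.
#include <cstdio>

int main() {
  const int n = 8;  // FFT length (must be a power of two)
  int bitrev[n];
  int j = 0;
  for (int i = 0;;) {
    bitrev[i] = j;
    if (++i >= n) break;
    int k = n / 2;
    while (k <= j) { j -= k; k /= 2; }
    j += k;
  }
  for (int i = 0; i < n; ++i) std::printf("bitrev[%d] = %d\n", i, bitrev[i]);
  // Expected for n = 8: 0 4 2 6 1 5 3 7 (each index with its bits reversed).
  return 0;
}
```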
- - -#ifndef FRONTEND_FFT_H_ -#define FRONTEND_FFT_H_ - -#ifndef M_PI -#define M_PI 3.1415926535897932384626433832795 -#endif -#ifndef M_2PI -#define M_2PI 6.283185307179586476925286766559005 -#endif - -namespace wenet { - -// Fast Fourier Transform - -void make_sintbl(int n, float* sintbl); - -void make_bitrev(int n, int* bitrev); - -int fft(const int* bitrev, const float* sintbl, float* x, float* y, int n); - -} // namespace wenet - -#endif // FRONTEND_FFT_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/frontend/wav.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/frontend/wav.h deleted file mode 100644 index 688a049a940ebbdc83f24e59134fff22b7b09bfd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/frontend/wav.h +++ /dev/null @@ -1,241 +0,0 @@ -// Copyright (c) 2016 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef FRONTEND_WAV_H_ -#define FRONTEND_WAV_H_ - -#include -#include -#include -#include -#include - -#include - -#include "utils/log.h" - -namespace wenet { - -struct WavHeader { - char riff[4] = {'R', 'I', 'F', 'F'}; - unsigned int size = 0; - char wav[4] = {'W', 'A', 'V', 'E'}; - char fmt[4] = {'f', 'm', 't', ' '}; - unsigned int fmt_size = 16; - uint16_t format = 1; - uint16_t channels = 0; - unsigned int sample_rate = 0; - unsigned int bytes_per_second = 0; - uint16_t block_size = 0; - uint16_t bit = 0; - char data[4] = {'d', 'a', 't', 'a'}; - unsigned int data_size = 0; - - WavHeader() {} - - WavHeader(int num_samples, int num_channel, int sample_rate, - int bits_per_sample) { - data_size = num_samples * num_channel * (bits_per_sample / 8); - size = sizeof(WavHeader) - 8 + data_size; - channels = num_channel; - this->sample_rate = sample_rate; - bytes_per_second = sample_rate * num_channel * (bits_per_sample / 8); - block_size = num_channel * (bits_per_sample / 8); - bit = bits_per_sample; - } -}; - -class WavReader { - public: - WavReader() : data_(nullptr) {} - explicit WavReader(const std::string& filename) { Open(filename); } - - bool Open(const std::string& filename) { - FILE* fp = fopen(filename.c_str(), "rb"); - if (NULL == fp) { - LOG(WARNING) << "Error in read " << filename; - return false; - } - - WavHeader header; - fread(&header, 1, sizeof(header), fp); - if (header.fmt_size < 16) { - fprintf(stderr, - "WaveData: expect PCM format data " - "to have fmt chunk of at least size 16.\n"); - return false; - } else if (header.fmt_size > 16) { - int offset = 44 - 8 + header.fmt_size - 16; - fseek(fp, offset, SEEK_SET); - fread(header.data, 8, sizeof(char), fp); - } - // check "RIFF" "WAVE" "fmt " "data" - - // Skip any sub-chunks between "fmt" and "data". Usually there will - // be a single "fact" sub chunk, but on Windows there can also be a - // "list" sub chunk. - while (0 != strncmp(header.data, "data", 4)) { - // We will just ignore the data in these chunks. 
- fseek(fp, header.data_size, SEEK_CUR); - // read next sub chunk - fread(header.data, 8, sizeof(char), fp); - } - - num_channel_ = header.channels; - sample_rate_ = header.sample_rate; - bits_per_sample_ = header.bit; - int num_data = header.data_size / (bits_per_sample_ / 8); - data_ = new float[num_data]; - num_samples_ = num_data / num_channel_; - - for (int i = 0; i < num_data; ++i) { - switch (bits_per_sample_) { - case 8: { - char sample; - fread(&sample, 1, sizeof(char), fp); - data_[i] = static_cast(sample); - break; - } - case 16: { - int16_t sample; - fread(&sample, 1, sizeof(int16_t), fp); - data_[i] = static_cast(sample); - break; - } - case 32: { - int sample; - fread(&sample, 1, sizeof(int), fp); - data_[i] = static_cast(sample); - break; - } - default: - fprintf(stderr, "unsupported quantization bits"); - exit(1); - } - } - fclose(fp); - return true; - } - - int num_channel() const { return num_channel_; } - int sample_rate() const { return sample_rate_; } - int bits_per_sample() const { return bits_per_sample_; } - int num_samples() const { return num_samples_; } - - ~WavReader() { - delete[] data_; - } - - const float* data() const { return data_; } - - private: - int num_channel_; - int sample_rate_; - int bits_per_sample_; - int num_samples_; // sample points per channel - float* data_; -}; - -class WavWriter { - public: - WavWriter(const float* data, int num_samples, int num_channel, - int sample_rate, int bits_per_sample) - : data_(data), - num_samples_(num_samples), - num_channel_(num_channel), - sample_rate_(sample_rate), - bits_per_sample_(bits_per_sample) {} - - void Write(const std::string& filename) { - FILE* fp = fopen(filename.c_str(), "wb"); - WavHeader header(num_samples_, num_channel_, sample_rate_, - bits_per_sample_); - fwrite(&header, 1, sizeof(header), fp); - - for (int i = 0; i < num_samples_; ++i) { - for (int j = 0; j < num_channel_; ++j) { - switch (bits_per_sample_) { - case 8: { - char sample = static_cast(data_[i * num_channel_ + j]); - fwrite(&sample, 1, sizeof(sample), fp); - break; - } - case 16: { - int16_t sample = static_cast(data_[i * num_channel_ + j]); - fwrite(&sample, 1, sizeof(sample), fp); - break; - } - case 32: { - int sample = static_cast(data_[i * num_channel_ + j]); - fwrite(&sample, 1, sizeof(sample), fp); - break; - } - } - } - } - fclose(fp); - } - - private: - const float* data_; - int num_samples_; // total float points in data_ - int num_channel_; - int sample_rate_; - int bits_per_sample_; -}; - -class StreamWavWriter { - public: - StreamWavWriter(int num_channel, int sample_rate, int bits_per_sample) - : num_channel_(num_channel), - sample_rate_(sample_rate), - bits_per_sample_(bits_per_sample), - total_num_samples_(0) {} - - StreamWavWriter(const std::string& filename, int num_channel, - int sample_rate, int bits_per_sample) - : StreamWavWriter(num_channel, sample_rate, bits_per_sample) { - Open(filename); - } - - void Open(const std::string& filename) { - fp_ = fopen(filename.c_str(), "wb"); - fseek(fp_, sizeof(WavHeader), SEEK_SET); - } - - void Write(const int16_t* sample_data, size_t num_samples) { - fwrite(sample_data, sizeof(int16_t), num_samples, fp_); - total_num_samples_ += num_samples; - } - - void Close() { - WavHeader header(total_num_samples_, num_channel_, sample_rate_, - bits_per_sample_); - fseek(fp_, 0L, SEEK_SET); - fwrite(&header, 1, sizeof(header), fp_); - fclose(fp_); - } - - private: - FILE* fp_; - int num_channel_; - int sample_rate_; - int bits_per_sample_; - size_t total_num_samples_; -}; - -} 
// namespace wenet - -#endif // FRONTEND_WAV_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/grpc/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/grpc/CMakeLists.txt deleted file mode 100644 index 2a152dd0d38cdc17d2758d7dbd542cd974d5f0c6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/grpc/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -# compile wenet.proto -set(PROTO_DIR "${CMAKE_CURRENT_SOURCE_DIR}") -add_custom_command( - OUTPUT ${PROTO_DIR}/wenet.pb.cc - ${PROTO_DIR}/wenet.pb.h - ${PROTO_DIR}/wenet.grpc.pb.cc - ${PROTO_DIR}/wenet.grpc.pb.h - COMMAND ${protobuf_BINARY_DIR}/protoc - ARGS --grpc_out "${PROTO_DIR}" - --cpp_out "${PROTO_DIR}" - -I "${PROTO_DIR}" - --plugin=protoc-gen-grpc=${grpc_BINARY_DIR}/grpc_cpp_plugin - wenet.proto) - -# grpc_server/client -link_directories(${protobuf_BINARY_DIR}/lib) -add_library(wenet_grpc STATIC - grpc_client.cc - grpc_server.cc - wenet.pb.cc - wenet.grpc.pb.cc -) -target_link_libraries(wenet_grpc PUBLIC grpc++ grpc++_reflection decoder) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/grpc/grpc_client.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/grpc/grpc_client.cc deleted file mode 100644 index 7a2e3f6f384980b6566468213d3eead43a404070..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/grpc/grpc_client.cc +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
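
The deleted `wav.h` reads and writes canonical 44-byte RIFF/WAVE headers (`WavHeader`) around raw PCM samples; `WavWriter::Write()` simply emits the header followed by the sample data. Below is a self-contained sketch of that layout, writing one second of 16-bit mono silence at 16 kHz; the packed struct and output file name are assumptions for illustration, not repository code.

```cpp
// Sketch of the 44-byte RIFF header layout declared by the deleted wav.h,
// used here to write 1 s of 16-bit mono PCM silence at 16 kHz.
#include <cstdint>
#include <cstdio>
#include <vector>

#pragma pack(push, 1)
struct WavHeader {
  char riff[4] = {'R', 'I', 'F', 'F'};
  uint32_t size = 0;  // total file size minus 8 bytes
  char wave[4] = {'W', 'A', 'V', 'E'};
  char fmt[4] = {'f', 'm', 't', ' '};
  uint32_t fmt_size = 16;
  uint16_t format = 1;  // PCM
  uint16_t channels = 1;
  uint32_t sample_rate = 16000;
  uint32_t bytes_per_second = 16000 * 2;
  uint16_t block_size = 2;
  uint16_t bit = 16;
  char data[4] = {'d', 'a', 't', 'a'};
  uint32_t data_size = 0;
};
#pragma pack(pop)

int main() {
  std::vector<int16_t> samples(16000, 0);  // 1 second of silence
  WavHeader h;
  h.data_size = static_cast<uint32_t>(samples.size() * sizeof(int16_t));
  h.size = sizeof(WavHeader) - 8 + h.data_size;
  FILE* fp = std::fopen("silence.wav", "wb");
  if (!fp) return 1;
  std::fwrite(&h, 1, sizeof(h), fp);
  std::fwrite(samples.data(), sizeof(int16_t), samples.size(), fp);
  std::fclose(fp);
  return 0;
}
```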
- -#include "grpc/grpc_client.h" - -#include "utils/log.h" - -namespace wenet { -using grpc::Channel; -using grpc::ClientContext; -using grpc::ClientReaderWriter; -using grpc::Status; -using wenet::Request; -using wenet::Response; - -GrpcClient::GrpcClient(const std::string& host, int port, int nbest, - bool continuous_decoding) - : host_(host), - port_(port), - nbest_(nbest), - continuous_decoding_(continuous_decoding) { - Connect(); - t_.reset(new std::thread(&GrpcClient::ReadLoopFunc, this)); -} - -void GrpcClient::Connect() { - channel_ = grpc::CreateChannel(host_ + ":" + std::to_string(port_), - grpc::InsecureChannelCredentials()); - stub_ = ASR::NewStub(channel_); - context_ = std::make_shared(); - stream_ = stub_->Recognize(context_.get()); - request_ = std::make_shared(); - response_ = std::make_shared(); - request_->mutable_decode_config()->set_nbest_config(nbest_); - request_->mutable_decode_config()->set_continuous_decoding_config( - continuous_decoding_); - stream_->Write(*request_); -} - -void GrpcClient::SendBinaryData(const void* data, size_t size) { - const int16_t* pdata = reinterpret_cast(data); - request_->set_audio_data(pdata, size); - stream_->Write(*request_); -} - -void GrpcClient::ReadLoopFunc() { - try { - while (stream_->Read(response_.get())) { - for (int i = 0; i < response_->nbest_size(); i++) { - // you can also traverse wordpieces like demonstrated above - LOG(INFO) << i + 1 << "best " << response_->nbest(i).sentence(); - } - if (response_->status() != Response_Status_ok) { - break; - } - if (response_->type() == Response_Type_speech_end) { - done_ = true; - break; - } - } - } catch (std::exception const& e) { - LOG(ERROR) << e.what(); - } -} - -void GrpcClient::Join() { - stream_->WritesDone(); - t_->join(); - Status status = stream_->Finish(); - if (!status.ok()) { - LOG(INFO) << "Recognize rpc failed."; - } -} -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/grpc/grpc_client.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/grpc/grpc_client.h deleted file mode 100644 index 36e36a0f5f5ec5bbb818009fe931e863eaa7fd60..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/grpc/grpc_client.h +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef GRPC_GRPC_CLIENT_H_ -#define GRPC_GRPC_CLIENT_H_ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "grpc/wenet.grpc.pb.h" -#include "utils/utils.h" - -namespace wenet { - -using grpc::Channel; -using grpc::ClientContext; -using grpc::ClientReaderWriter; -using wenet::ASR; -using wenet::Request; -using wenet::Response; - -class GrpcClient { - public: - GrpcClient(const std::string& host, int port, int nbest, - bool continuous_decoding); - - void SendBinaryData(const void* data, size_t size); - void ReadLoopFunc(); - void Join(); - bool done() const { return done_; } - - private: - void Connect(); - std::string host_; - int port_; - std::shared_ptr channel_{nullptr}; - std::unique_ptr stub_{nullptr}; - std::shared_ptr context_{nullptr}; - std::unique_ptr> stream_{nullptr}; - std::shared_ptr request_{nullptr}; - std::shared_ptr response_{nullptr}; - int nbest_ = 1; - bool continuous_decoding_ = false; - bool done_ = false; - std::unique_ptr t_{nullptr}; - - WENET_DISALLOW_COPY_AND_ASSIGN(GrpcClient); -}; - -} // namespace wenet - -#endif // GRPC_GRPC_CLIENT_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/grpc/grpc_server.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/grpc/grpc_server.cc deleted file mode 100644 index 26268bc02a2f2ea56bb24a1eb379a565f693429a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/grpc/grpc_server.cc +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "grpc/grpc_server.h" - -namespace wenet { - -using grpc::ServerReaderWriter; -using wenet::Request; -using wenet::Response; - -GrpcConnectionHandler::GrpcConnectionHandler( - ServerReaderWriter* stream, - std::shared_ptr request, std::shared_ptr response, - std::shared_ptr feature_config, - std::shared_ptr decode_config, - std::shared_ptr decode_resource) - : stream_(std::move(stream)), - request_(std::move(request)), - response_(std::move(response)), - feature_config_(std::move(feature_config)), - decode_config_(std::move(decode_config)), - decode_resource_(std::move(decode_resource)) {} - -void GrpcConnectionHandler::OnSpeechStart() { - LOG(INFO) << "Received speech start signal, start reading speech"; - got_start_tag_ = true; - response_->set_status(Response::ok); - response_->set_type(Response::server_ready); - stream_->Write(*response_); - feature_pipeline_ = std::make_shared(*feature_config_); - decoder_ = std::make_shared(feature_pipeline_, decode_resource_, - *decode_config_); - // Start decoder thread - decode_thread_ = std::make_shared( - &GrpcConnectionHandler::DecodeThreadFunc, this); -} - -void GrpcConnectionHandler::OnSpeechEnd() { - LOG(INFO) << "Received speech end signal"; - CHECK(feature_pipeline_ != nullptr); - feature_pipeline_->set_input_finished(); - got_end_tag_ = true; -} - -void GrpcConnectionHandler::OnPartialResult() { - LOG(INFO) << "Partial result"; - response_->set_status(Response::ok); - response_->set_type(Response::partial_result); - stream_->Write(*response_); -} - -void GrpcConnectionHandler::OnFinalResult() { - LOG(INFO) << "Final result"; - response_->set_status(Response::ok); - response_->set_type(Response::final_result); - stream_->Write(*response_); -} - -void GrpcConnectionHandler::OnFinish() { - // Send finish tag - response_->set_status(Response::ok); - response_->set_type(Response::speech_end); - stream_->Write(*response_); -} - -void GrpcConnectionHandler::OnSpeechData() { - // Read binary PCM data - const int16_t* pcm_data = - reinterpret_cast(request_->audio_data().c_str()); - int num_samples = request_->audio_data().length() / sizeof(int16_t); - VLOG(2) << "Received " << num_samples << " samples"; - CHECK(feature_pipeline_ != nullptr); - CHECK(decoder_ != nullptr); - feature_pipeline_->AcceptWaveform(pcm_data, num_samples); -} - -void GrpcConnectionHandler::SerializeResult(bool finish) { - for (const DecodeResult& path : decoder_->result()) { - Response_OneBest* one_best_ = response_->add_nbest(); - one_best_->set_sentence(path.sentence); - if (finish) { - for (const WordPiece& word_piece : path.word_pieces) { - Response_OnePiece* one_piece_ = one_best_->add_wordpieces(); - one_piece_->set_word(word_piece.word); - one_piece_->set_start(word_piece.start); - one_piece_->set_end(word_piece.end); - } - } - if (response_->nbest_size() == nbest_) { - break; - } - } - return; -} - -void GrpcConnectionHandler::DecodeThreadFunc() { - while (true) { - DecodeState state = decoder_->Decode(); - response_->clear_status(); - response_->clear_type(); - response_->clear_nbest(); - if (state == DecodeState::kEndFeats) { - decoder_->Rescoring(); - SerializeResult(true); - OnFinalResult(); - OnFinish(); - stop_recognition_ = true; - break; - } else if (state == DecodeState::kEndpoint) { - decoder_->Rescoring(); - SerializeResult(true); - OnFinalResult(); - // If it's not continuous decoding, continue to do next recognition - // otherwise stop the recognition - if (continuous_decoding_) { - decoder_->ResetContinuousDecoding(); - } else { - 
OnFinish(); - stop_recognition_ = true; - break; - } - } else { - if (decoder_->DecodedSomething()) { - SerializeResult(false); - OnPartialResult(); - } - } - } -} - -void GrpcConnectionHandler::operator()() { - try { - while (stream_->Read(request_.get())) { - if (!got_start_tag_) { - nbest_ = request_->decode_config().nbest_config(); - continuous_decoding_ = - request_->decode_config().continuous_decoding_config(); - OnSpeechStart(); - } else { - OnSpeechData(); - } - } - OnSpeechEnd(); - LOG(INFO) << "Read all pcm data, wait for decoding thread"; - if (decode_thread_ != nullptr) { - decode_thread_->join(); - } - } catch (std::exception const& e) { - LOG(ERROR) << e.what(); - } -} - -Status GrpcServer::Recognize(ServerContext* context, - ServerReaderWriter* stream) { - LOG(INFO) << "Get Recognize request" << std::endl; - auto request = std::make_shared(); - auto response = std::make_shared(); - GrpcConnectionHandler handler(stream, request, response, feature_config_, - decode_config_, decode_resource_); - std::thread t(std::move(handler)); - t.join(); - return Status::OK; -} -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/grpc/grpc_server.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/grpc/grpc_server.h deleted file mode 100644 index 3ab47ce5b15897c2a596d8ef27f2e7c4f8d26a3f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/grpc/grpc_server.h +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef GRPC_GRPC_SERVER_H_ -#define GRPC_GRPC_SERVER_H_ - -#include -#include -#include -#include -#include -#include - -#include "decoder/asr_decoder.h" -#include "frontend/feature_pipeline.h" -#include "utils/log.h" - -#include "grpc/wenet.grpc.pb.h" - -namespace wenet { - -using grpc::ServerContext; -using grpc::ServerReaderWriter; -using grpc::Status; -using wenet::ASR; -using wenet::Request; -using wenet::Response; - -class GrpcConnectionHandler { - public: - GrpcConnectionHandler(ServerReaderWriter* stream, - std::shared_ptr request, - std::shared_ptr response, - std::shared_ptr feature_config, - std::shared_ptr decode_config, - std::shared_ptr decode_resource); - void operator()(); - - private: - void OnSpeechStart(); - void OnSpeechEnd(); - void OnFinish(); - void OnSpeechData(); - void OnPartialResult(); - void OnFinalResult(); - void DecodeThreadFunc(); - void SerializeResult(bool finish); - - bool continuous_decoding_ = false; - int nbest_ = 1; - ServerReaderWriter* stream_; - std::shared_ptr request_; - std::shared_ptr response_; - std::shared_ptr feature_config_; - std::shared_ptr decode_config_; - std::shared_ptr decode_resource_; - - bool got_start_tag_ = false; - bool got_end_tag_ = false; - // When endpoint is detected, stop recognition, and stop receiving data. 
- bool stop_recognition_ = false; - std::shared_ptr feature_pipeline_ = nullptr; - std::shared_ptr decoder_ = nullptr; - std::shared_ptr decode_thread_ = nullptr; -}; - -class GrpcServer final : public ASR::Service { - public: - GrpcServer(std::shared_ptr feature_config, - std::shared_ptr decode_config, - std::shared_ptr decode_resource) - : feature_config_(std::move(feature_config)), - decode_config_(std::move(decode_config)), - decode_resource_(std::move(decode_resource)) {} - Status Recognize(ServerContext* context, - ServerReaderWriter* reader) override; - - private: - std::shared_ptr feature_config_; - std::shared_ptr decode_config_; - std::shared_ptr decode_resource_; - DISALLOW_COPY_AND_ASSIGN(GrpcServer); -}; - -} // namespace wenet - -#endif // GRPC_GRPC_SERVER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/grpc/wenet.proto b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/grpc/wenet.proto deleted file mode 100644 index 4c3033c034c513611c9159ff9db42b225be2cc98..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/grpc/wenet.proto +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
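
`DecodeThreadFunc()` in the deleted `grpc_server.cc` is a small state machine: emit partial results while decoding, rescore and emit a final result on an endpoint or at end of features, and keep going after an endpoint only when continuous decoding is enabled. The sketch below mirrors that control flow against a mock decoder; `MockDecoder` and the `kEndBatch` state name are stand-ins for illustration, not the repository's `AsrDecoder` API.

```cpp
// Standalone sketch of the decode loop used by the deleted grpc_server.cc
// DecodeThreadFunc(), driven here by a mock decoder instead of AsrDecoder.
#include <cstdio>

enum class DecodeState { kEndBatch, kEndpoint, kEndFeats };

struct MockDecoder {
  int step = 0;
  DecodeState Decode() {
    ++step;
    if (step == 3) return DecodeState::kEndpoint;  // pretend silence detected
    if (step >= 6) return DecodeState::kEndFeats;  // pretend input exhausted
    return DecodeState::kEndBatch;
  }
  void Rescoring() { std::puts("rescoring n-best"); }
  void ResetContinuousDecoding() { std::puts("reset for next utterance"); }
  bool DecodedSomething() const { return true; }
};

int main() {
  const bool continuous_decoding = true;
  MockDecoder decoder;
  while (true) {
    DecodeState state = decoder.Decode();
    if (state == DecodeState::kEndFeats) {
      decoder.Rescoring();
      std::puts("final result, speech_end");
      break;
    } else if (state == DecodeState::kEndpoint) {
      decoder.Rescoring();
      std::puts("final result");
      if (continuous_decoding) {
        decoder.ResetContinuousDecoding();  // keep recognizing next utterance
      } else {
        std::puts("speech_end");
        break;
      }
    } else if (decoder.DecodedSomething()) {
      std::puts("partial result");
    }
  }
  return 0;
}
```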
-syntax = "proto3"; - -option java_package = "ex.grpc"; -option objc_class_prefix = "wenet"; - -package wenet; - -service ASR { - rpc Recognize (stream Request) returns (stream Response) {} -} - -message Request { - - message DecodeConfig { - int32 nbest_config = 1; - bool continuous_decoding_config = 2; - } - - oneof RequestPayload { - DecodeConfig decode_config = 1; - bytes audio_data = 2; - } -} - -message Response { - - message OneBest { - string sentence = 1; - repeated OnePiece wordpieces = 2; - } - - message OnePiece { - string word = 1; - int32 start = 2; - int32 end = 3; - } - - enum Status { - ok = 0; - failed = 1; - } - - enum Type { - server_ready = 0; - partial_result = 1; - final_result = 2; - speech_end = 3; - } - - Status status = 1; - Type type = 2; - repeated OneBest nbest = 3; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/http/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/http/CMakeLists.txt deleted file mode 100644 index 4ba414e25bd577575b1baec2eba4bf1c3062b211..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/http/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -add_library(http STATIC - http_client.cc - http_server.cc -) -target_link_libraries(http PUBLIC decoder) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/http/http_client.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/http/http_client.cc deleted file mode 100644 index 50bace0d4e40b073586c744cd85799f7414655e8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/http/http_client.cc +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2023 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "http/http_client.h" - -#include "boost/json/src.hpp" - -#include "utils/log.h" - -namespace wenet { - -namespace beast = boost::beast; // from -namespace http = beast::http; // from -namespace net = boost::asio; // from -using tcp = net::ip::tcp; // from -namespace json = boost::json; - -HttpClient::HttpClient(const std::string& hostname, int port) - : hostname_(hostname), port_(port) { - Connect(); -} - -void HttpClient::Connect() { - tcp::resolver resolver{ioc_}; - // Look up the domain name - auto const results = resolver.resolve(hostname_, std::to_string(port_)); - stream_.connect(results); -} - -void HttpClient::SendBinaryData(const void* data, size_t size) { - try { - json::value start_tag = {{"nbest", nbest_}, - {"continuous_decoding", continuous_decoding_}}; - std::string config = json::serialize(start_tag); - req_.set("config", config); - std::size_t encode_size = beast::detail::base64::encoded_size(size); - char encode_data[encode_size]; // NOLINT - beast::detail::base64::encode(encode_data, data, size); - req_.body() = encode_data; - req_.prepare_payload(); - http::write(stream_, req_, ec_); - - http::read(stream_, buffer_, res_); - std::string message = res_.body(); - json::object obj = json::parse(message).as_object(); - LOG(INFO) << message; - } catch (std::exception const& e) { - LOG(ERROR) << e.what(); - } - stream_.socket().shutdown(tcp::socket::shutdown_both, ec_); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/http/http_client.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/http/http_client.h deleted file mode 100644 index 803af26a4ef2b5a236570476fb89003014bc0280..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/http/http_client.h +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2023 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef HTTP_HTTP_CLIENT_H_ -#define HTTP_HTTP_CLIENT_H_ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "utils/utils.h" - -namespace wenet { - -namespace beast = boost::beast; // from -namespace http = beast::http; // from -namespace net = boost::asio; // from -using tcp = net::ip::tcp; // from - -class HttpClient { - public: - HttpClient(const std::string& host, int port); - - void SendBinaryData(const void* data, size_t size); - void set_nbest(int nbest) { nbest_ = nbest; } - - private: - void Connect(); - std::string hostname_; - int port_; - std::string target_ = "/"; - int version_ = 11; - int nbest_ = 1; - const bool continuous_decoding_ = false; - net::io_context ioc_; - beast::tcp_stream stream_{ioc_}; - beast::flat_buffer buffer_; - http::request req_{http::verb::get, target_, version_}; - http::response res_{http::status::ok, version_}; - beast::error_code ec_; - - WENET_DISALLOW_COPY_AND_ASSIGN(HttpClient); -}; - -} // namespace wenet - -#endif // HTTP_HTTP_CLIENT_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/http/http_server.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/http/http_server.cc deleted file mode 100644 index c839757647554235a9e70a3dfc886a02b4e2cd79..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/http/http_server.cc +++ /dev/null @@ -1,193 +0,0 @@ -// Copyright (c) 2023 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "http/http_server.h" - -#include -#include -#include - -#include "boost/json/src.hpp" -#include "utils/log.h" - -namespace wenet { - -namespace beast = boost::beast; // from -namespace http = beast::http; // from -namespace net = boost::asio; // from -using tcp = boost::asio::ip::tcp; // from -namespace json = boost::json; - -ConnectionHandler::ConnectionHandler( - tcp::socket&& socket, std::shared_ptr feature_config, - std::shared_ptr decode_config, - std::shared_ptr decode_resource) - : socket_(std::move(socket)), - feature_config_(std::move(feature_config)), - decode_config_(std::move(decode_config)), - decode_resource_(std::move(decode_resource)), - req_(std::make_shared>( - http::verb::post, target_, version_)), - res_(std::make_shared>(http::status::ok, - version_)) {} - -void ConnectionHandler::OnSpeechStart() { - feature_pipeline_ = std::make_shared(*feature_config_); - decoder_ = std::make_shared(feature_pipeline_, decode_resource_, - *decode_config_); - // Start decoder thread - decode_thread_ = - std::make_shared(&ConnectionHandler::DecodeThreadFunc, this); -} - -void ConnectionHandler::OnSpeechEnd() { - if (feature_pipeline_ != nullptr) { - feature_pipeline_->set_input_finished(); - } -} - -void ConnectionHandler::OnFinalResult(const std::string& result) { - LOG(INFO) << "Final result: " << result; - json::value rv = { - {"status", "ok"}, {"type", "final_result"}, {"nbest", result}}; - std::string message = json::serialize(rv); - res_.get()->body() = message; - http::write(socket_, *res_.get(), ec_); -} - -void ConnectionHandler::OnSpeechData(const std::string& message) { - std::size_t decode_size = - beast::detail::base64::decoded_size(message.length()); - int num_samples = decode_size / sizeof(int16_t); - int16_t decode_data[num_samples]; // NOLINT - beast::detail::base64::decode(decode_data, message.c_str(), message.length()); - - // Read binary PCM data - VLOG(2) << "Received " << num_samples << " samples"; - CHECK(feature_pipeline_ != nullptr); - CHECK(decoder_ != nullptr); - feature_pipeline_->AcceptWaveform(decode_data, num_samples); -} - -std::string ConnectionHandler::SerializeResult(bool finish) { - json::array nbest; - for (const DecodeResult& path : decoder_->result()) { - json::object jpath({{"sentence", path.sentence}}); - if (finish) { - json::array word_pieces; - for (const WordPiece& word_piece : path.word_pieces) { - json::object jword_piece({{"word", word_piece.word}, - {"start", word_piece.start}, - {"end", word_piece.end}}); - word_pieces.emplace_back(jword_piece); - } - jpath.emplace("word_pieces", word_pieces); - } - nbest.emplace_back(jpath); - - if (nbest.size() == nbest_) { - break; - } - } - return json::serialize(nbest); -} - -void ConnectionHandler::DecodeThreadFunc() { - try { - while (true) { - DecodeState state = decoder_->Decode(); - if (state == DecodeState::kEndFeats || state == DecodeState::kEndpoint) { - decoder_->Rescoring(); - std::string result = SerializeResult(true); - OnFinalResult(result); - break; - } - } - } catch (std::exception const& e) { - LOG(ERROR) << e.what(); - } -} - -void ConnectionHandler::OnError(const std::string& message) { - json::value rv = {{"status", "failed"}, {"message", message}}; - res_.get()->body() = json::serialize(rv); - http::write(socket_, *res_.get(), ec_); - // Send a TCP shutdown - socket_.shutdown(tcp::socket::shutdown_send, ec_); -} - -void ConnectionHandler::OnText(const std::string& message) { - LOG(INFO) << message; - json::value v = json::parse(message); - if (v.is_object()) { - 
json::object obj = v.get_object(); - if (obj.find("nbest") != obj.end()) { - if (obj["nbest"].is_int64()) { - nbest_ = obj["nbest"].as_int64(); - } else { - OnError("integer is expected for nbest option"); - } - } - } else { - OnError("Wrong protocol"); - } -} - -void ConnectionHandler::operator()() { - try { - http::read(socket_, buffer_, *req_.get(), ec_); - if (ec_) { - LOG(ERROR) << ec_; - } else { - OnText(req_.get()->base()["config"].to_string()); - OnSpeechStart(); - OnSpeechData(req_.get()->body()); - OnSpeechEnd(); - } - LOG(INFO) << "Read all pcm data, wait for decoding thread"; - if (decode_thread_ != nullptr) { - decode_thread_->join(); - } - } catch (beast::system_error const& se) { - LOG(INFO) << se.code().message(); - if (decode_thread_ != nullptr) { - decode_thread_->join(); - } - } catch (std::exception const& e) { - LOG(ERROR) << e.what(); - } - socket_.shutdown(tcp::socket::shutdown_send, ec_); -} - -void HttpServer::Start() { - try { - auto const address = net::ip::make_address("0.0.0.0"); - tcp::acceptor acceptor{ioc_, {address, static_cast(port_)}}; - for (;;) { - // This will receive the new connection - tcp::socket socket{ioc_}; - // Block until we get a connection - acceptor.accept(socket); - // Launch the session, transferring ownership of the socket - ConnectionHandler handler(std::move(socket), feature_config_, - decode_config_, decode_resource_); - std::thread t(std::move(handler)); - t.detach(); - } - } catch (const std::exception& e) { - LOG(FATAL) << e.what(); - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/http/http_server.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/http/http_server.h deleted file mode 100644 index f7304475e0c374dfb2b5308864b5e08dce71ae12..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/http/http_server.h +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2023 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef HTTP_HTTP_SERVER_H_ -#define HTTP_HTTP_SERVER_H_ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "decoder/asr_decoder.h" -#include "frontend/feature_pipeline.h" -#include "utils/log.h" - -namespace wenet { - -namespace beast = boost::beast; // from -namespace http = beast::http; // from -namespace net = boost::asio; // from -using tcp = boost::asio::ip::tcp; // from - -class ConnectionHandler { - public: - ConnectionHandler(tcp::socket&& socket, - std::shared_ptr feature_config, - std::shared_ptr decode_config, - std::shared_ptr decode_resource_); - void operator()(); - - private: - void OnSpeechStart(); - void OnSpeechEnd(); - void OnText(const std::string& message); - void OnSpeechData(const std::string& message); - void OnError(const std::string& message); - void OnFinalResult(const std::string& result); - void DecodeThreadFunc(); - std::string SerializeResult(bool finish); - - std::string target_ = "/"; - int version_ = 11; - const bool continuous_decoding_ = false; - int nbest_ = 1; - tcp::socket socket_; - beast::flat_buffer buffer_; - beast::error_code ec_; - std::shared_ptr> req_; - std::shared_ptr> res_; - std::shared_ptr feature_config_; - std::shared_ptr decode_config_; - std::shared_ptr decode_resource_; - - std::shared_ptr feature_pipeline_ = nullptr; - std::shared_ptr decoder_ = nullptr; - std::shared_ptr decode_thread_ = nullptr; -}; - -class HttpServer { - public: - HttpServer(int port, std::shared_ptr feature_config, - std::shared_ptr decode_config, - std::shared_ptr decode_resource) - : port_(port), - feature_config_(std::move(feature_config)), - decode_config_(std::move(decode_config)), - decode_resource_(std::move(decode_resource)) {} - - void Start(); - - private: - int port_; - // The io_context is required for all I/O - net::io_context ioc_{1}; - std::shared_ptr feature_config_; - std::shared_ptr decode_config_; - std::shared_ptr decode_resource_; - WENET_DISALLOW_COPY_AND_ASSIGN(HttpServer); -}; - -} // namespace wenet - -#endif // HTTP_HTTP_SERVER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/CMakeLists.txt deleted file mode 100644 index b072309e44b90dcee44ea31e9bcbc1741e73f151..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/CMakeLists.txt +++ /dev/null @@ -1,54 +0,0 @@ -cmake_minimum_required(VERSION 3.10 FATAL_ERROR) - -project(kaldi) - -# include_directories() is called in the root CMakeLists.txt - -add_library(kaldi-util - base/kaldi-error.cc - base/kaldi-math.cc - util/kaldi-io.cc - util/parse-options.cc - util/simple-io-funcs.cc - util/text-utils.cc -) -target_link_libraries(kaldi-util PUBLIC utils) - -add_library(kaldi-decoder - lat/determinize-lattice-pruned.cc - lat/lattice-functions.cc - decoder/lattice-faster-decoder.cc - decoder/lattice-faster-online-decoder.cc -) -target_link_libraries(kaldi-decoder PUBLIC kaldi-util) - -if(GRAPH_TOOLS) - # Arpa binary - add_executable(arpa2fst - lm/arpa-file-parser.cc - lm/arpa-lm-compiler.cc - lmbin/arpa2fst.cc - ) - target_link_libraries(arpa2fst PUBLIC kaldi-util) - - # FST tools binary - set(FST_BINS - fstaddselfloops - fstdeterminizestar - fstisstochastic - fstminimizeencoded - fsttablecompose - ) - - if(NOT MSVC) - # dl is for dynamic linking, otherwise there is a linking error on linux - link_libraries(dl) - 
endif() - foreach(name IN LISTS FST_BINS) - add_executable(${name} - fstbin/${name}.cc - fstext/kaldi-fst-io.cc - ) - target_link_libraries(${name} PUBLIC kaldi-util) - endforeach() -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/README.md b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/README.md deleted file mode 100644 index 4eb9c9173b747686f00b658afc5e1e0dfdc17e68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/README.md +++ /dev/null @@ -1,21 +0,0 @@ -We use Kaldi decoder to implement TLG based language model integration, -so we copied related files to this directory. -The main changes are: - -1. To minimize the change, we use the same directories tree as Kaldi. - -2. We replace Kaldi log system with glog in the following way. - -``` c++ -#define KALDI_WARN \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_WARNING).stream() -#define KALDI_ERR \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_ERROR).stream() -#define KALDI_INFO \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_INFO).stream() -#define KALDI_VLOG(v) VLOG(v) - -#define KALDI_ASSERT(condition) CHECK(condition) -``` - -3. We lint all the files to satisfy the lint in WeNet. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/io-funcs-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/io-funcs-inl.h deleted file mode 100644 index 9397400833676b323492321183c989cec2f41c3f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/io-funcs-inl.h +++ /dev/null @@ -1,329 +0,0 @@ -// base/io-funcs-inl.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University; -// Jan Silovsky; Yanmin Qian; -// Johns Hopkins University (Author: Daniel Povey) -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_IO_FUNCS_INL_H_ -#define KALDI_BASE_IO_FUNCS_INL_H_ 1 - -// Do not include this file directly. It is included by base/io-funcs.h - -#include -#include -#include - -namespace kaldi { - -// Template that covers integers. -template -void WriteBasicType(std::ostream &os, bool binary, T t) { - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - char len_c = (std::numeric_limits::is_signed ? 1 : -1) * - static_cast(sizeof(t)); - os.put(len_c); - os.write(reinterpret_cast(&t), sizeof(t)); - } else { - if (sizeof(t) == 1) - os << static_cast(t) << " "; - else - os << t << " "; - } - if (os.fail()) { - KALDI_ERR << "Write failure in WriteBasicType."; - } -} - -// Template that covers integers. 
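The integer writer above encodes each value in binary mode as a one-byte size marker (positive for signed types, negative for unsigned), followed by the raw bytes of the value in host byte order. A self-contained re-implementation of that encoding, included only to make the on-stream layout concrete (the `Demo` suffix marks editorial names, not Kaldi symbols):

```cpp
#include <cstdint>
#include <iostream>
#include <limits>
#include <sstream>

template <class T>
void WriteBasicTypeDemo(std::ostream &os, T t) {
  // One signed marker byte: +sizeof(T) for signed integer types,
  // -sizeof(T) for unsigned ones, then the raw bytes of the value.
  char len_c = (std::numeric_limits<T>::is_signed ? 1 : -1) *
               static_cast<char>(sizeof(t));
  os.put(len_c);
  os.write(reinterpret_cast<const char *>(&t), sizeof(t));
}

int main() {
  std::ostringstream bin;
  WriteBasicTypeDemo<int32_t>(bin, 42);
  // int32 => 1 marker byte (value 4) + 4 value bytes = 5 bytes in the stream.
  std::cout << bin.str().size() << " bytes, marker = "
            << static_cast<int>(bin.str()[0]) << "\n";  // prints: 5 bytes, marker = 4
  return 0;
}
```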
-template -inline void ReadBasicType(std::istream &is, bool binary, T *t) { - KALDI_PARANOID_ASSERT(t != NULL); - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - int len_c_in = is.get(); - if (len_c_in == -1) - KALDI_ERR << "ReadBasicType: encountered end of stream."; - char len_c = static_cast(len_c_in), - len_c_expected = (std::numeric_limits::is_signed ? 1 : -1) * - static_cast(sizeof(*t)); - if (len_c != len_c_expected) { - KALDI_ERR << "ReadBasicType: did not get expected integer type, " - << static_cast(len_c) << " vs. " - << static_cast(len_c_expected) - << ". You can change this code to successfully" - << " read it later, if needed."; - // insert code here to read "wrong" type. Might have a switch statement. - } - is.read(reinterpret_cast(t), sizeof(*t)); - } else { - if (sizeof(*t) == 1) { - int16 i; - is >> i; - *t = i; - } else { - is >> *t; - } - } - if (is.fail()) { - KALDI_ERR << "Read failure in ReadBasicType, file position is " - << is.tellg() << ", next char is " << is.peek(); - } -} - -// Template that covers integers. -template -inline void WriteIntegerPairVector(std::ostream &os, bool binary, - const std::vector > &v) { - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - char sz = sizeof(T); // this is currently just a check. - os.write(&sz, 1); - int32 vecsz = static_cast(v.size()); - KALDI_ASSERT((size_t)vecsz == v.size()); - os.write(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (vecsz != 0) { - os.write(reinterpret_cast(&(v[0])), sizeof(T) * vecsz * 2); - } - } else { - // focus here is on prettiness of text form rather than - // efficiency of reading-in. - // reading-in is dominated by low-level operations anyway: - // for efficiency use binary. - os << "[ "; - typename std::vector >::const_iterator iter = v.begin(), - end = v.end(); - for (; iter != end; ++iter) { - if (sizeof(T) == 1) - os << static_cast(iter->first) << ',' - << static_cast(iter->second) << ' '; - else - os << iter->first << ',' << iter->second << ' '; - } - os << "]\n"; - } - if (os.fail()) { - KALDI_ERR << "Write failure in WriteIntegerPairVector."; - } -} - -// Template that covers integers. -template -inline void ReadIntegerPairVector(std::istream &is, bool binary, - std::vector > *v) { - KALDI_ASSERT_IS_INTEGER_TYPE(T); - KALDI_ASSERT(v != NULL); - if (binary) { - int sz = is.peek(); - if (sz == sizeof(T)) { - is.get(); - } else { // this is currently just a check. - KALDI_ERR << "ReadIntegerPairVector: expected to see type of size " - << sizeof(T) << ", saw instead " << sz << ", at file position " - << is.tellg(); - } - int32 vecsz; - is.read(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (is.fail() || vecsz < 0) goto bad; - v->resize(vecsz); - if (vecsz > 0) { - is.read(reinterpret_cast(&((*v)[0])), sizeof(T) * vecsz * 2); - } - } else { - std::vector > tmp_v; // use temporary so v doesn't use - // extra memory due to resizing. - is >> std::ws; - if (is.peek() != static_cast('[')) { - KALDI_ERR << "ReadIntegerPairVector: expected to see [, saw " << is.peek() - << ", at file position " << is.tellg(); - } - is.get(); // consume the '['. - is >> std::ws; // consume whitespace. - while (is.peek() != static_cast(']')) { - if (sizeof(T) == 1) { // read/write chars as numbers. 
- int16 next_t1, next_t2; - is >> next_t1; - if (is.fail()) goto bad; - if (is.peek() != static_cast(',')) - KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " - << is.peek() << ", at file position " << is.tellg(); - is.get(); // consume the ','. - is >> next_t2 >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back(std::make_pair((T)next_t1, (T)next_t2)); - } else { - T next_t1, next_t2; - is >> next_t1; - if (is.fail()) goto bad; - if (is.peek() != static_cast(',')) - KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " - << is.peek() << ", at file position " << is.tellg(); - is.get(); // consume the ','. - is >> next_t2 >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back(std::pair(next_t1, next_t2)); - } - } - is.get(); // get the final ']'. - *v = tmp_v; // could use std::swap to use less temporary memory, but this - // uses less permanent memory. - } - if (!is.fail()) return; -bad: - KALDI_ERR << "ReadIntegerPairVector: read failure at file position " - << is.tellg(); -} - -template -inline void WriteIntegerVector(std::ostream &os, bool binary, - const std::vector &v) { - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - char sz = sizeof(T); // this is currently just a check. - os.write(&sz, 1); - int32 vecsz = static_cast(v.size()); - KALDI_ASSERT((size_t)vecsz == v.size()); - os.write(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (vecsz != 0) { - os.write(reinterpret_cast(&(v[0])), sizeof(T) * vecsz); - } - } else { - // focus here is on prettiness of text form rather than - // efficiency of reading-in. - // reading-in is dominated by low-level operations anyway: - // for efficiency use binary. - os << "[ "; - typename std::vector::const_iterator iter = v.begin(), end = v.end(); - for (; iter != end; ++iter) { - if (sizeof(T) == 1) - os << static_cast(*iter) << " "; - else - os << *iter << " "; - } - os << "]\n"; - } - if (os.fail()) { - KALDI_ERR << "Write failure in WriteIntegerVector."; - } -} - -template -inline void ReadIntegerVector(std::istream &is, bool binary, - std::vector *v) { - KALDI_ASSERT_IS_INTEGER_TYPE(T); - KALDI_ASSERT(v != NULL); - if (binary) { - int sz = is.peek(); - if (sz == sizeof(T)) { - is.get(); - } else { // this is currently just a check. - KALDI_ERR << "ReadIntegerVector: expected to see type of size " - << sizeof(T) << ", saw instead " << sz << ", at file position " - << is.tellg(); - } - int32 vecsz; - is.read(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (is.fail() || vecsz < 0) goto bad; - v->resize(vecsz); - if (vecsz > 0) { - is.read(reinterpret_cast(&((*v)[0])), sizeof(T) * vecsz); - } - } else { - std::vector tmp_v; // use temporary so v doesn't use extra memory - // due to resizing. - is >> std::ws; - if (is.peek() != static_cast('[')) { - KALDI_ERR << "ReadIntegerVector: expected to see [, saw " << is.peek() - << ", at file position " << is.tellg(); - } - is.get(); // consume the '['. - is >> std::ws; // consume whitespace. - while (is.peek() != static_cast(']')) { - if (sizeof(T) == 1) { // read/write chars as numbers. - int16 next_t; - is >> next_t >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back((T)next_t); - } else { - T next_t; - is >> next_t >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back(next_t); - } - } - is.get(); // get the final ']'. - *v = tmp_v; // could use std::swap to use less temporary memory, but this - // uses less permanent memory. 
- } - if (!is.fail()) return; -bad: - KALDI_ERR << "ReadIntegerVector: read failure at file position " - << is.tellg(); -} - -// Initialize an opened stream for writing by writing an optional binary -// header and modifying the floating-point precision. -inline void InitKaldiOutputStream(std::ostream &os, bool binary) { - // This does not throw exceptions (does not check for errors). - if (binary) { - os.put('\0'); - os.put('B'); - } - // Note, in non-binary mode we may at some point want to mess with - // the precision a bit. - // 7 is a bit more than the precision of float.. - if (os.precision() < 7) os.precision(7); -} - -/// Initialize an opened stream for reading by detecting the binary header and -// setting the "binary" value appropriately. -inline bool InitKaldiInputStream(std::istream &is, bool *binary) { - // Sets the 'binary' variable. - // Throws exception in the very unusual situation that stream - // starts with '\0' but not then 'B'. - - if (is.peek() == '\0') { // seems to be binary - is.get(); - if (is.peek() != 'B') { - return false; - } - is.get(); - *binary = true; - return true; - } else { - *binary = false; - return true; - } -} - -} // end namespace kaldi. - -#endif // KALDI_BASE_IO_FUNCS_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/io-funcs.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/io-funcs.cc deleted file mode 100644 index bd6c350780d1096ff8c452fd00864aa07a30ac65..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/io-funcs.cc +++ /dev/null @@ -1,215 +0,0 @@ -// base/io-funcs.cc - -// Copyright 2009-2011 Microsoft Corporation; Saarland University - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/io-funcs.h" -#include "base/kaldi-math.h" - -namespace kaldi { - -template <> -void WriteBasicType(std::ostream &os, bool binary, bool b) { - os << (b ? "T" : "F"); - if (!binary) os << " "; - if (os.fail()) KALDI_ERR << "Write failure in WriteBasicType"; -} - -template <> -void ReadBasicType(std::istream &is, bool binary, bool *b) { - KALDI_PARANOID_ASSERT(b != NULL); - if (!binary) is >> std::ws; // eat up whitespace. 
- char c = is.peek(); - if (c == 'T') { - *b = true; - is.get(); - } else if (c == 'F') { - *b = false; - is.get(); - } else { - KALDI_ERR << "Read failure in ReadBasicType, file position is " - << is.tellg() << ", next char is " << CharToString(c); - } -} - -template <> -void WriteBasicType(std::ostream &os, bool binary, float f) { - if (binary) { - char c = sizeof(f); - os.put(c); - os.write(reinterpret_cast(&f), sizeof(f)); - } else { - os << f << " "; - } -} - -template <> -void WriteBasicType(std::ostream &os, bool binary, double f) { - if (binary) { - char c = sizeof(f); - os.put(c); - os.write(reinterpret_cast(&f), sizeof(f)); - } else { - os << f << " "; - } -} - -template <> -void ReadBasicType(std::istream &is, bool binary, float *f) { - KALDI_PARANOID_ASSERT(f != NULL); - if (binary) { - double d; - int c = is.peek(); - if (c == sizeof(*f)) { - is.get(); - is.read(reinterpret_cast(f), sizeof(*f)); - } else if (c == sizeof(d)) { - ReadBasicType(is, binary, &d); - *f = d; - } else { - KALDI_ERR << "ReadBasicType: expected float, saw " << is.peek() - << ", at file position " << is.tellg(); - } - } else { - is >> *f; - } - if (is.fail()) { - KALDI_ERR << "ReadBasicType: failed to read, at file position " - << is.tellg(); - } -} - -template <> -void ReadBasicType(std::istream &is, bool binary, double *d) { - KALDI_PARANOID_ASSERT(d != NULL); - if (binary) { - float f; - int c = is.peek(); - if (c == sizeof(*d)) { - is.get(); - is.read(reinterpret_cast(d), sizeof(*d)); - } else if (c == sizeof(f)) { - ReadBasicType(is, binary, &f); - *d = f; - } else { - KALDI_ERR << "ReadBasicType: expected float, saw " << is.peek() - << ", at file position " << is.tellg(); - } - } else { - is >> *d; - } - if (is.fail()) { - KALDI_ERR << "ReadBasicType: failed to read, at file position " - << is.tellg(); - } -} - -void CheckToken(const char *token) { - if (*token == '\0') KALDI_ERR << "Token is empty (not a valid token)"; - const char *orig_token = token; - while (*token != '\0') { - if (::isspace(*token)) - KALDI_ERR << "Token is not a valid token (contains space): '" - << orig_token << "'"; - token++; - } -} - -void WriteToken(std::ostream &os, bool binary, const char *token) { - // binary mode is ignored; - // we use space as termination character in either case. - KALDI_ASSERT(token != NULL); - CheckToken(token); // make sure it's valid (can be read back) - os << token << " "; - if (os.fail()) { - KALDI_ERR << "Write failure in WriteToken."; - } -} - -int Peek(std::istream &is, bool binary) { - if (!binary) is >> std::ws; // eat up whitespace. - return is.peek(); -} - -void WriteToken(std::ostream &os, bool binary, const std::string &token) { - WriteToken(os, binary, token.c_str()); -} - -void ReadToken(std::istream &is, bool binary, std::string *str) { - KALDI_ASSERT(str != NULL); - if (!binary) is >> std::ws; // consume whitespace. - is >> *str; - if (is.fail()) { - KALDI_ERR << "ReadToken, failed to read token at file position " - << is.tellg(); - } - if (!isspace(is.peek())) { - KALDI_ERR << "ReadToken, expected space after token, saw instead " - << CharToString(static_cast(is.peek())) - << ", at file position " << is.tellg(); - } - is.get(); // consume the space. -} - -int PeekToken(std::istream &is, bool binary) { - if (!binary) is >> std::ws; // consume whitespace. - bool read_bracket; - if (static_cast(is.peek()) == '<') { - read_bracket = true; - is.get(); - } else { - read_bracket = false; - } - int ans = is.peek(); - if (read_bracket) { - if (!is.unget()) { - // Clear the bad bit. 
This code can be (and is in fact) reached, since the - // C++ standard does not guarantee that a call to unget() must succeed. - is.clear(); - } - } - return ans; -} - -void ExpectToken(std::istream &is, bool binary, const char *token) { - int pos_at_start = is.tellg(); - KALDI_ASSERT(token != NULL); - CheckToken(token); // make sure it's valid (can be read back) - if (!binary) is >> std::ws; // consume whitespace. - std::string str; - is >> str; - is.get(); // consume the space. - if (is.fail()) { - KALDI_ERR << "Failed to read token [started at file position " - << pos_at_start << "], expected " << token; - } - // The second half of the '&&' expression below is so that if we're expecting - // "", we will accept "Foo>" instead. This is so that the model-reading - // code will tolerate errors in PeekToken where is.unget() failed; search for - // is.clear() in PeekToken() for an explanation. - if (strcmp(str.c_str(), token) != 0 && - !(token[0] == '<' && strcmp(str.c_str(), token + 1) == 0)) { - KALDI_ERR << "Expected token \"" << token << "\", got instead \"" << str - << "\"."; - } -} - -void ExpectToken(std::istream &is, bool binary, const std::string &token) { - ExpectToken(is, binary, token.c_str()); -} - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/io-funcs.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/io-funcs.h deleted file mode 100644 index 06ad1e3d2d8dc8385886a7c6653f620642c7c05a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/io-funcs.h +++ /dev/null @@ -1,246 +0,0 @@ -// base/io-funcs.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University; -// Jan Silovsky; Yanmin Qian -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_IO_FUNCS_H_ -#define KALDI_BASE_IO_FUNCS_H_ - -// This header only contains some relatively low-level I/O functions. -// The full Kaldi I/O declarations are in ../util/kaldi-io.h -// and ../util/kaldi-table.h -// They were put in util/ in order to avoid making the Matrix library -// dependent on them. - -#include -#include -#include -#include - -#include "base/io-funcs-inl.h" -#include "base/kaldi-common.h" - -namespace kaldi { - -/* - This comment describes the Kaldi approach to I/O. All objects can be written - and read in two modes: binary and text. In addition we want to make the I/O - work if we redefine the typedef "BaseFloat" between floats and doubles. - We also want to have control over whitespace in text mode without affecting - the meaning of the file, for pretty-printing purposes. - - Errors are handled by throwing a KaldiFatalError exception. 
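On disk, the binary and text modes discussed here are told apart by the two-byte `'\0' 'B'` header written by `InitKaldiOutputStream` and detected by `InitKaldiInputStream` (both shown earlier in io-funcs-inl.h). A standalone sketch of that convention, with the helpers re-implemented locally so the example compiles without the Kaldi headers:

```cpp
#include <iostream>
#include <sstream>

// Write the optional binary marker: a NUL byte followed by 'B'.
void WriteKaldiHeaderDemo(std::ostream &os, bool binary) {
  if (binary) {
    os.put('\0');
    os.put('B');
  }
}

// Detect the marker and report whether the stream is binary or text.
bool DetectBinaryDemo(std::istream &is) {
  if (is.peek() == '\0') {
    is.get();  // consume '\0'
    is.get();  // consume 'B'
    return true;
  }
  return false;
}

int main() {
  std::stringstream ss;
  WriteKaldiHeaderDemo(ss, /*binary=*/true);
  ss << "payload";
  std::cout << (DetectBinaryDemo(ss) ? "binary" : "text") << " stream\n";  // binary stream
  return 0;
}
```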
- - For integer and floating-point types (and boolean values): - - WriteBasicType(std::ostream &, bool binary, const T&); - ReadBasicType(std::istream &, bool binary, T*); - - and we expect these functions to be defined in such a way that they work when - the type T changes between float and double, so you can read float into double - and vice versa]. Note that for efficiency and space-saving reasons, the - Vector and Matrix classes do not use these functions [but they preserve the - type interchangeability in their own way] - - For a class (or struct) C: - class C { - .. - Write(std::ostream &, bool binary, [possibly extra optional args for - specific classes]) const; Read(std::istream &, bool binary, [possibly extra - optional args for specific classes]); - .. - } - NOTE: The only actual optional args we used are the "add" arguments in - Vector/Matrix classes, which specify whether we should sum the data already - in the class with the data being read. - - For types which are typedef's involving stl classes, I/O is as follows: - typedef std::vector > MyTypedefName; - - The user should define something like: - - WriteMyTypedefName(std::ostream &, bool binary, const MyTypedefName &t); - ReadMyTypedefName(std::ostream &, bool binary, MyTypedefName *t); - - The user would have to write these functions. - - For a type std::vector: - - void WriteIntegerVector(std::ostream &os, bool binary, const std::vector - &v); void ReadIntegerVector(std::istream &is, bool binary, std::vector *v); - - For other types, e.g. vectors of pairs, the user should create a routine of - the type WriteMyTypedefName. This is to avoid introducing confusing templated - functions; we could easily create templated functions to handle most of these - cases but they would have to share the same name. - - It also often happens that the user needs to write/read special tokens as part - of a file. These might be class headers, or separators/identifiers in the - class. We provide special functions for manipulating these. These special - tokens must be nonempty and must not contain any whitespace. - - void WriteToken(std::ostream &os, bool binary, const char*); - void WriteToken(std::ostream &os, bool binary, const std::string & token); - int Peek(std::istream &is, bool binary); - void ReadToken(std::istream &is, bool binary, std::string *str); - void PeekToken(std::istream &is, bool binary, std::string *str); - - WriteToken writes the token and one space (whether in binary or text mode). - - Peek returns the first character of the next token, by consuming whitespace - (in text mode) and then returning the peek() character. It returns -1 at EOF; - it doesn't throw. It's useful if a class can have various forms based on - typedefs and virtual classes, and wants to know which version to read. - - ReadToken allows the caller to obtain the next token. PeekToken works just - like ReadToken, but seeks back to the beginning of the token. A subsequent - call to ReadToken will read the same token again. This is useful when - different object types are written to the same file; using PeekToken one can - decide which of the objects to read. - - There is currently no special functionality for writing/reading strings (where - the strings contain data rather than "special tokens" that are whitespace-free - and nonempty). This is because Kaldi is structured in such a way that strings - don't appear, except as OpenFst symbol table entries (and these have their own - format). 
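The object convention described in the comment above, a `Write(std::ostream&, bool binary)` / `Read(std::istream&, bool binary)` pair with a token marking the object, can be illustrated with a hypothetical struct. The struct, its `<Point>` token, and the text-only serialisation are editorial examples, not types from the tree being removed:

```cpp
#include <iostream>
#include <sstream>
#include <string>

struct Point {
  int x = 0, y = 0;
  void Write(std::ostream &os, bool binary) const {
    // Text form only, for brevity; a real implementation branches on 'binary'.
    os << "<Point> " << x << " " << y << " ";
  }
  void Read(std::istream &is, bool binary) {
    std::string tok;
    is >> tok;       // expect the "<Point>" marker
    is >> x >> y;
  }
};

int main() {
  std::stringstream ss;
  Point a{3, 4};
  a.Write(ss, /*binary=*/false);
  Point b;
  b.Read(ss, /*binary=*/false);
  std::cout << b.x << " " << b.y << "\n";  // prints: 3 4
  return 0;
}
```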
- - - NOTE: you should not call ReadIntegerType and WriteIntegerType with types, - such as int and size_t, that are machine-independent -- at least not - if you want your file formats to port between machines. Use int32 and - int64 where necessary. There is no way to detect this using compile-time - assertions because C++ only keeps track of the internal representation of - the type. -*/ - -/// \addtogroup io_funcs_basic -/// @{ - -/// WriteBasicType is the name of the write function for bool, integer types, -/// and floating-point types. They all throw on error. -template -void WriteBasicType(std::ostream &os, bool binary, T t); - -/// ReadBasicType is the name of the read function for bool, integer types, -/// and floating-point types. They all throw on error. -template -void ReadBasicType(std::istream &is, bool binary, T *t); - -// Declare specialization for bool. -template <> -void WriteBasicType(std::ostream &os, bool binary, bool b); - -template <> -void ReadBasicType(std::istream &is, bool binary, bool *b); - -// Declare specializations for float and double. -template <> -void WriteBasicType(std::ostream &os, bool binary, float f); - -template <> -void WriteBasicType(std::ostream &os, bool binary, double f); - -template <> -void ReadBasicType(std::istream &is, bool binary, float *f); - -template <> -void ReadBasicType(std::istream &is, bool binary, double *f); - -// Define ReadBasicType that accepts an "add" parameter to add to -// the destination. Caution: if used in Read functions, be careful -// to initialize the parameters concerned to zero in the default -// constructor. -template -inline void ReadBasicType(std::istream &is, bool binary, T *t, bool add) { - if (!add) { - ReadBasicType(is, binary, t); - } else { - T tmp = T(0); - ReadBasicType(is, binary, &tmp); - *t += tmp; - } -} - -/// Function for writing STL vectors of integer types. -template -inline void WriteIntegerVector(std::ostream &os, bool binary, - const std::vector &v); - -/// Function for reading STL vector of integer types. -template -inline void ReadIntegerVector(std::istream &is, bool binary, std::vector *v); - -/// Function for writing STL vectors of pairs of integer types. -template -inline void WriteIntegerPairVector(std::ostream &os, bool binary, - const std::vector > &v); - -/// Function for reading STL vector of pairs of integer types. -template -inline void ReadIntegerPairVector(std::istream &is, bool binary, - std::vector > *v); - -/// The WriteToken functions are for writing nonempty sequences of non-space -/// characters. They are not for general strings. -void WriteToken(std::ostream &os, bool binary, const char *token); -void WriteToken(std::ostream &os, bool binary, const std::string &token); - -/// Peek consumes whitespace (if binary == false) and then returns the peek() -/// value of the stream. -int Peek(std::istream &is, bool binary); - -/// ReadToken gets the next token and puts it in str (exception on failure). If -/// PeekToken() had been previously called, it is possible that the stream had -/// failed to unget the starting '<' character. In this case ReadToken() returns -/// the token string without the leading '<'. You must be prepared to handle -/// this case. ExpectToken() handles this internally, and is not affected. -void ReadToken(std::istream &is, bool binary, std::string *token); - -/// PeekToken will return the first character of the next token, or -1 if end of -/// file. 
It's the same as Peek(), except if the first character is '<' it will -/// skip over it and will return the next character. It will attempt to unget -/// the '<' so the stream is where it was before you did PeekToken(), however, -/// this is not guaranteed (see ReadToken()). -int PeekToken(std::istream &is, bool binary); - -/// ExpectToken tries to read in the given token, and throws an exception -/// on failure. -void ExpectToken(std::istream &is, bool binary, const char *token); -void ExpectToken(std::istream &is, bool binary, const std::string &token); - -/// ExpectPretty attempts to read the text in "token", but only in non-binary -/// mode. Throws exception on failure. It expects an exact match except that -/// arbitrary whitespace matches arbitrary whitespace. -void ExpectPretty(std::istream &is, bool binary, const char *token); -void ExpectPretty(std::istream &is, bool binary, const std::string &token); - -/// @} end "addtogroup io_funcs_basic" - -/// InitKaldiOutputStream initializes an opened stream for writing by writing an -/// optional binary header and modifying the floating-point precision; it will -/// typically not be called by users directly. -inline void InitKaldiOutputStream(std::ostream &os, bool binary); - -/// InitKaldiInputStream initializes an opened stream for reading by detecting -/// the binary header and setting the "binary" value appropriately; -/// It will typically not be called by users directly. -inline bool InitKaldiInputStream(std::istream &is, bool *binary); - -} // end namespace kaldi. -#endif // KALDI_BASE_IO_FUNCS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/kaldi-common.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/kaldi-common.h deleted file mode 100644 index eee5f34d7234e7c029e6bb59584d3ee65ff5a875..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/kaldi-common.h +++ /dev/null @@ -1,41 +0,0 @@ -// base/kaldi-common.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
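The token helpers declared above write a nonempty, whitespace-free token followed by a single space, and read it back by skipping leading whitespace and consuming that trailing separator. A minimal local stand-in demonstrating the round trip (the `Demo` names and the `<TransitionModel>` token are illustrative only):

```cpp
#include <iostream>
#include <sstream>
#include <string>

void WriteTokenDemo(std::ostream &os, const std::string &token) {
  os << token << " ";        // a single space terminates the token in both modes
}

void ReadTokenDemo(std::istream &is, std::string *token) {
  is >> std::ws >> *token;   // skip whitespace, read up to the next whitespace
  is.get();                  // consume the trailing space
}

int main() {
  std::stringstream ss;
  WriteTokenDemo(ss, "<TransitionModel>");
  std::string tok;
  ReadTokenDemo(ss, &tok);
  std::cout << tok << "\n";  // prints: <TransitionModel>
  return 0;
}
```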
- -#ifndef KALDI_BASE_KALDI_COMMON_H_ -#define KALDI_BASE_KALDI_COMMON_H_ 1 - -#include -#include -#include // C string stuff like strcpy -#include -#include -#include -#include -#include -#include -#include - -#include "base/kaldi-utils.h" -#include "base/kaldi-error.h" -#include "base/kaldi-types.h" -// #include "base/io-funcs.h" -#include "base/kaldi-math.h" -// #include "base/timer.h" - -#endif // KALDI_BASE_KALDI_COMMON_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/kaldi-error.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/kaldi-error.cc deleted file mode 100644 index 77edc6af6e56bb8fa3431d519e58fda9ee0bac6a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/kaldi-error.cc +++ /dev/null @@ -1,42 +0,0 @@ -// base/kaldi-error.cc - -// Copyright 2019 LAIX (Yi Sun) -// Copyright 2019 SmartAction LLC (kkm) -// Copyright 2016 Brno University of Technology (author: Karel Vesely) -// Copyright 2009-2011 Microsoft Corporation; Lukas Burget; Ondrej Glembek - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-error.h" - -#include - -namespace kaldi { - -/***** GLOBAL VARIABLES FOR LOGGING *****/ - -int32 g_kaldi_verbose_level = 0; -static std::string program_name; // NOLINT - -void SetProgramName(const char *basename) { - // Using the 'static std::string' for the program name is mostly harmless, - // because (a) Kaldi logging is undefined before main(), and (b) no stdc++ - // string implementation has been found in the wild that would not be just - // an empty string when zero-initialized but not yet constructed. - program_name = basename; -} - -} // namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/kaldi-error.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/kaldi-error.h deleted file mode 100644 index 0f65db372b5f05a8017433eed7c95badc819a0a6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/kaldi-error.h +++ /dev/null @@ -1,57 +0,0 @@ -// base/kaldi-error.h - -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
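As the kaldi/README.md earlier in this diff notes, these files route Kaldi logging through glog. A short sketch of what the `KALDI_*` macros amount to in plain glog terms (assumes glog is installed and linked; only standard glog calls are used):

```cpp
#include <glog/logging.h>

int main(int argc, char *argv[]) {
  google::InitGoogleLogging(argv[0]);  // analogous to SetProgramName() above
  LOG(INFO) << "what KALDI_LOG prints";
  LOG(WARNING) << "what KALDI_WARN prints";
  CHECK(argc >= 1) << "KALDI_ASSERT maps to CHECK";
  VLOG(1) << "KALDI_VLOG(1): emitted only when the verbose level is >= 1";
  return 0;
}
```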
- -#ifndef KALDI_BASE_KALDI_ERROR_H_ -#define KALDI_BASE_KALDI_ERROR_H_ 1 - -#include "utils/log.h" - -namespace kaldi { - -#define KALDI_WARN \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_WARNING).stream() -#define KALDI_ERR \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_ERROR).stream() -#define KALDI_LOG \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_INFO).stream() -#define KALDI_VLOG(v) VLOG(v) - -#define KALDI_ASSERT(condition) CHECK(condition) - - -/***** PROGRAM NAME AND VERBOSITY LEVEL *****/ - -/// Called by ParseOptions to set base name (no directory) of the executing -/// program. The name is printed in logging code along with every message, -/// because in our scripts, we often mix together the stderr of many programs. -/// This function is very thread-unsafe. -void SetProgramName(const char *basename); - -/// This is set by util/parse-options.{h,cc} if you set --verbose=? option. -/// Do not use directly, prefer {Get,Set}VerboseLevel(). -extern int32 g_kaldi_verbose_level; - -/// Get verbosity level, usually set via command line '--verbose=' switch. -inline int32 GetVerboseLevel() { return g_kaldi_verbose_level; } - -/// This should be rarely used, except by programs using Kaldi as library; -/// command-line programs set the verbose level automatically from ParseOptions. -inline void SetVerboseLevel(int32 i) { g_kaldi_verbose_level = i; } - -} // namespace kaldi - -#endif // KALDI_BASE_KALDI_ERROR_H_ - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/kaldi-math.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/kaldi-math.cc deleted file mode 100644 index 175d9f49b6c5216645e90e146f4e2eab5572c342..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/kaldi-math.cc +++ /dev/null @@ -1,164 +0,0 @@ -// base/kaldi-math.cc - -// Copyright 2009-2011 Microsoft Corporation; Yanmin Qian; -// Saarland University; Jan Silovsky - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-math.h" -#ifndef _MSC_VER -#include -#include -#endif -#include -#include - -namespace kaldi { -// These routines are tested in matrix/matrix-test.cc - -int32 RoundUpToNearestPowerOfTwo(int32 n) { - KALDI_ASSERT(n > 0); - n--; - n |= n >> 1; - n |= n >> 2; - n |= n >> 4; - n |= n >> 8; - n |= n >> 16; - return n+1; -} - -static std::mutex _RandMutex; - -int Rand(struct RandomState* state) { -#if !defined(_POSIX_THREAD_SAFE_FUNCTIONS) - // On Windows and Cygwin, just call Rand() - return rand(); -#else - if (state) { - return rand_r(&(state->seed)); - } else { - std::lock_guard lock(_RandMutex); - return rand(); - } -#endif -} - -RandomState::RandomState() { - // we initialize it as Rand() + 27437 instead of just Rand(), because on some - // systems, e.g. 
at the very least Mac OSX Yosemite and later, it seems to be - // the case that rand_r when initialized with rand() will give you the exact - // same sequence of numbers that rand() will give if you keep calling rand() - // after that initial call. This can cause problems with repeated sequences. - // For example if you initialize two RandomState structs one after the other - // without calling rand() in between, they would give you the same sequence - // offset by one (if we didn't have the "+ 27437" in the code). 27437 is just - // a randomly chosen prime number. - seed = unsigned(Rand()) + 27437; -} - -bool WithProb(BaseFloat prob, struct RandomState* state) { - KALDI_ASSERT(prob >= 0 && prob <= 1.1); // prob should be <= 1.0, - // but we allow slightly larger values that could arise from roundoff in - // previous calculations. - KALDI_COMPILE_TIME_ASSERT(RAND_MAX > 128 * 128); - if (prob == 0) { - return false; - } else if (prob == 1.0) { - return true; - } else if (prob * RAND_MAX < 128.0) { - // prob is very small but nonzero, and the "main algorithm" - // wouldn't work that well. So: with probability 1/128, we - // return WithProb (prob * 128), else return false. - if (Rand(state) < RAND_MAX / 128) { // with probability 128... - // Note: we know that prob * 128.0 < 1.0, because - // we asserted RAND_MAX > 128 * 128. - return WithProb(prob * 128.0); - } else { - return false; - } - } else { - return (Rand(state) < ((RAND_MAX + static_cast(1.0)) * prob)); - } -} - -int32 RandInt(int32 min_val, int32 max_val, struct RandomState* state) { - // This is not exact. - KALDI_ASSERT(max_val >= min_val); - if (max_val == min_val) return min_val; - -#ifdef _MSC_VER - // RAND_MAX is quite small on Windows -> may need to handle larger numbers. - if (RAND_MAX > (max_val-min_val)*8) { - // *8 to avoid large inaccuracies in probability, from the modulus... - return min_val + - ((unsigned int)Rand(state) % (unsigned int)(max_val+1-min_val)); - } else { - if ((unsigned int)(RAND_MAX*RAND_MAX) > - (unsigned int)((max_val+1-min_val)*8)) { - // *8 to avoid inaccuracies in probability, from the modulus... - return min_val + ( (unsigned int)( (Rand(state)+RAND_MAX*Rand(state))) - % (unsigned int)(max_val+1-min_val)); - } else { - KALDI_ERR << "rand_int failed because we do not support such large " - "random numbers. (Extend this function)."; - } - } -#else - return min_val + - (static_cast(Rand(state)) % static_cast(max_val+1-min_val)); -#endif -} - -// Returns poisson-distributed random number. -// Take care: this takes time proportional -// to lambda. Faster algorithms exist but are more complex. -int32 RandPoisson(float lambda, struct RandomState* state) { - // Knuth's algorithm. - KALDI_ASSERT(lambda >= 0); - float L = expf(-lambda), p = 1.0; - int32 k = 0; - do { - k++; - float u = RandUniform(state); - p *= u; - } while (p > L); - return k-1; -} - -void RandGauss2(float *a, float *b, RandomState *state) { - KALDI_ASSERT(a); - KALDI_ASSERT(b); - float u1 = RandUniform(state); - float u2 = RandUniform(state); - u1 = sqrtf(-2.0f * logf(u1)); - u2 = 2.0f * M_PI * u2; - *a = u1 * cosf(u2); - *b = u1 * sinf(u2); -} - -void RandGauss2(double *a, double *b, RandomState *state) { - KALDI_ASSERT(a); - KALDI_ASSERT(b); - float a_float, b_float; - // Just because we're using doubles doesn't mean we need super-high-quality - // random numbers, so we just use the floating-point version internally. 
- RandGauss2(&a_float, &b_float, state); - *a = a_float; - *b = b_float; -} - - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/kaldi-math.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/kaldi-math.h deleted file mode 100644 index 93c265ee96e704893da26b9083a44a9e60c6c192..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/kaldi-math.h +++ /dev/null @@ -1,363 +0,0 @@ -// base/kaldi-math.h - -// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; Yanmin Qian; -// Jan Silovsky; Saarland University -// -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_KALDI_MATH_H_ -#define KALDI_BASE_KALDI_MATH_H_ 1 - -#ifdef _MSC_VER -#include -#endif - -#include -#include -#include - -#include "base/kaldi-types.h" -#include "base/kaldi-common.h" - - -#ifndef DBL_EPSILON -#define DBL_EPSILON 2.2204460492503131e-16 -#endif -#ifndef FLT_EPSILON -#define FLT_EPSILON 1.19209290e-7f -#endif - -#ifndef M_PI -#define M_PI 3.1415926535897932384626433832795 -#endif - -#ifndef M_SQRT2 -#define M_SQRT2 1.4142135623730950488016887 -#endif - -#ifndef M_2PI -#define M_2PI 6.283185307179586476925286766559005 -#endif - -#ifndef M_SQRT1_2 -#define M_SQRT1_2 0.7071067811865475244008443621048490 -#endif - -#ifndef M_LOG_2PI -#define M_LOG_2PI 1.8378770664093454835606594728112 -#endif - -#ifndef M_LN2 -#define M_LN2 0.693147180559945309417232121458 -#endif - -#ifndef M_LN10 -#define M_LN10 2.302585092994045684017991454684 -#endif - - -#define KALDI_ISNAN std::isnan -#define KALDI_ISINF std::isinf -#define KALDI_ISFINITE(x) std::isfinite(x) - -#if !defined(KALDI_SQR) -# define KALDI_SQR(x) ((x) * (x)) -#endif - -namespace kaldi { - -#if !defined(_MSC_VER) || (_MSC_VER >= 1900) -inline double Exp(double x) { return exp(x); } -#ifndef KALDI_NO_EXPF -inline float Exp(float x) { return expf(x); } -#else -inline float Exp(float x) { return exp(static_cast(x)); } -#endif // KALDI_NO_EXPF -#else -inline double Exp(double x) { return exp(x); } -#if !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64) -// Microsoft CL v18.0 buggy 64-bit implementation of -// expf() incorrectly returns -inf for exp(-inf). 
-inline float Exp(float x) { return exp(static_cast(x)); } -#else -inline float Exp(float x) { return expf(x); } -#endif // !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64) -#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900) - -inline double Log(double x) { return log(x); } -inline float Log(float x) { return logf(x); } - -#if !defined(_MSC_VER) || (_MSC_VER >= 1700) -inline double Log1p(double x) { return log1p(x); } -inline float Log1p(float x) { return log1pf(x); } -#else -inline double Log1p(double x) { - const double cutoff = 1.0e-08; - if (x < cutoff) - return x - 0.5 * x * x; - else - return Log(1.0 + x); -} - -inline float Log1p(float x) { - const float cutoff = 1.0e-07; - if (x < cutoff) - return x - 0.5 * x * x; - else - return Log(1.0 + x); -} -#endif - -static const double kMinLogDiffDouble = Log(DBL_EPSILON); // negative! -static const float kMinLogDiffFloat = Log(FLT_EPSILON); // negative! - -// -infinity -const float kLogZeroFloat = -std::numeric_limits::infinity(); -const double kLogZeroDouble = -std::numeric_limits::infinity(); -const BaseFloat kLogZeroBaseFloat = -std::numeric_limits::infinity(); - -// Returns a random integer between 0 and RAND_MAX, inclusive -int Rand(struct RandomState* state = NULL); - -// State for thread-safe random number generator -struct RandomState { - RandomState(); - unsigned seed; -}; - -// Returns a random integer between first and last inclusive. -int32 RandInt(int32 first, int32 last, struct RandomState* state = NULL); - -// Returns true with probability "prob", -bool WithProb(BaseFloat prob, struct RandomState* state = NULL); -// with 0 <= prob <= 1 [we check this]. -// Internally calls Rand(). This function is carefully implemented so -// that it should work even if prob is very small. - -/// Returns a random number strictly between 0 and 1. -inline float RandUniform(struct RandomState* state = NULL) { - return static_cast((Rand(state) + 1.0) / (RAND_MAX+2.0)); -} - -inline float RandGauss(struct RandomState* state = NULL) { - return static_cast(sqrtf (-2 * Log(RandUniform(state))) - * cosf(2*M_PI*RandUniform(state))); -} - -// Returns poisson-distributed random number. Uses Knuth's algorithm. -// Take care: this takes time proportional -// to lambda. Faster algorithms exist but are more complex. -int32 RandPoisson(float lambda, struct RandomState* state = NULL); - -// Returns a pair of gaussian random numbers. Uses Box-Muller transform -void RandGauss2(float *a, float *b, RandomState *state = NULL); -void RandGauss2(double *a, double *b, RandomState *state = NULL); - -// Also see Vector::RandCategorical(). - -// This is a randomized pruning mechanism that preserves expectations, -// that we typically use to prune posteriors. -template -inline Float RandPrune(Float post, BaseFloat prune_thresh, - struct RandomState* state = NULL) { - KALDI_ASSERT(prune_thresh >= 0.0); - if (post == 0.0 || std::abs(post) >= prune_thresh) - return post; - return (post >= 0 ? 1.0 : -1.0) * - (RandUniform(state) <= fabs(post)/prune_thresh ? prune_thresh : 0.0); -} - -// returns log(exp(x) + exp(y)). -inline double LogAdd(double x, double y) { - double diff; - - if (x < y) { - diff = x - y; - x = y; - } else { - diff = y - x; - } - // diff is negative. x is now the larger one. - - if (diff >= kMinLogDiffDouble) { - double res; - res = x + Log1p(Exp(diff)); - return res; - } else { - return x; // return the larger one. - } -} - - -// returns log(exp(x) + exp(y)). 
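`LogAdd` above relies on the standard log-sum trick: with `x` the larger argument, log(e^x + e^y) = x + log1p(e^(y - x)), which never overflows and degenerates to `x` once the difference drops below machine precision. A standalone numerical check of that identity (the `Demo` function is a local re-derivation, not the Kaldi symbol):

```cpp
#include <cmath>
#include <cstdio>
#include <utility>

double LogAddDemo(double x, double y) {
  if (x < y) std::swap(x, y);            // make x the larger value
  double diff = y - x;                   // diff <= 0, so exp(diff) <= 1
  return x + std::log1p(std::exp(diff));
}

int main() {
  // exp(1000) overflows a double, but the log-domain sum stays finite:
  std::printf("%.6f\n", LogAddDemo(1000.0, 1000.0));                // 1000.693147 (= 1000 + ln 2)
  // Agrees with the naive formula whenever that formula does not overflow:
  std::printf("%.6f\n", std::log(std::exp(2.0) + std::exp(1.0)));   // 2.313262
  std::printf("%.6f\n", LogAddDemo(2.0, 1.0));                      // 2.313262
  return 0;
}
```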
-inline float LogAdd(float x, float y) { - float diff; - - if (x < y) { - diff = x - y; - x = y; - } else { - diff = y - x; - } - // diff is negative. x is now the larger one. - - if (diff >= kMinLogDiffFloat) { - float res; - res = x + Log1p(Exp(diff)); - return res; - } else { - return x; // return the larger one. - } -} - - -// returns log(exp(x) - exp(y)). -inline double LogSub(double x, double y) { - if (y >= x) { // Throws exception if y>=x. - if (y == x) - return kLogZeroDouble; - else - KALDI_ERR << "Cannot subtract a larger from a smaller number."; - } - - double diff = y - x; // Will be negative. - double res = x + Log(1.0 - Exp(diff)); - - // res might be NAN if diff ~0.0, and 1.0-exp(diff) == 0 to machine precision - if (KALDI_ISNAN(res)) - return kLogZeroDouble; - return res; -} - - -// returns log(exp(x) - exp(y)). -inline float LogSub(float x, float y) { - if (y >= x) { // Throws exception if y>=x. - if (y == x) - return kLogZeroDouble; - else - KALDI_ERR << "Cannot subtract a larger from a smaller number."; - } - - float diff = y - x; // Will be negative. - float res = x + Log(1.0f - Exp(diff)); - - // res might be NAN if diff ~0.0, and 1.0-exp(diff) == 0 to machine precision - if (KALDI_ISNAN(res)) - return kLogZeroFloat; - return res; -} - -/// return abs(a - b) <= relative_tolerance * (abs(a)+abs(b)). -static inline bool ApproxEqual(float a, float b, - float relative_tolerance = 0.001) { - // a==b handles infinities. - if (a == b) return true; - float diff = std::abs(a-b); - if (diff == std::numeric_limits::infinity() - || diff != diff) return false; // diff is +inf or nan. - return (diff <= relative_tolerance*(std::abs(a)+std::abs(b))); -} - -/// assert abs(a - b) <= relative_tolerance * (abs(a)+abs(b)) -static inline void AssertEqual(float a, float b, - float relative_tolerance = 0.001) { - // a==b handles infinities. - KALDI_ASSERT(ApproxEqual(a, b, relative_tolerance)); -} - - -// RoundUpToNearestPowerOfTwo does the obvious thing. It crashes if n <= 0. -int32 RoundUpToNearestPowerOfTwo(int32 n); - -/// Returns a / b, rounding towards negative infinity in all cases. -static inline int32 DivideRoundingDown(int32 a, int32 b) { - KALDI_ASSERT(b != 0); - if (a * b >= 0) - return a / b; - else if (a < 0) - return (a - b + 1) / b; - else - return (a - b - 1) / b; -} - -template I Gcd(I m, I n) { - if (m == 0 || n == 0) { - if (m == 0 && n == 0) { // gcd not defined, as all integers are divisors. - KALDI_ERR << "Undefined GCD since m = 0, n = 0."; - } - return (m == 0 ? (n > 0 ? n : -n) : ( m > 0 ? m : -m)); - // return absolute value of whichever is nonzero - } - // could use compile-time assertion - // but involves messing with complex template stuff. - KALDI_ASSERT(std::numeric_limits::is_integer); - while (1) { - m %= n; - if (m == 0) return (n > 0 ? n : -n); - n %= m; - if (n == 0) return (m > 0 ? m : -m); - } -} - -/// Returns the least common multiple of two integers. Will -/// crash unless the inputs are positive. -template I Lcm(I m, I n) { - KALDI_ASSERT(m > 0 && n > 0); - I gcd = Gcd(m, n); - return gcd * (m/gcd) * (n/gcd); -} - - -template void Factorize(I m, std::vector *factors) { - // Splits a number into its prime factors, in sorted order from - // least to greatest, with duplication. A very inefficient - // algorithm, which is mainly intended for use in the - // mixed-radix FFT computation (where we assume most factors - // are small). - KALDI_ASSERT(factors != NULL); - KALDI_ASSERT(m >= 1); // Doesn't work for zero or negative numbers. 
- factors->clear(); - I small_factors[10] = { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29 }; - - // First try small factors. - for (I i = 0; i < 10; i++) { - if (m == 1) return; // We're done. - while (m % small_factors[i] == 0) { - m /= small_factors[i]; - factors->push_back(small_factors[i]); - } - } - // Next try all odd numbers starting from 31. - for (I j = 31;; j += 2) { - if (m == 1) return; - while (m % j == 0) { - m /= j; - factors->push_back(j); - } - } -} - -inline double Hypot(double x, double y) { return hypot(x, y); } -inline float Hypot(float x, float y) { return hypotf(x, y); } - - - - -} // namespace kaldi - - -#endif // KALDI_BASE_KALDI_MATH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/kaldi-types.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/kaldi-types.h deleted file mode 100644 index 7ebf4f85386192a65e176d8f0ecde9bb348af4a0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/kaldi-types.h +++ /dev/null @@ -1,75 +0,0 @@ -// base/kaldi-types.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University; -// Jan Silovsky; Yanmin Qian - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_KALDI_TYPES_H_ -#define KALDI_BASE_KALDI_TYPES_H_ 1 - -namespace kaldi { -// TYPEDEFS .................................................................. 
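The `Factorize` template just above (end of kaldi-math.h) performs simple trial division and returns the prime factors sorted with duplicates. A standalone version of the same routine, shown only to make the output format concrete (the `Demo` name is editorial):

```cpp
#include <cstdio>
#include <vector>

void FactorizeDemo(int m, std::vector<int> *factors) {
  factors->clear();
  int p = 2;
  while (m > 1) {
    while (m % p == 0) {        // divide out each prime as many times as it fits
      m /= p;
      factors->push_back(p);
    }
    p = (p == 2) ? 3 : p + 2;   // try 2 first, then only odd candidates
  }
}

int main() {
  std::vector<int> f;
  FactorizeDemo(360, &f);
  for (int p : f) std::printf("%d ", p);  // prints: 2 2 2 3 3 5
  std::printf("\n");
  return 0;
}
```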
-#if (KALDI_DOUBLEPRECISION != 0) -typedef double BaseFloat; -#else -typedef float BaseFloat; -#endif -} - -#ifdef _MSC_VER -#include -#define ssize_t SSIZE_T -#endif - -// we can do this a different way if some platform -// we find in the future lacks stdint.h -#include - -// for discussion on what to do if you need compile kaldi -// without OpenFST, see the bottom of this this file -#include - -namespace kaldi { - using ::int16; - using ::int32; - using ::int64; - using ::uint16; - using ::uint32; - using ::uint64; - typedef float float32; - typedef double double64; -} // end namespace kaldi - -// In a theoretical case you decide compile Kaldi without the OpenFST -// comment the previous namespace statement and uncomment the following -/* -namespace kaldi { - typedef int8_t int8; - typedef int16_t int16; - typedef int32_t int32; - typedef int64_t int64; - - typedef uint8_t uint8; - typedef uint16_t uint16; - typedef uint32_t uint32; - typedef uint64_t uint64; - typedef float float32; - typedef double double64; -} // end namespace kaldi -*/ - -#endif // KALDI_BASE_KALDI_TYPES_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/kaldi-utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/kaldi-utils.h deleted file mode 100644 index bd434d09ed92ec94bc4208f53a4416f941edfdb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/base/kaldi-utils.h +++ /dev/null @@ -1,155 +0,0 @@ -// base/kaldi-utils.h - -// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; -// Saarland University; Karel Vesely; Yanmin Qian - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_KALDI_UTILS_H_ -#define KALDI_BASE_KALDI_UTILS_H_ 1 - -#if defined(_MSC_VER) -# define WIN32_LEAN_AND_MEAN -# define NOMINMAX -# include -#endif - -#ifdef _MSC_VER -#include -#define unlink _unlink -#else -#include -#endif - -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4056 4305 4800 4267 4996 4756 4661) -#if _MSC_VER < 1400 -#define __restrict__ -#else -#define __restrict__ __restrict -#endif -#endif - -#if defined(_MSC_VER) -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (*(pp_orig) = _aligned_malloc(size, align)) -# define KALDI_MEMALIGN_FREE(x) _aligned_free(x) -#elif defined(__CYGWIN__) -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (*(pp_orig) = aligned_alloc(align, size)) -# define KALDI_MEMALIGN_FREE(x) free(x) -#else -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (!posix_memalign(pp_orig, align, size) ? *(pp_orig) : NULL) -# define KALDI_MEMALIGN_FREE(x) free(x) -#endif - -#ifdef __ICC -#pragma warning(disable: 383) // ICPC remark we don't want. -#pragma warning(disable: 810) // ICPC remark we don't want. -#pragma warning(disable: 981) // ICPC remark we don't want. 
-#pragma warning(disable: 1418) // ICPC remark we don't want. -#pragma warning(disable: 444) // ICPC remark we don't want. -#pragma warning(disable: 869) // ICPC remark we don't want. -#pragma warning(disable: 1287) // ICPC remark we don't want. -#pragma warning(disable: 279) // ICPC remark we don't want. -#pragma warning(disable: 981) // ICPC remark we don't want. -#endif - - -namespace kaldi { - - -// CharToString prints the character in a human-readable form, for debugging. -std::string CharToString(const char &c); - - -inline int MachineIsLittleEndian() { - int check = 1; - return (*reinterpret_cast(&check) != 0); -} - -// This function kaldi::Sleep() provides a portable way -// to sleep for a possibly fractional -// number of seconds. On Windows it's only accurate to microseconds. -void Sleep(float seconds); -} // namespace kaldi - -#define KALDI_SWAP8(a) do { \ - int t = (reinterpret_cast(&a))[0];\ - (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[7];\ - (reinterpret_cast(&a))[7] = t;\ - t = (reinterpret_cast(&a))[1];\ - (reinterpret_cast(&a))[1]=(reinterpret_cast(&a))[6];\ - (reinterpret_cast(&a))[6] = t;\ - t = (reinterpret_cast(&a))[2];\ - (reinterpret_cast(&a))[2]=(reinterpret_cast(&a))[5];\ - (reinterpret_cast(&a))[5] = t;\ - t = (reinterpret_cast(&a))[3];\ - (reinterpret_cast(&a))[3]=(reinterpret_cast(&a))[4];\ - (reinterpret_cast(&a))[4] = t;} while (0) -#define KALDI_SWAP4(a) do { \ - int t = (reinterpret_cast(&a))[0];\ - (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[3];\ - (reinterpret_cast(&a))[3] = t;\ - t = (reinterpret_cast(&a))[1];\ - (reinterpret_cast(&a))[1]=(reinterpret_cast(&a))[2];\ - (reinterpret_cast(&a))[2]=t;} while (0) -#define KALDI_SWAP2(a) do { \ - int t = (reinterpret_cast(&a))[0];\ - (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[1];\ - (reinterpret_cast(&a))[1] = t;} while (0) - - -// Makes copy constructor and operator= private. 
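The `KALDI_SWAP{2,4,8}` macros above byte-reverse values in place so that data written on a machine of one endianness can be read on the other, with `MachineIsLittleEndian()` supplying the runtime check. A self-contained equivalent for the 4-byte case (the `Demo` names are editorial, not part of the sources):

```cpp
#include <cstdint>
#include <cstdio>

bool MachineIsLittleEndianDemo() {
  int check = 1;
  return *reinterpret_cast<char *>(&check) != 0;  // low byte first => little-endian
}

uint32_t Swap4Demo(uint32_t v) {
  // Reverse the four bytes, mirroring what KALDI_SWAP4 does in place.
  return (v >> 24) | ((v >> 8) & 0x0000FF00u) |
         ((v << 8) & 0x00FF0000u) | (v << 24);
}

int main() {
  std::printf("little-endian host: %d\n", MachineIsLittleEndianDemo());
  std::printf("0x%08X -> 0x%08X\n", 0x12345678u, Swap4Demo(0x12345678u));  // 0x78563412
  return 0;
}
```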
-#define KALDI_DISALLOW_COPY_AND_ASSIGN(type) \ - type(const type&); \ - void operator = (const type&) - -template class KaldiCompileTimeAssert { }; -template<> class KaldiCompileTimeAssert { - public: - static inline void Check() { } -}; - -#define KALDI_COMPILE_TIME_ASSERT(b) KaldiCompileTimeAssert<(b)>::Check() - -#define KALDI_ASSERT_IS_INTEGER_TYPE(I) \ - KaldiCompileTimeAssert::is_specialized \ - && std::numeric_limits::is_integer>::Check() - -#define KALDI_ASSERT_IS_FLOATING_TYPE(F) \ - KaldiCompileTimeAssert::is_specialized \ - && !std::numeric_limits::is_integer>::Check() - -#if defined(_MSC_VER) -#define KALDI_STRCASECMP _stricmp -#elif defined(__CYGWIN__) -#include -#define KALDI_STRCASECMP strcasecmp -#else -#define KALDI_STRCASECMP strcasecmp -#endif -#ifdef _MSC_VER -# define KALDI_STRTOLL(cur_cstr, end_cstr) _strtoi64(cur_cstr, end_cstr, 10); -#else -# define KALDI_STRTOLL(cur_cstr, end_cstr) strtoll(cur_cstr, end_cstr, 10); -#endif - -#endif // KALDI_BASE_KALDI_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/decoder/lattice-faster-decoder.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/decoder/lattice-faster-decoder.cc deleted file mode 100644 index 06f77557fa49a23f6a44d07c327a1b3b081c6dec..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/decoder/lattice-faster-decoder.cc +++ /dev/null @@ -1,1101 +0,0 @@ -// decoder/lattice-faster-decoder.cc - -// Copyright 2009-2012 Microsoft Corporation Mirko Hannemann -// 2013-2018 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2018 Zhehuai Chen -// 2021 Binbin Zhang, Zhendong Peng - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "decoder/lattice-faster-decoder.h" -// #include "lat/lattice-functions.h" - -namespace kaldi { - -// instantiate this class once for each thing you have to decode. -template -LatticeFasterDecoderTpl::LatticeFasterDecoderTpl( - const FST &fst, const LatticeFasterDecoderConfig &config, - const std::shared_ptr &context_graph) - : fst_(&fst), - delete_fst_(false), - config_(config), - num_toks_(0), - context_graph_(context_graph) { - config.Check(); - toks_.SetSize( - 1000); // just so on the first frame we do something reasonable. -} - -template -LatticeFasterDecoderTpl::LatticeFasterDecoderTpl( - const LatticeFasterDecoderConfig &config, FST *fst) - : fst_(fst), delete_fst_(true), config_(config), num_toks_(0) { - config.Check(); - toks_.SetSize( - 1000); // just so on the first frame we do something reasonable. 
-} - -template -LatticeFasterDecoderTpl::~LatticeFasterDecoderTpl() { - DeleteElems(toks_.Clear()); - ClearActiveTokens(); - if (delete_fst_) delete fst_; -} - -template -void LatticeFasterDecoderTpl::InitDecoding() { - // clean up from last time: - DeleteElems(toks_.Clear()); - cost_offsets_.clear(); - ClearActiveTokens(); - warned_ = false; - num_toks_ = 0; - decoding_finalized_ = false; - final_costs_.clear(); - StateId start_state = fst_->Start(); - KALDI_ASSERT(start_state != fst::kNoStateId); - active_toks_.resize(1); - Token *start_tok = new Token(0.0, 0.0, NULL, NULL, NULL); - active_toks_[0].toks = start_tok; - toks_.Insert(start_state, start_tok); - num_toks_++; - ProcessNonemitting(config_.beam); -} - -// Returns true if any kind of traceback is available (not necessarily from -// a final state). It should only very rarely return false; this indicates -// an unusual search error. -template -bool LatticeFasterDecoderTpl::Decode( - DecodableInterface *decodable) { - InitDecoding(); - // We use 1-based indexing for frames in this decoder (if you view it in - // terms of features), but note that the decodable object uses zero-based - // numbering, which we have to correct for when we call it. - AdvanceDecoding(decodable); - FinalizeDecoding(); - - // Returns true if we have any kind of traceback available (not necessarily - // to the end state; query ReachedFinal() for that). - return !active_toks_.empty() && active_toks_.back().toks != NULL; -} - -// Outputs an FST corresponding to the single best path through the lattice. -template -bool LatticeFasterDecoderTpl::GetBestPath( - Lattice *olat, bool use_final_probs) const { - Lattice raw_lat; - GetRawLattice(&raw_lat, use_final_probs); - ShortestPath(raw_lat, olat); - return (olat->NumStates() != 0); -} - -// Outputs an FST corresponding to the raw, state-level lattice -template -bool LatticeFasterDecoderTpl::GetRawLattice( - Lattice *ofst, bool use_final_probs) const { - typedef LatticeArc Arc; - typedef Arc::StateId StateId; - typedef Arc::Weight Weight; - typedef Arc::Label Label; - - // Note: you can't use the old interface (Decode()) if you want to - // get the lattice with use_final_probs = false. You'd have to do - // InitDecoding() and then AdvanceDecoding(). - if (decoding_finalized_ && !use_final_probs) - KALDI_ERR << "You cannot call FinalizeDecoding() and then call " - << "GetRawLattice() with use_final_probs == false"; - - unordered_map final_costs_local; - - const unordered_map &final_costs = - (decoding_finalized_ ? final_costs_ : final_costs_local); - if (!decoding_finalized_ && use_final_probs) - ComputeFinalCosts(&final_costs_local, NULL, NULL); - - ofst->DeleteStates(); - // num-frames plus one (since frames are one-based, and we have - // an extra frame for the start-state). - int32 num_frames = active_toks_.size() - 1; - KALDI_ASSERT(num_frames > 0); - const int32 bucket_count = num_toks_ / 2 + 3; - unordered_map tok_map(bucket_count); - // First create all states. - std::vector token_list; - for (int32 f = 0; f <= num_frames; f++) { - if (active_toks_[f].toks == NULL) { - KALDI_WARN << "GetRawLattice: no tokens active on frame " << f - << ": not producing lattice.\n"; - return false; - } - TopSortTokens(active_toks_[f].toks, &token_list); - for (size_t i = 0; i < token_list.size(); i++) - if (token_list[i] != NULL) tok_map[token_list[i]] = ofst->AddState(); - } - // The next statement sets the start state of the output FST. Because we - // topologically sorted the tokens, state zero must be the start-state. 
- ofst->SetStart(0); - - KALDI_VLOG(4) << "init:" << num_toks_ / 2 + 3 - << " buckets:" << tok_map.bucket_count() - << " load:" << tok_map.load_factor() - << " max:" << tok_map.max_load_factor(); - // Now create all arcs. - for (int32 f = 0; f <= num_frames; f++) { - for (Token *tok = active_toks_[f].toks; tok != NULL; tok = tok->next) { - StateId cur_state = tok_map[tok]; - for (ForwardLinkT *l = tok->links; l != NULL; l = l->next) { - typename unordered_map::const_iterator iter = - tok_map.find(l->next_tok); - StateId nextstate = iter->second; - KALDI_ASSERT(iter != tok_map.end()); - BaseFloat cost_offset = 0.0; - if (l->ilabel != 0) { // emitting.. - KALDI_ASSERT(f >= 0 && f < cost_offsets_.size()); - cost_offset = cost_offsets_[f]; - } - - StateId state = cur_state; - if (l->is_start_boundary) { - StateId tmp = ofst->AddState(); - Arc arc(0, context_graph_->start_tag_id(), Weight(0, 0), tmp); - ofst->AddArc(state, arc); - state = tmp; - } - if (l->is_end_boundary) { - StateId tmp = ofst->AddState(); - Arc arc(0, context_graph_->end_tag_id(), Weight(0, 0), nextstate); - ofst->AddArc(tmp, arc); - nextstate = tmp; - } - - Arc arc(l->ilabel, l->olabel, - Weight(l->graph_cost, l->acoustic_cost - cost_offset), - nextstate); - ofst->AddArc(state, arc); - } - if (f == num_frames) { - if (use_final_probs && !final_costs.empty()) { - typename unordered_map::const_iterator iter = - final_costs.find(tok); - if (iter != final_costs.end()) - ofst->SetFinal(cur_state, LatticeWeight(iter->second, 0)); - } else { - ofst->SetFinal(cur_state, LatticeWeight::One()); - } - } - } - } - - fst::TopSort(ofst); - return (ofst->NumStates() > 0); -} - -// This function is now deprecated, since now we do determinization from outside -// the LatticeFasterDecoder class. Outputs an FST corresponding to the -// lattice-determinized lattice (one path per word sequence). -template -bool LatticeFasterDecoderTpl::GetLattice( - CompactLattice *ofst, bool use_final_probs) const { - Lattice raw_fst; - GetRawLattice(&raw_fst, use_final_probs); - Invert(&raw_fst); // make it so word labels are on the input. - // (in phase where we get backward-costs). - fst::ILabelCompare ilabel_comp; - ArcSort(&raw_fst, ilabel_comp); // sort on ilabel; makes - // lattice-determinization more efficient. - - fst::DeterminizeLatticePrunedOptions lat_opts; - lat_opts.max_mem = config_.det_opts.max_mem; - - DeterminizeLatticePruned(raw_fst, config_.lattice_beam, ofst, lat_opts); - raw_fst.DeleteStates(); // Free memory-- raw_fst no longer needed. - Connect(ofst); // Remove unreachable states... there might be - // a small number of these, in some cases. - // Note: if something went wrong and the raw lattice was empty, - // we should still get to this point in the code without warnings or failures. - return (ofst->NumStates() != 0); -} - -template -void LatticeFasterDecoderTpl::PossiblyResizeHash(size_t num_toks) { - size_t new_sz = static_cast(static_cast(num_toks) * - config_.hash_ratio); - if (new_sz > toks_.Size()) { - toks_.SetSize(new_sz); - } -} - -/* - A note on the definition of extra_cost. - - extra_cost is used in pruning tokens, to save memory. - - extra_cost can be thought of as a beta (backward) cost assuming - we had set the betas on currently-active tokens to all be the negative - of the alphas for those tokens. (So all currently active tokens would - be on (tied) best paths). - - We can use the extra_cost to accurately prune away tokens that we know will - never appear in the lattice. 
If the extra_cost is greater than the desired - lattice beam, the token would provably never appear in the lattice, so we can - prune away the token. - - (Note: we don't update all the extra_costs every time we update a frame; we - only do it every 'config_.prune_interval' frames). - */ - -// FindOrAddToken either locates a token in hash of toks_, -// or if necessary inserts a new, empty token (i.e. with no forward links) -// for the current frame. [note: it's inserted if necessary into hash toks_ -// and also into the singly linked list of tokens active on this frame -// (whose head is at active_toks_[frame]). -template -inline typename LatticeFasterDecoderTpl::Elem * -LatticeFasterDecoderTpl::FindOrAddToken(StateId state, - int32 frame_plus_one, - BaseFloat tot_cost, - Token *backpointer, - bool *changed) { - // Returns the Token pointer. Sets "changed" (if non-NULL) to true - // if the token was newly created or the cost changed. - KALDI_ASSERT(frame_plus_one < active_toks_.size()); - Token *&toks = active_toks_[frame_plus_one].toks; - Elem *e_found = toks_.Insert(state, NULL); - if (e_found->val == NULL) { // no such token presently. - const BaseFloat extra_cost = 0.0; - // tokens on the currently final frame have zero extra_cost - // as any of them could end up - // on the winning path. - Token *new_tok = new Token(tot_cost, extra_cost, NULL, toks, backpointer); - // NULL: no forward links yet - toks = new_tok; - num_toks_++; - e_found->val = new_tok; - if (changed) *changed = true; - return e_found; - } else { - Token *tok = e_found->val; // There is an existing Token for this state. - if (tok->tot_cost > tot_cost) { // replace old token - tok->tot_cost = tot_cost; - // SetBackpointer() just does tok->backpointer = backpointer in - // the case where Token == BackpointerToken, else nothing. - tok->SetBackpointer(backpointer); - // we don't allocate a new token, the old stays linked in active_toks_ - // we only replace the tot_cost - // in the current frame, there are no forward links (and no extra_cost) - // only in ProcessNonemitting we have to delete forward links - // in case we visit a state for the second time - // those forward links, that lead to this replaced token before: - // they remain and will hopefully be pruned later (PruneForwardLinks...) - if (changed) *changed = true; - } else { - if (changed) *changed = false; - } - return e_found; - } -} - -// prunes outgoing links for all tokens in active_toks_[frame] -// it's called by PruneActiveTokens -// all links, that have link_extra_cost > lattice_beam are pruned -template -void LatticeFasterDecoderTpl::PruneForwardLinks( - int32 frame_plus_one, bool *extra_costs_changed, bool *links_pruned, - BaseFloat delta) { - // delta is the amount by which the extra_costs must change - // If delta is larger, we'll tend to go back less far - // toward the beginning of the file. - // extra_costs_changed is set to true if extra_cost was changed for any token - // links_pruned is set to true if any link in any token was pruned - - *extra_costs_changed = false; - *links_pruned = false; - KALDI_ASSERT(frame_plus_one >= 0 && frame_plus_one < active_toks_.size()); - if (active_toks_[frame_plus_one].toks == - NULL) { // empty list; should not happen. - if (!warned_) { - KALDI_WARN << "No tokens alive [doing pruning].. warning first " - "time only for each utterance\n"; - warned_ = true; - } - } - - // We have to iterate until there is no more change, because the links - // are not guaranteed to be in topological order. 
- bool changed = true; // difference new minus old extra cost >= delta ? - while (changed) { - changed = false; - for (Token *tok = active_toks_[frame_plus_one].toks; tok != NULL; - tok = tok->next) { - ForwardLinkT *link, *prev_link = NULL; - // will recompute tok_extra_cost for tok. - BaseFloat tok_extra_cost = std::numeric_limits::infinity(); - // tok_extra_cost is the best (min) of link_extra_cost of outgoing links - for (link = tok->links; link != NULL;) { - // See if we need to excise this link... - Token *next_tok = link->next_tok; - BaseFloat link_extra_cost = - next_tok->extra_cost + - ((tok->tot_cost + link->acoustic_cost + link->graph_cost) - - next_tok->tot_cost); // difference in brackets is >= 0 - // link_exta_cost is the difference in score between the best paths - // through link source state and through link destination state - KALDI_ASSERT(link_extra_cost == link_extra_cost); // check for NaN - // the graph_cost contatins the context score - // if it's the score of the backoff arc, it should be removed. - if (link->context_score < 0) { - link_extra_cost += link->context_score; - } - if (link_extra_cost > config_.lattice_beam) { // excise link - ForwardLinkT *next_link = link->next; - if (prev_link != NULL) - prev_link->next = next_link; - else - tok->links = next_link; - delete link; - link = next_link; // advance link but leave prev_link the same. - *links_pruned = true; - } else { // keep the link and update the tok_extra_cost if needed. - if (link_extra_cost < 0.0) { // this is just a precaution. - // if (link_extra_cost < -0.01) - // KALDI_WARN << "Negative extra_cost: " << link_extra_cost; - link_extra_cost = 0.0; - } - if (link_extra_cost < tok_extra_cost) - tok_extra_cost = link_extra_cost; - prev_link = link; // move to next link - link = link->next; - } - } // for all outgoing links - if (fabs(tok_extra_cost - tok->extra_cost) > delta) - changed = true; // difference new minus old is bigger than delta - tok->extra_cost = tok_extra_cost; - // will be +infinity or <= lattice_beam_. - // infinity indicates, that no forward link survived pruning - } // for all Token on active_toks_[frame] - if (changed) *extra_costs_changed = true; - - // Note: it's theoretically possible that aggressive compiler - // optimizations could cause an infinite loop here for small delta and - // high-dynamic-range scores. - } // while changed -} - -// PruneForwardLinksFinal is a version of PruneForwardLinks that we call -// on the final frame. If there are final tokens active, it uses -// the final-probs for pruning, otherwise it treats all tokens as final. -template -void LatticeFasterDecoderTpl::PruneForwardLinksFinal() { - KALDI_ASSERT(!active_toks_.empty()); - int32 frame_plus_one = active_toks_.size() - 1; - - if (active_toks_[frame_plus_one].toks == - NULL) // empty list; should not happen. - KALDI_WARN << "No tokens alive at end of file"; - - typedef typename unordered_map::const_iterator IterType; - ComputeFinalCosts(&final_costs_, &final_relative_cost_, &final_best_cost_); - decoding_finalized_ = true; - // We call DeleteElems() as a nicety, not because it's really necessary; - // otherwise there would be a time, after calling PruneTokensForFrame() on the - // final frame, when toks_.GetList() or toks_.Clear() would contain pointers - // to nonexistent tokens. - DeleteElems(toks_.Clear()); - - // Now go through tokens on this frame, pruning forward links... may have to - // iterate a few times until there is no more change, because the list is not - // in topological order. 
This is a modified version of the code in - // PruneForwardLinks, but here we also take account of the final-probs. - bool changed = true; - BaseFloat delta = 1.0e-05; - while (changed) { - changed = false; - for (Token *tok = active_toks_[frame_plus_one].toks; tok != NULL; - tok = tok->next) { - ForwardLinkT *link, *prev_link = NULL; - // will recompute tok_extra_cost. It has a term in it that corresponds - // to the "final-prob", so instead of initializing tok_extra_cost to - // infinity below we set it to the difference between the - // (score+final_prob) of this token, and the best such (score+final_prob). - BaseFloat final_cost; - if (final_costs_.empty()) { - final_cost = 0.0; - } else { - IterType iter = final_costs_.find(tok); - if (iter != final_costs_.end()) - final_cost = iter->second; - else - final_cost = std::numeric_limits::infinity(); - } - BaseFloat tok_extra_cost = tok->tot_cost + final_cost - final_best_cost_; - // tok_extra_cost will be a "min" over either directly being final, or - // being indirectly final through other links, and the loop below may - // decrease its value: - for (link = tok->links; link != NULL;) { - // See if we need to excise this link... - Token *next_tok = link->next_tok; - BaseFloat link_extra_cost = - next_tok->extra_cost + - ((tok->tot_cost + link->acoustic_cost + link->graph_cost) - - next_tok->tot_cost); - if (link_extra_cost > config_.lattice_beam) { // excise link - ForwardLinkT *next_link = link->next; - if (prev_link != NULL) - prev_link->next = next_link; - else - tok->links = next_link; - delete link; - link = next_link; // advance link but leave prev_link the same. - } else { // keep the link and update the tok_extra_cost if needed. - if (link_extra_cost < 0.0) { // this is just a precaution. - // if (link_extra_cost < -0.01) - // KALDI_WARN << "Negative extra_cost: " << link_extra_cost; - link_extra_cost = 0.0; - } - if (link_extra_cost < tok_extra_cost) - tok_extra_cost = link_extra_cost; - prev_link = link; - link = link->next; - } - } - // prune away tokens worse than lattice_beam above best path. This step - // was not necessary in the non-final case because then, this case - // showed up as having no forward links. Here, the tok_extra_cost has - // an extra component relating to the final-prob. - if (tok_extra_cost > config_.lattice_beam) - tok_extra_cost = std::numeric_limits::infinity(); - // to be pruned in PruneTokensForFrame - - if (!ApproxEqual(tok->extra_cost, tok_extra_cost, delta)) changed = true; - tok->extra_cost = - tok_extra_cost; // will be +infinity or <= lattice_beam_. - } - } // while changed -} - -template -BaseFloat LatticeFasterDecoderTpl::FinalRelativeCost() const { - if (!decoding_finalized_) { - BaseFloat relative_cost; - ComputeFinalCosts(NULL, &relative_cost, NULL); - return relative_cost; - } else { - // we're not allowed to call that function if FinalizeDecoding() has - // been called; return a cached value. - return final_relative_cost_; - } -} - -// Prune away any tokens on this frame that have no forward links. -// [we don't do this in PruneForwardLinks because it would give us -// a problem with dangling pointers]. 
-// It's called by PruneActiveTokens if any forward links have been pruned -template -void LatticeFasterDecoderTpl::PruneTokensForFrame( - int32 frame_plus_one) { - KALDI_ASSERT(frame_plus_one >= 0 && frame_plus_one < active_toks_.size()); - Token *&toks = active_toks_[frame_plus_one].toks; - if (toks == NULL) KALDI_WARN << "No tokens alive [doing pruning]"; - Token *tok, *next_tok, *prev_tok = NULL; - for (tok = toks; tok != NULL; tok = next_tok) { - next_tok = tok->next; - if (tok->extra_cost == std::numeric_limits::infinity()) { - // token is unreachable from end of graph; (no forward links survived) - // excise tok from list and delete tok. - if (prev_tok != NULL) - prev_tok->next = tok->next; - else - toks = tok->next; - delete tok; - num_toks_--; - } else { // fetch next Token - prev_tok = tok; - } - } -} - -// Go backwards through still-alive tokens, pruning them, starting not from -// the current frame (where we want to keep all tokens) but from the frame -// before that. We go backwards through the frames and stop when we reach a -// point where the delta-costs are not changing (and the delta controls when we -// consider a cost to have "not changed"). -template -void LatticeFasterDecoderTpl::PruneActiveTokens(BaseFloat delta) { - int32 cur_frame_plus_one = NumFramesDecoded(); - int32 num_toks_begin = num_toks_; - // The index "f" below represents a "frame plus one", i.e. you'd have to - // subtract one to get the corresponding index for the decodable object. - for (int32 f = cur_frame_plus_one - 1; f >= 0; f--) { - // Reason why we need to prune forward links in this situation: - // (1) we have never pruned them (new TokenList) - // (2) we have not yet pruned the forward links to the next f, - // after any of those tokens have changed their extra_cost. 
- if (active_toks_[f].must_prune_forward_links) { - bool extra_costs_changed = false, links_pruned = false; - PruneForwardLinks(f, &extra_costs_changed, &links_pruned, delta); - if (extra_costs_changed && f > 0) // any token has changed extra_cost - active_toks_[f - 1].must_prune_forward_links = true; - if (links_pruned) // any link was pruned - active_toks_[f].must_prune_tokens = true; - active_toks_[f].must_prune_forward_links = false; // job done - } - if (f + 1 < cur_frame_plus_one && // except for last f (no forward links) - active_toks_[f + 1].must_prune_tokens) { - PruneTokensForFrame(f + 1); - active_toks_[f + 1].must_prune_tokens = false; - } - } - KALDI_VLOG(4) << "PruneActiveTokens: pruned tokens from " << num_toks_begin - << " to " << num_toks_; -} - -template -void LatticeFasterDecoderTpl::ComputeFinalCosts( - unordered_map *final_costs, - BaseFloat *final_relative_cost, BaseFloat *final_best_cost) const { - KALDI_ASSERT(!decoding_finalized_); - if (final_costs != NULL) final_costs->clear(); - const Elem *final_toks = toks_.GetList(); - BaseFloat infinity = std::numeric_limits::infinity(); - BaseFloat best_cost = infinity, best_cost_with_final = infinity; - - while (final_toks != NULL) { - StateId state = final_toks->key; - Token *tok = final_toks->val; - const Elem *next = final_toks->tail; - BaseFloat final_cost = fst_->Final(state).Value(); - BaseFloat cost = tok->tot_cost, cost_with_final = cost + final_cost; - best_cost = std::min(cost, best_cost); - best_cost_with_final = std::min(cost_with_final, best_cost_with_final); - if (final_costs != NULL && final_cost != infinity) - (*final_costs)[tok] = final_cost; - final_toks = next; - } - if (final_relative_cost != NULL) { - if (best_cost == infinity && best_cost_with_final == infinity) { - // Likely this will only happen if there are no tokens surviving. - // This seems the least bad way to handle it. - *final_relative_cost = infinity; - } else { - *final_relative_cost = best_cost_with_final - best_cost; - } - } - if (final_best_cost != NULL) { - if (best_cost_with_final != infinity) { // final-state exists. - *final_best_cost = best_cost_with_final; - } else { // no final-state exists. - *final_best_cost = best_cost; - } - } -} - -template -void LatticeFasterDecoderTpl::AdvanceDecoding( - DecodableInterface *decodable, int32 max_num_frames) { - if (std::is_same >::value) { - // if the type 'FST' is the FST base-class, then see if the FST type of fst_ - // is actually VectorFst or ConstFst. If so, call the AdvanceDecoding() - // function after casting *this to the more specific type. - if (fst_->Type() == "const") { - LatticeFasterDecoderTpl, Token> *this_cast = - reinterpret_cast< - LatticeFasterDecoderTpl, Token> *>( - this); - this_cast->AdvanceDecoding(decodable, max_num_frames); - return; - } else if (fst_->Type() == "vector") { - LatticeFasterDecoderTpl, Token> *this_cast = - reinterpret_cast< - LatticeFasterDecoderTpl, Token> *>( - this); - this_cast->AdvanceDecoding(decodable, max_num_frames); - return; - } - } - - KALDI_ASSERT(!active_toks_.empty() && !decoding_finalized_ && - "You must call InitDecoding() before AdvanceDecoding"); - int32 num_frames_ready = decodable->NumFramesReady(); - // num_frames_ready must be >= num_frames_decoded, or else - // the number of frames ready must have decreased (which doesn't - // make sense) or the decodable object changed between calls - // (which isn't allowed). 
- KALDI_ASSERT(num_frames_ready >= NumFramesDecoded()); - int32 target_frames_decoded = num_frames_ready; - if (max_num_frames >= 0) - target_frames_decoded = - std::min(target_frames_decoded, NumFramesDecoded() + max_num_frames); - while (NumFramesDecoded() < target_frames_decoded) { - if (NumFramesDecoded() % config_.prune_interval == 0) { - PruneActiveTokens(config_.lattice_beam * config_.prune_scale); - } - BaseFloat cost_cutoff = ProcessEmitting(decodable); - ProcessNonemitting(cost_cutoff); - } -} - -// FinalizeDecoding() is a version of PruneActiveTokens that we call -// (optionally) on the final frame. Takes into account the final-prob of -// tokens. This function used to be called PruneActiveTokensFinal(). -template -void LatticeFasterDecoderTpl::FinalizeDecoding() { - int32 final_frame_plus_one = NumFramesDecoded(); - int32 num_toks_begin = num_toks_; - // PruneForwardLinksFinal() prunes final frame (with final-probs), and - // sets decoding_finalized_. - PruneForwardLinksFinal(); - for (int32 f = final_frame_plus_one - 1; f >= 0; f--) { - bool b1, b2; // values not used. - BaseFloat dontcare = 0.0; // delta of zero means we must always update - PruneForwardLinks(f, &b1, &b2, dontcare); - PruneTokensForFrame(f + 1); - } - PruneTokensForFrame(0); - KALDI_VLOG(4) << "pruned tokens from " << num_toks_begin << " to " - << num_toks_; -} - -/// Gets the weight cutoff. Also counts the active tokens. -template -BaseFloat LatticeFasterDecoderTpl::GetCutoff( - Elem *list_head, size_t *tok_count, BaseFloat *adaptive_beam, - Elem **best_elem) { - BaseFloat best_weight = std::numeric_limits::infinity(); - // positive == high cost == bad. - size_t count = 0; - if (config_.max_active == std::numeric_limits::max() && - config_.min_active == 0) { - for (Elem *e = list_head; e != NULL; e = e->tail, count++) { - BaseFloat w = static_cast(e->val->tot_cost); - if (w < best_weight) { - best_weight = w; - if (best_elem) *best_elem = e; - } - } - if (tok_count != NULL) *tok_count = count; - if (adaptive_beam != NULL) *adaptive_beam = config_.beam; - return best_weight + config_.beam; - } else { - tmp_array_.clear(); - for (Elem *e = list_head; e != NULL; e = e->tail, count++) { - BaseFloat w = e->val->tot_cost; - tmp_array_.push_back(w); - if (w < best_weight) { - best_weight = w; - if (best_elem) *best_elem = e; - } - } - if (tok_count != NULL) *tok_count = count; - - BaseFloat beam_cutoff = best_weight + config_.beam, - min_active_cutoff = std::numeric_limits::infinity(), - max_active_cutoff = std::numeric_limits::infinity(); - - KALDI_VLOG(6) << "Number of tokens active on frame " << NumFramesDecoded() - << " is " << tmp_array_.size(); - - if (tmp_array_.size() > static_cast(config_.max_active)) { - std::nth_element(tmp_array_.begin(), - tmp_array_.begin() + config_.max_active, - tmp_array_.end()); - max_active_cutoff = tmp_array_[config_.max_active]; - } - if (max_active_cutoff < beam_cutoff) { // max_active is tighter than beam. - if (adaptive_beam) - *adaptive_beam = max_active_cutoff - best_weight + config_.beam_delta; - return max_active_cutoff; - } - if (tmp_array_.size() > static_cast(config_.min_active)) { - if (config_.min_active == 0) { - min_active_cutoff = best_weight; - } else { - std::nth_element( - tmp_array_.begin(), tmp_array_.begin() + config_.min_active, - tmp_array_.size() > static_cast(config_.max_active) - ? 
tmp_array_.begin() + config_.max_active - : tmp_array_.end()); - min_active_cutoff = tmp_array_[config_.min_active]; - } - } - if (min_active_cutoff > beam_cutoff) { // min_active is looser than beam. - if (adaptive_beam) - *adaptive_beam = min_active_cutoff - best_weight + config_.beam_delta; - return min_active_cutoff; - } else { - *adaptive_beam = config_.beam; - return beam_cutoff; - } - } -} - -template -BaseFloat LatticeFasterDecoderTpl::ProcessEmitting( - DecodableInterface *decodable) { - KALDI_ASSERT(active_toks_.size() > 0); - int32 frame = - active_toks_.size() - 1; // frame is the frame-index - // (zero-based) used to get likelihoods - // from the decodable object. - active_toks_.resize(active_toks_.size() + 1); - - Elem *final_toks = - toks_.Clear(); // analogous to swapping prev_toks_ / cur_toks_ - // in simple-decoder.h. Removes the Elems from - // being indexed in the hash in toks_. - Elem *best_elem = NULL; - BaseFloat adaptive_beam; - size_t tok_cnt; - BaseFloat cur_cutoff = - GetCutoff(final_toks, &tok_cnt, &adaptive_beam, &best_elem); - KALDI_VLOG(6) << "Adaptive beam on frame " << NumFramesDecoded() << " is " - << adaptive_beam; - - PossiblyResizeHash( - tok_cnt); // This makes sure the hash is always big enough. - - BaseFloat next_cutoff = std::numeric_limits::infinity(); - // pruning "online" before having seen all tokens - - BaseFloat cost_offset = 0.0; // Used to keep probabilities in a good - // dynamic range. - - // First process the best token to get a hopefully - // reasonably tight bound on the next cutoff. The only - // products of the next block are "next_cutoff" and "cost_offset". - if (best_elem) { - StateId state = best_elem->key; - Token *tok = best_elem->val; - cost_offset = -tok->tot_cost; - for (fst::ArcIterator aiter(*fst_, state); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - if (arc.ilabel != 0) { // propagate.. - BaseFloat new_weight = arc.weight.Value() + cost_offset - - decodable->LogLikelihood(frame, arc.ilabel) + - tok->tot_cost; - if (state != arc.nextstate) { - new_weight += config_.length_penalty; - } - if (new_weight + adaptive_beam < next_cutoff) - next_cutoff = new_weight + adaptive_beam; - } - } - } - - // Store the offset on the acoustic likelihoods that we're applying. - // Could just do cost_offsets_.push_back(cost_offset), but we - // do it this way as it's more robust to future code changes. - cost_offsets_.resize(frame + 1, 0.0); - cost_offsets_[frame] = cost_offset; - - // the tokens are now owned here, in final_toks, and the hash is empty. - // 'owned' is a complex thing here; the point is we need to call DeleteElem - // on each elem 'e' to let toks_ know we're done with them. - for (Elem *e = final_toks, *e_tail; e != NULL; e = e_tail) { - // loop this way because we delete "e" as we go. - StateId state = e->key; - Token *tok = e->val; - if (tok->tot_cost <= cur_cutoff) { - for (fst::ArcIterator aiter(*fst_, state); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - if (arc.ilabel != 0) { // propagate.. 
- BaseFloat ac_cost = cost_offset - - decodable->LogLikelihood(frame, arc.ilabel), - graph_cost = arc.weight.Value(); - if (state != arc.nextstate) { - graph_cost += config_.length_penalty; - } - BaseFloat cur_cost = tok->tot_cost, - tot_cost = cur_cost + ac_cost + graph_cost; - if (tot_cost >= next_cutoff) - continue; - else if (tot_cost + adaptive_beam < next_cutoff) - next_cutoff = - tot_cost + adaptive_beam; // prune by best current token - // Note: the frame indexes into active_toks_ are one-based, - // hence the + 1. - Elem *e_next = - FindOrAddToken(arc.nextstate, frame + 1, tot_cost, tok, NULL); - // NULL: no change indicator needed - - bool is_start_boundary = false; - bool is_end_boundary = false; - float context_score = 0; - if (context_graph_) { - if (arc.olabel == 0) { - e_next->val->context_state = tok->context_state; - } else { - e_next->val->context_state = context_graph_->GetNextState( - tok->context_state, arc.olabel, &context_score, - &is_start_boundary, &is_end_boundary); - graph_cost -= context_score; - } - } - // Add ForwardLink from tok to next_tok (put on head of list - // tok->links) - tok->links = new ForwardLinkT(e_next->val, arc.ilabel, arc.olabel, - graph_cost, ac_cost, is_start_boundary, - is_end_boundary, tok->links); - tok->links->context_score = context_score; - } - } // for all arcs - } - e_tail = e->tail; - toks_.Delete(e); // delete Elem - } - return next_cutoff; -} - -// static inline -template -void LatticeFasterDecoderTpl::DeleteForwardLinks(Token *tok) { - ForwardLinkT *l = tok->links, *m; - while (l != NULL) { - m = l->next; - delete l; - l = m; - } - tok->links = NULL; -} - -template -void LatticeFasterDecoderTpl::ProcessNonemitting(BaseFloat cutoff) { - KALDI_ASSERT(!active_toks_.empty()); - int32 frame = static_cast(active_toks_.size()) - 2; - // Note: "frame" is the time-index we just processed, or -1 if - // we are processing the nonemitting transitions before the - // first frame (called from InitDecoding()). - - // Processes nonemitting arcs for one frame. Propagates within toks_. - // Note-- this queue structure is not very optimal as - // it may cause us to process states unnecessarily (e.g. more than once), - // but in the baseline code, turning this vector into a set to fix this - // problem did not improve overall speed. - - KALDI_ASSERT(queue_.empty()); - - if (toks_.GetList() == NULL) { - if (!warned_) { - KALDI_WARN << "Error, no surviving tokens: frame is " << frame; - warned_ = true; - } - } - - int before = 0, after = 0; - for (const Elem *e = toks_.GetList(); e != NULL; e = e->tail) { - StateId state = e->key; - if (fst_->NumInputEpsilons(state) != 0) queue_.push_back(e); - ++before; - } - - while (!queue_.empty()) { - ++after; - const Elem *e = queue_.back(); - queue_.pop_back(); - - StateId state = e->key; - Token *tok = - e->val; // would segfault if e is a NULL pointer but this can't happen. - BaseFloat cur_cost = tok->tot_cost; - if (cur_cost >= cutoff) // Don't bother processing successors. - continue; - // If "tok" has any existing forward links, delete them, - // because we're about to regenerate them. This is a kind - // of non-optimality (remember, this is the simple decoder), - // but since most states are emitting it's not a huge issue. - DeleteForwardLinks(tok); // necessary when re-visiting - tok->links = NULL; - for (fst::ArcIterator aiter(*fst_, state); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - if (arc.ilabel == 0) { // propagate nonemitting only... 
- BaseFloat graph_cost = arc.weight.Value(), - tot_cost = cur_cost + graph_cost; - if (tot_cost < cutoff) { - bool changed; - - Elem *e_new = - FindOrAddToken(arc.nextstate, frame + 1, tot_cost, tok, &changed); - - bool is_start_boundary = false; - bool is_end_boundary = false; - float context_score = 0; - if (context_graph_) { - if (arc.olabel == 0) { - e_new->val->context_state = tok->context_state; - } else { - e_new->val->context_state = context_graph_->GetNextState( - tok->context_state, arc.olabel, &context_score, - &is_start_boundary, &is_end_boundary); - graph_cost -= context_score; - } - } - - tok->links = - new ForwardLinkT(e_new->val, 0, arc.olabel, graph_cost, 0, - is_start_boundary, is_end_boundary, tok->links); - tok->links->context_score = context_score; - - // "changed" tells us whether the new token has a different - // cost from before, or is new [if so, add into queue]. - if (changed && fst_->NumInputEpsilons(arc.nextstate) != 0) - queue_.push_back(e_new); - } - } - } // for all arcs - } // while queue not empty - KALDI_VLOG(3) << "ProcessNonemitting " << before << " " << after; -} - -template -void LatticeFasterDecoderTpl::DeleteElems(Elem *list) { - for (Elem *e = list, *e_tail; e != NULL; e = e_tail) { - e_tail = e->tail; - toks_.Delete(e); - } -} - -template -void LatticeFasterDecoderTpl< - FST, Token>::ClearActiveTokens() { // a cleanup routine, at utt end/begin - for (size_t i = 0; i < active_toks_.size(); i++) { - // Delete all tokens alive on this frame, and any forward - // links they may have. - for (Token *tok = active_toks_[i].toks; tok != NULL;) { - DeleteForwardLinks(tok); - Token *next_tok = tok->next; - delete tok; - num_toks_--; - tok = next_tok; - } - } - active_toks_.clear(); - KALDI_ASSERT(num_toks_ == 0); -} - -// static -template -void LatticeFasterDecoderTpl::TopSortTokens( - Token *tok_list, std::vector *topsorted_list) { - unordered_map token2pos; - using std::unordered_set; - typedef typename unordered_map::iterator IterType; - int32 num_toks = 0; - for (Token *tok = tok_list; tok != NULL; tok = tok->next) num_toks++; - int32 cur_pos = 0; - // We assign the tokens numbers num_toks - 1, ... , 2, 1, 0. - // This is likely to be in closer to topological order than - // if we had given them ascending order, because of the way - // new tokens are put at the front of the list. - for (Token *tok = tok_list; tok != NULL; tok = tok->next) - token2pos[tok] = num_toks - ++cur_pos; - - unordered_set reprocess; - - for (IterType iter = token2pos.begin(); iter != token2pos.end(); ++iter) { - Token *tok = iter->first; - int32 pos = iter->second; - for (ForwardLinkT *link = tok->links; link != NULL; link = link->next) { - if (link->ilabel == 0) { - // We only need to consider epsilon links, since non-epsilon links - // transition between frames and this function only needs to sort a list - // of tokens from a single frame. - IterType following_iter = token2pos.find(link->next_tok); - if (following_iter != token2pos.end()) { // another token on this - // frame, so must consider it. - int32 next_pos = following_iter->second; - if (next_pos < pos) { // reassign the position of the next Token. - following_iter->second = cur_pos++; - reprocess.insert(link->next_tok); - } - } - } - } - // In case we had previously assigned this token to be reprocessed, we can - // erase it from that set because it's "happy now" (we just processed it). - reprocess.erase(tok); - } - - size_t max_loop = 1000000, - loop_count; // max_loop is to detect epsilon cycles. 
- for (loop_count = 0; !reprocess.empty() && loop_count < max_loop; - ++loop_count) { - std::vector reprocess_vec; - for (typename unordered_set::iterator iter = reprocess.begin(); - iter != reprocess.end(); ++iter) - reprocess_vec.push_back(*iter); - reprocess.clear(); - for (typename std::vector::iterator iter = reprocess_vec.begin(); - iter != reprocess_vec.end(); ++iter) { - Token *tok = *iter; - int32 pos = token2pos[tok]; - // Repeat the processing we did above (for comments, see above). - for (ForwardLinkT *link = tok->links; link != NULL; link = link->next) { - if (link->ilabel == 0) { - IterType following_iter = token2pos.find(link->next_tok); - if (following_iter != token2pos.end()) { - int32 next_pos = following_iter->second; - if (next_pos < pos) { - following_iter->second = cur_pos++; - reprocess.insert(link->next_tok); - } - } - } - } - } - } - KALDI_ASSERT(loop_count < max_loop && - "Epsilon loops exist in your decoding " - "graph (this is not allowed!)"); - - topsorted_list->clear(); - topsorted_list->resize(cur_pos, - NULL); // create a list with NULLs in between. - for (IterType iter = token2pos.begin(); iter != token2pos.end(); ++iter) - (*topsorted_list)[iter->second] = iter->first; -} - -// Instantiate the template for the combination of token types and FST types -// that we'll need. -template class LatticeFasterDecoderTpl, - decoder::StdToken>; -template class LatticeFasterDecoderTpl, - decoder::StdToken>; -template class LatticeFasterDecoderTpl, - decoder::StdToken>; - -// template class LatticeFasterDecoderTpl; template class -// LatticeFasterDecoderTpl; - -template class LatticeFasterDecoderTpl, - decoder::BackpointerToken>; -template class LatticeFasterDecoderTpl, - decoder::BackpointerToken>; -template class LatticeFasterDecoderTpl, - decoder::BackpointerToken>; -// template class LatticeFasterDecoderTpl; template class -// LatticeFasterDecoderTpl; - -} // end namespace kaldi. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/decoder/lattice-faster-decoder.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/decoder/lattice-faster-decoder.h deleted file mode 100644 index 0152b85447e354b770745b748d266b1ca2d57024..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/decoder/lattice-faster-decoder.h +++ /dev/null @@ -1,558 +0,0 @@ -// decoder/lattice-faster-decoder.h - -// Copyright 2009-2013 Microsoft Corporation; Mirko Hannemann; -// 2013-2014 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2018 Zhehuai Chen -// 2021 Binbin Zhang, Zhendong Peng - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef KALDI_DECODER_LATTICE_FASTER_DECODER_H_ -#define KALDI_DECODER_LATTICE_FASTER_DECODER_H_ - -#include -#include -#include -#include - -#include "base/kaldi-common.h" -#include "decoder/context_graph.h" -#include "fst/fstlib.h" -#include "fstext/fstext-lib.h" -#include "itf/decodable-itf.h" -#include "lat/determinize-lattice-pruned.h" -#include "lat/kaldi-lattice.h" -#include "util/hash-list.h" - -namespace kaldi { - -struct LatticeFasterDecoderConfig { - BaseFloat beam; - int32 max_active; - int32 min_active; - BaseFloat lattice_beam; - int32 prune_interval; - bool determinize_lattice; // not inspected by this class... used in - // command-line program. - BaseFloat beam_delta; - BaseFloat hash_ratio; - // Note: we don't make prune_scale configurable on the command line, it's not - // a very important parameter. It affects the algorithm that prunes the - // tokens as we go. - BaseFloat prune_scale; - BaseFloat length_penalty; // for balancing the del/ins ratio, suggested -3.0 - - // Most of the options inside det_opts are not actually queried by the - // LatticeFasterDecoder class itself, but by the code that calls it, for - // example in the function DecodeUtteranceLatticeFaster. - fst::DeterminizeLatticePhonePrunedOptions det_opts; - - LatticeFasterDecoderConfig() - : beam(16.0), - max_active(std::numeric_limits::max()), - min_active(200), - lattice_beam(10.0), - prune_interval(25), - determinize_lattice(true), - beam_delta(0.5), - hash_ratio(2.0), - prune_scale(0.1), - length_penalty(0.0) {} - void Register(OptionsItf *opts) { - det_opts.Register(opts); - opts->Register("beam", &beam, - "Decoding beam. Larger->slower, more accurate."); - opts->Register("max-active", &max_active, - "Decoder max active states. Larger->slower; " - "more accurate"); - opts->Register("min-active", &min_active, - "Decoder minimum #active states."); - opts->Register("lattice-beam", &lattice_beam, - "Lattice generation beam. Larger->slower, " - "and deeper lattices"); - opts->Register("prune-interval", &prune_interval, - "Interval (in frames) at " - "which to prune tokens"); - opts->Register( - "determinize-lattice", &determinize_lattice, - "If true, " - "determinize the lattice (lattice-determinization, keeping only " - "best pdf-sequence for each word-sequence)."); - opts->Register( - "beam-delta", &beam_delta, - "Increment used in decoding-- this " - "parameter is obscure and relates to a speedup in the way the " - "max-active constraint is applied. Larger is more accurate."); - opts->Register("hash-ratio", &hash_ratio, - "Setting used in decoder to " - "control hash behavior"); - } - void Check() const { - KALDI_ASSERT(beam > 0.0 && max_active > 1 && lattice_beam > 0.0 && - min_active <= max_active && prune_interval > 0 && - beam_delta > 0.0 && hash_ratio >= 1.0 && prune_scale > 0.0 && - prune_scale < 1.0); - } -}; - -namespace decoder { -// We will template the decoder on the token type as well as the FST type; this -// is a mechanism so that we can use the same underlying decoder code for -// versions of the decoder that support quickly getting the best path -// (LatticeFasterOnlineDecoder, see lattice-faster-online-decoder.h) and also -// those that do not (LatticeFasterDecoder). - -// ForwardLinks are the links from a token to a token on the next frame. -// or sometimes on the current frame (for input-epsilon links). 
-template -struct ForwardLink { - using Label = fst::StdArc::Label; - - Token *next_tok; // the next token [or NULL if represents final-state] - Label ilabel; // ilabel on arc - Label olabel; // olabel on arc - BaseFloat graph_cost; // graph cost of traversing arc (contains LM, etc.) - BaseFloat acoustic_cost; // acoustic cost (pre-scaled) of traversing arc - bool is_start_boundary; - bool is_end_boundary; - float context_score; - ForwardLink *next; // next in singly-linked list of forward arcs (arcs - // in the state-level lattice) from a token. - inline ForwardLink(Token *next_tok, Label ilabel, Label olabel, - BaseFloat graph_cost, BaseFloat acoustic_cost, - bool is_start_boundary, bool is_end_boundary, - ForwardLink *next) - : next_tok(next_tok), - ilabel(ilabel), - olabel(olabel), - graph_cost(graph_cost), - acoustic_cost(acoustic_cost), - is_start_boundary(is_start_boundary), - is_end_boundary(is_end_boundary), - context_score(0), - next(next) {} -}; - -struct StdToken { - using ForwardLinkT = ForwardLink; - using Token = StdToken; - - // Standard token type for LatticeFasterDecoder. Each active HCLG - // (decoding-graph) state on each frame has one token. - - // tot_cost is the total (LM + acoustic) cost from the beginning of the - // utterance up to this point. (but see cost_offset_, which is subtracted - // to keep it in a good numerical range). - BaseFloat tot_cost; - - // exta_cost is >= 0. After calling PruneForwardLinks, this equals the - // minimum difference between the cost of the best path that this link is a - // part of, and the cost of the absolute best path, under the assumption that - // any of the currently active states at the decoding front may eventually - // succeed (e.g. if you were to take the currently active states one by one - // and compute this difference, and then take the minimum). - BaseFloat extra_cost; - - int context_state = 0; - - // 'links' is the head of singly-linked list of ForwardLinks, which is what we - // use for lattice generation. - ForwardLinkT *links; - - // 'next' is the next in the singly-linked list of tokens for this frame. - Token *next; - - // This function does nothing and should be optimized out; it's needed - // so we can share the regular LatticeFasterDecoderTpl code and the code - // for LatticeFasterOnlineDecoder that supports fast traceback. - inline void SetBackpointer(Token *backpointer) {} - - // This constructor just ignores the 'backpointer' argument. That argument is - // needed so that we can use the same decoder code for LatticeFasterDecoderTpl - // and LatticeFasterOnlineDecoderTpl (which needs backpointers to support a - // fast way to obtain the best path). - inline StdToken(BaseFloat tot_cost, BaseFloat extra_cost, ForwardLinkT *links, - Token *next, Token *backpointer) - : tot_cost(tot_cost), - extra_cost(extra_cost), - links(links), - context_state(0), - next(next) {} -}; - -struct BackpointerToken { - using ForwardLinkT = ForwardLink; - using Token = BackpointerToken; - - // BackpointerToken is like Token but also - // Standard token type for LatticeFasterDecoder. Each active HCLG - // (decoding-graph) state on each frame has one token. - - // tot_cost is the total (LM + acoustic) cost from the beginning of the - // utterance up to this point. (but see cost_offset_, which is subtracted - // to keep it in a good numerical range). - BaseFloat tot_cost; - - // exta_cost is >= 0. 
After calling PruneForwardLinks, this equals - // the minimum difference between the cost of the best path, and the cost of - // this is on, and the cost of the absolute best path, under the assumption - // that any of the currently active states at the decoding front may - // eventually succeed (e.g. if you were to take the currently active states - // one by one and compute this difference, and then take the minimum). - BaseFloat extra_cost; - - int context_state = 0; - - // 'links' is the head of singly-linked list of ForwardLinks, which is what we - // use for lattice generation. - ForwardLinkT *links; - - // 'next' is the next in the singly-linked list of tokens for this frame. - BackpointerToken *next; - - // Best preceding BackpointerToken (could be a on this frame, connected to - // this via an epsilon transition, or on a previous frame). This is only - // required for an efficient GetBestPath function in - // LatticeFasterOnlineDecoderTpl; it plays no part in the lattice generation - // (the "links" list is what stores the forward links, for that). - Token *backpointer; - - inline void SetBackpointer(Token *backpointer) { - this->backpointer = backpointer; - } - - inline BackpointerToken(BaseFloat tot_cost, BaseFloat extra_cost, - ForwardLinkT *links, Token *next, Token *backpointer) - : tot_cost(tot_cost), - extra_cost(extra_cost), - links(links), - next(next), - backpointer(backpointer), - context_state(0) {} -}; - -} // namespace decoder - -/** This is the "normal" lattice-generating decoder. - See \ref lattices_generation \ref decoders_faster and \ref decoders_simple - for more information. - - The decoder is templated on the FST type and the token type. The token type - will normally be StdToken, but also may be BackpointerToken which is to - support quick lookup of the current best path (see - lattice-faster-online-decoder.h) - - The FST you invoke this decoder which is expected to equal - Fst::Fst, a.k.a. StdFst, or GrammarFst. If you invoke it with - FST == StdFst and it notices that the actual FST type is - fst::VectorFst or fst::ConstFst, the decoder object - will internally cast itself to one that is templated on those more specific - types; this is an optimization for speed. - */ -template -class LatticeFasterDecoderTpl { - public: - using Arc = typename FST::Arc; - using Label = typename Arc::Label; - using StateId = typename Arc::StateId; - using Weight = typename Arc::Weight; - using ForwardLinkT = decoder::ForwardLink; - - // Instantiate this class once for each thing you have to decode. - // This version of the constructor does not take ownership of - // 'fst'. - LatticeFasterDecoderTpl( - const FST &fst, const LatticeFasterDecoderConfig &config, - const std::shared_ptr &context_graph); - - // This version of the constructor takes ownership of the fst, and will delete - // it when this object is destroyed. - LatticeFasterDecoderTpl(const LatticeFasterDecoderConfig &config, FST *fst); - - void SetOptions(const LatticeFasterDecoderConfig &config) { - config_ = config; - } - - const LatticeFasterDecoderConfig &GetOptions() const { return config_; } - - ~LatticeFasterDecoderTpl(); - - /// Decodes until there are no more frames left in the "decodable" object.. - /// note, this may block waiting for input if the "decodable" object blocks. - /// Returns true if any kind of traceback is available (not necessarily from a - /// final state). - bool Decode(DecodableInterface *decodable); - - /// says whether a final-state was active on the last frame. 
If it was not, - /// the lattice (or traceback) will end with states that are not final-states. - bool ReachedFinal() const { - return FinalRelativeCost() != std::numeric_limits::infinity(); - } - - /// Outputs an FST corresponding to the single best path through the lattice. - /// Returns true if result is nonempty (using the return status is deprecated, - /// it will become void). If "use_final_probs" is true AND we reached the - /// final-state of the graph then it will include those as final-probs, else - /// it will treat all final-probs as one. Note: this just calls - /// GetRawLattice() and figures out the shortest path. - bool GetBestPath(Lattice *ofst, bool use_final_probs = true) const; - - /// Outputs an FST corresponding to the raw, state-level - /// tracebacks. Returns true if result is nonempty. - /// If "use_final_probs" is true AND we reached the final-state - /// of the graph then it will include those as final-probs, else - /// it will treat all final-probs as one. - /// The raw lattice will be topologically sorted. - /// - /// See also GetRawLatticePruned in lattice-faster-online-decoder.h, - /// which also supports a pruning beam, in case for some reason - /// you want it pruned tighter than the regular lattice beam. - /// We could put that here in future needed. - bool GetRawLattice(Lattice *ofst, bool use_final_probs = true) const; - - /// [Deprecated, users should now use GetRawLattice and determinize it - /// themselves, e.g. using DeterminizeLatticePhonePrunedWrapper]. - /// Outputs an FST corresponding to the lattice-determinized - /// lattice (one path per word sequence). Returns true if result is - /// nonempty. If "use_final_probs" is true AND we reached the final-state of - /// the graph then it will include those as final-probs, else it will treat - /// all final-probs as one. - bool GetLattice(CompactLattice *ofst, bool use_final_probs = true) const; - - /// InitDecoding initializes the decoding, and should only be used if you - /// intend to call AdvanceDecoding(). If you call Decode(), you don't need to - /// call this. You can also call InitDecoding if you have already decoded an - /// utterance and want to start with a new utterance. - void InitDecoding(); - - /// This will decode until there are no more frames ready in the decodable - /// object. You can keep calling it each time more frames become available. - /// If max_num_frames is specified, it specifies the maximum number of frames - /// the function will decode before returning. - void AdvanceDecoding(DecodableInterface *decodable, - int32 max_num_frames = -1); - - /// This function may be optionally called after AdvanceDecoding(), when you - /// do not plan to decode any further. It does an extra pruning step that - /// will help to prune the lattices output by GetLattice and (particularly) - /// GetRawLattice more completely, particularly toward the end of the - /// utterance. If you call this, you cannot call AdvanceDecoding again (it - /// will fail), and you cannot call GetLattice() and related functions with - /// use_final_probs = false. Used to be called PruneActiveTokensFinal(). - void FinalizeDecoding(); - - /// FinalRelativeCost() serves the same purpose as ReachedFinal(), but gives - /// more information. It returns the difference between the best (final-cost - /// plus cost) of any token on the final frame, and the best cost of any token - /// on the final frame. If it is infinity it means no final-states were - /// present on the final frame. It will usually be nonnegative. 
If it not - /// too positive (e.g. < 5 is my first guess, but this is not tested) you can - /// take it as a good indication that we reached the final-state with - /// reasonable likelihood. - BaseFloat FinalRelativeCost() const; - - // Returns the number of frames decoded so far. The value returned changes - // whenever we call ProcessEmitting(). - inline int32 NumFramesDecoded() const { return active_toks_.size() - 1; } - - protected: - // we make things protected instead of private, as code in - // LatticeFasterOnlineDecoderTpl, which inherits from this, also uses the - // internals. - - // Deletes the elements of the singly linked list tok->links. - inline static void DeleteForwardLinks(Token *tok); - - // head of per-frame list of Tokens (list is in topological order), - // and something saying whether we ever pruned it using PruneForwardLinks. - struct TokenList { - Token *toks; - bool must_prune_forward_links; - bool must_prune_tokens; - TokenList() - : toks(NULL), must_prune_forward_links(true), must_prune_tokens(true) {} - }; - - using Elem = typename HashList::Elem; - // Equivalent to: - // struct Elem { - // StateId key; - // Token *val; - // Elem *tail; - // }; - - void PossiblyResizeHash(size_t num_toks); - - // FindOrAddToken either locates a token in hash of toks_, or if necessary - // inserts a new, empty token (i.e. with no forward links) for the current - // frame. [note: it's inserted if necessary into hash toks_ and also into the - // singly linked list of tokens active on this frame (whose head is at - // active_toks_[frame]). The frame_plus_one argument is the acoustic frame - // index plus one, which is used to index into the active_toks_ array. - // Returns the Token pointer. Sets "changed" (if non-NULL) to true if the - // token was newly created or the cost changed. - // If Token == StdToken, the 'backpointer' argument has no purpose (and will - // hopefully be optimized out). - inline Elem *FindOrAddToken(StateId state, int32 frame_plus_one, - BaseFloat tot_cost, Token *backpointer, - bool *changed); - - // prunes outgoing links for all tokens in active_toks_[frame] - // it's called by PruneActiveTokens - // all links, that have link_extra_cost > lattice_beam are pruned - // delta is the amount by which the extra_costs must change - // before we set *extra_costs_changed = true. - // If delta is larger, we'll tend to go back less far - // toward the beginning of the file. - // extra_costs_changed is set to true if extra_cost was changed for any token - // links_pruned is set to true if any link in any token was pruned - void PruneForwardLinks(int32 frame_plus_one, bool *extra_costs_changed, - bool *links_pruned, BaseFloat delta); - - // This function computes the final-costs for tokens active on the final - // frame. It outputs to final-costs, if non-NULL, a map from the Token* - // pointer to the final-prob of the corresponding state, for all Tokens - // that correspond to states that have final-probs. This map will be - // empty if there were no final-probs. It outputs to - // final_relative_cost, if non-NULL, the difference between the best - // forward-cost including the final-prob cost, and the best forward-cost - // without including the final-prob cost (this will usually be positive), or - // infinity if there were no final-probs. [c.f. FinalRelativeCost(), which - // outputs this quanitity]. 
It outputs to final_best_cost, if - // non-NULL, the lowest for any token t active on the final frame, of - // forward-cost[t] + final-cost[t], where final-cost[t] is the final-cost in - // the graph of the state corresponding to token t, or the best of - // forward-cost[t] if there were no final-probs active on the final frame. - // You cannot call this after FinalizeDecoding() has been called; in that - // case you should get the answer from class-member variables. - void ComputeFinalCosts(unordered_map *final_costs, - BaseFloat *final_relative_cost, - BaseFloat *final_best_cost) const; - - // PruneForwardLinksFinal is a version of PruneForwardLinks that we call - // on the final frame. If there are final tokens active, it uses - // the final-probs for pruning, otherwise it treats all tokens as final. - void PruneForwardLinksFinal(); - - // Prune away any tokens on this frame that have no forward links. - // [we don't do this in PruneForwardLinks because it would give us - // a problem with dangling pointers]. - // It's called by PruneActiveTokens if any forward links have been pruned - void PruneTokensForFrame(int32 frame_plus_one); - - // Go backwards through still-alive tokens, pruning them if the - // forward+backward cost is more than lat_beam away from the best path. It's - // possible to prove that this is "correct" in the sense that we won't lose - // anything outside of lat_beam, regardless of what happens in the future. - // delta controls when it considers a cost to have changed enough to continue - // going backward and propagating the change. larger delta -> will recurse - // less far. - void PruneActiveTokens(BaseFloat delta); - - /// Gets the weight cutoff. Also counts the active tokens. - BaseFloat GetCutoff(Elem *list_head, size_t *tok_count, - BaseFloat *adaptive_beam, Elem **best_elem); - - /// Processes emitting arcs for one frame. Propagates from prev_toks_ to - /// cur_toks_. Returns the cost cutoff for subsequent ProcessNonemitting() to - /// use. - BaseFloat ProcessEmitting(DecodableInterface *decodable); - - /// Processes nonemitting (epsilon) arcs for one frame. Called after - /// ProcessEmitting() on each frame. The cost cutoff is computed by the - /// preceding ProcessEmitting(). - void ProcessNonemitting(BaseFloat cost_cutoff); - - // HashList defined in ../util/hash-list.h. It actually allows us to maintain - // more than one list (e.g. for current and previous frames), but only one of - // them at a time can be indexed by StateId. It is indexed by frame-index - // plus one, where the frame-index is zero-based, as used in decodable object. - // That is, the emitting probs of frame t are accounted for in tokens at - // toks_[t+1]. The zeroth frame is for nonemitting transition at the start of - // the graph. - HashList toks_; - - std::vector active_toks_; // Lists of tokens, indexed by - // frame (members of TokenList are toks, must_prune_forward_links, - // must_prune_tokens). - std::vector - queue_; // temp variable used in ProcessNonemitting, - std::vector tmp_array_; // used in GetCutoff. - - // fst_ is a pointer to the FST we are decoding from. - const FST *fst_; - // delete_fst_ is true if the pointer fst_ needs to be deleted when this - // object is destroyed. - bool delete_fst_; - - std::vector cost_offsets_; // This contains, for each - // frame, an offset that was added to the acoustic log-likelihoods on that - // frame in order to keep everything in a nice dynamic range i.e. close to - // zero, to reduce roundoff errors. 
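The three outputs of ComputeFinalCosts() described above can be illustrated with a few lines of standalone arithmetic. This is only a sketch with made-up forward costs, not the Kaldi code; tokens whose states are not final simply carry no final cost here.

```python
import math

def compute_final_costs(tokens):
    """Toy version of the ComputeFinalCosts() bookkeeping described above.

    tokens: list of (forward_cost, final_cost) pairs, with final_cost None
    for tokens whose state has no final-prob.
    """
    inf = math.inf
    final_costs = {i: fc for i, (_, fc) in enumerate(tokens) if fc is not None}

    best_forward = min(fwd for fwd, _ in tokens)
    best_with_final = min((fwd + fc for fwd, fc in tokens if fc is not None),
                          default=inf)

    # FinalRelativeCost(): infinity if nothing on the last frame was final.
    final_relative_cost = best_with_final - best_forward
    # final_best_cost falls back to the plain forward cost when nothing is final.
    final_best_cost = best_with_final if final_costs else best_forward
    return final_costs, final_relative_cost, final_best_cost

# Two of the three tokens reach a final state.
print(compute_final_costs([(10.0, 2.5), (9.0, None), (11.0, 0.5)]))
# -> ({0: 2.5, 2: 0.5}, 2.5, 11.5)
```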
- LatticeFasterDecoderConfig config_; - int32 num_toks_; // current total #toks allocated... - bool warned_; - - /// decoding_finalized_ is true if someone called FinalizeDecoding(). [note, - /// calling this is optional]. If true, it's forbidden to decode more. Also, - /// if this is set, then the output of ComputeFinalCosts() is in the next - /// three variables. The reason we need to do this is that after - /// FinalizeDecoding() calls PruneTokensForFrame() for the final frame, some - /// of the tokens on the last frame are freed, so we free the list from toks_ - /// to avoid having dangling pointers hanging around. - bool decoding_finalized_; - /// For the meaning of the next 3 variables, see the comment for - /// decoding_finalized_ above., and ComputeFinalCosts(). - unordered_map final_costs_; - BaseFloat final_relative_cost_; - BaseFloat final_best_cost_; - - std::shared_ptr context_graph_ = nullptr; - - // There are various cleanup tasks... the toks_ structure contains - // singly linked lists of Token pointers, where Elem is the list type. - // It also indexes them in a hash, indexed by state (this hash is only - // maintained for the most recent frame). toks_.Clear() - // deletes them from the hash and returns the list of Elems. The - // function DeleteElems calls toks_.Delete(elem) for each elem in - // the list, which returns ownership of the Elem to the toks_ structure - // for reuse, but does not delete the Token pointer. The Token pointers - // are reference-counted and are ultimately deleted in PruneTokensForFrame, - // but are also linked together on each frame by their own linked-list, - // using the "next" pointer. We delete them manually. - void DeleteElems(Elem *list); - - // This function takes a singly linked list of tokens for a single frame, and - // outputs a list of them in topological order (it will crash if no such order - // can be found, which will typically be due to decoding graphs with epsilon - // cycles, which are not allowed). Note: the output list may contain NULLs, - // which the caller should pass over; it just happens to be more efficient for - // the algorithm to output a list that contains NULLs. - static void TopSortTokens(Token *tok_list, - std::vector *topsorted_list); - - void ClearActiveTokens(); - - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeFasterDecoderTpl); -}; - -typedef LatticeFasterDecoderTpl - LatticeFasterDecoder; - -} // end namespace kaldi. - -#endif // KALDI_DECODER_LATTICE_FASTER_DECODER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/decoder/lattice-faster-online-decoder.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/decoder/lattice-faster-online-decoder.cc deleted file mode 100644 index 2345b4d129ff905784762e973bad279f2fb55d31..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/decoder/lattice-faster-online-decoder.cc +++ /dev/null @@ -1,278 +0,0 @@ -// decoder/lattice-faster-online-decoder.cc - -// Copyright 2009-2012 Microsoft Corporation Mirko Hannemann -// 2013-2014 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2014 IMSL, PKU-HKUST (author: Wei Shi) -// 2018 Zhehuai Chen - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -// see note at the top of lattice-faster-decoder.cc, about how to maintain this -// file in sync with lattice-faster-decoder.cc - -#include -#include -#include -#include - -#include "decoder/lattice-faster-online-decoder.h" - -namespace kaldi { - -template -bool LatticeFasterOnlineDecoderTpl::TestGetBestPath( - bool use_final_probs) const { - Lattice lat1; - { - Lattice raw_lat; - this->GetRawLattice(&raw_lat, use_final_probs); - ShortestPath(raw_lat, &lat1); - } - Lattice lat2; - GetBestPath(&lat2, use_final_probs); - BaseFloat delta = 0.1; - int32 num_paths = 1; - if (!fst::RandEquivalent(lat1, lat2, num_paths, delta, rand())) { - KALDI_WARN << "Best-path test failed"; - return false; - } else { - return true; - } -} - -// Outputs an FST corresponding to the single best path through the lattice. -template -bool LatticeFasterOnlineDecoderTpl::GetBestPath( - Lattice *olat, bool use_final_probs) const { - olat->DeleteStates(); - BaseFloat final_graph_cost; - BestPathIterator iter = BestPathEnd(use_final_probs, &final_graph_cost); - if (iter.Done()) return false; // would have printed warning. - StateId state = olat->AddState(); - olat->SetFinal(state, LatticeWeight(final_graph_cost, 0.0)); - while (!iter.Done()) { - LatticeArc arc; - iter = TraceBackBestPath(iter, &arc); - arc.nextstate = state; - StateId new_state = olat->AddState(); - olat->AddArc(new_state, arc); - state = new_state; - } - olat->SetStart(state); - return true; -} - -template -typename LatticeFasterOnlineDecoderTpl::BestPathIterator -LatticeFasterOnlineDecoderTpl::BestPathEnd( - bool use_final_probs, BaseFloat *final_cost_out) const { - if (this->decoding_finalized_ && !use_final_probs) - KALDI_ERR << "You cannot call FinalizeDecoding() and then call " - << "BestPathEnd() with use_final_probs == false"; - KALDI_ASSERT(this->NumFramesDecoded() > 0 && - "You cannot call BestPathEnd if no frames were decoded."); - - unordered_map final_costs_local; - - const unordered_map &final_costs = - (this->decoding_finalized_ ? this->final_costs_ : final_costs_local); - if (!this->decoding_finalized_ && use_final_probs) - this->ComputeFinalCosts(&final_costs_local, NULL, NULL); - - // Singly linked list of tokens on last frame (access list through "next" - // pointer). - BaseFloat best_cost = std::numeric_limits::infinity(); - BaseFloat best_final_cost = 0; - Token *best_tok = NULL; - for (Token *tok = this->active_toks_.back().toks; tok != NULL; - tok = tok->next) { - BaseFloat cost = tok->tot_cost, final_cost = 0.0; - if (use_final_probs && !final_costs.empty()) { - // if we are instructed to use final-probs, and any final tokens were - // active on final frame, include the final-prob in the cost of the token. 
- typename unordered_map::const_iterator iter = - final_costs.find(tok); - if (iter != final_costs.end()) { - final_cost = iter->second; - cost += final_cost; - } else { - cost = std::numeric_limits::infinity(); - } - } - if (cost < best_cost) { - best_cost = cost; - best_tok = tok; - best_final_cost = final_cost; - } - } - if (best_tok == - NULL) { // this should not happen, and is likely a code error or - // caused by infinities in likelihoods, but I'm not making - // it a fatal error for now. - KALDI_WARN << "No final token found."; - } - if (final_cost_out) *final_cost_out = best_final_cost; - return BestPathIterator(best_tok, this->NumFramesDecoded() - 1); -} - -template -typename LatticeFasterOnlineDecoderTpl::BestPathIterator -LatticeFasterOnlineDecoderTpl::TraceBackBestPath(BestPathIterator iter, - LatticeArc *oarc) const { - KALDI_ASSERT(!iter.Done() && oarc != NULL); - Token *tok = static_cast(iter.tok); - int32 cur_t = iter.frame, step_t = 0; - if (tok->backpointer != NULL) { - // retrieve the correct forward link(with the best link cost) - BaseFloat best_cost = std::numeric_limits::infinity(); - ForwardLinkT *link; - for (link = tok->backpointer->links; link != NULL; link = link->next) { - if (link->next_tok == tok) { // this is a link to "tok" - BaseFloat graph_cost = link->graph_cost, - acoustic_cost = link->acoustic_cost; - BaseFloat cost = graph_cost + acoustic_cost; - if (cost < best_cost) { - oarc->ilabel = link->ilabel; - oarc->olabel = link->olabel; - if (link->ilabel != 0) { - KALDI_ASSERT(static_cast(cur_t) < - this->cost_offsets_.size()); - acoustic_cost -= this->cost_offsets_[cur_t]; - step_t = -1; - } else { - step_t = 0; - } - oarc->weight = LatticeWeight(graph_cost, acoustic_cost); - best_cost = cost; - } - } - } - if (link == NULL && - best_cost == - std::numeric_limits::infinity()) { // Did not find - // correct link. - KALDI_ERR << "Error tracing best-path back (likely " - << "bug in token-pruning algorithm)"; - } - } else { - oarc->ilabel = 0; - oarc->olabel = 0; - oarc->weight = LatticeWeight::One(); // zero costs. - } - return BestPathIterator(tok->backpointer, cur_t + step_t); -} - -template -bool LatticeFasterOnlineDecoderTpl::GetRawLatticePruned( - Lattice *ofst, bool use_final_probs, BaseFloat beam) const { - typedef LatticeArc Arc; - typedef Arc::StateId StateId; - typedef Arc::Weight Weight; - typedef Arc::Label Label; - - // Note: you can't use the old interface (Decode()) if you want to - // get the lattice with use_final_probs = false. You'd have to do - // InitDecoding() and then AdvanceDecoding(). - if (this->decoding_finalized_ && !use_final_probs) - KALDI_ERR << "You cannot call FinalizeDecoding() and then call " - << "GetRawLattice() with use_final_probs == false"; - - unordered_map final_costs_local; - - const unordered_map &final_costs = - (this->decoding_finalized_ ? this->final_costs_ : final_costs_local); - if (!this->decoding_finalized_ && use_final_probs) - this->ComputeFinalCosts(&final_costs_local, NULL, NULL); - - ofst->DeleteStates(); - // num-frames plus one (since frames are one-based, and we have - // an extra frame for the start-state). - int32 num_frames = this->active_toks_.size() - 1; - KALDI_ASSERT(num_frames > 0); - for (int32 f = 0; f <= num_frames; f++) { - if (this->active_toks_[f].toks == NULL) { - KALDI_WARN << "No tokens active on frame " << f - << ": not producing lattice.\n"; - return false; - } - } - unordered_map tok_map; - std::queue > tok_queue; - // First initialize the queue and states. 
Put the initial state on the queue; - // this is the last token in the list active_toks_[0].toks. - for (Token *tok = this->active_toks_[0].toks; tok != NULL; tok = tok->next) { - if (tok->next == NULL) { - tok_map[tok] = ofst->AddState(); - ofst->SetStart(tok_map[tok]); - std::pair tok_pair(tok, 0); // #frame = 0 - tok_queue.push(tok_pair); - } - } - - // Next create states for "good" tokens - while (!tok_queue.empty()) { - std::pair cur_tok_pair = tok_queue.front(); - tok_queue.pop(); - Token *cur_tok = cur_tok_pair.first; - int32 cur_frame = cur_tok_pair.second; - KALDI_ASSERT(cur_frame >= 0 && cur_frame <= this->cost_offsets_.size()); - - typename unordered_map::const_iterator iter = - tok_map.find(cur_tok); - KALDI_ASSERT(iter != tok_map.end()); - StateId cur_state = iter->second; - - for (ForwardLinkT *l = cur_tok->links; l != NULL; l = l->next) { - Token *next_tok = l->next_tok; - if (next_tok->extra_cost < beam) { - // so both the current and the next token are good; create the arc - int32 next_frame = l->ilabel == 0 ? cur_frame : cur_frame + 1; - StateId nextstate; - if (tok_map.find(next_tok) == tok_map.end()) { - nextstate = tok_map[next_tok] = ofst->AddState(); - tok_queue.push(std::pair(next_tok, next_frame)); - } else { - nextstate = tok_map[next_tok]; - } - BaseFloat cost_offset = - (l->ilabel != 0 ? this->cost_offsets_[cur_frame] : 0); - Arc arc(l->ilabel, l->olabel, - Weight(l->graph_cost, l->acoustic_cost - cost_offset), - nextstate); - ofst->AddArc(cur_state, arc); - } - } - if (cur_frame == num_frames) { - if (use_final_probs && !final_costs.empty()) { - typename unordered_map::const_iterator iter = - final_costs.find(cur_tok); - if (iter != final_costs.end()) - ofst->SetFinal(cur_state, LatticeWeight(iter->second, 0)); - } else { - ofst->SetFinal(cur_state, LatticeWeight::One()); - } - } - } - return (ofst->NumStates() != 0); -} - -// Instantiate the template for the FST types that we'll need. -template class LatticeFasterOnlineDecoderTpl >; -template class LatticeFasterOnlineDecoderTpl >; -template class LatticeFasterOnlineDecoderTpl >; - -} // end namespace kaldi. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/decoder/lattice-faster-online-decoder.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/decoder/lattice-faster-online-decoder.h deleted file mode 100644 index dc50cfa73e6574e9625eda9045c47f674fcbc1e3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/decoder/lattice-faster-online-decoder.h +++ /dev/null @@ -1,131 +0,0 @@ -// decoder/lattice-faster-online-decoder.h - -// Copyright 2009-2013 Microsoft Corporation; Mirko Hannemann; -// 2013-2014 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2018 Zhehuai Chen - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
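GetBestPath(), BestPathEnd() and TraceBackBestPath() above avoid building the raw lattice: they pick the cheapest token on the last frame and then walk backpointers, re-finding the cheapest forward link into each token. Below is a rough standalone sketch of that walk using plain dictionaries; the names and data layout are illustrative, and the per-frame acoustic cost offsets and frame bookkeeping from the real code are left out.

```python
import math

def trace_best_path(last_tokens, backpointer, links, final_costs):
    """Sketch of the BestPathEnd()/TraceBackBestPath() logic above.

    last_tokens : {tok: forward_cost} for tokens alive on the final frame
    backpointer : {tok: predecessor token, or None at the start of the utterance}
    links       : {tok: [(next_tok, ilabel, olabel, graph_cost, acoustic_cost), ...]}
    final_costs : {tok: final_cost}; empty if no final state was reached
    """
    inf = math.inf

    # BestPathEnd(): if any final tokens exist, rank by forward + final cost,
    # treating tokens without a final-prob as infinitely bad; else forward cost only.
    def end_cost(tok):
        if final_costs:
            return last_tokens[tok] + final_costs.get(tok, inf)
        return last_tokens[tok]

    tok = min(last_tokens, key=end_cost)
    end_tok = tok

    # TraceBackBestPath(): walk backpointers; at each step pick the cheapest
    # forward link of the predecessor that lands on the current token.
    path = []
    while backpointer.get(tok) is not None:
        prev = backpointer[tok]
        best = min((l for l in links[prev] if l[0] == tok),
                   key=lambda l: l[3] + l[4])          # graph + acoustic cost
        _, ilabel, olabel, graph_cost, acoustic_cost = best
        path.append((ilabel, olabel, graph_cost + acoustic_cost))
        tok = prev
    path.reverse()
    return end_tok, path

# Tiny example: t0 -> t1 -> t2, with a worse alternative link into t2.
last_tokens = {"t2": 4.0}
backpointer = {"t0": None, "t1": "t0", "t2": "t1"}
links = {"t0": [("t1", 7, 7, 0.5, 1.0)],
         "t1": [("t2", 3, 3, 0.2, 2.3), ("t2", 4, 4, 1.0, 3.0)]}
print(trace_best_path(last_tokens, backpointer, links, {"t2": 0.0}))
# -> ('t2', [(7, 7, 1.5), (3, 3, 2.5)])
```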
- -// see note at the top of lattice-faster-decoder.h, about how to maintain this -// file in sync with lattice-faster-decoder.h - -#ifndef KALDI_DECODER_LATTICE_FASTER_ONLINE_DECODER_H_ -#define KALDI_DECODER_LATTICE_FASTER_ONLINE_DECODER_H_ - -#include "decoder/lattice-faster-decoder.h" - -#include - -namespace kaldi { - -/** LatticeFasterOnlineDecoderTpl is as LatticeFasterDecoderTpl but also - supports an efficient way to get the best path (see the function - BestPathEnd()), which is useful in endpointing and in situations where you - might want to frequently access the best path. - - This is only templated on the FST type, since the Token type is required to - be BackpointerToken. Actually it only makes sense to instantiate - LatticeFasterDecoderTpl with Token == BackpointerToken if you do so - indirectly via this child class. - */ -template -class LatticeFasterOnlineDecoderTpl - : public LatticeFasterDecoderTpl { - public: - using Arc = typename FST::Arc; - using Label = typename Arc::Label; - using StateId = typename Arc::StateId; - using Weight = typename Arc::Weight; - using Token = decoder::BackpointerToken; - using ForwardLinkT = decoder::ForwardLink; - - // Instantiate this class once for each thing you have to decode. - // This version of the constructor does not take ownership of - // 'fst'. - LatticeFasterOnlineDecoderTpl( - const FST &fst, const LatticeFasterDecoderConfig &config, - const std::shared_ptr &context_graph) - : LatticeFasterDecoderTpl(fst, config, context_graph) {} - - // This version of the initializer takes ownership of 'fst', and will delete - // it when this object is destroyed. - LatticeFasterOnlineDecoderTpl(const LatticeFasterDecoderConfig &config, - FST *fst) - : LatticeFasterDecoderTpl(config, fst) {} - - struct BestPathIterator { - void *tok; - int32 frame; - // note, "frame" is the frame-index of the frame you'll get the - // transition-id for next time, if you call TraceBackBestPath on this - // iterator (assuming it's not an epsilon transition). Note that this - // is one less than you might reasonably expect, e.g. it's -1 for - // the nonemitting transitions before the first frame. - BestPathIterator(void *t, int32 f) : tok(t), frame(f) {} - bool Done() const { return tok == NULL; } - }; - - /// Outputs an FST corresponding to the single best path through the lattice. - /// This is quite efficient because it doesn't get the entire raw lattice and - /// find the best path through it; instead, it uses the BestPathEnd and - /// BestPathIterator so it basically traces it back through the lattice. - /// Returns true if result is nonempty (using the return status is deprecated, - /// it will become void). If "use_final_probs" is true AND we reached the - /// final-state of the graph then it will include those as final-probs, else - /// it will treat all final-probs as one. - bool GetBestPath(Lattice *ofst, bool use_final_probs = true) const; - - /// This function does a self-test of GetBestPath(). Returns true on - /// success; returns false and prints a warning on failure. - bool TestGetBestPath(bool use_final_probs = true) const; - - /// This function returns an iterator that can be used to trace back - /// the best path. If use_final_probs == true and at least one final state - /// survived till the end, it will use the final-probs in working out the best - /// final Token, and will output the final cost to *final_cost (if non-NULL), - /// else it will use only the forward likelihood, and will put zero in - /// *final_cost (if non-NULL). 
- /// Requires that NumFramesDecoded() > 0. - BestPathIterator BestPathEnd(bool use_final_probs, - BaseFloat *final_cost = NULL) const; - - /// This function can be used in conjunction with BestPathEnd() to trace back - /// the best path one link at a time (e.g. this can be useful in endpoint - /// detection). By "link" we mean a link in the graph; not all links cross - /// frame boundaries, but each time you see a nonzero ilabel you can interpret - /// that as a frame. The return value is the updated iterator. It outputs - /// the ilabel and olabel, and the (graph and acoustic) weight to the "arc" - /// pointer, while leaving its "nextstate" variable unchanged. - BestPathIterator TraceBackBestPath(BestPathIterator iter, - LatticeArc *arc) const; - - /// Behaves the same as GetRawLattice but only processes tokens whose - /// extra_cost is smaller than the best-cost plus the specified beam. - /// It is only worthwhile to call this function if beam is less than - /// the lattice_beam specified in the config; otherwise, it would - /// return essentially the same thing as GetRawLattice, but more slowly. - bool GetRawLatticePruned(Lattice *ofst, bool use_final_probs, - BaseFloat beam) const; - - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeFasterOnlineDecoderTpl); -}; - -typedef LatticeFasterOnlineDecoderTpl LatticeFasterOnlineDecoder; - -} // end namespace kaldi. - -#endif // KALDI_DECODER_LATTICE_FASTER_ONLINE_DECODER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/fstbin/fstaddselfloops.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/fstbin/fstaddselfloops.cc deleted file mode 100644 index 145bf006f2324136c5fea4a8d0012a7a4126c646..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/fstbin/fstaddselfloops.cc +++ /dev/null @@ -1,100 +0,0 @@ -// fstbin/fstaddselfloops.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/determinize-star.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/parse-options.h" -#include "util/simple-io-funcs.h" - -/* some test examples: - pushd ~/tmpdir - ( echo 3; echo 4) > in.list - ( echo 5; echo 6) > out.list - ( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstaddselfloops in.list out.list - | fstprint ( echo "0 1 0 1"; echo " 0 2 1 0"; echo "1 0"; echo "2 0"; ) | - fstcompile | fstaddselfloops in.list out.list | fstprint -*/ - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Adds self-loops to states of an FST to propagate disambiguation " - "symbols through it\n" - "They are added on each final state and each state with non-epsilon " - "output symbols\n" - "on at least one arc out of the state. Useful in conjunction with " - "predeterminize\n" - "\n" - "Usage: fstaddselfloops in-disambig-list out-disambig-list [in.fst " - "[out.fst] ]\n" - "E.g: fstaddselfloops in.list out.list < in.fst > withloops.fst\n" - "in.list and out.list are lists of integers, one per line, of the\n" - "same length.\n"; - - ParseOptions po(usage); - po.Read(argc, argv); - - if (po.NumArgs() < 2 || po.NumArgs() > 4) { - po.PrintUsage(); - exit(1); - } - - std::string disambig_in_rxfilename = po.GetArg(1), - disambig_out_rxfilename = po.GetArg(2), - fst_in_filename = po.GetOptArg(3), - fst_out_filename = po.GetOptArg(4); - - VectorFst *fst = ReadFstKaldi(fst_in_filename); - - std::vector disambig_in; - if (!ReadIntegerVectorSimple(disambig_in_rxfilename, &disambig_in)) - KALDI_ERR - << "fstaddselfloops: Could not read disambiguation symbols from " - << kaldi::PrintableRxfilename(disambig_in_rxfilename); - - std::vector disambig_out; - if (!ReadIntegerVectorSimple(disambig_out_rxfilename, &disambig_out)) - KALDI_ERR - << "fstaddselfloops: Could not read disambiguation symbols from " - << kaldi::PrintableRxfilename(disambig_out_rxfilename); - - if (disambig_in.size() != disambig_out.size()) - KALDI_ERR - << "fstaddselfloops: mismatch in size of disambiguation symbols"; - - AddSelfLoops(fst, disambig_in, disambig_out); - - WriteFstKaldi(*fst, fst_out_filename); - - delete fst; - - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/fstbin/fstdeterminizestar.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/fstbin/fstdeterminizestar.cc deleted file mode 100644 index e818143025c0fd5d389c28c77715d65711fe63f1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/fstbin/fstdeterminizestar.cc +++ /dev/null @@ -1,114 +0,0 @@ -// fstbin/fstdeterminizestar.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/determinize-star.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/parse-options.h" -#if !defined(_MSC_VER) && !defined(__APPLE__) -#include // Comment this line and the call to signal below if -// it causes compilation problems. It is only to enable a debugging procedure -// when determinization does not terminate. We are disabling this code if -// compiling on Windows because signal.h is not available there, and on -// MacOS due to a problem with in the initial release of Sierra. -#endif - -/* some test examples: - ( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint - ( echo "0 0 1 0"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint - ( echo "0 0 1 0"; echo "0 1 1 0"; echo "0 0" ) | fstcompile | - fstdeterminizestar | fstprint # this last one fails [correctly]: ( echo "0 0 0 - 1"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint - - cd ~/tmpdir - while true; do - fstrand > 1.fst - fstpredeterminize out.lst 1.fst | fstdeterminizestar | fstrmsymbols out.lst - > 2.fst fstequivalent --random=true 1.fst 2.fst || echo "Test failed" echo -n - "." done - - Test of debugging [with non-determinizable input]: - ( echo " 0 0 1 0 1.0"; echo "0 1 1 0"; echo "1 1 1 0 0"; echo "0 2 2 0"; echo - "2"; echo "1" ) | fstcompile | fstdeterminizestar kill -SIGUSR1 [the process-id - of fstdeterminizestar] # prints out a bunch of debugging output showing the - mess it got itself into. -*/ - -bool debug_location = false; -void signal_handler(int) { debug_location = true; } - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Removes epsilons and determinizes in one step\n" - "\n" - "Usage: fstdeterminizestar [in.fst [out.fst] ]\n" - "\n" - "See also: fstdeterminizelog, lattice-determinize\n"; - - float delta = kDelta; - int max_states = -1; - bool use_log = false; - ParseOptions po(usage); - po.Register("use-log", &use_log, "Determinize in log semiring."); - po.Register("delta", &delta, - "Delta value used to determine equivalence of weights."); - po.Register( - "max-states", &max_states, - "Maximum number of states in determinized FST before it will abort."); - po.Read(argc, argv); - - if (po.NumArgs() > 2) { - po.PrintUsage(); - exit(1); - } - - std::string fst_in_str = po.GetOptArg(1), fst_out_str = po.GetOptArg(2); - - // This enables us to get traceback info from determinization that is - // not seeming to terminate. -#if !defined(_MSC_VER) && !defined(__APPLE__) - signal(SIGUSR1, signal_handler); -#endif - // Normal case: just files. - VectorFst *fst = ReadFstKaldi(fst_in_str); - - ArcSort(fst, ILabelCompare()); // improves speed. 
- if (use_log) { - DeterminizeStarInLog(fst, delta, &debug_location, max_states); - } else { - VectorFst det_fst; - DeterminizeStar(*fst, &det_fst, delta, &debug_location, max_states); - *fst = det_fst; // will do shallow copy and then det_fst goes - // out of scope anyway. - } - WriteFstKaldi(*fst, fst_out_str); - delete fst; - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/fstbin/fstisstochastic.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/fstbin/fstisstochastic.cc deleted file mode 100644 index 468ed0daa7d37cb9a25cf25264f86e48e137b975..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/fstbin/fstisstochastic.cc +++ /dev/null @@ -1,91 +0,0 @@ -// fstbin/fstisstochastic.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/kaldi-io.h" -#include "util/parse-options.h" - -// e.g. of test: -// echo " 0 0" | fstcompile | fstisstochastic -// should return 0 and print "0 0" [meaning, min and -// max weight are one = exp(0)] -// echo " 0 1" | fstcompile | fstisstochastic -// should return 1, not stochastic, and print 1 1 -// (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo "1 0" ) | -// fstcompile | fstisstochastic should return 0, stochastic; it prints "0 -// -1.78e-07" for me (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo -// "1 0" ) | fstcompile | fstisstochastic --test-in-log=false should return 1, -// not stochastic in tropical; it prints "0 0.693147" for me (echo "0 0 0 0 0 "; -// echo "0 1 0 0 0 "; echo "1 0" ) | fstcompile | fstisstochastic -// --test-in-log=false should return 0, stochastic in tropical; it prints "0 0" -// for me (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo "1 0" ) | -// fstcompile | fstisstochastic --test-in-log=false --delta=1 returns 0 even -// though not stochastic because we gave it an absurdly large delta. 
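The expected outputs listed in the fstisstochastic examples above follow from a simple per-state check: in the log semiring a state is stochastic when the negative log of its total outgoing probability mass (arcs plus final weight) is within delta of zero, and the tool reports the extreme per-state values. The sketch below is standalone arithmetic inferred from those examples, not the OpenFst implementation, and the real tool's min/max ordering conventions may differ slightly.

```python
import math

def stochasticity_range(arcs, finals, num_states):
    """Per-state -log(total outgoing probability mass), min and max over states.

    arcs   : list of (state, next_state, weight) with weight = -log(prob)
    finals : dict state -> final weight, also a -log probability
    An FST counts as stochastic when both returned values are within delta of 0.
    """
    per_state = []
    for s in range(num_states):
        mass = sum(math.exp(-w) for src, _, w in arcs if src == s)
        if s in finals:
            mass += math.exp(-finals[s])
        per_state.append(-math.log(mass) if mass > 0 else math.inf)
    return min(per_state), max(per_state)

# Third example from the comment above: two arcs of weight 0.693147 (= -log 0.5)
# out of state 0, and a free final weight on state 1.
arcs = [(0, 0, 0.693147), (0, 1, 0.693147)]
print(stochasticity_range(arcs, finals={1: 0.0}, num_states=2))
# -> roughly (-1e-07, 0.0): stochastic to within the default delta
```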
- -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Checks whether an FST is stochastic and exits with success if so.\n" - "Prints out maximum error (in log units).\n" - "\n" - "Usage: fstisstochastic [ in.fst ]\n"; - - float delta = 0.01; - bool test_in_log = true; - - ParseOptions po(usage); - po.Register("delta", &delta, "Maximum error to accept."); - po.Register("test-in-log", &test_in_log, - "Test stochasticity in log semiring."); - po.Read(argc, argv); - - if (po.NumArgs() > 1) { - po.PrintUsage(); - exit(1); - } - - std::string fst_in_filename = po.GetOptArg(1); - - Fst *fst = ReadFstKaldiGeneric(fst_in_filename); - - bool ans; - StdArc::Weight min, max; - if (test_in_log) - ans = IsStochasticFstInLog(*fst, delta, &min, &max); - else - ans = IsStochasticFst(*fst, delta, &min, &max); - - std::cout << min.Value() << " " << max.Value() << '\n'; - delete fst; - if (ans) - return 0; // success; - else - return 1; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/fstbin/fstminimizeencoded.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/fstbin/fstminimizeencoded.cc deleted file mode 100644 index ae9ca6d75abe67d9a195572dd6d91ec3c7b44851..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/fstbin/fstminimizeencoded.cc +++ /dev/null @@ -1,74 +0,0 @@ -// fstbin/fstminimizeencoded.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/determinize-star.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/kaldi-io.h" -#include "util/parse-options.h" -#include "util/text-utils.h" - -/* some test examples: - ( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstminimizeencoded | fstprint - ( echo "0 1 0 0"; echo " 0 2 0 0"; echo "1 0"; echo "2 0"; ) | fstcompile | - fstminimizeencoded | fstprint -*/ - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Minimizes FST after encoding [similar to fstminimize, but no " - "weight-pushing]\n" - "\n" - "Usage: fstminimizeencoded [in.fst [out.fst] ]\n"; - - float delta = kDelta; - ParseOptions po(usage); - po.Register("delta", &delta, - "Delta likelihood used for quantization of weights"); - po.Read(argc, argv); - - if (po.NumArgs() > 2) { - po.PrintUsage(); - exit(1); - } - - std::string fst_in_filename = po.GetOptArg(1), - fst_out_filename = po.GetOptArg(2); - - VectorFst *fst = ReadFstKaldi(fst_in_filename); - - MinimizeEncoded(fst, delta); - - WriteFstKaldi(*fst, fst_out_filename); - - delete fst; - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/fstbin/fsttablecompose.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/fstbin/fsttablecompose.cc deleted file mode 100644 index bdd476da78b8cb8823c60abf33b5278e05bfd92c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/fstbin/fsttablecompose.cc +++ /dev/null @@ -1,133 +0,0 @@ -// fstbin/fsttablecompose.cc - -// Copyright 2009-2011 Microsoft Corporation -// 2013 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "fstext/table-matcher.h" -#include "util/parse-options.h" - -/* - cd ~/tmpdir - while true; do - fstrand | fstarcsort --sort_type=olabel > 1.fst; fstrand | fstarcsort - > 2.fst fstcompose 1.fst 2.fst > 3a.fst fsttablecompose 1.fst 2.fst > 3b.fst - fstequivalent --random=true 3a.fst 3b.fst || echo "Test failed" - echo -n "." - done - -*/ - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - /* - fsttablecompose should always give equivalent results to compose, - but it is more efficient for certain kinds of inputs. 
- In particular, it is useful when, say, the left FST has states - that typically either have epsilon olabels, or - one transition out for each of the possible symbols (as the - olabel). The same with the input symbols of the right-hand FST - is possible. - */ - - const char *usage = - "Composition algorithm [between two FSTs of standard type, in " - "tropical\n" - "semiring] that is more efficient for certain cases-- in particular,\n" - "where one of the FSTs (the left one, if --match-side=left) has large\n" - "out-degree\n" - "\n" - "Usage: fsttablecompose (fst1-rxfilename|fst1-rspecifier) " - "(fst2-rxfilename|fst2-rspecifier) [(out-rxfilename|out-rspecifier)]\n"; - - ParseOptions po(usage); - - TableComposeOptions opts; - std::string match_side = "left"; - std::string compose_filter = "sequence"; - - po.Register("connect", &opts.connect, "If true, trim FST before output."); - po.Register("match-side", &match_side, - "Side of composition to do table " - "match, one of: \"left\" or \"right\"."); - po.Register("compose-filter", &compose_filter, - "Composition filter to use, " - "one of: \"alt_sequence\", \"auto\", \"match\", \"sequence\""); - - po.Read(argc, argv); - - if (match_side == "left") { - opts.table_match_type = MATCH_OUTPUT; - } else if (match_side == "right") { - opts.table_match_type = MATCH_INPUT; - } else { - KALDI_ERR << "Invalid match-side option: " << match_side; - } - - if (compose_filter == "alt_sequence") { - opts.filter_type = ALT_SEQUENCE_FILTER; - } else if (compose_filter == "auto") { - opts.filter_type = AUTO_FILTER; - } else if (compose_filter == "match") { - opts.filter_type = MATCH_FILTER; - } else if (compose_filter == "sequence") { - opts.filter_type = SEQUENCE_FILTER; - } else { - KALDI_ERR << "Invalid compose-filter option: " << compose_filter; - } - - if (po.NumArgs() < 2 || po.NumArgs() > 3) { - po.PrintUsage(); - exit(1); - } - - std::string fst1_in_str = po.GetArg(1), fst2_in_str = po.GetArg(2), - fst_out_str = po.GetOptArg(3); - - VectorFst *fst1 = ReadFstKaldi(fst1_in_str); - - VectorFst *fst2 = ReadFstKaldi(fst2_in_str); - - // Checks if is olabel sorted and is ilabel sorted. - if (fst1->Properties(fst::kOLabelSorted, true) == 0) { - KALDI_WARN << "The first FST is not olabel sorted."; - } - if (fst2->Properties(fst::kILabelSorted, true) == 0) { - KALDI_WARN << "The second FST is not ilabel sorted."; - } - - VectorFst composed_fst; - - TableCompose(*fst1, *fst2, &composed_fst, opts); - - delete fst1; - delete fst2; - - WriteFstKaldi(composed_fst, fst_out_str); - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/fstext/determinize-lattice-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/fstext/determinize-lattice-inl.h deleted file mode 100644 index 0bfbc8f41c7e439b1fac037f60490e04fdcbdd8b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/fstext/determinize-lattice-inl.h +++ /dev/null @@ -1,1357 +0,0 @@ -// fstext/determinize-lattice-inl.h - -// Copyright 2009-2012 Microsoft Corporation -// 2012-2013 Johns Hopkins University (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_FSTEXT_DETERMINIZE_LATTICE_INL_H_ -#define KALDI_FSTEXT_DETERMINIZE_LATTICE_INL_H_ -// Do not include this file directly. It is included by determinize-lattice.h - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace fst { - -// This class maps back and forth from/to integer id's to sequences of strings. -// used in determinization algorithm. It is constructed in such a way that -// finding the string-id of the successor of (string, next-label) has constant -// time. - -// Note: class IntType, typically int32, is the type of the element in the -// string (typically a template argument of the CompactLatticeWeightTpl). - -template -class LatticeStringRepository { - public: - struct Entry { - const Entry *parent; // NULL for empty string. - IntType i; - inline bool operator==(const Entry &other) const { - return (parent == other.parent && i == other.i); - } - Entry() {} - Entry(const Entry &e) : parent(e.parent), i(e.i) {} - }; - // Note: all Entry* pointers returned in function calls are - // owned by the repository itself, not by the caller! - - // Interface guarantees empty string is NULL. - inline const Entry *EmptyString() { return NULL; } - - // Returns string of "parent" with i appended. Pointer - // owned by repository - const Entry *Successor(const Entry *parent, IntType i) { - new_entry_->parent = parent; - new_entry_->i = i; - - std::pair pr = set_.insert(new_entry_); - if (pr.second) { // Was successfully inserted (was not there). We need to - // replace the element we inserted, which resides on the - // stack, with one from the heap. - const Entry *ans = new_entry_; - new_entry_ = new Entry(); - return ans; - } else { // Was not inserted because an equivalent Entry already - // existed. - return *pr.first; - } - } - - const Entry *Concatenate(const Entry *a, const Entry *b) { - if (a == NULL) - return b; - else if (b == NULL) - return a; - std::vector v; - ConvertToVector(b, &v); - const Entry *ans = a; - for (size_t i = 0; i < v.size(); i++) ans = Successor(ans, v[i]); - return ans; - } - const Entry *CommonPrefix(const Entry *a, const Entry *b) { - std::vector a_vec, b_vec; - ConvertToVector(a, &a_vec); - ConvertToVector(b, &b_vec); - const Entry *ans = NULL; - for (size_t i = 0; - i < a_vec.size() && i < b_vec.size() && a_vec[i] == b_vec[i]; i++) - ans = Successor(ans, a_vec[i]); - return ans; - } - - // removes any elements from b that are not part of - // a common prefix with a. - void ReduceToCommonPrefix(const Entry *a, std::vector *b) { - size_t a_size = Size(a), b_size = b->size(); - while (a_size > b_size) { - a = a->parent; - a_size--; - } - if (b_size > a_size) b_size = a_size; - typename std::vector::iterator b_begin = b->begin(); - while (a_size != 0) { - if (a->i != *(b_begin + a_size - 1)) b_size = a_size - 1; - a = a->parent; - a_size--; - } - if (b_size != b->size()) b->resize(b_size); - } - - // removes the first n elements of a. 
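LatticeStringRepository above interns each string as an Entry pointing at its prefix, so appending a symbol (Successor) is a single hash lookup and common prefixes are shared. A dictionary-based analog, using integer ids instead of interned pointers and illustrative names, shows the idea:

```python
class StringRepository:
    """Toy analog of LatticeStringRepository above: strings are interned as
    (parent, last_symbol) entries, so successor() is one dictionary lookup."""

    def __init__(self):
        self._ids = {}           # (parent_id, symbol) -> entry id
        self._entries = [None]   # id -> (parent_id, symbol); id 0 is the empty string

    def successor(self, parent_id, symbol):
        """Id of the string `parent` with `symbol` appended (constant time)."""
        key = (parent_id, symbol)
        if key not in self._ids:
            self._ids[key] = len(self._entries)
            self._entries.append(key)
        return self._ids[key]

    def to_list(self, entry_id):
        out = []
        while entry_id != 0:
            parent_id, symbol = self._entries[entry_id]
            out.append(symbol)
            entry_id = parent_id
        return out[::-1]

    def common_prefix(self, a, b):
        """Longest shared prefix of two interned strings, as a new entry id."""
        ans = 0
        for x, y in zip(self.to_list(a), self.to_list(b)):
            if x != y:
                break
            ans = self.successor(ans, x)
        return ans

repo = StringRepository()
abc = repo.successor(repo.successor(repo.successor(0, 1), 2), 3)   # [1, 2, 3]
abd = repo.successor(repo.successor(repo.successor(0, 1), 2), 4)   # [1, 2, 4]
print(repo.to_list(repo.common_prefix(abc, abd)))                  # [1, 2]
```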
- const Entry *RemovePrefix(const Entry *a, size_t n) { - if (n == 0) return a; - std::vector a_vec; - ConvertToVector(a, &a_vec); - assert(a_vec.size() >= n); - const Entry *ans = NULL; - for (size_t i = n; i < a_vec.size(); i++) ans = Successor(ans, a_vec[i]); - return ans; - } - - // Returns true if a is a prefix of b. If a is prefix of b, - // time taken is |b| - |a|. Else, time taken is |b|. - bool IsPrefixOf(const Entry *a, const Entry *b) const { - if (a == NULL) return true; // empty string prefix of all. - if (a == b) return true; - if (b == NULL) return false; - return IsPrefixOf(a, b->parent); - } - - inline size_t Size(const Entry *entry) const { - size_t ans = 0; - while (entry != NULL) { - ans++; - entry = entry->parent; - } - return ans; - } - - void ConvertToVector(const Entry *entry, std::vector *out) const { - size_t length = Size(entry); - out->resize(length); - if (entry != NULL) { - typename std::vector::reverse_iterator iter = out->rbegin(); - while (entry != NULL) { - *iter = entry->i; - entry = entry->parent; - ++iter; - } - } - } - - const Entry *ConvertFromVector(const std::vector &vec) { - const Entry *e = NULL; - for (size_t i = 0; i < vec.size(); i++) e = Successor(e, vec[i]); - return e; - } - - LatticeStringRepository() { new_entry_ = new Entry; } - - void Destroy() { - for (typename SetType::iterator iter = set_.begin(); iter != set_.end(); - ++iter) - delete *iter; - SetType tmp; - tmp.swap(set_); - if (new_entry_) { - delete new_entry_; - new_entry_ = NULL; - } - } - - // Rebuild will rebuild this object, guaranteeing only - // to preserve the Entry values that are in the vector pointed - // to (this list does not have to be unique). The point of - // this is to save memory. - void Rebuild(const std::vector &to_keep) { - SetType tmp_set; - for (typename std::vector::const_iterator iter = - to_keep.begin(); - iter != to_keep.end(); ++iter) - RebuildHelper(*iter, &tmp_set); - // Now delete all elems not in tmp_set. - for (typename SetType::iterator iter = set_.begin(); iter != set_.end(); - ++iter) { - if (tmp_set.count(*iter) == 0) - delete (*iter); // delete the Entry; not needed. - } - set_.swap(tmp_set); - } - - ~LatticeStringRepository() { Destroy(); } - int32 MemSize() const { - return set_.size() * sizeof(Entry) * 2; // this is a lower bound - // on the size this structure might take. - } - - private: - class EntryKey { // Hash function object. - public: - inline size_t operator()(const Entry *entry) const { - size_t prime = 49109; - return static_cast(entry->i) + - prime * reinterpret_cast(entry->parent); - } - }; - class EntryEqual { - public: - inline bool operator()(const Entry *e1, const Entry *e2) const { - return (*e1 == *e2); - } - }; - typedef std::unordered_set SetType; - - void RebuildHelper(const Entry *to_add, SetType *tmp_set) { - while (true) { - if (to_add == NULL) return; - typename SetType::iterator iter = tmp_set->find(to_add); - if (iter == tmp_set->end()) { // not in tmp_set. - tmp_set->insert(to_add); - to_add = to_add->parent; // and loop. - } else { - return; - } - } - } - - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeStringRepository); - Entry *new_entry_; // We always have a pre-allocated Entry ready to use, - // to avoid unnecessary news and deletes. - SetType set_; -}; - -// class LatticeDeterminizer is templated on the same types that -// CompactLatticeWeight is templated on: the base weight (Weight), typically -// LatticeWeightTpl etc. but could also be e.g. 
TropicalWeight, and the -// IntType, typically int32, used for the output symbols in the compact -// representation of strings [note: the output symbols would usually be -// p.d.f. id's in the anticipated use of this code] It has a special requirement -// on the Weight type: that there should be a Compare function on the weights -// such that Compare(w1, w2) returns -1 if w1 < w2, 0 if w1 == w2, and +1 if w1 -// > w2. This requires that there be a total order on the weights. - -template -class LatticeDeterminizer { - public: - // Output to Gallic acceptor (so the strings go on weights, and there is a 1-1 - // correspondence between our states and the states in ofst. If destroy == - // true, release memory as we go (but we cannot output again). - - typedef CompactLatticeWeightTpl CompactWeight; - typedef ArcTpl - CompactArc; // arc in compact, acceptor form of lattice - typedef ArcTpl Arc; // arc in non-compact version of lattice - - // Output to standard FST with CompactWeightTpl as its weight type - // (the weight stores the original output-symbol strings). If destroy == - // true, release memory as we go (but we cannot output again). - void Output(MutableFst *ofst, bool destroy = true) { - assert(determinized_); - typedef typename Arc::StateId StateId; - StateId nStates = static_cast(output_arcs_.size()); - if (destroy) FreeMostMemory(); - ofst->DeleteStates(); - ofst->SetStart(kNoStateId); - if (nStates == 0) { - return; - } - for (StateId s = 0; s < nStates; s++) { - OutputStateId news = ofst->AddState(); - assert(news == s); - } - ofst->SetStart(0); - // now process transitions. - for (StateId this_state = 0; this_state < nStates; this_state++) { - std::vector &this_vec(output_arcs_[this_state]); - typename std::vector::const_iterator iter = this_vec.begin(), - end = this_vec.end(); - - for (; iter != end; ++iter) { - const TempArc &temp_arc(*iter); - CompactArc new_arc; - std::vector is not treated as epsilon, create a common end state for - // all transitions accepting the , since they do not back off. This small - // optimization saves about 2% states in an average grammar. - if (sub_eps_ == 0) { - eos_state_ = fst_->AddState(); - fst_->SetFinal(eos_state_, 0); - } -} - -template -void ArpaLmCompilerImpl::ConsumeNGram(const NGram& ngram, - bool is_highest) { - // Generally, we do the following. Suppose we are adding an n-gram "A B - // C". Then find the node for "A B", add a new node for "A B C", and connect - // them with the arc accepting "C" with the specified weight. Also, add a - // backoff arc from the new "A B C" node to its backoff state "B C". - // - // Two notable exceptions are the highest order n-grams, and final n-grams. - // - // When adding a highest order n-gram (e. g., our "A B C" is in a 3-gram LM), - // the following optimization is performed. There is no point adding a node - // for "A B C" with a "C" arc from "A B", since there will be no other - // arcs ingoing to this node, and an epsilon backoff arc into the backoff - // model "B C", with the weight of \bar{1}. To save a node, create an arc - // accepting "C" directly from "A B" to "B C". This saves as many nodes - // as there are the highest order n-grams, which is typically about half - // the size of a large 3-gram model. - // - // Indeed, this does not apply to n-grams ending in EOS, since they do not - // back off. These are special, as they do not have a back-off state, and - // the node for "(..anything..) " is always final. 
These are handled - // in one of the two possible ways, If symbols and are being - // replaced by epsilons, neither node nor arc is created, and the logprob - // of the n-gram is applied to its source node as final weight. If and - // are preserved, then a special final node for is allocated and - // used as the destination of the "" acceptor arc. - HistKey heads(ngram.words.begin(), ngram.words.end() - 1); - typename HistoryMap::iterator source_it = history_.find(heads); - if (source_it == history_.end()) { - // There was no "A B", therefore the probability of "A B C" is zero. - // Print a warning and discard current n-gram. - if (parent_->ShouldWarn()) - KALDI_WARN << parent_->LineReference() - << " skipped: no parent (n-1)-gram exists"; - return; - } - - StateId source = source_it->second; - StateId dest; - Symbol sym = ngram.words.back(); - float weight = -ngram.logprob; - if (sym == sub_eps_ || sym == 0) { - KALDI_ERR << " or disambiguation symbol " << sym - << "found in the ARPA file. "; - } - if (sym == eos_symbol_) { - if (sub_eps_ == 0) { - // Keep as a real symbol when not substituting. - dest = eos_state_; - } else { - // Treat as if it was epsilon: mark source final, with the weight - // of the n-gram. - fst_->SetFinal(source, weight); - return; - } - } else { - // For the highest order n-gram, this may find an existing state, for - // non-highest, will create one (unless there are duplicate n-grams - // in the grammar, which cannot be reliably detected if highest order, - // so we better do not do that at all). - dest = AddStateWithBackoff( - HistKey(ngram.words.begin() + (is_highest ? 1 : 0), ngram.words.end()), - -ngram.backoff); - } - - if (sym == bos_symbol_) { - weight = 0; // Accepting is always free. - if (sub_eps_ == 0) { - // is as a real symbol, only accepted in the start state. - source = fst_->AddState(); - fst_->SetStart(source); - } else { - // The new state for unigram history *is* the start state. - fst_->SetStart(dest); - return; - } - } - - // Add arc from source to dest, whichever way it was found. - fst_->AddArc(source, fst::StdArc(sym, sym, weight, dest)); - return; -} - -// Find or create a new state for n-gram defined by key, and ensure it has a -// backoff transition. The key is either the current n-gram for all but -// highest orders, or the tails of the n-gram for the highest order. The -// latter arises from the chain-collapsing optimization described above. -template -StateId ArpaLmCompilerImpl::AddStateWithBackoff(HistKey key, - float backoff) { - typename HistoryMap::iterator dest_it = history_.find(key); - if (dest_it != history_.end()) { - // Found an existing state in the history map. Invariant: if the state in - // the map, then its backoff arc is in the FST. We are done. - return dest_it->second; - } - // Otherwise create a new state and its backoff arc, and register in the map. - StateId dest = fst_->AddState(); - history_[key] = dest; - CreateBackoff(key.Tails(), dest, backoff); - return dest; -} - -// Create a backoff arc for a state. Key is a backoff destination that may or -// may not exist. When the destination is not found, naturally fall back to -// the lower order model, and all the way down until one is found (since the -// 0-gram model is always present, the search is guaranteed to terminate). 
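The long comment above explains the core of ConsumeNGram()/AddStateWithBackoff(): the n-gram's history picks the source state, the n-gram itself gets (or creates) a destination state with a backoff arc to a shortened history, and a word arc connects the two. The toy builder below sketches that flow over plain tuples with made-up weights; the sentence start/end special cases and the highest-order chain-collapsing optimization described above are deliberately omitted.

```python
def build_backoff_fst(ngrams):
    """Toy version of the ConsumeNGram()/AddStateWithBackoff() flow above.

    ngrams: list of (words, logprob, backoff) tuples, lowest order first,
    e.g. (("a",), -0.7, -0.3) for a unigram.  States are keyed by word
    history; backoff arcs carry the word None.
    """
    states = {(): 0}                 # empty history = the always-present 0-gram state
    arcs = []                        # (src_state, dst_state, word, cost)

    def state_with_backoff(hist, backoff_logprob):
        if hist in states:
            return states[hist]
        states[hist] = s = len(states)
        # Back off to progressively shorter histories until one exists;
        # the 0-gram state () is always there, so this terminates.
        tail = hist[1:]
        while tail not in states:
            tail = tail[1:]
        arcs.append((s, states[tail], None, -backoff_logprob))
        return s

    for words, logprob, backoff in ngrams:
        hist = words[:-1]
        if hist not in states:       # no parent (n-1)-gram: skip, as the compiler does
            continue
        src = states[hist]
        dst = state_with_backoff(words, backoff)
        arcs.append((src, dst, words[-1], -logprob))
    return states, arcs

# Unigrams "a" and "b", plus the bigram "a b".
states, arcs = build_backoff_fst([(("a",), -0.7, -0.3),
                                  (("b",), -1.0, -0.2),
                                  (("a", "b"), -0.4, 0.0)])
print(states)   # {(): 0, ('a',): 1, ('b',): 2, ('a', 'b'): 3}
print(arcs)     # word arcs out of states 0 and 1, backoff arcs into shorter histories
```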
-template -inline void ArpaLmCompilerImpl::CreateBackoff(HistKey key, - StateId state, - float weight) { - typename HistoryMap::iterator dest_it = history_.find(key); - while (dest_it == history_.end()) { - key = key.Tails(); - dest_it = history_.find(key); - } - - // The arc should transduce either or #0 to , depending on the - // epsilon substitution mode. This is the only case when input and output - // label may differ. - fst_->AddArc(state, fst::StdArc(sub_eps_, 0, weight, dest_it->second)); -} - -ArpaLmCompiler::~ArpaLmCompiler() { - if (impl_ != NULL) delete impl_; -} - -void ArpaLmCompiler::HeaderAvailable() { - KALDI_ASSERT(impl_ == NULL); - // Use optimized implementation if the grammar is 4-gram or less, and the - // maximum attained symbol id will fit into the optimized range. - int64 max_symbol = 0; - if (Symbols() != NULL) max_symbol = Symbols()->AvailableKey() - 1; - // If augmenting the symbol table, assume the worst case when all words in - // the model being read are novel. - if (Options().oov_handling == ArpaParseOptions::kAddToSymbols) - max_symbol += NgramCounts()[0]; - - if (NgramCounts().size() <= 4 && max_symbol < OptimizedHistKey::kMaxData) { - impl_ = new ArpaLmCompilerImpl(this, &fst_, sub_eps_); - } else { - impl_ = new ArpaLmCompilerImpl(this, &fst_, sub_eps_); - KALDI_LOG << "Reverting to slower state tracking because model is large: " - << NgramCounts().size() << "-gram with symbols up to " - << max_symbol; - } -} - -void ArpaLmCompiler::ConsumeNGram(const NGram& ngram) { - // is invalid in tails, in heads of an n-gram. - for (int i = 0; i < ngram.words.size(); ++i) { - if ((i > 0 && ngram.words[i] == Options().bos_symbol) || - (i + 1 < ngram.words.size() && - ngram.words[i] == Options().eos_symbol)) { - if (ShouldWarn()) - KALDI_WARN << LineReference() - << " skipped: n-gram has invalid BOS/EOS placement"; - return; - } - } - - bool is_highest = ngram.words.size() == NgramCounts().size(); - impl_->ConsumeNGram(ngram, is_highest); -} - -void ArpaLmCompiler::RemoveRedundantStates() { - fst::StdArc::Label backoff_symbol = sub_eps_; - if (backoff_symbol == 0) { - // The method of removing redundant states implemented in this function - // leads to slow determinization of L o G when people use the older style of - // usage of arpa2fst where the --disambig-symbol option was not specified. - // The issue seems to be that it creates a non-deterministic FST, while G is - // supposed to be deterministic. By 'return'ing below, we just disable this - // method if people were using an older script. This method isn't really - // that consequential anyway, and people will move to the newer-style - // scripts (see current utils/format_lm.sh), so this isn't much of a - // problem. - return; - } - - fst::StdArc::StateId num_states = fst_.NumStates(); - - // replace the #0 symbols on the input of arcs out of redundant states (states - // that are not final and have only a backoff arc leaving them), with . 
- for (fst::StdArc::StateId state = 0; state < num_states; state++) { - if (fst_.NumArcs(state) == 1 && - fst_.Final(state) == fst::TropicalWeight::Zero()) { - fst::MutableArcIterator iter(&fst_, state); - fst::StdArc arc = iter.Value(); - if (arc.ilabel == backoff_symbol) { - arc.ilabel = 0; - iter.SetValue(arc); - } - } - } - - // we could call fst::RemoveEps, and it would have the same effect in normal - // cases, where backoff_symbol != 0 and there are no epsilons in unexpected - // places, but RemoveEpsLocal is a bit safer in case something weird is going - // on; it guarantees not to blow up the FST. - fst::RemoveEpsLocal(&fst_); - KALDI_LOG << "Reduced num-states from " << num_states << " to " - << fst_.NumStates(); -} - -void ArpaLmCompiler::Check() const { - if (fst_.Start() == fst::kNoStateId) { - KALDI_ERR << "Arpa file did not contain the beginning-of-sentence symbol " - << Symbols()->Find(Options().bos_symbol) << "."; - } -} - -void ArpaLmCompiler::ReadComplete() { - fst_.SetInputSymbols(Symbols()); - fst_.SetOutputSymbols(Symbols()); - RemoveRedundantStates(); - Check(); -} - -} // namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/lm/arpa-lm-compiler.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/lm/arpa-lm-compiler.h deleted file mode 100644 index 069c71bd0e6f5acf0b9521ec1ef46796eb31fe4d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/lm/arpa-lm-compiler.h +++ /dev/null @@ -1,63 +0,0 @@ -// lm/arpa-lm-compiler.h - -// Copyright 2009-2011 Gilles Boulianne -// Copyright 2016 Smart Action LLC (kkm) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_LM_ARPA_LM_COMPILER_H_ -#define KALDI_LM_ARPA_LM_COMPILER_H_ - -#include - -#include "lm/arpa-file-parser.h" - -namespace kaldi { - -class ArpaLmCompilerImplInterface; - -class ArpaLmCompiler : public ArpaFileParser { - public: - ArpaLmCompiler(const ArpaParseOptions& options, int sub_eps, - fst::SymbolTable* symbols) - : ArpaFileParser(options, symbols), sub_eps_(sub_eps), impl_(NULL) {} - ~ArpaLmCompiler(); - - const fst::StdVectorFst& Fst() const { return fst_; } - fst::StdVectorFst* MutableFst() { return &fst_; } - - protected: - // ArpaFileParser overrides. - virtual void HeaderAvailable(); - virtual void ConsumeNGram(const NGram& ngram); - virtual void ReadComplete(); - - private: - // this function removes states that only have a backoff arc coming - // out of them. - void RemoveRedundantStates(); - void Check() const; - - int sub_eps_; - ArpaLmCompilerImplInterface* impl_; // Owned. 
- fst::StdVectorFst fst_; - template - friend class ArpaLmCompilerImpl; -}; - -} // namespace kaldi - -#endif // KALDI_LM_ARPA_LM_COMPILER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/lmbin/arpa2fst.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/lmbin/arpa2fst.cc deleted file mode 100644 index 881a45c5b37810247ea38dae56237f59b5554a9c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/lmbin/arpa2fst.cc +++ /dev/null @@ -1,145 +0,0 @@ -// bin/arpa2fst.cc -// -// Copyright 2009-2011 Gilles Boulianne. -// -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABILITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "lm/arpa-lm-compiler.h" -#include "util/kaldi-io.h" -#include "util/parse-options.h" - -int main(int argc, char *argv[]) { - using namespace kaldi; // NOLINT - try { - const char *usage = - "Convert an ARPA format language model into an FST\n" - "Usage: arpa2fst [opts] \n" - " e.g.: arpa2fst --disambig-symbol=#0 --read-symbol-table=" - "data/lang/words.txt lm/input.arpa G.fst\n\n" - "Note: When called without switches, the output G.fst will contain\n" - "an embedded symbol table. This is compatible with the way a previous\n" - "version of arpa2fst worked.\n"; - - ParseOptions po(usage); - - ArpaParseOptions options; - options.Register(&po); - - // Option flags. - std::string bos_symbol = ""; - std::string eos_symbol = ""; - std::string disambig_symbol; - std::string read_syms_filename; - std::string write_syms_filename; - bool keep_symbols = false; - bool ilabel_sort = true; - - po.Register("bos-symbol", &bos_symbol, "Beginning of sentence symbol"); - po.Register("eos-symbol", &eos_symbol, "End of sentence symbol"); - po.Register("disambig-symbol", &disambig_symbol, - "Disambiguator. If provided (e. g. #0), used on input side of " - "backoff links, and and are replaced with epsilons"); - po.Register("read-symbol-table", &read_syms_filename, - "Use existing symbol table"); - po.Register("write-symbol-table", &write_syms_filename, - "Write generated symbol table to a file"); - po.Register("keep-symbols", &keep_symbols, - "Store symbol table with FST. Symbols always saved to FST if " - "symbol tables are neither read or written (otherwise symbols " - "would be lost entirely)"); - po.Register("ilabel-sort", &ilabel_sort, "Ilabel-sort the output FST"); - - po.Read(argc, argv); - - if (po.NumArgs() != 1 && po.NumArgs() != 2) { - po.PrintUsage(); - exit(1); - } - std::string arpa_rxfilename = po.GetArg(1), - fst_wxfilename = po.GetOptArg(2); - - int64 disambig_symbol_id = 0; - - fst::SymbolTable *symbols; - if (!read_syms_filename.empty()) { - // Use existing symbols. Required symbols must be in the table. 
- kaldi::Input kisym(read_syms_filename); - symbols = fst::SymbolTable::ReadText( - kisym.Stream(), PrintableWxfilename(read_syms_filename)); - if (symbols == NULL) - KALDI_ERR << "Could not read symbol table from file " - << read_syms_filename; - - options.oov_handling = ArpaParseOptions::kSkipNGram; - if (!disambig_symbol.empty()) { - disambig_symbol_id = symbols->Find(disambig_symbol); - if (disambig_symbol_id == -1) // fst::kNoSymbol - KALDI_ERR << "Symbol table " << read_syms_filename - << " has no symbol for " << disambig_symbol; - } - } else { - // Create a new symbol table and populate it from ARPA file. - symbols = new fst::SymbolTable(PrintableWxfilename(fst_wxfilename)); - options.oov_handling = ArpaParseOptions::kAddToSymbols; - symbols->AddSymbol("", 0); - if (!disambig_symbol.empty()) { - disambig_symbol_id = symbols->AddSymbol(disambig_symbol); - } - } - - // Add or use existing BOS and EOS. - options.bos_symbol = symbols->AddSymbol(bos_symbol); - options.eos_symbol = symbols->AddSymbol(eos_symbol); - - // If producing new (not reading existing) symbols and not saving them, - // need to keep symbols with FST, otherwise they would be lost. - if (read_syms_filename.empty() && write_syms_filename.empty()) - keep_symbols = true; - - // Actually compile LM. - KALDI_ASSERT(symbols != NULL); - ArpaLmCompiler lm_compiler(options, disambig_symbol_id, symbols); - { - Input ki(arpa_rxfilename); - lm_compiler.Read(ki.Stream()); - } - - // Sort the FST in-place if requested by options. - if (ilabel_sort) { - fst::ArcSort(lm_compiler.MutableFst(), fst::StdILabelCompare()); - } - - // Write symbols if requested. - if (!write_syms_filename.empty()) { - kaldi::Output kosym(write_syms_filename, false); - symbols->WriteText(kosym.Stream()); - } - - // Write LM FST. - bool write_binary = true, write_header = false; - kaldi::Output kofst(fst_wxfilename, write_binary, write_header); - fst::FstWriteOptions wopts(PrintableWxfilename(fst_wxfilename)); - wopts.write_isymbols = wopts.write_osymbols = keep_symbols; - lm_compiler.Fst().Write(kofst.Stream(), wopts); - - delete symbols; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/basic-filebuf.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/basic-filebuf.h deleted file mode 100644 index 22ec891064d5955c8b1d255e0d34781a9f505a38..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/basic-filebuf.h +++ /dev/null @@ -1,952 +0,0 @@ -/////////////////////////////////////////////////////////////////////////////// -// This is a modified version of the std::basic_filebuf from libc++ -// Copyright 20XX LLVM -// (http://libcxx.llvm.org/). -// It allows one to create basic_filebuf from an existing FILE* handle or file -// descriptor. -// -// This file is dual licensed under the MIT and the University of Illinois Open -// Source License licenses. See LICENSE.TXT for details (included at the -// bottom). 
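As the header comment above notes, the point of this class (unlike std::basic_filebuf) is that it can be attached to an already-open FILE* handle or file descriptor. A minimal usage sketch against the interface declared below; the file path is a placeholder, and in this implementation close() also closes the wrapped handle.

```cpp
// Minimal sketch: wrap an existing FILE* in kaldi::basic_filebuf so it can be
// read through a std::istream. The path is hypothetical.
#include <cstdio>
#include <istream>
#include <string>
#include "util/basic-filebuf.h"

bool ReadFirstLine(std::string *line) {
  FILE *f = std::fopen("/tmp/example.txt", "r");  // hypothetical path
  if (f == NULL) return false;

  kaldi::basic_filebuf<char> buf;
  buf.open(f, std::ios_base::in);   // attach to the existing handle
  std::istream is(&buf);            // any streambuf-based API works from here

  bool ok = static_cast<bool>(std::getline(is, *line));
  buf.close();                      // close() fcloses the wrapped FILE* as well
  return ok;
}
```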
-/////////////////////////////////////////////////////////////////////////////// -#ifndef KALDI_UTIL_BASIC_FILEBUF_H_ -#define KALDI_UTIL_BASIC_FILEBUF_H_ - -/////////////////////////////////////////////////////////////////////////////// -#include -#include -#include -#include -#include -#include -#include - -/////////////////////////////////////////////////////////////////////////////// -namespace kaldi { -/////////////////////////////////////////////////////////////////////////////// -template > -class basic_filebuf : public std::basic_streambuf { - public: - typedef CharT char_type; - typedef Traits traits_type; - typedef typename traits_type::int_type int_type; - typedef typename traits_type::pos_type pos_type; - typedef typename traits_type::off_type off_type; - typedef typename traits_type::state_type state_type; - - basic_filebuf(); - basic_filebuf(basic_filebuf&& rhs); - virtual ~basic_filebuf(); - - basic_filebuf& operator=(basic_filebuf&& rhs); - void swap(basic_filebuf& rhs); - - bool is_open() const; - basic_filebuf* open(const char* s, std::ios_base::openmode mode); - basic_filebuf* open(const std::string& s, std::ios_base::openmode mode); - basic_filebuf* open(int fd, std::ios_base::openmode mode); - basic_filebuf* open(FILE* f, std::ios_base::openmode mode); - basic_filebuf* close(); - - FILE* file() { return this->_M_file; } - int fd() { return fileno(this->_M_file); } - - protected: - int_type underflow() override; - int_type pbackfail(int_type c = traits_type::eof()) override; - int_type overflow(int_type c = traits_type::eof()) override; - std::basic_streambuf* setbuf( - char_type* s, std::streamsize n) override; - pos_type seekoff(off_type off, std::ios_base::seekdir way, - std::ios_base::openmode wch = std::ios_base::in | - std::ios_base::out) override; - pos_type seekpos(pos_type sp, - std::ios_base::openmode wch = std::ios_base::in | - std::ios_base::out) override; - int sync() override; - void imbue(const std::locale& loc) override; - - protected: - char* _M_extbuf; - const char* _M_extbufnext; - const char* _M_extbufend; - char _M_extbuf_min[8]; - size_t _M_ebs; - char_type* _M_intbuf; - size_t _M_ibs; - FILE* _M_file; - const std::codecvt* _M_cv; - state_type _M_st; - state_type _M_st_last; - std::ios_base::openmode _M_om; - std::ios_base::openmode _M_cm; - bool _M_owns_eb; - bool _M_owns_ib; - bool _M_always_noconv; - - const char* _M_get_mode(std::ios_base::openmode mode); - bool _M_read_mode(); - void _M_write_mode(); -}; - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf::basic_filebuf() - : _M_extbuf(nullptr), - _M_extbufnext(nullptr), - _M_extbufend(nullptr), - _M_ebs(0), - _M_intbuf(nullptr), - _M_ibs(0), - _M_file(nullptr), - _M_cv(nullptr), - _M_st(), - _M_st_last(), - _M_om(std::ios_base::openmode(0)), - _M_cm(std::ios_base::openmode(0)), - _M_owns_eb(false), - _M_owns_ib(false), - _M_always_noconv(false) { - if (std::has_facet >( - this->getloc())) { - _M_cv = &std::use_facet >( - this->getloc()); - _M_always_noconv = _M_cv->always_noconv(); - } - setbuf(0, 4096); -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf::basic_filebuf(basic_filebuf&& rhs) - : std::basic_streambuf(rhs) { - if (rhs._M_extbuf == rhs._M_extbuf_min) { - _M_extbuf = _M_extbuf_min; - _M_extbufnext = _M_extbuf + (rhs._M_extbufnext - rhs._M_extbuf); - _M_extbufend = _M_extbuf + (rhs._M_extbufend - rhs._M_extbuf); - } else { - _M_extbuf = rhs._M_extbuf; - _M_extbufnext = 
rhs._M_extbufnext; - _M_extbufend = rhs._M_extbufend; - } - _M_ebs = rhs._M_ebs; - _M_intbuf = rhs._M_intbuf; - _M_ibs = rhs._M_ibs; - _M_file = rhs._M_file; - _M_cv = rhs._M_cv; - _M_st = rhs._M_st; - _M_st_last = rhs._M_st_last; - _M_om = rhs._M_om; - _M_cm = rhs._M_cm; - _M_owns_eb = rhs._M_owns_eb; - _M_owns_ib = rhs._M_owns_ib; - _M_always_noconv = rhs._M_always_noconv; - if (rhs.pbase()) { - if (rhs.pbase() == rhs._M_intbuf) - this->setp(_M_intbuf, _M_intbuf + (rhs.epptr() - rhs.pbase())); - else - this->setp(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + - (rhs.epptr() - rhs.pbase())); - this->pbump(rhs.pptr() - rhs.pbase()); - } else if (rhs.eback()) { - if (rhs.eback() == rhs._M_intbuf) - this->setg(_M_intbuf, _M_intbuf + (rhs.gptr() - rhs.eback()), - _M_intbuf + (rhs.egptr() - rhs.eback())); - else - this->setg( - reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + (rhs.gptr() - rhs.eback()), - reinterpret_cast(_M_extbuf) + - (rhs.egptr() - rhs.eback())); - } - rhs._M_extbuf = nullptr; - rhs._M_extbufnext = nullptr; - rhs._M_extbufend = nullptr; - rhs._M_ebs = 0; - rhs._M_intbuf = nullptr; - rhs._M_ibs = 0; - rhs._M_file = nullptr; - rhs._M_st = state_type(); - rhs._M_st_last = state_type(); - rhs._M_om = std::ios_base::openmode(0); - rhs._M_cm = std::ios_base::openmode(0); - rhs._M_owns_eb = false; - rhs._M_owns_ib = false; - rhs.setg(0, 0, 0); - rhs.setp(0, 0); -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline basic_filebuf& basic_filebuf::operator=( - basic_filebuf&& rhs) { - close(); - swap(rhs); - return *this; -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf::~basic_filebuf() { - // try - // { - // close(); - // } - // catch (...) 
- // { - // } - if (_M_owns_eb) delete[] _M_extbuf; - if (_M_owns_ib) delete[] _M_intbuf; -} - -/////////////////////////////////////////////////////////////////////////////// -template -void basic_filebuf::swap(basic_filebuf& rhs) { - std::basic_streambuf::swap(rhs); - if (_M_extbuf != _M_extbuf_min && rhs._M_extbuf != rhs._M_extbuf_min) { - std::swap(_M_extbuf, rhs._M_extbuf); - std::swap(_M_extbufnext, rhs._M_extbufnext); - std::swap(_M_extbufend, rhs._M_extbufend); - } else { - ptrdiff_t ln = _M_extbufnext - _M_extbuf; - ptrdiff_t le = _M_extbufend - _M_extbuf; - ptrdiff_t rn = rhs._M_extbufnext - rhs._M_extbuf; - ptrdiff_t re = rhs._M_extbufend - rhs._M_extbuf; - if (_M_extbuf == _M_extbuf_min && rhs._M_extbuf != rhs._M_extbuf_min) { - _M_extbuf = rhs._M_extbuf; - rhs._M_extbuf = rhs._M_extbuf_min; - } else if (_M_extbuf != _M_extbuf_min && - rhs._M_extbuf == rhs._M_extbuf_min) { - rhs._M_extbuf = _M_extbuf; - _M_extbuf = _M_extbuf_min; - } - _M_extbufnext = _M_extbuf + rn; - _M_extbufend = _M_extbuf + re; - rhs._M_extbufnext = rhs._M_extbuf + ln; - rhs._M_extbufend = rhs._M_extbuf + le; - } - std::swap(_M_ebs, rhs._M_ebs); - std::swap(_M_intbuf, rhs._M_intbuf); - std::swap(_M_ibs, rhs._M_ibs); - std::swap(_M_file, rhs._M_file); - std::swap(_M_cv, rhs._M_cv); - std::swap(_M_st, rhs._M_st); - std::swap(_M_st_last, rhs._M_st_last); - std::swap(_M_om, rhs._M_om); - std::swap(_M_cm, rhs._M_cm); - std::swap(_M_owns_eb, rhs._M_owns_eb); - std::swap(_M_owns_ib, rhs._M_owns_ib); - std::swap(_M_always_noconv, rhs._M_always_noconv); - if (this->eback() == reinterpret_cast(rhs._M_extbuf_min)) { - ptrdiff_t n = this->gptr() - this->eback(); - ptrdiff_t e = this->egptr() - this->eback(); - this->setg(reinterpret_cast(_M_extbuf_min), - reinterpret_cast(_M_extbuf_min) + n, - reinterpret_cast(_M_extbuf_min) + e); - } else if (this->pbase() == reinterpret_cast(rhs._M_extbuf_min)) { - ptrdiff_t n = this->pptr() - this->pbase(); - ptrdiff_t e = this->epptr() - this->pbase(); - this->setp(reinterpret_cast(_M_extbuf_min), - reinterpret_cast(_M_extbuf_min) + e); - this->pbump(n); - } - if (rhs.eback() == reinterpret_cast(_M_extbuf_min)) { - ptrdiff_t n = rhs.gptr() - rhs.eback(); - ptrdiff_t e = rhs.egptr() - rhs.eback(); - rhs.setg(reinterpret_cast(rhs._M_extbuf_min), - reinterpret_cast(rhs._M_extbuf_min) + n, - reinterpret_cast(rhs._M_extbuf_min) + e); - } else if (rhs.pbase() == reinterpret_cast(_M_extbuf_min)) { - ptrdiff_t n = rhs.pptr() - rhs.pbase(); - ptrdiff_t e = rhs.epptr() - rhs.pbase(); - rhs.setp(reinterpret_cast(rhs._M_extbuf_min), - reinterpret_cast(rhs._M_extbuf_min) + e); - rhs.pbump(n); - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline void swap(basic_filebuf& x, - basic_filebuf& y) { - x.swap(y); -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline bool basic_filebuf::is_open() const { - return _M_file != nullptr; -} - -/////////////////////////////////////////////////////////////////////////////// -template -const char* basic_filebuf::_M_get_mode( - std::ios_base::openmode mode) { - switch ((mode & ~std::ios_base::ate) | 0) { - case std::ios_base::out: - case std::ios_base::out | std::ios_base::trunc: - return "w"; - case std::ios_base::out | std::ios_base::app: - case std::ios_base::app: - return "a"; - break; - case std::ios_base::in: - return "r"; - case std::ios_base::in | std::ios_base::out: - return "r+"; - case std::ios_base::in | std::ios_base::out | 
std::ios_base::trunc: - return "w+"; - case std::ios_base::in | std::ios_base::out | std::ios_base::app: - case std::ios_base::in | std::ios_base::app: - return "a+"; - case std::ios_base::out | std::ios_base::binary: - case std::ios_base::out | std::ios_base::trunc | std::ios_base::binary: - return "wb"; - case std::ios_base::out | std::ios_base::app | std::ios_base::binary: - case std::ios_base::app | std::ios_base::binary: - return "ab"; - case std::ios_base::in | std::ios_base::binary: - return "rb"; - case std::ios_base::in | std::ios_base::out | std::ios_base::binary: - return "r+b"; - case std::ios_base::in | std::ios_base::out | std::ios_base::trunc | - std::ios_base::binary: - return "w+b"; - case std::ios_base::in | std::ios_base::out | std::ios_base::app | - std::ios_base::binary: - case std::ios_base::in | std::ios_base::app | std::ios_base::binary: - return "a+b"; - default: - return nullptr; - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::open( - const char* s, std::ios_base::openmode mode) { - basic_filebuf* rt = nullptr; - if (_M_file == nullptr) { - const char* md = _M_get_mode(mode); - if (md) { - _M_file = fopen(s, md); - if (_M_file) { - rt = this; - _M_om = mode; - if (mode & std::ios_base::ate) { - if (fseek(_M_file, 0, SEEK_END)) { - fclose(_M_file); - _M_file = nullptr; - rt = nullptr; - } - } - } - } - } - return rt; -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline basic_filebuf* basic_filebuf::open( - const std::string& s, std::ios_base::openmode mode) { - return open(s.c_str(), mode); -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::open( - int fd, std::ios_base::openmode mode) { - const char* md = this->_M_get_mode(mode); - if (md) { - this->_M_file = fdopen(fd, md); - this->_M_om = mode; - return this; - } else { - return nullptr; - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::open( - FILE* f, std::ios_base::openmode mode) { - this->_M_file = f; - this->_M_om = mode; - return this; -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::close() { - basic_filebuf* rt = nullptr; - if (_M_file) { - rt = this; - std::unique_ptr h(_M_file, fclose); - if (sync()) rt = nullptr; - if (fclose(h.release()) == 0) - _M_file = nullptr; - else - rt = nullptr; - } - return rt; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::int_type -basic_filebuf::underflow() { - if (_M_file == nullptr) return traits_type::eof(); - bool initial = _M_read_mode(); - char_type buf; - if (this->gptr() == nullptr) this->setg(&buf, &buf + 1, &buf + 1); - const size_t unget_sz = - initial ? 
0 : std::min((this->egptr() - this->eback()) / 2, 4); - int_type c = traits_type::eof(); - if (this->gptr() == this->egptr()) { - memmove(this->eback(), this->egptr() - unget_sz, - unget_sz * sizeof(char_type)); - if (_M_always_noconv) { - size_t nmemb = - static_cast(this->egptr() - this->eback() - unget_sz); - nmemb = fread(this->eback() + unget_sz, 1, nmemb, _M_file); - if (nmemb != 0) { - this->setg(this->eback(), this->eback() + unget_sz, - this->eback() + unget_sz + nmemb); - c = traits_type::to_int_type(*this->gptr()); - } - } else { - memmove(_M_extbuf, _M_extbufnext, _M_extbufend - _M_extbufnext); - _M_extbufnext = _M_extbuf + (_M_extbufend - _M_extbufnext); - _M_extbufend = - _M_extbuf + - (_M_extbuf == _M_extbuf_min ? sizeof(_M_extbuf_min) : _M_ebs); - size_t nmemb = - std::min(static_cast(_M_ibs - unget_sz), - static_cast(_M_extbufend - _M_extbufnext)); - std::codecvt_base::result r; - _M_st_last = _M_st; - size_t nr = - fread(reinterpret_cast(const_cast(_M_extbufnext)), - 1, nmemb, _M_file); - if (nr != 0) { - if (!_M_cv) throw std::bad_cast(); - _M_extbufend = _M_extbufnext + nr; - char_type* inext; - r = _M_cv->in(_M_st, _M_extbuf, _M_extbufend, _M_extbufnext, - this->eback() + unget_sz, this->eback() + _M_ibs, inext); - if (r == std::codecvt_base::noconv) { - this->setg(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf), - const_cast(_M_extbufend)); - c = traits_type::to_int_type(*this->gptr()); - } else if (inext != this->eback() + unget_sz) { - this->setg(this->eback(), this->eback() + unget_sz, inext); - c = traits_type::to_int_type(*this->gptr()); - } - } - } - } else { - c = traits_type::to_int_type(*this->gptr()); - } - if (this->eback() == &buf) this->setg(0, 0, 0); - return c; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::int_type -basic_filebuf::pbackfail(int_type c) { - if (_M_file && this->eback() < this->gptr()) { - if (traits_type::eq_int_type(c, traits_type::eof())) { - this->gbump(-1); - return traits_type::not_eof(c); - } - if ((_M_om & std::ios_base::out) || - traits_type::eq(traits_type::to_char_type(c), this->gptr()[-1])) { - this->gbump(-1); - *this->gptr() = traits_type::to_char_type(c); - return c; - } - } - return traits_type::eof(); -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::int_type -basic_filebuf::overflow(int_type c) { - if (_M_file == nullptr) return traits_type::eof(); - _M_write_mode(); - char_type buf; - char_type* pb_save = this->pbase(); - char_type* epb_save = this->epptr(); - if (!traits_type::eq_int_type(c, traits_type::eof())) { - if (this->pptr() == nullptr) this->setp(&buf, &buf + 1); - *this->pptr() = traits_type::to_char_type(c); - this->pbump(1); - } - if (this->pptr() != this->pbase()) { - if (_M_always_noconv) { - size_t nmemb = static_cast(this->pptr() - this->pbase()); - if (fwrite(this->pbase(), sizeof(char_type), nmemb, _M_file) != nmemb) - return traits_type::eof(); - } else { - char* extbe = _M_extbuf; - std::codecvt_base::result r; - do { - if (!_M_cv) throw std::bad_cast(); - const char_type* e; - r = _M_cv->out(_M_st, this->pbase(), this->pptr(), e, _M_extbuf, - _M_extbuf + _M_ebs, extbe); - if (e == this->pbase()) return traits_type::eof(); - if (r == std::codecvt_base::noconv) { - size_t nmemb = static_cast(this->pptr() - this->pbase()); - if (fwrite(this->pbase(), 1, nmemb, _M_file) != nmemb) - return traits_type::eof(); - } else if (r == std::codecvt_base::ok 
|| - r == std::codecvt_base::partial) { - size_t nmemb = static_cast(extbe - _M_extbuf); - if (fwrite(_M_extbuf, 1, nmemb, _M_file) != nmemb) - return traits_type::eof(); - if (r == std::codecvt_base::partial) { - this->setp(const_cast(e), this->pptr()); - this->pbump(this->epptr() - this->pbase()); - } - } else { - return traits_type::eof(); - } - } while (r == std::codecvt_base::partial); - } - this->setp(pb_save, epb_save); - } - return traits_type::not_eof(c); -} - -/////////////////////////////////////////////////////////////////////////////// -template -std::basic_streambuf* basic_filebuf::setbuf( - char_type* s, std::streamsize n) { - this->setg(0, 0, 0); - this->setp(0, 0); - if (_M_owns_eb) delete[] _M_extbuf; - if (_M_owns_ib) delete[] _M_intbuf; - _M_ebs = n; - if (_M_ebs > sizeof(_M_extbuf_min)) { - if (_M_always_noconv && s) { - _M_extbuf = reinterpret_cast(s); - _M_owns_eb = false; - } else { - _M_extbuf = new char[_M_ebs]; - _M_owns_eb = true; - } - } else { - _M_extbuf = _M_extbuf_min; - _M_ebs = sizeof(_M_extbuf_min); - _M_owns_eb = false; - } - if (!_M_always_noconv) { - _M_ibs = std::max(n, sizeof(_M_extbuf_min)); - if (s && _M_ibs >= sizeof(_M_extbuf_min)) { - _M_intbuf = s; - _M_owns_ib = false; - } else { - _M_intbuf = new char_type[_M_ibs]; - _M_owns_ib = true; - } - } else { - _M_ibs = 0; - _M_intbuf = 0; - _M_owns_ib = false; - } - return this; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::pos_type -basic_filebuf::seekoff(off_type off, std::ios_base::seekdir way, - std::ios_base::openmode) { - if (!_M_cv) throw std::bad_cast(); - int width = _M_cv->encoding(); - if (_M_file == nullptr || (width <= 0 && off != 0) || sync()) - return pos_type(off_type(-1)); - // width > 0 || off == 0 - int whence; - switch (way) { - case std::ios_base::beg: - whence = SEEK_SET; - break; - case std::ios_base::cur: - whence = SEEK_CUR; - break; - case std::ios_base::end: - whence = SEEK_END; - break; - default: - return pos_type(off_type(-1)); - } -#if _WIN32 - if (fseek(_M_file, width > 0 ? width * off : 0, whence)) - return pos_type(off_type(-1)); - pos_type r = ftell(_M_file); -#else - if (fseeko(_M_file, width > 0 ? 
width * off : 0, whence)) - return pos_type(off_type(-1)); - pos_type r = ftello(_M_file); -#endif - r.state(_M_st); - return r; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::pos_type -basic_filebuf::seekpos(pos_type sp, std::ios_base::openmode) { - if (_M_file == nullptr || sync()) return pos_type(off_type(-1)); -#if _WIN32 - if (fseek(_M_file, sp, SEEK_SET)) return pos_type(off_type(-1)); -#else - if (fseeko(_M_file, sp, SEEK_SET)) return pos_type(off_type(-1)); -#endif - _M_st = sp.state(); - return sp; -} - -/////////////////////////////////////////////////////////////////////////////// -template -int basic_filebuf::sync() { - if (_M_file == nullptr) return 0; - if (!_M_cv) throw std::bad_cast(); - if (_M_cm & std::ios_base::out) { - if (this->pptr() != this->pbase()) - if (overflow() == traits_type::eof()) return -1; - std::codecvt_base::result r; - do { - char* extbe; - r = _M_cv->unshift(_M_st, _M_extbuf, _M_extbuf + _M_ebs, extbe); - size_t nmemb = static_cast(extbe - _M_extbuf); - if (fwrite(_M_extbuf, 1, nmemb, _M_file) != nmemb) return -1; - } while (r == std::codecvt_base::partial); - if (r == std::codecvt_base::error) return -1; - if (fflush(_M_file)) return -1; - } else if (_M_cm & std::ios_base::in) { - off_type c; - state_type state = _M_st_last; - bool update_st = false; - if (_M_always_noconv) { - c = this->egptr() - this->gptr(); - } else { - int width = _M_cv->encoding(); - c = _M_extbufend - _M_extbufnext; - if (width > 0) { - c += width * (this->egptr() - this->gptr()); - } else { - if (this->gptr() != this->egptr()) { - const int off = _M_cv->length(state, _M_extbuf, _M_extbufnext, - this->gptr() - this->eback()); - c += _M_extbufnext - _M_extbuf - off; - update_st = true; - } - } - } -#if _WIN32 - if (fseek(_M_file_, -c, SEEK_CUR)) return -1; -#else - if (fseeko(_M_file, -c, SEEK_CUR)) return -1; -#endif - if (update_st) _M_st = state; - _M_extbufnext = _M_extbufend = _M_extbuf; - this->setg(0, 0, 0); - _M_cm = std::ios_base::openmode(0); - } - return 0; -} - -/////////////////////////////////////////////////////////////////////////////// -template -void basic_filebuf::imbue(const std::locale& loc) { - sync(); - _M_cv = &std::use_facet >(loc); - bool old_anc = _M_always_noconv; - _M_always_noconv = _M_cv->always_noconv(); - if (old_anc != _M_always_noconv) { - this->setg(0, 0, 0); - this->setp(0, 0); - // invariant, char_type is char, else we couldn't get here - // need to dump _M_intbuf - if (_M_always_noconv) { - if (_M_owns_eb) delete[] _M_extbuf; - _M_owns_eb = _M_owns_ib; - _M_ebs = _M_ibs; - _M_extbuf = reinterpret_cast(_M_intbuf); - _M_ibs = 0; - _M_intbuf = nullptr; - _M_owns_ib = false; - } else { // need to obtain an _M_intbuf. 
- // If _M_extbuf is user-supplied, use it, else new _M_intbuf - if (!_M_owns_eb && _M_extbuf != _M_extbuf_min) { - _M_ibs = _M_ebs; - _M_intbuf = reinterpret_cast(_M_extbuf); - _M_owns_ib = false; - _M_extbuf = new char[_M_ebs]; - _M_owns_eb = true; - } else { - _M_ibs = _M_ebs; - _M_intbuf = new char_type[_M_ibs]; - _M_owns_ib = true; - } - } - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -bool basic_filebuf::_M_read_mode() { - if (!(_M_cm & std::ios_base::in)) { - this->setp(0, 0); - if (_M_always_noconv) - this->setg(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + _M_ebs, - reinterpret_cast(_M_extbuf) + _M_ebs); - else - this->setg(_M_intbuf, _M_intbuf + _M_ibs, _M_intbuf + _M_ibs); - _M_cm = std::ios_base::in; - return true; - } - return false; -} - -/////////////////////////////////////////////////////////////////////////////// -template -void basic_filebuf::_M_write_mode() { - if (!(_M_cm & std::ios_base::out)) { - this->setg(0, 0, 0); - if (_M_ebs > sizeof(_M_extbuf_min)) { - if (_M_always_noconv) - this->setp(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + (_M_ebs - 1)); - else - this->setp(_M_intbuf, _M_intbuf + (_M_ibs - 1)); - } else { - this->setp(0, 0); - } - _M_cm = std::ios_base::out; - } -} - -/////////////////////////////////////////////////////////////////////////////// -} // namespace kaldi - -/////////////////////////////////////////////////////////////////////////////// -#endif // KALDI_UTIL_BASIC_FILEBUF_H_ - -/////////////////////////////////////////////////////////////////////////////// - -/* - * ============================================================================ - * libc++ License - * ============================================================================ - * - * The libc++ library is dual licensed under both the University of Illinois - * "BSD-Like" license and the MIT license. As a user of this code you may - * choose to use it under either license. As a contributor, you agree to allow - * your code to be used under both. - * - * Full text of the relevant licenses is included below. - * - * ============================================================================ - * - * University of Illinois/NCSA - * Open Source License - * - * Copyright (c) 2009-2014 by the contributors listed in CREDITS.TXT (included - * below) - * - * All rights reserved. - * - * Developed by: - * - * LLVM Team - * - * University of Illinois at Urbana-Champaign - * - * http://llvm.org - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * with the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in the - * documentation and/or other materials provided with the distribution. 
- * - * * Neither the names of the LLVM Team, University of Illinois at - * Urbana-Champaign, nor the names of its contributors may be used to - * endorse or promote products derived from this Software without specific - * prior written permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH - * THE SOFTWARE. - * - * ============================================================================== - * - * Copyright (c) 2009-2014 by the contributors listed in CREDITS.TXT (included - * below) - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - * ============================================================================== - * - * This file is a partial list of people who have contributed to the LLVM/libc++ - * project. If you have contributed a patch or made some other contribution to - * LLVM/libc++, please submit a patch to this file to add yourself, and it will - * be done! - * - * The list is sorted by surname and formatted to allow easy grepping and - * beautification by scripts. The fields are: name (N), email (E), web-address - * (W), PGP key ID and fingerprint (P), description (D), and snail-mail address - * (S). - * - * N: Saleem Abdulrasool - * E: compnerd@compnerd.org - * D: Minor patches and Linux fixes. - * - * N: Dimitry Andric - * E: dimitry@andric.com - * D: Visibility fixes, minor FreeBSD portability patches. - * - * N: Holger Arnold - * E: holgerar@gmail.com - * D: Minor fix. - * - * N: Ruben Van Boxem - * E: vanboxem dot ruben at gmail dot com - * D: Initial Windows patches. - * - * N: David Chisnall - * E: theraven at theravensnest dot org - * D: FreeBSD and Solaris ports, libcxxrt support, some atomics work. - * - * N: Marshall Clow - * E: mclow.lists@gmail.com - * E: marshall@idio.com - * D: C++14 support, patches and bug fixes. - * - * N: Bill Fisher - * E: william.w.fisher@gmail.com - * D: Regex bug fixes. - * - * N: Matthew Dempsky - * E: matthew@dempsky.org - * D: Minor patches and bug fixes. - * - * N: Google Inc. 
- * D: Copyright owner and contributor of the CityHash algorithm - * - * N: Howard Hinnant - * E: hhinnant@apple.com - * D: Architect and primary author of libc++ - * - * N: Hyeon-bin Jeong - * E: tuhertz@gmail.com - * D: Minor patches and bug fixes. - * - * N: Argyrios Kyrtzidis - * E: kyrtzidis@apple.com - * D: Bug fixes. - * - * N: Bruce Mitchener, Jr. - * E: bruce.mitchener@gmail.com - * D: Emscripten-related changes. - * - * N: Michel Morin - * E: mimomorin@gmail.com - * D: Minor patches to is_convertible. - * - * N: Andrew Morrow - * E: andrew.c.morrow@gmail.com - * D: Minor patches and Linux fixes. - * - * N: Arvid Picciani - * E: aep at exys dot org - * D: Minor patches and musl port. - * - * N: Bjorn Reese - * E: breese@users.sourceforge.net - * D: Initial regex prototype - * - * N: Nico Rieck - * E: nico.rieck@gmail.com - * D: Windows fixes - * - * N: Jonathan Sauer - * D: Minor patches, mostly related to constexpr - * - * N: Craig Silverstein - * E: csilvers@google.com - * D: Implemented Cityhash as the string hash function on 64-bit machines - * - * N: Richard Smith - * D: Minor patches. - * - * N: Joerg Sonnenberger - * E: joerg@NetBSD.org - * D: NetBSD port. - * - * N: Stephan Tolksdorf - * E: st@quanttec.com - * D: Minor fix - * - * N: Michael van der Westhuizen - * E: r1mikey at gmail dot com - * - * N: Klaas de Vries - * E: klaas at klaasgaaf dot nl - * D: Minor bug fix. - * - * N: Zhang Xiongpang - * E: zhangxiongpang@gmail.com - * D: Minor patches and bug fixes. - * - * N: Xing Xue - * E: xingxue@ca.ibm.com - * D: AIX port - * - * N: Zhihao Yuan - * E: lichray@gmail.com - * D: Standard compatibility fixes. - * - * N: Jeffrey Yasskin - * E: jyasskin@gmail.com - * E: jyasskin@google.com - * D: Linux fixes. - */ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/const-integer-set-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/const-integer-set-inl.h deleted file mode 100644 index b93846148a3e4595774507f638396ce13393ac0e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/const-integer-set-inl.h +++ /dev/null @@ -1,87 +0,0 @@ -// util/const-integer-set-inl.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_CONST_INTEGER_SET_INL_H_ -#define KALDI_UTIL_CONST_INTEGER_SET_INL_H_ - -// Do not include this file directly. It is included by const-integer-set.h - -namespace kaldi { - -template -void ConstIntegerSet::InitInternal() { - KALDI_ASSERT_IS_INTEGER_TYPE(I); - quick_set_.clear(); // just in case we previously had data. 
- if (slow_set_.size() == 0) { - lowest_member_ = (I)1; - highest_member_ = (I)0; - contiguous_ = false; - quick_ = false; - } else { - lowest_member_ = slow_set_.front(); - highest_member_ = slow_set_.back(); - size_t range = highest_member_ + 1 - lowest_member_; - if (range == slow_set_.size()) { - contiguous_ = true; - quick_ = false; - } else { - contiguous_ = false; - // If it would be more compact to store as bool - if (range < slow_set_.size() * 8 * sizeof(I)) { - // (assuming 1 bit per element)... - quick_set_.resize(range, false); - for (size_t i = 0; i < slow_set_.size(); i++) - quick_set_[slow_set_[i] - lowest_member_] = true; - quick_ = true; - } else { - quick_ = false; - } - } - } -} - -template -int ConstIntegerSet::count(I i) const { - if (i < lowest_member_ || i > highest_member_) { - return 0; - } else { - if (contiguous_) return true; - if (quick_) { - return (quick_set_[i - lowest_member_] ? 1 : 0); - } else { - bool ans = std::binary_search(slow_set_.begin(), slow_set_.end(), i); - return (ans ? 1 : 0); - } - } -} - -template -void ConstIntegerSet::Write(std::ostream &os, bool binary) const { - WriteIntegerVector(os, binary, slow_set_); -} - -template -void ConstIntegerSet::Read(std::istream &is, bool binary) { - ReadIntegerVector(is, binary, &slow_set_); - InitInternal(); -} - -} // end namespace kaldi - -#endif // KALDI_UTIL_CONST_INTEGER_SET_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/const-integer-set.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/const-integer-set.h deleted file mode 100644 index 809a56a7c83804bfaa4badb5e28059734bfcad1e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/const-integer-set.h +++ /dev/null @@ -1,96 +0,0 @@ -// util/const-integer-set.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_CONST_INTEGER_SET_H_ -#define KALDI_UTIL_CONST_INTEGER_SET_H_ -#include -#include -#include -#include -#include -#include "util/stl-utils.h" - -/* ConstIntegerSet is a way to efficiently test whether something is in a - supplied set of integers. It can be initialized from a vector or set, but - never changed after that. It either uses a sorted vector or an array of - bool, depending on the input. It behaves like a const version of an STL set, - with only a subset of the functionality, except all the member functions are - upper-case. - - Note that we could get rid of the member slow_set_, but we'd have to - do more work to implement an iterator type. This would save memory. 
-*/ - -namespace kaldi { - -template -class ConstIntegerSet { - public: - ConstIntegerSet() : lowest_member_(1), highest_member_(0) {} - - void Init(const std::vector &input) { - slow_set_ = input; - SortAndUniq(&slow_set_); - InitInternal(); - } - - void Init(const std::set &input) { - CopySetToVector(input, &slow_set_); - InitInternal(); - } - - explicit ConstIntegerSet(const std::vector &input) : slow_set_(input) { - SortAndUniq(&slow_set_); - InitInternal(); - } - explicit ConstIntegerSet(const std::set &input) { - CopySetToVector(input, &slow_set_); - InitInternal(); - } - explicit ConstIntegerSet(const ConstIntegerSet &other) - : slow_set_(other.slow_set_) { - InitInternal(); - } - - int count(I i) const; // returns 1 or 0. - - typedef typename std::vector::const_iterator iterator; - iterator begin() const { return slow_set_.begin(); } - iterator end() const { return slow_set_.end(); } - size_t size() const { return slow_set_.size(); } - bool empty() const { return slow_set_.empty(); } - - void Write(std::ostream &os, bool binary) const; - void Read(std::istream &is, bool binary); - - private: - I lowest_member_; - I highest_member_; - bool contiguous_; - bool quick_; - std::vector quick_set_; - std::vector slow_set_; - void InitInternal(); -}; - -} // end namespace kaldi - -#include "util/const-integer-set-inl.h" - -#endif // KALDI_UTIL_CONST_INTEGER_SET_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/hash-list-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/hash-list-inl.h deleted file mode 100644 index 063fa7131ec618f0aae9dc30f4edd26c9dcce7fe..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/hash-list-inl.h +++ /dev/null @@ -1,193 +0,0 @@ -// util/hash-list-inl.h - -// Copyright 2009-2011 Microsoft Corporation -// 2013 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_HASH_LIST_INL_H_ -#define KALDI_UTIL_HASH_LIST_INL_H_ - -// Do not include this file directly. It is included by fast-hash.h - -namespace kaldi { - -template -HashList::HashList() { - list_head_ = NULL; - bucket_list_tail_ = static_cast(-1); // invalid. - hash_size_ = 0; - freed_head_ = NULL; -} - -template -void HashList::SetSize(size_t size) { - hash_size_ = size; - KALDI_ASSERT(list_head_ == NULL && - bucket_list_tail_ == - static_cast(-1)); // make sure empty. - if (size > buckets_.size()) buckets_.resize(size, HashBucket(0, NULL)); -} - -template -typename HashList::Elem *HashList::Clear() { - // Clears the hashtable and gives ownership of the currently contained list - // to the user. 
- for (size_t cur_bucket = bucket_list_tail_; - cur_bucket != static_cast(-1); - cur_bucket = buckets_[cur_bucket].prev_bucket) { - buckets_[cur_bucket].last_elem = NULL; // this is how we indicate "empty". - } - bucket_list_tail_ = static_cast(-1); - Elem *ans = list_head_; - list_head_ = NULL; - return ans; -} - -template -const typename HashList::Elem *HashList::GetList() const { - return list_head_; -} - -template -inline void HashList::Delete(Elem *e) { - e->tail = freed_head_; - freed_head_ = e; -} - -template -inline typename HashList::Elem *HashList::Find(I key) { - size_t index = (static_cast(key) % hash_size_); - HashBucket &bucket = buckets_[index]; - if (bucket.last_elem == NULL) { - return NULL; // empty bucket. - } else { - Elem *head = (bucket.prev_bucket == static_cast(-1) - ? list_head_ - : buckets_[bucket.prev_bucket].last_elem->tail), - *tail = bucket.last_elem->tail; - for (Elem *e = head; e != tail; e = e->tail) - if (e->key == key) return e; - return NULL; // Not found. - } -} - -template -inline typename HashList::Elem *HashList::New() { - if (freed_head_) { - Elem *ans = freed_head_; - freed_head_ = freed_head_->tail; - return ans; - } else { - Elem *tmp = new Elem[allocate_block_size_]; - for (size_t i = 0; i + 1 < allocate_block_size_; i++) - tmp[i].tail = tmp + i + 1; - tmp[allocate_block_size_ - 1].tail = NULL; - freed_head_ = tmp; - allocated_.push_back(tmp); - return this->New(); - } -} - -template -HashList::~HashList() { - // First test whether we had any memory leak within the - // HashList, i.e. things for which the user did not call Delete(). - size_t num_in_list = 0, num_allocated = 0; - for (Elem *e = freed_head_; e != NULL; e = e->tail) num_in_list++; - for (size_t i = 0; i < allocated_.size(); i++) { - num_allocated += allocate_block_size_; - delete[] allocated_[i]; - } - if (num_in_list != num_allocated) { - KALDI_WARN << "Possible memory leak: " << num_in_list - << " != " << num_allocated - << ": you might have forgotten to call Delete on " - << "some Elems"; - } -} - -template -inline typename HashList::Elem *HashList::Insert(I key, T val) { - size_t index = (static_cast(key) % hash_size_); - HashBucket &bucket = buckets_[index]; - // Check the element is existing or not. - if (bucket.last_elem != NULL) { - Elem *head = (bucket.prev_bucket == static_cast(-1) - ? list_head_ - : buckets_[bucket.prev_bucket].last_elem->tail), - *tail = bucket.last_elem->tail; - for (Elem *e = head; e != tail; e = e->tail) - if (e->key == key) return e; - } - - // This is a new element. Insert it. - Elem *elem = New(); - elem->key = key; - elem->val = val; - if (bucket.last_elem == NULL) { // Unoccupied bucket. Insert at - // head of bucket list (which is tail of regular list, they go in - // opposite directions). - if (bucket_list_tail_ == static_cast(-1)) { - // list was empty so this is the first elem. - KALDI_ASSERT(list_head_ == NULL); - list_head_ = elem; - } else { - // link in to the chain of Elems - buckets_[bucket_list_tail_].last_elem->tail = elem; - } - elem->tail = NULL; - bucket.last_elem = elem; - bucket.prev_bucket = bucket_list_tail_; - bucket_list_tail_ = index; - } else { - // Already-occupied bucket. Insert at tail of list of elements within - // the bucket. 
- elem->tail = bucket.last_elem->tail; - bucket.last_elem->tail = elem; - bucket.last_elem = elem; - } - return elem; -} - -template -void HashList::InsertMore(I key, T val) { - size_t index = (static_cast(key) % hash_size_); - HashBucket &bucket = buckets_[index]; - Elem *elem = New(); - elem->key = key; - elem->val = val; - - KALDI_ASSERT(bucket.last_elem != NULL); // assume one element is already here - if (bucket.last_elem->key == key) { // standard behavior: add as last element - elem->tail = bucket.last_elem->tail; - bucket.last_elem->tail = elem; - bucket.last_elem = elem; - return; - } - Elem *e = (bucket.prev_bucket == static_cast(-1) - ? list_head_ - : buckets_[bucket.prev_bucket].last_elem->tail); - // find place to insert in linked list - while (e != bucket.last_elem->tail && e->key != key) e = e->tail; - KALDI_ASSERT(e->key == key); // not found? - should not happen - elem->tail = e->tail; - e->tail = elem; -} - -} // end namespace kaldi - -#endif // KALDI_UTIL_HASH_LIST_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/hash-list.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/hash-list.h deleted file mode 100644 index 31cc9bdc4870773475f8c5139539e320746bf5fe..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/hash-list.h +++ /dev/null @@ -1,146 +0,0 @@ -// util/hash-list.h - -// Copyright 2009-2011 Microsoft Corporation -// 2013 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_HASH_LIST_H_ -#define KALDI_UTIL_HASH_LIST_H_ - -#include -#include -#include -#include -#include - -#include "base/kaldi-error.h" - -/* This header provides utilities for a structure that's used in a decoder (but - is quite generic in nature so we implement and test it separately). - Basically it's a singly-linked list, but implemented in such a way that we - can quickly search for elements in the list. We give it a slightly richer - interface than just a hash and a list. The idea is that we want to separate - the hash part and the list part: basically, in the decoder, we want to have a - single hash for the current frame and the next frame, because by the time we - need to access the hash for the next frame we no longer need the hash for the - previous frame. So we have an operation that clears the hash but leaves the - list structure intact. We also control memory management inside this object, - to avoid repeated new's/deletes. - - See hash-list-test.cc for an example of how to use this object. -*/ - -namespace kaldi { - -template -class HashList { - public: - struct Elem { - I key; - T val; - Elem *tail; - }; - - /// Constructor takes no arguments. - /// Call SetSize to inform it of the likely size. 
- HashList(); - - /// Clears the hash and gives the head of the current list to the user; - /// ownership is transferred to the user (the user must call Delete() - /// for each element in the list, at his/her leisure). - Elem *Clear(); - - /// Gives the head of the current list to the user. Ownership retained in the - /// class. Caution: in December 2013 the return type was changed to const - /// Elem* and this function was made const. You may need to change some types - /// of local Elem* variables to const if this produces compilation errors. - const Elem *GetList() const; - - /// Think of this like delete(). It is to be called for each Elem in turn - /// after you "obtained ownership" by doing Clear(). This is not the opposite - /// of. Insert, it is the opposite of New. It's really a memory operation. - inline void Delete(Elem *e); - - /// This should probably not be needed to be called directly by the user. - /// Think of it as opposite - /// to Delete(); - inline Elem *New(); - - /// Find tries to find this element in the current list using the hashtable. - /// It returns NULL if not present. The Elem it returns is not owned by the - /// user, it is part of the internal list owned by this object, but the user - /// is free to modify the "val" element. - inline Elem *Find(I key); - - /// Insert inserts a new element into the hashtable/stored list. - /// Because element keys in a hashtable are unique, this operation checks - /// whether each inserted element has a key equivalent to the one of an - /// element already in the hashtable. If so, the element is not inserted, - /// returning an pointer to this existing element. - inline Elem *Insert(I key, T val); - - /// Insert inserts another element with same key into the hashtable/ - /// stored list. - /// By calling this, the user asserts that one element with that key is - /// already present. - /// We insert it that way, that all elements with the same key - /// follow each other. - /// Find() will return the first one of the elements with the same key. - inline void InsertMore(I key, T val); - - /// SetSize tells the object how many hash buckets to allocate (should - /// typically be at least twice the number of objects we expect to go in the - /// structure, for fastest performance). It must be called while the hash - /// is empty (e.g. after Clear() or after initializing the object, but before - /// adding anything to the hash. - void SetSize(size_t sz); - - /// Returns current number of hash buckets. - inline size_t Size() { return hash_size_; } - - ~HashList(); - - private: - struct HashBucket { - size_t prev_bucket; // index to next bucket (-1 if list tail). Note: - // list of buckets goes in opposite direction to list of Elems. - Elem *last_elem; // pointer to last element in this bucket (NULL if empty) - inline HashBucket(size_t i, Elem *e) : prev_bucket(i), last_elem(e) {} - }; - - Elem *list_head_; // head of currently stored list. - size_t bucket_list_tail_; // tail of list of active hash buckets. - - size_t hash_size_; // number of hash buckets. - - std::vector buckets_; - - Elem *freed_head_; // head of list of currently freed elements. [ready for - // allocation] - - std::vector allocated_; // list of allocated blocks. - - static const size_t allocate_block_size_ = 1024; // Number of Elements to - // allocate in one block. Must be largish so storing allocated_ doesn't - // become a problem. 
-}; - -} // end namespace kaldi - -#include "util/hash-list-inl.h" - -#endif // KALDI_UTIL_HASH_LIST_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/kaldi-io-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/kaldi-io-inl.h deleted file mode 100644 index 8b0c92131c4af2113eb33da6f3cfa9dc4dee83e1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/kaldi-io-inl.h +++ /dev/null @@ -1,40 +0,0 @@ -// util/kaldi-io-inl.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. -#ifndef KALDI_UTIL_KALDI_IO_INL_H_ -#define KALDI_UTIL_KALDI_IO_INL_H_ - -#include - -namespace kaldi { - -bool Input::Open(const std::string &rxfilename, bool *binary) { - return OpenInternal(rxfilename, true, binary); -} - -bool Input::OpenTextMode(const std::string &rxfilename) { - return OpenInternal(rxfilename, false, NULL); -} - -bool Input::IsOpen() { return impl_ != NULL; } - -bool Output::IsOpen() { return impl_ != NULL; } - -} // end namespace kaldi. - -#endif // KALDI_UTIL_KALDI_IO_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/kaldi-io.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/kaldi-io.cc deleted file mode 100644 index 5f8ec4870138df32f6aca9c12383cf3885411741..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/kaldi-io.cc +++ /dev/null @@ -1,898 +0,0 @@ -// util/kaldi-io.cc - -// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
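As a reading aid for the hash-list.h interface removed above (a singly linked list with a per-frame hash index, where Clear() hands the whole list back to the caller and Delete() recycles each element), here is a minimal C++ usage sketch. It only exercises the methods declared in that header; the key/value types and sizes are illustrative, not anything WeNet actually does.

```cpp
// Minimal sketch of the HashList<I, T> interface declared in the deleted
// util/hash-list.h (SetSize, Insert, InsertMore, Find, Clear, Delete).
#include <cstdio>
#include "util/hash-list.h"

void HashListSketch() {
  kaldi::HashList<int, double> h;
  h.SetSize(256);          // roughly 2x the expected element count, while empty
  h.Insert(42, 1.5);       // first element with key 42
  h.InsertMore(42, 2.5);   // InsertMore() assumes key 42 is already present
  if (kaldi::HashList<int, double>::Elem *e = h.Find(42))
    e->val += 1.0;         // Find() returns a modifiable element
  // Clear() empties the hash but transfers the list to the caller, who must
  // hand every element back to the pool via Delete().
  for (kaldi::HashList<int, double>::Elem *e = h.Clear(); e != NULL;) {
    kaldi::HashList<int, double>::Elem *next = e->tail;
    std::printf("key=%d val=%g\n", e->key, e->val);
    h.Delete(e);
    e = next;
  }
}
```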
-#include "util/kaldi-io.h" - -#include -#include -#include - -#include - -#include "base/io-funcs.h" -#include "base/kaldi-math.h" -#include "util/kaldi-pipebuf.h" -#include "util/parse-options.h" -#include "util/text-utils.h" - -#ifdef KALDI_CYGWIN_COMPAT -#include "util/kaldi-cygwin-io-inl.h" -#define MapOsPath(x) MapCygwinPath(x) -#else // KALDI_CYGWIN_COMPAT -#define MapOsPath(x) x -#endif // KALDI_CYGWIN_COMPAT - -#if defined(_MSC_VER) -static FILE *popen(const char *command, const char *mode) { -#ifdef KALDI_CYGWIN_COMPAT - return kaldi::CygwinCompatPopen(command, mode); -#else // KALDI_CYGWIN_COMPAT - return _popen(command, mode); -#endif // KALDI_CYGWIN_COMPAT -} -#endif // _MSC_VER - -namespace kaldi { - -#ifndef _MSC_VER // on VS, we don't need this type. -// could replace basic_pipebuf with stdio_filebuf on some platforms. -// Would mean we could use less of our own code. -typedef basic_pipebuf PipebufType; -#endif -} // namespace kaldi - -namespace kaldi { - -std::string PrintableRxfilename(const std::string &rxfilename) { - if (rxfilename == "" || rxfilename == "-") { - return "standard input"; - } else { - // If this call to Escape later causes compilation issues, - // just replace it with "return rxfilename"; it's only a - // pretty-printing issue. - return ParseOptions::Escape(rxfilename); - } -} - -std::string PrintableWxfilename(const std::string &wxfilename) { - if (wxfilename == "" || wxfilename == "-") { - return "standard output"; - } else { - // If this call to Escape later causes compilation issues, - // just replace it with "return wxfilename"; it's only a - // pretty-printing issue. - return ParseOptions::Escape(wxfilename); - } -} - -OutputType ClassifyWxfilename(const std::string &filename) { - const char *c = filename.c_str(); - size_t length = filename.length(); - char first_char = c[0], - last_char = (length == 0 ? '\0' : c[filename.length() - 1]); - - // if 'filename' is "" or "-", return kStandardOutput. - if (length == 0 || (length == 1 && first_char == '-')) { - return kStandardOutput; - } else if (first_char == '|') { - return kPipeOutput; // An output pipe like "|blah". - } else if (isspace(first_char) || isspace(last_char) || last_char == '|') { - return kNoOutput; // Leading or trailing space: can't interpret this. - // Final '|' would represent an input pipe, not an - // output pipe. - // } else if ((first_char == 'a' || first_char == 's') && - // strchr(c, ':') != NULL && - // (ClassifyWspecifier(filename, NULL, NULL, NULL) != - // kNoWspecifier || - // ClassifyRspecifier(filename, NULL, NULL) != kNoRspecifier)) { - // // e.g. ark:something or scp:something... this is almost certainly a - // // scripting error, so call it an error rather than treating it as a - // file. - // // In practice in modern kaldi scripts all (r,w)filenames begin with - // "ark" - // // or "scp", even though technically speaking options like "b", "t", - // "s" or - // // "cs" can appear before the ark or scp, like "b,ark". For - // efficiency, - // // and because this code is really just a nicety to catch errors - // earlier - // // than they would otherwise be caught, we only call those extra - // functions - // // for filenames beginning with 'a' or 's'. - // return kNoOutput; - } else if (isdigit(last_char)) { - // This could be a file, but we have to see if it's an offset into a file - // (like foo.ark:4314328), which is not allowed for writing (but is - // allowed for reaching). 
This eliminates some things which would be - // valid UNIX filenames but are not allowed by Kaldi. (Even if we allowed - // such filenames for writing, we woudln't be able to correctly read them). - const char *d = c + length - 1; - while (isdigit(*d) && d > c) d--; - if (*d == ':') return kNoOutput; - // else it could still be a filename; continue to the next check. - } - - // At this point it matched no other pattern so we assume a filename, but we - // check for internal '|' as it's a common source of errors to have pipe - // commands without the pipe in the right place. Say that it can't be - // classified. - if (strchr(c, '|') != NULL) { - KALDI_WARN << "Trying to classify wxfilename with pipe symbol in the" - " wrong place (pipe without | at the beginning?): " - << filename; - return kNoOutput; - } - return kFileOutput; // It matched no other pattern: assume it's a filename. -} - -InputType ClassifyRxfilename(const std::string &filename) { - const char *c = filename.c_str(); - size_t length = filename.length(); - char first_char = c[0], - last_char = (length == 0 ? '\0' : c[filename.length() - 1]); - - // if 'filename' is "" or "-", return kStandardInput. - if (length == 0 || (length == 1 && first_char == '-')) { - return kStandardInput; - } else if (first_char == '|') { - return kNoInput; // An output pipe like "|blah": not - // valid for input. - } else if (last_char == '|') { - return kPipeInput; - } else if (isspace(first_char) || isspace(last_char)) { - return kNoInput; // We don't allow leading or trailing space in a filename. - // } else if ((first_char == 'a' || first_char == 's') && - // strchr(c, ':') != NULL && - // (ClassifyWspecifier(filename, NULL, NULL, NULL) != - // kNoWspecifier || - // ClassifyRspecifier(filename, NULL, NULL) != kNoRspecifier)) { - // // e.g. ark:something or scp:something... this is almost certainly a - // // scripting error, so call it an error rather than treating it as a - // file. - // // In practice in modern kaldi scripts all (r,w)filenames begin with - // "ark" - // // or "scp", even though technically speaking options like "b", "t", - // "s" or - // // "cs" can appear before the ark or scp, like "b,ark". For - // efficiency, - // // and because this code is really just a nicety to catch errors - // earlier - // // than they would otherwise be caught, we only call those extra - // functions - // // for filenames beginning with 'a' or 's'. - // return kNoInput; - } else if (isdigit(last_char)) { - const char *d = c + length - 1; - while (isdigit(*d) && d > c) d--; - if (*d == ':') - return kOffsetFileInput; // Filename is like - // some_file:12345 - // otherwise it could still be a filename; continue to the next check. - } - - // At this point it matched no other pattern so we assume a filename, but - // we check for '|' as it's a common source of errors to have pipe - // commands without the pipe in the right place. Say that it can't be - // classified in this case. - if (strchr(c, '|') != NULL) { - KALDI_WARN << "Trying to classify rxfilename with pipe symbol in the" - " wrong place (pipe without | at the end?): " - << filename; - return kNoInput; - } - return kFileInput; // It matched no other pattern: assume it's a filename. -} - -class OutputImplBase { - public: - // Open will open it as a file (no header), and return true - // on success. It cannot be called on an already open stream. 
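ClassifyWxfilename() and ClassifyRxfilename() above encode Kaldi's extended-filename convention: "" or "-" means standard output/input, a leading or trailing '|' marks a pipe, and a trailing ':<digits>' on an rxfilename is a byte offset into a file (and is rejected for writing). A small self-checking sketch of that behaviour follows; the example strings are hypothetical, and the expected enum values are read directly off the logic above.

```cpp
// Illustrative calls to the classifiers defined in the deleted kaldi-io.cc.
#include <cassert>
#include "util/kaldi-io.h"

void ClassifySketch() {
  using namespace kaldi;
  assert(ClassifyWxfilename("-") == kStandardOutput);
  assert(ClassifyWxfilename("| gzip -c > /tmp/a.gz") == kPipeOutput);  // leading '|'
  assert(ClassifyWxfilename("/tmp/out.ark") == kFileOutput);
  assert(ClassifyWxfilename("/tmp/out.ark:1234") == kNoOutput);        // offsets are read-only

  assert(ClassifyRxfilename("") == kStandardInput);
  assert(ClassifyRxfilename("gunzip -c /tmp/a.gz |") == kPipeInput);   // trailing '|'
  assert(ClassifyRxfilename("/tmp/in.ark:1234") == kOffsetFileInput);  // file:byte-offset
  assert(ClassifyRxfilename(" /tmp/in.ark") == kNoInput);              // leading space rejected
}
```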
- virtual bool Open(const std::string &filename, bool binary) = 0; - virtual std::ostream &Stream() = 0; - virtual bool Close() = 0; - virtual ~OutputImplBase() {} -}; - -class FileOutputImpl : public OutputImplBase { - public: - virtual bool Open(const std::string &filename, bool binary) { - if (os_.is_open()) - KALDI_ERR << "FileOutputImpl::Open(), " - << "open called on already open file."; - filename_ = filename; - os_.open(MapOsPath(filename_).c_str(), - binary ? std::ios_base::out | std::ios_base::binary - : std::ios_base::out); - return os_.is_open(); - } - - virtual std::ostream &Stream() { - if (!os_.is_open()) - KALDI_ERR << "FileOutputImpl::Stream(), file is not open."; - // I believe this error can only arise from coding error. - return os_; - } - - virtual bool Close() { - if (!os_.is_open()) - KALDI_ERR << "FileOutputImpl::Close(), file is not open."; - // I believe this error can only arise from coding error. - os_.close(); - return !(os_.fail()); - } - virtual ~FileOutputImpl() { - if (os_.is_open()) { - os_.close(); - if (os_.fail()) KALDI_ERR << "Error closing output file " << filename_; - } - } - - private: - std::string filename_; - std::ofstream os_; -}; - -class StandardOutputImpl : public OutputImplBase { - public: - StandardOutputImpl() : is_open_(false) {} - - virtual bool Open(const std::string &filename, bool binary) { - if (is_open_) - KALDI_ERR << "StandardOutputImpl::Open(), " - "open called on already open file."; -#ifdef _MSC_VER - _setmode(_fileno(stdout), binary ? _O_BINARY : _O_TEXT); -#endif - is_open_ = std::cout.good(); - return is_open_; - } - - virtual std::ostream &Stream() { - if (!is_open_) - KALDI_ERR << "StandardOutputImpl::Stream(), object not initialized."; - // I believe this error can only arise from coding error. - return std::cout; - } - - virtual bool Close() { - if (!is_open_) - KALDI_ERR << "StandardOutputImpl::Close(), file is not open."; - is_open_ = false; - std::cout << std::flush; - return !(std::cout.fail()); - } - virtual ~StandardOutputImpl() { - if (is_open_) { - std::cout << std::flush; - if (std::cout.fail()) KALDI_ERR << "Error writing to standard output"; - } - } - - private: - bool is_open_; -}; - -class PipeOutputImpl : public OutputImplBase { - public: - PipeOutputImpl() : f_(NULL), os_(NULL) {} - - virtual bool Open(const std::string &wxfilename, bool binary) { - filename_ = wxfilename; - KALDI_ASSERT(f_ == NULL); // Make sure closed. - KALDI_ASSERT(wxfilename.length() != 0 && wxfilename[0] == '|'); // should - // start with '|' - std::string cmd_name(wxfilename, 1); -#if defined(_MSC_VER) || defined(__CYGWIN__) - f_ = popen(cmd_name.c_str(), (binary ? "wb" : "w")); -#else - f_ = popen(cmd_name.c_str(), "w"); -#endif - if (!f_) { // Failure. - KALDI_WARN << "Failed opening pipe for writing, command is: " << cmd_name - << ", errno is " << strerror(errno); - return false; - } else { -#ifndef _MSC_VER - fb_ = new PipebufType(f_, // Using this constructor won't make the - // destructor try to close the stream when - // we're done. - (binary ? std::ios_base::out | std::ios_base::binary - : std::ios_base::out)); - KALDI_ASSERT(fb_ != NULL); // or would be alloc error. - os_ = new std::ostream(fb_); -#else - os_ = new std::ofstream(f_); -#endif - return os_->good(); - } - } - - virtual std::ostream &Stream() { - if (os_ == NULL) - KALDI_ERR << "PipeOutputImpl::Stream()," - " object not initialized."; - // I believe this error can only arise from coding error. 
- return *os_; - } - - virtual bool Close() { - if (os_ == NULL) KALDI_ERR << "PipeOutputImpl::Close(), file is not open."; - bool ok = true; - os_->flush(); - if (os_->fail()) ok = false; - delete os_; - os_ = NULL; - int status; -#ifdef _MSC_VER - status = _pclose(f_); -#else - status = pclose(f_); -#endif - if (status) - KALDI_WARN << "Pipe " << filename_ << " had nonzero return status " - << status; - f_ = NULL; -#ifndef _MSC_VER - delete fb_; - fb_ = NULL; -#endif - return ok; - } - virtual ~PipeOutputImpl() { - if (os_) { - if (!Close()) - KALDI_ERR << "Error writing to pipe " << PrintableWxfilename(filename_); - } - } - - private: - std::string filename_; - FILE *f_; -#ifndef _MSC_VER - PipebufType *fb_; -#endif - std::ostream *os_; -}; - -class InputImplBase { - public: - // Open will open it as a file, and return true on success. - // May be called twice only for kOffsetFileInput (otherwise, - // if called twice, we just create a new Input object, to avoid - // having to deal with the extra hassle of reopening with the - // same object. - // Note that we will to call Open with true (binary) for - // for text-mode Kaldi files; the only actual text-mode input - // is for non-Kaldi files. - virtual bool Open(const std::string &filename, bool binary) = 0; - virtual std::istream &Stream() = 0; - virtual int32 Close() = 0; // We only need to check failure in the case of - // kPipeInput. - // on close for input streams. - virtual InputType MyType() = 0; // Because if it's kOffsetFileInput, we may - // call Open twice - // (has efficiency benefits). - - virtual ~InputImplBase() {} -}; - -class FileInputImpl : public InputImplBase { - public: - virtual bool Open(const std::string &filename, bool binary) { - if (is_.is_open()) - KALDI_ERR << "FileInputImpl::Open(), " - << "open called on already open file."; - is_.open( - MapOsPath(filename).c_str(), - binary ? std::ios_base::in | std::ios_base::binary : std::ios_base::in); - return is_.is_open(); - } - - virtual std::istream &Stream() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Stream(), file is not open."; - // I believe this error can only arise from coding error. - return is_; - } - - virtual int32 Close() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Close(), file is not open."; - // I believe this error can only arise from coding error. - is_.close(); - // Don't check status. - return 0; - } - - virtual InputType MyType() { return kFileInput; } - - virtual ~FileInputImpl() { - // Stream will automatically be closed, and we don't care about - // whether it fails. - } - - private: - std::ifstream is_; -}; - -class StandardInputImpl : public InputImplBase { - public: - StandardInputImpl() : is_open_(false) {} - - virtual bool Open(const std::string &filename, bool binary) { - if (is_open_) - KALDI_ERR << "StandardInputImpl::Open(), " - "open called on already open file."; - is_open_ = true; -#ifdef _MSC_VER - _setmode(_fileno(stdin), binary ? _O_BINARY : _O_TEXT); -#endif - return true; // Don't check good() because would be false if - // eof, which may be valid input. - } - - virtual std::istream &Stream() { - if (!is_open_) - KALDI_ERR << "StandardInputImpl::Stream(), object not initialized."; - // I believe this error can only arise from coding error. 
- return std::cin; - } - - virtual InputType MyType() { return kStandardInput; } - - virtual int32 Close() { - if (!is_open_) KALDI_ERR << "StandardInputImpl::Close(), file is not open."; - is_open_ = false; - return 0; - } - virtual ~StandardInputImpl() {} - - private: - bool is_open_; -}; - -class PipeInputImpl : public InputImplBase { - public: - PipeInputImpl() : f_(NULL), is_(NULL) {} - - virtual bool Open(const std::string &rxfilename, bool binary) { - filename_ = rxfilename; - KALDI_ASSERT(f_ == NULL); // Make sure closed. - KALDI_ASSERT(rxfilename.length() != 0 && - rxfilename[rxfilename.length() - 1] == - '|'); // should end with '|' - std::string cmd_name(rxfilename, 0, rxfilename.length() - 1); -#if defined(_MSC_VER) || defined(__CYGWIN__) - f_ = popen(cmd_name.c_str(), (binary ? "rb" : "r")); -#else - f_ = popen(cmd_name.c_str(), "r"); -#endif - - if (!f_) { // Failure. - KALDI_WARN << "Failed opening pipe for reading, command is: " << cmd_name - << ", errno is " << strerror(errno); - return false; - } else { -#ifndef _MSC_VER - fb_ = new PipebufType(f_, // Using this constructor won't lead the - // destructor to close the stream. - (binary ? std::ios_base::in | std::ios_base::binary - : std::ios_base::in)); - KALDI_ASSERT(fb_ != NULL); // or would be alloc error. - is_ = new std::istream(fb_); -#else - is_ = new std::ifstream(f_); -#endif - if (is_->fail() || is_->bad()) return false; - if (is_->eof()) { - KALDI_WARN << "Pipe opened with command " - << PrintableRxfilename(rxfilename) << " is empty."; - // don't return false: empty may be valid. - } - return true; - } - } - - virtual std::istream &Stream() { - if (is_ == NULL) - KALDI_ERR << "PipeInputImpl::Stream(), object not initialized."; - // I believe this error can only arise from coding error. - return *is_; - } - - virtual int32 Close() { - if (is_ == NULL) KALDI_ERR << "PipeInputImpl::Close(), file is not open."; - delete is_; - is_ = NULL; - int32 status; -#ifdef _MSC_VER - status = _pclose(f_); -#else - status = pclose(f_); -#endif - if (status) - KALDI_WARN << "Pipe " << filename_ << " had nonzero return status " - << status; - f_ = NULL; -#ifndef _MSC_VER - delete fb_; - fb_ = NULL; -#endif - return status; - } - virtual ~PipeInputImpl() { - if (is_) Close(); - } - virtual InputType MyType() { return kPipeInput; } - - private: - std::string filename_; - FILE *f_; -#ifndef _MSC_VER - PipebufType *fb_; -#endif - std::istream *is_; -}; - -/* -#else - -// Just have an empty implementation of the pipe input that crashes if -// called. -class PipeInputImpl: public InputImplBase { - public: - PipeInputImpl() { KALDI_ASSERT(0 && "Pipe input not yet supported on this - platform."); } - virtual bool Open(const std::string, bool) { return 0; } - virtual std::istream &Stream() const { return NULL; } - virtual void Close() {} - virtual InputType MyType() { return kPipeInput; } -}; - -#endif -*/ - -class OffsetFileInputImpl : public InputImplBase { - // This class is a bit more complicated than the - - public: - // splits a filename like /my/file:123 into /my/file and the - // number 123. Crashes if not this format. - static void SplitFilename(const std::string &rxfilename, - std::string *filename, size_t *offset) { - size_t pos = rxfilename.find_last_of(':'); - KALDI_ASSERT(pos != std::string::npos); // would indicate error in calling - // code, as the filename is supposed to be of the correct form at this - // point. 
- *filename = std::string(rxfilename, 0, pos); - std::string number(rxfilename, pos + 1); - bool ans = ConvertStringToInteger(number, offset); - if (!ans) - KALDI_ERR << "Cannot get offset from filename " << rxfilename - << " (possibly you compiled in 32-bit and have a >32-bit" - << " byte offset into a file; you'll have to compile 64-bit."; - } - - bool Seek(size_t offset) { - size_t cur_pos = is_.tellg(); - if (cur_pos == offset) { - return true; - } else if (cur_pos < offset && cur_pos + 100 > offset) { - // We're close enough that it may be faster to just - // read that data, rather than seek. - for (size_t i = cur_pos; i < offset; i++) is_.get(); - return (is_.tellg() == std::streampos(offset)); - } - // Try to actually seek. - is_.seekg(offset, std::ios_base::beg); - if (is_.fail()) { // failbit or badbit is set [error happened] - is_.close(); - return false; // failure. - } else { - is_.clear(); // Clear any failure bits (e.g. eof). - return true; // success. - } - } - - // This Open routine is unusual in that it is designed to work even - // if it was already open. This for efficiency when seeking multiple - // times. - virtual bool Open(const std::string &rxfilename, bool binary) { - if (is_.is_open()) { - // We are opening when we have an already-open file. - // We may have to seek within this file, or else close it and - // open a different one. - std::string tmp_filename; - size_t offset; - SplitFilename(rxfilename, &tmp_filename, &offset); - if (tmp_filename == filename_ && binary == binary_) { // Just seek - is_.clear(); // clear fail bit, etc. - return Seek(offset); - } else { - is_.close(); // don't bother checking error status of is_. - filename_ = tmp_filename; - is_.open(MapOsPath(filename_).c_str(), - binary ? std::ios_base::in | std::ios_base::binary - : std::ios_base::in); - if (!is_.is_open()) - return false; - else - return Seek(offset); - } - } else { - size_t offset; - SplitFilename(rxfilename, &filename_, &offset); - binary_ = binary; - is_.open(MapOsPath(filename_).c_str(), - binary ? std::ios_base::in | std::ios_base::binary - : std::ios_base::in); - if (!is_.is_open()) - return false; - else - return Seek(offset); - } - } - - virtual std::istream &Stream() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Stream(), file is not open."; - // I believe this error can only arise from coding error. - return is_; - } - - virtual int32 Close() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Close(), file is not open."; - // I believe this error can only arise from coding error. - is_.close(); - // Don't check status. - return 0; - } - - virtual InputType MyType() { return kOffsetFileInput; } - - virtual ~OffsetFileInputImpl() { - // Stream will automatically be closed, and we don't care about - // whether it fails. - } - - private: - std::string filename_; // the actual filename - bool binary_; // true if was opened in binary mode. - std::ifstream is_; -}; - -Output::Output(const std::string &wxfilename, bool binary, bool write_header) - : impl_(NULL) { - if (!Open(wxfilename, binary, write_header)) { - if (impl_) { - delete impl_; - impl_ = NULL; - } - KALDI_ERR << "Error opening output stream " - << PrintableWxfilename(wxfilename); - } -} - -bool Output::Close() { - if (!impl_) { - return false; // error to call Close if not open. 
- } else { - bool ans = impl_->Close(); - delete impl_; - impl_ = NULL; - return ans; - } -} - -Output::~Output() { - if (impl_) { - bool ok = impl_->Close(); - delete impl_; - impl_ = NULL; - if (!ok) - KALDI_ERR << "Error closing output file " - << PrintableWxfilename(filename_) - << (ClassifyWxfilename(filename_) == kFileOutput - ? " (disk full?)" - : ""); - } -} - -std::ostream &Output::Stream() { // will throw if not open; else returns - // stream. - if (!impl_) KALDI_ERR << "Output::Stream() called but not open."; - return impl_->Stream(); -} - -bool Output::Open(const std::string &wxfn, bool binary, bool header) { - if (IsOpen()) { - if (!Close()) { // Throw here rather than return status, as it's an error - // about something else: if the user wanted to avoid the exception he/she - // could have called Close(). - KALDI_ERR << "Output::Open(), failed to close output stream: " - << PrintableWxfilename(filename_); - } - } - - filename_ = wxfn; - - OutputType type = ClassifyWxfilename(wxfn); - KALDI_ASSERT(impl_ == NULL); - - if (type == kFileOutput) { - impl_ = new FileOutputImpl(); - } else if (type == kStandardOutput) { - impl_ = new StandardOutputImpl(); - } else if (type == kPipeOutput) { - impl_ = new PipeOutputImpl(); - } else { // type == kNoOutput - KALDI_WARN << "Invalid output filename format " - << PrintableWxfilename(wxfn); - return false; - } - if (!impl_->Open(wxfn, binary)) { - delete impl_; - impl_ = NULL; - return false; // failed to open. - } else { // successfully opened it. - if (header) { - InitKaldiOutputStream(impl_->Stream(), binary); - bool ok = impl_->Stream().good(); // still OK? - if (!ok) { - delete impl_; - impl_ = NULL; - return false; - } - return true; - } else { - return true; - } - } -} - -Input::Input(const std::string &rxfilename, bool *binary) : impl_(NULL) { - if (!Open(rxfilename, binary)) { - KALDI_ERR << "Error opening input stream " - << PrintableRxfilename(rxfilename); - } -} - -int32 Input::Close() { - if (impl_) { - int32 ans = impl_->Close(); - delete impl_; - impl_ = NULL; - return ans; - } else { - return 0; - } -} - -bool Input::OpenInternal(const std::string &rxfilename, bool file_binary, - bool *contents_binary) { - InputType type = ClassifyRxfilename(rxfilename); - if (IsOpen()) { - // May have to close the stream first. - if (type == kOffsetFileInput && impl_->MyType() == kOffsetFileInput) { - // We want to use the same object to Open... this is in case - // the files are the same, so we can just seek. - if (!impl_->Open(rxfilename, file_binary)) { // true is binary mode-- - // always open in binary. - delete impl_; - impl_ = NULL; - return false; - } - // read the binary header, if requested. - if (contents_binary != NULL) - return InitKaldiInputStream(impl_->Stream(), contents_binary); - else - return true; - } else { - Close(); - // and fall through to code below which actually opens the file. - } - } - if (type == kFileInput) { - impl_ = new FileInputImpl(); - } else if (type == kStandardInput) { - impl_ = new StandardInputImpl(); - } else if (type == kPipeInput) { - impl_ = new PipeInputImpl(); - } else if (type == kOffsetFileInput) { - impl_ = new OffsetFileInputImpl(); - } else { // type == kNoInput - KALDI_WARN << "Invalid input filename format " - << PrintableRxfilename(rxfilename); - return false; - } - if (!impl_->Open(rxfilename, file_binary)) { // true is binary mode-- - // always read in binary. 
- delete impl_; - impl_ = NULL; - return false; - } - if (contents_binary != NULL) - return InitKaldiInputStream(impl_->Stream(), contents_binary); - else - return true; -} - -Input::~Input() { - if (impl_) Close(); -} - -std::istream &Input::Stream() { - if (!IsOpen()) KALDI_ERR << "Input::Stream(), not open."; - return impl_->Stream(); -} - -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m) { -// if (!filename.empty() && filename[filename.size() - 1] == ']') { -// // This filename seems to have a 'range'... like foo.ark:4312423[20:30]. -// // (the bit in square brackets is the range). -// std::string rxfilename, range; -// if (!ExtractRangeSpecifier(filename, &rxfilename, &range)) { -// KALDI_ERR << "Could not make sense of possible range specifier in -// filename " -// << "while reading matrix: " << filename; -// } -// Matrix temp; -// bool binary_in; -// Input ki(rxfilename, &binary_in); -// temp.Read(ki.Stream(), binary_in); -// if (!ExtractObjectRange(temp, range, m)) { -// KALDI_ERR << "Error extracting range of object: " << filename; -// } -// } else { -// // The normal case, there is no range. -// bool binary_in; -// Input ki(filename, &binary_in); -// m->Read(ki.Stream(), binary_in); -// } -// } -// -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m) { -// if (!filename.empty() && filename[filename.size() - 1] == ']') { -// // This filename seems to have a 'range'... like foo.ark:4312423[20:30]. -// // (the bit in square brackets is the range). -// std::string rxfilename, range; -// if (!ExtractRangeSpecifier(filename, &rxfilename, &range)) { -// KALDI_ERR << "Could not make sense of possible range specifier in -// filename " -// << "while reading matrix: " << filename; -// } -// Matrix temp; -// bool binary_in; -// Input ki(rxfilename, &binary_in); -// temp.Read(ki.Stream(), binary_in); -// if (!ExtractObjectRange(temp, range, m)) { -// KALDI_ERR << "Error extracting range of object: " << filename; -// } -// } else { -// // The normal case, there is no range. -// bool binary_in; -// Input ki(filename, &binary_in); -// m->Read(ki.Stream(), binary_in); -// } -// } - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/kaldi-io.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/kaldi-io.h deleted file mode 100644 index 2175ca8f89ed5f3e3bade26528e924208df692c6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/kaldi-io.h +++ /dev/null @@ -1,266 +0,0 @@ -// util/kaldi-io.h - -// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
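OffsetFileInputImpl above is the one input implementation whose Open() may be called while already open: if the new rxfilename points into the same file (and the binary mode matches), it simply seeks instead of reopening. A sketch of what that looks like through the public Input class, with a hypothetical archive path and offsets:

```cpp
// Illustrative only: two reads from byte offsets into the same (hypothetical)
// archive file; the second Open() on the same Input object only seeks.
#include "util/kaldi-io.h"

void OffsetReadSketch() {
  bool binary = false;
  kaldi::Input ki("/data/feats.ark:1024", &binary);  // kOffsetFileInput
  // ... read the first object from ki.Stream() ...
  ki.Open("/data/feats.ark:90210", &binary);         // same file: seek, no reopen
  // ... read the second object from ki.Stream() ...
  ki.Close();
}
```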
-#ifndef KALDI_UTIL_KALDI_IO_H_ -#define KALDI_UTIL_KALDI_IO_H_ - -#ifdef _MSC_VER -#include -#include -#endif -#include // For isspace. -#include -#include -#include "base/kaldi-common.h" -// #include "matrix/kaldi-matrix.h" - -namespace kaldi { - -class OutputImplBase; // Forward decl; defined in a .cc file -class InputImplBase; // Forward decl; defined in a .cc file - -/// \addtogroup io_group -/// @{ - -// The Output and Input classes handle stream-opening for "extended" filenames -// that include actual files, standard-input/standard-output, pipes, and -// offsets into actual files. They also handle reading and writing the -// binary-mode headers for Kaldi files, where applicable. The classes have -// versions of the Open routines that throw and do not throw, depending whether -// the calling code wants to catch the errors or not; there are also versions -// that write (or do not write) the Kaldi binary-mode header that says if it's -// binary mode. Generally files that contain Kaldi objects will have the header -// on, so we know upon reading them whether they have the header. So you would -// use the OpenWithHeader routines for these (or the constructor); but other -// types of objects (e.g. FSTs) would have files without a header so you would -// use OpenNoHeader. - -// We now document the types of extended filenames that we use. -// -// A "wxfilename" is an extended filename for writing. It can take three forms: -// (1) Filename: e.g. "/some/filename", "./a/b/c", "c:\Users\dpovey\My -// Documents\\boo" -// (whatever the actual file-system interprets) -// (2) Standard output: "" or "-" -// (3) A pipe: e.g. "| gzip -c > /tmp/abc.gz" -// -// -// A "rxfilename" is an extended filename for reading. It can take four forms: -// (1) An actual filename, whatever the file-system can read, e.g. "/my/file". -// (2) Standard input: "" or "-" -// (3) A pipe: e.g. "gunzip -c /tmp/abc.gz |" -// (4) An offset into a file, e.g.: "/mnt/blah/data/1.ark:24871" -// [these are created by the Table and TableWriter classes; I may also write -// a program that creates them for arbitrary files] -// - -// Typical usage: -// ... -// bool binary; -// MyObject.Write(Output(some_filename, binary).Stream(), binary); -// -// ... more extensive example: -// { -// Output ko(some_filename, binary); -// MyObject1.Write(ko.Stream(), binary); -// MyObject2.Write(ko.Stream(), binary); -// } - -enum OutputType { kNoOutput, kFileOutput, kStandardOutput, kPipeOutput }; - -/// ClassifyWxfilename interprets filenames as follows: -/// - kNoOutput: invalid filenames (leading or trailing space, things that look -/// like wspecifiers and rspecifiers or like pipes to read from with leading -/// |. -/// - kFileOutput: Normal filenames -/// - kStandardOutput: The empty string or "-", interpreted as standard output -/// - kPipeOutput: pipes, e.g. "| gzip -c > /tmp/abc.gz" -OutputType ClassifyWxfilename(const std::string &wxfilename); - -enum InputType { - kNoInput, - kFileInput, - kStandardInput, - kOffsetFileInput, - kPipeInput -}; - -/// ClassifyRxfilenames interprets filenames for reading as follows: -/// - kNoInput: invalid filenames (leading or trailing space, things that -/// look like wspecifiers and rspecifiers or pipes to write to -/// with trailing |. -/// - kFileInput: normal filenames -/// - kStandardInput: the empty string or "-" -/// - kPipeInput: e.g. "gunzip -c /tmp/abc.gz |" -/// - kOffsetFileInput: offsets into files, e.g. 
/some/filename:12970 -InputType ClassifyRxfilename(const std::string &rxfilename); - -class Output { - public: - // The normal constructor, provided for convenience. - // Equivalent to calling with default constructor then Open() - // with these arguments. - Output(const std::string &filename, bool binary, bool write_header = true); - - Output() : impl_(NULL) {} - - /// This opens the stream, with the given mode (binary or text). It returns - /// true on success and false on failure. However, it will throw if something - /// was already open and could not be closed (to avoid this, call Close() - /// first. if write_header == true and binary == true, it writes the Kaldi - /// binary-mode header ('\0' then 'B'). You may call Open even if it is - /// already open; it will close the existing stream and reopen (however if - /// closing the old stream failed it will throw). - bool Open(const std::string &wxfilename, bool binary, bool write_header); - - inline bool IsOpen(); // return true if we have an open stream. Does not - // imply stream is good for writing. - - std::ostream &Stream(); // will throw if not open; else returns stream. - - // Close closes the stream. Calling Close is never necessary unless you - // want to avoid exceptions being thrown. There are times when calling - // Close will hurt efficiency (basically, when using offsets into files, - // and using the same Input object), - // but most of the time the user won't be doing this directly, it will - // be done in kaldi-table.{h, cc}, so you don't have to worry about it. - bool Close(); - - // This will throw if stream could not be closed (to check error status, - // call Close()). - ~Output(); - - private: - OutputImplBase *impl_; // non-NULL if open. - std::string filename_; - KALDI_DISALLOW_COPY_AND_ASSIGN(Output); -}; - -// bool binary_in; -// Input ki(some_filename, &binary_in); -// MyObject.Read(ki.Stream(), binary_in); -// -// ... more extensive example: -// -// { -// bool binary_in; -// Input ki(some_filename, &binary_in); -// MyObject1.Read(ki.Stream(), &binary_in); -// MyObject2.Write(ki.Stream(), &binary_in); -// } -// Note that to catch errors you need to use try.. catch. -// Input communicates errors by throwing exceptions. - -// Input interprets four kinds of filenames: -// (1) Normal filenames -// (2) The empty string or "-", interpreted as standard output -// (3) A pipe: e.g. "gunzip -c /tmp/abc.gz |" -// (4) Offsets into [real] files, e.g. "/my/filename:12049" -// The last one has no correspondence in Output. - -class Input { - public: - /// The normal constructor. Opens the stream in binary mode. - /// Equivalent to calling the default constructor followed by Open(); then, if - /// binary != NULL, it calls ReadHeader(), putting the output in "binary"; it - /// throws on error. - explicit Input(const std::string &rxfilename, bool *contents_binary = NULL); - - Input() : impl_(NULL) {} - - // Open opens the stream for reading (the mode, where relevant, is binary; use - // OpenTextMode for text-mode, we made this a separate function rather than a - // boolean argument, to avoid confusion with Kaldi's text/binary distinction, - // since reading in the file system's text mode is unusual.) If - // contents_binary != NULL, it reads the binary-mode header and puts it in the - // "binary" variable. Returns true on success. If it returns false it will - // not be open. 
You may call Open even if it is already open; it will close - // the existing stream and reopen (however if closing the old stream failed it - // will throw). - inline bool Open(const std::string &rxfilename, bool *contents_binary = NULL); - - // As Open but (if the file system has text/binary modes) opens in text mode; - // you shouldn't ever have to use this as in Kaldi we read even text files in - // binary mode (and ignore the \r). - inline bool OpenTextMode(const std::string &rxfilename); - - // Return true if currently open for reading and Stream() will - // succeed. Does not guarantee that the stream is good. - inline bool IsOpen(); - - // It is never necessary or helpful to call Close, except if - // you are concerned about to many filehandles being open. - // Close does not throw. It returns the exit code as int32 - // in the case of a pipe [kPipeInput], and always zero otherwise. - int32 Close(); - - // Returns the underlying stream. Throws if !IsOpen() - std::istream &Stream(); - - // Destructor does not throw: input streams may legitimately fail so we - // don't worry about the status when we close them. - ~Input(); - - private: - bool OpenInternal(const std::string &rxfilename, bool file_binary, - bool *contents_binary); - InputImplBase *impl_; - KALDI_DISALLOW_COPY_AND_ASSIGN(Input); -}; - -template -void ReadKaldiObject(const std::string &filename, C *c) { - bool binary_in; - Input ki(filename, &binary_in); - c->Read(ki.Stream(), binary_in); -} - -// Specialize the template for reading matrices, because we want to be able to -// support reading 'ranges' (row and column ranges), like foo.mat[10:20]. -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m); -// -// -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m); - -template -inline void WriteKaldiObject(const C &c, const std::string &filename, - bool binary) { - Output ko(filename, binary); - c.Write(ko.Stream(), binary); -} - -/// PrintableRxfilename turns the rxfilename into a more human-readable -/// form for error reporting, i.e. it does quoting and escaping and -/// replaces "" or "-" with "standard input". -std::string PrintableRxfilename(const std::string &rxfilename); - -/// PrintableWxfilename turns the wxfilename into a more human-readable -/// form for error reporting, i.e. it does quoting and escaping and -/// replaces "" or "-" with "standard output". -std::string PrintableWxfilename(const std::string &wxfilename); - -/// @} - -} // end namespace kaldi. - -#include "util/kaldi-io-inl.h" - -#endif // KALDI_UTIL_KALDI_IO_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/kaldi-pipebuf.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/kaldi-pipebuf.h deleted file mode 100644 index bcee80ccb1a6fa8ce3195483ac144c5ff66d2f89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/kaldi-pipebuf.h +++ /dev/null @@ -1,86 +0,0 @@ -// util/kaldi-pipebuf.h - -// Copyright 2009-2011 Ondrej Glembek - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -/** @file kaldi-pipebuf.h - * This is an Kaldi C++ Library header. - */ - -#ifndef KALDI_UTIL_KALDI_PIPEBUF_H_ -#define KALDI_UTIL_KALDI_PIPEBUF_H_ - -#include -#if !defined(_LIBCPP_VERSION) // libc++ -#include -#else -#include "util/basic-filebuf.h" -#endif - -namespace kaldi { -// This class provides a way to initialize a filebuf with a FILE* pointer -// directly; it will not close the file pointer when it is deleted. -// The C++ standard does not allow implementations of C++ to provide -// this constructor within basic_filebuf, which makes it hard to deal -// with pipes using completely native C++. This is a workaround - -#ifdef _MSC_VER -#elif defined(_LIBCPP_VERSION) // libc++ -template > -class basic_pipebuf : public basic_filebuf { - public: - typedef basic_pipebuf ThisType; - - public: - basic_pipebuf(FILE *fptr, std::ios_base::openmode mode) - : basic_filebuf() { - this->open(fptr, mode); - if (!this->is_open()) { - KALDI_WARN << "Error initializing pipebuf"; // probably indicates - // code error, if the fptr was good. - return; - } - } -}; // class basic_pipebuf -#else -template > -class basic_pipebuf : public std::basic_filebuf { - public: - typedef basic_pipebuf ThisType; - - public: - basic_pipebuf(FILE *fptr, std::ios_base::openmode mode) - : std::basic_filebuf() { - this->_M_file.sys_open(fptr, mode); - if (!this->is_open()) { - KALDI_WARN << "Error initializing pipebuf"; // probably indicates - // code error, if the fptr was good. - return; - } - this->_M_mode = mode; - this->_M_buf_size = BUFSIZ; - this->_M_allocate_internal_buffer(); - this->_M_reading = false; - this->_M_writing = false; - this->_M_set_buffer(-1); - } -}; // class basic_pipebuf -#endif // _MSC_VER - -} // namespace kaldi - -#endif // KALDI_UTIL_KALDI_PIPEBUF_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/parse-options.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/parse-options.cc deleted file mode 100644 index 1f2ef844d28d67ed58d2e0c9d7c7b674e8209df8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/parse-options.cc +++ /dev/null @@ -1,636 +0,0 @@ -// util/parse-options.cc - -// Copyright 2009-2011 Karel Vesely; Microsoft Corporation; -// Saarland University (Author: Arnab Ghoshal); -// Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey); -// Frantisek Skala; Arnab Ghoshal -// Copyright 2013 Tanel Alumae -// -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include - -#include "base/kaldi-common.h" -#include "util/parse-options.h" -#include "util/text-utils.h" - -namespace kaldi { - -ParseOptions::ParseOptions(const std::string &prefix, OptionsItf *other) - : print_args_(false), help_(false), usage_(""), argc_(0), argv_(NULL) { - ParseOptions *po = dynamic_cast(other); - if (po != NULL && po->other_parser_ != NULL) { - // we get here if this constructor is used twice, recursively. - other_parser_ = po->other_parser_; - } else { - other_parser_ = other; - } - if (po != NULL && po->prefix_ != "") { - prefix_ = po->prefix_ + std::string(".") + prefix; - } else { - prefix_ = prefix; - } -} - -void ParseOptions::Register(const std::string &name, bool *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, int32 *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, uint32 *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, float *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, double *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, std::string *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -// old-style, used for registering application-specific parameters -template -void ParseOptions::RegisterTmpl(const std::string &name, T *ptr, - const std::string &doc) { - if (other_parser_ == NULL) { - this->RegisterCommon(name, ptr, doc, false); - } else { - KALDI_ASSERT(prefix_ != "" && - "Cannot use empty prefix when registering with prefix."); - std::string new_name = prefix_ + '.' + name; // name becomes prefix.name - other_parser_->Register(new_name, ptr, doc); - } -} - -// does the common part of the job of registering a parameter -template -void ParseOptions::RegisterCommon(const std::string &name, T *ptr, - const std::string &doc, bool is_standard) { - KALDI_ASSERT(ptr != NULL); - std::string idx = name; - NormalizeArgName(&idx); - if (doc_map_.find(idx) != doc_map_.end()) - KALDI_WARN << "Registering option twice, ignoring second time: " << name; - this->RegisterSpecific(name, idx, ptr, doc, is_standard); -} - -// used to register standard parameters (those that are present in all of the -// applications) -template -void ParseOptions::RegisterStandard(const std::string &name, T *ptr, - const std::string &doc) { - this->RegisterCommon(name, ptr, doc, true); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, bool *b, - const std::string &doc, bool is_standard) { - bool_map_[idx] = b; - doc_map_[idx] = - DocInfo(name, doc + " (bool, default = " + ((*b) ? 
"true)" : "false)"), - is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, int32 *i, - const std::string &doc, bool is_standard) { - int_map_[idx] = i; - std::ostringstream ss; - ss << doc << " (int, default = " << *i << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, uint32 *u, - const std::string &doc, bool is_standard) { - uint_map_[idx] = u; - std::ostringstream ss; - ss << doc << " (uint, default = " << *u << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, float *f, - const std::string &doc, bool is_standard) { - float_map_[idx] = f; - std::ostringstream ss; - ss << doc << " (float, default = " << *f << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, double *f, - const std::string &doc, bool is_standard) { - double_map_[idx] = f; - std::ostringstream ss; - ss << doc << " (double, default = " << *f << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, std::string *s, - const std::string &doc, bool is_standard) { - string_map_[idx] = s; - doc_map_[idx] = - DocInfo(name, doc + " (string, default = \"" + *s + "\")", is_standard); -} -void ParseOptions::DisableOption(const std::string &name) { - if (argv_ != NULL) - KALDI_ERR << "DisableOption must not be called after calling Read()."; - if (doc_map_.erase(name) == 0) - KALDI_ERR << "Option " << name - << " was not registered so cannot be disabled: "; - bool_map_.erase(name); - int_map_.erase(name); - uint_map_.erase(name); - float_map_.erase(name); - double_map_.erase(name); - string_map_.erase(name); -} - -int ParseOptions::NumArgs() const { return positional_args_.size(); } - -std::string ParseOptions::GetArg(int i) const { - // use KALDI_ERR if code error - if (i < 1 || i > static_cast(positional_args_.size())) - KALDI_ERR << "ParseOptions::GetArg, invalid index " << i; - return positional_args_[i - 1]; -} - -// We currently do not support any other options. -enum ShellType { kBash = 0 }; - -// This can be changed in the code if it ever does need to be changed (as it's -// unlikely that one compilation of this tool-set would use both shells). -static ShellType kShellType = kBash; - -// Returns true if we need to escape a string before putting it into -// a shell (mainly thinking of bash shell, but should work for others) -// This is for the convenience of the user so command-lines that are -// printed out by ParseOptions::Read (with --print-args=true) are -// paste-able into the shell and will run. If you use a different type of -// shell, it might be necessary to change this function. -// But it's mostly a cosmetic issue as it basically affects how -// the program echoes its command-line arguments to the screen. -static bool MustBeQuoted(const std::string &str, ShellType st) { - // Only Bash is supported (for the moment). - KALDI_ASSERT(st == kBash && "Invalid shell type."); - - const char *c = str.c_str(); - if (*c == '\0') { - return true; // Must quote empty string - } else { - const char *ok_chars[2]; - - // These seem not to be interpreted as long as there are no other "bad" - // characters involved (e.g. 
"," would be interpreted as part of something - // like a{b,c}, but not on its own. - ok_chars[kBash] = "[]~#^_-+=:.,/"; - - // Just want to make sure that a space character doesn't get automatically - // inserted here via an automated style-checking script, like it did before. - KALDI_ASSERT(!strchr(ok_chars[kBash], ' ')); - - for (; *c != '\0'; c++) { - // For non-alphanumeric characters we have a list of characters which - // are OK. All others are forbidden (this is easier since the shell - // interprets most non-alphanumeric characters). - if (!isalnum(*c)) { - const char *d; - for (d = ok_chars[st]; *d != '\0'; d++) - if (*c == *d) break; - // If not alphanumeric or one of the "ok_chars", it must be escaped. - if (*d == '\0') return true; - } - } - return false; // The string was OK. No quoting or escaping. - } -} - -// Returns a quoted and escaped version of "str" -// which has previously been determined to need escaping. -// Our aim is to print out the command line in such a way that if it's -// pasted into a shell of ShellType "st" (only bash for now), it -// will get passed to the program in the same way. -static std::string QuoteAndEscape(const std::string &str, ShellType st) { - // Only Bash is supported (for the moment). - KALDI_ASSERT(st == kBash && "Invalid shell type."); - - // For now we use the following rules: - // In the normal case, we quote with single-quote "'", and to escape - // a single-quote we use the string: '\'' (interpreted as closing the - // single-quote, putting an escaped single-quote from the shell, and - // then reopening the single quote). - char quote_char = '\''; - const char *escape_str = "'\\''"; // e.g. echo 'a'\''b' returns a'b - - // If the string contains single-quotes that would need escaping this - // way, and we determine that the string could be safely double-quoted - // without requiring any escaping, then we double-quote the string. - // This is the case if the characters "`$\ do not appear in the string. - // e.g. see http://www.redhat.com/mirrors/LDP/LDP/abs/html/quotingvar.html - const char *c_str = str.c_str(); - if (strchr(c_str, '\'') && !strpbrk(c_str, "\"`$\\")) { - quote_char = '"'; - escape_str = "\\\""; // should never be accessed. - } - - char buf[2]; - buf[1] = '\0'; - - buf[0] = quote_char; - std::string ans = buf; - const char *c = str.c_str(); - for (; *c != '\0'; c++) { - if (*c == quote_char) { - ans += escape_str; - } else { - buf[0] = *c; - ans += buf; - } - } - buf[0] = quote_char; - ans += buf; - return ans; -} - -// static function -std::string ParseOptions::Escape(const std::string &str) { - return MustBeQuoted(str, kShellType) ? QuoteAndEscape(str, kShellType) : str; -} - -int ParseOptions::Read(int argc, const char *const argv[]) { - argc_ = argc; - argv_ = argv; - std::string key, value; - int i; - if (argc > 0) { - // set global "const char*" g_program_name (name of the program) - // so it can be printed out in error messages; - // it's useful because often the stderr of different programs will - // be mixed together in the same log file. -#ifdef _MSC_VER - const char *c = strrchr(argv[0], '\\'); -#else - const char *c = strrchr(argv[0], '/'); -#endif - SetProgramName(c == NULL ? 
argv[0] : c + 1); - } - // first pass: look for config parameter, look for priority - for (i = 1; i < argc; i++) { - if (std::strncmp(argv[i], "--", 2) == 0) { - if (std::strcmp(argv[i], "--") == 0) { - // a lone "--" marks the end of named options - break; - } - bool has_equal_sign; - SplitLongArg(argv[i], &key, &value, &has_equal_sign); - NormalizeArgName(&key); - Trim(&value); - if (key.compare("config") == 0) { - ReadConfigFile(value); - } - if (key.compare("help") == 0) { - PrintUsage(); - exit(0); - } - } - } - bool double_dash_seen = false; - // second pass: add the command line options - for (i = 1; i < argc; i++) { - if (std::strncmp(argv[i], "--", 2) == 0) { - if (std::strcmp(argv[i], "--") == 0) { - // A lone "--" marks the end of named options. - // Skip that option and break the processing of named options - i += 1; - double_dash_seen = true; - break; - } - bool has_equal_sign; - SplitLongArg(argv[i], &key, &value, &has_equal_sign); - NormalizeArgName(&key); - Trim(&value); - if (!SetOption(key, value, has_equal_sign)) { - PrintUsage(true); - KALDI_ERR << "Invalid option " << argv[i]; - } - } else { - break; - } - } - - // process remaining arguments as positional - for (; i < argc; i++) { - if ((std::strcmp(argv[i], "--") == 0) && !double_dash_seen) { - double_dash_seen = true; - } else { - positional_args_.push_back(std::string(argv[i])); - } - } - - // if the user did not suppress this with --print-args = false.... - if (print_args_) { - std::ostringstream strm; - for (int j = 0; j < argc; j++) strm << Escape(argv[j]) << " "; - strm << '\n'; - std::cerr << strm.str() << std::flush; - } - return i; -} - -void ParseOptions::PrintUsage(bool print_command_line) { - std::cerr << '\n' << usage_ << '\n'; - DocMapType::iterator it; - // first we print application-specific options - bool app_specific_header_printed = false; - for (it = doc_map_.begin(); it != doc_map_.end(); ++it) { - if (it->second.is_standard_ == false) { // application-specific option - if (app_specific_header_printed == false) { // header was not yet printed - std::cerr << "Options:" << '\n'; - app_specific_header_printed = true; - } - std::cerr << " --" << std::setw(25) << std::left << it->second.name_ - << " : " << it->second.use_msg_ << '\n'; - } - } - if (app_specific_header_printed == true) { - std::cerr << '\n'; - } - - // then the standard options - std::cerr << "Standard options:" << '\n'; - for (it = doc_map_.begin(); it != doc_map_.end(); ++it) { - if (it->second.is_standard_ == true) { // we have standard option - std::cerr << " --" << std::setw(25) << std::left << it->second.name_ - << " : " << it->second.use_msg_ << '\n'; - } - } - std::cerr << '\n'; - if (print_command_line) { - std::ostringstream strm; - strm << "Command line was: "; - for (int j = 0; j < argc_; j++) strm << Escape(argv_[j]) << " "; - strm << '\n'; - std::cerr << strm.str() << std::flush; - } -} - -void ParseOptions::PrintConfig(std::ostream &os) { - os << '\n' << "[[ Configuration of UI-Registered options ]]" << '\n'; - std::string key; - DocMapType::iterator it; - for (it = doc_map_.begin(); it != doc_map_.end(); ++it) { - key = it->first; - os << it->second.name_ << " = "; - if (bool_map_.end() != bool_map_.find(key)) { - os << (*bool_map_[key] ? 
"true" : "false"); - } else if (int_map_.end() != int_map_.find(key)) { - os << (*int_map_[key]); - } else if (uint_map_.end() != uint_map_.find(key)) { - os << (*uint_map_[key]); - } else if (float_map_.end() != float_map_.find(key)) { - os << (*float_map_[key]); - } else if (double_map_.end() != double_map_.find(key)) { - os << (*double_map_[key]); - } else if (string_map_.end() != string_map_.find(key)) { - os << "'" << *string_map_[key] << "'"; - } else { - KALDI_ERR << "PrintConfig: unrecognized option " << key << "[code error]"; - } - os << '\n'; - } - os << '\n'; -} - -void ParseOptions::ReadConfigFile(const std::string &filename) { - std::ifstream is(filename.c_str(), std::ifstream::in); - if (!is.good()) { - KALDI_ERR << "Cannot open config file: " << filename; - } - - std::string line, key, value; - int32 line_number = 0; - while (std::getline(is, line)) { - line_number++; - // trim out the comments - size_t pos; - if ((pos = line.find_first_of('#')) != std::string::npos) { - line.erase(pos); - } - // skip empty lines - Trim(&line); - if (line.length() == 0) continue; - - if (line.substr(0, 2) != "--") { - KALDI_ERR << "Reading config file " << filename << ": line " - << line_number << " does not look like a line " - << "from a Kaldi command-line program's config file: should " - << "be of the form --x=y. Note: config files intended to " - << "be sourced by shell scripts lack the '--'."; - } - - // parse option - bool has_equal_sign; - SplitLongArg(line, &key, &value, &has_equal_sign); - NormalizeArgName(&key); - Trim(&value); - if (!SetOption(key, value, has_equal_sign)) { - PrintUsage(true); - KALDI_ERR << "Invalid option " << line << " in config file " << filename; - } - } -} - -void ParseOptions::SplitLongArg(const std::string &in, std::string *key, - std::string *value, bool *has_equal_sign) { - KALDI_ASSERT(in.substr(0, 2) == "--"); // precondition. - size_t pos = in.find_first_of('=', 0); - if (pos == std::string::npos) { // we allow --option for bools - // defaults to empty. We handle this differently in different cases. - *key = in.substr(2, in.size() - 2); // 2 because starts with --. - *value = ""; - *has_equal_sign = false; - } else if (pos == 2) { // we also don't allow empty keys: --=value - PrintUsage(true); - KALDI_ERR << "Invalid option (no key): " << in; - } else { // normal case: --option=value - *key = in.substr(2, pos - 2); // 2 because starts with --. 
- *value = in.substr(pos + 1); - *has_equal_sign = true; - } -} - -void ParseOptions::NormalizeArgName(std::string *str) { - std::string out; - std::string::iterator it; - - for (it = str->begin(); it != str->end(); ++it) { - if (*it == '_') - out += '-'; // convert _ to - - else - out += std::tolower(*it); - } - *str = out; - - KALDI_ASSERT(str->length() > 0); -} - -bool ParseOptions::SetOption(const std::string &key, const std::string &value, - bool has_equal_sign) { - if (bool_map_.end() != bool_map_.find(key)) { - if (has_equal_sign && value == "") - KALDI_ERR << "Invalid option --" << key << "="; - *(bool_map_[key]) = ToBool(value); - } else if (int_map_.end() != int_map_.find(key)) { - *(int_map_[key]) = ToInt(value); - } else if (uint_map_.end() != uint_map_.find(key)) { - *(uint_map_[key]) = ToUint(value); - } else if (float_map_.end() != float_map_.find(key)) { - *(float_map_[key]) = ToFloat(value); - } else if (double_map_.end() != double_map_.find(key)) { - *(double_map_[key]) = ToDouble(value); - } else if (string_map_.end() != string_map_.find(key)) { - if (!has_equal_sign) - KALDI_ERR << "Invalid option --" << key << " (option format is --x=y)."; - *(string_map_[key]) = value; - } else { - return false; - } - return true; -} - -bool ParseOptions::ToBool(std::string str) { - std::transform(str.begin(), str.end(), str.begin(), ::tolower); - - // allow "" as a valid option for "true", so that --x is the same as --x=true - if ((str.compare("true") == 0) || (str.compare("t") == 0) || - (str.compare("1") == 0) || (str.compare("") == 0)) { - return true; - } - if ((str.compare("false") == 0) || (str.compare("f") == 0) || - (str.compare("0") == 0)) { - return false; - } - // if it is neither true nor false: - PrintUsage(true); - KALDI_ERR << "Invalid format for boolean argument [expected true or false]: " - << str; - return false; // never reached -} - -int32 ParseOptions::ToInt(const std::string &str) { - int32 ret; - if (!ConvertStringToInteger(str, &ret)) - KALDI_ERR << "Invalid integer option \"" << str << "\""; - return ret; -} - -uint32 ParseOptions::ToUint(const std::string &str) { - uint32 ret; - if (!ConvertStringToInteger(str, &ret)) - KALDI_ERR << "Invalid integer option \"" << str << "\""; - return ret; -} - -float ParseOptions::ToFloat(const std::string &str) { - float ret; - if (!ConvertStringToReal(str, &ret)) - KALDI_ERR << "Invalid floating-point option \"" << str << "\""; - return ret; -} - -double ParseOptions::ToDouble(const std::string &str) { - double ret; - if (!ConvertStringToReal(str, &ret)) - KALDI_ERR << "Invalid floating-point option \"" << str << "\""; - return ret; -} - -// instantiate templates -template void ParseOptions::RegisterTmpl(const std::string &name, bool *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, int32 *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, uint32 *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, float *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, double *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, - std::string *ptr, - const std::string &doc); - -template void ParseOptions::RegisterStandard(const std::string &name, bool *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - int32 *ptr, - const std::string &doc); 
-template void ParseOptions::RegisterStandard(const std::string &name, - uint32 *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - float *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - double *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - std::string *ptr, - const std::string &doc); - -template void ParseOptions::RegisterCommon(const std::string &name, bool *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, int32 *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, uint32 *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, float *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, double *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, - std::string *ptr, - const std::string &doc, - bool is_standard); - -} // namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/parse-options.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/parse-options.h deleted file mode 100644 index 93a060f4a411dfd63298a91bb313e0b66d337a75..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/parse-options.h +++ /dev/null @@ -1,265 +0,0 @@ -// util/parse-options.h - -// Copyright 2009-2011 Karel Vesely; Microsoft Corporation; -// Saarland University (Author: Arnab Ghoshal); -// Copyright 2012-2013 Frantisek Skala; Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_PARSE_OPTIONS_H_ -#define KALDI_UTIL_PARSE_OPTIONS_H_ - -#include -#include -#include - -#include "base/kaldi-common.h" -#include "itf/options-itf.h" - -namespace kaldi { - -/// The class ParseOptions is for parsing command-line options; see -/// \ref parse_options for more documentation. -class ParseOptions : public OptionsItf { - public: - explicit ParseOptions(const char *usage) - : print_args_(true), - help_(false), - usage_(usage), - argc_(0), - argv_(NULL), - prefix_(""), - other_parser_(NULL) { -#if !defined(_MSC_VER) && \ - !defined(__CYGWIN__) // This is just a convenient place to set the stderr - // to line - setlinebuf(stderr); // buffering mode, since it's called at program start. -#endif // This helps ensure different programs' output is not mixed up. 
- RegisterStandard("config", &config_, - "Configuration file to read (this " - "option may be repeated)"); - RegisterStandard("print-args", &print_args_, - "Print the command line arguments (to stderr)"); - RegisterStandard("help", &help_, "Print out usage message"); - RegisterStandard("verbose", &g_kaldi_verbose_level, - "Verbose level (higher->more logging)"); - } - - /** - This is a constructor for the special case where some options are - registered with a prefix to avoid conflicts. The object thus created will - only be used temporarily to register an options class with the original - options parser (which is passed as the *other pointer) using the given - prefix. It should not be used for any other purpose, and the prefix must - not be the empty string. It seems to be the least bad way of implementing - options with prefixes at this point. - Example of usage is: - ParseOptions po; // original ParseOptions object - ParseOptions po_mfcc("mfcc", &po); // object with prefix. - MfccOptions mfcc_opts; - mfcc_opts.Register(&po_mfcc); - The options will now get registered as, e.g., --mfcc.frame-shift=10.0 - instead of just --frame-shift=10.0 - */ - ParseOptions(const std::string &prefix, OptionsItf *other); - - ~ParseOptions() {} - - // Methods from the interface - void Register(const std::string &name, bool *ptr, const std::string &doc); - void Register(const std::string &name, int32 *ptr, const std::string &doc); - void Register(const std::string &name, uint32 *ptr, const std::string &doc); - void Register(const std::string &name, float *ptr, const std::string &doc); - void Register(const std::string &name, double *ptr, const std::string &doc); - void Register(const std::string &name, std::string *ptr, - const std::string &doc); - - /// If called after registering an option and before calling - /// Read(), disables that option from being used. Will crash - /// at runtime if that option had not been registered. - void DisableOption(const std::string &name); - - /// This one is used for registering standard parameters of all the programs - template - void RegisterStandard(const std::string &name, T *ptr, - const std::string &doc); - - /** - Parses the command line options and fills the ParseOptions-registered - variables. This must be called after all the variables were registered!!! - - Initially the variables have implicit values, - then the config file values are set-up, - finally the command line values given. - Returns the first position in argv that was not used. - [typically not useful: use NumParams() and GetParam(). ] - */ - int Read(int argc, const char *const *argv); - - /// Prints the usage documentation [provided in the constructor]. - void PrintUsage(bool print_command_line = false); - /// Prints the actual configuration of all the registered variables - void PrintConfig(std::ostream &os); - - /// Reads the options values from a config file. Must be called after - /// registering all options. This is usually used internally after the - /// standard --config option is used, but it may also be called from a - /// program. - void ReadConfigFile(const std::string &filename); - - /// Number of positional parameters (c.f. argc-1). - int NumArgs() const; - - /// Returns one of the positional parameters; 1-based indexing for argc/argv - /// compatibility. Will crash if param is not >=1 and <=NumArgs(). - std::string GetArg(int param) const; - - std::string GetOptArg(int param) const { - return (param <= NumArgs() ? 
GetArg(param) : ""); - } - - /// The following function will return a possibly quoted and escaped - /// version of "str", according to the current shell. Currently - /// this is just hardwired to bash. It's useful for debug output. - static std::string Escape(const std::string &str); - - private: - /// Template to register various variable types, - /// used for program-specific parameters - template - void RegisterTmpl(const std::string &name, T *ptr, const std::string &doc); - - // Following functions do just the datatype-specific part of the job - /// Register boolean variable - void RegisterSpecific(const std::string &name, const std::string &idx, - bool *b, const std::string &doc, bool is_standard); - /// Register int32 variable - void RegisterSpecific(const std::string &name, const std::string &idx, - int32 *i, const std::string &doc, bool is_standard); - /// Register unsinged int32 variable - void RegisterSpecific(const std::string &name, const std::string &idx, - uint32 *u, const std::string &doc, bool is_standard); - /// Register float variable - void RegisterSpecific(const std::string &name, const std::string &idx, - float *f, const std::string &doc, bool is_standard); - /// Register double variable [useful as we change BaseFloat type]. - void RegisterSpecific(const std::string &name, const std::string &idx, - double *f, const std::string &doc, bool is_standard); - /// Register string variable - void RegisterSpecific(const std::string &name, const std::string &idx, - std::string *s, const std::string &doc, - bool is_standard); - - /// Does the actual job for both kinds of parameters - /// Does the common part of the job for all datatypes, - /// then calls RegisterSpecific - template - void RegisterCommon(const std::string &name, T *ptr, const std::string &doc, - bool is_standard); - - /// Set option with name "key" to "value"; will crash if can't do it. - /// "has_equal_sign" is used to allow --x for a boolean option x, - /// and --y=, for a string option y. - bool SetOption(const std::string &key, const std::string &value, - bool has_equal_sign); - - bool ToBool(std::string str); - int32 ToInt(const std::string &str); - uint32 ToUint(const std::string &str); - float ToFloat(const std::string &str); - double ToDouble(const std::string &str); - - // maps for option variables - std::map bool_map_; - std::map int_map_; - std::map uint_map_; - std::map float_map_; - std::map double_map_; - std::map string_map_; - - /** - Structure for options' documentation - */ - struct DocInfo { - DocInfo() {} - DocInfo(const std::string &name, const std::string &usemsg) - : name_(name), use_msg_(usemsg), is_standard_(false) {} - DocInfo(const std::string &name, const std::string &usemsg, - bool is_standard) - : name_(name), use_msg_(usemsg), is_standard_(is_standard) {} - - std::string name_; - std::string use_msg_; - bool is_standard_; - }; - typedef std::map DocMapType; - DocMapType doc_map_; ///< map for the documentation - - bool print_args_; ///< variable for the implicit --print-args parameter - bool help_; ///< variable for the implicit --help parameter - std::string config_; ///< variable for the implicit --config parameter - std::vector positional_args_; - const char *usage_; - int argc_; - const char *const *argv_; - - /// These members are not normally used. 
They are only used when the object - /// is constructed with a prefix - std::string prefix_; - OptionsItf *other_parser_; - - protected: - /// SplitLongArg parses an argument of the form --a=b, --a=, or --a, - /// and sets "has_equal_sign" to true if an equals-sign was parsed.. - /// this is needed in order to correctly allow --x for a boolean option - /// x, and --y= for a string option y, and to disallow --x= and --y. - void SplitLongArg(const std::string &in, std::string *key, std::string *value, - bool *has_equal_sign); - - void NormalizeArgName(std::string *str); -}; - -/// This template is provided for convenience in reading config classes from -/// files; this is not the standard way to read configuration options, but may -/// occasionally be needed. This function assumes the config has a function -/// "void Register(OptionsItf *opts)" which it can call to register the -/// ParseOptions object. -template -void ReadConfigFromFile(const std::string &config_filename, C *c) { - std::ostringstream usage_str; - usage_str << "Parsing config from " - << "from '" << config_filename << "'"; - ParseOptions po(usage_str.str().c_str()); - c->Register(&po); - po.ReadConfigFile(config_filename); -} - -/// This variant of the template ReadConfigFromFile is for if you need to read -/// two config classes from the same file. -template -void ReadConfigsFromFile(const std::string &conf, C1 *c1, C2 *c2) { - std::ostringstream usage_str; - usage_str << "Parsing config from " - << "from '" << conf << "'"; - ParseOptions po(usage_str.str().c_str()); - c1->Register(&po); - c2->Register(&po); - po.ReadConfigFile(conf); -} - -} // namespace kaldi - -#endif // KALDI_UTIL_PARSE_OPTIONS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/simple-io-funcs.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/simple-io-funcs.cc deleted file mode 100644 index 5ace601b6a2bb186dec78b0b25cb5a3227c48bc9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/simple-io-funcs.cc +++ /dev/null @@ -1,80 +0,0 @@ -// util/simple-io-funcs.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. -#include "util/simple-io-funcs.h" -#include "util/text-utils.h" - -namespace kaldi { - -bool WriteIntegerVectorSimple(const std::string &wxfilename, - const std::vector &list) { - kaldi::Output ko; - // false, false is: text-mode, no Kaldi header. 
- if (!ko.Open(wxfilename, false, false)) return false; - for (size_t i = 0; i < list.size(); i++) ko.Stream() << list[i] << '\n'; - return ko.Close(); -} - -bool ReadIntegerVectorSimple(const std::string &rxfilename, - std::vector *list) { - kaldi::Input ki; - if (!ki.OpenTextMode(rxfilename)) return false; - std::istream &is = ki.Stream(); - int32 i; - list->clear(); - while (!(is >> i).fail()) list->push_back(i); - is >> std::ws; - return is.eof(); // should be eof, or junk at end of file. -} - -bool WriteIntegerVectorVectorSimple( - const std::string &wxfilename, - const std::vector > &list) { - kaldi::Output ko; - // false, false is: text-mode, no Kaldi header. - if (!ko.Open(wxfilename, false, false)) return false; - std::ostream &os = ko.Stream(); - for (size_t i = 0; i < list.size(); i++) { - for (size_t j = 0; j < list[i].size(); j++) { - os << list[i][j]; - if (j + 1 < list[i].size()) os << ' '; - } - os << '\n'; - } - return ko.Close(); -} - -bool ReadIntegerVectorVectorSimple(const std::string &rxfilename, - std::vector > *list) { - kaldi::Input ki; - if (!ki.OpenTextMode(rxfilename)) return false; - std::istream &is = ki.Stream(); - list->clear(); - std::string line; - while (std::getline(is, line)) { - std::vector v; - if (!SplitStringToIntegers(line, " \t\r", true, &v)) { - list->clear(); - return false; - } - list->push_back(v); - } - return is.eof(); // if we're not at EOF, something weird happened. -} - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/simple-io-funcs.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/simple-io-funcs.h deleted file mode 100644 index 1ead12790ba9bd6a44ccdff855918270191b8ebd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/simple-io-funcs.h +++ /dev/null @@ -1,61 +0,0 @@ -// util/simple-io-funcs.h - -// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. -#ifndef KALDI_UTIL_SIMPLE_IO_FUNCS_H_ -#define KALDI_UTIL_SIMPLE_IO_FUNCS_H_ - -#include -#include -#include "util/kaldi-io.h" - -// This header contains some utilities for reading some common, simple text -// formats:integers in files, one per line, and integers in files, possibly -// multiple per line. these are not really fully native Kaldi formats; they are -// mostly for small files that might be generated by scripts, and can be read -// all at one time. for longer files of this type, we would probably use the -// Table code. - -namespace kaldi { - -/// WriteToList attempts to write this list of integers, one per line, -/// to the given file, in text format. -/// returns true if succeeded. 
-bool WriteIntegerVectorSimple(const std::string &wxfilename, - const std::vector &v); - -/// ReadFromList attempts to read this list of integers, one per line, -/// from the given file, in text format. -/// returns true if succeeded. -bool ReadIntegerVectorSimple(const std::string &rxfilename, - std::vector *v); - -// This is a file format like: -// 1 2 -// 3 -// -// 4 5 6 -// etc. -bool WriteIntegerVectorVectorSimple(const std::string &wxfilename, - const std::vector > &v); - -bool ReadIntegerVectorVectorSimple(const std::string &rxfilename, - std::vector > *v); - -} // end namespace kaldi. - -#endif // KALDI_UTIL_SIMPLE_IO_FUNCS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/stl-utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/stl-utils.h deleted file mode 100644 index 8a29cd582c77b3078277aa9713b8676032bbc5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/stl-utils.h +++ /dev/null @@ -1,310 +0,0 @@ -// util/stl-utils.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_STL_UTILS_H_ -#define KALDI_UTIL_STL_UTILS_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -using std::unordered_map; -using std::unordered_set; - -#include "base/kaldi-common.h" - -namespace kaldi { - -/// Sorts and uniq's (removes duplicates) from a vector. -template -inline void SortAndUniq(std::vector *vec) { - std::sort(vec->begin(), vec->end()); - vec->erase(std::unique(vec->begin(), vec->end()), vec->end()); -} - -/// Returns true if the vector is sorted. -template -inline bool IsSorted(const std::vector &vec) { - typename std::vector::const_iterator iter = vec.begin(), end = vec.end(); - if (iter == end) return true; - while (1) { - typename std::vector::const_iterator next_iter = iter; - ++next_iter; - if (next_iter == end) return true; // end of loop and nothing out of order - if (*next_iter < *iter) return false; - iter = next_iter; - } -} - -/// Returns true if the vector is sorted and contains each element -/// only once. -template -inline bool IsSortedAndUniq(const std::vector &vec) { - typename std::vector::const_iterator iter = vec.begin(), end = vec.end(); - if (iter == end) return true; - while (1) { - typename std::vector::const_iterator next_iter = iter; - ++next_iter; - if (next_iter == end) return true; // end of loop and nothing out of order - if (*next_iter <= *iter) return false; - iter = next_iter; - } -} - -/// Removes duplicate elements from a sorted list. -template -inline void Uniq(std::vector *vec) { // must be already sorted. 
- KALDI_PARANOID_ASSERT(IsSorted(*vec)); - KALDI_ASSERT(vec); - vec->erase(std::unique(vec->begin(), vec->end()), vec->end()); -} - -/// Copies the elements of a set to a vector. -template -void CopySetToVector(const std::set &s, std::vector *v) { - // copies members of s into v, in sorted order from lowest to highest - // (because the set was in sorted order). - KALDI_ASSERT(v != NULL); - v->resize(s.size()); - typename std::set::const_iterator siter = s.begin(), send = s.end(); - typename std::vector::iterator viter = v->begin(); - for (; siter != send; ++siter, ++viter) { - *viter = *siter; - } -} - -template -void CopySetToVector(const unordered_set &s, std::vector *v) { - KALDI_ASSERT(v != NULL); - v->resize(s.size()); - typename unordered_set::const_iterator siter = s.begin(), send = s.end(); - typename std::vector::iterator viter = v->begin(); - for (; siter != send; ++siter, ++viter) { - *viter = *siter; - } -} - -/// Copies the (key, value) pairs in a map to a vector of pairs. -template -void CopyMapToVector(const std::map &m, - std::vector > *v) { - KALDI_ASSERT(v != NULL); - v->resize(m.size()); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - typename std::vector >::iterator viter = v->begin(); - for (; miter != mend; ++miter, ++viter) { - *viter = std::make_pair(miter->first, miter->second); - // do it like this because of const casting. - } -} - -/// Copies the keys in a map to a vector. -template -void CopyMapKeysToVector(const std::map &m, std::vector *v) { - KALDI_ASSERT(v != NULL); - v->resize(m.size()); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - typename std::vector::iterator viter = v->begin(); - for (; miter != mend; ++miter, ++viter) { - *viter = miter->first; - } -} - -/// Copies the values in a map to a vector. -template -void CopyMapValuesToVector(const std::map &m, std::vector *v) { - KALDI_ASSERT(v != NULL); - v->resize(m.size()); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - typename std::vector::iterator viter = v->begin(); - for (; miter != mend; ++miter, ++viter) { - *viter = miter->second; - } -} - -/// Copies the keys in a map to a set. -template -void CopyMapKeysToSet(const std::map &m, std::set *s) { - KALDI_ASSERT(s != NULL); - s->clear(); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - for (; miter != mend; ++miter) { - s->insert(s->end(), miter->first); - } -} - -/// Copies the values in a map to a set. -template -void CopyMapValuesToSet(const std::map &m, std::set *s) { - KALDI_ASSERT(s != NULL); - s->clear(); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - for (; miter != mend; ++miter) s->insert(s->end(), miter->second); -} - -/// Copies the contents of a vector to a set. -template -void CopyVectorToSet(const std::vector &v, std::set *s) { - KALDI_ASSERT(s != NULL); - s->clear(); - typename std::vector::const_iterator iter = v.begin(), end = v.end(); - for (; iter != end; ++iter) s->insert(s->end(), *iter); - // s->end() is a hint in case v was sorted. will work regardless. -} - -/// Deletes any non-NULL pointers in the vector v, and sets -/// the corresponding entries of v to NULL -template -void DeletePointers(std::vector *v) { - KALDI_ASSERT(v != NULL); - typename std::vector::iterator iter = v->begin(), end = v->end(); - for (; iter != end; ++iter) { - if (*iter != NULL) { - delete *iter; - *iter = NULL; // set to NULL for extra safety. - } - } -} - -/// Returns true if the vector of pointers contains NULL pointers. 
-template -bool ContainsNullPointers(const std::vector &v) { - typename std::vector::const_iterator iter = v.begin(), end = v.end(); - for (; iter != end; ++iter) - if (*iter == static_cast(NULL)) return true; - return false; -} - -/// Copies the contents a vector of one type to a vector -/// of another type. -template -void CopyVectorToVector(const std::vector &vec_in, std::vector *vec_out) { - KALDI_ASSERT(vec_out != NULL); - vec_out->resize(vec_in.size()); - for (size_t i = 0; i < vec_in.size(); i++) - (*vec_out)[i] = static_cast(vec_in[i]); -} - -/// A hashing function-object for vectors. -template -struct VectorHasher { // hashing function for vector. - size_t operator()(const std::vector &x) const noexcept { - size_t ans = 0; - typename std::vector::const_iterator iter = x.begin(), end = x.end(); - for (; iter != end; ++iter) { - ans *= kPrime; - ans += *iter; - } - return ans; - } - VectorHasher() { // Check we're instantiated with an integer type. - KALDI_ASSERT_IS_INTEGER_TYPE(Int); - } - - private: - static const int kPrime = 7853; -}; - -/// A hashing function-object for pairs of ints -template -struct PairHasher { // hashing function for pair - size_t operator()(const std::pair &x) const noexcept { - // 7853 was chosen at random from a list of primes. - return x.first + x.second * 7853; - } - PairHasher() { // Check we're instantiated with an integer type. - KALDI_ASSERT_IS_INTEGER_TYPE(Int1); - KALDI_ASSERT_IS_INTEGER_TYPE(Int2); - } -}; - -/// A hashing function object for strings. -struct StringHasher { // hashing function for std::string - size_t operator()(const std::string &str) const noexcept { - size_t ans = 0, len = str.length(); - const char *c = str.c_str(), *end = c + len; - for (; c != end; c++) { - ans *= kPrime; - ans += *c; - } - return ans; - } - - private: - static const int kPrime = 7853; -}; - -/// Reverses the contents of a vector. -template -inline void ReverseVector(std::vector *vec) { - KALDI_ASSERT(vec != NULL); - size_t sz = vec->size(); - for (size_t i = 0; i < sz / 2; i++) std::swap((*vec)[i], (*vec)[sz - 1 - i]); -} - -/// Comparator object for pairs that compares only the first pair. -template -struct CompareFirstMemberOfPair { - inline bool operator()(const std::pair &p1, const std::pair &p2) { - return p1.first < p2.first; - } -}; - -/// For a vector of pair where I is an integer and F a floating-point or -/// integer type, this function sorts a vector of type vector > on -/// the I value and then merges elements with equal I values, summing these over -/// the F component and then removing any F component with zero value. This -/// is for where the vector of pairs represents a map from the integer to float -/// component, with an "adding" type of semantics for combining the elements. -template -inline void MergePairVectorSumming(std::vector > *vec) { - KALDI_ASSERT_IS_INTEGER_TYPE(I); - CompareFirstMemberOfPair c; - std::sort(vec->begin(), vec->end(), c); // sort on 1st element. - typename std::vector >::iterator out = vec->begin(), - in = vec->begin(), - end = vec->end(); - // special case: while there is nothing to be changed, skip over - // initial input (avoids unnecessary copying). - while (in + 1 < end && in[0].first != in[1].first && in[0].second != 0.0) { - in++; - out++; - } - while (in < end) { - // We reach this point only at the first element of - // each stretch of identical .first elements. - *out = *in; - ++in; - while (in < end && in->first == out->first) { - out->second += in->second; // this is the merge operation. 
- ++in; - } - if (out->second != static_cast(0)) // Don't keep zero elements. - out++; - } - vec->erase(out, end); -} - -} // namespace kaldi - -#endif // KALDI_UTIL_STL_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/text-utils.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/text-utils.cc deleted file mode 100644 index fd70889644f6b4e14793ddd4f5b0d71a66768699..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/text-utils.cc +++ /dev/null @@ -1,580 +0,0 @@ -// util/text-utils.cc - -// Copyright 2009-2011 Saarland University; Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "util/text-utils.h" - -#include -#include -#include -#include - -#include "base/kaldi-common.h" - -namespace kaldi { - -template -bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, // typically false - std::vector *out) { - KALDI_ASSERT(out != NULL); - if (*(full.c_str()) == '\0') { - out->clear(); - return true; - } - std::vector split; - SplitStringToVector(full, delim, omit_empty_strings, &split); - out->resize(split.size()); - for (size_t i = 0; i < split.size(); i++) { - F f = 0; - if (!ConvertStringToReal(split[i], &f)) return false; - (*out)[i] = f; - } - return true; -} - -// Instantiate the template above for float and double. 
-template bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out); -template bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out); - -void SplitStringToVector(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out) { - size_t start = 0, found = 0, end = full.size(); - out->clear(); - while (found != std::string::npos) { - found = full.find_first_of(delim, start); - // start != end condition is for when the delimiter is at the end - if (!omit_empty_strings || (found != start && start != end)) - out->push_back(full.substr(start, found - start)); - start = found + 1; - } -} - -void JoinVectorToString(const std::vector &vec_in, - const char *delim, bool omit_empty_strings, - std::string *str_out) { - std::string tmp_str; - for (size_t i = 0; i < vec_in.size(); i++) { - if (!omit_empty_strings || !vec_in[i].empty()) { - tmp_str.append(vec_in[i]); - if (i < vec_in.size() - 1) - if (!omit_empty_strings || !vec_in[i + 1].empty()) - tmp_str.append(delim); - } - } - str_out->swap(tmp_str); -} - -void Trim(std::string *str) { - const char *white_chars = " \t\n\r\f\v"; - - std::string::size_type pos = str->find_last_not_of(white_chars); - if (pos != std::string::npos) { - str->erase(pos + 1); - pos = str->find_first_not_of(white_chars); - if (pos != std::string::npos) str->erase(0, pos); - } else { - str->erase(str->begin(), str->end()); - } -} - -bool IsToken(const std::string &token) { - size_t l = token.length(); - if (l == 0) return false; - for (size_t i = 0; i < l; i++) { - unsigned char c = token[i]; - if ((!isprint(c) || isspace(c)) && (isascii(c) || c == (unsigned char)255)) - return false; - // The "&& (isascii(c) || c == 255)" was added so that we won't reject - // non-ASCII characters such as French characters with accents [except for - // 255 which is "nbsp", a form of space]. - } - return true; -} - -void SplitStringOnFirstSpace(const std::string &str, std::string *first, - std::string *rest) { - const char *white_chars = " \t\n\r\f\v"; - typedef std::string::size_type I; - const I npos = std::string::npos; - I first_nonwhite = str.find_first_not_of(white_chars); - if (first_nonwhite == npos) { - first->clear(); - rest->clear(); - return; - } - // next_white is first whitespace after first nonwhitespace. - I next_white = str.find_first_of(white_chars, first_nonwhite); - - if (next_white == npos) { // no more whitespace... - *first = std::string(str, first_nonwhite); - rest->clear(); - return; - } - I next_nonwhite = str.find_first_not_of(white_chars, next_white); - if (next_nonwhite == npos) { - *first = std::string(str, first_nonwhite, next_white - first_nonwhite); - rest->clear(); - return; - } - - I last_nonwhite = str.find_last_not_of(white_chars); - KALDI_ASSERT(last_nonwhite != npos); // or coding error. 
- - *first = std::string(str, first_nonwhite, next_white - first_nonwhite); - *rest = std::string(str, next_nonwhite, last_nonwhite + 1 - next_nonwhite); -} - -bool IsLine(const std::string &line) { - if (line.find('\n') != std::string::npos) return false; - if (line.empty()) return true; - if (isspace(*(line.begin()))) return false; - if (isspace(*(line.rbegin()))) return false; - std::string::const_iterator iter = line.begin(), end = line.end(); - for (; iter != end; iter++) - if (!isprint(*iter)) return false; - return true; -} - -template -class NumberIstream { - public: - explicit NumberIstream(std::istream &i) : in_(i) {} - - NumberIstream &operator>>(T &x) { - if (!in_.good()) return *this; - in_ >> x; - if (!in_.fail() && RemainderIsOnlySpaces()) return *this; - return ParseOnFail(&x); - } - - private: - std::istream &in_; - - bool RemainderIsOnlySpaces() { - if (in_.tellg() != std::istream::pos_type(-1)) { - std::string rem; - in_ >> rem; - - if (rem.find_first_not_of(' ') != std::string::npos) { - // there is not only spaces - return false; - } - } - - in_.clear(); - return true; - } - - NumberIstream &ParseOnFail(T *x) { - std::string str; - in_.clear(); - in_.seekg(0); - // If the stream is broken even before trying - // to read from it or if there are many tokens, - // it's pointless to try. - if (!(in_ >> str) || !RemainderIsOnlySpaces()) { - in_.setstate(std::ios_base::failbit); - return *this; - } - - std::map inf_nan_map; - // we'll keep just uppercase values. - inf_nan_map["INF"] = std::numeric_limits::infinity(); - inf_nan_map["+INF"] = std::numeric_limits::infinity(); - inf_nan_map["-INF"] = -std::numeric_limits::infinity(); - inf_nan_map["INFINITY"] = std::numeric_limits::infinity(); - inf_nan_map["+INFINITY"] = std::numeric_limits::infinity(); - inf_nan_map["-INFINITY"] = -std::numeric_limits::infinity(); - inf_nan_map["NAN"] = std::numeric_limits::quiet_NaN(); - inf_nan_map["+NAN"] = std::numeric_limits::quiet_NaN(); - inf_nan_map["-NAN"] = -std::numeric_limits::quiet_NaN(); - // MSVC - inf_nan_map["1.#INF"] = std::numeric_limits::infinity(); - inf_nan_map["-1.#INF"] = -std::numeric_limits::infinity(); - inf_nan_map["1.#QNAN"] = std::numeric_limits::quiet_NaN(); - inf_nan_map["-1.#QNAN"] = -std::numeric_limits::quiet_NaN(); - - std::transform(str.begin(), str.end(), str.begin(), ::toupper); - - if (inf_nan_map.find(str) != inf_nan_map.end()) { - *x = inf_nan_map[str]; - } else { - in_.setstate(std::ios_base::failbit); - } - - return *this; - } -}; - -template -bool ConvertStringToReal(const std::string &str, T *out) { - std::istringstream iss(str); - - NumberIstream i(iss); - - i >> *out; - - if (iss.fail()) { - // Number conversion failed. - return false; - } - - return true; -} - -template bool ConvertStringToReal(const std::string &str, float *out); -template bool ConvertStringToReal(const std::string &str, double *out); - -/* - This function is a helper function of StringsApproxEqual. It should be - thought of as a recursive function-- it was designed that way-- but rather - than actually recursing (which would cause problems with stack overflow), we - just set the args and return to the start. - - The 'decimal_places_tolerance' argument is just passed in from outside, - see the documentation for StringsApproxEqual in text-utils.h to see an - explanation. The argument 'places_into_number' provides some information - about the strings 'a' and 'b' that precedes the current pointers. 
- For purposes of this comment, let's define the 'decimal' of a number - as the part that comes after the decimal point, e.g. in '99.123', - '123' would be the decimal. If 'places_into_number' is -1, it means - we're not currently inside some place like that (i.e. it's not the - case that we're pointing to the '1' or the '2' or the '3'). - If it's 0, then we'd be pointing to the first place after the decimal, - '1' in this case. Note if one of the numbers is shorter than the - other, like '99.123' versus '99.1234' and 'a' points to the first '3' - while 'b' points to the second '4', 'places_into_number' referes to the - shorter of the two, i.e. it would be 2 in this example. - - - */ -bool StringsApproxEqualInternal(const char *a, const char *b, - int32 decimal_places_tolerance, - int32 places_into_number) { -start: - char ca = *a, cb = *b; - if (ca == cb) { - if (ca == '\0') { - return true; - } else { - if (places_into_number >= 0) { - if (isdigit(ca)) { - places_into_number++; - } else { - places_into_number = -1; - } - } else { - if (ca == '.') { - places_into_number = 0; - } - } - a++; - b++; - goto start; - } - } else { - if (places_into_number >= decimal_places_tolerance && - (isdigit(ca) || isdigit(cb))) { - // we're potentially willing to accept this difference between the - // strings. - if (isdigit(ca)) a++; - if (isdigit(cb)) b++; - // we'll have advanced at least one of the two strings. - goto start; - } else if (places_into_number >= 0 && - ((ca == '0' && !isdigit(cb)) || (cb == '0' && !isdigit(ca)))) { - // this clause is designed to ensure that, for example, - // "0.1" would count the same as "0.100001". - if (ca == '0') - a++; - else - b++; - places_into_number++; - goto start; - } else { - return false; - } - } -} - -bool StringsApproxEqual(const std::string &a, const std::string &b, - int32 decimal_places_tolerance) { - return StringsApproxEqualInternal(a.c_str(), b.c_str(), - decimal_places_tolerance, -1); -} - -bool ConfigLine::ParseLine(const std::string &line) { - data_.clear(); - whole_line_ = line; - if (line.size() == 0) return false; // Empty line - size_t pos = 0, size = line.size(); - while (isspace(line[pos]) && pos < size) pos++; - if (pos == size) return false; // whitespace-only line - size_t first_token_start_pos = pos; - // first get first_token_. - while (!isspace(line[pos]) && pos < size) { - if (line[pos] == '=') { - // If the first block of non-whitespace looks like "foo-bar=...", - // then we ignore it: there is no initial token, and FirstToken() - // is empty. - pos = first_token_start_pos; - break; - } - pos++; - } - first_token_ = - std::string(line, first_token_start_pos, pos - first_token_start_pos); - // first_token_ is expected to be either empty or something like - // "component-node", which actually is a slightly more restrictive set of - // strings than IsValidName() checks for this is a convenient way to check it. - if (!first_token_.empty() && !IsValidName(first_token_)) return false; - - while (pos < size) { - if (isspace(line[pos])) { - pos++; - continue; - } - - // OK, at this point we know that we are pointing at nonspace. - size_t next_equals_sign = line.find_first_of("=", pos); - if (next_equals_sign == pos || next_equals_sign == std::string::npos) { - // we're looking for something like 'key=value'. If there is no equals - // sign, or it's not preceded by something, it's a parsing failure. - return false; - } - std::string key(line, pos, next_equals_sign - pos); - if (!IsValidName(key)) return false; - - // handle any quotes. 
we support key='blah blah' or key="foo bar". - // no escaping is supported. - if (line[next_equals_sign + 1] == '\'' || - line[next_equals_sign + 1] == '"') { - char my_quote = line[next_equals_sign + 1]; - size_t next_quote = line.find_first_of(my_quote, next_equals_sign + 2); - if (next_quote == std::string::npos) { // no matching quote was found. - KALDI_WARN << "No matching quote for " << my_quote - << " in config line '" << line << "'"; - return false; - } else { - std::string value(line, next_equals_sign + 2, - next_quote - next_equals_sign - 2); - data_.insert(std::make_pair(key, std::make_pair(value, false))); - pos = next_quote + 1; - continue; - } - } else { - // we want to be able to parse something like "... input=Offset(a, -1) - // foo=bar": in general, config values with spaces in them, even without - // quoting. - - size_t next_next_equals_sign = - line.find_first_of("=", next_equals_sign + 1), - terminating_space = size; - - if (next_next_equals_sign != - std::string::npos) { // found a later equals sign. - size_t preceding_space = - line.find_last_of(" \t", next_next_equals_sign); - if (preceding_space != std::string::npos && - preceding_space > next_equals_sign) - terminating_space = preceding_space; - } - while (isspace(line[terminating_space - 1]) && terminating_space > 0) - terminating_space--; - - std::string value(line, next_equals_sign + 1, - terminating_space - (next_equals_sign + 1)); - data_.insert(std::make_pair(key, std::make_pair(value, false))); - pos = terminating_space; - } - } - return true; -} - -bool ConfigLine::GetValue(const std::string &key, std::string *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - *value = (it->second).first; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, BaseFloat *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!ConvertStringToReal((it->second).first, value)) return false; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, int32 *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!ConvertStringToInteger((it->second).first, value)) return false; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, std::vector *value) { - KALDI_ASSERT(value != NULL); - value->clear(); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!SplitStringToIntegers((it->second).first, ":,", true, value)) { - // KALDI_WARN << "Bad option " << (it->second).first; - return false; - } - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, bool *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if ((it->second).first.size() == 0) return false; - switch (((it->second).first)[0]) { - case 'F': - case 'f': - *value = false; - break; - case 'T': - case 't': - *value = true; - break; - default: - return false; - } - (it->second).second = true; - return true; - } - } - return false; -} - -bool 
ConfigLine::HasUnusedValues() const { - std::map >::const_iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (!(it->second).second) return true; - } - return false; -} - -std::string ConfigLine::UnusedValues() const { - std::string unused_str; - std::map >::const_iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (!(it->second).second) { - if (unused_str == "") - unused_str = it->first + "=" + (it->second).first; - else - unused_str += " " + it->first + "=" + (it->second).first; - } - } - return unused_str; -} - -// This is like ExpectToken but for two tokens, and it -// will either accept token1 and then token2, or just token2. -// This is useful in Read functions where the first token -// may already have been consumed. -// void ExpectOneOrTwoTokens(std::istream &is, bool binary, -// const std::string &token1, -// const std::string &token2) { -// KALDI_ASSERT(token1 != token2); -// std::string temp; -// ReadToken(is, binary, &temp); -// if (temp == token1) { -// ExpectToken(is, binary, token2); -// } else { -// if (temp != token2) { -// KALDI_ERR << "Expecting token " << token1 << " or " << token2 -// << " but got " << temp; -// } -// } -// } - -bool IsValidName(const std::string &name) { - if (name.size() == 0) return false; - for (size_t i = 0; i < name.size(); i++) { - if (i == 0 && !isalpha(name[i]) && name[i] != '_') return false; - if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-' && name[i] != '.') - return false; - } - return true; -} - -void ReadConfigLines(std::istream &is, std::vector *lines) { - KALDI_ASSERT(lines != NULL); - std::string line; - while (std::getline(is, line)) { - if (line.size() == 0) continue; - size_t start = line.find_first_not_of(" \t"); - size_t end = line.find_first_of('#'); - if (start == std::string::npos || start == end) continue; - end = line.find_last_not_of(" \t", end - 1); - KALDI_ASSERT(end >= start); - lines->push_back(line.substr(start, end - start + 1)); - } -} - -void ParseConfigLines(const std::vector &lines, - std::vector *config_lines) { - config_lines->resize(lines.size()); - for (size_t i = 0; i < lines.size(); i++) { - bool ret = (*config_lines)[i].ParseLine(lines[i]); - if (!ret) { - KALDI_ERR << "Error parsing config line: " << lines[i]; - } - } -} - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/text-utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/text-utils.h deleted file mode 100644 index bc7763c4aff38214d97cbeda3b29c8717dd65318..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/kaldi/util/text-utils.h +++ /dev/null @@ -1,264 +0,0 @@ -// util/text-utils.h - -// Copyright 2009-2011 Saarland University; Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. 
-// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_TEXT_UTILS_H_ -#define KALDI_UTIL_TEXT_UTILS_H_ - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "base/kaldi-common.h" - -namespace kaldi { - -/// Split a string using any of the single character delimiters. -/// If omit_empty_strings == true, the output will contain any -/// nonempty strings after splitting on any of the -/// characters in the delimiter. If omit_empty_strings == false, -/// the output will contain n+1 strings if there are n characters -/// in the set "delim" within the input string. In this case -/// the empty string is split to a single empty string. -void SplitStringToVector(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out); - -/// Joins the elements of a vector of strings into a single string using -/// "delim" as the delimiter. If omit_empty_strings == true, any empty strings -/// in the vector are skipped. A vector of empty strings results in an empty -/// string on the output. -void JoinVectorToString(const std::vector &vec_in, - const char *delim, bool omit_empty_strings, - std::string *str_out); - -/** - \brief Split a string (e.g. 1:2:3) into a vector of integers. - - \param [in] delim String containing a list of characters, any of which - is allowed as a delimiter. - \param [in] omit_empty_strings If true, empty strings between delimiters are - allowed and will not produce an output integer; if false, - instances of characters in 'delim' that are consecutive or - at the start or end of the string would be an error. - You'll normally want this to be true if 'delim' consists - of spaces, and false otherwise. - \param [out] out The output list of integers. -*/ -template -bool SplitStringToIntegers(const std::string &full, const char *delim, - bool omit_empty_strings, // typically false [but - // should probably be true - // if "delim" is spaces]. - std::vector *out) { - KALDI_ASSERT(out != NULL); - KALDI_ASSERT_IS_INTEGER_TYPE(I); - if (*(full.c_str()) == '\0') { - out->clear(); - return true; - } - std::vector split; - SplitStringToVector(full, delim, omit_empty_strings, &split); - out->resize(split.size()); - for (size_t i = 0; i < split.size(); i++) { - const char *this_str = split[i].c_str(); - char *end = NULL; - int64 j = 0; - j = KALDI_STRTOLL(this_str, &end); - if (end == this_str || *end != '\0') { - out->clear(); - return false; - } else { - I jI = static_cast(j); - if (static_cast(jI) != j) { - // output type cannot fit this integer. - out->clear(); - return false; - } - (*out)[i] = jI; - } - } - return true; -} - -// This is defined for F = float and double. -template -bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, // typically false - std::vector *out); - -/// Converts a string into an integer via strtoll and returns false if there was -/// any kind of problem (i.e. the string was not an integer or contained extra -/// non-whitespace junk, or the integer was too large to fit into the type it is -/// being converted into). Only sets *out if everything was OK and it returns -/// true. 
-template -bool ConvertStringToInteger(const std::string &str, Int *out) { - KALDI_ASSERT_IS_INTEGER_TYPE(Int); - const char *this_str = str.c_str(); - char *end = NULL; - errno = 0; - int64 i = KALDI_STRTOLL(this_str, &end); - if (end != this_str) - while (isspace(*end)) end++; - if (end == this_str || *end != '\0' || errno != 0) return false; - Int iInt = static_cast(i); - if (static_cast(iInt) != i || - (i < 0 && !std::numeric_limits::is_signed)) { - return false; - } - *out = iInt; - return true; -} - -/// ConvertStringToReal converts a string into either float or double -/// and returns false if there was any kind of problem (i.e. the string -/// was not a floating point number or contained extra non-whitespace junk). -/// Be careful- this function will successfully read inf's or nan's. -template -bool ConvertStringToReal(const std::string &str, T *out); - -/// Removes the beginning and trailing whitespaces from a string -void Trim(std::string *str); - -/// Removes leading and trailing white space from the string, then splits on the -/// first section of whitespace found (if present), putting the part before the -/// whitespace in "first" and the rest in "rest". If there is no such space, -/// everything that remains after removing leading and trailing whitespace goes -/// in "first". -void SplitStringOnFirstSpace(const std::string &line, std::string *first, - std::string *rest); - -/// Returns true if "token" is nonempty, and all characters are -/// printable and whitespace-free. -bool IsToken(const std::string &token); - -/// Returns true if "line" is free of \n characters and unprintable -/// characters, and does not contain leading or trailing whitespace. -bool IsLine(const std::string &line); - -/** - This function returns true when two text strings are approximately equal, and - false when they are not. The definition of 'equal' is normal string - equality, except that two substrings like "0.31134" and "0.311341" would be - considered equal. 'decimal_places_tolerance' controls how many digits after - the '.' have to match up. - E.g. StringsApproxEqual("hello 0.23 there", "hello 0.24 there", 2) would - return false because there is a difference in the 2nd decimal, but with - an argument of 1 it would return true. - */ -bool StringsApproxEqual(const std::string &a, const std::string &b, - int32 decimal_places_check = 2); - -/** - This class is responsible for parsing input like - hi-there xx=yyy a=b c empty= f-oo=Append(bar, sss) ba_z=123 bing='a b c' - baz="a b c d='a b' e" and giving you access to the fields, in this case - - FirstToken() == "hi-there", and key->value pairs: - - xx->yyy, a->"b c", empty->"", f-oo->"Append(bar, sss)", ba_z->"123", - bing->"a b c", baz->"a b c d='a b' e" - - The first token is optional, if the line started with a key-value pair then - FirstValue() will be empty. - - Note: it can parse value fields with space inside them only if they are free - of the '=' character. If values are going to contain the '=' character, you - need to quote them with either single or double quotes. - - Key values may contain -_a-zA-Z0-9, but must begin with a-zA-Z_. - */ -class ConfigLine { - public: - // Tries to parse the line as a config-file line. Returns false - // if it could not for some reason, e.g. parsing failure. In most cases - // prints no warnings; the user should do this. Does not expect comments. - bool ParseLine(const std::string &line); - - // the GetValue functions are overloaded for various types. 
They return true - // if the key exists with value that can be converted to that type, and false - // otherwise. They also mark the key-value pair as having been read. It is - // not an error to read values twice. - bool GetValue(const std::string &key, std::string *value); - bool GetValue(const std::string &key, BaseFloat *value); - bool GetValue(const std::string &key, int32 *value); - // Values may be separated by ":" or by ",". - bool GetValue(const std::string &key, std::vector *value); - bool GetValue(const std::string &key, bool *value); - - bool HasUnusedValues() const; - /// returns e.g. foo=bar xxx=yyy if foo and xxx were not consumed by one - /// of the GetValue() functions. - std::string UnusedValues() const; - - const std::string &FirstToken() const { return first_token_; } - - const std::string WholeLine() { return whole_line_; } - // use default assignment operator and copy constructor. - private: - std::string whole_line_; - // the first token of the line, e.g. if line is - // foo-bar baz=bing - // then first_token_ would be "foo-bar". - std::string first_token_; - - // data_ maps from key to (value, is-this-value-consumed?). - std::map > data_; -}; - -/// This function is like ExpectToken but for two tokens, and it will either -/// accept token1 and then token2, or just token2. This is useful in Read -/// functions where the first token may already have been consumed. -void ExpectOneOrTwoTokens(std::istream &is, bool binary, - const std::string &token1, const std::string &token2); - -/** - This function reads in a config file and *appends* its contents to a vector - of lines; it is responsible for removing comments (anything after '#') and - stripping out any lines that contain only whitespace after comment removal. - */ -void ReadConfigLines(std::istream &is, std::vector *lines); - -/** - This function converts config-lines from a simple sequence of strings - as output by ReadConfigLines(), into a sequence of first-tokens and - name-value pairs. The general format is: - "command-type bar=baz xx=yyy" - etc., although there are subtleties as to what exactly is allowed, see - documentation for class ConfigLine for details. - This function will die if there was a parsing failure. - */ -void ParseConfigLines(const std::vector &lines, - std::vector *config_lines); - -/// Returns true if 'name' would be a valid name for a component or node in a -/// nnet3Nnet. This is a nonempty string beginning with A-Za-z_, and containing -/// only -/// '-', '_', '.', A-Z, a-z, or 0-9. 
-bool IsValidName(const std::string &name); - -} // namespace kaldi - -#endif // KALDI_UTIL_TEXT_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/patch/CPPLINT.cfg b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/patch/CPPLINT.cfg deleted file mode 100644 index 51ff339c18435a6c3a3be03131080d7b8ab8de86..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/patch/CPPLINT.cfg +++ /dev/null @@ -1 +0,0 @@ -exclude_files=.* diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/patch/openfst/src/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/patch/openfst/src/CMakeLists.txt deleted file mode 100644 index 04051ef5ae46c04a40c1ffccc98c37fa594ad13e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/patch/openfst/src/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ - -#-DHAVE_CONFIG_H -I./../include -fno-exceptions -funsigned-char -std=c++11 -MT symbol-table.lo -MD -MP -MF .deps/symbol-table.Tpo -c symbol-table.cc -fno-common -DPIC -o .libs/symbol-table.o - -include_directories(./include/) -install(DIRECTORY include/ DESTINATION include/ - FILES_MATCHING PATTERN "*.h") - -add_subdirectory(lib) - -if(HAVE_SCRIPT) - add_subdirectory(script) -endif(HAVE_SCRIPT) - -if(HAVE_BIN) - add_subdirectory(bin) -endif(HAVE_BIN) - -add_subdirectory(extensions) - -if(BUILD_TESTING) - enable_testing() - add_subdirectory(test) -endif(BUILD_TESTING) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/patch/openfst/src/extensions/special/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/patch/openfst/src/extensions/special/CMakeLists.txt deleted file mode 100644 index 9c71b750a72ffe3c2dafde657273361c3dbae409..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/patch/openfst/src/extensions/special/CMakeLists.txt +++ /dev/null @@ -1,67 +0,0 @@ -file(GLOB HEADER_FILES ../../include/fst/extensions/special/*.h) -message(STATUS "${HEADER_FILES}") - -if(HAVE_BIN) - add_executable(fstspecial-bin - ../../bin/fstconvert.cc - ../../bin/fstconvert-main.cc - phi-fst.cc - rho-fst.cc - sigma-fst.cc - ) - - set_target_properties(fstspecial-bin PROPERTIES - FOLDER special/bin - OUTPUT_NAME fstspecial - ) - - target_link_libraries(fstspecial-bin - fstscript - fst - ${CMAKE_DL_LIBS} - ) -endif(HAVE_BIN) - - -add_library(fstspecial - phi-fst.cc - rho-fst.cc - sigma-fst.cc - ${HEADER_FILES} -) - -set_target_properties(fstspecial PROPERTIES - SOVERSION "${SOVERSION}" - FOLDER special -) -target_link_libraries(fstspecial - fst -) - -set(FST_SPECIAL_INSTALL_TARGETS fstspecial) -if(HAVE_BIN) - list(APPEND FST_SPECIAL_INSTALL_TARGETS fstspecial-bin) -endif() - -install(TARGETS ${FST_SPECIAL_INSTALL_TARGETS} - LIBRARY DESTINATION lib - RUNTIME DESTINATION bin - ARCHIVE DESTINATION lib -) - -function (add_module _name) - add_library(${ARGV}) - if (TARGET ${_name}) - target_link_libraries(${_name} fst) - set_target_properties(${_name} - PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true - FOLDER special/modules - ) - endif() - - install(TARGETS ${_name} LIBRARY DESTINATION lib/fst) -endfunction() - -add_module(phi-fst MODULE phi-fst.cc) -add_module(rho-fst MODULE rho-fst.cc) -add_module(sigma-fst MODULE sigma-fst.cc) diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/patch/openfst/src/include/fst/flags.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/patch/openfst/src/include/fst/flags.h deleted file mode 100644 index b5ec8ff7416774a0612ae0fe7e008a630b289dd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/patch/openfst/src/include/fst/flags.h +++ /dev/null @@ -1,228 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// See www.openfst.org for extensive documentation on this weighted -// finite-state transducer library. -// -// Google-style flag handling declarations and inline definitions. - -#ifndef FST_LIB_FLAGS_H_ -#define FST_LIB_FLAGS_H_ - -#include - -#include -#include -#include -#include -#include - -#include -#include - -#include "gflags/gflags.h" -#include "glog/logging.h" - -using std::string; - -// FLAGS USAGE: -// -// Definition example: -// -// DEFINE_int32(length, 0, "length"); -// -// This defines variable FLAGS_length, initialized to 0. -// -// Declaration example: -// -// DECLARE_int32(length); -// -// SET_FLAGS() can be used to set flags from the command line -// using, for example, '--length=2'. -// -// ShowUsage() can be used to print out command and flag usage. - -// #define DECLARE_bool(name) extern bool FLAGS_ ## name -// #define DECLARE_string(name) extern string FLAGS_ ## name -// #define DECLARE_int32(name) extern int32 FLAGS_ ## name -// #define DECLARE_int64(name) extern int64 FLAGS_ ## name -// #define DECLARE_double(name) extern double FLAGS_ ## name - -template -struct FlagDescription { - FlagDescription(T *addr, const char *doc, const char *type, - const char *file, const T val) - : address(addr), - doc_string(doc), - type_name(type), - file_name(file), - default_value(val) {} - - T *address; - const char *doc_string; - const char *type_name; - const char *file_name; - const T default_value; -}; - -template -class FlagRegister { - public: - static FlagRegister *GetRegister() { - static auto reg = new FlagRegister; - return reg; - } - - const FlagDescription &GetFlagDescription(const string &name) const { - fst::MutexLock l(&flag_lock_); - auto it = flag_table_.find(name); - return it != flag_table_.end() ? 
it->second : 0; - } - - void SetDescription(const string &name, - const FlagDescription &desc) { - fst::MutexLock l(&flag_lock_); - flag_table_.insert(make_pair(name, desc)); - } - - bool SetFlag(const string &val, bool *address) const { - if (val == "true" || val == "1" || val.empty()) { - *address = true; - return true; - } else if (val == "false" || val == "0") { - *address = false; - return true; - } - else { - return false; - } - } - - bool SetFlag(const string &val, string *address) const { - *address = val; - return true; - } - - bool SetFlag(const string &val, int32 *address) const { - char *p = 0; - *address = strtol(val.c_str(), &p, 0); - return !val.empty() && *p == '\0'; - } - - bool SetFlag(const string &val, int64 *address) const { - char *p = 0; - *address = strtoll(val.c_str(), &p, 0); - return !val.empty() && *p == '\0'; - } - - bool SetFlag(const string &val, double *address) const { - char *p = 0; - *address = strtod(val.c_str(), &p); - return !val.empty() && *p == '\0'; - } - - bool SetFlag(const string &arg, const string &val) const { - for (typename std::map< string, FlagDescription >::const_iterator it = - flag_table_.begin(); - it != flag_table_.end(); - ++it) { - const string &name = it->first; - const FlagDescription &desc = it->second; - if (arg == name) - return SetFlag(val, desc.address); - } - return false; - } - - void GetUsage(std::set> *usage_set) const { - for (auto it = flag_table_.begin(); it != flag_table_.end(); ++it) { - const string &name = it->first; - const FlagDescription &desc = it->second; - string usage = " --" + name; - usage += ": type = "; - usage += desc.type_name; - usage += ", default = "; - usage += GetDefault(desc.default_value) + "\n "; - usage += desc.doc_string; - usage_set->insert(make_pair(desc.file_name, usage)); - } - } - - private: - string GetDefault(bool default_value) const { - return default_value ? "true" : "false"; - } - - string GetDefault(const string &default_value) const { - return "\"" + default_value + "\""; - } - - template - string GetDefault(const V &default_value) const { - std::ostringstream strm; - strm << default_value; - return strm.str(); - } - - mutable fst::Mutex flag_lock_; // Multithreading lock. - std::map> flag_table_; -}; - -template -class FlagRegisterer { - public: - FlagRegisterer(const string &name, const FlagDescription &desc) { - auto registr = FlagRegister::GetRegister(); - registr->SetDescription(name, desc); - } - - private: - FlagRegisterer(const FlagRegisterer &) = delete; - FlagRegisterer &operator=(const FlagRegisterer &) = delete; -}; - - -#define DEFINE_VAR(type, name, value, doc) \ - type FLAGS_ ## name = value; \ - static FlagRegisterer \ - name ## _flags_registerer(#name, FlagDescription(&FLAGS_ ## name, \ - doc, \ - #type, \ - __FILE__, \ - value)) - -// #define DEFINE_bool(name, value, doc) DEFINE_VAR(bool, name, value, doc) -// #define DEFINE_string(name, value, doc) \ -// DEFINE_VAR(string, name, value, doc) -// #define DEFINE_int32(name, value, doc) DEFINE_VAR(int32, name, value, doc) -// #define DEFINE_int64(name, value, doc) DEFINE_VAR(int64, name, value, doc) -// #define DEFINE_double(name, value, doc) DEFINE_VAR(double, name, value, doc) - - -// Temporary directory. 
-DECLARE_string(tmpdir); - -void SetFlags(const char *usage, int *argc, char ***argv, bool remove_flags, - const char *src = ""); - -#define SET_FLAGS(usage, argc, argv, rmflags) \ -gflags::ParseCommandLineFlags(argc, argv, true) -// SetFlags(usage, argc, argv, rmflags, __FILE__) - -// Deprecated; for backward compatibility. -inline void InitFst(const char *usage, int *argc, char ***argv, bool rmflags) { - return SetFlags(usage, argc, argv, rmflags); -} - -void ShowUsage(bool long_usage = true); - -#endif // FST_LIB_FLAGS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/patch/openfst/src/include/fst/log.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/patch/openfst/src/include/fst/log.h deleted file mode 100644 index bf041c58ebfab73d03bb14adf28c7c7916a2217d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/patch/openfst/src/include/fst/log.h +++ /dev/null @@ -1,82 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// See www.openfst.org for extensive documentation on this weighted -// finite-state transducer library. -// -// Google-style logging declarations and inline definitions. - -#ifndef FST_LIB_LOG_H_ -#define FST_LIB_LOG_H_ - -#include -#include -#include - -#include -#include - -using std::string; - -DECLARE_int32(v); - -class LogMessage { - public: - LogMessage(const string &type) : fatal_(type == "FATAL") { - std::cerr << type << ": "; - } - ~LogMessage() { - std::cerr << std::endl; - if(fatal_) - exit(1); - } - std::ostream &stream() { return std::cerr; } - - private: - bool fatal_; -}; - -// #define LOG(type) LogMessage(#type).stream() -// #define VLOG(level) if ((level) <= FLAGS_v) LOG(INFO) - -// Checks -inline void FstCheck(bool x, const char* expr, - const char *file, int line) { - if (!x) { - LOG(FATAL) << "Check failed: \"" << expr - << "\" file: " << file - << " line: " << line; - } -} - -// #define CHECK(x) FstCheck(static_cast(x), #x, __FILE__, __LINE__) -// #define CHECK_EQ(x, y) CHECK((x) == (y)) -// #define CHECK_LT(x, y) CHECK((x) < (y)) -// #define CHECK_GT(x, y) CHECK((x) > (y)) -// #define CHECK_LE(x, y) CHECK((x) <= (y)) -// #define CHECK_GE(x, y) CHECK((x) >= (y)) -// #define CHECK_NE(x, y) CHECK((x) != (y)) - -// Debug checks -// #define DCHECK(x) assert(x) -// #define DCHECK_EQ(x, y) DCHECK((x) == (y)) -// #define DCHECK_LT(x, y) DCHECK((x) < (y)) -// #define DCHECK_GT(x, y) DCHECK((x) > (y)) -// #define DCHECK_LE(x, y) DCHECK((x) <= (y)) -// #define DCHECK_GE(x, y) DCHECK((x) >= (y)) -// #define DCHECK_NE(x, y) DCHECK((x) != (y)) - - -// Ports -#define ATTRIBUTE_DEPRECATED __attribute__((deprecated)) - -#endif // FST_LIB_LOG_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/post_processor/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/post_processor/CMakeLists.txt deleted file mode 100644 index 
6113bbc26eb8fe35e4e17ffd1cab382f0fb0f1f8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/post_processor/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -add_library(post_processor STATIC - post_processor.cc -) -target_link_libraries(post_processor PUBLIC utils) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/post_processor/post_processor.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/post_processor/post_processor.cc deleted file mode 100644 index 315f62d34cbc441ecbaf7c07667eb35ee61c2c8d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/post_processor/post_processor.cc +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License - -#include "post_processor/post_processor.h" - -#include -#include - -#include "utils/string.h" - -namespace wenet { - -std::string PostProcessor::ProcessSpace(const std::string& str) { - std::string result = str; - // 1. remove ' ' if needed - // only spaces between mandarin words need to be removed, please note that - // if str contains '_', we assume that the decoding type must be - // `CtcPrefixBeamSearch` and this branch will do nothing since str must be - // obtained via "".join() (in function `AsrDecoder::UpdateResult()`) - if (opts_.language_type == kMandarinEnglish && !str.empty()) { - result.clear(); - // split str by ' ' - std::vector words; - std::stringstream ss(str); - std::string tmp; - while (ss >> tmp) { - words.push_back(tmp); - } - // check english word - bool is_englishword_prev = false; - bool is_englishword_now = false; - for (std::string& w : words) { - is_englishword_now = CheckEnglishWord(w); - if (is_englishword_prev && is_englishword_now) { - result += (' ' + w); - } else { - result += (w); - } - is_englishword_prev = is_englishword_now; - } - } - // 2. 
replace '_' with ' ' - // this should be done for all cases (both kMandarinEnglish and kIndoEuropean) - result = ProcessBlank(result, opts_.lowercase); - return result; -} - -std::string PostProcessor::Process(const std::string& str, bool finish) { - std::string result; - result = ProcessSpace(str); - // TODO(xcsong): do itn/punctuation if finish == true - return result; -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/post_processor/post_processor.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/post_processor/post_processor.h deleted file mode 100644 index 54597845ebc88ad22e1244d2e693e2088cff6d21..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/post_processor/post_processor.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License - -#ifndef POST_PROCESSOR_POST_PROCESSOR_H_ -#define POST_PROCESSOR_POST_PROCESSOR_H_ - -#include -#include -#include - -#include "utils/utils.h" - -namespace wenet { - -enum LanguageType { - // spaces between **mandarin words** should be removed. - // cases of processing spaces with mandarin-only, english-only - // and mandarin-english code-switch can be found in post_processor_test.cc - kMandarinEnglish = 0x00, - // spaces should be kept for most of the - // Indo-European languages (i.e., deutsch or english-deutsch code-switch). - // cases of those languages can be found in post_processor_test.cc - kIndoEuropean = 0x01 -}; - -struct PostProcessOptions { - // space options - // The decoded result may contain spaces (' ' or '_'), - // we will process those spaces according to language_type. 
More details can - // be found in - // https://github.com/wenet-e2e/wenet/issues/583#issuecomment-907994058 - LanguageType language_type = kMandarinEnglish; - // whether lowercase letters are required - bool lowercase = true; -}; - -// TODO(xcsong): add itn/punctuation related resource -struct PostProcessResource {}; - -// Post Processor -class PostProcessor { - public: - explicit PostProcessor(PostProcessOptions&& opts) : opts_(std::move(opts)) {} - explicit PostProcessor(const PostProcessOptions& opts) : opts_(opts) {} - // call other functions to do post processing - std::string Process(const std::string& str, bool finish); - // process spaces according to configurations - std::string ProcessSpace(const std::string& str); - // TODO(xcsong): add itn/punctuation - // void InverseTN(const std::string& str); - // void Punctuate(const std::string& str); - - private: - const PostProcessOptions opts_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(PostProcessor); -}; - -} // namespace wenet - -#endif // POST_PROCESSOR_POST_PROCESSOR_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/test/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/test/CMakeLists.txt deleted file mode 100644 index 145654105350e91a5f9121b47197f5fc60663f5c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/test/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -link_libraries(gtest_main gmock) - -add_executable(utils_test utils_test.cc) -target_link_libraries(utils_test PUBLIC utils) -add_test(UTILS_TEST utils_test) - -add_executable(ctc_prefix_beam_search_test ctc_prefix_beam_search_test.cc) -target_link_libraries(ctc_prefix_beam_search_test PUBLIC decoder) -add_test(CTC_PREFIX_BEAM_SEARCH_TEST ctc_prefix_beam_search_test) - -add_executable(post_processor_test post_processor_test.cc) -target_link_libraries(post_processor_test PUBLIC post_processor) -add_test(POST_PROCESSOR_TEST post_processor_test) - - -add_executable(feature_pipeline_test feature_pipeline_test.cc) -target_link_libraries(feature_pipeline_test PUBLIC frontend) -add_test(FEATURE_PIPELINE_TEST feature_pipeline_test) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/test/ctc_prefix_beam_search_test.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/test/ctc_prefix_beam_search_test.cc deleted file mode 100644 index d8f3b65693b934beb33f3a770795f0b6e7ce3456..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/test/ctc_prefix_beam_search_test.cc +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include "decoder/ctc_prefix_beam_search.h" - -#include -#include - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -#include "utils/utils.h" - -TEST(CtcPrefixBeamSearchTest, CtcPrefixBeamSearchLogicTest) { - using ::testing::ElementsAre; - // See https://robin1001.github.io/2020/12/11/ctc-search for the - // graph demonstration of the data - std::vector> data = { - {0.25, 0.40, 0.35}, {0.40, 0.35, 0.25}, {0.10, 0.50, 0.40}}; - // Apply log - for (int i = 0; i < data.size(); i++) { - for (int j = 0; j < data[i].size(); j++) { - data[i][j] = std::log(data[i][j]); - } - } - wenet::CtcPrefixBeamSearchOptions option; - option.first_beam_size = 3; - option.second_beam_size = 3; - wenet::CtcPrefixBeamSearch prefix_beam_search(option); - prefix_beam_search.Search(data); - /* Test case info - | top k | result index | prefix score | viterbi score | timestamp | - |-------|--------------|--------------|---------------|-----------| - | top 1 | [2, 1] | 0.2185 | 0.07 | [0, 2] | - | top 2 | [1, 2] | 0.1550 | 0.064 | [0, 2] | - | top 3 | [1] | 0.1525 | 0.07 | [2] | - */ - const std::vector>& result = prefix_beam_search.Outputs(); - EXPECT_EQ(result.size(), 3); - ASSERT_THAT(result[0], ElementsAre(2, 1)); - ASSERT_THAT(result[1], ElementsAre(1, 2)); - ASSERT_THAT(result[2], ElementsAre(1)); - - const std::vector& likelihood = prefix_beam_search.Likelihood(); - EXPECT_EQ(likelihood.size(), 3); - EXPECT_FLOAT_EQ(std::exp(likelihood[0]), 0.2185); - EXPECT_FLOAT_EQ(std::exp(likelihood[1]), 0.1550); - EXPECT_FLOAT_EQ(std::exp(likelihood[2]), 0.1525); - - const std::vector& viterbi_likelihood = - prefix_beam_search.viterbi_likelihood(); - EXPECT_EQ(viterbi_likelihood.size(), 3); - EXPECT_FLOAT_EQ(std::exp(viterbi_likelihood[0]), 0.07); - EXPECT_FLOAT_EQ(std::exp(viterbi_likelihood[1]), 0.064); - EXPECT_FLOAT_EQ(std::exp(viterbi_likelihood[2]), 0.07); - - const std::vector>& times = prefix_beam_search.Times(); - EXPECT_EQ(times.size(), 3); - ASSERT_THAT(times[0], ElementsAre(0, 2)); - ASSERT_THAT(times[1], ElementsAre(0, 2)); - ASSERT_THAT(times[2], ElementsAre(2)); -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/test/feature_pipeline_test.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/test/feature_pipeline_test.cc deleted file mode 100644 index 244ec0735b6086211b476e8d97569e1ee5959bc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/test/feature_pipeline_test.cc +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) 2022 Roney -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include - -#include "frontend/feature_pipeline.h" -#include "utils/blocking_queue.h" - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -void pushQueue(const std::shared_ptr>& que, - std::vector vec) { - que->Push(vec); -} - -void popQueue(const std::shared_ptr>& que, int num, - int back_data) { - auto pop_data = que->Pop(num); - ASSERT_EQ(pop_data[num - 1], back_data); -} - -TEST(FeaturePipelineTest, BlockingQueueTest) { - auto capacity_queue = std::make_shared>(2); - std::vector test_data{1, 2, 3, 4, 5}; - std::thread push_thread(&pushQueue, capacity_queue, test_data); - ASSERT_EQ(capacity_queue->Pop(), 1); - ASSERT_LE(capacity_queue->Size(), 2); // capacity_queue: 2 or 2,3 - auto pop_data = capacity_queue->Pop(3); // 2,3,4 num > capacity - ASSERT_EQ(pop_data.size(), 3); - ASSERT_EQ(pop_data[2], 4); - push_thread.join(); - ASSERT_EQ(capacity_queue->Size(), 1); // capacity_queue:5 - - std::thread pop_thread(&popQueue, capacity_queue, 3, 0); // num > capacity - capacity_queue->Push(9); // capacity_queue:5,9 - capacity_queue->Push(0); // capacity_queue:5,9,0 - pop_thread.join(); // capacity_queue: - ASSERT_EQ(capacity_queue->Size(), 0); - - pop_data = capacity_queue->Pop(0); - ASSERT_TRUE(pop_data.empty()); -} - -TEST(FeaturePipelineTest, PipelineTest) { - wenet::FeaturePipelineConfig config(80, 8000); - wenet::FeaturePipeline feature_pipeline(config); - int audio_len = 8 * 55; // audio len 55ms,4 frames - std::vector pcm(audio_len, 0); - feature_pipeline.AcceptWaveform(pcm.data(), audio_len); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 4); - - std::vector> out_feats; - auto b = feature_pipeline.Read(2, &out_feats); - ASSERT_TRUE(b); - ASSERT_EQ(out_feats.size(), 2); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 2); - - std::vector out_feat; - b = feature_pipeline.ReadOne(&out_feat); - ASSERT_TRUE(b); - ASSERT_FALSE(out_feat.empty()); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 1); - - feature_pipeline.set_input_finished(); - b = feature_pipeline.Read(2, &out_feats); - ASSERT_FALSE(b); - ASSERT_EQ(out_feats.size(), 1); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 0); - - feature_pipeline.AcceptWaveform(pcm.data(), audio_len); - feature_pipeline.Read(2, &out_feats); - feature_pipeline.Reset(); - feature_pipeline.set_input_finished(); - b = feature_pipeline.Read(2, &out_feats); - ASSERT_FALSE(b); - ASSERT_EQ(out_feats.size(), 0); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 0); -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/test/post_processor_test.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/test/post_processor_test.cc deleted file mode 100644 index fa11fa29231032d62389a93fd00b0ec782bf8a3b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/test/post_processor_test.cc +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License - -#include "post_processor/post_processor.h" - -#include -#include - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -#include "utils/utils.h" - -TEST(PostProcessorTest, ProcessSpacekMandarinEnglishTest) { - wenet::PostProcessOptions opts_lowercase; - wenet::PostProcessor post_processor_lowercase(opts_lowercase); - - wenet::PostProcessOptions opts_uppercase; - opts_uppercase.lowercase = false; - wenet::PostProcessor post_processor_uppercase(opts_uppercase); - - std::vector input = { - // modeling unit: mandarin character - // decode type: CtcPrefixBeamSearch, "".join() - "震东好帅", - // modeling unit: mandarin word - // decode type: CtcWfstBeamSearch, " ".join() - " 吴迪 也 好帅", - // modeling unit: english wordpiece - // decode type: CtcPrefixBeamSearch, "".join() - "▁binbin▁is▁also▁handsome", - // modeling unit: english word - // decode type: CtcWfstBeamSearch, " ".join() - " life is short i use wenet", - // modeling unit: mandarin character + english wordpiece - // decode type: CtcPrefixBeamSearch, "".join() - "超哥▁is▁the▁most▁handsome", - // modeling unit: mandarin word + english word - // decode type: CtcWfstBeamSearch, " ".join() - " 人生 苦短 i use wenet", - }; - - std::vector result_lowercase = { - "震东好帅", - "吴迪也好帅", - "binbin is also handsome", - "life is short i use wenet", - "超哥 is the most handsome", - "人生苦短i use wenet", - }; - - std::vector result_uppercase = { - "震东好帅", - "吴迪也好帅", - "BINBIN IS ALSO HANDSOME", - "LIFE IS SHORT I USE WENET", - "超哥 IS THE MOST HANDSOME", - "人生苦短I USE WENET", - }; - - for (size_t i = 0; i < input.size(); ++i) { - EXPECT_EQ(post_processor_lowercase.ProcessSpace(input[i]), - result_lowercase[i]); - EXPECT_EQ(post_processor_uppercase.ProcessSpace(input[i]), - result_uppercase[i]); - } -} - -TEST(PostProcessorTest, ProcessSpacekIndoEuropeanTest) { - wenet::PostProcessOptions opts_lowercase; - opts_lowercase.language_type = wenet::kIndoEuropean; - wenet::PostProcessor post_processor_lowercase(opts_lowercase); - - wenet::PostProcessOptions opts_uppercase; - opts_uppercase.language_type = wenet::kIndoEuropean; - opts_uppercase.lowercase = false; - wenet::PostProcessor post_processor_uppercase(opts_uppercase); - - std::vector input = { - // modeling unit: wordpiece - // decode type: CtcPrefixBeamSearch, "".join() - "▁zhendong▁ist▁so▁schön", - // modeling unit: word - // decode type: CtcWfstBeamSearch, " ".join() - " zhendong ist so schön"}; - - std::vector result_lowercase = {"zhendong ist so schön", - "zhendong ist so schön"}; - - std::vector result_uppercase = {"ZHENDONG IST SO SCHÖN", - "ZHENDONG IST SO SCHÖN"}; - - for (size_t i = 0; i < input.size(); ++i) { - EXPECT_EQ(post_processor_lowercase.ProcessSpace(input[i]), - result_lowercase[i]); - EXPECT_EQ(post_processor_uppercase.ProcessSpace(input[i]), - result_uppercase[i]); - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/test/utils_test.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/test/utils_test.cc deleted file mode 100644 index 6b2bbac25e000ce854d5e55a50cb51109d62d758..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/test/utils_test.cc +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "utils/utils.h" - -#include - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -TEST(UtilsTest, TopKTest) { - using ::testing::ElementsAre; - using ::testing::FloatNear; - using ::testing::Pointwise; - std::vector data = {1, 3, 5, 7, 9, 2, 4, 6, 8, 10}; - std::vector values; - std::vector indices; - wenet::TopK(data, 3, &values, &indices); - EXPECT_THAT(values, Pointwise(FloatNear(1e-8), {10, 9, 8})); - ASSERT_THAT(indices, ElementsAre(9, 4, 8)); -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/CMakeLists.txt deleted file mode 100644 index 686362688c050d48224ca0a01e0d24b03d94758a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -add_library(utils STATIC - string.cc - utils.cc -) - -if(NOT ANDROID) - if(MSVC) - target_link_libraries(utils PUBLIC fst) - else() - target_link_libraries(utils PUBLIC fst dl) - endif() -endif() \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/blocking_queue.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/blocking_queue.h deleted file mode 100644 index 9bf0127d9298fbfae2eeebb9431c680fc5dd7647..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/blocking_queue.h +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef UTILS_BLOCKING_QUEUE_H_ -#define UTILS_BLOCKING_QUEUE_H_ - -#include -#include -#include -#include -#include -#include - -#include "utils/utils.h" - -namespace wenet { - -template -class BlockingQueue { - public: - explicit BlockingQueue(size_t capacity = std::numeric_limits::max()) - : capacity_(capacity) {} - - void Push(const T& value) { - { - std::unique_lock lock(mutex_); - while (queue_.size() >= capacity_) { - not_full_condition_.wait(lock); - } - queue_.push(value); - } - not_empty_condition_.notify_one(); - } - - void Push(T&& value) { - { - std::unique_lock lock(mutex_); - while (queue_.size() >= capacity_) { - not_full_condition_.wait(lock); - } - queue_.push(std::move(value)); - } - not_empty_condition_.notify_one(); - } - - void Push(const std::vector& values) { - { - std::unique_lock lock(mutex_); - for (auto& value : values) { - while (queue_.size() >= capacity_) { - not_empty_condition_.notify_one(); - not_full_condition_.wait(lock); - } - queue_.push(value); - } - } - not_empty_condition_.notify_one(); - } - - void Push(std::vector&& values) { - std::unique_lock lock(mutex_); - for (auto& value : values) { - while (queue_.size() >= capacity_) { - not_empty_condition_.notify_one(); - not_full_condition_.wait(lock); - } - queue_.push(std::move(value)); - } - not_empty_condition_.notify_one(); - } - - T Pop() { - std::unique_lock lock(mutex_); - while (queue_.empty()) { - not_empty_condition_.wait(lock); - } - T t(std::move(queue_.front())); - queue_.pop(); - not_full_condition_.notify_one(); - return t; - } - - // num can be greater than capacity,but it needs to be used with care - std::vector Pop(size_t num) { - std::unique_lock lock(mutex_); - std::vector block_data; - while (block_data.size() < num) { - while (queue_.empty()) { - not_full_condition_.notify_one(); - not_empty_condition_.wait(lock); - } - block_data.push_back(std::move(queue_.front())); - queue_.pop(); - } - not_full_condition_.notify_one(); - return block_data; - } - - bool Empty() const { - std::lock_guard lock(mutex_); - return queue_.empty(); - } - - size_t Size() const { - std::lock_guard lock(mutex_); - return queue_.size(); - } - - void Clear() { - while (!Empty()) { - Pop(); - } - } - - private: - size_t capacity_; - mutable std::mutex mutex_; - std::condition_variable not_full_condition_; - std::condition_variable not_empty_condition_; - std::queue queue_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(BlockingQueue); -}; - -} // namespace wenet - -#endif // UTILS_BLOCKING_QUEUE_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/file.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/file.h deleted file mode 100644 index 83ad9c8c52fecd334b3549285bf39cd4f59b9f2b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/file.h +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_FILE_H_ -#define UTILS_FILE_H_ - -#include -#include - -namespace wenet { - -inline bool FileExists(const std::string& path) { - std::ifstream f(path.c_str()); - return f.good(); -} - -} // namespace wenet - -#endif // UTILS_FILE_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/flags.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/flags.h deleted file mode 100644 index 3432aa78847322edec8d6d2aec59ed7ca5352fcd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/flags.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_FLAGS_H_ -#define UTILS_FLAGS_H_ - -// Because openfst is a dynamic library compiled with gflags/glog, we must use -// the gflags/glog from openfst to avoid them linked both statically and -// dynamically into the executable. -#include "fst/flags.h" - -#endif // UTILS_FLAGS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/json.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/json.h deleted file mode 100644 index bf8d94a3e42504139b10daa39b8f8e7a8b2d93cc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/json.h +++ /dev/null @@ -1,754 +0,0 @@ -// Copyright (c) From https://github.com/nbsdx/SimpleJSON -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef UTILS_JSON_H_ -#define UTILS_JSON_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace json { - -using std::deque; -using std::enable_if; -using std::initializer_list; -using std::is_convertible; -using std::is_floating_point; -using std::is_integral; -using std::is_same; -using std::map; -using std::string; - -namespace { // NOLINT -string json_escape(const string& str) { - string output; - for (unsigned i = 0; i < str.length(); ++i) switch (str[i]) { - case '\"': - output += "\\\""; - break; - case '\\': - output += "\\\\"; - break; - case '\b': - output += "\\b"; - break; - case '\f': - output += "\\f"; - break; - case '\n': - output += "\\n"; - break; - case '\r': - output += "\\r"; - break; - case '\t': - output += "\\t"; - break; - default: - output += str[i]; - break; - } - return std::move(output); -} -} // namespace - -class JSON { - union BackingData { - BackingData(double d) : Float(d) {} - BackingData(int l) : Int(l) {} - BackingData(bool b) : Bool(b) {} - BackingData(string s) : String(new string(s)) {} - BackingData() : Int(0) {} - - deque* List; - map* Map; - string* String; - double Float; - int Int; - bool Bool; - } Internal; - - public: - enum class Class { Null, Object, Array, String, Floating, Integral, Boolean }; - - template - class JSONWrapper { - Container* object; - - public: - explicit JSONWrapper(Container* val) : object(val) {} - explicit JSONWrapper(std::nullptr_t) : object(nullptr) {} - - typename Container::iterator begin() { - return object ? object->begin() : typename Container::iterator(); - } - typename Container::iterator end() { - return object ? object->end() : typename Container::iterator(); - } - typename Container::const_iterator begin() const { - return object ? object->begin() : typename Container::iterator(); - } - typename Container::const_iterator end() const { - return object ? object->end() : typename Container::iterator(); - } - }; - - template - class JSONConstWrapper { - const Container* object; - - public: - explicit JSONConstWrapper(const Container* val) : object(val) {} - explicit JSONConstWrapper(std::nullptr_t) : object(nullptr) {} - - typename Container::const_iterator begin() const { - return object ? object->begin() : typename Container::const_iterator(); - } - typename Container::const_iterator end() const { - return object ? 
object->end() : typename Container::const_iterator(); - } - }; - - JSON() : Internal(), Type(Class::Null) {} - - explicit JSON(initializer_list list) : JSON() { - SetType(Class::Object); - for (auto i = list.begin(), e = list.end(); i != e; ++i, ++i) - operator[](i->ToString()) = *std::next(i); - } - - JSON(JSON&& other) : Internal(other.Internal), Type(other.Type) { - other.Type = Class::Null; - other.Internal.Map = nullptr; - } - - JSON& operator=(JSON&& other) { - ClearInternal(); - Internal = other.Internal; - Type = other.Type; - other.Internal.Map = nullptr; - other.Type = Class::Null; - return *this; - } - - JSON(const JSON& other) { - switch (other.Type) { - case Class::Object: - Internal.Map = new map(other.Internal.Map->begin(), - other.Internal.Map->end()); - break; - case Class::Array: - Internal.List = new deque(other.Internal.List->begin(), - other.Internal.List->end()); - break; - case Class::String: - Internal.String = new string(*other.Internal.String); - break; - default: - Internal = other.Internal; - } - Type = other.Type; - } - - JSON& operator=(const JSON& other) { - ClearInternal(); - switch (other.Type) { - case Class::Object: - Internal.Map = new map(other.Internal.Map->begin(), - other.Internal.Map->end()); - break; - case Class::Array: - Internal.List = new deque(other.Internal.List->begin(), - other.Internal.List->end()); - break; - case Class::String: - Internal.String = new string(*other.Internal.String); - break; - default: - Internal = other.Internal; - } - Type = other.Type; - return *this; - } - - ~JSON() { - switch (Type) { - case Class::Array: - delete Internal.List; - break; - case Class::Object: - delete Internal.Map; - break; - case Class::String: - delete Internal.String; - break; - default: { - }; - } - } - - template - explicit JSON(T b, typename enable_if::value>::type* = 0) - : Internal(b), Type(Class::Boolean) {} - - template - explicit JSON(T i, typename enable_if::value && - !is_same::value>::type* = 0) - : Internal(static_cast(i)), Type(Class::Integral) {} - - template - explicit JSON(T f, typename enable_if::value>::type* = 0) - : Internal(static_cast(f)), Type(Class::Floating) {} - - template - explicit JSON(T s, - typename enable_if::value>::type* = 0) - : Internal(string(s)), Type(Class::String) {} - - explicit JSON(std::nullptr_t) : Internal(), Type(Class::Null) {} - - static JSON Make(Class type) { - JSON ret; - ret.SetType(type); - return ret; - } - - static JSON Load(const string&); - - template - void append(T arg) { - SetType(Class::Array); - Internal.List->emplace_back(arg); - } - - template - void append(T arg, U... 
args) { - append(arg); - append(args...); - } - - template - typename enable_if::value, JSON&>::type operator=(T b) { - SetType(Class::Boolean); - Internal.Bool = b; - return *this; - } - - template - typename enable_if::value && !is_same::value, - JSON&>::type - operator=(T i) { - SetType(Class::Integral); - Internal.Int = i; - return *this; - } - - template - typename enable_if::value, JSON&>::type operator=(T f) { - SetType(Class::Floating); - Internal.Float = f; - return *this; - } - - template - typename enable_if::value, JSON&>::type operator=( - T s) { - SetType(Class::String); - *Internal.String = string(s); - return *this; - } - - JSON& operator[](const string& key) { - SetType(Class::Object); - return Internal.Map->operator[](key); - } - - JSON& operator[](unsigned index) { - SetType(Class::Array); - if (index >= Internal.List->size()) Internal.List->resize(index + 1); - return Internal.List->operator[](index); - } - - JSON& at(const string& key) { return operator[](key); } - - const JSON& at(const string& key) const { return Internal.Map->at(key); } - - JSON& at(unsigned index) { return operator[](index); } - - const JSON& at(unsigned index) const { return Internal.List->at(index); } - - int length() const { - if (Type == Class::Array) - return Internal.List->size(); - else - return -1; - } - - bool hasKey(const string& key) const { - if (Type == Class::Object) - return Internal.Map->find(key) != Internal.Map->end(); - return false; - } - - int size() const { - if (Type == Class::Object) - return Internal.Map->size(); - else if (Type == Class::Array) - return Internal.List->size(); - else - return -1; - } - - Class JSONType() const { return Type; } - - /// Functions for getting primitives from the JSON object. - bool IsNull() const { return Type == Class::Null; } - - string ToString() const { - bool b; - return std::move(ToString(&b)); - } - string ToString(bool* ok) const { - *ok = (Type == Class::String); - return *ok ? std::move(json_escape(*Internal.String)) : string(""); - } - - double ToFloat() const { - bool b; - return ToFloat(&b); - } - double ToFloat(bool* ok) const { - *ok = (Type == Class::Floating); - return *ok ? Internal.Float : 0.0; - } - - int ToInt() const { - bool b; - return ToInt(&b); - } - int ToInt(bool* ok) const { - *ok = (Type == Class::Integral); - return *ok ? Internal.Int : 0; - } - - bool ToBool() const { - bool b; - return ToBool(&b); - } - bool ToBool(bool* ok) const { - *ok = (Type == Class::Boolean); - return *ok ? 
Internal.Bool : false; - } - - JSONWrapper> ObjectRange() { - if (Type == Class::Object) - return JSONWrapper>(Internal.Map); - return JSONWrapper>(nullptr); - } - - JSONWrapper> ArrayRange() { - if (Type == Class::Array) return JSONWrapper>(Internal.List); - return JSONWrapper>(nullptr); - } - - JSONConstWrapper> ObjectRange() const { - if (Type == Class::Object) - return JSONConstWrapper>(Internal.Map); - return JSONConstWrapper>(nullptr); - } - - JSONConstWrapper> ArrayRange() const { - if (Type == Class::Array) - return JSONConstWrapper>(Internal.List); - return JSONConstWrapper>(nullptr); - } - - string dump(int depth = 1, string tab = " ") const { - string pad = ""; - for (int i = 0; i < depth; ++i, pad += tab) { - } - - switch (Type) { - case Class::Null: - return "null"; - case Class::Object: { - string s = "{\n"; - bool skip = true; - for (auto& p : *Internal.Map) { - if (!skip) s += ",\n"; - s += (pad + "\"" + p.first + "\" : " + p.second.dump(depth + 1, tab)); - skip = false; - } - s += ("\n" + pad.erase(0, 2) + "}"); - return s; - } - case Class::Array: { - string s = "["; - bool skip = true; - for (auto& p : *Internal.List) { - if (!skip) s += ", "; - s += p.dump(depth + 1, tab); - skip = false; - } - s += "]"; - return s; - } - case Class::String: - return "\"" + json_escape(*Internal.String) + "\""; - case Class::Floating: - return std::to_string(Internal.Float); - case Class::Integral: - return std::to_string(Internal.Int); - case Class::Boolean: - return Internal.Bool ? "true" : "false"; - default: - return ""; - } - return ""; - } - - friend std::ostream& operator<<(std::ostream&, const JSON&); - - private: - void SetType(Class type) { - if (type == Type) return; - - ClearInternal(); - - switch (type) { - case Class::Null: - Internal.Map = nullptr; - break; - case Class::Object: - Internal.Map = new map(); - break; - case Class::Array: - Internal.List = new deque(); - break; - case Class::String: - Internal.String = new string(); - break; - case Class::Floating: - Internal.Float = 0.0; - break; - case Class::Integral: - Internal.Int = 0; - break; - case Class::Boolean: - Internal.Bool = false; - break; - } - - Type = type; - } - - private: - /* beware: only call if YOU know that Internal is allocated. No checks - performed here. This function should be called in a constructed JSON just - before you are going to overwrite Internal... -*/ - void ClearInternal() { - switch (Type) { - case Class::Object: - delete Internal.Map; - break; - case Class::Array: - delete Internal.List; - break; - case Class::String: - delete Internal.String; - break; - default: { - }; - } - } - - private: - Class Type = Class::Null; -}; - -JSON Array() { return std::move(JSON::Make(JSON::Class::Array)); } - -template -JSON Array(T... 
args) { - JSON arr = JSON::Make(JSON::Class::Array); - arr.append(args...); - return std::move(arr); -} - -JSON Object() { return std::move(JSON::Make(JSON::Class::Object)); } - -std::ostream& operator<<(std::ostream& os, const JSON& json) { - os << json.dump(); - return os; -} - -namespace { // NOLINT -JSON parse_next(const string&, size_t&); - -void consume_ws(const string& str, size_t& offset) { // NOLINT - while (isspace(str[offset])) ++offset; -} - -JSON parse_object(const string& str, size_t& offset) { // NOLINT - JSON Object = JSON::Make(JSON::Class::Object); - - ++offset; - consume_ws(str, offset); - if (str[offset] == '}') { - ++offset; - return std::move(Object); - } - - while (true) { - JSON Key = parse_next(str, offset); - consume_ws(str, offset); - if (str[offset] != ':') { - std::cerr << "Error: Object: Expected colon, found '" << str[offset] - << "'\n"; - break; - } - consume_ws(str, ++offset); - JSON Value = parse_next(str, offset); - Object[Key.ToString()] = Value; - - consume_ws(str, offset); - if (str[offset] == ',') { - ++offset; - continue; - } else if (str[offset] == '}') { - ++offset; - break; - } else { - std::cerr << "ERROR: Object: Expected comma, found '" << str[offset] - << "'\n"; - break; - } - } - - return std::move(Object); -} - -JSON parse_array(const string& str, size_t& offset) { // NOLINT - JSON Array = JSON::Make(JSON::Class::Array); - unsigned index = 0; - - ++offset; - consume_ws(str, offset); - if (str[offset] == ']') { - ++offset; - return std::move(Array); - } - - while (true) { - Array[index++] = parse_next(str, offset); - consume_ws(str, offset); - - if (str[offset] == ',') { - ++offset; - continue; - } else if (str[offset] == ']') { - ++offset; - break; - } else { - std::cerr << "ERROR: Array: Expected ',' or ']', found '" << str[offset] - << "'\n"; - return std::move(JSON::Make(JSON::Class::Array)); - } - } - - return std::move(Array); -} - -JSON parse_string(const string& str, size_t& offset) { // NOLINT - JSON String; - string val; - for (char c = str[++offset]; c != '\"'; c = str[++offset]) { - if (c == '\\') { - switch (str[++offset]) { - case '\"': - val += '\"'; - break; - case '\\': - val += '\\'; - break; - case '/': - val += '/'; - break; - case 'b': - val += '\b'; - break; - case 'f': - val += '\f'; - break; - case 'n': - val += '\n'; - break; - case 'r': - val += '\r'; - break; - case 't': - val += '\t'; - break; - case 'u': { - val += "\\u"; - for (unsigned i = 1; i <= 4; ++i) { - c = str[offset + i]; - if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || - (c >= 'A' && c <= 'F')) { - val += c; - } else { - std::cerr << "ERROR: String: Expected hex character in unicode " - "escape, found '" - << c << "'\n"; - return std::move(JSON::Make(JSON::Class::String)); - } - } - offset += 4; - } break; - default: - val += '\\'; - break; - } - } else { - val += c; - } - } - ++offset; - String = val; - return std::move(String); -} - -JSON parse_number(const string& str, size_t& offset) { // NOLINT - JSON Number; - string val, exp_str; - char c; - bool isDouble = false; - int exp = 0; - while (true) { - c = str[offset++]; - if ((c == '-') || (c >= '0' && c <= '9')) { - val += c; - } else if (c == '.') { - val += c; - isDouble = true; - } else { - break; - } - } - if (c == 'E' || c == 'e') { - c = str[offset++]; - if (c == '-') { - ++offset; - exp_str += '-'; - } - while (true) { - c = str[offset++]; - if (c >= '0' && c <= '9') { - exp_str += c; - } else if (!isspace(c) && c != ',' && c != ']' && c != '}') { - std::cerr << "ERROR: Number: 
Expected a number for exponent, found '" - << c << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } else { - break; - } - } - exp = std::stol(exp_str); - } else if (!isspace(c) && c != ',' && c != ']' && c != '}') { - std::cerr << "ERROR: Number: unexpected character '" << c << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } - --offset; - - if (isDouble) { - Number = std::stod(val) * std::pow(10, exp); - } else { - if (!exp_str.empty()) - Number = std::stol(val) * std::pow(10, exp); - else - Number = std::stol(val); - } - return std::move(Number); -} - -JSON parse_bool(const string& str, size_t& offset) { // NOLINT - JSON Bool; - if (str.substr(offset, 4) == "true") { - Bool = true; - } else if (str.substr(offset, 5) == "false") { - Bool = false; - } else { - std::cerr << "ERROR: Bool: Expected 'true' or 'false', found '" - << str.substr(offset, 5) << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } - offset += (Bool.ToBool() ? 4 : 5); - return std::move(Bool); -} - -JSON parse_null(const string& str, size_t& offset) { // NOLINT - JSON Null; - if (str.substr(offset, 4) != "null") { - std::cerr << "ERROR: Null: Expected 'null', found '" - << str.substr(offset, 4) << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } - offset += 4; - return std::move(Null); -} - -JSON parse_next(const string& str, size_t& offset) { // NOLINT - char value; - consume_ws(str, offset); - value = str[offset]; - switch (value) { - case '[': - return std::move(parse_array(str, offset)); - case '{': - return std::move(parse_object(str, offset)); - case '\"': - return std::move(parse_string(str, offset)); - case 't': - case 'f': - return std::move(parse_bool(str, offset)); - case 'n': - return std::move(parse_null(str, offset)); - default: - if ((value <= '9' && value >= '0') || value == '-') - return std::move(parse_number(str, offset)); - } - std::cerr << "ERROR: Parse: Unknown starting character '" << value << "'\n"; - return JSON(); -} -} // namespace - -JSON JSON::Load(const string& str) { - size_t offset = 0; - return std::move(parse_next(str, offset)); -} - -} // namespace json - -#endif // UTILS_JSON_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/log.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/log.h deleted file mode 100644 index c2bf03f261a8711f74da819d80d68e8eb9fb124a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/log.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_LOG_H_ -#define UTILS_LOG_H_ - -// Because openfst is a dynamic library compiled with gflags/glog, we must use -// the gflags/glog from openfst to avoid them linked both statically and -// dynamically into the executable. 
-#include "fst/log.h" - -#endif // UTILS_LOG_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/string.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/string.cc deleted file mode 100644 index 1ab93adf3cac1bc5a42c0b8c6cadbde399678fef..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/string.cc +++ /dev/null @@ -1,195 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "utils/string.h" - -#include -#include -#include - -#include "utils/log.h" -#include "utils/utils.h" - -namespace wenet { - -void SplitString(const std::string& str, std::vector* strs) { - SplitStringToVector(Trim(str), " \t", true, strs); -} - -void SplitStringToVector(const std::string& full, const char* delim, - bool omit_empty_strings, - std::vector* out) { - size_t start = 0, found = 0, end = full.size(); - out->clear(); - while (found != std::string::npos) { - found = full.find_first_of(delim, start); - // start != end condition is for when the delimiter is at the end - if (!omit_empty_strings || (found != start && start != end)) - out->push_back(full.substr(start, found - start)); - start = found + 1; - } -} - -void SplitUTF8StringToChars(const std::string& str, - std::vector* chars) { - chars->clear(); - int bytes = 1; - for (size_t i = 0; i < str.length(); i += bytes) { - assert((str[i] & 0xF8) <= 0xF0); - if ((str[i] & 0x80) == 0x00) { - // The first 128 characters (US-ASCII) in UTF-8 format only need one byte. - bytes = 1; - } else if ((str[i] & 0xE0) == 0xC0) { - // The next 1,920 characters need two bytes to encode, - // which covers the remainder of almost all Latin-script alphabets. - bytes = 2; - } else if ((str[i] & 0xF0) == 0xE0) { - // Three bytes are needed for characters in the rest of - // the Basic Multilingual Plane, which contains virtually all characters - // in common use, including most Chinese, Japanese and Korean characters. - bytes = 3; - } else if ((str[i] & 0xF8) == 0xF0) { - // Four bytes are needed for characters in the other planes of Unicode, - // which include less common CJK characters, various historic scripts, - // mathematical symbols, and emoji (pictographic symbols). 
- bytes = 4; - } - chars->push_back(str.substr(i, bytes)); - } -} - -int UTF8StringLength(const std::string& str) { - int len = 0; - int bytes = 1; - for (size_t i = 0; i < str.length(); i += bytes) { - if ((str[i] & 0x80) == 0x00) { - bytes = 1; - } else if ((str[i] & 0xE0) == 0xC0) { - bytes = 2; - } else if ((str[i] & 0xF0) == 0xE0) { - bytes = 3; - } else if ((str[i] & 0xF8) == 0xF0) { - bytes = 4; - } - ++len; - } - return len; -} - -bool CheckEnglishChar(const std::string& ch) { - // all english characters should be encoded in one byte - if (ch.size() != 1) return false; - // english words may contain apostrophe, i.e., "He's" - return isalpha(ch[0]) || ch[0] == '\''; -} - -bool CheckEnglishWord(const std::string& word) { - std::vector chars; - SplitUTF8StringToChars(word, &chars); - for (size_t k = 0; k < chars.size(); k++) { - if (!CheckEnglishChar(chars[k])) { - return false; - } - } - return true; -} - -std::string JoinString(const std::string& c, - const std::vector& strs) { - std::string result; - if (strs.size() > 0) { - for (int i = 0; i < strs.size() - 1; i++) { - result += (strs[i] + c); - } - result += strs.back(); - } - return result; -} - -bool IsAlpha(const std::string& str) { - for (size_t i = 0; i < str.size(); i++) { - if (!isalpha(str[i])) { - return false; - } - } - return true; -} - -std::string ProcessBlank(const std::string& str, bool lowercase) { - std::string result; - if (!str.empty()) { - std::vector chars; - SplitUTF8StringToChars(Trim(str), &chars); - - for (std::string& ch : chars) { - if (ch != kSpaceSymbol) { - result.append(ch); - } else { - // Ignore consecutive space or located in head - if (!result.empty() && result.back() != ' ') { - result.push_back(' '); - } - } - } - // Ignore tailing space - if (!result.empty() && result.back() == ' ') { - result.pop_back(); - } - // NOTE: convert string to wstring - // see issue 745: https://github.com/wenet-e2e/wenet/issues/745 - std::locale loc(""); - std::wstring_convert, wchar_t> converter; - std::wstring wsresult = converter.from_bytes(result); - for (auto& c : wsresult) { - c = lowercase ? tolower(c, loc) : toupper(c, loc); - } - result = converter.to_bytes(wsresult); - } - return result; -} - -std::string Ltrim(const std::string& str) { - size_t start = str.find_first_not_of(WHITESPACE); - return (start == std::string::npos) ? "" : str.substr(start); -} - -std::string Rtrim(const std::string& str) { - size_t end = str.find_last_not_of(WHITESPACE); - return (end == std::string::npos) ? 
"" : str.substr(0, end + 1); -} - -std::string Trim(const std::string& str) { return Rtrim(Ltrim(str)); } - -std::string JoinPath(const std::string& left, const std::string& right) { - std::string path(left); - if (path.size() && path.back() != '/') { - path.push_back('/'); - } - path.append(right); - return path; -} - -#ifdef _MSC_VER -std::wstring ToWString(const std::string& str) { - unsigned len = str.size() * 2; - setlocale(LC_CTYPE, ""); - wchar_t* p = new wchar_t[len]; - mbstowcs(p, str.c_str(), len); - std::wstring wstr(p); - delete[] p; - return wstr; -} -#endif - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/string.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/string.h deleted file mode 100644 index bf7a52ae09bce45ab7e34a5277652d7ae91bae1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/string.h +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_STRING_H_ -#define UTILS_STRING_H_ - -#include -#include -#include -#include -#include - -#include "fst/symbol-table.h" - -namespace wenet { - -const char WHITESPACE[] = " \n\r\t\f\v"; - -// Split the string with space or tab. -void SplitString(const std::string& str, std::vector* strs); - -void SplitStringToVector(const std::string& full, const char* delim, - bool omit_empty_strings, - std::vector* out); - -// NOTE(Xingchen Song): we add this function to make it possible to -// support multilingual recipe in the future, in which characters of -// different languages are all encoded in UTF-8 format. -// UTF-8 REF: https://en.wikipedia.org/wiki/UTF-8#Encoding -// Split the UTF-8 string into chars. -void SplitUTF8StringToChars(const std::string& str, - std::vector* chars); - -int UTF8StringLength(const std::string& str); - -// Check whether the UTF-8 char is alphabet or '. -bool CheckEnglishChar(const std::string& ch); - -// Check whether the UTF-8 word is only contains alphabet or '. -bool CheckEnglishWord(const std::string& word); - -std::string JoinString(const std::string& c, - const std::vector& strs); - -bool IsAlpha(const std::string& str); - -// Split the UTF-8 string into words by symbol table. -// Return whether not contains oov. -bool SplitUTF8StringToWords( - const std::string& str, - const std::shared_ptr& symbol_table, - std::vector* words); - -// Replace ▁ with space, then remove head, tail and consecutive space. 
-std::string ProcessBlank(const std::string& str, bool lowercase); - -std::string Ltrim(const std::string& str); - -std::string Rtrim(const std::string& str); - -std::string Trim(const std::string& str); - -std::string JoinPath(const std::string& left, const std::string& right); - -#ifdef _MSC_VER -std::wstring ToWString(const std::string& str); -#endif - -} // namespace wenet - -#endif // UTILS_STRING_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/thread_pool.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/thread_pool.h deleted file mode 100644 index a78162995d90bf079ad091cf14cb9f2cd4476d05..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/thread_pool.h +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright (c) 2012 Jakob Progsch, Václav Zeman - -// This software is provided 'as-is', without any express or implied -// warranty. In no event will the authors be held liable for any damages -// arising from the use of this software. - -// Permission is granted to anyone to use this software for any purpose, -// including commercial applications, and to alter it and redistribute it -// freely, subject to the following restrictions: - -// 1. The origin of this software must not be misrepresented; you must not -// claim that you wrote the original software. If you use this software -// in a product, an acknowledgment in the product documentation would be -// appreciated but is not required. - -// 2. Altered source versions must be plainly marked as such, and must not be -// misrepresented as being the original software. - -// 3. This notice may not be removed or altered from any source -// distribution. - -#ifndef UTILS_THREAD_POOL_H_ -#define UTILS_THREAD_POOL_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -class ThreadPool { - public: - explicit ThreadPool(size_t); - template - auto enqueue(F&& f, Args&&... args) - -> std::future::type>; - ~ThreadPool(); - - private: - // need to keep track of threads so we can join them - std::vector workers; - // the task queue - std::queue > tasks; - - // synchronization - std::mutex queue_mutex; - std::condition_variable condition; - bool stop; -}; - -// the constructor just launches some amount of workers -inline ThreadPool::ThreadPool(size_t threads) : stop(false) { - for (size_t i = 0; i < threads; ++i) - workers.emplace_back([this] { - for (;;) { - std::function task; - - { - std::unique_lock lock(this->queue_mutex); - this->condition.wait( - lock, [this] { return this->stop || !this->tasks.empty(); }); - if (this->stop && this->tasks.empty()) return; - task = std::move(this->tasks.front()); - this->tasks.pop(); - } - - task(); - } - }); -} - -// add new work item to the pool -template -auto ThreadPool::enqueue(F&& f, Args&&... 
args) - -> std::future::type> { - using return_type = typename std::result_of::type; - - auto task = std::make_shared >( - std::bind(std::forward(f), std::forward(args)...)); - - std::future res = task->get_future(); - { - std::unique_lock lock(queue_mutex); - - // don't allow enqueueing after stopping the pool - if (stop) { - throw std::runtime_error("enqueue on stopped ThreadPool"); - } - - tasks.emplace([task]() { (*task)(); }); - } - condition.notify_one(); - return res; -} - -// the destructor joins all threads -inline ThreadPool::~ThreadPool() { - { - std::unique_lock lock(queue_mutex); - stop = true; - } - condition.notify_all(); - for (std::thread& worker : workers) { - worker.join(); - } -} - -#endif // UTILS_THREAD_POOL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/timer.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/timer.h deleted file mode 100644 index 068519f98d140ba0eef68babe2ad2fdcb798c074..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/timer.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_TIMER_H_ -#define UTILS_TIMER_H_ - -#include - -namespace wenet { - -class Timer { - public: - Timer() : time_start_(std::chrono::steady_clock::now()) {} - void Reset() { time_start_ = std::chrono::steady_clock::now(); } - // return int in milliseconds - int Elapsed() const { - auto time_now = std::chrono::steady_clock::now(); - return std::chrono::duration_cast(time_now - - time_start_) - .count(); - } - - private: - std::chrono::time_point time_start_; -}; -} // namespace wenet - -#endif // UTILS_TIMER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/utils.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/utils.cc deleted file mode 100644 index c37e36c6e9f629e0a4b11cf21a791aefd58b659f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/utils.cc +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "utils/utils.h" - -#include -#include -#include -#include -#include -#include - -#include "utils/log.h" - -namespace wenet { - -float LogAdd(float x, float y) { - static float num_min = -std::numeric_limits::max(); - if (x <= num_min) return y; - if (y <= num_min) return x; - float xmax = std::max(x, y); - return std::log(std::exp(x - xmax) + std::exp(y - xmax)) + xmax; -} - -template -struct ValueComp { - bool operator()(const std::pair& lhs, - const std::pair& rhs) const { - return lhs.first > rhs.first || - (lhs.first == rhs.first && lhs.second < rhs.second); - } -}; - -// We refer the pytorch topk implementation -// https://github.com/pytorch/pytorch/blob/master/caffe2/operators/top_k.cc -template -void TopK(const std::vector& data, int32_t k, std::vector* values, - std::vector* indices) { - std::vector> heap_data; - int n = data.size(); - for (int32_t i = 0; i < k && i < n; ++i) { - heap_data.emplace_back(data[i], i); - } - std::priority_queue, std::vector>, - ValueComp> - pq(ValueComp(), std::move(heap_data)); - for (int32_t i = k; i < n; ++i) { - if (pq.top().first < data[i]) { - pq.pop(); - pq.emplace(data[i], i); - } - } - - values->resize(std::min(k, n)); - indices->resize(std::min(k, n)); - int32_t cur = values->size() - 1; - while (!pq.empty()) { - const auto& item = pq.top(); - (*values)[cur] = item.first; - (*indices)[cur] = item.second; - pq.pop(); - cur -= 1; - } -} - -template void TopK(const std::vector& data, int32_t k, - std::vector* values, - std::vector* indices); - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/utils.h deleted file mode 100644 index f9957c0b6e8ae27d9260e75cf55e786055827801..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/utils/utils.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef UTILS_UTILS_H_ -#define UTILS_UTILS_H_ - -#include -#include -#include - -namespace wenet { - -#define WENET_DISALLOW_COPY_AND_ASSIGN(Type) \ - Type(const Type&) = delete; \ - Type& operator=(const Type&) = delete; - -const float kFloatMax = std::numeric_limits::max(); -// kSpaceSymbol in UTF-8 is: ▁ -const char kSpaceSymbol[] = "\xe2\x96\x81"; - -// Return the sum of two probabilities in log scale -float LogAdd(float x, float y); - -template -void TopK(const std::vector& data, int32_t k, std::vector* values, - std::vector* indices); - -} // namespace wenet - -#endif // UTILS_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/app.py b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/app.py deleted file mode 100644 index b880cf7ff41509bfd618cdcc26bd402123af2236..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/app.py +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: zhendong.peng@mobvoi.com (Zhendong Peng) - -import argparse - -from flask import Flask, render_template - -parser = argparse.ArgumentParser(description='training your network') -parser.add_argument('--port', default=19999, type=int, help='port id') -args = parser.parse_args() - -app = Flask(__name__) - -@app.route('/') -def index(): - return render_template('index.html') - -if __name__ == '__main__': - app.run(host='0.0.0.0', port=args.port, debug=True) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/css/font-awesome.min.css b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/css/font-awesome.min.css deleted file mode 100644 index 540440ce89f2a408aa699b65100e18f15e0f09ca..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/css/font-awesome.min.css +++ /dev/null @@ -1,4 +0,0 @@ -/*! 
- * Font Awesome 4.7.0 by @davegandy - http://fontawesome.io - @fontawesome - * License - http://fontawesome.io/license (Font: SIL OFL 1.1, CSS: MIT License) - */@font-face{font-family:'FontAwesome';src:url('../fonts/fontawesome-webfont.eot?v=4.7.0');src:url('../fonts/fontawesome-webfont.eot?#iefix&v=4.7.0') format('embedded-opentype'),url('../fonts/fontawesome-webfont.woff2?v=4.7.0') format('woff2'),url('../fonts/fontawesome-webfont.woff?v=4.7.0') format('woff'),url('../fonts/fontawesome-webfont.ttf?v=4.7.0') format('truetype'),url('../fonts/fontawesome-webfont.svg?v=4.7.0#fontawesomeregular') format('svg');font-weight:normal;font-style:normal}.fa{display:inline-block;font:normal normal normal 14px/1 FontAwesome;font-size:inherit;text-rendering:auto;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}.fa-lg{font-size:1.33333333em;line-height:.75em;vertical-align:-15%}.fa-2x{font-size:2em}.fa-3x{font-size:3em}.fa-4x{font-size:4em}.fa-5x{font-size:5em}.fa-fw{width:1.28571429em;text-align:center}.fa-ul{padding-left:0;margin-left:2.14285714em;list-style-type:none}.fa-ul>li{position:relative}.fa-li{position:absolute;left:-2.14285714em;width:2.14285714em;top:.14285714em;text-align:center}.fa-li.fa-lg{left:-1.85714286em}.fa-border{padding:.2em .25em .15em;border:solid .08em #eee;border-radius:.1em}.fa-pull-left{float:left}.fa-pull-right{float:right}.fa.fa-pull-left{margin-right:.3em}.fa.fa-pull-right{margin-left:.3em}.pull-right{float:right}.pull-left{float:left}.fa.pull-left{margin-right:.3em}.fa.pull-right{margin-left:.3em}.fa-spin{-webkit-animation:fa-spin 2s infinite linear;animation:fa-spin 2s infinite linear}.fa-pulse{-webkit-animation:fa-spin 1s infinite steps(8);animation:fa-spin 1s infinite steps(8)}@-webkit-keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}100%{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}@keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}100%{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}.fa-rotate-90{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=1)";-webkit-transform:rotate(90deg);-ms-transform:rotate(90deg);transform:rotate(90deg)}.fa-rotate-180{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2)";-webkit-transform:rotate(180deg);-ms-transform:rotate(180deg);transform:rotate(180deg)}.fa-rotate-270{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=3)";-webkit-transform:rotate(270deg);-ms-transform:rotate(270deg);transform:rotate(270deg)}.fa-flip-horizontal{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=0, mirror=1)";-webkit-transform:scale(-1, 1);-ms-transform:scale(-1, 1);transform:scale(-1, 1)}.fa-flip-vertical{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2, mirror=1)";-webkit-transform:scale(1, -1);-ms-transform:scale(1, -1);transform:scale(1, -1)}:root .fa-rotate-90,:root .fa-rotate-180,:root .fa-rotate-270,:root .fa-flip-horizontal,:root 
.fa-flip-vertical{filter:none}.fa-stack{position:relative;display:inline-block;width:2em;height:2em;line-height:2em;vertical-align:middle}.fa-stack-1x,.fa-stack-2x{position:absolute;left:0;width:100%;text-align:center}.fa-stack-1x{line-height:inherit}.fa-stack-2x{font-size:2em}.fa-inverse{color:#fff}.fa-glass:before{content:"\f000"}.fa-music:before{content:"\f001"}.fa-search:before{content:"\f002"}.fa-envelope-o:before{content:"\f003"}.fa-heart:before{content:"\f004"}.fa-star:before{content:"\f005"}.fa-star-o:before{content:"\f006"}.fa-user:before{content:"\f007"}.fa-film:before{content:"\f008"}.fa-th-large:before{content:"\f009"}.fa-th:before{content:"\f00a"}.fa-th-list:before{content:"\f00b"}.fa-check:before{content:"\f00c"}.fa-remove:before,.fa-close:before,.fa-times:before{content:"\f00d"}.fa-search-plus:before{content:"\f00e"}.fa-search-minus:before{content:"\f010"}.fa-power-off:before{content:"\f011"}.fa-signal:before{content:"\f012"}.fa-gear:before,.fa-cog:before{content:"\f013"}.fa-trash-o:before{content:"\f014"}.fa-home:before{content:"\f015"}.fa-file-o:before{content:"\f016"}.fa-clock-o:before{content:"\f017"}.fa-road:before{content:"\f018"}.fa-download:before{content:"\f019"}.fa-arrow-circle-o-down:before{content:"\f01a"}.fa-arrow-circle-o-up:before{content:"\f01b"}.fa-inbox:before{content:"\f01c"}.fa-play-circle-o:before{content:"\f01d"}.fa-rotate-right:before,.fa-repeat:before{content:"\f01e"}.fa-refresh:before{content:"\f021"}.fa-list-alt:before{content:"\f022"}.fa-lock:before{content:"\f023"}.fa-flag:before{content:"\f024"}.fa-headphones:before{content:"\f025"}.fa-volume-off:before{content:"\f026"}.fa-volume-down:before{content:"\f027"}.fa-volume-up:before{content:"\f028"}.fa-qrcode:before{content:"\f029"}.fa-barcode:before{content:"\f02a"}.fa-tag:before{content:"\f02b"}.fa-tags:before{content:"\f02c"}.fa-book:before{content:"\f02d"}.fa-bookmark:before{content:"\f02e"}.fa-print:before{content:"\f02f"}.fa-camera:before{content:"\f030"}.fa-font:before{content:"\f031"}.fa-bold:before{content:"\f032"}.fa-italic:before{content:"\f033"}.fa-text-height:before{content:"\f034"}.fa-text-width:before{content:"\f035"}.fa-align-left:before{content:"\f036"}.fa-align-center:before{content:"\f037"}.fa-align-right:before{content:"\f038"}.fa-align-justify:before{content:"\f039"}.fa-list:before{content:"\f03a"}.fa-dedent:before,.fa-outdent:before{content:"\f03b"}.fa-indent:before{content:"\f03c"}.fa-video-camera:before{content:"\f03d"}.fa-photo:before,.fa-image:before,.fa-picture-o:before{content:"\f03e"}.fa-pencil:before{content:"\f040"}.fa-map-marker:before{content:"\f041"}.fa-adjust:before{content:"\f042"}.fa-tint:before{content:"\f043"}.fa-edit:before,.fa-pencil-square-o:before{content:"\f044"}.fa-share-square-o:before{content:"\f045"}.fa-check-square-o:before{content:"\f046"}.fa-arrows:before{content:"\f047"}.fa-step-backward:before{content:"\f048"}.fa-fast-backward:before{content:"\f049"}.fa-backward:before{content:"\f04a"}.fa-play:before{content:"\f04b"}.fa-pause:before{content:"\f04c"}.fa-stop:before{content:"\f04d"}.fa-forward:before{content:"\f04e"}.fa-fast-forward:before{content:"\f050"}.fa-step-forward:before{content:"\f051"}.fa-eject:before{content:"\f052"}.fa-chevron-left:before{content:"\f053"}.fa-chevron-right:before{content:"\f054"}.fa-plus-circle:before{content:"\f055"}.fa-minus-circle:before{content:"\f056"}.fa-times-circle:before{content:"\f057"}.fa-check-circle:before{content:"\f058"}.fa-question-circle:before{content:"\f059"}.fa-info-circle:before{content:"\f05a"}.fa-cross
hairs:before{content:"\f05b"}.fa-times-circle-o:before{content:"\f05c"}.fa-check-circle-o:before{content:"\f05d"}.fa-ban:before{content:"\f05e"}.fa-arrow-left:before{content:"\f060"}.fa-arrow-right:before{content:"\f061"}.fa-arrow-up:before{content:"\f062"}.fa-arrow-down:before{content:"\f063"}.fa-mail-forward:before,.fa-share:before{content:"\f064"}.fa-expand:before{content:"\f065"}.fa-compress:before{content:"\f066"}.fa-plus:before{content:"\f067"}.fa-minus:before{content:"\f068"}.fa-asterisk:before{content:"\f069"}.fa-exclamation-circle:before{content:"\f06a"}.fa-gift:before{content:"\f06b"}.fa-leaf:before{content:"\f06c"}.fa-fire:before{content:"\f06d"}.fa-eye:before{content:"\f06e"}.fa-eye-slash:before{content:"\f070"}.fa-warning:before,.fa-exclamation-triangle:before{content:"\f071"}.fa-plane:before{content:"\f072"}.fa-calendar:before{content:"\f073"}.fa-random:before{content:"\f074"}.fa-comment:before{content:"\f075"}.fa-magnet:before{content:"\f076"}.fa-chevron-up:before{content:"\f077"}.fa-chevron-down:before{content:"\f078"}.fa-retweet:before{content:"\f079"}.fa-shopping-cart:before{content:"\f07a"}.fa-folder:before{content:"\f07b"}.fa-folder-open:before{content:"\f07c"}.fa-arrows-v:before{content:"\f07d"}.fa-arrows-h:before{content:"\f07e"}.fa-bar-chart-o:before,.fa-bar-chart:before{content:"\f080"}.fa-twitter-square:before{content:"\f081"}.fa-facebook-square:before{content:"\f082"}.fa-camera-retro:before{content:"\f083"}.fa-key:before{content:"\f084"}.fa-gears:before,.fa-cogs:before{content:"\f085"}.fa-comments:before{content:"\f086"}.fa-thumbs-o-up:before{content:"\f087"}.fa-thumbs-o-down:before{content:"\f088"}.fa-star-half:before{content:"\f089"}.fa-heart-o:before{content:"\f08a"}.fa-sign-out:before{content:"\f08b"}.fa-linkedin-square:before{content:"\f08c"}.fa-thumb-tack:before{content:"\f08d"}.fa-external-link:before{content:"\f08e"}.fa-sign-in:before{content:"\f090"}.fa-trophy:before{content:"\f091"}.fa-github-square:before{content:"\f092"}.fa-upload:before{content:"\f093"}.fa-lemon-o:before{content:"\f094"}.fa-phone:before{content:"\f095"}.fa-square-o:before{content:"\f096"}.fa-bookmark-o:before{content:"\f097"}.fa-phone-square:before{content:"\f098"}.fa-twitter:before{content:"\f099"}.fa-facebook-f:before,.fa-facebook:before{content:"\f09a"}.fa-github:before{content:"\f09b"}.fa-unlock:before{content:"\f09c"}.fa-credit-card:before{content:"\f09d"}.fa-feed:before,.fa-rss:before{content:"\f09e"}.fa-hdd-o:before{content:"\f0a0"}.fa-bullhorn:before{content:"\f0a1"}.fa-bell:before{content:"\f0f3"}.fa-certificate:before{content:"\f0a3"}.fa-hand-o-right:before{content:"\f0a4"}.fa-hand-o-left:before{content:"\f0a5"}.fa-hand-o-up:before{content:"\f0a6"}.fa-hand-o-down:before{content:"\f0a7"}.fa-arrow-circle-left:before{content:"\f0a8"}.fa-arrow-circle-right:before{content:"\f0a9"}.fa-arrow-circle-up:before{content:"\f0aa"}.fa-arrow-circle-down:before{content:"\f0ab"}.fa-globe:before{content:"\f0ac"}.fa-wrench:before{content:"\f0ad"}.fa-tasks:before{content:"\f0ae"}.fa-filter:before{content:"\f0b0"}.fa-briefcase:before{content:"\f0b1"}.fa-arrows-alt:before{content:"\f0b2"}.fa-group:before,.fa-users:before{content:"\f0c0"}.fa-chain:before,.fa-link:before{content:"\f0c1"}.fa-cloud:before{content:"\f0c2"}.fa-flask:before{content:"\f0c3"}.fa-cut:before,.fa-scissors:before{content:"\f0c4"}.fa-copy:before,.fa-files-o:before{content:"\f0c5"}.fa-paperclip:before{content:"\f0c6"}.fa-save:before,.fa-floppy-o:before{content:"\f0c7"}.fa-square:before{content:"\f0c8"}.fa-navicon:before,.fa-reor
der:before,.fa-bars:before{content:"\f0c9"}.fa-list-ul:before{content:"\f0ca"}.fa-list-ol:before{content:"\f0cb"}.fa-strikethrough:before{content:"\f0cc"}.fa-underline:before{content:"\f0cd"}.fa-table:before{content:"\f0ce"}.fa-magic:before{content:"\f0d0"}.fa-truck:before{content:"\f0d1"}.fa-pinterest:before{content:"\f0d2"}.fa-pinterest-square:before{content:"\f0d3"}.fa-google-plus-square:before{content:"\f0d4"}.fa-google-plus:before{content:"\f0d5"}.fa-money:before{content:"\f0d6"}.fa-caret-down:before{content:"\f0d7"}.fa-caret-up:before{content:"\f0d8"}.fa-caret-left:before{content:"\f0d9"}.fa-caret-right:before{content:"\f0da"}.fa-columns:before{content:"\f0db"}.fa-unsorted:before,.fa-sort:before{content:"\f0dc"}.fa-sort-down:before,.fa-sort-desc:before{content:"\f0dd"}.fa-sort-up:before,.fa-sort-asc:before{content:"\f0de"}.fa-envelope:before{content:"\f0e0"}.fa-linkedin:before{content:"\f0e1"}.fa-rotate-left:before,.fa-undo:before{content:"\f0e2"}.fa-legal:before,.fa-gavel:before{content:"\f0e3"}.fa-dashboard:before,.fa-tachometer:before{content:"\f0e4"}.fa-comment-o:before{content:"\f0e5"}.fa-comments-o:before{content:"\f0e6"}.fa-flash:before,.fa-bolt:before{content:"\f0e7"}.fa-sitemap:before{content:"\f0e8"}.fa-umbrella:before{content:"\f0e9"}.fa-paste:before,.fa-clipboard:before{content:"\f0ea"}.fa-lightbulb-o:before{content:"\f0eb"}.fa-exchange:before{content:"\f0ec"}.fa-cloud-download:before{content:"\f0ed"}.fa-cloud-upload:before{content:"\f0ee"}.fa-user-md:before{content:"\f0f0"}.fa-stethoscope:before{content:"\f0f1"}.fa-suitcase:before{content:"\f0f2"}.fa-bell-o:before{content:"\f0a2"}.fa-coffee:before{content:"\f0f4"}.fa-cutlery:before{content:"\f0f5"}.fa-file-text-o:before{content:"\f0f6"}.fa-building-o:before{content:"\f0f7"}.fa-hospital-o:before{content:"\f0f8"}.fa-ambulance:before{content:"\f0f9"}.fa-medkit:before{content:"\f0fa"}.fa-fighter-jet:before{content:"\f0fb"}.fa-beer:before{content:"\f0fc"}.fa-h-square:before{content:"\f0fd"}.fa-plus-square:before{content:"\f0fe"}.fa-angle-double-left:before{content:"\f100"}.fa-angle-double-right:before{content:"\f101"}.fa-angle-double-up:before{content:"\f102"}.fa-angle-double-down:before{content:"\f103"}.fa-angle-left:before{content:"\f104"}.fa-angle-right:before{content:"\f105"}.fa-angle-up:before{content:"\f106"}.fa-angle-down:before{content:"\f107"}.fa-desktop:before{content:"\f108"}.fa-laptop:before{content:"\f109"}.fa-tablet:before{content:"\f10a"}.fa-mobile-phone:before,.fa-mobile:before{content:"\f10b"}.fa-circle-o:before{content:"\f10c"}.fa-quote-left:before{content:"\f10d"}.fa-quote-right:before{content:"\f10e"}.fa-spinner:before{content:"\f110"}.fa-circle:before{content:"\f111"}.fa-mail-reply:before,.fa-reply:before{content:"\f112"}.fa-github-alt:before{content:"\f113"}.fa-folder-o:before{content:"\f114"}.fa-folder-open-o:before{content:"\f115"}.fa-smile-o:before{content:"\f118"}.fa-frown-o:before{content:"\f119"}.fa-meh-o:before{content:"\f11a"}.fa-gamepad:before{content:"\f11b"}.fa-keyboard-o:before{content:"\f11c"}.fa-flag-o:before{content:"\f11d"}.fa-flag-checkered:before{content:"\f11e"}.fa-terminal:before{content:"\f120"}.fa-code:before{content:"\f121"}.fa-mail-reply-all:before,.fa-reply-all:before{content:"\f122"}.fa-star-half-empty:before,.fa-star-half-full:before,.fa-star-half-o:before{content:"\f123"}.fa-location-arrow:before{content:"\f124"}.fa-crop:before{content:"\f125"}.fa-code-fork:before{content:"\f126"}.fa-unlink:before,.fa-chain-broken:before{content:"\f127"}.fa-question:before{content:"\f128"}.fa-i
nfo:before{content:"\f129"}.fa-exclamation:before{content:"\f12a"}.fa-superscript:before{content:"\f12b"}.fa-subscript:before{content:"\f12c"}.fa-eraser:before{content:"\f12d"}.fa-puzzle-piece:before{content:"\f12e"}.fa-microphone:before{content:"\f130"}.fa-microphone-slash:before{content:"\f131"}.fa-shield:before{content:"\f132"}.fa-calendar-o:before{content:"\f133"}.fa-fire-extinguisher:before{content:"\f134"}.fa-rocket:before{content:"\f135"}.fa-maxcdn:before{content:"\f136"}.fa-chevron-circle-left:before{content:"\f137"}.fa-chevron-circle-right:before{content:"\f138"}.fa-chevron-circle-up:before{content:"\f139"}.fa-chevron-circle-down:before{content:"\f13a"}.fa-html5:before{content:"\f13b"}.fa-css3:before{content:"\f13c"}.fa-anchor:before{content:"\f13d"}.fa-unlock-alt:before{content:"\f13e"}.fa-bullseye:before{content:"\f140"}.fa-ellipsis-h:before{content:"\f141"}.fa-ellipsis-v:before{content:"\f142"}.fa-rss-square:before{content:"\f143"}.fa-play-circle:before{content:"\f144"}.fa-ticket:before{content:"\f145"}.fa-minus-square:before{content:"\f146"}.fa-minus-square-o:before{content:"\f147"}.fa-level-up:before{content:"\f148"}.fa-level-down:before{content:"\f149"}.fa-check-square:before{content:"\f14a"}.fa-pencil-square:before{content:"\f14b"}.fa-external-link-square:before{content:"\f14c"}.fa-share-square:before{content:"\f14d"}.fa-compass:before{content:"\f14e"}.fa-toggle-down:before,.fa-caret-square-o-down:before{content:"\f150"}.fa-toggle-up:before,.fa-caret-square-o-up:before{content:"\f151"}.fa-toggle-right:before,.fa-caret-square-o-right:before{content:"\f152"}.fa-euro:before,.fa-eur:before{content:"\f153"}.fa-gbp:before{content:"\f154"}.fa-dollar:before,.fa-usd:before{content:"\f155"}.fa-rupee:before,.fa-inr:before{content:"\f156"}.fa-cny:before,.fa-rmb:before,.fa-yen:before,.fa-jpy:before{content:"\f157"}.fa-ruble:before,.fa-rouble:before,.fa-rub:before{content:"\f158"}.fa-won:before,.fa-krw:before{content:"\f159"}.fa-bitcoin:before,.fa-btc:before{content:"\f15a"}.fa-file:before{content:"\f15b"}.fa-file-text:before{content:"\f15c"}.fa-sort-alpha-asc:before{content:"\f15d"}.fa-sort-alpha-desc:before{content:"\f15e"}.fa-sort-amount-asc:before{content:"\f160"}.fa-sort-amount-desc:before{content:"\f161"}.fa-sort-numeric-asc:before{content:"\f162"}.fa-sort-numeric-desc:before{content:"\f163"}.fa-thumbs-up:before{content:"\f164"}.fa-thumbs-down:before{content:"\f165"}.fa-youtube-square:before{content:"\f166"}.fa-youtube:before{content:"\f167"}.fa-xing:before{content:"\f168"}.fa-xing-square:before{content:"\f169"}.fa-youtube-play:before{content:"\f16a"}.fa-dropbox:before{content:"\f16b"}.fa-stack-overflow:before{content:"\f16c"}.fa-instagram:before{content:"\f16d"}.fa-flickr:before{content:"\f16e"}.fa-adn:before{content:"\f170"}.fa-bitbucket:before{content:"\f171"}.fa-bitbucket-square:before{content:"\f172"}.fa-tumblr:before{content:"\f173"}.fa-tumblr-square:before{content:"\f174"}.fa-long-arrow-down:before{content:"\f175"}.fa-long-arrow-up:before{content:"\f176"}.fa-long-arrow-left:before{content:"\f177"}.fa-long-arrow-right:before{content:"\f178"}.fa-apple:before{content:"\f179"}.fa-windows:before{content:"\f17a"}.fa-android:before{content:"\f17b"}.fa-linux:before{content:"\f17c"}.fa-dribbble:before{content:"\f17d"}.fa-skype:before{content:"\f17e"}.fa-foursquare:before{content:"\f180"}.fa-trello:before{content:"\f181"}.fa-female:before{content:"\f182"}.fa-male:before{content:"\f183"}.fa-gittip:before,.fa-gratipay:before{content:"\f184"}.fa-sun-o:before{content:"\f185"}.fa-moon-o:bef
ore{content:"\f186"}.fa-archive:before{content:"\f187"}.fa-bug:before{content:"\f188"}.fa-vk:before{content:"\f189"}.fa-weibo:before{content:"\f18a"}.fa-renren:before{content:"\f18b"}.fa-pagelines:before{content:"\f18c"}.fa-stack-exchange:before{content:"\f18d"}.fa-arrow-circle-o-right:before{content:"\f18e"}.fa-arrow-circle-o-left:before{content:"\f190"}.fa-toggle-left:before,.fa-caret-square-o-left:before{content:"\f191"}.fa-dot-circle-o:before{content:"\f192"}.fa-wheelchair:before{content:"\f193"}.fa-vimeo-square:before{content:"\f194"}.fa-turkish-lira:before,.fa-try:before{content:"\f195"}.fa-plus-square-o:before{content:"\f196"}.fa-space-shuttle:before{content:"\f197"}.fa-slack:before{content:"\f198"}.fa-envelope-square:before{content:"\f199"}.fa-wordpress:before{content:"\f19a"}.fa-openid:before{content:"\f19b"}.fa-institution:before,.fa-bank:before,.fa-university:before{content:"\f19c"}.fa-mortar-board:before,.fa-graduation-cap:before{content:"\f19d"}.fa-yahoo:before{content:"\f19e"}.fa-google:before{content:"\f1a0"}.fa-reddit:before{content:"\f1a1"}.fa-reddit-square:before{content:"\f1a2"}.fa-stumbleupon-circle:before{content:"\f1a3"}.fa-stumbleupon:before{content:"\f1a4"}.fa-delicious:before{content:"\f1a5"}.fa-digg:before{content:"\f1a6"}.fa-pied-piper-pp:before{content:"\f1a7"}.fa-pied-piper-alt:before{content:"\f1a8"}.fa-drupal:before{content:"\f1a9"}.fa-joomla:before{content:"\f1aa"}.fa-language:before{content:"\f1ab"}.fa-fax:before{content:"\f1ac"}.fa-building:before{content:"\f1ad"}.fa-child:before{content:"\f1ae"}.fa-paw:before{content:"\f1b0"}.fa-spoon:before{content:"\f1b1"}.fa-cube:before{content:"\f1b2"}.fa-cubes:before{content:"\f1b3"}.fa-behance:before{content:"\f1b4"}.fa-behance-square:before{content:"\f1b5"}.fa-steam:before{content:"\f1b6"}.fa-steam-square:before{content:"\f1b7"}.fa-recycle:before{content:"\f1b8"}.fa-automobile:before,.fa-car:before{content:"\f1b9"}.fa-cab:before,.fa-taxi:before{content:"\f1ba"}.fa-tree:before{content:"\f1bb"}.fa-spotify:before{content:"\f1bc"}.fa-deviantart:before{content:"\f1bd"}.fa-soundcloud:before{content:"\f1be"}.fa-database:before{content:"\f1c0"}.fa-file-pdf-o:before{content:"\f1c1"}.fa-file-word-o:before{content:"\f1c2"}.fa-file-excel-o:before{content:"\f1c3"}.fa-file-powerpoint-o:before{content:"\f1c4"}.fa-file-photo-o:before,.fa-file-picture-o:before,.fa-file-image-o:before{content:"\f1c5"}.fa-file-zip-o:before,.fa-file-archive-o:before{content:"\f1c6"}.fa-file-sound-o:before,.fa-file-audio-o:before{content:"\f1c7"}.fa-file-movie-o:before,.fa-file-video-o:before{content:"\f1c8"}.fa-file-code-o:before{content:"\f1c9"}.fa-vine:before{content:"\f1ca"}.fa-codepen:before{content:"\f1cb"}.fa-jsfiddle:before{content:"\f1cc"}.fa-life-bouy:before,.fa-life-buoy:before,.fa-life-saver:before,.fa-support:before,.fa-life-ring:before{content:"\f1cd"}.fa-circle-o-notch:before{content:"\f1ce"}.fa-ra:before,.fa-resistance:before,.fa-rebel:before{content:"\f1d0"}.fa-ge:before,.fa-empire:before{content:"\f1d1"}.fa-git-square:before{content:"\f1d2"}.fa-git:before{content:"\f1d3"}.fa-y-combinator-square:before,.fa-yc-square:before,.fa-hacker-news:before{content:"\f1d4"}.fa-tencent-weibo:before{content:"\f1d5"}.fa-qq:before{content:"\f1d6"}.fa-wechat:before,.fa-weixin:before{content:"\f1d7"}.fa-send:before,.fa-paper-plane:before{content:"\f1d8"}.fa-send-o:before,.fa-paper-plane-o:before{content:"\f1d9"}.fa-history:before{content:"\f1da"}.fa-circle-thin:before{content:"\f1db"}.fa-header:before{content:"\f1dc"}.fa-paragraph:before{content:"\f1dd"}
.fa-sliders:before{content:"\f1de"}.fa-share-alt:before{content:"\f1e0"}.fa-share-alt-square:before{content:"\f1e1"}.fa-bomb:before{content:"\f1e2"}.fa-soccer-ball-o:before,.fa-futbol-o:before{content:"\f1e3"}.fa-tty:before{content:"\f1e4"}.fa-binoculars:before{content:"\f1e5"}.fa-plug:before{content:"\f1e6"}.fa-slideshare:before{content:"\f1e7"}.fa-twitch:before{content:"\f1e8"}.fa-yelp:before{content:"\f1e9"}.fa-newspaper-o:before{content:"\f1ea"}.fa-wifi:before{content:"\f1eb"}.fa-calculator:before{content:"\f1ec"}.fa-paypal:before{content:"\f1ed"}.fa-google-wallet:before{content:"\f1ee"}.fa-cc-visa:before{content:"\f1f0"}.fa-cc-mastercard:before{content:"\f1f1"}.fa-cc-discover:before{content:"\f1f2"}.fa-cc-amex:before{content:"\f1f3"}.fa-cc-paypal:before{content:"\f1f4"}.fa-cc-stripe:before{content:"\f1f5"}.fa-bell-slash:before{content:"\f1f6"}.fa-bell-slash-o:before{content:"\f1f7"}.fa-trash:before{content:"\f1f8"}.fa-copyright:before{content:"\f1f9"}.fa-at:before{content:"\f1fa"}.fa-eyedropper:before{content:"\f1fb"}.fa-paint-brush:before{content:"\f1fc"}.fa-birthday-cake:before{content:"\f1fd"}.fa-area-chart:before{content:"\f1fe"}.fa-pie-chart:before{content:"\f200"}.fa-line-chart:before{content:"\f201"}.fa-lastfm:before{content:"\f202"}.fa-lastfm-square:before{content:"\f203"}.fa-toggle-off:before{content:"\f204"}.fa-toggle-on:before{content:"\f205"}.fa-bicycle:before{content:"\f206"}.fa-bus:before{content:"\f207"}.fa-ioxhost:before{content:"\f208"}.fa-angellist:before{content:"\f209"}.fa-cc:before{content:"\f20a"}.fa-shekel:before,.fa-sheqel:before,.fa-ils:before{content:"\f20b"}.fa-meanpath:before{content:"\f20c"}.fa-buysellads:before{content:"\f20d"}.fa-connectdevelop:before{content:"\f20e"}.fa-dashcube:before{content:"\f210"}.fa-forumbee:before{content:"\f211"}.fa-leanpub:before{content:"\f212"}.fa-sellsy:before{content:"\f213"}.fa-shirtsinbulk:before{content:"\f214"}.fa-simplybuilt:before{content:"\f215"}.fa-skyatlas:before{content:"\f216"}.fa-cart-plus:before{content:"\f217"}.fa-cart-arrow-down:before{content:"\f218"}.fa-diamond:before{content:"\f219"}.fa-ship:before{content:"\f21a"}.fa-user-secret:before{content:"\f21b"}.fa-motorcycle:before{content:"\f21c"}.fa-street-view:before{content:"\f21d"}.fa-heartbeat:before{content:"\f21e"}.fa-venus:before{content:"\f221"}.fa-mars:before{content:"\f222"}.fa-mercury:before{content:"\f223"}.fa-intersex:before,.fa-transgender:before{content:"\f224"}.fa-transgender-alt:before{content:"\f225"}.fa-venus-double:before{content:"\f226"}.fa-mars-double:before{content:"\f227"}.fa-venus-mars:before{content:"\f228"}.fa-mars-stroke:before{content:"\f229"}.fa-mars-stroke-v:before{content:"\f22a"}.fa-mars-stroke-h:before{content:"\f22b"}.fa-neuter:before{content:"\f22c"}.fa-genderless:before{content:"\f22d"}.fa-facebook-official:before{content:"\f230"}.fa-pinterest-p:before{content:"\f231"}.fa-whatsapp:before{content:"\f232"}.fa-server:before{content:"\f233"}.fa-user-plus:before{content:"\f234"}.fa-user-times:before{content:"\f235"}.fa-hotel:before,.fa-bed:before{content:"\f236"}.fa-viacoin:before{content:"\f237"}.fa-train:before{content:"\f238"}.fa-subway:before{content:"\f239"}.fa-medium:before{content:"\f23a"}.fa-yc:before,.fa-y-combinator:before{content:"\f23b"}.fa-optin-monster:before{content:"\f23c"}.fa-opencart:before{content:"\f23d"}.fa-expeditedssl:before{content:"\f23e"}.fa-battery-4:before,.fa-battery:before,.fa-battery-full:before{content:"\f240"}.fa-battery-3:before,.fa-battery-three-quarters:before{content:"\f241"}.fa-battery-2:before
,.fa-battery-half:before{content:"\f242"}.fa-battery-1:before,.fa-battery-quarter:before{content:"\f243"}.fa-battery-0:before,.fa-battery-empty:before{content:"\f244"}.fa-mouse-pointer:before{content:"\f245"}.fa-i-cursor:before{content:"\f246"}.fa-object-group:before{content:"\f247"}.fa-object-ungroup:before{content:"\f248"}.fa-sticky-note:before{content:"\f249"}.fa-sticky-note-o:before{content:"\f24a"}.fa-cc-jcb:before{content:"\f24b"}.fa-cc-diners-club:before{content:"\f24c"}.fa-clone:before{content:"\f24d"}.fa-balance-scale:before{content:"\f24e"}.fa-hourglass-o:before{content:"\f250"}.fa-hourglass-1:before,.fa-hourglass-start:before{content:"\f251"}.fa-hourglass-2:before,.fa-hourglass-half:before{content:"\f252"}.fa-hourglass-3:before,.fa-hourglass-end:before{content:"\f253"}.fa-hourglass:before{content:"\f254"}.fa-hand-grab-o:before,.fa-hand-rock-o:before{content:"\f255"}.fa-hand-stop-o:before,.fa-hand-paper-o:before{content:"\f256"}.fa-hand-scissors-o:before{content:"\f257"}.fa-hand-lizard-o:before{content:"\f258"}.fa-hand-spock-o:before{content:"\f259"}.fa-hand-pointer-o:before{content:"\f25a"}.fa-hand-peace-o:before{content:"\f25b"}.fa-trademark:before{content:"\f25c"}.fa-registered:before{content:"\f25d"}.fa-creative-commons:before{content:"\f25e"}.fa-gg:before{content:"\f260"}.fa-gg-circle:before{content:"\f261"}.fa-tripadvisor:before{content:"\f262"}.fa-odnoklassniki:before{content:"\f263"}.fa-odnoklassniki-square:before{content:"\f264"}.fa-get-pocket:before{content:"\f265"}.fa-wikipedia-w:before{content:"\f266"}.fa-safari:before{content:"\f267"}.fa-chrome:before{content:"\f268"}.fa-firefox:before{content:"\f269"}.fa-opera:before{content:"\f26a"}.fa-internet-explorer:before{content:"\f26b"}.fa-tv:before,.fa-television:before{content:"\f26c"}.fa-contao:before{content:"\f26d"}.fa-500px:before{content:"\f26e"}.fa-amazon:before{content:"\f270"}.fa-calendar-plus-o:before{content:"\f271"}.fa-calendar-minus-o:before{content:"\f272"}.fa-calendar-times-o:before{content:"\f273"}.fa-calendar-check-o:before{content:"\f274"}.fa-industry:before{content:"\f275"}.fa-map-pin:before{content:"\f276"}.fa-map-signs:before{content:"\f277"}.fa-map-o:before{content:"\f278"}.fa-map:before{content:"\f279"}.fa-commenting:before{content:"\f27a"}.fa-commenting-o:before{content:"\f27b"}.fa-houzz:before{content:"\f27c"}.fa-vimeo:before{content:"\f27d"}.fa-black-tie:before{content:"\f27e"}.fa-fonticons:before{content:"\f280"}.fa-reddit-alien:before{content:"\f281"}.fa-edge:before{content:"\f282"}.fa-credit-card-alt:before{content:"\f283"}.fa-codiepie:before{content:"\f284"}.fa-modx:before{content:"\f285"}.fa-fort-awesome:before{content:"\f286"}.fa-usb:before{content:"\f287"}.fa-product-hunt:before{content:"\f288"}.fa-mixcloud:before{content:"\f289"}.fa-scribd:before{content:"\f28a"}.fa-pause-circle:before{content:"\f28b"}.fa-pause-circle-o:before{content:"\f28c"}.fa-stop-circle:before{content:"\f28d"}.fa-stop-circle-o:before{content:"\f28e"}.fa-shopping-bag:before{content:"\f290"}.fa-shopping-basket:before{content:"\f291"}.fa-hashtag:before{content:"\f292"}.fa-bluetooth:before{content:"\f293"}.fa-bluetooth-b:before{content:"\f294"}.fa-percent:before{content:"\f295"}.fa-gitlab:before{content:"\f296"}.fa-wpbeginner:before{content:"\f297"}.fa-wpforms:before{content:"\f298"}.fa-envira:before{content:"\f299"}.fa-universal-access:before{content:"\f29a"}.fa-wheelchair-alt:before{content:"\f29b"}.fa-question-circle-o:before{content:"\f29c"}.fa-blind:before{content:"\f29d"}.fa-audio-description:before{content:"\f29e"}.f
a-volume-control-phone:before{content:"\f2a0"}.fa-braille:before{content:"\f2a1"}.fa-assistive-listening-systems:before{content:"\f2a2"}.fa-asl-interpreting:before,.fa-american-sign-language-interpreting:before{content:"\f2a3"}.fa-deafness:before,.fa-hard-of-hearing:before,.fa-deaf:before{content:"\f2a4"}.fa-glide:before{content:"\f2a5"}.fa-glide-g:before{content:"\f2a6"}.fa-signing:before,.fa-sign-language:before{content:"\f2a7"}.fa-low-vision:before{content:"\f2a8"}.fa-viadeo:before{content:"\f2a9"}.fa-viadeo-square:before{content:"\f2aa"}.fa-snapchat:before{content:"\f2ab"}.fa-snapchat-ghost:before{content:"\f2ac"}.fa-snapchat-square:before{content:"\f2ad"}.fa-pied-piper:before{content:"\f2ae"}.fa-first-order:before{content:"\f2b0"}.fa-yoast:before{content:"\f2b1"}.fa-themeisle:before{content:"\f2b2"}.fa-google-plus-circle:before,.fa-google-plus-official:before{content:"\f2b3"}.fa-fa:before,.fa-font-awesome:before{content:"\f2b4"}.fa-handshake-o:before{content:"\f2b5"}.fa-envelope-open:before{content:"\f2b6"}.fa-envelope-open-o:before{content:"\f2b7"}.fa-linode:before{content:"\f2b8"}.fa-address-book:before{content:"\f2b9"}.fa-address-book-o:before{content:"\f2ba"}.fa-vcard:before,.fa-address-card:before{content:"\f2bb"}.fa-vcard-o:before,.fa-address-card-o:before{content:"\f2bc"}.fa-user-circle:before{content:"\f2bd"}.fa-user-circle-o:before{content:"\f2be"}.fa-user-o:before{content:"\f2c0"}.fa-id-badge:before{content:"\f2c1"}.fa-drivers-license:before,.fa-id-card:before{content:"\f2c2"}.fa-drivers-license-o:before,.fa-id-card-o:before{content:"\f2c3"}.fa-quora:before{content:"\f2c4"}.fa-free-code-camp:before{content:"\f2c5"}.fa-telegram:before{content:"\f2c6"}.fa-thermometer-4:before,.fa-thermometer:before,.fa-thermometer-full:before{content:"\f2c7"}.fa-thermometer-3:before,.fa-thermometer-three-quarters:before{content:"\f2c8"}.fa-thermometer-2:before,.fa-thermometer-half:before{content:"\f2c9"}.fa-thermometer-1:before,.fa-thermometer-quarter:before{content:"\f2ca"}.fa-thermometer-0:before,.fa-thermometer-empty:before{content:"\f2cb"}.fa-shower:before{content:"\f2cc"}.fa-bathtub:before,.fa-s15:before,.fa-bath:before{content:"\f2cd"}.fa-podcast:before{content:"\f2ce"}.fa-window-maximize:before{content:"\f2d0"}.fa-window-minimize:before{content:"\f2d1"}.fa-window-restore:before{content:"\f2d2"}.fa-times-rectangle:before,.fa-window-close:before{content:"\f2d3"}.fa-times-rectangle-o:before,.fa-window-close-o:before{content:"\f2d4"}.fa-bandcamp:before{content:"\f2d5"}.fa-grav:before{content:"\f2d6"}.fa-etsy:before{content:"\f2d7"}.fa-imdb:before{content:"\f2d8"}.fa-ravelry:before{content:"\f2d9"}.fa-eercast:before{content:"\f2da"}.fa-microchip:before{content:"\f2db"}.fa-snowflake-o:before{content:"\f2dc"}.fa-superpowers:before{content:"\f2dd"}.fa-wpexplorer:before{content:"\f2de"}.fa-meetup:before{content:"\f2e0"}.sr-only{position:absolute;width:1px;height:1px;padding:0;margin:-1px;overflow:hidden;clip:rect(0, 0, 0, 0);border:0}.sr-only-focusable:active,.sr-only-focusable:focus{position:static;width:auto;height:auto;margin:0;overflow:visible;clip:auto} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/css/style.css b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/css/style.css deleted file mode 100644 index a3040718b8f1caa8fed98832b8c82778b0003a9f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/css/style.css +++ /dev/null @@ -1,453 
+0,0 @@ -/* -* @Author: baipengxia -* @Date: 2021-03-12 11:44:28 -* @Last Modified by: baipengxia -* @Last Modified time: 2021-03-12 15:14:24 -*/ - -/** COMMON RESET **/ -* { - -webkit-tap-highlight-color: rgba(0, 0, 0, 0); -} - -body, -h1, -h2, -h3, -h4, -h5, -h6, -hr, -p, -dl, -dt, -dd, -ul, -ol, -li, -fieldset, -lengend, -button, -input, -textarea, -th, -td { - margin: 0; - padding: 0; - color: #000; -} - -body { - font-size: 14px; -} -html, body { - min-width: 1200px; -} - -button, -input, -select, -textarea { - font-size: 14px; -} - -h1 { - font-size: 18px; -} - -h2 { - font-size: 14px; -} - -h3 { - font-size: 14px; -} - -ul, -ol, -li { - list-style: none; -} - -a { - text-decoration: none; -} - -a:hover { - text-decoration: none; -} - -fieldset, -img { - border: none; -} - -table { - border-collapse: collapse; - border-spacing: 0; -} - -i { - font-style: normal; -} - -label { - position: inherit; -} - -.clearfix:after { - content: "."; - display: block; - height: 0; - clear: both; - visibility: hidden; -} - -.clearfix { - zoom: 1; - display: block; -} - -html, -body { - font-family: Tahoma, Arial, 'microsoft yahei', 'Roboto', 'Droid Sans', 'Helvetica Neue', 'Droid Sans Fallback', 'Heiti SC', 'Hiragino Sans GB', 'Simsun', 'sans-self'; -} - - - -.audio-banner { - width: 100%; - overflow: auto; - padding: 0; - background: url('../image/voice-dictation.svg'); - background-size: cover; -} -.weaper { - width: 1200px; - height: 155px; - margin: 72px auto; -} -.text-content { - width: 670px; - height: 100%; - float: left; -} -.text-content .title { - font-size: 34px; - font-family: 'PingFangSC-Medium'; - font-weight: 500; - color: rgba(255, 255, 255, 1); - line-height: 48px; -} -.text-content .con { - font-size: 16px; - font-family: PingFangSC-Light; - font-weight: 300; - color: rgba(255, 255, 255, 1); - line-height: 30px; -} -.img-con { - width: 416px; - height: 100%; - float: right; -} -.img-con img { - width: 100%; - height: 100%; -} -.con-container { - margin-top: 34px; -} - -.audio-advantage { - background: #f8f9fa; -} -.asr-advantage { - width: 1200px; - margin: 0 auto; -} -.asr-advantage h2 { - text-align: center; - font-size: 22px; - padding: 30px 0 0 0; -} -.asr-advantage > ul > li { - box-sizing: border-box; - padding: 0 16px; - width: 33%; - text-align: center; - margin-bottom: 35px; -} -.asr-advantage > ul > li .icons{ - margin-top: 10px; - margin-bottom: 20px; - width: 42px; - height: 42px; -} -.service-item-content { - margin-top: 35px; - display: flex; - justify-content: center; - flex-wrap: wrap; -} -.service-item-content img { - width: 160px; - vertical-align: bottom; -} -.service-item-content > li { - box-sizing: border-box; - padding: 0 16px; - width: 33%; - text-align: center; - margin-bottom: 35px; -} -.service-item-content > li .service-item-content-title { - line-height: 1.5; - font-weight: 700; - margin-top: 10px; -} -.service-item-content > li .service-item-content-desc { - margin-top: 5px; - line-height: 1.8; - color: #657384; -} - - -.audio-scene-con { - width: 100%; - padding-bottom: 84px; - background: #fff; -} -.audio-scene { - overflow: auto; - width: 1200px; - background: #fff; - text-align: center; - padding: 0; - margin: 0 auto; -} -.audio-scene h2 { - padding: 30px 0 0 0; - font-size: 22px; - text-align: center; -} - -.audio-experience { - width: 100%; - height: 538px; - background: #fff; - padding: 0; - margin: 0; - overflow: auto; -} -.asr-box { - width: 1200px; - height: 394px; - margin: 64px auto; -} -.asr-box h2 { - font-size: 22px; - text-align: 
center; - margin-bottom: 64px; -} -.voice-container { - position: relative; - width: 1200px; - height: 308px; - background: rgba(255, 255, 255, 1); - border-radius: 8px; - border: 1px solid rgba(225, 225, 225, 1); -} -.voice-container .voice { - height: 236px; - width: 100%; - border-radius: 8px; -} -.voice-container .voice textarea { - height: 100%; - width: 100%; - border: none; - outline: none; - border-radius: 8px; - padding: 25px; - font-size: 14px; - box-sizing: border-box; - resize: none; -} -.voice-input { - width: 100%; - height: 72px; - box-sizing: border-box; - padding-left: 35px; - background: rgba(242, 244, 245, 1); - border-radius: 8px; - line-height: 72px; -} -.voice-input .el-select { - width: 492px; -} -.start-voice { - display: inline-block; - margin-left: 10px; -} -.start-voice .time { - margin-right: 25px; -} -.asr-advantage > ul > li { - margin-bottom: 77px; -} -#msg { - width: 100%; - line-height: 40px; - font-size: 14px; - margin-left: 330px; -} -#captcha { - margin-left: 350px !important; - display: inline-block; - position: relative; -} -.black { - position: fixed; - width: 100%; - height: 100%; - z-index: 5; - background: rgba(0, 0, 0, 0.5); - top: 0; - left: 0; -} -.container { - position: fixed; - z-index: 6; - top: 25%; - left: 10%; -} -.audio-scene-con { - width: 100%; - padding-bottom: 84px; - background: #fff; -} -#sound { - color: #fff; - cursor: pointer; - background: #147ede; - padding: 10px; - margin-top: 30px; - margin-left: 135px; - width: 176px; - height: 30px !important; - text-align: center; - line-height: 30px !important; - border-radius: 10px; -} -.con-ten { - position: absolute; - width: 100%; - height: 100%; - z-index: 5; - background: #fff; - opacity: 0.5; - top: 0; - left: 0; -} -.websocket-url { - width: 320px; - height: 20px; - border: 1px solid #dcdfe6; - line-height: 20px; - padding: 10px; - border-radius: 4px; -} -.voice-btn { - color: #fff; - background-color: #409eff; - font-weight: 500; - padding: 12px 20px; - font-size: 14px; - border-radius: 4px; - border: 0; - cursor: pointer; -} -.voice-btn.end { - display: none; -} -.result-text { - background: #fff; - padding: 20px; -} -.voice-footer { - border-top: 1px solid #dddede; - background: #f7f9fa; - text-align: center; - margin-bottom: 8px; - color: #333; - font-size: 12px; - padding: 20px 0; -} - -/** line animate **/ -.time-box { - display: none; - margin-left: 10px; - width: 300px; -} -.total-time { - font-size: 14px; - color: #545454; -} -.voice-btn.end.show, -.time-box.show { - display: inline; -} -.start-taste-line { - margin-right: 20px; - display: inline-block; -} -.start-taste-line hr { - background-color: #187cff; - width: 3px; - height: 8px; - margin: 0 3px; - display: inline-block; - border: none; -} -.hr { - animation: note 0.2s ease-in-out; - animation-iteration-count: infinite; - animation-direction: alternate; -} -.hr-one { - animation-delay: -0.9s; -} -.hr-two { - animation-delay: -0.8s; -} -.hr-three { - animation-delay: -0.7s; -} -.hr-four { - animation-delay: -0.6s; -} -.hr-five { - animation-delay: -0.5s; -} -.hr-six { - animation-delay: -0.4s; -} -.hr-seven { - animation-delay: -0.3s; -} -.hr-eight { - animation-delay: -0.2s; -} -.hr-nine { - animation-delay: -0.1s; -} -@keyframes note { - from { - transform: scaleY(1); - } - to { - transform: scaleY(4); - } -} \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/favicon.ico 
deleted file mode 100644
index da219e6d6e693bda3193ad25d24e0f54fbe7b989..0000000000000000000000000000000000000000
Binary files a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/favicon.ico and /dev/null differ
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/fonts/FontAwesome.otf b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/fonts/FontAwesome.otf
deleted file mode 100644
index 401ec0f36e4f73b8efa40bd6f604fe80d286db70..0000000000000000000000000000000000000000
Binary files a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/fonts/FontAwesome.otf and /dev/null differ
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/fonts/fontawesome-webfont.eot b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/fonts/fontawesome-webfont.eot
deleted file mode 100644
index e9f60ca953f93e35eab4108bd414bc02ddcf3928..0000000000000000000000000000000000000000
Binary files a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/fonts/fontawesome-webfont.eot and /dev/null differ
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/fonts/fontawesome-webfont.svg b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/fonts/fontawesome-webfont.svg
deleted file mode 100644
index 6cd0326be380a32c3193c42e1879b7a6c6cf527e..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/fonts/fontawesome-webfont.svg
+++ /dev/null
@@ -1,1951 +0,0 @@
-Created by FontForge 20120731 at Mon Oct 24 17:37:40 2016
- By ,,,
-Copyright Dave Gandy 2016. All rights reserved.
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/fonts/fontawesome-webfont.ttf b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/fonts/fontawesome-webfont.ttf
deleted file mode 100644
index 35acda2fa1196aad98c2adf4378a7611dd713aa3..0000000000000000000000000000000000000000
Binary files a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/fonts/fontawesome-webfont.ttf and /dev/null differ
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/fonts/fontawesome-webfont.woff b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/fonts/fontawesome-webfont.woff
deleted file mode 100644
index 400014a4b06eee3d0c0d54402a47ab2601b2862b..0000000000000000000000000000000000000000
Binary files a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/fonts/fontawesome-webfont.woff and /dev/null differ
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/fonts/fontawesome-webfont.woff2 b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/fonts/fontawesome-webfont.woff2
deleted file mode 100644
index 4d13fc60404b91e398a37200c4a77b645cfd9586..0000000000000000000000000000000000000000
Binary files a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/fonts/fontawesome-webfont.woff2 and /dev/null differ
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/image/qrcode-enterprise.png b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/image/qrcode-enterprise.png
deleted file mode 100644
index 1f7157a61874b2bcd3517ee2cf6479989e2b0e6f..0000000000000000000000000000000000000000
Binary files a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/image/qrcode-enterprise.png and /dev/null differ
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/image/qrcode-official-account.png b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/image/qrcode-official-account.png
deleted file mode 100644
index 9afe08d93e907606271014602420096a9c523af8..0000000000000000000000000000000000000000
Binary files a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/image/qrcode-official-account.png and /dev/null differ
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/image/voice-dictation.svg b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/image/voice-dictation.svg
deleted file mode 100644
index d35971499ddfed4ab0016419fb87e8d6a0d695cc..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/image/voice-dictation.svg
+++ /dev/null
@@ -1,94 +0,0 @@
-    背景
-    Created with Sketch.
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/image/voice-pic.png b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/image/voice-pic.png
deleted file mode 100644
index 7b8f043a11d2b114fcfd614413844fac41d7e240..0000000000000000000000000000000000000000
Binary files a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/image/voice-pic.png and /dev/null differ
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/js/SoundRecognizer.js b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/js/SoundRecognizer.js
deleted file mode 100644
index 5ef3d2e89dc27945d9e356b3c9eb5519f9cea69a..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/js/SoundRecognizer.js
+++ /dev/null
@@ -1,133 +0,0 @@
-SoundRecognizer = {
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/js/jquery-3.2.1.min.js b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/js/jquery-3.2.1.min.js
deleted file mode 100644
index 644d35e274fd64ddaf6d12af813e820c424176a9..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/web/static/js/jquery-3.2.1.min.js
+++ /dev/null
@@ -1,4 +0,0 @@
-/*! jQuery v3.2.1 | (c) JS Foundation and other contributors | jquery.org/license */
jQuery v3.2.1 | (c) JS Foundation and other contributors | jquery.org/license */ -!function(a,b){"use strict";"object"==typeof module&&"object"==typeof module.exports?module.exports=a.document?b(a,!0):function(a){if(!a.document)throw new Error("jQuery requires a window with a document");return b(a)}:b(a)}("undefined"!=typeof window?window:this,function(a,b){"use strict";var c=[],d=a.document,e=Object.getPrototypeOf,f=c.slice,g=c.concat,h=c.push,i=c.indexOf,j={},k=j.toString,l=j.hasOwnProperty,m=l.toString,n=m.call(Object),o={};function p(a,b){b=b||d;var c=b.createElement("script");c.text=a,b.head.appendChild(c).parentNode.removeChild(c)}var q="3.2.1",r=function(a,b){return new r.fn.init(a,b)},s=/^[\s\uFEFF\xA0]+|[\s\uFEFF\xA0]+$/g,t=/^-ms-/,u=/-([a-z])/g,v=function(a,b){return b.toUpperCase()};r.fn=r.prototype={jquery:q,constructor:r,length:0,toArray:function(){return f.call(this)},get:function(a){return null==a?f.call(this):a<0?this[a+this.length]:this[a]},pushStack:function(a){var b=r.merge(this.constructor(),a);return b.prevObject=this,b},each:function(a){return r.each(this,a)},map:function(a){return this.pushStack(r.map(this,function(b,c){return a.call(b,c,b)}))},slice:function(){return this.pushStack(f.apply(this,arguments))},first:function(){return this.eq(0)},last:function(){return this.eq(-1)},eq:function(a){var b=this.length,c=+a+(a<0?b:0);return this.pushStack(c>=0&&c0&&b-1 in a)}var x=function(a){var b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u="sizzle"+1*new Date,v=a.document,w=0,x=0,y=ha(),z=ha(),A=ha(),B=function(a,b){return a===b&&(l=!0),0},C={}.hasOwnProperty,D=[],E=D.pop,F=D.push,G=D.push,H=D.slice,I=function(a,b){for(var c=0,d=a.length;c+~]|"+K+")"+K+"*"),S=new RegExp("="+K+"*([^\\]'\"]*?)"+K+"*\\]","g"),T=new RegExp(N),U=new RegExp("^"+L+"$"),V={ID:new RegExp("^#("+L+")"),CLASS:new RegExp("^\\.("+L+")"),TAG:new RegExp("^("+L+"|[*])"),ATTR:new RegExp("^"+M),PSEUDO:new RegExp("^"+N),CHILD:new RegExp("^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\("+K+"*(even|odd|(([+-]|)(\\d*)n|)"+K+"*(?:([+-]|)"+K+"*(\\d+)|))"+K+"*\\)|)","i"),bool:new RegExp("^(?:"+J+")$","i"),needsContext:new RegExp("^"+K+"*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\("+K+"*((?:-\\d)?\\d*)"+K+"*\\)|)(?=[^-]|$)","i")},W=/^(?:input|select|textarea|button)$/i,X=/^h\d$/i,Y=/^[^{]+\{\s*\[native \w/,Z=/^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/,$=/[+~]/,_=new RegExp("\\\\([\\da-f]{1,6}"+K+"?|("+K+")|.)","ig"),aa=function(a,b,c){var d="0x"+b-65536;return d!==d||c?b:d<0?String.fromCharCode(d+65536):String.fromCharCode(d>>10|55296,1023&d|56320)},ba=/([\0-\x1f\x7f]|^-?\d)|^-$|[^\0-\x1f\x7f-\uFFFF\w-]/g,ca=function(a,b){return b?"\0"===a?"\ufffd":a.slice(0,-1)+"\\"+a.charCodeAt(a.length-1).toString(16)+" ":"\\"+a},da=function(){m()},ea=ta(function(a){return a.disabled===!0&&("form"in a||"label"in a)},{dir:"parentNode",next:"legend"});try{G.apply(D=H.call(v.childNodes),v.childNodes),D[v.childNodes.length].nodeType}catch(fa){G={apply:D.length?function(a,b){F.apply(a,H.call(b))}:function(a,b){var c=a.length,d=0;while(a[c++]=b[d++]);a.length=c-1}}}function ga(a,b,d,e){var f,h,j,k,l,o,r,s=b&&b.ownerDocument,w=b?b.nodeType:9;if(d=d||[],"string"!=typeof a||!a||1!==w&&9!==w&&11!==w)return d;if(!e&&((b?b.ownerDocument||b:v)!==n&&m(b),b=b||n,p)){if(11!==w&&(l=Z.exec(a)))if(f=l[1]){if(9===w){if(!(j=b.getElementById(f)))return d;if(j.id===f)return d.push(j),d}else if(s&&(j=s.getElementById(f))&&t(b,j)&&j.id===f)return d.push(j),d}else{if(l[2])return 
G.apply(d,b.getElementsByTagName(a)),d;if((f=l[3])&&c.getElementsByClassName&&b.getElementsByClassName)return G.apply(d,b.getElementsByClassName(f)),d}if(c.qsa&&!A[a+" "]&&(!q||!q.test(a))){if(1!==w)s=b,r=a;else if("object"!==b.nodeName.toLowerCase()){(k=b.getAttribute("id"))?k=k.replace(ba,ca):b.setAttribute("id",k=u),o=g(a),h=o.length;while(h--)o[h]="#"+k+" "+sa(o[h]);r=o.join(","),s=$.test(a)&&qa(b.parentNode)||b}if(r)try{return G.apply(d,s.querySelectorAll(r)),d}catch(x){}finally{k===u&&b.removeAttribute("id")}}}return i(a.replace(P,"$1"),b,d,e)}function ha(){var a=[];function b(c,e){return a.push(c+" ")>d.cacheLength&&delete b[a.shift()],b[c+" "]=e}return b}function ia(a){return a[u]=!0,a}function ja(a){var b=n.createElement("fieldset");try{return!!a(b)}catch(c){return!1}finally{b.parentNode&&b.parentNode.removeChild(b),b=null}}function ka(a,b){var c=a.split("|"),e=c.length;while(e--)d.attrHandle[c[e]]=b}function la(a,b){var c=b&&a,d=c&&1===a.nodeType&&1===b.nodeType&&a.sourceIndex-b.sourceIndex;if(d)return d;if(c)while(c=c.nextSibling)if(c===b)return-1;return a?1:-1}function ma(a){return function(b){var c=b.nodeName.toLowerCase();return"input"===c&&b.type===a}}function na(a){return function(b){var c=b.nodeName.toLowerCase();return("input"===c||"button"===c)&&b.type===a}}function oa(a){return function(b){return"form"in b?b.parentNode&&b.disabled===!1?"label"in b?"label"in b.parentNode?b.parentNode.disabled===a:b.disabled===a:b.isDisabled===a||b.isDisabled!==!a&&ea(b)===a:b.disabled===a:"label"in b&&b.disabled===a}}function pa(a){return ia(function(b){return b=+b,ia(function(c,d){var e,f=a([],c.length,b),g=f.length;while(g--)c[e=f[g]]&&(c[e]=!(d[e]=c[e]))})})}function qa(a){return a&&"undefined"!=typeof a.getElementsByTagName&&a}c=ga.support={},f=ga.isXML=function(a){var b=a&&(a.ownerDocument||a).documentElement;return!!b&&"HTML"!==b.nodeName},m=ga.setDocument=function(a){var b,e,g=a?a.ownerDocument||a:v;return g!==n&&9===g.nodeType&&g.documentElement?(n=g,o=n.documentElement,p=!f(n),v!==n&&(e=n.defaultView)&&e.top!==e&&(e.addEventListener?e.addEventListener("unload",da,!1):e.attachEvent&&e.attachEvent("onunload",da)),c.attributes=ja(function(a){return a.className="i",!a.getAttribute("className")}),c.getElementsByTagName=ja(function(a){return a.appendChild(n.createComment("")),!a.getElementsByTagName("*").length}),c.getElementsByClassName=Y.test(n.getElementsByClassName),c.getById=ja(function(a){return o.appendChild(a).id=u,!n.getElementsByName||!n.getElementsByName(u).length}),c.getById?(d.filter.ID=function(a){var b=a.replace(_,aa);return function(a){return a.getAttribute("id")===b}},d.find.ID=function(a,b){if("undefined"!=typeof b.getElementById&&p){var c=b.getElementById(a);return c?[c]:[]}}):(d.filter.ID=function(a){var b=a.replace(_,aa);return function(a){var c="undefined"!=typeof a.getAttributeNode&&a.getAttributeNode("id");return c&&c.value===b}},d.find.ID=function(a,b){if("undefined"!=typeof b.getElementById&&p){var c,d,e,f=b.getElementById(a);if(f){if(c=f.getAttributeNode("id"),c&&c.value===a)return[f];e=b.getElementsByName(a),d=0;while(f=e[d++])if(c=f.getAttributeNode("id"),c&&c.value===a)return[f]}return[]}}),d.find.TAG=c.getElementsByTagName?function(a,b){return"undefined"!=typeof b.getElementsByTagName?b.getElementsByTagName(a):c.qsa?b.querySelectorAll(a):void 0}:function(a,b){var c,d=[],e=0,f=b.getElementsByTagName(a);if("*"===a){while(c=f[e++])1===c.nodeType&&d.push(c);return d}return f},d.find.CLASS=c.getElementsByClassName&&function(a,b){if("undefined"!=typeof 
b.getElementsByClassName&&p)return b.getElementsByClassName(a)},r=[],q=[],(c.qsa=Y.test(n.querySelectorAll))&&(ja(function(a){o.appendChild(a).innerHTML="",a.querySelectorAll("[msallowcapture^='']").length&&q.push("[*^$]="+K+"*(?:''|\"\")"),a.querySelectorAll("[selected]").length||q.push("\\["+K+"*(?:value|"+J+")"),a.querySelectorAll("[id~="+u+"-]").length||q.push("~="),a.querySelectorAll(":checked").length||q.push(":checked"),a.querySelectorAll("a#"+u+"+*").length||q.push(".#.+[+~]")}),ja(function(a){a.innerHTML="";var b=n.createElement("input");b.setAttribute("type","hidden"),a.appendChild(b).setAttribute("name","D"),a.querySelectorAll("[name=d]").length&&q.push("name"+K+"*[*^$|!~]?="),2!==a.querySelectorAll(":enabled").length&&q.push(":enabled",":disabled"),o.appendChild(a).disabled=!0,2!==a.querySelectorAll(":disabled").length&&q.push(":enabled",":disabled"),a.querySelectorAll("*,:x"),q.push(",.*:")})),(c.matchesSelector=Y.test(s=o.matches||o.webkitMatchesSelector||o.mozMatchesSelector||o.oMatchesSelector||o.msMatchesSelector))&&ja(function(a){c.disconnectedMatch=s.call(a,"*"),s.call(a,"[s!='']:x"),r.push("!=",N)}),q=q.length&&new RegExp(q.join("|")),r=r.length&&new RegExp(r.join("|")),b=Y.test(o.compareDocumentPosition),t=b||Y.test(o.contains)?function(a,b){var c=9===a.nodeType?a.documentElement:a,d=b&&b.parentNode;return a===d||!(!d||1!==d.nodeType||!(c.contains?c.contains(d):a.compareDocumentPosition&&16&a.compareDocumentPosition(d)))}:function(a,b){if(b)while(b=b.parentNode)if(b===a)return!0;return!1},B=b?function(a,b){if(a===b)return l=!0,0;var d=!a.compareDocumentPosition-!b.compareDocumentPosition;return d?d:(d=(a.ownerDocument||a)===(b.ownerDocument||b)?a.compareDocumentPosition(b):1,1&d||!c.sortDetached&&b.compareDocumentPosition(a)===d?a===n||a.ownerDocument===v&&t(v,a)?-1:b===n||b.ownerDocument===v&&t(v,b)?1:k?I(k,a)-I(k,b):0:4&d?-1:1)}:function(a,b){if(a===b)return l=!0,0;var c,d=0,e=a.parentNode,f=b.parentNode,g=[a],h=[b];if(!e||!f)return a===n?-1:b===n?1:e?-1:f?1:k?I(k,a)-I(k,b):0;if(e===f)return la(a,b);c=a;while(c=c.parentNode)g.unshift(c);c=b;while(c=c.parentNode)h.unshift(c);while(g[d]===h[d])d++;return d?la(g[d],h[d]):g[d]===v?-1:h[d]===v?1:0},n):n},ga.matches=function(a,b){return ga(a,null,null,b)},ga.matchesSelector=function(a,b){if((a.ownerDocument||a)!==n&&m(a),b=b.replace(S,"='$1']"),c.matchesSelector&&p&&!A[b+" "]&&(!r||!r.test(b))&&(!q||!q.test(b)))try{var d=s.call(a,b);if(d||c.disconnectedMatch||a.document&&11!==a.document.nodeType)return d}catch(e){}return ga(b,n,null,[a]).length>0},ga.contains=function(a,b){return(a.ownerDocument||a)!==n&&m(a),t(a,b)},ga.attr=function(a,b){(a.ownerDocument||a)!==n&&m(a);var e=d.attrHandle[b.toLowerCase()],f=e&&C.call(d.attrHandle,b.toLowerCase())?e(a,b,!p):void 0;return void 0!==f?f:c.attributes||!p?a.getAttribute(b):(f=a.getAttributeNode(b))&&f.specified?f.value:null},ga.escape=function(a){return(a+"").replace(ba,ca)},ga.error=function(a){throw new Error("Syntax error, unrecognized expression: "+a)},ga.uniqueSort=function(a){var b,d=[],e=0,f=0;if(l=!c.detectDuplicates,k=!c.sortStable&&a.slice(0),a.sort(B),l){while(b=a[f++])b===a[f]&&(e=d.push(f));while(e--)a.splice(d[e],1)}return k=null,a},e=ga.getText=function(a){var b,c="",d=0,f=a.nodeType;if(f){if(1===f||9===f||11===f){if("string"==typeof a.textContent)return a.textContent;for(a=a.firstChild;a;a=a.nextSibling)c+=e(a)}else if(3===f||4===f)return a.nodeValue}else while(b=a[d++])c+=e(b);return 
c},d=ga.selectors={cacheLength:50,createPseudo:ia,match:V,attrHandle:{},find:{},relative:{">":{dir:"parentNode",first:!0}," ":{dir:"parentNode"},"+":{dir:"previousSibling",first:!0},"~":{dir:"previousSibling"}},preFilter:{ATTR:function(a){return a[1]=a[1].replace(_,aa),a[3]=(a[3]||a[4]||a[5]||"").replace(_,aa),"~="===a[2]&&(a[3]=" "+a[3]+" "),a.slice(0,4)},CHILD:function(a){return a[1]=a[1].toLowerCase(),"nth"===a[1].slice(0,3)?(a[3]||ga.error(a[0]),a[4]=+(a[4]?a[5]+(a[6]||1):2*("even"===a[3]||"odd"===a[3])),a[5]=+(a[7]+a[8]||"odd"===a[3])):a[3]&&ga.error(a[0]),a},PSEUDO:function(a){var b,c=!a[6]&&a[2];return V.CHILD.test(a[0])?null:(a[3]?a[2]=a[4]||a[5]||"":c&&T.test(c)&&(b=g(c,!0))&&(b=c.indexOf(")",c.length-b)-c.length)&&(a[0]=a[0].slice(0,b),a[2]=c.slice(0,b)),a.slice(0,3))}},filter:{TAG:function(a){var b=a.replace(_,aa).toLowerCase();return"*"===a?function(){return!0}:function(a){return a.nodeName&&a.nodeName.toLowerCase()===b}},CLASS:function(a){var b=y[a+" "];return b||(b=new RegExp("(^|"+K+")"+a+"("+K+"|$)"))&&y(a,function(a){return b.test("string"==typeof a.className&&a.className||"undefined"!=typeof a.getAttribute&&a.getAttribute("class")||"")})},ATTR:function(a,b,c){return function(d){var e=ga.attr(d,a);return null==e?"!="===b:!b||(e+="","="===b?e===c:"!="===b?e!==c:"^="===b?c&&0===e.indexOf(c):"*="===b?c&&e.indexOf(c)>-1:"$="===b?c&&e.slice(-c.length)===c:"~="===b?(" "+e.replace(O," ")+" ").indexOf(c)>-1:"|="===b&&(e===c||e.slice(0,c.length+1)===c+"-"))}},CHILD:function(a,b,c,d,e){var f="nth"!==a.slice(0,3),g="last"!==a.slice(-4),h="of-type"===b;return 1===d&&0===e?function(a){return!!a.parentNode}:function(b,c,i){var j,k,l,m,n,o,p=f!==g?"nextSibling":"previousSibling",q=b.parentNode,r=h&&b.nodeName.toLowerCase(),s=!i&&!h,t=!1;if(q){if(f){while(p){m=b;while(m=m[p])if(h?m.nodeName.toLowerCase()===r:1===m.nodeType)return!1;o=p="only"===a&&!o&&"nextSibling"}return!0}if(o=[g?q.firstChild:q.lastChild],g&&s){m=q,l=m[u]||(m[u]={}),k=l[m.uniqueID]||(l[m.uniqueID]={}),j=k[a]||[],n=j[0]===w&&j[1],t=n&&j[2],m=n&&q.childNodes[n];while(m=++n&&m&&m[p]||(t=n=0)||o.pop())if(1===m.nodeType&&++t&&m===b){k[a]=[w,n,t];break}}else if(s&&(m=b,l=m[u]||(m[u]={}),k=l[m.uniqueID]||(l[m.uniqueID]={}),j=k[a]||[],n=j[0]===w&&j[1],t=n),t===!1)while(m=++n&&m&&m[p]||(t=n=0)||o.pop())if((h?m.nodeName.toLowerCase()===r:1===m.nodeType)&&++t&&(s&&(l=m[u]||(m[u]={}),k=l[m.uniqueID]||(l[m.uniqueID]={}),k[a]=[w,t]),m===b))break;return t-=e,t===d||t%d===0&&t/d>=0}}},PSEUDO:function(a,b){var c,e=d.pseudos[a]||d.setFilters[a.toLowerCase()]||ga.error("unsupported pseudo: "+a);return e[u]?e(b):e.length>1?(c=[a,a,"",b],d.setFilters.hasOwnProperty(a.toLowerCase())?ia(function(a,c){var d,f=e(a,b),g=f.length;while(g--)d=I(a,f[g]),a[d]=!(c[d]=f[g])}):function(a){return e(a,0,c)}):e}},pseudos:{not:ia(function(a){var b=[],c=[],d=h(a.replace(P,"$1"));return d[u]?ia(function(a,b,c,e){var f,g=d(a,null,e,[]),h=a.length;while(h--)(f=g[h])&&(a[h]=!(b[h]=f))}):function(a,e,f){return b[0]=a,d(b,null,f,c),b[0]=null,!c.pop()}}),has:ia(function(a){return function(b){return ga(a,b).length>0}}),contains:ia(function(a){return a=a.replace(_,aa),function(b){return(b.textContent||b.innerText||e(b)).indexOf(a)>-1}}),lang:ia(function(a){return U.test(a||"")||ga.error("unsupported lang: "+a),a=a.replace(_,aa).toLowerCase(),function(b){var c;do if(c=p?b.lang:b.getAttribute("xml:lang")||b.getAttribute("lang"))return c=c.toLowerCase(),c===a||0===c.indexOf(a+"-");while((b=b.parentNode)&&1===b.nodeType);return!1}}),target:function(b){var 
c=a.location&&a.location.hash;return c&&c.slice(1)===b.id},root:function(a){return a===o},focus:function(a){return a===n.activeElement&&(!n.hasFocus||n.hasFocus())&&!!(a.type||a.href||~a.tabIndex)},enabled:oa(!1),disabled:oa(!0),checked:function(a){var b=a.nodeName.toLowerCase();return"input"===b&&!!a.checked||"option"===b&&!!a.selected},selected:function(a){return a.parentNode&&a.parentNode.selectedIndex,a.selected===!0},empty:function(a){for(a=a.firstChild;a;a=a.nextSibling)if(a.nodeType<6)return!1;return!0},parent:function(a){return!d.pseudos.empty(a)},header:function(a){return X.test(a.nodeName)},input:function(a){return W.test(a.nodeName)},button:function(a){var b=a.nodeName.toLowerCase();return"input"===b&&"button"===a.type||"button"===b},text:function(a){var b;return"input"===a.nodeName.toLowerCase()&&"text"===a.type&&(null==(b=a.getAttribute("type"))||"text"===b.toLowerCase())},first:pa(function(){return[0]}),last:pa(function(a,b){return[b-1]}),eq:pa(function(a,b,c){return[c<0?c+b:c]}),even:pa(function(a,b){for(var c=0;c=0;)a.push(d);return a}),gt:pa(function(a,b,c){for(var d=c<0?c+b:c;++d1?function(b,c,d){var e=a.length;while(e--)if(!a[e](b,c,d))return!1;return!0}:a[0]}function va(a,b,c){for(var d=0,e=b.length;d-1&&(f[j]=!(g[j]=l))}}else r=wa(r===g?r.splice(o,r.length):r),e?e(null,g,r,i):G.apply(g,r)})}function ya(a){for(var b,c,e,f=a.length,g=d.relative[a[0].type],h=g||d.relative[" "],i=g?1:0,k=ta(function(a){return a===b},h,!0),l=ta(function(a){return I(b,a)>-1},h,!0),m=[function(a,c,d){var e=!g&&(d||c!==j)||((b=c).nodeType?k(a,c,d):l(a,c,d));return b=null,e}];i1&&ua(m),i>1&&sa(a.slice(0,i-1).concat({value:" "===a[i-2].type?"*":""})).replace(P,"$1"),c,i0,e=a.length>0,f=function(f,g,h,i,k){var l,o,q,r=0,s="0",t=f&&[],u=[],v=j,x=f||e&&d.find.TAG("*",k),y=w+=null==v?1:Math.random()||.1,z=x.length;for(k&&(j=g===n||g||k);s!==z&&null!=(l=x[s]);s++){if(e&&l){o=0,g||l.ownerDocument===n||(m(l),h=!p);while(q=a[o++])if(q(l,g||n,h)){i.push(l);break}k&&(w=y)}c&&((l=!q&&l)&&r--,f&&t.push(l))}if(r+=s,c&&s!==r){o=0;while(q=b[o++])q(t,u,g,h);if(f){if(r>0)while(s--)t[s]||u[s]||(u[s]=E.call(i));u=wa(u)}G.apply(i,u),k&&!f&&u.length>0&&r+b.length>1&&ga.uniqueSort(i)}return k&&(w=y,j=v),t};return c?ia(f):f}return h=ga.compile=function(a,b){var c,d=[],e=[],f=A[a+" "];if(!f){b||(b=g(a)),c=b.length;while(c--)f=ya(b[c]),f[u]?d.push(f):e.push(f);f=A(a,za(e,d)),f.selector=a}return f},i=ga.select=function(a,b,c,e){var f,i,j,k,l,m="function"==typeof a&&a,n=!e&&g(a=m.selector||a);if(c=c||[],1===n.length){if(i=n[0]=n[0].slice(0),i.length>2&&"ID"===(j=i[0]).type&&9===b.nodeType&&p&&d.relative[i[1].type]){if(b=(d.find.ID(j.matches[0].replace(_,aa),b)||[])[0],!b)return c;m&&(b=b.parentNode),a=a.slice(i.shift().value.length)}f=V.needsContext.test(a)?0:i.length;while(f--){if(j=i[f],d.relative[k=j.type])break;if((l=d.find[k])&&(e=l(j.matches[0].replace(_,aa),$.test(i[0].type)&&qa(b.parentNode)||b))){if(i.splice(f,1),a=e.length&&sa(i),!a)return G.apply(c,e),c;break}}}return(m||h(a,n))(e,b,!p,c,!b||$.test(a)&&qa(b.parentNode)||b),c},c.sortStable=u.split("").sort(B).join("")===u,c.detectDuplicates=!!l,m(),c.sortDetached=ja(function(a){return 1&a.compareDocumentPosition(n.createElement("fieldset"))}),ja(function(a){return a.innerHTML="","#"===a.firstChild.getAttribute("href")})||ka("type|href|height|width",function(a,b,c){if(!c)return a.getAttribute(b,"type"===b.toLowerCase()?1:2)}),c.attributes&&ja(function(a){return 
a.innerHTML="",a.firstChild.setAttribute("value",""),""===a.firstChild.getAttribute("value")})||ka("value",function(a,b,c){if(!c&&"input"===a.nodeName.toLowerCase())return a.defaultValue}),ja(function(a){return null==a.getAttribute("disabled")})||ka(J,function(a,b,c){var d;if(!c)return a[b]===!0?b.toLowerCase():(d=a.getAttributeNode(b))&&d.specified?d.value:null}),ga}(a);r.find=x,r.expr=x.selectors,r.expr[":"]=r.expr.pseudos,r.uniqueSort=r.unique=x.uniqueSort,r.text=x.getText,r.isXMLDoc=x.isXML,r.contains=x.contains,r.escapeSelector=x.escape;var y=function(a,b,c){var d=[],e=void 0!==c;while((a=a[b])&&9!==a.nodeType)if(1===a.nodeType){if(e&&r(a).is(c))break;d.push(a)}return d},z=function(a,b){for(var c=[];a;a=a.nextSibling)1===a.nodeType&&a!==b&&c.push(a);return c},A=r.expr.match.needsContext;function B(a,b){return a.nodeName&&a.nodeName.toLowerCase()===b.toLowerCase()}var C=/^<([a-z][^\/\0>:\x20\t\r\n\f]*)[\x20\t\r\n\f]*\/?>(?:<\/\1>|)$/i,D=/^.[^:#\[\.,]*$/;function E(a,b,c){return r.isFunction(b)?r.grep(a,function(a,d){return!!b.call(a,d,a)!==c}):b.nodeType?r.grep(a,function(a){return a===b!==c}):"string"!=typeof b?r.grep(a,function(a){return i.call(b,a)>-1!==c}):D.test(b)?r.filter(b,a,c):(b=r.filter(b,a),r.grep(a,function(a){return i.call(b,a)>-1!==c&&1===a.nodeType}))}r.filter=function(a,b,c){var d=b[0];return c&&(a=":not("+a+")"),1===b.length&&1===d.nodeType?r.find.matchesSelector(d,a)?[d]:[]:r.find.matches(a,r.grep(b,function(a){return 1===a.nodeType}))},r.fn.extend({find:function(a){var b,c,d=this.length,e=this;if("string"!=typeof a)return this.pushStack(r(a).filter(function(){for(b=0;b1?r.uniqueSort(c):c},filter:function(a){return this.pushStack(E(this,a||[],!1))},not:function(a){return this.pushStack(E(this,a||[],!0))},is:function(a){return!!E(this,"string"==typeof a&&A.test(a)?r(a):a||[],!1).length}});var F,G=/^(?:\s*(<[\w\W]+>)[^>]*|#([\w-]+))$/,H=r.fn.init=function(a,b,c){var e,f;if(!a)return this;if(c=c||F,"string"==typeof a){if(e="<"===a[0]&&">"===a[a.length-1]&&a.length>=3?[null,a,null]:G.exec(a),!e||!e[1]&&b)return!b||b.jquery?(b||c).find(a):this.constructor(b).find(a);if(e[1]){if(b=b instanceof r?b[0]:b,r.merge(this,r.parseHTML(e[1],b&&b.nodeType?b.ownerDocument||b:d,!0)),C.test(e[1])&&r.isPlainObject(b))for(e in b)r.isFunction(this[e])?this[e](b[e]):this.attr(e,b[e]);return this}return f=d.getElementById(e[2]),f&&(this[0]=f,this.length=1),this}return a.nodeType?(this[0]=a,this.length=1,this):r.isFunction(a)?void 0!==c.ready?c.ready(a):a(r):r.makeArray(a,this)};H.prototype=r.fn,F=r(d);var I=/^(?:parents|prev(?:Until|All))/,J={children:!0,contents:!0,next:!0,prev:!0};r.fn.extend({has:function(a){var b=r(a,this),c=b.length;return this.filter(function(){for(var a=0;a-1:1===c.nodeType&&r.find.matchesSelector(c,a))){f.push(c);break}return this.pushStack(f.length>1?r.uniqueSort(f):f)},index:function(a){return a?"string"==typeof a?i.call(r(a),this[0]):i.call(this,a.jquery?a[0]:a):this[0]&&this[0].parentNode?this.first().prevAll().length:-1},add:function(a,b){return this.pushStack(r.uniqueSort(r.merge(this.get(),r(a,b))))},addBack:function(a){return this.add(null==a?this.prevObject:this.prevObject.filter(a))}});function K(a,b){while((a=a[b])&&1!==a.nodeType);return a}r.each({parent:function(a){var b=a.parentNode;return b&&11!==b.nodeType?b:null},parents:function(a){return y(a,"parentNode")},parentsUntil:function(a,b,c){return y(a,"parentNode",c)},next:function(a){return K(a,"nextSibling")},prev:function(a){return K(a,"previousSibling")},nextAll:function(a){return 
y(a,"nextSibling")},prevAll:function(a){return y(a,"previousSibling")},nextUntil:function(a,b,c){return y(a,"nextSibling",c)},prevUntil:function(a,b,c){return y(a,"previousSibling",c)},siblings:function(a){return z((a.parentNode||{}).firstChild,a)},children:function(a){return z(a.firstChild)},contents:function(a){return B(a,"iframe")?a.contentDocument:(B(a,"template")&&(a=a.content||a),r.merge([],a.childNodes))}},function(a,b){r.fn[a]=function(c,d){var e=r.map(this,b,c);return"Until"!==a.slice(-5)&&(d=c),d&&"string"==typeof d&&(e=r.filter(d,e)),this.length>1&&(J[a]||r.uniqueSort(e),I.test(a)&&e.reverse()),this.pushStack(e)}});var L=/[^\x20\t\r\n\f]+/g;function M(a){var b={};return r.each(a.match(L)||[],function(a,c){b[c]=!0}),b}r.Callbacks=function(a){a="string"==typeof a?M(a):r.extend({},a);var b,c,d,e,f=[],g=[],h=-1,i=function(){for(e=e||a.once,d=b=!0;g.length;h=-1){c=g.shift();while(++h-1)f.splice(c,1),c<=h&&h--}),this},has:function(a){return a?r.inArray(a,f)>-1:f.length>0},empty:function(){return f&&(f=[]),this},disable:function(){return e=g=[],f=c="",this},disabled:function(){return!f},lock:function(){return e=g=[],c||b||(f=c=""),this},locked:function(){return!!e},fireWith:function(a,c){return e||(c=c||[],c=[a,c.slice?c.slice():c],g.push(c),b||i()),this},fire:function(){return j.fireWith(this,arguments),this},fired:function(){return!!d}};return j};function N(a){return a}function O(a){throw a}function P(a,b,c,d){var e;try{a&&r.isFunction(e=a.promise)?e.call(a).done(b).fail(c):a&&r.isFunction(e=a.then)?e.call(a,b,c):b.apply(void 0,[a].slice(d))}catch(a){c.apply(void 0,[a])}}r.extend({Deferred:function(b){var c=[["notify","progress",r.Callbacks("memory"),r.Callbacks("memory"),2],["resolve","done",r.Callbacks("once memory"),r.Callbacks("once memory"),0,"resolved"],["reject","fail",r.Callbacks("once memory"),r.Callbacks("once memory"),1,"rejected"]],d="pending",e={state:function(){return d},always:function(){return f.done(arguments).fail(arguments),this},"catch":function(a){return e.then(null,a)},pipe:function(){var a=arguments;return r.Deferred(function(b){r.each(c,function(c,d){var e=r.isFunction(a[d[4]])&&a[d[4]];f[d[1]](function(){var a=e&&e.apply(this,arguments);a&&r.isFunction(a.promise)?a.promise().progress(b.notify).done(b.resolve).fail(b.reject):b[d[0]+"With"](this,e?[a]:arguments)})}),a=null}).promise()},then:function(b,d,e){var f=0;function g(b,c,d,e){return function(){var h=this,i=arguments,j=function(){var a,j;if(!(b=f&&(d!==O&&(h=void 0,i=[a]),c.rejectWith(h,i))}};b?k():(r.Deferred.getStackHook&&(k.stackTrace=r.Deferred.getStackHook()),a.setTimeout(k))}}return r.Deferred(function(a){c[0][3].add(g(0,a,r.isFunction(e)?e:N,a.notifyWith)),c[1][3].add(g(0,a,r.isFunction(b)?b:N)),c[2][3].add(g(0,a,r.isFunction(d)?d:O))}).promise()},promise:function(a){return null!=a?r.extend(a,e):e}},f={};return r.each(c,function(a,b){var g=b[2],h=b[5];e[b[1]]=g.add,h&&g.add(function(){d=h},c[3-a][2].disable,c[0][2].lock),g.add(b[3].fire),f[b[0]]=function(){return f[b[0]+"With"](this===f?void 0:this,arguments),this},f[b[0]+"With"]=g.fireWith}),e.promise(f),b&&b.call(f,f),f},when:function(a){var b=arguments.length,c=b,d=Array(c),e=f.call(arguments),g=r.Deferred(),h=function(a){return function(c){d[a]=this,e[a]=arguments.length>1?f.call(arguments):c,--b||g.resolveWith(d,e)}};if(b<=1&&(P(a,g.done(h(c)).resolve,g.reject,!b),"pending"===g.state()||r.isFunction(e[c]&&e[c].then)))return g.then();while(c--)P(e[c],h(c),g.reject);return g.promise()}});var 
Q=/^(Eval|Internal|Range|Reference|Syntax|Type|URI)Error$/;r.Deferred.exceptionHook=function(b,c){a.console&&a.console.warn&&b&&Q.test(b.name)&&a.console.warn("jQuery.Deferred exception: "+b.message,b.stack,c)},r.readyException=function(b){a.setTimeout(function(){throw b})};var R=r.Deferred();r.fn.ready=function(a){return R.then(a)["catch"](function(a){r.readyException(a)}),this},r.extend({isReady:!1,readyWait:1,ready:function(a){(a===!0?--r.readyWait:r.isReady)||(r.isReady=!0,a!==!0&&--r.readyWait>0||R.resolveWith(d,[r]))}}),r.ready.then=R.then;function S(){d.removeEventListener("DOMContentLoaded",S), -a.removeEventListener("load",S),r.ready()}"complete"===d.readyState||"loading"!==d.readyState&&!d.documentElement.doScroll?a.setTimeout(r.ready):(d.addEventListener("DOMContentLoaded",S),a.addEventListener("load",S));var T=function(a,b,c,d,e,f,g){var h=0,i=a.length,j=null==c;if("object"===r.type(c)){e=!0;for(h in c)T(a,b,h,c[h],!0,f,g)}else if(void 0!==d&&(e=!0,r.isFunction(d)||(g=!0),j&&(g?(b.call(a,d),b=null):(j=b,b=function(a,b,c){return j.call(r(a),c)})),b))for(;h1,null,!0)},removeData:function(a){return this.each(function(){X.remove(this,a)})}}),r.extend({queue:function(a,b,c){var d;if(a)return b=(b||"fx")+"queue",d=W.get(a,b),c&&(!d||Array.isArray(c)?d=W.access(a,b,r.makeArray(c)):d.push(c)),d||[]},dequeue:function(a,b){b=b||"fx";var c=r.queue(a,b),d=c.length,e=c.shift(),f=r._queueHooks(a,b),g=function(){r.dequeue(a,b)};"inprogress"===e&&(e=c.shift(),d--),e&&("fx"===b&&c.unshift("inprogress"),delete f.stop,e.call(a,g,f)),!d&&f&&f.empty.fire()},_queueHooks:function(a,b){var c=b+"queueHooks";return W.get(a,c)||W.access(a,c,{empty:r.Callbacks("once memory").add(function(){W.remove(a,[b+"queue",c])})})}}),r.fn.extend({queue:function(a,b){var c=2;return"string"!=typeof a&&(b=a,a="fx",c--),arguments.length\x20\t\r\n\f]+)/i,la=/^$|\/(?:java|ecma)script/i,ma={option:[1,""],thead:[1,"","
"],col:[2,"","
"],tr:[2,"","
"],td:[3,"","
"],_default:[0,"",""]};ma.optgroup=ma.option,ma.tbody=ma.tfoot=ma.colgroup=ma.caption=ma.thead,ma.th=ma.td;function na(a,b){var c;return c="undefined"!=typeof a.getElementsByTagName?a.getElementsByTagName(b||"*"):"undefined"!=typeof a.querySelectorAll?a.querySelectorAll(b||"*"):[],void 0===b||b&&B(a,b)?r.merge([a],c):c}function oa(a,b){for(var c=0,d=a.length;c-1)e&&e.push(f);else if(j=r.contains(f.ownerDocument,f),g=na(l.appendChild(f),"script"),j&&oa(g),c){k=0;while(f=g[k++])la.test(f.type||"")&&c.push(f)}return l}!function(){var a=d.createDocumentFragment(),b=a.appendChild(d.createElement("div")),c=d.createElement("input");c.setAttribute("type","radio"),c.setAttribute("checked","checked"),c.setAttribute("name","t"),b.appendChild(c),o.checkClone=b.cloneNode(!0).cloneNode(!0).lastChild.checked,b.innerHTML="",o.noCloneChecked=!!b.cloneNode(!0).lastChild.defaultValue}();var ra=d.documentElement,sa=/^key/,ta=/^(?:mouse|pointer|contextmenu|drag|drop)|click/,ua=/^([^.]*)(?:\.(.+)|)/;function va(){return!0}function wa(){return!1}function xa(){try{return d.activeElement}catch(a){}}function ya(a,b,c,d,e,f){var g,h;if("object"==typeof b){"string"!=typeof c&&(d=d||c,c=void 0);for(h in b)ya(a,h,c,d,b[h],f);return a}if(null==d&&null==e?(e=c,d=c=void 0):null==e&&("string"==typeof c?(e=d,d=void 0):(e=d,d=c,c=void 0)),e===!1)e=wa;else if(!e)return a;return 1===f&&(g=e,e=function(a){return r().off(a),g.apply(this,arguments)},e.guid=g.guid||(g.guid=r.guid++)),a.each(function(){r.event.add(this,b,e,d,c)})}r.event={global:{},add:function(a,b,c,d,e){var f,g,h,i,j,k,l,m,n,o,p,q=W.get(a);if(q){c.handler&&(f=c,c=f.handler,e=f.selector),e&&r.find.matchesSelector(ra,e),c.guid||(c.guid=r.guid++),(i=q.events)||(i=q.events={}),(g=q.handle)||(g=q.handle=function(b){return"undefined"!=typeof r&&r.event.triggered!==b.type?r.event.dispatch.apply(a,arguments):void 0}),b=(b||"").match(L)||[""],j=b.length;while(j--)h=ua.exec(b[j])||[],n=p=h[1],o=(h[2]||"").split(".").sort(),n&&(l=r.event.special[n]||{},n=(e?l.delegateType:l.bindType)||n,l=r.event.special[n]||{},k=r.extend({type:n,origType:p,data:d,handler:c,guid:c.guid,selector:e,needsContext:e&&r.expr.match.needsContext.test(e),namespace:o.join(".")},f),(m=i[n])||(m=i[n]=[],m.delegateCount=0,l.setup&&l.setup.call(a,d,o,g)!==!1||a.addEventListener&&a.addEventListener(n,g)),l.add&&(l.add.call(a,k),k.handler.guid||(k.handler.guid=c.guid)),e?m.splice(m.delegateCount++,0,k):m.push(k),r.event.global[n]=!0)}},remove:function(a,b,c,d,e){var f,g,h,i,j,k,l,m,n,o,p,q=W.hasData(a)&&W.get(a);if(q&&(i=q.events)){b=(b||"").match(L)||[""],j=b.length;while(j--)if(h=ua.exec(b[j])||[],n=p=h[1],o=(h[2]||"").split(".").sort(),n){l=r.event.special[n]||{},n=(d?l.delegateType:l.bindType)||n,m=i[n]||[],h=h[2]&&new RegExp("(^|\\.)"+o.join("\\.(?:.*\\.|)")+"(\\.|$)"),g=f=m.length;while(f--)k=m[f],!e&&p!==k.origType||c&&c.guid!==k.guid||h&&!h.test(k.namespace)||d&&d!==k.selector&&("**"!==d||!k.selector)||(m.splice(f,1),k.selector&&m.delegateCount--,l.remove&&l.remove.call(a,k));g&&!m.length&&(l.teardown&&l.teardown.call(a,o,q.handle)!==!1||r.removeEvent(a,n,q.handle),delete i[n])}else for(n in i)r.event.remove(a,n+b[j],c,d,!0);r.isEmptyObject(i)&&W.remove(a,"handle events")}},dispatch:function(a){var b=r.event.fix(a),c,d,e,f,g,h,i=new 
Array(arguments.length),j=(W.get(this,"events")||{})[b.type]||[],k=r.event.special[b.type]||{};for(i[0]=b,c=1;c=1))for(;j!==this;j=j.parentNode||this)if(1===j.nodeType&&("click"!==a.type||j.disabled!==!0)){for(f=[],g={},c=0;c-1:r.find(e,this,null,[j]).length),g[e]&&f.push(d);f.length&&h.push({elem:j,handlers:f})}return j=this,i\x20\t\r\n\f]*)[^>]*)\/>/gi,Aa=/\s*$/g;function Ea(a,b){return B(a,"table")&&B(11!==b.nodeType?b:b.firstChild,"tr")?r(">tbody",a)[0]||a:a}function Fa(a){return a.type=(null!==a.getAttribute("type"))+"/"+a.type,a}function Ga(a){var b=Ca.exec(a.type);return b?a.type=b[1]:a.removeAttribute("type"),a}function Ha(a,b){var c,d,e,f,g,h,i,j;if(1===b.nodeType){if(W.hasData(a)&&(f=W.access(a),g=W.set(b,f),j=f.events)){delete g.handle,g.events={};for(e in j)for(c=0,d=j[e].length;c1&&"string"==typeof q&&!o.checkClone&&Ba.test(q))return a.each(function(e){var f=a.eq(e);s&&(b[0]=q.call(this,e,f.html())),Ja(f,b,c,d)});if(m&&(e=qa(b,a[0].ownerDocument,!1,a,d),f=e.firstChild,1===e.childNodes.length&&(e=f),f||d)){for(h=r.map(na(e,"script"),Fa),i=h.length;l")},clone:function(a,b,c){var d,e,f,g,h=a.cloneNode(!0),i=r.contains(a.ownerDocument,a);if(!(o.noCloneChecked||1!==a.nodeType&&11!==a.nodeType||r.isXMLDoc(a)))for(g=na(h),f=na(a),d=0,e=f.length;d0&&oa(g,!i&&na(a,"script")),h},cleanData:function(a){for(var b,c,d,e=r.event.special,f=0;void 0!==(c=a[f]);f++)if(U(c)){if(b=c[W.expando]){if(b.events)for(d in b.events)e[d]?r.event.remove(c,d):r.removeEvent(c,d,b.handle);c[W.expando]=void 0}c[X.expando]&&(c[X.expando]=void 0)}}}),r.fn.extend({detach:function(a){return Ka(this,a,!0)},remove:function(a){return Ka(this,a)},text:function(a){return T(this,function(a){return void 0===a?r.text(this):this.empty().each(function(){1!==this.nodeType&&11!==this.nodeType&&9!==this.nodeType||(this.textContent=a)})},null,a,arguments.length)},append:function(){return Ja(this,arguments,function(a){if(1===this.nodeType||11===this.nodeType||9===this.nodeType){var b=Ea(this,a);b.appendChild(a)}})},prepend:function(){return Ja(this,arguments,function(a){if(1===this.nodeType||11===this.nodeType||9===this.nodeType){var b=Ea(this,a);b.insertBefore(a,b.firstChild)}})},before:function(){return Ja(this,arguments,function(a){this.parentNode&&this.parentNode.insertBefore(a,this)})},after:function(){return Ja(this,arguments,function(a){this.parentNode&&this.parentNode.insertBefore(a,this.nextSibling)})},empty:function(){for(var a,b=0;null!=(a=this[b]);b++)1===a.nodeType&&(r.cleanData(na(a,!1)),a.textContent="");return this},clone:function(a,b){return a=null!=a&&a,b=null==b?a:b,this.map(function(){return r.clone(this,a,b)})},html:function(a){return T(this,function(a){var b=this[0]||{},c=0,d=this.length;if(void 0===a&&1===b.nodeType)return b.innerHTML;if("string"==typeof a&&!Aa.test(a)&&!ma[(ka.exec(a)||["",""])[1].toLowerCase()]){a=r.htmlPrefilter(a);try{for(;c1)}});function _a(a,b,c,d,e){return new _a.prototype.init(a,b,c,d,e)}r.Tween=_a,_a.prototype={constructor:_a,init:function(a,b,c,d,e,f){this.elem=a,this.prop=c,this.easing=e||r.easing._default,this.options=b,this.start=this.now=this.cur(),this.end=d,this.unit=f||(r.cssNumber[c]?"":"px")},cur:function(){var a=_a.propHooks[this.prop];return a&&a.get?a.get(this):_a.propHooks._default.get(this)},run:function(a){var b,c=_a.propHooks[this.prop];return 
this.options.duration?this.pos=b=r.easing[this.easing](a,this.options.duration*a,0,1,this.options.duration):this.pos=b=a,this.now=(this.end-this.start)*b+this.start,this.options.step&&this.options.step.call(this.elem,this.now,this),c&&c.set?c.set(this):_a.propHooks._default.set(this),this}},_a.prototype.init.prototype=_a.prototype,_a.propHooks={_default:{get:function(a){var b;return 1!==a.elem.nodeType||null!=a.elem[a.prop]&&null==a.elem.style[a.prop]?a.elem[a.prop]:(b=r.css(a.elem,a.prop,""),b&&"auto"!==b?b:0)},set:function(a){r.fx.step[a.prop]?r.fx.step[a.prop](a):1!==a.elem.nodeType||null==a.elem.style[r.cssProps[a.prop]]&&!r.cssHooks[a.prop]?a.elem[a.prop]=a.now:r.style(a.elem,a.prop,a.now+a.unit)}}},_a.propHooks.scrollTop=_a.propHooks.scrollLeft={set:function(a){a.elem.nodeType&&a.elem.parentNode&&(a.elem[a.prop]=a.now)}},r.easing={linear:function(a){return a},swing:function(a){return.5-Math.cos(a*Math.PI)/2},_default:"swing"},r.fx=_a.prototype.init,r.fx.step={};var ab,bb,cb=/^(?:toggle|show|hide)$/,db=/queueHooks$/;function eb(){bb&&(d.hidden===!1&&a.requestAnimationFrame?a.requestAnimationFrame(eb):a.setTimeout(eb,r.fx.interval),r.fx.tick())}function fb(){return a.setTimeout(function(){ab=void 0}),ab=r.now()}function gb(a,b){var c,d=0,e={height:a};for(b=b?1:0;d<4;d+=2-b)c=ca[d],e["margin"+c]=e["padding"+c]=a;return b&&(e.opacity=e.width=a),e}function hb(a,b,c){for(var d,e=(kb.tweeners[b]||[]).concat(kb.tweeners["*"]),f=0,g=e.length;f1)},removeAttr:function(a){return this.each(function(){r.removeAttr(this,a)})}}),r.extend({attr:function(a,b,c){var d,e,f=a.nodeType;if(3!==f&&8!==f&&2!==f)return"undefined"==typeof a.getAttribute?r.prop(a,b,c):(1===f&&r.isXMLDoc(a)||(e=r.attrHooks[b.toLowerCase()]||(r.expr.match.bool.test(b)?lb:void 0)),void 0!==c?null===c?void r.removeAttr(a,b):e&&"set"in e&&void 0!==(d=e.set(a,c,b))?d:(a.setAttribute(b,c+""),c):e&&"get"in e&&null!==(d=e.get(a,b))?d:(d=r.find.attr(a,b), -null==d?void 0:d))},attrHooks:{type:{set:function(a,b){if(!o.radioValue&&"radio"===b&&B(a,"input")){var c=a.value;return a.setAttribute("type",b),c&&(a.value=c),b}}}},removeAttr:function(a,b){var c,d=0,e=b&&b.match(L);if(e&&1===a.nodeType)while(c=e[d++])a.removeAttribute(c)}}),lb={set:function(a,b,c){return b===!1?r.removeAttr(a,c):a.setAttribute(c,c),c}},r.each(r.expr.match.bool.source.match(/\w+/g),function(a,b){var c=mb[b]||r.find.attr;mb[b]=function(a,b,d){var e,f,g=b.toLowerCase();return d||(f=mb[g],mb[g]=e,e=null!=c(a,b,d)?g:null,mb[g]=f),e}});var nb=/^(?:input|select|textarea|button)$/i,ob=/^(?:a|area)$/i;r.fn.extend({prop:function(a,b){return T(this,r.prop,a,b,arguments.length>1)},removeProp:function(a){return this.each(function(){delete this[r.propFix[a]||a]})}}),r.extend({prop:function(a,b,c){var d,e,f=a.nodeType;if(3!==f&&8!==f&&2!==f)return 1===f&&r.isXMLDoc(a)||(b=r.propFix[b]||b,e=r.propHooks[b]),void 0!==c?e&&"set"in e&&void 0!==(d=e.set(a,c,b))?d:a[b]=c:e&&"get"in e&&null!==(d=e.get(a,b))?d:a[b]},propHooks:{tabIndex:{get:function(a){var b=r.find.attr(a,"tabindex");return b?parseInt(b,10):nb.test(a.nodeName)||ob.test(a.nodeName)&&a.href?0:-1}}},propFix:{"for":"htmlFor","class":"className"}}),o.optSelected||(r.propHooks.selected={get:function(a){var b=a.parentNode;return b&&b.parentNode&&b.parentNode.selectedIndex,null},set:function(a){var 
b=a.parentNode;b&&(b.selectedIndex,b.parentNode&&b.parentNode.selectedIndex)}}),r.each(["tabIndex","readOnly","maxLength","cellSpacing","cellPadding","rowSpan","colSpan","useMap","frameBorder","contentEditable"],function(){r.propFix[this.toLowerCase()]=this});function pb(a){var b=a.match(L)||[];return b.join(" ")}function qb(a){return a.getAttribute&&a.getAttribute("class")||""}r.fn.extend({addClass:function(a){var b,c,d,e,f,g,h,i=0;if(r.isFunction(a))return this.each(function(b){r(this).addClass(a.call(this,b,qb(this)))});if("string"==typeof a&&a){b=a.match(L)||[];while(c=this[i++])if(e=qb(c),d=1===c.nodeType&&" "+pb(e)+" "){g=0;while(f=b[g++])d.indexOf(" "+f+" ")<0&&(d+=f+" ");h=pb(d),e!==h&&c.setAttribute("class",h)}}return this},removeClass:function(a){var b,c,d,e,f,g,h,i=0;if(r.isFunction(a))return this.each(function(b){r(this).removeClass(a.call(this,b,qb(this)))});if(!arguments.length)return this.attr("class","");if("string"==typeof a&&a){b=a.match(L)||[];while(c=this[i++])if(e=qb(c),d=1===c.nodeType&&" "+pb(e)+" "){g=0;while(f=b[g++])while(d.indexOf(" "+f+" ")>-1)d=d.replace(" "+f+" "," ");h=pb(d),e!==h&&c.setAttribute("class",h)}}return this},toggleClass:function(a,b){var c=typeof a;return"boolean"==typeof b&&"string"===c?b?this.addClass(a):this.removeClass(a):r.isFunction(a)?this.each(function(c){r(this).toggleClass(a.call(this,c,qb(this),b),b)}):this.each(function(){var b,d,e,f;if("string"===c){d=0,e=r(this),f=a.match(L)||[];while(b=f[d++])e.hasClass(b)?e.removeClass(b):e.addClass(b)}else void 0!==a&&"boolean"!==c||(b=qb(this),b&&W.set(this,"__className__",b),this.setAttribute&&this.setAttribute("class",b||a===!1?"":W.get(this,"__className__")||""))})},hasClass:function(a){var b,c,d=0;b=" "+a+" ";while(c=this[d++])if(1===c.nodeType&&(" "+pb(qb(c))+" ").indexOf(b)>-1)return!0;return!1}});var rb=/\r/g;r.fn.extend({val:function(a){var b,c,d,e=this[0];{if(arguments.length)return d=r.isFunction(a),this.each(function(c){var e;1===this.nodeType&&(e=d?a.call(this,c,r(this).val()):a,null==e?e="":"number"==typeof e?e+="":Array.isArray(e)&&(e=r.map(e,function(a){return null==a?"":a+""})),b=r.valHooks[this.type]||r.valHooks[this.nodeName.toLowerCase()],b&&"set"in b&&void 0!==b.set(this,e,"value")||(this.value=e))});if(e)return b=r.valHooks[e.type]||r.valHooks[e.nodeName.toLowerCase()],b&&"get"in b&&void 0!==(c=b.get(e,"value"))?c:(c=e.value,"string"==typeof c?c.replace(rb,""):null==c?"":c)}}}),r.extend({valHooks:{option:{get:function(a){var b=r.find.attr(a,"value");return null!=b?b:pb(r.text(a))}},select:{get:function(a){var b,c,d,e=a.options,f=a.selectedIndex,g="select-one"===a.type,h=g?null:[],i=g?f+1:e.length;for(d=f<0?i:g?f:0;d-1)&&(c=!0);return c||(a.selectedIndex=-1),f}}}}),r.each(["radio","checkbox"],function(){r.valHooks[this]={set:function(a,b){if(Array.isArray(b))return a.checked=r.inArray(r(a).val(),b)>-1}},o.checkOn||(r.valHooks[this].get=function(a){return null===a.getAttribute("value")?"on":a.value})});var sb=/^(?:focusinfocus|focusoutblur)$/;r.extend(r.event,{trigger:function(b,c,e,f){var g,h,i,j,k,m,n,o=[e||d],p=l.call(b,"type")?b.type:b,q=l.call(b,"namespace")?b.namespace.split("."):[];if(h=i=e=e||d,3!==e.nodeType&&8!==e.nodeType&&!sb.test(p+r.event.triggered)&&(p.indexOf(".")>-1&&(q=p.split("."),p=q.shift(),q.sort()),k=p.indexOf(":")<0&&"on"+p,b=b[r.expando]?b:new r.Event(p,"object"==typeof b&&b),b.isTrigger=f?2:3,b.namespace=q.join("."),b.rnamespace=b.namespace?new RegExp("(^|\\.)"+q.join("\\.(?:.*\\.|)")+"(\\.|$)"):null,b.result=void 
-[WeNet web demo page (deleted): bundled minified jQuery and page markup; only the page text is recoverable and is summarized below.]
-WeNet Introduction: WeNet is the first production-oriented, full-stack, open-source end-to-end speech recognition solution, providing one-stop model training, model inference, and cloud-side and device-side model deployment.
-Product Demo: enter a WebSocket URL and start recognition; recognition stops automatically after a countdown and the recognized text is displayed on the page.
-Core Features: a product-first, production-ready design (models trained with WeNet deploy seamlessly to production, with product-grade support for long-form audio, endpoint detection, timestamps, and language models); a unified model for both low-latency streaming and high-accuracy non-streaming recognition; integrated cloud-side and device-side solutions.
-Contact Us: follow the WeChat official account by scanning the QR code.
- - - - \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/websocket/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/websocket/CMakeLists.txt deleted file mode 100644 index 67447c42d977f120fc39cdab0d052b011edd3efe..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/websocket/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -add_library(websocket STATIC - websocket_client.cc - websocket_server.cc -) -target_link_libraries(websocket PUBLIC decoder) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/websocket/websocket_client.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/websocket/websocket_client.cc deleted file mode 100644 index c0394e6250153e2d59636c9eab62badc4a737d16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/websocket/websocket_client.cc +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "websocket/websocket_client.h" - -#include "boost/json/src.hpp" - -#include "utils/log.h" - -namespace wenet { - -namespace beast = boost::beast; // from -namespace http = beast::http; // from -namespace websocket = beast::websocket; // from -namespace asio = boost::asio; // from -using tcp = boost::asio::ip::tcp; // from -namespace json = boost::json; - -WebSocketClient::WebSocketClient(const std::string& hostname, int port) - : hostname_(hostname), port_(port) { - Connect(); - t_.reset(new std::thread(&WebSocketClient::ReadLoopFunc, this)); -} - -void WebSocketClient::Connect() { - tcp::resolver resolver{ioc_}; - // Look up the domain name - auto const results = resolver.resolve(hostname_, std::to_string(port_)); - // Make the connection on the IP address we get from a lookup - auto ep = asio::connect(ws_.next_layer(), results); - // Provide the value of the Host HTTP header during the WebSocket handshake. 
- // See https://tools.ietf.org/html/rfc7230#section-5.4 - std::string host = hostname_ + ":" + std::to_string(ep.port()); - // Perform the websocket handshake - ws_.handshake(host, "/"); -} - -void WebSocketClient::SendTextData(const std::string& data) { - ws_.text(true); - ws_.write(asio::buffer(data)); -} - -void WebSocketClient::SendBinaryData(const void* data, size_t size) { - ws_.binary(true); - ws_.write(asio::buffer(data, size)); -} - -void WebSocketClient::Close() { ws_.close(websocket::close_code::normal); } - -void WebSocketClient::ReadLoopFunc() { - try { - while (true) { - beast::flat_buffer buffer; - ws_.read(buffer); - std::string message = beast::buffers_to_string(buffer.data()); - LOG(INFO) << message; - CHECK(ws_.got_text()); - json::object obj = json::parse(message).as_object(); - if (obj["status"] != "ok") { - break; - } - if (obj["type"] == "speech_end") { - done_ = true; - break; - } - } - } catch (beast::system_error const& se) { - // This indicates that the session was closed - if (se.code() != websocket::error::closed) { - LOG(ERROR) << se.code().message(); - } - } catch (std::exception const& e) { - LOG(ERROR) << e.what(); - } -} - -void WebSocketClient::Join() { t_->join(); } - -void WebSocketClient::SendStartSignal() { - // TODO(Binbin Zhang): Add sample rate and other setting support - json::value start_tag = {{"signal", "start"}, - {"nbest", nbest_}, - {"continuous_decoding", continuous_decoding_}}; - std::string start_message = json::serialize(start_tag); - this->SendTextData(start_message); -} - -void WebSocketClient::SendEndSignal() { - json::value end_tag = {{"signal", "end"}}; - std::string end_message = json::serialize(end_tag); - this->SendTextData(end_message); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/websocket/websocket_client.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/websocket/websocket_client.h deleted file mode 100644 index 76ec3aa451d31c7ee6b158ce21c8acdc10575eb3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/websocket/websocket_client.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef WEBSOCKET_WEBSOCKET_CLIENT_H_ -#define WEBSOCKET_WEBSOCKET_CLIENT_H_ - -#include -#include -#include -#include - -#include "boost/asio/connect.hpp" -#include "boost/asio/ip/tcp.hpp" -#include "boost/beast/core.hpp" -#include "boost/beast/websocket.hpp" - -#include "utils/utils.h" - -namespace wenet { - -namespace beast = boost::beast; // from -namespace http = beast::http; // from -namespace websocket = beast::websocket; // from -namespace asio = boost::asio; // from -using tcp = boost::asio::ip::tcp; // from - -class WebSocketClient { - public: - WebSocketClient(const std::string& host, int port); - - void SendTextData(const std::string& data); - void SendBinaryData(const void* data, size_t size); - void ReadLoopFunc(); - void Close(); - void Join(); - void SendStartSignal(); - void SendEndSignal(); - void set_nbest(int nbest) { nbest_ = nbest; } - void set_continuous_decoding(bool continuous_decoding) { - continuous_decoding_ = continuous_decoding; - } - bool done() const { return done_; } - - private: - void Connect(); - std::string hostname_; - int port_; - int nbest_ = 1; - bool continuous_decoding_ = false; - bool done_ = false; - asio::io_context ioc_; - websocket::stream ws_{ioc_}; - std::unique_ptr t_{nullptr}; - - WENET_DISALLOW_COPY_AND_ASSIGN(WebSocketClient); -}; - -} // namespace wenet - -#endif // WEBSOCKET_WEBSOCKET_CLIENT_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/websocket/websocket_server.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/websocket/websocket_server.cc deleted file mode 100644 index 52ab088f46d59b9f3f1add1e34d3aceae290f5da..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/websocket/websocket_server.cc +++ /dev/null @@ -1,267 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "websocket/websocket_server.h" - -#include -#include -#include - -#include "boost/json/src.hpp" -#include "utils/log.h" - -namespace wenet { - -namespace beast = boost::beast; // from -namespace http = beast::http; // from -namespace websocket = beast::websocket; // from -namespace asio = boost::asio; // from -using tcp = boost::asio::ip::tcp; // from -namespace json = boost::json; - -ConnectionHandler::ConnectionHandler( - tcp::socket&& socket, std::shared_ptr feature_config, - std::shared_ptr decode_config, - std::shared_ptr decode_resource) - : ws_(std::move(socket)), - feature_config_(std::move(feature_config)), - decode_config_(std::move(decode_config)), - decode_resource_(std::move(decode_resource)) {} - -void ConnectionHandler::OnSpeechStart() { - LOG(INFO) << "Received speech start signal, start reading speech"; - got_start_tag_ = true; - json::value rv = {{"status", "ok"}, {"type", "server_ready"}}; - ws_.text(true); - ws_.write(asio::buffer(json::serialize(rv))); - feature_pipeline_ = std::make_shared(*feature_config_); - decoder_ = std::make_shared(feature_pipeline_, decode_resource_, - *decode_config_); - // Start decoder thread - decode_thread_ = - std::make_shared(&ConnectionHandler::DecodeThreadFunc, this); -} - -void ConnectionHandler::OnSpeechEnd() { - LOG(INFO) << "Received speech end signal"; - if (feature_pipeline_ != nullptr) { - feature_pipeline_->set_input_finished(); - } - got_end_tag_ = true; -} - -void ConnectionHandler::OnPartialResult(const std::string& result) { - LOG(INFO) << "Partial result: " << result; - json::value rv = { - {"status", "ok"}, {"type", "partial_result"}, {"nbest", result}}; - ws_.text(true); - ws_.write(asio::buffer(json::serialize(rv))); -} - -void ConnectionHandler::OnFinalResult(const std::string& result) { - LOG(INFO) << "Final result: " << result; - json::value rv = { - {"status", "ok"}, {"type", "final_result"}, {"nbest", result}}; - ws_.text(true); - ws_.write(asio::buffer(json::serialize(rv))); -} - -void ConnectionHandler::OnFinish() { - // Send finish tag - json::value rv = {{"status", "ok"}, {"type", "speech_end"}}; - ws_.text(true); - ws_.write(asio::buffer(json::serialize(rv))); -} - -void ConnectionHandler::OnSpeechData(const beast::flat_buffer& buffer) { - // Read binary PCM data - int num_samples = buffer.size() / sizeof(int16_t); - VLOG(2) << "Received " << num_samples << " samples"; - CHECK(feature_pipeline_ != nullptr); - CHECK(decoder_ != nullptr); - const auto* pcm_data = static_cast(buffer.data().data()); - feature_pipeline_->AcceptWaveform(pcm_data, num_samples); -} - -std::string ConnectionHandler::SerializeResult(bool finish) { - json::array nbest; - for (const DecodeResult& path : decoder_->result()) { - json::object jpath({{"sentence", path.sentence}}); - if (finish) { - json::array word_pieces; - for (const WordPiece& word_piece : path.word_pieces) { - json::object jword_piece({{"word", word_piece.word}, - {"start", word_piece.start}, - {"end", word_piece.end}}); - word_pieces.emplace_back(jword_piece); - } - jpath.emplace("word_pieces", word_pieces); - } - nbest.emplace_back(jpath); - - if (nbest.size() == nbest_) { - break; - } - } - return json::serialize(nbest); -} - -void ConnectionHandler::DecodeThreadFunc() { - try { - while (true) { - DecodeState state = decoder_->Decode(); - if (state == DecodeState::kEndFeats) { - decoder_->Rescoring(); - std::string result = SerializeResult(true); - OnFinalResult(result); - OnFinish(); - stop_recognition_ = true; - break; - } else if (state == 
DecodeState::kEndpoint) { - decoder_->Rescoring(); - std::string result = SerializeResult(true); - OnFinalResult(result); - // If it's not continuous decoding, continue to do next recognition - // otherwise stop the recognition - if (continuous_decoding_) { - decoder_->ResetContinuousDecoding(); - } else { - OnFinish(); - stop_recognition_ = true; - break; - } - } else { - if (decoder_->DecodedSomething()) { - std::string result = SerializeResult(false); - OnPartialResult(result); - } - } - } - } catch (std::exception const& e) { - LOG(ERROR) << e.what(); - } -} - -void ConnectionHandler::OnError(const std::string& message) { - json::value rv = {{"status", "failed"}, {"message", message}}; - ws_.text(true); - ws_.write(asio::buffer(json::serialize(rv))); - // Close websocket - ws_.close(websocket::close_code::normal); -} - -void ConnectionHandler::OnText(const std::string& message) { - json::value v = json::parse(message); - if (v.is_object()) { - json::object obj = v.get_object(); - if (obj.find("signal") != obj.end()) { - json::string signal = obj["signal"].as_string(); - if (signal == "start") { - if (obj.find("nbest") != obj.end()) { - if (obj["nbest"].is_int64()) { - nbest_ = obj["nbest"].as_int64(); - } else { - OnError("integer is expected for nbest option"); - } - } - if (obj.find("continuous_decoding") != obj.end()) { - if (obj["continuous_decoding"].is_bool()) { - continuous_decoding_ = obj["continuous_decoding"].as_bool(); - } else { - OnError( - "boolean true or false is expected for " - "continuous_decoding option"); - } - } - OnSpeechStart(); - } else if (signal == "end") { - OnSpeechEnd(); - } else { - OnError("Unexpected signal type"); - } - } else { - OnError("Wrong message header"); - } - } else { - OnError("Wrong protocol"); - } -} - -void ConnectionHandler::operator()() { - try { - // Accept the websocket handshake - ws_.accept(); - for (;;) { - // This buffer will hold the incoming message - beast::flat_buffer buffer; - // Read a message - ws_.read(buffer); - if (ws_.got_text()) { - std::string message = beast::buffers_to_string(buffer.data()); - LOG(INFO) << message; - OnText(message); - if (got_end_tag_) { - break; - } - } else { - if (!got_start_tag_) { - OnError("Start signal is expected before binary data"); - } else { - if (stop_recognition_) { - break; - } - OnSpeechData(buffer); - } - } - } - - LOG(INFO) << "Read all pcm data, wait for decoding thread"; - if (decode_thread_ != nullptr) { - decode_thread_->join(); - } - } catch (beast::system_error const& se) { - LOG(INFO) << se.code().message(); - // This indicates that the session was closed - if (se.code() == websocket::error::closed) { - OnSpeechEnd(); - } - if (decode_thread_ != nullptr) { - decode_thread_->join(); - } - } catch (std::exception const& e) { - LOG(ERROR) << e.what(); - } -} - -void WebSocketServer::Start() { - try { - auto const address = asio::ip::make_address("0.0.0.0"); - tcp::acceptor acceptor{ioc_, {address, static_cast(port_)}}; - for (;;) { - // This will receive the new connection - tcp::socket socket{ioc_}; - // Block until we get a connection - acceptor.accept(socket); - // Launch the session, transferring ownership of the socket - ConnectionHandler handler(std::move(socket), feature_config_, - decode_config_, decode_resource_); - std::thread t(std::move(handler)); - t.detach(); - } - } catch (const std::exception& e) { - LOG(FATAL) << e.what(); - } -} - -} // namespace wenet diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/websocket/websocket_server.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/websocket/websocket_server.h deleted file mode 100644 index a1241834221dcf93c34d6414bd9b5ae40ef1cf38..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/libtorch/websocket/websocket_server.h +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef WEBSOCKET_WEBSOCKET_SERVER_H_ -#define WEBSOCKET_WEBSOCKET_SERVER_H_ - -#include -#include -#include -#include -#include - -#include "boost/asio/connect.hpp" -#include "boost/asio/ip/tcp.hpp" -#include "boost/beast/core.hpp" -#include "boost/beast/websocket.hpp" - -#include "decoder/asr_decoder.h" -#include "frontend/feature_pipeline.h" -#include "utils/log.h" - -namespace wenet { - -namespace beast = boost::beast; // from -namespace http = beast::http; // from -namespace websocket = beast::websocket; // from -namespace asio = boost::asio; // from -using tcp = boost::asio::ip::tcp; // from - -class ConnectionHandler { - public: - ConnectionHandler(tcp::socket&& socket, - std::shared_ptr feature_config, - std::shared_ptr decode_config, - std::shared_ptr decode_resource_); - void operator()(); - - private: - void OnSpeechStart(); - void OnSpeechEnd(); - void OnText(const std::string& message); - void OnFinish(); - void OnSpeechData(const beast::flat_buffer& buffer); - void OnError(const std::string& message); - void OnPartialResult(const std::string& result); - void OnFinalResult(const std::string& result); - void DecodeThreadFunc(); - std::string SerializeResult(bool finish); - - bool continuous_decoding_ = false; - int nbest_ = 1; - websocket::stream ws_; - std::shared_ptr feature_config_; - std::shared_ptr decode_config_; - std::shared_ptr decode_resource_; - - bool got_start_tag_ = false; - bool got_end_tag_ = false; - // When endpoint is detected, stop recognition, and stop receiving data. 
- bool stop_recognition_ = false; - std::shared_ptr feature_pipeline_ = nullptr; - std::shared_ptr decoder_ = nullptr; - std::shared_ptr decode_thread_ = nullptr; -}; - -class WebSocketServer { - public: - WebSocketServer(int port, - std::shared_ptr feature_config, - std::shared_ptr decode_config, - std::shared_ptr decode_resource) - : port_(port), - feature_config_(std::move(feature_config)), - decode_config_(std::move(decode_config)), - decode_resource_(std::move(decode_resource)) {} - - void Start(); - - private: - int port_; - // The io_context is required for all I/O - asio::io_context ioc_{1}; - std::shared_ptr feature_config_; - std::shared_ptr decode_config_; - std::shared_ptr decode_resource_; - WENET_DISALLOW_COPY_AND_ASSIGN(WebSocketServer); -}; - -} // namespace wenet - -#endif // WEBSOCKET_WEBSOCKET_SERVER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/CMakeLists.txt deleted file mode 100644 index 6223e1481e7e98846d9de3535ec510b41c237d48..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/CMakeLists.txt +++ /dev/null @@ -1,81 +0,0 @@ -cmake_minimum_required(VERSION 3.14 FATAL_ERROR) - -project(wenet VERSION 0.1) - -option(CXX11_ABI "whether to use CXX11_ABI libtorch" OFF) -option(GRAPH_TOOLS "whether to build TLG graph tools" OFF) -option(BUILD_TESTING "whether to build unit test" ON) - -option(GRPC "whether to build with gRPC" OFF) -# TODO(Binbin Zhang): Change websocket to OFF since it depends on boost -# which is a very big library -option(WEBSOCKET "whether to build with websocket" ON) -option(HTTP "whether to build with http" OFF) -option(TORCH "whether to build with Torch" ON) -option(ONNX "whether to build with ONNX" OFF) -option(GPU "whether to build with GPU" OFF) - -set(CMAKE_VERBOSE_MAKEFILE OFF) - -include(FetchContent) -set(FETCHCONTENT_QUIET OFF) -get_filename_component(fc_base "fc_base" REALPATH BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") -set(FETCHCONTENT_BASE_DIR ${fc_base}) - -list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) - -if(NOT MSVC) - # Keep the same with openfst, -fPIC or -fpic - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -pthread -fPIC") -else() - set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) - add_compile_options("$<$:/utf-8>") -endif() - -# Include all dependency -if(TORCH) - include(libtorch) -endif() -if(ONNX) - include(onnx) -endif() -include(openfst) -include_directories( - ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/kaldi -) - -# Build all libraries -add_subdirectory(utils) -add_subdirectory(frontend) -add_subdirectory(post_processor) -add_subdirectory(kaldi) # kaldi: wfst based decoder -add_subdirectory(decoder) -add_subdirectory(api) - -# Optionally, you can build with websocket -if(WEBSOCKET) - include(boost) - add_subdirectory(websocket) -endif() - -# Optionally, you can build with gRPC -if(GRPC) - include(grpc) - add_subdirectory(grpc) -endif() - -# Optionally, you can build with http -if(HTTP) - include(boost) - add_subdirectory(http) -endif() - -# Build all bins -add_subdirectory(bin) - -# Unit Test -if(BUILD_TESTING) - include(gtest) - add_subdirectory(test) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/README.md b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/README.md deleted file mode 100644 index 
f9aa1a3d3267a5de21ba255be7c7658070c4e67b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# ONNX backend on WeNet - -* Step 1. Export your experiment model to ONNX by https://github.com/wenet-e2e/wenet/blob/main/wenet/bin/export_onnx_cpu.py - -``` sh -exp=exp # Change it to your experiment dir -onnx_dir=onnx -python -m wenet.bin.export_onnx_cpu \ - --config $exp/train.yaml \ - --checkpoint $exp/final.pt \ - --chunk_size 16 \ - --output_dir $onnx_dir \ - --num_decoding_left_chunks -1 - -# When it finishes, you can find `encoder.onnx`, `ctc.onnx`, and `decoder.onnx` in the $onnx_dir respectively. -``` - -* Step 2. Build. The build requires cmake 3.14 or above. - -``` sh -mkdir build && cd build -cmake -DONNX=ON -DTORCH=OFF -DWEBSOCKET=OFF -DGRPC=OFF .. -cmake --build . -``` - -* Step 3. Testing, the RTF(real time factor) is shown in the console. - -``` sh -export GLOG_logtostderr=1 -export GLOG_v=2 -wav_path=your_test_wav_path -onnx_dir=your_model_dir -units=units.txt # Change it to your model units path -./build/bin/decoder_main \ - --chunk_size 16 \ - --wav_path $wav_path \ - --onnx_dir $onnx_dir \ - --unit_path $units 2>&1 | tee log.txt -``` diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/api/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/api/CMakeLists.txt deleted file mode 100644 index 8d61ca8477f0f0b6128f1effe0a2738494b2620f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/api/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -if(TORCH) - add_library(wenet_api SHARED wenet_api.cc) - target_link_libraries(wenet_api PUBLIC decoder) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/api/README.md b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/api/README.md deleted file mode 100644 index 5eaa13b977eb4836eb930452f4434dc9f2ea4139..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/api/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# WeNet API - -We refer [vosk](https://github.com/alphacep/vosk-api/blob/master/src/vosk_api.h) -for the interface design. - - -We are going to implement the following interfaces: - -- [x] non-streaming recognition -- [] streaming recognition -- [] nbest -- [] contextual biasing word -- [] alignment -- [] language support(post processor) -- [] label check diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/api/wenet_api.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/api/wenet_api.cc deleted file mode 100644 index cb1e0c8552e0126e2db274a29075578fe351a25f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/api/wenet_api.cc +++ /dev/null @@ -1,245 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "api/wenet_api.h" - -#include -#include -#include - -#include "decoder/asr_decoder.h" -#include "decoder/torch_asr_model.h" -#include "post_processor/post_processor.h" -#include "utils/file.h" -#include "utils/json.h" -#include "utils/string.h" - -class Recognizer { - public: - explicit Recognizer(const std::string& model_dir) { - // FeaturePipeline init - feature_config_ = std::make_shared(80, 16000); - feature_pipeline_ = - std::make_shared(*feature_config_); - // Resource init - resource_ = std::make_shared(); - wenet::TorchAsrModel::InitEngineThreads(); - std::string model_path = wenet::JoinPath(model_dir, "final.zip"); - CHECK(wenet::FileExists(model_path)); - - auto model = std::make_shared(); - model->Read(model_path); - resource_->model = model; - - // units.txt: E2E model unit - std::string unit_path = wenet::JoinPath(model_dir, "units.txt"); - CHECK(wenet::FileExists(unit_path)); - resource_->unit_table = std::shared_ptr( - fst::SymbolTable::ReadText(unit_path)); - - std::string fst_path = wenet::JoinPath(model_dir, "TLG.fst"); - if (wenet::FileExists(fst_path)) { // With LM - resource_->fst = std::shared_ptr>( - fst::Fst::Read(fst_path)); - - std::string symbol_path = wenet::JoinPath(model_dir, "words.txt"); - CHECK(wenet::FileExists(symbol_path)); - resource_->symbol_table = std::shared_ptr( - fst::SymbolTable::ReadText(symbol_path)); - } else { // Without LM, symbol_table is the same as unit_table - resource_->symbol_table = resource_->unit_table; - } - - // Context config init - context_config_ = std::make_shared(); - decode_options_ = std::make_shared(); - post_process_opts_ = std::make_shared(); - } - - void Reset() { - if (feature_pipeline_ != nullptr) { - feature_pipeline_->Reset(); - } - if (decoder_ != nullptr) { - decoder_->Reset(); - } - result_.clear(); - } - - void InitDecoder() { - CHECK(decoder_ == nullptr); - // Optional init context graph - if (context_.size() > 0) { - context_config_->context_score = context_score_; - auto context_graph = - std::make_shared(*context_config_); - context_graph->BuildContextGraph(context_, resource_->symbol_table); - resource_->context_graph = context_graph; - } - // PostProcessor - if (language_ == "chs") { // TODO(Binbin Zhang): CJK(chs, jp, kr) - post_process_opts_->language_type = wenet::kMandarinEnglish; - } else { - post_process_opts_->language_type = wenet::kIndoEuropean; - } - resource_->post_processor = - std::make_shared(*post_process_opts_); - // Init decoder - decoder_ = std::make_shared(feature_pipeline_, resource_, - *decode_options_); - } - - void Decode(const char* data, int len, int last) { - using wenet::DecodeState; - // Init decoder when it is called first time - if (decoder_ == nullptr) { - InitDecoder(); - } - // Convert to 16 bits PCM data to float - CHECK_EQ(len % 2, 0); - feature_pipeline_->AcceptWaveform(reinterpret_cast(data), - len / 2); - if (last > 0) { - feature_pipeline_->set_input_finished(); - } - - while (true) { - DecodeState state = decoder_->Decode(false); - if (state == DecodeState::kWaitFeats) { - break; - } else if (state == DecodeState::kEndFeats) { - 
decoder_->Rescoring(); - UpdateResult(true); - break; - } else if (state == DecodeState::kEndpoint && continuous_decoding_) { - decoder_->Rescoring(); - UpdateResult(true); - decoder_->ResetContinuousDecoding(); - } else { // kEndBatch - UpdateResult(false); - } - } - } - - void UpdateResult(bool final_result) { - json::JSON obj; - obj["type"] = final_result ? "final_result" : "partial_result"; - int nbest = final_result ? nbest_ : 1; - obj["nbest"] = json::Array(); - for (int i = 0; i < nbest && i < decoder_->result().size(); i++) { - json::JSON one; - one["sentence"] = decoder_->result()[i].sentence; - if (final_result && enable_timestamp_) { - one["word_pieces"] = json::Array(); - for (const auto& word_piece : decoder_->result()[i].word_pieces) { - json::JSON piece; - piece["word"] = word_piece.word; - piece["start"] = word_piece.start; - piece["end"] = word_piece.end; - one["word_pieces"].append(piece); - } - } - one["sentence"] = decoder_->result()[i].sentence; - obj["nbest"].append(one); - } - result_ = obj.dump(); - } - - const char* GetResult() { return result_.c_str(); } - - void set_nbest(int n) { nbest_ = n; } - void set_enable_timestamp(bool flag) { enable_timestamp_ = flag; } - void AddContext(const char* word) { context_.emplace_back(word); } - void set_context_score(float score) { context_score_ = score; } - void set_language(const char* lang) { language_ = lang; } - void set_continuous_decoding(bool flag) { continuous_decoding_ = flag; } - - private: - // NOTE(Binbin Zhang): All use shared_ptr for clone in the future - std::shared_ptr feature_config_ = nullptr; - std::shared_ptr feature_pipeline_ = nullptr; - std::shared_ptr resource_ = nullptr; - std::shared_ptr decode_options_ = nullptr; - std::shared_ptr decoder_ = nullptr; - std::shared_ptr context_config_ = nullptr; - std::shared_ptr post_process_opts_ = nullptr; - - int nbest_ = 1; - std::string result_; - bool enable_timestamp_ = false; - std::vector context_; - float context_score_; - std::string language_ = "chs"; - bool continuous_decoding_ = false; -}; - -void* wenet_init(const char* model_dir) { - Recognizer* decoder = new Recognizer(model_dir); - return reinterpret_cast(decoder); -} - -void wenet_free(void* decoder) { - delete reinterpret_cast(decoder); -} - -void wenet_reset(void* decoder) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->Reset(); -} - -void wenet_decode(void* decoder, const char* data, int len, int last) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->Decode(data, len, last); -} - -const char* wenet_get_result(void* decoder) { - Recognizer* recognizer = reinterpret_cast(decoder); - return recognizer->GetResult(); -} - -void wenet_set_log_level(int level) { - FLAGS_logtostderr = true; - FLAGS_v = level; -} - -void wenet_set_nbest(void* decoder, int n) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->set_nbest(n); -} - -void wenet_set_timestamp(void* decoder, int flag) { - Recognizer* recognizer = reinterpret_cast(decoder); - bool enable = flag > 0 ? 
true : false; - recognizer->set_enable_timestamp(enable); -} - -void wenet_add_context(void* decoder, const char* word) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->AddContext(word); -} - -void wenet_set_context_score(void* decoder, float score) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->set_context_score(score); -} - -void wenet_set_language(void* decoder, const char* lang) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->set_language(lang); -} - -void wenet_set_continuous_decoding(void* decoder, int flag) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->set_continuous_decoding(flag > 0); -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/api/wenet_api.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/api/wenet_api.h deleted file mode 100644 index e839aaa40166a6e50d9aa2ac0e697356bd25b941..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/api/wenet_api.h +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef API_WENET_API_H_ -#define API_WENET_API_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -/** Init decoder from the file and returns the object - * - * @param model_dir: the model dir - * @returns model object or NULL if problem occured - */ -void* wenet_init(const char* model_dir); - -/** Free wenet decoder and corresponding resource - */ -void wenet_free(void* decoder); - -/** Reset decoder for next decoding - */ -void wenet_reset(void* decoder); - -/** Decode the input wav data - * @param data: pcm data, encoded as int16_t(16 bits) - * @param len: data length - * @param last: if it is the last package - */ -void wenet_decode(void* decoder, const char* data, int len, int last); - -/** Get decode result in json format - * It returns partial result when last is 0 - * It returns final result when last is 1 - - { - "nbest" : [{ - "sentence" : "are you okay" - "word_pieces" : [{ - "end" : 960, - "start" : 0, - "word" : "are" - }, { - "end" : 1200, - "start" : 960, - "word" : "you" - }, { - ...}] - }, { - "sentence" : "are you ok" - }], - "type" : "final_result" - } - - "type": final_result/partial_result - "nbest": nbest is enabled when n > 1 in final_result - "sentence": the ASR result - "word_pieces": optional, output timestamp when enabled - */ -const char* wenet_get_result(void* decoder); - -/** Set n-best, range 1~10 - * wenet_get_result will return top-n best results - */ -void wenet_set_nbest(void* decoder, int n); - -/** Whether to enable word level timestamp in results - disable it when flag = 0, otherwise enable - */ -void wenet_set_timestamp(void* decoder, int flag); - -/** Add one contextual biasing - */ -void wenet_add_context(void* decoder, const char* word); - -/** Set contextual biasing bonus score - */ -void wenet_set_context_score(void* decoder, float 
score); - -/** Set language, has effect on the postpocessing - * @param: lang, could be chs/en now - */ -void wenet_set_language(void* decoder, const char* lang); - -/** Set log level - * We use glog in wenet, so the level is the glog level - */ -void wenet_set_log_level(int level); - -/** Enable continous decoding or not - * flag > 0: enable, otherwise disable - */ -void wenet_set_continuous_decoding(void* decoder, int flag); - -#ifdef __cplusplus -} -#endif - -#endif // API_WENET_API_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/CMakeLists.txt deleted file mode 100644 index a117b8bcb580c8738a7ce72f88bc10ff0a450e98..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -add_executable(decoder_main decoder_main.cc) -target_link_libraries(decoder_main PUBLIC decoder) - -add_executable(label_checker_main label_checker_main.cc) -target_link_libraries(label_checker_main PUBLIC decoder) - -# if(TORCH) -# add_executable(api_main api_main.cc) -# target_link_libraries(api_main PUBLIC wenet_api) -# endif() - -if(WEBSOCKET) - add_executable(websocket_client_main websocket_client_main.cc) - target_link_libraries(websocket_client_main PUBLIC websocket) - add_executable(websocket_server_main websocket_server_main.cc) - target_link_libraries(websocket_server_main PUBLIC websocket) -endif() - -if(GRPC) - add_executable(grpc_server_main grpc_server_main.cc) - target_link_libraries(grpc_server_main PUBLIC wenet_grpc) - add_executable(grpc_client_main grpc_client_main.cc) - target_link_libraries(grpc_client_main PUBLIC wenet_grpc) -endif() - -if(HTTP) - add_executable(http_client_main http_client_main.cc) - target_link_libraries(http_client_main PUBLIC http) - add_executable(http_server_main http_server_main.cc) - target_link_libraries(http_server_main PUBLIC http) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/api_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/api_main.cc deleted file mode 100644 index 94b20d52a7b8eee5c39a12af4e1e25324d7d880f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/api_main.cc +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "api/wenet_api.h" -#include "frontend/wav.h" -#include "utils/flags.h" - -DEFINE_string(model_dir, "", "model dir path"); -DEFINE_string(wav_path, "", "single wave path"); -DEFINE_bool(enable_timestamp, false, "enable timestamps"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - wenet_set_log_level(2); - - void* decoder = wenet_init(FLAGS_model_dir.c_str()); - wenet_set_timestamp(decoder, FLAGS_enable_timestamp == true ? 1 : 0); - wenet::WavReader wav_reader(FLAGS_wav_path); - std::vector data(wav_reader.num_samples()); - for (int i = 0; i < wav_reader.num_samples(); i++) { - data[i] = static_cast(*(wav_reader.data() + i)); - } - - for (int i = 0; i < 10; i++) { - // Return the final result when last is 1 - wenet_decode(decoder, reinterpret_cast(data.data()), - data.size() * 2, 1); - const char* result = wenet_get_result(decoder); - LOG(INFO) << i << " " << result; - wenet_reset(decoder); - } - wenet_free(decoder); - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/decoder_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/decoder_main.cc deleted file mode 100644 index b8f1dbae6b88390504cc9ce63f33dc9bd54a2d6a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/decoder_main.cc +++ /dev/null @@ -1,185 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include - -#include "decoder/params.h" -#include "frontend/wav.h" -#include "utils/flags.h" -#include "utils/string.h" -#include "utils/thread_pool.h" -#include "utils/timer.h" -#include "utils/utils.h" - -DEFINE_bool(simulate_streaming, false, "simulate streaming input"); -DEFINE_bool(output_nbest, false, "output n-best of decode result"); -DEFINE_string(wav_path, "", "single wave path"); -DEFINE_string(wav_scp, "", "input wav scp"); -DEFINE_string(result, "", "result output file"); -DEFINE_bool(continuous_decoding, false, "continuous decoding mode"); -DEFINE_int32(thread_num, 1, "num of decode thread"); -DEFINE_int32(warmup, 0, "num of warmup decode, 0 means no warmup"); - -std::shared_ptr g_decode_config; -std::shared_ptr g_feature_config; -std::shared_ptr g_decode_resource; - -std::ofstream g_result; -std::mutex g_mutex; -int g_total_waves_dur = 0; -int g_total_decode_time = 0; - -void decode(std::pair wav, bool warmup = false) { - wenet::WavReader wav_reader(wav.second); - int num_samples = wav_reader.num_samples(); - CHECK_EQ(wav_reader.sample_rate(), FLAGS_sample_rate); - - auto feature_pipeline = - std::make_shared(*g_feature_config); - feature_pipeline->AcceptWaveform(wav_reader.data(), num_samples); - feature_pipeline->set_input_finished(); - LOG(INFO) << "num frames " << feature_pipeline->num_frames(); - - wenet::AsrDecoder decoder(feature_pipeline, g_decode_resource, - *g_decode_config); - - int wave_dur = static_cast(static_cast(num_samples) / - wav_reader.sample_rate() * 1000); - int decode_time = 0; - std::string final_result; - while (true) { - wenet::Timer timer; - wenet::DecodeState state = decoder.Decode(); - if (state == wenet::DecodeState::kEndFeats) { - decoder.Rescoring(); - } - int chunk_decode_time = timer.Elapsed(); - decode_time += chunk_decode_time; - if (decoder.DecodedSomething()) { - LOG(INFO) << "Partial result: " << decoder.result()[0].sentence; - } - - if (FLAGS_continuous_decoding && state == wenet::DecodeState::kEndpoint) { - if (decoder.DecodedSomething()) { - decoder.Rescoring(); - LOG(INFO) << "Final result (continuous decoding): " - << decoder.result()[0].sentence; - final_result.append(decoder.result()[0].sentence); - } - decoder.ResetContinuousDecoding(); - } - - if (state == wenet::DecodeState::kEndFeats) { - break; - } else if (FLAGS_chunk_size > 0 && FLAGS_simulate_streaming) { - float frame_shift_in_ms = - static_cast(g_feature_config->frame_shift) / - wav_reader.sample_rate() * 1000; - auto wait_time = - decoder.num_frames_in_current_chunk() * frame_shift_in_ms - - chunk_decode_time; - if (wait_time > 0) { - LOG(INFO) << "Simulate streaming, waiting for " << wait_time << "ms"; - std::this_thread::sleep_for( - std::chrono::milliseconds(static_cast(wait_time))); - } - } - } - if (decoder.DecodedSomething()) { - final_result.append(decoder.result()[0].sentence); - } - LOG(INFO) << wav.first << " Final result: " << final_result << std::endl; - LOG(INFO) << "Decoded " << wave_dur << "ms audio taken " << decode_time - << "ms."; - - if (!warmup) { - g_mutex.lock(); - std::ostream& buffer = FLAGS_result.empty() ? 
std::cout : g_result; - if (!FLAGS_output_nbest) { - buffer << wav.first << " " << final_result << std::endl; - } else { - buffer << "wav " << wav.first << std::endl; - auto& results = decoder.result(); - for (auto& r : results) { - if (r.sentence.empty()) continue; - buffer << "candidate " << r.score << " " << r.sentence << std::endl; - } - } - g_total_waves_dur += wave_dur; - g_total_decode_time += decode_time; - g_mutex.unlock(); - } -} - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - g_decode_config = wenet::InitDecodeOptionsFromFlags(); - g_feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - g_decode_resource = wenet::InitDecodeResourceFromFlags(); - - if (FLAGS_wav_path.empty() && FLAGS_wav_scp.empty()) { - LOG(FATAL) << "Please provide the wave path or the wav scp."; - } - std::vector> waves; - if (!FLAGS_wav_path.empty()) { - waves.emplace_back(make_pair("test", FLAGS_wav_path)); - } else { - std::ifstream wav_scp(FLAGS_wav_scp); - std::string line; - while (getline(wav_scp, line)) { - std::vector strs; - wenet::SplitString(line, &strs); - CHECK_GE(strs.size(), 2); - waves.emplace_back(make_pair(strs[0], strs[1])); - } - - if (waves.empty()) { - LOG(FATAL) << "Please provide non-empty wav scp."; - } - } - - if (!FLAGS_result.empty()) { - g_result.open(FLAGS_result, std::ios::out); - } - - // Warmup - if (FLAGS_warmup > 0) { - LOG(INFO) << "Warming up..."; - { - ThreadPool pool(FLAGS_thread_num); - auto wav = waves[0]; - for (int i = 0; i < FLAGS_warmup; i++) { - pool.enqueue(decode, wav, true); - } - } - LOG(INFO) << "Warmup done."; - } - - { - ThreadPool pool(FLAGS_thread_num); - for (auto& wav : waves) { - pool.enqueue(decode, wav, false); - } - } - - LOG(INFO) << "Total: decoded " << g_total_waves_dur << "ms audio taken " - << g_total_decode_time << "ms."; - LOG(INFO) << "RTF: " << std::setprecision(4) - << static_cast(g_total_decode_time) / g_total_waves_dur; - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/grpc_client_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/grpc_client_main.cc deleted file mode 100644 index f2d226d48d3757c5f095335eff3288f5d227282b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/grpc_client_main.cc +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "frontend/wav.h" -#include "grpc/grpc_client.h" -#include "utils/flags.h" -#include "utils/timer.h" - -DEFINE_string(hostname, "127.0.0.1", "hostname of websocket server"); -DEFINE_int32(port, 10086, "port of websocket server"); -DEFINE_int32(nbest, 1, "n-best of decode result"); -DEFINE_string(wav_path, "", "test wav file path"); -DEFINE_bool(continuous_decoding, false, "continuous decoding mode"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - wenet::GrpcClient client(FLAGS_hostname, FLAGS_port, FLAGS_nbest, - FLAGS_continuous_decoding); - - wenet::WavReader wav_reader(FLAGS_wav_path); - const int sample_rate = 16000; - // Only support 16K - CHECK_EQ(wav_reader.sample_rate(), sample_rate); - const int num_samples = wav_reader.num_samples(); - std::vector pcm_data(wav_reader.data(), - wav_reader.data() + num_samples); - // Send data every 0.5 second - const float interval = 0.5; - const int sample_interval = interval * sample_rate; - for (int start = 0; start < num_samples; start += sample_interval) { - if (client.done()) { - break; - } - int end = std::min(start + sample_interval, num_samples); - // Convert to short - std::vector data; - data.reserve(end - start); - for (int j = start; j < end; j++) { - data.push_back(static_cast(pcm_data[j])); - } - // Send PCM data - client.SendBinaryData(data.data(), data.size() * sizeof(int16_t)); - VLOG(2) << "Send " << data.size() << " samples"; - std::this_thread::sleep_for( - std::chrono::milliseconds(static_cast(interval * 1000))); - } - wenet::Timer timer; - - client.Join(); - VLOG(2) << "Total latency: " << timer.Elapsed() << "ms."; - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/grpc_server_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/grpc_server_main.cc deleted file mode 100644 index b00f3cbade1ee70dadfb49829e9ca73fd50c2be2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/grpc_server_main.cc +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include - -#include "decoder/params.h" -#include "grpc/grpc_server.h" -#include "utils/log.h" - -DEFINE_int32(port, 10086, "grpc listening port"); -DEFINE_int32(workers, 4, "grpc num workers"); - -using grpc::Server; -using grpc::ServerBuilder; - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - auto decode_config = wenet::InitDecodeOptionsFromFlags(); - auto feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - auto decode_resource = wenet::InitDecodeResourceFromFlags(); - - wenet::GrpcServer service(feature_config, decode_config, decode_resource); - grpc::EnableDefaultHealthCheckService(true); - grpc::reflection::InitProtoReflectionServerBuilderPlugin(); - ServerBuilder builder; - std::string address("0.0.0.0:" + std::to_string(FLAGS_port)); - builder.AddListeningPort(address, grpc::InsecureServerCredentials()); - builder.RegisterService(&service); - builder.SetSyncServerOption(ServerBuilder::SyncServerOption::NUM_CQS, - FLAGS_workers); - std::unique_ptr server(builder.BuildAndStart()); - LOG(INFO) << "Listening at port " << FLAGS_port; - server->Wait(); - google::ShutdownGoogleLogging(); - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/http_client_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/http_client_main.cc deleted file mode 100644 index b59ee3f5f32bf08552416b183802029ac5d5afa5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/http_client_main.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2023 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "frontend/wav.h" -#include "utils/flags.h" -#include "utils/timer.h" -#include "http/http_client.h" - -DEFINE_string(hostname, "127.0.0.1", "hostname of http server"); -DEFINE_int32(port, 10086, "port of http server"); -DEFINE_int32(nbest, 1, "n-best of decode result"); -DEFINE_string(wav_path, "", "test wav file path"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - wenet::WavReader wav_reader(FLAGS_wav_path); - const int sample_rate = 16000; - // Only support 16K - CHECK_EQ(wav_reader.sample_rate(), sample_rate); - const int num_samples = wav_reader.num_samples(); - // Convert to short - std::vector data; - data.reserve(num_samples); - for (int j = 0; j < num_samples; j++) { - data.push_back(static_cast(wav_reader.data()[j])); - } - // Send data - wenet::HttpClient client(FLAGS_hostname, FLAGS_port); - client.set_nbest(FLAGS_nbest); - wenet::Timer timer; - VLOG(2) << "Send " << data.size() << " samples"; - client.SendBinaryData(data.data(), data.size() * sizeof(int16_t)); - VLOG(2) << "Total latency: " << timer.Elapsed() << "ms."; - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/http_server_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/http_server_main.cc deleted file mode 100644 index e30cf2bcdf746c2072f023e90f470ccba5467c2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/http_server_main.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2023 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "decoder/params.h" -#include "utils/log.h" -#include "http/http_server.h" - -DEFINE_int32(port, 10086, "http listening port"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - auto decode_config = wenet::InitDecodeOptionsFromFlags(); - auto feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - auto decode_resource = wenet::InitDecodeResourceFromFlags(); - - wenet::HttpServer server(FLAGS_port, feature_config, decode_config, - decode_resource); - LOG(INFO) << "Listening at port " << FLAGS_port; - server.Start(); - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/label_checker_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/label_checker_main.cc deleted file mode 100644 index e36e3d5c29a38a7ebee80606ebd8e69ae8b1eb96..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/label_checker_main.cc +++ /dev/null @@ -1,237 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include -#include -#include -#include - -#include "decoder/params.h" -#include "frontend/wav.h" -#include "utils/flags.h" -#include "utils/string.h" - -DEFINE_string(text, "", "kaldi style text input file"); -DEFINE_string(wav_scp, "", "kaldi style wav scp"); -DEFINE_double(is_penalty, 1.0, - "insertion/substitution penalty for align insertion"); -DEFINE_double(del_penalty, 1.0, "deletion penalty for align insertion"); -DEFINE_string(result, "", "result output file"); -DEFINE_string(timestamp, "", "timestamp output file"); - -namespace wenet { - -const char* kDeletion = ""; -// Is: Insertion and substitution -const char* kIsStart = ""; -const char* kIsEnd = ""; - -bool MapToLabel(const std::string& text, - std::shared_ptr symbol_table, - std::vector* labels) { - labels->clear(); - // Split label to char sequence - std::vector chars; - SplitUTF8StringToChars(text, &chars); - for (size_t i = 0; i < chars.size(); i++) { - // ▁ is special symbol for white space - std::string label = chars[i] != " " ? chars[i] : "▁"; - int id = symbol_table->Find(label); - if (id != -1) { // fst::kNoSymbol - // LOG(INFO) << label << " " << id; - labels->push_back(id); - } - } - return true; -} - -std::shared_ptr MakeSymbolTableForFst( - std::shared_ptr isymbol_table) { - LOG(INFO) << isymbol_table; - CHECK(isymbol_table != nullptr); - auto osymbol_table = std::make_shared(); - osymbol_table->AddSymbol("", 0); - CHECK_EQ(isymbol_table->Find(""), 0); - osymbol_table->AddSymbol("", 1); - for (int i = 1; i < isymbol_table->NumSymbols(); i++) { - std::string symbol = isymbol_table->Find(i); - osymbol_table->AddSymbol(symbol, i + 1); - } - osymbol_table->AddSymbol(kDeletion, isymbol_table->NumSymbols() + 1); - osymbol_table->AddSymbol(kIsStart, isymbol_table->NumSymbols() + 2); - osymbol_table->AddSymbol(kIsEnd, isymbol_table->NumSymbols() + 3); - return osymbol_table; -} - -void CompileCtcFst(std::shared_ptr symbol_table, - fst::StdVectorFst* ofst) { - ofst->DeleteStates(); - int start = ofst->AddState(); - ofst->SetStart(start); - CHECK_EQ(symbol_table->Find(""), 0); - CHECK_EQ(symbol_table->Find(""), 1); - ofst->AddArc(start, fst::StdArc(1, 0, 0.0, start)); - // Exclude kDeletion and kInsertion - for (int i = 2; i < symbol_table->NumSymbols() - 3; i++) { - int s = ofst->AddState(); - ofst->AddArc(start, fst::StdArc(i, i, 0.0, s)); - ofst->AddArc(s, fst::StdArc(i, 0, 0.0, s)); - ofst->AddArc(s, fst::StdArc(0, 0, 0.0, start)); - } - ofst->SetFinal(start, fst::StdArc::Weight::One()); - fst::ArcSort(ofst, fst::StdOLabelCompare()); -} - -void CompileAlignFst(std::vector labels, - std::shared_ptr symbol_table, - fst::StdVectorFst* ofst) { - ofst->DeleteStates(); - int deletion = symbol_table->Find(kDeletion); - int insertion_start = symbol_table->Find(kIsStart); - int insertion_end = symbol_table->Find(kIsEnd); - - int start = ofst->AddState(); - ofst->SetStart(start); - // Filler State - int filler_start = ofst->AddState(); - int filler_end = ofst->AddState(); - for (int i = 2; i < symbol_table->NumSymbols() - 3; i++) { - ofst->AddArc(filler_start, fst::StdArc(i, i, FLAGS_is_penalty, 
filler_end)); - } - ofst->AddArc(filler_end, fst::StdArc(0, 0, 0.0, filler_start)); - - int prev = start; - // Alignment path and optional filler - for (size_t i = 0; i < labels.size(); i++) { - int cur = ofst->AddState(); - // 1. Insertion or Substitution - ofst->AddArc(prev, fst::StdArc(0, insertion_start, 0.0, filler_start)); - ofst->AddArc(filler_end, fst::StdArc(0, insertion_end, 0.0, prev)); - // 2. Correct - ofst->AddArc(prev, fst::StdArc(labels[i], labels[i], 0.0, cur)); - // 3. Deletion - ofst->AddArc(prev, fst::StdArc(0, deletion, FLAGS_del_penalty, cur)); - - prev = cur; - } - // Optional add endding filler - ofst->AddArc(prev, fst::StdArc(0, insertion_start, 0.0, filler_start)); - ofst->AddArc(filler_end, fst::StdArc(0, insertion_end, 0.0, prev)); - ofst->SetFinal(prev, fst::StdArc::Weight::One()); - fst::ArcSort(ofst, fst::StdILabelCompare()); -} - -} // namespace wenet - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - auto decode_config = wenet::InitDecodeOptionsFromFlags(); - auto feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - auto decode_resource = wenet::InitDecodeResourceFromFlags(); - CHECK(decode_resource->unit_table != nullptr); - - auto wfst_symbol_table = - wenet::MakeSymbolTableForFst(decode_resource->unit_table); - // wfst_symbol_table->WriteText("fst.txt"); - // Reset symbol_table to on-the-fly generated wfst_symbol_table - decode_resource->symbol_table = wfst_symbol_table; - - // Compile ctc FST - fst::StdVectorFst ctc_fst; - wenet::CompileCtcFst(wfst_symbol_table, &ctc_fst); - // ctc_fst.Write("ctc.fst"); - - std::unordered_map wav_table; - std::ifstream wav_is(FLAGS_wav_scp); - std::string line; - while (std::getline(wav_is, line)) { - std::vector strs; - wenet::SplitString(line, &strs); - CHECK_EQ(strs.size(), 2); - wav_table[strs[0]] = strs[1]; - } - - std::ifstream text_is(FLAGS_text); - std::ofstream result_os(FLAGS_result, std::ios::out); - std::ofstream timestamp_out; - if (!FLAGS_timestamp.empty()) { - timestamp_out.open(FLAGS_timestamp, std::ios::out); - } - std::ostream& timestamp_os = - FLAGS_timestamp.empty() ? 
std::cout : timestamp_out; - - while (std::getline(text_is, line)) { - std::vector strs; - wenet::SplitString(line, &strs); - if (strs.size() < 2) continue; - std::string key = strs[0]; - LOG(INFO) << "Processing " << key; - if (wav_table.find(key) != wav_table.end()) { - strs.erase(strs.begin()); - std::string text = wenet::JoinString(" ", strs); - std::vector labels; - wenet::MapToLabel(text, wfst_symbol_table, &labels); - // Prepare FST for alignment decoding - fst::StdVectorFst align_fst; - wenet::CompileAlignFst(labels, wfst_symbol_table, &align_fst); - // align_fst.Write("align.fst"); - auto decoding_fst = std::make_shared(); - fst::Compose(ctc_fst, align_fst, decoding_fst.get()); - // decoding_fst->Write("decoding.fst"); - // Preapre feature pipeline - wenet::WavReader wav_reader; - if (!wav_reader.Open(wav_table[key])) { - LOG(WARNING) << "Error in reading " << wav_table[key]; - continue; - } - int num_samples = wav_reader.num_samples(); - CHECK_EQ(wav_reader.sample_rate(), FLAGS_sample_rate); - auto feature_pipeline = - std::make_shared(*feature_config); - feature_pipeline->AcceptWaveform(wav_reader.data(), num_samples); - feature_pipeline->set_input_finished(); - decode_resource->fst = decoding_fst; - LOG(INFO) << "num frames " << feature_pipeline->num_frames(); - wenet::AsrDecoder decoder(feature_pipeline, decode_resource, - *decode_config); - while (true) { - wenet::DecodeState state = decoder.Decode(); - if (state == wenet::DecodeState::kEndFeats) { - decoder.Rescoring(); - break; - } - } - std::string final_result; - std::string timestamp_str; - if (decoder.DecodedSomething()) { - const wenet::DecodeResult& result = decoder.result()[0]; - final_result = result.sentence; - std::stringstream ss; - for (const auto& w : result.word_pieces) { - ss << " " << w.word << " " << w.start << " " << w.end; - } - timestamp_str = ss.str(); - } - result_os << key << " " << final_result << std::endl; - timestamp_os << key << " " << timestamp_str << std::endl; - LOG(INFO) << key << " " << final_result; - } else { - LOG(WARNING) << "No wav file for " << key; - } - } - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/websocket_client_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/websocket_client_main.cc deleted file mode 100644 index 3eaa96069dc5f57673fbb2819bf7d4883e0d5ffa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/websocket_client_main.cc +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
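The label checker above composes a CTC topology FST with a per-utterance alignment FST in which matching a reference unit is free, deleting one costs FLAGS_del_penalty, and absorbing extra or substituted units through the filler states costs FLAGS_is_penalty each. A toy cost comparison under the default penalties (both 1.0), with invented unit counts, makes the trade-off concrete; in the tropical semiring the best path simply minimises the sum of arc weights.

```cpp
// Toy path-cost comparison for the alignment FST built by the deleted
// label_checker_main.cc. The unit counts are invented; only the penalty
// semantics mirror the deleted code.
#include <iostream>

int main() {
  const double is_penalty = 1.0;   // insertion/substitution filler arc weight
  const double del_penalty = 1.0;  // deletion arc weight

  const double exact_match_cost = 0.0;                           // all "correct" arcs
  const double one_del_one_ins_cost = del_penalty + is_penalty;  // 2.0

  std::cout << "exact match path cost: " << exact_match_cost << "\n"
            << "1 deletion + 1 insertion path cost: " << one_del_one_ins_cost
            << "\n";
  return 0;
}
```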
- -#include "frontend/wav.h" -#include "utils/flags.h" -#include "utils/timer.h" -#include "websocket/websocket_client.h" - -DEFINE_string(hostname, "127.0.0.1", "hostname of websocket server"); -DEFINE_int32(port, 10086, "port of websocket server"); -DEFINE_int32(nbest, 1, "n-best of decode result"); -DEFINE_string(wav_path, "", "test wav file path"); -DEFINE_bool(continuous_decoding, false, "continuous decoding mode"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - wenet::WebSocketClient client(FLAGS_hostname, FLAGS_port); - client.set_nbest(FLAGS_nbest); - client.set_continuous_decoding(FLAGS_continuous_decoding); - client.SendStartSignal(); - - wenet::WavReader wav_reader(FLAGS_wav_path); - const int sample_rate = 16000; - // Only support 16K - CHECK_EQ(wav_reader.sample_rate(), sample_rate); - const int num_samples = wav_reader.num_samples(); - // Send data every 0.5 second - const float interval = 0.5; - const int sample_interval = interval * sample_rate; - for (int start = 0; start < num_samples; start += sample_interval) { - if (client.done()) { - break; - } - int end = std::min(start + sample_interval, num_samples); - // Convert to short - std::vector data; - data.reserve(end - start); - for (int j = start; j < end; j++) { - data.push_back(static_cast(wav_reader.data()[j])); - } - // TODO(Binbin Zhang): Network order? - // Send PCM data - client.SendBinaryData(data.data(), data.size() * sizeof(int16_t)); - VLOG(2) << "Send " << data.size() << " samples"; - std::this_thread::sleep_for( - std::chrono::milliseconds(static_cast(interval * 1000))); - } - wenet::Timer timer; - client.SendEndSignal(); - client.Join(); - VLOG(2) << "Total latency: " << timer.Elapsed() << "ms."; - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/websocket_server_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/websocket_server_main.cc deleted file mode 100644 index 796d9d2e6d151f7c08b43d66b7245c58ee086cc2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/bin/websocket_server_main.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "decoder/params.h" -#include "utils/log.h" -#include "websocket/websocket_server.h" - -DEFINE_int32(port, 10086, "websocket listening port"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - auto decode_config = wenet::InitDecodeOptionsFromFlags(); - auto feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - auto decode_resource = wenet::InitDecodeResourceFromFlags(); - - wenet::WebSocketServer server(FLAGS_port, feature_config, decode_config, - decode_resource); - LOG(INFO) << "Listening at port " << FLAGS_port; - server.Start(); - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/boost.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/boost.cmake deleted file mode 100644 index 8684c0ec43960da213da923dc57416f04301ea2b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/boost.cmake +++ /dev/null @@ -1,10 +0,0 @@ -FetchContent_Declare(boost - URL https://boostorg.jfrog.io/artifactory/main/release/1.75.0/source/boost_1_75_0.tar.gz - URL_HASH SHA256=aeb26f80e80945e82ee93e5939baebdca47b9dee80a07d3144be1e1a6a66dd6a -) -FetchContent_MakeAvailable(boost) -include_directories(${boost_SOURCE_DIR}) - -if(MSVC) - add_definitions(-DBOOST_ALL_DYN_LINK -DBOOST_ALL_NO_LIB) -endif() \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/bpu.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/bpu.cmake deleted file mode 100644 index 350d76c19d6f656fb130de09877d649cf49972a4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/bpu.cmake +++ /dev/null @@ -1,30 +0,0 @@ -if(BPU) - if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") - set(EASY_DNN_URL "https://github.com/xingchensong/toolchain_pkg/releases/download/easy_dnn/easy_dnn.0.4.11.tar.gz") - set(URL_HASH "SHA256=a1a6f77d1baae7181d75ec5d37a2ee529ac4e1c4400babd6ceb1c007392a4904") - else() - message(FATAL_ERROR "Unsupported CMake System Processor '${CMAKE_SYSTEM_PROCESSOR}' (expected 'aarch64')") - endif() - else() - message(FATAL_ERROR "Unsupported CMake System Name '${CMAKE_SYSTEM_NAME}' (expected 'Linux')") - endif() - - FetchContent_Declare(easy_dnn - URL ${EASY_DNN_URL} - URL_HASH ${URL_HASH} - ) - FetchContent_MakeAvailable(easy_dnn) - include_directories(${easy_dnn_SOURCE_DIR}/easy_dnn/0.4.11_linux_aarch64-j3_hobot_gcc6.5.0/files/easy_dnn/include) - include_directories(${easy_dnn_SOURCE_DIR}/dnn/1.7.0_linux_aarch64-j3_hobot_gcc6.5.0/files/dnn/include) - include_directories(${easy_dnn_SOURCE_DIR}/hlog/0.4.7_linux_aarch64-j3_hobot_gcc6.5.0/files/hlog/include) - link_directories(${easy_dnn_SOURCE_DIR}/easy_dnn/0.4.11_linux_aarch64-j3_hobot_gcc6.5.0/files/easy_dnn/lib) - link_directories(${easy_dnn_SOURCE_DIR}/dnn/1.7.0_linux_aarch64-j3_hobot_gcc6.5.0/files/dnn/lib) - link_directories(${easy_dnn_SOURCE_DIR}/hlog/0.4.7_linux_aarch64-j3_hobot_gcc6.5.0/files/hlog/lib) - - add_definitions(-DUSE_BPU) - # NOTE(xcsong): Reasons for adding flag `-fuse-ld=gold`: - # https://stackoverflow.com/questions/59915966/unknown-gcc-linker-error-but-builds-sucessfully/59916438#59916438 - # https://github.com/tensorflow/tensorflow/issues/47849 - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} 
-fuse-ld=gold") -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/gflags.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/gflags.cmake deleted file mode 100644 index 53ae5763b5a8c860b7e64d35b380eee5429f539d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/gflags.cmake +++ /dev/null @@ -1,6 +0,0 @@ -FetchContent_Declare(gflags - URL https://github.com/gflags/gflags/archive/v2.2.2.zip - URL_HASH SHA256=19713a36c9f32b33df59d1c79b4958434cb005b5b47dc5400a7a4b078111d9b5 -) -FetchContent_MakeAvailable(gflags) -include_directories(${gflags_BINARY_DIR}/include) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/glog.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/glog.cmake deleted file mode 100644 index 447ab4132f669ee2c3a52c37959dd684a39ff21b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/glog.cmake +++ /dev/null @@ -1,6 +0,0 @@ -FetchContent_Declare(glog - URL https://github.com/google/glog/archive/v0.4.0.zip - URL_HASH SHA256=9e1b54eb2782f53cd8af107ecf08d2ab64b8d0dc2b7f5594472f3bd63ca85cdc -) -FetchContent_MakeAvailable(glog) -include_directories(${glog_SOURCE_DIR}/src ${glog_BINARY_DIR}) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/grpc.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/grpc.cmake deleted file mode 100644 index 644093a4bf8191f3a45b0df0a72c000981c48f58..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/grpc.cmake +++ /dev/null @@ -1,9 +0,0 @@ -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/grpc) -# third_party: grpc -# On how to build grpc, you may refer to https://github.com/grpc/grpc -# We recommend manually recursive clone the repo to avoid internet connection problem -FetchContent_Declare(gRPC - GIT_REPOSITORY https://github.com/grpc/grpc - GIT_TAG v1.37.1 -) -FetchContent_MakeAvailable(gRPC) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/gtest.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/gtest.cmake deleted file mode 100644 index 30dc7c1a31d8b83991841a4dc33f61ed078b532a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/gtest.cmake +++ /dev/null @@ -1,8 +0,0 @@ -FetchContent_Declare(googletest - URL https://github.com/google/googletest/archive/release-1.11.0.zip - URL_HASH SHA256=353571c2440176ded91c2de6d6cd88ddd41401d14692ec1f99e35d013feda55a -) -if(MSVC) - set(gtest_force_shared_crt ON CACHE BOOL "Always use msvcrt.dll" FORCE) -endif() -FetchContent_MakeAvailable(googletest) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/libtorch.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/libtorch.cmake deleted file mode 100644 index 3cd9245b2da52f8be206d27164de5f411bff171b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/libtorch.cmake +++ /dev/null @@ -1,79 +0,0 @@ 
-if(TORCH) - add_definitions(-DUSE_TORCH) - if(NOT ANDROID) - if(GPU) - if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - message(FATAL_ERROR "GPU is supported only Linux, you can use CPU version") - else() - add_definitions(-DUSE_GPU) - endif() - endif() - - if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") - if(${CMAKE_BUILD_TYPE} MATCHES "Release") - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-win-shared-with-deps-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=bece54d36377990257e9d028c687c5b6759c5cfec0a0153da83cf6f0f71f648f") - else() - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-win-shared-with-deps-debug-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=3cc7ba3c3865d86f03d78c2f0878fdbed8b764359476397a5c95cf3bba0d665a") - endif() - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - if(CXX11_ABI) - if(NOT GPU) - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=d52f63577a07adb0bfd6d77c90f7da21896e94f71eb7dcd55ed7835ccb3b2b59") - else() - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu113/libtorch-cxx11-abi-shared-with-deps-1.12.0%2Bcu113.zip") - set(URL_HASH "SHA256=80f089939de20e68e3fcad4dfa72a26c8bf91b5e77b11042f671f39ebac35865") - endif() - else() - if(NOT GPU) - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=bee1b7be308792aa60fc95a4f5274d9658cb7248002d0e333d49eb81ec88430c") - else() - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.11.0%2Bcu113.zip") - set(URL_HASH "SHA256=90159ecce3ff451f3ef3f657493b6c7c96759c3b74bbd70c1695f2ea2f81e1ad") - endif() - endif() - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-macos-1.13.0.zip") - set(URL_HASH "SHA256=a8f80050b95489b4e002547910410c2c230e9f590ffab2482e19e809afe4f7aa") - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "iOS") - add_definitions(-DIOS) - else() - message(FATAL_ERROR "Unsupported System '${CMAKE_SYSTEM_NAME}' (expected 'Windows', 'Linux', 'Darwin' or 'iOS')") - endif() - - # iOS use LibTorch from pod install - if(NOT IOS) - FetchContent_Declare(libtorch - URL ${LIBTORCH_URL} - URL_HASH ${URL_HASH} - ) - FetchContent_MakeAvailable(libtorch) - find_package(Torch REQUIRED PATHS ${libtorch_SOURCE_DIR} NO_DEFAULT_PATH) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS} -DC10_USE_GLOG") - endif() - - if(MSVC) - file(GLOB TORCH_DLLS "${TORCH_INSTALL_PREFIX}/lib/*.dll") - file(COPY ${TORCH_DLLS} DESTINATION ${CMAKE_BINARY_DIR}) - endif() - else() - # Change version in runtime/android/app/build.gradle. 
- file(GLOB PYTORCH_INCLUDE_DIRS "${build_DIR}/pytorch_android*.aar/headers") - file(GLOB PYTORCH_LINK_DIRS "${build_DIR}/pytorch_android*.aar/jni/${ANDROID_ABI}") - find_library(PYTORCH_LIBRARY pytorch_jni - PATHS ${PYTORCH_LINK_DIRS} - NO_CMAKE_FIND_ROOT_PATH - ) - find_library(FBJNI_LIBRARY fbjni - PATHS ${PYTORCH_LINK_DIRS} - NO_CMAKE_FIND_ROOT_PATH - ) - include_directories( - ${PYTORCH_INCLUDE_DIRS} - ${PYTORCH_INCLUDE_DIRS}/torch/csrc/api/include - ) - endif() -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/onnx.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/onnx.cmake deleted file mode 100644 index bd55402cb2a6024620fa6ff8b5c413207041adfa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/onnx.cmake +++ /dev/null @@ -1,35 +0,0 @@ -if(ONNX) - set(ONNX_VERSION "1.12.0") - if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-win-x64-${ONNX_VERSION}.zip") - set(URL_HASH "SHA256=8b5d61204989350b7904ac277f5fbccd3e6736ddbb6ec001e412723d71c9c176") - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-linux-aarch64-${ONNX_VERSION}.tgz") - set(URL_HASH "SHA256=5820d9f343df73c63b6b2b174a1ff62575032e171c9564bcf92060f46827d0ac") - else() - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-linux-x64-${ONNX_VERSION}.tgz") - set(URL_HASH "SHA256=5d503ce8540358b59be26c675e42081be14a3e833a5301926f555451046929c5") - endif() - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-osx-x86_64-${ONNX_VERSION}.tgz") - set(URL_HASH "SHA256=09b17f712f8c6f19bb63da35d508815b443cbb473e16c6192abfaa297c02f600") - else() - message(FATAL_ERROR "Unsupported CMake System Name '${CMAKE_SYSTEM_NAME}' (expected 'Windows', 'Linux' or 'Darwin')") - endif() - - FetchContent_Declare(onnxruntime - URL ${ONNX_URL} - URL_HASH ${URL_HASH} - ) - FetchContent_MakeAvailable(onnxruntime) - include_directories(${onnxruntime_SOURCE_DIR}/include) - link_directories(${onnxruntime_SOURCE_DIR}/lib) - - if(MSVC) - file(GLOB ONNX_DLLS "${onnxruntime_SOURCE_DIR}/lib/*.dll") - file(COPY ${ONNX_DLLS} DESTINATION ${CMAKE_BINARY_DIR}/bin/${CMAKE_BUILD_TYPE}) - endif() - - add_definitions(-DUSE_ONNX) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/openfst.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/openfst.cmake deleted file mode 100644 index 490a3da6b571ec228114167fb9c0d9e9b4043bd2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/openfst.cmake +++ /dev/null @@ -1,45 +0,0 @@ -if(NOT ANDROID) - include(gflags) - # We can't build glog with gflags, unless gflags is pre-installed. - # If build glog with pre-installed gflags, there will be conflict. 
- set(WITH_GFLAGS OFF CACHE BOOL "whether build glog with gflags" FORCE) - include(glog) - - if(NOT GRAPH_TOOLS) - set(HAVE_BIN OFF CACHE BOOL "Build the fst binaries" FORCE) - set(HAVE_SCRIPT OFF CACHE BOOL "Build the fstscript" FORCE) - endif() - set(HAVE_COMPACT OFF CACHE BOOL "Build compact" FORCE) - set(HAVE_CONST OFF CACHE BOOL "Build const" FORCE) - set(HAVE_GRM OFF CACHE BOOL "Build grm" FORCE) - set(HAVE_FAR OFF CACHE BOOL "Build far" FORCE) - set(HAVE_PDT OFF CACHE BOOL "Build pdt" FORCE) - set(HAVE_MPDT OFF CACHE BOOL "Build mpdt" FORCE) - set(HAVE_LINEAR OFF CACHE BOOL "Build linear" FORCE) - set(HAVE_LOOKAHEAD OFF CACHE BOOL "Build lookahead" FORCE) - set(HAVE_NGRAM OFF CACHE BOOL "Build ngram" FORCE) - set(HAVE_SPECIAL OFF CACHE BOOL "Build special" FORCE) - - if(MSVC) - add_compile_options(/W0 /wd4244 /wd4267) - endif() - - # "OpenFST port for Windows" builds openfst with cmake for multiple platforms. - # Openfst is compiled with glog/gflags to avoid log and flag conflicts with log and flags in wenet/libtorch. - # To build openfst with gflags and glog, we comment out some vars of {flags, log}.h and flags.cc. - set(openfst_SOURCE_DIR ${fc_base}/openfst-src CACHE PATH "OpenFST source directory") - FetchContent_Declare(openfst - URL https://github.com/kkm000/openfst/archive/refs/tags/win/1.6.5.1.tar.gz - URL_HASH SHA256=02c49b559c3976a536876063369efc0e41ab374be1035918036474343877046e - PATCH_COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/patch/openfst ${openfst_SOURCE_DIR} - ) - FetchContent_MakeAvailable(openfst) - add_dependencies(fst gflags glog) - target_link_libraries(fst PUBLIC gflags_nothreads_static glog) - include_directories(${openfst_SOURCE_DIR}/src/include) -else() - set(openfst_BINARY_DIR ${build_DIR}/wenet-openfst-android-1.0.2.aar/jni) - include_directories(${openfst_BINARY_DIR}/include) - link_directories(${openfst_BINARY_DIR}/${ANDROID_ABI}) - link_libraries(log gflags_nothreads glog fst) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/pybind11.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/pybind11.cmake deleted file mode 100644 index 6bdae202c1c4d94228e5f92dab051c118dba7d3b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/pybind11.cmake +++ /dev/null @@ -1,7 +0,0 @@ -FetchContent_Declare(pybind11 - URL https://github.com/pybind/pybind11/archive/refs/tags/v2.9.2.zip - URL_HASH SHA256=d1646e6f70d8a3acb2ddd85ce1ed543b5dd579c68b8fb8e9638282af20edead8 -) -FetchContent_MakeAvailable(pybind11) - -add_subdirectory(${pybind11_SOURCE_DIR}) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/xpu.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/xpu.cmake deleted file mode 100644 index 38418671b0237550cd01d4d95e8743067e113e56..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/cmake/xpu.cmake +++ /dev/null @@ -1,37 +0,0 @@ -if(NOT WIN32) - string(ASCII 27 Esc) - set(ColourReset "${Esc}[m") - set(ColourBold "${Esc}[1m") - set(Red "${Esc}[31m") - set(Green "${Esc}[32m") - set(Yellow "${Esc}[33m") - set(Blue "${Esc}[34m") - set(Magenta "${Esc}[35m") - set(Cyan "${Esc}[36m") - set(White "${Esc}[37m") - set(BoldRed "${Esc}[1;31m") - set(BoldGreen "${Esc}[1;32m") - set(BoldYellow "${Esc}[1;33m") - set(BoldBlue 
"${Esc}[1;34m") - set(BoldMagenta "${Esc}[1;35m") - set(BoldCyan "${Esc}[1;36m") - set(BoldWhite "${Esc}[1;37m") -endif() - -if(XPU) - set(RUNTIME_KUNLUN_PATH ${CMAKE_CURRENT_SOURCE_DIR}) - message(STATUS "RUNTIME_KUNLUN_PATH is ${RUNTIME_KUNLUN_PATH} .\n") - set(KUNLUN_XPU_PATH ${RUNTIME_KUNLUN_PATH}/xpu) - if(NOT DEFINED ENV{XPU_API_PATH}) - message(FATAL_ERROR "${BoldRed}NO ENV{XPU_API_PATH} in your env. Please set XPU_API_PATH.${ColourReset}\n") - else() - set(XPU_API_PATH $ENV{XPU_API_PATH}) - message("set XPU_API_PATH from env_var. Val is $ENV{XPU_API_PATH}.") - endif() - - include_directories(${RUNTIME_KUNLUN_PATH} ${KUNLUN_XPU_PATH}/ - ${XPU_API_PATH}/output/include ${XPU_API_PATH}/../runtime/include) - link_directories(${XPU_API_PATH}/output/so/ ${XPU_API_PATH}/../runtime/output/so/) - - add_definitions(-DUSE_XPU) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/CMakeLists.txt deleted file mode 100644 index fe03efb288eb1c7ae3d05e896e95855e5865472f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/CMakeLists.txt +++ /dev/null @@ -1,39 +0,0 @@ -set(decoder_srcs - asr_decoder.cc - asr_model.cc - context_graph.cc - ctc_prefix_beam_search.cc - ctc_wfst_beam_search.cc - ctc_endpoint.cc -) - -if(NOT TORCH AND NOT ONNX AND NOT XPU AND NOT IOS AND NOT BPU) - message(FATAL_ERROR "Please build with TORCH or ONNX or XPU or IOS or BPU!!!") -endif() -if(TORCH OR IOS) - list(APPEND decoder_srcs torch_asr_model.cc) -endif() -if(ONNX) - list(APPEND decoder_srcs onnx_asr_model.cc) -endif() - -add_library(decoder STATIC ${decoder_srcs}) -target_link_libraries(decoder PUBLIC kaldi-decoder frontend - post_processor utils) - -if(ANDROID) - target_link_libraries(decoder PUBLIC ${PYTORCH_LIBRARY} ${FBJNI_LIBRARY}) -else() - if(TORCH) - target_link_libraries(decoder PUBLIC ${TORCH_LIBRARIES}) - endif() - if(ONNX) - target_link_libraries(decoder PUBLIC onnxruntime) - endif() - if(BPU) - target_link_libraries(decoder PUBLIC bpu_asr_model) - endif() - if(XPU) - target_link_libraries(decoder PUBLIC xpu_conformer) - endif() -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/asr_decoder.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/asr_decoder.cc deleted file mode 100644 index 34de7550ea287b37d2cb707e148f5d6853b3d804..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/asr_decoder.cc +++ /dev/null @@ -1,231 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include "decoder/asr_decoder.h" - -#include - -#include -#include -#include - -#include "utils/timer.h" - -namespace wenet { - -AsrDecoder::AsrDecoder(std::shared_ptr feature_pipeline, - std::shared_ptr resource, - const DecodeOptions& opts) - : feature_pipeline_(std::move(feature_pipeline)), - // Make a copy of the model ASR model since we will change the inner - // status of the model - model_(resource->model->Copy()), - post_processor_(resource->post_processor), - symbol_table_(resource->symbol_table), - fst_(resource->fst), - unit_table_(resource->unit_table), - opts_(opts), - ctc_endpointer_(new CtcEndpoint(opts.ctc_endpoint_config)) { - if (opts_.reverse_weight > 0) { - // Check if model has a right to left decoder - CHECK(model_->is_bidirectional_decoder()); - } - if (nullptr == fst_) { - searcher_.reset(new CtcPrefixBeamSearch(opts.ctc_prefix_search_opts, - resource->context_graph)); - } else { - searcher_.reset(new CtcWfstBeamSearch(*fst_, opts.ctc_wfst_search_opts, - resource->context_graph)); - } - ctc_endpointer_->frame_shift_in_ms(frame_shift_in_ms()); -} - -void AsrDecoder::Reset() { - start_ = false; - result_.clear(); - num_frames_ = 0; - global_frame_offset_ = 0; - model_->Reset(); - searcher_->Reset(); - feature_pipeline_->Reset(); - ctc_endpointer_->Reset(); -} - -void AsrDecoder::ResetContinuousDecoding() { - global_frame_offset_ = num_frames_; - start_ = false; - result_.clear(); - model_->Reset(); - searcher_->Reset(); - ctc_endpointer_->Reset(); -} - -DecodeState AsrDecoder::Decode(bool block) { - return this->AdvanceDecoding(block); -} - -void AsrDecoder::Rescoring() { - // Do attention rescoring - Timer timer; - AttentionRescoring(); - VLOG(2) << "Rescoring cost latency: " << timer.Elapsed() << "ms."; -} - -DecodeState AsrDecoder::AdvanceDecoding(bool block) { - DecodeState state = DecodeState::kEndBatch; - model_->set_chunk_size(opts_.chunk_size); - model_->set_num_left_chunks(opts_.num_left_chunks); - int num_required_frames = model_->num_frames_for_chunk(start_); - std::vector> chunk_feats; - // Return immediately if we do not want to block - if (!block && !feature_pipeline_->input_finished() && - feature_pipeline_->NumQueuedFrames() < num_required_frames) { - return DecodeState::kWaitFeats; - } - // If not okay, that means we reach the end of the input - if (!feature_pipeline_->Read(num_required_frames, &chunk_feats)) { - state = DecodeState::kEndFeats; - } - - num_frames_ += chunk_feats.size(); - VLOG(2) << "Required " << num_required_frames << " get " - << chunk_feats.size(); - Timer timer; - std::vector> ctc_log_probs; - model_->ForwardEncoder(chunk_feats, &ctc_log_probs); - int forward_time = timer.Elapsed(); - if (opts_.ctc_wfst_search_opts.blank_scale != 1.0) { - for (int i = 0; i < ctc_log_probs.size(); i++) { - ctc_log_probs[i][0] = ctc_log_probs[i][0] - + std::log(opts_.ctc_wfst_search_opts.blank_scale); - } - } - timer.Reset(); - searcher_->Search(ctc_log_probs); - int search_time = timer.Elapsed(); - VLOG(3) << "forward takes " << forward_time << " ms, search takes " - << search_time << " ms"; - UpdateResult(); - - if (state != DecodeState::kEndFeats) { - if (ctc_endpointer_->IsEndpoint(ctc_log_probs, DecodedSomething())) { - VLOG(1) << "Endpoint is detected at " << num_frames_; - state = DecodeState::kEndpoint; - } - } - - start_ = true; - return state; -} - -void AsrDecoder::UpdateResult(bool finish) { - const auto& hypotheses = searcher_->Outputs(); - const auto& inputs = searcher_->Inputs(); - const auto& likelihood = 
searcher_->Likelihood(); - const auto& times = searcher_->Times(); - result_.clear(); - - CHECK_EQ(hypotheses.size(), likelihood.size()); - for (size_t i = 0; i < hypotheses.size(); i++) { - const std::vector& hypothesis = hypotheses[i]; - - DecodeResult path; - path.score = likelihood[i]; - int offset = global_frame_offset_ * feature_frame_shift_in_ms(); - for (size_t j = 0; j < hypothesis.size(); j++) { - std::string word = symbol_table_->Find(hypothesis[j]); - // A detailed explanation of this if-else branch can be found in - // https://github.com/wenet-e2e/wenet/issues/583#issuecomment-907994058 - if (searcher_->Type() == kWfstBeamSearch) { - path.sentence += (' ' + word); - } else { - path.sentence += (word); - } - } - - // TimeStamp is only supported in final result - // TimeStamp of the output of CtcWfstBeamSearch may be inaccurate due to - // various FST operations when building the decoding graph. So here we use - // time stamp of the input(e2e model unit), which is more accurate, and it - // requires the symbol table of the e2e model used in training. - if (unit_table_ != nullptr && finish) { - const std::vector& input = inputs[i]; - const std::vector& time_stamp = times[i]; - CHECK_EQ(input.size(), time_stamp.size()); - for (size_t j = 0; j < input.size(); j++) { - std::string word = unit_table_->Find(input[j]); - int start = time_stamp[j] * frame_shift_in_ms() - time_stamp_gap_ > 0 - ? time_stamp[j] * frame_shift_in_ms() - time_stamp_gap_ - : 0; - if (j > 0) { - start = (time_stamp[j] - time_stamp[j - 1]) * frame_shift_in_ms() < - time_stamp_gap_ - ? (time_stamp[j - 1] + time_stamp[j]) / 2 * - frame_shift_in_ms() - : start; - } - int end = time_stamp[j] * frame_shift_in_ms(); - if (j < input.size() - 1) { - end = (time_stamp[j + 1] - time_stamp[j]) * frame_shift_in_ms() < - time_stamp_gap_ - ? 
(time_stamp[j + 1] + time_stamp[j]) / 2 * - frame_shift_in_ms() - : end; - } - WordPiece word_piece(word, offset + start, offset + end); - path.word_pieces.emplace_back(word_piece); - } - } - - if (post_processor_ != nullptr) { - path.sentence = post_processor_->Process(path.sentence, finish); - } - result_.emplace_back(path); - } - - if (DecodedSomething()) { - VLOG(1) << "Partial CTC result " << result_[0].sentence; - } -} - -void AsrDecoder::AttentionRescoring() { - searcher_->FinalizeSearch(); - UpdateResult(true); - // No need to do rescoring - if (0.0 == opts_.rescoring_weight) { - return; - } - // Inputs() returns N-best input ids, which is the basic unit for rescoring - // In CtcPrefixBeamSearch, inputs are the same to outputs - const auto& hypotheses = searcher_->Inputs(); - int num_hyps = hypotheses.size(); - if (num_hyps <= 0) { - return; - } - - std::vector rescoring_score; - model_->AttentionRescoring(hypotheses, opts_.reverse_weight, - &rescoring_score); - - // Combine ctc score and rescoring score - for (size_t i = 0; i < num_hyps; ++i) { - result_[i].score = opts_.rescoring_weight * rescoring_score[i] + - opts_.ctc_weight * result_[i].score; - } - std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/asr_decoder.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/asr_decoder.h deleted file mode 100644 index df71f5b7bad7b2ffdc69bbd7ab11f576bed464d2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/asr_decoder.h +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_ASR_DECODER_H_ -#define DECODER_ASR_DECODER_H_ - -#include -#include -#include -#include - -#include "fst/fstlib.h" -#include "fst/symbol-table.h" - -#include "decoder/asr_model.h" -#include "decoder/context_graph.h" -#include "decoder/ctc_endpoint.h" -#include "decoder/ctc_prefix_beam_search.h" -#include "decoder/ctc_wfst_beam_search.h" -#include "decoder/search_interface.h" -#include "frontend/feature_pipeline.h" -#include "post_processor/post_processor.h" -#include "utils/utils.h" - -namespace wenet { - -struct DecodeOptions { - // chunk_size is the frame number of one chunk after subsampling. - // e.g. if subsample rate is 4 and chunk_size = 16, the frames in - // one chunk are 64 = 16*4 - int chunk_size = 16; - int num_left_chunks = -1; - - // final_score = rescoring_weight * rescoring_score + ctc_weight * ctc_score; - // rescoring_score = left_to_right_score * (1 - reverse_weight) + - // right_to_left_score * reverse_weight - // Please note the concept of ctc_scores in the following two search - // methods are different. 
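These weights combine as final_score = rescoring_weight * rescoring_score + ctc_weight * ctc_score, with rescoring_score itself a mix of the left-to-right and right-to-left attention scores, as applied in AttentionRescoring above. A small numeric sketch with invented scores (only the formula and the default weights come from the deleted decoder code) shows the effect:

```cpp
// Illustrative recomputation of the final hypothesis score. All score values
// are invented for the example; only the formula matches the deleted
// asr_decoder code.
#include <iostream>

int main() {
  const float ctc_weight = 0.5f;        // defaults from DecodeOptions
  const float rescoring_weight = 1.0f;
  const float reverse_weight = 0.0f;    // > 0 only for bidirectional decoders

  const float ctc_score = -12.4f;            // e.g. CTC search score
  const float left_to_right_score = -8.1f;   // attention decoder, L2R
  const float right_to_left_score = -8.9f;   // attention decoder, R2L

  const float rescoring_score =
      left_to_right_score * (1.0f - reverse_weight) +
      right_to_left_score * reverse_weight;
  const float final_score =
      rescoring_weight * rescoring_score + ctc_weight * ctc_score;

  std::cout << "rescoring_score = " << rescoring_score
            << ", final_score = " << final_score << "\n";
  return 0;
}
```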
- // For CtcPrefixBeamSearch, it's a sum(prefix) score + context score - // For CtcWfstBeamSearch, it's a max(viterbi) path score + context score - // So we should carefully set ctc_weight according to the search methods. - float ctc_weight = 0.5; - float rescoring_weight = 1.0; - float reverse_weight = 0.0; - CtcEndpointConfig ctc_endpoint_config; - CtcPrefixBeamSearchOptions ctc_prefix_search_opts; - CtcWfstBeamSearchOptions ctc_wfst_search_opts; -}; - -struct WordPiece { - std::string word; - int start = -1; - int end = -1; - - WordPiece(std::string word, int start, int end) - : word(std::move(word)), start(start), end(end) {} -}; - -struct DecodeResult { - float score = -kFloatMax; - std::string sentence; - std::vector word_pieces; - - static bool CompareFunc(const DecodeResult& a, const DecodeResult& b) { - return a.score > b.score; - } -}; - -enum DecodeState { - kEndBatch = 0x00, // End of current decoding batch, normal case - kEndpoint = 0x01, // Endpoint is detected - kEndFeats = 0x02, // All feature is decoded - kWaitFeats = 0x03 // Feat is not enough for one chunk inference, wait -}; - -// DecodeResource is thread safe, which can be shared for multiple -// decoding threads -struct DecodeResource { - std::shared_ptr model = nullptr; - std::shared_ptr symbol_table = nullptr; - std::shared_ptr> fst = nullptr; - std::shared_ptr unit_table = nullptr; - std::shared_ptr context_graph = nullptr; - std::shared_ptr post_processor = nullptr; -}; - -// Torch ASR decoder -class AsrDecoder { - public: - AsrDecoder(std::shared_ptr feature_pipeline, - std::shared_ptr resource, - const DecodeOptions& opts); - // @param block: if true, block when feature is not enough for one chunk - // inference. Otherwise, return kWaitFeats. - DecodeState Decode(bool block = true); - void Rescoring(); - void Reset(); - void ResetContinuousDecoding(); - bool DecodedSomething() const { - return !result_.empty() && !result_[0].sentence.empty(); - } - - // This method is used for time benchmark - int num_frames_in_current_chunk() const { - return num_frames_in_current_chunk_; - } - int frame_shift_in_ms() const { - return model_->subsampling_rate() * - feature_pipeline_->config().frame_shift * 1000 / - feature_pipeline_->config().sample_rate; - } - int feature_frame_shift_in_ms() const { - return feature_pipeline_->config().frame_shift * 1000 / - feature_pipeline_->config().sample_rate; - } - const std::vector& result() const { return result_; } - - private: - DecodeState AdvanceDecoding(bool block = true); - void AttentionRescoring(); - - void UpdateResult(bool finish = false); - - std::shared_ptr feature_pipeline_; - std::shared_ptr model_; - std::shared_ptr post_processor_; - - std::shared_ptr> fst_ = nullptr; - // output symbol table - std::shared_ptr symbol_table_; - // e2e unit symbol table - std::shared_ptr unit_table_ = nullptr; - const DecodeOptions& opts_; - // cache feature - bool start_ = false; - // For continuous decoding - int num_frames_ = 0; - int global_frame_offset_ = 0; - const int time_stamp_gap_ = 100; // timestamp gap between words in a sentence - - std::unique_ptr searcher_; - std::unique_ptr ctc_endpointer_; - - int num_frames_in_current_chunk_ = 0; - std::vector result_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(AsrDecoder); -}; - -} // namespace wenet - -#endif // DECODER_ASR_DECODER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/asr_model.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/asr_model.cc 
deleted file mode 100644 index 8c7b0fb1195cf07bac6c3ff1bb8cb0e187e977da..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/asr_model.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2022 Horizon Robotics. All Rights Reserved. -// Author: binbin.zhang@horizon.ai (Binbin Zhang) - -#include "decoder/asr_model.h" - -#include -#include - -namespace wenet { - -int AsrModel::num_frames_for_chunk(bool start) const { - int num_required_frames = 0; - if (chunk_size_ > 0) { - if (!start) { // First batch - int context = right_context_ + 1; // Add current frame - num_required_frames = (chunk_size_ - 1) * subsampling_rate_ + context; - } else { - num_required_frames = chunk_size_ * subsampling_rate_; - } - } else { - num_required_frames = std::numeric_limits::max(); - } - return num_required_frames; -} - -void AsrModel::CacheFeature( - const std::vector>& chunk_feats) { - // Cache feature for next chunk - const int cached_feature_size = 1 + right_context_ - subsampling_rate_; - if (chunk_feats.size() >= cached_feature_size) { - // TODO(Binbin Zhang): Only deal the case when - // chunk_feats.size() > cached_feature_size here, and it's consistent - // with our current model, refine it later if we have new model or - // new requirements - cached_feature_.resize(cached_feature_size); - for (int i = 0; i < cached_feature_size; ++i) { - cached_feature_[i] = - chunk_feats[chunk_feats.size() - cached_feature_size + i]; - } - } -} - -void AsrModel::ForwardEncoder( - const std::vector>& chunk_feats, - std::vector>* ctc_prob) { - ctc_prob->clear(); - int num_frames = cached_feature_.size() + chunk_feats.size(); - if (num_frames >= right_context_ + 1) { - this->ForwardEncoderFunc(chunk_feats, ctc_prob); - this->CacheFeature(chunk_feats); - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/asr_model.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/asr_model.h deleted file mode 100644 index d100dd818551014fa4769c1766bc3b1b626e8453..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/asr_model.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2022 Horizon Robotics. All Rights Reserved. -// Author: binbin.zhang@horizon.ai (Binbin Zhang) - -#ifndef DECODER_ASR_MODEL_H_ -#define DECODER_ASR_MODEL_H_ - -#include -#include -#include -#include - -#include "utils/timer.h" -#include "utils/utils.h" - -namespace wenet { - -class AsrModel { - public: - virtual int right_context() const { return right_context_; } - virtual int subsampling_rate() const { return subsampling_rate_; } - virtual int sos() const { return sos_; } - virtual int eos() const { return eos_; } - virtual bool is_bidirectional_decoder() const { - return is_bidirectional_decoder_; - } - virtual int offset() const { return offset_; } - - // If chunk_size > 0, streaming case. 
Otherwise, none streaming case - virtual void set_chunk_size(int chunk_size) { chunk_size_ = chunk_size; } - virtual void set_num_left_chunks(int num_left_chunks) { - num_left_chunks_ = num_left_chunks; - } - // start: if it is the start chunk of one sentence - virtual int num_frames_for_chunk(bool start) const; - - virtual void Reset() = 0; - - virtual void ForwardEncoder( - const std::vector>& chunk_feats, - std::vector>* ctc_prob); - - virtual void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) = 0; - - virtual std::shared_ptr Copy() const = 0; - - protected: - virtual void ForwardEncoderFunc( - const std::vector>& chunk_feats, - std::vector>* ctc_prob) = 0; - virtual void CacheFeature(const std::vector>& chunk_feats); - - int right_context_ = 1; - int subsampling_rate_ = 1; - int sos_ = 0; - int eos_ = 0; - bool is_bidirectional_decoder_ = false; - int chunk_size_ = 16; - int num_left_chunks_ = -1; // -1 means all left chunks - int offset_ = 0; - - std::vector> cached_feature_; -}; - -} // namespace wenet - -#endif // DECODER_ASR_MODEL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/context_graph.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/context_graph.cc deleted file mode 100644 index adc59c506de2afa7087815887295e4d8735d2a35..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/context_graph.cc +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "decoder/context_graph.h" - -#include - -#include "fst/determinize.h" - -#include "utils/string.h" -#include "utils/utils.h" - -namespace wenet { - -ContextGraph::ContextGraph(ContextConfig config) : config_(config) {} - -void ContextGraph::BuildContextGraph( - const std::vector& query_contexts, - const std::shared_ptr& symbol_table) { - CHECK(symbol_table != nullptr) << "Symbols table should not be nullptr!"; - start_tag_id_ = symbol_table->AddSymbol(""); - end_tag_id_ = symbol_table->AddSymbol(""); - symbol_table_ = symbol_table; - if (query_contexts.empty()) { - if (graph_ != nullptr) graph_.reset(); - return; - } - - std::unique_ptr ofst(new fst::StdVectorFst()); - // State 0 is the start state and the final state. - int start_state = ofst->AddState(); - ofst->SetStart(start_state); - ofst->SetFinal(start_state, fst::StdArc::Weight::One()); - - LOG(INFO) << "Contexts count size: " << query_contexts.size(); - int count = 0; - for (const auto& context : query_contexts) { - if (context.size() > config_.max_context_length) { - LOG(INFO) << "Skip long context: " << context; - continue; - } - if (++count > config_.max_contexts) break; - - std::vector words; - // Split context to words by symbol table, and build the context graph. 
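BuildContextGraph above boosts each hot phrase by weighting every arc with (i * incremental_context_score + context_score) * UTF8StringLength(word), and adds an escape arc that subtracts the boost already granted if the phrase is abandoned midway. The sketch below recomputes those weights for one invented two-unit phrase using the ContextConfig defaults:

```cpp
// Illustrative arc weights for one hot phrase, following the formula in the
// deleted BuildContextGraph. The phrase and character lengths are invented;
// the defaults come from ContextConfig (context_score = 3.0,
// incremental_context_score = 0.0).
#include <iostream>
#include <vector>

int main() {
  const float context_score = 3.0f;
  const float incremental_context_score = 0.0f;
  // Pretend the phrase splits into two units of 2 and 3 characters.
  const std::vector<int> unit_lengths = {2, 3};

  float escape_score = 0.0f;
  for (size_t i = 0; i < unit_lengths.size(); ++i) {
    float score =
        (i * incremental_context_score + context_score) * unit_lengths[i];
    std::cout << "arc " << i << " weight = " << score << "\n";
    if (i > 0) {
      // Escape arc cancels the boost already granted to the partial match.
      std::cout << "  escape arc back to start = " << -escape_score << "\n";
    }
    escape_score += score;
  }
  return 0;
}
```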
- bool no_oov = SplitUTF8StringToWords(Trim(context), symbol_table, &words); - if (!no_oov) { - LOG(WARNING) << "Ignore unknown word found during compilation."; - continue; - } - - int prev_state = start_state; - int next_state = start_state; - float escape_score = 0; - for (size_t i = 0; i < words.size(); ++i) { - int word_id = symbol_table_->Find(words[i]); - float score = (i * config_.incremental_context_score - + config_.context_score) * UTF8StringLength(words[i]); - next_state = (i < words.size() - 1) ? ofst->AddState() : start_state; - ofst->AddArc(prev_state, - fst::StdArc(word_id, word_id, score, next_state)); - // Add escape arc to clean the previous context score. - if (i > 0) { - // ilabel and olabel of the escape arc is 0 (). - ofst->AddArc(prev_state, fst::StdArc(0, 0, -escape_score, start_state)); - } - prev_state = next_state; - escape_score += score; - } - } - std::unique_ptr det_fst(new fst::StdVectorFst()); - fst::Determinize(*ofst, det_fst.get()); - graph_ = std::move(det_fst); -} - -int ContextGraph::GetNextState(int cur_state, int word_id, float* score, - bool* is_start_boundary, bool* is_end_boundary) { - int next_state = 0; - for (fst::ArcIterator aiter(*graph_, cur_state); !aiter.Done(); - aiter.Next()) { - const fst::StdArc& arc = aiter.Value(); - if (arc.ilabel == 0) { - // escape score, will be overwritten when ilabel equals to word id. - *score = arc.weight.Value(); - } else if (arc.ilabel == word_id) { - next_state = arc.nextstate; - *score = arc.weight.Value(); - if (cur_state == 0) { - *is_start_boundary = true; - } - if (graph_->Final(arc.nextstate) == fst::StdArc::Weight::One()) { - *is_end_boundary = true; - } - break; - } - } - return next_state; -} - -bool ContextGraph::SplitUTF8StringToWords( - const std::string& str, - const std::shared_ptr& symbol_table, - std::vector* words) { - std::vector chars; - SplitUTF8StringToChars(Trim(str), &chars); - - bool no_oov = true; - for (size_t start = 0; start < chars.size();) { - for (size_t end = chars.size(); end > start; --end) { - std::string word; - for (size_t i = start; i < end; i++) { - word += chars[i]; - } - // Skip space. - if (word == " ") { - start = end; - continue; - } - // Add '▁' at the beginning of English word. - if (IsAlpha(word)) { - word = kSpaceSymbol + word; - } - - if (symbol_table->Find(word) != -1) { - words->emplace_back(word); - start = end; - continue; - } - if (end == start + 1) { - ++start; - no_oov = false; - LOG(WARNING) << word << " is oov."; - } - } - } - return no_oov; -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/context_graph.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/context_graph.h deleted file mode 100644 index 41b59206987cfe22d421f40506057830b6311f8e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/context_graph.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_CONTEXT_GRAPH_H_ -#define DECODER_CONTEXT_GRAPH_H_ - -#include -#include -#include - -#include "fst/compose.h" -#include "fst/fst.h" -#include "fst/vector-fst.h" - -namespace wenet { - -using StateId = fst::StdArc::StateId; - -struct ContextConfig { - int max_contexts = 5000; - int max_context_length = 100; - float context_score = 3.0; - float incremental_context_score = 0.0; -}; - -class ContextGraph { - public: - explicit ContextGraph(ContextConfig config); - void BuildContextGraph(const std::vector& query_context, - const std::shared_ptr& symbol_table); - int GetNextState(int cur_state, int word_id, float* score, - bool* is_start_boundary, bool* is_end_boundary); - - int start_tag_id() { return start_tag_id_; } - int end_tag_id() { return end_tag_id_; } - - private: - bool SplitUTF8StringToWords( - const std::string& str, - const std::shared_ptr& symbol_table, - std::vector* words); - - int start_tag_id_ = -1; - int end_tag_id_ = -1; - ContextConfig config_; - std::shared_ptr symbol_table_ = nullptr; - std::unique_ptr graph_ = nullptr; - DISALLOW_COPY_AND_ASSIGN(ContextGraph); -}; - -} // namespace wenet - -#endif // DECODER_CONTEXT_GRAPH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/ctc_endpoint.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/ctc_endpoint.cc deleted file mode 100644 index 4a64dd048f32401ab0dca468836cfac8be943d26..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/ctc_endpoint.cc +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "decoder/ctc_endpoint.h" - -#include - -#include -#include - -#include "utils/log.h" - -namespace wenet { - -CtcEndpoint::CtcEndpoint(const CtcEndpointConfig& config) : config_(config) { - Reset(); -} - -void CtcEndpoint::Reset() { - num_frames_decoded_ = 0; - num_frames_trailing_blank_ = 0; -} - -static bool RuleActivated(const CtcEndpointRule& rule, - const std::string& rule_name, bool decoded_sth, - int trailing_silence, int utterance_length) { - bool ans = (decoded_sth || !rule.must_decoded_sth) && - trailing_silence >= rule.min_trailing_silence && - utterance_length >= rule.min_utterance_length; - if (ans) { - VLOG(2) << "Endpointing rule " << rule_name - << " activated: " << (decoded_sth ? 
"true" : "false") << ',' - << trailing_silence << ',' << utterance_length; - } - return ans; -} - -bool CtcEndpoint::IsEndpoint( - const std::vector>& ctc_log_probs, - bool decoded_something) { - for (int t = 0; t < ctc_log_probs.size(); ++t) { - const auto& logp_t = ctc_log_probs[t]; - float blank_prob = expf(logp_t[config_.blank]); - - num_frames_decoded_++; - if (blank_prob > config_.blank_threshold) { - num_frames_trailing_blank_++; - } else { - num_frames_trailing_blank_ = 0; - } - } - CHECK_GE(num_frames_decoded_, num_frames_trailing_blank_); - CHECK_GT(frame_shift_in_ms_, 0); - int utterance_length = num_frames_decoded_ * frame_shift_in_ms_; - int trailing_silence = num_frames_trailing_blank_ * frame_shift_in_ms_; - if (RuleActivated(config_.rule1, "rule1", decoded_something, trailing_silence, - utterance_length)) - return true; - if (RuleActivated(config_.rule2, "rule2", decoded_something, trailing_silence, - utterance_length)) - return true; - if (RuleActivated(config_.rule3, "rule3", decoded_something, trailing_silence, - utterance_length)) - return true; - return false; -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/ctc_endpoint.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/ctc_endpoint.h deleted file mode 100644 index 56d9e08e7d3fab5562028e956f7b1d6ebac7b9e4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/ctc_endpoint.h +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_CTC_ENDPOINT_H_ -#define DECODER_CTC_ENDPOINT_H_ - -#include - -namespace wenet { - -struct CtcEndpointRule { - bool must_decoded_sth; - int min_trailing_silence; - int min_utterance_length; - - CtcEndpointRule(bool must_decoded_sth = true, int min_trailing_silence = 1000, - int min_utterance_length = 0) - : must_decoded_sth(must_decoded_sth), - min_trailing_silence(min_trailing_silence), - min_utterance_length(min_utterance_length) {} -}; - -struct CtcEndpointConfig { - /// We consider blank as silence for purposes of endpointing. - int blank = 0; // blank id - float blank_threshold = 0.8; // blank threshold to be silence - /// We support three rules. We terminate decoding if ANY of these rules - /// evaluates to "true". If you want to add more rules, do it by changing this - /// code. If you want to disable a rule, you can set the silence-timeout for - /// that rule to a very large number. - - /// rule1 times out after 5000 ms of silence, even if we decoded nothing. - CtcEndpointRule rule1; - /// rule2 times out after 1000 ms of silence after decoding something. - CtcEndpointRule rule2; - /// rule3 times out after the utterance is 20000 ms long, regardless of - /// anything else. 
- CtcEndpointRule rule3; - - CtcEndpointConfig() - : rule1(false, 5000, 0), rule2(true, 1000, 0), rule3(false, 0, 20000) {} -}; - -class CtcEndpoint { - public: - explicit CtcEndpoint(const CtcEndpointConfig& config); - - void Reset(); - /// This function returns true if this set of endpointing rules thinks we - /// should terminate decoding. - bool IsEndpoint(const std::vector>& ctc_log_probs, - bool decoded_something); - - void frame_shift_in_ms(int frame_shift_in_ms) { - frame_shift_in_ms_ = frame_shift_in_ms; - } - - private: - CtcEndpointConfig config_; - int frame_shift_in_ms_ = -1; - int num_frames_decoded_ = 0; - int num_frames_trailing_blank_ = 0; -}; - -} // namespace wenet - -#endif // DECODER_CTC_ENDPOINT_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/ctc_prefix_beam_search.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/ctc_prefix_beam_search.cc deleted file mode 100644 index 154c8864ba98255528a33a80a35b18eee8fa5dc9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/ctc_prefix_beam_search.cc +++ /dev/null @@ -1,235 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include "decoder/ctc_prefix_beam_search.h" - -#include -#include -#include -#include - -#include "utils/log.h" -#include "utils/utils.h" - -namespace wenet { - -CtcPrefixBeamSearch::CtcPrefixBeamSearch( - const CtcPrefixBeamSearchOptions& opts, - const std::shared_ptr& context_graph) - : opts_(opts), context_graph_(context_graph) { - Reset(); -} - -void CtcPrefixBeamSearch::Reset() { - hypotheses_.clear(); - likelihood_.clear(); - cur_hyps_.clear(); - viterbi_likelihood_.clear(); - times_.clear(); - outputs_.clear(); - abs_time_step_ = 0; - PrefixScore prefix_score; - prefix_score.s = 0.0; - prefix_score.ns = -kFloatMax; - prefix_score.v_s = 0.0; - prefix_score.v_ns = 0.0; - std::vector empty; - cur_hyps_[empty] = prefix_score; - outputs_.emplace_back(empty); - hypotheses_.emplace_back(empty); - likelihood_.emplace_back(prefix_score.total_score()); - times_.emplace_back(empty); -} - -static bool PrefixScoreCompare( - const std::pair, PrefixScore>& a, - const std::pair, PrefixScore>& b) { - return a.second.total_score() > b.second.total_score(); -} - -void CtcPrefixBeamSearch::UpdateOutputs( - const std::pair, PrefixScore>& prefix) { - const std::vector& input = prefix.first; - const std::vector& start_boundaries = prefix.second.start_boundaries; - const std::vector& end_boundaries = prefix.second.end_boundaries; - - std::vector output; - int s = 0; - int e = 0; - for (int i = 0; i < input.size(); ++i) { - if (s < start_boundaries.size() && i == start_boundaries[s]) { - output.emplace_back(context_graph_->start_tag_id()); - ++s; - } - output.emplace_back(input[i]); - if (e < end_boundaries.size() && i == end_boundaries[e]) { - output.emplace_back(context_graph_->end_tag_id()); - ++e; - } - } - outputs_.emplace_back(output); -} - -void CtcPrefixBeamSearch::UpdateHypotheses( - const std::vector, PrefixScore>>& hpys) { - cur_hyps_.clear(); - outputs_.clear(); - hypotheses_.clear(); - likelihood_.clear(); - viterbi_likelihood_.clear(); - times_.clear(); - for (auto& item : hpys) { - cur_hyps_[item.first] = item.second; - UpdateOutputs(item); - hypotheses_.emplace_back(std::move(item.first)); - likelihood_.emplace_back(item.second.total_score()); - viterbi_likelihood_.emplace_back(item.second.viterbi_score()); - times_.emplace_back(item.second.times()); - } -} - -// Please refer https://robin1001.github.io/2020/12/11/ctc-search -// for how CTC prefix beam search works, and there is a simple graph demo in -// it. -void CtcPrefixBeamSearch::Search(const std::vector>& logp) { - if (logp.size() == 0) return; - int first_beam_size = - std::min(static_cast(logp[0].size()), opts_.first_beam_size); - for (int t = 0; t < logp.size(); ++t, ++abs_time_step_) { - const std::vector& logp_t = logp[t]; - std::unordered_map, PrefixScore, PrefixHash> next_hyps; - // 1. First beam prune, only select topk candidates - std::vector topk_score; - std::vector topk_index; - TopK(logp_t, first_beam_size, &topk_score, &topk_index); - - // 2. Token passing - for (int i = 0; i < topk_index.size(); ++i) { - int id = topk_index[i]; - auto prob = topk_score[i]; - for (const auto& it : cur_hyps_) { - const std::vector& prefix = it.first; - const PrefixScore& prefix_score = it.second; - // If prefix doesn't exist in next_hyps, next_hyps[prefix] will insert - // PrefixScore(-inf, -inf) by default, since the default constructor - // of PrefixScore will set fields s(blank ending score) and - // ns(none blank ending score) to -inf, respectively. 
- if (id == opts_.blank) { - // Case 0: *a + ε => *a - PrefixScore& next_score = next_hyps[prefix]; - next_score.s = LogAdd(next_score.s, prefix_score.score() + prob); - next_score.v_s = prefix_score.viterbi_score() + prob; - next_score.times_s = prefix_score.times(); - // Prefix not changed, copy the context from prefix. - if (context_graph_ && !next_score.has_context) { - next_score.CopyContext(prefix_score); - next_score.has_context = true; - } - } else if (!prefix.empty() && id == prefix.back()) { - // Case 1: *a + a => *a - PrefixScore& next_score1 = next_hyps[prefix]; - next_score1.ns = LogAdd(next_score1.ns, prefix_score.ns + prob); - if (next_score1.v_ns < prefix_score.v_ns + prob) { - next_score1.v_ns = prefix_score.v_ns + prob; - if (next_score1.cur_token_prob < prob) { - next_score1.cur_token_prob = prob; - next_score1.times_ns = prefix_score.times_ns; - CHECK_GT(next_score1.times_ns.size(), 0); - next_score1.times_ns.back() = abs_time_step_; - } - } - if (context_graph_ && !next_score1.has_context) { - next_score1.CopyContext(prefix_score); - next_score1.has_context = true; - } - - // Case 2: *aε + a => *aa - std::vector new_prefix(prefix); - new_prefix.emplace_back(id); - PrefixScore& next_score2 = next_hyps[new_prefix]; - next_score2.ns = LogAdd(next_score2.ns, prefix_score.s + prob); - if (next_score2.v_ns < prefix_score.v_s + prob) { - next_score2.v_ns = prefix_score.v_s + prob; - next_score2.cur_token_prob = prob; - next_score2.times_ns = prefix_score.times_s; - next_score2.times_ns.emplace_back(abs_time_step_); - } - if (context_graph_ && !next_score2.has_context) { - // Prefix changed, calculate the context score. - next_score2.UpdateContext(context_graph_, prefix_score, id, - prefix.size()); - next_score2.has_context = true; - } - } else { - // Case 3: *a + b => *ab, *aε + b => *ab - std::vector new_prefix(prefix); - new_prefix.emplace_back(id); - PrefixScore& next_score = next_hyps[new_prefix]; - next_score.ns = LogAdd(next_score.ns, prefix_score.score() + prob); - if (next_score.v_ns < prefix_score.viterbi_score() + prob) { - next_score.v_ns = prefix_score.viterbi_score() + prob; - next_score.cur_token_prob = prob; - next_score.times_ns = prefix_score.times(); - next_score.times_ns.emplace_back(abs_time_step_); - } - if (context_graph_ && !next_score.has_context) { - // Calculate the context score. - next_score.UpdateContext(context_graph_, prefix_score, id, - prefix.size()); - next_score.has_context = true; - } - } - } - } - - // 3. Second beam prune, only keep top n best paths - std::vector, PrefixScore>> arr(next_hyps.begin(), - next_hyps.end()); - int second_beam_size = - std::min(static_cast(arr.size()), opts_.second_beam_size); - std::nth_element(arr.begin(), arr.begin() + second_beam_size, arr.end(), - PrefixScoreCompare); - arr.resize(second_beam_size); - std::sort(arr.begin(), arr.end(), PrefixScoreCompare); - - // 4. Update cur_hyps_ and get new result - UpdateHypotheses(arr); - } -} - -void CtcPrefixBeamSearch::FinalizeSearch() { UpdateFinalContext(); } - -void CtcPrefixBeamSearch::UpdateFinalContext() { - if (context_graph_ == nullptr) return; - CHECK_EQ(hypotheses_.size(), cur_hyps_.size()); - CHECK_EQ(hypotheses_.size(), likelihood_.size()); - // We should backoff the context score/state when the context is - // not fully matched at the last time. 
- for (const auto& prefix : hypotheses_) { - PrefixScore& prefix_score = cur_hyps_[prefix]; - if (prefix_score.context_state != 0) { - prefix_score.UpdateContext(context_graph_, prefix_score, 0, - prefix.size()); - } - } - std::vector, PrefixScore>> arr(cur_hyps_.begin(), - cur_hyps_.end()); - std::sort(arr.begin(), arr.end(), PrefixScoreCompare); - - // Update cur_hyps_ and get new result - UpdateHypotheses(arr); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/ctc_prefix_beam_search.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/ctc_prefix_beam_search.h deleted file mode 100644 index f44ec23c37af517c9e45140f89ef7346768f5d35..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/ctc_prefix_beam_search.h +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_CTC_PREFIX_BEAM_SEARCH_H_ -#define DECODER_CTC_PREFIX_BEAM_SEARCH_H_ - -#include -#include -#include -#include - -#include "decoder/context_graph.h" -#include "decoder/search_interface.h" -#include "utils/utils.h" - -namespace wenet { - -struct CtcPrefixBeamSearchOptions { - int blank = 0; // blank id - int first_beam_size = 10; - int second_beam_size = 10; -}; - -struct PrefixScore { - float s = -kFloatMax; // blank ending score - float ns = -kFloatMax; // none blank ending score - float v_s = -kFloatMax; // viterbi blank ending score - float v_ns = -kFloatMax; // viterbi none blank ending score - float cur_token_prob = -kFloatMax; // prob of current token - std::vector times_s; // times of viterbi blank path - std::vector times_ns; // times of viterbi none blank path - - float score() const { return LogAdd(s, ns); } - float viterbi_score() const { return v_s > v_ns ? v_s : v_ns; } - const std::vector& times() const { - return v_s > v_ns ? 
times_s : times_ns; - } - - bool has_context = false; - int context_state = 0; - float context_score = 0; - std::vector start_boundaries; - std::vector end_boundaries; - - void CopyContext(const PrefixScore& prefix_score) { - context_state = prefix_score.context_state; - context_score = prefix_score.context_score; - start_boundaries = prefix_score.start_boundaries; - end_boundaries = prefix_score.end_boundaries; - } - - void UpdateContext(const std::shared_ptr& context_graph, - const PrefixScore& prefix_score, int word_id, - int prefix_len) { - this->CopyContext(prefix_score); - - float score = 0; - bool is_start_boundary = false; - bool is_end_boundary = false; - - context_state = - context_graph->GetNextState(prefix_score.context_state, word_id, &score, - &is_start_boundary, &is_end_boundary); - context_score += score; - if (is_start_boundary) start_boundaries.emplace_back(prefix_len); - if (is_end_boundary) end_boundaries.emplace_back(prefix_len); - } - - float total_score() const { return score() + context_score; } -}; - -struct PrefixHash { - size_t operator()(const std::vector& prefix) const { - size_t hash_code = 0; - // here we use KB&DR hash code - for (int id : prefix) { - hash_code = id + 31 * hash_code; - } - return hash_code; - } -}; - -class CtcPrefixBeamSearch : public SearchInterface { - public: - explicit CtcPrefixBeamSearch( - const CtcPrefixBeamSearchOptions& opts, - const std::shared_ptr& context_graph = nullptr); - - void Search(const std::vector>& logp) override; - void Reset() override; - void FinalizeSearch() override; - SearchType Type() const override { return SearchType::kPrefixBeamSearch; } - void UpdateOutputs(const std::pair, PrefixScore>& prefix); - void UpdateHypotheses( - const std::vector, PrefixScore>>& hpys); - void UpdateFinalContext(); - - const std::vector& viterbi_likelihood() const { - return viterbi_likelihood_; - } - const std::vector>& Inputs() const override { - return hypotheses_; - } - const std::vector>& Outputs() const override { - return outputs_; - } - const std::vector& Likelihood() const override { return likelihood_; } - const std::vector>& Times() const override { return times_; } - - private: - int abs_time_step_ = 0; - - // N-best list and corresponding likelihood_, in sorted order - std::vector> hypotheses_; - std::vector likelihood_; - std::vector viterbi_likelihood_; - std::vector> times_; - - std::unordered_map, PrefixScore, PrefixHash> cur_hyps_; - std::shared_ptr context_graph_ = nullptr; - // Outputs contain the hypotheses_ and tags like: and - std::vector> outputs_; - const CtcPrefixBeamSearchOptions& opts_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(CtcPrefixBeamSearch); -}; - -} // namespace wenet - -#endif // DECODER_CTC_PREFIX_BEAM_SEARCH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/ctc_wfst_beam_search.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/ctc_wfst_beam_search.cc deleted file mode 100644 index 10e93f387e87b5f16fb7784d7060c50f227bf58e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/ctc_wfst_beam_search.cc +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "decoder/ctc_wfst_beam_search.h" - -#include - -namespace wenet { - -void DecodableTensorScaled::Reset() { - num_frames_ready_ = 0; - done_ = false; - // Give an empty initialization, will throw error when - // AcceptLoglikes is not called - logp_.clear(); -} - -void DecodableTensorScaled::AcceptLoglikes(const std::vector& logp) { - ++num_frames_ready_; - // TODO(Binbin Zhang): Avoid copy here - logp_ = logp; -} - -float DecodableTensorScaled::LogLikelihood(int32 frame, int32 index) { - CHECK_GT(index, 0); - CHECK_LT(frame, num_frames_ready_); - return scale_ * logp_[index - 1]; -} - -bool DecodableTensorScaled::IsLastFrame(int32 frame) const { - CHECK_LT(frame, num_frames_ready_); - return done_ && (frame == num_frames_ready_ - 1); -} - -int32 DecodableTensorScaled::NumIndices() const { - LOG(FATAL) << "Not implement"; - return 0; -} - -CtcWfstBeamSearch::CtcWfstBeamSearch( - const fst::Fst& fst, const CtcWfstBeamSearchOptions& opts, - const std::shared_ptr& context_graph) - : decodable_(opts.acoustic_scale), - decoder_(fst, opts, context_graph), - context_graph_(context_graph), - opts_(opts) { - Reset(); -} - -void CtcWfstBeamSearch::Reset() { - num_frames_ = 0; - decoded_frames_mapping_.clear(); - is_last_frame_blank_ = false; - last_best_ = 0; - inputs_.clear(); - outputs_.clear(); - likelihood_.clear(); - times_.clear(); - decodable_.Reset(); - decoder_.InitDecoding(); -} - -void CtcWfstBeamSearch::Search(const std::vector>& logp) { - if (0 == logp.size()) { - return; - } - // Every time we get the log posterior, we decode it all before return - for (int i = 0; i < logp.size(); i++) { - float blank_score = std::exp(logp[i][0]); - if (blank_score > opts_.blank_skip_thresh * opts_.blank_scale) { - VLOG(3) << "skipping frame " << num_frames_ << " score " << blank_score; - is_last_frame_blank_ = true; - last_frame_prob_ = logp[i]; - } else { - // Get the best symbol - int cur_best = - std::max_element(logp[i].begin(), logp[i].end()) - logp[i].begin(); - // Optional, adding one blank frame if we has skipped it in two same - // symbols - if (cur_best != 0 && is_last_frame_blank_ && cur_best == last_best_) { - decodable_.AcceptLoglikes(last_frame_prob_); - decoder_.AdvanceDecoding(&decodable_, 1); - decoded_frames_mapping_.push_back(num_frames_ - 1); - VLOG(2) << "Adding blank frame at symbol " << cur_best; - } - last_best_ = cur_best; - - decodable_.AcceptLoglikes(logp[i]); - decoder_.AdvanceDecoding(&decodable_, 1); - decoded_frames_mapping_.push_back(num_frames_); - is_last_frame_blank_ = false; - } - num_frames_++; - } - // Get the best path - inputs_.clear(); - outputs_.clear(); - likelihood_.clear(); - if (decoded_frames_mapping_.size() > 0) { - inputs_.resize(1); - outputs_.resize(1); - likelihood_.resize(1); - kaldi::Lattice lat; - decoder_.GetBestPath(&lat, false); - std::vector alignment; - kaldi::LatticeWeight weight; - fst::GetLinearSymbolSequence(lat, &alignment, &outputs_[0], &weight); - ConvertToInputs(alignment, &inputs_[0]); - RemoveContinuousTags(&outputs_[0]); - VLOG(3) << weight.Value1() << " " << weight.Value2(); - likelihood_[0] = 
-(weight.Value1() + weight.Value2()); - } -} - -void CtcWfstBeamSearch::FinalizeSearch() { - decodable_.SetFinish(); - decoder_.FinalizeDecoding(); - inputs_.clear(); - outputs_.clear(); - likelihood_.clear(); - times_.clear(); - if (decoded_frames_mapping_.size() > 0) { - std::vector nbest_lats; - if (opts_.nbest == 1) { - kaldi::Lattice lat; - decoder_.GetBestPath(&lat, true); - nbest_lats.push_back(std::move(lat)); - } else { - // Get N-best path by lattice(CompactLattice) - kaldi::CompactLattice clat; - decoder_.GetLattice(&clat, true); - kaldi::Lattice lat, nbest_lat; - fst::ConvertLattice(clat, &lat); - // TODO(Binbin Zhang): it's n-best word lists here, not character n-best - fst::ShortestPath(lat, &nbest_lat, opts_.nbest); - fst::ConvertNbestToVector(nbest_lat, &nbest_lats); - } - int nbest = nbest_lats.size(); - inputs_.resize(nbest); - outputs_.resize(nbest); - likelihood_.resize(nbest); - times_.resize(nbest); - for (int i = 0; i < nbest; i++) { - kaldi::LatticeWeight weight; - std::vector alignment; - fst::GetLinearSymbolSequence(nbest_lats[i], &alignment, &outputs_[i], - &weight); - ConvertToInputs(alignment, &inputs_[i], ×_[i]); - RemoveContinuousTags(&outputs_[i]); - likelihood_[i] = -(weight.Value1() + weight.Value2()); - } - } -} - -void CtcWfstBeamSearch::ConvertToInputs(const std::vector& alignment, - std::vector* input, - std::vector* time) { - input->clear(); - if (time != nullptr) time->clear(); - for (int cur = 0; cur < alignment.size(); ++cur) { - // ignore blank - if (alignment[cur] - 1 == 0) continue; - // merge continuous same label - if (cur > 0 && alignment[cur] == alignment[cur - 1]) continue; - - input->push_back(alignment[cur] - 1); - if (time != nullptr) { - time->push_back(decoded_frames_mapping_[cur]); - } - } -} - -void CtcWfstBeamSearch::RemoveContinuousTags(std::vector* output) { - if (context_graph_) { - for (auto it = output->begin(); it != output->end();) { - if (*it == context_graph_->start_tag_id() || - *it == context_graph_->end_tag_id()) { - if (it + 1 != output->end() && *it == *(it + 1)) { - it = output->erase(it); - continue; - } - } - ++it; - } - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/ctc_wfst_beam_search.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/ctc_wfst_beam_search.h deleted file mode 100644 index 204a0c8db1254035b7e3bd4a6e02b65d66b756f3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/ctc_wfst_beam_search.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#ifndef DECODER_CTC_WFST_BEAM_SEARCH_H_ -#define DECODER_CTC_WFST_BEAM_SEARCH_H_ - -#include -#include - -#include "decoder/context_graph.h" -#include "decoder/search_interface.h" -#include "kaldi/decoder/lattice-faster-online-decoder.h" -#include "utils/utils.h" - -namespace wenet { - -class DecodableTensorScaled : public kaldi::DecodableInterface { - public: - explicit DecodableTensorScaled(float scale = 1.0) : scale_(scale) { Reset(); } - - void Reset(); - int32 NumFramesReady() const override { return num_frames_ready_; } - bool IsLastFrame(int32 frame) const override; - float LogLikelihood(int32 frame, int32 index) override; - int32 NumIndices() const override; - void AcceptLoglikes(const std::vector& logp); - void SetFinish() { done_ = true; } - - private: - int num_frames_ready_ = 0; - float scale_ = 1.0; - bool done_ = false; - std::vector logp_; -}; - -// LatticeFasterDecoderConfig has the following key members -// beam: decoding beam -// max_active: Decoder max active states -// lattice_beam: Lattice generation beam -struct CtcWfstBeamSearchOptions : public kaldi::LatticeFasterDecoderConfig { - float acoustic_scale = 1.0; - float nbest = 10; - // When blank score is greater than this thresh, skip the frame in viterbi - // search - float blank_skip_thresh = 0.98; - float blank_scale = 1.0; -}; - -class CtcWfstBeamSearch : public SearchInterface { - public: - explicit CtcWfstBeamSearch( - const fst::Fst& fst, const CtcWfstBeamSearchOptions& opts, - const std::shared_ptr& context_graph); - void Search(const std::vector>& logp) override; - void Reset() override; - void FinalizeSearch() override; - SearchType Type() const override { return SearchType::kWfstBeamSearch; } - // For CTC prefix beam search, both inputs and outputs are hypotheses_ - const std::vector>& Inputs() const override { - return inputs_; - } - const std::vector>& Outputs() const override { - return outputs_; - } - const std::vector& Likelihood() const override { return likelihood_; } - const std::vector>& Times() const override { return times_; } - - private: - // Sub one and remove - void ConvertToInputs(const std::vector& alignment, - std::vector* input, - std::vector* time = nullptr); - void RemoveContinuousTags(std::vector* output); - - int num_frames_ = 0; - std::vector decoded_frames_mapping_; - - int last_best_ = 0; // last none blank best id - std::vector last_frame_prob_; - bool is_last_frame_blank_ = false; - std::vector> inputs_, outputs_; - std::vector likelihood_; - std::vector> times_; - DecodableTensorScaled decodable_; - kaldi::LatticeFasterOnlineDecoder decoder_; - std::shared_ptr context_graph_; - const CtcWfstBeamSearchOptions& opts_; -}; - -} // namespace wenet - -#endif // DECODER_CTC_WFST_BEAM_SEARCH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/onnx_asr_model.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/onnx_asr_model.cc deleted file mode 100644 index fc7afc704febbde3b7e350e392dc46763c453e74..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/onnx_asr_model.cc +++ /dev/null @@ -1,430 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 ZeXuan Li (lizexuan@huya.com) -// Xingchen Song(sxc19@mails.tsinghua.edu.cn) -// hamddct@gmail.com (Mddct) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "decoder/onnx_asr_model.h" - -#include -#include -#include - -#include "utils/string.h" - -namespace wenet { - -Ort::Env OnnxAsrModel::env_ = Ort::Env(ORT_LOGGING_LEVEL_WARNING, ""); -Ort::SessionOptions OnnxAsrModel::session_options_ = Ort::SessionOptions(); - -void OnnxAsrModel::InitEngineThreads(int num_threads) { - session_options_.SetIntraOpNumThreads(num_threads); -} - -void OnnxAsrModel::GetInputOutputInfo( - const std::shared_ptr& session, - std::vector* in_names, std::vector* out_names) { - Ort::AllocatorWithDefaultOptions allocator; - // Input info - int num_nodes = session->GetInputCount(); - in_names->resize(num_nodes); - for (int i = 0; i < num_nodes; ++i) { - char* name = session->GetInputName(i, allocator); - Ort::TypeInfo type_info = session->GetInputTypeInfo(i); - auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); - ONNXTensorElementDataType type = tensor_info.GetElementType(); - std::vector node_dims = tensor_info.GetShape(); - std::stringstream shape; - for (auto j : node_dims) { - shape << j; - shape << " "; - } - LOG(INFO) << "\tInput " << i << " : name=" << name << " type=" << type - << " dims=" << shape.str(); - (*in_names)[i] = name; - } - // Output info - num_nodes = session->GetOutputCount(); - out_names->resize(num_nodes); - for (int i = 0; i < num_nodes; ++i) { - char* name = session->GetOutputName(i, allocator); - Ort::TypeInfo type_info = session->GetOutputTypeInfo(i); - auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); - ONNXTensorElementDataType type = tensor_info.GetElementType(); - std::vector node_dims = tensor_info.GetShape(); - std::stringstream shape; - for (auto j : node_dims) { - shape << j; - shape << " "; - } - LOG(INFO) << "\tOutput " << i << " : name=" << name << " type=" << type - << " dims=" << shape.str(); - (*out_names)[i] = name; - } -} - -void OnnxAsrModel::Read(const std::string& model_dir) { - std::string encoder_onnx_path = model_dir + "/encoder.onnx"; - std::string rescore_onnx_path = model_dir + "/decoder.onnx"; - std::string ctc_onnx_path = model_dir + "/ctc.onnx"; - - // 1. Load sessions - try { -#ifdef _MSC_VER - encoder_session_ = std::make_shared( - env_, ToWString(encoder_onnx_path).c_str(), session_options_); - rescore_session_ = std::make_shared( - env_, ToWString(rescore_onnx_path).c_str(), session_options_); - ctc_session_ = std::make_shared( - env_, ToWString(ctc_onnx_path).c_str(), session_options_); -#else - encoder_session_ = std::make_shared( - env_, encoder_onnx_path.c_str(), session_options_); - rescore_session_ = std::make_shared( - env_, rescore_onnx_path.c_str(), session_options_); - ctc_session_ = std::make_shared(env_, ctc_onnx_path.c_str(), - session_options_); -#endif - } catch (std::exception const& e) { - LOG(ERROR) << "error when load onnx model: " << e.what(); - exit(0); - } - - // 2. 
Read metadata - auto model_metadata = encoder_session_->GetModelMetadata(); - - Ort::AllocatorWithDefaultOptions allocator; - encoder_output_size_ = - atoi(model_metadata.LookupCustomMetadataMap("output_size", allocator)); - num_blocks_ = - atoi(model_metadata.LookupCustomMetadataMap("num_blocks", allocator)); - head_ = atoi(model_metadata.LookupCustomMetadataMap("head", allocator)); - cnn_module_kernel_ = atoi( - model_metadata.LookupCustomMetadataMap("cnn_module_kernel", allocator)); - subsampling_rate_ = atoi( - model_metadata.LookupCustomMetadataMap("subsampling_rate", allocator)); - right_context_ = - atoi(model_metadata.LookupCustomMetadataMap("right_context", allocator)); - sos_ = atoi(model_metadata.LookupCustomMetadataMap("sos_symbol", allocator)); - eos_ = atoi(model_metadata.LookupCustomMetadataMap("eos_symbol", allocator)); - is_bidirectional_decoder_ = atoi(model_metadata.LookupCustomMetadataMap( - "is_bidirectional_decoder", allocator)); - chunk_size_ = - atoi(model_metadata.LookupCustomMetadataMap("chunk_size", allocator)); - num_left_chunks_ = - atoi(model_metadata.LookupCustomMetadataMap("left_chunks", allocator)); - - LOG(INFO) << "Onnx Model Info:"; - LOG(INFO) << "\tencoder_output_size " << encoder_output_size_; - LOG(INFO) << "\tnum_blocks " << num_blocks_; - LOG(INFO) << "\thead " << head_; - LOG(INFO) << "\tcnn_module_kernel " << cnn_module_kernel_; - LOG(INFO) << "\tsubsampling_rate " << subsampling_rate_; - LOG(INFO) << "\tright_context " << right_context_; - LOG(INFO) << "\tsos " << sos_; - LOG(INFO) << "\teos " << eos_; - LOG(INFO) << "\tis bidirectional decoder " << is_bidirectional_decoder_; - LOG(INFO) << "\tchunk_size " << chunk_size_; - LOG(INFO) << "\tnum_left_chunks " << num_left_chunks_; - - // 3. Read model nodes - LOG(INFO) << "Onnx Encoder:"; - GetInputOutputInfo(encoder_session_, &encoder_in_names_, &encoder_out_names_); - LOG(INFO) << "Onnx CTC:"; - GetInputOutputInfo(ctc_session_, &ctc_in_names_, &ctc_out_names_); - LOG(INFO) << "Onnx Rescore:"; - GetInputOutputInfo(rescore_session_, &rescore_in_names_, &rescore_out_names_); -} - -OnnxAsrModel::OnnxAsrModel(const OnnxAsrModel& other) { - // metadatas - encoder_output_size_ = other.encoder_output_size_; - num_blocks_ = other.num_blocks_; - head_ = other.head_; - cnn_module_kernel_ = other.cnn_module_kernel_; - right_context_ = other.right_context_; - subsampling_rate_ = other.subsampling_rate_; - sos_ = other.sos_; - eos_ = other.eos_; - is_bidirectional_decoder_ = other.is_bidirectional_decoder_; - chunk_size_ = other.chunk_size_; - num_left_chunks_ = other.num_left_chunks_; - offset_ = other.offset_; - - // sessions - encoder_session_ = other.encoder_session_; - ctc_session_ = other.ctc_session_; - rescore_session_ = other.rescore_session_; - - // node names - encoder_in_names_ = other.encoder_in_names_; - encoder_out_names_ = other.encoder_out_names_; - ctc_in_names_ = other.ctc_in_names_; - ctc_out_names_ = other.ctc_out_names_; - rescore_in_names_ = other.rescore_in_names_; - rescore_out_names_ = other.rescore_out_names_; -} - -std::shared_ptr OnnxAsrModel::Copy() const { - auto asr_model = std::make_shared(*this); - // Reset the inner states for new decoding - asr_model->Reset(); - return asr_model; -} - -void OnnxAsrModel::Reset() { - offset_ = 0; - encoder_outs_.clear(); - cached_feature_.clear(); - // Reset att_cache - Ort::MemoryInfo memory_info = - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); - if (num_left_chunks_ > 0) { - int required_cache_size = chunk_size_ * 
num_left_chunks_; - offset_ = required_cache_size; - att_cache_.resize(num_blocks_ * head_ * required_cache_size * - encoder_output_size_ / head_ * 2, - 0.0); - const int64_t att_cache_shape[] = {num_blocks_, head_, required_cache_size, - encoder_output_size_ / head_ * 2}; - att_cache_ort_ = Ort::Value::CreateTensor( - memory_info, att_cache_.data(), att_cache_.size(), att_cache_shape, 4); - } else { - att_cache_.resize(0, 0.0); - const int64_t att_cache_shape[] = {num_blocks_, head_, 0, - encoder_output_size_ / head_ * 2}; - att_cache_ort_ = Ort::Value::CreateTensor( - memory_info, att_cache_.data(), att_cache_.size(), att_cache_shape, 4); - } - - // Reset cnn_cache - cnn_cache_.resize( - num_blocks_ * encoder_output_size_ * (cnn_module_kernel_ - 1), 0.0); - const int64_t cnn_cache_shape[] = {num_blocks_, 1, encoder_output_size_, - cnn_module_kernel_ - 1}; - cnn_cache_ort_ = Ort::Value::CreateTensor( - memory_info, cnn_cache_.data(), cnn_cache_.size(), cnn_cache_shape, 4); -} - -void OnnxAsrModel::ForwardEncoderFunc( - const std::vector>& chunk_feats, - std::vector>* out_prob) { - Ort::MemoryInfo memory_info = - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); - // 1. Prepare onnx required data, splice cached_feature_ and chunk_feats - // chunk - int num_frames = cached_feature_.size() + chunk_feats.size(); - const int feature_dim = chunk_feats[0].size(); - std::vector feats; - for (size_t i = 0; i < cached_feature_.size(); ++i) { - feats.insert(feats.end(), cached_feature_[i].begin(), - cached_feature_[i].end()); - } - for (size_t i = 0; i < chunk_feats.size(); ++i) { - feats.insert(feats.end(), chunk_feats[i].begin(), chunk_feats[i].end()); - } - const int64_t feats_shape[3] = {1, num_frames, feature_dim}; - Ort::Value feats_ort = Ort::Value::CreateTensor( - memory_info, feats.data(), feats.size(), feats_shape, 3); - // offset - int64_t offset_int64 = static_cast(offset_); - Ort::Value offset_ort = Ort::Value::CreateTensor( - memory_info, &offset_int64, 1, std::vector{}.data(), 0); - // required_cache_size - int64_t required_cache_size = chunk_size_ * num_left_chunks_; - Ort::Value required_cache_size_ort = Ort::Value::CreateTensor( - memory_info, &required_cache_size, 1, std::vector{}.data(), 0); - // att_mask - Ort::Value att_mask_ort{nullptr}; - std::vector att_mask(required_cache_size + chunk_size_, 1); - if (num_left_chunks_ > 0) { - int chunk_idx = offset_ / chunk_size_ - num_left_chunks_; - if (chunk_idx < num_left_chunks_) { - for (int i = 0; i < (num_left_chunks_ - chunk_idx) * chunk_size_; ++i) { - att_mask[i] = 0; - } - } - const int64_t att_mask_shape[] = {1, 1, required_cache_size + chunk_size_}; - att_mask_ort = Ort::Value::CreateTensor( - memory_info, reinterpret_cast(att_mask.data()), att_mask.size(), - att_mask_shape, 3); - } - - // 2. 
Encoder chunk forward - std::vector inputs; - for (auto name : encoder_in_names_) { - if (!strcmp(name, "chunk")) { - inputs.emplace_back(std::move(feats_ort)); - } else if (!strcmp(name, "offset")) { - inputs.emplace_back(std::move(offset_ort)); - } else if (!strcmp(name, "required_cache_size")) { - inputs.emplace_back(std::move(required_cache_size_ort)); - } else if (!strcmp(name, "att_cache")) { - inputs.emplace_back(std::move(att_cache_ort_)); - } else if (!strcmp(name, "cnn_cache")) { - inputs.emplace_back(std::move(cnn_cache_ort_)); - } else if (!strcmp(name, "att_mask")) { - inputs.emplace_back(std::move(att_mask_ort)); - } - } - - std::vector ort_outputs = encoder_session_->Run( - Ort::RunOptions{nullptr}, encoder_in_names_.data(), inputs.data(), - inputs.size(), encoder_out_names_.data(), encoder_out_names_.size()); - - offset_ += static_cast( - ort_outputs[0].GetTensorTypeAndShapeInfo().GetShape()[1]); - att_cache_ort_ = std::move(ort_outputs[1]); - cnn_cache_ort_ = std::move(ort_outputs[2]); - - std::vector ctc_inputs; - ctc_inputs.emplace_back(std::move(ort_outputs[0])); - - std::vector ctc_ort_outputs = ctc_session_->Run( - Ort::RunOptions{nullptr}, ctc_in_names_.data(), ctc_inputs.data(), - ctc_inputs.size(), ctc_out_names_.data(), ctc_out_names_.size()); - encoder_outs_.push_back(std::move(ctc_inputs[0])); - - float* logp_data = ctc_ort_outputs[0].GetTensorMutableData(); - auto type_info = ctc_ort_outputs[0].GetTensorTypeAndShapeInfo(); - - int num_outputs = type_info.GetShape()[1]; - int output_dim = type_info.GetShape()[2]; - out_prob->resize(num_outputs); - for (int i = 0; i < num_outputs; i++) { - (*out_prob)[i].resize(output_dim); - memcpy((*out_prob)[i].data(), logp_data + i * output_dim, - sizeof(float) * output_dim); - } -} - -float OnnxAsrModel::ComputeAttentionScore(const float* prob, - const std::vector& hyp, int eos, - int decode_out_len) { - float score = 0.0f; - for (size_t j = 0; j < hyp.size(); ++j) { - score += *(prob + j * decode_out_len + hyp[j]); - } - score += *(prob + hyp.size() * decode_out_len + eos); - return score; -} - -void OnnxAsrModel::AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) { - Ort::MemoryInfo memory_info = - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); - CHECK(rescoring_score != nullptr); - int num_hyps = hyps.size(); - rescoring_score->resize(num_hyps, 0.0f); - - if (num_hyps == 0) { - return; - } - // No encoder output - if (encoder_outs_.size() == 0) { - return; - } - - std::vector hyps_lens; - int max_hyps_len = 0; - for (size_t i = 0; i < num_hyps; ++i) { - int length = hyps[i].size() + 1; - max_hyps_len = std::max(length, max_hyps_len); - hyps_lens.emplace_back(static_cast(length)); - } - - std::vector rescore_input; - int encoder_len = 0; - for (int i = 0; i < encoder_outs_.size(); i++) { - float* encoder_outs_data = encoder_outs_[i].GetTensorMutableData(); - auto type_info = encoder_outs_[i].GetTensorTypeAndShapeInfo(); - for (int j = 0; j < type_info.GetElementCount(); j++) { - rescore_input.emplace_back(encoder_outs_data[j]); - } - encoder_len += type_info.GetShape()[1]; - } - - const int64_t decode_input_shape[] = {1, encoder_len, encoder_output_size_}; - - std::vector hyps_pad; - - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - hyps_pad.emplace_back(sos_); - size_t j = 0; - for (; j < hyp.size(); ++j) { - hyps_pad.emplace_back(hyp[j]); - } - if (j == max_hyps_len - 1) { - continue; - } - for (; j < max_hyps_len - 1; ++j) { 
- hyps_pad.emplace_back(0); - } - } - - const int64_t hyps_pad_shape[] = {num_hyps, max_hyps_len}; - - const int64_t hyps_lens_shape[] = {num_hyps}; - - Ort::Value decode_input_tensor_ = Ort::Value::CreateTensor( - memory_info, rescore_input.data(), rescore_input.size(), - decode_input_shape, 3); - Ort::Value hyps_pad_tensor_ = Ort::Value::CreateTensor( - memory_info, hyps_pad.data(), hyps_pad.size(), hyps_pad_shape, 2); - Ort::Value hyps_lens_tensor_ = Ort::Value::CreateTensor( - memory_info, hyps_lens.data(), hyps_lens.size(), hyps_lens_shape, 1); - - std::vector rescore_inputs; - - rescore_inputs.emplace_back(std::move(hyps_pad_tensor_)); - rescore_inputs.emplace_back(std::move(hyps_lens_tensor_)); - rescore_inputs.emplace_back(std::move(decode_input_tensor_)); - - std::vector rescore_outputs = rescore_session_->Run( - Ort::RunOptions{nullptr}, rescore_in_names_.data(), rescore_inputs.data(), - rescore_inputs.size(), rescore_out_names_.data(), - rescore_out_names_.size()); - - float* decoder_outs_data = rescore_outputs[0].GetTensorMutableData(); - float* r_decoder_outs_data = rescore_outputs[1].GetTensorMutableData(); - - auto type_info = rescore_outputs[0].GetTensorTypeAndShapeInfo(); - int decode_out_len = type_info.GetShape()[2]; - - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - float score = 0.0f; - // left to right decoder score - score = ComputeAttentionScore( - decoder_outs_data + max_hyps_len * decode_out_len * i, hyp, eos_, - decode_out_len); - // Optional: Used for right to left score - float r_score = 0.0f; - if (is_bidirectional_decoder_ && reverse_weight > 0) { - std::vector r_hyp(hyp.size()); - std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin()); - // right to left decoder score - r_score = ComputeAttentionScore( - r_decoder_outs_data + max_hyps_len * decode_out_len * i, r_hyp, eos_, - decode_out_len); - } - // combined left-to-right and right-to-left score - (*rescoring_score)[i] = - score * (1 - reverse_weight) + r_score * reverse_weight; - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/onnx_asr_model.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/onnx_asr_model.h deleted file mode 100644 index f5d9e9a0c61d728f2fb6d45d1428234abae98c90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/onnx_asr_model.h +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 ZeXuan Li (lizexuan@huya.com) -// Xingchen Song(sxc19@mails.tsinghua.edu.cn) -// hamddct@gmail.com (Mddct) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef DECODER_ONNX_ASR_MODEL_H_ -#define DECODER_ONNX_ASR_MODEL_H_ - -#include -#include -#include - -#include "onnxruntime_cxx_api.h" // NOLINT - -#include "decoder/asr_model.h" -#include "utils/log.h" -#include "utils/utils.h" - -namespace wenet { - -class OnnxAsrModel : public AsrModel { - public: - static void InitEngineThreads(int num_threads = 1); - - public: - OnnxAsrModel() = default; - OnnxAsrModel(const OnnxAsrModel& other); - void Read(const std::string& model_dir); - void Reset() override; - void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) override; - std::shared_ptr Copy() const override; - void GetInputOutputInfo(const std::shared_ptr& session, - std::vector* in_names, - std::vector* out_names); - - protected: - void ForwardEncoderFunc(const std::vector>& chunk_feats, - std::vector>* ctc_prob) override; - - float ComputeAttentionScore(const float* prob, const std::vector& hyp, - int eos, int decode_out_len); - - private: - int encoder_output_size_ = 0; - int num_blocks_ = 0; - int cnn_module_kernel_ = 0; - int head_ = 0; - - // sessions - // NOTE(Mddct): The Env holds the logging state used by all other objects. - // One Env must be created before using any other Onnxruntime functionality. - static Ort::Env env_; // shared environment across threads. - static Ort::SessionOptions session_options_; - std::shared_ptr encoder_session_ = nullptr; - std::shared_ptr rescore_session_ = nullptr; - std::shared_ptr ctc_session_ = nullptr; - - // node names - std::vector encoder_in_names_, encoder_out_names_; - std::vector ctc_in_names_, ctc_out_names_; - std::vector rescore_in_names_, rescore_out_names_; - - // caches - Ort::Value att_cache_ort_{nullptr}; - Ort::Value cnn_cache_ort_{nullptr}; - std::vector encoder_outs_; - // NOTE: Instead of making a copy of the xx_cache, ONNX only maintains - // its data pointer when initializing xx_cache_ort (see https://github.com/ - // microsoft/onnxruntime/blob/master/onnxruntime/core/framework - // /tensor.cc#L102-L129), so we need the following variables to keep - // our data "alive" during the lifetime of decoder. - std::vector att_cache_; - std::vector cnn_cache_; -}; - -} // namespace wenet - -#endif // DECODER_ONNX_ASR_MODEL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/params.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/params.h deleted file mode 100644 index 3edc877f1bb6d876ca087cab8e4ed00d42e97e63..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/params.h +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef DECODER_PARAMS_H_ -#define DECODER_PARAMS_H_ - -#include -#include -#include -#include - -#include "decoder/asr_decoder.h" -#ifdef USE_ONNX -#include "decoder/onnx_asr_model.h" -#endif -#ifdef USE_TORCH -#include "decoder/torch_asr_model.h" -#endif -#ifdef USE_XPU -#include "xpu/xpu_asr_model.h" -#endif -#ifdef USE_BPU -#include "bpu/bpu_asr_model.h" -#endif -#include "frontend/feature_pipeline.h" -#include "post_processor/post_processor.h" -#include "utils/flags.h" -#include "utils/string.h" - -DEFINE_int32(device_id, 0, "set XPU DeviceID for ASR model"); - -// TorchAsrModel flags -DEFINE_string(model_path, "", "pytorch exported model path"); -// OnnxAsrModel flags -DEFINE_string(onnx_dir, "", "directory where the onnx model is saved"); -// XPUAsrModel flags -DEFINE_string(xpu_model_dir, "", - "directory where the XPU model and weights is saved"); -// BPUAsrModel flags -DEFINE_string(bpu_model_dir, "", - "directory where the HORIZON BPU model is saved"); - -// FeaturePipelineConfig flags -DEFINE_int32(num_bins, 80, "num mel bins for fbank feature"); -DEFINE_int32(sample_rate, 16000, "sample rate for audio"); - -// TLG fst -DEFINE_string(fst_path, "", "TLG fst path"); - -// DecodeOptions flags -DEFINE_int32(chunk_size, 16, "decoding chunk size"); -DEFINE_int32(num_left_chunks, -1, "left chunks in decoding"); -DEFINE_double(ctc_weight, 0.5, - "ctc weight when combining ctc score and rescoring score"); -DEFINE_double(rescoring_weight, 1.0, - "rescoring weight when combining ctc score and rescoring score"); -DEFINE_double(reverse_weight, 0.0, - "used for bitransformer rescoring. it must be 0.0 if decoder is" - "conventional transformer decoder, and only reverse_weight > 0.0" - "dose the right to left decoder will be calculated and used"); -DEFINE_int32(max_active, 7000, "max active states in ctc wfst search"); -DEFINE_int32(min_active, 200, "min active states in ctc wfst search"); -DEFINE_double(beam, 16.0, "beam in ctc wfst search"); -DEFINE_double(lattice_beam, 10.0, "lattice beam in ctc wfst search"); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale for ctc wfst search"); -DEFINE_double(blank_skip_thresh, 1.0, - "blank skip thresh for ctc wfst search, 1.0 means no skip"); -DEFINE_double(blank_scale, 1.0, "blank scale for ctc wfst search"); -DEFINE_double(length_penalty, 0.0, - "length penalty ctc wfst search, will not" - "apply on self-loop arc, for balancing the del/ins ratio, " - "suggest set to -3.0"); -DEFINE_int32(nbest, 10, "nbest for ctc wfst or prefix search"); - -// SymbolTable flags -DEFINE_string(dict_path, "", - "dict symbol table path, required when LM is enabled"); -DEFINE_string(unit_path, "", - "e2e model unit symbol table, it is used in both " - "with/without LM scenarios for context/timestamp"); - -// Context flags -DEFINE_string(context_path, "", "context path, is used to build context graph"); -DEFINE_double(context_score, 3.0, "is used to rescore the decoded result"); - -// PostProcessOptions flags -DEFINE_int32(language_type, 0, - "remove spaces according to language type" - "0x00 = kMandarinEnglish, " - "0x01 = kIndoEuropean"); -DEFINE_bool(lowercase, true, "lowercase final result if needed"); - -namespace wenet { -std::shared_ptr InitFeaturePipelineConfigFromFlags() { - auto feature_config = std::make_shared( - FLAGS_num_bins, FLAGS_sample_rate); - return feature_config; -} - -std::shared_ptr InitDecodeOptionsFromFlags() { - auto decode_config = std::make_shared(); - decode_config->chunk_size = FLAGS_chunk_size; - decode_config->num_left_chunks = 
FLAGS_num_left_chunks; - decode_config->ctc_weight = FLAGS_ctc_weight; - decode_config->reverse_weight = FLAGS_reverse_weight; - decode_config->rescoring_weight = FLAGS_rescoring_weight; - decode_config->ctc_wfst_search_opts.max_active = FLAGS_max_active; - decode_config->ctc_wfst_search_opts.min_active = FLAGS_min_active; - decode_config->ctc_wfst_search_opts.beam = FLAGS_beam; - decode_config->ctc_wfst_search_opts.lattice_beam = FLAGS_lattice_beam; - decode_config->ctc_wfst_search_opts.acoustic_scale = FLAGS_acoustic_scale; - decode_config->ctc_wfst_search_opts.blank_skip_thresh = - FLAGS_blank_skip_thresh; - decode_config->ctc_wfst_search_opts.blank_scale = FLAGS_blank_scale; - decode_config->ctc_wfst_search_opts.length_penalty = FLAGS_length_penalty; - decode_config->ctc_wfst_search_opts.nbest = FLAGS_nbest; - decode_config->ctc_prefix_search_opts.first_beam_size = FLAGS_nbest; - decode_config->ctc_prefix_search_opts.second_beam_size = FLAGS_nbest; - return decode_config; -} - -std::shared_ptr InitDecodeResourceFromFlags() { - auto resource = std::make_shared(); - const int kNumGemmThreads = 1; - if (!FLAGS_onnx_dir.empty()) { -#ifdef USE_ONNX - LOG(INFO) << "Reading onnx model "; - OnnxAsrModel::InitEngineThreads(kNumGemmThreads); - auto model = std::make_shared(); - model->Read(FLAGS_onnx_dir); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DONNX=ON'."; -#endif - } else if (!FLAGS_model_path.empty()) { -#ifdef USE_TORCH - LOG(INFO) << "Reading torch model " << FLAGS_model_path; - TorchAsrModel::InitEngineThreads(kNumGemmThreads); - auto model = std::make_shared(); - model->Read(FLAGS_model_path); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DTORCH=ON'."; -#endif - } else if (!FLAGS_xpu_model_dir.empty()) { -#ifdef USE_XPU - LOG(INFO) << "Reading XPU WeNet model weight from " << FLAGS_xpu_model_dir; - auto model = std::make_shared(); - model->SetEngineThreads(kNumGemmThreads); - model->SetDeviceId(FLAGS_device_id); - model->Read(FLAGS_xpu_model_dir); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DXPU=ON'."; -#endif - } else if (!FLAGS_bpu_model_dir.empty()) { -#ifdef USE_BPU - LOG(INFO) << "Reading Horizon BPU model from " << FLAGS_bpu_model_dir; - auto model = std::make_shared(); - model->Read(FLAGS_bpu_model_dir); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DBPU=ON'."; -#endif - } else { - LOG(FATAL) << "Please set ONNX, TORCH, XPU or BPU model path!!!"; - } - - LOG(INFO) << "Reading unit table " << FLAGS_unit_path; - auto unit_table = std::shared_ptr( - fst::SymbolTable::ReadText(FLAGS_unit_path)); - CHECK(unit_table != nullptr); - resource->unit_table = unit_table; - - if (!FLAGS_fst_path.empty()) { // With LM - CHECK(!FLAGS_dict_path.empty()); - LOG(INFO) << "Reading fst " << FLAGS_fst_path; - auto fst = std::shared_ptr>( - fst::Fst::Read(FLAGS_fst_path)); - CHECK(fst != nullptr); - resource->fst = fst; - - LOG(INFO) << "Reading symbol table " << FLAGS_dict_path; - auto symbol_table = std::shared_ptr( - fst::SymbolTable::ReadText(FLAGS_dict_path)); - CHECK(symbol_table != nullptr); - resource->symbol_table = symbol_table; - } else { // Without LM, symbol_table is the same as unit_table - resource->symbol_table = unit_table; - } - - if (!FLAGS_context_path.empty()) { - LOG(INFO) << "Reading context " << FLAGS_context_path; - std::vector contexts; - std::ifstream infile(FLAGS_context_path); - std::string context; - 
while (getline(infile, context)) { - contexts.emplace_back(Trim(context)); - } - ContextConfig config; - config.context_score = FLAGS_context_score; - resource->context_graph = std::make_shared(config); - resource->context_graph->BuildContextGraph(contexts, - resource->symbol_table); - } - - PostProcessOptions post_process_opts; - post_process_opts.language_type = - FLAGS_language_type == 0 ? kMandarinEnglish : kIndoEuropean; - post_process_opts.lowercase = FLAGS_lowercase; - resource->post_processor = - std::make_shared(std::move(post_process_opts)); - return resource; -} - -} // namespace wenet - -#endif // DECODER_PARAMS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/search_interface.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/search_interface.h deleted file mode 100644 index 25bad26705f8be44561d2c686f50a63035b14bbf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/search_interface.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_SEARCH_INTERFACE_H_ -#define DECODER_SEARCH_INTERFACE_H_ - -namespace wenet { - -#include - -enum SearchType { - kPrefixBeamSearch = 0x00, - kWfstBeamSearch = 0x01, -}; - -class SearchInterface { - public: - virtual ~SearchInterface() {} - virtual void Search(const std::vector>& logp) = 0; - virtual void Reset() = 0; - virtual void FinalizeSearch() = 0; - - virtual SearchType Type() const = 0; - // N-best inputs id - virtual const std::vector>& Inputs() const = 0; - // N-best outputs id - virtual const std::vector>& Outputs() const = 0; - // N-best likelihood - virtual const std::vector& Likelihood() const = 0; - // N-best timestamp - virtual const std::vector>& Times() const = 0; -}; - -} // namespace wenet - -#endif // DECODER_SEARCH_INTERFACE_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/torch_asr_model.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/torch_asr_model.cc deleted file mode 100644 index 3abca283e12f5c173c9511707229ea82b31f26d8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/torch_asr_model.cc +++ /dev/null @@ -1,278 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "decoder/torch_asr_model.h" - -#include -#include -#include -#include - -#include "torch/script.h" -#ifndef IOS -#include "torch/torch.h" -#endif - -namespace wenet { - -#ifndef IOS -void TorchAsrModel::InitEngineThreads(int num_threads) { - // For multi-thread performance - at::set_num_threads(num_threads); - VLOG(1) << "Num intra-op threads: " << at::get_num_threads(); -} -#endif - -void TorchAsrModel::Read(const std::string& model_path) { - torch::DeviceType device = at::kCPU; -#ifdef USE_GPU - if (!torch::cuda::is_available()) { - VLOG(1) << "CUDA is not available! Please check your GPU settings"; - throw std::runtime_error("CUDA is not available!"); - } else { - VLOG(1) << "CUDA available! Running on GPU"; - device = at::kCUDA; - } -#endif - torch::jit::script::Module model = torch::jit::load(model_path, device); - model_ = std::make_shared(std::move(model)); - torch::NoGradGuard no_grad; - model_->eval(); - torch::jit::IValue o1 = model_->run_method("subsampling_rate"); - CHECK_EQ(o1.isInt(), true); - subsampling_rate_ = o1.toInt(); - torch::jit::IValue o2 = model_->run_method("right_context"); - CHECK_EQ(o2.isInt(), true); - right_context_ = o2.toInt(); - torch::jit::IValue o3 = model_->run_method("sos_symbol"); - CHECK_EQ(o3.isInt(), true); - sos_ = o3.toInt(); - torch::jit::IValue o4 = model_->run_method("eos_symbol"); - CHECK_EQ(o4.isInt(), true); - eos_ = o4.toInt(); - torch::jit::IValue o5 = model_->run_method("is_bidirectional_decoder"); - CHECK_EQ(o5.isBool(), true); - is_bidirectional_decoder_ = o5.toBool(); - - VLOG(1) << "Torch Model Info:"; - VLOG(1) << "\tsubsampling_rate " << subsampling_rate_; - VLOG(1) << "\tright context " << right_context_; - VLOG(1) << "\tsos " << sos_; - VLOG(1) << "\teos " << eos_; - VLOG(1) << "\tis bidirectional decoder " << is_bidirectional_decoder_; -} - -TorchAsrModel::TorchAsrModel(const TorchAsrModel& other) { - // 1. Init the model info - right_context_ = other.right_context_; - subsampling_rate_ = other.subsampling_rate_; - sos_ = other.sos_; - eos_ = other.eos_; - is_bidirectional_decoder_ = other.is_bidirectional_decoder_; - chunk_size_ = other.chunk_size_; - num_left_chunks_ = other.num_left_chunks_; - offset_ = other.offset_; - // 2. Model copy, just copy the model ptr since: - // PyTorch allows using multiple CPU threads during TorchScript model - // inference, please see https://pytorch.org/docs/stable/notes/cpu_ - // threading_torchscript_inference.html - model_ = other.model_; - - // NOTE(Binbin Zhang): - // inner states for forward are not copied here. -} - -std::shared_ptr TorchAsrModel::Copy() const { - auto asr_model = std::make_shared(*this); - // Reset the inner states for new decoding - asr_model->Reset(); - return asr_model; -} - -void TorchAsrModel::Reset() { - offset_ = 0; - att_cache_ = std::move(torch::zeros({0, 0, 0, 0})); - cnn_cache_ = std::move(torch::zeros({0, 0, 0, 0})); - encoder_outs_.clear(); - cached_feature_.clear(); -} - -void TorchAsrModel::ForwardEncoderFunc( - const std::vector>& chunk_feats, - std::vector>* out_prob) { - // 1. Prepare libtorch required data, splice cached_feature_ and chunk_feats - // The first dimension is for batchsize, which is 1. 
- int num_frames = cached_feature_.size() + chunk_feats.size(); - const int feature_dim = chunk_feats[0].size(); - torch::Tensor feats = - torch::zeros({1, num_frames, feature_dim}, torch::kFloat); - for (size_t i = 0; i < cached_feature_.size(); ++i) { - torch::Tensor row = - torch::from_blob(const_cast(cached_feature_[i].data()), - {feature_dim}, torch::kFloat) - .clone(); - feats[0][i] = std::move(row); - } - for (size_t i = 0; i < chunk_feats.size(); ++i) { - torch::Tensor row = - torch::from_blob(const_cast(chunk_feats[i].data()), - {feature_dim}, torch::kFloat) - .clone(); - feats[0][cached_feature_.size() + i] = std::move(row); - } - - // 2. Encoder chunk forward -#ifdef USE_GPU - feats = feats.to(at::kCUDA); - att_cache_ = att_cache_.to(at::kCUDA); - cnn_cache_ = cnn_cache_.to(at::kCUDA); -#endif - int required_cache_size = chunk_size_ * num_left_chunks_; - torch::NoGradGuard no_grad; - std::vector inputs = {feats, offset_, required_cache_size, - att_cache_, cnn_cache_}; - - // Refer interfaces in wenet/transformer/asr_model.py - auto outputs = - model_->get_method("forward_encoder_chunk")(inputs).toTuple()->elements(); - CHECK_EQ(outputs.size(), 3); -#ifdef USE_GPU - torch::Tensor chunk_out = outputs[0].toTensor().to(at::kCPU); - att_cache_ = outputs[1].toTensor().to(at::kCPU); - cnn_cache_ = outputs[2].toTensor().to(at::kCPU); -#else - torch::Tensor chunk_out = outputs[0].toTensor(); - att_cache_ = outputs[1].toTensor(); - cnn_cache_ = outputs[2].toTensor(); -#endif - offset_ += chunk_out.size(1); - - // The first dimension of returned value is for batchsize, which is 1 -#ifdef USE_GPU - chunk_out = chunk_out.to(at::kCUDA); - torch::Tensor ctc_log_probs = - model_->run_method("ctc_activation", chunk_out).toTensor(); - ctc_log_probs = ctc_log_probs.to(at::kCPU)[0]; - encoder_outs_.push_back(std::move(chunk_out.to(at::kCPU))); -#else - torch::Tensor ctc_log_probs = - model_->run_method("ctc_activation", chunk_out).toTensor()[0]; - encoder_outs_.push_back(std::move(chunk_out)); -#endif - - // Copy to output - int num_outputs = ctc_log_probs.size(0); - int output_dim = ctc_log_probs.size(1); - out_prob->resize(num_outputs); - for (int i = 0; i < num_outputs; i++) { - (*out_prob)[i].resize(output_dim); - memcpy((*out_prob)[i].data(), ctc_log_probs[i].data_ptr(), - sizeof(float) * output_dim); - } -} - -float TorchAsrModel::ComputeAttentionScore(const torch::Tensor& prob, - const std::vector& hyp, - int eos) { - float score = 0.0f; - auto accessor = prob.accessor(); - for (size_t j = 0; j < hyp.size(); ++j) { - score += accessor[j][hyp[j]]; - } - score += accessor[hyp.size()][eos]; - return score; -} - -void TorchAsrModel::AttentionRescoring( - const std::vector>& hyps, float reverse_weight, - std::vector* rescoring_score) { - CHECK(rescoring_score != nullptr); - int num_hyps = hyps.size(); - rescoring_score->resize(num_hyps, 0.0f); - - if (num_hyps == 0) { - return; - } - // No encoder output - if (encoder_outs_.size() == 0) { - return; - } - - torch::NoGradGuard no_grad; - // Step 1: Prepare input for libtorch - torch::Tensor hyps_length = torch::zeros({num_hyps}, torch::kLong); - int max_hyps_len = 0; - for (size_t i = 0; i < num_hyps; ++i) { - int length = hyps[i].size() + 1; - max_hyps_len = std::max(length, max_hyps_len); - hyps_length[i] = static_cast(length); - } - torch::Tensor hyps_tensor = - torch::zeros({num_hyps, max_hyps_len}, torch::kLong); - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - hyps_tensor[i][0] = sos_; - for (size_t j = 0; j < 
hyp.size(); ++j) { - hyps_tensor[i][j + 1] = hyp[j]; - } - } - - // Step 2: Forward attention decoder by hyps and corresponding encoder_outs_ - torch::Tensor encoder_out = torch::cat(encoder_outs_, 1); -#ifdef USE_GPU - hyps_tensor = hyps_tensor.to(at::kCUDA); - hyps_length = hyps_length.to(at::kCUDA); - encoder_out = encoder_out.to(at::kCUDA); -#endif - auto outputs = model_ - ->run_method("forward_attention_decoder", hyps_tensor, - hyps_length, encoder_out, reverse_weight) - .toTuple() - ->elements(); -#ifdef USE_GPU - auto probs = outputs[0].toTensor().to(at::kCPU); - auto r_probs = outputs[1].toTensor().to(at::kCPU); -#else - auto probs = outputs[0].toTensor(); - auto r_probs = outputs[1].toTensor(); -#endif - CHECK_EQ(probs.size(0), num_hyps); - CHECK_EQ(probs.size(1), max_hyps_len); - - // Step 3: Compute rescoring score - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - float score = 0.0f; - // left-to-right decoder score - score = ComputeAttentionScore(probs[i], hyp, eos_); - // Optional: Used for right to left score - float r_score = 0.0f; - if (is_bidirectional_decoder_ && reverse_weight > 0) { - // right-to-left score - CHECK_EQ(r_probs.size(0), num_hyps); - CHECK_EQ(r_probs.size(1), max_hyps_len); - std::vector r_hyp(hyp.size()); - std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin()); - // right to left decoder score - r_score = ComputeAttentionScore(r_probs[i], r_hyp, eos_); - } - - // combined left-to-right and right-to-left score - (*rescoring_score)[i] = - score * (1 - reverse_weight) + r_score * reverse_weight; - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/torch_asr_model.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/torch_asr_model.h deleted file mode 100644 index a3cebe08798f1cad60ca4cd73c7b2488173b6114..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/decoder/torch_asr_model.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
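For readers skimming the deleted `TorchAsrModel::AttentionRescoring` above: the scoring reduces to summing per-token attention-decoder log-probabilities (plus an end-of-sentence term) in each direction and blending the two sums with `reverse_weight`. A self-contained sketch of that arithmetic; the log-probability values below are made up purely for illustration, real ones come from the attention decoder.

```cpp
#include <cstdio>
#include <vector>

// Sum log P(token | prefix) along a hypothesis, then add the <eos> term,
// mirroring ComputeAttentionScore() in the deleted torch_asr_model.cc.
static float AttentionScore(const std::vector<std::vector<float>>& logp,
                            const std::vector<int>& hyp, int eos) {
  float score = 0.0f;
  for (size_t j = 0; j < hyp.size(); ++j) score += logp[j][hyp[j]];
  return score + logp[hyp.size()][eos];
}

int main() {
  const int eos = 3;
  const float reverse_weight = 0.3f;  // illustrative value only
  std::vector<int> hyp = {1, 2};

  // Fake per-step distributions: logp[j][v] = log P(v at step j).
  std::vector<std::vector<float>> logp   = {{-2.0f, -0.1f, -3.0f, -4.0f},
                                            {-2.0f, -3.0f, -0.2f, -4.0f},
                                            {-2.0f, -3.0f, -3.0f, -0.3f}};
  std::vector<std::vector<float>> r_logp = {{-2.0f, -3.0f, -0.2f, -4.0f},
                                            {-2.0f, -0.4f, -3.0f, -4.0f},
                                            {-2.0f, -3.0f, -3.0f, -0.1f}};

  float l2r = AttentionScore(logp, hyp, eos);
  std::vector<int> r_hyp(hyp.rbegin(), hyp.rend());  // reversed hypothesis
  float r2l = AttentionScore(r_logp, r_hyp, eos);

  // Same blend as the deleted code: (1 - w) * left-to-right + w * right-to-left.
  float combined = l2r * (1.0f - reverse_weight) + r2l * reverse_weight;
  std::printf("l2r=%.2f r2l=%.2f combined=%.2f\n", l2r, r2l, combined);
  return 0;
}
```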
- -#ifndef DECODER_TORCH_ASR_MODEL_H_ -#define DECODER_TORCH_ASR_MODEL_H_ - -#include -#include -#include - -#include "torch/script.h" -#ifndef IOS -#include "torch/torch.h" -#endif - -#include "decoder/asr_model.h" -#include "utils/utils.h" - -namespace wenet { - -class TorchAsrModel : public AsrModel { - public: -#ifndef IOS - static void InitEngineThreads(int num_threads = 1); -#endif - - public: - using TorchModule = torch::jit::script::Module; - TorchAsrModel() = default; - TorchAsrModel(const TorchAsrModel& other); - void Read(const std::string& model_path); - std::shared_ptr torch_model() const { return model_; } - void Reset() override; - void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) override; - std::shared_ptr Copy() const override; - - protected: - void ForwardEncoderFunc(const std::vector>& chunk_feats, - std::vector>* ctc_prob) override; - - float ComputeAttentionScore(const torch::Tensor& prob, - const std::vector& hyp, int eos); - - private: - std::shared_ptr model_ = nullptr; - std::vector encoder_outs_; - // transformer/conformer attention cache - torch::Tensor att_cache_ = torch::zeros({0, 0, 0, 0}); - // conformer-only conv_module cache - torch::Tensor cnn_cache_ = torch::zeros({0, 0, 0, 0}); -}; - -} // namespace wenet - -#endif // DECODER_TORCH_ASR_MODEL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/frontend/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/frontend/CMakeLists.txt deleted file mode 100644 index 78872257e43bb9a6ffcedaae977bf0173817ae50..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/frontend/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -add_library(frontend STATIC - feature_pipeline.cc - fft.cc -) -target_link_libraries(frontend PUBLIC utils) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/frontend/fbank.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/frontend/fbank.h deleted file mode 100644 index 5a650dc035b8e244388cc1f2e0b9512654de7fda..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/frontend/fbank.h +++ /dev/null @@ -1,218 +0,0 @@ -// Copyright (c) 2017 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
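The deleted `TorchAsrModel` header above exposes a `Copy()` that shares the underlying TorchScript module across decoding threads while resetting the per-stream attention/conv caches. A hedged sketch of that pattern, assuming the deleted headers are still on the include path, that `Copy()`'s stripped return type is `std::shared_ptr<AsrModel>` as in upstream WeNet, and that `final.zip` is a placeholder model path:

```cpp
#include <memory>

#include "decoder/torch_asr_model.h"  // deleted header; sketch only

// Each decoding thread takes a shallow copy: Copy() shares the TorchScript
// module (safe for multi-threaded inference) and resets the per-stream caches.
std::shared_ptr<wenet::AsrModel> MakeStreamModel(
    const std::shared_ptr<wenet::TorchAsrModel>& shared_model) {
  return shared_model->Copy();
}

int main() {
  wenet::TorchAsrModel::InitEngineThreads(1);   // intra-op threads per model
  auto model = std::make_shared<wenet::TorchAsrModel>();
  model->Read("final.zip");                     // placeholder TorchScript path
  auto stream_model = MakeStreamModel(model);   // one of these per stream
  return stream_model != nullptr ? 0 : 1;
}
```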
- -#ifndef FRONTEND_FBANK_H_ -#define FRONTEND_FBANK_H_ - -#include -#include -#include -#include -#include - -#include "frontend/fft.h" -#include "utils/log.h" - -namespace wenet { - -// This code is based on kaldi Fbank implementation, please see -// https://github.com/kaldi-asr/kaldi/blob/master/src/feat/feature-fbank.cc -class Fbank { - public: - Fbank(int num_bins, int sample_rate, int frame_length, int frame_shift) - : num_bins_(num_bins), - sample_rate_(sample_rate), - frame_length_(frame_length), - frame_shift_(frame_shift), - use_log_(true), - remove_dc_offset_(true), - generator_(0), - distribution_(0, 1.0), - dither_(0.0) { - fft_points_ = UpperPowerOfTwo(frame_length_); - // generate bit reversal table and trigonometric function table - const int fft_points_4 = fft_points_ / 4; - bitrev_.resize(fft_points_); - sintbl_.resize(fft_points_ + fft_points_4); - make_sintbl(fft_points_, sintbl_.data()); - make_bitrev(fft_points_, bitrev_.data()); - - int num_fft_bins = fft_points_ / 2; - float fft_bin_width = static_cast(sample_rate_) / fft_points_; - int low_freq = 20, high_freq = sample_rate_ / 2; - float mel_low_freq = MelScale(low_freq); - float mel_high_freq = MelScale(high_freq); - float mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1); - bins_.resize(num_bins_); - center_freqs_.resize(num_bins_); - for (int bin = 0; bin < num_bins; ++bin) { - float left_mel = mel_low_freq + bin * mel_freq_delta, - center_mel = mel_low_freq + (bin + 1) * mel_freq_delta, - right_mel = mel_low_freq + (bin + 2) * mel_freq_delta; - center_freqs_[bin] = InverseMelScale(center_mel); - std::vector this_bin(num_fft_bins); - int first_index = -1, last_index = -1; - for (int i = 0; i < num_fft_bins; ++i) { - float freq = (fft_bin_width * i); // Center frequency of this fft - // bin. 
- float mel = MelScale(freq); - if (mel > left_mel && mel < right_mel) { - float weight; - if (mel <= center_mel) - weight = (mel - left_mel) / (center_mel - left_mel); - else - weight = (right_mel - mel) / (right_mel - center_mel); - this_bin[i] = weight; - if (first_index == -1) first_index = i; - last_index = i; - } - } - CHECK(first_index != -1 && last_index >= first_index); - bins_[bin].first = first_index; - int size = last_index + 1 - first_index; - bins_[bin].second.resize(size); - for (int i = 0; i < size; ++i) { - bins_[bin].second[i] = this_bin[first_index + i]; - } - } - - // povey window - povey_window_.resize(frame_length_); - double a = M_2PI / (frame_length - 1); - for (int i = 0; i < frame_length; ++i) { - povey_window_[i] = pow(0.5 - 0.5 * cos(a * i), 0.85); - } - } - - void set_use_log(bool use_log) { use_log_ = use_log; } - - void set_remove_dc_offset(bool remove_dc_offset) { - remove_dc_offset_ = remove_dc_offset; - } - - void set_dither(float dither) { dither_ = dither; } - - int num_bins() const { return num_bins_; } - - static inline float InverseMelScale(float mel_freq) { - return 700.0f * (expf(mel_freq / 1127.0f) - 1.0f); - } - - static inline float MelScale(float freq) { - return 1127.0f * logf(1.0f + freq / 700.0f); - } - - static int UpperPowerOfTwo(int n) { - return static_cast(pow(2, ceil(log(n) / log(2)))); - } - - // pre emphasis - void PreEmphasis(float coeff, std::vector* data) const { - if (coeff == 0.0) return; - for (int i = data->size() - 1; i > 0; i--) - (*data)[i] -= coeff * (*data)[i - 1]; - (*data)[0] -= coeff * (*data)[0]; - } - - // Apply povey window on data in place - void Povey(std::vector* data) const { - CHECK_GE(data->size(), povey_window_.size()); - for (size_t i = 0; i < povey_window_.size(); ++i) { - (*data)[i] *= povey_window_[i]; - } - } - - // Compute fbank feat, return num frames - int Compute(const std::vector& wave, - std::vector>* feat) { - int num_samples = wave.size(); - if (num_samples < frame_length_) return 0; - int num_frames = 1 + ((num_samples - frame_length_) / frame_shift_); - feat->resize(num_frames); - std::vector fft_real(fft_points_, 0), fft_img(fft_points_, 0); - std::vector power(fft_points_ / 2); - for (int i = 0; i < num_frames; ++i) { - std::vector data(wave.data() + i * frame_shift_, - wave.data() + i * frame_shift_ + frame_length_); - // optional add noise - if (dither_ != 0.0) { - for (size_t j = 0; j < data.size(); ++j) - data[j] += dither_ * distribution_(generator_); - } - // optinal remove dc offset - if (remove_dc_offset_) { - float mean = 0.0; - for (size_t j = 0; j < data.size(); ++j) mean += data[j]; - mean /= data.size(); - for (size_t j = 0; j < data.size(); ++j) data[j] -= mean; - } - - PreEmphasis(0.97, &data); - Povey(&data); - // copy data to fft_real - memset(fft_img.data(), 0, sizeof(float) * fft_points_); - memset(fft_real.data() + frame_length_, 0, - sizeof(float) * (fft_points_ - frame_length_)); - memcpy(fft_real.data(), data.data(), sizeof(float) * frame_length_); - fft(bitrev_.data(), sintbl_.data(), fft_real.data(), fft_img.data(), - fft_points_); - // power - for (int j = 0; j < fft_points_ / 2; ++j) { - power[j] = fft_real[j] * fft_real[j] + fft_img[j] * fft_img[j]; - } - - (*feat)[i].resize(num_bins_); - // cepstral coefficients, triangle filter array - for (int j = 0; j < num_bins_; ++j) { - float mel_energy = 0.0; - int s = bins_[j].first; - for (size_t k = 0; k < bins_[j].second.size(); ++k) { - mel_energy += bins_[j].second[k] * power[s + k]; - } - // optional use log - if 
(use_log_) { - if (mel_energy < std::numeric_limits::epsilon()) - mel_energy = std::numeric_limits::epsilon(); - mel_energy = logf(mel_energy); - } - - (*feat)[i][j] = mel_energy; - } - } - return num_frames; - } - - private: - int num_bins_; - int sample_rate_; - int frame_length_, frame_shift_; - int fft_points_; - bool use_log_; - bool remove_dc_offset_; - std::vector center_freqs_; - std::vector>> bins_; - std::vector povey_window_; - std::default_random_engine generator_; - std::normal_distribution distribution_; - float dither_; - - // bit reversal table - std::vector bitrev_; - // trigonometric function table - std::vector sintbl_; -}; - -} // namespace wenet - -#endif // FRONTEND_FBANK_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/frontend/feature_pipeline.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/frontend/feature_pipeline.cc deleted file mode 100644 index ab450b15cd35ebd8101a3bcdec4f963a73bed10c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/frontend/feature_pipeline.cc +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2017 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
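Based on the deleted `Fbank` interface above, a minimal sketch of computing an 80-dimensional filterbank from raw 16 kHz samples, with the 25 ms / 10 ms window expressed in samples as the deleted `FeaturePipelineConfig` does; the synthetic tone is a stand-in for real audio.

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

#include "frontend/fbank.h"  // deleted header; sketch assumes it is available

int main() {
  const int sample_rate = 16000;
  const int frame_length = sample_rate / 1000 * 25;  // 400 samples = 25 ms
  const int frame_shift  = sample_rate / 1000 * 10;  // 160 samples = 10 ms
  wenet::Fbank fbank(80, sample_rate, frame_length, frame_shift);

  // One second of a 440 Hz tone in int16 float range as placeholder audio.
  const double kPi = 3.14159265358979323846;
  std::vector<float> wave(sample_rate);
  for (int i = 0; i < sample_rate; ++i)
    wave[i] = 32767.0f * std::sin(2.0 * kPi * 440.0 * i / sample_rate);

  std::vector<std::vector<float>> feats;
  int num_frames = fbank.Compute(wave, &feats);  // returns the frame count
  std::printf("frames=%d dim=%d\n", num_frames,
              num_frames > 0 ? static_cast<int>(feats[0].size()) : 0);
  return 0;
}
```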
- -#include "frontend/feature_pipeline.h" - -#include -#include - -namespace wenet { - -FeaturePipeline::FeaturePipeline(const FeaturePipelineConfig& config) - : config_(config), - feature_dim_(config.num_bins), - fbank_(config.num_bins, config.sample_rate, config.frame_length, - config.frame_shift), - num_frames_(0), - input_finished_(false) {} - -void FeaturePipeline::AcceptWaveform(const float* pcm, const int size) { - std::vector> feats; - std::vector waves; - waves.insert(waves.end(), remained_wav_.begin(), remained_wav_.end()); - waves.insert(waves.end(), pcm, pcm + size); - int num_frames = fbank_.Compute(waves, &feats); - feature_queue_.Push(std::move(feats)); - num_frames_ += num_frames; - - int left_samples = waves.size() - config_.frame_shift * num_frames; - remained_wav_.resize(left_samples); - std::copy(waves.begin() + config_.frame_shift * num_frames, waves.end(), - remained_wav_.begin()); - // We are still adding wave, notify input is not finished - finish_condition_.notify_one(); -} - -void FeaturePipeline::AcceptWaveform(const int16_t* pcm, const int size) { - auto* float_pcm = new float[size]; - for (size_t i = 0; i < size; i++) { - float_pcm[i] = static_cast(pcm[i]); - } - this->AcceptWaveform(float_pcm, size); - delete[] float_pcm; -} - -void FeaturePipeline::set_input_finished() { - CHECK(!input_finished_); - { - std::lock_guard lock(mutex_); - input_finished_ = true; - } - finish_condition_.notify_one(); -} - -bool FeaturePipeline::ReadOne(std::vector* feat) { - if (!feature_queue_.Empty()) { - *feat = std::move(feature_queue_.Pop()); - return true; - } else { - std::unique_lock lock(mutex_); - while (!input_finished_) { - // This will release the lock and wait for notify_one() - // from AcceptWaveform() or set_input_finished() - finish_condition_.wait(lock); - if (!feature_queue_.Empty()) { - *feat = std::move(feature_queue_.Pop()); - return true; - } - } - CHECK(input_finished_); - // Double check queue.empty, see issue#893 for detailed discussions. - if (!feature_queue_.Empty()) { - *feat = std::move(feature_queue_.Pop()); - return true; - } else { - return false; - } - } -} - -bool FeaturePipeline::Read(int num_frames, - std::vector>* feats) { - feats->clear(); - if (feature_queue_.Size() >= num_frames) { - *feats = std::move(feature_queue_.Pop(num_frames)); - return true; - } else { - std::unique_lock lock(mutex_); - while (!input_finished_) { - // This will release the lock and wait for notify_one() - // from AcceptWaveform() or set_input_finished() - finish_condition_.wait(lock); - if (feature_queue_.Size() >= num_frames) { - *feats = std::move(feature_queue_.Pop(num_frames)); - return true; - } - } - CHECK(input_finished_); - // Double check queue.empty, see issue#893 for detailed discussions. 
- if (feature_queue_.Size() >= num_frames) { - *feats = std::move(feature_queue_.Pop(num_frames)); - return true; - } else { - *feats = std::move(feature_queue_.Pop(feature_queue_.Size())); - return false; - } - } -} - -void FeaturePipeline::Reset() { - input_finished_ = false; - num_frames_ = 0; - remained_wav_.clear(); - feature_queue_.Clear(); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/frontend/feature_pipeline.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/frontend/feature_pipeline.h deleted file mode 100644 index 9918d6b573255795e0e665f0a9598c44be625c19..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/frontend/feature_pipeline.h +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) 2017 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef FRONTEND_FEATURE_PIPELINE_H_ -#define FRONTEND_FEATURE_PIPELINE_H_ - -#include -#include -#include -#include - -#include "frontend/fbank.h" -#include "utils/blocking_queue.h" -#include "utils/log.h" - -namespace wenet { - -struct FeaturePipelineConfig { - int num_bins; - int sample_rate; - int frame_length; - int frame_shift; - FeaturePipelineConfig(int num_bins, int sample_rate) - : num_bins(num_bins), // 80 dim fbank - sample_rate(sample_rate) { // 16k sample rate - frame_length = sample_rate / 1000 * 25; // frame length 25ms - frame_shift = sample_rate / 1000 * 10; // frame shift 10ms - } - - void Info() const { - LOG(INFO) << "feature pipeline config" - << " num_bins " << num_bins << " frame_length " << frame_length - << " frame_shift " << frame_shift; - } -}; - -// Typically, FeaturePipeline is used in two threads: one thread A calls -// AcceptWaveform() to add raw wav data and set_input_finished() to notice -// the end of input wav, another thread B (decoder thread) calls Read() to -// consume features.So a BlockingQueue is used to make this class thread safe. - -// The Read() is designed as a blocking method when there is no feature -// in feature_queue_ and the input is not finished. - -// See bin/decoder_main.cc, websocket/websocket_server.cc and -// decoder/torch_asr_decoder.cc for usage - -class FeaturePipeline { - public: - explicit FeaturePipeline(const FeaturePipelineConfig& config); - - // The feature extraction is done in AcceptWaveform(). - void AcceptWaveform(const float* pcm, const int size); - void AcceptWaveform(const int16_t* pcm, const int size); - - // Current extracted frames number. - int num_frames() const { return num_frames_; } - int feature_dim() const { return feature_dim_; } - const FeaturePipelineConfig& config() const { return config_; } - - // The caller should call this method when speech input is end. - // Never call AcceptWaveform() after calling set_input_finished() ! 
- void set_input_finished(); - bool input_finished() const { return input_finished_; } - - // Return False if input is finished and no feature could be read. - // Return True if a feature is read. - // This function is a blocking method. It will block the thread when - // there is no feature in feature_queue_ and the input is not finished. - bool ReadOne(std::vector* feat); - - // Read #num_frames frame features. - // Return False if less than #num_frames features are read and the - // input is finished. - // Return True if #num_frames features are read. - // This function is a blocking method when there is no feature - // in feature_queue_ and the input is not finished. - bool Read(int num_frames, std::vector>* feats); - - void Reset(); - bool IsLastFrame(int frame) const { - return input_finished_ && (frame == num_frames_ - 1); - } - - int NumQueuedFrames() const { return feature_queue_.Size(); } - - private: - const FeaturePipelineConfig& config_; - int feature_dim_; - Fbank fbank_; - - BlockingQueue> feature_queue_; - int num_frames_; - bool input_finished_; - - // The feature extraction is done in AcceptWaveform(). - // This waveform sample points are consumed by frame size. - // The residual waveform sample points after framing are - // kept to be used in next AcceptWaveform() calling. - std::vector remained_wav_; - - // Used to block the Read when there is no feature in feature_queue_ - // and the input is not finished. - mutable std::mutex mutex_; - std::condition_variable finish_condition_; -}; - -} // namespace wenet - -#endif // FRONTEND_FEATURE_PIPELINE_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/frontend/fft.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/frontend/fft.cc deleted file mode 100644 index 9e05f854e79ea733d0411045385e924c2670b7f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/frontend/fft.cc +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright (c) 2016 Network -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
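The comment block in the deleted `feature_pipeline.h` above describes a two-thread pattern: one thread feeds PCM via `AcceptWaveform()` and ends with `set_input_finished()`, while the decoder thread blocks on `ReadOne()`. A minimal sketch of that usage, assuming the deleted frontend headers are available and using silence as placeholder audio:

```cpp
#include <cstdint>
#include <thread>
#include <vector>

#include "frontend/feature_pipeline.h"  // deleted header; sketch only

int main() {
  wenet::FeaturePipelineConfig config(80, 16000);  // 80-dim fbank, 16 kHz
  wenet::FeaturePipeline pipeline(config);

  // Producer: push one second of (silent) PCM, then signal end of input.
  std::thread producer([&]() {
    std::vector<int16_t> pcm(16000, 0);
    pipeline.AcceptWaveform(pcm.data(), static_cast<int>(pcm.size()));
    pipeline.set_input_finished();
  });

  // Consumer: ReadOne() blocks until a frame is ready or input is finished.
  int frames = 0;
  std::vector<float> feat;
  while (pipeline.ReadOne(&feat)) ++frames;

  producer.join();
  return frames > 0 ? 0 : 1;
}
```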
- - -#include -#include -#include - -#include "frontend/fft.h" - -namespace wenet { - -void make_sintbl(int n, float* sintbl) { - int i, n2, n4, n8; - float c, s, dc, ds, t; - - n2 = n / 2; - n4 = n / 4; - n8 = n / 8; - t = sin(M_PI / n); - dc = 2 * t * t; - ds = sqrt(dc * (2 - dc)); - t = 2 * dc; - c = sintbl[n4] = 1; - s = sintbl[0] = 0; - for (i = 1; i < n8; ++i) { - c -= dc; - dc += t * c; - s += ds; - ds -= t * s; - sintbl[i] = s; - sintbl[n4 - i] = c; - } - if (n8 != 0) sintbl[n8] = sqrt(0.5); - for (i = 0; i < n4; ++i) sintbl[n2 - i] = sintbl[i]; - for (i = 0; i < n2 + n4; ++i) sintbl[i + n2] = -sintbl[i]; -} - -void make_bitrev(int n, int* bitrev) { - int i, j, k, n2; - - n2 = n / 2; - i = j = 0; - for (;;) { - bitrev[i] = j; - if (++i >= n) break; - k = n2; - while (k <= j) { - j -= k; - k /= 2; - } - j += k; - } -} - -// bitrev: bit reversal table -// sintbl: trigonometric function table -// x:real part -// y:image part -// n: fft length -int fft(const int* bitrev, const float* sintbl, float* x, float* y, int n) { - int i, j, k, ik, h, d, k2, n4, inverse; - float t, s, c, dx, dy; - - /* preparation */ - if (n < 0) { - n = -n; - inverse = 1; /* inverse transform */ - } else { - inverse = 0; - } - n4 = n / 4; - if (n == 0) { - return 0; - } - - /* bit reversal */ - for (i = 0; i < n; ++i) { - j = bitrev[i]; - if (i < j) { - t = x[i]; - x[i] = x[j]; - x[j] = t; - t = y[i]; - y[i] = y[j]; - y[j] = t; - } - } - - /* transformation */ - for (k = 1; k < n; k = k2) { - h = 0; - k2 = k + k; - d = n / k2; - for (j = 0; j < k; ++j) { - c = sintbl[h + n4]; - if (inverse) - s = -sintbl[h]; - else - s = sintbl[h]; - for (i = j; i < n; i += k2) { - ik = i + k; - dx = s * y[ik] + c * x[ik]; - dy = c * y[ik] - s * x[ik]; - x[ik] = x[i] - dx; - x[i] += dx; - y[ik] = y[i] - dy; - y[i] += dy; - } - h += d; - } - } - if (inverse) { - /* divide by n in case of the inverse transformation */ - for (i = 0; i < n; ++i) { - x[i] /= n; - y[i] /= n; - } - } - return 0; /* finished successfully */ -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/frontend/fft.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/frontend/fft.h deleted file mode 100644 index 6b92e406c44b4768eaee6e734f55bb39cd9af28b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/frontend/fft.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2016 Network -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
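The deleted `fft.cc` above is a radix-2 FFT driven by precomputed tables: `sintbl` needs `n + n/4` entries and `bitrev` needs `n`. A small standalone usage sketch transforming an 8-point unit impulse, whose spectrum should come out flat:

```cpp
#include <cstdio>
#include <vector>

#include "frontend/fft.h"  // deleted header; sketch assumes it is available

int main() {
  const int n = 8;                       // FFT length (power of two)
  std::vector<float> sintbl(n + n / 4);  // trig table, sized n + n/4
  std::vector<int> bitrev(n);            // bit-reversal table, sized n
  wenet::make_sintbl(n, sintbl.data());
  wenet::make_bitrev(n, bitrev.data());

  // Unit impulse: every output bin should be 1 + 0i after the transform.
  std::vector<float> re(n, 0.0f), im(n, 0.0f);
  re[0] = 1.0f;
  wenet::fft(bitrev.data(), sintbl.data(), re.data(), im.data(), n);

  for (int k = 0; k < n; ++k)
    std::printf("X[%d] = %g + %gi\n", k, re[k], im[k]);
  return 0;
}
```

Passing a negative length to `fft()` selects the inverse transform, which divides by `n` on the way out, as the deleted implementation shows.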
- - -#ifndef FRONTEND_FFT_H_ -#define FRONTEND_FFT_H_ - -#ifndef M_PI -#define M_PI 3.1415926535897932384626433832795 -#endif -#ifndef M_2PI -#define M_2PI 6.283185307179586476925286766559005 -#endif - -namespace wenet { - -// Fast Fourier Transform - -void make_sintbl(int n, float* sintbl); - -void make_bitrev(int n, int* bitrev); - -int fft(const int* bitrev, const float* sintbl, float* x, float* y, int n); - -} // namespace wenet - -#endif // FRONTEND_FFT_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/frontend/wav.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/frontend/wav.h deleted file mode 100644 index 688a049a940ebbdc83f24e59134fff22b7b09bfd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/frontend/wav.h +++ /dev/null @@ -1,241 +0,0 @@ -// Copyright (c) 2016 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef FRONTEND_WAV_H_ -#define FRONTEND_WAV_H_ - -#include -#include -#include -#include -#include - -#include - -#include "utils/log.h" - -namespace wenet { - -struct WavHeader { - char riff[4] = {'R', 'I', 'F', 'F'}; - unsigned int size = 0; - char wav[4] = {'W', 'A', 'V', 'E'}; - char fmt[4] = {'f', 'm', 't', ' '}; - unsigned int fmt_size = 16; - uint16_t format = 1; - uint16_t channels = 0; - unsigned int sample_rate = 0; - unsigned int bytes_per_second = 0; - uint16_t block_size = 0; - uint16_t bit = 0; - char data[4] = {'d', 'a', 't', 'a'}; - unsigned int data_size = 0; - - WavHeader() {} - - WavHeader(int num_samples, int num_channel, int sample_rate, - int bits_per_sample) { - data_size = num_samples * num_channel * (bits_per_sample / 8); - size = sizeof(WavHeader) - 8 + data_size; - channels = num_channel; - this->sample_rate = sample_rate; - bytes_per_second = sample_rate * num_channel * (bits_per_sample / 8); - block_size = num_channel * (bits_per_sample / 8); - bit = bits_per_sample; - } -}; - -class WavReader { - public: - WavReader() : data_(nullptr) {} - explicit WavReader(const std::string& filename) { Open(filename); } - - bool Open(const std::string& filename) { - FILE* fp = fopen(filename.c_str(), "rb"); - if (NULL == fp) { - LOG(WARNING) << "Error in read " << filename; - return false; - } - - WavHeader header; - fread(&header, 1, sizeof(header), fp); - if (header.fmt_size < 16) { - fprintf(stderr, - "WaveData: expect PCM format data " - "to have fmt chunk of at least size 16.\n"); - return false; - } else if (header.fmt_size > 16) { - int offset = 44 - 8 + header.fmt_size - 16; - fseek(fp, offset, SEEK_SET); - fread(header.data, 8, sizeof(char), fp); - } - // check "RIFF" "WAVE" "fmt " "data" - - // Skip any sub-chunks between "fmt" and "data". Usually there will - // be a single "fact" sub chunk, but on Windows there can also be a - // "list" sub chunk. - while (0 != strncmp(header.data, "data", 4)) { - // We will just ignore the data in these chunks. 
- fseek(fp, header.data_size, SEEK_CUR); - // read next sub chunk - fread(header.data, 8, sizeof(char), fp); - } - - num_channel_ = header.channels; - sample_rate_ = header.sample_rate; - bits_per_sample_ = header.bit; - int num_data = header.data_size / (bits_per_sample_ / 8); - data_ = new float[num_data]; - num_samples_ = num_data / num_channel_; - - for (int i = 0; i < num_data; ++i) { - switch (bits_per_sample_) { - case 8: { - char sample; - fread(&sample, 1, sizeof(char), fp); - data_[i] = static_cast(sample); - break; - } - case 16: { - int16_t sample; - fread(&sample, 1, sizeof(int16_t), fp); - data_[i] = static_cast(sample); - break; - } - case 32: { - int sample; - fread(&sample, 1, sizeof(int), fp); - data_[i] = static_cast(sample); - break; - } - default: - fprintf(stderr, "unsupported quantization bits"); - exit(1); - } - } - fclose(fp); - return true; - } - - int num_channel() const { return num_channel_; } - int sample_rate() const { return sample_rate_; } - int bits_per_sample() const { return bits_per_sample_; } - int num_samples() const { return num_samples_; } - - ~WavReader() { - delete[] data_; - } - - const float* data() const { return data_; } - - private: - int num_channel_; - int sample_rate_; - int bits_per_sample_; - int num_samples_; // sample points per channel - float* data_; -}; - -class WavWriter { - public: - WavWriter(const float* data, int num_samples, int num_channel, - int sample_rate, int bits_per_sample) - : data_(data), - num_samples_(num_samples), - num_channel_(num_channel), - sample_rate_(sample_rate), - bits_per_sample_(bits_per_sample) {} - - void Write(const std::string& filename) { - FILE* fp = fopen(filename.c_str(), "wb"); - WavHeader header(num_samples_, num_channel_, sample_rate_, - bits_per_sample_); - fwrite(&header, 1, sizeof(header), fp); - - for (int i = 0; i < num_samples_; ++i) { - for (int j = 0; j < num_channel_; ++j) { - switch (bits_per_sample_) { - case 8: { - char sample = static_cast(data_[i * num_channel_ + j]); - fwrite(&sample, 1, sizeof(sample), fp); - break; - } - case 16: { - int16_t sample = static_cast(data_[i * num_channel_ + j]); - fwrite(&sample, 1, sizeof(sample), fp); - break; - } - case 32: { - int sample = static_cast(data_[i * num_channel_ + j]); - fwrite(&sample, 1, sizeof(sample), fp); - break; - } - } - } - } - fclose(fp); - } - - private: - const float* data_; - int num_samples_; // total float points in data_ - int num_channel_; - int sample_rate_; - int bits_per_sample_; -}; - -class StreamWavWriter { - public: - StreamWavWriter(int num_channel, int sample_rate, int bits_per_sample) - : num_channel_(num_channel), - sample_rate_(sample_rate), - bits_per_sample_(bits_per_sample), - total_num_samples_(0) {} - - StreamWavWriter(const std::string& filename, int num_channel, - int sample_rate, int bits_per_sample) - : StreamWavWriter(num_channel, sample_rate, bits_per_sample) { - Open(filename); - } - - void Open(const std::string& filename) { - fp_ = fopen(filename.c_str(), "wb"); - fseek(fp_, sizeof(WavHeader), SEEK_SET); - } - - void Write(const int16_t* sample_data, size_t num_samples) { - fwrite(sample_data, sizeof(int16_t), num_samples, fp_); - total_num_samples_ += num_samples; - } - - void Close() { - WavHeader header(total_num_samples_, num_channel_, sample_rate_, - bits_per_sample_); - fseek(fp_, 0L, SEEK_SET); - fwrite(&header, 1, sizeof(header), fp_); - fclose(fp_); - } - - private: - FILE* fp_; - int num_channel_; - int sample_rate_; - int bits_per_sample_; - size_t total_num_samples_; -}; - -} 
// namespace wenet - -#endif // FRONTEND_WAV_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/grpc/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/grpc/CMakeLists.txt deleted file mode 100644 index 2a152dd0d38cdc17d2758d7dbd542cd974d5f0c6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/grpc/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -# compile wenet.proto -set(PROTO_DIR "${CMAKE_CURRENT_SOURCE_DIR}") -add_custom_command( - OUTPUT ${PROTO_DIR}/wenet.pb.cc - ${PROTO_DIR}/wenet.pb.h - ${PROTO_DIR}/wenet.grpc.pb.cc - ${PROTO_DIR}/wenet.grpc.pb.h - COMMAND ${protobuf_BINARY_DIR}/protoc - ARGS --grpc_out "${PROTO_DIR}" - --cpp_out "${PROTO_DIR}" - -I "${PROTO_DIR}" - --plugin=protoc-gen-grpc=${grpc_BINARY_DIR}/grpc_cpp_plugin - wenet.proto) - -# grpc_server/client -link_directories(${protobuf_BINARY_DIR}/lib) -add_library(wenet_grpc STATIC - grpc_client.cc - grpc_server.cc - wenet.pb.cc - wenet.grpc.pb.cc -) -target_link_libraries(wenet_grpc PUBLIC grpc++ grpc++_reflection decoder) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/grpc/grpc_client.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/grpc/grpc_client.cc deleted file mode 100644 index 7a2e3f6f384980b6566468213d3eead43a404070..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/grpc/grpc_client.cc +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
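Based on the `WavReader`/`WavWriter` interfaces in the deleted `wav.h` above, a short sketch that loads a PCM file and writes it back out unchanged; `in.wav` and `out.wav` are placeholder paths:

```cpp
#include <cstdio>

#include "frontend/wav.h"  // deleted header; sketch only

int main() {
  wenet::WavReader reader;
  if (!reader.Open("in.wav")) {  // placeholder input path
    std::fprintf(stderr, "failed to open in.wav\n");
    return 1;
  }
  std::printf("channels=%d rate=%d bits=%d samples=%d\n",
              reader.num_channel(), reader.sample_rate(),
              reader.bits_per_sample(), reader.num_samples());

  // data() holds num_samples() * num_channel() interleaved float samples.
  wenet::WavWriter writer(reader.data(), reader.num_samples(),
                          reader.num_channel(), reader.sample_rate(),
                          reader.bits_per_sample());
  writer.Write("out.wav");  // placeholder output path
  return 0;
}
```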
- -#include "grpc/grpc_client.h" - -#include "utils/log.h" - -namespace wenet { -using grpc::Channel; -using grpc::ClientContext; -using grpc::ClientReaderWriter; -using grpc::Status; -using wenet::Request; -using wenet::Response; - -GrpcClient::GrpcClient(const std::string& host, int port, int nbest, - bool continuous_decoding) - : host_(host), - port_(port), - nbest_(nbest), - continuous_decoding_(continuous_decoding) { - Connect(); - t_.reset(new std::thread(&GrpcClient::ReadLoopFunc, this)); -} - -void GrpcClient::Connect() { - channel_ = grpc::CreateChannel(host_ + ":" + std::to_string(port_), - grpc::InsecureChannelCredentials()); - stub_ = ASR::NewStub(channel_); - context_ = std::make_shared(); - stream_ = stub_->Recognize(context_.get()); - request_ = std::make_shared(); - response_ = std::make_shared(); - request_->mutable_decode_config()->set_nbest_config(nbest_); - request_->mutable_decode_config()->set_continuous_decoding_config( - continuous_decoding_); - stream_->Write(*request_); -} - -void GrpcClient::SendBinaryData(const void* data, size_t size) { - const int16_t* pdata = reinterpret_cast(data); - request_->set_audio_data(pdata, size); - stream_->Write(*request_); -} - -void GrpcClient::ReadLoopFunc() { - try { - while (stream_->Read(response_.get())) { - for (int i = 0; i < response_->nbest_size(); i++) { - // you can also traverse wordpieces like demonstrated above - LOG(INFO) << i + 1 << "best " << response_->nbest(i).sentence(); - } - if (response_->status() != Response_Status_ok) { - break; - } - if (response_->type() == Response_Type_speech_end) { - done_ = true; - break; - } - } - } catch (std::exception const& e) { - LOG(ERROR) << e.what(); - } -} - -void GrpcClient::Join() { - stream_->WritesDone(); - t_->join(); - Status status = stream_->Finish(); - if (!status.ok()) { - LOG(INFO) << "Recognize rpc failed."; - } -} -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/grpc/grpc_client.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/grpc/grpc_client.h deleted file mode 100644 index 36e36a0f5f5ec5bbb818009fe931e863eaa7fd60..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/grpc/grpc_client.h +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef GRPC_GRPC_CLIENT_H_ -#define GRPC_GRPC_CLIENT_H_ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "grpc/wenet.grpc.pb.h" -#include "utils/utils.h" - -namespace wenet { - -using grpc::Channel; -using grpc::ClientContext; -using grpc::ClientReaderWriter; -using wenet::ASR; -using wenet::Request; -using wenet::Response; - -class GrpcClient { - public: - GrpcClient(const std::string& host, int port, int nbest, - bool continuous_decoding); - - void SendBinaryData(const void* data, size_t size); - void ReadLoopFunc(); - void Join(); - bool done() const { return done_; } - - private: - void Connect(); - std::string host_; - int port_; - std::shared_ptr channel_{nullptr}; - std::unique_ptr stub_{nullptr}; - std::shared_ptr context_{nullptr}; - std::unique_ptr> stream_{nullptr}; - std::shared_ptr request_{nullptr}; - std::shared_ptr response_{nullptr}; - int nbest_ = 1; - bool continuous_decoding_ = false; - bool done_ = false; - std::unique_ptr t_{nullptr}; - - WENET_DISALLOW_COPY_AND_ASSIGN(GrpcClient); -}; - -} // namespace wenet - -#endif // GRPC_GRPC_CLIENT_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/grpc/grpc_server.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/grpc/grpc_server.cc deleted file mode 100644 index 26268bc02a2f2ea56bb24a1eb379a565f693429a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/grpc/grpc_server.cc +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "grpc/grpc_server.h" - -namespace wenet { - -using grpc::ServerReaderWriter; -using wenet::Request; -using wenet::Response; - -GrpcConnectionHandler::GrpcConnectionHandler( - ServerReaderWriter* stream, - std::shared_ptr request, std::shared_ptr response, - std::shared_ptr feature_config, - std::shared_ptr decode_config, - std::shared_ptr decode_resource) - : stream_(std::move(stream)), - request_(std::move(request)), - response_(std::move(response)), - feature_config_(std::move(feature_config)), - decode_config_(std::move(decode_config)), - decode_resource_(std::move(decode_resource)) {} - -void GrpcConnectionHandler::OnSpeechStart() { - LOG(INFO) << "Received speech start signal, start reading speech"; - got_start_tag_ = true; - response_->set_status(Response::ok); - response_->set_type(Response::server_ready); - stream_->Write(*response_); - feature_pipeline_ = std::make_shared(*feature_config_); - decoder_ = std::make_shared(feature_pipeline_, decode_resource_, - *decode_config_); - // Start decoder thread - decode_thread_ = std::make_shared( - &GrpcConnectionHandler::DecodeThreadFunc, this); -} - -void GrpcConnectionHandler::OnSpeechEnd() { - LOG(INFO) << "Received speech end signal"; - CHECK(feature_pipeline_ != nullptr); - feature_pipeline_->set_input_finished(); - got_end_tag_ = true; -} - -void GrpcConnectionHandler::OnPartialResult() { - LOG(INFO) << "Partial result"; - response_->set_status(Response::ok); - response_->set_type(Response::partial_result); - stream_->Write(*response_); -} - -void GrpcConnectionHandler::OnFinalResult() { - LOG(INFO) << "Final result"; - response_->set_status(Response::ok); - response_->set_type(Response::final_result); - stream_->Write(*response_); -} - -void GrpcConnectionHandler::OnFinish() { - // Send finish tag - response_->set_status(Response::ok); - response_->set_type(Response::speech_end); - stream_->Write(*response_); -} - -void GrpcConnectionHandler::OnSpeechData() { - // Read binary PCM data - const int16_t* pcm_data = - reinterpret_cast(request_->audio_data().c_str()); - int num_samples = request_->audio_data().length() / sizeof(int16_t); - VLOG(2) << "Received " << num_samples << " samples"; - CHECK(feature_pipeline_ != nullptr); - CHECK(decoder_ != nullptr); - feature_pipeline_->AcceptWaveform(pcm_data, num_samples); -} - -void GrpcConnectionHandler::SerializeResult(bool finish) { - for (const DecodeResult& path : decoder_->result()) { - Response_OneBest* one_best_ = response_->add_nbest(); - one_best_->set_sentence(path.sentence); - if (finish) { - for (const WordPiece& word_piece : path.word_pieces) { - Response_OnePiece* one_piece_ = one_best_->add_wordpieces(); - one_piece_->set_word(word_piece.word); - one_piece_->set_start(word_piece.start); - one_piece_->set_end(word_piece.end); - } - } - if (response_->nbest_size() == nbest_) { - break; - } - } - return; -} - -void GrpcConnectionHandler::DecodeThreadFunc() { - while (true) { - DecodeState state = decoder_->Decode(); - response_->clear_status(); - response_->clear_type(); - response_->clear_nbest(); - if (state == DecodeState::kEndFeats) { - decoder_->Rescoring(); - SerializeResult(true); - OnFinalResult(); - OnFinish(); - stop_recognition_ = true; - break; - } else if (state == DecodeState::kEndpoint) { - decoder_->Rescoring(); - SerializeResult(true); - OnFinalResult(); - // If it's not continuous decoding, continue to do next recognition - // otherwise stop the recognition - if (continuous_decoding_) { - decoder_->ResetContinuousDecoding(); - } else { - 
OnFinish(); - stop_recognition_ = true; - break; - } - } else { - if (decoder_->DecodedSomething()) { - SerializeResult(false); - OnPartialResult(); - } - } - } -} - -void GrpcConnectionHandler::operator()() { - try { - while (stream_->Read(request_.get())) { - if (!got_start_tag_) { - nbest_ = request_->decode_config().nbest_config(); - continuous_decoding_ = - request_->decode_config().continuous_decoding_config(); - OnSpeechStart(); - } else { - OnSpeechData(); - } - } - OnSpeechEnd(); - LOG(INFO) << "Read all pcm data, wait for decoding thread"; - if (decode_thread_ != nullptr) { - decode_thread_->join(); - } - } catch (std::exception const& e) { - LOG(ERROR) << e.what(); - } -} - -Status GrpcServer::Recognize(ServerContext* context, - ServerReaderWriter* stream) { - LOG(INFO) << "Get Recognize request" << std::endl; - auto request = std::make_shared(); - auto response = std::make_shared(); - GrpcConnectionHandler handler(stream, request, response, feature_config_, - decode_config_, decode_resource_); - std::thread t(std::move(handler)); - t.join(); - return Status::OK; -} -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/grpc/grpc_server.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/grpc/grpc_server.h deleted file mode 100644 index 3ab47ce5b15897c2a596d8ef27f2e7c4f8d26a3f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/grpc/grpc_server.h +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef GRPC_GRPC_SERVER_H_ -#define GRPC_GRPC_SERVER_H_ - -#include -#include -#include -#include -#include -#include - -#include "decoder/asr_decoder.h" -#include "frontend/feature_pipeline.h" -#include "utils/log.h" - -#include "grpc/wenet.grpc.pb.h" - -namespace wenet { - -using grpc::ServerContext; -using grpc::ServerReaderWriter; -using grpc::Status; -using wenet::ASR; -using wenet::Request; -using wenet::Response; - -class GrpcConnectionHandler { - public: - GrpcConnectionHandler(ServerReaderWriter* stream, - std::shared_ptr request, - std::shared_ptr response, - std::shared_ptr feature_config, - std::shared_ptr decode_config, - std::shared_ptr decode_resource); - void operator()(); - - private: - void OnSpeechStart(); - void OnSpeechEnd(); - void OnFinish(); - void OnSpeechData(); - void OnPartialResult(); - void OnFinalResult(); - void DecodeThreadFunc(); - void SerializeResult(bool finish); - - bool continuous_decoding_ = false; - int nbest_ = 1; - ServerReaderWriter* stream_; - std::shared_ptr request_; - std::shared_ptr response_; - std::shared_ptr feature_config_; - std::shared_ptr decode_config_; - std::shared_ptr decode_resource_; - - bool got_start_tag_ = false; - bool got_end_tag_ = false; - // When endpoint is detected, stop recognition, and stop receiving data. 
- bool stop_recognition_ = false; - std::shared_ptr feature_pipeline_ = nullptr; - std::shared_ptr decoder_ = nullptr; - std::shared_ptr decode_thread_ = nullptr; -}; - -class GrpcServer final : public ASR::Service { - public: - GrpcServer(std::shared_ptr feature_config, - std::shared_ptr decode_config, - std::shared_ptr decode_resource) - : feature_config_(std::move(feature_config)), - decode_config_(std::move(decode_config)), - decode_resource_(std::move(decode_resource)) {} - Status Recognize(ServerContext* context, - ServerReaderWriter* reader) override; - - private: - std::shared_ptr feature_config_; - std::shared_ptr decode_config_; - std::shared_ptr decode_resource_; - DISALLOW_COPY_AND_ASSIGN(GrpcServer); -}; - -} // namespace wenet - -#endif // GRPC_GRPC_SERVER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/grpc/wenet.proto b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/grpc/wenet.proto deleted file mode 100644 index 4c3033c034c513611c9159ff9db42b225be2cc98..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/grpc/wenet.proto +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
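The deleted `GrpcServer` above is an ordinary `ASR::Service` implementation, so bringing it up follows the standard `grpc::ServerBuilder` pattern. A hedged sketch; the `shared_ptr` element types are the ones assumed for the template arguments stripped from the deleted header (as in upstream WeNet), and the configs/resource are expected to be built elsewhere, e.g. from command-line flags:

```cpp
#include <memory>
#include <string>
#include <utility>

#include <grpcpp/grpcpp.h>

#include "grpc/grpc_server.h"  // deleted header; sketch only

void RunServer(int port,
               std::shared_ptr<wenet::FeaturePipelineConfig> feature_config,
               std::shared_ptr<wenet::DecodeOptions> decode_config,
               std::shared_ptr<wenet::DecodeResource> decode_resource) {
  wenet::GrpcServer service(std::move(feature_config),
                            std::move(decode_config),
                            std::move(decode_resource));

  grpc::ServerBuilder builder;
  const std::string address = "0.0.0.0:" + std::to_string(port);
  builder.AddListeningPort(address, grpc::InsecureServerCredentials());
  builder.RegisterService(&service);
  std::unique_ptr<grpc::Server> server(builder.BuildAndStart());
  server->Wait();  // block and serve Recognize() streams until shutdown
}
```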
-syntax = "proto3"; - -option java_package = "ex.grpc"; -option objc_class_prefix = "wenet"; - -package wenet; - -service ASR { - rpc Recognize (stream Request) returns (stream Response) {} -} - -message Request { - - message DecodeConfig { - int32 nbest_config = 1; - bool continuous_decoding_config = 2; - } - - oneof RequestPayload { - DecodeConfig decode_config = 1; - bytes audio_data = 2; - } -} - -message Response { - - message OneBest { - string sentence = 1; - repeated OnePiece wordpieces = 2; - } - - message OnePiece { - string word = 1; - int32 start = 2; - int32 end = 3; - } - - enum Status { - ok = 0; - failed = 1; - } - - enum Type { - server_ready = 0; - partial_result = 1; - final_result = 2; - speech_end = 3; - } - - Status status = 1; - Type type = 2; - repeated OneBest nbest = 3; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/CMakeLists.txt deleted file mode 100644 index b072309e44b90dcee44ea31e9bcbc1741e73f151..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/CMakeLists.txt +++ /dev/null @@ -1,54 +0,0 @@ -cmake_minimum_required(VERSION 3.10 FATAL_ERROR) - -project(kaldi) - -# include_directories() is called in the root CMakeLists.txt - -add_library(kaldi-util - base/kaldi-error.cc - base/kaldi-math.cc - util/kaldi-io.cc - util/parse-options.cc - util/simple-io-funcs.cc - util/text-utils.cc -) -target_link_libraries(kaldi-util PUBLIC utils) - -add_library(kaldi-decoder - lat/determinize-lattice-pruned.cc - lat/lattice-functions.cc - decoder/lattice-faster-decoder.cc - decoder/lattice-faster-online-decoder.cc -) -target_link_libraries(kaldi-decoder PUBLIC kaldi-util) - -if(GRAPH_TOOLS) - # Arpa binary - add_executable(arpa2fst - lm/arpa-file-parser.cc - lm/arpa-lm-compiler.cc - lmbin/arpa2fst.cc - ) - target_link_libraries(arpa2fst PUBLIC kaldi-util) - - # FST tools binary - set(FST_BINS - fstaddselfloops - fstdeterminizestar - fstisstochastic - fstminimizeencoded - fsttablecompose - ) - - if(NOT MSVC) - # dl is for dynamic linking, otherwise there is a linking error on linux - link_libraries(dl) - endif() - foreach(name IN LISTS FST_BINS) - add_executable(${name} - fstbin/${name}.cc - fstext/kaldi-fst-io.cc - ) - target_link_libraries(${name} PUBLIC kaldi-util) - endforeach() -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/README.md b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/README.md deleted file mode 100644 index 4eb9c9173b747686f00b658afc5e1e0dfdc17e68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/README.md +++ /dev/null @@ -1,21 +0,0 @@ -We use Kaldi decoder to implement TLG based language model integration, -so we copied related files to this directory. -The main changes are: - -1. To minimize the change, we use the same directories tree as Kaldi. - -2. We replace Kaldi log system with glog in the following way. 
- -``` c++ -#define KALDI_WARN \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_WARNING).stream() -#define KALDI_ERR \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_ERROR).stream() -#define KALDI_INFO \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_INFO).stream() -#define KALDI_VLOG(v) VLOG(v) - -#define KALDI_ASSERT(condition) CHECK(condition) -``` - -3. We lint all the files to satisfy the lint in WeNet. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/io-funcs-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/io-funcs-inl.h deleted file mode 100644 index 9397400833676b323492321183c989cec2f41c3f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/io-funcs-inl.h +++ /dev/null @@ -1,329 +0,0 @@ -// base/io-funcs-inl.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University; -// Jan Silovsky; Yanmin Qian; -// Johns Hopkins University (Author: Daniel Povey) -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_IO_FUNCS_INL_H_ -#define KALDI_BASE_IO_FUNCS_INL_H_ 1 - -// Do not include this file directly. It is included by base/io-funcs.h - -#include -#include -#include - -namespace kaldi { - -// Template that covers integers. -template -void WriteBasicType(std::ostream &os, bool binary, T t) { - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - char len_c = (std::numeric_limits::is_signed ? 1 : -1) * - static_cast(sizeof(t)); - os.put(len_c); - os.write(reinterpret_cast(&t), sizeof(t)); - } else { - if (sizeof(t) == 1) - os << static_cast(t) << " "; - else - os << t << " "; - } - if (os.fail()) { - KALDI_ERR << "Write failure in WriteBasicType."; - } -} - -// Template that covers integers. -template -inline void ReadBasicType(std::istream &is, bool binary, T *t) { - KALDI_PARANOID_ASSERT(t != NULL); - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - int len_c_in = is.get(); - if (len_c_in == -1) - KALDI_ERR << "ReadBasicType: encountered end of stream."; - char len_c = static_cast(len_c_in), - len_c_expected = (std::numeric_limits::is_signed ? 1 : -1) * - static_cast(sizeof(*t)); - if (len_c != len_c_expected) { - KALDI_ERR << "ReadBasicType: did not get expected integer type, " - << static_cast(len_c) << " vs. " - << static_cast(len_c_expected) - << ". You can change this code to successfully" - << " read it later, if needed."; - // insert code here to read "wrong" type. Might have a switch statement. 
- } - is.read(reinterpret_cast(t), sizeof(*t)); - } else { - if (sizeof(*t) == 1) { - int16 i; - is >> i; - *t = i; - } else { - is >> *t; - } - } - if (is.fail()) { - KALDI_ERR << "Read failure in ReadBasicType, file position is " - << is.tellg() << ", next char is " << is.peek(); - } -} - -// Template that covers integers. -template -inline void WriteIntegerPairVector(std::ostream &os, bool binary, - const std::vector > &v) { - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - char sz = sizeof(T); // this is currently just a check. - os.write(&sz, 1); - int32 vecsz = static_cast(v.size()); - KALDI_ASSERT((size_t)vecsz == v.size()); - os.write(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (vecsz != 0) { - os.write(reinterpret_cast(&(v[0])), sizeof(T) * vecsz * 2); - } - } else { - // focus here is on prettiness of text form rather than - // efficiency of reading-in. - // reading-in is dominated by low-level operations anyway: - // for efficiency use binary. - os << "[ "; - typename std::vector >::const_iterator iter = v.begin(), - end = v.end(); - for (; iter != end; ++iter) { - if (sizeof(T) == 1) - os << static_cast(iter->first) << ',' - << static_cast(iter->second) << ' '; - else - os << iter->first << ',' << iter->second << ' '; - } - os << "]\n"; - } - if (os.fail()) { - KALDI_ERR << "Write failure in WriteIntegerPairVector."; - } -} - -// Template that covers integers. -template -inline void ReadIntegerPairVector(std::istream &is, bool binary, - std::vector > *v) { - KALDI_ASSERT_IS_INTEGER_TYPE(T); - KALDI_ASSERT(v != NULL); - if (binary) { - int sz = is.peek(); - if (sz == sizeof(T)) { - is.get(); - } else { // this is currently just a check. - KALDI_ERR << "ReadIntegerPairVector: expected to see type of size " - << sizeof(T) << ", saw instead " << sz << ", at file position " - << is.tellg(); - } - int32 vecsz; - is.read(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (is.fail() || vecsz < 0) goto bad; - v->resize(vecsz); - if (vecsz > 0) { - is.read(reinterpret_cast(&((*v)[0])), sizeof(T) * vecsz * 2); - } - } else { - std::vector > tmp_v; // use temporary so v doesn't use - // extra memory due to resizing. - is >> std::ws; - if (is.peek() != static_cast('[')) { - KALDI_ERR << "ReadIntegerPairVector: expected to see [, saw " << is.peek() - << ", at file position " << is.tellg(); - } - is.get(); // consume the '['. - is >> std::ws; // consume whitespace. - while (is.peek() != static_cast(']')) { - if (sizeof(T) == 1) { // read/write chars as numbers. - int16 next_t1, next_t2; - is >> next_t1; - if (is.fail()) goto bad; - if (is.peek() != static_cast(',')) - KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " - << is.peek() << ", at file position " << is.tellg(); - is.get(); // consume the ','. - is >> next_t2 >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back(std::make_pair((T)next_t1, (T)next_t2)); - } else { - T next_t1, next_t2; - is >> next_t1; - if (is.fail()) goto bad; - if (is.peek() != static_cast(',')) - KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " - << is.peek() << ", at file position " << is.tellg(); - is.get(); // consume the ','. - is >> next_t2 >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back(std::pair(next_t1, next_t2)); - } - } - is.get(); // get the final ']'. - *v = tmp_v; // could use std::swap to use less temporary memory, but this - // uses less permanent memory. 
- } - if (!is.fail()) return; -bad: - KALDI_ERR << "ReadIntegerPairVector: read failure at file position " - << is.tellg(); -} - -template -inline void WriteIntegerVector(std::ostream &os, bool binary, - const std::vector &v) { - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - char sz = sizeof(T); // this is currently just a check. - os.write(&sz, 1); - int32 vecsz = static_cast(v.size()); - KALDI_ASSERT((size_t)vecsz == v.size()); - os.write(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (vecsz != 0) { - os.write(reinterpret_cast(&(v[0])), sizeof(T) * vecsz); - } - } else { - // focus here is on prettiness of text form rather than - // efficiency of reading-in. - // reading-in is dominated by low-level operations anyway: - // for efficiency use binary. - os << "[ "; - typename std::vector::const_iterator iter = v.begin(), end = v.end(); - for (; iter != end; ++iter) { - if (sizeof(T) == 1) - os << static_cast(*iter) << " "; - else - os << *iter << " "; - } - os << "]\n"; - } - if (os.fail()) { - KALDI_ERR << "Write failure in WriteIntegerVector."; - } -} - -template -inline void ReadIntegerVector(std::istream &is, bool binary, - std::vector *v) { - KALDI_ASSERT_IS_INTEGER_TYPE(T); - KALDI_ASSERT(v != NULL); - if (binary) { - int sz = is.peek(); - if (sz == sizeof(T)) { - is.get(); - } else { // this is currently just a check. - KALDI_ERR << "ReadIntegerVector: expected to see type of size " - << sizeof(T) << ", saw instead " << sz << ", at file position " - << is.tellg(); - } - int32 vecsz; - is.read(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (is.fail() || vecsz < 0) goto bad; - v->resize(vecsz); - if (vecsz > 0) { - is.read(reinterpret_cast(&((*v)[0])), sizeof(T) * vecsz); - } - } else { - std::vector tmp_v; // use temporary so v doesn't use extra memory - // due to resizing. - is >> std::ws; - if (is.peek() != static_cast('[')) { - KALDI_ERR << "ReadIntegerVector: expected to see [, saw " << is.peek() - << ", at file position " << is.tellg(); - } - is.get(); // consume the '['. - is >> std::ws; // consume whitespace. - while (is.peek() != static_cast(']')) { - if (sizeof(T) == 1) { // read/write chars as numbers. - int16 next_t; - is >> next_t >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back((T)next_t); - } else { - T next_t; - is >> next_t >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back(next_t); - } - } - is.get(); // get the final ']'. - *v = tmp_v; // could use std::swap to use less temporary memory, but this - // uses less permanent memory. - } - if (!is.fail()) return; -bad: - KALDI_ERR << "ReadIntegerVector: read failure at file position " - << is.tellg(); -} - -// Initialize an opened stream for writing by writing an optional binary -// header and modifying the floating-point precision. -inline void InitKaldiOutputStream(std::ostream &os, bool binary) { - // This does not throw exceptions (does not check for errors). - if (binary) { - os.put('\0'); - os.put('B'); - } - // Note, in non-binary mode we may at some point want to mess with - // the precision a bit. - // 7 is a bit more than the precision of float.. - if (os.precision() < 7) os.precision(7); -} - -/// Initialize an opened stream for reading by detecting the binary header and -// setting the "binary" value appropriately. -inline bool InitKaldiInputStream(std::istream &is, bool *binary) { - // Sets the 'binary' variable. 
- // Throws exception in the very unusual situation that stream - // starts with '\0' but not then 'B'. - - if (is.peek() == '\0') { // seems to be binary - is.get(); - if (is.peek() != 'B') { - return false; - } - is.get(); - *binary = true; - return true; - } else { - *binary = false; - return true; - } -} - -} // end namespace kaldi. - -#endif // KALDI_BASE_IO_FUNCS_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/io-funcs.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/io-funcs.cc deleted file mode 100644 index bd6c350780d1096ff8c452fd00864aa07a30ac65..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/io-funcs.cc +++ /dev/null @@ -1,215 +0,0 @@ -// base/io-funcs.cc - -// Copyright 2009-2011 Microsoft Corporation; Saarland University - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/io-funcs.h" -#include "base/kaldi-math.h" - -namespace kaldi { - -template <> -void WriteBasicType(std::ostream &os, bool binary, bool b) { - os << (b ? "T" : "F"); - if (!binary) os << " "; - if (os.fail()) KALDI_ERR << "Write failure in WriteBasicType"; -} - -template <> -void ReadBasicType(std::istream &is, bool binary, bool *b) { - KALDI_PARANOID_ASSERT(b != NULL); - if (!binary) is >> std::ws; // eat up whitespace. 
- char c = is.peek(); - if (c == 'T') { - *b = true; - is.get(); - } else if (c == 'F') { - *b = false; - is.get(); - } else { - KALDI_ERR << "Read failure in ReadBasicType, file position is " - << is.tellg() << ", next char is " << CharToString(c); - } -} - -template <> -void WriteBasicType(std::ostream &os, bool binary, float f) { - if (binary) { - char c = sizeof(f); - os.put(c); - os.write(reinterpret_cast(&f), sizeof(f)); - } else { - os << f << " "; - } -} - -template <> -void WriteBasicType(std::ostream &os, bool binary, double f) { - if (binary) { - char c = sizeof(f); - os.put(c); - os.write(reinterpret_cast(&f), sizeof(f)); - } else { - os << f << " "; - } -} - -template <> -void ReadBasicType(std::istream &is, bool binary, float *f) { - KALDI_PARANOID_ASSERT(f != NULL); - if (binary) { - double d; - int c = is.peek(); - if (c == sizeof(*f)) { - is.get(); - is.read(reinterpret_cast(f), sizeof(*f)); - } else if (c == sizeof(d)) { - ReadBasicType(is, binary, &d); - *f = d; - } else { - KALDI_ERR << "ReadBasicType: expected float, saw " << is.peek() - << ", at file position " << is.tellg(); - } - } else { - is >> *f; - } - if (is.fail()) { - KALDI_ERR << "ReadBasicType: failed to read, at file position " - << is.tellg(); - } -} - -template <> -void ReadBasicType(std::istream &is, bool binary, double *d) { - KALDI_PARANOID_ASSERT(d != NULL); - if (binary) { - float f; - int c = is.peek(); - if (c == sizeof(*d)) { - is.get(); - is.read(reinterpret_cast(d), sizeof(*d)); - } else if (c == sizeof(f)) { - ReadBasicType(is, binary, &f); - *d = f; - } else { - KALDI_ERR << "ReadBasicType: expected float, saw " << is.peek() - << ", at file position " << is.tellg(); - } - } else { - is >> *d; - } - if (is.fail()) { - KALDI_ERR << "ReadBasicType: failed to read, at file position " - << is.tellg(); - } -} - -void CheckToken(const char *token) { - if (*token == '\0') KALDI_ERR << "Token is empty (not a valid token)"; - const char *orig_token = token; - while (*token != '\0') { - if (::isspace(*token)) - KALDI_ERR << "Token is not a valid token (contains space): '" - << orig_token << "'"; - token++; - } -} - -void WriteToken(std::ostream &os, bool binary, const char *token) { - // binary mode is ignored; - // we use space as termination character in either case. - KALDI_ASSERT(token != NULL); - CheckToken(token); // make sure it's valid (can be read back) - os << token << " "; - if (os.fail()) { - KALDI_ERR << "Write failure in WriteToken."; - } -} - -int Peek(std::istream &is, bool binary) { - if (!binary) is >> std::ws; // eat up whitespace. - return is.peek(); -} - -void WriteToken(std::ostream &os, bool binary, const std::string &token) { - WriteToken(os, binary, token.c_str()); -} - -void ReadToken(std::istream &is, bool binary, std::string *str) { - KALDI_ASSERT(str != NULL); - if (!binary) is >> std::ws; // consume whitespace. - is >> *str; - if (is.fail()) { - KALDI_ERR << "ReadToken, failed to read token at file position " - << is.tellg(); - } - if (!isspace(is.peek())) { - KALDI_ERR << "ReadToken, expected space after token, saw instead " - << CharToString(static_cast(is.peek())) - << ", at file position " << is.tellg(); - } - is.get(); // consume the space. -} - -int PeekToken(std::istream &is, bool binary) { - if (!binary) is >> std::ws; // consume whitespace. - bool read_bracket; - if (static_cast(is.peek()) == '<') { - read_bracket = true; - is.get(); - } else { - read_bracket = false; - } - int ans = is.peek(); - if (read_bracket) { - if (!is.unget()) { - // Clear the bad bit. 
This code can be (and is in fact) reached, since the - // C++ standard does not guarantee that a call to unget() must succeed. - is.clear(); - } - } - return ans; -} - -void ExpectToken(std::istream &is, bool binary, const char *token) { - int pos_at_start = is.tellg(); - KALDI_ASSERT(token != NULL); - CheckToken(token); // make sure it's valid (can be read back) - if (!binary) is >> std::ws; // consume whitespace. - std::string str; - is >> str; - is.get(); // consume the space. - if (is.fail()) { - KALDI_ERR << "Failed to read token [started at file position " - << pos_at_start << "], expected " << token; - } - // The second half of the '&&' expression below is so that if we're expecting - // "", we will accept "Foo>" instead. This is so that the model-reading - // code will tolerate errors in PeekToken where is.unget() failed; search for - // is.clear() in PeekToken() for an explanation. - if (strcmp(str.c_str(), token) != 0 && - !(token[0] == '<' && strcmp(str.c_str(), token + 1) == 0)) { - KALDI_ERR << "Expected token \"" << token << "\", got instead \"" << str - << "\"."; - } -} - -void ExpectToken(std::istream &is, bool binary, const std::string &token) { - ExpectToken(is, binary, token.c_str()); -} - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/io-funcs.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/io-funcs.h deleted file mode 100644 index 06ad1e3d2d8dc8385886a7c6653f620642c7c05a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/io-funcs.h +++ /dev/null @@ -1,246 +0,0 @@ -// base/io-funcs.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University; -// Jan Silovsky; Yanmin Qian -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_IO_FUNCS_H_ -#define KALDI_BASE_IO_FUNCS_H_ - -// This header only contains some relatively low-level I/O functions. -// The full Kaldi I/O declarations are in ../util/kaldi-io.h -// and ../util/kaldi-table.h -// They were put in util/ in order to avoid making the Matrix library -// dependent on them. - -#include -#include -#include -#include - -#include "base/io-funcs-inl.h" -#include "base/kaldi-common.h" - -namespace kaldi { - -/* - This comment describes the Kaldi approach to I/O. All objects can be written - and read in two modes: binary and text. In addition we want to make the I/O - work if we redefine the typedef "BaseFloat" between floats and doubles. - We also want to have control over whitespace in text mode without affecting - the meaning of the file, for pretty-printing purposes. - - Errors are handled by throwing a KaldiFatalError exception. 
- - For integer and floating-point types (and boolean values): - - WriteBasicType(std::ostream &, bool binary, const T&); - ReadBasicType(std::istream &, bool binary, T*); - - and we expect these functions to be defined in such a way that they work when - the type T changes between float and double, so you can read float into double - and vice versa]. Note that for efficiency and space-saving reasons, the - Vector and Matrix classes do not use these functions [but they preserve the - type interchangeability in their own way] - - For a class (or struct) C: - class C { - .. - Write(std::ostream &, bool binary, [possibly extra optional args for - specific classes]) const; Read(std::istream &, bool binary, [possibly extra - optional args for specific classes]); - .. - } - NOTE: The only actual optional args we used are the "add" arguments in - Vector/Matrix classes, which specify whether we should sum the data already - in the class with the data being read. - - For types which are typedef's involving stl classes, I/O is as follows: - typedef std::vector > MyTypedefName; - - The user should define something like: - - WriteMyTypedefName(std::ostream &, bool binary, const MyTypedefName &t); - ReadMyTypedefName(std::ostream &, bool binary, MyTypedefName *t); - - The user would have to write these functions. - - For a type std::vector: - - void WriteIntegerVector(std::ostream &os, bool binary, const std::vector - &v); void ReadIntegerVector(std::istream &is, bool binary, std::vector *v); - - For other types, e.g. vectors of pairs, the user should create a routine of - the type WriteMyTypedefName. This is to avoid introducing confusing templated - functions; we could easily create templated functions to handle most of these - cases but they would have to share the same name. - - It also often happens that the user needs to write/read special tokens as part - of a file. These might be class headers, or separators/identifiers in the - class. We provide special functions for manipulating these. These special - tokens must be nonempty and must not contain any whitespace. - - void WriteToken(std::ostream &os, bool binary, const char*); - void WriteToken(std::ostream &os, bool binary, const std::string & token); - int Peek(std::istream &is, bool binary); - void ReadToken(std::istream &is, bool binary, std::string *str); - void PeekToken(std::istream &is, bool binary, std::string *str); - - WriteToken writes the token and one space (whether in binary or text mode). - - Peek returns the first character of the next token, by consuming whitespace - (in text mode) and then returning the peek() character. It returns -1 at EOF; - it doesn't throw. It's useful if a class can have various forms based on - typedefs and virtual classes, and wants to know which version to read. - - ReadToken allows the caller to obtain the next token. PeekToken works just - like ReadToken, but seeks back to the beginning of the token. A subsequent - call to ReadToken will read the same token again. This is useful when - different object types are written to the same file; using PeekToken one can - decide which of the objects to read. - - There is currently no special functionality for writing/reading strings (where - the strings contain data rather than "special tokens" that are whitespace-free - and nonempty). This is because Kaldi is structured in such a way that strings - don't appear, except as OpenFst symbol table entries (and these have their own - format). 
- - - NOTE: you should not call ReadIntegerType and WriteIntegerType with types, - such as int and size_t, that are machine-independent -- at least not - if you want your file formats to port between machines. Use int32 and - int64 where necessary. There is no way to detect this using compile-time - assertions because C++ only keeps track of the internal representation of - the type. -*/ - -/// \addtogroup io_funcs_basic -/// @{ - -/// WriteBasicType is the name of the write function for bool, integer types, -/// and floating-point types. They all throw on error. -template -void WriteBasicType(std::ostream &os, bool binary, T t); - -/// ReadBasicType is the name of the read function for bool, integer types, -/// and floating-point types. They all throw on error. -template -void ReadBasicType(std::istream &is, bool binary, T *t); - -// Declare specialization for bool. -template <> -void WriteBasicType(std::ostream &os, bool binary, bool b); - -template <> -void ReadBasicType(std::istream &is, bool binary, bool *b); - -// Declare specializations for float and double. -template <> -void WriteBasicType(std::ostream &os, bool binary, float f); - -template <> -void WriteBasicType(std::ostream &os, bool binary, double f); - -template <> -void ReadBasicType(std::istream &is, bool binary, float *f); - -template <> -void ReadBasicType(std::istream &is, bool binary, double *f); - -// Define ReadBasicType that accepts an "add" parameter to add to -// the destination. Caution: if used in Read functions, be careful -// to initialize the parameters concerned to zero in the default -// constructor. -template -inline void ReadBasicType(std::istream &is, bool binary, T *t, bool add) { - if (!add) { - ReadBasicType(is, binary, t); - } else { - T tmp = T(0); - ReadBasicType(is, binary, &tmp); - *t += tmp; - } -} - -/// Function for writing STL vectors of integer types. -template -inline void WriteIntegerVector(std::ostream &os, bool binary, - const std::vector &v); - -/// Function for reading STL vector of integer types. -template -inline void ReadIntegerVector(std::istream &is, bool binary, std::vector *v); - -/// Function for writing STL vectors of pairs of integer types. -template -inline void WriteIntegerPairVector(std::ostream &os, bool binary, - const std::vector > &v); - -/// Function for reading STL vector of pairs of integer types. -template -inline void ReadIntegerPairVector(std::istream &is, bool binary, - std::vector > *v); - -/// The WriteToken functions are for writing nonempty sequences of non-space -/// characters. They are not for general strings. -void WriteToken(std::ostream &os, bool binary, const char *token); -void WriteToken(std::ostream &os, bool binary, const std::string &token); - -/// Peek consumes whitespace (if binary == false) and then returns the peek() -/// value of the stream. -int Peek(std::istream &is, bool binary); - -/// ReadToken gets the next token and puts it in str (exception on failure). If -/// PeekToken() had been previously called, it is possible that the stream had -/// failed to unget the starting '<' character. In this case ReadToken() returns -/// the token string without the leading '<'. You must be prepared to handle -/// this case. ExpectToken() handles this internally, and is not affected. -void ReadToken(std::istream &is, bool binary, std::string *token); - -/// PeekToken will return the first character of the next token, or -1 if end of -/// file. 
It's the same as Peek(), except if the first character is '<' it will -/// skip over it and will return the next character. It will attempt to unget -/// the '<' so the stream is where it was before you did PeekToken(), however, -/// this is not guaranteed (see ReadToken()). -int PeekToken(std::istream &is, bool binary); - -/// ExpectToken tries to read in the given token, and throws an exception -/// on failure. -void ExpectToken(std::istream &is, bool binary, const char *token); -void ExpectToken(std::istream &is, bool binary, const std::string &token); - -/// ExpectPretty attempts to read the text in "token", but only in non-binary -/// mode. Throws exception on failure. It expects an exact match except that -/// arbitrary whitespace matches arbitrary whitespace. -void ExpectPretty(std::istream &is, bool binary, const char *token); -void ExpectPretty(std::istream &is, bool binary, const std::string &token); - -/// @} end "addtogroup io_funcs_basic" - -/// InitKaldiOutputStream initializes an opened stream for writing by writing an -/// optional binary header and modifying the floating-point precision; it will -/// typically not be called by users directly. -inline void InitKaldiOutputStream(std::ostream &os, bool binary); - -/// InitKaldiInputStream initializes an opened stream for reading by detecting -/// the binary header and setting the "binary" value appropriately; -/// It will typically not be called by users directly. -inline bool InitKaldiInputStream(std::istream &is, bool *binary); - -} // end namespace kaldi. -#endif // KALDI_BASE_IO_FUNCS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/kaldi-common.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/kaldi-common.h deleted file mode 100644 index eee5f34d7234e7c029e6bb59584d3ee65ff5a875..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/kaldi-common.h +++ /dev/null @@ -1,41 +0,0 @@ -// base/kaldi-common.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef KALDI_BASE_KALDI_COMMON_H_ -#define KALDI_BASE_KALDI_COMMON_H_ 1 - -#include -#include -#include // C string stuff like strcpy -#include -#include -#include -#include -#include -#include -#include - -#include "base/kaldi-utils.h" -#include "base/kaldi-error.h" -#include "base/kaldi-types.h" -// #include "base/io-funcs.h" -#include "base/kaldi-math.h" -// #include "base/timer.h" - -#endif // KALDI_BASE_KALDI_COMMON_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/kaldi-error.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/kaldi-error.cc deleted file mode 100644 index 77edc6af6e56bb8fa3431d519e58fda9ee0bac6a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/kaldi-error.cc +++ /dev/null @@ -1,42 +0,0 @@ -// base/kaldi-error.cc - -// Copyright 2019 LAIX (Yi Sun) -// Copyright 2019 SmartAction LLC (kkm) -// Copyright 2016 Brno University of Technology (author: Karel Vesely) -// Copyright 2009-2011 Microsoft Corporation; Lukas Burget; Ondrej Glembek - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-error.h" - -#include - -namespace kaldi { - -/***** GLOBAL VARIABLES FOR LOGGING *****/ - -int32 g_kaldi_verbose_level = 0; -static std::string program_name; // NOLINT - -void SetProgramName(const char *basename) { - // Using the 'static std::string' for the program name is mostly harmless, - // because (a) Kaldi logging is undefined before main(), and (b) no stdc++ - // string implementation has been found in the wild that would not be just - // an empty string when zero-initialized but not yet constructed. - program_name = basename; -} - -} // namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/kaldi-error.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/kaldi-error.h deleted file mode 100644 index 0f65db372b5f05a8017433eed7c95badc819a0a6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/kaldi-error.h +++ /dev/null @@ -1,57 +0,0 @@ -// base/kaldi-error.h - -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
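The `kaldi-error.h` removed just below (as the deleted `kaldi/README.md` earlier in this hunk explains) routes Kaldi's logging macros onto glog. A minimal, self-contained sketch of what code written against those macros resolves to — assuming only that glog is installed; the `main()` body and the messages are illustrative, not taken from the patch.

```cpp
// Illustrative only: mirrors two of the macro definitions from the deleted
// kaldi-error.h and shows how they behave once they resolve to glog.
#include <glog/logging.h>

#define KALDI_LOG \
  google::LogMessage(__FILE__, __LINE__, google::GLOG_INFO).stream()
#define KALDI_WARN \
  google::LogMessage(__FILE__, __LINE__, google::GLOG_WARNING).stream()
#define KALDI_ASSERT(condition) CHECK(condition)

int main(int argc, char* argv[]) {
  FLAGS_logtostderr = true;            // print to stderr instead of log files
  google::InitGoogleLogging(argv[0]);  // analogous role to kaldi::SetProgramName()
  KALDI_LOG << "TLG decoder resources loaded";
  KALDI_WARN << "verbose level is " << 0;
  KALDI_ASSERT(1 + 1 == 2);            // CHECK() aborts the program on failure
  return 0;
}
```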
- -#ifndef KALDI_BASE_KALDI_ERROR_H_ -#define KALDI_BASE_KALDI_ERROR_H_ 1 - -#include "utils/log.h" - -namespace kaldi { - -#define KALDI_WARN \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_WARNING).stream() -#define KALDI_ERR \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_ERROR).stream() -#define KALDI_LOG \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_INFO).stream() -#define KALDI_VLOG(v) VLOG(v) - -#define KALDI_ASSERT(condition) CHECK(condition) - - -/***** PROGRAM NAME AND VERBOSITY LEVEL *****/ - -/// Called by ParseOptions to set base name (no directory) of the executing -/// program. The name is printed in logging code along with every message, -/// because in our scripts, we often mix together the stderr of many programs. -/// This function is very thread-unsafe. -void SetProgramName(const char *basename); - -/// This is set by util/parse-options.{h,cc} if you set --verbose=? option. -/// Do not use directly, prefer {Get,Set}VerboseLevel(). -extern int32 g_kaldi_verbose_level; - -/// Get verbosity level, usually set via command line '--verbose=' switch. -inline int32 GetVerboseLevel() { return g_kaldi_verbose_level; } - -/// This should be rarely used, except by programs using Kaldi as library; -/// command-line programs set the verbose level automatically from ParseOptions. -inline void SetVerboseLevel(int32 i) { g_kaldi_verbose_level = i; } - -} // namespace kaldi - -#endif // KALDI_BASE_KALDI_ERROR_H_ - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/kaldi-math.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/kaldi-math.cc deleted file mode 100644 index 175d9f49b6c5216645e90e146f4e2eab5572c342..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/kaldi-math.cc +++ /dev/null @@ -1,164 +0,0 @@ -// base/kaldi-math.cc - -// Copyright 2009-2011 Microsoft Corporation; Yanmin Qian; -// Saarland University; Jan Silovsky - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-math.h" -#ifndef _MSC_VER -#include -#include -#endif -#include -#include - -namespace kaldi { -// These routines are tested in matrix/matrix-test.cc - -int32 RoundUpToNearestPowerOfTwo(int32 n) { - KALDI_ASSERT(n > 0); - n--; - n |= n >> 1; - n |= n >> 2; - n |= n >> 4; - n |= n >> 8; - n |= n >> 16; - return n+1; -} - -static std::mutex _RandMutex; - -int Rand(struct RandomState* state) { -#if !defined(_POSIX_THREAD_SAFE_FUNCTIONS) - // On Windows and Cygwin, just call Rand() - return rand(); -#else - if (state) { - return rand_r(&(state->seed)); - } else { - std::lock_guard lock(_RandMutex); - return rand(); - } -#endif -} - -RandomState::RandomState() { - // we initialize it as Rand() + 27437 instead of just Rand(), because on some - // systems, e.g. 
at the very least Mac OSX Yosemite and later, it seems to be - // the case that rand_r when initialized with rand() will give you the exact - // same sequence of numbers that rand() will give if you keep calling rand() - // after that initial call. This can cause problems with repeated sequences. - // For example if you initialize two RandomState structs one after the other - // without calling rand() in between, they would give you the same sequence - // offset by one (if we didn't have the "+ 27437" in the code). 27437 is just - // a randomly chosen prime number. - seed = unsigned(Rand()) + 27437; -} - -bool WithProb(BaseFloat prob, struct RandomState* state) { - KALDI_ASSERT(prob >= 0 && prob <= 1.1); // prob should be <= 1.0, - // but we allow slightly larger values that could arise from roundoff in - // previous calculations. - KALDI_COMPILE_TIME_ASSERT(RAND_MAX > 128 * 128); - if (prob == 0) { - return false; - } else if (prob == 1.0) { - return true; - } else if (prob * RAND_MAX < 128.0) { - // prob is very small but nonzero, and the "main algorithm" - // wouldn't work that well. So: with probability 1/128, we - // return WithProb (prob * 128), else return false. - if (Rand(state) < RAND_MAX / 128) { // with probability 128... - // Note: we know that prob * 128.0 < 1.0, because - // we asserted RAND_MAX > 128 * 128. - return WithProb(prob * 128.0); - } else { - return false; - } - } else { - return (Rand(state) < ((RAND_MAX + static_cast(1.0)) * prob)); - } -} - -int32 RandInt(int32 min_val, int32 max_val, struct RandomState* state) { - // This is not exact. - KALDI_ASSERT(max_val >= min_val); - if (max_val == min_val) return min_val; - -#ifdef _MSC_VER - // RAND_MAX is quite small on Windows -> may need to handle larger numbers. - if (RAND_MAX > (max_val-min_val)*8) { - // *8 to avoid large inaccuracies in probability, from the modulus... - return min_val + - ((unsigned int)Rand(state) % (unsigned int)(max_val+1-min_val)); - } else { - if ((unsigned int)(RAND_MAX*RAND_MAX) > - (unsigned int)((max_val+1-min_val)*8)) { - // *8 to avoid inaccuracies in probability, from the modulus... - return min_val + ( (unsigned int)( (Rand(state)+RAND_MAX*Rand(state))) - % (unsigned int)(max_val+1-min_val)); - } else { - KALDI_ERR << "rand_int failed because we do not support such large " - "random numbers. (Extend this function)."; - } - } -#else - return min_val + - (static_cast(Rand(state)) % static_cast(max_val+1-min_val)); -#endif -} - -// Returns poisson-distributed random number. -// Take care: this takes time proportional -// to lambda. Faster algorithms exist but are more complex. -int32 RandPoisson(float lambda, struct RandomState* state) { - // Knuth's algorithm. - KALDI_ASSERT(lambda >= 0); - float L = expf(-lambda), p = 1.0; - int32 k = 0; - do { - k++; - float u = RandUniform(state); - p *= u; - } while (p > L); - return k-1; -} - -void RandGauss2(float *a, float *b, RandomState *state) { - KALDI_ASSERT(a); - KALDI_ASSERT(b); - float u1 = RandUniform(state); - float u2 = RandUniform(state); - u1 = sqrtf(-2.0f * logf(u1)); - u2 = 2.0f * M_PI * u2; - *a = u1 * cosf(u2); - *b = u1 * sinf(u2); -} - -void RandGauss2(double *a, double *b, RandomState *state) { - KALDI_ASSERT(a); - KALDI_ASSERT(b); - float a_float, b_float; - // Just because we're using doubles doesn't mean we need super-high-quality - // random numbers, so we just use the floating-point version internally. 
- RandGauss2(&a_float, &b_float, state); - *a = a_float; - *b = b_float; -} - - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/kaldi-math.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/kaldi-math.h deleted file mode 100644 index 93c265ee96e704893da26b9083a44a9e60c6c192..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/kaldi-math.h +++ /dev/null @@ -1,363 +0,0 @@ -// base/kaldi-math.h - -// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; Yanmin Qian; -// Jan Silovsky; Saarland University -// -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_KALDI_MATH_H_ -#define KALDI_BASE_KALDI_MATH_H_ 1 - -#ifdef _MSC_VER -#include -#endif - -#include -#include -#include - -#include "base/kaldi-types.h" -#include "base/kaldi-common.h" - - -#ifndef DBL_EPSILON -#define DBL_EPSILON 2.2204460492503131e-16 -#endif -#ifndef FLT_EPSILON -#define FLT_EPSILON 1.19209290e-7f -#endif - -#ifndef M_PI -#define M_PI 3.1415926535897932384626433832795 -#endif - -#ifndef M_SQRT2 -#define M_SQRT2 1.4142135623730950488016887 -#endif - -#ifndef M_2PI -#define M_2PI 6.283185307179586476925286766559005 -#endif - -#ifndef M_SQRT1_2 -#define M_SQRT1_2 0.7071067811865475244008443621048490 -#endif - -#ifndef M_LOG_2PI -#define M_LOG_2PI 1.8378770664093454835606594728112 -#endif - -#ifndef M_LN2 -#define M_LN2 0.693147180559945309417232121458 -#endif - -#ifndef M_LN10 -#define M_LN10 2.302585092994045684017991454684 -#endif - - -#define KALDI_ISNAN std::isnan -#define KALDI_ISINF std::isinf -#define KALDI_ISFINITE(x) std::isfinite(x) - -#if !defined(KALDI_SQR) -# define KALDI_SQR(x) ((x) * (x)) -#endif - -namespace kaldi { - -#if !defined(_MSC_VER) || (_MSC_VER >= 1900) -inline double Exp(double x) { return exp(x); } -#ifndef KALDI_NO_EXPF -inline float Exp(float x) { return expf(x); } -#else -inline float Exp(float x) { return exp(static_cast(x)); } -#endif // KALDI_NO_EXPF -#else -inline double Exp(double x) { return exp(x); } -#if !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64) -// Microsoft CL v18.0 buggy 64-bit implementation of -// expf() incorrectly returns -inf for exp(-inf). 
-inline float Exp(float x) { return exp(static_cast(x)); } -#else -inline float Exp(float x) { return expf(x); } -#endif // !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64) -#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900) - -inline double Log(double x) { return log(x); } -inline float Log(float x) { return logf(x); } - -#if !defined(_MSC_VER) || (_MSC_VER >= 1700) -inline double Log1p(double x) { return log1p(x); } -inline float Log1p(float x) { return log1pf(x); } -#else -inline double Log1p(double x) { - const double cutoff = 1.0e-08; - if (x < cutoff) - return x - 0.5 * x * x; - else - return Log(1.0 + x); -} - -inline float Log1p(float x) { - const float cutoff = 1.0e-07; - if (x < cutoff) - return x - 0.5 * x * x; - else - return Log(1.0 + x); -} -#endif - -static const double kMinLogDiffDouble = Log(DBL_EPSILON); // negative! -static const float kMinLogDiffFloat = Log(FLT_EPSILON); // negative! - -// -infinity -const float kLogZeroFloat = -std::numeric_limits::infinity(); -const double kLogZeroDouble = -std::numeric_limits::infinity(); -const BaseFloat kLogZeroBaseFloat = -std::numeric_limits::infinity(); - -// Returns a random integer between 0 and RAND_MAX, inclusive -int Rand(struct RandomState* state = NULL); - -// State for thread-safe random number generator -struct RandomState { - RandomState(); - unsigned seed; -}; - -// Returns a random integer between first and last inclusive. -int32 RandInt(int32 first, int32 last, struct RandomState* state = NULL); - -// Returns true with probability "prob", -bool WithProb(BaseFloat prob, struct RandomState* state = NULL); -// with 0 <= prob <= 1 [we check this]. -// Internally calls Rand(). This function is carefully implemented so -// that it should work even if prob is very small. - -/// Returns a random number strictly between 0 and 1. -inline float RandUniform(struct RandomState* state = NULL) { - return static_cast((Rand(state) + 1.0) / (RAND_MAX+2.0)); -} - -inline float RandGauss(struct RandomState* state = NULL) { - return static_cast(sqrtf (-2 * Log(RandUniform(state))) - * cosf(2*M_PI*RandUniform(state))); -} - -// Returns poisson-distributed random number. Uses Knuth's algorithm. -// Take care: this takes time proportional -// to lambda. Faster algorithms exist but are more complex. -int32 RandPoisson(float lambda, struct RandomState* state = NULL); - -// Returns a pair of gaussian random numbers. Uses Box-Muller transform -void RandGauss2(float *a, float *b, RandomState *state = NULL); -void RandGauss2(double *a, double *b, RandomState *state = NULL); - -// Also see Vector::RandCategorical(). - -// This is a randomized pruning mechanism that preserves expectations, -// that we typically use to prune posteriors. -template -inline Float RandPrune(Float post, BaseFloat prune_thresh, - struct RandomState* state = NULL) { - KALDI_ASSERT(prune_thresh >= 0.0); - if (post == 0.0 || std::abs(post) >= prune_thresh) - return post; - return (post >= 0 ? 1.0 : -1.0) * - (RandUniform(state) <= fabs(post)/prune_thresh ? prune_thresh : 0.0); -} - -// returns log(exp(x) + exp(y)). -inline double LogAdd(double x, double y) { - double diff; - - if (x < y) { - diff = x - y; - x = y; - } else { - diff = y - x; - } - // diff is negative. x is now the larger one. - - if (diff >= kMinLogDiffDouble) { - double res; - res = x + Log1p(Exp(diff)); - return res; - } else { - return x; // return the larger one. - } -} - - -// returns log(exp(x) + exp(y)). 
-inline float LogAdd(float x, float y) { - float diff; - - if (x < y) { - diff = x - y; - x = y; - } else { - diff = y - x; - } - // diff is negative. x is now the larger one. - - if (diff >= kMinLogDiffFloat) { - float res; - res = x + Log1p(Exp(diff)); - return res; - } else { - return x; // return the larger one. - } -} - - -// returns log(exp(x) - exp(y)). -inline double LogSub(double x, double y) { - if (y >= x) { // Throws exception if y>=x. - if (y == x) - return kLogZeroDouble; - else - KALDI_ERR << "Cannot subtract a larger from a smaller number."; - } - - double diff = y - x; // Will be negative. - double res = x + Log(1.0 - Exp(diff)); - - // res might be NAN if diff ~0.0, and 1.0-exp(diff) == 0 to machine precision - if (KALDI_ISNAN(res)) - return kLogZeroDouble; - return res; -} - - -// returns log(exp(x) - exp(y)). -inline float LogSub(float x, float y) { - if (y >= x) { // Throws exception if y>=x. - if (y == x) - return kLogZeroDouble; - else - KALDI_ERR << "Cannot subtract a larger from a smaller number."; - } - - float diff = y - x; // Will be negative. - float res = x + Log(1.0f - Exp(diff)); - - // res might be NAN if diff ~0.0, and 1.0-exp(diff) == 0 to machine precision - if (KALDI_ISNAN(res)) - return kLogZeroFloat; - return res; -} - -/// return abs(a - b) <= relative_tolerance * (abs(a)+abs(b)). -static inline bool ApproxEqual(float a, float b, - float relative_tolerance = 0.001) { - // a==b handles infinities. - if (a == b) return true; - float diff = std::abs(a-b); - if (diff == std::numeric_limits::infinity() - || diff != diff) return false; // diff is +inf or nan. - return (diff <= relative_tolerance*(std::abs(a)+std::abs(b))); -} - -/// assert abs(a - b) <= relative_tolerance * (abs(a)+abs(b)) -static inline void AssertEqual(float a, float b, - float relative_tolerance = 0.001) { - // a==b handles infinities. - KALDI_ASSERT(ApproxEqual(a, b, relative_tolerance)); -} - - -// RoundUpToNearestPowerOfTwo does the obvious thing. It crashes if n <= 0. -int32 RoundUpToNearestPowerOfTwo(int32 n); - -/// Returns a / b, rounding towards negative infinity in all cases. -static inline int32 DivideRoundingDown(int32 a, int32 b) { - KALDI_ASSERT(b != 0); - if (a * b >= 0) - return a / b; - else if (a < 0) - return (a - b + 1) / b; - else - return (a - b - 1) / b; -} - -template I Gcd(I m, I n) { - if (m == 0 || n == 0) { - if (m == 0 && n == 0) { // gcd not defined, as all integers are divisors. - KALDI_ERR << "Undefined GCD since m = 0, n = 0."; - } - return (m == 0 ? (n > 0 ? n : -n) : ( m > 0 ? m : -m)); - // return absolute value of whichever is nonzero - } - // could use compile-time assertion - // but involves messing with complex template stuff. - KALDI_ASSERT(std::numeric_limits::is_integer); - while (1) { - m %= n; - if (m == 0) return (n > 0 ? n : -n); - n %= m; - if (n == 0) return (m > 0 ? m : -m); - } -} - -/// Returns the least common multiple of two integers. Will -/// crash unless the inputs are positive. -template I Lcm(I m, I n) { - KALDI_ASSERT(m > 0 && n > 0); - I gcd = Gcd(m, n); - return gcd * (m/gcd) * (n/gcd); -} - - -template void Factorize(I m, std::vector *factors) { - // Splits a number into its prime factors, in sorted order from - // least to greatest, with duplication. A very inefficient - // algorithm, which is mainly intended for use in the - // mixed-radix FFT computation (where we assume most factors - // are small). - KALDI_ASSERT(factors != NULL); - KALDI_ASSERT(m >= 1); // Doesn't work for zero or negative numbers. 
- factors->clear(); - I small_factors[10] = { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29 }; - - // First try small factors. - for (I i = 0; i < 10; i++) { - if (m == 1) return; // We're done. - while (m % small_factors[i] == 0) { - m /= small_factors[i]; - factors->push_back(small_factors[i]); - } - } - // Next try all odd numbers starting from 31. - for (I j = 31;; j += 2) { - if (m == 1) return; - while (m % j == 0) { - m /= j; - factors->push_back(j); - } - } -} - -inline double Hypot(double x, double y) { return hypot(x, y); } -inline float Hypot(float x, float y) { return hypotf(x, y); } - - - - -} // namespace kaldi - - -#endif // KALDI_BASE_KALDI_MATH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/kaldi-types.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/kaldi-types.h deleted file mode 100644 index 7ebf4f85386192a65e176d8f0ecde9bb348af4a0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/kaldi-types.h +++ /dev/null @@ -1,75 +0,0 @@ -// base/kaldi-types.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University; -// Jan Silovsky; Yanmin Qian - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_KALDI_TYPES_H_ -#define KALDI_BASE_KALDI_TYPES_H_ 1 - -namespace kaldi { -// TYPEDEFS .................................................................. 
-#if (KALDI_DOUBLEPRECISION != 0) -typedef double BaseFloat; -#else -typedef float BaseFloat; -#endif -} - -#ifdef _MSC_VER -#include -#define ssize_t SSIZE_T -#endif - -// we can do this a different way if some platform -// we find in the future lacks stdint.h -#include - -// for discussion on what to do if you need compile kaldi -// without OpenFST, see the bottom of this this file -#include - -namespace kaldi { - using ::int16; - using ::int32; - using ::int64; - using ::uint16; - using ::uint32; - using ::uint64; - typedef float float32; - typedef double double64; -} // end namespace kaldi - -// In a theoretical case you decide compile Kaldi without the OpenFST -// comment the previous namespace statement and uncomment the following -/* -namespace kaldi { - typedef int8_t int8; - typedef int16_t int16; - typedef int32_t int32; - typedef int64_t int64; - - typedef uint8_t uint8; - typedef uint16_t uint16; - typedef uint32_t uint32; - typedef uint64_t uint64; - typedef float float32; - typedef double double64; -} // end namespace kaldi -*/ - -#endif // KALDI_BASE_KALDI_TYPES_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/kaldi-utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/kaldi-utils.h deleted file mode 100644 index bd434d09ed92ec94bc4208f53a4416f941edfdb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/base/kaldi-utils.h +++ /dev/null @@ -1,155 +0,0 @@ -// base/kaldi-utils.h - -// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; -// Saarland University; Karel Vesely; Yanmin Qian - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_KALDI_UTILS_H_ -#define KALDI_BASE_KALDI_UTILS_H_ 1 - -#if defined(_MSC_VER) -# define WIN32_LEAN_AND_MEAN -# define NOMINMAX -# include -#endif - -#ifdef _MSC_VER -#include -#define unlink _unlink -#else -#include -#endif - -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4056 4305 4800 4267 4996 4756 4661) -#if _MSC_VER < 1400 -#define __restrict__ -#else -#define __restrict__ __restrict -#endif -#endif - -#if defined(_MSC_VER) -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (*(pp_orig) = _aligned_malloc(size, align)) -# define KALDI_MEMALIGN_FREE(x) _aligned_free(x) -#elif defined(__CYGWIN__) -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (*(pp_orig) = aligned_alloc(align, size)) -# define KALDI_MEMALIGN_FREE(x) free(x) -#else -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (!posix_memalign(pp_orig, align, size) ? *(pp_orig) : NULL) -# define KALDI_MEMALIGN_FREE(x) free(x) -#endif - -#ifdef __ICC -#pragma warning(disable: 383) // ICPC remark we don't want. -#pragma warning(disable: 810) // ICPC remark we don't want. -#pragma warning(disable: 981) // ICPC remark we don't want. 
-#pragma warning(disable: 1418) // ICPC remark we don't want. -#pragma warning(disable: 444) // ICPC remark we don't want. -#pragma warning(disable: 869) // ICPC remark we don't want. -#pragma warning(disable: 1287) // ICPC remark we don't want. -#pragma warning(disable: 279) // ICPC remark we don't want. -#pragma warning(disable: 981) // ICPC remark we don't want. -#endif - - -namespace kaldi { - - -// CharToString prints the character in a human-readable form, for debugging. -std::string CharToString(const char &c); - - -inline int MachineIsLittleEndian() { - int check = 1; - return (*reinterpret_cast(&check) != 0); -} - -// This function kaldi::Sleep() provides a portable way -// to sleep for a possibly fractional -// number of seconds. On Windows it's only accurate to microseconds. -void Sleep(float seconds); -} // namespace kaldi - -#define KALDI_SWAP8(a) do { \ - int t = (reinterpret_cast(&a))[0];\ - (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[7];\ - (reinterpret_cast(&a))[7] = t;\ - t = (reinterpret_cast(&a))[1];\ - (reinterpret_cast(&a))[1]=(reinterpret_cast(&a))[6];\ - (reinterpret_cast(&a))[6] = t;\ - t = (reinterpret_cast(&a))[2];\ - (reinterpret_cast(&a))[2]=(reinterpret_cast(&a))[5];\ - (reinterpret_cast(&a))[5] = t;\ - t = (reinterpret_cast(&a))[3];\ - (reinterpret_cast(&a))[3]=(reinterpret_cast(&a))[4];\ - (reinterpret_cast(&a))[4] = t;} while (0) -#define KALDI_SWAP4(a) do { \ - int t = (reinterpret_cast(&a))[0];\ - (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[3];\ - (reinterpret_cast(&a))[3] = t;\ - t = (reinterpret_cast(&a))[1];\ - (reinterpret_cast(&a))[1]=(reinterpret_cast(&a))[2];\ - (reinterpret_cast(&a))[2]=t;} while (0) -#define KALDI_SWAP2(a) do { \ - int t = (reinterpret_cast(&a))[0];\ - (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[1];\ - (reinterpret_cast(&a))[1] = t;} while (0) - - -// Makes copy constructor and operator= private. 
-#define KALDI_DISALLOW_COPY_AND_ASSIGN(type) \ - type(const type&); \ - void operator = (const type&) - -template class KaldiCompileTimeAssert { }; -template<> class KaldiCompileTimeAssert { - public: - static inline void Check() { } -}; - -#define KALDI_COMPILE_TIME_ASSERT(b) KaldiCompileTimeAssert<(b)>::Check() - -#define KALDI_ASSERT_IS_INTEGER_TYPE(I) \ - KaldiCompileTimeAssert::is_specialized \ - && std::numeric_limits::is_integer>::Check() - -#define KALDI_ASSERT_IS_FLOATING_TYPE(F) \ - KaldiCompileTimeAssert::is_specialized \ - && !std::numeric_limits::is_integer>::Check() - -#if defined(_MSC_VER) -#define KALDI_STRCASECMP _stricmp -#elif defined(__CYGWIN__) -#include -#define KALDI_STRCASECMP strcasecmp -#else -#define KALDI_STRCASECMP strcasecmp -#endif -#ifdef _MSC_VER -# define KALDI_STRTOLL(cur_cstr, end_cstr) _strtoi64(cur_cstr, end_cstr, 10); -#else -# define KALDI_STRTOLL(cur_cstr, end_cstr) strtoll(cur_cstr, end_cstr, 10); -#endif - -#endif // KALDI_BASE_KALDI_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/decoder/lattice-faster-decoder.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/decoder/lattice-faster-decoder.cc deleted file mode 100644 index 06f77557fa49a23f6a44d07c327a1b3b081c6dec..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/decoder/lattice-faster-decoder.cc +++ /dev/null @@ -1,1101 +0,0 @@ -// decoder/lattice-faster-decoder.cc - -// Copyright 2009-2012 Microsoft Corporation Mirko Hannemann -// 2013-2018 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2018 Zhehuai Chen -// 2021 Binbin Zhang, Zhendong Peng - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "decoder/lattice-faster-decoder.h" -// #include "lat/lattice-functions.h" - -namespace kaldi { - -// instantiate this class once for each thing you have to decode. -template -LatticeFasterDecoderTpl::LatticeFasterDecoderTpl( - const FST &fst, const LatticeFasterDecoderConfig &config, - const std::shared_ptr &context_graph) - : fst_(&fst), - delete_fst_(false), - config_(config), - num_toks_(0), - context_graph_(context_graph) { - config.Check(); - toks_.SetSize( - 1000); // just so on the first frame we do something reasonable. -} - -template -LatticeFasterDecoderTpl::LatticeFasterDecoderTpl( - const LatticeFasterDecoderConfig &config, FST *fst) - : fst_(fst), delete_fst_(true), config_(config), num_toks_(0) { - config.Check(); - toks_.SetSize( - 1000); // just so on the first frame we do something reasonable. 
-} - -template -LatticeFasterDecoderTpl::~LatticeFasterDecoderTpl() { - DeleteElems(toks_.Clear()); - ClearActiveTokens(); - if (delete_fst_) delete fst_; -} - -template -void LatticeFasterDecoderTpl::InitDecoding() { - // clean up from last time: - DeleteElems(toks_.Clear()); - cost_offsets_.clear(); - ClearActiveTokens(); - warned_ = false; - num_toks_ = 0; - decoding_finalized_ = false; - final_costs_.clear(); - StateId start_state = fst_->Start(); - KALDI_ASSERT(start_state != fst::kNoStateId); - active_toks_.resize(1); - Token *start_tok = new Token(0.0, 0.0, NULL, NULL, NULL); - active_toks_[0].toks = start_tok; - toks_.Insert(start_state, start_tok); - num_toks_++; - ProcessNonemitting(config_.beam); -} - -// Returns true if any kind of traceback is available (not necessarily from -// a final state). It should only very rarely return false; this indicates -// an unusual search error. -template -bool LatticeFasterDecoderTpl::Decode( - DecodableInterface *decodable) { - InitDecoding(); - // We use 1-based indexing for frames in this decoder (if you view it in - // terms of features), but note that the decodable object uses zero-based - // numbering, which we have to correct for when we call it. - AdvanceDecoding(decodable); - FinalizeDecoding(); - - // Returns true if we have any kind of traceback available (not necessarily - // to the end state; query ReachedFinal() for that). - return !active_toks_.empty() && active_toks_.back().toks != NULL; -} - -// Outputs an FST corresponding to the single best path through the lattice. -template -bool LatticeFasterDecoderTpl::GetBestPath( - Lattice *olat, bool use_final_probs) const { - Lattice raw_lat; - GetRawLattice(&raw_lat, use_final_probs); - ShortestPath(raw_lat, olat); - return (olat->NumStates() != 0); -} - -// Outputs an FST corresponding to the raw, state-level lattice -template -bool LatticeFasterDecoderTpl::GetRawLattice( - Lattice *ofst, bool use_final_probs) const { - typedef LatticeArc Arc; - typedef Arc::StateId StateId; - typedef Arc::Weight Weight; - typedef Arc::Label Label; - - // Note: you can't use the old interface (Decode()) if you want to - // get the lattice with use_final_probs = false. You'd have to do - // InitDecoding() and then AdvanceDecoding(). - if (decoding_finalized_ && !use_final_probs) - KALDI_ERR << "You cannot call FinalizeDecoding() and then call " - << "GetRawLattice() with use_final_probs == false"; - - unordered_map final_costs_local; - - const unordered_map &final_costs = - (decoding_finalized_ ? final_costs_ : final_costs_local); - if (!decoding_finalized_ && use_final_probs) - ComputeFinalCosts(&final_costs_local, NULL, NULL); - - ofst->DeleteStates(); - // num-frames plus one (since frames are one-based, and we have - // an extra frame for the start-state). - int32 num_frames = active_toks_.size() - 1; - KALDI_ASSERT(num_frames > 0); - const int32 bucket_count = num_toks_ / 2 + 3; - unordered_map tok_map(bucket_count); - // First create all states. - std::vector token_list; - for (int32 f = 0; f <= num_frames; f++) { - if (active_toks_[f].toks == NULL) { - KALDI_WARN << "GetRawLattice: no tokens active on frame " << f - << ": not producing lattice.\n"; - return false; - } - TopSortTokens(active_toks_[f].toks, &token_list); - for (size_t i = 0; i < token_list.size(); i++) - if (token_list[i] != NULL) tok_map[token_list[i]] = ofst->AddState(); - } - // The next statement sets the start state of the output FST. Because we - // topologically sorted the tokens, state zero must be the start-state. 
- ofst->SetStart(0); - - KALDI_VLOG(4) << "init:" << num_toks_ / 2 + 3 - << " buckets:" << tok_map.bucket_count() - << " load:" << tok_map.load_factor() - << " max:" << tok_map.max_load_factor(); - // Now create all arcs. - for (int32 f = 0; f <= num_frames; f++) { - for (Token *tok = active_toks_[f].toks; tok != NULL; tok = tok->next) { - StateId cur_state = tok_map[tok]; - for (ForwardLinkT *l = tok->links; l != NULL; l = l->next) { - typename unordered_map::const_iterator iter = - tok_map.find(l->next_tok); - StateId nextstate = iter->second; - KALDI_ASSERT(iter != tok_map.end()); - BaseFloat cost_offset = 0.0; - if (l->ilabel != 0) { // emitting.. - KALDI_ASSERT(f >= 0 && f < cost_offsets_.size()); - cost_offset = cost_offsets_[f]; - } - - StateId state = cur_state; - if (l->is_start_boundary) { - StateId tmp = ofst->AddState(); - Arc arc(0, context_graph_->start_tag_id(), Weight(0, 0), tmp); - ofst->AddArc(state, arc); - state = tmp; - } - if (l->is_end_boundary) { - StateId tmp = ofst->AddState(); - Arc arc(0, context_graph_->end_tag_id(), Weight(0, 0), nextstate); - ofst->AddArc(tmp, arc); - nextstate = tmp; - } - - Arc arc(l->ilabel, l->olabel, - Weight(l->graph_cost, l->acoustic_cost - cost_offset), - nextstate); - ofst->AddArc(state, arc); - } - if (f == num_frames) { - if (use_final_probs && !final_costs.empty()) { - typename unordered_map::const_iterator iter = - final_costs.find(tok); - if (iter != final_costs.end()) - ofst->SetFinal(cur_state, LatticeWeight(iter->second, 0)); - } else { - ofst->SetFinal(cur_state, LatticeWeight::One()); - } - } - } - } - - fst::TopSort(ofst); - return (ofst->NumStates() > 0); -} - -// This function is now deprecated, since now we do determinization from outside -// the LatticeFasterDecoder class. Outputs an FST corresponding to the -// lattice-determinized lattice (one path per word sequence). -template -bool LatticeFasterDecoderTpl::GetLattice( - CompactLattice *ofst, bool use_final_probs) const { - Lattice raw_fst; - GetRawLattice(&raw_fst, use_final_probs); - Invert(&raw_fst); // make it so word labels are on the input. - // (in phase where we get backward-costs). - fst::ILabelCompare ilabel_comp; - ArcSort(&raw_fst, ilabel_comp); // sort on ilabel; makes - // lattice-determinization more efficient. - - fst::DeterminizeLatticePrunedOptions lat_opts; - lat_opts.max_mem = config_.det_opts.max_mem; - - DeterminizeLatticePruned(raw_fst, config_.lattice_beam, ofst, lat_opts); - raw_fst.DeleteStates(); // Free memory-- raw_fst no longer needed. - Connect(ofst); // Remove unreachable states... there might be - // a small number of these, in some cases. - // Note: if something went wrong and the raw lattice was empty, - // we should still get to this point in the code without warnings or failures. - return (ofst->NumStates() != 0); -} - -template -void LatticeFasterDecoderTpl::PossiblyResizeHash(size_t num_toks) { - size_t new_sz = static_cast(static_cast(num_toks) * - config_.hash_ratio); - if (new_sz > toks_.Size()) { - toks_.SetSize(new_sz); - } -} - -/* - A note on the definition of extra_cost. - - extra_cost is used in pruning tokens, to save memory. - - extra_cost can be thought of as a beta (backward) cost assuming - we had set the betas on currently-active tokens to all be the negative - of the alphas for those tokens. (So all currently active tokens would - be on (tied) best paths). - - We can use the extra_cost to accurately prune away tokens that we know will - never appear in the lattice. 
If the extra_cost is greater than the desired - lattice beam, the token would provably never appear in the lattice, so we can - prune away the token. - - (Note: we don't update all the extra_costs every time we update a frame; we - only do it every 'config_.prune_interval' frames). - */ - -// FindOrAddToken either locates a token in hash of toks_, -// or if necessary inserts a new, empty token (i.e. with no forward links) -// for the current frame. [note: it's inserted if necessary into hash toks_ -// and also into the singly linked list of tokens active on this frame -// (whose head is at active_toks_[frame]). -template -inline typename LatticeFasterDecoderTpl::Elem * -LatticeFasterDecoderTpl::FindOrAddToken(StateId state, - int32 frame_plus_one, - BaseFloat tot_cost, - Token *backpointer, - bool *changed) { - // Returns the Token pointer. Sets "changed" (if non-NULL) to true - // if the token was newly created or the cost changed. - KALDI_ASSERT(frame_plus_one < active_toks_.size()); - Token *&toks = active_toks_[frame_plus_one].toks; - Elem *e_found = toks_.Insert(state, NULL); - if (e_found->val == NULL) { // no such token presently. - const BaseFloat extra_cost = 0.0; - // tokens on the currently final frame have zero extra_cost - // as any of them could end up - // on the winning path. - Token *new_tok = new Token(tot_cost, extra_cost, NULL, toks, backpointer); - // NULL: no forward links yet - toks = new_tok; - num_toks_++; - e_found->val = new_tok; - if (changed) *changed = true; - return e_found; - } else { - Token *tok = e_found->val; // There is an existing Token for this state. - if (tok->tot_cost > tot_cost) { // replace old token - tok->tot_cost = tot_cost; - // SetBackpointer() just does tok->backpointer = backpointer in - // the case where Token == BackpointerToken, else nothing. - tok->SetBackpointer(backpointer); - // we don't allocate a new token, the old stays linked in active_toks_ - // we only replace the tot_cost - // in the current frame, there are no forward links (and no extra_cost) - // only in ProcessNonemitting we have to delete forward links - // in case we visit a state for the second time - // those forward links, that lead to this replaced token before: - // they remain and will hopefully be pruned later (PruneForwardLinks...) - if (changed) *changed = true; - } else { - if (changed) *changed = false; - } - return e_found; - } -} - -// prunes outgoing links for all tokens in active_toks_[frame] -// it's called by PruneActiveTokens -// all links, that have link_extra_cost > lattice_beam are pruned -template -void LatticeFasterDecoderTpl::PruneForwardLinks( - int32 frame_plus_one, bool *extra_costs_changed, bool *links_pruned, - BaseFloat delta) { - // delta is the amount by which the extra_costs must change - // If delta is larger, we'll tend to go back less far - // toward the beginning of the file. - // extra_costs_changed is set to true if extra_cost was changed for any token - // links_pruned is set to true if any link in any token was pruned - - *extra_costs_changed = false; - *links_pruned = false; - KALDI_ASSERT(frame_plus_one >= 0 && frame_plus_one < active_toks_.size()); - if (active_toks_[frame_plus_one].toks == - NULL) { // empty list; should not happen. - if (!warned_) { - KALDI_WARN << "No tokens alive [doing pruning].. warning first " - "time only for each utterance\n"; - warned_ = true; - } - } - - // We have to iterate until there is no more change, because the links - // are not guaranteed to be in topological order. 
- bool changed = true; // difference new minus old extra cost >= delta ? - while (changed) { - changed = false; - for (Token *tok = active_toks_[frame_plus_one].toks; tok != NULL; - tok = tok->next) { - ForwardLinkT *link, *prev_link = NULL; - // will recompute tok_extra_cost for tok. - BaseFloat tok_extra_cost = std::numeric_limits::infinity(); - // tok_extra_cost is the best (min) of link_extra_cost of outgoing links - for (link = tok->links; link != NULL;) { - // See if we need to excise this link... - Token *next_tok = link->next_tok; - BaseFloat link_extra_cost = - next_tok->extra_cost + - ((tok->tot_cost + link->acoustic_cost + link->graph_cost) - - next_tok->tot_cost); // difference in brackets is >= 0 - // link_exta_cost is the difference in score between the best paths - // through link source state and through link destination state - KALDI_ASSERT(link_extra_cost == link_extra_cost); // check for NaN - // the graph_cost contatins the context score - // if it's the score of the backoff arc, it should be removed. - if (link->context_score < 0) { - link_extra_cost += link->context_score; - } - if (link_extra_cost > config_.lattice_beam) { // excise link - ForwardLinkT *next_link = link->next; - if (prev_link != NULL) - prev_link->next = next_link; - else - tok->links = next_link; - delete link; - link = next_link; // advance link but leave prev_link the same. - *links_pruned = true; - } else { // keep the link and update the tok_extra_cost if needed. - if (link_extra_cost < 0.0) { // this is just a precaution. - // if (link_extra_cost < -0.01) - // KALDI_WARN << "Negative extra_cost: " << link_extra_cost; - link_extra_cost = 0.0; - } - if (link_extra_cost < tok_extra_cost) - tok_extra_cost = link_extra_cost; - prev_link = link; // move to next link - link = link->next; - } - } // for all outgoing links - if (fabs(tok_extra_cost - tok->extra_cost) > delta) - changed = true; // difference new minus old is bigger than delta - tok->extra_cost = tok_extra_cost; - // will be +infinity or <= lattice_beam_. - // infinity indicates, that no forward link survived pruning - } // for all Token on active_toks_[frame] - if (changed) *extra_costs_changed = true; - - // Note: it's theoretically possible that aggressive compiler - // optimizations could cause an infinite loop here for small delta and - // high-dynamic-range scores. - } // while changed -} - -// PruneForwardLinksFinal is a version of PruneForwardLinks that we call -// on the final frame. If there are final tokens active, it uses -// the final-probs for pruning, otherwise it treats all tokens as final. -template -void LatticeFasterDecoderTpl::PruneForwardLinksFinal() { - KALDI_ASSERT(!active_toks_.empty()); - int32 frame_plus_one = active_toks_.size() - 1; - - if (active_toks_[frame_plus_one].toks == - NULL) // empty list; should not happen. - KALDI_WARN << "No tokens alive at end of file"; - - typedef typename unordered_map::const_iterator IterType; - ComputeFinalCosts(&final_costs_, &final_relative_cost_, &final_best_cost_); - decoding_finalized_ = true; - // We call DeleteElems() as a nicety, not because it's really necessary; - // otherwise there would be a time, after calling PruneTokensForFrame() on the - // final frame, when toks_.GetList() or toks_.Clear() would contain pointers - // to nonexistent tokens. - DeleteElems(toks_.Clear()); - - // Now go through tokens on this frame, pruning forward links... may have to - // iterate a few times until there is no more change, because the list is not - // in topological order. 
This is a modified version of the code in - // PruneForwardLinks, but here we also take account of the final-probs. - bool changed = true; - BaseFloat delta = 1.0e-05; - while (changed) { - changed = false; - for (Token *tok = active_toks_[frame_plus_one].toks; tok != NULL; - tok = tok->next) { - ForwardLinkT *link, *prev_link = NULL; - // will recompute tok_extra_cost. It has a term in it that corresponds - // to the "final-prob", so instead of initializing tok_extra_cost to - // infinity below we set it to the difference between the - // (score+final_prob) of this token, and the best such (score+final_prob). - BaseFloat final_cost; - if (final_costs_.empty()) { - final_cost = 0.0; - } else { - IterType iter = final_costs_.find(tok); - if (iter != final_costs_.end()) - final_cost = iter->second; - else - final_cost = std::numeric_limits::infinity(); - } - BaseFloat tok_extra_cost = tok->tot_cost + final_cost - final_best_cost_; - // tok_extra_cost will be a "min" over either directly being final, or - // being indirectly final through other links, and the loop below may - // decrease its value: - for (link = tok->links; link != NULL;) { - // See if we need to excise this link... - Token *next_tok = link->next_tok; - BaseFloat link_extra_cost = - next_tok->extra_cost + - ((tok->tot_cost + link->acoustic_cost + link->graph_cost) - - next_tok->tot_cost); - if (link_extra_cost > config_.lattice_beam) { // excise link - ForwardLinkT *next_link = link->next; - if (prev_link != NULL) - prev_link->next = next_link; - else - tok->links = next_link; - delete link; - link = next_link; // advance link but leave prev_link the same. - } else { // keep the link and update the tok_extra_cost if needed. - if (link_extra_cost < 0.0) { // this is just a precaution. - // if (link_extra_cost < -0.01) - // KALDI_WARN << "Negative extra_cost: " << link_extra_cost; - link_extra_cost = 0.0; - } - if (link_extra_cost < tok_extra_cost) - tok_extra_cost = link_extra_cost; - prev_link = link; - link = link->next; - } - } - // prune away tokens worse than lattice_beam above best path. This step - // was not necessary in the non-final case because then, this case - // showed up as having no forward links. Here, the tok_extra_cost has - // an extra component relating to the final-prob. - if (tok_extra_cost > config_.lattice_beam) - tok_extra_cost = std::numeric_limits::infinity(); - // to be pruned in PruneTokensForFrame - - if (!ApproxEqual(tok->extra_cost, tok_extra_cost, delta)) changed = true; - tok->extra_cost = - tok_extra_cost; // will be +infinity or <= lattice_beam_. - } - } // while changed -} - -template -BaseFloat LatticeFasterDecoderTpl::FinalRelativeCost() const { - if (!decoding_finalized_) { - BaseFloat relative_cost; - ComputeFinalCosts(NULL, &relative_cost, NULL); - return relative_cost; - } else { - // we're not allowed to call that function if FinalizeDecoding() has - // been called; return a cached value. - return final_relative_cost_; - } -} - -// Prune away any tokens on this frame that have no forward links. -// [we don't do this in PruneForwardLinks because it would give us -// a problem with dangling pointers]. 
-// It's called by PruneActiveTokens if any forward links have been pruned -template -void LatticeFasterDecoderTpl::PruneTokensForFrame( - int32 frame_plus_one) { - KALDI_ASSERT(frame_plus_one >= 0 && frame_plus_one < active_toks_.size()); - Token *&toks = active_toks_[frame_plus_one].toks; - if (toks == NULL) KALDI_WARN << "No tokens alive [doing pruning]"; - Token *tok, *next_tok, *prev_tok = NULL; - for (tok = toks; tok != NULL; tok = next_tok) { - next_tok = tok->next; - if (tok->extra_cost == std::numeric_limits::infinity()) { - // token is unreachable from end of graph; (no forward links survived) - // excise tok from list and delete tok. - if (prev_tok != NULL) - prev_tok->next = tok->next; - else - toks = tok->next; - delete tok; - num_toks_--; - } else { // fetch next Token - prev_tok = tok; - } - } -} - -// Go backwards through still-alive tokens, pruning them, starting not from -// the current frame (where we want to keep all tokens) but from the frame -// before that. We go backwards through the frames and stop when we reach a -// point where the delta-costs are not changing (and the delta controls when we -// consider a cost to have "not changed"). -template -void LatticeFasterDecoderTpl::PruneActiveTokens(BaseFloat delta) { - int32 cur_frame_plus_one = NumFramesDecoded(); - int32 num_toks_begin = num_toks_; - // The index "f" below represents a "frame plus one", i.e. you'd have to - // subtract one to get the corresponding index for the decodable object. - for (int32 f = cur_frame_plus_one - 1; f >= 0; f--) { - // Reason why we need to prune forward links in this situation: - // (1) we have never pruned them (new TokenList) - // (2) we have not yet pruned the forward links to the next f, - // after any of those tokens have changed their extra_cost. 
- if (active_toks_[f].must_prune_forward_links) { - bool extra_costs_changed = false, links_pruned = false; - PruneForwardLinks(f, &extra_costs_changed, &links_pruned, delta); - if (extra_costs_changed && f > 0) // any token has changed extra_cost - active_toks_[f - 1].must_prune_forward_links = true; - if (links_pruned) // any link was pruned - active_toks_[f].must_prune_tokens = true; - active_toks_[f].must_prune_forward_links = false; // job done - } - if (f + 1 < cur_frame_plus_one && // except for last f (no forward links) - active_toks_[f + 1].must_prune_tokens) { - PruneTokensForFrame(f + 1); - active_toks_[f + 1].must_prune_tokens = false; - } - } - KALDI_VLOG(4) << "PruneActiveTokens: pruned tokens from " << num_toks_begin - << " to " << num_toks_; -} - -template -void LatticeFasterDecoderTpl::ComputeFinalCosts( - unordered_map *final_costs, - BaseFloat *final_relative_cost, BaseFloat *final_best_cost) const { - KALDI_ASSERT(!decoding_finalized_); - if (final_costs != NULL) final_costs->clear(); - const Elem *final_toks = toks_.GetList(); - BaseFloat infinity = std::numeric_limits::infinity(); - BaseFloat best_cost = infinity, best_cost_with_final = infinity; - - while (final_toks != NULL) { - StateId state = final_toks->key; - Token *tok = final_toks->val; - const Elem *next = final_toks->tail; - BaseFloat final_cost = fst_->Final(state).Value(); - BaseFloat cost = tok->tot_cost, cost_with_final = cost + final_cost; - best_cost = std::min(cost, best_cost); - best_cost_with_final = std::min(cost_with_final, best_cost_with_final); - if (final_costs != NULL && final_cost != infinity) - (*final_costs)[tok] = final_cost; - final_toks = next; - } - if (final_relative_cost != NULL) { - if (best_cost == infinity && best_cost_with_final == infinity) { - // Likely this will only happen if there are no tokens surviving. - // This seems the least bad way to handle it. - *final_relative_cost = infinity; - } else { - *final_relative_cost = best_cost_with_final - best_cost; - } - } - if (final_best_cost != NULL) { - if (best_cost_with_final != infinity) { // final-state exists. - *final_best_cost = best_cost_with_final; - } else { // no final-state exists. - *final_best_cost = best_cost; - } - } -} - -template -void LatticeFasterDecoderTpl::AdvanceDecoding( - DecodableInterface *decodable, int32 max_num_frames) { - if (std::is_same >::value) { - // if the type 'FST' is the FST base-class, then see if the FST type of fst_ - // is actually VectorFst or ConstFst. If so, call the AdvanceDecoding() - // function after casting *this to the more specific type. - if (fst_->Type() == "const") { - LatticeFasterDecoderTpl, Token> *this_cast = - reinterpret_cast< - LatticeFasterDecoderTpl, Token> *>( - this); - this_cast->AdvanceDecoding(decodable, max_num_frames); - return; - } else if (fst_->Type() == "vector") { - LatticeFasterDecoderTpl, Token> *this_cast = - reinterpret_cast< - LatticeFasterDecoderTpl, Token> *>( - this); - this_cast->AdvanceDecoding(decodable, max_num_frames); - return; - } - } - - KALDI_ASSERT(!active_toks_.empty() && !decoding_finalized_ && - "You must call InitDecoding() before AdvanceDecoding"); - int32 num_frames_ready = decodable->NumFramesReady(); - // num_frames_ready must be >= num_frames_decoded, or else - // the number of frames ready must have decreased (which doesn't - // make sense) or the decodable object changed between calls - // (which isn't allowed). 
- KALDI_ASSERT(num_frames_ready >= NumFramesDecoded()); - int32 target_frames_decoded = num_frames_ready; - if (max_num_frames >= 0) - target_frames_decoded = - std::min(target_frames_decoded, NumFramesDecoded() + max_num_frames); - while (NumFramesDecoded() < target_frames_decoded) { - if (NumFramesDecoded() % config_.prune_interval == 0) { - PruneActiveTokens(config_.lattice_beam * config_.prune_scale); - } - BaseFloat cost_cutoff = ProcessEmitting(decodable); - ProcessNonemitting(cost_cutoff); - } -} - -// FinalizeDecoding() is a version of PruneActiveTokens that we call -// (optionally) on the final frame. Takes into account the final-prob of -// tokens. This function used to be called PruneActiveTokensFinal(). -template -void LatticeFasterDecoderTpl::FinalizeDecoding() { - int32 final_frame_plus_one = NumFramesDecoded(); - int32 num_toks_begin = num_toks_; - // PruneForwardLinksFinal() prunes final frame (with final-probs), and - // sets decoding_finalized_. - PruneForwardLinksFinal(); - for (int32 f = final_frame_plus_one - 1; f >= 0; f--) { - bool b1, b2; // values not used. - BaseFloat dontcare = 0.0; // delta of zero means we must always update - PruneForwardLinks(f, &b1, &b2, dontcare); - PruneTokensForFrame(f + 1); - } - PruneTokensForFrame(0); - KALDI_VLOG(4) << "pruned tokens from " << num_toks_begin << " to " - << num_toks_; -} - -/// Gets the weight cutoff. Also counts the active tokens. -template -BaseFloat LatticeFasterDecoderTpl::GetCutoff( - Elem *list_head, size_t *tok_count, BaseFloat *adaptive_beam, - Elem **best_elem) { - BaseFloat best_weight = std::numeric_limits::infinity(); - // positive == high cost == bad. - size_t count = 0; - if (config_.max_active == std::numeric_limits::max() && - config_.min_active == 0) { - for (Elem *e = list_head; e != NULL; e = e->tail, count++) { - BaseFloat w = static_cast(e->val->tot_cost); - if (w < best_weight) { - best_weight = w; - if (best_elem) *best_elem = e; - } - } - if (tok_count != NULL) *tok_count = count; - if (adaptive_beam != NULL) *adaptive_beam = config_.beam; - return best_weight + config_.beam; - } else { - tmp_array_.clear(); - for (Elem *e = list_head; e != NULL; e = e->tail, count++) { - BaseFloat w = e->val->tot_cost; - tmp_array_.push_back(w); - if (w < best_weight) { - best_weight = w; - if (best_elem) *best_elem = e; - } - } - if (tok_count != NULL) *tok_count = count; - - BaseFloat beam_cutoff = best_weight + config_.beam, - min_active_cutoff = std::numeric_limits::infinity(), - max_active_cutoff = std::numeric_limits::infinity(); - - KALDI_VLOG(6) << "Number of tokens active on frame " << NumFramesDecoded() - << " is " << tmp_array_.size(); - - if (tmp_array_.size() > static_cast(config_.max_active)) { - std::nth_element(tmp_array_.begin(), - tmp_array_.begin() + config_.max_active, - tmp_array_.end()); - max_active_cutoff = tmp_array_[config_.max_active]; - } - if (max_active_cutoff < beam_cutoff) { // max_active is tighter than beam. - if (adaptive_beam) - *adaptive_beam = max_active_cutoff - best_weight + config_.beam_delta; - return max_active_cutoff; - } - if (tmp_array_.size() > static_cast(config_.min_active)) { - if (config_.min_active == 0) { - min_active_cutoff = best_weight; - } else { - std::nth_element( - tmp_array_.begin(), tmp_array_.begin() + config_.min_active, - tmp_array_.size() > static_cast(config_.max_active) - ? 
tmp_array_.begin() + config_.max_active - : tmp_array_.end()); - min_active_cutoff = tmp_array_[config_.min_active]; - } - } - if (min_active_cutoff > beam_cutoff) { // min_active is looser than beam. - if (adaptive_beam) - *adaptive_beam = min_active_cutoff - best_weight + config_.beam_delta; - return min_active_cutoff; - } else { - *adaptive_beam = config_.beam; - return beam_cutoff; - } - } -} - -template -BaseFloat LatticeFasterDecoderTpl::ProcessEmitting( - DecodableInterface *decodable) { - KALDI_ASSERT(active_toks_.size() > 0); - int32 frame = - active_toks_.size() - 1; // frame is the frame-index - // (zero-based) used to get likelihoods - // from the decodable object. - active_toks_.resize(active_toks_.size() + 1); - - Elem *final_toks = - toks_.Clear(); // analogous to swapping prev_toks_ / cur_toks_ - // in simple-decoder.h. Removes the Elems from - // being indexed in the hash in toks_. - Elem *best_elem = NULL; - BaseFloat adaptive_beam; - size_t tok_cnt; - BaseFloat cur_cutoff = - GetCutoff(final_toks, &tok_cnt, &adaptive_beam, &best_elem); - KALDI_VLOG(6) << "Adaptive beam on frame " << NumFramesDecoded() << " is " - << adaptive_beam; - - PossiblyResizeHash( - tok_cnt); // This makes sure the hash is always big enough. - - BaseFloat next_cutoff = std::numeric_limits::infinity(); - // pruning "online" before having seen all tokens - - BaseFloat cost_offset = 0.0; // Used to keep probabilities in a good - // dynamic range. - - // First process the best token to get a hopefully - // reasonably tight bound on the next cutoff. The only - // products of the next block are "next_cutoff" and "cost_offset". - if (best_elem) { - StateId state = best_elem->key; - Token *tok = best_elem->val; - cost_offset = -tok->tot_cost; - for (fst::ArcIterator aiter(*fst_, state); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - if (arc.ilabel != 0) { // propagate.. - BaseFloat new_weight = arc.weight.Value() + cost_offset - - decodable->LogLikelihood(frame, arc.ilabel) + - tok->tot_cost; - if (state != arc.nextstate) { - new_weight += config_.length_penalty; - } - if (new_weight + adaptive_beam < next_cutoff) - next_cutoff = new_weight + adaptive_beam; - } - } - } - - // Store the offset on the acoustic likelihoods that we're applying. - // Could just do cost_offsets_.push_back(cost_offset), but we - // do it this way as it's more robust to future code changes. - cost_offsets_.resize(frame + 1, 0.0); - cost_offsets_[frame] = cost_offset; - - // the tokens are now owned here, in final_toks, and the hash is empty. - // 'owned' is a complex thing here; the point is we need to call DeleteElem - // on each elem 'e' to let toks_ know we're done with them. - for (Elem *e = final_toks, *e_tail; e != NULL; e = e_tail) { - // loop this way because we delete "e" as we go. - StateId state = e->key; - Token *tok = e->val; - if (tok->tot_cost <= cur_cutoff) { - for (fst::ArcIterator aiter(*fst_, state); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - if (arc.ilabel != 0) { // propagate.. 
- BaseFloat ac_cost = cost_offset - - decodable->LogLikelihood(frame, arc.ilabel), - graph_cost = arc.weight.Value(); - if (state != arc.nextstate) { - graph_cost += config_.length_penalty; - } - BaseFloat cur_cost = tok->tot_cost, - tot_cost = cur_cost + ac_cost + graph_cost; - if (tot_cost >= next_cutoff) - continue; - else if (tot_cost + adaptive_beam < next_cutoff) - next_cutoff = - tot_cost + adaptive_beam; // prune by best current token - // Note: the frame indexes into active_toks_ are one-based, - // hence the + 1. - Elem *e_next = - FindOrAddToken(arc.nextstate, frame + 1, tot_cost, tok, NULL); - // NULL: no change indicator needed - - bool is_start_boundary = false; - bool is_end_boundary = false; - float context_score = 0; - if (context_graph_) { - if (arc.olabel == 0) { - e_next->val->context_state = tok->context_state; - } else { - e_next->val->context_state = context_graph_->GetNextState( - tok->context_state, arc.olabel, &context_score, - &is_start_boundary, &is_end_boundary); - graph_cost -= context_score; - } - } - // Add ForwardLink from tok to next_tok (put on head of list - // tok->links) - tok->links = new ForwardLinkT(e_next->val, arc.ilabel, arc.olabel, - graph_cost, ac_cost, is_start_boundary, - is_end_boundary, tok->links); - tok->links->context_score = context_score; - } - } // for all arcs - } - e_tail = e->tail; - toks_.Delete(e); // delete Elem - } - return next_cutoff; -} - -// static inline -template -void LatticeFasterDecoderTpl::DeleteForwardLinks(Token *tok) { - ForwardLinkT *l = tok->links, *m; - while (l != NULL) { - m = l->next; - delete l; - l = m; - } - tok->links = NULL; -} - -template -void LatticeFasterDecoderTpl::ProcessNonemitting(BaseFloat cutoff) { - KALDI_ASSERT(!active_toks_.empty()); - int32 frame = static_cast(active_toks_.size()) - 2; - // Note: "frame" is the time-index we just processed, or -1 if - // we are processing the nonemitting transitions before the - // first frame (called from InitDecoding()). - - // Processes nonemitting arcs for one frame. Propagates within toks_. - // Note-- this queue structure is not very optimal as - // it may cause us to process states unnecessarily (e.g. more than once), - // but in the baseline code, turning this vector into a set to fix this - // problem did not improve overall speed. - - KALDI_ASSERT(queue_.empty()); - - if (toks_.GetList() == NULL) { - if (!warned_) { - KALDI_WARN << "Error, no surviving tokens: frame is " << frame; - warned_ = true; - } - } - - int before = 0, after = 0; - for (const Elem *e = toks_.GetList(); e != NULL; e = e->tail) { - StateId state = e->key; - if (fst_->NumInputEpsilons(state) != 0) queue_.push_back(e); - ++before; - } - - while (!queue_.empty()) { - ++after; - const Elem *e = queue_.back(); - queue_.pop_back(); - - StateId state = e->key; - Token *tok = - e->val; // would segfault if e is a NULL pointer but this can't happen. - BaseFloat cur_cost = tok->tot_cost; - if (cur_cost >= cutoff) // Don't bother processing successors. - continue; - // If "tok" has any existing forward links, delete them, - // because we're about to regenerate them. This is a kind - // of non-optimality (remember, this is the simple decoder), - // but since most states are emitting it's not a huge issue. - DeleteForwardLinks(tok); // necessary when re-visiting - tok->links = NULL; - for (fst::ArcIterator aiter(*fst_, state); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - if (arc.ilabel == 0) { // propagate nonemitting only... 
- BaseFloat graph_cost = arc.weight.Value(), - tot_cost = cur_cost + graph_cost; - if (tot_cost < cutoff) { - bool changed; - - Elem *e_new = - FindOrAddToken(arc.nextstate, frame + 1, tot_cost, tok, &changed); - - bool is_start_boundary = false; - bool is_end_boundary = false; - float context_score = 0; - if (context_graph_) { - if (arc.olabel == 0) { - e_new->val->context_state = tok->context_state; - } else { - e_new->val->context_state = context_graph_->GetNextState( - tok->context_state, arc.olabel, &context_score, - &is_start_boundary, &is_end_boundary); - graph_cost -= context_score; - } - } - - tok->links = - new ForwardLinkT(e_new->val, 0, arc.olabel, graph_cost, 0, - is_start_boundary, is_end_boundary, tok->links); - tok->links->context_score = context_score; - - // "changed" tells us whether the new token has a different - // cost from before, or is new [if so, add into queue]. - if (changed && fst_->NumInputEpsilons(arc.nextstate) != 0) - queue_.push_back(e_new); - } - } - } // for all arcs - } // while queue not empty - KALDI_VLOG(3) << "ProcessNonemitting " << before << " " << after; -} - -template -void LatticeFasterDecoderTpl::DeleteElems(Elem *list) { - for (Elem *e = list, *e_tail; e != NULL; e = e_tail) { - e_tail = e->tail; - toks_.Delete(e); - } -} - -template -void LatticeFasterDecoderTpl< - FST, Token>::ClearActiveTokens() { // a cleanup routine, at utt end/begin - for (size_t i = 0; i < active_toks_.size(); i++) { - // Delete all tokens alive on this frame, and any forward - // links they may have. - for (Token *tok = active_toks_[i].toks; tok != NULL;) { - DeleteForwardLinks(tok); - Token *next_tok = tok->next; - delete tok; - num_toks_--; - tok = next_tok; - } - } - active_toks_.clear(); - KALDI_ASSERT(num_toks_ == 0); -} - -// static -template -void LatticeFasterDecoderTpl::TopSortTokens( - Token *tok_list, std::vector *topsorted_list) { - unordered_map token2pos; - using std::unordered_set; - typedef typename unordered_map::iterator IterType; - int32 num_toks = 0; - for (Token *tok = tok_list; tok != NULL; tok = tok->next) num_toks++; - int32 cur_pos = 0; - // We assign the tokens numbers num_toks - 1, ... , 2, 1, 0. - // This is likely to be in closer to topological order than - // if we had given them ascending order, because of the way - // new tokens are put at the front of the list. - for (Token *tok = tok_list; tok != NULL; tok = tok->next) - token2pos[tok] = num_toks - ++cur_pos; - - unordered_set reprocess; - - for (IterType iter = token2pos.begin(); iter != token2pos.end(); ++iter) { - Token *tok = iter->first; - int32 pos = iter->second; - for (ForwardLinkT *link = tok->links; link != NULL; link = link->next) { - if (link->ilabel == 0) { - // We only need to consider epsilon links, since non-epsilon links - // transition between frames and this function only needs to sort a list - // of tokens from a single frame. - IterType following_iter = token2pos.find(link->next_tok); - if (following_iter != token2pos.end()) { // another token on this - // frame, so must consider it. - int32 next_pos = following_iter->second; - if (next_pos < pos) { // reassign the position of the next Token. - following_iter->second = cur_pos++; - reprocess.insert(link->next_tok); - } - } - } - } - // In case we had previously assigned this token to be reprocessed, we can - // erase it from that set because it's "happy now" (we just processed it). - reprocess.erase(tok); - } - - size_t max_loop = 1000000, - loop_count; // max_loop is to detect epsilon cycles. 
- for (loop_count = 0; !reprocess.empty() && loop_count < max_loop; - ++loop_count) { - std::vector reprocess_vec; - for (typename unordered_set::iterator iter = reprocess.begin(); - iter != reprocess.end(); ++iter) - reprocess_vec.push_back(*iter); - reprocess.clear(); - for (typename std::vector::iterator iter = reprocess_vec.begin(); - iter != reprocess_vec.end(); ++iter) { - Token *tok = *iter; - int32 pos = token2pos[tok]; - // Repeat the processing we did above (for comments, see above). - for (ForwardLinkT *link = tok->links; link != NULL; link = link->next) { - if (link->ilabel == 0) { - IterType following_iter = token2pos.find(link->next_tok); - if (following_iter != token2pos.end()) { - int32 next_pos = following_iter->second; - if (next_pos < pos) { - following_iter->second = cur_pos++; - reprocess.insert(link->next_tok); - } - } - } - } - } - } - KALDI_ASSERT(loop_count < max_loop && - "Epsilon loops exist in your decoding " - "graph (this is not allowed!)"); - - topsorted_list->clear(); - topsorted_list->resize(cur_pos, - NULL); // create a list with NULLs in between. - for (IterType iter = token2pos.begin(); iter != token2pos.end(); ++iter) - (*topsorted_list)[iter->second] = iter->first; -} - -// Instantiate the template for the combination of token types and FST types -// that we'll need. -template class LatticeFasterDecoderTpl, - decoder::StdToken>; -template class LatticeFasterDecoderTpl, - decoder::StdToken>; -template class LatticeFasterDecoderTpl, - decoder::StdToken>; - -// template class LatticeFasterDecoderTpl; template class -// LatticeFasterDecoderTpl; - -template class LatticeFasterDecoderTpl, - decoder::BackpointerToken>; -template class LatticeFasterDecoderTpl, - decoder::BackpointerToken>; -template class LatticeFasterDecoderTpl, - decoder::BackpointerToken>; -// template class LatticeFasterDecoderTpl; template class -// LatticeFasterDecoderTpl; - -} // end namespace kaldi. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/decoder/lattice-faster-decoder.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/decoder/lattice-faster-decoder.h deleted file mode 100644 index 0152b85447e354b770745b748d266b1ca2d57024..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/decoder/lattice-faster-decoder.h +++ /dev/null @@ -1,558 +0,0 @@ -// decoder/lattice-faster-decoder.h - -// Copyright 2009-2013 Microsoft Corporation; Mirko Hannemann; -// 2013-2014 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2018 Zhehuai Chen -// 2021 Binbin Zhang, Zhendong Peng - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef KALDI_DECODER_LATTICE_FASTER_DECODER_H_ -#define KALDI_DECODER_LATTICE_FASTER_DECODER_H_ - -#include -#include -#include -#include - -#include "base/kaldi-common.h" -#include "decoder/context_graph.h" -#include "fst/fstlib.h" -#include "fstext/fstext-lib.h" -#include "itf/decodable-itf.h" -#include "lat/determinize-lattice-pruned.h" -#include "lat/kaldi-lattice.h" -#include "util/hash-list.h" - -namespace kaldi { - -struct LatticeFasterDecoderConfig { - BaseFloat beam; - int32 max_active; - int32 min_active; - BaseFloat lattice_beam; - int32 prune_interval; - bool determinize_lattice; // not inspected by this class... used in - // command-line program. - BaseFloat beam_delta; - BaseFloat hash_ratio; - // Note: we don't make prune_scale configurable on the command line, it's not - // a very important parameter. It affects the algorithm that prunes the - // tokens as we go. - BaseFloat prune_scale; - BaseFloat length_penalty; // for balancing the del/ins ratio, suggested -3.0 - - // Most of the options inside det_opts are not actually queried by the - // LatticeFasterDecoder class itself, but by the code that calls it, for - // example in the function DecodeUtteranceLatticeFaster. - fst::DeterminizeLatticePhonePrunedOptions det_opts; - - LatticeFasterDecoderConfig() - : beam(16.0), - max_active(std::numeric_limits::max()), - min_active(200), - lattice_beam(10.0), - prune_interval(25), - determinize_lattice(true), - beam_delta(0.5), - hash_ratio(2.0), - prune_scale(0.1), - length_penalty(0.0) {} - void Register(OptionsItf *opts) { - det_opts.Register(opts); - opts->Register("beam", &beam, - "Decoding beam. Larger->slower, more accurate."); - opts->Register("max-active", &max_active, - "Decoder max active states. Larger->slower; " - "more accurate"); - opts->Register("min-active", &min_active, - "Decoder minimum #active states."); - opts->Register("lattice-beam", &lattice_beam, - "Lattice generation beam. Larger->slower, " - "and deeper lattices"); - opts->Register("prune-interval", &prune_interval, - "Interval (in frames) at " - "which to prune tokens"); - opts->Register( - "determinize-lattice", &determinize_lattice, - "If true, " - "determinize the lattice (lattice-determinization, keeping only " - "best pdf-sequence for each word-sequence)."); - opts->Register( - "beam-delta", &beam_delta, - "Increment used in decoding-- this " - "parameter is obscure and relates to a speedup in the way the " - "max-active constraint is applied. Larger is more accurate."); - opts->Register("hash-ratio", &hash_ratio, - "Setting used in decoder to " - "control hash behavior"); - } - void Check() const { - KALDI_ASSERT(beam > 0.0 && max_active > 1 && lattice_beam > 0.0 && - min_active <= max_active && prune_interval > 0 && - beam_delta > 0.0 && hash_ratio >= 1.0 && prune_scale > 0.0 && - prune_scale < 1.0); - } -}; - -namespace decoder { -// We will template the decoder on the token type as well as the FST type; this -// is a mechanism so that we can use the same underlying decoder code for -// versions of the decoder that support quickly getting the best path -// (LatticeFasterOnlineDecoder, see lattice-faster-online-decoder.h) and also -// those that do not (LatticeFasterDecoder). - -// ForwardLinks are the links from a token to a token on the next frame. -// or sometimes on the current frame (for input-epsilon links). 
-template -struct ForwardLink { - using Label = fst::StdArc::Label; - - Token *next_tok; // the next token [or NULL if represents final-state] - Label ilabel; // ilabel on arc - Label olabel; // olabel on arc - BaseFloat graph_cost; // graph cost of traversing arc (contains LM, etc.) - BaseFloat acoustic_cost; // acoustic cost (pre-scaled) of traversing arc - bool is_start_boundary; - bool is_end_boundary; - float context_score; - ForwardLink *next; // next in singly-linked list of forward arcs (arcs - // in the state-level lattice) from a token. - inline ForwardLink(Token *next_tok, Label ilabel, Label olabel, - BaseFloat graph_cost, BaseFloat acoustic_cost, - bool is_start_boundary, bool is_end_boundary, - ForwardLink *next) - : next_tok(next_tok), - ilabel(ilabel), - olabel(olabel), - graph_cost(graph_cost), - acoustic_cost(acoustic_cost), - is_start_boundary(is_start_boundary), - is_end_boundary(is_end_boundary), - context_score(0), - next(next) {} -}; - -struct StdToken { - using ForwardLinkT = ForwardLink; - using Token = StdToken; - - // Standard token type for LatticeFasterDecoder. Each active HCLG - // (decoding-graph) state on each frame has one token. - - // tot_cost is the total (LM + acoustic) cost from the beginning of the - // utterance up to this point. (but see cost_offset_, which is subtracted - // to keep it in a good numerical range). - BaseFloat tot_cost; - - // exta_cost is >= 0. After calling PruneForwardLinks, this equals the - // minimum difference between the cost of the best path that this link is a - // part of, and the cost of the absolute best path, under the assumption that - // any of the currently active states at the decoding front may eventually - // succeed (e.g. if you were to take the currently active states one by one - // and compute this difference, and then take the minimum). - BaseFloat extra_cost; - - int context_state = 0; - - // 'links' is the head of singly-linked list of ForwardLinks, which is what we - // use for lattice generation. - ForwardLinkT *links; - - // 'next' is the next in the singly-linked list of tokens for this frame. - Token *next; - - // This function does nothing and should be optimized out; it's needed - // so we can share the regular LatticeFasterDecoderTpl code and the code - // for LatticeFasterOnlineDecoder that supports fast traceback. - inline void SetBackpointer(Token *backpointer) {} - - // This constructor just ignores the 'backpointer' argument. That argument is - // needed so that we can use the same decoder code for LatticeFasterDecoderTpl - // and LatticeFasterOnlineDecoderTpl (which needs backpointers to support a - // fast way to obtain the best path). - inline StdToken(BaseFloat tot_cost, BaseFloat extra_cost, ForwardLinkT *links, - Token *next, Token *backpointer) - : tot_cost(tot_cost), - extra_cost(extra_cost), - links(links), - context_state(0), - next(next) {} -}; - -struct BackpointerToken { - using ForwardLinkT = ForwardLink; - using Token = BackpointerToken; - - // BackpointerToken is like Token but also - // Standard token type for LatticeFasterDecoder. Each active HCLG - // (decoding-graph) state on each frame has one token. - - // tot_cost is the total (LM + acoustic) cost from the beginning of the - // utterance up to this point. (but see cost_offset_, which is subtracted - // to keep it in a good numerical range). - BaseFloat tot_cost; - - // exta_cost is >= 0. 
After calling PruneForwardLinks, this equals - // the minimum difference between the cost of the best path, and the cost of - // this is on, and the cost of the absolute best path, under the assumption - // that any of the currently active states at the decoding front may - // eventually succeed (e.g. if you were to take the currently active states - // one by one and compute this difference, and then take the minimum). - BaseFloat extra_cost; - - int context_state = 0; - - // 'links' is the head of singly-linked list of ForwardLinks, which is what we - // use for lattice generation. - ForwardLinkT *links; - - // 'next' is the next in the singly-linked list of tokens for this frame. - BackpointerToken *next; - - // Best preceding BackpointerToken (could be a on this frame, connected to - // this via an epsilon transition, or on a previous frame). This is only - // required for an efficient GetBestPath function in - // LatticeFasterOnlineDecoderTpl; it plays no part in the lattice generation - // (the "links" list is what stores the forward links, for that). - Token *backpointer; - - inline void SetBackpointer(Token *backpointer) { - this->backpointer = backpointer; - } - - inline BackpointerToken(BaseFloat tot_cost, BaseFloat extra_cost, - ForwardLinkT *links, Token *next, Token *backpointer) - : tot_cost(tot_cost), - extra_cost(extra_cost), - links(links), - next(next), - backpointer(backpointer), - context_state(0) {} -}; - -} // namespace decoder - -/** This is the "normal" lattice-generating decoder. - See \ref lattices_generation \ref decoders_faster and \ref decoders_simple - for more information. - - The decoder is templated on the FST type and the token type. The token type - will normally be StdToken, but also may be BackpointerToken which is to - support quick lookup of the current best path (see - lattice-faster-online-decoder.h) - - The FST you invoke this decoder which is expected to equal - Fst::Fst, a.k.a. StdFst, or GrammarFst. If you invoke it with - FST == StdFst and it notices that the actual FST type is - fst::VectorFst or fst::ConstFst, the decoder object - will internally cast itself to one that is templated on those more specific - types; this is an optimization for speed. - */ -template -class LatticeFasterDecoderTpl { - public: - using Arc = typename FST::Arc; - using Label = typename Arc::Label; - using StateId = typename Arc::StateId; - using Weight = typename Arc::Weight; - using ForwardLinkT = decoder::ForwardLink; - - // Instantiate this class once for each thing you have to decode. - // This version of the constructor does not take ownership of - // 'fst'. - LatticeFasterDecoderTpl( - const FST &fst, const LatticeFasterDecoderConfig &config, - const std::shared_ptr &context_graph); - - // This version of the constructor takes ownership of the fst, and will delete - // it when this object is destroyed. - LatticeFasterDecoderTpl(const LatticeFasterDecoderConfig &config, FST *fst); - - void SetOptions(const LatticeFasterDecoderConfig &config) { - config_ = config; - } - - const LatticeFasterDecoderConfig &GetOptions() const { return config_; } - - ~LatticeFasterDecoderTpl(); - - /// Decodes until there are no more frames left in the "decodable" object.. - /// note, this may block waiting for input if the "decodable" object blocks. - /// Returns true if any kind of traceback is available (not necessarily from a - /// final state). - bool Decode(DecodableInterface *decodable); - - /// says whether a final-state was active on the last frame. 
If it was not, - /// the lattice (or traceback) will end with states that are not final-states. - bool ReachedFinal() const { - return FinalRelativeCost() != std::numeric_limits::infinity(); - } - - /// Outputs an FST corresponding to the single best path through the lattice. - /// Returns true if result is nonempty (using the return status is deprecated, - /// it will become void). If "use_final_probs" is true AND we reached the - /// final-state of the graph then it will include those as final-probs, else - /// it will treat all final-probs as one. Note: this just calls - /// GetRawLattice() and figures out the shortest path. - bool GetBestPath(Lattice *ofst, bool use_final_probs = true) const; - - /// Outputs an FST corresponding to the raw, state-level - /// tracebacks. Returns true if result is nonempty. - /// If "use_final_probs" is true AND we reached the final-state - /// of the graph then it will include those as final-probs, else - /// it will treat all final-probs as one. - /// The raw lattice will be topologically sorted. - /// - /// See also GetRawLatticePruned in lattice-faster-online-decoder.h, - /// which also supports a pruning beam, in case for some reason - /// you want it pruned tighter than the regular lattice beam. - /// We could put that here in future needed. - bool GetRawLattice(Lattice *ofst, bool use_final_probs = true) const; - - /// [Deprecated, users should now use GetRawLattice and determinize it - /// themselves, e.g. using DeterminizeLatticePhonePrunedWrapper]. - /// Outputs an FST corresponding to the lattice-determinized - /// lattice (one path per word sequence). Returns true if result is - /// nonempty. If "use_final_probs" is true AND we reached the final-state of - /// the graph then it will include those as final-probs, else it will treat - /// all final-probs as one. - bool GetLattice(CompactLattice *ofst, bool use_final_probs = true) const; - - /// InitDecoding initializes the decoding, and should only be used if you - /// intend to call AdvanceDecoding(). If you call Decode(), you don't need to - /// call this. You can also call InitDecoding if you have already decoded an - /// utterance and want to start with a new utterance. - void InitDecoding(); - - /// This will decode until there are no more frames ready in the decodable - /// object. You can keep calling it each time more frames become available. - /// If max_num_frames is specified, it specifies the maximum number of frames - /// the function will decode before returning. - void AdvanceDecoding(DecodableInterface *decodable, - int32 max_num_frames = -1); - - /// This function may be optionally called after AdvanceDecoding(), when you - /// do not plan to decode any further. It does an extra pruning step that - /// will help to prune the lattices output by GetLattice and (particularly) - /// GetRawLattice more completely, particularly toward the end of the - /// utterance. If you call this, you cannot call AdvanceDecoding again (it - /// will fail), and you cannot call GetLattice() and related functions with - /// use_final_probs = false. Used to be called PruneActiveTokensFinal(). - void FinalizeDecoding(); - - /// FinalRelativeCost() serves the same purpose as ReachedFinal(), but gives - /// more information. It returns the difference between the best (final-cost - /// plus cost) of any token on the final frame, and the best cost of any token - /// on the final frame. If it is infinity it means no final-states were - /// present on the final frame. It will usually be nonnegative. 
If it not - /// too positive (e.g. < 5 is my first guess, but this is not tested) you can - /// take it as a good indication that we reached the final-state with - /// reasonable likelihood. - BaseFloat FinalRelativeCost() const; - - // Returns the number of frames decoded so far. The value returned changes - // whenever we call ProcessEmitting(). - inline int32 NumFramesDecoded() const { return active_toks_.size() - 1; } - - protected: - // we make things protected instead of private, as code in - // LatticeFasterOnlineDecoderTpl, which inherits from this, also uses the - // internals. - - // Deletes the elements of the singly linked list tok->links. - inline static void DeleteForwardLinks(Token *tok); - - // head of per-frame list of Tokens (list is in topological order), - // and something saying whether we ever pruned it using PruneForwardLinks. - struct TokenList { - Token *toks; - bool must_prune_forward_links; - bool must_prune_tokens; - TokenList() - : toks(NULL), must_prune_forward_links(true), must_prune_tokens(true) {} - }; - - using Elem = typename HashList::Elem; - // Equivalent to: - // struct Elem { - // StateId key; - // Token *val; - // Elem *tail; - // }; - - void PossiblyResizeHash(size_t num_toks); - - // FindOrAddToken either locates a token in hash of toks_, or if necessary - // inserts a new, empty token (i.e. with no forward links) for the current - // frame. [note: it's inserted if necessary into hash toks_ and also into the - // singly linked list of tokens active on this frame (whose head is at - // active_toks_[frame]). The frame_plus_one argument is the acoustic frame - // index plus one, which is used to index into the active_toks_ array. - // Returns the Token pointer. Sets "changed" (if non-NULL) to true if the - // token was newly created or the cost changed. - // If Token == StdToken, the 'backpointer' argument has no purpose (and will - // hopefully be optimized out). - inline Elem *FindOrAddToken(StateId state, int32 frame_plus_one, - BaseFloat tot_cost, Token *backpointer, - bool *changed); - - // prunes outgoing links for all tokens in active_toks_[frame] - // it's called by PruneActiveTokens - // all links, that have link_extra_cost > lattice_beam are pruned - // delta is the amount by which the extra_costs must change - // before we set *extra_costs_changed = true. - // If delta is larger, we'll tend to go back less far - // toward the beginning of the file. - // extra_costs_changed is set to true if extra_cost was changed for any token - // links_pruned is set to true if any link in any token was pruned - void PruneForwardLinks(int32 frame_plus_one, bool *extra_costs_changed, - bool *links_pruned, BaseFloat delta); - - // This function computes the final-costs for tokens active on the final - // frame. It outputs to final-costs, if non-NULL, a map from the Token* - // pointer to the final-prob of the corresponding state, for all Tokens - // that correspond to states that have final-probs. This map will be - // empty if there were no final-probs. It outputs to - // final_relative_cost, if non-NULL, the difference between the best - // forward-cost including the final-prob cost, and the best forward-cost - // without including the final-prob cost (this will usually be positive), or - // infinity if there were no final-probs. [c.f. FinalRelativeCost(), which - // outputs this quanitity]. 
It outputs to final_best_cost, if - // non-NULL, the lowest for any token t active on the final frame, of - // forward-cost[t] + final-cost[t], where final-cost[t] is the final-cost in - // the graph of the state corresponding to token t, or the best of - // forward-cost[t] if there were no final-probs active on the final frame. - // You cannot call this after FinalizeDecoding() has been called; in that - // case you should get the answer from class-member variables. - void ComputeFinalCosts(unordered_map *final_costs, - BaseFloat *final_relative_cost, - BaseFloat *final_best_cost) const; - - // PruneForwardLinksFinal is a version of PruneForwardLinks that we call - // on the final frame. If there are final tokens active, it uses - // the final-probs for pruning, otherwise it treats all tokens as final. - void PruneForwardLinksFinal(); - - // Prune away any tokens on this frame that have no forward links. - // [we don't do this in PruneForwardLinks because it would give us - // a problem with dangling pointers]. - // It's called by PruneActiveTokens if any forward links have been pruned - void PruneTokensForFrame(int32 frame_plus_one); - - // Go backwards through still-alive tokens, pruning them if the - // forward+backward cost is more than lat_beam away from the best path. It's - // possible to prove that this is "correct" in the sense that we won't lose - // anything outside of lat_beam, regardless of what happens in the future. - // delta controls when it considers a cost to have changed enough to continue - // going backward and propagating the change. larger delta -> will recurse - // less far. - void PruneActiveTokens(BaseFloat delta); - - /// Gets the weight cutoff. Also counts the active tokens. - BaseFloat GetCutoff(Elem *list_head, size_t *tok_count, - BaseFloat *adaptive_beam, Elem **best_elem); - - /// Processes emitting arcs for one frame. Propagates from prev_toks_ to - /// cur_toks_. Returns the cost cutoff for subsequent ProcessNonemitting() to - /// use. - BaseFloat ProcessEmitting(DecodableInterface *decodable); - - /// Processes nonemitting (epsilon) arcs for one frame. Called after - /// ProcessEmitting() on each frame. The cost cutoff is computed by the - /// preceding ProcessEmitting(). - void ProcessNonemitting(BaseFloat cost_cutoff); - - // HashList defined in ../util/hash-list.h. It actually allows us to maintain - // more than one list (e.g. for current and previous frames), but only one of - // them at a time can be indexed by StateId. It is indexed by frame-index - // plus one, where the frame-index is zero-based, as used in decodable object. - // That is, the emitting probs of frame t are accounted for in tokens at - // toks_[t+1]. The zeroth frame is for nonemitting transition at the start of - // the graph. - HashList toks_; - - std::vector active_toks_; // Lists of tokens, indexed by - // frame (members of TokenList are toks, must_prune_forward_links, - // must_prune_tokens). - std::vector - queue_; // temp variable used in ProcessNonemitting, - std::vector tmp_array_; // used in GetCutoff. - - // fst_ is a pointer to the FST we are decoding from. - const FST *fst_; - // delete_fst_ is true if the pointer fst_ needs to be deleted when this - // object is destroyed. - bool delete_fst_; - - std::vector cost_offsets_; // This contains, for each - // frame, an offset that was added to the acoustic log-likelihoods on that - // frame in order to keep everything in a nice dynamic range i.e. close to - // zero, to reduce roundoff errors. 
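A minimal standalone sketch of the per-frame cost-offset trick described in the comment above, with toy costs and my own variable names rather than the decoder's internals:

```cpp
// Standalone illustration of the cost-offset idea: subtract each frame's best
// acoustic cost while accumulating token costs (keeping them near zero and
// reducing roundoff), remember the offsets, and add them back when the true
// costs are needed, e.g. when writing lattice arc weights.
#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  // Fake acoustic costs (-log likelihoods) for 3 frames x 2 arcs.
  std::vector<std::vector<double>> acoustic_cost = {
      {1050.2, 1051.9}, {1000.1, 998.4}, {1023.0, 1027.5}};

  std::vector<double> cost_offsets;  // analogous role to cost_offsets_
  double running_cost = 0.0;         // cost of a path that follows arc 0

  for (const auto& frame : acoustic_cost) {
    double best = *std::min_element(frame.begin(), frame.end());
    cost_offsets.push_back(best);    // remember what was subtracted
    running_cost += frame[0] - best; // accumulate arc 0's cost minus the offset
  }

  // Recover the true path cost by adding the offsets back.
  double true_cost = running_cost;
  for (double off : cost_offsets) true_cost += off;

  std::cout << "offset-relative cost: " << running_cost << "\n";  // stays small
  std::cout << "true cost:            " << true_cost << "\n";
}
```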
- LatticeFasterDecoderConfig config_; - int32 num_toks_; // current total #toks allocated... - bool warned_; - - /// decoding_finalized_ is true if someone called FinalizeDecoding(). [note, - /// calling this is optional]. If true, it's forbidden to decode more. Also, - /// if this is set, then the output of ComputeFinalCosts() is in the next - /// three variables. The reason we need to do this is that after - /// FinalizeDecoding() calls PruneTokensForFrame() for the final frame, some - /// of the tokens on the last frame are freed, so we free the list from toks_ - /// to avoid having dangling pointers hanging around. - bool decoding_finalized_; - /// For the meaning of the next 3 variables, see the comment for - /// decoding_finalized_ above., and ComputeFinalCosts(). - unordered_map final_costs_; - BaseFloat final_relative_cost_; - BaseFloat final_best_cost_; - - std::shared_ptr context_graph_ = nullptr; - - // There are various cleanup tasks... the toks_ structure contains - // singly linked lists of Token pointers, where Elem is the list type. - // It also indexes them in a hash, indexed by state (this hash is only - // maintained for the most recent frame). toks_.Clear() - // deletes them from the hash and returns the list of Elems. The - // function DeleteElems calls toks_.Delete(elem) for each elem in - // the list, which returns ownership of the Elem to the toks_ structure - // for reuse, but does not delete the Token pointer. The Token pointers - // are reference-counted and are ultimately deleted in PruneTokensForFrame, - // but are also linked together on each frame by their own linked-list, - // using the "next" pointer. We delete them manually. - void DeleteElems(Elem *list); - - // This function takes a singly linked list of tokens for a single frame, and - // outputs a list of them in topological order (it will crash if no such order - // can be found, which will typically be due to decoding graphs with epsilon - // cycles, which are not allowed). Note: the output list may contain NULLs, - // which the caller should pass over; it just happens to be more efficient for - // the algorithm to output a list that contains NULLs. - static void TopSortTokens(Token *tok_list, - std::vector *topsorted_list); - - void ClearActiveTokens(); - - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeFasterDecoderTpl); -}; - -typedef LatticeFasterDecoderTpl - LatticeFasterDecoder; - -} // end namespace kaldi. - -#endif // KALDI_DECODER_LATTICE_FASTER_DECODER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/decoder/lattice-faster-online-decoder.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/decoder/lattice-faster-online-decoder.cc deleted file mode 100644 index 2345b4d129ff905784762e973bad279f2fb55d31..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/decoder/lattice-faster-online-decoder.cc +++ /dev/null @@ -1,278 +0,0 @@ -// decoder/lattice-faster-online-decoder.cc - -// Copyright 2009-2012 Microsoft Corporation Mirko Hannemann -// 2013-2014 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2014 IMSL, PKU-HKUST (author: Wei Shi) -// 2018 Zhehuai Chen - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -// see note at the top of lattice-faster-decoder.cc, about how to maintain this -// file in sync with lattice-faster-decoder.cc - -#include -#include -#include -#include - -#include "decoder/lattice-faster-online-decoder.h" - -namespace kaldi { - -template -bool LatticeFasterOnlineDecoderTpl::TestGetBestPath( - bool use_final_probs) const { - Lattice lat1; - { - Lattice raw_lat; - this->GetRawLattice(&raw_lat, use_final_probs); - ShortestPath(raw_lat, &lat1); - } - Lattice lat2; - GetBestPath(&lat2, use_final_probs); - BaseFloat delta = 0.1; - int32 num_paths = 1; - if (!fst::RandEquivalent(lat1, lat2, num_paths, delta, rand())) { - KALDI_WARN << "Best-path test failed"; - return false; - } else { - return true; - } -} - -// Outputs an FST corresponding to the single best path through the lattice. -template -bool LatticeFasterOnlineDecoderTpl::GetBestPath( - Lattice *olat, bool use_final_probs) const { - olat->DeleteStates(); - BaseFloat final_graph_cost; - BestPathIterator iter = BestPathEnd(use_final_probs, &final_graph_cost); - if (iter.Done()) return false; // would have printed warning. - StateId state = olat->AddState(); - olat->SetFinal(state, LatticeWeight(final_graph_cost, 0.0)); - while (!iter.Done()) { - LatticeArc arc; - iter = TraceBackBestPath(iter, &arc); - arc.nextstate = state; - StateId new_state = olat->AddState(); - olat->AddArc(new_state, arc); - state = new_state; - } - olat->SetStart(state); - return true; -} - -template -typename LatticeFasterOnlineDecoderTpl::BestPathIterator -LatticeFasterOnlineDecoderTpl::BestPathEnd( - bool use_final_probs, BaseFloat *final_cost_out) const { - if (this->decoding_finalized_ && !use_final_probs) - KALDI_ERR << "You cannot call FinalizeDecoding() and then call " - << "BestPathEnd() with use_final_probs == false"; - KALDI_ASSERT(this->NumFramesDecoded() > 0 && - "You cannot call BestPathEnd if no frames were decoded."); - - unordered_map final_costs_local; - - const unordered_map &final_costs = - (this->decoding_finalized_ ? this->final_costs_ : final_costs_local); - if (!this->decoding_finalized_ && use_final_probs) - this->ComputeFinalCosts(&final_costs_local, NULL, NULL); - - // Singly linked list of tokens on last frame (access list through "next" - // pointer). - BaseFloat best_cost = std::numeric_limits::infinity(); - BaseFloat best_final_cost = 0; - Token *best_tok = NULL; - for (Token *tok = this->active_toks_.back().toks; tok != NULL; - tok = tok->next) { - BaseFloat cost = tok->tot_cost, final_cost = 0.0; - if (use_final_probs && !final_costs.empty()) { - // if we are instructed to use final-probs, and any final tokens were - // active on final frame, include the final-prob in the cost of the token. 
- typename unordered_map::const_iterator iter = - final_costs.find(tok); - if (iter != final_costs.end()) { - final_cost = iter->second; - cost += final_cost; - } else { - cost = std::numeric_limits::infinity(); - } - } - if (cost < best_cost) { - best_cost = cost; - best_tok = tok; - best_final_cost = final_cost; - } - } - if (best_tok == - NULL) { // this should not happen, and is likely a code error or - // caused by infinities in likelihoods, but I'm not making - // it a fatal error for now. - KALDI_WARN << "No final token found."; - } - if (final_cost_out) *final_cost_out = best_final_cost; - return BestPathIterator(best_tok, this->NumFramesDecoded() - 1); -} - -template -typename LatticeFasterOnlineDecoderTpl::BestPathIterator -LatticeFasterOnlineDecoderTpl::TraceBackBestPath(BestPathIterator iter, - LatticeArc *oarc) const { - KALDI_ASSERT(!iter.Done() && oarc != NULL); - Token *tok = static_cast(iter.tok); - int32 cur_t = iter.frame, step_t = 0; - if (tok->backpointer != NULL) { - // retrieve the correct forward link(with the best link cost) - BaseFloat best_cost = std::numeric_limits::infinity(); - ForwardLinkT *link; - for (link = tok->backpointer->links; link != NULL; link = link->next) { - if (link->next_tok == tok) { // this is a link to "tok" - BaseFloat graph_cost = link->graph_cost, - acoustic_cost = link->acoustic_cost; - BaseFloat cost = graph_cost + acoustic_cost; - if (cost < best_cost) { - oarc->ilabel = link->ilabel; - oarc->olabel = link->olabel; - if (link->ilabel != 0) { - KALDI_ASSERT(static_cast(cur_t) < - this->cost_offsets_.size()); - acoustic_cost -= this->cost_offsets_[cur_t]; - step_t = -1; - } else { - step_t = 0; - } - oarc->weight = LatticeWeight(graph_cost, acoustic_cost); - best_cost = cost; - } - } - } - if (link == NULL && - best_cost == - std::numeric_limits::infinity()) { // Did not find - // correct link. - KALDI_ERR << "Error tracing best-path back (likely " - << "bug in token-pruning algorithm)"; - } - } else { - oarc->ilabel = 0; - oarc->olabel = 0; - oarc->weight = LatticeWeight::One(); // zero costs. - } - return BestPathIterator(tok->backpointer, cur_t + step_t); -} - -template -bool LatticeFasterOnlineDecoderTpl::GetRawLatticePruned( - Lattice *ofst, bool use_final_probs, BaseFloat beam) const { - typedef LatticeArc Arc; - typedef Arc::StateId StateId; - typedef Arc::Weight Weight; - typedef Arc::Label Label; - - // Note: you can't use the old interface (Decode()) if you want to - // get the lattice with use_final_probs = false. You'd have to do - // InitDecoding() and then AdvanceDecoding(). - if (this->decoding_finalized_ && !use_final_probs) - KALDI_ERR << "You cannot call FinalizeDecoding() and then call " - << "GetRawLattice() with use_final_probs == false"; - - unordered_map final_costs_local; - - const unordered_map &final_costs = - (this->decoding_finalized_ ? this->final_costs_ : final_costs_local); - if (!this->decoding_finalized_ && use_final_probs) - this->ComputeFinalCosts(&final_costs_local, NULL, NULL); - - ofst->DeleteStates(); - // num-frames plus one (since frames are one-based, and we have - // an extra frame for the start-state). - int32 num_frames = this->active_toks_.size() - 1; - KALDI_ASSERT(num_frames > 0); - for (int32 f = 0; f <= num_frames; f++) { - if (this->active_toks_[f].toks == NULL) { - KALDI_WARN << "No tokens active on frame " << f - << ": not producing lattice.\n"; - return false; - } - } - unordered_map tok_map; - std::queue > tok_queue; - // First initialize the queue and states. 
Put the initial state on the queue; - // this is the last token in the list active_toks_[0].toks. - for (Token *tok = this->active_toks_[0].toks; tok != NULL; tok = tok->next) { - if (tok->next == NULL) { - tok_map[tok] = ofst->AddState(); - ofst->SetStart(tok_map[tok]); - std::pair tok_pair(tok, 0); // #frame = 0 - tok_queue.push(tok_pair); - } - } - - // Next create states for "good" tokens - while (!tok_queue.empty()) { - std::pair cur_tok_pair = tok_queue.front(); - tok_queue.pop(); - Token *cur_tok = cur_tok_pair.first; - int32 cur_frame = cur_tok_pair.second; - KALDI_ASSERT(cur_frame >= 0 && cur_frame <= this->cost_offsets_.size()); - - typename unordered_map::const_iterator iter = - tok_map.find(cur_tok); - KALDI_ASSERT(iter != tok_map.end()); - StateId cur_state = iter->second; - - for (ForwardLinkT *l = cur_tok->links; l != NULL; l = l->next) { - Token *next_tok = l->next_tok; - if (next_tok->extra_cost < beam) { - // so both the current and the next token are good; create the arc - int32 next_frame = l->ilabel == 0 ? cur_frame : cur_frame + 1; - StateId nextstate; - if (tok_map.find(next_tok) == tok_map.end()) { - nextstate = tok_map[next_tok] = ofst->AddState(); - tok_queue.push(std::pair(next_tok, next_frame)); - } else { - nextstate = tok_map[next_tok]; - } - BaseFloat cost_offset = - (l->ilabel != 0 ? this->cost_offsets_[cur_frame] : 0); - Arc arc(l->ilabel, l->olabel, - Weight(l->graph_cost, l->acoustic_cost - cost_offset), - nextstate); - ofst->AddArc(cur_state, arc); - } - } - if (cur_frame == num_frames) { - if (use_final_probs && !final_costs.empty()) { - typename unordered_map::const_iterator iter = - final_costs.find(cur_tok); - if (iter != final_costs.end()) - ofst->SetFinal(cur_state, LatticeWeight(iter->second, 0)); - } else { - ofst->SetFinal(cur_state, LatticeWeight::One()); - } - } - } - return (ofst->NumStates() != 0); -} - -// Instantiate the template for the FST types that we'll need. -template class LatticeFasterOnlineDecoderTpl >; -template class LatticeFasterOnlineDecoderTpl >; -template class LatticeFasterOnlineDecoderTpl >; - -} // end namespace kaldi. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/decoder/lattice-faster-online-decoder.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/decoder/lattice-faster-online-decoder.h deleted file mode 100644 index dc50cfa73e6574e9625eda9045c47f674fcbc1e3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/decoder/lattice-faster-online-decoder.h +++ /dev/null @@ -1,131 +0,0 @@ -// decoder/lattice-faster-online-decoder.h - -// Copyright 2009-2013 Microsoft Corporation; Mirko Hannemann; -// 2013-2014 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2018 Zhehuai Chen - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. 
-// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -// see note at the top of lattice-faster-decoder.h, about how to maintain this -// file in sync with lattice-faster-decoder.h - -#ifndef KALDI_DECODER_LATTICE_FASTER_ONLINE_DECODER_H_ -#define KALDI_DECODER_LATTICE_FASTER_ONLINE_DECODER_H_ - -#include "decoder/lattice-faster-decoder.h" - -#include - -namespace kaldi { - -/** LatticeFasterOnlineDecoderTpl is as LatticeFasterDecoderTpl but also - supports an efficient way to get the best path (see the function - BestPathEnd()), which is useful in endpointing and in situations where you - might want to frequently access the best path. - - This is only templated on the FST type, since the Token type is required to - be BackpointerToken. Actually it only makes sense to instantiate - LatticeFasterDecoderTpl with Token == BackpointerToken if you do so - indirectly via this child class. - */ -template -class LatticeFasterOnlineDecoderTpl - : public LatticeFasterDecoderTpl { - public: - using Arc = typename FST::Arc; - using Label = typename Arc::Label; - using StateId = typename Arc::StateId; - using Weight = typename Arc::Weight; - using Token = decoder::BackpointerToken; - using ForwardLinkT = decoder::ForwardLink; - - // Instantiate this class once for each thing you have to decode. - // This version of the constructor does not take ownership of - // 'fst'. - LatticeFasterOnlineDecoderTpl( - const FST &fst, const LatticeFasterDecoderConfig &config, - const std::shared_ptr &context_graph) - : LatticeFasterDecoderTpl(fst, config, context_graph) {} - - // This version of the initializer takes ownership of 'fst', and will delete - // it when this object is destroyed. - LatticeFasterOnlineDecoderTpl(const LatticeFasterDecoderConfig &config, - FST *fst) - : LatticeFasterDecoderTpl(config, fst) {} - - struct BestPathIterator { - void *tok; - int32 frame; - // note, "frame" is the frame-index of the frame you'll get the - // transition-id for next time, if you call TraceBackBestPath on this - // iterator (assuming it's not an epsilon transition). Note that this - // is one less than you might reasonably expect, e.g. it's -1 for - // the nonemitting transitions before the first frame. - BestPathIterator(void *t, int32 f) : tok(t), frame(f) {} - bool Done() const { return tok == NULL; } - }; - - /// Outputs an FST corresponding to the single best path through the lattice. - /// This is quite efficient because it doesn't get the entire raw lattice and - /// find the best path through it; instead, it uses the BestPathEnd and - /// BestPathIterator so it basically traces it back through the lattice. - /// Returns true if result is nonempty (using the return status is deprecated, - /// it will become void). If "use_final_probs" is true AND we reached the - /// final-state of the graph then it will include those as final-probs, else - /// it will treat all final-probs as one. - bool GetBestPath(Lattice *ofst, bool use_final_probs = true) const; - - /// This function does a self-test of GetBestPath(). Returns true on - /// success; returns false and prints a warning on failure. - bool TestGetBestPath(bool use_final_probs = true) const; - - /// This function returns an iterator that can be used to trace back - /// the best path. 
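A minimal standalone sketch of the backpointer-based traceback that BestPathEnd()/TraceBackBestPath() are documented to perform; the `Tok` struct and labels below are simplified stand-ins, not the decoder's token type:

```cpp
// Standalone illustration: start from the best token on the last frame and
// follow backpointers one link at a time, then reverse to get the path in
// forward order -- the same shape of traversal the iterator API describes.
#include <iostream>
#include <string>
#include <vector>

struct Tok {
  const Tok* backpointer;  // token on the previous frame (nullptr at the start)
  std::string label;       // label on the arc that reached this token
};

int main() {
  // Chain: start -> "a" -> "b" -> "c"; "c" is the best token on the last frame.
  Tok start{nullptr, ""}, a{&start, "a"}, b{&a, "b"}, best{&b, "c"};

  std::vector<std::string> labels;
  for (const Tok* it = &best; it->backpointer != nullptr; it = it->backpointer)
    labels.push_back(it->label);                 // collects c, b, a
  for (auto rit = labels.rbegin(); rit != labels.rend(); ++rit)
    std::cout << *rit << " ";                    // prints: a b c
  std::cout << "\n";
}
```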
If use_final_probs == true and at least one final state - /// survived till the end, it will use the final-probs in working out the best - /// final Token, and will output the final cost to *final_cost (if non-NULL), - /// else it will use only the forward likelihood, and will put zero in - /// *final_cost (if non-NULL). - /// Requires that NumFramesDecoded() > 0. - BestPathIterator BestPathEnd(bool use_final_probs, - BaseFloat *final_cost = NULL) const; - - /// This function can be used in conjunction with BestPathEnd() to trace back - /// the best path one link at a time (e.g. this can be useful in endpoint - /// detection). By "link" we mean a link in the graph; not all links cross - /// frame boundaries, but each time you see a nonzero ilabel you can interpret - /// that as a frame. The return value is the updated iterator. It outputs - /// the ilabel and olabel, and the (graph and acoustic) weight to the "arc" - /// pointer, while leaving its "nextstate" variable unchanged. - BestPathIterator TraceBackBestPath(BestPathIterator iter, - LatticeArc *arc) const; - - /// Behaves the same as GetRawLattice but only processes tokens whose - /// extra_cost is smaller than the best-cost plus the specified beam. - /// It is only worthwhile to call this function if beam is less than - /// the lattice_beam specified in the config; otherwise, it would - /// return essentially the same thing as GetRawLattice, but more slowly. - bool GetRawLatticePruned(Lattice *ofst, bool use_final_probs, - BaseFloat beam) const; - - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeFasterOnlineDecoderTpl); -}; - -typedef LatticeFasterOnlineDecoderTpl LatticeFasterOnlineDecoder; - -} // end namespace kaldi. - -#endif // KALDI_DECODER_LATTICE_FASTER_ONLINE_DECODER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/fstbin/fstaddselfloops.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/fstbin/fstaddselfloops.cc deleted file mode 100644 index 145bf006f2324136c5fea4a8d0012a7a4126c646..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/fstbin/fstaddselfloops.cc +++ /dev/null @@ -1,100 +0,0 @@ -// fstbin/fstaddselfloops.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/determinize-star.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/parse-options.h" -#include "util/simple-io-funcs.h" - -/* some test examples: - pushd ~/tmpdir - ( echo 3; echo 4) > in.list - ( echo 5; echo 6) > out.list - ( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstaddselfloops in.list out.list - | fstprint ( echo "0 1 0 1"; echo " 0 2 1 0"; echo "1 0"; echo "2 0"; ) | - fstcompile | fstaddselfloops in.list out.list | fstprint -*/ - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Adds self-loops to states of an FST to propagate disambiguation " - "symbols through it\n" - "They are added on each final state and each state with non-epsilon " - "output symbols\n" - "on at least one arc out of the state. Useful in conjunction with " - "predeterminize\n" - "\n" - "Usage: fstaddselfloops in-disambig-list out-disambig-list [in.fst " - "[out.fst] ]\n" - "E.g: fstaddselfloops in.list out.list < in.fst > withloops.fst\n" - "in.list and out.list are lists of integers, one per line, of the\n" - "same length.\n"; - - ParseOptions po(usage); - po.Read(argc, argv); - - if (po.NumArgs() < 2 || po.NumArgs() > 4) { - po.PrintUsage(); - exit(1); - } - - std::string disambig_in_rxfilename = po.GetArg(1), - disambig_out_rxfilename = po.GetArg(2), - fst_in_filename = po.GetOptArg(3), - fst_out_filename = po.GetOptArg(4); - - VectorFst *fst = ReadFstKaldi(fst_in_filename); - - std::vector disambig_in; - if (!ReadIntegerVectorSimple(disambig_in_rxfilename, &disambig_in)) - KALDI_ERR - << "fstaddselfloops: Could not read disambiguation symbols from " - << kaldi::PrintableRxfilename(disambig_in_rxfilename); - - std::vector disambig_out; - if (!ReadIntegerVectorSimple(disambig_out_rxfilename, &disambig_out)) - KALDI_ERR - << "fstaddselfloops: Could not read disambiguation symbols from " - << kaldi::PrintableRxfilename(disambig_out_rxfilename); - - if (disambig_in.size() != disambig_out.size()) - KALDI_ERR - << "fstaddselfloops: mismatch in size of disambiguation symbols"; - - AddSelfLoops(fst, disambig_in, disambig_out); - - WriteFstKaldi(*fst, fst_out_filename); - - delete fst; - - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/fstbin/fstdeterminizestar.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/fstbin/fstdeterminizestar.cc deleted file mode 100644 index e818143025c0fd5d389c28c77715d65711fe63f1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/fstbin/fstdeterminizestar.cc +++ /dev/null @@ -1,114 +0,0 @@ -// fstbin/fstdeterminizestar.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/determinize-star.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/parse-options.h" -#if !defined(_MSC_VER) && !defined(__APPLE__) -#include // Comment this line and the call to signal below if -// it causes compilation problems. It is only to enable a debugging procedure -// when determinization does not terminate. We are disabling this code if -// compiling on Windows because signal.h is not available there, and on -// MacOS due to a problem with in the initial release of Sierra. -#endif - -/* some test examples: - ( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint - ( echo "0 0 1 0"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint - ( echo "0 0 1 0"; echo "0 1 1 0"; echo "0 0" ) | fstcompile | - fstdeterminizestar | fstprint # this last one fails [correctly]: ( echo "0 0 0 - 1"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint - - cd ~/tmpdir - while true; do - fstrand > 1.fst - fstpredeterminize out.lst 1.fst | fstdeterminizestar | fstrmsymbols out.lst - > 2.fst fstequivalent --random=true 1.fst 2.fst || echo "Test failed" echo -n - "." done - - Test of debugging [with non-determinizable input]: - ( echo " 0 0 1 0 1.0"; echo "0 1 1 0"; echo "1 1 1 0 0"; echo "0 2 2 0"; echo - "2"; echo "1" ) | fstcompile | fstdeterminizestar kill -SIGUSR1 [the process-id - of fstdeterminizestar] # prints out a bunch of debugging output showing the - mess it got itself into. -*/ - -bool debug_location = false; -void signal_handler(int) { debug_location = true; } - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Removes epsilons and determinizes in one step\n" - "\n" - "Usage: fstdeterminizestar [in.fst [out.fst] ]\n" - "\n" - "See also: fstdeterminizelog, lattice-determinize\n"; - - float delta = kDelta; - int max_states = -1; - bool use_log = false; - ParseOptions po(usage); - po.Register("use-log", &use_log, "Determinize in log semiring."); - po.Register("delta", &delta, - "Delta value used to determine equivalence of weights."); - po.Register( - "max-states", &max_states, - "Maximum number of states in determinized FST before it will abort."); - po.Read(argc, argv); - - if (po.NumArgs() > 2) { - po.PrintUsage(); - exit(1); - } - - std::string fst_in_str = po.GetOptArg(1), fst_out_str = po.GetOptArg(2); - - // This enables us to get traceback info from determinization that is - // not seeming to terminate. -#if !defined(_MSC_VER) && !defined(__APPLE__) - signal(SIGUSR1, signal_handler); -#endif - // Normal case: just files. - VectorFst *fst = ReadFstKaldi(fst_in_str); - - ArcSort(fst, ILabelCompare()); // improves speed. 
- if (use_log) { - DeterminizeStarInLog(fst, delta, &debug_location, max_states); - } else { - VectorFst det_fst; - DeterminizeStar(*fst, &det_fst, delta, &debug_location, max_states); - *fst = det_fst; // will do shallow copy and then det_fst goes - // out of scope anyway. - } - WriteFstKaldi(*fst, fst_out_str); - delete fst; - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/fstbin/fstisstochastic.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/fstbin/fstisstochastic.cc deleted file mode 100644 index 468ed0daa7d37cb9a25cf25264f86e48e137b975..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/fstbin/fstisstochastic.cc +++ /dev/null @@ -1,91 +0,0 @@ -// fstbin/fstisstochastic.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/kaldi-io.h" -#include "util/parse-options.h" - -// e.g. of test: -// echo " 0 0" | fstcompile | fstisstochastic -// should return 0 and print "0 0" [meaning, min and -// max weight are one = exp(0)] -// echo " 0 1" | fstcompile | fstisstochastic -// should return 1, not stochastic, and print 1 1 -// (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo "1 0" ) | -// fstcompile | fstisstochastic should return 0, stochastic; it prints "0 -// -1.78e-07" for me (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo -// "1 0" ) | fstcompile | fstisstochastic --test-in-log=false should return 1, -// not stochastic in tropical; it prints "0 0.693147" for me (echo "0 0 0 0 0 "; -// echo "0 1 0 0 0 "; echo "1 0" ) | fstcompile | fstisstochastic -// --test-in-log=false should return 0, stochastic in tropical; it prints "0 0" -// for me (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo "1 0" ) | -// fstcompile | fstisstochastic --test-in-log=false --delta=1 returns 0 even -// though not stochastic because we gave it an absurdly large delta. 
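A minimal standalone sketch of the stochasticity check the test examples above describe: for each state, the probabilities of its outgoing arcs should sum to one, i.e. the negated log of the sum should be near zero, and the tool reports the min/max deviation. The toy weights below are illustrative and no OpenFst types are used:

```cpp
// Standalone illustration of a per-state stochasticity check in log space.
#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

int main() {
  // Arc weights are -log(probability); state 0 sums to 1.0, state 1 to 0.9.
  std::vector<std::vector<double>> arcs_per_state = {
      {0.693147, 0.693147},            // exp(-0.693147) = 0.5 twice -> 1.0
      {1.203973, 1.203973, 1.203973}   // 0.3 three times            -> 0.9
  };

  double min_err = 1e30, max_err = -1e30;
  for (const auto& arcs : arcs_per_state) {
    double prob_sum = 0.0;
    for (double w : arcs) prob_sum += std::exp(-w);
    double err = -std::log(prob_sum);  // 0 when the state is stochastic
    min_err = std::min(min_err, err);
    max_err = std::max(max_err, err);
  }
  // A stochastic machine would print roughly "0 0" (cf. the tool's min/max
  // output); a positive max means some state leaks probability mass.
  std::cout << min_err << " " << max_err << "\n";
}
```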
- -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Checks whether an FST is stochastic and exits with success if so.\n" - "Prints out maximum error (in log units).\n" - "\n" - "Usage: fstisstochastic [ in.fst ]\n"; - - float delta = 0.01; - bool test_in_log = true; - - ParseOptions po(usage); - po.Register("delta", &delta, "Maximum error to accept."); - po.Register("test-in-log", &test_in_log, - "Test stochasticity in log semiring."); - po.Read(argc, argv); - - if (po.NumArgs() > 1) { - po.PrintUsage(); - exit(1); - } - - std::string fst_in_filename = po.GetOptArg(1); - - Fst *fst = ReadFstKaldiGeneric(fst_in_filename); - - bool ans; - StdArc::Weight min, max; - if (test_in_log) - ans = IsStochasticFstInLog(*fst, delta, &min, &max); - else - ans = IsStochasticFst(*fst, delta, &min, &max); - - std::cout << min.Value() << " " << max.Value() << '\n'; - delete fst; - if (ans) - return 0; // success; - else - return 1; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/fstbin/fstminimizeencoded.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/fstbin/fstminimizeencoded.cc deleted file mode 100644 index ae9ca6d75abe67d9a195572dd6d91ec3c7b44851..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/fstbin/fstminimizeencoded.cc +++ /dev/null @@ -1,74 +0,0 @@ -// fstbin/fstminimizeencoded.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/determinize-star.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/kaldi-io.h" -#include "util/parse-options.h" -#include "util/text-utils.h" - -/* some test examples: - ( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstminimizeencoded | fstprint - ( echo "0 1 0 0"; echo " 0 2 0 0"; echo "1 0"; echo "2 0"; ) | fstcompile | - fstminimizeencoded | fstprint -*/ - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Minimizes FST after encoding [similar to fstminimize, but no " - "weight-pushing]\n" - "\n" - "Usage: fstminimizeencoded [in.fst [out.fst] ]\n"; - - float delta = kDelta; - ParseOptions po(usage); - po.Register("delta", &delta, - "Delta likelihood used for quantization of weights"); - po.Read(argc, argv); - - if (po.NumArgs() > 2) { - po.PrintUsage(); - exit(1); - } - - std::string fst_in_filename = po.GetOptArg(1), - fst_out_filename = po.GetOptArg(2); - - VectorFst *fst = ReadFstKaldi(fst_in_filename); - - MinimizeEncoded(fst, delta); - - WriteFstKaldi(*fst, fst_out_filename); - - delete fst; - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/fstbin/fsttablecompose.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/fstbin/fsttablecompose.cc deleted file mode 100644 index bdd476da78b8cb8823c60abf33b5278e05bfd92c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/fstbin/fsttablecompose.cc +++ /dev/null @@ -1,133 +0,0 @@ -// fstbin/fsttablecompose.cc - -// Copyright 2009-2011 Microsoft Corporation -// 2013 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "fstext/table-matcher.h" -#include "util/parse-options.h" - -/* - cd ~/tmpdir - while true; do - fstrand | fstarcsort --sort_type=olabel > 1.fst; fstrand | fstarcsort - > 2.fst fstcompose 1.fst 2.fst > 3a.fst fsttablecompose 1.fst 2.fst > 3b.fst - fstequivalent --random=true 3a.fst 3b.fst || echo "Test failed" - echo -n "." - done - -*/ - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - /* - fsttablecompose should always give equivalent results to compose, - but it is more efficient for certain kinds of inputs. 
- In particular, it is useful when, say, the left FST has states - that typically either have epsilon olabels, or - one transition out for each of the possible symbols (as the - olabel). The same with the input symbols of the right-hand FST - is possible. - */ - - const char *usage = - "Composition algorithm [between two FSTs of standard type, in " - "tropical\n" - "semiring] that is more efficient for certain cases-- in particular,\n" - "where one of the FSTs (the left one, if --match-side=left) has large\n" - "out-degree\n" - "\n" - "Usage: fsttablecompose (fst1-rxfilename|fst1-rspecifier) " - "(fst2-rxfilename|fst2-rspecifier) [(out-rxfilename|out-rspecifier)]\n"; - - ParseOptions po(usage); - - TableComposeOptions opts; - std::string match_side = "left"; - std::string compose_filter = "sequence"; - - po.Register("connect", &opts.connect, "If true, trim FST before output."); - po.Register("match-side", &match_side, - "Side of composition to do table " - "match, one of: \"left\" or \"right\"."); - po.Register("compose-filter", &compose_filter, - "Composition filter to use, " - "one of: \"alt_sequence\", \"auto\", \"match\", \"sequence\""); - - po.Read(argc, argv); - - if (match_side == "left") { - opts.table_match_type = MATCH_OUTPUT; - } else if (match_side == "right") { - opts.table_match_type = MATCH_INPUT; - } else { - KALDI_ERR << "Invalid match-side option: " << match_side; - } - - if (compose_filter == "alt_sequence") { - opts.filter_type = ALT_SEQUENCE_FILTER; - } else if (compose_filter == "auto") { - opts.filter_type = AUTO_FILTER; - } else if (compose_filter == "match") { - opts.filter_type = MATCH_FILTER; - } else if (compose_filter == "sequence") { - opts.filter_type = SEQUENCE_FILTER; - } else { - KALDI_ERR << "Invalid compose-filter option: " << compose_filter; - } - - if (po.NumArgs() < 2 || po.NumArgs() > 3) { - po.PrintUsage(); - exit(1); - } - - std::string fst1_in_str = po.GetArg(1), fst2_in_str = po.GetArg(2), - fst_out_str = po.GetOptArg(3); - - VectorFst *fst1 = ReadFstKaldi(fst1_in_str); - - VectorFst *fst2 = ReadFstKaldi(fst2_in_str); - - // Checks if is olabel sorted and is ilabel sorted. - if (fst1->Properties(fst::kOLabelSorted, true) == 0) { - KALDI_WARN << "The first FST is not olabel sorted."; - } - if (fst2->Properties(fst::kILabelSorted, true) == 0) { - KALDI_WARN << "The second FST is not ilabel sorted."; - } - - VectorFst composed_fst; - - TableCompose(*fst1, *fst2, &composed_fst, opts); - - delete fst1; - delete fst2; - - WriteFstKaldi(composed_fst, fst_out_str); - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/fstext/determinize-lattice-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/fstext/determinize-lattice-inl.h deleted file mode 100644 index 0bfbc8f41c7e439b1fac037f60490e04fdcbdd8b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/fstext/determinize-lattice-inl.h +++ /dev/null @@ -1,1357 +0,0 @@ -// fstext/determinize-lattice-inl.h - -// Copyright 2009-2012 Microsoft Corporation -// 2012-2013 Johns Hopkins University (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_FSTEXT_DETERMINIZE_LATTICE_INL_H_ -#define KALDI_FSTEXT_DETERMINIZE_LATTICE_INL_H_ -// Do not include this file directly. It is included by determinize-lattice.h - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace fst { - -// This class maps back and forth from/to integer id's to sequences of strings. -// used in determinization algorithm. It is constructed in such a way that -// finding the string-id of the successor of (string, next-label) has constant -// time. - -// Note: class IntType, typically int32, is the type of the element in the -// string (typically a template argument of the CompactLatticeWeightTpl). - -template -class LatticeStringRepository { - public: - struct Entry { - const Entry *parent; // NULL for empty string. - IntType i; - inline bool operator==(const Entry &other) const { - return (parent == other.parent && i == other.i); - } - Entry() {} - Entry(const Entry &e) : parent(e.parent), i(e.i) {} - }; - // Note: all Entry* pointers returned in function calls are - // owned by the repository itself, not by the caller! - - // Interface guarantees empty string is NULL. - inline const Entry *EmptyString() { return NULL; } - - // Returns string of "parent" with i appended. Pointer - // owned by repository - const Entry *Successor(const Entry *parent, IntType i) { - new_entry_->parent = parent; - new_entry_->i = i; - - std::pair pr = set_.insert(new_entry_); - if (pr.second) { // Was successfully inserted (was not there). We need to - // replace the element we inserted, which resides on the - // stack, with one from the heap. - const Entry *ans = new_entry_; - new_entry_ = new Entry(); - return ans; - } else { // Was not inserted because an equivalent Entry already - // existed. - return *pr.first; - } - } - - const Entry *Concatenate(const Entry *a, const Entry *b) { - if (a == NULL) - return b; - else if (b == NULL) - return a; - std::vector v; - ConvertToVector(b, &v); - const Entry *ans = a; - for (size_t i = 0; i < v.size(); i++) ans = Successor(ans, v[i]); - return ans; - } - const Entry *CommonPrefix(const Entry *a, const Entry *b) { - std::vector a_vec, b_vec; - ConvertToVector(a, &a_vec); - ConvertToVector(b, &b_vec); - const Entry *ans = NULL; - for (size_t i = 0; - i < a_vec.size() && i < b_vec.size() && a_vec[i] == b_vec[i]; i++) - ans = Successor(ans, a_vec[i]); - return ans; - } - - // removes any elements from b that are not part of - // a common prefix with a. - void ReduceToCommonPrefix(const Entry *a, std::vector *b) { - size_t a_size = Size(a), b_size = b->size(); - while (a_size > b_size) { - a = a->parent; - a_size--; - } - if (b_size > a_size) b_size = a_size; - typename std::vector::iterator b_begin = b->begin(); - while (a_size != 0) { - if (a->i != *(b_begin + a_size - 1)) b_size = a_size - 1; - a = a->parent; - a_size--; - } - if (b_size != b->size()) b->resize(b_size); - } - - // removes the first n elements of a. 
- const Entry *RemovePrefix(const Entry *a, size_t n) { - if (n == 0) return a; - std::vector a_vec; - ConvertToVector(a, &a_vec); - assert(a_vec.size() >= n); - const Entry *ans = NULL; - for (size_t i = n; i < a_vec.size(); i++) ans = Successor(ans, a_vec[i]); - return ans; - } - - // Returns true if a is a prefix of b. If a is prefix of b, - // time taken is |b| - |a|. Else, time taken is |b|. - bool IsPrefixOf(const Entry *a, const Entry *b) const { - if (a == NULL) return true; // empty string prefix of all. - if (a == b) return true; - if (b == NULL) return false; - return IsPrefixOf(a, b->parent); - } - - inline size_t Size(const Entry *entry) const { - size_t ans = 0; - while (entry != NULL) { - ans++; - entry = entry->parent; - } - return ans; - } - - void ConvertToVector(const Entry *entry, std::vector *out) const { - size_t length = Size(entry); - out->resize(length); - if (entry != NULL) { - typename std::vector::reverse_iterator iter = out->rbegin(); - while (entry != NULL) { - *iter = entry->i; - entry = entry->parent; - ++iter; - } - } - } - - const Entry *ConvertFromVector(const std::vector &vec) { - const Entry *e = NULL; - for (size_t i = 0; i < vec.size(); i++) e = Successor(e, vec[i]); - return e; - } - - LatticeStringRepository() { new_entry_ = new Entry; } - - void Destroy() { - for (typename SetType::iterator iter = set_.begin(); iter != set_.end(); - ++iter) - delete *iter; - SetType tmp; - tmp.swap(set_); - if (new_entry_) { - delete new_entry_; - new_entry_ = NULL; - } - } - - // Rebuild will rebuild this object, guaranteeing only - // to preserve the Entry values that are in the vector pointed - // to (this list does not have to be unique). The point of - // this is to save memory. - void Rebuild(const std::vector &to_keep) { - SetType tmp_set; - for (typename std::vector::const_iterator iter = - to_keep.begin(); - iter != to_keep.end(); ++iter) - RebuildHelper(*iter, &tmp_set); - // Now delete all elems not in tmp_set. - for (typename SetType::iterator iter = set_.begin(); iter != set_.end(); - ++iter) { - if (tmp_set.count(*iter) == 0) - delete (*iter); // delete the Entry; not needed. - } - set_.swap(tmp_set); - } - - ~LatticeStringRepository() { Destroy(); } - int32 MemSize() const { - return set_.size() * sizeof(Entry) * 2; // this is a lower bound - // on the size this structure might take. - } - - private: - class EntryKey { // Hash function object. - public: - inline size_t operator()(const Entry *entry) const { - size_t prime = 49109; - return static_cast(entry->i) + - prime * reinterpret_cast(entry->parent); - } - }; - class EntryEqual { - public: - inline bool operator()(const Entry *e1, const Entry *e2) const { - return (*e1 == *e2); - } - }; - typedef std::unordered_set SetType; - - void RebuildHelper(const Entry *to_add, SetType *tmp_set) { - while (true) { - if (to_add == NULL) return; - typename SetType::iterator iter = tmp_set->find(to_add); - if (iter == tmp_set->end()) { // not in tmp_set. - tmp_set->insert(to_add); - to_add = to_add->parent; // and loop. - } else { - return; - } - } - } - - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeStringRepository); - Entry *new_entry_; // We always have a pre-allocated Entry ready to use, - // to avoid unnecessary news and deletes. - SetType set_; -}; - -// class LatticeDeterminizer is templated on the same types that -// CompactLatticeWeight is templated on: the base weight (Weight), typically -// LatticeWeightTpl etc. but could also be e.g. 
TropicalWeight, and the -// IntType, typically int32, used for the output symbols in the compact -// representation of strings [note: the output symbols would usually be -// p.d.f. id's in the anticipated use of this code] It has a special requirement -// on the Weight type: that there should be a Compare function on the weights -// such that Compare(w1, w2) returns -1 if w1 < w2, 0 if w1 == w2, and +1 if w1 -// > w2. This requires that there be a total order on the weights. - -template -class LatticeDeterminizer { - public: - // Output to Gallic acceptor (so the strings go on weights, and there is a 1-1 - // correspondence between our states and the states in ofst. If destroy == - // true, release memory as we go (but we cannot output again). - - typedef CompactLatticeWeightTpl CompactWeight; - typedef ArcTpl - CompactArc; // arc in compact, acceptor form of lattice - typedef ArcTpl Arc; // arc in non-compact version of lattice - - // Output to standard FST with CompactWeightTpl as its weight type - // (the weight stores the original output-symbol strings). If destroy == - // true, release memory as we go (but we cannot output again). - void Output(MutableFst *ofst, bool destroy = true) { - assert(determinized_); - typedef typename Arc::StateId StateId; - StateId nStates = static_cast(output_arcs_.size()); - if (destroy) FreeMostMemory(); - ofst->DeleteStates(); - ofst->SetStart(kNoStateId); - if (nStates == 0) { - return; - } - for (StateId s = 0; s < nStates; s++) { - OutputStateId news = ofst->AddState(); - assert(news == s); - } - ofst->SetStart(0); - // now process transitions. - for (StateId this_state = 0; this_state < nStates; this_state++) { - std::vector &this_vec(output_arcs_[this_state]); - typename std::vector::const_iterator iter = this_vec.begin(), - end = this_vec.end(); - - for (; iter != end; ++iter) { - const TempArc &temp_arc(*iter); - CompactArc new_arc; - std::vector
is not treated as epsilon, create a common end state for - // all transitions accepting the </s>
, since they do not back off. This small - // optimization saves about 2% states in an average grammar. - if (sub_eps_ == 0) { - eos_state_ = fst_->AddState(); - fst_->SetFinal(eos_state_, 0); - } -} - -template -void ArpaLmCompilerImpl::ConsumeNGram(const NGram& ngram, - bool is_highest) { - // Generally, we do the following. Suppose we are adding an n-gram "A B - // C". Then find the node for "A B", add a new node for "A B C", and connect - // them with the arc accepting "C" with the specified weight. Also, add a - // backoff arc from the new "A B C" node to its backoff state "B C". - // - // Two notable exceptions are the highest order n-grams, and final n-grams. - // - // When adding a highest order n-gram (e. g., our "A B C" is in a 3-gram LM), - // the following optimization is performed. There is no point adding a node - // for "A B C" with a "C" arc from "A B", since there will be no other - // arcs ingoing to this node, and an epsilon backoff arc into the backoff - // model "B C", with the weight of \bar{1}. To save a node, create an arc - // accepting "C" directly from "A B" to "B C". This saves as many nodes - // as there are the highest order n-grams, which is typically about half - // the size of a large 3-gram model. - // - // Indeed, this does not apply to n-grams ending in EOS, since they do not - // back off. These are special, as they do not have a back-off state, and - // the node for "(..anything..) " is always final. These are handled - // in one of the two possible ways, If symbols and are being - // replaced by epsilons, neither node nor arc is created, and the logprob - // of the n-gram is applied to its source node as final weight. If and - // are preserved, then a special final node for is allocated and - // used as the destination of the "" acceptor arc. - HistKey heads(ngram.words.begin(), ngram.words.end() - 1); - typename HistoryMap::iterator source_it = history_.find(heads); - if (source_it == history_.end()) { - // There was no "A B", therefore the probability of "A B C" is zero. - // Print a warning and discard current n-gram. - if (parent_->ShouldWarn()) - KALDI_WARN << parent_->LineReference() - << " skipped: no parent (n-1)-gram exists"; - return; - } - - StateId source = source_it->second; - StateId dest; - Symbol sym = ngram.words.back(); - float weight = -ngram.logprob; - if (sym == sub_eps_ || sym == 0) { - KALDI_ERR << " or disambiguation symbol " << sym - << "found in the ARPA file. "; - } - if (sym == eos_symbol_) { - if (sub_eps_ == 0) { - // Keep as a real symbol when not substituting. - dest = eos_state_; - } else { - // Treat as if it was epsilon: mark source final, with the weight - // of the n-gram. - fst_->SetFinal(source, weight); - return; - } - } else { - // For the highest order n-gram, this may find an existing state, for - // non-highest, will create one (unless there are duplicate n-grams - // in the grammar, which cannot be reliably detected if highest order, - // so we better do not do that at all). - dest = AddStateWithBackoff( - HistKey(ngram.words.begin() + (is_highest ? 1 : 0), ngram.words.end()), - -ngram.backoff); - } - - if (sym == bos_symbol_) { - weight = 0; // Accepting is always free. - if (sub_eps_ == 0) { - // is as a real symbol, only accepted in the start state. - source = fst_->AddState(); - fst_->SetStart(source); - } else { - // The new state for unigram history *is* the start state. - fst_->SetStart(dest); - return; - } - } - - // Add arc from source to dest, whichever way it was found. 
- fst_->AddArc(source, fst::StdArc(sym, sym, weight, dest)); - return; -} - -// Find or create a new state for n-gram defined by key, and ensure it has a -// backoff transition. The key is either the current n-gram for all but -// highest orders, or the tails of the n-gram for the highest order. The -// latter arises from the chain-collapsing optimization described above. -template -StateId ArpaLmCompilerImpl::AddStateWithBackoff(HistKey key, - float backoff) { - typename HistoryMap::iterator dest_it = history_.find(key); - if (dest_it != history_.end()) { - // Found an existing state in the history map. Invariant: if the state in - // the map, then its backoff arc is in the FST. We are done. - return dest_it->second; - } - // Otherwise create a new state and its backoff arc, and register in the map. - StateId dest = fst_->AddState(); - history_[key] = dest; - CreateBackoff(key.Tails(), dest, backoff); - return dest; -} - -// Create a backoff arc for a state. Key is a backoff destination that may or -// may not exist. When the destination is not found, naturally fall back to -// the lower order model, and all the way down until one is found (since the -// 0-gram model is always present, the search is guaranteed to terminate). -template -inline void ArpaLmCompilerImpl::CreateBackoff(HistKey key, - StateId state, - float weight) { - typename HistoryMap::iterator dest_it = history_.find(key); - while (dest_it == history_.end()) { - key = key.Tails(); - dest_it = history_.find(key); - } - - // The arc should transduce either or #0 to , depending on the - // epsilon substitution mode. This is the only case when input and output - // label may differ. - fst_->AddArc(state, fst::StdArc(sub_eps_, 0, weight, dest_it->second)); -} - -ArpaLmCompiler::~ArpaLmCompiler() { - if (impl_ != NULL) delete impl_; -} - -void ArpaLmCompiler::HeaderAvailable() { - KALDI_ASSERT(impl_ == NULL); - // Use optimized implementation if the grammar is 4-gram or less, and the - // maximum attained symbol id will fit into the optimized range. - int64 max_symbol = 0; - if (Symbols() != NULL) max_symbol = Symbols()->AvailableKey() - 1; - // If augmenting the symbol table, assume the worst case when all words in - // the model being read are novel. - if (Options().oov_handling == ArpaParseOptions::kAddToSymbols) - max_symbol += NgramCounts()[0]; - - if (NgramCounts().size() <= 4 && max_symbol < OptimizedHistKey::kMaxData) { - impl_ = new ArpaLmCompilerImpl(this, &fst_, sub_eps_); - } else { - impl_ = new ArpaLmCompilerImpl(this, &fst_, sub_eps_); - KALDI_LOG << "Reverting to slower state tracking because model is large: " - << NgramCounts().size() << "-gram with symbols up to " - << max_symbol; - } -} - -void ArpaLmCompiler::ConsumeNGram(const NGram& ngram) { - // is invalid in tails, in heads of an n-gram. 
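A minimal standalone sketch of the backoff-destination search documented for CreateBackoff() above: drop one order of context from the history until a known state is found, bottoming out at the always-present unigram history. The history map and integer word ids are toy stand-ins for the compiler's HistoryMap, not its actual types:

```cpp
// Standalone illustration of backoff lookup by repeatedly shortening history.
#include <iostream>
#include <map>
#include <vector>

using Hist = std::vector<int>;

int FindBackoffState(Hist key, const std::map<Hist, int>& history) {
  auto it = history.find(key);
  while (it == history.end()) {
    key.erase(key.begin());  // analogous to Tails(): drop one order of context
    it = history.find(key);
  }
  return it->second;
}

int main() {
  std::map<Hist, int> history;
  history[Hist{}] = 0;   // unigram (empty) history is always present
  history[Hist{7}] = 1;  // state for history "7"

  // No state exists for history "5 7", so back off: "5 7" -> "7" -> found.
  std::cout << FindBackoffState({5, 7}, history) << "\n";  // prints 1
}
```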
- for (int i = 0; i < ngram.words.size(); ++i) { - if ((i > 0 && ngram.words[i] == Options().bos_symbol) || - (i + 1 < ngram.words.size() && - ngram.words[i] == Options().eos_symbol)) { - if (ShouldWarn()) - KALDI_WARN << LineReference() - << " skipped: n-gram has invalid BOS/EOS placement"; - return; - } - } - - bool is_highest = ngram.words.size() == NgramCounts().size(); - impl_->ConsumeNGram(ngram, is_highest); -} - -void ArpaLmCompiler::RemoveRedundantStates() { - fst::StdArc::Label backoff_symbol = sub_eps_; - if (backoff_symbol == 0) { - // The method of removing redundant states implemented in this function - // leads to slow determinization of L o G when people use the older style of - // usage of arpa2fst where the --disambig-symbol option was not specified. - // The issue seems to be that it creates a non-deterministic FST, while G is - // supposed to be deterministic. By 'return'ing below, we just disable this - // method if people were using an older script. This method isn't really - // that consequential anyway, and people will move to the newer-style - // scripts (see current utils/format_lm.sh), so this isn't much of a - // problem. - return; - } - - fst::StdArc::StateId num_states = fst_.NumStates(); - - // replace the #0 symbols on the input of arcs out of redundant states (states - // that are not final and have only a backoff arc leaving them), with . - for (fst::StdArc::StateId state = 0; state < num_states; state++) { - if (fst_.NumArcs(state) == 1 && - fst_.Final(state) == fst::TropicalWeight::Zero()) { - fst::MutableArcIterator iter(&fst_, state); - fst::StdArc arc = iter.Value(); - if (arc.ilabel == backoff_symbol) { - arc.ilabel = 0; - iter.SetValue(arc); - } - } - } - - // we could call fst::RemoveEps, and it would have the same effect in normal - // cases, where backoff_symbol != 0 and there are no epsilons in unexpected - // places, but RemoveEpsLocal is a bit safer in case something weird is going - // on; it guarantees not to blow up the FST. - fst::RemoveEpsLocal(&fst_); - KALDI_LOG << "Reduced num-states from " << num_states << " to " - << fst_.NumStates(); -} - -void ArpaLmCompiler::Check() const { - if (fst_.Start() == fst::kNoStateId) { - KALDI_ERR << "Arpa file did not contain the beginning-of-sentence symbol " - << Symbols()->Find(Options().bos_symbol) << "."; - } -} - -void ArpaLmCompiler::ReadComplete() { - fst_.SetInputSymbols(Symbols()); - fst_.SetOutputSymbols(Symbols()); - RemoveRedundantStates(); - Check(); -} - -} // namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/lm/arpa-lm-compiler.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/lm/arpa-lm-compiler.h deleted file mode 100644 index 069c71bd0e6f5acf0b9521ec1ef46796eb31fe4d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/lm/arpa-lm-compiler.h +++ /dev/null @@ -1,63 +0,0 @@ -// lm/arpa-lm-compiler.h - -// Copyright 2009-2011 Gilles Boulianne -// Copyright 2016 Smart Action LLC (kkm) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_LM_ARPA_LM_COMPILER_H_ -#define KALDI_LM_ARPA_LM_COMPILER_H_ - -#include - -#include "lm/arpa-file-parser.h" - -namespace kaldi { - -class ArpaLmCompilerImplInterface; - -class ArpaLmCompiler : public ArpaFileParser { - public: - ArpaLmCompiler(const ArpaParseOptions& options, int sub_eps, - fst::SymbolTable* symbols) - : ArpaFileParser(options, symbols), sub_eps_(sub_eps), impl_(NULL) {} - ~ArpaLmCompiler(); - - const fst::StdVectorFst& Fst() const { return fst_; } - fst::StdVectorFst* MutableFst() { return &fst_; } - - protected: - // ArpaFileParser overrides. - virtual void HeaderAvailable(); - virtual void ConsumeNGram(const NGram& ngram); - virtual void ReadComplete(); - - private: - // this function removes states that only have a backoff arc coming - // out of them. - void RemoveRedundantStates(); - void Check() const; - - int sub_eps_; - ArpaLmCompilerImplInterface* impl_; // Owned. - fst::StdVectorFst fst_; - template - friend class ArpaLmCompilerImpl; -}; - -} // namespace kaldi - -#endif // KALDI_LM_ARPA_LM_COMPILER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/lmbin/arpa2fst.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/lmbin/arpa2fst.cc deleted file mode 100644 index 881a45c5b37810247ea38dae56237f59b5554a9c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/lmbin/arpa2fst.cc +++ /dev/null @@ -1,145 +0,0 @@ -// bin/arpa2fst.cc -// -// Copyright 2009-2011 Gilles Boulianne. -// -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABILITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "lm/arpa-lm-compiler.h" -#include "util/kaldi-io.h" -#include "util/parse-options.h" - -int main(int argc, char *argv[]) { - using namespace kaldi; // NOLINT - try { - const char *usage = - "Convert an ARPA format language model into an FST\n" - "Usage: arpa2fst [opts] \n" - " e.g.: arpa2fst --disambig-symbol=#0 --read-symbol-table=" - "data/lang/words.txt lm/input.arpa G.fst\n\n" - "Note: When called without switches, the output G.fst will contain\n" - "an embedded symbol table. This is compatible with the way a previous\n" - "version of arpa2fst worked.\n"; - - ParseOptions po(usage); - - ArpaParseOptions options; - options.Register(&po); - - // Option flags. 
- std::string bos_symbol = ""; - std::string eos_symbol = ""; - std::string disambig_symbol; - std::string read_syms_filename; - std::string write_syms_filename; - bool keep_symbols = false; - bool ilabel_sort = true; - - po.Register("bos-symbol", &bos_symbol, "Beginning of sentence symbol"); - po.Register("eos-symbol", &eos_symbol, "End of sentence symbol"); - po.Register("disambig-symbol", &disambig_symbol, - "Disambiguator. If provided (e. g. #0), used on input side of " - "backoff links, and and are replaced with epsilons"); - po.Register("read-symbol-table", &read_syms_filename, - "Use existing symbol table"); - po.Register("write-symbol-table", &write_syms_filename, - "Write generated symbol table to a file"); - po.Register("keep-symbols", &keep_symbols, - "Store symbol table with FST. Symbols always saved to FST if " - "symbol tables are neither read or written (otherwise symbols " - "would be lost entirely)"); - po.Register("ilabel-sort", &ilabel_sort, "Ilabel-sort the output FST"); - - po.Read(argc, argv); - - if (po.NumArgs() != 1 && po.NumArgs() != 2) { - po.PrintUsage(); - exit(1); - } - std::string arpa_rxfilename = po.GetArg(1), - fst_wxfilename = po.GetOptArg(2); - - int64 disambig_symbol_id = 0; - - fst::SymbolTable *symbols; - if (!read_syms_filename.empty()) { - // Use existing symbols. Required symbols must be in the table. - kaldi::Input kisym(read_syms_filename); - symbols = fst::SymbolTable::ReadText( - kisym.Stream(), PrintableWxfilename(read_syms_filename)); - if (symbols == NULL) - KALDI_ERR << "Could not read symbol table from file " - << read_syms_filename; - - options.oov_handling = ArpaParseOptions::kSkipNGram; - if (!disambig_symbol.empty()) { - disambig_symbol_id = symbols->Find(disambig_symbol); - if (disambig_symbol_id == -1) // fst::kNoSymbol - KALDI_ERR << "Symbol table " << read_syms_filename - << " has no symbol for " << disambig_symbol; - } - } else { - // Create a new symbol table and populate it from ARPA file. - symbols = new fst::SymbolTable(PrintableWxfilename(fst_wxfilename)); - options.oov_handling = ArpaParseOptions::kAddToSymbols; - symbols->AddSymbol("", 0); - if (!disambig_symbol.empty()) { - disambig_symbol_id = symbols->AddSymbol(disambig_symbol); - } - } - - // Add or use existing BOS and EOS. - options.bos_symbol = symbols->AddSymbol(bos_symbol); - options.eos_symbol = symbols->AddSymbol(eos_symbol); - - // If producing new (not reading existing) symbols and not saving them, - // need to keep symbols with FST, otherwise they would be lost. - if (read_syms_filename.empty() && write_syms_filename.empty()) - keep_symbols = true; - - // Actually compile LM. - KALDI_ASSERT(symbols != NULL); - ArpaLmCompiler lm_compiler(options, disambig_symbol_id, symbols); - { - Input ki(arpa_rxfilename); - lm_compiler.Read(ki.Stream()); - } - - // Sort the FST in-place if requested by options. - if (ilabel_sort) { - fst::ArcSort(lm_compiler.MutableFst(), fst::StdILabelCompare()); - } - - // Write symbols if requested. - if (!write_syms_filename.empty()) { - kaldi::Output kosym(write_syms_filename, false); - symbols->WriteText(kosym.Stream()); - } - - // Write LM FST. 
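Condensed into a single function, the flow of `main()` above looks roughly like the sketch below. It assumes a Kaldi/OpenFst build context; `CompileArpa` is an illustrative name, the `<eps>`/`<s>`/`</s>` strings are the tool's default symbols, and error handling, the disambiguation symbol, and symbol-table output are omitted.

```cpp
#include <string>
#include <fst/fstlib.h>
#include "lm/arpa-lm-compiler.h"
#include "util/kaldi-io.h"

fst::StdVectorFst CompileArpa(const std::string &arpa_rxfilename) {
  using namespace kaldi;
  fst::SymbolTable symbols("words");
  symbols.AddSymbol("<eps>", 0);                       // epsilon must be id 0
  ArpaParseOptions options;
  options.oov_handling = ArpaParseOptions::kAddToSymbols;
  options.bos_symbol = symbols.AddSymbol("<s>");
  options.eos_symbol = symbols.AddSymbol("</s>");
  ArpaLmCompiler lm_compiler(options, /*sub_eps=*/0, &symbols);
  Input ki(arpa_rxfilename);
  lm_compiler.Read(ki.Stream());                       // parse and compile
  fst::ArcSort(lm_compiler.MutableFst(), fst::StdILabelCompare());
  return lm_compiler.Fst();
}
```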
- bool write_binary = true, write_header = false; - kaldi::Output kofst(fst_wxfilename, write_binary, write_header); - fst::FstWriteOptions wopts(PrintableWxfilename(fst_wxfilename)); - wopts.write_isymbols = wopts.write_osymbols = keep_symbols; - lm_compiler.Fst().Write(kofst.Stream(), wopts); - - delete symbols; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/basic-filebuf.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/basic-filebuf.h deleted file mode 100644 index 22ec891064d5955c8b1d255e0d34781a9f505a38..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/basic-filebuf.h +++ /dev/null @@ -1,952 +0,0 @@ -/////////////////////////////////////////////////////////////////////////////// -// This is a modified version of the std::basic_filebuf from libc++ -// Copyright 20XX LLVM -// (http://libcxx.llvm.org/). -// It allows one to create basic_filebuf from an existing FILE* handle or file -// descriptor. -// -// This file is dual licensed under the MIT and the University of Illinois Open -// Source License licenses. See LICENSE.TXT for details (included at the -// bottom). -/////////////////////////////////////////////////////////////////////////////// -#ifndef KALDI_UTIL_BASIC_FILEBUF_H_ -#define KALDI_UTIL_BASIC_FILEBUF_H_ - -/////////////////////////////////////////////////////////////////////////////// -#include -#include -#include -#include -#include -#include -#include - -/////////////////////////////////////////////////////////////////////////////// -namespace kaldi { -/////////////////////////////////////////////////////////////////////////////// -template > -class basic_filebuf : public std::basic_streambuf { - public: - typedef CharT char_type; - typedef Traits traits_type; - typedef typename traits_type::int_type int_type; - typedef typename traits_type::pos_type pos_type; - typedef typename traits_type::off_type off_type; - typedef typename traits_type::state_type state_type; - - basic_filebuf(); - basic_filebuf(basic_filebuf&& rhs); - virtual ~basic_filebuf(); - - basic_filebuf& operator=(basic_filebuf&& rhs); - void swap(basic_filebuf& rhs); - - bool is_open() const; - basic_filebuf* open(const char* s, std::ios_base::openmode mode); - basic_filebuf* open(const std::string& s, std::ios_base::openmode mode); - basic_filebuf* open(int fd, std::ios_base::openmode mode); - basic_filebuf* open(FILE* f, std::ios_base::openmode mode); - basic_filebuf* close(); - - FILE* file() { return this->_M_file; } - int fd() { return fileno(this->_M_file); } - - protected: - int_type underflow() override; - int_type pbackfail(int_type c = traits_type::eof()) override; - int_type overflow(int_type c = traits_type::eof()) override; - std::basic_streambuf* setbuf( - char_type* s, std::streamsize n) override; - pos_type seekoff(off_type off, std::ios_base::seekdir way, - std::ios_base::openmode wch = std::ios_base::in | - std::ios_base::out) override; - pos_type seekpos(pos_type sp, - std::ios_base::openmode wch = std::ios_base::in | - std::ios_base::out) override; - int sync() override; - void imbue(const std::locale& loc) override; - - protected: - char* _M_extbuf; - const char* _M_extbufnext; - const char* _M_extbufend; - char _M_extbuf_min[8]; - size_t _M_ebs; - char_type* _M_intbuf; - size_t _M_ibs; - FILE* _M_file; - const std::codecvt* _M_cv; - 
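The point of this class, per its header comment, is that it can be attached to an already-open `FILE*` or file descriptor, which `std::basic_filebuf` does not allow. A minimal usage sketch (the file name and function name are illustrative; note that `close()` also `fclose()`s the wrapped handle):

```cpp
#include <cstdio>
#include <istream>
#include <string>
#include "util/basic-filebuf.h"

void ReadWrappedFile() {
  FILE *f = std::fopen("lm.arpa", "r");
  if (f == nullptr) return;
  kaldi::basic_filebuf<char> buf;
  buf.open(f, std::ios_base::in);       // adopt the existing FILE* handle
  std::istream is(&buf);
  std::string line;
  while (std::getline(is, line)) {
    // ... process each line ...
  }
  buf.close();                          // also closes the underlying FILE*
}
```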
state_type _M_st; - state_type _M_st_last; - std::ios_base::openmode _M_om; - std::ios_base::openmode _M_cm; - bool _M_owns_eb; - bool _M_owns_ib; - bool _M_always_noconv; - - const char* _M_get_mode(std::ios_base::openmode mode); - bool _M_read_mode(); - void _M_write_mode(); -}; - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf::basic_filebuf() - : _M_extbuf(nullptr), - _M_extbufnext(nullptr), - _M_extbufend(nullptr), - _M_ebs(0), - _M_intbuf(nullptr), - _M_ibs(0), - _M_file(nullptr), - _M_cv(nullptr), - _M_st(), - _M_st_last(), - _M_om(std::ios_base::openmode(0)), - _M_cm(std::ios_base::openmode(0)), - _M_owns_eb(false), - _M_owns_ib(false), - _M_always_noconv(false) { - if (std::has_facet >( - this->getloc())) { - _M_cv = &std::use_facet >( - this->getloc()); - _M_always_noconv = _M_cv->always_noconv(); - } - setbuf(0, 4096); -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf::basic_filebuf(basic_filebuf&& rhs) - : std::basic_streambuf(rhs) { - if (rhs._M_extbuf == rhs._M_extbuf_min) { - _M_extbuf = _M_extbuf_min; - _M_extbufnext = _M_extbuf + (rhs._M_extbufnext - rhs._M_extbuf); - _M_extbufend = _M_extbuf + (rhs._M_extbufend - rhs._M_extbuf); - } else { - _M_extbuf = rhs._M_extbuf; - _M_extbufnext = rhs._M_extbufnext; - _M_extbufend = rhs._M_extbufend; - } - _M_ebs = rhs._M_ebs; - _M_intbuf = rhs._M_intbuf; - _M_ibs = rhs._M_ibs; - _M_file = rhs._M_file; - _M_cv = rhs._M_cv; - _M_st = rhs._M_st; - _M_st_last = rhs._M_st_last; - _M_om = rhs._M_om; - _M_cm = rhs._M_cm; - _M_owns_eb = rhs._M_owns_eb; - _M_owns_ib = rhs._M_owns_ib; - _M_always_noconv = rhs._M_always_noconv; - if (rhs.pbase()) { - if (rhs.pbase() == rhs._M_intbuf) - this->setp(_M_intbuf, _M_intbuf + (rhs.epptr() - rhs.pbase())); - else - this->setp(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + - (rhs.epptr() - rhs.pbase())); - this->pbump(rhs.pptr() - rhs.pbase()); - } else if (rhs.eback()) { - if (rhs.eback() == rhs._M_intbuf) - this->setg(_M_intbuf, _M_intbuf + (rhs.gptr() - rhs.eback()), - _M_intbuf + (rhs.egptr() - rhs.eback())); - else - this->setg( - reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + (rhs.gptr() - rhs.eback()), - reinterpret_cast(_M_extbuf) + - (rhs.egptr() - rhs.eback())); - } - rhs._M_extbuf = nullptr; - rhs._M_extbufnext = nullptr; - rhs._M_extbufend = nullptr; - rhs._M_ebs = 0; - rhs._M_intbuf = nullptr; - rhs._M_ibs = 0; - rhs._M_file = nullptr; - rhs._M_st = state_type(); - rhs._M_st_last = state_type(); - rhs._M_om = std::ios_base::openmode(0); - rhs._M_cm = std::ios_base::openmode(0); - rhs._M_owns_eb = false; - rhs._M_owns_ib = false; - rhs.setg(0, 0, 0); - rhs.setp(0, 0); -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline basic_filebuf& basic_filebuf::operator=( - basic_filebuf&& rhs) { - close(); - swap(rhs); - return *this; -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf::~basic_filebuf() { - // try - // { - // close(); - // } - // catch (...) 
- // { - // } - if (_M_owns_eb) delete[] _M_extbuf; - if (_M_owns_ib) delete[] _M_intbuf; -} - -/////////////////////////////////////////////////////////////////////////////// -template -void basic_filebuf::swap(basic_filebuf& rhs) { - std::basic_streambuf::swap(rhs); - if (_M_extbuf != _M_extbuf_min && rhs._M_extbuf != rhs._M_extbuf_min) { - std::swap(_M_extbuf, rhs._M_extbuf); - std::swap(_M_extbufnext, rhs._M_extbufnext); - std::swap(_M_extbufend, rhs._M_extbufend); - } else { - ptrdiff_t ln = _M_extbufnext - _M_extbuf; - ptrdiff_t le = _M_extbufend - _M_extbuf; - ptrdiff_t rn = rhs._M_extbufnext - rhs._M_extbuf; - ptrdiff_t re = rhs._M_extbufend - rhs._M_extbuf; - if (_M_extbuf == _M_extbuf_min && rhs._M_extbuf != rhs._M_extbuf_min) { - _M_extbuf = rhs._M_extbuf; - rhs._M_extbuf = rhs._M_extbuf_min; - } else if (_M_extbuf != _M_extbuf_min && - rhs._M_extbuf == rhs._M_extbuf_min) { - rhs._M_extbuf = _M_extbuf; - _M_extbuf = _M_extbuf_min; - } - _M_extbufnext = _M_extbuf + rn; - _M_extbufend = _M_extbuf + re; - rhs._M_extbufnext = rhs._M_extbuf + ln; - rhs._M_extbufend = rhs._M_extbuf + le; - } - std::swap(_M_ebs, rhs._M_ebs); - std::swap(_M_intbuf, rhs._M_intbuf); - std::swap(_M_ibs, rhs._M_ibs); - std::swap(_M_file, rhs._M_file); - std::swap(_M_cv, rhs._M_cv); - std::swap(_M_st, rhs._M_st); - std::swap(_M_st_last, rhs._M_st_last); - std::swap(_M_om, rhs._M_om); - std::swap(_M_cm, rhs._M_cm); - std::swap(_M_owns_eb, rhs._M_owns_eb); - std::swap(_M_owns_ib, rhs._M_owns_ib); - std::swap(_M_always_noconv, rhs._M_always_noconv); - if (this->eback() == reinterpret_cast(rhs._M_extbuf_min)) { - ptrdiff_t n = this->gptr() - this->eback(); - ptrdiff_t e = this->egptr() - this->eback(); - this->setg(reinterpret_cast(_M_extbuf_min), - reinterpret_cast(_M_extbuf_min) + n, - reinterpret_cast(_M_extbuf_min) + e); - } else if (this->pbase() == reinterpret_cast(rhs._M_extbuf_min)) { - ptrdiff_t n = this->pptr() - this->pbase(); - ptrdiff_t e = this->epptr() - this->pbase(); - this->setp(reinterpret_cast(_M_extbuf_min), - reinterpret_cast(_M_extbuf_min) + e); - this->pbump(n); - } - if (rhs.eback() == reinterpret_cast(_M_extbuf_min)) { - ptrdiff_t n = rhs.gptr() - rhs.eback(); - ptrdiff_t e = rhs.egptr() - rhs.eback(); - rhs.setg(reinterpret_cast(rhs._M_extbuf_min), - reinterpret_cast(rhs._M_extbuf_min) + n, - reinterpret_cast(rhs._M_extbuf_min) + e); - } else if (rhs.pbase() == reinterpret_cast(_M_extbuf_min)) { - ptrdiff_t n = rhs.pptr() - rhs.pbase(); - ptrdiff_t e = rhs.epptr() - rhs.pbase(); - rhs.setp(reinterpret_cast(rhs._M_extbuf_min), - reinterpret_cast(rhs._M_extbuf_min) + e); - rhs.pbump(n); - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline void swap(basic_filebuf& x, - basic_filebuf& y) { - x.swap(y); -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline bool basic_filebuf::is_open() const { - return _M_file != nullptr; -} - -/////////////////////////////////////////////////////////////////////////////// -template -const char* basic_filebuf::_M_get_mode( - std::ios_base::openmode mode) { - switch ((mode & ~std::ios_base::ate) | 0) { - case std::ios_base::out: - case std::ios_base::out | std::ios_base::trunc: - return "w"; - case std::ios_base::out | std::ios_base::app: - case std::ios_base::app: - return "a"; - break; - case std::ios_base::in: - return "r"; - case std::ios_base::in | std::ios_base::out: - return "r+"; - case std::ios_base::in | std::ios_base::out | 
std::ios_base::trunc: - return "w+"; - case std::ios_base::in | std::ios_base::out | std::ios_base::app: - case std::ios_base::in | std::ios_base::app: - return "a+"; - case std::ios_base::out | std::ios_base::binary: - case std::ios_base::out | std::ios_base::trunc | std::ios_base::binary: - return "wb"; - case std::ios_base::out | std::ios_base::app | std::ios_base::binary: - case std::ios_base::app | std::ios_base::binary: - return "ab"; - case std::ios_base::in | std::ios_base::binary: - return "rb"; - case std::ios_base::in | std::ios_base::out | std::ios_base::binary: - return "r+b"; - case std::ios_base::in | std::ios_base::out | std::ios_base::trunc | - std::ios_base::binary: - return "w+b"; - case std::ios_base::in | std::ios_base::out | std::ios_base::app | - std::ios_base::binary: - case std::ios_base::in | std::ios_base::app | std::ios_base::binary: - return "a+b"; - default: - return nullptr; - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::open( - const char* s, std::ios_base::openmode mode) { - basic_filebuf* rt = nullptr; - if (_M_file == nullptr) { - const char* md = _M_get_mode(mode); - if (md) { - _M_file = fopen(s, md); - if (_M_file) { - rt = this; - _M_om = mode; - if (mode & std::ios_base::ate) { - if (fseek(_M_file, 0, SEEK_END)) { - fclose(_M_file); - _M_file = nullptr; - rt = nullptr; - } - } - } - } - } - return rt; -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline basic_filebuf* basic_filebuf::open( - const std::string& s, std::ios_base::openmode mode) { - return open(s.c_str(), mode); -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::open( - int fd, std::ios_base::openmode mode) { - const char* md = this->_M_get_mode(mode); - if (md) { - this->_M_file = fdopen(fd, md); - this->_M_om = mode; - return this; - } else { - return nullptr; - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::open( - FILE* f, std::ios_base::openmode mode) { - this->_M_file = f; - this->_M_om = mode; - return this; -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::close() { - basic_filebuf* rt = nullptr; - if (_M_file) { - rt = this; - std::unique_ptr h(_M_file, fclose); - if (sync()) rt = nullptr; - if (fclose(h.release()) == 0) - _M_file = nullptr; - else - rt = nullptr; - } - return rt; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::int_type -basic_filebuf::underflow() { - if (_M_file == nullptr) return traits_type::eof(); - bool initial = _M_read_mode(); - char_type buf; - if (this->gptr() == nullptr) this->setg(&buf, &buf + 1, &buf + 1); - const size_t unget_sz = - initial ? 
0 : std::min((this->egptr() - this->eback()) / 2, 4); - int_type c = traits_type::eof(); - if (this->gptr() == this->egptr()) { - memmove(this->eback(), this->egptr() - unget_sz, - unget_sz * sizeof(char_type)); - if (_M_always_noconv) { - size_t nmemb = - static_cast(this->egptr() - this->eback() - unget_sz); - nmemb = fread(this->eback() + unget_sz, 1, nmemb, _M_file); - if (nmemb != 0) { - this->setg(this->eback(), this->eback() + unget_sz, - this->eback() + unget_sz + nmemb); - c = traits_type::to_int_type(*this->gptr()); - } - } else { - memmove(_M_extbuf, _M_extbufnext, _M_extbufend - _M_extbufnext); - _M_extbufnext = _M_extbuf + (_M_extbufend - _M_extbufnext); - _M_extbufend = - _M_extbuf + - (_M_extbuf == _M_extbuf_min ? sizeof(_M_extbuf_min) : _M_ebs); - size_t nmemb = - std::min(static_cast(_M_ibs - unget_sz), - static_cast(_M_extbufend - _M_extbufnext)); - std::codecvt_base::result r; - _M_st_last = _M_st; - size_t nr = - fread(reinterpret_cast(const_cast(_M_extbufnext)), - 1, nmemb, _M_file); - if (nr != 0) { - if (!_M_cv) throw std::bad_cast(); - _M_extbufend = _M_extbufnext + nr; - char_type* inext; - r = _M_cv->in(_M_st, _M_extbuf, _M_extbufend, _M_extbufnext, - this->eback() + unget_sz, this->eback() + _M_ibs, inext); - if (r == std::codecvt_base::noconv) { - this->setg(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf), - const_cast(_M_extbufend)); - c = traits_type::to_int_type(*this->gptr()); - } else if (inext != this->eback() + unget_sz) { - this->setg(this->eback(), this->eback() + unget_sz, inext); - c = traits_type::to_int_type(*this->gptr()); - } - } - } - } else { - c = traits_type::to_int_type(*this->gptr()); - } - if (this->eback() == &buf) this->setg(0, 0, 0); - return c; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::int_type -basic_filebuf::pbackfail(int_type c) { - if (_M_file && this->eback() < this->gptr()) { - if (traits_type::eq_int_type(c, traits_type::eof())) { - this->gbump(-1); - return traits_type::not_eof(c); - } - if ((_M_om & std::ios_base::out) || - traits_type::eq(traits_type::to_char_type(c), this->gptr()[-1])) { - this->gbump(-1); - *this->gptr() = traits_type::to_char_type(c); - return c; - } - } - return traits_type::eof(); -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::int_type -basic_filebuf::overflow(int_type c) { - if (_M_file == nullptr) return traits_type::eof(); - _M_write_mode(); - char_type buf; - char_type* pb_save = this->pbase(); - char_type* epb_save = this->epptr(); - if (!traits_type::eq_int_type(c, traits_type::eof())) { - if (this->pptr() == nullptr) this->setp(&buf, &buf + 1); - *this->pptr() = traits_type::to_char_type(c); - this->pbump(1); - } - if (this->pptr() != this->pbase()) { - if (_M_always_noconv) { - size_t nmemb = static_cast(this->pptr() - this->pbase()); - if (fwrite(this->pbase(), sizeof(char_type), nmemb, _M_file) != nmemb) - return traits_type::eof(); - } else { - char* extbe = _M_extbuf; - std::codecvt_base::result r; - do { - if (!_M_cv) throw std::bad_cast(); - const char_type* e; - r = _M_cv->out(_M_st, this->pbase(), this->pptr(), e, _M_extbuf, - _M_extbuf + _M_ebs, extbe); - if (e == this->pbase()) return traits_type::eof(); - if (r == std::codecvt_base::noconv) { - size_t nmemb = static_cast(this->pptr() - this->pbase()); - if (fwrite(this->pbase(), 1, nmemb, _M_file) != nmemb) - return traits_type::eof(); - } else if (r == std::codecvt_base::ok 
|| - r == std::codecvt_base::partial) { - size_t nmemb = static_cast(extbe - _M_extbuf); - if (fwrite(_M_extbuf, 1, nmemb, _M_file) != nmemb) - return traits_type::eof(); - if (r == std::codecvt_base::partial) { - this->setp(const_cast(e), this->pptr()); - this->pbump(this->epptr() - this->pbase()); - } - } else { - return traits_type::eof(); - } - } while (r == std::codecvt_base::partial); - } - this->setp(pb_save, epb_save); - } - return traits_type::not_eof(c); -} - -/////////////////////////////////////////////////////////////////////////////// -template -std::basic_streambuf* basic_filebuf::setbuf( - char_type* s, std::streamsize n) { - this->setg(0, 0, 0); - this->setp(0, 0); - if (_M_owns_eb) delete[] _M_extbuf; - if (_M_owns_ib) delete[] _M_intbuf; - _M_ebs = n; - if (_M_ebs > sizeof(_M_extbuf_min)) { - if (_M_always_noconv && s) { - _M_extbuf = reinterpret_cast(s); - _M_owns_eb = false; - } else { - _M_extbuf = new char[_M_ebs]; - _M_owns_eb = true; - } - } else { - _M_extbuf = _M_extbuf_min; - _M_ebs = sizeof(_M_extbuf_min); - _M_owns_eb = false; - } - if (!_M_always_noconv) { - _M_ibs = std::max(n, sizeof(_M_extbuf_min)); - if (s && _M_ibs >= sizeof(_M_extbuf_min)) { - _M_intbuf = s; - _M_owns_ib = false; - } else { - _M_intbuf = new char_type[_M_ibs]; - _M_owns_ib = true; - } - } else { - _M_ibs = 0; - _M_intbuf = 0; - _M_owns_ib = false; - } - return this; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::pos_type -basic_filebuf::seekoff(off_type off, std::ios_base::seekdir way, - std::ios_base::openmode) { - if (!_M_cv) throw std::bad_cast(); - int width = _M_cv->encoding(); - if (_M_file == nullptr || (width <= 0 && off != 0) || sync()) - return pos_type(off_type(-1)); - // width > 0 || off == 0 - int whence; - switch (way) { - case std::ios_base::beg: - whence = SEEK_SET; - break; - case std::ios_base::cur: - whence = SEEK_CUR; - break; - case std::ios_base::end: - whence = SEEK_END; - break; - default: - return pos_type(off_type(-1)); - } -#if _WIN32 - if (fseek(_M_file, width > 0 ? width * off : 0, whence)) - return pos_type(off_type(-1)); - pos_type r = ftell(_M_file); -#else - if (fseeko(_M_file, width > 0 ? 
width * off : 0, whence)) - return pos_type(off_type(-1)); - pos_type r = ftello(_M_file); -#endif - r.state(_M_st); - return r; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::pos_type -basic_filebuf::seekpos(pos_type sp, std::ios_base::openmode) { - if (_M_file == nullptr || sync()) return pos_type(off_type(-1)); -#if _WIN32 - if (fseek(_M_file, sp, SEEK_SET)) return pos_type(off_type(-1)); -#else - if (fseeko(_M_file, sp, SEEK_SET)) return pos_type(off_type(-1)); -#endif - _M_st = sp.state(); - return sp; -} - -/////////////////////////////////////////////////////////////////////////////// -template -int basic_filebuf::sync() { - if (_M_file == nullptr) return 0; - if (!_M_cv) throw std::bad_cast(); - if (_M_cm & std::ios_base::out) { - if (this->pptr() != this->pbase()) - if (overflow() == traits_type::eof()) return -1; - std::codecvt_base::result r; - do { - char* extbe; - r = _M_cv->unshift(_M_st, _M_extbuf, _M_extbuf + _M_ebs, extbe); - size_t nmemb = static_cast(extbe - _M_extbuf); - if (fwrite(_M_extbuf, 1, nmemb, _M_file) != nmemb) return -1; - } while (r == std::codecvt_base::partial); - if (r == std::codecvt_base::error) return -1; - if (fflush(_M_file)) return -1; - } else if (_M_cm & std::ios_base::in) { - off_type c; - state_type state = _M_st_last; - bool update_st = false; - if (_M_always_noconv) { - c = this->egptr() - this->gptr(); - } else { - int width = _M_cv->encoding(); - c = _M_extbufend - _M_extbufnext; - if (width > 0) { - c += width * (this->egptr() - this->gptr()); - } else { - if (this->gptr() != this->egptr()) { - const int off = _M_cv->length(state, _M_extbuf, _M_extbufnext, - this->gptr() - this->eback()); - c += _M_extbufnext - _M_extbuf - off; - update_st = true; - } - } - } -#if _WIN32 - if (fseek(_M_file_, -c, SEEK_CUR)) return -1; -#else - if (fseeko(_M_file, -c, SEEK_CUR)) return -1; -#endif - if (update_st) _M_st = state; - _M_extbufnext = _M_extbufend = _M_extbuf; - this->setg(0, 0, 0); - _M_cm = std::ios_base::openmode(0); - } - return 0; -} - -/////////////////////////////////////////////////////////////////////////////// -template -void basic_filebuf::imbue(const std::locale& loc) { - sync(); - _M_cv = &std::use_facet >(loc); - bool old_anc = _M_always_noconv; - _M_always_noconv = _M_cv->always_noconv(); - if (old_anc != _M_always_noconv) { - this->setg(0, 0, 0); - this->setp(0, 0); - // invariant, char_type is char, else we couldn't get here - // need to dump _M_intbuf - if (_M_always_noconv) { - if (_M_owns_eb) delete[] _M_extbuf; - _M_owns_eb = _M_owns_ib; - _M_ebs = _M_ibs; - _M_extbuf = reinterpret_cast(_M_intbuf); - _M_ibs = 0; - _M_intbuf = nullptr; - _M_owns_ib = false; - } else { // need to obtain an _M_intbuf. 
- // If _M_extbuf is user-supplied, use it, else new _M_intbuf - if (!_M_owns_eb && _M_extbuf != _M_extbuf_min) { - _M_ibs = _M_ebs; - _M_intbuf = reinterpret_cast(_M_extbuf); - _M_owns_ib = false; - _M_extbuf = new char[_M_ebs]; - _M_owns_eb = true; - } else { - _M_ibs = _M_ebs; - _M_intbuf = new char_type[_M_ibs]; - _M_owns_ib = true; - } - } - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -bool basic_filebuf::_M_read_mode() { - if (!(_M_cm & std::ios_base::in)) { - this->setp(0, 0); - if (_M_always_noconv) - this->setg(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + _M_ebs, - reinterpret_cast(_M_extbuf) + _M_ebs); - else - this->setg(_M_intbuf, _M_intbuf + _M_ibs, _M_intbuf + _M_ibs); - _M_cm = std::ios_base::in; - return true; - } - return false; -} - -/////////////////////////////////////////////////////////////////////////////// -template -void basic_filebuf::_M_write_mode() { - if (!(_M_cm & std::ios_base::out)) { - this->setg(0, 0, 0); - if (_M_ebs > sizeof(_M_extbuf_min)) { - if (_M_always_noconv) - this->setp(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + (_M_ebs - 1)); - else - this->setp(_M_intbuf, _M_intbuf + (_M_ibs - 1)); - } else { - this->setp(0, 0); - } - _M_cm = std::ios_base::out; - } -} - -/////////////////////////////////////////////////////////////////////////////// -} // namespace kaldi - -/////////////////////////////////////////////////////////////////////////////// -#endif // KALDI_UTIL_BASIC_FILEBUF_H_ - -/////////////////////////////////////////////////////////////////////////////// - -/* - * ============================================================================ - * libc++ License - * ============================================================================ - * - * The libc++ library is dual licensed under both the University of Illinois - * "BSD-Like" license and the MIT license. As a user of this code you may - * choose to use it under either license. As a contributor, you agree to allow - * your code to be used under both. - * - * Full text of the relevant licenses is included below. - * - * ============================================================================ - * - * University of Illinois/NCSA - * Open Source License - * - * Copyright (c) 2009-2014 by the contributors listed in CREDITS.TXT (included - * below) - * - * All rights reserved. - * - * Developed by: - * - * LLVM Team - * - * University of Illinois at Urbana-Champaign - * - * http://llvm.org - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * with the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in the - * documentation and/or other materials provided with the distribution. 
- * - * * Neither the names of the LLVM Team, University of Illinois at - * Urbana-Champaign, nor the names of its contributors may be used to - * endorse or promote products derived from this Software without specific - * prior written permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH - * THE SOFTWARE. - * - * ============================================================================== - * - * Copyright (c) 2009-2014 by the contributors listed in CREDITS.TXT (included - * below) - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - * ============================================================================== - * - * This file is a partial list of people who have contributed to the LLVM/libc++ - * project. If you have contributed a patch or made some other contribution to - * LLVM/libc++, please submit a patch to this file to add yourself, and it will - * be done! - * - * The list is sorted by surname and formatted to allow easy grepping and - * beautification by scripts. The fields are: name (N), email (E), web-address - * (W), PGP key ID and fingerprint (P), description (D), and snail-mail address - * (S). - * - * N: Saleem Abdulrasool - * E: compnerd@compnerd.org - * D: Minor patches and Linux fixes. - * - * N: Dimitry Andric - * E: dimitry@andric.com - * D: Visibility fixes, minor FreeBSD portability patches. - * - * N: Holger Arnold - * E: holgerar@gmail.com - * D: Minor fix. - * - * N: Ruben Van Boxem - * E: vanboxem dot ruben at gmail dot com - * D: Initial Windows patches. - * - * N: David Chisnall - * E: theraven at theravensnest dot org - * D: FreeBSD and Solaris ports, libcxxrt support, some atomics work. - * - * N: Marshall Clow - * E: mclow.lists@gmail.com - * E: marshall@idio.com - * D: C++14 support, patches and bug fixes. - * - * N: Bill Fisher - * E: william.w.fisher@gmail.com - * D: Regex bug fixes. - * - * N: Matthew Dempsky - * E: matthew@dempsky.org - * D: Minor patches and bug fixes. - * - * N: Google Inc. 
- * D: Copyright owner and contributor of the CityHash algorithm - * - * N: Howard Hinnant - * E: hhinnant@apple.com - * D: Architect and primary author of libc++ - * - * N: Hyeon-bin Jeong - * E: tuhertz@gmail.com - * D: Minor patches and bug fixes. - * - * N: Argyrios Kyrtzidis - * E: kyrtzidis@apple.com - * D: Bug fixes. - * - * N: Bruce Mitchener, Jr. - * E: bruce.mitchener@gmail.com - * D: Emscripten-related changes. - * - * N: Michel Morin - * E: mimomorin@gmail.com - * D: Minor patches to is_convertible. - * - * N: Andrew Morrow - * E: andrew.c.morrow@gmail.com - * D: Minor patches and Linux fixes. - * - * N: Arvid Picciani - * E: aep at exys dot org - * D: Minor patches and musl port. - * - * N: Bjorn Reese - * E: breese@users.sourceforge.net - * D: Initial regex prototype - * - * N: Nico Rieck - * E: nico.rieck@gmail.com - * D: Windows fixes - * - * N: Jonathan Sauer - * D: Minor patches, mostly related to constexpr - * - * N: Craig Silverstein - * E: csilvers@google.com - * D: Implemented Cityhash as the string hash function on 64-bit machines - * - * N: Richard Smith - * D: Minor patches. - * - * N: Joerg Sonnenberger - * E: joerg@NetBSD.org - * D: NetBSD port. - * - * N: Stephan Tolksdorf - * E: st@quanttec.com - * D: Minor fix - * - * N: Michael van der Westhuizen - * E: r1mikey at gmail dot com - * - * N: Klaas de Vries - * E: klaas at klaasgaaf dot nl - * D: Minor bug fix. - * - * N: Zhang Xiongpang - * E: zhangxiongpang@gmail.com - * D: Minor patches and bug fixes. - * - * N: Xing Xue - * E: xingxue@ca.ibm.com - * D: AIX port - * - * N: Zhihao Yuan - * E: lichray@gmail.com - * D: Standard compatibility fixes. - * - * N: Jeffrey Yasskin - * E: jyasskin@gmail.com - * E: jyasskin@google.com - * D: Linux fixes. - */ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/const-integer-set-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/const-integer-set-inl.h deleted file mode 100644 index b93846148a3e4595774507f638396ce13393ac0e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/const-integer-set-inl.h +++ /dev/null @@ -1,87 +0,0 @@ -// util/const-integer-set-inl.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_CONST_INTEGER_SET_INL_H_ -#define KALDI_UTIL_CONST_INTEGER_SET_INL_H_ - -// Do not include this file directly. It is included by const-integer-set.h - -namespace kaldi { - -template -void ConstIntegerSet::InitInternal() { - KALDI_ASSERT_IS_INTEGER_TYPE(I); - quick_set_.clear(); // just in case we previously had data. 
- if (slow_set_.size() == 0) { - lowest_member_ = (I)1; - highest_member_ = (I)0; - contiguous_ = false; - quick_ = false; - } else { - lowest_member_ = slow_set_.front(); - highest_member_ = slow_set_.back(); - size_t range = highest_member_ + 1 - lowest_member_; - if (range == slow_set_.size()) { - contiguous_ = true; - quick_ = false; - } else { - contiguous_ = false; - // If it would be more compact to store as bool - if (range < slow_set_.size() * 8 * sizeof(I)) { - // (assuming 1 bit per element)... - quick_set_.resize(range, false); - for (size_t i = 0; i < slow_set_.size(); i++) - quick_set_[slow_set_[i] - lowest_member_] = true; - quick_ = true; - } else { - quick_ = false; - } - } - } -} - -template -int ConstIntegerSet::count(I i) const { - if (i < lowest_member_ || i > highest_member_) { - return 0; - } else { - if (contiguous_) return true; - if (quick_) { - return (quick_set_[i - lowest_member_] ? 1 : 0); - } else { - bool ans = std::binary_search(slow_set_.begin(), slow_set_.end(), i); - return (ans ? 1 : 0); - } - } -} - -template -void ConstIntegerSet::Write(std::ostream &os, bool binary) const { - WriteIntegerVector(os, binary, slow_set_); -} - -template -void ConstIntegerSet::Read(std::istream &is, bool binary) { - ReadIntegerVector(is, binary, &slow_set_); - InitInternal(); -} - -} // end namespace kaldi - -#endif // KALDI_UTIL_CONST_INTEGER_SET_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/const-integer-set.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/const-integer-set.h deleted file mode 100644 index 809a56a7c83804bfaa4badb5e28059734bfcad1e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/const-integer-set.h +++ /dev/null @@ -1,96 +0,0 @@ -// util/const-integer-set.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_CONST_INTEGER_SET_H_ -#define KALDI_UTIL_CONST_INTEGER_SET_H_ -#include -#include -#include -#include -#include -#include "util/stl-utils.h" - -/* ConstIntegerSet is a way to efficiently test whether something is in a - supplied set of integers. It can be initialized from a vector or set, but - never changed after that. It either uses a sorted vector or an array of - bool, depending on the input. It behaves like a const version of an STL set, - with only a subset of the functionality, except all the member functions are - upper-case. - - Note that we could get rid of the member slow_set_, but we'd have to - do more work to implement an iterator type. This would save memory. 
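The representation choice made in `InitInternal()` and used by `count()` above boils down to the standalone sketch below (a toy `SmallIntSet`, not the real class): a bitmap when the value range is small relative to the number of elements, otherwise binary search over the sorted vector. The real class additionally special-cases a fully contiguous range and provides iteration and serialization.

```cpp
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

class SmallIntSet {
 public:
  // `sorted_unique` is assumed sorted and duplicate-free (the real class
  // sorts and uniques its input itself).
  explicit SmallIntSet(std::vector<int32_t> sorted_unique)
      : slow_(std::move(sorted_unique)) {
    if (slow_.empty()) return;
    lowest_ = slow_.front();
    size_t range = static_cast<size_t>(slow_.back() - lowest_) + 1;
    if (range < slow_.size() * 8 * sizeof(int32_t)) {  // same threshold as above
      quick_.assign(range, false);
      for (int32_t v : slow_) quick_[v - lowest_] = true;
    }
  }
  int count(int32_t i) const {                         // returns 1 or 0
    if (slow_.empty() || i < lowest_ || i > slow_.back()) return 0;
    if (!quick_.empty()) return quick_[i - lowest_] ? 1 : 0;
    return std::binary_search(slow_.begin(), slow_.end(), i) ? 1 : 0;
  }

 private:
  int32_t lowest_ = 0;
  std::vector<bool> quick_;
  std::vector<int32_t> slow_;
};
```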
-*/ - -namespace kaldi { - -template -class ConstIntegerSet { - public: - ConstIntegerSet() : lowest_member_(1), highest_member_(0) {} - - void Init(const std::vector &input) { - slow_set_ = input; - SortAndUniq(&slow_set_); - InitInternal(); - } - - void Init(const std::set &input) { - CopySetToVector(input, &slow_set_); - InitInternal(); - } - - explicit ConstIntegerSet(const std::vector &input) : slow_set_(input) { - SortAndUniq(&slow_set_); - InitInternal(); - } - explicit ConstIntegerSet(const std::set &input) { - CopySetToVector(input, &slow_set_); - InitInternal(); - } - explicit ConstIntegerSet(const ConstIntegerSet &other) - : slow_set_(other.slow_set_) { - InitInternal(); - } - - int count(I i) const; // returns 1 or 0. - - typedef typename std::vector::const_iterator iterator; - iterator begin() const { return slow_set_.begin(); } - iterator end() const { return slow_set_.end(); } - size_t size() const { return slow_set_.size(); } - bool empty() const { return slow_set_.empty(); } - - void Write(std::ostream &os, bool binary) const; - void Read(std::istream &is, bool binary); - - private: - I lowest_member_; - I highest_member_; - bool contiguous_; - bool quick_; - std::vector quick_set_; - std::vector slow_set_; - void InitInternal(); -}; - -} // end namespace kaldi - -#include "util/const-integer-set-inl.h" - -#endif // KALDI_UTIL_CONST_INTEGER_SET_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/hash-list-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/hash-list-inl.h deleted file mode 100644 index 063fa7131ec618f0aae9dc30f4edd26c9dcce7fe..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/hash-list-inl.h +++ /dev/null @@ -1,193 +0,0 @@ -// util/hash-list-inl.h - -// Copyright 2009-2011 Microsoft Corporation -// 2013 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_HASH_LIST_INL_H_ -#define KALDI_UTIL_HASH_LIST_INL_H_ - -// Do not include this file directly. It is included by fast-hash.h - -namespace kaldi { - -template -HashList::HashList() { - list_head_ = NULL; - bucket_list_tail_ = static_cast(-1); // invalid. - hash_size_ = 0; - freed_head_ = NULL; -} - -template -void HashList::SetSize(size_t size) { - hash_size_ = size; - KALDI_ASSERT(list_head_ == NULL && - bucket_list_tail_ == - static_cast(-1)); // make sure empty. - if (size > buckets_.size()) buckets_.resize(size, HashBucket(0, NULL)); -} - -template -typename HashList::Elem *HashList::Clear() { - // Clears the hashtable and gives ownership of the currently contained list - // to the user. 
- for (size_t cur_bucket = bucket_list_tail_; - cur_bucket != static_cast(-1); - cur_bucket = buckets_[cur_bucket].prev_bucket) { - buckets_[cur_bucket].last_elem = NULL; // this is how we indicate "empty". - } - bucket_list_tail_ = static_cast(-1); - Elem *ans = list_head_; - list_head_ = NULL; - return ans; -} - -template -const typename HashList::Elem *HashList::GetList() const { - return list_head_; -} - -template -inline void HashList::Delete(Elem *e) { - e->tail = freed_head_; - freed_head_ = e; -} - -template -inline typename HashList::Elem *HashList::Find(I key) { - size_t index = (static_cast(key) % hash_size_); - HashBucket &bucket = buckets_[index]; - if (bucket.last_elem == NULL) { - return NULL; // empty bucket. - } else { - Elem *head = (bucket.prev_bucket == static_cast(-1) - ? list_head_ - : buckets_[bucket.prev_bucket].last_elem->tail), - *tail = bucket.last_elem->tail; - for (Elem *e = head; e != tail; e = e->tail) - if (e->key == key) return e; - return NULL; // Not found. - } -} - -template -inline typename HashList::Elem *HashList::New() { - if (freed_head_) { - Elem *ans = freed_head_; - freed_head_ = freed_head_->tail; - return ans; - } else { - Elem *tmp = new Elem[allocate_block_size_]; - for (size_t i = 0; i + 1 < allocate_block_size_; i++) - tmp[i].tail = tmp + i + 1; - tmp[allocate_block_size_ - 1].tail = NULL; - freed_head_ = tmp; - allocated_.push_back(tmp); - return this->New(); - } -} - -template -HashList::~HashList() { - // First test whether we had any memory leak within the - // HashList, i.e. things for which the user did not call Delete(). - size_t num_in_list = 0, num_allocated = 0; - for (Elem *e = freed_head_; e != NULL; e = e->tail) num_in_list++; - for (size_t i = 0; i < allocated_.size(); i++) { - num_allocated += allocate_block_size_; - delete[] allocated_[i]; - } - if (num_in_list != num_allocated) { - KALDI_WARN << "Possible memory leak: " << num_in_list - << " != " << num_allocated - << ": you might have forgotten to call Delete on " - << "some Elems"; - } -} - -template -inline typename HashList::Elem *HashList::Insert(I key, T val) { - size_t index = (static_cast(key) % hash_size_); - HashBucket &bucket = buckets_[index]; - // Check the element is existing or not. - if (bucket.last_elem != NULL) { - Elem *head = (bucket.prev_bucket == static_cast(-1) - ? list_head_ - : buckets_[bucket.prev_bucket].last_elem->tail), - *tail = bucket.last_elem->tail; - for (Elem *e = head; e != tail; e = e->tail) - if (e->key == key) return e; - } - - // This is a new element. Insert it. - Elem *elem = New(); - elem->key = key; - elem->val = val; - if (bucket.last_elem == NULL) { // Unoccupied bucket. Insert at - // head of bucket list (which is tail of regular list, they go in - // opposite directions). - if (bucket_list_tail_ == static_cast(-1)) { - // list was empty so this is the first elem. - KALDI_ASSERT(list_head_ == NULL); - list_head_ = elem; - } else { - // link in to the chain of Elems - buckets_[bucket_list_tail_].last_elem->tail = elem; - } - elem->tail = NULL; - bucket.last_elem = elem; - bucket.prev_bucket = bucket_list_tail_; - bucket_list_tail_ = index; - } else { - // Already-occupied bucket. Insert at tail of list of elements within - // the bucket. 
- elem->tail = bucket.last_elem->tail; - bucket.last_elem->tail = elem; - bucket.last_elem = elem; - } - return elem; -} - -template -void HashList::InsertMore(I key, T val) { - size_t index = (static_cast(key) % hash_size_); - HashBucket &bucket = buckets_[index]; - Elem *elem = New(); - elem->key = key; - elem->val = val; - - KALDI_ASSERT(bucket.last_elem != NULL); // assume one element is already here - if (bucket.last_elem->key == key) { // standard behavior: add as last element - elem->tail = bucket.last_elem->tail; - bucket.last_elem->tail = elem; - bucket.last_elem = elem; - return; - } - Elem *e = (bucket.prev_bucket == static_cast(-1) - ? list_head_ - : buckets_[bucket.prev_bucket].last_elem->tail); - // find place to insert in linked list - while (e != bucket.last_elem->tail && e->key != key) e = e->tail; - KALDI_ASSERT(e->key == key); // not found? - should not happen - elem->tail = e->tail; - e->tail = elem; -} - -} // end namespace kaldi - -#endif // KALDI_UTIL_HASH_LIST_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/hash-list.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/hash-list.h deleted file mode 100644 index 31cc9bdc4870773475f8c5139539e320746bf5fe..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/hash-list.h +++ /dev/null @@ -1,146 +0,0 @@ -// util/hash-list.h - -// Copyright 2009-2011 Microsoft Corporation -// 2013 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_HASH_LIST_H_ -#define KALDI_UTIL_HASH_LIST_H_ - -#include -#include -#include -#include -#include - -#include "base/kaldi-error.h" - -/* This header provides utilities for a structure that's used in a decoder (but - is quite generic in nature so we implement and test it separately). - Basically it's a singly-linked list, but implemented in such a way that we - can quickly search for elements in the list. We give it a slightly richer - interface than just a hash and a list. The idea is that we want to separate - the hash part and the list part: basically, in the decoder, we want to have a - single hash for the current frame and the next frame, because by the time we - need to access the hash for the next frame we no longer need the hash for the - previous frame. So we have an operation that clears the hash but leaves the - list structure intact. We also control memory management inside this object, - to avoid repeated new's/deletes. - - See hash-list-test.cc for an example of how to use this object. -*/ - -namespace kaldi { - -template -class HashList { - public: - struct Elem { - I key; - T val; - Elem *tail; - }; - - /// Constructor takes no arguments. - /// Call SetSize to inform it of the likely size. 
- HashList(); - - /// Clears the hash and gives the head of the current list to the user; - /// ownership is transferred to the user (the user must call Delete() - /// for each element in the list, at his/her leisure). - Elem *Clear(); - - /// Gives the head of the current list to the user. Ownership retained in the - /// class. Caution: in December 2013 the return type was changed to const - /// Elem* and this function was made const. You may need to change some types - /// of local Elem* variables to const if this produces compilation errors. - const Elem *GetList() const; - - /// Think of this like delete(). It is to be called for each Elem in turn - /// after you "obtained ownership" by doing Clear(). This is not the opposite - /// of. Insert, it is the opposite of New. It's really a memory operation. - inline void Delete(Elem *e); - - /// This should probably not be needed to be called directly by the user. - /// Think of it as opposite - /// to Delete(); - inline Elem *New(); - - /// Find tries to find this element in the current list using the hashtable. - /// It returns NULL if not present. The Elem it returns is not owned by the - /// user, it is part of the internal list owned by this object, but the user - /// is free to modify the "val" element. - inline Elem *Find(I key); - - /// Insert inserts a new element into the hashtable/stored list. - /// Because element keys in a hashtable are unique, this operation checks - /// whether each inserted element has a key equivalent to the one of an - /// element already in the hashtable. If so, the element is not inserted, - /// returning an pointer to this existing element. - inline Elem *Insert(I key, T val); - - /// Insert inserts another element with same key into the hashtable/ - /// stored list. - /// By calling this, the user asserts that one element with that key is - /// already present. - /// We insert it that way, that all elements with the same key - /// follow each other. - /// Find() will return the first one of the elements with the same key. - inline void InsertMore(I key, T val); - - /// SetSize tells the object how many hash buckets to allocate (should - /// typically be at least twice the number of objects we expect to go in the - /// structure, for fastest performance). It must be called while the hash - /// is empty (e.g. after Clear() or after initializing the object, but before - /// adding anything to the hash. - void SetSize(size_t sz); - - /// Returns current number of hash buckets. - inline size_t Size() { return hash_size_; } - - ~HashList(); - - private: - struct HashBucket { - size_t prev_bucket; // index to next bucket (-1 if list tail). Note: - // list of buckets goes in opposite direction to list of Elems. - Elem *last_elem; // pointer to last element in this bucket (NULL if empty) - inline HashBucket(size_t i, Elem *e) : prev_bucket(i), last_elem(e) {} - }; - - Elem *list_head_; // head of currently stored list. - size_t bucket_list_tail_; // tail of list of active hash buckets. - - size_t hash_size_; // number of hash buckets. - - std::vector buckets_; - - Elem *freed_head_; // head of list of currently freed elements. [ready for - // allocation] - - std::vector allocated_; // list of allocated blocks. - - static const size_t allocate_block_size_ = 1024; // Number of Elements to - // allocate in one block. Must be largish so storing allocated_ doesn't - // become a problem. 
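Putting the interface above together, a typical per-frame usage (the decoder pattern the header comment describes) looks roughly like this; the key/value types and sizes are illustrative only:

```cpp
#include "util/hash-list.h"

void OneFrame() {
  kaldi::HashList<int, float> toks;
  toks.SetSize(2048);                    // call while empty, ~2x expected entries
  toks.Insert(42, 1.5f);                 // new key: inserted
  kaldi::HashList<int, float>::Elem *e = toks.Find(42);
  if (e != nullptr) e->val = 0.7f;       // "val" may be modified in place
  // End of frame: take ownership of the list, then return each Elem to the pool.
  for (kaldi::HashList<int, float>::Elem *cur = toks.Clear(); cur != nullptr;) {
    kaldi::HashList<int, float>::Elem *next = cur->tail;
    toks.Delete(cur);
    cur = next;
  }
}
```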
-}; - -} // end namespace kaldi - -#include "util/hash-list-inl.h" - -#endif // KALDI_UTIL_HASH_LIST_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/kaldi-io-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/kaldi-io-inl.h deleted file mode 100644 index 8b0c92131c4af2113eb33da6f3cfa9dc4dee83e1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/kaldi-io-inl.h +++ /dev/null @@ -1,40 +0,0 @@ -// util/kaldi-io-inl.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. -#ifndef KALDI_UTIL_KALDI_IO_INL_H_ -#define KALDI_UTIL_KALDI_IO_INL_H_ - -#include - -namespace kaldi { - -bool Input::Open(const std::string &rxfilename, bool *binary) { - return OpenInternal(rxfilename, true, binary); -} - -bool Input::OpenTextMode(const std::string &rxfilename) { - return OpenInternal(rxfilename, false, NULL); -} - -bool Input::IsOpen() { return impl_ != NULL; } - -bool Output::IsOpen() { return impl_ != NULL; } - -} // end namespace kaldi. - -#endif // KALDI_UTIL_KALDI_IO_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/kaldi-io.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/kaldi-io.cc deleted file mode 100644 index 5f8ec4870138df32f6aca9c12383cf3885411741..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/kaldi-io.cc +++ /dev/null @@ -1,898 +0,0 @@ -// util/kaldi-io.cc - -// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
-#include "util/kaldi-io.h" - -#include -#include -#include - -#include - -#include "base/io-funcs.h" -#include "base/kaldi-math.h" -#include "util/kaldi-pipebuf.h" -#include "util/parse-options.h" -#include "util/text-utils.h" - -#ifdef KALDI_CYGWIN_COMPAT -#include "util/kaldi-cygwin-io-inl.h" -#define MapOsPath(x) MapCygwinPath(x) -#else // KALDI_CYGWIN_COMPAT -#define MapOsPath(x) x -#endif // KALDI_CYGWIN_COMPAT - -#if defined(_MSC_VER) -static FILE *popen(const char *command, const char *mode) { -#ifdef KALDI_CYGWIN_COMPAT - return kaldi::CygwinCompatPopen(command, mode); -#else // KALDI_CYGWIN_COMPAT - return _popen(command, mode); -#endif // KALDI_CYGWIN_COMPAT -} -#endif // _MSC_VER - -namespace kaldi { - -#ifndef _MSC_VER // on VS, we don't need this type. -// could replace basic_pipebuf with stdio_filebuf on some platforms. -// Would mean we could use less of our own code. -typedef basic_pipebuf PipebufType; -#endif -} // namespace kaldi - -namespace kaldi { - -std::string PrintableRxfilename(const std::string &rxfilename) { - if (rxfilename == "" || rxfilename == "-") { - return "standard input"; - } else { - // If this call to Escape later causes compilation issues, - // just replace it with "return rxfilename"; it's only a - // pretty-printing issue. - return ParseOptions::Escape(rxfilename); - } -} - -std::string PrintableWxfilename(const std::string &wxfilename) { - if (wxfilename == "" || wxfilename == "-") { - return "standard output"; - } else { - // If this call to Escape later causes compilation issues, - // just replace it with "return wxfilename"; it's only a - // pretty-printing issue. - return ParseOptions::Escape(wxfilename); - } -} - -OutputType ClassifyWxfilename(const std::string &filename) { - const char *c = filename.c_str(); - size_t length = filename.length(); - char first_char = c[0], - last_char = (length == 0 ? '\0' : c[filename.length() - 1]); - - // if 'filename' is "" or "-", return kStandardOutput. - if (length == 0 || (length == 1 && first_char == '-')) { - return kStandardOutput; - } else if (first_char == '|') { - return kPipeOutput; // An output pipe like "|blah". - } else if (isspace(first_char) || isspace(last_char) || last_char == '|') { - return kNoOutput; // Leading or trailing space: can't interpret this. - // Final '|' would represent an input pipe, not an - // output pipe. - // } else if ((first_char == 'a' || first_char == 's') && - // strchr(c, ':') != NULL && - // (ClassifyWspecifier(filename, NULL, NULL, NULL) != - // kNoWspecifier || - // ClassifyRspecifier(filename, NULL, NULL) != kNoRspecifier)) { - // // e.g. ark:something or scp:something... this is almost certainly a - // // scripting error, so call it an error rather than treating it as a - // file. - // // In practice in modern kaldi scripts all (r,w)filenames begin with - // "ark" - // // or "scp", even though technically speaking options like "b", "t", - // "s" or - // // "cs" can appear before the ark or scp, like "b,ark". For - // efficiency, - // // and because this code is really just a nicety to catch errors - // earlier - // // than they would otherwise be caught, we only call those extra - // functions - // // for filenames beginning with 'a' or 's'. - // return kNoOutput; - } else if (isdigit(last_char)) { - // This could be a file, but we have to see if it's an offset into a file - // (like foo.ark:4314328), which is not allowed for writing (but is - // allowed for reaching). 
This eliminates some things which would be - // valid UNIX filenames but are not allowed by Kaldi. (Even if we allowed - // such filenames for writing, we woudln't be able to correctly read them). - const char *d = c + length - 1; - while (isdigit(*d) && d > c) d--; - if (*d == ':') return kNoOutput; - // else it could still be a filename; continue to the next check. - } - - // At this point it matched no other pattern so we assume a filename, but we - // check for internal '|' as it's a common source of errors to have pipe - // commands without the pipe in the right place. Say that it can't be - // classified. - if (strchr(c, '|') != NULL) { - KALDI_WARN << "Trying to classify wxfilename with pipe symbol in the" - " wrong place (pipe without | at the beginning?): " - << filename; - return kNoOutput; - } - return kFileOutput; // It matched no other pattern: assume it's a filename. -} - -InputType ClassifyRxfilename(const std::string &filename) { - const char *c = filename.c_str(); - size_t length = filename.length(); - char first_char = c[0], - last_char = (length == 0 ? '\0' : c[filename.length() - 1]); - - // if 'filename' is "" or "-", return kStandardInput. - if (length == 0 || (length == 1 && first_char == '-')) { - return kStandardInput; - } else if (first_char == '|') { - return kNoInput; // An output pipe like "|blah": not - // valid for input. - } else if (last_char == '|') { - return kPipeInput; - } else if (isspace(first_char) || isspace(last_char)) { - return kNoInput; // We don't allow leading or trailing space in a filename. - // } else if ((first_char == 'a' || first_char == 's') && - // strchr(c, ':') != NULL && - // (ClassifyWspecifier(filename, NULL, NULL, NULL) != - // kNoWspecifier || - // ClassifyRspecifier(filename, NULL, NULL) != kNoRspecifier)) { - // // e.g. ark:something or scp:something... this is almost certainly a - // // scripting error, so call it an error rather than treating it as a - // file. - // // In practice in modern kaldi scripts all (r,w)filenames begin with - // "ark" - // // or "scp", even though technically speaking options like "b", "t", - // "s" or - // // "cs" can appear before the ark or scp, like "b,ark". For - // efficiency, - // // and because this code is really just a nicety to catch errors - // earlier - // // than they would otherwise be caught, we only call those extra - // functions - // // for filenames beginning with 'a' or 's'. - // return kNoInput; - } else if (isdigit(last_char)) { - const char *d = c + length - 1; - while (isdigit(*d) && d > c) d--; - if (*d == ':') - return kOffsetFileInput; // Filename is like - // some_file:12345 - // otherwise it could still be a filename; continue to the next check. - } - - // At this point it matched no other pattern so we assume a filename, but - // we check for '|' as it's a common source of errors to have pipe - // commands without the pipe in the right place. Say that it can't be - // classified in this case. - if (strchr(c, '|') != NULL) { - KALDI_WARN << "Trying to classify rxfilename with pipe symbol in the" - " wrong place (pipe without | at the end?): " - << filename; - return kNoInput; - } - return kFileInput; // It matched no other pattern: assume it's a filename. -} - -class OutputImplBase { - public: - // Open will open it as a file (no header), and return true - // on success. It cannot be called on an already open stream. 
- virtual bool Open(const std::string &filename, bool binary) = 0; - virtual std::ostream &Stream() = 0; - virtual bool Close() = 0; - virtual ~OutputImplBase() {} -}; - -class FileOutputImpl : public OutputImplBase { - public: - virtual bool Open(const std::string &filename, bool binary) { - if (os_.is_open()) - KALDI_ERR << "FileOutputImpl::Open(), " - << "open called on already open file."; - filename_ = filename; - os_.open(MapOsPath(filename_).c_str(), - binary ? std::ios_base::out | std::ios_base::binary - : std::ios_base::out); - return os_.is_open(); - } - - virtual std::ostream &Stream() { - if (!os_.is_open()) - KALDI_ERR << "FileOutputImpl::Stream(), file is not open."; - // I believe this error can only arise from coding error. - return os_; - } - - virtual bool Close() { - if (!os_.is_open()) - KALDI_ERR << "FileOutputImpl::Close(), file is not open."; - // I believe this error can only arise from coding error. - os_.close(); - return !(os_.fail()); - } - virtual ~FileOutputImpl() { - if (os_.is_open()) { - os_.close(); - if (os_.fail()) KALDI_ERR << "Error closing output file " << filename_; - } - } - - private: - std::string filename_; - std::ofstream os_; -}; - -class StandardOutputImpl : public OutputImplBase { - public: - StandardOutputImpl() : is_open_(false) {} - - virtual bool Open(const std::string &filename, bool binary) { - if (is_open_) - KALDI_ERR << "StandardOutputImpl::Open(), " - "open called on already open file."; -#ifdef _MSC_VER - _setmode(_fileno(stdout), binary ? _O_BINARY : _O_TEXT); -#endif - is_open_ = std::cout.good(); - return is_open_; - } - - virtual std::ostream &Stream() { - if (!is_open_) - KALDI_ERR << "StandardOutputImpl::Stream(), object not initialized."; - // I believe this error can only arise from coding error. - return std::cout; - } - - virtual bool Close() { - if (!is_open_) - KALDI_ERR << "StandardOutputImpl::Close(), file is not open."; - is_open_ = false; - std::cout << std::flush; - return !(std::cout.fail()); - } - virtual ~StandardOutputImpl() { - if (is_open_) { - std::cout << std::flush; - if (std::cout.fail()) KALDI_ERR << "Error writing to standard output"; - } - } - - private: - bool is_open_; -}; - -class PipeOutputImpl : public OutputImplBase { - public: - PipeOutputImpl() : f_(NULL), os_(NULL) {} - - virtual bool Open(const std::string &wxfilename, bool binary) { - filename_ = wxfilename; - KALDI_ASSERT(f_ == NULL); // Make sure closed. - KALDI_ASSERT(wxfilename.length() != 0 && wxfilename[0] == '|'); // should - // start with '|' - std::string cmd_name(wxfilename, 1); -#if defined(_MSC_VER) || defined(__CYGWIN__) - f_ = popen(cmd_name.c_str(), (binary ? "wb" : "w")); -#else - f_ = popen(cmd_name.c_str(), "w"); -#endif - if (!f_) { // Failure. - KALDI_WARN << "Failed opening pipe for writing, command is: " << cmd_name - << ", errno is " << strerror(errno); - return false; - } else { -#ifndef _MSC_VER - fb_ = new PipebufType(f_, // Using this constructor won't make the - // destructor try to close the stream when - // we're done. - (binary ? std::ios_base::out | std::ios_base::binary - : std::ios_base::out)); - KALDI_ASSERT(fb_ != NULL); // or would be alloc error. - os_ = new std::ostream(fb_); -#else - os_ = new std::ofstream(f_); -#endif - return os_->good(); - } - } - - virtual std::ostream &Stream() { - if (os_ == NULL) - KALDI_ERR << "PipeOutputImpl::Stream()," - " object not initialized."; - // I believe this error can only arise from coding error. 
- return *os_; - } - - virtual bool Close() { - if (os_ == NULL) KALDI_ERR << "PipeOutputImpl::Close(), file is not open."; - bool ok = true; - os_->flush(); - if (os_->fail()) ok = false; - delete os_; - os_ = NULL; - int status; -#ifdef _MSC_VER - status = _pclose(f_); -#else - status = pclose(f_); -#endif - if (status) - KALDI_WARN << "Pipe " << filename_ << " had nonzero return status " - << status; - f_ = NULL; -#ifndef _MSC_VER - delete fb_; - fb_ = NULL; -#endif - return ok; - } - virtual ~PipeOutputImpl() { - if (os_) { - if (!Close()) - KALDI_ERR << "Error writing to pipe " << PrintableWxfilename(filename_); - } - } - - private: - std::string filename_; - FILE *f_; -#ifndef _MSC_VER - PipebufType *fb_; -#endif - std::ostream *os_; -}; - -class InputImplBase { - public: - // Open will open it as a file, and return true on success. - // May be called twice only for kOffsetFileInput (otherwise, - // if called twice, we just create a new Input object, to avoid - // having to deal with the extra hassle of reopening with the - // same object. - // Note that we will to call Open with true (binary) for - // for text-mode Kaldi files; the only actual text-mode input - // is for non-Kaldi files. - virtual bool Open(const std::string &filename, bool binary) = 0; - virtual std::istream &Stream() = 0; - virtual int32 Close() = 0; // We only need to check failure in the case of - // kPipeInput. - // on close for input streams. - virtual InputType MyType() = 0; // Because if it's kOffsetFileInput, we may - // call Open twice - // (has efficiency benefits). - - virtual ~InputImplBase() {} -}; - -class FileInputImpl : public InputImplBase { - public: - virtual bool Open(const std::string &filename, bool binary) { - if (is_.is_open()) - KALDI_ERR << "FileInputImpl::Open(), " - << "open called on already open file."; - is_.open( - MapOsPath(filename).c_str(), - binary ? std::ios_base::in | std::ios_base::binary : std::ios_base::in); - return is_.is_open(); - } - - virtual std::istream &Stream() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Stream(), file is not open."; - // I believe this error can only arise from coding error. - return is_; - } - - virtual int32 Close() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Close(), file is not open."; - // I believe this error can only arise from coding error. - is_.close(); - // Don't check status. - return 0; - } - - virtual InputType MyType() { return kFileInput; } - - virtual ~FileInputImpl() { - // Stream will automatically be closed, and we don't care about - // whether it fails. - } - - private: - std::ifstream is_; -}; - -class StandardInputImpl : public InputImplBase { - public: - StandardInputImpl() : is_open_(false) {} - - virtual bool Open(const std::string &filename, bool binary) { - if (is_open_) - KALDI_ERR << "StandardInputImpl::Open(), " - "open called on already open file."; - is_open_ = true; -#ifdef _MSC_VER - _setmode(_fileno(stdin), binary ? _O_BINARY : _O_TEXT); -#endif - return true; // Don't check good() because would be false if - // eof, which may be valid input. - } - - virtual std::istream &Stream() { - if (!is_open_) - KALDI_ERR << "StandardInputImpl::Stream(), object not initialized."; - // I believe this error can only arise from coding error. 
- return std::cin; - } - - virtual InputType MyType() { return kStandardInput; } - - virtual int32 Close() { - if (!is_open_) KALDI_ERR << "StandardInputImpl::Close(), file is not open."; - is_open_ = false; - return 0; - } - virtual ~StandardInputImpl() {} - - private: - bool is_open_; -}; - -class PipeInputImpl : public InputImplBase { - public: - PipeInputImpl() : f_(NULL), is_(NULL) {} - - virtual bool Open(const std::string &rxfilename, bool binary) { - filename_ = rxfilename; - KALDI_ASSERT(f_ == NULL); // Make sure closed. - KALDI_ASSERT(rxfilename.length() != 0 && - rxfilename[rxfilename.length() - 1] == - '|'); // should end with '|' - std::string cmd_name(rxfilename, 0, rxfilename.length() - 1); -#if defined(_MSC_VER) || defined(__CYGWIN__) - f_ = popen(cmd_name.c_str(), (binary ? "rb" : "r")); -#else - f_ = popen(cmd_name.c_str(), "r"); -#endif - - if (!f_) { // Failure. - KALDI_WARN << "Failed opening pipe for reading, command is: " << cmd_name - << ", errno is " << strerror(errno); - return false; - } else { -#ifndef _MSC_VER - fb_ = new PipebufType(f_, // Using this constructor won't lead the - // destructor to close the stream. - (binary ? std::ios_base::in | std::ios_base::binary - : std::ios_base::in)); - KALDI_ASSERT(fb_ != NULL); // or would be alloc error. - is_ = new std::istream(fb_); -#else - is_ = new std::ifstream(f_); -#endif - if (is_->fail() || is_->bad()) return false; - if (is_->eof()) { - KALDI_WARN << "Pipe opened with command " - << PrintableRxfilename(rxfilename) << " is empty."; - // don't return false: empty may be valid. - } - return true; - } - } - - virtual std::istream &Stream() { - if (is_ == NULL) - KALDI_ERR << "PipeInputImpl::Stream(), object not initialized."; - // I believe this error can only arise from coding error. - return *is_; - } - - virtual int32 Close() { - if (is_ == NULL) KALDI_ERR << "PipeInputImpl::Close(), file is not open."; - delete is_; - is_ = NULL; - int32 status; -#ifdef _MSC_VER - status = _pclose(f_); -#else - status = pclose(f_); -#endif - if (status) - KALDI_WARN << "Pipe " << filename_ << " had nonzero return status " - << status; - f_ = NULL; -#ifndef _MSC_VER - delete fb_; - fb_ = NULL; -#endif - return status; - } - virtual ~PipeInputImpl() { - if (is_) Close(); - } - virtual InputType MyType() { return kPipeInput; } - - private: - std::string filename_; - FILE *f_; -#ifndef _MSC_VER - PipebufType *fb_; -#endif - std::istream *is_; -}; - -/* -#else - -// Just have an empty implementation of the pipe input that crashes if -// called. -class PipeInputImpl: public InputImplBase { - public: - PipeInputImpl() { KALDI_ASSERT(0 && "Pipe input not yet supported on this - platform."); } - virtual bool Open(const std::string, bool) { return 0; } - virtual std::istream &Stream() const { return NULL; } - virtual void Close() {} - virtual InputType MyType() { return kPipeInput; } -}; - -#endif -*/ - -class OffsetFileInputImpl : public InputImplBase { - // This class is a bit more complicated than the - - public: - // splits a filename like /my/file:123 into /my/file and the - // number 123. Crashes if not this format. - static void SplitFilename(const std::string &rxfilename, - std::string *filename, size_t *offset) { - size_t pos = rxfilename.find_last_of(':'); - KALDI_ASSERT(pos != std::string::npos); // would indicate error in calling - // code, as the filename is supposed to be of the correct form at this - // point. 
- *filename = std::string(rxfilename, 0, pos); - std::string number(rxfilename, pos + 1); - bool ans = ConvertStringToInteger(number, offset); - if (!ans) - KALDI_ERR << "Cannot get offset from filename " << rxfilename - << " (possibly you compiled in 32-bit and have a >32-bit" - << " byte offset into a file; you'll have to compile 64-bit."; - } - - bool Seek(size_t offset) { - size_t cur_pos = is_.tellg(); - if (cur_pos == offset) { - return true; - } else if (cur_pos < offset && cur_pos + 100 > offset) { - // We're close enough that it may be faster to just - // read that data, rather than seek. - for (size_t i = cur_pos; i < offset; i++) is_.get(); - return (is_.tellg() == std::streampos(offset)); - } - // Try to actually seek. - is_.seekg(offset, std::ios_base::beg); - if (is_.fail()) { // failbit or badbit is set [error happened] - is_.close(); - return false; // failure. - } else { - is_.clear(); // Clear any failure bits (e.g. eof). - return true; // success. - } - } - - // This Open routine is unusual in that it is designed to work even - // if it was already open. This for efficiency when seeking multiple - // times. - virtual bool Open(const std::string &rxfilename, bool binary) { - if (is_.is_open()) { - // We are opening when we have an already-open file. - // We may have to seek within this file, or else close it and - // open a different one. - std::string tmp_filename; - size_t offset; - SplitFilename(rxfilename, &tmp_filename, &offset); - if (tmp_filename == filename_ && binary == binary_) { // Just seek - is_.clear(); // clear fail bit, etc. - return Seek(offset); - } else { - is_.close(); // don't bother checking error status of is_. - filename_ = tmp_filename; - is_.open(MapOsPath(filename_).c_str(), - binary ? std::ios_base::in | std::ios_base::binary - : std::ios_base::in); - if (!is_.is_open()) - return false; - else - return Seek(offset); - } - } else { - size_t offset; - SplitFilename(rxfilename, &filename_, &offset); - binary_ = binary; - is_.open(MapOsPath(filename_).c_str(), - binary ? std::ios_base::in | std::ios_base::binary - : std::ios_base::in); - if (!is_.is_open()) - return false; - else - return Seek(offset); - } - } - - virtual std::istream &Stream() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Stream(), file is not open."; - // I believe this error can only arise from coding error. - return is_; - } - - virtual int32 Close() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Close(), file is not open."; - // I believe this error can only arise from coding error. - is_.close(); - // Don't check status. - return 0; - } - - virtual InputType MyType() { return kOffsetFileInput; } - - virtual ~OffsetFileInputImpl() { - // Stream will automatically be closed, and we don't care about - // whether it fails. - } - - private: - std::string filename_; // the actual filename - bool binary_; // true if was opened in binary mode. - std::ifstream is_; -}; - -Output::Output(const std::string &wxfilename, bool binary, bool write_header) - : impl_(NULL) { - if (!Open(wxfilename, binary, write_header)) { - if (impl_) { - delete impl_; - impl_ = NULL; - } - KALDI_ERR << "Error opening output stream " - << PrintableWxfilename(wxfilename); - } -} - -bool Output::Close() { - if (!impl_) { - return false; // error to call Close if not open. 
- } else { - bool ans = impl_->Close(); - delete impl_; - impl_ = NULL; - return ans; - } -} - -Output::~Output() { - if (impl_) { - bool ok = impl_->Close(); - delete impl_; - impl_ = NULL; - if (!ok) - KALDI_ERR << "Error closing output file " - << PrintableWxfilename(filename_) - << (ClassifyWxfilename(filename_) == kFileOutput - ? " (disk full?)" - : ""); - } -} - -std::ostream &Output::Stream() { // will throw if not open; else returns - // stream. - if (!impl_) KALDI_ERR << "Output::Stream() called but not open."; - return impl_->Stream(); -} - -bool Output::Open(const std::string &wxfn, bool binary, bool header) { - if (IsOpen()) { - if (!Close()) { // Throw here rather than return status, as it's an error - // about something else: if the user wanted to avoid the exception he/she - // could have called Close(). - KALDI_ERR << "Output::Open(), failed to close output stream: " - << PrintableWxfilename(filename_); - } - } - - filename_ = wxfn; - - OutputType type = ClassifyWxfilename(wxfn); - KALDI_ASSERT(impl_ == NULL); - - if (type == kFileOutput) { - impl_ = new FileOutputImpl(); - } else if (type == kStandardOutput) { - impl_ = new StandardOutputImpl(); - } else if (type == kPipeOutput) { - impl_ = new PipeOutputImpl(); - } else { // type == kNoOutput - KALDI_WARN << "Invalid output filename format " - << PrintableWxfilename(wxfn); - return false; - } - if (!impl_->Open(wxfn, binary)) { - delete impl_; - impl_ = NULL; - return false; // failed to open. - } else { // successfully opened it. - if (header) { - InitKaldiOutputStream(impl_->Stream(), binary); - bool ok = impl_->Stream().good(); // still OK? - if (!ok) { - delete impl_; - impl_ = NULL; - return false; - } - return true; - } else { - return true; - } - } -} - -Input::Input(const std::string &rxfilename, bool *binary) : impl_(NULL) { - if (!Open(rxfilename, binary)) { - KALDI_ERR << "Error opening input stream " - << PrintableRxfilename(rxfilename); - } -} - -int32 Input::Close() { - if (impl_) { - int32 ans = impl_->Close(); - delete impl_; - impl_ = NULL; - return ans; - } else { - return 0; - } -} - -bool Input::OpenInternal(const std::string &rxfilename, bool file_binary, - bool *contents_binary) { - InputType type = ClassifyRxfilename(rxfilename); - if (IsOpen()) { - // May have to close the stream first. - if (type == kOffsetFileInput && impl_->MyType() == kOffsetFileInput) { - // We want to use the same object to Open... this is in case - // the files are the same, so we can just seek. - if (!impl_->Open(rxfilename, file_binary)) { // true is binary mode-- - // always open in binary. - delete impl_; - impl_ = NULL; - return false; - } - // read the binary header, if requested. - if (contents_binary != NULL) - return InitKaldiInputStream(impl_->Stream(), contents_binary); - else - return true; - } else { - Close(); - // and fall through to code below which actually opens the file. - } - } - if (type == kFileInput) { - impl_ = new FileInputImpl(); - } else if (type == kStandardInput) { - impl_ = new StandardInputImpl(); - } else if (type == kPipeInput) { - impl_ = new PipeInputImpl(); - } else if (type == kOffsetFileInput) { - impl_ = new OffsetFileInputImpl(); - } else { // type == kNoInput - KALDI_WARN << "Invalid input filename format " - << PrintableRxfilename(rxfilename); - return false; - } - if (!impl_->Open(rxfilename, file_binary)) { // true is binary mode-- - // always read in binary. 
- delete impl_; - impl_ = NULL; - return false; - } - if (contents_binary != NULL) - return InitKaldiInputStream(impl_->Stream(), contents_binary); - else - return true; -} - -Input::~Input() { - if (impl_) Close(); -} - -std::istream &Input::Stream() { - if (!IsOpen()) KALDI_ERR << "Input::Stream(), not open."; - return impl_->Stream(); -} - -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m) { -// if (!filename.empty() && filename[filename.size() - 1] == ']') { -// // This filename seems to have a 'range'... like foo.ark:4312423[20:30]. -// // (the bit in square brackets is the range). -// std::string rxfilename, range; -// if (!ExtractRangeSpecifier(filename, &rxfilename, &range)) { -// KALDI_ERR << "Could not make sense of possible range specifier in -// filename " -// << "while reading matrix: " << filename; -// } -// Matrix temp; -// bool binary_in; -// Input ki(rxfilename, &binary_in); -// temp.Read(ki.Stream(), binary_in); -// if (!ExtractObjectRange(temp, range, m)) { -// KALDI_ERR << "Error extracting range of object: " << filename; -// } -// } else { -// // The normal case, there is no range. -// bool binary_in; -// Input ki(filename, &binary_in); -// m->Read(ki.Stream(), binary_in); -// } -// } -// -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m) { -// if (!filename.empty() && filename[filename.size() - 1] == ']') { -// // This filename seems to have a 'range'... like foo.ark:4312423[20:30]. -// // (the bit in square brackets is the range). -// std::string rxfilename, range; -// if (!ExtractRangeSpecifier(filename, &rxfilename, &range)) { -// KALDI_ERR << "Could not make sense of possible range specifier in -// filename " -// << "while reading matrix: " << filename; -// } -// Matrix temp; -// bool binary_in; -// Input ki(rxfilename, &binary_in); -// temp.Read(ki.Stream(), binary_in); -// if (!ExtractObjectRange(temp, range, m)) { -// KALDI_ERR << "Error extracting range of object: " << filename; -// } -// } else { -// // The normal case, there is no range. -// bool binary_in; -// Input ki(filename, &binary_in); -// m->Read(ki.Stream(), binary_in); -// } -// } - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/kaldi-io.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/kaldi-io.h deleted file mode 100644 index 2175ca8f89ed5f3e3bade26528e924208df692c6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/kaldi-io.h +++ /dev/null @@ -1,266 +0,0 @@ -// util/kaldi-io.h - -// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
-#ifndef KALDI_UTIL_KALDI_IO_H_ -#define KALDI_UTIL_KALDI_IO_H_ - -#ifdef _MSC_VER -#include -#include -#endif -#include // For isspace. -#include -#include -#include "base/kaldi-common.h" -// #include "matrix/kaldi-matrix.h" - -namespace kaldi { - -class OutputImplBase; // Forward decl; defined in a .cc file -class InputImplBase; // Forward decl; defined in a .cc file - -/// \addtogroup io_group -/// @{ - -// The Output and Input classes handle stream-opening for "extended" filenames -// that include actual files, standard-input/standard-output, pipes, and -// offsets into actual files. They also handle reading and writing the -// binary-mode headers for Kaldi files, where applicable. The classes have -// versions of the Open routines that throw and do not throw, depending whether -// the calling code wants to catch the errors or not; there are also versions -// that write (or do not write) the Kaldi binary-mode header that says if it's -// binary mode. Generally files that contain Kaldi objects will have the header -// on, so we know upon reading them whether they have the header. So you would -// use the OpenWithHeader routines for these (or the constructor); but other -// types of objects (e.g. FSTs) would have files without a header so you would -// use OpenNoHeader. - -// We now document the types of extended filenames that we use. -// -// A "wxfilename" is an extended filename for writing. It can take three forms: -// (1) Filename: e.g. "/some/filename", "./a/b/c", "c:\Users\dpovey\My -// Documents\\boo" -// (whatever the actual file-system interprets) -// (2) Standard output: "" or "-" -// (3) A pipe: e.g. "| gzip -c > /tmp/abc.gz" -// -// -// A "rxfilename" is an extended filename for reading. It can take four forms: -// (1) An actual filename, whatever the file-system can read, e.g. "/my/file". -// (2) Standard input: "" or "-" -// (3) A pipe: e.g. "gunzip -c /tmp/abc.gz |" -// (4) An offset into a file, e.g.: "/mnt/blah/data/1.ark:24871" -// [these are created by the Table and TableWriter classes; I may also write -// a program that creates them for arbitrary files] -// - -// Typical usage: -// ... -// bool binary; -// MyObject.Write(Output(some_filename, binary).Stream(), binary); -// -// ... more extensive example: -// { -// Output ko(some_filename, binary); -// MyObject1.Write(ko.Stream(), binary); -// MyObject2.Write(ko.Stream(), binary); -// } - -enum OutputType { kNoOutput, kFileOutput, kStandardOutput, kPipeOutput }; - -/// ClassifyWxfilename interprets filenames as follows: -/// - kNoOutput: invalid filenames (leading or trailing space, things that look -/// like wspecifiers and rspecifiers or like pipes to read from with leading -/// |. -/// - kFileOutput: Normal filenames -/// - kStandardOutput: The empty string or "-", interpreted as standard output -/// - kPipeOutput: pipes, e.g. "| gzip -c > /tmp/abc.gz" -OutputType ClassifyWxfilename(const std::string &wxfilename); - -enum InputType { - kNoInput, - kFileInput, - kStandardInput, - kOffsetFileInput, - kPipeInput -}; - -/// ClassifyRxfilenames interprets filenames for reading as follows: -/// - kNoInput: invalid filenames (leading or trailing space, things that -/// look like wspecifiers and rspecifiers or pipes to write to -/// with trailing |. -/// - kFileInput: normal filenames -/// - kStandardInput: the empty string or "-" -/// - kPipeInput: e.g. "gunzip -c /tmp/abc.gz |" -/// - kOffsetFileInput: offsets into files, e.g. 
/some/filename:12970 -InputType ClassifyRxfilename(const std::string &rxfilename); - -class Output { - public: - // The normal constructor, provided for convenience. - // Equivalent to calling with default constructor then Open() - // with these arguments. - Output(const std::string &filename, bool binary, bool write_header = true); - - Output() : impl_(NULL) {} - - /// This opens the stream, with the given mode (binary or text). It returns - /// true on success and false on failure. However, it will throw if something - /// was already open and could not be closed (to avoid this, call Close() - /// first. if write_header == true and binary == true, it writes the Kaldi - /// binary-mode header ('\0' then 'B'). You may call Open even if it is - /// already open; it will close the existing stream and reopen (however if - /// closing the old stream failed it will throw). - bool Open(const std::string &wxfilename, bool binary, bool write_header); - - inline bool IsOpen(); // return true if we have an open stream. Does not - // imply stream is good for writing. - - std::ostream &Stream(); // will throw if not open; else returns stream. - - // Close closes the stream. Calling Close is never necessary unless you - // want to avoid exceptions being thrown. There are times when calling - // Close will hurt efficiency (basically, when using offsets into files, - // and using the same Input object), - // but most of the time the user won't be doing this directly, it will - // be done in kaldi-table.{h, cc}, so you don't have to worry about it. - bool Close(); - - // This will throw if stream could not be closed (to check error status, - // call Close()). - ~Output(); - - private: - OutputImplBase *impl_; // non-NULL if open. - std::string filename_; - KALDI_DISALLOW_COPY_AND_ASSIGN(Output); -}; - -// bool binary_in; -// Input ki(some_filename, &binary_in); -// MyObject.Read(ki.Stream(), binary_in); -// -// ... more extensive example: -// -// { -// bool binary_in; -// Input ki(some_filename, &binary_in); -// MyObject1.Read(ki.Stream(), &binary_in); -// MyObject2.Write(ki.Stream(), &binary_in); -// } -// Note that to catch errors you need to use try.. catch. -// Input communicates errors by throwing exceptions. - -// Input interprets four kinds of filenames: -// (1) Normal filenames -// (2) The empty string or "-", interpreted as standard output -// (3) A pipe: e.g. "gunzip -c /tmp/abc.gz |" -// (4) Offsets into [real] files, e.g. "/my/filename:12049" -// The last one has no correspondence in Output. - -class Input { - public: - /// The normal constructor. Opens the stream in binary mode. - /// Equivalent to calling the default constructor followed by Open(); then, if - /// binary != NULL, it calls ReadHeader(), putting the output in "binary"; it - /// throws on error. - explicit Input(const std::string &rxfilename, bool *contents_binary = NULL); - - Input() : impl_(NULL) {} - - // Open opens the stream for reading (the mode, where relevant, is binary; use - // OpenTextMode for text-mode, we made this a separate function rather than a - // boolean argument, to avoid confusion with Kaldi's text/binary distinction, - // since reading in the file system's text mode is unusual.) If - // contents_binary != NULL, it reads the binary-mode header and puts it in the - // "binary" variable. Returns true on success. If it returns false it will - // not be open. 
You may call Open even if it is already open; it will close - // the existing stream and reopen (however if closing the old stream failed it - // will throw). - inline bool Open(const std::string &rxfilename, bool *contents_binary = NULL); - - // As Open but (if the file system has text/binary modes) opens in text mode; - // you shouldn't ever have to use this as in Kaldi we read even text files in - // binary mode (and ignore the \r). - inline bool OpenTextMode(const std::string &rxfilename); - - // Return true if currently open for reading and Stream() will - // succeed. Does not guarantee that the stream is good. - inline bool IsOpen(); - - // It is never necessary or helpful to call Close, except if - // you are concerned about to many filehandles being open. - // Close does not throw. It returns the exit code as int32 - // in the case of a pipe [kPipeInput], and always zero otherwise. - int32 Close(); - - // Returns the underlying stream. Throws if !IsOpen() - std::istream &Stream(); - - // Destructor does not throw: input streams may legitimately fail so we - // don't worry about the status when we close them. - ~Input(); - - private: - bool OpenInternal(const std::string &rxfilename, bool file_binary, - bool *contents_binary); - InputImplBase *impl_; - KALDI_DISALLOW_COPY_AND_ASSIGN(Input); -}; - -template -void ReadKaldiObject(const std::string &filename, C *c) { - bool binary_in; - Input ki(filename, &binary_in); - c->Read(ki.Stream(), binary_in); -} - -// Specialize the template for reading matrices, because we want to be able to -// support reading 'ranges' (row and column ranges), like foo.mat[10:20]. -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m); -// -// -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m); - -template -inline void WriteKaldiObject(const C &c, const std::string &filename, - bool binary) { - Output ko(filename, binary); - c.Write(ko.Stream(), binary); -} - -/// PrintableRxfilename turns the rxfilename into a more human-readable -/// form for error reporting, i.e. it does quoting and escaping and -/// replaces "" or "-" with "standard input". -std::string PrintableRxfilename(const std::string &rxfilename); - -/// PrintableWxfilename turns the wxfilename into a more human-readable -/// form for error reporting, i.e. it does quoting and escaping and -/// replaces "" or "-" with "standard output". -std::string PrintableWxfilename(const std::string &wxfilename); - -/// @} - -} // end namespace kaldi. - -#include "util/kaldi-io-inl.h" - -#endif // KALDI_UTIL_KALDI_IO_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/kaldi-pipebuf.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/kaldi-pipebuf.h deleted file mode 100644 index bcee80ccb1a6fa8ce3195483ac144c5ff66d2f89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/kaldi-pipebuf.h +++ /dev/null @@ -1,86 +0,0 @@ -// util/kaldi-pipebuf.h - -// Copyright 2009-2011 Ondrej Glembek - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -/** @file kaldi-pipebuf.h - * This is an Kaldi C++ Library header. - */ - -#ifndef KALDI_UTIL_KALDI_PIPEBUF_H_ -#define KALDI_UTIL_KALDI_PIPEBUF_H_ - -#include -#if !defined(_LIBCPP_VERSION) // libc++ -#include -#else -#include "util/basic-filebuf.h" -#endif - -namespace kaldi { -// This class provides a way to initialize a filebuf with a FILE* pointer -// directly; it will not close the file pointer when it is deleted. -// The C++ standard does not allow implementations of C++ to provide -// this constructor within basic_filebuf, which makes it hard to deal -// with pipes using completely native C++. This is a workaround - -#ifdef _MSC_VER -#elif defined(_LIBCPP_VERSION) // libc++ -template > -class basic_pipebuf : public basic_filebuf { - public: - typedef basic_pipebuf ThisType; - - public: - basic_pipebuf(FILE *fptr, std::ios_base::openmode mode) - : basic_filebuf() { - this->open(fptr, mode); - if (!this->is_open()) { - KALDI_WARN << "Error initializing pipebuf"; // probably indicates - // code error, if the fptr was good. - return; - } - } -}; // class basic_pipebuf -#else -template > -class basic_pipebuf : public std::basic_filebuf { - public: - typedef basic_pipebuf ThisType; - - public: - basic_pipebuf(FILE *fptr, std::ios_base::openmode mode) - : std::basic_filebuf() { - this->_M_file.sys_open(fptr, mode); - if (!this->is_open()) { - KALDI_WARN << "Error initializing pipebuf"; // probably indicates - // code error, if the fptr was good. - return; - } - this->_M_mode = mode; - this->_M_buf_size = BUFSIZ; - this->_M_allocate_internal_buffer(); - this->_M_reading = false; - this->_M_writing = false; - this->_M_set_buffer(-1); - } -}; // class basic_pipebuf -#endif // _MSC_VER - -} // namespace kaldi - -#endif // KALDI_UTIL_KALDI_PIPEBUF_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/parse-options.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/parse-options.cc deleted file mode 100644 index 1f2ef844d28d67ed58d2e0c9d7c7b674e8209df8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/parse-options.cc +++ /dev/null @@ -1,636 +0,0 @@ -// util/parse-options.cc - -// Copyright 2009-2011 Karel Vesely; Microsoft Corporation; -// Saarland University (Author: Arnab Ghoshal); -// Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey); -// Frantisek Skala; Arnab Ghoshal -// Copyright 2013 Tanel Alumae -// -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include - -#include "base/kaldi-common.h" -#include "util/parse-options.h" -#include "util/text-utils.h" - -namespace kaldi { - -ParseOptions::ParseOptions(const std::string &prefix, OptionsItf *other) - : print_args_(false), help_(false), usage_(""), argc_(0), argv_(NULL) { - ParseOptions *po = dynamic_cast(other); - if (po != NULL && po->other_parser_ != NULL) { - // we get here if this constructor is used twice, recursively. - other_parser_ = po->other_parser_; - } else { - other_parser_ = other; - } - if (po != NULL && po->prefix_ != "") { - prefix_ = po->prefix_ + std::string(".") + prefix; - } else { - prefix_ = prefix; - } -} - -void ParseOptions::Register(const std::string &name, bool *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, int32 *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, uint32 *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, float *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, double *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, std::string *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -// old-style, used for registering application-specific parameters -template -void ParseOptions::RegisterTmpl(const std::string &name, T *ptr, - const std::string &doc) { - if (other_parser_ == NULL) { - this->RegisterCommon(name, ptr, doc, false); - } else { - KALDI_ASSERT(prefix_ != "" && - "Cannot use empty prefix when registering with prefix."); - std::string new_name = prefix_ + '.' + name; // name becomes prefix.name - other_parser_->Register(new_name, ptr, doc); - } -} - -// does the common part of the job of registering a parameter -template -void ParseOptions::RegisterCommon(const std::string &name, T *ptr, - const std::string &doc, bool is_standard) { - KALDI_ASSERT(ptr != NULL); - std::string idx = name; - NormalizeArgName(&idx); - if (doc_map_.find(idx) != doc_map_.end()) - KALDI_WARN << "Registering option twice, ignoring second time: " << name; - this->RegisterSpecific(name, idx, ptr, doc, is_standard); -} - -// used to register standard parameters (those that are present in all of the -// applications) -template -void ParseOptions::RegisterStandard(const std::string &name, T *ptr, - const std::string &doc) { - this->RegisterCommon(name, ptr, doc, true); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, bool *b, - const std::string &doc, bool is_standard) { - bool_map_[idx] = b; - doc_map_[idx] = - DocInfo(name, doc + " (bool, default = " + ((*b) ? 
"true)" : "false)"), - is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, int32 *i, - const std::string &doc, bool is_standard) { - int_map_[idx] = i; - std::ostringstream ss; - ss << doc << " (int, default = " << *i << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, uint32 *u, - const std::string &doc, bool is_standard) { - uint_map_[idx] = u; - std::ostringstream ss; - ss << doc << " (uint, default = " << *u << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, float *f, - const std::string &doc, bool is_standard) { - float_map_[idx] = f; - std::ostringstream ss; - ss << doc << " (float, default = " << *f << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, double *f, - const std::string &doc, bool is_standard) { - double_map_[idx] = f; - std::ostringstream ss; - ss << doc << " (double, default = " << *f << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, std::string *s, - const std::string &doc, bool is_standard) { - string_map_[idx] = s; - doc_map_[idx] = - DocInfo(name, doc + " (string, default = \"" + *s + "\")", is_standard); -} -void ParseOptions::DisableOption(const std::string &name) { - if (argv_ != NULL) - KALDI_ERR << "DisableOption must not be called after calling Read()."; - if (doc_map_.erase(name) == 0) - KALDI_ERR << "Option " << name - << " was not registered so cannot be disabled: "; - bool_map_.erase(name); - int_map_.erase(name); - uint_map_.erase(name); - float_map_.erase(name); - double_map_.erase(name); - string_map_.erase(name); -} - -int ParseOptions::NumArgs() const { return positional_args_.size(); } - -std::string ParseOptions::GetArg(int i) const { - // use KALDI_ERR if code error - if (i < 1 || i > static_cast(positional_args_.size())) - KALDI_ERR << "ParseOptions::GetArg, invalid index " << i; - return positional_args_[i - 1]; -} - -// We currently do not support any other options. -enum ShellType { kBash = 0 }; - -// This can be changed in the code if it ever does need to be changed (as it's -// unlikely that one compilation of this tool-set would use both shells). -static ShellType kShellType = kBash; - -// Returns true if we need to escape a string before putting it into -// a shell (mainly thinking of bash shell, but should work for others) -// This is for the convenience of the user so command-lines that are -// printed out by ParseOptions::Read (with --print-args=true) are -// paste-able into the shell and will run. If you use a different type of -// shell, it might be necessary to change this function. -// But it's mostly a cosmetic issue as it basically affects how -// the program echoes its command-line arguments to the screen. -static bool MustBeQuoted(const std::string &str, ShellType st) { - // Only Bash is supported (for the moment). - KALDI_ASSERT(st == kBash && "Invalid shell type."); - - const char *c = str.c_str(); - if (*c == '\0') { - return true; // Must quote empty string - } else { - const char *ok_chars[2]; - - // These seem not to be interpreted as long as there are no other "bad" - // characters involved (e.g. 
"," would be interpreted as part of something - // like a{b,c}, but not on its own. - ok_chars[kBash] = "[]~#^_-+=:.,/"; - - // Just want to make sure that a space character doesn't get automatically - // inserted here via an automated style-checking script, like it did before. - KALDI_ASSERT(!strchr(ok_chars[kBash], ' ')); - - for (; *c != '\0'; c++) { - // For non-alphanumeric characters we have a list of characters which - // are OK. All others are forbidden (this is easier since the shell - // interprets most non-alphanumeric characters). - if (!isalnum(*c)) { - const char *d; - for (d = ok_chars[st]; *d != '\0'; d++) - if (*c == *d) break; - // If not alphanumeric or one of the "ok_chars", it must be escaped. - if (*d == '\0') return true; - } - } - return false; // The string was OK. No quoting or escaping. - } -} - -// Returns a quoted and escaped version of "str" -// which has previously been determined to need escaping. -// Our aim is to print out the command line in such a way that if it's -// pasted into a shell of ShellType "st" (only bash for now), it -// will get passed to the program in the same way. -static std::string QuoteAndEscape(const std::string &str, ShellType st) { - // Only Bash is supported (for the moment). - KALDI_ASSERT(st == kBash && "Invalid shell type."); - - // For now we use the following rules: - // In the normal case, we quote with single-quote "'", and to escape - // a single-quote we use the string: '\'' (interpreted as closing the - // single-quote, putting an escaped single-quote from the shell, and - // then reopening the single quote). - char quote_char = '\''; - const char *escape_str = "'\\''"; // e.g. echo 'a'\''b' returns a'b - - // If the string contains single-quotes that would need escaping this - // way, and we determine that the string could be safely double-quoted - // without requiring any escaping, then we double-quote the string. - // This is the case if the characters "`$\ do not appear in the string. - // e.g. see http://www.redhat.com/mirrors/LDP/LDP/abs/html/quotingvar.html - const char *c_str = str.c_str(); - if (strchr(c_str, '\'') && !strpbrk(c_str, "\"`$\\")) { - quote_char = '"'; - escape_str = "\\\""; // should never be accessed. - } - - char buf[2]; - buf[1] = '\0'; - - buf[0] = quote_char; - std::string ans = buf; - const char *c = str.c_str(); - for (; *c != '\0'; c++) { - if (*c == quote_char) { - ans += escape_str; - } else { - buf[0] = *c; - ans += buf; - } - } - buf[0] = quote_char; - ans += buf; - return ans; -} - -// static function -std::string ParseOptions::Escape(const std::string &str) { - return MustBeQuoted(str, kShellType) ? QuoteAndEscape(str, kShellType) : str; -} - -int ParseOptions::Read(int argc, const char *const argv[]) { - argc_ = argc; - argv_ = argv; - std::string key, value; - int i; - if (argc > 0) { - // set global "const char*" g_program_name (name of the program) - // so it can be printed out in error messages; - // it's useful because often the stderr of different programs will - // be mixed together in the same log file. -#ifdef _MSC_VER - const char *c = strrchr(argv[0], '\\'); -#else - const char *c = strrchr(argv[0], '/'); -#endif - SetProgramName(c == NULL ? 
argv[0] : c + 1); - } - // first pass: look for config parameter, look for priority - for (i = 1; i < argc; i++) { - if (std::strncmp(argv[i], "--", 2) == 0) { - if (std::strcmp(argv[i], "--") == 0) { - // a lone "--" marks the end of named options - break; - } - bool has_equal_sign; - SplitLongArg(argv[i], &key, &value, &has_equal_sign); - NormalizeArgName(&key); - Trim(&value); - if (key.compare("config") == 0) { - ReadConfigFile(value); - } - if (key.compare("help") == 0) { - PrintUsage(); - exit(0); - } - } - } - bool double_dash_seen = false; - // second pass: add the command line options - for (i = 1; i < argc; i++) { - if (std::strncmp(argv[i], "--", 2) == 0) { - if (std::strcmp(argv[i], "--") == 0) { - // A lone "--" marks the end of named options. - // Skip that option and break the processing of named options - i += 1; - double_dash_seen = true; - break; - } - bool has_equal_sign; - SplitLongArg(argv[i], &key, &value, &has_equal_sign); - NormalizeArgName(&key); - Trim(&value); - if (!SetOption(key, value, has_equal_sign)) { - PrintUsage(true); - KALDI_ERR << "Invalid option " << argv[i]; - } - } else { - break; - } - } - - // process remaining arguments as positional - for (; i < argc; i++) { - if ((std::strcmp(argv[i], "--") == 0) && !double_dash_seen) { - double_dash_seen = true; - } else { - positional_args_.push_back(std::string(argv[i])); - } - } - - // if the user did not suppress this with --print-args = false.... - if (print_args_) { - std::ostringstream strm; - for (int j = 0; j < argc; j++) strm << Escape(argv[j]) << " "; - strm << '\n'; - std::cerr << strm.str() << std::flush; - } - return i; -} - -void ParseOptions::PrintUsage(bool print_command_line) { - std::cerr << '\n' << usage_ << '\n'; - DocMapType::iterator it; - // first we print application-specific options - bool app_specific_header_printed = false; - for (it = doc_map_.begin(); it != doc_map_.end(); ++it) { - if (it->second.is_standard_ == false) { // application-specific option - if (app_specific_header_printed == false) { // header was not yet printed - std::cerr << "Options:" << '\n'; - app_specific_header_printed = true; - } - std::cerr << " --" << std::setw(25) << std::left << it->second.name_ - << " : " << it->second.use_msg_ << '\n'; - } - } - if (app_specific_header_printed == true) { - std::cerr << '\n'; - } - - // then the standard options - std::cerr << "Standard options:" << '\n'; - for (it = doc_map_.begin(); it != doc_map_.end(); ++it) { - if (it->second.is_standard_ == true) { // we have standard option - std::cerr << " --" << std::setw(25) << std::left << it->second.name_ - << " : " << it->second.use_msg_ << '\n'; - } - } - std::cerr << '\n'; - if (print_command_line) { - std::ostringstream strm; - strm << "Command line was: "; - for (int j = 0; j < argc_; j++) strm << Escape(argv_[j]) << " "; - strm << '\n'; - std::cerr << strm.str() << std::flush; - } -} - -void ParseOptions::PrintConfig(std::ostream &os) { - os << '\n' << "[[ Configuration of UI-Registered options ]]" << '\n'; - std::string key; - DocMapType::iterator it; - for (it = doc_map_.begin(); it != doc_map_.end(); ++it) { - key = it->first; - os << it->second.name_ << " = "; - if (bool_map_.end() != bool_map_.find(key)) { - os << (*bool_map_[key] ? 
"true" : "false"); - } else if (int_map_.end() != int_map_.find(key)) { - os << (*int_map_[key]); - } else if (uint_map_.end() != uint_map_.find(key)) { - os << (*uint_map_[key]); - } else if (float_map_.end() != float_map_.find(key)) { - os << (*float_map_[key]); - } else if (double_map_.end() != double_map_.find(key)) { - os << (*double_map_[key]); - } else if (string_map_.end() != string_map_.find(key)) { - os << "'" << *string_map_[key] << "'"; - } else { - KALDI_ERR << "PrintConfig: unrecognized option " << key << "[code error]"; - } - os << '\n'; - } - os << '\n'; -} - -void ParseOptions::ReadConfigFile(const std::string &filename) { - std::ifstream is(filename.c_str(), std::ifstream::in); - if (!is.good()) { - KALDI_ERR << "Cannot open config file: " << filename; - } - - std::string line, key, value; - int32 line_number = 0; - while (std::getline(is, line)) { - line_number++; - // trim out the comments - size_t pos; - if ((pos = line.find_first_of('#')) != std::string::npos) { - line.erase(pos); - } - // skip empty lines - Trim(&line); - if (line.length() == 0) continue; - - if (line.substr(0, 2) != "--") { - KALDI_ERR << "Reading config file " << filename << ": line " - << line_number << " does not look like a line " - << "from a Kaldi command-line program's config file: should " - << "be of the form --x=y. Note: config files intended to " - << "be sourced by shell scripts lack the '--'."; - } - - // parse option - bool has_equal_sign; - SplitLongArg(line, &key, &value, &has_equal_sign); - NormalizeArgName(&key); - Trim(&value); - if (!SetOption(key, value, has_equal_sign)) { - PrintUsage(true); - KALDI_ERR << "Invalid option " << line << " in config file " << filename; - } - } -} - -void ParseOptions::SplitLongArg(const std::string &in, std::string *key, - std::string *value, bool *has_equal_sign) { - KALDI_ASSERT(in.substr(0, 2) == "--"); // precondition. - size_t pos = in.find_first_of('=', 0); - if (pos == std::string::npos) { // we allow --option for bools - // defaults to empty. We handle this differently in different cases. - *key = in.substr(2, in.size() - 2); // 2 because starts with --. - *value = ""; - *has_equal_sign = false; - } else if (pos == 2) { // we also don't allow empty keys: --=value - PrintUsage(true); - KALDI_ERR << "Invalid option (no key): " << in; - } else { // normal case: --option=value - *key = in.substr(2, pos - 2); // 2 because starts with --. 
- *value = in.substr(pos + 1); - *has_equal_sign = true; - } -} - -void ParseOptions::NormalizeArgName(std::string *str) { - std::string out; - std::string::iterator it; - - for (it = str->begin(); it != str->end(); ++it) { - if (*it == '_') - out += '-'; // convert _ to - - else - out += std::tolower(*it); - } - *str = out; - - KALDI_ASSERT(str->length() > 0); -} - -bool ParseOptions::SetOption(const std::string &key, const std::string &value, - bool has_equal_sign) { - if (bool_map_.end() != bool_map_.find(key)) { - if (has_equal_sign && value == "") - KALDI_ERR << "Invalid option --" << key << "="; - *(bool_map_[key]) = ToBool(value); - } else if (int_map_.end() != int_map_.find(key)) { - *(int_map_[key]) = ToInt(value); - } else if (uint_map_.end() != uint_map_.find(key)) { - *(uint_map_[key]) = ToUint(value); - } else if (float_map_.end() != float_map_.find(key)) { - *(float_map_[key]) = ToFloat(value); - } else if (double_map_.end() != double_map_.find(key)) { - *(double_map_[key]) = ToDouble(value); - } else if (string_map_.end() != string_map_.find(key)) { - if (!has_equal_sign) - KALDI_ERR << "Invalid option --" << key << " (option format is --x=y)."; - *(string_map_[key]) = value; - } else { - return false; - } - return true; -} - -bool ParseOptions::ToBool(std::string str) { - std::transform(str.begin(), str.end(), str.begin(), ::tolower); - - // allow "" as a valid option for "true", so that --x is the same as --x=true - if ((str.compare("true") == 0) || (str.compare("t") == 0) || - (str.compare("1") == 0) || (str.compare("") == 0)) { - return true; - } - if ((str.compare("false") == 0) || (str.compare("f") == 0) || - (str.compare("0") == 0)) { - return false; - } - // if it is neither true nor false: - PrintUsage(true); - KALDI_ERR << "Invalid format for boolean argument [expected true or false]: " - << str; - return false; // never reached -} - -int32 ParseOptions::ToInt(const std::string &str) { - int32 ret; - if (!ConvertStringToInteger(str, &ret)) - KALDI_ERR << "Invalid integer option \"" << str << "\""; - return ret; -} - -uint32 ParseOptions::ToUint(const std::string &str) { - uint32 ret; - if (!ConvertStringToInteger(str, &ret)) - KALDI_ERR << "Invalid integer option \"" << str << "\""; - return ret; -} - -float ParseOptions::ToFloat(const std::string &str) { - float ret; - if (!ConvertStringToReal(str, &ret)) - KALDI_ERR << "Invalid floating-point option \"" << str << "\""; - return ret; -} - -double ParseOptions::ToDouble(const std::string &str) { - double ret; - if (!ConvertStringToReal(str, &ret)) - KALDI_ERR << "Invalid floating-point option \"" << str << "\""; - return ret; -} - -// instantiate templates -template void ParseOptions::RegisterTmpl(const std::string &name, bool *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, int32 *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, uint32 *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, float *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, double *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, - std::string *ptr, - const std::string &doc); - -template void ParseOptions::RegisterStandard(const std::string &name, bool *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - int32 *ptr, - const std::string &doc); 
-template void ParseOptions::RegisterStandard(const std::string &name, - uint32 *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - float *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - double *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - std::string *ptr, - const std::string &doc); - -template void ParseOptions::RegisterCommon(const std::string &name, bool *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, int32 *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, uint32 *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, float *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, double *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, - std::string *ptr, - const std::string &doc, - bool is_standard); - -} // namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/parse-options.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/parse-options.h deleted file mode 100644 index 93a060f4a411dfd63298a91bb313e0b66d337a75..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/parse-options.h +++ /dev/null @@ -1,265 +0,0 @@ -// util/parse-options.h - -// Copyright 2009-2011 Karel Vesely; Microsoft Corporation; -// Saarland University (Author: Arnab Ghoshal); -// Copyright 2012-2013 Frantisek Skala; Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_PARSE_OPTIONS_H_ -#define KALDI_UTIL_PARSE_OPTIONS_H_ - -#include -#include -#include - -#include "base/kaldi-common.h" -#include "itf/options-itf.h" - -namespace kaldi { - -/// The class ParseOptions is for parsing command-line options; see -/// \ref parse_options for more documentation. -class ParseOptions : public OptionsItf { - public: - explicit ParseOptions(const char *usage) - : print_args_(true), - help_(false), - usage_(usage), - argc_(0), - argv_(NULL), - prefix_(""), - other_parser_(NULL) { -#if !defined(_MSC_VER) && \ - !defined(__CYGWIN__) // This is just a convenient place to set the stderr - // to line - setlinebuf(stderr); // buffering mode, since it's called at program start. -#endif // This helps ensure different programs' output is not mixed up. 
- RegisterStandard("config", &config_, - "Configuration file to read (this " - "option may be repeated)"); - RegisterStandard("print-args", &print_args_, - "Print the command line arguments (to stderr)"); - RegisterStandard("help", &help_, "Print out usage message"); - RegisterStandard("verbose", &g_kaldi_verbose_level, - "Verbose level (higher->more logging)"); - } - - /** - This is a constructor for the special case where some options are - registered with a prefix to avoid conflicts. The object thus created will - only be used temporarily to register an options class with the original - options parser (which is passed as the *other pointer) using the given - prefix. It should not be used for any other purpose, and the prefix must - not be the empty string. It seems to be the least bad way of implementing - options with prefixes at this point. - Example of usage is: - ParseOptions po; // original ParseOptions object - ParseOptions po_mfcc("mfcc", &po); // object with prefix. - MfccOptions mfcc_opts; - mfcc_opts.Register(&po_mfcc); - The options will now get registered as, e.g., --mfcc.frame-shift=10.0 - instead of just --frame-shift=10.0 - */ - ParseOptions(const std::string &prefix, OptionsItf *other); - - ~ParseOptions() {} - - // Methods from the interface - void Register(const std::string &name, bool *ptr, const std::string &doc); - void Register(const std::string &name, int32 *ptr, const std::string &doc); - void Register(const std::string &name, uint32 *ptr, const std::string &doc); - void Register(const std::string &name, float *ptr, const std::string &doc); - void Register(const std::string &name, double *ptr, const std::string &doc); - void Register(const std::string &name, std::string *ptr, - const std::string &doc); - - /// If called after registering an option and before calling - /// Read(), disables that option from being used. Will crash - /// at runtime if that option had not been registered. - void DisableOption(const std::string &name); - - /// This one is used for registering standard parameters of all the programs - template - void RegisterStandard(const std::string &name, T *ptr, - const std::string &doc); - - /** - Parses the command line options and fills the ParseOptions-registered - variables. This must be called after all the variables were registered!!! - - Initially the variables have implicit values, - then the config file values are set-up, - finally the command line values given. - Returns the first position in argv that was not used. - [typically not useful: use NumParams() and GetParam(). ] - */ - int Read(int argc, const char *const *argv); - - /// Prints the usage documentation [provided in the constructor]. - void PrintUsage(bool print_command_line = false); - /// Prints the actual configuration of all the registered variables - void PrintConfig(std::ostream &os); - - /// Reads the options values from a config file. Must be called after - /// registering all options. This is usually used internally after the - /// standard --config option is used, but it may also be called from a - /// program. - void ReadConfigFile(const std::string &filename); - - /// Number of positional parameters (c.f. argc-1). - int NumArgs() const; - - /// Returns one of the positional parameters; 1-based indexing for argc/argv - /// compatibility. Will crash if param is not >=1 and <=NumArgs(). - std::string GetArg(int param) const; - - std::string GetOptArg(int param) const { - return (param <= NumArgs() ? 
GetArg(param) : ""); - } - - /// The following function will return a possibly quoted and escaped - /// version of "str", according to the current shell. Currently - /// this is just hardwired to bash. It's useful for debug output. - static std::string Escape(const std::string &str); - - private: - /// Template to register various variable types, - /// used for program-specific parameters - template - void RegisterTmpl(const std::string &name, T *ptr, const std::string &doc); - - // Following functions do just the datatype-specific part of the job - /// Register boolean variable - void RegisterSpecific(const std::string &name, const std::string &idx, - bool *b, const std::string &doc, bool is_standard); - /// Register int32 variable - void RegisterSpecific(const std::string &name, const std::string &idx, - int32 *i, const std::string &doc, bool is_standard); - /// Register unsinged int32 variable - void RegisterSpecific(const std::string &name, const std::string &idx, - uint32 *u, const std::string &doc, bool is_standard); - /// Register float variable - void RegisterSpecific(const std::string &name, const std::string &idx, - float *f, const std::string &doc, bool is_standard); - /// Register double variable [useful as we change BaseFloat type]. - void RegisterSpecific(const std::string &name, const std::string &idx, - double *f, const std::string &doc, bool is_standard); - /// Register string variable - void RegisterSpecific(const std::string &name, const std::string &idx, - std::string *s, const std::string &doc, - bool is_standard); - - /// Does the actual job for both kinds of parameters - /// Does the common part of the job for all datatypes, - /// then calls RegisterSpecific - template - void RegisterCommon(const std::string &name, T *ptr, const std::string &doc, - bool is_standard); - - /// Set option with name "key" to "value"; will crash if can't do it. - /// "has_equal_sign" is used to allow --x for a boolean option x, - /// and --y=, for a string option y. - bool SetOption(const std::string &key, const std::string &value, - bool has_equal_sign); - - bool ToBool(std::string str); - int32 ToInt(const std::string &str); - uint32 ToUint(const std::string &str); - float ToFloat(const std::string &str); - double ToDouble(const std::string &str); - - // maps for option variables - std::map bool_map_; - std::map int_map_; - std::map uint_map_; - std::map float_map_; - std::map double_map_; - std::map string_map_; - - /** - Structure for options' documentation - */ - struct DocInfo { - DocInfo() {} - DocInfo(const std::string &name, const std::string &usemsg) - : name_(name), use_msg_(usemsg), is_standard_(false) {} - DocInfo(const std::string &name, const std::string &usemsg, - bool is_standard) - : name_(name), use_msg_(usemsg), is_standard_(is_standard) {} - - std::string name_; - std::string use_msg_; - bool is_standard_; - }; - typedef std::map DocMapType; - DocMapType doc_map_; ///< map for the documentation - - bool print_args_; ///< variable for the implicit --print-args parameter - bool help_; ///< variable for the implicit --help parameter - std::string config_; ///< variable for the implicit --config parameter - std::vector positional_args_; - const char *usage_; - int argc_; - const char *const *argv_; - - /// These members are not normally used. 
They are only used when the object - /// is constructed with a prefix - std::string prefix_; - OptionsItf *other_parser_; - - protected: - /// SplitLongArg parses an argument of the form --a=b, --a=, or --a, - /// and sets "has_equal_sign" to true if an equals-sign was parsed.. - /// this is needed in order to correctly allow --x for a boolean option - /// x, and --y= for a string option y, and to disallow --x= and --y. - void SplitLongArg(const std::string &in, std::string *key, std::string *value, - bool *has_equal_sign); - - void NormalizeArgName(std::string *str); -}; - -/// This template is provided for convenience in reading config classes from -/// files; this is not the standard way to read configuration options, but may -/// occasionally be needed. This function assumes the config has a function -/// "void Register(OptionsItf *opts)" which it can call to register the -/// ParseOptions object. -template -void ReadConfigFromFile(const std::string &config_filename, C *c) { - std::ostringstream usage_str; - usage_str << "Parsing config from " - << "from '" << config_filename << "'"; - ParseOptions po(usage_str.str().c_str()); - c->Register(&po); - po.ReadConfigFile(config_filename); -} - -/// This variant of the template ReadConfigFromFile is for if you need to read -/// two config classes from the same file. -template -void ReadConfigsFromFile(const std::string &conf, C1 *c1, C2 *c2) { - std::ostringstream usage_str; - usage_str << "Parsing config from " - << "from '" << conf << "'"; - ParseOptions po(usage_str.str().c_str()); - c1->Register(&po); - c2->Register(&po); - po.ReadConfigFile(conf); -} - -} // namespace kaldi - -#endif // KALDI_UTIL_PARSE_OPTIONS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/simple-io-funcs.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/simple-io-funcs.cc deleted file mode 100644 index 5ace601b6a2bb186dec78b0b25cb5a3227c48bc9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/simple-io-funcs.cc +++ /dev/null @@ -1,80 +0,0 @@ -// util/simple-io-funcs.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. -#include "util/simple-io-funcs.h" -#include "util/text-utils.h" - -namespace kaldi { - -bool WriteIntegerVectorSimple(const std::string &wxfilename, - const std::vector &list) { - kaldi::Output ko; - // false, false is: text-mode, no Kaldi header. 
- if (!ko.Open(wxfilename, false, false)) return false; - for (size_t i = 0; i < list.size(); i++) ko.Stream() << list[i] << '\n'; - return ko.Close(); -} - -bool ReadIntegerVectorSimple(const std::string &rxfilename, - std::vector *list) { - kaldi::Input ki; - if (!ki.OpenTextMode(rxfilename)) return false; - std::istream &is = ki.Stream(); - int32 i; - list->clear(); - while (!(is >> i).fail()) list->push_back(i); - is >> std::ws; - return is.eof(); // should be eof, or junk at end of file. -} - -bool WriteIntegerVectorVectorSimple( - const std::string &wxfilename, - const std::vector > &list) { - kaldi::Output ko; - // false, false is: text-mode, no Kaldi header. - if (!ko.Open(wxfilename, false, false)) return false; - std::ostream &os = ko.Stream(); - for (size_t i = 0; i < list.size(); i++) { - for (size_t j = 0; j < list[i].size(); j++) { - os << list[i][j]; - if (j + 1 < list[i].size()) os << ' '; - } - os << '\n'; - } - return ko.Close(); -} - -bool ReadIntegerVectorVectorSimple(const std::string &rxfilename, - std::vector > *list) { - kaldi::Input ki; - if (!ki.OpenTextMode(rxfilename)) return false; - std::istream &is = ki.Stream(); - list->clear(); - std::string line; - while (std::getline(is, line)) { - std::vector v; - if (!SplitStringToIntegers(line, " \t\r", true, &v)) { - list->clear(); - return false; - } - list->push_back(v); - } - return is.eof(); // if we're not at EOF, something weird happened. -} - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/simple-io-funcs.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/simple-io-funcs.h deleted file mode 100644 index 1ead12790ba9bd6a44ccdff855918270191b8ebd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/simple-io-funcs.h +++ /dev/null @@ -1,61 +0,0 @@ -// util/simple-io-funcs.h - -// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. -#ifndef KALDI_UTIL_SIMPLE_IO_FUNCS_H_ -#define KALDI_UTIL_SIMPLE_IO_FUNCS_H_ - -#include -#include -#include "util/kaldi-io.h" - -// This header contains some utilities for reading some common, simple text -// formats:integers in files, one per line, and integers in files, possibly -// multiple per line. these are not really fully native Kaldi formats; they are -// mostly for small files that might be generated by scripts, and can be read -// all at one time. for longer files of this type, we would probably use the -// Table code. - -namespace kaldi { - -/// WriteToList attempts to write this list of integers, one per line, -/// to the given file, in text format. -/// returns true if succeeded. 
-bool WriteIntegerVectorSimple(const std::string &wxfilename, - const std::vector &v); - -/// ReadFromList attempts to read this list of integers, one per line, -/// from the given file, in text format. -/// returns true if succeeded. -bool ReadIntegerVectorSimple(const std::string &rxfilename, - std::vector *v); - -// This is a file format like: -// 1 2 -// 3 -// -// 4 5 6 -// etc. -bool WriteIntegerVectorVectorSimple(const std::string &wxfilename, - const std::vector > &v); - -bool ReadIntegerVectorVectorSimple(const std::string &rxfilename, - std::vector > *v); - -} // end namespace kaldi. - -#endif // KALDI_UTIL_SIMPLE_IO_FUNCS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/stl-utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/stl-utils.h deleted file mode 100644 index 8a29cd582c77b3078277aa9713b8676032bbc5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/stl-utils.h +++ /dev/null @@ -1,310 +0,0 @@ -// util/stl-utils.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_STL_UTILS_H_ -#define KALDI_UTIL_STL_UTILS_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -using std::unordered_map; -using std::unordered_set; - -#include "base/kaldi-common.h" - -namespace kaldi { - -/// Sorts and uniq's (removes duplicates) from a vector. -template -inline void SortAndUniq(std::vector *vec) { - std::sort(vec->begin(), vec->end()); - vec->erase(std::unique(vec->begin(), vec->end()), vec->end()); -} - -/// Returns true if the vector is sorted. -template -inline bool IsSorted(const std::vector &vec) { - typename std::vector::const_iterator iter = vec.begin(), end = vec.end(); - if (iter == end) return true; - while (1) { - typename std::vector::const_iterator next_iter = iter; - ++next_iter; - if (next_iter == end) return true; // end of loop and nothing out of order - if (*next_iter < *iter) return false; - iter = next_iter; - } -} - -/// Returns true if the vector is sorted and contains each element -/// only once. -template -inline bool IsSortedAndUniq(const std::vector &vec) { - typename std::vector::const_iterator iter = vec.begin(), end = vec.end(); - if (iter == end) return true; - while (1) { - typename std::vector::const_iterator next_iter = iter; - ++next_iter; - if (next_iter == end) return true; // end of loop and nothing out of order - if (*next_iter <= *iter) return false; - iter = next_iter; - } -} - -/// Removes duplicate elements from a sorted list. -template -inline void Uniq(std::vector *vec) { // must be already sorted. 
- KALDI_PARANOID_ASSERT(IsSorted(*vec)); - KALDI_ASSERT(vec); - vec->erase(std::unique(vec->begin(), vec->end()), vec->end()); -} - -/// Copies the elements of a set to a vector. -template -void CopySetToVector(const std::set &s, std::vector *v) { - // copies members of s into v, in sorted order from lowest to highest - // (because the set was in sorted order). - KALDI_ASSERT(v != NULL); - v->resize(s.size()); - typename std::set::const_iterator siter = s.begin(), send = s.end(); - typename std::vector::iterator viter = v->begin(); - for (; siter != send; ++siter, ++viter) { - *viter = *siter; - } -} - -template -void CopySetToVector(const unordered_set &s, std::vector *v) { - KALDI_ASSERT(v != NULL); - v->resize(s.size()); - typename unordered_set::const_iterator siter = s.begin(), send = s.end(); - typename std::vector::iterator viter = v->begin(); - for (; siter != send; ++siter, ++viter) { - *viter = *siter; - } -} - -/// Copies the (key, value) pairs in a map to a vector of pairs. -template -void CopyMapToVector(const std::map &m, - std::vector > *v) { - KALDI_ASSERT(v != NULL); - v->resize(m.size()); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - typename std::vector >::iterator viter = v->begin(); - for (; miter != mend; ++miter, ++viter) { - *viter = std::make_pair(miter->first, miter->second); - // do it like this because of const casting. - } -} - -/// Copies the keys in a map to a vector. -template -void CopyMapKeysToVector(const std::map &m, std::vector *v) { - KALDI_ASSERT(v != NULL); - v->resize(m.size()); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - typename std::vector::iterator viter = v->begin(); - for (; miter != mend; ++miter, ++viter) { - *viter = miter->first; - } -} - -/// Copies the values in a map to a vector. -template -void CopyMapValuesToVector(const std::map &m, std::vector *v) { - KALDI_ASSERT(v != NULL); - v->resize(m.size()); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - typename std::vector::iterator viter = v->begin(); - for (; miter != mend; ++miter, ++viter) { - *viter = miter->second; - } -} - -/// Copies the keys in a map to a set. -template -void CopyMapKeysToSet(const std::map &m, std::set *s) { - KALDI_ASSERT(s != NULL); - s->clear(); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - for (; miter != mend; ++miter) { - s->insert(s->end(), miter->first); - } -} - -/// Copies the values in a map to a set. -template -void CopyMapValuesToSet(const std::map &m, std::set *s) { - KALDI_ASSERT(s != NULL); - s->clear(); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - for (; miter != mend; ++miter) s->insert(s->end(), miter->second); -} - -/// Copies the contents of a vector to a set. -template -void CopyVectorToSet(const std::vector &v, std::set *s) { - KALDI_ASSERT(s != NULL); - s->clear(); - typename std::vector::const_iterator iter = v.begin(), end = v.end(); - for (; iter != end; ++iter) s->insert(s->end(), *iter); - // s->end() is a hint in case v was sorted. will work regardless. -} - -/// Deletes any non-NULL pointers in the vector v, and sets -/// the corresponding entries of v to NULL -template -void DeletePointers(std::vector *v) { - KALDI_ASSERT(v != NULL); - typename std::vector::iterator iter = v->begin(), end = v->end(); - for (; iter != end; ++iter) { - if (*iter != NULL) { - delete *iter; - *iter = NULL; // set to NULL for extra safety. - } - } -} - -/// Returns true if the vector of pointers contains NULL pointers. 
-template -bool ContainsNullPointers(const std::vector &v) { - typename std::vector::const_iterator iter = v.begin(), end = v.end(); - for (; iter != end; ++iter) - if (*iter == static_cast(NULL)) return true; - return false; -} - -/// Copies the contents a vector of one type to a vector -/// of another type. -template -void CopyVectorToVector(const std::vector &vec_in, std::vector *vec_out) { - KALDI_ASSERT(vec_out != NULL); - vec_out->resize(vec_in.size()); - for (size_t i = 0; i < vec_in.size(); i++) - (*vec_out)[i] = static_cast(vec_in[i]); -} - -/// A hashing function-object for vectors. -template -struct VectorHasher { // hashing function for vector. - size_t operator()(const std::vector &x) const noexcept { - size_t ans = 0; - typename std::vector::const_iterator iter = x.begin(), end = x.end(); - for (; iter != end; ++iter) { - ans *= kPrime; - ans += *iter; - } - return ans; - } - VectorHasher() { // Check we're instantiated with an integer type. - KALDI_ASSERT_IS_INTEGER_TYPE(Int); - } - - private: - static const int kPrime = 7853; -}; - -/// A hashing function-object for pairs of ints -template -struct PairHasher { // hashing function for pair - size_t operator()(const std::pair &x) const noexcept { - // 7853 was chosen at random from a list of primes. - return x.first + x.second * 7853; - } - PairHasher() { // Check we're instantiated with an integer type. - KALDI_ASSERT_IS_INTEGER_TYPE(Int1); - KALDI_ASSERT_IS_INTEGER_TYPE(Int2); - } -}; - -/// A hashing function object for strings. -struct StringHasher { // hashing function for std::string - size_t operator()(const std::string &str) const noexcept { - size_t ans = 0, len = str.length(); - const char *c = str.c_str(), *end = c + len; - for (; c != end; c++) { - ans *= kPrime; - ans += *c; - } - return ans; - } - - private: - static const int kPrime = 7853; -}; - -/// Reverses the contents of a vector. -template -inline void ReverseVector(std::vector *vec) { - KALDI_ASSERT(vec != NULL); - size_t sz = vec->size(); - for (size_t i = 0; i < sz / 2; i++) std::swap((*vec)[i], (*vec)[sz - 1 - i]); -} - -/// Comparator object for pairs that compares only the first pair. -template -struct CompareFirstMemberOfPair { - inline bool operator()(const std::pair &p1, const std::pair &p2) { - return p1.first < p2.first; - } -}; - -/// For a vector of pair where I is an integer and F a floating-point or -/// integer type, this function sorts a vector of type vector > on -/// the I value and then merges elements with equal I values, summing these over -/// the F component and then removing any F component with zero value. This -/// is for where the vector of pairs represents a map from the integer to float -/// component, with an "adding" type of semantics for combining the elements. -template -inline void MergePairVectorSumming(std::vector > *vec) { - KALDI_ASSERT_IS_INTEGER_TYPE(I); - CompareFirstMemberOfPair c; - std::sort(vec->begin(), vec->end(), c); // sort on 1st element. - typename std::vector >::iterator out = vec->begin(), - in = vec->begin(), - end = vec->end(); - // special case: while there is nothing to be changed, skip over - // initial input (avoids unnecessary copying). - while (in + 1 < end && in[0].first != in[1].first && in[0].second != 0.0) { - in++; - out++; - } - while (in < end) { - // We reach this point only at the first element of - // each stretch of identical .first elements. - *out = *in; - ++in; - while (in < end && in->first == out->first) { - out->second += in->second; // this is the merge operation. 
- ++in; - } - if (out->second != static_cast(0)) // Don't keep zero elements. - out++; - } - vec->erase(out, end); -} - -} // namespace kaldi - -#endif // KALDI_UTIL_STL_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/text-utils.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/text-utils.cc deleted file mode 100644 index fd70889644f6b4e14793ddd4f5b0d71a66768699..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/text-utils.cc +++ /dev/null @@ -1,580 +0,0 @@ -// util/text-utils.cc - -// Copyright 2009-2011 Saarland University; Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "util/text-utils.h" - -#include -#include -#include -#include - -#include "base/kaldi-common.h" - -namespace kaldi { - -template -bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, // typically false - std::vector *out) { - KALDI_ASSERT(out != NULL); - if (*(full.c_str()) == '\0') { - out->clear(); - return true; - } - std::vector split; - SplitStringToVector(full, delim, omit_empty_strings, &split); - out->resize(split.size()); - for (size_t i = 0; i < split.size(); i++) { - F f = 0; - if (!ConvertStringToReal(split[i], &f)) return false; - (*out)[i] = f; - } - return true; -} - -// Instantiate the template above for float and double. 
-template bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out); -template bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out); - -void SplitStringToVector(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out) { - size_t start = 0, found = 0, end = full.size(); - out->clear(); - while (found != std::string::npos) { - found = full.find_first_of(delim, start); - // start != end condition is for when the delimiter is at the end - if (!omit_empty_strings || (found != start && start != end)) - out->push_back(full.substr(start, found - start)); - start = found + 1; - } -} - -void JoinVectorToString(const std::vector &vec_in, - const char *delim, bool omit_empty_strings, - std::string *str_out) { - std::string tmp_str; - for (size_t i = 0; i < vec_in.size(); i++) { - if (!omit_empty_strings || !vec_in[i].empty()) { - tmp_str.append(vec_in[i]); - if (i < vec_in.size() - 1) - if (!omit_empty_strings || !vec_in[i + 1].empty()) - tmp_str.append(delim); - } - } - str_out->swap(tmp_str); -} - -void Trim(std::string *str) { - const char *white_chars = " \t\n\r\f\v"; - - std::string::size_type pos = str->find_last_not_of(white_chars); - if (pos != std::string::npos) { - str->erase(pos + 1); - pos = str->find_first_not_of(white_chars); - if (pos != std::string::npos) str->erase(0, pos); - } else { - str->erase(str->begin(), str->end()); - } -} - -bool IsToken(const std::string &token) { - size_t l = token.length(); - if (l == 0) return false; - for (size_t i = 0; i < l; i++) { - unsigned char c = token[i]; - if ((!isprint(c) || isspace(c)) && (isascii(c) || c == (unsigned char)255)) - return false; - // The "&& (isascii(c) || c == 255)" was added so that we won't reject - // non-ASCII characters such as French characters with accents [except for - // 255 which is "nbsp", a form of space]. - } - return true; -} - -void SplitStringOnFirstSpace(const std::string &str, std::string *first, - std::string *rest) { - const char *white_chars = " \t\n\r\f\v"; - typedef std::string::size_type I; - const I npos = std::string::npos; - I first_nonwhite = str.find_first_not_of(white_chars); - if (first_nonwhite == npos) { - first->clear(); - rest->clear(); - return; - } - // next_white is first whitespace after first nonwhitespace. - I next_white = str.find_first_of(white_chars, first_nonwhite); - - if (next_white == npos) { // no more whitespace... - *first = std::string(str, first_nonwhite); - rest->clear(); - return; - } - I next_nonwhite = str.find_first_not_of(white_chars, next_white); - if (next_nonwhite == npos) { - *first = std::string(str, first_nonwhite, next_white - first_nonwhite); - rest->clear(); - return; - } - - I last_nonwhite = str.find_last_not_of(white_chars); - KALDI_ASSERT(last_nonwhite != npos); // or coding error. 
- - *first = std::string(str, first_nonwhite, next_white - first_nonwhite); - *rest = std::string(str, next_nonwhite, last_nonwhite + 1 - next_nonwhite); -} - -bool IsLine(const std::string &line) { - if (line.find('\n') != std::string::npos) return false; - if (line.empty()) return true; - if (isspace(*(line.begin()))) return false; - if (isspace(*(line.rbegin()))) return false; - std::string::const_iterator iter = line.begin(), end = line.end(); - for (; iter != end; iter++) - if (!isprint(*iter)) return false; - return true; -} - -template -class NumberIstream { - public: - explicit NumberIstream(std::istream &i) : in_(i) {} - - NumberIstream &operator>>(T &x) { - if (!in_.good()) return *this; - in_ >> x; - if (!in_.fail() && RemainderIsOnlySpaces()) return *this; - return ParseOnFail(&x); - } - - private: - std::istream &in_; - - bool RemainderIsOnlySpaces() { - if (in_.tellg() != std::istream::pos_type(-1)) { - std::string rem; - in_ >> rem; - - if (rem.find_first_not_of(' ') != std::string::npos) { - // there is not only spaces - return false; - } - } - - in_.clear(); - return true; - } - - NumberIstream &ParseOnFail(T *x) { - std::string str; - in_.clear(); - in_.seekg(0); - // If the stream is broken even before trying - // to read from it or if there are many tokens, - // it's pointless to try. - if (!(in_ >> str) || !RemainderIsOnlySpaces()) { - in_.setstate(std::ios_base::failbit); - return *this; - } - - std::map inf_nan_map; - // we'll keep just uppercase values. - inf_nan_map["INF"] = std::numeric_limits::infinity(); - inf_nan_map["+INF"] = std::numeric_limits::infinity(); - inf_nan_map["-INF"] = -std::numeric_limits::infinity(); - inf_nan_map["INFINITY"] = std::numeric_limits::infinity(); - inf_nan_map["+INFINITY"] = std::numeric_limits::infinity(); - inf_nan_map["-INFINITY"] = -std::numeric_limits::infinity(); - inf_nan_map["NAN"] = std::numeric_limits::quiet_NaN(); - inf_nan_map["+NAN"] = std::numeric_limits::quiet_NaN(); - inf_nan_map["-NAN"] = -std::numeric_limits::quiet_NaN(); - // MSVC - inf_nan_map["1.#INF"] = std::numeric_limits::infinity(); - inf_nan_map["-1.#INF"] = -std::numeric_limits::infinity(); - inf_nan_map["1.#QNAN"] = std::numeric_limits::quiet_NaN(); - inf_nan_map["-1.#QNAN"] = -std::numeric_limits::quiet_NaN(); - - std::transform(str.begin(), str.end(), str.begin(), ::toupper); - - if (inf_nan_map.find(str) != inf_nan_map.end()) { - *x = inf_nan_map[str]; - } else { - in_.setstate(std::ios_base::failbit); - } - - return *this; - } -}; - -template -bool ConvertStringToReal(const std::string &str, T *out) { - std::istringstream iss(str); - - NumberIstream i(iss); - - i >> *out; - - if (iss.fail()) { - // Number conversion failed. - return false; - } - - return true; -} - -template bool ConvertStringToReal(const std::string &str, float *out); -template bool ConvertStringToReal(const std::string &str, double *out); - -/* - This function is a helper function of StringsApproxEqual. It should be - thought of as a recursive function-- it was designed that way-- but rather - than actually recursing (which would cause problems with stack overflow), we - just set the args and return to the start. - - The 'decimal_places_tolerance' argument is just passed in from outside, - see the documentation for StringsApproxEqual in text-utils.h to see an - explanation. The argument 'places_into_number' provides some information - about the strings 'a' and 'b' that precedes the current pointers. 
- For purposes of this comment, let's define the 'decimal' of a number - as the part that comes after the decimal point, e.g. in '99.123', - '123' would be the decimal. If 'places_into_number' is -1, it means - we're not currently inside some place like that (i.e. it's not the - case that we're pointing to the '1' or the '2' or the '3'). - If it's 0, then we'd be pointing to the first place after the decimal, - '1' in this case. Note if one of the numbers is shorter than the - other, like '99.123' versus '99.1234' and 'a' points to the first '3' - while 'b' points to the second '4', 'places_into_number' referes to the - shorter of the two, i.e. it would be 2 in this example. - - - */ -bool StringsApproxEqualInternal(const char *a, const char *b, - int32 decimal_places_tolerance, - int32 places_into_number) { -start: - char ca = *a, cb = *b; - if (ca == cb) { - if (ca == '\0') { - return true; - } else { - if (places_into_number >= 0) { - if (isdigit(ca)) { - places_into_number++; - } else { - places_into_number = -1; - } - } else { - if (ca == '.') { - places_into_number = 0; - } - } - a++; - b++; - goto start; - } - } else { - if (places_into_number >= decimal_places_tolerance && - (isdigit(ca) || isdigit(cb))) { - // we're potentially willing to accept this difference between the - // strings. - if (isdigit(ca)) a++; - if (isdigit(cb)) b++; - // we'll have advanced at least one of the two strings. - goto start; - } else if (places_into_number >= 0 && - ((ca == '0' && !isdigit(cb)) || (cb == '0' && !isdigit(ca)))) { - // this clause is designed to ensure that, for example, - // "0.1" would count the same as "0.100001". - if (ca == '0') - a++; - else - b++; - places_into_number++; - goto start; - } else { - return false; - } - } -} - -bool StringsApproxEqual(const std::string &a, const std::string &b, - int32 decimal_places_tolerance) { - return StringsApproxEqualInternal(a.c_str(), b.c_str(), - decimal_places_tolerance, -1); -} - -bool ConfigLine::ParseLine(const std::string &line) { - data_.clear(); - whole_line_ = line; - if (line.size() == 0) return false; // Empty line - size_t pos = 0, size = line.size(); - while (isspace(line[pos]) && pos < size) pos++; - if (pos == size) return false; // whitespace-only line - size_t first_token_start_pos = pos; - // first get first_token_. - while (!isspace(line[pos]) && pos < size) { - if (line[pos] == '=') { - // If the first block of non-whitespace looks like "foo-bar=...", - // then we ignore it: there is no initial token, and FirstToken() - // is empty. - pos = first_token_start_pos; - break; - } - pos++; - } - first_token_ = - std::string(line, first_token_start_pos, pos - first_token_start_pos); - // first_token_ is expected to be either empty or something like - // "component-node", which actually is a slightly more restrictive set of - // strings than IsValidName() checks for this is a convenient way to check it. - if (!first_token_.empty() && !IsValidName(first_token_)) return false; - - while (pos < size) { - if (isspace(line[pos])) { - pos++; - continue; - } - - // OK, at this point we know that we are pointing at nonspace. - size_t next_equals_sign = line.find_first_of("=", pos); - if (next_equals_sign == pos || next_equals_sign == std::string::npos) { - // we're looking for something like 'key=value'. If there is no equals - // sign, or it's not preceded by something, it's a parsing failure. - return false; - } - std::string key(line, pos, next_equals_sign - pos); - if (!IsValidName(key)) return false; - - // handle any quotes. 
we support key='blah blah' or key="foo bar". - // no escaping is supported. - if (line[next_equals_sign + 1] == '\'' || - line[next_equals_sign + 1] == '"') { - char my_quote = line[next_equals_sign + 1]; - size_t next_quote = line.find_first_of(my_quote, next_equals_sign + 2); - if (next_quote == std::string::npos) { // no matching quote was found. - KALDI_WARN << "No matching quote for " << my_quote - << " in config line '" << line << "'"; - return false; - } else { - std::string value(line, next_equals_sign + 2, - next_quote - next_equals_sign - 2); - data_.insert(std::make_pair(key, std::make_pair(value, false))); - pos = next_quote + 1; - continue; - } - } else { - // we want to be able to parse something like "... input=Offset(a, -1) - // foo=bar": in general, config values with spaces in them, even without - // quoting. - - size_t next_next_equals_sign = - line.find_first_of("=", next_equals_sign + 1), - terminating_space = size; - - if (next_next_equals_sign != - std::string::npos) { // found a later equals sign. - size_t preceding_space = - line.find_last_of(" \t", next_next_equals_sign); - if (preceding_space != std::string::npos && - preceding_space > next_equals_sign) - terminating_space = preceding_space; - } - while (isspace(line[terminating_space - 1]) && terminating_space > 0) - terminating_space--; - - std::string value(line, next_equals_sign + 1, - terminating_space - (next_equals_sign + 1)); - data_.insert(std::make_pair(key, std::make_pair(value, false))); - pos = terminating_space; - } - } - return true; -} - -bool ConfigLine::GetValue(const std::string &key, std::string *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - *value = (it->second).first; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, BaseFloat *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!ConvertStringToReal((it->second).first, value)) return false; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, int32 *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!ConvertStringToInteger((it->second).first, value)) return false; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, std::vector *value) { - KALDI_ASSERT(value != NULL); - value->clear(); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!SplitStringToIntegers((it->second).first, ":,", true, value)) { - // KALDI_WARN << "Bad option " << (it->second).first; - return false; - } - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, bool *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if ((it->second).first.size() == 0) return false; - switch (((it->second).first)[0]) { - case 'F': - case 'f': - *value = false; - break; - case 'T': - case 't': - *value = true; - break; - default: - return false; - } - (it->second).second = true; - return true; - } - } - return false; -} - -bool 
ConfigLine::HasUnusedValues() const { - std::map >::const_iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (!(it->second).second) return true; - } - return false; -} - -std::string ConfigLine::UnusedValues() const { - std::string unused_str; - std::map >::const_iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (!(it->second).second) { - if (unused_str == "") - unused_str = it->first + "=" + (it->second).first; - else - unused_str += " " + it->first + "=" + (it->second).first; - } - } - return unused_str; -} - -// This is like ExpectToken but for two tokens, and it -// will either accept token1 and then token2, or just token2. -// This is useful in Read functions where the first token -// may already have been consumed. -// void ExpectOneOrTwoTokens(std::istream &is, bool binary, -// const std::string &token1, -// const std::string &token2) { -// KALDI_ASSERT(token1 != token2); -// std::string temp; -// ReadToken(is, binary, &temp); -// if (temp == token1) { -// ExpectToken(is, binary, token2); -// } else { -// if (temp != token2) { -// KALDI_ERR << "Expecting token " << token1 << " or " << token2 -// << " but got " << temp; -// } -// } -// } - -bool IsValidName(const std::string &name) { - if (name.size() == 0) return false; - for (size_t i = 0; i < name.size(); i++) { - if (i == 0 && !isalpha(name[i]) && name[i] != '_') return false; - if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-' && name[i] != '.') - return false; - } - return true; -} - -void ReadConfigLines(std::istream &is, std::vector *lines) { - KALDI_ASSERT(lines != NULL); - std::string line; - while (std::getline(is, line)) { - if (line.size() == 0) continue; - size_t start = line.find_first_not_of(" \t"); - size_t end = line.find_first_of('#'); - if (start == std::string::npos || start == end) continue; - end = line.find_last_not_of(" \t", end - 1); - KALDI_ASSERT(end >= start); - lines->push_back(line.substr(start, end - start + 1)); - } -} - -void ParseConfigLines(const std::vector &lines, - std::vector *config_lines) { - config_lines->resize(lines.size()); - for (size_t i = 0; i < lines.size(); i++) { - bool ret = (*config_lines)[i].ParseLine(lines[i]); - if (!ret) { - KALDI_ERR << "Error parsing config line: " << lines[i]; - } - } -} - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/text-utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/text-utils.h deleted file mode 100644 index bc7763c4aff38214d97cbeda3b29c8717dd65318..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/kaldi/util/text-utils.h +++ /dev/null @@ -1,264 +0,0 @@ -// util/text-utils.h - -// Copyright 2009-2011 Saarland University; Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. 
-// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_TEXT_UTILS_H_ -#define KALDI_UTIL_TEXT_UTILS_H_ - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "base/kaldi-common.h" - -namespace kaldi { - -/// Split a string using any of the single character delimiters. -/// If omit_empty_strings == true, the output will contain any -/// nonempty strings after splitting on any of the -/// characters in the delimiter. If omit_empty_strings == false, -/// the output will contain n+1 strings if there are n characters -/// in the set "delim" within the input string. In this case -/// the empty string is split to a single empty string. -void SplitStringToVector(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out); - -/// Joins the elements of a vector of strings into a single string using -/// "delim" as the delimiter. If omit_empty_strings == true, any empty strings -/// in the vector are skipped. A vector of empty strings results in an empty -/// string on the output. -void JoinVectorToString(const std::vector &vec_in, - const char *delim, bool omit_empty_strings, - std::string *str_out); - -/** - \brief Split a string (e.g. 1:2:3) into a vector of integers. - - \param [in] delim String containing a list of characters, any of which - is allowed as a delimiter. - \param [in] omit_empty_strings If true, empty strings between delimiters are - allowed and will not produce an output integer; if false, - instances of characters in 'delim' that are consecutive or - at the start or end of the string would be an error. - You'll normally want this to be true if 'delim' consists - of spaces, and false otherwise. - \param [out] out The output list of integers. -*/ -template -bool SplitStringToIntegers(const std::string &full, const char *delim, - bool omit_empty_strings, // typically false [but - // should probably be true - // if "delim" is spaces]. - std::vector *out) { - KALDI_ASSERT(out != NULL); - KALDI_ASSERT_IS_INTEGER_TYPE(I); - if (*(full.c_str()) == '\0') { - out->clear(); - return true; - } - std::vector split; - SplitStringToVector(full, delim, omit_empty_strings, &split); - out->resize(split.size()); - for (size_t i = 0; i < split.size(); i++) { - const char *this_str = split[i].c_str(); - char *end = NULL; - int64 j = 0; - j = KALDI_STRTOLL(this_str, &end); - if (end == this_str || *end != '\0') { - out->clear(); - return false; - } else { - I jI = static_cast(j); - if (static_cast(jI) != j) { - // output type cannot fit this integer. - out->clear(); - return false; - } - (*out)[i] = jI; - } - } - return true; -} - -// This is defined for F = float and double. -template -bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, // typically false - std::vector *out); - -/// Converts a string into an integer via strtoll and returns false if there was -/// any kind of problem (i.e. the string was not an integer or contained extra -/// non-whitespace junk, or the integer was too large to fit into the type it is -/// being converted into). Only sets *out if everything was OK and it returns -/// true. 
-template -bool ConvertStringToInteger(const std::string &str, Int *out) { - KALDI_ASSERT_IS_INTEGER_TYPE(Int); - const char *this_str = str.c_str(); - char *end = NULL; - errno = 0; - int64 i = KALDI_STRTOLL(this_str, &end); - if (end != this_str) - while (isspace(*end)) end++; - if (end == this_str || *end != '\0' || errno != 0) return false; - Int iInt = static_cast(i); - if (static_cast(iInt) != i || - (i < 0 && !std::numeric_limits::is_signed)) { - return false; - } - *out = iInt; - return true; -} - -/// ConvertStringToReal converts a string into either float or double -/// and returns false if there was any kind of problem (i.e. the string -/// was not a floating point number or contained extra non-whitespace junk). -/// Be careful- this function will successfully read inf's or nan's. -template -bool ConvertStringToReal(const std::string &str, T *out); - -/// Removes the beginning and trailing whitespaces from a string -void Trim(std::string *str); - -/// Removes leading and trailing white space from the string, then splits on the -/// first section of whitespace found (if present), putting the part before the -/// whitespace in "first" and the rest in "rest". If there is no such space, -/// everything that remains after removing leading and trailing whitespace goes -/// in "first". -void SplitStringOnFirstSpace(const std::string &line, std::string *first, - std::string *rest); - -/// Returns true if "token" is nonempty, and all characters are -/// printable and whitespace-free. -bool IsToken(const std::string &token); - -/// Returns true if "line" is free of \n characters and unprintable -/// characters, and does not contain leading or trailing whitespace. -bool IsLine(const std::string &line); - -/** - This function returns true when two text strings are approximately equal, and - false when they are not. The definition of 'equal' is normal string - equality, except that two substrings like "0.31134" and "0.311341" would be - considered equal. 'decimal_places_tolerance' controls how many digits after - the '.' have to match up. - E.g. StringsApproxEqual("hello 0.23 there", "hello 0.24 there", 2) would - return false because there is a difference in the 2nd decimal, but with - an argument of 1 it would return true. - */ -bool StringsApproxEqual(const std::string &a, const std::string &b, - int32 decimal_places_check = 2); - -/** - This class is responsible for parsing input like - hi-there xx=yyy a=b c empty= f-oo=Append(bar, sss) ba_z=123 bing='a b c' - baz="a b c d='a b' e" and giving you access to the fields, in this case - - FirstToken() == "hi-there", and key->value pairs: - - xx->yyy, a->"b c", empty->"", f-oo->"Append(bar, sss)", ba_z->"123", - bing->"a b c", baz->"a b c d='a b' e" - - The first token is optional, if the line started with a key-value pair then - FirstValue() will be empty. - - Note: it can parse value fields with space inside them only if they are free - of the '=' character. If values are going to contain the '=' character, you - need to quote them with either single or double quotes. - - Key values may contain -_a-zA-Z0-9, but must begin with a-zA-Z_. - */ -class ConfigLine { - public: - // Tries to parse the line as a config-file line. Returns false - // if it could not for some reason, e.g. parsing failure. In most cases - // prints no warnings; the user should do this. Does not expect comments. - bool ParseLine(const std::string &line); - - // the GetValue functions are overloaded for various types. 
They return true - // if the key exists with value that can be converted to that type, and false - // otherwise. They also mark the key-value pair as having been read. It is - // not an error to read values twice. - bool GetValue(const std::string &key, std::string *value); - bool GetValue(const std::string &key, BaseFloat *value); - bool GetValue(const std::string &key, int32 *value); - // Values may be separated by ":" or by ",". - bool GetValue(const std::string &key, std::vector *value); - bool GetValue(const std::string &key, bool *value); - - bool HasUnusedValues() const; - /// returns e.g. foo=bar xxx=yyy if foo and xxx were not consumed by one - /// of the GetValue() functions. - std::string UnusedValues() const; - - const std::string &FirstToken() const { return first_token_; } - - const std::string WholeLine() { return whole_line_; } - // use default assignment operator and copy constructor. - private: - std::string whole_line_; - // the first token of the line, e.g. if line is - // foo-bar baz=bing - // then first_token_ would be "foo-bar". - std::string first_token_; - - // data_ maps from key to (value, is-this-value-consumed?). - std::map > data_; -}; - -/// This function is like ExpectToken but for two tokens, and it will either -/// accept token1 and then token2, or just token2. This is useful in Read -/// functions where the first token may already have been consumed. -void ExpectOneOrTwoTokens(std::istream &is, bool binary, - const std::string &token1, const std::string &token2); - -/** - This function reads in a config file and *appends* its contents to a vector - of lines; it is responsible for removing comments (anything after '#') and - stripping out any lines that contain only whitespace after comment removal. - */ -void ReadConfigLines(std::istream &is, std::vector *lines); - -/** - This function converts config-lines from a simple sequence of strings - as output by ReadConfigLines(), into a sequence of first-tokens and - name-value pairs. The general format is: - "command-type bar=baz xx=yyy" - etc., although there are subtleties as to what exactly is allowed, see - documentation for class ConfigLine for details. - This function will die if there was a parsing failure. - */ -void ParseConfigLines(const std::vector &lines, - std::vector *config_lines); - -/// Returns true if 'name' would be a valid name for a component or node in a -/// nnet3Nnet. This is a nonempty string beginning with A-Za-z_, and containing -/// only -/// '-', '_', '.', A-Z, a-z, or 0-9. 
-bool IsValidName(const std::string &name); - -} // namespace kaldi - -#endif // KALDI_UTIL_TEXT_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/patch/CPPLINT.cfg b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/patch/CPPLINT.cfg deleted file mode 100644 index 51ff339c18435a6c3a3be03131080d7b8ab8de86..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/patch/CPPLINT.cfg +++ /dev/null @@ -1 +0,0 @@ -exclude_files=.* diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/patch/openfst/src/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/patch/openfst/src/CMakeLists.txt deleted file mode 100644 index 04051ef5ae46c04a40c1ffccc98c37fa594ad13e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/patch/openfst/src/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ - -#-DHAVE_CONFIG_H -I./../include -fno-exceptions -funsigned-char -std=c++11 -MT symbol-table.lo -MD -MP -MF .deps/symbol-table.Tpo -c symbol-table.cc -fno-common -DPIC -o .libs/symbol-table.o - -include_directories(./include/) -install(DIRECTORY include/ DESTINATION include/ - FILES_MATCHING PATTERN "*.h") - -add_subdirectory(lib) - -if(HAVE_SCRIPT) - add_subdirectory(script) -endif(HAVE_SCRIPT) - -if(HAVE_BIN) - add_subdirectory(bin) -endif(HAVE_BIN) - -add_subdirectory(extensions) - -if(BUILD_TESTING) - enable_testing() - add_subdirectory(test) -endif(BUILD_TESTING) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/patch/openfst/src/extensions/special/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/patch/openfst/src/extensions/special/CMakeLists.txt deleted file mode 100644 index 9c71b750a72ffe3c2dafde657273361c3dbae409..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/patch/openfst/src/extensions/special/CMakeLists.txt +++ /dev/null @@ -1,67 +0,0 @@ -file(GLOB HEADER_FILES ../../include/fst/extensions/special/*.h) -message(STATUS "${HEADER_FILES}") - -if(HAVE_BIN) - add_executable(fstspecial-bin - ../../bin/fstconvert.cc - ../../bin/fstconvert-main.cc - phi-fst.cc - rho-fst.cc - sigma-fst.cc - ) - - set_target_properties(fstspecial-bin PROPERTIES - FOLDER special/bin - OUTPUT_NAME fstspecial - ) - - target_link_libraries(fstspecial-bin - fstscript - fst - ${CMAKE_DL_LIBS} - ) -endif(HAVE_BIN) - - -add_library(fstspecial - phi-fst.cc - rho-fst.cc - sigma-fst.cc - ${HEADER_FILES} -) - -set_target_properties(fstspecial PROPERTIES - SOVERSION "${SOVERSION}" - FOLDER special -) -target_link_libraries(fstspecial - fst -) - -set(FST_SPECIAL_INSTALL_TARGETS fstspecial) -if(HAVE_BIN) - list(APPEND FST_SPECIAL_INSTALL_TARGETS fstspecial-bin) -endif() - -install(TARGETS ${FST_SPECIAL_INSTALL_TARGETS} - LIBRARY DESTINATION lib - RUNTIME DESTINATION bin - ARCHIVE DESTINATION lib -) - -function (add_module _name) - add_library(${ARGV}) - if (TARGET ${_name}) - target_link_libraries(${_name} fst) - set_target_properties(${_name} - PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true - FOLDER special/modules - ) - endif() - - install(TARGETS ${_name} LIBRARY DESTINATION lib/fst) -endfunction() - -add_module(phi-fst MODULE phi-fst.cc) -add_module(rho-fst MODULE rho-fst.cc) -add_module(sigma-fst MODULE sigma-fst.cc) diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/patch/openfst/src/include/fst/flags.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/patch/openfst/src/include/fst/flags.h deleted file mode 100644 index b5ec8ff7416774a0612ae0fe7e008a630b289dd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/patch/openfst/src/include/fst/flags.h +++ /dev/null @@ -1,228 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// See www.openfst.org for extensive documentation on this weighted -// finite-state transducer library. -// -// Google-style flag handling declarations and inline definitions. - -#ifndef FST_LIB_FLAGS_H_ -#define FST_LIB_FLAGS_H_ - -#include - -#include -#include -#include -#include -#include - -#include -#include - -#include "gflags/gflags.h" -#include "glog/logging.h" - -using std::string; - -// FLAGS USAGE: -// -// Definition example: -// -// DEFINE_int32(length, 0, "length"); -// -// This defines variable FLAGS_length, initialized to 0. -// -// Declaration example: -// -// DECLARE_int32(length); -// -// SET_FLAGS() can be used to set flags from the command line -// using, for example, '--length=2'. -// -// ShowUsage() can be used to print out command and flag usage. - -// #define DECLARE_bool(name) extern bool FLAGS_ ## name -// #define DECLARE_string(name) extern string FLAGS_ ## name -// #define DECLARE_int32(name) extern int32 FLAGS_ ## name -// #define DECLARE_int64(name) extern int64 FLAGS_ ## name -// #define DECLARE_double(name) extern double FLAGS_ ## name - -template -struct FlagDescription { - FlagDescription(T *addr, const char *doc, const char *type, - const char *file, const T val) - : address(addr), - doc_string(doc), - type_name(type), - file_name(file), - default_value(val) {} - - T *address; - const char *doc_string; - const char *type_name; - const char *file_name; - const T default_value; -}; - -template -class FlagRegister { - public: - static FlagRegister *GetRegister() { - static auto reg = new FlagRegister; - return reg; - } - - const FlagDescription &GetFlagDescription(const string &name) const { - fst::MutexLock l(&flag_lock_); - auto it = flag_table_.find(name); - return it != flag_table_.end() ? 
it->second : 0; - } - - void SetDescription(const string &name, - const FlagDescription &desc) { - fst::MutexLock l(&flag_lock_); - flag_table_.insert(make_pair(name, desc)); - } - - bool SetFlag(const string &val, bool *address) const { - if (val == "true" || val == "1" || val.empty()) { - *address = true; - return true; - } else if (val == "false" || val == "0") { - *address = false; - return true; - } - else { - return false; - } - } - - bool SetFlag(const string &val, string *address) const { - *address = val; - return true; - } - - bool SetFlag(const string &val, int32 *address) const { - char *p = 0; - *address = strtol(val.c_str(), &p, 0); - return !val.empty() && *p == '\0'; - } - - bool SetFlag(const string &val, int64 *address) const { - char *p = 0; - *address = strtoll(val.c_str(), &p, 0); - return !val.empty() && *p == '\0'; - } - - bool SetFlag(const string &val, double *address) const { - char *p = 0; - *address = strtod(val.c_str(), &p); - return !val.empty() && *p == '\0'; - } - - bool SetFlag(const string &arg, const string &val) const { - for (typename std::map< string, FlagDescription >::const_iterator it = - flag_table_.begin(); - it != flag_table_.end(); - ++it) { - const string &name = it->first; - const FlagDescription &desc = it->second; - if (arg == name) - return SetFlag(val, desc.address); - } - return false; - } - - void GetUsage(std::set> *usage_set) const { - for (auto it = flag_table_.begin(); it != flag_table_.end(); ++it) { - const string &name = it->first; - const FlagDescription &desc = it->second; - string usage = " --" + name; - usage += ": type = "; - usage += desc.type_name; - usage += ", default = "; - usage += GetDefault(desc.default_value) + "\n "; - usage += desc.doc_string; - usage_set->insert(make_pair(desc.file_name, usage)); - } - } - - private: - string GetDefault(bool default_value) const { - return default_value ? "true" : "false"; - } - - string GetDefault(const string &default_value) const { - return "\"" + default_value + "\""; - } - - template - string GetDefault(const V &default_value) const { - std::ostringstream strm; - strm << default_value; - return strm.str(); - } - - mutable fst::Mutex flag_lock_; // Multithreading lock. - std::map> flag_table_; -}; - -template -class FlagRegisterer { - public: - FlagRegisterer(const string &name, const FlagDescription &desc) { - auto registr = FlagRegister::GetRegister(); - registr->SetDescription(name, desc); - } - - private: - FlagRegisterer(const FlagRegisterer &) = delete; - FlagRegisterer &operator=(const FlagRegisterer &) = delete; -}; - - -#define DEFINE_VAR(type, name, value, doc) \ - type FLAGS_ ## name = value; \ - static FlagRegisterer \ - name ## _flags_registerer(#name, FlagDescription(&FLAGS_ ## name, \ - doc, \ - #type, \ - __FILE__, \ - value)) - -// #define DEFINE_bool(name, value, doc) DEFINE_VAR(bool, name, value, doc) -// #define DEFINE_string(name, value, doc) \ -// DEFINE_VAR(string, name, value, doc) -// #define DEFINE_int32(name, value, doc) DEFINE_VAR(int32, name, value, doc) -// #define DEFINE_int64(name, value, doc) DEFINE_VAR(int64, name, value, doc) -// #define DEFINE_double(name, value, doc) DEFINE_VAR(double, name, value, doc) - - -// Temporary directory. 
-DECLARE_string(tmpdir); - -void SetFlags(const char *usage, int *argc, char ***argv, bool remove_flags, - const char *src = ""); - -#define SET_FLAGS(usage, argc, argv, rmflags) \ -gflags::ParseCommandLineFlags(argc, argv, true) -// SetFlags(usage, argc, argv, rmflags, __FILE__) - -// Deprecated; for backward compatibility. -inline void InitFst(const char *usage, int *argc, char ***argv, bool rmflags) { - return SetFlags(usage, argc, argv, rmflags); -} - -void ShowUsage(bool long_usage = true); - -#endif // FST_LIB_FLAGS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/patch/openfst/src/include/fst/log.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/patch/openfst/src/include/fst/log.h deleted file mode 100644 index bf041c58ebfab73d03bb14adf28c7c7916a2217d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/patch/openfst/src/include/fst/log.h +++ /dev/null @@ -1,82 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// See www.openfst.org for extensive documentation on this weighted -// finite-state transducer library. -// -// Google-style logging declarations and inline definitions. - -#ifndef FST_LIB_LOG_H_ -#define FST_LIB_LOG_H_ - -#include -#include -#include - -#include -#include - -using std::string; - -DECLARE_int32(v); - -class LogMessage { - public: - LogMessage(const string &type) : fatal_(type == "FATAL") { - std::cerr << type << ": "; - } - ~LogMessage() { - std::cerr << std::endl; - if(fatal_) - exit(1); - } - std::ostream &stream() { return std::cerr; } - - private: - bool fatal_; -}; - -// #define LOG(type) LogMessage(#type).stream() -// #define VLOG(level) if ((level) <= FLAGS_v) LOG(INFO) - -// Checks -inline void FstCheck(bool x, const char* expr, - const char *file, int line) { - if (!x) { - LOG(FATAL) << "Check failed: \"" << expr - << "\" file: " << file - << " line: " << line; - } -} - -// #define CHECK(x) FstCheck(static_cast(x), #x, __FILE__, __LINE__) -// #define CHECK_EQ(x, y) CHECK((x) == (y)) -// #define CHECK_LT(x, y) CHECK((x) < (y)) -// #define CHECK_GT(x, y) CHECK((x) > (y)) -// #define CHECK_LE(x, y) CHECK((x) <= (y)) -// #define CHECK_GE(x, y) CHECK((x) >= (y)) -// #define CHECK_NE(x, y) CHECK((x) != (y)) - -// Debug checks -// #define DCHECK(x) assert(x) -// #define DCHECK_EQ(x, y) DCHECK((x) == (y)) -// #define DCHECK_LT(x, y) DCHECK((x) < (y)) -// #define DCHECK_GT(x, y) DCHECK((x) > (y)) -// #define DCHECK_LE(x, y) DCHECK((x) <= (y)) -// #define DCHECK_GE(x, y) DCHECK((x) >= (y)) -// #define DCHECK_NE(x, y) DCHECK((x) != (y)) - - -// Ports -#define ATTRIBUTE_DEPRECATED __attribute__((deprecated)) - -#endif // FST_LIB_LOG_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/post_processor/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/post_processor/CMakeLists.txt deleted file mode 100644 index 
6113bbc26eb8fe35e4e17ffd1cab382f0fb0f1f8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/post_processor/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -add_library(post_processor STATIC - post_processor.cc -) -target_link_libraries(post_processor PUBLIC utils) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/post_processor/post_processor.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/post_processor/post_processor.cc deleted file mode 100644 index 315f62d34cbc441ecbaf7c07667eb35ee61c2c8d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/post_processor/post_processor.cc +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License - -#include "post_processor/post_processor.h" - -#include -#include - -#include "utils/string.h" - -namespace wenet { - -std::string PostProcessor::ProcessSpace(const std::string& str) { - std::string result = str; - // 1. remove ' ' if needed - // only spaces between mandarin words need to be removed, please note that - // if str contains '_', we assume that the decoding type must be - // `CtcPrefixBeamSearch` and this branch will do nothing since str must be - // obtained via "".join() (in function `AsrDecoder::UpdateResult()`) - if (opts_.language_type == kMandarinEnglish && !str.empty()) { - result.clear(); - // split str by ' ' - std::vector words; - std::stringstream ss(str); - std::string tmp; - while (ss >> tmp) { - words.push_back(tmp); - } - // check english word - bool is_englishword_prev = false; - bool is_englishword_now = false; - for (std::string& w : words) { - is_englishword_now = CheckEnglishWord(w); - if (is_englishword_prev && is_englishword_now) { - result += (' ' + w); - } else { - result += (w); - } - is_englishword_prev = is_englishword_now; - } - } - // 2. 
replace '_' with ' ' - // this should be done for all cases (both kMandarinEnglish and kIndoEuropean) - result = ProcessBlank(result, opts_.lowercase); - return result; -} - -std::string PostProcessor::Process(const std::string& str, bool finish) { - std::string result; - result = ProcessSpace(str); - // TODO(xcsong): do itn/punctuation if finish == true - return result; -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/post_processor/post_processor.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/post_processor/post_processor.h deleted file mode 100644 index 54597845ebc88ad22e1244d2e693e2088cff6d21..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/post_processor/post_processor.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License - -#ifndef POST_PROCESSOR_POST_PROCESSOR_H_ -#define POST_PROCESSOR_POST_PROCESSOR_H_ - -#include -#include -#include - -#include "utils/utils.h" - -namespace wenet { - -enum LanguageType { - // spaces between **mandarin words** should be removed. - // cases of processing spaces with mandarin-only, english-only - // and mandarin-english code-switch can be found in post_processor_test.cc - kMandarinEnglish = 0x00, - // spaces should be kept for most of the - // Indo-European languages (i.e., deutsch or english-deutsch code-switch). - // cases of those languages can be found in post_processor_test.cc - kIndoEuropean = 0x01 -}; - -struct PostProcessOptions { - // space options - // The decoded result may contain spaces (' ' or '_'), - // we will process those spaces according to language_type. 
More details can - // be found in - // https://github.com/wenet-e2e/wenet/issues/583#issuecomment-907994058 - LanguageType language_type = kMandarinEnglish; - // whether lowercase letters are required - bool lowercase = true; -}; - -// TODO(xcsong): add itn/punctuation related resource -struct PostProcessResource {}; - -// Post Processor -class PostProcessor { - public: - explicit PostProcessor(PostProcessOptions&& opts) : opts_(std::move(opts)) {} - explicit PostProcessor(const PostProcessOptions& opts) : opts_(opts) {} - // call other functions to do post processing - std::string Process(const std::string& str, bool finish); - // process spaces according to configurations - std::string ProcessSpace(const std::string& str); - // TODO(xcsong): add itn/punctuation - // void InverseTN(const std::string& str); - // void Punctuate(const std::string& str); - - private: - const PostProcessOptions opts_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(PostProcessor); -}; - -} // namespace wenet - -#endif // POST_PROCESSOR_POST_PROCESSOR_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/test/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/test/CMakeLists.txt deleted file mode 100644 index 145654105350e91a5f9121b47197f5fc60663f5c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/test/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -link_libraries(gtest_main gmock) - -add_executable(utils_test utils_test.cc) -target_link_libraries(utils_test PUBLIC utils) -add_test(UTILS_TEST utils_test) - -add_executable(ctc_prefix_beam_search_test ctc_prefix_beam_search_test.cc) -target_link_libraries(ctc_prefix_beam_search_test PUBLIC decoder) -add_test(CTC_PREFIX_BEAM_SEARCH_TEST ctc_prefix_beam_search_test) - -add_executable(post_processor_test post_processor_test.cc) -target_link_libraries(post_processor_test PUBLIC post_processor) -add_test(POST_PROCESSOR_TEST post_processor_test) - - -add_executable(feature_pipeline_test feature_pipeline_test.cc) -target_link_libraries(feature_pipeline_test PUBLIC frontend) -add_test(FEATURE_PIPELINE_TEST feature_pipeline_test) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/test/ctc_prefix_beam_search_test.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/test/ctc_prefix_beam_search_test.cc deleted file mode 100644 index d8f3b65693b934beb33f3a770795f0b6e7ce3456..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/test/ctc_prefix_beam_search_test.cc +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include "decoder/ctc_prefix_beam_search.h" - -#include -#include - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -#include "utils/utils.h" - -TEST(CtcPrefixBeamSearchTest, CtcPrefixBeamSearchLogicTest) { - using ::testing::ElementsAre; - // See https://robin1001.github.io/2020/12/11/ctc-search for the - // graph demonstration of the data - std::vector> data = { - {0.25, 0.40, 0.35}, {0.40, 0.35, 0.25}, {0.10, 0.50, 0.40}}; - // Apply log - for (int i = 0; i < data.size(); i++) { - for (int j = 0; j < data[i].size(); j++) { - data[i][j] = std::log(data[i][j]); - } - } - wenet::CtcPrefixBeamSearchOptions option; - option.first_beam_size = 3; - option.second_beam_size = 3; - wenet::CtcPrefixBeamSearch prefix_beam_search(option); - prefix_beam_search.Search(data); - /* Test case info - | top k | result index | prefix score | viterbi score | timestamp | - |-------|--------------|--------------|---------------|-----------| - | top 1 | [2, 1] | 0.2185 | 0.07 | [0, 2] | - | top 2 | [1, 2] | 0.1550 | 0.064 | [0, 2] | - | top 3 | [1] | 0.1525 | 0.07 | [2] | - */ - const std::vector>& result = prefix_beam_search.Outputs(); - EXPECT_EQ(result.size(), 3); - ASSERT_THAT(result[0], ElementsAre(2, 1)); - ASSERT_THAT(result[1], ElementsAre(1, 2)); - ASSERT_THAT(result[2], ElementsAre(1)); - - const std::vector& likelihood = prefix_beam_search.Likelihood(); - EXPECT_EQ(likelihood.size(), 3); - EXPECT_FLOAT_EQ(std::exp(likelihood[0]), 0.2185); - EXPECT_FLOAT_EQ(std::exp(likelihood[1]), 0.1550); - EXPECT_FLOAT_EQ(std::exp(likelihood[2]), 0.1525); - - const std::vector& viterbi_likelihood = - prefix_beam_search.viterbi_likelihood(); - EXPECT_EQ(viterbi_likelihood.size(), 3); - EXPECT_FLOAT_EQ(std::exp(viterbi_likelihood[0]), 0.07); - EXPECT_FLOAT_EQ(std::exp(viterbi_likelihood[1]), 0.064); - EXPECT_FLOAT_EQ(std::exp(viterbi_likelihood[2]), 0.07); - - const std::vector>& times = prefix_beam_search.Times(); - EXPECT_EQ(times.size(), 3); - ASSERT_THAT(times[0], ElementsAre(0, 2)); - ASSERT_THAT(times[1], ElementsAre(0, 2)); - ASSERT_THAT(times[2], ElementsAre(2)); -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/test/feature_pipeline_test.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/test/feature_pipeline_test.cc deleted file mode 100644 index 244ec0735b6086211b476e8d97569e1ee5959bc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/test/feature_pipeline_test.cc +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) 2022 Roney -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
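> Editor's note: the expected-result table in the CTC prefix beam search test above can be sanity-checked by brute force — with three frames and three symbols there are only 3^3 alignments, so the full CTC marginal ("prefix score") and the best single path ("viterbi score") for the top prefix [2, 1] can be enumerated directly. The sketch below is an illustration added alongside the deleted test, not part of it; lower-ranked beam-search scores are affected by pruning and need not equal the full marginals computed this way.

```cpp
#include <iostream>
#include <map>
#include <vector>

int main() {
  // Toy posteriors from the test: rows are frames, columns are symbols
  // {blank(0), 1, 2}.
  const double p[3][3] = {{0.25, 0.40, 0.35},
                          {0.40, 0.35, 0.25},
                          {0.10, 0.50, 0.40}};
  std::map<std::vector<int>, double> total, best;
  for (int a = 0; a < 3; ++a)
    for (int b = 0; b < 3; ++b)
      for (int c = 0; c < 3; ++c) {
        const int path[3] = {a, b, c};
        const double prob = p[0][a] * p[1][b] * p[2][c];
        // CTC collapse: merge repeats, then drop blanks (symbol 0).
        std::vector<int> collapsed;
        int prev = -1;
        for (int t = 0; t < 3; ++t) {
          if (path[t] != prev && path[t] != 0) collapsed.push_back(path[t]);
          prev = path[t];
        }
        total[collapsed] += prob;                       // marginal over paths
        if (prob > best[collapsed]) best[collapsed] = prob;  // best single path
      }
  const std::vector<int> top1 = {2, 1};
  std::cout << "prefix score  [2,1]: " << total[top1] << "\n";  // ~0.2185
  std::cout << "viterbi score [2,1]: " << best[top1] << "\n";   // 0.07
}
```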
- -#include -#include - -#include "frontend/feature_pipeline.h" -#include "utils/blocking_queue.h" - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -void pushQueue(const std::shared_ptr>& que, - std::vector vec) { - que->Push(vec); -} - -void popQueue(const std::shared_ptr>& que, int num, - int back_data) { - auto pop_data = que->Pop(num); - ASSERT_EQ(pop_data[num - 1], back_data); -} - -TEST(FeaturePipelineTest, BlockingQueueTest) { - auto capacity_queue = std::make_shared>(2); - std::vector test_data{1, 2, 3, 4, 5}; - std::thread push_thread(&pushQueue, capacity_queue, test_data); - ASSERT_EQ(capacity_queue->Pop(), 1); - ASSERT_LE(capacity_queue->Size(), 2); // capacity_queue: 2 or 2,3 - auto pop_data = capacity_queue->Pop(3); // 2,3,4 num > capacity - ASSERT_EQ(pop_data.size(), 3); - ASSERT_EQ(pop_data[2], 4); - push_thread.join(); - ASSERT_EQ(capacity_queue->Size(), 1); // capacity_queue:5 - - std::thread pop_thread(&popQueue, capacity_queue, 3, 0); // num > capacity - capacity_queue->Push(9); // capacity_queue:5,9 - capacity_queue->Push(0); // capacity_queue:5,9,0 - pop_thread.join(); // capacity_queue: - ASSERT_EQ(capacity_queue->Size(), 0); - - pop_data = capacity_queue->Pop(0); - ASSERT_TRUE(pop_data.empty()); -} - -TEST(FeaturePipelineTest, PipelineTest) { - wenet::FeaturePipelineConfig config(80, 8000); - wenet::FeaturePipeline feature_pipeline(config); - int audio_len = 8 * 55; // audio len 55ms,4 frames - std::vector pcm(audio_len, 0); - feature_pipeline.AcceptWaveform(pcm.data(), audio_len); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 4); - - std::vector> out_feats; - auto b = feature_pipeline.Read(2, &out_feats); - ASSERT_TRUE(b); - ASSERT_EQ(out_feats.size(), 2); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 2); - - std::vector out_feat; - b = feature_pipeline.ReadOne(&out_feat); - ASSERT_TRUE(b); - ASSERT_FALSE(out_feat.empty()); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 1); - - feature_pipeline.set_input_finished(); - b = feature_pipeline.Read(2, &out_feats); - ASSERT_FALSE(b); - ASSERT_EQ(out_feats.size(), 1); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 0); - - feature_pipeline.AcceptWaveform(pcm.data(), audio_len); - feature_pipeline.Read(2, &out_feats); - feature_pipeline.Reset(); - feature_pipeline.set_input_finished(); - b = feature_pipeline.Read(2, &out_feats); - ASSERT_FALSE(b); - ASSERT_EQ(out_feats.size(), 0); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 0); -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/test/post_processor_test.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/test/post_processor_test.cc deleted file mode 100644 index fa11fa29231032d62389a93fd00b0ec782bf8a3b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/test/post_processor_test.cc +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License - -#include "post_processor/post_processor.h" - -#include -#include - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -#include "utils/utils.h" - -TEST(PostProcessorTest, ProcessSpacekMandarinEnglishTest) { - wenet::PostProcessOptions opts_lowercase; - wenet::PostProcessor post_processor_lowercase(opts_lowercase); - - wenet::PostProcessOptions opts_uppercase; - opts_uppercase.lowercase = false; - wenet::PostProcessor post_processor_uppercase(opts_uppercase); - - std::vector input = { - // modeling unit: mandarin character - // decode type: CtcPrefixBeamSearch, "".join() - "震东好帅", - // modeling unit: mandarin word - // decode type: CtcWfstBeamSearch, " ".join() - " 吴迪 也 好帅", - // modeling unit: english wordpiece - // decode type: CtcPrefixBeamSearch, "".join() - "▁binbin▁is▁also▁handsome", - // modeling unit: english word - // decode type: CtcWfstBeamSearch, " ".join() - " life is short i use wenet", - // modeling unit: mandarin character + english wordpiece - // decode type: CtcPrefixBeamSearch, "".join() - "超哥▁is▁the▁most▁handsome", - // modeling unit: mandarin word + english word - // decode type: CtcWfstBeamSearch, " ".join() - " 人生 苦短 i use wenet", - }; - - std::vector result_lowercase = { - "震东好帅", - "吴迪也好帅", - "binbin is also handsome", - "life is short i use wenet", - "超哥 is the most handsome", - "人生苦短i use wenet", - }; - - std::vector result_uppercase = { - "震东好帅", - "吴迪也好帅", - "BINBIN IS ALSO HANDSOME", - "LIFE IS SHORT I USE WENET", - "超哥 IS THE MOST HANDSOME", - "人生苦短I USE WENET", - }; - - for (size_t i = 0; i < input.size(); ++i) { - EXPECT_EQ(post_processor_lowercase.ProcessSpace(input[i]), - result_lowercase[i]); - EXPECT_EQ(post_processor_uppercase.ProcessSpace(input[i]), - result_uppercase[i]); - } -} - -TEST(PostProcessorTest, ProcessSpacekIndoEuropeanTest) { - wenet::PostProcessOptions opts_lowercase; - opts_lowercase.language_type = wenet::kIndoEuropean; - wenet::PostProcessor post_processor_lowercase(opts_lowercase); - - wenet::PostProcessOptions opts_uppercase; - opts_uppercase.language_type = wenet::kIndoEuropean; - opts_uppercase.lowercase = false; - wenet::PostProcessor post_processor_uppercase(opts_uppercase); - - std::vector input = { - // modeling unit: wordpiece - // decode type: CtcPrefixBeamSearch, "".join() - "▁zhendong▁ist▁so▁schön", - // modeling unit: word - // decode type: CtcWfstBeamSearch, " ".join() - " zhendong ist so schön"}; - - std::vector result_lowercase = {"zhendong ist so schön", - "zhendong ist so schön"}; - - std::vector result_uppercase = {"ZHENDONG IST SO SCHÖN", - "ZHENDONG IST SO SCHÖN"}; - - for (size_t i = 0; i < input.size(); ++i) { - EXPECT_EQ(post_processor_lowercase.ProcessSpace(input[i]), - result_lowercase[i]); - EXPECT_EQ(post_processor_uppercase.ProcessSpace(input[i]), - result_uppercase[i]); - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/test/utils_test.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/test/utils_test.cc deleted file mode 100644 index 6b2bbac25e000ce854d5e55a50cb51109d62d758..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/test/utils_test.cc +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the 
License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "utils/utils.h" - -#include - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -TEST(UtilsTest, TopKTest) { - using ::testing::ElementsAre; - using ::testing::FloatNear; - using ::testing::Pointwise; - std::vector data = {1, 3, 5, 7, 9, 2, 4, 6, 8, 10}; - std::vector values; - std::vector indices; - wenet::TopK(data, 3, &values, &indices); - EXPECT_THAT(values, Pointwise(FloatNear(1e-8), {10, 9, 8})); - ASSERT_THAT(indices, ElementsAre(9, 4, 8)); -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/CMakeLists.txt deleted file mode 100644 index 686362688c050d48224ca0a01e0d24b03d94758a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -add_library(utils STATIC - string.cc - utils.cc -) - -if(NOT ANDROID) - if(MSVC) - target_link_libraries(utils PUBLIC fst) - else() - target_link_libraries(utils PUBLIC fst dl) - endif() -endif() \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/blocking_queue.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/blocking_queue.h deleted file mode 100644 index 9bf0127d9298fbfae2eeebb9431c680fc5dd7647..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/blocking_queue.h +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
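> Editor's note: the TopK expectations in utils_test.cc above pin down the contract of wenet::TopK — the k largest values in descending order, paired with their original indices. Below is a hedged standalone sketch of that contract; the name TopKSketch is hypothetical, and the real implementation lives in the deleted utils/utils.cc rather than in this illustration.

```cpp
#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>

// Return the k largest values of `data` together with their original
// indices, ordered from largest to smallest (matching the deleted test's
// expectations: values {10, 9, 8}, indices {9, 4, 8}).
void TopKSketch(const std::vector<float>& data, int k,
                std::vector<float>* values, std::vector<int>* indices) {
  std::vector<int> order(data.size());
  std::iota(order.begin(), order.end(), 0);
  std::partial_sort(order.begin(), order.begin() + k, order.end(),
                    [&](int a, int b) { return data[a] > data[b]; });
  values->clear();
  indices->clear();
  for (int i = 0; i < k; ++i) {
    indices->push_back(order[i]);
    values->push_back(data[order[i]]);
  }
}

int main() {
  std::vector<float> data = {1, 3, 5, 7, 9, 2, 4, 6, 8, 10};
  std::vector<float> values;
  std::vector<int> indices;
  TopKSketch(data, 3, &values, &indices);
  for (int i = 0; i < 3; ++i)
    std::cout << indices[i] << ":" << values[i] << (i == 2 ? "\n" : " ");
  // expected output: 9:10 4:9 8:8
}
```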
- -#ifndef UTILS_BLOCKING_QUEUE_H_ -#define UTILS_BLOCKING_QUEUE_H_ - -#include -#include -#include -#include -#include -#include - -#include "utils/utils.h" - -namespace wenet { - -template -class BlockingQueue { - public: - explicit BlockingQueue(size_t capacity = std::numeric_limits::max()) - : capacity_(capacity) {} - - void Push(const T& value) { - { - std::unique_lock lock(mutex_); - while (queue_.size() >= capacity_) { - not_full_condition_.wait(lock); - } - queue_.push(value); - } - not_empty_condition_.notify_one(); - } - - void Push(T&& value) { - { - std::unique_lock lock(mutex_); - while (queue_.size() >= capacity_) { - not_full_condition_.wait(lock); - } - queue_.push(std::move(value)); - } - not_empty_condition_.notify_one(); - } - - void Push(const std::vector& values) { - { - std::unique_lock lock(mutex_); - for (auto& value : values) { - while (queue_.size() >= capacity_) { - not_empty_condition_.notify_one(); - not_full_condition_.wait(lock); - } - queue_.push(value); - } - } - not_empty_condition_.notify_one(); - } - - void Push(std::vector&& values) { - std::unique_lock lock(mutex_); - for (auto& value : values) { - while (queue_.size() >= capacity_) { - not_empty_condition_.notify_one(); - not_full_condition_.wait(lock); - } - queue_.push(std::move(value)); - } - not_empty_condition_.notify_one(); - } - - T Pop() { - std::unique_lock lock(mutex_); - while (queue_.empty()) { - not_empty_condition_.wait(lock); - } - T t(std::move(queue_.front())); - queue_.pop(); - not_full_condition_.notify_one(); - return t; - } - - // num can be greater than capacity,but it needs to be used with care - std::vector Pop(size_t num) { - std::unique_lock lock(mutex_); - std::vector block_data; - while (block_data.size() < num) { - while (queue_.empty()) { - not_full_condition_.notify_one(); - not_empty_condition_.wait(lock); - } - block_data.push_back(std::move(queue_.front())); - queue_.pop(); - } - not_full_condition_.notify_one(); - return block_data; - } - - bool Empty() const { - std::lock_guard lock(mutex_); - return queue_.empty(); - } - - size_t Size() const { - std::lock_guard lock(mutex_); - return queue_.size(); - } - - void Clear() { - while (!Empty()) { - Pop(); - } - } - - private: - size_t capacity_; - mutable std::mutex mutex_; - std::condition_variable not_full_condition_; - std::condition_variable not_empty_condition_; - std::queue queue_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(BlockingQueue); -}; - -} // namespace wenet - -#endif // UTILS_BLOCKING_QUEUE_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/file.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/file.h deleted file mode 100644 index 83ad9c8c52fecd334b3549285bf39cd4f59b9f2b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/file.h +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_FILE_H_ -#define UTILS_FILE_H_ - -#include -#include - -namespace wenet { - -inline bool FileExists(const std::string& path) { - std::ifstream f(path.c_str()); - return f.good(); -} - -} // namespace wenet - -#endif // UTILS_FILE_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/flags.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/flags.h deleted file mode 100644 index 3432aa78847322edec8d6d2aec59ed7ca5352fcd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/flags.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_FLAGS_H_ -#define UTILS_FLAGS_H_ - -// Because openfst is a dynamic library compiled with gflags/glog, we must use -// the gflags/glog from openfst to avoid them linked both statically and -// dynamically into the executable. -#include "fst/flags.h" - -#endif // UTILS_FLAGS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/json.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/json.h deleted file mode 100644 index bf8d94a3e42504139b10daa39b8f8e7a8b2d93cc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/json.h +++ /dev/null @@ -1,754 +0,0 @@ -// Copyright (c) From https://github.com/nbsdx/SimpleJSON -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef UTILS_JSON_H_ -#define UTILS_JSON_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace json { - -using std::deque; -using std::enable_if; -using std::initializer_list; -using std::is_convertible; -using std::is_floating_point; -using std::is_integral; -using std::is_same; -using std::map; -using std::string; - -namespace { // NOLINT -string json_escape(const string& str) { - string output; - for (unsigned i = 0; i < str.length(); ++i) switch (str[i]) { - case '\"': - output += "\\\""; - break; - case '\\': - output += "\\\\"; - break; - case '\b': - output += "\\b"; - break; - case '\f': - output += "\\f"; - break; - case '\n': - output += "\\n"; - break; - case '\r': - output += "\\r"; - break; - case '\t': - output += "\\t"; - break; - default: - output += str[i]; - break; - } - return std::move(output); -} -} // namespace - -class JSON { - union BackingData { - BackingData(double d) : Float(d) {} - BackingData(int l) : Int(l) {} - BackingData(bool b) : Bool(b) {} - BackingData(string s) : String(new string(s)) {} - BackingData() : Int(0) {} - - deque* List; - map* Map; - string* String; - double Float; - int Int; - bool Bool; - } Internal; - - public: - enum class Class { Null, Object, Array, String, Floating, Integral, Boolean }; - - template - class JSONWrapper { - Container* object; - - public: - explicit JSONWrapper(Container* val) : object(val) {} - explicit JSONWrapper(std::nullptr_t) : object(nullptr) {} - - typename Container::iterator begin() { - return object ? object->begin() : typename Container::iterator(); - } - typename Container::iterator end() { - return object ? object->end() : typename Container::iterator(); - } - typename Container::const_iterator begin() const { - return object ? object->begin() : typename Container::iterator(); - } - typename Container::const_iterator end() const { - return object ? object->end() : typename Container::iterator(); - } - }; - - template - class JSONConstWrapper { - const Container* object; - - public: - explicit JSONConstWrapper(const Container* val) : object(val) {} - explicit JSONConstWrapper(std::nullptr_t) : object(nullptr) {} - - typename Container::const_iterator begin() const { - return object ? object->begin() : typename Container::const_iterator(); - } - typename Container::const_iterator end() const { - return object ? 
object->end() : typename Container::const_iterator(); - } - }; - - JSON() : Internal(), Type(Class::Null) {} - - explicit JSON(initializer_list list) : JSON() { - SetType(Class::Object); - for (auto i = list.begin(), e = list.end(); i != e; ++i, ++i) - operator[](i->ToString()) = *std::next(i); - } - - JSON(JSON&& other) : Internal(other.Internal), Type(other.Type) { - other.Type = Class::Null; - other.Internal.Map = nullptr; - } - - JSON& operator=(JSON&& other) { - ClearInternal(); - Internal = other.Internal; - Type = other.Type; - other.Internal.Map = nullptr; - other.Type = Class::Null; - return *this; - } - - JSON(const JSON& other) { - switch (other.Type) { - case Class::Object: - Internal.Map = new map(other.Internal.Map->begin(), - other.Internal.Map->end()); - break; - case Class::Array: - Internal.List = new deque(other.Internal.List->begin(), - other.Internal.List->end()); - break; - case Class::String: - Internal.String = new string(*other.Internal.String); - break; - default: - Internal = other.Internal; - } - Type = other.Type; - } - - JSON& operator=(const JSON& other) { - ClearInternal(); - switch (other.Type) { - case Class::Object: - Internal.Map = new map(other.Internal.Map->begin(), - other.Internal.Map->end()); - break; - case Class::Array: - Internal.List = new deque(other.Internal.List->begin(), - other.Internal.List->end()); - break; - case Class::String: - Internal.String = new string(*other.Internal.String); - break; - default: - Internal = other.Internal; - } - Type = other.Type; - return *this; - } - - ~JSON() { - switch (Type) { - case Class::Array: - delete Internal.List; - break; - case Class::Object: - delete Internal.Map; - break; - case Class::String: - delete Internal.String; - break; - default: { - }; - } - } - - template - explicit JSON(T b, typename enable_if::value>::type* = 0) - : Internal(b), Type(Class::Boolean) {} - - template - explicit JSON(T i, typename enable_if::value && - !is_same::value>::type* = 0) - : Internal(static_cast(i)), Type(Class::Integral) {} - - template - explicit JSON(T f, typename enable_if::value>::type* = 0) - : Internal(static_cast(f)), Type(Class::Floating) {} - - template - explicit JSON(T s, - typename enable_if::value>::type* = 0) - : Internal(string(s)), Type(Class::String) {} - - explicit JSON(std::nullptr_t) : Internal(), Type(Class::Null) {} - - static JSON Make(Class type) { - JSON ret; - ret.SetType(type); - return ret; - } - - static JSON Load(const string&); - - template - void append(T arg) { - SetType(Class::Array); - Internal.List->emplace_back(arg); - } - - template - void append(T arg, U... 
args) { - append(arg); - append(args...); - } - - template - typename enable_if::value, JSON&>::type operator=(T b) { - SetType(Class::Boolean); - Internal.Bool = b; - return *this; - } - - template - typename enable_if::value && !is_same::value, - JSON&>::type - operator=(T i) { - SetType(Class::Integral); - Internal.Int = i; - return *this; - } - - template - typename enable_if::value, JSON&>::type operator=(T f) { - SetType(Class::Floating); - Internal.Float = f; - return *this; - } - - template - typename enable_if::value, JSON&>::type operator=( - T s) { - SetType(Class::String); - *Internal.String = string(s); - return *this; - } - - JSON& operator[](const string& key) { - SetType(Class::Object); - return Internal.Map->operator[](key); - } - - JSON& operator[](unsigned index) { - SetType(Class::Array); - if (index >= Internal.List->size()) Internal.List->resize(index + 1); - return Internal.List->operator[](index); - } - - JSON& at(const string& key) { return operator[](key); } - - const JSON& at(const string& key) const { return Internal.Map->at(key); } - - JSON& at(unsigned index) { return operator[](index); } - - const JSON& at(unsigned index) const { return Internal.List->at(index); } - - int length() const { - if (Type == Class::Array) - return Internal.List->size(); - else - return -1; - } - - bool hasKey(const string& key) const { - if (Type == Class::Object) - return Internal.Map->find(key) != Internal.Map->end(); - return false; - } - - int size() const { - if (Type == Class::Object) - return Internal.Map->size(); - else if (Type == Class::Array) - return Internal.List->size(); - else - return -1; - } - - Class JSONType() const { return Type; } - - /// Functions for getting primitives from the JSON object. - bool IsNull() const { return Type == Class::Null; } - - string ToString() const { - bool b; - return std::move(ToString(&b)); - } - string ToString(bool* ok) const { - *ok = (Type == Class::String); - return *ok ? std::move(json_escape(*Internal.String)) : string(""); - } - - double ToFloat() const { - bool b; - return ToFloat(&b); - } - double ToFloat(bool* ok) const { - *ok = (Type == Class::Floating); - return *ok ? Internal.Float : 0.0; - } - - int ToInt() const { - bool b; - return ToInt(&b); - } - int ToInt(bool* ok) const { - *ok = (Type == Class::Integral); - return *ok ? Internal.Int : 0; - } - - bool ToBool() const { - bool b; - return ToBool(&b); - } - bool ToBool(bool* ok) const { - *ok = (Type == Class::Boolean); - return *ok ? 
Internal.Bool : false; - } - - JSONWrapper> ObjectRange() { - if (Type == Class::Object) - return JSONWrapper>(Internal.Map); - return JSONWrapper>(nullptr); - } - - JSONWrapper> ArrayRange() { - if (Type == Class::Array) return JSONWrapper>(Internal.List); - return JSONWrapper>(nullptr); - } - - JSONConstWrapper> ObjectRange() const { - if (Type == Class::Object) - return JSONConstWrapper>(Internal.Map); - return JSONConstWrapper>(nullptr); - } - - JSONConstWrapper> ArrayRange() const { - if (Type == Class::Array) - return JSONConstWrapper>(Internal.List); - return JSONConstWrapper>(nullptr); - } - - string dump(int depth = 1, string tab = " ") const { - string pad = ""; - for (int i = 0; i < depth; ++i, pad += tab) { - } - - switch (Type) { - case Class::Null: - return "null"; - case Class::Object: { - string s = "{\n"; - bool skip = true; - for (auto& p : *Internal.Map) { - if (!skip) s += ",\n"; - s += (pad + "\"" + p.first + "\" : " + p.second.dump(depth + 1, tab)); - skip = false; - } - s += ("\n" + pad.erase(0, 2) + "}"); - return s; - } - case Class::Array: { - string s = "["; - bool skip = true; - for (auto& p : *Internal.List) { - if (!skip) s += ", "; - s += p.dump(depth + 1, tab); - skip = false; - } - s += "]"; - return s; - } - case Class::String: - return "\"" + json_escape(*Internal.String) + "\""; - case Class::Floating: - return std::to_string(Internal.Float); - case Class::Integral: - return std::to_string(Internal.Int); - case Class::Boolean: - return Internal.Bool ? "true" : "false"; - default: - return ""; - } - return ""; - } - - friend std::ostream& operator<<(std::ostream&, const JSON&); - - private: - void SetType(Class type) { - if (type == Type) return; - - ClearInternal(); - - switch (type) { - case Class::Null: - Internal.Map = nullptr; - break; - case Class::Object: - Internal.Map = new map(); - break; - case Class::Array: - Internal.List = new deque(); - break; - case Class::String: - Internal.String = new string(); - break; - case Class::Floating: - Internal.Float = 0.0; - break; - case Class::Integral: - Internal.Int = 0; - break; - case Class::Boolean: - Internal.Bool = false; - break; - } - - Type = type; - } - - private: - /* beware: only call if YOU know that Internal is allocated. No checks - performed here. This function should be called in a constructed JSON just - before you are going to overwrite Internal... -*/ - void ClearInternal() { - switch (Type) { - case Class::Object: - delete Internal.Map; - break; - case Class::Array: - delete Internal.List; - break; - case Class::String: - delete Internal.String; - break; - default: { - }; - } - } - - private: - Class Type = Class::Null; -}; - -JSON Array() { return std::move(JSON::Make(JSON::Class::Array)); } - -template -JSON Array(T... 
args) { - JSON arr = JSON::Make(JSON::Class::Array); - arr.append(args...); - return std::move(arr); -} - -JSON Object() { return std::move(JSON::Make(JSON::Class::Object)); } - -std::ostream& operator<<(std::ostream& os, const JSON& json) { - os << json.dump(); - return os; -} - -namespace { // NOLINT -JSON parse_next(const string&, size_t&); - -void consume_ws(const string& str, size_t& offset) { // NOLINT - while (isspace(str[offset])) ++offset; -} - -JSON parse_object(const string& str, size_t& offset) { // NOLINT - JSON Object = JSON::Make(JSON::Class::Object); - - ++offset; - consume_ws(str, offset); - if (str[offset] == '}') { - ++offset; - return std::move(Object); - } - - while (true) { - JSON Key = parse_next(str, offset); - consume_ws(str, offset); - if (str[offset] != ':') { - std::cerr << "Error: Object: Expected colon, found '" << str[offset] - << "'\n"; - break; - } - consume_ws(str, ++offset); - JSON Value = parse_next(str, offset); - Object[Key.ToString()] = Value; - - consume_ws(str, offset); - if (str[offset] == ',') { - ++offset; - continue; - } else if (str[offset] == '}') { - ++offset; - break; - } else { - std::cerr << "ERROR: Object: Expected comma, found '" << str[offset] - << "'\n"; - break; - } - } - - return std::move(Object); -} - -JSON parse_array(const string& str, size_t& offset) { // NOLINT - JSON Array = JSON::Make(JSON::Class::Array); - unsigned index = 0; - - ++offset; - consume_ws(str, offset); - if (str[offset] == ']') { - ++offset; - return std::move(Array); - } - - while (true) { - Array[index++] = parse_next(str, offset); - consume_ws(str, offset); - - if (str[offset] == ',') { - ++offset; - continue; - } else if (str[offset] == ']') { - ++offset; - break; - } else { - std::cerr << "ERROR: Array: Expected ',' or ']', found '" << str[offset] - << "'\n"; - return std::move(JSON::Make(JSON::Class::Array)); - } - } - - return std::move(Array); -} - -JSON parse_string(const string& str, size_t& offset) { // NOLINT - JSON String; - string val; - for (char c = str[++offset]; c != '\"'; c = str[++offset]) { - if (c == '\\') { - switch (str[++offset]) { - case '\"': - val += '\"'; - break; - case '\\': - val += '\\'; - break; - case '/': - val += '/'; - break; - case 'b': - val += '\b'; - break; - case 'f': - val += '\f'; - break; - case 'n': - val += '\n'; - break; - case 'r': - val += '\r'; - break; - case 't': - val += '\t'; - break; - case 'u': { - val += "\\u"; - for (unsigned i = 1; i <= 4; ++i) { - c = str[offset + i]; - if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || - (c >= 'A' && c <= 'F')) { - val += c; - } else { - std::cerr << "ERROR: String: Expected hex character in unicode " - "escape, found '" - << c << "'\n"; - return std::move(JSON::Make(JSON::Class::String)); - } - } - offset += 4; - } break; - default: - val += '\\'; - break; - } - } else { - val += c; - } - } - ++offset; - String = val; - return std::move(String); -} - -JSON parse_number(const string& str, size_t& offset) { // NOLINT - JSON Number; - string val, exp_str; - char c; - bool isDouble = false; - int exp = 0; - while (true) { - c = str[offset++]; - if ((c == '-') || (c >= '0' && c <= '9')) { - val += c; - } else if (c == '.') { - val += c; - isDouble = true; - } else { - break; - } - } - if (c == 'E' || c == 'e') { - c = str[offset++]; - if (c == '-') { - ++offset; - exp_str += '-'; - } - while (true) { - c = str[offset++]; - if (c >= '0' && c <= '9') { - exp_str += c; - } else if (!isspace(c) && c != ',' && c != ']' && c != '}') { - std::cerr << "ERROR: Number: 
Expected a number for exponent, found '" - << c << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } else { - break; - } - } - exp = std::stol(exp_str); - } else if (!isspace(c) && c != ',' && c != ']' && c != '}') { - std::cerr << "ERROR: Number: unexpected character '" << c << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } - --offset; - - if (isDouble) { - Number = std::stod(val) * std::pow(10, exp); - } else { - if (!exp_str.empty()) - Number = std::stol(val) * std::pow(10, exp); - else - Number = std::stol(val); - } - return std::move(Number); -} - -JSON parse_bool(const string& str, size_t& offset) { // NOLINT - JSON Bool; - if (str.substr(offset, 4) == "true") { - Bool = true; - } else if (str.substr(offset, 5) == "false") { - Bool = false; - } else { - std::cerr << "ERROR: Bool: Expected 'true' or 'false', found '" - << str.substr(offset, 5) << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } - offset += (Bool.ToBool() ? 4 : 5); - return std::move(Bool); -} - -JSON parse_null(const string& str, size_t& offset) { // NOLINT - JSON Null; - if (str.substr(offset, 4) != "null") { - std::cerr << "ERROR: Null: Expected 'null', found '" - << str.substr(offset, 4) << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } - offset += 4; - return std::move(Null); -} - -JSON parse_next(const string& str, size_t& offset) { // NOLINT - char value; - consume_ws(str, offset); - value = str[offset]; - switch (value) { - case '[': - return std::move(parse_array(str, offset)); - case '{': - return std::move(parse_object(str, offset)); - case '\"': - return std::move(parse_string(str, offset)); - case 't': - case 'f': - return std::move(parse_bool(str, offset)); - case 'n': - return std::move(parse_null(str, offset)); - default: - if ((value <= '9' && value >= '0') || value == '-') - return std::move(parse_number(str, offset)); - } - std::cerr << "ERROR: Parse: Unknown starting character '" << value << "'\n"; - return JSON(); -} -} // namespace - -JSON JSON::Load(const string& str) { - size_t offset = 0; - return std::move(parse_next(str, offset)); -} - -} // namespace json - -#endif // UTILS_JSON_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/log.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/log.h deleted file mode 100644 index c2bf03f261a8711f74da819d80d68e8eb9fb124a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/log.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_LOG_H_ -#define UTILS_LOG_H_ - -// Because openfst is a dynamic library compiled with gflags/glog, we must use -// the gflags/glog from openfst to avoid them linked both statically and -// dynamically into the executable. 
-#include "fst/log.h" - -#endif // UTILS_LOG_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/string.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/string.cc deleted file mode 100644 index 1ab93adf3cac1bc5a42c0b8c6cadbde399678fef..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/string.cc +++ /dev/null @@ -1,195 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "utils/string.h" - -#include -#include -#include - -#include "utils/log.h" -#include "utils/utils.h" - -namespace wenet { - -void SplitString(const std::string& str, std::vector* strs) { - SplitStringToVector(Trim(str), " \t", true, strs); -} - -void SplitStringToVector(const std::string& full, const char* delim, - bool omit_empty_strings, - std::vector* out) { - size_t start = 0, found = 0, end = full.size(); - out->clear(); - while (found != std::string::npos) { - found = full.find_first_of(delim, start); - // start != end condition is for when the delimiter is at the end - if (!omit_empty_strings || (found != start && start != end)) - out->push_back(full.substr(start, found - start)); - start = found + 1; - } -} - -void SplitUTF8StringToChars(const std::string& str, - std::vector* chars) { - chars->clear(); - int bytes = 1; - for (size_t i = 0; i < str.length(); i += bytes) { - assert((str[i] & 0xF8) <= 0xF0); - if ((str[i] & 0x80) == 0x00) { - // The first 128 characters (US-ASCII) in UTF-8 format only need one byte. - bytes = 1; - } else if ((str[i] & 0xE0) == 0xC0) { - // The next 1,920 characters need two bytes to encode, - // which covers the remainder of almost all Latin-script alphabets. - bytes = 2; - } else if ((str[i] & 0xF0) == 0xE0) { - // Three bytes are needed for characters in the rest of - // the Basic Multilingual Plane, which contains virtually all characters - // in common use, including most Chinese, Japanese and Korean characters. - bytes = 3; - } else if ((str[i] & 0xF8) == 0xF0) { - // Four bytes are needed for characters in the other planes of Unicode, - // which include less common CJK characters, various historic scripts, - // mathematical symbols, and emoji (pictographic symbols). 
- bytes = 4; - } - chars->push_back(str.substr(i, bytes)); - } -} - -int UTF8StringLength(const std::string& str) { - int len = 0; - int bytes = 1; - for (size_t i = 0; i < str.length(); i += bytes) { - if ((str[i] & 0x80) == 0x00) { - bytes = 1; - } else if ((str[i] & 0xE0) == 0xC0) { - bytes = 2; - } else if ((str[i] & 0xF0) == 0xE0) { - bytes = 3; - } else if ((str[i] & 0xF8) == 0xF0) { - bytes = 4; - } - ++len; - } - return len; -} - -bool CheckEnglishChar(const std::string& ch) { - // all english characters should be encoded in one byte - if (ch.size() != 1) return false; - // english words may contain apostrophe, i.e., "He's" - return isalpha(ch[0]) || ch[0] == '\''; -} - -bool CheckEnglishWord(const std::string& word) { - std::vector chars; - SplitUTF8StringToChars(word, &chars); - for (size_t k = 0; k < chars.size(); k++) { - if (!CheckEnglishChar(chars[k])) { - return false; - } - } - return true; -} - -std::string JoinString(const std::string& c, - const std::vector& strs) { - std::string result; - if (strs.size() > 0) { - for (int i = 0; i < strs.size() - 1; i++) { - result += (strs[i] + c); - } - result += strs.back(); - } - return result; -} - -bool IsAlpha(const std::string& str) { - for (size_t i = 0; i < str.size(); i++) { - if (!isalpha(str[i])) { - return false; - } - } - return true; -} - -std::string ProcessBlank(const std::string& str, bool lowercase) { - std::string result; - if (!str.empty()) { - std::vector chars; - SplitUTF8StringToChars(Trim(str), &chars); - - for (std::string& ch : chars) { - if (ch != kSpaceSymbol) { - result.append(ch); - } else { - // Ignore consecutive space or located in head - if (!result.empty() && result.back() != ' ') { - result.push_back(' '); - } - } - } - // Ignore tailing space - if (!result.empty() && result.back() == ' ') { - result.pop_back(); - } - // NOTE: convert string to wstring - // see issue 745: https://github.com/wenet-e2e/wenet/issues/745 - std::locale loc(""); - std::wstring_convert, wchar_t> converter; - std::wstring wsresult = converter.from_bytes(result); - for (auto& c : wsresult) { - c = lowercase ? tolower(c, loc) : toupper(c, loc); - } - result = converter.to_bytes(wsresult); - } - return result; -} - -std::string Ltrim(const std::string& str) { - size_t start = str.find_first_not_of(WHITESPACE); - return (start == std::string::npos) ? "" : str.substr(start); -} - -std::string Rtrim(const std::string& str) { - size_t end = str.find_last_not_of(WHITESPACE); - return (end == std::string::npos) ? 
"" : str.substr(0, end + 1); -} - -std::string Trim(const std::string& str) { return Rtrim(Ltrim(str)); } - -std::string JoinPath(const std::string& left, const std::string& right) { - std::string path(left); - if (path.size() && path.back() != '/') { - path.push_back('/'); - } - path.append(right); - return path; -} - -#ifdef _MSC_VER -std::wstring ToWString(const std::string& str) { - unsigned len = str.size() * 2; - setlocale(LC_CTYPE, ""); - wchar_t* p = new wchar_t[len]; - mbstowcs(p, str.c_str(), len); - std::wstring wstr(p); - delete[] p; - return wstr; -} -#endif - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/string.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/string.h deleted file mode 100644 index bf7a52ae09bce45ab7e34a5277652d7ae91bae1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/string.h +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_STRING_H_ -#define UTILS_STRING_H_ - -#include -#include -#include -#include -#include - -#include "fst/symbol-table.h" - -namespace wenet { - -const char WHITESPACE[] = " \n\r\t\f\v"; - -// Split the string with space or tab. -void SplitString(const std::string& str, std::vector* strs); - -void SplitStringToVector(const std::string& full, const char* delim, - bool omit_empty_strings, - std::vector* out); - -// NOTE(Xingchen Song): we add this function to make it possible to -// support multilingual recipe in the future, in which characters of -// different languages are all encoded in UTF-8 format. -// UTF-8 REF: https://en.wikipedia.org/wiki/UTF-8#Encoding -// Split the UTF-8 string into chars. -void SplitUTF8StringToChars(const std::string& str, - std::vector* chars); - -int UTF8StringLength(const std::string& str); - -// Check whether the UTF-8 char is alphabet or '. -bool CheckEnglishChar(const std::string& ch); - -// Check whether the UTF-8 word is only contains alphabet or '. -bool CheckEnglishWord(const std::string& word); - -std::string JoinString(const std::string& c, - const std::vector& strs); - -bool IsAlpha(const std::string& str); - -// Split the UTF-8 string into words by symbol table. -// Return whether not contains oov. -bool SplitUTF8StringToWords( - const std::string& str, - const std::shared_ptr& symbol_table, - std::vector* words); - -// Replace ▁ with space, then remove head, tail and consecutive space. 
-std::string ProcessBlank(const std::string& str, bool lowercase); - -std::string Ltrim(const std::string& str); - -std::string Rtrim(const std::string& str); - -std::string Trim(const std::string& str); - -std::string JoinPath(const std::string& left, const std::string& right); - -#ifdef _MSC_VER -std::wstring ToWString(const std::string& str); -#endif - -} // namespace wenet - -#endif // UTILS_STRING_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/thread_pool.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/thread_pool.h deleted file mode 100644 index a78162995d90bf079ad091cf14cb9f2cd4476d05..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/thread_pool.h +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright (c) 2012 Jakob Progsch, Václav Zeman - -// This software is provided 'as-is', without any express or implied -// warranty. In no event will the authors be held liable for any damages -// arising from the use of this software. - -// Permission is granted to anyone to use this software for any purpose, -// including commercial applications, and to alter it and redistribute it -// freely, subject to the following restrictions: - -// 1. The origin of this software must not be misrepresented; you must not -// claim that you wrote the original software. If you use this software -// in a product, an acknowledgment in the product documentation would be -// appreciated but is not required. - -// 2. Altered source versions must be plainly marked as such, and must not be -// misrepresented as being the original software. - -// 3. This notice may not be removed or altered from any source -// distribution. - -#ifndef UTILS_THREAD_POOL_H_ -#define UTILS_THREAD_POOL_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -class ThreadPool { - public: - explicit ThreadPool(size_t); - template - auto enqueue(F&& f, Args&&... args) - -> std::future::type>; - ~ThreadPool(); - - private: - // need to keep track of threads so we can join them - std::vector workers; - // the task queue - std::queue > tasks; - - // synchronization - std::mutex queue_mutex; - std::condition_variable condition; - bool stop; -}; - -// the constructor just launches some amount of workers -inline ThreadPool::ThreadPool(size_t threads) : stop(false) { - for (size_t i = 0; i < threads; ++i) - workers.emplace_back([this] { - for (;;) { - std::function task; - - { - std::unique_lock lock(this->queue_mutex); - this->condition.wait( - lock, [this] { return this->stop || !this->tasks.empty(); }); - if (this->stop && this->tasks.empty()) return; - task = std::move(this->tasks.front()); - this->tasks.pop(); - } - - task(); - } - }); -} - -// add new work item to the pool -template -auto ThreadPool::enqueue(F&& f, Args&&... 
args) - -> std::future::type> { - using return_type = typename std::result_of::type; - - auto task = std::make_shared >( - std::bind(std::forward(f), std::forward(args)...)); - - std::future res = task->get_future(); - { - std::unique_lock lock(queue_mutex); - - // don't allow enqueueing after stopping the pool - if (stop) { - throw std::runtime_error("enqueue on stopped ThreadPool"); - } - - tasks.emplace([task]() { (*task)(); }); - } - condition.notify_one(); - return res; -} - -// the destructor joins all threads -inline ThreadPool::~ThreadPool() { - { - std::unique_lock lock(queue_mutex); - stop = true; - } - condition.notify_all(); - for (std::thread& worker : workers) { - worker.join(); - } -} - -#endif // UTILS_THREAD_POOL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/timer.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/timer.h deleted file mode 100644 index 068519f98d140ba0eef68babe2ad2fdcb798c074..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/timer.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_TIMER_H_ -#define UTILS_TIMER_H_ - -#include - -namespace wenet { - -class Timer { - public: - Timer() : time_start_(std::chrono::steady_clock::now()) {} - void Reset() { time_start_ = std::chrono::steady_clock::now(); } - // return int in milliseconds - int Elapsed() const { - auto time_now = std::chrono::steady_clock::now(); - return std::chrono::duration_cast(time_now - - time_start_) - .count(); - } - - private: - std::chrono::time_point time_start_; -}; -} // namespace wenet - -#endif // UTILS_TIMER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/utils.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/utils.cc deleted file mode 100644 index c37e36c6e9f629e0a4b11cf21a791aefd58b659f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/utils.cc +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "utils/utils.h" - -#include -#include -#include -#include -#include -#include - -#include "utils/log.h" - -namespace wenet { - -float LogAdd(float x, float y) { - static float num_min = -std::numeric_limits::max(); - if (x <= num_min) return y; - if (y <= num_min) return x; - float xmax = std::max(x, y); - return std::log(std::exp(x - xmax) + std::exp(y - xmax)) + xmax; -} - -template -struct ValueComp { - bool operator()(const std::pair& lhs, - const std::pair& rhs) const { - return lhs.first > rhs.first || - (lhs.first == rhs.first && lhs.second < rhs.second); - } -}; - -// We refer the pytorch topk implementation -// https://github.com/pytorch/pytorch/blob/master/caffe2/operators/top_k.cc -template -void TopK(const std::vector& data, int32_t k, std::vector* values, - std::vector* indices) { - std::vector> heap_data; - int n = data.size(); - for (int32_t i = 0; i < k && i < n; ++i) { - heap_data.emplace_back(data[i], i); - } - std::priority_queue, std::vector>, - ValueComp> - pq(ValueComp(), std::move(heap_data)); - for (int32_t i = k; i < n; ++i) { - if (pq.top().first < data[i]) { - pq.pop(); - pq.emplace(data[i], i); - } - } - - values->resize(std::min(k, n)); - indices->resize(std::min(k, n)); - int32_t cur = values->size() - 1; - while (!pq.empty()) { - const auto& item = pq.top(); - (*values)[cur] = item.first; - (*indices)[cur] = item.second; - pq.pop(); - cur -= 1; - } -} - -template void TopK(const std::vector& data, int32_t k, - std::vector* values, - std::vector* indices); - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/utils.h deleted file mode 100644 index f9957c0b6e8ae27d9260e75cf55e786055827801..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/utils/utils.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef UTILS_UTILS_H_ -#define UTILS_UTILS_H_ - -#include -#include -#include - -namespace wenet { - -#define WENET_DISALLOW_COPY_AND_ASSIGN(Type) \ - Type(const Type&) = delete; \ - Type& operator=(const Type&) = delete; - -const float kFloatMax = std::numeric_limits::max(); -// kSpaceSymbol in UTF-8 is: ▁ -const char kSpaceSymbol[] = "\xe2\x96\x81"; - -// Return the sum of two probabilities in log scale -float LogAdd(float x, float y); - -template -void TopK(const std::vector& data, int32_t k, std::vector* values, - std::vector* indices); - -} // namespace wenet - -#endif // UTILS_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/websocket/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/websocket/CMakeLists.txt deleted file mode 100644 index 67447c42d977f120fc39cdab0d052b011edd3efe..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/websocket/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -add_library(websocket STATIC - websocket_client.cc - websocket_server.cc -) -target_link_libraries(websocket PUBLIC decoder) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/websocket/websocket_client.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/websocket/websocket_client.cc deleted file mode 100644 index c0394e6250153e2d59636c9eab62badc4a737d16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/websocket/websocket_client.cc +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "websocket/websocket_client.h" - -#include "boost/json/src.hpp" - -#include "utils/log.h" - -namespace wenet { - -namespace beast = boost::beast; // from -namespace http = beast::http; // from -namespace websocket = beast::websocket; // from -namespace asio = boost::asio; // from -using tcp = boost::asio::ip::tcp; // from -namespace json = boost::json; - -WebSocketClient::WebSocketClient(const std::string& hostname, int port) - : hostname_(hostname), port_(port) { - Connect(); - t_.reset(new std::thread(&WebSocketClient::ReadLoopFunc, this)); -} - -void WebSocketClient::Connect() { - tcp::resolver resolver{ioc_}; - // Look up the domain name - auto const results = resolver.resolve(hostname_, std::to_string(port_)); - // Make the connection on the IP address we get from a lookup - auto ep = asio::connect(ws_.next_layer(), results); - // Provide the value of the Host HTTP header during the WebSocket handshake. 
- // See https://tools.ietf.org/html/rfc7230#section-5.4 - std::string host = hostname_ + ":" + std::to_string(ep.port()); - // Perform the websocket handshake - ws_.handshake(host, "/"); -} - -void WebSocketClient::SendTextData(const std::string& data) { - ws_.text(true); - ws_.write(asio::buffer(data)); -} - -void WebSocketClient::SendBinaryData(const void* data, size_t size) { - ws_.binary(true); - ws_.write(asio::buffer(data, size)); -} - -void WebSocketClient::Close() { ws_.close(websocket::close_code::normal); } - -void WebSocketClient::ReadLoopFunc() { - try { - while (true) { - beast::flat_buffer buffer; - ws_.read(buffer); - std::string message = beast::buffers_to_string(buffer.data()); - LOG(INFO) << message; - CHECK(ws_.got_text()); - json::object obj = json::parse(message).as_object(); - if (obj["status"] != "ok") { - break; - } - if (obj["type"] == "speech_end") { - done_ = true; - break; - } - } - } catch (beast::system_error const& se) { - // This indicates that the session was closed - if (se.code() != websocket::error::closed) { - LOG(ERROR) << se.code().message(); - } - } catch (std::exception const& e) { - LOG(ERROR) << e.what(); - } -} - -void WebSocketClient::Join() { t_->join(); } - -void WebSocketClient::SendStartSignal() { - // TODO(Binbin Zhang): Add sample rate and other setting support - json::value start_tag = {{"signal", "start"}, - {"nbest", nbest_}, - {"continuous_decoding", continuous_decoding_}}; - std::string start_message = json::serialize(start_tag); - this->SendTextData(start_message); -} - -void WebSocketClient::SendEndSignal() { - json::value end_tag = {{"signal", "end"}}; - std::string end_message = json::serialize(end_tag); - this->SendTextData(end_message); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/websocket/websocket_client.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/websocket/websocket_client.h deleted file mode 100644 index 76ec3aa451d31c7ee6b158ce21c8acdc10575eb3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/websocket/websocket_client.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef WEBSOCKET_WEBSOCKET_CLIENT_H_ -#define WEBSOCKET_WEBSOCKET_CLIENT_H_ - -#include -#include -#include -#include - -#include "boost/asio/connect.hpp" -#include "boost/asio/ip/tcp.hpp" -#include "boost/beast/core.hpp" -#include "boost/beast/websocket.hpp" - -#include "utils/utils.h" - -namespace wenet { - -namespace beast = boost::beast; // from -namespace http = beast::http; // from -namespace websocket = beast::websocket; // from -namespace asio = boost::asio; // from -using tcp = boost::asio::ip::tcp; // from - -class WebSocketClient { - public: - WebSocketClient(const std::string& host, int port); - - void SendTextData(const std::string& data); - void SendBinaryData(const void* data, size_t size); - void ReadLoopFunc(); - void Close(); - void Join(); - void SendStartSignal(); - void SendEndSignal(); - void set_nbest(int nbest) { nbest_ = nbest; } - void set_continuous_decoding(bool continuous_decoding) { - continuous_decoding_ = continuous_decoding; - } - bool done() const { return done_; } - - private: - void Connect(); - std::string hostname_; - int port_; - int nbest_ = 1; - bool continuous_decoding_ = false; - bool done_ = false; - asio::io_context ioc_; - websocket::stream ws_{ioc_}; - std::unique_ptr t_{nullptr}; - - WENET_DISALLOW_COPY_AND_ASSIGN(WebSocketClient); -}; - -} // namespace wenet - -#endif // WEBSOCKET_WEBSOCKET_CLIENT_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/websocket/websocket_server.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/websocket/websocket_server.cc deleted file mode 100644 index 52ab088f46d59b9f3f1add1e34d3aceae290f5da..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/websocket/websocket_server.cc +++ /dev/null @@ -1,267 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "websocket/websocket_server.h" - -#include -#include -#include - -#include "boost/json/src.hpp" -#include "utils/log.h" - -namespace wenet { - -namespace beast = boost::beast; // from -namespace http = beast::http; // from -namespace websocket = beast::websocket; // from -namespace asio = boost::asio; // from -using tcp = boost::asio::ip::tcp; // from -namespace json = boost::json; - -ConnectionHandler::ConnectionHandler( - tcp::socket&& socket, std::shared_ptr feature_config, - std::shared_ptr decode_config, - std::shared_ptr decode_resource) - : ws_(std::move(socket)), - feature_config_(std::move(feature_config)), - decode_config_(std::move(decode_config)), - decode_resource_(std::move(decode_resource)) {} - -void ConnectionHandler::OnSpeechStart() { - LOG(INFO) << "Received speech start signal, start reading speech"; - got_start_tag_ = true; - json::value rv = {{"status", "ok"}, {"type", "server_ready"}}; - ws_.text(true); - ws_.write(asio::buffer(json::serialize(rv))); - feature_pipeline_ = std::make_shared(*feature_config_); - decoder_ = std::make_shared(feature_pipeline_, decode_resource_, - *decode_config_); - // Start decoder thread - decode_thread_ = - std::make_shared(&ConnectionHandler::DecodeThreadFunc, this); -} - -void ConnectionHandler::OnSpeechEnd() { - LOG(INFO) << "Received speech end signal"; - if (feature_pipeline_ != nullptr) { - feature_pipeline_->set_input_finished(); - } - got_end_tag_ = true; -} - -void ConnectionHandler::OnPartialResult(const std::string& result) { - LOG(INFO) << "Partial result: " << result; - json::value rv = { - {"status", "ok"}, {"type", "partial_result"}, {"nbest", result}}; - ws_.text(true); - ws_.write(asio::buffer(json::serialize(rv))); -} - -void ConnectionHandler::OnFinalResult(const std::string& result) { - LOG(INFO) << "Final result: " << result; - json::value rv = { - {"status", "ok"}, {"type", "final_result"}, {"nbest", result}}; - ws_.text(true); - ws_.write(asio::buffer(json::serialize(rv))); -} - -void ConnectionHandler::OnFinish() { - // Send finish tag - json::value rv = {{"status", "ok"}, {"type", "speech_end"}}; - ws_.text(true); - ws_.write(asio::buffer(json::serialize(rv))); -} - -void ConnectionHandler::OnSpeechData(const beast::flat_buffer& buffer) { - // Read binary PCM data - int num_samples = buffer.size() / sizeof(int16_t); - VLOG(2) << "Received " << num_samples << " samples"; - CHECK(feature_pipeline_ != nullptr); - CHECK(decoder_ != nullptr); - const auto* pcm_data = static_cast(buffer.data().data()); - feature_pipeline_->AcceptWaveform(pcm_data, num_samples); -} - -std::string ConnectionHandler::SerializeResult(bool finish) { - json::array nbest; - for (const DecodeResult& path : decoder_->result()) { - json::object jpath({{"sentence", path.sentence}}); - if (finish) { - json::array word_pieces; - for (const WordPiece& word_piece : path.word_pieces) { - json::object jword_piece({{"word", word_piece.word}, - {"start", word_piece.start}, - {"end", word_piece.end}}); - word_pieces.emplace_back(jword_piece); - } - jpath.emplace("word_pieces", word_pieces); - } - nbest.emplace_back(jpath); - - if (nbest.size() == nbest_) { - break; - } - } - return json::serialize(nbest); -} - -void ConnectionHandler::DecodeThreadFunc() { - try { - while (true) { - DecodeState state = decoder_->Decode(); - if (state == DecodeState::kEndFeats) { - decoder_->Rescoring(); - std::string result = SerializeResult(true); - OnFinalResult(result); - OnFinish(); - stop_recognition_ = true; - break; - } else if (state == 
DecodeState::kEndpoint) { - decoder_->Rescoring(); - std::string result = SerializeResult(true); - OnFinalResult(result); - // If it's not continuous decoding, continue to do next recognition - // otherwise stop the recognition - if (continuous_decoding_) { - decoder_->ResetContinuousDecoding(); - } else { - OnFinish(); - stop_recognition_ = true; - break; - } - } else { - if (decoder_->DecodedSomething()) { - std::string result = SerializeResult(false); - OnPartialResult(result); - } - } - } - } catch (std::exception const& e) { - LOG(ERROR) << e.what(); - } -} - -void ConnectionHandler::OnError(const std::string& message) { - json::value rv = {{"status", "failed"}, {"message", message}}; - ws_.text(true); - ws_.write(asio::buffer(json::serialize(rv))); - // Close websocket - ws_.close(websocket::close_code::normal); -} - -void ConnectionHandler::OnText(const std::string& message) { - json::value v = json::parse(message); - if (v.is_object()) { - json::object obj = v.get_object(); - if (obj.find("signal") != obj.end()) { - json::string signal = obj["signal"].as_string(); - if (signal == "start") { - if (obj.find("nbest") != obj.end()) { - if (obj["nbest"].is_int64()) { - nbest_ = obj["nbest"].as_int64(); - } else { - OnError("integer is expected for nbest option"); - } - } - if (obj.find("continuous_decoding") != obj.end()) { - if (obj["continuous_decoding"].is_bool()) { - continuous_decoding_ = obj["continuous_decoding"].as_bool(); - } else { - OnError( - "boolean true or false is expected for " - "continuous_decoding option"); - } - } - OnSpeechStart(); - } else if (signal == "end") { - OnSpeechEnd(); - } else { - OnError("Unexpected signal type"); - } - } else { - OnError("Wrong message header"); - } - } else { - OnError("Wrong protocol"); - } -} - -void ConnectionHandler::operator()() { - try { - // Accept the websocket handshake - ws_.accept(); - for (;;) { - // This buffer will hold the incoming message - beast::flat_buffer buffer; - // Read a message - ws_.read(buffer); - if (ws_.got_text()) { - std::string message = beast::buffers_to_string(buffer.data()); - LOG(INFO) << message; - OnText(message); - if (got_end_tag_) { - break; - } - } else { - if (!got_start_tag_) { - OnError("Start signal is expected before binary data"); - } else { - if (stop_recognition_) { - break; - } - OnSpeechData(buffer); - } - } - } - - LOG(INFO) << "Read all pcm data, wait for decoding thread"; - if (decode_thread_ != nullptr) { - decode_thread_->join(); - } - } catch (beast::system_error const& se) { - LOG(INFO) << se.code().message(); - // This indicates that the session was closed - if (se.code() == websocket::error::closed) { - OnSpeechEnd(); - } - if (decode_thread_ != nullptr) { - decode_thread_->join(); - } - } catch (std::exception const& e) { - LOG(ERROR) << e.what(); - } -} - -void WebSocketServer::Start() { - try { - auto const address = asio::ip::make_address("0.0.0.0"); - tcp::acceptor acceptor{ioc_, {address, static_cast(port_)}}; - for (;;) { - // This will receive the new connection - tcp::socket socket{ioc_}; - // Block until we get a connection - acceptor.accept(socket); - // Launch the session, transferring ownership of the socket - ConnectionHandler handler(std::move(socket), feature_config_, - decode_config_, decode_resource_); - std::thread t(std::move(handler)); - t.detach(); - } - } catch (const std::exception& e) { - LOG(FATAL) << e.what(); - } -} - -} // namespace wenet diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/websocket/websocket_server.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/websocket/websocket_server.h deleted file mode 100644 index a1241834221dcf93c34d6414bd9b5ae40ef1cf38..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/onnxruntime/websocket/websocket_server.h +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef WEBSOCKET_WEBSOCKET_SERVER_H_ -#define WEBSOCKET_WEBSOCKET_SERVER_H_ - -#include -#include -#include -#include -#include - -#include "boost/asio/connect.hpp" -#include "boost/asio/ip/tcp.hpp" -#include "boost/beast/core.hpp" -#include "boost/beast/websocket.hpp" - -#include "decoder/asr_decoder.h" -#include "frontend/feature_pipeline.h" -#include "utils/log.h" - -namespace wenet { - -namespace beast = boost::beast; // from -namespace http = beast::http; // from -namespace websocket = beast::websocket; // from -namespace asio = boost::asio; // from -using tcp = boost::asio::ip::tcp; // from - -class ConnectionHandler { - public: - ConnectionHandler(tcp::socket&& socket, - std::shared_ptr feature_config, - std::shared_ptr decode_config, - std::shared_ptr decode_resource_); - void operator()(); - - private: - void OnSpeechStart(); - void OnSpeechEnd(); - void OnText(const std::string& message); - void OnFinish(); - void OnSpeechData(const beast::flat_buffer& buffer); - void OnError(const std::string& message); - void OnPartialResult(const std::string& result); - void OnFinalResult(const std::string& result); - void DecodeThreadFunc(); - std::string SerializeResult(bool finish); - - bool continuous_decoding_ = false; - int nbest_ = 1; - websocket::stream ws_; - std::shared_ptr feature_config_; - std::shared_ptr decode_config_; - std::shared_ptr decode_resource_; - - bool got_start_tag_ = false; - bool got_end_tag_ = false; - // When endpoint is detected, stop recognition, and stop receiving data. 
-  bool stop_recognition_ = false;
-  std::shared_ptr feature_pipeline_ = nullptr;
-  std::shared_ptr decoder_ = nullptr;
-  std::shared_ptr decode_thread_ = nullptr;
-};
-
-class WebSocketServer {
- public:
-  WebSocketServer(int port,
-                  std::shared_ptr feature_config,
-                  std::shared_ptr decode_config,
-                  std::shared_ptr decode_resource)
-      : port_(port),
-        feature_config_(std::move(feature_config)),
-        decode_config_(std::move(decode_config)),
-        decode_resource_(std::move(decode_resource)) {}
-
-  void Start();
-
- private:
-  int port_;
-  // The io_context is required for all I/O
-  asio::io_context ioc_{1};
-  std::shared_ptr feature_config_;
-  std::shared_ptr decode_config_;
-  std::shared_ptr decode_resource_;
-  WENET_DISALLOW_COPY_AND_ASSIGN(WebSocketServer);
-};
-
-}  // namespace wenet
-
-#endif  // WEBSOCKET_WEBSOCKET_SERVER_H_
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/CMakeLists.txt
deleted file mode 100644
index 492b84242b48c3ccb2e62e997ebb182e98dbecc2..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-../libtorch/CMakeLists.txt
\ No newline at end of file
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/README.md b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/README.md
deleted file mode 100644
index 9cb1943e7e7763cb5e17ae62037b2f67e8f1b417..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/README.md
+++ /dev/null
@@ -1,52 +0,0 @@
-# WeNet & Raspberry PI (Cross Compile)
-
-* Step 1. Install cross compile tools in the PC.
-
-``` sh
-sudo apt-get install gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
-```
-
-Or download, and install the binaries from: https://releases.linaro.org/components/toolchain/binaries/latest-7
-
-
-* Step 2. Export your experiment model to ONNX by https://github.com/wenet-e2e/wenet/blob/main/wenet/bin/export_onnx_cpu.py
-
-``` sh
-exp=exp # Change it to your experiment dir
-onnx_dir=onnx
-python -m wenet.bin.export_onnx_cpu \
-  --config $exp/train.yaml \
-  --checkpoint $exp/final.pt \
-  --chunk_size 16 \
-  --output_dir $onnx_dir \
-  --num_decoding_left_chunks -1
-
-# When it finishes, you can find `encoder.onnx(.quant)`, `ctc.onnx(.quant)`, and `decoder.onnx(.quant)` in the $onnx_dir respectively.
-# We use the quantified to speed up the inference, so rename it without the suffix `.quant`
-```
-
-* Step 3. Build. The build requires cmake 3.14 or above. and Send the binary and libraries to Raspberry PI.
-
-``` sh
-cmake -B build -DONNX=ON -DTORCH=OFF -DWEBSOCKET=OFF -DGRPC=OFF -DCMAKE_TOOLCHAIN_FILE=toolchains/aarch64-linux-gnu.toolchain.cmake
-cmake --build build
-scp build/bin/decoder_main pi@xxx.xxx.xxx:/path/to/wenet
-scp fc_base/onnxruntime-src/lib/libonnxruntime.so* pi@xxx.xxx.xxx:/path/to/wenet
-```
-
-* Step 4. Testing, the RTF(real time factor) is shown in Raspberry PI's console.
-
-``` sh
-cd /path/to/wenet
-export LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH
-export GLOG_logtostderr=1
-export GLOG_v=2
-wav_path=your_test_wav_path
-onnx_dir=your_model_dir
-units=units.txt # Change it to your model units path
-./build/bin/decoder_main \
-  --chunk_size 16 \
-  --wav_path $wav_path \
-  --onnx_dir $onnx_dir \
-  --unit_path $units 2>&1 | tee log.txt
-```
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/api/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/api/CMakeLists.txt
deleted file mode 100644
index 8d61ca8477f0f0b6128f1effe0a2738494b2620f..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/api/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-if(TORCH)
-  add_library(wenet_api SHARED wenet_api.cc)
-  target_link_libraries(wenet_api PUBLIC decoder)
-endif()
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/api/README.md b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/api/README.md
deleted file mode 100644
index 5eaa13b977eb4836eb930452f4434dc9f2ea4139..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/api/README.md
+++ /dev/null
@@ -1,15 +0,0 @@
-# WeNet API
-
-We refer [vosk](https://github.com/alphacep/vosk-api/blob/master/src/vosk_api.h)
-for the interface design.
-
-
-We are going to implement the following interfaces:
-
-- [x] non-streaming recognition
-- [] streaming recognition
-- [] nbest
-- [] contextual biasing word
-- [] alignment
-- [] language support(post processor)
-- [] label check
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/api/wenet_api.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/api/wenet_api.cc
deleted file mode 100644
index cb1e0c8552e0126e2db274a29075578fe351a25f..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/api/wenet_api.cc
+++ /dev/null
@@ -1,245 +0,0 @@
-// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com)
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "api/wenet_api.h" - -#include -#include -#include - -#include "decoder/asr_decoder.h" -#include "decoder/torch_asr_model.h" -#include "post_processor/post_processor.h" -#include "utils/file.h" -#include "utils/json.h" -#include "utils/string.h" - -class Recognizer { - public: - explicit Recognizer(const std::string& model_dir) { - // FeaturePipeline init - feature_config_ = std::make_shared(80, 16000); - feature_pipeline_ = - std::make_shared(*feature_config_); - // Resource init - resource_ = std::make_shared(); - wenet::TorchAsrModel::InitEngineThreads(); - std::string model_path = wenet::JoinPath(model_dir, "final.zip"); - CHECK(wenet::FileExists(model_path)); - - auto model = std::make_shared(); - model->Read(model_path); - resource_->model = model; - - // units.txt: E2E model unit - std::string unit_path = wenet::JoinPath(model_dir, "units.txt"); - CHECK(wenet::FileExists(unit_path)); - resource_->unit_table = std::shared_ptr( - fst::SymbolTable::ReadText(unit_path)); - - std::string fst_path = wenet::JoinPath(model_dir, "TLG.fst"); - if (wenet::FileExists(fst_path)) { // With LM - resource_->fst = std::shared_ptr>( - fst::Fst::Read(fst_path)); - - std::string symbol_path = wenet::JoinPath(model_dir, "words.txt"); - CHECK(wenet::FileExists(symbol_path)); - resource_->symbol_table = std::shared_ptr( - fst::SymbolTable::ReadText(symbol_path)); - } else { // Without LM, symbol_table is the same as unit_table - resource_->symbol_table = resource_->unit_table; - } - - // Context config init - context_config_ = std::make_shared(); - decode_options_ = std::make_shared(); - post_process_opts_ = std::make_shared(); - } - - void Reset() { - if (feature_pipeline_ != nullptr) { - feature_pipeline_->Reset(); - } - if (decoder_ != nullptr) { - decoder_->Reset(); - } - result_.clear(); - } - - void InitDecoder() { - CHECK(decoder_ == nullptr); - // Optional init context graph - if (context_.size() > 0) { - context_config_->context_score = context_score_; - auto context_graph = - std::make_shared(*context_config_); - context_graph->BuildContextGraph(context_, resource_->symbol_table); - resource_->context_graph = context_graph; - } - // PostProcessor - if (language_ == "chs") { // TODO(Binbin Zhang): CJK(chs, jp, kr) - post_process_opts_->language_type = wenet::kMandarinEnglish; - } else { - post_process_opts_->language_type = wenet::kIndoEuropean; - } - resource_->post_processor = - std::make_shared(*post_process_opts_); - // Init decoder - decoder_ = std::make_shared(feature_pipeline_, resource_, - *decode_options_); - } - - void Decode(const char* data, int len, int last) { - using wenet::DecodeState; - // Init decoder when it is called first time - if (decoder_ == nullptr) { - InitDecoder(); - } - // Convert to 16 bits PCM data to float - CHECK_EQ(len % 2, 0); - feature_pipeline_->AcceptWaveform(reinterpret_cast(data), - len / 2); - if (last > 0) { - feature_pipeline_->set_input_finished(); - } - - while (true) { - DecodeState state = decoder_->Decode(false); - if (state == DecodeState::kWaitFeats) { - break; - } else if (state == DecodeState::kEndFeats) { - decoder_->Rescoring(); - UpdateResult(true); - break; - } else if (state == DecodeState::kEndpoint && continuous_decoding_) { - decoder_->Rescoring(); - UpdateResult(true); - decoder_->ResetContinuousDecoding(); - } else { // kEndBatch - UpdateResult(false); - } - } - } - - void UpdateResult(bool final_result) { - json::JSON obj; - obj["type"] = final_result ? "final_result" : "partial_result"; - int nbest = final_result ? 
nbest_ : 1; - obj["nbest"] = json::Array(); - for (int i = 0; i < nbest && i < decoder_->result().size(); i++) { - json::JSON one; - one["sentence"] = decoder_->result()[i].sentence; - if (final_result && enable_timestamp_) { - one["word_pieces"] = json::Array(); - for (const auto& word_piece : decoder_->result()[i].word_pieces) { - json::JSON piece; - piece["word"] = word_piece.word; - piece["start"] = word_piece.start; - piece["end"] = word_piece.end; - one["word_pieces"].append(piece); - } - } - one["sentence"] = decoder_->result()[i].sentence; - obj["nbest"].append(one); - } - result_ = obj.dump(); - } - - const char* GetResult() { return result_.c_str(); } - - void set_nbest(int n) { nbest_ = n; } - void set_enable_timestamp(bool flag) { enable_timestamp_ = flag; } - void AddContext(const char* word) { context_.emplace_back(word); } - void set_context_score(float score) { context_score_ = score; } - void set_language(const char* lang) { language_ = lang; } - void set_continuous_decoding(bool flag) { continuous_decoding_ = flag; } - - private: - // NOTE(Binbin Zhang): All use shared_ptr for clone in the future - std::shared_ptr feature_config_ = nullptr; - std::shared_ptr feature_pipeline_ = nullptr; - std::shared_ptr resource_ = nullptr; - std::shared_ptr decode_options_ = nullptr; - std::shared_ptr decoder_ = nullptr; - std::shared_ptr context_config_ = nullptr; - std::shared_ptr post_process_opts_ = nullptr; - - int nbest_ = 1; - std::string result_; - bool enable_timestamp_ = false; - std::vector context_; - float context_score_; - std::string language_ = "chs"; - bool continuous_decoding_ = false; -}; - -void* wenet_init(const char* model_dir) { - Recognizer* decoder = new Recognizer(model_dir); - return reinterpret_cast(decoder); -} - -void wenet_free(void* decoder) { - delete reinterpret_cast(decoder); -} - -void wenet_reset(void* decoder) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->Reset(); -} - -void wenet_decode(void* decoder, const char* data, int len, int last) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->Decode(data, len, last); -} - -const char* wenet_get_result(void* decoder) { - Recognizer* recognizer = reinterpret_cast(decoder); - return recognizer->GetResult(); -} - -void wenet_set_log_level(int level) { - FLAGS_logtostderr = true; - FLAGS_v = level; -} - -void wenet_set_nbest(void* decoder, int n) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->set_nbest(n); -} - -void wenet_set_timestamp(void* decoder, int flag) { - Recognizer* recognizer = reinterpret_cast(decoder); - bool enable = flag > 0 ? 
true : false; - recognizer->set_enable_timestamp(enable); -} - -void wenet_add_context(void* decoder, const char* word) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->AddContext(word); -} - -void wenet_set_context_score(void* decoder, float score) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->set_context_score(score); -} - -void wenet_set_language(void* decoder, const char* lang) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->set_language(lang); -} - -void wenet_set_continuous_decoding(void* decoder, int flag) { - Recognizer* recognizer = reinterpret_cast(decoder); - recognizer->set_continuous_decoding(flag > 0); -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/api/wenet_api.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/api/wenet_api.h deleted file mode 100644 index e839aaa40166a6e50d9aa2ac0e697356bd25b941..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/api/wenet_api.h +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef API_WENET_API_H_ -#define API_WENET_API_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -/** Init decoder from the file and returns the object - * - * @param model_dir: the model dir - * @returns model object or NULL if problem occured - */ -void* wenet_init(const char* model_dir); - -/** Free wenet decoder and corresponding resource - */ -void wenet_free(void* decoder); - -/** Reset decoder for next decoding - */ -void wenet_reset(void* decoder); - -/** Decode the input wav data - * @param data: pcm data, encoded as int16_t(16 bits) - * @param len: data length - * @param last: if it is the last package - */ -void wenet_decode(void* decoder, const char* data, int len, int last); - -/** Get decode result in json format - * It returns partial result when last is 0 - * It returns final result when last is 1 - - { - "nbest" : [{ - "sentence" : "are you okay" - "word_pieces" : [{ - "end" : 960, - "start" : 0, - "word" : "are" - }, { - "end" : 1200, - "start" : 960, - "word" : "you" - }, { - ...}] - }, { - "sentence" : "are you ok" - }], - "type" : "final_result" - } - - "type": final_result/partial_result - "nbest": nbest is enabled when n > 1 in final_result - "sentence": the ASR result - "word_pieces": optional, output timestamp when enabled - */ -const char* wenet_get_result(void* decoder); - -/** Set n-best, range 1~10 - * wenet_get_result will return top-n best results - */ -void wenet_set_nbest(void* decoder, int n); - -/** Whether to enable word level timestamp in results - disable it when flag = 0, otherwise enable - */ -void wenet_set_timestamp(void* decoder, int flag); - -/** Add one contextual biasing - */ -void wenet_add_context(void* decoder, const char* word); - -/** Set contextual biasing bonus score - */ -void wenet_set_context_score(void* decoder, float 
score); - -/** Set language, has effect on the postpocessing - * @param: lang, could be chs/en now - */ -void wenet_set_language(void* decoder, const char* lang); - -/** Set log level - * We use glog in wenet, so the level is the glog level - */ -void wenet_set_log_level(int level); - -/** Enable continous decoding or not - * flag > 0: enable, otherwise disable - */ -void wenet_set_continuous_decoding(void* decoder, int flag); - -#ifdef __cplusplus -} -#endif - -#endif // API_WENET_API_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/CMakeLists.txt deleted file mode 100644 index a117b8bcb580c8738a7ce72f88bc10ff0a450e98..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -add_executable(decoder_main decoder_main.cc) -target_link_libraries(decoder_main PUBLIC decoder) - -add_executable(label_checker_main label_checker_main.cc) -target_link_libraries(label_checker_main PUBLIC decoder) - -# if(TORCH) -# add_executable(api_main api_main.cc) -# target_link_libraries(api_main PUBLIC wenet_api) -# endif() - -if(WEBSOCKET) - add_executable(websocket_client_main websocket_client_main.cc) - target_link_libraries(websocket_client_main PUBLIC websocket) - add_executable(websocket_server_main websocket_server_main.cc) - target_link_libraries(websocket_server_main PUBLIC websocket) -endif() - -if(GRPC) - add_executable(grpc_server_main grpc_server_main.cc) - target_link_libraries(grpc_server_main PUBLIC wenet_grpc) - add_executable(grpc_client_main grpc_client_main.cc) - target_link_libraries(grpc_client_main PUBLIC wenet_grpc) -endif() - -if(HTTP) - add_executable(http_client_main http_client_main.cc) - target_link_libraries(http_client_main PUBLIC http) - add_executable(http_server_main http_server_main.cc) - target_link_libraries(http_server_main PUBLIC http) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/api_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/api_main.cc deleted file mode 100644 index 94b20d52a7b8eee5c39a12af4e1e25324d7d880f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/api_main.cc +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "api/wenet_api.h" -#include "frontend/wav.h" -#include "utils/flags.h" - -DEFINE_string(model_dir, "", "model dir path"); -DEFINE_string(wav_path, "", "single wave path"); -DEFINE_bool(enable_timestamp, false, "enable timestamps"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - wenet_set_log_level(2); - - void* decoder = wenet_init(FLAGS_model_dir.c_str()); - wenet_set_timestamp(decoder, FLAGS_enable_timestamp == true ? 1 : 0); - wenet::WavReader wav_reader(FLAGS_wav_path); - std::vector data(wav_reader.num_samples()); - for (int i = 0; i < wav_reader.num_samples(); i++) { - data[i] = static_cast(*(wav_reader.data() + i)); - } - - for (int i = 0; i < 10; i++) { - // Return the final result when last is 1 - wenet_decode(decoder, reinterpret_cast(data.data()), - data.size() * 2, 1); - const char* result = wenet_get_result(decoder); - LOG(INFO) << i << " " << result; - wenet_reset(decoder); - } - wenet_free(decoder); - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/decoder_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/decoder_main.cc deleted file mode 100644 index b8f1dbae6b88390504cc9ce63f33dc9bd54a2d6a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/decoder_main.cc +++ /dev/null @@ -1,185 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include - -#include "decoder/params.h" -#include "frontend/wav.h" -#include "utils/flags.h" -#include "utils/string.h" -#include "utils/thread_pool.h" -#include "utils/timer.h" -#include "utils/utils.h" - -DEFINE_bool(simulate_streaming, false, "simulate streaming input"); -DEFINE_bool(output_nbest, false, "output n-best of decode result"); -DEFINE_string(wav_path, "", "single wave path"); -DEFINE_string(wav_scp, "", "input wav scp"); -DEFINE_string(result, "", "result output file"); -DEFINE_bool(continuous_decoding, false, "continuous decoding mode"); -DEFINE_int32(thread_num, 1, "num of decode thread"); -DEFINE_int32(warmup, 0, "num of warmup decode, 0 means no warmup"); - -std::shared_ptr g_decode_config; -std::shared_ptr g_feature_config; -std::shared_ptr g_decode_resource; - -std::ofstream g_result; -std::mutex g_mutex; -int g_total_waves_dur = 0; -int g_total_decode_time = 0; - -void decode(std::pair wav, bool warmup = false) { - wenet::WavReader wav_reader(wav.second); - int num_samples = wav_reader.num_samples(); - CHECK_EQ(wav_reader.sample_rate(), FLAGS_sample_rate); - - auto feature_pipeline = - std::make_shared(*g_feature_config); - feature_pipeline->AcceptWaveform(wav_reader.data(), num_samples); - feature_pipeline->set_input_finished(); - LOG(INFO) << "num frames " << feature_pipeline->num_frames(); - - wenet::AsrDecoder decoder(feature_pipeline, g_decode_resource, - *g_decode_config); - - int wave_dur = static_cast(static_cast(num_samples) / - wav_reader.sample_rate() * 1000); - int decode_time = 0; - std::string final_result; - while (true) { - wenet::Timer timer; - wenet::DecodeState state = decoder.Decode(); - if (state == wenet::DecodeState::kEndFeats) { - decoder.Rescoring(); - } - int chunk_decode_time = timer.Elapsed(); - decode_time += chunk_decode_time; - if (decoder.DecodedSomething()) { - LOG(INFO) << "Partial result: " << decoder.result()[0].sentence; - } - - if (FLAGS_continuous_decoding && state == wenet::DecodeState::kEndpoint) { - if (decoder.DecodedSomething()) { - decoder.Rescoring(); - LOG(INFO) << "Final result (continuous decoding): " - << decoder.result()[0].sentence; - final_result.append(decoder.result()[0].sentence); - } - decoder.ResetContinuousDecoding(); - } - - if (state == wenet::DecodeState::kEndFeats) { - break; - } else if (FLAGS_chunk_size > 0 && FLAGS_simulate_streaming) { - float frame_shift_in_ms = - static_cast(g_feature_config->frame_shift) / - wav_reader.sample_rate() * 1000; - auto wait_time = - decoder.num_frames_in_current_chunk() * frame_shift_in_ms - - chunk_decode_time; - if (wait_time > 0) { - LOG(INFO) << "Simulate streaming, waiting for " << wait_time << "ms"; - std::this_thread::sleep_for( - std::chrono::milliseconds(static_cast(wait_time))); - } - } - } - if (decoder.DecodedSomething()) { - final_result.append(decoder.result()[0].sentence); - } - LOG(INFO) << wav.first << " Final result: " << final_result << std::endl; - LOG(INFO) << "Decoded " << wave_dur << "ms audio taken " << decode_time - << "ms."; - - if (!warmup) { - g_mutex.lock(); - std::ostream& buffer = FLAGS_result.empty() ? 
-    if (!FLAGS_output_nbest) {
-      buffer << wav.first << " " << final_result << std::endl;
-    } else {
-      buffer << "wav " << wav.first << std::endl;
-      auto& results = decoder.result();
-      for (auto& r : results) {
-        if (r.sentence.empty()) continue;
-        buffer << "candidate " << r.score << " " << r.sentence << std::endl;
-      }
-    }
-    g_total_waves_dur += wave_dur;
-    g_total_decode_time += decode_time;
-    g_mutex.unlock();
-  }
-}
-
-int main(int argc, char* argv[]) {
-  gflags::ParseCommandLineFlags(&argc, &argv, false);
-  google::InitGoogleLogging(argv[0]);
-
-  g_decode_config = wenet::InitDecodeOptionsFromFlags();
-  g_feature_config = wenet::InitFeaturePipelineConfigFromFlags();
-  g_decode_resource = wenet::InitDecodeResourceFromFlags();
-
-  if (FLAGS_wav_path.empty() && FLAGS_wav_scp.empty()) {
-    LOG(FATAL) << "Please provide the wave path or the wav scp.";
-  }
-  std::vector<std::pair<std::string, std::string>> waves;
-  if (!FLAGS_wav_path.empty()) {
-    waves.emplace_back(make_pair("test", FLAGS_wav_path));
-  } else {
-    std::ifstream wav_scp(FLAGS_wav_scp);
-    std::string line;
-    while (getline(wav_scp, line)) {
-      std::vector<std::string> strs;
-      wenet::SplitString(line, &strs);
-      CHECK_GE(strs.size(), 2);
-      waves.emplace_back(make_pair(strs[0], strs[1]));
-    }
-
-    if (waves.empty()) {
-      LOG(FATAL) << "Please provide non-empty wav scp.";
-    }
-  }
-
-  if (!FLAGS_result.empty()) {
-    g_result.open(FLAGS_result, std::ios::out);
-  }
-
-  // Warmup
-  if (FLAGS_warmup > 0) {
-    LOG(INFO) << "Warming up...";
-    {
-      ThreadPool pool(FLAGS_thread_num);
-      auto wav = waves[0];
-      for (int i = 0; i < FLAGS_warmup; i++) {
-        pool.enqueue(decode, wav, true);
-      }
-    }
-    LOG(INFO) << "Warmup done.";
-  }
-
-  {
-    ThreadPool pool(FLAGS_thread_num);
-    for (auto& wav : waves) {
-      pool.enqueue(decode, wav, false);
-    }
-  }
-
-  LOG(INFO) << "Total: decoded " << g_total_waves_dur << "ms audio taken "
-            << g_total_decode_time << "ms.";
-  LOG(INFO) << "RTF: " << std::setprecision(4)
-            << static_cast<float>(g_total_decode_time) / g_total_waves_dur;
-  return 0;
-}
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/grpc_client_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/grpc_client_main.cc
deleted file mode 100644
index f2d226d48d3757c5f095335eff3288f5d227282b..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/grpc_client_main.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu)
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "frontend/wav.h" -#include "grpc/grpc_client.h" -#include "utils/flags.h" -#include "utils/timer.h" - -DEFINE_string(hostname, "127.0.0.1", "hostname of websocket server"); -DEFINE_int32(port, 10086, "port of websocket server"); -DEFINE_int32(nbest, 1, "n-best of decode result"); -DEFINE_string(wav_path, "", "test wav file path"); -DEFINE_bool(continuous_decoding, false, "continuous decoding mode"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - wenet::GrpcClient client(FLAGS_hostname, FLAGS_port, FLAGS_nbest, - FLAGS_continuous_decoding); - - wenet::WavReader wav_reader(FLAGS_wav_path); - const int sample_rate = 16000; - // Only support 16K - CHECK_EQ(wav_reader.sample_rate(), sample_rate); - const int num_samples = wav_reader.num_samples(); - std::vector pcm_data(wav_reader.data(), - wav_reader.data() + num_samples); - // Send data every 0.5 second - const float interval = 0.5; - const int sample_interval = interval * sample_rate; - for (int start = 0; start < num_samples; start += sample_interval) { - if (client.done()) { - break; - } - int end = std::min(start + sample_interval, num_samples); - // Convert to short - std::vector data; - data.reserve(end - start); - for (int j = start; j < end; j++) { - data.push_back(static_cast(pcm_data[j])); - } - // Send PCM data - client.SendBinaryData(data.data(), data.size() * sizeof(int16_t)); - VLOG(2) << "Send " << data.size() << " samples"; - std::this_thread::sleep_for( - std::chrono::milliseconds(static_cast(interval * 1000))); - } - wenet::Timer timer; - - client.Join(); - VLOG(2) << "Total latency: " << timer.Elapsed() << "ms."; - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/grpc_server_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/grpc_server_main.cc deleted file mode 100644 index b00f3cbade1ee70dadfb49829e9ca73fd50c2be2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/grpc_server_main.cc +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-
-#include <grpcpp/ext/proto_server_reflection_plugin.h>
-#include <grpcpp/grpcpp.h>
-#include <grpcpp/health_check_service_interface.h>
-
-#include "decoder/params.h"
-#include "grpc/grpc_server.h"
-#include "utils/log.h"
-
-DEFINE_int32(port, 10086, "grpc listening port");
-DEFINE_int32(workers, 4, "grpc num workers");
-
-using grpc::Server;
-using grpc::ServerBuilder;
-
-int main(int argc, char* argv[]) {
-  gflags::ParseCommandLineFlags(&argc, &argv, false);
-  google::InitGoogleLogging(argv[0]);
-
-  auto decode_config = wenet::InitDecodeOptionsFromFlags();
-  auto feature_config = wenet::InitFeaturePipelineConfigFromFlags();
-  auto decode_resource = wenet::InitDecodeResourceFromFlags();
-
-  wenet::GrpcServer service(feature_config, decode_config, decode_resource);
-  grpc::EnableDefaultHealthCheckService(true);
-  grpc::reflection::InitProtoReflectionServerBuilderPlugin();
-  ServerBuilder builder;
-  std::string address("0.0.0.0:" + std::to_string(FLAGS_port));
-  builder.AddListeningPort(address, grpc::InsecureServerCredentials());
-  builder.RegisterService(&service);
-  builder.SetSyncServerOption(ServerBuilder::SyncServerOption::NUM_CQS,
-                              FLAGS_workers);
-  std::unique_ptr<Server> server(builder.BuildAndStart());
-  LOG(INFO) << "Listening at port " << FLAGS_port;
-  server->Wait();
-  google::ShutdownGoogleLogging();
-  return 0;
-}
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/http_client_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/http_client_main.cc
deleted file mode 100644
index b59ee3f5f32bf08552416b183802029ac5d5afa5..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/http_client_main.cc
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright (c) 2023 Ximalaya Speech Team (Xiang Lyu)
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "frontend/wav.h" -#include "utils/flags.h" -#include "utils/timer.h" -#include "http/http_client.h" - -DEFINE_string(hostname, "127.0.0.1", "hostname of http server"); -DEFINE_int32(port, 10086, "port of http server"); -DEFINE_int32(nbest, 1, "n-best of decode result"); -DEFINE_string(wav_path, "", "test wav file path"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - wenet::WavReader wav_reader(FLAGS_wav_path); - const int sample_rate = 16000; - // Only support 16K - CHECK_EQ(wav_reader.sample_rate(), sample_rate); - const int num_samples = wav_reader.num_samples(); - // Convert to short - std::vector data; - data.reserve(num_samples); - for (int j = 0; j < num_samples; j++) { - data.push_back(static_cast(wav_reader.data()[j])); - } - // Send data - wenet::HttpClient client(FLAGS_hostname, FLAGS_port); - client.set_nbest(FLAGS_nbest); - wenet::Timer timer; - VLOG(2) << "Send " << data.size() << " samples"; - client.SendBinaryData(data.data(), data.size() * sizeof(int16_t)); - VLOG(2) << "Total latency: " << timer.Elapsed() << "ms."; - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/http_server_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/http_server_main.cc deleted file mode 100644 index e30cf2bcdf746c2072f023e90f470ccba5467c2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/http_server_main.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2023 Ximalaya Speech Team (Xiang Lyu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "decoder/params.h" -#include "utils/log.h" -#include "http/http_server.h" - -DEFINE_int32(port, 10086, "http listening port"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - auto decode_config = wenet::InitDecodeOptionsFromFlags(); - auto feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - auto decode_resource = wenet::InitDecodeResourceFromFlags(); - - wenet::HttpServer server(FLAGS_port, feature_config, decode_config, - decode_resource); - LOG(INFO) << "Listening at port " << FLAGS_port; - server.Start(); - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/label_checker_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/label_checker_main.cc deleted file mode 100644 index e36e3d5c29a38a7ebee80606ebd8e69ae8b1eb96..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/label_checker_main.cc +++ /dev/null @@ -1,237 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include -#include -#include -#include - -#include "decoder/params.h" -#include "frontend/wav.h" -#include "utils/flags.h" -#include "utils/string.h" - -DEFINE_string(text, "", "kaldi style text input file"); -DEFINE_string(wav_scp, "", "kaldi style wav scp"); -DEFINE_double(is_penalty, 1.0, - "insertion/substitution penalty for align insertion"); -DEFINE_double(del_penalty, 1.0, "deletion penalty for align insertion"); -DEFINE_string(result, "", "result output file"); -DEFINE_string(timestamp, "", "timestamp output file"); - -namespace wenet { - -const char* kDeletion = ""; -// Is: Insertion and substitution -const char* kIsStart = ""; -const char* kIsEnd = ""; - -bool MapToLabel(const std::string& text, - std::shared_ptr symbol_table, - std::vector* labels) { - labels->clear(); - // Split label to char sequence - std::vector chars; - SplitUTF8StringToChars(text, &chars); - for (size_t i = 0; i < chars.size(); i++) { - // ▁ is special symbol for white space - std::string label = chars[i] != " " ? chars[i] : "▁"; - int id = symbol_table->Find(label); - if (id != -1) { // fst::kNoSymbol - // LOG(INFO) << label << " " << id; - labels->push_back(id); - } - } - return true; -} - -std::shared_ptr MakeSymbolTableForFst( - std::shared_ptr isymbol_table) { - LOG(INFO) << isymbol_table; - CHECK(isymbol_table != nullptr); - auto osymbol_table = std::make_shared(); - osymbol_table->AddSymbol("", 0); - CHECK_EQ(isymbol_table->Find(""), 0); - osymbol_table->AddSymbol("", 1); - for (int i = 1; i < isymbol_table->NumSymbols(); i++) { - std::string symbol = isymbol_table->Find(i); - osymbol_table->AddSymbol(symbol, i + 1); - } - osymbol_table->AddSymbol(kDeletion, isymbol_table->NumSymbols() + 1); - osymbol_table->AddSymbol(kIsStart, isymbol_table->NumSymbols() + 2); - osymbol_table->AddSymbol(kIsEnd, isymbol_table->NumSymbols() + 3); - return osymbol_table; -} - -void CompileCtcFst(std::shared_ptr symbol_table, - fst::StdVectorFst* ofst) { - ofst->DeleteStates(); - int start = ofst->AddState(); - ofst->SetStart(start); - CHECK_EQ(symbol_table->Find(""), 0); - CHECK_EQ(symbol_table->Find(""), 1); - ofst->AddArc(start, fst::StdArc(1, 0, 0.0, start)); - // Exclude kDeletion and kInsertion - for (int i = 2; i < symbol_table->NumSymbols() - 3; i++) { - int s = ofst->AddState(); - ofst->AddArc(start, fst::StdArc(i, i, 0.0, s)); - ofst->AddArc(s, fst::StdArc(i, 0, 0.0, s)); - ofst->AddArc(s, fst::StdArc(0, 0, 0.0, start)); - } - ofst->SetFinal(start, fst::StdArc::Weight::One()); - fst::ArcSort(ofst, fst::StdOLabelCompare()); -} - -void CompileAlignFst(std::vector labels, - std::shared_ptr symbol_table, - fst::StdVectorFst* ofst) { - ofst->DeleteStates(); - int deletion = symbol_table->Find(kDeletion); - int insertion_start = symbol_table->Find(kIsStart); - int insertion_end = symbol_table->Find(kIsEnd); - - int start = ofst->AddState(); - ofst->SetStart(start); - // Filler State - int filler_start = ofst->AddState(); - int filler_end = ofst->AddState(); - for (int i = 2; i < symbol_table->NumSymbols() - 3; i++) { - ofst->AddArc(filler_start, fst::StdArc(i, i, FLAGS_is_penalty, 
filler_end)); - } - ofst->AddArc(filler_end, fst::StdArc(0, 0, 0.0, filler_start)); - - int prev = start; - // Alignment path and optional filler - for (size_t i = 0; i < labels.size(); i++) { - int cur = ofst->AddState(); - // 1. Insertion or Substitution - ofst->AddArc(prev, fst::StdArc(0, insertion_start, 0.0, filler_start)); - ofst->AddArc(filler_end, fst::StdArc(0, insertion_end, 0.0, prev)); - // 2. Correct - ofst->AddArc(prev, fst::StdArc(labels[i], labels[i], 0.0, cur)); - // 3. Deletion - ofst->AddArc(prev, fst::StdArc(0, deletion, FLAGS_del_penalty, cur)); - - prev = cur; - } - // Optional add endding filler - ofst->AddArc(prev, fst::StdArc(0, insertion_start, 0.0, filler_start)); - ofst->AddArc(filler_end, fst::StdArc(0, insertion_end, 0.0, prev)); - ofst->SetFinal(prev, fst::StdArc::Weight::One()); - fst::ArcSort(ofst, fst::StdILabelCompare()); -} - -} // namespace wenet - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - auto decode_config = wenet::InitDecodeOptionsFromFlags(); - auto feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - auto decode_resource = wenet::InitDecodeResourceFromFlags(); - CHECK(decode_resource->unit_table != nullptr); - - auto wfst_symbol_table = - wenet::MakeSymbolTableForFst(decode_resource->unit_table); - // wfst_symbol_table->WriteText("fst.txt"); - // Reset symbol_table to on-the-fly generated wfst_symbol_table - decode_resource->symbol_table = wfst_symbol_table; - - // Compile ctc FST - fst::StdVectorFst ctc_fst; - wenet::CompileCtcFst(wfst_symbol_table, &ctc_fst); - // ctc_fst.Write("ctc.fst"); - - std::unordered_map wav_table; - std::ifstream wav_is(FLAGS_wav_scp); - std::string line; - while (std::getline(wav_is, line)) { - std::vector strs; - wenet::SplitString(line, &strs); - CHECK_EQ(strs.size(), 2); - wav_table[strs[0]] = strs[1]; - } - - std::ifstream text_is(FLAGS_text); - std::ofstream result_os(FLAGS_result, std::ios::out); - std::ofstream timestamp_out; - if (!FLAGS_timestamp.empty()) { - timestamp_out.open(FLAGS_timestamp, std::ios::out); - } - std::ostream& timestamp_os = - FLAGS_timestamp.empty() ? 
std::cout : timestamp_out; - - while (std::getline(text_is, line)) { - std::vector strs; - wenet::SplitString(line, &strs); - if (strs.size() < 2) continue; - std::string key = strs[0]; - LOG(INFO) << "Processing " << key; - if (wav_table.find(key) != wav_table.end()) { - strs.erase(strs.begin()); - std::string text = wenet::JoinString(" ", strs); - std::vector labels; - wenet::MapToLabel(text, wfst_symbol_table, &labels); - // Prepare FST for alignment decoding - fst::StdVectorFst align_fst; - wenet::CompileAlignFst(labels, wfst_symbol_table, &align_fst); - // align_fst.Write("align.fst"); - auto decoding_fst = std::make_shared(); - fst::Compose(ctc_fst, align_fst, decoding_fst.get()); - // decoding_fst->Write("decoding.fst"); - // Preapre feature pipeline - wenet::WavReader wav_reader; - if (!wav_reader.Open(wav_table[key])) { - LOG(WARNING) << "Error in reading " << wav_table[key]; - continue; - } - int num_samples = wav_reader.num_samples(); - CHECK_EQ(wav_reader.sample_rate(), FLAGS_sample_rate); - auto feature_pipeline = - std::make_shared(*feature_config); - feature_pipeline->AcceptWaveform(wav_reader.data(), num_samples); - feature_pipeline->set_input_finished(); - decode_resource->fst = decoding_fst; - LOG(INFO) << "num frames " << feature_pipeline->num_frames(); - wenet::AsrDecoder decoder(feature_pipeline, decode_resource, - *decode_config); - while (true) { - wenet::DecodeState state = decoder.Decode(); - if (state == wenet::DecodeState::kEndFeats) { - decoder.Rescoring(); - break; - } - } - std::string final_result; - std::string timestamp_str; - if (decoder.DecodedSomething()) { - const wenet::DecodeResult& result = decoder.result()[0]; - final_result = result.sentence; - std::stringstream ss; - for (const auto& w : result.word_pieces) { - ss << " " << w.word << " " << w.start << " " << w.end; - } - timestamp_str = ss.str(); - } - result_os << key << " " << final_result << std::endl; - timestamp_os << key << " " << timestamp_str << std::endl; - LOG(INFO) << key << " " << final_result; - } else { - LOG(WARNING) << "No wav file for " << key; - } - } - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/websocket_client_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/websocket_client_main.cc deleted file mode 100644 index 3eaa96069dc5f57673fbb2819bf7d4883e0d5ffa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/websocket_client_main.cc +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "frontend/wav.h" -#include "utils/flags.h" -#include "utils/timer.h" -#include "websocket/websocket_client.h" - -DEFINE_string(hostname, "127.0.0.1", "hostname of websocket server"); -DEFINE_int32(port, 10086, "port of websocket server"); -DEFINE_int32(nbest, 1, "n-best of decode result"); -DEFINE_string(wav_path, "", "test wav file path"); -DEFINE_bool(continuous_decoding, false, "continuous decoding mode"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - wenet::WebSocketClient client(FLAGS_hostname, FLAGS_port); - client.set_nbest(FLAGS_nbest); - client.set_continuous_decoding(FLAGS_continuous_decoding); - client.SendStartSignal(); - - wenet::WavReader wav_reader(FLAGS_wav_path); - const int sample_rate = 16000; - // Only support 16K - CHECK_EQ(wav_reader.sample_rate(), sample_rate); - const int num_samples = wav_reader.num_samples(); - // Send data every 0.5 second - const float interval = 0.5; - const int sample_interval = interval * sample_rate; - for (int start = 0; start < num_samples; start += sample_interval) { - if (client.done()) { - break; - } - int end = std::min(start + sample_interval, num_samples); - // Convert to short - std::vector data; - data.reserve(end - start); - for (int j = start; j < end; j++) { - data.push_back(static_cast(wav_reader.data()[j])); - } - // TODO(Binbin Zhang): Network order? - // Send PCM data - client.SendBinaryData(data.data(), data.size() * sizeof(int16_t)); - VLOG(2) << "Send " << data.size() << " samples"; - std::this_thread::sleep_for( - std::chrono::milliseconds(static_cast(interval * 1000))); - } - wenet::Timer timer; - client.SendEndSignal(); - client.Join(); - VLOG(2) << "Total latency: " << timer.Elapsed() << "ms."; - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/websocket_server_main.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/websocket_server_main.cc deleted file mode 100644 index 796d9d2e6d151f7c08b43d66b7245c58ee086cc2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/bin/websocket_server_main.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "decoder/params.h" -#include "utils/log.h" -#include "websocket/websocket_server.h" - -DEFINE_int32(port, 10086, "websocket listening port"); - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - auto decode_config = wenet::InitDecodeOptionsFromFlags(); - auto feature_config = wenet::InitFeaturePipelineConfigFromFlags(); - auto decode_resource = wenet::InitDecodeResourceFromFlags(); - - wenet::WebSocketServer server(FLAGS_port, feature_config, decode_config, - decode_resource); - LOG(INFO) << "Listening at port " << FLAGS_port; - server.Start(); - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/boost.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/boost.cmake deleted file mode 100644 index 8684c0ec43960da213da923dc57416f04301ea2b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/boost.cmake +++ /dev/null @@ -1,10 +0,0 @@ -FetchContent_Declare(boost - URL https://boostorg.jfrog.io/artifactory/main/release/1.75.0/source/boost_1_75_0.tar.gz - URL_HASH SHA256=aeb26f80e80945e82ee93e5939baebdca47b9dee80a07d3144be1e1a6a66dd6a -) -FetchContent_MakeAvailable(boost) -include_directories(${boost_SOURCE_DIR}) - -if(MSVC) - add_definitions(-DBOOST_ALL_DYN_LINK -DBOOST_ALL_NO_LIB) -endif() \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/bpu.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/bpu.cmake deleted file mode 100644 index 350d76c19d6f656fb130de09877d649cf49972a4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/bpu.cmake +++ /dev/null @@ -1,30 +0,0 @@ -if(BPU) - if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") - set(EASY_DNN_URL "https://github.com/xingchensong/toolchain_pkg/releases/download/easy_dnn/easy_dnn.0.4.11.tar.gz") - set(URL_HASH "SHA256=a1a6f77d1baae7181d75ec5d37a2ee529ac4e1c4400babd6ceb1c007392a4904") - else() - message(FATAL_ERROR "Unsupported CMake System Processor '${CMAKE_SYSTEM_PROCESSOR}' (expected 'aarch64')") - endif() - else() - message(FATAL_ERROR "Unsupported CMake System Name '${CMAKE_SYSTEM_NAME}' (expected 'Linux')") - endif() - - FetchContent_Declare(easy_dnn - URL ${EASY_DNN_URL} - URL_HASH ${URL_HASH} - ) - FetchContent_MakeAvailable(easy_dnn) - include_directories(${easy_dnn_SOURCE_DIR}/easy_dnn/0.4.11_linux_aarch64-j3_hobot_gcc6.5.0/files/easy_dnn/include) - include_directories(${easy_dnn_SOURCE_DIR}/dnn/1.7.0_linux_aarch64-j3_hobot_gcc6.5.0/files/dnn/include) - include_directories(${easy_dnn_SOURCE_DIR}/hlog/0.4.7_linux_aarch64-j3_hobot_gcc6.5.0/files/hlog/include) - link_directories(${easy_dnn_SOURCE_DIR}/easy_dnn/0.4.11_linux_aarch64-j3_hobot_gcc6.5.0/files/easy_dnn/lib) - link_directories(${easy_dnn_SOURCE_DIR}/dnn/1.7.0_linux_aarch64-j3_hobot_gcc6.5.0/files/dnn/lib) - link_directories(${easy_dnn_SOURCE_DIR}/hlog/0.4.7_linux_aarch64-j3_hobot_gcc6.5.0/files/hlog/lib) - - add_definitions(-DUSE_BPU) - # NOTE(xcsong): Reasons for adding flag `-fuse-ld=gold`: - # https://stackoverflow.com/questions/59915966/unknown-gcc-linker-error-but-builds-sucessfully/59916438#59916438 - # https://github.com/tensorflow/tensorflow/issues/47849 - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} 
-fuse-ld=gold") -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/gflags.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/gflags.cmake deleted file mode 100644 index 53ae5763b5a8c860b7e64d35b380eee5429f539d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/gflags.cmake +++ /dev/null @@ -1,6 +0,0 @@ -FetchContent_Declare(gflags - URL https://github.com/gflags/gflags/archive/v2.2.2.zip - URL_HASH SHA256=19713a36c9f32b33df59d1c79b4958434cb005b5b47dc5400a7a4b078111d9b5 -) -FetchContent_MakeAvailable(gflags) -include_directories(${gflags_BINARY_DIR}/include) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/glog.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/glog.cmake deleted file mode 100644 index 447ab4132f669ee2c3a52c37959dd684a39ff21b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/glog.cmake +++ /dev/null @@ -1,6 +0,0 @@ -FetchContent_Declare(glog - URL https://github.com/google/glog/archive/v0.4.0.zip - URL_HASH SHA256=9e1b54eb2782f53cd8af107ecf08d2ab64b8d0dc2b7f5594472f3bd63ca85cdc -) -FetchContent_MakeAvailable(glog) -include_directories(${glog_SOURCE_DIR}/src ${glog_BINARY_DIR}) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/grpc.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/grpc.cmake deleted file mode 100644 index 644093a4bf8191f3a45b0df0a72c000981c48f58..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/grpc.cmake +++ /dev/null @@ -1,9 +0,0 @@ -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/grpc) -# third_party: grpc -# On how to build grpc, you may refer to https://github.com/grpc/grpc -# We recommend manually recursive clone the repo to avoid internet connection problem -FetchContent_Declare(gRPC - GIT_REPOSITORY https://github.com/grpc/grpc - GIT_TAG v1.37.1 -) -FetchContent_MakeAvailable(gRPC) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/gtest.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/gtest.cmake deleted file mode 100644 index 30dc7c1a31d8b83991841a4dc33f61ed078b532a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/gtest.cmake +++ /dev/null @@ -1,8 +0,0 @@ -FetchContent_Declare(googletest - URL https://github.com/google/googletest/archive/release-1.11.0.zip - URL_HASH SHA256=353571c2440176ded91c2de6d6cd88ddd41401d14692ec1f99e35d013feda55a -) -if(MSVC) - set(gtest_force_shared_crt ON CACHE BOOL "Always use msvcrt.dll" FORCE) -endif() -FetchContent_MakeAvailable(googletest) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/libtorch.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/libtorch.cmake deleted file mode 100644 index 3cd9245b2da52f8be206d27164de5f411bff171b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/libtorch.cmake +++ /dev/null @@ -1,79 +0,0 @@ 
-if(TORCH) - add_definitions(-DUSE_TORCH) - if(NOT ANDROID) - if(GPU) - if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - message(FATAL_ERROR "GPU is supported only Linux, you can use CPU version") - else() - add_definitions(-DUSE_GPU) - endif() - endif() - - if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") - if(${CMAKE_BUILD_TYPE} MATCHES "Release") - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-win-shared-with-deps-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=bece54d36377990257e9d028c687c5b6759c5cfec0a0153da83cf6f0f71f648f") - else() - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-win-shared-with-deps-debug-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=3cc7ba3c3865d86f03d78c2f0878fdbed8b764359476397a5c95cf3bba0d665a") - endif() - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - if(CXX11_ABI) - if(NOT GPU) - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=d52f63577a07adb0bfd6d77c90f7da21896e94f71eb7dcd55ed7835ccb3b2b59") - else() - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu113/libtorch-cxx11-abi-shared-with-deps-1.12.0%2Bcu113.zip") - set(URL_HASH "SHA256=80f089939de20e68e3fcad4dfa72a26c8bf91b5e77b11042f671f39ebac35865") - endif() - else() - if(NOT GPU) - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-1.13.0%2Bcpu.zip") - set(URL_HASH "SHA256=bee1b7be308792aa60fc95a4f5274d9658cb7248002d0e333d49eb81ec88430c") - else() - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.11.0%2Bcu113.zip") - set(URL_HASH "SHA256=90159ecce3ff451f3ef3f657493b6c7c96759c3b74bbd70c1695f2ea2f81e1ad") - endif() - endif() - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cpu/libtorch-macos-1.13.0.zip") - set(URL_HASH "SHA256=a8f80050b95489b4e002547910410c2c230e9f590ffab2482e19e809afe4f7aa") - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "iOS") - add_definitions(-DIOS) - else() - message(FATAL_ERROR "Unsupported System '${CMAKE_SYSTEM_NAME}' (expected 'Windows', 'Linux', 'Darwin' or 'iOS')") - endif() - - # iOS use LibTorch from pod install - if(NOT IOS) - FetchContent_Declare(libtorch - URL ${LIBTORCH_URL} - URL_HASH ${URL_HASH} - ) - FetchContent_MakeAvailable(libtorch) - find_package(Torch REQUIRED PATHS ${libtorch_SOURCE_DIR} NO_DEFAULT_PATH) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS} -DC10_USE_GLOG") - endif() - - if(MSVC) - file(GLOB TORCH_DLLS "${TORCH_INSTALL_PREFIX}/lib/*.dll") - file(COPY ${TORCH_DLLS} DESTINATION ${CMAKE_BINARY_DIR}) - endif() - else() - # Change version in runtime/android/app/build.gradle. 
- file(GLOB PYTORCH_INCLUDE_DIRS "${build_DIR}/pytorch_android*.aar/headers") - file(GLOB PYTORCH_LINK_DIRS "${build_DIR}/pytorch_android*.aar/jni/${ANDROID_ABI}") - find_library(PYTORCH_LIBRARY pytorch_jni - PATHS ${PYTORCH_LINK_DIRS} - NO_CMAKE_FIND_ROOT_PATH - ) - find_library(FBJNI_LIBRARY fbjni - PATHS ${PYTORCH_LINK_DIRS} - NO_CMAKE_FIND_ROOT_PATH - ) - include_directories( - ${PYTORCH_INCLUDE_DIRS} - ${PYTORCH_INCLUDE_DIRS}/torch/csrc/api/include - ) - endif() -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/onnx.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/onnx.cmake deleted file mode 100644 index bd55402cb2a6024620fa6ff8b5c413207041adfa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/onnx.cmake +++ /dev/null @@ -1,35 +0,0 @@ -if(ONNX) - set(ONNX_VERSION "1.12.0") - if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-win-x64-${ONNX_VERSION}.zip") - set(URL_HASH "SHA256=8b5d61204989350b7904ac277f5fbccd3e6736ddbb6ec001e412723d71c9c176") - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-linux-aarch64-${ONNX_VERSION}.tgz") - set(URL_HASH "SHA256=5820d9f343df73c63b6b2b174a1ff62575032e171c9564bcf92060f46827d0ac") - else() - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-linux-x64-${ONNX_VERSION}.tgz") - set(URL_HASH "SHA256=5d503ce8540358b59be26c675e42081be14a3e833a5301926f555451046929c5") - endif() - elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") - set(ONNX_URL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-osx-x86_64-${ONNX_VERSION}.tgz") - set(URL_HASH "SHA256=09b17f712f8c6f19bb63da35d508815b443cbb473e16c6192abfaa297c02f600") - else() - message(FATAL_ERROR "Unsupported CMake System Name '${CMAKE_SYSTEM_NAME}' (expected 'Windows', 'Linux' or 'Darwin')") - endif() - - FetchContent_Declare(onnxruntime - URL ${ONNX_URL} - URL_HASH ${URL_HASH} - ) - FetchContent_MakeAvailable(onnxruntime) - include_directories(${onnxruntime_SOURCE_DIR}/include) - link_directories(${onnxruntime_SOURCE_DIR}/lib) - - if(MSVC) - file(GLOB ONNX_DLLS "${onnxruntime_SOURCE_DIR}/lib/*.dll") - file(COPY ${ONNX_DLLS} DESTINATION ${CMAKE_BINARY_DIR}/bin/${CMAKE_BUILD_TYPE}) - endif() - - add_definitions(-DUSE_ONNX) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/openfst.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/openfst.cmake deleted file mode 100644 index 490a3da6b571ec228114167fb9c0d9e9b4043bd2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/openfst.cmake +++ /dev/null @@ -1,45 +0,0 @@ -if(NOT ANDROID) - include(gflags) - # We can't build glog with gflags, unless gflags is pre-installed. - # If build glog with pre-installed gflags, there will be conflict. 
- set(WITH_GFLAGS OFF CACHE BOOL "whether build glog with gflags" FORCE) - include(glog) - - if(NOT GRAPH_TOOLS) - set(HAVE_BIN OFF CACHE BOOL "Build the fst binaries" FORCE) - set(HAVE_SCRIPT OFF CACHE BOOL "Build the fstscript" FORCE) - endif() - set(HAVE_COMPACT OFF CACHE BOOL "Build compact" FORCE) - set(HAVE_CONST OFF CACHE BOOL "Build const" FORCE) - set(HAVE_GRM OFF CACHE BOOL "Build grm" FORCE) - set(HAVE_FAR OFF CACHE BOOL "Build far" FORCE) - set(HAVE_PDT OFF CACHE BOOL "Build pdt" FORCE) - set(HAVE_MPDT OFF CACHE BOOL "Build mpdt" FORCE) - set(HAVE_LINEAR OFF CACHE BOOL "Build linear" FORCE) - set(HAVE_LOOKAHEAD OFF CACHE BOOL "Build lookahead" FORCE) - set(HAVE_NGRAM OFF CACHE BOOL "Build ngram" FORCE) - set(HAVE_SPECIAL OFF CACHE BOOL "Build special" FORCE) - - if(MSVC) - add_compile_options(/W0 /wd4244 /wd4267) - endif() - - # "OpenFST port for Windows" builds openfst with cmake for multiple platforms. - # Openfst is compiled with glog/gflags to avoid log and flag conflicts with log and flags in wenet/libtorch. - # To build openfst with gflags and glog, we comment out some vars of {flags, log}.h and flags.cc. - set(openfst_SOURCE_DIR ${fc_base}/openfst-src CACHE PATH "OpenFST source directory") - FetchContent_Declare(openfst - URL https://github.com/kkm000/openfst/archive/refs/tags/win/1.6.5.1.tar.gz - URL_HASH SHA256=02c49b559c3976a536876063369efc0e41ab374be1035918036474343877046e - PATCH_COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/patch/openfst ${openfst_SOURCE_DIR} - ) - FetchContent_MakeAvailable(openfst) - add_dependencies(fst gflags glog) - target_link_libraries(fst PUBLIC gflags_nothreads_static glog) - include_directories(${openfst_SOURCE_DIR}/src/include) -else() - set(openfst_BINARY_DIR ${build_DIR}/wenet-openfst-android-1.0.2.aar/jni) - include_directories(${openfst_BINARY_DIR}/include) - link_directories(${openfst_BINARY_DIR}/${ANDROID_ABI}) - link_libraries(log gflags_nothreads glog fst) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/pybind11.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/pybind11.cmake deleted file mode 100644 index 6bdae202c1c4d94228e5f92dab051c118dba7d3b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/pybind11.cmake +++ /dev/null @@ -1,7 +0,0 @@ -FetchContent_Declare(pybind11 - URL https://github.com/pybind/pybind11/archive/refs/tags/v2.9.2.zip - URL_HASH SHA256=d1646e6f70d8a3acb2ddd85ce1ed543b5dd579c68b8fb8e9638282af20edead8 -) -FetchContent_MakeAvailable(pybind11) - -add_subdirectory(${pybind11_SOURCE_DIR}) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/xpu.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/xpu.cmake deleted file mode 100644 index 38418671b0237550cd01d4d95e8743067e113e56..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/cmake/xpu.cmake +++ /dev/null @@ -1,37 +0,0 @@ -if(NOT WIN32) - string(ASCII 27 Esc) - set(ColourReset "${Esc}[m") - set(ColourBold "${Esc}[1m") - set(Red "${Esc}[31m") - set(Green "${Esc}[32m") - set(Yellow "${Esc}[33m") - set(Blue "${Esc}[34m") - set(Magenta "${Esc}[35m") - set(Cyan "${Esc}[36m") - set(White "${Esc}[37m") - set(BoldRed "${Esc}[1;31m") - set(BoldGreen "${Esc}[1;32m") - set(BoldYellow "${Esc}[1;33m") - set(BoldBlue 
"${Esc}[1;34m") - set(BoldMagenta "${Esc}[1;35m") - set(BoldCyan "${Esc}[1;36m") - set(BoldWhite "${Esc}[1;37m") -endif() - -if(XPU) - set(RUNTIME_KUNLUN_PATH ${CMAKE_CURRENT_SOURCE_DIR}) - message(STATUS "RUNTIME_KUNLUN_PATH is ${RUNTIME_KUNLUN_PATH} .\n") - set(KUNLUN_XPU_PATH ${RUNTIME_KUNLUN_PATH}/xpu) - if(NOT DEFINED ENV{XPU_API_PATH}) - message(FATAL_ERROR "${BoldRed}NO ENV{XPU_API_PATH} in your env. Please set XPU_API_PATH.${ColourReset}\n") - else() - set(XPU_API_PATH $ENV{XPU_API_PATH}) - message("set XPU_API_PATH from env_var. Val is $ENV{XPU_API_PATH}.") - endif() - - include_directories(${RUNTIME_KUNLUN_PATH} ${KUNLUN_XPU_PATH}/ - ${XPU_API_PATH}/output/include ${XPU_API_PATH}/../runtime/include) - link_directories(${XPU_API_PATH}/output/so/ ${XPU_API_PATH}/../runtime/output/so/) - - add_definitions(-DUSE_XPU) -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/CMakeLists.txt deleted file mode 100644 index fe03efb288eb1c7ae3d05e896e95855e5865472f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/CMakeLists.txt +++ /dev/null @@ -1,39 +0,0 @@ -set(decoder_srcs - asr_decoder.cc - asr_model.cc - context_graph.cc - ctc_prefix_beam_search.cc - ctc_wfst_beam_search.cc - ctc_endpoint.cc -) - -if(NOT TORCH AND NOT ONNX AND NOT XPU AND NOT IOS AND NOT BPU) - message(FATAL_ERROR "Please build with TORCH or ONNX or XPU or IOS or BPU!!!") -endif() -if(TORCH OR IOS) - list(APPEND decoder_srcs torch_asr_model.cc) -endif() -if(ONNX) - list(APPEND decoder_srcs onnx_asr_model.cc) -endif() - -add_library(decoder STATIC ${decoder_srcs}) -target_link_libraries(decoder PUBLIC kaldi-decoder frontend - post_processor utils) - -if(ANDROID) - target_link_libraries(decoder PUBLIC ${PYTORCH_LIBRARY} ${FBJNI_LIBRARY}) -else() - if(TORCH) - target_link_libraries(decoder PUBLIC ${TORCH_LIBRARIES}) - endif() - if(ONNX) - target_link_libraries(decoder PUBLIC onnxruntime) - endif() - if(BPU) - target_link_libraries(decoder PUBLIC bpu_asr_model) - endif() - if(XPU) - target_link_libraries(decoder PUBLIC xpu_conformer) - endif() -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/asr_decoder.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/asr_decoder.cc deleted file mode 100644 index 34de7550ea287b37d2cb707e148f5d6853b3d804..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/asr_decoder.cc +++ /dev/null @@ -1,231 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include "decoder/asr_decoder.h" - -#include - -#include -#include -#include - -#include "utils/timer.h" - -namespace wenet { - -AsrDecoder::AsrDecoder(std::shared_ptr feature_pipeline, - std::shared_ptr resource, - const DecodeOptions& opts) - : feature_pipeline_(std::move(feature_pipeline)), - // Make a copy of the model ASR model since we will change the inner - // status of the model - model_(resource->model->Copy()), - post_processor_(resource->post_processor), - symbol_table_(resource->symbol_table), - fst_(resource->fst), - unit_table_(resource->unit_table), - opts_(opts), - ctc_endpointer_(new CtcEndpoint(opts.ctc_endpoint_config)) { - if (opts_.reverse_weight > 0) { - // Check if model has a right to left decoder - CHECK(model_->is_bidirectional_decoder()); - } - if (nullptr == fst_) { - searcher_.reset(new CtcPrefixBeamSearch(opts.ctc_prefix_search_opts, - resource->context_graph)); - } else { - searcher_.reset(new CtcWfstBeamSearch(*fst_, opts.ctc_wfst_search_opts, - resource->context_graph)); - } - ctc_endpointer_->frame_shift_in_ms(frame_shift_in_ms()); -} - -void AsrDecoder::Reset() { - start_ = false; - result_.clear(); - num_frames_ = 0; - global_frame_offset_ = 0; - model_->Reset(); - searcher_->Reset(); - feature_pipeline_->Reset(); - ctc_endpointer_->Reset(); -} - -void AsrDecoder::ResetContinuousDecoding() { - global_frame_offset_ = num_frames_; - start_ = false; - result_.clear(); - model_->Reset(); - searcher_->Reset(); - ctc_endpointer_->Reset(); -} - -DecodeState AsrDecoder::Decode(bool block) { - return this->AdvanceDecoding(block); -} - -void AsrDecoder::Rescoring() { - // Do attention rescoring - Timer timer; - AttentionRescoring(); - VLOG(2) << "Rescoring cost latency: " << timer.Elapsed() << "ms."; -} - -DecodeState AsrDecoder::AdvanceDecoding(bool block) { - DecodeState state = DecodeState::kEndBatch; - model_->set_chunk_size(opts_.chunk_size); - model_->set_num_left_chunks(opts_.num_left_chunks); - int num_required_frames = model_->num_frames_for_chunk(start_); - std::vector> chunk_feats; - // Return immediately if we do not want to block - if (!block && !feature_pipeline_->input_finished() && - feature_pipeline_->NumQueuedFrames() < num_required_frames) { - return DecodeState::kWaitFeats; - } - // If not okay, that means we reach the end of the input - if (!feature_pipeline_->Read(num_required_frames, &chunk_feats)) { - state = DecodeState::kEndFeats; - } - - num_frames_ += chunk_feats.size(); - VLOG(2) << "Required " << num_required_frames << " get " - << chunk_feats.size(); - Timer timer; - std::vector> ctc_log_probs; - model_->ForwardEncoder(chunk_feats, &ctc_log_probs); - int forward_time = timer.Elapsed(); - if (opts_.ctc_wfst_search_opts.blank_scale != 1.0) { - for (int i = 0; i < ctc_log_probs.size(); i++) { - ctc_log_probs[i][0] = ctc_log_probs[i][0] - + std::log(opts_.ctc_wfst_search_opts.blank_scale); - } - } - timer.Reset(); - searcher_->Search(ctc_log_probs); - int search_time = timer.Elapsed(); - VLOG(3) << "forward takes " << forward_time << " ms, search takes " - << search_time << " ms"; - UpdateResult(); - - if (state != DecodeState::kEndFeats) { - if (ctc_endpointer_->IsEndpoint(ctc_log_probs, DecodedSomething())) { - VLOG(1) << "Endpoint is detected at " << num_frames_; - state = DecodeState::kEndpoint; - } - } - - start_ = true; - return state; -} - -void AsrDecoder::UpdateResult(bool finish) { - const auto& hypotheses = searcher_->Outputs(); - const auto& inputs = searcher_->Inputs(); - const auto& likelihood = 
searcher_->Likelihood(); - const auto& times = searcher_->Times(); - result_.clear(); - - CHECK_EQ(hypotheses.size(), likelihood.size()); - for (size_t i = 0; i < hypotheses.size(); i++) { - const std::vector& hypothesis = hypotheses[i]; - - DecodeResult path; - path.score = likelihood[i]; - int offset = global_frame_offset_ * feature_frame_shift_in_ms(); - for (size_t j = 0; j < hypothesis.size(); j++) { - std::string word = symbol_table_->Find(hypothesis[j]); - // A detailed explanation of this if-else branch can be found in - // https://github.com/wenet-e2e/wenet/issues/583#issuecomment-907994058 - if (searcher_->Type() == kWfstBeamSearch) { - path.sentence += (' ' + word); - } else { - path.sentence += (word); - } - } - - // TimeStamp is only supported in final result - // TimeStamp of the output of CtcWfstBeamSearch may be inaccurate due to - // various FST operations when building the decoding graph. So here we use - // time stamp of the input(e2e model unit), which is more accurate, and it - // requires the symbol table of the e2e model used in training. - if (unit_table_ != nullptr && finish) { - const std::vector& input = inputs[i]; - const std::vector& time_stamp = times[i]; - CHECK_EQ(input.size(), time_stamp.size()); - for (size_t j = 0; j < input.size(); j++) { - std::string word = unit_table_->Find(input[j]); - int start = time_stamp[j] * frame_shift_in_ms() - time_stamp_gap_ > 0 - ? time_stamp[j] * frame_shift_in_ms() - time_stamp_gap_ - : 0; - if (j > 0) { - start = (time_stamp[j] - time_stamp[j - 1]) * frame_shift_in_ms() < - time_stamp_gap_ - ? (time_stamp[j - 1] + time_stamp[j]) / 2 * - frame_shift_in_ms() - : start; - } - int end = time_stamp[j] * frame_shift_in_ms(); - if (j < input.size() - 1) { - end = (time_stamp[j + 1] - time_stamp[j]) * frame_shift_in_ms() < - time_stamp_gap_ - ? 
(time_stamp[j + 1] + time_stamp[j]) / 2 * - frame_shift_in_ms() - : end; - } - WordPiece word_piece(word, offset + start, offset + end); - path.word_pieces.emplace_back(word_piece); - } - } - - if (post_processor_ != nullptr) { - path.sentence = post_processor_->Process(path.sentence, finish); - } - result_.emplace_back(path); - } - - if (DecodedSomething()) { - VLOG(1) << "Partial CTC result " << result_[0].sentence; - } -} - -void AsrDecoder::AttentionRescoring() { - searcher_->FinalizeSearch(); - UpdateResult(true); - // No need to do rescoring - if (0.0 == opts_.rescoring_weight) { - return; - } - // Inputs() returns N-best input ids, which is the basic unit for rescoring - // In CtcPrefixBeamSearch, inputs are the same to outputs - const auto& hypotheses = searcher_->Inputs(); - int num_hyps = hypotheses.size(); - if (num_hyps <= 0) { - return; - } - - std::vector rescoring_score; - model_->AttentionRescoring(hypotheses, opts_.reverse_weight, - &rescoring_score); - - // Combine ctc score and rescoring score - for (size_t i = 0; i < num_hyps; ++i) { - result_[i].score = opts_.rescoring_weight * rescoring_score[i] + - opts_.ctc_weight * result_[i].score; - } - std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/asr_decoder.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/asr_decoder.h deleted file mode 100644 index df71f5b7bad7b2ffdc69bbd7ab11f576bed464d2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/asr_decoder.h +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_ASR_DECODER_H_ -#define DECODER_ASR_DECODER_H_ - -#include -#include -#include -#include - -#include "fst/fstlib.h" -#include "fst/symbol-table.h" - -#include "decoder/asr_model.h" -#include "decoder/context_graph.h" -#include "decoder/ctc_endpoint.h" -#include "decoder/ctc_prefix_beam_search.h" -#include "decoder/ctc_wfst_beam_search.h" -#include "decoder/search_interface.h" -#include "frontend/feature_pipeline.h" -#include "post_processor/post_processor.h" -#include "utils/utils.h" - -namespace wenet { - -struct DecodeOptions { - // chunk_size is the frame number of one chunk after subsampling. - // e.g. if subsample rate is 4 and chunk_size = 16, the frames in - // one chunk are 64 = 16*4 - int chunk_size = 16; - int num_left_chunks = -1; - - // final_score = rescoring_weight * rescoring_score + ctc_weight * ctc_score; - // rescoring_score = left_to_right_score * (1 - reverse_weight) + - // right_to_left_score * reverse_weight - // Please note the concept of ctc_scores in the following two search - // methods are different. 
- // For CtcPrefixBeamSearch, it's a sum(prefix) score + context score - // For CtcWfstBeamSearch, it's a max(viterbi) path score + context score - // So we should carefully set ctc_weight according to the search methods. - float ctc_weight = 0.5; - float rescoring_weight = 1.0; - float reverse_weight = 0.0; - CtcEndpointConfig ctc_endpoint_config; - CtcPrefixBeamSearchOptions ctc_prefix_search_opts; - CtcWfstBeamSearchOptions ctc_wfst_search_opts; -}; - -struct WordPiece { - std::string word; - int start = -1; - int end = -1; - - WordPiece(std::string word, int start, int end) - : word(std::move(word)), start(start), end(end) {} -}; - -struct DecodeResult { - float score = -kFloatMax; - std::string sentence; - std::vector word_pieces; - - static bool CompareFunc(const DecodeResult& a, const DecodeResult& b) { - return a.score > b.score; - } -}; - -enum DecodeState { - kEndBatch = 0x00, // End of current decoding batch, normal case - kEndpoint = 0x01, // Endpoint is detected - kEndFeats = 0x02, // All feature is decoded - kWaitFeats = 0x03 // Feat is not enough for one chunk inference, wait -}; - -// DecodeResource is thread safe, which can be shared for multiple -// decoding threads -struct DecodeResource { - std::shared_ptr model = nullptr; - std::shared_ptr symbol_table = nullptr; - std::shared_ptr> fst = nullptr; - std::shared_ptr unit_table = nullptr; - std::shared_ptr context_graph = nullptr; - std::shared_ptr post_processor = nullptr; -}; - -// Torch ASR decoder -class AsrDecoder { - public: - AsrDecoder(std::shared_ptr feature_pipeline, - std::shared_ptr resource, - const DecodeOptions& opts); - // @param block: if true, block when feature is not enough for one chunk - // inference. Otherwise, return kWaitFeats. - DecodeState Decode(bool block = true); - void Rescoring(); - void Reset(); - void ResetContinuousDecoding(); - bool DecodedSomething() const { - return !result_.empty() && !result_[0].sentence.empty(); - } - - // This method is used for time benchmark - int num_frames_in_current_chunk() const { - return num_frames_in_current_chunk_; - } - int frame_shift_in_ms() const { - return model_->subsampling_rate() * - feature_pipeline_->config().frame_shift * 1000 / - feature_pipeline_->config().sample_rate; - } - int feature_frame_shift_in_ms() const { - return feature_pipeline_->config().frame_shift * 1000 / - feature_pipeline_->config().sample_rate; - } - const std::vector& result() const { return result_; } - - private: - DecodeState AdvanceDecoding(bool block = true); - void AttentionRescoring(); - - void UpdateResult(bool finish = false); - - std::shared_ptr feature_pipeline_; - std::shared_ptr model_; - std::shared_ptr post_processor_; - - std::shared_ptr> fst_ = nullptr; - // output symbol table - std::shared_ptr symbol_table_; - // e2e unit symbol table - std::shared_ptr unit_table_ = nullptr; - const DecodeOptions& opts_; - // cache feature - bool start_ = false; - // For continuous decoding - int num_frames_ = 0; - int global_frame_offset_ = 0; - const int time_stamp_gap_ = 100; // timestamp gap between words in a sentence - - std::unique_ptr searcher_; - std::unique_ptr ctc_endpointer_; - - int num_frames_in_current_chunk_ = 0; - std::vector result_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(AsrDecoder); -}; - -} // namespace wenet - -#endif // DECODER_ASR_DECODER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/asr_model.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/asr_model.cc 
deleted file mode 100644 index 8c7b0fb1195cf07bac6c3ff1bb8cb0e187e977da..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/asr_model.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2022 Horizon Robotics. All Rights Reserved. -// Author: binbin.zhang@horizon.ai (Binbin Zhang) - -#include "decoder/asr_model.h" - -#include -#include - -namespace wenet { - -int AsrModel::num_frames_for_chunk(bool start) const { - int num_required_frames = 0; - if (chunk_size_ > 0) { - if (!start) { // First batch - int context = right_context_ + 1; // Add current frame - num_required_frames = (chunk_size_ - 1) * subsampling_rate_ + context; - } else { - num_required_frames = chunk_size_ * subsampling_rate_; - } - } else { - num_required_frames = std::numeric_limits::max(); - } - return num_required_frames; -} - -void AsrModel::CacheFeature( - const std::vector>& chunk_feats) { - // Cache feature for next chunk - const int cached_feature_size = 1 + right_context_ - subsampling_rate_; - if (chunk_feats.size() >= cached_feature_size) { - // TODO(Binbin Zhang): Only deal the case when - // chunk_feats.size() > cached_feature_size here, and it's consistent - // with our current model, refine it later if we have new model or - // new requirements - cached_feature_.resize(cached_feature_size); - for (int i = 0; i < cached_feature_size; ++i) { - cached_feature_[i] = - chunk_feats[chunk_feats.size() - cached_feature_size + i]; - } - } -} - -void AsrModel::ForwardEncoder( - const std::vector>& chunk_feats, - std::vector>* ctc_prob) { - ctc_prob->clear(); - int num_frames = cached_feature_.size() + chunk_feats.size(); - if (num_frames >= right_context_ + 1) { - this->ForwardEncoderFunc(chunk_feats, ctc_prob); - this->CacheFeature(chunk_feats); - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/asr_model.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/asr_model.h deleted file mode 100644 index d100dd818551014fa4769c1766bc3b1b626e8453..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/asr_model.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2022 Horizon Robotics. All Rights Reserved. -// Author: binbin.zhang@horizon.ai (Binbin Zhang) - -#ifndef DECODER_ASR_MODEL_H_ -#define DECODER_ASR_MODEL_H_ - -#include -#include -#include -#include - -#include "utils/timer.h" -#include "utils/utils.h" - -namespace wenet { - -class AsrModel { - public: - virtual int right_context() const { return right_context_; } - virtual int subsampling_rate() const { return subsampling_rate_; } - virtual int sos() const { return sos_; } - virtual int eos() const { return eos_; } - virtual bool is_bidirectional_decoder() const { - return is_bidirectional_decoder_; - } - virtual int offset() const { return offset_; } - - // If chunk_size > 0, streaming case. 
Otherwise, none streaming case - virtual void set_chunk_size(int chunk_size) { chunk_size_ = chunk_size; } - virtual void set_num_left_chunks(int num_left_chunks) { - num_left_chunks_ = num_left_chunks; - } - // start: if it is the start chunk of one sentence - virtual int num_frames_for_chunk(bool start) const; - - virtual void Reset() = 0; - - virtual void ForwardEncoder( - const std::vector>& chunk_feats, - std::vector>* ctc_prob); - - virtual void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) = 0; - - virtual std::shared_ptr Copy() const = 0; - - protected: - virtual void ForwardEncoderFunc( - const std::vector>& chunk_feats, - std::vector>* ctc_prob) = 0; - virtual void CacheFeature(const std::vector>& chunk_feats); - - int right_context_ = 1; - int subsampling_rate_ = 1; - int sos_ = 0; - int eos_ = 0; - bool is_bidirectional_decoder_ = false; - int chunk_size_ = 16; - int num_left_chunks_ = -1; // -1 means all left chunks - int offset_ = 0; - - std::vector> cached_feature_; -}; - -} // namespace wenet - -#endif // DECODER_ASR_MODEL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/context_graph.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/context_graph.cc deleted file mode 100644 index adc59c506de2afa7087815887295e4d8735d2a35..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/context_graph.cc +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "decoder/context_graph.h" - -#include - -#include "fst/determinize.h" - -#include "utils/string.h" -#include "utils/utils.h" - -namespace wenet { - -ContextGraph::ContextGraph(ContextConfig config) : config_(config) {} - -void ContextGraph::BuildContextGraph( - const std::vector& query_contexts, - const std::shared_ptr& symbol_table) { - CHECK(symbol_table != nullptr) << "Symbols table should not be nullptr!"; - start_tag_id_ = symbol_table->AddSymbol(""); - end_tag_id_ = symbol_table->AddSymbol(""); - symbol_table_ = symbol_table; - if (query_contexts.empty()) { - if (graph_ != nullptr) graph_.reset(); - return; - } - - std::unique_ptr ofst(new fst::StdVectorFst()); - // State 0 is the start state and the final state. - int start_state = ofst->AddState(); - ofst->SetStart(start_state); - ofst->SetFinal(start_state, fst::StdArc::Weight::One()); - - LOG(INFO) << "Contexts count size: " << query_contexts.size(); - int count = 0; - for (const auto& context : query_contexts) { - if (context.size() > config_.max_context_length) { - LOG(INFO) << "Skip long context: " << context; - continue; - } - if (++count > config_.max_contexts) break; - - std::vector words; - // Split context to words by symbol table, and build the context graph. 
- bool no_oov = SplitUTF8StringToWords(Trim(context), symbol_table, &words); - if (!no_oov) { - LOG(WARNING) << "Ignore unknown word found during compilation."; - continue; - } - - int prev_state = start_state; - int next_state = start_state; - float escape_score = 0; - for (size_t i = 0; i < words.size(); ++i) { - int word_id = symbol_table_->Find(words[i]); - float score = (i * config_.incremental_context_score - + config_.context_score) * UTF8StringLength(words[i]); - next_state = (i < words.size() - 1) ? ofst->AddState() : start_state; - ofst->AddArc(prev_state, - fst::StdArc(word_id, word_id, score, next_state)); - // Add escape arc to clean the previous context score. - if (i > 0) { - // ilabel and olabel of the escape arc is 0 (). - ofst->AddArc(prev_state, fst::StdArc(0, 0, -escape_score, start_state)); - } - prev_state = next_state; - escape_score += score; - } - } - std::unique_ptr det_fst(new fst::StdVectorFst()); - fst::Determinize(*ofst, det_fst.get()); - graph_ = std::move(det_fst); -} - -int ContextGraph::GetNextState(int cur_state, int word_id, float* score, - bool* is_start_boundary, bool* is_end_boundary) { - int next_state = 0; - for (fst::ArcIterator aiter(*graph_, cur_state); !aiter.Done(); - aiter.Next()) { - const fst::StdArc& arc = aiter.Value(); - if (arc.ilabel == 0) { - // escape score, will be overwritten when ilabel equals to word id. - *score = arc.weight.Value(); - } else if (arc.ilabel == word_id) { - next_state = arc.nextstate; - *score = arc.weight.Value(); - if (cur_state == 0) { - *is_start_boundary = true; - } - if (graph_->Final(arc.nextstate) == fst::StdArc::Weight::One()) { - *is_end_boundary = true; - } - break; - } - } - return next_state; -} - -bool ContextGraph::SplitUTF8StringToWords( - const std::string& str, - const std::shared_ptr& symbol_table, - std::vector* words) { - std::vector chars; - SplitUTF8StringToChars(Trim(str), &chars); - - bool no_oov = true; - for (size_t start = 0; start < chars.size();) { - for (size_t end = chars.size(); end > start; --end) { - std::string word; - for (size_t i = start; i < end; i++) { - word += chars[i]; - } - // Skip space. - if (word == " ") { - start = end; - continue; - } - // Add '▁' at the beginning of English word. - if (IsAlpha(word)) { - word = kSpaceSymbol + word; - } - - if (symbol_table->Find(word) != -1) { - words->emplace_back(word); - start = end; - continue; - } - if (end == start + 1) { - ++start; - no_oov = false; - LOG(WARNING) << word << " is oov."; - } - } - } - return no_oov; -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/context_graph.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/context_graph.h deleted file mode 100644 index 41b59206987cfe22d421f40506057830b6311f8e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/context_graph.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_CONTEXT_GRAPH_H_ -#define DECODER_CONTEXT_GRAPH_H_ - -#include -#include -#include - -#include "fst/compose.h" -#include "fst/fst.h" -#include "fst/vector-fst.h" - -namespace wenet { - -using StateId = fst::StdArc::StateId; - -struct ContextConfig { - int max_contexts = 5000; - int max_context_length = 100; - float context_score = 3.0; - float incremental_context_score = 0.0; -}; - -class ContextGraph { - public: - explicit ContextGraph(ContextConfig config); - void BuildContextGraph(const std::vector& query_context, - const std::shared_ptr& symbol_table); - int GetNextState(int cur_state, int word_id, float* score, - bool* is_start_boundary, bool* is_end_boundary); - - int start_tag_id() { return start_tag_id_; } - int end_tag_id() { return end_tag_id_; } - - private: - bool SplitUTF8StringToWords( - const std::string& str, - const std::shared_ptr& symbol_table, - std::vector* words); - - int start_tag_id_ = -1; - int end_tag_id_ = -1; - ContextConfig config_; - std::shared_ptr symbol_table_ = nullptr; - std::unique_ptr graph_ = nullptr; - DISALLOW_COPY_AND_ASSIGN(ContextGraph); -}; - -} // namespace wenet - -#endif // DECODER_CONTEXT_GRAPH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/ctc_endpoint.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/ctc_endpoint.cc deleted file mode 100644 index 4a64dd048f32401ab0dca468836cfac8be943d26..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/ctc_endpoint.cc +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "decoder/ctc_endpoint.h" - -#include - -#include -#include - -#include "utils/log.h" - -namespace wenet { - -CtcEndpoint::CtcEndpoint(const CtcEndpointConfig& config) : config_(config) { - Reset(); -} - -void CtcEndpoint::Reset() { - num_frames_decoded_ = 0; - num_frames_trailing_blank_ = 0; -} - -static bool RuleActivated(const CtcEndpointRule& rule, - const std::string& rule_name, bool decoded_sth, - int trailing_silence, int utterance_length) { - bool ans = (decoded_sth || !rule.must_decoded_sth) && - trailing_silence >= rule.min_trailing_silence && - utterance_length >= rule.min_utterance_length; - if (ans) { - VLOG(2) << "Endpointing rule " << rule_name - << " activated: " << (decoded_sth ? 
"true" : "false") << ',' - << trailing_silence << ',' << utterance_length; - } - return ans; -} - -bool CtcEndpoint::IsEndpoint( - const std::vector>& ctc_log_probs, - bool decoded_something) { - for (int t = 0; t < ctc_log_probs.size(); ++t) { - const auto& logp_t = ctc_log_probs[t]; - float blank_prob = expf(logp_t[config_.blank]); - - num_frames_decoded_++; - if (blank_prob > config_.blank_threshold) { - num_frames_trailing_blank_++; - } else { - num_frames_trailing_blank_ = 0; - } - } - CHECK_GE(num_frames_decoded_, num_frames_trailing_blank_); - CHECK_GT(frame_shift_in_ms_, 0); - int utterance_length = num_frames_decoded_ * frame_shift_in_ms_; - int trailing_silence = num_frames_trailing_blank_ * frame_shift_in_ms_; - if (RuleActivated(config_.rule1, "rule1", decoded_something, trailing_silence, - utterance_length)) - return true; - if (RuleActivated(config_.rule2, "rule2", decoded_something, trailing_silence, - utterance_length)) - return true; - if (RuleActivated(config_.rule3, "rule3", decoded_something, trailing_silence, - utterance_length)) - return true; - return false; -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/ctc_endpoint.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/ctc_endpoint.h deleted file mode 100644 index 56d9e08e7d3fab5562028e956f7b1d6ebac7b9e4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/ctc_endpoint.h +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_CTC_ENDPOINT_H_ -#define DECODER_CTC_ENDPOINT_H_ - -#include - -namespace wenet { - -struct CtcEndpointRule { - bool must_decoded_sth; - int min_trailing_silence; - int min_utterance_length; - - CtcEndpointRule(bool must_decoded_sth = true, int min_trailing_silence = 1000, - int min_utterance_length = 0) - : must_decoded_sth(must_decoded_sth), - min_trailing_silence(min_trailing_silence), - min_utterance_length(min_utterance_length) {} -}; - -struct CtcEndpointConfig { - /// We consider blank as silence for purposes of endpointing. - int blank = 0; // blank id - float blank_threshold = 0.8; // blank threshold to be silence - /// We support three rules. We terminate decoding if ANY of these rules - /// evaluates to "true". If you want to add more rules, do it by changing this - /// code. If you want to disable a rule, you can set the silence-timeout for - /// that rule to a very large number. - - /// rule1 times out after 5000 ms of silence, even if we decoded nothing. - CtcEndpointRule rule1; - /// rule2 times out after 1000 ms of silence after decoding something. - CtcEndpointRule rule2; - /// rule3 times out after the utterance is 20000 ms long, regardless of - /// anything else. 
- CtcEndpointRule rule3; - - CtcEndpointConfig() - : rule1(false, 5000, 0), rule2(true, 1000, 0), rule3(false, 0, 20000) {} -}; - -class CtcEndpoint { - public: - explicit CtcEndpoint(const CtcEndpointConfig& config); - - void Reset(); - /// This function returns true if this set of endpointing rules thinks we - /// should terminate decoding. - bool IsEndpoint(const std::vector>& ctc_log_probs, - bool decoded_something); - - void frame_shift_in_ms(int frame_shift_in_ms) { - frame_shift_in_ms_ = frame_shift_in_ms; - } - - private: - CtcEndpointConfig config_; - int frame_shift_in_ms_ = -1; - int num_frames_decoded_ = 0; - int num_frames_trailing_blank_ = 0; -}; - -} // namespace wenet - -#endif // DECODER_CTC_ENDPOINT_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/ctc_prefix_beam_search.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/ctc_prefix_beam_search.cc deleted file mode 100644 index 154c8864ba98255528a33a80a35b18eee8fa5dc9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/ctc_prefix_beam_search.cc +++ /dev/null @@ -1,235 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include "decoder/ctc_prefix_beam_search.h" - -#include -#include -#include -#include - -#include "utils/log.h" -#include "utils/utils.h" - -namespace wenet { - -CtcPrefixBeamSearch::CtcPrefixBeamSearch( - const CtcPrefixBeamSearchOptions& opts, - const std::shared_ptr& context_graph) - : opts_(opts), context_graph_(context_graph) { - Reset(); -} - -void CtcPrefixBeamSearch::Reset() { - hypotheses_.clear(); - likelihood_.clear(); - cur_hyps_.clear(); - viterbi_likelihood_.clear(); - times_.clear(); - outputs_.clear(); - abs_time_step_ = 0; - PrefixScore prefix_score; - prefix_score.s = 0.0; - prefix_score.ns = -kFloatMax; - prefix_score.v_s = 0.0; - prefix_score.v_ns = 0.0; - std::vector empty; - cur_hyps_[empty] = prefix_score; - outputs_.emplace_back(empty); - hypotheses_.emplace_back(empty); - likelihood_.emplace_back(prefix_score.total_score()); - times_.emplace_back(empty); -} - -static bool PrefixScoreCompare( - const std::pair, PrefixScore>& a, - const std::pair, PrefixScore>& b) { - return a.second.total_score() > b.second.total_score(); -} - -void CtcPrefixBeamSearch::UpdateOutputs( - const std::pair, PrefixScore>& prefix) { - const std::vector& input = prefix.first; - const std::vector& start_boundaries = prefix.second.start_boundaries; - const std::vector& end_boundaries = prefix.second.end_boundaries; - - std::vector output; - int s = 0; - int e = 0; - for (int i = 0; i < input.size(); ++i) { - if (s < start_boundaries.size() && i == start_boundaries[s]) { - output.emplace_back(context_graph_->start_tag_id()); - ++s; - } - output.emplace_back(input[i]); - if (e < end_boundaries.size() && i == end_boundaries[e]) { - output.emplace_back(context_graph_->end_tag_id()); - ++e; - } - } - outputs_.emplace_back(output); -} - -void CtcPrefixBeamSearch::UpdateHypotheses( - const std::vector, PrefixScore>>& hpys) { - cur_hyps_.clear(); - outputs_.clear(); - hypotheses_.clear(); - likelihood_.clear(); - viterbi_likelihood_.clear(); - times_.clear(); - for (auto& item : hpys) { - cur_hyps_[item.first] = item.second; - UpdateOutputs(item); - hypotheses_.emplace_back(std::move(item.first)); - likelihood_.emplace_back(item.second.total_score()); - viterbi_likelihood_.emplace_back(item.second.viterbi_score()); - times_.emplace_back(item.second.times()); - } -} - -// Please refer https://robin1001.github.io/2020/12/11/ctc-search -// for how CTC prefix beam search works, and there is a simple graph demo in -// it. -void CtcPrefixBeamSearch::Search(const std::vector>& logp) { - if (logp.size() == 0) return; - int first_beam_size = - std::min(static_cast(logp[0].size()), opts_.first_beam_size); - for (int t = 0; t < logp.size(); ++t, ++abs_time_step_) { - const std::vector& logp_t = logp[t]; - std::unordered_map, PrefixScore, PrefixHash> next_hyps; - // 1. First beam prune, only select topk candidates - std::vector topk_score; - std::vector topk_index; - TopK(logp_t, first_beam_size, &topk_score, &topk_index); - - // 2. Token passing - for (int i = 0; i < topk_index.size(); ++i) { - int id = topk_index[i]; - auto prob = topk_score[i]; - for (const auto& it : cur_hyps_) { - const std::vector& prefix = it.first; - const PrefixScore& prefix_score = it.second; - // If prefix doesn't exist in next_hyps, next_hyps[prefix] will insert - // PrefixScore(-inf, -inf) by default, since the default constructor - // of PrefixScore will set fields s(blank ending score) and - // ns(none blank ending score) to -inf, respectively. 
- if (id == opts_.blank) { - // Case 0: *a + ε => *a - PrefixScore& next_score = next_hyps[prefix]; - next_score.s = LogAdd(next_score.s, prefix_score.score() + prob); - next_score.v_s = prefix_score.viterbi_score() + prob; - next_score.times_s = prefix_score.times(); - // Prefix not changed, copy the context from prefix. - if (context_graph_ && !next_score.has_context) { - next_score.CopyContext(prefix_score); - next_score.has_context = true; - } - } else if (!prefix.empty() && id == prefix.back()) { - // Case 1: *a + a => *a - PrefixScore& next_score1 = next_hyps[prefix]; - next_score1.ns = LogAdd(next_score1.ns, prefix_score.ns + prob); - if (next_score1.v_ns < prefix_score.v_ns + prob) { - next_score1.v_ns = prefix_score.v_ns + prob; - if (next_score1.cur_token_prob < prob) { - next_score1.cur_token_prob = prob; - next_score1.times_ns = prefix_score.times_ns; - CHECK_GT(next_score1.times_ns.size(), 0); - next_score1.times_ns.back() = abs_time_step_; - } - } - if (context_graph_ && !next_score1.has_context) { - next_score1.CopyContext(prefix_score); - next_score1.has_context = true; - } - - // Case 2: *aε + a => *aa - std::vector new_prefix(prefix); - new_prefix.emplace_back(id); - PrefixScore& next_score2 = next_hyps[new_prefix]; - next_score2.ns = LogAdd(next_score2.ns, prefix_score.s + prob); - if (next_score2.v_ns < prefix_score.v_s + prob) { - next_score2.v_ns = prefix_score.v_s + prob; - next_score2.cur_token_prob = prob; - next_score2.times_ns = prefix_score.times_s; - next_score2.times_ns.emplace_back(abs_time_step_); - } - if (context_graph_ && !next_score2.has_context) { - // Prefix changed, calculate the context score. - next_score2.UpdateContext(context_graph_, prefix_score, id, - prefix.size()); - next_score2.has_context = true; - } - } else { - // Case 3: *a + b => *ab, *aε + b => *ab - std::vector new_prefix(prefix); - new_prefix.emplace_back(id); - PrefixScore& next_score = next_hyps[new_prefix]; - next_score.ns = LogAdd(next_score.ns, prefix_score.score() + prob); - if (next_score.v_ns < prefix_score.viterbi_score() + prob) { - next_score.v_ns = prefix_score.viterbi_score() + prob; - next_score.cur_token_prob = prob; - next_score.times_ns = prefix_score.times(); - next_score.times_ns.emplace_back(abs_time_step_); - } - if (context_graph_ && !next_score.has_context) { - // Calculate the context score. - next_score.UpdateContext(context_graph_, prefix_score, id, - prefix.size()); - next_score.has_context = true; - } - } - } - } - - // 3. Second beam prune, only keep top n best paths - std::vector, PrefixScore>> arr(next_hyps.begin(), - next_hyps.end()); - int second_beam_size = - std::min(static_cast(arr.size()), opts_.second_beam_size); - std::nth_element(arr.begin(), arr.begin() + second_beam_size, arr.end(), - PrefixScoreCompare); - arr.resize(second_beam_size); - std::sort(arr.begin(), arr.end(), PrefixScoreCompare); - - // 4. Update cur_hyps_ and get new result - UpdateHypotheses(arr); - } -} - -void CtcPrefixBeamSearch::FinalizeSearch() { UpdateFinalContext(); } - -void CtcPrefixBeamSearch::UpdateFinalContext() { - if (context_graph_ == nullptr) return; - CHECK_EQ(hypotheses_.size(), cur_hyps_.size()); - CHECK_EQ(hypotheses_.size(), likelihood_.size()); - // We should backoff the context score/state when the context is - // not fully matched at the last time. 
- for (const auto& prefix : hypotheses_) { - PrefixScore& prefix_score = cur_hyps_[prefix]; - if (prefix_score.context_state != 0) { - prefix_score.UpdateContext(context_graph_, prefix_score, 0, - prefix.size()); - } - } - std::vector, PrefixScore>> arr(cur_hyps_.begin(), - cur_hyps_.end()); - std::sort(arr.begin(), arr.end(), PrefixScoreCompare); - - // Update cur_hyps_ and get new result - UpdateHypotheses(arr); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/ctc_prefix_beam_search.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/ctc_prefix_beam_search.h deleted file mode 100644 index f44ec23c37af517c9e45140f89ef7346768f5d35..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/ctc_prefix_beam_search.h +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_CTC_PREFIX_BEAM_SEARCH_H_ -#define DECODER_CTC_PREFIX_BEAM_SEARCH_H_ - -#include -#include -#include -#include - -#include "decoder/context_graph.h" -#include "decoder/search_interface.h" -#include "utils/utils.h" - -namespace wenet { - -struct CtcPrefixBeamSearchOptions { - int blank = 0; // blank id - int first_beam_size = 10; - int second_beam_size = 10; -}; - -struct PrefixScore { - float s = -kFloatMax; // blank ending score - float ns = -kFloatMax; // none blank ending score - float v_s = -kFloatMax; // viterbi blank ending score - float v_ns = -kFloatMax; // viterbi none blank ending score - float cur_token_prob = -kFloatMax; // prob of current token - std::vector times_s; // times of viterbi blank path - std::vector times_ns; // times of viterbi none blank path - - float score() const { return LogAdd(s, ns); } - float viterbi_score() const { return v_s > v_ns ? v_s : v_ns; } - const std::vector& times() const { - return v_s > v_ns ? 
times_s : times_ns; - } - - bool has_context = false; - int context_state = 0; - float context_score = 0; - std::vector start_boundaries; - std::vector end_boundaries; - - void CopyContext(const PrefixScore& prefix_score) { - context_state = prefix_score.context_state; - context_score = prefix_score.context_score; - start_boundaries = prefix_score.start_boundaries; - end_boundaries = prefix_score.end_boundaries; - } - - void UpdateContext(const std::shared_ptr& context_graph, - const PrefixScore& prefix_score, int word_id, - int prefix_len) { - this->CopyContext(prefix_score); - - float score = 0; - bool is_start_boundary = false; - bool is_end_boundary = false; - - context_state = - context_graph->GetNextState(prefix_score.context_state, word_id, &score, - &is_start_boundary, &is_end_boundary); - context_score += score; - if (is_start_boundary) start_boundaries.emplace_back(prefix_len); - if (is_end_boundary) end_boundaries.emplace_back(prefix_len); - } - - float total_score() const { return score() + context_score; } -}; - -struct PrefixHash { - size_t operator()(const std::vector& prefix) const { - size_t hash_code = 0; - // here we use KB&DR hash code - for (int id : prefix) { - hash_code = id + 31 * hash_code; - } - return hash_code; - } -}; - -class CtcPrefixBeamSearch : public SearchInterface { - public: - explicit CtcPrefixBeamSearch( - const CtcPrefixBeamSearchOptions& opts, - const std::shared_ptr& context_graph = nullptr); - - void Search(const std::vector>& logp) override; - void Reset() override; - void FinalizeSearch() override; - SearchType Type() const override { return SearchType::kPrefixBeamSearch; } - void UpdateOutputs(const std::pair, PrefixScore>& prefix); - void UpdateHypotheses( - const std::vector, PrefixScore>>& hpys); - void UpdateFinalContext(); - - const std::vector& viterbi_likelihood() const { - return viterbi_likelihood_; - } - const std::vector>& Inputs() const override { - return hypotheses_; - } - const std::vector>& Outputs() const override { - return outputs_; - } - const std::vector& Likelihood() const override { return likelihood_; } - const std::vector>& Times() const override { return times_; } - - private: - int abs_time_step_ = 0; - - // N-best list and corresponding likelihood_, in sorted order - std::vector> hypotheses_; - std::vector likelihood_; - std::vector viterbi_likelihood_; - std::vector> times_; - - std::unordered_map, PrefixScore, PrefixHash> cur_hyps_; - std::shared_ptr context_graph_ = nullptr; - // Outputs contain the hypotheses_ and tags like: and - std::vector> outputs_; - const CtcPrefixBeamSearchOptions& opts_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(CtcPrefixBeamSearch); -}; - -} // namespace wenet - -#endif // DECODER_CTC_PREFIX_BEAM_SEARCH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/ctc_wfst_beam_search.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/ctc_wfst_beam_search.cc deleted file mode 100644 index 10e93f387e87b5f16fb7784d7060c50f227bf58e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/ctc_wfst_beam_search.cc +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "decoder/ctc_wfst_beam_search.h" - -#include - -namespace wenet { - -void DecodableTensorScaled::Reset() { - num_frames_ready_ = 0; - done_ = false; - // Give an empty initialization, will throw error when - // AcceptLoglikes is not called - logp_.clear(); -} - -void DecodableTensorScaled::AcceptLoglikes(const std::vector& logp) { - ++num_frames_ready_; - // TODO(Binbin Zhang): Avoid copy here - logp_ = logp; -} - -float DecodableTensorScaled::LogLikelihood(int32 frame, int32 index) { - CHECK_GT(index, 0); - CHECK_LT(frame, num_frames_ready_); - return scale_ * logp_[index - 1]; -} - -bool DecodableTensorScaled::IsLastFrame(int32 frame) const { - CHECK_LT(frame, num_frames_ready_); - return done_ && (frame == num_frames_ready_ - 1); -} - -int32 DecodableTensorScaled::NumIndices() const { - LOG(FATAL) << "Not implement"; - return 0; -} - -CtcWfstBeamSearch::CtcWfstBeamSearch( - const fst::Fst& fst, const CtcWfstBeamSearchOptions& opts, - const std::shared_ptr& context_graph) - : decodable_(opts.acoustic_scale), - decoder_(fst, opts, context_graph), - context_graph_(context_graph), - opts_(opts) { - Reset(); -} - -void CtcWfstBeamSearch::Reset() { - num_frames_ = 0; - decoded_frames_mapping_.clear(); - is_last_frame_blank_ = false; - last_best_ = 0; - inputs_.clear(); - outputs_.clear(); - likelihood_.clear(); - times_.clear(); - decodable_.Reset(); - decoder_.InitDecoding(); -} - -void CtcWfstBeamSearch::Search(const std::vector>& logp) { - if (0 == logp.size()) { - return; - } - // Every time we get the log posterior, we decode it all before return - for (int i = 0; i < logp.size(); i++) { - float blank_score = std::exp(logp[i][0]); - if (blank_score > opts_.blank_skip_thresh * opts_.blank_scale) { - VLOG(3) << "skipping frame " << num_frames_ << " score " << blank_score; - is_last_frame_blank_ = true; - last_frame_prob_ = logp[i]; - } else { - // Get the best symbol - int cur_best = - std::max_element(logp[i].begin(), logp[i].end()) - logp[i].begin(); - // Optional, adding one blank frame if we has skipped it in two same - // symbols - if (cur_best != 0 && is_last_frame_blank_ && cur_best == last_best_) { - decodable_.AcceptLoglikes(last_frame_prob_); - decoder_.AdvanceDecoding(&decodable_, 1); - decoded_frames_mapping_.push_back(num_frames_ - 1); - VLOG(2) << "Adding blank frame at symbol " << cur_best; - } - last_best_ = cur_best; - - decodable_.AcceptLoglikes(logp[i]); - decoder_.AdvanceDecoding(&decodable_, 1); - decoded_frames_mapping_.push_back(num_frames_); - is_last_frame_blank_ = false; - } - num_frames_++; - } - // Get the best path - inputs_.clear(); - outputs_.clear(); - likelihood_.clear(); - if (decoded_frames_mapping_.size() > 0) { - inputs_.resize(1); - outputs_.resize(1); - likelihood_.resize(1); - kaldi::Lattice lat; - decoder_.GetBestPath(&lat, false); - std::vector alignment; - kaldi::LatticeWeight weight; - fst::GetLinearSymbolSequence(lat, &alignment, &outputs_[0], &weight); - ConvertToInputs(alignment, &inputs_[0]); - RemoveContinuousTags(&outputs_[0]); - VLOG(3) << weight.Value1() << " " << weight.Value2(); - likelihood_[0] = 
-(weight.Value1() + weight.Value2()); - } -} - -void CtcWfstBeamSearch::FinalizeSearch() { - decodable_.SetFinish(); - decoder_.FinalizeDecoding(); - inputs_.clear(); - outputs_.clear(); - likelihood_.clear(); - times_.clear(); - if (decoded_frames_mapping_.size() > 0) { - std::vector nbest_lats; - if (opts_.nbest == 1) { - kaldi::Lattice lat; - decoder_.GetBestPath(&lat, true); - nbest_lats.push_back(std::move(lat)); - } else { - // Get N-best path by lattice(CompactLattice) - kaldi::CompactLattice clat; - decoder_.GetLattice(&clat, true); - kaldi::Lattice lat, nbest_lat; - fst::ConvertLattice(clat, &lat); - // TODO(Binbin Zhang): it's n-best word lists here, not character n-best - fst::ShortestPath(lat, &nbest_lat, opts_.nbest); - fst::ConvertNbestToVector(nbest_lat, &nbest_lats); - } - int nbest = nbest_lats.size(); - inputs_.resize(nbest); - outputs_.resize(nbest); - likelihood_.resize(nbest); - times_.resize(nbest); - for (int i = 0; i < nbest; i++) { - kaldi::LatticeWeight weight; - std::vector alignment; - fst::GetLinearSymbolSequence(nbest_lats[i], &alignment, &outputs_[i], - &weight); - ConvertToInputs(alignment, &inputs_[i], ×_[i]); - RemoveContinuousTags(&outputs_[i]); - likelihood_[i] = -(weight.Value1() + weight.Value2()); - } - } -} - -void CtcWfstBeamSearch::ConvertToInputs(const std::vector& alignment, - std::vector* input, - std::vector* time) { - input->clear(); - if (time != nullptr) time->clear(); - for (int cur = 0; cur < alignment.size(); ++cur) { - // ignore blank - if (alignment[cur] - 1 == 0) continue; - // merge continuous same label - if (cur > 0 && alignment[cur] == alignment[cur - 1]) continue; - - input->push_back(alignment[cur] - 1); - if (time != nullptr) { - time->push_back(decoded_frames_mapping_[cur]); - } - } -} - -void CtcWfstBeamSearch::RemoveContinuousTags(std::vector* output) { - if (context_graph_) { - for (auto it = output->begin(); it != output->end();) { - if (*it == context_graph_->start_tag_id() || - *it == context_graph_->end_tag_id()) { - if (it + 1 != output->end() && *it == *(it + 1)) { - it = output->erase(it); - continue; - } - } - ++it; - } - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/ctc_wfst_beam_search.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/ctc_wfst_beam_search.h deleted file mode 100644 index 204a0c8db1254035b7e3bd4a6e02b65d66b756f3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/ctc_wfst_beam_search.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#ifndef DECODER_CTC_WFST_BEAM_SEARCH_H_ -#define DECODER_CTC_WFST_BEAM_SEARCH_H_ - -#include -#include - -#include "decoder/context_graph.h" -#include "decoder/search_interface.h" -#include "kaldi/decoder/lattice-faster-online-decoder.h" -#include "utils/utils.h" - -namespace wenet { - -class DecodableTensorScaled : public kaldi::DecodableInterface { - public: - explicit DecodableTensorScaled(float scale = 1.0) : scale_(scale) { Reset(); } - - void Reset(); - int32 NumFramesReady() const override { return num_frames_ready_; } - bool IsLastFrame(int32 frame) const override; - float LogLikelihood(int32 frame, int32 index) override; - int32 NumIndices() const override; - void AcceptLoglikes(const std::vector& logp); - void SetFinish() { done_ = true; } - - private: - int num_frames_ready_ = 0; - float scale_ = 1.0; - bool done_ = false; - std::vector logp_; -}; - -// LatticeFasterDecoderConfig has the following key members -// beam: decoding beam -// max_active: Decoder max active states -// lattice_beam: Lattice generation beam -struct CtcWfstBeamSearchOptions : public kaldi::LatticeFasterDecoderConfig { - float acoustic_scale = 1.0; - float nbest = 10; - // When blank score is greater than this thresh, skip the frame in viterbi - // search - float blank_skip_thresh = 0.98; - float blank_scale = 1.0; -}; - -class CtcWfstBeamSearch : public SearchInterface { - public: - explicit CtcWfstBeamSearch( - const fst::Fst& fst, const CtcWfstBeamSearchOptions& opts, - const std::shared_ptr& context_graph); - void Search(const std::vector>& logp) override; - void Reset() override; - void FinalizeSearch() override; - SearchType Type() const override { return SearchType::kWfstBeamSearch; } - // For CTC prefix beam search, both inputs and outputs are hypotheses_ - const std::vector>& Inputs() const override { - return inputs_; - } - const std::vector>& Outputs() const override { - return outputs_; - } - const std::vector& Likelihood() const override { return likelihood_; } - const std::vector>& Times() const override { return times_; } - - private: - // Sub one and remove - void ConvertToInputs(const std::vector& alignment, - std::vector* input, - std::vector* time = nullptr); - void RemoveContinuousTags(std::vector* output); - - int num_frames_ = 0; - std::vector decoded_frames_mapping_; - - int last_best_ = 0; // last none blank best id - std::vector last_frame_prob_; - bool is_last_frame_blank_ = false; - std::vector> inputs_, outputs_; - std::vector likelihood_; - std::vector> times_; - DecodableTensorScaled decodable_; - kaldi::LatticeFasterOnlineDecoder decoder_; - std::shared_ptr context_graph_; - const CtcWfstBeamSearchOptions& opts_; -}; - -} // namespace wenet - -#endif // DECODER_CTC_WFST_BEAM_SEARCH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/onnx_asr_model.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/onnx_asr_model.cc deleted file mode 100644 index fc7afc704febbde3b7e350e392dc46763c453e74..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/onnx_asr_model.cc +++ /dev/null @@ -1,430 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 ZeXuan Li (lizexuan@huya.com) -// Xingchen Song(sxc19@mails.tsinghua.edu.cn) -// hamddct@gmail.com (Mddct) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "decoder/onnx_asr_model.h" - -#include -#include -#include - -#include "utils/string.h" - -namespace wenet { - -Ort::Env OnnxAsrModel::env_ = Ort::Env(ORT_LOGGING_LEVEL_WARNING, ""); -Ort::SessionOptions OnnxAsrModel::session_options_ = Ort::SessionOptions(); - -void OnnxAsrModel::InitEngineThreads(int num_threads) { - session_options_.SetIntraOpNumThreads(num_threads); -} - -void OnnxAsrModel::GetInputOutputInfo( - const std::shared_ptr& session, - std::vector* in_names, std::vector* out_names) { - Ort::AllocatorWithDefaultOptions allocator; - // Input info - int num_nodes = session->GetInputCount(); - in_names->resize(num_nodes); - for (int i = 0; i < num_nodes; ++i) { - char* name = session->GetInputName(i, allocator); - Ort::TypeInfo type_info = session->GetInputTypeInfo(i); - auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); - ONNXTensorElementDataType type = tensor_info.GetElementType(); - std::vector node_dims = tensor_info.GetShape(); - std::stringstream shape; - for (auto j : node_dims) { - shape << j; - shape << " "; - } - LOG(INFO) << "\tInput " << i << " : name=" << name << " type=" << type - << " dims=" << shape.str(); - (*in_names)[i] = name; - } - // Output info - num_nodes = session->GetOutputCount(); - out_names->resize(num_nodes); - for (int i = 0; i < num_nodes; ++i) { - char* name = session->GetOutputName(i, allocator); - Ort::TypeInfo type_info = session->GetOutputTypeInfo(i); - auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); - ONNXTensorElementDataType type = tensor_info.GetElementType(); - std::vector node_dims = tensor_info.GetShape(); - std::stringstream shape; - for (auto j : node_dims) { - shape << j; - shape << " "; - } - LOG(INFO) << "\tOutput " << i << " : name=" << name << " type=" << type - << " dims=" << shape.str(); - (*out_names)[i] = name; - } -} - -void OnnxAsrModel::Read(const std::string& model_dir) { - std::string encoder_onnx_path = model_dir + "/encoder.onnx"; - std::string rescore_onnx_path = model_dir + "/decoder.onnx"; - std::string ctc_onnx_path = model_dir + "/ctc.onnx"; - - // 1. Load sessions - try { -#ifdef _MSC_VER - encoder_session_ = std::make_shared( - env_, ToWString(encoder_onnx_path).c_str(), session_options_); - rescore_session_ = std::make_shared( - env_, ToWString(rescore_onnx_path).c_str(), session_options_); - ctc_session_ = std::make_shared( - env_, ToWString(ctc_onnx_path).c_str(), session_options_); -#else - encoder_session_ = std::make_shared( - env_, encoder_onnx_path.c_str(), session_options_); - rescore_session_ = std::make_shared( - env_, rescore_onnx_path.c_str(), session_options_); - ctc_session_ = std::make_shared(env_, ctc_onnx_path.c_str(), - session_options_); -#endif - } catch (std::exception const& e) { - LOG(ERROR) << "error when load onnx model: " << e.what(); - exit(0); - } - - // 2. 
Read metadata - auto model_metadata = encoder_session_->GetModelMetadata(); - - Ort::AllocatorWithDefaultOptions allocator; - encoder_output_size_ = - atoi(model_metadata.LookupCustomMetadataMap("output_size", allocator)); - num_blocks_ = - atoi(model_metadata.LookupCustomMetadataMap("num_blocks", allocator)); - head_ = atoi(model_metadata.LookupCustomMetadataMap("head", allocator)); - cnn_module_kernel_ = atoi( - model_metadata.LookupCustomMetadataMap("cnn_module_kernel", allocator)); - subsampling_rate_ = atoi( - model_metadata.LookupCustomMetadataMap("subsampling_rate", allocator)); - right_context_ = - atoi(model_metadata.LookupCustomMetadataMap("right_context", allocator)); - sos_ = atoi(model_metadata.LookupCustomMetadataMap("sos_symbol", allocator)); - eos_ = atoi(model_metadata.LookupCustomMetadataMap("eos_symbol", allocator)); - is_bidirectional_decoder_ = atoi(model_metadata.LookupCustomMetadataMap( - "is_bidirectional_decoder", allocator)); - chunk_size_ = - atoi(model_metadata.LookupCustomMetadataMap("chunk_size", allocator)); - num_left_chunks_ = - atoi(model_metadata.LookupCustomMetadataMap("left_chunks", allocator)); - - LOG(INFO) << "Onnx Model Info:"; - LOG(INFO) << "\tencoder_output_size " << encoder_output_size_; - LOG(INFO) << "\tnum_blocks " << num_blocks_; - LOG(INFO) << "\thead " << head_; - LOG(INFO) << "\tcnn_module_kernel " << cnn_module_kernel_; - LOG(INFO) << "\tsubsampling_rate " << subsampling_rate_; - LOG(INFO) << "\tright_context " << right_context_; - LOG(INFO) << "\tsos " << sos_; - LOG(INFO) << "\teos " << eos_; - LOG(INFO) << "\tis bidirectional decoder " << is_bidirectional_decoder_; - LOG(INFO) << "\tchunk_size " << chunk_size_; - LOG(INFO) << "\tnum_left_chunks " << num_left_chunks_; - - // 3. Read model nodes - LOG(INFO) << "Onnx Encoder:"; - GetInputOutputInfo(encoder_session_, &encoder_in_names_, &encoder_out_names_); - LOG(INFO) << "Onnx CTC:"; - GetInputOutputInfo(ctc_session_, &ctc_in_names_, &ctc_out_names_); - LOG(INFO) << "Onnx Rescore:"; - GetInputOutputInfo(rescore_session_, &rescore_in_names_, &rescore_out_names_); -} - -OnnxAsrModel::OnnxAsrModel(const OnnxAsrModel& other) { - // metadatas - encoder_output_size_ = other.encoder_output_size_; - num_blocks_ = other.num_blocks_; - head_ = other.head_; - cnn_module_kernel_ = other.cnn_module_kernel_; - right_context_ = other.right_context_; - subsampling_rate_ = other.subsampling_rate_; - sos_ = other.sos_; - eos_ = other.eos_; - is_bidirectional_decoder_ = other.is_bidirectional_decoder_; - chunk_size_ = other.chunk_size_; - num_left_chunks_ = other.num_left_chunks_; - offset_ = other.offset_; - - // sessions - encoder_session_ = other.encoder_session_; - ctc_session_ = other.ctc_session_; - rescore_session_ = other.rescore_session_; - - // node names - encoder_in_names_ = other.encoder_in_names_; - encoder_out_names_ = other.encoder_out_names_; - ctc_in_names_ = other.ctc_in_names_; - ctc_out_names_ = other.ctc_out_names_; - rescore_in_names_ = other.rescore_in_names_; - rescore_out_names_ = other.rescore_out_names_; -} - -std::shared_ptr OnnxAsrModel::Copy() const { - auto asr_model = std::make_shared(*this); - // Reset the inner states for new decoding - asr_model->Reset(); - return asr_model; -} - -void OnnxAsrModel::Reset() { - offset_ = 0; - encoder_outs_.clear(); - cached_feature_.clear(); - // Reset att_cache - Ort::MemoryInfo memory_info = - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); - if (num_left_chunks_ > 0) { - int required_cache_size = chunk_size_ * 
num_left_chunks_; - offset_ = required_cache_size; - att_cache_.resize(num_blocks_ * head_ * required_cache_size * - encoder_output_size_ / head_ * 2, - 0.0); - const int64_t att_cache_shape[] = {num_blocks_, head_, required_cache_size, - encoder_output_size_ / head_ * 2}; - att_cache_ort_ = Ort::Value::CreateTensor( - memory_info, att_cache_.data(), att_cache_.size(), att_cache_shape, 4); - } else { - att_cache_.resize(0, 0.0); - const int64_t att_cache_shape[] = {num_blocks_, head_, 0, - encoder_output_size_ / head_ * 2}; - att_cache_ort_ = Ort::Value::CreateTensor( - memory_info, att_cache_.data(), att_cache_.size(), att_cache_shape, 4); - } - - // Reset cnn_cache - cnn_cache_.resize( - num_blocks_ * encoder_output_size_ * (cnn_module_kernel_ - 1), 0.0); - const int64_t cnn_cache_shape[] = {num_blocks_, 1, encoder_output_size_, - cnn_module_kernel_ - 1}; - cnn_cache_ort_ = Ort::Value::CreateTensor( - memory_info, cnn_cache_.data(), cnn_cache_.size(), cnn_cache_shape, 4); -} - -void OnnxAsrModel::ForwardEncoderFunc( - const std::vector>& chunk_feats, - std::vector>* out_prob) { - Ort::MemoryInfo memory_info = - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); - // 1. Prepare onnx required data, splice cached_feature_ and chunk_feats - // chunk - int num_frames = cached_feature_.size() + chunk_feats.size(); - const int feature_dim = chunk_feats[0].size(); - std::vector feats; - for (size_t i = 0; i < cached_feature_.size(); ++i) { - feats.insert(feats.end(), cached_feature_[i].begin(), - cached_feature_[i].end()); - } - for (size_t i = 0; i < chunk_feats.size(); ++i) { - feats.insert(feats.end(), chunk_feats[i].begin(), chunk_feats[i].end()); - } - const int64_t feats_shape[3] = {1, num_frames, feature_dim}; - Ort::Value feats_ort = Ort::Value::CreateTensor( - memory_info, feats.data(), feats.size(), feats_shape, 3); - // offset - int64_t offset_int64 = static_cast(offset_); - Ort::Value offset_ort = Ort::Value::CreateTensor( - memory_info, &offset_int64, 1, std::vector{}.data(), 0); - // required_cache_size - int64_t required_cache_size = chunk_size_ * num_left_chunks_; - Ort::Value required_cache_size_ort = Ort::Value::CreateTensor( - memory_info, &required_cache_size, 1, std::vector{}.data(), 0); - // att_mask - Ort::Value att_mask_ort{nullptr}; - std::vector att_mask(required_cache_size + chunk_size_, 1); - if (num_left_chunks_ > 0) { - int chunk_idx = offset_ / chunk_size_ - num_left_chunks_; - if (chunk_idx < num_left_chunks_) { - for (int i = 0; i < (num_left_chunks_ - chunk_idx) * chunk_size_; ++i) { - att_mask[i] = 0; - } - } - const int64_t att_mask_shape[] = {1, 1, required_cache_size + chunk_size_}; - att_mask_ort = Ort::Value::CreateTensor( - memory_info, reinterpret_cast(att_mask.data()), att_mask.size(), - att_mask_shape, 3); - } - - // 2. 
Encoder chunk forward - std::vector inputs; - for (auto name : encoder_in_names_) { - if (!strcmp(name, "chunk")) { - inputs.emplace_back(std::move(feats_ort)); - } else if (!strcmp(name, "offset")) { - inputs.emplace_back(std::move(offset_ort)); - } else if (!strcmp(name, "required_cache_size")) { - inputs.emplace_back(std::move(required_cache_size_ort)); - } else if (!strcmp(name, "att_cache")) { - inputs.emplace_back(std::move(att_cache_ort_)); - } else if (!strcmp(name, "cnn_cache")) { - inputs.emplace_back(std::move(cnn_cache_ort_)); - } else if (!strcmp(name, "att_mask")) { - inputs.emplace_back(std::move(att_mask_ort)); - } - } - - std::vector ort_outputs = encoder_session_->Run( - Ort::RunOptions{nullptr}, encoder_in_names_.data(), inputs.data(), - inputs.size(), encoder_out_names_.data(), encoder_out_names_.size()); - - offset_ += static_cast( - ort_outputs[0].GetTensorTypeAndShapeInfo().GetShape()[1]); - att_cache_ort_ = std::move(ort_outputs[1]); - cnn_cache_ort_ = std::move(ort_outputs[2]); - - std::vector ctc_inputs; - ctc_inputs.emplace_back(std::move(ort_outputs[0])); - - std::vector ctc_ort_outputs = ctc_session_->Run( - Ort::RunOptions{nullptr}, ctc_in_names_.data(), ctc_inputs.data(), - ctc_inputs.size(), ctc_out_names_.data(), ctc_out_names_.size()); - encoder_outs_.push_back(std::move(ctc_inputs[0])); - - float* logp_data = ctc_ort_outputs[0].GetTensorMutableData(); - auto type_info = ctc_ort_outputs[0].GetTensorTypeAndShapeInfo(); - - int num_outputs = type_info.GetShape()[1]; - int output_dim = type_info.GetShape()[2]; - out_prob->resize(num_outputs); - for (int i = 0; i < num_outputs; i++) { - (*out_prob)[i].resize(output_dim); - memcpy((*out_prob)[i].data(), logp_data + i * output_dim, - sizeof(float) * output_dim); - } -} - -float OnnxAsrModel::ComputeAttentionScore(const float* prob, - const std::vector& hyp, int eos, - int decode_out_len) { - float score = 0.0f; - for (size_t j = 0; j < hyp.size(); ++j) { - score += *(prob + j * decode_out_len + hyp[j]); - } - score += *(prob + hyp.size() * decode_out_len + eos); - return score; -} - -void OnnxAsrModel::AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) { - Ort::MemoryInfo memory_info = - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); - CHECK(rescoring_score != nullptr); - int num_hyps = hyps.size(); - rescoring_score->resize(num_hyps, 0.0f); - - if (num_hyps == 0) { - return; - } - // No encoder output - if (encoder_outs_.size() == 0) { - return; - } - - std::vector hyps_lens; - int max_hyps_len = 0; - for (size_t i = 0; i < num_hyps; ++i) { - int length = hyps[i].size() + 1; - max_hyps_len = std::max(length, max_hyps_len); - hyps_lens.emplace_back(static_cast(length)); - } - - std::vector rescore_input; - int encoder_len = 0; - for (int i = 0; i < encoder_outs_.size(); i++) { - float* encoder_outs_data = encoder_outs_[i].GetTensorMutableData(); - auto type_info = encoder_outs_[i].GetTensorTypeAndShapeInfo(); - for (int j = 0; j < type_info.GetElementCount(); j++) { - rescore_input.emplace_back(encoder_outs_data[j]); - } - encoder_len += type_info.GetShape()[1]; - } - - const int64_t decode_input_shape[] = {1, encoder_len, encoder_output_size_}; - - std::vector hyps_pad; - - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - hyps_pad.emplace_back(sos_); - size_t j = 0; - for (; j < hyp.size(); ++j) { - hyps_pad.emplace_back(hyp[j]); - } - if (j == max_hyps_len - 1) { - continue; - } - for (; j < max_hyps_len - 1; ++j) { 
- hyps_pad.emplace_back(0); - } - } - - const int64_t hyps_pad_shape[] = {num_hyps, max_hyps_len}; - - const int64_t hyps_lens_shape[] = {num_hyps}; - - Ort::Value decode_input_tensor_ = Ort::Value::CreateTensor( - memory_info, rescore_input.data(), rescore_input.size(), - decode_input_shape, 3); - Ort::Value hyps_pad_tensor_ = Ort::Value::CreateTensor( - memory_info, hyps_pad.data(), hyps_pad.size(), hyps_pad_shape, 2); - Ort::Value hyps_lens_tensor_ = Ort::Value::CreateTensor( - memory_info, hyps_lens.data(), hyps_lens.size(), hyps_lens_shape, 1); - - std::vector rescore_inputs; - - rescore_inputs.emplace_back(std::move(hyps_pad_tensor_)); - rescore_inputs.emplace_back(std::move(hyps_lens_tensor_)); - rescore_inputs.emplace_back(std::move(decode_input_tensor_)); - - std::vector rescore_outputs = rescore_session_->Run( - Ort::RunOptions{nullptr}, rescore_in_names_.data(), rescore_inputs.data(), - rescore_inputs.size(), rescore_out_names_.data(), - rescore_out_names_.size()); - - float* decoder_outs_data = rescore_outputs[0].GetTensorMutableData(); - float* r_decoder_outs_data = rescore_outputs[1].GetTensorMutableData(); - - auto type_info = rescore_outputs[0].GetTensorTypeAndShapeInfo(); - int decode_out_len = type_info.GetShape()[2]; - - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - float score = 0.0f; - // left to right decoder score - score = ComputeAttentionScore( - decoder_outs_data + max_hyps_len * decode_out_len * i, hyp, eos_, - decode_out_len); - // Optional: Used for right to left score - float r_score = 0.0f; - if (is_bidirectional_decoder_ && reverse_weight > 0) { - std::vector r_hyp(hyp.size()); - std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin()); - // right to left decoder score - r_score = ComputeAttentionScore( - r_decoder_outs_data + max_hyps_len * decode_out_len * i, r_hyp, eos_, - decode_out_len); - } - // combined left-to-right and right-to-left score - (*rescoring_score)[i] = - score * (1 - reverse_weight) + r_score * reverse_weight; - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/onnx_asr_model.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/onnx_asr_model.h deleted file mode 100644 index f5d9e9a0c61d728f2fb6d45d1428234abae98c90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/onnx_asr_model.h +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 ZeXuan Li (lizexuan@huya.com) -// Xingchen Song(sxc19@mails.tsinghua.edu.cn) -// hamddct@gmail.com (Mddct) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef DECODER_ONNX_ASR_MODEL_H_ -#define DECODER_ONNX_ASR_MODEL_H_ - -#include -#include -#include - -#include "onnxruntime_cxx_api.h" // NOLINT - -#include "decoder/asr_model.h" -#include "utils/log.h" -#include "utils/utils.h" - -namespace wenet { - -class OnnxAsrModel : public AsrModel { - public: - static void InitEngineThreads(int num_threads = 1); - - public: - OnnxAsrModel() = default; - OnnxAsrModel(const OnnxAsrModel& other); - void Read(const std::string& model_dir); - void Reset() override; - void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) override; - std::shared_ptr Copy() const override; - void GetInputOutputInfo(const std::shared_ptr& session, - std::vector* in_names, - std::vector* out_names); - - protected: - void ForwardEncoderFunc(const std::vector>& chunk_feats, - std::vector>* ctc_prob) override; - - float ComputeAttentionScore(const float* prob, const std::vector& hyp, - int eos, int decode_out_len); - - private: - int encoder_output_size_ = 0; - int num_blocks_ = 0; - int cnn_module_kernel_ = 0; - int head_ = 0; - - // sessions - // NOTE(Mddct): The Env holds the logging state used by all other objects. - // One Env must be created before using any other Onnxruntime functionality. - static Ort::Env env_; // shared environment across threads. - static Ort::SessionOptions session_options_; - std::shared_ptr encoder_session_ = nullptr; - std::shared_ptr rescore_session_ = nullptr; - std::shared_ptr ctc_session_ = nullptr; - - // node names - std::vector encoder_in_names_, encoder_out_names_; - std::vector ctc_in_names_, ctc_out_names_; - std::vector rescore_in_names_, rescore_out_names_; - - // caches - Ort::Value att_cache_ort_{nullptr}; - Ort::Value cnn_cache_ort_{nullptr}; - std::vector encoder_outs_; - // NOTE: Instead of making a copy of the xx_cache, ONNX only maintains - // its data pointer when initializing xx_cache_ort (see https://github.com/ - // microsoft/onnxruntime/blob/master/onnxruntime/core/framework - // /tensor.cc#L102-L129), so we need the following variables to keep - // our data "alive" during the lifetime of decoder. - std::vector att_cache_; - std::vector cnn_cache_; -}; - -} // namespace wenet - -#endif // DECODER_ONNX_ASR_MODEL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/params.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/params.h deleted file mode 100644 index 3edc877f1bb6d876ca087cab8e4ed00d42e97e63..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/params.h +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef DECODER_PARAMS_H_ -#define DECODER_PARAMS_H_ - -#include -#include -#include -#include - -#include "decoder/asr_decoder.h" -#ifdef USE_ONNX -#include "decoder/onnx_asr_model.h" -#endif -#ifdef USE_TORCH -#include "decoder/torch_asr_model.h" -#endif -#ifdef USE_XPU -#include "xpu/xpu_asr_model.h" -#endif -#ifdef USE_BPU -#include "bpu/bpu_asr_model.h" -#endif -#include "frontend/feature_pipeline.h" -#include "post_processor/post_processor.h" -#include "utils/flags.h" -#include "utils/string.h" - -DEFINE_int32(device_id, 0, "set XPU DeviceID for ASR model"); - -// TorchAsrModel flags -DEFINE_string(model_path, "", "pytorch exported model path"); -// OnnxAsrModel flags -DEFINE_string(onnx_dir, "", "directory where the onnx model is saved"); -// XPUAsrModel flags -DEFINE_string(xpu_model_dir, "", - "directory where the XPU model and weights is saved"); -// BPUAsrModel flags -DEFINE_string(bpu_model_dir, "", - "directory where the HORIZON BPU model is saved"); - -// FeaturePipelineConfig flags -DEFINE_int32(num_bins, 80, "num mel bins for fbank feature"); -DEFINE_int32(sample_rate, 16000, "sample rate for audio"); - -// TLG fst -DEFINE_string(fst_path, "", "TLG fst path"); - -// DecodeOptions flags -DEFINE_int32(chunk_size, 16, "decoding chunk size"); -DEFINE_int32(num_left_chunks, -1, "left chunks in decoding"); -DEFINE_double(ctc_weight, 0.5, - "ctc weight when combining ctc score and rescoring score"); -DEFINE_double(rescoring_weight, 1.0, - "rescoring weight when combining ctc score and rescoring score"); -DEFINE_double(reverse_weight, 0.0, - "used for bitransformer rescoring. it must be 0.0 if decoder is" - "conventional transformer decoder, and only reverse_weight > 0.0" - "dose the right to left decoder will be calculated and used"); -DEFINE_int32(max_active, 7000, "max active states in ctc wfst search"); -DEFINE_int32(min_active, 200, "min active states in ctc wfst search"); -DEFINE_double(beam, 16.0, "beam in ctc wfst search"); -DEFINE_double(lattice_beam, 10.0, "lattice beam in ctc wfst search"); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale for ctc wfst search"); -DEFINE_double(blank_skip_thresh, 1.0, - "blank skip thresh for ctc wfst search, 1.0 means no skip"); -DEFINE_double(blank_scale, 1.0, "blank scale for ctc wfst search"); -DEFINE_double(length_penalty, 0.0, - "length penalty ctc wfst search, will not" - "apply on self-loop arc, for balancing the del/ins ratio, " - "suggest set to -3.0"); -DEFINE_int32(nbest, 10, "nbest for ctc wfst or prefix search"); - -// SymbolTable flags -DEFINE_string(dict_path, "", - "dict symbol table path, required when LM is enabled"); -DEFINE_string(unit_path, "", - "e2e model unit symbol table, it is used in both " - "with/without LM scenarios for context/timestamp"); - -// Context flags -DEFINE_string(context_path, "", "context path, is used to build context graph"); -DEFINE_double(context_score, 3.0, "is used to rescore the decoded result"); - -// PostProcessOptions flags -DEFINE_int32(language_type, 0, - "remove spaces according to language type" - "0x00 = kMandarinEnglish, " - "0x01 = kIndoEuropean"); -DEFINE_bool(lowercase, true, "lowercase final result if needed"); - -namespace wenet { -std::shared_ptr InitFeaturePipelineConfigFromFlags() { - auto feature_config = std::make_shared( - FLAGS_num_bins, FLAGS_sample_rate); - return feature_config; -} - -std::shared_ptr InitDecodeOptionsFromFlags() { - auto decode_config = std::make_shared(); - decode_config->chunk_size = FLAGS_chunk_size; - decode_config->num_left_chunks = 
FLAGS_num_left_chunks; - decode_config->ctc_weight = FLAGS_ctc_weight; - decode_config->reverse_weight = FLAGS_reverse_weight; - decode_config->rescoring_weight = FLAGS_rescoring_weight; - decode_config->ctc_wfst_search_opts.max_active = FLAGS_max_active; - decode_config->ctc_wfst_search_opts.min_active = FLAGS_min_active; - decode_config->ctc_wfst_search_opts.beam = FLAGS_beam; - decode_config->ctc_wfst_search_opts.lattice_beam = FLAGS_lattice_beam; - decode_config->ctc_wfst_search_opts.acoustic_scale = FLAGS_acoustic_scale; - decode_config->ctc_wfst_search_opts.blank_skip_thresh = - FLAGS_blank_skip_thresh; - decode_config->ctc_wfst_search_opts.blank_scale = FLAGS_blank_scale; - decode_config->ctc_wfst_search_opts.length_penalty = FLAGS_length_penalty; - decode_config->ctc_wfst_search_opts.nbest = FLAGS_nbest; - decode_config->ctc_prefix_search_opts.first_beam_size = FLAGS_nbest; - decode_config->ctc_prefix_search_opts.second_beam_size = FLAGS_nbest; - return decode_config; -} - -std::shared_ptr InitDecodeResourceFromFlags() { - auto resource = std::make_shared(); - const int kNumGemmThreads = 1; - if (!FLAGS_onnx_dir.empty()) { -#ifdef USE_ONNX - LOG(INFO) << "Reading onnx model "; - OnnxAsrModel::InitEngineThreads(kNumGemmThreads); - auto model = std::make_shared(); - model->Read(FLAGS_onnx_dir); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DONNX=ON'."; -#endif - } else if (!FLAGS_model_path.empty()) { -#ifdef USE_TORCH - LOG(INFO) << "Reading torch model " << FLAGS_model_path; - TorchAsrModel::InitEngineThreads(kNumGemmThreads); - auto model = std::make_shared(); - model->Read(FLAGS_model_path); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DTORCH=ON'."; -#endif - } else if (!FLAGS_xpu_model_dir.empty()) { -#ifdef USE_XPU - LOG(INFO) << "Reading XPU WeNet model weight from " << FLAGS_xpu_model_dir; - auto model = std::make_shared(); - model->SetEngineThreads(kNumGemmThreads); - model->SetDeviceId(FLAGS_device_id); - model->Read(FLAGS_xpu_model_dir); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DXPU=ON'."; -#endif - } else if (!FLAGS_bpu_model_dir.empty()) { -#ifdef USE_BPU - LOG(INFO) << "Reading Horizon BPU model from " << FLAGS_bpu_model_dir; - auto model = std::make_shared(); - model->Read(FLAGS_bpu_model_dir); - resource->model = model; -#else - LOG(FATAL) << "Please rebuild with cmake options '-DBPU=ON'."; -#endif - } else { - LOG(FATAL) << "Please set ONNX, TORCH, XPU or BPU model path!!!"; - } - - LOG(INFO) << "Reading unit table " << FLAGS_unit_path; - auto unit_table = std::shared_ptr( - fst::SymbolTable::ReadText(FLAGS_unit_path)); - CHECK(unit_table != nullptr); - resource->unit_table = unit_table; - - if (!FLAGS_fst_path.empty()) { // With LM - CHECK(!FLAGS_dict_path.empty()); - LOG(INFO) << "Reading fst " << FLAGS_fst_path; - auto fst = std::shared_ptr>( - fst::Fst::Read(FLAGS_fst_path)); - CHECK(fst != nullptr); - resource->fst = fst; - - LOG(INFO) << "Reading symbol table " << FLAGS_dict_path; - auto symbol_table = std::shared_ptr( - fst::SymbolTable::ReadText(FLAGS_dict_path)); - CHECK(symbol_table != nullptr); - resource->symbol_table = symbol_table; - } else { // Without LM, symbol_table is the same as unit_table - resource->symbol_table = unit_table; - } - - if (!FLAGS_context_path.empty()) { - LOG(INFO) << "Reading context " << FLAGS_context_path; - std::vector contexts; - std::ifstream infile(FLAGS_context_path); - std::string context; - 
while (getline(infile, context)) { - contexts.emplace_back(Trim(context)); - } - ContextConfig config; - config.context_score = FLAGS_context_score; - resource->context_graph = std::make_shared(config); - resource->context_graph->BuildContextGraph(contexts, - resource->symbol_table); - } - - PostProcessOptions post_process_opts; - post_process_opts.language_type = - FLAGS_language_type == 0 ? kMandarinEnglish : kIndoEuropean; - post_process_opts.lowercase = FLAGS_lowercase; - resource->post_processor = - std::make_shared(std::move(post_process_opts)); - return resource; -} - -} // namespace wenet - -#endif // DECODER_PARAMS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/search_interface.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/search_interface.h deleted file mode 100644 index 25bad26705f8be44561d2c686f50a63035b14bbf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/search_interface.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef DECODER_SEARCH_INTERFACE_H_ -#define DECODER_SEARCH_INTERFACE_H_ - -namespace wenet { - -#include - -enum SearchType { - kPrefixBeamSearch = 0x00, - kWfstBeamSearch = 0x01, -}; - -class SearchInterface { - public: - virtual ~SearchInterface() {} - virtual void Search(const std::vector>& logp) = 0; - virtual void Reset() = 0; - virtual void FinalizeSearch() = 0; - - virtual SearchType Type() const = 0; - // N-best inputs id - virtual const std::vector>& Inputs() const = 0; - // N-best outputs id - virtual const std::vector>& Outputs() const = 0; - // N-best likelihood - virtual const std::vector& Likelihood() const = 0; - // N-best timestamp - virtual const std::vector>& Times() const = 0; -}; - -} // namespace wenet - -#endif // DECODER_SEARCH_INTERFACE_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/torch_asr_model.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/torch_asr_model.cc deleted file mode 100644 index 3abca283e12f5c173c9511707229ea82b31f26d8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/torch_asr_model.cc +++ /dev/null @@ -1,278 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "decoder/torch_asr_model.h" - -#include -#include -#include -#include - -#include "torch/script.h" -#ifndef IOS -#include "torch/torch.h" -#endif - -namespace wenet { - -#ifndef IOS -void TorchAsrModel::InitEngineThreads(int num_threads) { - // For multi-thread performance - at::set_num_threads(num_threads); - VLOG(1) << "Num intra-op threads: " << at::get_num_threads(); -} -#endif - -void TorchAsrModel::Read(const std::string& model_path) { - torch::DeviceType device = at::kCPU; -#ifdef USE_GPU - if (!torch::cuda::is_available()) { - VLOG(1) << "CUDA is not available! Please check your GPU settings"; - throw std::runtime_error("CUDA is not available!"); - } else { - VLOG(1) << "CUDA available! Running on GPU"; - device = at::kCUDA; - } -#endif - torch::jit::script::Module model = torch::jit::load(model_path, device); - model_ = std::make_shared(std::move(model)); - torch::NoGradGuard no_grad; - model_->eval(); - torch::jit::IValue o1 = model_->run_method("subsampling_rate"); - CHECK_EQ(o1.isInt(), true); - subsampling_rate_ = o1.toInt(); - torch::jit::IValue o2 = model_->run_method("right_context"); - CHECK_EQ(o2.isInt(), true); - right_context_ = o2.toInt(); - torch::jit::IValue o3 = model_->run_method("sos_symbol"); - CHECK_EQ(o3.isInt(), true); - sos_ = o3.toInt(); - torch::jit::IValue o4 = model_->run_method("eos_symbol"); - CHECK_EQ(o4.isInt(), true); - eos_ = o4.toInt(); - torch::jit::IValue o5 = model_->run_method("is_bidirectional_decoder"); - CHECK_EQ(o5.isBool(), true); - is_bidirectional_decoder_ = o5.toBool(); - - VLOG(1) << "Torch Model Info:"; - VLOG(1) << "\tsubsampling_rate " << subsampling_rate_; - VLOG(1) << "\tright context " << right_context_; - VLOG(1) << "\tsos " << sos_; - VLOG(1) << "\teos " << eos_; - VLOG(1) << "\tis bidirectional decoder " << is_bidirectional_decoder_; -} - -TorchAsrModel::TorchAsrModel(const TorchAsrModel& other) { - // 1. Init the model info - right_context_ = other.right_context_; - subsampling_rate_ = other.subsampling_rate_; - sos_ = other.sos_; - eos_ = other.eos_; - is_bidirectional_decoder_ = other.is_bidirectional_decoder_; - chunk_size_ = other.chunk_size_; - num_left_chunks_ = other.num_left_chunks_; - offset_ = other.offset_; - // 2. Model copy, just copy the model ptr since: - // PyTorch allows using multiple CPU threads during TorchScript model - // inference, please see https://pytorch.org/docs/stable/notes/cpu_ - // threading_torchscript_inference.html - model_ = other.model_; - - // NOTE(Binbin Zhang): - // inner states for forward are not copied here. -} - -std::shared_ptr TorchAsrModel::Copy() const { - auto asr_model = std::make_shared(*this); - // Reset the inner states for new decoding - asr_model->Reset(); - return asr_model; -} - -void TorchAsrModel::Reset() { - offset_ = 0; - att_cache_ = std::move(torch::zeros({0, 0, 0, 0})); - cnn_cache_ = std::move(torch::zeros({0, 0, 0, 0})); - encoder_outs_.clear(); - cached_feature_.clear(); -} - -void TorchAsrModel::ForwardEncoderFunc( - const std::vector>& chunk_feats, - std::vector>* out_prob) { - // 1. Prepare libtorch required data, splice cached_feature_ and chunk_feats - // The first dimension is for batchsize, which is 1. 
- int num_frames = cached_feature_.size() + chunk_feats.size(); - const int feature_dim = chunk_feats[0].size(); - torch::Tensor feats = - torch::zeros({1, num_frames, feature_dim}, torch::kFloat); - for (size_t i = 0; i < cached_feature_.size(); ++i) { - torch::Tensor row = - torch::from_blob(const_cast(cached_feature_[i].data()), - {feature_dim}, torch::kFloat) - .clone(); - feats[0][i] = std::move(row); - } - for (size_t i = 0; i < chunk_feats.size(); ++i) { - torch::Tensor row = - torch::from_blob(const_cast(chunk_feats[i].data()), - {feature_dim}, torch::kFloat) - .clone(); - feats[0][cached_feature_.size() + i] = std::move(row); - } - - // 2. Encoder chunk forward -#ifdef USE_GPU - feats = feats.to(at::kCUDA); - att_cache_ = att_cache_.to(at::kCUDA); - cnn_cache_ = cnn_cache_.to(at::kCUDA); -#endif - int required_cache_size = chunk_size_ * num_left_chunks_; - torch::NoGradGuard no_grad; - std::vector inputs = {feats, offset_, required_cache_size, - att_cache_, cnn_cache_}; - - // Refer interfaces in wenet/transformer/asr_model.py - auto outputs = - model_->get_method("forward_encoder_chunk")(inputs).toTuple()->elements(); - CHECK_EQ(outputs.size(), 3); -#ifdef USE_GPU - torch::Tensor chunk_out = outputs[0].toTensor().to(at::kCPU); - att_cache_ = outputs[1].toTensor().to(at::kCPU); - cnn_cache_ = outputs[2].toTensor().to(at::kCPU); -#else - torch::Tensor chunk_out = outputs[0].toTensor(); - att_cache_ = outputs[1].toTensor(); - cnn_cache_ = outputs[2].toTensor(); -#endif - offset_ += chunk_out.size(1); - - // The first dimension of returned value is for batchsize, which is 1 -#ifdef USE_GPU - chunk_out = chunk_out.to(at::kCUDA); - torch::Tensor ctc_log_probs = - model_->run_method("ctc_activation", chunk_out).toTensor(); - ctc_log_probs = ctc_log_probs.to(at::kCPU)[0]; - encoder_outs_.push_back(std::move(chunk_out.to(at::kCPU))); -#else - torch::Tensor ctc_log_probs = - model_->run_method("ctc_activation", chunk_out).toTensor()[0]; - encoder_outs_.push_back(std::move(chunk_out)); -#endif - - // Copy to output - int num_outputs = ctc_log_probs.size(0); - int output_dim = ctc_log_probs.size(1); - out_prob->resize(num_outputs); - for (int i = 0; i < num_outputs; i++) { - (*out_prob)[i].resize(output_dim); - memcpy((*out_prob)[i].data(), ctc_log_probs[i].data_ptr(), - sizeof(float) * output_dim); - } -} - -float TorchAsrModel::ComputeAttentionScore(const torch::Tensor& prob, - const std::vector& hyp, - int eos) { - float score = 0.0f; - auto accessor = prob.accessor(); - for (size_t j = 0; j < hyp.size(); ++j) { - score += accessor[j][hyp[j]]; - } - score += accessor[hyp.size()][eos]; - return score; -} - -void TorchAsrModel::AttentionRescoring( - const std::vector>& hyps, float reverse_weight, - std::vector* rescoring_score) { - CHECK(rescoring_score != nullptr); - int num_hyps = hyps.size(); - rescoring_score->resize(num_hyps, 0.0f); - - if (num_hyps == 0) { - return; - } - // No encoder output - if (encoder_outs_.size() == 0) { - return; - } - - torch::NoGradGuard no_grad; - // Step 1: Prepare input for libtorch - torch::Tensor hyps_length = torch::zeros({num_hyps}, torch::kLong); - int max_hyps_len = 0; - for (size_t i = 0; i < num_hyps; ++i) { - int length = hyps[i].size() + 1; - max_hyps_len = std::max(length, max_hyps_len); - hyps_length[i] = static_cast(length); - } - torch::Tensor hyps_tensor = - torch::zeros({num_hyps, max_hyps_len}, torch::kLong); - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - hyps_tensor[i][0] = sos_; - for (size_t j = 0; j < 
hyp.size(); ++j) { - hyps_tensor[i][j + 1] = hyp[j]; - } - } - - // Step 2: Forward attention decoder by hyps and corresponding encoder_outs_ - torch::Tensor encoder_out = torch::cat(encoder_outs_, 1); -#ifdef USE_GPU - hyps_tensor = hyps_tensor.to(at::kCUDA); - hyps_length = hyps_length.to(at::kCUDA); - encoder_out = encoder_out.to(at::kCUDA); -#endif - auto outputs = model_ - ->run_method("forward_attention_decoder", hyps_tensor, - hyps_length, encoder_out, reverse_weight) - .toTuple() - ->elements(); -#ifdef USE_GPU - auto probs = outputs[0].toTensor().to(at::kCPU); - auto r_probs = outputs[1].toTensor().to(at::kCPU); -#else - auto probs = outputs[0].toTensor(); - auto r_probs = outputs[1].toTensor(); -#endif - CHECK_EQ(probs.size(0), num_hyps); - CHECK_EQ(probs.size(1), max_hyps_len); - - // Step 3: Compute rescoring score - for (size_t i = 0; i < num_hyps; ++i) { - const std::vector& hyp = hyps[i]; - float score = 0.0f; - // left-to-right decoder score - score = ComputeAttentionScore(probs[i], hyp, eos_); - // Optional: Used for right to left score - float r_score = 0.0f; - if (is_bidirectional_decoder_ && reverse_weight > 0) { - // right-to-left score - CHECK_EQ(r_probs.size(0), num_hyps); - CHECK_EQ(r_probs.size(1), max_hyps_len); - std::vector r_hyp(hyp.size()); - std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin()); - // right to left decoder score - r_score = ComputeAttentionScore(r_probs[i], r_hyp, eos_); - } - - // combined left-to-right and right-to-left score - (*rescoring_score)[i] = - score * (1 - reverse_weight) + r_score * reverse_weight; - } -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/torch_asr_model.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/torch_asr_model.h deleted file mode 100644 index a3cebe08798f1cad60ca4cd73c7b2488173b6114..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/decoder/torch_asr_model.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) -// 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef DECODER_TORCH_ASR_MODEL_H_ -#define DECODER_TORCH_ASR_MODEL_H_ - -#include -#include -#include - -#include "torch/script.h" -#ifndef IOS -#include "torch/torch.h" -#endif - -#include "decoder/asr_model.h" -#include "utils/utils.h" - -namespace wenet { - -class TorchAsrModel : public AsrModel { - public: -#ifndef IOS - static void InitEngineThreads(int num_threads = 1); -#endif - - public: - using TorchModule = torch::jit::script::Module; - TorchAsrModel() = default; - TorchAsrModel(const TorchAsrModel& other); - void Read(const std::string& model_path); - std::shared_ptr torch_model() const { return model_; } - void Reset() override; - void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) override; - std::shared_ptr Copy() const override; - - protected: - void ForwardEncoderFunc(const std::vector>& chunk_feats, - std::vector>* ctc_prob) override; - - float ComputeAttentionScore(const torch::Tensor& prob, - const std::vector& hyp, int eos); - - private: - std::shared_ptr model_ = nullptr; - std::vector encoder_outs_; - // transformer/conformer attention cache - torch::Tensor att_cache_ = torch::zeros({0, 0, 0, 0}); - // conformer-only conv_module cache - torch::Tensor cnn_cache_ = torch::zeros({0, 0, 0, 0}); -}; - -} // namespace wenet - -#endif // DECODER_TORCH_ASR_MODEL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/frontend/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/frontend/CMakeLists.txt deleted file mode 100644 index 78872257e43bb9a6ffcedaae977bf0173817ae50..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/frontend/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -add_library(frontend STATIC - feature_pipeline.cc - fft.cc -) -target_link_libraries(frontend PUBLIC utils) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/frontend/fbank.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/frontend/fbank.h deleted file mode 100644 index 5a650dc035b8e244388cc1f2e0b9512654de7fda..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/frontend/fbank.h +++ /dev/null @@ -1,218 +0,0 @@ -// Copyright (c) 2017 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef FRONTEND_FBANK_H_ -#define FRONTEND_FBANK_H_ - -#include -#include -#include -#include -#include - -#include "frontend/fft.h" -#include "utils/log.h" - -namespace wenet { - -// This code is based on kaldi Fbank implementation, please see -// https://github.com/kaldi-asr/kaldi/blob/master/src/feat/feature-fbank.cc -class Fbank { - public: - Fbank(int num_bins, int sample_rate, int frame_length, int frame_shift) - : num_bins_(num_bins), - sample_rate_(sample_rate), - frame_length_(frame_length), - frame_shift_(frame_shift), - use_log_(true), - remove_dc_offset_(true), - generator_(0), - distribution_(0, 1.0), - dither_(0.0) { - fft_points_ = UpperPowerOfTwo(frame_length_); - // generate bit reversal table and trigonometric function table - const int fft_points_4 = fft_points_ / 4; - bitrev_.resize(fft_points_); - sintbl_.resize(fft_points_ + fft_points_4); - make_sintbl(fft_points_, sintbl_.data()); - make_bitrev(fft_points_, bitrev_.data()); - - int num_fft_bins = fft_points_ / 2; - float fft_bin_width = static_cast(sample_rate_) / fft_points_; - int low_freq = 20, high_freq = sample_rate_ / 2; - float mel_low_freq = MelScale(low_freq); - float mel_high_freq = MelScale(high_freq); - float mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1); - bins_.resize(num_bins_); - center_freqs_.resize(num_bins_); - for (int bin = 0; bin < num_bins; ++bin) { - float left_mel = mel_low_freq + bin * mel_freq_delta, - center_mel = mel_low_freq + (bin + 1) * mel_freq_delta, - right_mel = mel_low_freq + (bin + 2) * mel_freq_delta; - center_freqs_[bin] = InverseMelScale(center_mel); - std::vector this_bin(num_fft_bins); - int first_index = -1, last_index = -1; - for (int i = 0; i < num_fft_bins; ++i) { - float freq = (fft_bin_width * i); // Center frequency of this fft - // bin. 
- float mel = MelScale(freq); - if (mel > left_mel && mel < right_mel) { - float weight; - if (mel <= center_mel) - weight = (mel - left_mel) / (center_mel - left_mel); - else - weight = (right_mel - mel) / (right_mel - center_mel); - this_bin[i] = weight; - if (first_index == -1) first_index = i; - last_index = i; - } - } - CHECK(first_index != -1 && last_index >= first_index); - bins_[bin].first = first_index; - int size = last_index + 1 - first_index; - bins_[bin].second.resize(size); - for (int i = 0; i < size; ++i) { - bins_[bin].second[i] = this_bin[first_index + i]; - } - } - - // povey window - povey_window_.resize(frame_length_); - double a = M_2PI / (frame_length - 1); - for (int i = 0; i < frame_length; ++i) { - povey_window_[i] = pow(0.5 - 0.5 * cos(a * i), 0.85); - } - } - - void set_use_log(bool use_log) { use_log_ = use_log; } - - void set_remove_dc_offset(bool remove_dc_offset) { - remove_dc_offset_ = remove_dc_offset; - } - - void set_dither(float dither) { dither_ = dither; } - - int num_bins() const { return num_bins_; } - - static inline float InverseMelScale(float mel_freq) { - return 700.0f * (expf(mel_freq / 1127.0f) - 1.0f); - } - - static inline float MelScale(float freq) { - return 1127.0f * logf(1.0f + freq / 700.0f); - } - - static int UpperPowerOfTwo(int n) { - return static_cast(pow(2, ceil(log(n) / log(2)))); - } - - // pre emphasis - void PreEmphasis(float coeff, std::vector* data) const { - if (coeff == 0.0) return; - for (int i = data->size() - 1; i > 0; i--) - (*data)[i] -= coeff * (*data)[i - 1]; - (*data)[0] -= coeff * (*data)[0]; - } - - // Apply povey window on data in place - void Povey(std::vector* data) const { - CHECK_GE(data->size(), povey_window_.size()); - for (size_t i = 0; i < povey_window_.size(); ++i) { - (*data)[i] *= povey_window_[i]; - } - } - - // Compute fbank feat, return num frames - int Compute(const std::vector& wave, - std::vector>* feat) { - int num_samples = wave.size(); - if (num_samples < frame_length_) return 0; - int num_frames = 1 + ((num_samples - frame_length_) / frame_shift_); - feat->resize(num_frames); - std::vector fft_real(fft_points_, 0), fft_img(fft_points_, 0); - std::vector power(fft_points_ / 2); - for (int i = 0; i < num_frames; ++i) { - std::vector data(wave.data() + i * frame_shift_, - wave.data() + i * frame_shift_ + frame_length_); - // optional add noise - if (dither_ != 0.0) { - for (size_t j = 0; j < data.size(); ++j) - data[j] += dither_ * distribution_(generator_); - } - // optinal remove dc offset - if (remove_dc_offset_) { - float mean = 0.0; - for (size_t j = 0; j < data.size(); ++j) mean += data[j]; - mean /= data.size(); - for (size_t j = 0; j < data.size(); ++j) data[j] -= mean; - } - - PreEmphasis(0.97, &data); - Povey(&data); - // copy data to fft_real - memset(fft_img.data(), 0, sizeof(float) * fft_points_); - memset(fft_real.data() + frame_length_, 0, - sizeof(float) * (fft_points_ - frame_length_)); - memcpy(fft_real.data(), data.data(), sizeof(float) * frame_length_); - fft(bitrev_.data(), sintbl_.data(), fft_real.data(), fft_img.data(), - fft_points_); - // power - for (int j = 0; j < fft_points_ / 2; ++j) { - power[j] = fft_real[j] * fft_real[j] + fft_img[j] * fft_img[j]; - } - - (*feat)[i].resize(num_bins_); - // cepstral coefficients, triangle filter array - for (int j = 0; j < num_bins_; ++j) { - float mel_energy = 0.0; - int s = bins_[j].first; - for (size_t k = 0; k < bins_[j].second.size(); ++k) { - mel_energy += bins_[j].second[k] * power[s + k]; - } - // optional use log - if 
(use_log_) { - if (mel_energy < std::numeric_limits::epsilon()) - mel_energy = std::numeric_limits::epsilon(); - mel_energy = logf(mel_energy); - } - - (*feat)[i][j] = mel_energy; - } - } - return num_frames; - } - - private: - int num_bins_; - int sample_rate_; - int frame_length_, frame_shift_; - int fft_points_; - bool use_log_; - bool remove_dc_offset_; - std::vector center_freqs_; - std::vector>> bins_; - std::vector povey_window_; - std::default_random_engine generator_; - std::normal_distribution distribution_; - float dither_; - - // bit reversal table - std::vector bitrev_; - // trigonometric function table - std::vector sintbl_; -}; - -} // namespace wenet - -#endif // FRONTEND_FBANK_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/frontend/feature_pipeline.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/frontend/feature_pipeline.cc deleted file mode 100644 index ab450b15cd35ebd8101a3bcdec4f963a73bed10c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/frontend/feature_pipeline.cc +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2017 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "frontend/feature_pipeline.h" - -#include -#include - -namespace wenet { - -FeaturePipeline::FeaturePipeline(const FeaturePipelineConfig& config) - : config_(config), - feature_dim_(config.num_bins), - fbank_(config.num_bins, config.sample_rate, config.frame_length, - config.frame_shift), - num_frames_(0), - input_finished_(false) {} - -void FeaturePipeline::AcceptWaveform(const float* pcm, const int size) { - std::vector> feats; - std::vector waves; - waves.insert(waves.end(), remained_wav_.begin(), remained_wav_.end()); - waves.insert(waves.end(), pcm, pcm + size); - int num_frames = fbank_.Compute(waves, &feats); - feature_queue_.Push(std::move(feats)); - num_frames_ += num_frames; - - int left_samples = waves.size() - config_.frame_shift * num_frames; - remained_wav_.resize(left_samples); - std::copy(waves.begin() + config_.frame_shift * num_frames, waves.end(), - remained_wav_.begin()); - // We are still adding wave, notify input is not finished - finish_condition_.notify_one(); -} - -void FeaturePipeline::AcceptWaveform(const int16_t* pcm, const int size) { - auto* float_pcm = new float[size]; - for (size_t i = 0; i < size; i++) { - float_pcm[i] = static_cast(pcm[i]); - } - this->AcceptWaveform(float_pcm, size); - delete[] float_pcm; -} - -void FeaturePipeline::set_input_finished() { - CHECK(!input_finished_); - { - std::lock_guard lock(mutex_); - input_finished_ = true; - } - finish_condition_.notify_one(); -} - -bool FeaturePipeline::ReadOne(std::vector* feat) { - if (!feature_queue_.Empty()) { - *feat = std::move(feature_queue_.Pop()); - return true; - } else { - std::unique_lock lock(mutex_); - while (!input_finished_) { - // This will release the lock and wait for notify_one() - // from AcceptWaveform() or set_input_finished() - finish_condition_.wait(lock); - if (!feature_queue_.Empty()) { - *feat = std::move(feature_queue_.Pop()); - return true; - } - } - CHECK(input_finished_); - // Double check queue.empty, see issue#893 for detailed discussions. - if (!feature_queue_.Empty()) { - *feat = std::move(feature_queue_.Pop()); - return true; - } else { - return false; - } - } -} - -bool FeaturePipeline::Read(int num_frames, - std::vector>* feats) { - feats->clear(); - if (feature_queue_.Size() >= num_frames) { - *feats = std::move(feature_queue_.Pop(num_frames)); - return true; - } else { - std::unique_lock lock(mutex_); - while (!input_finished_) { - // This will release the lock and wait for notify_one() - // from AcceptWaveform() or set_input_finished() - finish_condition_.wait(lock); - if (feature_queue_.Size() >= num_frames) { - *feats = std::move(feature_queue_.Pop(num_frames)); - return true; - } - } - CHECK(input_finished_); - // Double check queue.empty, see issue#893 for detailed discussions. 
- if (feature_queue_.Size() >= num_frames) { - *feats = std::move(feature_queue_.Pop(num_frames)); - return true; - } else { - *feats = std::move(feature_queue_.Pop(feature_queue_.Size())); - return false; - } - } -} - -void FeaturePipeline::Reset() { - input_finished_ = false; - num_frames_ = 0; - remained_wav_.clear(); - feature_queue_.Clear(); -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/frontend/feature_pipeline.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/frontend/feature_pipeline.h deleted file mode 100644 index 9918d6b573255795e0e665f0a9598c44be625c19..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/frontend/feature_pipeline.h +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) 2017 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef FRONTEND_FEATURE_PIPELINE_H_ -#define FRONTEND_FEATURE_PIPELINE_H_ - -#include -#include -#include -#include - -#include "frontend/fbank.h" -#include "utils/blocking_queue.h" -#include "utils/log.h" - -namespace wenet { - -struct FeaturePipelineConfig { - int num_bins; - int sample_rate; - int frame_length; - int frame_shift; - FeaturePipelineConfig(int num_bins, int sample_rate) - : num_bins(num_bins), // 80 dim fbank - sample_rate(sample_rate) { // 16k sample rate - frame_length = sample_rate / 1000 * 25; // frame length 25ms - frame_shift = sample_rate / 1000 * 10; // frame shift 10ms - } - - void Info() const { - LOG(INFO) << "feature pipeline config" - << " num_bins " << num_bins << " frame_length " << frame_length - << " frame_shift " << frame_shift; - } -}; - -// Typically, FeaturePipeline is used in two threads: one thread A calls -// AcceptWaveform() to add raw wav data and set_input_finished() to notice -// the end of input wav, another thread B (decoder thread) calls Read() to -// consume features.So a BlockingQueue is used to make this class thread safe. - -// The Read() is designed as a blocking method when there is no feature -// in feature_queue_ and the input is not finished. - -// See bin/decoder_main.cc, websocket/websocket_server.cc and -// decoder/torch_asr_decoder.cc for usage - -class FeaturePipeline { - public: - explicit FeaturePipeline(const FeaturePipelineConfig& config); - - // The feature extraction is done in AcceptWaveform(). - void AcceptWaveform(const float* pcm, const int size); - void AcceptWaveform(const int16_t* pcm, const int size); - - // Current extracted frames number. - int num_frames() const { return num_frames_; } - int feature_dim() const { return feature_dim_; } - const FeaturePipelineConfig& config() const { return config_; } - - // The caller should call this method when speech input is end. - // Never call AcceptWaveform() after calling set_input_finished() ! 
- void set_input_finished(); - bool input_finished() const { return input_finished_; } - - // Return False if input is finished and no feature could be read. - // Return True if a feature is read. - // This function is a blocking method. It will block the thread when - // there is no feature in feature_queue_ and the input is not finished. - bool ReadOne(std::vector* feat); - - // Read #num_frames frame features. - // Return False if less than #num_frames features are read and the - // input is finished. - // Return True if #num_frames features are read. - // This function is a blocking method when there is no feature - // in feature_queue_ and the input is not finished. - bool Read(int num_frames, std::vector>* feats); - - void Reset(); - bool IsLastFrame(int frame) const { - return input_finished_ && (frame == num_frames_ - 1); - } - - int NumQueuedFrames() const { return feature_queue_.Size(); } - - private: - const FeaturePipelineConfig& config_; - int feature_dim_; - Fbank fbank_; - - BlockingQueue> feature_queue_; - int num_frames_; - bool input_finished_; - - // The feature extraction is done in AcceptWaveform(). - // This waveform sample points are consumed by frame size. - // The residual waveform sample points after framing are - // kept to be used in next AcceptWaveform() calling. - std::vector remained_wav_; - - // Used to block the Read when there is no feature in feature_queue_ - // and the input is not finished. - mutable std::mutex mutex_; - std::condition_variable finish_condition_; -}; - -} // namespace wenet - -#endif // FRONTEND_FEATURE_PIPELINE_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/frontend/fft.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/frontend/fft.cc deleted file mode 100644 index 9e05f854e79ea733d0411045385e924c2670b7f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/frontend/fft.cc +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright (c) 2016 Network -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include -#include -#include - -#include "frontend/fft.h" - -namespace wenet { - -void make_sintbl(int n, float* sintbl) { - int i, n2, n4, n8; - float c, s, dc, ds, t; - - n2 = n / 2; - n4 = n / 4; - n8 = n / 8; - t = sin(M_PI / n); - dc = 2 * t * t; - ds = sqrt(dc * (2 - dc)); - t = 2 * dc; - c = sintbl[n4] = 1; - s = sintbl[0] = 0; - for (i = 1; i < n8; ++i) { - c -= dc; - dc += t * c; - s += ds; - ds -= t * s; - sintbl[i] = s; - sintbl[n4 - i] = c; - } - if (n8 != 0) sintbl[n8] = sqrt(0.5); - for (i = 0; i < n4; ++i) sintbl[n2 - i] = sintbl[i]; - for (i = 0; i < n2 + n4; ++i) sintbl[i + n2] = -sintbl[i]; -} - -void make_bitrev(int n, int* bitrev) { - int i, j, k, n2; - - n2 = n / 2; - i = j = 0; - for (;;) { - bitrev[i] = j; - if (++i >= n) break; - k = n2; - while (k <= j) { - j -= k; - k /= 2; - } - j += k; - } -} - -// bitrev: bit reversal table -// sintbl: trigonometric function table -// x:real part -// y:image part -// n: fft length -int fft(const int* bitrev, const float* sintbl, float* x, float* y, int n) { - int i, j, k, ik, h, d, k2, n4, inverse; - float t, s, c, dx, dy; - - /* preparation */ - if (n < 0) { - n = -n; - inverse = 1; /* inverse transform */ - } else { - inverse = 0; - } - n4 = n / 4; - if (n == 0) { - return 0; - } - - /* bit reversal */ - for (i = 0; i < n; ++i) { - j = bitrev[i]; - if (i < j) { - t = x[i]; - x[i] = x[j]; - x[j] = t; - t = y[i]; - y[i] = y[j]; - y[j] = t; - } - } - - /* transformation */ - for (k = 1; k < n; k = k2) { - h = 0; - k2 = k + k; - d = n / k2; - for (j = 0; j < k; ++j) { - c = sintbl[h + n4]; - if (inverse) - s = -sintbl[h]; - else - s = sintbl[h]; - for (i = j; i < n; i += k2) { - ik = i + k; - dx = s * y[ik] + c * x[ik]; - dy = c * y[ik] - s * x[ik]; - x[ik] = x[i] - dx; - x[i] += dx; - y[ik] = y[i] - dy; - y[i] += dy; - } - h += d; - } - } - if (inverse) { - /* divide by n in case of the inverse transformation */ - for (i = 0; i < n; ++i) { - x[i] /= n; - y[i] /= n; - } - } - return 0; /* finished successfully */ -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/frontend/fft.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/frontend/fft.h deleted file mode 100644 index 6b92e406c44b4768eaee6e734f55bb39cd9af28b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/frontend/fft.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2016 Network -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#ifndef FRONTEND_FFT_H_ -#define FRONTEND_FFT_H_ - -#ifndef M_PI -#define M_PI 3.1415926535897932384626433832795 -#endif -#ifndef M_2PI -#define M_2PI 6.283185307179586476925286766559005 -#endif - -namespace wenet { - -// Fast Fourier Transform - -void make_sintbl(int n, float* sintbl); - -void make_bitrev(int n, int* bitrev); - -int fft(const int* bitrev, const float* sintbl, float* x, float* y, int n); - -} // namespace wenet - -#endif // FRONTEND_FFT_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/frontend/wav.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/frontend/wav.h deleted file mode 100644 index 688a049a940ebbdc83f24e59134fff22b7b09bfd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/frontend/wav.h +++ /dev/null @@ -1,241 +0,0 @@ -// Copyright (c) 2016 Personal (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#ifndef FRONTEND_WAV_H_ -#define FRONTEND_WAV_H_ - -#include -#include -#include -#include -#include - -#include - -#include "utils/log.h" - -namespace wenet { - -struct WavHeader { - char riff[4] = {'R', 'I', 'F', 'F'}; - unsigned int size = 0; - char wav[4] = {'W', 'A', 'V', 'E'}; - char fmt[4] = {'f', 'm', 't', ' '}; - unsigned int fmt_size = 16; - uint16_t format = 1; - uint16_t channels = 0; - unsigned int sample_rate = 0; - unsigned int bytes_per_second = 0; - uint16_t block_size = 0; - uint16_t bit = 0; - char data[4] = {'d', 'a', 't', 'a'}; - unsigned int data_size = 0; - - WavHeader() {} - - WavHeader(int num_samples, int num_channel, int sample_rate, - int bits_per_sample) { - data_size = num_samples * num_channel * (bits_per_sample / 8); - size = sizeof(WavHeader) - 8 + data_size; - channels = num_channel; - this->sample_rate = sample_rate; - bytes_per_second = sample_rate * num_channel * (bits_per_sample / 8); - block_size = num_channel * (bits_per_sample / 8); - bit = bits_per_sample; - } -}; - -class WavReader { - public: - WavReader() : data_(nullptr) {} - explicit WavReader(const std::string& filename) { Open(filename); } - - bool Open(const std::string& filename) { - FILE* fp = fopen(filename.c_str(), "rb"); - if (NULL == fp) { - LOG(WARNING) << "Error in read " << filename; - return false; - } - - WavHeader header; - fread(&header, 1, sizeof(header), fp); - if (header.fmt_size < 16) { - fprintf(stderr, - "WaveData: expect PCM format data " - "to have fmt chunk of at least size 16.\n"); - return false; - } else if (header.fmt_size > 16) { - int offset = 44 - 8 + header.fmt_size - 16; - fseek(fp, offset, SEEK_SET); - fread(header.data, 8, sizeof(char), fp); - } - // check "RIFF" "WAVE" "fmt " "data" - - // Skip any sub-chunks between "fmt" and "data". Usually there will - // be a single "fact" sub chunk, but on Windows there can also be a - // "list" sub chunk. - while (0 != strncmp(header.data, "data", 4)) { - // We will just ignore the data in these chunks. 
- fseek(fp, header.data_size, SEEK_CUR); - // read next sub chunk - fread(header.data, 8, sizeof(char), fp); - } - - num_channel_ = header.channels; - sample_rate_ = header.sample_rate; - bits_per_sample_ = header.bit; - int num_data = header.data_size / (bits_per_sample_ / 8); - data_ = new float[num_data]; - num_samples_ = num_data / num_channel_; - - for (int i = 0; i < num_data; ++i) { - switch (bits_per_sample_) { - case 8: { - char sample; - fread(&sample, 1, sizeof(char), fp); - data_[i] = static_cast(sample); - break; - } - case 16: { - int16_t sample; - fread(&sample, 1, sizeof(int16_t), fp); - data_[i] = static_cast(sample); - break; - } - case 32: { - int sample; - fread(&sample, 1, sizeof(int), fp); - data_[i] = static_cast(sample); - break; - } - default: - fprintf(stderr, "unsupported quantization bits"); - exit(1); - } - } - fclose(fp); - return true; - } - - int num_channel() const { return num_channel_; } - int sample_rate() const { return sample_rate_; } - int bits_per_sample() const { return bits_per_sample_; } - int num_samples() const { return num_samples_; } - - ~WavReader() { - delete[] data_; - } - - const float* data() const { return data_; } - - private: - int num_channel_; - int sample_rate_; - int bits_per_sample_; - int num_samples_; // sample points per channel - float* data_; -}; - -class WavWriter { - public: - WavWriter(const float* data, int num_samples, int num_channel, - int sample_rate, int bits_per_sample) - : data_(data), - num_samples_(num_samples), - num_channel_(num_channel), - sample_rate_(sample_rate), - bits_per_sample_(bits_per_sample) {} - - void Write(const std::string& filename) { - FILE* fp = fopen(filename.c_str(), "wb"); - WavHeader header(num_samples_, num_channel_, sample_rate_, - bits_per_sample_); - fwrite(&header, 1, sizeof(header), fp); - - for (int i = 0; i < num_samples_; ++i) { - for (int j = 0; j < num_channel_; ++j) { - switch (bits_per_sample_) { - case 8: { - char sample = static_cast(data_[i * num_channel_ + j]); - fwrite(&sample, 1, sizeof(sample), fp); - break; - } - case 16: { - int16_t sample = static_cast(data_[i * num_channel_ + j]); - fwrite(&sample, 1, sizeof(sample), fp); - break; - } - case 32: { - int sample = static_cast(data_[i * num_channel_ + j]); - fwrite(&sample, 1, sizeof(sample), fp); - break; - } - } - } - } - fclose(fp); - } - - private: - const float* data_; - int num_samples_; // total float points in data_ - int num_channel_; - int sample_rate_; - int bits_per_sample_; -}; - -class StreamWavWriter { - public: - StreamWavWriter(int num_channel, int sample_rate, int bits_per_sample) - : num_channel_(num_channel), - sample_rate_(sample_rate), - bits_per_sample_(bits_per_sample), - total_num_samples_(0) {} - - StreamWavWriter(const std::string& filename, int num_channel, - int sample_rate, int bits_per_sample) - : StreamWavWriter(num_channel, sample_rate, bits_per_sample) { - Open(filename); - } - - void Open(const std::string& filename) { - fp_ = fopen(filename.c_str(), "wb"); - fseek(fp_, sizeof(WavHeader), SEEK_SET); - } - - void Write(const int16_t* sample_data, size_t num_samples) { - fwrite(sample_data, sizeof(int16_t), num_samples, fp_); - total_num_samples_ += num_samples; - } - - void Close() { - WavHeader header(total_num_samples_, num_channel_, sample_rate_, - bits_per_sample_); - fseek(fp_, 0L, SEEK_SET); - fwrite(&header, 1, sizeof(header), fp_); - fclose(fp_); - } - - private: - FILE* fp_; - int num_channel_; - int sample_rate_; - int bits_per_sample_; - size_t total_num_samples_; -}; - -} 
// namespace wenet - -#endif // FRONTEND_WAV_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/CMakeLists.txt deleted file mode 100644 index b072309e44b90dcee44ea31e9bcbc1741e73f151..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/CMakeLists.txt +++ /dev/null @@ -1,54 +0,0 @@ -cmake_minimum_required(VERSION 3.10 FATAL_ERROR) - -project(kaldi) - -# include_directories() is called in the root CMakeLists.txt - -add_library(kaldi-util - base/kaldi-error.cc - base/kaldi-math.cc - util/kaldi-io.cc - util/parse-options.cc - util/simple-io-funcs.cc - util/text-utils.cc -) -target_link_libraries(kaldi-util PUBLIC utils) - -add_library(kaldi-decoder - lat/determinize-lattice-pruned.cc - lat/lattice-functions.cc - decoder/lattice-faster-decoder.cc - decoder/lattice-faster-online-decoder.cc -) -target_link_libraries(kaldi-decoder PUBLIC kaldi-util) - -if(GRAPH_TOOLS) - # Arpa binary - add_executable(arpa2fst - lm/arpa-file-parser.cc - lm/arpa-lm-compiler.cc - lmbin/arpa2fst.cc - ) - target_link_libraries(arpa2fst PUBLIC kaldi-util) - - # FST tools binary - set(FST_BINS - fstaddselfloops - fstdeterminizestar - fstisstochastic - fstminimizeencoded - fsttablecompose - ) - - if(NOT MSVC) - # dl is for dynamic linking, otherwise there is a linking error on linux - link_libraries(dl) - endif() - foreach(name IN LISTS FST_BINS) - add_executable(${name} - fstbin/${name}.cc - fstext/kaldi-fst-io.cc - ) - target_link_libraries(${name} PUBLIC kaldi-util) - endforeach() -endif() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/README.md b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/README.md deleted file mode 100644 index 4eb9c9173b747686f00b658afc5e1e0dfdc17e68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/README.md +++ /dev/null @@ -1,21 +0,0 @@ -We use Kaldi decoder to implement TLG based language model integration, -so we copied related files to this directory. -The main changes are: - -1. To minimize the change, we use the same directories tree as Kaldi. - -2. We replace Kaldi log system with glog in the following way. - -``` c++ -#define KALDI_WARN \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_WARNING).stream() -#define KALDI_ERR \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_ERROR).stream() -#define KALDI_INFO \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_INFO).stream() -#define KALDI_VLOG(v) VLOG(v) - -#define KALDI_ASSERT(condition) CHECK(condition) -``` - -3. We lint all the files to satisfy the lint in WeNet. 
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/io-funcs-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/io-funcs-inl.h deleted file mode 100644 index 9397400833676b323492321183c989cec2f41c3f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/io-funcs-inl.h +++ /dev/null @@ -1,329 +0,0 @@ -// base/io-funcs-inl.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University; -// Jan Silovsky; Yanmin Qian; -// Johns Hopkins University (Author: Daniel Povey) -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_IO_FUNCS_INL_H_ -#define KALDI_BASE_IO_FUNCS_INL_H_ 1 - -// Do not include this file directly. It is included by base/io-funcs.h - -#include -#include -#include - -namespace kaldi { - -// Template that covers integers. -template -void WriteBasicType(std::ostream &os, bool binary, T t) { - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - char len_c = (std::numeric_limits::is_signed ? 1 : -1) * - static_cast(sizeof(t)); - os.put(len_c); - os.write(reinterpret_cast(&t), sizeof(t)); - } else { - if (sizeof(t) == 1) - os << static_cast(t) << " "; - else - os << t << " "; - } - if (os.fail()) { - KALDI_ERR << "Write failure in WriteBasicType."; - } -} - -// Template that covers integers. -template -inline void ReadBasicType(std::istream &is, bool binary, T *t) { - KALDI_PARANOID_ASSERT(t != NULL); - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - int len_c_in = is.get(); - if (len_c_in == -1) - KALDI_ERR << "ReadBasicType: encountered end of stream."; - char len_c = static_cast(len_c_in), - len_c_expected = (std::numeric_limits::is_signed ? 1 : -1) * - static_cast(sizeof(*t)); - if (len_c != len_c_expected) { - KALDI_ERR << "ReadBasicType: did not get expected integer type, " - << static_cast(len_c) << " vs. " - << static_cast(len_c_expected) - << ". You can change this code to successfully" - << " read it later, if needed."; - // insert code here to read "wrong" type. Might have a switch statement. - } - is.read(reinterpret_cast(t), sizeof(*t)); - } else { - if (sizeof(*t) == 1) { - int16 i; - is >> i; - *t = i; - } else { - is >> *t; - } - } - if (is.fail()) { - KALDI_ERR << "Read failure in ReadBasicType, file position is " - << is.tellg() << ", next char is " << is.peek(); - } -} - -// Template that covers integers. -template -inline void WriteIntegerPairVector(std::ostream &os, bool binary, - const std::vector > &v) { - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - char sz = sizeof(T); // this is currently just a check. 
- os.write(&sz, 1); - int32 vecsz = static_cast(v.size()); - KALDI_ASSERT((size_t)vecsz == v.size()); - os.write(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (vecsz != 0) { - os.write(reinterpret_cast(&(v[0])), sizeof(T) * vecsz * 2); - } - } else { - // focus here is on prettiness of text form rather than - // efficiency of reading-in. - // reading-in is dominated by low-level operations anyway: - // for efficiency use binary. - os << "[ "; - typename std::vector >::const_iterator iter = v.begin(), - end = v.end(); - for (; iter != end; ++iter) { - if (sizeof(T) == 1) - os << static_cast(iter->first) << ',' - << static_cast(iter->second) << ' '; - else - os << iter->first << ',' << iter->second << ' '; - } - os << "]\n"; - } - if (os.fail()) { - KALDI_ERR << "Write failure in WriteIntegerPairVector."; - } -} - -// Template that covers integers. -template -inline void ReadIntegerPairVector(std::istream &is, bool binary, - std::vector > *v) { - KALDI_ASSERT_IS_INTEGER_TYPE(T); - KALDI_ASSERT(v != NULL); - if (binary) { - int sz = is.peek(); - if (sz == sizeof(T)) { - is.get(); - } else { // this is currently just a check. - KALDI_ERR << "ReadIntegerPairVector: expected to see type of size " - << sizeof(T) << ", saw instead " << sz << ", at file position " - << is.tellg(); - } - int32 vecsz; - is.read(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (is.fail() || vecsz < 0) goto bad; - v->resize(vecsz); - if (vecsz > 0) { - is.read(reinterpret_cast(&((*v)[0])), sizeof(T) * vecsz * 2); - } - } else { - std::vector > tmp_v; // use temporary so v doesn't use - // extra memory due to resizing. - is >> std::ws; - if (is.peek() != static_cast('[')) { - KALDI_ERR << "ReadIntegerPairVector: expected to see [, saw " << is.peek() - << ", at file position " << is.tellg(); - } - is.get(); // consume the '['. - is >> std::ws; // consume whitespace. - while (is.peek() != static_cast(']')) { - if (sizeof(T) == 1) { // read/write chars as numbers. - int16 next_t1, next_t2; - is >> next_t1; - if (is.fail()) goto bad; - if (is.peek() != static_cast(',')) - KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " - << is.peek() << ", at file position " << is.tellg(); - is.get(); // consume the ','. - is >> next_t2 >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back(std::make_pair((T)next_t1, (T)next_t2)); - } else { - T next_t1, next_t2; - is >> next_t1; - if (is.fail()) goto bad; - if (is.peek() != static_cast(',')) - KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " - << is.peek() << ", at file position " << is.tellg(); - is.get(); // consume the ','. - is >> next_t2 >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back(std::pair(next_t1, next_t2)); - } - } - is.get(); // get the final ']'. - *v = tmp_v; // could use std::swap to use less temporary memory, but this - // uses less permanent memory. - } - if (!is.fail()) return; -bad: - KALDI_ERR << "ReadIntegerPairVector: read failure at file position " - << is.tellg(); -} - -template -inline void WriteIntegerVector(std::ostream &os, bool binary, - const std::vector &v) { - // Compile time assertion that this is not called with a wrong type. - KALDI_ASSERT_IS_INTEGER_TYPE(T); - if (binary) { - char sz = sizeof(T); // this is currently just a check. 
- os.write(&sz, 1); - int32 vecsz = static_cast(v.size()); - KALDI_ASSERT((size_t)vecsz == v.size()); - os.write(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (vecsz != 0) { - os.write(reinterpret_cast(&(v[0])), sizeof(T) * vecsz); - } - } else { - // focus here is on prettiness of text form rather than - // efficiency of reading-in. - // reading-in is dominated by low-level operations anyway: - // for efficiency use binary. - os << "[ "; - typename std::vector::const_iterator iter = v.begin(), end = v.end(); - for (; iter != end; ++iter) { - if (sizeof(T) == 1) - os << static_cast(*iter) << " "; - else - os << *iter << " "; - } - os << "]\n"; - } - if (os.fail()) { - KALDI_ERR << "Write failure in WriteIntegerVector."; - } -} - -template -inline void ReadIntegerVector(std::istream &is, bool binary, - std::vector *v) { - KALDI_ASSERT_IS_INTEGER_TYPE(T); - KALDI_ASSERT(v != NULL); - if (binary) { - int sz = is.peek(); - if (sz == sizeof(T)) { - is.get(); - } else { // this is currently just a check. - KALDI_ERR << "ReadIntegerVector: expected to see type of size " - << sizeof(T) << ", saw instead " << sz << ", at file position " - << is.tellg(); - } - int32 vecsz; - is.read(reinterpret_cast(&vecsz), sizeof(vecsz)); - if (is.fail() || vecsz < 0) goto bad; - v->resize(vecsz); - if (vecsz > 0) { - is.read(reinterpret_cast(&((*v)[0])), sizeof(T) * vecsz); - } - } else { - std::vector tmp_v; // use temporary so v doesn't use extra memory - // due to resizing. - is >> std::ws; - if (is.peek() != static_cast('[')) { - KALDI_ERR << "ReadIntegerVector: expected to see [, saw " << is.peek() - << ", at file position " << is.tellg(); - } - is.get(); // consume the '['. - is >> std::ws; // consume whitespace. - while (is.peek() != static_cast(']')) { - if (sizeof(T) == 1) { // read/write chars as numbers. - int16 next_t; - is >> next_t >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back((T)next_t); - } else { - T next_t; - is >> next_t >> std::ws; - if (is.fail()) - goto bad; - else - tmp_v.push_back(next_t); - } - } - is.get(); // get the final ']'. - *v = tmp_v; // could use std::swap to use less temporary memory, but this - // uses less permanent memory. - } - if (!is.fail()) return; -bad: - KALDI_ERR << "ReadIntegerVector: read failure at file position " - << is.tellg(); -} - -// Initialize an opened stream for writing by writing an optional binary -// header and modifying the floating-point precision. -inline void InitKaldiOutputStream(std::ostream &os, bool binary) { - // This does not throw exceptions (does not check for errors). - if (binary) { - os.put('\0'); - os.put('B'); - } - // Note, in non-binary mode we may at some point want to mess with - // the precision a bit. - // 7 is a bit more than the precision of float.. - if (os.precision() < 7) os.precision(7); -} - -/// Initialize an opened stream for reading by detecting the binary header and -// setting the "binary" value appropriately. -inline bool InitKaldiInputStream(std::istream &is, bool *binary) { - // Sets the 'binary' variable. - // Throws exception in the very unusual situation that stream - // starts with '\0' but not then 'B'. - - if (is.peek() == '\0') { // seems to be binary - is.get(); - if (is.peek() != 'B') { - return false; - } - is.get(); - *binary = true; - return true; - } else { - *binary = false; - return true; - } -} - -} // end namespace kaldi. 
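> Editor's aside, not part of this patch: the templated helpers removed above (`WriteBasicType`, `ReadBasicType`, `WriteIntegerVector`, `ReadIntegerVector`, `InitKaldiOutputStream`, `InitKaldiInputStream`) form a small round-trip I/O convention. The sketch below shows that round trip through a string stream; it assumes the usual Kaldi headers are on the include path, and the `main` wrapper is purely illustrative.

```cpp
// Minimal sketch, assuming base/io-funcs.h and the kaldi::int32 typedef are available.
#include <sstream>
#include <vector>
#include "base/io-funcs.h"

int main() {
  std::ostringstream os;
  const bool binary = true;
  kaldi::InitKaldiOutputStream(os, binary);          // emits the "\0B" binary header
  kaldi::WriteBasicType(os, binary, kaldi::int32(42));
  std::vector<kaldi::int32> v = {1, 2, 3};
  kaldi::WriteIntegerVector(os, binary, v);

  std::istringstream is(os.str());
  bool is_binary = false;
  kaldi::InitKaldiInputStream(is, &is_binary);       // detects the header, sets is_binary = true
  kaldi::int32 x;
  kaldi::ReadBasicType(is, is_binary, &x);           // x == 42
  std::vector<kaldi::int32> v_in;
  kaldi::ReadIntegerVector(is, is_binary, &v_in);    // v_in == {1, 2, 3}
  return 0;
}
```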
- -#endif // KALDI_BASE_IO_FUNCS_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/io-funcs.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/io-funcs.cc deleted file mode 100644 index bd6c350780d1096ff8c452fd00864aa07a30ac65..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/io-funcs.cc +++ /dev/null @@ -1,215 +0,0 @@ -// base/io-funcs.cc - -// Copyright 2009-2011 Microsoft Corporation; Saarland University - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/io-funcs.h" -#include "base/kaldi-math.h" - -namespace kaldi { - -template <> -void WriteBasicType(std::ostream &os, bool binary, bool b) { - os << (b ? "T" : "F"); - if (!binary) os << " "; - if (os.fail()) KALDI_ERR << "Write failure in WriteBasicType"; -} - -template <> -void ReadBasicType(std::istream &is, bool binary, bool *b) { - KALDI_PARANOID_ASSERT(b != NULL); - if (!binary) is >> std::ws; // eat up whitespace. - char c = is.peek(); - if (c == 'T') { - *b = true; - is.get(); - } else if (c == 'F') { - *b = false; - is.get(); - } else { - KALDI_ERR << "Read failure in ReadBasicType, file position is " - << is.tellg() << ", next char is " << CharToString(c); - } -} - -template <> -void WriteBasicType(std::ostream &os, bool binary, float f) { - if (binary) { - char c = sizeof(f); - os.put(c); - os.write(reinterpret_cast(&f), sizeof(f)); - } else { - os << f << " "; - } -} - -template <> -void WriteBasicType(std::ostream &os, bool binary, double f) { - if (binary) { - char c = sizeof(f); - os.put(c); - os.write(reinterpret_cast(&f), sizeof(f)); - } else { - os << f << " "; - } -} - -template <> -void ReadBasicType(std::istream &is, bool binary, float *f) { - KALDI_PARANOID_ASSERT(f != NULL); - if (binary) { - double d; - int c = is.peek(); - if (c == sizeof(*f)) { - is.get(); - is.read(reinterpret_cast(f), sizeof(*f)); - } else if (c == sizeof(d)) { - ReadBasicType(is, binary, &d); - *f = d; - } else { - KALDI_ERR << "ReadBasicType: expected float, saw " << is.peek() - << ", at file position " << is.tellg(); - } - } else { - is >> *f; - } - if (is.fail()) { - KALDI_ERR << "ReadBasicType: failed to read, at file position " - << is.tellg(); - } -} - -template <> -void ReadBasicType(std::istream &is, bool binary, double *d) { - KALDI_PARANOID_ASSERT(d != NULL); - if (binary) { - float f; - int c = is.peek(); - if (c == sizeof(*d)) { - is.get(); - is.read(reinterpret_cast(d), sizeof(*d)); - } else if (c == sizeof(f)) { - ReadBasicType(is, binary, &f); - *d = f; - } else { - KALDI_ERR << "ReadBasicType: expected float, saw " << is.peek() - << ", at file position " << is.tellg(); - } - } else { - is >> *d; - } - if (is.fail()) { - KALDI_ERR << "ReadBasicType: failed to read, at file position " - << is.tellg(); - } -} - 
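> Editor's aside, not part of this patch: the float/double specializations above key the on-disk width to a leading size byte, so a value written in one precision can be read back in the other. A minimal sketch under the same header assumptions as before; the function name is illustrative.

```cpp
// Minimal sketch: cross-precision read, assuming base/io-funcs.h.
#include <sstream>
#include "base/io-funcs.h"

void CrossPrecisionRead() {
  std::ostringstream os;
  kaldi::WriteBasicType(os, /*binary=*/true, 3.14);  // 3.14 is a double: size byte 8, then 8 raw bytes
  std::istringstream is(os.str());
  float f = 0.0f;
  kaldi::ReadBasicType(is, /*binary=*/true, &f);     // sees size 8, reads a double, narrows to float
}
```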
-void CheckToken(const char *token) { - if (*token == '\0') KALDI_ERR << "Token is empty (not a valid token)"; - const char *orig_token = token; - while (*token != '\0') { - if (::isspace(*token)) - KALDI_ERR << "Token is not a valid token (contains space): '" - << orig_token << "'"; - token++; - } -} - -void WriteToken(std::ostream &os, bool binary, const char *token) { - // binary mode is ignored; - // we use space as termination character in either case. - KALDI_ASSERT(token != NULL); - CheckToken(token); // make sure it's valid (can be read back) - os << token << " "; - if (os.fail()) { - KALDI_ERR << "Write failure in WriteToken."; - } -} - -int Peek(std::istream &is, bool binary) { - if (!binary) is >> std::ws; // eat up whitespace. - return is.peek(); -} - -void WriteToken(std::ostream &os, bool binary, const std::string &token) { - WriteToken(os, binary, token.c_str()); -} - -void ReadToken(std::istream &is, bool binary, std::string *str) { - KALDI_ASSERT(str != NULL); - if (!binary) is >> std::ws; // consume whitespace. - is >> *str; - if (is.fail()) { - KALDI_ERR << "ReadToken, failed to read token at file position " - << is.tellg(); - } - if (!isspace(is.peek())) { - KALDI_ERR << "ReadToken, expected space after token, saw instead " - << CharToString(static_cast(is.peek())) - << ", at file position " << is.tellg(); - } - is.get(); // consume the space. -} - -int PeekToken(std::istream &is, bool binary) { - if (!binary) is >> std::ws; // consume whitespace. - bool read_bracket; - if (static_cast(is.peek()) == '<') { - read_bracket = true; - is.get(); - } else { - read_bracket = false; - } - int ans = is.peek(); - if (read_bracket) { - if (!is.unget()) { - // Clear the bad bit. This code can be (and is in fact) reached, since the - // C++ standard does not guarantee that a call to unget() must succeed. - is.clear(); - } - } - return ans; -} - -void ExpectToken(std::istream &is, bool binary, const char *token) { - int pos_at_start = is.tellg(); - KALDI_ASSERT(token != NULL); - CheckToken(token); // make sure it's valid (can be read back) - if (!binary) is >> std::ws; // consume whitespace. - std::string str; - is >> str; - is.get(); // consume the space. - if (is.fail()) { - KALDI_ERR << "Failed to read token [started at file position " - << pos_at_start << "], expected " << token; - } - // The second half of the '&&' expression below is so that if we're expecting - // "", we will accept "Foo>" instead. This is so that the model-reading - // code will tolerate errors in PeekToken where is.unget() failed; search for - // is.clear() in PeekToken() for an explanation. 
- if (strcmp(str.c_str(), token) != 0 && - !(token[0] == '<' && strcmp(str.c_str(), token + 1) == 0)) { - KALDI_ERR << "Expected token \"" << token << "\", got instead \"" << str - << "\"."; - } -} - -void ExpectToken(std::istream &is, bool binary, const std::string &token) { - ExpectToken(is, binary, token.c_str()); -} - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/io-funcs.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/io-funcs.h deleted file mode 100644 index 06ad1e3d2d8dc8385886a7c6653f620642c7c05a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/io-funcs.h +++ /dev/null @@ -1,246 +0,0 @@ -// base/io-funcs.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University; -// Jan Silovsky; Yanmin Qian -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_IO_FUNCS_H_ -#define KALDI_BASE_IO_FUNCS_H_ - -// This header only contains some relatively low-level I/O functions. -// The full Kaldi I/O declarations are in ../util/kaldi-io.h -// and ../util/kaldi-table.h -// They were put in util/ in order to avoid making the Matrix library -// dependent on them. - -#include -#include -#include -#include - -#include "base/io-funcs-inl.h" -#include "base/kaldi-common.h" - -namespace kaldi { - -/* - This comment describes the Kaldi approach to I/O. All objects can be written - and read in two modes: binary and text. In addition we want to make the I/O - work if we redefine the typedef "BaseFloat" between floats and doubles. - We also want to have control over whitespace in text mode without affecting - the meaning of the file, for pretty-printing purposes. - - Errors are handled by throwing a KaldiFatalError exception. - - For integer and floating-point types (and boolean values): - - WriteBasicType(std::ostream &, bool binary, const T&); - ReadBasicType(std::istream &, bool binary, T*); - - and we expect these functions to be defined in such a way that they work when - the type T changes between float and double, so you can read float into double - and vice versa]. Note that for efficiency and space-saving reasons, the - Vector and Matrix classes do not use these functions [but they preserve the - type interchangeability in their own way] - - For a class (or struct) C: - class C { - .. - Write(std::ostream &, bool binary, [possibly extra optional args for - specific classes]) const; Read(std::istream &, bool binary, [possibly extra - optional args for specific classes]); - .. - } - NOTE: The only actual optional args we used are the "add" arguments in - Vector/Matrix classes, which specify whether we should sum the data already - in the class with the data being read. 
- - For types which are typedef's involving stl classes, I/O is as follows: - typedef std::vector > MyTypedefName; - - The user should define something like: - - WriteMyTypedefName(std::ostream &, bool binary, const MyTypedefName &t); - ReadMyTypedefName(std::ostream &, bool binary, MyTypedefName *t); - - The user would have to write these functions. - - For a type std::vector: - - void WriteIntegerVector(std::ostream &os, bool binary, const std::vector - &v); void ReadIntegerVector(std::istream &is, bool binary, std::vector *v); - - For other types, e.g. vectors of pairs, the user should create a routine of - the type WriteMyTypedefName. This is to avoid introducing confusing templated - functions; we could easily create templated functions to handle most of these - cases but they would have to share the same name. - - It also often happens that the user needs to write/read special tokens as part - of a file. These might be class headers, or separators/identifiers in the - class. We provide special functions for manipulating these. These special - tokens must be nonempty and must not contain any whitespace. - - void WriteToken(std::ostream &os, bool binary, const char*); - void WriteToken(std::ostream &os, bool binary, const std::string & token); - int Peek(std::istream &is, bool binary); - void ReadToken(std::istream &is, bool binary, std::string *str); - void PeekToken(std::istream &is, bool binary, std::string *str); - - WriteToken writes the token and one space (whether in binary or text mode). - - Peek returns the first character of the next token, by consuming whitespace - (in text mode) and then returning the peek() character. It returns -1 at EOF; - it doesn't throw. It's useful if a class can have various forms based on - typedefs and virtual classes, and wants to know which version to read. - - ReadToken allows the caller to obtain the next token. PeekToken works just - like ReadToken, but seeks back to the beginning of the token. A subsequent - call to ReadToken will read the same token again. This is useful when - different object types are written to the same file; using PeekToken one can - decide which of the objects to read. - - There is currently no special functionality for writing/reading strings (where - the strings contain data rather than "special tokens" that are whitespace-free - and nonempty). This is because Kaldi is structured in such a way that strings - don't appear, except as OpenFst symbol table entries (and these have their own - format). - - - NOTE: you should not call ReadIntegerType and WriteIntegerType with types, - such as int and size_t, that are machine-independent -- at least not - if you want your file formats to port between machines. Use int32 and - int64 where necessary. There is no way to detect this using compile-time - assertions because C++ only keeps track of the internal representation of - the type. -*/ - -/// \addtogroup io_funcs_basic -/// @{ - -/// WriteBasicType is the name of the write function for bool, integer types, -/// and floating-point types. They all throw on error. -template -void WriteBasicType(std::ostream &os, bool binary, T t); - -/// ReadBasicType is the name of the read function for bool, integer types, -/// and floating-point types. They all throw on error. -template -void ReadBasicType(std::istream &is, bool binary, T *t); - -// Declare specialization for bool. 
-template <> -void WriteBasicType(std::ostream &os, bool binary, bool b); - -template <> -void ReadBasicType(std::istream &is, bool binary, bool *b); - -// Declare specializations for float and double. -template <> -void WriteBasicType(std::ostream &os, bool binary, float f); - -template <> -void WriteBasicType(std::ostream &os, bool binary, double f); - -template <> -void ReadBasicType(std::istream &is, bool binary, float *f); - -template <> -void ReadBasicType(std::istream &is, bool binary, double *f); - -// Define ReadBasicType that accepts an "add" parameter to add to -// the destination. Caution: if used in Read functions, be careful -// to initialize the parameters concerned to zero in the default -// constructor. -template -inline void ReadBasicType(std::istream &is, bool binary, T *t, bool add) { - if (!add) { - ReadBasicType(is, binary, t); - } else { - T tmp = T(0); - ReadBasicType(is, binary, &tmp); - *t += tmp; - } -} - -/// Function for writing STL vectors of integer types. -template -inline void WriteIntegerVector(std::ostream &os, bool binary, - const std::vector &v); - -/// Function for reading STL vector of integer types. -template -inline void ReadIntegerVector(std::istream &is, bool binary, std::vector *v); - -/// Function for writing STL vectors of pairs of integer types. -template -inline void WriteIntegerPairVector(std::ostream &os, bool binary, - const std::vector > &v); - -/// Function for reading STL vector of pairs of integer types. -template -inline void ReadIntegerPairVector(std::istream &is, bool binary, - std::vector > *v); - -/// The WriteToken functions are for writing nonempty sequences of non-space -/// characters. They are not for general strings. -void WriteToken(std::ostream &os, bool binary, const char *token); -void WriteToken(std::ostream &os, bool binary, const std::string &token); - -/// Peek consumes whitespace (if binary == false) and then returns the peek() -/// value of the stream. -int Peek(std::istream &is, bool binary); - -/// ReadToken gets the next token and puts it in str (exception on failure). If -/// PeekToken() had been previously called, it is possible that the stream had -/// failed to unget the starting '<' character. In this case ReadToken() returns -/// the token string without the leading '<'. You must be prepared to handle -/// this case. ExpectToken() handles this internally, and is not affected. -void ReadToken(std::istream &is, bool binary, std::string *token); - -/// PeekToken will return the first character of the next token, or -1 if end of -/// file. It's the same as Peek(), except if the first character is '<' it will -/// skip over it and will return the next character. It will attempt to unget -/// the '<' so the stream is where it was before you did PeekToken(), however, -/// this is not guaranteed (see ReadToken()). -int PeekToken(std::istream &is, bool binary); - -/// ExpectToken tries to read in the given token, and throws an exception -/// on failure. -void ExpectToken(std::istream &is, bool binary, const char *token); -void ExpectToken(std::istream &is, bool binary, const std::string &token); - -/// ExpectPretty attempts to read the text in "token", but only in non-binary -/// mode. Throws exception on failure. It expects an exact match except that -/// arbitrary whitespace matches arbitrary whitespace. 
-void ExpectPretty(std::istream &is, bool binary, const char *token); -void ExpectPretty(std::istream &is, bool binary, const std::string &token); - -/// @} end "addtogroup io_funcs_basic" - -/// InitKaldiOutputStream initializes an opened stream for writing by writing an -/// optional binary header and modifying the floating-point precision; it will -/// typically not be called by users directly. -inline void InitKaldiOutputStream(std::ostream &os, bool binary); - -/// InitKaldiInputStream initializes an opened stream for reading by detecting -/// the binary header and setting the "binary" value appropriately; -/// It will typically not be called by users directly. -inline bool InitKaldiInputStream(std::istream &is, bool *binary); - -} // end namespace kaldi. -#endif // KALDI_BASE_IO_FUNCS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/kaldi-common.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/kaldi-common.h deleted file mode 100644 index eee5f34d7234e7c029e6bb59584d3ee65ff5a875..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/kaldi-common.h +++ /dev/null @@ -1,41 +0,0 @@ -// base/kaldi-common.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_KALDI_COMMON_H_ -#define KALDI_BASE_KALDI_COMMON_H_ 1 - -#include -#include -#include // C string stuff like strcpy -#include -#include -#include -#include -#include -#include -#include - -#include "base/kaldi-utils.h" -#include "base/kaldi-error.h" -#include "base/kaldi-types.h" -// #include "base/io-funcs.h" -#include "base/kaldi-math.h" -// #include "base/timer.h" - -#endif // KALDI_BASE_KALDI_COMMON_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/kaldi-error.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/kaldi-error.cc deleted file mode 100644 index 77edc6af6e56bb8fa3431d519e58fda9ee0bac6a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/kaldi-error.cc +++ /dev/null @@ -1,42 +0,0 @@ -// base/kaldi-error.cc - -// Copyright 2019 LAIX (Yi Sun) -// Copyright 2019 SmartAction LLC (kkm) -// Copyright 2016 Brno University of Technology (author: Karel Vesely) -// Copyright 2009-2011 Microsoft Corporation; Lukas Burget; Ondrej Glembek - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-error.h" - -#include - -namespace kaldi { - -/***** GLOBAL VARIABLES FOR LOGGING *****/ - -int32 g_kaldi_verbose_level = 0; -static std::string program_name; // NOLINT - -void SetProgramName(const char *basename) { - // Using the 'static std::string' for the program name is mostly harmless, - // because (a) Kaldi logging is undefined before main(), and (b) no stdc++ - // string implementation has been found in the wild that would not be just - // an empty string when zero-initialized but not yet constructed. - program_name = basename; -} - -} // namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/kaldi-error.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/kaldi-error.h deleted file mode 100644 index 0f65db372b5f05a8017433eed7c95badc819a0a6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/kaldi-error.h +++ /dev/null @@ -1,57 +0,0 @@ -// base/kaldi-error.h - -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_KALDI_ERROR_H_ -#define KALDI_BASE_KALDI_ERROR_H_ 1 - -#include "utils/log.h" - -namespace kaldi { - -#define KALDI_WARN \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_WARNING).stream() -#define KALDI_ERR \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_ERROR).stream() -#define KALDI_LOG \ - google::LogMessage(__FILE__, __LINE__, google::GLOG_INFO).stream() -#define KALDI_VLOG(v) VLOG(v) - -#define KALDI_ASSERT(condition) CHECK(condition) - - -/***** PROGRAM NAME AND VERBOSITY LEVEL *****/ - -/// Called by ParseOptions to set base name (no directory) of the executing -/// program. The name is printed in logging code along with every message, -/// because in our scripts, we often mix together the stderr of many programs. -/// This function is very thread-unsafe. -void SetProgramName(const char *basename); - -/// This is set by util/parse-options.{h,cc} if you set --verbose=? option. -/// Do not use directly, prefer {Get,Set}VerboseLevel(). -extern int32 g_kaldi_verbose_level; - -/// Get verbosity level, usually set via command line '--verbose=' switch. -inline int32 GetVerboseLevel() { return g_kaldi_verbose_level; } - -/// This should be rarely used, except by programs using Kaldi as library; -/// command-line programs set the verbose level automatically from ParseOptions. 
-inline void SetVerboseLevel(int32 i) { g_kaldi_verbose_level = i; } - -} // namespace kaldi - -#endif // KALDI_BASE_KALDI_ERROR_H_ - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/kaldi-math.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/kaldi-math.cc deleted file mode 100644 index 175d9f49b6c5216645e90e146f4e2eab5572c342..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/kaldi-math.cc +++ /dev/null @@ -1,164 +0,0 @@ -// base/kaldi-math.cc - -// Copyright 2009-2011 Microsoft Corporation; Yanmin Qian; -// Saarland University; Jan Silovsky - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-math.h" -#ifndef _MSC_VER -#include -#include -#endif -#include -#include - -namespace kaldi { -// These routines are tested in matrix/matrix-test.cc - -int32 RoundUpToNearestPowerOfTwo(int32 n) { - KALDI_ASSERT(n > 0); - n--; - n |= n >> 1; - n |= n >> 2; - n |= n >> 4; - n |= n >> 8; - n |= n >> 16; - return n+1; -} - -static std::mutex _RandMutex; - -int Rand(struct RandomState* state) { -#if !defined(_POSIX_THREAD_SAFE_FUNCTIONS) - // On Windows and Cygwin, just call Rand() - return rand(); -#else - if (state) { - return rand_r(&(state->seed)); - } else { - std::lock_guard lock(_RandMutex); - return rand(); - } -#endif -} - -RandomState::RandomState() { - // we initialize it as Rand() + 27437 instead of just Rand(), because on some - // systems, e.g. at the very least Mac OSX Yosemite and later, it seems to be - // the case that rand_r when initialized with rand() will give you the exact - // same sequence of numbers that rand() will give if you keep calling rand() - // after that initial call. This can cause problems with repeated sequences. - // For example if you initialize two RandomState structs one after the other - // without calling rand() in between, they would give you the same sequence - // offset by one (if we didn't have the "+ 27437" in the code). 27437 is just - // a randomly chosen prime number. - seed = unsigned(Rand()) + 27437; -} - -bool WithProb(BaseFloat prob, struct RandomState* state) { - KALDI_ASSERT(prob >= 0 && prob <= 1.1); // prob should be <= 1.0, - // but we allow slightly larger values that could arise from roundoff in - // previous calculations. - KALDI_COMPILE_TIME_ASSERT(RAND_MAX > 128 * 128); - if (prob == 0) { - return false; - } else if (prob == 1.0) { - return true; - } else if (prob * RAND_MAX < 128.0) { - // prob is very small but nonzero, and the "main algorithm" - // wouldn't work that well. So: with probability 1/128, we - // return WithProb (prob * 128), else return false. - if (Rand(state) < RAND_MAX / 128) { // with probability 128... - // Note: we know that prob * 128.0 < 1.0, because - // we asserted RAND_MAX > 128 * 128. 
- return WithProb(prob * 128.0); - } else { - return false; - } - } else { - return (Rand(state) < ((RAND_MAX + static_cast(1.0)) * prob)); - } -} - -int32 RandInt(int32 min_val, int32 max_val, struct RandomState* state) { - // This is not exact. - KALDI_ASSERT(max_val >= min_val); - if (max_val == min_val) return min_val; - -#ifdef _MSC_VER - // RAND_MAX is quite small on Windows -> may need to handle larger numbers. - if (RAND_MAX > (max_val-min_val)*8) { - // *8 to avoid large inaccuracies in probability, from the modulus... - return min_val + - ((unsigned int)Rand(state) % (unsigned int)(max_val+1-min_val)); - } else { - if ((unsigned int)(RAND_MAX*RAND_MAX) > - (unsigned int)((max_val+1-min_val)*8)) { - // *8 to avoid inaccuracies in probability, from the modulus... - return min_val + ( (unsigned int)( (Rand(state)+RAND_MAX*Rand(state))) - % (unsigned int)(max_val+1-min_val)); - } else { - KALDI_ERR << "rand_int failed because we do not support such large " - "random numbers. (Extend this function)."; - } - } -#else - return min_val + - (static_cast(Rand(state)) % static_cast(max_val+1-min_val)); -#endif -} - -// Returns poisson-distributed random number. -// Take care: this takes time proportional -// to lambda. Faster algorithms exist but are more complex. -int32 RandPoisson(float lambda, struct RandomState* state) { - // Knuth's algorithm. - KALDI_ASSERT(lambda >= 0); - float L = expf(-lambda), p = 1.0; - int32 k = 0; - do { - k++; - float u = RandUniform(state); - p *= u; - } while (p > L); - return k-1; -} - -void RandGauss2(float *a, float *b, RandomState *state) { - KALDI_ASSERT(a); - KALDI_ASSERT(b); - float u1 = RandUniform(state); - float u2 = RandUniform(state); - u1 = sqrtf(-2.0f * logf(u1)); - u2 = 2.0f * M_PI * u2; - *a = u1 * cosf(u2); - *b = u1 * sinf(u2); -} - -void RandGauss2(double *a, double *b, RandomState *state) { - KALDI_ASSERT(a); - KALDI_ASSERT(b); - float a_float, b_float; - // Just because we're using doubles doesn't mean we need super-high-quality - // random numbers, so we just use the floating-point version internally. - RandGauss2(&a_float, &b_float, state); - *a = a_float; - *b = b_float; -} - - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/kaldi-math.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/kaldi-math.h deleted file mode 100644 index 93c265ee96e704893da26b9083a44a9e60c6c192..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/kaldi-math.h +++ /dev/null @@ -1,363 +0,0 @@ -// base/kaldi-math.h - -// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; Yanmin Qian; -// Jan Silovsky; Saarland University -// -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
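> Editor's aside, not part of this patch: the random-number helpers deleted in kaldi-math.cc above (`RandomState`, `RandInt`, `WithProb`, `RandGauss2`) are typically used as in the sketch below; it assumes base/kaldi-math.h is available, and the function name is illustrative.

```cpp
// Minimal sketch of the RNG helpers defined in kaldi-math.cc.
#include "base/kaldi-math.h"

void RngSketch() {
  kaldi::RandomState state;                        // per-thread seed for rand_r()
  kaldi::int32 r = kaldi::RandInt(0, 9, &state);   // uniform integer in [0, 9]
  bool keep = kaldi::WithProb(0.5, &state);        // true with probability 0.5
  float a, b;
  kaldi::RandGauss2(&a, &b, &state);               // pair of N(0, 1) samples (Box-Muller)
  (void)r; (void)keep; (void)a; (void)b;
}
```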
- -#ifndef KALDI_BASE_KALDI_MATH_H_ -#define KALDI_BASE_KALDI_MATH_H_ 1 - -#ifdef _MSC_VER -#include -#endif - -#include -#include -#include - -#include "base/kaldi-types.h" -#include "base/kaldi-common.h" - - -#ifndef DBL_EPSILON -#define DBL_EPSILON 2.2204460492503131e-16 -#endif -#ifndef FLT_EPSILON -#define FLT_EPSILON 1.19209290e-7f -#endif - -#ifndef M_PI -#define M_PI 3.1415926535897932384626433832795 -#endif - -#ifndef M_SQRT2 -#define M_SQRT2 1.4142135623730950488016887 -#endif - -#ifndef M_2PI -#define M_2PI 6.283185307179586476925286766559005 -#endif - -#ifndef M_SQRT1_2 -#define M_SQRT1_2 0.7071067811865475244008443621048490 -#endif - -#ifndef M_LOG_2PI -#define M_LOG_2PI 1.8378770664093454835606594728112 -#endif - -#ifndef M_LN2 -#define M_LN2 0.693147180559945309417232121458 -#endif - -#ifndef M_LN10 -#define M_LN10 2.302585092994045684017991454684 -#endif - - -#define KALDI_ISNAN std::isnan -#define KALDI_ISINF std::isinf -#define KALDI_ISFINITE(x) std::isfinite(x) - -#if !defined(KALDI_SQR) -# define KALDI_SQR(x) ((x) * (x)) -#endif - -namespace kaldi { - -#if !defined(_MSC_VER) || (_MSC_VER >= 1900) -inline double Exp(double x) { return exp(x); } -#ifndef KALDI_NO_EXPF -inline float Exp(float x) { return expf(x); } -#else -inline float Exp(float x) { return exp(static_cast(x)); } -#endif // KALDI_NO_EXPF -#else -inline double Exp(double x) { return exp(x); } -#if !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64) -// Microsoft CL v18.0 buggy 64-bit implementation of -// expf() incorrectly returns -inf for exp(-inf). -inline float Exp(float x) { return exp(static_cast(x)); } -#else -inline float Exp(float x) { return expf(x); } -#endif // !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64) -#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900) - -inline double Log(double x) { return log(x); } -inline float Log(float x) { return logf(x); } - -#if !defined(_MSC_VER) || (_MSC_VER >= 1700) -inline double Log1p(double x) { return log1p(x); } -inline float Log1p(float x) { return log1pf(x); } -#else -inline double Log1p(double x) { - const double cutoff = 1.0e-08; - if (x < cutoff) - return x - 0.5 * x * x; - else - return Log(1.0 + x); -} - -inline float Log1p(float x) { - const float cutoff = 1.0e-07; - if (x < cutoff) - return x - 0.5 * x * x; - else - return Log(1.0 + x); -} -#endif - -static const double kMinLogDiffDouble = Log(DBL_EPSILON); // negative! -static const float kMinLogDiffFloat = Log(FLT_EPSILON); // negative! - -// -infinity -const float kLogZeroFloat = -std::numeric_limits::infinity(); -const double kLogZeroDouble = -std::numeric_limits::infinity(); -const BaseFloat kLogZeroBaseFloat = -std::numeric_limits::infinity(); - -// Returns a random integer between 0 and RAND_MAX, inclusive -int Rand(struct RandomState* state = NULL); - -// State for thread-safe random number generator -struct RandomState { - RandomState(); - unsigned seed; -}; - -// Returns a random integer between first and last inclusive. -int32 RandInt(int32 first, int32 last, struct RandomState* state = NULL); - -// Returns true with probability "prob", -bool WithProb(BaseFloat prob, struct RandomState* state = NULL); -// with 0 <= prob <= 1 [we check this]. -// Internally calls Rand(). This function is carefully implemented so -// that it should work even if prob is very small. - -/// Returns a random number strictly between 0 and 1. 
-inline float RandUniform(struct RandomState* state = NULL) { - return static_cast((Rand(state) + 1.0) / (RAND_MAX+2.0)); -} - -inline float RandGauss(struct RandomState* state = NULL) { - return static_cast(sqrtf (-2 * Log(RandUniform(state))) - * cosf(2*M_PI*RandUniform(state))); -} - -// Returns poisson-distributed random number. Uses Knuth's algorithm. -// Take care: this takes time proportional -// to lambda. Faster algorithms exist but are more complex. -int32 RandPoisson(float lambda, struct RandomState* state = NULL); - -// Returns a pair of gaussian random numbers. Uses Box-Muller transform -void RandGauss2(float *a, float *b, RandomState *state = NULL); -void RandGauss2(double *a, double *b, RandomState *state = NULL); - -// Also see Vector::RandCategorical(). - -// This is a randomized pruning mechanism that preserves expectations, -// that we typically use to prune posteriors. -template -inline Float RandPrune(Float post, BaseFloat prune_thresh, - struct RandomState* state = NULL) { - KALDI_ASSERT(prune_thresh >= 0.0); - if (post == 0.0 || std::abs(post) >= prune_thresh) - return post; - return (post >= 0 ? 1.0 : -1.0) * - (RandUniform(state) <= fabs(post)/prune_thresh ? prune_thresh : 0.0); -} - -// returns log(exp(x) + exp(y)). -inline double LogAdd(double x, double y) { - double diff; - - if (x < y) { - diff = x - y; - x = y; - } else { - diff = y - x; - } - // diff is negative. x is now the larger one. - - if (diff >= kMinLogDiffDouble) { - double res; - res = x + Log1p(Exp(diff)); - return res; - } else { - return x; // return the larger one. - } -} - - -// returns log(exp(x) + exp(y)). -inline float LogAdd(float x, float y) { - float diff; - - if (x < y) { - diff = x - y; - x = y; - } else { - diff = y - x; - } - // diff is negative. x is now the larger one. - - if (diff >= kMinLogDiffFloat) { - float res; - res = x + Log1p(Exp(diff)); - return res; - } else { - return x; // return the larger one. - } -} - - -// returns log(exp(x) - exp(y)). -inline double LogSub(double x, double y) { - if (y >= x) { // Throws exception if y>=x. - if (y == x) - return kLogZeroDouble; - else - KALDI_ERR << "Cannot subtract a larger from a smaller number."; - } - - double diff = y - x; // Will be negative. - double res = x + Log(1.0 - Exp(diff)); - - // res might be NAN if diff ~0.0, and 1.0-exp(diff) == 0 to machine precision - if (KALDI_ISNAN(res)) - return kLogZeroDouble; - return res; -} - - -// returns log(exp(x) - exp(y)). -inline float LogSub(float x, float y) { - if (y >= x) { // Throws exception if y>=x. - if (y == x) - return kLogZeroDouble; - else - KALDI_ERR << "Cannot subtract a larger from a smaller number."; - } - - float diff = y - x; // Will be negative. - float res = x + Log(1.0f - Exp(diff)); - - // res might be NAN if diff ~0.0, and 1.0-exp(diff) == 0 to machine precision - if (KALDI_ISNAN(res)) - return kLogZeroFloat; - return res; -} - -/// return abs(a - b) <= relative_tolerance * (abs(a)+abs(b)). -static inline bool ApproxEqual(float a, float b, - float relative_tolerance = 0.001) { - // a==b handles infinities. - if (a == b) return true; - float diff = std::abs(a-b); - if (diff == std::numeric_limits::infinity() - || diff != diff) return false; // diff is +inf or nan. - return (diff <= relative_tolerance*(std::abs(a)+std::abs(b))); -} - -/// assert abs(a - b) <= relative_tolerance * (abs(a)+abs(b)) -static inline void AssertEqual(float a, float b, - float relative_tolerance = 0.001) { - // a==b handles infinities. 
- KALDI_ASSERT(ApproxEqual(a, b, relative_tolerance)); -} - - -// RoundUpToNearestPowerOfTwo does the obvious thing. It crashes if n <= 0. -int32 RoundUpToNearestPowerOfTwo(int32 n); - -/// Returns a / b, rounding towards negative infinity in all cases. -static inline int32 DivideRoundingDown(int32 a, int32 b) { - KALDI_ASSERT(b != 0); - if (a * b >= 0) - return a / b; - else if (a < 0) - return (a - b + 1) / b; - else - return (a - b - 1) / b; -} - -template I Gcd(I m, I n) { - if (m == 0 || n == 0) { - if (m == 0 && n == 0) { // gcd not defined, as all integers are divisors. - KALDI_ERR << "Undefined GCD since m = 0, n = 0."; - } - return (m == 0 ? (n > 0 ? n : -n) : ( m > 0 ? m : -m)); - // return absolute value of whichever is nonzero - } - // could use compile-time assertion - // but involves messing with complex template stuff. - KALDI_ASSERT(std::numeric_limits::is_integer); - while (1) { - m %= n; - if (m == 0) return (n > 0 ? n : -n); - n %= m; - if (n == 0) return (m > 0 ? m : -m); - } -} - -/// Returns the least common multiple of two integers. Will -/// crash unless the inputs are positive. -template I Lcm(I m, I n) { - KALDI_ASSERT(m > 0 && n > 0); - I gcd = Gcd(m, n); - return gcd * (m/gcd) * (n/gcd); -} - - -template void Factorize(I m, std::vector *factors) { - // Splits a number into its prime factors, in sorted order from - // least to greatest, with duplication. A very inefficient - // algorithm, which is mainly intended for use in the - // mixed-radix FFT computation (where we assume most factors - // are small). - KALDI_ASSERT(factors != NULL); - KALDI_ASSERT(m >= 1); // Doesn't work for zero or negative numbers. - factors->clear(); - I small_factors[10] = { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29 }; - - // First try small factors. - for (I i = 0; i < 10; i++) { - if (m == 1) return; // We're done. - while (m % small_factors[i] == 0) { - m /= small_factors[i]; - factors->push_back(small_factors[i]); - } - } - // Next try all odd numbers starting from 31. - for (I j = 31;; j += 2) { - if (m == 1) return; - while (m % j == 0) { - m /= j; - factors->push_back(j); - } - } -} - -inline double Hypot(double x, double y) { return hypot(x, y); } -inline float Hypot(float x, float y) { return hypotf(x, y); } - - - - -} // namespace kaldi - - -#endif // KALDI_BASE_KALDI_MATH_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/kaldi-types.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/kaldi-types.h deleted file mode 100644 index 7ebf4f85386192a65e176d8f0ecde9bb348af4a0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/kaldi-types.h +++ /dev/null @@ -1,75 +0,0 @@ -// base/kaldi-types.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University; -// Jan Silovsky; Yanmin Qian - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. 
-// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_BASE_KALDI_TYPES_H_ -#define KALDI_BASE_KALDI_TYPES_H_ 1 - -namespace kaldi { -// TYPEDEFS .................................................................. -#if (KALDI_DOUBLEPRECISION != 0) -typedef double BaseFloat; -#else -typedef float BaseFloat; -#endif -} - -#ifdef _MSC_VER -#include -#define ssize_t SSIZE_T -#endif - -// we can do this a different way if some platform -// we find in the future lacks stdint.h -#include - -// for discussion on what to do if you need compile kaldi -// without OpenFST, see the bottom of this this file -#include - -namespace kaldi { - using ::int16; - using ::int32; - using ::int64; - using ::uint16; - using ::uint32; - using ::uint64; - typedef float float32; - typedef double double64; -} // end namespace kaldi - -// In a theoretical case you decide compile Kaldi without the OpenFST -// comment the previous namespace statement and uncomment the following -/* -namespace kaldi { - typedef int8_t int8; - typedef int16_t int16; - typedef int32_t int32; - typedef int64_t int64; - - typedef uint8_t uint8; - typedef uint16_t uint16; - typedef uint32_t uint32; - typedef uint64_t uint64; - typedef float float32; - typedef double double64; -} // end namespace kaldi -*/ - -#endif // KALDI_BASE_KALDI_TYPES_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/kaldi-utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/kaldi-utils.h deleted file mode 100644 index bd434d09ed92ec94bc4208f53a4416f941edfdb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/base/kaldi-utils.h +++ /dev/null @@ -1,155 +0,0 @@ -// base/kaldi-utils.h - -// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; -// Saarland University; Karel Vesely; Yanmin Qian - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef KALDI_BASE_KALDI_UTILS_H_ -#define KALDI_BASE_KALDI_UTILS_H_ 1 - -#if defined(_MSC_VER) -# define WIN32_LEAN_AND_MEAN -# define NOMINMAX -# include -#endif - -#ifdef _MSC_VER -#include -#define unlink _unlink -#else -#include -#endif - -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4056 4305 4800 4267 4996 4756 4661) -#if _MSC_VER < 1400 -#define __restrict__ -#else -#define __restrict__ __restrict -#endif -#endif - -#if defined(_MSC_VER) -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (*(pp_orig) = _aligned_malloc(size, align)) -# define KALDI_MEMALIGN_FREE(x) _aligned_free(x) -#elif defined(__CYGWIN__) -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (*(pp_orig) = aligned_alloc(align, size)) -# define KALDI_MEMALIGN_FREE(x) free(x) -#else -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (!posix_memalign(pp_orig, align, size) ? *(pp_orig) : NULL) -# define KALDI_MEMALIGN_FREE(x) free(x) -#endif - -#ifdef __ICC -#pragma warning(disable: 383) // ICPC remark we don't want. -#pragma warning(disable: 810) // ICPC remark we don't want. -#pragma warning(disable: 981) // ICPC remark we don't want. -#pragma warning(disable: 1418) // ICPC remark we don't want. -#pragma warning(disable: 444) // ICPC remark we don't want. -#pragma warning(disable: 869) // ICPC remark we don't want. -#pragma warning(disable: 1287) // ICPC remark we don't want. -#pragma warning(disable: 279) // ICPC remark we don't want. -#pragma warning(disable: 981) // ICPC remark we don't want. -#endif - - -namespace kaldi { - - -// CharToString prints the character in a human-readable form, for debugging. -std::string CharToString(const char &c); - - -inline int MachineIsLittleEndian() { - int check = 1; - return (*reinterpret_cast(&check) != 0); -} - -// This function kaldi::Sleep() provides a portable way -// to sleep for a possibly fractional -// number of seconds. On Windows it's only accurate to microseconds. -void Sleep(float seconds); -} // namespace kaldi - -#define KALDI_SWAP8(a) do { \ - int t = (reinterpret_cast(&a))[0];\ - (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[7];\ - (reinterpret_cast(&a))[7] = t;\ - t = (reinterpret_cast(&a))[1];\ - (reinterpret_cast(&a))[1]=(reinterpret_cast(&a))[6];\ - (reinterpret_cast(&a))[6] = t;\ - t = (reinterpret_cast(&a))[2];\ - (reinterpret_cast(&a))[2]=(reinterpret_cast(&a))[5];\ - (reinterpret_cast(&a))[5] = t;\ - t = (reinterpret_cast(&a))[3];\ - (reinterpret_cast(&a))[3]=(reinterpret_cast(&a))[4];\ - (reinterpret_cast(&a))[4] = t;} while (0) -#define KALDI_SWAP4(a) do { \ - int t = (reinterpret_cast(&a))[0];\ - (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[3];\ - (reinterpret_cast(&a))[3] = t;\ - t = (reinterpret_cast(&a))[1];\ - (reinterpret_cast(&a))[1]=(reinterpret_cast(&a))[2];\ - (reinterpret_cast(&a))[2]=t;} while (0) -#define KALDI_SWAP2(a) do { \ - int t = (reinterpret_cast(&a))[0];\ - (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[1];\ - (reinterpret_cast(&a))[1] = t;} while (0) - - -// Makes copy constructor and operator= private. 
-#define KALDI_DISALLOW_COPY_AND_ASSIGN(type) \ - type(const type&); \ - void operator = (const type&) - -template class KaldiCompileTimeAssert { }; -template<> class KaldiCompileTimeAssert { - public: - static inline void Check() { } -}; - -#define KALDI_COMPILE_TIME_ASSERT(b) KaldiCompileTimeAssert<(b)>::Check() - -#define KALDI_ASSERT_IS_INTEGER_TYPE(I) \ - KaldiCompileTimeAssert::is_specialized \ - && std::numeric_limits::is_integer>::Check() - -#define KALDI_ASSERT_IS_FLOATING_TYPE(F) \ - KaldiCompileTimeAssert::is_specialized \ - && !std::numeric_limits::is_integer>::Check() - -#if defined(_MSC_VER) -#define KALDI_STRCASECMP _stricmp -#elif defined(__CYGWIN__) -#include -#define KALDI_STRCASECMP strcasecmp -#else -#define KALDI_STRCASECMP strcasecmp -#endif -#ifdef _MSC_VER -# define KALDI_STRTOLL(cur_cstr, end_cstr) _strtoi64(cur_cstr, end_cstr, 10); -#else -# define KALDI_STRTOLL(cur_cstr, end_cstr) strtoll(cur_cstr, end_cstr, 10); -#endif - -#endif // KALDI_BASE_KALDI_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/decoder/lattice-faster-decoder.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/decoder/lattice-faster-decoder.cc deleted file mode 100644 index 06f77557fa49a23f6a44d07c327a1b3b081c6dec..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/decoder/lattice-faster-decoder.cc +++ /dev/null @@ -1,1101 +0,0 @@ -// decoder/lattice-faster-decoder.cc - -// Copyright 2009-2012 Microsoft Corporation Mirko Hannemann -// 2013-2018 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2018 Zhehuai Chen -// 2021 Binbin Zhang, Zhendong Peng - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "decoder/lattice-faster-decoder.h" -// #include "lat/lattice-functions.h" - -namespace kaldi { - -// instantiate this class once for each thing you have to decode. -template -LatticeFasterDecoderTpl::LatticeFasterDecoderTpl( - const FST &fst, const LatticeFasterDecoderConfig &config, - const std::shared_ptr &context_graph) - : fst_(&fst), - delete_fst_(false), - config_(config), - num_toks_(0), - context_graph_(context_graph) { - config.Check(); - toks_.SetSize( - 1000); // just so on the first frame we do something reasonable. -} - -template -LatticeFasterDecoderTpl::LatticeFasterDecoderTpl( - const LatticeFasterDecoderConfig &config, FST *fst) - : fst_(fst), delete_fst_(true), config_(config), num_toks_(0) { - config.Check(); - toks_.SetSize( - 1000); // just so on the first frame we do something reasonable. 
-} - -template -LatticeFasterDecoderTpl::~LatticeFasterDecoderTpl() { - DeleteElems(toks_.Clear()); - ClearActiveTokens(); - if (delete_fst_) delete fst_; -} - -template -void LatticeFasterDecoderTpl::InitDecoding() { - // clean up from last time: - DeleteElems(toks_.Clear()); - cost_offsets_.clear(); - ClearActiveTokens(); - warned_ = false; - num_toks_ = 0; - decoding_finalized_ = false; - final_costs_.clear(); - StateId start_state = fst_->Start(); - KALDI_ASSERT(start_state != fst::kNoStateId); - active_toks_.resize(1); - Token *start_tok = new Token(0.0, 0.0, NULL, NULL, NULL); - active_toks_[0].toks = start_tok; - toks_.Insert(start_state, start_tok); - num_toks_++; - ProcessNonemitting(config_.beam); -} - -// Returns true if any kind of traceback is available (not necessarily from -// a final state). It should only very rarely return false; this indicates -// an unusual search error. -template -bool LatticeFasterDecoderTpl::Decode( - DecodableInterface *decodable) { - InitDecoding(); - // We use 1-based indexing for frames in this decoder (if you view it in - // terms of features), but note that the decodable object uses zero-based - // numbering, which we have to correct for when we call it. - AdvanceDecoding(decodable); - FinalizeDecoding(); - - // Returns true if we have any kind of traceback available (not necessarily - // to the end state; query ReachedFinal() for that). - return !active_toks_.empty() && active_toks_.back().toks != NULL; -} - -// Outputs an FST corresponding to the single best path through the lattice. -template -bool LatticeFasterDecoderTpl::GetBestPath( - Lattice *olat, bool use_final_probs) const { - Lattice raw_lat; - GetRawLattice(&raw_lat, use_final_probs); - ShortestPath(raw_lat, olat); - return (olat->NumStates() != 0); -} - -// Outputs an FST corresponding to the raw, state-level lattice -template -bool LatticeFasterDecoderTpl::GetRawLattice( - Lattice *ofst, bool use_final_probs) const { - typedef LatticeArc Arc; - typedef Arc::StateId StateId; - typedef Arc::Weight Weight; - typedef Arc::Label Label; - - // Note: you can't use the old interface (Decode()) if you want to - // get the lattice with use_final_probs = false. You'd have to do - // InitDecoding() and then AdvanceDecoding(). - if (decoding_finalized_ && !use_final_probs) - KALDI_ERR << "You cannot call FinalizeDecoding() and then call " - << "GetRawLattice() with use_final_probs == false"; - - unordered_map final_costs_local; - - const unordered_map &final_costs = - (decoding_finalized_ ? final_costs_ : final_costs_local); - if (!decoding_finalized_ && use_final_probs) - ComputeFinalCosts(&final_costs_local, NULL, NULL); - - ofst->DeleteStates(); - // num-frames plus one (since frames are one-based, and we have - // an extra frame for the start-state). - int32 num_frames = active_toks_.size() - 1; - KALDI_ASSERT(num_frames > 0); - const int32 bucket_count = num_toks_ / 2 + 3; - unordered_map tok_map(bucket_count); - // First create all states. - std::vector token_list; - for (int32 f = 0; f <= num_frames; f++) { - if (active_toks_[f].toks == NULL) { - KALDI_WARN << "GetRawLattice: no tokens active on frame " << f - << ": not producing lattice.\n"; - return false; - } - TopSortTokens(active_toks_[f].toks, &token_list); - for (size_t i = 0; i < token_list.size(); i++) - if (token_list[i] != NULL) tok_map[token_list[i]] = ofst->AddState(); - } - // The next statement sets the start state of the output FST. Because we - // topologically sorted the tokens, state zero must be the start-state. 
- ofst->SetStart(0); - - KALDI_VLOG(4) << "init:" << num_toks_ / 2 + 3 - << " buckets:" << tok_map.bucket_count() - << " load:" << tok_map.load_factor() - << " max:" << tok_map.max_load_factor(); - // Now create all arcs. - for (int32 f = 0; f <= num_frames; f++) { - for (Token *tok = active_toks_[f].toks; tok != NULL; tok = tok->next) { - StateId cur_state = tok_map[tok]; - for (ForwardLinkT *l = tok->links; l != NULL; l = l->next) { - typename unordered_map::const_iterator iter = - tok_map.find(l->next_tok); - StateId nextstate = iter->second; - KALDI_ASSERT(iter != tok_map.end()); - BaseFloat cost_offset = 0.0; - if (l->ilabel != 0) { // emitting.. - KALDI_ASSERT(f >= 0 && f < cost_offsets_.size()); - cost_offset = cost_offsets_[f]; - } - - StateId state = cur_state; - if (l->is_start_boundary) { - StateId tmp = ofst->AddState(); - Arc arc(0, context_graph_->start_tag_id(), Weight(0, 0), tmp); - ofst->AddArc(state, arc); - state = tmp; - } - if (l->is_end_boundary) { - StateId tmp = ofst->AddState(); - Arc arc(0, context_graph_->end_tag_id(), Weight(0, 0), nextstate); - ofst->AddArc(tmp, arc); - nextstate = tmp; - } - - Arc arc(l->ilabel, l->olabel, - Weight(l->graph_cost, l->acoustic_cost - cost_offset), - nextstate); - ofst->AddArc(state, arc); - } - if (f == num_frames) { - if (use_final_probs && !final_costs.empty()) { - typename unordered_map::const_iterator iter = - final_costs.find(tok); - if (iter != final_costs.end()) - ofst->SetFinal(cur_state, LatticeWeight(iter->second, 0)); - } else { - ofst->SetFinal(cur_state, LatticeWeight::One()); - } - } - } - } - - fst::TopSort(ofst); - return (ofst->NumStates() > 0); -} - -// This function is now deprecated, since now we do determinization from outside -// the LatticeFasterDecoder class. Outputs an FST corresponding to the -// lattice-determinized lattice (one path per word sequence). -template -bool LatticeFasterDecoderTpl::GetLattice( - CompactLattice *ofst, bool use_final_probs) const { - Lattice raw_fst; - GetRawLattice(&raw_fst, use_final_probs); - Invert(&raw_fst); // make it so word labels are on the input. - // (in phase where we get backward-costs). - fst::ILabelCompare ilabel_comp; - ArcSort(&raw_fst, ilabel_comp); // sort on ilabel; makes - // lattice-determinization more efficient. - - fst::DeterminizeLatticePrunedOptions lat_opts; - lat_opts.max_mem = config_.det_opts.max_mem; - - DeterminizeLatticePruned(raw_fst, config_.lattice_beam, ofst, lat_opts); - raw_fst.DeleteStates(); // Free memory-- raw_fst no longer needed. - Connect(ofst); // Remove unreachable states... there might be - // a small number of these, in some cases. - // Note: if something went wrong and the raw lattice was empty, - // we should still get to this point in the code without warnings or failures. - return (ofst->NumStates() != 0); -} - -template -void LatticeFasterDecoderTpl::PossiblyResizeHash(size_t num_toks) { - size_t new_sz = static_cast(static_cast(num_toks) * - config_.hash_ratio); - if (new_sz > toks_.Size()) { - toks_.SetSize(new_sz); - } -} - -/* - A note on the definition of extra_cost. - - extra_cost is used in pruning tokens, to save memory. - - extra_cost can be thought of as a beta (backward) cost assuming - we had set the betas on currently-active tokens to all be the negative - of the alphas for those tokens. (So all currently active tokens would - be on (tied) best paths). - - We can use the extra_cost to accurately prune away tokens that we know will - never appear in the lattice. 
If the extra_cost is greater than the desired - lattice beam, the token would provably never appear in the lattice, so we can - prune away the token. - - (Note: we don't update all the extra_costs every time we update a frame; we - only do it every 'config_.prune_interval' frames). - */ - -// FindOrAddToken either locates a token in hash of toks_, -// or if necessary inserts a new, empty token (i.e. with no forward links) -// for the current frame. [note: it's inserted if necessary into hash toks_ -// and also into the singly linked list of tokens active on this frame -// (whose head is at active_toks_[frame]). -template -inline typename LatticeFasterDecoderTpl::Elem * -LatticeFasterDecoderTpl::FindOrAddToken(StateId state, - int32 frame_plus_one, - BaseFloat tot_cost, - Token *backpointer, - bool *changed) { - // Returns the Token pointer. Sets "changed" (if non-NULL) to true - // if the token was newly created or the cost changed. - KALDI_ASSERT(frame_plus_one < active_toks_.size()); - Token *&toks = active_toks_[frame_plus_one].toks; - Elem *e_found = toks_.Insert(state, NULL); - if (e_found->val == NULL) { // no such token presently. - const BaseFloat extra_cost = 0.0; - // tokens on the currently final frame have zero extra_cost - // as any of them could end up - // on the winning path. - Token *new_tok = new Token(tot_cost, extra_cost, NULL, toks, backpointer); - // NULL: no forward links yet - toks = new_tok; - num_toks_++; - e_found->val = new_tok; - if (changed) *changed = true; - return e_found; - } else { - Token *tok = e_found->val; // There is an existing Token for this state. - if (tok->tot_cost > tot_cost) { // replace old token - tok->tot_cost = tot_cost; - // SetBackpointer() just does tok->backpointer = backpointer in - // the case where Token == BackpointerToken, else nothing. - tok->SetBackpointer(backpointer); - // we don't allocate a new token, the old stays linked in active_toks_ - // we only replace the tot_cost - // in the current frame, there are no forward links (and no extra_cost) - // only in ProcessNonemitting we have to delete forward links - // in case we visit a state for the second time - // those forward links, that lead to this replaced token before: - // they remain and will hopefully be pruned later (PruneForwardLinks...) - if (changed) *changed = true; - } else { - if (changed) *changed = false; - } - return e_found; - } -} - -// prunes outgoing links for all tokens in active_toks_[frame] -// it's called by PruneActiveTokens -// all links, that have link_extra_cost > lattice_beam are pruned -template -void LatticeFasterDecoderTpl::PruneForwardLinks( - int32 frame_plus_one, bool *extra_costs_changed, bool *links_pruned, - BaseFloat delta) { - // delta is the amount by which the extra_costs must change - // If delta is larger, we'll tend to go back less far - // toward the beginning of the file. - // extra_costs_changed is set to true if extra_cost was changed for any token - // links_pruned is set to true if any link in any token was pruned - - *extra_costs_changed = false; - *links_pruned = false; - KALDI_ASSERT(frame_plus_one >= 0 && frame_plus_one < active_toks_.size()); - if (active_toks_[frame_plus_one].toks == - NULL) { // empty list; should not happen. - if (!warned_) { - KALDI_WARN << "No tokens alive [doing pruning].. warning first " - "time only for each utterance\n"; - warned_ = true; - } - } - - // We have to iterate until there is no more change, because the links - // are not guaranteed to be in topological order. 
- bool changed = true; // difference new minus old extra cost >= delta ? - while (changed) { - changed = false; - for (Token *tok = active_toks_[frame_plus_one].toks; tok != NULL; - tok = tok->next) { - ForwardLinkT *link, *prev_link = NULL; - // will recompute tok_extra_cost for tok. - BaseFloat tok_extra_cost = std::numeric_limits::infinity(); - // tok_extra_cost is the best (min) of link_extra_cost of outgoing links - for (link = tok->links; link != NULL;) { - // See if we need to excise this link... - Token *next_tok = link->next_tok; - BaseFloat link_extra_cost = - next_tok->extra_cost + - ((tok->tot_cost + link->acoustic_cost + link->graph_cost) - - next_tok->tot_cost); // difference in brackets is >= 0 - // link_exta_cost is the difference in score between the best paths - // through link source state and through link destination state - KALDI_ASSERT(link_extra_cost == link_extra_cost); // check for NaN - // the graph_cost contatins the context score - // if it's the score of the backoff arc, it should be removed. - if (link->context_score < 0) { - link_extra_cost += link->context_score; - } - if (link_extra_cost > config_.lattice_beam) { // excise link - ForwardLinkT *next_link = link->next; - if (prev_link != NULL) - prev_link->next = next_link; - else - tok->links = next_link; - delete link; - link = next_link; // advance link but leave prev_link the same. - *links_pruned = true; - } else { // keep the link and update the tok_extra_cost if needed. - if (link_extra_cost < 0.0) { // this is just a precaution. - // if (link_extra_cost < -0.01) - // KALDI_WARN << "Negative extra_cost: " << link_extra_cost; - link_extra_cost = 0.0; - } - if (link_extra_cost < tok_extra_cost) - tok_extra_cost = link_extra_cost; - prev_link = link; // move to next link - link = link->next; - } - } // for all outgoing links - if (fabs(tok_extra_cost - tok->extra_cost) > delta) - changed = true; // difference new minus old is bigger than delta - tok->extra_cost = tok_extra_cost; - // will be +infinity or <= lattice_beam_. - // infinity indicates, that no forward link survived pruning - } // for all Token on active_toks_[frame] - if (changed) *extra_costs_changed = true; - - // Note: it's theoretically possible that aggressive compiler - // optimizations could cause an infinite loop here for small delta and - // high-dynamic-range scores. - } // while changed -} - -// PruneForwardLinksFinal is a version of PruneForwardLinks that we call -// on the final frame. If there are final tokens active, it uses -// the final-probs for pruning, otherwise it treats all tokens as final. -template -void LatticeFasterDecoderTpl::PruneForwardLinksFinal() { - KALDI_ASSERT(!active_toks_.empty()); - int32 frame_plus_one = active_toks_.size() - 1; - - if (active_toks_[frame_plus_one].toks == - NULL) // empty list; should not happen. - KALDI_WARN << "No tokens alive at end of file"; - - typedef typename unordered_map::const_iterator IterType; - ComputeFinalCosts(&final_costs_, &final_relative_cost_, &final_best_cost_); - decoding_finalized_ = true; - // We call DeleteElems() as a nicety, not because it's really necessary; - // otherwise there would be a time, after calling PruneTokensForFrame() on the - // final frame, when toks_.GetList() or toks_.Clear() would contain pointers - // to nonexistent tokens. - DeleteElems(toks_.Clear()); - - // Now go through tokens on this frame, pruning forward links... may have to - // iterate a few times until there is no more change, because the list is not - // in topological order. 
This is a modified version of the code in
-  // PruneForwardLinks, but here we also take account of the final-probs.
-  bool changed = true;
-  BaseFloat delta = 1.0e-05;
-  while (changed) {
-    changed = false;
-    for (Token *tok = active_toks_[frame_plus_one].toks; tok != NULL;
-         tok = tok->next) {
-      ForwardLinkT *link, *prev_link = NULL;
-      // will recompute tok_extra_cost. It has a term in it that corresponds
-      // to the "final-prob", so instead of initializing tok_extra_cost to
-      // infinity below we set it to the difference between the
-      // (score+final_prob) of this token, and the best such (score+final_prob).
-      BaseFloat final_cost;
-      if (final_costs_.empty()) {
-        final_cost = 0.0;
-      } else {
-        IterType iter = final_costs_.find(tok);
-        if (iter != final_costs_.end())
-          final_cost = iter->second;
-        else
-          final_cost = std::numeric_limits<BaseFloat>::infinity();
-      }
-      BaseFloat tok_extra_cost = tok->tot_cost + final_cost - final_best_cost_;
-      // tok_extra_cost will be a "min" over either directly being final, or
-      // being indirectly final through other links, and the loop below may
-      // decrease its value:
-      for (link = tok->links; link != NULL;) {
-        // See if we need to excise this link...
-        Token *next_tok = link->next_tok;
-        BaseFloat link_extra_cost =
-            next_tok->extra_cost +
-            ((tok->tot_cost + link->acoustic_cost + link->graph_cost) -
-             next_tok->tot_cost);
-        if (link_extra_cost > config_.lattice_beam) {  // excise link
-          ForwardLinkT *next_link = link->next;
-          if (prev_link != NULL)
-            prev_link->next = next_link;
-          else
-            tok->links = next_link;
-          delete link;
-          link = next_link;  // advance link but leave prev_link the same.
-        } else {  // keep the link and update the tok_extra_cost if needed.
-          if (link_extra_cost < 0.0) {  // this is just a precaution.
-            // if (link_extra_cost < -0.01)
-            //   KALDI_WARN << "Negative extra_cost: " << link_extra_cost;
-            link_extra_cost = 0.0;
-          }
-          if (link_extra_cost < tok_extra_cost)
-            tok_extra_cost = link_extra_cost;
-          prev_link = link;
-          link = link->next;
-        }
-      }
-      // prune away tokens worse than lattice_beam above best path. This step
-      // was not necessary in the non-final case because then, this case
-      // showed up as having no forward links. Here, the tok_extra_cost has
-      // an extra component relating to the final-prob.
-      if (tok_extra_cost > config_.lattice_beam)
-        tok_extra_cost = std::numeric_limits<BaseFloat>::infinity();
-      // to be pruned in PruneTokensForFrame
-
-      if (!ApproxEqual(tok->extra_cost, tok_extra_cost, delta)) changed = true;
-      tok->extra_cost =
-          tok_extra_cost;  // will be +infinity or <= lattice_beam_.
-    }
-  }  // while changed
-}
-
-template <typename FST, typename Token>
-BaseFloat LatticeFasterDecoderTpl<FST, Token>::FinalRelativeCost() const {
-  if (!decoding_finalized_) {
-    BaseFloat relative_cost;
-    ComputeFinalCosts(NULL, &relative_cost, NULL);
-    return relative_cost;
-  } else {
-    // we're not allowed to call that function if FinalizeDecoding() has
-    // been called; return a cached value.
-    return final_relative_cost_;
-  }
-}
-
-// Prune away any tokens on this frame that have no forward links.
-// [we don't do this in PruneForwardLinks because it would give us
-// a problem with dangling pointers].
-// It's called by PruneActiveTokens if any forward links have been pruned
-template <typename FST, typename Token>
-void LatticeFasterDecoderTpl<FST, Token>::PruneTokensForFrame(
-    int32 frame_plus_one) {
-  KALDI_ASSERT(frame_plus_one >= 0 && frame_plus_one < active_toks_.size());
-  Token *&toks = active_toks_[frame_plus_one].toks;
-  if (toks == NULL) KALDI_WARN << "No tokens alive [doing pruning]";
-  Token *tok, *next_tok, *prev_tok = NULL;
-  for (tok = toks; tok != NULL; tok = next_tok) {
-    next_tok = tok->next;
-    if (tok->extra_cost == std::numeric_limits<BaseFloat>::infinity()) {
-      // token is unreachable from end of graph; (no forward links survived)
-      // excise tok from list and delete tok.
-      if (prev_tok != NULL)
-        prev_tok->next = tok->next;
-      else
-        toks = tok->next;
-      delete tok;
-      num_toks_--;
-    } else {  // fetch next Token
-      prev_tok = tok;
-    }
-  }
-}
-
-// Go backwards through still-alive tokens, pruning them, starting not from
-// the current frame (where we want to keep all tokens) but from the frame
-// before that.  We go backwards through the frames and stop when we reach a
-// point where the delta-costs are not changing (and the delta controls when we
-// consider a cost to have "not changed").
-template <typename FST, typename Token>
-void LatticeFasterDecoderTpl<FST, Token>::PruneActiveTokens(BaseFloat delta) {
-  int32 cur_frame_plus_one = NumFramesDecoded();
-  int32 num_toks_begin = num_toks_;
-  // The index "f" below represents a "frame plus one", i.e. you'd have to
-  // subtract one to get the corresponding index for the decodable object.
-  for (int32 f = cur_frame_plus_one - 1; f >= 0; f--) {
-    // Reason why we need to prune forward links in this situation:
-    // (1) we have never pruned them (new TokenList)
-    // (2) we have not yet pruned the forward links to the next f,
-    // after any of those tokens have changed their extra_cost.
- if (active_toks_[f].must_prune_forward_links) { - bool extra_costs_changed = false, links_pruned = false; - PruneForwardLinks(f, &extra_costs_changed, &links_pruned, delta); - if (extra_costs_changed && f > 0) // any token has changed extra_cost - active_toks_[f - 1].must_prune_forward_links = true; - if (links_pruned) // any link was pruned - active_toks_[f].must_prune_tokens = true; - active_toks_[f].must_prune_forward_links = false; // job done - } - if (f + 1 < cur_frame_plus_one && // except for last f (no forward links) - active_toks_[f + 1].must_prune_tokens) { - PruneTokensForFrame(f + 1); - active_toks_[f + 1].must_prune_tokens = false; - } - } - KALDI_VLOG(4) << "PruneActiveTokens: pruned tokens from " << num_toks_begin - << " to " << num_toks_; -} - -template -void LatticeFasterDecoderTpl::ComputeFinalCosts( - unordered_map *final_costs, - BaseFloat *final_relative_cost, BaseFloat *final_best_cost) const { - KALDI_ASSERT(!decoding_finalized_); - if (final_costs != NULL) final_costs->clear(); - const Elem *final_toks = toks_.GetList(); - BaseFloat infinity = std::numeric_limits::infinity(); - BaseFloat best_cost = infinity, best_cost_with_final = infinity; - - while (final_toks != NULL) { - StateId state = final_toks->key; - Token *tok = final_toks->val; - const Elem *next = final_toks->tail; - BaseFloat final_cost = fst_->Final(state).Value(); - BaseFloat cost = tok->tot_cost, cost_with_final = cost + final_cost; - best_cost = std::min(cost, best_cost); - best_cost_with_final = std::min(cost_with_final, best_cost_with_final); - if (final_costs != NULL && final_cost != infinity) - (*final_costs)[tok] = final_cost; - final_toks = next; - } - if (final_relative_cost != NULL) { - if (best_cost == infinity && best_cost_with_final == infinity) { - // Likely this will only happen if there are no tokens surviving. - // This seems the least bad way to handle it. - *final_relative_cost = infinity; - } else { - *final_relative_cost = best_cost_with_final - best_cost; - } - } - if (final_best_cost != NULL) { - if (best_cost_with_final != infinity) { // final-state exists. - *final_best_cost = best_cost_with_final; - } else { // no final-state exists. - *final_best_cost = best_cost; - } - } -} - -template -void LatticeFasterDecoderTpl::AdvanceDecoding( - DecodableInterface *decodable, int32 max_num_frames) { - if (std::is_same >::value) { - // if the type 'FST' is the FST base-class, then see if the FST type of fst_ - // is actually VectorFst or ConstFst. If so, call the AdvanceDecoding() - // function after casting *this to the more specific type. - if (fst_->Type() == "const") { - LatticeFasterDecoderTpl, Token> *this_cast = - reinterpret_cast< - LatticeFasterDecoderTpl, Token> *>( - this); - this_cast->AdvanceDecoding(decodable, max_num_frames); - return; - } else if (fst_->Type() == "vector") { - LatticeFasterDecoderTpl, Token> *this_cast = - reinterpret_cast< - LatticeFasterDecoderTpl, Token> *>( - this); - this_cast->AdvanceDecoding(decodable, max_num_frames); - return; - } - } - - KALDI_ASSERT(!active_toks_.empty() && !decoding_finalized_ && - "You must call InitDecoding() before AdvanceDecoding"); - int32 num_frames_ready = decodable->NumFramesReady(); - // num_frames_ready must be >= num_frames_decoded, or else - // the number of frames ready must have decreased (which doesn't - // make sense) or the decodable object changed between calls - // (which isn't allowed). 
- KALDI_ASSERT(num_frames_ready >= NumFramesDecoded()); - int32 target_frames_decoded = num_frames_ready; - if (max_num_frames >= 0) - target_frames_decoded = - std::min(target_frames_decoded, NumFramesDecoded() + max_num_frames); - while (NumFramesDecoded() < target_frames_decoded) { - if (NumFramesDecoded() % config_.prune_interval == 0) { - PruneActiveTokens(config_.lattice_beam * config_.prune_scale); - } - BaseFloat cost_cutoff = ProcessEmitting(decodable); - ProcessNonemitting(cost_cutoff); - } -} - -// FinalizeDecoding() is a version of PruneActiveTokens that we call -// (optionally) on the final frame. Takes into account the final-prob of -// tokens. This function used to be called PruneActiveTokensFinal(). -template -void LatticeFasterDecoderTpl::FinalizeDecoding() { - int32 final_frame_plus_one = NumFramesDecoded(); - int32 num_toks_begin = num_toks_; - // PruneForwardLinksFinal() prunes final frame (with final-probs), and - // sets decoding_finalized_. - PruneForwardLinksFinal(); - for (int32 f = final_frame_plus_one - 1; f >= 0; f--) { - bool b1, b2; // values not used. - BaseFloat dontcare = 0.0; // delta of zero means we must always update - PruneForwardLinks(f, &b1, &b2, dontcare); - PruneTokensForFrame(f + 1); - } - PruneTokensForFrame(0); - KALDI_VLOG(4) << "pruned tokens from " << num_toks_begin << " to " - << num_toks_; -} - -/// Gets the weight cutoff. Also counts the active tokens. -template -BaseFloat LatticeFasterDecoderTpl::GetCutoff( - Elem *list_head, size_t *tok_count, BaseFloat *adaptive_beam, - Elem **best_elem) { - BaseFloat best_weight = std::numeric_limits::infinity(); - // positive == high cost == bad. - size_t count = 0; - if (config_.max_active == std::numeric_limits::max() && - config_.min_active == 0) { - for (Elem *e = list_head; e != NULL; e = e->tail, count++) { - BaseFloat w = static_cast(e->val->tot_cost); - if (w < best_weight) { - best_weight = w; - if (best_elem) *best_elem = e; - } - } - if (tok_count != NULL) *tok_count = count; - if (adaptive_beam != NULL) *adaptive_beam = config_.beam; - return best_weight + config_.beam; - } else { - tmp_array_.clear(); - for (Elem *e = list_head; e != NULL; e = e->tail, count++) { - BaseFloat w = e->val->tot_cost; - tmp_array_.push_back(w); - if (w < best_weight) { - best_weight = w; - if (best_elem) *best_elem = e; - } - } - if (tok_count != NULL) *tok_count = count; - - BaseFloat beam_cutoff = best_weight + config_.beam, - min_active_cutoff = std::numeric_limits::infinity(), - max_active_cutoff = std::numeric_limits::infinity(); - - KALDI_VLOG(6) << "Number of tokens active on frame " << NumFramesDecoded() - << " is " << tmp_array_.size(); - - if (tmp_array_.size() > static_cast(config_.max_active)) { - std::nth_element(tmp_array_.begin(), - tmp_array_.begin() + config_.max_active, - tmp_array_.end()); - max_active_cutoff = tmp_array_[config_.max_active]; - } - if (max_active_cutoff < beam_cutoff) { // max_active is tighter than beam. - if (adaptive_beam) - *adaptive_beam = max_active_cutoff - best_weight + config_.beam_delta; - return max_active_cutoff; - } - if (tmp_array_.size() > static_cast(config_.min_active)) { - if (config_.min_active == 0) { - min_active_cutoff = best_weight; - } else { - std::nth_element( - tmp_array_.begin(), tmp_array_.begin() + config_.min_active, - tmp_array_.size() > static_cast(config_.max_active) - ? 
tmp_array_.begin() + config_.max_active - : tmp_array_.end()); - min_active_cutoff = tmp_array_[config_.min_active]; - } - } - if (min_active_cutoff > beam_cutoff) { // min_active is looser than beam. - if (adaptive_beam) - *adaptive_beam = min_active_cutoff - best_weight + config_.beam_delta; - return min_active_cutoff; - } else { - *adaptive_beam = config_.beam; - return beam_cutoff; - } - } -} - -template -BaseFloat LatticeFasterDecoderTpl::ProcessEmitting( - DecodableInterface *decodable) { - KALDI_ASSERT(active_toks_.size() > 0); - int32 frame = - active_toks_.size() - 1; // frame is the frame-index - // (zero-based) used to get likelihoods - // from the decodable object. - active_toks_.resize(active_toks_.size() + 1); - - Elem *final_toks = - toks_.Clear(); // analogous to swapping prev_toks_ / cur_toks_ - // in simple-decoder.h. Removes the Elems from - // being indexed in the hash in toks_. - Elem *best_elem = NULL; - BaseFloat adaptive_beam; - size_t tok_cnt; - BaseFloat cur_cutoff = - GetCutoff(final_toks, &tok_cnt, &adaptive_beam, &best_elem); - KALDI_VLOG(6) << "Adaptive beam on frame " << NumFramesDecoded() << " is " - << adaptive_beam; - - PossiblyResizeHash( - tok_cnt); // This makes sure the hash is always big enough. - - BaseFloat next_cutoff = std::numeric_limits::infinity(); - // pruning "online" before having seen all tokens - - BaseFloat cost_offset = 0.0; // Used to keep probabilities in a good - // dynamic range. - - // First process the best token to get a hopefully - // reasonably tight bound on the next cutoff. The only - // products of the next block are "next_cutoff" and "cost_offset". - if (best_elem) { - StateId state = best_elem->key; - Token *tok = best_elem->val; - cost_offset = -tok->tot_cost; - for (fst::ArcIterator aiter(*fst_, state); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - if (arc.ilabel != 0) { // propagate.. - BaseFloat new_weight = arc.weight.Value() + cost_offset - - decodable->LogLikelihood(frame, arc.ilabel) + - tok->tot_cost; - if (state != arc.nextstate) { - new_weight += config_.length_penalty; - } - if (new_weight + adaptive_beam < next_cutoff) - next_cutoff = new_weight + adaptive_beam; - } - } - } - - // Store the offset on the acoustic likelihoods that we're applying. - // Could just do cost_offsets_.push_back(cost_offset), but we - // do it this way as it's more robust to future code changes. - cost_offsets_.resize(frame + 1, 0.0); - cost_offsets_[frame] = cost_offset; - - // the tokens are now owned here, in final_toks, and the hash is empty. - // 'owned' is a complex thing here; the point is we need to call DeleteElem - // on each elem 'e' to let toks_ know we're done with them. - for (Elem *e = final_toks, *e_tail; e != NULL; e = e_tail) { - // loop this way because we delete "e" as we go. - StateId state = e->key; - Token *tok = e->val; - if (tok->tot_cost <= cur_cutoff) { - for (fst::ArcIterator aiter(*fst_, state); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - if (arc.ilabel != 0) { // propagate.. 
- BaseFloat ac_cost = cost_offset - - decodable->LogLikelihood(frame, arc.ilabel), - graph_cost = arc.weight.Value(); - if (state != arc.nextstate) { - graph_cost += config_.length_penalty; - } - BaseFloat cur_cost = tok->tot_cost, - tot_cost = cur_cost + ac_cost + graph_cost; - if (tot_cost >= next_cutoff) - continue; - else if (tot_cost + adaptive_beam < next_cutoff) - next_cutoff = - tot_cost + adaptive_beam; // prune by best current token - // Note: the frame indexes into active_toks_ are one-based, - // hence the + 1. - Elem *e_next = - FindOrAddToken(arc.nextstate, frame + 1, tot_cost, tok, NULL); - // NULL: no change indicator needed - - bool is_start_boundary = false; - bool is_end_boundary = false; - float context_score = 0; - if (context_graph_) { - if (arc.olabel == 0) { - e_next->val->context_state = tok->context_state; - } else { - e_next->val->context_state = context_graph_->GetNextState( - tok->context_state, arc.olabel, &context_score, - &is_start_boundary, &is_end_boundary); - graph_cost -= context_score; - } - } - // Add ForwardLink from tok to next_tok (put on head of list - // tok->links) - tok->links = new ForwardLinkT(e_next->val, arc.ilabel, arc.olabel, - graph_cost, ac_cost, is_start_boundary, - is_end_boundary, tok->links); - tok->links->context_score = context_score; - } - } // for all arcs - } - e_tail = e->tail; - toks_.Delete(e); // delete Elem - } - return next_cutoff; -} - -// static inline -template -void LatticeFasterDecoderTpl::DeleteForwardLinks(Token *tok) { - ForwardLinkT *l = tok->links, *m; - while (l != NULL) { - m = l->next; - delete l; - l = m; - } - tok->links = NULL; -} - -template -void LatticeFasterDecoderTpl::ProcessNonemitting(BaseFloat cutoff) { - KALDI_ASSERT(!active_toks_.empty()); - int32 frame = static_cast(active_toks_.size()) - 2; - // Note: "frame" is the time-index we just processed, or -1 if - // we are processing the nonemitting transitions before the - // first frame (called from InitDecoding()). - - // Processes nonemitting arcs for one frame. Propagates within toks_. - // Note-- this queue structure is not very optimal as - // it may cause us to process states unnecessarily (e.g. more than once), - // but in the baseline code, turning this vector into a set to fix this - // problem did not improve overall speed. - - KALDI_ASSERT(queue_.empty()); - - if (toks_.GetList() == NULL) { - if (!warned_) { - KALDI_WARN << "Error, no surviving tokens: frame is " << frame; - warned_ = true; - } - } - - int before = 0, after = 0; - for (const Elem *e = toks_.GetList(); e != NULL; e = e->tail) { - StateId state = e->key; - if (fst_->NumInputEpsilons(state) != 0) queue_.push_back(e); - ++before; - } - - while (!queue_.empty()) { - ++after; - const Elem *e = queue_.back(); - queue_.pop_back(); - - StateId state = e->key; - Token *tok = - e->val; // would segfault if e is a NULL pointer but this can't happen. - BaseFloat cur_cost = tok->tot_cost; - if (cur_cost >= cutoff) // Don't bother processing successors. - continue; - // If "tok" has any existing forward links, delete them, - // because we're about to regenerate them. This is a kind - // of non-optimality (remember, this is the simple decoder), - // but since most states are emitting it's not a huge issue. - DeleteForwardLinks(tok); // necessary when re-visiting - tok->links = NULL; - for (fst::ArcIterator aiter(*fst_, state); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - if (arc.ilabel == 0) { // propagate nonemitting only... 
- BaseFloat graph_cost = arc.weight.Value(), - tot_cost = cur_cost + graph_cost; - if (tot_cost < cutoff) { - bool changed; - - Elem *e_new = - FindOrAddToken(arc.nextstate, frame + 1, tot_cost, tok, &changed); - - bool is_start_boundary = false; - bool is_end_boundary = false; - float context_score = 0; - if (context_graph_) { - if (arc.olabel == 0) { - e_new->val->context_state = tok->context_state; - } else { - e_new->val->context_state = context_graph_->GetNextState( - tok->context_state, arc.olabel, &context_score, - &is_start_boundary, &is_end_boundary); - graph_cost -= context_score; - } - } - - tok->links = - new ForwardLinkT(e_new->val, 0, arc.olabel, graph_cost, 0, - is_start_boundary, is_end_boundary, tok->links); - tok->links->context_score = context_score; - - // "changed" tells us whether the new token has a different - // cost from before, or is new [if so, add into queue]. - if (changed && fst_->NumInputEpsilons(arc.nextstate) != 0) - queue_.push_back(e_new); - } - } - } // for all arcs - } // while queue not empty - KALDI_VLOG(3) << "ProcessNonemitting " << before << " " << after; -} - -template -void LatticeFasterDecoderTpl::DeleteElems(Elem *list) { - for (Elem *e = list, *e_tail; e != NULL; e = e_tail) { - e_tail = e->tail; - toks_.Delete(e); - } -} - -template -void LatticeFasterDecoderTpl< - FST, Token>::ClearActiveTokens() { // a cleanup routine, at utt end/begin - for (size_t i = 0; i < active_toks_.size(); i++) { - // Delete all tokens alive on this frame, and any forward - // links they may have. - for (Token *tok = active_toks_[i].toks; tok != NULL;) { - DeleteForwardLinks(tok); - Token *next_tok = tok->next; - delete tok; - num_toks_--; - tok = next_tok; - } - } - active_toks_.clear(); - KALDI_ASSERT(num_toks_ == 0); -} - -// static -template -void LatticeFasterDecoderTpl::TopSortTokens( - Token *tok_list, std::vector *topsorted_list) { - unordered_map token2pos; - using std::unordered_set; - typedef typename unordered_map::iterator IterType; - int32 num_toks = 0; - for (Token *tok = tok_list; tok != NULL; tok = tok->next) num_toks++; - int32 cur_pos = 0; - // We assign the tokens numbers num_toks - 1, ... , 2, 1, 0. - // This is likely to be in closer to topological order than - // if we had given them ascending order, because of the way - // new tokens are put at the front of the list. - for (Token *tok = tok_list; tok != NULL; tok = tok->next) - token2pos[tok] = num_toks - ++cur_pos; - - unordered_set reprocess; - - for (IterType iter = token2pos.begin(); iter != token2pos.end(); ++iter) { - Token *tok = iter->first; - int32 pos = iter->second; - for (ForwardLinkT *link = tok->links; link != NULL; link = link->next) { - if (link->ilabel == 0) { - // We only need to consider epsilon links, since non-epsilon links - // transition between frames and this function only needs to sort a list - // of tokens from a single frame. - IterType following_iter = token2pos.find(link->next_tok); - if (following_iter != token2pos.end()) { // another token on this - // frame, so must consider it. - int32 next_pos = following_iter->second; - if (next_pos < pos) { // reassign the position of the next Token. - following_iter->second = cur_pos++; - reprocess.insert(link->next_tok); - } - } - } - } - // In case we had previously assigned this token to be reprocessed, we can - // erase it from that set because it's "happy now" (we just processed it). - reprocess.erase(tok); - } - - size_t max_loop = 1000000, - loop_count; // max_loop is to detect epsilon cycles. 
- for (loop_count = 0; !reprocess.empty() && loop_count < max_loop; - ++loop_count) { - std::vector reprocess_vec; - for (typename unordered_set::iterator iter = reprocess.begin(); - iter != reprocess.end(); ++iter) - reprocess_vec.push_back(*iter); - reprocess.clear(); - for (typename std::vector::iterator iter = reprocess_vec.begin(); - iter != reprocess_vec.end(); ++iter) { - Token *tok = *iter; - int32 pos = token2pos[tok]; - // Repeat the processing we did above (for comments, see above). - for (ForwardLinkT *link = tok->links; link != NULL; link = link->next) { - if (link->ilabel == 0) { - IterType following_iter = token2pos.find(link->next_tok); - if (following_iter != token2pos.end()) { - int32 next_pos = following_iter->second; - if (next_pos < pos) { - following_iter->second = cur_pos++; - reprocess.insert(link->next_tok); - } - } - } - } - } - } - KALDI_ASSERT(loop_count < max_loop && - "Epsilon loops exist in your decoding " - "graph (this is not allowed!)"); - - topsorted_list->clear(); - topsorted_list->resize(cur_pos, - NULL); // create a list with NULLs in between. - for (IterType iter = token2pos.begin(); iter != token2pos.end(); ++iter) - (*topsorted_list)[iter->second] = iter->first; -} - -// Instantiate the template for the combination of token types and FST types -// that we'll need. -template class LatticeFasterDecoderTpl, - decoder::StdToken>; -template class LatticeFasterDecoderTpl, - decoder::StdToken>; -template class LatticeFasterDecoderTpl, - decoder::StdToken>; - -// template class LatticeFasterDecoderTpl; template class -// LatticeFasterDecoderTpl; - -template class LatticeFasterDecoderTpl, - decoder::BackpointerToken>; -template class LatticeFasterDecoderTpl, - decoder::BackpointerToken>; -template class LatticeFasterDecoderTpl, - decoder::BackpointerToken>; -// template class LatticeFasterDecoderTpl; template class -// LatticeFasterDecoderTpl; - -} // end namespace kaldi. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/decoder/lattice-faster-decoder.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/decoder/lattice-faster-decoder.h deleted file mode 100644 index 0152b85447e354b770745b748d266b1ca2d57024..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/decoder/lattice-faster-decoder.h +++ /dev/null @@ -1,558 +0,0 @@ -// decoder/lattice-faster-decoder.h - -// Copyright 2009-2013 Microsoft Corporation; Mirko Hannemann; -// 2013-2014 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2018 Zhehuai Chen -// 2021 Binbin Zhang, Zhendong Peng - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef KALDI_DECODER_LATTICE_FASTER_DECODER_H_ -#define KALDI_DECODER_LATTICE_FASTER_DECODER_H_ - -#include -#include -#include -#include - -#include "base/kaldi-common.h" -#include "decoder/context_graph.h" -#include "fst/fstlib.h" -#include "fstext/fstext-lib.h" -#include "itf/decodable-itf.h" -#include "lat/determinize-lattice-pruned.h" -#include "lat/kaldi-lattice.h" -#include "util/hash-list.h" - -namespace kaldi { - -struct LatticeFasterDecoderConfig { - BaseFloat beam; - int32 max_active; - int32 min_active; - BaseFloat lattice_beam; - int32 prune_interval; - bool determinize_lattice; // not inspected by this class... used in - // command-line program. - BaseFloat beam_delta; - BaseFloat hash_ratio; - // Note: we don't make prune_scale configurable on the command line, it's not - // a very important parameter. It affects the algorithm that prunes the - // tokens as we go. - BaseFloat prune_scale; - BaseFloat length_penalty; // for balancing the del/ins ratio, suggested -3.0 - - // Most of the options inside det_opts are not actually queried by the - // LatticeFasterDecoder class itself, but by the code that calls it, for - // example in the function DecodeUtteranceLatticeFaster. - fst::DeterminizeLatticePhonePrunedOptions det_opts; - - LatticeFasterDecoderConfig() - : beam(16.0), - max_active(std::numeric_limits::max()), - min_active(200), - lattice_beam(10.0), - prune_interval(25), - determinize_lattice(true), - beam_delta(0.5), - hash_ratio(2.0), - prune_scale(0.1), - length_penalty(0.0) {} - void Register(OptionsItf *opts) { - det_opts.Register(opts); - opts->Register("beam", &beam, - "Decoding beam. Larger->slower, more accurate."); - opts->Register("max-active", &max_active, - "Decoder max active states. Larger->slower; " - "more accurate"); - opts->Register("min-active", &min_active, - "Decoder minimum #active states."); - opts->Register("lattice-beam", &lattice_beam, - "Lattice generation beam. Larger->slower, " - "and deeper lattices"); - opts->Register("prune-interval", &prune_interval, - "Interval (in frames) at " - "which to prune tokens"); - opts->Register( - "determinize-lattice", &determinize_lattice, - "If true, " - "determinize the lattice (lattice-determinization, keeping only " - "best pdf-sequence for each word-sequence)."); - opts->Register( - "beam-delta", &beam_delta, - "Increment used in decoding-- this " - "parameter is obscure and relates to a speedup in the way the " - "max-active constraint is applied. Larger is more accurate."); - opts->Register("hash-ratio", &hash_ratio, - "Setting used in decoder to " - "control hash behavior"); - } - void Check() const { - KALDI_ASSERT(beam > 0.0 && max_active > 1 && lattice_beam > 0.0 && - min_active <= max_active && prune_interval > 0 && - beam_delta > 0.0 && hash_ratio >= 1.0 && prune_scale > 0.0 && - prune_scale < 1.0); - } -}; - -namespace decoder { -// We will template the decoder on the token type as well as the FST type; this -// is a mechanism so that we can use the same underlying decoder code for -// versions of the decoder that support quickly getting the best path -// (LatticeFasterOnlineDecoder, see lattice-faster-online-decoder.h) and also -// those that do not (LatticeFasterDecoder). - -// ForwardLinks are the links from a token to a token on the next frame. -// or sometimes on the current frame (for input-epsilon links). 
-template -struct ForwardLink { - using Label = fst::StdArc::Label; - - Token *next_tok; // the next token [or NULL if represents final-state] - Label ilabel; // ilabel on arc - Label olabel; // olabel on arc - BaseFloat graph_cost; // graph cost of traversing arc (contains LM, etc.) - BaseFloat acoustic_cost; // acoustic cost (pre-scaled) of traversing arc - bool is_start_boundary; - bool is_end_boundary; - float context_score; - ForwardLink *next; // next in singly-linked list of forward arcs (arcs - // in the state-level lattice) from a token. - inline ForwardLink(Token *next_tok, Label ilabel, Label olabel, - BaseFloat graph_cost, BaseFloat acoustic_cost, - bool is_start_boundary, bool is_end_boundary, - ForwardLink *next) - : next_tok(next_tok), - ilabel(ilabel), - olabel(olabel), - graph_cost(graph_cost), - acoustic_cost(acoustic_cost), - is_start_boundary(is_start_boundary), - is_end_boundary(is_end_boundary), - context_score(0), - next(next) {} -}; - -struct StdToken { - using ForwardLinkT = ForwardLink; - using Token = StdToken; - - // Standard token type for LatticeFasterDecoder. Each active HCLG - // (decoding-graph) state on each frame has one token. - - // tot_cost is the total (LM + acoustic) cost from the beginning of the - // utterance up to this point. (but see cost_offset_, which is subtracted - // to keep it in a good numerical range). - BaseFloat tot_cost; - - // exta_cost is >= 0. After calling PruneForwardLinks, this equals the - // minimum difference between the cost of the best path that this link is a - // part of, and the cost of the absolute best path, under the assumption that - // any of the currently active states at the decoding front may eventually - // succeed (e.g. if you were to take the currently active states one by one - // and compute this difference, and then take the minimum). - BaseFloat extra_cost; - - int context_state = 0; - - // 'links' is the head of singly-linked list of ForwardLinks, which is what we - // use for lattice generation. - ForwardLinkT *links; - - // 'next' is the next in the singly-linked list of tokens for this frame. - Token *next; - - // This function does nothing and should be optimized out; it's needed - // so we can share the regular LatticeFasterDecoderTpl code and the code - // for LatticeFasterOnlineDecoder that supports fast traceback. - inline void SetBackpointer(Token *backpointer) {} - - // This constructor just ignores the 'backpointer' argument. That argument is - // needed so that we can use the same decoder code for LatticeFasterDecoderTpl - // and LatticeFasterOnlineDecoderTpl (which needs backpointers to support a - // fast way to obtain the best path). - inline StdToken(BaseFloat tot_cost, BaseFloat extra_cost, ForwardLinkT *links, - Token *next, Token *backpointer) - : tot_cost(tot_cost), - extra_cost(extra_cost), - links(links), - context_state(0), - next(next) {} -}; - -struct BackpointerToken { - using ForwardLinkT = ForwardLink; - using Token = BackpointerToken; - - // BackpointerToken is like Token but also - // Standard token type for LatticeFasterDecoder. Each active HCLG - // (decoding-graph) state on each frame has one token. - - // tot_cost is the total (LM + acoustic) cost from the beginning of the - // utterance up to this point. (but see cost_offset_, which is subtracted - // to keep it in a good numerical range). - BaseFloat tot_cost; - - // exta_cost is >= 0. 
After calling PruneForwardLinks, this equals - // the minimum difference between the cost of the best path, and the cost of - // this is on, and the cost of the absolute best path, under the assumption - // that any of the currently active states at the decoding front may - // eventually succeed (e.g. if you were to take the currently active states - // one by one and compute this difference, and then take the minimum). - BaseFloat extra_cost; - - int context_state = 0; - - // 'links' is the head of singly-linked list of ForwardLinks, which is what we - // use for lattice generation. - ForwardLinkT *links; - - // 'next' is the next in the singly-linked list of tokens for this frame. - BackpointerToken *next; - - // Best preceding BackpointerToken (could be a on this frame, connected to - // this via an epsilon transition, or on a previous frame). This is only - // required for an efficient GetBestPath function in - // LatticeFasterOnlineDecoderTpl; it plays no part in the lattice generation - // (the "links" list is what stores the forward links, for that). - Token *backpointer; - - inline void SetBackpointer(Token *backpointer) { - this->backpointer = backpointer; - } - - inline BackpointerToken(BaseFloat tot_cost, BaseFloat extra_cost, - ForwardLinkT *links, Token *next, Token *backpointer) - : tot_cost(tot_cost), - extra_cost(extra_cost), - links(links), - next(next), - backpointer(backpointer), - context_state(0) {} -}; - -} // namespace decoder - -/** This is the "normal" lattice-generating decoder. - See \ref lattices_generation \ref decoders_faster and \ref decoders_simple - for more information. - - The decoder is templated on the FST type and the token type. The token type - will normally be StdToken, but also may be BackpointerToken which is to - support quick lookup of the current best path (see - lattice-faster-online-decoder.h) - - The FST you invoke this decoder which is expected to equal - Fst::Fst, a.k.a. StdFst, or GrammarFst. If you invoke it with - FST == StdFst and it notices that the actual FST type is - fst::VectorFst or fst::ConstFst, the decoder object - will internally cast itself to one that is templated on those more specific - types; this is an optimization for speed. - */ -template -class LatticeFasterDecoderTpl { - public: - using Arc = typename FST::Arc; - using Label = typename Arc::Label; - using StateId = typename Arc::StateId; - using Weight = typename Arc::Weight; - using ForwardLinkT = decoder::ForwardLink; - - // Instantiate this class once for each thing you have to decode. - // This version of the constructor does not take ownership of - // 'fst'. - LatticeFasterDecoderTpl( - const FST &fst, const LatticeFasterDecoderConfig &config, - const std::shared_ptr &context_graph); - - // This version of the constructor takes ownership of the fst, and will delete - // it when this object is destroyed. - LatticeFasterDecoderTpl(const LatticeFasterDecoderConfig &config, FST *fst); - - void SetOptions(const LatticeFasterDecoderConfig &config) { - config_ = config; - } - - const LatticeFasterDecoderConfig &GetOptions() const { return config_; } - - ~LatticeFasterDecoderTpl(); - - /// Decodes until there are no more frames left in the "decodable" object.. - /// note, this may block waiting for input if the "decodable" object blocks. - /// Returns true if any kind of traceback is available (not necessarily from a - /// final state). - bool Decode(DecodableInterface *decodable); - - /// says whether a final-state was active on the last frame. 
If it was not, - /// the lattice (or traceback) will end with states that are not final-states. - bool ReachedFinal() const { - return FinalRelativeCost() != std::numeric_limits::infinity(); - } - - /// Outputs an FST corresponding to the single best path through the lattice. - /// Returns true if result is nonempty (using the return status is deprecated, - /// it will become void). If "use_final_probs" is true AND we reached the - /// final-state of the graph then it will include those as final-probs, else - /// it will treat all final-probs as one. Note: this just calls - /// GetRawLattice() and figures out the shortest path. - bool GetBestPath(Lattice *ofst, bool use_final_probs = true) const; - - /// Outputs an FST corresponding to the raw, state-level - /// tracebacks. Returns true if result is nonempty. - /// If "use_final_probs" is true AND we reached the final-state - /// of the graph then it will include those as final-probs, else - /// it will treat all final-probs as one. - /// The raw lattice will be topologically sorted. - /// - /// See also GetRawLatticePruned in lattice-faster-online-decoder.h, - /// which also supports a pruning beam, in case for some reason - /// you want it pruned tighter than the regular lattice beam. - /// We could put that here in future needed. - bool GetRawLattice(Lattice *ofst, bool use_final_probs = true) const; - - /// [Deprecated, users should now use GetRawLattice and determinize it - /// themselves, e.g. using DeterminizeLatticePhonePrunedWrapper]. - /// Outputs an FST corresponding to the lattice-determinized - /// lattice (one path per word sequence). Returns true if result is - /// nonempty. If "use_final_probs" is true AND we reached the final-state of - /// the graph then it will include those as final-probs, else it will treat - /// all final-probs as one. - bool GetLattice(CompactLattice *ofst, bool use_final_probs = true) const; - - /// InitDecoding initializes the decoding, and should only be used if you - /// intend to call AdvanceDecoding(). If you call Decode(), you don't need to - /// call this. You can also call InitDecoding if you have already decoded an - /// utterance and want to start with a new utterance. - void InitDecoding(); - - /// This will decode until there are no more frames ready in the decodable - /// object. You can keep calling it each time more frames become available. - /// If max_num_frames is specified, it specifies the maximum number of frames - /// the function will decode before returning. - void AdvanceDecoding(DecodableInterface *decodable, - int32 max_num_frames = -1); - - /// This function may be optionally called after AdvanceDecoding(), when you - /// do not plan to decode any further. It does an extra pruning step that - /// will help to prune the lattices output by GetLattice and (particularly) - /// GetRawLattice more completely, particularly toward the end of the - /// utterance. If you call this, you cannot call AdvanceDecoding again (it - /// will fail), and you cannot call GetLattice() and related functions with - /// use_final_probs = false. Used to be called PruneActiveTokensFinal(). - void FinalizeDecoding(); - - /// FinalRelativeCost() serves the same purpose as ReachedFinal(), but gives - /// more information. It returns the difference between the best (final-cost - /// plus cost) of any token on the final frame, and the best cost of any token - /// on the final frame. If it is infinity it means no final-states were - /// present on the final frame. It will usually be nonnegative. 
If it not - /// too positive (e.g. < 5 is my first guess, but this is not tested) you can - /// take it as a good indication that we reached the final-state with - /// reasonable likelihood. - BaseFloat FinalRelativeCost() const; - - // Returns the number of frames decoded so far. The value returned changes - // whenever we call ProcessEmitting(). - inline int32 NumFramesDecoded() const { return active_toks_.size() - 1; } - - protected: - // we make things protected instead of private, as code in - // LatticeFasterOnlineDecoderTpl, which inherits from this, also uses the - // internals. - - // Deletes the elements of the singly linked list tok->links. - inline static void DeleteForwardLinks(Token *tok); - - // head of per-frame list of Tokens (list is in topological order), - // and something saying whether we ever pruned it using PruneForwardLinks. - struct TokenList { - Token *toks; - bool must_prune_forward_links; - bool must_prune_tokens; - TokenList() - : toks(NULL), must_prune_forward_links(true), must_prune_tokens(true) {} - }; - - using Elem = typename HashList::Elem; - // Equivalent to: - // struct Elem { - // StateId key; - // Token *val; - // Elem *tail; - // }; - - void PossiblyResizeHash(size_t num_toks); - - // FindOrAddToken either locates a token in hash of toks_, or if necessary - // inserts a new, empty token (i.e. with no forward links) for the current - // frame. [note: it's inserted if necessary into hash toks_ and also into the - // singly linked list of tokens active on this frame (whose head is at - // active_toks_[frame]). The frame_plus_one argument is the acoustic frame - // index plus one, which is used to index into the active_toks_ array. - // Returns the Token pointer. Sets "changed" (if non-NULL) to true if the - // token was newly created or the cost changed. - // If Token == StdToken, the 'backpointer' argument has no purpose (and will - // hopefully be optimized out). - inline Elem *FindOrAddToken(StateId state, int32 frame_plus_one, - BaseFloat tot_cost, Token *backpointer, - bool *changed); - - // prunes outgoing links for all tokens in active_toks_[frame] - // it's called by PruneActiveTokens - // all links, that have link_extra_cost > lattice_beam are pruned - // delta is the amount by which the extra_costs must change - // before we set *extra_costs_changed = true. - // If delta is larger, we'll tend to go back less far - // toward the beginning of the file. - // extra_costs_changed is set to true if extra_cost was changed for any token - // links_pruned is set to true if any link in any token was pruned - void PruneForwardLinks(int32 frame_plus_one, bool *extra_costs_changed, - bool *links_pruned, BaseFloat delta); - - // This function computes the final-costs for tokens active on the final - // frame. It outputs to final-costs, if non-NULL, a map from the Token* - // pointer to the final-prob of the corresponding state, for all Tokens - // that correspond to states that have final-probs. This map will be - // empty if there were no final-probs. It outputs to - // final_relative_cost, if non-NULL, the difference between the best - // forward-cost including the final-prob cost, and the best forward-cost - // without including the final-prob cost (this will usually be positive), or - // infinity if there were no final-probs. [c.f. FinalRelativeCost(), which - // outputs this quanitity]. 
It outputs to final_best_cost, if - // non-NULL, the lowest for any token t active on the final frame, of - // forward-cost[t] + final-cost[t], where final-cost[t] is the final-cost in - // the graph of the state corresponding to token t, or the best of - // forward-cost[t] if there were no final-probs active on the final frame. - // You cannot call this after FinalizeDecoding() has been called; in that - // case you should get the answer from class-member variables. - void ComputeFinalCosts(unordered_map *final_costs, - BaseFloat *final_relative_cost, - BaseFloat *final_best_cost) const; - - // PruneForwardLinksFinal is a version of PruneForwardLinks that we call - // on the final frame. If there are final tokens active, it uses - // the final-probs for pruning, otherwise it treats all tokens as final. - void PruneForwardLinksFinal(); - - // Prune away any tokens on this frame that have no forward links. - // [we don't do this in PruneForwardLinks because it would give us - // a problem with dangling pointers]. - // It's called by PruneActiveTokens if any forward links have been pruned - void PruneTokensForFrame(int32 frame_plus_one); - - // Go backwards through still-alive tokens, pruning them if the - // forward+backward cost is more than lat_beam away from the best path. It's - // possible to prove that this is "correct" in the sense that we won't lose - // anything outside of lat_beam, regardless of what happens in the future. - // delta controls when it considers a cost to have changed enough to continue - // going backward and propagating the change. larger delta -> will recurse - // less far. - void PruneActiveTokens(BaseFloat delta); - - /// Gets the weight cutoff. Also counts the active tokens. - BaseFloat GetCutoff(Elem *list_head, size_t *tok_count, - BaseFloat *adaptive_beam, Elem **best_elem); - - /// Processes emitting arcs for one frame. Propagates from prev_toks_ to - /// cur_toks_. Returns the cost cutoff for subsequent ProcessNonemitting() to - /// use. - BaseFloat ProcessEmitting(DecodableInterface *decodable); - - /// Processes nonemitting (epsilon) arcs for one frame. Called after - /// ProcessEmitting() on each frame. The cost cutoff is computed by the - /// preceding ProcessEmitting(). - void ProcessNonemitting(BaseFloat cost_cutoff); - - // HashList defined in ../util/hash-list.h. It actually allows us to maintain - // more than one list (e.g. for current and previous frames), but only one of - // them at a time can be indexed by StateId. It is indexed by frame-index - // plus one, where the frame-index is zero-based, as used in decodable object. - // That is, the emitting probs of frame t are accounted for in tokens at - // toks_[t+1]. The zeroth frame is for nonemitting transition at the start of - // the graph. - HashList toks_; - - std::vector active_toks_; // Lists of tokens, indexed by - // frame (members of TokenList are toks, must_prune_forward_links, - // must_prune_tokens). - std::vector - queue_; // temp variable used in ProcessNonemitting, - std::vector tmp_array_; // used in GetCutoff. - - // fst_ is a pointer to the FST we are decoding from. - const FST *fst_; - // delete_fst_ is true if the pointer fst_ needs to be deleted when this - // object is destroyed. - bool delete_fst_; - - std::vector cost_offsets_; // This contains, for each - // frame, an offset that was added to the acoustic log-likelihoods on that - // frame in order to keep everything in a nice dynamic range i.e. close to - // zero, to reduce roundoff errors. 
- LatticeFasterDecoderConfig config_; - int32 num_toks_; // current total #toks allocated... - bool warned_; - - /// decoding_finalized_ is true if someone called FinalizeDecoding(). [note, - /// calling this is optional]. If true, it's forbidden to decode more. Also, - /// if this is set, then the output of ComputeFinalCosts() is in the next - /// three variables. The reason we need to do this is that after - /// FinalizeDecoding() calls PruneTokensForFrame() for the final frame, some - /// of the tokens on the last frame are freed, so we free the list from toks_ - /// to avoid having dangling pointers hanging around. - bool decoding_finalized_; - /// For the meaning of the next 3 variables, see the comment for - /// decoding_finalized_ above., and ComputeFinalCosts(). - unordered_map final_costs_; - BaseFloat final_relative_cost_; - BaseFloat final_best_cost_; - - std::shared_ptr context_graph_ = nullptr; - - // There are various cleanup tasks... the toks_ structure contains - // singly linked lists of Token pointers, where Elem is the list type. - // It also indexes them in a hash, indexed by state (this hash is only - // maintained for the most recent frame). toks_.Clear() - // deletes them from the hash and returns the list of Elems. The - // function DeleteElems calls toks_.Delete(elem) for each elem in - // the list, which returns ownership of the Elem to the toks_ structure - // for reuse, but does not delete the Token pointer. The Token pointers - // are reference-counted and are ultimately deleted in PruneTokensForFrame, - // but are also linked together on each frame by their own linked-list, - // using the "next" pointer. We delete them manually. - void DeleteElems(Elem *list); - - // This function takes a singly linked list of tokens for a single frame, and - // outputs a list of them in topological order (it will crash if no such order - // can be found, which will typically be due to decoding graphs with epsilon - // cycles, which are not allowed). Note: the output list may contain NULLs, - // which the caller should pass over; it just happens to be more efficient for - // the algorithm to output a list that contains NULLs. - static void TopSortTokens(Token *tok_list, - std::vector *topsorted_list); - - void ClearActiveTokens(); - - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeFasterDecoderTpl); -}; - -typedef LatticeFasterDecoderTpl - LatticeFasterDecoder; - -} // end namespace kaldi. - -#endif // KALDI_DECODER_LATTICE_FASTER_DECODER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/decoder/lattice-faster-online-decoder.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/decoder/lattice-faster-online-decoder.cc deleted file mode 100644 index 2345b4d129ff905784762e973bad279f2fb55d31..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/decoder/lattice-faster-online-decoder.cc +++ /dev/null @@ -1,278 +0,0 @@ -// decoder/lattice-faster-online-decoder.cc - -// Copyright 2009-2012 Microsoft Corporation Mirko Hannemann -// 2013-2014 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2014 IMSL, PKU-HKUST (author: Wei Shi) -// 2018 Zhehuai Chen - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -// see note at the top of lattice-faster-decoder.cc, about how to maintain this -// file in sync with lattice-faster-decoder.cc - -#include -#include -#include -#include - -#include "decoder/lattice-faster-online-decoder.h" - -namespace kaldi { - -template -bool LatticeFasterOnlineDecoderTpl::TestGetBestPath( - bool use_final_probs) const { - Lattice lat1; - { - Lattice raw_lat; - this->GetRawLattice(&raw_lat, use_final_probs); - ShortestPath(raw_lat, &lat1); - } - Lattice lat2; - GetBestPath(&lat2, use_final_probs); - BaseFloat delta = 0.1; - int32 num_paths = 1; - if (!fst::RandEquivalent(lat1, lat2, num_paths, delta, rand())) { - KALDI_WARN << "Best-path test failed"; - return false; - } else { - return true; - } -} - -// Outputs an FST corresponding to the single best path through the lattice. -template -bool LatticeFasterOnlineDecoderTpl::GetBestPath( - Lattice *olat, bool use_final_probs) const { - olat->DeleteStates(); - BaseFloat final_graph_cost; - BestPathIterator iter = BestPathEnd(use_final_probs, &final_graph_cost); - if (iter.Done()) return false; // would have printed warning. - StateId state = olat->AddState(); - olat->SetFinal(state, LatticeWeight(final_graph_cost, 0.0)); - while (!iter.Done()) { - LatticeArc arc; - iter = TraceBackBestPath(iter, &arc); - arc.nextstate = state; - StateId new_state = olat->AddState(); - olat->AddArc(new_state, arc); - state = new_state; - } - olat->SetStart(state); - return true; -} - -template -typename LatticeFasterOnlineDecoderTpl::BestPathIterator -LatticeFasterOnlineDecoderTpl::BestPathEnd( - bool use_final_probs, BaseFloat *final_cost_out) const { - if (this->decoding_finalized_ && !use_final_probs) - KALDI_ERR << "You cannot call FinalizeDecoding() and then call " - << "BestPathEnd() with use_final_probs == false"; - KALDI_ASSERT(this->NumFramesDecoded() > 0 && - "You cannot call BestPathEnd if no frames were decoded."); - - unordered_map final_costs_local; - - const unordered_map &final_costs = - (this->decoding_finalized_ ? this->final_costs_ : final_costs_local); - if (!this->decoding_finalized_ && use_final_probs) - this->ComputeFinalCosts(&final_costs_local, NULL, NULL); - - // Singly linked list of tokens on last frame (access list through "next" - // pointer). - BaseFloat best_cost = std::numeric_limits::infinity(); - BaseFloat best_final_cost = 0; - Token *best_tok = NULL; - for (Token *tok = this->active_toks_.back().toks; tok != NULL; - tok = tok->next) { - BaseFloat cost = tok->tot_cost, final_cost = 0.0; - if (use_final_probs && !final_costs.empty()) { - // if we are instructed to use final-probs, and any final tokens were - // active on final frame, include the final-prob in the cost of the token. 
- typename unordered_map::const_iterator iter = - final_costs.find(tok); - if (iter != final_costs.end()) { - final_cost = iter->second; - cost += final_cost; - } else { - cost = std::numeric_limits::infinity(); - } - } - if (cost < best_cost) { - best_cost = cost; - best_tok = tok; - best_final_cost = final_cost; - } - } - if (best_tok == - NULL) { // this should not happen, and is likely a code error or - // caused by infinities in likelihoods, but I'm not making - // it a fatal error for now. - KALDI_WARN << "No final token found."; - } - if (final_cost_out) *final_cost_out = best_final_cost; - return BestPathIterator(best_tok, this->NumFramesDecoded() - 1); -} - -template -typename LatticeFasterOnlineDecoderTpl::BestPathIterator -LatticeFasterOnlineDecoderTpl::TraceBackBestPath(BestPathIterator iter, - LatticeArc *oarc) const { - KALDI_ASSERT(!iter.Done() && oarc != NULL); - Token *tok = static_cast(iter.tok); - int32 cur_t = iter.frame, step_t = 0; - if (tok->backpointer != NULL) { - // retrieve the correct forward link(with the best link cost) - BaseFloat best_cost = std::numeric_limits::infinity(); - ForwardLinkT *link; - for (link = tok->backpointer->links; link != NULL; link = link->next) { - if (link->next_tok == tok) { // this is a link to "tok" - BaseFloat graph_cost = link->graph_cost, - acoustic_cost = link->acoustic_cost; - BaseFloat cost = graph_cost + acoustic_cost; - if (cost < best_cost) { - oarc->ilabel = link->ilabel; - oarc->olabel = link->olabel; - if (link->ilabel != 0) { - KALDI_ASSERT(static_cast(cur_t) < - this->cost_offsets_.size()); - acoustic_cost -= this->cost_offsets_[cur_t]; - step_t = -1; - } else { - step_t = 0; - } - oarc->weight = LatticeWeight(graph_cost, acoustic_cost); - best_cost = cost; - } - } - } - if (link == NULL && - best_cost == - std::numeric_limits::infinity()) { // Did not find - // correct link. - KALDI_ERR << "Error tracing best-path back (likely " - << "bug in token-pruning algorithm)"; - } - } else { - oarc->ilabel = 0; - oarc->olabel = 0; - oarc->weight = LatticeWeight::One(); // zero costs. - } - return BestPathIterator(tok->backpointer, cur_t + step_t); -} - -template -bool LatticeFasterOnlineDecoderTpl::GetRawLatticePruned( - Lattice *ofst, bool use_final_probs, BaseFloat beam) const { - typedef LatticeArc Arc; - typedef Arc::StateId StateId; - typedef Arc::Weight Weight; - typedef Arc::Label Label; - - // Note: you can't use the old interface (Decode()) if you want to - // get the lattice with use_final_probs = false. You'd have to do - // InitDecoding() and then AdvanceDecoding(). - if (this->decoding_finalized_ && !use_final_probs) - KALDI_ERR << "You cannot call FinalizeDecoding() and then call " - << "GetRawLattice() with use_final_probs == false"; - - unordered_map final_costs_local; - - const unordered_map &final_costs = - (this->decoding_finalized_ ? this->final_costs_ : final_costs_local); - if (!this->decoding_finalized_ && use_final_probs) - this->ComputeFinalCosts(&final_costs_local, NULL, NULL); - - ofst->DeleteStates(); - // num-frames plus one (since frames are one-based, and we have - // an extra frame for the start-state). - int32 num_frames = this->active_toks_.size() - 1; - KALDI_ASSERT(num_frames > 0); - for (int32 f = 0; f <= num_frames; f++) { - if (this->active_toks_[f].toks == NULL) { - KALDI_WARN << "No tokens active on frame " << f - << ": not producing lattice.\n"; - return false; - } - } - unordered_map tok_map; - std::queue > tok_queue; - // First initialize the queue and states. 
Put the initial state on the queue; - // this is the last token in the list active_toks_[0].toks. - for (Token *tok = this->active_toks_[0].toks; tok != NULL; tok = tok->next) { - if (tok->next == NULL) { - tok_map[tok] = ofst->AddState(); - ofst->SetStart(tok_map[tok]); - std::pair tok_pair(tok, 0); // #frame = 0 - tok_queue.push(tok_pair); - } - } - - // Next create states for "good" tokens - while (!tok_queue.empty()) { - std::pair cur_tok_pair = tok_queue.front(); - tok_queue.pop(); - Token *cur_tok = cur_tok_pair.first; - int32 cur_frame = cur_tok_pair.second; - KALDI_ASSERT(cur_frame >= 0 && cur_frame <= this->cost_offsets_.size()); - - typename unordered_map::const_iterator iter = - tok_map.find(cur_tok); - KALDI_ASSERT(iter != tok_map.end()); - StateId cur_state = iter->second; - - for (ForwardLinkT *l = cur_tok->links; l != NULL; l = l->next) { - Token *next_tok = l->next_tok; - if (next_tok->extra_cost < beam) { - // so both the current and the next token are good; create the arc - int32 next_frame = l->ilabel == 0 ? cur_frame : cur_frame + 1; - StateId nextstate; - if (tok_map.find(next_tok) == tok_map.end()) { - nextstate = tok_map[next_tok] = ofst->AddState(); - tok_queue.push(std::pair(next_tok, next_frame)); - } else { - nextstate = tok_map[next_tok]; - } - BaseFloat cost_offset = - (l->ilabel != 0 ? this->cost_offsets_[cur_frame] : 0); - Arc arc(l->ilabel, l->olabel, - Weight(l->graph_cost, l->acoustic_cost - cost_offset), - nextstate); - ofst->AddArc(cur_state, arc); - } - } - if (cur_frame == num_frames) { - if (use_final_probs && !final_costs.empty()) { - typename unordered_map::const_iterator iter = - final_costs.find(cur_tok); - if (iter != final_costs.end()) - ofst->SetFinal(cur_state, LatticeWeight(iter->second, 0)); - } else { - ofst->SetFinal(cur_state, LatticeWeight::One()); - } - } - } - return (ofst->NumStates() != 0); -} - -// Instantiate the template for the FST types that we'll need. -template class LatticeFasterOnlineDecoderTpl >; -template class LatticeFasterOnlineDecoderTpl >; -template class LatticeFasterOnlineDecoderTpl >; - -} // end namespace kaldi. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/decoder/lattice-faster-online-decoder.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/decoder/lattice-faster-online-decoder.h deleted file mode 100644 index dc50cfa73e6574e9625eda9045c47f674fcbc1e3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/decoder/lattice-faster-online-decoder.h +++ /dev/null @@ -1,131 +0,0 @@ -// decoder/lattice-faster-online-decoder.h - -// Copyright 2009-2013 Microsoft Corporation; Mirko Hannemann; -// 2013-2014 Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen -// 2018 Zhehuai Chen - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. 
-// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -// see note at the top of lattice-faster-decoder.h, about how to maintain this -// file in sync with lattice-faster-decoder.h - -#ifndef KALDI_DECODER_LATTICE_FASTER_ONLINE_DECODER_H_ -#define KALDI_DECODER_LATTICE_FASTER_ONLINE_DECODER_H_ - -#include "decoder/lattice-faster-decoder.h" - -#include - -namespace kaldi { - -/** LatticeFasterOnlineDecoderTpl is as LatticeFasterDecoderTpl but also - supports an efficient way to get the best path (see the function - BestPathEnd()), which is useful in endpointing and in situations where you - might want to frequently access the best path. - - This is only templated on the FST type, since the Token type is required to - be BackpointerToken. Actually it only makes sense to instantiate - LatticeFasterDecoderTpl with Token == BackpointerToken if you do so - indirectly via this child class. - */ -template -class LatticeFasterOnlineDecoderTpl - : public LatticeFasterDecoderTpl { - public: - using Arc = typename FST::Arc; - using Label = typename Arc::Label; - using StateId = typename Arc::StateId; - using Weight = typename Arc::Weight; - using Token = decoder::BackpointerToken; - using ForwardLinkT = decoder::ForwardLink; - - // Instantiate this class once for each thing you have to decode. - // This version of the constructor does not take ownership of - // 'fst'. - LatticeFasterOnlineDecoderTpl( - const FST &fst, const LatticeFasterDecoderConfig &config, - const std::shared_ptr &context_graph) - : LatticeFasterDecoderTpl(fst, config, context_graph) {} - - // This version of the initializer takes ownership of 'fst', and will delete - // it when this object is destroyed. - LatticeFasterOnlineDecoderTpl(const LatticeFasterDecoderConfig &config, - FST *fst) - : LatticeFasterDecoderTpl(config, fst) {} - - struct BestPathIterator { - void *tok; - int32 frame; - // note, "frame" is the frame-index of the frame you'll get the - // transition-id for next time, if you call TraceBackBestPath on this - // iterator (assuming it's not an epsilon transition). Note that this - // is one less than you might reasonably expect, e.g. it's -1 for - // the nonemitting transitions before the first frame. - BestPathIterator(void *t, int32 f) : tok(t), frame(f) {} - bool Done() const { return tok == NULL; } - }; - - /// Outputs an FST corresponding to the single best path through the lattice. - /// This is quite efficient because it doesn't get the entire raw lattice and - /// find the best path through it; instead, it uses the BestPathEnd and - /// BestPathIterator so it basically traces it back through the lattice. - /// Returns true if result is nonempty (using the return status is deprecated, - /// it will become void). If "use_final_probs" is true AND we reached the - /// final-state of the graph then it will include those as final-probs, else - /// it will treat all final-probs as one. - bool GetBestPath(Lattice *ofst, bool use_final_probs = true) const; - - /// This function does a self-test of GetBestPath(). Returns true on - /// success; returns false and prints a warning on failure. - bool TestGetBestPath(bool use_final_probs = true) const; - - /// This function returns an iterator that can be used to trace back - /// the best path. 
If use_final_probs == true and at least one final state - /// survived till the end, it will use the final-probs in working out the best - /// final Token, and will output the final cost to *final_cost (if non-NULL), - /// else it will use only the forward likelihood, and will put zero in - /// *final_cost (if non-NULL). - /// Requires that NumFramesDecoded() > 0. - BestPathIterator BestPathEnd(bool use_final_probs, - BaseFloat *final_cost = NULL) const; - - /// This function can be used in conjunction with BestPathEnd() to trace back - /// the best path one link at a time (e.g. this can be useful in endpoint - /// detection). By "link" we mean a link in the graph; not all links cross - /// frame boundaries, but each time you see a nonzero ilabel you can interpret - /// that as a frame. The return value is the updated iterator. It outputs - /// the ilabel and olabel, and the (graph and acoustic) weight to the "arc" - /// pointer, while leaving its "nextstate" variable unchanged. - BestPathIterator TraceBackBestPath(BestPathIterator iter, - LatticeArc *arc) const; - - /// Behaves the same as GetRawLattice but only processes tokens whose - /// extra_cost is smaller than the best-cost plus the specified beam. - /// It is only worthwhile to call this function if beam is less than - /// the lattice_beam specified in the config; otherwise, it would - /// return essentially the same thing as GetRawLattice, but more slowly. - bool GetRawLatticePruned(Lattice *ofst, bool use_final_probs, - BaseFloat beam) const; - - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeFasterOnlineDecoderTpl); -}; - -typedef LatticeFasterOnlineDecoderTpl LatticeFasterOnlineDecoder; - -} // end namespace kaldi. - -#endif // KALDI_DECODER_LATTICE_FASTER_ONLINE_DECODER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/fstbin/fstaddselfloops.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/fstbin/fstaddselfloops.cc deleted file mode 100644 index 145bf006f2324136c5fea4a8d0012a7a4126c646..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/fstbin/fstaddselfloops.cc +++ /dev/null @@ -1,100 +0,0 @@ -// fstbin/fstaddselfloops.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/determinize-star.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/parse-options.h" -#include "util/simple-io-funcs.h" - -/* some test examples: - pushd ~/tmpdir - ( echo 3; echo 4) > in.list - ( echo 5; echo 6) > out.list - ( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstaddselfloops in.list out.list - | fstprint ( echo "0 1 0 1"; echo " 0 2 1 0"; echo "1 0"; echo "2 0"; ) | - fstcompile | fstaddselfloops in.list out.list | fstprint -*/ - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Adds self-loops to states of an FST to propagate disambiguation " - "symbols through it\n" - "They are added on each final state and each state with non-epsilon " - "output symbols\n" - "on at least one arc out of the state. Useful in conjunction with " - "predeterminize\n" - "\n" - "Usage: fstaddselfloops in-disambig-list out-disambig-list [in.fst " - "[out.fst] ]\n" - "E.g: fstaddselfloops in.list out.list < in.fst > withloops.fst\n" - "in.list and out.list are lists of integers, one per line, of the\n" - "same length.\n"; - - ParseOptions po(usage); - po.Read(argc, argv); - - if (po.NumArgs() < 2 || po.NumArgs() > 4) { - po.PrintUsage(); - exit(1); - } - - std::string disambig_in_rxfilename = po.GetArg(1), - disambig_out_rxfilename = po.GetArg(2), - fst_in_filename = po.GetOptArg(3), - fst_out_filename = po.GetOptArg(4); - - VectorFst *fst = ReadFstKaldi(fst_in_filename); - - std::vector disambig_in; - if (!ReadIntegerVectorSimple(disambig_in_rxfilename, &disambig_in)) - KALDI_ERR - << "fstaddselfloops: Could not read disambiguation symbols from " - << kaldi::PrintableRxfilename(disambig_in_rxfilename); - - std::vector disambig_out; - if (!ReadIntegerVectorSimple(disambig_out_rxfilename, &disambig_out)) - KALDI_ERR - << "fstaddselfloops: Could not read disambiguation symbols from " - << kaldi::PrintableRxfilename(disambig_out_rxfilename); - - if (disambig_in.size() != disambig_out.size()) - KALDI_ERR - << "fstaddselfloops: mismatch in size of disambiguation symbols"; - - AddSelfLoops(fst, disambig_in, disambig_out); - - WriteFstKaldi(*fst, fst_out_filename); - - delete fst; - - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/fstbin/fstdeterminizestar.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/fstbin/fstdeterminizestar.cc deleted file mode 100644 index e818143025c0fd5d389c28c77715d65711fe63f1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/fstbin/fstdeterminizestar.cc +++ /dev/null @@ -1,114 +0,0 @@ -// fstbin/fstdeterminizestar.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/determinize-star.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/parse-options.h" -#if !defined(_MSC_VER) && !defined(__APPLE__) -#include // Comment this line and the call to signal below if -// it causes compilation problems. It is only to enable a debugging procedure -// when determinization does not terminate. We are disabling this code if -// compiling on Windows because signal.h is not available there, and on -// MacOS due to a problem with in the initial release of Sierra. -#endif - -/* some test examples: - ( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint - ( echo "0 0 1 0"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint - ( echo "0 0 1 0"; echo "0 1 1 0"; echo "0 0" ) | fstcompile | - fstdeterminizestar | fstprint # this last one fails [correctly]: ( echo "0 0 0 - 1"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint - - cd ~/tmpdir - while true; do - fstrand > 1.fst - fstpredeterminize out.lst 1.fst | fstdeterminizestar | fstrmsymbols out.lst - > 2.fst fstequivalent --random=true 1.fst 2.fst || echo "Test failed" echo -n - "." done - - Test of debugging [with non-determinizable input]: - ( echo " 0 0 1 0 1.0"; echo "0 1 1 0"; echo "1 1 1 0 0"; echo "0 2 2 0"; echo - "2"; echo "1" ) | fstcompile | fstdeterminizestar kill -SIGUSR1 [the process-id - of fstdeterminizestar] # prints out a bunch of debugging output showing the - mess it got itself into. -*/ - -bool debug_location = false; -void signal_handler(int) { debug_location = true; } - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Removes epsilons and determinizes in one step\n" - "\n" - "Usage: fstdeterminizestar [in.fst [out.fst] ]\n" - "\n" - "See also: fstdeterminizelog, lattice-determinize\n"; - - float delta = kDelta; - int max_states = -1; - bool use_log = false; - ParseOptions po(usage); - po.Register("use-log", &use_log, "Determinize in log semiring."); - po.Register("delta", &delta, - "Delta value used to determine equivalence of weights."); - po.Register( - "max-states", &max_states, - "Maximum number of states in determinized FST before it will abort."); - po.Read(argc, argv); - - if (po.NumArgs() > 2) { - po.PrintUsage(); - exit(1); - } - - std::string fst_in_str = po.GetOptArg(1), fst_out_str = po.GetOptArg(2); - - // This enables us to get traceback info from determinization that is - // not seeming to terminate. -#if !defined(_MSC_VER) && !defined(__APPLE__) - signal(SIGUSR1, signal_handler); -#endif - // Normal case: just files. - VectorFst *fst = ReadFstKaldi(fst_in_str); - - ArcSort(fst, ILabelCompare()); // improves speed. 
- if (use_log) { - DeterminizeStarInLog(fst, delta, &debug_location, max_states); - } else { - VectorFst det_fst; - DeterminizeStar(*fst, &det_fst, delta, &debug_location, max_states); - *fst = det_fst; // will do shallow copy and then det_fst goes - // out of scope anyway. - } - WriteFstKaldi(*fst, fst_out_str); - delete fst; - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/fstbin/fstisstochastic.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/fstbin/fstisstochastic.cc deleted file mode 100644 index 468ed0daa7d37cb9a25cf25264f86e48e137b975..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/fstbin/fstisstochastic.cc +++ /dev/null @@ -1,91 +0,0 @@ -// fstbin/fstisstochastic.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/kaldi-io.h" -#include "util/parse-options.h" - -// e.g. of test: -// echo " 0 0" | fstcompile | fstisstochastic -// should return 0 and print "0 0" [meaning, min and -// max weight are one = exp(0)] -// echo " 0 1" | fstcompile | fstisstochastic -// should return 1, not stochastic, and print 1 1 -// (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo "1 0" ) | -// fstcompile | fstisstochastic should return 0, stochastic; it prints "0 -// -1.78e-07" for me (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo -// "1 0" ) | fstcompile | fstisstochastic --test-in-log=false should return 1, -// not stochastic in tropical; it prints "0 0.693147" for me (echo "0 0 0 0 0 "; -// echo "0 1 0 0 0 "; echo "1 0" ) | fstcompile | fstisstochastic -// --test-in-log=false should return 0, stochastic in tropical; it prints "0 0" -// for me (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo "1 0" ) | -// fstcompile | fstisstochastic --test-in-log=false --delta=1 returns 0 even -// though not stochastic because we gave it an absurdly large delta. 
- -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Checks whether an FST is stochastic and exits with success if so.\n" - "Prints out maximum error (in log units).\n" - "\n" - "Usage: fstisstochastic [ in.fst ]\n"; - - float delta = 0.01; - bool test_in_log = true; - - ParseOptions po(usage); - po.Register("delta", &delta, "Maximum error to accept."); - po.Register("test-in-log", &test_in_log, - "Test stochasticity in log semiring."); - po.Read(argc, argv); - - if (po.NumArgs() > 1) { - po.PrintUsage(); - exit(1); - } - - std::string fst_in_filename = po.GetOptArg(1); - - Fst *fst = ReadFstKaldiGeneric(fst_in_filename); - - bool ans; - StdArc::Weight min, max; - if (test_in_log) - ans = IsStochasticFstInLog(*fst, delta, &min, &max); - else - ans = IsStochasticFst(*fst, delta, &min, &max); - - std::cout << min.Value() << " " << max.Value() << '\n'; - delete fst; - if (ans) - return 0; // success; - else - return 1; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/fstbin/fstminimizeencoded.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/fstbin/fstminimizeencoded.cc deleted file mode 100644 index ae9ca6d75abe67d9a195572dd6d91ec3c7b44851..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/fstbin/fstminimizeencoded.cc +++ /dev/null @@ -1,74 +0,0 @@ -// fstbin/fstminimizeencoded.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/determinize-star.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "util/kaldi-io.h" -#include "util/parse-options.h" -#include "util/text-utils.h" - -/* some test examples: - ( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstminimizeencoded | fstprint - ( echo "0 1 0 0"; echo " 0 2 0 0"; echo "1 0"; echo "2 0"; ) | fstcompile | - fstminimizeencoded | fstprint -*/ - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - - const char *usage = - "Minimizes FST after encoding [similar to fstminimize, but no " - "weight-pushing]\n" - "\n" - "Usage: fstminimizeencoded [in.fst [out.fst] ]\n"; - - float delta = kDelta; - ParseOptions po(usage); - po.Register("delta", &delta, - "Delta likelihood used for quantization of weights"); - po.Read(argc, argv); - - if (po.NumArgs() > 2) { - po.PrintUsage(); - exit(1); - } - - std::string fst_in_filename = po.GetOptArg(1), - fst_out_filename = po.GetOptArg(2); - - VectorFst *fst = ReadFstKaldi(fst_in_filename); - - MinimizeEncoded(fst, delta); - - WriteFstKaldi(*fst, fst_out_filename); - - delete fst; - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } - return 0; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/fstbin/fsttablecompose.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/fstbin/fsttablecompose.cc deleted file mode 100644 index bdd476da78b8cb8823c60abf33b5278e05bfd92c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/fstbin/fsttablecompose.cc +++ /dev/null @@ -1,133 +0,0 @@ -// fstbin/fsttablecompose.cc - -// Copyright 2009-2011 Microsoft Corporation -// 2013 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "fst/fstlib.h" -#include "fstext/fstext-utils.h" -#include "fstext/kaldi-fst-io.h" -#include "fstext/table-matcher.h" -#include "util/parse-options.h" - -/* - cd ~/tmpdir - while true; do - fstrand | fstarcsort --sort_type=olabel > 1.fst; fstrand | fstarcsort - > 2.fst fstcompose 1.fst 2.fst > 3a.fst fsttablecompose 1.fst 2.fst > 3b.fst - fstequivalent --random=true 3a.fst 3b.fst || echo "Test failed" - echo -n "." - done - -*/ - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; // NOLINT - using namespace fst; // NOLINT - using kaldi::int32; - /* - fsttablecompose should always give equivalent results to compose, - but it is more efficient for certain kinds of inputs. 
- In particular, it is useful when, say, the left FST has states - that typically either have epsilon olabels, or - one transition out for each of the possible symbols (as the - olabel). The same with the input symbols of the right-hand FST - is possible. - */ - - const char *usage = - "Composition algorithm [between two FSTs of standard type, in " - "tropical\n" - "semiring] that is more efficient for certain cases-- in particular,\n" - "where one of the FSTs (the left one, if --match-side=left) has large\n" - "out-degree\n" - "\n" - "Usage: fsttablecompose (fst1-rxfilename|fst1-rspecifier) " - "(fst2-rxfilename|fst2-rspecifier) [(out-rxfilename|out-rspecifier)]\n"; - - ParseOptions po(usage); - - TableComposeOptions opts; - std::string match_side = "left"; - std::string compose_filter = "sequence"; - - po.Register("connect", &opts.connect, "If true, trim FST before output."); - po.Register("match-side", &match_side, - "Side of composition to do table " - "match, one of: \"left\" or \"right\"."); - po.Register("compose-filter", &compose_filter, - "Composition filter to use, " - "one of: \"alt_sequence\", \"auto\", \"match\", \"sequence\""); - - po.Read(argc, argv); - - if (match_side == "left") { - opts.table_match_type = MATCH_OUTPUT; - } else if (match_side == "right") { - opts.table_match_type = MATCH_INPUT; - } else { - KALDI_ERR << "Invalid match-side option: " << match_side; - } - - if (compose_filter == "alt_sequence") { - opts.filter_type = ALT_SEQUENCE_FILTER; - } else if (compose_filter == "auto") { - opts.filter_type = AUTO_FILTER; - } else if (compose_filter == "match") { - opts.filter_type = MATCH_FILTER; - } else if (compose_filter == "sequence") { - opts.filter_type = SEQUENCE_FILTER; - } else { - KALDI_ERR << "Invalid compose-filter option: " << compose_filter; - } - - if (po.NumArgs() < 2 || po.NumArgs() > 3) { - po.PrintUsage(); - exit(1); - } - - std::string fst1_in_str = po.GetArg(1), fst2_in_str = po.GetArg(2), - fst_out_str = po.GetOptArg(3); - - VectorFst *fst1 = ReadFstKaldi(fst1_in_str); - - VectorFst *fst2 = ReadFstKaldi(fst2_in_str); - - // Checks if is olabel sorted and is ilabel sorted. - if (fst1->Properties(fst::kOLabelSorted, true) == 0) { - KALDI_WARN << "The first FST is not olabel sorted."; - } - if (fst2->Properties(fst::kILabelSorted, true) == 0) { - KALDI_WARN << "The second FST is not ilabel sorted."; - } - - VectorFst composed_fst; - - TableCompose(*fst1, *fst2, &composed_fst, opts); - - delete fst1; - delete fst2; - - WriteFstKaldi(composed_fst, fst_out_str); - return 0; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/fstext/determinize-lattice-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/fstext/determinize-lattice-inl.h deleted file mode 100644 index 0bfbc8f41c7e439b1fac037f60490e04fdcbdd8b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/fstext/determinize-lattice-inl.h +++ /dev/null @@ -1,1357 +0,0 @@ -// fstext/determinize-lattice-inl.h - -// Copyright 2009-2012 Microsoft Corporation -// 2012-2013 Johns Hopkins University (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_FSTEXT_DETERMINIZE_LATTICE_INL_H_ -#define KALDI_FSTEXT_DETERMINIZE_LATTICE_INL_H_ -// Do not include this file directly. It is included by determinize-lattice.h - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace fst { - -// This class maps back and forth from/to integer id's to sequences of strings. -// used in determinization algorithm. It is constructed in such a way that -// finding the string-id of the successor of (string, next-label) has constant -// time. - -// Note: class IntType, typically int32, is the type of the element in the -// string (typically a template argument of the CompactLatticeWeightTpl). - -template -class LatticeStringRepository { - public: - struct Entry { - const Entry *parent; // NULL for empty string. - IntType i; - inline bool operator==(const Entry &other) const { - return (parent == other.parent && i == other.i); - } - Entry() {} - Entry(const Entry &e) : parent(e.parent), i(e.i) {} - }; - // Note: all Entry* pointers returned in function calls are - // owned by the repository itself, not by the caller! - - // Interface guarantees empty string is NULL. - inline const Entry *EmptyString() { return NULL; } - - // Returns string of "parent" with i appended. Pointer - // owned by repository - const Entry *Successor(const Entry *parent, IntType i) { - new_entry_->parent = parent; - new_entry_->i = i; - - std::pair pr = set_.insert(new_entry_); - if (pr.second) { // Was successfully inserted (was not there). We need to - // replace the element we inserted, which resides on the - // stack, with one from the heap. - const Entry *ans = new_entry_; - new_entry_ = new Entry(); - return ans; - } else { // Was not inserted because an equivalent Entry already - // existed. - return *pr.first; - } - } - - const Entry *Concatenate(const Entry *a, const Entry *b) { - if (a == NULL) - return b; - else if (b == NULL) - return a; - std::vector v; - ConvertToVector(b, &v); - const Entry *ans = a; - for (size_t i = 0; i < v.size(); i++) ans = Successor(ans, v[i]); - return ans; - } - const Entry *CommonPrefix(const Entry *a, const Entry *b) { - std::vector a_vec, b_vec; - ConvertToVector(a, &a_vec); - ConvertToVector(b, &b_vec); - const Entry *ans = NULL; - for (size_t i = 0; - i < a_vec.size() && i < b_vec.size() && a_vec[i] == b_vec[i]; i++) - ans = Successor(ans, a_vec[i]); - return ans; - } - - // removes any elements from b that are not part of - // a common prefix with a. - void ReduceToCommonPrefix(const Entry *a, std::vector *b) { - size_t a_size = Size(a), b_size = b->size(); - while (a_size > b_size) { - a = a->parent; - a_size--; - } - if (b_size > a_size) b_size = a_size; - typename std::vector::iterator b_begin = b->begin(); - while (a_size != 0) { - if (a->i != *(b_begin + a_size - 1)) b_size = a_size - 1; - a = a->parent; - a_size--; - } - if (b_size != b->size()) b->resize(b_size); - } - - // removes the first n elements of a. 
- const Entry *RemovePrefix(const Entry *a, size_t n) { - if (n == 0) return a; - std::vector a_vec; - ConvertToVector(a, &a_vec); - assert(a_vec.size() >= n); - const Entry *ans = NULL; - for (size_t i = n; i < a_vec.size(); i++) ans = Successor(ans, a_vec[i]); - return ans; - } - - // Returns true if a is a prefix of b. If a is prefix of b, - // time taken is |b| - |a|. Else, time taken is |b|. - bool IsPrefixOf(const Entry *a, const Entry *b) const { - if (a == NULL) return true; // empty string prefix of all. - if (a == b) return true; - if (b == NULL) return false; - return IsPrefixOf(a, b->parent); - } - - inline size_t Size(const Entry *entry) const { - size_t ans = 0; - while (entry != NULL) { - ans++; - entry = entry->parent; - } - return ans; - } - - void ConvertToVector(const Entry *entry, std::vector *out) const { - size_t length = Size(entry); - out->resize(length); - if (entry != NULL) { - typename std::vector::reverse_iterator iter = out->rbegin(); - while (entry != NULL) { - *iter = entry->i; - entry = entry->parent; - ++iter; - } - } - } - - const Entry *ConvertFromVector(const std::vector &vec) { - const Entry *e = NULL; - for (size_t i = 0; i < vec.size(); i++) e = Successor(e, vec[i]); - return e; - } - - LatticeStringRepository() { new_entry_ = new Entry; } - - void Destroy() { - for (typename SetType::iterator iter = set_.begin(); iter != set_.end(); - ++iter) - delete *iter; - SetType tmp; - tmp.swap(set_); - if (new_entry_) { - delete new_entry_; - new_entry_ = NULL; - } - } - - // Rebuild will rebuild this object, guaranteeing only - // to preserve the Entry values that are in the vector pointed - // to (this list does not have to be unique). The point of - // this is to save memory. - void Rebuild(const std::vector &to_keep) { - SetType tmp_set; - for (typename std::vector::const_iterator iter = - to_keep.begin(); - iter != to_keep.end(); ++iter) - RebuildHelper(*iter, &tmp_set); - // Now delete all elems not in tmp_set. - for (typename SetType::iterator iter = set_.begin(); iter != set_.end(); - ++iter) { - if (tmp_set.count(*iter) == 0) - delete (*iter); // delete the Entry; not needed. - } - set_.swap(tmp_set); - } - - ~LatticeStringRepository() { Destroy(); } - int32 MemSize() const { - return set_.size() * sizeof(Entry) * 2; // this is a lower bound - // on the size this structure might take. - } - - private: - class EntryKey { // Hash function object. - public: - inline size_t operator()(const Entry *entry) const { - size_t prime = 49109; - return static_cast(entry->i) + - prime * reinterpret_cast(entry->parent); - } - }; - class EntryEqual { - public: - inline bool operator()(const Entry *e1, const Entry *e2) const { - return (*e1 == *e2); - } - }; - typedef std::unordered_set SetType; - - void RebuildHelper(const Entry *to_add, SetType *tmp_set) { - while (true) { - if (to_add == NULL) return; - typename SetType::iterator iter = tmp_set->find(to_add); - if (iter == tmp_set->end()) { // not in tmp_set. - tmp_set->insert(to_add); - to_add = to_add->parent; // and loop. - } else { - return; - } - } - } - - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeStringRepository); - Entry *new_entry_; // We always have a pre-allocated Entry ready to use, - // to avoid unnecessary news and deletes. - SetType set_; -}; - -// class LatticeDeterminizer is templated on the same types that -// CompactLatticeWeight is templated on: the base weight (Weight), typically -// LatticeWeightTpl etc. but could also be e.g. 
TropicalWeight, and the -// IntType, typically int32, used for the output symbols in the compact -// representation of strings [note: the output symbols would usually be -// p.d.f. id's in the anticipated use of this code] It has a special requirement -// on the Weight type: that there should be a Compare function on the weights -// such that Compare(w1, w2) returns -1 if w1 < w2, 0 if w1 == w2, and +1 if w1 -// > w2. This requires that there be a total order on the weights. - -template -class LatticeDeterminizer { - public: - // Output to Gallic acceptor (so the strings go on weights, and there is a 1-1 - // correspondence between our states and the states in ofst. If destroy == - // true, release memory as we go (but we cannot output again). - - typedef CompactLatticeWeightTpl CompactWeight; - typedef ArcTpl - CompactArc; // arc in compact, acceptor form of lattice - typedef ArcTpl Arc; // arc in non-compact version of lattice - - // Output to standard FST with CompactWeightTpl as its weight type - // (the weight stores the original output-symbol strings). If destroy == - // true, release memory as we go (but we cannot output again). - void Output(MutableFst *ofst, bool destroy = true) { - assert(determinized_); - typedef typename Arc::StateId StateId; - StateId nStates = static_cast(output_arcs_.size()); - if (destroy) FreeMostMemory(); - ofst->DeleteStates(); - ofst->SetStart(kNoStateId); - if (nStates == 0) { - return; - } - for (StateId s = 0; s < nStates; s++) { - OutputStateId news = ofst->AddState(); - assert(news == s); - } - ofst->SetStart(0); - // now process transitions. - for (StateId this_state = 0; this_state < nStates; this_state++) { - std::vector &this_vec(output_arcs_[this_state]); - typename std::vector::const_iterator iter = this_vec.begin(), - end = this_vec.end(); - - for (; iter != end; ++iter) { - const TempArc &temp_arc(*iter); - CompactArc new_arc; - std::vector is not treated as epsilon, create a common end state for - // all transitions accepting the , since they do not back off. This small - // optimization saves about 2% states in an average grammar. - if (sub_eps_ == 0) { - eos_state_ = fst_->AddState(); - fst_->SetFinal(eos_state_, 0); - } -} - -template -void ArpaLmCompilerImpl::ConsumeNGram(const NGram& ngram, - bool is_highest) { - // Generally, we do the following. Suppose we are adding an n-gram "A B - // C". Then find the node for "A B", add a new node for "A B C", and connect - // them with the arc accepting "C" with the specified weight. Also, add a - // backoff arc from the new "A B C" node to its backoff state "B C". - // - // Two notable exceptions are the highest order n-grams, and final n-grams. - // - // When adding a highest order n-gram (e. g., our "A B C" is in a 3-gram LM), - // the following optimization is performed. There is no point adding a node - // for "A B C" with a "C" arc from "A B", since there will be no other - // arcs ingoing to this node, and an epsilon backoff arc into the backoff - // model "B C", with the weight of \bar{1}. To save a node, create an arc - // accepting "C" directly from "A B" to "B C". This saves as many nodes - // as there are the highest order n-grams, which is typically about half - // the size of a large 3-gram model. - // - // Indeed, this does not apply to n-grams ending in EOS, since they do not - // back off. These are special, as they do not have a back-off state, and - // the node for "(..anything..) " is always final. 
These are handled - // in one of the two possible ways, If symbols and are being - // replaced by epsilons, neither node nor arc is created, and the logprob - // of the n-gram is applied to its source node as final weight. If and - // are preserved, then a special final node for is allocated and - // used as the destination of the "" acceptor arc. - HistKey heads(ngram.words.begin(), ngram.words.end() - 1); - typename HistoryMap::iterator source_it = history_.find(heads); - if (source_it == history_.end()) { - // There was no "A B", therefore the probability of "A B C" is zero. - // Print a warning and discard current n-gram. - if (parent_->ShouldWarn()) - KALDI_WARN << parent_->LineReference() - << " skipped: no parent (n-1)-gram exists"; - return; - } - - StateId source = source_it->second; - StateId dest; - Symbol sym = ngram.words.back(); - float weight = -ngram.logprob; - if (sym == sub_eps_ || sym == 0) { - KALDI_ERR << " or disambiguation symbol " << sym - << "found in the ARPA file. "; - } - if (sym == eos_symbol_) { - if (sub_eps_ == 0) { - // Keep as a real symbol when not substituting. - dest = eos_state_; - } else { - // Treat as if it was epsilon: mark source final, with the weight - // of the n-gram. - fst_->SetFinal(source, weight); - return; - } - } else { - // For the highest order n-gram, this may find an existing state, for - // non-highest, will create one (unless there are duplicate n-grams - // in the grammar, which cannot be reliably detected if highest order, - // so we better do not do that at all). - dest = AddStateWithBackoff( - HistKey(ngram.words.begin() + (is_highest ? 1 : 0), ngram.words.end()), - -ngram.backoff); - } - - if (sym == bos_symbol_) { - weight = 0; // Accepting is always free. - if (sub_eps_ == 0) { - // is as a real symbol, only accepted in the start state. - source = fst_->AddState(); - fst_->SetStart(source); - } else { - // The new state for unigram history *is* the start state. - fst_->SetStart(dest); - return; - } - } - - // Add arc from source to dest, whichever way it was found. - fst_->AddArc(source, fst::StdArc(sym, sym, weight, dest)); - return; -} - -// Find or create a new state for n-gram defined by key, and ensure it has a -// backoff transition. The key is either the current n-gram for all but -// highest orders, or the tails of the n-gram for the highest order. The -// latter arises from the chain-collapsing optimization described above. -template -StateId ArpaLmCompilerImpl::AddStateWithBackoff(HistKey key, - float backoff) { - typename HistoryMap::iterator dest_it = history_.find(key); - if (dest_it != history_.end()) { - // Found an existing state in the history map. Invariant: if the state in - // the map, then its backoff arc is in the FST. We are done. - return dest_it->second; - } - // Otherwise create a new state and its backoff arc, and register in the map. - StateId dest = fst_->AddState(); - history_[key] = dest; - CreateBackoff(key.Tails(), dest, backoff); - return dest; -} - -// Create a backoff arc for a state. Key is a backoff destination that may or -// may not exist. When the destination is not found, naturally fall back to -// the lower order model, and all the way down until one is found (since the -// 0-gram model is always present, the search is guaranteed to terminate). 
-template -inline void ArpaLmCompilerImpl::CreateBackoff(HistKey key, - StateId state, - float weight) { - typename HistoryMap::iterator dest_it = history_.find(key); - while (dest_it == history_.end()) { - key = key.Tails(); - dest_it = history_.find(key); - } - - // The arc should transduce either or #0 to , depending on the - // epsilon substitution mode. This is the only case when input and output - // label may differ. - fst_->AddArc(state, fst::StdArc(sub_eps_, 0, weight, dest_it->second)); -} - -ArpaLmCompiler::~ArpaLmCompiler() { - if (impl_ != NULL) delete impl_; -} - -void ArpaLmCompiler::HeaderAvailable() { - KALDI_ASSERT(impl_ == NULL); - // Use optimized implementation if the grammar is 4-gram or less, and the - // maximum attained symbol id will fit into the optimized range. - int64 max_symbol = 0; - if (Symbols() != NULL) max_symbol = Symbols()->AvailableKey() - 1; - // If augmenting the symbol table, assume the worst case when all words in - // the model being read are novel. - if (Options().oov_handling == ArpaParseOptions::kAddToSymbols) - max_symbol += NgramCounts()[0]; - - if (NgramCounts().size() <= 4 && max_symbol < OptimizedHistKey::kMaxData) { - impl_ = new ArpaLmCompilerImpl(this, &fst_, sub_eps_); - } else { - impl_ = new ArpaLmCompilerImpl(this, &fst_, sub_eps_); - KALDI_LOG << "Reverting to slower state tracking because model is large: " - << NgramCounts().size() << "-gram with symbols up to " - << max_symbol; - } -} - -void ArpaLmCompiler::ConsumeNGram(const NGram& ngram) { - // is invalid in tails, in heads of an n-gram. - for (int i = 0; i < ngram.words.size(); ++i) { - if ((i > 0 && ngram.words[i] == Options().bos_symbol) || - (i + 1 < ngram.words.size() && - ngram.words[i] == Options().eos_symbol)) { - if (ShouldWarn()) - KALDI_WARN << LineReference() - << " skipped: n-gram has invalid BOS/EOS placement"; - return; - } - } - - bool is_highest = ngram.words.size() == NgramCounts().size(); - impl_->ConsumeNGram(ngram, is_highest); -} - -void ArpaLmCompiler::RemoveRedundantStates() { - fst::StdArc::Label backoff_symbol = sub_eps_; - if (backoff_symbol == 0) { - // The method of removing redundant states implemented in this function - // leads to slow determinization of L o G when people use the older style of - // usage of arpa2fst where the --disambig-symbol option was not specified. - // The issue seems to be that it creates a non-deterministic FST, while G is - // supposed to be deterministic. By 'return'ing below, we just disable this - // method if people were using an older script. This method isn't really - // that consequential anyway, and people will move to the newer-style - // scripts (see current utils/format_lm.sh), so this isn't much of a - // problem. - return; - } - - fst::StdArc::StateId num_states = fst_.NumStates(); - - // replace the #0 symbols on the input of arcs out of redundant states (states - // that are not final and have only a backoff arc leaving them), with . 
- for (fst::StdArc::StateId state = 0; state < num_states; state++) { - if (fst_.NumArcs(state) == 1 && - fst_.Final(state) == fst::TropicalWeight::Zero()) { - fst::MutableArcIterator iter(&fst_, state); - fst::StdArc arc = iter.Value(); - if (arc.ilabel == backoff_symbol) { - arc.ilabel = 0; - iter.SetValue(arc); - } - } - } - - // we could call fst::RemoveEps, and it would have the same effect in normal - // cases, where backoff_symbol != 0 and there are no epsilons in unexpected - // places, but RemoveEpsLocal is a bit safer in case something weird is going - // on; it guarantees not to blow up the FST. - fst::RemoveEpsLocal(&fst_); - KALDI_LOG << "Reduced num-states from " << num_states << " to " - << fst_.NumStates(); -} - -void ArpaLmCompiler::Check() const { - if (fst_.Start() == fst::kNoStateId) { - KALDI_ERR << "Arpa file did not contain the beginning-of-sentence symbol " - << Symbols()->Find(Options().bos_symbol) << "."; - } -} - -void ArpaLmCompiler::ReadComplete() { - fst_.SetInputSymbols(Symbols()); - fst_.SetOutputSymbols(Symbols()); - RemoveRedundantStates(); - Check(); -} - -} // namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/lm/arpa-lm-compiler.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/lm/arpa-lm-compiler.h deleted file mode 100644 index 069c71bd0e6f5acf0b9521ec1ef46796eb31fe4d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/lm/arpa-lm-compiler.h +++ /dev/null @@ -1,63 +0,0 @@ -// lm/arpa-lm-compiler.h - -// Copyright 2009-2011 Gilles Boulianne -// Copyright 2016 Smart Action LLC (kkm) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_LM_ARPA_LM_COMPILER_H_ -#define KALDI_LM_ARPA_LM_COMPILER_H_ - -#include - -#include "lm/arpa-file-parser.h" - -namespace kaldi { - -class ArpaLmCompilerImplInterface; - -class ArpaLmCompiler : public ArpaFileParser { - public: - ArpaLmCompiler(const ArpaParseOptions& options, int sub_eps, - fst::SymbolTable* symbols) - : ArpaFileParser(options, symbols), sub_eps_(sub_eps), impl_(NULL) {} - ~ArpaLmCompiler(); - - const fst::StdVectorFst& Fst() const { return fst_; } - fst::StdVectorFst* MutableFst() { return &fst_; } - - protected: - // ArpaFileParser overrides. - virtual void HeaderAvailable(); - virtual void ConsumeNGram(const NGram& ngram); - virtual void ReadComplete(); - - private: - // this function removes states that only have a backoff arc coming - // out of them. - void RemoveRedundantStates(); - void Check() const; - - int sub_eps_; - ArpaLmCompilerImplInterface* impl_; // Owned. 
- fst::StdVectorFst fst_; - template - friend class ArpaLmCompilerImpl; -}; - -} // namespace kaldi - -#endif // KALDI_LM_ARPA_LM_COMPILER_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/lmbin/arpa2fst.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/lmbin/arpa2fst.cc deleted file mode 100644 index 881a45c5b37810247ea38dae56237f59b5554a9c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/lmbin/arpa2fst.cc +++ /dev/null @@ -1,145 +0,0 @@ -// bin/arpa2fst.cc -// -// Copyright 2009-2011 Gilles Boulianne. -// -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABILITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "lm/arpa-lm-compiler.h" -#include "util/kaldi-io.h" -#include "util/parse-options.h" - -int main(int argc, char *argv[]) { - using namespace kaldi; // NOLINT - try { - const char *usage = - "Convert an ARPA format language model into an FST\n" - "Usage: arpa2fst [opts] \n" - " e.g.: arpa2fst --disambig-symbol=#0 --read-symbol-table=" - "data/lang/words.txt lm/input.arpa G.fst\n\n" - "Note: When called without switches, the output G.fst will contain\n" - "an embedded symbol table. This is compatible with the way a previous\n" - "version of arpa2fst worked.\n"; - - ParseOptions po(usage); - - ArpaParseOptions options; - options.Register(&po); - - // Option flags. - std::string bos_symbol = ""; - std::string eos_symbol = ""; - std::string disambig_symbol; - std::string read_syms_filename; - std::string write_syms_filename; - bool keep_symbols = false; - bool ilabel_sort = true; - - po.Register("bos-symbol", &bos_symbol, "Beginning of sentence symbol"); - po.Register("eos-symbol", &eos_symbol, "End of sentence symbol"); - po.Register("disambig-symbol", &disambig_symbol, - "Disambiguator. If provided (e. g. #0), used on input side of " - "backoff links, and and are replaced with epsilons"); - po.Register("read-symbol-table", &read_syms_filename, - "Use existing symbol table"); - po.Register("write-symbol-table", &write_syms_filename, - "Write generated symbol table to a file"); - po.Register("keep-symbols", &keep_symbols, - "Store symbol table with FST. Symbols always saved to FST if " - "symbol tables are neither read or written (otherwise symbols " - "would be lost entirely)"); - po.Register("ilabel-sort", &ilabel_sort, "Ilabel-sort the output FST"); - - po.Read(argc, argv); - - if (po.NumArgs() != 1 && po.NumArgs() != 2) { - po.PrintUsage(); - exit(1); - } - std::string arpa_rxfilename = po.GetArg(1), - fst_wxfilename = po.GetOptArg(2); - - int64 disambig_symbol_id = 0; - - fst::SymbolTable *symbols; - if (!read_syms_filename.empty()) { - // Use existing symbols. Required symbols must be in the table. 
- kaldi::Input kisym(read_syms_filename); - symbols = fst::SymbolTable::ReadText( - kisym.Stream(), PrintableWxfilename(read_syms_filename)); - if (symbols == NULL) - KALDI_ERR << "Could not read symbol table from file " - << read_syms_filename; - - options.oov_handling = ArpaParseOptions::kSkipNGram; - if (!disambig_symbol.empty()) { - disambig_symbol_id = symbols->Find(disambig_symbol); - if (disambig_symbol_id == -1) // fst::kNoSymbol - KALDI_ERR << "Symbol table " << read_syms_filename - << " has no symbol for " << disambig_symbol; - } - } else { - // Create a new symbol table and populate it from ARPA file. - symbols = new fst::SymbolTable(PrintableWxfilename(fst_wxfilename)); - options.oov_handling = ArpaParseOptions::kAddToSymbols; - symbols->AddSymbol("", 0); - if (!disambig_symbol.empty()) { - disambig_symbol_id = symbols->AddSymbol(disambig_symbol); - } - } - - // Add or use existing BOS and EOS. - options.bos_symbol = symbols->AddSymbol(bos_symbol); - options.eos_symbol = symbols->AddSymbol(eos_symbol); - - // If producing new (not reading existing) symbols and not saving them, - // need to keep symbols with FST, otherwise they would be lost. - if (read_syms_filename.empty() && write_syms_filename.empty()) - keep_symbols = true; - - // Actually compile LM. - KALDI_ASSERT(symbols != NULL); - ArpaLmCompiler lm_compiler(options, disambig_symbol_id, symbols); - { - Input ki(arpa_rxfilename); - lm_compiler.Read(ki.Stream()); - } - - // Sort the FST in-place if requested by options. - if (ilabel_sort) { - fst::ArcSort(lm_compiler.MutableFst(), fst::StdILabelCompare()); - } - - // Write symbols if requested. - if (!write_syms_filename.empty()) { - kaldi::Output kosym(write_syms_filename, false); - symbols->WriteText(kosym.Stream()); - } - - // Write LM FST. - bool write_binary = true, write_header = false; - kaldi::Output kofst(fst_wxfilename, write_binary, write_header); - fst::FstWriteOptions wopts(PrintableWxfilename(fst_wxfilename)); - wopts.write_isymbols = wopts.write_osymbols = keep_symbols; - lm_compiler.Fst().Write(kofst.Stream(), wopts); - - delete symbols; - } catch (const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/basic-filebuf.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/basic-filebuf.h deleted file mode 100644 index 22ec891064d5955c8b1d255e0d34781a9f505a38..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/basic-filebuf.h +++ /dev/null @@ -1,952 +0,0 @@ -/////////////////////////////////////////////////////////////////////////////// -// This is a modified version of the std::basic_filebuf from libc++ -// Copyright 20XX LLVM -// (http://libcxx.llvm.org/). -// It allows one to create basic_filebuf from an existing FILE* handle or file -// descriptor. -// -// This file is dual licensed under the MIT and the University of Illinois Open -// Source License licenses. See LICENSE.TXT for details (included at the -// bottom). 
-/////////////////////////////////////////////////////////////////////////////// -#ifndef KALDI_UTIL_BASIC_FILEBUF_H_ -#define KALDI_UTIL_BASIC_FILEBUF_H_ - -/////////////////////////////////////////////////////////////////////////////// -#include -#include -#include -#include -#include -#include -#include - -/////////////////////////////////////////////////////////////////////////////// -namespace kaldi { -/////////////////////////////////////////////////////////////////////////////// -template > -class basic_filebuf : public std::basic_streambuf { - public: - typedef CharT char_type; - typedef Traits traits_type; - typedef typename traits_type::int_type int_type; - typedef typename traits_type::pos_type pos_type; - typedef typename traits_type::off_type off_type; - typedef typename traits_type::state_type state_type; - - basic_filebuf(); - basic_filebuf(basic_filebuf&& rhs); - virtual ~basic_filebuf(); - - basic_filebuf& operator=(basic_filebuf&& rhs); - void swap(basic_filebuf& rhs); - - bool is_open() const; - basic_filebuf* open(const char* s, std::ios_base::openmode mode); - basic_filebuf* open(const std::string& s, std::ios_base::openmode mode); - basic_filebuf* open(int fd, std::ios_base::openmode mode); - basic_filebuf* open(FILE* f, std::ios_base::openmode mode); - basic_filebuf* close(); - - FILE* file() { return this->_M_file; } - int fd() { return fileno(this->_M_file); } - - protected: - int_type underflow() override; - int_type pbackfail(int_type c = traits_type::eof()) override; - int_type overflow(int_type c = traits_type::eof()) override; - std::basic_streambuf* setbuf( - char_type* s, std::streamsize n) override; - pos_type seekoff(off_type off, std::ios_base::seekdir way, - std::ios_base::openmode wch = std::ios_base::in | - std::ios_base::out) override; - pos_type seekpos(pos_type sp, - std::ios_base::openmode wch = std::ios_base::in | - std::ios_base::out) override; - int sync() override; - void imbue(const std::locale& loc) override; - - protected: - char* _M_extbuf; - const char* _M_extbufnext; - const char* _M_extbufend; - char _M_extbuf_min[8]; - size_t _M_ebs; - char_type* _M_intbuf; - size_t _M_ibs; - FILE* _M_file; - const std::codecvt* _M_cv; - state_type _M_st; - state_type _M_st_last; - std::ios_base::openmode _M_om; - std::ios_base::openmode _M_cm; - bool _M_owns_eb; - bool _M_owns_ib; - bool _M_always_noconv; - - const char* _M_get_mode(std::ios_base::openmode mode); - bool _M_read_mode(); - void _M_write_mode(); -}; - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf::basic_filebuf() - : _M_extbuf(nullptr), - _M_extbufnext(nullptr), - _M_extbufend(nullptr), - _M_ebs(0), - _M_intbuf(nullptr), - _M_ibs(0), - _M_file(nullptr), - _M_cv(nullptr), - _M_st(), - _M_st_last(), - _M_om(std::ios_base::openmode(0)), - _M_cm(std::ios_base::openmode(0)), - _M_owns_eb(false), - _M_owns_ib(false), - _M_always_noconv(false) { - if (std::has_facet >( - this->getloc())) { - _M_cv = &std::use_facet >( - this->getloc()); - _M_always_noconv = _M_cv->always_noconv(); - } - setbuf(0, 4096); -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf::basic_filebuf(basic_filebuf&& rhs) - : std::basic_streambuf(rhs) { - if (rhs._M_extbuf == rhs._M_extbuf_min) { - _M_extbuf = _M_extbuf_min; - _M_extbufnext = _M_extbuf + (rhs._M_extbufnext - rhs._M_extbuf); - _M_extbufend = _M_extbuf + (rhs._M_extbufend - rhs._M_extbuf); - } else { - _M_extbuf = rhs._M_extbuf; - _M_extbufnext = 
rhs._M_extbufnext; - _M_extbufend = rhs._M_extbufend; - } - _M_ebs = rhs._M_ebs; - _M_intbuf = rhs._M_intbuf; - _M_ibs = rhs._M_ibs; - _M_file = rhs._M_file; - _M_cv = rhs._M_cv; - _M_st = rhs._M_st; - _M_st_last = rhs._M_st_last; - _M_om = rhs._M_om; - _M_cm = rhs._M_cm; - _M_owns_eb = rhs._M_owns_eb; - _M_owns_ib = rhs._M_owns_ib; - _M_always_noconv = rhs._M_always_noconv; - if (rhs.pbase()) { - if (rhs.pbase() == rhs._M_intbuf) - this->setp(_M_intbuf, _M_intbuf + (rhs.epptr() - rhs.pbase())); - else - this->setp(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + - (rhs.epptr() - rhs.pbase())); - this->pbump(rhs.pptr() - rhs.pbase()); - } else if (rhs.eback()) { - if (rhs.eback() == rhs._M_intbuf) - this->setg(_M_intbuf, _M_intbuf + (rhs.gptr() - rhs.eback()), - _M_intbuf + (rhs.egptr() - rhs.eback())); - else - this->setg( - reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + (rhs.gptr() - rhs.eback()), - reinterpret_cast(_M_extbuf) + - (rhs.egptr() - rhs.eback())); - } - rhs._M_extbuf = nullptr; - rhs._M_extbufnext = nullptr; - rhs._M_extbufend = nullptr; - rhs._M_ebs = 0; - rhs._M_intbuf = nullptr; - rhs._M_ibs = 0; - rhs._M_file = nullptr; - rhs._M_st = state_type(); - rhs._M_st_last = state_type(); - rhs._M_om = std::ios_base::openmode(0); - rhs._M_cm = std::ios_base::openmode(0); - rhs._M_owns_eb = false; - rhs._M_owns_ib = false; - rhs.setg(0, 0, 0); - rhs.setp(0, 0); -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline basic_filebuf& basic_filebuf::operator=( - basic_filebuf&& rhs) { - close(); - swap(rhs); - return *this; -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf::~basic_filebuf() { - // try - // { - // close(); - // } - // catch (...) 
- // { - // } - if (_M_owns_eb) delete[] _M_extbuf; - if (_M_owns_ib) delete[] _M_intbuf; -} - -/////////////////////////////////////////////////////////////////////////////// -template -void basic_filebuf::swap(basic_filebuf& rhs) { - std::basic_streambuf::swap(rhs); - if (_M_extbuf != _M_extbuf_min && rhs._M_extbuf != rhs._M_extbuf_min) { - std::swap(_M_extbuf, rhs._M_extbuf); - std::swap(_M_extbufnext, rhs._M_extbufnext); - std::swap(_M_extbufend, rhs._M_extbufend); - } else { - ptrdiff_t ln = _M_extbufnext - _M_extbuf; - ptrdiff_t le = _M_extbufend - _M_extbuf; - ptrdiff_t rn = rhs._M_extbufnext - rhs._M_extbuf; - ptrdiff_t re = rhs._M_extbufend - rhs._M_extbuf; - if (_M_extbuf == _M_extbuf_min && rhs._M_extbuf != rhs._M_extbuf_min) { - _M_extbuf = rhs._M_extbuf; - rhs._M_extbuf = rhs._M_extbuf_min; - } else if (_M_extbuf != _M_extbuf_min && - rhs._M_extbuf == rhs._M_extbuf_min) { - rhs._M_extbuf = _M_extbuf; - _M_extbuf = _M_extbuf_min; - } - _M_extbufnext = _M_extbuf + rn; - _M_extbufend = _M_extbuf + re; - rhs._M_extbufnext = rhs._M_extbuf + ln; - rhs._M_extbufend = rhs._M_extbuf + le; - } - std::swap(_M_ebs, rhs._M_ebs); - std::swap(_M_intbuf, rhs._M_intbuf); - std::swap(_M_ibs, rhs._M_ibs); - std::swap(_M_file, rhs._M_file); - std::swap(_M_cv, rhs._M_cv); - std::swap(_M_st, rhs._M_st); - std::swap(_M_st_last, rhs._M_st_last); - std::swap(_M_om, rhs._M_om); - std::swap(_M_cm, rhs._M_cm); - std::swap(_M_owns_eb, rhs._M_owns_eb); - std::swap(_M_owns_ib, rhs._M_owns_ib); - std::swap(_M_always_noconv, rhs._M_always_noconv); - if (this->eback() == reinterpret_cast(rhs._M_extbuf_min)) { - ptrdiff_t n = this->gptr() - this->eback(); - ptrdiff_t e = this->egptr() - this->eback(); - this->setg(reinterpret_cast(_M_extbuf_min), - reinterpret_cast(_M_extbuf_min) + n, - reinterpret_cast(_M_extbuf_min) + e); - } else if (this->pbase() == reinterpret_cast(rhs._M_extbuf_min)) { - ptrdiff_t n = this->pptr() - this->pbase(); - ptrdiff_t e = this->epptr() - this->pbase(); - this->setp(reinterpret_cast(_M_extbuf_min), - reinterpret_cast(_M_extbuf_min) + e); - this->pbump(n); - } - if (rhs.eback() == reinterpret_cast(_M_extbuf_min)) { - ptrdiff_t n = rhs.gptr() - rhs.eback(); - ptrdiff_t e = rhs.egptr() - rhs.eback(); - rhs.setg(reinterpret_cast(rhs._M_extbuf_min), - reinterpret_cast(rhs._M_extbuf_min) + n, - reinterpret_cast(rhs._M_extbuf_min) + e); - } else if (rhs.pbase() == reinterpret_cast(_M_extbuf_min)) { - ptrdiff_t n = rhs.pptr() - rhs.pbase(); - ptrdiff_t e = rhs.epptr() - rhs.pbase(); - rhs.setp(reinterpret_cast(rhs._M_extbuf_min), - reinterpret_cast(rhs._M_extbuf_min) + e); - rhs.pbump(n); - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline void swap(basic_filebuf& x, - basic_filebuf& y) { - x.swap(y); -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline bool basic_filebuf::is_open() const { - return _M_file != nullptr; -} - -/////////////////////////////////////////////////////////////////////////////// -template -const char* basic_filebuf::_M_get_mode( - std::ios_base::openmode mode) { - switch ((mode & ~std::ios_base::ate) | 0) { - case std::ios_base::out: - case std::ios_base::out | std::ios_base::trunc: - return "w"; - case std::ios_base::out | std::ios_base::app: - case std::ios_base::app: - return "a"; - break; - case std::ios_base::in: - return "r"; - case std::ios_base::in | std::ios_base::out: - return "r+"; - case std::ios_base::in | std::ios_base::out | 
std::ios_base::trunc: - return "w+"; - case std::ios_base::in | std::ios_base::out | std::ios_base::app: - case std::ios_base::in | std::ios_base::app: - return "a+"; - case std::ios_base::out | std::ios_base::binary: - case std::ios_base::out | std::ios_base::trunc | std::ios_base::binary: - return "wb"; - case std::ios_base::out | std::ios_base::app | std::ios_base::binary: - case std::ios_base::app | std::ios_base::binary: - return "ab"; - case std::ios_base::in | std::ios_base::binary: - return "rb"; - case std::ios_base::in | std::ios_base::out | std::ios_base::binary: - return "r+b"; - case std::ios_base::in | std::ios_base::out | std::ios_base::trunc | - std::ios_base::binary: - return "w+b"; - case std::ios_base::in | std::ios_base::out | std::ios_base::app | - std::ios_base::binary: - case std::ios_base::in | std::ios_base::app | std::ios_base::binary: - return "a+b"; - default: - return nullptr; - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::open( - const char* s, std::ios_base::openmode mode) { - basic_filebuf* rt = nullptr; - if (_M_file == nullptr) { - const char* md = _M_get_mode(mode); - if (md) { - _M_file = fopen(s, md); - if (_M_file) { - rt = this; - _M_om = mode; - if (mode & std::ios_base::ate) { - if (fseek(_M_file, 0, SEEK_END)) { - fclose(_M_file); - _M_file = nullptr; - rt = nullptr; - } - } - } - } - } - return rt; -} - -/////////////////////////////////////////////////////////////////////////////// -template -inline basic_filebuf* basic_filebuf::open( - const std::string& s, std::ios_base::openmode mode) { - return open(s.c_str(), mode); -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::open( - int fd, std::ios_base::openmode mode) { - const char* md = this->_M_get_mode(mode); - if (md) { - this->_M_file = fdopen(fd, md); - this->_M_om = mode; - return this; - } else { - return nullptr; - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::open( - FILE* f, std::ios_base::openmode mode) { - this->_M_file = f; - this->_M_om = mode; - return this; -} - -/////////////////////////////////////////////////////////////////////////////// -template -basic_filebuf* basic_filebuf::close() { - basic_filebuf* rt = nullptr; - if (_M_file) { - rt = this; - std::unique_ptr h(_M_file, fclose); - if (sync()) rt = nullptr; - if (fclose(h.release()) == 0) - _M_file = nullptr; - else - rt = nullptr; - } - return rt; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::int_type -basic_filebuf::underflow() { - if (_M_file == nullptr) return traits_type::eof(); - bool initial = _M_read_mode(); - char_type buf; - if (this->gptr() == nullptr) this->setg(&buf, &buf + 1, &buf + 1); - const size_t unget_sz = - initial ? 
0 : std::min((this->egptr() - this->eback()) / 2, 4); - int_type c = traits_type::eof(); - if (this->gptr() == this->egptr()) { - memmove(this->eback(), this->egptr() - unget_sz, - unget_sz * sizeof(char_type)); - if (_M_always_noconv) { - size_t nmemb = - static_cast(this->egptr() - this->eback() - unget_sz); - nmemb = fread(this->eback() + unget_sz, 1, nmemb, _M_file); - if (nmemb != 0) { - this->setg(this->eback(), this->eback() + unget_sz, - this->eback() + unget_sz + nmemb); - c = traits_type::to_int_type(*this->gptr()); - } - } else { - memmove(_M_extbuf, _M_extbufnext, _M_extbufend - _M_extbufnext); - _M_extbufnext = _M_extbuf + (_M_extbufend - _M_extbufnext); - _M_extbufend = - _M_extbuf + - (_M_extbuf == _M_extbuf_min ? sizeof(_M_extbuf_min) : _M_ebs); - size_t nmemb = - std::min(static_cast(_M_ibs - unget_sz), - static_cast(_M_extbufend - _M_extbufnext)); - std::codecvt_base::result r; - _M_st_last = _M_st; - size_t nr = - fread(reinterpret_cast(const_cast(_M_extbufnext)), - 1, nmemb, _M_file); - if (nr != 0) { - if (!_M_cv) throw std::bad_cast(); - _M_extbufend = _M_extbufnext + nr; - char_type* inext; - r = _M_cv->in(_M_st, _M_extbuf, _M_extbufend, _M_extbufnext, - this->eback() + unget_sz, this->eback() + _M_ibs, inext); - if (r == std::codecvt_base::noconv) { - this->setg(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf), - const_cast(_M_extbufend)); - c = traits_type::to_int_type(*this->gptr()); - } else if (inext != this->eback() + unget_sz) { - this->setg(this->eback(), this->eback() + unget_sz, inext); - c = traits_type::to_int_type(*this->gptr()); - } - } - } - } else { - c = traits_type::to_int_type(*this->gptr()); - } - if (this->eback() == &buf) this->setg(0, 0, 0); - return c; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::int_type -basic_filebuf::pbackfail(int_type c) { - if (_M_file && this->eback() < this->gptr()) { - if (traits_type::eq_int_type(c, traits_type::eof())) { - this->gbump(-1); - return traits_type::not_eof(c); - } - if ((_M_om & std::ios_base::out) || - traits_type::eq(traits_type::to_char_type(c), this->gptr()[-1])) { - this->gbump(-1); - *this->gptr() = traits_type::to_char_type(c); - return c; - } - } - return traits_type::eof(); -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::int_type -basic_filebuf::overflow(int_type c) { - if (_M_file == nullptr) return traits_type::eof(); - _M_write_mode(); - char_type buf; - char_type* pb_save = this->pbase(); - char_type* epb_save = this->epptr(); - if (!traits_type::eq_int_type(c, traits_type::eof())) { - if (this->pptr() == nullptr) this->setp(&buf, &buf + 1); - *this->pptr() = traits_type::to_char_type(c); - this->pbump(1); - } - if (this->pptr() != this->pbase()) { - if (_M_always_noconv) { - size_t nmemb = static_cast(this->pptr() - this->pbase()); - if (fwrite(this->pbase(), sizeof(char_type), nmemb, _M_file) != nmemb) - return traits_type::eof(); - } else { - char* extbe = _M_extbuf; - std::codecvt_base::result r; - do { - if (!_M_cv) throw std::bad_cast(); - const char_type* e; - r = _M_cv->out(_M_st, this->pbase(), this->pptr(), e, _M_extbuf, - _M_extbuf + _M_ebs, extbe); - if (e == this->pbase()) return traits_type::eof(); - if (r == std::codecvt_base::noconv) { - size_t nmemb = static_cast(this->pptr() - this->pbase()); - if (fwrite(this->pbase(), 1, nmemb, _M_file) != nmemb) - return traits_type::eof(); - } else if (r == std::codecvt_base::ok 
|| - r == std::codecvt_base::partial) { - size_t nmemb = static_cast(extbe - _M_extbuf); - if (fwrite(_M_extbuf, 1, nmemb, _M_file) != nmemb) - return traits_type::eof(); - if (r == std::codecvt_base::partial) { - this->setp(const_cast(e), this->pptr()); - this->pbump(this->epptr() - this->pbase()); - } - } else { - return traits_type::eof(); - } - } while (r == std::codecvt_base::partial); - } - this->setp(pb_save, epb_save); - } - return traits_type::not_eof(c); -} - -/////////////////////////////////////////////////////////////////////////////// -template -std::basic_streambuf* basic_filebuf::setbuf( - char_type* s, std::streamsize n) { - this->setg(0, 0, 0); - this->setp(0, 0); - if (_M_owns_eb) delete[] _M_extbuf; - if (_M_owns_ib) delete[] _M_intbuf; - _M_ebs = n; - if (_M_ebs > sizeof(_M_extbuf_min)) { - if (_M_always_noconv && s) { - _M_extbuf = reinterpret_cast(s); - _M_owns_eb = false; - } else { - _M_extbuf = new char[_M_ebs]; - _M_owns_eb = true; - } - } else { - _M_extbuf = _M_extbuf_min; - _M_ebs = sizeof(_M_extbuf_min); - _M_owns_eb = false; - } - if (!_M_always_noconv) { - _M_ibs = std::max(n, sizeof(_M_extbuf_min)); - if (s && _M_ibs >= sizeof(_M_extbuf_min)) { - _M_intbuf = s; - _M_owns_ib = false; - } else { - _M_intbuf = new char_type[_M_ibs]; - _M_owns_ib = true; - } - } else { - _M_ibs = 0; - _M_intbuf = 0; - _M_owns_ib = false; - } - return this; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::pos_type -basic_filebuf::seekoff(off_type off, std::ios_base::seekdir way, - std::ios_base::openmode) { - if (!_M_cv) throw std::bad_cast(); - int width = _M_cv->encoding(); - if (_M_file == nullptr || (width <= 0 && off != 0) || sync()) - return pos_type(off_type(-1)); - // width > 0 || off == 0 - int whence; - switch (way) { - case std::ios_base::beg: - whence = SEEK_SET; - break; - case std::ios_base::cur: - whence = SEEK_CUR; - break; - case std::ios_base::end: - whence = SEEK_END; - break; - default: - return pos_type(off_type(-1)); - } -#if _WIN32 - if (fseek(_M_file, width > 0 ? width * off : 0, whence)) - return pos_type(off_type(-1)); - pos_type r = ftell(_M_file); -#else - if (fseeko(_M_file, width > 0 ? 
width * off : 0, whence)) - return pos_type(off_type(-1)); - pos_type r = ftello(_M_file); -#endif - r.state(_M_st); - return r; -} - -/////////////////////////////////////////////////////////////////////////////// -template -typename basic_filebuf::pos_type -basic_filebuf::seekpos(pos_type sp, std::ios_base::openmode) { - if (_M_file == nullptr || sync()) return pos_type(off_type(-1)); -#if _WIN32 - if (fseek(_M_file, sp, SEEK_SET)) return pos_type(off_type(-1)); -#else - if (fseeko(_M_file, sp, SEEK_SET)) return pos_type(off_type(-1)); -#endif - _M_st = sp.state(); - return sp; -} - -/////////////////////////////////////////////////////////////////////////////// -template -int basic_filebuf::sync() { - if (_M_file == nullptr) return 0; - if (!_M_cv) throw std::bad_cast(); - if (_M_cm & std::ios_base::out) { - if (this->pptr() != this->pbase()) - if (overflow() == traits_type::eof()) return -1; - std::codecvt_base::result r; - do { - char* extbe; - r = _M_cv->unshift(_M_st, _M_extbuf, _M_extbuf + _M_ebs, extbe); - size_t nmemb = static_cast(extbe - _M_extbuf); - if (fwrite(_M_extbuf, 1, nmemb, _M_file) != nmemb) return -1; - } while (r == std::codecvt_base::partial); - if (r == std::codecvt_base::error) return -1; - if (fflush(_M_file)) return -1; - } else if (_M_cm & std::ios_base::in) { - off_type c; - state_type state = _M_st_last; - bool update_st = false; - if (_M_always_noconv) { - c = this->egptr() - this->gptr(); - } else { - int width = _M_cv->encoding(); - c = _M_extbufend - _M_extbufnext; - if (width > 0) { - c += width * (this->egptr() - this->gptr()); - } else { - if (this->gptr() != this->egptr()) { - const int off = _M_cv->length(state, _M_extbuf, _M_extbufnext, - this->gptr() - this->eback()); - c += _M_extbufnext - _M_extbuf - off; - update_st = true; - } - } - } -#if _WIN32 - if (fseek(_M_file_, -c, SEEK_CUR)) return -1; -#else - if (fseeko(_M_file, -c, SEEK_CUR)) return -1; -#endif - if (update_st) _M_st = state; - _M_extbufnext = _M_extbufend = _M_extbuf; - this->setg(0, 0, 0); - _M_cm = std::ios_base::openmode(0); - } - return 0; -} - -/////////////////////////////////////////////////////////////////////////////// -template -void basic_filebuf::imbue(const std::locale& loc) { - sync(); - _M_cv = &std::use_facet >(loc); - bool old_anc = _M_always_noconv; - _M_always_noconv = _M_cv->always_noconv(); - if (old_anc != _M_always_noconv) { - this->setg(0, 0, 0); - this->setp(0, 0); - // invariant, char_type is char, else we couldn't get here - // need to dump _M_intbuf - if (_M_always_noconv) { - if (_M_owns_eb) delete[] _M_extbuf; - _M_owns_eb = _M_owns_ib; - _M_ebs = _M_ibs; - _M_extbuf = reinterpret_cast(_M_intbuf); - _M_ibs = 0; - _M_intbuf = nullptr; - _M_owns_ib = false; - } else { // need to obtain an _M_intbuf. 
- // If _M_extbuf is user-supplied, use it, else new _M_intbuf - if (!_M_owns_eb && _M_extbuf != _M_extbuf_min) { - _M_ibs = _M_ebs; - _M_intbuf = reinterpret_cast(_M_extbuf); - _M_owns_ib = false; - _M_extbuf = new char[_M_ebs]; - _M_owns_eb = true; - } else { - _M_ibs = _M_ebs; - _M_intbuf = new char_type[_M_ibs]; - _M_owns_ib = true; - } - } - } -} - -/////////////////////////////////////////////////////////////////////////////// -template -bool basic_filebuf::_M_read_mode() { - if (!(_M_cm & std::ios_base::in)) { - this->setp(0, 0); - if (_M_always_noconv) - this->setg(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + _M_ebs, - reinterpret_cast(_M_extbuf) + _M_ebs); - else - this->setg(_M_intbuf, _M_intbuf + _M_ibs, _M_intbuf + _M_ibs); - _M_cm = std::ios_base::in; - return true; - } - return false; -} - -/////////////////////////////////////////////////////////////////////////////// -template -void basic_filebuf::_M_write_mode() { - if (!(_M_cm & std::ios_base::out)) { - this->setg(0, 0, 0); - if (_M_ebs > sizeof(_M_extbuf_min)) { - if (_M_always_noconv) - this->setp(reinterpret_cast(_M_extbuf), - reinterpret_cast(_M_extbuf) + (_M_ebs - 1)); - else - this->setp(_M_intbuf, _M_intbuf + (_M_ibs - 1)); - } else { - this->setp(0, 0); - } - _M_cm = std::ios_base::out; - } -} - -/////////////////////////////////////////////////////////////////////////////// -} // namespace kaldi - -/////////////////////////////////////////////////////////////////////////////// -#endif // KALDI_UTIL_BASIC_FILEBUF_H_ - -/////////////////////////////////////////////////////////////////////////////// - -/* - * ============================================================================ - * libc++ License - * ============================================================================ - * - * The libc++ library is dual licensed under both the University of Illinois - * "BSD-Like" license and the MIT license. As a user of this code you may - * choose to use it under either license. As a contributor, you agree to allow - * your code to be used under both. - * - * Full text of the relevant licenses is included below. - * - * ============================================================================ - * - * University of Illinois/NCSA - * Open Source License - * - * Copyright (c) 2009-2014 by the contributors listed in CREDITS.TXT (included - * below) - * - * All rights reserved. - * - * Developed by: - * - * LLVM Team - * - * University of Illinois at Urbana-Champaign - * - * http://llvm.org - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * with the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in the - * documentation and/or other materials provided with the distribution. 
- * - * * Neither the names of the LLVM Team, University of Illinois at - * Urbana-Champaign, nor the names of its contributors may be used to - * endorse or promote products derived from this Software without specific - * prior written permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH - * THE SOFTWARE. - * - * ============================================================================== - * - * Copyright (c) 2009-2014 by the contributors listed in CREDITS.TXT (included - * below) - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - * ============================================================================== - * - * This file is a partial list of people who have contributed to the LLVM/libc++ - * project. If you have contributed a patch or made some other contribution to - * LLVM/libc++, please submit a patch to this file to add yourself, and it will - * be done! - * - * The list is sorted by surname and formatted to allow easy grepping and - * beautification by scripts. The fields are: name (N), email (E), web-address - * (W), PGP key ID and fingerprint (P), description (D), and snail-mail address - * (S). - * - * N: Saleem Abdulrasool - * E: compnerd@compnerd.org - * D: Minor patches and Linux fixes. - * - * N: Dimitry Andric - * E: dimitry@andric.com - * D: Visibility fixes, minor FreeBSD portability patches. - * - * N: Holger Arnold - * E: holgerar@gmail.com - * D: Minor fix. - * - * N: Ruben Van Boxem - * E: vanboxem dot ruben at gmail dot com - * D: Initial Windows patches. - * - * N: David Chisnall - * E: theraven at theravensnest dot org - * D: FreeBSD and Solaris ports, libcxxrt support, some atomics work. - * - * N: Marshall Clow - * E: mclow.lists@gmail.com - * E: marshall@idio.com - * D: C++14 support, patches and bug fixes. - * - * N: Bill Fisher - * E: william.w.fisher@gmail.com - * D: Regex bug fixes. - * - * N: Matthew Dempsky - * E: matthew@dempsky.org - * D: Minor patches and bug fixes. - * - * N: Google Inc. 
- * D: Copyright owner and contributor of the CityHash algorithm - * - * N: Howard Hinnant - * E: hhinnant@apple.com - * D: Architect and primary author of libc++ - * - * N: Hyeon-bin Jeong - * E: tuhertz@gmail.com - * D: Minor patches and bug fixes. - * - * N: Argyrios Kyrtzidis - * E: kyrtzidis@apple.com - * D: Bug fixes. - * - * N: Bruce Mitchener, Jr. - * E: bruce.mitchener@gmail.com - * D: Emscripten-related changes. - * - * N: Michel Morin - * E: mimomorin@gmail.com - * D: Minor patches to is_convertible. - * - * N: Andrew Morrow - * E: andrew.c.morrow@gmail.com - * D: Minor patches and Linux fixes. - * - * N: Arvid Picciani - * E: aep at exys dot org - * D: Minor patches and musl port. - * - * N: Bjorn Reese - * E: breese@users.sourceforge.net - * D: Initial regex prototype - * - * N: Nico Rieck - * E: nico.rieck@gmail.com - * D: Windows fixes - * - * N: Jonathan Sauer - * D: Minor patches, mostly related to constexpr - * - * N: Craig Silverstein - * E: csilvers@google.com - * D: Implemented Cityhash as the string hash function on 64-bit machines - * - * N: Richard Smith - * D: Minor patches. - * - * N: Joerg Sonnenberger - * E: joerg@NetBSD.org - * D: NetBSD port. - * - * N: Stephan Tolksdorf - * E: st@quanttec.com - * D: Minor fix - * - * N: Michael van der Westhuizen - * E: r1mikey at gmail dot com - * - * N: Klaas de Vries - * E: klaas at klaasgaaf dot nl - * D: Minor bug fix. - * - * N: Zhang Xiongpang - * E: zhangxiongpang@gmail.com - * D: Minor patches and bug fixes. - * - * N: Xing Xue - * E: xingxue@ca.ibm.com - * D: AIX port - * - * N: Zhihao Yuan - * E: lichray@gmail.com - * D: Standard compatibility fixes. - * - * N: Jeffrey Yasskin - * E: jyasskin@gmail.com - * E: jyasskin@google.com - * D: Linux fixes. - */ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/const-integer-set-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/const-integer-set-inl.h deleted file mode 100644 index b93846148a3e4595774507f638396ce13393ac0e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/const-integer-set-inl.h +++ /dev/null @@ -1,87 +0,0 @@ -// util/const-integer-set-inl.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_CONST_INTEGER_SET_INL_H_ -#define KALDI_UTIL_CONST_INTEGER_SET_INL_H_ - -// Do not include this file directly. It is included by const-integer-set.h - -namespace kaldi { - -template -void ConstIntegerSet::InitInternal() { - KALDI_ASSERT_IS_INTEGER_TYPE(I); - quick_set_.clear(); // just in case we previously had data. 
- if (slow_set_.size() == 0) { - lowest_member_ = (I)1; - highest_member_ = (I)0; - contiguous_ = false; - quick_ = false; - } else { - lowest_member_ = slow_set_.front(); - highest_member_ = slow_set_.back(); - size_t range = highest_member_ + 1 - lowest_member_; - if (range == slow_set_.size()) { - contiguous_ = true; - quick_ = false; - } else { - contiguous_ = false; - // If it would be more compact to store as bool - if (range < slow_set_.size() * 8 * sizeof(I)) { - // (assuming 1 bit per element)... - quick_set_.resize(range, false); - for (size_t i = 0; i < slow_set_.size(); i++) - quick_set_[slow_set_[i] - lowest_member_] = true; - quick_ = true; - } else { - quick_ = false; - } - } - } -} - -template -int ConstIntegerSet::count(I i) const { - if (i < lowest_member_ || i > highest_member_) { - return 0; - } else { - if (contiguous_) return true; - if (quick_) { - return (quick_set_[i - lowest_member_] ? 1 : 0); - } else { - bool ans = std::binary_search(slow_set_.begin(), slow_set_.end(), i); - return (ans ? 1 : 0); - } - } -} - -template -void ConstIntegerSet::Write(std::ostream &os, bool binary) const { - WriteIntegerVector(os, binary, slow_set_); -} - -template -void ConstIntegerSet::Read(std::istream &is, bool binary) { - ReadIntegerVector(is, binary, &slow_set_); - InitInternal(); -} - -} // end namespace kaldi - -#endif // KALDI_UTIL_CONST_INTEGER_SET_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/const-integer-set.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/const-integer-set.h deleted file mode 100644 index 809a56a7c83804bfaa4badb5e28059734bfcad1e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/const-integer-set.h +++ /dev/null @@ -1,96 +0,0 @@ -// util/const-integer-set.h - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_CONST_INTEGER_SET_H_ -#define KALDI_UTIL_CONST_INTEGER_SET_H_ -#include -#include -#include -#include -#include -#include "util/stl-utils.h" - -/* ConstIntegerSet is a way to efficiently test whether something is in a - supplied set of integers. It can be initialized from a vector or set, but - never changed after that. It either uses a sorted vector or an array of - bool, depending on the input. It behaves like a const version of an STL set, - with only a subset of the functionality, except all the member functions are - upper-case. - - Note that we could get rid of the member slow_set_, but we'd have to - do more work to implement an iterator type. This would save memory. 
-*/ - -namespace kaldi { - -template -class ConstIntegerSet { - public: - ConstIntegerSet() : lowest_member_(1), highest_member_(0) {} - - void Init(const std::vector &input) { - slow_set_ = input; - SortAndUniq(&slow_set_); - InitInternal(); - } - - void Init(const std::set &input) { - CopySetToVector(input, &slow_set_); - InitInternal(); - } - - explicit ConstIntegerSet(const std::vector &input) : slow_set_(input) { - SortAndUniq(&slow_set_); - InitInternal(); - } - explicit ConstIntegerSet(const std::set &input) { - CopySetToVector(input, &slow_set_); - InitInternal(); - } - explicit ConstIntegerSet(const ConstIntegerSet &other) - : slow_set_(other.slow_set_) { - InitInternal(); - } - - int count(I i) const; // returns 1 or 0. - - typedef typename std::vector::const_iterator iterator; - iterator begin() const { return slow_set_.begin(); } - iterator end() const { return slow_set_.end(); } - size_t size() const { return slow_set_.size(); } - bool empty() const { return slow_set_.empty(); } - - void Write(std::ostream &os, bool binary) const; - void Read(std::istream &is, bool binary); - - private: - I lowest_member_; - I highest_member_; - bool contiguous_; - bool quick_; - std::vector quick_set_; - std::vector slow_set_; - void InitInternal(); -}; - -} // end namespace kaldi - -#include "util/const-integer-set-inl.h" - -#endif // KALDI_UTIL_CONST_INTEGER_SET_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/hash-list-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/hash-list-inl.h deleted file mode 100644 index 063fa7131ec618f0aae9dc30f4edd26c9dcce7fe..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/hash-list-inl.h +++ /dev/null @@ -1,193 +0,0 @@ -// util/hash-list-inl.h - -// Copyright 2009-2011 Microsoft Corporation -// 2013 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_HASH_LIST_INL_H_ -#define KALDI_UTIL_HASH_LIST_INL_H_ - -// Do not include this file directly. It is included by fast-hash.h - -namespace kaldi { - -template -HashList::HashList() { - list_head_ = NULL; - bucket_list_tail_ = static_cast(-1); // invalid. - hash_size_ = 0; - freed_head_ = NULL; -} - -template -void HashList::SetSize(size_t size) { - hash_size_ = size; - KALDI_ASSERT(list_head_ == NULL && - bucket_list_tail_ == - static_cast(-1)); // make sure empty. - if (size > buckets_.size()) buckets_.resize(size, HashBucket(0, NULL)); -} - -template -typename HashList::Elem *HashList::Clear() { - // Clears the hashtable and gives ownership of the currently contained list - // to the user. 
- for (size_t cur_bucket = bucket_list_tail_; - cur_bucket != static_cast(-1); - cur_bucket = buckets_[cur_bucket].prev_bucket) { - buckets_[cur_bucket].last_elem = NULL; // this is how we indicate "empty". - } - bucket_list_tail_ = static_cast(-1); - Elem *ans = list_head_; - list_head_ = NULL; - return ans; -} - -template -const typename HashList::Elem *HashList::GetList() const { - return list_head_; -} - -template -inline void HashList::Delete(Elem *e) { - e->tail = freed_head_; - freed_head_ = e; -} - -template -inline typename HashList::Elem *HashList::Find(I key) { - size_t index = (static_cast(key) % hash_size_); - HashBucket &bucket = buckets_[index]; - if (bucket.last_elem == NULL) { - return NULL; // empty bucket. - } else { - Elem *head = (bucket.prev_bucket == static_cast(-1) - ? list_head_ - : buckets_[bucket.prev_bucket].last_elem->tail), - *tail = bucket.last_elem->tail; - for (Elem *e = head; e != tail; e = e->tail) - if (e->key == key) return e; - return NULL; // Not found. - } -} - -template -inline typename HashList::Elem *HashList::New() { - if (freed_head_) { - Elem *ans = freed_head_; - freed_head_ = freed_head_->tail; - return ans; - } else { - Elem *tmp = new Elem[allocate_block_size_]; - for (size_t i = 0; i + 1 < allocate_block_size_; i++) - tmp[i].tail = tmp + i + 1; - tmp[allocate_block_size_ - 1].tail = NULL; - freed_head_ = tmp; - allocated_.push_back(tmp); - return this->New(); - } -} - -template -HashList::~HashList() { - // First test whether we had any memory leak within the - // HashList, i.e. things for which the user did not call Delete(). - size_t num_in_list = 0, num_allocated = 0; - for (Elem *e = freed_head_; e != NULL; e = e->tail) num_in_list++; - for (size_t i = 0; i < allocated_.size(); i++) { - num_allocated += allocate_block_size_; - delete[] allocated_[i]; - } - if (num_in_list != num_allocated) { - KALDI_WARN << "Possible memory leak: " << num_in_list - << " != " << num_allocated - << ": you might have forgotten to call Delete on " - << "some Elems"; - } -} - -template -inline typename HashList::Elem *HashList::Insert(I key, T val) { - size_t index = (static_cast(key) % hash_size_); - HashBucket &bucket = buckets_[index]; - // Check the element is existing or not. - if (bucket.last_elem != NULL) { - Elem *head = (bucket.prev_bucket == static_cast(-1) - ? list_head_ - : buckets_[bucket.prev_bucket].last_elem->tail), - *tail = bucket.last_elem->tail; - for (Elem *e = head; e != tail; e = e->tail) - if (e->key == key) return e; - } - - // This is a new element. Insert it. - Elem *elem = New(); - elem->key = key; - elem->val = val; - if (bucket.last_elem == NULL) { // Unoccupied bucket. Insert at - // head of bucket list (which is tail of regular list, they go in - // opposite directions). - if (bucket_list_tail_ == static_cast(-1)) { - // list was empty so this is the first elem. - KALDI_ASSERT(list_head_ == NULL); - list_head_ = elem; - } else { - // link in to the chain of Elems - buckets_[bucket_list_tail_].last_elem->tail = elem; - } - elem->tail = NULL; - bucket.last_elem = elem; - bucket.prev_bucket = bucket_list_tail_; - bucket_list_tail_ = index; - } else { - // Already-occupied bucket. Insert at tail of list of elements within - // the bucket. 
- elem->tail = bucket.last_elem->tail; - bucket.last_elem->tail = elem; - bucket.last_elem = elem; - } - return elem; -} - -template -void HashList::InsertMore(I key, T val) { - size_t index = (static_cast(key) % hash_size_); - HashBucket &bucket = buckets_[index]; - Elem *elem = New(); - elem->key = key; - elem->val = val; - - KALDI_ASSERT(bucket.last_elem != NULL); // assume one element is already here - if (bucket.last_elem->key == key) { // standard behavior: add as last element - elem->tail = bucket.last_elem->tail; - bucket.last_elem->tail = elem; - bucket.last_elem = elem; - return; - } - Elem *e = (bucket.prev_bucket == static_cast(-1) - ? list_head_ - : buckets_[bucket.prev_bucket].last_elem->tail); - // find place to insert in linked list - while (e != bucket.last_elem->tail && e->key != key) e = e->tail; - KALDI_ASSERT(e->key == key); // not found? - should not happen - elem->tail = e->tail; - e->tail = elem; -} - -} // end namespace kaldi - -#endif // KALDI_UTIL_HASH_LIST_INL_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/hash-list.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/hash-list.h deleted file mode 100644 index 31cc9bdc4870773475f8c5139539e320746bf5fe..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/hash-list.h +++ /dev/null @@ -1,146 +0,0 @@ -// util/hash-list.h - -// Copyright 2009-2011 Microsoft Corporation -// 2013 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_HASH_LIST_H_ -#define KALDI_UTIL_HASH_LIST_H_ - -#include -#include -#include -#include -#include - -#include "base/kaldi-error.h" - -/* This header provides utilities for a structure that's used in a decoder (but - is quite generic in nature so we implement and test it separately). - Basically it's a singly-linked list, but implemented in such a way that we - can quickly search for elements in the list. We give it a slightly richer - interface than just a hash and a list. The idea is that we want to separate - the hash part and the list part: basically, in the decoder, we want to have a - single hash for the current frame and the next frame, because by the time we - need to access the hash for the next frame we no longer need the hash for the - previous frame. So we have an operation that clears the hash but leaves the - list structure intact. We also control memory management inside this object, - to avoid repeated new's/deletes. - - See hash-list-test.cc for an example of how to use this object. -*/ - -namespace kaldi { - -template -class HashList { - public: - struct Elem { - I key; - T val; - Elem *tail; - }; - - /// Constructor takes no arguments. - /// Call SetSize to inform it of the likely size. 
- HashList(); - - /// Clears the hash and gives the head of the current list to the user; - /// ownership is transferred to the user (the user must call Delete() - /// for each element in the list, at his/her leisure). - Elem *Clear(); - - /// Gives the head of the current list to the user. Ownership retained in the - /// class. Caution: in December 2013 the return type was changed to const - /// Elem* and this function was made const. You may need to change some types - /// of local Elem* variables to const if this produces compilation errors. - const Elem *GetList() const; - - /// Think of this like delete(). It is to be called for each Elem in turn - /// after you "obtained ownership" by doing Clear(). This is not the opposite - /// of. Insert, it is the opposite of New. It's really a memory operation. - inline void Delete(Elem *e); - - /// This should probably not be needed to be called directly by the user. - /// Think of it as opposite - /// to Delete(); - inline Elem *New(); - - /// Find tries to find this element in the current list using the hashtable. - /// It returns NULL if not present. The Elem it returns is not owned by the - /// user, it is part of the internal list owned by this object, but the user - /// is free to modify the "val" element. - inline Elem *Find(I key); - - /// Insert inserts a new element into the hashtable/stored list. - /// Because element keys in a hashtable are unique, this operation checks - /// whether each inserted element has a key equivalent to the one of an - /// element already in the hashtable. If so, the element is not inserted, - /// returning an pointer to this existing element. - inline Elem *Insert(I key, T val); - - /// Insert inserts another element with same key into the hashtable/ - /// stored list. - /// By calling this, the user asserts that one element with that key is - /// already present. - /// We insert it that way, that all elements with the same key - /// follow each other. - /// Find() will return the first one of the elements with the same key. - inline void InsertMore(I key, T val); - - /// SetSize tells the object how many hash buckets to allocate (should - /// typically be at least twice the number of objects we expect to go in the - /// structure, for fastest performance). It must be called while the hash - /// is empty (e.g. after Clear() or after initializing the object, but before - /// adding anything to the hash. - void SetSize(size_t sz); - - /// Returns current number of hash buckets. - inline size_t Size() { return hash_size_; } - - ~HashList(); - - private: - struct HashBucket { - size_t prev_bucket; // index to next bucket (-1 if list tail). Note: - // list of buckets goes in opposite direction to list of Elems. - Elem *last_elem; // pointer to last element in this bucket (NULL if empty) - inline HashBucket(size_t i, Elem *e) : prev_bucket(i), last_elem(e) {} - }; - - Elem *list_head_; // head of currently stored list. - size_t bucket_list_tail_; // tail of list of active hash buckets. - - size_t hash_size_; // number of hash buckets. - - std::vector buckets_; - - Elem *freed_head_; // head of list of currently freed elements. [ready for - // allocation] - - std::vector allocated_; // list of allocated blocks. - - static const size_t allocate_block_size_ = 1024; // Number of Elements to - // allocate in one block. Must be largish so storing allocated_ doesn't - // become a problem. 
-};
-
-}  // end namespace kaldi
-
-#include "util/hash-list-inl.h"
-
-#endif  // KALDI_UTIL_HASH_LIST_H_
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/kaldi-io-inl.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/kaldi-io-inl.h
deleted file mode 100644
index 8b0c92131c4af2113eb33da6f3cfa9dc4dee83e1..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/kaldi-io-inl.h
+++ /dev/null
@@ -1,40 +0,0 @@
-// util/kaldi-io-inl.h
-
-// Copyright 2009-2011 Microsoft Corporation
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-#ifndef KALDI_UTIL_KALDI_IO_INL_H_
-#define KALDI_UTIL_KALDI_IO_INL_H_
-
-#include <string>
-
-namespace kaldi {
-
-bool Input::Open(const std::string &rxfilename, bool *binary) {
-  return OpenInternal(rxfilename, true, binary);
-}
-
-bool Input::OpenTextMode(const std::string &rxfilename) {
-  return OpenInternal(rxfilename, false, NULL);
-}
-
-bool Input::IsOpen() { return impl_ != NULL; }
-
-bool Output::IsOpen() { return impl_ != NULL; }
-
-}  // end namespace kaldi.
-
-#endif  // KALDI_UTIL_KALDI_IO_INL_H_
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/kaldi-io.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/kaldi-io.cc
deleted file mode 100644
index 5f8ec4870138df32f6aca9c12383cf3885411741..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/kaldi-io.cc
+++ /dev/null
@@ -1,898 +0,0 @@
-// util/kaldi-io.cc
-
-// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky
-//                2016 Xiaohui Zhang
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-#include "util/kaldi-io.h" - -#include -#include -#include - -#include - -#include "base/io-funcs.h" -#include "base/kaldi-math.h" -#include "util/kaldi-pipebuf.h" -#include "util/parse-options.h" -#include "util/text-utils.h" - -#ifdef KALDI_CYGWIN_COMPAT -#include "util/kaldi-cygwin-io-inl.h" -#define MapOsPath(x) MapCygwinPath(x) -#else // KALDI_CYGWIN_COMPAT -#define MapOsPath(x) x -#endif // KALDI_CYGWIN_COMPAT - -#if defined(_MSC_VER) -static FILE *popen(const char *command, const char *mode) { -#ifdef KALDI_CYGWIN_COMPAT - return kaldi::CygwinCompatPopen(command, mode); -#else // KALDI_CYGWIN_COMPAT - return _popen(command, mode); -#endif // KALDI_CYGWIN_COMPAT -} -#endif // _MSC_VER - -namespace kaldi { - -#ifndef _MSC_VER // on VS, we don't need this type. -// could replace basic_pipebuf with stdio_filebuf on some platforms. -// Would mean we could use less of our own code. -typedef basic_pipebuf PipebufType; -#endif -} // namespace kaldi - -namespace kaldi { - -std::string PrintableRxfilename(const std::string &rxfilename) { - if (rxfilename == "" || rxfilename == "-") { - return "standard input"; - } else { - // If this call to Escape later causes compilation issues, - // just replace it with "return rxfilename"; it's only a - // pretty-printing issue. - return ParseOptions::Escape(rxfilename); - } -} - -std::string PrintableWxfilename(const std::string &wxfilename) { - if (wxfilename == "" || wxfilename == "-") { - return "standard output"; - } else { - // If this call to Escape later causes compilation issues, - // just replace it with "return wxfilename"; it's only a - // pretty-printing issue. - return ParseOptions::Escape(wxfilename); - } -} - -OutputType ClassifyWxfilename(const std::string &filename) { - const char *c = filename.c_str(); - size_t length = filename.length(); - char first_char = c[0], - last_char = (length == 0 ? '\0' : c[filename.length() - 1]); - - // if 'filename' is "" or "-", return kStandardOutput. - if (length == 0 || (length == 1 && first_char == '-')) { - return kStandardOutput; - } else if (first_char == '|') { - return kPipeOutput; // An output pipe like "|blah". - } else if (isspace(first_char) || isspace(last_char) || last_char == '|') { - return kNoOutput; // Leading or trailing space: can't interpret this. - // Final '|' would represent an input pipe, not an - // output pipe. - // } else if ((first_char == 'a' || first_char == 's') && - // strchr(c, ':') != NULL && - // (ClassifyWspecifier(filename, NULL, NULL, NULL) != - // kNoWspecifier || - // ClassifyRspecifier(filename, NULL, NULL) != kNoRspecifier)) { - // // e.g. ark:something or scp:something... this is almost certainly a - // // scripting error, so call it an error rather than treating it as a - // file. - // // In practice in modern kaldi scripts all (r,w)filenames begin with - // "ark" - // // or "scp", even though technically speaking options like "b", "t", - // "s" or - // // "cs" can appear before the ark or scp, like "b,ark". For - // efficiency, - // // and because this code is really just a nicety to catch errors - // earlier - // // than they would otherwise be caught, we only call those extra - // functions - // // for filenames beginning with 'a' or 's'. - // return kNoOutput; - } else if (isdigit(last_char)) { - // This could be a file, but we have to see if it's an offset into a file - // (like foo.ark:4314328), which is not allowed for writing (but is - // allowed for reaching). 
This eliminates some things which would be - // valid UNIX filenames but are not allowed by Kaldi. (Even if we allowed - // such filenames for writing, we woudln't be able to correctly read them). - const char *d = c + length - 1; - while (isdigit(*d) && d > c) d--; - if (*d == ':') return kNoOutput; - // else it could still be a filename; continue to the next check. - } - - // At this point it matched no other pattern so we assume a filename, but we - // check for internal '|' as it's a common source of errors to have pipe - // commands without the pipe in the right place. Say that it can't be - // classified. - if (strchr(c, '|') != NULL) { - KALDI_WARN << "Trying to classify wxfilename with pipe symbol in the" - " wrong place (pipe without | at the beginning?): " - << filename; - return kNoOutput; - } - return kFileOutput; // It matched no other pattern: assume it's a filename. -} - -InputType ClassifyRxfilename(const std::string &filename) { - const char *c = filename.c_str(); - size_t length = filename.length(); - char first_char = c[0], - last_char = (length == 0 ? '\0' : c[filename.length() - 1]); - - // if 'filename' is "" or "-", return kStandardInput. - if (length == 0 || (length == 1 && first_char == '-')) { - return kStandardInput; - } else if (first_char == '|') { - return kNoInput; // An output pipe like "|blah": not - // valid for input. - } else if (last_char == '|') { - return kPipeInput; - } else if (isspace(first_char) || isspace(last_char)) { - return kNoInput; // We don't allow leading or trailing space in a filename. - // } else if ((first_char == 'a' || first_char == 's') && - // strchr(c, ':') != NULL && - // (ClassifyWspecifier(filename, NULL, NULL, NULL) != - // kNoWspecifier || - // ClassifyRspecifier(filename, NULL, NULL) != kNoRspecifier)) { - // // e.g. ark:something or scp:something... this is almost certainly a - // // scripting error, so call it an error rather than treating it as a - // file. - // // In practice in modern kaldi scripts all (r,w)filenames begin with - // "ark" - // // or "scp", even though technically speaking options like "b", "t", - // "s" or - // // "cs" can appear before the ark or scp, like "b,ark". For - // efficiency, - // // and because this code is really just a nicety to catch errors - // earlier - // // than they would otherwise be caught, we only call those extra - // functions - // // for filenames beginning with 'a' or 's'. - // return kNoInput; - } else if (isdigit(last_char)) { - const char *d = c + length - 1; - while (isdigit(*d) && d > c) d--; - if (*d == ':') - return kOffsetFileInput; // Filename is like - // some_file:12345 - // otherwise it could still be a filename; continue to the next check. - } - - // At this point it matched no other pattern so we assume a filename, but - // we check for '|' as it's a common source of errors to have pipe - // commands without the pipe in the right place. Say that it can't be - // classified in this case. - if (strchr(c, '|') != NULL) { - KALDI_WARN << "Trying to classify rxfilename with pipe symbol in the" - " wrong place (pipe without | at the end?): " - << filename; - return kNoInput; - } - return kFileInput; // It matched no other pattern: assume it's a filename. -} - -class OutputImplBase { - public: - // Open will open it as a file (no header), and return true - // on success. It cannot be called on an already open stream. 
- virtual bool Open(const std::string &filename, bool binary) = 0; - virtual std::ostream &Stream() = 0; - virtual bool Close() = 0; - virtual ~OutputImplBase() {} -}; - -class FileOutputImpl : public OutputImplBase { - public: - virtual bool Open(const std::string &filename, bool binary) { - if (os_.is_open()) - KALDI_ERR << "FileOutputImpl::Open(), " - << "open called on already open file."; - filename_ = filename; - os_.open(MapOsPath(filename_).c_str(), - binary ? std::ios_base::out | std::ios_base::binary - : std::ios_base::out); - return os_.is_open(); - } - - virtual std::ostream &Stream() { - if (!os_.is_open()) - KALDI_ERR << "FileOutputImpl::Stream(), file is not open."; - // I believe this error can only arise from coding error. - return os_; - } - - virtual bool Close() { - if (!os_.is_open()) - KALDI_ERR << "FileOutputImpl::Close(), file is not open."; - // I believe this error can only arise from coding error. - os_.close(); - return !(os_.fail()); - } - virtual ~FileOutputImpl() { - if (os_.is_open()) { - os_.close(); - if (os_.fail()) KALDI_ERR << "Error closing output file " << filename_; - } - } - - private: - std::string filename_; - std::ofstream os_; -}; - -class StandardOutputImpl : public OutputImplBase { - public: - StandardOutputImpl() : is_open_(false) {} - - virtual bool Open(const std::string &filename, bool binary) { - if (is_open_) - KALDI_ERR << "StandardOutputImpl::Open(), " - "open called on already open file."; -#ifdef _MSC_VER - _setmode(_fileno(stdout), binary ? _O_BINARY : _O_TEXT); -#endif - is_open_ = std::cout.good(); - return is_open_; - } - - virtual std::ostream &Stream() { - if (!is_open_) - KALDI_ERR << "StandardOutputImpl::Stream(), object not initialized."; - // I believe this error can only arise from coding error. - return std::cout; - } - - virtual bool Close() { - if (!is_open_) - KALDI_ERR << "StandardOutputImpl::Close(), file is not open."; - is_open_ = false; - std::cout << std::flush; - return !(std::cout.fail()); - } - virtual ~StandardOutputImpl() { - if (is_open_) { - std::cout << std::flush; - if (std::cout.fail()) KALDI_ERR << "Error writing to standard output"; - } - } - - private: - bool is_open_; -}; - -class PipeOutputImpl : public OutputImplBase { - public: - PipeOutputImpl() : f_(NULL), os_(NULL) {} - - virtual bool Open(const std::string &wxfilename, bool binary) { - filename_ = wxfilename; - KALDI_ASSERT(f_ == NULL); // Make sure closed. - KALDI_ASSERT(wxfilename.length() != 0 && wxfilename[0] == '|'); // should - // start with '|' - std::string cmd_name(wxfilename, 1); -#if defined(_MSC_VER) || defined(__CYGWIN__) - f_ = popen(cmd_name.c_str(), (binary ? "wb" : "w")); -#else - f_ = popen(cmd_name.c_str(), "w"); -#endif - if (!f_) { // Failure. - KALDI_WARN << "Failed opening pipe for writing, command is: " << cmd_name - << ", errno is " << strerror(errno); - return false; - } else { -#ifndef _MSC_VER - fb_ = new PipebufType(f_, // Using this constructor won't make the - // destructor try to close the stream when - // we're done. - (binary ? std::ios_base::out | std::ios_base::binary - : std::ios_base::out)); - KALDI_ASSERT(fb_ != NULL); // or would be alloc error. - os_ = new std::ostream(fb_); -#else - os_ = new std::ofstream(f_); -#endif - return os_->good(); - } - } - - virtual std::ostream &Stream() { - if (os_ == NULL) - KALDI_ERR << "PipeOutputImpl::Stream()," - " object not initialized."; - // I believe this error can only arise from coding error. 
- return *os_; - } - - virtual bool Close() { - if (os_ == NULL) KALDI_ERR << "PipeOutputImpl::Close(), file is not open."; - bool ok = true; - os_->flush(); - if (os_->fail()) ok = false; - delete os_; - os_ = NULL; - int status; -#ifdef _MSC_VER - status = _pclose(f_); -#else - status = pclose(f_); -#endif - if (status) - KALDI_WARN << "Pipe " << filename_ << " had nonzero return status " - << status; - f_ = NULL; -#ifndef _MSC_VER - delete fb_; - fb_ = NULL; -#endif - return ok; - } - virtual ~PipeOutputImpl() { - if (os_) { - if (!Close()) - KALDI_ERR << "Error writing to pipe " << PrintableWxfilename(filename_); - } - } - - private: - std::string filename_; - FILE *f_; -#ifndef _MSC_VER - PipebufType *fb_; -#endif - std::ostream *os_; -}; - -class InputImplBase { - public: - // Open will open it as a file, and return true on success. - // May be called twice only for kOffsetFileInput (otherwise, - // if called twice, we just create a new Input object, to avoid - // having to deal with the extra hassle of reopening with the - // same object. - // Note that we will to call Open with true (binary) for - // for text-mode Kaldi files; the only actual text-mode input - // is for non-Kaldi files. - virtual bool Open(const std::string &filename, bool binary) = 0; - virtual std::istream &Stream() = 0; - virtual int32 Close() = 0; // We only need to check failure in the case of - // kPipeInput. - // on close for input streams. - virtual InputType MyType() = 0; // Because if it's kOffsetFileInput, we may - // call Open twice - // (has efficiency benefits). - - virtual ~InputImplBase() {} -}; - -class FileInputImpl : public InputImplBase { - public: - virtual bool Open(const std::string &filename, bool binary) { - if (is_.is_open()) - KALDI_ERR << "FileInputImpl::Open(), " - << "open called on already open file."; - is_.open( - MapOsPath(filename).c_str(), - binary ? std::ios_base::in | std::ios_base::binary : std::ios_base::in); - return is_.is_open(); - } - - virtual std::istream &Stream() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Stream(), file is not open."; - // I believe this error can only arise from coding error. - return is_; - } - - virtual int32 Close() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Close(), file is not open."; - // I believe this error can only arise from coding error. - is_.close(); - // Don't check status. - return 0; - } - - virtual InputType MyType() { return kFileInput; } - - virtual ~FileInputImpl() { - // Stream will automatically be closed, and we don't care about - // whether it fails. - } - - private: - std::ifstream is_; -}; - -class StandardInputImpl : public InputImplBase { - public: - StandardInputImpl() : is_open_(false) {} - - virtual bool Open(const std::string &filename, bool binary) { - if (is_open_) - KALDI_ERR << "StandardInputImpl::Open(), " - "open called on already open file."; - is_open_ = true; -#ifdef _MSC_VER - _setmode(_fileno(stdin), binary ? _O_BINARY : _O_TEXT); -#endif - return true; // Don't check good() because would be false if - // eof, which may be valid input. - } - - virtual std::istream &Stream() { - if (!is_open_) - KALDI_ERR << "StandardInputImpl::Stream(), object not initialized."; - // I believe this error can only arise from coding error. 
- return std::cin; - } - - virtual InputType MyType() { return kStandardInput; } - - virtual int32 Close() { - if (!is_open_) KALDI_ERR << "StandardInputImpl::Close(), file is not open."; - is_open_ = false; - return 0; - } - virtual ~StandardInputImpl() {} - - private: - bool is_open_; -}; - -class PipeInputImpl : public InputImplBase { - public: - PipeInputImpl() : f_(NULL), is_(NULL) {} - - virtual bool Open(const std::string &rxfilename, bool binary) { - filename_ = rxfilename; - KALDI_ASSERT(f_ == NULL); // Make sure closed. - KALDI_ASSERT(rxfilename.length() != 0 && - rxfilename[rxfilename.length() - 1] == - '|'); // should end with '|' - std::string cmd_name(rxfilename, 0, rxfilename.length() - 1); -#if defined(_MSC_VER) || defined(__CYGWIN__) - f_ = popen(cmd_name.c_str(), (binary ? "rb" : "r")); -#else - f_ = popen(cmd_name.c_str(), "r"); -#endif - - if (!f_) { // Failure. - KALDI_WARN << "Failed opening pipe for reading, command is: " << cmd_name - << ", errno is " << strerror(errno); - return false; - } else { -#ifndef _MSC_VER - fb_ = new PipebufType(f_, // Using this constructor won't lead the - // destructor to close the stream. - (binary ? std::ios_base::in | std::ios_base::binary - : std::ios_base::in)); - KALDI_ASSERT(fb_ != NULL); // or would be alloc error. - is_ = new std::istream(fb_); -#else - is_ = new std::ifstream(f_); -#endif - if (is_->fail() || is_->bad()) return false; - if (is_->eof()) { - KALDI_WARN << "Pipe opened with command " - << PrintableRxfilename(rxfilename) << " is empty."; - // don't return false: empty may be valid. - } - return true; - } - } - - virtual std::istream &Stream() { - if (is_ == NULL) - KALDI_ERR << "PipeInputImpl::Stream(), object not initialized."; - // I believe this error can only arise from coding error. - return *is_; - } - - virtual int32 Close() { - if (is_ == NULL) KALDI_ERR << "PipeInputImpl::Close(), file is not open."; - delete is_; - is_ = NULL; - int32 status; -#ifdef _MSC_VER - status = _pclose(f_); -#else - status = pclose(f_); -#endif - if (status) - KALDI_WARN << "Pipe " << filename_ << " had nonzero return status " - << status; - f_ = NULL; -#ifndef _MSC_VER - delete fb_; - fb_ = NULL; -#endif - return status; - } - virtual ~PipeInputImpl() { - if (is_) Close(); - } - virtual InputType MyType() { return kPipeInput; } - - private: - std::string filename_; - FILE *f_; -#ifndef _MSC_VER - PipebufType *fb_; -#endif - std::istream *is_; -}; - -/* -#else - -// Just have an empty implementation of the pipe input that crashes if -// called. -class PipeInputImpl: public InputImplBase { - public: - PipeInputImpl() { KALDI_ASSERT(0 && "Pipe input not yet supported on this - platform."); } - virtual bool Open(const std::string, bool) { return 0; } - virtual std::istream &Stream() const { return NULL; } - virtual void Close() {} - virtual InputType MyType() { return kPipeInput; } -}; - -#endif -*/ - -class OffsetFileInputImpl : public InputImplBase { - // This class is a bit more complicated than the - - public: - // splits a filename like /my/file:123 into /my/file and the - // number 123. Crashes if not this format. - static void SplitFilename(const std::string &rxfilename, - std::string *filename, size_t *offset) { - size_t pos = rxfilename.find_last_of(':'); - KALDI_ASSERT(pos != std::string::npos); // would indicate error in calling - // code, as the filename is supposed to be of the correct form at this - // point. 
- *filename = std::string(rxfilename, 0, pos); - std::string number(rxfilename, pos + 1); - bool ans = ConvertStringToInteger(number, offset); - if (!ans) - KALDI_ERR << "Cannot get offset from filename " << rxfilename - << " (possibly you compiled in 32-bit and have a >32-bit" - << " byte offset into a file; you'll have to compile 64-bit."; - } - - bool Seek(size_t offset) { - size_t cur_pos = is_.tellg(); - if (cur_pos == offset) { - return true; - } else if (cur_pos < offset && cur_pos + 100 > offset) { - // We're close enough that it may be faster to just - // read that data, rather than seek. - for (size_t i = cur_pos; i < offset; i++) is_.get(); - return (is_.tellg() == std::streampos(offset)); - } - // Try to actually seek. - is_.seekg(offset, std::ios_base::beg); - if (is_.fail()) { // failbit or badbit is set [error happened] - is_.close(); - return false; // failure. - } else { - is_.clear(); // Clear any failure bits (e.g. eof). - return true; // success. - } - } - - // This Open routine is unusual in that it is designed to work even - // if it was already open. This for efficiency when seeking multiple - // times. - virtual bool Open(const std::string &rxfilename, bool binary) { - if (is_.is_open()) { - // We are opening when we have an already-open file. - // We may have to seek within this file, or else close it and - // open a different one. - std::string tmp_filename; - size_t offset; - SplitFilename(rxfilename, &tmp_filename, &offset); - if (tmp_filename == filename_ && binary == binary_) { // Just seek - is_.clear(); // clear fail bit, etc. - return Seek(offset); - } else { - is_.close(); // don't bother checking error status of is_. - filename_ = tmp_filename; - is_.open(MapOsPath(filename_).c_str(), - binary ? std::ios_base::in | std::ios_base::binary - : std::ios_base::in); - if (!is_.is_open()) - return false; - else - return Seek(offset); - } - } else { - size_t offset; - SplitFilename(rxfilename, &filename_, &offset); - binary_ = binary; - is_.open(MapOsPath(filename_).c_str(), - binary ? std::ios_base::in | std::ios_base::binary - : std::ios_base::in); - if (!is_.is_open()) - return false; - else - return Seek(offset); - } - } - - virtual std::istream &Stream() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Stream(), file is not open."; - // I believe this error can only arise from coding error. - return is_; - } - - virtual int32 Close() { - if (!is_.is_open()) - KALDI_ERR << "FileInputImpl::Close(), file is not open."; - // I believe this error can only arise from coding error. - is_.close(); - // Don't check status. - return 0; - } - - virtual InputType MyType() { return kOffsetFileInput; } - - virtual ~OffsetFileInputImpl() { - // Stream will automatically be closed, and we don't care about - // whether it fails. - } - - private: - std::string filename_; // the actual filename - bool binary_; // true if was opened in binary mode. - std::ifstream is_; -}; - -Output::Output(const std::string &wxfilename, bool binary, bool write_header) - : impl_(NULL) { - if (!Open(wxfilename, binary, write_header)) { - if (impl_) { - delete impl_; - impl_ = NULL; - } - KALDI_ERR << "Error opening output stream " - << PrintableWxfilename(wxfilename); - } -} - -bool Output::Close() { - if (!impl_) { - return false; // error to call Close if not open. 
- } else { - bool ans = impl_->Close(); - delete impl_; - impl_ = NULL; - return ans; - } -} - -Output::~Output() { - if (impl_) { - bool ok = impl_->Close(); - delete impl_; - impl_ = NULL; - if (!ok) - KALDI_ERR << "Error closing output file " - << PrintableWxfilename(filename_) - << (ClassifyWxfilename(filename_) == kFileOutput - ? " (disk full?)" - : ""); - } -} - -std::ostream &Output::Stream() { // will throw if not open; else returns - // stream. - if (!impl_) KALDI_ERR << "Output::Stream() called but not open."; - return impl_->Stream(); -} - -bool Output::Open(const std::string &wxfn, bool binary, bool header) { - if (IsOpen()) { - if (!Close()) { // Throw here rather than return status, as it's an error - // about something else: if the user wanted to avoid the exception he/she - // could have called Close(). - KALDI_ERR << "Output::Open(), failed to close output stream: " - << PrintableWxfilename(filename_); - } - } - - filename_ = wxfn; - - OutputType type = ClassifyWxfilename(wxfn); - KALDI_ASSERT(impl_ == NULL); - - if (type == kFileOutput) { - impl_ = new FileOutputImpl(); - } else if (type == kStandardOutput) { - impl_ = new StandardOutputImpl(); - } else if (type == kPipeOutput) { - impl_ = new PipeOutputImpl(); - } else { // type == kNoOutput - KALDI_WARN << "Invalid output filename format " - << PrintableWxfilename(wxfn); - return false; - } - if (!impl_->Open(wxfn, binary)) { - delete impl_; - impl_ = NULL; - return false; // failed to open. - } else { // successfully opened it. - if (header) { - InitKaldiOutputStream(impl_->Stream(), binary); - bool ok = impl_->Stream().good(); // still OK? - if (!ok) { - delete impl_; - impl_ = NULL; - return false; - } - return true; - } else { - return true; - } - } -} - -Input::Input(const std::string &rxfilename, bool *binary) : impl_(NULL) { - if (!Open(rxfilename, binary)) { - KALDI_ERR << "Error opening input stream " - << PrintableRxfilename(rxfilename); - } -} - -int32 Input::Close() { - if (impl_) { - int32 ans = impl_->Close(); - delete impl_; - impl_ = NULL; - return ans; - } else { - return 0; - } -} - -bool Input::OpenInternal(const std::string &rxfilename, bool file_binary, - bool *contents_binary) { - InputType type = ClassifyRxfilename(rxfilename); - if (IsOpen()) { - // May have to close the stream first. - if (type == kOffsetFileInput && impl_->MyType() == kOffsetFileInput) { - // We want to use the same object to Open... this is in case - // the files are the same, so we can just seek. - if (!impl_->Open(rxfilename, file_binary)) { // true is binary mode-- - // always open in binary. - delete impl_; - impl_ = NULL; - return false; - } - // read the binary header, if requested. - if (contents_binary != NULL) - return InitKaldiInputStream(impl_->Stream(), contents_binary); - else - return true; - } else { - Close(); - // and fall through to code below which actually opens the file. - } - } - if (type == kFileInput) { - impl_ = new FileInputImpl(); - } else if (type == kStandardInput) { - impl_ = new StandardInputImpl(); - } else if (type == kPipeInput) { - impl_ = new PipeInputImpl(); - } else if (type == kOffsetFileInput) { - impl_ = new OffsetFileInputImpl(); - } else { // type == kNoInput - KALDI_WARN << "Invalid input filename format " - << PrintableRxfilename(rxfilename); - return false; - } - if (!impl_->Open(rxfilename, file_binary)) { // true is binary mode-- - // always read in binary. 
- delete impl_; - impl_ = NULL; - return false; - } - if (contents_binary != NULL) - return InitKaldiInputStream(impl_->Stream(), contents_binary); - else - return true; -} - -Input::~Input() { - if (impl_) Close(); -} - -std::istream &Input::Stream() { - if (!IsOpen()) KALDI_ERR << "Input::Stream(), not open."; - return impl_->Stream(); -} - -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m) { -// if (!filename.empty() && filename[filename.size() - 1] == ']') { -// // This filename seems to have a 'range'... like foo.ark:4312423[20:30]. -// // (the bit in square brackets is the range). -// std::string rxfilename, range; -// if (!ExtractRangeSpecifier(filename, &rxfilename, &range)) { -// KALDI_ERR << "Could not make sense of possible range specifier in -// filename " -// << "while reading matrix: " << filename; -// } -// Matrix temp; -// bool binary_in; -// Input ki(rxfilename, &binary_in); -// temp.Read(ki.Stream(), binary_in); -// if (!ExtractObjectRange(temp, range, m)) { -// KALDI_ERR << "Error extracting range of object: " << filename; -// } -// } else { -// // The normal case, there is no range. -// bool binary_in; -// Input ki(filename, &binary_in); -// m->Read(ki.Stream(), binary_in); -// } -// } -// -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m) { -// if (!filename.empty() && filename[filename.size() - 1] == ']') { -// // This filename seems to have a 'range'... like foo.ark:4312423[20:30]. -// // (the bit in square brackets is the range). -// std::string rxfilename, range; -// if (!ExtractRangeSpecifier(filename, &rxfilename, &range)) { -// KALDI_ERR << "Could not make sense of possible range specifier in -// filename " -// << "while reading matrix: " << filename; -// } -// Matrix temp; -// bool binary_in; -// Input ki(rxfilename, &binary_in); -// temp.Read(ki.Stream(), binary_in); -// if (!ExtractObjectRange(temp, range, m)) { -// KALDI_ERR << "Error extracting range of object: " << filename; -// } -// } else { -// // The normal case, there is no range. -// bool binary_in; -// Input ki(filename, &binary_in); -// m->Read(ki.Stream(), binary_in); -// } -// } - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/kaldi-io.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/kaldi-io.h deleted file mode 100644 index 2175ca8f89ed5f3e3bade26528e924208df692c6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/kaldi-io.h +++ /dev/null @@ -1,266 +0,0 @@ -// util/kaldi-io.h - -// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky -// 2016 Xiaohui Zhang - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
-#ifndef KALDI_UTIL_KALDI_IO_H_ -#define KALDI_UTIL_KALDI_IO_H_ - -#ifdef _MSC_VER -#include -#include -#endif -#include // For isspace. -#include -#include -#include "base/kaldi-common.h" -// #include "matrix/kaldi-matrix.h" - -namespace kaldi { - -class OutputImplBase; // Forward decl; defined in a .cc file -class InputImplBase; // Forward decl; defined in a .cc file - -/// \addtogroup io_group -/// @{ - -// The Output and Input classes handle stream-opening for "extended" filenames -// that include actual files, standard-input/standard-output, pipes, and -// offsets into actual files. They also handle reading and writing the -// binary-mode headers for Kaldi files, where applicable. The classes have -// versions of the Open routines that throw and do not throw, depending whether -// the calling code wants to catch the errors or not; there are also versions -// that write (or do not write) the Kaldi binary-mode header that says if it's -// binary mode. Generally files that contain Kaldi objects will have the header -// on, so we know upon reading them whether they have the header. So you would -// use the OpenWithHeader routines for these (or the constructor); but other -// types of objects (e.g. FSTs) would have files without a header so you would -// use OpenNoHeader. - -// We now document the types of extended filenames that we use. -// -// A "wxfilename" is an extended filename for writing. It can take three forms: -// (1) Filename: e.g. "/some/filename", "./a/b/c", "c:\Users\dpovey\My -// Documents\\boo" -// (whatever the actual file-system interprets) -// (2) Standard output: "" or "-" -// (3) A pipe: e.g. "| gzip -c > /tmp/abc.gz" -// -// -// A "rxfilename" is an extended filename for reading. It can take four forms: -// (1) An actual filename, whatever the file-system can read, e.g. "/my/file". -// (2) Standard input: "" or "-" -// (3) A pipe: e.g. "gunzip -c /tmp/abc.gz |" -// (4) An offset into a file, e.g.: "/mnt/blah/data/1.ark:24871" -// [these are created by the Table and TableWriter classes; I may also write -// a program that creates them for arbitrary files] -// - -// Typical usage: -// ... -// bool binary; -// MyObject.Write(Output(some_filename, binary).Stream(), binary); -// -// ... more extensive example: -// { -// Output ko(some_filename, binary); -// MyObject1.Write(ko.Stream(), binary); -// MyObject2.Write(ko.Stream(), binary); -// } - -enum OutputType { kNoOutput, kFileOutput, kStandardOutput, kPipeOutput }; - -/// ClassifyWxfilename interprets filenames as follows: -/// - kNoOutput: invalid filenames (leading or trailing space, things that look -/// like wspecifiers and rspecifiers or like pipes to read from with leading -/// |. -/// - kFileOutput: Normal filenames -/// - kStandardOutput: The empty string or "-", interpreted as standard output -/// - kPipeOutput: pipes, e.g. "| gzip -c > /tmp/abc.gz" -OutputType ClassifyWxfilename(const std::string &wxfilename); - -enum InputType { - kNoInput, - kFileInput, - kStandardInput, - kOffsetFileInput, - kPipeInput -}; - -/// ClassifyRxfilenames interprets filenames for reading as follows: -/// - kNoInput: invalid filenames (leading or trailing space, things that -/// look like wspecifiers and rspecifiers or pipes to write to -/// with trailing |. -/// - kFileInput: normal filenames -/// - kStandardInput: the empty string or "-" -/// - kPipeInput: e.g. "gunzip -c /tmp/abc.gz |" -/// - kOffsetFileInput: offsets into files, e.g. 
/some/filename:12970 -InputType ClassifyRxfilename(const std::string &rxfilename); - -class Output { - public: - // The normal constructor, provided for convenience. - // Equivalent to calling with default constructor then Open() - // with these arguments. - Output(const std::string &filename, bool binary, bool write_header = true); - - Output() : impl_(NULL) {} - - /// This opens the stream, with the given mode (binary or text). It returns - /// true on success and false on failure. However, it will throw if something - /// was already open and could not be closed (to avoid this, call Close() - /// first. if write_header == true and binary == true, it writes the Kaldi - /// binary-mode header ('\0' then 'B'). You may call Open even if it is - /// already open; it will close the existing stream and reopen (however if - /// closing the old stream failed it will throw). - bool Open(const std::string &wxfilename, bool binary, bool write_header); - - inline bool IsOpen(); // return true if we have an open stream. Does not - // imply stream is good for writing. - - std::ostream &Stream(); // will throw if not open; else returns stream. - - // Close closes the stream. Calling Close is never necessary unless you - // want to avoid exceptions being thrown. There are times when calling - // Close will hurt efficiency (basically, when using offsets into files, - // and using the same Input object), - // but most of the time the user won't be doing this directly, it will - // be done in kaldi-table.{h, cc}, so you don't have to worry about it. - bool Close(); - - // This will throw if stream could not be closed (to check error status, - // call Close()). - ~Output(); - - private: - OutputImplBase *impl_; // non-NULL if open. - std::string filename_; - KALDI_DISALLOW_COPY_AND_ASSIGN(Output); -}; - -// bool binary_in; -// Input ki(some_filename, &binary_in); -// MyObject.Read(ki.Stream(), binary_in); -// -// ... more extensive example: -// -// { -// bool binary_in; -// Input ki(some_filename, &binary_in); -// MyObject1.Read(ki.Stream(), &binary_in); -// MyObject2.Write(ki.Stream(), &binary_in); -// } -// Note that to catch errors you need to use try.. catch. -// Input communicates errors by throwing exceptions. - -// Input interprets four kinds of filenames: -// (1) Normal filenames -// (2) The empty string or "-", interpreted as standard output -// (3) A pipe: e.g. "gunzip -c /tmp/abc.gz |" -// (4) Offsets into [real] files, e.g. "/my/filename:12049" -// The last one has no correspondence in Output. - -class Input { - public: - /// The normal constructor. Opens the stream in binary mode. - /// Equivalent to calling the default constructor followed by Open(); then, if - /// binary != NULL, it calls ReadHeader(), putting the output in "binary"; it - /// throws on error. - explicit Input(const std::string &rxfilename, bool *contents_binary = NULL); - - Input() : impl_(NULL) {} - - // Open opens the stream for reading (the mode, where relevant, is binary; use - // OpenTextMode for text-mode, we made this a separate function rather than a - // boolean argument, to avoid confusion with Kaldi's text/binary distinction, - // since reading in the file system's text mode is unusual.) If - // contents_binary != NULL, it reads the binary-mode header and puts it in the - // "binary" variable. Returns true on success. If it returns false it will - // not be open. 
You may call Open even if it is already open; it will close - // the existing stream and reopen (however if closing the old stream failed it - // will throw). - inline bool Open(const std::string &rxfilename, bool *contents_binary = NULL); - - // As Open but (if the file system has text/binary modes) opens in text mode; - // you shouldn't ever have to use this as in Kaldi we read even text files in - // binary mode (and ignore the \r). - inline bool OpenTextMode(const std::string &rxfilename); - - // Return true if currently open for reading and Stream() will - // succeed. Does not guarantee that the stream is good. - inline bool IsOpen(); - - // It is never necessary or helpful to call Close, except if - // you are concerned about to many filehandles being open. - // Close does not throw. It returns the exit code as int32 - // in the case of a pipe [kPipeInput], and always zero otherwise. - int32 Close(); - - // Returns the underlying stream. Throws if !IsOpen() - std::istream &Stream(); - - // Destructor does not throw: input streams may legitimately fail so we - // don't worry about the status when we close them. - ~Input(); - - private: - bool OpenInternal(const std::string &rxfilename, bool file_binary, - bool *contents_binary); - InputImplBase *impl_; - KALDI_DISALLOW_COPY_AND_ASSIGN(Input); -}; - -template -void ReadKaldiObject(const std::string &filename, C *c) { - bool binary_in; - Input ki(filename, &binary_in); - c->Read(ki.Stream(), binary_in); -} - -// Specialize the template for reading matrices, because we want to be able to -// support reading 'ranges' (row and column ranges), like foo.mat[10:20]. -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m); -// -// -// template <> void ReadKaldiObject(const std::string &filename, -// Matrix *m); - -template -inline void WriteKaldiObject(const C &c, const std::string &filename, - bool binary) { - Output ko(filename, binary); - c.Write(ko.Stream(), binary); -} - -/// PrintableRxfilename turns the rxfilename into a more human-readable -/// form for error reporting, i.e. it does quoting and escaping and -/// replaces "" or "-" with "standard input". -std::string PrintableRxfilename(const std::string &rxfilename); - -/// PrintableWxfilename turns the wxfilename into a more human-readable -/// form for error reporting, i.e. it does quoting and escaping and -/// replaces "" or "-" with "standard output". -std::string PrintableWxfilename(const std::string &wxfilename); - -/// @} - -} // end namespace kaldi. - -#include "util/kaldi-io-inl.h" - -#endif // KALDI_UTIL_KALDI_IO_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/kaldi-pipebuf.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/kaldi-pipebuf.h deleted file mode 100644 index bcee80ccb1a6fa8ce3195483ac144c5ff66d2f89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/kaldi-pipebuf.h +++ /dev/null @@ -1,86 +0,0 @@ -// util/kaldi-pipebuf.h - -// Copyright 2009-2011 Ondrej Glembek - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -/** @file kaldi-pipebuf.h - * This is an Kaldi C++ Library header. - */ - -#ifndef KALDI_UTIL_KALDI_PIPEBUF_H_ -#define KALDI_UTIL_KALDI_PIPEBUF_H_ - -#include -#if !defined(_LIBCPP_VERSION) // libc++ -#include -#else -#include "util/basic-filebuf.h" -#endif - -namespace kaldi { -// This class provides a way to initialize a filebuf with a FILE* pointer -// directly; it will not close the file pointer when it is deleted. -// The C++ standard does not allow implementations of C++ to provide -// this constructor within basic_filebuf, which makes it hard to deal -// with pipes using completely native C++. This is a workaround - -#ifdef _MSC_VER -#elif defined(_LIBCPP_VERSION) // libc++ -template > -class basic_pipebuf : public basic_filebuf { - public: - typedef basic_pipebuf ThisType; - - public: - basic_pipebuf(FILE *fptr, std::ios_base::openmode mode) - : basic_filebuf() { - this->open(fptr, mode); - if (!this->is_open()) { - KALDI_WARN << "Error initializing pipebuf"; // probably indicates - // code error, if the fptr was good. - return; - } - } -}; // class basic_pipebuf -#else -template > -class basic_pipebuf : public std::basic_filebuf { - public: - typedef basic_pipebuf ThisType; - - public: - basic_pipebuf(FILE *fptr, std::ios_base::openmode mode) - : std::basic_filebuf() { - this->_M_file.sys_open(fptr, mode); - if (!this->is_open()) { - KALDI_WARN << "Error initializing pipebuf"; // probably indicates - // code error, if the fptr was good. - return; - } - this->_M_mode = mode; - this->_M_buf_size = BUFSIZ; - this->_M_allocate_internal_buffer(); - this->_M_reading = false; - this->_M_writing = false; - this->_M_set_buffer(-1); - } -}; // class basic_pipebuf -#endif // _MSC_VER - -} // namespace kaldi - -#endif // KALDI_UTIL_KALDI_PIPEBUF_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/parse-options.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/parse-options.cc deleted file mode 100644 index 1f2ef844d28d67ed58d2e0c9d7c7b674e8209df8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/parse-options.cc +++ /dev/null @@ -1,636 +0,0 @@ -// util/parse-options.cc - -// Copyright 2009-2011 Karel Vesely; Microsoft Corporation; -// Saarland University (Author: Arnab Ghoshal); -// Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey); -// Frantisek Skala; Arnab Ghoshal -// Copyright 2013 Tanel Alumae -// -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include - -#include "base/kaldi-common.h" -#include "util/parse-options.h" -#include "util/text-utils.h" - -namespace kaldi { - -ParseOptions::ParseOptions(const std::string &prefix, OptionsItf *other) - : print_args_(false), help_(false), usage_(""), argc_(0), argv_(NULL) { - ParseOptions *po = dynamic_cast(other); - if (po != NULL && po->other_parser_ != NULL) { - // we get here if this constructor is used twice, recursively. - other_parser_ = po->other_parser_; - } else { - other_parser_ = other; - } - if (po != NULL && po->prefix_ != "") { - prefix_ = po->prefix_ + std::string(".") + prefix; - } else { - prefix_ = prefix; - } -} - -void ParseOptions::Register(const std::string &name, bool *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, int32 *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, uint32 *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, float *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, double *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -void ParseOptions::Register(const std::string &name, std::string *ptr, - const std::string &doc) { - RegisterTmpl(name, ptr, doc); -} - -// old-style, used for registering application-specific parameters -template -void ParseOptions::RegisterTmpl(const std::string &name, T *ptr, - const std::string &doc) { - if (other_parser_ == NULL) { - this->RegisterCommon(name, ptr, doc, false); - } else { - KALDI_ASSERT(prefix_ != "" && - "Cannot use empty prefix when registering with prefix."); - std::string new_name = prefix_ + '.' + name; // name becomes prefix.name - other_parser_->Register(new_name, ptr, doc); - } -} - -// does the common part of the job of registering a parameter -template -void ParseOptions::RegisterCommon(const std::string &name, T *ptr, - const std::string &doc, bool is_standard) { - KALDI_ASSERT(ptr != NULL); - std::string idx = name; - NormalizeArgName(&idx); - if (doc_map_.find(idx) != doc_map_.end()) - KALDI_WARN << "Registering option twice, ignoring second time: " << name; - this->RegisterSpecific(name, idx, ptr, doc, is_standard); -} - -// used to register standard parameters (those that are present in all of the -// applications) -template -void ParseOptions::RegisterStandard(const std::string &name, T *ptr, - const std::string &doc) { - this->RegisterCommon(name, ptr, doc, true); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, bool *b, - const std::string &doc, bool is_standard) { - bool_map_[idx] = b; - doc_map_[idx] = - DocInfo(name, doc + " (bool, default = " + ((*b) ? 
"true)" : "false)"), - is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, int32 *i, - const std::string &doc, bool is_standard) { - int_map_[idx] = i; - std::ostringstream ss; - ss << doc << " (int, default = " << *i << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, uint32 *u, - const std::string &doc, bool is_standard) { - uint_map_[idx] = u; - std::ostringstream ss; - ss << doc << " (uint, default = " << *u << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, float *f, - const std::string &doc, bool is_standard) { - float_map_[idx] = f; - std::ostringstream ss; - ss << doc << " (float, default = " << *f << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, double *f, - const std::string &doc, bool is_standard) { - double_map_[idx] = f; - std::ostringstream ss; - ss << doc << " (double, default = " << *f << ")"; - doc_map_[idx] = DocInfo(name, ss.str(), is_standard); -} - -void ParseOptions::RegisterSpecific(const std::string &name, - const std::string &idx, std::string *s, - const std::string &doc, bool is_standard) { - string_map_[idx] = s; - doc_map_[idx] = - DocInfo(name, doc + " (string, default = \"" + *s + "\")", is_standard); -} -void ParseOptions::DisableOption(const std::string &name) { - if (argv_ != NULL) - KALDI_ERR << "DisableOption must not be called after calling Read()."; - if (doc_map_.erase(name) == 0) - KALDI_ERR << "Option " << name - << " was not registered so cannot be disabled: "; - bool_map_.erase(name); - int_map_.erase(name); - uint_map_.erase(name); - float_map_.erase(name); - double_map_.erase(name); - string_map_.erase(name); -} - -int ParseOptions::NumArgs() const { return positional_args_.size(); } - -std::string ParseOptions::GetArg(int i) const { - // use KALDI_ERR if code error - if (i < 1 || i > static_cast(positional_args_.size())) - KALDI_ERR << "ParseOptions::GetArg, invalid index " << i; - return positional_args_[i - 1]; -} - -// We currently do not support any other options. -enum ShellType { kBash = 0 }; - -// This can be changed in the code if it ever does need to be changed (as it's -// unlikely that one compilation of this tool-set would use both shells). -static ShellType kShellType = kBash; - -// Returns true if we need to escape a string before putting it into -// a shell (mainly thinking of bash shell, but should work for others) -// This is for the convenience of the user so command-lines that are -// printed out by ParseOptions::Read (with --print-args=true) are -// paste-able into the shell and will run. If you use a different type of -// shell, it might be necessary to change this function. -// But it's mostly a cosmetic issue as it basically affects how -// the program echoes its command-line arguments to the screen. -static bool MustBeQuoted(const std::string &str, ShellType st) { - // Only Bash is supported (for the moment). - KALDI_ASSERT(st == kBash && "Invalid shell type."); - - const char *c = str.c_str(); - if (*c == '\0') { - return true; // Must quote empty string - } else { - const char *ok_chars[2]; - - // These seem not to be interpreted as long as there are no other "bad" - // characters involved (e.g. 
"," would be interpreted as part of something - // like a{b,c}, but not on its own. - ok_chars[kBash] = "[]~#^_-+=:.,/"; - - // Just want to make sure that a space character doesn't get automatically - // inserted here via an automated style-checking script, like it did before. - KALDI_ASSERT(!strchr(ok_chars[kBash], ' ')); - - for (; *c != '\0'; c++) { - // For non-alphanumeric characters we have a list of characters which - // are OK. All others are forbidden (this is easier since the shell - // interprets most non-alphanumeric characters). - if (!isalnum(*c)) { - const char *d; - for (d = ok_chars[st]; *d != '\0'; d++) - if (*c == *d) break; - // If not alphanumeric or one of the "ok_chars", it must be escaped. - if (*d == '\0') return true; - } - } - return false; // The string was OK. No quoting or escaping. - } -} - -// Returns a quoted and escaped version of "str" -// which has previously been determined to need escaping. -// Our aim is to print out the command line in such a way that if it's -// pasted into a shell of ShellType "st" (only bash for now), it -// will get passed to the program in the same way. -static std::string QuoteAndEscape(const std::string &str, ShellType st) { - // Only Bash is supported (for the moment). - KALDI_ASSERT(st == kBash && "Invalid shell type."); - - // For now we use the following rules: - // In the normal case, we quote with single-quote "'", and to escape - // a single-quote we use the string: '\'' (interpreted as closing the - // single-quote, putting an escaped single-quote from the shell, and - // then reopening the single quote). - char quote_char = '\''; - const char *escape_str = "'\\''"; // e.g. echo 'a'\''b' returns a'b - - // If the string contains single-quotes that would need escaping this - // way, and we determine that the string could be safely double-quoted - // without requiring any escaping, then we double-quote the string. - // This is the case if the characters "`$\ do not appear in the string. - // e.g. see http://www.redhat.com/mirrors/LDP/LDP/abs/html/quotingvar.html - const char *c_str = str.c_str(); - if (strchr(c_str, '\'') && !strpbrk(c_str, "\"`$\\")) { - quote_char = '"'; - escape_str = "\\\""; // should never be accessed. - } - - char buf[2]; - buf[1] = '\0'; - - buf[0] = quote_char; - std::string ans = buf; - const char *c = str.c_str(); - for (; *c != '\0'; c++) { - if (*c == quote_char) { - ans += escape_str; - } else { - buf[0] = *c; - ans += buf; - } - } - buf[0] = quote_char; - ans += buf; - return ans; -} - -// static function -std::string ParseOptions::Escape(const std::string &str) { - return MustBeQuoted(str, kShellType) ? QuoteAndEscape(str, kShellType) : str; -} - -int ParseOptions::Read(int argc, const char *const argv[]) { - argc_ = argc; - argv_ = argv; - std::string key, value; - int i; - if (argc > 0) { - // set global "const char*" g_program_name (name of the program) - // so it can be printed out in error messages; - // it's useful because often the stderr of different programs will - // be mixed together in the same log file. -#ifdef _MSC_VER - const char *c = strrchr(argv[0], '\\'); -#else - const char *c = strrchr(argv[0], '/'); -#endif - SetProgramName(c == NULL ? 
argv[0] : c + 1); - } - // first pass: look for config parameter, look for priority - for (i = 1; i < argc; i++) { - if (std::strncmp(argv[i], "--", 2) == 0) { - if (std::strcmp(argv[i], "--") == 0) { - // a lone "--" marks the end of named options - break; - } - bool has_equal_sign; - SplitLongArg(argv[i], &key, &value, &has_equal_sign); - NormalizeArgName(&key); - Trim(&value); - if (key.compare("config") == 0) { - ReadConfigFile(value); - } - if (key.compare("help") == 0) { - PrintUsage(); - exit(0); - } - } - } - bool double_dash_seen = false; - // second pass: add the command line options - for (i = 1; i < argc; i++) { - if (std::strncmp(argv[i], "--", 2) == 0) { - if (std::strcmp(argv[i], "--") == 0) { - // A lone "--" marks the end of named options. - // Skip that option and break the processing of named options - i += 1; - double_dash_seen = true; - break; - } - bool has_equal_sign; - SplitLongArg(argv[i], &key, &value, &has_equal_sign); - NormalizeArgName(&key); - Trim(&value); - if (!SetOption(key, value, has_equal_sign)) { - PrintUsage(true); - KALDI_ERR << "Invalid option " << argv[i]; - } - } else { - break; - } - } - - // process remaining arguments as positional - for (; i < argc; i++) { - if ((std::strcmp(argv[i], "--") == 0) && !double_dash_seen) { - double_dash_seen = true; - } else { - positional_args_.push_back(std::string(argv[i])); - } - } - - // if the user did not suppress this with --print-args = false.... - if (print_args_) { - std::ostringstream strm; - for (int j = 0; j < argc; j++) strm << Escape(argv[j]) << " "; - strm << '\n'; - std::cerr << strm.str() << std::flush; - } - return i; -} - -void ParseOptions::PrintUsage(bool print_command_line) { - std::cerr << '\n' << usage_ << '\n'; - DocMapType::iterator it; - // first we print application-specific options - bool app_specific_header_printed = false; - for (it = doc_map_.begin(); it != doc_map_.end(); ++it) { - if (it->second.is_standard_ == false) { // application-specific option - if (app_specific_header_printed == false) { // header was not yet printed - std::cerr << "Options:" << '\n'; - app_specific_header_printed = true; - } - std::cerr << " --" << std::setw(25) << std::left << it->second.name_ - << " : " << it->second.use_msg_ << '\n'; - } - } - if (app_specific_header_printed == true) { - std::cerr << '\n'; - } - - // then the standard options - std::cerr << "Standard options:" << '\n'; - for (it = doc_map_.begin(); it != doc_map_.end(); ++it) { - if (it->second.is_standard_ == true) { // we have standard option - std::cerr << " --" << std::setw(25) << std::left << it->second.name_ - << " : " << it->second.use_msg_ << '\n'; - } - } - std::cerr << '\n'; - if (print_command_line) { - std::ostringstream strm; - strm << "Command line was: "; - for (int j = 0; j < argc_; j++) strm << Escape(argv_[j]) << " "; - strm << '\n'; - std::cerr << strm.str() << std::flush; - } -} - -void ParseOptions::PrintConfig(std::ostream &os) { - os << '\n' << "[[ Configuration of UI-Registered options ]]" << '\n'; - std::string key; - DocMapType::iterator it; - for (it = doc_map_.begin(); it != doc_map_.end(); ++it) { - key = it->first; - os << it->second.name_ << " = "; - if (bool_map_.end() != bool_map_.find(key)) { - os << (*bool_map_[key] ? 
"true" : "false"); - } else if (int_map_.end() != int_map_.find(key)) { - os << (*int_map_[key]); - } else if (uint_map_.end() != uint_map_.find(key)) { - os << (*uint_map_[key]); - } else if (float_map_.end() != float_map_.find(key)) { - os << (*float_map_[key]); - } else if (double_map_.end() != double_map_.find(key)) { - os << (*double_map_[key]); - } else if (string_map_.end() != string_map_.find(key)) { - os << "'" << *string_map_[key] << "'"; - } else { - KALDI_ERR << "PrintConfig: unrecognized option " << key << "[code error]"; - } - os << '\n'; - } - os << '\n'; -} - -void ParseOptions::ReadConfigFile(const std::string &filename) { - std::ifstream is(filename.c_str(), std::ifstream::in); - if (!is.good()) { - KALDI_ERR << "Cannot open config file: " << filename; - } - - std::string line, key, value; - int32 line_number = 0; - while (std::getline(is, line)) { - line_number++; - // trim out the comments - size_t pos; - if ((pos = line.find_first_of('#')) != std::string::npos) { - line.erase(pos); - } - // skip empty lines - Trim(&line); - if (line.length() == 0) continue; - - if (line.substr(0, 2) != "--") { - KALDI_ERR << "Reading config file " << filename << ": line " - << line_number << " does not look like a line " - << "from a Kaldi command-line program's config file: should " - << "be of the form --x=y. Note: config files intended to " - << "be sourced by shell scripts lack the '--'."; - } - - // parse option - bool has_equal_sign; - SplitLongArg(line, &key, &value, &has_equal_sign); - NormalizeArgName(&key); - Trim(&value); - if (!SetOption(key, value, has_equal_sign)) { - PrintUsage(true); - KALDI_ERR << "Invalid option " << line << " in config file " << filename; - } - } -} - -void ParseOptions::SplitLongArg(const std::string &in, std::string *key, - std::string *value, bool *has_equal_sign) { - KALDI_ASSERT(in.substr(0, 2) == "--"); // precondition. - size_t pos = in.find_first_of('=', 0); - if (pos == std::string::npos) { // we allow --option for bools - // defaults to empty. We handle this differently in different cases. - *key = in.substr(2, in.size() - 2); // 2 because starts with --. - *value = ""; - *has_equal_sign = false; - } else if (pos == 2) { // we also don't allow empty keys: --=value - PrintUsage(true); - KALDI_ERR << "Invalid option (no key): " << in; - } else { // normal case: --option=value - *key = in.substr(2, pos - 2); // 2 because starts with --. 
- *value = in.substr(pos + 1); - *has_equal_sign = true; - } -} - -void ParseOptions::NormalizeArgName(std::string *str) { - std::string out; - std::string::iterator it; - - for (it = str->begin(); it != str->end(); ++it) { - if (*it == '_') - out += '-'; // convert _ to - - else - out += std::tolower(*it); - } - *str = out; - - KALDI_ASSERT(str->length() > 0); -} - -bool ParseOptions::SetOption(const std::string &key, const std::string &value, - bool has_equal_sign) { - if (bool_map_.end() != bool_map_.find(key)) { - if (has_equal_sign && value == "") - KALDI_ERR << "Invalid option --" << key << "="; - *(bool_map_[key]) = ToBool(value); - } else if (int_map_.end() != int_map_.find(key)) { - *(int_map_[key]) = ToInt(value); - } else if (uint_map_.end() != uint_map_.find(key)) { - *(uint_map_[key]) = ToUint(value); - } else if (float_map_.end() != float_map_.find(key)) { - *(float_map_[key]) = ToFloat(value); - } else if (double_map_.end() != double_map_.find(key)) { - *(double_map_[key]) = ToDouble(value); - } else if (string_map_.end() != string_map_.find(key)) { - if (!has_equal_sign) - KALDI_ERR << "Invalid option --" << key << " (option format is --x=y)."; - *(string_map_[key]) = value; - } else { - return false; - } - return true; -} - -bool ParseOptions::ToBool(std::string str) { - std::transform(str.begin(), str.end(), str.begin(), ::tolower); - - // allow "" as a valid option for "true", so that --x is the same as --x=true - if ((str.compare("true") == 0) || (str.compare("t") == 0) || - (str.compare("1") == 0) || (str.compare("") == 0)) { - return true; - } - if ((str.compare("false") == 0) || (str.compare("f") == 0) || - (str.compare("0") == 0)) { - return false; - } - // if it is neither true nor false: - PrintUsage(true); - KALDI_ERR << "Invalid format for boolean argument [expected true or false]: " - << str; - return false; // never reached -} - -int32 ParseOptions::ToInt(const std::string &str) { - int32 ret; - if (!ConvertStringToInteger(str, &ret)) - KALDI_ERR << "Invalid integer option \"" << str << "\""; - return ret; -} - -uint32 ParseOptions::ToUint(const std::string &str) { - uint32 ret; - if (!ConvertStringToInteger(str, &ret)) - KALDI_ERR << "Invalid integer option \"" << str << "\""; - return ret; -} - -float ParseOptions::ToFloat(const std::string &str) { - float ret; - if (!ConvertStringToReal(str, &ret)) - KALDI_ERR << "Invalid floating-point option \"" << str << "\""; - return ret; -} - -double ParseOptions::ToDouble(const std::string &str) { - double ret; - if (!ConvertStringToReal(str, &ret)) - KALDI_ERR << "Invalid floating-point option \"" << str << "\""; - return ret; -} - -// instantiate templates -template void ParseOptions::RegisterTmpl(const std::string &name, bool *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, int32 *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, uint32 *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, float *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, double *ptr, - const std::string &doc); -template void ParseOptions::RegisterTmpl(const std::string &name, - std::string *ptr, - const std::string &doc); - -template void ParseOptions::RegisterStandard(const std::string &name, bool *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - int32 *ptr, - const std::string &doc); 
-template void ParseOptions::RegisterStandard(const std::string &name, - uint32 *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - float *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - double *ptr, - const std::string &doc); -template void ParseOptions::RegisterStandard(const std::string &name, - std::string *ptr, - const std::string &doc); - -template void ParseOptions::RegisterCommon(const std::string &name, bool *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, int32 *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, uint32 *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, float *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, double *ptr, - const std::string &doc, - bool is_standard); -template void ParseOptions::RegisterCommon(const std::string &name, - std::string *ptr, - const std::string &doc, - bool is_standard); - -} // namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/parse-options.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/parse-options.h deleted file mode 100644 index 93a060f4a411dfd63298a91bb313e0b66d337a75..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/parse-options.h +++ /dev/null @@ -1,265 +0,0 @@ -// util/parse-options.h - -// Copyright 2009-2011 Karel Vesely; Microsoft Corporation; -// Saarland University (Author: Arnab Ghoshal); -// Copyright 2012-2013 Frantisek Skala; Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_PARSE_OPTIONS_H_ -#define KALDI_UTIL_PARSE_OPTIONS_H_ - -#include -#include -#include - -#include "base/kaldi-common.h" -#include "itf/options-itf.h" - -namespace kaldi { - -/// The class ParseOptions is for parsing command-line options; see -/// \ref parse_options for more documentation. -class ParseOptions : public OptionsItf { - public: - explicit ParseOptions(const char *usage) - : print_args_(true), - help_(false), - usage_(usage), - argc_(0), - argv_(NULL), - prefix_(""), - other_parser_(NULL) { -#if !defined(_MSC_VER) && \ - !defined(__CYGWIN__) // This is just a convenient place to set the stderr - // to line - setlinebuf(stderr); // buffering mode, since it's called at program start. -#endif // This helps ensure different programs' output is not mixed up. 
- RegisterStandard("config", &config_, - "Configuration file to read (this " - "option may be repeated)"); - RegisterStandard("print-args", &print_args_, - "Print the command line arguments (to stderr)"); - RegisterStandard("help", &help_, "Print out usage message"); - RegisterStandard("verbose", &g_kaldi_verbose_level, - "Verbose level (higher->more logging)"); - } - - /** - This is a constructor for the special case where some options are - registered with a prefix to avoid conflicts. The object thus created will - only be used temporarily to register an options class with the original - options parser (which is passed as the *other pointer) using the given - prefix. It should not be used for any other purpose, and the prefix must - not be the empty string. It seems to be the least bad way of implementing - options with prefixes at this point. - Example of usage is: - ParseOptions po; // original ParseOptions object - ParseOptions po_mfcc("mfcc", &po); // object with prefix. - MfccOptions mfcc_opts; - mfcc_opts.Register(&po_mfcc); - The options will now get registered as, e.g., --mfcc.frame-shift=10.0 - instead of just --frame-shift=10.0 - */ - ParseOptions(const std::string &prefix, OptionsItf *other); - - ~ParseOptions() {} - - // Methods from the interface - void Register(const std::string &name, bool *ptr, const std::string &doc); - void Register(const std::string &name, int32 *ptr, const std::string &doc); - void Register(const std::string &name, uint32 *ptr, const std::string &doc); - void Register(const std::string &name, float *ptr, const std::string &doc); - void Register(const std::string &name, double *ptr, const std::string &doc); - void Register(const std::string &name, std::string *ptr, - const std::string &doc); - - /// If called after registering an option and before calling - /// Read(), disables that option from being used. Will crash - /// at runtime if that option had not been registered. - void DisableOption(const std::string &name); - - /// This one is used for registering standard parameters of all the programs - template - void RegisterStandard(const std::string &name, T *ptr, - const std::string &doc); - - /** - Parses the command line options and fills the ParseOptions-registered - variables. This must be called after all the variables were registered!!! - - Initially the variables have implicit values, - then the config file values are set-up, - finally the command line values given. - Returns the first position in argv that was not used. - [typically not useful: use NumParams() and GetParam(). ] - */ - int Read(int argc, const char *const *argv); - - /// Prints the usage documentation [provided in the constructor]. - void PrintUsage(bool print_command_line = false); - /// Prints the actual configuration of all the registered variables - void PrintConfig(std::ostream &os); - - /// Reads the options values from a config file. Must be called after - /// registering all options. This is usually used internally after the - /// standard --config option is used, but it may also be called from a - /// program. - void ReadConfigFile(const std::string &filename); - - /// Number of positional parameters (c.f. argc-1). - int NumArgs() const; - - /// Returns one of the positional parameters; 1-based indexing for argc/argv - /// compatibility. Will crash if param is not >=1 and <=NumArgs(). - std::string GetArg(int param) const; - - std::string GetOptArg(int param) const { - return (param <= NumArgs() ? 
GetArg(param) : ""); - } - - /// The following function will return a possibly quoted and escaped - /// version of "str", according to the current shell. Currently - /// this is just hardwired to bash. It's useful for debug output. - static std::string Escape(const std::string &str); - - private: - /// Template to register various variable types, - /// used for program-specific parameters - template - void RegisterTmpl(const std::string &name, T *ptr, const std::string &doc); - - // Following functions do just the datatype-specific part of the job - /// Register boolean variable - void RegisterSpecific(const std::string &name, const std::string &idx, - bool *b, const std::string &doc, bool is_standard); - /// Register int32 variable - void RegisterSpecific(const std::string &name, const std::string &idx, - int32 *i, const std::string &doc, bool is_standard); - /// Register unsinged int32 variable - void RegisterSpecific(const std::string &name, const std::string &idx, - uint32 *u, const std::string &doc, bool is_standard); - /// Register float variable - void RegisterSpecific(const std::string &name, const std::string &idx, - float *f, const std::string &doc, bool is_standard); - /// Register double variable [useful as we change BaseFloat type]. - void RegisterSpecific(const std::string &name, const std::string &idx, - double *f, const std::string &doc, bool is_standard); - /// Register string variable - void RegisterSpecific(const std::string &name, const std::string &idx, - std::string *s, const std::string &doc, - bool is_standard); - - /// Does the actual job for both kinds of parameters - /// Does the common part of the job for all datatypes, - /// then calls RegisterSpecific - template - void RegisterCommon(const std::string &name, T *ptr, const std::string &doc, - bool is_standard); - - /// Set option with name "key" to "value"; will crash if can't do it. - /// "has_equal_sign" is used to allow --x for a boolean option x, - /// and --y=, for a string option y. - bool SetOption(const std::string &key, const std::string &value, - bool has_equal_sign); - - bool ToBool(std::string str); - int32 ToInt(const std::string &str); - uint32 ToUint(const std::string &str); - float ToFloat(const std::string &str); - double ToDouble(const std::string &str); - - // maps for option variables - std::map bool_map_; - std::map int_map_; - std::map uint_map_; - std::map float_map_; - std::map double_map_; - std::map string_map_; - - /** - Structure for options' documentation - */ - struct DocInfo { - DocInfo() {} - DocInfo(const std::string &name, const std::string &usemsg) - : name_(name), use_msg_(usemsg), is_standard_(false) {} - DocInfo(const std::string &name, const std::string &usemsg, - bool is_standard) - : name_(name), use_msg_(usemsg), is_standard_(is_standard) {} - - std::string name_; - std::string use_msg_; - bool is_standard_; - }; - typedef std::map DocMapType; - DocMapType doc_map_; ///< map for the documentation - - bool print_args_; ///< variable for the implicit --print-args parameter - bool help_; ///< variable for the implicit --help parameter - std::string config_; ///< variable for the implicit --config parameter - std::vector positional_args_; - const char *usage_; - int argc_; - const char *const *argv_; - - /// These members are not normally used. 
They are only used when the object - /// is constructed with a prefix - std::string prefix_; - OptionsItf *other_parser_; - - protected: - /// SplitLongArg parses an argument of the form --a=b, --a=, or --a, - /// and sets "has_equal_sign" to true if an equals-sign was parsed.. - /// this is needed in order to correctly allow --x for a boolean option - /// x, and --y= for a string option y, and to disallow --x= and --y. - void SplitLongArg(const std::string &in, std::string *key, std::string *value, - bool *has_equal_sign); - - void NormalizeArgName(std::string *str); -}; - -/// This template is provided for convenience in reading config classes from -/// files; this is not the standard way to read configuration options, but may -/// occasionally be needed. This function assumes the config has a function -/// "void Register(OptionsItf *opts)" which it can call to register the -/// ParseOptions object. -template -void ReadConfigFromFile(const std::string &config_filename, C *c) { - std::ostringstream usage_str; - usage_str << "Parsing config from " - << "from '" << config_filename << "'"; - ParseOptions po(usage_str.str().c_str()); - c->Register(&po); - po.ReadConfigFile(config_filename); -} - -/// This variant of the template ReadConfigFromFile is for if you need to read -/// two config classes from the same file. -template -void ReadConfigsFromFile(const std::string &conf, C1 *c1, C2 *c2) { - std::ostringstream usage_str; - usage_str << "Parsing config from " - << "from '" << conf << "'"; - ParseOptions po(usage_str.str().c_str()); - c1->Register(&po); - c2->Register(&po); - po.ReadConfigFile(conf); -} - -} // namespace kaldi - -#endif // KALDI_UTIL_PARSE_OPTIONS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/simple-io-funcs.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/simple-io-funcs.cc deleted file mode 100644 index 5ace601b6a2bb186dec78b0b25cb5a3227c48bc9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/simple-io-funcs.cc +++ /dev/null @@ -1,80 +0,0 @@ -// util/simple-io-funcs.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. -#include "util/simple-io-funcs.h" -#include "util/text-utils.h" - -namespace kaldi { - -bool WriteIntegerVectorSimple(const std::string &wxfilename, - const std::vector &list) { - kaldi::Output ko; - // false, false is: text-mode, no Kaldi header. 
- if (!ko.Open(wxfilename, false, false)) return false; - for (size_t i = 0; i < list.size(); i++) ko.Stream() << list[i] << '\n'; - return ko.Close(); -} - -bool ReadIntegerVectorSimple(const std::string &rxfilename, - std::vector *list) { - kaldi::Input ki; - if (!ki.OpenTextMode(rxfilename)) return false; - std::istream &is = ki.Stream(); - int32 i; - list->clear(); - while (!(is >> i).fail()) list->push_back(i); - is >> std::ws; - return is.eof(); // should be eof, or junk at end of file. -} - -bool WriteIntegerVectorVectorSimple( - const std::string &wxfilename, - const std::vector > &list) { - kaldi::Output ko; - // false, false is: text-mode, no Kaldi header. - if (!ko.Open(wxfilename, false, false)) return false; - std::ostream &os = ko.Stream(); - for (size_t i = 0; i < list.size(); i++) { - for (size_t j = 0; j < list[i].size(); j++) { - os << list[i][j]; - if (j + 1 < list[i].size()) os << ' '; - } - os << '\n'; - } - return ko.Close(); -} - -bool ReadIntegerVectorVectorSimple(const std::string &rxfilename, - std::vector > *list) { - kaldi::Input ki; - if (!ki.OpenTextMode(rxfilename)) return false; - std::istream &is = ki.Stream(); - list->clear(); - std::string line; - while (std::getline(is, line)) { - std::vector v; - if (!SplitStringToIntegers(line, " \t\r", true, &v)) { - list->clear(); - return false; - } - list->push_back(v); - } - return is.eof(); // if we're not at EOF, something weird happened. -} - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/simple-io-funcs.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/simple-io-funcs.h deleted file mode 100644 index 1ead12790ba9bd6a44ccdff855918270191b8ebd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/simple-io-funcs.h +++ /dev/null @@ -1,61 +0,0 @@ -// util/simple-io-funcs.h - -// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. -#ifndef KALDI_UTIL_SIMPLE_IO_FUNCS_H_ -#define KALDI_UTIL_SIMPLE_IO_FUNCS_H_ - -#include -#include -#include "util/kaldi-io.h" - -// This header contains some utilities for reading some common, simple text -// formats:integers in files, one per line, and integers in files, possibly -// multiple per line. these are not really fully native Kaldi formats; they are -// mostly for small files that might be generated by scripts, and can be read -// all at one time. for longer files of this type, we would probably use the -// Table code. - -namespace kaldi { - -/// WriteToList attempts to write this list of integers, one per line, -/// to the given file, in text format. -/// returns true if succeeded. 
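For context on this removal: the simple-io-funcs helpers only wrap plain text files with one integer per line. A brief usage sketch of the API as it existed before deletion (the file name and values are made up for illustration):

```cpp
// Usage sketch only: WriteIntegerVectorSimple / ReadIntegerVectorSimple write
// and read plain text files with one integer per line. "ids.txt" and the
// values are made up for illustration.
#include <iostream>
#include <string>
#include <vector>

#include "util/simple-io-funcs.h"

int main() {
  std::vector<kaldi::int32> ids = {7, 8, 9};
  if (!kaldi::WriteIntegerVectorSimple("ids.txt", ids))
    std::cerr << "write failed" << std::endl;

  std::vector<kaldi::int32> read_back;
  if (kaldi::ReadIntegerVectorSimple("ids.txt", &read_back))
    std::cout << "read " << read_back.size() << " integers" << std::endl;  // 3
  return 0;
}
```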
-bool WriteIntegerVectorSimple(const std::string &wxfilename, - const std::vector &v); - -/// ReadFromList attempts to read this list of integers, one per line, -/// from the given file, in text format. -/// returns true if succeeded. -bool ReadIntegerVectorSimple(const std::string &rxfilename, - std::vector *v); - -// This is a file format like: -// 1 2 -// 3 -// -// 4 5 6 -// etc. -bool WriteIntegerVectorVectorSimple(const std::string &wxfilename, - const std::vector > &v); - -bool ReadIntegerVectorVectorSimple(const std::string &rxfilename, - std::vector > *v); - -} // end namespace kaldi. - -#endif // KALDI_UTIL_SIMPLE_IO_FUNCS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/stl-utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/stl-utils.h deleted file mode 100644 index 8a29cd582c77b3078277aa9713b8676032bbc5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/stl-utils.h +++ /dev/null @@ -1,310 +0,0 @@ -// util/stl-utils.h - -// Copyright 2009-2011 Microsoft Corporation; Saarland University - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_STL_UTILS_H_ -#define KALDI_UTIL_STL_UTILS_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -using std::unordered_map; -using std::unordered_set; - -#include "base/kaldi-common.h" - -namespace kaldi { - -/// Sorts and uniq's (removes duplicates) from a vector. -template -inline void SortAndUniq(std::vector *vec) { - std::sort(vec->begin(), vec->end()); - vec->erase(std::unique(vec->begin(), vec->end()), vec->end()); -} - -/// Returns true if the vector is sorted. -template -inline bool IsSorted(const std::vector &vec) { - typename std::vector::const_iterator iter = vec.begin(), end = vec.end(); - if (iter == end) return true; - while (1) { - typename std::vector::const_iterator next_iter = iter; - ++next_iter; - if (next_iter == end) return true; // end of loop and nothing out of order - if (*next_iter < *iter) return false; - iter = next_iter; - } -} - -/// Returns true if the vector is sorted and contains each element -/// only once. -template -inline bool IsSortedAndUniq(const std::vector &vec) { - typename std::vector::const_iterator iter = vec.begin(), end = vec.end(); - if (iter == end) return true; - while (1) { - typename std::vector::const_iterator next_iter = iter; - ++next_iter; - if (next_iter == end) return true; // end of loop and nothing out of order - if (*next_iter <= *iter) return false; - iter = next_iter; - } -} - -/// Removes duplicate elements from a sorted list. -template -inline void Uniq(std::vector *vec) { // must be already sorted. 
- KALDI_PARANOID_ASSERT(IsSorted(*vec)); - KALDI_ASSERT(vec); - vec->erase(std::unique(vec->begin(), vec->end()), vec->end()); -} - -/// Copies the elements of a set to a vector. -template -void CopySetToVector(const std::set &s, std::vector *v) { - // copies members of s into v, in sorted order from lowest to highest - // (because the set was in sorted order). - KALDI_ASSERT(v != NULL); - v->resize(s.size()); - typename std::set::const_iterator siter = s.begin(), send = s.end(); - typename std::vector::iterator viter = v->begin(); - for (; siter != send; ++siter, ++viter) { - *viter = *siter; - } -} - -template -void CopySetToVector(const unordered_set &s, std::vector *v) { - KALDI_ASSERT(v != NULL); - v->resize(s.size()); - typename unordered_set::const_iterator siter = s.begin(), send = s.end(); - typename std::vector::iterator viter = v->begin(); - for (; siter != send; ++siter, ++viter) { - *viter = *siter; - } -} - -/// Copies the (key, value) pairs in a map to a vector of pairs. -template -void CopyMapToVector(const std::map &m, - std::vector > *v) { - KALDI_ASSERT(v != NULL); - v->resize(m.size()); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - typename std::vector >::iterator viter = v->begin(); - for (; miter != mend; ++miter, ++viter) { - *viter = std::make_pair(miter->first, miter->second); - // do it like this because of const casting. - } -} - -/// Copies the keys in a map to a vector. -template -void CopyMapKeysToVector(const std::map &m, std::vector *v) { - KALDI_ASSERT(v != NULL); - v->resize(m.size()); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - typename std::vector::iterator viter = v->begin(); - for (; miter != mend; ++miter, ++viter) { - *viter = miter->first; - } -} - -/// Copies the values in a map to a vector. -template -void CopyMapValuesToVector(const std::map &m, std::vector *v) { - KALDI_ASSERT(v != NULL); - v->resize(m.size()); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - typename std::vector::iterator viter = v->begin(); - for (; miter != mend; ++miter, ++viter) { - *viter = miter->second; - } -} - -/// Copies the keys in a map to a set. -template -void CopyMapKeysToSet(const std::map &m, std::set *s) { - KALDI_ASSERT(s != NULL); - s->clear(); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - for (; miter != mend; ++miter) { - s->insert(s->end(), miter->first); - } -} - -/// Copies the values in a map to a set. -template -void CopyMapValuesToSet(const std::map &m, std::set *s) { - KALDI_ASSERT(s != NULL); - s->clear(); - typename std::map::const_iterator miter = m.begin(), mend = m.end(); - for (; miter != mend; ++miter) s->insert(s->end(), miter->second); -} - -/// Copies the contents of a vector to a set. -template -void CopyVectorToSet(const std::vector &v, std::set *s) { - KALDI_ASSERT(s != NULL); - s->clear(); - typename std::vector::const_iterator iter = v.begin(), end = v.end(); - for (; iter != end; ++iter) s->insert(s->end(), *iter); - // s->end() is a hint in case v was sorted. will work regardless. -} - -/// Deletes any non-NULL pointers in the vector v, and sets -/// the corresponding entries of v to NULL -template -void DeletePointers(std::vector *v) { - KALDI_ASSERT(v != NULL); - typename std::vector::iterator iter = v->begin(), end = v->end(); - for (; iter != end; ++iter) { - if (*iter != NULL) { - delete *iter; - *iter = NULL; // set to NULL for extra safety. - } - } -} - -/// Returns true if the vector of pointers contains NULL pointers. 
-template -bool ContainsNullPointers(const std::vector &v) { - typename std::vector::const_iterator iter = v.begin(), end = v.end(); - for (; iter != end; ++iter) - if (*iter == static_cast(NULL)) return true; - return false; -} - -/// Copies the contents a vector of one type to a vector -/// of another type. -template -void CopyVectorToVector(const std::vector &vec_in, std::vector *vec_out) { - KALDI_ASSERT(vec_out != NULL); - vec_out->resize(vec_in.size()); - for (size_t i = 0; i < vec_in.size(); i++) - (*vec_out)[i] = static_cast(vec_in[i]); -} - -/// A hashing function-object for vectors. -template -struct VectorHasher { // hashing function for vector. - size_t operator()(const std::vector &x) const noexcept { - size_t ans = 0; - typename std::vector::const_iterator iter = x.begin(), end = x.end(); - for (; iter != end; ++iter) { - ans *= kPrime; - ans += *iter; - } - return ans; - } - VectorHasher() { // Check we're instantiated with an integer type. - KALDI_ASSERT_IS_INTEGER_TYPE(Int); - } - - private: - static const int kPrime = 7853; -}; - -/// A hashing function-object for pairs of ints -template -struct PairHasher { // hashing function for pair - size_t operator()(const std::pair &x) const noexcept { - // 7853 was chosen at random from a list of primes. - return x.first + x.second * 7853; - } - PairHasher() { // Check we're instantiated with an integer type. - KALDI_ASSERT_IS_INTEGER_TYPE(Int1); - KALDI_ASSERT_IS_INTEGER_TYPE(Int2); - } -}; - -/// A hashing function object for strings. -struct StringHasher { // hashing function for std::string - size_t operator()(const std::string &str) const noexcept { - size_t ans = 0, len = str.length(); - const char *c = str.c_str(), *end = c + len; - for (; c != end; c++) { - ans *= kPrime; - ans += *c; - } - return ans; - } - - private: - static const int kPrime = 7853; -}; - -/// Reverses the contents of a vector. -template -inline void ReverseVector(std::vector *vec) { - KALDI_ASSERT(vec != NULL); - size_t sz = vec->size(); - for (size_t i = 0; i < sz / 2; i++) std::swap((*vec)[i], (*vec)[sz - 1 - i]); -} - -/// Comparator object for pairs that compares only the first pair. -template -struct CompareFirstMemberOfPair { - inline bool operator()(const std::pair &p1, const std::pair &p2) { - return p1.first < p2.first; - } -}; - -/// For a vector of pair where I is an integer and F a floating-point or -/// integer type, this function sorts a vector of type vector > on -/// the I value and then merges elements with equal I values, summing these over -/// the F component and then removing any F component with zero value. This -/// is for where the vector of pairs represents a map from the integer to float -/// component, with an "adding" type of semantics for combining the elements. -template -inline void MergePairVectorSumming(std::vector > *vec) { - KALDI_ASSERT_IS_INTEGER_TYPE(I); - CompareFirstMemberOfPair c; - std::sort(vec->begin(), vec->end(), c); // sort on 1st element. - typename std::vector >::iterator out = vec->begin(), - in = vec->begin(), - end = vec->end(); - // special case: while there is nothing to be changed, skip over - // initial input (avoids unnecessary copying). - while (in + 1 < end && in[0].first != in[1].first && in[0].second != 0.0) { - in++; - out++; - } - while (in < end) { - // We reach this point only at the first element of - // each stretch of identical .first elements. - *out = *in; - ++in; - while (in < end && in->first == out->first) { - out->second += in->second; // this is the merge operation. 
- ++in; - } - if (out->second != static_cast(0)) // Don't keep zero elements. - out++; - } - vec->erase(out, end); -} - -} // namespace kaldi - -#endif // KALDI_UTIL_STL_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/text-utils.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/text-utils.cc deleted file mode 100644 index fd70889644f6b4e14793ddd4f5b0d71a66768699..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/text-utils.cc +++ /dev/null @@ -1,580 +0,0 @@ -// util/text-utils.cc - -// Copyright 2009-2011 Saarland University; Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "util/text-utils.h" - -#include -#include -#include -#include - -#include "base/kaldi-common.h" - -namespace kaldi { - -template -bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, // typically false - std::vector *out) { - KALDI_ASSERT(out != NULL); - if (*(full.c_str()) == '\0') { - out->clear(); - return true; - } - std::vector split; - SplitStringToVector(full, delim, omit_empty_strings, &split); - out->resize(split.size()); - for (size_t i = 0; i < split.size(); i++) { - F f = 0; - if (!ConvertStringToReal(split[i], &f)) return false; - (*out)[i] = f; - } - return true; -} - -// Instantiate the template above for float and double. 
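The stl-utils header removed above is a set of small STL conveniences; a brief usage sketch of a few of them, with made-up data and expected results that follow from the definitions above:

```cpp
// Usage sketch only: a few of the removed stl-utils helpers. The data are
// made up; the expected results follow from the definitions above.
#include <iostream>
#include <utility>
#include <vector>

#include "util/stl-utils.h"

int main() {
  std::vector<kaldi::int32> v = {3, 1, 3, 2, 1};
  kaldi::SortAndUniq(&v);                               // v == {1, 2, 3}
  std::cout << kaldi::IsSortedAndUniq(v) << std::endl;  // 1

  // Sum the weights of pairs sharing an index, dropping zero sums.
  std::vector<std::pair<kaldi::int32, float> > pairs = {
      {2, 1.0f}, {1, 0.5f}, {2, -1.0f}};
  kaldi::MergePairVectorSumming(&pairs);
  // pairs == {{1, 0.5f}}: the two index-2 weights cancel and are removed.
  std::cout << pairs.size() << std::endl;  // 1
  return 0;
}
```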
-template bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out); -template bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out); - -void SplitStringToVector(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out) { - size_t start = 0, found = 0, end = full.size(); - out->clear(); - while (found != std::string::npos) { - found = full.find_first_of(delim, start); - // start != end condition is for when the delimiter is at the end - if (!omit_empty_strings || (found != start && start != end)) - out->push_back(full.substr(start, found - start)); - start = found + 1; - } -} - -void JoinVectorToString(const std::vector &vec_in, - const char *delim, bool omit_empty_strings, - std::string *str_out) { - std::string tmp_str; - for (size_t i = 0; i < vec_in.size(); i++) { - if (!omit_empty_strings || !vec_in[i].empty()) { - tmp_str.append(vec_in[i]); - if (i < vec_in.size() - 1) - if (!omit_empty_strings || !vec_in[i + 1].empty()) - tmp_str.append(delim); - } - } - str_out->swap(tmp_str); -} - -void Trim(std::string *str) { - const char *white_chars = " \t\n\r\f\v"; - - std::string::size_type pos = str->find_last_not_of(white_chars); - if (pos != std::string::npos) { - str->erase(pos + 1); - pos = str->find_first_not_of(white_chars); - if (pos != std::string::npos) str->erase(0, pos); - } else { - str->erase(str->begin(), str->end()); - } -} - -bool IsToken(const std::string &token) { - size_t l = token.length(); - if (l == 0) return false; - for (size_t i = 0; i < l; i++) { - unsigned char c = token[i]; - if ((!isprint(c) || isspace(c)) && (isascii(c) || c == (unsigned char)255)) - return false; - // The "&& (isascii(c) || c == 255)" was added so that we won't reject - // non-ASCII characters such as French characters with accents [except for - // 255 which is "nbsp", a form of space]. - } - return true; -} - -void SplitStringOnFirstSpace(const std::string &str, std::string *first, - std::string *rest) { - const char *white_chars = " \t\n\r\f\v"; - typedef std::string::size_type I; - const I npos = std::string::npos; - I first_nonwhite = str.find_first_not_of(white_chars); - if (first_nonwhite == npos) { - first->clear(); - rest->clear(); - return; - } - // next_white is first whitespace after first nonwhitespace. - I next_white = str.find_first_of(white_chars, first_nonwhite); - - if (next_white == npos) { // no more whitespace... - *first = std::string(str, first_nonwhite); - rest->clear(); - return; - } - I next_nonwhite = str.find_first_not_of(white_chars, next_white); - if (next_nonwhite == npos) { - *first = std::string(str, first_nonwhite, next_white - first_nonwhite); - rest->clear(); - return; - } - - I last_nonwhite = str.find_last_not_of(white_chars); - KALDI_ASSERT(last_nonwhite != npos); // or coding error. 
- - *first = std::string(str, first_nonwhite, next_white - first_nonwhite); - *rest = std::string(str, next_nonwhite, last_nonwhite + 1 - next_nonwhite); -} - -bool IsLine(const std::string &line) { - if (line.find('\n') != std::string::npos) return false; - if (line.empty()) return true; - if (isspace(*(line.begin()))) return false; - if (isspace(*(line.rbegin()))) return false; - std::string::const_iterator iter = line.begin(), end = line.end(); - for (; iter != end; iter++) - if (!isprint(*iter)) return false; - return true; -} - -template -class NumberIstream { - public: - explicit NumberIstream(std::istream &i) : in_(i) {} - - NumberIstream &operator>>(T &x) { - if (!in_.good()) return *this; - in_ >> x; - if (!in_.fail() && RemainderIsOnlySpaces()) return *this; - return ParseOnFail(&x); - } - - private: - std::istream &in_; - - bool RemainderIsOnlySpaces() { - if (in_.tellg() != std::istream::pos_type(-1)) { - std::string rem; - in_ >> rem; - - if (rem.find_first_not_of(' ') != std::string::npos) { - // there is not only spaces - return false; - } - } - - in_.clear(); - return true; - } - - NumberIstream &ParseOnFail(T *x) { - std::string str; - in_.clear(); - in_.seekg(0); - // If the stream is broken even before trying - // to read from it or if there are many tokens, - // it's pointless to try. - if (!(in_ >> str) || !RemainderIsOnlySpaces()) { - in_.setstate(std::ios_base::failbit); - return *this; - } - - std::map inf_nan_map; - // we'll keep just uppercase values. - inf_nan_map["INF"] = std::numeric_limits::infinity(); - inf_nan_map["+INF"] = std::numeric_limits::infinity(); - inf_nan_map["-INF"] = -std::numeric_limits::infinity(); - inf_nan_map["INFINITY"] = std::numeric_limits::infinity(); - inf_nan_map["+INFINITY"] = std::numeric_limits::infinity(); - inf_nan_map["-INFINITY"] = -std::numeric_limits::infinity(); - inf_nan_map["NAN"] = std::numeric_limits::quiet_NaN(); - inf_nan_map["+NAN"] = std::numeric_limits::quiet_NaN(); - inf_nan_map["-NAN"] = -std::numeric_limits::quiet_NaN(); - // MSVC - inf_nan_map["1.#INF"] = std::numeric_limits::infinity(); - inf_nan_map["-1.#INF"] = -std::numeric_limits::infinity(); - inf_nan_map["1.#QNAN"] = std::numeric_limits::quiet_NaN(); - inf_nan_map["-1.#QNAN"] = -std::numeric_limits::quiet_NaN(); - - std::transform(str.begin(), str.end(), str.begin(), ::toupper); - - if (inf_nan_map.find(str) != inf_nan_map.end()) { - *x = inf_nan_map[str]; - } else { - in_.setstate(std::ios_base::failbit); - } - - return *this; - } -}; - -template -bool ConvertStringToReal(const std::string &str, T *out) { - std::istringstream iss(str); - - NumberIstream i(iss); - - i >> *out; - - if (iss.fail()) { - // Number conversion failed. - return false; - } - - return true; -} - -template bool ConvertStringToReal(const std::string &str, float *out); -template bool ConvertStringToReal(const std::string &str, double *out); - -/* - This function is a helper function of StringsApproxEqual. It should be - thought of as a recursive function-- it was designed that way-- but rather - than actually recursing (which would cause problems with stack overflow), we - just set the args and return to the start. - - The 'decimal_places_tolerance' argument is just passed in from outside, - see the documentation for StringsApproxEqual in text-utils.h to see an - explanation. The argument 'places_into_number' provides some information - about the strings 'a' and 'b' that precedes the current pointers. 
- For purposes of this comment, let's define the 'decimal' of a number - as the part that comes after the decimal point, e.g. in '99.123', - '123' would be the decimal. If 'places_into_number' is -1, it means - we're not currently inside some place like that (i.e. it's not the - case that we're pointing to the '1' or the '2' or the '3'). - If it's 0, then we'd be pointing to the first place after the decimal, - '1' in this case. Note if one of the numbers is shorter than the - other, like '99.123' versus '99.1234' and 'a' points to the first '3' - while 'b' points to the second '4', 'places_into_number' referes to the - shorter of the two, i.e. it would be 2 in this example. - - - */ -bool StringsApproxEqualInternal(const char *a, const char *b, - int32 decimal_places_tolerance, - int32 places_into_number) { -start: - char ca = *a, cb = *b; - if (ca == cb) { - if (ca == '\0') { - return true; - } else { - if (places_into_number >= 0) { - if (isdigit(ca)) { - places_into_number++; - } else { - places_into_number = -1; - } - } else { - if (ca == '.') { - places_into_number = 0; - } - } - a++; - b++; - goto start; - } - } else { - if (places_into_number >= decimal_places_tolerance && - (isdigit(ca) || isdigit(cb))) { - // we're potentially willing to accept this difference between the - // strings. - if (isdigit(ca)) a++; - if (isdigit(cb)) b++; - // we'll have advanced at least one of the two strings. - goto start; - } else if (places_into_number >= 0 && - ((ca == '0' && !isdigit(cb)) || (cb == '0' && !isdigit(ca)))) { - // this clause is designed to ensure that, for example, - // "0.1" would count the same as "0.100001". - if (ca == '0') - a++; - else - b++; - places_into_number++; - goto start; - } else { - return false; - } - } -} - -bool StringsApproxEqual(const std::string &a, const std::string &b, - int32 decimal_places_tolerance) { - return StringsApproxEqualInternal(a.c_str(), b.c_str(), - decimal_places_tolerance, -1); -} - -bool ConfigLine::ParseLine(const std::string &line) { - data_.clear(); - whole_line_ = line; - if (line.size() == 0) return false; // Empty line - size_t pos = 0, size = line.size(); - while (isspace(line[pos]) && pos < size) pos++; - if (pos == size) return false; // whitespace-only line - size_t first_token_start_pos = pos; - // first get first_token_. - while (!isspace(line[pos]) && pos < size) { - if (line[pos] == '=') { - // If the first block of non-whitespace looks like "foo-bar=...", - // then we ignore it: there is no initial token, and FirstToken() - // is empty. - pos = first_token_start_pos; - break; - } - pos++; - } - first_token_ = - std::string(line, first_token_start_pos, pos - first_token_start_pos); - // first_token_ is expected to be either empty or something like - // "component-node", which actually is a slightly more restrictive set of - // strings than IsValidName() checks for this is a convenient way to check it. - if (!first_token_.empty() && !IsValidName(first_token_)) return false; - - while (pos < size) { - if (isspace(line[pos])) { - pos++; - continue; - } - - // OK, at this point we know that we are pointing at nonspace. - size_t next_equals_sign = line.find_first_of("=", pos); - if (next_equals_sign == pos || next_equals_sign == std::string::npos) { - // we're looking for something like 'key=value'. If there is no equals - // sign, or it's not preceded by something, it's a parsing failure. - return false; - } - std::string key(line, pos, next_equals_sign - pos); - if (!IsValidName(key)) return false; - - // handle any quotes. 
we support key='blah blah' or key="foo bar". - // no escaping is supported. - if (line[next_equals_sign + 1] == '\'' || - line[next_equals_sign + 1] == '"') { - char my_quote = line[next_equals_sign + 1]; - size_t next_quote = line.find_first_of(my_quote, next_equals_sign + 2); - if (next_quote == std::string::npos) { // no matching quote was found. - KALDI_WARN << "No matching quote for " << my_quote - << " in config line '" << line << "'"; - return false; - } else { - std::string value(line, next_equals_sign + 2, - next_quote - next_equals_sign - 2); - data_.insert(std::make_pair(key, std::make_pair(value, false))); - pos = next_quote + 1; - continue; - } - } else { - // we want to be able to parse something like "... input=Offset(a, -1) - // foo=bar": in general, config values with spaces in them, even without - // quoting. - - size_t next_next_equals_sign = - line.find_first_of("=", next_equals_sign + 1), - terminating_space = size; - - if (next_next_equals_sign != - std::string::npos) { // found a later equals sign. - size_t preceding_space = - line.find_last_of(" \t", next_next_equals_sign); - if (preceding_space != std::string::npos && - preceding_space > next_equals_sign) - terminating_space = preceding_space; - } - while (isspace(line[terminating_space - 1]) && terminating_space > 0) - terminating_space--; - - std::string value(line, next_equals_sign + 1, - terminating_space - (next_equals_sign + 1)); - data_.insert(std::make_pair(key, std::make_pair(value, false))); - pos = terminating_space; - } - } - return true; -} - -bool ConfigLine::GetValue(const std::string &key, std::string *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - *value = (it->second).first; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, BaseFloat *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!ConvertStringToReal((it->second).first, value)) return false; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, int32 *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!ConvertStringToInteger((it->second).first, value)) return false; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, std::vector *value) { - KALDI_ASSERT(value != NULL); - value->clear(); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!SplitStringToIntegers((it->second).first, ":,", true, value)) { - // KALDI_WARN << "Bad option " << (it->second).first; - return false; - } - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, bool *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if ((it->second).first.size() == 0) return false; - switch (((it->second).first)[0]) { - case 'F': - case 'f': - *value = false; - break; - case 'T': - case 't': - *value = true; - break; - default: - return false; - } - (it->second).second = true; - return true; - } - } - return false; -} - -bool 
ConfigLine::HasUnusedValues() const { - std::map >::const_iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (!(it->second).second) return true; - } - return false; -} - -std::string ConfigLine::UnusedValues() const { - std::string unused_str; - std::map >::const_iterator it = - data_.begin(); - for (; it != data_.end(); ++it) { - if (!(it->second).second) { - if (unused_str == "") - unused_str = it->first + "=" + (it->second).first; - else - unused_str += " " + it->first + "=" + (it->second).first; - } - } - return unused_str; -} - -// This is like ExpectToken but for two tokens, and it -// will either accept token1 and then token2, or just token2. -// This is useful in Read functions where the first token -// may already have been consumed. -// void ExpectOneOrTwoTokens(std::istream &is, bool binary, -// const std::string &token1, -// const std::string &token2) { -// KALDI_ASSERT(token1 != token2); -// std::string temp; -// ReadToken(is, binary, &temp); -// if (temp == token1) { -// ExpectToken(is, binary, token2); -// } else { -// if (temp != token2) { -// KALDI_ERR << "Expecting token " << token1 << " or " << token2 -// << " but got " << temp; -// } -// } -// } - -bool IsValidName(const std::string &name) { - if (name.size() == 0) return false; - for (size_t i = 0; i < name.size(); i++) { - if (i == 0 && !isalpha(name[i]) && name[i] != '_') return false; - if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-' && name[i] != '.') - return false; - } - return true; -} - -void ReadConfigLines(std::istream &is, std::vector *lines) { - KALDI_ASSERT(lines != NULL); - std::string line; - while (std::getline(is, line)) { - if (line.size() == 0) continue; - size_t start = line.find_first_not_of(" \t"); - size_t end = line.find_first_of('#'); - if (start == std::string::npos || start == end) continue; - end = line.find_last_not_of(" \t", end - 1); - KALDI_ASSERT(end >= start); - lines->push_back(line.substr(start, end - start + 1)); - } -} - -void ParseConfigLines(const std::vector &lines, - std::vector *config_lines) { - config_lines->resize(lines.size()); - for (size_t i = 0; i < lines.size(); i++) { - bool ret = (*config_lines)[i].ParseLine(lines[i]); - if (!ret) { - KALDI_ERR << "Error parsing config line: " << lines[i]; - } - } -} - -} // end namespace kaldi diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/text-utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/text-utils.h deleted file mode 100644 index bc7763c4aff38214d97cbeda3b29c8717dd65318..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/kaldi/util/text-utils.h +++ /dev/null @@ -1,264 +0,0 @@ -// util/text-utils.h - -// Copyright 2009-2011 Saarland University; Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. 
-// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_UTIL_TEXT_UTILS_H_ -#define KALDI_UTIL_TEXT_UTILS_H_ - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "base/kaldi-common.h" - -namespace kaldi { - -/// Split a string using any of the single character delimiters. -/// If omit_empty_strings == true, the output will contain any -/// nonempty strings after splitting on any of the -/// characters in the delimiter. If omit_empty_strings == false, -/// the output will contain n+1 strings if there are n characters -/// in the set "delim" within the input string. In this case -/// the empty string is split to a single empty string. -void SplitStringToVector(const std::string &full, const char *delim, - bool omit_empty_strings, - std::vector *out); - -/// Joins the elements of a vector of strings into a single string using -/// "delim" as the delimiter. If omit_empty_strings == true, any empty strings -/// in the vector are skipped. A vector of empty strings results in an empty -/// string on the output. -void JoinVectorToString(const std::vector &vec_in, - const char *delim, bool omit_empty_strings, - std::string *str_out); - -/** - \brief Split a string (e.g. 1:2:3) into a vector of integers. - - \param [in] delim String containing a list of characters, any of which - is allowed as a delimiter. - \param [in] omit_empty_strings If true, empty strings between delimiters are - allowed and will not produce an output integer; if false, - instances of characters in 'delim' that are consecutive or - at the start or end of the string would be an error. - You'll normally want this to be true if 'delim' consists - of spaces, and false otherwise. - \param [out] out The output list of integers. -*/ -template -bool SplitStringToIntegers(const std::string &full, const char *delim, - bool omit_empty_strings, // typically false [but - // should probably be true - // if "delim" is spaces]. - std::vector *out) { - KALDI_ASSERT(out != NULL); - KALDI_ASSERT_IS_INTEGER_TYPE(I); - if (*(full.c_str()) == '\0') { - out->clear(); - return true; - } - std::vector split; - SplitStringToVector(full, delim, omit_empty_strings, &split); - out->resize(split.size()); - for (size_t i = 0; i < split.size(); i++) { - const char *this_str = split[i].c_str(); - char *end = NULL; - int64 j = 0; - j = KALDI_STRTOLL(this_str, &end); - if (end == this_str || *end != '\0') { - out->clear(); - return false; - } else { - I jI = static_cast(j); - if (static_cast(jI) != j) { - // output type cannot fit this integer. - out->clear(); - return false; - } - (*out)[i] = jI; - } - } - return true; -} - -// This is defined for F = float and double. -template -bool SplitStringToFloats(const std::string &full, const char *delim, - bool omit_empty_strings, // typically false - std::vector *out); - -/// Converts a string into an integer via strtoll and returns false if there was -/// any kind of problem (i.e. the string was not an integer or contained extra -/// non-whitespace junk, or the integer was too large to fit into the type it is -/// being converted into). Only sets *out if everything was OK and it returns -/// true. 
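A short usage sketch of the SplitStringToIntegers template defined above (the input string is made up):

```cpp
// Usage sketch only: splitting a colon-separated id string with the
// SplitStringToIntegers template defined above. The input is made up.
#include <iostream>
#include <string>
#include <vector>

#include "util/text-utils.h"

int main() {
  std::vector<kaldi::int32> ids;
  if (kaldi::SplitStringToIntegers("1:2:3", ":", false, &ids)) {
    for (kaldi::int32 id : ids) std::cout << id << " ";  // 1 2 3
    std::cout << std::endl;
  }
  return 0;
}
```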
-template -bool ConvertStringToInteger(const std::string &str, Int *out) { - KALDI_ASSERT_IS_INTEGER_TYPE(Int); - const char *this_str = str.c_str(); - char *end = NULL; - errno = 0; - int64 i = KALDI_STRTOLL(this_str, &end); - if (end != this_str) - while (isspace(*end)) end++; - if (end == this_str || *end != '\0' || errno != 0) return false; - Int iInt = static_cast(i); - if (static_cast(iInt) != i || - (i < 0 && !std::numeric_limits::is_signed)) { - return false; - } - *out = iInt; - return true; -} - -/// ConvertStringToReal converts a string into either float or double -/// and returns false if there was any kind of problem (i.e. the string -/// was not a floating point number or contained extra non-whitespace junk). -/// Be careful- this function will successfully read inf's or nan's. -template -bool ConvertStringToReal(const std::string &str, T *out); - -/// Removes the beginning and trailing whitespaces from a string -void Trim(std::string *str); - -/// Removes leading and trailing white space from the string, then splits on the -/// first section of whitespace found (if present), putting the part before the -/// whitespace in "first" and the rest in "rest". If there is no such space, -/// everything that remains after removing leading and trailing whitespace goes -/// in "first". -void SplitStringOnFirstSpace(const std::string &line, std::string *first, - std::string *rest); - -/// Returns true if "token" is nonempty, and all characters are -/// printable and whitespace-free. -bool IsToken(const std::string &token); - -/// Returns true if "line" is free of \n characters and unprintable -/// characters, and does not contain leading or trailing whitespace. -bool IsLine(const std::string &line); - -/** - This function returns true when two text strings are approximately equal, and - false when they are not. The definition of 'equal' is normal string - equality, except that two substrings like "0.31134" and "0.311341" would be - considered equal. 'decimal_places_tolerance' controls how many digits after - the '.' have to match up. - E.g. StringsApproxEqual("hello 0.23 there", "hello 0.24 there", 2) would - return false because there is a difference in the 2nd decimal, but with - an argument of 1 it would return true. - */ -bool StringsApproxEqual(const std::string &a, const std::string &b, - int32 decimal_places_check = 2); - -/** - This class is responsible for parsing input like - hi-there xx=yyy a=b c empty= f-oo=Append(bar, sss) ba_z=123 bing='a b c' - baz="a b c d='a b' e" and giving you access to the fields, in this case - - FirstToken() == "hi-there", and key->value pairs: - - xx->yyy, a->"b c", empty->"", f-oo->"Append(bar, sss)", ba_z->"123", - bing->"a b c", baz->"a b c d='a b' e" - - The first token is optional, if the line started with a key-value pair then - FirstValue() will be empty. - - Note: it can parse value fields with space inside them only if they are free - of the '=' character. If values are going to contain the '=' character, you - need to quote them with either single or double quotes. - - Key values may contain -_a-zA-Z0-9, but must begin with a-zA-Z_. - */ -class ConfigLine { - public: - // Tries to parse the line as a config-file line. Returns false - // if it could not for some reason, e.g. parsing failure. In most cases - // prints no warnings; the user should do this. Does not expect comments. - bool ParseLine(const std::string &line); - - // the GetValue functions are overloaded for various types. 
They return true - // if the key exists with value that can be converted to that type, and false - // otherwise. They also mark the key-value pair as having been read. It is - // not an error to read values twice. - bool GetValue(const std::string &key, std::string *value); - bool GetValue(const std::string &key, BaseFloat *value); - bool GetValue(const std::string &key, int32 *value); - // Values may be separated by ":" or by ",". - bool GetValue(const std::string &key, std::vector *value); - bool GetValue(const std::string &key, bool *value); - - bool HasUnusedValues() const; - /// returns e.g. foo=bar xxx=yyy if foo and xxx were not consumed by one - /// of the GetValue() functions. - std::string UnusedValues() const; - - const std::string &FirstToken() const { return first_token_; } - - const std::string WholeLine() { return whole_line_; } - // use default assignment operator and copy constructor. - private: - std::string whole_line_; - // the first token of the line, e.g. if line is - // foo-bar baz=bing - // then first_token_ would be "foo-bar". - std::string first_token_; - - // data_ maps from key to (value, is-this-value-consumed?). - std::map > data_; -}; - -/// This function is like ExpectToken but for two tokens, and it will either -/// accept token1 and then token2, or just token2. This is useful in Read -/// functions where the first token may already have been consumed. -void ExpectOneOrTwoTokens(std::istream &is, bool binary, - const std::string &token1, const std::string &token2); - -/** - This function reads in a config file and *appends* its contents to a vector - of lines; it is responsible for removing comments (anything after '#') and - stripping out any lines that contain only whitespace after comment removal. - */ -void ReadConfigLines(std::istream &is, std::vector *lines); - -/** - This function converts config-lines from a simple sequence of strings - as output by ReadConfigLines(), into a sequence of first-tokens and - name-value pairs. The general format is: - "command-type bar=baz xx=yyy" - etc., although there are subtleties as to what exactly is allowed, see - documentation for class ConfigLine for details. - This function will die if there was a parsing failure. - */ -void ParseConfigLines(const std::vector &lines, - std::vector *config_lines); - -/// Returns true if 'name' would be a valid name for a component or node in a -/// nnet3Nnet. This is a nonempty string beginning with A-Za-z_, and containing -/// only -/// '-', '_', '.', A-Z, a-z, or 0-9. 
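A usage sketch of the ConfigLine interface declared above; the config line content is made up, and the behavior follows the implementation removed from text-utils.cc in this same patch:

```cpp
// Usage sketch only: driving the ConfigLine class declared above. The config
// line content is made up; behavior follows the implementation removed from
// text-utils.cc in this same patch.
#include <iostream>
#include <string>

#include "util/text-utils.h"

int main() {
  kaldi::ConfigLine cl;
  if (!cl.ParseLine("affine-layer dim=1024 name='layer one'")) {
    std::cerr << "parse failure" << std::endl;
    return 1;
  }
  std::cout << cl.FirstToken() << std::endl;  // affine-layer

  kaldi::int32 dim = 0;
  std::string name;
  if (cl.GetValue("dim", &dim) && cl.GetValue("name", &name))
    std::cout << dim << " / " << name << std::endl;  // 1024 / layer one

  if (cl.HasUnusedValues())  // false here: both keys were consumed
    std::cerr << "unused: " << cl.UnusedValues() << std::endl;
  return 0;
}
```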
-bool IsValidName(const std::string &name); - -} // namespace kaldi - -#endif // KALDI_UTIL_TEXT_UTILS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/patch/CPPLINT.cfg b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/patch/CPPLINT.cfg deleted file mode 100644 index 51ff339c18435a6c3a3be03131080d7b8ab8de86..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/patch/CPPLINT.cfg +++ /dev/null @@ -1 +0,0 @@ -exclude_files=.* diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/patch/openfst/src/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/patch/openfst/src/CMakeLists.txt deleted file mode 100644 index 04051ef5ae46c04a40c1ffccc98c37fa594ad13e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/patch/openfst/src/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ - -#-DHAVE_CONFIG_H -I./../include -fno-exceptions -funsigned-char -std=c++11 -MT symbol-table.lo -MD -MP -MF .deps/symbol-table.Tpo -c symbol-table.cc -fno-common -DPIC -o .libs/symbol-table.o - -include_directories(./include/) -install(DIRECTORY include/ DESTINATION include/ - FILES_MATCHING PATTERN "*.h") - -add_subdirectory(lib) - -if(HAVE_SCRIPT) - add_subdirectory(script) -endif(HAVE_SCRIPT) - -if(HAVE_BIN) - add_subdirectory(bin) -endif(HAVE_BIN) - -add_subdirectory(extensions) - -if(BUILD_TESTING) - enable_testing() - add_subdirectory(test) -endif(BUILD_TESTING) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/patch/openfst/src/extensions/special/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/patch/openfst/src/extensions/special/CMakeLists.txt deleted file mode 100644 index 9c71b750a72ffe3c2dafde657273361c3dbae409..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/patch/openfst/src/extensions/special/CMakeLists.txt +++ /dev/null @@ -1,67 +0,0 @@ -file(GLOB HEADER_FILES ../../include/fst/extensions/special/*.h) -message(STATUS "${HEADER_FILES}") - -if(HAVE_BIN) - add_executable(fstspecial-bin - ../../bin/fstconvert.cc - ../../bin/fstconvert-main.cc - phi-fst.cc - rho-fst.cc - sigma-fst.cc - ) - - set_target_properties(fstspecial-bin PROPERTIES - FOLDER special/bin - OUTPUT_NAME fstspecial - ) - - target_link_libraries(fstspecial-bin - fstscript - fst - ${CMAKE_DL_LIBS} - ) -endif(HAVE_BIN) - - -add_library(fstspecial - phi-fst.cc - rho-fst.cc - sigma-fst.cc - ${HEADER_FILES} -) - -set_target_properties(fstspecial PROPERTIES - SOVERSION "${SOVERSION}" - FOLDER special -) -target_link_libraries(fstspecial - fst -) - -set(FST_SPECIAL_INSTALL_TARGETS fstspecial) -if(HAVE_BIN) - list(APPEND FST_SPECIAL_INSTALL_TARGETS fstspecial-bin) -endif() - -install(TARGETS ${FST_SPECIAL_INSTALL_TARGETS} - LIBRARY DESTINATION lib - RUNTIME DESTINATION bin - ARCHIVE DESTINATION lib -) - -function (add_module _name) - add_library(${ARGV}) - if (TARGET ${_name}) - target_link_libraries(${_name} fst) - set_target_properties(${_name} - PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true - FOLDER special/modules - ) - endif() - - install(TARGETS ${_name} LIBRARY DESTINATION lib/fst) -endfunction() - -add_module(phi-fst MODULE phi-fst.cc) -add_module(rho-fst MODULE rho-fst.cc) -add_module(sigma-fst MODULE sigma-fst.cc) diff --git 
a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/patch/openfst/src/include/fst/flags.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/patch/openfst/src/include/fst/flags.h deleted file mode 100644 index b5ec8ff7416774a0612ae0fe7e008a630b289dd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/patch/openfst/src/include/fst/flags.h +++ /dev/null @@ -1,228 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// See www.openfst.org for extensive documentation on this weighted -// finite-state transducer library. -// -// Google-style flag handling declarations and inline definitions. - -#ifndef FST_LIB_FLAGS_H_ -#define FST_LIB_FLAGS_H_ - -#include - -#include -#include -#include -#include -#include - -#include -#include - -#include "gflags/gflags.h" -#include "glog/logging.h" - -using std::string; - -// FLAGS USAGE: -// -// Definition example: -// -// DEFINE_int32(length, 0, "length"); -// -// This defines variable FLAGS_length, initialized to 0. -// -// Declaration example: -// -// DECLARE_int32(length); -// -// SET_FLAGS() can be used to set flags from the command line -// using, for example, '--length=2'. -// -// ShowUsage() can be used to print out command and flag usage. - -// #define DECLARE_bool(name) extern bool FLAGS_ ## name -// #define DECLARE_string(name) extern string FLAGS_ ## name -// #define DECLARE_int32(name) extern int32 FLAGS_ ## name -// #define DECLARE_int64(name) extern int64 FLAGS_ ## name -// #define DECLARE_double(name) extern double FLAGS_ ## name - -template -struct FlagDescription { - FlagDescription(T *addr, const char *doc, const char *type, - const char *file, const T val) - : address(addr), - doc_string(doc), - type_name(type), - file_name(file), - default_value(val) {} - - T *address; - const char *doc_string; - const char *type_name; - const char *file_name; - const T default_value; -}; - -template -class FlagRegister { - public: - static FlagRegister *GetRegister() { - static auto reg = new FlagRegister; - return reg; - } - - const FlagDescription &GetFlagDescription(const string &name) const { - fst::MutexLock l(&flag_lock_); - auto it = flag_table_.find(name); - return it != flag_table_.end() ? 
it->second : 0; - } - - void SetDescription(const string &name, - const FlagDescription &desc) { - fst::MutexLock l(&flag_lock_); - flag_table_.insert(make_pair(name, desc)); - } - - bool SetFlag(const string &val, bool *address) const { - if (val == "true" || val == "1" || val.empty()) { - *address = true; - return true; - } else if (val == "false" || val == "0") { - *address = false; - return true; - } - else { - return false; - } - } - - bool SetFlag(const string &val, string *address) const { - *address = val; - return true; - } - - bool SetFlag(const string &val, int32 *address) const { - char *p = 0; - *address = strtol(val.c_str(), &p, 0); - return !val.empty() && *p == '\0'; - } - - bool SetFlag(const string &val, int64 *address) const { - char *p = 0; - *address = strtoll(val.c_str(), &p, 0); - return !val.empty() && *p == '\0'; - } - - bool SetFlag(const string &val, double *address) const { - char *p = 0; - *address = strtod(val.c_str(), &p); - return !val.empty() && *p == '\0'; - } - - bool SetFlag(const string &arg, const string &val) const { - for (typename std::map< string, FlagDescription >::const_iterator it = - flag_table_.begin(); - it != flag_table_.end(); - ++it) { - const string &name = it->first; - const FlagDescription &desc = it->second; - if (arg == name) - return SetFlag(val, desc.address); - } - return false; - } - - void GetUsage(std::set> *usage_set) const { - for (auto it = flag_table_.begin(); it != flag_table_.end(); ++it) { - const string &name = it->first; - const FlagDescription &desc = it->second; - string usage = " --" + name; - usage += ": type = "; - usage += desc.type_name; - usage += ", default = "; - usage += GetDefault(desc.default_value) + "\n "; - usage += desc.doc_string; - usage_set->insert(make_pair(desc.file_name, usage)); - } - } - - private: - string GetDefault(bool default_value) const { - return default_value ? "true" : "false"; - } - - string GetDefault(const string &default_value) const { - return "\"" + default_value + "\""; - } - - template - string GetDefault(const V &default_value) const { - std::ostringstream strm; - strm << default_value; - return strm.str(); - } - - mutable fst::Mutex flag_lock_; // Multithreading lock. - std::map> flag_table_; -}; - -template -class FlagRegisterer { - public: - FlagRegisterer(const string &name, const FlagDescription &desc) { - auto registr = FlagRegister::GetRegister(); - registr->SetDescription(name, desc); - } - - private: - FlagRegisterer(const FlagRegisterer &) = delete; - FlagRegisterer &operator=(const FlagRegisterer &) = delete; -}; - - -#define DEFINE_VAR(type, name, value, doc) \ - type FLAGS_ ## name = value; \ - static FlagRegisterer \ - name ## _flags_registerer(#name, FlagDescription(&FLAGS_ ## name, \ - doc, \ - #type, \ - __FILE__, \ - value)) - -// #define DEFINE_bool(name, value, doc) DEFINE_VAR(bool, name, value, doc) -// #define DEFINE_string(name, value, doc) \ -// DEFINE_VAR(string, name, value, doc) -// #define DEFINE_int32(name, value, doc) DEFINE_VAR(int32, name, value, doc) -// #define DEFINE_int64(name, value, doc) DEFINE_VAR(int64, name, value, doc) -// #define DEFINE_double(name, value, doc) DEFINE_VAR(double, name, value, doc) - - -// Temporary directory. 
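Tying the FLAGS USAGE comment above together, a usage sketch assuming the gflags-backed macros this patched header pulls in (the flag name is made up; this is a sketch, not a guaranteed build recipe):

```cpp
// Usage sketch only, assuming the gflags-backed macros this patched header
// relies on (DEFINE_int32 comes from gflags/gflags.h, and SET_FLAGS forwards
// to gflags::ParseCommandLineFlags). The flag name is made up.
#include <iostream>

#include <fst/flags.h>

DEFINE_int32(length, 0, "length");

int main(int argc, char **argv) {
  SET_FLAGS(argv[0], &argc, &argv, true);  // accepts e.g. --length=2
  std::cout << "length = " << FLAGS_length << std::endl;
  return 0;
}
```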
-DECLARE_string(tmpdir); - -void SetFlags(const char *usage, int *argc, char ***argv, bool remove_flags, - const char *src = ""); - -#define SET_FLAGS(usage, argc, argv, rmflags) \ -gflags::ParseCommandLineFlags(argc, argv, true) -// SetFlags(usage, argc, argv, rmflags, __FILE__) - -// Deprecated; for backward compatibility. -inline void InitFst(const char *usage, int *argc, char ***argv, bool rmflags) { - return SetFlags(usage, argc, argv, rmflags); -} - -void ShowUsage(bool long_usage = true); - -#endif // FST_LIB_FLAGS_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/patch/openfst/src/include/fst/log.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/patch/openfst/src/include/fst/log.h deleted file mode 100644 index bf041c58ebfab73d03bb14adf28c7c7916a2217d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/patch/openfst/src/include/fst/log.h +++ /dev/null @@ -1,82 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// See www.openfst.org for extensive documentation on this weighted -// finite-state transducer library. -// -// Google-style logging declarations and inline definitions. - -#ifndef FST_LIB_LOG_H_ -#define FST_LIB_LOG_H_ - -#include -#include -#include - -#include -#include - -using std::string; - -DECLARE_int32(v); - -class LogMessage { - public: - LogMessage(const string &type) : fatal_(type == "FATAL") { - std::cerr << type << ": "; - } - ~LogMessage() { - std::cerr << std::endl; - if(fatal_) - exit(1); - } - std::ostream &stream() { return std::cerr; } - - private: - bool fatal_; -}; - -// #define LOG(type) LogMessage(#type).stream() -// #define VLOG(level) if ((level) <= FLAGS_v) LOG(INFO) - -// Checks -inline void FstCheck(bool x, const char* expr, - const char *file, int line) { - if (!x) { - LOG(FATAL) << "Check failed: \"" << expr - << "\" file: " << file - << " line: " << line; - } -} - -// #define CHECK(x) FstCheck(static_cast(x), #x, __FILE__, __LINE__) -// #define CHECK_EQ(x, y) CHECK((x) == (y)) -// #define CHECK_LT(x, y) CHECK((x) < (y)) -// #define CHECK_GT(x, y) CHECK((x) > (y)) -// #define CHECK_LE(x, y) CHECK((x) <= (y)) -// #define CHECK_GE(x, y) CHECK((x) >= (y)) -// #define CHECK_NE(x, y) CHECK((x) != (y)) - -// Debug checks -// #define DCHECK(x) assert(x) -// #define DCHECK_EQ(x, y) DCHECK((x) == (y)) -// #define DCHECK_LT(x, y) DCHECK((x) < (y)) -// #define DCHECK_GT(x, y) DCHECK((x) > (y)) -// #define DCHECK_LE(x, y) DCHECK((x) <= (y)) -// #define DCHECK_GE(x, y) DCHECK((x) >= (y)) -// #define DCHECK_NE(x, y) DCHECK((x) != (y)) - - -// Ports -#define ATTRIBUTE_DEPRECATED __attribute__((deprecated)) - -#endif // FST_LIB_LOG_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/post_processor/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/post_processor/CMakeLists.txt deleted file mode 100644 index 
6113bbc26eb8fe35e4e17ffd1cab382f0fb0f1f8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/post_processor/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -add_library(post_processor STATIC - post_processor.cc -) -target_link_libraries(post_processor PUBLIC utils) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/post_processor/post_processor.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/post_processor/post_processor.cc deleted file mode 100644 index 315f62d34cbc441ecbaf7c07667eb35ee61c2c8d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/post_processor/post_processor.cc +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License - -#include "post_processor/post_processor.h" - -#include -#include - -#include "utils/string.h" - -namespace wenet { - -std::string PostProcessor::ProcessSpace(const std::string& str) { - std::string result = str; - // 1. remove ' ' if needed - // only spaces between mandarin words need to be removed, please note that - // if str contains '_', we assume that the decoding type must be - // `CtcPrefixBeamSearch` and this branch will do nothing since str must be - // obtained via "".join() (in function `AsrDecoder::UpdateResult()`) - if (opts_.language_type == kMandarinEnglish && !str.empty()) { - result.clear(); - // split str by ' ' - std::vector words; - std::stringstream ss(str); - std::string tmp; - while (ss >> tmp) { - words.push_back(tmp); - } - // check english word - bool is_englishword_prev = false; - bool is_englishword_now = false; - for (std::string& w : words) { - is_englishword_now = CheckEnglishWord(w); - if (is_englishword_prev && is_englishword_now) { - result += (' ' + w); - } else { - result += (w); - } - is_englishword_prev = is_englishword_now; - } - } - // 2. 
replace '_' with ' ' - // this should be done for all cases (both kMandarinEnglish and kIndoEuropean) - result = ProcessBlank(result, opts_.lowercase); - return result; -} - -std::string PostProcessor::Process(const std::string& str, bool finish) { - std::string result; - result = ProcessSpace(str); - // TODO(xcsong): do itn/punctuation if finish == true - return result; -} - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/post_processor/post_processor.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/post_processor/post_processor.h deleted file mode 100644 index 54597845ebc88ad22e1244d2e693e2088cff6d21..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/post_processor/post_processor.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License - -#ifndef POST_PROCESSOR_POST_PROCESSOR_H_ -#define POST_PROCESSOR_POST_PROCESSOR_H_ - -#include -#include -#include - -#include "utils/utils.h" - -namespace wenet { - -enum LanguageType { - // spaces between **mandarin words** should be removed. - // cases of processing spaces with mandarin-only, english-only - // and mandarin-english code-switch can be found in post_processor_test.cc - kMandarinEnglish = 0x00, - // spaces should be kept for most of the - // Indo-European languages (i.e., deutsch or english-deutsch code-switch). - // cases of those languages can be found in post_processor_test.cc - kIndoEuropean = 0x01 -}; - -struct PostProcessOptions { - // space options - // The decoded result may contain spaces (' ' or '_'), - // we will process those spaces according to language_type. 
More details can - // be found in - // https://github.com/wenet-e2e/wenet/issues/583#issuecomment-907994058 - LanguageType language_type = kMandarinEnglish; - // whether lowercase letters are required - bool lowercase = true; -}; - -// TODO(xcsong): add itn/punctuation related resource -struct PostProcessResource {}; - -// Post Processor -class PostProcessor { - public: - explicit PostProcessor(PostProcessOptions&& opts) : opts_(std::move(opts)) {} - explicit PostProcessor(const PostProcessOptions& opts) : opts_(opts) {} - // call other functions to do post processing - std::string Process(const std::string& str, bool finish); - // process spaces according to configurations - std::string ProcessSpace(const std::string& str); - // TODO(xcsong): add itn/punctuation - // void InverseTN(const std::string& str); - // void Punctuate(const std::string& str); - - private: - const PostProcessOptions opts_; - - public: - WENET_DISALLOW_COPY_AND_ASSIGN(PostProcessor); -}; - -} // namespace wenet - -#endif // POST_PROCESSOR_POST_PROCESSOR_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/test/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/test/CMakeLists.txt deleted file mode 100644 index 145654105350e91a5f9121b47197f5fc60663f5c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/test/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -link_libraries(gtest_main gmock) - -add_executable(utils_test utils_test.cc) -target_link_libraries(utils_test PUBLIC utils) -add_test(UTILS_TEST utils_test) - -add_executable(ctc_prefix_beam_search_test ctc_prefix_beam_search_test.cc) -target_link_libraries(ctc_prefix_beam_search_test PUBLIC decoder) -add_test(CTC_PREFIX_BEAM_SEARCH_TEST ctc_prefix_beam_search_test) - -add_executable(post_processor_test post_processor_test.cc) -target_link_libraries(post_processor_test PUBLIC post_processor) -add_test(POST_PROCESSOR_TEST post_processor_test) - - -add_executable(feature_pipeline_test feature_pipeline_test.cc) -target_link_libraries(feature_pipeline_test PUBLIC frontend) -add_test(FEATURE_PIPELINE_TEST feature_pipeline_test) \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/test/ctc_prefix_beam_search_test.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/test/ctc_prefix_beam_search_test.cc deleted file mode 100644 index d8f3b65693b934beb33f3a770795f0b6e7ce3456..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/test/ctc_prefix_beam_search_test.cc +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include "decoder/ctc_prefix_beam_search.h" - -#include -#include - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -#include "utils/utils.h" - -TEST(CtcPrefixBeamSearchTest, CtcPrefixBeamSearchLogicTest) { - using ::testing::ElementsAre; - // See https://robin1001.github.io/2020/12/11/ctc-search for the - // graph demonstration of the data - std::vector> data = { - {0.25, 0.40, 0.35}, {0.40, 0.35, 0.25}, {0.10, 0.50, 0.40}}; - // Apply log - for (int i = 0; i < data.size(); i++) { - for (int j = 0; j < data[i].size(); j++) { - data[i][j] = std::log(data[i][j]); - } - } - wenet::CtcPrefixBeamSearchOptions option; - option.first_beam_size = 3; - option.second_beam_size = 3; - wenet::CtcPrefixBeamSearch prefix_beam_search(option); - prefix_beam_search.Search(data); - /* Test case info - | top k | result index | prefix score | viterbi score | timestamp | - |-------|--------------|--------------|---------------|-----------| - | top 1 | [2, 1] | 0.2185 | 0.07 | [0, 2] | - | top 2 | [1, 2] | 0.1550 | 0.064 | [0, 2] | - | top 3 | [1] | 0.1525 | 0.07 | [2] | - */ - const std::vector>& result = prefix_beam_search.Outputs(); - EXPECT_EQ(result.size(), 3); - ASSERT_THAT(result[0], ElementsAre(2, 1)); - ASSERT_THAT(result[1], ElementsAre(1, 2)); - ASSERT_THAT(result[2], ElementsAre(1)); - - const std::vector& likelihood = prefix_beam_search.Likelihood(); - EXPECT_EQ(likelihood.size(), 3); - EXPECT_FLOAT_EQ(std::exp(likelihood[0]), 0.2185); - EXPECT_FLOAT_EQ(std::exp(likelihood[1]), 0.1550); - EXPECT_FLOAT_EQ(std::exp(likelihood[2]), 0.1525); - - const std::vector& viterbi_likelihood = - prefix_beam_search.viterbi_likelihood(); - EXPECT_EQ(viterbi_likelihood.size(), 3); - EXPECT_FLOAT_EQ(std::exp(viterbi_likelihood[0]), 0.07); - EXPECT_FLOAT_EQ(std::exp(viterbi_likelihood[1]), 0.064); - EXPECT_FLOAT_EQ(std::exp(viterbi_likelihood[2]), 0.07); - - const std::vector>& times = prefix_beam_search.Times(); - EXPECT_EQ(times.size(), 3); - ASSERT_THAT(times[0], ElementsAre(0, 2)); - ASSERT_THAT(times[1], ElementsAre(0, 2)); - ASSERT_THAT(times[2], ElementsAre(2)); -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/test/feature_pipeline_test.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/test/feature_pipeline_test.cc deleted file mode 100644 index 244ec0735b6086211b476e8d97569e1ee5959bc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/test/feature_pipeline_test.cc +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) 2022 Roney -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include - -#include "frontend/feature_pipeline.h" -#include "utils/blocking_queue.h" - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -void pushQueue(const std::shared_ptr>& que, - std::vector vec) { - que->Push(vec); -} - -void popQueue(const std::shared_ptr>& que, int num, - int back_data) { - auto pop_data = que->Pop(num); - ASSERT_EQ(pop_data[num - 1], back_data); -} - -TEST(FeaturePipelineTest, BlockingQueueTest) { - auto capacity_queue = std::make_shared>(2); - std::vector test_data{1, 2, 3, 4, 5}; - std::thread push_thread(&pushQueue, capacity_queue, test_data); - ASSERT_EQ(capacity_queue->Pop(), 1); - ASSERT_LE(capacity_queue->Size(), 2); // capacity_queue: 2 or 2,3 - auto pop_data = capacity_queue->Pop(3); // 2,3,4 num > capacity - ASSERT_EQ(pop_data.size(), 3); - ASSERT_EQ(pop_data[2], 4); - push_thread.join(); - ASSERT_EQ(capacity_queue->Size(), 1); // capacity_queue:5 - - std::thread pop_thread(&popQueue, capacity_queue, 3, 0); // num > capacity - capacity_queue->Push(9); // capacity_queue:5,9 - capacity_queue->Push(0); // capacity_queue:5,9,0 - pop_thread.join(); // capacity_queue: - ASSERT_EQ(capacity_queue->Size(), 0); - - pop_data = capacity_queue->Pop(0); - ASSERT_TRUE(pop_data.empty()); -} - -TEST(FeaturePipelineTest, PipelineTest) { - wenet::FeaturePipelineConfig config(80, 8000); - wenet::FeaturePipeline feature_pipeline(config); - int audio_len = 8 * 55; // audio len 55ms,4 frames - std::vector pcm(audio_len, 0); - feature_pipeline.AcceptWaveform(pcm.data(), audio_len); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 4); - - std::vector> out_feats; - auto b = feature_pipeline.Read(2, &out_feats); - ASSERT_TRUE(b); - ASSERT_EQ(out_feats.size(), 2); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 2); - - std::vector out_feat; - b = feature_pipeline.ReadOne(&out_feat); - ASSERT_TRUE(b); - ASSERT_FALSE(out_feat.empty()); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 1); - - feature_pipeline.set_input_finished(); - b = feature_pipeline.Read(2, &out_feats); - ASSERT_FALSE(b); - ASSERT_EQ(out_feats.size(), 1); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 0); - - feature_pipeline.AcceptWaveform(pcm.data(), audio_len); - feature_pipeline.Read(2, &out_feats); - feature_pipeline.Reset(); - feature_pipeline.set_input_finished(); - b = feature_pipeline.Read(2, &out_feats); - ASSERT_FALSE(b); - ASSERT_EQ(out_feats.size(), 0); - ASSERT_EQ(feature_pipeline.NumQueuedFrames(), 0); -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/test/post_processor_test.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/test/post_processor_test.cc deleted file mode 100644 index fa11fa29231032d62389a93fd00b0ec782bf8a3b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/test/post_processor_test.cc +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License - -#include "post_processor/post_processor.h" - -#include -#include - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -#include "utils/utils.h" - -TEST(PostProcessorTest, ProcessSpacekMandarinEnglishTest) { - wenet::PostProcessOptions opts_lowercase; - wenet::PostProcessor post_processor_lowercase(opts_lowercase); - - wenet::PostProcessOptions opts_uppercase; - opts_uppercase.lowercase = false; - wenet::PostProcessor post_processor_uppercase(opts_uppercase); - - std::vector input = { - // modeling unit: mandarin character - // decode type: CtcPrefixBeamSearch, "".join() - "震东好帅", - // modeling unit: mandarin word - // decode type: CtcWfstBeamSearch, " ".join() - " 吴迪 也 好帅", - // modeling unit: english wordpiece - // decode type: CtcPrefixBeamSearch, "".join() - "▁binbin▁is▁also▁handsome", - // modeling unit: english word - // decode type: CtcWfstBeamSearch, " ".join() - " life is short i use wenet", - // modeling unit: mandarin character + english wordpiece - // decode type: CtcPrefixBeamSearch, "".join() - "超哥▁is▁the▁most▁handsome", - // modeling unit: mandarin word + english word - // decode type: CtcWfstBeamSearch, " ".join() - " 人生 苦短 i use wenet", - }; - - std::vector result_lowercase = { - "震东好帅", - "吴迪也好帅", - "binbin is also handsome", - "life is short i use wenet", - "超哥 is the most handsome", - "人生苦短i use wenet", - }; - - std::vector result_uppercase = { - "震东好帅", - "吴迪也好帅", - "BINBIN IS ALSO HANDSOME", - "LIFE IS SHORT I USE WENET", - "超哥 IS THE MOST HANDSOME", - "人生苦短I USE WENET", - }; - - for (size_t i = 0; i < input.size(); ++i) { - EXPECT_EQ(post_processor_lowercase.ProcessSpace(input[i]), - result_lowercase[i]); - EXPECT_EQ(post_processor_uppercase.ProcessSpace(input[i]), - result_uppercase[i]); - } -} - -TEST(PostProcessorTest, ProcessSpacekIndoEuropeanTest) { - wenet::PostProcessOptions opts_lowercase; - opts_lowercase.language_type = wenet::kIndoEuropean; - wenet::PostProcessor post_processor_lowercase(opts_lowercase); - - wenet::PostProcessOptions opts_uppercase; - opts_uppercase.language_type = wenet::kIndoEuropean; - opts_uppercase.lowercase = false; - wenet::PostProcessor post_processor_uppercase(opts_uppercase); - - std::vector input = { - // modeling unit: wordpiece - // decode type: CtcPrefixBeamSearch, "".join() - "▁zhendong▁ist▁so▁schön", - // modeling unit: word - // decode type: CtcWfstBeamSearch, " ".join() - " zhendong ist so schön"}; - - std::vector result_lowercase = {"zhendong ist so schön", - "zhendong ist so schön"}; - - std::vector result_uppercase = {"ZHENDONG IST SO SCHÖN", - "ZHENDONG IST SO SCHÖN"}; - - for (size_t i = 0; i < input.size(); ++i) { - EXPECT_EQ(post_processor_lowercase.ProcessSpace(input[i]), - result_lowercase[i]); - EXPECT_EQ(post_processor_uppercase.ProcessSpace(input[i]), - result_uppercase[i]); - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/test/utils_test.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/test/utils_test.cc deleted file mode 100644 index 6b2bbac25e000ce854d5e55a50cb51109d62d758..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/test/utils_test.cc +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the 
License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "utils/utils.h" - -#include - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -TEST(UtilsTest, TopKTest) { - using ::testing::ElementsAre; - using ::testing::FloatNear; - using ::testing::Pointwise; - std::vector data = {1, 3, 5, 7, 9, 2, 4, 6, 8, 10}; - std::vector values; - std::vector indices; - wenet::TopK(data, 3, &values, &indices); - EXPECT_THAT(values, Pointwise(FloatNear(1e-8), {10, 9, 8})); - ASSERT_THAT(indices, ElementsAre(9, 4, 8)); -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/toolchains/aarch64-linux-gnu.toolchain.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/toolchains/aarch64-linux-gnu.toolchain.cmake deleted file mode 100644 index 9ad37cba9eb6fa58aa194ece96cf9a5da472a76d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/toolchains/aarch64-linux-gnu.toolchain.cmake +++ /dev/null @@ -1,5 +0,0 @@ -set(CMAKE_SYSTEM_NAME Linux) -SET (CMAKE_SYSTEM_PROCESSOR aarch64) - -set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc) -set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/toolchains/ios.toolchain.cmake b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/toolchains/ios.toolchain.cmake deleted file mode 100644 index 2bcb0adf7b07c0c5fd5bf16d1b687050579ba673..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/toolchains/ios.toolchain.cmake +++ /dev/null @@ -1,1014 +0,0 @@ -# This file is part of the ios-cmake project. It was retrieved from -# https://github.com/leetal/ios-cmake.git, which is a fork of -# https://github.com/gerstrong/ios-cmake.git, which is a fork of -# https://github.com/cristeab/ios-cmake.git, which is a fork of -# https://code.google.com/p/ios-cmake/. Which in turn is based off of -# the Platform/Darwin.cmake and Platform/UnixPaths.cmake files which -# are included with CMake 2.8.4 -# -# The ios-cmake project is licensed under the new BSD license. -# -# Copyright (c) 2014, Bogdan Cristea and LTE Engineering Software, -# Kitware, Inc., Insight Software Consortium. All rights reserved. -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# 1. Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -# -# This file is based off of the Platform/Darwin.cmake and -# Platform/UnixPaths.cmake files which are included with CMake 2.8.4 -# It has been altered for iOS development. -# -# Updated by Alex Stewart (alexs.mac@gmail.com) -# -# ***************************************************************************** -# Now maintained by Alexander Widerberg (widerbergaren [at] gmail.com) -# under the BSD-3-Clause license -# https://github.com/leetal/ios-cmake -# ***************************************************************************** -# -# INFORMATION / HELP -# -############################################################################### -# OPTIONS # -############################################################################### -# -# PLATFORM: (default "OS64") -# OS = Build for iPhoneOS. -# OS64 = Build for arm64 iphoneOS. -# OS64COMBINED = Build for arm64 x86_64 iphoneOS + iphoneOS Simulator. Combined into FAT STATIC lib (only supported on 3.14+ of CMake with "-G Xcode" argument in combination with the "cmake --install" CMake build step) -# SIMULATOR = Build for x86 i386 iphoneOS Simulator. -# SIMULATOR64 = Build for x86_64 iphoneOS Simulator. -# SIMULATORARM64 = Build for arm64 iphoneOS Simulator. -# TVOS = Build for arm64 tvOS. -# TVOSCOMBINED = Build for arm64 x86_64 tvOS + tvOS Simulator. Combined into FAT STATIC lib (only supported on 3.14+ of CMake with "-G Xcode" argument in combination with the "cmake --install" CMake build step) -# SIMULATOR_TVOS = Build for x86_64 tvOS Simulator. -# WATCHOS = Build for armv7k arm64_32 for watchOS. -# WATCHOSCOMBINED = Build for armv7k arm64_32 x86_64 watchOS + watchOS Simulator. Combined into FAT STATIC lib (only supported on 3.14+ of CMake with "-G Xcode" argument in combination with the "cmake --install" CMake build step) -# SIMULATOR_WATCHOS = Build for x86_64 for watchOS Simulator. -# MAC = Build for x86_64 macOS. -# MAC_ARM64 = Build for Apple Silicon macOS. -# MAC_CATALYST = Build for x86_64 macOS with Catalyst support (iOS toolchain on macOS). -# Note: The build argument "MACOSX_DEPLOYMENT_TARGET" can be used to control min-version of macOS -# MAC_CATALYST_ARM64 = Build for Apple Silicon macOS with Catalyst support (iOS toolchain on macOS). -# Note: The build argument "MACOSX_DEPLOYMENT_TARGET" can be used to control min-version of macOS -# -# CMAKE_OSX_SYSROOT: Path to the SDK to use. By default this is -# automatically determined from PLATFORM and xcodebuild, but -# can also be manually specified (although this should not be required). -# -# CMAKE_DEVELOPER_ROOT: Path to the Developer directory for the platform -# being compiled for. 
By default this is automatically determined from -# CMAKE_OSX_SYSROOT, but can also be manually specified (although this should -# not be required). -# -# DEPLOYMENT_TARGET: Minimum SDK version to target. Default 2.0 on watchOS and 9.0 on tvOS+iOS -# -# NAMED_LANGUAGE_SUPPORT: -# ON (default) = Will require "enable_language(OBJC) and/or enable_language(OBJCXX)" for full OBJC|OBJCXX support -# OFF = Will embed the OBJC and OBJCXX flags into the CMAKE_C_FLAGS and CMAKE_CXX_FLAGS (legacy behaviour, CMake version < 3.16) -# -# ENABLE_BITCODE: (ON|OFF) Enables or disables bitcode support. Default ON -# -# ENABLE_ARC: (ON|OFF) Enables or disables ARC support. Default ON (ARC enabled by default) -# -# ENABLE_VISIBILITY: (ON|OFF) Enables or disables symbol visibility support. Default OFF (visibility hidden by default) -# -# ENABLE_STRICT_TRY_COMPILE: (ON|OFF) Enables or disables strict try_compile() on all Check* directives (will run linker -# to actually check if linking is possible). Default OFF (will set CMAKE_TRY_COMPILE_TARGET_TYPE to STATIC_LIBRARY) -# -# ARCHS: (armv7 armv7s armv7k arm64 arm64_32 i386 x86_64) If specified, will override the default architectures for the given PLATFORM -# OS = armv7 armv7s arm64 (if applicable) -# OS64 = arm64 (if applicable) -# SIMULATOR = i386 -# SIMULATOR64 = x86_64 -# SIMULATORARM64 = arm64 -# TVOS = arm64 -# SIMULATOR_TVOS = x86_64 (i386 has since long been deprecated) -# WATCHOS = armv7k arm64_32 (if applicable) -# SIMULATOR_WATCHOS = x86_64 (i386 has since long been deprecated) -# MAC = x86_64 -# MAC_ARM64 = arm64 -# MAC_CATALYST = x86_64 -# MAC_CATALYST_ARM64 = arm64 -# -# NOTE: When manually specifying ARCHS, put a semi-colon between the entries. E.g., -DARCHS="armv7;arm64" -# -############################################################################### -# END OPTIONS # -############################################################################### -# -# This toolchain defines the following properties (available via get_property()) for use externally: -# -# PLATFORM: The currently targeted platform. -# XCODE_VERSION: Version number (not including Build version) of Xcode detected. -# SDK_VERSION: Version of SDK being used. -# OSX_ARCHITECTURES: Architectures being compiled for (generated from PLATFORM). -# APPLE_TARGET_TRIPLE: Used by autoconf build systems. NOTE: If "ARCHS" are overridden, this will *NOT* be set! -# -# This toolchain defines the following macros for use externally: -# -# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE XCODE_VARIANT) -# A convenience macro for setting xcode specific properties on targets. -# Available variants are: All, Release, RelWithDebInfo, Debug, MinSizeRel -# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1" "all"). -# -# find_host_package (PROGRAM ARGS) -# A macro used to find executable programs on the host system, not within the -# environment. Thanks to the android-cmake project for providing the -# command. -# - -cmake_minimum_required(VERSION 3.8.0) - -# CMake invokes the toolchain file twice during the first build, but only once during subsequent rebuilds. 
-if(DEFINED ENV{_IOS_TOOLCHAIN_HAS_RUN}) - return() -endif() -set(ENV{_IOS_TOOLCHAIN_HAS_RUN} true) - -# List of supported platform values -list(APPEND _supported_platforms - "OS" "OS64" "OS64COMBINED" "SIMULATOR" "SIMULATOR64" "SIMULATORARM64" - "TVOS" "TVOSCOMBINED" "SIMULATOR_TVOS" - "WATCHOS" "WATCHOSCOMBINED" "SIMULATOR_WATCHOS" - "MAC" "MAC_ARM64" - "MAC_CATALYST" "MAC_CATALYST_ARM64") - -# Cache what generator is used -set(USED_CMAKE_GENERATOR "${CMAKE_GENERATOR}") - -# Check if using a CMake version capable of building combined FAT builds (simulator and target slices combined in one static lib) -if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.14") - set(MODERN_CMAKE YES) -endif() - -# Get the Xcode version being used. -# Problem: CMake runs toolchain files multiple times, but can't read cache variables on some runs. -# Workaround: On first run (in which cache variables are always accessible), set an intermediary environment variable. -# -# NOTE: This pattern is used i many places in this toolchain to speed up checks of all sorts -if(DEFINED XCODE_VERSION_INT) - # Environment variables are always preserved. - set(ENV{_XCODE_VERSION_INT} "${XCODE_VERSION_INT}") -elseif(DEFINED ENV{_XCODE_VERSION_INT}) - set(XCODE_VERSION_INT "$ENV{_XCODE_VERSION_INT}") -elseif(NOT DEFINED XCODE_VERSION_INT) - find_program(XCODEBUILD_EXECUTABLE xcodebuild) - if(NOT XCODEBUILD_EXECUTABLE) - message(FATAL_ERROR "xcodebuild not found. Please install either the standalone commandline tools or Xcode.") - endif() - execute_process(COMMAND ${XCODEBUILD_EXECUTABLE} -version - OUTPUT_VARIABLE XCODE_VERSION_INT - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) - string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION_INT "${XCODE_VERSION_INT}") - string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION_INT "${XCODE_VERSION_INT}") - set(XCODE_VERSION_INT "${XCODE_VERSION_INT}" CACHE INTERNAL "") -endif() - -# Assuming that xcode 12.0 is installed you most probably have ios sdk 14.0 or later installed (tested on Big Sur) -# if you don't set a deployment target it will be set the way you only get 64-bit builds -if(NOT DEFINED DEPLOYMENT_TARGET AND XCODE_VERSION_INT VERSION_GREATER 12.0) - # Temporarily fix the arm64 issues in CMake install-combined by excluding arm64 for simulator builds (needed for Apple Silicon...) - set(CMAKE_XCODE_ATTRIBUTE_EXCLUDED_ARCHS[sdk=iphonesimulator*] "arm64") -endif() - -# Check if the platform variable is set -if(DEFINED PLATFORM) - # Environment variables are always preserved. - set(ENV{_PLATFORM} "${PLATFORM}") -elseif(DEFINED ENV{_PLATFORM}) - set(PLATFORM "$ENV{_PLATFORM}") -elseif(NOT DEFINED PLATFORM) - message(FATAL_ERROR "PLATFORM argument not set. Bailing configure since I don't know what target you want to build for!") -endif () - -if(PLATFORM MATCHES ".*COMBINED" AND NOT CMAKE_GENERATOR MATCHES "Xcode") - message(FATAL_ERROR "The combined builds support requires Xcode to be used as generator via '-G Xcode' command-line argument in CMake") -endif() - -# Safeguard that the platform value is set and is one of the supported values -list(FIND _supported_platforms ${PLATFORM} contains_PLATFORM) -if("${contains_PLATFORM}" EQUAL "-1") - string(REPLACE ";" "\n * " _supported_platforms_formatted "${_supported_platforms}") - message(FATAL_ERROR " Invalid PLATFORM specified! 
Current value: ${PLATFORM}.\n" - " Supported PLATFORM values: \n * ${_supported_platforms_formatted}") -endif() - -# Check if Apple Silicon is supported -if(PLATFORM MATCHES "^(MAC_ARM64)$|^(MAC_CATALYST_ARM64)$" AND ${CMAKE_VERSION} VERSION_LESS "3.19.5") - message(FATAL_ERROR "Apple Silicon builds requires a minimum of CMake 3.19.5") -endif() - -# Touch toolchain variable to suppress "unused variable" warning. -# This happens if CMake is invoked with the same command line the second time. -if(CMAKE_TOOLCHAIN_FILE) -endif() - -# Fix for PThread library not in path -set(CMAKE_THREAD_LIBS_INIT "-lpthread") -set(CMAKE_HAVE_THREADS_LIBRARY 1) -set(CMAKE_USE_WIN32_THREADS_INIT 0) -set(CMAKE_USE_PTHREADS_INIT 1) - -# Specify named language support defaults. -if(NOT DEFINED NAMED_LANGUAGE_SUPPORT AND ${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.16") - set(NAMED_LANGUAGE_SUPPORT ON) - message(STATUS "[DEFAULTS] Using explicit named language support! E.g., enable_language(CXX) is needed in the project files.") -elseif(NOT DEFINED NAMED_LANGUAGE_SUPPORT AND ${CMAKE_VERSION} VERSION_LESS "3.16") - set(NAMED_LANGUAGE_SUPPORT OFF) - message(STATUS "[DEFAULTS] Disabling explicit named language support. Falling back to legacy behaviour.") -elseif(DEFINED NAMED_LANGUAGE_SUPPORT AND ${CMAKE_VERSION} VERSION_LESS "3.16") - message(FATAL_ERROR "CMake named language support for OBJC and OBJCXX was added in CMake 3.16.") -endif() -set(NAMED_LANGUAGE_SUPPORT_INT ${NAMED_LANGUAGE_SUPPORT} CACHE BOOL - "Whether or not to enable explicit named language support" FORCE) - -# Specify minimum version of deployment target. -if(NOT DEFINED DEPLOYMENT_TARGET) - if (PLATFORM MATCHES "WATCHOS") - # Unless specified, SDK version 4.0 is used by default as minimum target version (watchOS). - set(DEPLOYMENT_TARGET "4.0") - elseif(PLATFORM STREQUAL "MAC") - # Unless specified, SDK version 10.13 (High sierra) is used by default as minimum target version (macos). - set(DEPLOYMENT_TARGET "10.13") - elseif(PLATFORM STREQUAL "MAC_ARM64") - # Unless specified, SDK version 11.0 (Big Sur) is used by default as minimum target version (macos on arm). - set(DEPLOYMENT_TARGET "11.0") - elseif(PLATFORM STREQUAL "MAC_CATALYST" OR PLATFORM STREQUAL "MAC_CATALYST_ARM64") - # Unless specified, SDK version 13.0 is used by default as minimum target version (mac catalyst minimum requirement). - set(DEPLOYMENT_TARGET "13.1") - else() - # Unless specified, SDK version 11.0 is used by default as minimum target version (iOS, tvOS). - set(DEPLOYMENT_TARGET "11.0") - endif() - message(STATUS "[DEFAULTS] Using the default min-version since DEPLOYMENT_TARGET not provided!") -elseif(DEFINED DEPLOYMENT_TARGET AND PLATFORM MATCHES "^MAC_CATALYST" AND ${DEPLOYMENT_TARGET} VERSION_LESS "13.1") - message(FATAL_ERROR "Mac Catalyst builds requires a minimum deployment target of 13.1!") -endif() - -# Store the DEPLOYMENT_TARGET in the cache -set(DEPLOYMENT_TARGET "${DEPLOYMENT_TARGET}" CACHE INTERNAL "") - -# Handle the case where we are targeting iOS and a version above 10.3.4 (32-bit support dropped officially) -if(PLATFORM STREQUAL "OS" AND DEPLOYMENT_TARGET VERSION_GREATER_EQUAL 10.3.4) - set(PLATFORM "OS64") - message(STATUS "Targeting minimum SDK version ${DEPLOYMENT_TARGET}. Dropping 32-bit support.") -elseif(PLATFORM STREQUAL "SIMULATOR" AND DEPLOYMENT_TARGET VERSION_GREATER_EQUAL 10.3.4) - set(PLATFORM "SIMULATOR64") - message(STATUS "Targeting minimum SDK version ${DEPLOYMENT_TARGET}. 
Dropping 32-bit support.") -endif() - -set(PLATFORM_INT "${PLATFORM}") - -if(DEFINED ARCHS) - string(REPLACE ";" "-" ARCHS_SPLIT "${ARCHS}") -endif() - -# Determine the platform name and architectures for use in xcodebuild commands -# from the specified PLATFORM_INT name. -if(PLATFORM_INT STREQUAL "OS") - set(SDK_NAME iphoneos) - if(NOT ARCHS) - set(ARCHS armv7 armv7s arm64) - set(APPLE_TARGET_TRIPLE_INT arm-apple-ios${DEPLOYMENT_TARGET}) - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}) - endif() -elseif(PLATFORM_INT STREQUAL "OS64") - set(SDK_NAME iphoneos) - if(NOT ARCHS) - if (XCODE_VERSION_INT VERSION_GREATER 10.0) - set(ARCHS arm64) # FIXME: Add arm64e when Apple have fixed the integration issues with it, libarclite_iphoneos.a is currently missung bitcode markers for example - else() - set(ARCHS arm64) - endif() - set(APPLE_TARGET_TRIPLE_INT aarch64-apple-ios${DEPLOYMENT_TARGET}) - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}) - endif() -elseif(PLATFORM_INT STREQUAL "OS64COMBINED") - set(SDK_NAME iphoneos) - if(MODERN_CMAKE) - if(NOT ARCHS) - if (XCODE_VERSION_INT VERSION_GREATER 10.0) - set(ARCHS arm64 x86_64) # FIXME: Add arm64e when Apple have fixed the integration issues with it, libarclite_iphoneos.a is currently missung bitcode markers for example - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphoneos*] "arm64") - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphonesimulator*] "x86_64") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphoneos*] "arm64") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphonesimulator*] "x86_64") - else() - set(ARCHS arm64 x86_64) - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphoneos*] "arm64") - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphonesimulator*] "x86_64") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphoneos*] "arm64") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphonesimulator*] "x86_64") - endif() - set(APPLE_TARGET_TRIPLE_INT aarch64-x86_64-apple-ios${DEPLOYMENT_TARGET}) - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}) - endif() - else() - message(FATAL_ERROR "Please make sure that you are running CMake 3.14+ to make the OS64COMBINED setting work") - endif() -elseif(PLATFORM_INT STREQUAL "SIMULATOR") - set(SDK_NAME iphonesimulator) - if(NOT ARCHS) - set(ARCHS i386) - set(APPLE_TARGET_TRIPLE_INT i386-apple-ios${DEPLOYMENT_TARGET}-simulator) - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-simulator) - endif() - message(DEPRECATION "SIMULATOR IS DEPRECATED. 
Consider using SIMULATOR64 instead.") -elseif(PLATFORM_INT STREQUAL "SIMULATOR64") - set(SDK_NAME iphonesimulator) - if(NOT ARCHS) - set(ARCHS x86_64) - set(APPLE_TARGET_TRIPLE_INT x86_64-apple-ios${DEPLOYMENT_TARGET}-simulator) - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-simulator) - endif() -elseif(PLATFORM_INT STREQUAL "SIMULATORARM64") - set(SDK_NAME iphonesimulator) - if(NOT ARCHS) - set(ARCHS arm64) - set(APPLE_TARGET_TRIPLE_INT aarch64-apple-ios${DEPLOYMENT_TARGET}-simulator) - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-simulator) - endif() -elseif(PLATFORM_INT STREQUAL "TVOS") - set(SDK_NAME appletvos) - if(NOT ARCHS) - set(ARCHS arm64) - set(APPLE_TARGET_TRIPLE_INT aarch64-apple-tvos${DEPLOYMENT_TARGET}) - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-tvos${DEPLOYMENT_TARGET}) - endif() -elseif (PLATFORM_INT STREQUAL "TVOSCOMBINED") - set(SDK_NAME appletvos) - if(MODERN_CMAKE) - if(NOT ARCHS) - set(ARCHS arm64 x86_64) - set(APPLE_TARGET_TRIPLE_INT aarch64-x86_64-apple-tvos${DEPLOYMENT_TARGET}) - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=appletvos*] "arm64") - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=appletvsimulator*] "x86_64") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=appletvos*] "arm64") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=appletvsimulator*] "x86_64") - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-tvos${DEPLOYMENT_TARGET}) - endif() - else() - message(FATAL_ERROR "Please make sure that you are running CMake 3.14+ to make the TVOSCOMBINED setting work") - endif() -elseif(PLATFORM_INT STREQUAL "SIMULATOR_TVOS") - set(SDK_NAME appletvsimulator) - if(NOT ARCHS) - set(ARCHS x86_64) - set(APPLE_TARGET_TRIPLE_INT x86_64-apple-tvos${DEPLOYMENT_TARGET}-simulator) - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-tvos${DEPLOYMENT_TARGET}-simulator) - endif() -elseif(PLATFORM_INT STREQUAL "WATCHOS") - set(SDK_NAME watchos) - if(NOT ARCHS) - if (XCODE_VERSION_INT VERSION_GREATER 10.0) - set(ARCHS armv7k arm64_32) - set(APPLE_TARGET_TRIPLE_INT aarch64_32-apple-watchos${DEPLOYMENT_TARGET}) - else() - set(ARCHS armv7k) - set(APPLE_TARGET_TRIPLE_INT arm-apple-watchos${DEPLOYMENT_TARGET}) - endif() - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-watchos${DEPLOYMENT_TARGET}) - endif() -elseif(PLATFORM_INT STREQUAL "WATCHOSCOMBINED") - set(SDK_NAME watchos) - if(MODERN_CMAKE) - if(NOT ARCHS) - if (XCODE_VERSION_INT VERSION_GREATER 10.0) - set(ARCHS armv7k arm64_32 i386) - set(APPLE_TARGET_TRIPLE_INT aarch64_32-i386-apple-watchos${DEPLOYMENT_TARGET}) - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=watchos*] "armv7k arm64_32") - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=watchsimulator*] "i386") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=watchos*] "armv7k arm64_32") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=watchsimulator*] "i386") - else() - set(ARCHS armv7k i386) - set(APPLE_TARGET_TRIPLE_INT arm-i386-apple-watchos${DEPLOYMENT_TARGET}) - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=watchos*] "armv7k") - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=watchsimulator*] "i386") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=watchos*] "armv7k") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=watchsimulator*] "i386") - endif() - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-watchos${DEPLOYMENT_TARGET}) - endif() - else() - message(FATAL_ERROR "Please make sure that you are running CMake 3.14+ to make the WATCHOSCOMBINED setting work") - endif() -elseif(PLATFORM_INT STREQUAL "SIMULATOR_WATCHOS") - set(SDK_NAME 
watchsimulator) - if(NOT ARCHS) - set(ARCHS i386) - set(APPLE_TARGET_TRIPLE_INT i386-apple-watchos${DEPLOYMENT_TARGET}-simulator) - else() - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-watchos${DEPLOYMENT_TARGET}-simulator) - endif() -elseif(PLATFORM_INT STREQUAL "MAC" OR PLATFORM_INT STREQUAL "MAC_CATALYST") - set(SDK_NAME macosx) - if(NOT ARCHS) - set(ARCHS x86_64) - endif() - string(REPLACE ";" "-" ARCHS_SPLIT "${ARCHS}") - if(PLATFORM_INT STREQUAL "MAC") - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-macosx${DEPLOYMENT_TARGET}) - elseif(PLATFORM_INT STREQUAL "MAC_CATALYST") - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-macabi) - endif() -elseif(PLATFORM_INT MATCHES "^(MAC_ARM64)$|^(MAC_CATALYST_ARM64)$") - set(SDK_NAME macosx) - if(NOT ARCHS) - set(ARCHS arm64) - endif() - string(REPLACE ";" "-" ARCHS_SPLIT "${ARCHS}") - if(PLATFORM_INT STREQUAL "MAC_ARM64") - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-macosx${DEPLOYMENT_TARGET}) - elseif(PLATFORM_INT STREQUAL "MAC_CATALYST_ARM64") - set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-macabi) - endif() -else() - message(FATAL_ERROR "Invalid PLATFORM: ${PLATFORM_INT}") -endif() - -string(REPLACE ";" " " ARCHS_SPACED "${ARCHS}") - -if(MODERN_CMAKE AND PLATFORM_INT MATCHES ".*COMBINED" AND NOT CMAKE_GENERATOR MATCHES "Xcode") - message(FATAL_ERROR "The COMBINED options only work with Xcode generator, -G Xcode") -endif() - -if(CMAKE_GENERATOR MATCHES "Xcode" AND PLATFORM_INT MATCHES "^MAC_CATALYST") - set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++") - set(CMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS "macosx") - set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-maccatalyst") - if(NOT DEFINED MACOSX_DEPLOYMENT_TARGET) - set(CMAKE_XCODE_ATTRIBUTE_MACOSX_DEPLOYMENT_TARGET "10.15") - else() - set(CMAKE_XCODE_ATTRIBUTE_MACOSX_DEPLOYMENT_TARGET "${MACOSX_DEPLOYMENT_TARGET}") - endif() -elseif(CMAKE_GENERATOR MATCHES "Xcode") - set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++") - set(CMAKE_XCODE_ATTRIBUTE_IPHONEOS_DEPLOYMENT_TARGET "${DEPLOYMENT_TARGET}") - if(NOT PLATFORM_INT MATCHES ".*COMBINED") - set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=${SDK_NAME}*] "${ARCHS_SPACED}") - set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=${SDK_NAME}*] "${ARCHS_SPACED}") - endif() -endif() - -# If user did not specify the SDK root to use, then query xcodebuild for it. -if(DEFINED CMAKE_OSX_SYSROOT_INT) - # Environment variables are always preserved. - set(ENV{_CMAKE_OSX_SYSROOT_INT} "${CMAKE_OSX_SYSROOT_INT}") -elseif(DEFINED ENV{_CMAKE_OSX_SYSROOT_INT}) - set(CMAKE_OSX_SYSROOT_INT "$ENV{_CMAKE_OSX_SYSROOT_INT}") -elseif(NOT DEFINED CMAKE_OSX_SYSROOT_INT) - execute_process(COMMAND ${XCODEBUILD_EXECUTABLE} -version -sdk ${SDK_NAME} Path - OUTPUT_VARIABLE CMAKE_OSX_SYSROOT_INT - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) -endif() - -if (NOT DEFINED CMAKE_OSX_SYSROOT_INT AND NOT DEFINED CMAKE_OSX_SYSROOT) - message(SEND_ERROR "Please make sure that Xcode is installed and that the toolchain" - "is pointing to the correct path. Please run:" - "sudo xcode-select -s /Applications/Xcode.app/Contents/Developer" - "and see if that fixes the problem for you.") - message(FATAL_ERROR "Invalid CMAKE_OSX_SYSROOT: ${CMAKE_OSX_SYSROOT} " - "does not exist.") -elseif(DEFINED CMAKE_OSX_SYSROOT_INT) - set(CMAKE_OSX_SYSROOT_INT "${CMAKE_OSX_SYSROOT_INT}" CACHE INTERNAL "") - # Specify the location or name of the platform SDK to be used in CMAKE_OSX_SYSROOT. 
- set(CMAKE_OSX_SYSROOT "${CMAKE_OSX_SYSROOT_INT}" CACHE INTERNAL "") -endif() - -# Use bitcode or not -if(NOT DEFINED ENABLE_BITCODE AND NOT ARCHS MATCHES "((^|;|, )(i386|x86_64))+") - # Unless specified, enable bitcode support by default - message(STATUS "[DEFAULTS] Enabling bitcode support by default. ENABLE_BITCODE not provided!") - set(ENABLE_BITCODE ON) -elseif(NOT DEFINED ENABLE_BITCODE) - message(STATUS "[DEFAULTS] Disabling bitcode support by default on simulators. ENABLE_BITCODE not provided for override!") - set(ENABLE_BITCODE OFF) -endif() -set(ENABLE_BITCODE_INT ${ENABLE_BITCODE} CACHE BOOL - "Whether or not to enable bitcode" FORCE) -# Use ARC or not -if(NOT DEFINED ENABLE_ARC) - # Unless specified, enable ARC support by default - set(ENABLE_ARC ON) - message(STATUS "[DEFAULTS] Enabling ARC support by default. ENABLE_ARC not provided!") -endif() -set(ENABLE_ARC_INT ${ENABLE_ARC} CACHE BOOL "Whether or not to enable ARC" FORCE) -# Use hidden visibility or not -if(NOT DEFINED ENABLE_VISIBILITY) - # Unless specified, disable symbols visibility by default - set(ENABLE_VISIBILITY OFF) - message(STATUS "[DEFAULTS] Hiding symbols visibility by default. ENABLE_VISIBILITY not provided!") -endif() -set(ENABLE_VISIBILITY_INT ${ENABLE_VISIBILITY} CACHE BOOL "Whether or not to hide symbols from the dynamic linker (-fvisibility=hidden)" FORCE) -# Set strict compiler checks or not -if(NOT DEFINED ENABLE_STRICT_TRY_COMPILE) - # Unless specified, disable strict try_compile() - set(ENABLE_STRICT_TRY_COMPILE OFF) - message(STATUS "[DEFAULTS] Using NON-strict compiler checks by default. ENABLE_STRICT_TRY_COMPILE not provided!") -endif() -set(ENABLE_STRICT_TRY_COMPILE_INT ${ENABLE_STRICT_TRY_COMPILE} CACHE BOOL - "Whether or not to use strict compiler checks" FORCE) - -# Get the SDK version information. -if(DEFINED SDK_VERSION) - # Environment variables are always preserved. - set(ENV{_SDK_VERSION} "${SDK_VERSION}") -elseif(DEFINED ENV{_SDK_VERSION}) - set(SDK_VERSION "$ENV{_SDK_VERSION}") -elseif(NOT DEFINED SDK_VERSION) - execute_process(COMMAND ${XCODEBUILD_EXECUTABLE} -sdk ${CMAKE_OSX_SYSROOT_INT} -version SDKVersion - OUTPUT_VARIABLE SDK_VERSION - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) -endif() - -# Find the Developer root for the specific iOS platform being compiled for -# from CMAKE_OSX_SYSROOT. Should be ../../ from SDK specified in -# CMAKE_OSX_SYSROOT. There does not appear to be a direct way to obtain -# this information from xcrun or xcodebuild. -if (NOT DEFINED CMAKE_DEVELOPER_ROOT AND NOT CMAKE_GENERATOR MATCHES "Xcode") - get_filename_component(PLATFORM_SDK_DIR ${CMAKE_OSX_SYSROOT_INT} PATH) - get_filename_component(CMAKE_DEVELOPER_ROOT ${PLATFORM_SDK_DIR} PATH) - if (NOT EXISTS "${CMAKE_DEVELOPER_ROOT}") - message(FATAL_ERROR "Invalid CMAKE_DEVELOPER_ROOT: ${CMAKE_DEVELOPER_ROOT} does not exist.") - endif() -endif() - -# Find the C & C++ compilers for the specified SDK. -if(DEFINED CMAKE_C_COMPILER) - # Environment variables are always preserved. - set(ENV{_CMAKE_C_COMPILER} "${CMAKE_C_COMPILER}") -elseif(DEFINED ENV{_CMAKE_C_COMPILER}) - set(CMAKE_C_COMPILER "$ENV{_CMAKE_C_COMPILER}") - set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER}) -elseif(NOT DEFINED CMAKE_C_COMPILER) - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT_INT} -find clang - OUTPUT_VARIABLE CMAKE_C_COMPILER - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) - set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER}) -endif() -if(DEFINED CMAKE_CXX_COMPILER) - # Environment variables are always preserved. 
- set(ENV{_CMAKE_CXX_COMPILER} "${CMAKE_CXX_COMPILER}") -elseif(DEFINED ENV{_CMAKE_CXX_COMPILER}) - set(CMAKE_CXX_COMPILER "$ENV{_CMAKE_CXX_COMPILER}") -elseif(NOT DEFINED CMAKE_CXX_COMPILER) - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT_INT} -find clang++ - OUTPUT_VARIABLE CMAKE_CXX_COMPILER - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) -endif() -# Find (Apple's) libtool. -if(DEFINED BUILD_LIBTOOL) - # Environment variables are always preserved. - set(ENV{_BUILD_LIBTOOL} "${BUILD_LIBTOOL}") -elseif(DEFINED ENV{_BUILD_LIBTOOL}) - set(BUILD_LIBTOOL "$ENV{_BUILD_LIBTOOL}") -elseif(NOT DEFINED BUILD_LIBTOOL) - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT_INT} -find libtool - OUTPUT_VARIABLE BUILD_LIBTOOL - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) -endif() -# Find the toolchain's provided install_name_tool if none is found on the host -if(DEFINED CMAKE_INSTALL_NAME_TOOL) - # Environment variables are always preserved. - set(ENV{_CMAKE_INSTALL_NAME_TOOL} "${CMAKE_INSTALL_NAME_TOOL}") -elseif(DEFINED ENV{_CMAKE_INSTALL_NAME_TOOL}) - set(CMAKE_INSTALL_NAME_TOOL "$ENV{_CMAKE_INSTALL_NAME_TOOL}") -elseif(NOT DEFINED CMAKE_INSTALL_NAME_TOOL) - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT_INT} -find install_name_tool - OUTPUT_VARIABLE CMAKE_INSTALL_NAME_TOOL_INT - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) - set(CMAKE_INSTALL_NAME_TOOL ${CMAKE_INSTALL_NAME_TOOL_INT} CACHE INTERNAL "") -endif() - -# Configure libtool to be used instead of ar + ranlib to build static libraries. -# This is required on Xcode 7+, but should also work on previous versions of -# Xcode. -get_property(languages GLOBAL PROPERTY ENABLED_LANGUAGES) -foreach(lang ${languages}) - set(CMAKE_${lang}_CREATE_STATIC_LIBRARY "${BUILD_LIBTOOL} -static -o " CACHE INTERNAL "") -endforeach() - -# CMake 3.14+ support building for iOS, watchOS and tvOS out of the box. -if(MODERN_CMAKE) - if(SDK_NAME MATCHES "iphone") - set(CMAKE_SYSTEM_NAME iOS) - elseif(SDK_NAME MATCHES "macosx") - set(CMAKE_SYSTEM_NAME Darwin) - elseif(SDK_NAME MATCHES "appletv") - set(CMAKE_SYSTEM_NAME tvOS) - elseif(SDK_NAME MATCHES "watch") - set(CMAKE_SYSTEM_NAME watchOS) - endif() - # Provide flags for a combined FAT library build on newer CMake versions - if(PLATFORM_INT MATCHES ".*COMBINED") - set(CMAKE_XCODE_ATTRIBUTE_ONLY_ACTIVE_ARCH "NO") - set(CMAKE_IOS_INSTALL_COMBINED YES) - endif() -elseif(NOT DEFINED CMAKE_SYSTEM_NAME AND ${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.10") - # Legacy code path prior to CMake 3.14 or fallback if no CMAKE_SYSTEM_NAME specified - set(CMAKE_SYSTEM_NAME iOS) -elseif(NOT DEFINED CMAKE_SYSTEM_NAME) - # Legacy code path prior to CMake 3.14 or fallback if no CMAKE_SYSTEM_NAME specified - set(CMAKE_SYSTEM_NAME Darwin) -endif() -# Standard settings. -set(CMAKE_SYSTEM_VERSION ${SDK_VERSION} CACHE INTERNAL "") -set(UNIX ON CACHE BOOL "") -set(APPLE ON CACHE BOOL "") -if(PLATFORM STREQUAL "MAC" OR PLATFORM STREQUAL "MAC_ARM64") - set(IOS OFF CACHE BOOL "") - set(MACOS ON CACHE BOOL "") -elseif(PLATFORM STREQUAL "MAC_CATALYST" OR PLATFORM STREQUAL "MAC_CATALYST_ARM64") - set(IOS ON CACHE BOOL "") - set(MACOS ON CACHE BOOL "") -else() - set(IOS ON CACHE BOOL "") -endif() -set(CMAKE_AR ar CACHE FILEPATH "" FORCE) -set(CMAKE_RANLIB ranlib CACHE FILEPATH "" FORCE) -set(CMAKE_STRIP strip CACHE FILEPATH "" FORCE) -# Set the architectures for which to build. 
-set(CMAKE_OSX_ARCHITECTURES ${ARCHS} CACHE INTERNAL "") -# Change the type of target generated for try_compile() so it'll work when cross-compiling, weak compiler checks -if(NOT ENABLE_STRICT_TRY_COMPILE_INT) - set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) -endif() -# All iOS/Darwin specific settings - some may be redundant. -set(CMAKE_MACOSX_BUNDLE YES) -set(CMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED "NO") -set(CMAKE_SHARED_LIBRARY_PREFIX "lib") -set(CMAKE_SHARED_LIBRARY_SUFFIX ".dylib") -set(CMAKE_SHARED_MODULE_PREFIX "lib") -set(CMAKE_SHARED_MODULE_SUFFIX ".so") -set(CMAKE_C_COMPILER_ABI ELF) -set(CMAKE_CXX_COMPILER_ABI ELF) -set(CMAKE_C_HAS_ISYSROOT 1) -set(CMAKE_CXX_HAS_ISYSROOT 1) -set(CMAKE_MODULE_EXISTS 1) -set(CMAKE_DL_LIBS "") -set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ") -set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ") -set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}") -set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}") - -if(ARCHS MATCHES "((^|;|, )(arm64|arm64e|x86_64))+") - set(CMAKE_C_SIZEOF_DATA_PTR 8) - set(CMAKE_CXX_SIZEOF_DATA_PTR 8) - if(ARCHS MATCHES "((^|;|, )(arm64|arm64e))+") - set(CMAKE_SYSTEM_PROCESSOR "aarch64") - else() - set(CMAKE_SYSTEM_PROCESSOR "x86_64") - endif() -else() - set(CMAKE_C_SIZEOF_DATA_PTR 4) - set(CMAKE_CXX_SIZEOF_DATA_PTR 4) - set(CMAKE_SYSTEM_PROCESSOR "arm") -endif() - -# Note that only Xcode 7+ supports the newer more specific: -# -m${SDK_NAME}-version-min flags, older versions of Xcode use: -# -m(ios/ios-simulator)-version-min instead. -if(${CMAKE_VERSION} VERSION_LESS "3.11") - if(PLATFORM_INT STREQUAL "OS" OR PLATFORM_INT STREQUAL "OS64") - if(XCODE_VERSION_INT VERSION_LESS 7.0) - set(SDK_NAME_VERSION_FLAGS - "-mios-version-min=${DEPLOYMENT_TARGET}") - else() - # Xcode 7.0+ uses flags we can build directly from SDK_NAME. - set(SDK_NAME_VERSION_FLAGS - "-m${SDK_NAME}-version-min=${DEPLOYMENT_TARGET}") - endif() - elseif(PLATFORM_INT STREQUAL "TVOS") - set(SDK_NAME_VERSION_FLAGS - "-mtvos-version-min=${DEPLOYMENT_TARGET}") - elseif(PLATFORM_INT STREQUAL "SIMULATOR_TVOS") - set(SDK_NAME_VERSION_FLAGS - "-mtvos-simulator-version-min=${DEPLOYMENT_TARGET}") - elseif(PLATFORM_INT STREQUAL "WATCHOS") - set(SDK_NAME_VERSION_FLAGS - "-mwatchos-version-min=${DEPLOYMENT_TARGET}") - elseif(PLATFORM_INT STREQUAL "SIMULATOR_WATCHOS") - set(SDK_NAME_VERSION_FLAGS - "-mwatchos-simulator-version-min=${DEPLOYMENT_TARGET}") - elseif(PLATFORM_INT STREQUAL "MAC") - set(SDK_NAME_VERSION_FLAGS - "-mmacosx-version-min=${DEPLOYMENT_TARGET}") - else() - # SIMULATOR or SIMULATOR64 both use -mios-simulator-version-min. 
- set(SDK_NAME_VERSION_FLAGS - "-mios-simulator-version-min=${DEPLOYMENT_TARGET}") - endif() -elseif(NOT PLATFORM_INT MATCHES "^MAC_CATALYST") - # Newer versions of CMake sets the version min flags correctly, skip this for Mac Catalyst targets - set(CMAKE_OSX_DEPLOYMENT_TARGET ${DEPLOYMENT_TARGET}) -endif() - -if(DEFINED APPLE_TARGET_TRIPLE_INT) - set(APPLE_TARGET_TRIPLE ${APPLE_TARGET_TRIPLE_INT} CACHE INTERNAL "") - set(CMAKE_C_COMPILER_TARGET ${APPLE_TARGET_TRIPLE}) - set(CMAKE_CXX_COMPILER_TARGET ${APPLE_TARGET_TRIPLE}) - set(CMAKE_ASM_COMPILER_TARGET ${APPLE_TARGET_TRIPLE}) -endif() - -if(PLATFORM_INT MATCHES "^MAC_CATALYST") - set(C_TARGET_FLAGS "-isystem ${CMAKE_OSX_SYSROOT_INT}/System/iOSSupport/usr/include -iframework ${CMAKE_OSX_SYSROOT_INT}/System/iOSSupport/System/Library/Frameworks") -endif() - -if(ENABLE_BITCODE_INT) - set(BITCODE "-fembed-bitcode") - set(CMAKE_XCODE_ATTRIBUTE_BITCODE_GENERATION_MODE "bitcode") - set(CMAKE_XCODE_ATTRIBUTE_ENABLE_BITCODE "YES") -else() - set(BITCODE "") - set(CMAKE_XCODE_ATTRIBUTE_ENABLE_BITCODE "NO") -endif() - -if(ENABLE_ARC_INT) - set(FOBJC_ARC "-fobjc-arc") - set(CMAKE_XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC "YES") -else() - set(FOBJC_ARC "-fno-objc-arc") - set(CMAKE_XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC "NO") -endif() - -if(NAMED_LANGUAGE_SUPPORT_INT) - set(OBJC_VARS "-fobjc-abi-version=2 -DOBJC_OLD_DISPATCH_PROTOTYPES=0") - set(OBJC_LEGACY_VARS "") -else() - set(OBJC_VARS "") - set(OBJC_LEGACY_VARS "-fobjc-abi-version=2 -DOBJC_OLD_DISPATCH_PROTOTYPES=0") -endif() - -if(NOT ENABLE_VISIBILITY_INT) - foreach(lang ${languages}) - set(CMAKE_${lang}_VISIBILITY_PRESET "hidden" CACHE INTERNAL "") - endforeach() - set(CMAKE_XCODE_ATTRIBUTE_GCC_SYMBOLS_PRIVATE_EXTERN "YES") - set(VISIBILITY "-fvisibility=hidden -fvisibility-inlines-hidden") -else() - foreach(lang ${languages}) - set(CMAKE_${lang}_VISIBILITY_PRESET "default" CACHE INTERNAL "") - endforeach() - set(CMAKE_XCODE_ATTRIBUTE_GCC_SYMBOLS_PRIVATE_EXTERN "NO") - set(VISIBILITY "-fvisibility=default") -endif() - -if(DEFINED APPLE_TARGET_TRIPLE) - set(APPLE_TARGET_TRIPLE_FLAG "-target ${APPLE_TARGET_TRIPLE}") -endif() - -#Check if Xcode generator is used, since that will handle these flags automagically -if(CMAKE_GENERATOR MATCHES "Xcode") - message(STATUS "Not setting any manual command-line buildflags, since Xcode is selected as generator. 
Modifying the Xcode build-settings directly instead.") -else() - set(CMAKE_C_FLAGS "${C_TARGET_FLAGS} ${APPLE_TARGET_TRIPLE_FLAG} ${SDK_NAME_VERSION_FLAGS} ${OBJC_LEGACY_VARS} ${BITCODE} ${VISIBILITY} ${CMAKE_C_FLAGS}") - set(CMAKE_C_FLAGS_DEBUG "-O0 -g ${CMAKE_C_FLAGS_DEBUG}") - set(CMAKE_C_FLAGS_MINSIZEREL "-DNDEBUG -Os ${CMAKE_C_FLAGS_MINSIZEREL}") - set(CMAKE_C_FLAGS_RELWITHDEBINFO "-DNDEBUG -O2 -g ${CMAKE_C_FLAGS_RELWITHDEBINFO}") - set(CMAKE_C_FLAGS_RELEASE "-DNDEBUG -O3 ${CMAKE_C_FLAGS_RELEASE}") - set(CMAKE_CXX_FLAGS "${C_TARGET_FLAGS} ${APPLE_TARGET_TRIPLE_FLAG} ${SDK_NAME_VERSION_FLAGS} ${OBJC_LEGACY_VARS} ${BITCODE} ${VISIBILITY} ${CMAKE_CXX_FLAGS}") - set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g ${CMAKE_CXX_FLAGS_DEBUG}") - set(CMAKE_CXX_FLAGS_MINSIZEREL "-DNDEBUG -Os ${CMAKE_CXX_FLAGS_MINSIZEREL}") - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-DNDEBUG -O2 -g ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") - set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG -O3 ${CMAKE_CXX_FLAGS_RELEASE}") - if(NAMED_LANGUAGE_SUPPORT_INT) - set(CMAKE_OBJC_FLAGS "${C_TARGET_FLAGS} ${APPLE_TARGET_TRIPLE_FLAG} ${SDK_NAME_VERSION_FLAGS} ${BITCODE} ${VISIBILITY} ${FOBJC_ARC} ${OBJC_VARS} ${CMAKE_OBJC_FLAGS}") - set(CMAKE_OBJC_FLAGS_DEBUG "-O0 -g ${CMAKE_OBJC_FLAGS_DEBUG}") - set(CMAKE_OBJC_FLAGS_MINSIZEREL "-DNDEBUG -Os ${CMAKE_OBJC_FLAGS_MINSIZEREL}") - set(CMAKE_OBJC_FLAGS_RELWITHDEBINFO "-DNDEBUG -O2 -g ${CMAKE_OBJC_FLAGS_RELWITHDEBINFO}") - set(CMAKE_OBJC_FLAGS_RELEASE "-DNDEBUG -O3 ${CMAKE_OBJC_FLAGS_RELEASE}") - set(CMAKE_OBJCXX_FLAGS "${C_TARGET_FLAGS} ${APPLE_TARGET_TRIPLE_FLAG} ${SDK_NAME_VERSION_FLAGS} ${BITCODE} ${VISIBILITY} ${FOBJC_ARC} ${OBJC_VARS} ${CMAKE_OBJCXX_FLAGS}") - set(CMAKE_OBJCXX_FLAGS_DEBUG "-O0 -g ${CMAKE_OBJCXX_FLAGS_DEBUG}") - set(CMAKE_OBJCXX_FLAGS_MINSIZEREL "-DNDEBUG -Os ${CMAKE_OBJCXX_FLAGS_MINSIZEREL}") - set(CMAKE_OBJCXX_FLAGS_RELWITHDEBINFO "-DNDEBUG -O2 -g ${CMAKE_OBJCXX_FLAGS_RELWITHDEBINFO}") - set(CMAKE_OBJCXX_FLAGS_RELEASE "-DNDEBUG -O3 ${CMAKE_OBJCXX_FLAGS_RELEASE}") - endif() - set(CMAKE_C_LINK_FLAGS "${C_TARGET_FLAGS} ${SDK_NAME_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}") - set(CMAKE_CXX_LINK_FLAGS "${C_TARGET_FLAGS} ${SDK_NAME_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}") - if(NAMED_LANGUAGE_SUPPORT_INT) - set(CMAKE_OBJC_LINK_FLAGS "${C_TARGET_FLAGS} ${SDK_NAME_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_OBJC_LINK_FLAGS}") - set(CMAKE_OBJCXX_LINK_FLAGS "${C_TARGET_FLAGS} ${SDK_NAME_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_OBJCXX_LINK_FLAGS}") - endif() - set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -x assembler-with-cpp -arch ${CMAKE_OSX_ARCHITECTURES} ${APPLE_TARGET_TRIPLE_FLAG}") -endif() - -## Print status messages to inform of the current state -message(STATUS "Configuring ${SDK_NAME} build for platform: ${PLATFORM_INT}, architecture(s): ${ARCHS}") -message(STATUS "Using SDK: ${CMAKE_OSX_SYSROOT_INT}") -message(STATUS "Using C compiler: ${CMAKE_C_COMPILER}") -message(STATUS "Using CXX compiler: ${CMAKE_CXX_COMPILER}") -message(STATUS "Using libtool: ${BUILD_LIBTOOL}") -message(STATUS "Using install name tool: ${CMAKE_INSTALL_NAME_TOOL}") -if(DEFINED APPLE_TARGET_TRIPLE) - message(STATUS "Autoconf target triple: ${APPLE_TARGET_TRIPLE}") -endif() -message(STATUS "Using minimum deployment version: ${DEPLOYMENT_TARGET}" - " (SDK version: ${SDK_VERSION})") -if(MODERN_CMAKE) - message(STATUS "Merging integrated CMake 3.14+ iOS,tvOS,watchOS,macOS toolchain(s) with this toolchain!") - if(PLATFORM_INT MATCHES ".*COMBINED") - message(STATUS "Will combine built 
(static) artifacts into FAT lib...") - endif() -endif() -if(CMAKE_GENERATOR MATCHES "Xcode") - message(STATUS "Using Xcode version: ${XCODE_VERSION_INT}") -endif() -message(STATUS "CMake version: ${CMAKE_VERSION}") -if(DEFINED SDK_NAME_VERSION_FLAGS) - message(STATUS "Using version flags: ${SDK_NAME_VERSION_FLAGS}") -endif() -message(STATUS "Using a data_ptr size of: ${CMAKE_CXX_SIZEOF_DATA_PTR}") -if(ENABLE_BITCODE_INT) - message(STATUS "Bitcode: Enabled") -else() - message(STATUS "Bitcode: Disabled") -endif() - -if(ENABLE_ARC_INT) - message(STATUS "ARC: Enabled") -else() - message(STATUS "ARC: Disabled") -endif() - -if(ENABLE_VISIBILITY_INT) - message(STATUS "Hiding symbols: Disabled") -else() - message(STATUS "Hiding symbols: Enabled") -endif() - -# Set global properties -set_property(GLOBAL PROPERTY PLATFORM "${PLATFORM}") -set_property(GLOBAL PROPERTY APPLE_TARGET_TRIPLE "${APPLE_TARGET_TRIPLE_INT}") -set_property(GLOBAL PROPERTY SDK_VERSION "${SDK_VERSION}") -set_property(GLOBAL PROPERTY XCODE_VERSION "${XCODE_VERSION_INT}") -set_property(GLOBAL PROPERTY OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}") - -# Export configurable variables for the try_compile() command. -set(CMAKE_TRY_COMPILE_PLATFORM_VARIABLES - PLATFORM - XCODE_VERSION_INT - SDK_VERSION - NAMED_LANGUAGE_SUPPORT - DEPLOYMENT_TARGET - CMAKE_DEVELOPER_ROOT - CMAKE_OSX_SYSROOT_INT - ENABLE_BITCODE - ENABLE_ARC - CMAKE_ASM_COMPILER - CMAKE_C_COMPILER - CMAKE_C_COMPILER_TARGET - CMAKE_CXX_COMPILER - CMAKE_CXX_COMPILER_TARGET - BUILD_LIBTOOL - CMAKE_INSTALL_NAME_TOOL - CMAKE_C_FLAGS - CMAKE_C_DEBUG - CMAKE_C_MINSIZEREL - CMAKE_C_RELWITHDEBINFO - CMAKE_C_RELEASE - CMAKE_CXX_FLAGS - CMAKE_CXX_FLAGS_DEBUG - CMAKE_CXX_FLAGS_MINSIZEREL - CMAKE_CXX_FLAGS_RELWITHDEBINFO - CMAKE_CXX_FLAGS_RELEASE - CMAKE_C_LINK_FLAGS - CMAKE_CXX_LINK_FLAGS - CMAKE_ASM_FLAGS -) - -if(NAMED_LANGUAGE_SUPPORT_INT) - list(APPEND CMAKE_TRY_COMPILE_PLATFORM_VARIABLES - CMAKE_OBJC_FLAGS - CMAKE_OBJC_DEBUG - CMAKE_OBJC_MINSIZEREL - CMAKE_OBJC_RELWITHDEBINFO - CMAKE_OBJC_RELEASE - CMAKE_OBJCXX_FLAGS - CMAKE_OBJCXX_DEBUG - CMAKE_OBJCXX_MINSIZEREL - CMAKE_OBJCXX_RELWITHDEBINFO - CMAKE_OBJCXX_RELEASE - CMAKE_OBJC_LINK_FLAGS - CMAKE_OBJCXX_LINK_FLAGS - ) -endif() - -set(CMAKE_PLATFORM_HAS_INSTALLNAME 1) -set(CMAKE_SHARED_LINKER_FLAGS "-rpath @executable_path/Frameworks -rpath @loader_path/Frameworks") -set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -Wl,-headerpad_max_install_names") -set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -Wl,-headerpad_max_install_names") -set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,") -set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,") -set(CMAKE_FIND_LIBRARY_SUFFIXES ".tbd" ".dylib" ".so" ".a") -set(CMAKE_SHARED_LIBRARY_SONAME_C_FLAG "-install_name") - -# Set the find root to the SDK developer roots. -# Note: CMAKE_FIND_ROOT_PATH is only useful when cross-compiling. Thus, do not set on macOS builds. -if(NOT PLATFORM_INT MATCHES "^MAC.*$") - list(APPEND CMAKE_FIND_ROOT_PATH "${CMAKE_OSX_SYSROOT_INT}" CACHE INTERNAL "") - set(CMAKE_IGNORE_PATH "/System/Library/Frameworks;/usr/local/lib" CACHE INTERNAL "") -endif() - -# Default to searching for frameworks first. -set(CMAKE_FIND_FRAMEWORK FIRST) - -# Set up the default search directories for frameworks. 
-if(PLATFORM_INT MATCHES "^MAC_CATALYST") - set(CMAKE_FRAMEWORK_PATH - ${CMAKE_DEVELOPER_ROOT}/Library/PrivateFrameworks - ${CMAKE_OSX_SYSROOT_INT}/System/Library/Frameworks - ${CMAKE_OSX_SYSROOT_INT}/System/iOSSupport/System/Library/Frameworks - ${CMAKE_FRAMEWORK_PATH} CACHE INTERNAL "") -else() - set(CMAKE_FRAMEWORK_PATH - ${CMAKE_DEVELOPER_ROOT}/Library/PrivateFrameworks - ${CMAKE_OSX_SYSROOT_INT}/System/Library/Frameworks - ${CMAKE_FRAMEWORK_PATH} CACHE INTERNAL "") -endif() - -# By default, search both the specified iOS SDK and the remainder of the host filesystem. -if(NOT CMAKE_FIND_ROOT_PATH_MODE_PROGRAM) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM BOTH CACHE INTERNAL "") -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_LIBRARY) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH CACHE INTERNAL "") -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_INCLUDE) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH CACHE INTERNAL "") -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_PACKAGE) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH CACHE INTERNAL "") -endif() - -# -# Some helper-macros below to simplify and beautify the CMakeFile -# - -# This little macro lets you set any Xcode specific property. -macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE XCODE_RELVERSION) - set(XCODE_RELVERSION_I "${XCODE_RELVERSION}") - if(XCODE_RELVERSION_I STREQUAL "All") - set_property(TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} "${XCODE_VALUE}") - else() - set_property(TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY}[variant=${XCODE_RELVERSION_I}] "${XCODE_VALUE}") - endif() -endmacro(set_xcode_property) - -# This macro lets you find executable programs on the host system. -macro(find_host_package) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE NEVER) - set(_TOOLCHAIN_IOS ${IOS}) - set(IOS OFF) - find_package(${ARGN}) - set(IOS ${_TOOLCHAIN_IOS}) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM BOTH) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH) -endmacro(find_host_package) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/CMakeLists.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/CMakeLists.txt deleted file mode 100644 index 686362688c050d48224ca0a01e0d24b03d94758a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -add_library(utils STATIC - string.cc - utils.cc -) - -if(NOT ANDROID) - if(MSVC) - target_link_libraries(utils PUBLIC fst) - else() - target_link_libraries(utils PUBLIC fst dl) - endif() -endif() \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/blocking_queue.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/blocking_queue.h deleted file mode 100644 index 9bf0127d9298fbfae2eeebb9431c680fc5dd7647..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/blocking_queue.h +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UTILS_BLOCKING_QUEUE_H_
-#define UTILS_BLOCKING_QUEUE_H_
-
-#include <condition_variable>
-#include <limits>
-#include <mutex>
-#include <queue>
-#include <utility>
-#include <vector>
-
-#include "utils/utils.h"
-
-namespace wenet {
-
-template <typename T>
-class BlockingQueue {
- public:
-  explicit BlockingQueue(size_t capacity = std::numeric_limits<int>::max())
-      : capacity_(capacity) {}
-
-  void Push(const T& value) {
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      while (queue_.size() >= capacity_) {
-        not_full_condition_.wait(lock);
-      }
-      queue_.push(value);
-    }
-    not_empty_condition_.notify_one();
-  }
-
-  void Push(T&& value) {
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      while (queue_.size() >= capacity_) {
-        not_full_condition_.wait(lock);
-      }
-      queue_.push(std::move(value));
-    }
-    not_empty_condition_.notify_one();
-  }
-
-  void Push(const std::vector<T>& values) {
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      for (auto& value : values) {
-        while (queue_.size() >= capacity_) {
-          not_empty_condition_.notify_one();
-          not_full_condition_.wait(lock);
-        }
-        queue_.push(value);
-      }
-    }
-    not_empty_condition_.notify_one();
-  }
-
-  void Push(std::vector<T>&& values) {
-    std::unique_lock<std::mutex> lock(mutex_);
-    for (auto& value : values) {
-      while (queue_.size() >= capacity_) {
-        not_empty_condition_.notify_one();
-        not_full_condition_.wait(lock);
-      }
-      queue_.push(std::move(value));
-    }
-    not_empty_condition_.notify_one();
-  }
-
-  T Pop() {
-    std::unique_lock<std::mutex> lock(mutex_);
-    while (queue_.empty()) {
-      not_empty_condition_.wait(lock);
-    }
-    T t(std::move(queue_.front()));
-    queue_.pop();
-    not_full_condition_.notify_one();
-    return t;
-  }
-
-  // num can be greater than capacity,but it needs to be used with care
-  std::vector<T> Pop(size_t num) {
-    std::unique_lock<std::mutex> lock(mutex_);
-    std::vector<T> block_data;
-    while (block_data.size() < num) {
-      while (queue_.empty()) {
-        not_full_condition_.notify_one();
-        not_empty_condition_.wait(lock);
-      }
-      block_data.push_back(std::move(queue_.front()));
-      queue_.pop();
-    }
-    not_full_condition_.notify_one();
-    return block_data;
-  }
-
-  bool Empty() const {
-    std::lock_guard<std::mutex> lock(mutex_);
-    return queue_.empty();
-  }
-
-  size_t Size() const {
-    std::lock_guard<std::mutex> lock(mutex_);
-    return queue_.size();
-  }
-
-  void Clear() {
-    while (!Empty()) {
-      Pop();
-    }
-  }
-
- private:
-  size_t capacity_;
-  mutable std::mutex mutex_;
-  std::condition_variable not_full_condition_;
-  std::condition_variable not_empty_condition_;
-  std::queue<T> queue_;
-
- public:
-  WENET_DISALLOW_COPY_AND_ASSIGN(BlockingQueue);
-};
-
-}  // namespace wenet
-
-#endif  // UTILS_BLOCKING_QUEUE_H_
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/file.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/file.h
deleted file mode 100644
index 83ad9c8c52fecd334b3549285bf39cd4f59b9f2b..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/file.h
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (c) 2022 Binbin Zhang (binbzha@qq.com)
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UTILS_FILE_H_
-#define UTILS_FILE_H_
-
-#include <fstream>
-#include <string>
-
-namespace wenet {
-
-inline bool FileExists(const std::string& path) {
-  std::ifstream f(path.c_str());
-  return f.good();
-}
-
-}  // namespace wenet
-
-#endif  // UTILS_FILE_H_
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/flags.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/flags.h
deleted file mode 100644
index 3432aa78847322edec8d6d2aec59ed7ca5352fcd..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/flags.h
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang)
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UTILS_FLAGS_H_
-#define UTILS_FLAGS_H_
-
-// Because openfst is a dynamic library compiled with gflags/glog, we must use
-// the gflags/glog from openfst to avoid them linked both statically and
-// dynamically into the executable.
-#include "fst/flags.h"
-
-#endif  // UTILS_FLAGS_H_
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/json.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/json.h
deleted file mode 100644
index bf8d94a3e42504139b10daa39b8f8e7a8b2d93cc..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/json.h
+++ /dev/null
@@ -1,754 +0,0 @@
-// Copyright (c) From https://github.com/nbsdx/SimpleJSON
-//               2022 Binbin Zhang (binbzha@qq.com)
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#ifndef UTILS_JSON_H_ -#define UTILS_JSON_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace json { - -using std::deque; -using std::enable_if; -using std::initializer_list; -using std::is_convertible; -using std::is_floating_point; -using std::is_integral; -using std::is_same; -using std::map; -using std::string; - -namespace { // NOLINT -string json_escape(const string& str) { - string output; - for (unsigned i = 0; i < str.length(); ++i) switch (str[i]) { - case '\"': - output += "\\\""; - break; - case '\\': - output += "\\\\"; - break; - case '\b': - output += "\\b"; - break; - case '\f': - output += "\\f"; - break; - case '\n': - output += "\\n"; - break; - case '\r': - output += "\\r"; - break; - case '\t': - output += "\\t"; - break; - default: - output += str[i]; - break; - } - return std::move(output); -} -} // namespace - -class JSON { - union BackingData { - BackingData(double d) : Float(d) {} - BackingData(int l) : Int(l) {} - BackingData(bool b) : Bool(b) {} - BackingData(string s) : String(new string(s)) {} - BackingData() : Int(0) {} - - deque* List; - map* Map; - string* String; - double Float; - int Int; - bool Bool; - } Internal; - - public: - enum class Class { Null, Object, Array, String, Floating, Integral, Boolean }; - - template - class JSONWrapper { - Container* object; - - public: - explicit JSONWrapper(Container* val) : object(val) {} - explicit JSONWrapper(std::nullptr_t) : object(nullptr) {} - - typename Container::iterator begin() { - return object ? object->begin() : typename Container::iterator(); - } - typename Container::iterator end() { - return object ? object->end() : typename Container::iterator(); - } - typename Container::const_iterator begin() const { - return object ? object->begin() : typename Container::iterator(); - } - typename Container::const_iterator end() const { - return object ? object->end() : typename Container::iterator(); - } - }; - - template - class JSONConstWrapper { - const Container* object; - - public: - explicit JSONConstWrapper(const Container* val) : object(val) {} - explicit JSONConstWrapper(std::nullptr_t) : object(nullptr) {} - - typename Container::const_iterator begin() const { - return object ? object->begin() : typename Container::const_iterator(); - } - typename Container::const_iterator end() const { - return object ? 
object->end() : typename Container::const_iterator(); - } - }; - - JSON() : Internal(), Type(Class::Null) {} - - explicit JSON(initializer_list list) : JSON() { - SetType(Class::Object); - for (auto i = list.begin(), e = list.end(); i != e; ++i, ++i) - operator[](i->ToString()) = *std::next(i); - } - - JSON(JSON&& other) : Internal(other.Internal), Type(other.Type) { - other.Type = Class::Null; - other.Internal.Map = nullptr; - } - - JSON& operator=(JSON&& other) { - ClearInternal(); - Internal = other.Internal; - Type = other.Type; - other.Internal.Map = nullptr; - other.Type = Class::Null; - return *this; - } - - JSON(const JSON& other) { - switch (other.Type) { - case Class::Object: - Internal.Map = new map(other.Internal.Map->begin(), - other.Internal.Map->end()); - break; - case Class::Array: - Internal.List = new deque(other.Internal.List->begin(), - other.Internal.List->end()); - break; - case Class::String: - Internal.String = new string(*other.Internal.String); - break; - default: - Internal = other.Internal; - } - Type = other.Type; - } - - JSON& operator=(const JSON& other) { - ClearInternal(); - switch (other.Type) { - case Class::Object: - Internal.Map = new map(other.Internal.Map->begin(), - other.Internal.Map->end()); - break; - case Class::Array: - Internal.List = new deque(other.Internal.List->begin(), - other.Internal.List->end()); - break; - case Class::String: - Internal.String = new string(*other.Internal.String); - break; - default: - Internal = other.Internal; - } - Type = other.Type; - return *this; - } - - ~JSON() { - switch (Type) { - case Class::Array: - delete Internal.List; - break; - case Class::Object: - delete Internal.Map; - break; - case Class::String: - delete Internal.String; - break; - default: { - }; - } - } - - template - explicit JSON(T b, typename enable_if::value>::type* = 0) - : Internal(b), Type(Class::Boolean) {} - - template - explicit JSON(T i, typename enable_if::value && - !is_same::value>::type* = 0) - : Internal(static_cast(i)), Type(Class::Integral) {} - - template - explicit JSON(T f, typename enable_if::value>::type* = 0) - : Internal(static_cast(f)), Type(Class::Floating) {} - - template - explicit JSON(T s, - typename enable_if::value>::type* = 0) - : Internal(string(s)), Type(Class::String) {} - - explicit JSON(std::nullptr_t) : Internal(), Type(Class::Null) {} - - static JSON Make(Class type) { - JSON ret; - ret.SetType(type); - return ret; - } - - static JSON Load(const string&); - - template - void append(T arg) { - SetType(Class::Array); - Internal.List->emplace_back(arg); - } - - template - void append(T arg, U... 
args) { - append(arg); - append(args...); - } - - template - typename enable_if::value, JSON&>::type operator=(T b) { - SetType(Class::Boolean); - Internal.Bool = b; - return *this; - } - - template - typename enable_if::value && !is_same::value, - JSON&>::type - operator=(T i) { - SetType(Class::Integral); - Internal.Int = i; - return *this; - } - - template - typename enable_if::value, JSON&>::type operator=(T f) { - SetType(Class::Floating); - Internal.Float = f; - return *this; - } - - template - typename enable_if::value, JSON&>::type operator=( - T s) { - SetType(Class::String); - *Internal.String = string(s); - return *this; - } - - JSON& operator[](const string& key) { - SetType(Class::Object); - return Internal.Map->operator[](key); - } - - JSON& operator[](unsigned index) { - SetType(Class::Array); - if (index >= Internal.List->size()) Internal.List->resize(index + 1); - return Internal.List->operator[](index); - } - - JSON& at(const string& key) { return operator[](key); } - - const JSON& at(const string& key) const { return Internal.Map->at(key); } - - JSON& at(unsigned index) { return operator[](index); } - - const JSON& at(unsigned index) const { return Internal.List->at(index); } - - int length() const { - if (Type == Class::Array) - return Internal.List->size(); - else - return -1; - } - - bool hasKey(const string& key) const { - if (Type == Class::Object) - return Internal.Map->find(key) != Internal.Map->end(); - return false; - } - - int size() const { - if (Type == Class::Object) - return Internal.Map->size(); - else if (Type == Class::Array) - return Internal.List->size(); - else - return -1; - } - - Class JSONType() const { return Type; } - - /// Functions for getting primitives from the JSON object. - bool IsNull() const { return Type == Class::Null; } - - string ToString() const { - bool b; - return std::move(ToString(&b)); - } - string ToString(bool* ok) const { - *ok = (Type == Class::String); - return *ok ? std::move(json_escape(*Internal.String)) : string(""); - } - - double ToFloat() const { - bool b; - return ToFloat(&b); - } - double ToFloat(bool* ok) const { - *ok = (Type == Class::Floating); - return *ok ? Internal.Float : 0.0; - } - - int ToInt() const { - bool b; - return ToInt(&b); - } - int ToInt(bool* ok) const { - *ok = (Type == Class::Integral); - return *ok ? Internal.Int : 0; - } - - bool ToBool() const { - bool b; - return ToBool(&b); - } - bool ToBool(bool* ok) const { - *ok = (Type == Class::Boolean); - return *ok ? 
Internal.Bool : false; - } - - JSONWrapper> ObjectRange() { - if (Type == Class::Object) - return JSONWrapper>(Internal.Map); - return JSONWrapper>(nullptr); - } - - JSONWrapper> ArrayRange() { - if (Type == Class::Array) return JSONWrapper>(Internal.List); - return JSONWrapper>(nullptr); - } - - JSONConstWrapper> ObjectRange() const { - if (Type == Class::Object) - return JSONConstWrapper>(Internal.Map); - return JSONConstWrapper>(nullptr); - } - - JSONConstWrapper> ArrayRange() const { - if (Type == Class::Array) - return JSONConstWrapper>(Internal.List); - return JSONConstWrapper>(nullptr); - } - - string dump(int depth = 1, string tab = " ") const { - string pad = ""; - for (int i = 0; i < depth; ++i, pad += tab) { - } - - switch (Type) { - case Class::Null: - return "null"; - case Class::Object: { - string s = "{\n"; - bool skip = true; - for (auto& p : *Internal.Map) { - if (!skip) s += ",\n"; - s += (pad + "\"" + p.first + "\" : " + p.second.dump(depth + 1, tab)); - skip = false; - } - s += ("\n" + pad.erase(0, 2) + "}"); - return s; - } - case Class::Array: { - string s = "["; - bool skip = true; - for (auto& p : *Internal.List) { - if (!skip) s += ", "; - s += p.dump(depth + 1, tab); - skip = false; - } - s += "]"; - return s; - } - case Class::String: - return "\"" + json_escape(*Internal.String) + "\""; - case Class::Floating: - return std::to_string(Internal.Float); - case Class::Integral: - return std::to_string(Internal.Int); - case Class::Boolean: - return Internal.Bool ? "true" : "false"; - default: - return ""; - } - return ""; - } - - friend std::ostream& operator<<(std::ostream&, const JSON&); - - private: - void SetType(Class type) { - if (type == Type) return; - - ClearInternal(); - - switch (type) { - case Class::Null: - Internal.Map = nullptr; - break; - case Class::Object: - Internal.Map = new map(); - break; - case Class::Array: - Internal.List = new deque(); - break; - case Class::String: - Internal.String = new string(); - break; - case Class::Floating: - Internal.Float = 0.0; - break; - case Class::Integral: - Internal.Int = 0; - break; - case Class::Boolean: - Internal.Bool = false; - break; - } - - Type = type; - } - - private: - /* beware: only call if YOU know that Internal is allocated. No checks - performed here. This function should be called in a constructed JSON just - before you are going to overwrite Internal... -*/ - void ClearInternal() { - switch (Type) { - case Class::Object: - delete Internal.Map; - break; - case Class::Array: - delete Internal.List; - break; - case Class::String: - delete Internal.String; - break; - default: { - }; - } - } - - private: - Class Type = Class::Null; -}; - -JSON Array() { return std::move(JSON::Make(JSON::Class::Array)); } - -template -JSON Array(T... 
args) { - JSON arr = JSON::Make(JSON::Class::Array); - arr.append(args...); - return std::move(arr); -} - -JSON Object() { return std::move(JSON::Make(JSON::Class::Object)); } - -std::ostream& operator<<(std::ostream& os, const JSON& json) { - os << json.dump(); - return os; -} - -namespace { // NOLINT -JSON parse_next(const string&, size_t&); - -void consume_ws(const string& str, size_t& offset) { // NOLINT - while (isspace(str[offset])) ++offset; -} - -JSON parse_object(const string& str, size_t& offset) { // NOLINT - JSON Object = JSON::Make(JSON::Class::Object); - - ++offset; - consume_ws(str, offset); - if (str[offset] == '}') { - ++offset; - return std::move(Object); - } - - while (true) { - JSON Key = parse_next(str, offset); - consume_ws(str, offset); - if (str[offset] != ':') { - std::cerr << "Error: Object: Expected colon, found '" << str[offset] - << "'\n"; - break; - } - consume_ws(str, ++offset); - JSON Value = parse_next(str, offset); - Object[Key.ToString()] = Value; - - consume_ws(str, offset); - if (str[offset] == ',') { - ++offset; - continue; - } else if (str[offset] == '}') { - ++offset; - break; - } else { - std::cerr << "ERROR: Object: Expected comma, found '" << str[offset] - << "'\n"; - break; - } - } - - return std::move(Object); -} - -JSON parse_array(const string& str, size_t& offset) { // NOLINT - JSON Array = JSON::Make(JSON::Class::Array); - unsigned index = 0; - - ++offset; - consume_ws(str, offset); - if (str[offset] == ']') { - ++offset; - return std::move(Array); - } - - while (true) { - Array[index++] = parse_next(str, offset); - consume_ws(str, offset); - - if (str[offset] == ',') { - ++offset; - continue; - } else if (str[offset] == ']') { - ++offset; - break; - } else { - std::cerr << "ERROR: Array: Expected ',' or ']', found '" << str[offset] - << "'\n"; - return std::move(JSON::Make(JSON::Class::Array)); - } - } - - return std::move(Array); -} - -JSON parse_string(const string& str, size_t& offset) { // NOLINT - JSON String; - string val; - for (char c = str[++offset]; c != '\"'; c = str[++offset]) { - if (c == '\\') { - switch (str[++offset]) { - case '\"': - val += '\"'; - break; - case '\\': - val += '\\'; - break; - case '/': - val += '/'; - break; - case 'b': - val += '\b'; - break; - case 'f': - val += '\f'; - break; - case 'n': - val += '\n'; - break; - case 'r': - val += '\r'; - break; - case 't': - val += '\t'; - break; - case 'u': { - val += "\\u"; - for (unsigned i = 1; i <= 4; ++i) { - c = str[offset + i]; - if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || - (c >= 'A' && c <= 'F')) { - val += c; - } else { - std::cerr << "ERROR: String: Expected hex character in unicode " - "escape, found '" - << c << "'\n"; - return std::move(JSON::Make(JSON::Class::String)); - } - } - offset += 4; - } break; - default: - val += '\\'; - break; - } - } else { - val += c; - } - } - ++offset; - String = val; - return std::move(String); -} - -JSON parse_number(const string& str, size_t& offset) { // NOLINT - JSON Number; - string val, exp_str; - char c; - bool isDouble = false; - int exp = 0; - while (true) { - c = str[offset++]; - if ((c == '-') || (c >= '0' && c <= '9')) { - val += c; - } else if (c == '.') { - val += c; - isDouble = true; - } else { - break; - } - } - if (c == 'E' || c == 'e') { - c = str[offset++]; - if (c == '-') { - ++offset; - exp_str += '-'; - } - while (true) { - c = str[offset++]; - if (c >= '0' && c <= '9') { - exp_str += c; - } else if (!isspace(c) && c != ',' && c != ']' && c != '}') { - std::cerr << "ERROR: Number: 
Expected a number for exponent, found '" - << c << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } else { - break; - } - } - exp = std::stol(exp_str); - } else if (!isspace(c) && c != ',' && c != ']' && c != '}') { - std::cerr << "ERROR: Number: unexpected character '" << c << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } - --offset; - - if (isDouble) { - Number = std::stod(val) * std::pow(10, exp); - } else { - if (!exp_str.empty()) - Number = std::stol(val) * std::pow(10, exp); - else - Number = std::stol(val); - } - return std::move(Number); -} - -JSON parse_bool(const string& str, size_t& offset) { // NOLINT - JSON Bool; - if (str.substr(offset, 4) == "true") { - Bool = true; - } else if (str.substr(offset, 5) == "false") { - Bool = false; - } else { - std::cerr << "ERROR: Bool: Expected 'true' or 'false', found '" - << str.substr(offset, 5) << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } - offset += (Bool.ToBool() ? 4 : 5); - return std::move(Bool); -} - -JSON parse_null(const string& str, size_t& offset) { // NOLINT - JSON Null; - if (str.substr(offset, 4) != "null") { - std::cerr << "ERROR: Null: Expected 'null', found '" - << str.substr(offset, 4) << "'\n"; - return std::move(JSON::Make(JSON::Class::Null)); - } - offset += 4; - return std::move(Null); -} - -JSON parse_next(const string& str, size_t& offset) { // NOLINT - char value; - consume_ws(str, offset); - value = str[offset]; - switch (value) { - case '[': - return std::move(parse_array(str, offset)); - case '{': - return std::move(parse_object(str, offset)); - case '\"': - return std::move(parse_string(str, offset)); - case 't': - case 'f': - return std::move(parse_bool(str, offset)); - case 'n': - return std::move(parse_null(str, offset)); - default: - if ((value <= '9' && value >= '0') || value == '-') - return std::move(parse_number(str, offset)); - } - std::cerr << "ERROR: Parse: Unknown starting character '" << value << "'\n"; - return JSON(); -} -} // namespace - -JSON JSON::Load(const string& str) { - size_t offset = 0; - return std::move(parse_next(str, offset)); -} - -} // namespace json - -#endif // UTILS_JSON_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/log.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/log.h deleted file mode 100644 index c2bf03f261a8711f74da819d80d68e8eb9fb124a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/log.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_LOG_H_ -#define UTILS_LOG_H_ - -// Because openfst is a dynamic library compiled with gflags/glog, we must use -// the gflags/glog from openfst to avoid them linked both statically and -// dynamically into the executable. 
-#include "fst/log.h" - -#endif // UTILS_LOG_H_ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/string.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/string.cc deleted file mode 100644 index 1ab93adf3cac1bc5a42c0b8c6cadbde399678fef..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/string.cc +++ /dev/null @@ -1,195 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "utils/string.h" - -#include -#include -#include - -#include "utils/log.h" -#include "utils/utils.h" - -namespace wenet { - -void SplitString(const std::string& str, std::vector* strs) { - SplitStringToVector(Trim(str), " \t", true, strs); -} - -void SplitStringToVector(const std::string& full, const char* delim, - bool omit_empty_strings, - std::vector* out) { - size_t start = 0, found = 0, end = full.size(); - out->clear(); - while (found != std::string::npos) { - found = full.find_first_of(delim, start); - // start != end condition is for when the delimiter is at the end - if (!omit_empty_strings || (found != start && start != end)) - out->push_back(full.substr(start, found - start)); - start = found + 1; - } -} - -void SplitUTF8StringToChars(const std::string& str, - std::vector* chars) { - chars->clear(); - int bytes = 1; - for (size_t i = 0; i < str.length(); i += bytes) { - assert((str[i] & 0xF8) <= 0xF0); - if ((str[i] & 0x80) == 0x00) { - // The first 128 characters (US-ASCII) in UTF-8 format only need one byte. - bytes = 1; - } else if ((str[i] & 0xE0) == 0xC0) { - // The next 1,920 characters need two bytes to encode, - // which covers the remainder of almost all Latin-script alphabets. - bytes = 2; - } else if ((str[i] & 0xF0) == 0xE0) { - // Three bytes are needed for characters in the rest of - // the Basic Multilingual Plane, which contains virtually all characters - // in common use, including most Chinese, Japanese and Korean characters. - bytes = 3; - } else if ((str[i] & 0xF8) == 0xF0) { - // Four bytes are needed for characters in the other planes of Unicode, - // which include less common CJK characters, various historic scripts, - // mathematical symbols, and emoji (pictographic symbols). 
- bytes = 4; - } - chars->push_back(str.substr(i, bytes)); - } -} - -int UTF8StringLength(const std::string& str) { - int len = 0; - int bytes = 1; - for (size_t i = 0; i < str.length(); i += bytes) { - if ((str[i] & 0x80) == 0x00) { - bytes = 1; - } else if ((str[i] & 0xE0) == 0xC0) { - bytes = 2; - } else if ((str[i] & 0xF0) == 0xE0) { - bytes = 3; - } else if ((str[i] & 0xF8) == 0xF0) { - bytes = 4; - } - ++len; - } - return len; -} - -bool CheckEnglishChar(const std::string& ch) { - // all english characters should be encoded in one byte - if (ch.size() != 1) return false; - // english words may contain apostrophe, i.e., "He's" - return isalpha(ch[0]) || ch[0] == '\''; -} - -bool CheckEnglishWord(const std::string& word) { - std::vector chars; - SplitUTF8StringToChars(word, &chars); - for (size_t k = 0; k < chars.size(); k++) { - if (!CheckEnglishChar(chars[k])) { - return false; - } - } - return true; -} - -std::string JoinString(const std::string& c, - const std::vector& strs) { - std::string result; - if (strs.size() > 0) { - for (int i = 0; i < strs.size() - 1; i++) { - result += (strs[i] + c); - } - result += strs.back(); - } - return result; -} - -bool IsAlpha(const std::string& str) { - for (size_t i = 0; i < str.size(); i++) { - if (!isalpha(str[i])) { - return false; - } - } - return true; -} - -std::string ProcessBlank(const std::string& str, bool lowercase) { - std::string result; - if (!str.empty()) { - std::vector chars; - SplitUTF8StringToChars(Trim(str), &chars); - - for (std::string& ch : chars) { - if (ch != kSpaceSymbol) { - result.append(ch); - } else { - // Ignore consecutive space or located in head - if (!result.empty() && result.back() != ' ') { - result.push_back(' '); - } - } - } - // Ignore tailing space - if (!result.empty() && result.back() == ' ') { - result.pop_back(); - } - // NOTE: convert string to wstring - // see issue 745: https://github.com/wenet-e2e/wenet/issues/745 - std::locale loc(""); - std::wstring_convert, wchar_t> converter; - std::wstring wsresult = converter.from_bytes(result); - for (auto& c : wsresult) { - c = lowercase ? tolower(c, loc) : toupper(c, loc); - } - result = converter.to_bytes(wsresult); - } - return result; -} - -std::string Ltrim(const std::string& str) { - size_t start = str.find_first_not_of(WHITESPACE); - return (start == std::string::npos) ? "" : str.substr(start); -} - -std::string Rtrim(const std::string& str) { - size_t end = str.find_last_not_of(WHITESPACE); - return (end == std::string::npos) ? 
"" : str.substr(0, end + 1); -} - -std::string Trim(const std::string& str) { return Rtrim(Ltrim(str)); } - -std::string JoinPath(const std::string& left, const std::string& right) { - std::string path(left); - if (path.size() && path.back() != '/') { - path.push_back('/'); - } - path.append(right); - return path; -} - -#ifdef _MSC_VER -std::wstring ToWString(const std::string& str) { - unsigned len = str.size() * 2; - setlocale(LC_CTYPE, ""); - wchar_t* p = new wchar_t[len]; - mbstowcs(p, str.c_str(), len); - std::wstring wstr(p); - delete[] p; - return wstr; -} -#endif - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/string.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/string.h deleted file mode 100644 index bf7a52ae09bce45ab7e34a5277652d7ae91bae1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/string.h +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTILS_STRING_H_ -#define UTILS_STRING_H_ - -#include -#include -#include -#include -#include - -#include "fst/symbol-table.h" - -namespace wenet { - -const char WHITESPACE[] = " \n\r\t\f\v"; - -// Split the string with space or tab. -void SplitString(const std::string& str, std::vector* strs); - -void SplitStringToVector(const std::string& full, const char* delim, - bool omit_empty_strings, - std::vector* out); - -// NOTE(Xingchen Song): we add this function to make it possible to -// support multilingual recipe in the future, in which characters of -// different languages are all encoded in UTF-8 format. -// UTF-8 REF: https://en.wikipedia.org/wiki/UTF-8#Encoding -// Split the UTF-8 string into chars. -void SplitUTF8StringToChars(const std::string& str, - std::vector* chars); - -int UTF8StringLength(const std::string& str); - -// Check whether the UTF-8 char is alphabet or '. -bool CheckEnglishChar(const std::string& ch); - -// Check whether the UTF-8 word is only contains alphabet or '. -bool CheckEnglishWord(const std::string& word); - -std::string JoinString(const std::string& c, - const std::vector& strs); - -bool IsAlpha(const std::string& str); - -// Split the UTF-8 string into words by symbol table. -// Return whether not contains oov. -bool SplitUTF8StringToWords( - const std::string& str, - const std::shared_ptr& symbol_table, - std::vector* words); - -// Replace ▁ with space, then remove head, tail and consecutive space. 
-std::string ProcessBlank(const std::string& str, bool lowercase);
-
-std::string Ltrim(const std::string& str);
-
-std::string Rtrim(const std::string& str);
-
-std::string Trim(const std::string& str);
-
-std::string JoinPath(const std::string& left, const std::string& right);
-
-#ifdef _MSC_VER
-std::wstring ToWString(const std::string& str);
-#endif
-
-}  // namespace wenet
-
-#endif  // UTILS_STRING_H_
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/thread_pool.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/thread_pool.h
deleted file mode 100644
index a78162995d90bf079ad091cf14cb9f2cd4476d05..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/thread_pool.h
+++ /dev/null
@@ -1,113 +0,0 @@
-// Copyright (c) 2012 Jakob Progsch, Václav Zeman
-
-// This software is provided 'as-is', without any express or implied
-// warranty. In no event will the authors be held liable for any damages
-// arising from the use of this software.
-
-// Permission is granted to anyone to use this software for any purpose,
-// including commercial applications, and to alter it and redistribute it
-// freely, subject to the following restrictions:
-
-// 1. The origin of this software must not be misrepresented; you must not
-// claim that you wrote the original software. If you use this software
-// in a product, an acknowledgment in the product documentation would be
-// appreciated but is not required.
-
-// 2. Altered source versions must be plainly marked as such, and must not be
-// misrepresented as being the original software.
-
-// 3. This notice may not be removed or altered from any source
-// distribution.
-
-#ifndef UTILS_THREAD_POOL_H_
-#define UTILS_THREAD_POOL_H_
-
-#include <condition_variable>
-#include <functional>
-#include <future>
-#include <memory>
-#include <mutex>
-#include <queue>
-#include <stdexcept>
-#include <thread>
-#include <utility>
-#include <vector>
-
-class ThreadPool {
- public:
-  explicit ThreadPool(size_t);
-  template <class F, class... Args>
-  auto enqueue(F&& f, Args&&... args)
-      -> std::future<typename std::result_of<F(Args...)>::type>;
-  ~ThreadPool();
-
- private:
-  // need to keep track of threads so we can join them
-  std::vector<std::thread> workers;
-  // the task queue
-  std::queue<std::function<void()> > tasks;
-
-  // synchronization
-  std::mutex queue_mutex;
-  std::condition_variable condition;
-  bool stop;
-};
-
-// the constructor just launches some amount of workers
-inline ThreadPool::ThreadPool(size_t threads) : stop(false) {
-  for (size_t i = 0; i < threads; ++i)
-    workers.emplace_back([this] {
-      for (;;) {
-        std::function<void()> task;
-
-        {
-          std::unique_lock<std::mutex> lock(this->queue_mutex);
-          this->condition.wait(
-              lock, [this] { return this->stop || !this->tasks.empty(); });
-          if (this->stop && this->tasks.empty()) return;
-          task = std::move(this->tasks.front());
-          this->tasks.pop();
-        }
-
-        task();
-      }
-    });
-}
-
-// add new work item to the pool
-template <class F, class... Args>
-auto ThreadPool::enqueue(F&& f, Args&&... args)
-    -> std::future<typename std::result_of<F(Args...)>::type> {
-  using return_type = typename std::result_of<F(Args...)>::type;
-
-  auto task = std::make_shared<std::packaged_task<return_type()> >(
-      std::bind(std::forward<F>(f), std::forward<Args>(args)...));
-
-  std::future<return_type> res = task->get_future();
-  {
-    std::unique_lock<std::mutex> lock(queue_mutex);
-
-    // don't allow enqueueing after stopping the pool
-    if (stop) {
-      throw std::runtime_error("enqueue on stopped ThreadPool");
-    }
-
-    tasks.emplace([task]() { (*task)(); });
-  }
-  condition.notify_one();
-  return res;
-}
-
-// the destructor joins all threads
-inline ThreadPool::~ThreadPool() {
-  {
-    std::unique_lock<std::mutex> lock(queue_mutex);
-    stop = true;
-  }
-  condition.notify_all();
-  for (std::thread& worker : workers) {
-    worker.join();
-  }
-}
-
-#endif  // UTILS_THREAD_POOL_H_
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/timer.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/timer.h
deleted file mode 100644
index 068519f98d140ba0eef68babe2ad2fdcb798c074..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/timer.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang)
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UTILS_TIMER_H_
-#define UTILS_TIMER_H_
-
-#include <chrono>
-
-namespace wenet {
-
-class Timer {
- public:
-  Timer() : time_start_(std::chrono::steady_clock::now()) {}
-  void Reset() { time_start_ = std::chrono::steady_clock::now(); }
-  // return int in milliseconds
-  int Elapsed() const {
-    auto time_now = std::chrono::steady_clock::now();
-    return std::chrono::duration_cast<std::chrono::milliseconds>(time_now -
-                                                                 time_start_)
-        .count();
-  }
-
- private:
-  std::chrono::time_point<std::chrono::steady_clock> time_start_;
-};
-}  // namespace wenet
-
-#endif  // UTILS_TIMER_H_
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/utils.cc b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/utils.cc
deleted file mode 100644
index c37e36c6e9f629e0a4b11cf21a791aefd58b659f..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/utils.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng)
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "utils/utils.h" - -#include -#include -#include -#include -#include -#include - -#include "utils/log.h" - -namespace wenet { - -float LogAdd(float x, float y) { - static float num_min = -std::numeric_limits::max(); - if (x <= num_min) return y; - if (y <= num_min) return x; - float xmax = std::max(x, y); - return std::log(std::exp(x - xmax) + std::exp(y - xmax)) + xmax; -} - -template -struct ValueComp { - bool operator()(const std::pair& lhs, - const std::pair& rhs) const { - return lhs.first > rhs.first || - (lhs.first == rhs.first && lhs.second < rhs.second); - } -}; - -// We refer the pytorch topk implementation -// https://github.com/pytorch/pytorch/blob/master/caffe2/operators/top_k.cc -template -void TopK(const std::vector& data, int32_t k, std::vector* values, - std::vector* indices) { - std::vector> heap_data; - int n = data.size(); - for (int32_t i = 0; i < k && i < n; ++i) { - heap_data.emplace_back(data[i], i); - } - std::priority_queue, std::vector>, - ValueComp> - pq(ValueComp(), std::move(heap_data)); - for (int32_t i = k; i < n; ++i) { - if (pq.top().first < data[i]) { - pq.pop(); - pq.emplace(data[i], i); - } - } - - values->resize(std::min(k, n)); - indices->resize(std::min(k, n)); - int32_t cur = values->size() - 1; - while (!pq.empty()) { - const auto& item = pq.top(); - (*values)[cur] = item.first; - (*indices)[cur] = item.second; - pq.pop(); - cur -= 1; - } -} - -template void TopK(const std::vector& data, int32_t k, - std::vector* values, - std::vector* indices); - -} // namespace wenet diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/utils.h b/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/utils.h deleted file mode 100644 index f9957c0b6e8ae27d9260e75cf55e786055827801..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/raspberrypi/utils/utils.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-
-#ifndef UTILS_UTILS_H_
-#define UTILS_UTILS_H_
-
-#include <cstdint>
-#include <limits>
-#include <vector>
-
-namespace wenet {
-
-#define WENET_DISALLOW_COPY_AND_ASSIGN(Type) \
-  Type(const Type&) = delete;                \
-  Type& operator=(const Type&) = delete;
-
-const float kFloatMax = std::numeric_limits<float>::max();
-// kSpaceSymbol in UTF-8 is: ▁
-const char kSpaceSymbol[] = "\xe2\x96\x81";
-
-// Return the sum of two probabilities in log scale
-float LogAdd(float x, float y);
-
-template <typename T>
-void TopK(const std::vector<T>& data, int32_t k, std::vector<T>* values,
-          std::vector<int32_t>* indices);
-
-}  // namespace wenet
-
-#endif  // UTILS_UTILS_H_
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/web/README.md b/models/audio/speech_recognition/conformer/igie/wenet/runtime/web/README.md
deleted file mode 100644
index 08da0ad2a4a67a4c197c7e29b48de6a652294952..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/web/README.md
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/bin/bash
-
-pip3 install transformers datasets h5py==3.1.0 tqdm argparse -U
-
-DIR_NAME='test_models'
-
-if [[ ! -d ${DIR_NAME} ]]; then
-    mkdir -p ${DIR_NAME}
-fi
-
-if [[ ! -f "${DIR_NAME}/vocab.txt" ]]; then
-    wget 'http://10.113.3.3/data/Model/bert_squad/vocab.txt' -P ${DIR_NAME}
-fi
-
-if [[ ! -f "${DIR_NAME}/train.json" ]]; then
-    wget 'http://10.113.3.3/data/Model/bert_squad/train.json' -P ${DIR_NAME}
-fi
-
-if [[ ! -f "${DIR_NAME}/dev.json" ]]; then
-    wget 'http://10.113.3.3/data/Model/bert_squad/dev.json' -P ${DIR_NAME}
-fi
-
-
-if [[ ! -f "${DIR_NAME}/tokenizer_config.json" ]]; then
-    wget 'http://10.113.3.3/data/Model/bert_squad/tokenizer_config.json' -P ${DIR_NAME}
-fi
-
-if [[ ! -f "${DIR_NAME}/config.json" ]]; then
-    wget 'http://10.113.3.3/data/Model/bert_squad/config.json' -P ${DIR_NAME}
-fi
-
-# model_name="base"
-
-# if [[ ${model_name} == 'base' ]]; then
-# if [[ ! -f "${DIR_NAME}/bert_base_quant.hdf5" ]]; then
-# wget 'http://10.113.3.3/data/Model/bert_squad/bert_base_quant.hdf5' -P ${DIR_NAME}
-# wget 'http://10.113.3.3/data/Model/bert_squad/bert_base_quant.hdf5.md5' -P ${DIR_NAME}
-# fi
-# fi
-
-# model_name="large"
-
-# if [[ ${model_name} == 'large' ]]; then
-# if [[ !
-f "${DIR_NAME}/bert_large_quant.hdf5" ]]; then -# wget 'http://10.113.3.3/data/Model/bert_squad/bert_large_quant.hdf5' -P ${DIR_NAME} -# wget 'http://10.113.3.3/data/Model/bert_squad/bert_large_quant.hdf5.md5' -P ${DIR_NAME} -# fi -# fi \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/web/app.py b/models/audio/speech_recognition/conformer/igie/wenet/runtime/web/app.py deleted file mode 100644 index 85b63efcf5c11a7410e847d14c3cb1c8db2995b5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/web/app.py +++ /dev/null @@ -1,22 +0,0 @@ -import json -import gradio as gr -import numpy as np -import torch -import wenetruntime as wenet - -torch.manual_seed(777) # for lint - -wenet.set_log_level(2) -decoder = wenet.Decoder(lang='chs') - -def recognition(audio): - sr, y = audio - assert sr in [48000, 16000] - if sr == 48000: # Optional resample to 16000 - y = (y / max(np.max(y), 1) * 32767)[::3].astype("int16") - ans = decoder.decode(y.tobytes(), True) - return json.loads(ans) - -text = "Speech Recognition in WeNet | 基于 WeNet 的语音识别" -gr.Interface(recognition, inputs="mic", outputs="json", - description=text).launch() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/runtime/web/requirements.txt b/models/audio/speech_recognition/conformer/igie/wenet/runtime/web/requirements.txt deleted file mode 100644 index ef9d340d3c798affa06d885d7f59ada58cd37e93..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/runtime/web/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -wenetruntime -gradio diff --git a/models/audio/speech_recognition/conformer/igie/wenet/test/resources/aishell2.words.txt b/models/audio/speech_recognition/conformer/igie/wenet/test/resources/aishell2.words.txt deleted file mode 100644 index 5478d9ad9ee70bc7e1f98a6f003b7f7260b9f1ef..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/test/resources/aishell2.words.txt +++ /dev/null @@ -1,5235 +0,0 @@ - 0 - 1 -' 2 -( 3 -) 4 -A 5 -𫖯 6 -a 7 -A 8 -b 9 -B 10 -c 11 -C 12 -d 13 -D 14 -e 15 -E 16 -f 17 -F 18 -g 19 -G 20 -h 21 -H 22 -i 23 -I 24 -j 25 -J 26 -k 27 -K 28 -l 29 -L 30 -m 31 -M 32 -n 33 -N 34 -o 35 -O 36 -p 37 -P 38 -q 39 -Q 40 -r 41 -R 42 -s 43 -S 44 - 45 -t 46 -T 47 -u 48 -U 49 -v 50 -V 51 -w 52 -W 53 -x 54 -X 55 -y 56 -Y 57 -z 58 -Z 59 -一 60 -丁 61 -七 62 -万 63 -丈 64 -三 65 -上 66 -下 67 -不 68 -与 69 -丐 70 -丑 71 -专 72 -且 73 -丕 74 -世 75 -丘 76 -丙 77 -业 78 -丛 79 -东 80 -丝 81 -丞 82 -丢 83 -两 84 -严 85 -丧 86 -个 87 -丫 88 -中 89 -丰 90 -串 91 -临 92 -丸 93 -丹 94 -为 95 -主 96 -丽 97 -举 98 -乃 99 -久 100 -么 101 -义 102 -之 103 -乌 104 -乍 105 -乎 106 -乏 107 -乐 108 -乒 109 -乓 110 -乔 111 -乖 112 -乘 113 -乙 114 -九 115 -乞 116 -也 117 -习 118 -乡 119 -书 120 -买 121 -乱 122 -乳 123 -乾 124 -了 125 -予 126 -争 127 -事 128 -二 129 -于 130 -亏 131 -云 132 -互 133 -五 134 -井 135 -亘 136 -亚 137 -些 138 -亟 139 -亡 140 -亢 141 -交 142 -亥 143 -亦 144 -产 145 -亨 146 -亩 147 -享 148 -京 149 -亭 150 -亮 151 -亲 152 -亳 153 -亵 154 -人 155 -亿 156 -什 157 -仁 158 -仄 159 -仅 160 -仆 161 -仇 162 -今 163 -介 164 -仍 165 -从 166 -仑 167 -仓 168 -仔 169 -仕 170 -他 171 -仗 172 -付 173 -仙 174 -仞 175 -仟 176 -仡 177 -代 178 -令 179 -以 180 -仨 181 -仪 182 -们 183 -仰 184 -仲 185 -件 186 -价 187 -任 188 -份 189 -仿 190 -企 191 -伉 192 -伊 193 -伍 194 -伎 195 -伏 196 -伐 197 -休 198 -众 199 -优 200 -伙 201 -会 202 -伞 203 -伟 204 -传 205 -伢 206 -伤 207 -伦 208 -伪 209 -伫 210 -伯 211 -估 212 -伴 213 -伶 214 -伸 215 -伺 216 -似 217 -伽 218 -佃 219 -但 220 -位 221 -低 222 -住 223 -佐 
224 -佑 225 -体 226 -何 227 -佗 228 -佘 229 -余 230 -佚 231 -佛 232 -作 233 -佝 234 -佟 235 -你 236 -佣 237 -佩 238 -佬 239 -佯 240 -佰 241 -佳 242 -佶 243 -佻 244 -佼 245 -使 246 -侃 247 -侄 248 -侈 249 -例 250 -侍 251 -侏 252 -侑 253 -侗 254 -供 255 -依 256 -侠 257 -侣 258 -侥 259 -侦 260 -侧 261 -侨 262 -侬 263 -侮 264 -侯 265 -侵 266 -便 267 -促 268 -俄 269 -俊 270 -俎 271 -俏 272 -俐 273 -俑 274 -俗 275 -俘 276 -俚 277 -保 278 -俞 279 -俟 280 -信 281 -俨 282 -俩 283 -俪 284 -俭 285 -修 286 -俯 287 -俱 288 -俸 289 -俺 290 -俾 291 -倌 292 -倍 293 -倒 294 -倔 295 -倘 296 -候 297 -倚 298 -倜 299 -借 300 -倡 301 -倦 302 -倩 303 -倪 304 -倭 305 -债 306 -值 307 -倾 308 -偃 309 -假 310 -偈 311 -偌 312 -偎 313 -偏 314 -偓 315 -偕 316 -做 317 -停 318 -健 319 -偶 320 -偷 321 -偻 322 -偿 323 -傀 324 -傅 325 -傍 326 -傣 327 -傥 328 -储 329 -催 330 -傲 331 -傻 332 -像 333 -僚 334 -僧 335 -僮 336 -僵 337 -僻 338 -儋 339 -儒 340 -儡 341 -儿 342 -兀 343 -允 344 -元 345 -兄 346 -充 347 -兆 348 -先 349 -光 350 -克 351 -免 352 -兑 353 -兔 354 -兖 355 -党 356 -兜 357 -兢 358 -入 359 -全 360 -八 361 -公 362 -六 363 -兮 364 -兰 365 -共 366 -关 367 -兴 368 -兵 369 -其 370 -具 371 -典 372 -兹 373 -养 374 -兼 375 -兽 376 -冀 377 -内 378 -冈 379 -冉 380 -册 381 -再 382 -冒 383 -冕 384 -冗 385 -写 386 -军 387 -农 388 -冠 389 -冢 390 -冤 391 -冥 392 -冬 393 -冯 394 -冰 395 -冲 396 -决 397 -况 398 -冶 399 -冷 400 -冻 401 -冼 402 -冽 403 -净 404 -凄 405 -准 406 -凇 407 -凉 408 -凋 409 -凌 410 -减 411 -凑 412 -凛 413 -凝 414 -几 415 -凡 416 -凤 417 -凭 418 -凯 419 -凰 420 -凳 421 -凶 422 -凸 423 -凹 424 -出 425 -击 426 -函 427 -凿 428 -刀 429 -刁 430 -刃 431 -分 432 -切 433 -刊 434 -刍 435 -刎 436 -刑 437 -划 438 -列 439 -刘 440 -则 441 -刚 442 -创 443 -初 444 -删 445 -判 446 -刨 447 -利 448 -别 449 -刮 450 -到 451 -制 452 -刷 453 -券 454 -刹 455 -刺 456 -刻 457 -刽 458 -剁 459 -剂 460 -剃 461 -削 462 -剌 463 -前 464 -剐 465 -剑 466 -剔 467 -剖 468 -剜 469 -剥 470 -剧 471 -剩 472 -剪 473 -副 474 -割 475 -剽 476 -剿 477 -劈 478 -力 479 -劝 480 -办 481 -功 482 -加 483 -务 484 -劣 485 -动 486 -助 487 -努 488 -劫 489 -劭 490 -励 491 -劲 492 -劳 493 -劵 494 -劾 495 -势 496 -勃 497 -勇 498 -勉 499 -勋 500 -勐 501 -勒 502 -勘 503 -募 504 -勤 505 -勺 506 -勾 507 -勿 508 -匀 509 -包 510 -匆 511 -匈 512 -匏 513 -匕 514 -化 515 -北 516 -匙 517 -匝 518 -匠 519 -匡 520 -匣 521 -匪 522 -匮 523 -匹 524 -区 525 -医 526 -匾 527 -匿 528 -十 529 -千 530 -升 531 -午 532 -卉 533 -半 534 -华 535 -协 536 -卑 537 -卒 538 -卓 539 -单 540 -卖 541 -南 542 -博 543 -卜 544 -卞 545 -占 546 -卡 547 -卢 548 -卤 549 -卦 550 -卧 551 -卫 552 -卯 553 -印 554 -危 555 -卲 556 -即 557 -却 558 -卵 559 -卷 560 -卸 561 -卿 562 -厂 563 -厄 564 -厅 565 -历 566 -厉 567 -压 568 -厌 569 -厕 570 -厘 571 -厚 572 -厝 573 -原 574 -厢 575 -厥 576 -厦 577 -厨 578 -厩 579 -厮 580 -去 581 -县 582 -叁 583 -参 584 -又 585 -叉 586 -及 587 -友 588 -双 589 -反 590 -发 591 -叔 592 -取 593 -受 594 -变 595 -叙 596 -叛 597 -叠 598 -口 599 -古 600 -句 601 -另 602 -叨 603 -叩 604 -只 605 -叫 606 -召 607 -叭 608 -叮 609 -可 610 -台 611 -叱 612 -史 613 -右 614 -叵 615 -叶 616 -号 617 -司 618 -叹 619 -叼 620 -叽 621 -吁 622 -吃 623 -各 624 -吆 625 -合 626 -吉 627 -吊 628 -吋 629 -同 630 -名 631 -后 632 -吏 633 -吐 634 -向 635 -吒 636 -吓 637 -吕 638 -吖 639 -吗 640 -君 641 -吝 642 -吞 643 -吟 644 -吠 645 -否 646 -吧 647 -吨 648 -吩 649 -含 650 -听 651 -吭 652 -吮 653 -启 654 -吱 655 -吴 656 -吵 657 -吸 658 -吹 659 -吻 660 -吼 661 -吾 662 -呀 663 -呃 664 -呆 665 -呈 666 -告 667 -呐 668 -呕 669 -呗 670 -员 671 -呛 672 -呜 673 -呢 674 -呦 675 -周 676 -呱 677 -呲 678 -味 679 -呵 680 -呷 681 -呸 682 -呻 683 -呼 684 -命 685 -咀 686 -咂 687 -咄 688 -咆 689 -咋 690 -和 691 -咎 692 -咏 693 -咐 694 -咒 695 -咔 696 -咕 697 -咖 698 -咘 699 -咙 700 -咚 701 -咝 702 -咣 703 -咤 704 -咦 705 -咧 706 -咨 707 -咩 708 -咪 709 -咫 710 -咬 711 -咭 712 -咯 713 -咱 714 -咳 715 -咸 716 -咻 717 -咽 718 -哀 719 -品 720 -哂 721 -哄 722 -哆 723 -哇 724 -哈 725 -哉 726 -响 727 -哎 728 -哐 729 -哑 730 -哒 731 
-哔 732 -哕 733 -哗 734 -哟 735 -哥 736 -哦 737 -哨 738 -哩 739 -哪 740 -哭 741 -哮 742 -哲 743 -哺 744 -哼 745 -哽 746 -唁 747 -唆 748 -唇 749 -唉 750 -唏 751 -唐 752 -唑 753 -唛 754 -唠 755 -唢 756 -唤 757 -唧 758 -唬 759 -售 760 -唯 761 -唰 762 -唱 763 -唳 764 -唷 765 -唾 766 -啃 767 -啄 768 -商 769 -啊 770 -啕 771 -啖 772 -啜 773 -啡 774 -啤 775 -啥 776 -啦 777 -啧 778 -啪 779 -啬 780 -啰 781 -啲 782 -啵 783 -啶 784 -啸 785 -啼 786 -啾 787 -喀 788 -喁 789 -喂 790 -喃 791 -善 792 -喆 793 -喇 794 -喉 795 -喊 796 -喋 797 -喔 798 -喘 799 -喜 800 -喝 801 -喟 802 -喧 803 -喱 804 -喳 805 -喵 806 -喷 807 -喻 808 -喽 809 -嗄 810 -嗅 811 -嗑 812 -嗒 813 -嗓 814 -嗔 815 -嗖 816 -嗜 817 -嗝 818 -嗡 819 -嗣 820 -嗤 821 -嗦 822 -嗨 823 -嗪 824 -嗫 825 -嗬 826 -嗯 827 -嗲 828 -嗷 829 -嗽 830 -嘀 831 -嘈 832 -嘉 833 -嘎 834 -嘏 835 -嘘 836 -嘛 837 -嘞 838 -嘟 839 -嘣 840 -嘭 841 -嘱 842 -嘲 843 -嘴 844 -嘶 845 -嘹 846 -嘻 847 -嘿 848 -噌 849 -噎 850 -噗 851 -噘 852 -噙 853 -噜 854 -噢 855 -噤 856 -器 857 -噩 858 -噪 859 -噬 860 -噱 861 -噶 862 -噻 863 -噼 864 -嚎 865 -嚏 866 -嚓 867 -嚣 868 -嚷 869 -嚼 870 -囊 871 -囍 872 -囔 873 -囗 874 -囚 875 -四 876 -回 877 -因 878 -团 879 -囤 880 -囧 881 -囫 882 -园 883 -囯 884 -困 885 -囱 886 -围 887 -囵 888 -囹 889 -固 890 -国 891 -图 892 -圃 893 -圄 894 -圆 895 -圈 896 -土 897 -圣 898 -在 899 -圩 900 -圪 901 -圭 902 -地 903 -圳 904 -圹 905 -场 906 -圻 907 -圾 908 -址 909 -坂 910 -均 911 -坊 912 -坍 913 -坎 914 -坏 915 -坐 916 -坑 917 -块 918 -坚 919 -坛 920 -坝 921 -坞 922 -坟 923 -坠 924 -坡 925 -坤 926 -坦 927 -坨 928 -坩 929 -坪 930 -坭 931 -坯 932 -坳 933 -坷 934 -坻 935 -垂 936 -垃 937 -垄 938 -垅 939 -型 940 -垌 941 -垒 942 -垚 943 -垛 944 -垡 945 -垢 946 -垣 947 -垤 948 -垦 949 -垩 950 -垫 951 -垭 952 -垮 953 -埂 954 -埃 955 -埇 956 -埋 957 -城 958 -埔 959 -埕 960 -埚 961 -埝 962 -域 963 -埠 964 -埭 965 -埸 966 -培 967 -基 968 -堀 969 -堂 970 -堃 971 -堆 972 -堇 973 -堕 974 -堡 975 -堤 976 -堪 977 -堰 978 -堵 979 -堺 980 -塌 981 -塍 982 -塑 983 -塔 984 -塘 985 -塞 986 -填 987 -塬 988 -塾 989 -境 990 -墅 991 -墉 992 -墓 993 -増 994 -墙 995 -增 996 -墟 997 -墨 998 -墩 999 -壁 1000 -壑 1001 -壕 1002 -壤 1003 -士 1004 -壬 1005 -壮 1006 -声 1007 -壳 1008 -壶 1009 -壹 1010 -处 1011 -备 1012 -复 1013 -夏 1014 -夔 1015 -夕 1016 -外 1017 -夙 1018 -多 1019 -夜 1020 -够 1021 -大 1022 -天 1023 -太 1024 -夫 1025 -夭 1026 -央 1027 -夯 1028 -失 1029 -头 1030 -夷 1031 -夸 1032 -夹 1033 -夺 1034 -奁 1035 -奂 1036 -奄 1037 -奇 1038 -奈 1039 -奉 1040 -奋 1041 -奎 1042 -奏 1043 -契 1044 -奔 1045 -奕 1046 -奖 1047 -套 1048 -奘 1049 -奚 1050 -奠 1051 -奢 1052 -奥 1053 -女 1054 -奴 1055 -奶 1056 -奸 1057 -她 1058 -好 1059 -如 1060 -妃 1061 -妄 1062 -妆 1063 -妇 1064 -妈 1065 -妊 1066 -妍 1067 -妒 1068 -妓 1069 -妖 1070 -妙 1071 -妞 1072 -妤 1073 -妥 1074 -妨 1075 -妩 1076 -妪 1077 -妫 1078 -妮 1079 -妯 1080 -妲 1081 -妹 1082 -妻 1083 -妾 1084 -姆 1085 -姊 1086 -始 1087 -姐 1088 -姑 1089 -姓 1090 -委 1091 -姗 1092 -姚 1093 -姜 1094 -姝 1095 -姣 1096 -姥 1097 -姨 1098 -姬 1099 -姻 1100 -姿 1101 -威 1102 -娃 1103 -娄 1104 -娅 1105 -娆 1106 -娇 1107 -娈 1108 -娉 1109 -娌 1110 -娓 1111 -娘 1112 -娜 1113 -娟 1114 -娠 1115 -娣 1116 -娥 1117 -娩 1118 -娱 1119 -娲 1120 -娴 1121 -娶 1122 -娼 1123 -婀 1124 -婆 1125 -婉 1126 -婊 1127 -婕 1128 -婚 1129 -婢 1130 -婧 1131 -婪 1132 -婴 1133 -婵 1134 -婶 1135 -婷 1136 -婺 1137 -婿 1138 -媒 1139 -媚 1140 -媛 1141 -媞 1142 -媲 1143 -媳 1144 -嫁 1145 -嫂 1146 -嫉 1147 -嫌 1148 -嫒 1149 -嫔 1150 -嫖 1151 -嫚 1152 -嫡 1153 -嫣 1154 -嫦 1155 -嫩 1156 -嫫 1157 -嬅 1158 -嬉 1159 -嬗 1160 -嬛 1161 -嬴 1162 -嬷 1163 -孀 1164 -子 1165 -孑 1166 -孔 1167 -孕 1168 -字 1169 -存 1170 -孙 1171 -孚 1172 -孛 1173 -孜 1174 -孝 1175 -孟 1176 -孢 1177 -季 1178 -孤 1179 -学 1180 -孩 1181 -孪 1182 -孬 1183 -孰 1184 -孱 1185 -孳 1186 -孵 1187 -孺 1188 -孽 1189 -宁 1190 -它 1191 -宅 1192 -宇 1193 -守 1194 -安 1195 -宋 1196 -完 1197 -宏 1198 -宓 1199 -宕 1200 -宗 1201 -官 1202 -宙 1203 -定 1204 -宛 1205 -宜 1206 -宝 1207 -实 1208 -宠 
1209 -审 1210 -客 1211 -宣 1212 -室 1213 -宥 1214 -宦 1215 -宪 1216 -宫 1217 -宰 1218 -害 1219 -宴 1220 -宵 1221 -家 1222 -宸 1223 -容 1224 -宽 1225 -宾 1226 -宿 1227 -寂 1228 -寄 1229 -寅 1230 -密 1231 -寇 1232 -富 1233 -寐 1234 -寒 1235 -寓 1236 -寝 1237 -寞 1238 -察 1239 -寡 1240 -寥 1241 -寨 1242 -寮 1243 -寰 1244 -寸 1245 -对 1246 -寺 1247 -寻 1248 -导 1249 -寿 1250 -封 1251 -射 1252 -尅 1253 -将 1254 -尉 1255 -尊 1256 -小 1257 -少 1258 -尔 1259 -尕 1260 -尖 1261 -尘 1262 -尚 1263 -尝 1264 -尤 1265 -尧 1266 -尬 1267 -就 1268 -尴 1269 -尸 1270 -尹 1271 -尺 1272 -尼 1273 -尽 1274 -尾 1275 -尿 1276 -局 1277 -屁 1278 -层 1279 -居 1280 -屈 1281 -屉 1282 -届 1283 -屋 1284 -屌 1285 -屎 1286 -屏 1287 -屐 1288 -屑 1289 -展 1290 -属 1291 -屠 1292 -屡 1293 -履 1294 -屯 1295 -山 1296 -屹 1297 -屿 1298 -岁 1299 -岂 1300 -岌 1301 -岐 1302 -岑 1303 -岔 1304 -岖 1305 -岗 1306 -岚 1307 -岛 1308 -岩 1309 -岬 1310 -岭 1311 -岱 1312 -岳 1313 -岷 1314 -岸 1315 -峁 1316 -峋 1317 -峒 1318 -峙 1319 -峡 1320 -峥 1321 -峦 1322 -峨 1323 -峪 1324 -峭 1325 -峰 1326 -峻 1327 -崂 1328 -崃 1329 -崆 1330 -崇 1331 -崎 1332 -崔 1333 -崖 1334 -崛 1335 -崧 1336 -崩 1337 -崭 1338 -崮 1339 -崴 1340 -崽 1341 -嵇 1342 -嵊 1343 -嵋 1344 -嵌 1345 -嵘 1346 -嵛 1347 -嵩 1348 -嵬 1349 -嶂 1350 -嶙 1351 -嶝 1352 -巅 1353 -巍 1354 -川 1355 -州 1356 -巡 1357 -巢 1358 -工 1359 -左 1360 -巧 1361 -巨 1362 -巩 1363 -巫 1364 -差 1365 -己 1366 -已 1367 -巳 1368 -巴 1369 -巷 1370 -巾 1371 -币 1372 -市 1373 -布 1374 -帅 1375 -帆 1376 -师 1377 -希 1378 -帐 1379 -帕 1380 -帖 1381 -帘 1382 -帚 1383 -帛 1384 -帜 1385 -帝 1386 -带 1387 -帧 1388 -席 1389 -帮 1390 -帷 1391 -常 1392 -帼 1393 -帽 1394 -幂 1395 -幄 1396 -幅 1397 -幌 1398 -幔 1399 -幕 1400 -幡 1401 -幢 1402 -干 1403 -平 1404 -年 1405 -并 1406 -幸 1407 -幺 1408 -幻 1409 -幼 1410 -幽 1411 -广 1412 -庄 1413 -庆 1414 -庇 1415 -床 1416 -序 1417 -庐 1418 -库 1419 -应 1420 -底 1421 -庖 1422 -店 1423 -庙 1424 -庚 1425 -府 1426 -庞 1427 -废 1428 -度 1429 -座 1430 -庭 1431 -庵 1432 -庶 1433 -康 1434 -庸 1435 -庹 1436 -庾 1437 -廉 1438 -廊 1439 -廓 1440 -廖 1441 -延 1442 -廷 1443 -建 1444 -开 1445 -异 1446 -弃 1447 -弄 1448 -弈 1449 -弊 1450 -弋 1451 -式 1452 -弑 1453 -弓 1454 -引 1455 -弗 1456 -弘 1457 -弛 1458 -弟 1459 -张 1460 -弥 1461 -弦 1462 -弧 1463 -弩 1464 -弭 1465 -弯 1466 -弱 1467 -弹 1468 -强 1469 -弼 1470 -归 1471 -当 1472 -录 1473 -彗 1474 -彝 1475 -形 1476 -彤 1477 -彦 1478 -彩 1479 -彪 1480 -彬 1481 -彭 1482 -彰 1483 -影 1484 -彷 1485 -役 1486 -彻 1487 -彼 1488 -往 1489 -征 1490 -径 1491 -待 1492 -徇 1493 -很 1494 -徉 1495 -徊 1496 -律 1497 -徐 1498 -徒 1499 -得 1500 -徘 1501 -徙 1502 -徜 1503 -御 1504 -徨 1505 -循 1506 -微 1507 -德 1508 -徽 1509 -心 1510 -必 1511 -忆 1512 -忌 1513 -忍 1514 -忏 1515 -忐 1516 -忑 1517 -忒 1518 -忖 1519 -志 1520 -忘 1521 -忙 1522 -忠 1523 -忡 1524 -忤 1525 -忧 1526 -忪 1527 -快 1528 -忱 1529 -念 1530 -忻 1531 -忽 1532 -忿 1533 -怀 1534 -态 1535 -怂 1536 -怄 1537 -怅 1538 -怆 1539 -怎 1540 -怒 1541 -怕 1542 -怖 1543 -怜 1544 -思 1545 -怠 1546 -怡 1547 -急 1548 -怦 1549 -性 1550 -怨 1551 -怪 1552 -怫 1553 -怯 1554 -怵 1555 -总 1556 -怼 1557 -怿 1558 -恁 1559 -恃 1560 -恋 1561 -恍 1562 -恐 1563 -恒 1564 -恕 1565 -恙 1566 -恢 1567 -恣 1568 -恤 1569 -恨 1570 -恩 1571 -恪 1572 -恬 1573 -恭 1574 -息 1575 -恰 1576 -恳 1577 -恶 1578 -恸 1579 -恺 1580 -恻 1581 -恼 1582 -恿 1583 -悄 1584 -悉 1585 -悌 1586 -悍 1587 -悔 1588 -悖 1589 -悚 1590 -悟 1591 -悠 1592 -患 1593 -悦 1594 -您 1595 -悬 1596 -悭 1597 -悯 1598 -悱 1599 -悲 1600 -悴 1601 -悸 1602 -悻 1603 -悼 1604 -情 1605 -惆 1606 -惊 1607 -惋 1608 -惑 1609 -惕 1610 -惚 1611 -惜 1612 -惟 1613 -惠 1614 -惦 1615 -惧 1616 -惨 1617 -惩 1618 -惫 1619 -惬 1620 -惭 1621 -惮 1622 -惯 1623 -惰 1624 -想 1625 -惶 1626 -惹 1627 -惺 1628 -愁 1629 -愈 1630 -愉 1631 -意 1632 -愕 1633 -愚 1634 -感 1635 -愣 1636 -愤 1637 -愧 1638 -愫 1639 -愿 1640 -慈 1641 -慌 1642 -慎 1643 -慑 1644 -慕 1645 -慢 1646 -慧 1647 -慨 1648 -慰 1649 -慵 1650 -慷 1651 -憋 1652 -憎 
1653 -憔 1654 -憧 1655 -憨 1656 -憩 1657 -憬 1658 -憷 1659 -憾 1660 -懂 1661 -懈 1662 -懊 1663 -懋 1664 -懑 1665 -懒 1666 -懦 1667 -懵 1668 -懿 1669 -戈 1670 -戊 1671 -戌 1672 -戍 1673 -戎 1674 -戏 1675 -成 1676 -我 1677 -戒 1678 -或 1679 -戗 1680 -战 1681 -戚 1682 -戛 1683 -戟 1684 -截 1685 -戬 1686 -戮 1687 -戳 1688 -戴 1689 -户 1690 -戾 1691 -房 1692 -所 1693 -扁 1694 -扇 1695 -扈 1696 -扉 1697 -手 1698 -才 1699 -扎 1700 -扑 1701 -扒 1702 -打 1703 -扔 1704 -托 1705 -扛 1706 -扞 1707 -扣 1708 -扦 1709 -执 1710 -扩 1711 -扪 1712 -扫 1713 -扬 1714 -扭 1715 -扮 1716 -扯 1717 -扰 1718 -扳 1719 -扶 1720 -批 1721 -扼 1722 -找 1723 -承 1724 -技 1725 -抄 1726 -抉 1727 -把 1728 -抑 1729 -抒 1730 -抓 1731 -投 1732 -抖 1733 -抗 1734 -折 1735 -抚 1736 -抛 1737 -抠 1738 -抡 1739 -抢 1740 -护 1741 -报 1742 -抨 1743 -披 1744 -抬 1745 -抱 1746 -抵 1747 -抹 1748 -押 1749 -抽 1750 -抿 1751 -拂 1752 -拄 1753 -担 1754 -拆 1755 -拇 1756 -拈 1757 -拉 1758 -拌 1759 -拍 1760 -拎 1761 -拐 1762 -拒 1763 -拓 1764 -拔 1765 -拖 1766 -拗 1767 -拘 1768 -拙 1769 -拚 1770 -招 1771 -拜 1772 -拟 1773 -拢 1774 -拣 1775 -拥 1776 -拦 1777 -拧 1778 -拨 1779 -择 1780 -括 1781 -拭 1782 -拮 1783 -拯 1784 -拱 1785 -拳 1786 -拴 1787 -拷 1788 -拼 1789 -拽 1790 -拾 1791 -拿 1792 -持 1793 -挂 1794 -指 1795 -按 1796 -挎 1797 -挑 1798 -挖 1799 -挚 1800 -挛 1801 -挝 1802 -挞 1803 -挟 1804 -挠 1805 -挡 1806 -挣 1807 -挤 1808 -挥 1809 -挨 1810 -挪 1811 -挫 1812 -振 1813 -挺 1814 -挽 1815 -捂 1816 -捅 1817 -捆 1818 -捉 1819 -捋 1820 -捌 1821 -捍 1822 -捎 1823 -捏 1824 -捐 1825 -捕 1826 -捞 1827 -损 1828 -捡 1829 -换 1830 -捣 1831 -捧 1832 -据 1833 -捶 1834 -捷 1835 -捺 1836 -捻 1837 -掀 1838 -掂 1839 -掇 1840 -授 1841 -掉 1842 -掌 1843 -掏 1844 -掐 1845 -排 1846 -掖 1847 -掘 1848 -掠 1849 -探 1850 -掣 1851 -接 1852 -控 1853 -推 1854 -掩 1855 -措 1856 -掬 1857 -掮 1858 -掰 1859 -掳 1860 -掴 1861 -掷 1862 -掸 1863 -掺 1864 -揄 1865 -揉 1866 -揍 1867 -描 1868 -提 1869 -插 1870 -握 1871 -揣 1872 -揩 1873 -揪 1874 -揭 1875 -援 1876 -揶 1877 -揽 1878 -搀 1879 -搁 1880 -搂 1881 -搅 1882 -搏 1883 -搐 1884 -搓 1885 -搔 1886 -搜 1887 -搞 1888 -搡 1889 -搧 1890 -搪 1891 -搬 1892 -搭 1893 -携 1894 -搽 1895 -摁 1896 -摄 1897 -摆 1898 -摇 1899 -摈 1900 -摊 1901 -摒 1902 -摔 1903 -摘 1904 -摞 1905 -摧 1906 -摩 1907 -摸 1908 -摹 1909 -撂 1910 -撅 1911 -撇 1912 -撑 1913 -撒 1914 -撕 1915 -撞 1916 -撤 1917 -撩 1918 -撬 1919 -播 1920 -撮 1921 -撰 1922 -撵 1923 -撸 1924 -撺 1925 -撼 1926 -擀 1927 -擂 1928 -擅 1929 -操 1930 -擎 1931 -擒 1932 -擘 1933 -擞 1934 -擢 1935 -擦 1936 -攀 1937 -攒 1938 -攘 1939 -攥 1940 -攫 1941 -支 1942 -收 1943 -攸 1944 -改 1945 -攻 1946 -放 1947 -政 1948 -故 1949 -效 1950 -敌 1951 -敏 1952 -救 1953 -敕 1954 -敖 1955 -教 1956 -敛 1957 -敝 1958 -敞 1959 -敢 1960 -散 1961 -敦 1962 -敬 1963 -数 1964 -敲 1965 -整 1966 -敷 1967 -文 1968 -斋 1969 -斌 1970 -斐 1971 -斑 1972 -斓 1973 -斗 1974 -料 1975 -斛 1976 -斜 1977 -斟 1978 -斡 1979 -斤 1980 -斥 1981 -斧 1982 -斩 1983 -断 1984 -斯 1985 -新 1986 -方 1987 -施 1988 -旁 1989 -旅 1990 -旋 1991 -旌 1992 -族 1993 -旖 1994 -旗 1995 -无 1996 -既 1997 -日 1998 -旦 1999 -旧 2000 -旨 2001 -早 2002 -旬 2003 -旭 2004 -旮 2005 -旯 2006 -旱 2007 -时 2008 -旷 2009 -旺 2010 -旻 2011 -昀 2012 -昂 2013 -昆 2014 -昊 2015 -昌 2016 -明 2017 -昏 2018 -易 2019 -昔 2020 -昕 2021 -昙 2022 -昝 2023 -星 2024 -映 2025 -春 2026 -昧 2027 -昨 2028 -昭 2029 -是 2030 -昱 2031 -昴 2032 -昵 2033 -昶 2034 -昼 2035 -显 2036 -晃 2037 -晋 2038 -晌 2039 -晏 2040 -晒 2041 -晓 2042 -晔 2043 -晕 2044 -晖 2045 -晗 2046 -晚 2047 -晞 2048 -晟 2049 -晤 2050 -晦 2051 -晨 2052 -普 2053 -景 2054 -晰 2055 -晴 2056 -晶 2057 -晷 2058 -智 2059 -晾 2060 -暂 2061 -暄 2062 -暇 2063 -暌 2064 -暑 2065 -暖 2066 -暗 2067 -暧 2068 -暨 2069 -暮 2070 -暴 2071 -暹 2072 -暾 2073 -曈 2074 -曙 2075 -曜 2076 -曝 2077 -曦 2078 -曰 2079 -曲 2080 -曳 2081 -更 2082 -曹 2083 -曼 2084 -曾 2085 -替 2086 -最 2087 -月 2088 -有 2089 -朋 2090 -服 2091 -朐 2092 -朔 2093 -朕 2094 -朗 2095 -望 2096 -朝 
2097 -期 2098 -朦 2099 -木 2100 -未 2101 -末 2102 -本 2103 -札 2104 -术 2105 -朱 2106 -朴 2107 -朵 2108 -机 2109 -朽 2110 -杀 2111 -杂 2112 -权 2113 -杆 2114 -杈 2115 -杉 2116 -李 2117 -杏 2118 -材 2119 -村 2120 -杓 2121 -杖 2122 -杜 2123 -杞 2124 -束 2125 -杠 2126 -条 2127 -来 2128 -杨 2129 -杭 2130 -杯 2131 -杰 2132 -杳 2133 -杵 2134 -杷 2135 -松 2136 -板 2137 -极 2138 -构 2139 -枇 2140 -枉 2141 -枋 2142 -析 2143 -枕 2144 -林 2145 -枚 2146 -果 2147 -枝 2148 -枞 2149 -枢 2150 -枣 2151 -枥 2152 -枪 2153 -枫 2154 -枭 2155 -枯 2156 -枰 2157 -枳 2158 -架 2159 -枷 2160 -枸 2161 -柃 2162 -柄 2163 -柏 2164 -某 2165 -柑 2166 -柒 2167 -染 2168 -柔 2169 -柘 2170 -柚 2171 -柜 2172 -柞 2173 -柠 2174 -查 2175 -柩 2176 -柬 2177 -柯 2178 -柱 2179 -柳 2180 -柴 2181 -柿 2182 -栀 2183 -栅 2184 -标 2185 -栈 2186 -栋 2187 -栌 2188 -栎 2189 -栏 2190 -树 2191 -栓 2192 -栖 2193 -栗 2194 -校 2195 -栩 2196 -株 2197 -样 2198 -核 2199 -根 2200 -格 2201 -栽 2202 -栾 2203 -桁 2204 -桂 2205 -桃 2206 -框 2207 -案 2208 -桉 2209 -桌 2210 -桎 2211 -桐 2212 -桑 2213 -桓 2214 -桔 2215 -桠 2216 -桢 2217 -档 2218 -桥 2219 -桦 2220 -桨 2221 -桩 2222 -桴 2223 -桶 2224 -桷 2225 -梁 2226 -梅 2227 -梆 2228 -梏 2229 -梓 2230 -梗 2231 -梢 2232 -梦 2233 -梧 2234 -梨 2235 -梭 2236 -梯 2237 -械 2238 -梳 2239 -梵 2240 -检 2241 -棂 2242 -棉 2243 -棋 2244 -棍 2245 -棒 2246 -棕 2247 -棘 2248 -棚 2249 -棠 2250 -棣 2251 -森 2252 -棱 2253 -棵 2254 -棺 2255 -椁 2256 -椅 2257 -椋 2258 -植 2259 -椎 2260 -椒 2261 -椟 2262 -椤 2263 -椭 2264 -椰 2265 -椴 2266 -椹 2267 -椿 2268 -楂 2269 -楔 2270 -楚 2271 -楞 2272 -楠 2273 -楣 2274 -楷 2275 -楸 2276 -楼 2277 -概 2278 -榄 2279 -榆 2280 -榈 2281 -榉 2282 -榔 2283 -榕 2284 -榛 2285 -榜 2286 -榨 2287 -榫 2288 -榭 2289 -榴 2290 -榷 2291 -榻 2292 -槃 2293 -槌 2294 -槎 2295 -槐 2296 -槛 2297 -槟 2298 -槭 2299 -槽 2300 -槿 2301 -樊 2302 -樟 2303 -模 2304 -樨 2305 -横 2306 -樯 2307 -樱 2308 -樵 2309 -樽 2310 -樾 2311 -橄 2312 -橇 2313 -橐 2314 -橘 2315 -橙 2316 -橡 2317 -橱 2318 -檀 2319 -檐 2320 -檗 2321 -檬 2322 -欠 2323 -次 2324 -欢 2325 -欣 2326 -欧 2327 -欲 2328 -欸 2329 -欺 2330 -款 2331 -歆 2332 -歇 2333 -歉 2334 -歌 2335 -歙 2336 -止 2337 -正 2338 -此 2339 -步 2340 -武 2341 -歧 2342 -歩 2343 -歪 2344 -歹 2345 -死 2346 -歼 2347 -殁 2348 -殃 2349 -殆 2350 -殇 2351 -殉 2352 -殊 2353 -残 2354 -殒 2355 -殓 2356 -殖 2357 -殚 2358 -殡 2359 -殴 2360 -段 2361 -殷 2362 -殿 2363 -毁 2364 -毂 2365 -毅 2366 -毋 2367 -母 2368 -每 2369 -毒 2370 -毓 2371 -比 2372 -毕 2373 -毗 2374 -毙 2375 -毛 2376 -毡 2377 -毫 2378 -毯 2379 -毽 2380 -氏 2381 -民 2382 -氓 2383 -气 2384 -氚 2385 -氛 2386 -氟 2387 -氢 2388 -氤 2389 -氦 2390 -氧 2391 -氨 2392 -氪 2393 -氮 2394 -氯 2395 -氰 2396 -氲 2397 -水 2398 -永 2399 -汀 2400 -汁 2401 -求 2402 -汇 2403 -汉 2404 -汊 2405 -汐 2406 -汕 2407 -汗 2408 -汛 2409 -汝 2410 -汞 2411 -江 2412 -池 2413 -污 2414 -汤 2415 -汨 2416 -汩 2417 -汪 2418 -汰 2419 -汲 2420 -汴 2421 -汶 2422 -汹 2423 -汽 2424 -汾 2425 -沁 2426 -沂 2427 -沃 2428 -沅 2429 -沈 2430 -沉 2431 -沌 2432 -沏 2433 -沐 2434 -沓 2435 -沙 2436 -沛 2437 -沟 2438 -没 2439 -沢 2440 -沣 2441 -沥 2442 -沦 2443 -沧 2444 -沪 2445 -沫 2446 -沭 2447 -沮 2448 -沱 2449 -河 2450 -沸 2451 -油 2452 -治 2453 -沼 2454 -沽 2455 -沾 2456 -沿 2457 -泄 2458 -泉 2459 -泊 2460 -泌 2461 -泓 2462 -泔 2463 -法 2464 -泖 2465 -泗 2466 -泛 2467 -泞 2468 -泠 2469 -泡 2470 -波 2471 -泣 2472 -泥 2473 -注 2474 -泪 2475 -泫 2476 -泮 2477 -泯 2478 -泰 2479 -泱 2480 -泳 2481 -泵 2482 -泷 2483 -泸 2484 -泺 2485 -泻 2486 -泼 2487 -泽 2488 -泾 2489 -洁 2490 -洋 2491 -洒 2492 -洗 2493 -洙 2494 -洛 2495 -洞 2496 -津 2497 -洪 2498 -洮 2499 -洱 2500 -洲 2501 -洵 2502 -洹 2503 -洺 2504 -活 2505 -洼 2506 -洽 2507 -派 2508 -流 2509 -浃 2510 -浅 2511 -浆 2512 -浇 2513 -浈 2514 -浊 2515 -测 2516 -济 2517 -浏 2518 -浐 2519 -浑 2520 -浒 2521 -浓 2522 -浔 2523 -浙 2524 -浚 2525 -浜 2526 -浠 2527 -浣 2528 -浦 2529 -浩 2530 -浪 2531 -浮 2532 -浴 2533 -海 2534 -浸 2535 -涂 2536 -涅 2537 -消 2538 -涉 2539 -涌 2540 -涎 
2541 -涑 2542 -涓 2543 -涕 2544 -涛 2545 -涝 2546 -涞 2547 -涟 2548 -涠 2549 -涡 2550 -涣 2551 -涤 2552 -润 2553 -涧 2554 -涨 2555 -涩 2556 -涪 2557 -涮 2558 -涯 2559 -液 2560 -涵 2561 -涸 2562 -涿 2563 -淀 2564 -淄 2565 -淅 2566 -淆 2567 -淇 2568 -淋 2569 -淌 2570 -淑 2571 -淖 2572 -淘 2573 -淝 2574 -淞 2575 -淡 2576 -淤 2577 -淦 2578 -淫 2579 -淬 2580 -淮 2581 -深 2582 -淳 2583 -混 2584 -淹 2585 -添 2586 -淼 2587 -清 2588 -渊 2589 -渌 2590 -渍 2591 -渎 2592 -渐 2593 -渑 2594 -渔 2595 -渗 2596 -渚 2597 -渝 2598 -渠 2599 -渡 2600 -渣 2601 -渤 2602 -渥 2603 -温 2604 -渭 2605 -港 2606 -渲 2607 -渴 2608 -游 2609 -渺 2610 -湃 2611 -湄 2612 -湉 2613 -湍 2614 -湎 2615 -湖 2616 -湘 2617 -湛 2618 -湫 2619 -湾 2620 -湿 2621 -溃 2622 -溅 2623 -溆 2624 -溉 2625 -溏 2626 -源 2627 -溜 2628 -溟 2629 -溢 2630 -溥 2631 -溧 2632 -溪 2633 -溯 2634 -溶 2635 -溺 2636 -滁 2637 -滇 2638 -滋 2639 -滑 2640 -滔 2641 -滕 2642 -滘 2643 -滚 2644 -滞 2645 -满 2646 -滢 2647 -滤 2648 -滥 2649 -滦 2650 -滨 2651 -滩 2652 -滴 2653 -滹 2654 -漂 2655 -漆 2656 -漉 2657 -漏 2658 -漓 2659 -演 2660 -漕 2661 -漠 2662 -漩 2663 -漪 2664 -漫 2665 -漭 2666 -漯 2667 -漱 2668 -漳 2669 -漾 2670 -潆 2671 -潇 2672 -潋 2673 -潍 2674 -潘 2675 -潜 2676 -潞 2677 -潢 2678 -潦 2679 -潭 2680 -潮 2681 -潸 2682 -潺 2683 -潼 2684 -澄 2685 -澈 2686 -澍 2687 -澎 2688 -澜 2689 -澡 2690 -澧 2691 -澳 2692 -澶 2693 -激 2694 -濂 2695 -濑 2696 -濒 2697 -濠 2698 -濡 2699 -濮 2700 -濯 2701 -瀑 2702 -瀚 2703 -瀛 2704 -灌 2705 -灏 2706 -灞 2707 -火 2708 -灭 2709 -灯 2710 -灰 2711 -灵 2712 -灶 2713 -灸 2714 -灼 2715 -灾 2716 -灿 2717 -炀 2718 -炅 2719 -炉 2720 -炊 2721 -炎 2722 -炒 2723 -炔 2724 -炕 2725 -炖 2726 -炙 2727 -炜 2728 -炫 2729 -炬 2730 -炭 2731 -炮 2732 -炯 2733 -炳 2734 -炷 2735 -炸 2736 -点 2737 -炼 2738 -炽 2739 -烀 2740 -烁 2741 -烂 2742 -烃 2743 -烈 2744 -烊 2745 -烘 2746 -烙 2747 -烛 2748 -烟 2749 -烤 2750 -烦 2751 -烧 2752 -烨 2753 -烩 2754 -烫 2755 -烬 2756 -热 2757 -烯 2758 -烷 2759 -烹 2760 -烽 2761 -焉 2762 -焊 2763 -焓 2764 -焕 2765 -焖 2766 -焗 2767 -焘 2768 -焙 2769 -焚 2770 -焦 2771 -焯 2772 -焰 2773 -焱 2774 -然 2775 -煊 2776 -煌 2777 -煎 2778 -煜 2779 -煞 2780 -煤 2781 -煦 2782 -照 2783 -煨 2784 -煮 2785 -煲 2786 -煳 2787 -煽 2788 -熄 2789 -熊 2790 -熏 2791 -熔 2792 -熙 2793 -熟 2794 -熠 2795 -熨 2796 -熬 2797 -熵 2798 -熹 2799 -燃 2800 -燊 2801 -燎 2802 -燕 2803 -燥 2804 -燮 2805 -爆 2806 -爪 2807 -爬 2808 -爱 2809 -爵 2810 -父 2811 -爷 2812 -爸 2813 -爹 2814 -爽 2815 -片 2816 -版 2817 -牌 2818 -牍 2819 -牒 2820 -牙 2821 -牛 2822 -牟 2823 -牠 2824 -牡 2825 -牢 2826 -牧 2827 -物 2828 -牲 2829 -牵 2830 -特 2831 -牺 2832 -牾 2833 -犀 2834 -犁 2835 -犄 2836 -犇 2837 -犊 2838 -犒 2839 -犟 2840 -犬 2841 -犯 2842 -状 2843 -犷 2844 -犸 2845 -犹 2846 -狂 2847 -狄 2848 -狈 2849 -狐 2850 -狒 2851 -狗 2852 -狙 2853 -狞 2854 -狠 2855 -狡 2856 -狩 2857 -独 2858 -狭 2859 -狮 2860 -狰 2861 -狱 2862 -狸 2863 -狼 2864 -猁 2865 -猎 2866 -猖 2867 -猛 2868 -猜 2869 -猝 2870 -猥 2871 -猩 2872 -猪 2873 -猫 2874 -猬 2875 -献 2876 -猴 2877 -猷 2878 -猹 2879 -猾 2880 -猿 2881 -獒 2882 -獗 2883 -獭 2884 -獾 2885 -玄 2886 -率 2887 -玉 2888 -王 2889 -玑 2890 -玖 2891 -玛 2892 -玟 2893 -玥 2894 -玩 2895 -玫 2896 -玮 2897 -环 2898 -现 2899 -玲 2900 -玳 2901 -玷 2902 -玹 2903 -玺 2904 -玻 2905 -珀 2906 -珂 2907 -珈 2908 -珉 2909 -珊 2910 -珍 2911 -珏 2912 -珑 2913 -珙 2914 -珞 2915 -珠 2916 -珥 2917 -班 2918 -珮 2919 -珲 2920 -珺 2921 -球 2922 -琅 2923 -理 2924 -琉 2925 -琊 2926 -琏 2927 -琐 2928 -琛 2929 -琢 2930 -琤 2931 -琥 2932 -琦 2933 -琨 2934 -琪 2935 -琬 2936 -琮 2937 -琰 2938 -琳 2939 -琴 2940 -琵 2941 -琶 2942 -琼 2943 -瑁 2944 -瑄 2945 -瑕 2946 -瑙 2947 -瑚 2948 -瑛 2949 -瑜 2950 -瑞 2951 -瑟 2952 -瑠 2953 -瑭 2954 -瑰 2955 -瑶 2956 -瑷 2957 -瑾 2958 -璀 2959 -璃 2960 -璇 2961 -璋 2962 -璐 2963 -璞 2964 -璟 2965 -璧 2966 -璨 2967 -瓜 2968 -瓢 2969 -瓣 2970 -瓦 2971 -瓮 2972 -瓯 2973 -瓶 2974 -瓷 2975 -甄 2976 -甘 2977 -甚 2978 -甜 2979 -生 2980 -甥 2981 -用 2982 -甩 2983 -甫 2984 -甬 
2985 -甭 2986 -田 2987 -由 2988 -甲 2989 -申 2990 -电 2991 -男 2992 -甸 2993 -町 2994 -画 2995 -畅 2996 -畈 2997 -畊 2998 -界 2999 -畏 3000 -畔 3001 -留 3002 -畜 3003 -略 3004 -番 3005 -畴 3006 -畸 3007 -畿 3008 -疃 3009 -疆 3010 -疏 3011 -疑 3012 -疖 3013 -疗 3014 -疙 3015 -疚 3016 -疝 3017 -疟 3018 -疡 3019 -疣 3020 -疤 3021 -疫 3022 -疮 3023 -疯 3024 -疱 3025 -疲 3026 -疴 3027 -疵 3028 -疸 3029 -疹 3030 -疼 3031 -疽 3032 -疾 3033 -病 3034 -症 3035 -痉 3036 -痊 3037 -痍 3038 -痒 3039 -痔 3040 -痕 3041 -痘 3042 -痛 3043 -痞 3044 -痢 3045 -痣 3046 -痧 3047 -痨 3048 -痪 3049 -痫 3050 -痰 3051 -痱 3052 -痴 3053 -痹 3054 -痼 3055 -瘀 3056 -瘁 3057 -瘙 3058 -瘟 3059 -瘠 3060 -瘢 3061 -瘤 3062 -瘦 3063 -瘩 3064 -瘪 3065 -瘫 3066 -瘳 3067 -瘴 3068 -瘸 3069 -瘾 3070 -癌 3071 -癖 3072 -癜 3073 -癞 3074 -癣 3075 -癫 3076 -登 3077 -白 3078 -百 3079 -皂 3080 -的 3081 -皆 3082 -皇 3083 -皋 3084 -皎 3085 -皑 3086 -皓 3087 -皖 3088 -皙 3089 -皮 3090 -皱 3091 -皿 3092 -盂 3093 -盅 3094 -盆 3095 -盈 3096 -益 3097 -盎 3098 -盏 3099 -盐 3100 -监 3101 -盒 3102 -盔 3103 -盖 3104 -盗 3105 -盘 3106 -盛 3107 -盟 3108 -目 3109 -盯 3110 -盱 3111 -盲 3112 -直 3113 -相 3114 -盹 3115 -盼 3116 -盾 3117 -省 3118 -眈 3119 -眉 3120 -看 3121 -眙 3122 -真 3123 -眠 3124 -眨 3125 -眩 3126 -眬 3127 -眯 3128 -眶 3129 -眷 3130 -眸 3131 -眺 3132 -眼 3133 -着 3134 -睁 3135 -睇 3136 -睐 3137 -睑 3138 -睛 3139 -睡 3140 -睢 3141 -督 3142 -睦 3143 -睫 3144 -睬 3145 -睹 3146 -睽 3147 -睾 3148 -睿 3149 -瞄 3150 -瞅 3151 -瞌 3152 -瞎 3153 -瞑 3154 -瞒 3155 -瞟 3156 -瞠 3157 -瞥 3158 -瞧 3159 -瞩 3160 -瞪 3161 -瞬 3162 -瞭 3163 -瞰 3164 -瞳 3165 -瞻 3166 -瞿 3167 -矍 3168 -矗 3169 -矛 3170 -矜 3171 -矢 3172 -矣 3173 -知 3174 -矩 3175 -矫 3176 -矬 3177 -短 3178 -矮 3179 -石 3180 -矶 3181 -矸 3182 -矽 3183 -矾 3184 -矿 3185 -砀 3186 -码 3187 -砂 3188 -砌 3189 -砍 3190 -砒 3191 -研 3192 -砖 3193 -砚 3194 -砝 3195 -砣 3196 -砥 3197 -砭 3198 -砰 3199 -破 3200 -砷 3201 -砸 3202 -砺 3203 -砼 3204 -砾 3205 -础 3206 -硅 3207 -硌 3208 -硒 3209 -硕 3210 -硖 3211 -硚 3212 -硝 3213 -硫 3214 -硬 3215 -确 3216 -硼 3217 -碉 3218 -碌 3219 -碍 3220 -碎 3221 -碑 3222 -碓 3223 -碗 3224 -碘 3225 -碚 3226 -碜 3227 -碟 3228 -碣 3229 -碧 3230 -碰 3231 -碱 3232 -碳 3233 -碴 3234 -碾 3235 -磁 3236 -磅 3237 -磊 3238 -磋 3239 -磐 3240 -磕 3241 -磨 3242 -磴 3243 -磷 3244 -磺 3245 -礁 3246 -示 3247 -礼 3248 -社 3249 -祀 3250 -祁 3251 -祈 3252 -祉 3253 -祎 3254 -祐 3255 -祖 3256 -祚 3257 -祛 3258 -祝 3259 -神 3260 -祟 3261 -祠 3262 -祢 3263 -祥 3264 -票 3265 -祭 3266 -祯 3267 -祷 3268 -祸 3269 -祺 3270 -禀 3271 -禁 3272 -禄 3273 -禅 3274 -福 3275 -禧 3276 -禹 3277 -禺 3278 -离 3279 -禽 3280 -禾 3281 -秀 3282 -私 3283 -秃 3284 -秆 3285 -秉 3286 -秋 3287 -种 3288 -科 3289 -秒 3290 -秘 3291 -租 3292 -秣 3293 -秤 3294 -秦 3295 -秧 3296 -秩 3297 -积 3298 -称 3299 -秸 3300 -移 3301 -秽 3302 -稀 3303 -程 3304 -稍 3305 -税 3306 -稔 3307 -稚 3308 -稞 3309 -稠 3310 -稣 3311 -稳 3312 -稷 3313 -稹 3314 -稻 3315 -稼 3316 -稽 3317 -稿 3318 -穆 3319 -穗 3320 -穴 3321 -究 3322 -穷 3323 -穹 3324 -空 3325 -穿 3326 -突 3327 -窃 3328 -窄 3329 -窈 3330 -窍 3331 -窑 3332 -窒 3333 -窕 3334 -窖 3335 -窗 3336 -窘 3337 -窜 3338 -窝 3339 -窟 3340 -窠 3341 -窥 3342 -窦 3343 -窨 3344 -窿 3345 -立 3346 -竖 3347 -站 3348 -竞 3349 -竟 3350 -章 3351 -竣 3352 -童 3353 -竭 3354 -端 3355 -竹 3356 -竺 3357 -竽 3358 -竿 3359 -笃 3360 -笆 3361 -笈 3362 -笋 3363 -笑 3364 -笔 3365 -笙 3366 -笛 3367 -笠 3368 -符 3369 -笨 3370 -第 3371 -笳 3372 -笸 3373 -笼 3374 -等 3375 -筋 3376 -筏 3377 -筐 3378 -筑 3379 -筒 3380 -答 3381 -策 3382 -筛 3383 -筝 3384 -筠 3385 -筱 3386 -筵 3387 -筷 3388 -筹 3389 -签 3390 -简 3391 -箍 3392 -箔 3393 -箕 3394 -算 3395 -管 3396 -箩 3397 -箫 3398 -箭 3399 -箱 3400 -箴 3401 -篁 3402 -篆 3403 -篇 3404 -篑 3405 -篓 3406 -篝 3407 -篡 3408 -篦 3409 -篪 3410 -篮 3411 -篱 3412 -篷 3413 -篼 3414 -簇 3415 -簋 3416 -簧 3417 -簪 3418 -簸 3419 -簿 3420 -籁 3421 -籍 3422 -米 3423 -类 3424 -籼 3425 -籽 3426 -粉 3427 -粑 3428 -粒 
3429 -粕 3430 -粗 3431 -粘 3432 -粟 3433 -粤 3434 -粥 3435 -粪 3436 -粮 3437 -粱 3438 -粲 3439 -粳 3440 -粹 3441 -粼 3442 -粽 3443 -精 3444 -糊 3445 -糕 3446 -糖 3447 -糗 3448 -糙 3449 -糟 3450 -糠 3451 -糯 3452 -系 3453 -紊 3454 -素 3455 -索 3456 -紧 3457 -紫 3458 -累 3459 -絮 3460 -綦 3461 -繁 3462 -纂 3463 -纠 3464 -纡 3465 -红 3466 -纣 3467 -纤 3468 -约 3469 -级 3470 -纨 3471 -纪 3472 -纫 3473 -纬 3474 -纭 3475 -纯 3476 -纰 3477 -纱 3478 -纲 3479 -纳 3480 -纵 3481 -纶 3482 -纷 3483 -纸 3484 -纹 3485 -纺 3486 -纽 3487 -纾 3488 -线 3489 -绀 3490 -练 3491 -组 3492 -绅 3493 -细 3494 -织 3495 -终 3496 -绉 3497 -绊 3498 -绋 3499 -绌 3500 -绍 3501 -绎 3502 -经 3503 -绑 3504 -绒 3505 -结 3506 -绔 3507 -绕 3508 -绘 3509 -给 3510 -绚 3511 -绛 3512 -络 3513 -绝 3514 -绞 3515 -统 3516 -绢 3517 -绣 3518 -绥 3519 -继 3520 -绩 3521 -绪 3522 -绫 3523 -续 3524 -绮 3525 -绯 3526 -绰 3527 -绳 3528 -维 3529 -绵 3530 -绷 3531 -绸 3532 -绻 3533 -综 3534 -绽 3535 -绿 3536 -缀 3537 -缄 3538 -缅 3539 -缆 3540 -缇 3541 -缉 3542 -缎 3543 -缓 3544 -缔 3545 -缕 3546 -编 3547 -缘 3548 -缙 3549 -缚 3550 -缛 3551 -缜 3552 -缝 3553 -缠 3554 -缢 3555 -缤 3556 -缨 3557 -缩 3558 -缪 3559 -缬 3560 -缭 3561 -缮 3562 -缰 3563 -缱 3564 -缴 3565 -缸 3566 -缺 3567 -罂 3568 -罄 3569 -罐 3570 -网 3571 -罔 3572 -罕 3573 -罗 3574 -罚 3575 -罡 3576 -罢 3577 -罩 3578 -罪 3579 -置 3580 -署 3581 -罹 3582 -羁 3583 -羊 3584 -羌 3585 -美 3586 -羔 3587 -羚 3588 -羞 3589 -羡 3590 -群 3591 -羧 3592 -羯 3593 -羲 3594 -羸 3595 -羹 3596 -羽 3597 -羿 3598 -翁 3599 -翅 3600 -翊 3601 -翌 3602 -翎 3603 -翔 3604 -翘 3605 -翟 3606 -翠 3607 -翡 3608 -翩 3609 -翰 3610 -翱 3611 -翻 3612 -翼 3613 -耀 3614 -老 3615 -考 3616 -耄 3617 -者 3618 -耆 3619 -耋 3620 -而 3621 -耍 3622 -耐 3623 -耒 3624 -耕 3625 -耗 3626 -耘 3627 -耙 3628 -耜 3629 -耪 3630 -耳 3631 -耶 3632 -耷 3633 -耸 3634 -耻 3635 -耽 3636 -耿 3637 -聂 3638 -聆 3639 -聊 3640 -聋 3641 -职 3642 -联 3643 -聘 3644 -聚 3645 -聪 3646 -肃 3647 -肆 3648 -肇 3649 -肉 3650 -肋 3651 -肌 3652 -肖 3653 -肘 3654 -肚 3655 -肛 3656 -肝 3657 -肠 3658 -股 3659 -肢 3660 -肤 3661 -肥 3662 -肩 3663 -肪 3664 -肮 3665 -肯 3666 -肱 3667 -育 3668 -肴 3669 -肺 3670 -肾 3671 -肿 3672 -胀 3673 -胁 3674 -胃 3675 -胆 3676 -背 3677 -胎 3678 -胖 3679 -胗 3680 -胚 3681 -胛 3682 -胜 3683 -胞 3684 -胡 3685 -胤 3686 -胥 3687 -胧 3688 -胫 3689 -胭 3690 -胯 3691 -胰 3692 -胱 3693 -胳 3694 -胶 3695 -胸 3696 -胺 3697 -能 3698 -脂 3699 -脆 3700 -脉 3701 -脊 3702 -脍 3703 -脏 3704 -脐 3705 -脑 3706 -脓 3707 -脖 3708 -脚 3709 -脯 3710 -脱 3711 -脸 3712 -脾 3713 -腆 3714 -腈 3715 -腊 3716 -腋 3717 -腌 3718 -腐 3719 -腑 3720 -腓 3721 -腔 3722 -腕 3723 -腥 3724 -腩 3725 -腭 3726 -腮 3727 -腰 3728 -腱 3729 -腴 3730 -腹 3731 -腺 3732 -腻 3733 -腼 3734 -腾 3735 -腿 3736 -膀 3737 -膈 3738 -膊 3739 -膏 3740 -膑 3741 -膛 3742 -膜 3743 -膝 3744 -膨 3745 -膳 3746 -膺 3747 -臀 3748 -臂 3749 -臃 3750 -臆 3751 -臊 3752 -臣 3753 -臧 3754 -自 3755 -臬 3756 -臭 3757 -至 3758 -致 3759 -臻 3760 -臼 3761 -舀 3762 -舅 3763 -舆 3764 -舌 3765 -舍 3766 -舐 3767 -舒 3768 -舔 3769 -舛 3770 -舜 3771 -舞 3772 -舟 3773 -航 3774 -舫 3775 -般 3776 -舰 3777 -舱 3778 -舵 3779 -舶 3780 -舷 3781 -舸 3782 -船 3783 -艇 3784 -艋 3785 -艘 3786 -艮 3787 -良 3788 -艰 3789 -色 3790 -艳 3791 -艺 3792 -艾 3793 -艿 3794 -节 3795 -芊 3796 -芋 3797 -芍 3798 -芒 3799 -芗 3800 -芙 3801 -芜 3802 -芝 3803 -芥 3804 -芦 3805 -芩 3806 -芪 3807 -芬 3808 -芭 3809 -芮 3810 -芯 3811 -花 3812 -芳 3813 -芷 3814 -芸 3815 -芹 3816 -芽 3817 -芾 3818 -苇 3819 -苋 3820 -苍 3821 -苏 3822 -苑 3823 -苓 3824 -苔 3825 -苗 3826 -苛 3827 -苞 3828 -苟 3829 -苡 3830 -苣 3831 -若 3832 -苦 3833 -苫 3834 -苯 3835 -英 3836 -苷 3837 -苹 3838 -茁 3839 -茂 3840 -范 3841 -茄 3842 -茅 3843 -茆 3844 -茉 3845 -茌 3846 -茎 3847 -茗 3848 -茛 3849 -茜 3850 -茧 3851 -茨 3852 -茫 3853 -茬 3854 -茯 3855 -茱 3856 -茳 3857 -茴 3858 -茵 3859 -茶 3860 -茸 3861 -茹 3862 -茼 3863 -荀 3864 -荃 3865 -荆 3866 -荇 3867 -草 3868 -荏 3869 -荐 3870 -荒 3871 -荔 3872 -荚 
3873 -荛 3874 -荞 3875 -荟 3876 -荠 3877 -荡 3878 -荣 3879 -荤 3880 -荧 3881 -荨 3882 -荫 3883 -药 3884 -荷 3885 -荸 3886 -荻 3887 -荼 3888 -莅 3889 -莆 3890 -莉 3891 -莎 3892 -莒 3893 -莓 3894 -莘 3895 -莜 3896 -莞 3897 -莠 3898 -莪 3899 -莫 3900 -莱 3901 -莲 3902 -莴 3903 -获 3904 -莹 3905 -莺 3906 -莽 3907 -菀 3908 -菁 3909 -菅 3910 -菇 3911 -菊 3912 -菌 3913 -菏 3914 -菖 3915 -菘 3916 -菜 3917 -菠 3918 -菡 3919 -菩 3920 -菱 3921 -菲 3922 -萃 3923 -萄 3924 -萋 3925 -萌 3926 -萍 3927 -萎 3928 -萝 3929 -萤 3930 -营 3931 -萦 3932 -萧 3933 -萨 3934 -萱 3935 -萸 3936 -落 3937 -葆 3938 -著 3939 -葚 3940 -葛 3941 -葡 3942 -董 3943 -葩 3944 -葫 3945 -葬 3946 -葱 3947 -葳 3948 -葵 3949 -葺 3950 -蒂 3951 -蒋 3952 -蒙 3953 -蒜 3954 -蒯 3955 -蒲 3956 -蒸 3957 -蒿 3958 -蓁 3959 -蓄 3960 -蓉 3961 -蓓 3962 -蓝 3963 -蓟 3964 -蓥 3965 -蓦 3966 -蓬 3967 -蓼 3968 -蔑 3969 -蔓 3970 -蔗 3971 -蔚 3972 -蔡 3973 -蔫 3974 -蔬 3975 -蔷 3976 -蔺 3977 -蔻 3978 -蔼 3979 -蔽 3980 -蕃 3981 -蕉 3982 -蕊 3983 -蕙 3984 -蕨 3985 -蕲 3986 -蕴 3987 -蕾 3988 -薄 3989 -薇 3990 -薏 3991 -薛 3992 -薪 3993 -薯 3994 -薰 3995 -薷 3996 -藁 3997 -藉 3998 -藏 3999 -藐 4000 -藓 4001 -藕 4002 -藜 4003 -藠 4004 -藤 4005 -藩 4006 -藻 4007 -藿 4008 -蘑 4009 -蘸 4010 -虎 4011 -虏 4012 -虐 4013 -虑 4014 -虔 4015 -虚 4016 -虞 4017 -虫 4018 -虱 4019 -虹 4020 -虻 4021 -虽 4022 -虾 4023 -蚀 4024 -蚁 4025 -蚂 4026 -蚊 4027 -蚌 4028 -蚓 4029 -蚕 4030 -蚝 4031 -蚣 4032 -蚤 4033 -蚪 4034 -蚬 4035 -蚯 4036 -蚱 4037 -蚴 4038 -蛀 4039 -蛆 4040 -蛇 4041 -蛉 4042 -蛊 4043 -蛋 4044 -蛎 4045 -蛐 4046 -蛔 4047 -蛙 4048 -蛛 4049 -蛟 4050 -蛤 4051 -蛮 4052 -蛰 4053 -蛳 4054 -蛹 4055 -蛾 4056 -蜀 4057 -蜂 4058 -蜃 4059 -蜇 4060 -蜈 4061 -蜊 4062 -蜍 4063 -蜒 4064 -蜓 4065 -蜕 4066 -蜗 4067 -蜘 4068 -蜚 4069 -蜜 4070 -蜡 4071 -蜢 4072 -蜥 4073 -蜱 4074 -蜴 4075 -蜷 4076 -蜻 4077 -蜿 4078 -蝇 4079 -蝈 4080 -蝉 4081 -蝌 4082 -蝎 4083 -蝗 4084 -蝙 4085 -蝠 4086 -蝮 4087 -蝴 4088 -蝶 4089 -蝽 4090 -螂 4091 -螃 4092 -螈 4093 -融 4094 -螨 4095 -螳 4096 -螺 4097 -蟀 4098 -蟆 4099 -蟊 4100 -蟋 4101 -蟑 4102 -蟒 4103 -蟠 4104 -蟹 4105 -蟾 4106 -蠊 4107 -蠕 4108 -蠡 4109 -蠢 4110 -血 4111 -衅 4112 -行 4113 -衍 4114 -衔 4115 -街 4116 -衙 4117 -衡 4118 -衢 4119 -衣 4120 -补 4121 -表 4122 -衩 4123 -衫 4124 -衬 4125 -衮 4126 -衰 4127 -衲 4128 -衷 4129 -袁 4130 -袂 4131 -袄 4132 -袅 4133 -袈 4134 -袋 4135 -袍 4136 -袒 4137 -袖 4138 -袜 4139 -被 4140 -袭 4141 -袱 4142 -裁 4143 -裂 4144 -装 4145 -裆 4146 -裔 4147 -裕 4148 -裘 4149 -裙 4150 -裟 4151 -裤 4152 -裨 4153 -裱 4154 -裳 4155 -裴 4156 -裸 4157 -裹 4158 -褂 4159 -褐 4160 -褒 4161 -褓 4162 -褔 4163 -褚 4164 -褛 4165 -褥 4166 -褪 4167 -褴 4168 -褶 4169 -襁 4170 -襄 4171 -襟 4172 -西 4173 -要 4174 -覃 4175 -覆 4176 -见 4177 -观 4178 -规 4179 -觅 4180 -视 4181 -览 4182 -觉 4183 -觊 4184 -觎 4185 -觐 4186 -觑 4187 -角 4188 -觞 4189 -解 4190 -觥 4191 -触 4192 -言 4193 -訾 4194 -詹 4195 -誉 4196 -誓 4197 -警 4198 -譬 4199 -计 4200 -订 4201 -讣 4202 -认 4203 -讥 4204 -讧 4205 -讨 4206 -让 4207 -讪 4208 -训 4209 -议 4210 -讯 4211 -记 4212 -讲 4213 -讳 4214 -讴 4215 -讶 4216 -讷 4217 -许 4218 -讹 4219 -论 4220 -讼 4221 -讽 4222 -设 4223 -访 4224 -诀 4225 -证 4226 -诃 4227 -评 4228 -诅 4229 -识 4230 -诈 4231 -诉 4232 -诊 4233 -诋 4234 -词 4235 -诏 4236 -译 4237 -诓 4238 -试 4239 -诗 4240 -诘 4241 -诙 4242 -诚 4243 -诛 4244 -话 4245 -诞 4246 -诟 4247 -诠 4248 -诡 4249 -询 4250 -诣 4251 -诤 4252 -该 4253 -详 4254 -诧 4255 -诩 4256 -诫 4257 -诬 4258 -语 4259 -误 4260 -诱 4261 -诲 4262 -说 4263 -诵 4264 -诶 4265 -请 4266 -诸 4267 -诹 4268 -诺 4269 -读 4270 -诽 4271 -课 4272 -诿 4273 -谀 4274 -谁 4275 -调 4276 -谅 4277 -谆 4278 -谈 4279 -谊 4280 -谋 4281 -谌 4282 -谍 4283 -谎 4284 -谏 4285 -谐 4286 -谑 4287 -谓 4288 -谕 4289 -谖 4290 -谘 4291 -谙 4292 -谚 4293 -谛 4294 -谜 4295 -谟 4296 -谢 4297 -谣 4298 -谤 4299 -谦 4300 -谧 4301 -谨 4302 -谩 4303 -谬 4304 -谭 4305 -谮 4306 -谯 4307 -谱 4308 -谴 4309 -谶 4310 -谷 4311 -豁 4312 -豆 4313 -豇 4314 -豉 4315 -豌 4316 -豚 
4317 -象 4318 -豢 4319 -豪 4320 -豫 4321 -豹 4322 -豺 4323 -貂 4324 -貅 4325 -貉 4326 -貌 4327 -貔 4328 -贝 4329 -贞 4330 -负 4331 -贡 4332 -财 4333 -责 4334 -贤 4335 -败 4336 -账 4337 -货 4338 -质 4339 -贩 4340 -贪 4341 -贫 4342 -贬 4343 -购 4344 -贮 4345 -贯 4346 -贰 4347 -贱 4348 -贲 4349 -贴 4350 -贵 4351 -贷 4352 -贸 4353 -费 4354 -贺 4355 -贻 4356 -贼 4357 -贾 4358 -贿 4359 -赁 4360 -赂 4361 -赃 4362 -资 4363 -赅 4364 -赈 4365 -赉 4366 -赊 4367 -赋 4368 -赌 4369 -赎 4370 -赏 4371 -赐 4372 -赓 4373 -赔 4374 -赖 4375 -赘 4376 -赚 4377 -赛 4378 -赝 4379 -赞 4380 -赠 4381 -赡 4382 -赢 4383 -赣 4384 -赤 4385 -赦 4386 -赫 4387 -走 4388 -赳 4389 -赴 4390 -赵 4391 -赶 4392 -起 4393 -趁 4394 -超 4395 -越 4396 -趋 4397 -趟 4398 -趣 4399 -足 4400 -趴 4401 -趵 4402 -趸 4403 -趺 4404 -趾 4405 -跃 4406 -跄 4407 -跆 4408 -跋 4409 -跌 4410 -跎 4411 -跑 4412 -跚 4413 -跛 4414 -距 4415 -跟 4416 -跤 4417 -跨 4418 -跪 4419 -跬 4420 -路 4421 -跳 4422 -践 4423 -跶 4424 -跷 4425 -跹 4426 -跺 4427 -跻 4428 -踉 4429 -踊 4430 -踌 4431 -踏 4432 -踝 4433 -踞 4434 -踢 4435 -踩 4436 -踪 4437 -踮 4438 -踯 4439 -踱 4440 -踵 4441 -踹 4442 -踺 4443 -蹁 4444 -蹂 4445 -蹄 4446 -蹈 4447 -蹉 4448 -蹊 4449 -蹋 4450 -蹒 4451 -蹚 4452 -蹦 4453 -蹩 4454 -蹬 4455 -蹭 4456 -蹲 4457 -蹴 4458 -蹶 4459 -蹼 4460 -蹿 4461 -躁 4462 -躅 4463 -躇 4464 -躏 4465 -身 4466 -躬 4467 -躯 4468 -躲 4469 -躺 4470 -车 4471 -轧 4472 -轨 4473 -轩 4474 -轫 4475 -转 4476 -轮 4477 -软 4478 -轰 4479 -轱 4480 -轲 4481 -轳 4482 -轴 4483 -轶 4484 -轸 4485 -轻 4486 -轼 4487 -载 4488 -轿 4489 -较 4490 -辄 4491 -辅 4492 -辆 4493 -辈 4494 -辉 4495 -辊 4496 -辍 4497 -辐 4498 -辑 4499 -输 4500 -辕 4501 -辖 4502 -辗 4503 -辘 4504 -辙 4505 -辛 4506 -辜 4507 -辞 4508 -辟 4509 -辣 4510 -辨 4511 -辩 4512 -辫 4513 -辰 4514 -辱 4515 -边 4516 -辽 4517 -达 4518 -迁 4519 -迂 4520 -迄 4521 -迅 4522 -过 4523 -迈 4524 -迎 4525 -运 4526 -近 4527 -返 4528 -还 4529 -这 4530 -进 4531 -远 4532 -违 4533 -连 4534 -迟 4535 -迢 4536 -迥 4537 -迦 4538 -迩 4539 -迪 4540 -迫 4541 -迭 4542 -述 4543 -迷 4544 -迸 4545 -迹 4546 -追 4547 -退 4548 -送 4549 -适 4550 -逃 4551 -逅 4552 -逆 4553 -选 4554 -逊 4555 -逋 4556 -逍 4557 -透 4558 -逐 4559 -逑 4560 -递 4561 -途 4562 -逗 4563 -通 4564 -逛 4565 -逝 4566 -逞 4567 -速 4568 -造 4569 -逡 4570 -逢 4571 -逮 4572 -逯 4573 -逵 4574 -逸 4575 -逻 4576 -逼 4577 -逾 4578 -遁 4579 -遂 4580 -遇 4581 -遍 4582 -遏 4583 -遐 4584 -遑 4585 -道 4586 -遗 4587 -遛 4588 -遢 4589 -遣 4590 -遥 4591 -遨 4592 -遭 4593 -遮 4594 -遴 4595 -遵 4596 -避 4597 -邀 4598 -邂 4599 -邃 4600 -邋 4601 -邑 4602 -邓 4603 -邕 4604 -邙 4605 -邛 4606 -邝 4607 -邡 4608 -邢 4609 -那 4610 -邦 4611 -邪 4612 -邬 4613 -邮 4614 -邯 4615 -邰 4616 -邱 4617 -邳 4618 -邵 4619 -邸 4620 -邹 4621 -邺 4622 -邻 4623 -郁 4624 -郅 4625 -郇 4626 -郊 4627 -郎 4628 -郑 4629 -郓 4630 -郜 4631 -郝 4632 -郡 4633 -郧 4634 -部 4635 -郫 4636 -郭 4637 -郯 4638 -郴 4639 -郸 4640 -都 4641 -鄂 4642 -鄙 4643 -鄞 4644 -鄢 4645 -鄱 4646 -酉 4647 -酊 4648 -酋 4649 -酌 4650 -配 4651 -酐 4652 -酒 4653 -酗 4654 -酚 4655 -酝 4656 -酞 4657 -酣 4658 -酥 4659 -酩 4660 -酪 4661 -酬 4662 -酮 4663 -酯 4664 -酰 4665 -酱 4666 -酵 4667 -酶 4668 -酷 4669 -酸 4670 -酿 4671 -醇 4672 -醉 4673 -醋 4674 -醍 4675 -醐 4676 -醒 4677 -醛 4678 -醺 4679 -采 4680 -釉 4681 -释 4682 -里 4683 -重 4684 -野 4685 -量 4686 -金 4687 -釜 4688 -鉴 4689 -銮 4690 -鏖 4691 -鑫 4692 -钇 4693 -针 4694 -钉 4695 -钊 4696 -钎 4697 -钏 4698 -钐 4699 -钒 4700 -钓 4701 -钗 4702 -钙 4703 -钛 4704 -钜 4705 -钝 4706 -钞 4707 -钟 4708 -钠 4709 -钢 4710 -钣 4711 -钥 4712 -钦 4713 -钧 4714 -钨 4715 -钩 4716 -钮 4717 -钯 4718 -钰 4719 -钱 4720 -钲 4721 -钳 4722 -钴 4723 -钵 4724 -钻 4725 -钼 4726 -钾 4727 -钿 4728 -铀 4729 -铁 4730 -铂 4731 -铃 4732 -铄 4733 -铅 4734 -铆 4735 -铉 4736 -铋 4737 -铍 4738 -铎 4739 -铐 4740 -铑 4741 -铖 4742 -铛 4743 -铜 4744 -铝 4745 -铟 4746 -铠 4747 -铡 4748 -铣 4749 -铤 4750 -铧 4751 -铨 4752 -铩 4753 -铬 4754 -铭 4755 -铮 4756 -铰 4757 -铲 4758 -银 4759 -铷 4760 -铸 
4761 -铺 4762 -链 4763 -铿 4764 -销 4765 -锁 4766 -锂 4767 -锄 4768 -锅 4769 -锆 4770 -锈 4771 -锉 4772 -锋 4773 -锌 4774 -锏 4775 -锐 4776 -锑 4777 -锒 4778 -错 4779 -锚 4780 -锟 4781 -锡 4782 -锢 4783 -锣 4784 -锤 4785 -锥 4786 -锦 4787 -锨 4788 -锭 4789 -键 4790 -锯 4791 -锰 4792 -锲 4793 -锴 4794 -锵 4795 -锷 4796 -锹 4797 -锻 4798 -镀 4799 -镁 4800 -镂 4801 -镇 4802 -镉 4803 -镊 4804 -镌 4805 -镍 4806 -镏 4807 -镐 4808 -镑 4809 -镔 4810 -镕 4811 -镖 4812 -镜 4813 -镣 4814 -镭 4815 -镯 4816 -镰 4817 -镳 4818 -镶 4819 -长 4820 -门 4821 -闩 4822 -闪 4823 -闫 4824 -闭 4825 -问 4826 -闯 4827 -闰 4828 -闲 4829 -闳 4830 -间 4831 -闵 4832 -闷 4833 -闸 4834 -闹 4835 -闺 4836 -闻 4837 -闽 4838 -闾 4839 -阀 4840 -阁 4841 -阂 4842 -阄 4843 -阅 4844 -阆 4845 -阉 4846 -阎 4847 -阐 4848 -阑 4849 -阔 4850 -阕 4851 -阖 4852 -阙 4853 -阚 4854 -阜 4855 -队 4856 -阡 4857 -阪 4858 -阮 4859 -阱 4860 -防 4861 -阳 4862 -阴 4863 -阵 4864 -阶 4865 -阻 4866 -阿 4867 -陀 4868 -陂 4869 -附 4870 -际 4871 -陆 4872 -陇 4873 -陈 4874 -陉 4875 -陋 4876 -陌 4877 -降 4878 -限 4879 -陕 4880 -陛 4881 -陡 4882 -院 4883 -除 4884 -陨 4885 -险 4886 -陪 4887 -陬 4888 -陵 4889 -陶 4890 -陷 4891 -隅 4892 -隆 4893 -隋 4894 -隍 4895 -随 4896 -隐 4897 -隔 4898 -隗 4899 -隘 4900 -隙 4901 -障 4902 -隧 4903 -隶 4904 -隼 4905 -隽 4906 -难 4907 -雀 4908 -雁 4909 -雄 4910 -雅 4911 -集 4912 -雇 4913 -雉 4914 -雌 4915 -雍 4916 -雏 4917 -雒 4918 -雕 4919 -雨 4920 -雪 4921 -雯 4922 -雳 4923 -零 4924 -雷 4925 -雹 4926 -雾 4927 -需 4928 -霁 4929 -霄 4930 -霆 4931 -震 4932 -霈 4933 -霉 4934 -霍 4935 -霎 4936 -霏 4937 -霓 4938 -霖 4939 -霜 4940 -霞 4941 -霪 4942 -露 4943 -霸 4944 -霹 4945 -霾 4946 -靑 4947 -青 4948 -靓 4949 -靖 4950 -静 4951 -靛 4952 -非 4953 -靠 4954 -靡 4955 -面 4956 -革 4957 -靳 4958 -靴 4959 -靶 4960 -鞅 4961 -鞋 4962 -鞍 4963 -鞑 4964 -鞘 4965 -鞠 4966 -鞭 4967 -韦 4968 -韧 4969 -韩 4970 -韫 4971 -韬 4972 -韭 4973 -音 4974 -韵 4975 -韶 4976 -页 4977 -顶 4978 -顷 4979 -项 4980 -顺 4981 -须 4982 -顽 4983 -顾 4984 -顿 4985 -颀 4986 -颁 4987 -颂 4988 -预 4989 -颅 4990 -领 4991 -颇 4992 -颈 4993 -颊 4994 -颌 4995 -颍 4996 -颐 4997 -频 4998 -颓 4999 -颖 5000 -颗 5001 -题 5002 -颚 5003 -颜 5004 -额 5005 -颠 5006 -颢 5007 -颤 5008 -颦 5009 -颧 5010 -风 5011 -飒 5012 -飓 5013 -飘 5014 -飙 5015 -飚 5016 -飞 5017 -食 5018 -飧 5019 -餍 5020 -餐 5021 -餮 5022 -饕 5023 -饥 5024 -饨 5025 -饪 5026 -饭 5027 -饮 5028 -饯 5029 -饰 5030 -饱 5031 -饲 5032 -饴 5033 -饵 5034 -饶 5035 -饷 5036 -饺 5037 -饼 5038 -饽 5039 -饿 5040 -馀 5041 -馁 5042 -馄 5043 -馅 5044 -馆 5045 -馈 5046 -馊 5047 -馋 5048 -馍 5049 -馏 5050 -馑 5051 -馒 5052 -馕 5053 -首 5054 -馗 5055 -香 5056 -馥 5057 -馨 5058 -马 5059 -驭 5060 -驮 5061 -驯 5062 -驰 5063 -驱 5064 -驳 5065 -驴 5066 -驶 5067 -驷 5068 -驸 5069 -驹 5070 -驻 5071 -驼 5072 -驾 5073 -驿 5074 -骁 5075 -骂 5076 -骄 5077 -骅 5078 -骆 5079 -骇 5080 -骈 5081 -骊 5082 -骋 5083 -验 5084 -骏 5085 -骐 5086 -骑 5087 -骓 5088 -骗 5089 -骚 5090 -骛 5091 -骜 5092 -骝 5093 -骞 5094 -骠 5095 -骡 5096 -骤 5097 -骥 5098 -骨 5099 -骰 5100 -骷 5101 -骸 5102 -骺 5103 -骼 5104 -髂 5105 -髅 5106 -髋 5107 -髌 5108 -髓 5109 -高 5110 -髦 5111 -髯 5112 -鬃 5113 -鬓 5114 -鬟 5115 -鬼 5116 -魁 5117 -魂 5118 -魄 5119 -魅 5120 -魇 5121 -魉 5122 -魍 5123 -魏 5124 -魔 5125 -魟 5126 -鱼 5127 -鱿 5128 -鲁 5129 -鲅 5130 -鲈 5131 -鲍 5132 -鲑 5133 -鲜 5134 -鲟 5135 -鲠 5136 -鲢 5137 -鲤 5138 -鲨 5139 -鲫 5140 -鲭 5141 -鲳 5142 -鲶 5143 -鲷 5144 -鲸 5145 -鲼 5146 -鳃 5147 -鳄 5148 -鳅 5149 -鳌 5150 -鳍 5151 -鳕 5152 -鳖 5153 -鳗 5154 -鳝 5155 -鳞 5156 -鳟 5157 -鸟 5158 -鸠 5159 -鸡 5160 -鸢 5161 -鸣 5162 -鸥 5163 -鸦 5164 -鸩 5165 -鸪 5166 -鸫 5167 -鸭 5168 -鸯 5169 -鸳 5170 -鸵 5171 -鸽 5172 -鸾 5173 -鸿 5174 -鹁 5175 -鹂 5176 -鹃 5177 -鹅 5178 -鹉 5179 -鹊 5180 -鹌 5181 -鹏 5182 -鹑 5183 -鹜 5184 -鹞 5185 -鹤 5186 -鹦 5187 -鹧 5188 -鹫 5189 -鹭 5190 -鹰 5191 -鹳 5192 -鹿 5193 -麂 5194 -麋 5195 -麒 5196 -麓 5197 -麝 5198 -麟 5199 -麦 5200 -麸 5201 -麻 5202 -麾 5203 -黄 5204 -黍 
5205 -黎 5206 -黏 5207 -黑 5208 -黔 5209 -默 5210 -黛 5211 -黝 5212 -黟 5213 -黯 5214 -鼎 5215 -鼓 5216 -鼠 5217 -鼬 5218 -鼹 5219 -鼻 5220 -鼾 5221 -齐 5222 -齿 5223 -龃 5224 -龄 5225 -龅 5226 -龈 5227 -龉 5228 -龊 5229 -龌 5230 -龙 5231 -龚 5232 -龟 5233 - 5234 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/test/resources/librispeech.train_960_unigram5000.bpemodel b/models/audio/speech_recognition/conformer/igie/wenet/test/resources/librispeech.train_960_unigram5000.bpemodel deleted file mode 100644 index 3d24c47cf1a19b69928d186fdb93ab31e964ca75..0000000000000000000000000000000000000000 Binary files a/models/audio/speech_recognition/conformer/igie/wenet/test/resources/librispeech.train_960_unigram5000.bpemodel and /dev/null differ diff --git a/models/audio/speech_recognition/conformer/igie/wenet/test/resources/librispeech.words.txt b/models/audio/speech_recognition/conformer/igie/wenet/test/resources/librispeech.words.txt deleted file mode 100644 index 23a5adcbe4b3d883596e1675a7efbc16afacf4f1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/test/resources/librispeech.words.txt +++ /dev/null @@ -1,5002 +0,0 @@ - 0 - 1 -' 2 -▁ 3 -A 4 -▁A 5 -▁AB 6 -▁ABANDON 7 -ABETH 8 -ABILITY 9 -ABLE 10 -▁ABLE 11 -ABLY 12 -▁ABODE 13 -ABOUT 14 -▁ABOUT 15 -▁ABOVE 16 -▁ABRAHAM 17 -▁ABROAD 18 -▁ABSENCE 19 -▁ABSENT 20 -▁ABSOLUTE 21 -▁ABSOLUTELY 22 -▁ABSORB 23 -▁ABSTRACT 24 -▁ABSURD 25 -▁ABUNDANCE 26 -▁ABUNDANT 27 -▁ABUSE 28 -AC 29 -▁ACCENT 30 -▁ACCEPT 31 -▁ACCEPTED 32 -▁ACCESS 33 -▁ACCIDENT 34 -▁ACCOMPANIED 35 -▁ACCOMPANY 36 -▁ACCOMPLISH 37 -▁ACCORD 38 -▁ACCORDING 39 -▁ACCORDINGLY 40 -▁ACCOUNT 41 -▁ACCUMULAT 42 -▁ACCURATE 43 -▁ACCUSE 44 -▁ACCUSTOMED 45 -ACH 46 -▁ACHIEVE 47 -ACIOUS 48 -▁ACKNOWLEDG 49 -▁ACQUAINTANCE 50 -▁ACQUAINTED 51 -▁ACQUIRED 52 -▁ACROSS 53 -▁ACT 54 -▁ACTION 55 -▁ACTIVE 56 -▁ACTIVITY 57 -▁ACTUAL 58 -▁ACTUALLY 59 -AD 60 -▁ADAM 61 -▁ADAPT 62 -▁ADD 63 -▁ADDED 64 -▁ADDITION 65 -▁ADDRESS 66 -▁ADDRESSED 67 -ADE 68 -▁ADHERE 69 -▁ADJUST 70 -▁ADMINISTER 71 -▁ADMINISTRATION 72 -▁ADMIRABLE 73 -▁ADMIRAL 74 -▁ADMIRATION 75 -▁ADMIRE 76 -▁ADMIRING 77 -▁ADMIT 78 -▁ADMITTED 79 -▁ADOPT 80 -▁ADORN 81 -▁ADVANCE 82 -▁ADVANCED 83 -▁ADVANCING 84 -▁ADVANTAGE 85 -▁ADVENTURE 86 -▁ADVERTISE 87 -▁ADVICE 88 -▁ADVISE 89 -▁AFFAIR 90 -▁AFFAIRS 91 -▁AFFECT 92 -▁AFFECTED 93 -▁AFFECTION 94 -▁AFFECTIONATE 95 -▁AFFIRM 96 -▁AFFLICT 97 -▁AFFORD 98 -▁AFRAID 99 -▁AFRICA 100 -▁AFTER 101 -▁AFTERNOON 102 -▁AFTERWARD 103 -▁AFTERWARDS 104 -AG 105 -▁AGAIN 106 -▁AGAINST 107 -AGE 108 -▁AGE 109 -▁AGENT 110 -▁AGITATED 111 -▁AGITATION 112 -▁AGO 113 -▁AGONY 114 -▁AGREE 115 -▁AGREEABLE 116 -▁AGREED 117 -AH 118 -▁AH 119 -▁AHEAD 120 -▁AID 121 -▁AIM 122 -▁AIR 123 -AK 124 -AL 125 -▁AL 126 -▁ALADDIN 127 -▁ALARM 128 -▁ALAS 129 -▁ALBERT 130 -▁ALEXANDER 131 -▁ALICE 132 -▁ALIVE 133 -▁ALL 134 -▁ALLOW 135 -▁ALLOWED 136 -ALLY 137 -▁ALMOST 138 -▁ALONE 139 -▁ALONG 140 -▁ALOUD 141 -▁ALREADY 142 -▁ALSO 143 -▁ALTAR 144 -▁ALTER 145 -▁ALTHOUGH 146 -▁ALTOGETHER 147 -▁ALWAYS 148 -▁ALYOSHA 149 -AM 150 -▁AM 151 -▁AMA 152 -▁AMBASSADOR 153 -▁AMBITION 154 -▁AMBITIOUS 155 -▁AMELIA 156 -▁AMERICA 157 -▁AMERICAN 158 -▁AMIABLE 159 -▁AMID 160 -▁AMONG 161 -▁AMOUNT 162 -▁AMUSEMENT 163 -AN 164 -▁AN 165 -ANCE 166 -▁ANCHOR 167 -▁ANCIENT 168 -AND 169 -▁AND 170 -▁ANDREW 171 -ANG 172 -▁ANGEL 173 -▁ANGER 174 -▁ANGLE 175 -▁ANGRILY 176 -▁ANGRY 177 -▁ANGUISH 178 -▁ANIMAL 179 -▁ANIMALS 180 -▁ANNA 181 -▁ANNE 182 -▁ANNOUNC 183 -▁ANNOUNCED 184 -▁ANOTHER 185 -ANS 186 -▁ANSWER 187 -▁ANSWERED 188 -ANT 189 -▁ANTHEA 190 -▁ANTI 191 -▁ANTICIPATE 192 
-▁ANXIETY 193 -▁ANXIOUS 194 -▁ANY 195 -▁ANYBODY 196 -▁ANYHOW 197 -▁ANYONE 198 -▁ANYTHING 199 -▁ANYWHERE 200 -AP 201 -▁APART 202 -▁APARTMENT 203 -▁APOLOGI 204 -▁APPARATUS 205 -▁APPARENT 206 -▁APPARENTLY 207 -▁APPEAL 208 -▁APPEAR 209 -▁APPEARANCE 210 -▁APPEARED 211 -▁APPETITE 212 -▁APPLE 213 -▁APPLICATION 214 -▁APPLIED 215 -▁APPLY 216 -▁APPOINTED 217 -▁APPOINTMENT 218 -▁APPRECIATE 219 -▁APPREHEND 220 -▁APPREHENSION 221 -▁APPROACH 222 -▁APPROACHED 223 -▁APPROACHING 224 -▁APPROPRIATE 225 -▁APPROVE 226 -▁APRIL 227 -▁APRON 228 -▁APT 229 -AR 230 -▁AR 231 -▁ARAB 232 -▁ARAMIS 233 -▁ARCH 234 -▁ARCHITECT 235 -ARD 236 -▁ARDENT 237 -▁ARE 238 -▁ARGUE 239 -▁ARGUMENT 240 -ARIES 241 -▁ARISE 242 -▁ARISTOCRAT 243 -▁ARM 244 -▁ARMS 245 -▁ARMY 246 -▁AROSE 247 -▁AROUND 248 -▁ARRANGED 249 -▁ARRANGEMENT 250 -▁ARRAY 251 -▁ARREST 252 -▁ARRIVAL 253 -▁ARRIVE 254 -▁ARRIVED 255 -▁ARRIVING 256 -▁ARROW 257 -▁ART 258 -ARTAGNAN 259 -▁ARTHUR 260 -▁ARTICLE 261 -▁ARTIFICIAL 262 -▁ARTIST 263 -ARY 264 -AS 265 -▁AS 266 -▁ASCEND 267 -▁ASCERTAIN 268 -▁ASHAMED 269 -▁ASHES 270 -▁ASHORE 271 -▁ASIDE 272 -▁ASK 273 -▁ASKED 274 -▁ASKING 275 -▁ASLEEP 276 -▁ASPECT 277 -▁ASSASSIN 278 -▁ASSAULT 279 -▁ASSEMBLED 280 -▁ASSEMBLY 281 -▁ASSERT 282 -▁ASSIST 283 -▁ASSISTANCE 284 -▁ASSISTANT 285 -▁ASSOCIATE 286 -▁ASSOCIATION 287 -▁ASSUME 288 -▁ASSUMED 289 -▁ASSURANCE 290 -▁ASSURE 291 -▁ASSURED 292 -▁ASTONISHED 293 -▁ASTONISHMENT 294 -AT 295 -▁AT 296 -ATE 297 -ATED 298 -ATH 299 -ATING 300 -ATION 301 -ATIONS 302 -ATIVE 303 -▁ATLANTIC 304 -▁ATMOSPHERE 305 -ATOR 306 -ATORY 307 -▁ATTACHED 308 -▁ATTACHMENT 309 -▁ATTACK 310 -▁ATTAIN 311 -▁ATTEMPT 312 -▁ATTEND 313 -▁ATTENDANT 314 -▁ATTENTION 315 -▁ATTENTIVE 316 -▁ATTITUDE 317 -▁ATTORNEY 318 -▁ATTRACT 319 -▁ATTRIBUTE 320 -AU 321 -▁AUDIENCE 322 -▁AUGUST 323 -▁AUNT 324 -▁AUTHOR 325 -▁AUTHORITIES 326 -▁AUTHORITY 327 -▁AUTUMN 328 -AV 329 -▁AVAIL 330 -▁AVENUE 331 -▁AVERAGE 332 -▁AVOID 333 -AW 334 -▁AWAIT 335 -▁AWAKE 336 -▁AWAKENED 337 -▁AWARE 338 -▁AWAY 339 -▁AWFUL 340 -▁AWHILE 341 -▁AWKWARD 342 -▁AWOKE 343 -▁AXE 344 -AY 345 -B 346 -▁B 347 -BA 348 -▁BA 349 -▁BABY 350 -▁BACHELOR 351 -▁BACK 352 -▁BACKGROUND 353 -▁BACKWARD 354 -▁BAD 355 -▁BADE 356 -▁BAG 357 -▁BAKE 358 -▁BAL 359 -▁BALANCE 360 -▁BALL 361 -▁BALLOON 362 -▁BAN 363 -▁BAND 364 -▁BANK 365 -▁BAPTI 366 -▁BAR 367 -▁BARBAR 368 -▁BARE 369 -▁BARGAIN 370 -▁BARK 371 -▁BARON 372 -▁BARREL 373 -▁BARRICADE 374 -▁BARRIER 375 -▁BASE 376 -▁BASIN 377 -▁BASKET 378 -▁BATH 379 -▁BATTER 380 -▁BATTLE 381 -▁BAY 382 -BBE 383 -BBLE 384 -BE 385 -▁BE 386 -▁BEAR 387 -▁BEARD 388 -▁BEARING 389 -▁BEAST 390 -▁BEAT 391 -▁BEATEN 392 -▁BEAUTIFUL 393 -▁BEAUTY 394 -▁BECAME 395 -▁BECAUSE 396 -▁BECOME 397 -▁BECOMING 398 -▁BED 399 -▁BEDROOM 400 -▁BEEN 401 -▁BEFORE 402 -▁BEG 403 -▁BEGAN 404 -▁BEGGAR 405 -▁BEGGED 406 -▁BEGIN 407 -▁BEGINNING 408 -▁BEGUN 409 -▁BEHALF 410 -▁BEHAVE 411 -▁BEHAVIOUR 412 -▁BEHELD 413 -▁BEHIND 414 -▁BEHOLD 415 -▁BEING 416 -BEL 417 -▁BELIEF 418 -▁BELIEVE 419 -▁BELIEVED 420 -▁BELIEVING 421 -▁BELL 422 -▁BELONG 423 -▁BELOVED 424 -▁BELOW 425 -▁BENCH 426 -▁BENDING 427 -▁BENEATH 428 -▁BENEFIT 429 -▁BENT 430 -BER 431 -▁BERNARD 432 -▁BESIDE 433 -▁BESIDES 434 -▁BEST 435 -▁BESTOW 436 -▁BETRAY 437 -▁BETTER 438 -▁BETWEEN 439 -▁BEWILDERED 440 -▁BEYOND 441 -BI 442 -▁BI 443 -▁BIBLE 444 -▁BID 445 -▁BIG 446 -▁BILL 447 -▁BILLY 448 -▁BIND 449 -▁BIRD 450 -▁BIRDS 451 -▁BIRTH 452 -▁BISHOP 453 -▁BIT 454 -▁BITTER 455 -▁BLA 456 -▁BLACK 457 -▁BLADE 458 -▁BLAME 459 -▁BLANK 460 -▁BLANKET 461 -BLE 462 -▁BLESS 463 -▁BLEW 464 -▁BLIND 465 -▁BLISS 466 -▁BLOCK 467 -▁BLOOD 468 -▁BLOOM 469 -▁BLOSSOM 470 -▁BLOW 
471 -▁BLU 472 -▁BLUE 473 -▁BLUSH 474 -BO 475 -▁BO 476 -BOARD 477 -▁BOARD 478 -▁BOAST 479 -▁BOAT 480 -▁BOB 481 -▁BODIES 482 -▁BODY 483 -▁BOIL 484 -▁BOLD 485 -▁BOLT 486 -▁BON 487 -▁BOND 488 -▁BONNET 489 -▁BOOK 490 -▁BOOKS 491 -▁BOOT 492 -▁BOOTS 493 -▁BORDER 494 -▁BORE 495 -▁BORN 496 -▁BORNE 497 -BOROUGH 498 -▁BORROW 499 -▁BOSOM 500 -▁BOSTON 501 -▁BOTH 502 -▁BOTTLE 503 -▁BOTTOM 504 -▁BOUGHT 505 -▁BOUND 506 -▁BOW 507 -▁BOWED 508 -▁BOWL 509 -▁BOX 510 -▁BOY 511 -▁BOYS 512 -BRA 513 -▁BRA 514 -▁BRAIN 515 -▁BRANCH 516 -▁BRANCHES 517 -▁BRAND 518 -▁BRAVE 519 -▁BREAD 520 -▁BREAK 521 -▁BREAKFAST 522 -▁BREAKING 523 -▁BREAST 524 -▁BREATH 525 -▁BREE 526 -▁BRETHREN 527 -▁BRETON 528 -▁BRI 529 -▁BRICK 530 -▁BRIDE 531 -▁BRIDGE 532 -▁BRIDLE 533 -▁BRIEF 534 -▁BRIG 535 -▁BRIGHT 536 -▁BRILLIANT 537 -▁BRING 538 -▁BRINGING 539 -▁BRISK 540 -▁BRITAIN 541 -▁BRITISH 542 -▁BRO 543 -▁BROAD 544 -▁BROKE 545 -▁BROKEN 546 -▁BROOD 547 -▁BROOK 548 -▁BROTHER 549 -▁BROTHERS 550 -▁BROUGHT 551 -▁BROW 552 -▁BROWN 553 -▁BRUCE 554 -▁BRUSH 555 -▁BRUTAL 556 -▁BRUTE 557 -BU 558 -▁BU 559 -▁BUCK 560 -▁BUILD 561 -▁BUILDING 562 -▁BUILT 563 -▁BULK 564 -▁BULL 565 -▁BULLET 566 -▁BUNCH 567 -▁BUNDLE 568 -▁BUR 569 -▁BURDEN 570 -BURG 571 -▁BURIED 572 -BURN 573 -▁BURN 574 -▁BURNING 575 -▁BURST 576 -BURY 577 -▁BUSH 578 -▁BUSHES 579 -▁BUSINESS 580 -▁BUSY 581 -▁BUT 582 -▁BUTTER 583 -▁BUTTERFLY 584 -▁BUY 585 -BY 586 -▁BY 587 -C 588 -▁C 589 -CA 590 -▁CA 591 -▁CAB 592 -▁CABIN 593 -▁CAESAR 594 -▁CAKE 595 -▁CAL 596 -▁CALAMIT 597 -▁CALCULATED 598 -▁CALIFORNIA 599 -▁CALL 600 -▁CALLED 601 -▁CALLING 602 -▁CALM 603 -▁CAME 604 -▁CAMP 605 -▁CAMPAIGN 606 -▁CAN 607 -▁CANDID 608 -▁CANDLE 609 -▁CANNON 610 -▁CANNOT 611 -▁CANOE 612 -▁CANVAS 613 -▁CAP 614 -▁CAPABLE 615 -▁CAPACITY 616 -▁CAPITAL 617 -▁CAPTAIN 618 -▁CAPTURE 619 -CAR 620 -▁CAR 621 -▁CARD 622 -▁CARDINAL 623 -▁CARE 624 -▁CAREFUL 625 -▁CAREFULLY 626 -▁CARELESS 627 -▁CARLYLE 628 -▁CARPENTER 629 -▁CARPET 630 -▁CARR 631 -▁CARRIAGE 632 -▁CARRIED 633 -▁CARRY 634 -▁CARRYING 635 -▁CART 636 -▁CARVED 637 -▁CASE 638 -CAST 639 -▁CAST 640 -▁CASTLE 641 -▁CASUAL 642 -▁CAT 643 -▁CATCH 644 -▁CATHEDRAL 645 -▁CATHERINE 646 -▁CATHOLIC 647 -▁CATTLE 648 -▁CAUGHT 649 -▁CAUSE 650 -▁CAUSED 651 -▁CAUTION 652 -▁CAVALRY 653 -▁CAVE 654 -CE 655 -▁CE 656 -▁CEASE 657 -▁CEASED 658 -▁CEILING 659 -▁CELEBRAT 660 -▁CELL 661 -▁CELLAR 662 -CENT 663 -▁CENT 664 -▁CENTER 665 -▁CENTRAL 666 -▁CENTRE 667 -▁CENTURIES 668 -▁CENTURY 669 -▁CEREMONY 670 -▁CERTAIN 671 -▁CERTAINLY 672 -▁CETERA 673 -CH 674 -▁CH 675 -CHA 676 -▁CHA 677 -▁CHAIN 678 -▁CHAIR 679 -▁CHALLENGE 680 -▁CHAMBER 681 -▁CHAMPION 682 -▁CHANCE 683 -▁CHANCELLOR 684 -▁CHANGE 685 -▁CHANGED 686 -▁CHANGING 687 -▁CHANNEL 688 -▁CHAP 689 -▁CHAPTER 690 -▁CHAR 691 -▁CHARACTER 692 -▁CHARACTERISTIC 693 -▁CHARGE 694 -▁CHARIOT 695 -▁CHARLES 696 -▁CHARLOTTE 697 -▁CHARM 698 -▁CHARMING 699 -▁CHASE 700 -▁CHATEAU 701 -▁CHATTER 702 -▁CHAUVELIN 703 -CHE 704 -▁CHE 705 -▁CHEAP 706 -▁CHECK 707 -CHED 708 -▁CHEEK 709 -▁CHEEKS 710 -▁CHEER 711 -▁CHEERFUL 712 -▁CHEESE 713 -▁CHERISH 714 -▁CHEST 715 -CHI 716 -▁CHI 717 -▁CHICAGO 718 -▁CHICKEN 719 -▁CHIEF 720 -▁CHILD 721 -▁CHILDHOOD 722 -▁CHILDREN 723 -▁CHILL 724 -▁CHIMNEY 725 -▁CHIN 726 -▁CHINA 727 -▁CHINESE 728 -CHO 729 -▁CHOICE 730 -▁CHOOSE 731 -▁CHOP 732 -▁CHORUS 733 -▁CHOSE 734 -▁CHOSEN 735 -▁CHRIS 736 -▁CHRIST 737 -▁CHRISTIAN 738 -▁CHRISTMAS 739 -▁CHU 740 -▁CHUCK 741 -▁CHURCH 742 -CI 743 -▁CIGAR 744 -▁CIRCLE 745 -▁CIRCULAR 746 -▁CIRCULAT 747 -▁CIRCUMSTANCE 748 -▁CIRCUMSTANCES 749 -▁CITI 750 -▁CITIES 751 -▁CITY 752 -▁CIVIL 753 -▁CIVILI 754 -CK 755 -▁CL 756 -▁CLAIM 757 
-▁CLAIR 758 -▁CLAPP 759 -▁CLARA 760 -▁CLASP 761 -▁CLASS 762 -▁CLASSES 763 -▁CLAW 764 -▁CLAY 765 -▁CLEAN 766 -▁CLEAR 767 -▁CLEARLY 768 -▁CLERGY 769 -▁CLERK 770 -▁CLEVER 771 -▁CLIFF 772 -▁CLIMATE 773 -▁CLIMB 774 -▁CLO 775 -▁CLOAK 776 -CLOCK 777 -▁CLOCK 778 -CLOSE 779 -▁CLOSE 780 -▁CLOSED 781 -▁CLOSELY 782 -▁CLOTH 783 -▁CLOTHES 784 -▁CLOUD 785 -▁CLOUDS 786 -▁CLUB 787 -▁CLUSTER 788 -▁CLUTCH 789 -CO 790 -▁CO 791 -▁COACH 792 -▁COAL 793 -▁COARSE 794 -▁COAST 795 -▁COAT 796 -▁COCK 797 -▁COFFEE 798 -▁COFFIN 799 -▁COIN 800 -▁COL 801 -▁COLD 802 -▁COLLAR 803 -▁COLLECT 804 -▁COLLEGE 805 -▁COLONEL 806 -▁COLONI 807 -▁COLONY 808 -▁COLOR 809 -▁COLOUR 810 -▁COLUMN 811 -▁COM 812 -COMB 813 -▁COMB 814 -▁COMBAT 815 -▁COMBINATION 816 -▁COMBINED 817 -▁COME 818 -▁COMES 819 -▁COMFORT 820 -▁COMFORTABLE 821 -▁COMING 822 -▁COMMAND 823 -▁COMMENCED 824 -▁COMMEND 825 -▁COMMENT 826 -▁COMMERCE 827 -▁COMMERCIAL 828 -▁COMMISSION 829 -▁COMMIT 830 -▁COMMITTED 831 -▁COMMITTEE 832 -▁COMMON 833 -▁COMMUN 834 -▁COMMUNICAT 835 -▁COMMUNICATION 836 -▁COMMUNITY 837 -▁COMP 838 -▁COMPANION 839 -▁COMPANIONS 840 -▁COMPANY 841 -▁COMPARATIVELY 842 -▁COMPARE 843 -▁COMPARISON 844 -▁COMPASS 845 -▁COMPELLED 846 -▁COMPLAIN 847 -▁COMPLETE 848 -▁COMPLETELY 849 -▁COMPLEX 850 -▁COMPLIMENT 851 -▁COMPOSED 852 -▁COMPOSITION 853 -▁COMPREHEND 854 -▁COMRADE 855 -CON 856 -▁CON 857 -▁CONCEAL 858 -▁CONCEIVE 859 -▁CONCENTRAT 860 -▁CONCEPTION 861 -▁CONCERN 862 -▁CONCERNED 863 -▁CONCERNING 864 -▁CONCERT 865 -▁CONCLUD 866 -▁CONCLUDED 867 -▁CONCLUSION 868 -▁CONDEMN 869 -▁CONDITION 870 -▁CONDITIONS 871 -▁CONDUCT 872 -▁CONF 873 -▁CONFESS 874 -▁CONFIDE 875 -▁CONFIDENCE 876 -▁CONFIDENT 877 -▁CONFINED 878 -▁CONFIRM 879 -▁CONFLICT 880 -▁CONFOUND 881 -▁CONFRONT 882 -▁CONFUSED 883 -▁CONFUSION 884 -▁CONGRESS 885 -▁CONJECTURE 886 -▁CONNECTED 887 -▁CONNECTION 888 -▁CONQUER 889 -▁CONQUEST 890 -▁CONSCIENCE 891 -▁CONSCIOUS 892 -▁CONSCIOUSNESS 893 -▁CONSEIL 894 -▁CONSENT 895 -▁CONSEQUENCE 896 -▁CONSEQUENTLY 897 -▁CONSIDER 898 -▁CONSIDERABLE 899 -▁CONSIDERATION 900 -▁CONSIDERED 901 -▁CONSIST 902 -▁CONSOLATION 903 -▁CONSPICUOUS 904 -▁CONSTANCE 905 -▁CONSTANT 906 -▁CONSTANTLY 907 -▁CONSTITUTE 908 -▁CONSTITUTION 909 -▁CONSTRUCT 910 -▁CONSULT 911 -▁CONSUM 912 -▁CONTACT 913 -▁CONTAIN 914 -▁CONTEMPLATE 915 -▁CONTEMPT 916 -▁CONTEND 917 -▁CONTENT 918 -▁CONTEST 919 -▁CONTINENT 920 -▁CONTINUAL 921 -▁CONTINUALLY 922 -▁CONTINUE 923 -▁CONTINUED 924 -▁CONTRACT 925 -▁CONTRADICT 926 -▁CONTRARY 927 -▁CONTRAST 928 -▁CONTRIBUTE 929 -▁CONTROL 930 -▁CONVENIENT 931 -▁CONVENT 932 -▁CONVENTION 933 -▁CONVERSATION 934 -▁CONVERSE 935 -▁CONVERT 936 -▁CONVEY 937 -▁CONVICT 938 -▁CONVICTION 939 -▁CONVINCE 940 -▁CONVINCED 941 -▁CONVULS 942 -▁COOK 943 -▁COOL 944 -▁COPIE 945 -▁COPPER 946 -▁COPY 947 -▁COR 948 -▁CORDIAL 949 -▁CORN 950 -▁CORNER 951 -▁CORPORAL 952 -▁CORPSE 953 -▁CORRECT 954 -▁CORRESPOND 955 -▁CORRIDOR 956 -▁CORRUPT 957 -▁COSETTE 958 -▁COST 959 -▁COSTUME 960 -▁COTTAGE 961 -▁COTTON 962 -▁COUCH 963 -▁COULD 964 -▁COULDN 965 -▁COUNCIL 966 -▁COUNSEL 967 -▁COUNT 968 -▁COUNTENANCE 969 -▁COUNTER 970 -▁COUNTESS 971 -▁COUNTRIES 972 -▁COUNTRY 973 -▁COUPLE 974 -▁COURAGE 975 -▁COURSE 976 -▁COURT 977 -▁COUSIN 978 -▁COVER 979 -▁COVERED 980 -▁COW 981 -▁COWARD 982 -▁CRA 983 -▁CRACK 984 -▁CRAFT 985 -▁CRAWL 986 -▁CRE 987 -▁CREAM 988 -▁CREATED 989 -▁CREATURE 990 -▁CREATURES 991 -▁CREDIT 992 -▁CREEK 993 -▁CREEP 994 -▁CREP 995 -▁CREW 996 -▁CRIED 997 -▁CRIES 998 -▁CRIME 999 -▁CRIMINAL 1000 -▁CRIMSON 1001 -▁CRISTO 1002 -▁CRITIC 1003 -▁CRO 1004 -▁CROSS 1005 -▁CROSSED 1006 -▁CROW 1007 -▁CROWD 1008 -▁CROWN 1009 -▁CRU 1010 
-▁CRUEL 1011 -▁CRUMBS 1012 -▁CRUSH 1013 -▁CRY 1014 -▁CRYING 1015 -▁CRYSTAL 1016 -CTOR 1017 -CU 1018 -▁CU 1019 -▁CULTIVATE 1020 -▁CULTURE 1021 -CUM 1022 -▁CUNNING 1023 -▁CUP 1024 -▁CUR 1025 -▁CURIOSITY 1026 -▁CURIOUS 1027 -▁CURL 1028 -▁CURRENT 1029 -▁CURSE 1030 -▁CURTAIN 1031 -▁CUSHION 1032 -▁CUSTOM 1033 -▁CUT 1034 -▁CUTTING 1035 -CY 1036 -▁CYRIL 1037 -D 1038 -▁D 1039 -DA 1040 -▁DA 1041 -▁DAGGER 1042 -▁DAILY 1043 -▁DAMAGE 1044 -▁DAMN 1045 -▁DAMP 1046 -▁DAMSEL 1047 -▁DAN 1048 -▁DANCE 1049 -▁DANCING 1050 -▁DANGER 1051 -▁DANGEROUS 1052 -▁DANGLARS 1053 -▁DANIEL 1054 -▁DAR 1055 -▁DARE 1056 -▁DARED 1057 -▁DARK 1058 -▁DARKNESS 1059 -▁DARLING 1060 -▁DASH 1061 -▁DATE 1062 -▁DAUGHTER 1063 -▁DAVID 1064 -▁DAWN 1065 -▁DAY 1066 -▁DAYS 1067 -DDING 1068 -DDLE 1069 -DE 1070 -▁DE 1071 -▁DEAD 1072 -▁DEAF 1073 -▁DEAL 1074 -▁DEAR 1075 -▁DEAREST 1076 -▁DEATH 1077 -▁DEBATE 1078 -▁DEBT 1079 -▁DECAY 1080 -▁DECEIVE 1081 -▁DECEMBER 1082 -▁DECIDE 1083 -▁DECIDED 1084 -▁DECISION 1085 -▁DECK 1086 -▁DECLARE 1087 -▁DECLARED 1088 -▁DECLINE 1089 -▁DECORAT 1090 -▁DECREE 1091 -▁DEEP 1092 -▁DEEPLY 1093 -▁DEFEAT 1094 -▁DEFECT 1095 -▁DEFENCE 1096 -▁DEFEND 1097 -▁DEFENSE 1098 -▁DEFI 1099 -▁DEFINITE 1100 -▁DEGREE 1101 -▁DELAY 1102 -▁DELIBERATE 1103 -▁DELICACY 1104 -▁DELICATE 1105 -▁DELICIOUS 1106 -▁DELIGHT 1107 -▁DELIGHTED 1108 -▁DELIGHTFUL 1109 -▁DELIVER 1110 -▁DEMAND 1111 -▁DEMANDED 1112 -▁DEMOCRATIC 1113 -▁DEMON 1114 -DEN 1115 -▁DEN 1116 -▁DENIED 1117 -▁DENY 1118 -▁DEPART 1119 -▁DEPARTMENT 1120 -▁DEPARTURE 1121 -▁DEPEND 1122 -▁DEPOSIT 1123 -▁DEPRESS 1124 -▁DEPRIVED 1125 -▁DEPTH 1126 -DER 1127 -▁DERIVED 1128 -▁DESCEND 1129 -▁DESCENDED 1130 -▁DESCENT 1131 -▁DESCRIBE 1132 -▁DESCRIBED 1133 -▁DESCRIPTION 1134 -▁DESERT 1135 -▁DESERVE 1136 -▁DESIGN 1137 -▁DESIRABLE 1138 -▁DESIRE 1139 -▁DESIRED 1140 -▁DESIROUS 1141 -▁DESK 1142 -▁DESOLATE 1143 -▁DESPAIR 1144 -▁DESPATCH 1145 -▁DESPERATE 1146 -▁DESPISE 1147 -▁DESPITE 1148 -▁DESTINED 1149 -▁DESTINY 1150 -▁DESTROY 1151 -▁DESTROYED 1152 -▁DESTRUCTION 1153 -▁DETAIL 1154 -▁DETAIN 1155 -▁DETECT 1156 -▁DETECTIVE 1157 -▁DETERMIN 1158 -▁DETERMINATION 1159 -▁DETERMINED 1160 -▁DEVELOP 1161 -▁DEVELOPMENT 1162 -▁DEVICE 1163 -▁DEVIL 1164 -▁DEVOTED 1165 -▁DEVOTION 1166 -▁DEVOUR 1167 -▁DEXTER 1168 -▁DI 1169 -▁DIAMOND 1170 -▁DIANA 1171 -▁DICK 1172 -▁DID 1173 -▁DIDN 1174 -▁DIE 1175 -▁DIED 1176 -▁DIFFER 1177 -▁DIFFERENCE 1178 -▁DIFFERENT 1179 -▁DIFFICULT 1180 -▁DIFFICULTIES 1181 -▁DIFFICULTY 1182 -▁DIG 1183 -▁DIGNIFIED 1184 -▁DIGNITY 1185 -▁DIM 1186 -▁DIMINISH 1187 -▁DIN 1188 -▁DINNER 1189 -▁DIRECT 1190 -▁DIRECTED 1191 -▁DIRECTION 1192 -▁DIRECTLY 1193 -▁DIRTY 1194 -▁DIS 1195 -▁DISAGREEABLE 1196 -▁DISAPPEAR 1197 -▁DISAPPEARED 1198 -▁DISAPPOINT 1199 -▁DISAPPOINTMENT 1200 -▁DISC 1201 -▁DISCERN 1202 -▁DISCHARGE 1203 -▁DISCIPLE 1204 -▁DISCIPLINE 1205 -▁DISCOURAGE 1206 -▁DISCOURSE 1207 -▁DISCOVER 1208 -▁DISCOVERED 1209 -▁DISCOVERY 1210 -▁DISCUSS 1211 -▁DISCUSSION 1212 -▁DISDAIN 1213 -▁DISEASE 1214 -▁DISGRACE 1215 -▁DISGUISE 1216 -▁DISGUST 1217 -▁DISH 1218 -▁DISLIKE 1219 -▁DISMAL 1220 -▁DISMAY 1221 -▁DISMISS 1222 -▁DISORDER 1223 -▁DISPLAY 1224 -▁DISPOSED 1225 -▁DISPOSITION 1226 -▁DISPUTE 1227 -▁DISSOLV 1228 -▁DISTANCE 1229 -▁DISTANT 1230 -▁DISTINCT 1231 -▁DISTINCTION 1232 -▁DISTINGUISH 1233 -▁DISTINGUISHED 1234 -▁DISTRACT 1235 -▁DISTRESS 1236 -▁DISTRIBUT 1237 -▁DISTRICT 1238 -▁DISTRUST 1239 -▁DISTURB 1240 -▁DIV 1241 -▁DIVERS 1242 -▁DIVIDE 1243 -▁DIVIDED 1244 -▁DIVINE 1245 -▁DIVISION 1246 -▁DIXON 1247 -DO 1248 -▁DO 1249 -▁DOCTOR 1250 -▁DOCTRINE 1251 -▁DOCUMENT 1252 -▁DOES 1253 -▁DOESN 1254 -▁DOG 1255 -▁DOGS 1256 
-▁DOING 1257 -▁DOLLARS 1258 -DOLPH 1259 -▁DOMESTIC 1260 -▁DOMINION 1261 -▁DON 1262 -▁DONE 1263 -▁DONKEY 1264 -▁DOOR 1265 -▁DOORS 1266 -▁DOORWAY 1267 -▁DOROTHY 1268 -▁DOUBLE 1269 -▁DOUBT 1270 -▁DOUBTFUL 1271 -▁DOUBTLESS 1272 -▁DOWN 1273 -▁DOWNSTAIRS 1274 -▁DRAG 1275 -▁DRAGG 1276 -▁DRAGON 1277 -▁DRAIN 1278 -▁DRAKE 1279 -▁DRAMA 1280 -▁DRANK 1281 -▁DRAP 1282 -▁DRAUGHT 1283 -▁DRAW 1284 -▁DRAWING 1285 -▁DRAWN 1286 -▁DREAD 1287 -▁DREADFUL 1288 -▁DREAM 1289 -▁DREARY 1290 -▁DRESS 1291 -▁DRESSED 1292 -▁DREW 1293 -▁DRI 1294 -▁DRIFT 1295 -▁DRINK 1296 -▁DRIVE 1297 -▁DRIVEN 1298 -▁DRIVER 1299 -▁DRIVING 1300 -▁DROOP 1301 -▁DROP 1302 -▁DROPPED 1303 -▁DROPPING 1304 -▁DROVE 1305 -▁DROWN 1306 -▁DRUG 1307 -▁DRUM 1308 -▁DRUNK 1309 -▁DRY 1310 -▁DU 1311 -▁DUCHESS 1312 -▁DUCK 1313 -▁DUE 1314 -▁DUKE 1315 -▁DULL 1316 -▁DUMB 1317 -▁DUN 1318 -▁DUNBAR 1319 -▁DUR 1320 -▁DUSK 1321 -▁DUST 1322 -▁DUTCH 1323 -▁DUTIES 1324 -▁DUTY 1325 -▁DWARF 1326 -▁DWELL 1327 -▁DWELT 1328 -DY 1329 -▁DYING 1330 -E 1331 -▁E 1332 -EA 1333 -▁EACH 1334 -▁EAGER 1335 -▁EAGERLY 1336 -▁EAGLE 1337 -▁EAR 1338 -▁EARL 1339 -▁EARLIER 1340 -▁EARLIEST 1341 -▁EARLY 1342 -▁EARN 1343 -▁EARNEST 1344 -▁EARS 1345 -▁EARTH 1346 -▁EASE 1347 -▁EASIER 1348 -▁EASILY 1349 -▁EAST 1350 -▁EASTERN 1351 -▁EASY 1352 -▁EAT 1353 -▁EATEN 1354 -▁EATING 1355 -▁ECHO 1356 -ED 1357 -▁EDGE 1358 -▁EDITH 1359 -▁EDITOR 1360 -▁EDUCAT 1361 -▁EDUCATION 1362 -▁EDWARD 1363 -EF 1364 -▁EFFECT 1365 -▁EFFORT 1366 -▁EGGS 1367 -▁EGYPT 1368 -▁EGYPTIAN 1369 -▁EIGHT 1370 -▁EIGHTEEN 1371 -▁EIGHTY 1372 -▁EITHER 1373 -EL 1374 -▁EL 1375 -▁ELABORATE 1376 -▁ELBOW 1377 -▁ELDER 1378 -▁ELDEST 1379 -▁ELEANOR 1380 -▁ELECT 1381 -▁ELECTRIC 1382 -▁ELEGANT 1383 -▁ELEMENT 1384 -▁ELEPHANT 1385 -▁ELEVEN 1386 -▁ELI 1387 -ELLA 1388 -▁ELSE 1389 -▁ELSEWHERE 1390 -▁ELSIE 1391 -EM 1392 -▁EM 1393 -▁EMBARK 1394 -▁EMBARRASS 1395 -▁EMBRACE 1396 -▁EMBROIDER 1397 -EMENT 1398 -▁EMERG 1399 -▁EMILY 1400 -▁EMINENT 1401 -▁EMOTION 1402 -▁EMPEROR 1403 -▁EMPHASI 1404 -▁EMPIRE 1405 -▁EMPLOY 1406 -▁EMPLOYED 1407 -▁EMPTY 1408 -EN 1409 -▁EN 1410 -▁ENABLE 1411 -ENCE 1412 -▁ENCHANT 1413 -ENCIES 1414 -▁ENCLOS 1415 -▁ENCOUNTER 1416 -▁ENCOURAGE 1417 -▁END 1418 -▁ENDEAVOR 1419 -▁ENDEAVOUR 1420 -▁ENDURE 1421 -ENED 1422 -▁ENEMIES 1423 -▁ENEMY 1424 -▁ENERGETIC 1425 -▁ENERGY 1426 -▁ENGAGE 1427 -▁ENGAGED 1428 -▁ENGAGEMENT 1429 -▁ENGINE 1430 -▁ENGLAND 1431 -▁ENGLISH 1432 -▁ENJOY 1433 -▁ENJOYMENT 1434 -▁ENLIGHTEN 1435 -▁ENORMOUS 1436 -▁ENOUGH 1437 -ENS 1438 -▁ENSU 1439 -ENT 1440 -▁ENTER 1441 -▁ENTERED 1442 -▁ENTERPRISE 1443 -▁ENTERTAIN 1444 -▁ENTHUSIASM 1445 -▁ENTIRE 1446 -▁ENTIRELY 1447 -▁ENTITLED 1448 -▁ENTRANCE 1449 -▁ENTREAT 1450 -▁ENVELOPE 1451 -▁ENVY 1452 -▁EPI 1453 -▁EQUAL 1454 -▁EQUALLY 1455 -ER 1456 -▁ER 1457 -▁ERE 1458 -▁ERECT 1459 -▁ERRAND 1460 -▁ERROR 1461 -ERS 1462 -ES 1463 -▁ESCAPE 1464 -▁ESCAPED 1465 -▁ESCORT 1466 -▁ESPECIALLY 1467 -▁ESSENCE 1468 -▁ESSENTIAL 1469 -EST 1470 -▁ESTABLISH 1471 -▁ESTABLISHED 1472 -▁ESTABLISHMENT 1473 -▁ESTATE 1474 -▁ESTEEM 1475 -▁ESTIMATE 1476 -▁ESTRALLA 1477 -ET 1478 -▁ETERNAL 1479 -▁ETERNITY 1480 -ETH 1481 -ETT 1482 -ETTE 1483 -▁EUROPE 1484 -▁EUSTACE 1485 -EV 1486 -▁EVA 1487 -▁EVEN 1488 -▁EVENING 1489 -▁EVENTS 1490 -EVER 1491 -▁EVER 1492 -▁EVERY 1493 -▁EVERYBODY 1494 -▁EVERYONE 1495 -▁EVERYTHING 1496 -▁EVERYWHERE 1497 -▁EVIDENCE 1498 -▁EVIDENT 1499 -▁EVIDENTLY 1500 -▁EVIL 1501 -EX 1502 -▁EX 1503 -▁EXACT 1504 -▁EXACTLY 1505 -▁EXAMINATION 1506 -▁EXAMINE 1507 -▁EXAMINED 1508 -▁EXAMINING 1509 -▁EXAMPLE 1510 -▁EXCEED 1511 -▁EXCEEDINGLY 1512 -▁EXCELLENCY 1513 -▁EXCELLENT 1514 -▁EXCEPT 1515 -▁EXCEPTION 1516 
-▁EXCESS 1517 -▁EXCHANGE 1518 -▁EXCITE 1519 -▁EXCITED 1520 -▁EXCITEMENT 1521 -▁EXCITING 1522 -▁EXCLAIMED 1523 -▁EXCLAMATION 1524 -▁EXCLUSIVE 1525 -▁EXCURSION 1526 -▁EXCUSE 1527 -▁EXECUT 1528 -▁EXECUTION 1529 -▁EXERCISE 1530 -▁EXHAUST 1531 -▁EXHIBIT 1532 -▁EXIST 1533 -▁EXISTENCE 1534 -▁EXPAND 1535 -▁EXPECT 1536 -▁EXPECTATION 1537 -▁EXPECTED 1538 -▁EXPEDITION 1539 -▁EXPENSE 1540 -▁EXPERIENCE 1541 -▁EXPERIMENT 1542 -▁EXPLAIN 1543 -▁EXPLAINED 1544 -▁EXPLANATION 1545 -▁EXPLORE 1546 -▁EXPOSED 1547 -▁EXPRESS 1548 -▁EXPRESSED 1549 -▁EXPRESSION 1550 -▁EXQUISITE 1551 -▁EXTEND 1552 -▁EXTENDED 1553 -▁EXTENSIVE 1554 -▁EXTENT 1555 -▁EXTERNAL 1556 -▁EXTRA 1557 -▁EXTRACT 1558 -▁EXTRAORDINARY 1559 -▁EXTREME 1560 -▁EXTREMELY 1561 -▁EXTREMITY 1562 -EY 1563 -▁EYE 1564 -▁EYEBROWS 1565 -▁EYES 1566 -F 1567 -▁F 1568 -FA 1569 -▁FA 1570 -▁FACE 1571 -▁FACES 1572 -▁FACILIT 1573 -▁FACING 1574 -▁FACT 1575 -▁FACULTIES 1576 -▁FACULTY 1577 -▁FADED 1578 -▁FAIL 1579 -▁FAILED 1580 -▁FAILURE 1581 -▁FAINT 1582 -▁FAIR 1583 -▁FAIRLY 1584 -▁FAIRY 1585 -▁FAITH 1586 -▁FAITHFUL 1587 -FALL 1588 -▁FALL 1589 -▁FALLEN 1590 -▁FALLING 1591 -▁FALSE 1592 -▁FAME 1593 -▁FAMILIAR 1594 -▁FAMILIES 1595 -▁FAMILY 1596 -▁FAMOUS 1597 -▁FAN 1598 -▁FANCIED 1599 -▁FANCIES 1600 -▁FANCY 1601 -▁FANNY 1602 -▁FANTASTIC 1603 -▁FAR 1604 -▁FAREWELL 1605 -▁FARM 1606 -▁FARMER 1607 -▁FARTHER 1608 -▁FASHION 1609 -▁FAST 1610 -▁FASTENED 1611 -▁FAT 1612 -▁FATAL 1613 -▁FATE 1614 -▁FATHER 1615 -▁FATIGUE 1616 -▁FAULT 1617 -▁FAVOR 1618 -▁FAVORITE 1619 -▁FAVOUR 1620 -▁FAVOURITE 1621 -FE 1622 -▁FE 1623 -▁FEAR 1624 -▁FEARFUL 1625 -▁FEAST 1626 -▁FEATHER 1627 -▁FEATURE 1628 -▁FEATURES 1629 -▁FEBRUARY 1630 -▁FEDERAL 1631 -▁FEEBLE 1632 -▁FEED 1633 -▁FEEL 1634 -▁FEELING 1635 -▁FEELINGS 1636 -▁FEET 1637 -▁FELICITY 1638 -▁FELL 1639 -▁FELLOW 1640 -▁FELT 1641 -▁FEMALE 1642 -▁FEMININE 1643 -▁FENCE 1644 -FER 1645 -▁FER 1646 -▁FERTIL 1647 -▁FETCH 1648 -▁FEVER 1649 -▁FEW 1650 -FF 1651 -FI 1652 -▁FI 1653 -FIELD 1654 -▁FIELD 1655 -▁FIELDS 1656 -▁FIERCE 1657 -▁FIFTEEN 1658 -▁FIFTH 1659 -▁FIFTY 1660 -▁FIGHT 1661 -▁FIGHTING 1662 -▁FIGURE 1663 -▁FILL 1664 -▁FILLED 1665 -▁FILM 1666 -▁FIN 1667 -▁FINAL 1668 -▁FINALLY 1669 -▁FIND 1670 -▁FINDING 1671 -▁FINE 1672 -▁FINGER 1673 -▁FINGERS 1674 -▁FINISH 1675 -▁FINISHED 1676 -▁FIRE 1677 -▁FIRM 1678 -▁FIRMLY 1679 -▁FIRST 1680 -▁FISH 1681 -▁FISHERMAN 1682 -▁FIT 1683 -▁FITTED 1684 -▁FIVE 1685 -▁FIX 1686 -▁FIXED 1687 -▁FL 1688 -▁FLAG 1689 -▁FLAME 1690 -▁FLANK 1691 -▁FLASH 1692 -▁FLAT 1693 -▁FLATTER 1694 -▁FLED 1695 -▁FLEE 1696 -▁FLEET 1697 -▁FLESH 1698 -▁FLEW 1699 -▁FLICKER 1700 -▁FLIGHT 1701 -▁FLO 1702 -▁FLOCK 1703 -▁FLOOD 1704 -▁FLOOR 1705 -▁FLORENCE 1706 -▁FLOUR 1707 -▁FLOURISH 1708 -▁FLOW 1709 -▁FLOWER 1710 -▁FLOWERS 1711 -▁FLU 1712 -▁FLUTTER 1713 -▁FLY 1714 -▁FLYING 1715 -▁FO 1716 -▁FOG 1717 -FOLD 1718 -▁FOLD 1719 -FOLK 1720 -▁FOLK 1721 -▁FOLLOW 1722 -▁FOLLOWED 1723 -▁FOLLOWING 1724 -▁FOLLY 1725 -▁FOND 1726 -▁FOOD 1727 -▁FOOL 1728 -▁FOOLISH 1729 -FOOT 1730 -▁FOOT 1731 -▁FOOTSTEPS 1732 -FOR 1733 -▁FOR 1734 -▁FORBID 1735 -▁FORCE 1736 -▁FORCED 1737 -FORD 1738 -▁FORE 1739 -▁FOREHEAD 1740 -▁FOREIGN 1741 -▁FORESEE 1742 -▁FOREST 1743 -▁FORGET 1744 -▁FORGIVE 1745 -▁FORGOT 1746 -▁FORGOTTEN 1747 -FORM 1748 -▁FORM 1749 -▁FORMED 1750 -▁FORMER 1751 -▁FORMIDABLE 1752 -▁FORSAKE 1753 -▁FORTH 1754 -▁FORTNIGHT 1755 -▁FORTUNATE 1756 -▁FORTUNE 1757 -▁FORTY 1758 -▁FORWARD 1759 -▁FOUGHT 1760 -▁FOUND 1761 -▁FOUNTAIN 1762 -▁FOUR 1763 -▁FOURTEEN 1764 -▁FOURTH 1765 -▁FOWL 1766 -▁FOX 1767 -▁FRA 1768 -▁FRAGMENT 1769 -▁FRAME 1770 -▁FRANCE 1771 -▁FRANCIS 1772 -▁FRANCS 1773 -▁FRANK 
1774 -▁FRED 1775 -▁FREDERICK 1776 -▁FREE 1777 -▁FREEDOM 1778 -▁FRENCH 1779 -▁FREQUENT 1780 -▁FREQUENTLY 1781 -▁FRESH 1782 -▁FRI 1783 -▁FRIDAY 1784 -▁FRIEND 1785 -▁FRIENDLY 1786 -▁FRIENDS 1787 -▁FRIENDSHIP 1788 -▁FRIGHT 1789 -▁FRIGHTENED 1790 -▁FRIGHTFUL 1791 -▁FRINGE 1792 -▁FRO 1793 -▁FROG 1794 -▁FROM 1795 -▁FRONT 1796 -▁FROWN 1797 -▁FRUIT 1798 -FT 1799 -▁FU 1800 -FUL 1801 -▁FULFIL 1802 -▁FULL 1803 -▁FULLY 1804 -▁FUN 1805 -▁FUNCTION 1806 -▁FUNDAMENTAL 1807 -▁FUNERAL 1808 -▁FUNNY 1809 -▁FUR 1810 -▁FURIOUS 1811 -▁FURNISH 1812 -▁FURNITURE 1813 -▁FURTHER 1814 -▁FUTURE 1815 -G 1816 -▁G 1817 -GA 1818 -▁GA 1819 -▁GAIN 1820 -▁GAINED 1821 -▁GALL 1822 -▁GALLANT 1823 -▁GALLERY 1824 -▁GALLOP 1825 -▁GAME 1826 -GAN 1827 -GAR 1828 -▁GAR 1829 -▁GARDEN 1830 -▁GARRISON 1831 -▁GASP 1832 -GATE 1833 -▁GATE 1834 -▁GATHER 1835 -▁GATHERED 1836 -▁GAVE 1837 -▁GAY 1838 -GE 1839 -▁GE 1840 -GED 1841 -GEN 1842 -▁GEN 1843 -▁GENERAL 1844 -▁GENERALLY 1845 -▁GENERATION 1846 -▁GENEROSITY 1847 -▁GENEROUS 1848 -▁GENIUS 1849 -▁GENTLE 1850 -▁GENTLEMAN 1851 -▁GENTLEMEN 1852 -▁GENTLY 1853 -▁GENUINE 1854 -▁GEORGE 1855 -GER 1856 -▁GER 1857 -▁GERMAN 1858 -▁GESTURE 1859 -▁GET 1860 -▁GETTING 1861 -GG 1862 -▁GHASTL 1863 -▁GHOST 1864 -GI 1865 -▁GI 1866 -▁GIANT 1867 -▁GIFT 1868 -▁GIGANTIC 1869 -▁GIL 1870 -▁GILBERT 1871 -GING 1872 -▁GIRL 1873 -▁GIRLS 1874 -▁GIVE 1875 -▁GIVEN 1876 -▁GIVING 1877 -▁GLA 1878 -▁GLACIER 1879 -▁GLAD 1880 -▁GLANCE 1881 -▁GLANCING 1882 -▁GLASS 1883 -▁GLEAM 1884 -▁GLEN 1885 -▁GLID 1886 -▁GLIMMER 1887 -▁GLIMPSE 1888 -▁GLITTER 1889 -▁GLOBE 1890 -▁GLOOM 1891 -▁GLOOMY 1892 -▁GLORIOUS 1893 -▁GLORY 1894 -▁GLOVE 1895 -▁GLOW 1896 -GN 1897 -GO 1898 -▁GO 1899 -▁GOAT 1900 -▁GOD 1901 -▁GODDESS 1902 -▁GOES 1903 -▁GOING 1904 -▁GOLD 1905 -▁GOLDEN 1906 -▁GONE 1907 -▁GOOD 1908 -▁GORGEOUS 1909 -▁GOSPEL 1910 -▁GOSSIP 1911 -▁GOT 1912 -▁GOVERN 1913 -▁GOVERNMENT 1914 -▁GOVERNOR 1915 -▁GOWN 1916 -GRA 1917 -▁GRA 1918 -▁GRACE 1919 -▁GRACEFUL 1920 -▁GRACIOUS 1921 -▁GRADUALLY 1922 -▁GRAND 1923 -▁GRANDFATHER 1924 -▁GRANDMOTHER 1925 -▁GRANITE 1926 -▁GRANT 1927 -▁GRASP 1928 -▁GRASS 1929 -▁GRATEFUL 1930 -▁GRATIFY 1931 -▁GRATITUDE 1932 -▁GRAVE 1933 -▁GRAVITY 1934 -▁GRAY 1935 -▁GRE 1936 -▁GREAT 1937 -▁GREATER 1938 -▁GREATEST 1939 -▁GREATLY 1940 -▁GREEK 1941 -▁GREEN 1942 -▁GREW 1943 -▁GREY 1944 -▁GRI 1945 -▁GRIEF 1946 -▁GRIEVE 1947 -▁GRIM 1948 -▁GRIN 1949 -▁GRO 1950 -▁GROAN 1951 -▁GROUND 1952 -▁GROUP 1953 -▁GROVE 1954 -▁GROW 1955 -▁GROWING 1956 -▁GROWL 1957 -▁GROWN 1958 -▁GROWTH 1959 -GU 1960 -▁GU 1961 -▁GUARD 1962 -GUE 1963 -▁GUESS 1964 -▁GUEST 1965 -▁GUIDE 1966 -▁GUILT 1967 -▁GUILTY 1968 -▁GUINEA 1969 -▁GUN 1970 -H 1971 -HA 1972 -▁HA 1973 -▁HABIT 1974 -▁HABITUAL 1975 -▁HAD 1976 -▁HAIR 1977 -▁HALE 1978 -▁HALF 1979 -▁HALL 1980 -▁HALT 1981 -HAM 1982 -▁HAM 1983 -▁HAMILTON 1984 -▁HAMMER 1985 -HAN 1986 -▁HAND 1987 -▁HANDKERCHIEF 1988 -▁HANDS 1989 -▁HANDSOME 1990 -▁HANG 1991 -▁HANGING 1992 -▁HANS 1993 -▁HAPPEN 1994 -▁HAPPENED 1995 -▁HAPPIER 1996 -▁HAPPILY 1997 -▁HAPPINESS 1998 -▁HAPPY 1999 -HAR 2000 -▁HAR 2001 -▁HARBOR 2002 -▁HARBOUR 2003 -▁HARD 2004 -▁HARDLY 2005 -▁HARM 2006 -▁HARMONI 2007 -▁HARMONY 2008 -▁HARRY 2009 -▁HARSH 2010 -▁HARVEST 2011 -▁HAS 2012 -▁HASTE 2013 -▁HASTENED 2014 -▁HASTILY 2015 -▁HAT 2016 -▁HATE 2017 -▁HATH 2018 -▁HATRED 2019 -▁HAUNT 2020 -▁HAVE 2021 -▁HAVEN 2022 -▁HAVING 2023 -▁HAWK 2024 -▁HAY 2025 -HE 2026 -▁HE 2027 -HEAD 2028 -▁HEAD 2029 -▁HEADS 2030 -▁HEALTH 2031 -▁HEAP 2032 -▁HEAR 2033 -▁HEARD 2034 -▁HEARING 2035 -▁HEART 2036 -▁HEAT 2037 -▁HEAVEN 2038 -▁HEAVILY 2039 -▁HEAVY 2040 -▁HEBREW 2041 -▁HEDGE 2042 -▁HEIGHT 2043 -▁HELD 
2044 -▁HELEN 2045 -▁HELP 2046 -▁HELPLESS 2047 -HEN 2048 -▁HENCE 2049 -▁HENRY 2050 -HER 2051 -▁HER 2052 -▁HERBERT 2053 -▁HERCULES 2054 -▁HERE 2055 -▁HERO 2056 -▁HERSELF 2057 -▁HESITATE 2058 -▁HESITATED 2059 -▁HESITATION 2060 -HI 2061 -▁HI 2062 -▁HID 2063 -▁HIDDEN 2064 -▁HIDE 2065 -▁HIDEOUS 2066 -▁HIGH 2067 -▁HIGHER 2068 -▁HIGHEST 2069 -▁HILL 2070 -▁HILLS 2071 -▁HIM 2072 -▁HIMSELF 2073 -HIN 2074 -▁HIND 2075 -▁HINT 2076 -▁HIS 2077 -▁HISTORY 2078 -▁HIT 2079 -▁HITHER 2080 -▁HITHERTO 2081 -HO 2082 -▁HO 2083 -▁HOARSE 2084 -HOLD 2085 -▁HOLD 2086 -▁HOLDING 2087 -▁HOLE 2088 -▁HOLIDAY 2089 -▁HOLLAND 2090 -▁HOLLOW 2091 -▁HOLY 2092 -▁HOME 2093 -▁HONEST 2094 -▁HONEY 2095 -▁HONOR 2096 -▁HONOUR 2097 -HOOD 2098 -▁HOOK 2099 -▁HOPE 2100 -▁HOPED 2101 -▁HOPELESS 2102 -▁HOPING 2103 -▁HORI 2104 -▁HORN 2105 -▁HORRIBLE 2106 -▁HORRID 2107 -▁HORROR 2108 -▁HORSE 2109 -▁HORSES 2110 -▁HOSPITAL 2111 -▁HOST 2112 -▁HOT 2113 -▁HOTEL 2114 -▁HOUR 2115 -▁HOURS 2116 -HOUSE 2117 -▁HOUSE 2118 -▁HOUSEHOLD 2119 -▁HOUSEKEEPER 2120 -▁HOUSES 2121 -▁HOW 2122 -▁HOWEVER 2123 -HU 2124 -▁HU 2125 -▁HUGE 2126 -▁HUM 2127 -▁HUMAN 2128 -▁HUMANITY 2129 -▁HUMBLE 2130 -▁HUMOR 2131 -▁HUMOUR 2132 -▁HUNDRED 2133 -▁HUNG 2134 -▁HUNGER 2135 -▁HUNGRY 2136 -▁HUNT 2137 -▁HUNTER 2138 -▁HUNTING 2139 -▁HURRIED 2140 -▁HURRY 2141 -HURST 2142 -▁HURT 2143 -▁HUSBAND 2144 -▁HUSH 2145 -▁HUT 2146 -HY 2147 -▁HY 2148 -▁HYMN 2149 -▁HYPNOTI 2150 -I 2151 -▁I 2152 -IA 2153 -IAL 2154 -IAN 2155 -IANS 2156 -IB 2157 -IBLE 2158 -IC 2159 -ICAL 2160 -▁ICE 2161 -ICK 2162 -ID 2163 -▁IDEA 2164 -▁IDEAL 2165 -▁IDEAS 2166 -▁IDENTITY 2167 -▁IDIOT 2168 -▁IDLE 2169 -IE 2170 -IED 2171 -IER 2172 -IES 2173 -IF 2174 -▁IF 2175 -IFICATION 2176 -IFIED 2177 -IFYING 2178 -IG 2179 -IGHT 2180 -IGN 2181 -▁IGNOR 2182 -▁IGNORANCE 2183 -▁IGNORANT 2184 -IL 2185 -ILE 2186 -ILITY 2187 -▁ILL 2188 -▁ILLUSION 2189 -▁ILLUSTRAT 2190 -ILY 2191 -IM 2192 -▁IMAGE 2193 -▁IMAGINATION 2194 -▁IMAGINE 2195 -▁IMITAT 2196 -▁IMMEDIATE 2197 -▁IMMEDIATELY 2198 -▁IMMENSE 2199 -▁IMMORTAL 2200 -▁IMP 2201 -▁IMPART 2202 -▁IMPATIENCE 2203 -▁IMPATIENT 2204 -▁IMPERFECT 2205 -▁IMPERIAL 2206 -▁IMPORT 2207 -▁IMPORTANCE 2208 -▁IMPORTANT 2209 -▁IMPOSSIBLE 2210 -▁IMPRESSED 2211 -▁IMPRESSION 2212 -▁IMPROVE 2213 -▁IMPROVEMENT 2214 -▁IMPULSE 2215 -IN 2216 -▁IN 2217 -INA 2218 -▁INCAPABLE 2219 -▁INCENSE 2220 -▁INCESSANT 2221 -▁INCHES 2222 -▁INCIDENT 2223 -▁INCLINATION 2224 -▁INCLINED 2225 -▁INCLUD 2226 -▁INCOME 2227 -▁INCREASE 2228 -▁INCREASED 2229 -▁INCREASING 2230 -▁INDEED 2231 -▁INDEPENDENCE 2232 -▁INDEPENDENT 2233 -▁INDIA 2234 -▁INDIAN 2235 -▁INDIANS 2236 -▁INDIFFERENCE 2237 -▁INDIFFERENT 2238 -▁INDIGNANT 2239 -▁INDIGNATION 2240 -▁INDIVIDUAL 2241 -▁INDUCE 2242 -▁INDULGE 2243 -▁INDUSTRY 2244 -INE 2245 -INESS 2246 -▁INEVITABLE 2247 -▁INFANT 2248 -▁INFERIOR 2249 -▁INFINITE 2250 -▁INFLICT 2251 -▁INFLUENCE 2252 -▁INFORMATION 2253 -▁INFORMED 2254 -ING 2255 -▁INHABIT 2256 -▁INHABITANTS 2257 -▁INHERIT 2258 -▁INJURED 2259 -▁INJURY 2260 -▁INJUSTICE 2261 -▁INNOCENCE 2262 -▁INNOCENT 2263 -▁INNUMERABLE 2264 -▁INQUIRE 2265 -▁INQUIRED 2266 -▁INQUIRIES 2267 -▁INQUIRY 2268 -▁INSECT 2269 -▁INSIDE 2270 -▁INSIST 2271 -▁INSPECTOR 2272 -▁INSTANCE 2273 -▁INSTANT 2274 -▁INSTANTLY 2275 -▁INSTEAD 2276 -▁INSTINCT 2277 -▁INSTINCTIVELY 2278 -▁INSTITUTION 2279 -▁INSTRUCT 2280 -▁INSTRUMENT 2281 -▁INSULT 2282 -▁INTELLECT 2283 -▁INTELLECTUAL 2284 -▁INTELLIGENCE 2285 -▁INTELLIGENT 2286 -▁INTELLIGIBLE 2287 -▁INTEND 2288 -▁INTENDED 2289 -▁INTENSE 2290 -▁INTENSITY 2291 -▁INTENT 2292 -▁INTENTION 2293 -▁INTER 2294 -▁INTERCOURSE 2295 -▁INTEREST 2296 -▁INTERESTED 2297 
-▁INTERESTING 2298 -▁INTERFERE 2299 -▁INTERNAL 2300 -▁INTERPOSED 2301 -▁INTERPRET 2302 -▁INTERRUPT 2303 -▁INTERRUPTED 2304 -▁INTERVAL 2305 -▁INTERVEN 2306 -▁INTERVIEW 2307 -▁INTIMACY 2308 -▁INTIMATE 2309 -▁INTO 2310 -▁INTRODUCED 2311 -▁INVARIABLY 2312 -▁INVENT 2313 -▁INVESTIGAT 2314 -▁INVISIBLE 2315 -▁INVITATION 2316 -▁INVITED 2317 -IO 2318 -ION 2319 -IONS 2320 -IOUS 2321 -IP 2322 -IR 2323 -IRE 2324 -▁IRELAND 2325 -▁IRISH 2326 -▁IRON 2327 -▁IRRE 2328 -▁IRREGULAR 2329 -▁IRRESISTIBLE 2330 -IS 2331 -▁IS 2332 -▁ISABEL 2333 -ISH 2334 -▁ISLAND 2335 -ISM 2336 -▁ISN 2337 -ISON 2338 -▁ISSUE 2339 -IST 2340 -ISTIC 2341 -ISTS 2342 -IT 2343 -▁IT 2344 -▁ITALIAN 2345 -▁ITALY 2346 -ITCH 2347 -ITE 2348 -ITIES 2349 -▁ITS 2350 -▁ITSELF 2351 -ITUDE 2352 -ITY 2353 -IUM 2354 -IUS 2355 -IVE 2356 -J 2357 -▁J 2358 -JA 2359 -▁JA 2360 -▁JACK 2361 -▁JACKSON 2362 -▁JACOB 2363 -▁JAMES 2364 -▁JANE 2365 -▁JANUARY 2366 -▁JAPANESE 2367 -▁JAR 2368 -▁JASPER 2369 -▁JAW 2370 -▁JE 2371 -▁JEALOUS 2372 -▁JEAN 2373 -▁JERK 2374 -▁JERRY 2375 -▁JERUSALEM 2376 -▁JEST 2377 -▁JESUS 2378 -▁JEW 2379 -▁JEWEL 2380 -JI 2381 -▁JIM 2382 -▁JIMMIE 2383 -▁JIMMY 2384 -JO 2385 -▁JO 2386 -▁JOB 2387 -▁JOE 2388 -▁JOHN 2389 -▁JOHNSON 2390 -▁JOIN 2391 -▁JOINED 2392 -▁JOKE 2393 -▁JOLLY 2394 -▁JONES 2395 -▁JOSEPH 2396 -▁JOURNAL 2397 -▁JOURNEY 2398 -▁JOY 2399 -JU 2400 -▁JU 2401 -▁JUD 2402 -▁JUDGE 2403 -▁JUDGMENT 2404 -▁JUICE 2405 -▁JULIA 2406 -▁JULIE 2407 -▁JULIUS 2408 -▁JUMP 2409 -▁JUMPED 2410 -▁JUNE 2411 -▁JUNGLE 2412 -▁JUST 2413 -▁JUSTICE 2414 -▁JUSTIFY 2415 -K 2416 -▁K 2417 -KA 2418 -▁KA 2419 -▁KATE 2420 -▁KATY 2421 -KE 2422 -▁KEEN 2423 -KEEP 2424 -▁KEEP 2425 -▁KEEPING 2426 -▁KEITH 2427 -▁KEN 2428 -▁KENNEDY 2429 -▁KEPT 2430 -KER 2431 -▁KETTLE 2432 -▁KEY 2433 -KI 2434 -▁KI 2435 -▁KICK 2436 -▁KILL 2437 -▁KILLED 2438 -KIN 2439 -▁KIND 2440 -▁KINDLY 2441 -▁KINDNESS 2442 -KING 2443 -▁KING 2444 -▁KINGDOM 2445 -▁KISS 2446 -▁KISSED 2447 -▁KIT 2448 -▁KITCHEN 2449 -▁KITTY 2450 -▁KNEE 2451 -▁KNEES 2452 -▁KNELT 2453 -▁KNEW 2454 -▁KNIFE 2455 -▁KNIGHT 2456 -▁KNIT 2457 -▁KNOCK 2458 -▁KNOT 2459 -▁KNOW 2460 -▁KNOWING 2461 -▁KNOWLEDGE 2462 -▁KNOWN 2463 -▁KNOWS 2464 -KO 2465 -▁KO 2466 -KY 2467 -L 2468 -LA 2469 -▁LA 2470 -▁LABOR 2471 -▁LABOUR 2472 -LAC 2473 -▁LACE 2474 -▁LACK 2475 -▁LAD 2476 -▁LADDER 2477 -▁LADIES 2478 -▁LADY 2479 -▁LAID 2480 -▁LAKE 2481 -▁LAMB 2482 -▁LAMENT 2483 -▁LAMP 2484 -LAN 2485 -LAND 2486 -▁LAND 2487 -▁LANDLORD 2488 -▁LANDSCAPE 2489 -▁LANE 2490 -▁LANGUAGE 2491 -▁LANTERN 2492 -▁LAP 2493 -LAR 2494 -▁LARGE 2495 -▁LARGER 2496 -▁LAST 2497 -▁LATE 2498 -▁LATER 2499 -▁LATTER 2500 -▁LAUGH 2501 -▁LAUGHED 2502 -▁LAUGHING 2503 -▁LAUGHTER 2504 -▁LAUNCELOT 2505 -▁LAUNCH 2506 -▁LAURA 2507 -▁LAW 2508 -▁LAWS 2509 -▁LAWYER 2510 -▁LAY 2511 -LD 2512 -LE 2513 -▁LE 2514 -▁LEAD 2515 -▁LEADER 2516 -▁LEADING 2517 -▁LEAF 2518 -▁LEAGUE 2519 -▁LEAN 2520 -▁LEANED 2521 -▁LEANING 2522 -▁LEAP 2523 -▁LEARN 2524 -▁LEARNED 2525 -▁LEAST 2526 -▁LEATHER 2527 -▁LEAVE 2528 -▁LEAVES 2529 -▁LEAVING 2530 -▁LECTURE 2531 -LED 2532 -▁LED 2533 -▁LEFT 2534 -▁LEG 2535 -▁LEGEND 2536 -▁LEGISLATURE 2537 -▁LEGS 2538 -LEIGH 2539 -▁LEISURE 2540 -▁LEMON 2541 -▁LEND 2542 -▁LENGTH 2543 -▁LEONORA 2544 -LER 2545 -LES 2546 -LESS 2547 -▁LESS 2548 -▁LESSON 2549 -▁LEST 2550 -LET 2551 -▁LET 2552 -▁LETTER 2553 -▁LETTERS 2554 -▁LEVEL 2555 -▁LEVIN 2556 -LEY 2557 -LF 2558 -LI 2559 -▁LI 2560 -▁LIBERAL 2561 -▁LIBERTY 2562 -▁LIBRARY 2563 -LIE 2564 -▁LIE 2565 -▁LIES 2566 -▁LIEUTENANT 2567 -▁LIFE 2568 -▁LIFT 2569 -▁LIFTED 2570 -LIGHT 2571 -▁LIGHT 2572 -▁LIGHTNING 2573 -LIKE 2574 -▁LIKE 2575 -▁LIKED 2576 -▁LIKELY 2577 
-▁LIKEWISE 2578 -▁LIMB 2579 -▁LIMIT 2580 -LIN 2581 -▁LIN 2582 -▁LINCOLN 2583 -LINE 2584 -▁LINE 2585 -▁LINES 2586 -LINESS 2587 -LING 2588 -▁LINGER 2589 -▁LION 2590 -▁LIPS 2591 -▁LIQUID 2592 -▁LIQUOR 2593 -▁LIST 2594 -▁LISTEN 2595 -▁LISTENED 2596 -▁LISTENING 2597 -▁LITERALLY 2598 -▁LITERARY 2599 -▁LITERATURE 2600 -▁LITTLE 2601 -▁LIVE 2602 -▁LIVED 2603 -▁LIVES 2604 -▁LIVING 2605 -LL 2606 -LO 2607 -▁LO 2608 -▁LOAD 2609 -▁LOAF 2610 -▁LOCAL 2611 -LOCK 2612 -▁LOCK 2613 -▁LOCKED 2614 -▁LODGE 2615 -▁LODGING 2616 -▁LOFTY 2617 -▁LOG 2618 -LON 2619 -▁LONDON 2620 -▁LONELY 2621 -LONG 2622 -▁LONG 2623 -▁LONGER 2624 -▁LOOK 2625 -▁LOOKED 2626 -▁LOOKING 2627 -▁LOOKS 2628 -▁LOOSE 2629 -▁LORD 2630 -▁LOSE 2631 -▁LOSING 2632 -▁LOSS 2633 -▁LOST 2634 -▁LOT 2635 -▁LOUD 2636 -▁LOUIS 2637 -▁LOUNG 2638 -▁LOVE 2639 -▁LOVED 2640 -▁LOVELY 2641 -▁LOVER 2642 -▁LOVING 2643 -LOW 2644 -▁LOW 2645 -▁LOWER 2646 -▁LOYAL 2647 -LT 2648 -▁LU 2649 -▁LUC 2650 -▁LUCK 2651 -▁LUCY 2652 -▁LUNCH 2653 -LUNG 2654 -LUS 2655 -▁LUXURY 2656 -LY 2657 -▁LYING 2658 -▁LYN 2659 -M 2660 -▁M 2661 -MA 2662 -▁MA 2663 -▁MAC 2664 -▁MACHINE 2665 -▁MAD 2666 -▁MADAM 2667 -▁MADAME 2668 -▁MADE 2669 -▁MADEMOISELLE 2670 -▁MAGGIE 2671 -▁MAGIC 2672 -▁MAGICIAN 2673 -▁MAGISTRATE 2674 -▁MAGNIFICENT 2675 -▁MAID 2676 -▁MAIDEN 2677 -▁MAIN 2678 -▁MAINTAIN 2679 -▁MAJESTY 2680 -▁MAJOR 2681 -▁MAJORITY 2682 -▁MAKE 2683 -▁MAKES 2684 -▁MAKING 2685 -▁MAL 2686 -▁MALE 2687 -▁MAMMA 2688 -MAN 2689 -▁MAN 2690 -▁MANAGE 2691 -▁MANAGED 2692 -▁MANIFEST 2693 -▁MANKIND 2694 -▁MANNER 2695 -▁MANUFACTURE 2696 -▁MANUSCRIPT 2697 -▁MANY 2698 -▁MAR 2699 -▁MARBLE 2700 -▁MARCH 2701 -▁MARGARET 2702 -▁MARGUERITE 2703 -▁MARIAN 2704 -▁MARILLA 2705 -▁MARK 2706 -▁MARKED 2707 -▁MARKET 2708 -▁MARQUIS 2709 -▁MARRIAGE 2710 -▁MARRIED 2711 -▁MARRY 2712 -▁MARSH 2713 -▁MARTHA 2714 -▁MARTIAN 2715 -▁MARTIN 2716 -▁MARTYR 2717 -▁MARVEL 2718 -▁MARVELLOUS 2719 -▁MARY 2720 -▁MASK 2721 -▁MASS 2722 -▁MASTER 2723 -▁MAT 2724 -▁MATCH 2725 -▁MATE 2726 -▁MATERIAL 2727 -▁MATTER 2728 -▁MATTERS 2729 -▁MATTHEW 2730 -▁MAXIM 2731 -▁MAY 2732 -▁MAYBE 2733 -MBLED 2734 -ME 2735 -▁ME 2736 -▁MEADOW 2737 -▁MEAL 2738 -▁MEAN 2739 -▁MEANING 2740 -▁MEANS 2741 -▁MEANT 2742 -▁MEANTIME 2743 -▁MEANWHILE 2744 -▁MEASURE 2745 -▁MEAT 2746 -▁MECHANICAL 2747 -▁MEDI 2748 -▁MEDICAL 2749 -▁MEDICINE 2750 -▁MEET 2751 -▁MEETING 2752 -▁MELANCHOLY 2753 -▁MEMBER 2754 -▁MEMBERS 2755 -▁MEMORIES 2756 -▁MEMORY 2757 -MEN 2758 -▁MEN 2759 -MENT 2760 -▁MENTAL 2761 -▁MENTION 2762 -▁MENTIONED 2763 -MENTS 2764 -MER 2765 -▁MER 2766 -▁MERCHANT 2767 -▁MERCY 2768 -▁MERE 2769 -▁MERELY 2770 -▁MERIT 2771 -▁MERRY 2772 -▁MESSAGE 2773 -▁MESSENGER 2774 -▁MET 2775 -▁METAL 2776 -▁METHOD 2777 -▁MEXICAN 2778 -MI 2779 -▁MI 2780 -▁MICHAEL 2781 -▁MID 2782 -▁MIDDLE 2783 -▁MIDNIGHT 2784 -MIDST 2785 -▁MIDST 2786 -▁MIGHT 2787 -▁MIGHTY 2788 -▁MIL 2789 -▁MILD 2790 -▁MILE 2791 -▁MILES 2792 -▁MILITARY 2793 -▁MILK 2794 -▁MILL 2795 -▁MILLION 2796 -▁MIN 2797 -▁MIND 2798 -▁MINE 2799 -▁MINGLED 2800 -▁MINISTER 2801 -▁MINUTE 2802 -▁MINUTES 2803 -▁MIRACLE 2804 -▁MIRROR 2805 -▁MIRTH 2806 -▁MIS 2807 -▁MISCHIEF 2808 -▁MISERABLE 2809 -▁MISERY 2810 -▁MISFORTUNE 2811 -▁MISS 2812 -▁MISSION 2813 -▁MISSUS 2814 -▁MIST 2815 -▁MISTAKE 2816 -▁MISTAKEN 2817 -▁MISTER 2818 -▁MISTRESS 2819 -▁MITYA 2820 -▁MIX 2821 -▁MIXTURE 2822 -MMED 2823 -MO 2824 -▁MO 2825 -▁MOCK 2826 -▁MODE 2827 -▁MODERATE 2828 -▁MODERN 2829 -▁MODEST 2830 -▁MOMENT 2831 -▁MON 2832 -▁MONARCH 2833 -MOND 2834 -▁MONDAY 2835 -▁MONEY 2836 -▁MONK 2837 -▁MONKEY 2838 -▁MONSIEUR 2839 -▁MONSTER 2840 -▁MONSTROUS 2841 -MONT 2842 -▁MONTE 2843 -▁MONTH 2844 -▁MONTHS 
2845 -▁MONUMENT 2846 -▁MOOD 2847 -▁MOON 2848 -▁MOONLIGHT 2849 -▁MOR 2850 -▁MORAL 2851 -MORE 2852 -▁MORE 2853 -▁MOREOVER 2854 -▁MORNING 2855 -▁MORROW 2856 -▁MORTAL 2857 -▁MOSCOW 2858 -MOST 2859 -▁MOST 2860 -▁MOTHER 2861 -▁MOTION 2862 -▁MOTIONLESS 2863 -▁MOTIVE 2864 -▁MOTOR 2865 -▁MOULD 2866 -▁MOUNT 2867 -▁MOUNTAIN 2868 -▁MOUNTAINS 2869 -▁MOUNTED 2870 -▁MOURN 2871 -▁MOUSE 2872 -MOUTH 2873 -▁MOUTH 2874 -▁MOVE 2875 -▁MOVED 2876 -▁MOVEMENT 2877 -▁MOVING 2878 -MP 2879 -▁MU 2880 -▁MUCH 2881 -▁MUD 2882 -▁MULE 2883 -▁MULTITUDE 2884 -▁MURDER 2885 -▁MURDERER 2886 -▁MURMUR 2887 -▁MURMURED 2888 -▁MUSCLE 2889 -▁MUSCULAR 2890 -▁MUSIC 2891 -▁MUSKET 2892 -▁MUST 2893 -▁MUTTERED 2894 -▁MUTUAL 2895 -MY 2896 -▁MY 2897 -▁MYSELF 2898 -▁MYSTERIOUS 2899 -▁MYSTERY 2900 -N 2901 -NA 2902 -▁NA 2903 -▁NAIL 2904 -▁NAKED 2905 -▁NAME 2906 -▁NAMED 2907 -▁NANCY 2908 -▁NAPOLEON 2909 -▁NARRAT 2910 -▁NARROW 2911 -▁NATASHA 2912 -▁NATION 2913 -▁NATIONAL 2914 -▁NATIVE 2915 -▁NATURAL 2916 -▁NATURALLY 2917 -▁NATURE 2918 -▁NAUGHT 2919 -▁NAUTILUS 2920 -▁NAV 2921 -▁NAVIGAT 2922 -▁NAY 2923 -NCE 2924 -ND 2925 -NE 2926 -▁NE 2927 -▁NEAR 2928 -▁NEARER 2929 -▁NEAREST 2930 -▁NEARLY 2931 -▁NEAT 2932 -▁NECESSARILY 2933 -▁NECESSARY 2934 -▁NECESSITY 2935 -▁NECK 2936 -NED 2937 -▁NEED 2938 -▁NEEDED 2939 -▁NEGLECT 2940 -▁NEGRO 2941 -▁NEIGHBOR 2942 -▁NEIGHBORHOOD 2943 -▁NEIGHBOUR 2944 -▁NEIGHBOURHOOD 2945 -▁NEITHER 2946 -▁NEPHEW 2947 -NER 2948 -▁NERVE 2949 -▁NERVOUS 2950 -NESS 2951 -▁NEST 2952 -▁NEVER 2953 -▁NEVERTHELESS 2954 -▁NEW 2955 -▁NEWS 2956 -▁NEWSPAPER 2957 -▁NEXT 2958 -NEY 2959 -NG 2960 -NI 2961 -▁NI 2962 -NIC 2963 -▁NICE 2964 -▁NICHOLAS 2965 -▁NIECE 2966 -▁NIGH 2967 -▁NIGHT 2968 -▁NIGHTINGALE 2969 -▁NINE 2970 -▁NINETEEN 2971 -▁NINETY 2972 -NING 2973 -▁NINTH 2974 -NNIE 2975 -NNY 2976 -NO 2977 -▁NO 2978 -▁NOBILITY 2979 -▁NOBLE 2980 -▁NOBODY 2981 -▁NODDED 2982 -▁NOISE 2983 -▁NONE 2984 -▁NONSENSE 2985 -▁NOR 2986 -▁NORMAL 2987 -▁NORMAN 2988 -▁NORTH 2989 -▁NORTHERN 2990 -▁NOSE 2991 -▁NOT 2992 -▁NOTE 2993 -▁NOTHING 2994 -▁NOTICE 2995 -▁NOTICED 2996 -▁NOTWITHSTANDING 2997 -▁NOVEL 2998 -▁NOVEMBER 2999 -▁NOW 3000 -▁NOWHERE 3001 -NT 3002 -▁NU 3003 -▁NUMBER 3004 -▁NUMEROUS 3005 -▁NURSE 3006 -▁NUT 3007 -NY 3008 -O 3009 -▁O 3010 -▁OAK 3011 -▁OATH 3012 -▁OB 3013 -▁OBEDIENCE 3014 -▁OBEY 3015 -▁OBJECT 3016 -▁OBJECTION 3017 -▁OBLIGATION 3018 -▁OBLIGED 3019 -▁OBSCURE 3020 -▁OBSERVATION 3021 -▁OBSERVE 3022 -▁OBSERVED 3023 -▁OBSERVING 3024 -▁OBSTACLE 3025 -▁OBSTINATE 3026 -▁OBTAIN 3027 -▁OBTAINED 3028 -▁OBVIOUS 3029 -OC 3030 -▁OCCASION 3031 -▁OCCASIONALLY 3032 -▁OCCUPATION 3033 -▁OCCUPIED 3034 -▁OCCUPY 3035 -▁OCCUR 3036 -▁OCCURRED 3037 -▁OCCURRENCE 3038 -▁OCEAN 3039 -▁OCTOBER 3040 -OD 3041 -▁ODD 3042 -▁OF 3043 -▁OFF 3044 -▁OFFEND 3045 -▁OFFER 3046 -▁OFFERED 3047 -▁OFFICE 3048 -▁OFFICER 3049 -▁OFFICERS 3050 -▁OFFICIAL 3051 -▁OFTEN 3052 -OG 3053 -▁OH 3054 -▁OIL 3055 -OL 3056 -▁OLD 3057 -▁OLIVER 3058 -OLOGICAL 3059 -OLOGIST 3060 -OLOGY 3061 -OM 3062 -ON 3063 -▁ON 3064 -▁ONCE 3065 -ONE 3066 -▁ONE 3067 -▁ONLY 3068 -OO 3069 -OOK 3070 -OON 3071 -OP 3072 -▁OPEN 3073 -▁OPENED 3074 -▁OPENING 3075 -▁OPERA 3076 -▁OPERATION 3077 -▁OPINION 3078 -▁OPPONENT 3079 -▁OPPORTUNITY 3080 -▁OPPOSITE 3081 -▁OPPOSITION 3082 -▁OPPRESS 3083 -OR 3084 -▁OR 3085 -▁ORANGE 3086 -▁ORCHARD 3087 -ORD 3088 -▁ORDER 3089 -▁ORDERED 3090 -▁ORDERS 3091 -▁ORDINARY 3092 -▁ORGAN 3093 -▁ORGANI 3094 -▁ORIGIN 3095 -▁ORIGINAL 3096 -▁ORNAMENT 3097 -ORS 3098 -ORY 3099 -OS 3100 -OT 3101 -▁OTHER 3102 -▁OTHERS 3103 -▁OTHERWISE 3104 -OU 3105 -▁OUGHT 3106 -▁OUNCE 3107 -OUR 3108 -▁OUR 3109 -▁OURSELVES 3110 -OUS 3111 
-▁OUT 3112 -▁OUTRAGE 3113 -▁OUTSIDE 3114 -OV 3115 -▁OVEN 3116 -▁OVER 3117 -▁OVERCOME 3118 -▁OVERFLOW 3119 -▁OVERLOOK 3120 -▁OVERTAKE 3121 -▁OVERWHELM 3122 -OW 3123 -▁OWE 3124 -▁OWING 3125 -▁OWL 3126 -▁OWN 3127 -▁OYSTER 3128 -P 3129 -▁P 3130 -PA 3131 -▁PA 3132 -▁PACE 3133 -▁PACIFIC 3134 -▁PACK 3135 -▁PAGE 3136 -▁PAID 3137 -▁PAIN 3138 -▁PAINFUL 3139 -▁PAINTED 3140 -▁PAIR 3141 -▁PAL 3142 -▁PALACE 3143 -▁PALE 3144 -▁PALM 3145 -▁PAN 3146 -▁PAPA 3147 -▁PAPER 3148 -▁PAPERS 3149 -▁PAR 3150 -▁PARA 3151 -▁PARADISE 3152 -▁PARALLEL 3153 -▁PARCEL 3154 -▁PARDON 3155 -▁PARENTS 3156 -▁PARIS 3157 -▁PARK 3158 -▁PARLIAMENT 3159 -▁PARLOR 3160 -▁PARLOUR 3161 -▁PART 3162 -▁PARTICLE 3163 -▁PARTICULAR 3164 -▁PARTICULARLY 3165 -▁PARTIES 3166 -▁PARTNER 3167 -▁PARTS 3168 -▁PARTY 3169 -▁PASS 3170 -▁PASSAGE 3171 -▁PASSED 3172 -▁PASSENGER 3173 -▁PASSING 3174 -▁PASSION 3175 -▁PASSIONATE 3176 -▁PAST 3177 -▁PAT 3178 -▁PATCH 3179 -▁PATH 3180 -▁PATIENCE 3181 -▁PATIENT 3182 -▁PATRIOT 3183 -▁PAUL 3184 -▁PAUSE 3185 -▁PAUSED 3186 -▁PAVEMENT 3187 -▁PAW 3188 -▁PAY 3189 -PE 3190 -▁PE 3191 -▁PEA 3192 -▁PEACE 3193 -▁PEAK 3194 -▁PEARL 3195 -▁PEASANT 3196 -PEC 3197 -▁PECULIAR 3198 -PED 3199 -▁PEEP 3200 -▁PEER 3201 -▁PEN 3202 -▁PENCIL 3203 -▁PENETRATE 3204 -▁PENNY 3205 -▁PEOPLE 3206 -▁PEPPER 3207 -PER 3208 -▁PER 3209 -▁PERCEIVE 3210 -▁PERCEIVED 3211 -▁PERCEIVING 3212 -▁PERCEPTION 3213 -▁PERCH 3214 -▁PERFECT 3215 -▁PERFECTION 3216 -▁PERFECTLY 3217 -▁PERFORM 3218 -▁PERFORMANCE 3219 -▁PERFUME 3220 -▁PERHAPS 3221 -▁PERIL 3222 -▁PERIOD 3223 -▁PERISH 3224 -▁PERMANENT 3225 -▁PERMISSION 3226 -▁PERMIT 3227 -▁PERMITTED 3228 -▁PERPETUAL 3229 -▁PERPLEX 3230 -▁PERSECUT 3231 -▁PERSIST 3232 -▁PERSON 3233 -▁PERSONAL 3234 -▁PERSONS 3235 -▁PERSUADE 3236 -▁PET 3237 -▁PETER 3238 -PHA 3239 -▁PHARAOH 3240 -▁PHENOMENA 3241 -▁PHENOMENON 3242 -▁PHIL 3243 -▁PHILADELPHIA 3244 -▁PHILIP 3245 -▁PHILOSOPHER 3246 -▁PHILOSOPHY 3247 -▁PHOENIX 3248 -▁PHOTOGRAPH 3249 -▁PHRASE 3250 -▁PHYSICAL 3251 -▁PHYSICIAN 3252 -▁PI 3253 -▁PIANO 3254 -▁PICK 3255 -▁PICKED 3256 -▁PICTURE 3257 -PIECE 3258 -▁PIECE 3259 -▁PIECES 3260 -▁PIERCED 3261 -▁PIERRE 3262 -▁PIG 3263 -▁PILE 3264 -▁PILGRIM 3265 -▁PILL 3266 -▁PILLOW 3267 -▁PILOT 3268 -▁PIN 3269 -▁PINE 3270 -▁PINK 3271 -▁PINOCCHIO 3272 -▁PIPE 3273 -▁PIRATE 3274 -▁PISTOL 3275 -▁PIT 3276 -▁PITCH 3277 -▁PITIFUL 3278 -▁PITY 3279 -▁PLA 3280 -▁PLAC 3281 -▁PLACE 3282 -▁PLACED 3283 -▁PLACES 3284 -▁PLAGUE 3285 -▁PLAIN 3286 -▁PLAINLY 3287 -▁PLAN 3288 -▁PLANET 3289 -▁PLANT 3290 -▁PLATE 3291 -▁PLATFORM 3292 -▁PLAY 3293 -▁PLAYED 3294 -▁PLAYING 3295 -PLE 3296 -▁PLEA 3297 -▁PLEASANT 3298 -▁PLEASE 3299 -▁PLEASED 3300 -▁PLEASURE 3301 -▁PLEDGE 3302 -▁PLENTY 3303 -▁PLOT 3304 -▁PLOUGH 3305 -▁PLUCK 3306 -▁PLUM 3307 -▁PLUNDER 3308 -▁PLUNGE 3309 -PO 3310 -▁PO 3311 -▁POCKET 3312 -▁POEM 3313 -▁POET 3314 -▁POETRY 3315 -▁POINT 3316 -▁POINTED 3317 -▁POISON 3318 -▁POLE 3319 -▁POLICE 3320 -▁POLICY 3321 -▁POLISH 3322 -▁POLITE 3323 -▁POLITICAL 3324 -▁POLITICS 3325 -▁POLLY 3326 -▁POND 3327 -▁PONY 3328 -▁POOL 3329 -▁POOR 3330 -▁POPE 3331 -▁POPULAR 3332 -▁POPULATION 3333 -▁PORCH 3334 -PORT 3335 -▁PORT 3336 -▁PORTHOS 3337 -▁PORTION 3338 -▁PORTRAIT 3339 -POSE 3340 -▁POSITION 3341 -▁POSITIVE 3342 -▁POSSESS 3343 -▁POSSESSED 3344 -▁POSSESSION 3345 -▁POSSIBILITY 3346 -▁POSSIBLE 3347 -▁POSSIBLY 3348 -▁POST 3349 -▁POT 3350 -▁POUND 3351 -▁POUNDS 3352 -▁POUR 3353 -▁POVERTY 3354 -▁POWDER 3355 -▁POWER 3356 -▁POWERFUL 3357 -▁POWERS 3358 -PP 3359 -PPING 3360 -▁PRA 3361 -▁PRACTICAL 3362 -▁PRACTICE 3363 -▁PRACTISE 3364 -▁PRAIRIE 3365 -▁PRAISE 3366 -▁PRAY 3367 -▁PRAYER 3368 -▁PRE 3369 
-▁PREACH 3370 -▁PRECAUTION 3371 -▁PRECEDE 3372 -▁PRECEDING 3373 -▁PRECIOUS 3374 -▁PRECISE 3375 -▁PRECISELY 3376 -▁PREFER 3377 -▁PREFERRED 3378 -▁PREJUDICE 3379 -▁PREPARATION 3380 -▁PREPARE 3381 -▁PREPARED 3382 -▁PREPARING 3383 -▁PRESENCE 3384 -▁PRESENT 3385 -▁PRESENTED 3386 -▁PRESENTLY 3387 -▁PRESERV 3388 -▁PRESIDENT 3389 -▁PRESS 3390 -▁PRESSED 3391 -▁PRESSURE 3392 -▁PRESUME 3393 -▁PRETEND 3394 -▁PRETTY 3395 -▁PREVAIL 3396 -▁PREVENT 3397 -▁PREVIOUS 3398 -▁PRI 3399 -▁PRICE 3400 -▁PRIDE 3401 -▁PRIEST 3402 -▁PRIMITIVE 3403 -▁PRINCE 3404 -▁PRINCESS 3405 -▁PRINCIPAL 3406 -▁PRINCIPLE 3407 -▁PRINT 3408 -▁PRISCILLA 3409 -▁PRISON 3410 -▁PRISONER 3411 -▁PRIVATE 3412 -▁PRIVILEGE 3413 -▁PRO 3414 -▁PROBABILITY 3415 -▁PROBABLE 3416 -▁PROBABLY 3417 -▁PROBLEM 3418 -▁PROCEED 3419 -▁PROCEEDED 3420 -▁PROCESS 3421 -▁PROCLAIM 3422 -▁PROCURE 3423 -▁PRODUCE 3424 -▁PRODUCED 3425 -▁PRODUCING 3426 -▁PRODUCT 3427 -▁PROFESS 3428 -▁PROFESSION 3429 -▁PROFESSOR 3430 -▁PROFIT 3431 -▁PROFOUND 3432 -▁PROGRESS 3433 -▁PROHIBIT 3434 -▁PROJECT 3435 -▁PROMINENT 3436 -▁PROMISE 3437 -▁PROMISED 3438 -▁PROMISING 3439 -▁PROMOTE 3440 -▁PROMPT 3441 -▁PRONOUNC 3442 -▁PROOF 3443 -▁PROP 3444 -▁PROPER 3445 -▁PROPERLY 3446 -▁PROPERTY 3447 -▁PROPHET 3448 -▁PROPORTION 3449 -▁PROPOSAL 3450 -▁PROPOSE 3451 -▁PROPOSED 3452 -▁PROPOSITION 3453 -▁PROPRIETOR 3454 -▁PROSPECT 3455 -▁PROSPERITY 3456 -▁PROTECT 3457 -▁PROTECTION 3458 -▁PROTEST 3459 -▁PROUD 3460 -▁PROVE 3461 -▁PROVED 3462 -▁PROVERB 3463 -▁PROVIDE 3464 -▁PROVIDED 3465 -▁PROVINCE 3466 -▁PROVISION 3467 -▁PROVOKE 3468 -▁PRUDENCE 3469 -▁PRUDENT 3470 -PS 3471 -▁PSMITH 3472 -▁PU 3473 -▁PUBLIC 3474 -▁PUBLISH 3475 -▁PUFF 3476 -▁PULL 3477 -▁PULLED 3478 -▁PULSE 3479 -▁PUNISH 3480 -▁PUNISHMENT 3481 -▁PUPIL 3482 -▁PUR 3483 -▁PURCHASE 3484 -▁PURE 3485 -▁PURPLE 3486 -▁PURPOSE 3487 -▁PURSE 3488 -▁PURSUE 3489 -▁PURSUED 3490 -▁PURSUIT 3491 -▁PUSH 3492 -▁PUSHED 3493 -▁PUT 3494 -▁PUTTING 3495 -Q 3496 -QUA 3497 -▁QUA 3498 -▁QUAINT 3499 -▁QUALITIES 3500 -▁QUALITY 3501 -▁QUANTITY 3502 -▁QUARREL 3503 -▁QUARTER 3504 -QUE 3505 -▁QUEEN 3506 -▁QUEER 3507 -▁QUESTION 3508 -▁QUESTIONS 3509 -QUI 3510 -▁QUI 3511 -▁QUICK 3512 -▁QUICKLY 3513 -▁QUIET 3514 -▁QUIETLY 3515 -▁QUITE 3516 -▁QUIVER 3517 -▁QUIXOTE 3518 -▁QUO 3519 -▁QUOTH 3520 -R 3521 -▁R 3522 -RA 3523 -▁RA 3524 -▁RABBIT 3525 -▁RACE 3526 -▁RACHEL 3527 -▁RADIANT 3528 -▁RAG 3529 -▁RAGE 3530 -▁RAIL 3531 -▁RAILROAD 3532 -▁RAILWAY 3533 -▁RAIN 3534 -▁RAINBOW 3535 -▁RAISE 3536 -▁RAISED 3537 -▁RAISING 3538 -▁RALPH 3539 -▁RAM 3540 -RAN 3541 -▁RAN 3542 -▁RANG 3543 -▁RANGE 3544 -▁RANK 3545 -▁RAOUL 3546 -▁RAPID 3547 -▁RAPIDLY 3548 -▁RARE 3549 -▁RASCAL 3550 -RATE 3551 -▁RATE 3552 -▁RATHER 3553 -▁RATIONAL 3554 -▁RATTL 3555 -▁RAVEN 3556 -▁RAY 3557 -RE 3558 -▁RE 3559 -▁REACH 3560 -▁REACHED 3561 -▁REACTION 3562 -▁READ 3563 -▁READER 3564 -▁READILY 3565 -▁READING 3566 -▁READY 3567 -▁REAL 3568 -▁REALI 3569 -▁REALITY 3570 -▁REALLY 3571 -▁REAR 3572 -▁REASON 3573 -▁REBECCA 3574 -▁REBEL 3575 -▁RECALL 3576 -▁RECEIVE 3577 -▁RECEIVED 3578 -▁RECEIVING 3579 -▁RECENT 3580 -▁RECEPTION 3581 -▁RECESS 3582 -▁RECIT 3583 -▁RECKLESS 3584 -▁RECKON 3585 -▁RECOGNI 3586 -▁RECOLLECT 3587 -▁RECOLLECTION 3588 -▁RECOMMEND 3589 -▁RECONCIL 3590 -▁RECORD 3591 -▁RECOVER 3592 -▁RECOVERED 3593 -RED 3594 -▁RED 3595 -▁REDUCED 3596 -▁REFER 3597 -▁REFERENCE 3598 -▁REFINED 3599 -▁REFLECT 3600 -▁REFLECTION 3601 -▁REFORM 3602 -▁REFRAIN 3603 -▁REFRESH 3604 -▁REFUGE 3605 -▁REFUSE 3606 -▁REFUSED 3607 -▁REGAIN 3608 -▁REGARD 3609 -▁REGARDED 3610 -▁REGIMENT 3611 -▁REGION 3612 -▁REGRET 3613 -▁REGULAR 3614 -▁REGULAT 3615 -▁REIGN 
3616 -▁REJECT 3617 -▁REJOICE 3618 -▁REJOICING 3619 -▁RELATE 3620 -▁RELATED 3621 -▁RELATION 3622 -▁RELATIVE 3623 -▁RELAX 3624 -▁RELEASE 3625 -▁RELI 3626 -▁RELIEF 3627 -▁RELIEVE 3628 -▁RELIGION 3629 -▁RELIGIOUS 3630 -▁RELUCTANT 3631 -▁REMAIN 3632 -▁REMAINED 3633 -▁REMARK 3634 -▁REMARKABLE 3635 -▁REMARKED 3636 -▁REMEDY 3637 -▁REMEMBER 3638 -▁REMEMBERED 3639 -▁REMEMBRANCE 3640 -▁REMIND 3641 -▁REMORSE 3642 -▁REMOTE 3643 -▁REMOVE 3644 -▁REMOVED 3645 -▁RENDER 3646 -▁RENDERED 3647 -▁RENEW 3648 -▁RENT 3649 -▁REP 3650 -▁REPAIR 3651 -▁REPEAT 3652 -▁REPEATED 3653 -▁REPENT 3654 -▁REPLIED 3655 -▁REPLY 3656 -▁REPORT 3657 -▁REPRESENT 3658 -▁REPRESENTATIVE 3659 -▁REPROACH 3660 -▁REPUBLIC 3661 -▁REPUTATION 3662 -▁REQUEST 3663 -▁REQUIRE 3664 -▁REQUIRED 3665 -▁RESCUE 3666 -▁RESEMBLANCE 3667 -▁RESEMBLE 3668 -▁RESERVE 3669 -▁RESIDENCE 3670 -▁RESIGN 3671 -▁RESIST 3672 -▁RESISTANCE 3673 -▁RESOLUTE 3674 -▁RESOLUTION 3675 -▁RESOLVED 3676 -▁RESORT 3677 -▁RESOURCE 3678 -▁RESPECT 3679 -▁RESPONSE 3680 -▁RESPONSIBILITY 3681 -▁RESPONSIBLE 3682 -RESS 3683 -▁REST 3684 -▁RESTLESS 3685 -▁RESTORE 3686 -▁RESTRAIN 3687 -▁RESULT 3688 -▁RESUMED 3689 -▁RETAIN 3690 -▁RETIRE 3691 -▁RETIRED 3692 -▁RETORTED 3693 -▁RETREAT 3694 -▁RETURN 3695 -▁RETURNED 3696 -▁RETURNING 3697 -▁REV 3698 -▁REVEAL 3699 -▁REVELATION 3700 -▁REVENGE 3701 -▁REVER 3702 -▁REVIEW 3703 -▁REVOLT 3704 -▁REVOLUTION 3705 -▁REWARD 3706 -RG 3707 -RI 3708 -▁RI 3709 -▁RIBBON 3710 -RIC 3711 -▁RICH 3712 -▁RICHARD 3713 -▁RICHMOND 3714 -RICK 3715 -▁RID 3716 -▁RIDE 3717 -RIDGE 3718 -▁RIDICULOUS 3719 -▁RIDING 3720 -RIES 3721 -▁RIFLE 3722 -RIGHT 3723 -▁RIGHT 3724 -▁RIGHTEOUS 3725 -▁RIGID 3726 -RIN 3727 -RING 3728 -▁RING 3729 -▁RIPE 3730 -RIS 3731 -▁RISE 3732 -▁RISING 3733 -▁RISK 3734 -▁RIVAL 3735 -▁RIVER 3736 -RK 3737 -RN 3738 -RO 3739 -▁RO 3740 -▁ROAD 3741 -▁ROAR 3742 -▁ROAST 3743 -▁ROB 3744 -▁ROBBER 3745 -▁ROBE 3746 -▁ROBERT 3747 -▁ROBIN 3748 -▁ROCK 3749 -▁ROCKS 3750 -▁RODE 3751 -▁ROLL 3752 -▁ROLLED 3753 -▁ROMAN 3754 -▁ROME 3755 -RON 3756 -▁ROOF 3757 -▁ROOM 3758 -▁ROOT 3759 -▁ROPE 3760 -▁ROSA 3761 -▁ROSE 3762 -▁ROUGH 3763 -▁ROUND 3764 -ROUS 3765 -▁ROUSED 3766 -▁ROUTE 3767 -ROW 3768 -▁ROW 3769 -▁ROYAL 3770 -RS 3771 -RT 3772 -RU 3773 -▁RU 3774 -▁RUB 3775 -▁RUBBED 3776 -▁RUBBING 3777 -▁RUDE 3778 -▁RUIN 3779 -▁RULE 3780 -▁RUM 3781 -▁RUN 3782 -▁RUNNING 3783 -▁RUSH 3784 -▁RUSHED 3785 -▁RUSSIA 3786 -▁RUSSIAN 3787 -▁RUTH 3788 -RY 3789 -S 3790 -▁S 3791 -▁SA 3792 -▁SACRED 3793 -▁SACRIFICE 3794 -▁SAD 3795 -▁SADDLE 3796 -▁SAFE 3797 -▁SAFETY 3798 -▁SAID 3799 -SAIL 3800 -▁SAIL 3801 -▁SAILOR 3802 -▁SAINT 3803 -▁SAKE 3804 -▁SAL 3805 -▁SALT 3806 -▁SALUTE 3807 -▁SAM 3808 -▁SAME 3809 -▁SAMUEL 3810 -▁SAN 3811 -▁SANCHO 3812 -▁SAND 3813 -▁SANG 3814 -▁SANK 3815 -▁SARAH 3816 -▁SAT 3817 -▁SATISFACTION 3818 -▁SATISFACTORY 3819 -▁SATISFIED 3820 -▁SATISFY 3821 -▁SATURDAY 3822 -▁SAUCE 3823 -▁SAVAGE 3824 -▁SAVE 3825 -▁SAVED 3826 -▁SAVING 3827 -▁SAW 3828 -▁SAY 3829 -▁SAYING 3830 -▁SAYS 3831 -▁SC 3832 -▁SCA 3833 -▁SCALE 3834 -▁SCANDAL 3835 -▁SCAR 3836 -▁SCARCE 3837 -▁SCARCELY 3838 -▁SCARECROW 3839 -▁SCARLET 3840 -▁SCATTERED 3841 -▁SCENE 3842 -▁SCENT 3843 -▁SCH 3844 -▁SCHEME 3845 -▁SCHOLAR 3846 -▁SCHOOL 3847 -▁SCIENCE 3848 -▁SCIENTIFIC 3849 -▁SCOLD 3850 -▁SCORE 3851 -▁SCORN 3852 -▁SCOTCH 3853 -▁SCOTLAND 3854 -▁SCOTT 3855 -▁SCOUNDREL 3856 -▁SCOUT 3857 -▁SCRAMBLE 3858 -▁SCRAP 3859 -▁SCRATCH 3860 -▁SCREAM 3861 -▁SCREEN 3862 -▁SCREW 3863 -▁SCROOGE 3864 -SE 3865 -▁SE 3866 -▁SEA 3867 -▁SEAL 3868 -▁SEARCH 3869 -▁SEASON 3870 -▁SEAT 3871 -▁SEATED 3872 -▁SECOND 3873 -▁SECRET 3874 -▁SECRETARY 3875 -▁SECTION 3876 
-▁SECURE 3877 -▁SECURITY 3878 -▁SEE 3879 -▁SEEING 3880 -▁SEEK 3881 -▁SEEM 3882 -▁SEEMED 3883 -▁SEEMS 3884 -▁SEEN 3885 -▁SEI 3886 -▁SELDOM 3887 -▁SELECT 3888 -▁SELF 3889 -▁SELFISH 3890 -▁SELL 3891 -▁SENATE 3892 -▁SENATOR 3893 -▁SEND 3894 -▁SENSATION 3895 -▁SENSE 3896 -▁SENSIBLE 3897 -▁SENSITIVE 3898 -▁SENT 3899 -▁SENTENCE 3900 -▁SENTIMENT 3901 -▁SEPARATE 3902 -▁SEPARATED 3903 -▁SEPTEMBER 3904 -▁SER 3905 -▁SERENE 3906 -▁SERGEANT 3907 -▁SERIES 3908 -▁SERIOUS 3909 -▁SERMON 3910 -▁SERPENT 3911 -▁SERVANT 3912 -▁SERVANTS 3913 -▁SERVE 3914 -▁SERVED 3915 -▁SERVICE 3916 -▁SERVING 3917 -▁SET 3918 -▁SETTING 3919 -▁SETTLE 3920 -▁SETTLED 3921 -▁SEVEN 3922 -▁SEVENTEEN 3923 -▁SEVENTY 3924 -▁SEVERAL 3925 -▁SEVERE 3926 -▁SEX 3927 -SH 3928 -▁SH 3929 -▁SHA 3930 -▁SHADE 3931 -▁SHADOW 3932 -▁SHAGGY 3933 -▁SHAKE 3934 -▁SHAKESPEARE 3935 -▁SHAKING 3936 -▁SHALL 3937 -▁SHAME 3938 -▁SHAPE 3939 -▁SHARE 3940 -▁SHARP 3941 -▁SHARPLY 3942 -▁SHAWL 3943 -▁SHE 3944 -▁SHEEP 3945 -▁SHELTER 3946 -▁SHEPHERD 3947 -▁SHERIFF 3948 -▁SHIELD 3949 -▁SHIFT 3950 -▁SHILLING 3951 -▁SHINE 3952 -▁SHINING 3953 -SHIP 3954 -▁SHIP 3955 -▁SHIPS 3956 -SHIRE 3957 -▁SHIRT 3958 -▁SHIVER 3959 -▁SHOCK 3960 -▁SHOE 3961 -▁SHOES 3962 -▁SHONE 3963 -▁SHOOK 3964 -▁SHOOT 3965 -▁SHOP 3966 -▁SHORE 3967 -▁SHORT 3968 -▁SHOT 3969 -▁SHOULD 3970 -▁SHOULDER 3971 -▁SHOULDERS 3972 -▁SHOUT 3973 -▁SHOUTED 3974 -▁SHOW 3975 -▁SHOWED 3976 -▁SHOWN 3977 -▁SHREWD 3978 -▁SHRIEK 3979 -▁SHRILL 3980 -▁SHRINK 3981 -▁SHUDDER 3982 -▁SHUT 3983 -▁SI 3984 -▁SICK 3985 -SIDE 3986 -▁SIDE 3987 -▁SIDES 3988 -▁SIEGE 3989 -▁SIGH 3990 -▁SIGHED 3991 -▁SIGHT 3992 -▁SIGN 3993 -▁SIGNAL 3994 -▁SIGNIFICANCE 3995 -▁SIGNIFICANT 3996 -▁SILENCE 3997 -▁SILENT 3998 -▁SILK 3999 -▁SILLY 4000 -▁SILVER 4001 -▁SIMILAR 4002 -▁SIMON 4003 -▁SIMPLE 4004 -▁SIMPLICITY 4005 -▁SIMPLY 4006 -▁SIN 4007 -▁SINCE 4008 -▁SING 4009 -▁SINGING 4010 -▁SINGLE 4011 -▁SINGULAR 4012 -▁SINK 4013 -▁SIR 4014 -▁SISTER 4015 -▁SIT 4016 -▁SITTING 4017 -▁SITUATED 4018 -▁SITUATION 4019 -▁SIX 4020 -▁SIXTEEN 4021 -▁SIXTY 4022 -▁SKETCH 4023 -▁SKI 4024 -▁SKILFUL 4025 -▁SKILL 4026 -▁SKIN 4027 -▁SKIRT 4028 -▁SKULL 4029 -▁SKY 4030 -▁SLAIN 4031 -▁SLAUGHTER 4032 -▁SLAVE 4033 -▁SLAVERY 4034 -▁SLAVES 4035 -▁SLEDGE 4036 -▁SLEEP 4037 -▁SLEEVE 4038 -▁SLENDER 4039 -▁SLEPT 4040 -▁SLEW 4041 -▁SLICE 4042 -▁SLID 4043 -▁SLIGHT 4044 -▁SLIGHTEST 4045 -▁SLIGHTLY 4046 -▁SLIM 4047 -▁SLIP 4048 -▁SLIPPED 4049 -▁SLO 4050 -▁SLOPE 4051 -▁SLOW 4052 -▁SLOWLY 4053 -▁SLUMBER 4054 -▁SMALL 4055 -▁SMART 4056 -▁SMASH 4057 -▁SMELL 4058 -▁SMILE 4059 -▁SMILED 4060 -▁SMILING 4061 -▁SMITH 4062 -▁SMOKE 4063 -▁SMOKING 4064 -▁SMOOTH 4065 -▁SMOT 4066 -▁SNAKE 4067 -▁SNAP 4068 -▁SNATCH 4069 -▁SNEER 4070 -▁SNOW 4071 -▁SO 4072 -▁SOCIAL 4073 -▁SOCIETY 4074 -▁SOFT 4075 -▁SOFTLY 4076 -▁SOIL 4077 -▁SOLD 4078 -▁SOLDIER 4079 -▁SOLDIERS 4080 -▁SOLEMN 4081 -▁SOLICIT 4082 -▁SOLID 4083 -▁SOLITARY 4084 -▁SOLITUDE 4085 -▁SOLOMON 4086 -▁SOLUTION 4087 -SOME 4088 -▁SOME 4089 -▁SOMEBODY 4090 -▁SOMEHOW 4091 -▁SOMEONE 4092 -▁SOMETHING 4093 -▁SOMETIMES 4094 -▁SOMEWHAT 4095 -▁SOMEWHERE 4096 -SON 4097 -▁SON 4098 -▁SONG 4099 -▁SOON 4100 -▁SOONER 4101 -▁SOOTH 4102 -▁SORROW 4103 -▁SORRY 4104 -▁SORT 4105 -▁SOUGHT 4106 -▁SOUL 4107 -▁SOUND 4108 -▁SOURCE 4109 -▁SOUTH 4110 -▁SOUTHERN 4111 -▁SOVEREIGN 4112 -▁SP 4113 -▁SPACE 4114 -▁SPAIN 4115 -▁SPAKE 4116 -▁SPANIARD 4117 -▁SPANISH 4118 -▁SPAR 4119 -▁SPARE 4120 -▁SPARK 4121 -▁SPEAK 4122 -▁SPEAKING 4123 -▁SPEAR 4124 -▁SPECIAL 4125 -▁SPECIES 4126 -▁SPECIMEN 4127 -▁SPECK 4128 -▁SPECTACLE 4129 -▁SPECTATOR 4130 -▁SPECULAT 4131 -▁SPEECH 4132 -▁SPEED 4133 -▁SPELL 4134 -▁SPEND 4135 
-▁SPENT 4136 -▁SPHERE 4137 -▁SPI 4138 -▁SPIN 4139 -▁SPIRIT 4140 -▁SPIRITS 4141 -▁SPIRITUAL 4142 -▁SPITE 4143 -▁SPLASH 4144 -▁SPLENDID 4145 -▁SPLENDOR 4146 -▁SPLIT 4147 -▁SPOIL 4148 -▁SPOKE 4149 -▁SPOKEN 4150 -▁SPOON 4151 -▁SPORT 4152 -▁SPOT 4153 -▁SPRANG 4154 -▁SPREAD 4155 -▁SPRING 4156 -▁SPRINKLE 4157 -▁SPUR 4158 -▁SQU 4159 -▁SQUARE 4160 -▁SQUEE 4161 -▁SQUIRE 4162 -▁SQUIRREL 4163 -ST 4164 -▁ST 4165 -▁STA 4166 -▁STABLE 4167 -▁STAFF 4168 -▁STAGE 4169 -▁STAGGER 4170 -▁STAIRCASE 4171 -▁STAIRS 4172 -▁STALK 4173 -▁STAMP 4174 -▁STAND 4175 -▁STANDARD 4176 -▁STANDING 4177 -▁STAR 4178 -▁STARED 4179 -▁STARS 4180 -▁START 4181 -▁STARTED 4182 -▁STARTLED 4183 -▁STATE 4184 -▁STATEMENT 4185 -▁STATES 4186 -▁STATION 4187 -▁STATUE 4188 -▁STAY 4189 -▁STE 4190 -STEAD 4191 -▁STEADILY 4192 -▁STEADY 4193 -▁STEAL 4194 -▁STEAM 4195 -▁STEEL 4196 -▁STEEP 4197 -▁STEP 4198 -▁STEPHEN 4199 -▁STEPPED 4200 -▁STEPS 4201 -STER 4202 -▁STERN 4203 -▁STICK 4204 -▁STIFF 4205 -▁STILL 4206 -▁STIR 4207 -▁STIRRED 4208 -▁STO 4209 -▁STOCK 4210 -▁STOLE 4211 -▁STOMACH 4212 -STONE 4213 -▁STONE 4214 -▁STONES 4215 -▁STOOD 4216 -▁STOOPED 4217 -▁STOP 4218 -▁STOPPED 4219 -▁STOPPING 4220 -▁STORE 4221 -▁STORIES 4222 -▁STORM 4223 -▁STORY 4224 -▁STOUT 4225 -STRA 4226 -▁STRAIGHT 4227 -▁STRAIN 4228 -▁STRAIT 4229 -▁STRANGE 4230 -▁STRANGER 4231 -▁STRAP 4232 -▁STRAT 4233 -▁STRAW 4234 -▁STRAY 4235 -▁STREAK 4236 -▁STREAM 4237 -▁STREET 4238 -▁STREETS 4239 -▁STRENGTH 4240 -▁STRETCH 4241 -▁STRETCHED 4242 -▁STREW 4243 -▁STRICKEN 4244 -▁STRICT 4245 -▁STRIKE 4246 -▁STRIKING 4247 -▁STRING 4248 -▁STRIP 4249 -▁STRO 4250 -▁STROKE 4251 -▁STRONG 4252 -▁STRUCK 4253 -▁STRUCTURE 4254 -▁STRUGGLE 4255 -▁STRUGGLING 4256 -▁STUCK 4257 -▁STUDENT 4258 -▁STUDIED 4259 -▁STUDIES 4260 -▁STUDIO 4261 -▁STUDY 4262 -▁STUFF 4263 -▁STUMBLE 4264 -▁STUMP 4265 -▁STUPID 4266 -▁STYLE 4267 -▁SU 4268 -▁SUB 4269 -▁SUBDUED 4270 -▁SUBJECT 4271 -▁SUBLIME 4272 -▁SUBMIT 4273 -▁SUBSEQUENT 4274 -▁SUBSTANCE 4275 -▁SUBSTANTIAL 4276 -▁SUBTLE 4277 -▁SUCCEED 4278 -▁SUCCEEDED 4279 -▁SUCCESS 4280 -▁SUCCESSFUL 4281 -▁SUCH 4282 -▁SUDDEN 4283 -▁SUDDENLY 4284 -▁SUFFER 4285 -▁SUFFERED 4286 -▁SUFFERING 4287 -▁SUFFICE 4288 -▁SUFFICIENT 4289 -▁SUFFICIENTLY 4290 -▁SUFFRAGE 4291 -▁SUGAR 4292 -▁SUGGEST 4293 -▁SUGGESTED 4294 -▁SUGGESTION 4295 -▁SUIT 4296 -▁SULLEN 4297 -▁SULTAN 4298 -▁SUM 4299 -▁SUMMER 4300 -▁SUMMIT 4301 -▁SUMMON 4302 -▁SUN 4303 -▁SUNDAY 4304 -▁SUNK 4305 -▁SUNLIGHT 4306 -▁SUNRISE 4307 -▁SUNSET 4308 -▁SUNSHINE 4309 -▁SUPER 4310 -▁SUPERINTEND 4311 -▁SUPERIOR 4312 -▁SUPPER 4313 -▁SUPPLIED 4314 -▁SUPPLIES 4315 -▁SUPPLY 4316 -▁SUPPORT 4317 -▁SUPPOSE 4318 -▁SUPPOSED 4319 -▁SUPPOSING 4320 -▁SUPPRESS 4321 -▁SUPREME 4322 -▁SUR 4323 -▁SURE 4324 -▁SURELY 4325 -▁SURFACE 4326 -▁SURGEON 4327 -▁SURPASS 4328 -▁SURPRISE 4329 -▁SURPRISED 4330 -▁SURPRISING 4331 -▁SURRENDER 4332 -▁SURROUNDED 4333 -▁SURROUNDING 4334 -▁SURVEY 4335 -▁SURVIV 4336 -▁SUSAN 4337 -▁SUSPECT 4338 -▁SUSPICION 4339 -▁SUSPICIOUS 4340 -▁SUSTAIN 4341 -▁SW 4342 -▁SWA 4343 -▁SWALLOW 4344 -▁SWARM 4345 -▁SWEAR 4346 -▁SWEAT 4347 -▁SWEEP 4348 -▁SWEET 4349 -▁SWELL 4350 -▁SWEPT 4351 -▁SWIFT 4352 -▁SWIM 4353 -▁SWIMMING 4354 -▁SWORD 4355 -▁SWORE 4356 -▁SWUNG 4357 -▁SY 4358 -▁SYLVIA 4359 -▁SYMBOL 4360 -▁SYMPATHETIC 4361 -▁SYMPATHI 4362 -▁SYMPATHY 4363 -▁SYMPTOM 4364 -▁SYSTEM 4365 -T 4366 -▁T 4367 -TA 4368 -▁TA 4369 -▁TABLE 4370 -▁TAIL 4371 -▁TAKE 4372 -▁TAKEN 4373 -▁TAKING 4374 -▁TALE 4375 -▁TALENT 4376 -▁TALK 4377 -▁TALKED 4378 -▁TALKING 4379 -▁TALL 4380 -TAN 4381 -▁TANG 4382 -▁TANK 4383 -▁TAP 4384 -▁TAR 4385 -▁TASK 4386 -▁TASTE 4387 -▁TAUGHT 4388 -▁TAX 4389 -TE 
4390 -▁TE 4391 -▁TEA 4392 -▁TEACH 4393 -▁TEACHER 4394 -▁TEAR 4395 -▁TEARS 4396 -TED 4397 -▁TEETH 4398 -▁TELEGRAPH 4399 -▁TELEPHONE 4400 -▁TELL 4401 -▁TELLING 4402 -▁TEMPER 4403 -▁TEMPERAMENT 4404 -▁TEMPERATURE 4405 -▁TEMPEST 4406 -▁TEMPLE 4407 -▁TEMPORARY 4408 -▁TEMPT 4409 -▁TEMPTATION 4410 -TEN 4411 -▁TEN 4412 -▁TENDENCY 4413 -▁TENDER 4414 -▁TENDERNESS 4415 -TER 4416 -TERIOR 4417 -▁TERM 4418 -▁TERMS 4419 -▁TERRACE 4420 -▁TERRIBLE 4421 -▁TERRIBLY 4422 -▁TERRIFIED 4423 -▁TERRITORY 4424 -▁TERROR 4425 -▁TEST 4426 -▁TESTIMONY 4427 -▁TEXT 4428 -TH 4429 -▁TH 4430 -▁THAN 4431 -▁THANK 4432 -▁THAT 4433 -THE 4434 -▁THE 4435 -▁THEATRE 4436 -▁THEIR 4437 -▁THEM 4438 -▁THEMSELVES 4439 -▁THEN 4440 -THER 4441 -▁THERE 4442 -▁THEREFORE 4443 -▁THEREUPON 4444 -▁THESE 4445 -▁THEY 4446 -▁THICK 4447 -▁THIEF 4448 -▁THIEVES 4449 -▁THIN 4450 -▁THING 4451 -▁THINGS 4452 -THINK 4453 -▁THINK 4454 -▁THINKING 4455 -▁THIRD 4456 -▁THIRST 4457 -▁THIRTEEN 4458 -▁THIRTY 4459 -▁THIS 4460 -▁THITHER 4461 -▁THOMAS 4462 -▁THORNTON 4463 -▁THOROUGH 4464 -▁THOROUGHLY 4465 -THORPE 4466 -▁THOSE 4467 -▁THOU 4468 -▁THOUGH 4469 -▁THOUGHT 4470 -▁THOUGHTFULLY 4471 -▁THOUGHTS 4472 -▁THOUSAND 4473 -▁THREAD 4474 -▁THREAT 4475 -▁THREATENED 4476 -▁THREATENING 4477 -▁THREE 4478 -▁THRESHOLD 4479 -▁THREW 4480 -▁THRILL 4481 -▁THRO 4482 -▁THROAT 4483 -▁THRONE 4484 -▁THRONG 4485 -▁THROUGH 4486 -▁THROUGHOUT 4487 -▁THROW 4488 -▁THROWING 4489 -▁THROWN 4490 -▁THRUST 4491 -▁THUMB 4492 -▁THUNDER 4493 -▁THUS 4494 -▁THY 4495 -▁THYSELF 4496 -TI 4497 -▁TI 4498 -TIC 4499 -▁TICKET 4500 -▁TIDE 4501 -▁TIDINGS 4502 -▁TIED 4503 -TIES 4504 -▁TIGHT 4505 -▁TILL 4506 -▁TIMBER 4507 -TIME 4508 -▁TIME 4509 -▁TIMES 4510 -▁TIMID 4511 -TIN 4512 -▁TIN 4513 -TING 4514 -▁TINY 4515 -TION 4516 -▁TIP 4517 -▁TIRED 4518 -▁TITLE 4519 -TO 4520 -▁TO 4521 -▁TOBACCO 4522 -▁TODAY 4523 -▁TOGETHER 4524 -▁TOLD 4525 -▁TOM 4526 -▁TOMB 4527 -▁TOMORROW 4528 -TON 4529 -▁TONE 4530 -▁TONGUE 4531 -▁TOO 4532 -▁TOOK 4533 -▁TOP 4534 -▁TORMENT 4535 -▁TORRENT 4536 -▁TORTURE 4537 -▁TOTAL 4538 -▁TOUCH 4539 -▁TOUCHED 4540 -▁TOWARD 4541 -▁TOWARDS 4542 -▁TOWER 4543 -▁TOWN 4544 -▁TRA 4545 -▁TRACE 4546 -▁TRACK 4547 -▁TRADE 4548 -▁TRADITION 4549 -▁TRAGEDY 4550 -▁TRAGIC 4551 -▁TRAIL 4552 -▁TRAIN 4553 -▁TRAITOR 4554 -▁TRAMP 4555 -▁TRANQUIL 4556 -▁TRANS 4557 -▁TRANSPORT 4558 -▁TRAP 4559 -▁TRAVEL 4560 -▁TRAVELLER 4561 -▁TRE 4562 -▁TREAD 4563 -▁TREASURE 4564 -▁TREAT 4565 -▁TREATED 4566 -▁TREATMENT 4567 -▁TREE 4568 -▁TREES 4569 -▁TREMBLE 4570 -▁TREMBLED 4571 -▁TREMBLING 4572 -▁TREMENDOUS 4573 -▁TRENCH 4574 -TRI 4575 -▁TRI 4576 -▁TRIAL 4577 -▁TRIBE 4578 -▁TRICK 4579 -▁TRIED 4580 -▁TRIFLE 4581 -▁TRIFLING 4582 -▁TRIP 4583 -▁TRISTRAM 4584 -▁TRIUMPH 4585 -▁TRIUMPHANT 4586 -TRO 4587 -▁TROOP 4588 -▁TROOPS 4589 -▁TROT 4590 -▁TROUBLE 4591 -▁TROUBLED 4592 -▁TROUSERS 4593 -▁TROUT 4594 -▁TRU 4595 -▁TRUE 4596 -▁TRULY 4597 -▁TRUMPET 4598 -▁TRUNK 4599 -▁TRUST 4600 -▁TRUTH 4601 -▁TRY 4602 -▁TRYING 4603 -TTE 4604 -TTERED 4605 -TTLE 4606 -▁TU 4607 -▁TUESDAY 4608 -▁TULLIVER 4609 -▁TUMBLE 4610 -▁TUMULT 4611 -TUR 4612 -▁TURKEY 4613 -▁TURN 4614 -▁TURNED 4615 -▁TURNING 4616 -▁TURTLE 4617 -▁TWAS 4618 -▁TWELVE 4619 -▁TWENTIETH 4620 -▁TWENTY 4621 -▁TWICE 4622 -▁TWILIGHT 4623 -▁TWIN 4624 -▁TWIST 4625 -▁TWO 4626 -TY 4627 -▁TYPE 4628 -▁TYRANT 4629 -U 4630 -UB 4631 -UC 4632 -UCH 4633 -UD 4634 -UG 4635 -UGH 4636 -▁UGLY 4637 -UL 4638 -ULATION 4639 -▁ULTIMATE 4640 -UM 4641 -▁UMBRELLA 4642 -UN 4643 -▁UN 4644 -▁UNABLE 4645 -▁UNC 4646 -▁UNCERTAIN 4647 -▁UNCLE 4648 -▁UNCOMFORTABLE 4649 -▁UNCOMMON 4650 -▁UNCONSCIOUS 4651 -UND 4652 -▁UND 4653 -▁UNDER 4654 
-▁UNDERNEATH 4655 -▁UNDERSTAND 4656 -▁UNDERSTANDING 4657 -▁UNDERSTOOD 4658 -▁UNDERTAKE 4659 -▁UNDERTAKING 4660 -▁UNDOUBTEDLY 4661 -▁UNEASINESS 4662 -▁UNEASY 4663 -▁UNEXPECTED 4664 -▁UNFORTUNATE 4665 -▁UNHAPPY 4666 -▁UNIFORM 4667 -▁UNION 4668 -▁UNITED 4669 -▁UNIVERSAL 4670 -▁UNIVERSE 4671 -▁UNIVERSITY 4672 -▁UNJUST 4673 -▁UNKNOWN 4674 -▁UNLESS 4675 -▁UNLIKE 4676 -▁UNNATURAL 4677 -▁UNNECESSARY 4678 -▁UNPLEASANT 4679 -▁UNRE 4680 -▁UNSEEN 4681 -▁UNTIL 4682 -▁UNTO 4683 -▁UNUSUAL 4684 -▁UNWILLING 4685 -▁UNWORTHY 4686 -UOUS 4687 -UP 4688 -▁UP 4689 -▁UPON 4690 -▁UPPER 4691 -▁UPSTAIRS 4692 -UR 4693 -URE 4694 -▁URGE 4695 -US 4696 -▁US 4697 -USE 4698 -▁USE 4699 -▁USED 4700 -▁USEFUL 4701 -▁USELESS 4702 -▁USUAL 4703 -▁USUALLY 4704 -UT 4705 -▁UTILI 4706 -▁UTMOST 4707 -▁UTTER 4708 -▁UTTERED 4709 -▁UTTERLY 4710 -UX 4711 -V 4712 -VA 4713 -▁VA 4714 -▁VACANT 4715 -▁VAGUE 4716 -▁VAIN 4717 -VAL 4718 -▁VAL 4719 -▁VALENTINE 4720 -▁VALJEAN 4721 -▁VALLEY 4722 -▁VALUABLE 4723 -▁VALUE 4724 -VAN 4725 -▁VAN 4726 -▁VANISHED 4727 -▁VARI 4728 -▁VARIETY 4729 -▁VARIOUS 4730 -▁VAST 4731 -▁VAULT 4732 -VE 4733 -▁VE 4734 -▁VEGETABLE 4735 -▁VEHICLE 4736 -▁VEIL 4737 -▁VELVET 4738 -▁VEN 4739 -▁VENGEANCE 4740 -▁VENTURE 4741 -▁VENTURED 4742 -VER 4743 -▁VER 4744 -▁VERSE 4745 -▁VERY 4746 -▁VESSEL 4747 -▁VEXED 4748 -VI 4749 -▁VI 4750 -▁VIBRAT 4751 -▁VICE 4752 -▁VICTIM 4753 -▁VICTOR 4754 -▁VICTORY 4755 -▁VIEW 4756 -▁VIGOROUS 4757 -▁VILLAGE 4758 -▁VILLAIN 4759 -VILLE 4760 -▁VILLEFORT 4761 -▁VINE 4762 -▁VIOLENCE 4763 -▁VIOLENT 4764 -▁VIOLET 4765 -▁VIRGIN 4766 -▁VIRGINIA 4767 -▁VIRTUE 4768 -▁VIRTUOUS 4769 -▁VISIBLE 4770 -▁VISION 4771 -▁VISIT 4772 -▁VISITOR 4773 -▁VITAL 4774 -▁VIVID 4775 -VO 4776 -▁VO 4777 -VOCATION 4778 -▁VOICE 4779 -▁VOL 4780 -▁VOLUME 4781 -▁VOLUNTEER 4782 -▁VOTE 4783 -▁VOW 4784 -▁VOYAGE 4785 -▁VULGAR 4786 -W 4787 -▁W 4788 -WA 4789 -▁WA 4790 -▁WAG 4791 -▁WAGON 4792 -▁WAIST 4793 -▁WAISTCOAT 4794 -▁WAIT 4795 -▁WAITED 4796 -▁WAITING 4797 -▁WAKE 4798 -▁WAL 4799 -▁WALK 4800 -▁WALKED 4801 -▁WALKING 4802 -▁WALL 4803 -▁WALLS 4804 -▁WALTER 4805 -▁WANDER 4806 -▁WANDERING 4807 -▁WANT 4808 -▁WANTED 4809 -WAR 4810 -▁WAR 4811 -WARD 4812 -▁WARM 4813 -▁WARN 4814 -▁WARNING 4815 -▁WARRANT 4816 -▁WARRIOR 4817 -▁WAS 4818 -▁WASH 4819 -▁WASHINGTON 4820 -▁WATCH 4821 -▁WATCHED 4822 -▁WATCHING 4823 -▁WATER 4824 -▁WAVE 4825 -▁WAVES 4826 -▁WAVING 4827 -▁WAX 4828 -WAY 4829 -▁WAY 4830 -▁WAYS 4831 -WE 4832 -▁WE 4833 -▁WEAK 4834 -▁WEAKNESS 4835 -▁WEALTH 4836 -▁WEAPON 4837 -▁WEAR 4838 -▁WEARY 4839 -▁WEATHER 4840 -▁WEDDING 4841 -▁WEEK 4842 -▁WEEKS 4843 -▁WEEP 4844 -▁WEIGH 4845 -▁WEIGHT 4846 -▁WELCOME 4847 -▁WELFARE 4848 -WELL 4849 -▁WELL 4850 -▁WENT 4851 -▁WEPT 4852 -▁WERE 4853 -▁WEST 4854 -▁WESTERN 4855 -▁WH 4856 -▁WHALE 4857 -▁WHAT 4858 -▁WHATEVER 4859 -▁WHEAT 4860 -▁WHEEL 4861 -▁WHEN 4862 -▁WHENCE 4863 -▁WHERE 4864 -▁WHEREFORE 4865 -▁WHEREUPON 4866 -▁WHETHER 4867 -▁WHI 4868 -▁WHICH 4869 -▁WHILE 4870 -▁WHILST 4871 -▁WHIP 4872 -▁WHIRL 4873 -▁WHISK 4874 -▁WHISPER 4875 -▁WHISPERED 4876 -▁WHISTLE 4877 -▁WHITE 4878 -▁WHITHER 4879 -▁WHO 4880 -▁WHOLE 4881 -▁WHOLLY 4882 -▁WHOM 4883 -▁WHOSE 4884 -▁WHY 4885 -WI 4886 -▁WI 4887 -WICK 4888 -▁WICKED 4889 -▁WIDE 4890 -▁WIDOW 4891 -▁WIFE 4892 -▁WILD 4893 -▁WILDERNESS 4894 -▁WILL 4895 -▁WILLIAM 4896 -▁WILLING 4897 -▁WILSON 4898 -▁WILT 4899 -WIN 4900 -▁WIN 4901 -▁WIND 4902 -▁WINDOW 4903 -▁WINDOWS 4904 -▁WINE 4905 -▁WINGS 4906 -▁WINTER 4907 -▁WIP 4908 -▁WIRE 4909 -▁WISDOM 4910 -▁WISE 4911 -▁WISH 4912 -▁WISHED 4913 -▁WISHES 4914 -▁WIT 4915 -▁WITCH 4916 -▁WITH 4917 -▁WITHDRAW 4918 -▁WITHDREW 4919 -▁WITHIN 4920 -▁WITHOUT 
4921 -▁WITNESS 4922 -▁WIVES 4923 -WN 4924 -▁WOE 4925 -▁WOKE 4926 -▁WOLF 4927 -▁WOLVES 4928 -▁WOMAN 4929 -▁WOMEN 4930 -▁WON 4931 -▁WONDER 4932 -▁WONDERED 4933 -▁WONDERFUL 4934 -▁WONDERING 4935 -WOOD 4936 -▁WOOD 4937 -▁WOODEN 4938 -▁WOODS 4939 -▁WORD 4940 -▁WORDS 4941 -▁WORE 4942 -WORK 4943 -▁WORK 4944 -▁WORKED 4945 -▁WORKING 4946 -▁WORLD 4947 -▁WORM 4948 -▁WORN 4949 -▁WORRIED 4950 -▁WORRY 4951 -▁WORSE 4952 -▁WORSHIP 4953 -▁WORST 4954 -WORTH 4955 -▁WORTH 4956 -▁WORTHY 4957 -▁WOULD 4958 -▁WOULDN 4959 -▁WOUND 4960 -▁WOUNDED 4961 -▁WRAP 4962 -▁WRAPPED 4963 -▁WRATH 4964 -▁WRECK 4965 -▁WREN 4966 -▁WRETCH 4967 -▁WRETCHED 4968 -▁WRINKL 4969 -▁WRIST 4970 -▁WRITE 4971 -▁WRITER 4972 -▁WRITING 4973 -▁WRITTEN 4974 -▁WRONG 4975 -▁WROTE 4976 -▁WROUGHT 4977 -X 4978 -Y 4979 -▁YA 4980 -▁YARD 4981 -▁YE 4982 -▁YEAR 4983 -▁YEARS 4984 -▁YELLOW 4985 -▁YES 4986 -▁YESTERDAY 4987 -▁YET 4988 -▁YIELD 4989 -▁YO 4990 -▁YONDER 4991 -▁YORK 4992 -▁YOU 4993 -▁YOUNG 4994 -▁YOUR 4995 -▁YOURSELF 4996 -▁YOURSELVES 4997 -▁YOUTH 4998 -Z 4999 -ZZ 5000 - 5001 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/test/resources/non-linguistic-symbols.invalid b/models/audio/speech_recognition/conformer/igie/wenet/test/resources/non-linguistic-symbols.invalid deleted file mode 100644 index 131d3ff322e80fdfca92e9df9dd1e7492545ff5a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/test/resources/non-linguistic-symbols.invalid +++ /dev/null @@ -1,4 +0,0 @@ -#1 -<> -{{BBB}} -[[ccc]] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/test/resources/non-linguistic-symbols.valid b/models/audio/speech_recognition/conformer/igie/wenet/test/resources/non-linguistic-symbols.valid deleted file mode 100644 index 307b9966d76ad278478d0ef74c4d3fa4c152a99b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/test/resources/non-linguistic-symbols.valid +++ /dev/null @@ -1,6 +0,0 @@ -{~!@#$%^&*()_+`1234567890-=[]|\\:;"'<>,./?} -[~!@#$%^&*()_+`1234567890-={}|\\:;"'<>,./?] 
-<~!@#$%^&*()_+`1234567890-={}|\\:;"'[],./?> -{qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM} -[qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM] - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/test/test_file_utils.py b/models/audio/speech_recognition/conformer/igie/wenet/test/test_file_utils.py deleted file mode 100644 index cc38ae3bc0084ae13fe16f5b570c473e0f4fec1d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/test/test_file_utils.py +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright [2021-12-04] - -import pytest - -from wenet.utils.file_utils import read_non_lang_symbols - - -@pytest.mark.parametrize( - "non_lang_symbol_table_path", - [ - "test/resources/non-linguistic-symbols.valid", - "test/resources/non-linguistic-symbols.invalid" - ] -) -def test_read_non_lang_symbols(non_lang_symbol_table_path): - path = non_lang_symbol_table_path - try: - syms = read_non_lang_symbols(path) - assert syms[0] == "{~!@#$%^&*()_+`1234567890-=[]|\\\\:;\"'<>,./?}" - assert syms[1] == "[~!@#$%^&*()_+`1234567890-={}|\\\\:;\"'<>,./?]" - assert syms[2] == "<~!@#$%^&*()_+`1234567890-={}|\\\\:;\"'[],./?>" - assert syms[3] == "{qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM}" - assert syms[4] == "[qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM]" - assert syms[5] == "" - except Exception as e: - assert path == "test/resources/non-linguistic-symbols.invalid" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/test/test_tokenize.py b/models/audio/speech_recognition/conformer/igie/wenet/test/test_tokenize.py deleted file mode 100644 index 157d79a372bd079e0c538fbd082cf34985840a56..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/test/test_tokenize.py +++ /dev/null @@ -1,126 +0,0 @@ -import pytest - -import wenet.dataset.processor as processor - -@pytest.mark.parametrize( - "symbol_table_path", - [ - "test/resources/librispeech.words.txt", - "test/resources/aishell2.words.txt" - ] -) -def test_tokenize(symbol_table_path): - txts = [ - {"txt": "震东好帅"}, - {"txt": " 吴迪也好帅 "}, - {"txt": "binbin is also handsome"}, - {"txt": " life is short i use wenet "}, - {"txt": "超哥 is the most handsome 吧"}, - {"txt": " 人生苦短i use wenet "}, - {"txt": "人生苦短I USE WENET"}, - {"txt": "zhendong ist so schön"}, - {"txt": " zhendong ist so schön "}, - {"txt": "It's okay"} - ] - if symbol_table_path == "test/resources/librispeech.words.txt": - bpe_model = "test/resources/librispeech.train_960_unigram5000.bpemodel" - refs = [ - {"tokens": ['震', '东', '好', '帅'], - "label": [1, 1, 1, 1]}, - {"tokens": ['吴', '迪', '也', '好', '帅'], - "label": [1, 1, 1, 1, 1]}, - {"tokens": ['▁B', 'IN', 'B', 'IN', '▁IS', '▁ALSO', "▁HANDSOME"], - "label": [347, 2216, 346, 2216, 2332, 143, 1990]}, - {"tokens": ['▁LIFE', '▁IS', '▁SHORT', '▁I', '▁USE', '▁WE', - 'NE', 'T'], - "label": [2568, 2332, 3968, 2152, 4699, 4833, 2926, 4366]}, - {"tokens": ['超', '哥', '▁IS', '▁THE', '▁MOST', '▁HANDSOME', '吧'], - "label": [1, 1, 2332, 4435, 2860, 1990, 1]}, - {"tokens": ['人', '生', '苦', '短', '▁I', '▁USE', '▁WE', 'NE', 'T'], - "label": [1, 1, 1, 1, 2152, 4699, 4833, 2926, 4366]}, - {"tokens": ['人', '生', '苦', '短', '▁I', '▁USE', '▁WE', 'NE', 'T'], - "label": [1, 1, 1, 1, 2152, 4699, 4833, 2926, 4366]}, - {"tokens": ['▁', 'Z', 'HEN', 'DO', 'NG', '▁IS', 'T', '▁SO', '▁SCH', - 'Ö', 'N'], - "label": [3, 4999, 2048, 1248, 2960, 2332, 4366, 4072, 3844, - 1, 2901]}, - {"tokens": ['▁', 'Z', 'HEN', 
'DO', 'NG', '▁IS', 'T', '▁SO', '▁SCH', - 'Ö', 'N'], - "label": [3, 4999, 2048, 1248, 2960, 2332, 4366, 4072, 3844, - 1, 2901]}, - {"tokens": ['▁IT', "'", 'S', '▁O', 'KA', 'Y'], - "label": [2344, 2, 3790, 3010, 2418, 4979]} - ] - else: - bpe_model = None - refs = [ - {"tokens": ['震', '东', '好', '帅'], - "label": [4932, 80, 1059, 1375]}, - {"tokens": ['吴', '迪', '也', '好', '帅'], - "label": [656, 4540, 117, 1059, 1375]}, - {"tokens": ['b', 'i', 'n', 'b', 'i', 'n', '▁', 'i', 's', '▁', - 'a', 'l', 's', 'o', '▁', 'h', 'a', 'n', 'd', 's', - 'o', 'm', 'e'], - "label": [9, 23, 33, 9, 23, 33, 1, 23, 43, 1, 7, 29, 43, 35, - 1, 21, 7, 33, 13, 43, 35, 31, 15]}, - {"tokens": ['l', 'i', 'f', 'e', '▁', 'i', 's', '▁', 's', 'h', - 'o', 'r', 't', '▁', 'i', '▁', 'u', 's', 'e', '▁', - 'w', 'e', 'n', 'e', 't'], - "label": [29, 23, 17, 15, 1, 23, 43, 1, 43, 21, 35, 41, 46, - 1, 23, 1, 48, 43, 15, 1, 52, 15, 33, 15, 46]}, - {"tokens": ['超', '哥', '▁', 'i', 's', '▁', 't', 'h', 'e', '▁', - 'm', 'o', 's', 't', '▁', 'h', 'a', 'n', 'd', 's', 'o', - 'm', 'e', '▁', '吧'], - "label": [4395, 736, 1, 23, 43, 1, 46, 21, 15, 1, 31, 35, 43, 46, - 1, 21, 7, 33, 13, 43, 35, 31, 15, 1, 647]}, - {"tokens": ['人', '生', '苦', '短', 'i', '▁', 'u', 's', 'e', '▁', - 'w', 'e', 'n', 'e', 't'], - "label": [155, 2980, 3833, 3178, 23, 1, 48, 43, 15, 1, 52, 15, 33, - 15, 46]}, - {"tokens": ['人', '生', '苦', '短', 'I', '▁', 'U', 'S', 'E', '▁', - 'W', 'E', 'N', 'E', 'T'], - "label": [155, 2980, 3833, 3178, 24, 1, 49, 44, 16, 1, 53, 16, 34, - 16, 47]}, - {"tokens": ['z', 'h', 'e', 'n', 'd', 'o', 'n', 'g', '▁', 'i', 's', - 't', '▁', 's', 'o', '▁', 's', 'c', 'h', 'ö', 'n'], - "label": [58, 21, 15, 33, 13, 35, 33, 19, 1, 23, 43, 46, 1, 43, - 35, 1, 43, 11, 21, 1, 33]}, - {"tokens": ['z', 'h', 'e', 'n', 'd', 'o', 'n', 'g', '▁', 'i', 's', - 't', '▁', 's', 'o', '▁', 's', 'c', 'h', 'ö', 'n'], - "label": [58, 21, 15, 33, 13, 35, 33, 19, 1, 23, 43, 46, 1, 43, - 35, 1, 43, 11, 21, 1, 33]}, - {"tokens": ['I', 't', "'", 's', '▁', 'o', 'k', 'a', 'y'], - "label": [24, 46, 2, 43, 1, 35, 27, 7, 56]} - ] - symbol_table = {} - with open(symbol_table_path, 'r') as f: - lines = f.readlines() - for l in lines: - l = l.strip().split() - symbol_table[l[0]] = int(l[1]) - outs = processor.tokenize( - txts, symbol_table, bpe_model, split_with_space=False - ) - for (hyp, ref) in zip(outs, refs): - assert(len(hyp["tokens"]) == len(ref["tokens"])) - assert(all([h == r for h, r in zip(hyp["tokens"], ref["tokens"])])) - assert(len(hyp["label"]) == len(ref["label"])) - assert(all([h == r for h, r in zip(hyp["label"], ref["label"])])) - -@pytest.mark.parametrize("use_pbe_model", [True, False]) -def test_non_lang_symbol_tokenize(use_pbe_model): - data = [{"txt": "我是{NOISE}"}] - symbol_table = {"我": 1, "是": 2, "{NOISE}": 3} - - if use_pbe_model: - bpe_model = "test/resources/librispeech.train_960_unigram5000.bpemodel" - - sample = next(processor.tokenize(data, symbol_table, bpe_model, - non_lang_syms=["{NOISE}"])) - - assert sample["tokens"] == ["我", "是", "{NOISE}"] - else: - sample = next(processor.tokenize(data, symbol_table, - non_lang_syms=["{NOISE}"])) - - assert sample["tokens"] == ["我", "是", "{NOISE}"] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/alignment.sh b/models/audio/speech_recognition/conformer/igie/wenet/tools/alignment.sh deleted file mode 100644 index 64d860bb61761cadca750c9baf91eddb49e56728..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/alignment.sh +++ /dev/null @@ -1,49 +0,0 @@ 
-#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -stage=0 # start from 0 if you need to start from data preparation -stop_stage=0 - -nj=16 -feat_dir=raw_wav -dict=data/dict/lang_char.txt - -dir=exp/ -config=$dir/train.yaml -checkpoint= -checkpoint=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/avg_20.pt -config=/home/diwu/github/latest/wenet/examples/aishell/s0/exp/transformer/train.yaml -set= -ali_format=$feat_dir/$set/format.data -ali_format=format.data -ali_result=$dir/ali - -. tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - nj=32 - # Prepare required data for ctc alignment - echo "Prepare data, prepare required format" - for x in $set; do - tools/format_data.sh --nj ${nj} \ - --feat-type wav --feat $feat_dir/$x/wav.scp \ - $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp - - done -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Test model, please specify the model you want to use by --checkpoint - python wenet/bin/alignment_deprecated.py --gpu -1 \ - --config $config \ - --input_file $ali_format \ - --checkpoint $checkpoint \ - --batch_size 1 \ - --dict $dict \ - --result_file $ali_result \ - -fi - - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/analyze_dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/analyze_dataset.py deleted file mode 100644 index d4373b065c301972fe0164b6df3591166000acfc..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/analyze_dataset.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Horizon Inc. (authors: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Analyze Dataset, Duration/TextLength/Speed etc. - -Usage: -. 
./path.sh && python3 tools/analyze_dataset.py \ - --data_type "shard" \ - --data_list data/test/data.list \ - --output_dir exp/analyze_test \ - --num_thread 32 -""" - -import os -import json -import math -import time -import numpy -import logging -import librosa -import tarfile -import argparse -import torchaudio -import multiprocessing - -from wenet.utils.file_utils import read_lists -from wenet.dataset.processor import AUDIO_FORMAT_SETS - - -def get_args(): - parser = argparse.ArgumentParser(description='Analyze dataset') - parser.add_argument('--data_type', - default='wav_scp', - choices=['wav_scp', 'raw', 'shard'], - help='dataset type') - parser.add_argument('--output_dir', type=str, - default="exp", help='write info to output dir') - parser.add_argument('--data_list', default=None, - help='used in raw/shard mode') - parser.add_argument('--wav_scp', default=None, - help='used in wav_scp mode') - parser.add_argument('--text', default=None, - help='used in wav_scp mode') - parser.add_argument('--num_thread', type=int, - default=4, help='number of threads') - args = parser.parse_args() - print(args) - return args - - -def analyze(datas, output_file, thread_id): - with open(output_file, "w", encoding='utf8') as f: - for i, data in enumerate(datas): - if type(data['wav']) is numpy.ndarray: - y, sample_rate = data['wav'], data['sample_rate'] - data['wav'] = "None" # NOTE(xcsong): Do not save wav. - elif type(data['wav'] is str): - y, sample_rate = librosa.load(data['wav'], sr=16000) - data['dur'] = len(y) / sample_rate - data['txt_length'] = len(data['txt']) - data['speed'] = data['txt_length'] / data['dur'] - # Trim the beginning and ending silence - _, index = librosa.effects.trim(y, top_db=30) - data['leading_sil'] = librosa.get_duration( - y=y[:index[0]], sr=16000) * 1000 if index[0] > 0 else 0 - data['trailing_sil'] = librosa.get_duration( - y=y[index[1]:], sr=16000) * 1000 if index[1] < len(y) else 0 - data_str = json.dumps(data, ensure_ascii=False) - f.write("{}\n".format(data_str)) - if thread_id == 0 and i % 100 == 0: - logging.info("\tThread-{}: processed {}/{}".format( - thread_id, i, len(datas))) - - -def read_tar(file): - try: - with tarfile.open(fileobj=open(file, "rb"), mode="r|*") as stream: - prev_prefix = None - data = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - data['key'] = prev_prefix - if valid: - yield data - data = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - data['txt'] = file_obj.read().decode( - 'utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load( - file_obj) - # single channel - data['wav'] = waveform.numpy()[0, :] - data['sample_rate'] = sample_rate - else: - data[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning( - 'error: {} when parse {}'.format(ex, name)) - prev_prefix = prefix - # The last data in tar - if prev_prefix is not None: - data['key'] = prev_prefix - yield data - except Exception as ex: - logging.warning( - 'tar_file error: {} when processing {}'.format(ex, file)) - - -def main(): - start_time = time.time() - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.makedirs(args.output_dir, exist_ok=True) - os.makedirs(args.output_dir + "/partition", exist_ok=True) - datas = [[] for i in 
range(args.num_thread)] - - logging.info("Stage-1: Loading data.list OR wav.scp...") - if args.data_type == "shard": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - total = 0 - for line in lists: - for data in read_tar(line): - datas[total % args.num_thread].append(data) - total = total + 1 - elif args.data_type == "raw": - assert args.data_list is not None - lists = read_lists(args.data_list) - # partition - for i, line in enumerate(lists): - data = json.loads(line) - datas[i % args.num_thread].append(data) - elif args.data_type == "wav_scp": - assert args.wav_scp is not None - assert args.text is not None - wavs, texts = {}, {} - # wavs - for line in read_lists(args.wav_scp): - line = line.strip().split() - wavs[line[0]] = line[1] - # texts - for line in read_lists(args.text): - line = line.strip().split(maxsplit=1) - texts[line[0]] = line[1] - sorted(wavs) - sorted(texts) - # partition - for i, (key1, key2) in enumerate(zip(wavs, texts)): - assert key1 == key2 - datas[i % args.num_thread].append( - {'key': key1, "wav": wavs[key1], "txt": texts[key1]} - ) - - logging.info("Stage-2: Start Analyze") - # threads - pool = multiprocessing.Pool(processes=args.num_thread) - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - pool.apply_async(analyze, (datas[i], output_file, i)) - pool.close() - pool.join() - - logging.info("Stage-3: Sort and Write Result") - datas = [] - for i in range(args.num_thread): - output_file = os.path.join( - args.output_dir, "partition", "part-{}".format(i)) - with open(output_file, "r", encoding='utf8') as f: - for line in f.readlines(): - data = json.loads(line) - datas.append(data) - total_dur = sum([x['dur'] for x in datas]) - total_len = sum([x['txt_length'] for x in datas]) - total_leading_sil = sum([x['leading_sil'] for x in datas]) - total_trailing_sil = sum([x['trailing_sil'] for x in datas]) - num_datas = len(datas) - names = ['key', 'dur', 'txt_length', 'speed', - 'leading_sil', 'trailing_sil'] - units = ['', 's', '', 'char/s', 'ms', 'ms'] - avgs = [0, total_dur / num_datas, total_len / num_datas, - total_len / total_dur, total_leading_sil / num_datas, - total_trailing_sil / num_datas] - stds = [0, sum([(x['dur'] - avgs[1])**2 for x in datas]), - sum([(x['txt_length'] - avgs[2])**2 for x in datas]), - sum([(x['txt_length'] / x['dur'] - avgs[3])**2 for x in datas]), - sum([(x['leading_sil'] - avgs[4])**2 for x in datas]), - sum([(x['trailing_sil'] - avgs[5])**2 for x in datas])] - stds = [math.sqrt(x / num_datas) for x in stds] - parts = ['max', 'P99', 'P75', 'P50', 'P25', 'min'] - index = [num_datas - 1, int(num_datas * 0.99), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - - with open(args.output_dir + "/analyze_result_brief", - "w", encoding='utf8') as f: - for i, (name, unit, avg, std) in enumerate( - zip(names, units, avgs, stds)): - if name == 'key': - continue - f.write("==================\n") - - datas.sort(key=lambda x: x[name]) - for p, j in zip(parts, index): - f.write("{} {}: {:.3f} {} (wav_id: {})\n".format( - p, name, datas[j][name], unit, datas[j]['key'])) - f.write("avg {}: {:.3f} {}\n".format( - name, avg, unit)) - f.write("std {}: {:.3f}\n".format( - name, std)) - os.system("cat {}".format(args.output_dir + "/analyze_result_brief")) - - datas.sort(key=lambda x: x['dur']) - with open(args.output_dir + "/analyze_result", "w", encoding='utf8') as f: - for data in datas: - f.write("{}\n".format(json.dumps(data, 
ensure_ascii=False))) - - end_time = time.time() - logging.info("Time Cost: {:.3f}s".format(end_time - start_time)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/cmvn_kaldi2json.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/cmvn_kaldi2json.py deleted file mode 100644 index 9966046c95a9d50438c4857b785cb7985182e376..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/cmvn_kaldi2json.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import logging -import sys -import json - -def kaldi2json(kaldi_cmvn_file): - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - cmvn_info = {'mean_stat:' : means, - 'var_stat' : variance, - 'frame_num' : count} - return cmvn_info - -if __name__ == '__main__': - with open(sys.argv[2], 'w') as fout: - cmvn = kaldi2json(sys.argv[1]) - fout.write(json.dumps(cmvn)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/combine_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/tools/combine_data.sh deleted file mode 100644 index 8a56c43f1a2a238d78270f94f3d22f1af540e912..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/combine_data.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2014 David Snyder - -# This script combines the data from multiple source directories into -# a single destination directory. - -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information -# about what these directories contain. - -# Begin configuration section. -extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." -skip_fix=false # skip the fix_data_dir.sh in the end -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi - -if [ $# -lt 2 ]; then - echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." - echo "Note, files that don't appear in all source dirs will not be combined," - echo "with the exception of utt2uniq and segments, which are created where necessary." - exit 1 -fi - -dest=$1; -shift; - -first_src=$1; - -rm -r $dest 2>/dev/null -mkdir -p $dest; - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/utt2spk ]; then - echo "$0: no such file $dir/utt2spk" - exit 1; - fi -done - -# Check that frame_shift are compatible, where present together with features. -dir_with_frame_shift= -for dir in $*; do - if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then - if [[ $dir_with_frame_shift ]] && - ! cmp -s $dir_with_frame_shift/frame_shift $dir/frame_shift; then - echo "$0:error: different frame_shift in directories $dir and " \ - "$dir_with_frame_shift. 
Cannot combine features." - exit 1; - fi - dir_with_frame_shift=$dir - fi -done - -# W.r.t. utt2uniq file the script has different behavior compared to other files -# it is not compulsary for it to exist in src directories, but if it exists in -# even one it should exist in all. We will create the files where necessary -has_utt2uniq=false -for in_dir in $*; do - if [ -f $in_dir/utt2uniq ]; then - has_utt2uniq=true - break - fi -done - -if $has_utt2uniq; then - # we are going to create an utt2uniq file in the destdir - for in_dir in $*; do - if [ ! -f $in_dir/utt2uniq ]; then - # we assume that utt2uniq is a one to one mapping - cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' - else - cat $in_dir/utt2uniq - fi - done | sort -k1 > $dest/utt2uniq - echo "$0: combined utt2uniq" -else - echo "$0 [info]: not combining utt2uniq as it does not exist" -fi -# some of the old scripts might provide utt2uniq as an extrafile, so just remove it -extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") - -# segments are treated similarly to utt2uniq. If it exists in some, but not all -# src directories, then we generate segments where necessary. -has_segments=false -for in_dir in $*; do - if [ -f $in_dir/segments ]; then - has_segments=true - break - fi -done - -if $has_segments; then - for in_dir in $*; do - if [ ! -f $in_dir/segments ]; then - echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 - utils/data/get_segments_for_data.sh $in_dir - else - cat $in_dir/segments - fi - done | sort -k1 > $dest/segments - echo "$0: combined segments" -else - echo "$0 [info]: not combining segments as it does not exist" -fi - -for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do - exists_somewhere=false - absent_somewhere=false - for d in $*; do - if [ -f $d/$file ]; then - exists_somewhere=true - else - absent_somewhere=true - fi - done - - if ! $absent_somewhere; then - set -o pipefail - ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; - set +o pipefail - echo "$0: combined $file" - else - if ! $exists_somewhere; then - echo "$0 [info]: not combining $file as it does not exist" - else - echo "$0 [info]: **not combining $file as it does not exist everywhere**" - fi - fi -done - -tools/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt - -if [[ $dir_with_frame_shift ]]; then - cp $dir_with_frame_shift/frame_shift $dest -fi - -if ! 
$skip_fix ; then - tools/fix_data_dir.sh $dest || exit 1; -fi - -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/compute-cer.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/compute-cer.py deleted file mode 100644 index a0a8f8fe1f59251c5d8fefeb62ef469276fc6063..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/compute-cer.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -import sys -import unicodedata -import codecs - -remove_tag = True -spacelist = [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] - -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - # https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. - sep = ' ' - if char == '<': - sep = '>' - j = i + 1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c == sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: - return '' - chars = [] - i = 0 - T = len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - if x.isalnum(): - for k in x: - new_sentence.append(k) - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, - 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i - 1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j - 1]['dist'] 
+ self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, - 'ins': 0, 'del': 0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i={i} , j={j} , \ - error={error}'. 
- format(i=i, j=j, error=self.space[i][j]['error'])) - return result - - def overall(self) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def cluster(self, data) : - result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [unicodedata.name(char) for char in word] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names) - 1) : - if unicode_names[i] != unicode_names[i + 1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) \ - and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] \ - [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \ - [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose = 1 - padding_symbol = ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose = 0 - try: - verbose = int(b) - except Exception: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol = ' ' - elif b == 'underline': - padding_symbol = '_' - continue - if True or sys.argv[1].startswith('-'): - # ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig = set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array) == 0: - continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, - case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : 
- if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array) == 0: - continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length - len_lab) - space['rec'].append(length - len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end=' ') - else: - print('lab:', end=' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['lab'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end=' ') - else: - print('rec:', end=' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token=token), end='') - for n in range(space['rec'][idx]) : - print(padding_symbol, end='') - print(' ', end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===================================================' - '========================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster(k for k in default_clusters[cluster_id]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + - result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], - result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like
-                    if token[0:2] == '</' and \
-                            token.lstrip('</').rstrip('>') == cluster_id :
-                        result = calculator.cluster(cluster)
-                        if result['all'] != 0 :
-                            wer = float(result['ins'] + result['sub'] +
-                                        result['del']) * 100.0 / result['all']
-                        else :
-                            wer = 0.0
-                        print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
-                        print('N=%d C=%d S=%d D=%d I=%d' %
-                              (result['all'], result['cor'], result['sub'],
-                               result['del'], result['ins']))
-                        cluster_id = ''
-                        cluster = []
-                    # begin of cluster reached, like <Keyword>
-                    elif (token[0] == '<' and token[len(token) - 1] == '>' and
-                          cluster_id == ''):
-                        cluster_id = token.lstrip('<').rstrip('>')
-                        cluster = []
-                    # general terms, like WEATHER / CAR / ...
-                    else :
-                        cluster.append(token)
-    print()
-    print('======================================='
-          '====================================')
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/compute-wer.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/compute-wer.py
deleted file mode 100644
index a3eefc0dc7b67f252e685da71a5189312e74ef85..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/tools/compute-wer.py
+++ /dev/null
@@ -1,500 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-
-import re, sys, unicodedata
-import codecs
-
-remove_tag = True
-spacelist= [' ', '\t', '\r', '\n']
-puncts = ['!', ',', '?',
-          '、', '。', '!', ',', ';', '?',
-          ':', '「', '」', '︰', '『', '』', '《', '》']
-
-def characterize(string) :
-  res = []
-  i = 0
-  while i < len(string):
-    char = string[i]
-    if char in puncts:
-      i += 1
-      continue
-    cat1 = unicodedata.category(char)
-    #https://unicodebook.readthedocs.io/unicode.html#unicode-categories
-    if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned
-      i += 1
-      continue
-    if cat1 == 'Lo': # letter-other
-      res.append(char)
-      i += 1
-    else:
-      # some input looks like: , we want to separate it to two words.
- sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res - -def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) - - -def normalize(sentence, ignore_words, cs, split=None): - """ sentence, ignore_words are both in unicode - """ - new_sentence = [] - for token in sentence: - x = token - if not cs: - x = x.upper() - if x in ignore_words: - continue - if remove_tag: - x = stripoff_tags(x) - if not x: - continue - if split and x in split: - new_sentence += split[x] - else: - new_sentence.append(x) - return new_sentence - -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - 
j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) - -def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) - -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] - -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") - -if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue - - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig - - default_clusters = {} - default_words = {} - - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit - - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: - if tochar: - array = characterize(line) - else: - array = line.strip().split() - if len(array)==0: continue - fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) - - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) 
- else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) - - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name - - result = calculator.calculate(lab, rec) - if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===========================================================================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if not verbose: - print() - - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster([ k for k in default_clusters[cluster_id] ]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - 
print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif token[0] == '<' and token[len(token)-1] == '>' and \ - cluster_id == '' : - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('===========================================================================') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/compute_cmvn_stats.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/compute_cmvn_stats.py deleted file mode 100644 index 9c89789c47be0c855939469e86040f10398e9d89..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/compute_cmvn_stats.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys -import argparse -import json -import codecs -import yaml - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.utils.data import Dataset, DataLoader - -torchaudio.set_audio_backend("sox_io") - - -class CollateFunc(object): - ''' Collate function for AudioDataset - ''' - - def __init__(self, feat_dim, resample_rate): - self.feat_dim = feat_dim - self.resample_rate = resample_rate - pass - - def __call__(self, batch): - mean_stat = torch.zeros(self.feat_dim) - var_stat = torch.zeros(self.feat_dim) - number = 0 - for item in batch: - value = item[1].strip().split(",") - assert len(value) == 3 or len(value) == 1 - wav_path = value[0] - sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate - resample_rate = sample_rate - # len(value) == 3 means segmented wav.scp, - # len(value) == 1 means original wav.scp - if len(value) == 3: - start_frame = int(float(value[1]) * sample_rate) - end_frame = int(float(value[2]) * sample_rate) - waveform, sample_rate = torchaudio.backend.sox_io_backend.load( - filepath=wav_path, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(item[1]) - - waveform = waveform * (1 << 15) - if self.resample_rate != 0 and self.resample_rate != sample_rate: - resample_rate = self.resample_rate - waveform = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - - mat = kaldi.fbank(waveform, - num_mel_bins=self.feat_dim, - dither=0.0, - energy_floor=0.0, - sample_frequency=resample_rate) - mean_stat += torch.sum(mat, axis=0) - var_stat += torch.sum(torch.square(mat), axis=0) - number += mat.shape[0] - return number, mean_stat, var_stat - - -class AudioDataset(Dataset): - def __init__(self, data_file): - self.items = [] - with codecs.open(data_file, 'r', encoding='utf-8') as f: - for line in f: - arr = line.strip().split() - self.items.append((arr[0], arr[1])) - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - return self.items[idx] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='extract CMVN stats') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for processing') - parser.add_argument('--train_config', - default='', - help='training yaml conf') - parser.add_argument('--in_scp', default=None, help='wav scp file') - parser.add_argument('--out_cmvn', - default='global_cmvn', - help='global cmvn file') - - doc = 
"Print log after every log_interval audios are processed." - parser.add_argument("--log_interval", type=int, default=1000, help=doc) - args = parser.parse_args() - - with open(args.train_config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - resample_rate = 0 - if 'resample_conf' in configs['dataset_conf']: - resample_rate = configs['dataset_conf']['resample_conf']['resample_rate'] - print('using resample and new sample rate is {}'.format(resample_rate)) - - collate_func = CollateFunc(feat_dim, resample_rate) - dataset = AudioDataset(args.in_scp) - batch_size = 20 - data_loader = DataLoader(dataset, - batch_size=batch_size, - shuffle=True, - sampler=None, - num_workers=args.num_workers, - collate_fn=collate_func) - - with torch.no_grad(): - all_number = 0 - all_mean_stat = torch.zeros(feat_dim) - all_var_stat = torch.zeros(feat_dim) - wav_number = 0 - for i, batch in enumerate(data_loader): - number, mean_stat, var_stat = batch - all_mean_stat += mean_stat - all_var_stat += var_stat - all_number += number - wav_number += batch_size - - if wav_number % args.log_interval == 0: - print(f'processed {wav_number} wavs, {all_number} frames', - file=sys.stderr, - flush=True) - - cmvn_info = { - 'mean_stat': list(all_mean_stat.tolist()), - 'var_stat': list(all_var_stat.tolist()), - 'frame_num': all_number - } - - with open(args.out_cmvn, 'w') as fout: - fout.write(json.dumps(cmvn_info)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/compute_fbank_feats.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/compute_fbank_feats.py deleted file mode 100644 index 4cc7dae54de6e8b24b14148bd3930d19b4d7b28c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/compute_fbank_feats.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging - -import torchaudio -import torchaudio.compliance.kaldi as kaldi - -import wenet.dataset.kaldi_io as kaldi_io - -# The "sox" backends are deprecated and will be removed in 0.9.0 release. 
-# So here we use sox_io backend -torchaudio.set_audio_backend("sox_io") - - -def parse_opts(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--num_mel_bins', - default=80, - type=int, - help='Number of triangular mel-frequency bins') - parser.add_argument('--frame_length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument('--frame_shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument('--dither', - type=int, - default=0.0, - help='Dithering constant (0.0 means no dither)') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_scp', help='wav scp file') - parser.add_argument('out_ark', help='output ark file') - parser.add_argument('out_scp', help='output scp file') - args = parser.parse_args() - return args - - -# wav format: -def load_wav_scp(wav_scp_file): - wav_list = [] - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_list.append((arr[0], arr[1])) - return wav_list - - -# wav format: -def load_wav_scp_dict(wav_scp_file): - wav_dict = {} - with open(wav_scp_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_dict[arr[0]] = arr[1] - return wav_dict - - -# Segments format: -def load_wav_segments(wav_scp_file, segments_file): - wav_dict = load_wav_scp_dict(wav_scp_file) - audio_list = [] - with open(segments_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - key = arr[0] - wav_file = wav_dict[arr[1]] - start = float(arr[2]) - end = float(arr[3]) - audio_list.append((key, wav_file, start, end)) - return audio_list - - -if __name__ == '__main__': - args = parse_opts() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - if args.segments is None: - audio_list = load_wav_scp(args.wav_scp) - else: - audio_list = load_wav_segments(args.wav_scp, args.segments) - - count = 0 - with open(args.out_ark, 'wb') as ark_fout, \ - open(args.out_scp, 'w', encoding='utf8') as scp_fout: - for item in audio_list: - if len(item) == 2: - key, wav_path = item - waveform, sample_rate = torchaudio.load_wav(wav_path) - else: - assert len(item) == 4 - key, wav_path, start, end = item - sample_rate = torchaudio.info(wav_path).sample_rate - frame_offset = int(start * sample_rate) - num_frames = int((end - start) * sample_rate) - waveform, sample_rate = torchaudio.load_wav( - wav_path, frame_offset, num_frames) - - mat = kaldi.fbank(waveform, - num_mel_bins=args.num_mel_bins, - frame_length=args.frame_length, - frame_shift=args.frame_shift, - dither=args.dither, - energy_floor=0.0, - sample_frequency=sample_rate) - mat = mat.detach().numpy() - kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout) - count += 1 - if count % 10000 == 0: - logging.info('Progress {}/{}'.format(count, len(audio_list))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/copy_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/tools/copy_data_dir.sh deleted file mode 100644 index ee880c4c3ca398a58a4e306467c639b0a76310bb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/copy_data_dir.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# Apache 2.0 - -# This script operates on a 
directory, such as in data/train/, -# that contains some subset of the following files: -# feats.scp -# wav.scp -# vad.scp -# spk2utt -# utt2spk -# text -# -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance and/or speaker names. Note, the recording-ids stay the same. -# - - -# begin configuration section -spk_prefix= -utt_prefix= -spk_suffix= -utt_suffix= -validate_opts= # should rarely be needed. -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" - echo "Options" - echo " --spk-prefix= # Prefix for speaker ids, default empty" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --spk-suffix= # Suffix for speaker ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -srcdir=$1 -destdir=$2 - -if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" - exit 1; -fi - -if [ "$destdir" == "$srcdir" ]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -set -e; - -mkdir -p $destdir - -cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map -cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map - -if [ ! -f $srcdir/utt2uniq ]; then - if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then - cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq - fi -else - cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq -fi - -cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ - utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk - -utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt - -if [ -f $srcdir/feats.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp -fi - -if [ -f $srcdir/vad.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp -fi - -if [ -f $srcdir/segments ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments - cp $srcdir/wav.scp $destdir -else # no segments->wav indexed by utt. 
- if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/utt2num_frames ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2num_frames >$destdir/utt2num_frames -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in frame_shift stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel frame_shift stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -echo $validate_opts -echo $destdir -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/decode.sh b/models/audio/speech_recognition/conformer/igie/wenet/tools/decode.sh deleted file mode 100644 index 1d49b0e48631f4818fb9c464df66904170275a33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/decode.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Mobvoi Inc. All Rights Reserved. -# Author: binbinzhang@mobvoi.com (Binbin Zhang) -export GLOG_logtostderr=1 -export GLOG_v=2 - -set -e - -nj=1 -chunk_size=-1 -ctc_weight=0.0 -reverse_weight=0.0 -rescoring_weight=1.0 -# For CTC WFST based decoding -fst_path= -dict_path= -acoustic_scale=1.0 -beam=15.0 -lattice_beam=12.0 -min_active=200 -max_active=7000 -blank_skip_thresh=1.0 -length_penalty=0.0 - -. tools/parse_options.sh || exit 1; -if [ $# != 5 ]; then - echo "Usage: $0 [options] " - exit 1; -fi - -if ! which decoder_main > /dev/null; then - echo "decoder_main is not built, please go to runtime/libtorch to build it." - exit 1; -fi - -scp=$1 -label_file=$2 -model_file=$3 -unit_file=$4 -dir=$5 - -mkdir -p $dir/split${nj} - -# Step 1. Split wav.scp -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${dir}/split${nj}/wav.${n}.scp" -done -tools/data/split_scp.pl ${scp} ${split_scps} - -# Step 2. Parallel decoding -wfst_decode_opts= -if [ ! 
-z $fst_path ]; then - wfst_decode_opts="--fst_path $fst_path" - wfst_decode_opts="$wfst_decode_opts --beam $beam" - wfst_decode_opts="$wfst_decode_opts --dict_path $dict_path" - wfst_decode_opts="$wfst_decode_opts --lattice_beam $lattice_beam" - wfst_decode_opts="$wfst_decode_opts --max_active $max_active" - wfst_decode_opts="$wfst_decode_opts --min_active $min_active" - wfst_decode_opts="$wfst_decode_opts --acoustic_scale $acoustic_scale" - wfst_decode_opts="$wfst_decode_opts --blank_skip_thresh $blank_skip_thresh" - wfst_decode_opts="$wfst_decode_opts --length_penalty $length_penalty" - echo $wfst_decode_opts > $dir/config -fi -for n in $(seq ${nj}); do -{ - decoder_main \ - --rescoring_weight $rescoring_weight \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --chunk_size $chunk_size \ - --wav_scp ${dir}/split${nj}/wav.${n}.scp \ - --model_path $model_file \ - --unit_path $unit_file \ - $wfst_decode_opts \ - --result ${dir}/split${nj}/${n}.text &> ${dir}/split${nj}/${n}.log -} & -done -wait - -# Step 3. Merge files -for n in $(seq ${nj}); do - cat ${dir}/split${nj}/${n}.text -done > ${dir}/text -tail $dir/split${nj}/*.log | grep RTF | awk '{sum+=$NF}END{print sum/NR}' > $dir/rtf - -# Step 4. Compute WER -python3 tools/compute-wer.py --char=1 --v=1 \ - $label_file $dir/text > $dir/wer diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/feat_to_shape.sh b/models/audio/speech_recognition/conformer/igie/wenet/tools/feat_to_shape.sh deleted file mode 100644 index ab6d45c60709dd05a38f8da269d617233d0d39f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/feat_to_shape.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Begin configuration section. -nj=4 -cmd=run.pl -verbose=0 -filetype="" -preprocess_conf="" -# End configuration section. - -help_message=$(cat << EOF -Usage: $0 [options] [] -e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/log -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) - -echo "$0 $*" 1>&2 # Print the command line for logging - -. parse_options.sh || exit 1; - -if [ $# -lt 2 ] || [ $# -gt 3 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -scp=$1 -outscp=$2 -data=$(dirname ${scp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -split_scps="" -for n in $(seq ${nj}); do - split_scps="${split_scps} ${logdir}/feats.${n}.scp" -done - -utils/split_scp.pl ${scp} ${split_scps} - -if [ -n "${preprocess_conf}" ]; then - preprocess_opt="--preprocess-conf ${preprocess_conf}" -else - preprocess_opt="" -fi -if [ -n "${filetype}" ]; then - filetype_opt="--filetype ${filetype}" -else - filetype_opt="" -fi - -${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ - feat-to-len --verbose=${verbose} \ - scp:${logdir}/feats.JOB.scp ark,t:${logdir}/shape.JOB.scp - -feat_dim=$(feat-to-dim scp:$logdir/feats.1.scp -) - -# concatenate the .scp files together. 
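Step 4 of decode.sh above scores the merged transcripts with tools/compute-wer.py --char=1, i.e. a character error rate. As a hedged sketch of what that number measures, CER is the Levenshtein distance between reference and hypothesis character sequences divided by the reference length; the real tool additionally handles tokenization, verbose alignment output, and per-utterance accumulation.

```python
def cer(ref: str, hyp: str) -> float:
    """Character error rate: character edit distance / reference length (sketch only)."""
    r, h = list(ref), list(hyp)
    # dp[i][j] = edit distance between the first i chars of ref and the first j chars of hyp
    dp = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
    for i in range(len(r) + 1):
        dp[i][0] = i
    for j in range(len(h) + 1):
        dp[0][j] = j
    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            sub = dp[i - 1][j - 1] + (r[i - 1] != h[j - 1])
            dp[i][j] = min(sub, dp[i - 1][j] + 1, dp[i][j - 1] + 1)
    return dp[len(r)][len(h)] / max(len(r), 1)

print(f"{cer('今天天气很好', '今天天汽很好'):.4f}")  # 0.1667: one substitution out of six characters
```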
-for n in $(seq ${nj}); do - sed "s:\ *$:,$feat_dim:g" ${logdir}/shape.${n}.scp -done > ${outscp} - -rm -f ${logdir}/feats.*.scp 2>/dev/null diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/filter_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/tools/filter_scp.pl deleted file mode 100644 index b76d37f41be0886470281978bfacf97f6b8ae976..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/filter_scp.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose "n-th" field is an utterance id), printing -# out only those lines whose "n-th" field is in id_list. The index of -# the "n-th" field is 1, by default, but can be changed by using -# the -f switch - -$exclude = 0; -$field = 1; -$shifted = 0; - -do { - $shifted=0; - if ($ARGV[0] eq "--exclude") { - $exclude = 1; - shift @ARGV; - $shifted=1; - } - if ($ARGV[0] eq "-f") { - $field = $ARGV[1]; - shift @ARGV; shift @ARGV; - $shifted=1 - } -} while ($shifted); - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . - "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . - "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . - "only the lines that were *not* in id_list.\n" . - "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . - "If your older scripts (written before Oct 2014) stopped working and you used the\n" . - "-f option, add 1 to the argument.\n" . - "See also: utils/filter_scp.pl .\n"; -} - - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -if ($field == 1) { # Treat this as special case, since it is common. - while(<>) { - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { - print $_; - } - } -} else { - while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { - print $_; - } - } -} - -# tests: -# the following should print "foo 1" -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) -# the following should print "bar 2". 
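The self-test comments above capture the whole contract of filter_scp.pl: keep (or, with --exclude, drop) the lines whose f-th field appears in the id list. A minimal Python re-statement of that logic, for illustration only; the Perl script also special-cases field 1 for speed.

```python
def filter_scp(id_list, lines, field=1, exclude=False):
    """Keep lines whose `field`-th (1-based) token is in id_list; invert with exclude=True."""
    seen = {line.split()[0] for line in id_list if line.split()}
    kept = []
    for line in lines:
        parts = line.split()
        if len(parts) < field:
            raise ValueError(f"Invalid scp line: {line!r}")
        if (parts[field - 1] in seen) != exclude:
            kept.append(line)
    return kept

print(filter_scp(["foo"], ["foo 1", "bar 2"]))         # ['foo 1']
print(filter_scp(["2"], ["foo 1", "bar 2"], field=2))  # ['bar 2']
```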
-# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/fix_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/tools/fix_data_dir.sh deleted file mode 100644 index d1644c1cac4264c78eae7d91b03c4126baf7ec4c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/fix_data_dir.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# This script makes sure that only the segments present in -# all of "feats.scp", "wav.scp" [if present], segments [if present] -# text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into -# data-dir/.backup - -cmd="$@" - -utt_extra_files= -spk_extra_files= - -. tools/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: utils/data/fix_data_dir.sh " - echo "e.g.: utils/data/fix_data_dir.sh data/train" - echo "This script helps ensure that the various files in a data directory" - echo "are correctly sorted and filtered, for example removing utterances" - echo "that have no features (if feats.scp is present)" - exit 1 -fi - -data=$1 - -if [ -f $data/images.scp ]; then - image/fix_data_dir.sh $cmd - exit $? -fi - -mkdir -p $data/.backup - -[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; - -[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; - -set -e -o pipefail -u - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted { - file=$1 - sort -k1,1 -u <$file >$file.tmp - if ! cmp -s $file $file.tmp; then - echo "$0: file $1 is not in sorted order or not unique, sorting it" - mv $file.tmp $file - else - rm $file.tmp - fi -} - -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2emo utt2uniq utt2dur reco2dur utt2num_frames; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - check_sorted $data/$x - fi -done - - -function filter_file { - filter=$1 - file_to_filter=$2 - cp $file_to_filter ${file_to_filter}.tmp - tools/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter - if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=$(cat ${file_to_filter}.tmp | wc -l) - length2=$(cat ${file_to_filter} | wc -l) - if [ $length1 -ne $length2 ]; then - echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." - fi - fi - rm $file_to_filter.tmp -} - -function filter_recordings { - # We call this once before the stage when we filter on utterance-id, and once - # after. - - if [ -f $data/segments ]; then - # We have a segments file -> we need to filter this and the file wav.scp, and - # reco2file_and_utt, if it exists, to make sure they have the same list of - # recording-ids. - - if [ ! -f $data/wav.scp ]; then - echo "$0: $data/segments exists but not $data/wav.scp" - exit 1; - fi - awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=$(cat $tmpdir/recordings | wc -l) - [ ! -s $tmpdir/recordings ] && \ - echo "Empty list of recordings (bad file $data/segments)?" 
&& exit 1; - tools/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp - mv $tmpdir/recordings.tmp $tmpdir/recordings - - - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - filter_file $tmpdir/recordings $data/segments - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - rm $data/segments.tmp - - filter_file $tmpdir/recordings $data/wav.scp - [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur - true - fi -} - -function filter_speakers { - # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... - tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do - f=$data/$s - if [ -f $f ]; then - filter_file $f $tmpdir/speakers - fi - done - - filter_file $tmpdir/speakers $data/spk2utt - tools/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - - for s in cmvn.scp spk2gender $spk_extra_files; do - f=$data/$s - if [ -f $f ]; then - filter_file $tmpdir/speakers $f - fi - done -} - -function filter_utts { - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - echo "$(cat $tmpdir/utts | wc -l)" - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; - - ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; - - ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ - echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - - if [ -f $data/utt2uniq ]; then - ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ - echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; - fi - - maybe_wav= - maybe_reco2dur= - [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. - [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - - maybe_utt2dur= - if [ -f $data/utt2dur ]; then - cat $data/utt2dur | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2dur.ok || exit 1 - maybe_utt2dur=utt2dur.ok - fi - - maybe_utt2num_frames= - if [ -f $data/utt2num_frames ]; then - cat $data/utt2num_frames | \ - awk '{ if (NF == 2 && $2 > 0) { print }}' > $data/utt2num_frames.ok || exit 1 - maybe_utt2num_frames=utt2num_frames.ok - fi - - for x in feats.scp text segments utt2lang utt2emo $maybe_wav $maybe_utt2dur $maybe_utt2num_frames; do - if [ -f $data/$x ]; then - tools/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp - echo "$data/$x, $(cat $tmpdir/utts | wc -l), $(cat $tmpdir/utts.tmp | wc -l)" - mv $tmpdir/utts.tmp $tmpdir/utts - # echo "$tmpdir/utts" - fi - done - rm $data/utt2dur.ok 2>/dev/null || true - rm $data/utt2num_frames.ok 2>/dev/null || true - - [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ - rm $tmpdir/utts && exit 1; - - - if [ -f $data/utt2spk ]; then - new_nutts=$(cat $tmpdir/utts | wc -l) - old_nutts=$(cat $data/utt2spk | wc -l) - if [ $new_nutts -ne $old_nutts ]; then - echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" - else - echo "fix_data_dir.sh: kept all $old_nutts utterances." 
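Stripped of the backups and the recording/speaker passes, the core of fix_data_dir.sh is a set intersection: an utterance survives only if it appears in every per-utterance table that exists. A small illustrative sketch of that invariant, not the script itself.

```python
def surviving_utts(tables):
    """tables: dicts keyed by utterance id (utt2spk, feats.scp, text, ...); return common ids."""
    ids = set(tables[0])
    for table in tables[1:]:
        ids &= set(table)
    return sorted(ids)  # keep the LC_ALL=C-style lexicographic order the tools expect

utt2spk = {"utt1": "spkA", "utt2": "spkB", "utt3": "spkA"}
feats   = {"utt1": "feats.ark:12", "utt3": "feats.ark:99"}  # utt2 has no features
text    = {"utt1": "你好", "utt2": "谢谢", "utt3": "再见"}
print(surviving_utts([utt2spk, feats, text]))  # ['utt1', 'utt3']
```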
- fi - fi - - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2emo utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - if ! cmp -s $data/$x <( tools/filter_scp.pl $tmpdir/utts $data/$x ) ; then - tools/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x - fi - fi - done - -} - -filter_recordings -filter_speakers -filter_utts -filter_speakers -filter_recordings - -tools/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - -echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/flake8_hook.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/flake8_hook.py deleted file mode 100644 index bbe21bf4aa8ab460aca0eba5a24785e4d6b2c39d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -import sys - -from flake8.main import git - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/format_data.sh b/models/audio/speech_recognition/conformer/igie/wenet/tools/format_data.sh deleted file mode 100644 index 51f4602dfa0bac7873541c7f621ef4bb9eb29c94..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/format_data.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Mobvoi Corporation (Author: Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -echo "$0 $*" >&2 # Print the command line for logging -. ./path.sh - -nj=1 -cmd=run.pl -nlsyms="" -lang="" -feat="" -feat_type="kaldi" -oov="" -bpecode="" -allow_one_column=false -raw="" -verbose=0 -trans_type=char -filetype="" -preprocess_conf="" -category="" -out="" # If omitted, write in stdout -help_message=$(cat << EOF -Usage: $0 -e.g. $0 data/train data/lang_1char/train_units.txt -Options: - --nj # number of parallel jobs - --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. - --feat # feat.scp or feat1.scp,feat2.scp,... - --feat-type # kaldi or wav - --oov # Default: - --out # If omitted, write in stdout - --filetype # Specify the format of feats file - --preprocess-conf # Apply preprocess to feats when creating shape.scp - --verbose # Default: 0 -EOF -) -. tools/parse_options.sh - -if [ $# != 2 ]; then - echo "${help_message}" 1>&2 - exit 1; -fi - -set -euo pipefail - -dir=$1 -dic=$2 -tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) -#trap 'rm -rf ${tmpdir}' EXIT - -# 1. 
Create scp files for inputs -# These are not necessary for decoding mode, and make it as an option -input= -if [ -n "${feat}" ]; then - _feat_scps=$(echo "${feat}" | tr ',' ' ' ) - read -r -a feat_scps <<< $_feat_scps - num_feats=${#feat_scps[@]} - - for (( i=1; i<=num_feats; i++ )); do - feat=${feat_scps[$((i-1))]} - mkdir -p ${tmpdir}/input_${i} - input+="input_${i} " - cat ${feat} > ${tmpdir}/input_${i}/feat.scp - - # Dump in the "legacy" style JSON format - if [ -n "${filetype}" ]; then - awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ - > ${tmpdir}/input_${i}/filetype.scp - fi - - if [ ${feat_type} == "kaldi" ]; then - tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ - --filetype "${filetype}" \ - --preprocess-conf "${preprocess_conf}" \ - --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp - elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then - if [ -f $dir/segments ]; then - # used for segmented wav.scp - awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur - fi - if [ ! -f $dir/utt2dur ]; then - tools/wav_to_duration.sh --nj ${nj} \ - ${feat} ${tmpdir}/input_${i}/shape.scp - # use the existed utt2dur as shape.scp directly - else - cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp - fi - fi - done -fi - -# 2. Create scp files for outputs -mkdir -p ${tmpdir}/output -if [ -n "${bpecode}" ]; then - if [ "${trans_type}" == "cn_char_en_bpe" ]; then - tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp - else - paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \ - | tools/spm_encode --model=${bpecode} --output_format=piece) \ - > ${tmpdir}/output/token.scp - fi -elif [ -n "${nlsyms}" ]; then - tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -elif [ -n "${raw}" ]; then - cat $dir/text > ${tmpdir}/output/token.scp -else - tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp -fi -< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp -odim=$(cat ${dic} | wc -l) -< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp - -cat ${dir}/text > ${tmpdir}/output/text.scp - -# 3. Create scp files for the others -mkdir -p ${tmpdir}/other -if [ -n "${lang}" ]; then - awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp -fi - -if [ -n "${category}" ]; then - awk -v category=${category} '{print $1 " " category}' ${dir}/text \ - > ${tmpdir}/other/category.scp -fi -#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp - -# 4. 
Merge scp files into a one file -opts="" -for intype in ${input} output other; do - if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then - continue - fi - - if [ ${intype} != other ]; then - opts+="--${intype%_*}-scps " - else - opts+="--scps " - fi - - for x in "${tmpdir}/${intype}"/*.scp; do - k=$(basename ${x} .scp) - if [ ${k} = shape ]; then - opts+="shape:${x}:shape " - else - opts+="${k}:${x} " - fi - done -done - -if ${allow_one_column}; then - opts+="--allow-one-column true " -else - opts+="--allow-one-column false " -fi - -if [ -n "${out}" ]; then - opts+="-O ${out}" -fi - -tools/merge_scp2txt.py --verbose ${verbose} ${opts} - -#rm -fr ${tmpdir} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/add_lex_disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/add_lex_disambig.pl deleted file mode 100644 index dd8a25de6e1140a6d19b1e876f2e76f528532edf..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/add_lex_disambig.pl +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2013-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. -# With the --pron-probs option, expects the second field -# of each lexicon line to be a pron-prob. -# With the --sil-probs option, expects three additional -# fields after the pron-prob, representing various components -# of the silence probability model. - -$pron_probs = 0; -$sil_probs = 0; -$first_allowed_disambig = 1; - -for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { - if ($ARGV[0] eq "--pron-probs") { - $pron_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--sil-probs") { - $sil_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--first-allowed-disambig") { - $first_allowed_disambig = 0 + $ARGV[1]; - if ($first_allowed_disambig < 1) { - die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; - } - shift @ARGV; - shift @ARGV; - } -} - -if (@ARGV != 2) { - die "Usage: add_lex_disambig.pl [opts] \n" . - "This script adds disambiguation symbols to a lexicon in order to\n" . - "make decoding graphs determinizable; it adds pseudo-phone\n" . - "disambiguation symbols #1, #2 and so on at the ends of phones\n" . - "to ensure that all pronunciations are different, and that none\n" . - "is a prefix of another.\n" . - "It prints to the standard output the number of the largest-numbered" . - "disambiguation symbol that was used.\n" . - "\n" . - "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . - " --sil-probs [should be with --pron-probs option]\n" . 
- " Expect 3 extra fields after the pron-probs, for aspects of\n" . - " the silence probability model\n" . - " --first-allowed-disambig The number of the first disambiguation symbol\n" . - " that this script is allowed to add. By default this is\n" . - " #1, but you can set this to a larger value using this option.\n" . - "e.g.:\n" . - " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { - $p = shift @A; - if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } - } - if ($sil_probs) { - $silp = shift @A; - if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - } - if (!(@A)) { - die "Bad lexicon line $1, no phone in phone list"; - } - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that it exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { shift @A; } # remove pron-prob. - if ($sil_probs) { - shift @A; # Remove silprob - shift @A; # Remove silprob - } - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -# max_disambig will always be the highest-numbered disambiguation symbol that -# has been used so far. -$max_disambig = $first_allowed_disambig - 1; - -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - if ($pron_probs) { - $pron_prob = shift @A; - } - if ($sil_probs) { - $sil_word_prob = shift @A; - $word_sil_correction = shift @A; - $prev_nonsil_correction = shift @A - } - $phnseq = join(" ", @A); - if (!defined $issubseq{$phnseq} - && $count{$phnseq} == 1) { - ; # Do nothing. - } else { - if ($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved_for_the_empty_string{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; - if (!defined $cur_disambig) { - $cur_disambig = $first_allowed_disambig; - } else { - $cur_disambig++; # Get a number that has not been used yet for - # this phone sequence. - } - while (defined $reserved_for_the_empty_string{$cur_disambig}) { - $cur_disambig++; - } - if ($cur_disambig > $max_disambig) { - $max_disambig = $cur_disambig; - } - $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; - $phnseq = $phnseq . " #" . 
$cur_disambig; - } - } - if ($pron_probs) { - if ($sil_probs) { - print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; - } else { - print O "$word\t$pron_prob\t$phnseq\n"; - } - } else { - print O "$word\t$phnseq\n"; - } -} - -print $max_disambig . "\n"; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/compile_lexicon_token_fst.sh b/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/compile_lexicon_token_fst.sh deleted file mode 100644 index b67814fe3f3244b14b8e494bfe46c4829c4f8bd6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/compile_lexicon_token_fst.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -# Copyright 2015 Yajie Miao (Carnegie Mellon University) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the -# phoneme and character-based lexicons. -set -eo pipefail -. tools/parse_options.sh - -if [ $# -ne 3 ]; then - echo "usage: tools/fst/compile_lexicon_token_fst.sh " - echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" - echo " should contain the following files:" - echo "lexicon.txt units.txt" - echo "options: " - exit 1; -fi - -srcdir=$1 -tmpdir=$2 -dir=$3 -mkdir -p $dir $tmpdir - -[ -f path.sh ] && . ./path.sh - -export LC_ALL=C - -cp $srcdir/units.txt $dir - -# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. -# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. -perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; - -# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. -# Without these symbols, determinization will fail. -ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` -ndisambig=$[$ndisambig+1]; - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list - -# Get the full list of CTC tokens used in FST. These tokens include , the blank , -# the actual model unit, and the disambiguation symbols. -cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list -(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt - -# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, -# so here use ctc_token_fst_compact -tools/fst/ctc_token_fst_compact.py $dir/tokens.txt | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; - -# Encode the words with indices. Will be used in lexicon and language model FST compiling. 
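The reason add_lex_disambig.pl is needed is easiest to see on a toy lexicon: a pronunciation shared by several words, or one that is a prefix of another pronunciation, gets a #k suffix so that L.fst stays determinizable. A hedged Python sketch of just that core rule; the Perl script additionally supports pron-probs, sil-probs, the empty-pronunciation case, and a configurable first disambiguation index.

```python
from collections import Counter

def add_disambig(lexicon):
    """lexicon: list of (word, phone_tuple); append '#k' where pronunciations collide."""
    counts = Counter(phones for _, phones in lexicon)
    prefixes = set()
    for _, phones in lexicon:
        for n in range(1, len(phones)):
            prefixes.add(phones[:n])
    last_used, out = {}, []
    for word, phones in lexicon:
        if counts[phones] == 1 and phones not in prefixes:
            out.append((word, phones))            # unique and not a prefix: nothing to add
        else:
            k = last_used.get(phones, 0) + 1      # #1, #2, ... per colliding pronunciation
            last_used[phones] = k
            out.append((word, phones + (f"#{k}",)))
    return out

lex = [("好", ("h", "ao3")), ("号", ("h", "ao3")), ("好吗", ("h", "ao3", "m", "a"))]
for word, phones in add_disambig(lex):
    print(word, " ".join(phones))
# 好 h ao3 #1
# 号 h ao3 #2
# 好吗 h ao3 m a
```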
-cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' - BEGIN { - print " 0"; - } - { - printf("%s %d\n", $1, NR); - } - END { - printf("#0 %d\n", NR+1); - printf(" %d\n", NR+2); - printf(" %d\n", NR+3); - }' > $dir/words.txt || exit 1; - -# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time. -token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` - -tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ - fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; - -echo "Lexicon and token FSTs compiling succeeded" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/ctc_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/ctc_token_fst.py deleted file mode 100644 index d81644b9cd216177a10a17772781d3293abe084f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/ctc_token_fst.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 1 ') -print('1 1 ') -print('2 2 ') -print('2 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 3 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(1, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 2, '', '')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/ctc_token_fst_compact.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/ctc_token_fst_compact.py deleted file mode 100644 index d3018d8b14ce25108cb1acc637cecded5d41be13..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/ctc_token_fst_compact.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 1 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 0, '', '')) - node += 1 -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/ctc_token_fst_corrected.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/ctc_token_fst_corrected.py deleted file mode 100644 index 81f7079eccb9e6447c46cdfdf6378aca7efe4a09..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/ctc_token_fst_corrected.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python - -import sys - - -def il(n): - return n + 1 - - -def ol(n): - return n + 1 - - -def s(n): - return n - - -if __name__ == "__main__": - with open(sys.argv[1]) as f: - lines = f.readlines() - phone_count = 0 - disambig_count = 0 - for line in lines: - sp = line.split() - phone = sp[0] - 
if phone == '' or phone == '': - continue - if phone.startswith('#'): - disambig_count += 1 - else: - phone_count += 1 - - # 1. add start state - print('0 0 {} 0'.format(il(0))) - - # 2. 0 -> i, i -> i, i -> 0 - for i in range(1, phone_count + 1): - print('0 {} {} {}'.format(s(i), il(i), ol(i))) - print('{} {} {} 0'.format(s(i), s(i), il(i))) - print('{} 0 {} 0'.format(s(i), il(0))) - - # 3. i -> other phone - for i in range(1, phone_count + 1): - for j in range(1, phone_count + 1): - if i != j: - print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) - - # 4. add disambiguous arcs on every final state - for i in range(0, phone_count + 1): - for j in range(phone_count + 2, phone_count + disambig_count + 2): - print('{} {} {} {}'.format(s(i), s(i), 0, j)) - - # 5. every i is final state - for i in range(0, phone_count + 1): - print(s(i)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/eps2disambig.pl b/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/eps2disambig.pl deleted file mode 100644 index e1d84a6bf56703596a0e4552d184f7168f724bcb..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/eps2disambig.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - if (/\s+#0\s+/) { - print STDERR "$0: ERROR: LM has word #0, " . - "which is reserved as disambiguation symbol\n"; - exit 1; - } - s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; - print; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/make_lexicon_fst.pl b/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/make_lexicon_fst.pl deleted file mode 100644 index f97129c05cb3ba6460be401e92001261acfaf746..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/make_lexicon_fst.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). 
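A compact CTC token FST like the one T.fst is compiled from above can be emitted as OpenFst text with a few lines: a blank self-loop at state 0, one state per token that absorbs repeats, and epsilon-input arcs for the disambiguation symbols. This sketch assumes the conventional <eps>/<blank> spellings for the special symbols and is an illustration, not the original generator.

```python
def compact_ctc_token_fst(tokens):
    """Return the text form of a compact CTC token FST over `tokens` (assumed conventions)."""
    lines = ["0 0 <blank> <eps>"]                       # consume blanks at the start state
    node = 1
    for phone in tokens:
        if phone in ("<eps>", "<blank>"):
            continue
        if "#" in phone:                                # disambiguation symbol: epsilon input
            lines.append(f"0 0 <eps> {phone}")
        else:
            lines.append(f"0 {node} {phone} {phone}")     # emit the token once
            lines.append(f"{node} {node} {phone} <eps>")  # absorb repeated frames
            lines.append(f"{node} 0 <eps> <eps>")         # return to the start state
            node += 1
    lines.append("0")                                   # state 0 is final
    return "\n".join(lines)

print(compact_ctc_token_fst(["<eps>", "<blank>", "你", "好", "#0", "#1"]))
```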
- -$pron_probs = 0; - -if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { - $pron_probs = 1; - shift @ARGV; -} - -if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; - print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; - print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; - print STDERR " word phone1 phone2 ... phoneN;\n"; - print STDERR "if the --pron-probs option is used, each line is:\n"; - print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; - print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; - print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; - print STDERR "this is your responsibility.\n\n"; - print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; - print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; - print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; - exit(1); -} - -$lexfn = shift @ARGV; -if (@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2) { - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if ($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - -if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. - while () { - @A = split(" ", $_); - @A == 0 && die "Empty lexicon line."; - foreach $a (@A) { - if ($a eq "") { - die "Bad lexicon line $_ ( is forbidden)"; - } - } - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; # so we only print it on the first arc of the word. - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. - if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while () { - @A = split(" ", $_); - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! 
defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. - $s = $ns; - } elsif (!defined($silphone) || $p ne $silphone) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/make_tlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/make_tlg.sh deleted file mode 100644 index 98694e5540968760f0c27eaf30a6668f4c46c50d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/make_tlg.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# - -if [ -f path.sh ]; then . path.sh; fi - -lm_dir=$1 -src_lang=$2 -tgt_lang=$3 - -arpa_lm=${lm_dir}/lm.arpa -[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -rm -rf $tgt_lang -cp -r $src_lang $tgt_lang - -# Compose the language model to FST -cat $arpa_lm | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v -i '' | \ - grep -v -i '' | \ - arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \ - tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \ - --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst - - -echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic $tgt_lang/G.fst - -# Compose the token, lexicon and language-model FST into the final decoding graph -fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1; -fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1; - -echo "Composing decoding graph TLG.fst succeeded" -#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/prepare_dict.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/prepare_dict.py deleted file mode 100644 index 8a6a3cfe7cfded0c863637deef0bae2f2ede5557..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/prepare_dict.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -# sys.argv[1]: e2e model unit file(lang_char.txt) -# sys.argv[2]: raw lexicon file -# sys.argv[3]: output lexicon file -# sys.argv[4]: bpemodel - -unit_table = set() -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for line in fin: - unit = line.split()[0] - unit_table.add(unit) - - -def contain_oov(units): - for unit in units: - if 
unit not in unit_table: - return True - return False - - -bpemode = len(sys.argv) > 4 -if bpemode: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.Load(sys.argv[4]) -lexicon_table = set() -with open(sys.argv[2], 'r', encoding='utf8') as fin, \ - open(sys.argv[3], 'w', encoding='utf8') as fout: - for line in fin: - word = line.split()[0] - if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel - continue - elif word == '': - continue - else: - # each word only has one pronunciation for e2e system - if word in lexicon_table: - continue - if bpemode: - # We assume that the lexicon does not contain code-switch, - # i.e. the word contains both English and Chinese. - # see PR https://github.com/wenet-e2e/wenet/pull/1693 - # and Issue https://github.com/wenet-e2e/wenet/issues/1653 - if word.encode('utf8').isalpha(): - pieces = sp.EncodeAsPieces(word) - else: - pieces = word - if contain_oov(pieces): - print( - 'Ignoring words {}, which contains oov unit'.format( - ''.join(word).strip('▁')) - ) - continue - chars = ' '.join( - [p if p in unit_table else '' for p in pieces]) - else: - # ignore words with OOV - if contain_oov(word): - print('Ignoring words {}, which contains oov unit'.format(word)) - continue - # Optional, append ▁ in front of english word - # we assume the model unit of our e2e system is char now. - if word.encode('utf8').isalpha() and '▁' in unit_table: - word = '▁' + word - chars = ' '.join(word) # word is a char list - fout.write('{} {}\n'.format(word, chars)) - lexicon_table.add(word) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/remove_oovs.pl b/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/remove_oovs.pl deleted file mode 100644 index ac914c3bd9363eded791cdeb309fd05e980c4f2e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/remove_oovs.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script removes lines that contain these OOVs on either the -# third or fourth fields of the line. It is intended to remove arcs -# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). 
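For the character-based (non-BPE) branch of prepare_dict.py above, a lexicon entry is simply the word spelled out as its characters, with any word containing an out-of-vocabulary unit dropped. A minimal sketch of that branch only; the sentencepiece/BPE path and the leading-▁ handling for English words are omitted.

```python
def build_char_lexicon(words, unit_table):
    """Map each word to a space-separated character pronunciation, skipping OOV words."""
    lexicon = {}
    for word in words:
        chars = list(word)
        if any(c not in unit_table for c in chars):
            print(f"Ignoring word {word}, which contains an OOV unit")
            continue
        lexicon.setdefault(word, " ".join(chars))  # one pronunciation per word
    return lexicon

units = {"你", "好", "谢"}
print(build_char_lexicon(["你好", "谢谢", "再见"], units))
# {'你好': '你 好', '谢谢': '谢 谢'}   ('再见' is dropped: its characters are not model units)
```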
- -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; -} - -$unklist = shift @ARGV; -open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; -while(){ - @A = split(" ", $_); - @A == 1 || die "Bad line in unknown-symbol list: $_"; - $unk{$A[0]} = 1; -} - -$num_removed = 0; -while(<>){ - @A = split(" ", $_); - if(defined $unk{$A[2]} || defined $unk{$A[3]}) { - $num_removed++; - } else { - print; - } -} -print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/rnnt_token_fst.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/rnnt_token_fst.py deleted file mode 100644 index cc6def1703311ab700a4a01f22c1adda32db9b0d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/rnnt_token_fst.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, 0, phone, phone)) -print('0') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/s2eps.pl b/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/s2eps.pl deleted file mode 100644 index ffeeb8eb6af3c4f319f31ebff80be388d8f59e1a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/fst/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces and with (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } - if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } - } - print join("\t", @A) . 
"\n"; -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/git-pre-commit b/models/audio/speech_recognition/conformer/igie/wenet/tools/git-pre-commit deleted file mode 100644 index b6e448ed375a0ddf502ce332685de8a99e88dc08..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/git-pre-commit +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -e - -echo "Running pre-commit flake8" -python tools/flake8_hook.py diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/install_srilm.sh b/models/audio/speech_recognition/conformer/igie/wenet/tools/install_srilm.sh deleted file mode 100644 index 4aa113c14722a73fd3d3f84430025d44173c207b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/install_srilm.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2022 Binbin Zhang(binbzha@qq.com) - -current_path=`pwd` -current_dir=`basename "$current_path"` - -if [ "tools" != "$current_dir" ]; then - echo "You should run this script in tools/ directory!!" - exit 1 -fi - -! command -v gawk > /dev/null && \ - echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; - -srilm_url="https://github.com/BitSpeech/SRILM/archive/refs/tags/1.7.3.tar.gz" - -if [ ! -f ./srilm.tar.gz ]; then - if ! wget -O ./srilm.tar.gz "$srilm_url"; then - echo 'There was a problem downloading the file.' - echo 'Check you internet connection and try again.' - exit 1 - fi -fi - -tar -zxvf srilm.tar.gz -mv SRILM-1.7.3 srilm - -# set the SRILM variable in the top-level Makefile to this directory. -cd srilm -cp Makefile tmpf - -cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \ - > Makefile || exit 1 -rm tmpf - -make || exit -cd .. - -( - [ ! -z "${SRILM}" ] && \ - echo >&2 "SRILM variable is aleady defined. Undefining..." && \ - unset SRILM - - [ -f ./env.sh ] && . ./env.sh - - [ ! -z "${SRILM}" ] && \ - echo >&2 "SRILM config is already in env.sh" && exit - - wd=`pwd` - wd=`readlink -f $wd || pwd` - - echo "export SRILM=$wd/srilm" - dirs="\${PATH}" - for directory in $(cd srilm && find bin -type d ) ; do - dirs="$dirs:\${SRILM}/$directory" - done - echo "export PATH=$dirs" -) >> env.sh - -echo >&2 "Installation of SRILM finished successfully" -echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/k2/make_hlg.sh b/models/audio/speech_recognition/conformer/igie/wenet/tools/k2/make_hlg.sh deleted file mode 100644 index 18c2268487410824ae11b199cf06f37acd717c88..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/k2/make_hlg.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) - -lexion_dir=$1 -lm_dir=$2 -tgt_dir=$3 - -# k2 and icefall updates very fast. Below commits are veryfied in this script. -# k2 3dc222f981b9fdbc8061b3782c3b385514a2d444, icefall 499ac24ecba64f687ff244c7d66baa5c222ecf0f - -# For k2 installation, please refer to https://github.com/k2-fsa/k2/ -python -c "import k2; print(k2.__file__)" -python -c "import torch; import _k2; print(_k2.__file__)" - -# Prepare necessary icefall scripts -if [ ! 
-d tools/k2/icefall ]; then - git clone --depth 1 https://github.com/k2-fsa/icefall.git tools/k2/icefall -fi -pip3 install -r tools/k2/icefall/requirements.txt -export PYTHONPATH=`pwd`/tools/k2/icefall:`pwd`/tools/k2/icefall/egs/aishell/ASR/local:$PYTHONPATH - -# 8.1 Prepare char based lang -mkdir -p $tgt_dir -python tools/k2/prepare_char.py $lexion_dir/units.txt $lm_dir/wordlist $tgt_dir -echo "Compile lexicon L.pt L_disambig.pt succeeded" - -# 8.2 Prepare G -mkdir -p data/lm -python -m kaldilm \ - --read-symbol-table="$tgt_dir/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $lm_dir/lm.arpa > data/lm/G_3_gram.fst.txt - -# 8.3 Compile HLG -python tools/k2/icefall/egs/aishell/ASR/local/compile_hlg.py --lang-dir $tgt_dir -echo "Compile decoding graph HLG.pt succeeded" \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/k2/prepare_char.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/k2/prepare_char.py deleted file mode 100644 index 6e05042c42eb280135f6be7cdb3566b185258b90..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/k2/prepare_char.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# Copyright 2022 Ximalaya Speech Team (author: Xiang Lyu) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" - -This script generates the following files in the directory sys.argv[3]: - - - lexicon.txt - - lexicon_disambig.txt - - L.pt - - L_disambig.pt - - tokens.txt - - words.txt -""" - -import sys -from pathlib import Path -from typing import Dict, List - -import k2 -import torch -from prepare_lang import ( - Lexicon, - add_disambig_symbols, - add_self_loops, - write_lexicon, - write_mapping, -) - - -def lexicon_to_fst_no_sil( - lexicon: Lexicon, - token2id: Dict[str, int], - word2id: Dict[str, int], - need_self_loops: bool = False, -) -> k2.Fsa: - """Convert a lexicon to an FST (in k2 format). - - Args: - lexicon: - The input lexicon. See also :func:`read_lexicon` - token2id: - A dict mapping tokens to IDs. - word2id: - A dict mapping words to IDs. - need_self_loops: - If True, add self-loop to states with non-epsilon output symbols - on at least one arc out of the state. The input label for this - self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. - Returns: - Return an instance of `k2.Fsa` representing the given lexicon. 
- """ - loop_state = 0 # words enter and leave from here - next_state = 1 # the next un-allocated state, will be incremented as we go - - arcs = [] - - # The blank symbol is defined in local/train_bpe_model.py - assert token2id[""] == 0 - assert word2id[""] == 0 - - eps = 0 - - for word, pieces in lexicon: - assert len(pieces) > 0, f"{word} has no pronunciations" - cur_state = loop_state - - word = word2id[word] - pieces = [ - token2id[i] if i in token2id else token2id[""] for i in pieces - ] - - for i in range(len(pieces) - 1): - w = word if i == 0 else eps - arcs.append([cur_state, next_state, pieces[i], w, 0]) - - cur_state = next_state - next_state += 1 - - # now for the last piece of this word - i = len(pieces) - 1 - w = word if i == 0 else eps - arcs.append([cur_state, loop_state, pieces[i], w, 0]) - - if need_self_loops: - disambig_token = token2id["#0"] - disambig_word = word2id["#0"] - arcs = add_self_loops( - arcs, - disambig_token=disambig_token, - disambig_word=disambig_word, - ) - - final_state = next_state - arcs.append([loop_state, final_state, -1, -1, 0]) - arcs.append([final_state]) - - arcs = sorted(arcs, key=lambda arc: arc[0]) - arcs = [[str(i) for i in arc] for arc in arcs] - arcs = [" ".join(arc) for arc in arcs] - arcs = "\n".join(arcs) - - fsa = k2.Fsa.from_str(arcs, acceptor=False) - return fsa - - -def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool: - """Check if all the given tokens are in token symbol table. - - Args: - token_sym_table: - Token symbol table that contains all the valid tokens. - tokens: - A list of tokens. - Returns: - Return True if there is any token not in the token_sym_table, - otherwise False. - """ - for tok in tokens: - if tok not in token_sym_table: - return True - return False - - -def generate_lexicon( - token_sym_table: Dict[str, int], words: List[str] -) -> Lexicon: - """Generate a lexicon from a word list and token_sym_table. - - Args: - token_sym_table: - Token symbol table that mapping token to token ids. - words: - A list of strings representing words. - Returns: - Return a dict whose keys are words and values are the corresponding - tokens. - """ - lexicon = [] - for word in words: - chars = list(word.strip(" \t")) - if contain_oov(token_sym_table, chars): - continue - lexicon.append((word, chars)) - - # The OOV word is - lexicon.append(("", [""])) - return lexicon - - -def generate_tokens(text_file: str) -> Dict[str, int]: - """Generate tokens from the given text file. - - Args: - text_file: - A file that contains text lines to generate tokens. - Returns: - Return a dict whose keys are tokens and values are token ids ranged - from 0 to len(keys) - 1. - """ - token2id: Dict[str, int] = dict() - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - char, index = line.replace('\n', '').split() - assert char not in token2id - token2id[char] = int(index) - assert token2id[''] == 0 - return token2id - - -def generate_words(text_file: str) -> Dict[str, int]: - """Generate words from the given text file. - - Args: - text_file: - A file that contains text lines to generate words. - Returns: - Return a dict whose keys are words and values are words ids ranged - from 0 to len(keys) - 1. 
- """ - words = [] - with open(text_file, "r", encoding="utf-8") as f: - for line in f: - word = line.replace('\n', '') - assert word not in words - words.append(word) - words.sort() - - # We put '' '' at begining of word2id - # '#0', '', '' at end of word2id - words = [word for word in words - if word not in ['', '', '#0', '', '']] - words.insert(0, '') - words.insert(1, '') - words.append('#0') - words.append('') - words.append('') - word2id = {j: i for i, j in enumerate(words)} - return word2id - - -def main(): - token2id = generate_tokens(sys.argv[1]) - word2id = generate_words(sys.argv[2]) - tgt_dir = Path(sys.argv[3]) - - words = [word for word in word2id.keys() - if word not in - ["", "!SIL", "", "", "#0", "", ""]] - lexicon = generate_lexicon(token2id, words) - - lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) - next_token_id = max(token2id.values()) + 1 - for i in range(max_disambig + 1): - disambig = f"#{i}" - assert disambig not in token2id - token2id[disambig] = next_token_id - next_token_id += 1 - - write_mapping(tgt_dir / "tokens.txt", token2id) - write_mapping(tgt_dir / "words.txt", word2id) - write_lexicon(tgt_dir / "lexicon.txt", lexicon) - write_lexicon(tgt_dir / "lexicon_disambig.txt", lexicon_disambig) - - L = lexicon_to_fst_no_sil( - lexicon, - token2id=token2id, - word2id=word2id, - ) - L_disambig = lexicon_to_fst_no_sil( - lexicon_disambig, - token2id=token2id, - word2id=word2id, - need_self_loops=True, - ) - torch.save(L.as_dict(), tgt_dir / "L.pt") - torch.save(L_disambig.as_dict(), tgt_dir / "L_disambig.pt") - - -if __name__ == "__main__": - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/latency_metrics.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/latency_metrics.py deleted file mode 100644 index df2d8eee45f8e2d7c8536f208d44fafaeac3341f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/latency_metrics.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2022 Horizon Inc. (author: Xingchen Song) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os -import argparse -import logging -import librosa -import torch -import torchaudio -import yaml - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.font_manager as fm -import torchaudio.compliance.kaldi as kaldi - -from wenet.utils.init_model import init_model -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.mask import make_pad_mask -from wenet.utils.common import replace_duplicates_with_blank - - -def get_args(): - parser = argparse.ArgumentParser( - description='Analyze latency and plot CTC-Spike.') - parser.add_argument('--config', required=True, - type=str, help='configration') - parser.add_argument('--gpu', - type=int, - default=0, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--ckpt', required=True, - type=str, help='model checkpoint') - parser.add_argument('--tag', required=True, - type=str, help='image subtitle') - parser.add_argument('--wavscp', required=True, - type=str, help='wav.scp') - parser.add_argument('--alignment', required=True, - type=str, help='force alignment, generated by Kaldi.') - parser.add_argument('--chunk_size', required=True, - type=int, help='chunk size') - parser.add_argument('--left_chunks', default=-1, - type=int, help='left chunks') - parser.add_argument('--font', required=True, - type=str, help='font file') - parser.add_argument('--dict', required=True, - type=str, help='dict file') - parser.add_argument('--result_dir', required=True, - type=str, help='saving pdf') - parser.add_argument('--model_type', default='ctc', - choices=['ctc', 'transducer'], - help='show latency metrics from ctc models or rnn-t models') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - torch.manual_seed(777) - - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - symbol_table = read_symbol_table(args.dict) - char_dict = {v: k for k, v in symbol_table.items()} - - # 1. Load model - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - - model = init_model(conf) - load_checkpoint(model, args.ckpt) - model = model.eval().to(device) - - subsampling = model.encoder.embed.subsampling_rate - eos = model.eos_symbol() - - with open(args.wavscp, 'r') as fin: - wavs = fin.readlines() - - # 2. 
Forward model (get streaming_timestamps) - timestamps = {} - for idx, wav in enumerate(wavs): - if idx % 100 == 0: - logging.info("processed {}.".format(idx)) - key, wav = wav.strip().split(' ', 1) - waveform, sr = torchaudio.load(wav) - resample_rate = conf['dataset_conf']['resample_conf']['resample_rate'] - waveform = torchaudio.transforms.Resample( - orig_freq=sr, new_freq=resample_rate)(waveform) - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank( - waveform, - num_mel_bins=conf['dataset_conf']['fbank_conf']['num_mel_bins'], - frame_length=conf['dataset_conf']['fbank_conf']['frame_length'], - frame_shift=conf['dataset_conf']['fbank_conf']['frame_shift'], - dither=0.0, energy_floor=0.0, - sample_frequency=resample_rate, - ) - - speech = mat.unsqueeze(0).to(device) - speech_lengths = torch.tensor([mat.size(0)]).to(device) - - # Let's assume batch_size = 1 - encoder_out, encoder_mask = model.encoder( - speech, speech_lengths, args.chunk_size, args.left_chunks) - - maxlen = encoder_out.size(1) # (B, maxlen, encoder_dim) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # CTC greedy search - if args.model_type == 'ctc': - ctc_probs = model.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - topk_prob = topk_prob.view(1, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, eos) # (B, maxlen) - topk_prob = topk_prob.masked_fill_(mask, 0.0) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - hyps = [replace_duplicates_with_blank(hyp) for hyp in hyps] - scores = [prob.tolist() for prob in topk_prob] - timestamps[key] = [hyps[0], scores[0], wav] - - if args.model_type == 'transducer': - hyps = [] - scores = [] - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = 1 - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, - padding, cache) - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - scores.append(torch.max(joint_out_probs).item()) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or \ - per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - hyps.append(model.blank) - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - timestamps[key] = [hyps, scores, wav] - - # 3. 
Analyze latency - with open(args.alignment, 'r') as fin: - aligns = fin.readlines() - not_found, len_unequal, ignored = 0, 0, 0 - datas = [] - for align in aligns: - key, align = align.strip().split(' ', 1) - if key not in timestamps: - not_found += 1 - continue - fa, st = [], [] # force_alignment, streaming_timestamps - text_fa, text_st = "", "" - for i, token in enumerate(align.split()): - if token != '': - text_fa += token - # NOTE(xcsong): W/O subsample - fa.append(i * 10) - # ignore alignment_errors >= 70ms - frames_fa = len(align.split()) - frames_st = len(timestamps[key][0]) * subsampling - if abs(frames_st - frames_fa) >= 7: - ignored += 1 - continue - for i, token_id in enumerate(timestamps[key][0]): - if token_id != 0: - text_st += char_dict[token_id] - # NOTE(xcsong): W subsample - st.append(i * subsampling * 10) - if len(fa) != len(st): - len_unequal += 1 - continue - # datas[i] = [key, text_fa, text_st, list_of_diff, - # FirstTokenDelay, LastTokenDelay, AvgTokenDelay, - # streaming_timestamps, force_alignment] - datas.append([key, text_fa, text_st, - [a - b for a, b in zip(st, fa)], - st[0] - fa[0], st[-1] - fa[-1], - (sum(st) - sum(fa)) / len(st), - timestamps[key], align.split()]) - - logging.info("not found: {}, length unequal: {}, ignored: {}, \ - valid samples: {}".format(not_found, len_unequal, ignored, len(datas))) - - # 4. Plot and print - num_datas = len(datas) - names = ['FirstTokenDelay', 'LastTokenDelay', 'AvgTokenDelay'] - names_index = [4, 5, 6] - parts = ['max', 'P90', 'P75', 'P50', 'P25', 'min'] - parts_index = [num_datas - 1, int(num_datas * 0.90), int(num_datas * 0.75), - int(num_datas * 0.50), int(num_datas * 0.25), 0] - for name, name_idx in zip(names, names_index): - def f(name_idx=name_idx): - return name_idx - datas.sort(key=lambda x: x[f()]) - logging.info("==========================") - for p, i in zip(parts, parts_index): - data = datas[i] - # i.e., LastTokenDelay P90: 270.000 ms (wav_id: BAC009S0902W0144) - logging.info("{} {}: {:.3f} ms (wav_id: {})".format( - name, p, data[f()], datas[i][0])) - - font = fm.FontProperties(fname=args.font) - plt.rcParams['axes.unicode_minus'] = False - # we will have 2 sub-plots (force-align + streaming timestamps) - # plus one wav-plot - fig, axes = plt.subplots(figsize=(60, 60), nrows=3, ncols=1) - for j in range(2): - if j == 0: - # subplot-0: streaming_timestamps - plt_prefix = args.tag + "_" + name + "_" + p - x = np.arange(len(data[7][0])) * subsampling - hyps, scores = data[7][0], data[7][1] - else: - # subplot-1: force_alignments - plt_prefix = "force_alignment" - x = np.arange(len(data[8])) - hyps = [symbol_table[d] for d in data[8]] - scores = [0.0] * len(data[8]) - axes[j].set_title(plt_prefix, fontsize=30) - for frame, token, prob in zip(x, hyps, scores): - if char_dict[token] != '': - axes[j].bar( - frame, np.exp(prob), - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].text( - frame, np.exp(prob), - '{} {:.3f} {}'.format( - char_dict[token], np.exp(prob), frame), - fontdict=dict(fontsize=24), - fontproperties=font, - ) - else: - axes[j].bar( - frame, 0.01, - label='{} {:.3f}'.format( - char_dict[token], np.exp(prob)), - ) - axes[j].tick_params(labelsize=25) - - # subplot-2: wav - # wav, hardcode sample_rate to 16000 - samples, sr = librosa.load(data[7][2], sr=16000) - time = np.arange(0, len(samples)) * (1.0 / sr) - axes[-1].plot(time, samples) - - # i.e., RESULT_DIR/LTD_P90_120ms_BAC009S0768W0342.pdf - plt.savefig(args.result_dir + "/" + name + "_" + - p + "_" + str(data[f()]) 
+ "ms" + "_" + data[0] + ".pdf") - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/make_raw_list.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/make_raw_list.py deleted file mode 100644 index 2f84f015542bb38da027b8ea61e8638f873cec33..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/make_raw_list.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('output_file', help='output list file') - args = parser.parse_args() - - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - if args.segments is not None: - segments_table = {} - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - with open(args.text_file, 'r', encoding='utf8') as fin, \ - open(args.output_file, 'w', encoding='utf8') as fout: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if args.segments is None: - assert key in wav_table - wav = wav_table[key] - line = dict(key=key, wav=wav, txt=txt) - else: - assert key in segments_table - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - line = dict(key=key, wav=wav, txt=txt, start=start, end=end) - json_line = json.dumps(line, ensure_ascii=False) - fout.write(json_line + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/make_shard_list.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/make_shard_list.py deleted file mode 100644 index 1f7d82829808c9cc181bbc5e0f60cccef8795bae..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/make_shard_list.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import io -import logging -import os -import tarfile -import time -import multiprocessing - -import torch -import torchaudio -import torchaudio.backend.sox_io_backend as sox - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def write_tar_file(data_list, - no_segments, - tar_file, - resample=16000, - index=0, - total=1): - logging.info('Processing {} {}/{}'.format(tar_file, index, total)) - read_time = 0.0 - save_time = 0.0 - write_time = 0.0 - with tarfile.open(tar_file, "w") as tar: - prev_wav = None - for item in data_list: - if no_segments: - key, txt, wav = item - else: - key, txt, wav, start, end = item - - suffix = wav.split('.')[-1] - assert suffix in AUDIO_FORMAT_SETS - if no_segments: - ts = time.time() - with open(wav, 'rb') as fin: - data = fin.read() - read_time += (time.time() - ts) - else: - if wav != prev_wav: - ts = time.time() - waveforms, sample_rate = sox.load(wav, normalize=False) - read_time += (time.time() - ts) - prev_wav = wav - start = int(start * sample_rate) - end = int(end * sample_rate) - audio = waveforms[:1, start:end] - - # resample - if sample_rate != resample: - if not audio.is_floating_point(): - # normalize the audio before resample - # because resample can't process int audio - audio = audio / (1 << 15) - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - audio = (audio * (1 << 15)).short() - else: - audio = torchaudio.transforms.Resample( - sample_rate, resample)(audio) - - ts = time.time() - f = io.BytesIO() - sox.save(f, audio, resample, format="wav", bits_per_sample=16) - # Save to wav for segments file - suffix = "wav" - f.seek(0) - data = f.read() - save_time += (time.time() - ts) - - assert isinstance(txt, str) - ts = time.time() - txt_file = key + '.txt' - txt = txt.encode('utf8') - txt_data = io.BytesIO(txt) - txt_info = tarfile.TarInfo(txt_file) - txt_info.size = len(txt) - tar.addfile(txt_info, txt_data) - - wav_file = key + '.' 
+ suffix - wav_data = io.BytesIO(data) - wav_info = tarfile.TarInfo(wav_file) - wav_info.size = len(data) - tar.addfile(wav_info, wav_data) - write_time += (time.time() - ts) - logging.info('read {} save {} write {}'.format(read_time, save_time, - write_time)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument('--num_utts_per_shard', - type=int, - default=1000, - help='num utts per shard') - parser.add_argument('--num_threads', - type=int, - default=1, - help='num threads for make shards') - parser.add_argument('--prefix', - default='shards', - help='prefix of shards tar file') - parser.add_argument('--segments', default=None, help='segments file') - parser.add_argument('--resample', - type=int, - default=16000, - help='segments file') - parser.add_argument('wav_file', help='wav file') - parser.add_argument('text_file', help='text file') - parser.add_argument('shards_dir', help='output shards dir') - parser.add_argument('shards_list', help='output shards list file') - args = parser.parse_args() - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') - - torch.set_num_threads(1) - wav_table = {} - with open(args.wav_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - wav_table[arr[0]] = arr[1] - - no_segments = True - segments_table = {} - if args.segments is not None: - no_segments = False - with open(args.segments, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 4 - segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3])) - - data = [] - with open(args.text_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split(maxsplit=1) - key = arr[0] - txt = arr[1] if len(arr) > 1 else '' - if no_segments: - assert key in wav_table - wav = wav_table[key] - data.append((key, txt, wav)) - else: - wav_key, start, end = segments_table[key] - wav = wav_table[wav_key] - data.append((key, txt, wav, start, end)) - - num = args.num_utts_per_shard - chunks = [data[i:i + num] for i in range(0, len(data), num)] - os.makedirs(args.shards_dir, exist_ok=True) - - # Using thread pool to speedup - pool = multiprocessing.Pool(processes=args.num_threads) - shards_list = [] - tasks_list = [] - num_chunks = len(chunks) - for i, chunk in enumerate(chunks): - tar_file = os.path.join(args.shards_dir, - '{}_{:09d}.tar'.format(args.prefix, i)) - shards_list.append(tar_file) - pool.apply_async( - write_tar_file, - (chunk, no_segments, tar_file, args.resample, i, num_chunks)) - - pool.close() - pool.join() - - with open(args.shards_list, 'w', encoding='utf8') as fout: - for name in shards_list: - fout.write(name + '\n') diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/merge_scp2txt.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/merge_scp2txt.py deleted file mode 100644 index 51f1c42f272f0fd9fec0a7d69ee860d2f1eb6158..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/merge_scp2txt.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -from distutils.util import strtobool -from io import open -import logging -import sys - -PY2 = sys.version_info[0] == 2 -sys.stdin = codecs.getreader('utf-8')(sys.stdin if PY2 else sys.stdin.buffer) -sys.stdout = codecs.getwriter('utf-8')( - 
sys.stdout if PY2 else sys.stdout.buffer) - - -# Special types: -def shape(x): - """Change str to List[int] - - >>> shape('3,5') - [3, 5] - >>> shape(' [3, 5] ') - [3, 5] - - """ - - # x: ' [3, 5] ' -> '3, 5' - x = x.strip() - if x[0] == '[': - x = x[1:] - if x[-1] == ']': - x = x[:-1] - - return list(map(int, x.split(','))) - - -def get_parser(): - parser = argparse.ArgumentParser( - description='Given each file paths with such format as ' - '::. type> can be omitted and the default ' - 'is "str". e.g. {} ' - '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' - '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' - '--output-scps text:data/text shape:data/utt2text_shape:shape ' - '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--input-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the inputs') - parser.add_argument('--output-scps', - type=str, - nargs='*', - action='append', - default=[], - help='files for the outputs') - parser.add_argument('--scps', - type=str, - nargs='+', - default=[], - help='The files except for the input and outputs') - parser.add_argument('--verbose', - '-V', - default=1, - type=int, - help='Verbose option') - parser.add_argument('--allow-one-column', - type=strtobool, - default=False, - help='Allow one column in input scp files. ' - 'In this case, the value will be empty string.') - parser.add_argument('--out', - '-O', - type=str, - help='The output filename. ' - 'If omitted, then output to sys.stdout') - return parser - - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - args.scps = [args.scps] - - # logging info - logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" - if args.verbose > 0: - logging.basicConfig(level=logging.INFO, format=logfmt) - else: - logging.basicConfig(level=logging.WARN, format=logfmt) - - inputs = {} - assert (len(args.input_scps) == 1) - for f in args.input_scps[0]: - arr = f.strip().split(':') - inputs[arr[0]] = arr[1] - assert ('feat' in inputs) - assert ('shape' in inputs) - - outputs = {} - assert (len(args.output_scps) == 1) - for f in args.output_scps[0]: - arr = f.strip().split(':') - outputs[arr[0]] = arr[1] - assert ('shape' in outputs) - assert ('text' in outputs) - assert ('token' in outputs) - assert ('tokenid' in outputs) - - files = [ - inputs['feat'], inputs['shape'], outputs['text'], outputs['token'], - outputs['tokenid'], outputs['shape'] - ] - fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape'] - fids = [open(f, 'r', encoding='utf-8') for f in files] - - if args.out is None: - out = sys.stdout - else: - out = open(args.out, 'w', encoding='utf-8') - done = False - while not done: - for i, fid in enumerate(fids): - line = fid.readline() - if line == '': - done = True - break - arr = line.strip().split() - content = ' '.join(arr[1:]) - if i == 0: - out.write('utt:{}'.format(arr[0])) - out.write('\t') - out.write('{}:{}'.format(fields[i], content)) - out.write('\n') - - for f in fids: - f.close() - if args.out is not None: - out.close() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/onnx2horizonbin.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/onnx2horizonbin.py deleted file mode 100644 index a94b647fb19d1446d4bc506c399c85677dddde9f..0000000000000000000000000000000000000000 --- 
a/models/audio/speech_recognition/conformer/igie/wenet/tools/onnx2horizonbin.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. specific decoding method: ctc_greedy_search -""" - -import argparse -import copy -import logging -import os -import sys -import random -import torch -import yaml -import numpy as np - -from torch.utils.data import DataLoader - -from wenet.utils.common import remove_duplicates_and_blank -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import to_numpy -from wenet.bin.export_onnx_bpu import export_encoder, export_ctc - - -try: - import hbdk # noqa: F401 - import horizon_nn # noqa: F401 - from horizon_tc_ui import HB_ONNXRuntime -except ImportError: - print('Please install hbdk,horizon_nn,horizon_tc_ui !') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def save_data(tensor, dirs, prefix): - if tensor.requires_grad: - data = tensor.detach().numpy().astype(np.float32) - else: - data = tensor.numpy().astype(np.float32) - os.makedirs(dirs, exist_ok=True) - data.tofile(dirs + "/" + prefix + ".bin") - - -def make_calibration_data(enc, args, conf): - conf['shuffle'] = True - logger.info(conf) - dataset = Dataset( - "shard", args.cali_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - cal_data_dir = os.path.join(args.output_dir, 'cal_data_dir') - for batch_idx, batch in enumerate(dataloader): - if batch_idx >= args.max_samples: - break - if 
batch_idx % 100 == 0: - logger.info("processed {} samples.".format(batch_idx)) - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - - # Feed forward overlap input step by step - random_high = (num_frames - context) // stride - num_rand = random.randint(0, random_high) - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - if i == num_rand: - save_data(chunk, "{}/chunk".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_cache, "{}/att_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(cnn_cache, "{}/cnn_cache".format(cal_data_dir), - prefix + "." + str(i)) - save_data(att_mask, "{}/att_mask".format(cal_data_dir), - prefix + "." + str(i)) - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - # NOTE(xcsong): It's fast to calibrate ctc.onnx, - # so it's okay to save all chunks - save_data(y, "{}/hidden".format(cal_data_dir), - prefix + "." 
+ str(i)) - - -def check_wer(enc, ctc, args, conf): - conf['shuffle'] = False - dataset = Dataset( - "shard", args.wer_datalist, args.symbol_table, conf, - bpe_model=args.bpe_model, non_lang_syms=None, partition=False) - dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - char_dict = {v: k for k, v in args.symbol_table.items()} - eos = len(char_dict) - 1 - - enc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_encoder/encoder_quantized_model.onnx") - ctc_session = HB_ONNXRuntime( - model_file=args.output_dir + - "/hb_makertbin_output_ctc/ctc_quantized_model.onnx") - torch_file = open(args.output_dir + "/torch_text", 'w', encoding="utf-8") - onnx_file = open(args.output_dir + "/onnx_text", 'w', encoding="utf-8") - subsampling = enc.embed.subsampling_rate - context = enc.embed.right_context + 1 # Add current frame - stride = subsampling * args.chunk_size - decoding_window = (args.chunk_size - 1) * subsampling + context - required_cache_size = args.chunk_size * args.num_decoding_left_chunks - num_layers = len(enc.encoders) - head, d_k = enc.encoders[0].self_attn.h, enc.encoders[0].self_attn.d_k - dim, lorder = enc._output_size, enc.encoders[0].conv_module.lorder - chunk_size, left_chunks = args.chunk_size, args.num_decoding_left_chunks - for batch_idx, batch in enumerate(dataloader): - keys, feats, target, feats_lengths, target_lengths = batch - num_frames, prefix = feats.size(1), keys[0] - att_cache = torch.zeros( - [1, head * num_layers, d_k * 2, required_cache_size], - dtype=feats.dtype, device=feats.device) - att_mask = torch.ones( - [1, head, chunk_size, required_cache_size + chunk_size], - dtype=feats.dtype, device=feats.device) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros( - [1, dim, num_layers, lorder], - dtype=feats.dtype, device=feats.device) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - - # Feed forward overlap input step by step - torch_out, onnx_out = [], [] - for i, cur in enumerate(range(0, num_frames - context + 1, stride)): - att_mask[:, :, :, -(chunk_size * (i + 1)):] = 1 - end = min(cur + decoding_window, num_frames) - chunk = feats[:, cur:end, :].unsqueeze(0) # (1, 1, window, mel) - if end == num_frames and end - cur < decoding_window: # last chunk - pad_len = decoding_window - (end - cur) # 67 - (35) - pad_chunk = torch.zeros((1, 1, pad_len, chunk.size(-1)), - device=feats.device) - chunk = torch.cat((chunk, pad_chunk), - dim=2) # (1, 1, win, mel) - if pad_len >= subsampling: - att_mask[:, :, :, -(pad_len // subsampling):] = 0 - # Torch model - (y, att_cache, cnn_cache) = enc.forward( - xs=chunk, att_cache=att_cache, - cnn_cache=cnn_cache, att_mask=att_mask) - torch_out.append(ctc.forward(y).transpose(1, 3).squeeze(2)) - # Quantized onnx model - ort_inputs = { - 'chunk': to_numpy(chunk), 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': to_numpy(att_mask)} - ort_outs = enc_session.run_feature( - enc_session.output_names, ort_inputs, input_offset=0) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_y = ctc_session.run_feature( - ctc_session.output_names, {'hidden': ort_outs[0]}, input_offset=0) - onnx_out.append(torch.from_numpy( - np.squeeze(onnx_y[0].transpose(0, 3, 2, 1), axis=2))) - - def post_process(list_out, file_obj, keys): - probs = torch.cat(list_out, dim=1) - maxlen = probs.size(1) - topk_prob, topk_index = probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(1, maxlen) # (B, maxlen) - hyps = 
[hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - for i, key in enumerate(keys): - content = '' - for w in hyps[i]: - if w == eos: - break - content += char_dict[w] - file_obj.write('{} {}\n'.format(key, content)) - return key, content - - if len(torch_out) > 0 and len(onnx_out) > 0: - key, content = post_process(torch_out, torch_file, keys) - logger.info('torch: {} {}'.format(key, content)) - key, content = post_process(onnx_out, onnx_file, keys) - logger.info('onnx : {} {}'.format(key, content)) - torch_file.close() - onnx_file.close() - - -def generate_config(enc_session, ctc_session, args): - template = """ -# 模型参数组 -model_parameters: - # 原始Onnx浮点模型文件 - onnx_model: '{}' - # 转换的目标AI芯片架构 - march: 'bernoulli2' - # 模型转换输出的用于上板执行的模型文件的名称前缀 - output_model_file_prefix: '{}' - # 模型转换输出的结果的存放目录 - working_dir: '{}' - # 指定转换后混合异构模型是否保留输出各层的中间结果的能力 - layer_out_dump: False - # 转换过程中日志生成级别 - log_level: 'debug' -# 输入信息参数组 -input_parameters: - # 原始浮点模型的输入节点名称 - input_name: '{}' - # 原始浮点模型的输入数据格式(数量/顺序与input_name一致) - input_type_train: '{}' - # 原始浮点模型的输入数据排布(数量/顺序与input_name一致) - input_layout_train: '{}' - # 原始浮点模型的输入数据尺寸 - input_shape: '{}' - # 网络实际执行时,输入给网络的batch_size 默认值为1 - # input_batch: 1 - # 在模型中添加的输入数据预处理方法 - norm_type: '{}' - # 预处理方法的图像减去的均值; 如果是通道均值,value之间必须用空格分隔 - # mean_value: '' - # 预处理方法的图像缩放比例,如果是通道缩放比例,value之间必须用空格分隔 - # scale_value: '' - # 转换后混合异构模型需要适配的输入数据格式(数量/顺序与input_name一致) - input_type_rt: '{}' - # 输入数据格式的特殊制式 - input_space_and_range: '' - # 转换后混合异构模型需要适配的输入数据排布(数量/顺序与input_name一致) - input_layout_rt: '{}' -# 校准参数组 -calibration_parameters: - # 模型校准使用的标定样本的存放目录 - cal_data_dir: '{}' - # 开启图片校准样本自动处理(skimage read resize到输入节点尺寸) - preprocess_on: False - # 校准使用的算法类型 - calibration_type: '{}' - # max 校准方式的参数 - max_percentile: 1.0 - # 强制指定OP在CPU上运行 - run_on_cpu: '{}' - # 强制指定OP在BPU上运行 - run_on_bpu: '{}' -# 编译参数组 -compiler_parameters: - # 编译策略选择 - compile_mode: 'latency' - # 是否打开编译的debug信息 - debug: False - # 模型运行核心数 - core_num: 1 - # 模型编译的优化等级选择 - optimize_level: 'O3' -""" - output_dir = os.path.realpath(args.output_dir) - cal_data_dir = os.path.join(output_dir, 'cal_data_dir') - os.makedirs(cal_data_dir, exist_ok=True) - enc_dic = enc_session.get_modelmeta().custom_metadata_map - enc_onnx_path = os.path.join(output_dir, 'encoder.onnx') - enc_log_path = os.path.join(output_dir, 'hb_makertbin_output_encoder') - enc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in enc_dic['input_name'].split(';')]) - ctc_dic = ctc_session.get_modelmeta().custom_metadata_map - ctc_onnx_path = os.path.join(output_dir, 'ctc.onnx') - ctc_log_path = os.path.join(output_dir, 'hb_makertbin_output_ctc') - ctc_cal_data = ";".join( - [cal_data_dir + "/" + x for x in ctc_dic['input_name'].split(';')]) - enc_config = template.format( - enc_onnx_path, "encoder", enc_log_path, - enc_dic['input_name'], enc_dic['input_type'], - enc_dic['input_layout_train'], enc_dic['input_shape'], - enc_dic['norm_type'], enc_dic['input_type'], enc_dic['input_layout_rt'], - enc_cal_data, args.calibration_type, args.extra_ops_run_on_cpu, "") - ctc_config = template.format( - ctc_onnx_path, "ctc", ctc_log_path, - ctc_dic['input_name'], ctc_dic['input_type'], - ctc_dic['input_layout_train'], ctc_dic['input_shape'], - ctc_dic['norm_type'], ctc_dic['input_type'], ctc_dic['input_layout_rt'], - ctc_cal_data, "default", "", "") - with open(output_dir + "/config_encoder.yaml", "w") as enc_yaml: - enc_yaml.write(enc_config) - with open(output_dir + 
"/config_ctc.yaml", "w") as ctc_yaml: - ctc_yaml.write(ctc_config) - - -def get_args(): - parser = argparse.ArgumentParser(description='convert onnx to horizon .bin') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - parser.add_argument('--dict', type=str, required=True, help='dict file') - parser.add_argument('--max_samples', type=int, required=True, - help='maximum samples') - parser.add_argument('--cali_datalist', type=str, default=None, - help='make calibration data') - parser.add_argument('--wer_datalist', type=str, default=None, - help='check wer') - parser.add_argument('--wer_text', type=str, default=None, - help='check wer') - parser.add_argument('--bpe_model', default=None, type=str, - help='bpe model for english part') - parser.add_argument('--ln_run_on_bpu', action='store_true', - help='layernorm running on bpu') - parser.add_argument('--extra_ops_run_on_cpu', type=str, default=None, - help='extra operations running on cpu.') - parser.add_argument('--calibration_type', type=str, default='default', - help='kl / max / default.') - return parser - - -if __name__ == '__main__': - random.seed(777) - parser = get_args() - args = parser.parse_args() - # NOTE(xcsong): X3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(conf) - load_checkpoint(model, args.checkpoint) - model.eval() - - symbol_table = read_symbol_table(args.dict) - args.symbol_table = symbol_table - args.feature_size = conf['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - logger.info("Stage-1: Export onnx") - enc, enc_session = export_encoder(model, args) - ctc, ctc_session = export_ctc(model, args) - - conf = copy.deepcopy(conf['dataset_conf']) - conf['filter_conf']['max_length'] = 102400 - conf['filter_conf']['min_length'] = 0 - conf['filter_conf']['token_max_length'] = 102400 - conf['filter_conf']['token_min_length'] = 0 - conf['filter_conf']['max_output_input_ratio'] = 102400 - conf['filter_conf']['min_output_input_ratio'] = 0 - conf['speed_perturb'] = False - conf['spec_aug'] = False - conf['spec_sub'] = False - conf['spec_trim'] = False - conf['shuffle'] = False - conf['sort'] = False - if 'fbank_conf' in conf: - conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in conf: - conf['mfcc_conf']['dither'] = 0.0 - conf['batch_conf']['batch_type'] = "static" - conf['batch_conf']['batch_size'] = 1 - - if args.cali_datalist is not None: - logger.info("Stage-2: Generate config") - # FIXME(xcsong): Remove hard code - logger.info("torch version: {}".format(torch.__version__)) - if int(torch.__version__[:4].split('.')[1]) >= 13: - args.extra_ops_run_on_cpu = "/Split;" + \ - "/encoders.0/self_attn/Split;/encoders.1/self_attn/Split;" + \ - 
"/encoders.2/self_attn/Split;/encoders.3/self_attn/Split;" + \ - "/encoders.4/self_attn/Split;/encoders.5/self_attn/Split;" + \ - "/encoders.6/self_attn/Split;/encoders.7/self_attn/Split;" + \ - "/encoders.8/self_attn/Split;/encoders.9/self_attn/Split;" + \ - "/encoders.10/self_attn/Split;/encoders.11/self_attn/Split;" + \ - "/encoders.0/self_attn/Mul;/encoders.1/self_attn/Mul;" + \ - "/encoders.2/self_attn/Mul;/encoders.3/self_attn/Mul;" + \ - "/encoders.4/self_attn/Mul;/encoders.5/self_attn/Mul;" + \ - "/encoders.6/self_attn/Mul;/encoders.7/self_attn/Mul;" + \ - "/encoders.8/self_attn/Mul;/encoders.9/self_attn/Mul;" + \ - "/encoders.10/self_attn/Mul;/encoders.11/self_attn/Mul;" - else: - args.extra_ops_run_on_cpu = "Split_17;Split_67;Split_209;" + \ - "Split_351;Split_493;Split_635;Split_777;Split_919;Split_1061;" + \ - "Split_1203;Split_1345;Split_1487;Split_1629;" + \ - "Mul_72;Mul_214;Mul_356;Mul_498;Mul_640;Mul_782;" + \ - "Mul_924;Mul_1066;Mul_1208;Mul_1350;Mul_1492;Mul_1634;" - generate_config(enc_session, ctc_session, args) - - logger.info("Stage-3: Make calibration data") - make_calibration_data(enc, args, conf) - - output_dir = os.path.realpath(args.output_dir) - logger.info("Stage-4: Make ctc.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_ctc".format(output_dir) + - " && cd hb_makertbin_log_ctc &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_ctc.yaml") - ) - logger.info("Stage-5: Make encoder.bin") - os.system( - "cd {} && mkdir -p hb_makertbin_log_encoder ".format(output_dir) + - " && cd hb_makertbin_log_encoder &&" + - " hb_mapper makertbin --model-type \"onnx\" --config \"{}\"".format( - output_dir + "/config_encoder.yaml") - ) - - if args.wer_datalist is not None: - logger.info("Stage-6: Check wer between torch model and quantized onnx") - assert args.wer_text is not None - check_wer(enc, ctc, args, conf) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/torch_text", - args.output_dir + "/torch_wer") - ) - os.system( - "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( - args.wer_text, args.output_dir + "/onnx_text", - args.output_dir + "/onnx_wer") - ) - os.system("tail {} {}".format( - args.output_dir + "/torch_wer", args.output_dir + "/onnx_wer")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/parse_options.sh b/models/audio/speech_recognition/conformer/igie/wenet/tools/parse_options.sh deleted file mode 100644 index 34476fdb37a4b14d5fe6e0edbebe97e760d2be5a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Parse command-line options. -# To be sourced by another script (as in ". 
parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0. diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/perturb_data_dir_speed.sh b/models/audio/speech_recognition/conformer/igie/wenet/tools/perturb_data_dir_speed.sh deleted file mode 100644 index 901a4882e6481ae269067b0fe7175dba62c4db9e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/perturb_data_dir_speed.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# 2020 @kamo-naoyuki -# This file was copied from Kaldi and -# I deleted parts related to wav duration -# because we shouldn't use kaldi's command here -# and we don't need the files actually. 
- -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 2014 Tom Ko -# 2018 Emotech LTD (author: Pawel Swietojanski) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# wav.scp -# spk2utt -# utt2spk -# text -# -# It generates the files which are used for perturbing the speed of the original data. - -export LC_ALL=C -set -euo pipefail - -if [[ $# != 3 ]]; then - echo "Usage: perturb_data_dir_speed.sh " - echo "e.g.:" - echo " $0 0.9 data/train_si284 data/train_si284p" - exit 1 -fi - -factor=$1 -srcdir=$2 -destdir=$3 -label="sp" -spk_prefix="${label}${factor}-" -utt_prefix="${label}${factor}-" - -#check is sox on the path - -! command -v sox &>/dev/null && echo "sox: command not found" && exit 1; - -if [[ ! -f ${srcdir}/utt2spk ]]; then - echo "$0: no such file ${srcdir}/utt2spk" - exit 1; -fi - -if [[ ${destdir} == "${srcdir}" ]]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -mkdir -p "${destdir}" - -<"${srcdir}"/utt2spk awk -v p="${utt_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/utt_map" -<"${srcdir}"/spk2utt awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/spk_map" -<"${srcdir}"/wav.scp awk -v p="${spk_prefix}" '{printf("%s %s%s\n", $1, p, $1);}' > "${destdir}/reco_map" -if [[ ! -f ${srcdir}/utt2uniq ]]; then - <"${srcdir}/utt2spk" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $1);}' > "${destdir}/utt2uniq" -else - <"${srcdir}/utt2uniq" awk -v p="${utt_prefix}" '{printf("%s%s %s\n", p, $1, $2);}' > "${destdir}/utt2uniq" -fi - - -<"${srcdir}"/utt2spk utils/apply_map.pl -f 1 "${destdir}"/utt_map | \ - utils/apply_map.pl -f 2 "${destdir}"/spk_map >"${destdir}"/utt2spk - -utils/utt2spk_to_spk2utt.pl <"${destdir}"/utt2spk >"${destdir}"/spk2utt - -if [[ -f ${srcdir}/segments ]]; then - - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/segments | \ - utils/apply_map.pl -f 2 "${destdir}"/reco_map | \ - awk -v factor="${factor}" \ - '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);} }' \ - >"${destdir}"/segments - - utils/apply_map.pl -f 1 "${destdir}"/reco_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - if [[ -f ${srcdir}/reco2file_and_channel ]]; then - utils/apply_map.pl -f 1 "${destdir}"/reco_map \ - <"${srcdir}"/reco2file_and_channel >"${destdir}"/reco2file_and_channel - fi - -else # no segments->wav indexed by utterance. 
- if [[ -f ${srcdir}/wav.scp ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor="${factor}" \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox" $_ " -t wav - speed " factor " |"}}' \ - > "${destdir}"/wav.scp - fi -fi - -if [[ -f ${srcdir}/text ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/text >"${destdir}"/text -fi -if [[ -f ${srcdir}/spk2gender ]]; then - utils/apply_map.pl -f 1 "${destdir}"/spk_map <"${srcdir}"/spk2gender >"${destdir}"/spk2gender -fi -if [[ -f ${srcdir}/utt2lang ]]; then - utils/apply_map.pl -f 1 "${destdir}"/utt_map <"${srcdir}"/utt2lang >"${destdir}"/utt2lang -fi - -rm "${destdir}"/spk_map "${destdir}"/utt_map "${destdir}"/reco_map 2>/dev/null -echo "$0: generated speed-perturbed version of data in ${srcdir}, in ${destdir}" - -utils/validate_data_dir.sh --no-feats --no-text "${destdir}" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/reduce_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/tools/reduce_data_dir.sh deleted file mode 100644 index 16194dcc7309a646041181a698c53cd4f46e618b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/reduce_data_dir.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# koried, 10/29/2012 - -# Reduce a data set based on a list of turn-ids - -help_message="usage: $0 srcdir turnlist destdir" - -if [ $1 == "--help" ]; then - echo "${help_message}" - exit 0; -fi - -if [ $# != 3 ]; then - echo "${help_message}" - exit 1; -fi - -srcdir=$1 -reclist=$2 -destdir=$3 - -if [ ! -f ${srcdir}/utt2spk ]; then -echo "$0: no such file $srcdir/utt2spk" -exit 1; -fi - -function do_filtering { -# assumes the utt2spk and spk2utt files already exist. - [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp - [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text - [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames - [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender - [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp - if [ -f ${srcdir}/segments ]; then - utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments - awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. 
- [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp - [ -f ${srcdir}/reco2file_and_channel ] && \ - utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel - - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm - rm ${destdir}/reco - fi - srcutts=$(wc -l < ${srcdir}/utt2spk) - destutts=$(wc -l < ${destdir}/utt2spk) - echo "Reduced #utt from $srcutts to $destutts" -} - -mkdir -p ${destdir} - -# filter the utt2spk based on the set of recordings -utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk - -utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt -do_filtering; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/remove_longshortdata.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/remove_longshortdata.py deleted file mode 100644 index 7e92f8a424d2d717acf6fc1db5503f79ba38a898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/remove_longshortdata.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='remove too long or too short data in format.data') - parser.add_argument('--data_file', - type=str, - help='input format data') - parser.add_argument('--output_data_file', - type=str, - help='output format data') - parser.add_argument('--min_input_len', type=float, - default=0, - help='minimum input seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--max_input_len', type=float, - default=20, - help='maximum output seq length, in seconds for raw wav, \ - in frame numbers for feature data') - parser.add_argument('--min_output_len', type=float, - default=0, help='minimum input seq length, in modeling units') - parser.add_argument('--max_output_len', type=float, - default=500, - help='maximum output seq length, in modeling units') - parser.add_argument('--min_output_input_ratio', type=float, default=0.05, - help='minimum output seq length/output seq length ratio') - parser.add_argument('--max_output_input_ratio', type=float, default=10, - help='maximum output seq length/output seq length ratio') - args = parser.parse_args() - - data_file = args.data_file - output_data_file = args.output_data_file - min_input_len = args.min_input_len - max_input_len = args.max_input_len - min_output_len = args.min_output_len - max_output_len = args.max_output_len - min_output_input_ratio = args.min_output_input_ratio - max_output_input_ratio = args.max_output_input_ratio - - with open(data_file, 'r') as f, open(output_data_file, 'w') as fout: - for l in f: - l = l.strip() - if l: - items = l.strip().split('\t') - token_shape = items[6] - feature_shape = items[2] - feat_len = float(feature_shape.split(':')[1].split(',')[0]) - token_len = float(token_shape.split(':')[1].split(',')[0]) - condition = [feat_len > min_input_len, - feat_len < max_input_len, - token_len > min_output_len, - token_len < max_output_len, - token_len / feat_len > min_output_input_ratio, - token_len / feat_len < max_output_input_ratio, - ] - if all(condition): - fout.write('{}\n'.format(l)) - continue diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/segment.py 
b/models/audio/speech_recognition/conformer/igie/wenet/tools/segment.py deleted file mode 100644 index a1a7f93a05fbaf42ca09c26c0e5be6a7185f0d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/segment.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2021 Mobvoi Inc. (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 - -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='generate segmented wav.scp') - parser.add_argument('--segments', required=True, help='segments file') - parser.add_argument('--input', - required=True, - help='origin wav.scp that not segmented') - parser.add_argument('--output', - required=True, - help='output segmented wav.scp') - wav_dic = {} - args = parser.parse_args() - ori_wav = args.input - segment_file = args.segments - wav_scp = args.output - with open(ori_wav, 'r') as ori: - for l in ori: - item = l.strip().split() - wav_dic[item[0]] = item[1] - with open(wav_scp, 'w') as f, open(segment_file, 'r') as sgement: - for l in sgement: - item = l.strip().split() - if item[1] in wav_dic: - item[1] = wav_dic[item[1]] - f.write("{} {},{},{}\n".format(item[0], item[1], item[2], item[3])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/setup_anaconda.sh b/models/audio/speech_recognition/conformer/igie/wenet/tools/setup_anaconda.sh deleted file mode 100644 index f53ace9cc4c19994fc79d01e85d70f49d40d673f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/setup_anaconda.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env bash -# NOTE(hslee): this code is borrowed from ESPnet (https://github.com/espnet/espnet) -set -euo pipefail - -if [ -z "${PS1:-}" ]; then - PS1=__dummy__ -fi -CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - -if [ $# -gt 4 ]; then - echo "Usage: $0 [output] [conda-env-name] [python-version>]" - exit 1; -elif [ $# -eq 3 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="$3" -elif [ $# -eq 2 ]; then - output_dir="$1" - name="$2" - PYTHON_VERSION="" -elif [ $# -eq 1 ]; then - output_dir="$1" - name="" - PYTHON_VERSION="" -elif [ $# -eq 0 ]; then - output_dir=venv - name="" - PYTHON_VERSION="" -fi - -if [ -e activate_python.sh ]; then - echo "Warning: activate_python.sh already exists. It will be overwritten" -fi - -if [ ! -e "${output_dir}/etc/profile.d/conda.sh" ]; then - if [ ! -e miniconda.sh ]; then - wget --tries=3 "${CONDA_URL}" -O miniconda.sh - fi - - bash miniconda.sh -b -p "${output_dir}" -fi - -# shellcheck disable=SC1090 -source "${output_dir}/etc/profile.d/conda.sh" -conda deactivate - -# If the env already exists, skip recreation -if [ -n "${name}" ] && ! conda activate ${name}; then - conda create -yn "${name}" -fi -conda activate ${name} - -if [ -n "${PYTHON_VERSION}" ]; then - conda install -y conda "python=${PYTHON_VERSION}" -else - conda install -y conda -fi - -conda install -y pip setuptools - -cat << EOF > activate_python.sh -#!/usr/bin/env bash -# THIS FILE IS GENERATED BY tools/setup_anaconda.sh -if [ -z "\${PS1:-}" ]; then - PS1=__dummy__ -fi -. 
$(cd ${output_dir}; pwd)/etc/profile.d/conda.sh && conda deactivate && conda activate ${name} -EOF diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/sph2wav.sh b/models/audio/speech_recognition/conformer/igie/wenet/tools/sph2wav.sh deleted file mode 100644 index a8f0749e3be2ee69b5831da6699c303510ecbed4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/sph2wav.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -# convert sph scp to segmented wav scp -nj=1 -. tools/parse_options.sh || exit 1; - -inscp=$1 -segments=$2 -outscp=$3 -data=$(dirname ${inscp}) -if [ $# -eq 4 ]; then - logdir=$4 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -sph2pipe_version="v2.5" -if [ ! -d tools/sph2pipe_${sph2pipe_version} ]; then - echo "Download sph2pipe_${sph2pipe_version} ......" - wget -T 10 -t 3 -P tools https://www.openslr.org/resources/3/sph2pipe_${sph2pipe_version}.tar.gz || \ - wget -T 10 -c -P tools https://sourceforge.net/projects/kaldi/files/sph2pipe_${sph2pipe_version}.tar.gz; \ - tar --no-same-owner -xzf tools/sph2pipe_${sph2pipe_version}.tar.gz -C tools - cd tools/sph2pipe_${sph2pipe_version}/ && \ - gcc -o sph2pipe *.c -lm - cd - -fi -sph2pipe=`which sph2pipe` || sph2pipe=`pwd`/tools/sph2pipe_${sph2pipe_version}/sph2pipe -[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; -sox=`which sox` -[ ! -x $sox ] && echo "Could not find the sox program at $sph2pipe" && exit 1; - -cat $inscp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s#-f#wav#-p#-c#1#%s#|\n", $1, sph2pipe, $2); - printf("%s-B %s#-f#wav#-p#-c#2#%s#|\n", $1, sph2pipe, $2);}' | \ - sort > $data/wav_ori.scp || exit 1; - -tools/segment.py --segments $segments --input $data/wav_ori.scp --output $data/wav_segments.scp -sed -i 's/ /,/g' $data/wav_segments.scp -sed -i 's/#/ /g' $data/wav_segments.scp - -rm -f $logdir/wav_*.slice -rm -f $logdir/*.log -split --additional-suffix .slice -d -n l/$nj $data/wav_segments.scp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - mkdir -p ${data}/wavs/${name} - cat ${slice} | awk -F ',' -v sox=$sox -v data=`pwd`/$data/wavs/$name \ - -v logdir=$logdir -v name=$name '{ - during=$4-$3 - cmd=$2 sox " - " data "/" $1 ".wav" " trim " $3 " " during; - system(cmd) - printf("%s %s/%s.wav\n", $1, data, $1); - }' | \ - sort > ${data}/wavs_${name}.scp || exit 1; -} & -done -wait -cat ${data}/wavs_*.scp > $outscp -rm ${data}/wavs_*.scp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/spk2utt_to_utt2spk.pl b/models/audio/speech_recognition/conformer/igie/wenet/tools/spk2utt_to_utt2spk.pl deleted file mode 100644 index 19fb89d501146e360912863d847d6eabb0194511..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. 
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-while(<>){
-  @A = split(" ", $_);
-  @A > 1 || die "Invalid line in spk2utt file: $_";
-  $s = shift @A;
-  foreach $u ( @A ) {
-    print "$u $s\n";
-  }
-}
-
-
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/spm_decode b/models/audio/speech_recognition/conformer/igie/wenet/tools/spm_decode
deleted file mode 100644
index 882b4f966013d7708460f8d41696583ae59f8fa9..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/tools/spm_decode
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) Facebook, Inc. and its affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# https://github.com/pytorch/fairseq/blob/master/LICENSE
-
-from __future__ import absolute_import, division, print_function, unicode_literals
-
-import argparse
-import sys
-
-import sentencepiece as spm
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", required=True,
-                        help="sentencepiece model to use for decoding")
-    parser.add_argument("--input", default=None, help="input file to decode")
-    parser.add_argument("--input_format", choices=["piece", "id"], default="piece")
-    args = parser.parse_args()
-
-    sp = spm.SentencePieceProcessor()
-    sp.Load(args.model)
-
-    if args.input_format == "piece":
-        def decode(l):
-            return "".join(sp.DecodePieces(l))
-    elif args.input_format == "id":
-        def decode(l):
-            return "".join(sp.DecodeIds(l))
-    else:
-        raise NotImplementedError
-
-    def tok2int(tok):
-        # remap reference-side (represented as <>) to 0
-        return int(tok) if tok != "<>" else 0
-
-    if args.input is None:
-        h = sys.stdin
-    else:
-        h = open(args.input, "r", encoding="utf-8")
-    for line in h:
-        print(decode(line.split()))
-
-
-if __name__ == "__main__":
-    main()
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/spm_encode b/models/audio/speech_recognition/conformer/igie/wenet/tools/spm_encode
deleted file mode 100644
index 4dd2e1004f9fe393c2d34b43bade881b84a31b1f..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/tools/spm_encode
+++ /dev/null
@@ -1,99 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) Facebook, Inc. and its affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in
-# https://github.com/pytorch/fairseq/blob/master/LICENSE
-
-from __future__ import absolute_import, division, print_function, unicode_literals
-
-import argparse
-import contextlib
-import sys
-
-import sentencepiece as spm
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", required=True,
-                        help="sentencepiece model to use for encoding")
-    parser.add_argument("--inputs", nargs="+", default=['-'],
-                        help="input files to filter/encode")
-    parser.add_argument("--outputs", nargs="+", default=['-'],
-                        help="path to save encoded outputs")
-    parser.add_argument("--output_format", choices=["piece", "id"], default="piece")
-    parser.add_argument("--min-len", type=int, metavar="N",
-                        help="filter sentence pairs with fewer than N tokens")
-    parser.add_argument("--max-len", type=int, metavar="N",
-                        help="filter sentence pairs with more than N tokens")
-    args = parser.parse_args()
-
-    assert len(args.inputs) == len(args.outputs), \
-        "number of input and output paths should match"
-
-    sp = spm.SentencePieceProcessor()
-    sp.Load(args.model)
-
-    if args.output_format == "piece":
-        def encode(l):
-            return sp.EncodeAsPieces(l)
-    elif args.output_format == "id":
-        def encode(l):
-            return list(map(str, sp.EncodeAsIds(l)))
-    else:
-        raise NotImplementedError
-
-    if args.min_len is not None or args.max_len is not None:
-        def valid(line):
-            return (
-                (args.min_len is None or len(line) >= args.min_len) and
-                (args.max_len is None or len(line) <= args.max_len)
-            )
-    else:
-        def valid(lines):
-            return True
-
-    with contextlib.ExitStack() as stack:
-        inputs = [
-            stack.enter_context(open(input, "r", encoding="utf-8"))
-            if input != "-" else sys.stdin
-            for input in args.inputs
-        ]
-        outputs = [
-            stack.enter_context(open(output, "w", encoding="utf-8"))
-            if output != "-" else sys.stdout
-            for output in args.outputs
-        ]
-
-        stats = {
-            "num_empty": 0,
-            "num_filtered": 0,
-        }
-
-        def encode_line(line):
-            line = line.strip()
-            if len(line) > 0:
-                line = encode(line)
-                if valid(line):
-                    return line
-                else:
-                    stats["num_filtered"] += 1
-            else:
-                stats["num_empty"] += 1
-            return None
-
-        for i, lines in enumerate(zip(*inputs), start=1):
-            enc_lines = list(map(encode_line, lines))
-            if not any(enc_line is None for enc_line in enc_lines):
-                for enc_line, output_h in zip(enc_lines, outputs):
-                    print(" ".join(enc_line), file=output_h)
-            if i % 10000 == 0:
-                print("processed {} lines".format(i), file=sys.stderr)
-
-        print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr)
-        print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/spm_train b/models/audio/speech_recognition/conformer/igie/wenet/tools/spm_train
deleted file mode 100644
index 0b247aee0dc5fcaa7b6cf66d89602e896619c9bb..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/tools/spm_train
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) Facebook, Inc. and its affiliates.
-# All rights reserved.
-# -# This source code is licensed under the license found in the -# https://github.com/pytorch/fairseq/blob/master/LICENSE -import sys - -import sentencepiece as spm - - -if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/subset_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/tools/subset_data_dir.sh deleted file mode 100644 index c35bee62d8710facb8c42a9171ed3caf0171450f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/subset_data_dir.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2010-2011 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - - -# This script operates on a data directory, such as in data/train/. -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data -# for what these directories contain. - -# This script creates a subset of that data, consisting of some specified -# number of utterances. (The selected utterances are distributed evenly -# throughout the file, by the program ./subset_scp.pl). - -# There are six options, none compatible with any other. - -# If you give the --per-spk option, it will attempt to select the supplied -# number of utterances for each speaker (typically you would supply a much -# smaller number in this case). - -# If you give the --speakers option, it selects a subset of n randomly -# selected speakers. - -# If you give the --shortest option, it will give you the n shortest utterances. - -# If you give the --first option, it will just give you the n first utterances. - -# If you give the --last option, it will just give you the n last utterances. - -# If you give the --spk-list or --utt-list option, it reads the -# speakers/utterances to keep from /" (note, -# in this case there is no positional parameter; see usage message.) - - -shortest=false -perspk=false -speakers=false -first_opt= -spk_list= -utt_list= - -expect_args=3 -case $1 in - --first|--last) first_opt=$1; shift ;; - --per-spk) perspk=true; shift ;; - --shortest) shortest=true; shift ;; - --speakers) speakers=true; shift ;; - --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; - --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; - --*) echo "$0: invalid option '$1'"; exit 1 -esac - -if [ $# != $expect_args ]; then - echo "Usage:" - echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] " - echo " subset_data_dir.sh [--spk-list ] " - echo " subset_data_dir.sh [--utt-list ] " - echo "By default, randomly selects utterances from the data directory." - echo "With --speakers, randomly selects enough speakers that we have utterances" - echo "With --per-spk, selects utterances per speaker, if available." - echo "With --first, selects the first utterances" - echo "With --last, selects the last utterances" - echo "With --shortest, selects the shortest utterances." - echo "With --spk-list, reads the speakers to keep from " - echo "With --utt-list, reads the utterances to keep from " - exit 1; -fi - -srcdir=$1 -if [[ $spk_list || $utt_list ]]; then - numutt= - destdir=$2 -else - numutt=$2 - destdir=$3 -fi - -export LC_ALL=C - -if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" - exit 1 -fi - -if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then - echo "$0: cannot subset to more utterances than you originally had." - exit 1 -fi - -if $shortest && [ ! 
-f $srcdir/feats.scp ]; then - echo "$0: you selected --shortest but no feats.scp exist." - exit 1 -fi - -mkdir -p $destdir || exit 1 - -if [[ $spk_list ]]; then - tools/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; -elif [[ $utt_list ]]; then - tools/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; -elif $speakers; then - tools/shuffle_list.pl < $srcdir/spk2utt | - awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | - sort > $destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -elif $perspk; then - awk '{ n='$numutt'; printf("%s ",$1); - skip=1; while(n*(skip+1) <= NF-1) { skip++; } - for(x=2; x<=NF && x <= (n*skip+1); x += skip) { printf("%s ", $x); } - printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt - tools/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk -else - if $shortest; then - # Select $numutt shortest utterances. - . ./path.sh - feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; - sort -n -k2 $destdir/tmp.len | - awk '{print $1}' | - head -$numutt >$destdir/tmp.uttlist - tools/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk - rm $destdir/tmp.uttlist $destdir/tmp.len - else - # Select $numutt random utterances. - tools/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; - fi - tools/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt -fi - -# Perform filtering. utt2spk and spk2utt files already exist by this point. -# Filter by utterance. -[ -f $srcdir/feats.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp -[ -f $srcdir/vad.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp -[ -f $srcdir/utt2lang ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang -[ -f $srcdir/utt2dur ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur -[ -f $srcdir/utt2num_frames ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames -[ -f $srcdir/utt2uniq ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq -[ -f $srcdir/wav.scp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp -[ -f $srcdir/utt2warp ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp -[ -f $srcdir/text ] && - tools/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - -# Filter by speaker. -[ -f $srcdir/spk2warp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp -[ -f $srcdir/spk2gender ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender -[ -f $srcdir/cmvn.scp ] && - tools/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - -# Filter by recording-id. -if [ -f $srcdir/segments ]; then - tools/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - # Recording-ids are in segments. - awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco - # The next line overrides the command above for wav.scp, which would be incorrect. - #[ -f $srcdir/wav.scp ] && - # tools/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp -else - # No segments; recording-ids are in wav.scp. 
- awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco -fi - -[ -f $srcdir/reco2file_and_channel ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel -[ -f $srcdir/reco2dur ] && - tools/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur - -# Filter the STM file for proper sclite scoring. -# Copy over the comments from STM file. -[ -f $srcdir/stm ] && - (grep "^;;" $srcdir/stm - tools/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm - -rm $destdir/reco - -# Copy frame_shift if present. -[ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir - -srcutts=$(wc -l <$srcdir/utt2spk) -destutts=$(wc -l <$destdir/utt2spk) -echo "$0: reducing #utt from $srcutts to $destutts" -exit 0 diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/subset_scp.pl b/models/audio/speech_recognition/conformer/igie/wenet/tools/subset_scp.pl deleted file mode 100644 index 11fddc09a0f4e5fad8e5d63cf65e7e5e627e4af6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/subset_scp.pl +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This program selects a subset of N elements in the scp. - -# By default, it selects them evenly from throughout the scp, in order to avoid -# selecting too many from the same speaker. It prints them on the standard -# output. -# With the option --first, it just selects the N first utterances. -# With the option --last, it just selects the N last utterances. - -# Last modified by JHU & HKUST @2013 - - -$quiet = 0; -$first = 0; -$last = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--quiet") { - shift; - $quiet = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--first") { - shift; - $first = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--last") { - shift; - $last = 1; -} - -if(@ARGV < 2 ) { - die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . - " --quiet causes it to not die if N < num lines in scp.\n" . - " --first and --last make it equivalent to head or tail.\n" . 
- "See also: filter_scp.pl\n"; -} - -$N = shift @ARGV; -if($N == 0) { - die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; -} -$inscp = shift @ARGV; -open(I, "<$inscp") || die "Opening input scp file $inscp"; - -@F = (); -while() { - push @F, $_; -} -$numlines = @F; -if($N > $numlines) { - if ($quiet) { - $N = $numlines; - } else { - die "You requested from subset_scp.pl more elements than available: $N > $numlines"; - } -} - -sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if ($num_needed > $diff) { - die "select_n: code error"; - } - if ($diff == 1 ) { - if ($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); - } -} - -if ( ! $first && ! $last) { - if ($N > 0) { - select_n(0, $numlines, $N); - } -} else { - if ($first) { # --first option: same as head. - for ($n = 0; $n < $N; $n++) { - print $F[$n]; - } - } else { # --last option: same as tail. - for ($n = @F - $N; $n < @F; $n++) { - print $F[$n]; - } - } -} diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/sym2int.pl b/models/audio/speech_recognition/conformer/igie/wenet/tools/sym2int.pl deleted file mode 100644 index cec097b6bdaefb5c3452e31fa334f0a7530b9a72..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/sym2int.pl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; - -for($x = 0; $x < 2; $x++) { - if ($ARGV[0] eq "--map-oov") { - shift @ARGV; - $map_oov = shift @ARGV; - if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { - # disallow '-f', the empty string and anything ending in words.txt as the - # OOV symbol because these are likely command-line errors. - die "the --map-oov option requires an argument"; - } - } - if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } - } -} - -$symtab = shift @ARGV; -if (!defined $symtab) { - print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . - "options: [--map-oov ] [-f ]\n" . 
- "note: can look like 4-5, or 4-, or 5-, or 1.\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up - if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } - $map_oov = $sym2int{$map_oov}; -} - -$num_warning = 0; -$max_warning = 20; - -while (<>) { - @A = split(" ", $_); - @B = (); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ( (!defined $field_begin || $n >= $field_begin) - && (!defined $field_end || $n <= $field_end)) { - $i = $sym2int{$a}; - if (!defined ($i)) { - if (defined $map_oov) { - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $map_oov; - } else { - $pos = $n+1; - die "sym2int.pl: undefined symbol $a (in position $pos)\n"; - } - } - $a = $i; - } - push @B, $a; - } - print join(" ", @B); - print "\n"; -} -if ($num_warning > 0) { - print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; -} - -exit(0); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/text2token.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/text2token.py deleted file mode 100644 index 4f4dcc901d436650695f0b80e0cf99e1e99269ee..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/text2token.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan) -# Copyright 2021 Mobvoi Inc. All Rights Reserved. (Di Wu) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import codecs -import re -import sys - -is_python2 = sys.version_info[0] == 2 - - -def exist_or_not(i, match_pos): - start_pos = None - end_pos = None - for pos in match_pos: - if pos[0] <= i < pos[1]: - start_pos = pos[0] - end_pos = pos[1] - break - - return start_pos, end_pos - -def seg_char(sent): - pattern = re.compile(r'([\u4e00-\u9fa5])') - chars = pattern.split(sent) - chars = [w for w in chars if len(w.strip()) > 0] - return chars - -def get_parser(): - parser = argparse.ArgumentParser( - description='convert raw text to tokenized text', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--nchar', - '-n', - default=1, - type=int, - help='number of characters to split, i.e., \ - aabb -> a a b b with -n 1 and aa bb with -n 2') - parser.add_argument('--skip-ncols', - '-s', - default=0, - type=int, - help='skip first n columns') - parser.add_argument('--space', - default='', - type=str, - help='space symbol') - parser.add_argument('--bpe-model', - '-m', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--non-lang-syms', - '-l', - default=None, - type=str, - help='list of non-linguistic symobles,' - ' e.g., etc.') - parser.add_argument('text', - type=str, - default=False, - nargs='?', - help='input text') - parser.add_argument('--trans_type', - '-t', - type=str, - default="char", - choices=["char", "phn", "cn_char_en_bpe"], - help="""Transcript type. char/phn. 
e.g., for TIMIT - FADG0_SI1279 - - If trans_type is char, read from - SI1279.WRD file -> "bricks are an alternative" - Else if trans_type is phn, - read from SI1279.PHN file -> - "sil b r ih sil k s aa r er n aa l - sil t er n ih sil t ih v sil" """) - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - rs = [] - if args.non_lang_syms is not None: - with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f: - nls = [x.rstrip() for x in f.readlines()] - rs = [re.compile(re.escape(x)) for x in nls] - - if args.bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(args.bpe_model) - - if args.text: - f = codecs.open(args.text, encoding="utf-8") - else: - f = codecs.getreader("utf-8")( - sys.stdin if is_python2 else sys.stdin.buffer) - - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer) - line = f.readline() - n = args.nchar - while line: - x = line.split() - print(' '.join(x[:args.skip_ncols]), end=" ") - a = ' '.join(x[args.skip_ncols:]) - - # get all matched positions - match_pos = [] - for r in rs: - i = 0 - while i >= 0: - m = r.search(a, i) - if m: - match_pos.append([m.start(), m.end()]) - i = m.end() - else: - break - - if len(match_pos) > 0: - chars = [] - i = 0 - while i < len(a): - start_pos, end_pos = exist_or_not(i, match_pos) - if start_pos is not None: - chars.append(a[start_pos:end_pos]) - i = end_pos - else: - chars.append(a[i]) - i += 1 - a = chars - - if (args.trans_type == "phn"): - a = a.split(" ") - elif args.trans_type == "cn_char_en_bpe": - b = seg_char(a) - a = [] - for j in b: - # we use "▁" to instead of blanks among english words - # warning: here is "▁", not "_" - for l in j.strip().split("▁"): - if not l.encode('UTF-8').isalpha(): - a.append(l) - else: - for k in sp.encode_as_pieces(l): - a.append(k) - else: - a = [a[j:j + n] for j in range(0, len(a), n)] - - a_flat = [] - for z in a: - a_flat.append("".join(z)) - - a_chars = [z.replace(' ', args.space) for z in a_flat] - if (args.trans_type == "phn"): - a_chars = [z.replace("sil", args.space) for z in a_chars] - print(' '.join(a_chars)) - line = f.readline() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/utt2spk_to_spk2utt.pl b/models/audio/speech_recognition/conformer/igie/wenet/tools/utt2spk_to_spk2utt.pl deleted file mode 100644 index 5086699ff85fdcb8667bb9ab054700c53e35fd0c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. 
-
-if ( @ARGV > 1 ) {
-  die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt";
-}
-
-while(<>){
-  @A = split(" ", $_);
-  @A == 2 || die "Invalid line in utt2spk file: $_";
-  ($u,$s) = @A;
-  if(!$seen_spk{$s}) {
-    $seen_spk{$s} = 1;
-    push @spklist, $s;
-  }
-  push (@{$spk_hash{$s}}, "$u");
-}
-foreach $s (@spklist) {
-  $l = join(' ',@{$spk_hash{$s}});
-  print "$s $l\n";
-}
diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/validate_data_dir.sh b/models/audio/speech_recognition/conformer/igie/wenet/tools/validate_data_dir.sh
deleted file mode 100644
index f4b4cbe1410111555d56380078e3d55381e7155a..0000000000000000000000000000000000000000
--- a/models/audio/speech_recognition/conformer/igie/wenet/tools/validate_data_dir.sh
+++ /dev/null
@@ -1,383 +0,0 @@
-#!/bin/bash
-
-cmd="$@"
-
-no_feats=false
-no_wav=false
-no_text=false
-no_spk_sort=false
-
-for x in `seq 4`; do
-  if [ "$1" == "--no-feats" ]; then
-    no_feats=true
-    shift;
-  fi
-  if [ "$1" == "--no-text" ]; then
-    no_text=true
-    shift;
-  fi
-  if [ "$1" == "--no-wav" ]; then
-    no_wav=true
-    shift;
-  fi
-  if [ "$1" == "--no-spk-sort" ]; then
-    no_spk_sort=true
-    shift;
-  fi
-done
-
-if [ $# -ne 1 ]; then
-  echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] "
-  echo "The --no-xxx options mean that the script does not require "
-  echo "xxx.scp to be present, but it will check it if it is present."
-  echo "--no-spk-sort means that the script does not require the utt2spk to be "
-  echo "sorted by the speaker-id in addition to being sorted by utterance-id."
-  echo "By default, utt2spk is expected to be sorted by both, which can be "
-  echo "achieved by making the speaker-id prefixes of the utterance-ids"
-  echo "e.g.: $0 data/train"
-  exit 1;
-fi
-
-data=$1
-
-if [ ! -d $data ]; then
-  echo "$0: no such directory $data"
-  exit 1;
-fi
-
-if [ -f $data/images.scp ]; then
-  cmd=${cmd/--no-wav/} # remove --no-wav if supplied
-  image/validate_data_dir.sh $cmd
-  exit $?
-fi
-
-for f in spk2utt utt2spk; do
-  if [ ! -f $data/$f ]; then
-    echo "$0: no such file $f"
-    exit 1;
-  fi
-  if [ ! -s $data/$f ]; then
-    echo "$0: empty file $f"
-    exit 1;
-  fi
-done
-
-! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \
-  echo "$0: $data/utt2spk has wrong format." && exit;
-
-ns=$(wc -l < $data/spk2utt)
-if [ "$ns" == 1 ]; then
-  echo "$0: WARNING: you have only one speaker. This probably a bad idea."
-  echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html"
-  echo " for more information."
-fi
-
-
-tmpdir=$(mktemp -d /tmp/kaldi.XXXX);
-trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM
-
-export LC_ALL=C
-
-function check_sorted_and_uniq {
-  ! perl -ne '((substr $_,-1) eq "\n") or die "file $ARGV has invalid newline";' $1 && exit 1;
-  ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \
-    echo "$0: file $1 is not in sorted order or has duplicates" && exit 1;
-}
-
-function partial_diff {
-  diff -U1 $1 $2 | (head -n 6; echo "..."; tail -n 6)
-  n1=`cat $1 | wc -l`
-  n2=`cat $2 | wc -l`
-  echo "[Lengths are $1=$n1 versus $2=$n2]"
-}
-
-check_sorted_and_uniq $data/utt2spk
-
-if ! $no_spk_sort; then
-  ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \
-    echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \
-    echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1;
-fi
-
-check_sorted_and_uniq $data/spk2utt
-
-!
cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(tools/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - tools/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." - exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! 
cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! $no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! 
cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line utt2dur:" NR ":" $0; exit(1) }}' || exit 1 -fi - -if [ -f $data/utt2num_frames ]; then - check_sorted_and_uniq $data/utt2num_frames - cat $data/utt2num_frames | awk '{print $1}' > $tmpdir/utts.utt2num_frames - if ! cmp -s $tmpdir/utts{,.utt2num_frames}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2num_frames file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2num_frames} - exit 1 - fi - awk <$data/utt2num_frames '{ - if (NF != 2 || !($2 > 0) || $2 != int($2)) { - print "Bad line utt2num_frames:" NR ":" $0 - exit 1 } }' || exit 1 -fi - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/validate_dict_dir.pl b/models/audio/speech_recognition/conformer/igie/wenet/tools/validate_dict_dir.pl deleted file mode 100644 index 819fca7f03caff91f3f24f0b69876a0bfc0abbe9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/validate_dict_dir.pl +++ /dev/null @@ -1,531 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. 
-# Copyright 2012 Guoguo Chen -# 2015 Daniel Povey -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for 'dict' directories (e.g. data/local/dict) - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line($.) $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. $i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n"; - if ($has_invalid_whitespaces) { - print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n"; - return 0; - } else { - print "--> text contains only allowed whitespaces\n"; - } - } else { - print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n"; - } - return 1; -} - - -if(@ARGV != 1) { - die "Usage: validate_dict_dir.pl \n" . - "e.g.: validate_dict_dir.pl data/local/dict\n"; -} - -$dict = shift @ARGV; -$dict =~ s:/$::; - -$exit = 0; -$success = 1; # this is re-set each time we read a file. 
- -sub set_to_fail { $exit = 1; $success = 0; } - -# Checking silence_phones.txt ------------------------------- -print "Checking $dict/silence_phones.txt ...\n"; -if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} -$idx = 1; -%silence = (); -$crlf = 1; - -print "--> reading $dict/silence_phones.txt\n"; -check_allowed_whitespace(\*S) || set_to_fail(); -while() { - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($silence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; - } else { - $silence{$p} = 1; - } - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. - if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(S); -$success == 0 || print "--> $dict/silence_phones.txt is OK\n"; -print "\n"; - -# Checking optional_silence.txt ------------------------------- -print "Checking $dict/optional_silence.txt ...\n"; -if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;} -if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;} -$idx = 1; -$success = 1; -$crlf = 1; -print "--> reading $dict/optional_silence.txt\n"; -check_allowed_whitespace(\*OS) or exit 1; -while() { - chomp; - my @col = split(" ", $_); - if ($idx > 1 or @col > 1) { - set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; - } elsif (!$silence{$col[0]}) { - set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - $idx ++; -} -close(OS); -$success == 0 || print "--> $dict/optional_silence.txt is OK\n"; -print "\n"; - -# Checking nonsilence_phones.txt ------------------------------- -print "Checking $dict/nonsilence_phones.txt ...\n"; -if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;} -$idx = 1; -%nonsilence = (); -$success = 1; -$crlf = 1; -print "--> reading $dict/nonsilence_phones.txt\n"; -check_allowed_whitespace(\*NS) or set_to_fail(); -while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! 
s/\n$//) { - print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($nonsilence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; - } else { - $nonsilence{$p} = 1; - } - # phones that start with the pound sign/hash may be mistaken for - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. - if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(NS); -$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n"; -print "\n"; - -# Checking disjoint ------------------------------- -sub intersect { - my ($a, $b) = @_; - @itset = (); - %itset = (); - foreach(keys %$a) { - if(exists $b->{$_} and !$itset{$_}) { - push(@itset, $_); - $itset{$_} = 1; - } - } - return @itset; -} - -print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n"; -@itset = intersect(\%silence, \%nonsilence); -if(@itset == 0) {print "--> disjoint property is OK.\n";} -else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";} -print "\n"; - - -sub check_lexicon { - my ($lex, $num_prob_cols, $num_skipped_cols) = @_; - print "Checking $lex\n"; - !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail(); - my %seen_line = {}; - $idx = 1; $success = 1; $crlf = 1; - print "--> reading $lex\n"; - check_allowed_whitespace(\*L) or set_to_fail(); - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $lex contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (defined $seen_line{$_}) { - print "--> ERROR: line '$_' of $lex is repeated\n"; - set_to_fail(); - } - $seen_line{$_} = 1; - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $lex does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - $word = shift @col; - if (!defined $word) { - print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail(); - } - if ($word eq "" || $word eq "" || $word eq "" || $word eq "#0") { - print "--> ERROR: lexicon.txt contains forbidden word $word\n"; - set_to_fail(); - } - for ($n = 0; $n < $num_prob_cols; $n++) { - $prob = shift @col; - if (!($prob > 0.0 && $prob <= 1.0)) { - print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n"; - set_to_fail(); - } - } - for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; } - if (@col == 0) { - print "--> ERROR: lexicon.txt contains word $word with empty "; - print "pronunciation.\n"; - set_to_fail(); - } - foreach (0 .. @col-1) { - if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt "; - print "(line $idx)\n"; - set_to_fail(); - } - } - $idx ++; - } - close(L); - $success == 0 || print "--> $lex is OK\n"; - print "\n"; -} - -if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); } -if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); } -if (-f "$dict/lexiconp_silprob.txt") { - # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also - # exist. 
- check_lexicon("$dict/lexiconp_silprob.txt", 2, 2); - if (-f "$dict/silprob.txt") { - !open(SP, "<$dict/silprob.txt") && - print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail(); - $crlf = 1; - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - chomp; my @col = split; - @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail(); - if ($col[0] eq "" || $col[0] eq "overall") { - if (!($col[1] > 0.0 && $col[1] <= 1.0)) { - set_to_fail(); - print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n"; - } - } elsif ($col[0] eq "_s" || $col[0] eq "_n") { - if ($col[1] <= 0.0) { - set_to_fail(); - print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n"; - } - } else { - print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n"; - set_to_fail(); - } - } - close(SP); - } else { - set_to_fail(); - print "--> ERROR: expecting $dict/silprob.txt to exist\n"; - } -} - -if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) { - print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n"; - set_to_fail(); -} - -sub check_lexicon_pair { - my ($lex1, $num_prob_cols1, $num_skipped_cols1, - $lex2, $num_prob_cols2, $num_skipped_cols2) = @_; - # We have checked individual lexicons already. - open(L1, "<$lex1"); open(L2, "<$lex2"); - print "Checking lexicon pair $lex1 and $lex2\n"; - my $line_num = 0; - while() { - $line_num++; - @A = split; - $line_B = ; - if (!defined $line_B) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); last; - } - @B = split(" ", $line_B); - # Check if the word matches. - if ($A[0] ne $B[0]) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - shift @A; shift @B; - for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; } - for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; } - # Check if the pronunciation matches - if (join(" ", @A) ne join(" ", @B)) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - } - $line_B = ; - if (defined $line_B && $exit == 0) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); - } - $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n"; -} - -# If more than one lexicon exist, we have to check if they correspond to each -# other. It could be that the user overwrote one and we need to regenerate the -# other, but we do not know which is which. -if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") { - check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0); -} -if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") { - check_lexicon_pair("$dict/lexiconp.txt", 1, 0, - "$dict/lexiconp_silprob.txt", 2, 2); -} - -# Checking extra_questions.txt ------------------------------- -%distinguished = (); # Keep track of all phone-pairs including nonsilence that - # are distinguished (split apart) by extra_questions.txt, - # as $distinguished{$p1,$p2} = 1. This will be used to - # make sure that we don't have pairs of phones on the same - # line in nonsilence_phones.txt that can never be - # distinguished from each other by questions. 
(If any two
- # phones appear on the same line in nonsilence_phones.txt,
- # they share a tree root, and since the automatic
- # question-building treats all phones that appear on the
- # same line of nonsilence_phones.txt as being in the same
- # group, we can never distinguish them without resorting to
- # questions in extra_questions.txt.
-print "Checking $dict/extra_questions.txt ...\n";
-if (-s "$dict/extra_questions.txt") {
- if (!open(EX, "<$dict/extra_questions.txt")) {
- set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n";
- }
- $idx = 1;
- $success = 1;
- $crlf = 1;
- print "--> reading $dict/extra_questions.txt\n";
- check_allowed_whitespace(\*EX) or set_to_fail();
- while(<EX>) {
- if ($crlf == 1 && m/\r/) {
- print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n";
- set_to_fail();
- $crlf = 0;
- }
- if (! s/\n$//) {
- print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n";
- set_to_fail();
- }
- my @col = split(" ", $_);
- if (@col == 0) {
- set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n";
- }
- foreach (0 .. @col-1) {
- if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
- set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n";
- }
- $idx ++;
- }
- %col_hash = ();
- foreach $p (@col) { $col_hash{$p} = 1; }
- foreach $p1 (@col) {
- # Update %distinguished hash.
- foreach $p2 (keys %nonsilence) {
- if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not
- # in this question (and in nonsilence
- # phones)... mark p1,p2 as being split apart
- $distinguished{$p1,$p2} = 1;
- $distinguished{$p2,$p1} = 1;
- }
- }
- }
- }
- close(EX);
- $success == 0 || print "--> $dict/extra_questions.txt is OK\n";
-} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";}
-
-if (-f "$dict/nonterminals.txt") {
- open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt";
- my %nonterminals = ();
- my $line_number = 1;
- while (<NT>) {
- chop;
- my @line = split(" ", $_);
- if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) {
- print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1;
- }
- $nonterminals{$line[0]} = 1;
- $line_number++;
- }
- print "--> $dict/nonterminals.txt is OK\n";
-}
-
-
-# check nonsilence_phones.txt again for phone-pairs that are never
-# distnguishable. (note: this situation is normal and expected for silence
-# phones, so we don't check it.)
-if(!open(NS, "<$dict/nonsilence_phones.txt")) {
- print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1;
-}
-
-$num_warn_nosplit = 0;
-$num_warn_nosplit_limit = 10;
-while(<NS>) {
- my @col = split(" ", $_);
- foreach $p1 (@col) {
- foreach $p2 (@col) {
- if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) {
- set_to_fail();
- if ($num_warn_nosplit <= $num_warn_nosplit_limit) {
- print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n";
- }
- if ($num_warn_nosplit == $num_warn_nosplit_limit) {
- print "... Not warning any more times about this issue.\n";
- }
- if ($num_warn_nosplit == 0) {
- print " (note: we started checking for this only recently. 
You can still build a system but\n"; - print " phones $p1 and $p2 will be acoustically indistinguishable).\n"; - } - $num_warn_nosplit++; - } - } - } -} - - -if ($exit == 1) { - print "--> ERROR validating dictionary directory $dict (see detailed error "; - print "messages above)\n\n"; - exit 1; -} else { - print "--> SUCCESS [validating dictionary directory $dict]\n\n"; -} - -exit 0; diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/validate_text.pl b/models/audio/speech_recognition/conformer/igie/wenet/tools/validate_text.pl deleted file mode 100644 index 7f75cf12f20f6e22948682e8e726e628a72dac69..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/validate_text.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env perl -# -#=============================================================================== -# Copyright 2017 Johns Hopkins University (author: Yenda Trmal ) -# Johns Hopkins University (author: Daniel Povey) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# validation script for data//text -# to be called (preferably) from utils/validate_data_dir.sh -use strict; -use warnings; -use utf8; -use Fcntl qw< SEEK_SET >; - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - if ((substr $current_line, -1) ne "\n"){ - print STDERR "$0: The current line (nr. 
$i) has invalid newline\n"; - return 1; - } - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl \n" . - "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/wav2dur.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/wav2dur.py deleted file mode 100644 index 1bcc1b693458b66c0e341e5d6b375cc81e6db8b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/wav2dur.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -import torchaudio -torchaudio.set_audio_backend("sox_io") - -scp = sys.argv[1] -dur_scp = sys.argv[2] - -with open(scp, 'r') as f, open(dur_scp, 'w') as fout: - cnt = 0 - total_duration = 0 - for l in f: - items = l.strip().split() - wav_id = items[0] - fname = items[1] - cnt += 1 - waveform, rate = torchaudio.load(fname) - frames = len(waveform[0]) - duration = frames / float(rate) - total_duration += duration - fout.write('{} {}\n'.format(wav_id, duration)) - print('process {} utts'.format(cnt)) - print('total {} s'.format(total_duration)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/wav_to_duration.sh b/models/audio/speech_recognition/conformer/igie/wenet/tools/wav_to_duration.sh deleted file mode 100644 index 51b055c633ac809b6b8d702925dc47875973403d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/wav_to_duration.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# split the wav scp, calculate duration and merge -nj=4 -. 
tools/parse_options.sh || exit 1; - -inscp=$1 -outscp=$2 -data=$(dirname ${inscp}) -if [ $# -eq 3 ]; then - logdir=$3 -else - logdir=${data}/log -fi -mkdir -p ${logdir} - -rm -f $logdir/wav_*.slice -rm -f $logdir/wav_*.shape -split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_ - -for slice in `ls $logdir/wav_*.slice`; do -{ - name=`basename -s .slice $slice` - tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log -} & -done -wait -cat $logdir/wav_*.shape > $outscp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/tools/websocket/performance-ws.py b/models/audio/speech_recognition/conformer/igie/wenet/tools/websocket/performance-ws.py deleted file mode 100644 index af77dea06bb41297b674b5b6dbfd0266bcff5d53..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/tools/websocket/performance-ws.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -# coding:utf-8 - -# Copyright (c) 2022 SDCI Co. Ltd (author: veelion) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import json -import time -import asyncio -import argparse -import websockets -import soundfile as sf -import statistics - - -WS_START = json.dumps({ - 'signal': 'start', - 'nbest': 1, - 'continuous_decoding': False, -}) -WS_END = json.dumps({ - 'signal': 'end' -}) - - -async def ws_rec(data, ws_uri): - begin = time.time() - conn = await websockets.connect(ws_uri, ping_timeout=200) - # step 1: send start - await conn.send(WS_START) - ret = await conn.recv() - # step 2: send audio data - await conn.send(data) - # step 3: send end - await conn.send(WS_END) - # step 4: receive result - texts = [] - while 1: - ret = await conn.recv() - ret = json.loads(ret) - if ret['type'] == 'final_result': - nbest = json.loads(ret['nbest']) - text = nbest[0]['sentence'] - texts.append(text) - elif ret['type'] == 'speech_end': - break - # step 5: close - try: - await conn.close() - except Exception as e: - # this except has no effect, just log as debug - # it seems the server does not send close info, maybe - print(e) - time_cost = time.time() - begin - return { - 'text': ''.join(texts), - 'time': time_cost, - } - - -def get_args(): - parser = argparse.ArgumentParser(description='') - parser.add_argument( - '-u', '--ws_uri', required=True, - help="websocket_server_main's uri, e.g. 
ws://127.0.0.1:10086") - parser.add_argument( - '-w', '--wav_scp', required=True, - help='path to wav_scp_file') - parser.add_argument( - '-t', '--trans', required=True, - help='path to trans_text_file of wavs') - parser.add_argument( - '-s', '--save_to', required=True, - help='path to save transcription') - parser.add_argument( - '-n', '--num_concurrence', type=int, required=True, - help='num of concurrence for query') - args = parser.parse_args() - return args - - -def print_result(info): - length = max([len(k) for k in info]) - for k, v in info.items(): - print(f'\t{k: >{length}} : {v}') - - -async def main(args): - wav_scp = [] - total_duration = 0 - with open(args.wav_scp) as f: - for line in f: - zz = line.strip().split() - assert len(zz) == 2 - data, sr = sf.read(zz[1], dtype='int16') - assert sr == 16000 - duration = (len(data)) / 16000 - total_duration += duration - wav_scp.append((zz[0], data.tobytes())) - print(f'{len(wav_scp) = }, {total_duration = }') - - tasks = [] - failed = 0 - texts = [] - request_times = [] - begin = time.time() - for i, (_uttid, data) in enumerate(wav_scp): - task = asyncio.create_task(ws_rec(data, args.ws_uri)) - tasks.append((_uttid, task)) - if len(tasks) < args.num_concurrence: - continue - print((f'{i=}, start {args.num_concurrence} ' - f'queries @ {time.strftime("%m-%d %H:%M:%S")}')) - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - tasks = [] - print(f'\tdone @ {time.strftime("%m-%d %H:%M:%S")}') - if tasks: - for uttid, task in tasks: - result = await task - texts.append(f'{uttid}\t{result["text"]}\n') - request_times.append(result['time']) - request_time = time.time() - begin - rtf = request_time / total_duration - print('For all concurrence:') - print_result({ - 'failed': failed, - 'total_duration': total_duration, - 'request_time': request_time, - 'RTF': rtf, - }) - print('For one request:') - print_result({ - 'mean': statistics.mean(request_times), - 'median': statistics.median(request_times), - 'max_time': max(request_times), - 'min_time': min(request_times), - }) - with open(args.save_to, 'w', encoding='utf8') as fsave: - fsave.write(''.join(texts)) - # caculate CER - cmd = (f'python ../compute-wer.py --char=1 --v=1 ' - f'{args.trans} {args.save_to} > ' - f'{args.save_to}-test-{args.num_concurrence}.cer.txt') - print(cmd) - os.system(cmd) - print('done') - - -if __name__ == '__main__': - args = get_args() - asyncio.run(main(args)) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/alignment.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/alignment.py deleted file mode 100644 index 071691183e5af227e60fe06e4f8d4bf0f33b7f71..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/alignment.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Di Wu) -# 2022 Tinnove Inc (authors: Wei Ren) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader -from textgrid import TextGrid, IntervalTier - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.ctc_util import forced_align -from wenet.utils.common import get_subsample -from wenet.utils.init_model import init_model - - -def generator_textgrid(maxtime, lines, output): - # Download Praat: https://www.fon.hum.uva.nl/praat/ - interval = maxtime / (len(lines) + 1) - margin = 0.0001 - - tg = TextGrid(maxTime=maxtime) - linetier = IntervalTier(name="line", maxTime=maxtime) - - i = 0 - for l in lines: - s, e, w = l.split() - linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w) - - tg.append(linetier) - print("successfully generator {}".format(output)) - tg.write(output) - - -def get_frames_timestamp(alignment): - # convert alignment to a praat format, which is a doing phonetics - # by computer and helps analyzing alignment - timestamp = [] - # get frames level duration for each token - start = 0 - end = 0 - while end < len(alignment): - while end < len(alignment) and alignment[end] == 0: - end += 1 - if end == len(alignment): - timestamp[-1] += alignment[start:] - break - end += 1 - while end < len(alignment) and alignment[end - 1] == alignment[end]: - end += 1 - timestamp.append(alignment[start:end]) - start = end - return timestamp - - -def get_labformat(timestamp, subsample): - begin = 0 - duration = 0 - labformat = [] - for idx, t in enumerate(timestamp): - # 25ms frame_length,10ms hop_length, 1/subsample - subsample = get_subsample(configs) - # time duration - duration = len(t) * 0.01 * subsample - if idx < len(timestamp) - 1: - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[t[-1]])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[t[-1]])) - else: - non_blank = 0 - for i in t: - if i != 0: - token = i - break - print("{:.2f} {:.2f} {}".format(begin, begin + duration, - char_dict[token])) - labformat.append("{:.2f} {:.2f} {}\n".format( - begin, begin + duration, char_dict[token])) - begin = begin + duration - return labformat - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='use ctc to generate alignment') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--input_file', required=True, help='format data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--non_lang_syms', - help="non-linguistic symbol file. 
One symbol per line.") - parser.add_argument('--result_file', - required=True, - help='alignment result file') - parser.add_argument('--batch_size', type=int, default=1, help='batch size') - parser.add_argument('--gen_praat', - action='store_true', - help='convert alignment to a praat format') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - - args = parser.parse_args() - print(args) - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.batch_size > 1: - logging.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - # Load dict - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - eos = len(char_dict) - 1 - - symbol_table = read_symbol_table(args.dict) - - # Init dataset and data loader - ali_conf = copy.deepcopy(configs['dataset_conf']) - - ali_conf['filter_conf']['max_length'] = 102400 - ali_conf['filter_conf']['min_length'] = 0 - ali_conf['filter_conf']['token_max_length'] = 102400 - ali_conf['filter_conf']['token_min_length'] = 0 - ali_conf['filter_conf']['max_output_input_ratio'] = 102400 - ali_conf['filter_conf']['min_output_input_ratio'] = 0 - ali_conf['speed_perturb'] = False - ali_conf['spec_aug'] = False - ali_conf['shuffle'] = False - ali_conf['sort'] = False - ali_conf['fbank_conf']['dither'] = 0.0 - ali_conf['batch_conf']['batch_type'] = "static" - ali_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - ali_dataset = Dataset(args.data_type, - args.input_file, - symbol_table, - ali_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w', - encoding='utf-8') as fout: - for batch_idx, batch in enumerate(ali_data_loader): - print("#" * 80) - key, feat, target, feats_length, target_length = batch - print(key) - - feat = feat.to(device) - target = target.to(device) - feats_length = feats_length.to(device) - target_length = target_length.to(device) - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - # print(ctc_probs.size(1)) - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = forced_align(ctc_probs, target) - print(alignment) - fout.write('{} {}\n'.format(key[0], alignment)) - - if args.gen_praat: - timestamp = get_frames_timestamp(alignment) - print(timestamp) - subsample = get_subsample(configs) - labformat = get_labformat(timestamp, subsample) - - lab_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".lab") - with open(lab_path, 'w', encoding='utf-8') as f: - f.writelines(labformat) - - textgrid_path = os.path.join(os.path.dirname(args.result_file), - key[0] + ".TextGrid") - generator_textgrid(maxtime=(len(alignment) + 1) * 0.01 * - subsample, - lines=labformat, - output=textgrid_path) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/average_model.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/average_model.py deleted file mode 100644 index 01efa64b4b458bc931a86a9a304b9f330ce4aaa2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/average_model.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import os -import argparse -import glob - -import yaml -import numpy as np -import torch - - -def get_args(): - parser = argparse.ArgumentParser(description='average model') - parser.add_argument('--dst_model', required=True, help='averaged model') - parser.add_argument('--src_path', - required=True, - help='src model path for average') - parser.add_argument('--val_best', - action="store_true", - help='averaged model') - parser.add_argument('--num', - default=5, - type=int, - help='nums for averaged model') - parser.add_argument('--min_epoch', - default=0, - type=int, - help='min epoch used for averaging model') - parser.add_argument('--max_epoch', - default=65536, - type=int, - help='max epoch used for averaging model') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - checkpoints = [] - val_scores = [] - if args.val_best: - yamls = glob.glob('{}/[!train]*.yaml'.format(args.src_path)) - for y in yamls: - with open(y, 'r') as f: - dic_yaml = yaml.load(f, Loader=yaml.FullLoader) - loss = dic_yaml['cv_loss'] - epoch = dic_yaml['epoch'] - if epoch >= args.min_epoch and epoch <= args.max_epoch: - val_scores += [[epoch, loss]] - val_scores = np.array(val_scores) - sort_idx = np.argsort(val_scores[:, -1]) - sorted_val_scores = val_scores[sort_idx][::1] - print("best val scores = " + str(sorted_val_scores[:args.num, 1])) - print("selected epochs = " + - str(sorted_val_scores[:args.num, 0].astype(np.int64))) - path_list = [ - args.src_path + '/{}.pt'.format(int(epoch)) - for epoch in sorted_val_scores[:args.num, 0] - ] - else: - path_list = glob.glob('{}/[0-9]*.pt'.format(args.src_path)) - path_list = sorted(path_list, key=os.path.getmtime) - path_list = path_list[-args.num:] - print(path_list) - avg = None - num = args.num - assert num == len(path_list) - for path in path_list: - print('Processing {}'.format(path)) - states = torch.load(path, map_location=torch.device('cpu')) - if avg is None: - avg = states - else: - for k in avg.keys(): - avg[k] += states[k] - # average - for k in avg.keys(): - if avg[k] is not None: - # pytorch 1.6 use true_divide instead of /= - avg[k] = torch.true_divide(avg[k], num) - print('Saving to {}'.format(args.dst_model)) - torch.save(avg, args.dst_model) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/export_jit.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/export_jit.py deleted file mode 100644 index b2e5864e8382235c1cc800484ba5031ae22f3bd9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/export_jit.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import argparse -import os - -import torch -import yaml - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_file', default=None, help='output file') - parser.add_argument('--output_quant_file', - default=None, - help='output quantized model file') - args = parser.parse_args() - return args - - -def main(): - args = get_args() - # No need gpu for model export - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - model = init_model(configs) - print(model) - - load_checkpoint(model, args.checkpoint) - # Export jit torch script model - - if args.output_file: - script_model = torch.jit.script(model) - script_model.save(args.output_file) - print('Export model successfully, see {}'.format(args.output_file)) - - # Export quantized jit torch script model - if args.output_quant_file: - quantized_model = torch.quantization.quantize_dynamic( - model, {torch.nn.Linear}, dtype=torch.qint8 - ) - print(quantized_model) - script_quant_model = torch.jit.script(quantized_model) - script_quant_model.save(args.output_quant_file) - print('Export quantized model successfully, ' - 'see {}'.format(args.output_quant_file)) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/export_onnx_bpu.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/export_onnx_bpu.py deleted file mode 100644 index 6462a69506f10778d08faae5fcf3067ad43d38bd..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/export_onnx_bpu.py +++ /dev/null @@ -1,1019 +0,0 @@ -# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NOTE(xcsong): Currently, we only support -1. specific conformer encoder architecture, see: - encoder: conformer - encoder_conf: - activation_type: **must be** relu - attention_heads: 2 or 4 or 8 or any number divisible by output_size - causal: **must be** true - cnn_module_kernel: 1 ~ 7 - cnn_module_norm: **must be** batch_norm - input_layer: **must be** conv2d8 - linear_units: 1 ~ 2048 - normalize_before: **must be** true - num_blocks: 1 ~ 12 - output_size: 1 ~ 512 - pos_enc_layer_type: **must be** no_pos - selfattention_layer_type: **must be** selfattn - use_cnn_module: **must be** true - use_dynamic_chunk: **must be** true - use_dynamic_left_chunk: **must be** true - -2. 
specific decoding method: ctc_greedy_search -""" - - -from __future__ import print_function - -import os -import sys -import copy -import math -import yaml -import logging -from typing import Tuple - -import torch -import numpy as np - -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model -from wenet.bin.export_onnx_cpu import (get_args, to_numpy, - print_input_output_info) - - -try: - import onnx - import onnxruntime -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class BPULayerNorm(torch.nn.Module): - """Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" - def __init__(self, module, chunk_size=8, run_on_bpu=False): - super().__init__() - original = copy.deepcopy(module) - self.hidden = module.weight.size(0) - self.chunk_size = chunk_size - self.run_on_bpu = run_on_bpu - - if self.run_on_bpu: - self.weight = torch.nn.Parameter( - module.weight.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.bias = torch.nn.Parameter( - module.bias.reshape(1, self.hidden, 1, 1).repeat( - 1, 1, 1, chunk_size)) - self.negtive = torch.nn.Parameter( - torch.ones((1, self.hidden, 1, chunk_size)) * -1.0) - self.eps = torch.nn.Parameter( - torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps) - self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_1.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) - self.mean_conv_2.weight = torch.nn.Parameter( - torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden)) - else: - self.norm = module - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.hidden) - orig_out = module(random_data) - new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) - np.testing.assert_allclose( - to_numpy(orig_out), to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.run_on_bpu: - u = self.mean_conv_1(x) # (1, h, 1, c) - numerator = x + u * self.negtive # (1, h, 1, c) - s = torch.pow(numerator, 2) # (1, h, 1, c) - s = self.mean_conv_2(s) # (1, h, 1, c) - denominator = torch.sqrt(s + self.eps) # (1, h, 1, c) - x = torch.div(numerator, denominator) # (1, h, 1, c) - x = x * self.weight + self.bias - else: - x = x.squeeze(2).transpose(1, 2).contiguous() - x = self.norm(x) - x = x.transpose(1, 2).contiguous().unsqueeze(2) - return x - - -class BPUIdentity(torch.nn.Module): - """Refactor torch.nn.Identity(). - For inserting BPU node whose input == output. - """ - def __init__(self, channels): - super().__init__() - self.channels = channels - self.identity_conv = torch.nn.Conv2d( - channels, channels, 1, groups=channels, bias=False) - torch.nn.init.dirac_( - self.identity_conv.weight.data, groups=channels) - - self.check_equal() - - def check_equal(self): - random_data = torch.randn(1, self.channels, 1, 10) - result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(random_data), to_numpy(result), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Identity with 4-D dataflow, input == output. 
- Args: - x (torch.Tensor): (batch, in_channel, 1, time) - - Returns: - (torch.Tensor): (batch, in_channel, 1, time). - """ - return self.identity_conv(x) - - -class BPULinear(torch.nn.Module): - """Refactor torch.nn.Linear or pointwise_conv""" - def __init__(self, module, is_pointwise_conv=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.weight.size(1) - self.odim = module.weight.size(0) - self.is_pointwise_conv = is_pointwise_conv - - # Modify weight & bias - self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) - if is_pointwise_conv: - # (odim, idim, kernel=1) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(-1)) - else: - # (odim, idim) -> (odim, idim, 1, 1) - self.linear.weight = torch.nn.Parameter( - module.weight.unsqueeze(2).unsqueeze(3)) - self.linear.bias = module.bias - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.idim) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = module(random_data) - if self.is_pointwise_conv: - random_data = random_data.transpose(1, 2) - original_result = original_result.transpose(1, 2) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Linear with 4-D dataflow. - Args: - x (torch.Tensor): (batch, in_channel, 1, time) - Returns: - (torch.Tensor): (batch, out_channel, 1, time). - """ - return self.linear(x) - - -class BPUGlobalCMVN(torch.nn.Module): - """Refactor wenet/transformer/cmvn.py::GlobalCMVN""" - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - self.norm_var = module.norm_var - - # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1) - self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """CMVN with 4-D dataflow. - Args: - x (torch.Tensor): (batch, 1, mel_dim, time) - Returns: - (torch.Tensor): normalized feature with same shape. - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x - - -class BPUConv2dSubsampling8(torch.nn.Module): - """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 - - NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.right_context = module.right_context - self.subsampling_rate = module.subsampling_rate - assert isinstance(module.pos_enc, NoPositionalEncoding) - - # 1. Modify self.conv - # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim) - # to (1, 1, mel_dim, frames) for more efficient computation. - self.conv = module.conv - for idx in [0, 2, 4]: - self.conv[idx].weight = torch.nn.Parameter( - module.conv[idx].weight.transpose(2, 3) - ) - - # 2. 
Modify self.linear - # NOTE(xcsong): Split final projection to meet the requirment of - # maximum kernel_size (7 for XJ3) - self.linear = torch.nn.ModuleList() - odim = module.linear.weight.size(0) # 512, in this case - freq = module.linear.weight.size(1) // odim # 4608 // 512 == 9 - self.odim, self.freq = odim, freq - weight = module.linear.weight.reshape( - odim, odim, freq, 1) # (odim, odim * freq) -> (odim, odim, freq, 1) - self.split_size = [] - num_split = (freq - 1) // 7 + 1 # XJ3 requires kernel_size <= 7 - slice_begin = 0 - for idx in range(num_split): - kernel_size = min(freq, (idx + 1) * 7) - idx * 7 - conv_ele = torch.nn.Conv2d( - odim, odim, (kernel_size, 1), (kernel_size, 1)) - conv_ele.weight = torch.nn.Parameter( - weight[:, :, slice_begin:slice_begin + kernel_size, :] - ) - conv_ele.bias = torch.nn.Parameter( - torch.zeros_like(conv_ele.bias) - ) - self.linear.append(conv_ele) - self.split_size.append(kernel_size) - slice_begin += kernel_size - self.linear[0].bias = torch.nn.Parameter(module.linear.bias) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 67, 80) - mask = torch.zeros(1, 1, 67) - original_result, _, _ = module(random_data, mask) # (1, 8, 512) - random_data = random_data.transpose(1, 2).unsqueeze(0) # (1, 1, 80, 67) - new_result = self.forward(random_data) # (1, 512, 1, 8) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Subsample x with 4-D dataflow. - Args: - x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), - where time' = time // 8. - """ - x = self.conv(x) # (1, odim, freq, time') - x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) - x = torch.split(x, self.split_size, dim=2) - for idx, (x_part, layer) in enumerate(zip(x, self.linear)): - x_out += layer(x_part) - return x_out - - -class BPUMultiHeadedAttention(torch.nn.Module): - """Refactor wenet/transformer/attention.py::MultiHeadedAttention - - NOTE(xcsong): Only support attention_class == MultiHeadedAttention, - we do not consider RelPositionMultiHeadedAttention currently. - """ - def __init__(self, module, chunk_size, left_chunks): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.d_k = module.d_k - self.h = module.h - n_feat = self.d_k * self.h - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.time = chunk_size * (left_chunks + 1) - self.activation = torch.nn.Softmax(dim=-1) - - # 1. Modify self.linear_x - self.linear_q = BPULinear(module.linear_q) - self.linear_k = BPULinear(module.linear_k) - self.linear_v = BPULinear(module.linear_v) - self.linear_out = BPULinear(module.linear_out) - # 2. 
denom - self.register_buffer( - "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k))) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) - mask = torch.ones((1, self.h, self.chunk_size, self.time), - dtype=torch.bool) - cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, - self.d_k * 2) - original_out, original_cache = module( - random_data, random_data, random_data, - mask[:, 0, :, :], torch.empty(0), cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.reshape(1, self.h, self.d_k * 2, - self.chunk_size * self.left_chunks) - new_out, new_cache = self.forward( - random_data, random_data, random_data, mask, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - - def forward( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - mask: torch.Tensor, cache: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. - - Args: - q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). - k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). - v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). - mask (torch.Tensor): Mask tensor, - (#batch, head, chunk_size, cache_t + chunk_size). - cache (torch.Tensor): Cache tensor - (1, head, d_k * 2, cache_t), - where `cache_t == chunk_size * left_chunks`. - - - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: Cache tensor - (1, head, d_k * 2, cache_t + chunk_size) - where `cache_t == chunk_size * left_chunks` - """ - # 1. Forward QKV - q = self.linear_q(q) # (1, d, 1, c) d == size, c == chunk_size - k = self.linear_k(k) # (1, d, 1, c) - v = self.linear_v(v) # (1, d, 1, c) - q = q.view(1, self.h, self.d_k, self.chunk_size) - k = k.view(1, self.h, self.d_k, self.chunk_size) - v = v.view(1, self.h, self.d_k, self.chunk_size) - q = q.transpose(2, 3) # (batch, head, time1, d_k) - k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) - k = torch.cat((k_cache, k), dim=3) - v = torch.cat((v_cache, v), dim=3) - new_cache = torch.cat((k, v), dim=2) - # 2. (Q^T)K - scores = torch.matmul(q, k) * self.denom # (#b, n_head, time1, time2) - # 3. Forward attention - mask = mask.eq(0) - scores = scores.masked_fill(mask, -float('inf')) - attn = self.activation(scores).masked_fill(mask, 0.0) - attn = attn.transpose(2, 3) - x = torch.matmul(v, attn) - x = x.view(1, self.d_k * self.h, 1, self.chunk_size) - x_out = self.linear_out(x) - return x_out, new_cache - - -class BPUConvolution(torch.nn.Module): - """Refactor wenet/transformer/convolution.py::ConvolutionModule - - NOTE(xcsong): Only suport use_layer_norm == False - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.lorder = module.lorder - self.use_layer_norm = False - self.activation = module.activation - channels = module.pointwise_conv1.weight.size(1) - self.channels = channels - kernel_size = module.depthwise_conv.weight.size(2) - assert module.use_layer_norm is False - - # 1. Modify self.pointwise_conv1 - self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) - - # 2. 
Modify self.depthwise_conv - self.depthwise_conv = torch.nn.Conv2d( - channels, channels, (1, kernel_size), - stride=1, groups=channels) - self.depthwise_conv.weight = torch.nn.Parameter( - module.depthwise_conv.weight.unsqueeze(-2)) - self.depthwise_conv.bias = torch.nn.Parameter( - module.depthwise_conv.bias) - - # 3. Modify self.norm, Only support batchnorm2d - self.norm = torch.nn.BatchNorm2d(channels) - self.norm.training = False - self.norm.num_features = module.norm.num_features - self.norm.eps = module.norm.eps - self.norm.momentum = module.norm.momentum - self.norm.weight = torch.nn.Parameter(module.norm.weight) - self.norm.bias = torch.nn.Parameter(module.norm.bias) - self.norm.running_mean = module.norm.running_mean - self.norm.running_var = module.norm.running_var - - # 4. Modify self.pointwise_conv2 - self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) - - # 5. Identity conv, for running `concat` on BPU - self.identity = BPUIdentity(channels) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.channels) - cache = torch.zeros((1, self.channels, self.lorder)) - original_out, original_cache = module(random_data, cache=cache) - random_data = random_data.transpose(1, 2).unsqueeze(2) - cache = cache.unsqueeze(2) - new_out, new_cache = self.forward(random_data, cache) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cache), - to_numpy(new_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, 1, cache_t). - Returns: - torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). - torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). - """ - # Concat cache - x = torch.cat((self.identity(cache), self.identity(x)), dim=3) - new_cache = x[:, :, :, -self.lorder:] - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, 1, dim) - x = torch.nn.functional.glu(x, dim=1) # (b, channel, 1, dim) - - # Depthwise Conv - x = self.depthwise_conv(x) - x = self.activation(self.norm(x)) - x = self.pointwise_conv2(x) - return x, new_cache - - -class BPUFFN(torch.nn.Module): - """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.activation = module.activation - - # 1. Modify self.w_x - self.w_1 = BPULinear(module.w_1) - self.w_2 = BPULinear(module.w_2) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 8, self.w_1.idim) - original_out = module(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_out = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_out), - to_numpy(new_out.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Forward function. 
- - Args: - xs: input tensor (B, D, 1, L) - Returns: - output tensor, (B, D, 1, L) - """ - return self.w_2(self.activation(self.w_1(x))) - - -class BPUConformerEncoderLayer(torch.nn.Module): - """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.size = module.size - assert module.normalize_before is True - assert module.concat_after is False - - # 1. Modify submodules - self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) - self.self_attn = BPUMultiHeadedAttention( - module.self_attn, chunk_size, left_chunks) - self.conv_module = BPUConvolution(module.conv_module) - self.feed_forward = BPUFFN(module.feed_forward) - - # 2. Modify norms - self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) - self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, ln_run_on_bpu) - self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, - chunk_size, ln_run_on_bpu) - self.norm_conv = BPULayerNorm(module.norm_conv, - chunk_size, ln_run_on_bpu) - self.norm_final = BPULayerNorm(module.norm_final, - chunk_size, ln_run_on_bpu) - - # 3. 4-D ff_scale - self.register_buffer( - "ff_scale", torch.full((1, self.size, 1, 1), module.ff_scale)) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.self_attn.chunk_size - time2 = self.self_attn.time - h, d_k = self.self_attn.h, self.self_attn.d_k - random_x = torch.randn(1, time1, self.size) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) - original_x, _, original_att_cache, original_cnn_cache = module( - random_x, att_mask[:, 0, :, :], torch.empty(0), - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.transpose(1, 2).unsqueeze(2) - att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.unsqueeze(2) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_mask, att_cache, cnn_cache - ) - np.testing.assert_allclose( - to_numpy(original_att_cache), - to_numpy(new_att_cache.transpose(2, 3)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(original_cnn_cache), - to_numpy(new_cnn_cache.squeeze(2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, x: torch.Tensor, att_mask: torch.Tensor, - att_cache: torch.Tensor, cnn_cache: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, size, 1, chunk_size) - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, d_k * 2, cache_t1), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, 1, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, size, 1, chunk_size). - torch.Tensor: att_cache tensor, - (1, head, d_k * 2, cache_t1 + chunk_size). - torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2). - """ - # 1. ffn_macaron - residual = x - x = self.norm_ff_macron(x) - x = residual + self.ff_scale * self.feed_forward_macaron(x) - - # 2. 
attention - residual = x - x = self.norm_mha(x) - x_att, new_att_cache = self.self_attn( - x, x, x, att_mask, att_cache) - x = residual + x_att - - # 3. convolution - residual = x - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, cnn_cache) - x = residual + x - - # 4. ffn - residual = x - x = self.norm_ff(x) - x = residual + self.ff_scale * self.feed_forward(x) - - # 5. final post-norm - x = self.norm_final(x) - - return x, new_att_cache, new_cnn_cache - - -class BPUConformerEncoder(torch.nn.Module): - """Refactor wenet/transformer/encoder.py::ConformerEncoder - """ - def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - output_size = module.output_size() - self._output_size = module.output_size() - self.after_norm = module.after_norm - self.chunk_size = chunk_size - self.left_chunks = left_chunks - self.head = module.encoders[0].self_attn.h - self.layers = len(module.encoders) - - # 1. Modify submodules - self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) - self.embed = BPUConv2dSubsampling8(module.embed) - self.encoders = torch.nn.ModuleList() - for layer in module.encoders: - self.encoders.append(BPUConformerEncoderLayer( - layer, chunk_size, left_chunks, ln_run_on_bpu)) - - # 2. Auxiliary conv - self.identity_cnncache = BPUIdentity(output_size) - - self.check_equal(original) - - def check_equal(self, module): - time1 = self.encoders[0].self_attn.chunk_size - time2 = self.encoders[0].self_attn.time - layers = self.layers - h, d_k = self.head, self.encoders[0].self_attn.d_k - decoding_window = (self.chunk_size - 1) * \ - module.embed.subsampling_rate + \ - module.embed.right_context + 1 - lorder = self.encoders[0].conv_module.lorder - random_x = torch.randn(1, decoding_window, 80) - att_mask = torch.ones(1, h, time1, time2) - att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) - cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) - orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( - random_x, 0, time2 - time1, att_mask=att_mask[:, 0, :, :], - att_cache=att_cache, cnn_cache=cnn_cache - ) - random_x = random_x.unsqueeze(0) - att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) - cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) - new_x, new_att_cache, new_cnn_cache = self.forward( - random_x, att_cache, cnn_cache, att_mask - ) - caches = torch.split(new_att_cache, h, dim=1) - caches = [c.transpose(2, 3) for c in caches] - np.testing.assert_allclose( - to_numpy(orig_att_cache), - to_numpy(torch.cat(caches, dim=0)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_x), - to_numpy(new_x.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - np.testing.assert_allclose( - to_numpy(orig_cnn_cache), - to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward( - self, xs: torch.Tensor, att_cache: torch.Tensor, - cnn_cache: torch.Tensor, att_mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (1, head * elayers, d_k * 2, cache_t1), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * left_chunks`. 
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (1, hidden-dim, elayers, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask (torch.Tensor): Mask tensor for the input - (#batch, head, chunk_size, cache_t1 + chunk_size), - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, hidden-dim, 1, chunk_size). - torch.Tensor: new attention cache required for next chunk, with - same shape as the original att_cache. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - """ - # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time) - xs = xs.transpose(2, 3) - xs = self.global_cmvn(xs) - # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size) - xs = self.embed(xs) - - att_cache = torch.split(att_cache, self.head, dim=1) - cnn_cache = self.identity_cnncache(cnn_cache) - cnn_cache = torch.split(cnn_cache, 1, dim=2) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - xs, new_att_cache, new_cnn_cache = layer( - xs, att_mask, att_cache=att_cache[i], cnn_cache=cnn_cache[i]) - r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:]) - r_cnn_cache.append(new_cnn_cache) - r_att_cache = torch.cat(r_att_cache, dim=1) - r_cnn_cache = self.identity_cnncache( - torch.cat(r_cnn_cache, dim=2)) - - xs = xs.squeeze(2).transpose(1, 2).contiguous() - xs = self.after_norm(xs) - # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input. - xs = xs.transpose(1, 2).contiguous().unsqueeze(2) # (B, C, 1, T) - - return (xs, r_att_cache, r_cnn_cache) - - -class BPUCTC(torch.nn.Module): - """Refactor wenet/transformer/ctc.py::CTC - """ - def __init__(self, module): - super().__init__() - # Unchanged submodules and attributes - original = copy.deepcopy(module) - self.idim = module.ctc_lo.weight.size(1) - num_class = module.ctc_lo.weight.size(0) - - # 1. Modify self.ctc_lo, Split final projection to meet the - # requirment of maximum in/out channels (2048 for XJ3) - self.ctc_lo = torch.nn.ModuleList() - self.split_size = [] - num_split = (num_class - 1) // 2048 + 1 - for idx in range(num_split): - out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 - conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) - self.ctc_lo.append(conv_ele) - self.split_size.append(out_channel) - orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) - orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) - for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): - w = w.unsqueeze(2).unsqueeze(3) - self.ctc_lo[i].weight = torch.nn.Parameter(w) - self.ctc_lo[i].bias = torch.nn.Parameter(b) - - self.check_equal(original) - - def check_equal(self, module): - random_data = torch.randn(1, 100, self.idim) - original_result = module.ctc_lo(random_data) - random_data = random_data.transpose(1, 2).unsqueeze(2) - new_result = self.forward(random_data) - np.testing.assert_allclose( - to_numpy(original_result), - to_numpy(new_result.squeeze(2).transpose(1, 2)), - rtol=1e-02, atol=1e-03) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """frame activations, without softmax. 
- - Args: - Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) - Returns: - torch.Tensor: (B, num_class, 1, chunk_size) - """ - out = [] - for i, layer in enumerate(self.ctc_lo): - out.append(layer(x)) - out = torch.cat(out, dim=1) - return out - - -def export_encoder(asr_model, args): - logger.info("Stage-1: export encoder") - decode_window, mel_dim = args.decoding_window, args.feature_size - encoder = BPUConformerEncoder( - asr_model.encoder, args.chunk_size, args.num_decoding_left_chunks, - args.ln_run_on_bpu) - encoder.eval() - encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx') - - logger.info("Stage-1.1: prepare inputs for encoder") - chunk = torch.randn((1, 1, decode_window, mel_dim)) - required_cache_size = encoder.chunk_size * encoder.left_chunks - kv_time = required_cache_size + encoder.chunk_size - hidden, layers = encoder._output_size, len(encoder.encoders) - head = encoder.encoders[0].self_attn.h - d_k = hidden // head - lorder = encoder.encoders[0].conv_module.lorder - att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) - att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) - att_mask[:, :, :, :required_cache_size] = 0 - cnn_cache = torch.zeros((1, hidden, layers, lorder)) - inputs = (chunk, att_cache, cnn_cache, att_mask) - logger.info("chunk.size(): {} att_cache.size(): {} " - "cnn_cache.size(): {} att_mask.size(): {}".format( - list(chunk.size()), list(att_cache.size()), - list(cnn_cache.size()), list(att_mask.size()))) - - logger.info("Stage-1.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask" - attributes['output_name'] = "output;r_att_cache;r_cnn_cache" - attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap" - attributes['norm_type'] = \ - "no_preprocess;no_preprocess;no_preprocess;no_preprocess" - attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW" - attributes['input_shape'] = \ - "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( - chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3), - att_cache.size(0), att_cache.size(1), att_cache.size(2), - att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1), - cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0), - att_mask.size(1), att_mask.size(2), att_mask.size(3) - ) - torch.onnx.export( # NOTE(xcsong): only support opset==11 - encoder, inputs, encoder_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=attributes['input_name'].split(';'), - output_names=attributes['output_name'].split(';'), - dynamic_axes=None, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for k in vars(args): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - logger.info('Export onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - logger.info("Stage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - for i in range(10): - logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" - ", att_mask: {}".format( - i, list(torch_chunk.size()), - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), - list(torch_att_mask.size()))) - torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_output = torch.cat(torch_output, dim=-1) - - onnx_output = [] - onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - logger.info("onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," - " att_mask: {}".format( - i, onnx_chunk.shape, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'att_cache': onnx_att_cache, - 'cnn_cache': onnx_cnn_cache, 'att_mask': onnx_att_mask, - } - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_output = np.concatenate(onnx_output, axis=-1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_encoder, pass!") - return encoder, ort_session - - -def export_ctc(asr_model, args): - logger.info("Stage-2: export ctc") - ctc = BPUCTC(asr_model.ctc).eval() - ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx') - - logger.info("Stage-2.1: prepare inputs for ctc") - hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) - - logger.info("Stage-2.2: torch.onnx.export") - # NOTE(xcsong): Below attributes will be used in - # onnx2horizonbin.py::generate_config() - attributes = {} - attributes['input_name'], attributes['input_type'] = "hidden", "featuremap" - attributes['norm_type'] = "no_preprocess" - attributes['input_layout_train'] = "NCHW" - attributes['input_layout_rt'] = "NCHW" - attributes['input_shape'] = "{}x{}x{}x{}".format( - hidden.size(0), hidden.size(1), hidden.size(2), hidden.size(3), - ) - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=11, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=None, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for k in vars(args): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(getattr(args, k)) - for k in attributes: - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(attributes[k]) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - logger.info('Export onnx_ctc, done! 
see {}'.format(ctc_outpath)) - - logger.info("Stage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-04) - meta = ort_session.get_modelmeta() - logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) - logger.info("Check onnx_ctc, pass!") - return ctc, ort_session - - -def export_decoder(asr_model, args): - logger.info("Currently, Decoder is not supported.") - - -if __name__ == '__main__': - torch.manual_seed(777) - args = get_args() - args.ln_run_on_bpu = False - # NOTE(xcsong): XJ3 BPU only support static shapes - assert args.chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - os.system("mkdir -p " + args.output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - args.feature_size = configs['input_dim'] - args.output_size = model.encoder.output_size() - args.decoding_window = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 - - export_encoder(model, args) - export_ctc(model, args) - export_decoder(model, args) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/export_onnx_cpu.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/export_onnx_cpu.py deleted file mode 100644 index a8009d2f606f753a5870eb754235d8d55e756b5d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/export_onnx_cpu.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright (c) 2022, Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
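A brief aside before the body of this deleted script: both the BPU exporter above and the CPU exporter whose removal starts here size their streaming inputs from the same decoding-window formula, `(chunk_size - 1) * subsampling_rate + right_context + 1`. A minimal numeric sketch follows; the `subsampling_rate` and `right_context` values are illustrative assumptions, not taken from any shipped config.

```python
# Quick numeric sketch of the chunk bookkeeping shared by these export scripts.
# subsampling_rate and right_context are assumed values; the real ones come
# from the subsampling front-end configured in train.yaml.
chunk_size = 16              # decoding chunk size (output frames per chunk)
left_chunks = 4              # number of cached left chunks ("16/4" mode)
subsampling_rate = 4         # assumed 4x Conv2d subsampling
right_context = 6            # assumed right context of that subsampling layer

decoding_window = (chunk_size - 1) * subsampling_rate + right_context + 1
required_cache_size = chunk_size * left_chunks

print(decoding_window)       # 67 input feature frames fed per chunk
print(required_cache_size)   # 64 cached key/value frames kept between chunks
```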
- -from __future__ import print_function - -import argparse -import os -import copy -import sys - -import torch -import yaml -import numpy as np - -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.init_model import init_model - -try: - import onnx - import onnxruntime - from onnxruntime.quantization import quantize_dynamic, QuantType -except ImportError: - print('Please install onnx and onnxruntime!') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='export your script model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--output_dir', required=True, help='output directory') - parser.add_argument('--chunk_size', required=True, - type=int, help='decoding chunk size') - parser.add_argument('--num_decoding_left_chunks', required=True, - type=int, help='cache chunks') - parser.add_argument('--reverse_weight', default=0.5, - type=float, help='reverse_weight in attention_rescoing') - args = parser.parse_args() - return args - - -def to_numpy(tensor): - if tensor.requires_grad: - return tensor.detach().cpu().numpy() - else: - return tensor.cpu().numpy() - - -def print_input_output_info(onnx_model, name, prefix="\t\t"): - input_names = [node.name for node in onnx_model.graph.input] - input_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.input] - output_names = [node.name for node in onnx_model.graph.output] - output_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] - for node in onnx_model.graph.output] - print("{}{} inputs : {}".format(prefix, name, input_names)) - print("{}{} input shapes : {}".format(prefix, name, input_shapes)) - print("{}{} outputs: {}".format(prefix, name, output_names)) - print("{}{} output shapes : {}".format(prefix, name, output_shapes)) - - -def export_encoder(asr_model, args): - print("Stage-1: export encoder") - encoder = asr_model.encoder - encoder.forward = encoder.forward_chunk - encoder_outpath = os.path.join(args['output_dir'], 'encoder.onnx') - - print("\tStage-1.1: prepare inputs for encoder") - chunk = torch.randn( - (args['batch'], args['decoding_window'], args['feature_size'])) - offset = 0 - # NOTE(xcsong): The uncertainty of `next_cache_start` only appears - # in the first few chunks, this is caused by dynamic att_cache shape, i,e - # (0, 0, 0, 0) for 1st chunk and (elayers, head, ?, d_k*2) for subsequent - # chunks. One way to ease the ONNX export is to keep `next_cache_start` - # as a fixed value. To do this, for the **first** chunk, if - # left_chunks > 0, we feed real cache & real mask to the model, otherwise - # fake cache & fake mask. In this way, we get: - # 1. 16/-1 mode: next_cache_start == 0 for all chunks - # 2. 16/4 mode: next_cache_start == chunk_size for all chunks - # 3. 16/0 mode: next_cache_start == chunk_size for all chunks - # 4. -1/-1 mode: next_cache_start == 0 for all chunks - # NO MORE DYNAMIC CHANGES!! - # - # NOTE(Mddct): We retain the current design for the convenience of supporting some - # inference frameworks without dynamic shapes. 
If you're interested in all-in-one - # model that supports different chunks please see: - # https://github.com/wenet-e2e/wenet/pull/1174 - - if args['left_chunks'] > 0: # 16/4 - required_cache_size = args['chunk_size'] * args['left_chunks'] - offset = required_cache_size - # Real cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], required_cache_size, - args['output_size'] // args['head'] * 2)) - # Real mask - att_mask = torch.ones( - (args['batch'], 1, required_cache_size + args['chunk_size']), - dtype=torch.bool) - att_mask[:, :, :required_cache_size] = 0 - elif args['left_chunks'] <= 0: # 16/-1, -1/-1, 16/0 - required_cache_size = -1 if args['left_chunks'] < 0 else 0 - # Fake cache - att_cache = torch.zeros( - (args['num_blocks'], args['head'], 0, - args['output_size'] // args['head'] * 2)) - # Fake mask - att_mask = torch.ones((0, 0, 0), dtype=torch.bool) - cnn_cache = torch.zeros( - (args['num_blocks'], args['batch'], - args['output_size'], args['cnn_module_kernel'] - 1)) - inputs = (chunk, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - print("\t\tchunk.size(): {}\n".format(chunk.size()), - "\t\toffset: {}\n".format(offset), - "\t\trequired_cache: {}\n".format(required_cache_size), - "\t\tatt_cache.size(): {}\n".format(att_cache.size()), - "\t\tcnn_cache.size(): {}\n".format(cnn_cache.size()), - "\t\tatt_mask.size(): {}\n".format(att_mask.size())) - - print("\tStage-1.2: torch.onnx.export") - dynamic_axes = { - 'chunk': {1: 'T'}, - 'att_cache': {2: 'T_CACHE'}, - 'att_mask': {2: 'T_ADD_T_CACHE'}, - 'output': {1: 'T'}, - 'r_att_cache': {2: 'T_CACHE'}, - } - # NOTE(xcsong): We keep dynamic axes even if in 16/4 mode, this is - # to avoid padding the last chunk (which usually contains less - # frames than required). For users who want static axes, just pop - # out specific axis. - # if args['chunk_size'] > 0: # 16/4, 16/-1, 16/0 - # dynamic_axes.pop('chunk') - # dynamic_axes.pop('output') - # if args['left_chunks'] >= 0: # 16/4, 16/0 - # # NOTE(xsong): since we feed real cache & real mask into the - # # model when left_chunks > 0, the shape of cache will never - # # be changed. - # dynamic_axes.pop('att_cache') - # dynamic_axes.pop('r_att_cache') - torch.onnx.export( - encoder, inputs, encoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=[ - 'chunk', 'offset', 'required_cache_size', - 'att_cache', 'cnn_cache', 'att_mask' - ], - output_names=['output', 'r_att_cache', 'r_cnn_cache'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_encoder = onnx.load(encoder_outpath) - for (k, v) in args.items(): - meta = onnx_encoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_encoder) - onnx.helper.printable_graph(onnx_encoder.graph) - # NOTE(xcsong): to add those metadatas we need to reopen - # the file and resave it. - onnx.save(onnx_encoder, encoder_outpath) - print_input_output_info(onnx_encoder, "onnx_encoder") - # Dynamic quantization - model_fp32 = encoder_outpath - model_quant = os.path.join(args['output_dir'], 'encoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_encoder, done! 
see {}'.format(encoder_outpath)) - - print("\tStage-1.3: check onnx_encoder and torch_encoder") - torch_output = [] - torch_chunk = copy.deepcopy(chunk) - torch_offset = copy.deepcopy(offset) - torch_required_cache_size = copy.deepcopy(required_cache_size) - torch_att_cache = copy.deepcopy(att_cache) - torch_cnn_cache = copy.deepcopy(cnn_cache) - torch_att_mask = copy.deepcopy(att_mask) - for i in range(10): - print("\t\ttorch chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, list(torch_chunk.size()), torch_offset, - list(torch_att_cache.size()), - list(torch_cnn_cache.size()), list(torch_att_mask.size()))) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - torch_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - out, torch_att_cache, torch_cnn_cache = encoder( - torch_chunk, torch_offset, torch_required_cache_size, - torch_att_cache, torch_cnn_cache, torch_att_mask) - torch_output.append(out) - torch_offset += out.size(1) - torch_output = torch.cat(torch_output, dim=1) - - onnx_output = [] - onnx_chunk = to_numpy(chunk) - onnx_offset = np.array((offset)).astype(np.int64) - onnx_required_cache_size = np.array((required_cache_size)).astype(np.int64) - onnx_att_cache = to_numpy(att_cache) - onnx_cnn_cache = to_numpy(cnn_cache) - onnx_att_mask = to_numpy(att_mask) - ort_session = onnxruntime.InferenceSession(encoder_outpath) - input_names = [node.name for node in onnx_encoder.graph.input] - for i in range(10): - print("\t\tonnx chunk-{}: {}, offset: {}, att_cache: {}," - " cnn_cache: {}, att_mask: {}".format( - i, onnx_chunk.shape, onnx_offset, onnx_att_cache.shape, - onnx_cnn_cache.shape, onnx_att_mask.shape)) - # NOTE(xsong): att_mask of the first few batches need changes if - # we use 16/4 mode. - if args['left_chunks'] > 0: # 16/4 - onnx_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 - ort_inputs = { - 'chunk': onnx_chunk, 'offset': onnx_offset, - 'required_cache_size': onnx_required_cache_size, - 'att_cache': onnx_att_cache, 'cnn_cache': onnx_cnn_cache, - 'att_mask': onnx_att_mask - } - # NOTE(xcsong): If we use 16/-1, -1/-1 or 16/0 mode, `next_cache_start` - # will be hardcoded to 0 or chunk_size by ONNX, thus - # required_cache_size and att_mask are no more needed and they will - # be removed by ONNX automatically. 
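As the note above points out, constant folding can remove graph inputs that the 16/-1, -1/-1, and 16/0 modes no longer need, which is why the feed dict is filtered before `session.run`. A minimal sketch of the same idea driven from the session itself; `encoder.onnx` is a placeholder path, not a claim about where the file actually lives.

```python
import onnxruntime as ort

# When torch.onnx.export folds an unused input (e.g. required_cache_size)
# into constants, that input vanishes from the graph, and onnxruntime will
# reject a feed dict that still contains it. Query the live inputs instead.
session = ort.InferenceSession("encoder.onnx")          # placeholder path
live_inputs = {i.name for i in session.get_inputs()}

def trim_feed(feed: dict) -> dict:
    """Drop feed entries the exported graph no longer accepts."""
    return {k: v for k, v in feed.items() if k in live_inputs}
```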
- for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - ort_outs = ort_session.run(None, ort_inputs) - onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] - onnx_output.append(ort_outs[0]) - onnx_offset += ort_outs[0].shape[1] - onnx_output = np.concatenate(onnx_output, axis=1) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output, - rtol=1e-03, atol=1e-05) - meta = ort_session.get_modelmeta() - print("\t\tcustom_metadata_map={}".format(meta.custom_metadata_map)) - print("\t\tCheck onnx_encoder, pass!") - - -def export_ctc(asr_model, args): - print("Stage-2: export ctc") - ctc = asr_model.ctc - ctc.forward = ctc.log_softmax - ctc_outpath = os.path.join(args['output_dir'], 'ctc.onnx') - - print("\tStage-2.1: prepare inputs for ctc") - hidden = torch.randn( - (args['batch'], args['chunk_size'] if args['chunk_size'] > 0 else 16, - args['output_size'])) - - print("\tStage-2.2: torch.onnx.export") - dynamic_axes = {'hidden': {1: 'T'}, 'probs': {1: 'T'}} - torch.onnx.export( - ctc, hidden, ctc_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hidden'], output_names=['probs'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_ctc = onnx.load(ctc_outpath) - for (k, v) in args.items(): - meta = onnx_ctc.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_ctc) - onnx.helper.printable_graph(onnx_ctc.graph) - onnx.save(onnx_ctc, ctc_outpath) - print_input_output_info(onnx_ctc, "onnx_ctc") - # Dynamic quantization - model_fp32 = ctc_outpath - model_quant = os.path.join(args['output_dir'], 'ctc.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_ctc, done! see {}'.format(ctc_outpath)) - - print("\tStage-2.3: check onnx_ctc and torch_ctc") - torch_output = ctc(hidden) - ort_session = onnxruntime.InferenceSession(ctc_outpath) - onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) - - np.testing.assert_allclose(to_numpy(torch_output), onnx_output[0], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_ctc, pass!") - - -def export_decoder(asr_model, args): - print("Stage-3: export decoder") - decoder = asr_model - # NOTE(lzhin): parameters of encoder will be automatically removed - # since they are not used during rescoring. - decoder.forward = decoder.forward_attention_decoder - decoder_outpath = os.path.join(args['output_dir'], 'decoder.onnx') - - print("\tStage-3.1: prepare inputs for decoder") - # hardcode time->200 nbest->10 len->20, they are dynamic axes. 
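One pattern used throughout this script is easy to miss: `forward` is re-bound to the method that should actually be traced (`forward_chunk`, `log_softmax`, `forward_attention_decoder`) before calling `torch.onnx.export`. A self-contained toy illustration of that trick is below; the `Toy` module is hypothetical and stands in for the wenet modules.

```python
import torch

class Toy(torch.nn.Module):
    """Hypothetical module with a training path and an export path."""
    def forward(self, x):                    # default entry point
        return x * 2

    def export_path(self, x):                # what we actually want in ONNX
        return torch.nn.functional.log_softmax(x, dim=-1)

model = Toy().eval()
# Same trick as `ctc.forward = ctc.log_softmax` above: torch.onnx.export
# traces whatever `model(...)` dispatches to, and an instance attribute
# shadows the class-level forward method.
model.forward = model.export_path
torch.onnx.export(model, torch.randn(1, 8), "toy.onnx",
                  input_names=["hidden"], output_names=["probs"],
                  opset_version=13)
```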
- encoder_out = torch.randn((1, 200, args['output_size'])) - hyps = torch.randint(low=0, high=args['vocab_size'], - size=[10, 20]) - hyps[:, 0] = args['vocab_size'] - 1 # - hyps_lens = torch.randint(low=15, high=21, size=[10]) - - print("\tStage-3.2: torch.onnx.export") - dynamic_axes = { - 'hyps': {0: 'NBEST', 1: 'L'}, 'hyps_lens': {0: 'NBEST'}, - 'encoder_out': {1: 'T'}, - 'score': {0: 'NBEST', 1: 'L'}, 'r_score': {0: 'NBEST', 1: 'L'} - } - inputs = (hyps, hyps_lens, encoder_out, args['reverse_weight']) - torch.onnx.export( - decoder, inputs, decoder_outpath, opset_version=13, - export_params=True, do_constant_folding=True, - input_names=['hyps', 'hyps_lens', 'encoder_out', 'reverse_weight'], - output_names=['score', 'r_score'], - dynamic_axes=dynamic_axes, verbose=False) - onnx_decoder = onnx.load(decoder_outpath) - for (k, v) in args.items(): - meta = onnx_decoder.metadata_props.add() - meta.key, meta.value = str(k), str(v) - onnx.checker.check_model(onnx_decoder) - onnx.helper.printable_graph(onnx_decoder.graph) - onnx.save(onnx_decoder, decoder_outpath) - print_input_output_info(onnx_decoder, "onnx_decoder") - model_fp32 = decoder_outpath - model_quant = os.path.join(args['output_dir'], 'decoder.quant.onnx') - quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) - print('\t\tExport onnx_decoder, done! see {}'.format( - decoder_outpath)) - - print("\tStage-3.3: check onnx_decoder and torch_decoder") - torch_score, torch_r_score = decoder( - hyps, hyps_lens, encoder_out, args['reverse_weight']) - ort_session = onnxruntime.InferenceSession(decoder_outpath) - input_names = [node.name for node in onnx_decoder.graph.input] - ort_inputs = { - 'hyps': to_numpy(hyps), - 'hyps_lens': to_numpy(hyps_lens), - 'encoder_out': to_numpy(encoder_out), - 'reverse_weight': np.array((args['reverse_weight'])), - } - for k in list(ort_inputs): - if k not in input_names: - ort_inputs.pop(k) - onnx_output = ort_session.run(None, ort_inputs) - - np.testing.assert_allclose(to_numpy(torch_score), onnx_output[0], - rtol=1e-03, atol=1e-05) - if args['is_bidirectional_decoder'] and args['reverse_weight'] > 0.0: - np.testing.assert_allclose(to_numpy(torch_r_score), onnx_output[1], - rtol=1e-03, atol=1e-05) - print("\t\tCheck onnx_decoder, pass!") - - -def main(): - torch.manual_seed(777) - args = get_args() - output_dir = args.output_dir - os.system("mkdir -p " + output_dir) - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - print(model) - - arguments = {} - arguments['output_dir'] = output_dir - arguments['batch'] = 1 - arguments['chunk_size'] = args.chunk_size - arguments['left_chunks'] = args.num_decoding_left_chunks - arguments['reverse_weight'] = args.reverse_weight - arguments['output_size'] = configs['encoder_conf']['output_size'] - arguments['num_blocks'] = configs['encoder_conf']['num_blocks'] - arguments['cnn_module_kernel'] = configs['encoder_conf'].get('cnn_module_kernel', 1) - arguments['head'] = configs['encoder_conf']['attention_heads'] - arguments['feature_size'] = configs['input_dim'] - arguments['vocab_size'] = configs['output_dim'] - # NOTE(xcsong): if chunk_size == -1, hardcode to 67 - arguments['decoding_window'] = (args.chunk_size - 1) * \ - model.encoder.embed.subsampling_rate + \ - model.encoder.embed.right_context + 1 if args.chunk_size > 0 else 67 - arguments['encoder'] = configs['encoder'] - 
arguments['decoder'] = configs['decoder'] - arguments['subsampling_rate'] = model.subsampling_rate() - arguments['right_context'] = model.right_context() - arguments['sos_symbol'] = model.sos_symbol() - arguments['eos_symbol'] = model.eos_symbol() - arguments['is_bidirectional_decoder'] = 1 \ - if model.is_bidirectional_decoder() else 0 - - # NOTE(xcsong): Please note that -1/-1 means non-streaming model! It is - # not a [16/4 16/-1 16/0] all-in-one model and it should not be used in - # streaming mode (i.e., setting chunk_size=16 in `decoder_main`). If you - # want to use 16/-1 or any other streaming mode in `decoder_main`, - # please export onnx in the same config. - if arguments['left_chunks'] > 0: - assert arguments['chunk_size'] > 0 # -1/4 not supported - - export_encoder(model, arguments) - export_ctc(model, arguments) - export_decoder(model, arguments) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/export_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/export_onnx_gpu.py deleted file mode 100644 index 19f810c2804efdf74ff369f780fa3102e2e389fa..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/export_onnx_gpu.py +++ /dev/null @@ -1,1056 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import os -import sys - -import torch -import yaml -import logging - -import torch.nn.functional as F -from wenet.utils.checkpoint import load_checkpoint -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import BaseEncoder -from wenet.utils.init_model import init_model -from wenet.utils.mask import make_pad_mask - -try: - import onnxruntime -except ImportError: - print('Please install onnxruntime-gpu!') - sys.exit(1) - -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -class Encoder(torch.nn.Module): - def __init__(self, - encoder: BaseEncoder, - ctc: CTC, - beam_size: int = 10): - super().__init__() - self.encoder = encoder - self.ctc = ctc - self.beam_size = beam_size - - def forward(self, speech: torch.Tensor, - speech_lengths: torch.Tensor,): - """Encoder - Args: - speech: (Batch, Length, ...) 
- speech_lengths: (Batch, ) - Returns: - encoder_out: B x T x F - encoder_out_lens: B - ctc_log_probs: B x T x V - beam_log_probs: B x T x beam_size - beam_log_probs_idx: B x T x beam_size - """ - encoder_out, encoder_mask = self.encoder(speech, - speech_lengths, - -1, -1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_log_probs = self.ctc.log_softmax(encoder_out) - encoder_out_lens = encoder_out_lens.int() - beam_log_probs, beam_log_probs_idx = torch.topk( - ctc_log_probs, self.beam_size, dim=2) - return encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx - - -class StreamingEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size, transformer=False): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.transformer = transformer - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoder.encoders): - xs, _, new_att_cache, new_cnn_cache = layer( - xs, masks, pos_emb, - att_cache=att_cache[i], - cnn_cache=cnn_cache[i]) - # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (B, hidden-dim, cache_t2) - r_att_cache.append( - new_att_cache[:, :, next_cache_start:, :].unsqueeze(1)) - if not self.transformer: - r_cnn_cache.append(new_cnn_cache.unsqueeze(1)) - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - if not self.transformer: - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingSqueezeformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - self.reduce_idx = model.encoder.reduce_idx - self.recover_idx = model.encoder.recover_idx - if self.reduce_idx is None: - self.time_reduce = None - else: - if self.recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - 
recover_exp)) - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - required_cache_size (int): cache size required for next chunk - compuation - > 0: actual cache size - <= 0: not allowed in streaming gpu encoder ` - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, b, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - elayers, cache_size = att_cache.size(0), att_cache.size(3) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = att_mask[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.encoder.preln(xs) - for i, layer in enumerate(self.encoder.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append( - (xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.encoder.time_reduction_layer( - xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - if self.encoder.pos_enc_layer_type == "rel_pos_repaired": - pos_emb = 
pos_emb[:, :xs.size(1) * 2 - 1, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.encoder.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(1) - cached_att = cached_att.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1)) - r_cnn_cache.append(cached_cnn) - - chunk_out = xs - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class StreamingEfficientConformerEncoder(torch.nn.Module): - def __init__(self, model, required_cache_size, beam_size): - super().__init__() - self.ctc = model.ctc - self.subsampling_rate = model.encoder.embed.subsampling_rate - self.embed = model.encoder.embed - self.global_cmvn = model.encoder.global_cmvn - self.required_cache_size = required_cache_size - self.beam_size = beam_size - self.encoder = model.encoder - - # Efficient Conformer - self.stride_layer_idx = model.encoder.stride_layer_idx - self.stride = model.encoder.stride - self.num_blocks = model.encoder.num_blocks - self.cnn_module_kernel = model.encoder.cnn_module_kernel - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask): - """Streaming Encoder - Args: - chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - chunk_lens (torch.Tensor): - offset (torch.Tensor): offset with shape (b, 1) - 1 is retained for triton deployment - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (b, elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * 
num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (b, elayers, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) - in a batch of request, each request may have different - history cache. Cache mask is used to indidate the effective - cache for each request - Returns: - torch.Tensor: log probabilities of ctc output and cutoff by beam size - with shape (b, chunk_size, beam) - torch.Tensor: index of top beam size probabilities for each timestep - with shape (b, chunk_size, beam) - torch.Tensor: output of current input xs, - with shape (b, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - same shape (b, elayers, head, cache_t1, d_k * 2) - as the original att_cache - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - torch.Tensor: new cache mask, with same shape as the original - cache mask - """ - offset = offset.squeeze(1) # (b, ) - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - T = chunk_xs.size(1) - chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T) - # B X 1 X T - chunk_mask = chunk_mask.to(chunk_xs.dtype) - # transpose batch & num_layers dim - # Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2) - # Shape(cnn_cache): (elayers, b, outsize, cnn_kernel) - att_cache = torch.transpose(att_cache, 0, 1) - cnn_cache = torch.transpose(cnn_cache, 0, 1) - - # rewrite encoder.forward_chunk - # <---------forward_chunk START---------> - xs = self.global_cmvn(chunk_xs) - # chunk mask is important for batch inferencing since - # different sequence in a batch has different length - xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) - cache_size = att_cache.size(3) # required cache size - masks = torch.cat((cache_mask, chunk_mask), dim=2) - att_mask = torch.cat((cache_mask, chunk_mask), dim=2) - index = offset - cache_size - - pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) - pos_emb = pos_emb.to(dtype=xs.dtype) - - next_cache_start = -self.required_cache_size - r_cache_mask = masks[:, :, next_cache_start:] - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = chunk_mask.to(torch.bool) - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoder.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - # We propose to double the chunk_size. 
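Since the cache slicing that follows hinges on `calculate_downsampling_factor`, a tiny standalone illustration may help; the `stride_layer_idx` and `stride` values are made up for the example, not taken from any particular efficient-conformer config.

```python
# Standalone version of calculate_downsampling_factor from the class above.
stride_layer_idx = [3, 7]   # assumed: layers after which the time axis is strided
stride = [2, 2]             # assumed: stride applied at each of those layers

def downsampling_factor(i: int) -> int:
    factor = 1
    for idx, stride_idx in enumerate(stride_layer_idx):
        if i > stride_idx:
            factor *= stride[idx]
    return factor

print([downsampling_factor(i) for i in range(10)])
# [1, 1, 1, 1, 2, 2, 2, 2, 4, 4]
# Deeper layers cover the same time span with `factor` times fewer frames,
# which is why the cached attention keys/values are sliced with `::factor`
# and later re-expanded with repeat(...).flatten(...) further down.
```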
- att_cache_trunc = xs.size(1) + \ - att_cache.size(3) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i][:, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [batch, 1, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID - - # use repeat_interleave to new_att_cache - # new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - new_att_cache = new_att_cache.unsqueeze(3). \ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :].unsqueeze(1)) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.encoder.normalize_before: - chunk_out = self.encoder.after_norm(xs) - else: - chunk_out = xs - - # shape of r_att_cache: (b, elayers, head, time2, outdim) - r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx - # shape of r_cnn_cache: (b, elayers, outdim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers - - # <---------forward_chunk END---------> - - log_ctc_probs = self.ctc.log_softmax(chunk_out) - log_probs, log_probs_idx = torch.topk(log_ctc_probs, - self.beam_size, - dim=2) - log_probs = log_probs.to(chunk_xs.dtype) - - r_offset = offset + chunk_out.shape[1] - # the below ops not supported in Tensorrt - # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, - # rounding_mode='floor') - chunk_out_lens = chunk_lens // self.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - chunk_out_lens += 1 - r_offset = r_offset.unsqueeze(1) - - return log_probs, log_probs_idx, chunk_out, chunk_out_lens, \ - r_offset, r_att_cache, r_cnn_cache, r_cache_mask - - -class Decoder(torch.nn.Module): - def __init__(self, - decoder: TransformerDecoder, - ctc_weight: float = 0.5, - reverse_weight: float = 0.0, - beam_size: int = 10, - decoder_fastertransformer: bool = False): - super().__init__() - self.decoder = decoder - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - self.beam_size = beam_size - self.decoder_fastertransformer = decoder_fastertransformer - - def forward(self, - encoder_out: torch.Tensor, - encoder_lens: torch.Tensor, - hyps_pad_sos_eos: torch.Tensor, - hyps_lens_sos: torch.Tensor, - r_hyps_pad_sos_eos: torch.Tensor, - ctc_score: torch.Tensor): - """Encoder - Args: - encoder_out: B x T x F - encoder_lens: B - hyps_pad_sos_eos: B x beam x (T2+1), - hyps with sos & eos and padded by ignore id - hyps_lens_sos: B x beam, length for each hyp with sos - r_hyps_pad_sos_eos: B 
x beam x (T2+1), - reversed hyps with sos & eos and padded by ignore id - ctc_score: B x beam, ctc score for each hyp - Returns: - decoder_out: B x beam x T2 x V - r_decoder_out: B x beam x T2 x V - best_index: B - """ - B, T, F = encoder_out.shape - bz = self.beam_size - B2 = B * bz - encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F) - encoder_mask = ~make_pad_mask(encoder_lens, T).unsqueeze(1) - encoder_mask = encoder_mask.repeat(1, bz, 1).view(B2, 1, T) - T2 = hyps_pad_sos_eos.shape[2] - 1 - hyps_pad = hyps_pad_sos_eos.view(B2, T2 + 1) - hyps_lens = hyps_lens_sos.view(B2,) - hyps_pad_sos = hyps_pad[:, :-1].contiguous() - hyps_pad_eos = hyps_pad[:, 1:].contiguous() - - r_hyps_pad = r_hyps_pad_sos_eos.view(B2, T2 + 1) - r_hyps_pad_sos = r_hyps_pad[:, :-1].contiguous() - r_hyps_pad_eos = r_hyps_pad[:, 1:].contiguous() - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad_sos, hyps_lens, r_hyps_pad_sos, - self.reverse_weight) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - V = decoder_out.shape[-1] - decoder_out = decoder_out.view(B2, T2, V) - mask = ~make_pad_mask(hyps_lens, T2) # B2 x T2 - # mask index, remove ignore id - index = torch.unsqueeze(hyps_pad_eos * mask, 2) - score = decoder_out.gather(2, index).squeeze(2) # B2 X T2 - # mask padded part - score = score * mask - decoder_out = decoder_out.view(B, bz, T2, V) - if self.reverse_weight > 0: - r_decoder_out = torch.nn.functional.log_softmax( - r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.view(B2, T2, V) - index = torch.unsqueeze(r_hyps_pad_eos * mask, 2) - r_score = r_decoder_out.gather(2, index).squeeze(2) - r_score = r_score * mask - score = score * (1 - self.reverse_weight) + \ - self.reverse_weight * r_score - r_decoder_out = r_decoder_out.view(B, bz, T2, V) - score = torch.sum(score, axis=1) # B2 - score = torch.reshape(score, (B, bz)) + self.ctc_weight * ctc_score - best_index = torch.argmax(score, dim=1) - if self.decoder_fastertransformer: - return decoder_out, best_index - else: - return best_index - - -def to_numpy(tensors): - out = [] - if type(tensors) == torch.tensor: - tensors = [tensors] - for tensor in tensors: - if tensor.requires_grad: - tensor = tensor.detach().cpu().numpy() - else: - tensor = tensor.cpu().numpy() - out.append(tensor) - return out - - -def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True): - for a, b in zip(xlist, blist): - try: - torch.testing.assert_allclose(a, b, rtol=rtol, atol=atol) - except AssertionError as error: - if tolerate_small_mismatch: - print(error) - else: - raise - - -def export_offline_encoder(model, configs, args, logger, encoder_onnx_path): - bz = 32 - seq_len = 100 - beam_size = args.beam_size - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint( - low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - dynamic_axes={ - 'speech': {0: 'B', 1: 'T'}, - 'speech_lengths': {0: 'B'}, - 'encoder_out': {0: 'B', 1: 'T_OUT'}, - 'encoder_out_lens': {0: 'B'}, - 'ctc_log_probs': {0: 'B', 1: 'T_OUT'}, - 'beam_log_probs': {0: 'B', 1: 
'T_OUT'}, - 'beam_log_probs_idx': {0: 'B', 1: 'T_OUT'}, - }, - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - -def export_offline_encoder_static(model, configs, args, logger, encoder_onnx_path): - bz = args.batch_size - seq_len = args.seq_len - beam_size = args.beam_size - - feature_size = configs["input_dim"] - - speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) - speech_lens = torch.randint(low=10, high=seq_len, size=(bz,), dtype=torch.int32) - encoder = Encoder(model.encoder, model.ctc, beam_size) - encoder.eval() - import os - file_name, file_ext = os.path.splitext(encoder_onnx_path) - encoder_onnx_path = file_name + "_bs" + str(bz) + "_seq" + str(seq_len) + "_static.onnx" - - torch.onnx.export(encoder, - (speech, speech_lens), - encoder_onnx_path, - export_params=True, - opset_version=11, - do_constant_folding=True, - input_names=['speech', 'speech_lengths'], - output_names=['encoder_out', 'encoder_out_lens', - 'ctc_log_probs', - 'beam_log_probs', 'beam_log_probs_idx'], - verbose=False - ) - - with torch.no_grad(): - o0, o1, o2, o3, o4 = encoder(speech, speech_lens) - - providers = ["CPUExecutionProvider"] - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=providers) - ort_inputs = {'speech': to_numpy(speech), - 'speech_lengths': to_numpy(speech_lens)} - ort_outs = ort_session.run(None, ort_inputs) - - # check encoder output - test(to_numpy([o0, o1, o2, o3, o4]), ort_outs) - logger.info("export offline onnx encoder succeed!") - onnx_config = {"beam_size": args.beam_size, - "reverse_weight": args.reverse_weight, - "ctc_weight": args.ctc_weight, - "fp16": args.fp16} - return onnx_config - - -def export_online_encoder(model, configs, args, logger, encoder_onnx_path): - decoding_chunk_size = args.decoding_chunk_size - subsampling = model.encoder.embed.subsampling_rate - context = model.encoder.embed.right_context + 1 - decoding_window = (decoding_chunk_size - 1) * subsampling + context - batch_size = 32 - audio_len = decoding_window - feature_size = configs["input_dim"] - output_size = configs["encoder_conf"]["output_size"] - num_layers = configs["encoder_conf"]["num_blocks"] - # in transformer the cnn module will not be available - transformer = False - cnn_module_kernel = configs["encoder_conf"].get("cnn_module_kernel", 1) - 1 - if not cnn_module_kernel: - transformer = True - num_decoding_left_chunks = args.num_decoding_left_chunks - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if configs['encoder'] == 'squeezeformer': - encoder = StreamingSqueezeformerEncoder( - model, required_cache_size, args.beam_size) - elif configs['encoder'] == 'efficientConformer': - encoder = StreamingEfficientConformerEncoder( - model, required_cache_size, args.beam_size) - else: - encoder = StreamingEncoder( - model, required_cache_size, args.beam_size, transformer) - encoder.eval() - - # begin to export encoder - chunk_xs = 
torch.randn(batch_size, audio_len, - feature_size, dtype=torch.float32) - chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len - - offset = torch.arange(0, batch_size).unsqueeze(1) - # (elayers, b, head, cache_t1, d_k * 2) - head = configs["encoder_conf"]["attention_heads"] - d_k = configs["encoder_conf"]["output_size"] // head - att_cache = torch.randn(batch_size, num_layers, head, - required_cache_size, d_k * 2, - dtype=torch.float32) - cnn_cache = torch.randn(batch_size, num_layers, output_size, - cnn_module_kernel, dtype=torch.float32) - - cache_mask = torch.ones( - batch_size, 1, required_cache_size, dtype=torch.float32) - input_names = ['chunk_xs', 'chunk_lens', 'offset', - 'att_cache', 'cnn_cache', 'cache_mask'] - output_names = ['log_probs', 'log_probs_idx', 'chunk_out', - 'chunk_out_lens', 'r_offset', 'r_att_cache', - 'r_cnn_cache', 'r_cache_mask'] - input_tensors = (chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - output_names.pop(6) - - all_names = input_names + output_names - dynamic_axes = {} - for name in all_names: - # only the first dimension is dynamic - # all other dimension is fixed - dynamic_axes[name] = {0: 'B'} - - torch.onnx.export(encoder, - input_tensors, - encoder_onnx_path, - export_params=True, - opset_version=14, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - verbose=False) - - with torch.no_grad(): - torch_outs = encoder(chunk_xs, chunk_lens, offset, - att_cache, cnn_cache, cache_mask) - if transformer: - torch_outs = list(torch_outs).pop(6) - ort_session = onnxruntime.InferenceSession(encoder_onnx_path, - providers=["CUDAExecutionProvider"]) - ort_inputs = {} - - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - if transformer: - del ort_inputs['cnn_cache'] - ort_outs = ort_session.run(None, ort_inputs) - test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx streaming encoder succeed!") - onnx_config = { - "subsampling_rate": subsampling, - "context": context, - "decoding_chunk_size": decoding_chunk_size, - "num_decoding_left_chunks": num_decoding_left_chunks, - "beam_size": args.beam_size, - "fp16": args.fp16, - "feat_size": feature_size, - "decoding_window": decoding_window, - "cnn_module_kernel_cache": cnn_module_kernel - } - return onnx_config - - -def export_rescoring_decoder(model, configs, args, - logger, decoder_onnx_path, decoder_fastertransformer): - bz, seq_len = 32, 100 - beam_size = args.beam_size - decoder = Decoder(model.decoder, - model.ctc_weight, - model.reverse_weight, - beam_size, - decoder_fastertransformer) - decoder.eval() - - hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - hyps_lens_sos = torch.randint(low=3, high=seq_len, size=(bz, beam_size), - dtype=torch.int32) - r_hyps_pad_sos_eos = torch.randint( - low=3, high=1000, size=(bz, beam_size, seq_len)) - - output_size = configs["encoder_conf"]["output_size"] - encoder_out = torch.randn(bz, seq_len, output_size, dtype=torch.float32) - encoder_out_lens = torch.randint( - low=3, high=seq_len, size=(bz,), dtype=torch.int32) - ctc_score = torch.randn(bz, beam_size, dtype=torch.float32) - - input_names = ['encoder_out', 'encoder_out_lens', - 'hyps_pad_sos_eos', 'hyps_lens_sos', - 'r_hyps_pad_sos_eos', 'ctc_score'] - output_names = ['best_index'] - if decoder_fastertransformer: - output_names.insert(0, 'decoder_out') - - 
torch.onnx.export(decoder, - (encoder_out, encoder_out_lens, - hyps_pad_sos_eos, hyps_lens_sos, - r_hyps_pad_sos_eos, ctc_score), - decoder_onnx_path, - export_params=True, - opset_version=13, - do_constant_folding=True, - input_names=input_names, - output_names=output_names, - dynamic_axes={'encoder_out': {0: 'B', 1: 'T'}, - 'encoder_out_lens': {0: 'B'}, - 'hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'hyps_lens_sos': {0: 'B'}, - 'r_hyps_pad_sos_eos': {0: 'B', 2: 'T2'}, - 'ctc_score': {0: 'B'}, - 'best_index': {0: 'B'}, - }, - verbose=False - ) - with torch.no_grad(): - o0 = decoder(encoder_out, - encoder_out_lens, - hyps_pad_sos_eos, - hyps_lens_sos, - r_hyps_pad_sos_eos, - ctc_score) - providers = ["CUDAExecutionProvider"] - ort_session = onnxruntime.InferenceSession(decoder_onnx_path, - providers=providers) - - input_tensors = [encoder_out, encoder_out_lens, hyps_pad_sos_eos, - hyps_lens_sos, r_hyps_pad_sos_eos, ctc_score] - ort_inputs = {} - input_tensors = to_numpy(input_tensors) - for idx, name in enumerate(input_names): - ort_inputs[name] = input_tensors[idx] - - # if model.reverse weight == 0, - # the r_hyps_pad will be removed - # from the onnx decoder since it doen't play any role - if model.reverse_weight == 0: - del ort_inputs['r_hyps_pad_sos_eos'] - ort_outs = ort_session.run(None, ort_inputs) - - # check decoder output - if decoder_fastertransformer: - test(to_numpy(o0), ort_outs, rtol=1e-03, atol=1e-05) - else: - test(to_numpy([o0]), ort_outs, rtol=1e-03, atol=1e-05) - logger.info("export to onnx decoder succeed!") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='export x86_gpu model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--cmvn_file', required=False, default='', type=str, - help='global_cmvn file, default path is in config file') - parser.add_argument('--reverse_weight', default=-1.0, type=float, - required=False, - help='reverse weight for bitransformer,' + - 'default value is in config file') - parser.add_argument('--ctc_weight', default=-1.0, type=float, - required=False, - help='ctc weight, default value is in config file') - parser.add_argument('--batch_size', type=int, default=24, help='encoder batch size') - parser.add_argument('--seq_len', default=512, type=int, required=False, - help="Encoder seq_len") - parser.add_argument('--beam_size', default=10, type=int, required=False, - help="beam size would be ctc output size") - parser.add_argument('--output_onnx_dir', - default="onnx_model", - help='output onnx encoder and decoder directory') - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - # arguments for streaming encoder - parser.add_argument('--streaming', - action='store_true', - help="whether to export streaming encoder, default false") - parser.add_argument('--decoding_chunk_size', - default=16, - type=int, - required=False, - help='the decoding chunk size, <=0 is not supported') - parser.add_argument('--num_decoding_left_chunks', - default=5, - type=int, - required=False, - help="number of left chunks, <= 0 is not supported") - parser.add_argument('--decoder_fastertransformer', - action='store_true', - help='return decoder_out and best_index for ft') - args = parser.parse_args() - - torch.manual_seed(0) - torch.set_printoptions(precision=10) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if 
args.cmvn_file and os.path.exists(args.cmvn_file): - configs['cmvn_file'] = args.cmvn_file - if args.reverse_weight != -1.0 and 'reverse_weight' in configs['model_conf']: - configs['model_conf']['reverse_weight'] = args.reverse_weight - print("Update reverse weight to", args.reverse_weight) - if args.ctc_weight != -1: - print("Update ctc weight to ", args.ctc_weight) - configs['model_conf']['ctc_weight'] = args.ctc_weight - configs["encoder_conf"]["use_dynamic_chunk"] = False - - model = init_model(configs) - load_checkpoint(model, args.checkpoint) - model.eval() - - if not os.path.exists(args.output_onnx_dir): - os.mkdir(args.output_onnx_dir) - encoder_onnx_path = os.path.join(args.output_onnx_dir, 'encoder.onnx') - export_enc_func = None - if args.streaming: - assert args.decoding_chunk_size > 0 - assert args.num_decoding_left_chunks > 0 - export_enc_func = export_online_encoder - else: - export_enc_func = export_offline_encoder_static - - onnx_config = export_enc_func( - model, configs, args, logger, encoder_onnx_path) - - decoder_onnx_path = os.path.join(args.output_onnx_dir, 'decoder.onnx') - export_rescoring_decoder(model, configs, args, logger, - decoder_onnx_path, args.decoder_fastertransformer) - - if args.fp16: - try: - import onnxmltools - from onnxmltools.utils.float16_converter import convert_float_to_float16 - except ImportError: - print('Please install onnxmltools!') - sys.exit(1) - encoder_onnx_model = onnxmltools.utils.load_model(encoder_onnx_path) - encoder_onnx_model = convert_float_to_float16(encoder_onnx_model) - encoder_onnx_path = os.path.join( - args.output_onnx_dir, 'encoder_fp16.onnx') - onnxmltools.utils.save_model(encoder_onnx_model, encoder_onnx_path) - decoder_onnx_model = onnxmltools.utils.load_model(decoder_onnx_path) - decoder_onnx_model = convert_float_to_float16(decoder_onnx_model) - decoder_onnx_path = os.path.join( - args.output_onnx_dir, 'decoder_fp16.onnx') - onnxmltools.utils.save_model(decoder_onnx_model, decoder_onnx_path) - # dump configurations - - config_dir = os.path.join(args.output_onnx_dir, "config.yaml") - with open(config_dir, "w") as out: - yaml.dump(onnx_config, out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/recognize.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/recognize.py deleted file mode 100644 index 03b5dfd42cc098efacd20e08756a5300f6477cc1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/recognize.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
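The FP16 branch of the export script above hands the job to onnxmltools' float16 converter. As a rough standalone sketch of that step only (the `.onnx` paths below are placeholders for illustration, not files shipped with this patch), the same conversion can be run on any exported encoder/decoder pair:

```python
# Minimal sketch of the onnxmltools FP16 conversion used in the export script above.
# Paths are placeholders; adjust them to wherever export_onnx_gpu.py wrote its outputs.
import onnxmltools
from onnxmltools.utils.float16_converter import convert_float_to_float16


def to_fp16(src_path: str, dst_path: str) -> None:
    """Load an FP32 ONNX graph, cast its weights/activations to FP16, and save it."""
    model = onnxmltools.utils.load_model(src_path)
    model_fp16 = convert_float_to_float16(model)
    onnxmltools.utils.save_model(model_fp16, dst_path)


if __name__ == "__main__":
    to_fp16("onnx_model/encoder.onnx", "onnx_model/encoder_fp16.onnx")
    to_fp16("onnx_model/decoder.onnx", "onnx_model/decoder_fp16.onnx")
```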
- -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--beam_size', - type=int, - default=10, - help='beam size for search') - parser.add_argument('--penalty', - type=float, - default=0.0, - help='length penalty') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=16, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'attention', 'ctc_greedy_search', - 'ctc_prefix_beam_search', 'attention_rescoring', - 'rnnt_greedy_search', 'rnnt_beam_search', - 'rnnt_beam_attn_rescoring', 'ctc_beam_td_attn_rescoring', - 'hlg_onebest', 'hlg_rescore' - ], - default='attention', - help='decoding mode') - - parser.add_argument('--search_ctc_weight', - type=float, - default=1.0, - help='ctc weight for nbest generation') - parser.add_argument('--search_transducer_weight', - type=float, - default=0.0, - help='transducer weight for nbest generation') - parser.add_argument('--ctc_weight', - type=float, - default=0.0, - help='ctc weight for rescoring weight in \ - attention rescoring decode mode \ - ctc weight for rescoring weight in \ - transducer attention rescore decode mode') - - parser.add_argument('--transducer_weight', - type=float, - default=0.0, - help='transducer weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--attn_weight', - type=float, - default=0.0, - help='attention weight for rescoring weight in transducer \ - attention rescore mode') - parser.add_argument('--decoding_chunk_size', - type=int, - default=-1, - help='''decoding chunk size, - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here''') - parser.add_argument('--num_decoding_left_chunks', - type=int, - default=-1, - help='number of left chunks for decoding') - parser.add_argument('--simulate_streaming', - action='store_true', - help='simulate streaming inference') - parser.add_argument('--reverse_weight', - type=float, - default=0.0, - help='''right to left weight for attention rescoring - decode mode''') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--connect_symbol', - default='', - type=str, - help='used to connect the output characters') - - parser.add_argument('--word', - default='', - type=str, - help='word file, only used for hlg decode') - parser.add_argument('--hlg', - default='', - type=str, - help='hlg file, only used for hlg decode') - parser.add_argument('--lm_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - parser.add_argument('--r_decoder_scale', - type=float, - default=0.0, - help='lm scale for hlg attention rescore decode') - - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' - ] and args.batch_size > 1: - logging.fatal( - 'decoding mode {} must be running with batch_size == 1'.format( - args.mode)) - sys.exit(1) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_sub'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - if 'fbank_conf' in test_conf: - test_conf['fbank_conf']['dither'] = 0.0 - elif 'mfcc_conf' in test_conf: - test_conf['mfcc_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - - # Init asr model from configs - model = init_model(configs) - - # Load dict - char_dict = {v: k for k, v in symbol_table.items()} - eos = len(char_dict) - 1 - - load_checkpoint(model, args.checkpoint) - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - model.eval() - with torch.no_grad(), open(args.result_file, 'w') as fout: - for batch_idx, 
batch in enumerate(test_data_loader): - keys, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - if args.mode == 'attention': - hyps, _ = model.recognize( - feats, - feats_lengths, - beam_size=args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp.tolist() for hyp in hyps] - elif args.mode == 'ctc_greedy_search': - hyps, _ = model.ctc_greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_greedy_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.greedy_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - elif args.mode == 'rnnt_beam_search': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.beam_search( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.search_ctc_weight, - transducer_weight=args.search_transducer_weight) - elif args.mode == 'rnnt_beam_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight) - elif args.mode == 'ctc_beam_td_attn_rescoring': - assert (feats.size(0) == 1) - assert 'predictor' in configs - hyps = model.transducer_attention_rescoring( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - beam_size=args.beam_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - ctc_weight=args.ctc_weight, - transducer_weight=args.transducer_weight, - attn_weight=args.attn_weight, - reverse_weight=args.reverse_weight, - search_ctc_weight=args.search_ctc_weight, - search_transducer_weight=args.search_transducer_weight, - beam_search_type='ctc') - # ctc_prefix_beam_search and attention_rescoring only return one - # result in List[int], change it to List[List[int]] for compatible - # with other batch decoding mode - elif args.mode == 'ctc_prefix_beam_search': - assert (feats.size(0) == 1) - hyp, _ = model.ctc_prefix_beam_search( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming) - hyps = [hyp] - elif args.mode == 'attention_rescoring': - assert (feats.size(0) == 1) - hyp, _ = model.attention_rescoring( - feats, - feats_lengths, - args.beam_size, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - 
ctc_weight=args.ctc_weight, - simulate_streaming=args.simulate_streaming, - reverse_weight=args.reverse_weight) - hyps = [hyp] - elif args.mode == 'hlg_onebest': - hyps = model.hlg_onebest( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - elif args.mode == 'hlg_rescore': - hyps = model.hlg_rescore( - feats, - feats_lengths, - decoding_chunk_size=args.decoding_chunk_size, - num_decoding_left_chunks=args.num_decoding_left_chunks, - simulate_streaming=args.simulate_streaming, - lm_scale=args.lm_scale, - decoder_scale=args.decoder_scale, - r_decoder_scale=args.r_decoder_scale, - hlg=args.hlg, - word=args.word, - symbol_table=symbol_table) - for i, key in enumerate(keys): - content = [] - for w in hyps[i]: - if w == eos: - break - content.append(char_dict[w]) - logging.info('{} {}'.format(key, args.connect_symbol.join(content))) - fout.write('{} {}\n'.format(key, args.connect_symbol.join(content))) - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/recognize_onnx_gpu.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/recognize_onnx_gpu.py deleted file mode 100644 index 42f403bf55ac0bc51d9c754574d3479345948122..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/recognize_onnx_gpu.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is for testing exported onnx encoder and decoder from -export_onnx_gpu.py. The exported onnx models only support batch offline ASR inference. -It requires a python wrapped c++ ctc decoder. 
-Please install it by following: -https://github.com/Slyne/ctc_decoder.git -""" -from __future__ import print_function - -import argparse -import copy -import logging -import os -import sys - -import torch -import yaml -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.common import IGNORE_ID -from wenet.utils.file_utils import read_symbol_table -from wenet.utils.config import override_config - -import onnxruntime as rt -import multiprocessing -import numpy as np - -try: - from swig_decoders import map_batch, \ - ctc_beam_search_decoder_batch, \ - TrieVector, PathTrie -except ImportError: - print('Please install ctc decoders first by refering to\n' + - 'https://github.com/Slyne/ctc_decoder.git') - sys.exit(1) - - -def get_args(): - parser = argparse.ArgumentParser(description='recognize with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--test_data', required=True, help='test data file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument('--encoder_onnx', required=True, help='encoder onnx file') - parser.add_argument('--decoder_onnx', required=True, help='decoder onnx file') - parser.add_argument('--result_file', required=True, help='asr result file') - parser.add_argument('--batch_size', - type=int, - default=32, - help='asr result file') - parser.add_argument('--mode', - choices=[ - 'ctc_greedy_search', 'ctc_prefix_beam_search', - 'attention_rescoring'], - default='attention_rescoring', - help='decoding mode') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument('--fp16', - action='store_true', - help='whether to export fp16 model, default false') - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - reverse_weight = configs["model_conf"].get("reverse_weight", 0.0) - symbol_table = read_symbol_table(args.dict) - test_conf = copy.deepcopy(configs['dataset_conf']) - test_conf['filter_conf']['max_length'] = 102400 - test_conf['filter_conf']['min_length'] = 0 - test_conf['filter_conf']['token_max_length'] = 102400 - test_conf['filter_conf']['token_min_length'] = 0 - test_conf['filter_conf']['max_output_input_ratio'] = 102400 - test_conf['filter_conf']['min_output_input_ratio'] = 0 - test_conf['speed_perturb'] = False - test_conf['spec_aug'] = False - test_conf['spec_trim'] = False - test_conf['shuffle'] = False - test_conf['sort'] = False - test_conf['fbank_conf']['dither'] = 0.0 - test_conf['batch_conf']['batch_type'] = "static" - test_conf['batch_conf']['batch_size'] = args.batch_size - - test_dataset = Dataset(args.data_type, - args.test_data, - symbol_table, - test_conf, - args.bpe_model, - partition=False) - - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - 
- # Init asr model from configs - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - if use_cuda: - EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - else: - EP_list = ['CPUExecutionProvider'] - - encoder_ort_session = rt.InferenceSession(args.encoder_onnx, providers=EP_list) - decoder_ort_session = None - if args.mode == "attention_rescoring": - decoder_ort_session = rt.InferenceSession(args.decoder_onnx, providers=EP_list) - - # Load dict - vocabulary = [] - char_dict = {} - with open(args.dict, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - char_dict[int(arr[1])] = arr[0] - vocabulary.append(arr[0]) - eos = sos = len(char_dict) - 1 - with torch.no_grad(), open(args.result_file, 'w') as fout: - for _, batch in enumerate(test_data_loader): - keys, feats, _, feats_lengths, _ = batch - feats, feats_lengths = feats.numpy(), feats_lengths.numpy() - if args.fp16: - feats = feats.astype(np.float16) - ort_inputs = { - encoder_ort_session.get_inputs()[0].name: feats, - encoder_ort_session.get_inputs()[1].name: feats_lengths} - ort_outs = encoder_ort_session.run(None, ort_inputs) - encoder_out, encoder_out_lens, ctc_log_probs, \ - beam_log_probs, beam_log_probs_idx = ort_outs - beam_size = beam_log_probs.shape[-1] - batch_size = beam_log_probs.shape[0] - num_processes = min(multiprocessing.cpu_count(), batch_size) - if args.mode == 'ctc_greedy_search': - if beam_size != 1: - log_probs_idx = beam_log_probs_idx[:, :, 0] - batch_sents = [] - for idx, seq in enumerate(log_probs_idx): - batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) - hyps = map_batch(batch_sents, vocabulary, num_processes, - True, 0) - elif args.mode in ('ctc_prefix_beam_search', "attention_rescoring"): - batch_log_probs_seq_list = beam_log_probs.tolist() - batch_log_probs_idx_list = beam_log_probs_idx.tolist() - batch_len_list = encoder_out_lens.tolist() - batch_log_probs_seq = [] - batch_log_probs_ids = [] - batch_start = [] # only effective in streaming deployment - batch_root = TrieVector() - root_dict = {} - for i in range(len(batch_len_list)): - num_sent = batch_len_list[i] - batch_log_probs_seq.append( - batch_log_probs_seq_list[i][0:num_sent]) - batch_log_probs_ids.append( - batch_log_probs_idx_list[i][0:num_sent]) - root_dict[i] = PathTrie() - batch_root.append(root_dict[i]) - batch_start.append(True) - score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq, - batch_log_probs_ids, - batch_root, - batch_start, - beam_size, - num_processes, - 0, -2, 0.99999) - if args.mode == 'ctc_prefix_beam_search': - hyps = [] - for cand_hyps in score_hyps: - hyps.append(cand_hyps[0][1]) - hyps = map_batch(hyps, vocabulary, num_processes, False, 0) - if args.mode == 'attention_rescoring': - ctc_score, all_hyps = [], [] - max_len = 0 - for hyps in score_hyps: - cur_len = len(hyps) - if len(hyps) < beam_size: - hyps += (beam_size - cur_len) * [(-float("INF"), (0,))] - cur_ctc_score = [] - for hyp in hyps: - cur_ctc_score.append(hyp[0]) - all_hyps.append(list(hyp[1])) - if len(hyp[1]) > max_len: - max_len = len(hyp[1]) - ctc_score.append(cur_ctc_score) - if args.fp16: - ctc_score = np.array(ctc_score, dtype=np.float16) - else: - ctc_score = np.array(ctc_score, dtype=np.float32) - hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - r_hyps_pad_sos_eos = np.ones( - (batch_size, beam_size, max_len + 2), dtype=np.int64) * IGNORE_ID - hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32) - k = 0 - for i in 
range(batch_size): - for j in range(beam_size): - cand = all_hyps[k] - l = len(cand) + 2 - hyps_pad_sos_eos[i][j][0:l] = [sos] + cand + [eos] - r_hyps_pad_sos_eos[i][j][0:l] = [sos] + cand[::-1] + [eos] - hyps_lens_sos[i][j] = len(cand) + 1 - k += 1 - decoder_ort_inputs = { - decoder_ort_session.get_inputs()[0].name: encoder_out, - decoder_ort_session.get_inputs()[1].name: encoder_out_lens, - decoder_ort_session.get_inputs()[2].name: hyps_pad_sos_eos, - decoder_ort_session.get_inputs()[3].name: hyps_lens_sos, - decoder_ort_session.get_inputs()[-1].name: ctc_score} - if reverse_weight > 0: - r_hyps_pad_sos_eos_name = decoder_ort_session.get_inputs()[4].name - decoder_ort_inputs[r_hyps_pad_sos_eos_name] = r_hyps_pad_sos_eos - best_index = decoder_ort_session.run(None, decoder_ort_inputs)[0] - best_sents = [] - k = 0 - for idx in best_index: - cur_best_sent = all_hyps[k: k + beam_size][idx] - best_sents.append(cur_best_sent) - k += beam_size - hyps = map_batch(best_sents, vocabulary, num_processes) - - for i, key in enumerate(keys): - content = hyps[i] - logging.info('{} {}'.format(key, content)) - fout.write('{} {}\n'.format(key, content)) - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/train.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/train.py deleted file mode 100644 index 70799b60790b31d73911770891f519f5473e2f4b..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/bin/train.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
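The attention-rescoring branch above packs the n-best CTC hypotheses into the fixed-shape decoder inputs `hyps_pad_sos_eos`, `r_hyps_pad_sos_eos`, and `hyps_lens_sos`. A condensed sketch of just that packing step is shown below, assuming wenet's `IGNORE_ID` of -1 and `sos == eos == vocab_size - 1` (in the real script these are read from the dict file):

```python
# Sketch of the hypothesis-packing step from the attention_rescoring path above.
import numpy as np

IGNORE_ID = -1  # assumed to match wenet.utils.common.IGNORE_ID


def pack_hyps(all_hyps, batch_size, beam_size, sos, eos):
    """Pack beam candidates (a flat list of batch_size * beam_size token lists)
    into the decoder.onnx inputs, mirroring the loop in the deleted script."""
    max_len = max(len(c) for c in all_hyps)
    hyps_pad_sos_eos = np.full((batch_size, beam_size, max_len + 2), IGNORE_ID, dtype=np.int64)
    r_hyps_pad_sos_eos = np.full_like(hyps_pad_sos_eos, IGNORE_ID)
    hyps_lens_sos = np.ones((batch_size, beam_size), dtype=np.int32)
    k = 0
    for i in range(batch_size):
        for j in range(beam_size):
            cand = list(all_hyps[k])
            l = len(cand) + 2
            hyps_pad_sos_eos[i, j, :l] = [sos] + cand + [eos]
            # Reversed token order feeds the right-to-left decoder when reverse_weight > 0.
            r_hyps_pad_sos_eos[i, j, :l] = [sos] + cand[::-1] + [eos]
            hyps_lens_sos[i, j] = len(cand) + 1
            k += 1
    return hyps_pad_sos_eos, r_hyps_pad_sos_eos, hyps_lens_sos


# Toy usage: one utterance with a beam of two candidate token sequences.
pads = pack_hyps([[12, 7, 7], [12, 7]], batch_size=1, beam_size=2, sos=4232, eos=4232)
```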
- -from __future__ import print_function - -import argparse -import copy -import logging -import os - -import torch -import torch.distributed as dist -import torch.optim as optim -import yaml -from tensorboardX import SummaryWriter -from torch.utils.data import DataLoader - -from wenet.dataset.dataset import Dataset -from wenet.utils.checkpoint import (load_checkpoint, save_checkpoint, - load_trained_modules) -from wenet.utils.executor import Executor -from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols -from wenet.utils.scheduler import WarmupLR, NoamHoldAnnealing -from wenet.utils.config import override_config -from wenet.utils.init_model import init_model - -def get_args(): - parser = argparse.ArgumentParser(description='training your network') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--data_type', - default='raw', - choices=['raw', 'shard'], - help='train and cv data type') - parser.add_argument('--train_data', required=True, help='train data file') - parser.add_argument('--cv_data', required=True, help='cv data file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this local rank, -1 for cpu') - parser.add_argument('--model_dir', required=True, help='save model dir') - parser.add_argument('--checkpoint', help='checkpoint model') - parser.add_argument('--tensorboard_dir', - default='tensorboard', - help='tensorboard log dir') - parser.add_argument('--ddp.rank', - dest='rank', - default=0, - type=int, - help='global rank for distributed training') - parser.add_argument('--ddp.world_size', - dest='world_size', - default=-1, - type=int, - help='''number of total processes/gpus for - distributed training''') - parser.add_argument('--ddp.dist_backend', - dest='dist_backend', - default='nccl', - choices=['nccl', 'gloo'], - help='distributed backend') - parser.add_argument('--ddp.init_method', - dest='init_method', - default=None, - help='ddp init method') - parser.add_argument('--num_workers', - default=0, - type=int, - help='num of subprocess workers for reading') - parser.add_argument('--pin_memory', - action='store_true', - default=False, - help='Use pinned memory buffers used for reading') - parser.add_argument('--use_amp', - action='store_true', - default=False, - help='Use automatic mixed precision training') - parser.add_argument('--fp16_grad_sync', - action='store_true', - default=False, - help='Use fp16 gradient sync for ddp') - parser.add_argument('--cmvn', default=None, help='global cmvn file') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument("--non_lang_syms", - help="non-linguistic symbol file. 
One symbol per line.") - parser.add_argument('--prefetch', - default=100, - type=int, - help='prefetch number') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') - parser.add_argument('--override_config', - action='append', - default=[], - help="override yaml config") - parser.add_argument("--enc_init", - default=None, - type=str, - help="Pre-trained model to initialize encoder") - parser.add_argument("--enc_init_mods", - default="encoder.", - type=lambda s: [str(mod) for mod in s.split(",") if s != ""], - help="List of encoder modules \ - to initialize ,separated by a comma") - - - args = parser.parse_args() - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Set random seed - torch.manual_seed(777) - with open(args.config, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - if len(args.override_config) > 0: - configs = override_config(configs, args.override_config) - - distributed = args.world_size > 1 - if distributed: - logging.info('training on multiple gpus, this gpu {}'.format(args.gpu)) - dist.init_process_group(args.dist_backend, - init_method=args.init_method, - world_size=args.world_size, - rank=args.rank) - - symbol_table = read_symbol_table(args.symbol_table) - - train_conf = configs['dataset_conf'] - cv_conf = copy.deepcopy(train_conf) - cv_conf['speed_perturb'] = False - cv_conf['spec_aug'] = False - cv_conf['spec_sub'] = False - cv_conf['spec_trim'] = False - cv_conf['shuffle'] = False - non_lang_syms = read_non_lang_symbols(args.non_lang_syms) - - train_dataset = Dataset(args.data_type, args.train_data, symbol_table, - train_conf, args.bpe_model, non_lang_syms, True) - cv_dataset = Dataset(args.data_type, - args.cv_data, - symbol_table, - cv_conf, - args.bpe_model, - non_lang_syms, - partition=False) - - train_data_loader = DataLoader(train_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - cv_data_loader = DataLoader(cv_dataset, - batch_size=None, - pin_memory=args.pin_memory, - num_workers=args.num_workers, - prefetch_factor=args.prefetch) - - if 'fbank_conf' in configs['dataset_conf']: - input_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins'] - else: - input_dim = configs['dataset_conf']['mfcc_conf']['num_mel_bins'] - vocab_size = len(symbol_table) - - # Save configs to model_dir/train.yaml for inference and export - configs['input_dim'] = input_dim - configs['output_dim'] = vocab_size - configs['cmvn_file'] = args.cmvn - configs['is_json_cmvn'] = True - if args.rank == 0: - saved_config_path = os.path.join(args.model_dir, 'train.yaml') - with open(saved_config_path, 'w') as fout: - data = yaml.dump(configs) - fout.write(data) - - # Init asr model from configs - model = init_model(configs) - print(model) - num_params = sum(p.numel() for p in model.parameters()) - print('the number of model params: {:,d}'.format(num_params)) - - # !!!IMPORTANT!!! 
- # Try to export the model by script, if fails, we should refine - # the code to satisfy the script export requirements - if args.rank == 0: - script_model = torch.jit.script(model) - script_model.save(os.path.join(args.model_dir, 'init.zip')) - executor = Executor() - # If specify checkpoint, load some info from checkpoint - if args.checkpoint is not None: - infos = load_checkpoint(model, args.checkpoint) - elif args.enc_init is not None: - logging.info('load pretrained encoders: {}'.format(args.enc_init)) - infos = load_trained_modules(model, args) - else: - infos = {} - start_epoch = infos.get('epoch', -1) + 1 - cv_loss = infos.get('cv_loss', 0.0) - step = infos.get('step', -1) - - num_epochs = configs.get('max_epoch', 100) - model_dir = args.model_dir - writer = None - if args.rank == 0: - os.makedirs(model_dir, exist_ok=True) - exp_id = os.path.basename(model_dir) - writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id)) - - if distributed: - assert (torch.cuda.is_available()) - # cuda model is required for nn.parallel.DistributedDataParallel - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, find_unused_parameters=True) - device = torch.device("cuda") - if args.fp16_grad_sync: - from torch.distributed.algorithms.ddp_comm_hooks import ( - default as comm_hooks, - ) - model.register_comm_hook( - state=None, hook=comm_hooks.fp16_compress_hook - ) - else: - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - model = model.to(device) - - if configs['optim'] == 'adam': - optimizer = optim.Adam(model.parameters(), **configs['optim_conf']) - elif configs['optim'] == 'adamw': - optimizer = optim.AdamW(model.parameters(), **configs['optim_conf']) - else: - raise ValueError("unknown optimizer: " + configs['optim']) - if configs['scheduler'] == 'warmuplr': - scheduler = WarmupLR(optimizer, **configs['scheduler_conf']) - elif configs['scheduler'] == 'NoamHoldAnnealing': - scheduler = NoamHoldAnnealing(optimizer, **configs['scheduler_conf']) - else: - raise ValueError("unknown scheduler: " + configs['scheduler']) - - final_epoch = None - configs['rank'] = args.rank - configs['is_distributed'] = distributed - configs['use_amp'] = args.use_amp - if start_epoch == 0 and args.rank == 0: - save_model_path = os.path.join(model_dir, 'init.pt') - save_checkpoint(model, save_model_path) - - # Start training loop - executor.step = step - scheduler.set_step(step) - # used for pytorch amp mixed precision training - scaler = None - if args.use_amp: - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(start_epoch, num_epochs): - train_dataset.set_epoch(epoch) - configs['epoch'] = epoch - lr = optimizer.param_groups[0]['lr'] - logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr)) - executor.train(model, optimizer, scheduler, train_data_loader, device, - writer, configs, scaler) - total_loss, num_seen_utts = executor.cv(model, cv_data_loader, device, - configs) - cv_loss = total_loss / num_seen_utts - - logging.info('Epoch {} CV info cv_loss {}'.format(epoch, cv_loss)) - if args.rank == 0: - save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch)) - save_checkpoint( - model, save_model_path, { - 'epoch': epoch, - 'lr': lr, - 'cv_loss': cv_loss, - 'step': executor.step - }) - writer.add_scalar('epoch/cv_loss', cv_loss, epoch) - writer.add_scalar('epoch/lr', lr, epoch) - final_epoch = epoch - - if final_epoch is not None and args.rank == 0: - final_model_path = os.path.join(model_dir, 'final.pt') 
- os.remove(final_model_path) if os.path.exists(final_model_path) else None - os.symlink('{}.pt'.format(final_epoch), final_model_path) - writer.close() - - -if __name__ == '__main__': - main() diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/dataset/dataset.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/dataset/dataset.py deleted file mode 100644 index 6d799b5b5aea2d34546484b3fed5d45e2d5b6aa6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/dataset/dataset.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import torch -import torch.distributed as dist -from torch.utils.data import IterableDataset - -import wenet.dataset.processor as processor -from wenet.utils.file_utils import read_lists - - -class Processor(IterableDataset): - def __init__(self, source, f, *args, **kw): - assert callable(f) - self.source = source - self.f = f - self.args = args - self.kw = kw - - def set_epoch(self, epoch): - self.source.set_epoch(epoch) - - def __iter__(self): - """ Return an iterator over the source dataset processed by the - given processor. - """ - assert self.source is not None - assert callable(self.f) - return self.f(iter(self.source), *self.args, **self.kw) - - def apply(self, f): - assert callable(f) - return Processor(self, f, *self.args, **self.kw) - - -class DistributedSampler: - def __init__(self, shuffle=True, partition=True): - self.epoch = -1 - self.update() - self.shuffle = shuffle - self.partition = partition - - def update(self): - assert dist.is_available() - if dist.is_initialized(): - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() - else: - self.rank = 0 - self.world_size = 1 - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: - self.worker_id = 0 - self.num_workers = 1 - else: - self.worker_id = worker_info.id - self.num_workers = worker_info.num_workers - return dict(rank=self.rank, - world_size=self.world_size, - worker_id=self.worker_id, - num_workers=self.num_workers) - - def set_epoch(self, epoch): - self.epoch = epoch - - def sample(self, data): - """ Sample data according to rank/world_size/num_workers - - Args: - data(List): input data list - - Returns: - List: data list after sample - """ - data = list(range(len(data))) - # TODO(Binbin Zhang): fix this - # We can not handle uneven data for CV on DDP, so we don't - # sample data by rank, that means every GPU gets the same - # and all the CV data - if self.partition: - if self.shuffle: - random.Random(self.epoch).shuffle(data) - data = data[self.rank::self.world_size] - data = data[self.worker_id::self.num_workers] - return data - - -class DataList(IterableDataset): - def __init__(self, lists, shuffle=True, partition=True): - self.lists = lists - self.sampler = DistributedSampler(shuffle, partition) - - def set_epoch(self, epoch): - self.sampler.set_epoch(epoch) - - def 
__iter__(self): - sampler_info = self.sampler.update() - indexes = self.sampler.sample(self.lists) - for index in indexes: - # yield dict(src=src) - data = dict(src=self.lists[index]) - data.update(sampler_info) - yield data - - -def Dataset(data_type, - data_list_file, - symbol_table, - conf, - bpe_model=None, - non_lang_syms=None, - partition=True): - """ Construct dataset from arguments - - We have two shuffle stage in the Dataset. The first is global - shuffle at shards tar/raw file level. The second is global shuffle - at training samples level. - - Args: - data_type(str): raw/shard - bpe_model(str): model for english bpe part - partition(bool): whether to do data partition in terms of rank - """ - assert data_type in ['raw', 'shard'] - lists = read_lists(data_list_file) - shuffle = conf.get('shuffle', True) - dataset = DataList(lists, shuffle=shuffle, partition=partition) - if data_type == 'shard': - dataset = Processor(dataset, processor.url_opener) - dataset = Processor(dataset, processor.tar_file_and_group) - else: - dataset = Processor(dataset, processor.parse_raw) - - dataset = Processor(dataset, processor.tokenize, symbol_table, bpe_model, - non_lang_syms, conf.get('split_with_space', False)) - filter_conf = conf.get('filter_conf', {}) - dataset = Processor(dataset, processor.filter, **filter_conf) - - resample_conf = conf.get('resample_conf', {}) - dataset = Processor(dataset, processor.resample, **resample_conf) - - speed_perturb = conf.get('speed_perturb', False) - if speed_perturb: - dataset = Processor(dataset, processor.speed_perturb) - - feats_type = conf.get('feats_type', 'fbank') - assert feats_type in ['fbank', 'mfcc'] - if feats_type == 'fbank': - fbank_conf = conf.get('fbank_conf', {}) - dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) - elif feats_type == 'mfcc': - mfcc_conf = conf.get('mfcc_conf', {}) - dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf) - - spec_aug = conf.get('spec_aug', True) - spec_sub = conf.get('spec_sub', False) - spec_trim = conf.get('spec_trim', False) - if spec_aug: - spec_aug_conf = conf.get('spec_aug_conf', {}) - dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) - if spec_sub: - spec_sub_conf = conf.get('spec_sub_conf', {}) - dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf) - if spec_trim: - spec_trim_conf = conf.get('spec_trim_conf', {}) - dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf) - - if shuffle: - shuffle_conf = conf.get('shuffle_conf', {}) - dataset = Processor(dataset, processor.shuffle, **shuffle_conf) - - sort = conf.get('sort', True) - if sort: - sort_conf = conf.get('sort_conf', {}) - dataset = Processor(dataset, processor.sort, **sort_conf) - - batch_conf = conf.get('batch_conf', {}) - dataset = Processor(dataset, processor.batch, **batch_conf) - dataset = Processor(dataset, processor.padding) - return dataset diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/dataset/kaldi_io.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/dataset/kaldi_io.py deleted file mode 100644 index c9bef293c93d882147bb5b738e1fc49a7a19a484..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/dataset/kaldi_io.py +++ /dev/null @@ -1,666 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, 
gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. - """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' 
% (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. - """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int_scp(file_or_fd): - """ generator(key,vec) = read_vec_int_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_int_scp(file): - ... 
- - Read scp to a 'dictionary': - d = { key:vec for key,mat in kaldi_io.read_vec_int_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_int(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! - # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. 
- - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. 
- - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! - if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Mapping for percentiles in col-headers, - def uint16_to_float(value, min, range): - return np.float32(min + range * 1.52590218966964e-05 * value) - - # Mapping for matrix elements, - def uint8_to_float_v2(vec, p0, p25, p75, p100): - # Split the vector by masks, - mask_0_64 = (vec <= 64); - mask_193_255 = (vec > 192); - mask_65_192 = (~(mask_0_64 | mask_193_255)); - # Sanity check (useful but slow...), - # assert(len(vec) == np.sum(np.hstack([mask_0_64,mask_65_192,mask_193_255]))) - # assert(len(vec) == np.sum(np.any([mask_0_64,mask_65_192,mask_193_255], axis=0))) - # Build the float vector, - ans = np.empty(len(vec), dtype='float32') - ans[mask_0_64] = p0 + (p25 - p0) / 64. * vec[mask_0_64] - ans[mask_65_192] = p25 + (p75 - p25) / 128. * (vec[mask_65_192] - 64) - ans[mask_193_255] = p75 + (p100 - p75) / 63. 
* (vec[mask_193_255] - 192) - return ans - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.empty((cols,rows), dtype='float32') - for i, col_header in enumerate(col_headers): - col_header_flt = [ uint16_to_float(percentile, globmin, globrange) for percentile in col_header ] - mat[i] = uint8_to_float_v2(data[i], *col_header_flt) - - return mat.T # transpose! col-major -> row-major, - -def write_ark_scp(key, mat, ark_fout, scp_out): - mat_offset = write_mat(ark_fout, mat, key) - scp_line = '{}\t{}:{}'.format(key, ark_fout.name, mat_offset) - scp_out.write(scp_line) - scp_out.write('\n') - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. - - Example of writing single matrix: - kaldi_io.write_mat(filename, mat) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,mat in dict.iteritems(): - kaldi_io.write_mat(f, mat, key=key) - """ - mat_offset = 0 - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - mat_offset = fd.tell() - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if m.dtype == 'float32': fd.write('FM '.encode()) - elif m.dtype == 'float64': fd.write('DM '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) - # Dims, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols - # Data, - fd.write(m.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - return mat_offset - -################################################# -# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) -# Corresponds to: vector > > -# - outer vector: time axis -# - inner vector: records at the time -# - tuple: int = index, float = value -# - -def read_cnet_ark(file_or_fd): - """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ - return read_post_ark(file_or_fd) - -def read_post_ark(file_or_fd): - """ generator(key,vec>) = read_post_ark(file) - Returns generator of (key,posterior) tuples, read from ark file. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Iterate the ark: - for key,post in kaldi_io.read_post_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - post = read_post(fd) - yield key, post - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_post(file_or_fd): - """ [post] = read_post(file_or_fd) - Reads single kaldi 'Posterior' in binary format. 
- - The 'Posterior' is C++ type 'vector > >', - the outer-vector is usually time axis, inner-vector are the records - at given time, and the tuple is composed of an 'index' (integer) - and a 'float-value'. The 'float-value' can represent a probability - or any other numeric value. - - Returns vector of vectors of tuples. - """ - fd = open_or_fd(file_or_fd) - ans=[] - binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - # Loop over 'outer-vector', - for i in range(outer_vec_size): - assert(fd.read(1).decode() == '\4'); # int-size - inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) - data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) - assert(data[0]['size_idx'] == 4) - assert(data[0]['size_post'] == 4) - ans.append(data[['idx','post']].tolist()) - - if fd is not file_or_fd: fd.close() - return ans - - -################################################# -# Kaldi Confusion Network bin begin/end times, -# (kaldi stores CNs time info separately from the Posterior). -# - -def read_cntime_ark(file_or_fd): - """ generator(key,vec>) = read_cntime_ark(file_or_fd) - Returns generator of (key,cntime) tuples, read from ark file. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Iterate the ark: - for key,time in kaldi_io.read_cntime_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:time for key,time in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - cntime = read_cntime(fd) - yield key, cntime - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_cntime(file_or_fd): - """ [cntime] = read_cntime(file_or_fd) - Reads single kaldi 'Confusion Network time info', in binary format: - C++ type: vector >. - (begin/end times of bins at the confusion network). - - Binary layout is ' ...' - - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Returns vector of tuples. 
- """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary - - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) - assert(data[0]['size_beg'] == 4) - assert(data[0]['size_end'] == 4) - ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), - - if fd is not file_or_fd : fd.close() - return ans - - -################################################# -# Segments related, -# - -# Segments as 'Bool vectors' can be handy, -# - for 'superposing' the segmentations, -# - for frame-selection in Speaker-ID experiments, -def read_segments_as_bool_vec(segments_file): - """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) - using kaldi 'segments' file for 1 wav, format : ' ' - - t-beg, t-end is in seconds, - - assumed 100 frames/second, - """ - segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) - # Sanity checks, - assert(len(segs) > 0) # empty segmentation is an error, - assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, - # Convert time to frame-indexes, - start = np.rint([100 * rec[2] for rec in segs]).astype(int) - end = np.rint([100 * rec[3] for rec in segs]).astype(int) - # Taken from 'read_lab_to_bool_vec', htk.py, - frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], - np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) - assert np.sum(end-start) == np.sum(frms) - return frms - diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/dataset/processor.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/dataset/processor.py deleted file mode 100644 index b4bd07ce674eb3288cd1b13a09085eec48d40845..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/dataset/processor.py +++ /dev/null @@ -1,660 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import json -import random -import re -import tarfile -from subprocess import PIPE, Popen -from urllib.parse import urlparse - -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from torch.nn.utils.rnn import pad_sequence - -AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - - -def url_opener(data): - """ Give url or local file, return file descriptor - Inplace operation. 
- - Args: - data(Iterable[str]): url or local file list - - Returns: - Iterable[{src, stream}] - """ - for sample in data: - assert 'src' in sample - # TODO(Binbin Zhang): support HTTP - url = sample['src'] - try: - pr = urlparse(url) - # local file - if pr.scheme == '' or pr.scheme == 'file': - stream = open(url, 'rb') - # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP - else: - cmd = f'wget -q -O - {url}' - process = Popen(cmd, shell=True, stdout=PIPE) - sample.update(process=process) - stream = process.stdout - sample.update(stream=stream) - yield sample - except Exception as ex: - logging.warning('Failed to open {}'.format(url)) - - -def tar_file_and_group(data): - """ Expand a stream of open tar files into a stream of tar file contents. - And groups the file with same prefix - - Args: - data: Iterable[{src, stream}] - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'stream' in sample - stream = tarfile.open(fileobj=sample['stream'], mode="r|*") - prev_prefix = None - example = {} - valid = True - for tarinfo in stream: - name = tarinfo.name - pos = name.rfind('.') - assert pos > 0 - prefix, postfix = name[:pos], name[pos + 1:] - if prev_prefix is not None and prefix != prev_prefix: - example['key'] = prev_prefix - if valid: - yield example - example = {} - valid = True - with stream.extractfile(tarinfo) as file_obj: - try: - if postfix == 'txt': - example['txt'] = file_obj.read().decode('utf8').strip() - elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = torchaudio.load(file_obj) - example['wav'] = waveform - example['sample_rate'] = sample_rate - else: - example[postfix] = file_obj.read() - except Exception as ex: - valid = False - logging.warning('error to parse {}'.format(name)) - prev_prefix = prefix - if prev_prefix is not None: - example['key'] = prev_prefix - yield example - stream.close() - if 'process' in sample: - sample['process'].communicate() - sample['stream'].close() - - -def parse_raw(data): - """ Parse key/wav/txt from json line - - Args: - data: Iterable[str], str is a json line has key/wav/txt - - Returns: - Iterable[{key, wav, txt, sample_rate}] - """ - for sample in data: - assert 'src' in sample - json_line = sample['src'] - obj = json.loads(json_line) - assert 'key' in obj - assert 'wav' in obj - assert 'txt' in obj - key = obj['key'] - wav_file = obj['wav'] - txt = obj['txt'] - try: - if 'start' in obj: - assert 'end' in obj - sample_rate = torchaudio.backend.sox_io_backend.info( - wav_file).sample_rate - start_frame = int(obj['start'] * sample_rate) - end_frame = int(obj['end'] * sample_rate) - waveform, _ = torchaudio.backend.sox_io_backend.load( - filepath=wav_file, - num_frames=end_frame - start_frame, - frame_offset=start_frame) - else: - waveform, sample_rate = torchaudio.load(wav_file) - example = dict(key=key, - txt=txt, - wav=waveform, - sample_rate=sample_rate) - yield example - except Exception as ex: - logging.warning('Failed to read {}'.format(wav_file)) - - -def filter(data, - max_length=10240, - min_length=10, - token_max_length=200, - token_min_length=1, - min_output_input_ratio=0.0005, - max_output_input_ratio=1): - """ Filter sample according to feature and label length - Inplace operation. 
- - Args:: - data: Iterable[{key, wav, label, sample_rate}] - max_length: drop utterance which is greater than max_length(10ms) - min_length: drop utterance which is less than min_length(10ms) - token_max_length: drop utterance which is greater than - token_max_length, especially when use char unit for - english modeling - token_min_length: drop utterance which is - less than token_max_length - min_output_input_ratio: minimal ration of - token_length / feats_length(10ms) - max_output_input_ratio: maximum ration of - token_length / feats_length(10ms) - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'label' in sample - # sample['wav'] is torch.Tensor, we have 100 frames every second - num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100 - if num_frames < min_length: - continue - if num_frames > max_length: - continue - if len(sample['label']) < token_min_length: - continue - if len(sample['label']) > token_max_length: - continue - if num_frames != 0: - if len(sample['label']) / num_frames < min_output_input_ratio: - continue - if len(sample['label']) / num_frames > max_output_input_ratio: - continue - yield sample - - -def resample(data, resample_rate=16000): - """ Resample data. - Inplace operation. - - Args: - data: Iterable[{key, wav, label, sample_rate}] - resample_rate: target resample rate - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - if sample_rate != resample_rate: - sample['sample_rate'] = resample_rate - sample['wav'] = torchaudio.transforms.Resample( - orig_freq=sample_rate, new_freq=resample_rate)(waveform) - yield sample - - -def speed_perturb(data, speeds=None): - """ Apply speed perturb to the data. - Inplace operation. 
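The `filter` stage above measures utterance length in 10 ms frames (`wav.size(1) / sample_rate * 100`) and compares the token/frame ratio against the thresholds listed in its docstring. A quick check of that arithmetic with illustrative numbers:

```python
# Illustrative values only: 2 s of 16 kHz audio and a hypothetical 10-token transcript.
import torch

wav = torch.zeros(1, 32000)
sample_rate = 16000
num_frames = wav.size(1) / sample_rate * 100   # 100 frames per second
label = [7] * 10

print(num_frames, len(label) / num_frames)     # 200.0 0.05 -> passes the default thresholds
```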
- - Args: - data: Iterable[{key, wav, label, sample_rate}] - speeds(List[float]): optional speed - - Returns: - Iterable[{key, wav, label, sample_rate}] - """ - if speeds is None: - speeds = [0.9, 1.0, 1.1] - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - speed = random.choice(speeds) - if speed != 1.0: - wav, _ = torchaudio.sox_effects.apply_effects_tensor( - waveform, sample_rate, - [['speed', str(speed)], ['rate', str(sample_rate)]]) - sample['wav'] = wav - - yield sample - - -def compute_fbank(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0): - """ Extract fbank - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.fbank(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - energy_floor=0.0, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def compute_mfcc(data, - num_mel_bins=23, - frame_length=25, - frame_shift=10, - dither=0.0, - num_ceps=40, - high_freq=0.0, - low_freq=20.0): - """ Extract mfcc - - Args: - data: Iterable[{key, wav, label, sample_rate}] - - Returns: - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'sample_rate' in sample - assert 'wav' in sample - assert 'key' in sample - assert 'label' in sample - sample_rate = sample['sample_rate'] - waveform = sample['wav'] - waveform = waveform * (1 << 15) - # Only keep key, feat, label - mat = kaldi.mfcc(waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - num_ceps=num_ceps, - high_freq=high_freq, - low_freq=low_freq, - sample_frequency=sample_rate) - yield dict(key=sample['key'], label=sample['label'], feat=mat) - - -def __tokenize_by_bpe_model(sp, txt): - tokens = [] - # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - pattern = re.compile(r'([\u4e00-\u9fff])') - # Example: - # txt = "你好 ITS'S OKAY 的" - # chars = ["你", "好", " ITS'S OKAY ", "的"] - chars = pattern.split(txt.upper()) - mix_chars = [w for w in chars if len(w.strip()) > 0] - for ch_or_w in mix_chars: - # ch_or_w is a single CJK charater(i.e., "你"), do nothing. - if pattern.fullmatch(ch_or_w) is not None: - tokens.append(ch_or_w) - # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), - # encode ch_or_w using bpe_model. 
- else: - for p in sp.encode_as_pieces(ch_or_w): - tokens.append(p) - - return tokens - - -def tokenize(data, - symbol_table, - bpe_model=None, - non_lang_syms=None, - split_with_space=False): - """ Decode text to chars or BPE - Inplace operation - - Args: - data: Iterable[{key, wav, txt, sample_rate}] - - Returns: - Iterable[{key, wav, txt, tokens, label, sample_rate}] - """ - if non_lang_syms is not None: - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - else: - non_lang_syms = {} - non_lang_syms_pattern = None - - if bpe_model is not None: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.load(bpe_model) - else: - sp = None - - for sample in data: - assert 'txt' in sample - txt = sample['txt'].strip() - if non_lang_syms_pattern is not None: - parts = non_lang_syms_pattern.split(txt.upper()) - parts = [w for w in parts if len(w.strip()) > 0] - else: - parts = [txt] - - label = [] - tokens = [] - for part in parts: - if part in non_lang_syms: - tokens.append(part) - else: - if bpe_model is not None: - tokens.extend(__tokenize_by_bpe_model(sp, part)) - else: - if split_with_space: - part = part.split(" ") - for ch in part: - if ch == ' ': - ch = "▁" - tokens.append(ch) - - for ch in tokens: - if ch in symbol_table: - label.append(symbol_table[ch]) - elif '' in symbol_table: - label.append(symbol_table['']) - - sample['tokens'] = tokens - sample['label'] = label - yield sample - - -def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80): - """ Do spec augmentation - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - num_t_mask: number of time mask to apply - num_f_mask: number of freq mask to apply - max_t: max width of time mask - max_f: max width of freq mask - max_w: max width of time warp - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - max_freq = y.size(1) - # time mask - for i in range(num_t_mask): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - y[start:end, :] = 0 - # freq mask - for i in range(num_f_mask): - start = random.randint(0, max_freq - 1) - length = random.randint(1, max_f) - end = min(max_freq, start + length) - y[:, start:end] = 0 - sample['feat'] = y - yield sample - - -def spec_sub(data, max_t=20, num_t_sub=3): - """ Do spec substitute - Inplace operation - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of time substitute - num_t_sub: number of time substitute to apply - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - y = x.clone().detach() - max_frames = y.size(0) - for i in range(num_t_sub): - start = random.randint(0, max_frames - 1) - length = random.randint(1, max_t) - end = min(max_frames, start + length) - # only substitute the earlier time chosen randomly for current time - pos = random.randint(0, start) - y[start:end, :] = x[start - pos:end - pos, :] - sample['feat'] = y - yield sample - - -def spec_trim(data, max_t=20): - """ Trim tailing frames. Inplace operation. 
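`spec_aug` above zeroes out random time and frequency stripes of the feature matrix (SpecAugment-style masking). A minimal standalone sketch of the same idea, with illustrative feature shape and mask widths:

```python
# Illustrative sketch of time/frequency masking on a (frames, mel bins) matrix.
import random
import torch

feat = torch.randn(200, 80)
y = feat.clone()

# one time mask (width <= 50 frames)
t0 = random.randint(0, y.size(0) - 1)
y[t0:min(y.size(0), t0 + random.randint(1, 50)), :] = 0

# one frequency mask (width <= 10 bins)
f0 = random.randint(0, y.size(1) - 1)
y[:, f0:min(y.size(1), f0 + random.randint(1, 10))] = 0

print((y == 0).any().item())   # True: some stripe was zeroed
```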
- ref: TrimTail [https://arxiv.org/abs/2211.00522] - - Args: - data: Iterable[{key, feat, label}] - max_t: max width of length trimming - - Returns - Iterable[{key, feat, label}] - """ - for sample in data: - assert 'feat' in sample - x = sample['feat'] - assert isinstance(x, torch.Tensor) - max_frames = x.size(0) - length = random.randint(1, max_t) - if length < max_frames / 2: - y = x.clone().detach()[:max_frames - length] - sample['feat'] = y - yield sample - - -def shuffle(data, shuffle_size=10000): - """ Local shuffle the data - - Args: - data: Iterable[{key, feat, label}] - shuffle_size: buffer size for shuffle - - Returns: - Iterable[{key, feat, label}] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= shuffle_size: - random.shuffle(buf) - for x in buf: - yield x - buf = [] - # The sample left over - random.shuffle(buf) - for x in buf: - yield x - - -def sort(data, sort_size=500): - """ Sort the data by feature length. - Sort is used after shuffle and before batch, so we can group - utts with similar lengths into a batch, and `sort_size` should - be less than `shuffle_size` - - Args: - data: Iterable[{key, feat, label}] - sort_size: buffer size for sort - - Returns: - Iterable[{key, feat, label}] - """ - - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= sort_size: - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - buf = [] - # The sample left over - buf.sort(key=lambda x: x['feat'].size(0)) - for x in buf: - yield x - - -def static_batch(data, batch_size=16): - """ Static batch the data by `batch_size` - - Args: - data: Iterable[{key, feat, label}] - batch_size: batch size - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - for sample in data: - buf.append(sample) - if len(buf) >= batch_size: - yield buf - buf = [] - if len(buf) > 0: - yield buf - - -def dynamic_batch(data, max_frames_in_batch=12000): - """ Dynamic batch the data until the total frames in batch - reach `max_frames_in_batch` - - Args: - data: Iterable[{key, feat, label}] - max_frames_in_batch: max_frames in one batch - - Returns: - Iterable[List[{key, feat, label}]] - """ - buf = [] - longest_frames = 0 - for sample in data: - assert 'feat' in sample - assert isinstance(sample['feat'], torch.Tensor) - new_sample_frames = sample['feat'].size(0) - longest_frames = max(longest_frames, new_sample_frames) - frames_after_padding = longest_frames * (len(buf) + 1) - if frames_after_padding > max_frames_in_batch: - yield buf - buf = [sample] - longest_frames = new_sample_frames - else: - buf.append(sample) - if len(buf) > 0: - yield buf - - -def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000): - """ Wrapper for static/dynamic batch - """ - if batch_type == 'static': - return static_batch(data, batch_size) - elif batch_type == 'dynamic': - return dynamic_batch(data, max_frames_in_batch) - else: - logging.fatal('Unsupported batch type {}'.format(batch_type)) - - -def padding(data): - """ Padding the data into training data - - Args: - data: Iterable[List[{key, feat, label}]] - - Returns: - Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] - """ - for sample in data: - assert isinstance(sample, list) - feats_length = torch.tensor([x['feat'].size(0) for x in sample], - dtype=torch.int32) - order = torch.argsort(feats_length, descending=True) - feats_lengths = torch.tensor( - [sample[i]['feat'].size(0) for i in order], dtype=torch.int32) - sorted_feats = [sample[i]['feat'] for i in order] - sorted_keys 
= [sample[i]['key'] for i in order] - sorted_labels = [ - torch.tensor(sample[i]['label'], dtype=torch.int64) for i in order - ] - label_lengths = torch.tensor([x.size(0) for x in sorted_labels], - dtype=torch.int32) - - padded_feats = pad_sequence(sorted_feats, - batch_first=True, - padding_value=0) - - pad = (0, 0, 0, 0) - seq_len= padded_feats.shape[1] - if seq_len < 384: - pad = (0, 0, 0, 384-seq_len) - elif seq_len < 512: - pad = (0, 0, 0, 512-seq_len) - elif seq_len < 640: - pad = (0, 0, 0, 640-seq_len) - elif seq_len < 768: - pad = (0, 0, 0, 768-seq_len) - elif seq_len < 896: - pad = (0, 0, 0, 896-seq_len) - elif seq_len < 1024: - pad = (0, 0, 0, 1024-seq_len) - elif seq_len < 1280: - pad = (0, 0, 0, 1280-seq_len) - padded_feats = torch.nn.functional.pad(padded_feats, pad, mode='constant', value=0) - padding_labels = pad_sequence(sorted_labels, - batch_first=True, - padding_value=-1) - - yield (sorted_keys, padded_feats, padding_labels, feats_lengths, - label_lengths) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/dataset/wav_distortion.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/dataset/wav_distortion.py deleted file mode 100644 index 2917d3cc6cfb801935cb0885d0c42cd88f1833b8..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/dataset/wav_distortion.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Chao Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import random -import math - -import torchaudio -import torch -torchaudio.set_audio_backend("sox_io") - - -def db2amp(db): - return pow(10, db / 20) - -def amp2db(amp): - return 20 * math.log10(amp) - -def make_poly_distortion(conf): - """Generate a db-domain ploynomial distortion function - - f(x) = a * x^m * (1-x)^n + x - - Args: - conf: a dict {'a': #int, 'm': #int, 'n': #int} - - Returns: - The ploynomial function, which could be applied on - a float amplitude value - """ - a = conf['a'] - m = conf['m'] - n = conf['n'] - - def poly_distortion(x): - abs_x = abs(x) - if abs_x < 0.000001: - x = x - else: - db_norm = amp2db(abs_x) / 100 + 1 - if db_norm < 0: - db_norm = 0 - db_norm = a * pow(db_norm, m) * pow((1 - db_norm), n) + db_norm - if db_norm > 1: - db_norm = 1 - db = (db_norm - 1) * 100 - amp = db2amp(db) - if amp >= 0.9997: - amp = 0.9997 - if x > 0: - x = amp - else: - x = -amp - return x - return poly_distortion - -def make_quad_distortion(): - return make_poly_distortion({'a' : 1, 'm' : 1, 'n' : 1}) - -# the amplitude are set to max for all non-zero point -def make_max_distortion(conf): - """Generate a max distortion function - - Args: - conf: a dict {'max_db': float } - 'max_db': the maxium value. 
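The `padding` collate function above pads each batch's time axis up to the next bucket in {384, 512, 640, 768, 896, 1024, 1280}, so the padded features only take a small set of sequence lengths. A plain-function restatement of that bucketing (`pad_to_bucket` is a hypothetical name):

```python
def pad_to_bucket(seq_len, buckets=(384, 512, 640, 768, 896, 1024, 1280)):
    # Mirrors the elif chain in padding(); illustrative helper, hypothetical name.
    for b in buckets:
        if seq_len < b:
            return b
    return seq_len            # at or beyond the largest bucket: no extra padding

print([pad_to_bucket(n) for n in (123, 384, 400, 1500)])   # [384, 512, 512, 1500]
```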
- - Returns: - The max function, which could be applied on - a float amplitude value - """ - max_db = conf['max_db'] - if max_db: - max_amp = db2amp(max_db) # < 0.997 - else: - max_amp = 0.997 - - def max_distortion(x): - if x > 0: - x = max_amp - elif x < 0: - x = -max_amp - else: - x = 0.0 - return x - return max_distortion - - - -def make_amp_mask(db_mask=None): - """Get a amplitude domain mask from db domain mask - - Args: - db_mask: Optional. A list of tuple. if None, using default value. - - Returns: - A list of tuple. The amplitude domain mask - """ - if db_mask is None: - db_mask = [(-110, -95), (-90, -80), (-65, -60), (-50, -30), (-15, 0)] - amp_mask = [(db2amp(db[0]), db2amp(db[1])) for db in db_mask] - return amp_mask - -default_mask = make_amp_mask() - - -def generate_amp_mask(mask_num): - """Generate amplitude domain mask randomly in [-100db, 0db] - - Args: - mask_num: the slot number of the mask - - Returns: - A list of tuple. each tuple defines a slot. - e.g. [(-100, -80), (-65, -60), (-50, -30), (-15, 0)] - for #mask_num = 4 - """ - a = [0] * 2 * mask_num - a[0] = 0 - m = [] - for i in range(1, 2 * mask_num): - a[i] = a[i - 1] + random.uniform(0.5, 1) - max_val = a[2 * mask_num - 1] - for i in range(0, mask_num): - l = ((a[2 * i] - max_val) / max_val) * 100 - r = ((a[2 * i + 1] - max_val) / max_val) * 100 - m.append((l, r)) - return make_amp_mask(m) - - -def make_fence_distortion(conf): - """Generate a fence distortion function - - In this fence-like shape function, the values in mask slots are - set to maxium, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': int,'max_db': float } - 'mask_number': the slot number in mask. - 'max_db': the maxium value. - - Returns: - The fence function, which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - max_db = conf['max_db'] - max_amp = db2amp(max_db) # 0.997 - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def fence_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return max_amp - if not is_in_mask: - return 0.0 - return x - - return fence_distortion - -# -def make_jag_distortion(conf): - """Generate a jag distortion function - - In this jag-like shape function, the values in mask slots are - not changed, while the values not in mask slots are set to 0. - Use seperated masks for Positive and negetive amplitude. - - Args: - conf: a dict {'mask_number': #int} - 'mask_number': the slot number in mask. 
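`db2amp`/`amp2db` above convert between the dB domain and linear amplitude with the standard 20·log10 relation, so -20 dB corresponds to an amplitude factor of 0.1 and the two functions invert each other. A quick sanity check:

```python
# Sanity check of the dB <-> amplitude mapping; illustrative only.
import math

def db2amp(db):
    return pow(10, db / 20)

def amp2db(amp):
    return 20 * math.log10(amp)

print(db2amp(-20))            # 0.1
print(amp2db(db2amp(-20)))    # -20.0 (up to floating-point rounding)
```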
- - Returns: - The jag function,which could be applied on - a float amplitude value - """ - mask_number = conf['mask_number'] - if mask_number <= 0 : - positive_mask = default_mask - negative_mask = make_amp_mask([(-50, 0)]) - else: - positive_mask = generate_amp_mask(mask_number) - negative_mask = generate_amp_mask(mask_number) - - def jag_distortion(x): - is_in_mask = False - if x > 0: - for mask in positive_mask: - if x >= mask[0] and x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - elif x < 0: - abs_x = abs(x) - for mask in negative_mask: - if abs_x >= mask[0] and abs_x <= mask[1]: - is_in_mask = True - return x - if not is_in_mask: - return 0.0 - return x - - return jag_distortion - -# gaining 20db means amp = amp * 10 -# gaining -20db means amp = amp / 10 -def make_gain_db(conf): - """Generate a db domain gain function - - Args: - conf: a dict {'db': #float} - 'db': the gaining value - - Returns: - The db gain function, which could be applied on - a float amplitude value - """ - db = conf['db'] - - def gain_db(x): - return min(0.997, x * pow(10, db / 20)) - - return gain_db - - -def distort(x, func, rate=0.8): - """Distort a waveform in sample point level - - Args: - x: the origin wavefrom - func: the distort function - rate: sample point-level distort probability - - Returns: - the distorted waveform - """ - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - x[0][i] = func(float(x[0][i])) - return x - -def distort_chain(x, funcs, rate=0.8): - for i in range(0, x.shape[1]): - a = random.uniform(0, 1) - if a < rate: - for func in funcs: - x[0][i] = func(float(x[0][i])) - return x - -# x is numpy -def distort_wav_conf(x, distort_type, distort_conf, rate=0.1): - if distort_type == 'gain_db': - gain_db = make_gain_db(distort_conf) - x = distort(x, gain_db) - elif distort_type == 'max_distortion': - max_distortion = make_max_distortion(distort_conf) - x = distort(x, max_distortion, rate=rate) - elif distort_type == 'fence_distortion': - fence_distortion = make_fence_distortion(distort_conf) - x = distort(x, fence_distortion, rate=rate) - elif distort_type == 'jag_distortion': - jag_distortion = make_jag_distortion(distort_conf) - x = distort(x, jag_distortion, rate=rate) - elif distort_type == 'poly_distortion': - poly_distortion = make_poly_distortion(distort_conf) - x = distort(x, poly_distortion, rate=rate) - elif distort_type == 'quad_distortion': - quad_distortion = make_quad_distortion() - x = distort(x, quad_distortion, rate=rate) - elif distort_type == 'none_distortion': - pass - else: - print('unsupport type') - return x - -def distort_wav_conf_and_save(distort_type, distort_conf, rate, wav_in, wav_out): - x, sr = torchaudio.load(wav_in) - x = x.detach().numpy() - out = distort_wav_conf(x, distort_type, distort_conf, rate) - torchaudio.save(wav_out, torch.from_numpy(out), sr) - -if __name__ == "__main__": - distort_type = sys.argv[1] - wav_in = sys.argv[2] - wav_out = sys.argv[3] - conf = None - rate = 0.1 - if distort_type == 'new_jag_distortion': - conf = {'mask_number' : 4} - elif distort_type == 'new_fence_distortion': - conf = {'mask_number' : 1, 'max_db' : -30} - elif distort_type == 'poly_distortion': - conf = {'a' : 4, 'm' : 2, "n" : 2} - distort_wav_conf_and_save(distort_type, conf, rate, wav_in, wav_out) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/efficient_conformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/efficient_conformer/attention.py deleted file 
mode 100644 index 475131b15af92ffcaf91ad5e2e30d114d4d1a2a3..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/efficient_conformer/attention.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple, Optional - -import torch -from torch import nn -import torch.nn.functional as F -from wenet.transformer.attention import MultiHeadedAttention - - -class GroupedRelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: - https://arxiv.org/abs/1901.02860 - https://arxiv.org/abs/2109.01163 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate, group_size=3): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - self.group_size = group_size - self.d_k = n_feat // n_head # for GroupedAttention - self.n_feat = n_feat - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k * self.group_size)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. 
- """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def pad4group(self, Q, K, V, P, mask, group_size: int = 3): - """ - q: (#batch, time1, size) -> (#batch, head, time1, size/head) - k,v: (#batch, time2, size) -> (#batch, head, time2, size/head) - p: (#batch, time2, size) - """ - # Compute Overflows - overflow_Q = Q.size(2) % group_size - overflow_KV = K.size(2) % group_size - - # if-else for ONNX export - # 0 // 0.00000000000000001 = 0 - # 1 // 1.00000000000000001 = 1 - padding_Q = (group_size - overflow_Q) * int( - overflow_Q // (overflow_Q + 0.00000000000000001)) - padding_KV = (group_size - overflow_KV) * int( - overflow_KV // (overflow_KV + 0.00000000000000001)) - - batch_size, _, seq_len_KV, _ = K.size() - - # Input Padding (B, T, D) -> (B, T + P, D) - Q = F.pad(Q, (0, 0, 0, padding_Q), value=0.0) - K = F.pad(K, (0, 0, 0, padding_KV), value=0.0) - V = F.pad(V, (0, 0, 0, padding_KV), value=0.0) - - if mask is not None and mask.size(2) > 0 : # time2 > 0: - mask = mask[:, ::group_size, ::group_size] - - Q = Q.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - K = K.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - V = V.transpose(1, 2).contiguous().view( - batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - # process pos_emb - P_batch_size = P.size(0) - overflow_P = P.size(1) % group_size - padding_P = group_size - overflow_P if overflow_P else 0 - P = F.pad(P, (0, 0, 0, padding_P), value=0.0) - P = P.view(P_batch_size, -1, self.h, self.d_k * group_size).transpose(1, 2) - - return Q, K, V, P, mask, padding_Q - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - padding_q: Optional[int] = None - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - padding_q : for GroupedAttention in efficent conformer - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. 
jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - - # n_feat!=h*d_k may be happened in GroupAttention - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.n_feat) - ) # (batch, time1, d_model) - if padding_q is not None: - # for GroupedAttention in efficent conformer - x = x[:, :x.size(1) - padding_q] - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q = self.linear_q(query) - k = self.linear_k(key) # (#batch, time2, size) - v = self.linear_v(value) - p = self.linear_pos(pos_emb) # (#batch, time2, size) - - batch_size, seq_len_KV, _ = k.size() # seq_len_KV = time2 - - # (#batch, time2, size) -> (#batch, head, time2, size/head) - q = q.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - k = k.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - v = v.view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - if cache.size(0) > 0: - # use attention cache - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - new_cache = torch.cat((k, v), dim=-1) - - # May be k and p does not match. eg. time2=18+18/2=27 > mask=36/2=18 - if mask is not None and mask.size(2) > 0: - time2 = mask.size(2) - k = k[:, :, -time2:, :] - v = v[:, :, -time2:, :] - - # q k v p: (batch, head, time1, d_k) - q, k, v, p, mask, padding_q = self.pad4group(q, k, v, p, mask, self.group_size) - - # q_with_bias_u & q_with_bias_v = (batch, head, time1, d_k) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. 
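`pad4group` above pads the query/key/value time axes up to a multiple of `group_size` with a branch-free expression (the tiny epsilon stands in for an `if`/`else` that would complicate ONNX export). The padding amount it computes, written out as a standalone function with illustrative lengths:

```python
def grouped_padding(seq_len, group_size=3):
    # Same branch-free expression as pad4group: 0 when seq_len is already a
    # multiple of group_size, otherwise the amount needed to reach the next one.
    overflow = seq_len % group_size
    return (group_size - overflow) * int(overflow // (overflow + 1e-17))

for seq_len in (9, 10, 11, 12):
    print(seq_len, grouped_padding(seq_len))   # 9->0, 10->2, 11->1, 12->0
```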
- # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k * self.group_size) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask, padding_q), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/efficient_conformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/efficient_conformer/convolution.py deleted file mode 100644 index 52d6c1c14c0812ab3957a60a135f644833c2ad95..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/efficient_conformer/convolution.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - stride: int = 1): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - stride (int): Stride Convolution, for efficient Conformer - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=stride, # for depthwise_conv in StrideConv - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - self.stride = stride - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. 
- Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - # When export ONNX,the first cache is not None but all-zero, - # cause shape error in residual block, - # eg. cache14 + x9 = 23, 23-7+1=17 != 9 - cache = cache[:, :, -self.lorder:] - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is requried, - # However, for JIT export, here we just fake one tensor instead of - # None. - new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - if mask_pad.size(2) != x.size(2): - mask_pad = mask_pad[:, :, ::self.stride] - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/efficient_conformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/efficient_conformer/encoder.py deleted file mode 100644 index dbd37f53cac86be851e2bb194354fd07eb271f11..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/efficient_conformer/encoder.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
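For the causal case, the `ConvolutionModule` above left-pads the input with `lorder = kernel_size - 1` frames (or that many cached frames when streaming), so the depthwise convolution preserves the time dimension. A minimal shape check with illustrative channel and length values:

```python
# Illustrative shape check of causal left-padding for a depthwise Conv1d.
import torch
import torch.nn as nn

kernel_size, channels = 15, 8
lorder = kernel_size - 1
conv = nn.Conv1d(channels, channels, kernel_size, groups=channels, padding=0)

x = torch.randn(2, channels, 100)                        # (batch, channels, time)
x_padded = nn.functional.pad(x, (lorder, 0), 'constant', 0.0)
print(conv(x_padded).shape)                              # torch.Size([2, 8, 100])
```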
-# Modified from EfficientConformer(https://github.com/burchim/EfficientConformer) -# Paper(https://arxiv.org/abs/2109.01163) - -"""Encoder definition.""" -from typing import Tuple, Optional, List, Union - -import torch -import logging -from typeguard import check_argument_types -import torch.nn.functional as F - -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.encoder_layer import ConformerEncoderLayer - -from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 -from wenet.efficient_conformer.convolution import ConvolutionModule -from wenet.efficient_conformer.attention import GroupedRelPositionMultiHeadedAttention -from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer - -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class EfficientConformerEncoder(torch.nn.Module): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - macaron_style: bool = True, - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - stride_layer_idx: Optional[Union[int, List[int]]] = 3, - stride: Optional[Union[int, List[int]]] = 2, - group_layer_idx: Optional[Union[int, List[int], tuple]] = (0, 1, 2, 3), - group_size: int = 3, - stride_kernel: bool = True, - **kwargs - ): - """Construct Efficient Conformer Encoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - macaron_style (bool): Whether to use macaron style for - positionwise layer. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. - stride_layer_idx (list): layer id with StrideConv, start from 0 - stride (list): stride size of each StrideConv in efficient conformer - group_layer_idx (list): layer id with GroupedAttention, start from 0 - group_size (int): group size of every GroupedAttention layer - stride_kernel (bool): default True. True: recompute cnn kernels with stride. 
- """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d2": - subsampling_class = Conv2dSubsampling2 - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - logging.info(f"input_layer = {input_layer}, " - f"subsampling_class = {subsampling_class}") - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - self.input_layer = input_layer - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - activation = get_activation(activation_type) - self.num_blocks = num_blocks - self.attention_heads = attention_heads - self.cnn_module_kernel = cnn_module_kernel - self.global_chunk_size = 0 - self.chunk_feature_map = 0 - - # efficient conformer configs - self.stride_layer_idx = [stride_layer_idx] \ - if type(stride_layer_idx) == int else stride_layer_idx - self.stride = [stride] \ - if type(stride) == int else stride - self.group_layer_idx = [group_layer_idx] \ - if type(group_layer_idx) == int else group_layer_idx - self.grouped_size = group_size # group size of every GroupedAttention layer - - assert len(self.stride) == len(self.stride_layer_idx) - self.cnn_module_kernels = [cnn_module_kernel] # kernel size of each StridedConv - for i in self.stride: - if stride_kernel: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1] // i) - else: - self.cnn_module_kernels.append(self.cnn_module_kernels[-1]) - - logging.info(f"stride_layer_idx= {self.stride_layer_idx}, " - f"stride = {self.stride}, " - f"cnn_module_kernel = {self.cnn_module_kernels}, " - f"group_layer_idx = {self.group_layer_idx}, " - f"grouped_size = {self.grouped_size}") - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - - # encoder definition - index = 0 - layers = [] - for i in range(num_blocks): - # self-attention module definition - if i in self.group_layer_idx: - encoder_selfattn_layer = GroupedRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - self.grouped_size) - else: - if pos_enc_layer_type == "no_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate) - - # conformer module definition - if i in self.stride_layer_idx: - # conformer block with downsampling - convolution_layer_args_stride = ( - output_size, 
self.cnn_module_kernels[index], activation, - cnn_module_norm, causal, True, self.stride[index]) - layers.append(StrideConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_stride) if use_cnn_module else None, - torch.nn.AvgPool1d( - kernel_size=self.stride[index], stride=self.stride[index], - padding=0, ceil_mode=True, - count_include_pad=False), # pointwise_conv_layer - dropout_rate, - normalize_before, - concat_after, - )) - index = index + 1 - else: - # conformer block - convolution_layer_args_normal = ( - output_size, self.cnn_module_kernels[index], activation, - cnn_module_norm, causal) - layers.append(ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args_normal) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - )) - - self.encoders = torch.nn.ModuleList(layers) - - def set_global_chunk_size(self, chunk_size): - """Used in ONNX export. - """ - logging.info(f"set global chunk size: {chunk_size}, default is 0.") - self.global_chunk_size = chunk_size - if self.embed.subsampling_rate == 2: - self.chunk_feature_map = 2 * self.global_chunk_size + 1 - elif self.embed.subsampling_rate == 6: - self.chunk_feature_map = 6 * self.global_chunk_size + 5 - elif self.embed.subsampling_rate == 8: - self.chunk_feature_map = 8 * self.global_chunk_size + 7 - else: - self.chunk_feature_map = 4 * self.global_chunk_size + 3 - - def output_size(self) -> int: - return self._output_size - - def calculate_downsampling_factor(self, i: int) -> int: - factor = 1 - for idx, stride_idx in enumerate(self.stride_layer_idx): - if i > stride_idx: - factor *= self.stride[idx] - return factor - - def forward(self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - index = 0 # traverse stride - for i, layer in enumerate(self.encoders): - # layer return : x, mask, new_att_cache, new_cnn_cache - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if i in self.stride_layer_idx: - masks = masks[:, :, ::self.stride[index]] - chunk_masks = chunk_masks[:, ::self.stride[index], - ::self.stride[index]] - mask_pad = masks - pos_emb = pos_emb[:, ::self.stride[index], :] - index = index + 1 - - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. 
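The stride handling in `forward` above thins the padding mask and the positional embedding with the same step as the strided conformer blocks. A minimal standalone sketch of that slicing pattern, with placeholder batch/time/feature sizes:

```python
import torch

# Placeholder sizes for illustration; `stride` matches the StrideConv step.
B, T, D, stride = 2, 16, 8, 2

masks = torch.ones(B, 1, T, dtype=torch.bool)   # padding mask (B, 1, T)
pos_emb = torch.randn(1, T, D)                  # positional embedding (1, T, D)

# After a strided block the time axis shrinks by `stride`, so the mask and
# positional embedding are subsampled with the same step.
masks = masks[:, :, ::stride]                   # (B, 1, T // stride)
pos_emb = pos_emb[:, ::stride, :]               # (1, T // stride, D)

print(masks.shape, pos_emb.shape)               # (2, 1, 8) and (1, 8, 8)
```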
- cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - att_mask : mask matrix of self attention - - Returns: - torch.Tensor: output of current input xs - torch.Tensor: subsampling cache required for next chunk computation - List[torch.Tensor]: encoder layers output cache required for next - chunk computation - List[torch.Tensor]: conformer cnn cache - - """ - assert xs.size(0) == 1 - - # using downsampling factor to recover offset - offset *= self.calculate_downsampling_factor(self.num_blocks + 1) - - chunk_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - chunk_masks = chunk_masks.unsqueeze(1) # (1, 1, xs-time) - - real_len = 0 - if self.global_chunk_size > 0: - # for ONNX decode simulation, padding xs to chunk_size - real_len = xs.size(1) - pad_len = self.chunk_feature_map - real_len - xs = F.pad(xs, (0, 0, 0, pad_len), value=0.0) - chunk_masks = F.pad(chunk_masks, (0, pad_len), value=0.0) - - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, chunk_masks = self.embed(xs, chunk_masks, offset) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - # shape(pos_emb) = (b=1, chunk_size, emb_size=output_size=hidden-dim) - - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) # batchPad (b=1, 1, time=chunk_size) - - if self.global_chunk_size > 0: - # for ONNX decode simulation - pos_emb = self.embed.position_encoding( - offset=max(offset - cache_t1, 0), - size=cache_t1 + self.global_chunk_size) - att_mask[:, :, -self.global_chunk_size:] = chunk_masks - mask_pad = chunk_masks.to(torch.bool) - else: - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - - max_att_len, max_cnn_len = 0, 0 # for repeat_interleave of new_att_cache - for i, layer in enumerate(self.encoders): - factor = self.calculate_downsampling_factor(i) - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] - att_cache_trunc = 0 - if xs.size(1) + att_cache.size(2) / factor > pos_emb.size(1): - # The time step is not divisible by the downsampling multiple - att_cache_trunc = xs.size(1) + \ - att_cache.size(2) // factor - pos_emb.size(1) + 1 - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - mask_pad=mask_pad, - att_cache=att_cache[i:i + 1, :, ::factor, :][:, :, att_cache_trunc:, :], - cnn_cache=cnn_cache[i, :, :, :] - if cnn_cache.size(0) > 0 else cnn_cache - ) - - if i in self.stride_layer_idx: - # compute time dimension for next block - efficient_index = self.stride_layer_idx.index(i) - att_mask = att_mask[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - mask_pad = mask_pad[:, ::self.stride[efficient_index], - ::self.stride[efficient_index]] - pos_emb = pos_emb[:, ::self.stride[efficient_index], :] - - # 
shape(new_att_cache) = [batch, head, time2, outdim] - new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] - # shape(new_cnn_cache) = [1, batch, outdim, cache_t2] - new_cnn_cache = new_cnn_cache.unsqueeze(0) - - # use repeat_interleave to new_att_cache - new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) - # padding new_cnn_cache to cnn.lorder for casual convolution - new_cnn_cache = F.pad( - new_cnn_cache, - (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0)) - - if i == 0: - # record length for the first block as max length - max_att_len = new_att_cache.size(2) - max_cnn_len = new_cnn_cache.size(3) - - # update real shape of att_cache and cnn_cache - r_att_cache.append(new_att_cache[:, :, -max_att_len:, :]) - r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) - - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.global_chunk_size > 0 and real_len: - chunk_real_len = real_len // self.embed.subsampling_rate // \ - self.calculate_downsampling_factor(self.num_blocks + 1) - # Keeping 1 more timestep can mitigate information leakage - # from the encoder caused by the padding - xs = xs[:, :chunk_real_len + 1, :] - - return xs, r_att_cache, r_cnn_cache - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - use_onnx=False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. - Args: - xs (torch.Tensor): (1, max_len, dim) - decoding_chunk_size (int): decoding chunk size - num_decoding_left_chunks (int): - use_onnx (bool): True for simulating ONNX model inference. 
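A standalone restatement of the cumulative downsampling factor used above for cache slicing and `repeat_interleave`, assuming the constructor defaults shown earlier (`stride_layer_idx=3`, `stride=2`, six blocks); the values are examples only:

```python
# Mirrors calculate_downsampling_factor: layers after a strided block see a
# time axis reduced by the product of all preceding strides.
def downsampling_factor(layer_idx, stride_layer_idx=(3,), stride=(2,)):
    factor = 1
    for idx, s_idx in enumerate(stride_layer_idx):
        if layer_idx > s_idx:
            factor *= stride[idx]
    return factor

print([downsampling_factor(i) for i in range(6)])  # [1, 1, 1, 1, 2, 2]
```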
- """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - if use_onnx: - logging.info("Simulating for ONNX runtime ...") - att_cache: torch.Tensor = torch.zeros( - (self.num_blocks, self.attention_heads, required_cache_size, - self.output_size() // self.attention_heads * 2), - device=xs.device) - cnn_cache: torch.Tensor = torch.zeros( - (self.num_blocks, 1, self.output_size(), self.cnn_module_kernel - 1), - device=xs.device) - self.set_global_chunk_size(chunk_size=decoding_chunk_size) - else: - logging.info("Simulating for JIT runtime ...") - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - logging.info(f"-->> frame chunk msg: cur={cur}, " - f"end={end}, num_frames={end-cur}, " - f"decoding_window={decoding_window}") - if use_onnx: - att_mask: torch.Tensor = torch.ones( - (1, 1, required_cache_size + decoding_chunk_size), - dtype=torch.bool, device=xs.device) - if cur == 0: - att_mask[:, :, :required_cache_size] = 0 - else: - att_mask: torch.Tensor = torch.ones( - (0, 0, 0), dtype=torch.bool, device=xs.device) - - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache, att_mask) - outputs.append(y) - offset += y.size(1) - - ys = torch.cat(outputs, 1) - masks = torch.ones(1, 1, ys.size(1), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/efficient_conformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/efficient_conformer/encoder_layer.py deleted file mode 100644 index 3a88ec9fca9797664ce89566e6c1d28a8f0ad5f4..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/efficient_conformer/encoder_layer.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple -import torch -from torch import nn - - -class StrideConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. 
- `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - pointwise_conv_layer: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.pointwise_conv_layer = pointwise_conv_layer - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.tensor([0.0], dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - - # add pointwise_conv for efficient conformer - # pointwise_conv_layer does not change shape - if self.pointwise_conv_layer is not None: - residual = residual.transpose(1, 2) - residual = self.pointwise_conv_layer(residual) - residual = residual.transpose(1, 2) - assert residual.size(0) == x.size(0) - assert residual.size(1) == x.size(1) - assert residual.size(2) == x.size(2) - - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/efficient_conformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/efficient_conformer/subsampling.py deleted file mode 100644 index 98b2c2228eac8e77586110686c48a7b0141458c9..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/efficient_conformer/subsampling.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 58.com(Wuba) Inc AI Lab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch -from wenet.transformer.subsampling import BaseSubsampling - - -class Conv2dSubsampling2(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
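The macaron-style branch at the top of the layer `forward` above adds a half-weighted feed-forward block before self-attention. A toy pre-norm sketch with placeholder modules (sizes and the activation are illustrative, not the wenet classes):

```python
import torch
import torch.nn as nn

size, ff_scale = 8, 0.5                      # ff_scale is 0.5 when macaron style is used
ffn = nn.Sequential(nn.Linear(size, 4 * size), nn.SiLU(), nn.Linear(4 * size, size))
norm = nn.LayerNorm(size)
dropout = nn.Dropout(0.1)

x = torch.randn(2, 10, size)
residual = x
x = residual + ff_scale * dropout(ffn(norm(x)))  # pre-norm, half-step FFN residual
print(x.shape)                                   # torch.Size([2, 10, 8])
```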
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU() - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * ((idim - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 2 - # 2 = (3 - 1) * 1 - self.right_context = 2 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 2. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 2. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, :-2:2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/squeezeformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/squeezeformer/attention.py deleted file mode 100644 index 97412badbe8e2c5caec81c0636d15be3f80d6b84..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/squeezeformer/attention.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# 2022 Ximalaya Inc. (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -import torch -import torch.nn as nn -from wenet.transformer.attention import MultiHeadedAttention -from typing import Tuple - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
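A quick shape check for the `Conv2dSubsampling2` module above: despite the "1/4 length" wording in its docstring, the single stride-2 convolution halves the time axis, and the trimmed mask `x_mask[:, :, :-2:2]` ends up with the same length. The feature sizes below are placeholders:

```python
import torch

idim, odim, B, T = 80, 256, 2, 16              # example sizes only
conv = torch.nn.Sequential(torch.nn.Conv2d(1, odim, 3, 2), torch.nn.ReLU())
out = torch.nn.Linear(odim * ((idim - 1) // 2), odim)

x = torch.randn(B, T, idim).unsqueeze(1)       # (B, 1, T, idim)
x = conv(x)                                    # (B, odim, T', F')
b, c, t, f = x.size()
x = out(x.transpose(1, 2).contiguous().view(b, t, c * f))

mask = torch.ones(B, 1, T, dtype=torch.bool)
print(x.shape[1], mask[:, :, :-2:2].shape[-1])  # both equal (T - 1) // 2 == 7
```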
- """ - - def __init__(self, n_head, n_feat, dropout_rate, - do_rel_shift=False, adaptive_scale=False, init_weights=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.do_rel_shift = do_rel_shift - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - self.adaptive_scale = adaptive_scale - self.ada_scale = nn.Parameter( - torch.ones([1, 1, n_feat]), requires_grad=adaptive_scale) - self.ada_bias = nn.Parameter( - torch.zeros([1, 1, n_feat]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - input_max = (self.h * self.d_k) ** -0.5 - torch.nn.init.uniform_(self.linear_q.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_q.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_k.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_v.bias, -input_max, input_max) - torch.nn.init.uniform_(self.linear_pos.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.weight, -input_max, input_max) - torch.nn.init.uniform_(self.linear_out.bias, -input_max, input_max) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). - zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. 
pytorch training - if mask.size(2) > 0: # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - # (batch, head, time1, time2) - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - if self.adaptive_scale: - query = self.ada_scale * query + self.ada_bias - key = self.ada_scale * key + self.ada_bias - value = self.ada_scale * value + self.ada_bias - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - if self.do_rel_shift: - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/squeezeformer/conv2d.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/squeezeformer/conv2d.py deleted file mode 100644 index c230263396392d72f36c56d645338f2d576db898..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/squeezeformer/conv2d.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Conv2d Module with Valid Padding""" - -import torch.nn.functional as F -from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional - - -class Conv2dValid(_ConvNd): - """ - Conv2d operator for VALID mode padding. 
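The score computation at the end of the attention `forward` above combines a content term and a position term, each with its own learned bias, following Transformer-XL. A toy restatement with random tensors (all sizes are arbitrary examples):

```python
import math
import torch

batch, head, t1, t2, d_k = 1, 2, 4, 4, 8
q = torch.randn(batch, t1, head, d_k)          # query, (B, time1, head, d_k)
k = torch.randn(batch, head, t2, d_k)          # projected keys
p = torch.randn(batch, head, t2, d_k)          # projected positional embeddings
pos_bias_u = torch.randn(head, d_k)            # content bias
pos_bias_v = torch.randn(head, d_k)            # position bias

q_u = (q + pos_bias_u).transpose(1, 2)         # (B, head, time1, d_k)
q_v = (q + pos_bias_v).transpose(1, 2)

matrix_ac = torch.matmul(q_u, k.transpose(-2, -1))   # content term
matrix_bd = torch.matmul(q_v, p.transpose(-2, -1))   # position term
scores = (matrix_ac + matrix_bd) / math.sqrt(d_k)
print(scores.shape)                            # (1, 2, 4, 4) = (B, head, time1, time2)
```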
- """ - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', # TODO: refine this type - device=None, - dtype=None, - valid_trigx: bool = False, - valid_trigy: bool = False - ) -> None: - factory_kwargs = {'device': device, 'dtype': dtype} - kernel_size_ = _pair(kernel_size) - stride_ = _pair(stride) - padding_ = padding if isinstance(padding, str) else _pair(padding) - dilation_ = _pair(dilation) - super(Conv2dValid, self).__init__( - in_channels, out_channels, kernel_size_, - stride_, padding_, dilation_, False, _pair(0), - groups, bias, padding_mode, **factory_kwargs) - self.valid_trigx = valid_trigx - self.valid_trigy = valid_trigy - - def _conv_forward( - self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): - validx, validy = 0, 0 - if self.valid_trigx: - validx = (input.size(-2) * (self.stride[-2] - 1) - 1 - + self.kernel_size[-2]) // 2 - if self.valid_trigy: - validy = (input.size(-1) * (self.stride[-1] - 1) - 1 - + self.kernel_size[-1]) // 2 - return F.conv2d(input, weight, bias, self.stride, - (validx, validy), self.dilation, self.groups) - - def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight, self.bias) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/squeezeformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/squeezeformer/convolution.py deleted file mode 100644 index 6da2ee8c98ed58fae66d66c892041037f0d6bc3a..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/squeezeformer/convolution.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. 
- causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - self.bias = bias - self.channels = channels - self.kernel_size = kernel_size - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, channels]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, channels]), requires_grad=adaptive_scale) - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - if init_weights: - self.init_weights() - - def init_weights(self): - pw_max = self.channels ** -0.5 - dw_max = self.kernel_size ** -0.5 - torch.nn.init.uniform_(self.pointwise_conv1.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv1.bias.data, -pw_max, pw_max) - torch.nn.init.uniform_(self.depthwise_conv.weight.data, -dw_max, dw_max) - if self.bias: - torch.nn.init.uniform_(self.depthwise_conv.bias.data, -dw_max, dw_max) - torch.nn.init.uniform_(self.pointwise_conv2.weight.data, -pw_max, pw_max) - if self.bias: - torch.nn.init.uniform_(self.pointwise_conv2.bias.data, -pw_max, pw_max) - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). - """ - if self.adaptive_scale: - x = self.ada_scale * x + self.ada_bias - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. 
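The `lorder` logic above switches between left-only padding (causal) and symmetric padding for the depthwise convolution; both keep the time length unchanged. A small sketch with placeholder sizes and the default `kernel_size=15`:

```python
import torch
import torch.nn as nn

kernel_size, channels, T = 15, 4, 20           # example sizes
x = torch.randn(1, channels, T)

# Causal: no built-in padding; pad (kernel_size - 1) frames on the left only.
lorder = kernel_size - 1
causal_conv = nn.Conv1d(channels, channels, kernel_size, padding=0, groups=channels)
y_causal = causal_conv(nn.functional.pad(x, (lorder, 0)))

# Symmetric: pad (kernel_size - 1) // 2 on both sides inside the conv.
sym_conv = nn.Conv1d(channels, channels, kernel_size,
                     padding=(kernel_size - 1) // 2, groups=channels)
y_sym = sym_conv(x)

print(y_causal.shape, y_sym.shape)             # both (1, 4, 20): time length preserved
```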
- new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/squeezeformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/squeezeformer/encoder.py deleted file mode 100644 index f13038321ae6c07d484a617aee7d83ed07742510..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/squeezeformer/encoder.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -import torch -import torch.nn as nn -from typing import Tuple, Union, Optional, List -from wenet.squeezeformer.subsampling \ - import DepthwiseConv2dSubsampling4, TimeReductionLayer1D, \ - TimeReductionLayer2D, TimeReductionLayerStream -from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.attention import MultiHeadedAttention -from wenet.squeezeformer.attention import RelPositionMultiHeadedAttention -from wenet.squeezeformer.positionwise_feed_forward \ - import PositionwiseFeedForward -from wenet.squeezeformer.convolution import ConvolutionModule -from wenet.utils.mask import make_pad_mask, add_optional_chunk_mask -from wenet.utils.common import get_activation - - -class SqueezeformerEncoder(nn.Module): - def __init__( - self, - input_size: int = 80, - encoder_dim: int = 256, - output_size: int = 256, - attention_heads: int = 4, - num_blocks: int = 12, - reduce_idx: Optional[Union[int, List[int]]] = 5, - recover_idx: Optional[Union[int, List[int]]] = 11, - feed_forward_expansion_factor: int = 4, - dw_stride: bool = False, - input_dropout_rate: float = 0.1, - pos_enc_layer_type: str = "rel_pos", - time_reduction_layer_type: str = "conv1d", - do_rel_shift: bool = True, - feed_forward_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.1, - cnn_module_kernel: int = 31, - cnn_norm_type: str = "batch_norm", - dropout: float = 0.1, - causal: bool = False, - adaptive_scale: bool = True, - activation_type: str = "swish", - init_weights: bool = True, - global_cmvn: torch.nn.Module = None, - normalize_before: bool = False, - use_dynamic_chunk: bool = False, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_left_chunk: bool = 
False - ): - """Construct SqueezeformerEncoder - - Args: - input_size to use_dynamic_chunk, see in Transformer BaseEncoder. - encoder_dim (int): The hidden dimension of encoder layer. - output_size (int): The output dimension of final projection layer. - attention_heads (int): Num of attention head in attention module. - num_blocks (int): Num of encoder layers. - reduce_idx Optional[Union[int, List[int]]]: - reduce layer index, from 40ms to 80ms per frame. - recover_idx Optional[Union[int, List[int]]]: - recover layer index, from 80ms to 40ms per frame. - feed_forward_expansion_factor (int): Enlarge coefficient of FFN. - dw_stride (bool): Whether do depthwise convolution - on subsampling module. - input_dropout_rate (float): Dropout rate of input projection layer. - pos_enc_layer_type (str): Self attention type. - time_reduction_layer_type (str): Conv1d or Conv2d reduction layer. - do_rel_shift (bool): Whether to do relative shift - operation on rel-attention module. - cnn_module_kernel (int): Kernel size of CNN module. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - adaptive_scale (bool): Whether to use adaptive scale. - init_weights (bool): Whether to initialize weights. - causal (bool): whether to use causal convolution or not. - """ - super(SqueezeformerEncoder, self).__init__() - self.global_cmvn = global_cmvn - self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ - if type(reduce_idx) == int else reduce_idx - self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \ - if type(recover_idx) == int else recover_idx - self.check_ascending_list() - if reduce_idx is None: - self.time_reduce = None - else: - if recover_idx is None: - self.time_reduce = 'normal' # no recovery at the end - else: - self.time_reduce = 'recover' # recovery at the end - assert len(self.reduce_idx) == len(self.recover_idx) - self.reduce_stride = 2 - self._output_size = output_size - self.normalize_before = normalize_before - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - self.pos_enc_layer_type = pos_enc_layer_type - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - encoder_dim, - attention_dropout_rate, - do_rel_shift, - adaptive_scale, - init_weights - ) - - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - encoder_dim, - encoder_dim * feed_forward_expansion_factor, - feed_forward_dropout_rate, - activation, - adaptive_scale, - init_weights - ) - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = ( - encoder_dim, cnn_module_kernel, activation, - cnn_norm_type, causal, True, adaptive_scale, init_weights) - - self.embed = DepthwiseConv2dSubsampling4( - 1, encoder_dim, - RelPositionalEncoding(encoder_dim, dropout_rate=0.1), - dw_stride, - input_size, - input_dropout_rate, - init_weights - ) - - self.preln = nn.LayerNorm(encoder_dim) - self.encoders = torch.nn.ModuleList([SqueezeformerEncoderLayer( - encoder_dim, - 
encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - convolution_layer(*convolution_layer_args), - positionwise_layer(*positionwise_layer_args), - normalize_before, - dropout, - concat_after) for _ in range(num_blocks) - ]) - if time_reduction_layer_type == 'conv1d': - time_reduction_layer = TimeReductionLayer1D - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - elif time_reduction_layer_type == 'stream': - time_reduction_layer = TimeReductionLayerStream - time_reduction_layer_args = { - 'channel': encoder_dim, - 'out_dim': encoder_dim, - } - else: - time_reduction_layer = TimeReductionLayer2D - time_reduction_layer_args = {'encoder_dim': encoder_dim} - - self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args) - self.time_recover_layer = nn.Linear(encoder_dim, encoder_dim) - self.final_proj = None - if output_size != encoder_dim: - self.final_proj = nn.Linear(encoder_dim, output_size) - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - xs_lens = mask_pad.squeeze(1).sum(1) - xs = self.preln(xs) - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - for i, layer in enumerate(self.encoders): - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, chunk_masks, pos_emb, mask_pad)) - xs, xs_lens, chunk_masks, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_chunk_masks, - recover_pos_emb, recover_mask_pad) \ - = recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - chunk_masks = recover_chunk_masks - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - xs = xs.masked_fill(~mask_pad[:, 0, :].unsqueeze(-1), 0.0) - - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return xs, masks - - def check_ascending_list(self): - if self.reduce_idx is not None: - assert self.reduce_idx == sorted(self.reduce_idx), \ - "reduce_idx should be int or ascending list" - if self.recover_idx is not None: - assert self.recover_idx == sorted(self.recover_idx), \ - "recover_idx should be int or ascending list" - - def calculate_downsampling_factor(self, i: int) -> int: - if self.reduce_idx is None: - return 1 - else: - reduce_exp, recover_exp = 0, 0 - for exp, rd_idx in enumerate(self.reduce_idx): - if i >= rd_idx: - reduce_exp = exp + 1 - if self.recover_idx is not None: - for exp, rc_idx 
in enumerate(self.recover_idx): - if i >= rc_idx: - recover_exp = exp + 1 - return int(2 ** (reduce_exp - recover_exp)) - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. - - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - - r_att_cache = [] - r_cnn_cache = [] - - mask_pad = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - mask_pad = mask_pad.unsqueeze(1) - max_att_len: int = 0 - recover_activations: \ - List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = [] - index = 0 - xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) - xs = self.preln(xs) - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - if self.reduce_idx is not None: - if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = \ - self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad) - pos_emb = pos_emb[:, ::2, :] - index += 1 - - if self.recover_idx is not None: - if self.time_reduce == 'recover' and i in self.recover_idx: - index -= 1 - (recover_tensor, recover_att_mask, - recover_pos_emb, recover_mask_pad) \ - = 
recover_activations[index] - # recover output length for ctc decode - xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) - xs = self.time_recover_layer(xs) - recoverd_t = recover_tensor.size(1) - xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() - att_mask = recover_att_mask - pos_emb = recover_pos_emb - mask_pad = recover_mask_pad - if att_mask.size(1) != 0: - xs = xs.masked_fill(~att_mask[:, 0, :].unsqueeze(-1), 0.0) - - factor = self.calculate_downsampling_factor(i) - - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1][:, :, ::factor, :] - [:, :, :pos_emb.size(1) - xs.size(1), :] if - elayers > 0 else att_cache[:, :, ::factor, :], - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - cached_att \ - = new_att_cache[:, :, next_cache_start // factor:, :] - cached_cnn = new_cnn_cache.unsqueeze(0) - cached_att = cached_att.unsqueeze(3).\ - repeat(1, 1, 1, factor, 1).flatten(2, 3) - if i == 0: - # record length for the first block as max length - max_att_len = cached_att.size(2) - r_att_cache.append(cached_att[:, :, :max_att_len, :]) - r_cnn_cache.append(cached_cnn) - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - if self.final_proj is not None: - xs = self.final_proj(xs) - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
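The recover step above doubles the time axis by repeating each reduced frame, then trims back to the pre-reduction length before adding the saved activation. A toy tensor makes the `unsqueeze/repeat/flatten` trick explicit (sizes are placeholders):

```python
import torch

B, T_reduced, D = 1, 5, 3
xs = torch.arange(B * T_reduced * D, dtype=torch.float).view(B, T_reduced, D)

upsampled = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2)   # (B, 2 * T_reduced, D)
print(upsampled.shape)                                         # torch.Size([1, 10, 3])
print(torch.equal(upsampled[:, 0], upsampled[:, 1]))           # True: each frame is duplicated
```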
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = \ - self.forward_chunk( - chunk_xs, offset, required_cache_size, - att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/squeezeformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/squeezeformer/encoder_layer.py deleted file mode 100644 index 3c6bdd44a20447cea91c0f965c666b844f4264be..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/squeezeformer/encoder_layer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""SqueezeformerEncoderLayer definition.""" - -import torch -import torch.nn as nn -from typing import Optional, Tuple - - -class SqueezeformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward1 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - feed_forward2 (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. 
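The `normalize_before` switch documented above is the usual pre-norm/post-norm residual pattern. A minimal sketch of that pattern (illustrative only, not the deleted implementation):

```python
import torch
import torch.nn as nn

def sublayer(x: torch.Tensor,
             module: nn.Module,
             norm: nn.LayerNorm,
             dropout: nn.Dropout,
             normalize_before: bool) -> torch.Tensor:
    # Pre-norm: normalize the input, then add the residual.
    # Post-norm: add the residual first, then normalize the sum.
    residual = x
    if normalize_before:
        x = norm(x)
    x = residual + dropout(module(x))
    if not normalize_before:
        x = norm(x)
    return x
```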
- """ - - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward1: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - feed_forward2: Optional[nn.Module] = None, - normalize_before: bool = False, - dropout_rate: float = 0.1, - concat_after: bool = False, - ): - super(SqueezeformerEncoderLayer, self).__init__() - self.size = size - self.self_attn = self_attn - self.layer_norm1 = nn.LayerNorm(size) - self.ffn1 = feed_forward1 - self.layer_norm2 = nn.LayerNorm(size) - self.conv_module = conv_module - self.layer_norm3 = nn.LayerNorm(size) - self.ffn2 = feed_forward2 - self.layer_norm4 = nn.LayerNorm(size) - self.normalize_before = normalize_before - self.dropout = nn.Dropout(dropout_rate) - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - # self attention module - residual = x - if self.normalize_before: - x = self.layer_norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.layer_norm1(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm2(x) - x = self.ffn1(x) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm2(x) - - # conv module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - residual = x - if self.normalize_before: - x = self.layer_norm3(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm3(x) - - # ffn module - residual = x - if self.normalize_before: - x = self.layer_norm4(x) - x = self.ffn2(x) - # we do not use dropout here since it is inside feed forward function - x = residual + self.dropout(x) - if not self.normalize_before: - x = self.layer_norm4(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/squeezeformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/squeezeformer/positionwise_feed_forward.py deleted file mode 100644 index 289062dcf3189f79a5ebb206990160d8665c613c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/squeezeformer/positionwise_feed_forward.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. - activation (torch.nn.Module): Activation function - """ - - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU(), - adaptive_scale: bool = False, - init_weights: bool = False - ): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.idim = idim - self.hidden_units = hidden_units - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - self.ada_scale = None - self.ada_bias = None - self.adaptive_scale = adaptive_scale - self.ada_scale = torch.nn.Parameter( - torch.ones([1, 1, idim]), requires_grad=adaptive_scale) - self.ada_bias = torch.nn.Parameter( - torch.zeros([1, 1, idim]), requires_grad=adaptive_scale) - if init_weights: - self.init_weights() - - def init_weights(self): - ffn1_max = self.idim ** -0.5 - ffn2_max = self.hidden_units ** -0.5 - torch.nn.init.uniform_(self.w_1.weight.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_1.bias.data, -ffn1_max, ffn1_max) - torch.nn.init.uniform_(self.w_2.weight.data, -ffn2_max, ffn2_max) - torch.nn.init.uniform_(self.w_2.bias.data, -ffn2_max, ffn2_max) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - if self.adaptive_scale: - xs = self.ada_scale * xs + self.ada_bias - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/squeezeformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/squeezeformer/subsampling.py deleted file mode 100644 index fdb0101d6ebb54c42e710bbb0f35a6f7615ca567..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/squeezeformer/subsampling.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
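As a quick illustration of the position-wise feed-forward module defined above: it preserves the feature dimension, and with `adaptive_scale=True` the input is first rescaled by learnable per-feature parameters before the two linear layers. A hedged usage sketch, assuming the class shown above is available in scope and using made-up shapes:

```python
import torch

ffn = PositionwiseFeedForward(idim=256, hidden_units=1024,
                              dropout_rate=0.1, adaptive_scale=True)
xs = torch.randn(2, 50, 256)   # (batch, length, idim)
ys = ffn(xs)
assert ys.shape == xs.shape    # output keeps the input dimension
```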
-# Modified from Squeezeformer(https://github.com/kssteven418/Squeezeformer) -# Squeezeformer(https://github.com/upskyy/Squeezeformer) -# NeMo(https://github.com/NVIDIA/NeMo) - -"""DepthwiseConv2dSubsampling4 and TimeReductionLayer definition.""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -from wenet.transformer.subsampling import BaseSubsampling -from typing import Tuple -from wenet.squeezeformer.conv2d import Conv2dValid - - -class DepthwiseConv2dSubsampling4(BaseSubsampling): - """Depthwise Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - pos_enc_class (nn.Module): position encoding class. - dw_stride (int): Whether do depthwise convolution. - input_size (int): filter bank dimension. - - """ - - def __init__( - self, idim: int, odim: int, - pos_enc_class: torch.nn.Module, - dw_stride: bool = False, - input_size: int = 80, - input_dropout_rate: float = 0.1, - init_weights: bool = True - ): - super(DepthwiseConv2dSubsampling4, self).__init__() - self.idim = idim - self.odim = odim - self.pw_conv = nn.Conv2d( - in_channels=idim, out_channels=odim, kernel_size=3, stride=2) - self.act1 = nn.ReLU() - self.dw_conv = nn.Conv2d( - in_channels=odim, out_channels=odim, kernel_size=3, stride=2, - groups=odim if dw_stride else 1 - ) - self.act2 = nn.ReLU() - self.pos_enc = pos_enc_class - self.input_proj = nn.Sequential( - nn.Linear( - odim * (((input_size - 1) // 2 - 1) // 2), odim), - nn.Dropout(p=input_dropout_rate), - ) - if init_weights: - linear_max = (odim * input_size / 4) ** -0.5 - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.weight'], -linear_max, linear_max) - torch.nn.init.uniform_( - self.input_proj.state_dict()['0.bias'], -linear_max, linear_max) - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: int = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.pw_conv(x) - x = self.act1(x) - x = self.dw_conv(x) - x = self.act2(x) - b, c, t, f = x.size() - x = x.permute(0, 2, 1, 3) - x = x.contiguous().view(b, t, c * f) - x, pos_emb = self.pos_enc(x, offset) - x = self.input_proj(x) - return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] - - -class TimeReductionLayer1D(nn.Module): - """ - Modified NeMo, - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. 
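The stride-2 time reduction described above halves the time axis of the features and masks and updates the lengths as floor((len + 1) / 2). A small sketch of that bookkeeping (sizes are illustrative):

```python
import torch

stride = 2
xs_lens = torch.tensor([100, 57, 64])
mask_pad = torch.ones(3, 1, 100, dtype=torch.bool)   # (B, 1, T)

new_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc')
new_mask_pad = mask_pad[:, :, ::stride]

print(new_lens.tolist())       # [50, 29, 32]
print(new_mask_pad.shape)      # torch.Size([3, 1, 50])
```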
- """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 5, stride: int = 2): - super(TimeReductionLayer1D, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - self.padding = max(0, self.kernel_size - self.stride) - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. - if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayer2D(nn.Module): - def __init__( - self, kernel_size: int = 5, stride: int = 2, encoder_dim: int = 256): - super(TimeReductionLayer2D, self).__init__() - self.encoder_dim = encoder_dim - self.kernel_size = kernel_size - self.dw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=(kernel_size, 1), - stride=stride, - valid_trigy=True - ) - self.pw_conv = Conv2dValid( - in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=1, - stride=1, - valid_trigx=False, - valid_trigy=False, - ) - - self.kernel_size = kernel_size - self.stride = stride - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.encoder_dim ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward( - self, xs: torch.Tensor, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - xs = xs.masked_fill(mask_pad.transpose(1, 2).eq(0), 0.0) - xs = xs.unsqueeze(2) - padding1 = self.kernel_size - self.stride - xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), - mode='constant', value=0.) 
- xs = self.dw_conv(xs.permute(0, 3, 1, 2)) - xs = self.pw_conv(xs).permute(0, 3, 2, 1).squeeze(1).contiguous() - tmp_length = xs.size(1) - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - padding2 = max(0, (xs_lens.max() - tmp_length).data.item()) - batch_size, hidden = xs.size(0), xs.size(-1) - dummy_pad = torch.zeros(batch_size, padding2, hidden, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - mask = mask[:, ::2, ::2] - mask_pad = mask_pad[:, :, ::2] - return xs, xs_lens, mask, mask_pad - - -class TimeReductionLayerStream(nn.Module): - """ - Squeezeformer Time Reduction procedure. - Downsamples the audio by `stride` in the time dimension. - Args: - channel (int): input dimension of - MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for - depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. - """ - - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 1, stride: int = 2): - super(TimeReductionLayerStream, self).__init__() - - self.channel = channel - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - - self.dw_conv = nn.Conv1d( - in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=0, - groups=channel, - ) - - self.pw_conv = nn.Conv1d( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.init_weights() - - def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - def forward(self, xs, xs_lens: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ): - xs = xs.transpose(1, 2) # [B, C, T] - xs = xs.masked_fill(mask_pad.eq(0), 0.0) - - xs = self.dw_conv(xs) - xs = self.pw_conv(xs) - - xs = xs.transpose(1, 2) # [B, T, C] - - B, T, D = xs.size() - mask = mask[:, ::self.stride, ::self.stride] - mask_pad = mask_pad[:, :, ::self.stride] - L = mask_pad.size(-1) - # For JIT exporting, we remove F.pad operator. 
- if L - T < 0: - xs = xs[:, :L - T, :].contiguous() - else: - dummy_pad = torch.zeros(B, L - T, D, device=xs.device) - xs = torch.cat([xs, dummy_pad], dim=1) - - xs_lens = torch.div(xs_lens + 1, 2, rounding_mode='trunc') - return xs, xs_lens, mask, mask_pad diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transducer/joint.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/transducer/joint.py deleted file mode 100644 index f7cbaf62ee0bf4ffa127e5bbf4a49a64c2378495..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transducer/joint.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Optional - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation - - -class TransducerJoint(torch.nn.Module): - - def __init__(self, - voca_size: int, - enc_output_size: int, - pred_output_size: int, - join_dim: int, - prejoin_linear: bool = True, - postjoin_linear: bool = False, - joint_mode: str = 'add', - activation: str = "tanh"): - assert check_argument_types() - # TODO(Mddct): concat in future - assert joint_mode in ['add'] - super().__init__() - - self.activatoin = get_activation(activation) - self.prejoin_linear = prejoin_linear - self.postjoin_linear = postjoin_linear - self.joint_mode = joint_mode - - if not self.prejoin_linear and not self.postjoin_linear: - assert enc_output_size == pred_output_size == join_dim - # torchscript compatibility - self.enc_ffn: Optional[nn.Linear] = None - self.pred_ffn: Optional[nn.Linear] = None - if self.prejoin_linear: - self.enc_ffn = nn.Linear(enc_output_size, join_dim) - self.pred_ffn = nn.Linear(pred_output_size, join_dim) - # torchscript compatibility - self.post_ffn: Optional[nn.Linear] = None - if self.postjoin_linear: - self.post_ffn = nn.Linear(join_dim, join_dim) - - self.ffn_out = nn.Linear(join_dim, voca_size) - - def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor): - """ - Args: - enc_out (torch.Tensor): [B, T, E] - pred_out (torch.Tensor): [B, T, P] - Return: - [B,T,U,V] - """ - if (self.prejoin_linear and self.enc_ffn is not None - and self.pred_ffn is not None): - enc_out = self.enc_ffn(enc_out) # [B,T,E] -> [B,T,V] - pred_out = self.pred_ffn(pred_out) - - enc_out = enc_out.unsqueeze(2) # [B,T,V] -> [B,T,1,V] - pred_out = pred_out.unsqueeze(1) # [B,U,V] -> [B,1 U, V] - - # TODO(Mddct): concat joint - _ = self.joint_mode - out = enc_out + pred_out # [B,T,U,V] - - if self.postjoin_linear and self.post_ffn is not None: - out = self.post_ffn(out) - - out = self.activatoin(out) - out = self.ffn_out(out) - return out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transducer/predictor.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/transducer/predictor.py deleted file mode 100644 index 600e97a9d83646047ec3fc14f3087bd4df761c68..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transducer/predictor.py +++ /dev/null @@ -1,482 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -from typeguard import check_argument_types -from wenet.utils.common import get_activation, get_rnn - - -def ApplyPadding(input, padding, pad_value) -> torch.Tensor: - """ - Args: - input: [bs, max_time_step, dim] - padding: [bs, max_time_step] - """ - return padding * pad_value + input * (1 - padding) - - -class PredictorBase(torch.nn.Module): - - # NOTE(Mddct): We can use ABC 
abstract here, but - # keep this class simple enough for now - def __init__(self) -> None: - super().__init__() - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - _, _, _ = batch_size, method, device - raise NotImplementedError("this is a base precictor") - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - _ = cache - raise NotImplementedError("this is a base precictor") - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ): - _, _, = input, cache - raise NotImplementedError("this is a base precictor") - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - _, _, _, = input, padding, cache - raise NotImplementedError("this is a base precictor") - - -class RNNPredictor(PredictorBase): - - def __init__(self, - voca_size: int, - embed_size: int, - output_size: int, - embed_dropout: float, - hidden_size: int, - num_layers: int, - bias: bool = True, - rnn_type: str = "lstm", - dropout: float = 0.1) -> None: - assert check_argument_types() - super().__init__() - self.n_layers = num_layers - self.hidden_size = hidden_size - # disable rnn base out projection - self.embed = nn.Embedding(voca_size, embed_size) - self.dropout = nn.Dropout(embed_dropout) - # NOTE(Mddct): rnn base from torch not support layer norm - # will add layer norm and prune value in cell and layer - # ref: https://github.com/Mddct/neural-lm/blob/main/models/gru_cell.py - self.rnn = get_rnn(rnn_type=rnn_type)(input_size=embed_size, - hidden_size=hidden_size, - num_layers=num_layers, - bias=bias, - batch_first=True, - dropout=dropout) - self.projection = nn.Linear(hidden_size, output_size) - - def forward( - self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> torch.Tensor: - """ - Args: - input (torch.Tensor): [batch, max_time). - padding (torch.Tensor): [batch, max_time] - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - Returns: - output: [batch, max_time, output_size] - """ - - # NOTE(Mddct): we don't use pack input format - embed = self.embed(input) # [batch, max_time, emb_size] - embed = self.dropout(embed) - states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None - if cache is None: - state = self.init_state(batch_size=input.size(0), - device=input.device) - states = (state[0], state[1]) - else: - assert len(cache) == 2 - states = (cache[0], cache[1]) - out, (m, c) = self.rnn(embed, states) - out = self.projection(out) - - # NOTE(Mddct): Although we don't use staate in transducer - # training forward, we need make it right for padding value - # so we create forward_step for infering, forward for training - _, _ = m, c - return out - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache: [state_m, state_c] - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - Returns: - new_cache: [[state_m_1, state_c_1], [state_m_2, state_c_2]...] 
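A small sketch of the cache layout this docstring describes (sizes are assumed for the example): the LSTM states are stacked as (num_layers, batch, hidden) and split along the batch dimension into one [state_m, state_c] pair per hypothesis.

```python
import torch

n_layers, batch, hidden = 2, 3, 8
state_ms = torch.zeros(n_layers, batch, hidden)
state_cs = torch.zeros(n_layers, batch, hidden)

# One [state_m, state_c] pair per utterance/hypothesis.
per_hyp = [[m, c] for m, c in zip(torch.split(state_ms, 1, dim=1),
                                  torch.split(state_cs, 1, dim=1))]
print(len(per_hyp), per_hyp[0][0].shape)   # 3 torch.Size([2, 1, 8])
```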
- """ - assert len(cache) == 2 - state_ms = cache[0] - state_cs = cache[1] - - assert state_ms.size(1) == state_cs.size(1) - - new_cache: List[List[torch.Tensor]] = [] - for state_m, state_c in zip(torch.split(state_ms, 1, dim=1), - torch.split(state_cs, 1, dim=1)): - new_cache.append([state_m, state_c]) - return new_cache - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[state_m_1, state_c_1], [state_m_1, state_c_1]...] - - Returns: - new_caceh: [state_ms, state_cs], - state_ms: [1*n_layers, bs, ...] - state_cs: [1*n_layers, bs, ...] - """ - state_ms = torch.cat([states[0] for states in cache], dim=1) - state_cs = torch.cat([states[1] for states in cache], dim=1) - return [state_ms, state_cs] - - def init_state( - self, - batch_size: int, - device: torch.device, - method: str = "zero", - ) -> List[torch.Tensor]: - assert batch_size > 0 - # TODO(Mddct): xavier init method - _ = method - return [ - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device), - torch.zeros(1 * self.n_layers, - batch_size, - self.hidden_size, - device=device) - ] - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache : rnn predictor cache[0] == state_m - cache[1] == state_c - """ - assert len(cache) == 2 - state_m, state_c = cache[0], cache[1] - embed = self.embed(input) # [batch, 1, emb_size] - embed = self.dropout(embed) - out, (m, c) = self.rnn(embed, (state_m, state_c)) - - out = self.projection(out) - m = ApplyPadding(m, padding.unsqueeze(0), state_m) - c = ApplyPadding(c, padding.unsqueeze(0), state_c) - - return (out, [m, c]) - - -class EmbeddingPredictor(PredictorBase): - """Embedding predictor - - Described in: - https://arxiv.org/pdf/2109.07513.pdf - - embed-> proj -> layer norm -> swish - """ - - def __init__(self, - voca_size: int, - embed_size: int, - embed_dropout: float, - n_head: int, - history_size: int = 2, - activation: str = "swish", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - - assert check_argument_types() - super().__init__() - # multi head - self.num_heads = n_head - self.embed_size = embed_size - self.context_size = history_size + 1 - self.pos_embed = torch.nn.Linear(embed_size * self.context_size, - self.num_heads, - bias=bias) - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.ffn = nn.Linear(self.embed_size, self.embed_size) - self.norm = nn.LayerNorm(self.embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - _ = method - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device), - ] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] 
- """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - - input = input.unfold(1, self.context_size, 1).permute( - 0, 1, 3, 2) # [bs, seq_len, context_size, embed] - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - # broadcast dot attenton - input_expand = input.unsqueeze( - 2) # [bs, seq_len, 1, context_size, embed] - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - - # [bs, seq_len, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, seq_len, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, seq_len, num_heads, embed] - output = output.sum(dim=2) # [bs, seq_len, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - return output - - def forward_step( - self, - input: torch.Tensor, - padding: torch.Tensor, - cache: List[torch.Tensor], - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input_expand = context_input.unsqueeze(1).unsqueeze( - 2) # [bs, 1, 1, context_size, embed] - - # multi head pos: [n_head, embed, context_size] - multi_head_pos = self.pos_embed.weight.view(self.num_heads, - self.embed_size, - self.context_size) - - multi_head_pos = multi_head_pos.permute( - 0, 2, 1) # [num_heads, context_size, embed] - # [bs, 1, num_heads, context_size, embed] - weight = input_expand * multi_head_pos - weight = weight.sum(dim=-1, keepdim=False).unsqueeze( - 3) # [bs, 1, num_heads, 1, context_size] - output = weight.matmul(input_expand).squeeze( - dim=3) # [bs, 1, num_heads, embed] - output = output.sum(dim=2) # [bs, 1, embed] - output = output / (self.num_heads * self.context_size) - - output = self.ffn(output) - output = self.norm(output) - output = self.activatoin(output) - new_cache = context_input[:, 1:, :] - # TODO(Mddct): we need padding new_cache in future - # new_cache = ApplyPadding(history, padding, new_cache) - return (output, [new_cache]) - - -class ConvPredictor(PredictorBase): - - def __init__(self, - voca_size: 
int, - embed_size: int, - embed_dropout: float, - history_size: int = 2, - activation: str = "relu", - bias: bool = False, - layer_norm_epsilon: float = 1e-5) -> None: - assert check_argument_types() - super().__init__() - - assert history_size >= 0 - self.embed_size = embed_size - self.context_size = history_size + 1 - self.embed = nn.Embedding(voca_size, self.embed_size) - self.embed_dropout = nn.Dropout(p=embed_dropout) - self.conv = nn.Conv1d(in_channels=embed_size, - out_channels=embed_size, - kernel_size=self.context_size, - padding=0, - groups=embed_size, - bias=bias) - self.norm = nn.LayerNorm(embed_size, eps=layer_norm_epsilon) - self.activatoin = get_activation(activation) - - def init_state(self, - batch_size: int, - device: torch.device, - method: str = "zero") -> List[torch.Tensor]: - assert batch_size > 0 - assert method == "zero" - return [ - torch.zeros(batch_size, - self.context_size - 1, - self.embed_size, - device=device) - ] - - def cache_to_batch(self, - cache: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """ - Args: - cache : [[history_1], [history_2], [history3]...] - - Returns: - new_caceh: [history], - history: [bs, ...] - """ - history = torch.cat([h[0] for h in cache], dim=0) - return [history] - - def batch_to_cache(self, - cache: List[torch.Tensor]) -> List[List[torch.Tensor]]: - """ - Args: - cache : [history] - history: [bs, ...] - Returns: - new_ache : [[history_1], [history_2], [history_3]...] - """ - assert len(cache) == 1 - cache_0 = cache[0] - history: List[List[torch.Tensor]] = [] - for h in torch.split(cache_0, 1, dim=0): - history.append([h]) - return history - - def forward(self, - input: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None): - """ forward for training - """ - input = self.embed(input) # [bs, seq_len, embed] - input = self.embed_dropout(input) - if cache is None: - zeros = self.init_state(input.size(0), device=input.device)[0] - else: - assert len(cache) == 1 - zeros = cache[0] - - input = torch.cat((zeros, input), - dim=1) # [bs, context_size-1 + seq_len, embed] - input = input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - return out - - def forward_step( - self, input: torch.Tensor, padding: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ forward step for inference - Args: - input (torch.Tensor): [batch_size, time_step=1] - padding (torch.Tensor): [batch_size,1], 1 is padding value - cache: for embedding predictor, cache[0] == history - """ - assert input.size(1) == 1 - assert len(cache) == 1 - history = cache[0] - assert history.size(1) == self.context_size - 1 - input = self.embed(input) # [bs, 1, embed] - input = self.embed_dropout(input) - context_input = torch.cat((history, input), dim=1) - input = context_input.permute(0, 2, 1) - out = self.conv(input).permute(0, 2, 1) - out = self.activatoin(self.norm(out)) - - new_cache = context_input[:, 1:, :] - # TODO(Mddct): apply padding in future - return (out, [new_cache]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transducer/search/greedy_search.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/transducer/search/greedy_search.py deleted file mode 100644 index ef7354562b6617b7be33bf32d673117eb1d3d547..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transducer/search/greedy_search.py +++ /dev/null @@ -1,54 +0,0 @@ -from typing import List - -import torch - - -def 
basic_greedy_search( - model: torch.nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - n_steps: int = 64, -) -> List[List[int]]: - # fake padding - padding = torch.zeros(1, 1).to(encoder_out.device) - # sos - pred_input_step = torch.tensor([model.blank]).reshape(1, 1) - cache = model.predictor.init_state(1, - method="zero", - device=encoder_out.device) - new_cache: List[torch.Tensor] = [] - t = 0 - hyps = [] - prev_out_nblk = True - pred_out_step = None - per_frame_max_noblk = n_steps - per_frame_noblk = 0 - while t < encoder_out_lens: - encoder_out_step = encoder_out[:, t:t + 1, :] # [1, 1, E] - if prev_out_nblk: - step_outs = model.predictor.forward_step(pred_input_step, padding, - cache) # [1, 1, P] - pred_out_step, new_cache = step_outs[0], step_outs[1] - - joint_out_step = model.joint(encoder_out_step, - pred_out_step) # [1,1,v] - joint_out_probs = joint_out_step.log_softmax(dim=-1) - - joint_out_max = joint_out_probs.argmax(dim=-1).squeeze() # [] - if joint_out_max != model.blank: - hyps.append(joint_out_max.item()) - prev_out_nblk = True - per_frame_noblk = per_frame_noblk + 1 - pred_input_step = joint_out_max.reshape(1, 1) - # state_m, state_c = clstate_out_m, state_out_c - cache = new_cache - - if joint_out_max == model.blank or per_frame_noblk >= per_frame_max_noblk: - if joint_out_max == model.blank: - prev_out_nblk = False - # TODO(Mddct): make t in chunk for streamming - # or t should't be too lang to predict none blank - t = t + 1 - per_frame_noblk = 0 - - return [hyps] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transducer/search/prefix_beam_search.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/transducer/search/prefix_beam_search.py deleted file mode 100644 index f00917717c16a73916586708ebfede54fa02a21f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transducer/search/prefix_beam_search.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import List, Tuple - -import torch -from wenet.utils.common import log_add - - -class Sequence(): - - __slots__ = {'hyp', 'score', 'cache'} - - def __init__( - self, - hyp: List[torch.Tensor], - score, - cache: List[torch.Tensor], - ): - self.hyp = hyp - self.score = score - self.cache = cache - - -class PrefixBeamSearch(): - - def __init__(self, encoder, predictor, joint, ctc, blank): - self.encoder = encoder - self.predictor = predictor - self.joint = joint - self.ctc = ctc - self.blank = blank - - def forward_decoder_one_step( - self, encoder_x: torch.Tensor, pre_t: torch.Tensor, - cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - padding = torch.zeros(pre_t.size(0), 1, device=encoder_x.device) - pre_t, new_cache = self.predictor.forward_step(pre_t.unsqueeze(-1), - padding, cache) - x = self.joint(encoder_x, pre_t) # [beam, 1, 1, vocab] - x = x.log_softmax(dim=-1) - return x, new_cache - - def prefix_beam_search(self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7): - """prefix beam search - also see wenet.transducer.transducer.beam_search - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - assert batch_size == 1 - - # 1. 
Encoder - encoder_out, _ = self.encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - - ctc_probs = self.ctc.log_softmax(encoder_out).squeeze(0) - beam_init: List[Sequence] = [] - - # 2. init beam using Sequence to save beam unit - cache = self.predictor.init_state(1, method="zero", device=device) - beam_init.append(Sequence(hyp=[self.blank], score=0.0, cache=cache)) - # 3. start decoding (notice: we use breathwise first searching) - # !!!! In this decoding method: one frame do not output multi units. !!!! - # !!!! Experiments show that this strategy has little impact !!!! - for i in range(maxlen): - # 3.1 building input - # decoder taking the last token to predict the next token - input_hyp = [s.hyp[-1] for s in beam_init] - input_hyp_tensor = torch.tensor(input_hyp, - dtype=torch.int, - device=device) - # building statement from beam - cache_batch = self.predictor.cache_to_batch( - [s.cache for s in beam_init]) - # build score tensor to do torch.add() function - scores = torch.tensor([s.score for s in beam_init]).to(device) - - # 3.2 forward decoder - logp, new_cache = self.forward_decoder_one_step( - encoder_out[:, i, :].unsqueeze(1), - input_hyp_tensor, - cache_batch, - ) # logp: (N, 1, 1, vocab_size) - logp = logp.squeeze(1).squeeze(1) # logp: (N, vocab_size) - new_cache = self.predictor.batch_to_cache(new_cache) - - # 3.3 shallow fusion for transducer score - # and ctc score where we can also add the LM score - logp = torch.log( - torch.add(transducer_weight * torch.exp(logp), - ctc_weight * torch.exp(ctc_probs[i].unsqueeze(0)))) - - # 3.4 first beam prune - top_k_logp, top_k_index = logp.topk(beam_size) # (N, N) - scores = torch.add(scores.unsqueeze(1), top_k_logp) - - # 3.5 generate new beam (N*N) - beam_A = [] - for j in range(len(beam_init)): - # update seq - base_seq = beam_init[j] - for t in range(beam_size): - # blank: only update the score - if top_k_index[j, t] == self.blank: - new_seq = Sequence(hyp=base_seq.hyp.copy(), - score=scores[j, t].item(), - cache=base_seq.cache) - - beam_A.append(new_seq) - # other unit: update hyp score statement and last - else: - hyp_new = base_seq.hyp.copy() - hyp_new.append(top_k_index[j, t].item()) - new_seq = Sequence(hyp=hyp_new, - score=scores[j, t].item(), - cache=new_cache[j]) - beam_A.append(new_seq) - - # 3.6 prefix fusion - fusion_A = [beam_A[0]] - for j in range(1, len(beam_A)): - s1 = beam_A[j] - if_do_append = True - for t in range(len(fusion_A)): - # notice: A_ can not fusion with A - if s1.hyp == fusion_A[t].hyp: - fusion_A[t].score = log_add( - [fusion_A[t].score, s1.score]) - if_do_append = False - break - if if_do_append: - fusion_A.append(s1) - - # 4. 
second pruned - fusion_A.sort(key=lambda x: x.score, reverse=True) - beam_init = fusion_A[:beam_size] - - return beam_init, encoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transducer/transducer.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/transducer/transducer.py deleted file mode 100644 index 821a0946e621353a18bededbd93a658e83b0e0e2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transducer/transducer.py +++ /dev/null @@ -1,453 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchaudio -from torch import nn -from torch.nn.utils.rnn import pad_sequence -from typeguard import check_argument_types -from wenet.transducer.predictor import PredictorBase -from wenet.transducer.search.greedy_search import basic_greedy_search -from wenet.transducer.search.prefix_beam_search import PrefixBeamSearch -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_blank, add_sos_eos, - reverse_pad_list) - - -class Transducer(ASRModel): - """Transducer-ctc-attention hybrid Encoder-Predictor-Decoder model""" - - def __init__( - self, - vocab_size: int, - blank: int, - encoder: nn.Module, - predictor: PredictorBase, - joint: nn.Module, - attention_decoder: Optional[Union[TransformerDecoder, - BiTransformerDecoder]] = None, - ctc: Optional[CTC] = None, - ctc_weight: float = 0, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - transducer_weight: float = 1.0, - attention_weight: float = 0.0, - ) -> None: - assert check_argument_types() - assert attention_weight + ctc_weight + transducer_weight == 1.0 - super().__init__(vocab_size, encoder, attention_decoder, ctc, - ctc_weight, ignore_id, reverse_weight, lsm_weight, - length_normalized_loss) - - self.blank = blank - self.transducer_weight = transducer_weight - self.attention_decoder_weight = 1 - self.transducer_weight - self.ctc_weight - - self.predictor = predictor - self.joint = joint - self.bs = None - - # Note(Mddct): decoder also means predictor in transducer, - # but here decoder is attention decoder - del self.criterion_att - if attention_decoder is not None: - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + predictor + joint + loss - - Args: - speech: (Batch, Length, ...) 
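The shallow fusion used in the prefix beam search above combines the transducer and CTC distributions in probability space and then returns to log space. A minimal numeric sketch (the weights and vocabulary size are example values):

```python
import torch

transducer_weight, ctc_weight = 0.7, 0.3
logp_rnnt = torch.log_softmax(torch.randn(1, 10), dim=-1)  # (beam, vocab)
logp_ctc = torch.log_softmax(torch.randn(1, 10), dim=-1)   # (1, vocab)

fused = torch.log(transducer_weight * torch.exp(logp_rnnt)
                  + ctc_weight * torch.exp(logp_ctc))
print(fused.shape)   # torch.Size([1, 10])
```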
- speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - - # Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - # predictor - ys_in_pad = add_blank(text, self.blank, self.ignore_id) - predictor_out = self.predictor(ys_in_pad) - # joint - joint_out = self.joint(encoder_out, predictor_out) - # NOTE(Mddct): some loss implementation require pad valid is zero - # torch.int32 rnnt_loss required - rnnt_text = text.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - rnnt_text_lengths = text_lengths.to(torch.int32) - encoder_out_lens = encoder_out_lens.to(torch.int32) - loss = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - encoder_out_lens, - rnnt_text_lengths, - blank=self.blank, - reduction="mean") - loss_rnnt = loss - - loss = self.transducer_weight * loss - # optional attention decoder - loss_att: Optional[torch.Tensor] = None - if self.attention_decoder_weight != 0.0 and self.decoder is not None: - loss_att, _ = self._calc_att_loss(encoder_out, encoder_mask, text, - text_lengths) - - # optional ctc - loss_ctc: Optional[torch.Tensor] = None - if self.ctc_weight != 0.0 and self.ctc is not None: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is not None: - loss = loss + self.ctc_weight * loss_ctc.sum() - if loss_att is not None: - loss = loss + self.attention_decoder_weight * loss_att.sum() - # NOTE: 'loss' must be in dict - return { - 'loss': loss, - 'loss_att': loss_att, - 'loss_ctc': loss_ctc, - 'loss_rnnt': loss_rnnt, - } - - def init_bs(self): - if self.bs is None: - self.bs = PrefixBeamSearch(self.encoder, self.predictor, - self.joint, self.ctc, self.blank) - - def _cal_transducer_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_lens: torch.Tensor, - hyps_pad: torch.Tensor, - ): - # ignore id -> blank, add blank at head - hyps_pad_blank = add_blank(hyps_pad, self.blank, self.ignore_id) - xs_in_lens = encoder_mask.squeeze(1).sum(1).int() - - # 1. Forward predictor - predictor_out = self.predictor(hyps_pad_blank) - # 2. Forward joint - joint_out = self.joint(encoder_out, predictor_out) - rnnt_text = hyps_pad.to(torch.int64) - rnnt_text = torch.where(rnnt_text == self.ignore_id, 0, - rnnt_text).to(torch.int32) - # 3. 
Compute transducer loss - loss_td = torchaudio.functional.rnnt_loss(joint_out, - rnnt_text, - xs_in_lens, - hyps_lens.int(), - blank=self.blank, - reduction='none') - return loss_td * -1 - - def _cal_attn_score( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - hyps_pad: torch.Tensor, - hyps_lens: torch.Tensor, - ): - # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - - # td_score = loss_td * -1 - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - self.reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - return decoder_out, r_decoder_out - - def beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - beam_size: int = 5, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ctc_weight: float = 0.3, - transducer_weight: float = 0.7, - ): - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight in transducer - prefix beam search. - final_prob = ctc_weight * ctc_prob + transducer_weight * transducer_prob - transducer_weight (float): transducer probability weight in - prefix beam search - Returns: - List[List[int]]: best path result - - """ - self.init_bs() - beam, _ = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size, - beam_size, - num_decoding_left_chunks, - simulate_streaming, - ctc_weight, - transducer_weight, - ) - return beam[0].hyp[1:], beam[0].score - - def transducer_attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ctc_weight: float = 0.0, - attn_weight: float = 0.0, - transducer_weight: float = 0.0, - search_ctc_weight: float = 1.0, - search_transducer_weight: float = 0.0, - beam_search_type: str = 'transducer') -> List[List[int]]: - """beam search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
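Rescoring, as described here, is a weighted sum of three per-hypothesis scores (attention decoder, search score, negated transducer loss), with the argmax kept. A toy sketch of that combination; all weights and scores below are made up for illustration:

```python
# Toy rescoring: combine attention, search and transducer scores per
# hypothesis and keep the best one. Numbers are illustrative only.
attn_weight, ctc_weight, transducer_weight = 0.5, 0.3, 0.2
hyps = [[12, 7, 33], [12, 7, 35]]
attn_scores = [-4.1, -5.0]
beam_scores = [-3.2, -2.9]
td_scores = [-6.0, -7.5]

totals = [attn_weight * a + ctc_weight * b + transducer_weight * t
          for a, b, t in zip(attn_scores, beam_scores, td_scores)]
best = max(range(len(hyps)), key=lambda i: totals[i])
print(hyps[best], totals[best])   # best hypothesis and its combined score
```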
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - ctc_weight (float): ctc probability weight using in rescoring. - rescore_prob = ctc_weight * ctc_prob + - transducer_weight * (transducer_loss * -1) + - attn_weight * attn_prob - attn_weight (float): attn probability weight using in rescoring. - transducer_weight (float): transducer probability weight using in - rescoring - search_ctc_weight (float): ctc weight using - in rnnt beam search (seeing in self.beam_search) - search_transducer_weight (float): transducer weight using - in rnnt beam search (seeing in self.beam_search) - Returns: - List[List[int]]: best path result - - """ - - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - self.init_bs() - if beam_search_type == 'transducer': - beam, encoder_out = self.bs.prefix_beam_search( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - beam_size=beam_size, - num_decoding_left_chunks=num_decoding_left_chunks, - ctc_weight=search_ctc_weight, - transducer_weight=search_transducer_weight, - ) - beam_score = [s.score for s in beam] - hyps = [s.hyp[1:] for s in beam] - - elif beam_search_type == 'ctc': - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, - speech_lengths, - beam_size=beam_size, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks, - simulate_streaming=simulate_streaming) - beam_score = [hyp[1] for hyp in hyps] - hyps = [hyp[0] for hyp in hyps] - assert len(hyps) == beam_size - - # build hyps and encoder output - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - - # 2.1 calculate transducer score - td_score = self._cal_transducer_score( - encoder_out, - encoder_mask, - hyps_lens, - hyps_pad, - ) - # 2.2 calculate attention score - decoder_out, r_decoder_out = self._cal_attn_score( - encoder_out, - encoder_mask, - hyps_pad, - hyps_lens, - ) - - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp)][self.eos] - td_s = td_score[i] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp): - r_score += r_decoder_out[i][len(hyp) - j - 1][w] - r_score += r_decoder_out[i][len(hyp)][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score = score * attn_weight + \ - beam_score[i] * ctc_weight + \ - td_s * transducer_weight - if score > best_score: - best_score = score - best_index = i - - return hyps[best_index], best_score - - def greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, 
- num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - n_steps: int = 64, - ) -> List[List[int]]: - """ greedy search - - Args: - speech (torch.Tensor): (batch=1, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - # TODO(Mddct): batch decode - assert speech.size(0) == 1 - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - # TODO(Mddct): forward chunk by chunk - _ = simulate_streaming - # Let's assume B = batch_size - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size, - num_decoding_left_chunks, - ) - encoder_out_lens = encoder_mask.squeeze(1).sum() - hyps = basic_greedy_search(self, - encoder_out, - encoder_out_lens, - n_steps=n_steps) - - return hyps - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def forward_predictor_step( - self, xs: torch.Tensor, cache: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - assert len(cache) == 2 - # fake padding - padding = torch.zeros(1, 1) - return self.predictor.forward_step(xs, padding, cache) - - @torch.jit.export - def forward_joint_step(self, enc_out: torch.Tensor, - pred_out: torch.Tensor) -> torch.Tensor: - return self.joint(enc_out, pred_out) - - @torch.jit.export - def forward_predictor_init_state(self) -> List[torch.Tensor]: - return self.predictor.init_state(1, device=torch.device("cpu")) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/asr_model.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/asr_model.py deleted file mode 100644 index 4288f68472d63ce4bf270c5f377d62fa7408713e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/asr_model.py +++ /dev/null @@ -1,904 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
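The exported step functions above support a frame-synchronous greedy decode; the core blank/emit rule can be sketched in isolation. This is a simplified illustration, not the deleted `basic_greedy_search` (a real transducer re-runs the predictor after every emission and may emit several tokens per frame):

```python
import torch

def greedy_decode(joint_logits_per_frame, blank_id: int = 0):
    """joint_logits_per_frame: iterable of (vocab,) tensors, one per frame."""
    hyp = []
    for logits in joint_logits_per_frame:
        token = int(logits.argmax())
        if token != blank_id:      # emit non-blank tokens, skip blanks
            hyp.append(token)
    return hyp

frames = [torch.tensor([2.0, 0.1, 0.3]),   # argmax 0 -> blank, skip
          torch.tensor([0.1, 3.0, 0.2])]   # argmax 1 -> emit
print(greedy_decode(frames))               # [1]
```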
-# Modified from ESPnet(https://github.com/espnet/espnet) - -from collections import defaultdict -from typing import Dict, List, Optional, Tuple - -import torch - -from torch.nn.utils.rnn import pad_sequence - -try: - import k2 - from icefall.utils import get_texts - from icefall.decode import get_lattice, Nbest, one_best_decoding -except ImportError: - print('Failed to import k2 and icefall. \ - Notice that they are necessary for hlg_onebest and hlg_rescore') - -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import TransformerDecoder -from wenet.transformer.encoder import TransformerEncoder -from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss -from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, - remove_duplicates_and_blank, th_accuracy, - reverse_pad_list) -from wenet.utils.mask import (make_pad_mask, mask_finished_preds, - mask_finished_scores, subsequent_mask) - - -class ASRModel(torch.nn.Module): - """CTC-attention hybrid Encoder-Decoder model""" - def __init__( - self, - vocab_size: int, - encoder: TransformerEncoder, - decoder: TransformerDecoder, - ctc: CTC, - ctc_weight: float = 0.5, - ignore_id: int = IGNORE_ID, - reverse_weight: float = 0.0, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - ): - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.reverse_weight = reverse_weight - - self.encoder = encoder - self.decoder = decoder - self.ctc = ctc - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, Optional[torch.Tensor]]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - # 1. Encoder - encoder_out, encoder_mask = self.encoder(speech, speech_lengths) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - - # 2a. Attention-decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, - text, text_lengths) - else: - loss_att = None - - # 2b. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, - text_lengths) - else: - loss_ctc = None - - if loss_ctc is None: - loss = loss_att - elif loss_att is None: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + (1 - - self.ctc_weight) * loss_att - return {"loss": loss, "loss_att": loss_att, "loss_ctc": loss_ctc} - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_mask: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, float]: - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # reverse the seq, used for right to left decoder - r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) - r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, - self.ignore_id) - # 1. Forward decoder - decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, - ys_in_pad, ys_in_lens, - r_ys_in_pad, - self.reverse_weight) - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - r_loss_att = torch.tensor(0.0) - if self.reverse_weight > 0.0: - r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) - loss_att = loss_att * ( - 1 - self.reverse_weight) + r_loss_att * self.reverse_weight - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - return loss_att, acc_att - - def _forward_encoder( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Let's assume B = batch_size - # 1. Encoder - if simulate_streaming and decoding_chunk_size > 0: - encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( - speech, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - else: - encoder_out, encoder_mask = self.encoder( - speech, - speech_lengths, - decoding_chunk_size=decoding_chunk_size, - num_decoding_left_chunks=num_decoding_left_chunks - ) # (B, maxlen, encoder_dim) - return encoder_out, encoder_mask - - def recognize( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int = 10, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> torch.Tensor: - """ Apply beam search on attention decoder - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - torch.Tensor: decoding result, (batch, max_result_len) - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - device = speech.device - batch_size = speech.shape[0] - - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = torch.ones([running_size, 1], dtype=torch.long, - device=device).fill_(self.sos) # (B*N, 1) - scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), - dtype=torch.float) - scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) - cache: Optional[List[torch.Tensor]] = None - # 2. Decoder forward step by step - for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: - break - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) - logp, cache = self.decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - # 2.3 Second beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - # Update cache to be consistent with new topk scores / hyps - cache_index = (offset_k_index // beam_size).view(-1) # (B*N) - base_cache_index = (torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) * beam_size).view(-1) # (B*N) - cache_index = base_cache_index + cache_index - cache = [torch.index_select(c, dim=0, index=cache_index) for c in cache] - scores = scores.view(-1, 1) # (B*N, 1) - # 2.4. Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = torch.arange(batch_size, device=device).view( - -1, 1).repeat([1, beam_size]) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = torch.index_select(top_k_index.view(-1), - dim=-1, - index=best_k_index) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = torch.index_select( - hyps, dim=0, index=best_hyps_index) # (B*N, i) - hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) - - # 3. 
Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_scores, best_index = scores.max(dim=-1) - best_hyps_index = best_index + torch.arange( - batch_size, dtype=torch.long, device=device) * beam_size - best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) - best_hyps = best_hyps[:, 1:] - return best_hyps, best_scores - - def ctc_greedy_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[List[int]]: - """ Apply CTC greedy search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - Returns: - List[List[int]]: best path result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # Let's assume B = batch_size - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_out_lens = encoder_mask.squeeze(1).sum(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (B, maxlen, vocab_size) - topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) - mask = make_pad_mask(encoder_out_lens, maxlen) # (B, maxlen) - topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) - hyps = [hyp.tolist() for hyp in topk_index] - scores = topk_prob.max(1) - hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] - return hyps, scores - - def _ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> Tuple[List[List[int]], torch.Tensor]: - """ CTC prefix beam search inner implementation - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[List[int]]: nbest results - torch.Tensor: encoder output, (1, max_len, encoder_dim), - it will be used for rescoring in attention rescoring mode - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - batch_size = speech.shape[0] - # For CTC prefix beam search, we only support batch_size=1 - assert batch_size == 1 - # Let's assume B = batch_size and N = beam_size - # 1. 
Encoder forward and get CTC score - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - ctc_probs = ctc_probs.squeeze(0) - # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) - cur_hyps = [(tuple(), (0.0, -float('inf')))] - # 2. CTC beam search step by step - for t in range(0, maxlen): - logp = ctc_probs[t] # (vocab_size,) - # key: prefix, value (pb, pnb), default value(-inf, -inf) - next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) - # 2.1 First beam prune: select topk best - top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) - for s in top_k_index: - s = s.item() - ps = logp[s].item() - for prefix, (pb, pnb) in cur_hyps: - last = prefix[-1] if len(prefix) > 0 else None - if s == 0: # blank - n_pb, n_pnb = next_hyps[prefix] - n_pb = log_add([n_pb, pb + ps, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - elif s == last: - # Update *ss -> *s; - n_pb, n_pnb = next_hyps[prefix] - n_pnb = log_add([n_pnb, pnb + ps]) - next_hyps[prefix] = (n_pb, n_pnb) - # Update *s-s -> *ss, - is for blank - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - else: - n_prefix = prefix + (s, ) - n_pb, n_pnb = next_hyps[n_prefix] - n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) - next_hyps[n_prefix] = (n_pb, n_pnb) - - # 2.2 Second beam prune - next_hyps = sorted(next_hyps.items(), - key=lambda x: log_add(list(x[1])), - reverse=True) - cur_hyps = next_hyps[:beam_size] - hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] - return hyps, encoder_out - - def ctc_prefix_beam_search( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - ) -> List[int]: - """ Apply CTC prefix beam search - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - - Returns: - List[int]: CTC prefix beam search nbest results - """ - hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, - beam_size, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) - return hyps[0] - - def attention_rescoring( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - beam_size: int, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - ctc_weight: float = 0.0, - simulate_streaming: bool = False, - reverse_weight: float = 0.0, - ) -> List[int]: - """ Apply attention rescoring decoding, CTC prefix beam search - is applied first to get nbest, then we resoring the nbest on - attention decoder with corresponding encoder out - - Args: - speech (torch.Tensor): (batch, max_len, feat_dim) - speech_length (torch.Tensor): (batch, ) - beam_size (int): beam size for beam search - decoding_chunk_size (int): decoding chunk for dynamic chunk - trained model. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. 
- 0: used for training, it's prohibited here - simulate_streaming (bool): whether do encoder forward in a - streaming fashion - reverse_weight (float): right to left decoder weight - ctc_weight (float): ctc score weight - - Returns: - List[int]: Attention rescoring result - """ - assert speech.shape[0] == speech_lengths.shape[0] - assert decoding_chunk_size != 0 - if reverse_weight > 0.0: - # decoder should be a bitransformer decoder if reverse_weight > 0.0 - assert hasattr(self.decoder, 'right_decoder') - device = speech.device - batch_size = speech.shape[0] - # For attention rescoring we only support batch_size=1 - assert batch_size == 1 - # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size - hyps, encoder_out = self._ctc_prefix_beam_search( - speech, speech_lengths, beam_size, decoding_chunk_size, - num_decoding_left_chunks, simulate_streaming) - - assert len(hyps) == beam_size - hyps_pad = pad_sequence([ - torch.tensor(hyp[0], device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out = encoder_out.repeat(beam_size, 1, 1) - encoder_mask = torch.ones(beam_size, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out.cpu().numpy() - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
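`_ctc_prefix_beam_search` above maintains, for every prefix, separate log-probabilities for ending in blank versus ending in a non-blank token. A minimal self-contained sketch of that bookkeeping over a toy `(T, V)` log-prob matrix (blank id 0; the numbers are made up for illustration):

```python
# Minimal CTC prefix beam search over a toy (T, V) log-prob matrix,
# following the blank / non-blank bookkeeping used above (id 0 = blank).
import math
from collections import defaultdict

def log_add(args):
    if all(a == -float('inf') for a in args):
        return -float('inf')
    a_max = max(args)
    return a_max + math.log(sum(math.exp(a - a_max) for a in args))

def ctc_prefix_beam_search(log_probs, beam_size=3):
    # cur_hyps: prefix -> (p_blank_ending, p_nonblank_ending), in log domain
    cur_hyps = [(tuple(), (0.0, -float('inf')))]
    for logp in log_probs:                                   # one frame at a time
        next_hyps = defaultdict(lambda: (-float('inf'), -float('inf')))
        for s, ps in sorted(enumerate(logp), key=lambda x: -x[1])[:beam_size]:
            for prefix, (pb, pnb) in cur_hyps:
                last = prefix[-1] if prefix else None
                if s == 0:                                   # blank keeps the prefix
                    n_pb, n_pnb = next_hyps[prefix]
                    next_hyps[prefix] = (log_add([n_pb, pb + ps, pnb + ps]), n_pnb)
                elif s == last:                              # repeat: *a -> *a, or *a- -> *aa
                    n_pb, n_pnb = next_hyps[prefix]
                    next_hyps[prefix] = (n_pb, log_add([n_pnb, pnb + ps]))
                    n_prefix = prefix + (s,)
                    n_pb, n_pnb = next_hyps[n_prefix]
                    next_hyps[n_prefix] = (n_pb, log_add([n_pnb, pb + ps]))
                else:                                        # new token extends the prefix
                    n_prefix = prefix + (s,)
                    n_pb, n_pnb = next_hyps[n_prefix]
                    next_hyps[n_prefix] = (n_pb, log_add([n_pnb, pb + ps, pnb + ps]))
        cur_hyps = sorted(next_hyps.items(),
                          key=lambda x: log_add(list(x[1])), reverse=True)[:beam_size]
    return [(p, log_add(list(s))) for p, s in cur_hyps]

# toy 3-frame, 3-symbol example (values are already log-probs)
toy = [[-0.1, -2.5, -3.0], [-2.3, -0.2, -2.8], [-0.2, -2.0, -3.1]]
print(ctc_prefix_beam_search(toy, beam_size=2))
```

The returned list is the n-best prefixes with their combined log-probability, which is what the attention rescoring pass above consumes.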
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out.cpu().numpy() - # Only use decoder score for rescoring - best_score = -float('inf') - best_index = 0 - for i, hyp in enumerate(hyps): - score = 0.0 - for j, w in enumerate(hyp[0]): - score += decoder_out[i][j][w] - score += decoder_out[i][len(hyp[0])][self.eos] - # add right to left decoder score - if reverse_weight > 0: - r_score = 0.0 - for j, w in enumerate(hyp[0]): - r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] - r_score += r_decoder_out[i][len(hyp[0])][self.eos] - score = score * (1 - reverse_weight) + r_score * reverse_weight - # add ctc score - score += hyp[1] * ctc_weight - if score > best_score: - best_score = score - best_index = i - return hyps[best_index][0], best_score - - def load_hlg_resource_if_necessary(self, hlg, word): - if not hasattr(self, 'hlg'): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.hlg = k2.Fsa.from_dict(torch.load(hlg, map_location=device)) - if not hasattr(self.hlg, "lm_scores"): - self.hlg.lm_scores = self.hlg.scores.clone() - if not hasattr(self, 'word_table'): - self.word_table = {} - with open(word, 'r') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - self.word_table[int(arr[1])] = arr[0] - - @torch.no_grad() - def hlg_onebest( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - best_path = one_best_decoding(lattice=lattice, use_double_scores=True) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.no_grad() - def hlg_rescore( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - decoding_chunk_size: int = -1, - num_decoding_left_chunks: int = -1, - simulate_streaming: bool = False, - lm_scale: float = 0, - decoder_scale: float = 0, - r_decoder_scale: float = 0, - hlg: str = '', - word: str = '', - symbol_table: Dict[str, int] = None, - ) -> List[int]: - self.load_hlg_resource_if_necessary(hlg, word) - device = speech.device - encoder_out, encoder_mask = self._forward_encoder( - speech, speech_lengths, decoding_chunk_size, - num_decoding_left_chunks, - simulate_streaming) # (B, maxlen, encoder_dim) - ctc_probs = self.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - supervision_segments = torch.stack( - (torch.arange(len(encoder_mask)), - torch.zeros(len(encoder_mask)), - encoder_mask.squeeze(dim=1).sum(dim=1).cpu()), 1,).to(torch.int32) - lattice = get_lattice( - nnet_output=ctc_probs, - decoding_graph=self.hlg, - supervision_segments=supervision_segments, - 
search_beam=20, - output_beam=7, - min_active_states=30, - max_active_states=10000, - subsampling_factor=4) - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=100, - use_double_scores=True, - nbest_scale=0.5,) - nbest = nbest.intersect(lattice) - assert hasattr(nbest.fsa, "lm_scores") - assert hasattr(nbest.fsa, "tokens") - assert isinstance(nbest.fsa.tokens, torch.Tensor) - - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.tokens) - tokens = tokens.remove_values_leq(0) - hyps = tokens.tolist() - - # cal attention_score - hyps_pad = pad_sequence([ - torch.tensor(hyp, device=device, dtype=torch.long) - for hyp in hyps - ], True, self.ignore_id) # (beam_size, max_hyps_len) - ori_hyps_pad = hyps_pad - hyps_lens = torch.tensor([len(hyp) for hyp in hyps], - device=device, - dtype=torch.long) # (beam_size,) - hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) - hyps_lens = hyps_lens + 1 # Add at begining - encoder_out_repeat = [] - tot_scores = nbest.tot_scores() - repeats = [tot_scores[i].shape[0] for i in range(tot_scores.dim0)] - for i in range(len(encoder_out)): - encoder_out_repeat.append(encoder_out[i: i + 1].repeat(repeats[i], 1, 1)) - encoder_out = torch.concat(encoder_out_repeat, dim=0) - encoder_mask = torch.ones(encoder_out.size(0), - 1, - encoder_out.size(1), - dtype=torch.bool, - device=device) - # used for right to left decoder - r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) - r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, - self.ignore_id) - reverse_weight = 0.5 - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, - reverse_weight) # (beam_size, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - decoder_out = decoder_out - # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a - # conventional transformer decoder. 
- r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - r_decoder_out = r_decoder_out - - decoder_scores = torch.tensor([sum([decoder_out[i, j, hyps[i][j]] - for j in range(len(hyps[i]))]) - for i in range(len(hyps))], device=device) - r_decoder_scores = [] - for i in range(len(hyps)): - score = 0 - for j in range(len(hyps[i])): - score += r_decoder_out[i, len(hyps[i]) - j - 1, hyps[i][j]] - score += r_decoder_out[i, len(hyps[i]), self.eos] - r_decoder_scores.append(score) - r_decoder_scores = torch.tensor(r_decoder_scores, device=device) - - am_scores = nbest.compute_am_scores() - ngram_lm_scores = nbest.compute_lm_scores() - tot_scores = am_scores.values + lm_scale * ngram_lm_scores.values + \ - decoder_scale * decoder_scores + r_decoder_scale * r_decoder_scores - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - hyps = get_texts(best_path) - hyps = [[symbol_table[k] for j in i for k in self.word_table[j]] for i in hyps] - return hyps - - @torch.jit.export - def subsampling_rate(self) -> int: - """ Export interface for c++ call, return subsampling_rate of the - model - """ - return self.encoder.embed.subsampling_rate - - @torch.jit.export - def right_context(self) -> int: - """ Export interface for c++ call, return right_context of the model - """ - return self.encoder.embed.right_context - - @torch.jit.export - def sos_symbol(self) -> int: - """ Export interface for c++ call, return sos symbol id of the model - """ - return self.sos - - @torch.jit.export - def eos_symbol(self) -> int: - """ Export interface for c++ call, return eos symbol id of the model - """ - return self.eos - - @torch.jit.export - def forward_encoder_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, give input chunk xs, and return - output from time 0 to current chunk. - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
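Based on the `forward_encoder_chunk` docstring above, a caller drives streaming recognition by feeding one chunk at a time and threading the attention and CNN caches through. A hedged sketch of that loop; `model` and `chunk_feats` are assumed to exist, and the cache size is illustrative:

```python
# Sketch of a streaming caller for forward_encoder_chunk (assumed model, illustrative sizes).
import torch

def streaming_encode(model, chunk_feats, required_cache_size=16 * 4):
    offset = 0
    att_cache = torch.zeros(0, 0, 0, 0)   # empty caches for the first chunk
    cnn_cache = torch.zeros(0, 0, 0, 0)
    outputs = []
    for xs in chunk_feats:                 # xs: (1, time, mel-dim)
        ys, att_cache, cnn_cache = model.forward_encoder_chunk(
            xs, offset, required_cache_size, att_cache, cnn_cache)
        outputs.append(ys)
        offset += ys.size(1)               # advance by the emitted encoder frames
    return torch.cat(outputs, dim=1)
```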
- - """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) - - @torch.jit.export - def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor: - """ Export interface for c++ call, apply linear transform and log - softmax before ctc - Args: - xs (torch.Tensor): encoder output - - Returns: - torch.Tensor: activation before ctc - - """ - return self.ctc.log_softmax(xs) - - @torch.jit.export - def is_bidirectional_decoder(self) -> bool: - """ - Returns: - torch.Tensor: decoder output - """ - if hasattr(self.decoder, 'right_decoder'): - return True - else: - return False - - @torch.jit.export - def forward_attention_decoder( - self, - hyps: torch.Tensor, - hyps_lens: torch.Tensor, - encoder_out: torch.Tensor, - reverse_weight: float = 0, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Export interface for c++ call, forward decoder with multiple - hypothesis from ctc prefix beam search and one encoder output - Args: - hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad sos at the begining - hyps_lens (torch.Tensor): length of each hyp in hyps - encoder_out (torch.Tensor): corresponding encoder output - r_hyps (torch.Tensor): hyps from ctc prefix beam search, already - pad eos at the begining which is used fo right to left decoder - reverse_weight: used for verfing whether used right to left decoder, - > 0 will use. - - Returns: - torch.Tensor: decoder output - """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps - encoder_out = encoder_out.repeat(num_hyps, 1, 1) - encoder_mask = torch.ones(num_hyps, - 1, - encoder_out.size(1), - dtype=torch.bool, - device=encoder_out.device) - - # input for right to left decoder - # this hyps_lens has count token, we need minus it. - r_hyps_lens = hyps_lens - 1 - # this hyps has included token, so it should be - # convert the original hyps. - r_hyps = hyps[:, 1:] - # >>> r_hyps - # >>> tensor([[ 1, 2, 3], - # >>> [ 9, 8, 4], - # >>> [ 2, -1, -1]]) - # >>> r_hyps_lens - # >>> tensor([3, 3, 1]) - - # NOTE(Mddct): `pad_sequence` is not supported by ONNX, it is used - # in `reverse_pad_list` thus we have to refine the below code. 
- # Issue: https://github.com/wenet-e2e/wenet/issues/1113 - # Equal to: - # >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id)) - # >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id) - max_len = torch.max(r_hyps_lens) - index_range = torch.arange(0, max_len, 1).to(encoder_out.device) - seq_len_expand = r_hyps_lens.unsqueeze(1) - seq_mask = seq_len_expand > index_range # (beam, max_len) - # >>> seq_mask - # >>> tensor([[ True, True, True], - # >>> [ True, True, True], - # >>> [ True, False, False]]) - index = (seq_len_expand - 1) - index_range # (beam, max_len) - # >>> index - # >>> tensor([[ 2, 1, 0], - # >>> [ 2, 1, 0], - # >>> [ 0, -1, -2]]) - index = index * seq_mask - # >>> index - # >>> tensor([[2, 1, 0], - # >>> [2, 1, 0], - # >>> [0, 0, 0]]) - r_hyps = torch.gather(r_hyps, 1, index) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, 2, 2]]) - r_hyps = torch.where(seq_mask, r_hyps, self.eos) - # >>> r_hyps - # >>> tensor([[3, 2, 1], - # >>> [4, 8, 9], - # >>> [2, eos, eos]]) - r_hyps = torch.cat([hyps[:, 0:1], r_hyps], dim=1) - # >>> r_hyps - # >>> tensor([[sos, 3, 2, 1], - # >>> [sos, 4, 8, 9], - # >>> [sos, 2, eos, eos]]) - - decoder_out, r_decoder_out, _ = self.decoder( - encoder_out, encoder_mask, hyps, hyps_lens, r_hyps, - reverse_weight) # (num_hyps, max_hyps_len, vocab_size) - decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) - - # right to left decoder may be not used during decoding process, - # which depends on reverse_weight param. - # r_dccoder_out will be 0.0, if reverse_weight is 0.0 - r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) - return decoder_out, r_decoder_out diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/attention.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/attention.py deleted file mode 100644 index 6ee5e313edf2e88a844ce004c0f819b0bd3260f6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/attention.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi-Head Attention layer definition.""" - -import math -from typing import Tuple - -import torch -from torch import nn - - -class MultiHeadedAttention(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. 
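The gather-based reversal in `forward_attention_decoder` above (the ONNX-friendly replacement for `reverse_pad_list` + `add_sos_eos`) can be checked in isolation. A small sketch with toy hypotheses, assuming `sos=10` and `eos=11` purely for illustration:

```python
# Hedged sketch of the gather-based reversal above (toy ids; sos=10, eos=11 assumed).
import torch

hyps = torch.tensor([[10, 1, 2, 3], [10, 9, 8, 4], [10, 2, 11, 11]])  # sos-padded hyps
hyps_lens = torch.tensor([4, 4, 2])          # lengths include the sos token
r_hyps = hyps[:, 1:]                         # drop sos
r_hyps_lens = hyps_lens - 1
max_len = torch.max(r_hyps_lens)
index_range = torch.arange(0, max_len, 1)
seq_len_expand = r_hyps_lens.unsqueeze(1)
seq_mask = seq_len_expand > index_range      # True on valid positions
index = (seq_len_expand - 1) - index_range   # reversed indices (negative on padding)
index = index * seq_mask                     # clamp padding indices to 0
r_hyps = torch.gather(r_hyps, 1, index)
r_hyps = torch.where(seq_mask, r_hyps, torch.tensor(11))   # pad with eos
r_hyps = torch.cat([hyps[:, 0:1], r_hyps], dim=1)          # prepend sos
print(r_hyps)  # tensor([[10, 3, 2, 1], [10, 4, 8, 9], [10, 2, 11, 11]])
```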
- - """ - def __init__(self, n_head: int, n_feat: int, dropout_rate: float): - """Construct an MultiHeadedAttention object.""" - super().__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward_qkv( - self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor, size - (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor, size - (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor, size - (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention( - self, value: torch.Tensor, scores: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) - ) -> torch.Tensor: - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value, size - (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score, size - (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask, size (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - # NOTE(xcsong): When will `if mask.size(2) > 0` be True? - # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the - # 1st chunk to ease the onnx export.] - # 2. pytorch training - if mask.size(2) > 0 : # time2 > 0 - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - # For last chunk, time2 might be larger than scores.size(-1) - mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) - scores = scores.masked_fill(mask, -float('inf')) - attn = torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0) # (batch, head, time1, time2) - # NOTE(xcsong): When will `if mask.size(2) > 0` be False? - # 1. onnx(16/-1, -1/-1, 16/0) - # 2. jit (16/-1, -1/-1, 16/0, 16/4) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute scaled dot product attention. 
- - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - 1.When applying cross attention between decoder and encoder, - the batch padding mask for input is in (#batch, 1, T) shape. - 2.When applying self attention of encoder, - the mask is in (#batch, T, T) shape. - 3.When applying self attention of decoder, - the mask is in (#batch, L, L) shape. - 4.If the different position in decoder see different block - of the encoder, such as Mocha, the passed in mask could be - in (#batch, L, T) shape. But there is no such case in current - Wenet. - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - - """ - q, k, v = self.forward_qkv(query, key, value) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. - new_cache = torch.cat((k, v), dim=-1) - - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - return self.forward_attention(v, scores, mask), new_cache - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - """ - def __init__(self, n_head, n_feat, dropout_rate): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x, zero_triu: bool = False): - """Compute relative positinal encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, size). 
- zero_triu (bool): If true, return the lower triangular part of - the matrix. - Returns: - torch.Tensor: Output tensor. - """ - - zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(x.size()[0], - x.size()[1], - x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - pos_emb: torch.Tensor = torch.empty(0), - cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2), (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, time2, size). - cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) - where `cache_t == chunk_size * num_decoding_left_chunks` - and `head * d_k == size` - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - # NOTE(xcsong): - # when export onnx model, for 1st chunk, we feed - # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) - # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). - # In all modes, `if cache.size(0) > 0` will alwayse be `True` - # and we will always do splitting and - # concatnation(this will simplify onnx export). Note that - # it's OK to concat & split zero-shaped tensors(see code below). - # when export jit model, for 1st chunk, we always feed - # cache(0, 0, 0, 0) since jit supports dynamic if-branch. - # >>> a = torch.ones((1, 2, 0, 4)) - # >>> b = torch.ones((1, 2, 3, 4)) - # >>> c = torch.cat((a, b), dim=2) - # >>> torch.equal(b, c) # True - # >>> d = torch.split(a, 2, dim=-1) - # >>> torch.equal(d[0], d[1]) # True - if cache.size(0) > 0: - key_cache, value_cache = torch.split( - cache, cache.size(-1) // 2, dim=-1) - k = torch.cat([key_cache, k], dim=2) - v = torch.cat([value_cache, v], dim=2) - # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's - # non-trivial to calculate `next_cache_start` here. 
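The NOTEs above describe the key/value cache trick used for chunked attention: the incoming cache holds concatenated keys and values, which are split, prepended to the current chunk's `k`/`v`, and re-concatenated as the new cache. A minimal sketch with toy shapes:

```python
# Sketch of the KV-cache concat pattern described above (toy shapes).
import torch

head, d_k = 4, 64
k_new = torch.randn(1, head, 16, d_k)          # keys for the current chunk
v_new = torch.randn(1, head, 16, d_k)
cache = torch.zeros(1, head, 0, 2 * d_k)       # empty cache for the first chunk

key_cache, value_cache = torch.split(cache, cache.size(-1) // 2, dim=-1)
k = torch.cat([key_cache, k_new], dim=2)       # concat with a zero-length cache is a no-op
v = torch.cat([value_cache, v_new], dim=2)
new_cache = torch.cat((k, v), dim=-1)          # (1, head, cache_t + time1, d_k * 2)
print(new_cache.shape)                          # torch.Size([1, 4, 16, 128])
```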
- new_cache = torch.cat((k, v), dim=-1) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # Remove rel_shift since it is useless in speech recognition, - # and it requires special attention for streaming. - # matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/cmvn.py deleted file mode 100644 index 3a1e7457fd3788d9a7e031e96517505a65925102..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/cmvn.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -class GlobalCMVN(torch.nn.Module): - def __init__(self, - mean: torch.Tensor, - istd: torch.Tensor, - norm_var: bool = True): - """ - Args: - mean (torch.Tensor): mean stats - istd (torch.Tensor): inverse std, std which is 1.0 / std - """ - super().__init__() - assert mean.shape == istd.shape - self.norm_var = norm_var - # The buffer can be accessed from this module using self.mean - self.register_buffer("mean", mean) - self.register_buffer("istd", istd) - - def forward(self, x: torch.Tensor): - """ - Args: - x (torch.Tensor): (batch, max_len, feat_dim) - - Returns: - (torch.Tensor): normalized feature - """ - x = x - self.mean - if self.norm_var: - x = x * self.istd - return x diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/convolution.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/convolution.py deleted file mode 100644 index 2cf9794e14ea7441ccd30ab52202ac02fb25c2b6..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/convolution.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
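The removed `GlobalCMVN` above is a simple affine normalization, `x_norm = (x - mean) * istd`, where `istd` is the precomputed inverse standard deviation. A toy check with illustrative statistics:

```python
# Toy check of the GlobalCMVN transform above (illustrative statistics only).
import torch

mean = torch.tensor([1.0, 2.0, 3.0])
istd = torch.tensor([0.5, 0.25, 1.0])
x = torch.tensor([[[3.0, 6.0, 3.0]]])          # (batch=1, max_len=1, feat_dim=3)
x_norm = (x - mean) * istd
print(x_norm)                                   # tensor([[[1., 1., 0.]]])
```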
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""ConvolutionModule definition.""" - -from typing import Tuple - -import torch -from torch import nn -from typeguard import check_argument_types - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model.""" - def __init__(self, - channels: int, - kernel_size: int = 15, - activation: nn.Module = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True): - """Construct an ConvolutionModule object. - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernel size of conv layers. - causal (int): Whether use causal convolution or not - """ - assert check_argument_types() - super().__init__() - - self.pointwise_conv1 = nn.Conv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: it's a causal convolution, the input will be - # padded with self.lorder frames on the left in forward. - # else: it's a symmetrical convolution - if causal: - padding = 0 - self.lorder = kernel_size - 1 - else: - # kernel_size should be an odd number for none causal convolution - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.lorder = 0 - self.depthwise_conv = nn.Conv1d( - channels, - channels, - kernel_size, - stride=1, - padding=padding, - groups=channels, - bias=bias, - ) - - assert norm in ['batch_norm', 'layer_norm'] - if norm == "batch_norm": - self.use_layer_norm = False - self.norm = nn.BatchNorm1d(channels) - else: - self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) - - self.pointwise_conv2 = nn.Conv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.activation = activation - - def forward( - self, - x: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - cache: torch.Tensor = torch.zeros((0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute convolution module. - Args: - x (torch.Tensor): Input tensor (#batch, time, channels). - mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), - (0, 0, 0) means fake mask. - cache (torch.Tensor): left context cache, it is only - used in causal convolution (#batch, channels, cache_t), - (0, 0, 0) meas fake cache. - Returns: - torch.Tensor: Output tensor (#batch, time, channels). 
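The removed `ConvolutionModule` above stacks a pointwise conv, a GLU gate, a depthwise conv, a norm plus activation, and a final pointwise conv. A minimal sketch of that structure in plain `torch.nn` (non-causal, batch norm, illustrative sizes):

```python
# Minimal sketch of the conv block structure described above:
# pointwise conv -> GLU -> depthwise conv -> norm -> activation -> pointwise conv.
import torch
from torch import nn

channels, kernel_size = 8, 15
pointwise_conv1 = nn.Conv1d(channels, 2 * channels, kernel_size=1)
depthwise_conv = nn.Conv1d(channels, channels, kernel_size,
                           padding=(kernel_size - 1) // 2, groups=channels)
norm = nn.BatchNorm1d(channels)
pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1)

x = torch.randn(2, 10, channels)        # (batch, time, channels)
x = x.transpose(1, 2)                   # conv layers expect (batch, channels, time)
x = nn.functional.glu(pointwise_conv1(x), dim=1)
x = pointwise_conv2(nn.ReLU()(norm(depthwise_conv(x))))
print(x.transpose(1, 2).shape)          # back to (batch, time, channels): (2, 10, 8)
```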
- """ - # exchange the temporal dimension and the feature dimension - x = x.transpose(1, 2) # (#batch, channels, time) - - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - if self.lorder > 0: - if cache.size(2) == 0: # cache_t == 0 - x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) - else: - assert cache.size(0) == x.size(0) # equal batch - assert cache.size(1) == x.size(1) # equal channel - x = torch.cat((cache, x), dim=2) - assert (x.size(2) > self.lorder) - new_cache = x[:, :, -self.lorder:] - else: - # It's better we just return None if no cache is required, - # However, for JIT export, here we just fake one tensor instead of - # None. - new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, dim=1) # (batch, channel, dim) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.activation(self.norm(x)) - if self.use_layer_norm: - x = x.transpose(1, 2) - x = self.pointwise_conv2(x) - # mask batch padding - if mask_pad.size(2) > 0: # time > 0 - x.masked_fill_(~mask_pad, 0.0) - - return x.transpose(1, 2), new_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/ctc.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/ctc.py deleted file mode 100644 index 3dfcbaa324ffc26afa9ceaeb75007eb312546326..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/ctc.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -import torch -import torch.nn.functional as F -from typeguard import check_argument_types - - -class CTC(torch.nn.Module): - """CTC module""" - def __init__( - self, - odim: int, - encoder_output_size: int, - dropout_rate: float = 0.0, - reduce: bool = True, - ): - """ Construct CTC module - Args: - odim: dimension of outputs - encoder_output_size: number of encoder projection units - dropout_rate: dropout rate (0.0 ~ 1.0) - reduce: reduce the CTC loss into a scalar - """ - assert check_argument_types() - super().__init__() - eprojs = encoder_output_size - self.dropout_rate = dropout_rate - self.ctc_lo = torch.nn.Linear(eprojs, odim) - - reduction_type = "sum" if reduce else "none" - self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type) - - def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, - ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor: - """Calculate CTC loss. 
- - Args: - hs_pad: batch of padded hidden state sequences (B, Tmax, D) - hlens: batch of lengths of hidden state sequences (B) - ys_pad: batch of padded character id sequence tensor (B, Lmax) - ys_lens: batch of lengths of character sequence (B) - """ - # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) - ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) - # ys_hat: (B, L, D) -> (L, B, D) - ys_hat = ys_hat.transpose(0, 1) - ys_hat = ys_hat.log_softmax(2) - loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) - # Batch-size average - loss = loss / ys_hat.size(1) - return loss - - def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """log_softmax of frame activations - - Args: - Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) - """ - return F.log_softmax(self.ctc_lo(hs_pad), dim=2) - - def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: - """argmax of frame activations - - Args: - torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) - Returns: - torch.Tensor: argmax applied 2d tensor (B, Tmax) - """ - return torch.argmax(self.ctc_lo(hs_pad), dim=2) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/decoder.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/decoder.py deleted file mode 100644 index c31853d9e868c99290b8d597f53d9a680202c82c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/decoder.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Decoder definition.""" -from typing import Tuple, List, Optional - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.decoder_layer import DecoderLayer -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.utils.mask import (subsequent_mask, make_pad_mask) - - -class TransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. 
- concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__() - attention_dim = encoder_output_size - - if input_layer == "embed": - self.embed = torch.nn.Sequential( - torch.nn.Embedding(vocab_size, attention_dim), - PositionalEncoding(attention_dim, positional_dropout_rate), - ) - else: - raise ValueError(f"only 'embed' is supported: {input_layer}") - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) - self.use_output_layer = use_output_layer - self.output_layer = torch.nn.Linear(attention_dim, vocab_size) - self.num_blocks = num_blocks - self.decoders = torch.nn.ModuleList([ - DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(self.num_blocks) - ]) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor = torch.empty(0), - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: not used in transformer decoder, in order to unify api - with bidirectional decoder - reverse_weight: not used in transformer decoder, in order to unify - api with bidirectional decode - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - torch.tensor(0.0), in order to unify api with bidirectional decoder - olens: (batch, ) - """ - tgt = ys_in_pad - maxlen = tgt.size(1) - # tgt_mask: (B, 1, L) - tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) - tgt_mask = tgt_mask.to(tgt.device) - # m: (1, L, L) - m = subsequent_mask(tgt_mask.size(-1), - device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m - x, _ = self.embed(tgt) - for layer in self.decoders: - x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, - memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.use_output_layer: - x = self.output_layer(x) - olens = tgt_mask.sum(1) - return x, torch.tensor(0.0), olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - x, _ = self.embed(tgt) - new_cache = [] - for i, decoder in enumerate(self.decoders): - if cache is None: - c = None - else: - c = cache[i] - x, tgt_mask, memory, memory_mask = decoder(x, - tgt_mask, - memory, - memory_mask, - cache=c) - new_cache.append(x) - if self.normalize_before: - y = self.after_norm(x[:, -1]) - else: - y = x[:, -1] - if self.use_output_layer: - y = torch.log_softmax(self.output_layer(y), dim=-1) - return y, new_cache - - -class BiTransformerDecoder(torch.nn.Module): - """Base class of Transfomer decoder module. - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the hidden units number of position-wise feedforward - num_blocks: the number of decoder blocks - r_num_blocks: the number of right to left decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after: whether to concat attention layer's input and output - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - r_num_blocks: int = 0, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = "embed", - use_output_layer: bool = True, - normalize_before: bool = True, - concat_after: bool = False, - ): - - assert check_argument_types() - super().__init__() - self.left_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - self.right_decoder = TransformerDecoder( - vocab_size, encoder_output_size, attention_heads, linear_units, - r_num_blocks, dropout_rate, positional_dropout_rate, - self_attention_dropout_rate, src_attention_dropout_rate, - input_layer, use_output_layer, normalize_before, concat_after) - - def forward( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - r_ys_in_pad: torch.Tensor, - reverse_weight: float = 0.0, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Forward decoder. 
- Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoder memory mask, (batch, 1, maxlen_in) - ys_in_pad: padded input token ids, int64 (batch, maxlen_out) - ys_in_lens: input lengths of this batch (batch) - r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), - used for right to left decoder - reverse_weight: used for right to left decoder - Returns: - (tuple): tuple containing: - x: decoded token score before softmax (batch, maxlen_out, - vocab_size) if use_output_layer is True, - r_x: x: decoded token score (right to left decoder) - before softmax (batch, maxlen_out, vocab_size) - if use_output_layer is True, - olens: (batch, ) - """ - l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, - ys_in_lens) - r_x = torch.tensor(0.0) - if reverse_weight > 0.0: - r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, - ys_in_lens) - return l_x, r_x, olens - - def forward_one_step( - self, - memory: torch.Tensor, - memory_mask: torch.Tensor, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - cache: Optional[List[torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - This is only used for decoding. - Args: - memory: encoded memory, float32 (batch, maxlen_in, feat) - memory_mask: encoded memory mask, (batch, 1, maxlen_in) - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. - y.shape` is (batch, maxlen_out, token) - """ - return self.left_decoder.forward_one_step(memory, memory_mask, tgt, - tgt_mask, cache) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/decoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/decoder_layer.py deleted file mode 100644 index 6b52aa6ab730dc51b18f0787e8236ab10c1e9cad..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/decoder_layer.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoder self-attention layer definition.""" -from typing import Optional, Tuple - -import torch -from torch import nn - - -class DecoderLayer(nn.Module): - """Single decoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn (torch.nn.Module): Inter-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - dropout_rate (float): Dropout rate. 
- normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's inpu - and output. - True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: nn.Module, - src_attn: nn.Module, - feed_forward: nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an DecoderLayer object.""" - super().__init__() - self.size = size - self.self_attn = self_attn - self.src_attn = src_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.norm3 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear1 = nn.Linear(size + size, size) - self.concat_linear2 = nn.Linear(size + size, size) - else: - self.concat_linear1 = nn.Identity() - self.concat_linear2 = nn.Identity() - - def forward( - self, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - memory: torch.Tensor, - memory_mask: torch.Tensor, - cache: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute decoded features. - - Args: - tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). - tgt_mask (torch.Tensor): Mask for input tensor - (#batch, maxlen_out). - memory (torch.Tensor): Encoded memory - (#batch, maxlen_in, size). - memory_mask (torch.Tensor): Encoded memory mask - (#batch, maxlen_in). - cache (torch.Tensor): cached tensors. - (#batch, maxlen_out - 1, size). - - Returns: - torch.Tensor: Output tensor (#batch, maxlen_out, size). - torch.Tensor: Mask for output tensor (#batch, maxlen_out). - torch.Tensor: Encoded memory (#batch, maxlen_in, size). - torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
- - """ - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - - if cache is None: - tgt_q = tgt - tgt_q_mask = tgt_mask - else: - # compute only the last frame query keeping dim: max_time_out -> 1 - assert cache.shape == ( - tgt.shape[0], - tgt.shape[1] - 1, - self.size, - ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" - tgt_q = tgt[:, -1:, :] - residual = residual[:, -1:, :] - tgt_q_mask = tgt_mask[:, -1:, :] - - if self.concat_after: - tgt_concat = torch.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) - x = residual + self.concat_linear1(tgt_concat) - else: - x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - if self.concat_after: - x_concat = torch.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) - x = residual + self.concat_linear2(x_concat) - else: - x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) - if not self.normalize_before: - x = self.norm2(x) - - residual = x - if self.normalize_before: - x = self.norm3(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm3(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, tgt_mask, memory, memory_mask diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/embedding.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/embedding.py deleted file mode 100644 index 611a927864d93c3ad8357f66c780bf537b2a4d67..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/embedding.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Positonal Encoding Module.""" - -import math -from typing import Tuple, Union - -import torch -import torch.nn.functional as F - -class PositionalEncoding(torch.nn.Module): - """Positional encoding. 
- - :param int d_model: embedding dim - :param float dropout_rate: dropout rate - :param int max_len: maximum input length - - PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) - PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) - """ - def __init__(self, - d_model: int, - dropout_rate: float, - max_len: int = 5000, - reverse: bool = False): - """Construct an PositionalEncoding object.""" - super().__init__() - self.d_model = d_model - self.xscale = math.sqrt(self.d_model) - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - - self.pe = torch.zeros(self.max_len, self.d_model) - position = torch.arange(0, self.max_len, - dtype=torch.float32).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.d_model)) - self.pe[:, 0::2] = torch.sin(position * div_term) - self.pe[:, 1::2] = torch.cos(position * div_term) - self.pe = self.pe.unsqueeze(0) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input. Its shape is (batch, time, ...) - offset (int, torch.tensor): position offset - - Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) - torch.Tensor: for compatibility to RelPositionalEncoding - """ - - self.pe = self.pe.to(x.device) - pos_emb = self.position_encoding(offset, x.size(1), False) - x = x * self.xscale + pos_emb - return self.dropout(x), self.dropout(pos_emb) - - def position_encoding(self, offset: Union[int, torch.Tensor], size: int, - apply_dropout: bool = True) -> torch.Tensor: - """ For getting encoding in a streaming fashion - - Attention!!!!! - we apply dropout only once at the whole utterance level in a none - streaming way, but will call this function several times with - increasing input size in a streaming scenario, so the dropout will - be applied several times. - - Args: - offset (int or torch.tensor): start offset - size (int): required size of position encoding - - Returns: - torch.Tensor: Corresponding encoding - """ - # How to subscript a Union type: - # https://github.com/pytorch/pytorch/issues/69434 - if isinstance(offset, int): - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar - assert offset + size < self.max_len - pos_emb = self.pe[:, offset:offset + size] - else: # for batched streaming decoding on GPU - assert torch.max(offset) + size < self.max_len - index = offset.unsqueeze(1) + \ - torch.arange(0, size).to(offset.device) # B X T - flag = index > 0 - # remove negative offset - index = index * flag - pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model - - if apply_dropout: - pos_emb = self.dropout(pos_emb) - return pos_emb - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding module. - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - """ - def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): - """Initialize class.""" - super().__init__(d_model, dropout_rate, max_len, reverse=True) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """Compute positional encoding. - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). 
- Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Positional embedding tensor (1, time, `*`). - """ - self.pe = self.pe.to(x.device) - x = x * self.xscale - pos_emb = self.position_encoding(offset, x.size(1), False) - return self.dropout(x), self.dropout(pos_emb) - - -class NoPositionalEncoding(torch.nn.Module): - """ No position encoding - """ - def __init__(self, d_model: int, dropout_rate: float): - super().__init__() - self.d_model = d_model - self.dropout = torch.nn.Dropout(p=dropout_rate) - - def forward(self, - x: torch.Tensor, - offset: Union[int, torch.Tensor] = 0) \ - -> Tuple[torch.Tensor, torch.Tensor]: - """ Just return zero vector for interface compatibility - """ - pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) - return self.dropout(x), pos_emb - - def position_encoding( - self, offset: Union[int, torch.Tensor], size: int) -> torch.Tensor: - return torch.zeros(1, size, self.d_model) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/encoder.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/encoder.py deleted file mode 100644 index bb2ec65827548bd1242cb3b367cb3983c2de6119..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/encoder.py +++ /dev/null @@ -1,462 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
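The positional-encoding classes above precompute a single sinusoidal table following the formula quoted in their docstring. A minimal standalone sketch of that table (sizes are illustrative):

```python
import math
import torch

def sinusoidal_table(max_len: int, d_model: int) -> torch.Tensor:
    """PE(pos, 2i) = sin(pos / 10000^(2i/d_model)), PE(pos, 2i+1) = cos(...)."""
    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
    div_term = torch.exp(
        torch.arange(0, d_model, 2, dtype=torch.float32)
        * -(math.log(10000.0) / d_model))
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe.unsqueeze(0)          # (1, max_len, d_model), same layout as self.pe

print(sinusoidal_table(5000, 256).shape)   # torch.Size([1, 5000, 256])
```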
-# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder definition.""" -from typing import Tuple - -import torch -from typeguard import check_argument_types - -from wenet.transformer.attention import MultiHeadedAttention -from wenet.transformer.attention import RelPositionMultiHeadedAttention -from wenet.transformer.convolution import ConvolutionModule -from wenet.transformer.embedding import PositionalEncoding -from wenet.transformer.embedding import RelPositionalEncoding -from wenet.transformer.embedding import NoPositionalEncoding -from wenet.transformer.encoder_layer import TransformerEncoderLayer -from wenet.transformer.encoder_layer import ConformerEncoderLayer -from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward -from wenet.transformer.subsampling import Conv2dSubsampling4 -from wenet.transformer.subsampling import Conv2dSubsampling6 -from wenet.transformer.subsampling import Conv2dSubsampling8 -from wenet.transformer.subsampling import LinearNoSubsampling -from wenet.utils.common import get_activation -from wenet.utils.mask import make_pad_mask -from wenet.utils.mask import add_optional_chunk_mask - - -class BaseEncoder(torch.nn.Module): - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ - Args: - input_size (int): input dim - output_size (int): dimension of attention - attention_heads (int): the number of heads of multi head attention - linear_units (int): the hidden units number of position-wise feed - forward - num_blocks (int): the number of decoder blocks - dropout_rate (float): dropout rate - attention_dropout_rate (float): dropout rate in attention - positional_dropout_rate (float): dropout rate after adding - positional encoding - input_layer (str): input layer type. - optional [linear, conv2d, conv2d6, conv2d8] - pos_enc_layer_type (str): Encoder positional encoding layer type. - opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] - normalize_before (bool): - True: use layer_norm before each sub-block of a layer. - False: use layer_norm after each sub-block of a layer. - concat_after (bool): whether to concat attention layer's input - and output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - static_chunk_size (int): chunk size for static chunk training and - decoding - use_dynamic_chunk (bool): whether use dynamic chunk size for - training or not, You can only use fixed chunk(chunk_size > 0) - or dyanmic chunk size(use_dynamic_chunk = True) - global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module - use_dynamic_left_chunk (bool): whether use dynamic left chunk in - dynamic chunk training - """ - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "rel_pos": - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "no_pos": - pos_enc_class = NoPositionalEncoding - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - if input_layer == "linear": - subsampling_class = LinearNoSubsampling - elif input_layer == "conv2d": - subsampling_class = Conv2dSubsampling4 - elif input_layer == "conv2d6": - subsampling_class = Conv2dSubsampling6 - elif input_layer == "conv2d8": - subsampling_class = Conv2dSubsampling8 - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.global_cmvn = global_cmvn - self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - - self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) - self.static_chunk_size = static_chunk_size - self.use_dynamic_chunk = use_dynamic_chunk - self.use_dynamic_left_chunk = use_dynamic_left_chunk - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs: torch.Tensor, - xs_lens: torch.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Embed positions in tensor. - - Args: - xs: padded input tensor (B, T, D) - xs_lens: input length (B) - decoding_chunk_size: decoding chunk size for dynamic chunk - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. 
- >=0: use num_decoding_left_chunks - <0: use all left chunks - Returns: - encoder output tensor xs, and subsampled masks - xs: padded output tensor (B, T' ~= T/subsample_rate, D) - masks: torch.Tensor batch padding mask after subsample - (B, 1, T' ~= T/subsample_rate) - """ - T = xs.size(1) - masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) - mask_pad = masks # (B, 1, T/subsample_rate) - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) - for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) - if self.normalize_before: - xs = self.after_norm(xs) - # Here we assume the mask is not changed in encoder layers, so just - # return the masks before encoder layers, and the masks will be used - # for cross attention with decoder later - return xs, masks - - def forward_chunk( - self, - xs: torch.Tensor, - offset: int, - required_cache_size: int, - att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), - att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ Forward just one chunk - - Args: - xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), - where `time == (chunk_size - 1) * subsample_rate + \ - subsample.right_context + 1` - offset (int): current offset in encoder output time stamp - required_cache_size (int): cache size required for next chunk - compuation - >=0: actual cache size - <0: means all history cache is required - att_cache (torch.Tensor): cache tensor for KEY & VALUE in - transformer/conformer attention, with shape - (elayers, head, cache_t1, d_k * 2), where - `head * d_k == hidden-dim` and - `cache_t1 == chunk_size * num_decoding_left_chunks`. - cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, - (elayers, b=1, hidden-dim, cache_t2), where - `cache_t2 == cnn.lorder - 1` - - Returns: - torch.Tensor: output of current input xs, - with shape (b=1, chunk_size, hidden-dim). - torch.Tensor: new attention cache required for next chunk, with - dynamic shape (elayers, head, ?, d_k * 2) - depending on required_cache_size. - torch.Tensor: new conformer cnn cache required for next chunk, with - same shape as the original cnn_cache. 
- - """ - assert xs.size(0) == 1 - # tmp_masks is just for interface compatibility - tmp_masks = torch.ones(1, - xs.size(1), - device=xs.device, - dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) - if self.global_cmvn is not None: - xs = self.global_cmvn(xs) - # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) - # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) - elayers, cache_t1 = att_cache.size(0), att_cache.size(2) - chunk_size = xs.size(1) - attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding( - offset=offset - cache_t1, size=attention_key_size) - if required_cache_size < 0: - next_cache_start = 0 - elif required_cache_size == 0: - next_cache_start = attention_key_size - else: - next_cache_start = max(attention_key_size - required_cache_size, 0) - r_att_cache = [] - r_cnn_cache = [] - for i, layer in enumerate(self.encoders): - # NOTE(xcsong): Before layer.forward - # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), - # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) - xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache - ) - # NOTE(xcsong): After layer.forward - # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), - # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) - if self.normalize_before: - xs = self.after_norm(xs) - - # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), - # ? may be larger than cache_t1, it depends on required_cache_size - r_att_cache = torch.cat(r_att_cache, dim=0) - # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) - r_cnn_cache = torch.cat(r_cnn_cache, dim=0) - - return (xs, r_att_cache, r_cnn_cache) - - def forward_chunk_by_chunk( - self, - xs: torch.Tensor, - decoding_chunk_size: int, - num_decoding_left_chunks: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ Forward input chunk by chunk with chunk_size like a streaming - fashion - - Here we should pay special attention to computation cache in the - streaming style forward chunk by chunk. Three things should be taken - into account for computation in the current network: - 1. transformer/conformer encoder layers output cache - 2. convolution in conformer - 3. convolution in subsampling - - However, we don't implement subsampling cache for: - 1. We can control subsampling module to output the right result by - overlapping input instead of cache left context, even though it - wastes some computation, but subsampling only takes a very - small fraction of computation in the whole model. - 2. Typically, there are several covolution layers with subsampling - in subsampling module, it is tricky and complicated to do cache - with different convolution layers with different subsampling - rate. - 3. Currently, nn.Sequential is used to stack all the convolution - layers in subsampling, we need to rewrite it to make it work - with cache, which is not prefered. 
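The chunk loop in `forward_chunk_by_chunk` feeds overlapping windows of `decoding_window` input frames and advances by `subsampling * decoding_chunk_size` frames per step. A quick illustration of that arithmetic, assuming the Conv2dSubsampling4 front end (`subsampling_rate=4`, `right_context=6`) and an arbitrary chunk size of 16:

```python
# Windowing arithmetic only; no model is run here.
subsampling = 4                      # Conv2dSubsampling4
context = 6 + 1                      # right_context + current frame
decoding_chunk_size = 16

stride = subsampling * decoding_chunk_size                            # 64 input frames per step
decoding_window = (decoding_chunk_size - 1) * subsampling + context   # 67 frames fed per step

num_frames = 200
starts = list(range(0, num_frames - context + 1, stride))
print(stride, decoding_window, starts)   # 64 67 [0, 64, 128, 192]
```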
- Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size - """ - assert decoding_chunk_size > 0 - # The model is trained by static or dynamic chunk - assert self.static_chunk_size > 0 or self.use_dynamic_chunk - subsampling = self.embed.subsampling_rate - context = self.embed.right_context + 1 # Add current frame - stride = subsampling * decoding_chunk_size - decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) - outputs = [] - offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - - # Feed forward overlap input step by step - for cur in range(0, num_frames - context + 1, stride): - end = min(cur + decoding_window, num_frames) - chunk_xs = xs[:, cur:end, :] - (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) - outputs.append(y) - offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones((1, 1, ys.size(1)), device=ys.device, dtype=torch.bool) - return ys, masks - - -class TransformerEncoder(BaseEncoder): - """Transformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "abs_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - ): - """ Construct TransformerEncoder - - See Encoder for the meaning of each parameter. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - self.encoders = torch.nn.ModuleList([ - TransformerEncoderLayer( - output_size, - MultiHeadedAttention(attention_heads, output_size, - attention_dropout_rate), - PositionwiseFeedForward(output_size, linear_units, - dropout_rate), dropout_rate, - normalize_before, concat_after) for _ in range(num_blocks) - ]) - - -class ConformerEncoder(BaseEncoder): - """Conformer encoder module.""" - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = "conv2d", - pos_enc_layer_type: str = "rel_pos", - normalize_before: bool = True, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_chunk: bool = False, - global_cmvn: torch.nn.Module = None, - use_dynamic_left_chunk: bool = False, - positionwise_conv_kernel_size: int = 1, - macaron_style: bool = True, - selfattention_layer_type: str = "rel_selfattn", - activation_type: str = "swish", - use_cnn_module: bool = True, - cnn_module_kernel: int = 15, - causal: bool = False, - cnn_module_norm: str = "batch_norm", - ): - """Construct ConformerEncoder - - Args: - input_size to use_dynamic_chunk, see in BaseEncoder - positionwise_conv_kernel_size (int): Kernel size of positionwise - conv1d layer. - macaron_style (bool): Whether to use macaron style for - positionwise layer. - selfattention_layer_type (str): Encoder attention layer type, - the parameter has no effect now, it's just for configure - compatibility. - activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - cnn_module_kernel (int): Kernel size of convolution module. - causal (bool): whether to use causal convolution or not. 
- """ - assert check_argument_types() - super().__init__(input_size, output_size, attention_heads, - linear_units, num_blocks, dropout_rate, - positional_dropout_rate, attention_dropout_rate, - input_layer, pos_enc_layer_type, normalize_before, - concat_after, static_chunk_size, use_dynamic_chunk, - global_cmvn, use_dynamic_left_chunk) - activation = get_activation(activation_type) - - # self-attention module definition - if pos_enc_layer_type != "rel_pos": - encoder_selfattn_layer = MultiHeadedAttention - else: - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - # feed-forward module definition - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, - cnn_module_norm, causal) - - self.encoders = torch.nn.ModuleList([ - ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer( - *positionwise_layer_args) if macaron_style else None, - convolution_layer( - *convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - ) for _ in range(num_blocks) - ]) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/encoder_layer.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/encoder_layer.py deleted file mode 100644 index 6b4629a6802a90422fa1494f82f46488f2553c16..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/encoder_layer.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - -"""Encoder self-attention layer definition.""" - -from typing import Optional, Tuple - -import torch -from torch import nn - - -class TransformerEncoderLayer(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: to use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: torch.nn.Module, - dropout_rate: float, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, eps=1e-5) - self.norm2 = nn.LayerNorm(size, eps=1e-5) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): just for interface compatibility - to ConformerEncoderLayer - mask_pad (torch.Tensor): does not used in transformer layer, - just for unified api with conformer. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2), not used here, it's for interface - compatibility to ConformerEncoderLayer. - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). - - """ - residual = x - if self.normalize_before: - x = self.norm1(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, cache=att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - return x, mask, new_att_cache, fake_cnn_cache - - -class ConformerEncoderLayer(nn.Module): - """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` - instance can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward` instance can be used as the argument. - feed_forward_macaron (torch.nn.Module): Additional feed-forward module - instance. - `PositionwiseFeedForward` instance can be used as the argument. - conv_module (torch.nn.Module): Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): - True: use layer_norm before each sub-block. - False: use layer_norm after each sub-block. - concat_after (bool): Whether to concat attention layer's input and - output. 
- True: x -> x + linear(concat(x, att(x))) - False: x -> x + att(x) - """ - def __init__( - self, - size: int, - self_attn: torch.nn.Module, - feed_forward: Optional[nn.Module] = None, - feed_forward_macaron: Optional[nn.Module] = None, - conv_module: Optional[nn.Module] = None, - dropout_rate: float = 0.1, - normalize_before: bool = True, - concat_after: bool = False, - ): - """Construct an EncoderLayer object.""" - super().__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.feed_forward_macaron = feed_forward_macaron - self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module - self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module - if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) - self.ff_scale = 0.5 - else: - self.ff_scale = 1.0 - if self.conv_module is not None: - self.norm_conv = nn.LayerNorm(size, - eps=1e-5) # for the CNN module - self.norm_final = nn.LayerNorm( - size, eps=1e-5) # for the final output of the block - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - else: - self.concat_linear = nn.Identity() - - - def forward( - self, - x: torch.Tensor, - mask: torch.Tensor, - pos_emb: torch.Tensor, - mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), - att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute encoded features. - - Args: - x (torch.Tensor): (#batch, time, size) - mask (torch.Tensor): Mask tensor for the input (#batch, time,time), - (0, 0, 0) means fake mask. - pos_emb (torch.Tensor): positional encoding, must not be None - for ConformerEncoderLayer. - mask_pad (torch.Tensor): batch padding mask used for conv module. - (#batch, 1,time), (0, 0, 0) means fake mask. - att_cache (torch.Tensor): Cache tensor of the KEY & VALUE - (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. - cnn_cache (torch.Tensor): Convolution cache in conformer layer - (#batch=1, size, cache_t2) - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time, time). - torch.Tensor: att_cache tensor, - (#batch=1, head, cache_t1 + time, d_k * 2). - torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). 
- """ - - # whether to use macaron style - if self.feed_forward_macaron is not None: - residual = x - if self.normalize_before: - x = self.norm_ff_macaron(x) - x = residual + self.ff_scale * self.dropout( - self.feed_forward_macaron(x)) - if not self.normalize_before: - x = self.norm_ff_macaron(x) - - # multi-headed self-attention module - residual = x - if self.normalize_before: - x = self.norm_mha(x) - - x_att, new_att_cache = self.self_attn( - x, x, x, mask, pos_emb, att_cache) - if self.concat_after: - x_concat = torch.cat((x, x_att), dim=-1) - x = residual + self.concat_linear(x_concat) - else: - x = residual + self.dropout(x_att) - if not self.normalize_before: - x = self.norm_mha(x) - - # convolution module - # Fake new cnn cache here, and then change it in conv_module - new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) - if self.conv_module is not None: - residual = x - if self.normalize_before: - x = self.norm_conv(x) - x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) - x = residual + self.dropout(x) - - if not self.normalize_before: - x = self.norm_conv(x) - - # feed forward module - residual = x - if self.normalize_before: - x = self.norm_ff(x) - - x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm_ff(x) - - if self.conv_module is not None: - x = self.norm_final(x) - - return x, mask, new_att_cache, new_cnn_cache diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/label_smoothing_loss.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/label_smoothing_loss.py deleted file mode 100644 index 428fedcb0eb4345cd1361c97008a9afcd94ac171..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/label_smoothing_loss.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Label smoothing module.""" - -import torch -from torch import nn - - -class LabelSmoothingLoss(nn.Module): - """Label-smoothing loss. - - In a standard CE loss, the label's data distribution is: - [0,1,2] -> - [ - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0], - ] - - In the smoothing version CE Loss,some probabilities - are taken from the true label prob (1.0) and are divided - among other labels. - - e.g. 
- smoothing=0.1 - [0,1,2] -> - [ - [0.9, 0.05, 0.05], - [0.05, 0.9, 0.05], - [0.05, 0.05, 0.9], - ] - - Args: - size (int): the number of class - padding_idx (int): padding class id which will be ignored for loss - smoothing (float): smoothing rate (0.0 means the conventional CE) - normalize_length (bool): - normalize loss by sequence length if True - normalize loss by batch size if False - """ - def __init__(self, - size: int, - padding_idx: int, - smoothing: float, - normalize_length: bool = False): - """Construct an LabelSmoothingLoss object.""" - super(LabelSmoothingLoss, self).__init__() - self.criterion = nn.KLDivLoss(reduction="none") - self.padding_idx = padding_idx - self.confidence = 1.0 - smoothing - self.smoothing = smoothing - self.size = size - self.normalize_length = normalize_length - - def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """Compute loss between x and target. - - The model outputs and data labels tensors are flatten to - (batch*seqlen, class) shape and a mask is applied to the - padding part which should not be calculated for loss. - - Args: - x (torch.Tensor): prediction (batch, seqlen, class) - target (torch.Tensor): - target signal masked with self.padding_id (batch, seqlen) - Returns: - loss (torch.Tensor) : The KL loss, scalar float value - """ - assert x.size(2) == self.size - batch_size = x.size(0) - x = x.view(-1, self.size) - target = target.view(-1) - # use zeros_like instead of torch.no_grad() for true_dist, - # since no_grad() can not be exported by JIT - true_dist = torch.zeros_like(x) - true_dist.fill_(self.smoothing / (self.size - 1)) - ignore = target == self.padding_idx # (B,) - total = len(target) - ignore.sum().item() - target = target.masked_fill(ignore, 0) # avoid -1 index - true_dist.scatter_(1, target.unsqueeze(1), self.confidence) - kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) - denom = total if self.normalize_length else batch_size - return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/positionwise_feed_forward.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/positionwise_feed_forward.py deleted file mode 100644 index 73ba239e3f1e68f65650961f2c4ee6758729a06e..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/positionwise_feed_forward.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Positionwise feed forward layer definition.""" - -import torch - - -class PositionwiseFeedForward(torch.nn.Module): - """Positionwise feed forward layer. - - FeedForward are appied on each position of the sequence. - The output dim is same with the input dim. - - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. 
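The `LabelSmoothingLoss` above keeps `1 - smoothing` of the probability mass on the true class and spreads `smoothing / (size - 1)` over the remaining classes. A minimal reproduction of the 3-class example from its docstring (padding handling omitted):

```python
import torch

size, smoothing = 3, 0.1
target = torch.tensor([0, 1, 2])

true_dist = torch.full((target.size(0), size), smoothing / (size - 1))
true_dist.scatter_(1, target.unsqueeze(1), 1.0 - smoothing)
print(true_dist)
# tensor([[0.9000, 0.0500, 0.0500],
#         [0.0500, 0.9000, 0.0500],
#         [0.0500, 0.0500, 0.9000]])
```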
- activation (torch.nn.Module): Activation function - """ - def __init__(self, - idim: int, - hidden_units: int, - dropout_rate: float, - activation: torch.nn.Module = torch.nn.ReLU()): - """Construct a PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() - self.w_1 = torch.nn.Linear(idim, hidden_units) - self.activation = activation - self.dropout = torch.nn.Dropout(dropout_rate) - self.w_2 = torch.nn.Linear(hidden_units, idim) - - def forward(self, xs: torch.Tensor) -> torch.Tensor: - """Forward function. - - Args: - xs: input tensor (B, L, D) - Returns: - output tensor, (B, L, D) - """ - return self.w_2(self.dropout(self.activation(self.w_1(xs)))) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/subsampling.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/subsampling.py deleted file mode 100644 index 5f2823eedf0e623188d6af6680fa50ca44b47877..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/subsampling.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) - - -"""Subsampling layer definition.""" - -from typing import Tuple, Union - -import torch - - -class BaseSubsampling(torch.nn.Module): - def __init__(self): - super().__init__() - self.right_context = 0 - self.subsampling_rate = 1 - - def position_encoding(self, offset: Union[int, torch.Tensor], - size: int) -> torch.Tensor: - return self.pos_enc.position_encoding(offset, size) - - -class LinearNoSubsampling(BaseSubsampling): - """Linear transform the input without subsampling - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an linear object.""" - super().__init__() - self.out = torch.nn.Sequential( - torch.nn.Linear(idim, odim), - torch.nn.LayerNorm(odim, eps=1e-5), - torch.nn.Dropout(dropout_rate), - ) - self.pos_enc = pos_enc_class - self.right_context = 0 - self.subsampling_rate = 1 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Input x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: linear input tensor (#batch, time', odim), - where time' = time . - torch.Tensor: linear input mask (#batch, 1, time'), - where time' = time . - - """ - x = self.out(x) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask - - -class Conv2dSubsampling4(BaseSubsampling): - """Convolutional 2D subsampling (to 1/4 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling4 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.out = torch.nn.Sequential( - torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) - self.pos_enc = pos_enc_class - # The right context for every conv layer is computed by: - # (kernel_size - 1) * frame_rate_of_this_layer - self.subsampling_rate = 4 - # 6 = (3 - 1) * 1 + (3 - 1) * 2 - self.right_context = 6 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 4. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 4. - torch.Tensor: positional encoding - - """ - x = x.unsqueeze(1) # (b, c=1, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] - - -class Conv2dSubsampling6(BaseSubsampling): - """Convolutional 2D subsampling (to 1/6 length). - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. - pos_enc (torch.nn.Module): Custom position encoding layer. - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling6 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 5, 3), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), - odim) - self.pos_enc = pos_enc_class - # 10 = (3 - 1) * 1 + (5 - 1) * 2 - self.subsampling_rate = 6 - self.right_context = 10 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 6. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 6. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] - - -class Conv2dSubsampling8(BaseSubsampling): - """Convolutional 2D subsampling (to 1/8 length). - - Args: - idim (int): Input dimension. - odim (int): Output dimension. - dropout_rate (float): Dropout rate. 
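`Conv2dSubsampling4` above reduces the time axis roughly 4x with two stride-2 3x3 convolutions, and its final `Linear` consumes `odim * (((idim - 1) // 2 - 1) // 2)` features. A quick shape check under assumed sizes (80-dim fbank input, `odim=256`, 100 frames):

```python
import torch

idim, odim, T = 80, 256, 100
conv = torch.nn.Sequential(
    torch.nn.Conv2d(1, odim, 3, 2), torch.nn.ReLU(),
    torch.nn.Conv2d(odim, odim, 3, 2), torch.nn.ReLU(),
)
x = torch.randn(1, 1, T, idim)               # (b, c=1, t, f)
b, c, t, f = conv(x).size()
print(t, f)                                  # 24 19  -> time ~ T // 4
print(odim * (((idim - 1) // 2 - 1) // 2))   # 4864, input width of the Linear layer
```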
- - """ - def __init__(self, idim: int, odim: int, dropout_rate: float, - pos_enc_class: torch.nn.Module): - """Construct an Conv2dSubsampling8 object.""" - super().__init__() - self.conv = torch.nn.Sequential( - torch.nn.Conv2d(1, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - torch.nn.Conv2d(odim, odim, 3, 2), - torch.nn.ReLU(), - ) - self.linear = torch.nn.Linear( - odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) - self.pos_enc = pos_enc_class - self.subsampling_rate = 8 - # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 - self.right_context = 14 - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - offset: Union[int, torch.Tensor] = 0 - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x (torch.Tensor): Input tensor (#batch, time, idim). - x_mask (torch.Tensor): Input mask (#batch, 1, time). - - Returns: - torch.Tensor: Subsampled tensor (#batch, time', odim), - where time' = time // 8. - torch.Tensor: Subsampled mask (#batch, 1, time'), - where time' = time // 8. - torch.Tensor: positional encoding - """ - x = x.unsqueeze(1) # (b, c, t, f) - x = self.conv(x) - b, c, t, f = x.size() - x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) - x, pos_emb = self.pos_enc(x, offset) - return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/swish.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/swish.py deleted file mode 100644 index b4250f5c93104f38958d145572e363256e03fcb0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/transformer/swish.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) -# 2020 Northwestern Polytechnical University (Pengcheng Guo) -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Swish() activation function for Conformer.""" - -import torch - - -class Swish(torch.nn.Module): - """Construct an Swish object.""" - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Return Swish activation function.""" - return x * torch.sigmoid(x) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/checkpoint.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/checkpoint.py deleted file mode 100644 index 8e0c413c79c34cd667240357d7ef9eab816a885c..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/checkpoint.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
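The `Swish` module above computes `x * sigmoid(x)`, the same function PyTorch ships as SiLU (exposed as `torch.nn.functional.silu` in recent releases); a one-line numerical check:

```python
import torch

x = torch.randn(4)
print(torch.allclose(x * torch.sigmoid(x), torch.nn.functional.silu(x)))   # True
```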
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import re - -import yaml -import torch -from collections import OrderedDict - -import datetime - - -def load_checkpoint(model: torch.nn.Module, path: str) -> dict: - if torch.cuda.is_available(): - logging.info('Checkpoint: loading from checkpoint %s for GPU' % path) - checkpoint = torch.load(path) - else: - logging.info('Checkpoint: loading from checkpoint %s for CPU' % path) - checkpoint = torch.load(path, map_location='cpu') - model.load_state_dict(checkpoint, strict=False) - info_path = re.sub('.pt$', '.yaml', path) - configs = {} - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = yaml.load(fin, Loader=yaml.FullLoader) - return configs - - -def save_checkpoint(model: torch.nn.Module, path: str, infos=None): - ''' - Args: - infos (dict or None): any info you want to save. - ''' - logging.info('Checkpoint: save to checkpoint %s' % path) - if isinstance(model, torch.nn.DataParallel): - state_dict = model.module.state_dict() - elif isinstance(model, torch.nn.parallel.DistributedDataParallel): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save(state_dict, path) - info_path = re.sub('.pt$', '.yaml', path) - if infos is None: - infos = {} - infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') - with open(info_path, 'w') as fout: - data = yaml.dump(infos) - fout.write(data) - - -def filter_modules(model_state_dict, modules): - new_mods = [] - incorrect_mods = [] - mods_model = model_state_dict.keys() - for mod in modules: - if any(key.startswith(mod) for key in mods_model): - new_mods += [mod] - else: - incorrect_mods += [mod] - if incorrect_mods: - logging.warning( - "module(s) %s don't match or (partially match) " - "available modules in model.", - incorrect_mods, - ) - logging.warning("for information, the existing modules in model are:") - logging.warning("%s", mods_model) - - return new_mods - - -def load_trained_modules(model: torch.nn.Module, args: None): - # Load encoder modules with pre-trained model(s). 
- enc_model_path = args.enc_init - enc_modules = args.enc_init_mods - main_state_dict = model.state_dict() - logging.warning("model(s) found for pre-initialization") - if os.path.isfile(enc_model_path): - logging.info('Checkpoint: loading from checkpoint %s for CPU' % - enc_model_path) - model_state_dict = torch.load(enc_model_path, map_location='cpu') - modules = filter_modules(model_state_dict, enc_modules) - partial_state_dict = OrderedDict() - for key, value in model_state_dict.items(): - if any(key.startswith(m) for m in modules): - partial_state_dict[key] = value - main_state_dict.update(partial_state_dict) - else: - logging.warning("model was not found : %s", enc_model_path) - - model.load_state_dict(main_state_dict) - configs = {} - return configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/cmvn.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/cmvn.py deleted file mode 100644 index 3101c619f54991c947124f393f3459c317356a2f..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/cmvn.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import math - -import numpy as np - - -def _load_json_cmvn(json_cmvn_file): - """ Load the json format cmvn stats file and calculate cmvn - - Args: - json_cmvn_file: cmvn stats file in json format - - Returns: - a numpy array of [means, vars] - """ - with open(json_cmvn_file) as f: - cmvn_stats = json.load(f) - - means = cmvn_stats['mean_stat'] - variance = cmvn_stats['var_stat'] - count = cmvn_stats['frame_num'] - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def _load_kaldi_cmvn(kaldi_cmvn_file): - """ Load the kaldi format cmvn stats file and calculate cmvn - - Args: - kaldi_cmvn_file: kaldi text style global cmvn file, which - is generated by: - compute-cmvn-stats --binary=false scp:feats.scp global_cmvn - - Returns: - a numpy array of [means, vars] - """ - means = [] - variance = [] - with open(kaldi_cmvn_file, 'r') as fid: - # kaldi binary file start with '\0B' - if fid.read(2) == '\0B': - logging.error('kaldi cmvn binary file is not supported, please ' - 'recompute it by: compute-cmvn-stats --binary=false ' - ' scp:feats.scp global_cmvn') - sys.exit(1) - fid.seek(0) - arr = fid.read().split() - assert (arr[0] == '[') - assert (arr[-2] == '0') - assert (arr[-1] == ']') - feat_dim = int((len(arr) - 2 - 2) / 2) - for i in range(1, feat_dim + 1): - means.append(float(arr[i])) - count = float(arr[feat_dim + 1]) - for i in range(feat_dim + 2, 2 * feat_dim + 2): - variance.append(float(arr[i])) - - for i in range(len(means)): - means[i] /= count - variance[i] = variance[i] / count - means[i] * means[i] - if variance[i] < 1.0e-20: - variance[i] = 
1.0e-20 - variance[i] = 1.0 / math.sqrt(variance[i]) - cmvn = np.array([means, variance]) - return cmvn - - -def load_cmvn(cmvn_file, is_json): - if is_json: - cmvn = _load_json_cmvn(cmvn_file) - else: - cmvn = _load_kaldi_cmvn(cmvn_file) - return cmvn[0], cmvn[1] diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/common.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/common.py deleted file mode 100644 index 74238d59aefbf227fe6b811703af17550bc7f8f0..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/common.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -"""Unility functions for Transformer.""" - -import math -from typing import List, Tuple - -import torch -from torch.nn.utils.rnn import pad_sequence - -IGNORE_ID = -1 - - -def pad_list(xs: List[torch.Tensor], pad_value: int): - """Perform padding for the list of tensors. - - Args: - xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. - pad_value (float): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tmax, `*`). - - Examples: - >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) - - """ - n_batch = len(xs) - max_len = max([x.size(0) for x in xs]) - pad = torch.zeros(n_batch, max_len, dtype=xs[0].dtype, device=xs[0].device) - pad = pad.fill_(pad_value) - for i in range(n_batch): - pad[i, :xs[i].size(0)] = xs[i] - - return pad - - -def add_blank(ys_pad: torch.Tensor, blank: int, - ignore_id: int) -> torch.Tensor: - """ Prepad blank for transducer predictor - - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - blank (int): index of - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> blank = 0 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in = add_blank(ys_pad, 0, -1) - >>> ys_in - tensor([[0, 1, 2, 3, 4, 5], - [0, 4, 5, 6, 0, 0], - [0, 7, 8, 9, 0, 0]]) - """ - bs = ys_pad.size(0) - _blank = torch.tensor([blank], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _blank = _blank.repeat(bs).unsqueeze(1) # [bs,1] - out = torch.cat([_blank, ys_pad], dim=1) # [bs, Lmax+1] - return torch.where(out == ignore_id, blank, out) - - -def add_sos_eos(ys_pad: torch.Tensor, sos: int, eos: int, - ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Add and labels. 
- - Args: - ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax) - sos (int): index of - eos (int): index of - ignore_id (int): index of padding - - Returns: - ys_in (torch.Tensor) : (B, Lmax + 1) - ys_out (torch.Tensor) : (B, Lmax + 1) - - Examples: - >>> sos_id = 10 - >>> eos_id = 11 - >>> ignore_id = -1 - >>> ys_pad - tensor([[ 1, 2, 3, 4, 5], - [ 4, 5, 6, -1, -1], - [ 7, 8, 9, -1, -1]], dtype=torch.int32) - >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id) - >>> ys_in - tensor([[10, 1, 2, 3, 4, 5], - [10, 4, 5, 6, 11, 11], - [10, 7, 8, 9, 11, 11]]) - >>> ys_out - tensor([[ 1, 2, 3, 4, 5, 11], - [ 4, 5, 6, 11, -1, -1], - [ 7, 8, 9, 11, -1, -1]]) - """ - _sos = torch.tensor([sos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - _eos = torch.tensor([eos], - dtype=torch.long, - requires_grad=False, - device=ys_pad.device) - ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys - ys_in = [torch.cat([_sos, y], dim=0) for y in ys] - ys_out = [torch.cat([y, _eos], dim=0) for y in ys] - return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) - - -def reverse_pad_list(ys_pad: torch.Tensor, - ys_lens: torch.Tensor, - pad_value: float = -1.0) -> torch.Tensor: - """Reverse padding for the list of tensors. - - Args: - ys_pad (tensor): The padded tensor (B, Tokenmax). - ys_lens (tensor): The lens of token seqs (B) - pad_value (int): Value for padding. - - Returns: - Tensor: Padded tensor (B, Tokenmax). - - Examples: - >>> x - tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]]) - >>> pad_list(x, 0) - tensor([[4, 3, 2, 1], - [7, 6, 5, 0], - [9, 8, 0, 0]]) - - """ - r_ys_pad = pad_sequence([(torch.flip(y.int()[:i], [0])) - for y, i in zip(ys_pad, ys_lens)], True, - pad_value) - return r_ys_pad - - -def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, - ignore_label: int) -> float: - """Calculate accuracy. - - Args: - pad_outputs (Tensor): Prediction tensors (B * Lmax, D). - pad_targets (LongTensor): Target label tensors (B, Lmax). - ignore_label (int): Ignore label id. - - Returns: - float: Accuracy value (0.0 - 1.0). 
- - """ - pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), - pad_outputs.size(1)).argmax(2) - mask = pad_targets != ignore_label - numerator = torch.sum( - pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - denominator = torch.sum(mask) - return float(numerator) / float(denominator) - - -def get_rnn(rnn_type: str) -> torch.nn.Module: - assert rnn_type in ["rnn", "lstm", "gru"] - if rnn_type == "rnn": - return torch.nn.RNN - elif rnn_type == "lstm": - return torch.nn.LSTM - else: - return torch.nn.GRU - - -def get_activation(act): - """Return activation function.""" - # Lazy load to avoid unused import - from wenet.transformer.swish import Swish - - activation_funcs = { - "hardtanh": torch.nn.Hardtanh, - "tanh": torch.nn.Tanh, - "relu": torch.nn.ReLU, - "selu": torch.nn.SELU, - "swish": getattr(torch.nn, "SiLU", Swish), - "gelu": torch.nn.GELU - } - - return activation_funcs[act]() - - -def get_subsample(config): - input_layer = config["encoder_conf"]["input_layer"] - assert input_layer in ["conv2d", "conv2d6", "conv2d8"] - if input_layer == "conv2d": - return 4 - elif input_layer == "conv2d6": - return 6 - elif input_layer == "conv2d8": - return 8 - - -def remove_duplicates_and_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - if hyp[cur] != 0: - new_hyp.append(hyp[cur]) - prev = cur - while cur < len(hyp) and hyp[cur] == hyp[prev]: - cur += 1 - return new_hyp - - -def replace_duplicates_with_blank(hyp: List[int]) -> List[int]: - new_hyp: List[int] = [] - cur = 0 - while cur < len(hyp): - new_hyp.append(hyp[cur]) - prev = cur - cur += 1 - while cur < len(hyp) and hyp[cur] == hyp[prev] and hyp[cur] != 0: - new_hyp.append(0) - cur += 1 - return new_hyp - - -def log_add(args: List[int]) -> float: - """ - Stable log add - """ - if all(a == -float('inf') for a in args): - return -float('inf') - a_max = max(args) - lsp = math.log(sum(math.exp(a - a_max) for a in args)) - return a_max + lsp diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/config.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/config.py deleted file mode 100644 index 50170ced44534d3ee6532a2f87fcd78c5148f7e7..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 Shaoshang Qi -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import copy - -def override_config(configs, override_list): - new_configs = copy.deepcopy(configs) - for item in override_list: - arr = item.split() - if len(arr) != 2: - print(f"the overrive {item} format not correct, skip it") - continue - keys = arr[0].split('.') - s_configs = new_configs - for i, key in enumerate(keys): - if key not in s_configs: - print(f"the overrive {item} format not correct, skip it") - if i == len(keys) - 1: - param_type = type(s_configs[key]) - if param_type != bool: - s_configs[key] = param_type(arr[1]) - else: - s_configs[key] = arr[1] in ['true', 'True'] - print(f"override {arr[0]} with {arr[1]}") - else: - s_configs = s_configs[key] - return new_configs diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/ctc_util.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/ctc_util.py deleted file mode 100644 index 73b8fb272ac153dd6d05207f352ebcf1ad14890d..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/ctc_util.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch - -def insert_blank(label, blank_id=0): - """Insert blank token between every two label token.""" - label = np.expand_dims(label, 1) - blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id - label = np.concatenate([blanks, label], axis=1) - label = label.reshape(-1) - label = np.append(label, label[0]) - return label - -def forced_align(ctc_probs: torch.Tensor, - y: torch.Tensor, - blank_id=0) -> list: - """ctc forced alignment. 
- - Args: - torch.Tensor ctc_probs: hidden state sequence, 2d tensor (T, D) - torch.Tensor y: id sequence tensor 1d tensor (L) - int blank_id: blank symbol index - Returns: - torch.Tensor: alignment result - """ - y_insert_blank = insert_blank(y, blank_id) - - log_alpha = torch.zeros((ctc_probs.size(0), len(y_insert_blank))) - log_alpha = log_alpha - float('inf') # log of zero - state_path = (torch.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=torch.int16) - 1 - ) # state path - - # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] - - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): - if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ - s] == y_insert_blank[s - 2]: - candidates = torch.tensor( - [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) - prev_state = [s, s - 1] - else: - candidates = torch.tensor([ - log_alpha[t - 1, s], - log_alpha[t - 1, s - 1], - log_alpha[t - 1, s - 2], - ]) - prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = torch.max(candidates) + ctc_probs[t][y_insert_blank[s]] - state_path[t, s] = prev_state[torch.argmax(candidates)] - - state_seq = -1 * torch.ones((ctc_probs.size(0), 1), dtype=torch.int16) - - candidates = torch.tensor([ - log_alpha[-1, len(y_insert_blank) - 1], - log_alpha[-1, len(y_insert_blank) - 2] - ]) - prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] - state_seq[-1] = prev_state[torch.argmax(candidates)] - for t in range(ctc_probs.size(0) - 2, -1, -1): - state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] - - output_alignment = [] - for t in range(0, ctc_probs.size(0)): - output_alignment.append(y_insert_blank[state_seq[t, 0]]) - - return output_alignment diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/executor.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/executor.py deleted file mode 100644 index dc0b69e6e32055566a0e8c41945f6979276e5672..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/executor.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -from contextlib import nullcontext - -# if your python version < 3.7 use the below one -# from contextlib import suppress as nullcontext -import torch -from torch.nn.utils import clip_grad_norm_ - - -class Executor: - - def __init__(self): - self.step = 0 - - def train(self, model, optimizer, scheduler, data_loader, device, writer, - args, scaler): - ''' Train one epoch - ''' - model.train() - clip = args.get('grad_clip', 50.0) - log_interval = args.get('log_interval', 10) - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - accum_grad = args.get('accum_grad', 1) - is_distributed = args.get('is_distributed', True) - use_amp = args.get('use_amp', False) - logging.info('using accumulate grad, new batch size is {} times' - ' larger than before'.format(accum_grad)) - if use_amp: - assert scaler is not None - # A context manager to be used in conjunction with an instance of - # torch.nn.parallel.DistributedDataParallel to be able to train - # with uneven inputs across participating processes. - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - model_context = model.join - else: - model_context = nullcontext - num_seen_utts = 0 - with model_context(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - context = None - # Disable gradient synchronizations across DDP processes. - # Within this context, gradients will be accumulated on module - # variables, which will later be synchronized. - if is_distributed and batch_idx % accum_grad != 0: - context = model.no_sync - # Used for single gpu training and DDP gradient synchronization - # processes. - else: - context = nullcontext - with context(): - # autocast context - # The more details about amp can be found in - # https://pytorch.org/docs/stable/notes/amp_examples.html - with torch.cuda.amp.autocast(scaler is not None): - loss_dict = model(feats, feats_lengths, target, - target_lengths) - loss = loss_dict['loss'] / accum_grad - if use_amp: - scaler.scale(loss).backward() - else: - loss.backward() - - num_seen_utts += num_utts - if batch_idx % accum_grad == 0: - if rank == 0 and writer is not None: - writer.add_scalar('train_loss', loss, self.step) - # Use mixed precision training - if use_amp: - scaler.unscale_(optimizer) - grad_norm = clip_grad_norm_(model.parameters(), clip) - # Must invoke scaler.update() if unscale_() is used in - # the iteration to avoid the following error: - # RuntimeError: unscale_() has already been called - # on this optimizer since the last update(). - # We don't check grad here since that if the gradient - # has inf/nan values, scaler.step will skip - # optimizer.step(). 
- scaler.step(optimizer) - scaler.update() - else: - grad_norm = clip_grad_norm_(model.parameters(), clip) - if torch.isfinite(grad_norm): - optimizer.step() - optimizer.zero_grad() - scheduler.step() - self.step += 1 - if batch_idx % log_interval == 0: - lr = optimizer.param_groups[0]['lr'] - log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, - loss.item() * accum_grad) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'lr {:.8f} rank {}'.format(lr, rank) - logging.debug(log_str) - - def cv(self, model, data_loader, device, args): - ''' Cross validation on - ''' - model.eval() - rank = args.get('rank', 0) - epoch = args.get('epoch', 0) - log_interval = args.get('log_interval', 10) - # in order to avoid division by 0 - num_seen_utts = 1 - total_loss = 0.0 - with torch.no_grad(): - for batch_idx, batch in enumerate(data_loader): - key, feats, target, feats_lengths, target_lengths = batch - feats = feats.to(device) - target = target.to(device) - feats_lengths = feats_lengths.to(device) - target_lengths = target_lengths.to(device) - num_utts = target_lengths.size(0) - if num_utts == 0: - continue - loss_dict = model(feats, feats_lengths, target, target_lengths) - loss = loss_dict['loss'] - if torch.isfinite(loss): - num_seen_utts += num_utts - total_loss += loss.item() * num_utts - if batch_idx % log_interval == 0: - log_str = 'CV Batch {}/{} loss {:.6f} '.format( - epoch, batch_idx, loss.item()) - for name, value in loss_dict.items(): - if name != 'loss' and value is not None: - log_str += '{} {:.6f} '.format(name, value.item()) - log_str += 'history loss {:.6f}'.format(total_loss / - num_seen_utts) - log_str += ' rank {}'.format(rank) - logging.debug(log_str) - return total_loss, num_seen_utts diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/file_utils.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/file_utils.py deleted file mode 100644 index 7b7e516cc61f759267f4ef09309ff0b45110a0c1..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/file_utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re - - -def read_lists(list_file): - lists = [] - with open(list_file, 'r', encoding='utf8') as fin: - for line in fin: - lists.append(line.strip()) - return lists - - -def read_non_lang_symbols(non_lang_sym_path): - """read non-linguistic symbol from file. - - The file format is like below: - - {NOISE}\n - {BRK}\n - ... - - - Args: - non_lang_sym_path: non-linguistic symbol file path, None means no any - syms. 
- - """ - if non_lang_sym_path is None: - return None - else: - syms = read_lists(non_lang_sym_path) - non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") - for sym in syms: - if non_lang_syms_pattern.fullmatch(sym) is None: - class BadSymbolFormat(Exception): - pass - raise BadSymbolFormat( - "Non-linguistic symbols should be " - "formatted in {xxx}//[xxx], consider" - " modify '%s' to meet the requirment. " - "More details can be found in discussions here : " - "https://github.com/wenet-e2e/wenet/pull/819" % (sym)) - return syms - - -def read_symbol_table(symbol_table_file): - symbol_table = {} - with open(symbol_table_file, 'r', encoding='utf8') as fin: - for line in fin: - arr = line.strip().split() - assert len(arr) == 2 - symbol_table[arr[0]] = int(arr[1]) - return symbol_table diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/init_model.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/init_model.py deleted file mode 100644 index 377e110b36cc140a55edc9dcc1b20dc5f91387a2..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/init_model.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -from wenet.transducer.joint import TransducerJoint -from wenet.transducer.predictor import (ConvPredictor, EmbeddingPredictor, - RNNPredictor) -from wenet.transducer.transducer import Transducer -from wenet.transformer.asr_model import ASRModel -from wenet.transformer.cmvn import GlobalCMVN -from wenet.transformer.ctc import CTC -from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder -from wenet.transformer.encoder import ConformerEncoder, TransformerEncoder -from wenet.squeezeformer.encoder import SqueezeformerEncoder -from wenet.efficient_conformer.encoder import EfficientConformerEncoder -from wenet.utils.cmvn import load_cmvn - - -def init_model(configs): - if configs['cmvn_file'] is not None: - mean, istd = load_cmvn(configs['cmvn_file'], configs['cmvn_conf']['is_json_cmvn']) - global_cmvn = GlobalCMVN( - torch.from_numpy(mean).float(), - torch.from_numpy(istd).float()) - else: - global_cmvn = None - - input_dim = configs['input_dim'] - vocab_size = configs['output_dim'] - - encoder_type = configs.get('encoder', 'conformer') - decoder_type = configs.get('decoder', 'bitransformer') - - if encoder_type == 'conformer': - encoder = ConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'squeezeformer': - encoder = SqueezeformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - elif encoder_type == 'efficientConformer': - encoder = EfficientConformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf'], - **configs['encoder_conf']['efficient_conf'] - if 'efficient_conf' in - configs['encoder_conf'] else {}) - else: - encoder = TransformerEncoder(input_dim, - global_cmvn=global_cmvn, - **configs['encoder_conf']) - if decoder_type == 'transformer': - decoder = TransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - else: - assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 - assert configs['decoder_conf']['r_num_blocks'] > 0 - decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), - **configs['decoder_conf']) - ctc = CTC(vocab_size, encoder.output_size()) - - # Init joint CTC/Attention or Transducer model - if 'predictor' in configs: - predictor_type = configs.get('predictor', 'rnn') - if predictor_type == 'rnn': - predictor = RNNPredictor(vocab_size, **configs['predictor_conf']) - elif predictor_type == 'embedding': - predictor = EmbeddingPredictor(vocab_size, - **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - elif predictor_type == 'conv': - predictor = ConvPredictor(vocab_size, **configs['predictor_conf']) - configs['predictor_conf']['output_size'] = configs[ - 'predictor_conf']['embed_size'] - else: - raise NotImplementedError( - "only rnn, embedding and conv type support now") - configs['joint_conf']['enc_output_size'] = configs['encoder_conf'][ - 'output_size'] - configs['joint_conf']['pred_output_size'] = configs['predictor_conf'][ - 'output_size'] - joint = TransducerJoint(vocab_size, **configs['joint_conf']) - model = Transducer(vocab_size=vocab_size, - blank=0, - predictor=predictor, - encoder=encoder, - attention_decoder=decoder, - joint=joint, - ctc=ctc, - **configs['model_conf']) - else: - model = ASRModel(vocab_size=vocab_size, - encoder=encoder, - decoder=decoder, - ctc=ctc, - **configs['model_conf']) - return model diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/mask.py 
b/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/mask.py deleted file mode 100644 index 2985006ab2bc2d27a9b8adaeb863cc44ca6a0d24..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/mask.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2019 Shigeki Karita -# 2020 Mobvoi Inc (Binbin Zhang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -''' -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. - - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - ret = torch.ones(size, size, device=device, dtype=torch.bool) - return torch.tril(ret) -''' - -def subsequent_mask( - size: int, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size). - - This mask is used only in decoder which works in an auto-regressive mode. - This means the current step could only do attention with its left steps. - - In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. - - When streaming is need, chunk-based attention is used in encoder. See - subsequent_chunk_mask for the chunk-based attention mask. 
- - Args: - size (int): size of mask - str device (str): "cpu" or "cuda" or torch.Tensor.device - dtype (torch.device): result dtype - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] - """ - arange = torch.arange(size, device=device) - mask = arange.expand(size, size) - arange = arange.unsqueeze(-1) - mask = mask <= arange - return mask - - -def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size) with chunk size, - this is for streaming encoder - - Args: - size (int): size of mask - chunk_size (int): size of chunk - num_left_chunks (int): number of left chunks - <0: use full chunk - >=0: use num_left_chunks - device (torch.device): "cpu" or "cuda" or torch.Tensor.device - - Returns: - torch.Tensor: mask - - Examples: - >>> subsequent_chunk_mask(4, 2) - [[1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]] - """ - ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) - ret[i, start:ending] = True - return ret - - -def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, - use_dynamic_chunk: bool, - use_dynamic_left_chunk: bool, - decoding_chunk_size: int, static_chunk_size: int, - num_decoding_left_chunks: int): - """ Apply optional mask for encoder. - - Args: - xs (torch.Tensor): padded input, (B, L, D), L for max length - mask (torch.Tensor): mask for xs, (B, 1, L) - use_dynamic_chunk (bool): whether to use dynamic chunk or not - use_dynamic_left_chunk (bool): whether to use dynamic left chunk for - training. - decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's - 0: default for training, use random dynamic chunk. - <0: for decoding, use full chunk. - >0: for decoding, use fixed chunk size as set. - static_chunk_size (int): chunk size for static chunk training/decoding - if it's greater than 0, if use_dynamic_chunk is true, - this parameter will be ignored - num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. - >=0: use num_decoding_left_chunks - <0: use all left chunks - - Returns: - torch.Tensor: chunk mask of the input xs. - """ - # Whether to use chunk mask or not - if use_dynamic_chunk: - max_len = xs.size(1) - if decoding_chunk_size < 0: - chunk_size = max_len - num_left_chunks = -1 - elif decoding_chunk_size > 0: - chunk_size = decoding_chunk_size - num_left_chunks = num_decoding_left_chunks - else: - # chunk size is either [1, 25] or full context(max_len). - # Since we use 4 times subsampling and allow up to 1s(100 frames) - # delay, the maximum frame is 100 / 4 = 25. 
- chunk_size = torch.randint(1, max_len, (1, )).item() - num_left_chunks = -1 - if chunk_size > max_len // 2: - chunk_size = max_len - else: - chunk_size = chunk_size % 25 + 1 - if use_dynamic_left_chunk: - max_left_chunks = (max_len - 1) // chunk_size - num_left_chunks = torch.randint(0, max_left_chunks, - (1, )).item() - chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - elif static_chunk_size > 0: - num_left_chunks = num_decoding_left_chunks - chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, - num_left_chunks, - xs.device) # (L, L) - chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) - else: - chunk_masks = masks - return chunk_masks - - -def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: - """Make mask tensor containing indices of padded part. - - See description of make_non_pad_mask. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: Mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] - """ - batch_size = lengths.size(0) - max_len = max_len if max_len > 0 else lengths.max().item() - seq_range = torch.arange(0, - max_len, - dtype=torch.int64, - device=lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_length_expand = lengths.unsqueeze(-1) - mask = seq_range_expand >= seq_length_expand - return mask - - -def make_non_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """Make mask tensor containing indices of non-padded part. - - The sequences in a batch may have different lengths. To enable - batch computing, padding is need to make all sequence in same - size. To avoid the padding part pass value to context dependent - block such as attention or convolution , this padding part is - masked. - - This pad_mask is used in both encoder and decoder. - - 1 for non-padded part and 0 for padded part. - - Args: - lengths (torch.Tensor): Batch of lengths (B,). - Returns: - torch.Tensor: mask tensor containing indices of padded part. - - Examples: - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] - """ - return ~make_pad_mask(lengths) - - -def mask_finished_scores(score: torch.Tensor, - flag: torch.Tensor) -> torch.Tensor: - """ - If a sequence is finished, we only allow one alive branch. This function - aims to give one branch a zero score and the rest -inf score. - - Args: - score (torch.Tensor): A real value array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size, beam_size). 
- """ - beam_size = score.size(-1) - zero_mask = torch.zeros_like(flag, dtype=torch.bool) - if beam_size > 1: - unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])), - dim=1) - finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])), - dim=1) - else: - unfinished = zero_mask - finished = flag - score.masked_fill_(unfinished, -float('inf')) - score.masked_fill_(finished, 0) - return score - - -def mask_finished_preds(pred: torch.Tensor, flag: torch.Tensor, - eos: int) -> torch.Tensor: - """ - If a sequence is finished, all of its branch should be - - Args: - pred (torch.Tensor): A int array with shape - (batch_size * beam_size, beam_size). - flag (torch.Tensor): A bool array with shape - (batch_size * beam_size, 1). - - Returns: - torch.Tensor: (batch_size * beam_size). - """ - beam_size = pred.size(-1) - finished = flag.repeat([1, beam_size]) - return pred.masked_fill_(finished, eos) diff --git a/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/scheduler.py b/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/scheduler.py deleted file mode 100644 index c418a731dec0041a238787bbba23102dba8db5e5..0000000000000000000000000000000000000000 --- a/models/audio/speech_recognition/conformer/igie/wenet/wenet/utils/scheduler.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -# 2022 Ximalaya Inc (Yuguang Yang) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from ESPnet(https://github.com/espnet/espnet) -# NeMo(https://github.com/NVIDIA/NeMo) - -from typing import Union - -import math -import warnings -import torch -from torch.optim.lr_scheduler import _LRScheduler - -from typeguard import check_argument_types - - -class WarmupLR(_LRScheduler): - """The WarmupLR scheduler - - This scheduler is almost same as NoamLR Scheduler except for following - difference: - - NoamLR: - lr = optimizer.lr * model_size ** -0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - WarmupLR: - lr = optimizer.lr * warmup_step ** 0.5 - * min(step ** -0.5, step * warmup_step ** -1.5) - - Note that the maximum lr equals to optimizer.lr in this scheduler. 
- - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_steps: Union[int, float] = 25000, - last_epoch: int = -1, - ): - assert check_argument_types() - self.warmup_steps = warmup_steps - - # __init__() must be invoked before setting field - # because step() is also invoked in __init__() - super().__init__(optimizer, last_epoch) - - def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" - - def get_lr(self): - step_num = self.last_epoch + 1 - if self.warmup_steps == 0: - return [ - lr * step_num ** -0.5 - for lr in self.base_lrs - ] - else: - return [ - lr - * self.warmup_steps ** 0.5 - * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) - for lr in self.base_lrs - ] - - def set_step(self, step: int): - self.last_epoch = step - - -class WarmupPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1): - assert not (warmup_steps is not None and warmup_ratio is not None),\ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class SquareRootConstantPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use particular number of step or ratio" - assert constant_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. 
- self.max_steps = max_steps - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.constant_lr = 1 / (constant_steps ** 0.5) - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - if step <= self.constant_steps: - return [self.constant_lr for _ in self.base_lrs] - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -class WarmupHoldPolicy(WarmupPolicy): - """Variant of WarmupPolicy which maintains high - learning rate for a defined number of steps. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - hold_steps=None, - hold_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (hold_steps is not None and hold_ratio is not None), \ - "Either use particular number of step or ratio" - assert hold_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - self.min_lr = min_lr - self._last_warmup_lr = 0.0 - - # Necessary to duplicate as class attributes are hidden in inner class - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if hold_steps is not None: - self.hold_steps = hold_steps + self.warmup_steps - elif hold_ratio is not None: - self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps - else: - self.hold_steps = 0 - - super().__init__( - optimizer, - warmup_steps=warmup_steps, - warmup_ratio=warmup_ratio, - max_steps=max_steps, - last_epoch=last_epoch, - min_lr=min_lr, - ) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed by the scheduler," - " " "please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup phase - if step <= self.warmup_steps and self.warmup_steps > 0: - return self._get_warmup_lr(step) - - # Hold phase - if (step >= self.warmup_steps) and (step < self.hold_steps): - return self.base_lrs - - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - -class WarmupAnnealHoldPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - min_lr: Minimum lr to hold the learning rate after decay at. - constant_steps: Number of steps to keep lr constant at. 
- constant_ratio: Ratio of steps to keep lr constant. - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - constant_steps=None, - constant_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert not (constant_steps is not None - and constant_ratio is not None), \ - "Either use constant_steps or constant_ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = self.last_epoch - - # Warmup steps - if self.warmup_steps > 0 and step <= self.warmup_steps: - return self._get_warmup_lr(step) - - # Constant steps after warmup and decay - if self.constant_steps > 0 and ( - self.warmup_steps + self.decay_steps) < step <= self.max_steps: - return self._get_constant_lr(step) - - # Min lr after max steps of updates - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_constant_lr(self, step): - return [self.min_lr for _ in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - - -def _squareroot_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 0.5 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _square_annealing(initial_lr, step, max_steps, min_lr): - mult = ((max_steps - step) / max_steps) ** 2 - out_lr = initial_lr * mult - out_lr = max(out_lr, min_lr) - return out_lr - - -def _cosine_annealing(initial_lr, step, max_steps, min_lr): - mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) - out_lr = (initial_lr - min_lr) * mult + min_lr - return out_lr - - -def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, - decay_steps, min_lr): - assert max_lr > min_lr - # Use linear warmup for the initial part. - if warmup_steps > 0 and step <= warmup_steps: - return max_lr * float(step) / float(warmup_steps) - - # For any steps larger than `decay_steps`, use `min_lr`. - if step > warmup_steps + decay_steps: - return min_lr - - # If we are done with the warmup period, use the decay style. 
- num_steps_ = step - warmup_steps - decay_steps_ = decay_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = max_lr - min_lr - - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - - return min_lr + coeff * delta_lr - - -def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): - if cycle: - multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) - decay_steps *= multiplier - else: - step = min(step, decay_steps) - p = step / decay_steps - lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) - lr += min_lr - return lr - - -def _noam_hold_annealing(initial_lr, step, warmup_steps, - hold_steps, decay_rate, min_lr): - # hold_steps = total number of steps - # to hold the LR, not the warmup + hold steps. - T_warmup_decay = max(1, warmup_steps ** decay_rate) - T_hold_decay = max(1, (step - hold_steps) ** decay_rate) - lr = (initial_lr * T_warmup_decay) / T_hold_decay - lr = max(lr, min_lr) - return lr - - -class SquareAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _square_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class SquareRootAnnealing(WarmupPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - new_lrs = [ - _squareroot_annealing(initial_lr=initial_lr, step=step, - max_steps=self.max_steps, min_lr=self.min_lr) - for initial_lr in self.base_lrs - ] - return new_lrs - - -class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, - **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - if self.constant_steps is None or self.constant_steps == 0: - new_lrs = [ - _cosine_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - else: - new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) - return new_lrs - - def _get_warmup_lr(self, step): - if self.constant_steps is None or self.constant_steps == 0: - return super()._get_warmup_lr(step) - else: - # Use linear warmup for the initial part. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_constant_lr(self, step): - # Only called when `constant_steps` > 0. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_linear_warmup_with_cosine_annealing_lr(self, step): - # Cosine Schedule for Megatron LM, - # slightly different warmup schedule + constant LR at the end. 
- new_lrs = [ - _linear_warmup_with_cosine_annealing( - max_lr=self.base_lrs[0], - warmup_steps=self.warmup_steps, - step=step, - decay_steps=self.decay_steps, - min_lr=self.min_lr, - ) - for _ in self.base_lrs - ] - return new_lrs - - -class NoamAnnealing(_LRScheduler): - def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, - max_steps=None, min_lr=0.0, last_epoch=-1 - ): - self._normalize = d_model ** (-0.5) - assert not (warmup_steps is not None - and warmup_ratio is not None), \ - "Either use particular number of step or ratio" - assert warmup_ratio is None or max_steps is not None, \ - "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. - self.max_steps = max_steps - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed " - "by the scheduler, please use `get_last_lr()`.", - UserWarning, stacklevel=2 - ) - - step = max(1, self.last_epoch) - - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate " - f"that was lower than the minimum learning rate." - ) - - new_lrs = [self._noam_annealing(initial_lr=initial_lr, step=step) for - initial_lr in self.base_lrs] - return new_lrs - - def _noam_annealing(self, initial_lr, step): - if self.warmup_steps > 0: - mult = self._normalize * min(step ** (-0.5), - step * (self.warmup_steps ** (-1.5))) - else: - mult = self._normalize * step ** (-0.5) - - out_lr = initial_lr * mult - if step > self.warmup_steps: - out_lr = max(out_lr, self.min_lr) - return out_lr - - -class NoamHoldAnnealing(WarmupHoldPolicy): - def __init__(self, optimizer, *, max_steps, decay_rate=0.5, min_lr=0.0, - last_epoch=-1, **kwargs): - """ - From Nemo: - Implementation of the Noam Hold Annealing policy - from the SqueezeFormer paper. - - Unlike NoamAnnealing, the peak learning rate - can be explicitly set for this scheduler. - The schedule first performs linear warmup, - then holds the peak LR, then decays with some schedule for - the remainder of the steps. - Therefore the min-lr is still dependent - on the hyper parameters selected. - - It's schedule is determined by three factors- - - Warmup Steps: Initial stage, where linear warmup - occurs uptil the peak LR is reached. Unlike NoamAnnealing, - the peak LR is explicitly stated here instead of a scaling factor. - - Hold Steps: Intermediate stage, where the peak LR - is maintained for some number of steps. In this region, - the high peak LR allows the model to converge faster - if training is stable. However the high LR - may also cause instability during training. - Should usually be a significant fraction of training - steps (around 30-40% of the entire training steps). - - Decay Steps: Final stage, where the LR rapidly decays - with some scaling rate (set by decay rate). - To attain Noam decay, use 0.5, - for Squeezeformer recommended decay, use 1.0. - The fast decay after prolonged high LR during - hold phase allows for rapid convergence. 
- - References: - - [Squeezeformer: - An Efficient Transformer for Automatic Speech Recognition] - (https://arxiv.org/abs/2206.00888) - - Args: - optimizer: Pytorch compatible Optimizer object. - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - hold_steps: Number of training steps to - hold the learning rate after warm up - hold_ratio: Ratio of hold steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - decay_rate: Float value describing the polynomial decay - after the hold period. Default value - of 0.5 corresponds to Noam decay. - min_lr: Minimum learning rate. - """ - self.decay_rate = decay_rate - super().__init__(optimizer=optimizer, max_steps=max_steps, - last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - if self.warmup_steps is None or self.warmup_steps == 0: - raise ValueError( - "Noam scheduler cannot be used without warmup steps") - - if self.hold_steps > 0: - hold_steps = self.hold_steps - self.warmup_steps - else: - hold_steps = 0 - - new_lrs = [ - _noam_hold_annealing( - initial_lr, - step=step, - warmup_steps=self.warmup_steps, - hold_steps=hold_steps, - decay_rate=self.decay_rate, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - return new_lrs - - def set_step(self, step: int): - self.last_epoch = step diff --git a/models/audio/speech_recognition/conformer/ixrt/README.md b/models/audio/speech_recognition/conformer/ixrt/README.md index ca8125825585929d15da5e8ec6b8610f88341d5b..e8ae1e058ec6852cd6c514118f7dff50811def01 100644 --- a/models/audio/speech_recognition/conformer/ixrt/README.md +++ b/models/audio/speech_recognition/conformer/ixrt/README.md @@ -21,8 +21,8 @@ Dataset: to download the Aishell dataset. 
```bash # Download and put model in conformer_checkpoints -wget http://files.deepspark.org.cn:880/deepspark/conformer_checkpoints.tar.gz -tar xf conformer_checkpoints.tar.gz +wget http://files.deepspark.org.cn:880/deepspark/conformer_checkpoints.tar +tar xf conformer_checkpoints.tar # Prepare AISHELL Data DATA_DIR=/PATH/to/aishell_test_data @@ -35,9 +35,9 @@ bash scripts/aishell_data_prepare.sh ${DATA_DIR} ${TOOL_DIR} ```bash # Install libGL ## CentOS -yum install -y mesa-libGL +yum install sox sox-devel -y ## Ubuntu -apt install -y libgl1-mesa-glx +apt install sox libsox-fmt-all -y pip3 install -r requirements.txt ``` @@ -57,4 +57,4 @@ bash scripts/infer_conformer_fp16_performance.sh | Model | BatchSize | Precision | QPS | CER | | --------- | --------- | --------- | ------- | ------ | -| Conformer | 24 | FP16 | 387.821 | 0.0517 | +| Conformer | 24 | FP16 | 1408.352 | 0.0497 | diff --git a/models/audio/speech_recognition/conformer/ixrt/ci/prepare.sh b/models/audio/speech_recognition/conformer/ixrt/ci/prepare.sh index 7944a1fc2c0053e967917904cf94f2f5200a90c3..40fd32af79cc255f1dbd8f7588402d02c3f8bece 100644 --- a/models/audio/speech_recognition/conformer/ixrt/ci/prepare.sh +++ b/models/audio/speech_recognition/conformer/ixrt/ci/prepare.sh @@ -18,17 +18,17 @@ set -x ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') if [[ ${ID} == "ubuntu" ]]; then - apt install -y libgl1-mesa-glx + apt install sox libsox-fmt-all -y elif [[ ${ID} == "centos" ]]; then - yum install -y mesa-libGL + yum install sox sox-devel -y else echo "Not Support Os" fi pip3 install -r requirements.txt -ln -s /root/data/checkpoints/conformer_checkpoints.tar.gz ./ -tar xf conformer_checkpoints.tar.gz +ln -s /root/data/checkpoints/conformer_checkpoints.tar ./ +tar xf conformer_checkpoints.tar cp /root/data/datasets/aishell_test_data.tar ./ tar xf aishell_test_data.tar bash scripts/aishell_data_prepare.sh ./aishell_test_data ./tools \ No newline at end of file
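
As a quick cross-check of the updated ixrt instructions, the following is a minimal end-to-end sketch assembled only from commands that already appear in the hunks above; the working directory (`models/audio/speech_recognition/conformer/ixrt`) and the local AISHELL path are assumptions, not part of the patch.

```bash
# Minimal sketch of the updated ixrt workflow.
# Assumed cwd: models/audio/speech_recognition/conformer/ixrt; paths are placeholders.

# Fetch and unpack the checkpoint archive (now a plain .tar, as changed above).
wget http://files.deepspark.org.cn:880/deepspark/conformer_checkpoints.tar
tar xf conformer_checkpoints.tar

# Prepare the AISHELL test data with the repo's helper script.
DATA_DIR=/PATH/to/aishell_test_data   # placeholder: point this at your local AISHELL test set
TOOL_DIR="$(pwd)/tools"
bash scripts/aishell_data_prepare.sh ${DATA_DIR} ${TOOL_DIR}

# Run the FP16 performance benchmark referenced in the README hunk above.
bash scripts/infer_conformer_fp16_performance.sh
```

Note that the sox packages installed by `ci/prepare.sh` (`sox`/`sox-devel` on CentOS, `sox`/`libsox-fmt-all` on Ubuntu) are expected to be present before the data-preparation step.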